725 files changed, 443465 insertions, 81192 deletions
diff --git a/xlators/Makefile.am b/xlators/Makefile.am
index 2abb5219488..ef20cbb64fa 100644
--- a/xlators/Makefile.am
+++ b/xlators/Makefile.am
@@ -1,3 +1,13 @@
-SUBDIRS = cluster storage protocol performance debug features encryption mount
+if BUILD_GNFS
+  GNFS_DIR = nfs
+endif
 
-CLEANFILES = 
+DIST_SUBDIRS = cluster storage protocol performance debug features \
+          mount nfs mgmt system playground meta
+
+SUBDIRS = cluster storage protocol performance debug features \
+          mount ${GNFS_DIR} mgmt system playground meta
+
+EXTRA_DIST = xlator.sym
+
+CLEANFILES =
diff --git a/xlators/bindings/Makefile.am b/xlators/bindings/Makefile.am
deleted file mode 100644
index f7766580257..00000000000
--- a/xlators/bindings/Makefile.am
+++ /dev/null
@@ -1 +0,0 @@
-SUBDIRS = $(BINDINGS_SUBDIRS)
diff --git a/xlators/bindings/python/src/Makefile.am b/xlators/bindings/python/src/Makefile.am
deleted file mode 100644
index c0b9141c667..00000000000
--- a/xlators/bindings/python/src/Makefile.am
+++ /dev/null
@@ -1,19 +0,0 @@
-
-xlator_PROGRAMS = python.so
-
-xlatordir = $(libdir)/glusterfs/$(PACKAGE_VERSION)/xlator/bindings
-
-python_PYTHON = gluster.py glustertypes.py glusterstack.py
-
-pythondir = $(xlatordir)/python
-
-python_so_SOURCES = python.c
-
-AM_CFLAGS = -fPIC -D_FILE_OFFSET_BITS=64 -D_GNU_SOURCE -Wall \
-	-I$(top_srcdir)/libglusterfs/src -shared -nostartfiles \
-	$(PYTHON_CPPLAGS) -DGLUSTER_PYTHON_PATH=\"$(pythondir)\"
-
-AM_LDFLAGS = $(PYTHON_LDFLAGS)
-
-CLEANFILES = 
-
diff --git a/xlators/bindings/python/src/gluster.py b/xlators/bindings/python/src/gluster.py
deleted file mode 100644
index ee0eb131011..00000000000
--- a/xlators/bindings/python/src/gluster.py
+++ /dev/null
@@ -1,47 +0,0 @@
-#   Copyright (c) 2007 Chris AtLee <chris@atlee.ca>
-#   This file is part of GlusterFS.
-#
-#   GlusterFS is free software; you can redistribute it and/or modify
-#   it under the terms of the GNU General Public License as published
-#   by the Free Software Foundation; either version 3 of the License,
-#   or (at your option) any later version.
-#
-#   GlusterFS is distributed in the hope that it will be useful, but
-#   WITHOUT ANY WARRANTY; without even the implied warranty of
-#   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
-#   General Public License for more details.
-#
-#   You should have received a copy of the GNU General Public License
-#   along with this program.  If not, see
-#   <http://www.gnu.org/licenses/>.
-from ctypes import *
-from glustertypes import *
-from glusterstack import *
-import sys
-import inspect
-
-libglusterfs = CDLL("libglusterfs.so")
-_gf_log = libglusterfs._gf_log
-_gf_log.restype = c_int32
-_gf_log.argtypes = [c_char_p, c_char_p, c_char_p, c_int32, c_int, c_char_p]
-
-gf_log_loglevel = c_int.in_dll(libglusterfs, "gf_log_loglevel")
-
-GF_LOG_NONE = 0
-GF_LOG_CRITICAL = 1
-GF_LOG_ERROR = 2
-GF_LOG_WARNING = 3
-GF_LOG_DEBUG = 4
-
-def gf_log(module, level, fmt, *params):
-    if level <= gf_log_loglevel:
-        frame = sys._getframe(1)
-        _gf_log(module, frame.f_code.co_filename, frame.f_code.co_name,
-                frame.f_lineno, level, fmt, *params)
-
-class ComplexTranslator(object):
-    def __init__(self, xlator):
-        self.xlator = xlator_t.from_address(xlator)
-
-    def __getattr__(self, item):
-        return getattr(self.xlator, item)
diff --git a/xlators/bindings/python/src/glusterstack.py b/xlators/bindings/python/src/glusterstack.py
deleted file mode 100644
index ba24c81652e..00000000000
--- a/xlators/bindings/python/src/glusterstack.py
+++ /dev/null
@@ -1,55 +0,0 @@
-#   Copyright (c) 2007 Chris AtLee <chris@atlee.ca>
-#   This file is part of GlusterFS.
-#
-#   GlusterFS is free software; you can redistribute it and/or modify
-#   it under the terms of the GNU General Public License as published
-#   by the Free Software Foundation; either version 3 of the License,
-#   or (at your option) any later version.
-#
-#   GlusterFS is distributed in the hope that it will be useful, but
-#   WITHOUT ANY WARRANTY; without even the implied warranty of
-#   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
-#   General Public License for more details.
-#
-#   You should have received a copy of the GNU General Public License
-#   along with this program.  If not, see
-#   <http://www.gnu.org/licenses/>.
-from ctypes import *
-from glustertypes import *
-
-libc = CDLL("libc.so.6")
-calloc = libc.calloc
-calloc.argtypes = [c_int, c_int]
-calloc.restype = c_void_p
-
-# TODO: Can these be done in C somehow?
-def stack_wind(frame, rfn, obj, fn, *params):
-    """Frame is a frame object"""
-    _new = cast(calloc(1, sizeof(call_frame_t)), POINTER(call_frame_t))
-    _new[0].root = frame.root
-    _new[0].next = frame.root[0].frames.next
-    _new[0].prev = pointer(frame.root[0].frames)
-    if frame.root[0].frames.next:
-        frame.root[0].frames.next[0].prev = _new
-    frame.root[0].frames.next = _new
-    _new[0].this = obj
-    # TODO: Type checking like tmp_cbk?
-    _new[0].ret = rfn
-    _new[0].parent = pointer(frame)
-    _new[0].cookie = cast(_new, c_void_p)
-    # TODO: Initialize lock
-    #_new.lock.init()
-    frame.ref_count += 1
-    fn(_new, obj, *params)
-
-def stack_unwind(frame, *params):
-    """Frame is a frame object"""
-    fn = frame[0].ret
-    parent = frame[0].parent[0]
-    parent.ref_count -= 1
-
-    op_ret = params[0]
-    op_err = params[1]
-    params = params[2:]
-    fn(parent, call_frame_t.from_address(frame[0].cookie), parent.this,
-            op_ret, op_err, *params)
diff --git a/xlators/bindings/python/src/glustertypes.py b/xlators/bindings/python/src/glustertypes.py
deleted file mode 100644
index e9069d07c72..00000000000
--- a/xlators/bindings/python/src/glustertypes.py
+++ /dev/null
@@ -1,167 +0,0 @@
-#   Copyright (c) 2007 Chris AtLee <chris@atlee.ca>
-#   This file is part of GlusterFS.
-#
-#   GlusterFS is free software; you can redistribute it and/or modify
-#   it under the terms of the GNU General Public License as published
-#   by the Free Software Foundation; either version 3 of the License,
-#   or (at your option) any later version.
-#
-#   GlusterFS is distributed in the hope that it will be useful, but
-#   WITHOUT ANY WARRANTY; without even the implied warranty of
-#   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
-#   General Public License for more details.
-#
-#   You should have received a copy of the GNU General Public License
-#   along with this program.  If not, see
-#   <http://www.gnu.org/licenses/>.
-from ctypes import *
-import collections
-
-#
-# Forward declaration of some gluster types
-#
-class call_frame_t(Structure):
-    pass
-
-class call_ctx_t(Structure):
-    pass
-
-class call_pool_t(Structure):
-    pass
-
-class xlator_t(Structure):
-    def _getFirstChild(self):
-        return self.children[0].xlator
-    firstChild = property(_getFirstChild)
-
-class xlator_list_t(Structure):
-    pass
-
-class xlator_fops(Structure):
-    pass
-
-class xlator_mops(Structure):
-    pass
-
-class glusterfs_ctx_t(Structure):
-    pass
-
-class list_head(Structure):
-    pass
-
-class dict_t(Structure):
-    pass
-
-class inode_table_t(Structure):
-    pass
-
-class fd_t(Structure):
-    pass
-
-class iovec(Structure):
-    _fields_ = [
-            ("iov_base", c_void_p),
-            ("iov_len", c_size_t),
-            ]
-
-    def __init__(self, s):
-        self.iov_base = cast(c_char_p(s), c_void_p)
-        self.iov_len = len(s)
-
-    def getBytes(self):
-        return string_at(self.iov_base, self.iov_len)
-
-# This is a pthread_spinlock_t
-# TODO: what happens to volatile-ness?
-gf_lock_t = c_int
-
-uid_t = c_uint32
-gid_t = c_uint32
-pid_t = c_int32
-
-off_t = c_int64
-
-#
-# Function pointer types
-#
-ret_fn_t = CFUNCTYPE(c_int32, POINTER(call_frame_t), POINTER(call_frame_t),
-                              POINTER(xlator_t), c_int32, c_int32)
-
-fini_fn_t = CFUNCTYPE(None, POINTER(xlator_t))
-init_fn_t = CFUNCTYPE(c_int32, POINTER(xlator_t))
-event_notify_fn_t = CFUNCTYPE(c_int32, POINTER(xlator_t), c_int32, c_void_p)
-
-list_head._fields_ = [
-        ("next", POINTER(list_head)),
-        ("prev", POINTER(list_head)),
-        ]
-
-call_frame_t._fields_ = [
-        ("root", POINTER(call_ctx_t)),
-        ("parent", POINTER(call_frame_t)),
-        ("next", POINTER(call_frame_t)),
-        ("prev", POINTER(call_frame_t)),
-        ("local", c_void_p),
-        ("this", POINTER(xlator_t)),
-        ("ret", ret_fn_t),
-        ("ref_count", c_int32),
-        ("lock", gf_lock_t),
-        ("cookie", c_void_p),
-        ("op", c_int32),
-        ("type", c_int8),
-        ]
-
-call_ctx_t._fields_ = [
-        ("all_frames", list_head),
-        ("trans", c_void_p),
-        ("pool", call_pool_t),
-        ("unique", c_uint64),
-        ("state", c_void_p),
-        ("uid", uid_t),
-        ("gid", gid_t),
-        ("pid", pid_t),
-        ("frames", call_frame_t),
-        ("req_refs", POINTER(dict_t)),
-        ("rsp_refs", POINTER(dict_t)),
-        ]
-
-xlator_t._fields_ = [
-        ("name", c_char_p),
-        ("type", c_char_p),
-        ("next", POINTER(xlator_t)),
-        ("prev", POINTER(xlator_t)),
-        ("parent", POINTER(xlator_t)),
-        ("children", POINTER(xlator_list_t)),
-        ("fops", POINTER(xlator_fops)),
-        ("mops", POINTER(xlator_mops)),
-        ("fini", fini_fn_t),
-        ("init", init_fn_t),
-        ("notify", event_notify_fn_t),
-        ("options", POINTER(dict_t)),
-        ("ctx", POINTER(glusterfs_ctx_t)),
-        ("itable", POINTER(inode_table_t)),
-        ("ready", c_char),
-        ("private", c_void_p),
-        ]
-
-xlator_list_t._fields_ = [
-        ("xlator", POINTER(xlator_t)),
-        ("next", POINTER(xlator_list_t)),
-        ]
-
-fop_functions = collections.defaultdict(lambda: c_void_p)
-fop_function_names = ['lookup', 'forget', 'stat', 'fstat', 'chmod', 'fchmod',
-        'chown', 'fchown', 'truncate', 'ftruncate', 'utimens', 'access',
-        'readlink', 'mknod', 'mkdir', 'unlink', 'rmdir', 'symlink',
-        'rename', 'link', 'create', 'open', 'readv', 'writev', 'flush',
-        'close', 'fsync', 'opendir', 'readdir', 'closedir', 'fsyncdir',
-        'statfs', 'setxattr', 'getxattr', 'removexattr', 'lk', 'writedir',
-        # TODO: Call backs?
-        ]
-
-fop_writev_t = CFUNCTYPE(c_int32, POINTER(call_frame_t), POINTER(xlator_t),
-                                  POINTER(fd_t), POINTER(iovec), c_int32,
-                                  off_t)
-
-fop_functions['writev'] = fop_writev_t
-xlator_fops._fields_ = [(f, fop_functions[f]) for f in fop_function_names]
diff --git a/xlators/bindings/python/src/python.c b/xlators/bindings/python/src/python.c
deleted file mode 100644
index 7f32d7f2997..00000000000
--- a/xlators/bindings/python/src/python.c
+++ /dev/null
@@ -1,235 +0,0 @@
-/*
-   Copyright (c) 2007-2009 Chris AtLee <chris@atlee.ca>
-   This file is part of GlusterFS.
-
-   GlusterFS is free software; you can redistribute it and/or modify
-   it under the terms of the GNU General Public License as published
-   by the Free Software Foundation; either version 3 of the License,
-   or (at your option) any later version.
-
-   GlusterFS is distributed in the hope that it will be useful, but
-   WITHOUT ANY WARRANTY; without even the implied warranty of
-   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
-   General Public License for more details.
-
-   You should have received a copy of the GNU General Public License
-   along with this program.  If not, see
-   <http://www.gnu.org/licenses/>.
-*/
-
-#include <Python.h>
-
-#ifndef _CONFIG_H
-#define _CONFIG_H
-#include "config.h"
-#endif
-
-#include "glusterfs.h"
-#include "xlator.h"
-#include "logging.h"
-#include "defaults.h"
-
-typedef struct
-{
-    char        *scriptname;
-    PyObject    *pXlator;
-    PyObject    *pScriptModule;
-    PyObject    *pGlusterModule;
-    PyThreadState *pInterp;
-
-    PyObject    *pFrameType, *pVectorType, *pFdType;
-} python_private_t;
-
-int32_t
-python_writev (call_frame_t *frame,
-              xlator_t *this,
-              fd_t *fd,
-              struct iovec *vector,
-              int32_t count, 
-              off_t offset)
-{
-  python_private_t *priv = (python_private_t *)this->private;
-  gf_log("python", GF_LOG_DEBUG, "In writev");
-  if (PyObject_HasAttrString(priv->pXlator, "writev"))
-  {
-
-      PyObject *retval = PyObject_CallMethod(priv->pXlator, "writev",
-              "O O O i l",
-              PyObject_CallMethod(priv->pFrameType, "from_address", "O&", PyLong_FromVoidPtr, frame),
-              PyObject_CallMethod(priv->pFdType, "from_address", "O&", PyLong_FromVoidPtr, fd),
-              PyObject_CallMethod(priv->pVectorType, "from_address", "O&", PyLong_FromVoidPtr, vector),
-              count,
-              offset);
-      if (PyErr_Occurred())
-      {
-          PyErr_Print();
-      }
-      Py_XDECREF(retval);
-  }
-  else
-  {
-      return default_writev(frame, this, fd, vector, count, offset);
-  }
-  return 0;
-}
-
-struct xlator_fops fops = {
-    .writev       = python_writev
-};
-
-struct xlator_mops mops = {
-};
-
-static PyObject *
-AnonModule_FromFile (const char* fname)
-{
-    // Get the builtins
-    PyThreadState* pThread = PyThreadState_Get();
-    PyObject *pBuiltins = pThread->interp->builtins;
-
-    if (PyErr_Occurred())
-    {
-        PyErr_Print();
-        return NULL;
-    }
-
-    // Create a new dictionary for running code in
-    PyObject *pModuleDict = PyDict_New();
-    PyDict_SetItemString(pModuleDict, "__builtins__", pBuiltins);
-    Py_INCREF(pBuiltins);
-
-    // Run the file in the new context
-    FILE* fp = fopen(fname, "r");
-    PyRun_File(fp, fname, Py_file_input, pModuleDict, pModuleDict);
-    fclose(fp);
-    if (PyErr_Occurred())
-    {
-        PyErr_Print();
-        Py_DECREF(pModuleDict);
-        Py_DECREF(pBuiltins);
-        return NULL;
-    }
-
-    // Create an object to hold the new context
-    PyRun_String("class ModuleWrapper(object):\n\tpass\n", Py_single_input, pModuleDict, pModuleDict);
-    if (PyErr_Occurred())
-    {
-        PyErr_Print();
-        Py_DECREF(pModuleDict);
-        Py_DECREF(pBuiltins);
-        return NULL;
-    }
-    PyObject *pModule = PyRun_String("ModuleWrapper()", Py_eval_input, pModuleDict, pModuleDict);
-    if (PyErr_Occurred())
-    {
-        PyErr_Print();
-        Py_DECREF(pModuleDict);
-        Py_DECREF(pBuiltins);
-        Py_XDECREF(pModule);
-        return NULL;
-    }
-
-    // Set the new context's dictionary to the one we used to run the code
-    // inside
-    PyObject_SetAttrString(pModule, "__dict__", pModuleDict);
-    if (PyErr_Occurred())
-    {
-        PyErr_Print();
-        Py_DECREF(pModuleDict);
-        Py_DECREF(pBuiltins);
-        Py_DECREF(pModule);
-        return NULL;
-    }
-
-    return pModule;
-}
-
-int32_t
-init (xlator_t *this)
-{
-  // This is ok to call more than once per process
-  Py_InitializeEx(0);
-
-  if (!this->children) {
-    gf_log ("python", GF_LOG_ERROR, 
-            "FATAL: python should have exactly one child");
-    return -1;
-  }
-
-  python_private_t *priv = CALLOC (sizeof (python_private_t), 1);
-  ERR_ABORT (priv);
-
-  data_t *scriptname = dict_get (this->options, "scriptname");
-  if (scriptname) {
-      priv->scriptname = data_to_str(scriptname);
-  } else {
-      gf_log("python", GF_LOG_ERROR,
-              "FATAL: python requires the scriptname parameter");
-      return -1;
-  }
-
-  priv->pInterp = Py_NewInterpreter();
-    
-  // Adjust python's path
-  PyObject *syspath = PySys_GetObject("path");
-  PyObject *path = PyString_FromString(GLUSTER_PYTHON_PATH);
-  PyList_Append(syspath, path);
-  Py_DECREF(path);
-
-  gf_log("python", GF_LOG_DEBUG,
-          "Loading gluster module");
-
-  priv->pGlusterModule = PyImport_ImportModule("gluster");
-  if (PyErr_Occurred())
-  {
-      PyErr_Print();
-      return -1;
-  }
-
-  priv->pFrameType = PyObject_GetAttrString(priv->pGlusterModule, "call_frame_t");
-  priv->pFdType = PyObject_GetAttrString(priv->pGlusterModule, "fd_t");
-  priv->pVectorType = PyObject_GetAttrString(priv->pGlusterModule, "iovec");
-
-  gf_log("python", GF_LOG_DEBUG, "Loading script...%s", priv->scriptname);
-  
-  priv->pScriptModule = AnonModule_FromFile(priv->scriptname);
-  if (!priv->pScriptModule || PyErr_Occurred())
-  {
-      gf_log("python", GF_LOG_ERROR, "Error loading %s", priv->scriptname);
-      PyErr_Print();
-      return -1;
-  }
-
-  if (!PyObject_HasAttrString(priv->pScriptModule, "xlator"))
-  {
-      gf_log("python", GF_LOG_ERROR, "%s does not have a xlator attribute", priv->scriptname);
-      return -1;
-  }
-  gf_log("python", GF_LOG_DEBUG, "Instantiating translator");
-  priv->pXlator = PyObject_CallMethod(priv->pScriptModule, "xlator", "O&",
-          PyLong_FromVoidPtr, this);
-  if (PyErr_Occurred() || !priv->pXlator)
-  {
-      PyErr_Print();
-      return -1;
-  }
-
-  this->private = priv;
-
-  gf_log ("python", GF_LOG_DEBUG, "python xlator loaded");
-  return 0;
-}
-
-void 
-fini (xlator_t *this)
-{
-  python_private_t *priv = (python_private_t*)(this->private);
-  Py_DECREF(priv->pXlator);
-  Py_DECREF(priv->pScriptModule);
-  Py_DECREF(priv->pGlusterModule);
-  Py_DECREF(priv->pFrameType);
-  Py_DECREF(priv->pFdType);
-  Py_DECREF(priv->pVectorType);
-  Py_EndInterpreter(priv->pInterp);
-  return;
-}
diff --git a/xlators/bindings/python/src/testxlator.py b/xlators/bindings/python/src/testxlator.py
deleted file mode 100644
index 507455c856a..00000000000
--- a/xlators/bindings/python/src/testxlator.py
+++ /dev/null
@@ -1,56 +0,0 @@
-#   Copyright (c) 2007 Chris AtLee <chris@atlee.ca>
-#   This file is part of GlusterFS.
-#
-#   GlusterFS is free software; you can redistribute it and/or modify
-#   it under the terms of the GNU General Public License as published
-#   by the Free Software Foundation; either version 3 of the License,
-#   or (at your option) any later version.
-#
-#   GlusterFS is distributed in the hope that it will be useful, but
-#   WITHOUT ANY WARRANTY; without even the implied warranty of
-#   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
-#   General Public License for more details.
-#
-#   You should have received a copy of the GNU General Public License
-#   along with this program.  If not, see
-#   <http://www.gnu.org/licenses/>.
-
-"""
-This is a test translator written in python.
-
-Important things to note:
-    This file must be import-able from glusterfsd.  This probably means
-    setting PYTHONPATH to where this file is located.
-
-    This file must have a top-level xlator class object that will be
-    used to instantiate individual translators.
-"""
-from gluster import *
-
-class MyXlator(ComplexTranslator):
-    name = "MyXlator"
-    def writev_cbk(self, frame, cookie, op_ret, op_errno, buf):
-        stack_unwind(frame, op_ret, op_errno, buf)
-        return 0
-
-    def writev(self, frame, fd, vector, count, offset):
-        gf_log(self.name, GF_LOG_WARNING, "writev %i bytes", vector.iov_len)
-        # TODO: Use cookie to pass this to writev_cbk
-        old_count = vector.iov_len
-
-        data = vector.getBytes().encode("zlib")
-
-        vector = iovec(data)
-        gf_log(self.name, GF_LOG_WARNING, "writev %i bytes", vector.iov_len)
-
-        @ret_fn_t
-        def rfn(frame, prev, this, op_ret, op_errno, *params):
-            if len(params) == 0:
-                params = [0]
-            return self.writev_cbk(frame, prev, old_count, op_errno, *params)
-
-        stack_wind(frame, rfn, self.firstChild,
-                self.firstChild[0].fops[0].writev, fd, vector, count, offset)
-        return 0
-
-xlator = MyXlator
diff --git a/xlators/cluster/Makefile.am b/xlators/cluster/Makefile.am
index a6ddb3564a9..8e067d5ab58 100644
--- a/xlators/cluster/Makefile.am
+++ b/xlators/cluster/Makefile.am
@@ -1,3 +1,3 @@
-SUBDIRS = unify stripe afr dht ha map
+SUBDIRS = afr dht ec
 
 CLEANFILES = 
diff --git a/xlators/cluster/afr/src/Makefile.am b/xlators/cluster/afr/src/Makefile.am
index 1bde9e5bad7..610819b28fc 100644
--- a/xlators/cluster/afr/src/Makefile.am
+++ b/xlators/cluster/afr/src/Makefile.am
@@ -1,20 +1,35 @@
 xlator_LTLIBRARIES = afr.la
 xlatordir = $(libdir)/glusterfs/$(PACKAGE_VERSION)/xlator/cluster
 
-afr_la_LDFLAGS = -module -avoidversion 
+afr_common_source = afr-dir-read.c afr-dir-write.c afr-inode-read.c \
+	afr-inode-write.c afr-open.c afr-transaction.c afr-lk-common.c \
+	afr-read-txn.c \
+	$(top_builddir)/xlators/lib/src/libxlator.c
 
-afr_la_SOURCES = afr.c afr-dir-read.c afr-dir-write.c afr-inode-read.c afr-inode-write.c afr-transaction.c afr-self-heal-data.c afr-self-heal-common.c afr-self-heal-metadata.c afr-self-heal-entry.c
+AFR_SELFHEAL_SOURCES = afr-self-heal-common.c afr-self-heal-data.c \
+	afr-self-heal-entry.c afr-self-heal-metadata.c afr-self-heald.c \
+	afr-self-heal-name.c
+
+afr_la_LDFLAGS = -module $(GF_XLATOR_DEFAULT_LDFLAGS)
+afr_la_SOURCES = $(afr_common_source) $(AFR_SELFHEAL_SOURCES) afr.c
 afr_la_LIBADD = $(top_builddir)/libglusterfs/src/libglusterfs.la
 
-noinst_HEADERS = afr.h afr-transaction.h afr-inode-write.h afr-inode-read.h afr-dir-read.h afr-dir-write.h afr-self-heal.h afr-self-heal-common.h
+noinst_HEADERS = afr.h afr-transaction.h afr-inode-write.h afr-inode-read.h \
+	afr-dir-read.h afr-dir-write.h afr-self-heal.h afr-mem-types.h \
+	afr-common.c afr-self-heald.h \
+	$(top_builddir)/xlators/lib/src/libxlator.h afr-messages.h
+
+AM_CPPFLAGS = $(GF_CPPFLAGS) \
+	-I$(top_srcdir)/libglusterfs/src -I$(top_srcdir)/xlators/lib/src \
+	-I$(top_srcdir)/rpc/rpc-lib/src \
+	-I$(top_srcdir)/rpc/xdr/src -I$(top_builddir)/rpc/xdr/src
 
-AM_CFLAGS = -fPIC -D_FILE_OFFSET_BITS=64 -D_GNU_SOURCE -Wall -D$(GF_HOST_OS) \
-	    -I$(top_srcdir)/libglusterfs/src -shared -nostartfiles $(GF_CFLAGS)
+AM_CFLAGS = -Wall $(GF_CFLAGS)
 
-CLEANFILES = 
+CLEANFILES =
 
 uninstall-local:
 	rm -f $(DESTDIR)$(xlatordir)/replicate.so
 
 install-data-hook:
-	ln -sf afr.so $(DESTDIR)$(xlatordir)/replicate.so
-\ No newline at end of file
+	ln -sf afr.so $(DESTDIR)$(xlatordir)/replicate.so
diff --git a/xlators/cluster/afr/src/afr-common.c b/xlators/cluster/afr/src/afr-common.c
new file mode 100644
index 00000000000..032ab5c8001
--- /dev/null
+++ b/xlators/cluster/afr/src/afr-common.c
@@ -0,0 +1,7878 @@
+/*
+  Copyright (c) 2008-2012 Red Hat, Inc. <http://www.redhat.com>
+  This file is part of GlusterFS.
+
+  This file is licensed to you under your choice of the GNU Lesser
+  General Public License, version 3 or any later version (LGPLv3 or
+  later), or the GNU General Public License, version 2 (GPLv2), in all
+  cases as published by the Free Software Foundation.
+*/
+
+#include <libgen.h>
+#include <unistd.h>
+#include <fnmatch.h>
+#include <sys/time.h>
+#include <stdlib.h>
+#include <signal.h>
+
+#include <glusterfs/glusterfs.h>
+#include "afr.h"
+#include <glusterfs/dict.h>
+#include <glusterfs/hashfn.h>
+#include <glusterfs/list.h>
+#include <glusterfs/call-stub.h>
+#include <glusterfs/defaults.h>
+#include <glusterfs/common-utils.h>
+#include <glusterfs/compat-errno.h>
+#include <glusterfs/compat.h>
+#include <glusterfs/byte-order.h>
+#include <glusterfs/statedump.h>
+#include <glusterfs/events.h>
+#include <glusterfs/upcall-utils.h>
+
+#include "afr-inode-read.h"
+#include "afr-inode-write.h"
+#include "afr-dir-read.h"
+#include "afr-dir-write.h"
+#include "afr-transaction.h"
+#include "afr-self-heal.h"
+#include "afr-self-heald.h"
+#include "afr-messages.h"
+
+int32_t
+afr_quorum_errno(afr_private_t *priv)
+{
+    return ENOTCONN; /* errno used for loss of quorum; priv is not consulted */
+}
+
+gf_boolean_t
+afr_is_private_directory(afr_private_t *priv, uuid_t pargfid, const char *name,
+                         pid_t pid)
+{
+    if (!__is_root_gfid(pargfid)) { /* only entries under the root gfid qualify */
+        return _gf_false;
+    }
+
+    if (strcmp(name, GF_REPLICATE_TRASH_DIR) == 0) {
+        /* For backward compatibility, /.landfill is private. */
+        return _gf_true;
+    }
+
+    if (pid == GF_CLIENT_PID_GSYNCD) {
+        /* geo-rep needs to create/sync the private directory on the slave
+         * because it appears in the changelog. */
+        return _gf_false;
+    }
+
+    if (pid == GF_CLIENT_PID_GLFS_HEAL || pid == GF_CLIENT_PID_SELF_HEALD) {
+        if (strcmp(name, priv->anon_inode_name) == 0) {
+            /* The anonymous-inode dir itself is private. */
+            return _gf_true;
+        }
+    } else {
+        if (strncmp(name, AFR_ANON_DIR_PREFIX, strlen(AFR_ANON_DIR_PREFIX)) ==
+            0) {
+            /* anonymous-inode dir prefix is private for geo-rep to work. */
+            return _gf_true;
+        }
+    }
+
+    return _gf_false;
+}
+
+/*
+ * Set replies[i] to 1 if child i's reply is valid with op_ret == 0, else 0.
+ */
+void
+afr_fill_success_replies(afr_local_t *local, afr_private_t *priv,
+                         unsigned char *replies)
+{
+    int child = 0;
+
+    for (child = 0; child < priv->child_count; child++) {
+        replies[child] = (local->replies[child].valid &&
+                          local->replies[child].op_ret == 0);
+    }
+}
+
+int
+afr_fav_child_reset_sink_xattrs(void *opaque);
+
+int
+afr_fav_child_reset_sink_xattrs_cbk(int ret, call_frame_t *frame, void *opaque);
+
+static void
+afr_discover_done(call_frame_t *frame, xlator_t *this);
+
+/* finodelk callback: record child @cookie's result and wake the barrier. */
+int
+afr_dom_lock_acquire_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+                         int op_ret, int op_errno, dict_t *xdata)
+{
+    afr_private_t *priv = this->private;
+    afr_local_t *local = frame->local;
+    int child = (long)cookie;
+
+    local->cont.lk.dom_lock_op_errno[child] = op_errno;
+    local->cont.lk.dom_lock_op_ret[child] = op_ret;
+    if (op_ret >= 0) {
+        local->cont.lk.dom_locked_nodes[child] = 1;
+    } else {
+        gf_msg(this->name, GF_LOG_ERROR, op_errno, AFR_MSG_LK_HEAL_DOM,
+               "%s: Failed to acquire %s on %s",
+               uuid_utoa(local->fd->inode->gfid), AFR_LK_HEAL_DOM,
+               priv->children[child]->name);
+    }
+
+    syncbarrier_wake(&local->barrier);
+    return 0;
+}
+
+/* Take AFR_LK_HEAL_DOM finodelks via AFR_ONALL: F_SETLK first, then blocking
+ * F_SETLKW if quorum is missed or a child returned EAGAIN. 0 or -errno. */
+int
+afr_dom_lock_acquire(call_frame_t *frame)
+{
+    afr_local_t *local = frame->local;
+    afr_private_t *priv = frame->this->private;
+    struct gf_flock flock = {
+        0,
+    };
+    int i = 0;
+
+    local->cont.lk.dom_locked_nodes = GF_CALLOC(
+        priv->child_count, sizeof(*local->cont.lk.dom_locked_nodes),
+        gf_afr_mt_char);
+    if (!local->cont.lk.dom_locked_nodes) {
+        return -ENOMEM;
+    }
+    local->cont.lk.dom_lock_op_ret = GF_CALLOC(
+        priv->child_count, sizeof(*local->cont.lk.dom_lock_op_ret),
+        gf_afr_mt_int32_t);
+    if (!local->cont.lk.dom_lock_op_ret) {
+        return -ENOMEM; /* CALLOC'd members are freed in afr_local_cleanup. */
+    }
+    local->cont.lk.dom_lock_op_errno = GF_CALLOC(
+        priv->child_count, sizeof(*local->cont.lk.dom_lock_op_errno),
+        gf_afr_mt_int32_t);
+    if (!local->cont.lk.dom_lock_op_errno) {
+        return -ENOMEM; /* CALLOC'd members are freed in afr_local_cleanup. */
+    }
+    flock.l_type = F_WRLCK;
+
+    AFR_ONALL(frame, afr_dom_lock_acquire_cbk, finodelk, AFR_LK_HEAL_DOM,
+              local->fd, F_SETLK, &flock, NULL);
+
+    if (!afr_has_quorum(local->cont.lk.dom_locked_nodes, frame->this, NULL))
+        goto blocking_lock;
+
+    /*If any of the bricks returned EAGAIN, we still need blocking locks.*/
+    if (AFR_COUNT(local->cont.lk.dom_locked_nodes, priv->child_count) !=
+        priv->child_count) {
+        for (i = 0; i < priv->child_count; i++) {
+            if (local->cont.lk.dom_lock_op_ret[i] == -1 &&
+                local->cont.lk.dom_lock_op_errno[i] == EAGAIN)
+                goto blocking_lock;
+        }
+    }
+
+    return 0;
+
+blocking_lock:
+    afr_dom_lock_release(frame);
+    AFR_ONALL(frame, afr_dom_lock_acquire_cbk, finodelk, AFR_LK_HEAL_DOM,
+              local->fd, F_SETLKW, &flock, NULL);
+    if (!afr_has_quorum(local->cont.lk.dom_locked_nodes, frame->this, NULL)) {
+        afr_dom_lock_release(frame);
+        return -afr_quorum_errno(priv);
+    }
+
+    return 0;
+}
+
+/* Unlock callback: clear the child's dom-lock flag and wake the barrier. */
+int
+afr_dom_lock_release_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+                         int op_ret, int op_errno, dict_t *xdata)
+{
+    afr_local_t *local = frame->local;
+    afr_private_t *priv = this->private;
+    int i = (long)cookie;
+
+    if (op_ret < 0) { /* fd-based unlock: identify the file by its gfid */
+        gf_msg(this->name, GF_LOG_ERROR, op_errno, AFR_MSG_LK_HEAL_DOM,
+               "%s: Failed to release %s on %s",
+               uuid_utoa(local->fd->inode->gfid), AFR_LK_HEAL_DOM,
+               priv->children[i]->name);
+    }
+    local->cont.lk.dom_locked_nodes[i] = 0; /* cleared even if unlock failed */
+    syncbarrier_wake(&local->barrier);
+    return 0;
+}
+
+/*
+ * Release the AFR_LK_HEAL_DOM finodelk on every child flagged in
+ * local->cont.lk.dom_locked_nodes by winding F_UNLCK through AFR_ONLIST.
+ * Does nothing when no child is flagged.
+ */
+void
+afr_dom_lock_release(call_frame_t *frame)
+{
+    afr_local_t *local = frame->local;
+    afr_private_t *priv = frame->this->private;
+    unsigned char *held = local->cont.lk.dom_locked_nodes;
+    struct gf_flock flock = {
+        0,
+    };
+
+    if (AFR_COUNT(held, priv->child_count) == 0)
+        return;
+
+    flock.l_type = F_UNLCK;
+    AFR_ONLIST(held, frame, afr_dom_lock_release_cbk, finodelk,
+               AFR_LK_HEAL_DOM, local->fd, F_SETLK, &flock, NULL);
+}
+
+static void
+afr_lk_heal_info_cleanup(afr_lk_heal_info_t *info)
+{
+    if (!info) /* frees info and what it holds; NULL is a no-op */
+        return;
+    if (info->xdata_req) /* drop the dict reference, if one is held */
+        dict_unref(info->xdata_req);
+    if (info->fd) /* drop the fd reference, if one is held */
+        fd_unref(info->fd);
+    GF_FREE(info->locked_nodes); /* member arrays first, then info itself */
+    GF_FREE(info->child_up_event_gen);
+    GF_FREE(info->child_down_event_gen);
+    GF_FREE(info);
+}
+
+/*
+ * Snapshot the lock described by local->cont.lk into a new
+ * afr_lk_heal_info_t, attach it to the fd's afr context and queue it on
+ * priv->saved_locks. Returns 0 on success or -ENOMEM on any failure.
+ */
+static int
+afr_add_lock_to_saved_locks(call_frame_t *frame, xlator_t *this)
+{
+    afr_private_t *priv = this->private;
+    afr_local_t *local = frame->local;
+    afr_lk_heal_info_t *info = NULL;
+    afr_fd_ctx_t *fd_ctx = NULL;
+    int ret = -ENOMEM;
+
+    info = GF_CALLOC(sizeof(*info), 1, gf_afr_mt_lk_heal_info_t);
+    if (!info) {
+        goto cleanup;
+    }
+    INIT_LIST_HEAD(&info->pos);
+    info->fd = fd_ref(local->fd);
+    info->cmd = local->cont.lk.cmd;
+    info->pid = frame->root->pid;
+    info->flock = local->cont.lk.user_flock;
+    info->xdata_req = dict_copy_with_ref(local->xdata_req, NULL);
+    if (!info->xdata_req) {
+        goto cleanup;
+    }
+    info->lk_owner = frame->root->lk_owner;
+    /* Copy the per-child locked_nodes flags recorded for this lk request. */
+    info->locked_nodes = GF_MALLOC(
+        sizeof(*info->locked_nodes) * priv->child_count, gf_afr_mt_char);
+    if (!info->locked_nodes) {
+        goto cleanup;
+    }
+    memcpy(info->locked_nodes, local->cont.lk.locked_nodes,
+           sizeof(*info->locked_nodes) * priv->child_count);
+    info->child_up_event_gen = GF_CALLOC(sizeof(*info->child_up_event_gen),
+                                         priv->child_count, gf_afr_mt_int32_t);
+    if (!info->child_up_event_gen) {
+        goto cleanup;
+    }
+    info->child_down_event_gen = GF_CALLOC(sizeof(*info->child_down_event_gen),
+                                           priv->child_count,
+                                           gf_afr_mt_int32_t);
+    if (!info->child_down_event_gen) {
+        goto cleanup;
+    }
+
+    /* Attach info to the fd's afr context while holding the fd lock. */
+    LOCK(&local->fd->lock);
+    {
+        fd_ctx = __afr_fd_ctx_get(local->fd, this);
+        if (fd_ctx)
+            fd_ctx->lk_heal_info = info;
+    }
+    UNLOCK(&local->fd->lock);
+    if (!fd_ctx) {
+        goto cleanup;
+    }
+
+    /* Queue info on priv->saved_locks while holding priv->lock. */
+    LOCK(&priv->lock);
+    {
+        list_add_tail(&info->pos, &priv->saved_locks);
+    }
+    UNLOCK(&priv->lock);
+
+    return 0;
+cleanup:
+    gf_msg(this->name, GF_LOG_ERROR, -ret, AFR_MSG_LK_HEAL_DOM,
+           "%s: Failed to add lock to healq",
+           uuid_utoa(local->fd->inode->gfid));
+    /* fd_ctx is always NULL here, so info was never attached to the fd. */
+    afr_lk_heal_info_cleanup(info); /* accepts NULL and partially built info */
+    return ret;
+}
+
+/*
+ * Forget the lock saved for local->fd when the request in
+ * local->cont.lk.user_flock describes the same range (l_start, l_whence and
+ * l_len all equal). The saved entry is unlinked under priv->lock, released
+ * and detached from the fd context.
+ *
+ * Returns 0 when an entry was removed, -EINVAL (after logging) otherwise.
+ */
+static int
+afr_remove_lock_from_saved_locks(afr_local_t *local, xlator_t *this)
+{
+    afr_private_t *priv = this->private;
+    struct gf_flock flock = local->cont.lk.user_flock;
+    afr_lk_heal_info_t *saved = NULL;
+    afr_fd_ctx_t *fd_ctx = NULL;
+    int ret = -EINVAL;
+
+    fd_ctx = afr_fd_ctx_get(local->fd, this);
+    if (fd_ctx && fd_ctx->lk_heal_info)
+        saved = fd_ctx->lk_heal_info;
+
+    /*TODO: Compare lkowners too.*/
+    if (saved && (saved->flock.l_start == flock.l_start) &&
+        (saved->flock.l_whence == flock.l_whence) &&
+        (saved->flock.l_len == flock.l_len)) {
+        LOCK(&priv->lock);
+        {
+            list_del(&fd_ctx->lk_heal_info->pos);
+        }
+        UNLOCK(&priv->lock);
+
+        afr_lk_heal_info_cleanup(saved);
+        fd_ctx->lk_heal_info = NULL;
+        ret = 0;
+    }
+
+    if (ret)
+        gf_msg(this->name, GF_LOG_ERROR, -ret, AFR_MSG_LK_HEAL_DOM,
+               "%s: Failed to remove lock from healq",
+               uuid_utoa(local->fd->inode->gfid));
+    return ret;
+}
+
+/*
+ * Reply handler for the lk fop wound during lock heal. The child index
+ * travels in cookie; the outcome is recorded in local->replies[] for that
+ * child, a failure is logged, and local->barrier is woken.
+ */
+int
+afr_lock_heal_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+                  int32_t op_ret, int32_t op_errno, struct gf_flock *lock,
+                  dict_t *xdata)
+{
+    afr_local_t *local = frame->local;
+    int child = (long)cookie;
+
+    local->replies[child].valid = 1;
+    local->replies[child].op_ret = op_ret;
+    local->replies[child].op_errno = op_errno;
+
+    if (op_ret != 0) {
+        gf_msg(this->name, GF_LOG_ERROR, op_errno, AFR_MSG_LK_HEAL_DOM,
+               "Failed to heal lock on child %d for %s", child,
+               uuid_utoa(local->fd->inode->gfid));
+    }
+
+    syncbarrier_wake(&local->barrier);
+    return 0;
+}
+
+/*
+ * Reply handler for the F_GETLK probe. Records the outcome in
+ * local->replies[] for the child index carried in cookie; on success the
+ * returned flock is copied into local->cont.lk.getlk_rsp[] for that child,
+ * on failure the error is logged. Always wakes local->barrier.
+ */
+int
+afr_getlk_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret,
+              int32_t op_errno, struct gf_flock *lock, dict_t *xdata)
+{
+    afr_local_t *local = frame->local;
+    int child = (long)cookie;
+
+    local->replies[child].valid = 1;
+    local->replies[child].op_ret = op_ret;
+    local->replies[child].op_errno = op_errno;
+
+    if (op_ret == 0) {
+        local->cont.lk.getlk_rsp[child] = *lock;
+    } else {
+        gf_msg(this->name, GF_LOG_ERROR, op_errno, AFR_MSG_LK_HEAL_DOM,
+               "Failed getlk for %s", uuid_utoa(local->fd->inode->gfid));
+    }
+
+    syncbarrier_wake(&local->barrier);
+    return 0;
+}
+
+/*
+ * Check whether the lock described by info is still held by info->lk_owner
+ * on the children that are recorded as holding it (info->locked_nodes).
+ *
+ * An F_GETLK is wound to those children. The result is _gf_false when the
+ * reply buffer cannot be allocated, when no child replied successfully, or
+ * when a successful reply reports a lock (l_type != F_UNLCK) whose owner
+ * differs from info->lk_owner. Otherwise the result is _gf_true.
+ *
+ * local->replies and local->cont.lk.getlk_rsp are released before returning.
+ */
+static gf_boolean_t
+afr_does_lk_owner_match(call_frame_t *frame, afr_private_t *priv,
+                        afr_lk_heal_info_t *info)
+{
+    int i = 0;
+    afr_local_t *local = frame->local;
+    struct gf_flock flock = {
+        0,
+    };
+    gf_boolean_t ret = _gf_true;
+    char *wind_on = alloca0(priv->child_count);
+    unsigned char *success_replies = alloca0(priv->child_count);
+
+    local->cont.lk.getlk_rsp = GF_CALLOC(sizeof(*local->cont.lk.getlk_rsp),
+                                         priv->child_count, gf_afr_mt_gf_lock);
+    if (!local->cont.lk.getlk_rsp) {
+        /* afr_getlk_cbk stores every successful reply in getlk_rsp[]; winding
+         * without the buffer would dereference NULL. Ownership cannot be
+         * verified, so report a mismatch. */
+        return _gf_false;
+    }
+
+    flock = info->flock;
+    for (i = 0; i < priv->child_count; i++) {
+        if (info->locked_nodes[i])
+            wind_on[i] = 1;
+    }
+
+    AFR_ONLIST(wind_on, frame, afr_getlk_cbk, lk, info->fd, F_GETLK, &flock,
+               info->xdata_req);
+
+    afr_fill_success_replies(local, priv, success_replies);
+    if (AFR_COUNT(success_replies, priv->child_count) == 0) {
+        ret = _gf_false;
+        goto out;
+    }
+
+    for (i = 0; i < priv->child_count; i++) {
+        if (!local->replies[i].valid || local->replies[i].op_ret != 0)
+            continue;
+        if (local->cont.lk.getlk_rsp[i].l_type == F_UNLCK)
+            continue;
+        /*TODO: Do we really need to compare lkowner if F_UNLCK is true?*/
+        if (!is_same_lkowner(&local->cont.lk.getlk_rsp[i].l_owner,
+                             &info->lk_owner)) {
+            ret = _gf_false;
+            break;
+        }
+    }
+out:
+    afr_local_replies_wipe(local, priv);
+    GF_FREE(local->cont.lk.getlk_rsp);
+    local->cont.lk.getlk_rsp = NULL;
+    return ret;
+}
+
+/*
+ * Flag fd as bad in its AFR fd context and detach any saved lock-heal info
+ * from it, under fd->lock. A NULL fd or a missing context is a no-op.
+ */
+static void
+afr_mark_fd_bad(fd_t *fd, xlator_t *this)
+{
+    afr_fd_ctx_t *ctx = NULL;
+
+    if (fd) {
+        LOCK(&fd->lock);
+        {
+            ctx = __afr_fd_ctx_get(fd, this);
+            if (ctx) {
+                ctx->is_fd_bad = _gf_true;
+                ctx->lk_heal_info = NULL;
+            }
+        }
+        UNLOCK(&fd->lock);
+    }
+}
+
+/*
+ * Move info from whatever list it is currently linked on to the tail of
+ * priv->lk_healq, holding priv->lock for the unlink and relink.
+ */
+static void
+afr_add_lock_to_lkhealq(afr_private_t *priv, afr_lk_heal_info_t *info)
+{
+    LOCK(&priv->lock);
+    {
+        list_del(&info->pos);
+        list_add_tail(&info->pos, &priv->lk_healq);
+    }
+    UNLOCK(&priv->lock);
+}
+
+/*
+ * Attempt to re-acquire the lock described by info on every child that is
+ * not marked in info->locked_nodes.
+ *
+ * The frame takes on the pid and lk-owner saved in info, the heal domain
+ * lock is acquired, and ownership on the already-locked children is
+ * verified with afr_does_lk_owner_match(). The lk fop is then wound to the
+ * remaining children. A child is marked healed only if its child-up event
+ * generation did not change while the fop was in flight and is newer than
+ * its last child-down generation.
+ *
+ * If no child was healed (including the early-exit paths), info is queued
+ * back on priv->lk_healq for a later attempt; otherwise it is moved to
+ * priv->saved_locks.
+ */
+static void
+afr_lock_heal_do(call_frame_t *frame, afr_private_t *priv,
+                 afr_lk_heal_info_t *info)
+{
+    int i = 0;
+    int op_errno = 0;
+    int32_t *current_event_gen = NULL;
+    afr_local_t *local = frame->local;
+    xlator_t *this = frame->this;
+    char *wind_on = alloca0(priv->child_count);
+    gf_boolean_t retry = _gf_true;
+
+    frame->root->pid = info->pid;
+    lk_owner_copy(&frame->root->lk_owner, &info->lk_owner);
+
+    op_errno = -afr_dom_lock_acquire(frame);
+    if ((op_errno != 0)) {
+        goto release;
+    }
+
+    if (!afr_does_lk_owner_match(frame, priv, info)) {
+        gf_msg(this->name, GF_LOG_WARNING, 0, AFR_MSG_LK_HEAL_DOM,
+               "Ignoring lock heal for %s since lk-onwers mismatch. "
+               "Lock possibly pre-empted by another client.",
+               uuid_utoa(info->fd->inode->gfid));
+        goto release;
+    }
+
+    for (i = 0; i < priv->child_count; i++) {
+        if (info->locked_nodes[i])
+            continue;
+        wind_on[i] = 1;
+    }
+
+    /* Snapshot the child-up generations before winding. The buffer must be
+     * sized in int32_t elements: the memcpy below copies
+     * child_count * sizeof(int32_t) bytes, so a child_count-byte alloca
+     * would be overrun. */
+    current_event_gen = alloca(priv->child_count *
+                               sizeof(*current_event_gen));
+    memcpy(current_event_gen, info->child_up_event_gen,
+           priv->child_count * sizeof *current_event_gen);
+    AFR_ONLIST(wind_on, frame, afr_lock_heal_cbk, lk, info->fd, info->cmd,
+               &info->flock, info->xdata_req);
+
+    LOCK(&priv->lock);
+    {
+        for (i = 0; i < priv->child_count; i++) {
+            if (!wind_on[i])
+                continue;
+            if ((!local->replies[i].valid) || (local->replies[i].op_ret != 0)) {
+                continue;
+            }
+
+            if ((current_event_gen[i] == info->child_up_event_gen[i]) &&
+                (current_event_gen[i] > info->child_down_event_gen[i])) {
+                info->locked_nodes[i] = 1;
+                retry = _gf_false;
+                list_del_init(&info->pos);
+                list_add_tail(&info->pos, &priv->saved_locks);
+            } else {
+                /*We received subsequent child up/down events while heal was in
+                 * progress; don't mark child as healed. Attempt again on the
+                 * new child up*/
+                gf_msg(this->name, GF_LOG_ERROR, 0, AFR_MSG_LK_HEAL_DOM,
+                       "Event gen mismatch: skipped healing lock on child %d "
+                       "for %s.",
+                       i, uuid_utoa(info->fd->inode->gfid));
+            }
+        }
+    }
+    UNLOCK(&priv->lock);
+
+release:
+    afr_dom_lock_release(frame);
+    if (retry)
+        afr_add_lock_to_lkhealq(priv, info);
+    return;
+}
+
+/*
+ * Completion callback handed to synctask_new() for the lock-heal task:
+ * destroys the call stack of the frame it is given. ret and opaque are not
+ * used.
+ */
+static int
+afr_lock_heal_done(int ret, call_frame_t *frame, void *opaque)
+{
+    STACK_DESTROY(frame->root);
+    return 0;
+}
+
+/*
+ * Synctask body that drains priv->lk_healq.
+ *
+ * The queue is detached onto a local list under priv->lock and each entry
+ * is handed to afr_lock_heal_do() on a private copy of the frame, which is
+ * reset between entries. If the reset leaves the frame without a local,
+ * processing is aborted and the entries not yet visited are spliced back
+ * onto priv->lk_healq.
+ *
+ * Returns 0 on completion, ENOMEM if the frame copy fails, ENOTCONN if the
+ * run was aborted.
+ */
+static int
+afr_lock_heal(void *opaque)
+{
+    call_frame_t *frame = (call_frame_t *)opaque;
+    call_frame_t *iter_frame = NULL;
+    xlator_t *this = frame->this;
+    afr_private_t *priv = this->private;
+    afr_lk_heal_info_t *info = NULL;
+    afr_lk_heal_info_t *tmp = NULL;
+    struct list_head healq = {
+        0,
+    };
+    int ret = 0;
+
+    iter_frame = afr_copy_frame(frame);
+    if (!iter_frame) {
+        return ENOMEM;
+    }
+
+    INIT_LIST_HEAD(&healq);
+    LOCK(&priv->lock);
+    {
+        list_splice_init(&priv->lk_healq, &healq);
+    }
+    UNLOCK(&priv->lock);
+
+    list_for_each_entry_safe(info, tmp, &healq, pos)
+    {
+        GF_ASSERT((AFR_COUNT(info->locked_nodes, priv->child_count) <
+                   priv->child_count));
+        ((afr_local_t *)(iter_frame->local))->fd = fd_ref(info->fd);
+        afr_lock_heal_do(iter_frame, priv, info);
+        AFR_STACK_RESET(iter_frame);
+        if (iter_frame->local == NULL) {
+            ret = ENOTCONN;
+            gf_msg(frame->this->name, GF_LOG_ERROR, ENOTCONN,
+                   AFR_MSG_LK_HEAL_DOM,
+                   "Aborting processing of lk_healq."
+                   "Healing will be reattempted on next child up for locks "
+                   "that are still in quorum.");
+            LOCK(&priv->lock);
+            {
+                /* healq is a list head living on this stack frame. Its
+                 * remaining entries must be moved back, not the head itself
+                 * linked in: linking the head would leave a dangling stack
+                 * pointer on priv->lk_healq and orphan the entries. */
+                list_splice_init(&healq, &priv->lk_healq);
+            }
+            UNLOCK(&priv->lock);
+            break;
+        }
+    }
+
+    AFR_STACK_DESTROY(iter_frame);
+    return ret;
+}
+
+/*
+ * On a child-up for `child`: stamp every saved lock with the current event
+ * generation for that child, move it to priv->lk_healq, and launch the
+ * lock-heal synctask. Does nothing in the self-heal daemon.
+ *
+ * Returns 0 on success (or when running as shd), -1 if no frame could be
+ * created, otherwise the non-zero result of synctask_new().
+ * NOTE(review): the double-underscore name suggests the caller holds
+ * priv->lock; confirm against the call site.
+ */
+static int
+__afr_lock_heal_synctask(xlator_t *this, afr_private_t *priv, int child)
+{
+    afr_lk_heal_info_t *entry = NULL;
+    afr_lk_heal_info_t *next = NULL;
+    call_frame_t *heal_frame = NULL;
+    int ret = 0;
+
+    if (priv->shd.iamshd)
+        return 0;
+
+    list_for_each_entry_safe(entry, next, &priv->saved_locks, pos)
+    {
+        entry->child_up_event_gen[child] = priv->event_generation;
+        list_del_init(&entry->pos);
+        list_add_tail(&entry->pos, &priv->lk_healq);
+    }
+
+    heal_frame = create_frame(this, this->ctx->pool);
+    if (!heal_frame)
+        return -1;
+
+    ret = synctask_new(this->ctx->env, afr_lock_heal, afr_lock_heal_done,
+                       heal_frame, heal_frame);
+    if (ret)
+        gf_msg(this->name, GF_LOG_ERROR, ENOMEM, AFR_MSG_LK_HEAL_DOM,
+               "Failed to launch lock heal synctask");
+
+    return ret;
+}
+
+/*
+ * On a child-down for `child`: record the current event generation as that
+ * child's down generation on every saved lock and clear its locked flag.
+ * Locks that no longer have quorum are abandoned: the fd is marked bad and
+ * the entry is unlinked and released. Does nothing in the self-heal daemon.
+ * Always returns 0.
+ */
+static int
+__afr_mark_pending_lk_heal(xlator_t *this, afr_private_t *priv, int child)
+{
+    afr_lk_heal_info_t *entry = NULL;
+    afr_lk_heal_info_t *next = NULL;
+
+    if (priv->shd.iamshd)
+        return 0;
+
+    list_for_each_entry_safe(entry, next, &priv->saved_locks, pos)
+    {
+        entry->child_down_event_gen[child] = priv->event_generation;
+        if (entry->locked_nodes[child] == 1)
+            entry->locked_nodes[child] = 0;
+
+        if (afr_has_quorum(entry->locked_nodes, this, NULL))
+            continue;
+
+        /* Since the lock was lost on quorum no. of nodes, we should
+         * not attempt to heal it anymore. Some other client could have
+         * acquired the lock, modified data and released it and this
+         * client wouldn't know about it if we heal it.*/
+        afr_mark_fd_bad(entry->fd, this);
+        list_del(&entry->pos);
+        afr_lk_heal_info_cleanup(entry);
+        /* We're not winding an unlock on the node where the lock is still
+         * present because when fencing logic switches over to the new
+         * client (since we marked the fd bad), it should preempt any
+         * existing lock. */
+    }
+    return 0;
+}
+
+/*
+ * When priv->consistent_io is enabled, I/O is allowed only if
+ * local->call_count equals priv->child_count. On refusal a message is
+ * logged and *op_errno (if non-NULL) is set to ENOTCONN.
+ */
+gf_boolean_t
+afr_is_consistent_io_possible(afr_local_t *local, afr_private_t *priv,
+                              int32_t *op_errno)
+{
+    if (!priv->consistent_io || local->call_count == priv->child_count)
+        return _gf_true;
+
+    gf_msg(THIS->name, GF_LOG_INFO, 0, AFR_MSG_SUBVOLS_DOWN,
+           "All subvolumes are not up");
+    if (op_errno)
+        *op_errno = ENOTCONN;
+
+    return _gf_false;
+}
+
+/*
+ * Report whether xdata carries GF_LOCK_MODE set to GF_LK_MANDATORY. A
+ * failed lookup of the key yields _gf_false.
+ */
+gf_boolean_t
+afr_is_lock_mode_mandatory(dict_t *xdata)
+{
+    uint32_t lk_mode = GF_LK_ADVISORY;
+
+    if (dict_get_uint32(xdata, GF_LOCK_MODE, &lk_mode) != 0)
+        return _gf_false;
+
+    return (lk_mode == GF_LK_MANDATORY) ? _gf_true : _gf_false;
+}
+
+/*
+ * Copy base and initialise an AFR local on the copy. Returns the new frame,
+ * or NULL if either the copy or the local initialisation fails (the copy is
+ * destroyed in the latter case).
+ */
+call_frame_t *
+afr_copy_frame(call_frame_t *base)
+{
+    call_frame_t *copy = NULL;
+    afr_local_t *local = NULL;
+    int op_errno = 0;
+
+    copy = copy_frame(base);
+    if (copy) {
+        local = AFR_FRAME_INIT(copy, op_errno);
+        if (!local) {
+            AFR_STACK_DESTROY(copy);
+            return NULL;
+        }
+    }
+
+    return copy;
+}
+
+/*
+ * Check if an entry or inode could be undergoing a transaction.
+ *
+ * For entry transactions the GLUSTERFS_PARENT_ENTRYLK count is examined,
+ * for data transactions the GLUSTERFS_INODELK_COUNT count. The result is
+ * _gf_true as soon as any reply's xdata carries a non-zero count for the
+ * selected key. Any other transaction type has no key to look up and
+ * yields _gf_false.
+ */
+gf_boolean_t
+afr_is_possibly_under_txn(afr_transaction_type type, afr_local_t *local,
+                          xlator_t *this)
+{
+    int i = 0;
+    int tmp = 0;
+    afr_private_t *priv = NULL;
+    GF_UNUSED char *key = NULL;
+    int keylen = 0;
+
+    priv = this->private;
+
+    if (type == AFR_ENTRY_TRANSACTION) {
+        key = GLUSTERFS_PARENT_ENTRYLK;
+        keylen = SLEN(GLUSTERFS_PARENT_ENTRYLK);
+    } else if (type == AFR_DATA_TRANSACTION) {
+        /*FIXME: Use GLUSTERFS_INODELK_DOM_COUNT etc. once
+         * pl_inodelk_xattr_fill supports separate keys for different
+         * domains.*/
+        key = GLUSTERFS_INODELK_COUNT;
+        keylen = SLEN(GLUSTERFS_INODELK_COUNT);
+    } else {
+        /* No lock-count key applies to this transaction type; avoid
+         * handing a NULL key to the dict lookup for every reply. */
+        return _gf_false;
+    }
+
+    for (i = 0; i < priv->child_count; i++) {
+        if (!local->replies[i].xdata)
+            continue;
+        if (dict_get_int32n(local->replies[i].xdata, key, keylen, &tmp) == 0)
+            if (tmp)
+                return _gf_true;
+    }
+
+    return _gf_false;
+}
+
+/*
+ * Release an AFR inode context: every pre_op_done[] array, then the context
+ * itself. A NULL ctx is ignored.
+ */
+static void
+afr_inode_ctx_destroy(afr_inode_ctx_t *ctx)
+{
+    int idx = 0;
+
+    if (ctx) {
+        for (idx = 0; idx < AFR_NUM_CHANGE_LOGS; idx++) {
+            GF_FREE(ctx->pre_op_done[idx]);
+        }
+        GF_FREE(ctx);
+    }
+}
+
+/*
+ * Fetch the AFR context stored on inode, creating and attaching a fresh one
+ * if none exists yet. On success *ctx points at the context and 0 is
+ * returned. On failure a non-zero value is returned (-ENOMEM when a
+ * pre_op_done[] array cannot be allocated) and any partially built context
+ * is released.
+ * NOTE(review): the double-underscore name suggests the caller holds
+ * inode->lock; the callers visible in this file do.
+ */
+int
+__afr_inode_ctx_get(xlator_t *this, inode_t *inode, afr_inode_ctx_t **ctx)
+{
+    afr_private_t *priv = this->private;
+    afr_inode_ctx_t *fresh = NULL;
+    afr_lock_t *lock = NULL;
+    uint64_t ctx_int = 0;
+    int num_locks = -1;
+    int ret = -1;
+    int i = -1;
+
+    ret = __inode_ctx_get(inode, this, &ctx_int);
+    if (ret == 0) {
+        /* Already present: hand back the stored pointer. */
+        *ctx = (afr_inode_ctx_t *)(uintptr_t)ctx_int;
+        return 0;
+    }
+
+    fresh = GF_CALLOC(1, sizeof(afr_inode_ctx_t), gf_afr_mt_inode_ctx_t);
+    if (!fresh)
+        return ret;
+
+    for (i = 0; i < AFR_NUM_CHANGE_LOGS; i++) {
+        fresh->pre_op_done[i] = GF_CALLOC(sizeof *fresh->pre_op_done[i],
+                                          priv->child_count,
+                                          gf_afr_mt_int32_t);
+        if (!fresh->pre_op_done[i]) {
+            afr_inode_ctx_destroy(fresh);
+            return -ENOMEM;
+        }
+    }
+
+    num_locks = sizeof(fresh->lock) / sizeof(afr_lock_t);
+    for (i = 0; i < num_locks; i++) {
+        lock = &fresh->lock[i];
+        INIT_LIST_HEAD(&lock->post_op);
+        INIT_LIST_HEAD(&lock->frozen);
+        INIT_LIST_HEAD(&lock->waiting);
+        INIT_LIST_HEAD(&lock->owners);
+    }
+
+    ctx_int = (uint64_t)(uintptr_t)fresh;
+    ret = __inode_ctx_set(inode, this, &ctx_int);
+    if (ret) {
+        afr_inode_ctx_destroy(fresh);
+        return ret;
+    }
+
+    fresh->spb_choice = -1;
+    fresh->read_subvol = 0;
+    fresh->write_subvol = 0;
+    fresh->lock_count = 0;
+    *ctx = fresh;
+
+    return 0;
+}
+
+/*
+ * INODE CTX 64-bit VALUE FORMAT FOR SMALL (<= 16) SUBVOL COUNTS:
+ *
+ * |<----------   64bit   ------------>|
+ *  63           32 31    16 15       0
+ * |   EVENT_GEN   |  DATA  | METADATA |
+ *
+ *
+ *  METADATA (bit-0 .. bit-15): bitmap representing subvolumes from which
+ *                              metadata can be attempted to be read.
+ *
+ *                              bit-0 => priv->subvolumes[0]
+ *                              bit-1 => priv->subvolumes[1]
+ *                              ... etc. till bit-15
+ *
+ *  DATA (bit-16 .. bit-31): bitmap representing subvolumes from which data
+ *                           can be attempted to be read.
+ *
+ *                           bit-16 => priv->subvolumes[0]
+ *                           bit-17 => priv->subvolumes[1]
+ *                           ... etc. till bit-31
+ *
+ *  EVENT_GEN (bit-32 .. bit-63): event generation (i.e priv->event_generation)
+ *                                when DATA and METADATA were last updated.
+ *
+ *                                If EVENT_GEN is < priv->event_generation,
+ *                                or is 0, it means afr_inode_refresh() needs
+ *                                to be called to recalculate the bitmaps.
+ */
+
+/*
+ * Update the readable-subvolume bitmaps kept in local->inode_ctx after a
+ * transaction in which some subvolumes failed.
+ *
+ * The packed value (metadata bitmap in bits 0-15, data bitmap in bits 16-31,
+ * event generation in bits 32-63) is read from write_subvol for data
+ * transactions and from read_subvol otherwise. The bit of every subvolume
+ * listed in local->transaction.failed_subvols is cleared in the bitmap that
+ * matches the transaction type.
+ *
+ * If that would clear a bitmap that previously had exactly one bit set, the
+ * transaction is flagged as an in-flight split-brain (in_flight_sb, with
+ * the errno taken from that subvolume's reply) and the bit is restored.
+ * Whenever the bitmap ends up different from its old value the inode is
+ * marked as needing a refresh.
+ *
+ * Always returns 0.
+ * NOTE(review): the double-underscore name suggests the caller holds
+ * inode->lock; afr_set_in_flight_sb_status() in this file does.
+ */
+int
+__afr_set_in_flight_sb_status(xlator_t *this, afr_local_t *local,
+                              inode_t *inode)
+{
+    int i = 0;
+    int txn_type = 0;
+    int count = 0;
+    int index = -1;
+    uint16_t datamap_old = 0;
+    uint16_t metadatamap_old = 0;
+    uint16_t datamap = 0;
+    uint16_t metadatamap = 0;
+    uint16_t tmp_map = 0;
+    uint16_t mask = 0;
+    uint32_t event = 0;
+    uint64_t val = 0;
+    afr_private_t *priv = NULL;
+
+    priv = this->private;
+    txn_type = local->transaction.type;
+
+    if (txn_type == AFR_DATA_TRANSACTION)
+        val = local->inode_ctx->write_subvol;
+    else
+        val = local->inode_ctx->read_subvol;
+
+    /* Unpack the three fields of the 64-bit value. */
+    metadatamap_old = metadatamap = (val & 0x000000000000ffff);
+    datamap_old = datamap = (val & 0x00000000ffff0000) >> 16;
+    event = (val & 0xffffffff00000000) >> 32;
+
+    /* tmp_map keeps the pre-update bitmap for this transaction type. */
+    if (txn_type == AFR_DATA_TRANSACTION)
+        tmp_map = datamap;
+    else if (txn_type == AFR_METADATA_TRANSACTION)
+        tmp_map = metadatamap;
+
+    count = gf_bits_count(tmp_map);
+
+    /* Drop every failed subvolume from the relevant bitmap. */
+    for (i = 0; i < priv->child_count; i++) {
+        if (!local->transaction.failed_subvols[i])
+            continue;
+
+        mask = 1 << i;
+        if (txn_type == AFR_METADATA_TRANSACTION)
+            metadatamap &= ~mask;
+        else if (txn_type == AFR_DATA_TRANSACTION)
+            datamap &= ~mask;
+    }
+
+    switch (txn_type) {
+        case AFR_METADATA_TRANSACTION:
+            /* The only readable subvolume failed: keep it readable and
+             * report the failure as an in-flight split-brain instead. */
+            if ((metadatamap_old != 0) && (metadatamap == 0) && (count == 1)) {
+                index = gf_bits_index(tmp_map);
+                local->transaction.in_flight_sb_errno = local->replies[index]
+                                                            .op_errno;
+                local->transaction.in_flight_sb = _gf_true;
+                metadatamap |= (1 << index);
+            }
+            if (metadatamap_old != metadatamap) {
+                __afr_inode_need_refresh_set(inode, this);
+            }
+            break;
+
+        case AFR_DATA_TRANSACTION:
+            /* Same handling as above, applied to the data bitmap. */
+            if ((datamap_old != 0) && (datamap == 0) && (count == 1)) {
+                index = gf_bits_index(tmp_map);
+                local->transaction.in_flight_sb_errno = local->replies[index]
+                                                            .op_errno;
+                local->transaction.in_flight_sb = _gf_true;
+                datamap |= (1 << index);
+            }
+            if (datamap_old != datamap)
+                __afr_inode_need_refresh_set(inode, this);
+            break;
+
+        default:
+            break;
+    }
+
+    /* Repack; the event generation is carried over unchanged. */
+    val = ((uint64_t)metadatamap) | (((uint64_t)datamap) << 16) |
+          (((uint64_t)event) << 32);
+
+    /* read_subvol is always updated; write_subvol only for data
+     * transactions. */
+    if (txn_type == AFR_DATA_TRANSACTION)
+        local->inode_ctx->write_subvol = val;
+    local->inode_ctx->read_subvol = val;
+
+    return 0;
+}
+
+/*
+ * Decide whether the operation failed in the same way on every subvolume
+ * that replied. The answer is _gf_false if any valid reply has an op_ret
+ * other than -1, reports ENOTCONN, or carries an errno different from the
+ * earlier replies; otherwise _gf_true.
+ */
+gf_boolean_t
+afr_is_symmetric_error(call_frame_t *frame, xlator_t *this)
+{
+    afr_private_t *priv = this->private;
+    afr_local_t *local = frame->local;
+    int common_errno = 0;
+    int reply_errno = 0;
+    int child = 0;
+
+    for (child = 0; child < priv->child_count; child++) {
+        if (!local->replies[child].valid)
+            continue;
+
+        /* Operation succeeded on at least one subvol,
+           so it is not a failed-everywhere situation.
+        */
+        if (local->replies[child].op_ret != -1)
+            return _gf_false;
+
+        reply_errno = local->replies[child].op_errno;
+
+        /* ENOTCONN is not a symmetric error. We do not
+           know if the operation was performed on the
+           backend or not.
+        */
+        if (reply_errno == ENOTCONN)
+            return _gf_false;
+
+        if (!common_errno) {
+            common_errno = reply_errno;
+        } else if (common_errno != reply_errno) {
+            /* Mismatching op_errno's */
+            return _gf_false;
+        }
+    }
+
+    return _gf_true;
+}
+
+/*
+ * Locked wrapper around __afr_set_in_flight_sb_status(). Returns 0 without
+ * touching the inode when the transaction had no failed subvolumes or when
+ * the failure was symmetric; otherwise returns the helper's result obtained
+ * under inode->lock.
+ */
+int
+afr_set_in_flight_sb_status(xlator_t *this, call_frame_t *frame, inode_t *inode)
+{
+    afr_private_t *priv = this->private;
+    afr_local_t *local = frame->local;
+    int ret = -1;
+
+    /* If this transaction saw no failures, then exit. */
+    if (AFR_COUNT(local->transaction.failed_subvols, priv->child_count) == 0)
+        return 0;
+
+    if (afr_is_symmetric_error(frame, this))
+        return 0;
+
+    LOCK(&inode->lock);
+    {
+        ret = __afr_set_in_flight_sb_status(this, local, inode);
+    }
+    UNLOCK(&inode->lock);
+
+    return ret;
+}
+
+/*
+ * Decode ctx->read_subvol of inode into per-child arrays. For each child
+ * index, metadata[] and data[] (each optional) receive 1 or 0 according to
+ * the corresponding bit of the metadata / data bitmap; *event_p (optional)
+ * receives the stored event generation.
+ *
+ * Returns a negative value if the inode context cannot be obtained,
+ * otherwise the result of __afr_inode_ctx_get().
+ */
+int
+__afr_inode_read_subvol_get_small(inode_t *inode, xlator_t *this,
+                                  unsigned char *data, unsigned char *metadata,
+                                  int *event_p)
+{
+    afr_private_t *priv = this->private;
+    afr_inode_ctx_t *ctx = NULL;
+    uint16_t metadatamap = 0;
+    uint16_t datamap = 0;
+    uint32_t event = 0;
+    uint64_t packed = 0;
+    int child = 0;
+    int ret = -1;
+
+    ret = __afr_inode_ctx_get(this, inode, &ctx);
+    if (ret < 0)
+        return ret;
+
+    packed = ctx->read_subvol;
+
+    metadatamap = (packed & 0x000000000000ffff);
+    datamap = (packed & 0x00000000ffff0000) >> 16;
+    event = (packed & 0xffffffff00000000) >> 32;
+
+    if (metadata) {
+        for (child = 0; child < priv->child_count; child++)
+            metadata[child] = (metadatamap >> child) & 1;
+    }
+    if (data) {
+        for (child = 0; child < priv->child_count; child++)
+            data[child] = (datamap >> child) & 1;
+    }
+
+    if (event_p)
+        *event_p = event;
+    return ret;
+}
+
+/*
+ * Encode the per-child data[] and metadata[] flags together with event into
+ * the packed 64-bit form and store it in ctx->read_subvol of inode.
+ * Returns 0 on success, or the non-zero result of __afr_inode_ctx_get().
+ */
+int
+__afr_inode_read_subvol_set_small(inode_t *inode, xlator_t *this,
+                                  unsigned char *data, unsigned char *metadata,
+                                  int event)
+{
+    afr_private_t *priv = this->private;
+    afr_inode_ctx_t *ctx = NULL;
+    uint16_t metadatamap = 0;
+    uint16_t datamap = 0;
+    int child = 0;
+    int ret = -1;
+
+    ret = __afr_inode_ctx_get(this, inode, &ctx);
+    if (ret)
+        return ret;
+
+    for (child = 0; child < priv->child_count; child++) {
+        if (data[child])
+            datamap |= (1 << child);
+        if (metadata[child])
+            metadatamap |= (1 << child);
+    }
+
+    ctx->read_subvol = ((uint64_t)metadatamap) | (((uint64_t)datamap) << 16) |
+                       (((uint64_t)event) << 32);
+
+    return 0;
+}
+
+/*
+ * Read the readable-subvolume information of inode. Only replica counts of
+ * up to 16 children are supported (the packed "small" format); anything
+ * larger yields -1.
+ */
+int
+__afr_inode_read_subvol_get(inode_t *inode, xlator_t *this, unsigned char *data,
+                            unsigned char *metadata, int *event_p)
+{
+    afr_private_t *priv = this->private;
+
+    if (priv->child_count > 16) {
+        /* TBD: allocate structure with array and read from it */
+        return -1;
+    }
+
+    return __afr_inode_read_subvol_get_small(inode, this, data, metadata,
+                                             event_p);
+}
+
+/*
+ * Copy the split-brain choice stored in the inode context into *spb_choice.
+ * Returns 0 on success, or the negative result of __afr_inode_ctx_get().
+ */
+int
+__afr_inode_split_brain_choice_get(inode_t *inode, xlator_t *this,
+                                   int *spb_choice)
+{
+    afr_inode_ctx_t *ictx = NULL;
+    int ret = __afr_inode_ctx_get(this, inode, &ictx);
+
+    if (ret < 0)
+        return ret;
+
+    *spb_choice = ictx->spb_choice;
+    return 0;
+}
+
+/*
+ * Store the readable-subvolume information of inode. Only replica counts of
+ * up to 16 children are supported (the packed "small" format); anything
+ * larger yields -1.
+ */
+int
+__afr_inode_read_subvol_set(inode_t *inode, xlator_t *this, unsigned char *data,
+                            unsigned char *metadata, int event)
+{
+    afr_private_t *priv = this->private;
+
+    if (priv->child_count > 16)
+        return -1;
+
+    return __afr_inode_read_subvol_set_small(inode, this, data, metadata,
+                                             event);
+}
+
+/*
+ * Record spb_choice as the split-brain choice in the inode context.
+ * Returns 0 on success, or the non-zero result of __afr_inode_ctx_get().
+ */
+int
+__afr_inode_split_brain_choice_set(inode_t *inode, xlator_t *this,
+                                   int spb_choice)
+{
+    afr_inode_ctx_t *ictx = NULL;
+    int ret = __afr_inode_ctx_get(this, inode, &ictx);
+
+    if (ret)
+        return ret;
+
+    ictx->spb_choice = spb_choice;
+    return 0;
+}
+
+/*
+ * Locked wrapper: validates inode, then calls __afr_inode_read_subvol_get()
+ * while holding inode->lock. Returns -1 if inode fails validation,
+ * otherwise the helper's result.
+ */
+int
+afr_inode_read_subvol_get(inode_t *inode, xlator_t *this, unsigned char *data,
+                          unsigned char *metadata, int *event_p)
+{
+    int ret = -1;
+
+    GF_VALIDATE_OR_GOTO(this->name, inode, out);
+
+    LOCK(&inode->lock);
+    {
+        ret = __afr_inode_read_subvol_get(inode, this, data, metadata, event_p);
+    }
+    UNLOCK(&inode->lock);
+out:
+    return ret;
+}
+
+/*
+ * Work out which children inode can be read from for a transaction of the
+ * given type.
+ *
+ * The stored data / metadata readable maps are fetched first. Directories
+ * are rejected only when no child has readable metadata and the request is
+ * a metadata transaction or a stat/fstat. Any other inode type is rejected
+ * when either map is empty.
+ *
+ * On success `readable` (optional) is filled with the metadata map for
+ * metadata transactions; for data transactions it gets the data map, or
+ * local->child_up when the data map is empty. *event_p (optional) receives
+ * the stored event generation.
+ *
+ * Returns 0 on success and -EIO on any failure.
+ */
+int
+afr_inode_get_readable(call_frame_t *frame, inode_t *inode, xlator_t *this,
+                       unsigned char *readable, int *event_p, int type)
+{
+    afr_private_t *priv = this->private;
+    afr_local_t *local = frame->local;
+    unsigned char *data = alloca0(priv->child_count);
+    unsigned char *metadata = alloca0(priv->child_count);
+    int data_count = 0;
+    int metadata_count = 0;
+    int event_generation = 0;
+    int ret = 0;
+
+    ret = afr_inode_read_subvol_get(inode, this, data, metadata,
+                                    &event_generation);
+    /* The lookup can fail with negative codes other than -1 (for instance
+     * -ENOMEM from __afr_inode_ctx_get), so treat every negative result as
+     * a failure instead of carrying on with empty maps. */
+    if (ret < 0)
+        return -EIO;
+
+    data_count = AFR_COUNT(data, priv->child_count);
+    metadata_count = AFR_COUNT(metadata, priv->child_count);
+
+    if (inode->ia_type == IA_IFDIR) {
+        /* For directories, allow even if it is in data split-brain. */
+        if (type == AFR_METADATA_TRANSACTION || local->op == GF_FOP_STAT ||
+            local->op == GF_FOP_FSTAT) {
+            if (!metadata_count)
+                return -EIO;
+        }
+    } else {
+        /* For files, abort in case of data/metadata split-brain. */
+        if (!data_count || !metadata_count) {
+            return -EIO;
+        }
+    }
+
+    if (type == AFR_METADATA_TRANSACTION && readable)
+        memcpy(readable, metadata, priv->child_count * sizeof *metadata);
+    if (type == AFR_DATA_TRANSACTION && readable) {
+        if (!data_count)
+            memcpy(readable, local->child_up,
+                   priv->child_count * sizeof *readable);
+        else
+            memcpy(readable, data, priv->child_count * sizeof *data);
+    }
+    if (event_p)
+        *event_p = event_generation;
+    return 0;
+}
+
+/*
+ * Locked wrapper: validates inode, then calls
+ * __afr_inode_split_brain_choice_get() while holding inode->lock. Returns
+ * -1 if inode fails validation, otherwise the helper's result.
+ */
+static int
+afr_inode_split_brain_choice_get(inode_t *inode, xlator_t *this,
+                                 int *spb_choice)
+{
+    int ret = -1;
+    GF_VALIDATE_OR_GOTO(this->name, inode, out);
+
+    LOCK(&inode->lock);
+    {
+        ret = __afr_inode_split_brain_choice_get(inode, this, spb_choice);
+    }
+    UNLOCK(&inode->lock);
+out:
+    return ret;
+}
+
+/*
+ * Determine the subvolume to read from for an inode in split-brain.
+ *
+ * The choice stored on the inode is looked up first. If *spb_subvol is
+ * still negative afterwards and a favourite-child policy is configured, the
+ * policy is evaluated against the replies held in frame->local.
+ *
+ * frame is only needed for that policy fallback. Callers that have no
+ * local->replies (the afr_open path is one such case) pass frame as NULL,
+ * and the fallback is then skipped.
+ *
+ * Returns -1 if any argument fails validation; otherwise the result of the
+ * stored-choice lookup, overridden with 0 when the policy yields a
+ * subvolume.
+ */
+int
+afr_split_brain_read_subvol_get(inode_t *inode, xlator_t *this,
+                                call_frame_t *frame, int *spb_subvol)
+{
+    int ret = -1;
+    afr_local_t *local = NULL;
+    afr_private_t *priv = NULL;
+
+    GF_VALIDATE_OR_GOTO("afr", this, out);
+    GF_VALIDATE_OR_GOTO(this->name, this->private, out);
+    GF_VALIDATE_OR_GOTO(this->name, inode, out);
+    GF_VALIDATE_OR_GOTO(this->name, spb_subvol, out);
+
+    priv = this->private;
+
+    ret = afr_inode_split_brain_choice_get(inode, this, spb_subvol);
+    /* No explicit choice recorded: fall back to the favourite-child policy
+     * when one is configured and replies are available. */
+    if (*spb_subvol < 0 && priv->fav_child_policy && frame && frame->local) {
+        local = frame->local;
+        *spb_subvol = afr_sh_get_fav_by_policy(this, local->replies, inode,
+                                               NULL);
+        if (*spb_subvol >= 0) {
+            ret = 0;
+        }
+    }
+
+out:
+    return ret;
+}
+/*
+ * Locked wrapper: validates inode, then calls __afr_inode_read_subvol_set()
+ * while holding inode->lock. Returns -1 if inode fails validation,
+ * otherwise the helper's result.
+ */
+int
+afr_inode_read_subvol_set(inode_t *inode, xlator_t *this, unsigned char *data,
+                          unsigned char *metadata, int event)
+{
+    int ret = -1;
+
+    GF_VALIDATE_OR_GOTO(this->name, inode, out);
+
+    LOCK(&inode->lock);
+    {
+        ret = __afr_inode_read_subvol_set(inode, this, data, metadata, event);
+    }
+    UNLOCK(&inode->lock);
+out:
+    return ret;
+}
+
+/*
+ * Locked wrapper: validates inode, then calls
+ * __afr_inode_split_brain_choice_set() while holding inode->lock. Returns
+ * -1 if inode fails validation, otherwise the helper's result.
+ */
+int
+afr_inode_split_brain_choice_set(inode_t *inode, xlator_t *this, int spb_choice)
+{
+    int ret = -1;
+
+    GF_VALIDATE_OR_GOTO(this->name, inode, out);
+
+    LOCK(&inode->lock);
+    {
+        ret = __afr_inode_split_brain_choice_set(inode, this, spb_choice);
+    }
+    UNLOCK(&inode->lock);
+out:
+    return ret;
+}
+
+/*
+ * Tell the caller whether it must run afr_inode_refresh on inode.
+ *
+ * The need_refresh flag stored in the inode context is read and cleared
+ * under inode->lock; a mismatch between event_gen1 and event_gen2 also
+ * forces a refresh. Returns _gf_false if inode fails validation.
+ */
+gf_boolean_t
+afr_is_inode_refresh_reqd(inode_t *inode, xlator_t *this, int event_gen1,
+                          int event_gen2)
+{
+    gf_boolean_t need_refresh = _gf_false;
+    afr_inode_ctx_t *ictx = NULL;
+    int ret = -1;
+
+    GF_VALIDATE_OR_GOTO(this->name, inode, out);
+
+    LOCK(&inode->lock);
+    {
+        ret = __afr_inode_ctx_get(this, inode, &ictx);
+        if (!ret) {
+            need_refresh = ictx->need_refresh;
+            /* Hoping that the caller will do inode_refresh followed by
+             * this, hence setting the need_refresh to false */
+            ictx->need_refresh = _gf_false;
+        }
+    }
+    UNLOCK(&inode->lock);
+
+    if (event_gen1 != event_gen2)
+        need_refresh = _gf_true;
+out:
+    return need_refresh;
+}
+
+/*
+ * Set the need_refresh flag in the inode context. Returns 0 on success, or
+ * the non-zero result of __afr_inode_ctx_get().
+ */
+int
+__afr_inode_need_refresh_set(inode_t *inode, xlator_t *this)
+{
+    afr_inode_ctx_t *ictx = NULL;
+    int ret = __afr_inode_ctx_get(this, inode, &ictx);
+
+    if (ret != 0)
+        return ret;
+
+    ictx->need_refresh = _gf_true;
+    return 0;
+}
+
+/*
+ * Locked wrapper: validates inode, then calls
+ * __afr_inode_need_refresh_set() while holding inode->lock. Returns -1 if
+ * inode fails validation, otherwise the helper's result.
+ */
+int
+afr_inode_need_refresh_set(inode_t *inode, xlator_t *this)
+{
+    int ret = -1;
+
+    GF_VALIDATE_OR_GOTO(this->name, inode, out);
+
+    LOCK(&inode->lock);
+    {
+        ret = __afr_inode_need_refresh_set(inode, this);
+    }
+    UNLOCK(&inode->lock);
+out:
+    return ret;
+}
+
+/*
+ * Reset the split-brain choice of inode to -1 and cancel its pending
+ * choice timer, if any, under inode->lock.
+ *
+ * Returns -1 for a NULL inode and 0 on success. If the inode context is
+ * unavailable, a warning is logged and the result of __afr_inode_ctx_get()
+ * is returned.
+ */
+int
+afr_spb_choice_timeout_cancel(xlator_t *this, inode_t *inode)
+{
+    afr_inode_ctx_t *ictx = NULL;
+    gf_boolean_t have_ctx = _gf_false;
+    int ret = -1;
+
+    if (!inode)
+        return ret;
+
+    LOCK(&inode->lock);
+    {
+        ret = __afr_inode_ctx_get(this, inode, &ictx);
+        have_ctx = (ret >= 0 && ictx) ? _gf_true : _gf_false;
+        if (have_ctx) {
+            ictx->spb_choice = -1;
+            if (ictx->timer) {
+                gf_timer_call_cancel(this->ctx, ictx->timer);
+                ictx->timer = NULL;
+            }
+            ret = 0;
+        }
+    }
+    UNLOCK(&inode->lock);
+
+    if (!have_ctx) {
+        gf_msg(this->name, GF_LOG_WARNING, 0,
+               AFR_MSG_SPLIT_BRAIN_CHOICE_ERROR,
+               "Failed to cancel split-brain choice timer.");
+    }
+
+    return ret;
+}
+
+/*
+ * Callback taking an inode as its data argument: clears the inode's
+ * split-brain choice (and any timer still registered), invalidates the
+ * inode, and drops one reference on it.
+ */
+void
+afr_set_split_brain_choice_cbk(void *data)
+{
+    xlator_t *this = THIS;
+    inode_t *target = data;
+
+    afr_spb_choice_timeout_cancel(this, target);
+    inode_invalidate(target);
+    inode_unref(target);
+}
+
+/*
+ * Apply a replica.split-brain-choice request to loc->inode and manage the
+ * timer that later expires that choice.
+ *
+ * ret:    result handed in by the caller; a non-zero value fails the
+ *         request with op_errno = -ret.
+ * frame:  overwritten with data->frame before it is used.
+ * opaque: afr_spbc_timeout_t carrying the frame, loc, spb_child_index and
+ *         the d_spb/m_spb flags; released with GF_FREE() before returning.
+ *
+ * The setxattr is always unwound with (ret, op_errno) and the function
+ * itself always returns 0.
+ */
+int
+afr_set_split_brain_choice(int ret, call_frame_t *frame, void *opaque)
+{
+    int op_errno = ENOMEM;
+    afr_private_t *priv = NULL;
+    afr_inode_ctx_t *ctx = NULL;
+    inode_t *inode = NULL;
+    loc_t *loc = NULL;
+    xlator_t *this = NULL;
+    afr_spbc_timeout_t *data = opaque;
+    struct timespec delta = {
+        0,
+    };
+    gf_boolean_t timer_set = _gf_false;
+    gf_boolean_t timer_cancelled = _gf_false;
+    gf_boolean_t timer_reset = _gf_false;
+    int old_spb_choice = -1;
+
+    frame = data->frame;
+    loc = data->loc;
+    this = frame->this;
+    priv = this->private;
+
+    if (ret) {
+        op_errno = -ret;
+        ret = -1;
+        goto out;
+    }
+
+    /* The choice expires after the configured number of seconds. */
+    delta.tv_sec = priv->spb_choice_timeout;
+    delta.tv_nsec = 0;
+
+    if (!loc->inode) {
+        ret = -1;
+        op_errno = EINVAL;
+        goto out;
+    }
+
+    if (!(data->d_spb || data->m_spb)) {
+        gf_msg(this->name, GF_LOG_WARNING, 0, AFR_MSG_SPLIT_BRAIN_CHOICE_ERROR,
+               "Cannot set "
+               "replica.split-brain-choice on %s. File is"
+               " not in data/metadata split-brain.",
+               uuid_utoa(loc->gfid));
+        ret = -1;
+        op_errno = EINVAL;
+        goto out;
+    }
+
+    /*
+     * we're ref'ing the inode before LOCK like it is done elsewhere in the
+     * code. If we ref after LOCK, coverity complains of possible deadlocks.
+     */
+    inode = inode_ref(loc->inode);
+
+    LOCK(&inode->lock);
+    {
+        ret = __afr_inode_ctx_get(this, inode, &ctx);
+        if (ret) {
+            UNLOCK(&inode->lock);
+            gf_msg(this->name, GF_LOG_ERROR, 0,
+                   AFR_MSG_SPLIT_BRAIN_CHOICE_ERROR,
+                   "Failed to get inode_ctx for %s", loc->name);
+            goto post_unlock;
+        }
+
+        /* Remember the previous choice so failures below can roll back. */
+        old_spb_choice = ctx->spb_choice;
+        ctx->spb_choice = data->spb_child_index;
+
+        /* Possible changes in spb-choice :
+         *         valid to -1    : cancel timer and unref
+         *         valid to valid : cancel timer and inject new one
+         *         -1    to -1    : unref and do not do anything
+         *         -1 to valid    : inject timer
+         */
+
+        /* ctx->timer is NULL iff previous value of
+         * ctx->spb_choice is -1
+         */
+        if (ctx->timer) {
+            if (ctx->spb_choice == -1) {
+                if (!gf_timer_call_cancel(this->ctx, ctx->timer)) {
+                    ctx->timer = NULL;
+                    timer_cancelled = _gf_true;
+                }
+                /* If timer cancel failed here it means that the
+                 *  previous cbk will be executed which will set
+                 *  spb_choice to -1. So we can consider the
+                 *  'valid to -1' case to be a success
+                 *  (i.e. ret = 0) and goto unlock.
+                 */
+                goto unlock;
+            }
+            goto reset_timer;
+        } else {
+            if (ctx->spb_choice == -1)
+                goto unlock;
+            goto set_timer;
+        }
+
+    reset_timer:
+        ret = gf_timer_call_cancel(this->ctx, ctx->timer);
+        if (ret != 0) {
+            /* We need to bail out now instead of launching a new
+             * timer. Otherwise the cbk of the previous timer event
+             * will cancel the new ctx->timer.
+             */
+            ctx->spb_choice = old_spb_choice;
+            ret = -1;
+            op_errno = EAGAIN;
+            goto unlock;
+        }
+        ctx->timer = NULL;
+        timer_reset = _gf_true;
+
+    set_timer:
+        /* The inode is passed as the callback argument of the new timer. */
+        ctx->timer = gf_timer_call_after(this->ctx, delta,
+                                         afr_set_split_brain_choice_cbk, inode);
+        if (!ctx->timer) {
+            ctx->spb_choice = old_spb_choice;
+            ret = -1;
+            op_errno = ENOMEM;
+        }
+        /* timer_set: a brand new timer now exists (-1 to valid).
+         * timer_cancelled: the old timer was cancelled during a reset and
+         * no replacement could be created.
+         */
+        if (!timer_reset && ctx->timer)
+            timer_set = _gf_true;
+        if (timer_reset && !ctx->timer)
+            timer_cancelled = _gf_true;
+    }
+unlock:
+    UNLOCK(&inode->lock);
+post_unlock:
+    /* The ref taken above is kept only when a brand new timer now holds
+     * the inode (afr_set_split_brain_choice_cbk unrefs its argument).
+     */
+    if (!timer_set)
+        inode_unref(inode);
+    /* Presumably a successfully cancelled timer never runs its callback,
+     * so the ref it was holding is dropped here - TODO confirm.
+     */
+    if (timer_cancelled)
+        inode_unref(inode);
+    /*
+     * We need to invalidate the inode to prevent the kernel from serving
+     * reads from an older cached value despite a change in spb_choice to
+     * a new value.
+     *
+     * NOTE(review): inode is used after the unref(s) above; this relies on
+     * another reference (e.g. the one behind loc->inode) keeping it alive
+     * - confirm.
+     */
+    inode_invalidate(inode);
+out:
+    GF_FREE(data);
+    AFR_STACK_UNWIND(setxattr, frame, ret, op_errno, NULL);
+    return 0;
+}
+
+/*
+ * Mark in `accused` every child whose pending changelog xattr found in
+ * `xdata` carries a non-zero counter for the given transaction type.
+ *
+ * this:    AFR xlator; this->private supplies child_count and pending_key.
+ * xdata:   dict looked up once per child using priv->pending_key[i].
+ * accused: per-child array; entries are only ever set to 1, never cleared.
+ * type:    transaction type, mapped to a changelog slot by
+ *          afr_index_for_transaction_type().
+ *
+ * Always returns 0. Children with no (or a NULL) pending value are skipped,
+ * and nothing is accused when the slot index falls outside the changelog.
+ */
+int
+afr_accused_fill(xlator_t *this, dict_t *xdata, unsigned char *accused,
+                 afr_transaction_type type)
+{
+    afr_private_t *priv = this->private;
+    int idx = afr_index_for_transaction_type(type);
+    void *pending_raw = NULL;
+    int pending[3]; /* one counter per changelog slot */
+    int i = 0;
+
+    /* Never index the local copy with a slot it does not have. */
+    if (idx < 0 || idx >= (int)(sizeof(pending) / sizeof(pending[0])))
+        return 0;
+
+    for (i = 0; i < priv->child_count; i++) {
+        pending_raw = NULL;
+        if (dict_get_ptr(xdata, priv->pending_key[i], &pending_raw))
+            continue; /* no pending flags */
+        if (!pending_raw)
+            continue; /* key present but no buffer to read */
+        memcpy(pending, pending_raw, sizeof(pending));
+
+        /* Counters are converted with ntoh32() before being tested. */
+        if (ntoh32(pending[idx]))
+            accused[i] = 1;
+    }
+
+    return 0;
+}
+
+/*
+ * Accuse every non-arbiter, not-yet-accused child whose reported file size
+ * is smaller than the largest size seen among the candidate replies.
+ * Replies flagged with GLUSTERFS_BAD_INODE and already-accused children do
+ * not contribute to the maximum. Always returns 0.
+ */
+int
+afr_accuse_smallfiles(xlator_t *this, struct afr_reply *replies,
+                      unsigned char *data_accused)
+{
+    afr_private_t *priv = this->private;
+    uint64_t largest = 0;
+    int child = 0;
+
+    /* Pass 1: determine the largest size among the candidates. */
+    for (child = 0; child < priv->child_count; child++) {
+        struct afr_reply *reply = &replies[child];
+
+        if (reply->valid && reply->xdata &&
+            dict_get_sizen(reply->xdata, GLUSTERFS_BAD_INODE))
+            continue;
+        if (data_accused[child])
+            continue;
+        if (reply->poststat.ia_size > largest)
+            largest = reply->poststat.ia_size;
+    }
+
+    /* Pass 2: anything strictly smaller than that is accused. */
+    for (child = 0; child < priv->child_count; child++) {
+        if (data_accused[child])
+            continue;
+        if (AFR_IS_ARBITER_BRICK(priv, child))
+            continue;
+        if (replies[child].poststat.ia_size < largest)
+            data_accused[child] = 1;
+    }
+
+    return 0;
+}
+
+/*
+ * Compute the per-child data/metadata accused and readable arrays.
+ *
+ * When `replies` is non-NULL (lookup path) the changelog dict and file type
+ * come from each valid reply; otherwise (pre-op xattrop path) they come
+ * from local->transaction.changelog_xdata[] and inode->ia_type.
+ *
+ * Returns 1 if at least one child ended up accused for data or metadata,
+ * 0 otherwise.
+ */
+int
+afr_readables_fill(call_frame_t *frame, xlator_t *this, inode_t *inode,
+                   unsigned char *data_accused, unsigned char *metadata_accused,
+                   unsigned char *data_readable,
+                   unsigned char *metadata_readable, struct afr_reply *replies)
+{
+    afr_local_t *local = frame->local;
+    afr_private_t *priv = this->private;
+    dict_t *changelog = NULL;
+    ia_type_t ia_type = IA_INVAL;
+    int any_accused = 0;
+    int child = 0;
+
+    /* Start from "everything readable", then knock out the arbiter. */
+    for (child = 0; child < priv->child_count; child++)
+        data_readable[child] = metadata_readable[child] = 1;
+
+    if (AFR_IS_ARBITER_BRICK(priv, ARBITER_BRICK_INDEX)) {
+        data_readable[ARBITER_BRICK_INDEX] = 0;
+        metadata_readable[ARBITER_BRICK_INDEX] = 0;
+    }
+
+    for (child = 0; child < priv->child_count; child++) {
+        if (replies == NULL) {
+            /* pre-op xattrop */
+            changelog = local->transaction.changelog_xdata[child];
+            ia_type = inode->ia_type;
+        } else {
+            /* Lookup */
+            struct afr_reply *reply = &replies[child];
+
+            if (!reply->valid || reply->op_ret == -1 ||
+                (reply->xdata &&
+                 dict_get_sizen(reply->xdata, GLUSTERFS_BAD_INODE))) {
+                data_readable[child] = 0;
+                metadata_readable[child] = 0;
+                continue;
+            }
+
+            changelog = reply->xdata;
+            ia_type = reply->poststat.ia_type;
+        }
+
+        /* mkdir_cbk sends NULL xdata_rsp. */
+        if (changelog == NULL)
+            continue;
+
+        if (ia_type == IA_IFDIR)
+            afr_accused_fill(this, changelog, data_accused,
+                             AFR_ENTRY_TRANSACTION);
+        else
+            afr_accused_fill(this, changelog, data_accused,
+                             AFR_DATA_TRANSACTION);
+
+        afr_accused_fill(this, changelog, metadata_accused,
+                         AFR_METADATA_TRANSACTION);
+    }
+
+    /* We want to accuse small files only when we know for sure that there
+     * is no IO happening. Otherwise, the ia_sizes obtained in post-refresh
+     * replies may mismatch due to a race between inode-refresh and ongoing
+     * writes, causing spurious heal launches. */
+    if (replies && ia_type != IA_INVAL && ia_type != IA_IFDIR &&
+        !afr_is_possibly_under_txn(AFR_DATA_TRANSACTION, local, this)) {
+        afr_accuse_smallfiles(this, replies, data_accused);
+    }
+
+    /* An accused child is never readable for that category. */
+    for (child = 0; child < priv->child_count; child++) {
+        if (data_accused[child])
+            data_readable[child] = 0;
+        if (metadata_accused[child])
+            metadata_readable[child] = 0;
+        if (data_accused[child] || metadata_accused[child])
+            any_accused = 1;
+    }
+
+    return any_accused;
+}
+
+/*
+ * Interpret local->replies: derive the accused/readable arrays, flag
+ * *start_heal when an accused child is currently up, and store the readable
+ * sets on the inode together with local->event_generation.
+ *
+ * Returns whatever afr_readables_fill() returned.
+ */
+int
+afr_replies_interpret(call_frame_t *frame, xlator_t *this, inode_t *inode,
+                      gf_boolean_t *start_heal)
+{
+    afr_local_t *local = frame->local;
+    afr_private_t *priv = this->private;
+    struct afr_reply *replies = local->replies;
+    int generation = local->event_generation;
+    unsigned char *d_accused = alloca0(priv->child_count);
+    unsigned char *d_readable = alloca0(priv->child_count);
+    unsigned char *m_accused = alloca0(priv->child_count);
+    unsigned char *m_readable = alloca0(priv->child_count);
+    int accused = 0;
+    int child = 0;
+
+    accused = afr_readables_fill(frame, this, inode, d_accused, m_accused,
+                                 d_readable, m_readable, replies);
+
+    /* Only a caller that asked for it gets the heal hint. */
+    if (start_heal != NULL) {
+        for (child = 0; child < priv->child_count; child++) {
+            if (!priv->child_up[child])
+                continue;
+            if (d_accused[child] || m_accused[child]) {
+                *start_heal = _gf_true;
+                break;
+            }
+        }
+    }
+
+    afr_inode_read_subvol_set(inode, this, d_readable, m_readable,
+                              generation);
+    return accused;
+}
+
+/*
+ * Completion hook for a refresh-triggered self-heal: destroys the heal
+ * frame when one was supplied. `ret` and `opaque` are unused. Returns 0.
+ */
+int
+afr_refresh_selfheal_done(int ret, call_frame_t *heal, void *opaque)
+{
+    if (heal == NULL)
+        return 0;
+
+    AFR_STACK_DESTROY(heal);
+    return 0;
+}
+
+/*
+ * Decide the overall error of an inode refresh: 0 as soon as any child
+ * produced a valid reply with op_ret == 0, otherwise the value chosen by
+ * afr_final_errno().
+ */
+int
+afr_inode_refresh_err(call_frame_t *frame, xlator_t *this)
+{
+    afr_local_t *local = frame->local;
+    afr_private_t *priv = this->private;
+    int child = 0;
+
+    for (child = 0; child < priv->child_count; child++) {
+        if (!local->replies[child].valid)
+            continue;
+        if (local->replies[child].op_ret == 0)
+            return 0;
+    }
+
+    return afr_final_errno(local, priv);
+}
+
+/*
+ * True when at least one of the data, metadata or entry self-heal options
+ * is switched on in the xlator's private configuration.
+ */
+gf_boolean_t
+afr_selfheal_enabled(const xlator_t *this)
+{
+    const afr_private_t *conf = this->private;
+
+    /* De Morgan form of "data || metadata || entry". */
+    return !(!conf->data_self_heal && !conf->metadata_self_heal &&
+             !conf->entry_self_heal);
+}
+
+/*
+ * Finish an inode refresh on behalf of a transaction/read and resume the
+ * caller through local->refreshfn(frame, this, err).
+ *
+ * frame: frame whose afr_local_t carries inode, readable, replies and
+ *        refreshfn.
+ * this:  AFR xlator.
+ * err:   error from the refresh so far; non-zero is passed on unchanged.
+ *
+ * When no readable subvolume exists after the refresh (-EIO) and a
+ * favorite-child policy is configured, a synctask is launched to reset the
+ * sink xattrs; in that case this function returns without calling
+ * refreshfn. Any failure on that path resumes the caller with EIO.
+ *
+ * Always returns 0.
+ */
+int
+afr_txn_refresh_done(call_frame_t *frame, xlator_t *this, int err)
+{
+    call_frame_t *heal_frame = NULL;
+    afr_local_t *heal_local = NULL;
+    afr_local_t *local = NULL;
+    afr_private_t *priv = NULL;
+    inode_t *inode = NULL;
+    int event_generation = 0;
+    int read_subvol = -1;
+    int ret = 0;
+
+    local = frame->local;
+    inode = local->inode;
+    priv = this->private;
+
+    if (err)
+        goto refresh_done;
+
+    if (local->op == GF_FOP_LOOKUP)
+        goto refresh_done;
+
+    ret = afr_inode_get_readable(frame, inode, this, local->readable,
+                                 &event_generation, local->transaction.type);
+
+    if (ret == -EIO) {
+        /* No readable subvolume even after refresh ==> splitbrain.*/
+        if (!priv->fav_child_policy) {
+            err = EIO;
+            goto refresh_done;
+        }
+        read_subvol = afr_sh_get_fav_by_policy(this, local->replies, inode,
+                                               NULL);
+        if (read_subvol == -1) {
+            err = EIO;
+            goto refresh_done;
+        }
+
+        heal_frame = afr_frame_create(this, NULL);
+        if (!heal_frame) {
+            err = EIO;
+            goto refresh_done;
+        }
+        heal_local = heal_frame->local;
+        heal_local->xdata_req = dict_new();
+        if (!heal_local->xdata_req) {
+            err = EIO;
+            AFR_STACK_DESTROY(heal_frame);
+            goto refresh_done;
+        }
+        heal_local->heal_frame = frame;
+        ret = synctask_new(this->ctx->env, afr_fav_child_reset_sink_xattrs,
+                           afr_fav_child_reset_sink_xattrs_cbk, heal_frame,
+                           heal_frame);
+        if (ret) {
+            /* The task could not be launched, so nothing will resume the
+             * caller or release heal_frame on our behalf.
+             * NOTE(review): assumes synctask_new() does not invoke the cbk
+             * when it returns non-zero - confirm.
+             * Detach the caller's frame first so tearing down heal_frame
+             * cannot touch it.
+             */
+            heal_local->heal_frame = NULL;
+            AFR_STACK_DESTROY(heal_frame);
+            err = EIO;
+            goto refresh_done;
+        }
+        /* The synctask callback now owns heal_frame. */
+        return 0;
+    }
+
+refresh_done:
+    afr_local_replies_wipe(local, this->private);
+    local->refreshfn(frame, this, err);
+
+    return 0;
+}
+
+/*
+ * Called once all refresh replies are in (or the refresh failed early).
+ * Validates the replies, interprets them, optionally kicks off a throttled
+ * self-heal, and finally hands control to afr_txn_refresh_done() with the
+ * resulting error. Always returns 0.
+ */
+int
+afr_inode_refresh_done(call_frame_t *frame, xlator_t *this, int error)
+{
+    afr_local_t *local = NULL;
+    afr_private_t *priv = NULL;
+    afr_local_t *sh_local = NULL;
+    call_frame_t *sh_frame = NULL;
+    unsigned char *succeeded = NULL;
+    gf_boolean_t want_heal = _gf_false;
+    int accused = 0;
+
+    if (error)
+        goto finish;
+
+    local = frame->local;
+    priv = this->private;
+    succeeded = alloca0(priv->child_count);
+    afr_fill_success_replies(local, priv, succeeded);
+
+    if (priv->thin_arbiter_count && local->is_read_txn &&
+        AFR_COUNT(succeeded, priv->child_count) != priv->child_count) {
+        /* We need to query the good bricks and/or thin-arbiter.*/
+        if (succeeded[0])
+            local->read_txn_query_child = AFR_CHILD_ZERO;
+        else if (succeeded[1])
+            local->read_txn_query_child = AFR_CHILD_ONE;
+        error = EINVAL;
+        goto finish;
+    }
+
+    if (!afr_has_quorum(succeeded, this, frame)) {
+        error = afr_final_errno(frame->local, this->private);
+        if (error == 0)
+            error = afr_quorum_errno(priv);
+        goto finish;
+    }
+
+    accused = afr_replies_interpret(frame, this, local->refreshinode,
+                                    &want_heal);
+
+    /* Heal only if something is accused, healing is on, and a heal was
+     * requested by the interpretation step. */
+    if (!accused || !afr_selfheal_enabled(this) || !want_heal)
+        goto finish;
+
+    sh_frame = afr_frame_create(this, NULL);
+    if (sh_frame == NULL)
+        goto finish;
+
+    sh_local = sh_frame->local;
+    sh_local->refreshinode = inode_ref(local->refreshinode);
+    sh_local->heal_frame = sh_frame;
+    if (!afr_throttled_selfheal(sh_frame, this)) {
+        AFR_STACK_DESTROY(sh_frame);
+    }
+
+finish:
+    afr_txn_refresh_done(frame, this, error);
+
+    return 0;
+}
+
+/*
+ * Common per-child reply handler for the refresh lookup/fstat. Records the
+ * reply in local->replies[child] (child index is carried in `cookie`),
+ * picks up the "link-count" hint as need_heal (defaulting to 1), and when
+ * the last outstanding reply arrives, finishes the refresh.
+ */
+void
+afr_inode_refresh_subvol_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+                             int op_ret, int op_errno, struct iatt *buf,
+                             dict_t *xdata, struct iatt *par)
+{
+    afr_local_t *local = frame->local;
+    int child = (long)cookie;
+    struct afr_reply *reply = &local->replies[child];
+    int8_t link_count = 1;
+    int outstanding = 0;
+    int rc = 0;
+
+    reply->valid = 1;
+    reply->op_ret = op_ret;
+    reply->op_errno = op_errno;
+    if (op_ret != -1) {
+        reply->poststat = *buf;
+        if (par)
+            reply->postparent = *par;
+        if (xdata)
+            reply->xdata = dict_ref(xdata);
+    }
+
+    if (xdata) {
+        rc = dict_get_int8(xdata, "link-count", &link_count);
+        if (rc) {
+            gf_msg_debug(this->name, -rc, "Unable to get link count");
+        }
+    }
+    reply->need_heal = link_count;
+
+    outstanding = afr_frame_return(frame);
+    if (outstanding != 0)
+        return;
+
+    /* Last reply: settle the aggregate state and move on. */
+    afr_set_need_heal(this, local);
+    rc = afr_inode_refresh_err(frame, this);
+    if (rc) {
+        gf_msg_debug(this->name, rc, "afr_inode_refresh_err failed");
+    }
+    afr_inode_refresh_done(frame, this, rc);
+}
+
+/*
+ * Lookup-flavoured callback: the inode argument is not used; everything
+ * else is forwarded to afr_inode_refresh_subvol_cbk(). Always returns 0.
+ */
+int
+afr_inode_refresh_subvol_with_lookup_cbk(call_frame_t *frame, void *cookie,
+                                         xlator_t *this, int op_ret,
+                                         int op_errno, inode_t *inode,
+                                         struct iatt *buf, dict_t *xdata,
+                                         struct iatt *par)
+{
+    struct iatt *postparent = par;
+
+    afr_inode_refresh_subvol_cbk(frame, cookie, this, op_ret, op_errno, buf,
+                                 xdata, postparent);
+
+    return 0;
+}
+
+/*
+ * Wind a lookup to child `i` as part of an inode refresh. The loc carries
+ * the inode plus a gfid: the caller-supplied one when the inode's own gfid
+ * is still null, otherwise the inode's. Always returns 0.
+ */
+int
+afr_inode_refresh_subvol_with_lookup(call_frame_t *frame, xlator_t *this, int i,
+                                     inode_t *inode, uuid_t gfid, dict_t *xdata)
+{
+    afr_private_t *priv = this->private;
+    loc_t loc = {
+        0,
+    };
+
+    loc.inode = inode;
+    if (!gf_uuid_is_null(inode->gfid) || !gfid) {
+        gf_uuid_copy(loc.gfid, inode->gfid);
+    } else {
+        /* To handle setattr/setxattr on yet to be linked inode from
+         * dht */
+        gf_uuid_copy(loc.gfid, gfid);
+    }
+
+    STACK_WIND_COOKIE(frame, afr_inode_refresh_subvol_with_lookup_cbk,
+                      (void *)(long)i, priv->children[i],
+                      priv->children[i]->fops->lookup, &loc, xdata);
+
+    return 0;
+}
+
+/*
+ * Fstat-flavoured callback: fstat has no postparent, so NULL is forwarded
+ * in its place to afr_inode_refresh_subvol_cbk(). Always returns 0.
+ */
+int
+afr_inode_refresh_subvol_with_fstat_cbk(call_frame_t *frame, void *cookie,
+                                        xlator_t *this, int32_t op_ret,
+                                        int32_t op_errno, struct iatt *buf,
+                                        dict_t *xdata)
+{
+    struct iatt *no_postparent = NULL;
+
+    afr_inode_refresh_subvol_cbk(frame, cookie, this, op_ret, op_errno, buf,
+                                 xdata, no_postparent);
+
+    return 0;
+}
+
+/*
+ * Wind an fstat on local->fd to child `i` as part of an inode refresh.
+ * The child index travels as the cookie. Always returns 0.
+ */
+int
+afr_inode_refresh_subvol_with_fstat(call_frame_t *frame, xlator_t *this, int i,
+                                    dict_t *xdata)
+{
+    afr_local_t *local = frame->local;
+    afr_private_t *priv = this->private;
+
+    STACK_WIND_COOKIE(frame, afr_inode_refresh_subvol_with_fstat_cbk,
+                      (void *)(long)i, priv->children[i],
+                      priv->children[i]->fops->fstat, local->fd, xdata);
+
+    return 0;
+}
+
+/*
+ * Issue the refresh request to every eligible child: an fstat on local->fd
+ * when an fd is present, otherwise a lookup on local->refreshinode.
+ *
+ * Any failure that happens before a request is wound is reported through
+ * afr_inode_refresh_done() with an errno (EINVAL, ENOMEM, -ret, EBADFD or
+ * ENOTCONN). Always returns 0.
+ */
+int
+afr_inode_refresh_do(call_frame_t *frame, xlator_t *this)
+{
+    afr_local_t *local = NULL;
+    afr_private_t *priv = NULL;
+    int call_count = 0;
+    int i = 0;
+    int ret = 0;
+    dict_t *xdata = NULL;
+    afr_fd_ctx_t *fd_ctx = NULL;
+    unsigned char *wind_subvols = NULL;
+
+    priv = this->private;
+    local = frame->local;
+    wind_subvols = alloca0(priv->child_count);
+
+    /* Drop replies left over from an earlier round. */
+    afr_local_replies_wipe(local, priv);
+
+    if (local->fd) {
+        fd_ctx = afr_fd_ctx_get(local->fd, this);
+        if (!fd_ctx) {
+            afr_inode_refresh_done(frame, this, EINVAL);
+            return 0;
+        }
+    }
+
+    xdata = dict_new();
+    if (!xdata) {
+        afr_inode_refresh_done(frame, this, ENOMEM);
+        return 0;
+    }
+
+    ret = afr_xattr_req_prepare(this, xdata);
+    if (ret != 0) {
+        dict_unref(xdata);
+        afr_inode_refresh_done(frame, this, -ret);
+        return 0;
+    }
+
+    /* The two keys below are best effort: failure is only logged. */
+    ret = dict_set_sizen_str_sizen(xdata, "link-count", GF_XATTROP_INDEX_COUNT);
+    if (ret) {
+        gf_msg_debug(this->name, -ret, "Unable to set link-count in dict ");
+    }
+
+    ret = dict_set_str_sizen(xdata, GLUSTERFS_INODELK_DOM_COUNT, this->name);
+    if (ret) {
+        gf_msg_debug(this->name, -ret,
+                     "Unable to set inodelk-dom-count in dict ");
+    }
+
+    /* With an fd, only children that are up AND have the fd opened are
+     * queried; without one, every child marked up in local->child_up is.
+     */
+    if (local->fd) {
+        for (i = 0; i < priv->child_count; i++) {
+            if (local->child_up[i] && fd_ctx->opened_on[i] == AFR_FD_OPENED)
+                wind_subvols[i] = 1;
+        }
+    } else {
+        memcpy(wind_subvols, local->child_up,
+               sizeof(*local->child_up) * priv->child_count);
+    }
+
+    local->call_count = AFR_COUNT(wind_subvols, priv->child_count);
+
+    call_count = local->call_count;
+    if (!call_count) {
+        dict_unref(xdata);
+        /* Children are up but none has the fd open => EBADFD;
+         * nothing is up at all => ENOTCONN.
+         */
+        if (local->fd && AFR_COUNT(local->child_up, priv->child_count))
+            afr_inode_refresh_done(frame, this, EBADFD);
+        else
+            afr_inode_refresh_done(frame, this, ENOTCONN);
+        return 0;
+    }
+    for (i = 0; i < priv->child_count; i++) {
+        if (!wind_subvols[i])
+            continue;
+
+        if (local->fd)
+            afr_inode_refresh_subvol_with_fstat(frame, this, i, xdata);
+        else
+            afr_inode_refresh_subvol_with_lookup(
+                frame, this, i, local->refreshinode, local->refreshgfid, xdata);
+
+        /* The loop stops on the local copy of the counter.
+         * NOTE(review): presumably because frame/local may already be
+         * gone once the last request has been wound - confirm.
+         */
+        if (!--call_count)
+            break;
+    }
+
+    /* Release the reference obtained from dict_new() above. */
+    dict_unref(xdata);
+
+    return 0;
+}
+
+/*
+ * Entry point of an inode refresh. Stores the completion function, the
+ * (freshly referenced) inode and the optional gfid in the frame's local,
+ * then starts the refresh via afr_inode_refresh_do(). Always returns 0.
+ */
+int
+afr_inode_refresh(call_frame_t *frame, xlator_t *this, inode_t *inode,
+                  uuid_t gfid, afr_inode_refresh_cbk_t refreshfn)
+{
+    afr_local_t *local = frame->local;
+
+    local->refreshfn = refreshfn;
+
+    /* Release the inode kept from a previous refresh, if any. */
+    if (local->refreshinode != NULL) {
+        inode_unref(local->refreshinode);
+        local->refreshinode = NULL;
+    }
+    local->refreshinode = inode_ref(inode);
+
+    if (!gfid)
+        gf_uuid_clear(local->refreshgfid);
+    else
+        gf_uuid_copy(local->refreshgfid, gfid);
+
+    afr_inode_refresh_do(frame, this);
+
+    return 0;
+}
+
+/*
+ * Populate `xattr_req` with the keys AFR wants back from the bricks: one
+ * entry per child pending key, the AFR_DIRTY key, and "list-xattr".
+ *
+ * Failures on the pending and dirty keys are only logged; the return value
+ * is the result of setting "list-xattr".
+ */
+int
+afr_xattr_req_prepare(xlator_t *this, dict_t *xattr_req)
+{
+    afr_private_t *priv = this->private;
+    int child = 0;
+    int rc = 0;
+
+    /* Each changelog value is AFR_NUM_CHANGE_LOGS ints wide. */
+    for (child = 0; child < priv->child_count; child++) {
+        rc = dict_set_uint64(xattr_req, priv->pending_key[child],
+                             AFR_NUM_CHANGE_LOGS * sizeof(int));
+        if (rc >= 0)
+            continue;
+        gf_msg(this->name, GF_LOG_WARNING, -rc, AFR_MSG_DICT_SET_FAILED,
+               "Unable to set dict value for %s", priv->pending_key[child]);
+    }
+
+    rc = dict_set_uint64(xattr_req, AFR_DIRTY,
+                         AFR_NUM_CHANGE_LOGS * sizeof(int));
+    if (rc) {
+        gf_msg_debug(this->name, -rc,
+                     "failed to set dirty "
+                     "query flag");
+    }
+
+    rc = dict_set_int32_sizen(xattr_req, "list-xattr", 1);
+    if (rc) {
+        gf_msg_debug(this->name, -rc, "Unable to set list-xattr in dict ");
+    }
+
+    return rc;
+}
+
+/*
+ * Build local->xattr_req for a lookup: create it if needed, merge in the
+ * caller's xattr_req, add the AFR changelog keys and the lock-count and
+ * link-count queries.
+ *
+ * Returns -ENOMEM if the dict cannot be allocated, 0 otherwise; failures
+ * to set individual keys are logged but do not change the result.
+ */
+int
+afr_lookup_xattr_req_prepare(afr_local_t *local, xlator_t *this,
+                             dict_t *xattr_req, loc_t *loc)
+{
+    int rc = 0;
+
+    if (local->xattr_req == NULL) {
+        local->xattr_req = dict_new();
+        if (local->xattr_req == NULL)
+            return -ENOMEM;
+    }
+
+    if (xattr_req != NULL && xattr_req != local->xattr_req)
+        dict_copy(xattr_req, local->xattr_req);
+
+    /* The result of this step is not propagated. */
+    (void)afr_xattr_req_prepare(this, local->xattr_req);
+
+    rc = dict_set_uint64(local->xattr_req, GLUSTERFS_INODELK_COUNT, 0);
+    if (rc < 0) {
+        gf_msg(this->name, GF_LOG_WARNING, -rc, AFR_MSG_DICT_SET_FAILED,
+               "%s: Unable to set dict value for %s", loc->path,
+               GLUSTERFS_INODELK_COUNT);
+    }
+
+    rc = dict_set_uint64(local->xattr_req, GLUSTERFS_ENTRYLK_COUNT, 0);
+    if (rc < 0) {
+        gf_msg(this->name, GF_LOG_WARNING, -rc, AFR_MSG_DICT_SET_FAILED,
+               "%s: Unable to set dict value for %s", loc->path,
+               GLUSTERFS_ENTRYLK_COUNT);
+    }
+
+    rc = dict_set_uint32(local->xattr_req, GLUSTERFS_PARENT_ENTRYLK, 0);
+    if (rc < 0) {
+        gf_msg(this->name, GF_LOG_WARNING, -rc, AFR_MSG_DICT_SET_FAILED,
+               "%s: Unable to set dict value for %s", loc->path,
+               GLUSTERFS_PARENT_ENTRYLK);
+    }
+
+    rc = dict_set_sizen_str_sizen(local->xattr_req, "link-count",
+                                  GF_XATTROP_INDEX_COUNT);
+    if (rc) {
+        gf_msg_debug(this->name, -rc, "Unable to set link-count in dict ");
+    }
+
+    return 0;
+}
+
+/*
+ * Among readable, non-arbiter children, pick the one whose pending_reads
+ * counter is lowest (first one wins on ties). Returns -1 if none qualify.
+ */
+int
+afr_least_pending_reads_child(afr_private_t *priv, unsigned char *readable)
+{
+    int64_t lowest = -1;
+    int64_t current = -1;
+    int winner = -1;
+    int idx = 0;
+
+    for (idx = 0; idx < priv->child_count; idx++) {
+        if (!readable[idx] || AFR_IS_ARBITER_BRICK(priv, idx))
+            continue;
+
+        current = GF_ATOMIC_GET(priv->pending_reads[idx]);
+        if (winner != -1 && !(current < lowest))
+            continue;
+
+        lowest = current;
+        winner = idx;
+    }
+
+    return winner;
+}
+
+/*
+ * Among readable, non-arbiter children with a non-negative latency, pick
+ * the one with the smallest child_latency (first one wins on ties).
+ * Returns -1 if none qualify.
+ */
+static int32_t
+afr_least_latency_child(afr_private_t *priv, unsigned char *readable)
+{
+    int winner = -1;
+    int32_t idx = 0;
+
+    for (idx = 0; idx < priv->child_count; idx++) {
+        if (!readable[idx] || AFR_IS_ARBITER_BRICK(priv, idx))
+            continue;
+        if (priv->child_latency[idx] < 0)
+            continue;
+
+        if (winner != -1 &&
+            !(priv->child_latency[idx] < priv->child_latency[winner]))
+            continue;
+
+        winner = idx;
+    }
+
+    return winner;
+}
+
+/*
+ * Hybrid policy: score each readable, non-arbiter child with a
+ * non-negative latency as (pending_reads + 1) * child_latency and return
+ * the child with the lowest score (first one wins on ties), or -1 if none
+ * qualify.
+ */
+static int32_t
+afr_least_latency_times_pending_reads_child(afr_private_t *priv,
+                                            unsigned char *readable)
+{
+    int64_t best_score = -1;
+    int64_t score = -1;
+    int64_t queued = 0;
+    int winner = -1;
+    int32_t idx = 0;
+
+    for (idx = 0; idx < priv->child_count; idx++) {
+        if (!readable[idx] || AFR_IS_ARBITER_BRICK(priv, idx))
+            continue;
+        if (priv->child_latency[idx] < 0)
+            continue;
+
+        queued = GF_ATOMIC_GET(priv->pending_reads[idx]);
+        score = (queued + 1) * priv->child_latency[idx];
+
+        if (winner != -1 && !(score < best_score))
+            continue;
+
+        best_score = score;
+        winner = idx;
+    }
+
+    return winner;
+}
+
+/*
+ * Pick a read child according to priv->hash_mode.
+ *
+ * args:     supplies the gfid and ia_type used by the hashing policies.
+ * priv:     AFR private data (hash_mode, child_count, per-child stats).
+ * readable: per-child readability flags, consulted only by the load and
+ *           latency based policies.
+ *
+ * Returns the chosen child index, or -1 when the policy makes no choice
+ * (AFR_READ_POLICY_FIRST_UP, an unhandled mode, or no eligible child for
+ * the load/latency policies). The hashing policies do not look at
+ * `readable`, so the caller must verify the result.
+ */
+int
+afr_hash_child(afr_read_subvol_args_t *args, afr_private_t *priv,
+               unsigned char *readable)
+{
+    uuid_t gfid_copy = {
+        0,
+    };
+    pid_t pid;
+    pid_t mixed;
+    int child = -1;
+
+    switch (priv->hash_mode) {
+        case AFR_READ_POLICY_FIRST_UP:
+            break;
+        case AFR_READ_POLICY_GFID_HASH:
+            gf_uuid_copy(gfid_copy, args->gfid);
+            child = SuperFastHash((char *)gfid_copy, sizeof(gfid_copy)) %
+                    priv->child_count;
+            break;
+        case AFR_READ_POLICY_GFID_PID_HASH:
+            /* NOTE(review): for directories gfid_copy stays all-zero, so
+             * every directory hashes to the same child - confirm this is
+             * intended.
+             */
+            if (args->ia_type != IA_IFDIR) {
+                /*
+                 * Why getpid?  Because it's one of the cheapest calls
+                 * available - faster than gethostname etc. - and
+                 * returns a constant-length value that's sure to be
+                 * shorter than a UUID. It's still very unlikely to be
+                 * the same across clients, so it still provides good
+                 * mixing.  We're not trying for perfection here. All we
+                 * need is a low probability that multiple clients
+                 * won't converge on the same subvolume.
+                 */
+                gf_uuid_copy(gfid_copy, args->gfid);
+                pid = getpid();
+                /* XOR the pid into the leading bytes through memcpy
+                 * rather than a pid_t-typed pointer into the byte array,
+                 * which would break aliasing/alignment rules.
+                 */
+                memcpy(&mixed, gfid_copy, sizeof(mixed));
+                mixed ^= pid;
+                memcpy(gfid_copy, &mixed, sizeof(mixed));
+            }
+            child = SuperFastHash((char *)gfid_copy, sizeof(gfid_copy)) %
+                    priv->child_count;
+            break;
+        case AFR_READ_POLICY_LESS_LOAD:
+            child = afr_least_pending_reads_child(priv, readable);
+            break;
+        case AFR_READ_POLICY_LEAST_LATENCY:
+            child = afr_least_latency_child(priv, readable);
+            break;
+        case AFR_READ_POLICY_LOAD_LATENCY_HYBRID:
+            child = afr_least_latency_times_pending_reads_child(priv, readable);
+            break;
+    }
+
+    return child;
+}
+
+/*
+ * Choose a read subvolume from `readable` in order of preference:
+ *   1. priv->read_child, when set and readable;
+ *   2. the child picked by afr_hash_child(), when readable;
+ *   3. the first readable child.
+ * Returns -1 when nothing is readable.
+ */
+int
+afr_read_subvol_select_by_policy(inode_t *inode, xlator_t *this,
+                                 unsigned char *readable,
+                                 afr_read_subvol_args_t *args)
+{
+    afr_private_t *priv = this->private;
+    afr_read_subvol_args_t hash_args = {
+        0,
+    };
+    int picked = -1;
+    int idx = 0;
+
+    /* first preference - explicitly specified or local subvolume */
+    if (priv->read_child >= 0 && readable[priv->read_child])
+        return priv->read_child;
+
+    /* A linked inode supplies its own gfid/type; otherwise fall back to
+     * whatever the caller passed in, if anything. */
+    if (inode_is_linked(inode)) {
+        gf_uuid_copy(hash_args.gfid, inode->gfid);
+        hash_args.ia_type = inode->ia_type;
+    } else if (args != NULL) {
+        hash_args = *args;
+    }
+
+    /* second preference - use hashed mode */
+    picked = afr_hash_child(&hash_args, priv, readable);
+    if (picked >= 0 && readable[picked])
+        return picked;
+
+    /* last resort - first readable child */
+    for (idx = 0; idx < priv->child_count; idx++) {
+        if (!readable[idx])
+            continue;
+        return idx;
+    }
+
+    /* no readable subvolumes, either split brain or all subvols down */
+    return -1;
+}
+
+/*
+ * Fetch either the metadata-readable set (AFR_METADATA_TRANSACTION) or the
+ * data-readable set (any other type) into `readable` by passing it in the
+ * matching slot of afr_inode_read_subvol_get(). Returns that call's result.
+ */
+int
+afr_inode_read_subvol_type_get(inode_t *inode, xlator_t *this,
+                               unsigned char *readable, int *event_p, int type)
+{
+    if (type != AFR_METADATA_TRANSACTION)
+        return afr_inode_read_subvol_get(inode, this, readable, 0, event_p);
+
+    return afr_inode_read_subvol_get(inode, this, 0, readable, event_p);
+}
+
+/*
+ * Compute the set of children readable for BOTH data and metadata and copy
+ * it into `intersection` when that pointer is non-NULL. `event` is passed
+ * through to afr_inode_read_subvol_get().
+ */
+void
+afr_readables_intersect_get(inode_t *inode, xlator_t *this, int *event,
+                            unsigned char *intersection)
+{
+    afr_private_t *priv = this->private;
+    unsigned char *d_readable = alloca0(priv->child_count);
+    unsigned char *m_readable = alloca0(priv->child_count);
+    unsigned char *both = alloca0(priv->child_count);
+
+    afr_inode_read_subvol_get(inode, this, d_readable, m_readable, event);
+
+    AFR_INTERSECT(both, d_readable, m_readable, priv->child_count);
+
+    if (intersection == NULL)
+        return;
+
+    memcpy(intersection, both, sizeof(*intersection) * priv->child_count);
+}
+
+/*
+ * Select a read subvolume for the inode. Children readable for both data
+ * and metadata are preferred; if there are none, the set readable for the
+ * given transaction type is used instead.
+ *
+ * Optional outputs: *subvol_p (the choice), *event_p (event value from the
+ * inode context) and readables (the type-specific readable set).
+ * Returns the chosen subvolume, as reported by
+ * afr_read_subvol_select_by_policy().
+ */
+int
+afr_read_subvol_get(inode_t *inode, xlator_t *this, int *subvol_p,
+                    unsigned char *readables, int *event_p,
+                    afr_transaction_type type, afr_read_subvol_args_t *args)
+{
+    afr_private_t *priv = this->private;
+    unsigned char *by_type = alloca0(priv->child_count);
+    unsigned char *both = alloca0(priv->child_count);
+    unsigned char *candidates = NULL;
+    int generation = 0;
+    int chosen = -1;
+
+    afr_inode_read_subvol_type_get(inode, this, by_type, &generation, type);
+    afr_readables_intersect_get(inode, this, &generation, both);
+
+    candidates = (AFR_COUNT(both, priv->child_count) > 0) ? both : by_type;
+    chosen = afr_read_subvol_select_by_policy(inode, this, candidates, args);
+
+    if (subvol_p != NULL)
+        *subvol_p = chosen;
+    if (event_p != NULL)
+        *event_p = generation;
+    if (readables != NULL)
+        memcpy(readables, by_type, sizeof(*readables) * priv->child_count);
+
+    return chosen;
+}
+
+/*
+ * Release everything a transaction stored in `local`: the pending matrix,
+ * lock bookkeeping, pre-op arrays, per-child changelog dicts, the failed
+ * subvolume array, basenames and both parent locs.
+ */
+void
+afr_local_transaction_cleanup(afr_local_t *local, xlator_t *this)
+{
+    afr_private_t *priv = this->private;
+    dict_t **changelogs = NULL;
+    int child = 0;
+
+    afr_matrix_cleanup(local->pending, priv->child_count);
+
+    GF_FREE(local->internal_lock.lower_locked_nodes);
+
+    afr_lockees_cleanup(&local->internal_lock);
+
+    GF_FREE(local->transaction.pre_op);
+
+    GF_FREE(local->transaction.pre_op_sources);
+
+    changelogs = local->transaction.changelog_xdata;
+    if (changelogs != NULL) {
+        /* Unref each child's dict before freeing the array itself. */
+        for (child = 0; child < priv->child_count; child++) {
+            if (changelogs[child])
+                dict_unref(changelogs[child]);
+        }
+        GF_FREE(local->transaction.changelog_xdata);
+    }
+
+    GF_FREE(local->transaction.failed_subvols);
+
+    GF_FREE(local->transaction.basename);
+    GF_FREE(local->transaction.new_basename);
+
+    loc_wipe(&local->transaction.parent_loc);
+    loc_wipe(&local->transaction.new_parent_loc);
+}
+
+/*
+ * Drop the dict references held by one reply (xdata and xattr) and reset
+ * both pointers to NULL. Other fields are left untouched.
+ */
+void
+afr_reply_wipe(struct afr_reply *reply)
+{
+    if (reply->xdata != NULL) {
+        dict_unref(reply->xdata);
+        reply->xdata = NULL;
+    }
+
+    if (reply->xattr != NULL) {
+        dict_unref(reply->xattr);
+        reply->xattr = NULL;
+    }
+}
+
+/*
+ * Apply afr_reply_wipe() to each of the first `count` entries of `replies`.
+ */
+void
+afr_replies_wipe(struct afr_reply *replies, int count)
+{
+    int idx = 0;
+
+    while (idx < count) {
+        afr_reply_wipe(&replies[idx]);
+        idx++;
+    }
+}
+
+/*
+ * Wipe and zero the whole local->replies array (priv->child_count entries).
+ * Does nothing when the array has not been allocated.
+ */
+void
+afr_local_replies_wipe(afr_local_t *local, afr_private_t *priv)
+{
+    if (local->replies != NULL) {
+        afr_replies_wipe(local->replies, priv->child_count);
+        memset(local->replies, 0,
+               sizeof(*local->replies) * priv->child_count);
+    }
+}
+
+/*
+ * Report whether the lock fop recorded in frame->local is an unlock:
+ *  - (f)inodelk: the flock type is F_UNLCK and the command is F_SETLK or
+ *    F_SETLKW;
+ *  - (f)entrylk: the command is ENTRYLK_UNLOCK.
+ * Any other fop yields _gf_false.
+ */
+static gf_boolean_t
+afr_fop_lock_is_unlock(call_frame_t *frame)
+{
+    afr_local_t *local = frame->local;
+    gf_boolean_t unlocking = _gf_false;
+
+    switch (local->op) {
+        case GF_FOP_INODELK:
+        case GF_FOP_FINODELK:
+            if (local->cont.inodelk.in_flock.l_type != F_UNLCK)
+                break;
+            if (local->cont.inodelk.in_cmd == F_SETLKW ||
+                local->cont.inodelk.in_cmd == F_SETLK)
+                unlocking = _gf_true;
+            break;
+        case GF_FOP_ENTRYLK:
+        case GF_FOP_FENTRYLK:
+            if (local->cont.entrylk.in_cmd == ENTRYLK_UNLOCK)
+                unlocking = _gf_true;
+            break;
+        default:
+            break;
+    }
+
+    return unlocking;
+}
+
+/*
+ * Report whether an lk request is an unlock: F_RESLK_UNLCK always is, and
+ * the F_SETLK/F_SETLKW family is when flock->l_type is F_UNLCK. The 64-bit
+ * command labels are only emitted when they differ from the plain ones.
+ */
+static gf_boolean_t
+afr_lk_is_unlock(int32_t cmd, struct gf_flock *flock)
+{
+    switch (cmd) {
+        case F_RESLK_UNLCK:
+            return _gf_true;
+
+        case F_SETLK:
+#if F_SETLK != F_SETLK64
+        case F_SETLK64:
+#endif
+        case F_SETLKW:
+#if F_SETLKW != F_SETLKW64
+        case F_SETLKW64:
+#endif
+            return (flock->l_type == F_UNLCK) ? _gf_true : _gf_false;
+
+        default:
+            break;
+    }
+
+    return _gf_false;
+}
+
+/*
+ * Turn a successful fop result into a failure (-1 / ENOTCONN) when
+ * consistent-io is enabled and the event generation recorded on the local
+ * is non-zero and differs from priv->event_generation.
+ *
+ * Nothing is changed when the frame is incomplete, when *op_ret is already
+ * negative, or for lookup and the lock fops listed below.
+ */
+void
+afr_handle_inconsistent_fop(call_frame_t *frame, int32_t *op_ret,
+                            int32_t *op_errno)
+{
+    afr_local_t *local = NULL;
+    afr_private_t *priv = NULL;
+
+    if (!frame || !frame->this || !frame->local || !frame->this->private)
+        return;
+
+    if (*op_ret < 0)
+        return;
+
+    local = frame->local;
+
+    /* Failing inodelk/entrylk/lk here is not a good idea because we
+     * need to cleanup the locks on the other bricks if we choose to fail
+     * the fop here. The brick may go down just after unwind happens as well
+     * so anyways the fop will fail when the next fop is sent so leaving
+     * it like this for now.*/
+    switch (local->op) {
+        case GF_FOP_LOOKUP:
+        case GF_FOP_INODELK:
+        case GF_FOP_FINODELK:
+        case GF_FOP_ENTRYLK:
+        case GF_FOP_FENTRYLK:
+        case GF_FOP_LK:
+            return;
+        default:
+            break;
+    }
+
+    priv = frame->this->private;
+    if (!priv->consistent_io)
+        return;
+
+    /* Consistent: no generation recorded, or it still matches. */
+    if (!local->event_generation ||
+        (local->event_generation == priv->event_generation))
+        return;
+
+    *op_ret = -1;
+    *op_errno = ENOTCONN;
+}
+
+/*
+ * Release everything owned by @local: the sync barrier, transaction state,
+ * locs, fd/inode/dict references, the reply array and the per-fop
+ * continuation data under local->cont. A NULL @local is a no-op.
+ */
+void
+afr_local_cleanup(afr_local_t *local, xlator_t *this)
+{
+    afr_private_t *priv = NULL;
+
+    if (local == NULL)
+        return;
+
+    syncbarrier_destroy(&local->barrier);
+    afr_local_transaction_cleanup(local, this);
+
+    priv = this->private;
+
+    loc_wipe(&local->loc);
+    loc_wipe(&local->newloc);
+
+    /* References held directly on the local. */
+    if (local->fd != NULL)
+        fd_unref(local->fd);
+    if (local->xattr_req != NULL)
+        dict_unref(local->xattr_req);
+    if (local->xattr_rsp != NULL)
+        dict_unref(local->xattr_rsp);
+    if (local->dict != NULL)
+        dict_unref(local->dict);
+
+    /* Replies: drop their dict refs first, then free the array. */
+    afr_local_replies_wipe(local, priv);
+    GF_FREE(local->replies);
+
+    /* Per-child bookkeeping arrays. */
+    GF_FREE(local->child_up);
+    GF_FREE(local->read_attempted);
+    GF_FREE(local->readable);
+    GF_FREE(local->readable2);
+
+    /* Inode references. */
+    if (local->inode != NULL)
+        inode_unref(local->inode);
+    if (local->parent != NULL)
+        inode_unref(local->parent);
+    if (local->parent2 != NULL)
+        inode_unref(local->parent2);
+    if (local->refreshinode != NULL)
+        inode_unref(local->refreshinode);
+
+    /* getxattr */
+    GF_FREE(local->cont.getxattr.name);
+
+    /* lk */
+    GF_FREE(local->cont.lk.locked_nodes);
+    GF_FREE(local->cont.lk.dom_locked_nodes);
+    GF_FREE(local->cont.lk.dom_lock_op_ret);
+    GF_FREE(local->cont.lk.dom_lock_op_errno);
+
+    /* create */
+    if (local->cont.create.fd != NULL)
+        fd_unref(local->cont.create.fd);
+    if (local->cont.create.params != NULL)
+        dict_unref(local->cont.create.params);
+
+    /* mknod */
+    if (local->cont.mknod.params != NULL)
+        dict_unref(local->cont.mknod.params);
+
+    /* mkdir */
+    if (local->cont.mkdir.params != NULL)
+        dict_unref(local->cont.mkdir.params);
+
+    /* symlink (params) */
+    if (local->cont.symlink.params != NULL)
+        dict_unref(local->cont.symlink.params);
+
+    /* writev */
+    GF_FREE(local->cont.writev.vector);
+    if (local->cont.writev.iobref != NULL)
+        iobref_unref(local->cont.writev.iobref);
+
+    /* setxattr */
+    if (local->cont.setxattr.dict != NULL)
+        dict_unref(local->cont.setxattr.dict);
+
+    /* fsetxattr */
+    if (local->cont.fsetxattr.dict != NULL)
+        dict_unref(local->cont.fsetxattr.dict);
+
+    /* removexattr */
+    GF_FREE(local->cont.removexattr.name);
+
+    /* xattrop */
+    if (local->cont.xattrop.xattr != NULL)
+        dict_unref(local->cont.xattrop.xattr);
+
+    /* symlink (linkpath) */
+    GF_FREE(local->cont.symlink.linkpath);
+
+    /* opendir */
+    GF_FREE(local->cont.opendir.checksum);
+
+    /* open */
+    if (local->cont.open.fd != NULL)
+        fd_unref(local->cont.open.fd);
+
+    /* readdirp */
+    if (local->cont.readdir.dict != NULL)
+        dict_unref(local->cont.readdir.dict);
+
+    /* inodelk */
+    GF_FREE(local->cont.inodelk.volume);
+    if (local->cont.inodelk.xdata != NULL)
+        dict_unref(local->cont.inodelk.xdata);
+
+    /* entrylk */
+    GF_FREE(local->cont.entrylk.volume);
+    GF_FREE(local->cont.entrylk.basename);
+    if (local->cont.entrylk.xdata != NULL)
+        dict_unref(local->cont.entrylk.xdata);
+
+    /* Request/response xdata. */
+    if (local->xdata_req != NULL)
+        dict_unref(local->xdata_req);
+    if (local->xdata_rsp != NULL)
+        dict_unref(local->xdata_rsp);
+}
+
+/*
+ * Decrement local->call_count while holding frame->lock and return the
+ * value it has after the decrement.
+ */
+int
+afr_frame_return(call_frame_t *frame)
+{
+    afr_local_t *local = frame->local;
+    int remaining = 0;
+
+    LOCK(&frame->lock);
+    {
+        local->call_count--;
+        remaining = local->call_count;
+    }
+    UNLOCK(&frame->lock);
+
+    return remaining;
+}
+
+/* NULL-terminated list of exact xattr names that are always ignorable. */
+static char *afr_ignore_xattrs[] = {GF_SELINUX_XATTR_KEY, QUOTA_SIZE_KEY, NULL};
+
+/*
+ * Return _gf_true when @key starts with AFR_XATTR_PREFIX or exactly matches
+ * one of the names in afr_ignore_xattrs; _gf_false otherwise.
+ */
+gf_boolean_t
+afr_is_xattr_ignorable(char *key)
+{
+    char **ignored = NULL;
+
+    if (strncmp(key, AFR_XATTR_PREFIX, SLEN(AFR_XATTR_PREFIX)) == 0)
+        return _gf_true;
+
+    for (ignored = afr_ignore_xattrs; *ignored != NULL; ignored++) {
+        if (strcmp(key, *ignored) == 0)
+            return _gf_true;
+    }
+
+    return _gf_false;
+}
+
+/*
+ * Filter passed to are_dicts_equal(): a key needs to be compared only when
+ * it is in a valid xattr namespace (virtual xattrs are skipped) and it is
+ * not one of the xattrs afr_is_xattr_ignorable() excludes. @this, @value1
+ * and @data are unused.
+ */
+static gf_boolean_t
+afr_xattr_match_needed(dict_t *this, char *key1, data_t *value1, void *data)
+{
+    if (gf_is_valid_xattr_namespace(key1) && !afr_is_xattr_ignorable(key1))
+        return _gf_true;
+
+    return _gf_false;
+}
+
+/*
+ * Compare two xattr dicts with are_dicts_equal(), restricting the
+ * comparison to the keys accepted by afr_xattr_match_needed().
+ */
+gf_boolean_t
+afr_xattrs_are_equal(dict_t *dict1, dict_t *dict2)
+{
+    gf_boolean_t same = are_dicts_equal(dict1, dict2, afr_xattr_match_needed,
+                                        NULL);
+
+    return same;
+}
+
+/*
+ * Pick the index of the reply to use as the "parent read subvol" for a
+ * lookup.
+ *
+ * When @parent is non-NULL, the preferred child is obtained from
+ * afr_data_subvol_get(); otherwise there is no preference (-1). Only
+ * children whose reply is valid and has op_ret >= 0 are considered.
+ *
+ * Returns the chosen child index, or 0 when no reply qualified.
+ */
+static int
+afr_get_parent_read_subvol(xlator_t *this, inode_t *parent,
+                           struct afr_reply *replies, unsigned char *readable)
+{
+    int i = 0;
+    int par_read_subvol = -1;
+    int par_read_subvol_iter = -1;
+    afr_private_t *priv = NULL;
+
+    priv = this->private;
+
+    if (parent)
+        par_read_subvol = afr_data_subvol_get(parent, this, NULL, NULL, NULL,
+                                              NULL);
+
+    for (i = 0; i < priv->child_count; i++) {
+        if (!replies[i].valid)
+            continue;
+
+        if (replies[i].op_ret < 0)
+            continue;
+
+        /* The first successful reply is the initial candidate. */
+        if (par_read_subvol_iter == -1) {
+            par_read_subvol_iter = i;
+            continue;
+        }
+
+        /* While the candidate is not the preferred child, move on to any
+         * later child that is flagged in @readable. */
+        if ((par_read_subvol_iter != par_read_subvol) && readable[i])
+            par_read_subvol_iter = i;
+
+        /* The preferred child always becomes the candidate when reached. */
+        if (i == par_read_subvol)
+            par_read_subvol_iter = i;
+    }
+    /* At the end of the for-loop, the only reason why @par_read_subvol_iter
+     * could be -1 is when this LOOKUP has failed on all sub-volumes.
+     * So it is okay to send an arbitrary subvolume (0 in this case)
+     * as parent read subvol.
+     */
+    if (par_read_subvol_iter == -1)
+        par_read_subvol_iter = 0;
+
+    return par_read_subvol_iter;
+}
+
+/*
+ * Compute the intersection reported by afr_readables_intersect_get() for
+ * @inode. When it is non-empty, copy it into @readable and return the
+ * child chosen by afr_read_subvol_select_by_policy(); otherwise return -1
+ * and leave @readable untouched.
+ */
+int
+afr_read_subvol_decide(inode_t *inode, xlator_t *this,
+                       afr_read_subvol_args_t *args, unsigned char *readable)
+{
+    afr_private_t *priv = this->private;
+    unsigned char *common = NULL;
+    int event = 0;
+
+    common = alloca0(priv->child_count);
+
+    afr_readables_intersect_get(inode, this, &event, common);
+
+    if (AFR_COUNT(common, priv->child_count) > 0) {
+        memcpy(readable, common, priv->child_count * sizeof(*readable));
+
+        return afr_read_subvol_select_by_policy(inode, this, common, args);
+    }
+
+    /* TODO: If we have one brick with valid data_readable and
+     * another with metadata_readable, try to send an iatt with
+     * valid bits from both.*/
+    return -1;
+}
+
+/*
+ * Return the lowest child index whose reply on frame->local is valid and
+ * has op_ret == 0, or -1 when there is none.
+ */
+static inline int
+afr_first_up_child(call_frame_t *frame, xlator_t *this)
+{
+    afr_local_t *local = frame->local;
+    afr_private_t *priv = this->private;
+    int child = 0;
+    int found = -1;
+
+    while (child < priv->child_count) {
+        if (local->replies[child].valid &&
+            local->replies[child].op_ret == 0) {
+            found = child;
+            break;
+        }
+        child++;
+    }
+
+    return found;
+}
+
+/*
+ * Fallback selection of a read subvolume when *read_subvol is still -1
+ * (a value other than -1 is left alone).
+ *
+ * Order of preference:
+ *  1. the split-brain choice, if one is set and every child replied
+ *     successfully;
+ *  2. the first successfully-replying child, when quorum is disabled, the
+ *     caller is glfsheal, or @data_readable satisfies quorum;
+ *  3. otherwise the fop is failed with ENOTCONN on the local.
+ *
+ * When a subvolume ends up selected, GF_CONTENT_KEY is removed from its
+ * reply xdata.
+ */
+static void
+afr_attempt_readsubvol_set(call_frame_t *frame, xlator_t *this,
+                           unsigned char *success_replies,
+                           unsigned char *data_readable, int *read_subvol)
+{
+    afr_private_t *priv = NULL;
+    afr_local_t *local = NULL;
+    int nchildren = -1;
+    int spb_choice = -1;
+
+    if (*read_subvol != -1)
+        return;
+
+    priv = this->private;
+    local = frame->local;
+    nchildren = priv->child_count;
+
+    afr_split_brain_read_subvol_get(local->inode, this, frame, &spb_choice);
+
+    if ((spb_choice >= 0) &&
+        (AFR_COUNT(success_replies, nchildren) == nchildren)) {
+        *read_subvol = spb_choice;
+    } else if (!priv->quorum_count ||
+               frame->root->pid == GF_CLIENT_PID_GLFS_HEAL ||
+               afr_has_quorum(data_readable, this, NULL)) {
+        /* The quorum check is only reached when quorum is enabled and
+         * the caller is not glfsheal. */
+        *read_subvol = afr_first_up_child(frame, this);
+    } else {
+        /* If quorum is enabled and we do not have a
+           readable yet, it means all good copies are down.
+        */
+        local->op_ret = -1;
+        local->op_errno = ENOTCONN;
+        gf_msg(this->name, GF_LOG_WARNING, 0, AFR_MSG_READ_SUBVOL_ERROR,
+               "no read "
+               "subvols for %s",
+               local->loc.path);
+    }
+
+    if (*read_subvol >= 0)
+        dict_del_sizen(local->replies[*read_subvol].xdata, GF_CONTENT_KEY);
+}
+
+/*
+ * Final stage of a named lookup: examine the per-child replies stored on
+ * frame->local, choose which child's iatt/xdata to return, and unwind the
+ * lookup.
+ *
+ * Outline:
+ *  - a recorded gfid change fails the lookup with ESTALE;
+ *  - a provisional read_subvol is taken from the successful replies and
+ *    its gfid is compared against every other successful reply;
+ *  - if all gfids agree and quorum is met, the replies are interpreted and
+ *    afr_read_subvol_decide() picks the child; otherwise
+ *    afr_attempt_readsubvol_set() is used as a fallback;
+ *  - an arbiter brick is never accepted as the read subvol.
+ *
+ * On the error path the lookup is unwound with NULL inode/iatt/xdata.
+ */
+static void
+afr_lookup_done(call_frame_t *frame, xlator_t *this)
+{
+    afr_private_t *priv = NULL;
+    afr_local_t *local = NULL;
+    int i = -1;
+    int op_errno = 0;
+    int read_subvol = 0;
+    int par_read_subvol = 0;
+    int ret = -1;
+    unsigned char *readable = NULL;
+    unsigned char *success_replies = NULL;
+    int event = 0;
+    struct afr_reply *replies = NULL;
+    uuid_t read_gfid = {
+        0,
+    };
+    gf_boolean_t locked_entry = _gf_false;
+    gf_boolean_t in_flight_create = _gf_false;
+    gf_boolean_t can_interpret = _gf_true;
+    inode_t *parent = NULL;
+    ia_type_t ia_type = IA_INVAL;
+    afr_read_subvol_args_t args = {
+        0,
+    };
+    char *gfid_heal_msg = NULL;
+
+    priv = this->private;
+    local = frame->local;
+    replies = local->replies;
+    parent = local->loc.parent;
+
+    locked_entry = afr_is_possibly_under_txn(AFR_ENTRY_TRANSACTION, local,
+                                             this);
+
+    readable = alloca0(priv->child_count);
+    success_replies = alloca0(priv->child_count);
+
+    /* At this point readable[] describes the PARENT inode; it is reset and
+     * refilled for the looked-up inode further down. */
+    afr_inode_read_subvol_get(parent, this, readable, NULL, &event);
+    par_read_subvol = afr_get_parent_read_subvol(this, parent, replies,
+                                                 readable);
+
+    /* First, check if we have a gfid-change from somewhere,
+       If so, propagate that so that a fresh lookup can be
+       issued
+    */
+    if (local->cont.lookup.needs_fresh_lookup) {
+        local->op_ret = -1;
+        local->op_errno = ESTALE;
+        goto error;
+    }
+
+    op_errno = afr_final_errno(frame->local, this->private);
+    local->op_errno = op_errno;
+
+    read_subvol = -1;
+    afr_fill_success_replies(local, priv, success_replies);
+
+    /* Second, pick a provisional read_subvol among the successful replies,
+     * remembering its gfid and file type. An ENOENT reply while the entry
+     * is possibly under an entry transaction is noted as an in-flight
+     * create. */
+    for (i = 0; i < priv->child_count; i++) {
+        if (!replies[i].valid)
+            continue;
+
+        if (replies[i].op_ret == -1) {
+            if (locked_entry && replies[i].op_errno == ENOENT) {
+                in_flight_create = _gf_true;
+            }
+            continue;
+        }
+
+        if (read_subvol == -1 || !readable[read_subvol]) {
+            read_subvol = i;
+            gf_uuid_copy(read_gfid, replies[i].poststat.ia_gfid);
+            ia_type = replies[i].poststat.ia_type;
+            local->op_ret = 0;
+        }
+    }
+
+    /* An in-flight create without quorum of successful replies is reported
+     * as ENOENT. */
+    if (in_flight_create && !afr_has_quorum(success_replies, this, NULL)) {
+        local->op_ret = -1;
+        local->op_errno = ENOENT;
+        goto error;
+    }
+
+    if (read_subvol == -1)
+        goto error;
+    /* We now have a read_subvol, which is readable[] (if there
+       were any). Next we look for GFID mismatches. We don't
+       consider a GFID mismatch as an error if read_subvol is
+       readable[] but the mismatching GFID subvol is not.
+    */
+    for (i = 0; i < priv->child_count; i++) {
+        if (!replies[i].valid || replies[i].op_ret == -1) {
+            continue;
+        }
+
+        if (!gf_uuid_compare(replies[i].poststat.ia_gfid, read_gfid))
+            continue;
+
+        /* Any mismatch rules out afr_replies_interpret() below. */
+        can_interpret = _gf_false;
+
+        if (locked_entry)
+            continue;
+
+        /* Now GFIDs mismatch. It's OK as long as this subvol
+           is not readable[] but read_subvol is */
+        if (readable[read_subvol] && !readable[i])
+            continue;
+
+        /* If we were called from glfsheal and there is still a gfid
+         * mismatch, succeed the lookup and let glfsheal print the
+         * response via gfid-heal-msg.*/
+        /* NOTE: this jump keeps the provisional read_subvol (it is not
+         * reset to -1), so afr_attempt_readsubvol_set() returns without
+         * changing it. */
+        if (!dict_get_str_sizen(local->xattr_req, "gfid-heal-msg",
+                                &gfid_heal_msg))
+            goto cant_interpret;
+
+        /* LOG ERROR */
+        local->op_ret = -1;
+        local->op_errno = EIO;
+        goto error;
+    }
+
+    /* Finally, for the finalized GFID, pick the best subvolume
+       to return stats from.
+    */
+    read_subvol = -1;
+    memset(readable, 0, sizeof(*readable) * priv->child_count);
+    if (can_interpret) {
+        if (!afr_has_quorum(success_replies, this, NULL))
+            goto cant_interpret;
+        /* It is safe to call afr_replies_interpret() because we have
+           a response from all the UP subvolumes and all of them resolved
+           to the same GFID
+        */
+        gf_uuid_copy(args.gfid, read_gfid);
+        args.ia_type = ia_type;
+        ret = afr_replies_interpret(frame, this, local->inode, NULL);
+        read_subvol = afr_read_subvol_decide(local->inode, this, &args,
+                                             readable);
+        if (read_subvol == -1)
+            goto cant_interpret;
+        /* A non-zero interpret result marks the inode for refresh and
+         * strips GF_CONTENT_KEY from the xdata being returned. */
+        if (ret) {
+            afr_inode_need_refresh_set(local->inode, this);
+            dict_del_sizen(local->replies[read_subvol].xdata, GF_CONTENT_KEY);
+        }
+    } else {
+    cant_interpret:
+        afr_attempt_readsubvol_set(frame, this, success_replies, readable,
+                                   &read_subvol);
+        if (read_subvol == -1) {
+            goto error;
+        }
+    }
+
+    afr_handle_quota_size(frame, this);
+
+    afr_set_need_heal(this, local);
+    if (AFR_IS_ARBITER_BRICK(priv, read_subvol) && local->op_ret == 0) {
+        local->op_ret = -1;
+        local->op_errno = ENOTCONN;
+        gf_msg_debug(this->name, 0,
+                     "Arbiter cannot be a read subvol "
+                     "for %s",
+                     local->loc.path);
+        goto error;
+    }
+
+    /* Echo a "gfid-heal-msg" found in the request back in the response
+     * xdata; failure to set it turns the result into -1/ENOMEM but the
+     * regular unwind below is still used. */
+    ret = dict_get_str_sizen(local->xattr_req, "gfid-heal-msg", &gfid_heal_msg);
+    if (!ret) {
+        ret = dict_set_str_sizen(local->replies[read_subvol].xdata,
+                                 "gfid-heal-msg", gfid_heal_msg);
+        if (ret) {
+            gf_msg(this->name, GF_LOG_ERROR, 0, AFR_MSG_DICT_SET_FAILED,
+                   "Error setting gfid-heal-msg dict");
+            local->op_ret = -1;
+            local->op_errno = ENOMEM;
+        }
+    }
+
+    AFR_STACK_UNWIND(lookup, frame, local->op_ret, local->op_errno,
+                     local->inode, &local->replies[read_subvol].poststat,
+                     local->replies[read_subvol].xdata,
+                     &local->replies[par_read_subvol].postparent);
+    return;
+
+error:
+    AFR_STACK_UNWIND(lookup, frame, local->op_ret, local->op_errno, NULL, NULL,
+                     NULL, NULL);
+}
+
+/*
+ * During a lookup, some errors are more "important" than
+ * others in that they must be given higher priority while
+ * returning to the user.
+ *
+ * The hierarchy is ENODATA > ENOENT > ESTALE > ENOSPC > others
+ */
+
+/*
+ * Return whichever of the two errnos ranks higher. The ranked errnos are
+ * checked from most to least important; when neither argument is one of
+ * them, @new_errno is returned.
+ */
+int
+afr_higher_errno(int32_t old_errno, int32_t new_errno)
+{
+    static const int32_t ranked[] = {ENODATA, ENOENT, ESTALE, ENOSPC};
+    size_t pos = 0;
+
+    for (pos = 0; pos < sizeof(ranked) / sizeof(ranked[0]); pos++) {
+        if (old_errno == ranked[pos] || new_errno == ranked[pos])
+            return ranked[pos];
+    }
+
+    return new_errno;
+}
+
+/*
+ * Fold the op_errno of every valid, failed (op_ret < 0) reply through
+ * afr_higher_errno() and return the result; 0 when no reply failed.
+ */
+int
+afr_final_errno(afr_local_t *local, afr_private_t *priv)
+{
+    int child = 0;
+    int result = 0;
+
+    for (child = 0; child < priv->child_count; child++) {
+        if (local->replies[child].valid && local->replies[child].op_ret < 0)
+            result = afr_higher_errno(result,
+                                      local->replies[child].op_errno);
+    }
+
+    return result;
+}
+
+/*
+ * Callback for the pathinfo getxattr sent by afr_attempt_local_discovery().
+ * The child index travels in @cookie. When the reply succeeds and the
+ * returned pathinfo is local, the child is marked in priv->local[] and,
+ * unless it is an arbiter brick, becomes priv->read_child. The frame's
+ * stack is destroyed in every case.
+ */
+static int32_t
+afr_local_discovery_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+                        int32_t op_ret, int32_t op_errno, dict_t *dict,
+                        dict_t *xdata)
+{
+    afr_private_t *priv = NULL;
+    char *pathinfo = NULL;
+    gf_boolean_t is_local = _gf_false;
+    int32_t child_index = -1;
+    int ret = 0;
+
+    if (op_ret == 0) {
+        priv = this->private;
+        child_index = (int32_t)(long)cookie;
+
+        ret = dict_get_str_sizen(dict, GF_XATTR_PATHINFO_KEY, &pathinfo);
+        if (ret == 0)
+            ret = glusterfs_is_local_pathinfo(pathinfo, &is_local);
+
+        /*
+         * Note that one local subvolume will override another here.  The
+         * only way to avoid that would be to retain extra information about
+         * whether the previous read_child is local, and it's just not worth
+         * it.  Even the slowest local subvolume is far preferable to a
+         * remote one.
+         */
+        if (ret == 0 && is_local) {
+            priv->local[child_index] = 1;
+            /* Don't set arbiter as read child. */
+            if (!AFR_IS_ARBITER_BRICK(priv, child_index)) {
+                gf_msg(this->name, GF_LOG_INFO, 0, AFR_MSG_LOCAL_CHILD,
+                       "selecting local read_child %s",
+                       priv->children[child_index]->name);
+
+                priv->read_child = child_index;
+            }
+        }
+    }
+
+    STACK_DESTROY(frame->root);
+    return 0;
+}
+
+/*
+ * Send a GF_XATTR_PATHINFO_KEY getxattr to child @child_index on a freshly
+ * created frame; the reply is handled by afr_local_discovery_cbk(), which
+ * receives the child index through the cookie. Silently returns when the
+ * frame cannot be created.
+ */
+static void
+afr_attempt_local_discovery(xlator_t *this, int32_t child_index)
+{
+    call_frame_t *newframe = NULL;
+    loc_t tmploc = {
+        0,
+    };
+    afr_private_t *priv = this->private;
+
+    newframe = create_frame(this, this->ctx->pool);
+    if (!newframe) {
+        return;
+    }
+
+    /* Address the target by gfid only: all bytes zero except the last one,
+     * which is set to 1 (presumably the root gfid -- confirm). */
+    tmploc.gfid[sizeof(tmploc.gfid) - 1] = 1;
+    STACK_WIND_COOKIE(newframe, afr_local_discovery_cbk,
+                      (void *)(long)child_index, priv->children[child_index],
+                      priv->children[child_index]->fops->getxattr, &tmploc,
+                      GF_XATTR_PATHINFO_KEY, NULL);
+}
+
+/*
+ * Synctask body (see afr_lookup_metadata_heal_check) that attempts a
+ * metadata self-heal during lookup and then completes the lookup.
+ *
+ * @opaque is the lookup call frame. The poststat of the first successful
+ * reply is handed to afr_selfheal_metadata_by_stbuf(); when that returns 0
+ * the stored replies are wiped and the lookup (or gfid-based discover, for
+ * a nameless loc) is re-issued so the final answer reflects the healed
+ * state. In every case the lookup is finished through afr_discover_done()
+ * or afr_lookup_done().
+ *
+ * Always returns 0.
+ */
+int
+afr_lookup_sh_metadata_wrap(void *opaque)
+{
+    call_frame_t *frame = opaque;
+    afr_local_t *local = NULL;
+    xlator_t *this = NULL;
+    inode_t *inode = NULL;
+    afr_private_t *priv = NULL;
+    struct afr_reply *replies = NULL;
+    int i = 0, first = -1;
+    int ret = -1;
+    dict_t *dict = NULL;
+
+    local = frame->local;
+    this = frame->this;
+    priv = this->private;
+    replies = local->replies;
+
+    /* Locate the first child that answered successfully. */
+    for (i = 0; i < priv->child_count; i++) {
+        if (!replies[i].valid || replies[i].op_ret == -1)
+            continue;
+        first = i;
+        break;
+    }
+    if (first == -1)
+        goto out;
+
+    /* A non-zero return skips the re-lookup; the existing replies are
+     * used as they are. */
+    if (afr_selfheal_metadata_by_stbuf(this, &replies[first].poststat))
+        goto out;
+
+    afr_local_replies_wipe(local, this->private);
+
+    /* Build the request dict for the re-lookup: a copy of the original
+     * xattr_req (if any) plus "link-count". */
+    dict = dict_new();
+    if (!dict)
+        goto out;
+    if (local->xattr_req) {
+        dict_copy(local->xattr_req, dict);
+    }
+
+    /* Failure to set the key is only logged; the re-lookup still runs. */
+    ret = dict_set_sizen_str_sizen(dict, "link-count", GF_XATTROP_INDEX_COUNT);
+    if (ret) {
+        gf_msg_debug(this->name, -ret, "Unable to set link-count in dict ");
+    }
+
+    if (loc_is_nameless(&local->loc)) {
+        ret = afr_selfheal_unlocked_discover_on(frame, local->inode,
+                                                local->loc.gfid, local->replies,
+                                                local->child_up, dict);
+    } else {
+        inode = afr_selfheal_unlocked_lookup_on(frame, local->loc.parent,
+                                                local->loc.name, local->replies,
+                                                local->child_up, dict);
+    }
+    /* Only the refreshed replies are needed; drop the returned inode. */
+    if (inode)
+        inode_unref(inode);
+out:
+    if (loc_is_nameless(&local->loc))
+        afr_discover_done(frame, this);
+    else
+        afr_lookup_done(frame, this);
+
+    if (dict)
+        dict_unref(dict);
+
+    return 0;
+}
+
+/*
+ * Return _gf_true when @xdata carries a non-zero counter for transaction
+ * @type, either under the AFR_DIRTY key or under any of the per-child
+ * pending keys. Counters are converted with ntoh32() before being tested.
+ */
+gf_boolean_t
+afr_is_pending_set(xlator_t *this, dict_t *xdata, int type)
+{
+    afr_private_t *priv = this->private;
+    int slot = afr_index_for_transaction_type(type);
+    void *raw = NULL;
+    int *counters = NULL;
+    int child = 0;
+
+    /* Dirty xattr first. */
+    if (dict_get_ptr(xdata, AFR_DIRTY, &raw) == 0 && raw != NULL) {
+        counters = raw;
+        if (ntoh32(counters[slot]))
+            return _gf_true;
+    }
+
+    /* Then the pending xattr of every child. */
+    for (child = 0; child < priv->child_count; child++) {
+        if (dict_get_ptr(xdata, priv->pending_key[child], &raw) != 0 ||
+            raw == NULL)
+            continue;
+
+        counters = raw;
+        if (ntoh32(counters[slot]))
+            return _gf_true;
+    }
+
+    return _gf_false;
+}
+
+/*
+ * Decide whether lookup should trigger a metadata self-heal.
+ *
+ * Every successful reply after the first one is compared against the first.
+ * The answer is _gf_true when some reply differs in uid, gid, prot or in
+ * the xattrs compared by afr_xattrs_are_equal(). The scan stops with
+ * _gf_false as soon as a reply has pending/dirty metadata markers, a
+ * different gfid, or a different file type -- this overrides any mismatch
+ * seen earlier in the scan.
+ *
+ * Returns _gf_false immediately when priv->metadata_self_heal is off.
+ */
+static gf_boolean_t
+afr_can_start_metadata_self_heal(call_frame_t *frame, xlator_t *this)
+{
+    afr_local_t *local = NULL;
+    afr_private_t *priv = NULL;
+    struct afr_reply *replies = NULL;
+    int i = 0, first = -1;
+    gf_boolean_t start = _gf_false;
+    struct iatt stbuf = {
+        0,
+    };
+
+    local = frame->local;
+    replies = local->replies;
+    priv = this->private;
+
+    if (!priv->metadata_self_heal)
+        return _gf_false;
+
+    for (i = 0; i < priv->child_count; i++) {
+        if (!replies[i].valid || replies[i].op_ret == -1)
+            continue;
+        /* The first successful reply is the reference for comparison. */
+        if (first == -1) {
+            first = i;
+            stbuf = replies[i].poststat;
+            continue;
+        }
+
+        if (afr_is_pending_set(this, replies[i].xdata,
+                               AFR_METADATA_TRANSACTION)) {
+            /* Let shd do the heal so that lookup is not blocked
+             * on getting metadata lock/doing the heal */
+            start = _gf_false;
+            break;
+        }
+
+        /* A gfid or file-type mismatch is not handled as a metadata heal. */
+        if (gf_uuid_compare(stbuf.ia_gfid, replies[i].poststat.ia_gfid)) {
+            start = _gf_false;
+            break;
+        }
+        if (!IA_EQUAL(stbuf, replies[i].poststat, type)) {
+            start = _gf_false;
+            break;
+        }
+
+        /*Check if iattrs need heal*/
+        /* Keep scanning: a later reply may still veto the heal. */
+        if ((!IA_EQUAL(stbuf, replies[i].poststat, uid)) ||
+            (!IA_EQUAL(stbuf, replies[i].poststat, gid)) ||
+            (!IA_EQUAL(stbuf, replies[i].poststat, prot))) {
+            start = _gf_true;
+            continue;
+        }
+
+        /*Check if xattrs need heal*/
+        if (!afr_xattrs_are_equal(replies[first].xdata, replies[i].xdata))
+            start = _gf_true;
+    }
+
+    return start;
+}
+
+/*
+ * Either schedule a metadata self-heal for this lookup or finish the lookup
+ * right away.
+ *
+ * When afr_can_start_metadata_self_heal() agrees, a new frame is created
+ * and afr_lookup_sh_metadata_wrap is launched through synctask_new(); the
+ * lookup is then completed from inside that task. If no heal is needed, or
+ * the frame or task cannot be set up, the lookup is completed inline via
+ * afr_discover_done()/afr_lookup_done() and any created heal frame is
+ * destroyed.
+ *
+ * Returns 0 when the task was scheduled or no heal was needed, otherwise
+ * the value derived from the failing call.
+ */
+int
+afr_lookup_metadata_heal_check(call_frame_t *frame, xlator_t *this)
+
+{
+    call_frame_t *heal = NULL;
+    afr_local_t *local = NULL;
+    int ret = 0;
+
+    local = frame->local;
+    if (!afr_can_start_metadata_self_heal(frame, this))
+        goto out;
+
+    heal = afr_frame_create(this, &ret);
+    if (!heal) {
+        /* Flip the sign of the value afr_frame_create() stored in ret. */
+        ret = -ret;
+        goto out;
+    }
+
+    ret = synctask_new(this->ctx->env, afr_lookup_sh_metadata_wrap,
+                       afr_refresh_selfheal_done, heal, frame);
+    if (ret)
+        goto out;
+    return ret;
+out:
+    if (loc_is_nameless(&local->loc))
+        afr_discover_done(frame, this);
+    else
+        afr_lookup_done(frame, this);
+    if (heal)
+        AFR_STACK_DESTROY(heal);
+    return ret;
+}
+
+/*
+ * Synctask body that runs a name self-heal for the entry being looked up.
+ *
+ * @opaque is the lookup call frame. If afr_selfheal_name() reports -EIO the
+ * lookup is unwound with EIO. Otherwise the stored replies are wiped, the
+ * lookup is re-issued on the children and processing continues with
+ * afr_lookup_metadata_heal_check().
+ *
+ * Always returns 0.
+ */
+int
+afr_lookup_selfheal_wrap(void *opaque)
+{
+    call_frame_t *frame = opaque;
+    afr_local_t *local = frame->local;
+    xlator_t *this = frame->this;
+    inode_t *linked = NULL;
+    uuid_t pargfid = {
+        0,
+    };
+    int heal_ret = 0;
+
+    loc_pargfid(&local->loc, pargfid);
+
+    heal_ret = afr_selfheal_name(this, pargfid, local->loc.name,
+                                 &local->cont.lookup.gfid_req,
+                                 local->xattr_req);
+    if (heal_ret == -EIO) {
+        AFR_STACK_UNWIND(lookup, frame, -1, EIO, NULL, NULL, NULL, NULL);
+        return 0;
+    }
+
+    afr_local_replies_wipe(local, this->private);
+
+    /* Only the refreshed replies matter; the returned inode is dropped. */
+    linked = afr_selfheal_unlocked_lookup_on(frame, local->loc.parent,
+                                             local->loc.name, local->replies,
+                                             local->child_up,
+                                             local->xattr_req);
+    if (linked)
+        inode_unref(linked);
+
+    afr_lookup_metadata_heal_check(frame, this);
+    return 0;
+}
+
+/*
+ * Inspect the lookup replies and decide whether a name (entry) self-heal
+ * must run before the lookup is answered.
+ *
+ * A name heal is scheduled (afr_lookup_selfheal_wrap in a synctask) when:
+ *  - some child failed with ENODATA (gfid missing);
+ *  - two successful replies carry different gfids; or
+ *  - children disagree on success/failure and the additional quorum and
+ *    errno checks below do not rule the heal out.
+ * In all other cases, and when the heal frame or task cannot be created,
+ * control passes to afr_lookup_metadata_heal_check().
+ *
+ * Returns the result of synctask_new() (0) when the name heal was
+ * scheduled, otherwise the result of afr_lookup_metadata_heal_check().
+ */
+int
+afr_lookup_entry_heal(call_frame_t *frame, xlator_t *this)
+{
+    afr_local_t *local = NULL;
+    afr_private_t *priv = NULL;
+    call_frame_t *heal = NULL;
+    int i = 0, first = -1;
+    gf_boolean_t name_state_mismatch = _gf_false;
+    struct afr_reply *replies = NULL;
+    int ret = 0;
+    unsigned char *par_readables = NULL;
+    unsigned char *success = NULL;
+    int32_t op_errno = 0;
+    uuid_t gfid = {0};
+
+    local = frame->local;
+    replies = local->replies;
+    priv = this->private;
+    par_readables = alloca0(priv->child_count);
+    success = alloca0(priv->child_count);
+
+    ret = afr_inode_read_subvol_get(local->loc.parent, this, par_readables,
+                                    NULL, NULL);
+    if (ret < 0 || AFR_COUNT(par_readables, priv->child_count) == 0) {
+        /* In this case set par_readables to all 1 so that name_heal
+         * need checks at the end of this function will flag missing
+         * entry when name state mismatches*/
+        memset(par_readables, 1, priv->child_count);
+    }
+
+    for (i = 0; i < priv->child_count; i++) {
+        if (!replies[i].valid)
+            continue;
+
+        if (replies[i].op_ret == 0) {
+            /* Remember the gfid of the first successful reply. */
+            if (gf_uuid_is_null(gfid)) {
+                gf_uuid_copy(gfid, replies[i].poststat.ia_gfid);
+            }
+            success[i] = 1;
+        } else {
+            /* Record any failure other than ENOTCONN/ENOENT/ESTALE; a
+             * non-zero op_errno forces a name heal on state mismatch. */
+            if ((replies[i].op_errno != ENOTCONN) &&
+                (replies[i].op_errno != ENOENT) &&
+                (replies[i].op_errno != ESTALE)) {
+                op_errno = replies[i].op_errno;
+            }
+        }
+
+        /*gfid is missing, needs heal*/
+        if ((replies[i].op_ret == -1) && (replies[i].op_errno == ENODATA)) {
+            goto name_heal;
+        }
+
+        /* The first valid reply (successful or not) is the reference
+         * for the op_ret comparison below. */
+        if (first == -1) {
+            first = i;
+            continue;
+        }
+
+        if (replies[i].op_ret != replies[first].op_ret) {
+            name_state_mismatch = _gf_true;
+        }
+
+        if (replies[i].op_ret == 0) {
+            /* Rename after this lookup may succeed if we don't do
+             * a name-heal and the destination may not have pending xattrs
+             * to indicate which name is good and which is bad so always do
+             * this heal*/
+            if (gf_uuid_compare(replies[i].poststat.ia_gfid, gfid)) {
+                goto name_heal;
+            }
+        }
+    }
+
+    /* Children disagree on whether the name exists. Heal unless quorum is
+     * enabled and met, no unexpected errno was seen, and no child flagged
+     * in par_readables failed with something other than ENOTCONN. */
+    if (name_state_mismatch) {
+        if (!priv->quorum_count)
+            goto name_heal;
+        if (!afr_has_quorum(success, this, NULL))
+            goto name_heal;
+        if (op_errno)
+            goto name_heal;
+        for (i = 0; i < priv->child_count; i++) {
+            if (!replies[i].valid)
+                continue;
+            if (par_readables[i] && replies[i].op_ret < 0 &&
+                replies[i].op_errno != ENOTCONN) {
+                goto name_heal;
+            }
+        }
+    }
+
+    goto metadata_heal;
+
+name_heal:
+    heal = afr_frame_create(this, NULL);
+    if (!heal)
+        goto metadata_heal;
+
+    ret = synctask_new(this->ctx->env, afr_lookup_selfheal_wrap,
+                       afr_refresh_selfheal_done, heal, frame);
+    if (ret) {
+        AFR_STACK_DESTROY(heal);
+        goto metadata_heal;
+    }
+    return ret;
+
+metadata_heal:
+    ret = afr_lookup_metadata_heal_check(frame, this);
+
+    return ret;
+}
+
+/*
+ * Per-child lookup callback. Stores the result in the reply slot selected
+ * by @cookie (the child index) and, once the last outstanding reply has
+ * arrived, continues with afr_lookup_entry_heal().
+ *
+ * Always returns 0.
+ */
+int
+afr_lookup_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int op_ret,
+               int op_errno, inode_t *inode, struct iatt *buf, dict_t *xdata,
+               struct iatt *postparent)
+{
+    afr_local_t *local = frame->local;
+    struct afr_reply *reply = NULL;
+    int child_index = (long)cookie;
+    int8_t need_heal = 1;
+    GF_UNUSED int ret = 0;
+
+    reply = &local->replies[child_index];
+    reply->valid = 1;
+    reply->op_ret = op_ret;
+    reply->op_errno = op_errno;
+
+    if (xdata != NULL) {
+        /*
+         * On revalidate lookup if the gfid-changed, afr should unwind the
+         * fop with ESTALE so that a fresh lookup will be sent by the top
+         * xlator. So remember it.
+         */
+        if (dict_get_sizen(xdata, "gfid-changed"))
+            local->cont.lookup.needs_fresh_lookup = _gf_true;
+
+        /* need_heal keeps its default of 1 when the key is absent. */
+        ret = dict_get_int8(xdata, "link-count", &need_heal);
+    }
+    reply->need_heal = need_heal;
+
+    if (op_ret != -1) {
+        reply->poststat = *buf;
+        reply->postparent = *postparent;
+        if (xdata != NULL)
+            reply->xdata = dict_ref(xdata);
+    }
+
+    if (afr_frame_return(frame) == 0) {
+        afr_set_need_heal(this, local);
+        afr_lookup_entry_heal(frame, this);
+    }
+
+    return 0;
+}
+
+/*
+ * Unwind a LOOKUP frame after discovery.
+ *
+ * Fails the fop with afr_final_errno() when no child replied successfully.
+ * Otherwise, when the successful replies have quorum, the replies are
+ * interpreted and a read subvolume is decided; in every case
+ * afr_attempt_readsubvol_set() gets the last say on read_subvol. The fop is
+ * unwound with the chosen child's reply, or with NULL payloads if no read
+ * subvolume could be picked.
+ */
+static void
+afr_discover_unwind(call_frame_t *frame, xlator_t *this)
+{
+    afr_private_t *priv = NULL;
+    afr_local_t *local = NULL;
+    unsigned char *readable = NULL;
+    unsigned char *succeeded = NULL;
+    int read_subvol = -1;
+
+    priv = this->private;
+    local = frame->local;
+    readable = alloca0(priv->child_count);
+    succeeded = alloca0(priv->child_count);
+
+    afr_fill_success_replies(local, priv, succeeded);
+    if (AFR_COUNT(succeeded, priv->child_count) > 0)
+        local->op_ret = 0;
+
+    if (local->op_ret < 0) {
+        local->op_ret = -1;
+        local->op_errno = afr_final_errno(local, priv);
+        goto error;
+    }
+
+    if (afr_has_quorum(succeeded, this, frame)) {
+        if (afr_replies_interpret(frame, this, local->inode, NULL))
+            afr_inode_need_refresh_set(local->inode, this);
+
+        read_subvol = afr_read_subvol_decide(local->inode, this, NULL,
+                                             readable);
+    }
+
+    afr_attempt_readsubvol_set(frame, this, succeeded, readable,
+                               &read_subvol);
+    if (read_subvol == -1)
+        goto error;
+
+    if (local->op_ret == 0 && AFR_IS_ARBITER_BRICK(priv, read_subvol)) {
+        local->op_ret = -1;
+        local->op_errno = ENOTCONN;
+        gf_msg_debug(this->name, 0,
+                     "Arbiter cannot be a read subvol "
+                     "for %s",
+                     local->loc.path);
+    }
+
+    AFR_STACK_UNWIND(lookup, frame, local->op_ret, local->op_errno,
+                     local->inode, &local->replies[read_subvol].poststat,
+                     local->replies[read_subvol].xdata,
+                     &local->replies[read_subvol].postparent);
+    return;
+
+error:
+    AFR_STACK_UNWIND(lookup, frame, local->op_ret, local->op_errno, NULL, NULL,
+                     NULL, NULL);
+}
+
+/*
+ * Synctask body: make sure the thin-arbiter id file exists and cache its
+ * gfid.
+ *
+ * @opaque is the afr xlator. The id file is looked up on the thin-arbiter
+ * child; when the lookup fails with ENOENT the file is created with a
+ * freshly generated gfid passed as "gfid-req". On success the gfid from the
+ * resulting iatt is copied into priv->ta_gfid; on any failure an error is
+ * logged. The function itself always returns 0.
+ */
+static int
+afr_ta_id_file_check(void *opaque)
+{
+    afr_private_t *priv = NULL;
+    xlator_t *this = NULL;
+    loc_t loc = {
+        0,
+    };
+    struct iatt stbuf = {
+        0,
+    };
+    dict_t *dict = NULL;
+    uuid_t gfid = {
+        0,
+    };
+    fd_t *fd = NULL;
+    int ret = 0;
+
+    this = opaque;
+    priv = this->private;
+
+    ret = afr_fill_ta_loc(this, &loc, _gf_false);
+    if (ret) {
+        gf_msg(this->name, GF_LOG_ERROR, -ret, AFR_MSG_THIN_ARB,
+               "Failed to populate thin-arbiter loc for: %s.", loc.name);
+        goto out;
+    }
+
+    ret = syncop_lookup(priv->children[THIN_ARBITER_BRICK_INDEX], &loc, &stbuf,
+                        0, 0, 0);
+    if (ret == 0) {
+        goto out;
+    } else if (ret == -ENOENT) {
+        fd = fd_create(loc.inode, getpid());
+        if (!fd) {
+            /* Report the allocation failure rather than the stale ENOENT. */
+            ret = -ENOMEM;
+            goto out;
+        }
+        dict = dict_new();
+        if (!dict) {
+            ret = -ENOMEM;
+            goto out;
+        }
+        gf_uuid_generate(gfid);
+        ret = dict_set_gfuuid(dict, "gfid-req", gfid, true);
+        if (ret) {
+            /* Do not attempt the create without the requested gfid. */
+            goto out;
+        }
+        ret = syncop_create(priv->children[THIN_ARBITER_BRICK_INDEX], &loc,
+                            O_RDWR, 0664, fd, &stbuf, dict, NULL);
+    }
+
+out:
+    if (ret == 0) {
+        gf_uuid_copy(priv->ta_gfid, stbuf.ia_gfid);
+    } else {
+        gf_msg(this->name, GF_LOG_ERROR, -ret, AFR_MSG_THIN_ARB,
+               "Failed to lookup/create thin-arbiter id file.");
+    }
+    if (dict)
+        dict_unref(dict);
+    if (fd)
+        fd_unref(fd);
+    loc_wipe(&loc);
+
+    return 0;
+}
+
+/*
+ * Completion callback handed to synctask_new() together with
+ * afr_ta_id_file_check(). It ignores all of its arguments and returns 0.
+ */
+static int
+afr_ta_id_file_check_cbk(int ret, call_frame_t *ta_frame, void *opaque)
+{
+    return 0;
+}
+
+/*
+ * Finish discovery: optionally kick off the thin-arbiter id-file check, then
+ * unwind the lookup.
+ *
+ * The id-file synctask is only spawned when a thin arbiter is configured and
+ * its gfid has not been cached in priv->ta_gfid yet. Whether or not the task
+ * could be created, the frame is unwound through afr_discover_unwind().
+ */
+static void
+afr_discover_done(call_frame_t *frame, xlator_t *this)
+{
+    afr_private_t *priv = this->private;
+
+    if (priv->thin_arbiter_count && gf_uuid_is_null(priv->ta_gfid)) {
+        /* The unwind below happens regardless of the outcome here. */
+        (void)synctask_new(this->ctx->env, afr_ta_id_file_check,
+                           afr_ta_id_file_check_cbk, NULL, this);
+    }
+
+    afr_discover_unwind(frame, this);
+}
+
+/*
+ * Per-child callback for the LOOKUP fops wound by afr_discover_do().
+ *
+ * @cookie carries the index of the replying child. The reply is stored in
+ * local->replies[]; a successful reply may also trigger local-subvolume
+ * discovery. Once afr_frame_return() reports that no calls are outstanding,
+ * the need-heal state is computed and the metadata-heal check is started.
+ * Always returns 0.
+ */
+int
+afr_discover_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int op_ret,
+                 int op_errno, inode_t *inode, struct iatt *buf, dict_t *xdata,
+                 struct iatt *postparent)
+{
+    afr_local_t *local = frame->local;
+    int child = (long)cookie;
+    int8_t need_heal = 1;
+    GF_UNUSED int ret = 0;
+
+    local->replies[child].valid = 1;
+    local->replies[child].op_ret = op_ret;
+    local->replies[child].op_errno = op_errno;
+    if (op_ret != -1) {
+        local->replies[child].poststat = *buf;
+        local->replies[child].postparent = *postparent;
+        if (xdata)
+            local->replies[child].xdata = dict_ref(xdata);
+    }
+
+    if (local->do_discovery && (op_ret == 0))
+        afr_attempt_local_discovery(this, child);
+
+    /* need_heal starts out as 1; the return value is not inspected. */
+    if (xdata)
+        ret = dict_get_int8(xdata, "link-count", &need_heal);
+    local->replies[child].need_heal = need_heal;
+
+    if (afr_frame_return(frame) != 0)
+        return 0;
+
+    /* No replies outstanding any more. */
+    afr_set_need_heal(this, local);
+    afr_lookup_metadata_heal_check(frame, this);
+
+    return 0;
+}
+
+/*
+ * Wind a LOOKUP to every child that is marked up in local->child_up, with
+ * afr_discover_cbk() as the callback.
+ *
+ * A non-zero @err, or a failure to prepare the xattr request, unwinds the
+ * lookup with -1 instead. Always returns 0.
+ */
+int
+afr_discover_do(call_frame_t *frame, xlator_t *this, int err)
+{
+    afr_local_t *local = frame->local;
+    afr_private_t *priv = this->private;
+    int call_count = 0;
+    int ret = 0;
+    int i = 0;
+
+    if (err) {
+        local->op_errno = err;
+        goto unwind;
+    }
+
+    local->call_count = AFR_COUNT(local->child_up, priv->child_count);
+    call_count = local->call_count;
+
+    ret = afr_lookup_xattr_req_prepare(local, this, local->xattr_req,
+                                       &local->loc);
+    if (ret) {
+        local->op_errno = -ret;
+        goto unwind;
+    }
+
+    for (i = 0; i < priv->child_count; i++) {
+        if (!local->child_up[i])
+            continue;
+
+        STACK_WIND_COOKIE(
+            frame, afr_discover_cbk, (void *)(long)i, priv->children[i],
+            priv->children[i]->fops->lookup, &local->loc, local->xattr_req);
+
+        /* Stop as soon as the expected number of calls has been wound. */
+        if (--call_count == 0)
+            break;
+    }
+
+    return 0;
+
+unwind:
+    AFR_STACK_UNWIND(lookup, frame, -1, local->op_errno, 0, 0, 0, 0);
+    return 0;
+}
+
+/*
+ * Entry point for nameless lookups (called from afr_lookup()).
+ *
+ * Initialises the frame's local, remembers the root inode and arms one-time
+ * local-subvolume discovery when the lookup is on the root gfid, takes
+ * copies/references of @loc and @xattr_req, and hands over to
+ * afr_discover_do(). Failures before that point unwind the lookup with -1.
+ * Always returns 0.
+ */
+int
+afr_discover(call_frame_t *frame, xlator_t *this, loc_t *loc, dict_t *xattr_req)
+{
+    afr_private_t *priv = this->private;
+    afr_local_t *local = NULL;
+    int op_errno = ENOMEM;
+    int event = 0;
+
+    local = AFR_FRAME_INIT(frame, op_errno);
+    if (!local)
+        goto fail;
+
+    if (local->call_count == 0) {
+        op_errno = ENOTCONN;
+        goto fail;
+    }
+
+    if (__is_root_gfid(loc->inode->gfid)) {
+        if (priv->root_inode == NULL)
+            priv->root_inode = inode_ref(loc->inode);
+
+        /* Logic to detect which subvolumes of AFR are local, in order to
+           prefer them for reads. It is armed only once. */
+        if (priv->choose_local && !priv->did_discovery) {
+            local->do_discovery = _gf_true;
+            priv->did_discovery = _gf_true;
+        }
+    }
+
+    local->op = GF_FOP_LOOKUP;
+    loc_copy(&local->loc, loc);
+    local->inode = inode_ref(loc->inode);
+
+    if (xattr_req) {
+        /* If xattr_req was null, afr_lookup_xattr_req_prepare() will
+           allocate one for us */
+        local->xattr_req = dict_copy_with_ref(xattr_req, NULL);
+        if (local->xattr_req == NULL) {
+            op_errno = ENOMEM;
+            goto fail;
+        }
+    }
+
+    /* The read-subvol query is skipped while the inode has no gfid yet. */
+    if (!gf_uuid_is_null(loc->inode->gfid)) {
+        afr_read_subvol_get(loc->inode, this, NULL, NULL, &event,
+                            AFR_DATA_TRANSACTION, NULL);
+    }
+
+    afr_discover_do(frame, this, 0);
+
+    return 0;
+
+fail:
+    AFR_STACK_UNWIND(lookup, frame, -1, op_errno, NULL, NULL, NULL, NULL);
+    return 0;
+}
+
+/*
+ * Wind a LOOKUP to every child that is marked up in local->child_up, with
+ * afr_lookup_cbk() as the callback.
+ *
+ * A negative @err, or a failure to prepare the xattr request, unwinds the
+ * lookup with -1 instead. Always returns 0.
+ *
+ * NOTE(review): @err is stored in op_errno as-is while the xattr-prepare
+ * failure is negated first - confirm the sign convention with the callers.
+ */
+int
+afr_lookup_do(call_frame_t *frame, xlator_t *this, int err)
+{
+    afr_local_t *local = frame->local;
+    afr_private_t *priv = this->private;
+    int call_count = 0;
+    int ret = 0;
+    int i = 0;
+
+    if (err < 0) {
+        local->op_errno = err;
+        goto unwind;
+    }
+
+    local->call_count = AFR_COUNT(local->child_up, priv->child_count);
+    call_count = local->call_count;
+
+    ret = afr_lookup_xattr_req_prepare(local, this, local->xattr_req,
+                                       &local->loc);
+    if (ret) {
+        local->op_errno = -ret;
+        goto unwind;
+    }
+
+    for (i = 0; i < priv->child_count; i++) {
+        if (!local->child_up[i])
+            continue;
+
+        STACK_WIND_COOKIE(
+            frame, afr_lookup_cbk, (void *)(long)i, priv->children[i],
+            priv->children[i]->fops->lookup, &local->loc, local->xattr_req);
+
+        /* Stop as soon as the expected number of calls has been wound. */
+        if (--call_count == 0)
+            break;
+    }
+    return 0;
+
+unwind:
+    AFR_STACK_UNWIND(lookup, frame, -1, local->op_errno, 0, 0, 0, 0);
+    return 0;
+}
+
+/*
+ * afr_lookup()
+ *
+ * The goal here is to figure out what the element getting looked up is,
+ * i.e. what its GFID, its inode type and a conservative estimate of its
+ * inode attributes are.
+ *
+ * As we lookup, operations may be underway on the entry name and the
+ * inode. In lookup() we are primarily concerned only with the entry
+ * operations. If the entry is getting unlinked or renamed, we detect
+ * what operation is underway by querying for on-going transactions and
+ * pending self-healing on the entry through xdata.
+ *
+ * If the entry is a file/dir, it may need self-heal and/or be in a
+ * split-brain condition. Lookup is not the place to worry about these
+ * conditions. Outcast marking will naturally handle them in the read
+ * paths.
+ *
+ * Here is a brief goal of what we are trying to achieve:
+ *
+ * - LOOKUP on all subvolumes concurrently, querying on-going transaction
+ *   and pending self-heal info from the servers.
+ *
+ * - If all servers reply the same inode type and GFID, the overall call
+ *   MUST be a success.
+ *
+ * - If inode types or GFIDs mismatch, and there IS either an on-going
+ *   transaction or pending self-heal, inspect what the nature of the
+ *   transaction or pending heal is, and select the appropriate subvolume's
+ *   reply as the winner.
+ *
+ * - If inode types or GFIDs mismatch, and there are no on-going transactions
+ *   or pending self-heal on the entry name on any of the servers, fail the
+ *   lookup with EIO. Something has gone wrong beyond reasonable action.
+ */
+
+/*
+ * LOOKUP fop entry point.
+ *
+ * Nameless lookups are delegated to afr_discover() after dropping "gfid-req"
+ * from the request. Lookups on afr's private directory fail with EPERM. For
+ * everything else the frame's local is set up, any "gfid-req" in the copied
+ * request is moved into local->cont.lookup.gfid_req, and the lookup is wound
+ * through afr_lookup_do(). Always returns 0.
+ */
+int
+afr_lookup(call_frame_t *frame, xlator_t *this, loc_t *loc, dict_t *xattr_req)
+{
+    afr_local_t *local = NULL;
+    int32_t op_errno = 0;
+    int event = 0;
+
+    if (loc_is_nameless(loc)) {
+        if (xattr_req)
+            dict_del_sizen(xattr_req, "gfid-req");
+        afr_discover(frame, this, loc, xattr_req);
+        return 0;
+    }
+
+    if (afr_is_private_directory(this->private, loc->parent->gfid, loc->name,
+                                 frame->root->pid)) {
+        op_errno = EPERM;
+        goto unwind;
+    }
+
+    local = AFR_FRAME_INIT(frame, op_errno);
+    if (!local)
+        goto unwind;
+
+    if (local->call_count == 0) {
+        op_errno = ENOTCONN;
+        goto unwind;
+    }
+
+    local->op = GF_FOP_LOOKUP;
+    loc_copy(&local->loc, loc);
+    local->inode = inode_ref(loc->inode);
+
+    if (xattr_req) {
+        /* If xattr_req was null, afr_lookup_xattr_req_prepare() will
+           allocate one for us */
+        local->xattr_req = dict_copy_with_ref(xattr_req, NULL);
+        if (local->xattr_req == NULL) {
+            op_errno = ENOMEM;
+            goto unwind;
+        }
+
+        /* Keep the requested gfid in local and strip it from the request. */
+        if (dict_get_gfuuid(local->xattr_req, "gfid-req",
+                            &local->cont.lookup.gfid_req) == 0) {
+            dict_del_sizen(local->xattr_req, "gfid-req");
+        }
+    }
+
+    afr_read_subvol_get(loc->parent, this, NULL, NULL, &event,
+                        AFR_DATA_TRANSACTION, NULL);
+
+    afr_lookup_do(frame, this, 0);
+
+    return 0;
+
+unwind:
+    AFR_STACK_UNWIND(lookup, frame, -1, op_errno, NULL, NULL, NULL, NULL);
+
+    return 0;
+}
+
+/*
+ * Free an afr fd context.
+ *
+ * When the context carries lock-heal info, that entry is first unlinked from
+ * the list it sits on (under priv->lock) and cleaned up. Afterwards the
+ * opened_on array and the context itself are freed.
+ */
+void
+_afr_cleanup_fd_ctx(xlator_t *this, afr_fd_ctx_t *fd_ctx)
+{
+    afr_private_t *priv = this->private;
+
+    if (fd_ctx->lk_heal_info) {
+        LOCK(&priv->lock);
+        {
+            list_del(&fd_ctx->lk_heal_info->pos);
+        }
+        /* The lock only guards the list removal; it must be released here,
+         * otherwise priv->lock stays held after this function returns. */
+        UNLOCK(&priv->lock);
+
+        afr_lk_heal_info_cleanup(fd_ctx->lk_heal_info);
+        fd_ctx->lk_heal_info = NULL;
+    }
+    GF_FREE(fd_ctx->opened_on);
+    GF_FREE(fd_ctx);
+}
+
+/*
+ * Look up afr's context on @fd and, if one is present, release it through
+ * _afr_cleanup_fd_ctx(). Always returns 0.
+ */
+int
+afr_cleanup_fd_ctx(xlator_t *this, fd_t *fd)
+{
+    uint64_t value = 0;
+    afr_fd_ctx_t *fd_ctx = NULL;
+
+    if (fd_ctx_get(fd, this, &value) >= 0) {
+        fd_ctx = (afr_fd_ctx_t *)(long)value;
+        if (fd_ctx != NULL)
+            _afr_cleanup_fd_ctx(this, fd_ctx);
+    }
+
+    return 0;
+}
+
+/*
+ * Release handler for @fd: drops afr's fd context via afr_cleanup_fd_ctx().
+ * Always returns 0.
+ */
+int
+afr_release(xlator_t *this, fd_t *fd)
+{
+    afr_cleanup_fd_ctx(this, fd);
+
+    return 0;
+}
+
+/*
+ * Return afr's context for @fd, creating it on first use.
+ *
+ * Unlocked variant: afr_fd_ctx_get() calls this with fd->lock held.
+ * Returns NULL when the context can neither be fetched nor created.
+ */
+afr_fd_ctx_t *
+__afr_fd_ctx_get(fd_t *fd, xlator_t *this)
+{
+    uint64_t value = 0;
+
+    if (__fd_ctx_get(fd, this, &value) < 0) {
+        /* Nothing stored yet: set a fresh context and fetch it again. */
+        if (__afr_fd_ctx_set(this, fd) < 0)
+            return NULL;
+
+        if (__fd_ctx_get(fd, this, &value) < 0)
+            return NULL;
+    }
+
+    return (afr_fd_ctx_t *)(long)value;
+}
+
+/*
+ * Locked wrapper around __afr_fd_ctx_get(): holds fd->lock for the duration
+ * of the call and returns whatever the helper returned.
+ */
+afr_fd_ctx_t *
+afr_fd_ctx_get(fd_t *fd, xlator_t *this)
+{
+    afr_fd_ctx_t *result = NULL;
+
+    LOCK(&fd->lock);
+    result = __afr_fd_ctx_get(fd, this);
+    UNLOCK(&fd->lock);
+
+    return result;
+}
+
+/*
+ * Allocate afr's fd context and store it on @fd, unless one is already set.
+ *
+ * Every child starts as AFR_FD_OPENED for an anonymous fd and as
+ * AFR_FD_NOT_OPENED otherwise. Returns 0 on success (including when a
+ * context already existed), -ENOMEM on allocation failure, or the non-zero
+ * result of validation / __fd_ctx_set(); on any failure a partially built
+ * context is freed again.
+ */
+int
+__afr_fd_ctx_set(xlator_t *this, fd_t *fd)
+{
+    afr_private_t *priv = NULL;
+    afr_fd_ctx_t *fd_ctx = NULL;
+    uint64_t existing = 0;
+    int ret = -1;
+    int i = 0;
+
+    VALIDATE_OR_GOTO(this->private, out);
+    VALIDATE_OR_GOTO(fd, out);
+
+    priv = this->private;
+
+    /* A context is already stored on the fd: nothing to do. */
+    ret = __fd_ctx_get(fd, this, &existing);
+    if (ret == 0)
+        goto out;
+
+    fd_ctx = GF_CALLOC(1, sizeof(afr_fd_ctx_t), gf_afr_mt_afr_fd_ctx_t);
+    if (fd_ctx == NULL) {
+        ret = -ENOMEM;
+        goto out;
+    }
+
+    fd_ctx->opened_on = GF_CALLOC(sizeof(*fd_ctx->opened_on), priv->child_count,
+                                  gf_afr_mt_int32_t);
+    if (fd_ctx->opened_on == NULL) {
+        ret = -ENOMEM;
+        goto out;
+    }
+
+    for (i = 0; i < priv->child_count; i++) {
+        fd_ctx->opened_on[i] = fd_is_anonymous(fd) ? AFR_FD_OPENED
+                                                   : AFR_FD_NOT_OPENED;
+    }
+
+    fd_ctx->readdir_subvol = -1;
+    fd_ctx->lk_heal_info = NULL;
+
+    ret = __fd_ctx_set(fd, this, (uint64_t)(long)fd_ctx);
+    if (ret)
+        gf_msg_debug(this->name, 0, "failed to set fd ctx (%p)", fd);
+
+out:
+    /* Do not leak a context that never made it onto the fd. */
+    if (ret && fd_ctx)
+        _afr_cleanup_fd_ctx(this, fd_ctx);
+    return ret;
+}
+
+/* {{{ flush */
+
+/*
+ * Per-child callback for FLUSH.
+ *
+ * Under frame->lock, a successful reply records op_ret and keeps the first
+ * xdata seen, while a failed reply records op_errno. The reply that brings
+ * the call count to zero unwinds the fop. Always returns 0.
+ */
+int
+afr_flush_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret,
+              int32_t op_errno, dict_t *xdata)
+{
+    afr_local_t *local = frame->local;
+    int pending = -1;
+
+    LOCK(&frame->lock);
+    {
+        if (op_ret == -1) {
+            local->op_errno = op_errno;
+        } else {
+            local->op_ret = op_ret;
+            if (xdata && !local->xdata_rsp)
+                local->xdata_rsp = dict_ref(xdata);
+        }
+        pending = --local->call_count;
+    }
+    UNLOCK(&frame->lock);
+
+    if (pending != 0)
+        return 0;
+
+    AFR_STACK_UNWIND(flush, frame, local->op_ret, local->op_errno,
+                     local->xdata_rsp);
+
+    return 0;
+}
+
+/*
+ * Resume function of the flush stub created in afr_flush(): winds FLUSH on
+ * local->fd to every child marked up, with afr_flush_cbk() as the callback.
+ * Always returns 0.
+ */
+static int
+afr_flush_wrapper(call_frame_t *frame, xlator_t *this, fd_t *fd, dict_t *xdata)
+{
+    afr_private_t *priv = this->private;
+    afr_local_t *local = frame->local;
+    int call_count = local->call_count;
+    int i = 0;
+
+    for (i = 0; i < priv->child_count; i++) {
+        if (!local->child_up[i])
+            continue;
+
+        STACK_WIND_COOKIE(frame, afr_flush_cbk, (void *)(long)i,
+                          priv->children[i], priv->children[i]->fops->flush,
+                          local->fd, xdata);
+
+        /* Stop as soon as the expected number of calls has been wound. */
+        if (--call_count == 0)
+            break;
+    }
+
+    return 0;
+}
+
+/*
+ * If @lock has a delay timer armed and the first entry on its post_op list
+ * belongs to @fd, try to cancel the timer.
+ *
+ * Returns that entry's local when the timer was cancelled (the timer pointer
+ * is cleared in that case), and NULL in every other situation: no timer, a
+ * different fd, or a failed cancel.
+ */
+afr_local_t *
+afr_wakeup_same_fd_delayed_op(xlator_t *this, afr_lock_t *lock, fd_t *fd)
+{
+    afr_local_t *owner = NULL;
+
+    if (!lock->delay_timer)
+        return NULL;
+
+    owner = list_entry(lock->post_op.next, afr_local_t,
+                       transaction.owner_list);
+    if (owner->fd != fd)
+        return NULL;
+
+    if (gf_timer_call_cancel(this->ctx, lock->delay_timer))
+        return NULL;
+
+    lock->delay_timer = NULL;
+    return owner;
+}
+
+/*
+ * Wake up delayed post-ops on @inode that belong to the stub's fd, and make
+ * sure @stub gets resumed.
+ *
+ * Under inode->lock the data and metadata transaction locks are checked via
+ * afr_wakeup_same_fd_delayed_op(). If a delayed operation was found, @stub is
+ * attached to it as its resume_stub (the data one takes precedence) and the
+ * wake-up callback is invoked for each operation found; otherwise @stub is
+ * resumed right away.
+ */
+void
+afr_delayed_changelog_wake_resume(xlator_t *this, inode_t *inode,
+                                  call_stub_t *stub)
+{
+    afr_inode_ctx_t *ctx = NULL;
+    afr_lock_t *lock = NULL;
+    afr_local_t *metadata_local = NULL;
+    afr_local_t *data_local = NULL;
+    LOCK(&inode->lock);
+    {
+        (void)__afr_inode_ctx_get(this, inode, &ctx);
+        /* Only touch the per-transaction locks when an inode context was
+         * actually obtained; without one there is nothing to wake up and
+         * the stub is simply resumed below. */
+        if (ctx) {
+            lock = &ctx->lock[AFR_DATA_TRANSACTION];
+            data_local = afr_wakeup_same_fd_delayed_op(this, lock,
+                                                       stub->args.fd);
+            lock = &ctx->lock[AFR_METADATA_TRANSACTION];
+            metadata_local = afr_wakeup_same_fd_delayed_op(this, lock,
+                                                           stub->args.fd);
+        }
+    }
+    UNLOCK(&inode->lock);
+
+    if (data_local) {
+        data_local->transaction.resume_stub = stub;
+    } else if (metadata_local) {
+        metadata_local->transaction.resume_stub = stub;
+    } else {
+        call_resume(stub);
+    }
+    if (data_local) {
+        afr_delayed_changelog_wake_up_cbk(data_local);
+    }
+    if (metadata_local) {
+        afr_delayed_changelog_wake_up_cbk(metadata_local);
+    }
+}
+
+/*
+ * FLUSH fop entry point.
+ *
+ * After validating the fd context and checking that consistent I/O is
+ * possible, the flush is packaged into a stub (resumed through
+ * afr_flush_wrapper()) and handed to afr_delayed_changelog_wake_resume().
+ * Any failure on the way unwinds the fop with -1. Always returns 0.
+ */
+int
+afr_flush(call_frame_t *frame, xlator_t *this, fd_t *fd, dict_t *xdata)
+{
+    int op_errno = ENOMEM;
+    afr_local_t *local = NULL;
+    call_stub_t *flush_stub = NULL;
+
+    AFR_ERROR_OUT_IF_FDCTX_INVALID(fd, this, op_errno, out);
+
+    local = AFR_FRAME_INIT(frame, op_errno);
+    if (local == NULL)
+        goto out;
+
+    local->op = GF_FOP_FLUSH;
+    if (!afr_is_consistent_io_possible(local, this->private, &op_errno))
+        goto out;
+
+    local->fd = fd_ref(fd);
+
+    flush_stub = fop_flush_stub(frame, afr_flush_wrapper, fd, xdata);
+    if (flush_stub == NULL)
+        goto out;
+
+    afr_delayed_changelog_wake_resume(this, fd->inode, flush_stub);
+
+    return 0;
+
+out:
+    AFR_STACK_UNWIND(flush, frame, -1, op_errno, NULL);
+    return 0;
+}
+
+/*
+ * Per-child callback for FSYNCDIR.
+ *
+ * Under frame->lock, a reply with op_ret 0 marks the fop successful and
+ * keeps the first xdata seen, while any other reply records op_errno. The
+ * reply that brings the call count to zero unwinds the fop. Always
+ * returns 0.
+ */
+int
+afr_fsyncdir_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+                 int32_t op_ret, int32_t op_errno, dict_t *xdata)
+{
+    afr_local_t *local = frame->local;
+    int pending = -1;
+
+    LOCK(&frame->lock);
+    {
+        if (op_ret != 0) {
+            local->op_errno = op_errno;
+        } else {
+            local->op_ret = 0;
+            if (xdata && !local->xdata_rsp)
+                local->xdata_rsp = dict_ref(xdata);
+        }
+        pending = --local->call_count;
+    }
+    UNLOCK(&frame->lock);
+
+    if (pending != 0)
+        return 0;
+
+    AFR_STACK_UNWIND(fsyncdir, frame, local->op_ret, local->op_errno,
+                     local->xdata_rsp);
+
+    return 0;
+}
+
+/*
+ * FSYNCDIR fop entry point: winds the fop to every child marked up, with
+ * afr_fsyncdir_cbk() as the callback.
+ *
+ * Unwinds with -1 when the frame's local cannot be initialised or
+ * consistent I/O is not possible. Always returns 0.
+ */
+int
+afr_fsyncdir(call_frame_t *frame, xlator_t *this, fd_t *fd, int32_t datasync,
+             dict_t *xdata)
+{
+    afr_private_t *priv = this->private;
+    afr_local_t *local = NULL;
+    int32_t op_errno = ENOMEM;
+    int32_t call_count = 0;
+    int i = 0;
+
+    local = AFR_FRAME_INIT(frame, op_errno);
+    if (local == NULL)
+        goto unwind;
+
+    local->op = GF_FOP_FSYNCDIR;
+    if (!afr_is_consistent_io_possible(local, priv, &op_errno))
+        goto unwind;
+
+    call_count = local->call_count;
+    for (i = 0; i < priv->child_count; i++) {
+        if (!local->child_up[i])
+            continue;
+
+        STACK_WIND(frame, afr_fsyncdir_cbk, priv->children[i],
+                   priv->children[i]->fops->fsyncdir, fd, datasync, xdata);
+
+        /* Stop as soon as the expected number of calls has been wound. */
+        if (--call_count == 0)
+            break;
+    }
+
+    return 0;
+
+unwind:
+    AFR_STACK_UNWIND(fsyncdir, frame, -1, op_errno, NULL);
+
+    return 0;
+}
+
+/* }}} */
+
+/* Forward declaration: afr_fop_lock_proceed() calls this before it is
+ * defined further down in the file. */
+static int
+afr_serialized_lock_wind(call_frame_t *frame, xlator_t *this);
+
+/*
+ * Tell whether a lock reply signals a conflicting lock, i.e. the fop failed
+ * (op_ret == -1) with EAGAIN.
+ */
+static gf_boolean_t
+afr_is_conflicting_lock_present(int32_t op_ret, int32_t op_errno)
+{
+    return (op_ret == -1 && op_errno == EAGAIN) ? _gf_true : _gf_false;
+}
+
+/*
+ * Unwind a lock fop with the unwind variant matching @op (inodelk, finodelk,
+ * entrylk or fentrylk). Any other fop type is ignored.
+ */
+static void
+afr_fop_lock_unwind(call_frame_t *frame, glusterfs_fop_t op, int32_t op_ret,
+                    int32_t op_errno, dict_t *xdata)
+{
+    if (op == GF_FOP_INODELK) {
+        AFR_STACK_UNWIND(inodelk, frame, op_ret, op_errno, xdata);
+    } else if (op == GF_FOP_FINODELK) {
+        AFR_STACK_UNWIND(finodelk, frame, op_ret, op_errno, xdata);
+    } else if (op == GF_FOP_ENTRYLK) {
+        AFR_STACK_UNWIND(entrylk, frame, op_ret, op_errno, xdata);
+    } else if (op == GF_FOP_FENTRYLK) {
+        AFR_STACK_UNWIND(fentrylk, frame, op_ret, op_errno, xdata);
+    }
+}
+
+/*
+ * Wind the lock fop recorded in local->op to a single child.
+ *
+ * @child_index selects the child and is also passed along as the cookie;
+ * @lock_cbk is the callback to invoke with the child's reply. The arguments
+ * for the wind come from local->cont.inodelk (INODELK/FINODELK) or
+ * local->cont.entrylk (ENTRYLK/FENTRYLK); the path-based variants use
+ * local->loc and the fd-based ones local->fd. Other fop types wind nothing.
+ */
+static void
+afr_fop_lock_wind(call_frame_t *frame, xlator_t *this, int child_index,
+                  int32_t (*lock_cbk)(call_frame_t *, void *, xlator_t *,
+                                      int32_t, int32_t, dict_t *))
+{
+    afr_local_t *local = frame->local;
+    afr_private_t *priv = this->private;
+    int i = child_index;
+
+    switch (local->op) {
+        case GF_FOP_INODELK:
+            STACK_WIND_COOKIE(
+                frame, lock_cbk, (void *)(long)i, priv->children[i],
+                priv->children[i]->fops->inodelk,
+                (const char *)local->cont.inodelk.volume, &local->loc,
+                local->cont.inodelk.cmd, &local->cont.inodelk.flock,
+                local->cont.inodelk.xdata);
+            break;
+        case GF_FOP_FINODELK:
+            /* Shares the inodelk argument block, but targets local->fd. */
+            STACK_WIND_COOKIE(
+                frame, lock_cbk, (void *)(long)i, priv->children[i],
+                priv->children[i]->fops->finodelk,
+                (const char *)local->cont.inodelk.volume, local->fd,
+                local->cont.inodelk.cmd, &local->cont.inodelk.flock,
+                local->cont.inodelk.xdata);
+            break;
+        case GF_FOP_ENTRYLK:
+            STACK_WIND_COOKIE(
+                frame, lock_cbk, (void *)(long)i, priv->children[i],
+                priv->children[i]->fops->entrylk, local->cont.entrylk.volume,
+                &local->loc, local->cont.entrylk.basename,
+                local->cont.entrylk.cmd, local->cont.entrylk.type,
+                local->cont.entrylk.xdata);
+            break;
+        case GF_FOP_FENTRYLK:
+            /* Shares the entrylk argument block, but targets local->fd. */
+            STACK_WIND_COOKIE(
+                frame, lock_cbk, (void *)(long)i, priv->children[i],
+                priv->children[i]->fops->fentrylk, local->cont.entrylk.volume,
+                local->fd, local->cont.entrylk.basename,
+                local->cont.entrylk.cmd, local->cont.entrylk.type,
+                local->cont.entrylk.xdata);
+            break;
+        default:
+            break;
+    }
+}
+
+/*
+ * Continue a lock fop once the partial locks of an attempt were released.
+ *
+ * If the attempt was not the parallel one, the fop is unwound with the
+ * result already stored in local. Otherwise the fop falls back to the
+ * serialized attempt: the result and replies are reset, the caller's
+ * original command (and flock, for inode locks) and request xdata are
+ * restored, and the lock is wound to one child at a time.
+ */
+void
+afr_fop_lock_proceed(call_frame_t *frame)
+{
+    afr_local_t *local = frame->local;
+    afr_private_t *priv = frame->this->private;
+    dict_t **lock_xdata = NULL;
+
+    if (local->fop_lock_state != AFR_FOP_LOCK_PARALLEL) {
+        afr_fop_lock_unwind(frame, local->op, local->op_ret, local->op_errno,
+                            local->xdata_rsp);
+        return;
+    }
+
+    /*
+     * At least one child is up.
+     *
+     * Non-blocking locks need to be serialized as well. Otherwise two
+     * mounts issuing the same non-blocking inodelk may both end up without
+     * the lock on any brick. Example: Mount1 and Mount2 request a full
+     * length lock on file f1. Mount1 gets the lock on brick-1 but not on
+     * brick-2, because Mount2 already holds it there, and vice versa. As
+     * both mounts only hold partial locks, afr treats both attempts as
+     * failures.
+     */
+    local->op_ret = -1;
+    local->op_errno = EUCLEAN;
+    local->fop_lock_state = AFR_FOP_LOCK_SERIAL;
+    afr_local_replies_wipe(local, priv);
+
+    if (local->xdata_rsp)
+        dict_unref(local->xdata_rsp);
+    local->xdata_rsp = NULL;
+
+    /* Restore the caller's original request for the fop family at hand. */
+    if (local->op == GF_FOP_INODELK || local->op == GF_FOP_FINODELK) {
+        local->cont.inodelk.cmd = local->cont.inodelk.in_cmd;
+        local->cont.inodelk.flock = local->cont.inodelk.in_flock;
+        lock_xdata = &local->cont.inodelk.xdata;
+    } else if (local->op == GF_FOP_ENTRYLK || local->op == GF_FOP_FENTRYLK) {
+        local->cont.entrylk.cmd = local->cont.entrylk.in_cmd;
+        lock_xdata = &local->cont.entrylk.xdata;
+    }
+
+    if (lock_xdata) {
+        if (*lock_xdata)
+            dict_unref(*lock_xdata);
+        *lock_xdata = NULL;
+        if (local->xdata_req)
+            *lock_xdata = dict_ref(local->xdata_req);
+    }
+
+    afr_serialized_lock_wind(frame, frame->this);
+}
+
+/*
+ * Per-child callback for the unlocks wound by
+ * afr_unlock_locks_and_proceed().
+ *
+ * A failure other than ENOTCONN is logged together with the gfid, the fop,
+ * the child name and the lock owner. When the last reply is in, the fop
+ * continues through afr_fop_lock_proceed(). Always returns 0.
+ */
+static int32_t
+afr_unlock_partial_lock_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+                            int32_t op_ret, int32_t op_errno, dict_t *xdata)
+
+{
+    afr_local_t *local = frame->local;
+    afr_private_t *priv = this->private;
+    int child = (long)cookie;
+    uuid_t gfid = {0};
+
+    if (op_ret < 0 && op_errno != ENOTCONN) {
+        /* Take the gfid from the fd when there is one, else from the loc. */
+        if (!local->fd)
+            loc_gfid(&local->loc, gfid);
+        else
+            gf_uuid_copy(gfid, local->fd->inode->gfid);
+
+        gf_msg(this->name, GF_LOG_ERROR, op_errno, AFR_MSG_UNLOCK_FAIL,
+               "%s: Failed to unlock %s on %s "
+               "with lk_owner: %s",
+               uuid_utoa(gfid), gf_fop_list[local->op],
+               priv->children[child]->name,
+               lkowner_utoa(&frame->root->lk_owner));
+    }
+
+    if (afr_frame_return(frame) == 0)
+        afr_fop_lock_proceed(frame);
+
+    return 0;
+}
+
+/*
+ * Release the locks that an attempt did acquire, then continue the fop.
+ *
+ * @call_count is the number of children holding the lock. With none, the
+ * fop continues right away through afr_fop_lock_proceed(). Otherwise the
+ * stored request is turned into an unlock and wound to every child whose
+ * recorded reply is valid and did not fail; afr_unlock_partial_lock_cbk()
+ * continues the fop once all of them have answered. Always returns 0.
+ */
+static int32_t
+afr_unlock_locks_and_proceed(call_frame_t *frame, xlator_t *this,
+                             int call_count)
+{
+    afr_local_t *local = frame->local;
+    afr_private_t *priv = this->private;
+    int child = 0;
+
+    if (call_count == 0) {
+        afr_fop_lock_proceed(frame);
+        return 0;
+    }
+
+    local->call_count = call_count;
+
+    /* Rewrite the stored request into its unlock form. */
+    if (local->op == GF_FOP_INODELK || local->op == GF_FOP_FINODELK) {
+        local->cont.inodelk.flock.l_type = F_UNLCK;
+        local->cont.inodelk.cmd = F_SETLK;
+        if (local->cont.inodelk.xdata)
+            dict_unref(local->cont.inodelk.xdata);
+        local->cont.inodelk.xdata = NULL;
+    } else if (local->op == GF_FOP_ENTRYLK || local->op == GF_FOP_FENTRYLK) {
+        local->cont.entrylk.cmd = ENTRYLK_UNLOCK;
+        if (local->cont.entrylk.xdata)
+            dict_unref(local->cont.entrylk.xdata);
+        local->cont.entrylk.xdata = NULL;
+    }
+
+    for (child = 0; child < priv->child_count; child++) {
+        if (!local->replies[child].valid ||
+            local->replies[child].op_ret == -1)
+            continue;
+
+        afr_fop_lock_wind(frame, this, child, afr_unlock_partial_lock_cbk);
+
+        /* Stop as soon as the expected number of calls has been wound. */
+        if (--call_count == 0)
+            break;
+    }
+
+    return 0;
+}
+
+/*
+ * Evaluate the replies of a lock attempt and decide how to go on.
+ *
+ * The replies are folded into local->op_ret / local->op_errno, counting the
+ * children on which the fop succeeded. Unlock requests are unwound directly.
+ * For lock requests, a conflicting lock (EAGAIN) or - when a quorum count is
+ * configured - missing quorum among the successful children causes the locks
+ * taken so far to be released via afr_unlock_locks_and_proceed(); otherwise
+ * the fop is unwound with the folded result. Always returns 0.
+ */
+int32_t
+afr_fop_lock_done(call_frame_t *frame, xlator_t *this)
+{
+    int i = 0;
+    int lock_count = 0;
+    unsigned char *success = NULL;
+
+    afr_local_t *local = NULL;
+    afr_private_t *priv = NULL;
+
+    local = frame->local;
+    priv = this->private;
+    success = alloca0(priv->child_count);
+
+    for (i = 0; i < priv->child_count; i++) {
+        if (!local->replies[i].valid)
+            continue;
+
+        /* Successes are counted even after EAGAIN has been latched below,
+         * since those locks still have to be released. */
+        if (local->replies[i].op_ret == 0) {
+            lock_count++;
+            success[i] = 1;
+        }
+
+        /* Once the overall result is EAGAIN, later replies no longer
+         * change it. */
+        if (local->op_ret == -1 && local->op_errno == EAGAIN)
+            continue;
+
+        if ((local->replies[i].op_ret == -1) &&
+            (local->replies[i].op_errno == EAGAIN)) {
+            local->op_ret = -1;
+            local->op_errno = EAGAIN;
+            continue;
+        }
+
+        if (local->replies[i].op_ret == 0)
+            local->op_ret = 0;
+
+        /* op_errno follows the most recent reply handled here. */
+        local->op_errno = local->replies[i].op_errno;
+    }
+
+    if (afr_fop_lock_is_unlock(frame))
+        goto unwind;
+
+    if (afr_is_conflicting_lock_present(local->op_ret, local->op_errno)) {
+        afr_unlock_locks_and_proceed(frame, this, lock_count);
+    } else if (priv->quorum_count && !afr_has_quorum(success, this, NULL)) {
+        local->fop_lock_state = AFR_FOP_LOCK_QUORUM_FAILED;
+        local->op_ret = -1;
+        local->op_errno = afr_final_errno(local, priv);
+        /* Fall back to the quorum errno when no reply carried an error. */
+        if (local->op_errno == 0)
+            local->op_errno = afr_quorum_errno(priv);
+        afr_unlock_locks_and_proceed(frame, this, lock_count);
+    } else {
+        goto unwind;
+    }
+
+    return 0;
+unwind:
+    afr_fop_lock_unwind(frame, local->op, local->op_ret, local->op_errno,
+                        local->xdata_rsp);
+    return 0;
+}
+
+/*
+ * Reply bookkeeping shared by the serialized and parallel lock callbacks.
+ *
+ * Stores the child's result in local->replies[] (index taken from @cookie).
+ * For a successful reply carrying xdata, a reference is kept in the reply
+ * and, if none is set yet, in local->xdata_rsp (under frame->lock). Always
+ * returns 0.
+ */
+static int
+afr_common_lock_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+                    int32_t op_ret, int32_t op_errno, dict_t *xdata)
+{
+    afr_local_t *local = frame->local;
+    int child = (long)cookie;
+
+    local->replies[child].valid = 1;
+    local->replies[child].op_ret = op_ret;
+    local->replies[child].op_errno = op_errno;
+
+    if (op_ret != 0 || !xdata)
+        return 0;
+
+    local->replies[child].xdata = dict_ref(xdata);
+
+    LOCK(&frame->lock);
+    {
+        if (local->xdata_rsp == NULL)
+            local->xdata_rsp = dict_ref(xdata);
+    }
+    UNLOCK(&frame->lock);
+
+    return 0;
+}
+
+/*
+ * Callback for a serialized lock attempt on one child.
+ *
+ * After recording the reply, the next child marked up is located. The
+ * attempt is finished through afr_fop_lock_done() when this reply reported
+ * a conflicting lock or no further child is up; otherwise the lock is wound
+ * to that next child. Always returns 0.
+ */
+static int32_t
+afr_serialized_lock_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+                        int32_t op_ret, int32_t op_errno, dict_t *xdata)
+
+{
+    afr_local_t *local = frame->local;
+    afr_private_t *priv = this->private;
+    int child_index = (long)cookie;
+    int next_child = 0;
+
+    afr_common_lock_cbk(frame, cookie, this, op_ret, op_errno, xdata);
+
+    next_child = child_index + 1;
+    while (next_child < priv->child_count && !local->child_up[next_child])
+        next_child++;
+
+    if (!afr_is_conflicting_lock_present(op_ret, op_errno) &&
+        (next_child != priv->child_count)) {
+        afr_fop_lock_wind(frame, this, next_child, afr_serialized_lock_cbk);
+        return 0;
+    }
+
+    afr_fop_lock_done(frame, this);
+
+    return 0;
+}
+
+/*
+ * Start a serialized lock by winding it to the first child that is up,
+ * with afr_serialized_lock_cbk as the callback. Nothing is wound when no
+ * child is up. Always returns 0.
+ */
+static int
+afr_serialized_lock_wind(call_frame_t *frame, xlator_t *this)
+{
+    afr_private_t *priv = this->private;
+    afr_local_t *local = frame->local;
+    int first_up = 0;
+
+    while (first_up < priv->child_count && !local->child_up[first_up])
+        first_up++;
+
+    if (first_up < priv->child_count)
+        afr_fop_lock_wind(frame, this, first_up, afr_serialized_lock_cbk);
+
+    return 0;
+}
+
+/*
+ * Callback for a parallel lock wind. Records the reply and, once
+ * afr_frame_return() returns 0, finishes the lock fop. Always returns 0.
+ */
+static int32_t
+afr_parallel_lock_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+                      int32_t op_ret, int32_t op_errno, dict_t *xdata)
+
+{
+    afr_common_lock_cbk(frame, cookie, this, op_ret, op_errno, xdata);
+
+    if (afr_frame_return(frame) != 0)
+        return 0;
+
+    afr_fop_lock_done(frame, this);
+    return 0;
+}
+
+/*
+ * Wind the lock fop to every child that is up, with afr_parallel_lock_cbk
+ * as the callback. The loop stops after local->call_count winds have been
+ * issued. Always returns 0.
+ */
+static int
+afr_parallel_lock_wind(call_frame_t *frame, xlator_t *this)
+{
+    afr_private_t *priv = this->private;
+    afr_local_t *local = frame->local;
+    int pending = local->call_count;
+    int child = 0;
+
+    for (child = 0; child < priv->child_count; child++) {
+        if (local->child_up[child]) {
+            afr_fop_lock_wind(frame, this, child, afr_parallel_lock_cbk);
+            /* Stop once the expected number of winds has been issued. */
+            if (--pending == 0)
+                break;
+        }
+    }
+    return 0;
+}
+
+/*
+ * Prepare and launch an inodelk/entrylk family fop whose arguments are
+ * already stored in frame->local.
+ *
+ * For anything other than an unlock, consistent I/O must be possible and
+ * the command sent to the children is overridden with F_SETLK (inodelk) or
+ * ENTRYLK_LOCK_NB (entrylk). The request xdata, if any, is referenced into
+ * the per-fop container, and the lock is then wound to the children in
+ * parallel.
+ *
+ * Returns 0 after winding, or a negative errno when consistent I/O is not
+ * possible (nothing is wound in that case).
+ */
+static int
+afr_fop_handle_lock(call_frame_t *frame, xlator_t *this)
+{
+    afr_local_t *local = frame->local;
+    int op_errno = 0;
+    gf_boolean_t is_inodelk = (local->op == GF_FOP_INODELK ||
+                               local->op == GF_FOP_FINODELK);
+    gf_boolean_t is_entrylk = (local->op == GF_FOP_ENTRYLK ||
+                               local->op == GF_FOP_FENTRYLK);
+
+    if (!afr_fop_lock_is_unlock(frame)) {
+        if (!afr_is_consistent_io_possible(local, this->private, &op_errno))
+            return -op_errno;
+
+        if (is_inodelk)
+            local->cont.inodelk.cmd = F_SETLK;
+        else if (is_entrylk)
+            local->cont.entrylk.cmd = ENTRYLK_LOCK_NB;
+    }
+
+    if (local->xdata_req) {
+        if (is_inodelk)
+            local->cont.inodelk.xdata = dict_ref(local->xdata_req);
+        else if (is_entrylk)
+            local->cont.entrylk.xdata = dict_ref(local->xdata_req);
+    }
+
+    local->fop_lock_state = AFR_FOP_LOCK_PARALLEL;
+    afr_parallel_lock_wind(frame, this);
+
+    return -op_errno;
+}
+
+/*
+ * Common implementation of the inodelk and finodelk fops.
+ *
+ * @fop:    GF_FOP_INODELK or GF_FOP_FINODELK, as passed by the wrappers.
+ * @volume: lock domain name; duplicated into the local.
+ * @loc:    location for the path-based variant, or NULL.
+ * @fd:     file descriptor for the fd-based variant, or NULL.
+ * @cmd:    lock command from the caller.
+ * @flock:  lock description from the caller; must not be NULL.
+ * @xdata:  optional request dict; a reference is kept in local->xdata_req.
+ *
+ * On any setup failure the fop is unwound with -1 and the errno. Always
+ * returns 0.
+ */
+static int32_t
+afr_handle_inodelk(call_frame_t *frame, xlator_t *this, glusterfs_fop_t fop,
+                   const char *volume, loc_t *loc, fd_t *fd, int32_t cmd,
+                   struct gf_flock *flock, dict_t *xdata)
+{
+    afr_local_t *local = NULL;
+    int32_t op_errno = ENOMEM;
+
+    local = AFR_FRAME_INIT(frame, op_errno);
+    if (!local)
+        goto out;
+
+    local->op = fop;
+    if (loc)
+        loc_copy(&local->loc, loc);
+    if (fd) {
+        /* Only new locks are refused on an invalid fd context. An unlock
+         * skips that check but must still record the fd, otherwise the
+         * request carries no fd at all (afr_lk treats unlocks the same
+         * way). */
+        if (flock->l_type != F_UNLCK) {
+            AFR_ERROR_OUT_IF_FDCTX_INVALID(fd, this, op_errno, out);
+        }
+        local->fd = fd_ref(fd);
+    }
+
+    local->cont.inodelk.volume = gf_strdup(volume);
+    if (!local->cont.inodelk.volume) {
+        op_errno = ENOMEM;
+        goto out;
+    }
+
+    /* in_cmd/in_flock keep the caller's request; cmd/flock are the working
+     * copies (afr_fop_handle_lock may override cmd). */
+    local->cont.inodelk.in_cmd = cmd;
+    local->cont.inodelk.cmd = cmd;
+    local->cont.inodelk.in_flock = *flock;
+    local->cont.inodelk.flock = *flock;
+    if (xdata)
+        local->xdata_req = dict_ref(xdata);
+
+    op_errno = -afr_fop_handle_lock(frame, frame->this);
+    if (op_errno)
+        goto out;
+    return 0;
+out:
+    afr_fop_lock_unwind(frame, fop, -1, op_errno, NULL);
+
+    return 0;
+}
+
+/*
+ * inodelk fop entry point: hands the path-based request to
+ * afr_handle_inodelk() with no fd. Always returns 0.
+ */
+int32_t
+afr_inodelk(call_frame_t *frame, xlator_t *this, const char *volume, loc_t *loc,
+            int32_t cmd, struct gf_flock *flock, dict_t *xdata)
+{
+    fd_t *no_fd = NULL;
+
+    afr_handle_inodelk(frame, this, GF_FOP_INODELK, volume, loc, no_fd, cmd,
+                       flock, xdata);
+
+    return 0;
+}
+
+/*
+ * finodelk fop entry point: hands the fd-based request to
+ * afr_handle_inodelk() with no loc. Always returns 0.
+ */
+int32_t
+afr_finodelk(call_frame_t *frame, xlator_t *this, const char *volume, fd_t *fd,
+             int32_t cmd, struct gf_flock *flock, dict_t *xdata)
+{
+    loc_t *no_loc = NULL;
+
+    afr_handle_inodelk(frame, this, GF_FOP_FINODELK, volume, no_loc, fd, cmd,
+                       flock, xdata);
+
+    return 0;
+}
+
+/*
+ * Common implementation of the entrylk and fentrylk fops.
+ *
+ * @fop:      GF_FOP_ENTRYLK or GF_FOP_FENTRYLK, as passed by the wrappers.
+ * @volume:   lock domain name; duplicated into the local.
+ * @loc:      location for the path-based variant, or NULL.
+ * @fd:       file descriptor for the fd-based variant, or NULL.
+ * @basename: entry name to lock; duplicated into the local.
+ * @cmd:      entrylk command from the caller.
+ * @type:     entrylk type from the caller.
+ * @xdata:    optional request dict; a reference is kept in
+ *            local->xdata_req.
+ *
+ * On any setup failure the fop is unwound with -1 and the errno. Always
+ * returns 0.
+ */
+static int
+afr_handle_entrylk(call_frame_t *frame, xlator_t *this, glusterfs_fop_t fop,
+                   const char *volume, loc_t *loc, fd_t *fd,
+                   const char *basename, entrylk_cmd cmd, entrylk_type type,
+                   dict_t *xdata)
+{
+    afr_local_t *local = NULL;
+    int32_t op_errno = ENOMEM;
+
+    local = AFR_FRAME_INIT(frame, op_errno);
+    if (!local)
+        goto out;
+
+    local->op = fop;
+    if (loc)
+        loc_copy(&local->loc, loc);
+    if (fd) {
+        /* Only new locks are refused on an invalid fd context. An unlock
+         * skips that check but must still record the fd, otherwise the
+         * request carries no fd at all (afr_lk treats unlocks the same
+         * way). */
+        if (cmd != ENTRYLK_UNLOCK) {
+            AFR_ERROR_OUT_IF_FDCTX_INVALID(fd, this, op_errno, out);
+        }
+        local->fd = fd_ref(fd);
+    }
+    /* in_cmd keeps the caller's command; cmd is the working copy
+     * (afr_fop_handle_lock may override it). */
+    local->cont.entrylk.cmd = cmd;
+    local->cont.entrylk.in_cmd = cmd;
+    local->cont.entrylk.type = type;
+    local->cont.entrylk.volume = gf_strdup(volume);
+    local->cont.entrylk.basename = gf_strdup(basename);
+    /* NOTE(review): a NULL result for basename is reported as ENOMEM; if
+     * callers may legitimately pass a NULL basename, confirm how
+     * gf_strdup() treats NULL input. */
+    if (!local->cont.entrylk.volume || !local->cont.entrylk.basename) {
+        op_errno = ENOMEM;
+        goto out;
+    }
+    if (xdata)
+        local->xdata_req = dict_ref(xdata);
+    op_errno = -afr_fop_handle_lock(frame, frame->this);
+    if (op_errno)
+        goto out;
+
+    return 0;
+out:
+    afr_fop_lock_unwind(frame, fop, -1, op_errno, NULL);
+    return 0;
+}
+
+/*
+ * entrylk fop entry point: hands the path-based request to
+ * afr_handle_entrylk() with no fd. Always returns 0.
+ */
+int
+afr_entrylk(call_frame_t *frame, xlator_t *this, const char *volume, loc_t *loc,
+            const char *basename, entrylk_cmd cmd, entrylk_type type,
+            dict_t *xdata)
+{
+    fd_t *no_fd = NULL;
+
+    afr_handle_entrylk(frame, this, GF_FOP_ENTRYLK, volume, loc, no_fd,
+                       basename, cmd, type, xdata);
+
+    return 0;
+}
+
+/*
+ * fentrylk fop entry point: hands the fd-based request to
+ * afr_handle_entrylk() with no loc. Always returns 0.
+ */
+int
+afr_fentrylk(call_frame_t *frame, xlator_t *this, const char *volume, fd_t *fd,
+             const char *basename, entrylk_cmd cmd, entrylk_type type,
+             dict_t *xdata)
+{
+    loc_t *no_loc = NULL;
+
+    afr_handle_entrylk(frame, this, GF_FOP_FENTRYLK, volume, no_loc, fd,
+                       basename, cmd, type, xdata);
+
+    return 0;
+}
+
+/*
+ * Callback for statfs. Among the successful replies, the one with the
+ * smallest f_bavail is kept (together with its xdata) as the answer. A
+ * failed reply only records its errno. The fop is unwound when the last
+ * outstanding reply arrives. Always returns 0.
+ */
+int
+afr_statfs_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int op_ret,
+               int op_errno, struct statvfs *statvfs, dict_t *xdata)
+{
+    afr_local_t *local = frame->local;
+    struct statvfs *best = NULL;
+    int remaining = 0;
+
+    LOCK(&frame->lock);
+    {
+        if (op_ret != 0) {
+            local->op_errno = op_errno;
+        } else {
+            local->op_ret = op_ret;
+            best = &local->cont.statfs.buf;
+
+            if (!local->cont.statfs.buf_set) {
+                /* First successful reply: take it as is. */
+                *best = *statvfs;
+                local->cont.statfs.buf_set = 1;
+                if (xdata)
+                    local->xdata_rsp = dict_ref(xdata);
+            } else if (statvfs->f_bavail < best->f_bavail) {
+                /* A child with less available space replaces the answer. */
+                *best = *statvfs;
+                if (xdata) {
+                    if (local->xdata_rsp)
+                        dict_unref(local->xdata_rsp);
+                    local->xdata_rsp = dict_ref(xdata);
+                }
+            }
+        }
+
+        remaining = --local->call_count;
+    }
+    UNLOCK(&frame->lock);
+
+    if (remaining == 0)
+        AFR_STACK_UNWIND(statfs, frame, local->op_ret, local->op_errno,
+                         &local->cont.statfs.buf, local->xdata_rsp);
+
+    return 0;
+}
+
+/*
+ * statfs fop entry point. Winds statfs to every child that is up except
+ * the arbiter brick; the replies are merged by afr_statfs_cbk. If setup
+ * fails or no child remains to be queried, the fop is unwound with -1.
+ * Always returns 0.
+ */
+int
+afr_statfs(call_frame_t *frame, xlator_t *this, loc_t *loc, dict_t *xdata)
+{
+    afr_private_t *priv = this->private;
+    afr_local_t *local = NULL;
+    int32_t op_errno = ENOMEM;
+    int pending = 0;
+    int child = 0;
+
+    local = AFR_FRAME_INIT(frame, op_errno);
+    if (!local)
+        goto out;
+
+    local->op = GF_FOP_STATFS;
+    if (!afr_is_consistent_io_possible(local, priv, &op_errno))
+        goto out;
+
+    /* The arbiter brick is skipped below, so do not wait for its reply. */
+    if (priv->arbiter_count == 1 && local->child_up[ARBITER_BRICK_INDEX])
+        local->call_count--;
+
+    pending = local->call_count;
+    if (pending == 0) {
+        op_errno = ENOTCONN;
+        goto out;
+    }
+
+    for (child = 0; child < priv->child_count; child++) {
+        if (!local->child_up[child] || AFR_IS_ARBITER_BRICK(priv, child))
+            continue;
+
+        STACK_WIND(frame, afr_statfs_cbk, priv->children[child],
+                   priv->children[child]->fops->statfs, loc, xdata);
+        if (--pending == 0)
+            break;
+    }
+
+    return 0;
+out:
+    AFR_STACK_UNWIND(statfs, frame, -1, op_errno, NULL, NULL);
+
+    return 0;
+}
+
+/*
+ * Callback for the unlocks issued by afr_lk_unlock(). Failures other than
+ * ENOTCONN and EBADFD are logged. When the last unlock returns, the lk fop
+ * is unwound with the result already stored in the local. Always returns 0.
+ */
+int32_t
+afr_lk_unlock_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+                  int32_t op_ret, int32_t op_errno, struct gf_flock *lock,
+                  dict_t *xdata)
+{
+    afr_local_t *local = frame->local;
+    afr_private_t *priv = this->private;
+    int idx = (long)cookie;
+    gf_boolean_t quiet_errno = (op_errno == ENOTCONN || op_errno == EBADFD);
+
+    if (op_ret < 0 && !quiet_errno) {
+        gf_msg(this->name, GF_LOG_ERROR, op_errno, AFR_MSG_UNLOCK_FAIL,
+               "gfid=%s: unlock failed on subvolume %s "
+               "with lock owner %s",
+               uuid_utoa(local->fd->inode->gfid), priv->children[idx]->name,
+               lkowner_utoa(&frame->root->lk_owner));
+    }
+
+    if (afr_frame_return(frame) == 0) {
+        AFR_STACK_UNWIND(lk, frame, local->op_ret, local->op_errno, NULL,
+                         local->xdata_rsp);
+    }
+
+    return 0;
+}
+
+/*
+ * Undo a partially acquired posix lock: send F_UNLCK (via F_SETLK) to every
+ * child marked in local->cont.lk.locked_nodes. If no child holds the lock,
+ * the lk fop is unwound immediately with the result stored in the local;
+ * otherwise afr_lk_unlock_cbk unwinds it. Always returns 0.
+ */
+int32_t
+afr_lk_unlock(call_frame_t *frame, xlator_t *this)
+{
+    afr_local_t *local = frame->local;
+    afr_private_t *priv = this->private;
+    int pending = 0;
+    int child = 0;
+
+    pending = afr_locked_nodes_count(local->cont.lk.locked_nodes,
+                                     priv->child_count);
+    if (pending == 0) {
+        AFR_STACK_UNWIND(lk, frame, local->op_ret, local->op_errno, NULL,
+                         local->xdata_rsp);
+        return 0;
+    }
+
+    local->call_count = pending;
+    local->cont.lk.user_flock.l_type = F_UNLCK;
+
+    for (child = 0; child < priv->child_count; child++) {
+        if (!local->cont.lk.locked_nodes[child])
+            continue;
+
+        STACK_WIND_COOKIE(frame, afr_lk_unlock_cbk, (void *)(long)child,
+                          priv->children[child],
+                          priv->children[child]->fops->lk, local->fd, F_SETLK,
+                          &local->cont.lk.user_flock, NULL);
+
+        if (--pending == 0)
+            break;
+    }
+
+    return 0;
+}
+
+/*
+ * Callback for the serialized lk wind started by afr_lk(). The cookie
+ * carries the index of the child that replied.
+ *
+ * - EAGAIN from a child aborts the sequence: locks taken so far are
+ *   released through afr_lk_unlock() and EAGAIN is reported.
+ * - Otherwise the lk is wound to the next child index until all children
+ *   have been tried.
+ * - After the last child, a quorum check on the locked children (when
+ *   quorum is configured) decides between unlocking and unwinding.
+ *
+ * Always returns 0.
+ */
+int32_t
+afr_lk_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret,
+           int32_t op_errno, struct gf_flock *lock, dict_t *xdata)
+{
+    afr_local_t *local = frame->local;
+    afr_private_t *priv = this->private;
+    int idx = (long)cookie;
+    int next = idx + 1;
+
+    afr_common_lock_cbk(frame, cookie, this, op_ret, op_errno, xdata);
+
+    if (op_ret < 0 && op_errno == EAGAIN) {
+        local->op_ret = -1;
+        local->op_errno = EAGAIN;
+
+        afr_lk_unlock(frame, this);
+        return 0;
+    }
+
+    if (op_ret == 0) {
+        local->op_ret = 0;
+        local->op_errno = 0;
+        local->cont.lk.locked_nodes[idx] = 1;
+        local->cont.lk.ret_flock = *lock;
+    }
+
+    if (next < priv->child_count) {
+        STACK_WIND_COOKIE(frame, afr_lk_cbk, (void *)(long)next,
+                          priv->children[next], priv->children[next]->fops->lk,
+                          local->fd, local->cont.lk.cmd,
+                          &local->cont.lk.user_flock, local->xdata_req);
+        return 0;
+    }
+
+    if (priv->quorum_count &&
+        !afr_has_quorum(local->cont.lk.locked_nodes, this, NULL)) {
+        local->op_ret = -1;
+        local->op_errno = afr_final_errno(local, priv);
+
+        afr_lk_unlock(frame, this);
+        return 0;
+    }
+
+    if (local->op_ret < 0)
+        local->op_errno = afr_final_errno(local, priv);
+
+    AFR_STACK_UNWIND(lk, frame, local->op_ret, local->op_errno,
+                     &local->cont.lk.ret_flock, local->xdata_rsp);
+
+    return 0;
+}
+
+/*
+ * Completion callback handed to synctask_new() for afr_lk_transaction().
+ * It does nothing; the lk fop is unwound by afr_lk_transaction() itself.
+ */
+int
+afr_lk_transaction_cbk(int ret, call_frame_t *frame, void *opaque)
+{
+    return 0;
+}
+
+/*
+ * Callback for the lk winds issued by afr_lk_transaction(). Records the
+ * reply, marks the child as locked on success, and wakes the barrier in
+ * the local. Always returns 0.
+ */
+int
+afr_lk_txn_wind_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+                    int32_t op_ret, int32_t op_errno, struct gf_flock *lock,
+                    dict_t *xdata)
+{
+    afr_local_t *local = frame->local;
+    int idx = (long)cookie;
+
+    afr_common_lock_cbk(frame, cookie, this, op_ret, op_errno, xdata);
+
+    if (op_ret == 0) {
+        local->op_ret = 0;
+        local->op_errno = 0;
+        local->cont.lk.locked_nodes[idx] = 1;
+        local->cont.lk.ret_flock = *lock;
+    }
+
+    syncbarrier_wake(&local->barrier);
+
+    return 0;
+}
+
+/*
+ * Callback for the rollback unlocks issued by afr_lk_transaction(). It
+ * only logs failures other than ENOTCONN and EBADFD. Always returns 0.
+ */
+int
+afr_lk_txn_unlock_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+                      int32_t op_ret, int32_t op_errno, struct gf_flock *lock,
+                      dict_t *xdata)
+{
+    afr_local_t *local = frame->local;
+    afr_private_t *priv = this->private;
+    int idx = (long)cookie;
+
+    if (op_ret >= 0 || op_errno == ENOTCONN || op_errno == EBADFD)
+        return 0;
+
+    gf_msg(this->name, GF_LOG_ERROR, op_errno, AFR_MSG_UNLOCK_FAIL,
+           "gfid=%s: unlock failed on subvolume %s "
+           "with lock owner %s",
+           uuid_utoa(local->fd->inode->gfid), priv->children[idx]->name,
+           lkowner_utoa(&frame->root->lk_owner));
+
+    return 0;
+}
+
+/*
+ * Synctask body that performs an lk fop as a transaction.
+ *
+ * Steps: refuse anything but a plain replica 3 volume; take the domain
+ * lock via afr_dom_lock_acquire(); wind the lk to every child that is up
+ * and holds the domain lock; then, if quorum allows, update the saved
+ * locks (afr_add_lock_to_saved_locks(), or
+ * afr_remove_lock_from_saved_locks() for F_UNLCK) and unwind with the
+ * result. If the lk did not reach quorum or the saved-locks update failed,
+ * the locks that were taken are released again and the fop is unwound with
+ * an error.
+ *
+ * @opaque: the call frame of the lk fop.
+ *
+ * Returns 0 when the fop was unwound on the success path, -1 otherwise.
+ */
+int
+afr_lk_transaction(void *opaque)
+{
+    call_frame_t *frame = NULL;
+    xlator_t *this = NULL;
+    afr_private_t *priv = NULL;
+    afr_local_t *local = NULL;
+    char *wind_on = NULL;
+    int op_errno = 0;
+    int i = 0;
+    int ret = 0;
+
+    frame = (call_frame_t *)opaque;
+    local = frame->local;
+    this = frame->this;
+    priv = this->private;
+    wind_on = alloca0(priv->child_count);
+
+    if (priv->arbiter_count || priv->child_count != 3) {
+        op_errno = ENOTSUP;
+        gf_msg(frame->this->name, GF_LOG_ERROR, op_errno, AFR_MSG_LK_HEAL_DOM,
+               "%s: Lock healing supported only for replica 3 volumes.",
+               uuid_utoa(local->fd->inode->gfid));
+        goto err;
+    }
+
+    op_errno = -afr_dom_lock_acquire(frame);  // Released during
+                                              // AFR_STACK_UNWIND
+    if (op_errno != 0) {
+        goto err;
+    }
+    if (priv->quorum_count &&
+        !afr_has_quorum(local->cont.lk.dom_locked_nodes, this, NULL)) {
+        op_errno = afr_final_errno(local, priv);
+        goto err;
+    }
+
+    /* Only children that are up and hold the domain lock get the lk. */
+    for (i = 0; i < priv->child_count; i++) {
+        if (priv->child_up[i] && local->cont.lk.dom_locked_nodes[i])
+            wind_on[i] = 1;
+    }
+    AFR_ONLIST(wind_on, frame, afr_lk_txn_wind_cbk, lk, local->fd,
+               local->cont.lk.cmd, &local->cont.lk.user_flock,
+               local->xdata_req);
+
+    if (priv->quorum_count &&
+        !afr_has_quorum(local->cont.lk.locked_nodes, this, NULL)) {
+        local->op_ret = -1;
+        local->op_errno = afr_final_errno(local, priv);
+        goto unlock;
+    } else {
+        if (local->cont.lk.user_flock.l_type == F_UNLCK)
+            ret = afr_remove_lock_from_saved_locks(local, this);
+        else
+            ret = afr_add_lock_to_saved_locks(frame, this);
+        if (ret) {
+            local->op_ret = -1;
+            local->op_errno = -ret;
+            goto unlock;
+        }
+        AFR_STACK_UNWIND(lk, frame, local->op_ret, local->op_errno,
+                         &local->cont.lk.ret_flock, local->xdata_rsp);
+    }
+
+    return 0;
+
+unlock:
+    /* Both jumps here stored the failure reason in local->op_errno; carry
+     * it into op_errno so the unwind below does not report -1 with an
+     * errno of 0. */
+    op_errno = local->op_errno;
+    local->cont.lk.user_flock.l_type = F_UNLCK;
+    AFR_ONLIST(local->cont.lk.locked_nodes, frame, afr_lk_txn_unlock_cbk, lk,
+               local->fd, F_SETLK, &local->cont.lk.user_flock, NULL);
+err:
+    AFR_STACK_UNWIND(lk, frame, -1, op_errno, NULL, NULL);
+    return -1;
+}
+
+/*
+ * lk (posix lock) fop entry point.
+ *
+ * For anything other than an unlock, the fd context must be valid and
+ * consistent I/O must be possible. The request is saved in the local and
+ * then handled in one of two ways: when afr_is_lock_mode_mandatory(xdata)
+ * holds, afr_lk_transaction() is started as a synctask; otherwise the lk
+ * is wound to child 0 and afr_lk_cbk continues with the other children.
+ *
+ * On setup failure the fop is unwound with -1. Always returns 0.
+ */
+int
+afr_lk(call_frame_t *frame, xlator_t *this, fd_t *fd, int32_t cmd,
+       struct gf_flock *flock, dict_t *xdata)
+{
+    afr_private_t *priv = this->private;
+    afr_local_t *local = NULL;
+    int32_t op_errno = ENOMEM;
+    int first = 0;
+    int ret = 0;
+
+    local = AFR_FRAME_INIT(frame, op_errno);
+    if (!local)
+        goto out;
+
+    local->op = GF_FOP_LK;
+    if (!afr_lk_is_unlock(cmd, flock)) {
+        AFR_ERROR_OUT_IF_FDCTX_INVALID(fd, this, op_errno, out);
+        if (!afr_is_consistent_io_possible(local, priv, &op_errno))
+            goto out;
+    }
+
+    local->cont.lk.locked_nodes = GF_CALLOC(
+        priv->child_count, sizeof(*local->cont.lk.locked_nodes),
+        gf_afr_mt_char);
+    if (local->cont.lk.locked_nodes == NULL) {
+        op_errno = ENOMEM;
+        goto out;
+    }
+
+    local->fd = fd_ref(fd);
+    local->cont.lk.cmd = cmd;
+    local->cont.lk.user_flock = *flock;
+    local->cont.lk.ret_flock = *flock;
+    if (xdata)
+        local->xdata_req = dict_ref(xdata);
+
+    if (!afr_is_lock_mode_mandatory(xdata)) {
+        /* Serialized wind: start with the first child. */
+        STACK_WIND_COOKIE(frame, afr_lk_cbk, (void *)(long)0,
+                          priv->children[first],
+                          priv->children[first]->fops->lk, fd, cmd, flock,
+                          local->xdata_req);
+        return 0;
+    }
+
+    ret = synctask_new(this->ctx->env, afr_lk_transaction,
+                       afr_lk_transaction_cbk, frame, frame);
+    if (ret) {
+        op_errno = ENOMEM;
+        goto out;
+    }
+
+    return 0;
+out:
+    AFR_STACK_UNWIND(lk, frame, -1, op_errno, NULL, NULL);
+
+    return 0;
+}
+
+/*
+ * Callback for the lease releases issued by afr_lease_unlock(). When the
+ * last release returns, the lease fop is unwound with the result stored in
+ * the local and with the lease and xdata of this final reply. Always
+ * returns 0.
+ */
+int32_t
+afr_lease_unlock_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+                     int32_t op_ret, int32_t op_errno, struct gf_lease *lease,
+                     dict_t *xdata)
+{
+    afr_local_t *local = frame->local;
+
+    if (afr_frame_return(frame) != 0)
+        return 0;
+
+    AFR_STACK_UNWIND(lease, frame, local->op_ret, local->op_errno, lease,
+                     xdata);
+
+    return 0;
+}
+
+/*
+ * Undo a partially acquired lease: send GF_UNLK_LEASE to every child marked
+ * in local->cont.lease.locked_nodes. If no child holds the lease, the fop
+ * is unwound immediately with the result stored in the local; otherwise
+ * afr_lease_unlock_cbk unwinds it. Always returns 0.
+ */
+int32_t
+afr_lease_unlock(call_frame_t *frame, xlator_t *this)
+{
+    afr_local_t *local = frame->local;
+    afr_private_t *priv = this->private;
+    int pending = 0;
+    int child = 0;
+
+    pending = afr_locked_nodes_count(local->cont.lease.locked_nodes,
+                                     priv->child_count);
+    if (pending == 0) {
+        AFR_STACK_UNWIND(lease, frame, local->op_ret, local->op_errno,
+                         &local->cont.lease.ret_lease, NULL);
+        return 0;
+    }
+
+    local->call_count = pending;
+    local->cont.lease.user_lease.cmd = GF_UNLK_LEASE;
+
+    for (child = 0; child < priv->child_count; child++) {
+        if (!local->cont.lease.locked_nodes[child])
+            continue;
+
+        STACK_WIND(frame, afr_lease_unlock_cbk, priv->children[child],
+                   priv->children[child]->fops->lease, &local->loc,
+                   &local->cont.lease.user_lease, NULL);
+
+        if (--pending == 0)
+            break;
+    }
+
+    return 0;
+}
+
+/*
+ * Callback for the serialized lease wind started by afr_lease(). The
+ * cookie carries the index of the child that replied.
+ *
+ * - EAGAIN from a child aborts the sequence: leases taken so far are
+ *   released through afr_lease_unlock() and EAGAIN is reported.
+ * - Otherwise the lease is wound to the next child index until all
+ *   children have been tried.
+ * - After the last child, a quorum check on the children holding the lease
+ *   (when quorum is configured) decides between releasing and unwinding.
+ *
+ * Always returns 0.
+ */
+int32_t
+afr_lease_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret,
+              int32_t op_errno, struct gf_lease *lease, dict_t *xdata)
+{
+    afr_local_t *local = NULL;
+    afr_private_t *priv = NULL;
+    int child_index = -1;
+
+    local = frame->local;
+    priv = this->private;
+
+    child_index = (long)cookie;
+
+    afr_common_lock_cbk(frame, cookie, this, op_ret, op_errno, xdata);
+    if (op_ret < 0 && op_errno == EAGAIN) {
+        local->op_ret = -1;
+        local->op_errno = EAGAIN;
+
+        afr_lease_unlock(frame, this);
+        return 0;
+    }
+
+    if (op_ret == 0) {
+        local->op_ret = 0;
+        local->op_errno = 0;
+        local->cont.lease.locked_nodes[child_index] = 1;
+        local->cont.lease.ret_lease = *lease;
+    }
+
+    child_index++;
+    if (child_index < priv->child_count) {
+        /* Send the caller's request xdata saved by afr_lease(); the xdata
+         * argument here is the previous child's reply, not the request. */
+        STACK_WIND_COOKIE(frame, afr_lease_cbk, (void *)(long)child_index,
+                          priv->children[child_index],
+                          priv->children[child_index]->fops->lease, &local->loc,
+                          &local->cont.lease.user_lease, local->xdata_req);
+    } else if (priv->quorum_count &&
+               !afr_has_quorum(local->cont.lease.locked_nodes, this, NULL)) {
+        local->op_ret = -1;
+        local->op_errno = afr_final_errno(local, priv);
+
+        afr_lease_unlock(frame, this);
+    } else {
+        if (local->op_ret < 0)
+            local->op_errno = afr_final_errno(local, priv);
+        AFR_STACK_UNWIND(lease, frame, local->op_ret, local->op_errno,
+                         &local->cont.lease.ret_lease, NULL);
+    }
+
+    return 0;
+}
+
+/*
+ * lease fop entry point. Saves the request (loc, lease and xdata) in the
+ * local and winds the lease to child 0; afr_lease_cbk continues with the
+ * remaining children. On setup failure the fop is unwound with -1. Always
+ * returns 0.
+ */
+int
+afr_lease(call_frame_t *frame, xlator_t *this, loc_t *loc,
+          struct gf_lease *lease, dict_t *xdata)
+{
+    afr_private_t *priv = NULL;
+    afr_local_t *local = NULL;
+    int32_t op_errno = ENOMEM;
+
+    priv = this->private;
+
+    local = AFR_FRAME_INIT(frame, op_errno);
+    if (!local)
+        goto out;
+
+    local->op = GF_FOP_LEASE;
+    local->cont.lease.locked_nodes = GF_CALLOC(
+        priv->child_count, sizeof(*local->cont.lease.locked_nodes),
+        gf_afr_mt_char);
+
+    if (!local->cont.lease.locked_nodes) {
+        op_errno = ENOMEM;
+        goto out;
+    }
+
+    loc_copy(&local->loc, loc);
+    local->cont.lease.user_lease = *lease;
+    local->cont.lease.ret_lease = *lease;
+    /* Keep the request xdata so that every child receives the same dict
+     * (afr_lk keeps its request xdata the same way). */
+    if (xdata)
+        local->xdata_req = dict_ref(xdata);
+
+    STACK_WIND_COOKIE(frame, afr_lease_cbk, (void *)(long)0, priv->children[0],
+                      priv->children[0]->fops->lease, loc, lease,
+                      local->xdata_req);
+
+    return 0;
+out:
+    AFR_STACK_UNWIND(lease, frame, -1, op_errno, NULL, NULL);
+
+    return 0;
+}
+
+/*
+ * Callback for the ipc winds issued by afr_ipc(). Each reply is recorded in
+ * local->replies[]; the last reply to arrive computes the overall result
+ * from all recorded replies and unwinds the fop. Always returns 0.
+ */
+int
+afr_ipc_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret,
+            int32_t op_errno, dict_t *xdata)
+{
+    afr_local_t *local = frame->local;
+    afr_private_t *priv = this->private;
+    int idx = (long)cookie;
+    gf_boolean_t saw_failure = _gf_false;
+    gf_boolean_t saw_success = _gf_false;
+    int i = 0;
+
+    local->replies[idx].valid = 1;
+    local->replies[idx].op_ret = op_ret;
+    local->replies[idx].op_errno = op_errno;
+    if (xdata)
+        local->replies[idx].xdata = dict_ref(xdata);
+
+    /* Only the last reply evaluates the overall result. */
+    if (afr_frame_return(frame) != 0)
+        return 0;
+
+    /* If any of the subvolumes failed with other than ENOTCONN
+     * return error else return success unless all the subvolumes
+     * failed.
+     * TODO: In case of failure, we need to unregister the xattrs
+     * from the other subvolumes where it succeeded (once upcall
+     * fixes the Bz-1371622)*/
+    for (i = 0; i < priv->child_count; i++) {
+        if (!local->replies[i].valid)
+            continue;
+
+        if (local->replies[i].op_ret == 0) {
+            saw_success = _gf_true;
+            local->op_ret = 0;
+            local->op_errno = 0;
+            if (!local->xdata_rsp && local->replies[i].xdata)
+                local->xdata_rsp = dict_ref(local->replies[i].xdata);
+            continue;
+        }
+
+        if (local->replies[i].op_ret < 0 &&
+            local->replies[i].op_errno != ENOTCONN) {
+            /* A real failure overrides everything collected so far. */
+            local->op_ret = local->replies[i].op_ret;
+            local->op_errno = local->replies[i].op_errno;
+            if (local->xdata_rsp)
+                dict_unref(local->xdata_rsp);
+            local->xdata_rsp = NULL;
+            if (local->replies[i].xdata)
+                local->xdata_rsp = dict_ref(local->replies[i].xdata);
+            saw_failure = _gf_true;
+            break;
+        }
+    }
+
+    if (!saw_success && !saw_failure) {
+        local->op_ret = -1;
+        local->op_errno = ENOTCONN;
+    }
+
+    AFR_STACK_UNWIND(ipc, frame, local->op_ret, local->op_errno,
+                     local->xdata_rsp);
+
+    return 0;
+}
+
+/*
+ * ipc fop entry point.
+ *
+ * Any op other than GF_IPC_TARGET_UPCALL is passed straight to the first
+ * child with the default callback. For GF_IPC_TARGET_UPCALL, every
+ * priv->pending_key is set to 0 in xdata (the caller's dict is modified)
+ * and the request is wound to every child that is up; afr_ipc_cbk merges
+ * the replies.
+ *
+ * On failure the fop is unwound with -1; if no errno was set by then, the
+ * current value of errno is used. Always returns 0.
+ */
+int
+afr_ipc(call_frame_t *frame, xlator_t *this, int32_t op, dict_t *xdata)
+{
+    afr_local_t *local = NULL;
+    afr_private_t *priv = NULL;
+    int32_t op_errno = -1;
+    int pending = -1;
+    int child = 0;
+
+    VALIDATE_OR_GOTO(frame, err);
+    VALIDATE_OR_GOTO(this, err);
+
+    if (op != GF_IPC_TARGET_UPCALL) {
+        STACK_WIND(frame, default_ipc_cbk, FIRST_CHILD(this),
+                   FIRST_CHILD(this)->fops->ipc, op, xdata);
+        return 0;
+    }
+
+    VALIDATE_OR_GOTO(this->private, err);
+    priv = this->private;
+
+    local = AFR_FRAME_INIT(frame, op_errno);
+    if (!local)
+        goto err;
+
+    pending = local->call_count;
+
+    if (xdata) {
+        for (child = 0; child < priv->child_count; child++) {
+            if (dict_set_int8(xdata, priv->pending_key[child], 0) < 0)
+                goto err;
+        }
+    }
+
+    for (child = 0; child < priv->child_count; child++) {
+        if (local->child_up[child]) {
+            STACK_WIND_COOKIE(frame, afr_ipc_cbk, (void *)(long)child,
+                              priv->children[child],
+                              priv->children[child]->fops->ipc, op, xdata);
+            if (--pending == 0)
+                break;
+        }
+    }
+    return 0;
+
+err:
+    if (op_errno == -1)
+        op_errno = errno;
+    AFR_STACK_UNWIND(ipc, frame, -1, op_errno, NULL);
+
+    return 0;
+}
+
+/*
+ * forget handler: cancels the split-brain choice timeout for the inode,
+ * removes AFR's context from it and destroys that context if one was set.
+ * Always returns 0.
+ */
+int
+afr_forget(xlator_t *this, inode_t *inode)
+{
+    uint64_t raw_ctx = 0;
+
+    afr_spb_choice_timeout_cancel(this, inode);
+    inode_ctx_del(inode, this, &raw_ctx);
+
+    if (raw_ctx != 0)
+        afr_inode_ctx_destroy((afr_inode_ctx_t *)(uintptr_t)raw_ctx);
+
+    return 0;
+}
+
+/*
+ * Write AFR's private state to the process dump: a section named
+ * "<type>.<name>", per-child status values, the self-heal and quorum
+ * settings and, when a thin arbiter is configured, its state. Always
+ * returns 0.
+ */
+int
+afr_priv_dump(xlator_t *this)
+{
+    afr_private_t *priv = NULL;
+    char key_prefix[GF_DUMP_MAX_BUF_LEN];
+    char key[GF_DUMP_MAX_BUF_LEN];
+    int i = 0;
+
+    GF_ASSERT(this);
+    priv = this->private;
+
+    GF_ASSERT(priv);
+    snprintf(key_prefix, GF_DUMP_MAX_BUF_LEN, "%s.%s", this->type, this->name);
+    gf_proc_dump_add_section("%s", key_prefix);
+    gf_proc_dump_write("child_count", "%u", priv->child_count);
+    for (i = 0; i < priv->child_count; i++) {
+        /* Bounded formatting, matching the key_prefix handling above. */
+        snprintf(key, sizeof(key), "child_up[%d]", i);
+        gf_proc_dump_write(key, "%d", priv->child_up[i]);
+        snprintf(key, sizeof(key), "pending_key[%d]", i);
+        gf_proc_dump_write(key, "%s", priv->pending_key[i]);
+        snprintf(key, sizeof(key), "pending_reads[%d]", i);
+        gf_proc_dump_write(key, "%" PRId64,
+                           GF_ATOMIC_GET(priv->pending_reads[i]));
+        snprintf(key, sizeof(key), "child_latency[%d]", i);
+        gf_proc_dump_write(key, "%" PRId64, priv->child_latency[i]);
+        snprintf(key, sizeof(key), "halo_child_up[%d]", i);
+        gf_proc_dump_write(key, "%d", priv->halo_child_up[i]);
+    }
+    gf_proc_dump_write("data_self_heal", "%d", priv->data_self_heal);
+    gf_proc_dump_write("metadata_self_heal", "%d", priv->metadata_self_heal);
+    gf_proc_dump_write("entry_self_heal", "%d", priv->entry_self_heal);
+    gf_proc_dump_write("read_child", "%d", priv->read_child);
+    gf_proc_dump_write("wait_count", "%u", priv->wait_count);
+    gf_proc_dump_write("heal-wait-queue-length", "%d", priv->heal_wait_qlen);
+    gf_proc_dump_write("heal-waiters", "%d", priv->heal_waiters);
+    gf_proc_dump_write("background-self-heal-count", "%d",
+                       priv->background_self_heal_count);
+    gf_proc_dump_write("healers", "%d", priv->healers);
+    gf_proc_dump_write("read-hash-mode", "%d", priv->hash_mode);
+    gf_proc_dump_write("use-anonymous-inode", "%d", priv->use_anon_inode);
+    if (priv->quorum_count == AFR_QUORUM_AUTO) {
+        gf_proc_dump_write("quorum-type", "auto");
+    } else if (priv->quorum_count == 0) {
+        gf_proc_dump_write("quorum-type", "none");
+    } else {
+        gf_proc_dump_write("quorum-type", "fixed");
+        gf_proc_dump_write("quorum-count", "%d", priv->quorum_count);
+    }
+    /* "up" reports whether the current child_up set satisfies quorum. */
+    gf_proc_dump_write("up", "%u", afr_has_quorum(priv->child_up, this, NULL));
+    if (priv->thin_arbiter_count) {
+        gf_proc_dump_write("ta_child_up", "%d", priv->ta_child_up);
+        gf_proc_dump_write("ta_bad_child_index", "%d",
+                           priv->ta_bad_child_index);
+        gf_proc_dump_write("ta_notify_dom_lock_offset", "%" PRId64,
+                           priv->ta_notify_dom_lock_offset);
+    }
+
+    return 0;
+}
+
+/**
+ * afr_find_child_index - find the child's index in the array of subvolumes
+ * @this: AFR
+ * @child: child
+ *
+ * When a thin arbiter is configured, one slot beyond priv->child_count is
+ * searched as well. If @child is not found, the number of slots searched
+ * is returned.
+ */
+static int
+afr_find_child_index(xlator_t *this, xlator_t *child)
+{
+    afr_private_t *priv = this->private;
+    int limit = priv->child_count;
+    int idx = 0;
+
+    if (priv->thin_arbiter_count)
+        limit += 1;
+
+    while (idx < limit && priv->children[idx] != child)
+        idx++;
+
+    return idx;
+}
+
+/*
+ * Return how many of the priv->child_count children have child_up equal
+ * to 1.
+ */
+int
+__afr_get_up_children_count(afr_private_t *priv)
+{
+    int count = 0;
+    int child = 0;
+
+    for (child = 0; child < priv->child_count; child++) {
+        if (priv->child_up[child] != 1)
+            continue;
+        count++;
+    }
+
+    return count;
+}
+
+/*
+ * Return 1 when every child has a recorded last_event and, if a thin
+ * arbiter is configured, ta_child_up is set; return 0 otherwise.
+ */
+static int
+__get_heard_from_all_status(xlator_t *this)
+{
+    afr_private_t *priv = this->private;
+    int child = 0;
+
+    while (child < priv->child_count) {
+        if (!priv->last_event[child])
+            return 0;
+        child++;
+    }
+
+    if (priv->thin_arbiter_count && !priv->ta_child_up)
+        return 0;
+
+    return 1;
+}
+
+/*
+ * Decide which event to propagate to the parent from the state collected
+ * so far (afr_notify_cbk calls this with priv->lock held).
+ *
+ * Returns GF_EVENT_MAXVAL when every child has already been heard from,
+ * leaving propagation to afr_notify(). Otherwise each child that has not
+ * reported yet is marked down, and GF_EVENT_CHILD_UP is returned if at
+ * least one child is up, GF_EVENT_CHILD_DOWN if none is.
+ */
+glusterfs_event_t
+__afr_transform_event_from_state(xlator_t *this)
+{
+    int i = 0;
+    int up_children = 0;
+    afr_private_t *priv = this->private;
+
+    if (__get_heard_from_all_status(this))
+        /* have_heard_from_all. Let afr_notify() do the propagation. */
+        return GF_EVENT_MAXVAL;
+
+    up_children = __afr_get_up_children_count(priv);
+    /* Treat the children with pending notification, as having sent a
+     * GF_EVENT_CHILD_DOWN. i.e. set the event as GF_EVENT_SOME_DESCENDENT_DOWN,
+     * as done in afr_notify() */
+    for (i = 0; i < priv->child_count; i++) {
+        if (priv->last_event[i])
+            continue;
+        priv->last_event[i] = GF_EVENT_SOME_DESCENDENT_DOWN;
+        priv->child_up[i] = 0;
+    }
+
+    if (up_children)
+        /* We received at least one child up */
+        return GF_EVENT_CHILD_UP;
+
+    return GF_EVENT_CHILD_DOWN;
+}
+
+/*
+ * Timer callback armed by __afr_launch_notify_timer(). If the timer is
+ * still registered, it is cleared and the event derived from the current
+ * state is propagated through default_notify(), unless that event is
+ * GF_EVENT_MAXVAL. A call with no timer registered does nothing.
+ */
+static void
+afr_notify_cbk(void *data)
+{
+    xlator_t *this = data;
+    afr_private_t *priv = this->private;
+    glusterfs_event_t event = GF_EVENT_MAXVAL;
+
+    LOCK(&priv->lock);
+    {
+        /*
+         * With no timer set, child_up/child_down is already sent to
+         * parent and this is a spurious wake up.
+         */
+        if (priv->timer) {
+            priv->timer = NULL;
+            event = __afr_transform_event_from_state(this);
+        }
+    }
+    UNLOCK(&priv->lock);
+
+    if (event != GF_EVENT_MAXVAL)
+        default_notify(this, event, NULL);
+}
+
+/*
+ * Arm a 10 second timer that calls afr_notify_cbk() and store it in
+ * priv->timer. An error is logged if the timer cannot be created.
+ */
+static void
+__afr_launch_notify_timer(xlator_t *this, afr_private_t *priv)
+{
+    struct timespec timeout = {
+        0,
+    };
+
+    timeout.tv_sec = 10;
+    timeout.tv_nsec = 0;
+
+    gf_msg_debug(this->name, 0, "Initiating child-down timer");
+
+    priv->timer = gf_timer_call_after(this->ctx, timeout, afr_notify_cbk,
+                                      this);
+    if (!priv->timer) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, AFR_MSG_TIMER_CREATE_FAIL,
+               "Cannot create timer for delayed initialization");
+    }
+}
+
+static int
+find_best_down_child(xlator_t *this)
+{
+    /* Index of the down child (child_up[] == 0) with the lowest known,
+     * non-negative latency, or -1 when no child qualifies. */
+    afr_private_t *priv = this->private;
+    int32_t winner = -1;
+    int64_t lowest = INT64_MAX;
+    int c;
+
+    for (c = 0; c < priv->child_count; c++) {
+        if (priv->child_up[c] || priv->child_latency[c] < 0 ||
+            priv->child_latency[c] >= lowest)
+            continue;
+        winner = c;
+        lowest = priv->child_latency[c];
+    }
+    if (winner < 0)
+        return winner;
+    gf_msg_debug(this->name, 0,
+                 "Found best down child (%d) @ %" PRId64 " ms latency",
+                 winner, lowest);
+    return winner;
+}
+
+int
+find_worst_up_child(xlator_t *this)
+{
+    /* Index of the child with a non-zero child_up[] entry and the highest
+     * known, non-negative latency, or -1 when no child qualifies. */
+    afr_private_t *priv = this->private;
+    int32_t loser = -1;
+    int64_t highest = INT64_MIN;
+    int c;
+
+    for (c = 0; c < priv->child_count; c++) {
+        if (!priv->child_up[c] || priv->child_latency[c] < 0 ||
+            priv->child_latency[c] <= highest)
+            continue;
+        loser = c;
+        highest = priv->child_latency[c];
+    }
+    if (loser < 0)
+        return loser;
+    gf_msg_debug(this->name, 0,
+                 "Found worst up child (%d) @ %" PRId64 " ms latency",
+                 loser, highest);
+    return loser;
+}
+
+void
+__afr_handle_ping_event(xlator_t *this, xlator_t *child_xlator, const int idx,
+                        int64_t halo_max_latency_msec, int32_t *event,
+                        int64_t child_latency_msec)
+{
+    afr_private_t *priv = NULL;
+    int up_children = 0;
+    /* Store child idx's ping latency; under halo rules may rewrite *event. */
+    priv = this->private;
+    /* The latency is recorded first, whatever is decided afterwards. */
+    priv->child_latency[idx] = child_latency_msec;
+    gf_msg_debug(child_xlator->name, 0, "Client ping @ %" PRId64 " ms",
+                 child_latency_msec);
+    if (priv->shd.iamshd)
+        return; /* with shd.iamshd set, *event is never rewritten */
+
+    up_children = __afr_get_up_children_count(priv);
+    /* NOTE(review): outer '>' test seems to make the "Overriding" arm dead. */
+    if (child_latency_msec > halo_max_latency_msec &&
+        priv->child_up[idx] == 1 && up_children > priv->halo_min_replicas) {
+        if ((up_children - 1) < priv->halo_min_replicas) {
+            gf_log(child_xlator->name, GF_LOG_INFO,
+                   "Overriding halo threshold, "
+                   "min replicas: %d",
+                   priv->halo_min_replicas);
+        } else {
+            gf_log(child_xlator->name, GF_LOG_INFO,
+                   "Child latency (%" PRId64
+                   " ms) "
+                   "exceeds halo threshold (%" PRId64
+                   "), "
+                   "marking child down.",
+                   child_latency_msec, halo_max_latency_msec);
+            if (priv->halo_child_up[idx]) {
+                *event = GF_EVENT_CHILD_DOWN; /* PING becomes CHILD_DOWN */
+            }
+        }
+    } else if (child_latency_msec < halo_max_latency_msec &&
+               priv->child_up[idx] == 0) {
+        if (up_children < priv->halo_max_replicas) {
+            gf_log(child_xlator->name, GF_LOG_INFO,
+                   "Child latency (%" PRId64
+                   " ms) "
+                   "below halo threshold (%" PRId64
+                   "), "
+                   "marking child up.",
+                   child_latency_msec, halo_max_latency_msec);
+            if (priv->halo_child_up[idx]) {
+                *event = GF_EVENT_CHILD_UP; /* PING becomes CHILD_UP */
+            }
+        } else {
+            gf_log(child_xlator->name, GF_LOG_INFO,
+                   "Not marking child %d up, "
+                   "max replicas (%d) reached.",
+                   idx, priv->halo_max_replicas);
+        }
+    }
+}
+
+static int64_t
+afr_get_halo_latency(xlator_t *this)
+{
+    /*
+     * Return the halo latency threshold that applies to this process:
+     * the shd value when priv->shd.iamshd is set, else the nfsd value when
+     * priv->nfsd.iamnfsd is set, else the generic priv value.
+     */
+    afr_private_t *priv = this->private;
+    int64_t threshold = priv->halo_max_latency_msec;
+
+    if (priv->shd.iamshd)
+        threshold = priv->shd.halo_max_latency_msec;
+    else if (priv->nfsd.iamnfsd)
+        threshold = priv->nfsd.halo_max_latency_msec;
+
+    gf_msg_debug(this->name, 0, "Using halo latency %" PRId64, threshold);
+    return threshold;
+}
+
+void
+__afr_handle_child_up_event(xlator_t *this, xlator_t *child_xlator,
+                            const int idx, int64_t child_latency_msec,
+                            int32_t *event, int32_t *call_psh,
+                            int32_t *up_child)
+{
+    afr_private_t *priv = NULL;
+    int up_children = 0;
+    int worst_up_child = -1;
+    int64_t halo_max_latency_msec = afr_get_halo_latency(this);
+    /* Mark child idx up, apply the halo replica limits and settle *event. */
+    priv = this->private;
+
+    /*
+     * This only really counts if the child was never up
+     * (value = -1) or had been down (value = 0).  See
+     * comment at GF_EVENT_CHILD_DOWN for a more detailed
+     * explanation.
+     */
+    if (priv->child_up[idx] != 1) {
+        priv->event_generation++;
+    }
+    priv->child_up[idx] = 1;
+    /* Outputs for the caller: set the call_psh flag and name the child. */
+    *call_psh = 1;
+    *up_child = idx;
+    up_children = __afr_get_up_children_count(priv);
+    /*
+     * If this is an _actual_ CHILD_UP event, we
+     * want to set the child_latency to MAX to indicate
+     * the child needs ping data to be available before doing child-up
+     */
+    if (!priv->halo_enabled)
+        goto out;
+
+    if (child_latency_msec < 0) {
+        /* set to INT64_MAX-1 so that it is found for best_down_child */
+        priv->halo_child_up[idx] = 1;
+        if (priv->child_latency[idx] < 0) {
+            priv->child_latency[idx] = AFR_HALO_MAX_LATENCY;
+        }
+    }
+
+    /*
+     * Handle the edge case where we exceed
+     * halo_min_replicas and we've got a child which is
+     * marked up as it was helping to satisfy the
+     * halo_min_replicas even though its latency exceeds
+     * halo_max_latency_msec.
+     */
+    if (up_children > priv->halo_min_replicas) {
+        worst_up_child = find_worst_up_child(this);
+        if (worst_up_child >= 0 &&
+            priv->child_latency[worst_up_child] > halo_max_latency_msec) {
+            gf_msg_debug(this->name, 0,
+                         "Marking child %d down, "
+                         "doesn't meet halo threshold (%" PRId64
+                         "), and > "
+                         "halo_min_replicas (%d)",
+                         worst_up_child, halo_max_latency_msec,
+                         priv->halo_min_replicas);
+            priv->child_up[worst_up_child] = 0;
+            up_children--;
+        }
+    }
+    /* Above halo_max_replicas (and not shd): take the worst child down. */
+    if (up_children > priv->halo_max_replicas && !priv->shd.iamshd) {
+        worst_up_child = find_worst_up_child(this);
+        if (worst_up_child < 0) {
+            worst_up_child = idx; /* no candidate found: use this child */
+        }
+        priv->child_up[worst_up_child] = 0;
+        up_children--; /* NOTE(review): logged below after the decrement */
+        gf_msg_debug(this->name, 0,
+                     "Marking child %d down, "
+                     "up_children (%d) > halo_max_replicas (%d)",
+                     worst_up_child, up_children, priv->halo_max_replicas);
+    }
+out:
+    if (up_children == 1) { /* *event is left as passed in */
+        gf_msg(this->name, GF_LOG_INFO, 0, AFR_MSG_SUBVOL_UP,
+               "Subvolume '%s' came back up; "
+               "going online.",
+               child_xlator->name);
+        gf_event(EVENT_AFR_SUBVOL_UP, "client-pid=%d; subvol=%s",
+                 this->ctx->cmd_args.client_pid, this->name);
+    } else {
+        *event = GF_EVENT_SOME_DESCENDENT_UP;
+    }
+    /* Cache the (possibly rewritten) event as this child's last event. */
+    priv->last_event[idx] = *event;
+}
+
+void
+__afr_handle_child_down_event(xlator_t *this, xlator_t *child_xlator, int idx,
+                              int64_t child_latency_msec, int32_t *event,
+                              int32_t *call_psh, int32_t *up_child)
+{
+    afr_private_t *priv = NULL;
+    int i = 0;
+    int up_children = 0;
+    int down_children = 0;
+    int best_down_child = -1;
+    /* Mark child idx down; under halo, may bring another child up instead. */
+    priv = this->private;
+
+    /*
+     * If a brick is down when we start, we'll get a
+     * CHILD_DOWN to indicate its initial state.  There
+     * was never a CHILD_UP in this case, so if we
+     * increment "down_count" the difference between that
+     * and "up_count" will no longer be the number of
+     * children that are currently up.  This has serious
+     * implications e.g. for quorum enforcement, so we
+     * don't increment these values unless the event
+     * represents an actual state transition between "up"
+     * (value = 1) and anything else.
+     */
+    if (priv->child_up[idx] == 1) {
+        priv->event_generation++;
+    }
+
+    /*
+     * If this is an _actual_ CHILD_DOWN event, we
+     * want to set the child_latency to < 0 to indicate
+     * the child is really disconnected.
+     */
+    if (child_latency_msec < 0) {
+        priv->child_latency[idx] = child_latency_msec;
+        priv->halo_child_up[idx] = 0;
+    }
+    priv->child_up[idx] = 0;
+    /* Recount now that child idx has been cleared. */
+    up_children = __afr_get_up_children_count(priv);
+    /*
+     * Handle the edge case where we need to find the
+     * next best child (to mark up) as marking this child
+     * down would cause us to fall below halo_min_replicas.
+     * We will also force the SHD to heal this child _now_
+     * as we want it to be up to date if we are going to
+     * begin using it synchronously.
+     */
+    if (priv->halo_enabled && up_children < priv->halo_min_replicas) {
+        best_down_child = find_best_down_child(this);
+        if (best_down_child >= 0) {
+            gf_msg_debug(this->name, 0,
+                         "Swapping out child %d for "
+                         "child %d to satisfy halo_min_replicas (%d).",
+                         idx, best_down_child, priv->halo_min_replicas);
+            priv->child_up[best_down_child] = 1;
+            *call_psh = 1;
+            *up_child = best_down_child;
+        }
+    }
+    for (i = 0; i < priv->child_count; i++)
+        if (priv->child_up[i] == 0)
+            down_children++;
+    if (down_children == priv->child_count) { /* *event left as passed in */
+        gf_msg(this->name, GF_LOG_ERROR, 0, AFR_MSG_SUBVOLS_DOWN,
+               "All subvolumes are down. Going "
+               "offline until at least one of them "
+               "comes back up.");
+        gf_event(EVENT_AFR_SUBVOLS_DOWN, "client-pid=%d; subvol=%s",
+                 this->ctx->cmd_args.client_pid, this->name);
+    } else {
+        *event = GF_EVENT_SOME_DESCENDENT_DOWN;
+    }
+    priv->last_event[idx] = *event; /* remember what was decided */
+}
+
+void
+afr_ta_lock_release_synctask(xlator_t *this)
+{
+    /* Start a synctask that runs afr_release_notify_lock_for_ta() on a new
+     * ta frame; on failure the error is logged and nothing is scheduled. */
+    call_frame_t *ta_frame = afr_ta_frame_create(this);
+
+    if (ta_frame == NULL) {
+        gf_msg(this->name, GF_LOG_ERROR, ENOMEM, AFR_MSG_THIN_ARB,
+               "Failed to create ta_frame");
+        return;
+    }
+    if (synctask_new(this->ctx->env, afr_release_notify_lock_for_ta,
+                     afr_ta_lock_release_done, ta_frame, this) == 0)
+        return;
+
+    /* The task was not created, so the frame is destroyed here. */
+    STACK_DESTROY(ta_frame->root);
+    gf_msg(this->name, GF_LOG_ERROR, ENOMEM, AFR_MSG_THIN_ARB,
+           "Failed to release "
+           "AFR_TA_DOM_NOTIFY lock.");
+}
+
+static void
+afr_handle_inodelk_contention(xlator_t *this, struct gf_upcall *upcall)
+{
+    /* On AFR_TA_DOM_NOTIFY contention, flag the lock for release and release
+     * it right away when no in-memory or on-wire txn is outstanding. */
+    afr_private_t *priv = this->private;
+    struct gf_upcall_inodelk_contention *contention = upcall->data;
+    gf_boolean_t already_requested = _gf_false;
+    unsigned int in_mem = 0;
+    unsigned int on_wire = 0;
+
+    if (strcmp(contention->domain, AFR_TA_DOM_NOTIFY) != 0)
+        return;
+
+    /* shd should ignore AFR_TA_DOM_NOTIFY release requests. */
+    if (priv->shd.iamshd)
+        return;
+
+    LOCK(&priv->lock);
+    {
+        /* Ignore multiple release requests from shds. */
+        already_requested = (priv->release_ta_notify_dom_lock == _gf_true);
+        if (!already_requested) {
+            priv->release_ta_notify_dom_lock = _gf_true;
+            in_mem = priv->ta_in_mem_txn_count;
+            on_wire = priv->ta_on_wire_txn_count;
+        }
+    }
+    UNLOCK(&priv->lock);
+
+    /* lock release will happen in txn code path after in-memory or
+     * on-wire txns are over. */
+    if (already_requested || in_mem || on_wire)
+        return;
+    afr_ta_lock_release_synctask(this);
+}
+
+static void
+afr_handle_upcall_event(xlator_t *this, struct gf_upcall *upcall)
+{
+    struct gf_upcall_cache_invalidation *up_ci = NULL;
+    afr_private_t *priv = this->private;
+    inode_t *inode = NULL;
+    inode_table_t *itable = NULL;
+    int i = 0;
+    /* Route an upcall by its event type; unknown types are ignored. */
+    switch (upcall->event_type) {
+        case GF_UPCALL_INODELK_CONTENTION:
+            afr_handle_inodelk_contention(this, upcall);
+            break;
+        case GF_UPCALL_CACHE_INVALIDATION:
+            up_ci = (struct gf_upcall_cache_invalidation *)upcall->data;
+            /* Since md-cache will be aggressively filtering lookups, the
+             * stale read issue will be more pronounced. Hence when a pending
+             * xattr is set notify all the md-cache clients to invalidate the
+             * existing stat cache and send the lookup next time */
+            if (!up_ci->dict)
+                break;
+            for (i = 0; i < priv->child_count; i++) {
+                if (!dict_get(up_ci->dict, priv->pending_key[i]))
+                    continue;
+                up_ci->flags |= UP_INVAL_ATTR;
+                itable = ((xlator_t *)this->graph->top)->itable;
+                /* Internal processes may not have itable for top xlator */
+                if (itable)
+                    inode = inode_find(itable, upcall->gfid);
+                if (inode) {
+                    afr_inode_need_refresh_set(inode, this);
+                    /* inode_find() handed us a reference: drop it. */
+                    inode_unref(inode);
+                }
+                break;
+            }
+            break;
+        default:
+            break;
+    }
+}
+
+int32_t
+afr_notify(xlator_t *this, int32_t event, void *data, void *data2)
+{
+    afr_private_t *priv = NULL;
+    xlator_t *child_xlator = NULL;
+    int i = -1;
+    int propagate = 0;
+    int had_heard_from_all = 0;
+    int have_heard_from_all = 0;
+    int idx = -1;
+    int ret = -1;
+    int call_psh = 0;
+    int up_child = -1;
+    dict_t *input = NULL;
+    dict_t *output = NULL;
+    gf_boolean_t had_quorum = _gf_false;
+    gf_boolean_t has_quorum = _gf_false;
+    int64_t halo_max_latency_msec = 0;
+    int64_t child_latency_msec = -1;
+    /* Handle one event: update per-child state and decide what to pass up. */
+    child_xlator = (xlator_t *)data;
+
+    priv = this->private;
+
+    if (!priv)
+        return 0;
+
+    /*
+     * We need to reset this in case children come up in "staggered"
+     * fashion, so that we discover a late-arriving local subvolume.  Note
+     * that we could end up issuing N lookups to the first subvolume, and
+     * O(N^2) overall, but N is small for AFR so it shouldn't be an issue.
+     */
+    priv->did_discovery = _gf_false;
+
+    /* parent xlators don't need to know about every child_up, child_down
+     * because of afr ha. If all subvolumes go down, child_down has
+     * to be triggered. In that state when 1 subvolume comes up child_up
+     * needs to be triggered. dht optimizes revalidate lookup by sending
+     * it only to one of its subvolumes. When child up/down happens
+     * for afr's subvolumes dht should be notified by child_modified. The
+     * subsequent revalidate lookup happens on all the dht's subvolumes
+     * which triggers afr self-heals if any.
+     */
+    idx = afr_find_child_index(this, child_xlator);
+    if (idx < 0) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, AFR_MSG_INVALID_CHILD_UP,
+               "Received child_up from invalid subvolume");
+        goto out; /* ret is still -1 here */
+    }
+    /* Quorum state before this event, read without taking priv->lock. */
+    had_quorum = priv->quorum_count &&
+                 afr_has_quorum(priv->child_up, this, NULL);
+    if (event == GF_EVENT_CHILD_PING) {
+        child_latency_msec = (int64_t)(uintptr_t)data2;
+        if (priv->halo_enabled) {
+            halo_max_latency_msec = afr_get_halo_latency(this);
+
+            /* Calculates the child latency and sets event
+             */
+            LOCK(&priv->lock);
+            {
+                __afr_handle_ping_event(this, child_xlator, idx,
+                                        halo_max_latency_msec, &event,
+                                        child_latency_msec);
+            }
+            UNLOCK(&priv->lock);
+        } else {
+            LOCK(&priv->lock);
+            {
+                priv->child_latency[idx] = child_latency_msec;
+            }
+            UNLOCK(&priv->lock);
+        }
+    }
+    /* A PING rewritten to CHILD_UP/CHILD_DOWN above continues to the switch. */
+    if (event == GF_EVENT_CHILD_PING) {
+        /* This is the only xlator that handles PING, no reason to
+         * propagate.
+         */
+        goto out;
+    }
+
+    if (event == GF_EVENT_TRANSLATOR_OP) {
+        LOCK(&priv->lock);
+        {
+            had_heard_from_all = __get_heard_from_all_status(this);
+        }
+        UNLOCK(&priv->lock);
+
+        if (!had_heard_from_all) {
+            ret = -1;
+        } else {
+            input = data;
+            output = data2;
+            ret = afr_xl_op(this, input, output);
+        }
+        goto out;
+    }
+
+    if (event == GF_EVENT_UPCALL) {
+        afr_handle_upcall_event(this, data);
+    }
+    /* An UPCALL event still reaches the switch below (default: propagate). */
+    LOCK(&priv->lock);
+    {
+        had_heard_from_all = __get_heard_from_all_status(this);
+        switch (event) {
+            case GF_EVENT_PARENT_UP:
+                __afr_launch_notify_timer(this, priv);
+                propagate = 1;
+                break;
+            case GF_EVENT_CHILD_UP:
+                if (priv->thin_arbiter_count &&
+                    (idx == AFR_CHILD_THIN_ARBITER)) {
+                    priv->ta_child_up = 1;
+                    priv->ta_event_gen++;
+                    break;
+                }
+                __afr_handle_child_up_event(this, child_xlator, idx,
+                                            child_latency_msec, &event,
+                                            &call_psh, &up_child);
+                __afr_lock_heal_synctask(this, priv, idx);
+                break;
+
+            case GF_EVENT_CHILD_DOWN:
+                if (priv->thin_arbiter_count &&
+                    (idx == AFR_CHILD_THIN_ARBITER)) {
+                    priv->ta_child_up = 0;
+                    priv->ta_event_gen++;
+                    afr_ta_locked_priv_invalidate(priv);
+                    break;
+                }
+                __afr_handle_child_down_event(this, child_xlator, idx,
+                                              child_latency_msec, &event,
+                                              &call_psh, &up_child);
+                __afr_mark_pending_lk_heal(this, priv, idx);
+                break;
+
+            case GF_EVENT_CHILD_CONNECTING:
+                priv->last_event[idx] = event;
+
+                break;
+
+            case GF_EVENT_SOME_DESCENDENT_DOWN:
+                priv->last_event[idx] = event;
+                break;
+            default:
+                propagate = 1;
+                break;
+        }
+        have_heard_from_all = __get_heard_from_all_status(this);
+        if (!had_heard_from_all && have_heard_from_all) {
+            if (priv->timer) {
+                gf_timer_call_cancel(this->ctx, priv->timer);
+                priv->timer = NULL;
+            }
+            /* This is the first event which completes aggregation
+               of events from all subvolumes. If at least one subvol
+               had come up, propagate CHILD_UP, but only this time
+            */
+            event = GF_EVENT_CHILD_DOWN;
+            for (i = 0; i < priv->child_count; i++) {
+                if (priv->last_event[i] == GF_EVENT_CHILD_UP) {
+                    event = GF_EVENT_CHILD_UP;
+                    break;
+                }
+
+                if (priv->last_event[i] == GF_EVENT_CHILD_CONNECTING) {
+                    event = GF_EVENT_CHILD_CONNECTING;
+                    /* continue to check other events for CHILD_UP */
+                }
+            }
+        }
+    }
+    UNLOCK(&priv->lock);
+    /* Quorum transitions are detected and reported after dropping the lock. */
+    if (priv->quorum_count) {
+        has_quorum = afr_has_quorum(priv->child_up, this, NULL);
+        if (!had_quorum && has_quorum) {
+            gf_msg(this->name, GF_LOG_INFO, 0, AFR_MSG_QUORUM_MET,
+                   "Client-quorum is met");
+            gf_event(EVENT_AFR_QUORUM_MET, "client-pid=%d; subvol=%s",
+                     this->ctx->cmd_args.client_pid, this->name);
+        }
+        if (had_quorum && !has_quorum) {
+            gf_msg(this->name, GF_LOG_WARNING, 0, AFR_MSG_QUORUM_FAIL,
+                   "Client-quorum is not met");
+            gf_event(EVENT_AFR_QUORUM_FAIL, "client-pid=%d; subvol=%s",
+                     this->ctx->cmd_args.client_pid, this->name);
+        }
+    }
+
+    /* if all subvols have reported status, no need to hide anything
+       or wait for anything else. Just propagate blindly */
+    if (have_heard_from_all)
+        propagate = 1;
+
+    ret = 0;
+    if (propagate)
+        ret = default_notify(this, event, data);
+
+    if ((!had_heard_from_all) || call_psh) {
+        /* Launch self-heal on all local subvolumes if:
+         * a) We have_heard_from_all for the first time
+         * b) Already heard from everyone, but we now got a child-up
+         *    event.
+         */
+        if (have_heard_from_all) {
+            afr_selfheal_childup(this, priv);
+        }
+    }
+out:
+    return ret;
+}
+
+int
+afr_local_init(afr_local_t *local, afr_private_t *priv, int32_t *op_errno)
+{
+    /*
+     * Initialise the per-call state held in 'local'.
+     *
+     * local:    structure to initialise; op_ret/op_errno start out as
+     *           -1 / EUCLEAN.
+     * priv:     AFR private data; supplies child_count, the current
+     *           child_up[] contents and the event generations.
+     * op_errno: optional out-parameter; when non-NULL it receives the
+     *           reason for a failure.
+     *
+     * Returns 0 on success and -1 on failure. Anything allocated before a
+     * failure stays attached to 'local'; nothing is freed here.
+     */
+    int err = 0;
+
+    local->op_ret = -1;
+    local->op_errno = EUCLEAN;
+
+    err = syncbarrier_init(&local->barrier);
+    if (err)
+        goto fail;
+
+    /* Every allocation failure below is reported as ENOMEM. */
+    err = ENOMEM;
+
+    local->child_up = GF_MALLOC(priv->child_count * sizeof(*local->child_up),
+                                gf_afr_mt_char);
+    if (!local->child_up)
+        goto fail;
+
+    /* Work on a private copy of the children's up/down state. */
+    memcpy(local->child_up, priv->child_up,
+           sizeof(*local->child_up) * priv->child_count);
+    local->call_count = AFR_COUNT(local->child_up, priv->child_count);
+    if (local->call_count == 0) {
+        gf_msg(THIS->name, GF_LOG_INFO, 0, AFR_MSG_SUBVOLS_DOWN,
+               "no subvolumes up");
+        err = ENOTCONN;
+        goto fail;
+    }
+
+    local->event_generation = priv->event_generation;
+
+    local->read_attempted = GF_CALLOC(priv->child_count, sizeof(char),
+                                      gf_afr_mt_char);
+    if (!local->read_attempted)
+        goto fail;
+
+    local->readable = GF_CALLOC(priv->child_count, sizeof(char),
+                                gf_afr_mt_char);
+    if (!local->readable)
+        goto fail;
+
+    local->readable2 = GF_CALLOC(priv->child_count, sizeof(char),
+                                 gf_afr_mt_char);
+    if (!local->readable2)
+        goto fail;
+
+    local->read_subvol = -1;
+
+    local->replies = GF_CALLOC(priv->child_count, sizeof(*local->replies),
+                               gf_afr_mt_reply_t);
+    if (!local->replies)
+        goto fail;
+
+    local->need_full_crawl = _gf_false;
+    local->is_new_entry = _gf_false;
+    if (priv->thin_arbiter_count) {
+        local->ta_child_up = priv->ta_child_up;
+        local->ta_failed_subvol = AFR_CHILD_UNKNOWN;
+        local->read_txn_query_child = AFR_CHILD_UNKNOWN;
+        local->ta_event_gen = priv->ta_event_gen;
+        local->fop_state = TA_SUCCESS;
+    }
+    INIT_LIST_HEAD(&local->healer);
+    return 0;
+fail:
+    if (op_errno)
+        *op_errno = err;
+    return -1;
+}
+
+int
+afr_internal_lock_init(afr_internal_lock_t *lk, size_t child_count)
+{
+    /* Allocate the per-child lower_locked_nodes array and preset the lock
+     * result to -1 / EUCLEAN. Returns 0, or -ENOMEM if the allocation
+     * fails. */
+
+    lk->lower_locked_nodes = GF_CALLOC(sizeof(*lk->lower_locked_nodes),
+                                       child_count, gf_afr_mt_char);
+    if (!lk->lower_locked_nodes)
+        return -ENOMEM;
+
+    lk->lock_op_ret = -1;
+    lk->lock_op_errno = EUCLEAN;
+
+    return 0;
+}
+
+void
+afr_matrix_cleanup(int32_t **matrix, unsigned int m)
+{
+    /* Free the m rows of 'matrix' and then the row table itself.
+     * A NULL matrix is accepted and ignored. */
+    unsigned int row;
+
+    if (matrix == NULL)
+        return;
+
+    for (row = 0; row < m; row++)
+        GF_FREE(matrix[row]);
+
+    GF_FREE(matrix);
+}
+
+int32_t **
+afr_matrix_create(unsigned int m, unsigned int n)
+{
+    /* Allocate an m x n int32_t matrix as m separately GF_CALLOC'ed rows.
+     * On any allocation failure everything is released and NULL returned. */
+    unsigned int row = 0;
+    int32_t **table = GF_CALLOC(sizeof(*table), m, gf_afr_mt_int32_t);
+
+    if (table == NULL)
+        return NULL;
+
+    for (row = 0; row < m; row++) {
+        table[row] = GF_CALLOC(sizeof(*table[row]), n, gf_afr_mt_int32_t);
+        if (table[row] != NULL)
+            continue;
+        afr_matrix_cleanup(table, m);
+        return NULL;
+    }
+    return table;
+}
+
+int
+afr_transaction_local_init(afr_local_t *local, xlator_t *this)
+{
+    /*
+     * Set up the transaction-specific parts of 'local': the wait/owner and
+     * ta_ lists, the internal lock and the per-child arrays.
+     * Returns 0 on success or a negative errno; buffers allocated before
+     * a failure are left on 'local' and are not freed here.
+     */
+    afr_private_t *priv = this->private;
+    int rc = 0;
+
+    INIT_LIST_HEAD(&local->transaction.wait_list);
+    INIT_LIST_HEAD(&local->transaction.owner_list);
+    INIT_LIST_HEAD(&local->ta_waitq);
+    INIT_LIST_HEAD(&local->ta_onwireq);
+    rc = afr_internal_lock_init(&local->internal_lock, priv->child_count);
+    if (rc < 0)
+        return rc;
+
+    local->pre_op_compat = priv->pre_op_compat;
+
+    local->transaction.pre_op = GF_CALLOC(sizeof(*local->transaction.pre_op),
+                                          priv->child_count, gf_afr_mt_char);
+    if (!local->transaction.pre_op)
+        return -ENOMEM;
+
+    local->transaction.changelog_xdata = GF_CALLOC(
+        sizeof(*local->transaction.changelog_xdata), priv->child_count,
+        gf_afr_mt_dict_t);
+    if (!local->transaction.changelog_xdata)
+        return -ENOMEM;
+
+    /* pre_op_sources is only allocated when arbiter_count is exactly 1. */
+    if (priv->arbiter_count == 1) {
+        local->transaction.pre_op_sources = GF_CALLOC(
+            sizeof(*local->transaction.pre_op_sources), priv->child_count,
+            gf_afr_mt_char);
+        if (!local->transaction.pre_op_sources)
+            return -ENOMEM;
+    }
+
+    local->transaction.failed_subvols = GF_CALLOC(
+        sizeof(*local->transaction.failed_subvols), priv->child_count,
+        gf_afr_mt_char);
+    if (!local->transaction.failed_subvols)
+        return -ENOMEM;
+
+    local->pending = afr_matrix_create(priv->child_count, AFR_NUM_CHANGE_LOGS);
+    return local->pending ? 0 : -ENOMEM;
+}
+
+void
+afr_set_low_priority(call_frame_t *frame)
+{ /* Stamp the frame's root with LOW_PRIO_PROC_PID as its pid. */
+    frame->root->pid = LOW_PRIO_PROC_PID;
+}
+
+void
+afr_priv_destroy(afr_private_t *priv)
+{
+    /*
+     * Release every buffer hanging off 'priv', destroy its lock and then
+     * free 'priv' itself. A NULL pointer is a no-op.
+     */
+    int key = 0;
+    int nkeys = 0;
+
+    if (priv == NULL)
+        return;
+
+    GF_FREE(priv->sh_domain);
+    GF_FREE(priv->last_event);
+
+    /* One more pending_key[] entry is freed when thin_arbiter_count is set. */
+    nkeys = priv->child_count + (priv->thin_arbiter_count ? 1 : 0);
+    if (priv->pending_key) {
+        for (key = 0; key < nkeys; key++)
+            GF_FREE(priv->pending_key[key]);
+    }
+
+    GF_FREE(priv->pending_reads);
+    GF_FREE(priv->local);
+    GF_FREE(priv->pending_key);
+    GF_FREE(priv->children);
+    GF_FREE(priv->anon_inode);
+    GF_FREE(priv->child_up);
+    GF_FREE(priv->halo_child_up);
+    GF_FREE(priv->child_latency);
+    LOCK_DESTROY(&priv->lock);
+
+    GF_FREE(priv);
+}
+
+int **
+afr_mark_pending_changelog(afr_private_t *priv, unsigned char *pending,
+                           dict_t *xattr, ia_type_t iat)
+{
+    /*
+     * Build the changelog matrix marking every child flagged in pending[]
+     * (metadata counter always, the counter for 'iat' if one exists, and the
+     * data counter for directories under priv->esh_granular), then store it
+     * in 'xattr' via afr_set_pending_dict(). Returns the matrix, or NULL if
+     * allocating it or updating the dict fails.
+     */
+    const int meta_col = afr_index_for_transaction_type(
+        AFR_METADATA_TRANSACTION);
+    const int data_col = afr_index_for_transaction_type(AFR_DATA_TRANSACTION);
+    const int type_col = afr_index_from_ia_type(iat);
+    const gf_boolean_t mark_data = (IA_ISDIR(iat)) && (priv->esh_granular);
+    int **changelog = NULL;
+    int child = 0;
+
+    changelog = afr_matrix_create(priv->child_count, AFR_NUM_CHANGE_LOGS);
+    if (changelog == NULL)
+        return NULL;
+
+    for (child = 0; child < priv->child_count; child++) {
+        if (!pending[child])
+            continue;
+
+        changelog[child][meta_col] = hton32(1);
+        if (type_col != -1)
+            changelog[child][type_col] = hton32(1);
+        /* A newly created directory also gets the full-heal indicator. */
+        if (mark_data)
+            changelog[child][data_col] = hton32(1);
+    }
+
+    if (afr_set_pending_dict(priv, xattr, changelog) < 0) {
+        afr_matrix_cleanup(changelog, priv->child_count);
+        return NULL;
+    }
+    return changelog;
+}
+
+static dict_t *
+afr_set_heal_info(char *status)
+{
+    dict_t *dict = NULL;
+    int ret = -1;
+    /* Returns a new dict holding "heal-info" = status, or NULL on failure.
+     * 'status' is handed to the dict or released (see the note below). */
+    dict = dict_new();
+    if (!dict) {
+        ret = -ENOMEM;
+        goto out;
+    }
+    ret = dict_set_dynstr_sizen(dict, "heal-info", status);
+    if (ret)
+        /* 'status' may already be freed here (see below): print it only
+         * while it is still ours, i.e. on -EINVAL. */
+        gf_msg("", GF_LOG_WARNING, -ret, AFR_MSG_DICT_SET_FAILED,
+               "Failed to set heal-info key to "
+               "%s",
+               (ret == -EINVAL && status) ? status : "(unavailable)");
+out:
+    /* Any error other than EINVAL, dict_set_dynstr frees status */
+    if (ret == -ENOMEM || ret == -EINVAL)
+        GF_FREE(status);
+    if (ret && dict) {
+        dict_unref(dict);
+        dict = NULL;
+    }
+    return dict;
+}
+
+static gf_boolean_t
+afr_is_dirty_count_non_unary_for_txn(xlator_t *this, struct afr_reply *replies,
+                                     afr_transaction_type type)
+{
+    /* True when, for transaction 'type', any child's extracted dirty count
+     * is greater than 1. */
+    afr_private_t *priv = this->private;
+    int *counts = alloca0(priv->child_count * sizeof(int));
+    gf_boolean_t non_unary = _gf_false;
+    int child = 0;
+
+    afr_selfheal_extract_xattr(this, replies, type, counts, NULL);
+    for (child = 0; !non_unary && child < priv->child_count; child++)
+        non_unary = (counts[child] > 1) ? _gf_true : _gf_false;
+    return non_unary;
+}
+
+static gf_boolean_t
+afr_is_dirty_count_non_unary(xlator_t *this, struct afr_reply *replies,
+                             ia_type_t ia_type)
+{
+    /*
+     * True when a dirty count above 1 is found for any transaction type
+     * that applies to 'ia_type'. The checks run in this order and stop at
+     * the first hit:
+     *   data     - regular files (IA_IFREG) only
+     *   metadata - every type, including IA_IFBLK, IA_IFCHR, IA_IFLNK,
+     *              IA_IFIFO and IA_IFSOCK
+     *   entry    - directories (IA_IFDIR) only
+     */
+    const gf_boolean_t is_file = (ia_type == IA_IFREG);
+    const gf_boolean_t is_dir = (ia_type == IA_IFDIR);
+
+    /* Data counters only exist for regular files. */
+    if (is_file &&
+        afr_is_dirty_count_non_unary_for_txn(this, replies,
+                                             AFR_DATA_TRANSACTION))
+        return _gf_true;
+
+    /* Metadata is checked whatever the inode type is. */
+    if (afr_is_dirty_count_non_unary_for_txn(this, replies,
+                                             AFR_METADATA_TRANSACTION))
+        return _gf_true;
+
+    /* Entry counters are only looked at for directories. */
+    if (is_dir &&
+        afr_is_dirty_count_non_unary_for_txn(this, replies,
+                                             AFR_ENTRY_TRANSACTION))
+        return _gf_true;
+
+    return _gf_false;
+}
+
+static int
+afr_update_heal_status(xlator_t *this, struct afr_reply *replies,
+                       ia_type_t ia_type, gf_boolean_t *esh, gf_boolean_t *dsh,
+                       gf_boolean_t *msh, unsigned char pending)
+{
+    /*
+     * Refine a heal verdict using the inodelk counts found in the reply
+     * xdata for the I/O domain (this->name) and the self-heal domain
+     * (priv->sh_domain).
+     *
+     * Returns:
+     *   0        needs heal
+     *   -EAGAIN  'possibly-healing' (pending set, self-heal domain locked)
+     *   -1       no heal needed; *dsh, *esh and *msh are cleared
+     */
+    afr_private_t *priv = this->private;
+    const char *prefix = GLUSTERFS_INODELK_DOM_PREFIX;
+    char *io_key = alloca0(strlen(prefix) + 2 + strlen(this->name));
+    char *shd_key = alloca0(strlen(prefix) + 2 + strlen(priv->sh_domain));
+    GF_UNUSED int lookup_ret = 0;
+    int io_locks = 0;
+    int shd_locks = 0;
+    int child = 0;
+
+    /* "<prefix>:<domain>": the +2 covers the ':' and the trailing NUL. */
+    sprintf(io_key, "%s:%s", prefix, this->name);
+    sprintf(shd_key, "%s:%s", prefix, priv->sh_domain);
+
+    /* Keep querying good replies until each count has become non-zero. */
+    for (child = 0; child < priv->child_count; child++) {
+        if (replies[child].valid != 1 || replies[child].op_ret != 0)
+            continue;
+        if (io_locks == 0)
+            lookup_ret = dict_get_int32(replies[child].xdata, io_key,
+                                        &io_locks);
+        if (shd_locks == 0)
+            lookup_ret = dict_get_int32(replies[child].xdata, shd_key,
+                                        &shd_locks);
+    }
+
+    if (pending)
+        return shd_locks ? -EAGAIN : 0;
+
+    /* Nothing pending: heal only on a dirty count > 1 or when no I/O
+     * domain lock was reported. */
+    if (afr_is_dirty_count_non_unary(this, replies, ia_type) || !io_locks)
+        return 0;
+
+    /* No heal needed. */
+    *dsh = *esh = *msh = 0;
+    return -1;
+}
+
+/*
+ * Inspect the heal state of @gfid without taking any locks.
+ *
+ * Runs afr_selfheal_unlocked_inspect() and then, for each of metadata, data
+ * and entry (in that order) that it flagged, afr_selfheal_find_direction()
+ * over the children that replied successfully. The three out booleans are
+ * always written before returning.
+ *
+ * Returns -EIO when PFLAG_SBRAIN is set in *pending, any non-zero value
+ * reported by the helpers above, or otherwise the result of
+ * afr_update_heal_status() (-EAGAIN, 0 or -1).
+ */
+int
+afr_lockless_inspect(call_frame_t *frame, xlator_t *this, uuid_t gfid,
+                     inode_t **inode, gf_boolean_t *entry_selfheal,
+                     gf_boolean_t *data_selfheal,
+                     gf_boolean_t *metadata_selfheal, unsigned char *pending)
+{
+    /* Inspection order: metadata first, then data, then entry. */
+    const afr_transaction_type txn_order[3] = {AFR_METADATA_TRANSACTION,
+                                               AFR_DATA_TRANSACTION,
+                                               AFR_ENTRY_TRANSACTION};
+    gf_boolean_t txn_wanted[3] = {_gf_false, _gf_false, _gf_false};
+    afr_private_t *priv = this->private;
+    struct afr_reply *replies = NULL;
+    unsigned char *sources = NULL;
+    unsigned char *sinks = NULL;
+    unsigned char *valid_on = NULL;
+    uint64_t *witness = NULL;
+    gf_boolean_t dsh = _gf_false;
+    gf_boolean_t msh = _gf_false;
+    gf_boolean_t esh = _gf_false;
+    int ret = -1;
+    int child = 0;
+    int step = 0;
+
+    replies = alloca0(sizeof(*replies) * priv->child_count);
+    sources = alloca0(sizeof(*sources) * priv->child_count);
+    sinks = alloca0(sizeof(*sinks) * priv->child_count);
+    witness = alloca0(sizeof(*witness) * priv->child_count);
+    valid_on = alloca0(sizeof(*valid_on) * priv->child_count);
+
+    ret = afr_selfheal_unlocked_inspect(frame, this, gfid, inode, &dsh, &msh,
+                                        &esh, replies);
+    if (ret)
+        goto out;
+
+    /* Only children with a valid, successful reply take part. */
+    for (child = 0; child < priv->child_count; child++)
+        valid_on[child] = (replies[child].valid && replies[child].op_ret == 0)
+                              ? 1
+                              : 0;
+
+    txn_wanted[0] = msh;
+    txn_wanted[1] = dsh;
+    txn_wanted[2] = esh;
+
+    for (step = 0; step < 3; step++) {
+        if (!txn_wanted[step])
+            continue;
+        ret = afr_selfheal_find_direction(frame, this, replies,
+                                          txn_order[step], valid_on, sources,
+                                          sinks, witness, pending);
+        /* A split-brain flag overrides whatever find_direction returned. */
+        if (*pending & PFLAG_SBRAIN)
+            ret = -EIO;
+        if (ret)
+            goto out;
+    }
+
+    ret = afr_update_heal_status(this, replies, (*inode)->ia_type, &esh, &dsh,
+                                 &msh, *pending);
+out:
+    *data_selfheal = dsh;
+    *entry_selfheal = esh;
+    *metadata_selfheal = msh;
+    if (replies)
+        afr_replies_wipe(replies, priv->child_count);
+    return ret;
+}
+
+/*
+ * Answer a heal-info getxattr request for the inode identified by loc->gfid.
+ *
+ * A temporary frame created by afr_frame_create() borrows frame->local for
+ * the duration of afr_lockless_inspect(); the borrowed local is handed back
+ * before the temporary frame is destroyed.
+ *
+ * The reply dict built by afr_set_heal_info() carries one of the status
+ * strings "split-brain", "possibly-healing", "heal" or "no-heal"; the first
+ * three get a "-pending" suffix when PFLAG_PENDING is set. When inspection
+ * fails with some other error and no self-heal flag is set, the fop is
+ * unwound successfully with a NULL dict.
+ *
+ * Always unwinds @frame with getxattr. Returns 0 on success and -1 on
+ * failure, which is also the op_ret handed to the unwind.
+ */
+int
+afr_get_heal_info(call_frame_t *frame, xlator_t *this, loc_t *loc)
+{
+    gf_boolean_t data_selfheal = _gf_false;
+    gf_boolean_t metadata_selfheal = _gf_false;
+    gf_boolean_t entry_selfheal = _gf_false;
+    unsigned char pending = 0;
+    dict_t *dict = NULL;
+    int ret = -1;
+    int op_errno = ENOMEM;
+    inode_t *inode = NULL;
+    char *substr = NULL;
+    char *status = NULL;
+    call_frame_t *heal_frame = NULL;
+    afr_local_t *heal_local = NULL;
+
+    /*Use frame with lk-owner set*/
+    heal_frame = afr_frame_create(frame->this, &op_errno);
+    if (!heal_frame) {
+        ret = -1;
+        goto out;
+    }
+    /* Remember the frame's own local so it can be restored at 'out'. */
+    heal_local = heal_frame->local;
+    heal_frame->local = frame->local;
+
+    ret = afr_lockless_inspect(heal_frame, this, loc->gfid, &inode,
+                               &entry_selfheal, &data_selfheal,
+                               &metadata_selfheal, &pending);
+
+    if (ret == -ENOMEM) {
+        ret = -1;
+        goto out;
+    }
+
+    if (pending & PFLAG_PENDING) {
+        gf_asprintf(&substr, "-pending");
+        if (!substr) {
+            /* ret still holds the inspection result here (possibly >= 0 or
+             * a raw -errno); report a proper allocation failure instead of
+             * unwinding with that stale value. */
+            ret = -1;
+            op_errno = ENOMEM;
+            goto out;
+        }
+    }
+
+    if (ret == -EIO) {
+        ret = gf_asprintf(&status, "split-brain%s", substr ? substr : "");
+        if (ret < 0) {
+            goto out;
+        }
+        dict = afr_set_heal_info(status);
+        if (!dict) {
+            ret = -1;
+            goto out;
+        }
+    } else if (ret == -EAGAIN) {
+        ret = gf_asprintf(&status, "possibly-healing%s", substr ? substr : "");
+        if (ret < 0) {
+            goto out;
+        }
+        dict = afr_set_heal_info(status);
+        if (!dict) {
+            ret = -1;
+            goto out;
+        }
+    } else if (ret >= 0) {
+        /* value of ret = source index
+         * so ret >= 0 and at least one of the 3 booleans set to
+         * true means a source is identified; heal is required.
+         */
+        if (!data_selfheal && !entry_selfheal && !metadata_selfheal) {
+            status = gf_strdup("no-heal");
+            if (!status) {
+                ret = -1;
+                goto out;
+            }
+            dict = afr_set_heal_info(status);
+            if (!dict) {
+                ret = -1;
+                goto out;
+            }
+        } else {
+            ret = gf_asprintf(&status, "heal%s", substr ? substr : "");
+            if (ret < 0) {
+                goto out;
+            }
+            dict = afr_set_heal_info(status);
+            if (!dict) {
+                ret = -1;
+                goto out;
+            }
+        }
+    } else if (ret < 0) {
+        /* Apart from above checked -ve ret values, there are
+         * other possible ret values like ENOTCONN
+         * (returned when number of valid replies received are
+         * less than 2)
+         * in which case heal is required when one of the
+         * selfheal booleans is set.
+         */
+        if (data_selfheal || entry_selfheal || metadata_selfheal) {
+            ret = gf_asprintf(&status, "heal%s", substr ? substr : "");
+            if (ret < 0) {
+                goto out;
+            }
+            dict = afr_set_heal_info(status);
+            if (!dict) {
+                ret = -1;
+                goto out;
+            }
+        }
+    }
+
+    ret = 0;
+    op_errno = 0;
+
+out:
+    if (heal_frame) {
+        /* Give the borrowed local back before tearing the frame down. */
+        heal_frame->local = heal_local;
+        AFR_STACK_DESTROY(heal_frame);
+    }
+    AFR_STACK_UNWIND(getxattr, frame, ret, op_errno, dict, NULL);
+    if (dict)
+        dict_unref(dict);
+    if (inode)
+        inode_unref(inode);
+    GF_FREE(substr);
+    return ret;
+}
+
+/*
+ * Set *spb to _gf_true when afr_selfheal_find_direction() succeeds for
+ * transaction @type over the currently up children but marks no child as a
+ * source. *spb is left untouched otherwise.
+ *
+ * Returns the result of afr_selfheal_find_direction().
+ */
+int
+_afr_is_split_brain(call_frame_t *frame, xlator_t *this,
+                    struct afr_reply *replies, afr_transaction_type type,
+                    gf_boolean_t *spb)
+{
+    afr_private_t *priv = this->private;
+    unsigned char *sources = NULL;
+    unsigned char *sinks = NULL;
+    uint64_t *witness = NULL;
+    int ret = 0;
+
+    sources = alloca0(priv->child_count);
+    sinks = alloca0(priv->child_count);
+    witness = alloca0(priv->child_count * sizeof(*witness));
+
+    ret = afr_selfheal_find_direction(frame, this, replies, type,
+                                      priv->child_up, sources, sinks, witness,
+                                      NULL);
+
+    /* No source among the children means nobody can be trusted. */
+    if (ret == 0 && AFR_COUNT(sources, priv->child_count) == 0)
+        *spb = _gf_true;
+
+    return ret;
+}
+
+/*
+ * Discover @gfid on the children without locks and evaluate data and
+ * metadata split-brain, reporting through *d_spb and *m_spb.
+ *
+ * Returns 0 on success, -EAGAIN when
+ * afr_can_decide_split_brain_source_sinks() rejects the replies, or the
+ * error from discovery / _afr_is_split_brain(). The collected replies are
+ * wiped before returning.
+ */
+int
+afr_is_split_brain(call_frame_t *frame, xlator_t *this, inode_t *inode,
+                   uuid_t gfid, gf_boolean_t *d_spb, gf_boolean_t *m_spb)
+{
+    afr_private_t *priv = this->private;
+    struct afr_reply *replies = NULL;
+    int ret = -1;
+
+    replies = alloca0(sizeof(*replies) * priv->child_count);
+
+    ret = afr_selfheal_unlocked_discover(frame, inode, gfid, replies);
+    if (ret == 0) {
+        if (!afr_can_decide_split_brain_source_sinks(replies,
+                                                     priv->child_count)) {
+            ret = -EAGAIN;
+        } else {
+            ret = _afr_is_split_brain(frame, this, replies,
+                                      AFR_DATA_TRANSACTION, d_spb);
+            /* Metadata is only examined once the data check succeeded. */
+            if (ret == 0)
+                ret = _afr_is_split_brain(frame, this, replies,
+                                          AFR_METADATA_TRANSACTION, m_spb);
+        }
+    }
+
+    if (replies)
+        afr_replies_wipe(replies, priv->child_count);
+
+    return ret;
+}
+
+/*
+ * Completion callback paired with afr_get_split_brain_status(): releases the
+ * @opaque argument and ignores @ret and @frame. Always returns 0.
+ * NOTE(review): presumably @opaque is the afr_spb_status_t handed to the
+ * task function -- confirm against the caller that schedules it.
+ */
+int
+afr_get_split_brain_status_cbk(int ret, call_frame_t *frame, void *opaque)
+{
+    GF_FREE(opaque);
+    return 0;
+}
+
+/*
+ * Build the GF_AFR_SBRAIN_STATUS reply for the inode named by data->loc and
+ * unwind data->frame with getxattr.
+ *
+ * @opaque is an afr_spb_status_t carrying the frame and loc. When data or
+ * metadata split-brain is detected the status string lists both verdicts
+ * followed by "    Choices:" and the comma separated child names; otherwise
+ * SFILE_NOT_UNDER_DATA is set. If split-brain cannot be decided (-EAGAIN),
+ * SBRAIN_HEAL_NO_GO_MSG is placed in the dict and the fop still fails.
+ *
+ * Returns 0 on success, -1 on failure (same value as the unwound op_ret).
+ */
+int
+afr_get_split_brain_status(void *opaque)
+{
+    gf_boolean_t d_spb = _gf_false;
+    gf_boolean_t m_spb = _gf_false;
+    int ret = -1;
+    int op_errno = 0;
+    int i = 0;
+    char *choices = NULL;
+    char *status = NULL;
+    dict_t *dict = NULL;
+    inode_t *inode = NULL;
+    afr_private_t *priv = NULL;
+    xlator_t **children = NULL;
+    call_frame_t *frame = NULL;
+    xlator_t *this = NULL;
+    loc_t *loc = NULL;
+    afr_spb_status_t *data = NULL;
+
+    data = opaque;
+    frame = data->frame;
+    this = frame->this;
+    loc = data->loc;
+    priv = this->private;
+    children = priv->children;
+
+    /* NOTE(review): on failure here the fop unwinds with ret == -1 but
+     * op_errno still 0 -- confirm whether a specific errno is expected. */
+    inode = afr_inode_find(this, loc->gfid);
+    if (!inode)
+        goto out;
+
+    dict = dict_new();
+    if (!dict) {
+        op_errno = ENOMEM;
+        ret = -1;
+        goto out;
+    }
+
+    /* Calculation for string length :
+     * (child_count X length of child-name) + SLEN("    Choices :")
+     * child-name consists of :
+     * a) 251 = max characters for volname according to GD_VOLUME_NAME_MAX
+     * b) strlen("-client-00,") assuming 16 replicas
+     */
+    choices = alloca0(priv->child_count * (256 + SLEN("-client-00,")) +
+                      SLEN("    Choices:"));
+
+    ret = afr_is_split_brain(frame, this, inode, loc->gfid, &d_spb, &m_spb);
+    if (ret) {
+        op_errno = -ret;
+        if (ret == -EAGAIN) {
+            /* Undecidable: explain why in the dict; the fop still fails. */
+            ret = dict_set_sizen_str_sizen(dict, GF_AFR_SBRAIN_STATUS,
+                                           SBRAIN_HEAL_NO_GO_MSG);
+            if (ret) {
+                gf_msg(this->name, GF_LOG_WARNING, -ret,
+                       AFR_MSG_DICT_SET_FAILED,
+                       "Failed to set GF_AFR_SBRAIN_STATUS in dict");
+            }
+        }
+        ret = -1;
+        goto out;
+    }
+
+    if (d_spb || m_spb) {
+        sprintf(choices, "    Choices:");
+        for (i = 0; i < priv->child_count; i++) {
+            strcat(choices, children[i]->name);
+            strcat(choices, ",");
+        }
+        /* Drop the trailing comma left by the loop. */
+        choices[strlen(choices) - 1] = '\0';
+
+        ret = gf_asprintf(&status,
+                          "data-split-brain:%s    "
+                          "metadata-split-brain:%s%s",
+                          (d_spb) ? "yes" : "no", (m_spb) ? "yes" : "no",
+                          choices);
+
+        if (-1 == ret) {
+            op_errno = ENOMEM;
+            goto out;
+        }
+        ret = dict_set_dynstr_sizen(dict, GF_AFR_SBRAIN_STATUS, status);
+        if (ret) {
+            op_errno = -ret;
+            ret = -1;
+            goto out;
+        }
+    } else {
+        ret = dict_set_sizen_str_sizen(dict, GF_AFR_SBRAIN_STATUS,
+                                       SFILE_NOT_UNDER_DATA);
+        if (ret) {
+            op_errno = -ret;
+            ret = -1;
+            goto out;
+        }
+    }
+
+    ret = 0;
+out:
+    /* Every path, success or failure, unwinds the getxattr exactly once. */
+    AFR_STACK_UNWIND(getxattr, frame, ret, op_errno, dict, NULL);
+    if (dict)
+        dict_unref(dict);
+    if (inode)
+        inode_unref(inode);
+    return ret;
+}
+
+/*
+ * Run self-heal on loc->gfid on behalf of a getxattr or setxattr request and
+ * unwind @frame with the outcome.
+ *
+ * The heal runs on a temporary frame from afr_frame_create() that borrows
+ * frame->local; the borrowed local is handed back before the temporary
+ * frame is destroyed. A heal result of 1 or 2 is reported as
+ * SFILE_NOT_IN_SPLIT_BRAIN through the "sh-fail-msg" key; otherwise any
+ * xdata_rsp left on the local by self-heal is copied into the reply dict.
+ *
+ * Returns the op_ret that was unwound. Only GF_FOP_GETXATTR and
+ * GF_FOP_SETXATTR are unwound here.
+ */
+int32_t
+afr_heal_splitbrain_file(call_frame_t *frame, xlator_t *this, loc_t *loc)
+{
+    int ret = 0;
+    int op_errno = 0;
+    dict_t *dict = NULL;
+    afr_local_t *local = NULL;
+    afr_local_t *heal_local = NULL;
+    call_frame_t *heal_frame = NULL;
+
+    local = frame->local;
+    dict = dict_new();
+    if (!dict) {
+        op_errno = ENOMEM;
+        ret = -1;
+        goto out;
+    }
+
+    heal_frame = afr_frame_create(this, &op_errno);
+    if (!heal_frame) {
+        ret = -1;
+        goto out;
+    }
+    /* Remember the frame's own local so it can be restored at 'out'. */
+    heal_local = heal_frame->local;
+    heal_frame->local = frame->local;
+    /*Initiate heal with heal_frame with lk-owner set so that inodelk/entrylk
+     * work correctly*/
+    ret = afr_selfheal_do(heal_frame, this, loc->gfid);
+
+    if (ret == 1 || ret == 2) {
+        ret = dict_set_sizen_str_sizen(dict, "sh-fail-msg",
+                                       SFILE_NOT_IN_SPLIT_BRAIN);
+        if (ret)
+            gf_msg(this->name, GF_LOG_WARNING, -ret, AFR_MSG_DICT_SET_FAILED,
+                   "Failed to set sh-fail-msg in dict");
+        /* A dict_set failure is only logged; the fop still succeeds. */
+        ret = 0;
+        goto out;
+    } else {
+        if (local->xdata_rsp) {
+            /* 'sh-fail-msg' has been set in the dict during self-heal.*/
+            dict_copy(local->xdata_rsp, dict);
+            ret = 0;
+        } else if (ret < 0) {
+            op_errno = -ret;
+            ret = -1;
+        }
+    }
+
+out:
+    if (heal_frame) {
+        /* Give the borrowed local back before tearing the frame down. */
+        heal_frame->local = heal_local;
+        AFR_STACK_DESTROY(heal_frame);
+    }
+    if (local->op == GF_FOP_GETXATTR)
+        AFR_STACK_UNWIND(getxattr, frame, ret, op_errno, dict, NULL);
+    else if (local->op == GF_FOP_SETXATTR)
+        AFR_STACK_UNWIND(setxattr, frame, ret, op_errno, NULL);
+    if (dict)
+        dict_unref(dict);
+    return ret;
+}
+
+/*
+ * Look up the child whose xlator name equals @name.
+ *
+ * Returns its index in priv->children, or -1 when no child matches.
+ */
+int
+afr_get_child_index_from_name(xlator_t *this, char *name)
+{
+    afr_private_t *priv = this->private;
+    int child = 0;
+
+    for (child = 0; child < priv->child_count; child++) {
+        if (strcmp(priv->children[child]->name, name) == 0)
+            return child;
+    }
+
+    return -1;
+}
+
+/*
+ * Store @need_heal in priv->need_heal while holding priv->lock.
+ */
+void
+afr_priv_need_heal_set(afr_private_t *priv, gf_boolean_t need_heal)
+{
+    LOCK(&priv->lock);
+    {
+        priv->need_heal = need_heal;
+    }
+    UNLOCK(&priv->lock);
+}
+
+/*
+ * Recompute priv->need_heal from @local: it becomes true when at least one
+ * valid reply has its need_heal flag set, false otherwise.
+ */
+void
+afr_set_need_heal(xlator_t *this, afr_local_t *local)
+{
+    afr_private_t *priv = this->private;
+    gf_boolean_t found = _gf_false;
+    int child = 0;
+
+    /* Stop scanning at the first reply that asks for heal. */
+    for (child = 0; !found && child < priv->child_count; child++) {
+        if (local->replies[child].valid && local->replies[child].need_heal)
+            found = _gf_true;
+    }
+
+    afr_priv_need_heal_set(priv, found);
+}
+
+/*
+ * Return the current value of priv->need_heal, read under priv->lock.
+ */
+gf_boolean_t
+afr_get_need_heal(xlator_t *this)
+{
+    afr_private_t *priv = this->private;
+    gf_boolean_t need_heal = _gf_true;
+
+    LOCK(&priv->lock);
+    {
+        need_heal = priv->need_heal;
+    }
+    UNLOCK(&priv->lock);
+    return need_heal;
+}
+
+/*
+ * Map a brick operation name to its status message id.
+ *
+ * Returns AFR_MSG_REPLACE_BRICK_STATUS for GF_AFR_REPLACE_BRICK,
+ * AFR_MSG_ADD_BRICK_STATUS for GF_AFR_ADD_BRICK, and -1 for anything else.
+ */
+int
+afr_get_msg_id(char *op_type)
+{
+    int msg_id = -1;
+
+    if (strcmp(op_type, GF_AFR_REPLACE_BRICK) == 0)
+        msg_id = AFR_MSG_REPLACE_BRICK_STATUS;
+    else if (strcmp(op_type, GF_AFR_ADD_BRICK) == 0)
+        msg_id = AFR_MSG_ADD_BRICK_STATUS;
+
+    return msg_id;
+}
+
+/*
+ * Completion callback for afr_fav_child_reset_sink_xattrs(): restarts the
+ * inode refresh on the transaction frame recorded in the heal frame's local
+ * and then destroys the heal frame. @ret and @opaque are not used.
+ * Always returns 0.
+ */
+int
+afr_fav_child_reset_sink_xattrs_cbk(int ret, call_frame_t *heal_frame,
+                                    void *opaque)
+{
+    call_frame_t *txn_frame = NULL;
+    afr_local_t *local = NULL;
+    afr_local_t *heal_local = NULL;
+    xlator_t *this = NULL;
+
+    heal_local = heal_frame->local;
+    txn_frame = heal_local->heal_frame;
+    local = txn_frame->local;
+    this = txn_frame->this;
+
+    /* Refresh the inode again and proceed with the transaction.*/
+    afr_inode_refresh(txn_frame, this, local->inode, NULL, local->refreshfn);
+
+    AFR_STACK_DESTROY(heal_frame);
+
+    return 0;
+}
+
+/*
+ * Task body that resets sink xattrs for the transaction's inode.
+ *
+ * @opaque is the heal frame; its local points back at the transaction frame
+ * whose replies are examined. For each of data and metadata found to be in
+ * split-brain, the matching inodelk range is taken on this->name (0,0 for
+ * data; LLONG_MAX - 1,0 for metadata) and, only if every child was locked,
+ * the corresponding __afr_selfheal_*_prepare() helper is run. The lock is
+ * released in all cases.
+ *
+ * Returns the result of the last step that ran.
+ */
+int
+afr_fav_child_reset_sink_xattrs(void *opaque)
+{
+    call_frame_t *heal_frame = NULL;
+    call_frame_t *txn_frame = NULL;
+    xlator_t *this = NULL;
+    gf_boolean_t d_spb = _gf_false;
+    gf_boolean_t m_spb = _gf_false;
+    afr_local_t *heal_local = NULL;
+    afr_local_t *txn_local = NULL;
+    afr_private_t *priv = NULL;
+    inode_t *inode = NULL;
+    unsigned char *locked_on = NULL;
+    unsigned char *sources = NULL;
+    unsigned char *sinks = NULL;
+    unsigned char *healed_sinks = NULL;
+    unsigned char *undid_pending = NULL;
+    struct afr_reply *locked_replies = NULL;
+    int ret = 0;
+
+    heal_frame = (call_frame_t *)opaque;
+    heal_local = heal_frame->local;
+    txn_frame = heal_local->heal_frame;
+    txn_local = txn_frame->local;
+    this = txn_frame->this;
+    inode = txn_local->inode;
+    priv = this->private;
+    locked_on = alloca0(priv->child_count);
+    sources = alloca0(priv->child_count);
+    sinks = alloca0(priv->child_count);
+    healed_sinks = alloca0(priv->child_count);
+    undid_pending = alloca0(priv->child_count);
+    locked_replies = alloca0(sizeof(*locked_replies) * priv->child_count);
+
+    /* Only the split-brain flags matter here; a failing check simply
+     * leaves its flag false. */
+    ret = _afr_is_split_brain(txn_frame, this, txn_local->replies,
+                              AFR_DATA_TRANSACTION, &d_spb);
+
+    ret = _afr_is_split_brain(txn_frame, this, txn_local->replies,
+                              AFR_METADATA_TRANSACTION, &m_spb);
+
+    /* Take appropriate locks and reset sink xattrs. */
+    if (d_spb) {
+        ret = afr_selfheal_inodelk(heal_frame, this, inode, this->name, 0, 0,
+                                   locked_on);
+        {
+            /* Proceed only when every child granted the lock. */
+            if (ret < priv->child_count)
+                goto data_unlock;
+            ret = __afr_selfheal_data_prepare(
+                heal_frame, this, inode, locked_on, sources, sinks,
+                healed_sinks, undid_pending, locked_replies, NULL);
+        }
+    data_unlock:
+        afr_selfheal_uninodelk(heal_frame, this, inode, this->name, 0, 0,
+                               locked_on);
+    }
+
+    if (m_spb) {
+        memset(locked_on, 0, sizeof(*locked_on) * priv->child_count);
+        memset(undid_pending, 0, sizeof(*undid_pending) * priv->child_count);
+        ret = afr_selfheal_inodelk(heal_frame, this, inode, this->name,
+                                   LLONG_MAX - 1, 0, locked_on);
+        {
+            /* Proceed only when every child granted the lock. */
+            if (ret < priv->child_count)
+                goto mdata_unlock;
+            ret = __afr_selfheal_metadata_prepare(
+                heal_frame, this, inode, locked_on, sources, sinks,
+                healed_sinks, undid_pending, locked_replies, NULL);
+        }
+    mdata_unlock:
+        afr_selfheal_uninodelk(heal_frame, this, inode, this->name,
+                               LLONG_MAX - 1, 0, locked_on);
+    }
+
+    /* The array itself lives on the stack, but the prepare helpers may have
+     * stored referenced dicts in it; release them so they are not leaked. */
+    afr_replies_wipe(locked_replies, priv->child_count);
+
+    return ret;
+}
+
+/*
+ * Concatenates the xattrs in local->replies separated by a delimiter.
+ *
+ * For every child, the value of local->cont.getxattr.name from that child's
+ * reply is appended to @buf; children with an invalid or failed reply
+ * contribute @default_str instead. Values are separated by @delimiter and
+ * the result is NUL terminated without a trailing delimiter.
+ *
+ * @buf is appended to with strncat() while the write offset is tracked from
+ * zero, so it must hold an empty string on entry and be large enough for
+ * the whole result. NOTE(review): the required size is not checked here --
+ * callers are presumably expected to size it up front.
+ *
+ * When @serz_len is not NULL it receives the serialized length including
+ * the terminating NUL.
+ *
+ * Returns 0 on success, or the dict_get_strn() error when a successful
+ * reply lacks the xattr (in which case @buf holds a partial result).
+ */
+int
+afr_serialize_xattrs_with_delimiter(call_frame_t *frame, xlator_t *this,
+                                    char *buf, const char *default_str,
+                                    int32_t *serz_len, char delimiter)
+{
+    afr_private_t *priv = NULL;
+    afr_local_t *local = NULL;
+    char *xattr = NULL;
+    int i = 0;
+    int len = 0;
+    int keylen = 0;
+    size_t str_len = 0;
+    int ret = -1;
+
+    priv = this->private;
+    local = frame->local;
+
+    keylen = strlen(local->cont.getxattr.name);
+    for (i = 0; i < priv->child_count; i++) {
+        if (!local->replies[i].valid || local->replies[i].op_ret) {
+            str_len = strlen(default_str);
+            buf = strncat(buf, default_str, str_len);
+            len += str_len;
+            buf[len++] = delimiter;
+            buf[len] = '\0';
+        } else {
+            ret = dict_get_strn(local->replies[i].xattr,
+                                local->cont.getxattr.name, keylen, &xattr);
+            if (ret) {
+                /* Log under this xlator's name rather than the leftover
+                 * hard-coded "TEST" domain so the message can be traced. */
+                gf_msg(this->name, GF_LOG_ERROR, -ret,
+                       AFR_MSG_DICT_GET_FAILED,
+                       "Failed to get the node_uuid of brick "
+                       "%d",
+                       i);
+                goto out;
+            }
+            str_len = strlen(xattr);
+            buf = strncat(buf, xattr, str_len);
+            len += str_len;
+            buf[len++] = delimiter;
+            buf[len] = '\0';
+        }
+    }
+    buf[--len] = '\0'; /*remove the last delimiter*/
+    if (serz_len)
+        *serz_len = ++len; /* count the terminating NUL as well */
+    ret = 0;
+
+out:
+    return ret;
+}
+
+/*
+ * Return the write_subvol value stored in the inode context attached to
+ * frame->local, read under the inode lock.
+ */
+uint64_t
+afr_write_subvol_get(call_frame_t *frame, xlator_t *this)
+{
+    afr_local_t *local = NULL;
+    uint64_t write_subvol = 0;
+
+    local = frame->local;
+    LOCK(&local->inode->lock);
+    write_subvol = local->inode_ctx->write_subvol;
+    UNLOCK(&local->inode->lock);
+
+    return write_subvol;
+}
+
+/*
+ * Record the current readable children in the inode context's write_subvol,
+ * but only if it is still 0 and the transaction is a data transaction.
+ *
+ * The stored 64-bit value packs, from the least significant bit:
+ *   bits  0-15  bitmap of metadata-readable children,
+ *   bits 16-31  bitmap of data-readable children,
+ *   bits 32+    local->event_generation.
+ *
+ * Always returns 0.
+ */
+int
+afr_write_subvol_set(call_frame_t *frame, xlator_t *this)
+{
+    afr_local_t *local = NULL;
+    afr_private_t *priv = NULL;
+    unsigned char *data_accused = NULL;
+    unsigned char *metadata_accused = NULL;
+    unsigned char *data_readable = NULL;
+    unsigned char *metadata_readable = NULL;
+    uint16_t datamap = 0;
+    uint16_t metadatamap = 0;
+    uint64_t val = 0;
+    int event = 0;
+    int i = 0;
+
+    local = frame->local;
+    priv = this->private;
+    data_accused = alloca0(priv->child_count);
+    metadata_accused = alloca0(priv->child_count);
+    data_readable = alloca0(priv->child_count);
+    metadata_readable = alloca0(priv->child_count);
+    event = local->event_generation;
+
+    afr_readables_fill(frame, this, local->inode, data_accused,
+                       metadata_accused, data_readable, metadata_readable,
+                       NULL);
+
+    /* One bit per child; the maps are 16 bits wide. */
+    for (i = 0; i < priv->child_count; i++) {
+        if (data_readable[i])
+            datamap |= (1 << i);
+        if (metadata_readable[i])
+            metadatamap |= (1 << i);
+    }
+
+    val = ((uint64_t)metadatamap) | (((uint64_t)datamap) << 16) |
+          (((uint64_t)event) << 32);
+
+    LOCK(&local->inode->lock);
+    {
+        /* Never overwrite a value that is already set. */
+        if (local->inode_ctx->write_subvol == 0 &&
+            local->transaction.type == AFR_DATA_TRANSACTION) {
+            local->inode_ctx->write_subvol = val;
+        }
+    }
+    UNLOCK(&local->inode->lock);
+
+    return 0;
+}
+
+/*
+ * Drop one count from the inode context's lock_count and, when it reaches
+ * zero, clear write_subvol. Both updates happen under the inode lock.
+ * Always returns 0.
+ */
+int
+afr_write_subvol_reset(call_frame_t *frame, xlator_t *this)
+{
+    afr_local_t *local = NULL;
+
+    local = frame->local;
+    LOCK(&local->inode->lock);
+    {
+        /* A reset without a matching count would underflow lock_count. */
+        GF_ASSERT(local->inode_ctx->lock_count > 0);
+        local->inode_ctx->lock_count--;
+
+        if (!local->inode_ctx->lock_count)
+            local->inode_ctx->write_subvol = 0;
+    }
+    UNLOCK(&local->inode->lock);
+
+    return 0;
+}
+
+/*
+ * Attach @inode to @local: takes a reference on the inode and fetches its
+ * AFR inode context into local->inode_ctx under the inode lock.
+ *
+ * Returns the result of __afr_inode_ctx_get(); a negative value is logged.
+ * The inode reference is kept in local->inode even when that call fails.
+ */
+int
+afr_set_inode_local(xlator_t *this, afr_local_t *local, inode_t *inode)
+{
+    int ret = 0;
+
+    local->inode = inode_ref(inode);
+    LOCK(&local->inode->lock);
+    {
+        ret = __afr_inode_ctx_get(this, local->inode, &local->inode_ctx);
+    }
+    UNLOCK(&local->inode->lock);
+    if (ret < 0) {
+        gf_msg_callingfn(
+            this->name, GF_LOG_ERROR, ENOMEM, AFR_MSG_INODE_CTX_GET_FAILED,
+            "Error getting inode ctx %s", uuid_utoa(local->inode->gfid));
+    }
+    return ret;
+}
+
+/*
+ * Tell whether the caller is running inside a synctask whose frame carries
+ * the lk-owner derived from @this.
+ *
+ * Returns _gf_false when there is no current synctask or its lk-owner
+ * differs, _gf_true otherwise.
+ */
+gf_boolean_t
+afr_ta_is_fop_called_from_synctask(xlator_t *this)
+{
+    gf_lkowner_t expected_owner = {
+        0,
+    };
+    struct synctask *current = synctask_get();
+
+    if (current == NULL)
+        return _gf_false;
+
+    set_lk_owner_from_ptr(&expected_owner, (void *)this);
+
+    return is_same_lkowner(&expected_owner,
+                           &current->frame->root->lk_owner)
+               ? _gf_true
+               : _gf_false;
+}
+
+/*
+ * Take the two thin-arbiter locks needed for a post-op on @loc.
+ *
+ * First the AFR_TA_DOM_NOTIFY lock: the self-heal daemon blocks (F_SETLKW)
+ * on the full range, while clients try (F_SETLK) a one byte range at an
+ * offset derived from a freshly generated uuid and retry with a new offset
+ * on -EAGAIN. The offset that was locked is remembered in
+ * priv->ta_notify_dom_lock_offset. Then the AFR_TA_DOM_MODIFY lock is taken
+ * on the full range with F_SETLKW.
+ *
+ * If the MODIFY lock cannot be obtained, the NOTIFY lock is released again
+ * and the MODIFY error is returned.
+ *
+ * Returns 0 when both locks are held, a negative errno otherwise.
+ */
+int
+afr_ta_post_op_lock(xlator_t *this, loc_t *loc)
+{
+    int ret = 0;
+    int unlock_ret = 0;
+    uuid_t gfid = {
+        0,
+    };
+    afr_private_t *priv = this->private;
+    gf_boolean_t locked = _gf_false;
+    struct gf_flock flock1 = {
+        0,
+    };
+    struct gf_flock flock2 = {
+        0,
+    };
+    int32_t cmd = 0;
+
+    /* Clients must take AFR_TA_DOM_NOTIFY lock only when the previous lock
+     * has been released in afr_notify due to upcall notification from shd.
+     */
+    GF_ASSERT(priv->ta_notify_dom_lock_offset == 0);
+
+    if (!priv->shd.iamshd)
+        GF_ASSERT(afr_ta_is_fop_called_from_synctask(this));
+    flock1.l_type = F_WRLCK;
+
+    while (!locked) {
+        if (priv->shd.iamshd) {
+            cmd = F_SETLKW;
+            flock1.l_start = 0;
+            flock1.l_len = 0;
+        } else {
+            cmd = F_SETLK;
+            gf_uuid_generate(gfid);
+            flock1.l_start = gfid_to_ino(gfid);
+            /* Lock offsets must not be negative. */
+            if (flock1.l_start < 0)
+                flock1.l_start = -flock1.l_start;
+            flock1.l_len = 1;
+        }
+        ret = syncop_inodelk(priv->children[THIN_ARBITER_BRICK_INDEX],
+                             AFR_TA_DOM_NOTIFY, loc, cmd, &flock1, NULL, NULL);
+        if (!ret) {
+            locked = _gf_true;
+            priv->ta_notify_dom_lock_offset = flock1.l_start;
+        } else if (ret == -EAGAIN) {
+            /* Try again; clients pick a new random offset each round. */
+            continue;
+        } else {
+            gf_msg(this->name, GF_LOG_ERROR, -ret, AFR_MSG_THIN_ARB,
+                   "Failed to get "
+                   "AFR_TA_DOM_NOTIFY lock on %s.",
+                   loc->name);
+            goto out;
+        }
+    }
+
+    flock2.l_type = F_WRLCK;
+    flock2.l_start = 0;
+    flock2.l_len = 0;
+    ret = syncop_inodelk(priv->children[THIN_ARBITER_BRICK_INDEX],
+                         AFR_TA_DOM_MODIFY, loc, F_SETLKW, &flock2, NULL, NULL);
+    if (ret) {
+        gf_msg(this->name, GF_LOG_ERROR, -ret, AFR_MSG_THIN_ARB,
+               "Failed to get AFR_TA_DOM_MODIFY lock on %s.", loc->name);
+        /* Roll back the NOTIFY lock, but keep 'ret' as the MODIFY failure:
+         * overwriting it with a successful unlock would make the caller
+         * believe both locks are held.
+         * NOTE(review): priv->ta_notify_dom_lock_offset is left as set
+         * above -- confirm whether it should be cleared on this path. */
+        flock1.l_type = F_UNLCK;
+        unlock_ret = syncop_inodelk(priv->children[THIN_ARBITER_BRICK_INDEX],
+                                    AFR_TA_DOM_NOTIFY, loc, F_SETLK, &flock1,
+                                    NULL, NULL);
+        if (unlock_ret) {
+            gf_msg(this->name, GF_LOG_ERROR, -unlock_ret, AFR_MSG_THIN_ARB,
+                   "Failed to unlock AFR_TA_DOM_NOTIFY lock.");
+        }
+    }
+out:
+    return ret;
+}
+
+/*
+ * Release the thin-arbiter locks after a post-op on @loc.
+ *
+ * The AFR_TA_DOM_MODIFY lock is always released. The AFR_TA_DOM_NOTIFY lock
+ * is released only when running as the self-heal daemon; clients keep it.
+ *
+ * Returns 0 on success or the negative errno of the unlock that failed.
+ */
+int
+afr_ta_post_op_unlock(xlator_t *this, loc_t *loc)
+{
+    afr_private_t *priv = this->private;
+    struct gf_flock flock = {
+        0,
+    };
+    int ret = 0;
+
+    if (!priv->shd.iamshd)
+        GF_ASSERT(afr_ta_is_fop_called_from_synctask(this));
+    flock.l_type = F_UNLCK;
+    flock.l_start = 0;
+    flock.l_len = 0;
+
+    ret = syncop_inodelk(priv->children[THIN_ARBITER_BRICK_INDEX],
+                         AFR_TA_DOM_MODIFY, loc, F_SETLK, &flock, NULL, NULL);
+    if (ret) {
+        gf_msg(this->name, GF_LOG_ERROR, -ret, AFR_MSG_THIN_ARB,
+               "Failed to unlock AFR_TA_DOM_MODIFY lock.");
+        goto out;
+    }
+
+    if (!priv->shd.iamshd)
+        /* Mounts (clients) will not release the AFR_TA_DOM_NOTIFY lock
+         * in post-op as they use it as a notification mechanism. When
+         * shd sends a lock request on TA during heal, the clients will
+         * receive a lock-contention upcall notification upon which they
+         * will release the AFR_TA_DOM_NOTIFY lock after completing the
+         * in flight I/O.*/
+        goto out;
+
+    /* Same full-range flock is reused for the NOTIFY domain. */
+    ret = syncop_inodelk(priv->children[THIN_ARBITER_BRICK_INDEX],
+                         AFR_TA_DOM_NOTIFY, loc, F_SETLK, &flock, NULL, NULL);
+    if (ret) {
+        gf_msg(this->name, GF_LOG_ERROR, -ret, AFR_MSG_THIN_ARB,
+               "Failed to unlock AFR_TA_DOM_NOTIFY lock.");
+    }
+out:
+    return ret;
+}
+
+/*
+ * Create a frame from this xlator's context pool whose lk-owner is derived
+ * from the xlator pointer itself.
+ *
+ * Returns the new frame, or NULL when create_frame() fails.
+ */
+call_frame_t *
+afr_ta_frame_create(xlator_t *this)
+{
+    call_frame_t *ta_frame = create_frame(this, this->ctx->pool);
+
+    if (ta_frame != NULL)
+        afr_set_lk_owner(ta_frame, this, (void *)this);
+
+    return ta_frame;
+}
+
+/*
+ * Thin-arbiter quorum check: quorum holds when two entries of
+ * local->child_up are set, or when exactly one is set and
+ * local->ta_child_up is also set.
+ */
+gf_boolean_t
+afr_ta_has_quorum(afr_private_t *priv, afr_local_t *local)
+{
+    int up_count = 0;
+
+    up_count = AFR_COUNT(local->child_up, priv->child_count);
+
+    switch (up_count) {
+        case 2:
+            return _gf_true;
+        case 1:
+            return local->ta_child_up ? _gf_true : _gf_false;
+        default:
+            return _gf_false;
+    }
+}
+
+/*
+ * Return _gf_true only when @frame belongs to the add-replica mount
+ * (pid GF_CLIENT_PID_ADD_REPLICA_MOUNT) and is a lookup on the root gfid.
+ */
+static gf_boolean_t
+afr_is_add_replica_mount_lookup_on_root(call_frame_t *frame)
+{
+    afr_local_t *local = NULL;
+
+    if (frame->root->pid != GF_CLIENT_PID_ADD_REPLICA_MOUNT)
+        return _gf_false;
+
+    local = frame->local;
+
+    if (local->op != GF_FOP_LOOKUP)
+        /* TODO:If the replica count is being increased on a plain distribute
+         * volume that was never mounted, we need to allow setxattr on '/' with
+         * GF_CLIENT_PID_NO_ROOT_SQUASH to accommodate for DHT layout setting */
+        return _gf_false;
+
+    if (local->inode == NULL)
+        return _gf_false;
+
+    if (!__is_root_gfid(local->inode->gfid))
+        return _gf_false;
+
+    return _gf_true;
+}
+
+/*
+ * Lookup quorum override: returns _gf_true when @frame is non-NULL, at
+ * least one child is up, and the frame is the add-replica mount's lookup on
+ * the root; _gf_false in every other case.
+ */
+gf_boolean_t
+afr_lookup_has_quorum(call_frame_t *frame, const unsigned int up_children_count)
+{
+    if (frame == NULL || up_children_count == 0)
+        return _gf_false;
+
+    return afr_is_add_replica_mount_lookup_on_root(frame) ? _gf_true
+                                                          : _gf_false;
+}
+
+/*
+ * Fail the fop recorded in frame->local when quorum is configured
+ * (priv->quorum_count non-zero) and the successful replies do not meet it.
+ *
+ * On failure op_ret becomes -1 and op_errno is taken from
+ * afr_final_errno(), falling back to afr_quorum_errno() when that is 0.
+ */
+void
+afr_handle_replies_quorum(call_frame_t *frame, xlator_t *this)
+{
+    afr_local_t *local = frame->local;
+    afr_private_t *priv = this->private;
+    unsigned char *succeeded = NULL;
+
+    succeeded = alloca0(priv->child_count);
+    afr_fill_success_replies(local, priv, succeeded);
+
+    /* Nothing to do without a quorum setting or when quorum is met. */
+    if (!priv->quorum_count || afr_has_quorum(succeeded, this, NULL))
+        return;
+
+    local->op_errno = afr_final_errno(local, priv);
+    if (!local->op_errno)
+        local->op_errno = afr_quorum_errno(priv);
+    local->op_ret = -1;
+}
+
+/*
+ * Check whether @dict holds a pending xattr for @child with any non-zero
+ * changelog counter.
+ *
+ * Returns _gf_true if one of the AFR_NUM_CHANGE_LOGS values stored under
+ * priv->pending_key[child] is non-zero; _gf_false if all are zero or the
+ * key is absent.
+ */
+gf_boolean_t
+afr_ta_dict_contains_pending_xattr(dict_t *dict, afr_private_t *priv, int child)
+{
+    int *changelog = NULL;
+    int idx = 0;
+
+    if (dict_get_ptr(dict, priv->pending_key[child], (void *)&changelog) != 0)
+        return _gf_false;
+
+    /* Not doing a ntoh32(pending) as we just want to check
+     * if it is non-zero or not. */
+    for (idx = 0; idx < AFR_NUM_CHANGE_LOGS; idx++) {
+        if (changelog[idx] != 0)
+            return _gf_true;
+    }
+
+    return _gf_false;
+}
diff --git a/xlators/cluster/afr/src/afr-dir-read.c b/xlators/cluster/afr/src/afr-dir-read.c
index fd3fc1ba733..f8bf8340dab 100644
--- a/xlators/cluster/afr/src/afr-dir-read.c
+++ b/xlators/cluster/afr/src/afr-dir-read.c
@@ -1,345 +1,346 @@
 /*
-   Copyright (c) 2007-2009 Z RESEARCH, Inc. <http://www.zresearch.com>
-   This file is part of GlusterFS.
-
-   GlusterFS is free software; you can redistribute it and/or modify
-   it under the terms of the GNU General Public License as published
-   by the Free Software Foundation; either version 3 of the License,
-   or (at your option) any later version.
-
-   GlusterFS is distributed in the hope that it will be useful, but
-   WITHOUT ANY WARRANTY; without even the implied warranty of
-   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
-   General Public License for more details.
-
-   You should have received a copy of the GNU General Public License
-   along with this program.  If not, see
-   <http://www.gnu.org/licenses/>.
-*/
+  Copyright (c) 2008-2012 Red Hat, Inc. <http://www.redhat.com>
+  This file is part of GlusterFS.
 
+  This file is licensed to you under your choice of the GNU Lesser
+  General Public License, version 3 or any later version (LGPLv3 or
+  later), or the GNU General Public License, version 2 (GPLv2), in all
+  cases as published by the Free Software Foundation.
+*/
 
 #include <libgen.h>
 #include <unistd.h>
-#include <fnmatch.h>
 #include <sys/time.h>
 #include <stdlib.h>
 #include <signal.h>
+#include <string.h>
 
-#ifndef _CONFIG_H
-#define _CONFIG_H
-#include "config.h"
-#endif
-
-#include "glusterfs.h"
-#include "dict.h"
-#include "xlator.h"
-#include "hashfn.h"
-#include "logging.h"
-#include "stack.h"
-#include "list.h"
-#include "call-stub.h"
-#include "defaults.h"
-#include "common-utils.h"
-#include "compat-errno.h"
-#include "compat.h"
+#include <glusterfs/glusterfs.h>
+#include <glusterfs/dict.h>
+#include <glusterfs/list.h>
+#include <glusterfs/common-utils.h>
+#include <glusterfs/compat-errno.h>
+#include <glusterfs/compat.h>
 
 #include "afr.h"
-
+#include "afr-transaction.h"
 
 int32_t
-afr_opendir_cbk (call_frame_t *frame, void *cookie,
-		 xlator_t *this, int32_t op_ret, int32_t op_errno,
-		 fd_t *fd)
+afr_opendir_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+                int32_t op_ret, int32_t op_errno, fd_t *fd, dict_t *xdata) /* per-child opendir reply; cookie carries the child index */
 {
-	afr_local_t * local  = NULL;
-
-	int call_count = -1;
-
-	LOCK (&frame->lock);
-	{
-		local = frame->local;
-
-		if (op_ret == 0)
-			local->op_ret = 0;
-
-		local->op_errno = op_errno;
-	}
-	UNLOCK (&frame->lock);
-
-	call_count = afr_frame_return (frame);
-
-	if (call_count == 0) {
-		AFR_STACK_UNWIND (frame, local->op_ret,
-				  local->op_errno, local->fd);
-	}
-
-	return 0;
+    afr_local_t *local = NULL;
+    int call_count = -1;
+    int32_t child_index = 0;
+    afr_fd_ctx_t *fd_ctx = NULL;
+
+    local = frame->local;
+    fd_ctx = local->fd_ctx; /* stored on local by afr_opendir() before winding */
+    child_index = (long)cookie;
+
+    local->replies[child_index].valid = 1; /* this child's slot is written outside the frame lock */
+    local->replies[child_index].op_ret = op_ret;
+    local->replies[child_index].op_errno = op_errno;
+
+    LOCK(&frame->lock);
+    {
+        if (op_ret == -1) {
+            local->op_errno = op_errno; /* errno of whichever failure is recorded last */
+            fd_ctx->opened_on[child_index] = AFR_FD_NOT_OPENED;
+        } else {
+            local->op_ret = op_ret; /* any success makes the aggregate op_ret a success */
+            fd_ctx->opened_on[child_index] = AFR_FD_OPENED;
+            if (!local->xdata_rsp && xdata) /* keep a ref on the first non-NULL xdata only */
+                local->xdata_rsp = dict_ref(xdata);
+        }
+        call_count = --local->call_count; /* replies still outstanding, read under the lock */
+    }
+    UNLOCK(&frame->lock);
+
+    if (call_count == 0) { /* last reply: apply quorum verdict, then answer the caller */
+        afr_handle_replies_quorum(frame, this);
+        AFR_STACK_UNWIND(opendir, frame, local->op_ret, local->op_errno,
+                         local->fd, NULL); /* NOTE(review): unwinds NULL xdata; local->xdata_rsp is not passed here */
+    }
+
+    return 0;
 }
 
-
-int32_t 
-afr_opendir (call_frame_t *frame, xlator_t *this,
-	     loc_t *loc, fd_t *fd)
+int
+afr_opendir(call_frame_t *frame, xlator_t *this, loc_t *loc, fd_t *fd,
+            dict_t *xdata) /* winds opendir to every child marked up; xdata itself is not forwarded */
 {
-	afr_private_t * priv        = NULL;
-	afr_local_t   * local       = NULL;
+    afr_private_t *priv = NULL;
+    afr_local_t *local = NULL;
+    int i = 0;
+    int call_count = -1;
+    int32_t op_errno = ENOMEM; /* default errno for the failure path at out: */
+    afr_fd_ctx_t *fd_ctx = NULL;
 
-	int             child_count = 0;
-	int             i           = 0;
+    priv = this->private;
 
-	int ret = -1;
-	int call_count = -1;
+    local = AFR_FRAME_INIT(frame, op_errno); /* NOTE(review): macro presumably may update op_errno on failure - confirm */
+    if (!local)
+        goto out;
 
-	int32_t         op_ret   = -1;
-	int32_t         op_errno = 0;
+    local->op = GF_FOP_OPENDIR;
 
-	VALIDATE_OR_GOTO (frame, out);
-	VALIDATE_OR_GOTO (this, out);
-	VALIDATE_OR_GOTO (this->private, out);
+    if (priv->quorum_count && !afr_has_quorum(local->child_up, this, NULL)) { /* refuse before winding when up children lack quorum */
+        op_errno = afr_quorum_errno(priv);
+        goto out;
+    }
 
-	priv = this->private;
+    if (!afr_is_consistent_io_possible(local, priv, &op_errno)) /* helper receives &op_errno and may set it */
+        goto out;
 
-	child_count = priv->child_count;
+    fd_ctx = afr_fd_ctx_get(fd, this);
+    if (!fd_ctx)
+        goto out; /* op_errno is not reassigned here; it keeps its current value */
 
-	ALLOC_OR_GOTO (local, afr_local_t, out);
-	ret = AFR_LOCAL_INIT (local, priv);
-	if (ret < 0) {
-		op_errno = -ret;
-		goto out;
-	}
+    loc_copy(&local->loc, loc);
 
-	frame->local = local;
-	local->fd    = fd_ref (fd);
+    local->fd = fd_ref(fd);
+    local->fd_ctx = fd_ctx; /* read back by afr_opendir_cbk() */
 
-	call_count = local->call_count;
-	
-	for (i = 0; i < child_count; i++) {
-		if (local->child_up[i]) {
-			STACK_WIND (frame, afr_opendir_cbk, 
-				    priv->children[i],
-				    priv->children[i]->fops->opendir,
-				    loc, fd);
+    call_count = local->call_count; /* NOTE(review): presumably the number of up children set during frame init - confirm */
 
-			if (!--call_count)
-				break;
-		}
-	}
+    for (i = 0; i < priv->child_count; i++) {
+        if (local->child_up[i]) {
+            STACK_WIND_COOKIE(frame, afr_opendir_cbk, (void *)(long)i,
+                              priv->children[i],
+                              priv->children[i]->fops->opendir, loc, fd, NULL); /* child index travels as the cookie */
 
-	op_ret = 0;
-out:
-	if (op_ret == -1) {
-		AFR_STACK_UNWIND (frame, op_ret, op_errno, fd);
-	}
+            if (!--call_count) /* stop once the expected number of winds was issued */
+                break;
+        }
+    }
 
-	return 0;
+    return 0;
+out:
+    AFR_STACK_UNWIND(opendir, frame, -1, op_errno, fd, NULL); /* failure before any wind: answer the caller directly */
+    return 0;
 }
 
+static int /* 0 if inode may be served from par_read_subvol, -1 otherwise */
+afr_validate_read_subvol(inode_t *inode, xlator_t *this, int par_read_subvol)
+{
+    int gen = 0; /* event generation reported for the inode */
+    int entry_read_subvol = 0;
+    unsigned char *data_readable = NULL; /* child_count bytes, indexed by child */
+    unsigned char *metadata_readable = NULL; /* child_count bytes, indexed by child */
+    afr_private_t *priv = NULL;
+
+    priv = this->private;
+    data_readable = alloca0(priv->child_count);
+    metadata_readable = alloca0(priv->child_count);
+
+    afr_inode_read_subvol_get(inode, this, data_readable, metadata_readable,
+                              &gen);
+
+    if (gen != priv->event_generation || !data_readable[par_read_subvol] ||
+        !metadata_readable[par_read_subvol])
+        return -1; /* generation mismatch, or parent's subvol not readable for data or metadata */
+
+    /* Once the control reaches the following statement, it means that the
+     * parent's read subvol is perfectly readable. So calling
+     * either afr_data_subvol_get() or afr_metadata_subvol_get() would
+     * yield the same result. Hence, choosing afr_data_subvol_get() below.
+     */
+
+    if (!priv->consistent_metadata)
+        return 0; /* readability alone is enough when consistent_metadata is off */
+
+    /* For an inode fetched through readdirp which is yet to be linked,
+     * inode ctx would not be initialised (yet). So this function returns
+     * -1 above due to gen being 0, which is why it is OK to pass NULL for
+     *  read_subvol_args here.
+     */
+    entry_read_subvol = afr_data_subvol_get(inode, this, NULL, NULL, NULL,
+                                            NULL);
+    if (entry_read_subvol != par_read_subvol)
+        return -1; /* the entry would be read from a different child than its parent */
+
+    return 0;
+}
 
-/**
- * Common algorithm for directory read calls:
- * 
- * - Try the fop on the first child that is up
- * - if we have failed due to ENOTCONN:
- *     try the next child
- *
- * Applicable to: readdir
- */
+static void /* moves the non-private entries of subvol_entries onto entries */
+afr_readdir_transform_entries(call_frame_t *frame, gf_dirent_t *subvol_entries,
+                              int subvol, gf_dirent_t *entries, fd_t *fd)
+{
+    int ret = -1;
+    gf_dirent_t *entry = NULL;
+    gf_dirent_t *tmp = NULL; /* second cursor for the _safe iterator: entries are unlinked while walking */
+    xlator_t *this = NULL;
+    afr_private_t *priv = NULL;
+    gf_boolean_t need_heal = _gf_false;
+    gf_boolean_t validate_subvol = _gf_false;
+
+    this = THIS;
+    priv = this->private;
+
+    need_heal = afr_get_need_heal(this);
+    validate_subvol = need_heal | priv->consistent_metadata; /* validate entry inodes if either flag is set */
+
+    list_for_each_entry_safe(entry, tmp, &subvol_entries->list, list)
+    {
+        if (afr_is_private_directory(priv, fd->inode->gfid, entry->d_name,
+                                     frame->root->pid)) {
+            continue; /* left on subvol_entries, so never handed to the caller */
+        }
+
+        list_del_init(&entry->list);
+        list_add_tail(&entry->list, &entries->list); /* appended at the tail, keeping the subvol's order */
+
+        if (!validate_subvol)
+            continue;
+
+        if (entry->inode) {
+            ret = afr_validate_read_subvol(entry->inode, this, subvol);
+            if (ret == -1) { /* inode cannot be vouched for from this subvol */
+                inode_unref(entry->inode);
+                entry->inode = NULL; /* the entry is still returned, only without its inode */
+                continue;
+            }
+        }
+    }
+}
 
 int32_t
-afr_readdir_cbk (call_frame_t *frame, void *cookie,
-		 xlator_t *this, int32_t op_ret, int32_t op_errno,
-		 gf_dirent_t *buf)
+afr_readdir_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+                int32_t op_ret, int32_t op_errno, gf_dirent_t *subvol_entries,
+                dict_t *xdata) /* reply handler for the readdir and readdirp winds; cookie is the subvol index */
 {
-	afr_private_t * priv     = NULL;
-	afr_local_t *   local    = NULL;
-	xlator_t **     children = NULL;
-
-	int unwind     = 1;
-	int last_tried = -1;
-	int this_try = -1;
+    afr_local_t *local = NULL;
+    gf_dirent_t entries; /* list head for the entries handed to the caller */
 
-	priv     = this->private;
-	children = priv->children;
+    INIT_LIST_HEAD(&entries.list);
 
-	local = frame->local;
+    local = frame->local;
 
-	if (op_ret == -1) {
-		last_tried = local->cont.readdir.last_tried;
+    if (op_ret < 0 && !local->cont.readdir.offset) {
+        /* failover only if this was the first readdir, detected
+           by offset == 0 */
+        local->op_ret = op_ret;
+        local->op_errno = op_errno;
 
-		if (all_tried (last_tried, priv->child_count)) {
-			goto out;
-		}
+        afr_read_txn_continue(frame, this, (long)cookie); /* hand over to the read transaction with the failed subvol index */
+        return 0;
+    }
 
-		this_try = ++local->cont.readdir.last_tried;
-		unwind = 0;
+    if (op_ret >= 0) /* on a failure with offset != 0, entries stays empty */
+        afr_readdir_transform_entries(frame, subvol_entries, (long)cookie,
+                                      &entries, local->fd);
 
-		STACK_WIND (frame, afr_readdir_cbk,
-			    children[this_try],
-			    children[this_try]->fops->readdir,
-			    local->fd, local->cont.readdir.size,
-			    local->cont.readdir.offset);
-	}
+    AFR_STACK_UNWIND(readdir, frame, op_ret, op_errno, &entries, xdata);
 
-out:
-	if (unwind) {
-		AFR_STACK_UNWIND (frame, op_ret, op_errno, buf);
-	}
+    gf_dirent_free(&entries); /* entries moved onto the local list are released here, after the unwind */
 
-	return 0;
+    return 0;
 }
 
+int /* always returns 0; results are delivered through the unwind or the wind callback */
+afr_readdir_wind(call_frame_t *frame, xlator_t *this, int subvol)
+{
+    afr_local_t *local = NULL;
+    afr_private_t *priv = NULL;
+    afr_fd_ctx_t *fd_ctx = NULL;
+
+    priv = this->private;
+    local = frame->local;
+    fd_ctx = afr_fd_ctx_get(local->fd, this);
+    if (!fd_ctx) { /* no fd context: report EINVAL through the unwind below */
+        local->op_errno = EINVAL;
+        local->op_ret = -1;
+    }
+
+    if (subvol == -1 || !fd_ctx) { /* subvol == -1: no subvolume was given to read from */
+        AFR_STACK_UNWIND(readdir, frame, local->op_ret, local->op_errno, 0, 0); /* the two 0s are the entries and xdata arguments */
+        return 0;
+    }
+
+    fd_ctx->readdir_subvol = subvol; /* remembered on the fd; afr_do_readdir() reads it for continued readdirs */
+
+    if (local->op == GF_FOP_READDIR) /* same callback and arguments for both fops; only the fop pointer differs */
+        STACK_WIND_COOKIE(frame, afr_readdir_cbk, (void *)(long)subvol,
+                          priv->children[subvol],
+                          priv->children[subvol]->fops->readdir, local->fd,
+                          local->cont.readdir.size, local->cont.readdir.offset,
+                          local->xdata_req);
+    else
+        STACK_WIND_COOKIE(frame, afr_readdir_cbk, (void *)(long)subvol,
+                          priv->children[subvol],
+                          priv->children[subvol]->fops->readdirp, local->fd,
+                          local->cont.readdir.size, local->cont.readdir.offset,
+                          local->xdata_req);
+    return 0;
+}
 
-int32_t
-afr_readdir (call_frame_t *frame, xlator_t *this,
-	     fd_t *fd, size_t size, off_t offset)
+int
+afr_do_readdir(call_frame_t *frame, xlator_t *this, fd_t *fd, size_t size,
+               off_t offset, int whichop, dict_t *dict) /* shared body of afr_readdir() and afr_readdirp(); whichop selects the fop */
 {
-	afr_private_t * priv       = NULL;
-	xlator_t **     children   = NULL;
-	int             call_child = 0;
-	afr_local_t     *local     = NULL;
-
-	int ret = -1;
-
-	int32_t op_ret   = -1;
-	int32_t op_errno = 0;
-
-	VALIDATE_OR_GOTO (frame, out);
-	VALIDATE_OR_GOTO (this, out);
-	VALIDATE_OR_GOTO (this->private, out);
-
-	priv     = this->private;
-	children = priv->children;
-
-	ALLOC_OR_GOTO (local, afr_local_t, out);
-	ret = AFR_LOCAL_INIT (local, priv);
-	if (ret < 0) {
-		op_errno = -ret;
-		goto out;
-	}
-						
-	frame->local = local;
-
-	call_child = afr_first_up_child (priv);
-	if (call_child == -1) {
-		op_errno = ENOTCONN;
-		gf_log (this->name, GF_LOG_ERROR,
-			"no child is up :(");
-		goto out;
-	}
-
-	local->cont.readdir.last_tried = call_child;
-
-	local->fd                  = fd_ref (fd);
-	local->cont.readdir.size   = size;
-	local->cont.readdir.offset = offset;
-
-	STACK_WIND (frame, afr_readdir_cbk,
-		    children[call_child], children[call_child]->fops->readdir,
-		    fd, size, offset);
-
-	op_ret = 0;
+    afr_local_t *local = NULL;
+    int32_t op_errno = 0;
+    int subvol = -1;
+    afr_fd_ctx_t *fd_ctx = NULL;
+
+    local = AFR_FRAME_INIT(frame, op_errno); /* NOTE(review): macro presumably sets op_errno on failure - confirm */
+    if (!local)
+        goto out;
+
+    fd_ctx = afr_fd_ctx_get(fd, this);
+    if (!fd_ctx) {
+        op_errno = EINVAL;
+        goto out;
+    }
+
+    local->op = whichop; /* checked by afr_readdir_wind() to pick readdir vs readdirp */
+    local->fd = fd_ref(fd);
+    local->cont.readdir.size = size;
+    local->cont.readdir.offset = offset;
+    local->xdata_req = (dict) ? dict_ref(dict) : NULL; /* take a ref only when a dict was supplied */
+
+    subvol = fd_ctx->readdir_subvol; /* subvolume recorded on the fd by afr_readdir_wind() */
+
+    if (offset == 0 || subvol == -1) {
+        /* First readdir has option of failing over and selecting
+           an appropriate read subvolume */
+        afr_read_txn(frame, this, fd->inode, afr_readdir_wind,
+                     AFR_DATA_TRANSACTION);
+    } else {
+        /* But continued readdirs MUST stick to the same subvolume
+           without an option to failover */
+        afr_readdir_wind(frame, this, subvol);
+    }
+
+    return 0;
 out:
-	if (op_ret == -1) {
-		AFR_STACK_UNWIND (frame, op_ret, op_errno, NULL);
-	}
-	return 0;
+    AFR_STACK_UNWIND(readdir, frame, -1, op_errno, NULL, NULL); /* the readdir unwind is used here for either value of whichop */
+    return 0;
 }
 
-
 int32_t
-afr_getdents_cbk (call_frame_t *frame, void *cookie,
-		  xlator_t *this, int32_t op_ret, int32_t op_errno,
-		  dir_entry_t *entry, int32_t count)
+afr_readdir(call_frame_t *frame, xlator_t *this, fd_t *fd, size_t size,
+            off_t offset, dict_t *xdata) /* readdir entry point: thin wrapper over afr_do_readdir() */
 {
-	afr_private_t * priv     = NULL;
-	afr_local_t *   local    = NULL;
-	xlator_t **     children = NULL;
-
-	int unwind     = 1;
-	int last_tried = -1;
-	int this_try = -1;
+    afr_do_readdir(frame, this, fd, size, offset, GF_FOP_READDIR, xdata); /* its return value is ignored */
 
-	priv     = this->private;
-	children = priv->children;
-
-	local = frame->local;
-
-	if (op_ret == -1) {
-		last_tried = local->cont.getdents.last_tried;
-
-		if (all_tried (last_tried, priv->child_count)) {
-			goto out;
-		}
-
-		this_try = ++local->cont.getdents.last_tried;
-		unwind = 0;
-
-		STACK_WIND (frame, afr_getdents_cbk,
-			    children[this_try],
-			    children[this_try]->fops->getdents,
-			    local->fd, local->cont.getdents.size,
-			    local->cont.getdents.offset, local->cont.getdents.flag);
-	}
-
-out:
-	if (unwind) {
-		AFR_STACK_UNWIND (frame, op_ret, op_errno, entry, count);
-	}
-
-	return 0;
+    return 0; /* always 0 */
 }
 
-
 int32_t
-afr_getdents (call_frame_t *frame, xlator_t *this,
-	      fd_t *fd, size_t size, off_t offset, int32_t flag)
+afr_readdirp(call_frame_t *frame, xlator_t *this, fd_t *fd, size_t size,
+             off_t offset, dict_t *dict) /* readdirp entry point: thin wrapper over afr_do_readdir() */
 {
-	afr_private_t * priv       = NULL;
-	xlator_t **     children   = NULL;
-	int             call_child = 0;
-	afr_local_t     *local     = NULL;
-
-	int32_t op_ret   = -1;
-	int32_t op_errno = 0;
-
-	VALIDATE_OR_GOTO (frame, out);
-	VALIDATE_OR_GOTO (this, out);
-	VALIDATE_OR_GOTO (this->private, out);
-
-	priv     = this->private;
-	children = priv->children;
+    afr_do_readdir(frame, this, fd, size, offset, GF_FOP_READDIRP, dict); /* its return value is ignored */
 
-	ALLOC_OR_GOTO (local, afr_local_t, out);
-
-	call_child = afr_first_up_child (priv);
-	if (call_child == -1) {
-		op_errno = ENOTCONN;
-		gf_log (this->name, GF_LOG_ERROR,
-			"no child is up :(");
-		goto out;
-	}
-
-	local->cont.getdents.last_tried = call_child;
-
-	local->fd                   = fd_ref (fd);
-
-	local->cont.getdents.size   = size;
-	local->cont.getdents.offset = offset;
-	local->cont.getdents.flag   = flag;
-	
-	frame->local = local;
-
-	STACK_WIND (frame, afr_getdents_cbk,
-		    children[call_child], children[call_child]->fops->getdents,
-		    fd, size, offset, flag);
-
-	op_ret = 0;
-out:
-	if (op_ret == -1) {
-		AFR_STACK_UNWIND (frame, op_ret, op_errno, NULL);
-	}
-
-	return 0;
+    return 0; /* always 0 */
 }
 
+int32_t /* always returns 0 */
+afr_releasedir(xlator_t *this, fd_t *fd)
+{
+    afr_cleanup_fd_ctx(this, fd); /* all the work is delegated to this helper */
 
+    return 0;
+}
diff --git a/xlators/cluster/afr/src/afr-dir-read.h b/xlators/cluster/afr/src/afr-dir-read.h
index 6d981fdfd47..773e925ec6c 100644
--- a/xlators/cluster/afr/src/afr-dir-read.h
+++ b/xlators/cluster/afr/src/afr-dir-read.h
@@ -1,47 +1,33 @@
 /*
-   Copyright (c) 2007-2009 Z RESEARCH, Inc. <http://www.zresearch.com>
-   This file is part of GlusterFS.
-
-   GlusterFS is free software; you can redistribute it and/or modify
-   it under the terms of the GNU General Public License as published
-   by the Free Software Foundation; either version 3 of the License,
-   or (at your option) any later version.
-
-   GlusterFS is distributed in the hope that it will be useful, but
-   WITHOUT ANY WARRANTY; without even the implied warranty of
-   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
-   General Public License for more details.
-
-   You should have received a copy of the GNU General Public License
-   along with this program.  If not, see
-   <http://www.gnu.org/licenses/>.
+  Copyright (c) 2008-2012 Red Hat, Inc. <http://www.redhat.com>
+  This file is part of GlusterFS.
+
+  This file is licensed to you under your choice of the GNU Lesser
+  General Public License, version 3 or any later version (LGPLv3 or
+  later), or the GNU General Public License, version 2 (GPLv2), in all
+  cases as published by the Free Software Foundation.
 */
 
 #ifndef __DIR_READ_H__
 #define __DIR_READ_H__
 
-
 int32_t
-afr_opendir (call_frame_t *frame, xlator_t *this,
-	     loc_t *loc, fd_t *fd);
+afr_opendir(call_frame_t *frame, xlator_t *this, loc_t *loc, fd_t *fd,
+            dict_t *xdata);
 
 int32_t
-afr_closedir (call_frame_t *frame, xlator_t *this,
-	      fd_t *fd);
+afr_releasedir(xlator_t *this, fd_t *fd);
 
 int32_t
-afr_readdir (call_frame_t *frame, xlator_t *this,
-	     fd_t *fd, size_t size, off_t offset);
-
+afr_readdir(call_frame_t *frame, xlator_t *this, fd_t *fd, size_t size,
+            off_t offset, dict_t *xdata);
 
 int32_t
-afr_getdents (call_frame_t *frame, xlator_t *this,
-	      fd_t *fd, size_t size, off_t offset, int32_t flag);
-
+afr_readdirp(call_frame_t *frame, xlator_t *this, fd_t *fd, size_t size,
+             off_t offset, dict_t *dict);
 
 int32_t
-afr_checksum (call_frame_t *frame, xlator_t *this,
-	      loc_t *loc, int32_t flags);
-
+afr_checksum(call_frame_t *frame, xlator_t *this, loc_t *loc, int32_t flags,
+             dict_t *xdata);
 
 #endif /* __DIR_READ_H__ */
diff --git a/xlators/cluster/afr/src/afr-dir-write.c b/xlators/cluster/afr/src/afr-dir-write.c
index af72c6440b6..b7cceb79158 100644
--- a/xlators/cluster/afr/src/afr-dir-write.c
+++ b/xlators/cluster/afr/src/afr-dir-write.c
@@ -1,1786 +1,1262 @@
 /*
-  Copyright (c) 2007-2009 Z RESEARCH, Inc. <http://www.zresearch.com>
+  Copyright (c) 2008-2012 Red Hat, Inc. <http://www.redhat.com>
   This file is part of GlusterFS.
 
-  GlusterFS is free software; you can redistribute it and/or modify
-  it under the terms of the GNU General Public License as published
-  by the Free Software Foundation; either version 3 of the License,
-  or (at your option) any later version.
-
-  GlusterFS is distributed in the hope that it will be useful, but
-  WITHOUT ANY WARRANTY; without even the implied warranty of
-  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
-  General Public License for more details.
-
-  You should have received a copy of the GNU General Public License
-  along with this program.  If not, see
-  <http://www.gnu.org/licenses/>.
+  This file is licensed to you under your choice of the GNU Lesser
+  General Public License, version 3 or any later version (LGPLv3 or
+  later), or the GNU General Public License, version 2 (GPLv2), in all
+  cases as published by the Free Software Foundation.
 */
 
-
 #include <libgen.h>
 #include <unistd.h>
-#include <fnmatch.h>
 #include <sys/time.h>
 #include <stdlib.h>
 #include <signal.h>
 
-#ifndef _CONFIG_H
-#define _CONFIG_H
-#include "config.h"
-#endif
-
-#include "glusterfs.h"
+#include <glusterfs/glusterfs.h>
 #include "afr.h"
-#include "dict.h"
-#include "xlator.h"
-#include "hashfn.h"
-#include "logging.h"
-#include "stack.h"
-#include "list.h"
-#include "call-stub.h"
-#include "defaults.h"
-#include "common-utils.h"
-#include "compat-errno.h"
-#include "compat.h"
+#include <glusterfs/dict.h>
+#include <glusterfs/logging.h>
+#include <glusterfs/list.h>
+#include <glusterfs/defaults.h>
+#include <glusterfs/common-utils.h>
+#include <glusterfs/compat-errno.h>
+#include <glusterfs/compat.h>
+#include <glusterfs/byte-order.h>
 
 #include "afr.h"
 #include "afr-transaction.h"
 
-
 void
-afr_build_parent_loc (loc_t *parent, loc_t *child)
-{
-	char *tmp = NULL;
-
-	if (!child->parent) {
-		loc_copy (parent, child);
-		return;
-	}
-
-	tmp = strdup (child->path);
-	parent->path   = strdup (dirname (tmp));
-	FREE (tmp);
+afr_mark_entry_pending_changelog(call_frame_t *frame, xlator_t *this);
 
-        parent->name   = strrchr (parent->path, '/');
-	if (parent->name)
-		parent->name++;
+int /* 0 on success; -1 on failure, with *op_errno set when op_errno is non-NULL */
+afr_build_parent_loc(loc_t *parent, loc_t *child, int32_t *op_errno)
+{
+    int ret = -1;
+    char *child_path = NULL; /* private copy of child->path, released at out: */
+
+    if (!child->parent) { /* without a parent inode there is nothing to build */
+        if (op_errno)
+            *op_errno = EINVAL;
+        goto out;
+    }
+
+    child_path = gf_strdup(child->path);
+    if (!child_path) {
+        if (op_errno)
+            *op_errno = ENOMEM;
+        goto out;
+    }
+
+    parent->path = gf_strdup(dirname(child_path)); /* dirname() may modify its argument, hence the copy above */
+    if (!parent->path) {
+        if (op_errno)
+            *op_errno = ENOMEM;
+        goto out;
+    }
+
+    parent->inode = inode_ref(child->parent); /* parent takes its own reference on the inode */
+    gf_uuid_copy(parent->gfid, child->pargfid);
+
+    ret = 0;
+out:
+    GF_FREE(child_path); /* NOTE(review): reached with NULL on the early exits; presumably GF_FREE accepts NULL - confirm */
 
-	parent->inode  = inode_ref (child->parent);
-	parent->parent = inode_parent (parent->inode, 0, NULL);
-	parent->ino    = parent->inode->ino;
+    return ret;
 }
 
-
-/* {{{ create */
-
-int
-afr_create_unwind (call_frame_t *frame, xlator_t *this)
+static void
+__afr_dir_write_finalize(call_frame_t *frame, xlator_t *this) /* folds the per-child replies into local->op_ret/op_errno, cont.dir_fop and xdata_rsp */
 {
-	call_frame_t *main_frame = NULL;
-	afr_local_t  *local = NULL;
-
-	local = frame->local;
-
-	LOCK (&frame->lock);
-	{
-		if (local->transaction.main_frame) {
-			main_frame = local->transaction.main_frame;
-		}
-		local->transaction.main_frame = NULL;
-	}
-	UNLOCK (&frame->lock);
-
-	if (main_frame)
-		AFR_STACK_UNWIND (main_frame, local->op_ret, local->op_errno,
-				  local->cont.create.fd,
-				  local->cont.create.inode,
-				  &local->cont.create.buf);
-	return 0;
+    afr_local_t *local = NULL;
+    afr_private_t *priv = NULL;
+    int inode_read_subvol = -1; /* stays -1 when local->inode is NULL */
+    int parent_read_subvol = -1; /* stays -1 when local->parent is NULL */
+    int parent2_read_subvol = -1; /* stays -1 when local->parent2 is NULL */
+    int i = 0;
+    afr_read_subvol_args_t args = {
+        0,
+    };
+
+    local = frame->local;
+    priv = this->private;
+
+    for (i = 0; i < priv->child_count; i++) { /* seed args from the first valid, successful reply */
+        if (!local->replies[i].valid)
+            continue;
+        if (local->replies[i].op_ret == -1)
+            continue;
+        gf_uuid_copy(args.gfid, local->replies[i].poststat.ia_gfid);
+        args.ia_type = local->replies[i].poststat.ia_type;
+        break;
+    }
+
+    if (local->inode) {
+        if (local->op != GF_FOP_RENAME && local->op != GF_FOP_LINK) /* replies are not interpreted for rename and link */
+            afr_replies_interpret(frame, this, local->inode, NULL);
+
+        inode_read_subvol = afr_data_subvol_get(local->inode, this, NULL, NULL,
+                                                NULL, &args);
+    }
+
+    if (local->parent)
+        parent_read_subvol = afr_data_subvol_get(local->parent, this, NULL,
+                                                 local->readable, NULL, NULL);
+
+    if (local->parent2)
+        parent2_read_subvol = afr_data_subvol_get(local->parent2, this, NULL,
+                                                  local->readable2, NULL, NULL);
+
+    local->op_ret = -1; /* start from failure; the first successful reply below replaces it */
+    local->op_errno = afr_final_errno(local, priv);
+    afr_pick_error_xdata(local, priv, local->parent, local->readable,
+                         local->parent2, local->readable2);
+
+    for (i = 0; i < priv->child_count; i++) {
+        if (!local->replies[i].valid)
+            continue;
+        if (local->replies[i].op_ret < 0) { /* a failed child: flag every inode involved for refresh */
+            if (local->inode)
+                afr_inode_need_refresh_set(local->inode, this);
+            if (local->parent)
+                afr_inode_need_refresh_set(local->parent, this);
+            if (local->parent2)
+                afr_inode_need_refresh_set(local->parent2, this);
+            continue;
+        }
+
+        if (local->op_ret == -1) { /* first successful reply: adopt all of its results */
+            local->op_ret = local->replies[i].op_ret;
+            local->op_errno = local->replies[i].op_errno;
+
+            local->cont.dir_fop.buf = local->replies[i].poststat;
+            local->cont.dir_fop.preparent = local->replies[i].preparent;
+            local->cont.dir_fop.postparent = local->replies[i].postparent;
+            local->cont.dir_fop.prenewparent = local->replies[i].preparent2;
+            local->cont.dir_fop.postnewparent = local->replies[i].postparent2;
+            if (local->xdata_rsp) { /* drop whatever xdata was set before the success was found */
+                dict_unref(local->xdata_rsp);
+                local->xdata_rsp = NULL;
+            }
+
+            if (local->replies[i].xdata)
+                local->xdata_rsp = dict_ref(local->replies[i].xdata);
+            continue;
+        }
+
+        if (i == inode_read_subvol) { /* later successes override only the parts they are the read subvol for */
+            local->cont.dir_fop.buf = local->replies[i].poststat;
+            if (local->replies[i].xdata) {
+                if (local->xdata_rsp)
+                    dict_unref(local->xdata_rsp);
+                local->xdata_rsp = dict_ref(local->replies[i].xdata);
+            }
+        }
+
+        if (i == parent_read_subvol) {
+            local->cont.dir_fop.preparent = local->replies[i].preparent;
+            local->cont.dir_fop.postparent = local->replies[i].postparent;
+        }
+
+        if (i == parent2_read_subvol) {
+            local->cont.dir_fop.prenewparent = local->replies[i].preparent2;
+            local->cont.dir_fop.postnewparent = local->replies[i].postparent2;
+        }
+    }
 }
 
-
-int
-afr_create_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this, 
-		     int32_t op_ret, int32_t op_errno, 
-		     fd_t *fd, inode_t *inode, struct stat *buf)
+static void
+__afr_dir_write_fill(call_frame_t *frame, xlator_t *this, int child_index,
+                     int op_ret, int op_errno, struct iatt *poststat,
+                     struct iatt *preparent, struct iatt *postparent,
+                     struct iatt *preparent2, struct iatt *postparent2,
+                     dict_t *xdata) /* records one child's reply in local->replies[child_index]; iatt pointers may be NULL */
 {
-	afr_local_t *   local = NULL;
-	afr_private_t * priv  = NULL;
-
-	int call_count = -1;
-	int child_index = -1;
-
-	local = frame->local;
-	priv = this->private;
-
-	child_index = (long) cookie;
-
-	LOCK (&frame->lock);
-	{
-		if (child_went_down (op_ret, op_errno))
-			afr_transaction_child_died (frame, this, child_index);
-
-		if (op_ret != -1) {
-			local->op_ret = op_ret;
-
-			if ((local->success_count == 0)
-			    || (child_index == priv->read_child)) {
-				local->cont.create.buf        = *buf;
-				local->cont.create.buf.st_ino = 
-					afr_itransform (buf->st_ino,
-							priv->child_count,
-							child_index);
-			}
-			local->cont.create.inode = inode;
-
-			local->success_count++;
-		}
-
-		local->op_errno = op_errno;
-	}
-	UNLOCK (&frame->lock);
-
-	call_count = afr_frame_return (frame);
-
-	if (call_count == 0) {
-		local->transaction.unwind (frame, this);
-
-		local->transaction.resume (frame, this);
-	}
-	
-	return 0;
+    afr_local_t *local = NULL;
+    afr_fd_ctx_t *fd_ctx = NULL;
+
+    local = frame->local;
+    fd_ctx = local->fd_ctx; /* may be NULL; every use below is guarded */
+
+    local->replies[child_index].valid = 1;
+    local->replies[child_index].op_ret = op_ret;
+    local->replies[child_index].op_errno = op_errno;
+    if (xdata)
+        local->replies[child_index].xdata = dict_ref(xdata); /* the reply slot holds its own reference */
+
+    if (op_ret >= 0) { /* success: copy whichever iatts the fop supplied */
+        if (poststat)
+            local->replies[child_index].poststat = *poststat;
+        if (preparent)
+            local->replies[child_index].preparent = *preparent;
+        if (postparent)
+            local->replies[child_index].postparent = *postparent;
+        if (preparent2)
+            local->replies[child_index].preparent2 = *preparent2;
+        if (postparent2)
+            local->replies[child_index].postparent2 = *postparent2;
+        if (fd_ctx)
+            fd_ctx->opened_on[child_index] = AFR_FD_OPENED;
+    } else {
+        if (op_errno != ENOTEMPTY) /* ENOTEMPTY alone is not reported as a failed transaction fop */
+            afr_transaction_fop_failed(frame, this, child_index);
+        if (fd_ctx)
+            fd_ctx->opened_on[child_index] = AFR_FD_NOT_OPENED;
+    }
+
+    return;
 }
 
-
-int
-afr_create_wind (call_frame_t *frame, xlator_t *this)
+static int
+__afr_dir_write_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+                    int op_ret, int op_errno, struct iatt *buf,
+                    struct iatt *preparent, struct iatt *postparent,
+                    struct iatt *preparent2, struct iatt *postparent2,
+                    dict_t *xdata) /* per-child reply handler; cookie carries the child index; always returns 0 */
 {
-	afr_local_t *local = NULL;
-	afr_private_t *priv = NULL;
-
-	int call_count = -1;
-	int i = 0;
-
-	local = frame->local;
-	priv = this->private;
-
-	call_count = afr_up_children_count (priv->child_count, local->child_up);
-
-	if (call_count == 0) {
-		local->transaction.resume (frame, this);
-		return 0;
-	}
-
-	local->call_count = call_count;
-
-	for (i = 0; i < priv->child_count; i++) {				
-		if (local->child_up[i]) {
-			STACK_WIND_COOKIE (frame, afr_create_wind_cbk,
-					   (void *) (long) i,
-					   priv->children[i], 
-					   priv->children[i]->fops->create,
-					   &local->loc, 
-					   local->cont.create.flags, 
-					   local->cont.create.mode, 
-					   local->cont.create.fd);
-			if (!--call_count)
-				break;
-		}
-	}
-	
-	return 0;
+    afr_local_t *local = NULL;
+    int child_index = (long)cookie;
+    int call_count = -1;
+    afr_private_t *priv = NULL;
+
+    priv = this->private;
+    local = frame->local;
+
+    LOCK(&frame->lock);
+    {
+        __afr_dir_write_fill(frame, this, child_index, op_ret, op_errno, buf,
+                             preparent, postparent, preparent2, postparent2,
+                             xdata);
+        call_count = --local->call_count; /* replies still outstanding, read under the frame lock */
+    }
+    UNLOCK(&frame->lock);
+
+    if (call_count == 0) { /* last reply: aggregate, then continue the transaction */
+        __afr_dir_write_finalize(frame, this);
+
+        if (afr_txn_nothing_failed(frame, this)) { /* unwind from here only when the helper reports no failure */
+            /*if it did pre-op, it will do post-op changing ctime*/
+            if (priv->consistent_metadata && afr_needs_changelog_update(local))
+                afr_zero_fill_stat(local);
+            local->transaction.unwind(frame, this);
+        }
+
+        afr_mark_entry_pending_changelog(frame, this); /* called on the last reply regardless of the check above */
+
+        afr_transaction_resume(frame, this);
+    }
+
+    return 0;
 }
 
-
 int
-afr_create_done (call_frame_t *frame, xlator_t *this)
+afr_mark_new_entry_changelog_cbk(call_frame_t *frame, void *cookie,
+                                 xlator_t *this, int op_ret, int op_errno,
+                                 dict_t *xattr, dict_t *xdata)
 {
-	afr_local_t * local = NULL;
+    int call_count = 0;
 
-	local = frame->local;
+    call_count = afr_frame_return(frame);
 
-	local->transaction.unwind (frame, this);
+    if (call_count == 0)
+        AFR_STACK_DESTROY(frame);
 
-	AFR_STACK_DESTROY (frame);
-
-	return 0;
+    return 0;
 }
 
-
-int
-afr_create (call_frame_t *frame, xlator_t *this,
-	    loc_t *loc, int32_t flags, mode_t mode, fd_t *fd)
+void
+afr_mark_new_entry_changelog(call_frame_t *frame, xlator_t *this)
 {
-	afr_private_t * priv  = NULL;
-	afr_local_t   * local = NULL;
-	call_frame_t  * transaction_frame = NULL;
-
-	int ret = -1;
-
-	int op_ret   = -1;
-	int op_errno = 0;
-
-	VALIDATE_OR_GOTO (frame, out);
-	VALIDATE_OR_GOTO (this, out);
-	VALIDATE_OR_GOTO (this->private, out);
-
-	priv = this->private;
-
-	transaction_frame = copy_frame (frame);
-	if (!transaction_frame) {
-		gf_log (this->name, GF_LOG_ERROR,
-			"out of memory :(");
-		goto out;
-	}
-
-	ALLOC_OR_GOTO (local, afr_local_t, out);
+    call_frame_t *new_frame = NULL;
+    afr_local_t *local = NULL;
+    afr_local_t *new_local = NULL;
+    afr_private_t *priv = NULL;
+    dict_t *xattr = NULL;
+    int32_t **changelog = NULL;
+    int i = 0;
+    int op_errno = ENOMEM;
+    unsigned char *pending = NULL;
+    int call_count = 0;
+
+    local = frame->local;
+    priv = this->private;
+
+    new_frame = copy_frame(frame);
+    if (!new_frame)
+        goto out;
+
+    new_local = AFR_FRAME_INIT(new_frame, op_errno);
+    if (!new_local)
+        goto out;
+
+    xattr = dict_new();
+    if (!xattr)
+        goto out;
+
+    pending = alloca0(priv->child_count);
+
+    for (i = 0; i < priv->child_count; i++) {
+        if (local->transaction.pre_op[i] &&
+            !local->transaction.failed_subvols[i]) {
+            call_count++;
+            continue;
+        }
+        pending[i] = 1;
+    }
+
+    changelog = afr_mark_pending_changelog(priv, pending, xattr,
+                                           local->cont.dir_fop.buf.ia_type);
+    if (!changelog)
+        goto out;
+
+    new_local->pending = changelog;
+    gf_uuid_copy(new_local->loc.gfid, local->cont.dir_fop.buf.ia_gfid);
+    new_local->loc.inode = inode_ref(local->inode);
+
+    new_local->call_count = call_count;
+
+    for (i = 0; i < priv->child_count; i++) {
+        if (pending[i])
+            continue;
+
+        STACK_WIND_COOKIE(new_frame, afr_mark_new_entry_changelog_cbk,
+                          (void *)(long)i, priv->children[i],
+                          priv->children[i]->fops->xattrop, &new_local->loc,
+                          GF_XATTROP_ADD_ARRAY, xattr, NULL);
+        if (!--call_count)
+            break;
+    }
+
+    new_frame = NULL;
+out:
+    if (new_frame)
+        AFR_STACK_DESTROY(new_frame);
+    if (xattr)
+        dict_unref(xattr);
+    return;
+}
 
-	ret = AFR_LOCAL_INIT (local, priv);
-	if (ret < 0) {
-		op_errno = -ret;
-		goto out;
-	}
+void
+afr_mark_entry_pending_changelog(call_frame_t *frame, xlator_t *this)
+{
+    afr_local_t *local = NULL;
+    afr_private_t *priv = NULL;
+    int pre_op_count = 0;
+    int failed_count = 0;
+    unsigned char *success_replies = NULL;
 
-	transaction_frame->local = local;
+    local = frame->local;
+    priv = this->private;
 
-	loc_copy (&local->loc, loc);
+    if (local->op_ret < 0)
+        return;
 
-	local->cont.create.flags = flags;
-	local->cont.create.mode  = mode;
-	local->cont.create.fd    = fd_ref (fd);
+    if (local->op != GF_FOP_CREATE && local->op != GF_FOP_MKNOD &&
+        local->op != GF_FOP_MKDIR)
+        return;
 
-	local->transaction.fop    = afr_create_wind;
-	local->transaction.done   = afr_create_done;
-	local->transaction.unwind = afr_create_unwind;
+    pre_op_count = AFR_COUNT(local->transaction.pre_op, priv->child_count);
+    failed_count = AFR_COUNT(local->transaction.failed_subvols,
+                             priv->child_count);
 
-	afr_build_parent_loc (&local->transaction.parent_loc, loc);
+    /* FOP succeeded on all bricks. */
+    if (pre_op_count == priv->child_count && !failed_count)
+        return;
 
-	local->transaction.main_frame = frame;
-	local->transaction.basename = AFR_BASENAME (loc->path);
-	local->transaction.pending  = AFR_ENTRY_PENDING;
+    /* FOP did not succeed on quorum no. of bricks. */
+    success_replies = alloca0(priv->child_count);
+    afr_fill_success_replies(local, priv, success_replies);
+    if (!afr_has_quorum(success_replies, this, NULL))
+        return;
 
-	afr_transaction (transaction_frame, this, AFR_ENTRY_TRANSACTION);
+    if (priv->thin_arbiter_count) {
+        /*Mark new entry using ta file*/
+        local->is_new_entry = _gf_true;
+        return;
+    }
 
-	op_ret = 0;
-out:
-	if (op_ret == -1) {
-		if (transaction_frame)
-			AFR_STACK_DESTROY (transaction_frame);
-		AFR_STACK_UNWIND (frame, op_ret, op_errno, NULL);
-	}
+    afr_mark_new_entry_changelog(frame, this);
 
-	return 0;
+    return;
 }
 
-/* }}} */
-
-/* {{{ mknod */
+/* {{{ create */
 
 int
-afr_mknod_unwind (call_frame_t *frame, xlator_t *this)
+afr_create_unwind(call_frame_t *frame, xlator_t *this)
 {
-	call_frame_t *main_frame = NULL;
-	afr_local_t  *local = NULL;
-
-	local = frame->local;
-
-	LOCK (&frame->lock);
-	{
-		if (local->transaction.main_frame) {
-			main_frame = local->transaction.main_frame;
-		}
-		local->transaction.main_frame = NULL;
-	}
-	UNLOCK (&frame->lock);
-
-	if (main_frame)
-		AFR_STACK_UNWIND (main_frame, local->op_ret, local->op_errno,
-				  local->cont.mknod.inode,
-				  &local->cont.mknod.buf);
-	return 0;
-}
+    call_frame_t *main_frame = NULL;
+    afr_local_t *local = NULL;
 
+    local = frame->local;
 
-int
-afr_mknod_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this, 
-		    int32_t op_ret, int32_t op_errno, 
-		    inode_t *inode, struct stat *buf)
-{
-	afr_local_t *   local = NULL;
-	afr_private_t * priv  = NULL;
-
-	int call_count = -1;
-	int child_index = -1;
-
-	local = frame->local;
-	priv = this->private;
-
-	child_index = (long) cookie;
-
-	LOCK (&frame->lock);
-	{
-		if (child_went_down (op_ret, op_errno))
-			afr_transaction_child_died (frame, this, child_index);
-		
-		if (op_ret != -1) {
-			local->op_ret = op_ret;
-
-			if ((local->success_count == 0)
-			    || (child_index == priv->read_child)) {	
-				local->cont.mknod.buf   = *buf;
-				local->cont.mknod.buf.st_ino = 
-					afr_itransform (buf->st_ino,
-							priv->child_count,
-							child_index);
-			}
-			local->cont.mknod.inode = inode;
-
-			local->success_count++;
-		}
-
-		local->op_errno = op_errno;
-	}
-	UNLOCK (&frame->lock);
-
-	call_count = afr_frame_return (frame);
-
-	if (call_count == 0) {
-		local->transaction.unwind (frame, this);
-
-		local->transaction.resume (frame, this);
-	}
-	
-	return 0;
-}
+    main_frame = afr_transaction_detach_fop_frame(frame);
 
+    if (!main_frame)
+        return 0;
 
-int32_t
-afr_mknod_wind (call_frame_t *frame, xlator_t *this)
-{
-	afr_local_t *local = NULL;
-	afr_private_t *priv = NULL;
-
-	int call_count = -1;
-	int i = 0;
-
-	local = frame->local;
-	priv  = this->private;
-
-	call_count = afr_up_children_count (priv->child_count, local->child_up);
-
-	if (call_count == 0) {
-		local->transaction.resume (frame, this);
-		return 0;
-	}
-
-	local->call_count = call_count;
-
-	for (i = 0; i < priv->child_count; i++) {				
-		if (local->child_up[i]) {
-			STACK_WIND_COOKIE (frame, afr_mknod_wind_cbk, (void *) (long) i,
-					   priv->children[i], 
-					   priv->children[i]->fops->mknod,
-					   &local->loc, local->cont.mknod.mode,
-					   local->cont.mknod.dev);
-			if (!--call_count)
-				break;
-		}
-	}
-	
-	return 0;
+    AFR_STACK_UNWIND(create, main_frame, local->op_ret, local->op_errno,
+                     local->cont.create.fd, local->inode,
+                     &local->cont.dir_fop.buf, &local->cont.dir_fop.preparent,
+                     &local->cont.dir_fop.postparent, local->xdata_rsp);
+    return 0;
 }
 
-
 int
-afr_mknod_done (call_frame_t *frame, xlator_t *this)
+afr_create_wind_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+                    int32_t op_ret, int32_t op_errno, fd_t *fd, inode_t *inode,
+                    struct iatt *buf, struct iatt *preparent,
+                    struct iatt *postparent, dict_t *xdata)
 {
-	afr_local_t * local = NULL;
-
-	local = frame->local;
-
-	local->transaction.unwind (frame, this);
-	AFR_STACK_DESTROY (frame);
-
-	return 0;
+    return __afr_dir_write_cbk(frame, cookie, this, op_ret, op_errno, buf,
+                               preparent, postparent, NULL, NULL, xdata);
 }
 
-
 int
-afr_mknod (call_frame_t *frame, xlator_t *this,
-	   loc_t *loc, mode_t mode, dev_t dev)
+afr_create_wind(call_frame_t *frame, xlator_t *this, int subvol)
 {
-	afr_private_t * priv  = NULL;
-	afr_local_t   * local = NULL;
-	call_frame_t  * transaction_frame = NULL;
-
-	int ret = -1;
-
-	int op_ret   = -1;
-	int op_errno = 0;
-
-	VALIDATE_OR_GOTO (frame, out);
-	VALIDATE_OR_GOTO (this, out);
-	VALIDATE_OR_GOTO (this->private, out);
-
-	priv = this->private;
-
-	transaction_frame = copy_frame (frame);
-	if (!transaction_frame) {
-		gf_log (this->name, GF_LOG_ERROR,
-			"out of memory :(");
-		goto out;
-	}
-
-	ALLOC_OR_GOTO (local, afr_local_t, out);
-
-	ret = AFR_LOCAL_INIT (local, priv);
-	if (ret < 0) {
-		op_errno = -ret;
-		goto out;
-	}
-
-	transaction_frame->local = local;
-
-	loc_copy (&local->loc, loc);
-
-	local->cont.mknod.mode  = mode;
-	local->cont.mknod.dev   = dev;
-
-	local->transaction.fop    = afr_mknod_wind;
-	local->transaction.done   = afr_mknod_done;
-	local->transaction.unwind = afr_mknod_unwind;
-
-	afr_build_parent_loc (&local->transaction.parent_loc, loc);
-
-	local->transaction.main_frame = frame;
-	local->transaction.basename = AFR_BASENAME (loc->path);
-	local->transaction.pending  = AFR_ENTRY_PENDING;
-
-	afr_transaction (transaction_frame, this, AFR_ENTRY_TRANSACTION);
+    afr_local_t *local = NULL;
+    afr_private_t *priv = NULL;
+
+    local = frame->local;
+    priv = this->private;
+
+    STACK_WIND_COOKIE(frame, afr_create_wind_cbk, (void *)(long)subvol,
+                      priv->children[subvol],
+                      priv->children[subvol]->fops->create, &local->loc,
+                      local->cont.create.flags, local->cont.create.mode,
+                      local->umask, local->cont.create.fd, local->xdata_req);
+    return 0;
+}
 
-	op_ret = 0;
+int
+afr_create(call_frame_t *frame, xlator_t *this, loc_t *loc, int32_t flags,
+           mode_t mode, mode_t umask, fd_t *fd, dict_t *xdata)
+{
+    afr_local_t *local = NULL;
+    call_frame_t *transaction_frame = NULL;
+    int ret = -1;
+    int op_errno = ENOMEM;
+
+    transaction_frame = copy_frame(frame);
+    if (!transaction_frame)
+        goto out;
+
+    local = AFR_FRAME_INIT(transaction_frame, op_errno);
+    if (!local)
+        goto out;
+
+    loc_copy(&local->loc, loc);
+
+    local->fd_ctx = afr_fd_ctx_get(fd, this);
+    if (!local->fd_ctx)
+        goto out;
+
+    local->inode = inode_ref(loc->inode);
+    local->parent = inode_ref(loc->parent);
+
+    local->op = GF_FOP_CREATE;
+    local->cont.create.flags = flags;
+    local->fd_ctx->flags = flags;
+    local->cont.create.mode = mode;
+    local->cont.create.fd = fd_ref(fd);
+    local->umask = umask;
+
+    if (xdata)
+        local->xdata_req = dict_copy_with_ref(xdata, NULL);
+    else
+        local->xdata_req = dict_new();
+
+    if (!local->xdata_req)
+        goto out;
+
+    local->transaction.wind = afr_create_wind;
+    local->transaction.unwind = afr_create_unwind;
+
+    ret = afr_build_parent_loc(&local->transaction.parent_loc, loc, &op_errno);
+    if (ret)
+        goto out;
+
+    local->transaction.main_frame = frame;
+    local->transaction.basename = AFR_BASENAME(loc->path);
+    ret = afr_transaction(transaction_frame, this, AFR_ENTRY_TRANSACTION);
+    if (ret < 0) {
+        op_errno = -ret;
+        goto out;
+    }
+
+    return 0;
 out:
-	if (op_ret == -1) {
-		if (transaction_frame)
-			AFR_STACK_DESTROY (transaction_frame);
-		AFR_STACK_UNWIND (frame, op_ret, op_errno, NULL);
-	}
+    if (transaction_frame)
+        AFR_STACK_DESTROY(transaction_frame);
 
-	return 0;
+    AFR_STACK_UNWIND(create, frame, -1, op_errno, NULL, NULL, NULL, NULL, NULL,
+                     NULL);
+    return 0;
 }
 
 /* }}} */
 
-/* {{{ mkdir */
-
-
-int
-afr_mkdir_unwind (call_frame_t *frame, xlator_t *this)
-{
-	call_frame_t *main_frame = NULL;
-	afr_local_t  *local = NULL;
-
-	local = frame->local;
-
-	LOCK (&frame->lock);
-	{
-		if (local->transaction.main_frame) {
-			main_frame = local->transaction.main_frame;
-		}
-		local->transaction.main_frame = NULL;
-	}
-	UNLOCK (&frame->lock);
-
-	if (main_frame)
-		AFR_STACK_UNWIND (main_frame, local->op_ret, local->op_errno,
-				  local->cont.mkdir.inode,
-				  &local->cont.mkdir.buf);
-	return 0;
-}
-
+/* {{{ mknod */
 
 int
-afr_mkdir_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this, 
-		    int32_t op_ret, int32_t op_errno, 
-		    inode_t *inode, struct stat *buf)
+afr_mknod_unwind(call_frame_t *frame, xlator_t *this)
 {
-	afr_local_t *   local = NULL;
-	afr_private_t * priv  = NULL;
-
-	int call_count = -1;
-	int child_index = -1;
-
-	local = frame->local;
-	priv = this->private;
-
-	child_index = (long) cookie;
-
-	LOCK (&frame->lock);
-	{
-		if (child_went_down (op_ret, op_errno))
-			afr_transaction_child_died (frame, this, child_index);
-
-		if (op_ret != -1) {
-			local->op_ret           = op_ret;
-
-			if ((local->success_count == 0)
-			    || (child_index == priv->read_child)) {
-				local->cont.mkdir.buf   = *buf;
-				local->cont.mkdir.buf.st_ino = 
-					afr_itransform (buf->st_ino, priv->child_count,
-							child_index);
-			}
-			local->cont.mkdir.inode = inode;
-
-			local->success_count++;
-		}
+    call_frame_t *main_frame = NULL;
+    afr_local_t *local = NULL;
 
-		local->op_errno         = op_errno;
-	}
-	UNLOCK (&frame->lock);
+    local = frame->local;
 
-	call_count = afr_frame_return (frame);
+    main_frame = afr_transaction_detach_fop_frame(frame);
+    if (!main_frame)
+        return 0;
 
-	if (call_count == 0) {
-		local->transaction.unwind (frame, this);
-
-		local->transaction.resume (frame, this);
-	}
-	
-	return 0;
+    AFR_STACK_UNWIND(mknod, main_frame, local->op_ret, local->op_errno,
+                     local->inode, &local->cont.dir_fop.buf,
+                     &local->cont.dir_fop.preparent,
+                     &local->cont.dir_fop.postparent, local->xdata_rsp);
+    return 0;
 }
 
-
 int
-afr_mkdir_wind (call_frame_t *frame, xlator_t *this)
+afr_mknod_wind_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+                   int32_t op_ret, int32_t op_errno, inode_t *inode,
+                   struct iatt *buf, struct iatt *preparent,
+                   struct iatt *postparent, dict_t *xdata)
 {
-	afr_local_t *local = NULL;
-	afr_private_t *priv = NULL;
-	
-	int call_count = -1;
-	int i = 0;
-
-	local = frame->local;
-	priv  = this->private;
-
-	call_count = afr_up_children_count (priv->child_count, local->child_up);
-
-	if (call_count == 0) {
-		local->transaction.resume (frame, this);
-		return 0;
-	}
-
-	local->call_count = call_count;
-
-	for (i = 0; i < priv->child_count; i++) {				
-		if (local->child_up[i]) {
-			STACK_WIND_COOKIE (frame, afr_mkdir_wind_cbk,
-					   (void *) (long) i,	
-					   priv->children[i], 
-					   priv->children[i]->fops->mkdir,
-					   &local->loc, local->cont.mkdir.mode);
-			if (!--call_count)
-				break;
-		}
-	}
-	
-	return 0;
+    return __afr_dir_write_cbk(frame, cookie, this, op_ret, op_errno, buf,
+                               preparent, postparent, NULL, NULL, xdata);
 }
 
-
 int
-afr_mkdir_done (call_frame_t *frame, xlator_t *this)
+afr_mknod_wind(call_frame_t *frame, xlator_t *this, int subvol)
 {
-	afr_local_t * local = NULL;
-
-	local = frame->local;
-
-	local->transaction.unwind (frame, this);
-
-	AFR_STACK_DESTROY (frame);
-
-	return 0;
+    afr_local_t *local = NULL;
+    afr_private_t *priv = NULL;
+
+    local = frame->local;
+    priv = this->private;
+
+    STACK_WIND_COOKIE(frame, afr_mknod_wind_cbk, (void *)(long)subvol,
+                      priv->children[subvol],
+                      priv->children[subvol]->fops->mknod, &local->loc,
+                      local->cont.mknod.mode, local->cont.mknod.dev,
+                      local->umask, local->xdata_req);
+    return 0;
 }
 
-
 int
-afr_mkdir (call_frame_t *frame, xlator_t *this,
-	   loc_t *loc, mode_t mode)
+afr_mknod(call_frame_t *frame, xlator_t *this, loc_t *loc, mode_t mode,
+          dev_t dev, mode_t umask, dict_t *xdata)
 {
-	afr_private_t * priv  = NULL;
-	afr_local_t   * local = NULL;
-	call_frame_t  * transaction_frame = NULL;
-
-	int ret = -1;
-
-	int op_ret   = -1;
-	int op_errno = 0;
-
-	VALIDATE_OR_GOTO (frame, out);
-	VALIDATE_OR_GOTO (this, out);
-	VALIDATE_OR_GOTO (this->private, out);
-
-	priv = this->private;
-
-	transaction_frame = copy_frame (frame);
-	if (!transaction_frame) {
-		gf_log (this->name, GF_LOG_ERROR,
-			"out of memory :(");
-		goto out;
-	}
-
-	ALLOC_OR_GOTO (local, afr_local_t, out);
-
-	ret = AFR_LOCAL_INIT (local, priv);
-	if (ret < 0) {
-		op_errno = -ret;
-		goto out;
-	}
-
-	transaction_frame->local = local;
-
-	loc_copy (&local->loc, loc);
-
-	local->cont.mkdir.mode  = mode;
-
-	local->transaction.fop    = afr_mkdir_wind;
-	local->transaction.done   = afr_mkdir_done;
-	local->transaction.unwind = afr_mkdir_unwind;
-
-	afr_build_parent_loc (&local->transaction.parent_loc, loc);
-
-	local->transaction.main_frame = frame;
-	local->transaction.basename = AFR_BASENAME (loc->path);
-	local->transaction.pending  = AFR_ENTRY_PENDING;
-
-	afr_transaction (transaction_frame, this, AFR_ENTRY_TRANSACTION);
-
-	op_ret = 0;
+    afr_local_t *local = NULL;
+    call_frame_t *transaction_frame = NULL;
+    int ret = -1;
+    int op_errno = ENOMEM;
+
+    transaction_frame = copy_frame(frame);
+    if (!transaction_frame)
+        goto out;
+
+    local = AFR_FRAME_INIT(transaction_frame, op_errno);
+    if (!local)
+        goto out;
+
+    loc_copy(&local->loc, loc);
+    local->inode = inode_ref(loc->inode);
+    local->parent = inode_ref(loc->parent);
+
+    local->op = GF_FOP_MKNOD;
+    local->cont.mknod.mode = mode;
+    local->cont.mknod.dev = dev;
+    local->umask = umask;
+
+    if (xdata)
+        local->xdata_req = dict_copy_with_ref(xdata, NULL);
+    else
+        local->xdata_req = dict_new();
+
+    if (!local->xdata_req)
+        goto out;
+
+    local->transaction.wind = afr_mknod_wind;
+    local->transaction.unwind = afr_mknod_unwind;
+
+    ret = afr_build_parent_loc(&local->transaction.parent_loc, loc, &op_errno);
+    if (ret)
+        goto out;
+
+    local->transaction.main_frame = frame;
+    local->transaction.basename = AFR_BASENAME(loc->path);
+    ret = afr_transaction(transaction_frame, this, AFR_ENTRY_TRANSACTION);
+    if (ret < 0) {
+        op_errno = -ret;
+        goto out;
+    }
+
+    return 0;
 out:
-	if (op_ret == -1) {
-		if (transaction_frame)
-			AFR_STACK_DESTROY (transaction_frame);
+    if (transaction_frame)
+        AFR_STACK_DESTROY(transaction_frame);
 
-		AFR_STACK_UNWIND (frame, op_ret, op_errno, NULL);
-	}
-
-	return 0;
+    AFR_STACK_UNWIND(mknod, frame, -1, op_errno, NULL, NULL, NULL, NULL, NULL);
+    return 0;
 }
 
 /* }}} */
 
-/* {{{ link */
-
-
-int
-afr_link_unwind (call_frame_t *frame, xlator_t *this)
-{
-	call_frame_t *main_frame = NULL;
-	afr_local_t  *local = NULL;
-
-	local = frame->local;
-
-	LOCK (&frame->lock);
-	{
-		if (local->transaction.main_frame) {
-			main_frame = local->transaction.main_frame;
-		}
-		local->transaction.main_frame = NULL;
-	}
-	UNLOCK (&frame->lock);
-
-	if (main_frame) {
-		local->cont.link.buf.st_ino = local->cont.link.ino;
-
-		AFR_STACK_UNWIND (main_frame, local->op_ret, local->op_errno, 
-				  local->cont.link.inode,
-				  &local->cont.link.buf);
-	}
-
-	return 0;
-}
-
+/* {{{ mkdir */
 
 int
-afr_link_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this, 
-		   int32_t op_ret, int32_t op_errno, inode_t *inode,
-		   struct stat *buf)
+afr_mkdir_unwind(call_frame_t *frame, xlator_t *this)
 {
-	afr_local_t *   local = NULL;
-	afr_private_t * priv  = NULL;
-
-	int call_count = -1;
-	int child_index = -1;
-
-	local = frame->local;
-	priv = this->private;
-
-	child_index = (long) cookie;
-
-	LOCK (&frame->lock);
-	{
-		if (child_went_down (op_ret, op_errno))
-			afr_transaction_child_died (frame, this, child_index);
-
-		if (op_ret != -1) {
-			local->op_ret   = op_ret;
-
-			if ((local->success_count == 0)
-			    || (child_index == priv->read_child)) {
-				local->cont.link.buf        = *buf;
-				local->cont.link.buf.st_ino = 
-					afr_itransform (buf->st_ino, priv->child_count,
-							child_index);
-			}
-			local->cont.link.inode    = inode;
-
-			local->success_count++;
-		}
-
-		local->op_errno = op_errno;		
-	}
-	UNLOCK (&frame->lock);
+    call_frame_t *main_frame = NULL;
+    afr_local_t *local = NULL;
 
-	call_count = afr_frame_return (frame);
+    local = frame->local;
 
-	if (call_count == 0) {
-		local->transaction.unwind (frame, this);
+    main_frame = afr_transaction_detach_fop_frame(frame);
+    if (!main_frame)
+        return 0;
 
-		local->transaction.resume (frame, this);
-	}
-	
-	return 0;
+    AFR_STACK_UNWIND(mkdir, main_frame, local->op_ret, local->op_errno,
+                     local->inode, &local->cont.dir_fop.buf,
+                     &local->cont.dir_fop.preparent,
+                     &local->cont.dir_fop.postparent, local->xdata_rsp);
+    return 0;
 }
 
-
 int
-afr_link_wind (call_frame_t *frame, xlator_t *this)
+afr_mkdir_wind_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+                   int32_t op_ret, int32_t op_errno, inode_t *inode,
+                   struct iatt *buf, struct iatt *preparent,
+                   struct iatt *postparent, dict_t *xdata)
 {
-	afr_local_t *local = NULL;
-	afr_private_t *priv = NULL;
-
-	int call_count = -1;
-	int i = 0;
-
-	local = frame->local;
-	priv  = this->private;
-
-	call_count = afr_up_children_count (priv->child_count, local->child_up);
-
-	if (call_count == 0) {
-		local->transaction.resume (frame, this);
-		return 0;
-	}
-
-	local->call_count = call_count;
-
-	for (i = 0; i < priv->child_count; i++) {				
-		if (local->child_up[i]) {
-			STACK_WIND_COOKIE (frame, afr_link_wind_cbk, (void *) (long) i,
-					   priv->children[i], 
-					   priv->children[i]->fops->link,
-					   &local->loc,
-					   &local->newloc);
-			
-			if (!--call_count)
-				break;
-		}
-	}
-	
-	return 0;
+    return __afr_dir_write_cbk(frame, cookie, this, op_ret, op_errno, buf,
+                               preparent, postparent, NULL, NULL, xdata);
 }
 
-
 int
-afr_link_done (call_frame_t *frame, xlator_t *this)
+afr_mkdir_wind(call_frame_t *frame, xlator_t *this, int subvol)
 {
-	afr_local_t * local = frame->local;
-
-	local->transaction.unwind (frame, this);
+    afr_local_t *local = NULL;
+    afr_private_t *priv = NULL;
 
-	AFR_STACK_DESTROY (frame);
+    local = frame->local;
+    priv = this->private;
 
-	return 0;
+    STACK_WIND_COOKIE(frame, afr_mkdir_wind_cbk, (void *)(long)subvol,
+                      priv->children[subvol],
+                      priv->children[subvol]->fops->mkdir, &local->loc,
+                      local->cont.mkdir.mode, local->umask, local->xdata_req);
+    return 0;
 }
 
-
 int
-afr_link (call_frame_t *frame, xlator_t *this,
-	  loc_t *oldloc, loc_t *newloc)
+afr_mkdir(call_frame_t *frame, xlator_t *this, loc_t *loc, mode_t mode,
+          mode_t umask, dict_t *xdata)
 {
-	afr_private_t * priv  = NULL;
-	afr_local_t   * local = NULL;
-	call_frame_t  * transaction_frame = NULL;
-
-	int ret = -1;
-
-	int op_ret   = -1;
-	int op_errno = 0;
-
-	VALIDATE_OR_GOTO (frame, out);
-	VALIDATE_OR_GOTO (this, out);
-	VALIDATE_OR_GOTO (this->private, out);
-
-	priv = this->private;
-
-	transaction_frame = copy_frame (frame);
-	if (!transaction_frame) {
-		gf_log (this->name, GF_LOG_ERROR,
-			"out of memory :(");
-		goto out;
-	}
-
-	ALLOC_OR_GOTO (local, afr_local_t, out);
-
-	ret = AFR_LOCAL_INIT (local, priv);
-	if (ret < 0) {
-		op_errno = -ret;
-		goto out;
-	}
-
-	transaction_frame->local = local;
-
-	loc_copy (&local->loc,    oldloc);
-	loc_copy (&local->newloc, newloc);
-
-	local->cont.link.ino = oldloc->inode->ino;
-
-	local->transaction.fop    = afr_link_wind;
-	local->transaction.done   = afr_link_done;
-	local->transaction.unwind = afr_link_unwind;
-
-	afr_build_parent_loc (&local->transaction.parent_loc, oldloc);
-
-	local->transaction.main_frame   = frame;
-	local->transaction.basename     = AFR_BASENAME (oldloc->path);
-	local->transaction.new_basename = AFR_BASENAME (newloc->path);
-	local->transaction.pending      = AFR_ENTRY_PENDING;
-
-	afr_transaction (transaction_frame, this, AFR_ENTRY_TRANSACTION);
-
-	op_ret = 0;
+    afr_local_t *local = NULL;
+    call_frame_t *transaction_frame = NULL;
+    int ret = -1;
+    int op_errno = ENOMEM;
+
+    transaction_frame = copy_frame(frame);
+    if (!transaction_frame)
+        goto out;
+
+    local = AFR_FRAME_INIT(transaction_frame, op_errno);
+    if (!local)
+        goto out;
+
+    loc_copy(&local->loc, loc);
+    local->inode = inode_ref(loc->inode);
+    local->parent = inode_ref(loc->parent);
+
+    local->cont.mkdir.mode = mode;
+    local->umask = umask;
+
+    if (!xdata || !dict_get_sizen(xdata, "gfid-req")) {
+        op_errno = EPERM;
+        gf_msg_callingfn(this->name, GF_LOG_WARNING, op_errno,
+                         AFR_MSG_GFID_NULL,
+                         "mkdir: %s is received "
+                         "without gfid-req %p",
+                         loc->path, xdata);
+        goto out;
+    }
+
+    local->xdata_req = dict_copy_with_ref(xdata, NULL);
+    if (!local->xdata_req) {
+        op_errno = ENOMEM;
+        goto out;
+    }
+
+    local->op = GF_FOP_MKDIR;
+    local->transaction.wind = afr_mkdir_wind;
+    local->transaction.unwind = afr_mkdir_unwind;
+
+    ret = afr_build_parent_loc(&local->transaction.parent_loc, loc, &op_errno);
+    if (ret)
+        goto out;
+
+    local->transaction.main_frame = frame;
+    local->transaction.basename = AFR_BASENAME(loc->path);
+    ret = afr_transaction(transaction_frame, this, AFR_ENTRY_TRANSACTION);
+    if (ret < 0) {
+        op_errno = -ret;
+        goto out;
+    }
+
+    return 0;
 out:
-	if (op_ret == -1) {
-		if (transaction_frame)
-			AFR_STACK_DESTROY (transaction_frame);
-		AFR_STACK_UNWIND (frame, op_ret, op_errno);
-	}
+    if (transaction_frame)
+        AFR_STACK_DESTROY(transaction_frame);
 
-	return 0;
+    AFR_STACK_UNWIND(mkdir, frame, -1, op_errno, NULL, NULL, NULL, NULL, NULL);
+    return 0;
 }
 
 /* }}} */
 
-/* {{{ symlink */
-
+/* {{{ link */
 
 int
-afr_symlink_unwind (call_frame_t *frame, xlator_t *this)
+afr_link_unwind(call_frame_t *frame, xlator_t *this)
 {
-	call_frame_t *main_frame = NULL;
-	afr_local_t  *local = NULL;
-
-	local = frame->local;
-
-	LOCK (&frame->lock);
-	{
-		if (local->transaction.main_frame) {
-			main_frame = local->transaction.main_frame;
-		}
-		local->transaction.main_frame = NULL;
-	}
-	UNLOCK (&frame->lock);
-
-	if (main_frame)
-		AFR_STACK_UNWIND (main_frame, local->op_ret, local->op_errno,
-				  local->cont.symlink.inode,
-				  &local->cont.symlink.buf);
-	return 0;
-}
+    call_frame_t *main_frame = NULL;
+    afr_local_t *local = NULL;
 
+    local = frame->local;
 
-int
-afr_symlink_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this, 
-		      int32_t op_ret, int32_t op_errno, inode_t *inode,
-		      struct stat *buf)
-{
-	afr_local_t *   local = NULL;
-	afr_private_t * priv  = NULL;
-
-	int call_count = -1;
-	int child_index = -1;
-
-	local = frame->local;
-	priv = this->private;
-
-	child_index = (long) cookie;
-
-	LOCK (&frame->lock);
-	{
-		if (child_went_down (op_ret, op_errno))
-			afr_transaction_child_died (frame, this, child_index);
-		
-		if (op_ret != -1) {
-			local->op_ret   = op_ret;
-
-			if ((local->success_count == 0)
-			    || (child_index == priv->read_child)) {
-				local->cont.symlink.buf        = *buf;
-				local->cont.symlink.buf.st_ino = 
-					afr_itransform (buf->st_ino, priv->child_count,
-							child_index);
-			}
-			local->cont.symlink.inode    = inode;
-
-			local->success_count++;
-		}
-
-		local->op_errno = op_errno;
-	}
-	UNLOCK (&frame->lock);
-
-	call_count = afr_frame_return (frame);
-
-	if (call_count == 0) {
-		local->transaction.unwind (frame, this);
-
-		local->transaction.resume (frame, this);
-	}
-	
-	return 0;
-}
+    main_frame = afr_transaction_detach_fop_frame(frame);
+    if (!main_frame)
+        return 0;
 
+    AFR_STACK_UNWIND(link, main_frame, local->op_ret, local->op_errno,
+                     local->inode, &local->cont.dir_fop.buf,
+                     &local->cont.dir_fop.preparent,
+                     &local->cont.dir_fop.postparent, local->xdata_rsp);
+    return 0;
+}
 
 int
-afr_symlink_wind (call_frame_t *frame, xlator_t *this)
+afr_link_wind_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+                  int32_t op_ret, int32_t op_errno, inode_t *inode,
+                  struct iatt *buf, struct iatt *preparent,
+                  struct iatt *postparent, dict_t *xdata)
 {
-	afr_local_t *local = NULL;
-	afr_private_t *priv = NULL;
-
-	int call_count = -1;
-	int i = 0;
-
-	local = frame->local;
-	priv = this->private;
-
-	call_count = afr_up_children_count (priv->child_count, local->child_up);
-
-	if (call_count == 0) {
-		local->transaction.resume (frame, this);
-		return 0;
-	}
-
-	local->call_count = call_count;
-
-	for (i = 0; i < priv->child_count; i++) {				
-		if (local->child_up[i]) {
-			STACK_WIND_COOKIE (frame, afr_symlink_wind_cbk,
-					   (void *) (long) i,	
-					   priv->children[i], 
-					   priv->children[i]->fops->symlink,
-					   local->cont.symlink.linkpath,
-					   &local->loc);
-
-			if (!--call_count)
-				break;
-
-		}
-	}
-	
-	return 0;
+    return __afr_dir_write_cbk(frame, cookie, this, op_ret, op_errno, buf,
+                               preparent, postparent, NULL, NULL, xdata);
 }
 
-
 int
-afr_symlink_done (call_frame_t *frame, xlator_t *this)
+afr_link_wind(call_frame_t *frame, xlator_t *this, int subvol)
 {
-	afr_local_t * local = frame->local;
+    afr_local_t *local = NULL;
+    afr_private_t *priv = NULL;
 
-	local->transaction.unwind (frame, this);
+    local = frame->local;
+    priv = this->private;
 
-	AFR_STACK_DESTROY (frame);
-	
-	return 0;
+    STACK_WIND_COOKIE(frame, afr_link_wind_cbk, (void *)(long)subvol,
+                      priv->children[subvol],
+                      priv->children[subvol]->fops->link, &local->loc,
+                      &local->newloc, local->xdata_req);
+    return 0;
 }
 
-
 int
-afr_symlink (call_frame_t *frame, xlator_t *this,
-	     const char *linkpath, loc_t *loc)
+afr_link(call_frame_t *frame, xlator_t *this, loc_t *oldloc, loc_t *newloc,
+         dict_t *xdata)
 {
-	afr_private_t * priv  = NULL;
-	afr_local_t   * local = NULL;
-	call_frame_t  * transaction_frame = NULL;
+    afr_local_t *local = NULL;
+    call_frame_t *transaction_frame = NULL;
+    int ret = -1;
+    int op_errno = ENOMEM;
 
-	int ret = -1;
+    transaction_frame = copy_frame(frame);
+    if (!transaction_frame)
+        goto out;
 
-	int op_ret   = -1;
-	int op_errno = 0;
+    local = AFR_FRAME_INIT(transaction_frame, op_errno);
+    if (!local)
+        goto out;
 
-	VALIDATE_OR_GOTO (frame, out);
-	VALIDATE_OR_GOTO (this, out);
-	VALIDATE_OR_GOTO (this->private, out);
+    loc_copy(&local->loc, oldloc);
+    loc_copy(&local->newloc, newloc);
 
-	priv = this->private;
+    local->inode = inode_ref(oldloc->inode);
+    local->parent = inode_ref(newloc->parent);
 
-	transaction_frame = copy_frame (frame);
-	if (!transaction_frame) {
-		gf_log (this->name, GF_LOG_ERROR,
-			"out of memory :(");
-		goto out;
-	}
+    if (xdata)
+        local->xdata_req = dict_copy_with_ref(xdata, NULL);
+    else
+        local->xdata_req = dict_new();
 
-	ALLOC_OR_GOTO (local, afr_local_t, out);
+    if (!local->xdata_req)
+        goto out;
 
-	ret = AFR_LOCAL_INIT (local, priv);
-	if (ret < 0) {
-		op_errno = -ret;
-		goto out;
-	}
+    local->op = GF_FOP_LINK;
 
-	transaction_frame->local = local;
-	
-	loc_copy (&local->loc, loc);
+    local->transaction.wind = afr_link_wind;
+    local->transaction.unwind = afr_link_unwind;
 
-	local->cont.symlink.ino      = loc->inode->ino;
-	local->cont.symlink.linkpath = strdup (linkpath);
+    ret = afr_build_parent_loc(&local->transaction.parent_loc, newloc,
+                               &op_errno);
+    if (ret)
+        goto out;
 
-	local->transaction.fop    = afr_symlink_wind;
-	local->transaction.done   = afr_symlink_done;
-	local->transaction.unwind = afr_symlink_unwind;
+    local->transaction.main_frame = frame;
+    local->transaction.basename = AFR_BASENAME(newloc->path);
+    ret = afr_transaction(transaction_frame, this, AFR_ENTRY_TRANSACTION);
+    if (ret < 0) {
+        op_errno = -ret;
+        goto out;
+    }
 
-	afr_build_parent_loc (&local->transaction.parent_loc, loc);
-
-	local->transaction.main_frame   = frame;
-	local->transaction.basename     = AFR_BASENAME (loc->path);
-	local->transaction.pending      = AFR_ENTRY_PENDING;
-
-	afr_transaction (transaction_frame, this, AFR_ENTRY_TRANSACTION);
-
-	op_ret = 0;
+    return 0;
 out:
-	if (op_ret == -1) {
-		if (transaction_frame)
-			AFR_STACK_DESTROY (transaction_frame);
-		AFR_STACK_UNWIND (frame, op_ret, op_errno, NULL, NULL);
-	}
+    if (transaction_frame)
+        AFR_STACK_DESTROY(transaction_frame);
 
-	return 0;
+    AFR_STACK_UNWIND(link, frame, -1, op_errno, NULL, NULL, NULL, NULL, NULL);
+    return 0;
 }
 
 /* }}} */
 
-/* {{{ rename */
+/* {{{ symlink */
 
 int
-afr_rename_unwind (call_frame_t *frame, xlator_t *this)
+afr_symlink_unwind(call_frame_t *frame, xlator_t *this)
 {
-	call_frame_t *main_frame = NULL;
-	afr_local_t  *local = NULL;
+    call_frame_t *main_frame = NULL;
+    afr_local_t *local = NULL;
 
-	local = frame->local;
+    local = frame->local;
 
-	LOCK (&frame->lock);
-	{
-		if (local->transaction.main_frame) {
-			main_frame = local->transaction.main_frame;
-		}
-		local->transaction.main_frame = NULL;
-	}
-	UNLOCK (&frame->lock);
+    main_frame = afr_transaction_detach_fop_frame(frame);
+    if (!main_frame)
+        return 0;
 
-	if (main_frame) {
-		local->cont.rename.buf.st_ino = local->cont.rename.ino;
-
-		AFR_STACK_UNWIND (main_frame, local->op_ret, local->op_errno, 
-				  &local->cont.rename.buf);
-	}
-
-	return 0;
+    AFR_STACK_UNWIND(symlink, main_frame, local->op_ret, local->op_errno,
+                     local->inode, &local->cont.dir_fop.buf,
+                     &local->cont.dir_fop.preparent,
+                     &local->cont.dir_fop.postparent, local->xdata_rsp);
+    return 0;
 }
 
-
 int
-afr_rename_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this, 
-		     int32_t op_ret, int32_t op_errno, struct stat *buf)
+afr_symlink_wind_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+                     int32_t op_ret, int32_t op_errno, inode_t *inode,
+                     struct iatt *buf, struct iatt *preparent,
+                     struct iatt *postparent, dict_t *xdata)
 {
-	afr_local_t *   local = NULL;
-	afr_private_t * priv  = NULL;
-
-	int call_count = -1;
-	int child_index = -1;
-
-	local = frame->local;
-	priv  = this->private;
-
-	child_index = (long) cookie;
-
-	LOCK (&frame->lock);
-	{
-		if (child_went_down (op_ret, op_errno))
-			afr_transaction_child_died (frame, this, child_index);
-
-		if ((op_ret != -1) && (local->success_count == 0)) {
-			local->op_ret = op_ret;
-		
-			if (buf) {
-				local->cont.rename.buf = *buf;
-				local->cont.rename.buf.st_ino = 
-					afr_itransform (buf->st_ino, priv->child_count,
-							child_index);
-			}
-			local->success_count++;
-		}
-
-		local->op_errno = op_errno;
-	}
-	UNLOCK (&frame->lock);
-
-	call_count = afr_frame_return (frame);
-
-	if (call_count == 0) {
-		local->transaction.unwind (frame, this);
-
-		local->transaction.resume (frame, this);
-	}
-	
-	return 0;
+    return __afr_dir_write_cbk(frame, cookie, this, op_ret, op_errno, buf,
+                               preparent, postparent, NULL, NULL, xdata);
 }
 
-
-int32_t
-afr_rename_wind (call_frame_t *frame, xlator_t *this)
-{
-	afr_local_t *local = NULL;
-	afr_private_t *priv = NULL;
-
-	int call_count = -1;
-	int i = 0;
-
-	local = frame->local;
-	priv = this->private;
-
-	call_count = afr_up_children_count (priv->child_count, local->child_up);
-
-	if (call_count == 0) {
-		local->transaction.resume (frame, this);
-		return 0;
-	}
-
-	local->call_count = call_count;
-
-	for (i = 0; i < priv->child_count; i++) {				
-		if (local->child_up[i]) {
-			STACK_WIND_COOKIE (frame, afr_rename_wind_cbk, 
-					   (void *) (long) i,	
-					   priv->children[i], 
-					   priv->children[i]->fops->rename,
-					   &local->loc,
-					   &local->newloc);
-			if (!--call_count)
-				break;
-		}
-	}
-	
-	return 0;
-}
-
-
 int
-afr_rename_done (call_frame_t *frame, xlator_t *this)
+afr_symlink_wind(call_frame_t *frame, xlator_t *this, int subvol)
 {
-	afr_local_t * local = frame->local;
-
-	local->transaction.unwind (frame, this);
-
-	AFR_STACK_DESTROY (frame);
-	
-	return 0;
+    afr_local_t *local = NULL;
+    afr_private_t *priv = NULL;
+
+    local = frame->local;
+    priv = this->private;
+
+    STACK_WIND_COOKIE(frame, afr_symlink_wind_cbk, (void *)(long)subvol,
+                      priv->children[subvol],
+                      priv->children[subvol]->fops->symlink,
+                      local->cont.symlink.linkpath, &local->loc, local->umask,
+                      local->xdata_req);
+    return 0;
 }
 
-
 int
-afr_rename (call_frame_t *frame, xlator_t *this,
-	    loc_t *oldloc, loc_t *newloc)
+afr_symlink(call_frame_t *frame, xlator_t *this, const char *linkpath,
+            loc_t *loc, mode_t umask, dict_t *xdata)
 {
-	afr_private_t * priv  = NULL;
-	afr_local_t   * local = NULL;
-	call_frame_t  * transaction_frame = NULL;
-
-	int ret = -1;
-
-	int op_ret   = -1;
-	int op_errno = 0;
-
-	VALIDATE_OR_GOTO (frame, out);
-	VALIDATE_OR_GOTO (this, out);
-	VALIDATE_OR_GOTO (this->private, out);
-
-	priv = this->private;
-
-	transaction_frame = copy_frame (frame);
-	if (!transaction_frame) {
-		gf_log (this->name, GF_LOG_ERROR,
-			"out of memory :(");
-		goto out;
-	}
-
-	ALLOC_OR_GOTO (local, afr_local_t, out);
-
-	ret = AFR_LOCAL_INIT (local, priv);
-	if (ret < 0) {
-		op_errno = -ret;
-		goto out;
-	}
-
-	transaction_frame->local = local;
-
-	loc_copy (&local->loc,    oldloc);
-	loc_copy (&local->newloc, newloc);
-
-	local->cont.rename.ino = oldloc->inode->ino;
-
-	local->transaction.fop    = afr_rename_wind;
-	local->transaction.done   = afr_rename_done;
-	local->transaction.unwind = afr_rename_unwind;
-
-	afr_build_parent_loc (&local->transaction.parent_loc, oldloc);
-	afr_build_parent_loc (&local->transaction.new_parent_loc, newloc);
-
-	local->transaction.main_frame   = frame;
-	local->transaction.basename     = AFR_BASENAME (oldloc->path);
-	local->transaction.new_basename = AFR_BASENAME (newloc->path);
-	local->transaction.pending      = AFR_ENTRY_PENDING;
-
-	afr_transaction (transaction_frame, this, AFR_ENTRY_RENAME_TRANSACTION);
-
-	op_ret = 0;
+    afr_local_t *local = NULL;
+    call_frame_t *transaction_frame = NULL;
+    int ret = -1;
+    int op_errno = ENOMEM;
+
+    transaction_frame = copy_frame(frame);
+    if (!transaction_frame)
+        goto out;
+
+    local = AFR_FRAME_INIT(transaction_frame, op_errno);
+    if (!local)
+        goto out;
+
+    loc_copy(&local->loc, loc);
+    local->inode = inode_ref(loc->inode);
+    local->parent = inode_ref(loc->parent);
+
+    local->cont.symlink.linkpath = gf_strdup(linkpath);
+    local->umask = umask;
+
+    if (xdata)
+        local->xdata_req = dict_copy_with_ref(xdata, NULL);
+    else
+        local->xdata_req = dict_new();
+
+    if (!local->xdata_req)
+        goto out;
+
+    local->op = GF_FOP_SYMLINK;
+    local->transaction.wind = afr_symlink_wind;
+    local->transaction.unwind = afr_symlink_unwind;
+
+    ret = afr_build_parent_loc(&local->transaction.parent_loc, loc, &op_errno);
+    if (ret)
+        goto out;
+
+    local->transaction.main_frame = frame;
+    local->transaction.basename = AFR_BASENAME(loc->path);
+    ret = afr_transaction(transaction_frame, this, AFR_ENTRY_TRANSACTION);
+    if (ret < 0) {
+        op_errno = -ret;
+        goto out;
+    }
+
+    return 0;
 out:
-	if (op_ret == -1) {
-		if (transaction_frame)
-			AFR_STACK_DESTROY (transaction_frame);
-
-		AFR_STACK_UNWIND (frame, op_ret, op_errno, NULL);
-	}
+    if (transaction_frame)
+        AFR_STACK_DESTROY(transaction_frame);
 
-	return 0;
+    AFR_STACK_UNWIND(symlink, frame, -1, op_errno, NULL, NULL, NULL, NULL,
+                     NULL);
+    return 0;
 }
 
 /* }}} */
 
-/* {{{ unlink */
+/* {{{ rename */
 
 int
-afr_unlink_unwind (call_frame_t *frame, xlator_t *this)
+afr_rename_unwind(call_frame_t *frame, xlator_t *this)
 {
-	call_frame_t *main_frame = NULL;
-	afr_local_t  *local = NULL;
+    call_frame_t *main_frame = NULL;
+    afr_local_t *local = NULL;
 
-	local = frame->local;
+    local = frame->local;
 
-	LOCK (&frame->lock);
-	{
-		if (local->transaction.main_frame) {
-			main_frame = local->transaction.main_frame;
-		}
-		local->transaction.main_frame = NULL;
-	}
-	UNLOCK (&frame->lock);
+    main_frame = afr_transaction_detach_fop_frame(frame);
+    if (!main_frame)
+        return 0;
 
-	if (main_frame)
-		AFR_STACK_UNWIND (main_frame, local->op_ret, local->op_errno);
-
-	return 0;
+    AFR_STACK_UNWIND(rename, main_frame, local->op_ret, local->op_errno,
+                     &local->cont.dir_fop.buf, &local->cont.dir_fop.preparent,
+                     &local->cont.dir_fop.postparent,
+                     &local->cont.dir_fop.prenewparent,
+                     &local->cont.dir_fop.postnewparent, local->xdata_rsp);
+    return 0;
 }
 
-
 int
-afr_unlink_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this, 
-		     int32_t op_ret, int32_t op_errno)
+afr_rename_wind_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+                    int32_t op_ret, int32_t op_errno, struct iatt *buf,
+                    struct iatt *preoldparent, struct iatt *postoldparent,
+                    struct iatt *prenewparent, struct iatt *postnewparent,
+                    dict_t *xdata)
 {
-	afr_local_t *   local = NULL;
-	afr_private_t * priv  = NULL;
-
-	int call_count  = -1;
-	int child_index = (long) cookie;
-	int need_unwind = 0;
-
-	local = frame->local;
-	priv  = this->private;
-	
-	LOCK (&frame->lock);
-	{
-		if (child_went_down (op_ret, op_errno))
-			afr_transaction_child_died (frame, this, child_index);
-
-		if (op_ret != -1) {
-			if (local->success_count == 0) {
-				local->op_ret   = op_ret;
-			}
-			local->success_count++;
-
-			if (local->success_count == priv->wait_count) {
-				need_unwind = 1;
-			}
-		}
-
-		local->op_errno = op_errno;
-	}
-	UNLOCK (&frame->lock);
-
-	call_count = afr_frame_return (frame);
-
-	if (call_count == 0) {
-		local->transaction.unwind (frame, this);
-
-		local->transaction.resume (frame, this);
-	}
-	
-	return 0;
+    return __afr_dir_write_cbk(frame, cookie, this, op_ret, op_errno, buf,
+                               preoldparent, postoldparent, prenewparent,
+                               postnewparent, xdata);
 }
 
-
-int32_t
-afr_unlink_wind (call_frame_t *frame, xlator_t *this)
-{
-	afr_local_t *local = NULL;
-	afr_private_t *priv = NULL;
-
-	int call_count = -1;
-	int i = 0;
-
-	local = frame->local;
-	priv  = this->private;
-
-	call_count = afr_up_children_count (priv->child_count, local->child_up);
-
-	if (call_count == 0) {
-		local->transaction.resume (frame, this);
-		return 0;
-	}
-
-	local->call_count = call_count;
-
-	for (i = 0; i < priv->child_count; i++) {				
-		if (local->child_up[i]) {
-			STACK_WIND_COOKIE (frame, afr_unlink_wind_cbk,	
-					   (void *) (long) i,
-					   priv->children[i], 
-					   priv->children[i]->fops->unlink,
-					   &local->loc);
-			
-			if (!--call_count)
-				break;
-		}
-	}
-	
-	return 0;
-}
-
-
-int32_t
-afr_unlink_done (call_frame_t *frame, xlator_t *this)
+int
+afr_rename_wind(call_frame_t *frame, xlator_t *this, int subvol)
 {
-	afr_local_t * local = frame->local;
+    afr_local_t *local = NULL;
+    afr_private_t *priv = NULL;
 
-	local->transaction.unwind (frame, this);
+    local = frame->local;
+    priv = this->private;
 
-	AFR_STACK_DESTROY (frame);
-	
-	return 0;
+    STACK_WIND_COOKIE(frame, afr_rename_wind_cbk, (void *)(long)subvol,
+                      priv->children[subvol],
+                      priv->children[subvol]->fops->rename, &local->loc,
+                      &local->newloc, local->xdata_req);
+    return 0;
 }
 
-
-int32_t
-afr_unlink (call_frame_t *frame, xlator_t *this,
-	    loc_t *loc)
+int
+afr_rename(call_frame_t *frame, xlator_t *this, loc_t *oldloc, loc_t *newloc,
+           dict_t *xdata)
 {
-	afr_private_t * priv  = NULL;
-	afr_local_t   * local = NULL;
-	call_frame_t  * transaction_frame = NULL;
-
-	int ret = -1;
-
-	int op_ret   = -1;
-	int op_errno = 0;
-
-	VALIDATE_OR_GOTO (frame, out);
-	VALIDATE_OR_GOTO (this, out);
-	VALIDATE_OR_GOTO (this->private, out);
-
-	priv = this->private;
-
-	transaction_frame = copy_frame (frame);
-	if (!transaction_frame) {
-		gf_log (this->name, GF_LOG_ERROR,
-			"out of memory :(");
-		goto out;
-	}
-
-	ALLOC_OR_GOTO (local, afr_local_t, out);
-
-	ret = AFR_LOCAL_INIT (local, priv);
-	if (ret < 0) {
-		op_errno = -ret;
-		goto out;
-	}
-
-	transaction_frame->local = local;
-
-	loc_copy (&local->loc, loc);
-
-	local->transaction.fop    = afr_unlink_wind;
-	local->transaction.done   = afr_unlink_done;
-	local->transaction.unwind = afr_unlink_unwind;
-
-	afr_build_parent_loc (&local->transaction.parent_loc, loc);
-
-	local->transaction.main_frame = frame;
-	local->transaction.basename = AFR_BASENAME (loc->path);
-	local->transaction.pending  = AFR_ENTRY_PENDING;
-
-	afr_transaction (transaction_frame, this, AFR_ENTRY_TRANSACTION);
-
-	op_ret = 0;
+    afr_local_t *local = NULL;
+    call_frame_t *transaction_frame = NULL;
+    int ret = -1;
+    int op_errno = ENOMEM;
+
+    transaction_frame = copy_frame(frame);
+    if (!transaction_frame) {
+        op_errno = ENOMEM;
+        goto out;
+    }
+
+    local = AFR_FRAME_INIT(transaction_frame, op_errno);
+    if (!local)
+        goto out;
+
+    loc_copy(&local->loc, oldloc);
+    loc_copy(&local->newloc, newloc);
+
+    local->inode = inode_ref(oldloc->inode);
+    local->parent = inode_ref(oldloc->parent);
+    local->parent2 = inode_ref(newloc->parent);
+
+    if (xdata)
+        local->xdata_req = dict_copy_with_ref(xdata, NULL);
+    else
+        local->xdata_req = dict_new();
+
+    if (!local->xdata_req)
+        goto out;
+
+    local->op = GF_FOP_RENAME;
+    local->transaction.wind = afr_rename_wind;
+    local->transaction.unwind = afr_rename_unwind;
+
+    ret = afr_build_parent_loc(&local->transaction.parent_loc, oldloc,
+                               &op_errno);
+    if (ret)
+        goto out;
+    ret = afr_build_parent_loc(&local->transaction.new_parent_loc, newloc,
+                               &op_errno);
+    if (ret)
+        goto out;
+
+    local->transaction.main_frame = frame;
+    local->transaction.basename = AFR_BASENAME(oldloc->path);
+    local->transaction.new_basename = AFR_BASENAME(newloc->path);
+    ret = afr_transaction(transaction_frame, this,
+                          AFR_ENTRY_RENAME_TRANSACTION);
+    if (ret < 0) {
+        op_errno = -ret;
+        goto out;
+    }
+
+    return 0;
 out:
-	if (op_ret == -1) {
-		if (transaction_frame)
-			AFR_STACK_DESTROY (transaction_frame);
-		AFR_STACK_UNWIND (frame, op_ret, op_errno);
-	}
+    if (transaction_frame)
+        AFR_STACK_DESTROY(transaction_frame);
 
-	return 0;
+    AFR_STACK_UNWIND(rename, frame, -1, op_errno, NULL, NULL, NULL, NULL, NULL,
+                     NULL);
+    return 0;
 }
 
 /* }}} */
 
-/* {{{ rmdir */
-
-
-
-int
-afr_rmdir_unwind (call_frame_t *frame, xlator_t *this)
-{
-	call_frame_t *main_frame = NULL;
-	afr_local_t  *local = NULL;
-
-	local = frame->local;
-
-	LOCK (&frame->lock);
-	{
-		if (local->transaction.main_frame) {
-			main_frame = local->transaction.main_frame;
-		}
-		local->transaction.main_frame = NULL;
-	}
-	UNLOCK (&frame->lock);
-
-	if (main_frame)
-		AFR_STACK_UNWIND (main_frame, local->op_ret, local->op_errno);
-
-	return 0;
-}
-
+/* {{{ unlink */
 
 int
-afr_rmdir_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this, 
-		    int32_t op_ret, int32_t op_errno)
+afr_unlink_unwind(call_frame_t *frame, xlator_t *this)
 {
-	afr_local_t *   local = NULL;
-	afr_private_t * priv  = NULL;
-
-	int call_count  = -1;
-	int child_index = (long) cookie;
-	int need_unwind = 0;
-
-	local = frame->local;
-	priv = this->private;
+    call_frame_t *main_frame = NULL;
+    afr_local_t *local = NULL;
 
-	LOCK (&frame->lock);
-	{
-		if (child_went_down (op_ret, op_errno))
-			afr_transaction_child_died (frame, this, child_index);
+    local = frame->local;
 
-		if (op_ret != -1) {
-			if (local->success_count == 0) {
-				local->op_ret = op_ret;
-			}
-			local->success_count++;
+    main_frame = afr_transaction_detach_fop_frame(frame);
+    if (!main_frame)
+        return 0;
 
-			if (local->success_count == priv->wait_count)
-				need_unwind = 1;
-		}
-
-		local->op_errno = op_errno;
-	}
-	UNLOCK (&frame->lock);
-
-	call_count = afr_frame_return (frame);
-
-	if (call_count == 0) {
-		local->transaction.unwind (frame, this);
-
-		local->transaction.resume (frame, this);
-	}
-	
-	return 0;
+    AFR_STACK_UNWIND(unlink, main_frame, local->op_ret, local->op_errno,
+                     &local->cont.dir_fop.preparent,
+                     &local->cont.dir_fop.postparent, local->xdata_rsp);
+    return 0;
 }
 
-
 int
-afr_rmdir_wind (call_frame_t *frame, xlator_t *this)
+afr_unlink_wind_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+                    int32_t op_ret, int32_t op_errno, struct iatt *preparent,
+                    struct iatt *postparent, dict_t *xdata)
 {
-	afr_local_t *local = NULL;
-	afr_private_t *priv = NULL;
-
-	int call_count = -1;
-	int i = 0;
-
-	local = frame->local;
-	priv  = this->private;
-
-	call_count = afr_up_children_count (priv->child_count, local->child_up);
-
-	if (call_count == 0) {
-		local->transaction.resume (frame, this);
-		return 0;
-	}
-
-	local->call_count = call_count;
-
-	for (i = 0; i < priv->child_count; i++) {				
-		if (local->child_up[i]) {
-			STACK_WIND_COOKIE (frame, afr_rmdir_wind_cbk,	
-					   (void *) (long) i,
-					   priv->children[i], 
-					   priv->children[i]->fops->rmdir,
-					   &local->loc);
-
-			if (!--call_count)
-				break;
-		}
-	}
-	
-	return 0;
+    return __afr_dir_write_cbk(frame, cookie, this, op_ret, op_errno, NULL,
+                               preparent, postparent, NULL, NULL, xdata);
 }
 
-
 int
-afr_rmdir_done (call_frame_t *frame, xlator_t *this)
+afr_unlink_wind(call_frame_t *frame, xlator_t *this, int subvol)
 {
-	afr_local_t * local = frame->local;
+    afr_local_t *local = NULL;
+    afr_private_t *priv = NULL;
 
-	local->transaction.unwind (frame, this);
+    local = frame->local;
+    priv = this->private;
 
-	AFR_STACK_DESTROY (frame);
-	
-	return 0;
+    STACK_WIND_COOKIE(frame, afr_unlink_wind_cbk, (void *)(long)subvol,
+                      priv->children[subvol],
+                      priv->children[subvol]->fops->unlink, &local->loc,
+                      local->xflag, local->xdata_req);
+    return 0;
 }
 
-
 int
-afr_rmdir (call_frame_t *frame, xlator_t *this,
-	   loc_t *loc)
+afr_unlink(call_frame_t *frame, xlator_t *this, loc_t *loc, int xflag,
+           dict_t *xdata)
 {
-	afr_private_t * priv  = NULL;
-	afr_local_t   * local = NULL;
-	call_frame_t  * transaction_frame = NULL;
-	
-	int ret = -1;
-
-	int op_ret   = -1;
-	int op_errno = 0;
-
-	VALIDATE_OR_GOTO (frame, out);
-	VALIDATE_OR_GOTO (this, out);
-	VALIDATE_OR_GOTO (this->private, out);
-
-	priv = this->private;
-
-	transaction_frame = copy_frame (frame);
-	if (!transaction_frame) {
-		gf_log (this->name, GF_LOG_ERROR,
-			"out of memory :(");
-		goto out;
-	}
-
-	ALLOC_OR_GOTO (local, afr_local_t, out);
-
-	ret = AFR_LOCAL_INIT (local, priv);
-	if (ret < 0) {
-		op_errno = -ret;
-		goto out;
-	}
-
-	transaction_frame->local = local;
-
-	loc_copy (&local->loc, loc);
-
-	local->transaction.fop    = afr_rmdir_wind;
-	local->transaction.done   = afr_rmdir_done;
-	local->transaction.unwind = afr_rmdir_unwind;
-
-	afr_build_parent_loc (&local->transaction.parent_loc, loc);
-
-	local->transaction.main_frame = frame;
-	local->transaction.basename = AFR_BASENAME (loc->path);
-	local->transaction.pending  = AFR_ENTRY_PENDING;
-
-	afr_transaction (transaction_frame, this, AFR_ENTRY_TRANSACTION);
-
-	op_ret = 0;
+    afr_local_t *local = NULL;
+    call_frame_t *transaction_frame = NULL;
+    int ret = -1;
+    int op_errno = ENOMEM;
+
+    transaction_frame = copy_frame(frame);
+    if (!transaction_frame)
+        goto out;
+
+    local = AFR_FRAME_INIT(transaction_frame, op_errno);
+    if (!local)
+        goto out;
+
+    loc_copy(&local->loc, loc);
+    local->xflag = xflag;
+
+    local->inode = inode_ref(loc->inode);
+    local->parent = inode_ref(loc->parent);
+
+    if (xdata)
+        local->xdata_req = dict_copy_with_ref(xdata, NULL);
+    else
+        local->xdata_req = dict_new();
+
+    if (!local->xdata_req)
+        goto out;
+
+    local->op = GF_FOP_UNLINK;
+    local->transaction.wind = afr_unlink_wind;
+    local->transaction.unwind = afr_unlink_unwind;
+
+    ret = afr_build_parent_loc(&local->transaction.parent_loc, loc, &op_errno);
+    if (ret)
+        goto out;
+
+    local->transaction.main_frame = frame;
+    local->transaction.basename = AFR_BASENAME(loc->path);
+    ret = afr_transaction(transaction_frame, this, AFR_ENTRY_TRANSACTION);
+    if (ret < 0) {
+        op_errno = -ret;
+        goto out;
+    }
+
+    return 0;
 out:
-	if (op_ret == -1) {
-		if (transaction_frame)
-			AFR_STACK_DESTROY (transaction_frame);
-		AFR_STACK_UNWIND (frame, op_ret, op_errno);
-	}
+    if (transaction_frame)
+        AFR_STACK_DESTROY(transaction_frame);
 
-	return 0;
+    AFR_STACK_UNWIND(unlink, frame, -1, op_errno, NULL, NULL, NULL);
+    return 0;
 }
 
 /* }}} */
 
-/* {{{ setdents */
+/* {{{ rmdir */
 
-int32_t
-afr_setdents_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this, 
-		       int32_t op_ret, int32_t op_errno)
+int
+afr_rmdir_unwind(call_frame_t *frame, xlator_t *this)
 {
-	afr_local_t *   local = NULL;
-	afr_private_t * priv  = NULL;
-
-	int call_count  = -1;
-	int child_index = (long) cookie;
+    call_frame_t *main_frame = NULL;
+    afr_local_t *local = NULL;
 
-	local = frame->local;
-	priv = this->private;
+    local = frame->local;
 
-	LOCK (&frame->lock);
-	{
-		if (child_went_down (op_ret, op_errno))
-			afr_transaction_child_died (frame, this, child_index);
+    main_frame = afr_transaction_detach_fop_frame(frame);
+    if (!main_frame)
+        return 0;
 
-		if ((op_ret != -1) && (local->success_count == 0)) {
-			local->op_ret = op_ret;
-			local->success_count++;
-		}
-
-		local->op_errno = op_errno;
-	}
-	UNLOCK (&frame->lock);
-
-	call_count = afr_frame_return (frame);
-
-	if (call_count == 0) {
-		local->transaction.resume (frame, this);
-	}
-	
-	return 0;
+    AFR_STACK_UNWIND(rmdir, main_frame, local->op_ret, local->op_errno,
+                     &local->cont.dir_fop.preparent,
+                     &local->cont.dir_fop.postparent, local->xdata_rsp);
+    return 0;
 }
 
-
-int32_t
-afr_setdents_wind (call_frame_t *frame, xlator_t *this)
+int
+afr_rmdir_wind_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+                   int32_t op_ret, int32_t op_errno, struct iatt *preparent,
+                   struct iatt *postparent, dict_t *xdata)
 {
-	afr_local_t *local = NULL;
-	afr_private_t *priv = NULL;
-
-	int call_count = -1;
-	int i = 0;
-
-	local = frame->local;
-	priv  = this->private;
-
-	call_count = afr_up_children_count (priv->child_count, local->child_up);
-
-	if (call_count == 0) {
-		local->transaction.resume (frame, this);
-		return 0;
-	}
-
-	local->call_count = call_count;
-
-	for (i = 0; i < priv->child_count; i++) {				
-		if (local->child_up[i]) {
-			STACK_WIND_COOKIE (frame, afr_setdents_wind_cbk,	
-					   (void *) (long) i,
-					   priv->children[i], 
-					   priv->children[i]->fops->setdents,
-					   local->fd, local->cont.setdents.flags,
-					   local->cont.setdents.entries, 
-					   local->cont.setdents.count);
-			
-			if (!--call_count)
-				break;
-		}
-	}
-	
-	return 0;
+    return __afr_dir_write_cbk(frame, cookie, this, op_ret, op_errno, NULL,
+                               preparent, postparent, NULL, NULL, xdata);
 }
 
-
-int32_t
-afr_setdents_done (call_frame_t *frame, xlator_t *this)
+int
+afr_rmdir_wind(call_frame_t *frame, xlator_t *this, int subvol)
 {
-	afr_local_t * local = frame->local;
+    afr_local_t *local = NULL;
+    afr_private_t *priv = NULL;
 
-	AFR_STACK_UNWIND (frame, local->op_ret, local->op_errno);
-	
-	return 0;
-}
+    local = frame->local;
+    priv = this->private;
 
+    STACK_WIND_COOKIE(frame, afr_rmdir_wind_cbk, (void *)(long)subvol,
+                      priv->children[subvol],
+                      priv->children[subvol]->fops->rmdir, &local->loc,
+                      local->cont.rmdir.flags, local->xdata_req);
+    return 0;
+}
 
-int32_t
-afr_setdents (call_frame_t *frame, xlator_t *this,
-	      fd_t *fd, int32_t flags, dir_entry_t *entries, int32_t count)
+int
+afr_rmdir(call_frame_t *frame, xlator_t *this, loc_t *loc, int flags,
+          dict_t *xdata)
 {
-	afr_private_t * priv  = NULL;
-	afr_local_t   * local = NULL;
-	
-	int ret = -1;
-
-	int op_ret   = -1;
-	int op_errno = 0;
-
-	VALIDATE_OR_GOTO (frame, out);
-	VALIDATE_OR_GOTO (this, out);
-	VALIDATE_OR_GOTO (this->private, out);
-
-	priv = this->private;
-
-	ALLOC_OR_GOTO (local, afr_local_t, out);
-
-	ret = AFR_LOCAL_INIT (local, priv);
-	if (ret < 0) {
-		op_errno = -ret;
-		goto out;
-	}
-
-	frame->local = local;
-
-	local->fd = fd_ref (fd);
-
-	local->cont.setdents.flags   = flags;
-	local->cont.setdents.entries = entries;
-	local->cont.setdents.count   = count;
-
-	local->transaction.fop  = afr_setdents_wind;
-	local->transaction.done = afr_setdents_done;
-
-	local->transaction.basename = NULL;
-	local->transaction.pending  = AFR_ENTRY_PENDING;
-
-	afr_transaction (frame, this, AFR_ENTRY_TRANSACTION);
-
-	op_ret = 0;
+    afr_local_t *local = NULL;
+    call_frame_t *transaction_frame = NULL;
+    int ret = -1;
+    int op_errno = ENOMEM;
+
+    transaction_frame = copy_frame(frame);
+    if (!transaction_frame)
+        goto out;
+
+    local = AFR_FRAME_INIT(transaction_frame, op_errno);
+    if (!local)
+        goto out;
+
+    loc_copy(&local->loc, loc);
+    local->inode = inode_ref(loc->inode);
+    local->parent = inode_ref(loc->parent);
+
+    local->cont.rmdir.flags = flags;
+
+    if (xdata)
+        local->xdata_req = dict_copy_with_ref(xdata, NULL);
+    else
+        local->xdata_req = dict_new();
+
+    if (!local->xdata_req)
+        goto out;
+
+    local->op = GF_FOP_RMDIR;
+    local->transaction.wind = afr_rmdir_wind;
+    local->transaction.unwind = afr_rmdir_unwind;
+
+    ret = afr_build_parent_loc(&local->transaction.parent_loc, loc, &op_errno);
+    if (ret)
+        goto out;
+
+    local->transaction.main_frame = frame;
+    local->transaction.basename = AFR_BASENAME(loc->path);
+    ret = afr_transaction(transaction_frame, this, AFR_ENTRY_TRANSACTION);
+    if (ret < 0) {
+        op_errno = -ret;
+        goto out;
+    }
+
+    return 0;
 out:
-	if (op_ret == -1) {
-		AFR_STACK_UNWIND (frame, op_ret, op_errno);
-	}
+    if (transaction_frame)
+        AFR_STACK_DESTROY(transaction_frame);
 
-	return 0;
+    AFR_STACK_UNWIND(rmdir, frame, -1, op_errno, NULL, NULL, NULL);
+    return 0;
 }
 
 /* }}} */
diff --git a/xlators/cluster/afr/src/afr-dir-write.h b/xlators/cluster/afr/src/afr-dir-write.h
index 76b4b60faea..1d88c3b9b26 100644
--- a/xlators/cluster/afr/src/afr-dir-write.h
+++ b/xlators/cluster/afr/src/afr-dir-write.h
@@ -1,59 +1,46 @@
 /*
-   Copyright (c) 2007-2009 Z RESEARCH, Inc. <http://www.zresearch.com>
-   This file is part of GlusterFS.
-
-   GlusterFS is free software; you can redistribute it and/or modify
-   it under the terms of the GNU General Public License as published
-   by the Free Software Foundation; either version 3 of the License,
-   or (at your option) any later version.
-
-   GlusterFS is distributed in the hope that it will be useful, but
-   WITHOUT ANY WARRANTY; without even the implied warranty of
-   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
-   General Public License for more details.
-
-   You should have received a copy of the GNU General Public License
-   along with this program.  If not, see
-   <http://www.gnu.org/licenses/>.
+  Copyright (c) 2008-2012 Red Hat, Inc. <http://www.redhat.com>
+  This file is part of GlusterFS.
+
+  This file is licensed to you under your choice of the GNU Lesser
+  General Public License, version 3 or any later version (LGPLv3 or
+  later), or the GNU General Public License, version 2 (GPLv2), in all
+  cases as published by the Free Software Foundation.
 */
 
 #ifndef __DIR_WRITE_H__
 #define __DIR_WRITE_H__
 
 int32_t
-afr_create (call_frame_t *frame, xlator_t *this,
-	    loc_t *loc, int32_t flags, mode_t mode, fd_t *fd);
-
-int32_t
-afr_mknod (call_frame_t *frame, xlator_t *this,
-	   loc_t *loc, mode_t mode, dev_t dev);
+afr_create(call_frame_t *frame, xlator_t *this, loc_t *loc, int32_t flags,
+           mode_t mode, mode_t umask, fd_t *fd, dict_t *xdata);
 
 int32_t
-afr_mkdir (call_frame_t *frame, xlator_t *this,
-	   loc_t *loc, mode_t mode);
+afr_mknod(call_frame_t *frame, xlator_t *this, loc_t *loc, mode_t mode,
+          dev_t dev, mode_t umask, dict_t *xdata);
 
 int32_t
-afr_unlink (call_frame_t *frame, xlator_t *this,
-	    loc_t *loc);
+afr_mkdir(call_frame_t *frame, xlator_t *this, loc_t *loc, mode_t mode,
+          mode_t umask, dict_t *xdata);
 
 int32_t
-afr_rmdir (call_frame_t *frame, xlator_t *this,
-	   loc_t *loc);
+afr_unlink(call_frame_t *frame, xlator_t *this, loc_t *loc, int xflag,
+           dict_t *xdata);
 
 int32_t
-afr_link (call_frame_t *frame, xlator_t *this,
-	  loc_t *oldloc, loc_t *newloc);
+afr_rmdir(call_frame_t *frame, xlator_t *this, loc_t *loc, int flags,
+          dict_t *xdata);
 
 int32_t
-afr_rename (call_frame_t *frame, xlator_t *this,
-	    loc_t *oldloc, loc_t *newloc);
+afr_link(call_frame_t *frame, xlator_t *this, loc_t *oldloc, loc_t *newloc,
+         dict_t *xdata);
 
 int32_t
-afr_symlink (call_frame_t *frame, xlator_t *this,
-	     const char *linkpath, loc_t *oldloc);
+afr_rename(call_frame_t *frame, xlator_t *this, loc_t *oldloc, loc_t *newloc,
+           dict_t *xdata);
 
-int32_t
-afr_setdents (call_frame_t *frame, xlator_t *this,
-	      fd_t *fd, int32_t flags, dir_entry_t *entries, int32_t count);
+int
+afr_symlink(call_frame_t *frame, xlator_t *this, const char *linkpath,
+            loc_t *oldloc, mode_t umask, dict_t *params);
 
 #endif /* __DIR_WRITE_H__ */
diff --git a/xlators/cluster/afr/src/afr-inode-read.c b/xlators/cluster/afr/src/afr-inode-read.c
index fd1edc3b593..c5521704de2 100644
--- a/xlators/cluster/afr/src/afr-inode-read.c
+++ b/xlators/cluster/afr/src/afr-inode-read.c
@@ -1,22 +1,12 @@
 /*
-   Copyright (c) 2007-2009 Z RESEARCH, Inc. <http://www.zresearch.com>
-   This file is part of GlusterFS.
-
-   GlusterFS is free software; you can redistribute it and/or modify
-   it under the terms of the GNU General Public License as published
-   by the Free Software Foundation; either version 3 of the License,
-   or (at your option) any later version.
-
-   GlusterFS is distributed in the hope that it will be useful, but
-   WITHOUT ANY WARRANTY; without even the implied warranty of
-   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
-   General Public License for more details.
-
-   You should have received a copy of the GNU General Public License
-   along with this program.  If not, see
-   <http://www.gnu.org/licenses/>.
-*/
+  Copyright (c) 2008-2012 Red Hat, Inc. <http://www.redhat.com>
+  This file is part of GlusterFS.
 
+  This file is licensed to you under your choice of the GNU Lesser
+  General Public License, version 3 or any later version (LGPLv3 or
+  later), or the GNU General Public License, version 2 (GPLv2), in all
+  cases as published by the Free Software Foundation.
+*/
 
 #include <libgen.h>
 #include <unistd.h>
@@ -25,697 +15,1880 @@
 #include <stdlib.h>
 #include <signal.h>
 
-#ifndef _CONFIG_H
-#define _CONFIG_H
-#include "config.h"
-#endif
-
-#include "glusterfs.h"
-#include "afr.h"
-#include "dict.h"
-#include "xlator.h"
-#include "hashfn.h"
-#include "logging.h"
-#include "stack.h"
-#include "list.h"
-#include "call-stub.h"
-#include "defaults.h"
-#include "common-utils.h"
-#include "compat-errno.h"
-#include "compat.h"
-
+#include <glusterfs/glusterfs.h>
 #include "afr.h"
+#include <glusterfs/dict.h>
+#include <glusterfs/logging.h>
+#include <glusterfs/list.h>
+#include <glusterfs/byte-order.h>
+#include <glusterfs/defaults.h>
+#include <glusterfs/common-utils.h>
+#include <glusterfs/compat-errno.h>
+#include <glusterfs/compat.h>
+#include <glusterfs/quota-common-utils.h>
+
+#include "afr-transaction.h"
+#include "afr-messages.h"
 
-
-/**
- * Common algorithm for inode read calls:
- * 
- * - Try the fop on the first child that is up
- * - if we have failed due to ENOTCONN:
- *     try the next child
- *
- * Applicable to: access, stat, fstat, readlink, getxattr
- */
+/*
+ * Quota size xattrs are not maintained by afr, so they can differ between
+ * children even when the directory changelog xattrs suggest all is fine.
+ * If at least one child is readable (a 'source'), only readable children are
+ * compared; otherwise every child with a usable reply is. The per-field maxima
+ * are written back into the compared replies' xdata (skipped when all maxima
+ * are zero). Returns the index of the chosen reply, or -1 if none qualified.
+ */
+
+int
+afr_handle_quota_size(call_frame_t *frame, xlator_t *this)
+{
+    unsigned char *readable = NULL;
+    afr_local_t *local = NULL;
+    afr_private_t *priv = NULL;
+    struct afr_reply *replies = NULL;
+    int i = 0;
+    int ret = 0;
+    quota_meta_t size = {
+        0,
+    };
+    quota_meta_t max_size = {
+        0,
+    };
+    int readable_cnt = 0;
+    int read_subvol = -1;
+
+    local = frame->local;
+    priv = this->private;
+    replies = local->replies;
+    /* One byte per child, handed to afr_inode_read_subvol_get() below. */
+    readable = alloca0(priv->child_count);
+
+    afr_inode_read_subvol_get(local->inode, this, readable, 0, 0);
+
+    readable_cnt = AFR_COUNT(readable, priv->child_count);
+    /* Pass 1: find the per-field maxima and pick the reply to report. */
+    for (i = 0; i < priv->child_count; i++) {
+        if (!replies[i].valid || replies[i].op_ret == -1)
+            continue;
+        if (readable_cnt && !readable[i])
+            continue;
+        if (!replies[i].xdata)
+            continue;
+        ret = quota_dict_get_meta(replies[i].xdata, QUOTA_SIZE_KEY,
+                                  SLEN(QUOTA_SIZE_KEY), &size);
+        if (ret == -1)
+            continue;
+        if (read_subvol == -1)
+            read_subvol = i;
+        if (size.size > max_size.size ||
+            (size.file_count + size.dir_count) >
+                (max_size.file_count + max_size.dir_count))
+            read_subvol = i;
+        /* Each maximum is tracked on its own, whichever reply supplies it. */
+        if (size.size > max_size.size)
+            max_size.size = size.size;
+        if (size.file_count > max_size.file_count)
+            max_size.file_count = size.file_count;
+        if (size.dir_count > max_size.dir_count)
+            max_size.dir_count = size.dir_count;
+    }
+    /* Nothing non-zero was found: leave the replies' xdata untouched. */
+    if (max_size.size == 0 && max_size.file_count == 0 &&
+        max_size.dir_count == 0)
+        return read_subvol;
+    /* Pass 2: overwrite the quota size in each eligible reply's xdata. */
+    for (i = 0; i < priv->child_count; i++) {
+        if (!replies[i].valid || replies[i].op_ret == -1)
+            continue;
+        if (readable_cnt && !readable[i])
+            continue;
+        if (!replies[i].xdata)
+            continue;
+        quota_dict_set_meta(replies[i].xdata, QUOTA_SIZE_KEY, &max_size,
+                            IA_IFDIR);
+    }
+
+    return read_subvol;
+}
 
 /* {{{ access */
 
-int32_t
-afr_access_cbk (call_frame_t *frame, void *cookie,
-		xlator_t *this, int32_t op_ret, int32_t op_errno)
+int
+afr_access_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int op_ret,
+               int op_errno, dict_t *xdata)
 {
-	afr_private_t * priv     = NULL;
-	afr_local_t *   local    = NULL;
-	xlator_t **     children = NULL;
+    afr_local_t *local = NULL;
+
+    local = frame->local;
+
+    if (op_ret < 0) {
+        local->op_ret = op_ret;
+        local->op_errno = op_errno;
+
+        afr_read_txn_continue(frame, this, (long)cookie);
+        return 0;
+    }
 
-	int unwind     = 1;
-	int last_tried = -1;
-	int this_try = -1;
+    AFR_STACK_UNWIND(access, frame, op_ret, op_errno, xdata);
 
-	priv     = this->private;
-	children = priv->children;
+    return 0;
+}
 
-	local = frame->local;
+int
+afr_access_wind(call_frame_t *frame, xlator_t *this, int subvol)
+{
+    afr_private_t *priv = NULL;
+    afr_local_t *local = NULL;
+    /* Winds access to child 'subvol'; subvol == -1 unwinds the saved result. */
+    priv = this->private;
+    local = frame->local;
+
+    if (subvol == -1) {
+        AFR_STACK_UNWIND(access, frame, local->op_ret, local->op_errno, 0);
+        return 0;
+    }
+    /* The child index is passed as the cookie handed to afr_access_cbk(). */
+    STACK_WIND_COOKIE(frame, afr_access_cbk, (void *)(long)subvol,
+                      priv->children[subvol],
+                      priv->children[subvol]->fops->access, &local->loc,
+                      local->cont.access.mask, local->xdata_req);
+    return 0;
+}
 
-	if (op_ret == -1) {
-		last_tried = local->cont.access.last_tried;
+int
+afr_access(call_frame_t *frame, xlator_t *this, loc_t *loc, int mask,
+           dict_t *xdata)
+{
+    afr_local_t *local = NULL;
+    int op_errno = 0;
 
-		if (all_tried (last_tried, priv->child_count)) {
-			goto out;
-		}
-		this_try    = ++local->cont.access.last_tried;
+    local = AFR_FRAME_INIT(frame, op_errno);
+    if (!local)
+        goto out;
 
-		unwind = 0;
+    local->op = GF_FOP_ACCESS;
+    loc_copy(&local->loc, loc);
+    local->cont.access.mask = mask;
+    if (xdata)
+        local->xdata_req = dict_ref(xdata);
 
-		STACK_WIND_COOKIE (frame, afr_access_cbk,
-				   (void *) (long) this_try,
-				   children[this_try], 
-				   children[this_try]->fops->access,
-				   &local->loc, local->cont.access.mask);
-	}
+    afr_read_txn(frame, this, loc->inode, afr_access_wind,
+                 AFR_METADATA_TRANSACTION);
 
+    return 0;
 out:
-	if (unwind) {
-		AFR_STACK_UNWIND (frame, op_ret, op_errno);
-	}
+    AFR_STACK_UNWIND(access, frame, -1, op_errno, NULL);
 
-	return 0;
+    return 0;
 }
 
+/* }}} */
 
-int32_t
-afr_access (call_frame_t *frame, xlator_t *this,
-	    loc_t *loc, int32_t mask)
+/* {{{ stat */
+
+int
+afr_stat_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret,
+             int32_t op_errno, struct iatt *buf, dict_t *xdata)
 {
-	afr_private_t * priv       = NULL;
-	xlator_t **     children   = NULL;
-	int             call_child = 0;
-	afr_local_t     *local     = NULL;
+    afr_local_t *local = NULL;
 
-	int32_t op_ret   = -1;
-	int32_t op_errno = 0;
+    local = frame->local;
 
-	VALIDATE_OR_GOTO (frame, out);
-	VALIDATE_OR_GOTO (this, out);
-	VALIDATE_OR_GOTO (this->private, out);
+    if (op_ret < 0) {
+        local->op_ret = op_ret;
+        local->op_errno = op_errno;
 
-	priv     = this->private;
-	VALIDATE_OR_GOTO (priv->children, out);
+        afr_read_txn_continue(frame, this, (long)cookie);
+        return 0;
+    }
 
-	children = priv->children;
+    AFR_STACK_UNWIND(stat, frame, op_ret, op_errno, buf, xdata);
 
-	ALLOC_OR_GOTO (local, afr_local_t, out);
+    return 0;
+}
 
-	call_child = afr_first_up_child (priv);
-	if (call_child == -1) {
-		op_errno = ENOTCONN;
-		gf_log (this->name, GF_LOG_ERROR,
-			"no child is up :(");
-		goto out;
-	}
+int
+afr_stat_wind(call_frame_t *frame, xlator_t *this, int subvol)
+{
+    afr_private_t *priv = NULL;
+    afr_local_t *local = NULL;
 
-	local->cont.access.last_tried = call_child;
-	loc_copy (&local->loc, loc);
-	local->cont.access.mask       = mask;
+    priv = this->private;
+    local = frame->local;
 
-	STACK_WIND_COOKIE (frame, afr_access_cbk,
-			   (void *) (long) call_child,
-			   children[call_child], children[call_child]->fops->access,
-			   loc, mask);
+    if (subvol == -1) {
+        AFR_STACK_UNWIND(stat, frame, local->op_ret, local->op_errno, 0, 0);
+        return 0;
+    }
 
-	op_ret = 0;
-out:
-	if (op_ret == -1) {
-		AFR_STACK_UNWIND (frame, op_ret, op_errno);
-	}
-	return 0;
+    STACK_WIND_COOKIE(
+        frame, afr_stat_cbk, (void *)(long)subvol, priv->children[subvol],
+        priv->children[subvol]->fops->stat, &local->loc, local->xdata_req);
+    return 0;
 }
 
+int
+afr_stat(call_frame_t *frame, xlator_t *this, loc_t *loc, dict_t *xdata)
+{
+    afr_local_t *local = NULL;
+    int op_errno = 0;
 
-/* }}} */
+    local = AFR_FRAME_INIT(frame, op_errno);
+    if (!local)
+        goto out;
 
-/* {{{ stat */
+    local->op = GF_FOP_STAT;
+    loc_copy(&local->loc, loc);
+    if (xdata)
+        local->xdata_req = dict_ref(xdata);
 
-int32_t
-afr_stat_cbk (call_frame_t *frame, void *cookie,
-	      xlator_t *this, int32_t op_ret, int32_t op_errno,
-	      struct stat *buf)
-{
-	afr_private_t * priv     = NULL;
-	afr_local_t *   local    = NULL;
-	xlator_t **     children = NULL;
+    afr_read_txn(frame, this, loc->inode, afr_stat_wind, AFR_DATA_TRANSACTION);
 
-	int deitransform_child = -1;
+    return 0;
+out:
+    AFR_STACK_UNWIND(stat, frame, -1, op_errno, NULL, NULL);
 
-	int unwind     = 1;
-	int last_tried = -1;
-	int this_try = -1;
+    return 0;
+}
 
-	priv     = this->private;
-	children = priv->children;
+/* }}} */
 
-	deitransform_child = (long) cookie;
+/* {{{ fstat */
 
-	local = frame->local;
+int
+afr_fstat_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret,
+              int32_t op_errno, struct iatt *buf, dict_t *xdata)
+{
+    afr_local_t *local = NULL;
 
-	if (op_ret == -1) {
-	retry:
-		last_tried = local->cont.stat.last_tried;
+    local = frame->local;
 
-		if (all_tried (last_tried, priv->child_count)) {
-			goto out;
-		}
-		this_try = ++local->cont.stat.last_tried;
+    if (op_ret < 0) {
+        local->op_ret = op_ret;
+        local->op_errno = op_errno;
 
-		if (this_try == deitransform_child) {
-			goto retry;
-		}
+        afr_read_txn_continue(frame, this, (long)cookie);
+        return 0;
+    }
 
-		unwind = 0;
+    AFR_STACK_UNWIND(fstat, frame, op_ret, op_errno, buf, xdata);
 
-		STACK_WIND_COOKIE (frame, afr_stat_cbk,
-				   (void *) (long) deitransform_child,
-				   children[this_try], 
-				   children[this_try]->fops->stat,
-				   &local->loc);
-	}
+    return 0;
+}
 
-out:
-	if (unwind) {
-		if (op_ret != -1)
-			buf->st_ino = local->cont.stat.ino;
+int
+afr_fstat_wind(call_frame_t *frame, xlator_t *this, int subvol)
+{
+    afr_private_t *priv = NULL;
+    afr_local_t *local = NULL;
 
-		AFR_STACK_UNWIND (frame, op_ret, op_errno, buf);
-	}
+    priv = this->private;
+    local = frame->local;
 
-	return 0;
-}
+    if (subvol == -1) {
+        AFR_STACK_UNWIND(fstat, frame, local->op_ret, local->op_errno, 0, 0);
+        return 0;
+    }
 
+    STACK_WIND_COOKIE(
+        frame, afr_fstat_cbk, (void *)(long)subvol, priv->children[subvol],
+        priv->children[subvol]->fops->fstat, local->fd, local->xdata_req);
+    return 0;
+}
 
 int32_t
-afr_stat (call_frame_t *frame, xlator_t *this,
-	  loc_t *loc)
+afr_fstat(call_frame_t *frame, xlator_t *this, fd_t *fd, dict_t *xdata)
 {
-	afr_private_t * priv       = NULL;
-	afr_local_t   * local      = NULL;
-	xlator_t **     children   = NULL;
-
-	int             call_child = 0;
+    afr_local_t *local = NULL;
+    int op_errno = 0;
 
-	int32_t         op_ret     = -1;
-	int32_t         op_errno   = 0;
+    AFR_ERROR_OUT_IF_FDCTX_INVALID(fd, this, op_errno, out);
+    local = AFR_FRAME_INIT(frame, op_errno);
+    if (!local)
+        goto out;
 
-	VALIDATE_OR_GOTO (frame, out);
-	VALIDATE_OR_GOTO (this, out);
-	VALIDATE_OR_GOTO (this->private, out);
+    local->op = GF_FOP_FSTAT;
+    local->fd = fd_ref(fd);
+    if (xdata)
+        local->xdata_req = dict_ref(xdata);
 
-	priv     = this->private;
-	VALIDATE_OR_GOTO (priv->children, out);
+    afr_fix_open(fd, this);
 
-	children = priv->children;
+    afr_read_txn(frame, this, fd->inode, afr_fstat_wind, AFR_DATA_TRANSACTION);
 
-	ALLOC_OR_GOTO (local, afr_local_t, out);
-
-	frame->local = local;
+    return 0;
+out:
+    AFR_STACK_UNWIND(fstat, frame, -1, op_errno, NULL, NULL);
 
-	call_child = afr_deitransform (loc->inode->ino, priv->child_count);
-	loc_copy (&local->loc, loc);
+    return 0;
+}
 
-	/* 
-	   if stat fails from the deitranform'd child, we try
-	   all children starting with the first one
-	*/
-	local->cont.stat.last_tried = -1;
-	local->cont.stat.ino = loc->inode->ino;
+/* }}} */
 
-	STACK_WIND_COOKIE (frame, afr_stat_cbk, (void *) (long) call_child,
-			   children[call_child],
-			   children[call_child]->fops->stat,
-			   loc);
+/* {{{ readlink */
 
-	op_ret = 0;
-out:
-	if (op_ret == -1) {
-		AFR_STACK_UNWIND (frame, op_ret, op_errno, NULL);
-	}
+int
+afr_readlink_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+                 int32_t op_ret, int32_t op_errno, const char *buf,
+                 struct iatt *sbuf, dict_t *xdata)
+{
+    afr_local_t *local = NULL;
 
-	return 0;
-}
+    local = frame->local;
 
+    if (op_ret < 0) {
+        local->op_ret = -1;
+        local->op_errno = op_errno;
 
-/* }}} */
+        afr_read_txn_continue(frame, this, (long)cookie);
+        return 0;
+    }
 
-/* {{{ fstat */
+    AFR_STACK_UNWIND(readlink, frame, op_ret, op_errno, buf, sbuf, xdata);
+    return 0;
+}
 
-int32_t
-afr_fstat_cbk (call_frame_t *frame, void *cookie,
-	       xlator_t *this, int32_t op_ret, int32_t op_errno,
-	       struct stat *buf)
+int
+afr_readlink_wind(call_frame_t *frame, xlator_t *this, int subvol)
 {
-	afr_private_t * priv     = NULL;
-	afr_local_t *   local    = NULL;
-	xlator_t **     children = NULL;
+    afr_local_t *local = NULL;
+    afr_private_t *priv = NULL;
+
+    local = frame->local;
+    priv = this->private;
+
+    if (subvol == -1) {
+        AFR_STACK_UNWIND(readlink, frame, local->op_ret, local->op_errno, 0, 0,
+                         0);
+        return 0;
+    }
+
+    STACK_WIND_COOKIE(frame, afr_readlink_cbk, (void *)(long)subvol,
+                      priv->children[subvol],
+                      priv->children[subvol]->fops->readlink, &local->loc,
+                      local->cont.readlink.size, local->xdata_req);
+    return 0;
+}
 
-	int deitransform_child = -1;
+int
+afr_readlink(call_frame_t *frame, xlator_t *this, loc_t *loc, size_t size,
+             dict_t *xdata)
+{
+    afr_local_t *local = NULL;
+    int32_t op_errno = 0;
 
-	int unwind     = 1;
-	int last_tried = -1;
-	int this_try = -1;
+    local = AFR_FRAME_INIT(frame, op_errno);
+    if (!local)
+        goto out;
 
-	priv     = this->private;
-	children = priv->children;
+    local->op = GF_FOP_READLINK;
+    loc_copy(&local->loc, loc);
+    local->cont.readlink.size = size;
+    if (xdata)
+        local->xdata_req = dict_ref(xdata);
 
-	deitransform_child = (long) cookie;
+    afr_read_txn(frame, this, loc->inode, afr_readlink_wind,
+                 AFR_DATA_TRANSACTION);
 
-	local = frame->local;
+    return 0;
+out:
+    AFR_STACK_UNWIND(readlink, frame, -1, op_errno, 0, 0, 0);
 
-	if (op_ret == -1) {
-	retry:
-		last_tried = local->cont.fstat.last_tried;
+    return 0;
+}
 
-		if (all_tried (last_tried, priv->child_count)) {
-			goto out;
-		}
-		this_try   = ++local->cont.fstat.last_tried;
+/* }}} */
 
-		if (this_try == deitransform_child) {
-			/* 
-			   skip the deitransform'd child since if we are here
-			   we must have already tried that child
-			*/
-			goto retry;
-		}
-	       
+/* {{{ getxattr */
 
-		unwind = 0;
+struct _xattr_key { /* one matched key collected by __gather_xattr_keys() */
+    char *key; /* the dict_foreach() key; not freed by afr_filter_xattrs() */
+    struct list_head list; /* links this node into the caller's key list */
+};
 
-		STACK_WIND_COOKIE (frame, afr_fstat_cbk,
-				   (void *) (long) deitransform_child,
-				   children[this_try], 
-				   children[this_try]->fops->fstat,
-				   local->fd);
-	}
+int
+__gather_xattr_keys(dict_t *dict, char *key, data_t *value, void *data)
+{
+    struct list_head *list = data;
+    struct _xattr_key *xkey = NULL;
 
-out:
-	if (unwind) {
-		if (op_ret != -1)
-			buf->st_ino = local->cont.fstat.ino;
+    if (!strncmp(key, AFR_XATTR_PREFIX, SLEN(AFR_XATTR_PREFIX))) {
+        xkey = GF_MALLOC(sizeof(*xkey), gf_afr_mt_xattr_key);
+        if (!xkey)
+            return -1;
 
-		AFR_STACK_UNWIND (frame, op_ret, op_errno, buf);
-	}
+        xkey->key = key;
+        INIT_LIST_HEAD(&xkey->list);
 
-	return 0;
+        list_add_tail(&xkey->list, list);
+    }
+    return 0;
 }
 
-
-int32_t
-afr_fstat (call_frame_t *frame, xlator_t *this,
-	   fd_t *fd)
+void
+afr_filter_xattrs(dict_t *dict)
 {
-	afr_private_t * priv       = NULL;
-	afr_local_t   * local      = NULL;
-	xlator_t **     children   = NULL;
+    struct list_head keys = {
+        0,
+    };
+    struct _xattr_key *key = NULL;
+    struct _xattr_key *tmp = NULL;
 
-	int             call_child = 0;
+    INIT_LIST_HEAD(&keys);
 
-	int32_t         op_ret     = -1;
-	int32_t         op_errno   = 0;
+    dict_foreach(dict, __gather_xattr_keys, (void *)&keys);
 
-	VALIDATE_OR_GOTO (frame, out);
-	VALIDATE_OR_GOTO (this, out);
-	VALIDATE_OR_GOTO (fd, out);
-	VALIDATE_OR_GOTO (this->private, out);
+    list_for_each_entry_safe(key, tmp, &keys, list)
+    {
+        dict_del(dict, key->key);
 
-	priv     = this->private;
-	VALIDATE_OR_GOTO (priv->children, out);
+        list_del_init(&key->list);
 
-	children = priv->children;
+        GF_FREE(key);
+    }
+}
 
-	ALLOC_OR_GOTO (local, afr_local_t, out);
+static gf_boolean_t
+afr_getxattr_ignorable_errnos(int32_t op_errno)
+{
+    if (op_errno == ENODATA || op_errno == ENOTSUP || op_errno == ERANGE ||
+        op_errno == ENAMETOOLONG)
+        return _gf_true;
 
-	frame->local = local;
+    return _gf_false;
+}
+int
+afr_getxattr_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+                 int32_t op_ret, int32_t op_errno, dict_t *dict, dict_t *xdata)
+{
+    afr_local_t *local = NULL;
 
-	VALIDATE_OR_GOTO (fd->inode, out);
+    local = frame->local;
 
-	call_child = afr_deitransform (fd->inode->ino, priv->child_count);
+    if (op_ret < 0 && !afr_getxattr_ignorable_errnos(op_errno)) {
+        local->op_ret = op_ret;
+        local->op_errno = op_errno;
 
-	/* 
-	   if fstat fails from the deitranform'd child, we try
-	   all children starting with the first one
-	*/
-	local->cont.fstat.last_tried = -1;
-	local->cont.fstat.ino = fd->inode->ino;
-	local->fd = fd_ref (fd);
+        afr_read_txn_continue(frame, this, (long)cookie);
+        return 0;
+    }
 
-	STACK_WIND_COOKIE (frame, afr_fstat_cbk, (void *) (long) call_child,
-			   children[call_child],
-			   children[call_child]->fops->fstat,
-			   fd);
+    if (dict)
+        afr_filter_xattrs(dict);
 
-	op_ret = 0;
-out:
-	if (op_ret == -1) {
-		AFR_STACK_UNWIND (frame, op_ret, op_errno, NULL);
-	}
+    AFR_STACK_UNWIND(getxattr, frame, op_ret, op_errno, dict, xdata);
 
-	return 0;
+    return 0;
 }
 
-/* }}} */
+int
+afr_getxattr_wind(call_frame_t *frame, xlator_t *this, int subvol)
+{
+    afr_local_t *local = NULL;
+    afr_private_t *priv = NULL;
+    /* Winds getxattr to child 'subvol'; subvol == -1 unwinds the saved error. */
+    local = frame->local;
+    priv = this->private;
+
+    if (subvol == -1) {
+        AFR_STACK_UNWIND(getxattr, frame, local->op_ret, local->op_errno, NULL,
+                         NULL);
+        return 0;
+    }
+    /* The child index is passed as the cookie handed to afr_getxattr_cbk(). */
+    STACK_WIND_COOKIE(frame, afr_getxattr_cbk, (void *)(long)subvol,
+                      priv->children[subvol],
+                      priv->children[subvol]->fops->getxattr, &local->loc,
+                      local->cont.getxattr.name, local->xdata_req);
+    return 0;
+}
 
-/* {{{ readlink */
+int32_t
+afr_getxattr_unwind(call_frame_t *frame, int op_ret, int op_errno, dict_t *dict,
+                    dict_t *xdata)
+/* Function form of AFR_STACK_UNWIND(getxattr, ...); always returns 0. */
+{
+    AFR_STACK_UNWIND(getxattr, frame, op_ret, op_errno, dict, xdata);
+    return 0;
+}
 
 int32_t
-afr_readlink_cbk (call_frame_t *frame, void *cookie,
-		  xlator_t *this, int32_t op_ret, int32_t op_errno,
-		  const char *buf)
+afr_fgetxattr_clrlk_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+                        int32_t op_ret, int32_t op_errno, dict_t *dict,
+                        dict_t *xdata)
 {
-	afr_private_t * priv     = NULL;
-	afr_local_t *   local    = NULL;
-	xlator_t **     children = NULL;
+    afr_local_t *local = NULL;
+    afr_private_t *priv = NULL;
+    xlator_t **children = NULL;
+    dict_t *xattr = NULL;
+    char *tmp_report = NULL;
+    char lk_summary[1024] = {
+        0,
+    };
+    int serz_len = 0;
+    int32_t callcnt = 0;
+    long int cky = 0;
+    int ret = 0;
+    int keylen = 0;
+    int children_keylen = 0;
+
+    priv = this->private;
+    children = priv->children;
+
+    local = frame->local;
+    cky = (long)cookie;
+    keylen = strlen(local->cont.getxattr.name);
+    children_keylen = strlen(children[cky]->name);
+
+    LOCK(&frame->lock);
+    {
+        callcnt = --local->call_count;
+        if (op_ret == -1)
+            local->replies[cky].op_errno = op_errno;
+
+        if (!local->dict)
+            local->dict = dict_new();
+        if (local->dict) {
+            ret = dict_get_strn(dict, local->cont.getxattr.name, keylen,
+                                &tmp_report);
+            if (ret)
+                goto unlock;
+            ret = dict_set_dynstrn(local->dict, children[cky]->name,
+                                   children_keylen, gf_strdup(tmp_report));
+            if (ret)
+                goto unlock;
+        }
+    }
+unlock:
+    UNLOCK(&frame->lock);
+
+    if (!callcnt) {
+        xattr = dict_new();
+        if (!xattr) {
+            op_ret = -1;
+            op_errno = ENOMEM;
+            goto unwind;
+        }
+        ret = dict_serialize_value_with_delim(local->dict, lk_summary,
+                                              &serz_len, '\n');
+        if (ret) {
+            op_ret = -1;
+            op_errno = ENOMEM;
+            goto unwind;
+        }
+        if (serz_len == -1)
+            snprintf(lk_summary, sizeof(lk_summary), "No locks cleared.");
+        ret = dict_set_dynstrn(xattr, local->cont.getxattr.name, keylen,
+                               gf_strdup(lk_summary));
+        if (ret) {
+            op_ret = -1;
+            op_errno = ENOMEM;
+            gf_msg(this->name, GF_LOG_ERROR, ENOMEM, AFR_MSG_DICT_SET_FAILED,
+                   "Error setting dictionary");
+            goto unwind;
+        }
+
+        op_errno = afr_final_errno(local, priv);
+
+    unwind:
+        AFR_STACK_UNWIND(fgetxattr, frame, op_ret, op_errno, xattr, xdata);
+        if (xattr)
+            dict_unref(xattr);
+    }
+
+    return ret;
+}
 
-	int unwind     = 1;
-	int last_tried = -1;
-	int this_try = -1;
+int32_t
+afr_getxattr_clrlk_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+                       int32_t op_ret, int32_t op_errno, dict_t *dict,
+                       dict_t *xdata)
+{
+    afr_local_t *local = NULL;
+    afr_private_t *priv = NULL;
+    xlator_t **children = NULL;
+    dict_t *xattr = NULL;
+    char *tmp_report = NULL;
+    char lk_summary[1024] = {
+        0,
+    }; /* NOTE(review): fixed size; not passed to the serializer below */
+    int serz_len = 0;
+    int32_t callcnt = 0;
+    long int cky = 0;
+    int ret = 0;
+    int keylen = 0;
+    int children_keylen = 0;
+    /* getxattr callback; 'cookie' carries the index of the replying child. */
+    priv = this->private;
+    children = priv->children;
+
+    local = frame->local;
+    cky = (long)cookie;
+
+    keylen = strlen(local->cont.getxattr.name);
+    children_keylen = strlen(children[cky]->name);
+    /* Under the frame lock, store this child's report keyed by child name. */
+    LOCK(&frame->lock);
+    {
+        callcnt = --local->call_count;
+        if (op_ret == -1)
+            local->replies[cky].op_errno = op_errno;
+        /* NOTE(review): neither 'dict' nor gf_strdup()'s result is checked. */
+        if (!local->dict)
+            local->dict = dict_new();
+        if (local->dict) {
+            ret = dict_get_strn(dict, local->cont.getxattr.name, keylen,
+                                &tmp_report);
+            if (ret)
+                goto unlock;
+            ret = dict_set_dynstrn(local->dict, children[cky]->name,
+                                   children_keylen, gf_strdup(tmp_report));
+            if (ret)
+                goto unlock;
+        }
+    }
+unlock:
+    UNLOCK(&frame->lock);
+    /* Once call_count hits zero: serialize the reports and unwind getxattr. */
+    if (!callcnt) {
+        xattr = dict_new();
+        if (!xattr) {
+            op_ret = -1;
+            op_errno = ENOMEM;
+            goto unwind;
+        }
+        ret = dict_serialize_value_with_delim(local->dict, lk_summary,
+                                              &serz_len, '\n');
+        if (ret) {
+            op_ret = -1;
+            op_errno = ENOMEM;
+            goto unwind;
+        }
+        if (serz_len == -1)
+            snprintf(lk_summary, sizeof(lk_summary), "No locks cleared.");
+        ret = dict_set_dynstrn(xattr, local->cont.getxattr.name, keylen,
+                               gf_strdup(lk_summary));
+        if (ret) {
+            op_ret = -1;
+            op_errno = ENOMEM;
+            gf_msg(this->name, GF_LOG_ERROR, ENOMEM, AFR_MSG_DICT_SET_FAILED,
+                   "Error setting dictionary");
+            goto unwind;
+        }
+
+        op_errno = afr_final_errno(local, priv);
+        /* NOTE(review): op_ret below is still that of the last reply to arrive. */
+    unwind:
+        AFR_STACK_UNWIND(getxattr, frame, op_ret, op_errno, xattr, xdata);
+
+        if (xattr)
+            dict_unref(xattr);
+    }
+
+    return ret;
+}
 
-	priv     = this->private;
-	children = priv->children;
+/**
+ * node-uuid cbk: a failed reply re-queries the next child; else it unwinds.
+ */
+int32_t
+afr_getxattr_node_uuid_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+                           int32_t op_ret, int32_t op_errno, dict_t *dict,
+                           dict_t *xdata)
+{
+    afr_private_t *priv = NULL;
+    afr_local_t *local = NULL;
+    xlator_t **children = NULL;
+    int unwind = 1;
+    int curr_call_child = 0;
+    /* 'cookie' is the index of the child that produced this reply. */
+    priv = this->private;
+    children = priv->children;
+
+    local = frame->local;
+
+    if (op_ret == -1) { /** query the _next_ child */
+
+        /**
+         * _current_ becomes _next_
+         * If done with all children and yet no success; give up !
+         */
+        curr_call_child = (int)((long)cookie);
+        if (++curr_call_child == priv->child_count)
+            goto unwind;
+
+        gf_msg_debug(this->name, op_errno,
+                     "op_ret (-1): Re-querying afr-child (%d/%d)",
+                     curr_call_child, priv->child_count);
+        /* Re-wind to the next child; the cookie carries its index forward. */
+        unwind = 0;
+        STACK_WIND_COOKIE(
+            frame, afr_getxattr_node_uuid_cbk, (void *)(long)curr_call_child,
+            children[curr_call_child],
+            children[curr_call_child]->fops->getxattr, &local->loc,
+            local->cont.getxattr.name, local->xdata_req);
+    }
+    /* Unwinds on success, or when the last child has also failed. */
+unwind:
+    if (unwind)
+        AFR_STACK_UNWIND(getxattr, frame, op_ret, op_errno, dict, xdata);
+
+    return 0;
+}
 
-	local = frame->local;
+/**
+ * list-node-uuids cbk: gathers all children's replies into one uuid list.
+ */
+int32_t
+afr_getxattr_list_node_uuids_cbk(call_frame_t *frame, void *cookie,
+                                 xlator_t *this, int32_t op_ret,
+                                 int32_t op_errno, dict_t *dict, dict_t *xdata)
+{
+    afr_local_t *local = NULL;
+    afr_private_t *priv = NULL;
+    int32_t callcnt = 0;
+    int ret = 0;
+    char *xattr_serz = NULL;
+    long cky = 0;
+    int32_t tlen = 0;
+
+    local = frame->local;
+    priv = this->private;
+    cky = (long)cookie;
+    /* Record this child's reply; the xattr dict is kept only on success. */
+    LOCK(&frame->lock);
+    {
+        callcnt = --local->call_count;
+        local->replies[cky].valid = 1;
+        local->replies[cky].op_ret = op_ret;
+        local->replies[cky].op_errno = op_errno;
+
+        if (op_ret < 0)
+            goto unlock;
+        /* One successful reply is enough to mark the overall result as 0. */
+        local->op_ret = 0;
+
+        if (!local->xdata_rsp && xdata)
+            local->xdata_rsp = dict_ref(xdata);
+        local->replies[cky].xattr = dict_ref(dict);
+    }
+
+unlock:
+    UNLOCK(&frame->lock);
+    /* Once call_count hits zero: build the combined list and unwind. */
+    if (!callcnt) {
+        if (local->op_ret != 0) {
+            /* All bricks gave an error. */
+            local->op_errno = afr_final_errno(local, priv);
+            goto unwind;
+        }
+
+        /* Down bricks and failed replies are stored as UUID0_STR, so size
+         * the buffer for priv->child_count uuids, SLEN(UUID0_STR) + 2 bytes
+         * each. */
+        local->cont.getxattr.xattr_len = (SLEN(UUID0_STR) + 2) *
+                                         priv->child_count;
+
+        if (!local->dict)
+            local->dict = dict_new();
+        if (!local->dict) {
+            local->op_ret = -1;
+            local->op_errno = ENOMEM;
+            goto unwind;
+        }
+
+        xattr_serz = GF_CALLOC(local->cont.getxattr.xattr_len, sizeof(char),
+                               gf_common_mt_char);
+
+        if (!xattr_serz) {
+            local->op_ret = -1;
+            local->op_errno = ENOMEM;
+            goto unwind;
+        }
+        /* ' '-separated; UUID0_STR is presumably the filler for bad replies. */
+        ret = afr_serialize_xattrs_with_delimiter(frame, this, xattr_serz,
+                                                  UUID0_STR, &tlen, ' ');
+        if (ret) {
+            local->op_ret = -1;
+            local->op_errno = ENOMEM;
+            GF_FREE(xattr_serz);
+            goto unwind;
+        }
+        ret = dict_set_dynstr_sizen(local->dict, GF_XATTR_LIST_NODE_UUIDS_KEY,
+                                    xattr_serz);
+        if (ret) {
+            gf_msg(this->name, GF_LOG_ERROR, -ret, AFR_MSG_DICT_SET_FAILED,
+                   "Cannot set node_uuid key in dict");
+            local->op_ret = -1;
+            local->op_errno = ENOMEM;
+            if (ret == -EINVAL)
+                GF_FREE(xattr_serz); /* presumably dict did not take it */
+        } else {
+            local->op_ret = local->cont.getxattr.xattr_len - 1;
+            local->op_errno = 0;
+        }
+        /* NOTE(review): 'tlen' is filled in above but never read afterwards. */
+    unwind:
+        AFR_STACK_UNWIND(getxattr, frame, local->op_ret, local->op_errno,
+                         local->dict, local->xdata_rsp);
+    }
+
+    return ret;
+}
 
-	if (op_ret == -1) {
-		last_tried = local->cont.readlink.last_tried;
+int32_t
+afr_getxattr_quota_size_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+                            int32_t op_ret, int32_t op_errno, dict_t *dict,
+                            dict_t *xdata)
+{
+    int idx = (long)cookie;
+    int call_count = 0;
+    afr_local_t *local = frame->local;
+    int read_subvol = -1;
+    /* Keep 'dict' in replies[].xdata: afr_handle_quota_size() reads it there. */
+    local->replies[idx].valid = 1;
+    local->replies[idx].op_ret = op_ret;
+    local->replies[idx].op_errno = op_errno;
+    if (dict)
+        local->replies[idx].xdata = dict_ref(dict);
+    call_count = afr_frame_return(frame);
+    if (call_count == 0) { /* presumably: no replies are still outstanding */
+        local->inode = inode_ref(local->loc.inode); /* used by the call below */
+        read_subvol = afr_handle_quota_size(frame, this);
+        if (read_subvol != -1) {
+            op_ret = local->replies[read_subvol].op_ret;
+            op_errno = local->replies[read_subvol].op_errno;
+            dict = local->replies[read_subvol].xdata;
+        }
+        AFR_STACK_UNWIND(getxattr, frame, op_ret, op_errno, dict, xdata);
+    }
+    /* With read_subvol == -1 the unwind above uses this call's own arguments. */
+    return 0;
+}
 
-		if (all_tried (last_tried, priv->child_count)) {
-			goto out;
-		}
-		this_try = ++local->cont.readlink.last_tried;
+/*
+ * getxattr callback for GF_XATTR_LOCKINFO_KEY, invoked once per child the
+ * request was wound to.
+ *
+ * Every successful reply's serialized lockinfo blob is unserialized and merged
+ * into local->dict; reply xdata is merged into local->xdata_rsp.  When the
+ * last reply arrives (call_count reaches 0) the merged dict is serialized
+ * again and unwound under GF_XATTR_LOCKINFO_KEY in a fresh dict.
+ *
+ * Any failure while building the final answer is unwound as op_ret == -1
+ * with a matching op_errno.  Always returns 0.
+ */
+int32_t
+afr_getxattr_lockinfo_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+                          int32_t op_ret, int32_t op_errno, dict_t *dict,
+                          dict_t *xdata)
+{
+    int call_cnt = 0, len = 0;
+    char *lockinfo_buf = NULL;
+    dict_t *lockinfo = NULL, *newdict = NULL;
+    afr_local_t *local = NULL;
+
+    LOCK(&frame->lock);
+    {
+        local = frame->local;
+
+        call_cnt = --local->call_count;
+
+        if ((op_ret < 0) || (!dict && !xdata)) {
+            goto unlock;
+        }
+
+        if (xdata) {
+            if (!local->xdata_rsp) {
+                local->xdata_rsp = dict_new();
+                if (!local->xdata_rsp) {
+                    local->op_ret = -1;
+                    local->op_errno = ENOMEM;
+                    goto unlock;
+                }
+            }
+        }
+
+        if (!dict) {
+            goto unlock;
+        }
+
+        op_ret = dict_get_ptr_and_len(dict, GF_XATTR_LOCKINFO_KEY,
+                                      (void **)&lockinfo_buf, &len);
+
+        if (!lockinfo_buf) {
+            goto unlock;
+        }
+
+        if (!local->dict) {
+            local->dict = dict_new();
+            if (!local->dict) {
+                local->op_ret = -1;
+                local->op_errno = ENOMEM;
+                goto unlock;
+            }
+        }
+    }
+unlock:
+    UNLOCK(&frame->lock);
+
+    /* Merge this child's lockinfo into the aggregate dict. */
+    if (lockinfo_buf != NULL) {
+        lockinfo = dict_new();
+        if (lockinfo == NULL) {
+            local->op_ret = -1;
+            local->op_errno = ENOMEM;
+        } else {
+            op_ret = dict_unserialize(lockinfo_buf, len, &lockinfo);
+
+            if (lockinfo && local->dict) {
+                dict_copy(lockinfo, local->dict);
+            }
+        }
+    }
+
+    if (xdata && local->xdata_rsp) {
+        dict_copy(xdata, local->xdata_rsp);
+    }
+
+    if (!call_cnt) {
+        /*
+         * Last reply: build the answer.  The unwind below uses the local
+         * op_ret/op_errno variables, so every failure must set them too,
+         * not only the copies kept in 'local'.
+         */
+        newdict = dict_new();
+        if (!newdict) {
+            local->op_ret = op_ret = -1;
+            local->op_errno = op_errno = ENOMEM;
+            goto unwind;
+        }
+
+        op_ret = dict_allocate_and_serialize(
+            local->dict, (char **)&lockinfo_buf, (unsigned int *)&len);
+        if (op_ret != 0) {
+            /* assumes a negative return is -errno, as the dict_set_dynptr()
+             * handling below already does -- TODO confirm */
+            op_errno = (op_ret < 0) ? -op_ret : ENOMEM;
+            op_ret = -1;
+            local->op_ret = -1;
+            local->op_errno = op_errno;
+            goto unwind;
+        }
+
+        op_ret = dict_set_dynptr(newdict, GF_XATTR_LOCKINFO_KEY,
+                                 (void *)lockinfo_buf, len);
+        if (op_ret < 0) {
+            op_errno = -op_ret;
+            op_ret = -1;
+            local->op_ret = -1;
+            local->op_errno = op_errno;
+            goto unwind;
+        }
+
+    unwind:
+        AFR_STACK_UNWIND(getxattr, frame, op_ret, op_errno, newdict,
+                         local->xdata_rsp);
+
+        /* Drop our reference, as the pathinfo callbacks do for the dict
+         * they unwind; 'local' must not be touched past this point. */
+        if (newdict)
+            dict_unref(newdict);
+    }
+
+    /* lockinfo is only allocated when this reply carried a lockinfo blob. */
+    if (lockinfo)
+        dict_unref(lockinfo);
+
+    return 0;
+}
 
-		unwind = 0;
-		STACK_WIND_COOKIE (frame, afr_readlink_cbk,
-				   (void *) (long) this_try,
-				   children[this_try], 
-				   children[this_try]->fops->readlink,
-				   &local->loc,
-				   local->cont.readlink.size);
-	}
+/*
+ * fgetxattr callback for GF_XATTR_LOCKINFO_KEY, invoked once per child the
+ * request was wound to.
+ *
+ * Every successful reply's serialized lockinfo blob is unserialized and merged
+ * into local->dict; reply xdata is merged into local->xdata_rsp.  When the
+ * last reply arrives (call_count reaches 0) the merged dict is serialized
+ * again and unwound under GF_XATTR_LOCKINFO_KEY in a fresh dict.
+ *
+ * Any failure while building the final answer is unwound as op_ret == -1
+ * with a matching op_errno.  Always returns 0.
+ */
+int32_t
+afr_fgetxattr_lockinfo_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+                           int32_t op_ret, int32_t op_errno, dict_t *dict,
+                           dict_t *xdata)
+{
+    int call_cnt = 0, len = 0;
+    char *lockinfo_buf = NULL;
+    dict_t *lockinfo = NULL, *newdict = NULL;
+    afr_local_t *local = NULL;
+
+    LOCK(&frame->lock);
+    {
+        local = frame->local;
+
+        call_cnt = --local->call_count;
+
+        if ((op_ret < 0) || (!dict && !xdata)) {
+            goto unlock;
+        }
+
+        if (xdata) {
+            if (!local->xdata_rsp) {
+                local->xdata_rsp = dict_new();
+                if (!local->xdata_rsp) {
+                    local->op_ret = -1;
+                    local->op_errno = ENOMEM;
+                    goto unlock;
+                }
+            }
+        }
+
+        if (!dict) {
+            goto unlock;
+        }
+
+        op_ret = dict_get_ptr_and_len(dict, GF_XATTR_LOCKINFO_KEY,
+                                      (void **)&lockinfo_buf, &len);
+
+        if (!lockinfo_buf) {
+            goto unlock;
+        }
+
+        if (!local->dict) {
+            local->dict = dict_new();
+            if (!local->dict) {
+                local->op_ret = -1;
+                local->op_errno = ENOMEM;
+                goto unlock;
+            }
+        }
+    }
+unlock:
+    UNLOCK(&frame->lock);
+
+    /* Merge this child's lockinfo into the aggregate dict. */
+    if (lockinfo_buf != NULL) {
+        lockinfo = dict_new();
+        if (lockinfo == NULL) {
+            local->op_ret = -1;
+            local->op_errno = ENOMEM;
+        } else {
+            op_ret = dict_unserialize(lockinfo_buf, len, &lockinfo);
+
+            if (lockinfo && local->dict) {
+                dict_copy(lockinfo, local->dict);
+            }
+        }
+    }
+
+    if (xdata && local->xdata_rsp) {
+        dict_copy(xdata, local->xdata_rsp);
+    }
+
+    if (!call_cnt) {
+        /*
+         * Last reply: build the answer.  The unwind below uses the local
+         * op_ret/op_errno variables, so every failure must set them too,
+         * not only the copies kept in 'local'.
+         */
+        newdict = dict_new();
+        if (!newdict) {
+            local->op_ret = op_ret = -1;
+            local->op_errno = op_errno = ENOMEM;
+            goto unwind;
+        }
+
+        op_ret = dict_allocate_and_serialize(
+            local->dict, (char **)&lockinfo_buf, (unsigned int *)&len);
+        if (op_ret != 0) {
+            /* assumes a negative return is -errno, as the dict_set_dynptr()
+             * handling below already does -- TODO confirm */
+            op_errno = (op_ret < 0) ? -op_ret : ENOMEM;
+            op_ret = -1;
+            local->op_ret = -1;
+            local->op_errno = op_errno;
+            goto unwind;
+        }
+
+        op_ret = dict_set_dynptr(newdict, GF_XATTR_LOCKINFO_KEY,
+                                 (void *)lockinfo_buf, len);
+        if (op_ret < 0) {
+            op_errno = -op_ret;
+            op_ret = -1;
+            local->op_ret = -1;
+            local->op_errno = op_errno;
+            goto unwind;
+        }
+
+    unwind:
+        AFR_STACK_UNWIND(fgetxattr, frame, op_ret, op_errno, newdict,
+                         local->xdata_rsp);
+
+        /* Drop our reference, as the pathinfo callbacks do for the dict
+         * they unwind; 'local' must not be touched past this point. */
+        if (newdict)
+            dict_unref(newdict);
+    }
+
+    /* lockinfo is only allocated when this reply carried a lockinfo blob. */
+    if (lockinfo)
+        dict_unref(lockinfo);
+
+    return 0;
+}
 
-out:
-	if (unwind) {
-		AFR_STACK_UNWIND (frame, op_ret, op_errno, buf);
-	}
+/*
+ * fgetxattr callback for the pathinfo keys, invoked once per child the
+ * request was wound to (cookie carries the child index).
+ *
+ * The value each successful reply holds for local->cont.getxattr.name is
+ * duplicated and stored in local->dict under "<name>-<child index>".  When the
+ * last reply arrives (call_count reaches 0) the collected values are
+ * serialized, space separated, into
+ *     "(<" AFR_PATHINFO_HEADER "<xlator name>> <values...>)"
+ * and unwound in a fresh dict under the requested key.
+ *
+ * Returns the result of the last dict operation performed (0 if none ran).
+ */
+int32_t
+afr_fgetxattr_pathinfo_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+                           int32_t op_ret, int32_t op_errno, dict_t *dict,
+                           dict_t *xdata)
+{
+    afr_local_t *local = NULL;
+    int32_t callcnt = 0;
+    int ret = 0;
+    char *xattr = NULL;
+    char *xattr_serz = NULL;
+    int keylen = 0;
+    char xattr_cky[1024] = {
+        0,
+    };
+    int xattr_cky_len = 0;
+    dict_t *nxattr = NULL;
+    long cky = 0;
+    int32_t padding = 0;
+    int32_t tlen = 0;
+
+    if (!frame || !frame->local || !this) {
+        gf_msg("", GF_LOG_ERROR, 0, AFR_MSG_INVALID_ARG, "possible NULL deref");
+        goto out;
+    }
+
+    local = frame->local;
+    cky = (long)cookie;
+    keylen = strlen(local->cont.getxattr.name);
+    /* per-child key: "<requested name>-<child index>" */
+    xattr_cky_len = snprintf(xattr_cky, sizeof(xattr_cky), "%s-%ld",
+                             local->cont.getxattr.name, cky);
+    LOCK(&frame->lock);
+    {
+        callcnt = --local->call_count;
+
+        if (op_ret < 0) {
+            local->op_errno = op_errno;
+        } else {
+            local->op_ret = op_ret;
+            /* only the first successful reply's xdata is kept */
+            if (!local->xdata_rsp && xdata)
+                local->xdata_rsp = dict_ref(xdata);
+        }
+
+        if (!dict || (op_ret < 0))
+            goto unlock;
+
+        if (!local->dict) {
+            local->dict = dict_new();
+            if (!local->dict)
+                goto unlock;
+        }
+        ret = dict_get_strn(dict, local->cont.getxattr.name, keylen, &xattr);
+        if (ret)
+            goto unlock;
+
+        /* NOTE(review): the gf_strdup() result is not checked for NULL
+         * before it is handed to dict_set_dynstrn(). */
+        xattr = gf_strdup(xattr);
+
+        ret = dict_set_dynstrn(local->dict, xattr_cky, xattr_cky_len, xattr);
+        if (ret) {
+            /* unlock here because this path skips the 'unlock' label */
+            UNLOCK(&frame->lock);
+            gf_msg(this->name, GF_LOG_ERROR, -ret, AFR_MSG_DICT_SET_FAILED,
+                   "Cannot set xattr cookie key");
+            goto post_unlock;
+        }
+
+        /* running total of value lengths, +1 per value */
+        local->cont.getxattr.xattr_len += strlen(xattr) + 1;
+    }
+unlock:
+    UNLOCK(&frame->lock);
+post_unlock:
+    if (!callcnt) {
+        /* no child supplied a value: unwind with a NULL dict */
+        if (!local->cont.getxattr.xattr_len)
+            goto unwind;
+
+        nxattr = dict_new();
+        if (!nxattr)
+            goto unwind;
+
+        /* extra bytes for decorations (brackets and <>'s) */
+        padding += strlen(this->name) + SLEN(AFR_PATHINFO_HEADER) + 4;
+        local->cont.getxattr.xattr_len += (padding + 2);
+
+        xattr_serz = GF_MALLOC(local->cont.getxattr.xattr_len,
+                               gf_common_mt_char);
+
+        if (!xattr_serz)
+            goto unwind;
+
+        /* the xlator info */
+        int xattr_serz_len = sprintf(
+            xattr_serz, "(<" AFR_PATHINFO_HEADER "%s> ", this->name);
+
+        /* actual series of pathinfo */
+        ret = dict_serialize_value_with_delim(
+            local->dict, xattr_serz + xattr_serz_len, &tlen, ' ');
+        if (ret) {
+            GF_FREE(xattr_serz);
+            goto unwind;
+        }
+
+        /* closing part */
+        *(xattr_serz + padding + tlen) = ')';
+        *(xattr_serz + padding + tlen + 1) = '\0';
+
+        ret = dict_set_dynstrn(nxattr, local->cont.getxattr.name, keylen,
+                               xattr_serz);
+        if (ret) {
+            gf_msg(this->name, GF_LOG_ERROR, -ret, AFR_MSG_DICT_SET_FAILED,
+                   "Cannot set pathinfo key in dict");
+            /* presumably -EINVAL means the dict never took ownership of
+             * the buffer -- verify against dict_set_dynstrn() */
+            if (ret == -EINVAL)
+                GF_FREE(xattr_serz);
+        }
+
+    unwind:
+        AFR_STACK_UNWIND(fgetxattr, frame, local->op_ret, local->op_errno,
+                         nxattr, local->xdata_rsp);
+
+        if (nxattr)
+            dict_unref(nxattr);
+    }
 
-	return 0;
+out:
+    return ret;
 }
 
-
+/*
+ * getxattr callback for the pathinfo keys, invoked once per child the
+ * request was wound to (cookie carries the child index).
+ *
+ * The value each successful reply holds for local->cont.getxattr.name is
+ * duplicated and stored in local->dict under "<name>-<child index>".  When the
+ * last reply arrives (call_count reaches 0) the collected values are
+ * serialized, space separated, into
+ *     "(<" AFR_PATHINFO_HEADER "<xlator name>> <values...>)"
+ * and unwound in a fresh dict under the requested key.
+ *
+ * Returns the result of the last dict operation performed (0 if none ran).
+ */
 int32_t
-afr_readlink (call_frame_t *frame, xlator_t *this,
-	      loc_t *loc, size_t size)
+afr_getxattr_pathinfo_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+                          int32_t op_ret, int32_t op_errno, dict_t *dict,
+                          dict_t *xdata)
 {
-	afr_private_t * priv       = NULL;
-	xlator_t **     children   = NULL;
-	int             call_child = 0;
-	afr_local_t     *local     = NULL;
+    afr_local_t *local = NULL;
+    int32_t callcnt = 0;
+    int ret = 0;
+    char *xattr = NULL;
+    char *xattr_serz = NULL;
+    char xattr_cky[1024] = {
+        0,
+    };
+    int keylen = 0;
+    int xattr_cky_len = 0;
+    dict_t *nxattr = NULL;
+    long cky = 0;
+    int32_t padding = 0;
+    int32_t tlen = 0;
+
+    if (!frame || !frame->local || !this) {
+        gf_msg("", GF_LOG_ERROR, 0, AFR_MSG_INVALID_ARG, "possible NULL deref");
+        goto out;
+    }
+
+    local = frame->local;
+    cky = (long)cookie;
+    keylen = strlen(local->cont.getxattr.name);
+    /* per-child key: "<requested name>-<child index>" */
+    xattr_cky_len = snprintf(xattr_cky, sizeof(xattr_cky), "%s-%ld",
+                             local->cont.getxattr.name, cky);
+    LOCK(&frame->lock);
+    {
+        callcnt = --local->call_count;
+
+        if (op_ret < 0) {
+            local->op_errno = op_errno;
+        } else {
+            local->op_ret = op_ret;
+            /* only the first successful reply's xdata is kept */
+            if (!local->xdata_rsp && xdata)
+                local->xdata_rsp = dict_ref(xdata);
+        }
+
+        if (!dict || (op_ret < 0))
+            goto unlock;
+
+        if (!local->dict) {
+            local->dict = dict_new();
+            if (!local->dict)
+                goto unlock;
+        }
+        ret = dict_get_strn(dict, local->cont.getxattr.name, keylen, &xattr);
+        if (ret)
+            goto unlock;
+
+        /* NOTE(review): the gf_strdup() result is not checked for NULL
+         * before it is handed to dict_set_dynstrn(). */
+        xattr = gf_strdup(xattr);
+
+        ret = dict_set_dynstrn(local->dict, xattr_cky, xattr_cky_len, xattr);
+        if (ret) {
+            /* unlock here because this path skips the 'unlock' label */
+            UNLOCK(&frame->lock);
+            gf_msg(this->name, GF_LOG_ERROR, -ret, AFR_MSG_DICT_SET_FAILED,
+                   "Cannot set xattr cookie key");
+            goto post_unlock;
+        }
+
+        /* running total of value lengths, +1 per value */
+        local->cont.getxattr.xattr_len += strlen(xattr) + 1;
+    }
+unlock:
+    UNLOCK(&frame->lock);
+post_unlock:
+    if (!callcnt) {
+        /* no child supplied a value: unwind with a NULL dict */
+        if (!local->cont.getxattr.xattr_len)
+            goto unwind;
+
+        nxattr = dict_new();
+        if (!nxattr)
+            goto unwind;
+
+        /* extra bytes for decorations (brackets and <>'s) */
+        padding += strlen(this->name) + SLEN(AFR_PATHINFO_HEADER) + 4;
+        local->cont.getxattr.xattr_len += (padding + 2);
+
+        xattr_serz = GF_MALLOC(local->cont.getxattr.xattr_len,
+                               gf_common_mt_char);
+
+        if (!xattr_serz)
+            goto unwind;
+
+        /* the xlator info */
+        int xattr_serz_len = sprintf(
+            xattr_serz, "(<" AFR_PATHINFO_HEADER "%s> ", this->name);
+
+        /* actual series of pathinfo */
+        ret = dict_serialize_value_with_delim(
+            local->dict, xattr_serz + xattr_serz_len, &tlen, ' ');
+        if (ret) {
+            GF_FREE(xattr_serz);
+            goto unwind;
+        }
+
+        /* closing part */
+        *(xattr_serz + padding + tlen) = ')';
+        *(xattr_serz + padding + tlen + 1) = '\0';
+
+        ret = dict_set_dynstrn(nxattr, local->cont.getxattr.name, keylen,
+                               xattr_serz);
+        if (ret) {
+            gf_msg(this->name, GF_LOG_ERROR, -ret, AFR_MSG_DICT_SET_FAILED,
+                   "Cannot set pathinfo key in dict");
+            /* presumably -EINVAL means the dict never took ownership of
+             * the buffer -- verify against dict_set_dynstrn() */
+            if (ret == -EINVAL)
+                GF_FREE(xattr_serz);
+        }
+
+    unwind:
+        AFR_STACK_UNWIND(getxattr, frame, local->op_ret, local->op_errno,
+                         nxattr, local->xdata_rsp);
+
+        if (nxattr)
+            dict_unref(nxattr);
+    }
 
-	int32_t op_ret   = -1;
-	int32_t op_errno = 0;
+out:
+    return ret;
+}
 
-	VALIDATE_OR_GOTO (frame, out);
-	VALIDATE_OR_GOTO (this, out);
-	VALIDATE_OR_GOTO (this->private, out);
+/*
+ * dict_foreach() callback used by afr_common_getxattr_stime_cbk().
+ *
+ * For keys matching GF_XATTR_STIME_PATTERN it calls gf_get_max_stime() with
+ * 'data' (the aggregate dict passed by the caller), the key and the value,
+ * and returns its result.  Other keys are ignored and 0 is returned.
+ */
+static int
+afr_aggregate_stime_xattr(dict_t *this, char *key, data_t *value, void *data)
+{
+    int ret = 0;
 
-	priv     = this->private;
-	VALIDATE_OR_GOTO (priv->children, out);
+    if (fnmatch(GF_XATTR_STIME_PATTERN, key, FNM_NOESCAPE) == 0)
+        ret = gf_get_max_stime(THIS, data, key, value);
 
-	children = priv->children;
+    return ret;
+}
 
-	ALLOC_OR_GOTO (local, afr_local_t, out);
+/*
+ * getxattr callback for stime keys (GF_XATTR_STIME_PATTERN), invoked once per
+ * child the request was wound to.
+ *
+ * The first successful reply's dict becomes local->dict (via
+ * dict_copy_with_ref); each later successful reply is folded into it through
+ * afr_aggregate_stime_xattr().  A failed or dict-less reply only records
+ * op_errno.  When the last reply arrives the aggregate is unwound together
+ * with the xdata of the reply being processed.
+ *
+ * Always returns 0.
+ */
+int32_t
+afr_common_getxattr_stime_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+                              int32_t op_ret, int32_t op_errno, dict_t *dict,
+                              dict_t *xdata)
+{
+    afr_local_t *local = NULL;
+    int32_t callcnt = 0;
 
-	frame->local = local;
+    if (!frame || !frame->local || !this) {
+        gf_msg("", GF_LOG_ERROR, 0, AFR_MSG_INVALID_ARG, "possible NULL deref");
+        goto out;
+    }
 
-	call_child = afr_first_up_child (priv);
-	if (call_child == -1) {
-		op_errno = ENOTCONN;
-		gf_log (this->name, GF_LOG_ERROR,
-			"no child is up :(");
-		goto out;
-	}
+    local = frame->local;
 
-	local->cont.readlink.last_tried = call_child;
-	loc_copy (&local->loc, loc);
-	local->cont.readlink.size       = size;
+    LOCK(&frame->lock);
+    {
+        callcnt = --local->call_count;
 
-	STACK_WIND_COOKIE (frame, afr_readlink_cbk,
-			   (void *) (long) call_child,
-			   children[call_child], children[call_child]->fops->readlink,
-			   loc, size);
+        if (!dict || (op_ret < 0)) {
+            local->op_errno = op_errno;
+            goto cleanup;
+        }
 
-	op_ret = 0;
-out:
-	if (op_ret == -1) {
-		AFR_STACK_UNWIND (frame, op_ret, op_errno, NULL);
-	}
-	return 0;
-}
+        if (!local->dict)
+            local->dict = dict_copy_with_ref(dict, NULL);
+        else
+            dict_foreach(dict, afr_aggregate_stime_xattr, local->dict);
+        /* one usable reply is enough for the fop to succeed */
+        local->op_ret = 0;
+    }
 
+cleanup:
+    UNLOCK(&frame->lock);
 
-/* }}} */
+    if (!callcnt) {
+        AFR_STACK_UNWIND(getxattr, frame, local->op_ret, local->op_errno,
+                         local->dict, xdata);
+    }
 
-/* {{{ getxattr */
+out:
+    return 0;
+}
 
-int32_t
-afr_getxattr_cbk (call_frame_t *frame, void *cookie,
-		  xlator_t *this, int32_t op_ret, int32_t op_errno,
-		  dict_t *dict)
+/*
+ * Decide whether 'name' is one of the xattrs that afr_getxattr() and
+ * afr_fgetxattr() wind to all up children and, if so, store the matching
+ * callback in *cbk.
+ *
+ * Matching rules: pathinfo keys, QUOTA_SIZE_KEY and
+ * GF_XATTR_LIST_NODE_UUIDS_KEY compare exactly; clrlk and lockinfo match by
+ * prefix; stime keys match GF_XATTR_STIME_PATTERN.
+ *
+ * Returns _gf_true when *cbk was set, _gf_false otherwise (including when
+ * 'cbk' or 'name' is NULL).
+ *
+ * NOTE(review): for the stime, quota-size and list-node-uuids keys the same
+ * callback is chosen regardless of is_fgetxattr; the stime and quota-size
+ * callbacks in this file unwind as getxattr -- confirm this is intended for
+ * the fgetxattr path.
+ */
+static gf_boolean_t
+afr_is_special_xattr(const char *name, fop_getxattr_cbk_t *cbk,
+                     gf_boolean_t is_fgetxattr)
 {
-	afr_private_t * priv     = NULL;
-	afr_local_t *   local    = NULL;
-	xlator_t **     children = NULL;
+    gf_boolean_t is_spl = _gf_true;
+
+    GF_ASSERT(cbk);
+    if (!cbk || !name) {
+        is_spl = _gf_false;
+        goto out;
+    }
+
+    if (!strcmp(name, GF_XATTR_PATHINFO_KEY) ||
+        !strcmp(name, GF_XATTR_USER_PATHINFO_KEY)) {
+        if (is_fgetxattr) {
+            *cbk = afr_fgetxattr_pathinfo_cbk;
+        } else {
+            *cbk = afr_getxattr_pathinfo_cbk;
+        }
+    } else if (!strncmp(name, GF_XATTR_CLRLK_CMD, SLEN(GF_XATTR_CLRLK_CMD))) {
+        if (is_fgetxattr) {
+            *cbk = afr_fgetxattr_clrlk_cbk;
+        } else {
+            *cbk = afr_getxattr_clrlk_cbk;
+        }
+    } else if (!strncmp(name, GF_XATTR_LOCKINFO_KEY,
+                        SLEN(GF_XATTR_LOCKINFO_KEY))) {
+        if (is_fgetxattr) {
+            *cbk = afr_fgetxattr_lockinfo_cbk;
+        } else {
+            *cbk = afr_getxattr_lockinfo_cbk;
+        }
+    } else if (fnmatch(GF_XATTR_STIME_PATTERN, name, FNM_NOESCAPE) == 0) {
+        *cbk = afr_common_getxattr_stime_cbk;
+    } else if (strcmp(name, QUOTA_SIZE_KEY) == 0) {
+        *cbk = afr_getxattr_quota_size_cbk;
+    } else if (!strcmp(name, GF_XATTR_LIST_NODE_UUIDS_KEY)) {
+        *cbk = afr_getxattr_list_node_uuids_cbk;
+    } else {
+        is_spl = _gf_false;
+    }
 
-	int unwind     = 1;
-	int last_tried = -1;
-	int this_try = -1;
+out:
+    return is_spl;
+}
 
-	priv     = this->private;
-	children = priv->children;
+/*
+ * Wind a getxattr for local->cont.getxattr.name to every child marked up in
+ * local->child_up[], using 'cbk' as the callback and the child index as the
+ * cookie.
+ *
+ * For GF_XATTR_LIST_NODE_UUIDS_KEY the name sent to the children is replaced
+ * by GF_XATTR_NODE_UUID_KEY.  If that replacement cannot be allocated the
+ * fop is unwound with ENOMEM and nothing is wound.
+ */
+static void
+afr_getxattr_all_subvols(xlator_t *this, call_frame_t *frame, const char *name,
+                         loc_t *loc, fop_getxattr_cbk_t cbk)
+{
+    afr_private_t *priv = NULL;
+    afr_local_t *local = NULL;
+    char *node_uuid_key = NULL;
+    int i = 0;
+    int call_count = 0;
+
+    priv = this->private;
+
+    local = frame->local;
+    // local->call_count set in afr_local_init
+    call_count = local->call_count;
+
+    if (!strcmp(name, GF_XATTR_LIST_NODE_UUIDS_KEY)) {
+        /* Allocate the replacement first so that a failure neither loses
+         * the original name nor winds the request with a NULL name. */
+        node_uuid_key = gf_strdup(GF_XATTR_NODE_UUID_KEY);
+        if (!node_uuid_key) {
+            AFR_STACK_UNWIND(getxattr, frame, -1, ENOMEM, NULL, NULL);
+            return;
+        }
+        GF_FREE(local->cont.getxattr.name);
+        local->cont.getxattr.name = node_uuid_key;
+    }
+
+    // If up-children count is 0, afr_local_init would have failed already
+    // and the call would have unwound so not handling it here.
+    for (i = 0; i < priv->child_count; i++) {
+        if (local->child_up[i]) {
+            STACK_WIND_COOKIE(frame, cbk, (void *)(long)i, priv->children[i],
+                              priv->children[i]->fops->getxattr, loc,
+                              local->cont.getxattr.name, NULL);
+            /* stop as soon as the expected number of winds was issued */
+            if (!--call_count)
+                break;
+        }
+    }
+    return;
+}
 
-	local = frame->local;
+/*
+ * Argument-population hook passed to cluster_handle_marker_getxattr() by
+ * afr_getxattr().
+ *
+ * Copies all of this xlator's children into 'subvols' and, for
+ * MARKER_XTIME_TYPE, zeroes the MCNT_NOTFOUND and MCNT_ENOTCONN gauge slots.
+ * Returns the number of children copied.
+ */
+int
+afr_marker_populate_args(call_frame_t *frame, int type, int *gauge,
+                         xlator_t **subvols)
+{
+    xlator_t *this = frame->this;
+    afr_private_t *priv = this->private;
 
-	if (op_ret == -1) {
-		last_tried = local->cont.getxattr.last_tried;
+    memcpy(subvols, priv->children, sizeof(*subvols) * priv->child_count);
 
-		if (all_tried (last_tried, priv->child_count)) {
-			goto out;
-		}
-		this_try = ++local->cont.getxattr.last_tried;
+    if (type == MARKER_XTIME_TYPE) {
+        /* Don't error out on ENOENT/ENOTCONN */
+        gauge[MCNT_NOTFOUND] = 0;
+        gauge[MCNT_ENOTCONN] = 0;
+    }
+    return priv->child_count;
+}
 
-		unwind = 0;
-		STACK_WIND_COOKIE (frame, afr_getxattr_cbk,
-				   (void *) (long) this_try,
-				   children[this_try], 
-				   children[this_try]->fops->getxattr,
-				   &local->loc,
-				   local->cont.getxattr.name);
-	}
+/*
+ * Handle the heal-related virtual xattrs (GF_HEAL_INFO, GF_AFR_HEAL_SBRAIN,
+ * GF_AFR_SBRAIN_STATUS) requested through getxattr.
+ *
+ * Returns 0 when 'heal_op' was one of those keys; the request has then been
+ * handed off, or unwound with ENOMEM if setting up the split-brain status
+ * task failed.  Returns -1 when 'heal_op' is not a heal xattr and the caller
+ * must keep processing it.
+ */
+static int
+afr_handle_heal_xattrs(call_frame_t *frame, xlator_t *this, loc_t *loc,
+                       const char *heal_op)
+{
+    int ret = -1;
+    afr_spb_status_t *data = NULL;
+
+    if (!strcmp(heal_op, GF_HEAL_INFO)) {
+        afr_get_heal_info(frame, this, loc);
+        ret = 0;
+        goto out;
+    }
+
+    if (!strcmp(heal_op, GF_AFR_HEAL_SBRAIN)) {
+        afr_heal_splitbrain_file(frame, this, loc);
+        ret = 0;
+        goto out;
+    }
+
+    if (!strcmp(heal_op, GF_AFR_SBRAIN_STATUS)) {
+        data = GF_CALLOC(1, sizeof(*data), gf_afr_mt_spb_status_t);
+        if (!data) {
+            /* ret == 1 marks "unwind with an error at 'out'" */
+            ret = 1;
+            goto out;
+        }
+        data->frame = frame;
+        data->loc = loc;
+        ret = synctask_new(this->ctx->env, afr_get_split_brain_status,
+                           afr_get_split_brain_status_cbk, NULL, data);
+        if (ret) {
+            gf_msg(this->name, GF_LOG_ERROR, 0, AFR_MSG_SPLIT_BRAIN_STATUS,
+                   "Failed to create"
+                   " synctask. Unable to fetch split-brain status"
+                   " for %s.",
+                   loc->name);
+            ret = 1;
+            goto out;
+        }
+        goto out;
+    }
 
 out:
-	if (unwind) {
-		AFR_STACK_UNWIND (frame, op_ret, op_errno, dict);
-	}
+    /* Both failure cases above are reported to the caller as ENOMEM. */
+    if (ret == 1) {
+        AFR_STACK_UNWIND(getxattr, frame, -1, ENOMEM, NULL, NULL);
+        if (data)
+            GF_FREE(data);
+        ret = 0;
+    }
+    return ret;
+}
 
-	return 0;
+/*
+ * getxattr fop entry point.
+ *
+ * A named request is tried against several handlers in this order, and the
+ * first one that accepts it ends the call:
+ *   1. names starting with AFR_XATTR_PREFIX are refused with ENODATA;
+ *   2. marker xattrs, via cluster_handle_marker_getxattr();
+ *   3. heal xattrs, via afr_handle_heal_xattrs();
+ *   4. IO_THREADS_QUEUE_SIZE_KEY, refused via GF_CHECK_XATTR_KEY_AND_GOTO;
+ *   5. xattrs recognised by afr_is_special_xattr(), wound to all up children;
+ *   6. node-uuid requests, wound to the first child only.
+ * Everything else, including a NULL name, goes through a metadata read
+ * transaction (afr_read_txn with afr_getxattr_wind).
+ *
+ * Always returns 0; failures are reported by unwinding with op_ret == -1.
+ */
+int32_t
+afr_getxattr(call_frame_t *frame, xlator_t *this, loc_t *loc, const char *name,
+             dict_t *xdata)
+{
+    afr_private_t *priv = NULL;
+    afr_local_t *local = NULL;
+    xlator_t **children = NULL;
+    int i = 0;
+    int32_t op_errno = 0;
+    int ret = -1;
+    fop_getxattr_cbk_t cbk = NULL;
+
+    local = AFR_FRAME_INIT(frame, op_errno);
+    if (!local)
+        goto out;
+
+    priv = this->private;
+
+    children = priv->children;
+
+    loc_copy(&local->loc, loc);
+
+    local->op = GF_FOP_GETXATTR;
+
+    if (xdata)
+        local->xdata_req = dict_ref(xdata);
+
+    if (!name)
+        goto no_name;
+
+    local->cont.getxattr.name = gf_strdup(name);
+
+    if (!local->cont.getxattr.name) {
+        op_errno = ENOMEM;
+        goto out;
+    }
+
+    if (!strncmp(name, AFR_XATTR_PREFIX, SLEN(AFR_XATTR_PREFIX))) {
+        op_errno = ENODATA;
+        goto out;
+    }
+
+    if (cluster_handle_marker_getxattr(frame, loc, name, priv->vol_uuid,
+                                       afr_getxattr_unwind,
+                                       afr_marker_populate_args) == 0)
+        return 0;
+
+    /* ret stays -1 when 'name' is not a heal xattr, which the 'out' path
+     * below relies on for the error jump that follows. */
+    ret = afr_handle_heal_xattrs(frame, this, &local->loc, name);
+    if (ret == 0)
+        return 0;
+
+    /*
+     * Heal daemons don't have IO threads ... and as a result they
+     * send this getxattr down and eventually crash :(
+     */
+    op_errno = -1;
+    GF_CHECK_XATTR_KEY_AND_GOTO(name, IO_THREADS_QUEUE_SIZE_KEY, op_errno, out);
+
+    /*
+     * Special xattrs which need responses from all subvols
+     */
+    if (afr_is_special_xattr(name, &cbk, 0)) {
+        afr_getxattr_all_subvols(this, frame, name, loc, cbk);
+        return 0;
+    }
+
+    if (XATTR_IS_NODE_UUID(name)) {
+        /* always starts with the first child */
+        i = 0;
+        STACK_WIND_COOKIE(frame, afr_getxattr_node_uuid_cbk, (void *)(long)i,
+                          children[i], children[i]->fops->getxattr, loc, name,
+                          xdata);
+        return 0;
+    }
+
+no_name:
+
+    afr_read_txn(frame, this, local->loc.inode, afr_getxattr_wind,
+                 AFR_METADATA_TRANSACTION);
+
+    ret = 0;
+out:
+    if (ret < 0)
+        AFR_STACK_UNWIND(getxattr, frame, -1, op_errno, NULL, NULL);
+    return 0;
 }
 
+/* {{{ fgetxattr */
 
+/*
+ * Callback for the fgetxattr wound by afr_fgetxattr_wind(); cookie is the
+ * index of the child that replied.
+ *
+ * On failure the error is recorded in local and the read transaction is
+ * continued through afr_read_txn_continue().  On success the reply dict is
+ * passed through afr_filter_xattrs() and unwound.  Always returns 0.
+ */
 int32_t
-afr_getxattr (call_frame_t *frame, xlator_t *this,
-	      loc_t *loc, const char *name)
+afr_fgetxattr_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+                  int32_t op_ret, int32_t op_errno, dict_t *dict, dict_t *xdata)
 {
-	afr_private_t *   priv       = NULL;
-	xlator_t **       children   = NULL;
-	int               call_child = 0;
-	afr_local_t     * local      = NULL;
+    afr_local_t *local = NULL;
 
-	int32_t op_ret   = -1;
-	int32_t op_errno = 0;
+    local = frame->local;
 
-	VALIDATE_OR_GOTO (frame, out);
-	VALIDATE_OR_GOTO (this, out);
-	VALIDATE_OR_GOTO (this->private, out);
+    if (op_ret < 0) {
+        local->op_ret = -1;
+        local->op_errno = op_errno;
 
-	priv     = this->private;
-	VALIDATE_OR_GOTO (priv->children, out);
+        /* presumably retries on another readable child -- verify against
+         * afr_read_txn_continue() */
+        afr_read_txn_continue(frame, this, (long)cookie);
+        return 0;
+    }
 
-	children = priv->children;
+    if (dict)
+        afr_filter_xattrs(dict);
 
-	ALLOC_OR_GOTO (local, afr_local_t, out);
-	frame->local = local;
+    AFR_STACK_UNWIND(fgetxattr, frame, op_ret, op_errno, dict, xdata);
 
-	call_child = afr_first_up_child (priv);
-	if (call_child == -1) {
-		op_errno = ENOTCONN;
-		gf_log (this->name, GF_LOG_ERROR,
-			"no child is up :(");
-		goto out;
-	}
+    return 0;
+}
 
-	local->cont.getxattr.last_tried = call_child;
-	loc_copy (&local->loc, loc);
-	if (name)
-	  local->cont.getxattr.name       = strdup (name);
+/*
+ * Wind callback handed to afr_read_txn() by afr_fgetxattr().
+ *
+ * With a valid 'subvol' the fgetxattr is wound to that child, using the
+ * child index as cookie.  With subvol == -1 the fop is unwound with the
+ * op_ret/op_errno stored in local.  Always returns 0.
+ */
+int
+afr_fgetxattr_wind(call_frame_t *frame, xlator_t *this, int subvol)
+{
+    afr_local_t *local = frame->local;
+    afr_private_t *priv = this->private;
+
+    if (subvol != -1) {
+        STACK_WIND_COOKIE(frame, afr_fgetxattr_cbk, (void *)(long)subvol,
+                          priv->children[subvol],
+                          priv->children[subvol]->fops->fgetxattr, local->fd,
+                          local->cont.getxattr.name, local->xdata_req);
+        return 0;
+    }
+
+    /* No child to read from: report the stored failure. */
+    AFR_STACK_UNWIND(fgetxattr, frame, local->op_ret, local->op_errno, NULL,
+                     NULL);
+    return 0;
+}
 
-	STACK_WIND_COOKIE (frame, afr_getxattr_cbk,
-			   (void *) (long) call_child,
-			   children[call_child], children[call_child]->fops->getxattr,
-			   loc, name);
+/*
+ * Wind an fgetxattr for local->cont.getxattr.name on local->fd to every
+ * child marked up in local->child_up[], with 'cbk' as the callback and the
+ * child index as cookie.
+ */
+static void
+afr_fgetxattr_all_subvols(xlator_t *this, call_frame_t *frame,
+                          fop_fgetxattr_cbk_t cbk)
+{
+    afr_private_t *priv = this->private;
+    afr_local_t *local = frame->local;
+    // local->call_count set in afr_local_init
+    int pending = local->call_count;
+    int i = 0;
+
+    // If up-children count is 0, afr_local_init would have failed already
+    // and the call would have unwound so not handling it here.
+
+    for (i = 0; i < priv->child_count; i++) {
+        if (!local->child_up[i])
+            continue;
+
+        STACK_WIND_COOKIE(frame, cbk, (void *)(long)i, priv->children[i],
+                          priv->children[i]->fops->fgetxattr, local->fd,
+                          local->cont.getxattr.name, NULL);
+
+        /* stop as soon as the expected number of winds was issued */
+        if (--pending == 0)
+            break;
+    }
+}
 
-	op_ret = 0;
+/*
+ * fgetxattr fop entry point.
+ *
+ * Names recognised by afr_is_special_xattr() are wound to all up children
+ * with the callback it selects; everything else, including a NULL name, goes
+ * through a metadata read transaction (afr_read_txn with afr_fgetxattr_wind).
+ *
+ * Always returns 0; setup failures are unwound with op_ret == -1.
+ */
+int
+afr_fgetxattr(call_frame_t *frame, xlator_t *this, fd_t *fd, const char *name,
+              dict_t *xdata)
+{
+    afr_local_t *local = NULL;
+    int32_t op_errno = 0;
+    fop_fgetxattr_cbk_t cbk = NULL;
+
+    AFR_ERROR_OUT_IF_FDCTX_INVALID(fd, this, op_errno, out);
+    local = AFR_FRAME_INIT(frame, op_errno);
+    if (!local)
+        goto out;
+
+    local->op = GF_FOP_FGETXATTR;
+    local->fd = fd_ref(fd);
+    if (name) {
+        local->cont.getxattr.name = gf_strdup(name);
+        if (!local->cont.getxattr.name) {
+            op_errno = ENOMEM;
+            goto out;
+        }
+    }
+    if (xdata)
+        local->xdata_req = dict_ref(xdata);
+
+    /* Keys that afr_is_special_xattr() recognises (pathinfo, clrlk,
+     * lockinfo, ...) need an answer from every up child, so they are
+     * wound to all of them with the callback it selects instead of
+     * going through a read transaction.
+     */
+    if (afr_is_special_xattr(name, &cbk, 1)) {
+        afr_fgetxattr_all_subvols(this, frame, cbk);
+        return 0;
+    }
+
+    afr_fix_open(fd, this);
+
+    afr_read_txn(frame, this, fd->inode, afr_fgetxattr_wind,
+                 AFR_METADATA_TRANSACTION);
+
+    return 0;
 out:
-	if (op_ret == -1) {
-		AFR_STACK_UNWIND (frame, op_ret, op_errno, NULL);
-	}
-	return 0;
-}
+    AFR_STACK_UNWIND(fgetxattr, frame, -1, op_errno, NULL, NULL);
 
+    return 0;
+}
 
 /* }}} */
 
 /* {{{ readv */
 
-/**
- * read algorithm:
- * 
- * if the user has specified a read subvolume, use it
- * otherwise -
- *   use the inode number to hash it to one of the subvolumes, and
- *   read from there (to balance read load)
- *
- * if any of the above read's fail, try the children in sequence
- * beginning at the beginning
- */
- 
-int32_t
-afr_readv_cbk (call_frame_t *frame, void *cookie,
-	       xlator_t *this, int32_t op_ret, int32_t op_errno,
-	       struct iovec *vector, int32_t count, struct stat *buf)
+/*
+ * Callback for the readv wound by afr_readv_wind(); cookie is the index of
+ * the child that replied.
+ *
+ * On failure the error is recorded in local and the read transaction is
+ * continued through afr_read_txn_continue(); on success the reply is unwound
+ * unchanged.  Always returns 0.
+ */
+int
+afr_readv_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret,
+              int32_t op_errno, struct iovec *vector, int32_t count,
+              struct iatt *buf, struct iobref *iobref, dict_t *xdata)
 {
-	afr_private_t * priv     = NULL;
-	afr_local_t *   local    = NULL;
-	xlator_t **     children = NULL;
+    afr_local_t *local = NULL;
 
-	int unwind     = 1;
-	int last_tried = -1;
-	int this_try = -1;
+    local = frame->local;
 
-	VALIDATE_OR_GOTO (frame, out);
-	VALIDATE_OR_GOTO (this, out);
-	VALIDATE_OR_GOTO (this->private, out);
+    if (op_ret < 0) {
+        local->op_ret = -1;
+        local->op_errno = op_errno;
 
-	priv     = this->private;
-	VALIDATE_OR_GOTO (priv->children, out);
+        /* presumably retries on another readable child -- verify against
+         * afr_read_txn_continue() */
+        afr_read_txn_continue(frame, this, (long)cookie);
+        return 0;
+    }
 
-	children = priv->children;
+    AFR_STACK_UNWIND(readv, frame, op_ret, op_errno, vector, count, buf, iobref,
+                     xdata);
+    return 0;
+}
 
-	local = frame->local;
+/*
+ * Wind callback handed to afr_read_txn() by afr_readv().
+ *
+ * With a valid 'subvol' the readv described by local->cont.readv is wound to
+ * that child, using the child index as cookie.  With subvol == -1 the fop is
+ * unwound with the op_ret/op_errno stored in local.  Always returns 0.
+ */
+int
+afr_readv_wind(call_frame_t *frame, xlator_t *this, int subvol)
+{
+    afr_local_t *local = frame->local;
+    afr_private_t *priv = this->private;
+
+    if (subvol != -1) {
+        STACK_WIND_COOKIE(
+            frame, afr_readv_cbk, (void *)(long)subvol, priv->children[subvol],
+            priv->children[subvol]->fops->readv, local->fd, local->cont.readv.size,
+            local->cont.readv.offset, local->cont.readv.flags, local->xdata_req);
+        return 0;
+    }
+
+    /* No child to read from: report the stored failure. */
+    AFR_STACK_UNWIND(readv, frame, local->op_ret, local->op_errno, 0, 0, 0,
+                     0, 0);
+    return 0;
+}
 
-	if (op_ret == -1) {
-	retry:
-		last_tried = local->cont.readv.last_tried;
+/*
+ * readv fop entry point.
+ *
+ * Saves the request (fd, size, offset, flags, xdata) in local and starts a
+ * data read transaction whose wind callback is afr_readv_wind().
+ *
+ * Always returns 0; setup failures are unwound with op_ret == -1.
+ */
+int
+afr_readv(call_frame_t *frame, xlator_t *this, fd_t *fd, size_t size,
+          off_t offset, uint32_t flags, dict_t *xdata)
+{
+    afr_local_t *local = NULL;
+    int32_t op_errno = 0;
 
-		if (all_tried (last_tried, priv->child_count)) {
-			goto out;
-		}
-		this_try = ++local->cont.readv.last_tried;
+    AFR_ERROR_OUT_IF_FDCTX_INVALID(fd, this, op_errno, out);
+    local = AFR_FRAME_INIT(frame, op_errno);
+    if (!local)
+        goto out;
 
-		if (this_try == priv->read_child) {
-			/* 
-			   skip the read child since if we are here
-			   we must have already tried that child
-			*/
-			goto retry;
-		}
+    local->op = GF_FOP_READ;
+    local->fd = fd_ref(fd);
+    local->cont.readv.size = size;
+    local->cont.readv.offset = offset;
+    local->cont.readv.flags = flags;
+    if (xdata)
+        local->xdata_req = dict_ref(xdata);
 
-		unwind = 0;
+    afr_fix_open(fd, this);
 
-		STACK_WIND_COOKIE (frame, afr_readv_cbk,
-				   (void *) (long) this_try,
-				   children[this_try], 
-				   children[this_try]->fops->readv,
-				   local->fd, local->cont.readv.size,
-				   local->cont.readv.offset);
-	}
+    afr_read_txn(frame, this, fd->inode, afr_readv_wind, AFR_DATA_TRANSACTION);
 
+    return 0;
 out:
-	if (unwind) {
-		AFR_STACK_UNWIND (frame, op_ret, op_errno, vector, count, buf);
-	}
+    AFR_STACK_UNWIND(readv, frame, -1, op_errno, 0, 0, 0, 0, 0);
 
-	return 0;
+    return 0;
 }
 
+/* }}} */
 
-int32_t
-afr_readv (call_frame_t *frame, xlator_t *this,
-	   fd_t *fd, size_t size, off_t offset)
-{
-	afr_private_t * priv       = NULL;
-	afr_local_t   * local      = NULL;
-	xlator_t **     children   = NULL;
-
-	int             call_child = 0;
+/* {{{ seek */
 
-	int32_t         op_ret     = -1;
-	int32_t         op_errno   = 0;
+/*
+ * Callback for the seek wound by afr_seek_wind(); cookie is the index of the
+ * child that replied.
+ *
+ * On failure the error is recorded in local and the read transaction is
+ * continued through afr_read_txn_continue(); on success the reply is unwound
+ * unchanged.  Always returns 0.
+ */
+int
+afr_seek_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret,
+             int32_t op_errno, off_t offset, dict_t *xdata)
+{
+    afr_local_t *local = NULL;
 
-	VALIDATE_OR_GOTO (frame, out);
-	VALIDATE_OR_GOTO (this, out);
-	VALIDATE_OR_GOTO (this->private, out);
-	VALIDATE_OR_GOTO (fd, out);
+    local = frame->local;
 
-	priv     = this->private;
-	children = priv->children;
+    if (op_ret < 0) {
+        local->op_ret = -1;
+        local->op_errno = op_errno;
 
-	ALLOC_OR_GOTO (local, afr_local_t, out);
+        /* presumably retries on another readable child -- verify against
+         * afr_read_txn_continue() */
+        afr_read_txn_continue(frame, this, (long)cookie);
+        return 0;
+    }
 
-	frame->local = local;
+    AFR_STACK_UNWIND(seek, frame, op_ret, op_errno, offset, xdata);
+    return 0;
+}
 
-	if (priv->read_child != -1) {
-		call_child = priv->read_child;
+/*
+ * Wind callback handed to afr_read_txn() by afr_seek().
+ *
+ * With a valid 'subvol' the seek described by local->cont.seek is wound to
+ * that child, using the child index as cookie.  With subvol == -1 the fop is
+ * unwound with the op_ret/op_errno stored in local.  Always returns 0.
+ */
+int
+afr_seek_wind(call_frame_t *frame, xlator_t *this, int subvol)
+{
+    afr_local_t *local = frame->local;
+    afr_private_t *priv = this->private;
+
+    if (subvol != -1) {
+        STACK_WIND_COOKIE(
+            frame, afr_seek_cbk, (void *)(long)subvol, priv->children[subvol],
+            priv->children[subvol]->fops->seek, local->fd, local->cont.seek.offset,
+            local->cont.seek.what, local->xdata_req);
+        return 0;
+    }
+
+    /* No child to read from: report the stored failure. */
+    AFR_STACK_UNWIND(seek, frame, local->op_ret, local->op_errno, 0, NULL);
+    return 0;
+}
 
-		/* 
-		   if read fails from the read child, we try
-		   all children starting with the first one
-		*/
-		local->cont.readv.last_tried = -1;
-	} else {
-		call_child = afr_first_up_child (priv);
-		if (call_child == -1) {
-			op_errno = ENOTCONN;
-			gf_log (this->name, GF_LOG_ERROR,
-				"no child is up :(");
-			goto out;
-		}
+/*
+ * seek fop entry point.
+ *
+ * Saves the request (fd, offset, what, xdata) in local and starts a data
+ * read transaction whose wind callback is afr_seek_wind().
+ *
+ * Always returns 0; setup failures are unwound with op_ret == -1.
+ */
+int
+afr_seek(call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset,
+         gf_seek_what_t what, dict_t *xdata)
+{
+    afr_local_t *local = NULL;
+    int32_t op_errno = 0;
 
-		local->cont.readv.last_tried = call_child;
-	}
+    AFR_ERROR_OUT_IF_FDCTX_INVALID(fd, this, op_errno, out);
+    local = AFR_FRAME_INIT(frame, op_errno);
+    if (!local)
+        goto out;
 
-	local->fd                    = fd_ref (fd);
+    local->op = GF_FOP_SEEK;
+    local->fd = fd_ref(fd);
+    local->cont.seek.offset = offset;
+    local->cont.seek.what = what;
+    if (xdata)
+        local->xdata_req = dict_ref(xdata);
 
-	local->cont.readv.size       = size;
-	local->cont.readv.offset     = offset;
+    afr_fix_open(fd, this);
 
-	STACK_WIND_COOKIE (frame, afr_readv_cbk,
-			   (void *) (long) call_child,
-			   children[call_child],
-			   children[call_child]->fops->readv,
-			   fd, size, offset);
+    afr_read_txn(frame, this, fd->inode, afr_seek_wind, AFR_DATA_TRANSACTION);
 
-	op_ret = 0;
+    return 0;
 out:
-	if (op_ret == -1) {
-		AFR_STACK_UNWIND (frame, op_ret, op_errno, NULL, 0, NULL);
-	}
-	return 0;
-}
+    AFR_STACK_UNWIND(seek, frame, -1, op_errno, 0, NULL);
 
+    return 0;
+}
 /* }}} */
diff --git a/xlators/cluster/afr/src/afr-inode-read.h b/xlators/cluster/afr/src/afr-inode-read.h
index 1127cb65276..8c982bc7e6f 100644
--- a/xlators/cluster/afr/src/afr-inode-read.h
+++ b/xlators/cluster/afr/src/afr-inode-read.h
@@ -1,47 +1,45 @@
 /*
-   Copyright (c) 2007-2009 Z RESEARCH, Inc. <http://www.zresearch.com>
-   This file is part of GlusterFS.
-
-   GlusterFS is free software; you can redistribute it and/or modify
-   it under the terms of the GNU General Public License as published
-   by the Free Software Foundation; either version 3 of the License,
-   or (at your option) any later version.
-
-   GlusterFS is distributed in the hope that it will be useful, but
-   WITHOUT ANY WARRANTY; without even the implied warranty of
-   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
-   General Public License for more details.
-
-   You should have received a copy of the GNU General Public License
-   along with this program.  If not, see
-   <http://www.gnu.org/licenses/>.
+  Copyright (c) 2008-2012 Red Hat, Inc. <http://www.redhat.com>
+  This file is part of GlusterFS.
+
+  This file is licensed to you under your choice of the GNU Lesser
+  General Public License, version 3 or any later version (LGPLv3 or
+  later), or the GNU General Public License, version 2 (GPLv2), in all
+  cases as published by the Free Software Foundation.
 */
 
 #ifndef __INODE_READ_H__
 #define __INODE_READ_H__
 
 int32_t
-afr_access (call_frame_t *frame, xlator_t *this,
-	    loc_t *loc, int32_t mask);
+afr_access(call_frame_t *frame, xlator_t *this, loc_t *loc, int32_t mask,
+           dict_t *xdata);
 
 int32_t
-afr_stat (call_frame_t *frame, xlator_t *this,
-	  loc_t *loc);
+afr_stat(call_frame_t *frame, xlator_t *this, loc_t *loc, dict_t *xdata);
 
 int32_t
-afr_fstat (call_frame_t *frame, xlator_t *this,
-	   fd_t *fd);
+afr_fstat(call_frame_t *frame, xlator_t *this, fd_t *fd, dict_t *xdata);
 
 int32_t
-afr_readlink (call_frame_t *frame, xlator_t *this,
-	      loc_t *loc, size_t size);
+afr_readlink(call_frame_t *frame, xlator_t *this, loc_t *loc, size_t size,
+             dict_t *xdata);
 
 int32_t
-afr_readv (call_frame_t *frame, xlator_t *this,
-	   fd_t *fd, size_t size, off_t offset);
+afr_readv(call_frame_t *frame, xlator_t *this, fd_t *fd, size_t size,
+          off_t offset, uint32_t flags, dict_t *xdata);
 
 int32_t
-afr_getxattr (call_frame_t *frame, xlator_t *this,
-	      loc_t *loc, const char *name);
+afr_getxattr(call_frame_t *frame, xlator_t *this, loc_t *loc, const char *name,
+             dict_t *xdata);
 
+int32_t
+afr_fgetxattr(call_frame_t *frame, xlator_t *this, fd_t *fd, const char *name,
+              dict_t *xdata);
+
+int
+afr_seek(call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset,
+         gf_seek_what_t what, dict_t *xdata);
+int
+afr_handle_quota_size(call_frame_t *frame, xlator_t *this);
 #endif /* __INODE_READ_H__ */
diff --git a/xlators/cluster/afr/src/afr-inode-write.c b/xlators/cluster/afr/src/afr-inode-write.c
index cf4d493a4f3..1d6e4f3570a 100644
--- a/xlators/cluster/afr/src/afr-inode-write.c
+++ b/xlators/cluster/afr/src/afr-inode-write.c
@@ -1,2024 +1,2565 @@
 /*
-   Copyright (c) 2007-2009 Z RESEARCH, Inc. <http://www.zresearch.com>
-   This file is part of GlusterFS.
-
-   GlusterFS is free software; you can redistribute it and/or modify
-   it under the terms of the GNU General Public License as published
-   by the Free Software Foundation; either version 3 of the License,
-   or (at your option) any later version.
-
-   GlusterFS is distributed in the hope that it will be useful, but
-   WITHOUT ANY WARRANTY; without even the implied warranty of
-   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
-   General Public License for more details.
-
-   You should have received a copy of the GNU General Public License
-   along with this program.  If not, see
-   <http://www.gnu.org/licenses/>.
-*/
+  Copyright (c) 2008-2012 Red Hat, Inc. <http://www.redhat.com>
+  This file is part of GlusterFS.
 
+  This file is licensed to you under your choice of the GNU Lesser
+  General Public License, version 3 or any later version (LGPLv3 or
+  later), or the GNU General Public License, version 2 (GPLv2), in all
+  cases as published by the Free Software Foundation.
+*/
 
-#include <libgen.h>
 #include <unistd.h>
-#include <fnmatch.h>
 #include <sys/time.h>
 #include <stdlib.h>
 #include <signal.h>
 
-#ifndef _CONFIG_H
-#define _CONFIG_H
-#include "config.h"
-#endif
-
-#include "glusterfs.h"
-#include "afr.h"
-#include "dict.h"
-#include "xlator.h"
-#include "hashfn.h"
-#include "logging.h"
-#include "stack.h"
-#include "list.h"
-#include "call-stub.h"
-#include "defaults.h"
-#include "common-utils.h"
-#include "compat-errno.h"
-#include "compat.h"
-
+#include <glusterfs/glusterfs.h>
 #include "afr.h"
+#include <glusterfs/dict.h>
+#include <glusterfs/logging.h>
+#include <glusterfs/defaults.h>
+#include <glusterfs/common-utils.h>
+#include <glusterfs/compat-errno.h>
+#include <glusterfs/compat.h>
+#include "protocol-common.h"
+#include <glusterfs/byte-order.h>
 #include "afr-transaction.h"
+#include "afr-self-heal.h"
+#include "afr-messages.h"
 
-
-/* {{{ chmod */
-
-
-int
-afr_chmod_unwind (call_frame_t *frame, xlator_t *this)
+static void
+__afr_inode_write_finalize(call_frame_t *frame, xlator_t *this)
 {
-	afr_local_t *   local = NULL;
-	afr_private_t * priv  = NULL;
-	call_frame_t   *main_frame = NULL;
-
-	local = frame->local;
-	priv  = this->private;
-
-	LOCK (&frame->lock);
-	{
-		if (local->transaction.main_frame)
-			main_frame = local->transaction.main_frame;
-		local->transaction.main_frame = NULL;
-	}
-	UNLOCK (&frame->lock);
-
-	if (main_frame) {
-		local->cont.chmod.buf.st_ino = local->cont.chmod.ino;
-		AFR_STACK_UNWIND (main_frame, local->op_ret, local->op_errno,
-				  &local->cont.chmod.buf);
-	}
-	return 0;
+    int i = 0;
+    int ret = 0;
+    int read_subvol = 0;
+    struct iatt *stbuf = NULL;
+    afr_local_t *local = NULL;
+    afr_private_t *priv = NULL;
+    afr_read_subvol_args_t args = {
+        0,
+    };
+
+    local = frame->local;
+    priv = this->private;
+    GF_VALIDATE_OR_GOTO(this->name, local->inode, out);
+
+    /* This code needs to stay until DHT sends fops on linked
+     * inodes. */
+    if (!inode_is_linked(local->inode)) {
+        for (i = 0; i < priv->child_count; i++) {
+            if (!local->replies[i].valid)
+                continue;
+            if (local->replies[i].op_ret == -1)
+                continue;
+            if (!gf_uuid_is_null(local->replies[i].poststat.ia_gfid)) {
+                gf_uuid_copy(args.gfid, local->replies[i].poststat.ia_gfid);
+                args.ia_type = local->replies[i].poststat.ia_type;
+                break;
+            } else {
+                ret = dict_get_bin(local->replies[i].xdata,
+                                   DHT_IATT_IN_XDATA_KEY, (void **)&stbuf);
+                if (ret)
+                    continue;
+                gf_uuid_copy(args.gfid, stbuf->ia_gfid);
+                args.ia_type = stbuf->ia_type;
+                break;
+            }
+        }
+    }
+
+    if (local->transaction.type == AFR_METADATA_TRANSACTION) {
+        read_subvol = afr_metadata_subvol_get(local->inode, this, NULL,
+                                              local->readable, NULL, &args);
+    } else {
+        read_subvol = afr_data_subvol_get(local->inode, this, NULL,
+                                          local->readable, NULL, &args);
+    }
+
+    local->op_ret = -1;
+    local->op_errno = afr_final_errno(local, priv);
+    afr_pick_error_xdata(local, priv, local->inode, local->readable, NULL,
+                         NULL);
+
+    for (i = 0; i < priv->child_count; i++) {
+        if (!local->replies[i].valid)
+            continue;
+        if (local->replies[i].op_ret < 0)
+            continue;
+
+        /* Order of checks in the compound conditional
+           below is important.
+
+           - Highest precedence: largest op_ret
+           - Next precedence: if all op_rets are equal, read subvol
+           - Least precedence: any succeeded subvol
+        */
+        if ((local->op_ret < local->replies[i].op_ret) ||
+            ((local->op_ret == local->replies[i].op_ret) &&
+             (i == read_subvol))) {
+            local->op_ret = local->replies[i].op_ret;
+            local->op_errno = local->replies[i].op_errno;
+
+            local->cont.inode_wfop.prebuf = local->replies[i].prestat;
+            local->cont.inode_wfop.postbuf = local->replies[i].poststat;
+
+            if (local->replies[i].xdata) {
+                if (local->xdata_rsp)
+                    dict_unref(local->xdata_rsp);
+                local->xdata_rsp = dict_ref(local->replies[i].xdata);
+            }
+            if (local->replies[i].xattr) {
+                if (local->xattr_rsp)
+                    dict_unref(local->xattr_rsp);
+                local->xattr_rsp = dict_ref(local->replies[i].xattr);
+            }
+        }
+    }
+
+    afr_set_in_flight_sb_status(this, frame, local->inode);
+out:
+    return;
 }
 
-
-int
-afr_chmod_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this, 
-		    int32_t op_ret, int32_t op_errno, struct stat *buf)
+static void
+__afr_inode_write_fill(call_frame_t *frame, xlator_t *this, int child_index,
+                       int op_ret, int op_errno, struct iatt *prebuf,
+                       struct iatt *postbuf, dict_t *xattr, dict_t *xdata)
 {
-	afr_local_t *   local = NULL;
-	afr_private_t * priv  = NULL;
-
-	int call_count  = -1;
-	int child_index = (long) cookie;
-	int need_unwind = 0;
-
-	local = frame->local;
-	priv  = this->private;
+    afr_local_t *local = NULL;
+    afr_private_t *priv = NULL;
+
+    local = frame->local;
+    priv = this->private;
+
+    local->replies[child_index].valid = 1;
+
+    if (AFR_IS_ARBITER_BRICK(priv, child_index) && op_ret == 1)
+        op_ret = iov_length(local->cont.writev.vector,
+                            local->cont.writev.count);
+
+    local->replies[child_index].op_ret = op_ret;
+    local->replies[child_index].op_errno = op_errno;
+    if (xdata)
+        local->replies[child_index].xdata = dict_ref(xdata);
+
+    if (op_ret >= 0) {
+        if (prebuf)
+            local->replies[child_index].prestat = *prebuf;
+        if (postbuf)
+            local->replies[child_index].poststat = *postbuf;
+        if (xattr)
+            local->replies[child_index].xattr = dict_ref(xattr);
+    } else {
+        afr_transaction_fop_failed(frame, this, child_index);
+    }
+
+    return;
+}
 
-	LOCK (&frame->lock);
-	{
-		if (child_went_down (op_ret, op_errno))
-			afr_transaction_child_died (frame, this, child_index);
+static int
+__afr_inode_write_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+                      int32_t op_ret, int32_t op_errno, struct iatt *prebuf,
+                      struct iatt *postbuf, dict_t *xattr, dict_t *xdata)
+{
+    afr_local_t *local = NULL;
+    int child_index = (long)cookie;
+    int call_count = -1;
+    afr_private_t *priv = NULL;
+
+    priv = this->private;
+    local = frame->local;
+
+    LOCK(&frame->lock);
+    {
+        __afr_inode_write_fill(frame, this, child_index, op_ret, op_errno,
+                               prebuf, postbuf, xattr, xdata);
+        call_count = --local->call_count;
+    }
+    UNLOCK(&frame->lock);
+
+    if (call_count == 0) {
+        __afr_inode_write_finalize(frame, this);
+
+        if (afr_txn_nothing_failed(frame, this)) {
+            /* If it did pre-op, it will also do post-op, changing ctime. */
+            if (priv->consistent_metadata && afr_needs_changelog_update(local))
+                afr_zero_fill_stat(local);
+            local->transaction.unwind(frame, this);
+        }
+
+        afr_transaction_resume(frame, this);
+    }
+
+    return 0;
+}
 
-		if (op_ret != -1) {
-			if (local->success_count == 0) {
-				local->op_ret = op_ret;
-				local->cont.chmod.buf = *buf;
-			}
-			local->success_count++;
+/* {{{ writev */
 
-			if (local->success_count == priv->wait_count) {
-				need_unwind = 1;
-			}
-		}
+void
+afr_writev_copy_outvars(call_frame_t *src_frame, call_frame_t *dst_frame)
+{
+    afr_local_t *src_local = NULL;
+    afr_local_t *dst_local = NULL;
+
+    src_local = src_frame->local;
+    dst_local = dst_frame->local;
+
+    dst_local->op_ret = src_local->op_ret;
+    dst_local->op_errno = src_local->op_errno;
+    dst_local->cont.inode_wfop.prebuf = src_local->cont.inode_wfop.prebuf;
+    dst_local->cont.inode_wfop.postbuf = src_local->cont.inode_wfop.postbuf;
+    if (src_local->xdata_rsp)
+        dst_local->xdata_rsp = dict_ref(src_local->xdata_rsp);
+}
 
-		local->op_errno = op_errno;
-	}
-	UNLOCK (&frame->lock);
+void
+afr_writev_unwind(call_frame_t *frame, xlator_t *this)
+{
+    afr_local_t *local = NULL;
+    afr_private_t *priv = this->private;
 
-	if (need_unwind)
-		afr_chmod_unwind (frame, this);
+    local = frame->local;
 
-	call_count = afr_frame_return (frame);
+    if (priv->consistent_metadata)
+        afr_zero_fill_stat(local);
 
-	if (call_count == 0) {
-		local->transaction.resume (frame, this);
-	}
-	
-	return 0;
+    AFR_STACK_UNWIND(writev, frame, local->op_ret, local->op_errno,
+                     &local->cont.inode_wfop.prebuf,
+                     &local->cont.inode_wfop.postbuf, local->xdata_rsp);
 }
 
-
 int
-afr_chmod_wind (call_frame_t *frame, xlator_t *this)
+afr_transaction_writev_unwind(call_frame_t *frame, xlator_t *this)
 {
-	afr_local_t *   local = NULL;
-	afr_private_t * priv  = NULL;
-	
-	int i = 0;
-	int call_count = -1;
-
-	local = frame->local;
-	priv  = this->private;
+    call_frame_t *fop_frame = NULL;
 
-	call_count = afr_up_children_count (priv->child_count, local->child_up);
+    fop_frame = afr_transaction_detach_fop_frame(frame);
 
-	if (call_count == 0) {
-		local->transaction.resume (frame, this);
-		return 0;
-	}
+    if (fop_frame) {
+        afr_writev_copy_outvars(frame, fop_frame);
+        afr_writev_unwind(fop_frame, this);
+    }
+    return 0;
+}
 
-	local->call_count = call_count;
+static void
+afr_writev_handle_short_writes(call_frame_t *frame, xlator_t *this)
+{
+    afr_local_t *local = NULL;
+    afr_private_t *priv = NULL;
+    int i = 0;
+
+    local = frame->local;
+    priv = this->private;
+    /*
+     * We already have the best case result of the writev calls staged
+     * as the return value. Any writev that returns some value less
+     * than the best case is now out of sync, so mark the fop as
+     * failed. Note that fops that have returned with errors have
+     * already been marked as failed.
+     */
+    for (i = 0; i < priv->child_count; i++) {
+        if ((!local->replies[i].valid) || (local->replies[i].op_ret == -1))
+            continue;
+
+        if (local->replies[i].op_ret < local->op_ret)
+            afr_transaction_fop_failed(frame, this, i);
+    }
+}
 
-	for (i = 0; i < priv->child_count; i++) {				
-		if (local->child_up[i]) {
-			STACK_WIND_COOKIE (frame, afr_chmod_wind_cbk, (void *) (long) i,
-					   priv->children[i], 
-					   priv->children[i]->fops->chmod,
-					   &local->loc, 
-					   local->cont.chmod.mode); 
-		
-			if (!--call_count)
-				break;
-		}
-	}
-	
-	return 0;
+void
+afr_inode_write_fill(call_frame_t *frame, xlator_t *this, int child_index,
+                     int32_t op_ret, int32_t op_errno, struct iatt *prebuf,
+                     struct iatt *postbuf, dict_t *xdata)
+{
+    int ret = 0;
+    afr_local_t *local = frame->local;
+    uint32_t open_fd_count = 0;
+    uint32_t write_is_append = 0;
+    int32_t num_inodelks = 0;
+
+    LOCK(&frame->lock);
+    {
+        __afr_inode_write_fill(frame, this, child_index, op_ret, op_errno,
+                               prebuf, postbuf, NULL, xdata);
+        if (op_ret == -1 || !xdata)
+            goto unlock;
+
+        write_is_append = 0;
+        ret = dict_get_uint32(xdata, GLUSTERFS_WRITE_IS_APPEND,
+                              &write_is_append);
+        if (ret || !write_is_append)
+            local->append_write = _gf_false;
+
+        ret = dict_get_uint32(xdata, GLUSTERFS_ACTIVE_FD_COUNT, &open_fd_count);
+        if (ret < 0)
+            goto unlock;
+        if (open_fd_count > local->open_fd_count) {
+            local->open_fd_count = open_fd_count;
+            local->update_open_fd_count = _gf_true;
+        }
+
+        ret = dict_get_int32_sizen(xdata, GLUSTERFS_INODELK_COUNT,
+                                   &num_inodelks);
+        if (ret < 0)
+            goto unlock;
+        if (num_inodelks > local->num_inodelks) {
+            local->num_inodelks = num_inodelks;
+            local->update_num_inodelks = _gf_true;
+        }
+    }
+unlock:
+    UNLOCK(&frame->lock);
 }
 
+void
+afr_process_post_writev(call_frame_t *frame, xlator_t *this)
+{
+    afr_local_t *local = NULL;
+    afr_lock_t *lock = NULL;
+
+    local = frame->local;
+
+    if (!local->stable_write && !local->append_write)
+        /* An appended write removes the necessity to
+           fsync() the file. This is because self-heal
+           has the logic to check for larger file when
+           the xattrs are not reliably pointing at
+           a stale file.
+        */
+        afr_fd_report_unstable_write(this, local);
+
+    __afr_inode_write_finalize(frame, this);
+
+    afr_writev_handle_short_writes(frame, this);
+
+    if (local->update_open_fd_count)
+        local->inode_ctx->open_fd_count = local->open_fd_count;
+    if (local->update_num_inodelks &&
+        local->transaction.type == AFR_DATA_TRANSACTION) {
+        lock = &local->inode_ctx->lock[local->transaction.type];
+        lock->num_inodelks = local->num_inodelks;
+    }
+}
 
 int
-afr_chmod_done (call_frame_t *frame, xlator_t *this)
+afr_writev_wind_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+                    int32_t op_ret, int32_t op_errno, struct iatt *prebuf,
+                    struct iatt *postbuf, dict_t *xdata)
 {
-	afr_local_t * local = NULL;
-
-	local = frame->local;
-
-	local->transaction.unwind (frame, this);
-
-	AFR_STACK_DESTROY (frame);
-	
-	return 0;
+    call_frame_t *fop_frame = NULL;
+    int child_index = (long)cookie;
+    int call_count = -1;
+
+    afr_inode_write_fill(frame, this, child_index, op_ret, op_errno, prebuf,
+                         postbuf, xdata);
+
+    call_count = afr_frame_return(frame);
+
+    if (call_count == 0) {
+        afr_process_post_writev(frame, this);
+
+        if (!afr_txn_nothing_failed(frame, this)) {
+            // Don't unwind until post-op is complete
+            afr_transaction_resume(frame, this);
+        } else {
+            /*
+             * Generally inode-write fops do transaction.unwind then
+             * transaction.resume, but writev needs to make sure that
+             * delayed post-op frame is placed in fdctx before unwind
+             * happens. This prevents the race of flush doing the
+             * changelog wakeup first in fuse thread and then this
+             * writev placing its delayed post-op frame in fdctx.
+             * This helps flush make sure all the delayed post-ops are
+             * completed.
+             */
+
+            fop_frame = afr_transaction_detach_fop_frame(frame);
+            afr_writev_copy_outvars(frame, fop_frame);
+            afr_transaction_resume(frame, this);
+            afr_writev_unwind(fop_frame, this);
+        }
+    }
+    return 0;
 }
 
-
-int32_t
-afr_chmod (call_frame_t *frame, xlator_t *this,
-	   loc_t *loc, mode_t mode)
+static int
+afr_arbiter_writev_wind(call_frame_t *frame, xlator_t *this, int subvol)
 {
-	afr_private_t * priv  = NULL;
-	afr_local_t   * local = NULL;
-	call_frame_t  * transaction_frame = NULL;
-
-	int ret = -1;
-
-	int op_ret   = -1;
-	int op_errno = 0;
-
-	VALIDATE_OR_GOTO (frame, out);
-	VALIDATE_OR_GOTO (this, out);
-	VALIDATE_OR_GOTO (this->private, out);
-
-	priv = this->private;
-
-	transaction_frame = copy_frame (frame);
-	if (!transaction_frame) {
-		gf_log (this->name, GF_LOG_ERROR,
-			"out of memory :(");
-		goto out;
-	}
-
-	ALLOC_OR_GOTO (local, afr_local_t, out);
-	ret = AFR_LOCAL_INIT (local, priv);
-	if (ret < 0) {
-		op_errno = -ret;
-		goto out;
-	}
-
-	transaction_frame->local = local;
-
-	local->cont.chmod.mode = mode;
-	local->cont.chmod.ino  = loc->inode->ino;
-
-	local->transaction.fop    = afr_chmod_wind;
-	local->transaction.done   = afr_chmod_done;
-	local->transaction.unwind = afr_chmod_unwind;
-
-	loc_copy (&local->loc, loc);
-	
-	local->transaction.main_frame = frame;
-	local->transaction.start   = 0;
-	local->transaction.len     = 0;
-	local->transaction.pending = AFR_METADATA_PENDING;
-
-	afr_transaction (transaction_frame, this, AFR_METADATA_TRANSACTION);
-
-	op_ret = 0;
-out:
-	if (op_ret == -1) {
-		if (transaction_frame)
-			AFR_STACK_DESTROY (transaction_frame);
-		AFR_STACK_UNWIND (frame, op_ret, op_errno, NULL);
-	}
-
-	return 0;
+    afr_local_t *local = frame->local;
+    afr_private_t *priv = this->private;
+    static char byte = 0xFF;
+    static struct iovec vector = {&byte, 1};
+    int32_t count = 1;
+
+    STACK_WIND_COOKIE(
+        frame, afr_writev_wind_cbk, (void *)(long)subvol,
+        priv->children[subvol], priv->children[subvol]->fops->writev, local->fd,
+        &vector, count, local->cont.writev.offset, local->cont.writev.flags,
+        local->cont.writev.iobref, local->xdata_req);
+
+    return 0;
 }
 
-/* }}} */
-
-
-/* {{{ fchmod */
-
 int
-afr_fchmod_unwind (call_frame_t *frame, xlator_t *this)
+afr_writev_wind(call_frame_t *frame, xlator_t *this, int subvol)
 {
-	afr_local_t *   local = NULL;
-	afr_private_t * priv  = NULL;
-	call_frame_t   *main_frame = NULL;
-
-	local = frame->local;
-	priv  = this->private;
-
-	LOCK (&frame->lock);
-	{
-		if (local->transaction.main_frame)
-			main_frame = local->transaction.main_frame;
-		local->transaction.main_frame = NULL;
-	}
-	UNLOCK (&frame->lock);
-
-	if (main_frame) {
-		local->cont.fchmod.buf.st_ino = local->cont.fchmod.ino;
-		AFR_STACK_UNWIND (main_frame, local->op_ret, local->op_errno,
-				  &local->cont.fchmod.buf);
-	}
-	return 0;
+    afr_local_t *local = NULL;
+    afr_private_t *priv = NULL;
+
+    local = frame->local;
+    priv = this->private;
+
+    if (AFR_IS_ARBITER_BRICK(priv, subvol)) {
+        afr_arbiter_writev_wind(frame, this, subvol);
+        return 0;
+    }
+
+    STACK_WIND_COOKIE(frame, afr_writev_wind_cbk, (void *)(long)subvol,
+                      priv->children[subvol],
+                      priv->children[subvol]->fops->writev, local->fd,
+                      local->cont.writev.vector, local->cont.writev.count,
+                      local->cont.writev.offset, local->cont.writev.flags,
+                      local->cont.writev.iobref, local->xdata_req);
+    return 0;
 }
 
-
 int
-afr_fchmod_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this, 
-		    int32_t op_ret, int32_t op_errno, struct stat *buf)
+afr_do_writev(call_frame_t *frame, xlator_t *this)
 {
-	afr_local_t *   local = NULL;
-	afr_private_t * priv  = NULL;
+    call_frame_t *transaction_frame = NULL;
+    afr_local_t *local = NULL;
+    int ret = -1;
+    int op_errno = ENOMEM;
+
+    transaction_frame = copy_frame(frame);
+    if (!transaction_frame)
+        goto out;
+
+    local = frame->local;
+    transaction_frame->local = local;
+    frame->local = NULL;
+
+    if (!AFR_FRAME_INIT(frame, op_errno))
+        goto out;
+
+    local->op = GF_FOP_WRITE;
+
+    local->transaction.wind = afr_writev_wind;
+    local->transaction.unwind = afr_transaction_writev_unwind;
+
+    local->transaction.main_frame = frame;
+
+    if (local->fd->flags & O_APPEND) {
+        /*
+         * Backend vfs ignores the 'offset' for append mode fd so
+         * locking just the region provided for the writev does not
+         * give consistency guarantee. The actual write may happen at a
+         * completely different range than the one provided by the
+         * offset, len in the fop. So lock the entire file.
+         */
+        local->transaction.start = 0;
+        local->transaction.len = 0;
+    } else {
+        local->transaction.start = local->cont.writev.offset;
+        local->transaction.len = iov_length(local->cont.writev.vector,
+                                            local->cont.writev.count);
+    }
+
+    ret = afr_transaction(transaction_frame, this, AFR_DATA_TRANSACTION);
+    if (ret < 0) {
+        op_errno = -ret;
+        goto out;
+    }
+
+    return 0;
+out:
+    if (transaction_frame)
+        AFR_STACK_DESTROY(transaction_frame);
 
-	int call_count  = -1;
-	int child_index = (long) cookie;
-	int need_unwind = 0;
+    AFR_STACK_UNWIND(writev, frame, -1, op_errno, NULL, NULL, NULL);
+    return 0;
+}
 
-	local = frame->local;
-	priv  = this->private;
+int
+afr_writev(call_frame_t *frame, xlator_t *this, fd_t *fd, struct iovec *vector,
+           int32_t count, off_t offset, uint32_t flags, struct iobref *iobref,
+           dict_t *xdata)
+{
+    afr_local_t *local = NULL;
+    int op_errno = ENOMEM;
+    int ret = -1;
+
+    AFR_ERROR_OUT_IF_FDCTX_INVALID(fd, this, op_errno, out);
+    local = AFR_FRAME_INIT(frame, op_errno);
+    if (!local)
+        goto out;
+
+    local->cont.writev.vector = iov_dup(vector, count);
+    if (!local->cont.writev.vector)
+        goto out;
+    local->cont.writev.count = count;
+    local->cont.writev.offset = offset;
+    local->cont.writev.flags = flags;
+    local->cont.writev.iobref = iobref_ref(iobref);
+
+    if (xdata)
+        local->xdata_req = dict_copy_with_ref(xdata, NULL);
+    else
+        local->xdata_req = dict_new();
+
+    if (!local->xdata_req)
+        goto out;
+
+    local->fd = fd_ref(fd);
+    ret = afr_set_inode_local(this, local, fd->inode);
+    if (ret)
+        goto out;
+
+    if (dict_set_uint32(local->xdata_req, GLUSTERFS_ACTIVE_FD_COUNT, 4)) {
+        op_errno = ENOMEM;
+        goto out;
+    }
+
+    if (dict_set_str_sizen(local->xdata_req, GLUSTERFS_INODELK_DOM_COUNT,
+                           this->name)) {
+        op_errno = ENOMEM;
+        goto out;
+    }
+
+    if (dict_set_uint32(local->xdata_req, GLUSTERFS_WRITE_IS_APPEND, 4)) {
+        op_errno = ENOMEM;
+        goto out;
+    }
+
+    /* Set append_write to be true speculatively. If on any
+       server it turns out not to be true, we unset it in the
+       callback.
+    */
+    local->append_write = _gf_true;
+
+    /* detect here, but set it in writev_wind_cbk *after* the unstable
+       write is performed
+    */
+    local->stable_write = !!((fd->flags | flags) & (O_SYNC | O_DSYNC));
+
+    afr_fix_open(fd, this);
+
+    afr_do_writev(frame, this);
+
+    return 0;
+out:
+    AFR_STACK_UNWIND(writev, frame, -1, op_errno, NULL, NULL, NULL);
 
-	LOCK (&frame->lock);
-	{
-		if (child_went_down (op_ret, op_errno))
-			afr_transaction_child_died (frame, this, child_index);
+    return 0;
+}
 
-		if (op_ret != -1) {
-			if (local->success_count == 0) {
-				local->op_ret = op_ret;
-				local->cont.fchmod.buf = *buf;
-			}
-			local->success_count++;
+/* }}} */
 
-			if (local->success_count == priv->wait_count) {
-				need_unwind = 1;
-			}
-		}
+/* {{{ truncate */
 
-		local->op_errno = op_errno;
-	}
-	UNLOCK (&frame->lock);
+int
+afr_truncate_unwind(call_frame_t *frame, xlator_t *this)
+{
+    afr_local_t *local = NULL;
+    call_frame_t *main_frame = NULL;
 
-	if (need_unwind)
-		afr_fchmod_unwind (frame, this);
+    local = frame->local;
 
-	call_count = afr_frame_return (frame);
+    main_frame = afr_transaction_detach_fop_frame(frame);
+    if (!main_frame)
+        return 0;
 
-	if (call_count == 0) {
-		local->transaction.resume (frame, this);
-	}
-	
-	return 0;
+    AFR_STACK_UNWIND(truncate, main_frame, local->op_ret, local->op_errno,
+                     &local->cont.inode_wfop.prebuf,
+                     &local->cont.inode_wfop.postbuf, local->xdata_rsp);
+    return 0;
 }
 
-
 int
-afr_fchmod_wind (call_frame_t *frame, xlator_t *this)
+afr_truncate_wind_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+                      int32_t op_ret, int32_t op_errno, struct iatt *prebuf,
+                      struct iatt *postbuf, dict_t *xdata)
 {
-	afr_local_t *   local = NULL;
-	afr_private_t * priv  = NULL;
-	
-	int i = 0;
-	int call_count = -1;
-
-	local = frame->local;
-	priv  = this->private;
+    afr_local_t *local = NULL;
 
-	call_count = afr_up_children_count (priv->child_count, local->child_up);
+    local = frame->local;
 
-	if (call_count == 0) {
-		local->transaction.resume (frame, this);
-		return 0;
-	}
+    if (op_ret == 0 && prebuf->ia_size != postbuf->ia_size)
+        local->stable_write = _gf_false;
 
-	local->call_count = call_count;
-
-	for (i = 0; i < priv->child_count; i++) {				
-		if (local->child_up[i]) {
-			STACK_WIND_COOKIE (frame, afr_fchmod_wind_cbk, (void *) (long) i,
-					   priv->children[i], 
-					   priv->children[i]->fops->fchmod,
-					   local->fd, 
-					   local->cont.fchmod.mode); 
-		
-			if (!--call_count)
-				break;
-		}
-	}
-	
-	return 0;
+    return __afr_inode_write_cbk(frame, cookie, this, op_ret, op_errno, prebuf,
+                                 postbuf, NULL, xdata);
 }
 
-
 int
-afr_fchmod_done (call_frame_t *frame, xlator_t *this)
+afr_truncate_wind(call_frame_t *frame, xlator_t *this, int subvol)
 {
-	afr_local_t * local = NULL;
-
-	local = frame->local;
+    afr_local_t *local = NULL;
+    afr_private_t *priv = NULL;
 
-	local->transaction.unwind (frame, this);
+    local = frame->local;
+    priv = this->private;
 
-	AFR_STACK_DESTROY (frame);
-	
-	return 0;
+    STACK_WIND_COOKIE(frame, afr_truncate_wind_cbk, (void *)(long)subvol,
+                      priv->children[subvol],
+                      priv->children[subvol]->fops->truncate, &local->loc,
+                      local->cont.truncate.offset, local->xdata_req);
+    return 0;
 }
 
-
-int32_t
-afr_fchmod (call_frame_t *frame, xlator_t *this,
-	    fd_t *fd, mode_t mode)
+int
+afr_truncate(call_frame_t *frame, xlator_t *this, loc_t *loc, off_t offset,
+             dict_t *xdata)
 {
-	afr_private_t * priv  = NULL;
-	afr_local_t   * local = NULL;
-	call_frame_t  * transaction_frame = NULL;
-
-	int ret = -1;
+    afr_local_t *local = NULL;
+    call_frame_t *transaction_frame = NULL;
+    int ret = -1;
+    int op_errno = ENOMEM;
 
-	int op_ret   = -1;
-	int op_errno = 0;
+    transaction_frame = copy_frame(frame);
+    if (!transaction_frame)
+        goto out;
 
-	VALIDATE_OR_GOTO (frame, out);
-	VALIDATE_OR_GOTO (this, out);
-	VALIDATE_OR_GOTO (this->private, out);
+    local = AFR_FRAME_INIT(transaction_frame, op_errno);
+    if (!local)
+        goto out;
 
-	priv = this->private;
+    local->cont.truncate.offset = offset;
+    if (xdata)
+        local->xdata_req = dict_copy_with_ref(xdata, NULL);
+    else
+        local->xdata_req = dict_new();
 
-	transaction_frame = copy_frame (frame);
-	if (!transaction_frame) {
-		gf_log (this->name, GF_LOG_ERROR,
-			"out of memory :(");
-		goto out;
-	}
+    if (!local->xdata_req)
+        goto out;
 
-	ALLOC_OR_GOTO (local, afr_local_t, out);
-	ret = AFR_LOCAL_INIT (local, priv);
-	if (ret < 0) {
-		op_errno = -ret;
-		goto out;
-	}
+    local->transaction.wind = afr_truncate_wind;
+    local->transaction.unwind = afr_truncate_unwind;
 
-	transaction_frame->local = local;
+    loc_copy(&local->loc, loc);
+    ret = afr_set_inode_local(this, local, loc->inode);
+    if (ret)
+        goto out;
 
-	local->cont.fchmod.mode = mode;
-	local->cont.fchmod.ino  = fd->inode->ino;
+    local->op = GF_FOP_TRUNCATE;
 
-	local->transaction.fop    = afr_fchmod_wind;
-	local->transaction.done   = afr_fchmod_done;
-	local->transaction.unwind = afr_fchmod_unwind;
+    local->transaction.main_frame = frame;
+    local->transaction.start = offset;
+    local->transaction.len = 0;
 
-	local->fd = fd_ref (fd);
-	
-	local->transaction.main_frame = frame;
-	local->transaction.start   = 0;
-	local->transaction.len     = 0;
-	local->transaction.pending = AFR_METADATA_PENDING;
+    /* Set it true speculatively, will get reset in afr_truncate_wind_cbk
+       if truncate was not a NOP */
+    local->stable_write = _gf_true;
 
-	afr_transaction (transaction_frame, this, AFR_METADATA_TRANSACTION);
+    ret = afr_transaction(transaction_frame, this, AFR_DATA_TRANSACTION);
+    if (ret < 0) {
+        op_errno = -ret;
+        goto out;
+    }
 
-	op_ret = 0;
+    return 0;
 out:
-	if (op_ret == -1) {
-		if (transaction_frame)
-			AFR_STACK_DESTROY (transaction_frame);
-		AFR_STACK_UNWIND (frame, op_ret, op_errno, NULL);
-	}
+    if (transaction_frame)
+        AFR_STACK_DESTROY(transaction_frame);
 
-	return 0;
+    AFR_STACK_UNWIND(truncate, frame, -1, op_errno, NULL, NULL, NULL);
+    return 0;
 }
 
 /* }}} */
 
-/* {{{ chown */
-
-int
-afr_chown_unwind (call_frame_t *frame, xlator_t *this)
-{
-	afr_local_t *   local = NULL;
-	afr_private_t * priv  = NULL;
-	call_frame_t   *main_frame = NULL;
-
-	local = frame->local;
-	priv  = this->private;
-
-	LOCK (&frame->lock);
-	{
-		if (local->transaction.main_frame)
-			main_frame = local->transaction.main_frame;
-		local->transaction.main_frame = NULL;
-	}
-	UNLOCK (&frame->lock);
-
-	if (main_frame) {
-		local->cont.chown.buf.st_ino = local->cont.chown.ino;
-		AFR_STACK_UNWIND (main_frame, local->op_ret, local->op_errno,
-				  &local->cont.chown.buf);
-	}
-	return 0;
-}
-
+/* {{{ ftruncate */
 
 int
-afr_chown_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this, 
-		    int32_t op_ret, int32_t op_errno, struct stat *buf)
+afr_ftruncate_unwind(call_frame_t *frame, xlator_t *this)
 {
-	afr_local_t *   local = NULL;
-	afr_private_t * priv  = NULL;
- 
-	int call_count  = -1;
-	int child_index = (long) cookie;
-	int need_unwind = 0;
-
-	local = frame->local;
-	priv = this->private;
-
-	LOCK (&frame->lock);
-	{
-		if (child_went_down (op_ret, op_errno))
-			afr_transaction_child_died (frame, this, child_index);
-
-		if (op_ret != -1) {
-			if (local->success_count == 0) {
-				local->op_ret = op_ret;
-				local->cont.chown.buf = *buf;
-			}
-			local->success_count++;
-
-			if (local->success_count == priv->wait_count) {
-				need_unwind = 1;
-			}
-		}
-
-		local->op_errno = op_errno;
-	}
-	UNLOCK (&frame->lock);
+    afr_local_t *local = NULL;
+    call_frame_t *main_frame = NULL;
 
-	if (need_unwind) {
-		local->transaction.unwind (frame, this);
-	}
+    local = frame->local;
 
-	call_count = afr_frame_return (frame);
+    main_frame = afr_transaction_detach_fop_frame(frame);
+    if (!main_frame)
+        return 0;
 
-	if (call_count == 0) {
-		local->transaction.resume (frame, this);
-	}
-	
-	return 0;
+    AFR_STACK_UNWIND(ftruncate, main_frame, local->op_ret, local->op_errno,
+                     &local->cont.inode_wfop.prebuf,
+                     &local->cont.inode_wfop.postbuf, local->xdata_rsp);
+    return 0;
 }
 
-
 int
-afr_chown_wind (call_frame_t *frame, xlator_t *this)
+afr_ftruncate_wind_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+                       int32_t op_ret, int32_t op_errno, struct iatt *prebuf,
+                       struct iatt *postbuf, dict_t *xdata)
 {
-	afr_local_t *   local = NULL;
-	afr_private_t * priv  = NULL;
-
-	int call_count = -1;
-	int i = 0;
-
-	local = frame->local;
-	priv  = this->private;
-
-	call_count = afr_up_children_count (priv->child_count, local->child_up);
-
-	if (call_count == 0) {
-		local->transaction.resume (frame, this);
-		return 0;
-	}
+    afr_local_t *local = NULL;
 
-	local->call_count = call_count;
+    local = frame->local;
 
-	for (i = 0; i < priv->child_count; i++) {				
-		if (local->child_up[i]) {
-			STACK_WIND_COOKIE (frame, afr_chown_wind_cbk, (void *) (long) i,
-					   priv->children[i], 
-					   priv->children[i]->fops->chown,
-					   &local->loc, local->cont.chown.uid,
-					   local->cont.chown.gid); 
+    if (op_ret == 0 && prebuf->ia_size != postbuf->ia_size)
+        local->stable_write = _gf_false;
 
-			if (!--call_count)
-				break;
-		}
-	}
-	
-	return 0;
+    return __afr_inode_write_cbk(frame, cookie, this, op_ret, op_errno, prebuf,
+                                 postbuf, NULL, xdata);
 }
 
-
 int
-afr_chown_done (call_frame_t *frame, xlator_t *this)
+afr_ftruncate_wind(call_frame_t *frame, xlator_t *this, int subvol)
 {
-	afr_local_t *local = NULL;
-
-	local = frame->local;
-
-	local->transaction.unwind (frame, this);
+    afr_local_t *local = NULL;
+    afr_private_t *priv = NULL;
 
-	AFR_STACK_DESTROY (frame);
+    local = frame->local;
+    priv = this->private;
 
-	return 0;
+    STACK_WIND_COOKIE(frame, afr_ftruncate_wind_cbk, (void *)(long)subvol,
+                      priv->children[subvol],
+                      priv->children[subvol]->fops->ftruncate, local->fd,
+                      local->cont.ftruncate.offset, local->xdata_req);
+    return 0;
 }
 
-
 int
-afr_chown (call_frame_t *frame, xlator_t *this,
-	   loc_t *loc, uid_t uid, gid_t gid)
+afr_ftruncate(call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset,
+              dict_t *xdata)
 {
-	afr_private_t * priv  = NULL;
-	afr_local_t   * local = NULL;
-	call_frame_t   *transaction_frame = NULL;
-
-	int ret = -1;
+    afr_local_t *local = NULL;
+    call_frame_t *transaction_frame = NULL;
+    int ret = -1;
+    int op_errno = ENOMEM;
 
-	int op_ret   = -1;
-	int op_errno = 0;
+    AFR_ERROR_OUT_IF_FDCTX_INVALID(fd, this, op_errno, out);
+    transaction_frame = copy_frame(frame);
+    if (!transaction_frame)
+        goto out;
 
-	VALIDATE_OR_GOTO (frame, out);
-	VALIDATE_OR_GOTO (this, out);
-	VALIDATE_OR_GOTO (this->private, out);
+    local = AFR_FRAME_INIT(transaction_frame, op_errno);
+    if (!local)
+        goto out;
 
-	priv = this->private;
+    local->cont.ftruncate.offset = offset;
+    if (xdata)
+        local->xdata_req = dict_copy_with_ref(xdata, NULL);
+    else
+        local->xdata_req = dict_new();
 
-	transaction_frame = copy_frame (frame);
-	if (!transaction_frame) {
-		gf_log (this->name, GF_LOG_ERROR,
-			"out of memory :(");
-		goto out;
-	}
+    if (!local->xdata_req)
+        goto out;
 
-	ALLOC_OR_GOTO (local, afr_local_t, out);
+    local->fd = fd_ref(fd);
+    ret = afr_set_inode_local(this, local, fd->inode);
+    if (ret)
+        goto out;
 
-	ret = AFR_LOCAL_INIT (local, priv);
-	if (ret < 0) {
-		op_errno = -ret;
-		goto out;
-	}
+    local->op = GF_FOP_FTRUNCATE;
 
-	transaction_frame->local = local;
+    local->transaction.wind = afr_ftruncate_wind;
+    local->transaction.unwind = afr_ftruncate_unwind;
 
-	local->cont.chown.uid  = uid;
-	local->cont.chown.gid  = gid;
-	local->cont.chown.ino  = loc->inode->ino;
+    local->transaction.main_frame = frame;
 
-	local->transaction.fop    = afr_chown_wind;
-	local->transaction.done   = afr_chown_done;
-	local->transaction.unwind = afr_chown_unwind;
+    local->transaction.start = local->cont.ftruncate.offset;
+    local->transaction.len = 0;
 
-	loc_copy (&local->loc, loc);
+    afr_fix_open(fd, this);
 
-	local->transaction.main_frame = frame;
-	local->transaction.start   = 0;
-	local->transaction.len     = 0;
-	local->transaction.pending = AFR_METADATA_PENDING;
+    /* Set it true speculatively, will get reset in afr_ftruncate_wind_cbk
+       if truncate was not a NOP */
+    local->stable_write = _gf_true;
 
-	afr_transaction (transaction_frame, this, AFR_METADATA_TRANSACTION);
+    ret = afr_transaction(transaction_frame, this, AFR_DATA_TRANSACTION);
+    if (ret < 0) {
+        op_errno = -ret;
+        goto out;
+    }
 
-	op_ret = 0;
+    return 0;
 out:
-	if (op_ret == -1) {
-		if (transaction_frame)
-			AFR_STACK_DESTROY (transaction_frame);
-		AFR_STACK_UNWIND (frame, op_ret, op_errno, NULL);
-	}
+    AFR_STACK_UNWIND(ftruncate, frame, -1, op_errno, NULL, NULL, NULL);
 
-	return 0;
+    return 0;
 }
 
-
 /* }}} */
 
-/* {{{ chown */
+/* {{{ setattr */
 
 int
-afr_fchown_unwind (call_frame_t *frame, xlator_t *this)
+afr_setattr_unwind(call_frame_t *frame, xlator_t *this)
 {
-	afr_local_t *   local = NULL;
-	afr_private_t * priv  = NULL;
-	call_frame_t   *main_frame = NULL;
+    afr_local_t *local = NULL;
+    call_frame_t *main_frame = NULL;
 
-	local = frame->local;
-	priv  = this->private;
+    local = frame->local;
 
-	LOCK (&frame->lock);
-	{
-		if (local->transaction.main_frame)
-			main_frame = local->transaction.main_frame;
-		local->transaction.main_frame = NULL;
-	}
-	UNLOCK (&frame->lock);
+    main_frame = afr_transaction_detach_fop_frame(frame);
+    if (!main_frame)
+        return 0;
 
-	if (main_frame) {
-		local->cont.fchown.buf.st_ino = local->cont.fchown.ino;
-		AFR_STACK_UNWIND (main_frame, local->op_ret, local->op_errno,
-				  &local->cont.fchown.buf);
-	}
-	return 0;
+    AFR_STACK_UNWIND(setattr, main_frame, local->op_ret, local->op_errno,
+                     &local->cont.inode_wfop.prebuf,
+                     &local->cont.inode_wfop.postbuf, local->xdata_rsp);
+    return 0;
 }
 
-
 int
-afr_fchown_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this, 
-		    int32_t op_ret, int32_t op_errno, struct stat *buf)
+afr_setattr_wind_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+                     int op_ret, int op_errno, struct iatt *preop,
+                     struct iatt *postop, dict_t *xdata)
 {
-	afr_local_t *   local = NULL;
-	afr_private_t * priv  = NULL;
- 
-	int call_count  = -1;
-	int child_index = (long) cookie;
-	int need_unwind = 0;
-
-	local = frame->local;
-	priv = this->private;
-
-	LOCK (&frame->lock);
-	{
-		if (child_went_down (op_ret, op_errno))
-			afr_transaction_child_died (frame, this, child_index);
-
-		if (op_ret != -1) {
-			if (local->success_count == 0) {
-				local->op_ret = op_ret;
-				local->cont.fchown.buf = *buf;
-			}
-			local->success_count++;
-
-			if (local->success_count == priv->wait_count) {
-				need_unwind = 1;
-			}
-		}
-
-		local->op_errno = op_errno;
-	}
-	UNLOCK (&frame->lock);
-
-	if (need_unwind) {
-		local->transaction.unwind (frame, this);
-	}
-
-	call_count = afr_frame_return (frame);
-
-	if (call_count == 0) {
-		local->transaction.resume (frame, this);
-	}
-	
-	return 0;
+    return __afr_inode_write_cbk(frame, cookie, this, op_ret, op_errno, preop,
+                                 postop, NULL, xdata);
 }
 
-
 int
-afr_fchown_wind (call_frame_t *frame, xlator_t *this)
+afr_setattr_wind(call_frame_t *frame, xlator_t *this, int subvol)
 {
-	afr_local_t *   local = NULL;
-	afr_private_t * priv  = NULL;
-
-	int call_count = -1;
-	int i = 0;
-
-	local = frame->local;
-	priv  = this->private;
-
-	call_count = afr_up_children_count (priv->child_count, local->child_up);
-
-	if (call_count == 0) {
-		local->transaction.resume (frame, this);
-		return 0;
-	}
-
-	local->call_count = call_count;
+    afr_local_t *local = NULL;
+    afr_private_t *priv = NULL;
+
+    local = frame->local;
+    priv = this->private;
+
+    STACK_WIND_COOKIE(frame, afr_setattr_wind_cbk, (void *)(long)subvol,
+                      priv->children[subvol],
+                      priv->children[subvol]->fops->setattr, &local->loc,
+                      &local->cont.setattr.in_buf, local->cont.setattr.valid,
+                      local->xdata_req);
+    return 0;
+}
 
-	for (i = 0; i < priv->child_count; i++) {				
-		if (local->child_up[i]) {
-			STACK_WIND_COOKIE (frame, afr_fchown_wind_cbk, (void *) (long) i,
-					   priv->children[i], 
-					   priv->children[i]->fops->fchown,
-					   local->fd, local->cont.fchown.uid,
-					   local->cont.fchown.gid); 
+int
+afr_setattr(call_frame_t *frame, xlator_t *this, loc_t *loc, struct iatt *buf,
+            int32_t valid, dict_t *xdata)
+{
+    afr_local_t *local = NULL;
+    call_frame_t *transaction_frame = NULL;
+    int ret = -1;
+    int op_errno = ENOMEM;
+
+    transaction_frame = copy_frame(frame);
+    if (!transaction_frame)
+        goto out;
+
+    local = AFR_FRAME_INIT(transaction_frame, op_errno);
+    if (!local)
+        goto out;
+
+    local->cont.setattr.in_buf = *buf;
+    local->cont.setattr.valid = valid;
+    if (xdata)
+        local->xdata_req = dict_copy_with_ref(xdata, NULL);
+    else
+        local->xdata_req = dict_new();
+
+    if (!local->xdata_req)
+        goto out;
+
+    local->transaction.wind = afr_setattr_wind;
+    local->transaction.unwind = afr_setattr_unwind;
+
+    loc_copy(&local->loc, loc);
+    ret = afr_set_inode_local(this, local, loc->inode);
+    if (ret)
+        goto out;
+
+    local->op = GF_FOP_SETATTR;
+
+    local->transaction.main_frame = frame;
+    local->transaction.start = LLONG_MAX - 1;
+    local->transaction.len = 0;
+
+    ret = afr_transaction(transaction_frame, this, AFR_METADATA_TRANSACTION);
+    if (ret < 0) {
+        op_errno = -ret;
+        goto out;
+    }
+
+    return 0;
+out:
+    if (transaction_frame)
+        AFR_STACK_DESTROY(transaction_frame);
 
-			if (!--call_count)
-				break;
-		}
-	}
-	
-	return 0;
+    AFR_STACK_UNWIND(setattr, frame, -1, op_errno, NULL, NULL, NULL);
+    return 0;
 }
 
+/* {{{ fsetattr */
 
 int
-afr_fchown_done (call_frame_t *frame, xlator_t *this)
+afr_fsetattr_unwind(call_frame_t *frame, xlator_t *this)
 {
-	afr_local_t *local = NULL;
+    afr_local_t *local = NULL;
+    call_frame_t *main_frame = NULL;
 
-	local = frame->local;
+    local = frame->local;
 
-	local->transaction.unwind (frame, this);
+    main_frame = afr_transaction_detach_fop_frame(frame);
+    if (!main_frame)
+        return 0;
 
-	AFR_STACK_DESTROY (frame);
-
-	return 0;
+    AFR_STACK_UNWIND(fsetattr, main_frame, local->op_ret, local->op_errno,
+                     &local->cont.inode_wfop.prebuf,
+                     &local->cont.inode_wfop.postbuf, local->xdata_rsp);
+    return 0;
 }
 
-
 int
-afr_fchown (call_frame_t *frame, xlator_t *this,
-	    fd_t *fd, uid_t uid, gid_t gid)
+afr_fsetattr_wind_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+                      int32_t op_ret, int32_t op_errno, struct iatt *preop,
+                      struct iatt *postop, dict_t *xdata)
 {
-	afr_private_t * priv  = NULL;
-	afr_local_t   * local = NULL;
-	call_frame_t   *transaction_frame = NULL;
-
-	int ret = -1;
+    return __afr_inode_write_cbk(frame, cookie, this, op_ret, op_errno, preop,
+                                 postop, NULL, xdata);
+}
 
-	int op_ret   = -1;
-	int op_errno = 0;
+/*
+ * Wind the fsetattr fop of the current transaction to one child.
+ *
+ * The fd, the attribute buffer, the valid mask and the request xdata are all
+ * read from the frame's local. 'subvol' selects the child in priv->children
+ * and is also passed as the cookie to afr_fsetattr_wind_cbk.
+ *
+ * Always returns 0.
+ */
+int
+afr_fsetattr_wind(call_frame_t *frame, xlator_t *this, int subvol)
+{
+    afr_private_t *priv = this->private;
+    afr_local_t *local = frame->local;
+
+    STACK_WIND_COOKIE(frame, afr_fsetattr_wind_cbk, (void *)(long)subvol,
+                      priv->children[subvol],
+                      priv->children[subvol]->fops->fsetattr, local->fd,
+                      &local->cont.fsetattr.in_buf, local->cont.fsetattr.valid,
+                      local->xdata_req);
+
+    return 0;
+}
 
-	VALIDATE_OR_GOTO (frame, out);
-	VALIDATE_OR_GOTO (this, out);
-	VALIDATE_OR_GOTO (this->private, out);
+int
+afr_fsetattr(call_frame_t *frame, xlator_t *this, fd_t *fd, struct iatt *buf,
+             int32_t valid, dict_t *xdata)
+{
+    afr_local_t *local = NULL;
+    call_frame_t *transaction_frame = NULL;
+    int ret = -1;
+    int op_errno = ENOMEM;
 
-	priv = this->private;
+    AFR_ERROR_OUT_IF_FDCTX_INVALID(fd, this, op_errno, out);
+    transaction_frame = copy_frame(frame);
+    if (!transaction_frame)
+        goto out;
 
-	transaction_frame = copy_frame (frame);
-	if (!transaction_frame) {
-		gf_log (this->name, GF_LOG_ERROR,
-			"out of memory :(");
-		goto out;
-	}
+    local = AFR_FRAME_INIT(transaction_frame, op_errno);
+    if (!local)
+        goto out;
 
-	ALLOC_OR_GOTO (local, afr_local_t, out);
+    local->cont.fsetattr.in_buf = *buf;
+    local->cont.fsetattr.valid = valid;
+    if (xdata)
+        local->xdata_req = dict_copy_with_ref(xdata, NULL);
+    else
+        local->xdata_req = dict_new();
 
-	ret = AFR_LOCAL_INIT (local, priv);
-	if (ret < 0) {
-		op_errno = -ret;
-		goto out;
-	}
+    if (!local->xdata_req)
+        goto out;
 
-	transaction_frame->local = local;
+    local->transaction.wind = afr_fsetattr_wind;
+    local->transaction.unwind = afr_fsetattr_unwind;
 
-	local->cont.fchown.uid  = uid;
-	local->cont.fchown.gid  = gid;
-	local->cont.fchown.ino  = fd->inode->ino;
+    local->fd = fd_ref(fd);
+    ret = afr_set_inode_local(this, local, fd->inode);
+    if (ret)
+        goto out;
 
-	local->transaction.fop    = afr_fchown_wind;
-	local->transaction.done   = afr_fchown_done;
-	local->transaction.unwind = afr_fchown_unwind;
+    local->op = GF_FOP_FSETATTR;
 
-	local->fd = fd_ref (fd);
+    afr_fix_open(fd, this);
 
-	local->transaction.main_frame = frame;
-	local->transaction.start   = 0;
-	local->transaction.len     = 0;
-	local->transaction.pending = AFR_METADATA_PENDING;
+    local->transaction.main_frame = frame;
+    local->transaction.start = LLONG_MAX - 1;
+    local->transaction.len = 0;
 
-	afr_transaction (transaction_frame, this, AFR_METADATA_TRANSACTION);
+    ret = afr_transaction(transaction_frame, this, AFR_METADATA_TRANSACTION);
+    if (ret < 0) {
+        op_errno = -ret;
+        goto out;
+    }
 
-	op_ret = 0;
+    return 0;
 out:
-	if (op_ret == -1) {
-		if (transaction_frame)
-			AFR_STACK_DESTROY (transaction_frame);
-		AFR_STACK_UNWIND (frame, op_ret, op_errno, NULL);
-	}
+    if (transaction_frame)
+        AFR_STACK_DESTROY(transaction_frame);
 
-	return 0;
+    AFR_STACK_UNWIND(fsetattr, frame, -1, op_errno, NULL, NULL, NULL);
+    return 0;
 }
 
-/* }}} */
-
-/* {{{ writev */
+/* {{{ setxattr */
 
 int
-afr_writev_unwind (call_frame_t *frame, xlator_t *this)
+afr_setxattr_unwind(call_frame_t *frame, xlator_t *this)
 {
-	afr_local_t *   local = NULL;
-	afr_private_t * priv  = NULL;
-	call_frame_t   *main_frame = NULL;
+    afr_local_t *local = NULL;
+    call_frame_t *main_frame = NULL;
 
-	local = frame->local;
-	priv  = this->private;
+    local = frame->local;
 
-	LOCK (&frame->lock);
-	{
-		if (local->transaction.main_frame)
-			main_frame = local->transaction.main_frame;
-		local->transaction.main_frame = NULL;
-	}
-	UNLOCK (&frame->lock);
+    main_frame = afr_transaction_detach_fop_frame(frame);
+    if (!main_frame)
+        return 0;
 
-	if (main_frame) {
-		local->cont.writev.buf.st_ino = local->cont.writev.ino;
-		AFR_STACK_UNWIND (main_frame, local->op_ret, local->op_errno,
-				  &local->cont.writev.buf);
-	}
-	return 0;
+    AFR_STACK_UNWIND(setxattr, main_frame, local->op_ret, local->op_errno,
+                     local->xdata_rsp);
+    return 0;
 }
 
-
 int
-afr_writev_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this, 
-		     int32_t op_ret, int32_t op_errno, struct stat *buf)
+afr_setxattr_wind_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+                      int32_t op_ret, int32_t op_errno, dict_t *xdata)
 {
-	afr_local_t *   local = NULL;
-	afr_private_t * priv  = NULL;
-
-	int child_index = (long) cookie;
-	int call_count  = -1;
-	int need_unwind = 0;
+    return __afr_inode_write_cbk(frame, cookie, this, op_ret, op_errno, NULL,
+                                 NULL, NULL, xdata);
+}
 
-	local = frame->local;
-	priv = this->private;
+/*
+ * Wind the setxattr fop of the current transaction to one child.
+ *
+ * The loc, the xattr dict, the flags and the request xdata are all read from
+ * the frame's local. 'subvol' selects the child in priv->children and is also
+ * passed as the cookie to afr_setxattr_wind_cbk.
+ *
+ * Always returns 0.
+ */
+int
+afr_setxattr_wind(call_frame_t *frame, xlator_t *this, int subvol)
+{
+    afr_private_t *priv = this->private;
+    afr_local_t *local = frame->local;
+
+    STACK_WIND_COOKIE(frame, afr_setxattr_wind_cbk, (void *)(long)subvol,
+                      priv->children[subvol],
+                      priv->children[subvol]->fops->setxattr, &local->loc,
+                      local->cont.setxattr.dict, local->cont.setxattr.flags,
+                      local->xdata_req);
+
+    return 0;
+}
 
-	LOCK (&frame->lock);
-	{
-		if (child_went_down (op_ret, op_errno))
-			afr_transaction_child_died (frame, this, child_index);
+int
+afr_emptyb_set_pending_changelog_cbk(call_frame_t *frame, void *cookie,
+                                     xlator_t *this, int op_ret, int op_errno,
+                                     dict_t *xattr, dict_t *xdata)
 
-		if (op_ret != -1) {
-			if (local->success_count == 0) {
-				local->op_ret   = op_ret;
-				local->cont.writev.buf = *buf;
-			}
-			local->success_count++;
+{
+    afr_local_t *local = NULL;
+    afr_private_t *priv = NULL;
+    int i, ret = 0;
+    char *op_type = NULL;
 
-			if (local->success_count == priv->wait_count) {
-				need_unwind = 1;
-			}
-		}
+    local = frame->local;
+    priv = this->private;
+    i = (long)cookie;
 
-		local->op_errno = op_errno;
-	}
-	UNLOCK (&frame->lock);
+    local->replies[i].valid = 1;
+    local->replies[i].op_ret = op_ret;
+    local->replies[i].op_errno = op_errno;
 
-	call_count = afr_frame_return (frame);
+    ret = dict_get_str_sizen(local->xdata_req, "replicate-brick-op", &op_type);
+    if (ret)
+        goto out;
 
-	if (call_count == 0) {
-		local->transaction.unwind (frame, this);
+    gf_smsg(this->name, op_ret ? GF_LOG_ERROR : GF_LOG_INFO,
+            op_ret ? op_errno : 0, AFR_MSG_SET_PEND_XATTR, "name=%s",
+            priv->children[i]->name, "op_ret=%s",
+            op_ret ? "failed" : "succeeded", NULL);
 
-		local->transaction.resume (frame, this);
-	}
-	
-	return 0;
+out:
+    syncbarrier_wake(&local->barrier);
+    return 0;
 }
 
-
 int
-afr_writev_wind (call_frame_t *frame, xlator_t *this)
+afr_emptyb_set_pending_changelog(call_frame_t *frame, xlator_t *this,
+                                 unsigned char *locked_nodes)
 {
-	afr_local_t *local = NULL;
-	afr_private_t *priv = NULL;
-	
-	int i = 0;
-	int call_count = -1;
-
-	local = frame->local;
-	priv = this->private;
-
-	call_count = afr_up_children_count (priv->child_count, local->child_up);
+    afr_local_t *local = NULL;
+    afr_private_t *priv = NULL;
+    int ret = 0, i = 0;
+
+    local = frame->local;
+    priv = this->private;
+
+    AFR_ONLIST(locked_nodes, frame, afr_emptyb_set_pending_changelog_cbk,
+               xattrop, &local->loc, GF_XATTROP_ADD_ARRAY, local->xattr_req,
+               NULL);
+
+    /* It is sufficient if xattrop was successful on one child */
+    for (i = 0; i < priv->child_count; i++) {
+        if (!local->replies[i].valid)
+            continue;
+
+        if (local->replies[i].op_ret == 0) {
+            ret = 0;
+            goto out;
+        } else {
+            ret = afr_higher_errno(ret, local->replies[i].op_errno);
+        }
+    }
+out:
+    return -ret;
+}
 
-	if (call_count == 0) {
-		local->transaction.resume (frame, this);
-		return 0;
-	}
+/*
+ * Set one pending changelog of transaction type 'type' against the brick at
+ * 'empty_index'.
+ *
+ * Steps:
+ *   - create local->pending and set the slot for 'empty_index' and 'type' to
+ *     1; when priv->esh_granular is set and 'type' is
+ *     AFR_ENTRY_TRANSACTION, the data slot is set as well;
+ *   - create local->xdata_req carrying 'op_type' under "replicate-brick-op";
+ *   - create local->xattr_req and fill it from local->pending;
+ *   - take the self-heal entry lock (entry transactions) or inode lock (all
+ *     other types) on loc->inode, run afr_emptyb_set_pending_changelog() on
+ *     the locked children, then release the lock.
+ *
+ * local->pending, local->xdata_req and local->xattr_req are not released
+ * here, on success or on failure.
+ *
+ * Returns 0 on success and a negative errno otherwise: -ENOMEM when an
+ * allocation fails, -EAGAIN when no child could be locked, or the value
+ * returned by the failing helper.
+ */
+static int
+_afr_handle_empty_brick_type(xlator_t *this, call_frame_t *frame, loc_t *loc,
+                             int empty_index, afr_transaction_type type,
+                             char *op_type, const int op_type_len)
+{
+    int count = 0;
+    int ret = -ENOMEM;
+    int idx = -1;
+    int d_idx = -1;
+    unsigned char *locked_nodes = NULL;
+    afr_local_t *local = NULL;
+    afr_private_t *priv = NULL;
+
+    priv = this->private;
+    local = frame->local;
+
+    locked_nodes = alloca0(priv->child_count);
+
+    idx = afr_index_for_transaction_type(type);
+    d_idx = afr_index_for_transaction_type(AFR_DATA_TRANSACTION);
+
+    local->pending = afr_matrix_create(priv->child_count, AFR_NUM_CHANGE_LOGS);
+    if (!local->pending)
+        goto out;
+
+    local->pending[empty_index][idx] = hton32(1);
+
+    if ((priv->esh_granular) && (type == AFR_ENTRY_TRANSACTION))
+        local->pending[empty_index][d_idx] = hton32(1);
+
+    local->xdata_req = dict_new();
+    if (!local->xdata_req)
+        goto out;
+
+    ret = dict_set_nstrn(local->xdata_req, "replicate-brick-op",
+                         SLEN("replicate-brick-op"), op_type, op_type_len);
+    if (ret)
+        goto out;
+
+    local->xattr_req = dict_new();
+    if (!local->xattr_req) {
+        /* ret is 0 here because dict_set_nstrn() succeeded; restore the
+         * error code so this allocation failure is not reported as
+         * success. */
+        ret = -ENOMEM;
+        goto out;
+    }
+
+    ret = afr_set_pending_dict(priv, local->xattr_req, local->pending);
+    if (ret < 0)
+        goto out;
+
+    if (AFR_ENTRY_TRANSACTION == type) {
+        count = afr_selfheal_entrylk(frame, this, loc->inode, this->name, NULL,
+                                     locked_nodes);
+    } else {
+        count = afr_selfheal_inodelk(frame, this, loc->inode, this->name,
+                                     LLONG_MAX - 1, 0, locked_nodes);
+    }
+
+    if (!count) {
+        gf_smsg(this->name, GF_LOG_ERROR, EAGAIN, AFR_MSG_REPLACE_BRICK_STATUS,
+                NULL);
+        ret = -EAGAIN;
+        goto unlock;
+    }
+
+    /* Success and failure both continue into the unlock path below, with
+     * ret carrying the result of the xattrop. */
+    ret = afr_emptyb_set_pending_changelog(frame, this, locked_nodes);
+unlock:
+    if (AFR_ENTRY_TRANSACTION == type) {
+        afr_selfheal_unentrylk(frame, this, loc->inode, this->name, NULL,
+                               locked_nodes, NULL);
+    } else {
+        afr_selfheal_uninodelk(frame, this, loc->inode, this->name,
+                               LLONG_MAX - 1, 0, locked_nodes);
+    }
+out:
+    return ret;
+}
 
-	local->call_count = call_count;
+void
+afr_brick_args_cleanup(void *opaque)
+{
+    afr_empty_brick_args_t *data = NULL;
 
-	for (i = 0; i < priv->child_count; i++) {				
-		if (local->child_up[i]) {
-			STACK_WIND_COOKIE (frame, afr_writev_wind_cbk, 
-					   (void *) (long) i,	
-					   priv->children[i], 
-					   priv->children[i]->fops->writev,
-					   local->fd, 
-					   local->cont.writev.vector,
-					   local->cont.writev.count, 
-					   local->cont.writev.offset); 
-		
-			if (!--call_count)
-				break;
-		}
-	}
-	
-	return 0;
+    data = opaque;
+    loc_wipe(&data->loc);
+    GF_FREE(data);
 }
 
-
 int
-afr_writev_done (call_frame_t *frame, xlator_t *this)
+_afr_handle_empty_brick_cbk(int ret, call_frame_t *frame, void *opaque)
 {
-	afr_local_t *local = NULL;
-
-	local = frame->local;
-
-	if (local->cont.writev.refs)
-		dict_unref (local->cont.writev.refs);
-	local->cont.writev.refs = NULL;
-
-	local->transaction.unwind (frame, this);
-
-	AFR_STACK_DESTROY (frame);
-
-	return 0;
+    afr_brick_args_cleanup(opaque);
+    return 0;
 }
 
-
 int
-afr_writev (call_frame_t *frame, xlator_t *this, fd_t *fd, 
-	    struct iovec *vector, int32_t count, off_t offset)
+_afr_handle_empty_brick(void *opaque)
 {
-	afr_private_t * priv  = NULL;
-	afr_local_t   * local = NULL;
-	call_frame_t   *transaction_frame = NULL;
-
-	int ret = -1;
-
-	int op_ret   = -1;
-	int op_errno = 0;
-
-	VALIDATE_OR_GOTO (frame, out);
-	VALIDATE_OR_GOTO (this, out);
-	VALIDATE_OR_GOTO (this->private, out);
-
-	priv = this->private;
+    afr_local_t *local = NULL;
+    afr_private_t *priv = NULL;
+    int empty_index = -1;
+    int ret = -1;
+    int op_errno = ENOMEM;
+    call_frame_t *frame = NULL;
+    xlator_t *this = NULL;
+    char *op_type = NULL;
+    int op_type_len = 0;
+    afr_empty_brick_args_t *data = NULL;
+    call_frame_t *op_frame = NULL;
+
+    data = opaque;
+    frame = data->frame;
+    empty_index = data->empty_index;
+    if (!data->op_type)
+        goto out;
+
+    op_frame = copy_frame(frame);
+    if (!op_frame) {
+        ret = -1;
+        op_errno = ENOMEM;
+        goto out;
+    }
+
+    op_type = data->op_type;
+    op_type_len = strlen(op_type);
+    this = op_frame->this;
+    priv = this->private;
+
+    afr_set_lk_owner(op_frame, this, op_frame->root);
+    local = AFR_FRAME_INIT(op_frame, op_errno);
+    if (!local)
+        goto out;
+
+    loc_copy(&local->loc, &data->loc);
+
+    gf_smsg(this->name, GF_LOG_INFO, 0, AFR_MSG_NEW_BRICK, "name=%s",
+            priv->children[empty_index]->name, NULL);
+
+    ret = _afr_handle_empty_brick_type(this, op_frame, &local->loc, empty_index,
+                                       AFR_METADATA_TRANSACTION, op_type,
+                                       op_type_len);
+    if (ret) {
+        op_errno = -ret;
+        ret = -1;
+        goto out;
+    }
+
+    dict_unref(local->xdata_req);
+    dict_unref(local->xattr_req);
+    afr_matrix_cleanup(local->pending, priv->child_count);
+    local->pending = NULL;
+    local->xattr_req = NULL;
+    local->xdata_req = NULL;
+
+    ret = _afr_handle_empty_brick_type(this, op_frame, &local->loc, empty_index,
+                                       AFR_ENTRY_TRANSACTION, op_type,
+                                       op_type_len);
+    if (ret) {
+        op_errno = -ret;
+        ret = -1;
+        goto out;
+    }
+    ret = 0;
+out:
+    if (op_frame) {
+        AFR_STACK_DESTROY(op_frame);
+    }
+    AFR_STACK_UNWIND(setxattr, frame, ret, op_errno, NULL);
+    return 0;
+}
 
-	transaction_frame = copy_frame (frame);
-	if (!transaction_frame) {
-		gf_log (this->name, GF_LOG_ERROR,
-			"out of memory :(");
-		goto out;
-	}
+/*
+ * Start a split-brain heal of 'loc' using the brick named by 'data' as the
+ * source.
+ *
+ * Builds local->xdata_req with "heal-op" set to
+ * GF_SHD_OP_SBRAIN_HEAL_FROM_BRICK and "child-name" set to 'data', resets the
+ * inode's split-brain choice to -1 and calls afr_heal_splitbrain_file().
+ *
+ * If the request dict cannot be built, the frame is unwound here as a failed
+ * setxattr with op_errno (ENOMEM for the allocation, otherwise the negated
+ * dict error).
+ *
+ * Always returns 0.
+ */
+int
+afr_split_brain_resolve_do(call_frame_t *frame, xlator_t *this, loc_t *loc,
+                           char *data)
+{
+    afr_local_t *local = frame->local;
+    int op_errno = EINVAL;
+    int rc = 0;
+
+    local->xdata_req = dict_new();
+    if (local->xdata_req == NULL) {
+        op_errno = ENOMEM;
+        goto fail;
+    }
+
+    rc = dict_set_int32_sizen(local->xdata_req, "heal-op",
+                              GF_SHD_OP_SBRAIN_HEAL_FROM_BRICK);
+    if (rc == 0)
+        rc = dict_set_str_sizen(local->xdata_req, "child-name", data);
+    if (rc != 0) {
+        op_errno = -rc;
+        goto fail;
+    }
+
+    /* The split-brain choice is reset to -1 regardless of how the heal
+     * turns out:
+     *  - heal succeeds: the file is no longer in split-brain, so the
+     *    choice is no longer valid;
+     *  - heal fails: a leftover choice would keep serving reads from
+     *    that brick, which is misleading.
+     */
+    if (afr_inode_split_brain_choice_set(loc->inode, this, -1))
+        gf_smsg(this->name, GF_LOG_WARNING, 0, AFR_MSG_SPLIT_BRAIN_SET_FAILED,
+                NULL);
+
+    afr_heal_splitbrain_file(frame, this, loc);
+    return 0;
+
+fail:
+    AFR_STACK_UNWIND(setxattr, frame, -1, op_errno, NULL);
+    return 0;
+}
 
-	ALLOC_OR_GOTO (local, afr_local_t, out);
+int
+afr_get_split_brain_child_index(xlator_t *this, void *value, size_t len)
+{
+    int spb_child_index = -1;
+    char *spb_child_str = NULL;
 
-	ret = AFR_LOCAL_INIT (local, priv);
-	if (ret < 0) {
-		op_errno = -ret;
-		goto out;
-	}
+    spb_child_str = alloca0(len + 1);
+    memcpy(spb_child_str, value, len);
 
-	transaction_frame->local = local;
+    if (!strcmp(spb_child_str, "none"))
+        return -2;
 
-	local->op = GF_FOP_WRITE;
-	local->cont.writev.vector  = iov_dup (vector, count);
-	local->cont.writev.count   = count;
-	local->cont.writev.offset  = offset;
-	local->cont.writev.ino     = fd->inode->ino;
+    spb_child_index = afr_get_child_index_from_name(this, spb_child_str);
+    if (spb_child_index < 0) {
+        gf_smsg(this->name, GF_LOG_ERROR, 0, AFR_MSG_INVALID_SUBVOL,
+                "subvol=%s", spb_child_str, NULL);
+    }
+    return spb_child_index;
+}
 
-	if (frame->root->req_refs)
-		local->cont.writev.refs = dict_ref (frame->root->req_refs);
+/*
+ * Query the split-brain state of an inode.
+ *
+ * 'opaque' is an afr_spbc_timeout_t. Its frame and loc are handed to
+ * afr_is_split_brain(), which fills in the d_spb and m_spb fields of the same
+ * structure. A non-zero result is logged together with the gfid.
+ *
+ * Returns whatever afr_is_split_brain() returned.
+ */
+int
+afr_can_set_split_brain_choice(void *opaque)
+{
+    afr_spbc_timeout_t *data = opaque;
+    call_frame_t *frame = data->frame;
+    loc_t *loc = data->loc;
+    xlator_t *this = frame->this;
+    int ret = -1;
+
+    ret = afr_is_split_brain(frame, this, loc->inode, loc->gfid, &data->d_spb,
+                             &data->m_spb);
+    if (ret == 0)
+        return ret;
+
+    gf_smsg(this->name, GF_LOG_ERROR, 0, AFR_MSG_SPLIT_BRAIN_DETERMINE_FAILED,
+            "gfid=%s", uuid_utoa(loc->gfid), NULL);
+    return ret;
+}
 
-	local->transaction.fop    = afr_writev_wind;
-	local->transaction.done   = afr_writev_done;
-	local->transaction.unwind = afr_writev_unwind;
+int
+afr_handle_split_brain_commands(xlator_t *this, call_frame_t *frame, loc_t *loc,
+                                dict_t *dict)
+{
+    void *choice_value = NULL;
+    void *resolve_value = NULL;
+    afr_private_t *priv = NULL;
+    afr_local_t *local = NULL;
+    afr_spbc_timeout_t *data = NULL;
+    int len = 0;
+    int spb_child_index = -1;
+    int ret = -1;
+    int op_errno = EINVAL;
+
+    priv = this->private;
+
+    ret = dict_get_ptr_and_len(dict, GF_AFR_SBRAIN_CHOICE, &choice_value, &len);
+    ret = dict_get_ptr_and_len(dict, GF_AFR_SBRAIN_RESOLVE, &resolve_value,
+                               &len);
+    if (!choice_value && !resolve_value) {
+        ret = -1;
+        goto out;
+    }
+
+    local = AFR_FRAME_INIT(frame, op_errno);
+    if (!local) {
+        ret = 1;
+        goto out;
+    }
+
+    local->op = GF_FOP_SETXATTR;
+
+    if (choice_value) {
+        spb_child_index = afr_get_split_brain_child_index(this, choice_value,
+                                                          len);
+        if (spb_child_index < 0) {
+            /* Case where value was "none" */
+            if (spb_child_index == -2)
+                spb_child_index = -1;
+            else {
+                ret = 1;
+                op_errno = EINVAL;
+                goto out;
+            }
+        }
+
+        data = GF_CALLOC(1, sizeof(*data), gf_afr_mt_spbc_timeout_t);
+        if (!data) {
+            ret = 1;
+            goto out;
+        }
+        data->spb_child_index = spb_child_index;
+        data->frame = frame;
+        loc_copy(&local->loc, loc);
+        data->loc = &local->loc;
+        ret = synctask_new(this->ctx->env, afr_can_set_split_brain_choice,
+                           afr_set_split_brain_choice, NULL, data);
+        if (ret) {
+            gf_smsg(this->name, GF_LOG_ERROR, 0, AFR_MSG_SPLIT_BRAIN_STATUS,
+                    "name=%s", loc->name, NULL);
+            ret = 1;
+            op_errno = ENOMEM;
+            goto out;
+        }
+        ret = 0;
+        goto out;
+    }
+
+    if (resolve_value) {
+        spb_child_index = afr_get_split_brain_child_index(this, resolve_value,
+                                                          len);
+        if (spb_child_index < 0) {
+            ret = 1;
+            goto out;
+        }
+
+        afr_split_brain_resolve_do(frame, this, loc,
+                                   priv->children[spb_child_index]->name);
+        ret = 0;
+    }
+out:
+    /* key was correct but value was invalid when ret == 1 */
+    if (ret == 1) {
+        AFR_STACK_UNWIND(setxattr, frame, -1, op_errno, NULL);
+        if (data)
+            GF_FREE(data);
+        ret = 0;
+    }
+    return ret;
+}
 
-	local->fd                = fd_ref (fd);
+int
+afr_handle_spb_choice_timeout(xlator_t *this, call_frame_t *frame, dict_t *dict)
+{
+    int ret = -1;
+    int op_errno = 0;
+    uint64_t timeout = 0;
+    afr_private_t *priv = NULL;
 
-	local->transaction.main_frame = frame;
-	if (fd->flags & O_APPEND) {
-		local->transaction.start   = 0;
-		local->transaction.len     = 0;
-	} else {
-		local->transaction.start   = offset;
-		local->transaction.len     = iov_length (vector, count);
-	}
+    priv = this->private;
 
-	local->transaction.pending = AFR_DATA_PENDING;
+    ret = dict_get_uint64(dict, GF_AFR_SPB_CHOICE_TIMEOUT, &timeout);
+    if (!ret) {
+        priv->spb_choice_timeout = timeout * 60;
+        AFR_STACK_UNWIND(setxattr, frame, ret, op_errno, NULL);
+    }
 
-	afr_transaction (transaction_frame, this, AFR_DATA_TRANSACTION);
+    return ret;
+}
 
-	op_ret = 0;
+int
+afr_handle_empty_brick(xlator_t *this, call_frame_t *frame, loc_t *loc,
+                       dict_t *dict)
+{
+    int ret = -1;
+    int ab_ret = -1;
+    int empty_index = -1;
+    int op_errno = EPERM;
+    char *empty_brick = NULL;
+    char *op_type = NULL;
+    afr_empty_brick_args_t *data = NULL;
+
+    ret = dict_get_str_sizen(dict, GF_AFR_REPLACE_BRICK, &empty_brick);
+    if (!ret)
+        op_type = GF_AFR_REPLACE_BRICK;
+
+    ab_ret = dict_get_str_sizen(dict, GF_AFR_ADD_BRICK, &empty_brick);
+    if (!ab_ret)
+        op_type = GF_AFR_ADD_BRICK;
+
+    if (ret && ab_ret)
+        goto out;
+
+    if (frame->root->pid != GF_CLIENT_PID_ADD_REPLICA_MOUNT) {
+        gf_smsg(this->name, GF_LOG_ERROR, EPERM, AFR_MSG_INTERNAL_ATTR,
+                "op_type=%s", op_type, NULL);
+        ret = 1;
+        goto out;
+    }
+    empty_index = afr_get_child_index_from_name(this, empty_brick);
+
+    if (empty_index < 0) {
+        /* Didn't belong to this replica pair
+         * Just do a no-op
+         */
+        AFR_STACK_UNWIND(setxattr, frame, 0, 0, NULL);
+        return 0;
+    } else {
+        data = GF_CALLOC(1, sizeof(*data), gf_afr_mt_empty_brick_t);
+        if (!data) {
+            ret = 1;
+            op_errno = ENOMEM;
+            goto out;
+        }
+        data->frame = frame;
+        loc_copy(&data->loc, loc);
+        data->empty_index = empty_index;
+        data->op_type = op_type;
+        ret = synctask_new(this->ctx->env, _afr_handle_empty_brick,
+                           _afr_handle_empty_brick_cbk, NULL, data);
+        if (ret) {
+            gf_smsg(this->name, GF_LOG_ERROR, 0, AFR_MSG_SPLIT_BRAIN_STATUS,
+                    NULL);
+            ret = 1;
+            op_errno = ENOMEM;
+            afr_brick_args_cleanup(data);
+            goto out;
+        }
+    }
+    ret = 0;
 out:
-	if (op_ret == -1) {
-		if (transaction_frame)
-			AFR_STACK_DESTROY (transaction_frame);
-		AFR_STACK_UNWIND (frame, op_ret, op_errno, NULL);
-	}
-
-	return 0;
+    if (ret == 1) {
+        AFR_STACK_UNWIND(setxattr, frame, -1, op_errno, NULL);
+        ret = 0;
+    }
+    return ret;
 }
 
-
-/* }}} */
-
-/* {{{ truncate */
-
-int
-afr_truncate_unwind (call_frame_t *frame, xlator_t *this)
+static int
+afr_handle_special_xattr(xlator_t *this, call_frame_t *frame, loc_t *loc,
+                         dict_t *dict)
 {
-	afr_local_t *   local = NULL;
-	afr_private_t * priv  = NULL;
-	call_frame_t   *main_frame = NULL;
+    int ret = -1;
 
-	local = frame->local;
-	priv  = this->private;
+    ret = afr_handle_split_brain_commands(this, frame, loc, dict);
+    if (ret == 0)
+        goto out;
 
-	LOCK (&frame->lock);
-	{
-		if (local->transaction.main_frame)
-			main_frame = local->transaction.main_frame;
-		local->transaction.main_frame = NULL;
-	}
-	UNLOCK (&frame->lock);
+    ret = afr_handle_spb_choice_timeout(this, frame, dict);
+    if (ret == 0)
+        goto out;
 
-	if (main_frame) {
-		local->cont.truncate.buf.st_ino = local->cont.truncate.ino;
-		AFR_STACK_UNWIND (main_frame, local->op_ret, local->op_errno,
-				  &local->cont.truncate.buf);
-	}
-	return 0;
+    /* Applicable for replace-brick and add-brick commands */
+    ret = afr_handle_empty_brick(this, frame, loc, dict);
+out:
+    return ret;
 }
 
-
 int
-afr_truncate_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this, 
-		       int32_t op_ret, int32_t op_errno, struct stat *buf)
+afr_setxattr(call_frame_t *frame, xlator_t *this, loc_t *loc, dict_t *dict,
+             int32_t flags, dict_t *xdata)
 {
-	afr_local_t *   local = NULL;
-	afr_private_t * priv  = NULL;
-
-	int child_index = (long) cookie;
-	int call_count  = -1;
-	int need_unwind = 0;
+    afr_local_t *local = NULL;
+    call_frame_t *transaction_frame = NULL;
+    int ret = -1;
+    int op_errno = EINVAL;
 
-	local = frame->local;
-	priv  = this->private;
+    GF_IF_INTERNAL_XATTR_GOTO("trusted.afr.*", dict, op_errno, out);
 
-	LOCK (&frame->lock);
-	{
-		if (child_went_down (op_ret, op_errno))
-			afr_transaction_child_died (frame, this, child_index);
+    GF_IF_INTERNAL_XATTR_GOTO("trusted.glusterfs.afr.*", dict, op_errno, out);
 
-		if (op_ret != -1) {
-			if (local->success_count == 0) {
-				local->op_ret = op_ret;
-				local->cont.truncate.buf = *buf;
-			}
-			local->success_count++;
+    ret = afr_handle_special_xattr(this, frame, loc, dict);
+    if (ret == 0)
+        return 0;
 
-			if (local->success_count == priv->wait_count) {
-				need_unwind = 1;
-			}
-		}
-		local->op_errno = op_errno;
-	}
-	UNLOCK (&frame->lock);
+    transaction_frame = copy_frame(frame);
+    if (!transaction_frame)
+        goto out;
 
-	if (need_unwind)
-		local->transaction.unwind (frame, this);
+    local = AFR_FRAME_INIT(transaction_frame, op_errno);
+    if (!local)
+        goto out;
 
-	call_count = afr_frame_return (frame);
+    local->cont.setxattr.dict = dict_ref(dict);
+    local->cont.setxattr.flags = flags;
+    if (xdata)
+        local->xdata_req = dict_copy_with_ref(xdata, NULL);
+    else
+        local->xdata_req = dict_new();
 
-	if (call_count == 0) {
-		local->transaction.resume (frame, this);
-	}
-	
-	return 0;
-}
+    if (!local->xdata_req)
+        goto out;
 
+    local->transaction.wind = afr_setxattr_wind;
+    local->transaction.unwind = afr_setxattr_unwind;
 
-int32_t
-afr_truncate_wind (call_frame_t *frame, xlator_t *this)
-{
-	afr_local_t *local = NULL;
-	afr_private_t *priv = NULL;
-	
-	int call_count = -1;
-	int i = 0;
+    loc_copy(&local->loc, loc);
+    ret = afr_set_inode_local(this, local, loc->inode);
+    if (ret)
+        goto out;
 
-	local = frame->local;
-	priv = this->private;
+    local->transaction.main_frame = frame;
+    local->transaction.start = LLONG_MAX - 1;
+    local->transaction.len = 0;
 
-	call_count = afr_up_children_count (priv->child_count, local->child_up);
+    local->op = GF_FOP_SETXATTR;
 
-	if (call_count == 0) {
-		local->transaction.resume (frame, this);
-		return 0;
-	}
+    ret = afr_transaction(transaction_frame, this, AFR_METADATA_TRANSACTION);
+    if (ret < 0) {
+        op_errno = -ret;
+        goto out;
+    }
 
-	local->call_count = call_count;
+    return 0;
+out:
+    if (transaction_frame)
+        AFR_STACK_DESTROY(transaction_frame);
 
-	for (i = 0; i < priv->child_count; i++) {				
-		if (local->child_up[i]) {
-			STACK_WIND_COOKIE (frame, afr_truncate_wind_cbk,
-					   (void *) (long) i,	
-					   priv->children[i], 
-					   priv->children[i]->fops->truncate,
-					   &local->loc, 
-					   local->cont.truncate.offset);
+    AFR_STACK_UNWIND(setxattr, frame, -1, op_errno, NULL);
 
-			if (!--call_count)
-				break;
-		}
-	}
-	
-	return 0;
+    return 0;
 }
 
+/* {{{ fsetxattr */
 
 int
-afr_truncate_done (call_frame_t *frame, xlator_t *this)
+afr_fsetxattr_unwind(call_frame_t *frame, xlator_t *this)
 {
-	afr_local_t *local = NULL;
-
-	local = frame->local;
+    afr_local_t *local = NULL;
+    call_frame_t *main_frame = NULL;
 
-	local->transaction.unwind (frame, this);
+    local = frame->local;
 
-	AFR_STACK_DESTROY (frame);
+    main_frame = afr_transaction_detach_fop_frame(frame);
+    if (!main_frame)
+        return 0;
 
-	return 0;
+    AFR_STACK_UNWIND(fsetxattr, main_frame, local->op_ret, local->op_errno,
+                     local->xdata_rsp);
+    return 0;
 }
 
-
 int
-afr_truncate (call_frame_t *frame, xlator_t *this,
-	      loc_t *loc, off_t offset)
+afr_fsetxattr_wind_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+                       int32_t op_ret, int32_t op_errno, dict_t *xdata)
 {
-	afr_private_t * priv  = NULL;
-	afr_local_t   * local = NULL;
-	call_frame_t   *transaction_frame = NULL;
+    return __afr_inode_write_cbk(frame, cookie, this, op_ret, op_errno, NULL,
+                                 NULL, NULL, xdata);
+}
 
-	int ret = -1;
+int
+afr_fsetxattr_wind(call_frame_t *frame, xlator_t *this, int subvol)
+{
+    afr_local_t *local = NULL;
+    afr_private_t *priv = NULL;
+
+    local = frame->local;
+    priv = this->private;
+
+    STACK_WIND_COOKIE(frame, afr_fsetxattr_wind_cbk, (void *)(long)subvol,
+                      priv->children[subvol],
+                      priv->children[subvol]->fops->fsetxattr, local->fd,
+                      local->cont.fsetxattr.dict, local->cont.fsetxattr.flags,
+                      local->xdata_req);
+    return 0;
+}
 
-	int op_ret   = -1;
-	int op_errno = 0;
+int
+afr_fsetxattr(call_frame_t *frame, xlator_t *this, fd_t *fd, dict_t *dict,
+              int32_t flags, dict_t *xdata)
+{
+    afr_local_t *local = NULL;
+    call_frame_t *transaction_frame = NULL;
+    int ret = -1;
+    int op_errno = ENOMEM;
 
-	VALIDATE_OR_GOTO (frame, out);
-	VALIDATE_OR_GOTO (this, out);
-	VALIDATE_OR_GOTO (this->private, out);
+    GF_IF_INTERNAL_XATTR_GOTO("trusted.afr.*", dict, op_errno, out);
 
-	priv = this->private;
+    GF_IF_INTERNAL_XATTR_GOTO("trusted.glusterfs.afr.*", dict, op_errno, out);
 
-	transaction_frame = copy_frame (frame);
-	if (!transaction_frame) {
-		gf_log (this->name, GF_LOG_ERROR,
-			"out of memory :(");
-		goto out;
-	}
+    AFR_ERROR_OUT_IF_FDCTX_INVALID(fd, this, op_errno, out);
+    transaction_frame = copy_frame(frame);
+    if (!transaction_frame)
+        goto out;
 
-	ALLOC_OR_GOTO (local, afr_local_t, out);
+    local = AFR_FRAME_INIT(transaction_frame, op_errno);
+    if (!local)
+        goto out;
 
-	ret = AFR_LOCAL_INIT (local, priv);
-	if (ret < 0) {
-		op_errno = -ret;
-		goto out;
-	}
+    local->cont.fsetxattr.dict = dict_ref(dict);
+    local->cont.fsetxattr.flags = flags;
 
-	transaction_frame->local = local;
+    if (xdata)
+        local->xdata_req = dict_copy_with_ref(xdata, NULL);
+    else
+        local->xdata_req = dict_new();
 
-	local->op_ret = -1;
+    if (!local->xdata_req)
+        goto out;
 
-	local->cont.truncate.offset  = offset;
-	local->cont.truncate.ino     = loc->inode->ino;
+    local->transaction.wind = afr_fsetxattr_wind;
+    local->transaction.unwind = afr_fsetxattr_unwind;
 
-	local->transaction.fop    = afr_truncate_wind;
-	local->transaction.done   = afr_truncate_done;
-	local->transaction.unwind = afr_truncate_unwind;
+    local->fd = fd_ref(fd);
+    ret = afr_set_inode_local(this, local, fd->inode);
+    if (ret)
+        goto out;
 
-	loc_copy (&local->loc, loc);
+    local->op = GF_FOP_FSETXATTR;
 
-	local->transaction.main_frame = frame;
-	local->transaction.start   = 0;
-	local->transaction.len     = offset;
-	local->transaction.pending = AFR_DATA_PENDING;
+    local->transaction.main_frame = frame;
+    local->transaction.start = LLONG_MAX - 1;
+    local->transaction.len = 0;
 
-	afr_transaction (transaction_frame, this, AFR_DATA_TRANSACTION);
+    ret = afr_transaction(transaction_frame, this, AFR_METADATA_TRANSACTION);
+    if (ret < 0) {
+        op_errno = -ret;
+        goto out;
+    }
 
-	op_ret = 0;
+    return 0;
 out:
-	if (op_ret == -1) {
-		if (transaction_frame)
-			AFR_STACK_DESTROY (transaction_frame);
-		AFR_STACK_UNWIND (frame, op_ret, op_errno, NULL);
-	}
+    if (transaction_frame)
+        AFR_STACK_DESTROY(transaction_frame);
 
-	return 0;
+    AFR_STACK_UNWIND(fsetxattr, frame, -1, op_errno, NULL);
+    return 0;
 }
 
-
 /* }}} */
 
-/* {{{ ftruncate */
-
-
-int
-afr_ftruncate_unwind (call_frame_t *frame, xlator_t *this)
-{
-	afr_local_t *   local = NULL;
-	afr_private_t * priv  = NULL;
-	call_frame_t   *main_frame = NULL;
-
-	local = frame->local;
-	priv  = this->private;
-
-	LOCK (&frame->lock);
-	{
-		if (local->transaction.main_frame)
-			main_frame = local->transaction.main_frame;
-		local->transaction.main_frame = NULL;
-	}
-	UNLOCK (&frame->lock);
-
-	if (main_frame) {
-		local->cont.ftruncate.buf.st_ino = local->cont.ftruncate.ino;
-		AFR_STACK_UNWIND (main_frame, local->op_ret, local->op_errno,
-				  &local->cont.ftruncate.buf);
-	}
-	return 0;
-}
-
+/* {{{ removexattr */
 
 int
-afr_ftruncate_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this, 
-			int32_t op_ret, int32_t op_errno, struct stat *buf)
+afr_removexattr_unwind(call_frame_t *frame, xlator_t *this)
 {
-	afr_local_t *   local = NULL;
-	afr_private_t * priv  = NULL;
-
-	int child_index = (long) cookie;
-	int call_count  = -1;
-	int need_unwind = 0;
-
-	local = frame->local;
-	priv  = this->private;
-
-	LOCK (&frame->lock);
-	{
-		if (child_went_down (op_ret, op_errno))
-			afr_transaction_child_died (frame, this, child_index);
-
-		if (op_ret != -1) {
-			if (local->success_count == 0) {
-				local->op_ret = op_ret;
-				local->cont.ftruncate.buf = *buf;
-			}
-			local->success_count++;
+    afr_local_t *local = NULL;
+    call_frame_t *main_frame = NULL;
 
-			if (local->success_count == priv->wait_count) {
-				need_unwind = 1;
-			}
-		}
-		local->op_errno = op_errno;
-	}
-	UNLOCK (&frame->lock);
+    local = frame->local;
 
-	if (need_unwind)
-		local->transaction.unwind (frame, this);
+    main_frame = afr_transaction_detach_fop_frame(frame);
+    if (!main_frame)
+        return 0;
 
-	call_count = afr_frame_return (frame);
-
-	if (call_count == 0) {
-		local->transaction.resume (frame, this);
-	}
-	
-	return 0;
+    AFR_STACK_UNWIND(removexattr, main_frame, local->op_ret, local->op_errno,
+                     local->xdata_rsp);
+    return 0;
 }
 
-
 int
-afr_ftruncate_wind (call_frame_t *frame, xlator_t *this)
+afr_removexattr_wind_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+                         int32_t op_ret, int32_t op_errno, dict_t *xdata)
 {
-	afr_local_t *local = NULL;
-	afr_private_t *priv = NULL;
-	
-	int call_count = -1;
-	int i = 0;
-
-	local = frame->local;
-	priv = this->private;
-
-	call_count = afr_up_children_count (priv->child_count, local->child_up);
-
-	if (call_count == 0) {
-		local->transaction.resume (frame, this);
-		return 0;
-	}
-
-	local->call_count = call_count;
-
-	for (i = 0; i < priv->child_count; i++) {				
-		if (local->child_up[i]) {
-			STACK_WIND_COOKIE (frame, afr_ftruncate_wind_cbk,
-					   (void *) (long) i,	
-					   priv->children[i], 
-					   priv->children[i]->fops->ftruncate,
-					   local->fd, local->cont.ftruncate.offset);
-
-			if (!--call_count)
-				break;
-		}
-	}
-	
-	return 0;
+    return __afr_inode_write_cbk(frame, cookie, this, op_ret, op_errno, NULL,
+                                 NULL, NULL, xdata);
 }
 
-
 int
-afr_ftruncate_done (call_frame_t *frame, xlator_t *this)
+afr_removexattr_wind(call_frame_t *frame, xlator_t *this, int subvol)
 {
-	afr_local_t *local = NULL;
-
-	local = frame->local;
+    afr_local_t *local = NULL;
+    afr_private_t *priv = NULL;
 
-	local->transaction.unwind (frame, this);
+    local = frame->local;
+    priv = this->private;
 
-	AFR_STACK_DESTROY (frame);
-
-	return 0;
+    STACK_WIND_COOKIE(frame, afr_removexattr_wind_cbk, (void *)(long)subvol,
+                      priv->children[subvol],
+                      priv->children[subvol]->fops->removexattr, &local->loc,
+                      local->cont.removexattr.name, local->xdata_req);
+    return 0;
 }
 
-
 int
-afr_ftruncate (call_frame_t *frame, xlator_t *this,
-	       fd_t *fd, off_t offset)
+afr_removexattr(call_frame_t *frame, xlator_t *this, loc_t *loc,
+                const char *name, dict_t *xdata)
 {
-	afr_private_t * priv  = NULL;
-	afr_local_t   * local = NULL;
-	call_frame_t   *transaction_frame = NULL;
-
-	int ret = -1;
+    afr_local_t *local = NULL;
+    call_frame_t *transaction_frame = NULL;
+    int ret = -1;
+    int op_errno = ENOMEM;
 
-	int op_ret   = -1;
-	int op_errno = 0;
+    GF_IF_NATIVE_XATTR_GOTO("trusted.afr.*", name, op_errno, out);
 
-	VALIDATE_OR_GOTO (frame, out);
-	VALIDATE_OR_GOTO (this, out);
-	VALIDATE_OR_GOTO (this->private, out);
+    GF_IF_NATIVE_XATTR_GOTO("trusted.glusterfs.afr.*", name, op_errno, out);
 
-	priv = this->private;
+    transaction_frame = copy_frame(frame);
+    if (!transaction_frame)
+        goto out;
 
-	transaction_frame = copy_frame (frame);
-	if (!transaction_frame) {
-		gf_log (this->name, GF_LOG_ERROR,
-			"out of memory :(");
-		goto out;
-	}
+    local = AFR_FRAME_INIT(transaction_frame, op_errno);
+    if (!local)
+        goto out;
 
-	ALLOC_OR_GOTO (local, afr_local_t, out);
+    local->cont.removexattr.name = gf_strdup(name);
 
-	ret = AFR_LOCAL_INIT (local, priv);
-	if (ret < 0) {
-		op_errno = -ret;
-		goto out;
-	}
+    if (xdata)
+        local->xdata_req = dict_copy_with_ref(xdata, NULL);
+    else
+        local->xdata_req = dict_new();
 
-	transaction_frame->local = local;
+    if (!local->xdata_req)
+        goto out;
 
-	local->op = GF_FOP_FTRUNCATE;
-	local->op_ret = -1;
+    local->transaction.wind = afr_removexattr_wind;
+    local->transaction.unwind = afr_removexattr_unwind;
 
-	local->cont.ftruncate.offset  = offset;
-	local->cont.ftruncate.ino     = fd->inode->ino;
+    loc_copy(&local->loc, loc);
+    ret = afr_set_inode_local(this, local, loc->inode);
+    if (ret)
+        goto out;
 
-	local->transaction.fop    = afr_ftruncate_wind;
-	local->transaction.done   = afr_ftruncate_done;
-	local->transaction.unwind = afr_ftruncate_unwind;
+    local->op = GF_FOP_REMOVEXATTR;
 
-	local->fd = fd_ref (fd);
+    local->transaction.main_frame = frame;
+    local->transaction.start = LLONG_MAX - 1;
+    local->transaction.len = 0;
 
-	local->transaction.main_frame = frame;
-	local->transaction.start   = 0;
-	local->transaction.len     = offset;
-	local->transaction.pending = AFR_DATA_PENDING;
+    ret = afr_transaction(transaction_frame, this, AFR_METADATA_TRANSACTION);
+    if (ret < 0) {
+        op_errno = -ret;
+        goto out;
+    }
 
-	afr_transaction (transaction_frame, this, AFR_DATA_TRANSACTION);
-
-	op_ret = 0;
+    return 0;
 out:
-	if (op_ret == -1) {
-		if (transaction_frame)
-			AFR_STACK_DESTROY (transaction_frame);
-		AFR_STACK_UNWIND (frame, op_ret, op_errno, NULL);
-	}
+    if (transaction_frame)
+        AFR_STACK_DESTROY(transaction_frame);
 
-	return 0;
+    AFR_STACK_UNWIND(removexattr, frame, -1, op_errno, NULL);
+    return 0;
 }
 
-/* }}} */
-
-/* {{{ utimens */
-
-
+/* fremovexattr */
 int
-afr_utimens_unwind (call_frame_t *frame, xlator_t *this)
+afr_fremovexattr_unwind(call_frame_t *frame, xlator_t *this)
 {
-	afr_local_t *   local = NULL;
-	afr_private_t * priv  = NULL;
-	call_frame_t   *main_frame = NULL;
+    afr_local_t *local = NULL;
+    call_frame_t *main_frame = NULL;
 
-	local = frame->local;
-	priv  = this->private;
+    local = frame->local;
 
-	LOCK (&frame->lock);
-	{
-		if (local->transaction.main_frame)
-			main_frame = local->transaction.main_frame;
-		local->transaction.main_frame = NULL;
-	}
-	UNLOCK (&frame->lock);
+    main_frame = afr_transaction_detach_fop_frame(frame);
+    if (!main_frame)
+        return 0;
 
-	if (main_frame) {
-		local->cont.utimens.buf.st_ino = local->cont.utimens.ino;
-		AFR_STACK_UNWIND (main_frame, local->op_ret, local->op_errno,
-				  &local->cont.utimens.buf);
-	}
-	return 0;
+    AFR_STACK_UNWIND(fremovexattr, main_frame, local->op_ret, local->op_errno,
+                     local->xdata_rsp);
+    return 0;
 }
 
+int
+afr_fremovexattr_wind_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+                          int32_t op_ret, int32_t op_errno, dict_t *xdata)
+{
+    return __afr_inode_write_cbk(frame, cookie, this, op_ret, op_errno, NULL,
+                                 NULL, NULL, xdata);
+}
 
 int
-afr_utimens_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this, 
-		      int32_t op_ret, int32_t op_errno, struct stat *buf)
+afr_fremovexattr_wind(call_frame_t *frame, xlator_t *this, int subvol)
 {
-	afr_local_t *   local = NULL;
-	afr_private_t * priv  = NULL;
+    afr_local_t *local = NULL;
+    afr_private_t *priv = NULL;
 
-	int child_index = (long) cookie;
-	int call_count  = -1;
-	int need_unwind = 1;
+    local = frame->local;
+    priv = this->private;
 
-	local = frame->local;
-	priv = this->private;
+    STACK_WIND_COOKIE(frame, afr_fremovexattr_wind_cbk, (void *)(long)subvol,
+                      priv->children[subvol],
+                      priv->children[subvol]->fops->fremovexattr, local->fd,
+                      local->cont.removexattr.name, local->xdata_req);
+    return 0;
+}
 
-	LOCK (&frame->lock);
-	{
-		if (child_went_down (op_ret, op_errno))
-			afr_transaction_child_died (frame, this, child_index);
+int
+afr_fremovexattr(call_frame_t *frame, xlator_t *this, fd_t *fd,
+                 const char *name, dict_t *xdata)
+{
+    afr_local_t *local = NULL;
+    call_frame_t *transaction_frame = NULL;
+    int ret = -1;
+    int op_errno = ENOMEM;
 
-		if (op_ret != -1) {
-			if (local->success_count == 0) {
-				local->op_ret = op_ret;
-				local->cont.utimens.buf = *buf;
-			}
-			local->success_count++;
+    GF_IF_NATIVE_XATTR_GOTO("trusted.afr.*", name, op_errno, out);
 
-			if (local->success_count == priv->wait_count) {
-				need_unwind = 1;
-			}
-		}
+    GF_IF_NATIVE_XATTR_GOTO("trusted.glusterfs.afr.*", name, op_errno, out);
 
-		local->op_errno = op_errno;
-	}
-	UNLOCK (&frame->lock);
+    AFR_ERROR_OUT_IF_FDCTX_INVALID(fd, this, op_errno, out);
+    transaction_frame = copy_frame(frame);
+    if (!transaction_frame)
+        goto out;
 
-	if (need_unwind)
-		local->transaction.unwind (frame, this);
+    local = AFR_FRAME_INIT(transaction_frame, op_errno);
+    if (!local)
+        goto out;
 
-	call_count = afr_frame_return (frame);
+    local->cont.removexattr.name = gf_strdup(name);
+    if (xdata)
+        local->xdata_req = dict_copy_with_ref(xdata, NULL);
+    else
+        local->xdata_req = dict_new();
 
-	if (call_count == 0) {
-		local->transaction.resume (frame, this);
-	}
-	
-	return 0;
-}
+    if (!local->xdata_req)
+        goto out;
 
+    local->transaction.wind = afr_fremovexattr_wind;
+    local->transaction.unwind = afr_fremovexattr_unwind;
 
-int
-afr_utimens_wind (call_frame_t *frame, xlator_t *this)
-{
-	afr_local_t *local = NULL;
-	afr_private_t *priv = NULL;
-	
-	int call_count = -1;
-	int i = 0;
+    local->fd = fd_ref(fd);
+    ret = afr_set_inode_local(this, local, fd->inode);
+    if (ret)
+        goto out;
 
-	local = frame->local;
-	priv = this->private;
+    local->op = GF_FOP_FREMOVEXATTR;
 
-	call_count = afr_up_children_count (priv->child_count, local->child_up);
+    local->transaction.main_frame = frame;
+    local->transaction.start = LLONG_MAX - 1;
+    local->transaction.len = 0;
 
-	if (call_count == 0) {
-		local->transaction.resume (frame, this);
-		return 0;
-	}
+    ret = afr_transaction(transaction_frame, this, AFR_METADATA_TRANSACTION);
+    if (ret < 0) {
+        op_errno = -ret;
+        goto out;
+    }
 
-	local->call_count = call_count;
+    return 0;
+out:
+    if (transaction_frame)
+        AFR_STACK_DESTROY(transaction_frame);
 
-	for (i = 0; i < priv->child_count; i++) {				
-		if (local->child_up[i]) {
-			STACK_WIND_COOKIE (frame, afr_utimens_wind_cbk,
-					   (void *) (long) i,	
-					   priv->children[i], 
-					   priv->children[i]->fops->utimens,
-					   &local->loc, 
-					   local->cont.utimens.tv); 
+    AFR_STACK_UNWIND(fremovexattr, frame, -1, op_errno, NULL);
 
-			if (!--call_count)
-				break;
-		}
-	}
-	
-	return 0;
+    return 0;
 }
 
-
 int
-afr_utimens_done (call_frame_t *frame, xlator_t *this)
+afr_fallocate_unwind(call_frame_t *frame, xlator_t *this)
 {
-	afr_local_t * local = NULL;
+    afr_local_t *local = NULL;
+    call_frame_t *main_frame = NULL;
 
-	local = frame->local;
+    local = frame->local;
 
-	local->transaction.unwind (frame, this);
+    main_frame = afr_transaction_detach_fop_frame(frame);
+    if (!main_frame)
+        return 0;
 
-	AFR_STACK_DESTROY (frame);
-
-	return 0;
+    AFR_STACK_UNWIND(fallocate, main_frame, local->op_ret, local->op_errno,
+                     &local->cont.inode_wfop.prebuf,
+                     &local->cont.inode_wfop.postbuf, local->xdata_rsp);
+    return 0;
 }
 
-
 int
-afr_utimens (call_frame_t *frame, xlator_t *this,
-	     loc_t *loc, struct timespec tv[2])
+afr_fallocate_wind_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+                       int32_t op_ret, int32_t op_errno, struct iatt *prebuf,
+                       struct iatt *postbuf, dict_t *xdata)
 {
-	afr_private_t * priv  = NULL;
-	afr_local_t   * local = NULL;
-	call_frame_t   *transaction_frame = NULL;
+    return __afr_inode_write_cbk(frame, cookie, this, op_ret, op_errno, prebuf,
+                                 postbuf, NULL, xdata);
+}
 
-	int ret = -1;
+int
+afr_fallocate_wind(call_frame_t *frame, xlator_t *this, int subvol)
+{
+    afr_local_t *local = NULL;
+    afr_private_t *priv = NULL;
+
+    local = frame->local;
+    priv = this->private;
+
+    STACK_WIND_COOKIE(frame, afr_fallocate_wind_cbk, (void *)(long)subvol,
+                      priv->children[subvol],
+                      priv->children[subvol]->fops->fallocate, local->fd,
+                      local->cont.fallocate.mode, local->cont.fallocate.offset,
+                      local->cont.fallocate.len, local->xdata_req);
+    return 0;
+}
 
-	int op_ret   = -1;
-	int op_errno = 0;
+int
+afr_fallocate(call_frame_t *frame, xlator_t *this, fd_t *fd, int32_t mode,
+              off_t offset, size_t len, dict_t *xdata)
+{
+    call_frame_t *transaction_frame = NULL;
+    afr_local_t *local = NULL;
+    int ret = -1;
+    int op_errno = ENOMEM;
 
-	VALIDATE_OR_GOTO (frame, out);
-	VALIDATE_OR_GOTO (this, out);
-	VALIDATE_OR_GOTO (this->private, out);
+    AFR_ERROR_OUT_IF_FDCTX_INVALID(fd, this, op_errno, out);
+    transaction_frame = copy_frame(frame);
+    if (!transaction_frame)
+        goto out;
 
-	priv = this->private;
+    local = AFR_FRAME_INIT(transaction_frame, op_errno);
+    if (!local)
+        goto out;
 
-	transaction_frame = copy_frame (frame);
-	if (!transaction_frame) {
-		gf_log (this->name, GF_LOG_ERROR,
-			"out of memory :(");
-		goto out;
-	}
+    local->cont.fallocate.mode = mode;
+    local->cont.fallocate.offset = offset;
+    local->cont.fallocate.len = len;
 
-	ALLOC_OR_GOTO (local, afr_local_t, out);
+    local->fd = fd_ref(fd);
+    ret = afr_set_inode_local(this, local, fd->inode);
+    if (ret)
+        goto out;
 
-	ret = AFR_LOCAL_INIT (local, priv);
-	if (ret < 0) {
-		op_errno = -ret;
-		goto out;
-	}
+    if (xdata)
+        local->xdata_req = dict_copy_with_ref(xdata, NULL);
+    else
+        local->xdata_req = dict_new();
 
-	transaction_frame->local = local;
+    if (!local->xdata_req)
+        goto out;
 
-	local->op_ret = -1;
+    local->op = GF_FOP_FALLOCATE;
 
-	local->cont.utimens.tv[0] = tv[0];
-	local->cont.utimens.tv[1] = tv[1];
+    local->transaction.wind = afr_fallocate_wind;
+    local->transaction.unwind = afr_fallocate_unwind;
 
-	local->cont.utimens.ino  = loc->inode->ino;
+    local->transaction.main_frame = frame;
 
-	local->transaction.fop    = afr_utimens_wind;
-	local->transaction.done   = afr_utimens_done;
-	local->transaction.unwind = afr_utimens_unwind;
+    local->transaction.start = local->cont.fallocate.offset;
+    local->transaction.len = 0;
 
-	loc_copy (&local->loc, loc);
-	
-	local->transaction.main_frame = frame;
-	local->transaction.start   = 0;
-	local->transaction.len     = 0;
-	local->transaction.pending = AFR_METADATA_PENDING;
+    afr_fix_open(fd, this);
 
-	afr_transaction (transaction_frame, this, AFR_METADATA_TRANSACTION);
+    ret = afr_transaction(transaction_frame, this, AFR_DATA_TRANSACTION);
+    if (ret < 0) {
+        op_errno = -ret;
+        goto out;
+    }
 
-	op_ret = 0;
+    return 0;
 out:
-	if (op_ret == -1) {
-		if (transaction_frame)
-			AFR_STACK_DESTROY (transaction_frame);
-		AFR_STACK_UNWIND (frame, op_ret, op_errno, NULL);
-	}
+    if (transaction_frame)
+        AFR_STACK_DESTROY(transaction_frame);
 
-	return 0;
+    AFR_STACK_UNWIND(fallocate, frame, -1, op_errno, NULL, NULL, NULL);
+    return 0;
 }
 
 /* }}} */
 
-/* {{{ setxattr */
-
+/* {{{ discard */
 
 int
-afr_setxattr_unwind (call_frame_t *frame, xlator_t *this)
+afr_discard_unwind(call_frame_t *frame, xlator_t *this)
 {
-	afr_local_t *   local = NULL;
-	afr_private_t * priv  = NULL;
-	call_frame_t   *main_frame = NULL;
+    afr_local_t *local = NULL;
+    call_frame_t *main_frame = NULL;
 
-	local = frame->local;
-	priv  = this->private;
+    local = frame->local;
 
-	LOCK (&frame->lock);
-	{
-		if (local->transaction.main_frame)
-			main_frame = local->transaction.main_frame;
-		local->transaction.main_frame = NULL;
-	}
-	UNLOCK (&frame->lock);
+    main_frame = afr_transaction_detach_fop_frame(frame);
+    if (!main_frame)
+        return 0;
 
-	if (main_frame) {
-		AFR_STACK_UNWIND (main_frame, local->op_ret, local->op_errno)
-	}
-	return 0;
+    AFR_STACK_UNWIND(discard, main_frame, local->op_ret, local->op_errno,
+                     &local->cont.inode_wfop.prebuf,
+                     &local->cont.inode_wfop.postbuf, local->xdata_rsp);
+    return 0;
 }
 
-
 int
-afr_setxattr_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this, 
-		       int32_t op_ret, int32_t op_errno)
+afr_discard_wind_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+                     int32_t op_ret, int32_t op_errno, struct iatt *prebuf,
+                     struct iatt *postbuf, dict_t *xdata)
 {
-	afr_local_t *   local = NULL;
-	afr_private_t * priv  = NULL;
-
-	int call_count  = -1;
-	int need_unwind = 0;
+    return __afr_inode_write_cbk(frame, cookie, this, op_ret, op_errno, prebuf,
+                                 postbuf, NULL, xdata);
+}
 
-	local = frame->local;
-	priv = this->private;
+int
+afr_discard_wind(call_frame_t *frame, xlator_t *this, int subvol)
+{
+    afr_local_t *local = NULL;
+    afr_private_t *priv = NULL;
+
+    local = frame->local;
+    priv = this->private;
+
+    STACK_WIND_COOKIE(frame, afr_discard_wind_cbk, (void *)(long)subvol,
+                      priv->children[subvol],
+                      priv->children[subvol]->fops->discard, local->fd,
+                      local->cont.discard.offset, local->cont.discard.len,
+                      local->xdata_req);
+    return 0;
+}
 
-	LOCK (&frame->lock);
-	{
-		if (op_ret != -1) {
-			if (local->success_count == 0) {
-				local->op_ret = op_ret;
-			}
-			local->success_count++;
+int
+afr_discard(call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset,
+            size_t len, dict_t *xdata)
+{
+    afr_local_t *local = NULL;
+    call_frame_t *transaction_frame = NULL;
+    int ret = -1;
+    int op_errno = ENOMEM;
 
-			if (local->success_count == priv->wait_count) {
-				need_unwind = 1;
-			}
-		}
+    AFR_ERROR_OUT_IF_FDCTX_INVALID(fd, this, op_errno, out);
+    transaction_frame = copy_frame(frame);
+    if (!transaction_frame)
+        goto out;
 
-		local->op_errno = op_errno;
-	}
-	UNLOCK (&frame->lock);
+    local = AFR_FRAME_INIT(transaction_frame, op_errno);
+    if (!local)
+        goto out;
 
-	if (need_unwind)
-		local->transaction.unwind (frame, this);
+    local->cont.discard.offset = offset;
+    local->cont.discard.len = len;
 
-	call_count = afr_frame_return (frame);
+    local->fd = fd_ref(fd);
+    ret = afr_set_inode_local(this, local, fd->inode);
+    if (ret)
+        goto out;
 
-	if (call_count == 0) {
-		local->transaction.resume (frame, this);
-	}
-	
-	return 0;
-}
+    if (xdata)
+        local->xdata_req = dict_copy_with_ref(xdata, NULL);
+    else
+        local->xdata_req = dict_new();
 
+    if (!local->xdata_req)
+        goto out;
 
-int
-afr_setxattr_wind (call_frame_t *frame, xlator_t *this)
-{
-	afr_local_t *local = NULL;
-	afr_private_t *priv = NULL;
+    local->op = GF_FOP_DISCARD;
 
-	int call_count = -1;
-	int i = 0;
+    local->transaction.wind = afr_discard_wind;
+    local->transaction.unwind = afr_discard_unwind;
 
-	local = frame->local;
-	priv = this->private;
+    local->transaction.main_frame = frame;
 
-	call_count = afr_up_children_count (priv->child_count, local->child_up);
+    local->transaction.start = local->cont.discard.offset;
+    local->transaction.len = 0;
 
-	if (call_count == 0) {
-		local->transaction.resume (frame, this);
-		return 0;
-	}
+    afr_fix_open(fd, this);
 
-	local->call_count = call_count;
+    ret = afr_transaction(transaction_frame, this, AFR_DATA_TRANSACTION);
+    if (ret < 0) {
+        op_errno = -ret;
+        goto out;
+    }
 
-	for (i = 0; i < priv->child_count; i++) {				
-		if (local->child_up[i]) {
-			STACK_WIND_COOKIE (frame, afr_setxattr_wind_cbk,
-					   (void *) (long) i,	
-					   priv->children[i], 
-					   priv->children[i]->fops->setxattr,
-					   &local->loc, 
-					   local->cont.setxattr.dict,
-					   local->cont.setxattr.flags); 
+    return 0;
+out:
+    if (transaction_frame)
+        AFR_STACK_DESTROY(transaction_frame);
 
-			if (!--call_count)
-				break;
-		}
-	}
-	
-	return 0;
+    AFR_STACK_UNWIND(discard, frame, -1, op_errno, NULL, NULL, NULL);
+    return 0;
 }
 
+/* {{{ zerofill */
 
 int
-afr_setxattr_done (call_frame_t *frame, xlator_t *this)
+afr_zerofill_unwind(call_frame_t *frame, xlator_t *this)
 {
-	afr_local_t * local = frame->local;
+    afr_local_t *local = NULL;
+    call_frame_t *main_frame = NULL;
 
-	local->transaction.unwind (frame, this);
+    local = frame->local;
 
-	AFR_STACK_DESTROY (frame);
-	
-	return 0;
-}
+    main_frame = afr_transaction_detach_fop_frame(frame);
+    if (!main_frame)
+        return 0;
 
+    AFR_STACK_UNWIND(zerofill, main_frame, local->op_ret, local->op_errno,
+                     &local->cont.inode_wfop.prebuf,
+                     &local->cont.inode_wfop.postbuf, local->xdata_rsp);
+    return 0;
+}
 
 int
-afr_setxattr (call_frame_t *frame, xlator_t *this,
-	      loc_t *loc, dict_t *dict, int32_t flags)
+afr_zerofill_wind_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+                      int32_t op_ret, int32_t op_errno, struct iatt *prebuf,
+                      struct iatt *postbuf, dict_t *xdata)
 {
-	afr_private_t * priv  = NULL;
-	afr_local_t   * local = NULL;
-	call_frame_t   *transaction_frame = NULL;
+    return __afr_inode_write_cbk(frame, cookie, this, op_ret, op_errno, prebuf,
+                                 postbuf, NULL, xdata);
+}
 
-	int ret = -1;
+int
+afr_zerofill_wind(call_frame_t *frame, xlator_t *this, int subvol)
+{
+    afr_local_t *local = NULL;
+    afr_private_t *priv = NULL;
+
+    local = frame->local;
+    priv = this->private;
+
+    STACK_WIND_COOKIE(frame, afr_zerofill_wind_cbk, (void *)(long)subvol,
+                      priv->children[subvol],
+                      priv->children[subvol]->fops->zerofill, local->fd,
+                      local->cont.zerofill.offset, local->cont.zerofill.len,
+                      local->xdata_req);
+    return 0;
+}
 
-	int op_ret   = -1;
-	int op_errno = 0;
+int
+afr_zerofill(call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset,
+             off_t len, dict_t *xdata)
+{
+    afr_local_t *local = NULL;
+    call_frame_t *transaction_frame = NULL;
+    int ret = -1;
+    int op_errno = ENOMEM;
 
-	VALIDATE_OR_GOTO (frame, out);
-	VALIDATE_OR_GOTO (this, out);
-	VALIDATE_OR_GOTO (this->private, out);
+    AFR_ERROR_OUT_IF_FDCTX_INVALID(fd, this, op_errno, out);
+    transaction_frame = copy_frame(frame);
+    if (!transaction_frame)
+        goto out;
 
-	priv = this->private;
+    local = AFR_FRAME_INIT(transaction_frame, op_errno);
+    if (!local)
+        goto out;
 
-	transaction_frame = copy_frame (frame);
-	if (!transaction_frame) {
-		gf_log (this->name, GF_LOG_ERROR,
-			"out of memory :(");
-		goto out;
-	}
+    local->cont.zerofill.offset = offset;
+    local->cont.zerofill.len = len;
 
-	ALLOC_OR_GOTO (local, afr_local_t, out);
+    local->fd = fd_ref(fd);
+    ret = afr_set_inode_local(this, local, fd->inode);
+    if (ret)
+        goto out;
 
-	ret = AFR_LOCAL_INIT (local, priv);
-	if (ret < 0) {
-		op_errno = -ret;
-		goto out;
-	}
+    if (xdata)
+        local->xdata_req = dict_copy_with_ref(xdata, NULL);
+    else
+        local->xdata_req = dict_new();
 
-	transaction_frame->local = local;
+    if (!local->xdata_req)
+        goto out;
 
-	local->op_ret = -1;
+    local->op = GF_FOP_ZEROFILL;
 
-	local->cont.setxattr.dict  = dict_ref (dict);
-	local->cont.setxattr.flags = flags;
+    local->transaction.wind = afr_zerofill_wind;
+    local->transaction.unwind = afr_zerofill_unwind;
 
-	local->transaction.fop    = afr_setxattr_wind;
-	local->transaction.done   = afr_setxattr_done;
-	local->transaction.unwind = afr_setxattr_unwind;
+    local->transaction.main_frame = frame;
 
-	loc_copy (&local->loc, loc);
+    local->transaction.start = local->cont.zerofill.offset;
+    local->transaction.len = len;
 
-	local->transaction.main_frame = frame;
-	local->transaction.start   = 0;
-	local->transaction.len     = 0;
-	local->transaction.pending = AFR_METADATA_PENDING;
+    afr_fix_open(fd, this);
 
-	afr_transaction (transaction_frame, this, AFR_METADATA_TRANSACTION);
+    ret = afr_transaction(transaction_frame, this, AFR_DATA_TRANSACTION);
+    if (ret < 0) {
+        op_errno = -ret;
+        goto out;
+    }
 
-	op_ret = 0;
+    return 0;
 out:
-	if (op_ret == -1) {
-		if (transaction_frame)
-			AFR_STACK_DESTROY (transaction_frame);
-		AFR_STACK_UNWIND (frame, op_ret, op_errno);
-	}
+    if (transaction_frame)
+        AFR_STACK_DESTROY(transaction_frame);
 
-	return 0;
+    AFR_STACK_UNWIND(zerofill, frame, -1, op_errno, NULL, NULL, NULL);
+    return 0;
 }
 
 /* }}} */
 
-/* {{{ removexattr */
-
+int32_t
+afr_xattrop_wind_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+                     int32_t op_ret, int32_t op_errno, dict_t *xattr,
+                     dict_t *xdata)
+{
+    return __afr_inode_write_cbk(frame, cookie, this, op_ret, op_errno, NULL,
+                                 NULL, xattr, xdata);
+}
 
 int
-afr_removexattr_unwind (call_frame_t *frame, xlator_t *this)
+afr_xattrop_wind(call_frame_t *frame, xlator_t *this, int subvol)
 {
-	afr_local_t *   local = NULL;
-	afr_private_t * priv  = NULL;
-	call_frame_t   *main_frame = NULL;
-
-	local = frame->local;
-	priv  = this->private;
-
-	LOCK (&frame->lock);
-	{
-		if (local->transaction.main_frame)
-			main_frame = local->transaction.main_frame;
-		local->transaction.main_frame = NULL;
-	}
-	UNLOCK (&frame->lock);
-
-	if (main_frame) {
-		AFR_STACK_UNWIND (main_frame, local->op_ret, local->op_errno)
-	}
-	return 0;
+    afr_local_t *local = NULL;
+    afr_private_t *priv = NULL;
+
+    local = frame->local;
+    priv = this->private;
+
+    STACK_WIND_COOKIE(frame, afr_xattrop_wind_cbk, (void *)(long)subvol,
+                      priv->children[subvol],
+                      priv->children[subvol]->fops->xattrop, &local->loc,
+                      local->cont.xattrop.optype, local->cont.xattrop.xattr,
+                      local->xdata_req);
+    return 0;
 }
 
-
 int
-afr_removexattr_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this, 
-			  int32_t op_ret, int32_t op_errno)
+afr_xattrop_unwind(call_frame_t *frame, xlator_t *this)
 {
-	afr_local_t *   local = NULL;
-	afr_private_t * priv  = NULL;
-
-	int call_count  = -1;
-	int need_unwind = 0;
-
-	local = frame->local;
-	priv = this->private;
-
-	LOCK (&frame->lock);
-	{
-		if (op_ret != -1) {
-			if (local->success_count == 0) {
-				local->op_ret = op_ret;
-			}
-			local->success_count++;
+    afr_local_t *local = NULL;
+    call_frame_t *main_frame = NULL;
 
-			if (local->success_count == priv->wait_count) {
-				need_unwind = 1;
-			}
-		}
+    local = frame->local;
 
-		local->op_errno = op_errno;
-	}
-	UNLOCK (&frame->lock);
+    main_frame = afr_transaction_detach_fop_frame(frame);
+    if (!main_frame)
+        return 0;
 
-	if (need_unwind)
-		local->transaction.unwind (frame, this);
-
-	call_count = afr_frame_return (frame);
-
-	if (call_count == 0) {
-		local->transaction.resume (frame, this);
-	}
-	
-	return 0;
+    AFR_STACK_UNWIND(xattrop, main_frame, local->op_ret, local->op_errno,
+                     local->xattr_rsp, local->xdata_rsp);
+    return 0;
 }
 
-
 int32_t
-afr_removexattr_wind (call_frame_t *frame, xlator_t *this)
+afr_xattrop(call_frame_t *frame, xlator_t *this, loc_t *loc,
+            gf_xattrop_flags_t optype, dict_t *xattr, dict_t *xdata)
 {
-	afr_local_t *local = NULL;
-	afr_private_t *priv = NULL;
+    afr_local_t *local = NULL;
+    call_frame_t *transaction_frame = NULL;
+    int ret = -1;
+    int op_errno = ENOMEM;
 
-	int call_count = -1;
-	int i = 0;
+    transaction_frame = copy_frame(frame);
+    if (!transaction_frame)
+        goto out;
 
-	local = frame->local;
-	priv = this->private;
+    local = AFR_FRAME_INIT(transaction_frame, op_errno);
+    if (!local)
+        goto out;
 
-	call_count = afr_up_children_count (priv->child_count, local->child_up);
+    local->cont.xattrop.xattr = dict_ref(xattr);
+    local->cont.xattrop.optype = optype;
+    if (xdata)
+        local->xdata_req = dict_ref(xdata);
 
-	if (call_count == 0) {
-		local->transaction.resume (frame, this);
-		return 0;
-	}
+    local->transaction.wind = afr_xattrop_wind;
+    local->transaction.unwind = afr_xattrop_unwind;
 
-	local->call_count = call_count;
+    loc_copy(&local->loc, loc);
+    ret = afr_set_inode_local(this, local, loc->inode);
+    if (ret)
+        goto out;
 
-	for (i = 0; i < priv->child_count; i++) {				
-		if (local->child_up[i]) {
-			STACK_WIND_COOKIE (frame, afr_removexattr_wind_cbk, 
-					   (void *) (long) i,	
-					   priv->children[i], 
-					   priv->children[i]->fops->removexattr,
-					   &local->loc, 
-					   local->cont.removexattr.name);
+    local->op = GF_FOP_XATTROP;
 
-			if (!--call_count)
-				break;
-		}
-	}
-	
-	return 0;
-}
+    local->transaction.main_frame = frame;
+    local->transaction.start = LLONG_MAX - 1;
+    local->transaction.len = 0;
 
+    ret = afr_transaction(transaction_frame, this, AFR_METADATA_TRANSACTION);
+    if (ret < 0) {
+        op_errno = -ret;
+        goto out;
+    }
 
-int
-afr_removexattr_done (call_frame_t *frame, xlator_t *this)
-{
-	afr_local_t * local = frame->local;
+    return 0;
+out:
+    if (transaction_frame)
+        AFR_STACK_DESTROY(transaction_frame);
 
-	local->transaction.unwind (frame, this);
+    AFR_STACK_UNWIND(xattrop, frame, -1, op_errno, NULL, NULL);
+    return 0;
+}
 
-	AFR_STACK_DESTROY (frame);
-	
-	return 0;
+int32_t
+afr_fxattrop_wind_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+                      int32_t op_ret, int32_t op_errno, dict_t *xattr,
+                      dict_t *xdata)
+{
+    return __afr_inode_write_cbk(frame, cookie, this, op_ret, op_errno, NULL,
+                                 NULL, xattr, xdata);
 }
 
+int
+afr_fxattrop_wind(call_frame_t *frame, xlator_t *this, int subvol)
+{
+    afr_local_t *local = NULL;
+    afr_private_t *priv = NULL;
+
+    local = frame->local;
+    priv = this->private;
+
+    STACK_WIND_COOKIE(frame, afr_fxattrop_wind_cbk, (void *)(long)subvol,
+                      priv->children[subvol],
+                      priv->children[subvol]->fops->fxattrop, local->fd,
+                      local->cont.xattrop.optype, local->cont.xattrop.xattr,
+                      local->xdata_req);
+    return 0;
+}
 
 int
-afr_removexattr (call_frame_t *frame, xlator_t *this,
-		 loc_t *loc, const char *name)
+afr_fxattrop_unwind(call_frame_t *frame, xlator_t *this)
 {
-	afr_private_t * priv  = NULL;
-	afr_local_t   * local = NULL;
-	call_frame_t   *transaction_frame = NULL;
+    afr_local_t *local = NULL;
+    call_frame_t *main_frame = NULL;
 
-	int ret = -1;
+    local = frame->local;
 
-	int op_ret   = -1;
-	int op_errno = 0;
+    main_frame = afr_transaction_detach_fop_frame(frame);
+    if (!main_frame)
+        return 0;
 
-	VALIDATE_OR_GOTO (frame, out);
-	VALIDATE_OR_GOTO (this, out);
-	VALIDATE_OR_GOTO (this->private, out);
-	VALIDATE_OR_GOTO (loc, out);
+    AFR_STACK_UNWIND(fxattrop, main_frame, local->op_ret, local->op_errno,
+                     local->xattr_rsp, local->xdata_rsp);
+    return 0;
+}
 
-	priv = this->private;
+int32_t
+afr_fxattrop(call_frame_t *frame, xlator_t *this, fd_t *fd,
+             gf_xattrop_flags_t optype, dict_t *xattr, dict_t *xdata)
+{
+    afr_local_t *local = NULL;
+    call_frame_t *transaction_frame = NULL;
+    int ret = -1;
+    int op_errno = ENOMEM;
+
+    AFR_ERROR_OUT_IF_FDCTX_INVALID(fd, this, op_errno, out);
+    transaction_frame = copy_frame(frame);
+    if (!transaction_frame)
+        goto out;
+
+    local = AFR_FRAME_INIT(transaction_frame, op_errno);
+    if (!local)
+        goto out;
+
+    local->cont.xattrop.xattr = dict_ref(xattr);
+    local->cont.xattrop.optype = optype;
+    if (xdata)
+        local->xdata_req = dict_ref(xdata);
+
+    local->transaction.wind = afr_fxattrop_wind;
+    local->transaction.unwind = afr_fxattrop_unwind;
+
+    local->fd = fd_ref(fd);
+    ret = afr_set_inode_local(this, local, fd->inode);
+    if (ret)
+        goto out;
+
+    local->op = GF_FOP_FXATTROP;
+
+    local->transaction.main_frame = frame;
+    local->transaction.start = LLONG_MAX - 1;
+    local->transaction.len = 0;
+
+    ret = afr_transaction(transaction_frame, this, AFR_METADATA_TRANSACTION);
+    if (ret < 0) {
+        op_errno = -ret;
+        goto out;
+    }
+
+    return 0;
+out:
+    if (transaction_frame)
+        AFR_STACK_DESTROY(transaction_frame);
 
-	transaction_frame = copy_frame (frame);
-	if (!transaction_frame) {
-		gf_log (this->name, GF_LOG_ERROR,
-			"out of memory :(");
-		goto out;
-	}
+    AFR_STACK_UNWIND(fxattrop, frame, -1, op_errno, NULL, NULL);
+    return 0;
+}
 
-	ALLOC_OR_GOTO (local, afr_local_t, out);
+int
+afr_fsync_unwind(call_frame_t *frame, xlator_t *this)
+{
+    afr_local_t *local = NULL;
+    call_frame_t *main_frame = NULL;
 
-	ret = AFR_LOCAL_INIT (local, priv);
-	if (ret < 0) {
-		op_errno = -ret;
-		goto out;
-	}
+    local = frame->local;
 
-	transaction_frame->local = local;
+    main_frame = afr_transaction_detach_fop_frame(frame);
+    if (!main_frame)
+        return 0;
 
-	local->op_ret = -1;
+    AFR_STACK_UNWIND(fsync, main_frame, local->op_ret, local->op_errno,
+                     &local->cont.inode_wfop.prebuf,
+                     &local->cont.inode_wfop.postbuf, local->xdata_rsp);
 
-	local->cont.removexattr.name = strdup (name);
+    return 0;
+}
 
-	local->transaction.fop    = afr_removexattr_wind;
-	local->transaction.done   = afr_removexattr_done;
-	local->transaction.unwind = afr_removexattr_unwind;
+int
+afr_fsync_wind_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+                   int32_t op_ret, int32_t op_errno, struct iatt *prebuf,
+                   struct iatt *postbuf, dict_t *xdata)
+{
+    return __afr_inode_write_cbk(frame, cookie, this, op_ret, op_errno, prebuf,
+                                 postbuf, NULL, xdata);
+}
 
-	loc_copy (&local->loc, loc);
+int
+afr_fsync_wind(call_frame_t *frame, xlator_t *this, int subvol)
+{
+    afr_local_t *local = NULL;
+    afr_private_t *priv = NULL;
 
-	local->transaction.main_frame = frame;
-	local->transaction.start   = 0;
-	local->transaction.len     = 0;
-	local->transaction.pending = AFR_METADATA_PENDING;
+    local = frame->local;
+    priv = this->private;
 
-	afr_transaction (transaction_frame, this, AFR_METADATA_TRANSACTION);
+    STACK_WIND_COOKIE(frame, afr_fsync_wind_cbk, (void *)(long)subvol,
+                      priv->children[subvol],
+                      priv->children[subvol]->fops->fsync, local->fd,
+                      local->cont.fsync.datasync, local->xdata_req);
+    return 0;
+}
 
-	op_ret = 0;
+int
+afr_fsync(call_frame_t *frame, xlator_t *this, fd_t *fd, int32_t datasync,
+          dict_t *xdata)
+{
+    afr_local_t *local = NULL;
+    call_frame_t *transaction_frame = NULL;
+    int ret = -1;
+    int32_t op_errno = ENOMEM;
+    int8_t last_fsync = 0;
+
+    AFR_ERROR_OUT_IF_FDCTX_INVALID(fd, this, op_errno, out);
+    transaction_frame = copy_frame(frame);
+    if (!transaction_frame)
+        goto out;
+
+    local = AFR_FRAME_INIT(transaction_frame, op_errno);
+    if (!local)
+        goto out;
+
+    if (xdata) {
+        local->xdata_req = dict_copy_with_ref(xdata, NULL);
+        if (dict_get_int8(xdata, "last-fsync", &last_fsync) == 0) {
+            if (last_fsync) {
+                local->transaction.disable_delayed_post_op = _gf_true;
+            }
+        }
+    } else {
+        local->xdata_req = dict_new();
+    }
+
+    if (!local->xdata_req)
+        goto out;
+
+    local->fd = fd_ref(fd);
+    ret = afr_set_inode_local(this, local, fd->inode);
+    if (ret)
+        goto out;
+
+    local->op = GF_FOP_FSYNC;
+    local->cont.fsync.datasync = datasync;
+
+    if (afr_fd_has_witnessed_unstable_write(this, fd->inode)) {
+        /* don't care. we only wanted to CLEAR the bit */
+    }
+
+    local->transaction.wind = afr_fsync_wind;
+    local->transaction.unwind = afr_fsync_unwind;
+
+    local->transaction.main_frame = frame;
+
+    ret = afr_transaction(transaction_frame, this, AFR_DATA_TRANSACTION);
+    if (ret < 0) {
+        op_errno = -ret;
+        goto out;
+    }
+
+    return 0;
 out:
-	if (op_ret == -1) {
-		if (transaction_frame)
-			AFR_STACK_DESTROY (transaction_frame);
-		AFR_STACK_UNWIND (frame, op_ret, op_errno);
-	}
+    if (transaction_frame)
+        AFR_STACK_DESTROY(transaction_frame);
+
+    AFR_STACK_UNWIND(fsync, frame, -1, op_errno, NULL, NULL, NULL);
 
-	return 0;
+    return 0;
 }
diff --git a/xlators/cluster/afr/src/afr-inode-write.h b/xlators/cluster/afr/src/afr-inode-write.h
index 1f1a3a35afd..a787069b7a1 100644
--- a/xlators/cluster/afr/src/afr-inode-write.h
+++ b/xlators/cluster/afr/src/afr-inode-write.h
@@ -1,63 +1,94 @@
 /*
-   Copyright (c) 2007-2009 Z RESEARCH, Inc. <http://www.zresearch.com>
-   This file is part of GlusterFS.
-
-   GlusterFS is free software; you can redistribute it and/or modify
-   it under the terms of the GNU General Public License as published
-   by the Free Software Foundation; either version 3 of the License,
-   or (at your option) any later version.
-
-   GlusterFS is distributed in the hope that it will be useful, but
-   WITHOUT ANY WARRANTY; without even the implied warranty of
-   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
-   General Public License for more details.
-
-   You should have received a copy of the GNU General Public License
-   along with this program.  If not, see
-   <http://www.gnu.org/licenses/>.
+  Copyright (c) 2008-2012 Red Hat, Inc. <http://www.redhat.com>
+  This file is part of GlusterFS.
+
+  This file is licensed to you under your choice of the GNU Lesser
+  General Public License, version 3 or any later version (LGPLv3 or
+  later), or the GNU General Public License, version 2 (GPLv2), in all
+  cases as published by the Free Software Foundation.
 */
 
 #ifndef __INODE_WRITE_H__
 #define __INODE_WRITE_H__
 
 int32_t
-afr_chmod (call_frame_t *frame, xlator_t *this,
-	   loc_t *loc, mode_t mode);
+afr_chmod(call_frame_t *frame, xlator_t *this, loc_t *loc, mode_t mode,
+          dict_t *xdata);
 
 int32_t
-afr_chown (call_frame_t *frame, xlator_t *this,
-	   loc_t *loc, uid_t uid, gid_t gid);
+afr_chown(call_frame_t *frame, xlator_t *this, loc_t *loc, uid_t uid, gid_t gid,
+          dict_t *xdata);
 
 int
-afr_fchown (call_frame_t *frame, xlator_t *this,
-	    fd_t *fd, uid_t uid, gid_t gid);
+afr_fchown(call_frame_t *frame, xlator_t *this, fd_t *fd, uid_t uid, gid_t gid,
+           dict_t *xdata);
+
+int32_t
+afr_fchmod(call_frame_t *frame, xlator_t *this, fd_t *fd, mode_t mode,
+           dict_t *xdata);
+
+int32_t
+afr_writev(call_frame_t *frame, xlator_t *this, fd_t *fd, struct iovec *vector,
+           int32_t count, off_t offset, uint32_t flags, struct iobref *iobref,
+           dict_t *xdata);
 
 int32_t
-afr_fchmod (call_frame_t *frame, xlator_t *this,
-	    fd_t *fd, mode_t mode);
+afr_truncate(call_frame_t *frame, xlator_t *this, loc_t *loc, off_t offset,
+             dict_t *xdata);
+
+int32_t
+afr_ftruncate(call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset,
+              dict_t *xdata);
+
+int32_t
+afr_utimens(call_frame_t *frame, xlator_t *this, loc_t *loc,
+            struct timespec tv[2], dict_t *xdata);
+
+int
+afr_setattr(call_frame_t *frame, xlator_t *this, loc_t *loc, struct iatt *buf,
+            int32_t valid, dict_t *xdata);
+
+int
+afr_fsetattr(call_frame_t *frame, xlator_t *this, fd_t *fd, struct iatt *buf,
+             int32_t valid, dict_t *xdata);
 
 int32_t
-afr_writev (call_frame_t *frame, xlator_t *this, fd_t *fd, 
-	    struct iovec *vector, int32_t count, off_t offset);
+afr_setxattr(call_frame_t *frame, xlator_t *this, loc_t *loc, dict_t *dict,
+             int32_t flags, dict_t *xdata);
 
 int32_t
-afr_truncate (call_frame_t *frame, xlator_t *this,
-	      loc_t *loc, off_t offset);
+afr_fsetxattr(call_frame_t *frame, xlator_t *this, fd_t *fd, dict_t *dict,
+              int32_t flags, dict_t *xdata);
 
 int32_t
-afr_ftruncate (call_frame_t *frame, xlator_t *this,
-	       fd_t *fd, off_t offset);
+afr_removexattr(call_frame_t *frame, xlator_t *this, loc_t *loc,
+                const char *name, dict_t *xdata);
 
 int32_t
-afr_utimens (call_frame_t *frame, xlator_t *this,
-	     loc_t *loc, struct timespec tv[2]);
+afr_fremovexattr(call_frame_t *frame, xlator_t *this, fd_t *fd,
+                 const char *name, dict_t *xdata);
+
+int
+afr_discard(call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset,
+            size_t len, dict_t *xdata);
+
+int
+afr_fallocate(call_frame_t *frame, xlator_t *this, fd_t *fd, int32_t mode,
+              off_t offset, size_t len, dict_t *xdata);
+
+int
+afr_zerofill(call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset,
+             off_t len, dict_t *xdata);
 
 int32_t
-afr_setxattr (call_frame_t *frame, xlator_t *this,
-	      loc_t *loc, dict_t *dict, int32_t flags);
+afr_xattrop(call_frame_t *frame, xlator_t *this, loc_t *loc,
+            gf_xattrop_flags_t optype, dict_t *xattr, dict_t *xdata);
 
 int32_t
-afr_removexattr (call_frame_t *frame, xlator_t *this,
-		 loc_t *loc, const char *name);
+afr_fxattrop(call_frame_t *frame, xlator_t *this, fd_t *fd,
+             gf_xattrop_flags_t optype, dict_t *xattr, dict_t *xdata);
 
+int
+afr_fsync(call_frame_t *frame, xlator_t *this, fd_t *fd, int32_t datasync,
+          dict_t *xdata);
 #endif /* __INODE_WRITE_H__ */
diff --git a/xlators/cluster/afr/src/afr-lk-common.c b/xlators/cluster/afr/src/afr-lk-common.c
new file mode 100644
index 00000000000..bc8eabe0f43
--- /dev/null
+++ b/xlators/cluster/afr/src/afr-lk-common.c
@@ -0,0 +1,791 @@
+/*
+  Copyright (c) 2008-2012 Red Hat, Inc. <http://www.redhat.com>
+  This file is part of GlusterFS.
+
+  This file is licensed to you under your choice of the GNU Lesser
+  General Public License, version 3 or any later version (LGPLv3 or
+  later), or the GNU General Public License, version 2 (GPLv2), in all
+  cases as published by the Free Software Foundation.
+*/
+
+#include <glusterfs/dict.h>
+#include <glusterfs/byte-order.h>
+#include <glusterfs/common-utils.h>
+
+#include "afr.h"
+#include "afr-transaction.h"
+#include "afr-messages.h"
+
+#include <signal.h>
+
+#define LOCKED_NO 0x0    /* no lock held */
+#define LOCKED_YES 0x1   /* for DATA, METADATA, ENTRY and higher_path */
+#define LOCKED_LOWER 0x2 /* for lower path */
+
+void
+afr_lockee_cleanup(afr_lockee_t *lockee)
+{
+    if (lockee->fd) {
+        fd_unref(lockee->fd);
+        lockee->fd = NULL;
+    } else {
+        loc_wipe(&lockee->loc);
+    }
+
+    GF_FREE(lockee->basename);
+    lockee->basename = NULL;
+    GF_FREE(lockee->locked_nodes);
+    lockee->locked_nodes = NULL;
+
+    return;
+}
+
+void
+afr_lockees_cleanup(afr_internal_lock_t *int_lock)
+{
+    int i = 0;
+
+    for (i = 0; i < int_lock->lockee_count; i++) {
+        afr_lockee_cleanup(&int_lock->lockee[i]);
+    }
+
+    return;
+}
+/* Three-way compare of two afr_lockee_t: by the gfid loc_gfid() yields for
+ * each loc, then by basename (NULL sorts first).  Returns -1, 0 or 1. */
+int
+afr_entry_lockee_cmp(const void *l1, const void *l2)
+{
+    const afr_lockee_t *r1 = l1;
+    const afr_lockee_t *r2 = l2;
+    int ret = 0;
+    uuid_t gfid1 = {0};
+    uuid_t gfid2 = {0};
+
+    loc_gfid((loc_t *)&r1->loc, gfid1);
+    loc_gfid((loc_t *)&r2->loc, gfid2);
+    ret = gf_uuid_compare(gfid1, gfid2);
+    if (ret == 0) {
+        /* Same gfid: entrylks with a NULL basename are the 'smallest'. */
+        if (!r1->basename || !r2->basename)
+            return (r1->basename != NULL) - (r2->basename != NULL);
+        ret = strcmp(r1->basename, r2->basename);
+    }
+
+    /* Equal lockees must compare as 0 so that cmp(a, b) and cmp(b, a)
+     * agree, as sort comparators are required to; every other result is
+     * normalised to -1 or 1. */
+    return (ret > 0) - (ret < 0);
+}
+
+int
+afr_lock_blocking(call_frame_t *frame, xlator_t *this, int child_index);
+
+void
+afr_set_lk_owner(call_frame_t *frame, xlator_t *this, void *lk_owner)
+{
+    gf_msg_trace(this->name, 0, "Setting lk-owner=%llu",
+                 (unsigned long long)(unsigned long)lk_owner);
+
+    set_lk_owner_from_ptr(&frame->root->lk_owner, lk_owner);
+}
+
+int32_t
+internal_lock_count(call_frame_t *frame, xlator_t *this)
+{
+    afr_local_t *local = NULL;
+    afr_private_t *priv = NULL;
+    int32_t call_count = 0;
+    int i = 0;
+
+    local = frame->local;
+    priv = this->private;
+
+    for (i = 0; i < priv->child_count; i++) {
+        if (local->child_up[i])
+            ++call_count;
+    }
+
+    return call_count;
+}
+
+int
+afr_add_entry_lockee(afr_local_t *local, loc_t *loc, char *basename,
+                     int child_count)
+{
+    int ret = -ENOMEM;
+    afr_internal_lock_t *int_lock = &local->internal_lock;
+    afr_lockee_t *lockee = &int_lock->lockee[int_lock->lockee_count];
+
+    GF_ASSERT(int_lock->lockee_count < AFR_LOCKEE_COUNT_MAX);
+    loc_copy(&lockee->loc, loc);
+    lockee->basename = (basename) ? gf_strdup(basename) : NULL;
+    if (basename && !lockee->basename)
+        goto out;
+
+    lockee->locked_count = 0;
+    lockee->locked_nodes = GF_CALLOC(child_count, sizeof(*lockee->locked_nodes),
+                                     gf_afr_mt_afr_node_character);
+
+    if (!lockee->locked_nodes)
+        goto out;
+
+    ret = 0;
+    int_lock->lockee_count++;
+out:
+    if (ret) {
+        afr_lockee_cleanup(lockee);
+    }
+    return ret;
+}
+
+int
+afr_add_inode_lockee(afr_local_t *local, int child_count)
+{
+    int ret = -ENOMEM;
+    afr_internal_lock_t *int_lock = &local->internal_lock;
+    afr_lockee_t *lockee = &int_lock->lockee[int_lock->lockee_count];
+
+    if (local->fd) {
+        lockee->fd = fd_ref(local->fd);
+    } else {
+        loc_copy(&lockee->loc, &local->loc);
+    }
+
+    lockee->locked_count = 0;
+    lockee->locked_nodes = GF_CALLOC(child_count, sizeof(*lockee->locked_nodes),
+                                     gf_afr_mt_afr_node_character);
+
+    if (!lockee->locked_nodes)
+        goto out;
+
+    ret = 0;
+    int_lock->lockee_count++;
+out:
+    if (ret) {
+        afr_lockee_cleanup(lockee);
+    }
+    return ret;
+}
+
+static int
+initialize_internal_lock_variables(call_frame_t *frame, xlator_t *this)
+{
+    afr_local_t *local = NULL;
+    afr_internal_lock_t *int_lock = NULL;
+    afr_private_t *priv = NULL;
+
+    int i = 0;
+
+    priv = this->private;
+    local = frame->local;
+    int_lock = &local->internal_lock;
+
+    int_lock->lock_count = 0;
+    int_lock->lock_op_ret = -1;
+    int_lock->lock_op_errno = 0;
+    int_lock->lk_attempted_count = 0;
+
+    for (i = 0; i < AFR_LOCKEE_COUNT_MAX; i++) {
+        if (!int_lock->lockee[i].locked_nodes)
+            break;
+        int_lock->lockee[i].locked_count = 0;
+        memset(int_lock->lockee[i].locked_nodes, 0,
+               sizeof(*int_lock->lockee[i].locked_nodes) * priv->child_count);
+    }
+
+    return 0;
+}
+
+int
+afr_lockee_locked_nodes_count(afr_internal_lock_t *int_lock)
+{
+    int call_count = 0;
+    int i = 0;
+
+    for (i = 0; i < int_lock->lockee_count; i++)
+        call_count += int_lock->lockee[i].locked_count;
+
+    return call_count;
+}
+
+int
+afr_locked_nodes_count(unsigned char *locked_nodes, int child_count)
+
+{
+    int i = 0;
+    int call_count = 0;
+
+    for (i = 0; i < child_count; i++) {
+        if (locked_nodes[i] & LOCKED_YES)
+            call_count++;
+    }
+
+    return call_count;
+}
+
+/*
+ * Log a warning that the lock operation named by what failed with op_errno
+ * on where (a subvolume name, or a description such as "all"). Entry and
+ * entry-rename transactions are reported by parent gfid and name, data and
+ * metadata transactions by inode gfid. Any other transaction type logs
+ * nothing.
+ */
+static void
+afr_log_locks_failure(call_frame_t *frame, char *where, char *what,
+                      int op_errno)
+{
+    afr_local_t *local = frame->local;
+    xlator_t *this = frame->this;
+    gf_lkowner_t *owner = &frame->root->lk_owner;
+    const char *fop_name = gf_fop_list[local->op];
+    const char *name = NULL;
+    char *gfid = NULL;
+
+    if (local->transaction.type == AFR_ENTRY_TRANSACTION ||
+        local->transaction.type == AFR_ENTRY_RENAME_TRANSACTION) {
+        /* GF_FOP_LINK is reported against newloc, any other fop on loc. */
+        if (local->op == GF_FOP_LINK) {
+            gfid = uuid_utoa(local->newloc.pargfid);
+            name = local->newloc.name;
+        } else {
+            gfid = uuid_utoa(local->loc.pargfid);
+            name = local->loc.name;
+        }
+        gf_msg(this->name, GF_LOG_WARNING, op_errno,
+               AFR_MSG_INTERNAL_LKS_FAILED,
+               "Unable to do entry %s with lk-owner:%s on %s "
+               "while attempting %s on {pgfid:%s, name:%s}.",
+               what, lkowner_utoa(owner), where, fop_name, gfid, name);
+    } else if (local->transaction.type == AFR_DATA_TRANSACTION ||
+               local->transaction.type == AFR_METADATA_TRANSACTION) {
+        gfid = uuid_utoa(local->inode->gfid);
+        gf_msg(this->name, GF_LOG_WARNING, op_errno,
+               AFR_MSG_INTERNAL_LKS_FAILED,
+               "Unable to do inode %s with lk-owner:%s on %s "
+               "while attempting %s on gfid:%s.",
+               what, lkowner_utoa(owner), where, fop_name, gfid);
+    }
+}
+
+/*
+ * Reply handler for each unlock wound by afr_unlock_now(); the cookie is
+ * lockee * child_count + child. Masks that slot's locked_nodes entry with
+ * LOCKED_NO and calls lock_cbk once lk_call_count drops to zero. Returns
+ * 0, or the result of afr_write_subvol_reset() when that is called.
+ */
+static int32_t
+afr_unlock_common_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+                      int32_t op_ret, int32_t op_errno, dict_t *xdata)
+{
+    afr_local_t *local = frame->local;
+    afr_private_t *priv = this->private;
+    afr_internal_lock_t *int_lock = &local->internal_lock;
+    int slot = (int)((long)cookie);
+    int lockee_idx = slot / priv->child_count;
+    int child = slot % priv->child_count;
+    int pending = 0;
+    int ret = 0;
+
+    /* Failures other than ENOTCONN and EBADFD are logged. */
+    if (!(op_ret >= 0 || op_errno == ENOTCONN || op_errno == EBADFD))
+        afr_log_locks_failure(frame, priv->children[child]->name, "unlock",
+                              op_errno);
+
+    int_lock->lockee[lockee_idx].locked_nodes[child] &= LOCKED_NO;
+    if (local->transaction.type == AFR_DATA_TRANSACTION && op_ret != 1)
+        ret = afr_write_subvol_reset(frame, this);
+
+    LOCK(&frame->lock);
+    {
+        pending = --int_lock->lk_call_count;
+    }
+    UNLOCK(&frame->lock);
+
+    if (pending == 0)
+        int_lock->lock_cbk(frame, this);
+
+    return ret;
+}
+
+/*
+ * Wind one lock request for lockee lockee_num to subvolume child, with cbk
+ * and cookie handling the reply. Entry transactions use (f)entrylk, data
+ * and metadata ones (f)inodelk; the fd variant is used when local->fd is
+ * set. unlock wins over blocking; with neither, the lock is non-blocking.
+ */
+void
+afr_internal_lock_wind(call_frame_t *frame,
+                       int32_t (*cbk)(call_frame_t *, void *, xlator_t *,
+                                      int32_t, int32_t, dict_t *),
+                       void *cookie, int child, int lockee_num,
+                       gf_boolean_t blocking, gf_boolean_t unlock)
+{
+    afr_local_t *local = frame->local;
+    xlator_t *this = frame->this;
+    afr_private_t *priv = this->private;
+    afr_internal_lock_t *int_lock = &local->internal_lock;
+    entrylk_cmd cmd = ENTRYLK_LOCK_NB;
+    int32_t cmd1 = F_SETLK;
+    struct gf_flock flock = {0};
+
+    switch (local->transaction.type) {
+        case AFR_ENTRY_TRANSACTION:
+        case AFR_ENTRY_RENAME_TRANSACTION:
+            if (unlock)
+                cmd = ENTRYLK_UNLOCK;
+            else if (blocking)
+                cmd = ENTRYLK_LOCK;
+
+            if (local->fd) {
+                STACK_WIND_COOKIE(frame, cbk, cookie, priv->children[child],
+                                  priv->children[child]->fops->fentrylk,
+                                  int_lock->domain,
+                                  int_lock->lockee[lockee_num].fd,
+                                  int_lock->lockee[lockee_num].basename, cmd,
+                                  ENTRYLK_WRLCK, NULL);
+            } else {
+                STACK_WIND_COOKIE(frame, cbk, cookie, priv->children[child],
+                                  priv->children[child]->fops->entrylk,
+                                  int_lock->domain,
+                                  &int_lock->lockee[lockee_num].loc,
+                                  int_lock->lockee[lockee_num].basename, cmd,
+                                  ENTRYLK_WRLCK, NULL);
+            }
+            break;
+
+        case AFR_DATA_TRANSACTION:
+        case AFR_METADATA_TRANSACTION:
+            flock = int_lock->lockee[lockee_num].flock;
+            if (unlock)
+                flock.l_type = F_UNLCK;
+            else if (blocking)
+                cmd1 = F_SETLKW;
+
+            if (local->fd) {
+                STACK_WIND_COOKIE(
+                    frame, cbk, cookie, priv->children[child],
+                    priv->children[child]->fops->finodelk, int_lock->domain,
+                    int_lock->lockee[lockee_num].fd, cmd1, &flock, NULL);
+            } else {
+                STACK_WIND_COOKIE(
+                    frame, cbk, cookie, priv->children[child],
+                    priv->children[child]->fops->inodelk, int_lock->domain,
+                    &int_lock->lockee[lockee_num].loc, cmd1, &flock, NULL);
+            }
+            break;
+    }
+}
+
+/*
+ * Wind an unlock for every (lockee, child) slot marked LOCKED_YES; the
+ * replies are handled by afr_unlock_common_cbk(). lk_call_count is set to
+ * the sum of the lockees' locked_count, and when that is zero lock_cbk is
+ * called directly instead. Always returns 0.
+ */
+static int
+afr_unlock_now(call_frame_t *frame, xlator_t *this)
+{
+    afr_local_t *local = frame->local;
+    afr_private_t *priv = this->private;
+    afr_internal_lock_t *int_lock = &local->internal_lock;
+    int pending = afr_lockee_locked_nodes_count(int_lock);
+    int lockee = 0;
+    int child = 0;
+
+    int_lock->lk_call_count = pending;
+
+    if (pending == 0) {
+        gf_msg_trace(this->name, 0, "No internal locks unlocked");
+        int_lock->lock_cbk(frame, this);
+        return 0;
+    }
+
+    for (lockee = 0; lockee < int_lock->lockee_count; lockee++) {
+        for (child = 0; child < priv->child_count; child++) {
+            if (!(int_lock->lockee[lockee].locked_nodes[child] & LOCKED_YES))
+                continue;
+            afr_internal_lock_wind(
+                frame, afr_unlock_common_cbk,
+                (void *)(long)(lockee * priv->child_count + child), child,
+                lockee, _gf_false, _gf_true);
+            /* Return straight after the last wind. NOTE(review): presumably
+             * lock state must not be read once the last reply ran; confirm */
+            if (--pending == 0)
+                return 0;
+        }
+    }
+
+    return 0;
+}
+
+/*
+ * Reply handler for one lock wound by afr_lock_blocking(); the cookie is
+ * lockee * child_count + child. A failure's errno is recorded, the attempt
+ * is counted, and a success marks the slot LOCKED_YES. ENOSYS ends the
+ * sequence through afr_unlock_now(); otherwise cookie + 1 is tried next.
+ */
+static int32_t
+afr_lock_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret,
+             int32_t op_errno, dict_t *xdata)
+{
+    afr_private_t *priv = this->private;
+    afr_local_t *local = frame->local;
+    afr_internal_lock_t *int_lock = &local->internal_lock;
+    int cky = (long)cookie;
+    int child = cky % priv->child_count;
+    int lockee = cky / priv->child_count;
+    int unsupported = (op_ret == -1) && (op_errno == ENOSYS);
+
+    LOCK(&frame->lock);
+    {
+        if (unsupported) {
+            /* ENOSYS: log it and copy op_ret into local and int_lock. */
+            gf_msg(this->name, GF_LOG_ERROR, ENOSYS,
+                   AFR_MSG_LOCK_XLATOR_NOT_LOADED,
+                   "subvolume does not support locking. "
+                   "please load features/locks xlator on server");
+            local->op_ret = op_ret;
+            int_lock->lock_op_ret = op_ret;
+        }
+        if (op_ret == -1) {
+            local->op_errno = op_errno;
+            int_lock->lock_op_errno = op_errno;
+        }
+
+        int_lock->lk_attempted_count++;
+    }
+    UNLOCK(&frame->lock);
+
+    if (unsupported) {
+        afr_unlock_now(frame, this);
+        return 0;
+    }
+
+    if (op_ret == 0) {
+        int_lock->lockee[lockee].locked_nodes[child] |= LOCKED_YES;
+        int_lock->lockee[lockee].locked_count++;
+        int_lock->lock_count++;
+        if (local->transaction.type == AFR_DATA_TRANSACTION) {
+            LOCK(&local->inode->lock);
+            {
+                local->inode_ctx->lock_count++;
+            }
+            UNLOCK(&local->inode->lock);
+        }
+    }
+    /* Continue with the next (lockee, child) slot. */
+    afr_lock_blocking(frame, this, cky + 1);
+
+    return 0;
+}
+
+/* Return _gf_true when local->child_up[child_index] is set, else _gf_false. */
+static gf_boolean_t
+_is_lock_wind_needed(afr_local_t *local, int child_index)
+{
+    gf_boolean_t is_up = local->child_up[child_index] ? _gf_true : _gf_false;
+
+    return is_up;
+}
+
+/* Decide whether a finished blocking attempt obtained enough locks. Returns
+ * _gf_true when some child has LOCKED_YES for every lockee (or there is no
+ * child to check); otherwise logs the failure with lock_op_errno and
+ * returns _gf_false. */
+static gf_boolean_t
+is_blocking_locks_count_sufficient(call_frame_t *frame, xlator_t *this)
+{
+    afr_local_t *local = frame->local;
+    afr_private_t *priv = this->private;
+    afr_internal_lock_t *int_lock = &local->internal_lock;
+    int lockee_count = int_lock->lockee_count;
+    gf_boolean_t all_sets_locked = _gf_true;
+    int child = 0;
+    int idx = 0;
+
+    if (int_lock->lock_count == 0) {
+        afr_log_locks_failure(frame, "any subvolume", "lock",
+                              int_lock->lock_op_errno);
+        return _gf_false;
+    }
+    /* For FOPS that take multiple sets of locks (mkdir, rename),
+     * there must be at least one brick on which the locks from
+     * all lock sets were successful. */
+    for (child = 0; child < priv->child_count; child++) {
+        all_sets_locked = _gf_true;
+        for (idx = 0; idx < lockee_count; idx++) {
+            if (int_lock->lockee[idx].locked_nodes[child] & LOCKED_YES)
+                continue;
+            all_sets_locked = _gf_false;
+            break;
+        }
+        if (all_sets_locked)
+            return _gf_true;
+    }
+    if (!all_sets_locked)
+        afr_log_locks_failure(frame, "all", "lock", int_lock->lock_op_errno);
+    return all_sets_locked;
+}
+
+/*
+ * Blocking lock step for slot cookie, which encodes
+ * lockee * child_count + child.
+ *
+ * Once every expected lock has been attempted the sequence ends here:
+ * lock_cbk is called with lock_op_ret 0 if the locks obtained suffice,
+ * otherwise op_ret is set to -1 and everything is released through
+ * afr_unlock_now(). Until then a blocking lock is wound to the slot's
+ * child; a child that is not up is skipped by moving on to cookie + 1.
+ *
+ * Always returns 0.
+ */
+int
+afr_lock_blocking(call_frame_t *frame, xlator_t *this, int cookie)
+{
+    afr_local_t *local = frame->local;
+    afr_private_t *priv = this->private;
+    afr_internal_lock_t *int_lock = &local->internal_lock;
+    int child = cookie % priv->child_count;
+    int lockee = cookie / priv->child_count;
+    uint64_t ctx = 0;
+
+    /* ctx is fetched only to check that the fd has a context at all. */
+    if (local->fd && fd_ctx_get(local->fd, this, &ctx) < 0) {
+        gf_msg(this->name, GF_LOG_INFO, 0, AFR_MSG_FD_CTX_GET_FAILED,
+               "unable to get fd ctx for fd=%p", local->fd);
+
+        local->op_ret = -1;
+        int_lock->lock_op_ret = -1;
+
+        afr_unlock_now(frame, this);
+
+        return 0;
+    }
+
+    if (int_lock->lk_expected_count == int_lock->lk_attempted_count) {
+        if (!is_blocking_locks_count_sufficient(frame, this)) {
+            local->op_ret = -1;
+            int_lock->lock_op_ret = -1;
+
+            afr_unlock_now(frame, this);
+
+            return 0;
+        }
+
+        /* we're done locking */
+        gf_msg_debug(this->name, 0, "we're done locking");
+
+        int_lock->lock_op_ret = 0;
+        int_lock->lock_cbk(frame, this);
+        return 0;
+    }
+
+    /* Nothing is wound to a child that is not up; try the next slot. */
+    if (!_is_lock_wind_needed(local, child)) {
+        afr_lock_blocking(frame, this, cookie + 1);
+        return 0;
+    }
+
+    afr_internal_lock_wind(frame, afr_lock_cbk, (void *)(long)cookie, child,
+                           lockee, _gf_true, _gf_false);
+
+    return 0;
+}
+
+/*
+ * Start the blocking lock sequence at slot 0. The expected (and pending)
+ * lock count is lockee_count times AFR_COUNT() of local->child_up,
+ * presumably the number of children that are up. Always returns 0.
+ */
+int32_t
+afr_blocking_lock(call_frame_t *frame, xlator_t *this)
+{
+    afr_private_t *priv = this->private;
+    afr_local_t *local = frame->local;
+    afr_internal_lock_t *int_lock = &local->internal_lock;
+    int expected = 0;
+
+    expected = AFR_COUNT(local->child_up, priv->child_count);
+    expected = int_lock->lockee_count * expected;
+    int_lock->lk_call_count = int_lock->lk_expected_count = expected;
+
+    initialize_internal_lock_variables(frame, this);
+    afr_lock_blocking(frame, this, 0);
+    return 0;
+}
+
+/*
+ * Reply handler for the locks wound by afr_lock_nonblocking(); the cookie
+ * is lockee * child_count + child.
+ *
+ * A successful reply marks its slot LOCKED_YES; ENOSYS is logged and
+ * recorded as the lock error. When the last reply arrives, lock_cbk is
+ * called if every expected lock was obtained, otherwise afr_unlock_now()
+ * releases what was taken. Always returns 0.
+ */
+static int32_t
+afr_nb_internal_lock_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+                         int32_t op_ret, int32_t op_errno, dict_t *xdata)
+{
+    afr_private_t *priv = this->private;
+    afr_local_t *local = frame->local;
+    afr_internal_lock_t *int_lock = &local->internal_lock;
+    int child = ((long)cookie) % priv->child_count;
+    int lockee = ((long)cookie) / priv->child_count;
+    int pending = 0;
+
+    /* A granted data lock is also counted in local->inode_ctx. */
+    if (op_ret == 0 && local->transaction.type == AFR_DATA_TRANSACTION) {
+        LOCK(&local->inode->lock);
+        {
+            local->inode_ctx->lock_count++;
+        }
+        UNLOCK(&local->inode->lock);
+    }
+
+    /* Record the outcome and take this reply off the pending count; errors
+     * other than ENOSYS change nothing but that count. */
+    LOCK(&frame->lock);
+    {
+        if (op_ret == 0) {
+            int_lock->lockee[lockee].locked_nodes[child] |= LOCKED_YES;
+            int_lock->lockee[lockee].locked_count++;
+            int_lock->lock_count++;
+        } else if (op_ret < 0 && op_errno == ENOSYS) {
+            gf_msg(this->name, GF_LOG_ERROR, ENOSYS,
+                   AFR_MSG_LOCK_XLATOR_NOT_LOADED,
+                   "subvolume does not support "
+                   "locking. please load features/locks"
+                   " xlator on server");
+            local->op_ret = op_ret;
+            int_lock->lock_op_ret = op_ret;
+
+            int_lock->lock_op_errno = op_errno;
+            local->op_errno = op_errno;
+        }
+
+        pending = --int_lock->lk_call_count;
+    }
+    UNLOCK(&frame->lock);
+
+    if (pending != 0)
+        return 0;
+
+    gf_msg_trace(this->name, 0, "Last locking reply received");
+    if (int_lock->lock_count == int_lock->lk_expected_count) {
+        /* all locks successful. Proceed to call FOP */
+        gf_msg_trace(this->name, 0, "All servers locked. Calling the cbk");
+        int_lock->lock_op_ret = 0;
+        int_lock->lock_cbk(frame, this);
+    } else {
+        /* Not all locks were obtained: release them. The trace announces
+         * a retry with blocking locks, which is not started here. */
+        gf_msg_trace(this->name, 0,
+                     "%d servers locked. Trying again "
+                     "with blocking calls",
+                     int_lock->lock_count);
+
+        afr_unlock_now(frame, this);
+    }
+
+    return 0;
+}
+
+/*
+ * Wind non-blocking locks, lockee by lockee, to the children marked up in
+ * local->child_up, stopping once the count derived from
+ * internal_lock_count() has been sent. Replies are handled by
+ * afr_nb_internal_lock_cbk().
+ *
+ * Returns -1, with op_errno set to EINVAL, when local->fd is set but has
+ * no afr fd context; returns 0 otherwise, even when nothing is wound.
+ */
+int
+afr_lock_nonblocking(call_frame_t *frame, xlator_t *this)
+{
+    afr_local_t *local = frame->local;
+    afr_private_t *priv = this->private;
+    afr_internal_lock_t *int_lock = &local->internal_lock;
+    int32_t pending = 0;
+    int lockee = 0;
+    int child = 0;
+
+    initialize_internal_lock_variables(frame, this);
+
+    if (local->fd && !afr_fd_ctx_get(local->fd, this)) {
+        gf_msg(this->name, GF_LOG_INFO, 0, AFR_MSG_FD_CTX_GET_FAILED,
+               "unable to get fd ctx for fd=%p", local->fd);
+
+        local->op_ret = -1;
+        int_lock->lock_op_ret = -1;
+        local->op_errno = EINVAL;
+        int_lock->lock_op_errno = EINVAL;
+
+        afr_unlock_now(frame, this);
+        return -1;
+    }
+
+    pending = int_lock->lockee_count * internal_lock_count(frame, this);
+    int_lock->lk_call_count = pending;
+    int_lock->lk_expected_count = pending;
+
+    if (pending == 0) {
+        gf_msg(this->name, GF_LOG_INFO, 0, AFR_MSG_INFO_COMMON,
+               "fd not open on any subvolumes. aborting.");
+        afr_unlock_now(frame, this);
+        return 0;
+    }
+
+    /* Only children marked up in local->child_up get a request. */
+    for (lockee = 0; lockee < int_lock->lockee_count; lockee++) {
+        for (child = 0; child < priv->child_count; child++) {
+            if (!local->child_up[child])
+                continue;
+            afr_internal_lock_wind(
+                frame, afr_nb_internal_lock_cbk,
+                (void *)(long)(lockee * priv->child_count + child), child,
+                lockee, _gf_false, _gf_false);
+            /* Return straight after the last wind. NOTE(review): presumably
+             * lock state must not be read once the last reply ran; confirm */
+            if (--pending == 0)
+                return 0;
+        }
+    }
+
+    return 0;
+}
+
+/* Release the transaction's locks. Without eager locking the unlocks are
+ * wound at once. With it, the transaction first leaves the owner list;
+ * unless that sets do_eager_unlock (no owners and no post-ops left) or it
+ * was already set, lock_cbk is called without unlocking. Returns 0. */
+int32_t
+afr_unlock(call_frame_t *frame, xlator_t *this)
+{
+    afr_local_t *local = frame->local;
+    afr_lock_t *lock = NULL;
+
+    if (local->transaction.eager_lock_on) {
+        lock = &local->inode_ctx->lock[local->transaction.type];
+        LOCK(&local->inode->lock);
+        {
+            list_del_init(&local->transaction.owner_list);
+            if (list_empty(&lock->owners) && list_empty(&lock->post_op)) {
+                local->transaction.do_eager_unlock = _gf_true;
+                /* TODO: Need to get metadata use on_disk and
+                 * inherit/uninherit before asserting that on_disk[type]
+                 * and inherited[type] of the inode ctx are unset here. */
+                GF_ASSERT(lock->release);
+            }
+        }
+        UNLOCK(&local->inode->lock);
+        if (!local->transaction.do_eager_unlock) {
+            local->internal_lock.lock_cbk(frame, this);
+            return 0;
+        }
+    }
+
+    afr_unlock_now(frame, this);
+    return 0;
+}
diff --git a/xlators/cluster/afr/src/afr-mem-types.h b/xlators/cluster/afr/src/afr-mem-types.h
new file mode 100644
index 00000000000..816065fb57a
--- /dev/null
+++ b/xlators/cluster/afr/src/afr-mem-types.h
@@ -0,0 +1,38 @@
+/*
+  Copyright (c) 2008-2012 Red Hat, Inc. <http://www.redhat.com>
+  This file is part of GlusterFS.
+
+  This file is licensed to you under your choice of the GNU Lesser
+  General Public License, version 3 or any later version (LGPLv3 or
+  later), or the GNU General Public License, version 2 (GPLv2), in all
+  cases as published by the Free Software Foundation.
+*/
+
+#ifndef __AFR_MEM_TYPES_H__
+#define __AFR_MEM_TYPES_H__
+
+#include <glusterfs/mem-types.h>
+
+enum gf_afr_mem_types_ { /* type tags AFR code passes to GF_CALLOC() */
+    gf_afr_mt_afr_fd_ctx_t = gf_common_mt_end + 1, /* starts after common ids */
+    gf_afr_mt_afr_private_t,
+    gf_afr_mt_int32_t,
+    gf_afr_mt_char,
+    gf_afr_mt_xattr_key,
+    gf_afr_mt_dict_t,
+    gf_afr_mt_xlator_t,
+    gf_afr_mt_afr_node_character, /* e.g. a lockee's locked_nodes array */
+    gf_afr_mt_inode_ctx_t,
+    gf_afr_mt_shd_event_t,
+    gf_afr_mt_reply_t,
+    gf_afr_mt_subvol_healer_t,
+    gf_afr_mt_spbc_timeout_t,
+    gf_afr_mt_spb_status_t,
+    gf_afr_mt_empty_brick_t,
+    gf_afr_mt_child_latency_t,
+    gf_afr_mt_atomic_t,
+    gf_afr_mt_lk_heal_info_t,
+    gf_afr_mt_gf_lock,
+    gf_afr_mt_end /* one past the last AFR id */
+};
+#endif
diff --git a/xlators/cluster/afr/src/afr-messages.h b/xlators/cluster/afr/src/afr-messages.h
new file mode 100644
index 00000000000..e73fd997765
--- /dev/null
+++ b/xlators/cluster/afr/src/afr-messages.h
@@ -0,0 +1,167 @@
+/*
+ Copyright (c) 2015 Red Hat, Inc. <http://www.redhat.com>
+ This file is part of GlusterFS.
+
+ This file is licensed to you under your choice of the GNU Lesser
+ General Public License, version 3 or any later version (LGPLv3 or
+ later), or the GNU General Public License, version 2 (GPLv2), in all
+ cases as published by the Free Software Foundation.
+ */
+
+#ifndef _AFR_MESSAGES_H_
+#define _AFR_MESSAGES_H_
+
+#include <glusterfs/glfs-message-id.h>
+
+/* To add new message IDs, append new identifiers at the end of the list.
+ *
+ * Never remove a message ID. If it is no longer used, rename it or leave it
+ * as it is, but do not delete it, so that its ID cannot be reused by
+ * another message.
+ *
+ * The component name (AFR, the first argument below) must match one of the
+ * entries defined in glfs-message-id.h.
+ */
+
+GLFS_MSGID(
+    AFR, AFR_MSG_QUORUM_FAIL, AFR_MSG_QUORUM_MET, AFR_MSG_QUORUM_OVERRIDE,
+    AFR_MSG_INVALID_CHILD_UP, AFR_MSG_SUBVOL_UP, AFR_MSG_SUBVOLS_DOWN,
+    AFR_MSG_ENTRY_UNLOCK_FAIL, AFR_MSG_SPLIT_BRAIN, AFR_MSG_OPEN_FAIL,
+    AFR_MSG_UNLOCK_FAIL, AFR_MSG_REPLACE_BRICK_STATUS, AFR_MSG_GFID_NULL,
+    AFR_MSG_FD_CREATE_FAILED, AFR_MSG_DICT_SET_FAILED,
+    AFR_MSG_EXPUNGING_FILE_OR_DIR, AFR_MSG_MIGRATION_IN_PROGRESS,
+    AFR_MSG_CHILD_MISCONFIGURED, AFR_MSG_VOL_MISCONFIGURED,
+    AFR_MSG_INTERNAL_LKS_FAILED, AFR_MSG_INVALID_FD, AFR_MSG_LOCK_INFO,
+    AFR_MSG_LOCK_XLATOR_NOT_LOADED, AFR_MSG_FD_CTX_GET_FAILED,
+    AFR_MSG_INVALID_SUBVOL, AFR_MSG_PUMP_XLATOR_ERROR, AFR_MSG_SELF_HEAL_INFO,
+    AFR_MSG_READ_SUBVOL_ERROR, AFR_MSG_DICT_GET_FAILED, AFR_MSG_INFO_COMMON,
+    AFR_MSG_SPLIT_BRAIN_CHOICE_ERROR, AFR_MSG_LOCAL_CHILD, AFR_MSG_INVALID_DATA,
+    AFR_MSG_INVALID_ARG, AFR_MSG_INDEX_DIR_GET_FAILED, AFR_MSG_FSYNC_FAILED,
+    AFR_MSG_FAVORITE_CHILD, AFR_MSG_SELF_HEAL_FAILED,
+    AFR_MSG_SPLIT_BRAIN_STATUS, AFR_MSG_ADD_BRICK_STATUS, AFR_MSG_NO_CHANGELOG,
+    AFR_MSG_TIMER_CREATE_FAIL, AFR_MSG_SBRAIN_FAV_CHILD_POLICY,
+    AFR_MSG_INODE_CTX_GET_FAILED, AFR_MSG_THIN_ARB,
+    AFR_MSG_THIN_ARB_XATTROP_FAILED, AFR_MSG_THIN_ARB_LOC_POP_FAILED,
+    AFR_MSG_GET_PEND_VAL, AFR_MSG_THIN_ARB_SKIP_SHD, AFR_MSG_UNKNOWN_SET,
+    AFR_MSG_NO_XL_ID, AFR_MSG_SELF_HEAL_INFO_START,
+    AFR_MSG_SELF_HEAL_INFO_FINISH, AFR_MSG_INCRE_COUNT,
+    AFR_MSG_ADD_TO_OUTPUT_FAILED, AFR_MSG_SET_TIME_FAILED,
+    AFR_MSG_GFID_MISMATCH_DETECTED, AFR_MSG_GFID_HEAL_MSG,
+    AFR_MSG_THIN_ARB_LOOKUP_FAILED, AFR_MSG_DICT_CREATE_FAILED,
+    AFR_MSG_NO_MAJORITY_TO_RESOLVE, AFR_MSG_TYPE_MISMATCH,
+    AFR_MSG_SIZE_POLICY_NOT_APPLICABLE, AFR_MSG_NO_CHILD_SELECTED,
+    AFR_MSG_INVALID_CHILD, AFR_MSG_RESOLVE_CONFLICTING_DATA,
+    SERROR_GETTING_SRC_BRICK, SNO_DIFF_IN_MTIME, SNO_BIGGER_FILE,
+    SALL_BRICKS_UP_TO_RESOLVE, AFR_MSG_UNLOCK_FAILED, AFR_MSG_POST_OP_FAILED,
+    AFR_MSG_TA_FRAME_CREATE_FAILED, AFR_MSG_SET_KEY_XATTROP_FAILED,
+    AFR_MSG_BLOCKING_ENTRYLKS_FAILED, AFR_MSG_FOP_FAILED,
+    AFR_MSG_CLEAN_UP_FAILED, AFR_MSG_UNABLE_TO_FETCH, AFR_MSG_XATTR_SET_FAILED,
+    AFR_MSG_SPLIT_BRAIN_REPLICA, AFR_MSG_INODE_CTX_FAILED,
+    AFR_MSG_LOOKUP_FAILED, AFR_MSG_ALL_SUBVOLS_DOWN,
+    AFR_MSG_RELEASE_LOCK_FAILED, AFR_MSG_CLEAR_TIME_SPLIT_BRAIN,
+    AFR_MSG_READ_FAILED, AFR_MSG_LAUNCH_FAILED, AFR_MSG_READ_SUBVOL_NOT_UP,
+    AFR_MSG_LK_HEAL_DOM, AFR_MSG_NEW_BRICK, AFR_MSG_SPLIT_BRAIN_SET_FAILED,
+    AFR_MSG_SPLIT_BRAIN_DETERMINE_FAILED, AFR_MSG_HEALER_SPAWN_FAILED,
+    AFR_MSG_ADD_CRAWL_EVENT_FAILED, AFR_MSG_NULL_DEREF, AFR_MSG_SET_PEND_XATTR,
+    AFR_MSG_INTERNAL_ATTR);
+/* <ID>_STR: message text paired by name with IDs from GLFS_MSGID() above. */
+#define AFR_MSG_DICT_GET_FAILED_STR "Dict get failed"
+#define AFR_MSG_DICT_SET_FAILED_STR "Dict set failed"
+#define AFR_MSG_HEALER_SPAWN_FAILED_STR "Healer spawn failed"
+#define AFR_MSG_ADD_CRAWL_EVENT_FAILED_STR "Adding crawl event failed"
+#define AFR_MSG_INVALID_ARG_STR "Invalid argument"
+#define AFR_MSG_INDEX_DIR_GET_FAILED_STR "unable to get index-dir on "
+#define AFR_MSG_THIN_ARB_LOOKUP_FAILED_STR "Failed lookup on file"
+#define AFR_MSG_DICT_CREATE_FAILED_STR "Failed to create dict."
+#define AFR_MSG_THIN_ARB_XATTROP_FAILED_STR "Xattrop failed."
+#define AFR_MSG_THIN_ARB_LOC_POP_FAILED_STR                                    \
+    "Failed to populate loc for thin-arbiter"
+#define AFR_MSG_GET_PEND_VAL_STR "Error getting value of pending"
+#define AFR_MSG_THIN_ARB_SKIP_SHD_STR "I am not the good shd. skipping."
+#define AFR_MSG_UNKNOWN_SET_STR "Unknown set"
+#define AFR_MSG_NO_XL_ID_STR "xl does not have id"
+#define AFR_MSG_SELF_HEAL_INFO_START_STR "starting full sweep on"
+#define AFR_MSG_SELF_HEAL_INFO_FINISH_STR "finished full sweep on"
+#define AFR_MSG_INCRE_COUNT_STR "Could not increment the counter."
+#define AFR_MSG_ADD_TO_OUTPUT_FAILED_STR "Could not add to output"
+#define AFR_MSG_SET_TIME_FAILED_STR "Could not set time"
+#define AFR_MSG_GFID_HEAL_MSG_STR "Error setting gfid-heal-msg dict"
+#define AFR_MSG_NO_MAJORITY_TO_RESOLVE_STR                                     \
+    "No majority to resolve gfid split brain"
+#define AFR_MSG_GFID_MISMATCH_DETECTED_STR "Gfid mismatch detected"
+#define AFR_MSG_SELF_HEAL_INFO_STR "performing selfheal"
+#define AFR_MSG_TYPE_MISMATCH_STR "TYPE mismatch"
+#define AFR_MSG_SIZE_POLICY_NOT_APPLICABLE_STR                                 \
+    "Size policy is not applicable to directories."
+#define AFR_MSG_NO_CHILD_SELECTED_STR                                          \
+    "No child selected by favorite-child policy"
+#define AFR_MSG_INVALID_CHILD_STR "Invalid child"
+#define AFR_MSG_RESOLVE_CONFLICTING_DATA_STR                                   \
+    "selected as authentic to resolve conflicting data"
+#define SERROR_GETTING_SRC_BRICK_STR "Error getting the source brick"
+#define SNO_DIFF_IN_MTIME_STR "No difference in mtime"
+#define SNO_BIGGER_FILE_STR "No bigger file"
+#define SALL_BRICKS_UP_TO_RESOLVE_STR                                          \
+    "All the bricks should be up to resolve the gfid split brain"
+#define AFR_MSG_UNLOCK_FAILED_STR "Failed to unlock"
+#define AFR_MSG_POST_OP_FAILED_STR "Post-op on thin-arbiter failed"
+#define AFR_MSG_TA_FRAME_CREATE_FAILED_STR "Failed to create ta_frame"
+#define AFR_MSG_SET_KEY_XATTROP_FAILED_STR "Could not set key during xattrop"
+#define AFR_MSG_BLOCKING_ENTRYLKS_FAILED_STR "Blocking entrylks failed"
+#define AFR_MSG_FSYNC_FAILED_STR "fsync failed"
+#define AFR_MSG_QUORUM_FAIL_STR "quorum is not met"
+#define AFR_MSG_FOP_FAILED_STR "Failing Fop"
+#define AFR_MSG_INVALID_SUBVOL_STR "not a subvolume"
+#define AFR_MSG_VOL_MISCONFIGURED_STR "Volume is dangling"
+#define AFR_MSG_CHILD_MISCONFIGURED_STR                                        \
+    "replicate translator needs more than one subvolume defined"
+#define AFR_MSG_CLEAN_UP_FAILED_STR "Failed to clean up healer threads"
+#define AFR_MSG_QUORUM_OVERRIDE_STR "overriding quorum-count"
+#define AFR_MSG_UNABLE_TO_FETCH_STR                                            \
+    "Unable to fetch afr-pending-xattr option from volfile. Falling back to "  \
+    "using client translator names"
+#define AFR_MSG_NULL_DEREF_STR "possible NULL deref"
+#define AFR_MSG_XATTR_SET_FAILED_STR "Cannot set xattr cookie key"
+#define AFR_MSG_SPLIT_BRAIN_STATUS_STR "Failed to create synctask"
+#define AFR_MSG_SUBVOLS_DOWN_STR "All subvolumes are not up"
+#define AFR_MSG_SPLIT_BRAIN_CHOICE_ERROR_STR                                   \
+    "Failed to cancel split-brain choice"
+#define AFR_MSG_SPLIT_BRAIN_REPLICA_STR                                        \
+    "Cannot set replica. File is not in data/metadata split-brain"
+#define AFR_MSG_INODE_CTX_FAILED_STR "Failed to get inode_ctx"
+#define AFR_MSG_READ_SUBVOL_ERROR_STR "no read subvols"
+#define AFR_MSG_LOCAL_CHILD_STR "selecting local read-child"
+#define AFR_MSG_LOOKUP_FAILED_STR "Failed to lookup/create thin-arbiter id file"
+#define AFR_MSG_TIMER_CREATE_FAIL_STR                                          \
+    "Cannot create timer for delayed initialization"
+#define AFR_MSG_SUBVOL_UP_STR "Subvolume came back up; going online"
+#define AFR_MSG_ALL_SUBVOLS_DOWN_STR                                           \
+    "All subvolumes are down. Going offline until at least one of them is up"
+#define AFR_MSG_RELEASE_LOCK_FAILED_STR "Failed to release lock"
+#define AFR_MSG_INVALID_CHILD_UP_STR "Received child_up from invalid subvolume"
+#define AFR_MSG_QUORUM_MET_STR "Client-quorum is met"
+#define AFR_MSG_EXPUNGING_FILE_OR_DIR_STR "expunging file or dir"
+#define AFR_MSG_SELF_HEAL_FAILED_STR "Invalid"
+#define AFR_MSG_SPLIT_BRAIN_STR "Skipping conservative merge on the file"
+#define AFR_MSG_CLEAR_TIME_SPLIT_BRAIN_STR "clear time split brain"
+#define AFR_MSG_READ_FAILED_STR "Failing read since good brick is down"
+#define AFR_MSG_LAUNCH_FAILED_STR "Failed to launch synctask"
+#define AFR_MSG_READ_SUBVOL_NOT_UP_STR                                         \
+    "read subvolume in this generation is not up"
+#define AFR_MSG_INTERNAL_LKS_FAILED_STR                                        \
+    "Unable to work with lk-owner while attempting fop"
+#define AFR_MSG_LOCK_XLATOR_NOT_LOADED_STR                                     \
+    "subvolume does not support locking. please load features/locks xlator "   \
+    "on server."
+#define AFR_MSG_FD_CTX_GET_FAILED_STR "unable to get fd ctx"
+#define AFR_MSG_INFO_COMMON_STR "fd not open on any subvolumes, aborting."
+#define AFR_MSG_REPLACE_BRICK_STATUS_STR "Couldn't acquire lock on any child."
+#define AFR_MSG_NEW_BRICK_STR "New brick"
+#define AFR_MSG_SPLIT_BRAIN_SET_FAILED_STR                                     \
+    "Failed to set split-brain choice to -1"
+#define AFR_MSG_SPLIT_BRAIN_DETERMINE_FAILED_STR                               \
+    "Failed to determine split-brain. Aborting split-brain-choice set"
+#define AFR_MSG_OPEN_FAIL_STR "Failed to open subvolume"
+#define AFR_MSG_SET_PEND_XATTR_STR "Set of pending xattr"
+#define AFR_MSG_INTERNAL_ATTR_STR "is an internal extended attribute"
+#endif /* !_AFR_MESSAGES_H_ */
diff --git a/xlators/cluster/afr/src/afr-open.c b/xlators/cluster/afr/src/afr-open.c
new file mode 100644
index 00000000000..64856042b65
--- /dev/null
+++ b/xlators/cluster/afr/src/afr-open.c
@@ -0,0 +1,353 @@
+/*
+  Copyright (c) 2008-2012 Red Hat, Inc. <http://www.redhat.com>
+  This file is part of GlusterFS.
+
+  This file is licensed to you under your choice of the GNU Lesser
+  General Public License, version 3 or any later version (LGPLv3 or
+  later), or the GNU General Public License, version 2 (GPLv2), in all
+  cases as published by the Free Software Foundation.
+*/
+
+#include <unistd.h>
+#include <sys/time.h>
+#include <stdlib.h>
+#include <signal.h>
+
+#include <glusterfs/glusterfs.h>
+#include "afr.h"
+#include <glusterfs/dict.h>
+#include <glusterfs/logging.h>
+#include <glusterfs/defaults.h>
+#include <glusterfs/common-utils.h>
+#include <glusterfs/compat-errno.h>
+#include <glusterfs/compat.h>
+#include <glusterfs/byte-order.h>
+#include <glusterfs/statedump.h>
+
+#include "afr-transaction.h"
+
+gf_boolean_t
+afr_is_fd_fixable(fd_t *fd)
+{
+    /* Fixable only if the fd exists, is bound to an inode, is not an
+     * anonymous fd, and its inode already carries a non-null gfid. */
+    if (fd == NULL || fd->inode == NULL)
+        return _gf_false;
+    if (fd_is_anonymous(fd) || gf_uuid_is_null(fd->inode->gfid))
+        return _gf_false;
+
+    return _gf_true;
+}
+
+int
+afr_open_ftruncate_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+                       int32_t op_ret, int32_t op_errno, struct iatt *prebuf,
+                       struct iatt *postbuf, dict_t *xdata)
+{
+    afr_local_t *local = frame->local;
+    /* Unwind open with local's saved result; ftruncate's status is unused. */
+    AFR_STACK_UNWIND(open, frame, local->op_ret, local->op_errno,
+                     local->cont.open.fd, xdata);
+    return 0;
+}
+
+int
+afr_open_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret,
+             int32_t op_errno, fd_t *fd, dict_t *xdata)
+{
+    afr_local_t *local = NULL;
+    int call_count = -1;
+    int child_index = (long)cookie;
+    afr_fd_ctx_t *fd_ctx = NULL;
+    /* Callback for the per-child open wound by afr_open_continue. */
+    local = frame->local;
+    fd_ctx = local->fd_ctx;
+    /* Record this child's reply; the cookie carries the child index. */
+    local->replies[child_index].valid = 1;
+    local->replies[child_index].op_ret = op_ret;
+    local->replies[child_index].op_errno = op_errno;
+    /* Update the shared result and per-child fd state under the lock. */
+    LOCK(&frame->lock);
+    {
+        if (op_ret == -1) {
+            local->op_errno = op_errno;
+            fd_ctx->opened_on[child_index] = AFR_FD_NOT_OPENED;
+        } else {
+            local->op_ret = op_ret;
+            fd_ctx->opened_on[child_index] = AFR_FD_OPENED;
+            if (!local->xdata_rsp && xdata)
+                local->xdata_rsp = dict_ref(xdata);
+        }
+        call_count = --local->call_count;
+    }
+    UNLOCK(&frame->lock);
+    /* Last reply: unwind, or truncate first when O_TRUNC was requested. */
+    if (call_count == 0) {
+        afr_handle_replies_quorum(frame, this);
+        if (local->op_ret == -1) {
+            AFR_STACK_UNWIND(open, frame, local->op_ret, local->op_errno, NULL,
+                             NULL);
+        } else if (fd_ctx->flags & O_TRUNC) {
+            STACK_WIND(frame, afr_open_ftruncate_cbk, this,
+                       this->fops->ftruncate, fd, 0, NULL);
+        } else {
+            AFR_STACK_UNWIND(open, frame, local->op_ret, local->op_errno,
+                             local->cont.open.fd, local->xdata_rsp);
+        }
+    }
+
+    return 0;
+}
+
+int
+afr_open_continue(call_frame_t *frame, xlator_t *this, int err)
+{
+    afr_local_t *local = NULL;
+    afr_private_t *priv = NULL;
+    int call_count = 0;
+    int i = 0;
+    /* Second half of afr_open: fail with @err, or open on each up child. */
+    local = frame->local;
+    priv = this->private;
+    /* O_TRUNC is masked out of the flags wound to the children below. */
+    if (err) {
+        AFR_STACK_UNWIND(open, frame, -1, err, NULL, NULL);
+    } else {
+        local->call_count = AFR_COUNT(local->child_up, priv->child_count);
+        call_count = local->call_count;
+
+        for (i = 0; i < priv->child_count; i++) {
+            if (local->child_up[i]) {
+                STACK_WIND_COOKIE(frame, afr_open_cbk, (void *)(long)i,
+                                  priv->children[i],
+                                  priv->children[i]->fops->open, &local->loc,
+                                  (local->cont.open.flags & ~O_TRUNC),
+                                  local->cont.open.fd, local->xdata_req);
+                if (!--call_count)
+                    break;
+            }
+        }
+    }
+    return 0;
+}
+
+int
+afr_open(call_frame_t *frame, xlator_t *this, loc_t *loc, int32_t flags,
+         fd_t *fd, dict_t *xdata)
+{
+    afr_private_t *priv = NULL;
+    afr_local_t *local = NULL;
+    int spb_subvol = 0;
+    int event_generation = 0;
+    int ret = 0;
+    int32_t op_errno = 0;
+    afr_fd_ctx_t *fd_ctx = NULL;
+    /* Entry point of the open fop. */
+    // We can't let truncation happen outside a transaction.
+    /* O_TRUNC is masked from the child opens; afr_open_cbk winds ftruncate. */
+    priv = this->private;
+
+    local = AFR_FRAME_INIT(frame, op_errno);
+    if (!local)
+        goto out;
+
+    local->op = GF_FOP_OPEN;
+    fd_ctx = afr_fd_ctx_get(fd, this);
+    if (!fd_ctx) {
+        op_errno = ENOMEM;
+        goto out;
+    }
+
+    if (priv->quorum_count && !afr_has_quorum(local->child_up, this, NULL)) {
+        op_errno = afr_quorum_errno(priv);
+        goto out;
+    }
+
+    if (!afr_is_consistent_io_possible(local, priv, &op_errno))
+        goto out;
+    /* Stash everything the continuation and the callbacks need in local. */
+    local->inode = inode_ref(loc->inode);
+    loc_copy(&local->loc, loc);
+    local->fd_ctx = fd_ctx;
+    fd_ctx->flags = flags;
+    if (xdata)
+        local->xdata_req = dict_ref(xdata);
+
+    local->cont.open.flags = flags;
+    local->cont.open.fd = fd_ref(fd);
+    /* Refresh first if get_readable fails and no spb choice is set. */
+    ret = afr_inode_get_readable(frame, local->inode, this, NULL,
+                                 &event_generation, AFR_DATA_TRANSACTION);
+    if ((ret < 0) &&
+        (afr_split_brain_read_subvol_get(local->inode, this, NULL,
+                                         &spb_subvol) == 0) &&
+        spb_subvol < 0) {
+        afr_inode_refresh(frame, this, local->inode, local->inode->gfid,
+                          afr_open_continue);
+    } else {
+        afr_open_continue(frame, this, 0);
+    }
+
+    return 0;
+out:
+    AFR_STACK_UNWIND(open, frame, -1, op_errno, fd, NULL);
+
+    return 0;
+}
+
+int
+afr_openfd_fix_open_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+                        int32_t op_ret, int32_t op_errno, fd_t *fd,
+                        dict_t *xdata)
+{
+    afr_local_t *local = NULL;
+    afr_private_t *priv = NULL;
+    afr_fd_ctx_t *fd_ctx = NULL;
+    int call_count = 0;
+    int child_index = (long)cookie;
+    /* Callback for the open/opendir winds issued by afr_fix_open. */
+    priv = this->private;
+    local = frame->local;
+
+    if (op_ret >= 0) {
+        gf_msg_debug(this->name, 0,
+                     "fd for %s opened "
+                     "successfully on subvolume %s",
+                     local->loc.path, priv->children[child_index]->name);
+    } else {
+        gf_smsg(this->name, fop_log_level(GF_FOP_OPEN, op_errno), op_errno,
+                AFR_MSG_OPEN_FAIL, "path=%s", local->loc.path, "subvolume=%s",
+                priv->children[child_index]->name, NULL);
+    }
+
+    fd_ctx = local->fd_ctx;
+    /* Replace the child's state with the outcome, under the fd lock. */
+    LOCK(&local->fd->lock);
+    {
+        if (op_ret >= 0) {
+            fd_ctx->opened_on[child_index] = AFR_FD_OPENED;
+        } else {
+            fd_ctx->opened_on[child_index] = AFR_FD_NOT_OPENED;
+        }
+    }
+    UNLOCK(&local->fd->lock);
+    /* The frame was created by afr_fix_open: destroy it on the last reply. */
+    call_count = afr_frame_return(frame);
+    if (call_count == 0)
+        AFR_STACK_DESTROY(frame);
+
+    return 0;
+}
+
+static int
+afr_fd_ctx_need_open(fd_t *fd, xlator_t *this, unsigned char *need_open)
+{
+    afr_private_t *conf = this->private;
+    afr_fd_ctx_t *ctx = afr_fd_ctx_get(fd, this);
+    int nr_pending = 0;
+    int child = 0;
+
+    /* Without an fd context there is nothing to track or re-open. */
+    if (ctx == NULL)
+        return 0;
+    /* Fill need_open[] with 1 for every child that is up but has no open
+     * fd yet (0 otherwise), flag those children as AFR_FD_OPENING so a
+     * later call does not select them again, and return their count. */
+    LOCK(&fd->lock);
+    {
+        for (child = 0; child < conf->child_count; child++) {
+            need_open[child] = 0;
+            if (ctx->opened_on[child] != AFR_FD_NOT_OPENED)
+                continue;
+            if (!conf->child_up[child])
+                continue;
+            ctx->opened_on[child] = AFR_FD_OPENING;
+            need_open[child] = 1;
+            nr_pending++;
+        }
+    }
+    UNLOCK(&fd->lock);
+
+    return nr_pending;
+}
+
+void
+afr_fix_open(fd_t *fd, xlator_t *this)
+{
+    afr_private_t *priv = NULL;
+    int i = 0;
+    call_frame_t *frame = NULL;
+    afr_local_t *local = NULL;
+    int ret = -1;
+    int32_t op_errno = 0;
+    afr_fd_ctx_t *fd_ctx = NULL;
+    unsigned char *need_open = NULL;
+    int call_count = 0;
+
+    /* Re-open @fd on each up child where it is not yet open. */
+    priv = this->private;
+    if (!afr_is_fd_fixable(fd))
+        goto out;
+
+    fd_ctx = afr_fd_ctx_get(fd, this);
+    if (!fd_ctx)
+        goto out;
+
+    need_open = alloca0(priv->child_count);
+    /* Flags the chosen children AFR_FD_OPENING; undone at out: on error. */
+    call_count = afr_fd_ctx_need_open(fd, this, need_open);
+    if (!call_count)
+        goto out;
+
+    frame = create_frame(this, this->ctx->pool);
+    if (!frame)
+        goto out;
+
+    local = AFR_FRAME_INIT(frame, op_errno);
+    if (!local)
+        goto out;
+
+    local->loc.inode = inode_ref(fd->inode);
+    ret = loc_path(&local->loc, NULL);
+    if (ret < 0)
+        goto out;
+
+    local->fd = fd_ref(fd);
+    local->fd_ctx = fd_ctx;
+    local->call_count = call_count;
+    gf_msg_debug(this->name, 0, "need open count: %d", call_count);
+
+    for (i = 0; i < priv->child_count; i++) {
+        if (!need_open[i])
+            continue;
+        if (IA_IFDIR == fd->inode->ia_type) {
+            gf_msg_debug(this->name, 0, "opening fd for dir %s on subvolume %s",
+                         local->loc.path, priv->children[i]->name);
+            STACK_WIND_COOKIE(frame, afr_openfd_fix_open_cbk, (void *)(long)i,
+                              priv->children[i],
+                              priv->children[i]->fops->opendir, &local->loc,
+                              local->fd, NULL);
+        } else {
+            gf_msg_debug(this->name, 0,
+                         "opening fd for file %s on subvolume %s",
+                         local->loc.path, priv->children[i]->name);
+            STACK_WIND_COOKIE(frame, afr_openfd_fix_open_cbk, (void *)(long)i,
+                              priv->children[i], priv->children[i]->fops->open,
+                              &local->loc, fd_ctx->flags & (~O_TRUNC),
+                              local->fd, NULL);
+        }
+        if (!--call_count)
+            break;
+    }
+    return;
+out:
+    /* Nothing was wound: un-flag the children so a later call retries. */
+    if (call_count) {
+        LOCK(&fd->lock);
+        for (i = 0; i < priv->child_count; i++)
+            if (need_open[i])
+                fd_ctx->opened_on[i] = AFR_FD_NOT_OPENED;
+        UNLOCK(&fd->lock);
+    }
+    if (frame)
+        AFR_STACK_DESTROY(frame);
+}
diff --git a/xlators/cluster/afr/src/afr-read-txn.c b/xlators/cluster/afr/src/afr-read-txn.c
new file mode 100644
index 00000000000..6fc2c75145c
--- /dev/null
+++ b/xlators/cluster/afr/src/afr-read-txn.c
@@ -0,0 +1,494 @@
+/*
+  Copyright (c) 2013 Red Hat, Inc. <http://www.redhat.com>
+  This file is part of GlusterFS.
+
+  This file is licensed to you under your choice of the GNU Lesser
+  General Public License, version 3 or any later version (LGPLv3 or
+  later), or the GNU General Public License, version 2 (GPLv2), in all
+  cases as published by the Free Software Foundation.
+*/
+
+#include "afr.h"
+#include "afr-transaction.h"
+#include "afr-messages.h"
+
+void
+afr_pending_read_increment(afr_private_t *priv, int child_index)
+{
+    /* Ignore indices outside 0 .. child_count - 1 (e.g. -1: no subvol). */
+    if (child_index < 0 || child_index >= priv->child_count)
+        return;
+    GF_ATOMIC_INC(priv->pending_reads[child_index]);
+}
+
+void
+afr_pending_read_decrement(afr_private_t *priv, int child_index)
+{
+    /* Ignore indices outside 0 .. child_count - 1 (e.g. -1: no subvol). */
+    if (child_index < 0 || child_index >= priv->child_count)
+        return;
+    GF_ATOMIC_DEC(priv->pending_reads[child_index]);
+}
+
+void
+afr_read_txn_wind(call_frame_t *frame, xlator_t *this, int subvol)
+{
+    afr_private_t *conf = this->private;
+    afr_local_t *txn = frame->local;
+    int previous = txn->read_subvol;
+
+    /* Move the pending-read accounting from the subvolume used so far to
+     * @subvol, then hand the frame to the fop's read function. */
+    txn->read_subvol = subvol;
+    afr_pending_read_decrement(conf, previous);
+    afr_pending_read_increment(conf, subvol);
+    txn->readfn(frame, this, subvol);
+}
+
+int
+afr_read_txn_next_subvol(call_frame_t *frame, xlator_t *this)
+{
+    afr_private_t *conf = this->private;
+    afr_local_t *txn = frame->local;
+    int candidate = -1;
+    int idx = 0;
+
+    /* Pick the lowest-indexed readable subvolume that has not been tried
+     * yet. Unreadable subvolumes met on the way are flagged as attempted
+     * without ever being read from. */
+    while (idx < conf->child_count) {
+        if (txn->readable[idx] && !txn->read_attempted[idx]) {
+            candidate = idx;
+            break;
+        }
+
+        if (!txn->readable[idx])
+            txn->read_attempted[idx] = 1;
+
+        idx++;
+    }
+
+    /* When every readable subvolume has already been tried, candidate
+     * stays -1, which is an indication we have run out of readable
+     * subvols. */
+    if (candidate >= 0)
+        txn->read_attempted[candidate] = 1;
+
+    /* Wind unconditionally, even with -1. */
+    afr_read_txn_wind(frame, this, candidate);
+
+    return 0;
+}
+
+static int
+afr_ta_read_txn_done(int ret, call_frame_t *ta_frame, void *opaque)
+{
+    STACK_DESTROY(ta_frame->root); /* synctask done: drop the ta_frame stack */
+    return 0;
+}
+
+static int
+afr_ta_read_txn(void *opaque)
+{
+    call_frame_t *frame = NULL;
+    xlator_t *this = NULL;
+    int read_subvol = -1;
+    int query_child = AFR_CHILD_UNKNOWN;
+    int possible_bad_child = AFR_CHILD_UNKNOWN;
+    int ret = 0;
+    int op_errno = ENOMEM;
+    afr_local_t *local = NULL;
+    afr_private_t *priv = NULL;
+    struct gf_flock flock = {
+        0,
+    };
+    dict_t *xdata_req = NULL;
+    dict_t *xdata_rsp = NULL;
+    int **pending = NULL;
+    loc_t loc = {
+        0,
+    };
+    /* Synctask body: decide whether query_child may serve the read. */
+    frame = (call_frame_t *)opaque;
+    this = frame->this;
+    local = frame->local;
+    priv = this->private;
+    query_child = local->read_txn_query_child;
+    /* The data brick that is not being queried is the possibly bad one. */
+    if (query_child == AFR_CHILD_ZERO) {
+        possible_bad_child = AFR_CHILD_ONE;
+    } else if (query_child == AFR_CHILD_ONE) {
+        possible_bad_child = AFR_CHILD_ZERO;
+    } else {
+        /*read_txn_query_child is AFR_CHILD_UNKNOWN*/
+        goto out;
+    }
+
+    /* Ask the query_child to see if it blames the possibly bad one. */
+    xdata_req = dict_new();
+    if (!xdata_req)
+        goto out;
+
+    pending = afr_matrix_create(priv->child_count, AFR_NUM_CHANGE_LOGS);
+    if (!pending)
+        goto out;
+
+    ret = afr_set_pending_dict(priv, xdata_req, pending);
+    if (ret < 0)
+        goto out;
+
+    if (local->fd) {
+        ret = syncop_fxattrop(priv->children[query_child], local->fd,
+                              GF_XATTROP_ADD_ARRAY, xdata_req, NULL, &xdata_rsp,
+                              NULL);
+    } else {
+        ret = syncop_xattrop(priv->children[query_child], &local->loc,
+                             GF_XATTROP_ADD_ARRAY, xdata_req, NULL, &xdata_rsp,
+                             NULL);
+    }
+    if (ret || !xdata_rsp) {
+        gf_msg(this->name, GF_LOG_ERROR, -ret, AFR_MSG_THIN_ARB,
+               "Failed xattrop for gfid %s on %s",
+               uuid_utoa(local->inode->gfid),
+               priv->children[query_child]->name);
+        op_errno = -ret; /* NOTE(review): 0 if only xdata_rsp is NULL */
+        goto out;
+    }
+    /* query_child blames the other brick: read from query_child. */
+    if (afr_ta_dict_contains_pending_xattr(xdata_rsp, priv,
+                                           possible_bad_child)) {
+        read_subvol = query_child;
+        goto out;
+    }
+    dict_unref(xdata_rsp);
+    xdata_rsp = NULL;
+
+    /* It doesn't. So query thin-arbiter to see if it blames any data brick. */
+    ret = afr_fill_ta_loc(this, &loc, _gf_true);
+    if (ret) {
+        gf_msg(this->name, GF_LOG_ERROR, -ret, AFR_MSG_THIN_ARB,
+               "Failed to populate thin-arbiter loc for: %s.", loc.name);
+        goto out;
+    }
+    flock.l_type = F_WRLCK; /*start and length are already zero. */
+    ret = syncop_inodelk(priv->children[THIN_ARBITER_BRICK_INDEX],
+                         AFR_TA_DOM_MODIFY, &loc, F_SETLKW, &flock, NULL, NULL);
+    if (ret) {
+        gf_msg(this->name, GF_LOG_ERROR, -ret, AFR_MSG_THIN_ARB,
+               "gfid:%s: Failed to get AFR_TA_DOM_MODIFY lock on %s.",
+               uuid_utoa(local->inode->gfid),
+               priv->pending_key[THIN_ARBITER_BRICK_INDEX]);
+        op_errno = -ret;
+        goto out;
+    }
+    /* From here on the inodelk is held: failures must go through unlock. */
+    ret = syncop_xattrop(priv->children[THIN_ARBITER_BRICK_INDEX], &loc,
+                         GF_XATTROP_ADD_ARRAY, xdata_req, NULL, &xdata_rsp,
+                         NULL);
+    if (ret || !xdata_rsp) {
+        gf_msg(this->name, GF_LOG_ERROR, -ret, AFR_MSG_THIN_ARB,
+               "gfid:%s: Failed xattrop on %s.", uuid_utoa(local->inode->gfid),
+               priv->pending_key[THIN_ARBITER_BRICK_INDEX]);
+        op_errno = -ret;
+        goto unlock;
+    }
+    /* Read from query_child only if thin-arbiter does not blame it. */
+    if (!afr_ta_dict_contains_pending_xattr(xdata_rsp, priv, query_child)) {
+        read_subvol = query_child;
+    } else {
+        gf_msg(this->name, GF_LOG_ERROR, EIO, AFR_MSG_THIN_ARB,
+               "Failing read for gfid %s since good brick %s is down",
+               uuid_utoa(local->inode->gfid),
+               priv->children[possible_bad_child]->name);
+        op_errno = EIO;
+    }
+
+unlock:
+    flock.l_type = F_UNLCK;
+    ret = syncop_inodelk(priv->children[THIN_ARBITER_BRICK_INDEX],
+                         AFR_TA_DOM_MODIFY, &loc, F_SETLK, &flock, NULL, NULL);
+    if (ret) {
+        gf_msg(this->name, GF_LOG_ERROR, -ret, AFR_MSG_THIN_ARB,
+               "gfid:%s: Failed to unlock AFR_TA_DOM_MODIFY lock on "
+               "%s.",
+               uuid_utoa(local->inode->gfid),
+               priv->pending_key[THIN_ARBITER_BRICK_INDEX]);
+    }
+out:
+    if (xdata_req)
+        dict_unref(xdata_req);
+    if (xdata_rsp)
+        dict_unref(xdata_rsp);
+    if (pending)
+        afr_matrix_cleanup(pending, priv->child_count);
+    loc_wipe(&loc);
+    /* No subvol chosen: record the error before winding with -1. */
+    if (read_subvol == -1) {
+        local->op_ret = -1;
+        local->op_errno = op_errno;
+    }
+    afr_read_txn_wind(frame, this, read_subvol);
+    return ret;
+}
+
+void
+afr_ta_read_txn_synctask(call_frame_t *frame, xlator_t *this)
+{
+    call_frame_t *ta_frame = NULL;
+    afr_local_t *local = NULL;
+    int ret = 0;
+    /* Run afr_ta_read_txn in a synctask; on failure wind with subvol -1. */
+    local = frame->local;
+    ta_frame = afr_ta_frame_create(this);
+    if (!ta_frame) {
+        local->op_ret = -1;
+        local->op_errno = ENOMEM;
+        gf_msg(this->name, GF_LOG_ERROR, ENOMEM, AFR_MSG_THIN_ARB,
+               "Failed to create ta_frame");
+        goto out;
+    }
+    ret = synctask_new(this->ctx->env, afr_ta_read_txn, afr_ta_read_txn_done,
+                       ta_frame, frame); /* frame is the task's opaque */
+    if (ret) {
+        gf_msg(this->name, GF_LOG_ERROR, ENOMEM, AFR_MSG_THIN_ARB,
+               "Failed to launch "
+               "afr_ta_read_txn synctask for gfid %s.",
+               uuid_utoa(local->inode->gfid));
+        local->op_ret = -1;
+        local->op_errno = ENOMEM;
+        STACK_DESTROY(ta_frame->root);
+        goto out;
+    }
+    return;
+out:
+    afr_read_txn_wind(frame, this, -1);
+}
+
+int
+afr_read_txn_refresh_done(call_frame_t *frame, xlator_t *this, int err)
+{
+    afr_private_t *priv = NULL;
+    afr_local_t *local = NULL;
+    int read_subvol = -1;
+    inode_t *inode = NULL;
+    int ret = -1;
+    int spb_subvol = -1;
+    /* Continuation passed to afr_inode_refresh() by the read transaction. */
+    local = frame->local;
+    inode = local->inode;
+    priv = this->private;
+    /* Only EINVAL on a thin-arbiter volume takes the synctask path. */
+    if (err) {
+        if (!priv->thin_arbiter_count)
+            goto readfn;
+        if (err != EINVAL)
+            goto readfn;
+        /* We need to query the good bricks and/or thin-arbiter.*/
+        afr_ta_read_txn_synctask(frame, this);
+        return 0;
+    }
+
+    read_subvol = afr_read_subvol_select_by_policy(inode, this, local->readable,
+                                                   NULL);
+    if (read_subvol == -1) {
+        err = EIO;
+        goto readfn;
+    }
+    /* The policy's pick was already tried: move on to another subvol. */
+    if (local->read_attempted[read_subvol]) {
+        afr_read_txn_next_subvol(frame, this);
+        return 0;
+    }
+
+    local->read_attempted[read_subvol] = 1;
+readfn:
+    if (read_subvol == -1) {
+        ret = afr_split_brain_read_subvol_get(inode, this, frame, &spb_subvol);
+        if ((ret == 0) && spb_subvol >= 0)
+            read_subvol = spb_subvol;
+    }
+    /* Still no subvol: fail with err via the macro (defined elsewhere). */
+    if (read_subvol == -1) {
+        AFR_SET_ERROR_AND_CHECK_SPLIT_BRAIN(-1, err);
+    }
+    afr_read_txn_wind(frame, this, read_subvol);
+
+    return 0;
+}
+
+int
+afr_read_txn_continue(call_frame_t *frame, xlator_t *this, int subvol)
+{
+    afr_local_t *txn = frame->local;
+
+    /* Once the inode refresh has been spent, just move on to the next
+     * untried readable subvolume. @subvol is not used here. */
+    if (txn->refreshed) {
+        afr_read_txn_next_subvol(frame, this);
+        return 0;
+    }
+    /* First retry: mark the refresh as used and refresh the inode ctx. */
+    txn->refreshed = _gf_true;
+    afr_inode_refresh(frame, this, txn->inode, NULL,
+                      afr_read_txn_refresh_done);
+    return 0;
+}
+
+/* afr_read_txn_wipe:
+
+   clean internal variables in @local in order to make
+   it possible to call afr_read_txn() multiple times from
+   the same frame
+*/
+
+void
+afr_read_txn_wipe(call_frame_t *frame, xlator_t *this)
+{
+    afr_private_t *conf = this->private;
+    afr_local_t *txn = frame->local;
+    int child = conf->child_count;
+
+    /* Forget the previous read function and drop the inode reference
+     * held from an earlier transaction (the pointer itself is left as
+     * is; afr_read_txn() overwrites it right after calling this). */
+    txn->readfn = NULL;
+    if (txn->inode != NULL)
+        inode_unref(txn->inode);
+
+    /* Clear the per-child bookkeeping. */
+    while (child-- > 0) {
+        txn->readable[child] = 0;
+        txn->read_attempted[child] = 0;
+    }
+}
+
+/*
+  afr_read_txn:
+
+  This is the read transaction function. The way it works:
+
+  - Determine read-subvolume from inode ctx.
+
+  - If read-subvolume's generation was stale, refresh ctx once by
+    calling afr_inode_refresh()
+
+    Else make an attempt to read on read-subvolume.
+
+  - If attempted read on read-subvolume fails, refresh ctx once
+    by calling afr_inode_refresh()
+
+  - After ctx refresh, query read-subvolume freshly and attempt
+    read once.
+
+  - If read fails, try every other readable[] subvolume before
+    finally giving up. readable[] elements are set by afr_inode_refresh()
+    based on dirty and pending flags.
+
+  - If file is in split brain in the backend, generation will be
+    kept 0 by afr_inode_refresh() and readable[] will be set 0 for
+    all elements. Therefore reads always fail.
+*/
+
+int
+afr_read_txn(call_frame_t *frame, xlator_t *this, inode_t *inode,
+             afr_read_txn_wind_t readfn, afr_transaction_type type)
+{
+    afr_local_t *local = NULL;
+    afr_private_t *priv = NULL;
+    unsigned char *data = NULL;
+    unsigned char *metadata = NULL;
+    int read_subvol = -1;
+    int event_generation = 0;
+    int ret = -1;
+    /* Start a read txn of @type on @inode; @readfn is given the subvol. */
+    priv = this->private;
+    local = frame->local;
+    data = alloca0(priv->child_count);
+    metadata = alloca0(priv->child_count);
+
+    afr_read_txn_wipe(frame, this);
+
+    local->readfn = readfn;
+    local->inode = inode_ref(inode);
+    local->is_read_txn = _gf_true;
+    local->transaction.type = type;
+    /* The early failures below wind with read_subvol still at -1. */
+    if (priv->quorum_count && !afr_has_quorum(local->child_up, this, NULL)) {
+        local->op_ret = -1;
+        local->op_errno = afr_quorum_errno(priv);
+        goto read;
+    }
+
+    if (!afr_is_consistent_io_possible(local, priv, &local->op_errno)) {
+        local->op_ret = -1;
+        goto read;
+    }
+    /* op_errno is a positive errno here, as in the quorum check above. */
+    if (priv->thin_arbiter_count && !afr_ta_has_quorum(priv, local)) {
+        local->op_ret = -1;
+        local->op_errno = afr_quorum_errno(priv);
+        goto read;
+    }
+    /* Thin-arbiter with a child down: let the synctask pick the subvol. */
+    if (priv->thin_arbiter_count &&
+        AFR_COUNT(local->child_up, priv->child_count) != priv->child_count) {
+        if (local->child_up[0]) {
+            local->read_txn_query_child = AFR_CHILD_ZERO;
+        } else if (local->child_up[1]) {
+            local->read_txn_query_child = AFR_CHILD_ONE;
+        }
+        afr_ta_read_txn_synctask(frame, this);
+        return 0;
+    }
+
+    ret = afr_inode_read_subvol_get(inode, this, data, metadata,
+                                    &event_generation);
+    if (ret == -1)
+        /* very first transaction on this inode */
+        goto refresh;
+    AFR_INTERSECT(local->readable, data, metadata, priv->child_count);
+
+    gf_msg_debug(this->name, 0,
+                 "%s: generation now vs cached: %d, "
+                 "%d",
+                 uuid_utoa(inode->gfid), local->event_generation,
+                 event_generation);
+    if (afr_is_inode_refresh_reqd(inode, this, local->event_generation,
+                                  event_generation))
+        /* servers have disconnected / reconnected, and possibly
+           rebooted, very likely changing the state of freshness
+           of copies */
+        goto refresh;
+
+    read_subvol = afr_read_subvol_select_by_policy(inode, this, local->readable,
+                                                   NULL);
+    /* Valid subvols are 0 .. child_count - 1; child_up[] is indexed below. */
+    if (read_subvol < 0 || read_subvol >= priv->child_count) {
+        gf_msg_debug(this->name, 0,
+                     "Unreadable subvolume %d found "
+                     "with event generation %d for gfid %s.",
+                     read_subvol, event_generation, uuid_utoa(inode->gfid));
+        goto refresh;
+    }
+
+    if (!local->child_up[read_subvol]) {
+        /* should never happen, just in case */
+        gf_msg(this->name, GF_LOG_WARNING, 0, AFR_MSG_READ_SUBVOL_ERROR,
+               "subvolume %d is the "
+               "read subvolume in this generation, but is not up",
+               read_subvol);
+        goto refresh;
+    }
+
+    local->read_attempted[read_subvol] = 1;
+
+read:
+    afr_read_txn_wind(frame, this, read_subvol);
+
+    return 0;
+
+refresh:
+    afr_inode_refresh(frame, this, inode, NULL, afr_read_txn_refresh_done);
+
+    return 0;
+}
diff --git a/xlators/cluster/afr/src/afr-self-heal-common.c b/xlators/cluster/afr/src/afr-self-heal-common.c
index f418085e53e..a580a1584cc 100644
--- a/xlators/cluster/afr/src/afr-self-heal-common.c
+++ b/xlators/cluster/afr/src/afr-self-heal-common.c
@@ -1,1073 +1,2934 @@
 /*
-  Copyright (c) 2008-2009 Z RESEARCH, Inc. <http://www.zresearch.com>
+  Copyright (c) 2013 Red Hat, Inc. <http://www.redhat.com>
   This file is part of GlusterFS.
 
-  GlusterFS is free software; you can redistribute it and/or modify
-  it under the terms of the GNU General Public License as published
-  by the Free Software Foundation; either version 3 of the License,
-  or (at your option) any later version.
-
-  GlusterFS is distributed in the hope that it will be useful, but
-  WITHOUT ANY WARRANTY; without even the implied warranty of
-  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
-  General Public License for more details.
-
-  You should have received a copy of the GNU General Public License
-  along with this program.  If not, see
-  <http://www.gnu.org/licenses/>.
+  This file is licensed to you under your choice of the GNU Lesser
+  General Public License, version 3 or any later version (LGPLv3 or
+  later), or the GNU General Public License, version 2 (GPLv2), in all
+  cases as published by the Free Software Foundation.
 */
 
-#include "glusterfs.h"
-#include "xlator.h"
-#include "byte-order.h"
-
 #include "afr.h"
-#include "afr-transaction.h"
-#include "afr-self-heal-common.h"
 #include "afr-self-heal.h"
+#include <glusterfs/byte-order.h>
+#include "protocol-common.h"
+#include "afr-messages.h"
+#include <glusterfs/events.h>
 
-
-/**
- * select_source - select a source and return it
- * TODO: take into account option 'favorite-child'
- */
+void
+afr_heal_synctask(xlator_t *this, afr_local_t *local);
 
 int
-afr_sh_select_source (int sources[], int child_count)
+afr_lookup_and_heal_gfid(xlator_t *this, inode_t *parent, const char *name,
+                         inode_t *inode, struct afr_reply *replies, int source,
+                         unsigned char *sources, void *gfid, int *gfid_idx)
 {
-	int i;
-	for (i = 0; i < child_count; i++)
-		if (sources[i])
-			return i;
+    afr_private_t *priv = NULL;
+    call_frame_t *frame = NULL;
+    afr_local_t *local = NULL;
+    unsigned char *wind_on = NULL;
+    ia_type_t ia_type = IA_INVAL;
+    dict_t *xdata = NULL;
+    loc_t loc = {
+        0,
+    };
+    int ret = 0;
+    int i = 0;
+
+    priv = this->private;
+    wind_on = alloca0(priv->child_count);
+    if (source >= 0 && replies[source].valid && replies[source].op_ret == 0)
+        ia_type = replies[source].poststat.ia_type;
+
+    if (ia_type != IA_INVAL)
+        goto heal;
+
+    /* If ia_type is still invalid, it means either
+     * (a)'source' was -1, i.e. parent dir pending xattrs are in split-brain
+     * (or) (b) The parent dir pending xattrs are all zeroes (i.e. all bricks
+     * are sources) and the 'source' we selected earlier might be the one where
+     * the file is not actually present.
+     *
+     * In both cases, let us pick a brick with a successful reply and use its
+     * ia_type.
+     * */
+    for (i = 0; i < priv->child_count; i++) {
+        if (source == -1) {
+            /* case (a) above. */
+            if (replies[i].valid && replies[i].op_ret == 0 &&
+                replies[i].poststat.ia_type != IA_INVAL) {
+                ia_type = replies[i].poststat.ia_type;
+                break;
+            }
+        } else {
+            /* case (b) above. */
+            if (i == source)
+                continue;
+            if (sources[i] && replies[i].valid && replies[i].op_ret == 0 &&
+                replies[i].poststat.ia_type != IA_INVAL) {
+                ia_type = replies[i].poststat.ia_type;
+                break;
+            }
+        }
+    }
+
+heal:
+    /* gfid heal on those subvolumes that do not have gfid associated
+     * with the inode and update those replies.
+     */
+    for (i = 0; i < priv->child_count; i++) {
+        if (!replies[i].valid || replies[i].op_ret != 0)
+            continue;
+
+        if (gf_uuid_is_null(gfid) &&
+            !gf_uuid_is_null(replies[i].poststat.ia_gfid) &&
+            replies[i].poststat.ia_type == ia_type)
+            gfid = replies[i].poststat.ia_gfid;
+
+        if (!gf_uuid_is_null(replies[i].poststat.ia_gfid) ||
+            replies[i].poststat.ia_type != ia_type)
+            continue;
+
+        wind_on[i] = 1;
+    }
+
+    if (AFR_COUNT(wind_on, priv->child_count) == 0)
+        return 0;
+
+    xdata = dict_new();
+    if (!xdata) {
+        ret = -ENOMEM;
+        goto out;
+    }
+
+    ret = dict_set_gfuuid(xdata, "gfid-req", gfid, true);
+    if (ret) {
+        ret = -ENOMEM;
+        goto out;
+    }
+
+    frame = afr_frame_create(this, &ret);
+    if (!frame) {
+        ret = -ret;
+        goto out;
+    }
+
+    local = frame->local;
+    loc.parent = inode_ref(parent);
+    gf_uuid_copy(loc.pargfid, parent->gfid);
+    loc.name = name;
+    loc.inode = inode_ref(inode);
+
+    AFR_ONLIST(wind_on, frame, afr_selfheal_discover_cbk, lookup, &loc, xdata);
+
+    for (i = 0; i < priv->child_count; i++) {
+        if (!wind_on[i])
+            continue;
+        afr_reply_wipe(&replies[i]);
+        afr_reply_copy(&replies[i], &local->replies[i]);
+    }
+    if (gfid_idx && (*gfid_idx == -1)) {
+        /*Pick a brick where the gfid heal was successful.*/
+        for (i = 0; i < priv->child_count; i++) {
+            if (!wind_on[i])
+                continue;
+            if (replies[i].valid && replies[i].op_ret == 0 &&
+                !gf_uuid_is_null(replies[i].poststat.ia_gfid)) {
+                *gfid_idx = i;
+                break;
+            }
+        }
+    }
+out:
+    if (gfid_idx && (*gfid_idx == -1) && (ret == 0) && local) {
+        ret = -afr_final_errno(local, priv);
+    }
+    loc_wipe(&loc);
+    if (frame)
+        AFR_STACK_DESTROY(frame);
+    if (xdata)
+        dict_unref(xdata);
+
+    return ret;
+}
 
-	return -1;
+int
+afr_gfid_sbrain_source_from_src_brick(xlator_t *this, struct afr_reply *replies,
+                                      char *src_brick)
+{
+    /* Return the index of the child named src_brick, provided that child
+     * gave a valid reply whose op_ret is not -1; otherwise return -1. */
+    afr_private_t *conf = this->private;
+    int idx = 0;
+    while (idx < conf->child_count) {
+        if (replies[idx].valid && replies[idx].op_ret != -1 &&
+            !strcmp(conf->children[idx]->name, src_brick))
+            return idx;
+        idx++;
+    }
+    return -1;
 }
 
+int
+afr_selfheal_gfid_mismatch_by_majority(struct afr_reply *replies,
+                                       int child_count)
+{
+    /* Return the first child whose gfid is shared by more than half of
+     * the children, or -1 if there is none. Only replies that are valid
+     * and did not fail (op_ret != -1) count, as candidate or as voter. */
+    int i = 0, j = 0, votes = 0;
+
+    for (i = 0; i < child_count; i++) {
+        if (!replies[i].valid || replies[i].op_ret == -1)
+            continue;
+        votes = 1;
+        for (j = i + 1; j < child_count; j++) {
+            if (replies[j].valid && replies[j].op_ret != -1 &&
+                !gf_uuid_compare(replies[i].poststat.ia_gfid,
+                                 replies[j].poststat.ia_gfid))
+                votes++;
+            if (votes > child_count / 2)
+                return i;
+        }
+    }
+    return -1;
+}
 
-/**
- * sink_count - return number of sinks in sources array
- */
+int
+afr_gfid_sbrain_source_from_bigger_file(struct afr_reply *replies,
+                                        int child_count)
+{
+    /* Index of the strictly largest file among usable replies; -1 when
+     * the largest size is shared (or is zero) and no brick stands out. */
+    uint64_t largest = 0;
+    int idx = 0, winner = -1;
+    for (idx = 0; idx < child_count; idx++) {
+        if (!replies[idx].valid || replies[idx].op_ret == -1)
+            continue;
+        if (replies[idx].poststat.ia_size > largest) {
+            largest = replies[idx].poststat.ia_size;
+            winner = idx;
+        } else if (replies[idx].poststat.ia_size == largest) {
+            winner = -1;
+        }
+    }
+    return winner;
+}
 
 int
-afr_sh_sink_count (int sources[], int child_count)
+afr_gfid_sbrain_source_from_latest_mtime(struct afr_reply *replies,
+                                         int child_count)
 {
-	int i;
-	int sinks = 0;
-	for (i = 0; i < child_count; i++)
-		if (!sources[i])
-			sinks++;
-	return sinks;
+    /* Strictly newest mtime (sec, then nsec) wins; a tie for newest -> -1. */
+    int i = 0, src = -1;
+    int64_t mtime = 0; /* 64-bit: a uint32_t would truncate a wider ia_mtime */
+    uint32_t mtime_nsec = 0;
+
+    for (i = 0; i < child_count; i++) {
+        if (!replies[i].valid || replies[i].op_ret != 0)
+            continue;
+        if ((mtime < replies[i].poststat.ia_mtime) ||
+            ((mtime == replies[i].poststat.ia_mtime) &&
+             (mtime_nsec < replies[i].poststat.ia_mtime_nsec))) {
+            src = i;
+            mtime = replies[i].poststat.ia_mtime;
+            mtime_nsec = replies[i].poststat.ia_mtime_nsec;
+        } else if ((mtime == replies[i].poststat.ia_mtime) &&
+                   (mtime_nsec == replies[i].poststat.ia_mtime_nsec)) {
+            src = -1;
+        }
+    }
+    return src;
 }
 
 int
-afr_sh_source_count (int sources[], int child_count)
+afr_gfid_split_brain_source(xlator_t *this, struct afr_reply *replies,
+                            inode_t *inode, uuid_t pargfid, const char *bname,
+                            int src_idx, int child_idx,
+                            unsigned char *locked_on, int *src, dict_t *xdata)
 {
-	int i;
-	int nsource = 0;
+    /* Choose the brick to heal a gfid split-brain from and store it in *src.
+     * Uses "heal-op" from xdata when it is present, else the configured
+     * fav_child_policy; AFR_COUNT(locked_on) must equal child_count.
+     * Returns 0 once a source is chosen, -1 (after logging) otherwise. */
+    afr_private_t *priv = NULL;
+    char g1[64] = {0};
+    char g2[64] = {0};
+    int up_count = 0;
+    int heal_op = -1;
+    int ret = -1;
+    char *src_brick = NULL;
+
+    *src = -1;
+    priv = this->private;
+    up_count = AFR_COUNT(locked_on, priv->child_count);
+    if (up_count != priv->child_count) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, AFR_MSG_SPLIT_BRAIN,
+               "All the bricks should be up to resolve the gfid split "
+               "brain");
+        if (xdata) {
+            ret = dict_set_sizen_str_sizen(xdata, "gfid-heal-msg",
+                                           SALL_BRICKS_UP_TO_RESOLVE);
+            if (ret)
+                gf_msg(this->name, GF_LOG_ERROR, 0, AFR_MSG_DICT_SET_FAILED,
+                       "Error setting"
+                       " gfid-heal-msg dict");
+        }
+        goto out;
+    }
+
+    if (xdata) {
+        ret = dict_get_int32_sizen(xdata, "heal-op", &heal_op);
+        if (ret)
+            goto fav_child;
+    } else {
+        goto fav_child;
+    }
+
+    switch (heal_op) {
+        case GF_SHD_OP_SBRAIN_HEAL_FROM_BIGGER_FILE:
+            *src = afr_gfid_sbrain_source_from_bigger_file(replies,
+                                                           priv->child_count);
+            if (*src == -1) {
+                gf_msg(this->name, GF_LOG_ERROR, 0, AFR_MSG_SPLIT_BRAIN,
+                       SNO_BIGGER_FILE);
+                if (xdata) {
+                    ret = dict_set_sizen_str_sizen(xdata, "gfid-heal-msg",
+                                                   SNO_BIGGER_FILE);
+                    if (ret)
+                        gf_msg(this->name, GF_LOG_ERROR, 0,
+                               AFR_MSG_DICT_SET_FAILED,
+                               "Error"
+                               " setting gfid-heal-msg dict");
+                }
+            }
+            break;
+
+        case GF_SHD_OP_SBRAIN_HEAL_FROM_LATEST_MTIME:
+            *src = afr_gfid_sbrain_source_from_latest_mtime(replies,
+                                                            priv->child_count);
+            if (*src == -1) {
+                gf_msg(this->name, GF_LOG_ERROR, 0, AFR_MSG_SPLIT_BRAIN,
+                       SNO_DIFF_IN_MTIME);
+                if (xdata) {
+                    ret = dict_set_sizen_str_sizen(xdata, "gfid-heal-msg",
+                                                   SNO_DIFF_IN_MTIME);
+                    if (ret)
+                        gf_msg(this->name, GF_LOG_ERROR, 0,
+                               AFR_MSG_DICT_SET_FAILED,
+                               "Error"
+                               " setting gfid-heal-msg dict");
+                }
+            }
+            break;
+
+        case GF_SHD_OP_SBRAIN_HEAL_FROM_BRICK:
+            ret = dict_get_str_sizen(xdata, "child-name", &src_brick);
+            if (ret) {
+                gf_msg(this->name, GF_LOG_ERROR, 0, AFR_MSG_SPLIT_BRAIN,
+                       "Error getting the source "
+                       "brick");
+                break;
+            }
+            *src = afr_gfid_sbrain_source_from_src_brick(this, replies,
+                                                         src_brick);
+            if (*src == -1) {
+                gf_msg(this->name, GF_LOG_ERROR, 0, AFR_MSG_SPLIT_BRAIN,
+                       SERROR_GETTING_SRC_BRICK);
+                if (xdata) {
+                    ret = dict_set_sizen_str_sizen(xdata, "gfid-heal-msg",
+                                                   SERROR_GETTING_SRC_BRICK);
+                    if (ret)
+                        gf_msg(this->name, GF_LOG_ERROR, 0,
+                               AFR_MSG_DICT_SET_FAILED,
+                               "Error"
+                               " setting gfid-heal-msg dict");
+                }
+            }
+            break;
+
+        default:
+            break;
+    }
+    goto out;
+
+fav_child:
+    switch (priv->fav_child_policy) {
+        case AFR_FAV_CHILD_BY_SIZE:
+            *src = afr_sh_fav_by_size(this, replies, inode);
+            break;
+        case AFR_FAV_CHILD_BY_MTIME:
+            *src = afr_sh_fav_by_mtime(this, replies, inode);
+            break;
+        case AFR_FAV_CHILD_BY_CTIME:
+            *src = afr_sh_fav_by_ctime(this, replies, inode);
+            break;
+        case AFR_FAV_CHILD_BY_MAJORITY:
+            if (priv->child_count != 2)
+                *src = afr_selfheal_gfid_mismatch_by_majority(
+                    replies, priv->child_count);
+            else
+                *src = -1;
+
+            if (*src == -1) {
+                gf_msg(this->name, GF_LOG_ERROR, 0, AFR_MSG_SPLIT_BRAIN,
+                       "No majority to resolve "
+                       "gfid split brain");
+            }
+            break;
+        default:
+            break;
+    }
 
-	for (i = 0; i < child_count; i++)
-		if (sources[i])
-			nsource++;
-	return nsource;
+out:
+    if (*src == -1) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, AFR_MSG_SPLIT_BRAIN,
+               "Gfid mismatch detected for <gfid:%s>/%s>, %s on %s and"
+               " %s on %s.",
+               uuid_utoa(pargfid), bname,
+               uuid_utoa_r(replies[child_idx].poststat.ia_gfid, g1),
+               priv->children[child_idx]->name,
+               uuid_utoa_r(replies[src_idx].poststat.ia_gfid, g2),
+               priv->children[src_idx]->name);
+        gf_event(EVENT_AFR_SPLIT_BRAIN,
+                 "client-pid=%d;"
+                 "subvol=%s;type=gfid;file="
+                 "<gfid:%s>/%s>;count=2;child-%d=%s;gfid-%d=%s;"
+                 "child-%d=%s;gfid-%d=%s",
+                 this->ctx->cmd_args.client_pid, this->name, uuid_utoa(pargfid),
+                 bname, child_idx, priv->children[child_idx]->name, child_idx,
+                 uuid_utoa_r(replies[child_idx].poststat.ia_gfid, g1), src_idx,
+                 priv->children[src_idx]->name, src_idx,
+                 uuid_utoa_r(replies[src_idx].poststat.ia_gfid, g2));
+        return -1;
+    }
+    return 0;
 }
 
-
 int
-afr_sh_supress_errenous_children (int sources[], int child_errno[],
-				  int child_count)
+afr_selfheal_post_op_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+                         int op_ret, int op_errno, dict_t *xattr, dict_t *xdata)
 {
-	int i = 0;
+    afr_local_t *local = NULL;
 
-	for (i = 0; i < child_count; i++) {
-		if (child_errno[i] && sources[i]) {
-			sources[i] = 0;
-		}
-	}
+    local = frame->local;
 
-	return 0;
-}
+    local->op_ret = op_ret;
+    local->op_errno = op_errno;
+    syncbarrier_wake(&local->barrier);
 
+    return 0;
+}
 
 int
-afr_sh_supress_empty_children (int sources[], dict_t *xattr[],
-			       struct stat *buf,
-			       int child_count, const char *key)
-{
-	int      i = 0;
-	int32_t *pending = NULL;
-	int      ret = 0;
-	int      all_xattr_missing = 1;
-
-	/* if the file was created by afr with xattrs */
-	for (i = 0; i < child_count; i++) {
-		if (!xattr[i])
-			continue;
-
-		ret = dict_get_ptr (xattr[i], (char *)key, VOID(&pending));
-		if (ret != 0) {
-			continue;
-		}
-
-		all_xattr_missing = 0;
-		break;
-	}
-
-	if (all_xattr_missing) {
-		/* supress 0byte files.. this avoids empty file created
-		   by dir selfheal to overwrite the 'good' file */
-		for (i = 0; i < child_count; i++) {
-			if (!buf[i].st_size)
-				sources[i] = 0;
-		}
-		goto out;
-	}
-
-
-	for (i = 0; i < child_count; i++) {
-		if (!xattr[i]) {
-			sources[i] = 0;
-			continue;
-		}
-
-		ret = dict_get_ptr (xattr[i], (char *)key, VOID(&pending));
-		if (ret != 0) {
-			sources[i] = 0;
-			continue;
-		}
-
-		if (!pending) {
-			sources[i] = 0;
-			continue;
-		}
-	}
-
-out:
-	return 0;
-}
+afr_selfheal_post_op(call_frame_t *frame, xlator_t *this, inode_t *inode,
+                     int subvol, dict_t *xattr, dict_t *xdata)
+{
+    afr_private_t *priv = NULL;
+    afr_local_t *local = NULL;
+    loc_t loc = {
+        0,
+    };
+    int ret = 0;
 
+    priv = this->private;
+    local = frame->local;
 
-void
-afr_sh_print_pending_matrix (int32_t *pending_matrix[], xlator_t *this)
-{
-	afr_private_t * priv = this->private;
+    loc.inode = inode_ref(inode);
+    gf_uuid_copy(loc.gfid, inode->gfid);
 
-	char *buf = NULL;
-	char *ptr = NULL;
+    local->op_ret = 0;
 
-	int i, j;
+    STACK_WIND(frame, afr_selfheal_post_op_cbk, priv->children[subvol],
+               priv->children[subvol]->fops->xattrop, &loc,
+               GF_XATTROP_ADD_ARRAY, xattr, xdata);
 
-        /* 10 digits per entry + 1 space + '[' and ']' */
-	buf = MALLOC (priv->child_count * 11 + 8); 
+    syncbarrier_wait(&local->barrier, 1);
+    if (local->op_ret < 0)
+        ret = -local->op_errno;
 
-	for (i = 0; i < priv->child_count; i++) {
-		ptr = buf;
-		ptr += sprintf (ptr, "[ ");
-		for (j = 0; j < priv->child_count; j++) {
-			ptr += sprintf (ptr, "%d ", pending_matrix[i][j]);
-		}
-		ptr += sprintf (ptr, "]");
-		gf_log (this->name, GF_LOG_DEBUG,
-			"pending_matrix: %s", buf);
-	}
+    loc_wipe(&loc);
+    local->op_ret = 0;
 
-	FREE (buf);
+    return ret;
 }
 
-
-void
-afr_sh_build_pending_matrix (int32_t *pending_matrix[], dict_t *xattr[],
-			     int child_count, const char *key)
+int
+afr_check_stale_error(struct afr_reply *replies, afr_private_t *priv)
 {
-	int i = 0;
-	int j = 0;
-	int32_t *pending = NULL;
-	int ret = -1;
+    int i = 0;
+    int op_errno = 0;
+    int tmp_errno = 0;
+    int stale_count = 0;
+
+    for (i = 0; i < priv->child_count; i++) {
+        tmp_errno = replies[i].op_errno;
+        if (tmp_errno == ENOENT || tmp_errno == ESTALE) {
+            op_errno = afr_higher_errno(op_errno, tmp_errno);
+            stale_count++;
+        }
+    }
+    if (stale_count != priv->child_count)
+        return -ENOTCONN;
+    else
+        return -op_errno;
+}
 
-	/* start clean */
-	for (i = 0; i < child_count; i++) {
-		for (j = 0; j < child_count; j++) {
-			pending_matrix[i][j] = 0;
-		}
-	}
+int
+afr_sh_generic_fop_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+                       int op_ret, int op_errno, struct iatt *pre,
+                       struct iatt *post, dict_t *xdata)
+{
+    int i = (long)cookie;
+    afr_local_t *local = NULL;
 
-	for (i = 0; i < child_count; i++) {
-		if (!xattr[i])
-			continue;
+    local = frame->local;
 
-		pending = NULL;
+    local->replies[i].valid = 1;
+    local->replies[i].op_ret = op_ret;
+    local->replies[i].op_errno = op_errno;
+    if (pre)
+        local->replies[i].prestat = *pre;
+    if (post)
+        local->replies[i].poststat = *post;
+    if (xdata)
+        local->replies[i].xdata = dict_ref(xdata);
 
-		ret = dict_get_ptr (xattr[i], (char *) key,
-				    VOID(&pending));
-		if (ret != 0)
-			continue;
+    syncbarrier_wake(&local->barrier);
 
-		for (j = 0; j < child_count; j++) {
-			pending_matrix[i][j] = ntoh32 (pending[j]);
-		}
-	}
+    return 0;
 }
 
-
-/**
- * mark_sources: Mark all 'source' nodes and return number of source
- * nodes found
- */
-
 int
-afr_sh_mark_sources (int32_t *pending_matrix[], int sources[], int child_count)
+afr_selfheal_restore_time(call_frame_t *frame, xlator_t *this, inode_t *inode,
+                          int source, unsigned char *healed_sinks,
+                          struct afr_reply *replies)
 {
-	int i = 0;
-	int j = 0;
-
-	int nsources = 0;
+    loc_t loc = {
+        0,
+    };
 
+    loc.inode = inode_ref(inode);
+    gf_uuid_copy(loc.gfid, inode->gfid);
 
-	/* start clean */
-	for (i = 0; i < child_count; i++) {
-		sources[i] = 0;
-	}
+    AFR_ONLIST(healed_sinks, frame, afr_sh_generic_fop_cbk, setattr, &loc,
+               &replies[source].poststat,
+               (GF_SET_ATTR_ATIME | GF_SET_ATTR_MTIME | GF_SET_ATTR_CTIME),
+               NULL);
 
-	/*
-	  Let's 'normalize' the pending matrix first,
-	  by disregarding all pending entries that refer
-	  to themselves
-	*/
-	for (i = 0; i < child_count; i++) {
-		pending_matrix[i][i] = 0;
-	}
+    loc_wipe(&loc);
 
-	for (i = 0; i < child_count; i++) {
-		for (j = 0; j < child_count; j++) {
-			if (pending_matrix[j][i])
-				break;
-		}
+    return 0;
+}
 
-		if (j == child_count) {
-			nsources++;
-			sources[i] = 1;
-		}
-	}
+dict_t *
+afr_selfheal_output_xattr(xlator_t *this, gf_boolean_t is_full_crawl,
+                          afr_transaction_type type, int *output_dirty,
+                          int **output_matrix, int subvol,
+                          int **full_heal_mtx_out)
+{
+    /* Build the xattrop payload for child `subvol`: an AFR_DIRTY array and
+     * one array per child keyed by priv->pending_key[]. Each array holds
+     * AFR_NUM_CHANGE_LOGS ints; only the slot for `type` (and, on a full
+     * crawl, the data slot) is filled, converted with hton32.
+     * Returns the new dict, or NULL if an allocation or dict_set_bin
+     * call fails. */
+    afr_private_t *conf = this->private;
+    int slot = afr_index_for_transaction_type(type);
+    int data_slot = afr_index_for_transaction_type(AFR_DATA_TRANSACTION);
+    dict_t *out = dict_new();
+    int *buf = NULL;
+    int child = 0;
+
+    if (!out)
+        return NULL;
+
+    /* clear dirty */
+    buf = GF_CALLOC(sizeof(int), AFR_NUM_CHANGE_LOGS, gf_afr_mt_int32_t);
+    if (!buf)
+        goto err;
+
+    buf[slot] = hton32(output_dirty[subvol]);
+    if (dict_set_bin(out, AFR_DIRTY, buf,
+                     sizeof(int) * AFR_NUM_CHANGE_LOGS)) {
+        /* on failure the buffer is freed here */
+        GF_FREE(buf);
+        goto err;
+    }
+
+    /* clear/set pending */
+    for (child = 0; child < conf->child_count; child++) {
+        buf = GF_CALLOC(sizeof(int), AFR_NUM_CHANGE_LOGS, gf_afr_mt_int32_t);
+        if (!buf)
+            goto err;
+
+        buf[slot] = hton32(output_matrix[subvol][child]);
+        if (is_full_crawl)
+            buf[data_slot] = hton32(full_heal_mtx_out[subvol][child]);
+
+        if (dict_set_bin(out, conf->pending_key[child], buf,
+                         sizeof(int) * AFR_NUM_CHANGE_LOGS)) {
+            GF_FREE(buf);
+            goto err;
+        }
+    }
+
+    return out;
+
+err:
+    /* `out` is never NULL here: every jump happens after the check above */
+    dict_unref(out);
+    return NULL;
+}
 
-	return nsources;
+int
+afr_selfheal_undo_pending(call_frame_t *frame, xlator_t *this, inode_t *inode,
+                          unsigned char *sources, unsigned char *sinks,
+                          unsigned char *healed_sinks,
+                          unsigned char *undid_pending,
+                          afr_transaction_type type, struct afr_reply *replies,
+                          unsigned char *locked_on)
+{
+    afr_private_t *priv = NULL;
+    afr_local_t *local = NULL;
+    int i = 0;
+    int j = 0;
+    unsigned char *pending = NULL;
+    int *input_dirty = NULL;
+    int **input_matrix = NULL;
+    int **full_heal_mtx_in = NULL;
+    int **full_heal_mtx_out = NULL;
+    int *output_dirty = NULL;
+    int **output_matrix = NULL;
+    dict_t *xattr = NULL;
+    dict_t *xdata = NULL;
+    /* Post-heal xattrop: undo pending/dirty on each locked child (below). */
+    priv = this->private;
+    local = frame->local;
+
+    pending = alloca0(priv->child_count);
+
+    input_dirty = alloca0(priv->child_count * sizeof(int));
+    input_matrix = ALLOC_MATRIX(priv->child_count, int);
+    full_heal_mtx_in = ALLOC_MATRIX(priv->child_count, int);
+    full_heal_mtx_out = ALLOC_MATRIX(priv->child_count, int);
+    output_dirty = alloca0(priv->child_count * sizeof(int));
+    output_matrix = ALLOC_MATRIX(priv->child_count, int);
+
+    xdata = dict_new();
+    if (!xdata)
+        return -1;
+
+    afr_selfheal_extract_xattr(this, replies, type, input_dirty, input_matrix);
+
+    if (local->need_full_crawl)
+        afr_selfheal_extract_xattr(this, replies, AFR_DATA_TRANSACTION, NULL,
+                                   full_heal_mtx_in);
+    /* sinks that were not healed must stay marked as pending */
+    for (i = 0; i < priv->child_count; i++)
+        if (sinks[i] && !healed_sinks[i])
+            pending[i] = 1;
+    /* pending columns get +1; other locked columns negate what was read */
+    for (i = 0; i < priv->child_count; i++) {
+        for (j = 0; j < priv->child_count; j++) {
+            if (pending[j]) {
+                output_matrix[i][j] = 1;
+                if (type == AFR_ENTRY_TRANSACTION)
+                    full_heal_mtx_out[i][j] = 1;
+            } else if (locked_on[j]) {
+                output_matrix[i][j] = -input_matrix[i][j];
+                if (type == AFR_ENTRY_TRANSACTION)
+                    full_heal_mtx_out[i][j] = -full_heal_mtx_in[i][j];
+            }
+        }
+    }
+    /* negate the dirty count, except on children still pending */
+    for (i = 0; i < priv->child_count; i++) {
+        if (!pending[i])
+            output_dirty[i] = -input_dirty[i];
+    }
+
+    for (i = 0; i < priv->child_count; i++) {
+        if (!locked_on[i])
+            /* perform post-op only on subvols we had locked
+               and inspected on.
+            */
+            continue;
+        if (undid_pending[i])
+            /* We already unset the pending xattrs in
+             * _afr_fav_child_reset_sink_xattrs(). */
+            continue;
+
+        xattr = afr_selfheal_output_xattr(this, local->need_full_crawl, type,
+                                          output_dirty, output_matrix, i,
+                                          full_heal_mtx_out);
+        if (!xattr) {
+            continue;
+        }
+
+        if ((type == AFR_ENTRY_TRANSACTION) && (priv->esh_granular)) {
+            if (xdata && dict_set_int8(xdata, GF_XATTROP_PURGE_INDEX, 1))
+                gf_msg(this->name, GF_LOG_WARNING, 0, AFR_MSG_DICT_SET_FAILED,
+                       "Failed to set"
+                       " dict value for %s",
+                       GF_XATTROP_PURGE_INDEX);
+        }
+
+        afr_selfheal_post_op(frame, this, inode, i, xattr, xdata);
+        dict_unref(xattr);
+    }
+
+    if (xdata)
+        dict_unref(xdata);
+
+    return 0; /* the result of afr_selfheal_post_op() is not propagated */
 }
 
+void
+afr_reply_copy(struct afr_reply *dst, struct afr_reply *src)
+{
+    /* Copy *src into *dst: plain fields by value, xdata by reference
+     * (dropping dst's previous xdata), and the checksum with the digest
+     * length selected by "fips-mode-rchecksum" in the copied xdata. */
+    dict_t *incoming = src->xdata ? dict_ref(src->xdata) : NULL;
+    size_t digest_len = MD5_DIGEST_LENGTH;
+
+    dst->valid = src->valid;
+    dst->op_ret = src->op_ret;
+    dst->op_errno = src->op_errno;
+    dst->prestat = src->prestat;
+    dst->poststat = src->poststat;
+    dst->preparent = src->preparent;
+    dst->postparent = src->postparent;
+    dst->preparent2 = src->preparent2;
+    dst->postparent2 = src->postparent2;
+    /* the new reference is already held before the old one is dropped */
+    if (dst->xdata)
+        dict_unref(dst->xdata);
+    dst->xdata = incoming;
+
+    if (incoming && dict_get_str_boolean(incoming, "fips-mode-rchecksum",
+                                         _gf_false) == _gf_true)
+        digest_len = SHA256_DIGEST_LENGTH;
+    memcpy(dst->checksum, src->checksum, digest_len);
+    dst->fips_mode_rchecksum = src->fips_mode_rchecksum;
+}
 
 void
-afr_sh_pending_to_delta (int32_t *pending_matrix[], int32_t *delta_matrix[],
-			 int success[], int child_count)
+afr_replies_copy(struct afr_reply *dst, struct afr_reply *src, int count)
 {
-	int i = 0;
-	int j = 0;
+    int i = 0;
 
-	/* start clean */
-	for (i = 0; i < child_count; i++) {
-		for (j = 0; j < child_count; j++) {
-			delta_matrix[i][j] = 0;
-		}
-	}
+    if (dst == src)
+        return;
 
-	for (i = 0; i < child_count; i++) {
-		for (j = 0; j < child_count; j++) {
-			if (!success[j])
-				continue;
-			delta_matrix[i][j] = -pending_matrix[i][j];
-		}
-	}
+    for (i = 0; i < count; i++) {
+        afr_reply_copy(&dst[i], &src[i]);
+    }
 }
 
-
 int
-afr_sh_delta_to_xattr (int32_t *delta_matrix[], dict_t *xattr[],
-		       int child_count, const char *key)
+afr_selfheal_fill_dirty(xlator_t *this, int *dirty, int subvol, int idx,
+                        dict_t *xdata)
 {
-	int i = 0;
-	int j = 0;
+    void *pending_raw = NULL;
+    int pending[3] = {
+        0,
+    };
 
-	int ret = 0;
+    if (!dirty)
+        return 0;
 
-	int32_t *pending = 0;
+    if (dict_get_ptr(xdata, AFR_DIRTY, &pending_raw))
+        return -1;
 
-	for (i = 0; i < child_count; i++) {
-		if (!xattr[i])
-			continue;
+    if (!pending_raw)
+        return -1;
 
-		pending = CALLOC (sizeof (int32_t), child_count);
-		for (j = 0; j < child_count; j++) {
-			pending[j] = hton32 (delta_matrix[i][j]);
-		}
+    memcpy(pending, pending_raw, sizeof(pending));
 
-		ret = dict_set_bin (xattr[i], (char *) key, pending,
-				    child_count * sizeof (int32_t));
-	}
+    dirty[subvol] = ntoh32(pending[idx]);
 
-	return 0;
+    return 0;
 }
 
-
 int
-afr_sh_has_metadata_pending (dict_t *xattr, int child_count, xlator_t *this)
+afr_selfheal_fill_matrix(xlator_t *this, int **matrix, int subvol, int idx,
+                         dict_t *xdata)
 {
-	afr_private_t *priv = NULL;
-	int32_t       *pending = NULL;
-	void          *tmp_pending = NULL; /* This is required to remove 'type-punned' warnings from gcc */
+    int i = 0;
+    void *pending_raw = NULL;
+    int pending[3] = {
+        0,
+    };
+    afr_private_t *priv = NULL;
 
-	int           ret = -1;
-	int            i  = 0;
+    priv = this->private;
 
-	priv = this->private;
+    if (!matrix)
+        return 0;
 
-	ret = dict_get_ptr (xattr, AFR_METADATA_PENDING, &tmp_pending);
+    for (i = 0; i < priv->child_count; i++) {
+        if (dict_get_ptr(xdata, priv->pending_key[i], &pending_raw))
+            continue;
 
-	if (ret != 0)
-		return 0;
+        if (!pending_raw)
+            continue;
 
-	pending = tmp_pending;
-	for (i = 0; i < priv->child_count; i++) {
-		if (i == child_count)
-			continue;
-		if (pending[i])
-			return 1;
-	}
+        memcpy(pending, pending_raw, sizeof(pending));
 
-	return 0;
-}
+        matrix[subvol][i] = ntoh32(pending[idx]);
+    }
 
+    return 0;
+}
 
 int
-afr_sh_has_data_pending (dict_t *xattr, int child_count, xlator_t *this)
+afr_selfheal_extract_xattr(xlator_t *this, struct afr_reply *replies,
+                           afr_transaction_type type, int *dirty, int **matrix)
 {
-	afr_private_t *priv = NULL;
-	int32_t       *pending = NULL;
-	void          *tmp_pending = NULL; /* This is required to remove 'type-punned' warnings from gcc */
+    afr_private_t *priv = NULL;
+    int i = 0;
+    dict_t *xdata = NULL;
+    int idx = -1;
 
-	int          ret = -1;
-	int            i = 0;
+    idx = afr_index_for_transaction_type(type);
 
-	priv = this->private;
+    priv = this->private;
 
-	ret = dict_get_ptr (xattr, AFR_DATA_PENDING, &tmp_pending);
+    for (i = 0; i < priv->child_count; i++) {
+        if (!replies[i].valid || replies[i].op_ret != 0)
+            continue;
 
-	if (ret != 0)
-		return 0;
+        if (!replies[i].xdata)
+            continue;
 
-	pending = tmp_pending;
-	for (i = 0; i < priv->child_count; i++) {
-		if (i == child_count)
-			continue;
-		if (pending[i])
-			return 1;
-	}
+        xdata = replies[i].xdata;
 
-	return 0;
-}
+        afr_selfheal_fill_dirty(this, dirty, i, idx, xdata);
+        afr_selfheal_fill_matrix(this, matrix, i, idx, xdata);
+    }
 
+    return 0;
+}
 
-int
-afr_sh_has_entry_pending (dict_t *xattr, int child_count, xlator_t *this)
+/*
+ * If by chance there are multiple sources with differing sizes, select
+ * the largest file as the source.
+ *
+ * This can happen if data was directly modified in the backend or for snapshots
+ */
+void
+afr_mark_largest_file_as_source(xlator_t *this, unsigned char *sources,
+                                struct afr_reply *replies)
 {
-	afr_private_t *priv = NULL;
-	int32_t       *pending = NULL;
-	void          *tmp_pending = NULL; /* This is required to remove 'type-punned' warnings from gcc */
-	
-	int          ret = -1;
-	int            i = 0;
+    int i = 0;
+    afr_private_t *priv = NULL;
+    uint64_t size = 0;
+
+    /* Find source with biggest file size */
+    priv = this->private;
+    for (i = 0; i < priv->child_count; i++) {
+        if (!sources[i])
+            continue;
+        if (!replies[i].valid || replies[i].op_ret != 0) {
+            sources[i] = 0;
+            continue;
+        }
+        if (size <= replies[i].poststat.ia_size) {
+            size = replies[i].poststat.ia_size;
+        }
+    }
+
+    /* Mark sources with less size as not source */
+    for (i = 0; i < priv->child_count; i++) {
+        if (!sources[i])
+            continue;
+        if (size > replies[i].poststat.ia_size)
+            sources[i] = 0;
+    }
+}
 
-	priv = this->private;
+void
+afr_mark_latest_mtime_file_as_source(xlator_t *this, unsigned char *sources,
+                                     struct afr_reply *replies)
+{
+    int i = 0;
+    afr_private_t *priv = NULL;
+    int64_t mtime = 0; /* ia_mtime is printed with PRId64; 32 bits truncate */
+    uint32_t mtime_nsec = 0;
+    /* Pass 1: newest (mtime, mtime_nsec) among sources with a good reply. */
+    priv = this->private;
+    for (i = 0; i < priv->child_count; i++) {
+        if (!sources[i])
+            continue;
+        if (!replies[i].valid || replies[i].op_ret != 0) {
+            sources[i] = 0;
+            continue;
+        }
+        if ((mtime < replies[i].poststat.ia_mtime) ||
+            ((mtime == replies[i].poststat.ia_mtime) &&
+             (mtime_nsec < replies[i].poststat.ia_mtime_nsec))) {
+            mtime = replies[i].poststat.ia_mtime;
+            mtime_nsec = replies[i].poststat.ia_mtime_nsec;
+        }
+    }
+    for (i = 0; i < priv->child_count; i++) { /* pass 2: drop older sources */
+        if (!sources[i])
+            continue;
+        if ((mtime > replies[i].poststat.ia_mtime) ||
+            ((mtime == replies[i].poststat.ia_mtime) &&
+             (mtime_nsec > replies[i].poststat.ia_mtime_nsec))) {
+            sources[i] = 0;
+        }
+    }
+}
 
-	ret = dict_get_ptr (xattr, AFR_ENTRY_PENDING, &tmp_pending);
+void
+afr_mark_active_sinks(xlator_t *this, unsigned char *sources,
+                      unsigned char *locked_on, unsigned char *sinks)
+{
+    int i = 0;
+    afr_private_t *priv = NULL;
 
-	if (ret != 0)
-		return 0;
+    priv = this->private;
 
-	pending = tmp_pending;
-	for (i = 0; i < priv->child_count; i++) {
-		if (i == child_count)
-			continue;
-		if (pending[i])
-			return 1;
-	}
+    for (i = 0; i < priv->child_count; i++) {
+        if (!sources[i] && locked_on[i])
+            sinks[i] = 1;
+        else
+            sinks[i] = 0;
+    }
+}
 
-	return 0;
+gf_boolean_t
+afr_dict_contains_heal_op(call_frame_t *frame)
+{
+    afr_local_t *local = NULL;
+    dict_t *xdata_req = NULL;
+    int ret = 0;
+    int heal_op = -1;
+
+    local = frame->local;
+    xdata_req = local->xdata_req;
+    ret = dict_get_int32_sizen(xdata_req, "heal-op", &heal_op);
+    if (ret)
+        return _gf_false;
+    if (local->xdata_rsp == NULL) {
+        local->xdata_rsp = dict_new();
+        if (!local->xdata_rsp)
+            return _gf_true;
+    }
+    ret = dict_set_sizen_str_sizen(local->xdata_rsp, "sh-fail-msg",
+                                   SFILE_NOT_IN_SPLIT_BRAIN);
+
+    return _gf_true;
 }
 
+gf_boolean_t
+afr_can_decide_split_brain_source_sinks(struct afr_reply *replies,
+                                        int child_count)
+{
+    int i = 0;
 
+    for (i = 0; i < child_count; i++)
+        if (replies[i].valid != 1 || replies[i].op_ret != 0)
+            return _gf_false;
 
-/**
- * is_matrix_zero - return true if pending matrix is all zeroes
- */
+    return _gf_true;
+}
 
 int
-afr_sh_is_matrix_zero (int32_t *pending_matrix[], int child_count)
+afr_mark_split_brain_source_sinks_by_heal_op(
+    call_frame_t *frame, xlator_t *this, unsigned char *sources,
+    unsigned char *sinks, unsigned char *healed_sinks, unsigned char *locked_on,
+    struct afr_reply *replies, afr_transaction_type type, int heal_op)
 {
-	int i, j;
+    afr_local_t *local = NULL;
+    afr_private_t *priv = NULL;
+    dict_t *xdata_req = NULL;
+    dict_t *xdata_rsp = NULL;
+    int ret = 0;
+    int i = 0;
+    char *name = NULL;
+    int source = -1;
+
+    local = frame->local;
+    priv = this->private;
+    xdata_req = local->xdata_req;
+
+    for (i = 0; i < priv->child_count; i++) {
+        if (locked_on[i])
+            if (sources[i] || !sinks[i] || !healed_sinks[i]) {
+                ret = -1;
+                goto out;
+            }
+    }
+    if (local->xdata_rsp == NULL) {
+        local->xdata_rsp = dict_new();
+        if (!local->xdata_rsp) {
+            ret = -1;
+            goto out;
+        }
+    }
+    xdata_rsp = local->xdata_rsp;
+
+    if (!afr_can_decide_split_brain_source_sinks(replies, priv->child_count)) {
+        ret = dict_set_sizen_str_sizen(xdata_rsp, "sh-fail-msg",
+                                       SBRAIN_HEAL_NO_GO_MSG);
+        ret = -1;
+        goto out;
+    }
+
+    for (i = 0; i < priv->child_count; i++)
+        if (locked_on[i])
+            sources[i] = 1;
+    switch (heal_op) {
+        case GF_SHD_OP_SBRAIN_HEAL_FROM_BIGGER_FILE:
+            if (type == AFR_METADATA_TRANSACTION) {
+                ret = dict_set_sizen_str_sizen(xdata_rsp, "sh-fail-msg",
+                                               SUSE_SOURCE_BRICK_TO_HEAL);
+                if (!ret)
+                    ret = -1;
+                goto out;
+            }
+            afr_mark_largest_file_as_source(this, sources, replies);
+            if (AFR_COUNT(sources, priv->child_count) != 1) {
+                ret = dict_set_sizen_str_sizen(xdata_rsp, "sh-fail-msg",
+                                               SNO_BIGGER_FILE);
+                if (!ret)
+                    ret = -1;
+                goto out;
+            }
+            break;
+        case GF_SHD_OP_SBRAIN_HEAL_FROM_LATEST_MTIME:
+            if (type == AFR_METADATA_TRANSACTION) {
+                ret = dict_set_sizen_str_sizen(xdata_rsp, "sh-fail-msg",
+                                               SUSE_SOURCE_BRICK_TO_HEAL);
+                if (!ret)
+                    ret = -1;
+                goto out;
+            }
+            afr_mark_latest_mtime_file_as_source(this, sources, replies);
+            if (AFR_COUNT(sources, priv->child_count) != 1) {
+                ret = dict_set_sizen_str_sizen(xdata_rsp, "sh-fail-msg",
+                                               SNO_DIFF_IN_MTIME);
+                if (!ret)
+                    ret = -1;
+                goto out;
+            }
+            break;
+        case GF_SHD_OP_SBRAIN_HEAL_FROM_BRICK:
+            ret = dict_get_str_sizen(xdata_req, "child-name", &name);
+            if (ret)
+                goto out;
+            source = afr_get_child_index_from_name(this, name);
+            if (source < 0) {
+                ret = dict_set_sizen_str_sizen(xdata_rsp, "sh-fail-msg",
+                                               SINVALID_BRICK_NAME);
+                if (!ret)
+                    ret = -1;
+                goto out;
+            }
+            if (locked_on[source] != 1) {
+                ret = dict_set_sizen_str_sizen(xdata_rsp, "sh-fail-msg",
+                                               SBRICK_IS_NOT_UP);
+                if (!ret)
+                    ret = -1;
+                goto out;
+            }
+            memset(sources, 0, sizeof(*sources) * priv->child_count);
+            sources[source] = 1;
+            break;
+        default:
+            ret = -1;
+            goto out;
+    }
+    for (i = 0; i < priv->child_count; i++) {
+        if (sources[i]) {
+            source = i;
+            break;
+        }
+    }
+    sinks[source] = 0;
+    healed_sinks[source] = 0;
+    ret = source;
+out:
+    if (ret < 0)
+        memset(sources, 0, sizeof(*sources) * priv->child_count);
+    return ret;
+}
 
-	for (i = 0; i < child_count; i++) 
-		for (j = 0; j < child_count; j++) 
-			if (pending_matrix[i][j]) 
-				return 0;
-	return 1;
+int
+afr_sh_fav_by_majority(xlator_t *this, struct afr_reply *replies,
+                       inode_t *inode)
+{
+    afr_private_t *priv;
+    int vote_count = -1;
+    int fav_child = -1;
+    int i = 0;
+    int k = 0;
+
+    priv = this->private;
+
+    for (i = 0; i < priv->child_count; i++) {
+        if (replies[i].valid == 1) {
+            gf_msg_debug(this->name, 0,
+                         "Child:%s mtime_sec = %" PRId64 ", size = %" PRIu64
+                         " for gfid %s",
+                         priv->children[i]->name, replies[i].poststat.ia_mtime,
+                         replies[i].poststat.ia_size, uuid_utoa(inode->gfid));
+            vote_count = 0;
+            for (k = 0; k < priv->child_count; k++) {
+                if ((replies[k].poststat.ia_mtime ==
+                     replies[i].poststat.ia_mtime) &&
+                    (replies[k].poststat.ia_size ==
+                     replies[i].poststat.ia_size)) {
+                    vote_count++;
+                }
+            }
+            if (vote_count > priv->child_count / 2) {
+                fav_child = i;
+                break;
+            }
+        }
+    }
+    return fav_child;
 }
 
+/*
+ * afr_sh_fav_by_mtime: Choose favorite child by mtime.
+ */
+int
+afr_sh_fav_by_mtime(xlator_t *this, struct afr_reply *replies, inode_t *inode)
+{
+    afr_private_t *priv;
+    int fav_child = -1;
+    int i = 0;
+    int64_t cmp_mtime = 0; /* ia_mtime is logged with PRId64; keep 64 bits */
+    uint32_t cmp_mtime_nsec = 0;
+
+    priv = this->private;
+    /* Track the newest (mtime, mtime_nsec) pair seen among valid replies. */
+    for (i = 0; i < priv->child_count; i++) {
+        if (replies[i].valid == 1) {
+            gf_msg_debug(this->name, 0,
+                         "Child:%s mtime = %" PRId64
+                         ", mtime_nsec = %d for "
+                         "gfid %s",
+                         priv->children[i]->name, replies[i].poststat.ia_mtime,
+                         replies[i].poststat.ia_mtime_nsec,
+                         uuid_utoa(inode->gfid));
+            if (replies[i].poststat.ia_mtime > cmp_mtime) {
+                cmp_mtime = replies[i].poststat.ia_mtime;
+                cmp_mtime_nsec = replies[i].poststat.ia_mtime_nsec;
+                fav_child = i;
+            } else if ((replies[i].poststat.ia_mtime == cmp_mtime) &&
+                       (replies[i].poststat.ia_mtime_nsec > cmp_mtime_nsec)) {
+                cmp_mtime = replies[i].poststat.ia_mtime;
+                cmp_mtime_nsec = replies[i].poststat.ia_mtime_nsec;
+                fav_child = i;
+            }
+        }
+    }
+    return fav_child;
+}
 
+/*
+ * afr_sh_fav_by_ctime: Choose favorite child by ctime.
+ */
 int
-afr_sh_missing_entries_done (call_frame_t *frame, xlator_t *this)
-{
-	afr_local_t     *local = NULL;
-	afr_self_heal_t *sh = NULL;
-	afr_private_t   *priv = NULL;
-	int              i = 0;
-
-	local = frame->local;
-	sh = &local->self_heal;
-	priv = this->private;
-
-//	memset (sh->child_errno, 0, sizeof (int) * priv->child_count);
-	memset (sh->buf, 0, sizeof (struct stat) * priv->child_count);
-	
-	for (i = 0; i < priv->child_count; i++) {
-		if (sh->xattr[i])
-			dict_unref (sh->xattr[i]);
-		sh->xattr[i] = NULL;
-	}
-
-	if (local->govinda_gOvinda) {
-		gf_log (this->name, GF_LOG_WARNING,
-			"aborting selfheal of %s",
-			local->loc.path);
-		sh->completion_cbk (frame, this);
-	} else {
-		gf_log (this->name, GF_LOG_DEBUG,
-			"proceeding to metadata check on %s",
-			local->loc.path);
-		afr_self_heal_metadata (frame, this);
-	}
-
-	return 0;
+afr_sh_fav_by_ctime(xlator_t *this, struct afr_reply *replies, inode_t *inode)
+{
+    afr_private_t *priv;
+    int fav_child = -1;
+    int i = 0;
+    int64_t cmp_ctime = 0; /* ia_ctime is logged with PRId64; keep 64 bits */
+    uint32_t cmp_ctime_nsec = 0;
+
+    priv = this->private;
+    /* Track the newest (ctime, ctime_nsec) pair seen among valid replies. */
+    for (i = 0; i < priv->child_count; i++) {
+        if (replies[i].valid == 1) {
+            gf_msg_debug(this->name, 0,
+                         "Child:%s ctime = %" PRId64
+                         ", ctime_nsec = %d for "
+                         "gfid %s",
+                         priv->children[i]->name, replies[i].poststat.ia_ctime,
+                         replies[i].poststat.ia_ctime_nsec,
+                         uuid_utoa(inode->gfid));
+            if (replies[i].poststat.ia_ctime > cmp_ctime) {
+                cmp_ctime = replies[i].poststat.ia_ctime;
+                cmp_ctime_nsec = replies[i].poststat.ia_ctime_nsec;
+                fav_child = i;
+            } else if ((replies[i].poststat.ia_ctime == cmp_ctime) &&
+                       (replies[i].poststat.ia_ctime_nsec > cmp_ctime_nsec)) {
+                cmp_ctime = replies[i].poststat.ia_ctime;
+                cmp_ctime_nsec = replies[i].poststat.ia_ctime_nsec;
+                fav_child = i;
+            }
+        }
+    }
+    return fav_child;
 }
 
+/*
+ * afr_sh_fav_by_size: Choose favorite child by size
+ * when not all files are of zero size.
+ */
+int
+afr_sh_fav_by_size(xlator_t *this, struct afr_reply *replies, inode_t *inode)
+{
+    afr_private_t *priv;
+    int fav_child = -1;
+    int i = 0;
+    uint64_t cmp_sz = 0;
+
+    priv = this->private;
+    for (i = 0; i < priv->child_count; i++) {
+        if (!replies[i].valid) {
+            continue;
+        }
+        gf_msg_debug(this->name, 0,
+                     "Child:%s file size = %" PRIu64 " for gfid %s",
+                     priv->children[i]->name, replies[i].poststat.ia_size,
+                     uuid_utoa(inode->gfid));
+        if (replies[i].poststat.ia_type == IA_IFDIR) {
+            gf_msg(this->name, GF_LOG_ERROR, 0, AFR_MSG_SBRAIN_FAV_CHILD_POLICY,
+                   "Cannot perform selfheal on %s. "
+                   "Size policy is not applicable to directories.",
+                   uuid_utoa(inode->gfid));
+            break;
+        }
+        if (replies[i].poststat.ia_size > cmp_sz) {
+            cmp_sz = replies[i].poststat.ia_size;
+            fav_child = i;
+        } else if (replies[i].poststat.ia_size == cmp_sz) {
+            fav_child = -1;
+        }
+    }
+    if (fav_child == -1) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, AFR_MSG_SPLIT_BRAIN,
+               "No bigger file");
+    }
+    return fav_child;
+}
 
 int
-sh_missing_entries_unlck_cbk (call_frame_t *frame, void *cookie,
-			      xlator_t *this,
-			      int32_t op_ret, int32_t op_errno)
+afr_sh_get_fav_by_policy(xlator_t *this, struct afr_reply *replies,
+                         inode_t *inode, char **policy_str)
 {
-	afr_local_t     *local = NULL;
-	afr_self_heal_t *sh = NULL;
-	afr_private_t   *priv = NULL;
-	int              call_count = 0;
+    afr_private_t *priv = NULL;
+    int fav_child = -1;
+
+    priv = this->private;
+    if (!afr_can_decide_split_brain_source_sinks(replies, priv->child_count)) {
+        return -1;
+    }
+
+    switch (priv->fav_child_policy) {
+        case AFR_FAV_CHILD_BY_SIZE:
+            fav_child = afr_sh_fav_by_size(this, replies, inode);
+            if (policy_str && fav_child >= 0) {
+                *policy_str = "SIZE";
+            }
+            break;
+        case AFR_FAV_CHILD_BY_CTIME:
+            fav_child = afr_sh_fav_by_ctime(this, replies, inode);
+            if (policy_str && fav_child >= 0) {
+                *policy_str = "CTIME";
+            }
+            break;
+        case AFR_FAV_CHILD_BY_MTIME:
+            fav_child = afr_sh_fav_by_mtime(this, replies, inode);
+            if (policy_str && fav_child >= 0) {
+                *policy_str = "MTIME";
+            }
+            break;
+        case AFR_FAV_CHILD_BY_MAJORITY:
+            fav_child = afr_sh_fav_by_majority(this, replies, inode);
+            if (policy_str && fav_child >= 0) {
+                *policy_str = "MAJORITY";
+            }
+            break;
+        case AFR_FAV_CHILD_NONE:
+        default:
+            break;
+    }
+
+    return fav_child;
+}
 
+int
+afr_mark_split_brain_source_sinks_by_policy(
+    call_frame_t *frame, xlator_t *this, inode_t *inode, unsigned char *sources,
+    unsigned char *sinks, unsigned char *healed_sinks, unsigned char *locked_on,
+    struct afr_reply *replies, afr_transaction_type type)
+{
+    afr_private_t *priv = NULL;
+    int fav_child = -1;
+    char mtime_str[256];
+    char ctime_str[256];
+    char *policy_str = NULL;
+    struct tm tm_buf = {0}; /* localtime_r() target; localtime() is shared */
+    time_t time;
+
+    priv = this->private;
+    /* Returns the chosen child index, or -1 when the policy picks nobody. */
+    fav_child = afr_sh_get_fav_by_policy(this, replies, inode, &policy_str);
+    if (fav_child == -1) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, AFR_MSG_SBRAIN_FAV_CHILD_POLICY,
+               "No child selected by favorite-child policy.");
+    } else if (fav_child > priv->child_count - 1) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, AFR_MSG_SBRAIN_FAV_CHILD_POLICY,
+               "Invalid child (%d) "
+               "selected by policy %s.",
+               fav_child, policy_str);
+    } else if (fav_child >= 0) {
+        time = replies[fav_child].poststat.ia_mtime;
+        localtime_r(&time, &tm_buf);
+        strftime(mtime_str, sizeof(mtime_str), "%Y-%m-%d %H:%M:%S", &tm_buf);
+        time = replies[fav_child].poststat.ia_ctime;
+        localtime_r(&time, &tm_buf);
+        strftime(ctime_str, sizeof(ctime_str), "%Y-%m-%d %H:%M:%S", &tm_buf);
+
+        gf_msg(this->name, GF_LOG_WARNING, 0, AFR_MSG_SBRAIN_FAV_CHILD_POLICY,
+               "Source %s selected as authentic to resolve conflicting data "
+               "in file (gfid:%s) by %s (%" PRIu64
+               " bytes @ %s mtime, %s "
+               "ctime).",
+               priv->children[fav_child]->name, uuid_utoa(inode->gfid),
+               policy_str, replies[fav_child].poststat.ia_size, mtime_str,
+               ctime_str);
+
+        sources[fav_child] = 1;
+        sinks[fav_child] = 0;
+        healed_sinks[fav_child] = 0;
+    }
+    return fav_child;
+}
 
-	local = frame->local;
-	sh = &local->self_heal;
-	priv = this->private;
+gf_boolean_t
+afr_is_file_empty_on_all_children(afr_private_t *priv,
+                                  struct afr_reply *replies)
+{
+    int i = 0;
 
-	LOCK (&frame->lock);
-	{
-	}
-	UNLOCK (&frame->lock);
+    for (i = 0; i < priv->child_count; i++) {
+        if ((!replies[i].valid) || (replies[i].op_ret != 0) ||
+            (replies[i].poststat.ia_size != 0))
+            return _gf_false;
+    }
 
-	call_count = afr_frame_return (frame);
+    return _gf_true;
+}
 
-	if (call_count == 0) {
-		afr_sh_missing_entries_done (frame, this);
-	}
+int
+afr_mark_source_sinks_if_file_empty(xlator_t *this, unsigned char *sources,
+                                    unsigned char *sinks,
+                                    unsigned char *healed_sinks,
+                                    unsigned char *locked_on,
+                                    struct afr_reply *replies,
+                                    afr_transaction_type type)
+{
+    int source = -1;
+    int i = 0;
+    afr_private_t *priv = this->private;
+    struct iatt stbuf = {
+        0,
+    };
+
+    if ((AFR_COUNT(locked_on, priv->child_count) < priv->child_count) ||
+        (afr_success_count(replies, priv->child_count) < priv->child_count))
+        return -1;
+
+    if (type == AFR_DATA_TRANSACTION) {
+        if (!afr_is_file_empty_on_all_children(priv, replies))
+            return -1;
+        goto mark;
+    }
+
+    /*For AFR_METADATA_TRANSACTION, metadata must be same on all bricks.*/
+    stbuf = replies[0].poststat;
+    for (i = 1; i < priv->child_count; i++) {
+        if ((!IA_EQUAL(stbuf, replies[i].poststat, type)) ||
+            (!IA_EQUAL(stbuf, replies[i].poststat, uid)) ||
+            (!IA_EQUAL(stbuf, replies[i].poststat, gid)) ||
+            (!IA_EQUAL(stbuf, replies[i].poststat, prot)))
+            return -1;
+    }
+    for (i = 1; i < priv->child_count; i++) {
+        if (!afr_xattrs_are_equal(replies[0].xdata, replies[i].xdata))
+            return -1;
+    }
+
+mark:
+    /* data/metadata is same on all bricks. Pick one of them as source. Rest
+     * are sinks.*/
+    for (i = 0; i < priv->child_count; i++) {
+        if (source == -1) {
+            source = i;
+            sources[i] = 1;
+            sinks[i] = 0;
+            healed_sinks[i] = 0;
+            continue;
+        }
+        sources[i] = 0;
+        sinks[i] = 1;
+        healed_sinks[i] = 1;
+    }
+
+    return source;
+}
 
-	return 0;
+/* Return a source depending on the type of heal_op, and set sources[source],
+ * sinks[source] and healed_sinks[source] to 1, 0 and 0 respectively. Do so
+ * only if the following condition is met:
+ * ∀i((i ∈ locked_on[] ∧ i=1)==>(sources[i]=0 ∧ sinks[i]=1 ∧ healed_sinks[i]=1))
+ * i.e. for each locked node, sources[node] is 0; healed_sinks[node] and
+ * sinks[node] are 1. This should be the case if the file is in split-brain.
+ */
+int
+afr_mark_split_brain_source_sinks(
+    call_frame_t *frame, xlator_t *this, inode_t *inode, unsigned char *sources,
+    unsigned char *sinks, unsigned char *healed_sinks, unsigned char *locked_on,
+    struct afr_reply *replies, afr_transaction_type type)
+{
+    afr_local_t *local = NULL;
+    afr_private_t *priv = NULL;
+    dict_t *xdata_req = NULL;
+    int heal_op = -1;
+    int ret = -1;
+    int source = -1;
+
+    local = frame->local;
+    priv = this->private;
+    xdata_req = local->xdata_req;
+
+    source = afr_mark_source_sinks_if_file_empty(
+        this, sources, sinks, healed_sinks, locked_on, replies, type);
+    if (source >= 0)
+        return source;
+
+    ret = dict_get_int32_sizen(xdata_req, "heal-op", &heal_op);
+    if (ret)
+        goto autoheal;
+
+    source = afr_mark_split_brain_source_sinks_by_heal_op(
+        frame, this, sources, sinks, healed_sinks, locked_on, replies, type,
+        heal_op);
+    return source;
+
+autoheal:
+    /* Automatically heal if fav_child_policy is set. */
+    if (priv->fav_child_policy != AFR_FAV_CHILD_NONE) {
+        source = afr_mark_split_brain_source_sinks_by_policy(
+            frame, this, inode, sources, sinks, healed_sinks, locked_on,
+            replies, type);
+        if (source != -1) {
+            ret = dict_set_int32_sizen(xdata_req, "fav-child-policy", 1);
+            if (ret)
+                return -1;
+        }
+    }
+
+    return source;
 }
-			      
 
-static int
-sh_missing_entries_finish (call_frame_t *frame, xlator_t *this)
+int
+_afr_fav_child_reset_sink_xattrs(call_frame_t *frame, xlator_t *this,
+                                 inode_t *inode, int source,
+                                 unsigned char *healed_sinks,
+                                 unsigned char *undid_pending,
+                                 afr_transaction_type type,
+                                 unsigned char *locked_on,
+                                 struct afr_reply *replies)
 {
-	afr_private_t      *priv = NULL;
-	afr_local_t        *local = NULL;
-	int                 i = 0;
-	int                 call_count = 0;
-	afr_self_heal_t    *sh = NULL;
+    afr_private_t *priv = NULL;
+    afr_local_t *local = NULL;
+    int *input_dirty = NULL;
+    int **input_matrix = NULL;
+    int *output_dirty = NULL;
+    int **output_matrix = NULL;
+    dict_t *xattr = NULL;
+    dict_t *xdata = NULL;
+    int i = 0;
+
+    priv = this->private;
+    local = frame->local;
+
+    if (!dict_get_sizen(local->xdata_req, "fav-child-policy"))
+        return 0;
+
+    xdata = dict_new();
+    if (!xdata)
+        return -1;
+
+    input_dirty = alloca0(priv->child_count * sizeof(int));
+    input_matrix = ALLOC_MATRIX(priv->child_count, int);
+    output_dirty = alloca0(priv->child_count * sizeof(int));
+    output_matrix = ALLOC_MATRIX(priv->child_count, int);
+
+    afr_selfheal_extract_xattr(this, replies, type, input_dirty, input_matrix);
+
+    for (i = 0; i < priv->child_count; i++) {
+        if (i == source || !healed_sinks[i])
+            continue;
+        output_dirty[i] = -input_dirty[i];
+        output_matrix[i][source] = -input_matrix[i][source];
+    }
+
+    for (i = 0; i < priv->child_count; i++) {
+        if (!healed_sinks[i] || !locked_on[i])
+            continue;
+        xattr = afr_selfheal_output_xattr(this, _gf_false, type, output_dirty,
+                                          output_matrix, i, NULL);
+
+        afr_selfheal_post_op(frame, this, inode, i, xattr, xdata);
+
+        undid_pending[i] = 1;
+        dict_unref(xattr);
+    }
+
+    if (xdata)
+        dict_unref(xdata);
+
+    return 0;
+}
 
+gf_boolean_t
+afr_does_witness_exist(xlator_t *this, uint64_t *witness)
+{
+    int i = 0;
+    afr_private_t *priv = NULL;
 
-	local = frame->local;
-	sh = &local->self_heal;
-	priv = this->private;
+    priv = this->private;
 
-	call_count = local->child_count;
+    for (i = 0; i < priv->child_count; i++) {
+        if (witness[i])
+            return _gf_true;
+    }
+    return _gf_false;
+}
 
-	local->call_count = call_count;
+unsigned int
+afr_get_quorum_count(afr_private_t *priv)
+{
+    if (priv->quorum_count == AFR_QUORUM_AUTO) {
+        return priv->child_count / 2 + 1;
+    } else {
+        return priv->quorum_count;
+    }
+}
 
-	for (i = 0; i < priv->child_count; i++) {
-		if (local->child_up[i]) {
-			gf_log (this->name, GF_LOG_DEBUG,
-				"unlocking %"PRId64"/%s on subvolume %s",
-				sh->parent_loc.inode->ino, local->loc.name,
-				priv->children[i]->name);
+void
+afr_selfheal_post_op_failure_accounting(afr_private_t *priv, char *accused,
+                                        unsigned char *sources,
+                                        unsigned char *locked_on)
+{
+    int i = 0;
+    unsigned int quorum_count = 0;
+
+    if (AFR_COUNT(sources, priv->child_count) != 0)
+        return;
+
+    quorum_count = afr_get_quorum_count(priv);
+    for (i = 0; i < priv->child_count; i++) {
+        if ((accused[i] < quorum_count) && locked_on[i]) {
+            sources[i] = 1;
+        }
+    }
+    return;
+}
 
-			STACK_WIND (frame, sh_missing_entries_unlck_cbk,
-				    priv->children[i],
-				    priv->children[i]->fops->entrylk,
-				    &sh->parent_loc, local->loc.name,
-				    ENTRYLK_UNLOCK, ENTRYLK_WRLCK);
+/*
+ * This function determines if a self-heal is required for a given inode,
+ * and if needed, in what direction.
+ *
+ * locked_on[] is the array representing servers which have been locked and
+ * from which xattrs have been fetched for analysis.
+ *
+ * The output of the function is by filling the arrays sources[] and sinks[].
+ *
+ * sources[i] is set if i'th server is an eligible source for a selfheal.
+ *
+ * sinks[i] is set if i'th server needs to be healed.
+ *
+ * if sources[0..N] are all set, there is no need for a selfheal.
+ *
+ * if sinks[0..N] are all set, the inode is in split brain.
+ *
+ */
 
-			if (!--call_count)
-				break;
-		}
-	}
-	return 0;
+int
+afr_selfheal_find_direction(call_frame_t *frame, xlator_t *this,
+                            struct afr_reply *replies,
+                            afr_transaction_type type, unsigned char *locked_on,
+                            unsigned char *sources, unsigned char *sinks,
+                            uint64_t *witness, unsigned char *pflag)
+{
+    afr_private_t *priv = NULL;
+    int i = 0;
+    int j = 0;
+    int *dirty = NULL;         /* Denotes if dirty xattr is set */
+    int **matrix = NULL;       /* Changelog matrix */
+    char *accused = NULL;      /* Accused others without any self-accusal */
+    char *pending = NULL;      /* Have pending operations on others */
+    char *self_accused = NULL; /* Accused itself */
+
+    priv = this->private;
+
+    dirty = alloca0(priv->child_count * sizeof(int));
+    accused = alloca0(priv->child_count);
+    pending = alloca0(priv->child_count);
+    self_accused = alloca0(priv->child_count);
+    matrix = ALLOC_MATRIX(priv->child_count, int);
+    memset(witness, 0, sizeof(*witness) * priv->child_count);
+
+    /* First construct the pending matrix for further analysis */
+    afr_selfheal_extract_xattr(this, replies, type, dirty, matrix);
+
+    if (pflag) {
+        for (i = 0; i < priv->child_count; i++) {
+            for (j = 0; j < priv->child_count; j++)
+                if (matrix[i][j])
+                    *pflag |= PFLAG_PENDING;
+            if (*pflag)
+                break;
+        }
+    }
+
+    if (afr_success_count(replies, priv->child_count) < priv->child_count) {
+        /* Treat this just like locks not being acquired */
+        return -ENOTCONN;
+    }
+
+    /* short list all self-accused */
+    for (i = 0; i < priv->child_count; i++) {
+        if (matrix[i][i])
+            self_accused[i] = 1;
+    }
+
+    /* Next short list all accused to exclude them from being sources */
+    /* Self-accused can't accuse others as they are FOOLs */
+    for (i = 0; i < priv->child_count; i++) {
+        for (j = 0; j < priv->child_count; j++) {
+            if (matrix[i][j]) {
+                if (!self_accused[i])
+                    accused[j] += 1;
+                if (i != j)
+                    pending[i] += 1;
+            }
+        }
+    }
+
+    /* Short list all non-accused as sources */
+    for (i = 0; i < priv->child_count; i++) {
+        if (!accused[i] && locked_on[i])
+            sources[i] = 1;
+        else
+            sources[i] = 0;
+    }
+
+    /* Everyone accused by non-self-accused sources are sinks */
+    memset(sinks, 0, priv->child_count);
+    for (i = 0; i < priv->child_count; i++) {
+        if (!sources[i])
+            continue;
+        if (self_accused[i])
+            continue;
+        for (j = 0; j < priv->child_count; j++) {
+            if (matrix[i][j])
+                sinks[j] = 1;
+        }
+    }
+
+    /* For breaking ties provide with number of fops they witnessed */
+
+    /*
+     * count the pending fops witnessed from itself to others when it is
+     * self-accused
+     */
+    for (i = 0; i < priv->child_count; i++) {
+        if (!self_accused[i])
+            continue;
+        for (j = 0; j < priv->child_count; j++) {
+            if (i == j)
+                continue;
+            witness[i] += matrix[i][j];
+        }
+    }
+
+    if (type == AFR_DATA_TRANSACTION || type == AFR_METADATA_TRANSACTION)
+        afr_selfheal_post_op_failure_accounting(priv, accused, sources,
+                                                locked_on);
+
+    /* If no sources, all locked nodes are sinks - split brain */
+    if (AFR_COUNT(sources, priv->child_count) == 0) {
+        for (i = 0; i < priv->child_count; i++) {
+            if (locked_on[i])
+                sinks[i] = 1;
+        }
+        if (pflag)
+            *pflag |= PFLAG_SBRAIN;
+    }
+
+    /* One more class of witness similar to dirty in v2 is where no pending
+     * exists but we have self-accusing markers. This can happen in afr-v1
+     * if the brick crashes just after doing xattrop on self but
+     * before xattrop on the other xattrs on the brick in pre-op. */
+    if (AFR_COUNT(pending, priv->child_count) == 0) {
+        for (i = 0; i < priv->child_count; i++) {
+            if (self_accused[i])
+                witness[i] += matrix[i][i];
+        }
+    } else {
+        /* In afr-v1 if a file is self-accused and has pending
+         * operations on others then it is similar to 'dirty' in afr-v2.
+         * Consider such cases as witness.
+         */
+        for (i = 0; i < priv->child_count; i++) {
+            if (self_accused[i] && pending[i])
+                witness[i] += matrix[i][i];
+        }
+    }
+
+    /* count the number of dirty fops witnessed */
+    for (i = 0; i < priv->child_count; i++)
+        witness[i] += dirty[i];
+
+    return 0;
 }
 
-
-static int
-sh_destroy_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
-		int32_t op_ret, int op_errno, struct stat *stbuf)
+void
+afr_log_selfheal(uuid_t gfid, xlator_t *this, int ret, char *type, int source,
+                 unsigned char *sources, unsigned char *healed_sinks)
 {
-	STACK_DESTROY (frame->root);
-	return 0;
+    char *status = NULL;
+    char *sinks_str = NULL;
+    char *p = NULL;
+    char *sources_str = NULL;
+    char *q = NULL;
+    afr_private_t *priv = NULL;
+    gf_loglevel_t loglevel = GF_LOG_NONE;
+    int i = 0;
+
+    priv = this->private;
+    sinks_str = alloca0(priv->child_count * 8);
+    p = sinks_str;
+    sources_str = alloca0(priv->child_count * 8);
+    q = sources_str;
+    for (i = 0; i < priv->child_count; i++) {
+        if (healed_sinks[i])
+            p += sprintf(p, "%d ", i);
+        if (sources[i]) {
+            if (source == i) {
+                q += sprintf(q, "[%d] ", i);
+            } else {
+                q += sprintf(q, "%d ", i);
+            }
+        }
+    }
+
+    if (ret < 0) {
+        status = "Failed";
+        loglevel = GF_LOG_DEBUG;
+    } else {
+        status = "Completed";
+        loglevel = GF_LOG_INFO;
+    }
+
+    gf_msg(this->name, loglevel, 0, AFR_MSG_SELF_HEAL_INFO,
+           "%s %s selfheal on %s. "
+           "sources=%s sinks=%s",
+           status, type, uuid_utoa(gfid), sources_str, sinks_str);
 }
 
+int
+afr_selfheal_discover_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+                          int op_ret, int op_errno, inode_t *inode,
+                          struct iatt *buf, dict_t *xdata, struct iatt *parbuf)
+{
+    /* Record one child's lookup reply; cookie carries that child's index. */
+    afr_local_t *local = frame->local;
+    struct afr_reply *reply = &local->replies[(int)(long)cookie];
+    int8_t link_count = 1; /* initial need_heal value */
+    GF_UNUSED int ret = -1;
+
+    reply->valid = 1;
+    reply->op_ret = op_ret;
+    reply->op_errno = op_errno;
+    if (buf)
+        reply->poststat = *buf;
+    if (parbuf)
+        reply->postparent = *parbuf;
+
+    if (xdata) {
+        /* Keep our own reference; the "link-count" value feeds need_heal. */
+        reply->xdata = dict_ref(xdata);
+        ret = dict_get_int8(xdata, "link-count", &link_count);
+    }
+    reply->need_heal = link_count;
+
+    /* Wake whoever is waiting on local->barrier for this reply. */
+    syncbarrier_wake(&local->barrier);
+    return 0;
+}
 
-static int
-sh_missing_entries_newentry_cbk (call_frame_t *frame, void *cookie,
-				 xlator_t *this,
-				 int32_t op_ret, int32_t op_errno,
-				 inode_t *inode, struct stat *stbuf)
+inode_t *
+afr_selfheal_unlocked_lookup_on(call_frame_t *frame, inode_t *parent,
+                                const char *name, struct afr_reply *replies,
+                                unsigned char *lookup_on, dict_t *xattr)
 {
-	afr_local_t     *local = NULL;
-	afr_self_heal_t *sh = NULL;
-	afr_private_t   *priv = NULL;
-	call_frame_t    *chown_frame = NULL;
-	int              call_count = 0;
-	int              child_index = 0;
-	struct stat     *buf = NULL;
+    loc_t loc = {
+        0,
+    };
+    dict_t *xattr_req = NULL;
+    afr_local_t *local = NULL;
+    afr_private_t *priv = NULL;
+    inode_t *inode = NULL;
 
+    local = frame->local;
+    priv = frame->this->private;
 
-	local = frame->local;
-	sh = &local->self_heal;
-	priv = this->private;
+    xattr_req = dict_new();
+    if (!xattr_req)
+        return NULL;
 
-	buf = &sh->buf[sh->source];
-	child_index = (long) cookie;
+    if (xattr)
+        dict_copy(xattr, xattr_req);
 
-	if (op_ret == 0) {
-		chown_frame = copy_frame (frame);
+    if (afr_xattr_req_prepare(frame->this, xattr_req) != 0) {
+        dict_unref(xattr_req);
+        return NULL;
+    }
 
-		gf_log (this->name, GF_LOG_DEBUG,
-			"chown %s to %d %d on subvolume %s",
-			local->loc.path, buf->st_uid, buf->st_gid,
-			priv->children[child_index]->name);
+    inode = inode_new(parent->table);
+    if (!inode) {
+        dict_unref(xattr_req);
+        return NULL;
+    }
 
-		STACK_WIND (chown_frame, sh_destroy_cbk,
-			    priv->children[child_index],
-			    priv->children[child_index]->fops->chown,
-			    &local->loc,
-			    buf->st_uid, buf->st_gid);
-	}
+    loc.parent = inode_ref(parent);
+    gf_uuid_copy(loc.pargfid, parent->gfid);
+    loc.name = name;
+    loc.inode = inode_ref(inode);
 
-	LOCK (&frame->lock);
-	{
-	}
-	UNLOCK (&frame->lock);
+    AFR_ONLIST(lookup_on, frame, afr_selfheal_discover_cbk, lookup, &loc,
+               xattr_req);
 
-	call_count = afr_frame_return (frame);
+    afr_replies_copy(replies, local->replies, priv->child_count);
 
-	if (call_count == 0) {
-		sh_missing_entries_finish (frame, this);
-	}
+    loc_wipe(&loc);
+    dict_unref(xattr_req);
 
-	return 0;
+    return inode;
 }
 
-
 static int
-sh_missing_entries_mknod (call_frame_t *frame, xlator_t *this)
+afr_set_multi_dom_lock_count_request(xlator_t *this, dict_t *dict)
 {
-	afr_local_t     *local = NULL;
-	afr_self_heal_t *sh = NULL;
-	afr_private_t   *priv = NULL;
-	int              i = 0;
-	int              enoent_count = 0;
-	int              call_count = 0;
-	mode_t           st_mode = 0;
-	dev_t            st_dev = 0;
+    int ret = 0;
+    afr_private_t *priv = NULL;
+    char *key1 = NULL;
+    char *key2 = NULL;
+
+    priv = this->private;
+    key1 = alloca0(strlen(GLUSTERFS_INODELK_DOM_PREFIX) + 2 +
+                   strlen(this->name));
+    key2 = alloca0(strlen(GLUSTERFS_INODELK_DOM_PREFIX) + 2 +
+                   strlen(priv->sh_domain));
+
+    ret = dict_set_uint32(dict, GLUSTERFS_MULTIPLE_DOM_LK_CNT_REQUESTS, 1);
+    if (ret)
+        return ret;
+
+    sprintf(key1, "%s:%s", GLUSTERFS_INODELK_DOM_PREFIX, this->name);
+    ret = dict_set_uint32(dict, key1, 1);
+    if (ret)
+        return ret;
+
+    sprintf(key2, "%s:%s", GLUSTERFS_INODELK_DOM_PREFIX, priv->sh_domain);
+    ret = dict_set_uint32(dict, key2, 1);
+    if (ret)
+        return ret;
+
+    return 0;
+}
 
+int
+afr_selfheal_unlocked_discover_on(call_frame_t *frame, inode_t *inode,
+                                  uuid_t gfid, struct afr_reply *replies,
+                                  unsigned char *discover_on, dict_t *dict)
+{
+    loc_t loc = {
+        0,
+    };
+    dict_t *xattr_req = NULL;
+    afr_local_t *local = NULL;
+    afr_private_t *priv = NULL;
 
-	local = frame->local;
-	sh = &local->self_heal;
-	priv = this->private;
+    local = frame->local;
+    priv = frame->this->private;
 
-	for (i = 0; i < priv->child_count; i++)
-		if (sh->child_errno[i] == ENOENT)
-			enoent_count++;
+    xattr_req = dict_new();
+    if (!xattr_req)
+        return -ENOMEM;
+    if (dict)
+        dict_copy(dict, xattr_req);
 
-	call_count = enoent_count;
-	local->call_count = call_count;
+    if (afr_xattr_req_prepare(frame->this, xattr_req) != 0) {
+        dict_unref(xattr_req);
+        return -ENOMEM;
+    }
 
-	st_mode = sh->buf[sh->source].st_mode;
-	st_dev  = sh->buf[sh->source].st_dev;
+    if (afr_set_multi_dom_lock_count_request(frame->this, xattr_req)) {
+        dict_unref(xattr_req);
+        return -1;
+    }
 
-	gf_log (this->name, GF_LOG_DEBUG,
-		"mknod %s mode 0%o on %d subvolumes",
-		local->loc.path, st_mode, enoent_count);
+    loc.inode = inode_ref(inode);
+    gf_uuid_copy(loc.gfid, gfid);
 
-	for (i = 0; i < priv->child_count; i++) {
-		if (sh->child_errno[i] == ENOENT) {
-			STACK_WIND_COOKIE (frame,
-					   sh_missing_entries_newentry_cbk,
-					   (void *) (long) i,
-					   priv->children[i],
-					   priv->children[i]->fops->mknod,
-					   &local->loc, st_mode, st_dev);
-			if (!--call_count)
-				break;
-		}
-	}
+    AFR_ONLIST(discover_on, frame, afr_selfheal_discover_cbk, lookup, &loc,
+               xattr_req);
 
-	return 0;
-}
+    afr_replies_copy(replies, local->replies, priv->child_count);
 
+    loc_wipe(&loc);
+    dict_unref(xattr_req);
 
-static int
-sh_missing_entries_mkdir (call_frame_t *frame, xlator_t *this)
+    return 0;
+}
+
+int
+afr_selfheal_unlocked_discover(call_frame_t *frame, inode_t *inode, uuid_t gfid,
+                               struct afr_reply *replies)
 {
-	afr_local_t     *local = NULL;
-	afr_self_heal_t *sh = NULL;
-	afr_private_t   *priv = NULL;
-	int              i = 0;
-	int              enoent_count = 0;
-	int              call_count = 0;
-	mode_t           st_mode = 0;
+    afr_local_t *local = NULL;
+    dict_t *dict = NULL;
 
+    local = frame->local;
 
-	local = frame->local;
-	sh = &local->self_heal;
-	priv = this->private;
+    if (local->xattr_req)
+        dict = local->xattr_req;
+
+    return afr_selfheal_unlocked_discover_on(frame, inode, gfid, replies,
+                                             local->child_up, dict);
+}
+
+unsigned int
+afr_success_count(struct afr_reply *replies, unsigned int count)
+{
+    int i = 0;
+    unsigned int success = 0;
 
-	for (i = 0; i < priv->child_count; i++)
-		if (sh->child_errno[i] == ENOENT)
-			enoent_count++;
+    for (i = 0; i < count; i++)
+        if (replies[i].valid && replies[i].op_ret == 0)
+            success++;
+    return success;
+}
 
-	call_count = enoent_count;
-	local->call_count = call_count;
+int
+afr_selfheal_lock_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+                      int op_ret, int op_errno, dict_t *xdata)
+{
+    afr_local_t *local = NULL;
+    int i = 0;
 
-	st_mode = sh->buf[sh->source].st_mode;
+    local = frame->local;
+    i = (long)cookie;
 
-	gf_log (this->name, GF_LOG_DEBUG,
-		"mkdir %s mode 0%o on %d subvolumes",
-		local->loc.path, st_mode, enoent_count);
+    local->replies[i].valid = 1;
+    local->replies[i].op_ret = op_ret;
+    local->replies[i].op_errno = op_errno;
 
-	for (i = 0; i < priv->child_count; i++) {
-		if (sh->child_errno[i] == ENOENT) {
-			STACK_WIND_COOKIE (frame,
-					   sh_missing_entries_newentry_cbk,
-					   (void *) (long) i,
-					   priv->children[i],
-					   priv->children[i]->fops->mkdir,
-					   &local->loc, st_mode);
-			if (!--call_count)
-				break;
-		}
-	}
+    syncbarrier_wake(&local->barrier);
 
-	return 0;
+    return 0;
 }
 
-
-static int
-sh_missing_entries_symlink (call_frame_t *frame, xlator_t *this,
-			    const char *link)
+int
+afr_locked_fill(call_frame_t *frame, xlator_t *this, unsigned char *locked_on)
 {
-	afr_local_t     *local = NULL;
-	afr_self_heal_t *sh = NULL;
-	afr_private_t   *priv = NULL;
-	int              i = 0;
-	int              enoent_count = 0;
-	int              call_count = 0;
+    int i = 0;
+    afr_private_t *priv = NULL;
+    afr_local_t *local = NULL;
+    int count = 0;
+
+    local = frame->local;
+    priv = this->private;
+
+    for (i = 0; i < priv->child_count; i++) {
+        if (local->replies[i].valid && local->replies[i].op_ret == 0) {
+            locked_on[i] = 1;
+            count++;
+        } else {
+            locked_on[i] = 0;
+        }
+    }
+
+    return count;
+}
 
+int
+afr_selfheal_tryinodelk(call_frame_t *frame, xlator_t *this, inode_t *inode,
+                        char *dom, off_t off, size_t size,
+                        unsigned char *locked_on)
+{
+    loc_t loc = {
+        0,
+    };
+    struct gf_flock flock = {
+        0,
+    };
 
-	local = frame->local;
-	sh = &local->self_heal;
-	priv = this->private;
+    loc.inode = inode_ref(inode);
+    gf_uuid_copy(loc.gfid, inode->gfid);
 
-	for (i = 0; i < priv->child_count; i++)
-		if (sh->child_errno[i] == ENOENT)
-			enoent_count++;
+    flock.l_type = F_WRLCK;
+    flock.l_start = off;
+    flock.l_len = size;
 
-	call_count = enoent_count;
-	local->call_count = call_count;
+    AFR_ONALL(frame, afr_selfheal_lock_cbk, inodelk, dom, &loc, F_SETLK, &flock,
+              NULL);
 
-	gf_log (this->name, GF_LOG_DEBUG,
-		"symlink %s -> %s on %d subvolumes",
-		local->loc.path, link, enoent_count);
+    loc_wipe(&loc);
 
-	for (i = 0; i < priv->child_count; i++) {
-		if (sh->child_errno[i] == ENOENT) {
-			STACK_WIND_COOKIE (frame,
-					   sh_missing_entries_newentry_cbk,
-					   (void *) (long) i,
-					   priv->children[i],
-					   priv->children[i]->fops->symlink,
-					   link, &local->loc);
-			if (!--call_count)
-				break;
-		}
-	}
+    return afr_locked_fill(frame, this, locked_on);
+}
 
-	return 0;
+int
+afr_selfheal_inodelk(call_frame_t *frame, xlator_t *this, inode_t *inode,
+                     char *dom, off_t off, size_t size,
+                     unsigned char *locked_on)
+{
+    loc_t loc = {
+        0,
+    };
+    struct gf_flock flock = {
+        0,
+    };
+    afr_local_t *local = NULL;
+    int i = 0;
+    afr_private_t *priv = NULL;
+    /* Write-lock the range (off, size) of inode in lock domain dom. */
+    priv = this->private;
+    local = frame->local;
+
+    loc.inode = inode_ref(inode);
+    gf_uuid_copy(loc.gfid, inode->gfid);
+
+    flock.l_type = F_WRLCK;
+    flock.l_start = off;
+    flock.l_len = size;
+    /* First pass: non-blocking F_SETLK issued through AFR_ONALL. */
+    AFR_ONALL(frame, afr_selfheal_lock_cbk, inodelk, dom, &loc, F_SETLK, &flock,
+              NULL);
+    /* On any EAGAIN reply: release what was granted, then retry blocking. */
+    for (i = 0; i < priv->child_count; i++) {
+        if (local->replies[i].op_ret == -1 &&
+            local->replies[i].op_errno == EAGAIN) {
+            afr_locked_fill(frame, this, locked_on);
+            afr_selfheal_uninodelk(frame, this, inode, dom, off, size,
+                                   locked_on);
+            /* F_SETLKW; AFR_SEQ presumably goes child by child - TODO confirm */
+            AFR_SEQ(frame, afr_selfheal_lock_cbk, inodelk, dom, &loc, F_SETLKW,
+                    &flock, NULL);
+            break;
+        }
+    }
+
+    loc_wipe(&loc);
+    /* locked_on[] and the returned count come from the latest replies. */
+    return afr_locked_fill(frame, this, locked_on);
 }
 
+static void
+afr_get_lock_and_eagain_counts(afr_private_t *priv, struct afr_reply *replies,
+                               int *lock_count, int *eagain_count)
+{
+    /* Accumulates into the caller's counters; they are not reset here. */
+    struct afr_reply *reply = replies;
+    struct afr_reply *end = replies + priv->child_count;
+
+    for (; reply < end; reply++) {
+        if (reply->valid && reply->op_ret == 0)
+            (*lock_count)++; /* valid reply with op_ret 0 */
+        else if (reply->valid && reply->op_ret == -1 &&
+                 reply->op_errno == EAGAIN)
+            (*eagain_count)++; /* valid reply that failed with EAGAIN */
+    }
+}
 
-static int
-sh_missing_entries_readlink_cbk (call_frame_t *frame, void *cookie,
-				 xlator_t *this,
-				 int32_t op_ret, int32_t op_errno,
-				 const char *link)
+/* Do blocking locks if the number of locks acquired is a majority and there
+ * were some EAGAINs. Useful for odd-way replication. */
+int
+afr_selfheal_tie_breaker_inodelk(call_frame_t *frame, xlator_t *this,
+                                 inode_t *inode, char *dom, off_t off,
+                                 size_t size, unsigned char *locked_on)
 {
-	if (op_ret > 0)
-		sh_missing_entries_symlink (frame, this, link);
-	else
-		sh_missing_entries_finish (frame, this);
+    loc_t loc = {
+        0,
+    };
+    struct gf_flock flock = {
+        0,
+    };
+    afr_local_t *local = NULL;
+    afr_private_t *priv = NULL;
+    int lock_count = 0;
+    int eagain_count = 0;
 
-	return 0;
-}
+    priv = this->private;
+    local = frame->local;
 
+    loc.inode = inode_ref(inode);
+    gf_uuid_copy(loc.gfid, inode->gfid);
 
-static int
-sh_missing_entries_readlink (call_frame_t *frame, xlator_t *this)
+    flock.l_type = F_WRLCK;
+    flock.l_start = off;
+    flock.l_len = size;
+
+    AFR_ONALL(frame, afr_selfheal_lock_cbk, inodelk, dom, &loc, F_SETLK, &flock,
+              NULL);
+
+    afr_get_lock_and_eagain_counts(priv, local->replies, &lock_count,
+                                   &eagain_count);
+
+    if (lock_count > priv->child_count / 2 && eagain_count) {
+        afr_locked_fill(frame, this, locked_on);
+        afr_selfheal_uninodelk(frame, this, inode, dom, off, size, locked_on);
+
+        AFR_SEQ(frame, afr_selfheal_lock_cbk, inodelk, dom, &loc, F_SETLKW,
+                &flock, NULL);
+    }
+
+    loc_wipe(&loc);
+
+    return afr_locked_fill(frame, this, locked_on);
+}
+
+int
+afr_selfheal_uninodelk(call_frame_t *frame, xlator_t *this, inode_t *inode,
+                       char *dom, off_t off, size_t size,
+                       const unsigned char *locked_on)
 {
-	afr_local_t     *local = NULL;
-	afr_self_heal_t *sh = NULL;
-	afr_private_t   *priv = NULL;
+    loc_t loc = {
+        0,
+    };
+    struct gf_flock flock = {
+        0,
+    };
 
+    loc.inode = inode_ref(inode);
+    gf_uuid_copy(loc.gfid, inode->gfid);
 
-	local = frame->local;
-	sh = &local->self_heal;
-	priv = this->private;
+    flock.l_type = F_UNLCK;
+    flock.l_start = off;
+    flock.l_len = size;
 
-	STACK_WIND (frame, sh_missing_entries_readlink_cbk,
-		    priv->children[sh->source],
-		    priv->children[sh->source]->fops->readlink,
-		    &local->loc, 4096);
+    AFR_ONLIST(locked_on, frame, afr_selfheal_lock_cbk, inodelk, dom, &loc,
+               F_SETLK, &flock, NULL);
 
-	return 0;
+    loc_wipe(&loc);
+
+    return 0;
 }
 
+int
+afr_selfheal_tryentrylk(call_frame_t *frame, xlator_t *this, inode_t *inode,
+                        char *dom, const char *name, unsigned char *locked_on)
+{
+    loc_t loc = {
+        0,
+    };
 
-static int
-sh_missing_entries_create (call_frame_t *frame, xlator_t *this)
-{
-	afr_local_t     *local = NULL;
-	afr_self_heal_t *sh = NULL;
-	int              type = 0;
-	int              i = 0;
-	afr_private_t   *priv = NULL;
-	int              enoent_count = 0;
-	int              govinda_gOvinda = 0;
-
-
-	local = frame->local;
-	sh = &local->self_heal;
-	priv = this->private;
-
-	for (i = 0; i < priv->child_count; i++) {
-		if (sh->child_errno[i]) {
-			if (sh->child_errno[i] == ENOENT)
-				enoent_count++;
-		} else {
-			if (type) {
-				if (type != (sh->buf[i].st_mode & S_IFMT))
-					govinda_gOvinda = 1;
-			} else {
-				sh->source = i;
-				type = sh->buf[i].st_mode & S_IFMT;
-			}
-		}
-	}
-
-	if (govinda_gOvinda) {
-		gf_log (this->name, GF_LOG_ERROR,
-			"conflicing filetypes exist for path %s. returning.",
-			local->loc.path);
-
-		local->govinda_gOvinda = 1;
-		sh_missing_entries_finish (frame, this);
-		return 0;
-	}
-
-	if (!type) {
-		gf_log (this->name, GF_LOG_ERROR,
-			"no source found for %s. all nodes down?. returning.",
-			local->loc.path);
-		/* subvolumes down and/or file does not exist */
-		sh_missing_entries_finish (frame, this);
-		return 0;
-	}
-
-	if (enoent_count == 0) {
-		gf_log (this->name, GF_LOG_ERROR,
-			"no missing files - %s. proceeding to metadata check",
-			local->loc.path);
-		/* proceed to next step - metadata self-heal */
-		sh_missing_entries_finish (frame, this);
-		return 0;
-	}
-
-	switch (type) {
-	case S_IFSOCK:
-	case S_IFREG:
-	case S_IFBLK:
-	case S_IFCHR:
-	case S_IFIFO:
-		sh_missing_entries_mknod (frame, this);
-		break;
-	case S_IFLNK:
-		sh_missing_entries_readlink (frame, this);
-		break;
-	case S_IFDIR:
-		sh_missing_entries_mkdir (frame, this);
-		break;
-	default:
-		gf_log (this->name, GF_LOG_ERROR,
-			"unknown file type: 0%o", type);
-		local->govinda_gOvinda = 1;
-		sh_missing_entries_finish (frame, this);
-	}
-
-	return 0;
+    loc.inode = inode_ref(inode);
+    gf_uuid_copy(loc.gfid, inode->gfid);
+
+    AFR_ONALL(frame, afr_selfheal_lock_cbk, entrylk, dom, &loc, name,
+              ENTRYLK_LOCK_NB, ENTRYLK_WRLCK, NULL);
+
+    loc_wipe(&loc);
+
+    return afr_locked_fill(frame, this, locked_on);
 }
 
+int
+afr_selfheal_entrylk(call_frame_t *frame, xlator_t *this, inode_t *inode,
+                     char *dom, const char *name, unsigned char *locked_on)
+{
+    loc_t loc = {
+        0,
+    };
+    afr_local_t *local = NULL;
+    int i = 0;
+    afr_private_t *priv = NULL;
+    /* Take a write entrylk (ENTRYLK_WRLCK) on name under inode, domain dom. */
+    priv = this->private;
+    local = frame->local;
+
+    loc.inode = inode_ref(inode);
+    gf_uuid_copy(loc.gfid, inode->gfid);
+    /* First pass: non-blocking ENTRYLK_LOCK_NB issued through AFR_ONALL. */
+    AFR_ONALL(frame, afr_selfheal_lock_cbk, entrylk, dom, &loc, name,
+              ENTRYLK_LOCK_NB, ENTRYLK_WRLCK, NULL);
+    /* On any EAGAIN reply: release what was granted, then retry blocking. */
+    for (i = 0; i < priv->child_count; i++) {
+        if (local->replies[i].op_ret == -1 &&
+            local->replies[i].op_errno == EAGAIN) {
+            afr_locked_fill(frame, this, locked_on);
+            afr_selfheal_unentrylk(frame, this, inode, dom, name, locked_on,
+                                   NULL);
+            /* AFR_SEQ presumably goes child by child - TODO confirm */
+            AFR_SEQ(frame, afr_selfheal_lock_cbk, entrylk, dom, &loc, name,
+                    ENTRYLK_LOCK, ENTRYLK_WRLCK, NULL);
+            break;
+        }
+    }
+
+    loc_wipe(&loc);
+    /* locked_on[] and the returned count come from the latest replies. */
+    return afr_locked_fill(frame, this, locked_on);
+}
 
-static int
-sh_missing_entries_lookup_cbk (call_frame_t *frame, void *cookie,
-			       xlator_t *this,
-			       int32_t op_ret, int32_t op_errno,
-			       inode_t *inode, struct stat *buf, dict_t *xattr)
+int
+afr_selfheal_tie_breaker_entrylk(call_frame_t *frame, xlator_t *this,
+                                 inode_t *inode, char *dom, const char *name,
+                                 unsigned char *locked_on)
 {
-	int              child_index = 0;
-	afr_local_t     *local = NULL;
-	int              call_count = 0;
-	afr_self_heal_t *sh = NULL;
-	afr_private_t   *priv = NULL;
+    loc_t loc = {
+        0,
+    };
+    afr_local_t *local = NULL;
+    afr_private_t *priv = NULL;
+    int lock_count = 0;
+    int eagain_count = 0;
+
+    priv = this->private;
+    local = frame->local;
+
+    loc.inode = inode_ref(inode);
+    gf_uuid_copy(loc.gfid, inode->gfid);
+
+    AFR_ONALL(frame, afr_selfheal_lock_cbk, entrylk, dom, &loc, name,
+              ENTRYLK_LOCK_NB, ENTRYLK_WRLCK, NULL);
 
+    afr_get_lock_and_eagain_counts(priv, local->replies, &lock_count,
+                                   &eagain_count);
 
-	local = frame->local;
-	sh = &local->self_heal;
-	priv = this->private;
+    if (lock_count > priv->child_count / 2 && eagain_count) {
+        afr_locked_fill(frame, this, locked_on);
+        afr_selfheal_unentrylk(frame, this, inode, dom, name, locked_on, NULL);
 
-	child_index = (long) cookie;
+        AFR_SEQ(frame, afr_selfheal_lock_cbk, entrylk, dom, &loc, name,
+                ENTRYLK_LOCK, ENTRYLK_WRLCK, NULL);
+    }
 
-	LOCK (&frame->lock);
-	{
-		if (op_ret == 0) {
-			gf_log (this->name, GF_LOG_DEBUG,
-				"path %s on subvolume %s is of mode 0%o",
-				local->loc.path,
-				priv->children[child_index]->name,
-				buf->st_mode);
+    loc_wipe(&loc);
 
-			local->self_heal.buf[child_index] = *buf;
-		} else {
-			gf_log (this->name, GF_LOG_WARNING,
-				"path %s on subvolume %s => -1 (%s)",
-				local->loc.path,
-				priv->children[child_index]->name,
-				strerror (op_errno));
+    return afr_locked_fill(frame, this, locked_on);
+}
 
-			local->self_heal.child_errno[child_index] = op_errno;
-		}
+int
+afr_selfheal_unentrylk(call_frame_t *frame, xlator_t *this, inode_t *inode,
+                       char *dom, const char *name, unsigned char *locked_on,
+                       dict_t *xdata)
+{
+    loc_t loc = {
+        0,
+    };
 
-	}
-	UNLOCK (&frame->lock);
+    loc.inode = inode_ref(inode);
+    gf_uuid_copy(loc.gfid, inode->gfid);
 
-	call_count = afr_frame_return (frame);
+    AFR_ONLIST(locked_on, frame, afr_selfheal_lock_cbk, entrylk, dom, &loc,
+               name, ENTRYLK_UNLOCK, ENTRYLK_WRLCK, xdata);
 
-	if (call_count == 0) {
-		sh_missing_entries_create (frame, this);
-	}
+    loc_wipe(&loc);
 
-	return 0;
+    return 0;
 }
 
+gf_boolean_t
+afr_is_data_set(xlator_t *this, dict_t *xdata)
+{ /* Forwards to afr_is_pending_set() for AFR_DATA_TRANSACTION. */
+    return afr_is_pending_set(this, xdata, AFR_DATA_TRANSACTION);
+}
 
-static int
-sh_missing_entries_lookup (call_frame_t *frame, xlator_t *this)
-{
-	afr_local_t    *local = NULL;
-	int             i = 0;
-	int             call_count = 0;
-	afr_private_t  *priv = NULL;
-	dict_t         *xattr_req = NULL;
-	int             ret = -1;
-
-	local = frame->local;
-	call_count = local->child_count;
-	priv = this->private;
-
-	local->call_count = call_count;
-	
-	xattr_req = dict_new();
-	
-	if (xattr_req)
-		ret = dict_set_uint64 (xattr_req, AFR_ENTRY_PENDING,
-				       priv->child_count * sizeof(int32_t));
-
-	for (i = 0; i < priv->child_count; i++) {
-		if (local->child_up[i]) {
-			gf_log (this->name, GF_LOG_DEBUG,
-				"looking up %s on subvolume %s",
-				local->loc.path, priv->children[i]->name);
-
-			STACK_WIND_COOKIE (frame,
-					   sh_missing_entries_lookup_cbk,
-					   (void *) (long) i,
-					   priv->children[i],
-					   priv->children[i]->fops->lookup,
-					   &local->loc, xattr_req);
-
-			if (!--call_count)
-				break;
-		}
-	}
-	
-	if (xattr_req)
-		dict_unref (xattr_req);
-
-	return 0;
+gf_boolean_t
+afr_is_metadata_set(xlator_t *this, dict_t *xdata)
+{ /* Forwards to afr_is_pending_set() for AFR_METADATA_TRANSACTION. */
+    return afr_is_pending_set(this, xdata, AFR_METADATA_TRANSACTION);
 }
 
+gf_boolean_t
+afr_is_entry_set(xlator_t *this, dict_t *xdata)
+{ /* Forwards to afr_is_pending_set() for AFR_ENTRY_TRANSACTION. */
+    return afr_is_pending_set(this, xdata, AFR_ENTRY_TRANSACTION);
+}
 
-static int
-sh_missing_entries_lk_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
-			   int32_t op_ret, int32_t op_errno)
+/*
+ * Look up gfid on the children without taking locks and decide, per heal
+ * type, whether a locked re-inspection (and possibly a heal) is needed.
+ * Each non-NULL flag is only ever raised to TRUE here; the caller sets it
+ * to FALSE beforehand, and a flag still FALSE afterwards means no heal of
+ * that type was found necessary. Returns 0 on success, non-zero on failure
+ * (-EIO on a file-type mismatch). Replies go to replies_dst when non-NULL.
+ */
+
+int
+afr_selfheal_unlocked_inspect(call_frame_t *frame, xlator_t *this, uuid_t gfid,
+                              inode_t **link_inode, gf_boolean_t *data_selfheal,
+                              gf_boolean_t *metadata_selfheal,
+                              gf_boolean_t *entry_selfheal,
+                              struct afr_reply *replies_dst)
+{
+    afr_private_t *priv = NULL;
+    inode_t *inode = NULL;
+    int i = 0;
+    int valid_cnt = 0;
+    struct iatt first = {
+        0,
+    };
+    int first_idx = 0;
+    struct afr_reply *replies = NULL;
+    int ret = -1;
+
+    priv = this->private;
+
+    inode = afr_inode_find(this, gfid);
+    if (!inode)
+        goto out;
+
+    replies = alloca0(sizeof(*replies) * priv->child_count);
+
+    ret = afr_selfheal_unlocked_discover(frame, inode, gfid, replies);
+    if (ret)
+        goto out;
+
+    for (i = 0; i < priv->child_count; i++) {
+        if (!replies[i].valid)
+            continue;
+        if (replies[i].op_ret == -1)
+            continue;
+
+        /* The data segment of the changelog can be non-zero to indicate
+         * the directory needs a full heal. So the check below ensures
+         * it's not a directory before setting the data_selfheal boolean.
+         */
+        if (data_selfheal && !IA_ISDIR(replies[i].poststat.ia_type) &&
+            afr_is_data_set(this, replies[i].xdata))
+            *data_selfheal = _gf_true;
+
+        if (metadata_selfheal && afr_is_metadata_set(this, replies[i].xdata))
+            *metadata_selfheal = _gf_true;
+
+        if (entry_selfheal && afr_is_entry_set(this, replies[i].xdata))
+            *entry_selfheal = _gf_true;
+
+        valid_cnt++;
+        if (valid_cnt == 1) {
+            first = replies[i].poststat;
+            first_idx = i;
+            continue;
+        }
+
+        if (!IA_EQUAL(first, replies[i].poststat, type)) {
+            gf_msg(this->name, GF_LOG_ERROR, 0, AFR_MSG_SPLIT_BRAIN,
+                   "TYPE mismatch %d vs %d on %s for gfid:%s",
+                   (int)first.ia_type, (int)replies[i].poststat.ia_type,
+                   priv->children[i]->name,
+                   uuid_utoa(replies[i].poststat.ia_gfid));
+            gf_event(EVENT_AFR_SPLIT_BRAIN,
+                     "client-pid=%d;"
+                     "subvol=%s;"
+                     "type=file;gfid=%s;"
+                     "ia_type-%d=%s;ia_type-%d=%s",
+                     this->ctx->cmd_args.client_pid, this->name,
+                     uuid_utoa(replies[i].poststat.ia_gfid), first_idx,
+                     gf_inode_type_to_str(first.ia_type), i,
+                     gf_inode_type_to_str(replies[i].poststat.ia_type));
+            ret = -EIO;
+            goto out;
+        }
+
+        if (!IA_EQUAL(first, replies[i].poststat, uid)) {
+            gf_msg_debug(this->name, 0,
+                         "UID mismatch "
+                         "%d vs %d on %s for gfid:%s",
+                         (int)first.ia_uid, (int)replies[i].poststat.ia_uid,
+                         priv->children[i]->name,
+                         uuid_utoa(replies[i].poststat.ia_gfid));
+
+            if (metadata_selfheal)
+                *metadata_selfheal = _gf_true;
+        }
+
+        if (!IA_EQUAL(first, replies[i].poststat, gid)) {
+            gf_msg_debug(this->name, 0,
+                         "GID mismatch "
+                         "%d vs %d on %s for gfid:%s",
+                         (int)first.ia_gid, (int)replies[i].poststat.ia_gid,
+                         priv->children[i]->name,
+                         uuid_utoa(replies[i].poststat.ia_gfid));
+
+            if (metadata_selfheal)
+                *metadata_selfheal = _gf_true;
+        }
+
+        if (!IA_EQUAL(first, replies[i].poststat, prot)) {
+            gf_msg_debug(this->name, 0,
+                         "MODE mismatch "
+                         "%d vs %d on %s for gfid:%s",
+                         (int)st_mode_from_ia(first.ia_prot, 0),
+                         (int)st_mode_from_ia(replies[i].poststat.ia_prot, 0),
+                         priv->children[i]->name,
+                         uuid_utoa(replies[i].poststat.ia_gfid));
+
+            if (metadata_selfheal)
+                *metadata_selfheal = _gf_true;
+        }
+
+        if (IA_ISREG(first.ia_type) &&
+            !IA_EQUAL(first, replies[i].poststat, size)) {
+            gf_msg_debug(this->name, 0,
+                         "SIZE mismatch "
+                         "%lld vs %lld on %s for gfid:%s",
+                         (long long)first.ia_size,
+                         (long long)replies[i].poststat.ia_size,
+                         priv->children[i]->name,
+                         uuid_utoa(replies[i].poststat.ia_gfid));
+
+            if (data_selfheal)
+                *data_selfheal = _gf_true;
+        }
+    }
+
+    if (valid_cnt > 0 && link_inode) {
+        *link_inode = inode_link(inode, NULL, NULL, &first);
+        if (!*link_inode) {
+            ret = -EINVAL;
+            goto out;
+        }
+    } else if (valid_cnt < 2) {
+        ret = afr_check_stale_error(replies, priv);
+        goto out;
+    }
+
+    ret = 0;
+out:
+    if (replies && replies_dst)
+        afr_replies_copy(replies_dst, replies, priv->child_count);
+    if (inode)
+        inode_unref(inode);
+    if (replies)
+        afr_replies_wipe(replies, priv->child_count);
+
+    return ret;
+}
+
+inode_t *
+afr_inode_find(xlator_t *this, uuid_t gfid)
+{
+    /* Return the inode this->itable holds for gfid, or else a new inode
+     * with gfid copied into it. NULL if there is no table or inode_new()
+     * fails. */
+    inode_table_t *itable = this->itable;
+    inode_t *found = NULL;
+
+    if (!itable)
+        return NULL;
+
+    found = inode_find(itable, gfid);
+    if (!found) {
+        /* Not in the table: create one and stamp it with gfid. */
+        found = inode_new(itable);
+        if (found)
+            gf_uuid_copy(found->gfid, gfid);
+    }
+
+    return found;
+}
+
+call_frame_t *
+afr_frame_create(xlator_t *this, int32_t *op_errno)
 {
-	afr_local_t     *local = NULL;
-	afr_self_heal_t *sh = NULL;
-	int              call_count = 0;
-	int              child_index = (long) cookie;
+    call_frame_t *frame = NULL;
+    afr_local_t *local = NULL;
+    pid_t pid = GF_CLIENT_PID_SELF_HEALD;
 
+    frame = create_frame(this, this->ctx->pool);
+    if (!frame) {
+        if (op_errno)
+            *op_errno = ENOMEM;
+        return NULL;
+    }
 
-	local = frame->local;
-	sh    = &local->self_heal;
+    local = AFR_FRAME_INIT(frame, (*op_errno));
+    if (!local) {
+        STACK_DESTROY(frame->root);
+        return NULL;
+    }
 
-	LOCK (&frame->lock);
-	{
-		if (op_ret == -1) {
-			sh->op_failed = 1;
+    syncopctx_setfspid(&pid);
 
-			gf_log (this->name,
-				(op_errno == EAGAIN ? GF_LOG_DEBUG : GF_LOG_ERROR),
-				"locking inode of %s on child %d failed: %s",
-				local->loc.path, child_index,
-				strerror (op_errno));
-		} else {
-			gf_log (this->name, GF_LOG_DEBUG,
-				"inode of %s on child %d locked",
-				local->loc.path, child_index);
-		}
-	}
-	UNLOCK (&frame->lock);
+    frame->root->pid = pid;
 
-	call_count = afr_frame_return (frame);
+    afr_set_lk_owner(frame, this, frame->root);
 
-	if (call_count == 0) {
-		if (sh->op_failed == 1) {
-			sh_missing_entries_finish (frame, this);
-			return 0;
-		}
+    return frame;
+}
+
+int
+afr_selfheal_newentry_mark(call_frame_t *frame, xlator_t *this, inode_t *inode,
+                           int source, struct afr_reply *replies,
+                           unsigned char *sources, unsigned char *newentry)
+{
+    int ret = 0;
+    int i = 0;
+    afr_private_t *priv = NULL;
+    dict_t *xattr = NULL;
+    int **changelog = NULL;
+
+    priv = this->private;
+
+    gf_uuid_copy(inode->gfid, replies[source].poststat.ia_gfid);
+
+    xattr = dict_new();
+    if (!xattr)
+        return -ENOMEM;
+
+    changelog = afr_mark_pending_changelog(priv, newentry, xattr,
+                                           replies[source].poststat.ia_type);
 
-		sh_missing_entries_lookup (frame, this);
-	}
+    if (!changelog) {
+        ret = -ENOMEM;
+        goto out;
+    }
 
-	return 0;
+    for (i = 0; i < priv->child_count; i++) {
+        if (!sources[i])
+            continue;
+        ret |= afr_selfheal_post_op(frame, this, inode, i, xattr, NULL);
+    }
+out:
+    if (changelog)
+        afr_matrix_cleanup(changelog, priv->child_count);
+    if (xattr)
+        dict_unref(xattr);
+    return ret;
 }
 
+int
+afr_selfheal_do(call_frame_t *frame, xlator_t *this, uuid_t gfid)
+{
+    int ret = -1;
+    int entry_ret = 1;
+    int metadata_ret = 1;
+    int data_ret = 1;
+    int or_ret = 0;
+    inode_t *inode = NULL;
+    fd_t *fd = NULL;
+    gf_boolean_t data_selfheal = _gf_false;
+    gf_boolean_t metadata_selfheal = _gf_false;
+    gf_boolean_t entry_selfheal = _gf_false;
+    afr_private_t *priv = NULL;
+
+    priv = this->private;
+
+    ret = afr_selfheal_unlocked_inspect(frame, this, gfid, &inode,
+                                        &data_selfheal, &metadata_selfheal,
+                                        &entry_selfheal, NULL);
+    if (ret)
+        goto out;
+
+    if (!(data_selfheal || metadata_selfheal || entry_selfheal)) {
+        ret = 2;
+        goto out;
+    }
+
+    if (inode->ia_type == IA_IFREG) {
+        ret = afr_selfheal_data_open(this, inode, &fd);
+        if (!fd) {
+            ret = -EIO;
+            goto out;
+        }
+    }
+
+    if (data_selfheal && priv->data_self_heal)
+        data_ret = afr_selfheal_data(frame, this, fd);
+
+    if (metadata_selfheal && priv->metadata_self_heal)
+        metadata_ret = afr_selfheal_metadata(frame, this, inode);
+
+    if (entry_selfheal && priv->entry_self_heal)
+        entry_ret = afr_selfheal_entry(frame, this, inode);
+
+    or_ret = (data_ret | metadata_ret | entry_ret);
+
+    if (data_ret == -EIO || metadata_ret == -EIO || entry_ret == -EIO)
+        ret = -EIO;
+    else if (data_ret == 1 && metadata_ret == 1 && entry_ret == 1)
+        ret = 1;
+    else if (or_ret < 0)
+        ret = or_ret;
+    else
+        ret = 0;
 
-static int
-afr_self_heal_missing_entries (call_frame_t *frame, xlator_t *this)
+out:
+    if (inode)
+        inode_unref(inode);
+    if (fd)
+        fd_unref(fd);
+    return ret;
+}
+/*
+ * This is the entry point for healing a given GFID. The return values for this
+ * function are as follows:
+ * '0' if the self-heal is successful
+ * '1' if the afr-xattrs are non-zero (due to on-going IO) and no heal is needed
+ * '2' if the afr-xattrs are all-zero and no heal is needed
+ * $errno if the heal on the gfid failed.
+ */
+
+int
+afr_selfheal(xlator_t *this, uuid_t gfid)
 {
-	afr_local_t     *local = NULL;
-	afr_self_heal_t *sh = NULL;
-	afr_private_t   *priv = NULL;
-	int              i = 0;
-	int              call_count = 0;
+    int ret = -1;
+    call_frame_t *frame = NULL;
+    afr_local_t *local = NULL;
+
+    frame = afr_frame_create(this, NULL);
+    if (!frame)
+        return ret;
 
+    local = frame->local;
+    local->xdata_req = dict_new();
 
-	local = frame->local;
-	sh = &local->self_heal;
-	priv = this->private;
+    ret = afr_selfheal_do(frame, this, gfid);
 
-	gf_log (this->name, GF_LOG_DEBUG,
-		"attempting to recreate missing entries for path=%s",
-		local->loc.path);
+    if (frame)
+        AFR_STACK_DESTROY(frame);
 
-	afr_build_parent_loc (&sh->parent_loc, &local->loc);
+    return ret;
+}
+
+afr_local_t *
+__afr_dequeue_heals(afr_private_t *priv)
+{
+    afr_local_t *local = NULL;
+
+    if (list_empty(&priv->heal_waiting))
+        goto none;
+    if ((priv->background_self_heal_count > 0) &&
+        (priv->healers >= priv->background_self_heal_count))
+        goto none;
+
+    local = list_entry(priv->heal_waiting.next, afr_local_t, healer);
+    priv->heal_waiters--;
+    GF_ASSERT(priv->heal_waiters >= 0);
+    list_del_init(&local->healer);
+    list_add(&local->healer, &priv->healing);
+    priv->healers++;
+    return local;
+none:
+    gf_msg_debug(THIS->name, 0,
+                 "Nothing dequeued. "
+                 "Num healers: %d, Num Waiters: %d",
+                 priv->healers, priv->heal_waiters);
+    return NULL;
+}
 
-	call_count = local->child_count;
+int
+afr_refresh_selfheal_wrap(void *opaque)
+{
+    call_frame_t *heal_frame = opaque;
+    afr_local_t *local = heal_frame->local;
+    int ret = 0;
+
+    ret = afr_selfheal(heal_frame->this, local->refreshinode->gfid);
+    return ret;
+}
+
+int
+afr_refresh_heal_done(int ret, call_frame_t *frame, void *opaque)
+{
+    call_frame_t *heal_frame = opaque;
+    xlator_t *this = heal_frame->this;
+    afr_private_t *priv = this->private;
+    afr_local_t *local = heal_frame->local;
+
+    LOCK(&priv->lock);
+    {
+        list_del_init(&local->healer);
+        priv->healers--;
+        GF_ASSERT(priv->healers >= 0);
+        local = __afr_dequeue_heals(priv);
+    }
+    UNLOCK(&priv->lock);
+
+    AFR_STACK_DESTROY(heal_frame);
+
+    if (local)
+        afr_heal_synctask(this, local);
+    return 0;
+}
 
-	local->call_count = call_count;
+void
+afr_heal_synctask(xlator_t *this, afr_local_t *local)
+{
+    int ret = 0;
+    call_frame_t *heal_frame = NULL;
+
+    heal_frame = local->heal_frame;
+    ret = synctask_new(this->ctx->env, afr_refresh_selfheal_wrap,
+                       afr_refresh_heal_done, heal_frame, heal_frame);
+    if (ret < 0)
+        /* Heal not launched. Will be queued when the next inode
+         * refresh happens and shd hasn't healed it yet. */
+        afr_refresh_heal_done(ret, heal_frame, heal_frame);
+}
 
-	for (i = 0; i < priv->child_count; i++) {
-		if (local->child_up[i]) {
-			STACK_WIND (frame, sh_missing_entries_lk_cbk,
-				    priv->children[i],
-				    priv->children[i]->fops->entrylk,
-				    &sh->parent_loc, local->loc.name,
-				    ENTRYLK_LOCK_NB, ENTRYLK_WRLCK);
-			if (!--call_count)
-				break;
-		}
- 	}
+gf_boolean_t
+afr_throttled_selfheal(call_frame_t *frame, xlator_t *this)
+{
+    gf_boolean_t can_heal = _gf_true;
+    afr_private_t *priv = this->private;
+    afr_local_t *local = frame->local;
+
+    LOCK(&priv->lock);
+    {
+        if ((priv->background_self_heal_count > 0) &&
+            (priv->heal_wait_qlen + priv->background_self_heal_count) >
+                (priv->heal_waiters + priv->healers)) {
+            list_add_tail(&local->healer, &priv->heal_waiting);
+            priv->heal_waiters++;
+            local = __afr_dequeue_heals(priv);
+        } else {
+            can_heal = _gf_false;
+        }
+    }
+    UNLOCK(&priv->lock);
+
+    if (can_heal) {
+        if (local)
+            afr_heal_synctask(this, local);
+        else
+            gf_msg_debug(this->name, 0,
+                         "Max number of heals are "
+                         "pending, background self-heal rejected.");
+    }
+
+    return can_heal;
+}
 
-	return 0;
+int
+afr_choose_source_by_policy(afr_private_t *priv, unsigned char *sources,
+                            afr_transaction_type type)
+{
+    int source = -1;
+    int i = 0;
+
+    /* Give preference to local child to save on bandwidth */
+    for (i = 0; i < priv->child_count; i++) {
+        if (priv->local[i] && sources[i]) {
+            if ((type == AFR_DATA_TRANSACTION) && AFR_IS_ARBITER_BRICK(priv, i))
+                continue;
+
+            source = i;
+            goto out;
+        }
+    }
+
+    for (i = 0; i < priv->child_count; i++) {
+        if (sources[i]) {
+            source = i;
+            goto out;
+        }
+    }
+out:
+    return source;
 }
 
+static int
+afr_anon_inode_mkdir_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+                         int32_t op_ret, int32_t op_errno, inode_t *inode,
+                         struct iatt *buf, struct iatt *preparent,
+                         struct iatt *postparent, dict_t *xdata)
+{
+    afr_local_t *local = frame->local;
+    int i = (long)cookie;
+
+    local->replies[i].valid = 1;
+    local->replies[i].op_ret = op_ret;
+    local->replies[i].op_errno = op_errno;
+    if (op_ret == 0) {
+        local->op_ret = 0;
+        local->replies[i].poststat = *buf;
+        local->replies[i].preparent = *preparent;
+        local->replies[i].postparent = *postparent;
+    }
+    if (xdata) {
+        local->replies[i].xdata = dict_ref(xdata);
+    }
+
+    syncbarrier_wake(&local->barrier);
+    return 0;
+}
 
 int
-afr_self_heal (call_frame_t *frame, xlator_t *this,
-	       int (*completion_cbk) (call_frame_t *, xlator_t *))
-{
-	afr_local_t     *local = NULL;
-	afr_self_heal_t *sh = NULL;
-	afr_private_t   *priv = NULL;
-	int              i = 0;
-
-
-	local = frame->local;
-	sh = &local->self_heal;
-	priv = this->private;
-
-	gf_log (this->name, GF_LOG_DEBUG,
-		"performing self heal on %s (metadata=%d data=%d entry=%d)",
-		local->loc.path,
-		local->need_metadata_self_heal,
-		local->need_data_self_heal,
-		local->need_entry_self_heal);
-
-	sh->completion_cbk = completion_cbk;
-
-	sh->buf = CALLOC (priv->child_count, sizeof (struct stat));
-	sh->child_errno = CALLOC (priv->child_count, sizeof (int));
-	sh->success = CALLOC (priv->child_count, sizeof (int));
-	sh->xattr = CALLOC (priv->child_count, sizeof (dict_t *));
-	sh->sources = CALLOC (sizeof (*sh->sources), priv->child_count);
-
-	sh->pending_matrix = CALLOC (sizeof (int32_t *), priv->child_count);
-	for (i = 0; i < priv->child_count; i++) {
-		sh->pending_matrix[i] = CALLOC (sizeof (int32_t),
-						priv->child_count);
-	}
-
-	sh->delta_matrix = CALLOC (sizeof (int32_t *), priv->child_count);
-	for (i = 0; i < priv->child_count; i++) {
-		sh->delta_matrix[i] = CALLOC (sizeof (int32_t),
-					      priv->child_count);
-	}
-
-	if (local->success_count && local->enoent_count) {
-		afr_self_heal_missing_entries (frame, this);
-	} else {
-		gf_log (this->name, GF_LOG_DEBUG,
-			"proceeding to metadata check on %s",
-			local->loc.path);
-		afr_sh_missing_entries_done (frame, this);
-	}
-
-	return 0;
+afr_anon_inode_create(xlator_t *this, int child, inode_t **linked_inode)
+{
+    call_frame_t *frame = NULL;
+    afr_local_t *local = NULL;
+    afr_private_t *priv = this->private;
+    unsigned char *mkdir_on = alloca0(priv->child_count);
+    unsigned char *lookup_on = alloca0(priv->child_count);
+    loc_t loc = {0};
+    int32_t op_errno = 0;
+    int32_t child_op_errno = 0;
+    struct iatt iatt = {0};
+    dict_t *xdata = NULL;
+    uuid_t anon_inode_gfid = {0};
+    int mkdir_count = 0;
+    int i = 0;
+
+    /*Try to mkdir everywhere and return success if the dir exists on 'child'
+     */
+
+    if (!priv->use_anon_inode) {
+        op_errno = EINVAL;
+        goto out;
+    }
+
+    frame = afr_frame_create(this, &op_errno);
+    if (op_errno) {
+        goto out;
+    }
+    local = frame->local;
+    if (!local->child_up[child]) {
+        /*Other bricks may need mkdir so don't error out yet*/
+        child_op_errno = ENOTCONN;
+    }
+    gf_uuid_parse(priv->anon_gfid_str, anon_inode_gfid);
+    for (i = 0; i < priv->child_count; i++) {
+        if (!local->child_up[i])
+            continue;
+
+        if (priv->anon_inode[i]) {
+            mkdir_on[i] = 0;
+        } else {
+            mkdir_on[i] = 1;
+            mkdir_count++;
+        }
+    }
+
+    if (mkdir_count == 0) {
+        *linked_inode = inode_find(this->itable, anon_inode_gfid);
+        if (*linked_inode) {
+            op_errno = 0;
+            goto out;
+        }
+    }
+
+    loc.parent = inode_ref(this->itable->root);
+    loc.name = priv->anon_inode_name;
+    loc.inode = inode_new(this->itable);
+    if (!loc.inode) {
+        op_errno = ENOMEM;
+        goto out;
+    }
+
+    xdata = dict_new();
+    if (!xdata) {
+        op_errno = ENOMEM;
+        goto out;
+    }
+
+    op_errno = -dict_set_gfuuid(xdata, "gfid-req", anon_inode_gfid, _gf_true);
+    if (op_errno) {
+        goto out;
+    }
+
+    if (mkdir_count == 0) {
+        memcpy(lookup_on, local->child_up, priv->child_count);
+        goto lookup;
+    }
+
+    AFR_ONLIST(mkdir_on, frame, afr_anon_inode_mkdir_cbk, mkdir, &loc, 0755, 0,
+               xdata);
+
+    for (i = 0; i < priv->child_count; i++) {
+        if (!mkdir_on[i]) {
+            continue;
+        }
+
+        if (local->replies[i].op_ret == 0) {
+            priv->anon_inode[i] = 1;
+            iatt = local->replies[i].poststat;
+        } else if (local->replies[i].op_ret < 0 &&
+                   local->replies[i].op_errno == EEXIST) {
+            lookup_on[i] = 1;
+        } else if (i == child) {
+            child_op_errno = local->replies[i].op_errno;
+        }
+    }
+
+    if (AFR_COUNT(lookup_on, priv->child_count) == 0) {
+        goto link;
+    }
+
+lookup:
+    AFR_ONLIST(lookup_on, frame, afr_selfheal_discover_cbk, lookup, &loc,
+               xdata);
+    for (i = 0; i < priv->child_count; i++) {
+        if (!lookup_on[i]) {
+            continue;
+        }
+
+        if (local->replies[i].op_ret == 0) {
+            if (gf_uuid_compare(anon_inode_gfid,
+                                local->replies[i].poststat.ia_gfid) == 0) {
+                priv->anon_inode[i] = 1;
+                iatt = local->replies[i].poststat;
+            } else {
+                if (i == child)
+                    child_op_errno = EINVAL;
+                gf_msg(this->name, GF_LOG_ERROR, 0, AFR_MSG_INVALID_DATA,
+                       "%s has gfid: %s", priv->anon_inode_name,
+                       uuid_utoa(local->replies[i].poststat.ia_gfid));
+            }
+        } else if (i == child) {
+            child_op_errno = local->replies[i].op_errno;
+        }
+    }
+link:
+    if (!gf_uuid_is_null(iatt.ia_gfid)) {
+        *linked_inode = inode_link(loc.inode, loc.parent, loc.name, &iatt);
+        if (*linked_inode) {
+            op_errno = 0;
+            inode_lookup(*linked_inode);
+        } else {
+            op_errno = ENOMEM;
+        }
+        goto out;
+    }
+
+out:
+    if (xdata)
+        dict_unref(xdata);
+    loc_wipe(&loc);
+    /*child_op_errno takes precedence*/
+    if (child_op_errno == 0) {
+        child_op_errno = op_errno;
+    }
+
+    if (child_op_errno && *linked_inode) {
+        inode_unref(*linked_inode);
+        *linked_inode = NULL;
+    }
+    if (frame)
+        AFR_STACK_DESTROY(frame);
+    return -child_op_errno;
 }
diff --git a/xlators/cluster/afr/src/afr-self-heal-common.h b/xlators/cluster/afr/src/afr-self-heal-common.h
deleted file mode 100644
index f3beff71f25..00000000000
--- a/xlators/cluster/afr/src/afr-self-heal-common.h
+++ /dev/null
@@ -1,66 +0,0 @@
-/*
-  Copyright (c) 2008-2009 Z RESEARCH, Inc. <http://www.zresearch.com>
-  This file is part of GlusterFS.
-
-  GlusterFS is free software; you can redistribute it and/or modify
-  it under the terms of the GNU General Public License as published
-  by the Free Software Foundation; either version 3 of the License,
-  or (at your option) any later version.
-
-  GlusterFS is distributed in the hope that it will be useful, but
-  WITHOUT ANY WARRANTY; without even the implied warranty of
-  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
-  General Public License for more details.
-
-  You should have received a copy of the GNU General Public License
-  along with this program.  If not, see
-  <http://www.gnu.org/licenses/>.
-*/
-
-#ifndef __AFR_SELF_HEAL_COMMON_H__
-#define __AFR_SELF_HEAL_COMMON_H__
-
-#define FILE_HAS_HOLES(buf) (((buf)->st_size) > ((buf)->st_blocks * 512))
-
-int
-afr_sh_select_source (int sources[], int child_count);
-
-int
-afr_sh_sink_count (int sources[], int child_count);
-
-int
-afr_sh_source_count (int sources[], int child_count);
-
-int
-afr_sh_supress_errenous_children (int sources[], int child_errno[],
-				  int child_count);
-
-int
-afr_sh_supress_empty_children (int sources[], dict_t *xattr[],
-			       struct stat *buf,
-			       int child_count, const char *key);
-
-void
-afr_sh_print_pending_matrix (int32_t *pending_matrix[], xlator_t *this);
-
-void
-afr_sh_build_pending_matrix (int32_t *pending_matrix[], dict_t *xattr[],
-			     int child_count, const char *key);
-
-void
-afr_sh_pending_to_delta (int32_t *pending_matrix[], int32_t *delta_matrix[],
-			 int32_t success[], int child_count);
-
-int
-afr_sh_mark_sources (int32_t *pending_matrix[], int sources[],
-		     int child_count);
-
-int
-afr_sh_delta_to_xattr (int32_t *delta_matrix[], dict_t *xattr[],
-		       int child_count, const char *key);
-
-int
-afr_sh_is_matrix_zero (int32_t *pending_matrix[], int child_count);
-
-
-#endif /* __AFR_SELF_HEAL_COMMON_H__ */
diff --git a/xlators/cluster/afr/src/afr-self-heal-data.c b/xlators/cluster/afr/src/afr-self-heal-data.c
index 674d2923f1c..37bcc2b3f9e 100644
--- a/xlators/cluster/afr/src/afr-self-heal-data.c
+++ b/xlators/cluster/afr/src/afr-self-heal-data.c
@@ -1,1030 +1,891 @@
 /*
-   Copyright (c) 2008-2009 Z RESEARCH, Inc. <http://www.zresearch.com>
-   This file is part of GlusterFS.
-
-   GlusterFS is free software; you can redistribute it and/or modify
-   it under the terms of the GNU General Public License as published
-   by the Free Software Foundation; either version 3 of the License,
-   or (at your option) any later version.
-
-   GlusterFS is distributed in the hope that it will be useful, but
-   WITHOUT ANY WARRANTY; without even the implied warranty of
-   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
-   General Public License for more details.
-
-   You should have received a copy of the GNU General Public License
-   along with this program.  If not, see
-   <http://www.gnu.org/licenses/>.
-*/
-
-#include <libgen.h>
-#include <unistd.h>
-#include <fnmatch.h>
-#include <sys/time.h>
-#include <stdlib.h>
-#include <signal.h>
+  Copyright (c) 2013 Red Hat, Inc. <http://www.redhat.com>
+  This file is part of GlusterFS.
 
-#ifndef _CONFIG_H
-#define _CONFIG_H
-#include "config.h"
-#endif
+  This file is licensed to you under your choice of the GNU Lesser
+  General Public License, version 3 or any later version (LGPLv3 or
+  later), or the GNU General Public License, version 2 (GPLv2), in all
+  cases as published by the Free Software Foundation.
+*/
 
-#include "glusterfs.h"
 #include "afr.h"
-#include "dict.h"
-#include "xlator.h"
-#include "hashfn.h"
-#include "logging.h"
-#include "stack.h"
-#include "list.h"
-#include "call-stub.h"
-#include "defaults.h"
-#include "common-utils.h"
-#include "compat-errno.h"
-#include "compat.h"
-#include "byte-order.h"
-
-#include "afr-transaction.h"
 #include "afr-self-heal.h"
-#include "afr-self-heal-common.h"
-
-
-
-int
-afr_sh_data_done (call_frame_t *frame, xlator_t *this)
+#include <glusterfs/byte-order.h>
+#include "protocol-common.h"
+#include "afr-messages.h"
+#include <glusterfs/events.h>
+
+#define HAS_HOLES(i) ((i->ia_blocks * 512) < (i->ia_size))
+static int
+__checksum_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int op_ret,
+               int op_errno, uint32_t weak, uint8_t *strong, dict_t *xdata)
 {
-	afr_local_t     *local = NULL;
-	afr_self_heal_t *sh = NULL;
-	afr_private_t   *priv = NULL;
-
-	local = frame->local;
-	sh = &local->self_heal;
-	priv = this->private;
-
-	/* 
-	   TODO: cleanup sh->* 
-	 */
-
-	gf_log (this->name, GF_LOG_DEBUG,
-		"self heal of %s completed",
-		local->loc.path);
-
-	sh->completion_cbk (frame, this);
-
-	return 0;
+    afr_local_t *local = NULL;
+    struct afr_reply *replies = NULL;
+    int i = (long)cookie;
+
+    local = frame->local;
+    replies = local->replies;
+
+    replies[i].valid = 1;
+    replies[i].op_ret = op_ret;
+    replies[i].op_errno = op_errno;
+    if (xdata) {
+        replies[i].buf_has_zeroes = dict_get_str_boolean(
+            xdata, "buf-has-zeroes", _gf_false);
+        replies[i].fips_mode_rchecksum = dict_get_str_boolean(
+            xdata, "fips-mode-rchecksum", _gf_false);
+    }
+    if (strong) {
+        if (replies[i].fips_mode_rchecksum) {
+            memcpy(local->replies[i].checksum, strong, SHA256_DIGEST_LENGTH);
+        } else {
+            memcpy(local->replies[i].checksum, strong, MD5_DIGEST_LENGTH);
+        }
+    }
+
+    syncbarrier_wake(&local->barrier);
+    return 0;
 }
 
-
-int
-afr_sh_data_flush_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
-		       int32_t op_ret, int32_t op_errno)
+static gf_boolean_t
+__afr_can_skip_data_block_heal(call_frame_t *frame, xlator_t *this, fd_t *fd,
+                               int source, unsigned char *healed_sinks,
+                               off_t offset, size_t size, struct iatt *poststat)
 {
-	afr_local_t     *local = NULL;
-	afr_private_t   *priv  = NULL;
-	afr_self_heal_t *sh = NULL;
-	int              call_count = 0;
-
-	local = frame->local;
-	sh = &local->self_heal;
-	priv = this->private;
-
-	LOCK (&frame->lock);
-	{
-	}
-	UNLOCK (&frame->lock);
-
-	call_count = afr_frame_return (frame);
-
-	if (call_count == 0) {
-		fd_unref (sh->healing_fd);
-		sh->healing_fd = NULL;
-		afr_sh_data_done (frame, this);
-	}
-
-	return 0;
+    afr_private_t *priv = NULL;
+    afr_local_t *local = NULL;
+    unsigned char *wind_subvols = NULL;
+    gf_boolean_t checksum_match = _gf_true;
+    struct afr_reply *replies = NULL;
+    dict_t *xdata = NULL;
+    int i = 0;
+
+    priv = this->private;
+    local = frame->local;
+    replies = local->replies;
+
+    xdata = dict_new();
+    if (!xdata)
+        goto out;
+    if (dict_set_int32_sizen(xdata, "check-zero-filled", 1)) {
+        dict_unref(xdata);
+        goto out;
+    }
+
+    wind_subvols = alloca0(priv->child_count);
+    for (i = 0; i < priv->child_count; i++) {
+        if (i == source || healed_sinks[i])
+            wind_subvols[i] = 1;
+    }
+
+    AFR_ONLIST(wind_subvols, frame, __checksum_cbk, rchecksum, fd, offset, size,
+               xdata);
+    if (xdata)
+        dict_unref(xdata);
+
+    if (!replies[source].valid || replies[source].op_ret != 0)
+        return _gf_false;
+
+    for (i = 0; i < priv->child_count; i++) {
+        if (i == source)
+            continue;
+        if (replies[i].valid) {
+            if (memcmp(replies[source].checksum, replies[i].checksum,
+                       replies[source].fips_mode_rchecksum
+                           ? SHA256_DIGEST_LENGTH
+                           : MD5_DIGEST_LENGTH)) {
+                checksum_match = _gf_false;
+                break;
+            }
+        }
+    }
+
+    if (checksum_match) {
+        if (HAS_HOLES(poststat))
+            return _gf_true;
+
+        /* For non-sparse files, we might be better off writing the
+         * zeroes to sinks to avoid mismatch of disk-usage in bricks. */
+        if (local->replies[source].buf_has_zeroes)
+            return _gf_false;
+        else
+            return _gf_true;
+    }
+out:
+    return _gf_false;
 }
 
-
-int
-afr_sh_data_close (call_frame_t *frame, xlator_t *this)
+static gf_boolean_t
+__afr_is_sink_zero_filled(xlator_t *this, fd_t *fd, size_t size, off_t offset,
+                          int sink)
 {
-	afr_local_t     *local = NULL;
-	afr_private_t   *priv  = NULL;
-	afr_self_heal_t *sh  = NULL;
-	int              i = 0;
-	int              call_count = 0;
-
-
-	local = frame->local;
-	sh = &local->self_heal;
-	priv = this->private;
-
-	if (!sh->healing_fd) {
-		afr_sh_data_done (frame, this);
-		return 0;
-	}
-
-	call_count = sh->active_sinks + 1;
-	local->call_count = call_count;
-
-
-	/* closed source */
-	gf_log (this->name, GF_LOG_DEBUG,
-		"closing fd of %s on %s",
-		local->loc.path, priv->children[sh->source]->name);
-
-	STACK_WIND_COOKIE (frame, afr_sh_data_flush_cbk,
-			   (void *) (long) sh->source,
-			   priv->children[sh->source],
-			   priv->children[sh->source]->fops->flush,
-			   sh->healing_fd);
-	call_count--;
-
-	for (i = 0; i < priv->child_count; i++) {
-		if (sh->sources[i] || !local->child_up[i])
-			continue;
-
-		gf_log (this->name, GF_LOG_DEBUG,
-			"closing fd of %s on %s",
-			local->loc.path, priv->children[i]->name);
-
-		STACK_WIND_COOKIE (frame, afr_sh_data_flush_cbk,
-				   (void *) (long) i,
-				   priv->children[i],
-				   priv->children[i]->fops->flush,
-				   sh->healing_fd);
-		if (!--call_count)
-			break;
-	}
-
-	return 0;
+    afr_private_t *priv = NULL;
+    struct iobref *iobref = NULL;
+    struct iovec *iovec = NULL;
+    int count = 0;
+    int ret = 0;
+    gf_boolean_t zero_filled = _gf_false;
+
+    priv = this->private;
+    ret = syncop_readv(priv->children[sink], fd, size, offset, 0, &iovec,
+                       &count, &iobref, NULL, NULL, NULL);
+    if (ret < 0)
+        goto out;
+    ret = iov_0filled(iovec, count);
+    if (!ret)
+        zero_filled = _gf_true;
+out:
+    if (iovec)
+        GF_FREE(iovec);
+    if (iobref)
+        iobref_unref(iobref);
+    return zero_filled;
 }
 
-
-int
-afr_sh_data_unlck_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
-		       int32_t op_ret, int32_t op_errno)
+static int
+__afr_selfheal_data_read_write(call_frame_t *frame, xlator_t *this, fd_t *fd,
+                               int source, unsigned char *healed_sinks,
+                               off_t offset, size_t size,
+                               struct afr_reply *replies, int type)
 {
-	afr_local_t * local = NULL;
-	int           call_count = 0;
-	int           child_index = (long) cookie;
-
-	
-	local = frame->local;
-
-	LOCK (&frame->lock);
-	{
-		if (op_ret == -1) {
-			gf_log (this->name, GF_LOG_ERROR, 
-				"locking inode of %s on child %d failed: %s",
-				local->loc.path, child_index,
-				strerror (op_errno));
-		} else {
-			gf_log (this->name, GF_LOG_DEBUG,
-				"inode of %s on child %d locked",
-				local->loc.path, child_index);
-		}
-	}
-	UNLOCK (&frame->lock);
-
-	call_count = afr_frame_return (frame);
-
-	if (call_count == 0) {
-		afr_sh_data_close (frame, this);
-	}
-
-	return 0;
+    struct iovec *iovec = NULL;
+    int count = 0;
+    struct iobref *iobref = NULL;
+    int ret = 0;
+    int i = 0;
+    afr_private_t *priv = NULL;
+
+    priv = this->private;
+
+    ret = syncop_readv(priv->children[source], fd, size, offset, 0, &iovec,
+                       &count, &iobref, NULL, NULL, NULL);
+    if (ret <= 0)
+        return ret;
+
+    for (i = 0; i < priv->child_count; i++) {
+        if (!healed_sinks[i])
+            continue;
+
+            /*
+             * TODO: Use fiemap() and discard() to heal holes
+             * in the future.
+             *
+             * For now,
+             *
+             * - if the source had any holes at all,
+             * AND
+             * - if we are writing past the original file size
+             *   of the sink
+             * AND
+             * - is NOT the last block of the source file. if
+             *   the block contains EOF, it has to be written
+             *   in order to set the file size even if the
+             *   last block is 0-filled.
+             * AND
+             * - if the read buffer is filled with only 0's
+             *
+             * then, skip writing to this source. We don't depend
+             * on the write to happen to update the size as we
+             * have performed an ftruncate() upfront anyways.
+             */
+#define is_last_block(o, b, s) ((s >= o) && (s <= (o + b)))
+        if (HAS_HOLES((&replies[source].poststat)) &&
+            offset >= replies[i].poststat.ia_size &&
+            !is_last_block(offset, size, replies[source].poststat.ia_size) &&
+            (iov_0filled(iovec, count) == 0))
+            continue;
+
+        /* Avoid filling up sparse regions of the sink with 0-filled
+         * writes.*/
+        if (type == AFR_SELFHEAL_DATA_FULL &&
+            HAS_HOLES((&replies[source].poststat)) &&
+            ((offset + size) <= replies[i].poststat.ia_size) &&
+            (iov_0filled(iovec, count) == 0) &&
+            __afr_is_sink_zero_filled(this, fd, size, offset, i)) {
+            continue;
+        }
+
+        ret = syncop_writev(priv->children[i], fd, iovec, count, offset, iobref,
+                            0, NULL, NULL, NULL, NULL);
+        if (ret != iov_length(iovec, count)) {
+            /* write() failed on this sink. unset the corresponding
+               member in sinks[] (which is healed_sinks[] in the
+               caller) so that this server does NOT get considered
+               as successfully healed.
+            */
+            healed_sinks[i] = 0;
+        }
+    }
+    if (iovec)
+        GF_FREE(iovec);
+    if (iobref)
+        iobref_unref(iobref);
+
+    return ret;
 }
 
-
-int
-afr_sh_data_unlock (call_frame_t *frame, xlator_t *this)
+static gf_boolean_t
+afr_source_sinks_locked(xlator_t *this, unsigned char *locked_on, int source,
+                        unsigned char *healed_sinks)
 {
-	struct flock flock;			
-	int i = 0;				
-	int call_count = 0;		     
-
-	afr_local_t *   local = NULL;
-	afr_private_t * priv  = NULL;
-	afr_self_heal_t * sh  = NULL;
-
+    afr_private_t *priv = this->private;
+    int i = 0;
 
-	local = frame->local;
-	sh = &local->self_heal;
-	priv = this->private;
+    if (!locked_on[source])
+        return _gf_false;
 
-	call_count = local->child_count;
+    for (i = 0; i < priv->child_count; i++) {
+        if (healed_sinks[i] && locked_on[i])
+            return _gf_true;
+    }
 
-	local->call_count = call_count;		
-
-	flock.l_start = 0;
-	flock.l_len   = 0;
-	flock.l_type  = F_UNLCK;
-
-	for (i = 0; i < priv->child_count; i++) {
-		if (local->child_up[i]) {
-			gf_log (this->name, GF_LOG_DEBUG,
-				"unlocking %s on subvolume %s",
-				local->loc.path, priv->children[i]->name);
-
-			STACK_WIND_COOKIE (frame, afr_sh_data_unlck_cbk,
-					   (void *) (long) i,
-					   priv->children[i], 
-					   priv->children[i]->fops->inodelk,
-					   &local->loc, F_SETLK, &flock); 
-			if (!--call_count)
-				break;
-		}
-	}
-
-	return 0;
+    return _gf_false;
 }
 
-
-int
-afr_sh_data_finish (call_frame_t *frame, xlator_t *this)
+static int
+afr_selfheal_data_block(call_frame_t *frame, xlator_t *this, fd_t *fd,
+                        int source, unsigned char *healed_sinks, off_t offset,
+                        size_t size, int type, struct afr_reply *replies)
 {
-	afr_local_t   *local = NULL;
-
-	local = frame->local;
-
-	gf_log (this->name, GF_LOG_DEBUG,
-		"finishing data selfheal of %s", local->loc.path);
-
-	afr_sh_data_unlock (frame, this);
-
-	return 0;
+    int ret = -1;
+    afr_private_t *priv = NULL;
+    unsigned char *data_lock = NULL;
+
+    priv = this->private;
+    data_lock = alloca0(priv->child_count);
+
+    ret = afr_selfheal_inodelk(frame, this, fd->inode, this->name, offset, size,
+                               data_lock);
+    {
+        if (!afr_source_sinks_locked(this, data_lock, source, healed_sinks)) {
+            ret = -ENOTCONN;
+            goto unlock;
+        }
+
+        if (type == AFR_SELFHEAL_DATA_DIFF &&
+            __afr_can_skip_data_block_heal(frame, this, fd, source,
+                                           healed_sinks, offset, size,
+                                           &replies[source].poststat)) {
+            ret = 0;
+            goto unlock;
+        }
+
+        ret = __afr_selfheal_data_read_write(
+            frame, this, fd, source, healed_sinks, offset, size, replies, type);
+    }
+unlock:
+    afr_selfheal_uninodelk(frame, this, fd->inode, this->name, offset, size,
+                           data_lock);
+    return ret;
 }
 
-
-int
-afr_sh_data_erase_pending_cbk (call_frame_t *frame, void *cookie,
-			       xlator_t *this, int32_t op_ret,
-			       int32_t op_errno, dict_t *xattr)
+static int
+afr_selfheal_data_fsync(call_frame_t *frame, xlator_t *this, fd_t *fd,
+                        unsigned char *healed_sinks)
 {
-	afr_local_t     *local = NULL;
-	afr_self_heal_t *sh = NULL;
-	afr_private_t   *priv = NULL;
-	int             call_count = 0;
+    afr_local_t *local = NULL;
+    afr_private_t *priv = NULL;
+    int i = 0;
 
-	local = frame->local;
-	sh = &local->self_heal;
-	priv = this->private;
+    local = frame->local;
+    priv = this->private;
 
-	LOCK (&frame->lock);
-	{
-	}
-	UNLOCK (&frame->lock);
+    if (!priv->ensure_durability)
+        return 0;
 
-	call_count = afr_frame_return (frame);
+    AFR_ONLIST(healed_sinks, frame, afr_sh_generic_fop_cbk, fsync, fd, 0, NULL);
 
-	if (call_count == 0)
-		afr_sh_data_finish (frame, this);
-
-	return 0;
+    for (i = 0; i < priv->child_count; i++)
+        if (healed_sinks[i] && local->replies[i].op_ret != 0)
+            /* fsync() failed. Do NOT consider this server
+               as successfully healed. Mark it so.
+            */
+            healed_sinks[i] = 0;
+    return 0;
 }
 
-
-int
-afr_sh_data_erase_pending (call_frame_t *frame, xlator_t *this)
+static int
+afr_data_self_heal_type_get(afr_private_t *priv, unsigned char *healed_sinks,
+                            int source, struct afr_reply *replies)
 {
-	afr_local_t     *local = NULL;
-	afr_self_heal_t *sh = NULL;
-	afr_private_t   *priv = NULL;
-	int              call_count = 0;
-	int              i = 0;
-	dict_t          **erase_xattr = NULL;
-
-
-	local = frame->local;
-	sh = &local->self_heal;
-	priv = this->private;
-
-
-	afr_sh_pending_to_delta (sh->pending_matrix, sh->delta_matrix,
-				 sh->success, priv->child_count);
-
-	erase_xattr = CALLOC (sizeof (*erase_xattr), priv->child_count);
-
-	for (i = 0; i < priv->child_count; i++) {
-		if (sh->xattr[i]) {
-			call_count++;
-
-			erase_xattr[i] = get_new_dict();
-			dict_ref (erase_xattr[i]);
-		}
-	}
-
-	afr_sh_delta_to_xattr (sh->delta_matrix, erase_xattr,
-			       priv->child_count, AFR_DATA_PENDING);
-
-	local->call_count = call_count;
-	for (i = 0; i < priv->child_count; i++) {
-		if (!erase_xattr[i])
-			continue;
-
-		gf_log (this->name, GF_LOG_DEBUG,
-			"erasing pending flags from %s on %s",
-			local->loc.path, priv->children[i]->name);
-
-		STACK_WIND_COOKIE (frame, afr_sh_data_erase_pending_cbk,
-				   (void *) (long) i,
-				   priv->children[i],
-				   priv->children[i]->fops->xattrop,
-				   &local->loc,
-				   GF_XATTROP_ADD_ARRAY, erase_xattr[i]);
-		if (!--call_count)
-			break;
-	}
-
-	for (i = 0; i < priv->child_count; i++) {
-		if (erase_xattr[i]) {
-			dict_unref (erase_xattr[i]);
-		}
-	}
-	FREE (erase_xattr);
-
-	return 0;
+    int type = AFR_SELFHEAL_DATA_FULL;
+    int i = 0;
+
+    if (priv->data_self_heal_algorithm == AFR_SELFHEAL_DATA_DYNAMIC) {
+        type = AFR_SELFHEAL_DATA_FULL;
+        for (i = 0; i < priv->child_count; i++) {
+            if (!healed_sinks[i] && i != source)
+                continue;
+            if (replies[i].poststat.ia_size) {
+                type = AFR_SELFHEAL_DATA_DIFF;
+                break;
+            }
+        }
+    } else {
+        type = priv->data_self_heal_algorithm;
+    }
+    return type;
 }
 
-
-int
-afr_sh_data_trim_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
-		      int32_t op_ret, int32_t op_errno, struct stat *buf)
+static int
+afr_selfheal_data_do(call_frame_t *frame, xlator_t *this, fd_t *fd, int source,
+                     unsigned char *healed_sinks, struct afr_reply *replies)
 {
-	afr_private_t * priv = NULL;
-	afr_local_t * local  = NULL;
-	afr_self_heal_t *sh  = NULL;
-	int              call_count = 0;
-	int              child_index = 0;
-
-	priv = this->private;
-	local = frame->local;
-	sh = &local->self_heal;
-
-	child_index = (long) cookie;
-
-	LOCK (&frame->lock);
-	{
-		if (op_ret == -1)
-			gf_log (this->name, GF_LOG_ERROR,
-				"ftruncate of %s on subvolume %s failed (%s)",
-				local->loc.path,
-				priv->children[child_index]->name,
-				strerror (op_errno));
-		else
-			gf_log (this->name, GF_LOG_DEBUG,
-				"ftruncate of %s on subvolume %s completed",
-				local->loc.path,
-				priv->children[child_index]->name);
-	}
-	UNLOCK (&frame->lock);
-
-	call_count = afr_frame_return (frame);
-
-	if (call_count == 0) {
-		afr_sh_data_erase_pending (frame, this);
-	}
-
-	return 0;
-}
-
-
-int
-afr_sh_data_trim_sinks (call_frame_t *frame, xlator_t *this)
-{
-	afr_private_t * priv = NULL;
-	afr_local_t * local  = NULL;
-	afr_self_heal_t *sh  = NULL;
-	int             *sources = NULL;
-	int              call_count = 0;
-	int              i = 0;
-
-
-	priv = this->private;
-	local = frame->local;
-	sh = &local->self_heal;
-
-	sources = sh->sources;
-	call_count = sh->active_sinks;
-
-	local->call_count = call_count;
-
-	for (i = 0; i < priv->child_count; i++) {
-		if (sources[i] || !local->child_up[i])
-			continue;
-
-		STACK_WIND_COOKIE (frame, afr_sh_data_trim_cbk,
-				   (void *) (long) i,
-				   priv->children[i], 
-				   priv->children[i]->fops->ftruncate,
-				   sh->healing_fd, sh->file_size); 
+    afr_private_t *priv = NULL;
+    off_t off = 0;
+    size_t block = 0;
+    int type = AFR_SELFHEAL_DATA_FULL;
+    int ret = -1;
+    call_frame_t *iter_frame = NULL;
+    unsigned char arbiter_sink_status = 0;
+
+    gf_msg(this->name, GF_LOG_INFO, 0, AFR_MSG_SELF_HEAL_INFO,
+           "performing data selfheal on %s", uuid_utoa(fd->inode->gfid));
+
+    priv = this->private;
+    if (priv->arbiter_count) {
+        arbiter_sink_status = healed_sinks[ARBITER_BRICK_INDEX];
+        healed_sinks[ARBITER_BRICK_INDEX] = 0;
+    }
+
+    block = 128 * 1024 * priv->data_self_heal_window_size;
+
+    type = afr_data_self_heal_type_get(priv, healed_sinks, source, replies);
+
+    iter_frame = afr_copy_frame(frame);
+    if (!iter_frame) {
+        ret = -ENOMEM;
+        goto out;
+    }
+
+    for (off = 0; off < replies[source].poststat.ia_size; off += block) {
+        if (AFR_COUNT(healed_sinks, priv->child_count) == 0) {
+            ret = -ENOTCONN;
+            goto out;
+        }
+
+        ret = afr_selfheal_data_block(iter_frame, this, fd, source,
+                                      healed_sinks, off, block, type, replies);
+        if (ret < 0)
+            goto out;
+
+        AFR_STACK_RESET(iter_frame);
+        if (iter_frame->local == NULL) {
+            ret = -ENOTCONN;
+            goto out;
+        }
+    }
+
+    ret = afr_selfheal_data_fsync(frame, this, fd, healed_sinks);
 
-		if (!--call_count)
-			break;
-	}
+out:
+    if (arbiter_sink_status)
+        healed_sinks[ARBITER_BRICK_INDEX] = arbiter_sink_status;
 
-	return 0;
+    if (iter_frame)
+        AFR_STACK_DESTROY(iter_frame);
+    return ret;
 }
 
-
-int
-afr_sh_data_read_write_iter (call_frame_t *frame, xlator_t *this);
-
-int
-afr_sh_data_write_cbk (call_frame_t *frame, void *cookie, xlator_t *this, 
-		       int32_t op_ret, int32_t op_errno, struct stat *buf)
+static int
+__afr_selfheal_truncate_sinks(call_frame_t *frame, xlator_t *this, fd_t *fd,
+                              unsigned char *healed_sinks, uint64_t size)
 {
-	afr_private_t * priv = NULL;
-	afr_local_t * local  = NULL;
-	afr_self_heal_t *sh  = NULL;
-
-	int child_index = (long) cookie;
-	int call_count = 0;
-
-	priv = this->private;
-	local = frame->local;
-	sh = &local->self_heal;
-
-	gf_log (this->name, GF_LOG_DEBUG, 
-		"wrote %d bytes of data from %s to child %d, offset %"PRId64"", 
-		op_ret, local->loc.path, child_index, sh->offset - op_ret);
-
-	LOCK (&frame->lock);
-	{
-		if (op_ret == -1) {
-			gf_log (this->name, GF_LOG_ERROR,
-				"write to %s failed on subvolume %s (%s)",
-				local->loc.path,
-				priv->children[child_index]->name,
-				strerror (op_errno));
-			sh->op_failed = 1;
-		}
-	}
-	UNLOCK (&frame->lock);
-
-	call_count = afr_frame_return (frame);
-
-	if (call_count == 0) {
-		afr_sh_data_read_write_iter (frame, this);
-	}
-
-	return 0;
+    afr_local_t *local = NULL;
+    afr_private_t *priv = NULL;
+    int i = 0;
+
+    local = frame->local;
+    priv = this->private;
+
+    /* This will send truncate on the arbiter brick as well if it is marked as
+     * sink. If changelog is enabled on the volume it captures truncate as a
+     * data transaction on the arbiter brick. This will help geo-rep to
+     * properly sync the data from master to slave if arbiter is the ACTIVE
+     * brick during syncing and which had got some entries healed for data as
+     * part of self heal.
+     */
+    AFR_ONLIST(healed_sinks, frame, afr_sh_generic_fop_cbk, ftruncate, fd, size,
+               NULL);
+
+    for (i = 0; i < priv->child_count; i++)
+        if (healed_sinks[i] && local->replies[i].op_ret == -1)
+            /* truncate() failed. Do NOT consider this server
+               as successfully healed. Mark it so.
+            */
+            healed_sinks[i] = 0;
+
+    return 0;
 }
 
-
-int
-afr_sh_data_read_cbk (call_frame_t *frame, void *cookie,
-		      xlator_t *this, int32_t op_ret, int32_t op_errno,
-		      struct iovec *vector, int32_t count, struct stat *buf)
+gf_boolean_t
+afr_has_source_witnesses(xlator_t *this, unsigned char *sources,
+                         uint64_t *witness)
 {
-	afr_private_t * priv = NULL;
-	afr_local_t * local  = NULL;
-	afr_self_heal_t *sh  = NULL;
-
-	int child_index = (long) cookie;
-	int i = 0;
-	int call_count = 0;
-
-	off_t offset;
+    int i = 0;
+    afr_private_t *priv = NULL;
 
-	priv = this->private;
-	local = frame->local;
-	sh = &local->self_heal;
+    priv = this->private;
 
-	call_count = sh->active_sinks;
+    for (i = 0; i < priv->child_count; i++) {
+        if (sources[i] && witness[i])
+            return _gf_true;
+    }
+    return _gf_false;
+}
 
-	local->call_count = call_count;
+static gf_boolean_t
+afr_does_size_mismatch(xlator_t *this, unsigned char *sources,
+                       struct afr_reply *replies)
+{
+    int i = 0;
+    afr_private_t *priv = NULL;
+    struct iatt *min = NULL;
+    struct iatt *max = NULL;
 
-	gf_log (this->name, GF_LOG_DEBUG, 
-		"read %d bytes of data from %s on child %d, offset %"PRId64"",
-		op_ret, local->loc.path, child_index, sh->offset);
+    priv = this->private;
 
-	if (op_ret <= 0) {
-		afr_sh_data_trim_sinks (frame, this);
-		return 0;
-	}
+    for (i = 0; i < priv->child_count; i++) {
+        if (!replies[i].valid)
+            continue;
 
-	/* what if we read less than block size? */
-	offset = sh->offset;
-	sh->offset += op_ret;
+        if (replies[i].op_ret < 0)
+            continue;
 
-	frame->root->req_refs = frame->root->rsp_refs;
+        if (!sources[i])
+            continue;
 
-	if (sh->file_has_holes) {
-		if (iov_0filled (vector, count) == 0) {
-			/* the iter function depends on the
-			   sh->offset already being updated 
-			   above
-			*/
-			afr_sh_data_read_write_iter (frame, this);
-			goto out;
-		}
-	}
+        if (AFR_IS_ARBITER_BRICK(priv, i) && (replies[i].poststat.ia_size == 0))
+            continue;
 
-	for (i = 0; i < priv->child_count; i++) {
-		if (sh->sources[i] || !local->child_up[i])
-			continue;
+        if (!min)
+            min = &replies[i].poststat;
 
-		/* this is a sink, so write to it */
-		STACK_WIND_COOKIE (frame, afr_sh_data_write_cbk,
-				   (void *) (long) i,
-				   priv->children[i],
-				   priv->children[i]->fops->writev,
-				   sh->healing_fd, vector, count, offset);
+        if (!max)
+            max = &replies[i].poststat;
 
-		if (!--call_count)
-			break;
-	}
+        if (min->ia_size > replies[i].poststat.ia_size)
+            min = &replies[i].poststat;
 
-out:
-	return 0;
-}
+        if (max->ia_size < replies[i].poststat.ia_size)
+            max = &replies[i].poststat;
+    }
 
+    if (min && max) {
+        if (min->ia_size != max->ia_size)
+            return _gf_true;
+    }
 
-int
-afr_sh_data_read_write (call_frame_t *frame, xlator_t *this)
-{
-	afr_private_t * priv = NULL;
-	afr_local_t * local  = NULL;
-	afr_self_heal_t *sh  = NULL;
-
-	priv = this->private;
-	local = frame->local;
-	sh = &local->self_heal;
-
-	STACK_WIND_COOKIE (frame, afr_sh_data_read_cbk,
-			   (void *) (long) sh->source,
-			   priv->children[sh->source],
-			   priv->children[sh->source]->fops->readv,
-			   sh->healing_fd, sh->block_size,
-			   sh->offset);
-
-	return 0;
+    return _gf_false;
 }
 
-
-int
-afr_sh_data_read_write_iter (call_frame_t *frame, xlator_t *this)
+static void
+afr_mark_biggest_witness_as_source(xlator_t *this, unsigned char *sources,
+                                   uint64_t *witness)
 {
-	afr_private_t * priv = NULL;
-	afr_local_t * local  = NULL;
-	afr_self_heal_t *sh  = NULL;
-
-	priv = this->private;
-	local = frame->local;
-	sh = &local->self_heal;
-
-	if (sh->op_failed) {
-		afr_sh_data_finish (frame, this);
-		goto out;
-	}
-
-	if (sh->offset >= sh->file_size) {
-		gf_log (this->name, GF_LOG_DEBUG, 
-			"closing fd's of %s",
-			local->loc.path);
-		afr_sh_data_trim_sinks (frame, this);
-
-		goto out;
-	}
-
-	afr_sh_data_read_write (frame, this);
-
-out:
-	return 0;
+    int i = 0;
+    afr_private_t *priv = NULL;
+    uint64_t biggest_witness = 0;
+
+    priv = this->private;
+    /* Find source with biggest witness count */
+    for (i = 0; i < priv->child_count; i++) {
+        if (!sources[i])
+            continue;
+        if (biggest_witness < witness[i])
+            biggest_witness = witness[i];
+    }
+
+    /* Mark files with less witness count as not source */
+    for (i = 0; i < priv->child_count; i++) {
+        if (!sources[i])
+            continue;
+        if (witness[i] < biggest_witness)
+            sources[i] = 0;
+    }
+
+    return;
 }
 
-
-int
-afr_sh_data_open_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
-		      int32_t op_ret, int32_t op_errno, fd_t *fd)
+/* This is a tie-breaker function. Only one source is assigned here. */
+static void
+afr_mark_newest_file_as_source(xlator_t *this, unsigned char *sources,
+                               struct afr_reply *replies)
 {
-	afr_local_t     *local = NULL;
-	afr_self_heal_t *sh = NULL;
-	afr_private_t   *priv = NULL;
-	int              call_count = 0;
-	int              child_index = 0;
-
-	local = frame->local;
-	sh = &local->self_heal;
-	priv = this->private;
-
-	child_index = (long) cookie;
-
-	/* TODO: some of the open's might fail.
-	   In that case, modify cleanup fn to send flush on those 
-	   fd's which are already open */
-
-	LOCK (&frame->lock);
-	{
-		if (op_ret == -1) {
-			gf_log (this->name, GF_LOG_ERROR,
-				"open of %s failed on child %s (%s)",
-				local->loc.path,
-				priv->children[child_index]->name,
-				strerror (op_errno));
-			sh->op_failed = 1;
-		}
-
-	}
-	UNLOCK (&frame->lock);
-
-	call_count = afr_frame_return (frame);
-
-	if (call_count == 0) {
-		if (sh->op_failed) {
-			afr_sh_data_finish (frame, this);
-			return 0;
-		}
-		gf_log (this->name, GF_LOG_DEBUG,
-			"fd for %s opened, commencing sync",
-			local->loc.path);
-
-		gf_log (this->name, GF_LOG_WARNING,
-			"sourcing file %s from %s to other sinks",
-			local->loc.path, priv->children[sh->source]->name);
-
-		afr_sh_data_read_write (frame, this);
-	}
-
-	return 0;
+    int i = 0;
+    afr_private_t *priv = NULL;
+    int source = -1;
+    uint32_t max_ctime = 0;
+
+    priv = this->private;
+    /* Find source with latest ctime */
+    for (i = 0; i < priv->child_count; i++) {
+        if (!sources[i])
+            continue;
+
+        if (max_ctime <= replies[i].poststat.ia_ctime) {
+            source = i;
+            max_ctime = replies[i].poststat.ia_ctime;
+        }
+    }
+
+    /* Only mark one of the files as source to break ties */
+    memset(sources, 0, sizeof(*sources) * priv->child_count);
+    sources[source] = 1;
 }
 
-
-int
-afr_sh_data_open (call_frame_t *frame, xlator_t *this)
+static int
+__afr_selfheal_data_finalize_source(
+    call_frame_t *frame, xlator_t *this, inode_t *inode, unsigned char *sources,
+    unsigned char *sinks, unsigned char *healed_sinks, unsigned char *locked_on,
+    unsigned char *undid_pending, struct afr_reply *replies, uint64_t *witness)
 {
-	int i = 0;				
-	int call_count = 0;		     
-
-	int source = -1;
-	int *sources = NULL;
-
-	fd_t *fd = NULL;
-
-	afr_local_t *   local = NULL;
-	afr_private_t * priv  = NULL;
-	afr_self_heal_t *sh = NULL;
-
-	local = frame->local;
-	sh = &local->self_heal;
-	priv = this->private;
-
-	call_count = sh->active_sinks + 1;
-	local->call_count = call_count;
-
-	fd = fd_create (local->loc.inode, frame->root->pid);
-	sh->healing_fd = fd;
-
-	source  = local->self_heal.source;
-	sources = local->self_heal.sources;
-
-	sh->block_size = 65536;
-	sh->file_size  = sh->buf[source].st_size;
-
-	if (FILE_HAS_HOLES (&sh->buf[source]))
-		sh->file_has_holes = 1;
+    afr_private_t *priv = NULL;
+    int source = -1;
+    int sources_count = 0;
+    priv = this->private;
+
+    sources_count = AFR_COUNT(sources, priv->child_count);
+
+    if ((AFR_CMP(locked_on, healed_sinks, priv->child_count) == 0) ||
+        !sources_count) {
+        /* split brain */
+        source = afr_mark_split_brain_source_sinks(
+            frame, this, inode, sources, sinks, healed_sinks, locked_on,
+            replies, AFR_DATA_TRANSACTION);
+        if (source < 0) {
+            gf_event(EVENT_AFR_SPLIT_BRAIN,
+                     "client-pid=%d;"
+                     "subvol=%s;type=data;"
+                     "file=%s",
+                     this->ctx->cmd_args.client_pid, this->name,
+                     uuid_utoa(inode->gfid));
+            return -EIO;
+        }
+
+        _afr_fav_child_reset_sink_xattrs(
+            frame, this, inode, source, healed_sinks, undid_pending,
+            AFR_DATA_TRANSACTION, locked_on, replies);
+        goto out;
+    }
+
+    /* No split brain at this point. If we were called from
+     * afr_heal_splitbrain_file(), abort.*/
+    if (afr_dict_contains_heal_op(frame))
+        return -EIO;
+
+    /* If there are no witnesses/size-mismatches on sources, we are done. */
+    if (!afr_does_size_mismatch(this, sources, replies) &&
+        !afr_has_source_witnesses(this, sources, witness))
+        goto out;
+
+    afr_mark_largest_file_as_source(this, sources, replies);
+    afr_mark_biggest_witness_as_source(this, sources, witness);
+    afr_mark_newest_file_as_source(this, sources, replies);
+    if (priv->arbiter_count)
+        /* Choose non-arbiter brick as source for empty files. */
+        afr_mark_source_sinks_if_file_empty(this, sources, sinks, healed_sinks,
+                                            locked_on, replies,
+                                            AFR_DATA_TRANSACTION);
 
-	/* open source */
-	STACK_WIND_COOKIE (frame, afr_sh_data_open_cbk,
-			   (void *) (long) source,
-			   priv->children[source],
-			   priv->children[source]->fops->open,
-			   &local->loc, O_RDONLY|O_LARGEFILE, fd);
-	call_count--;
-
-	/* open sinks */
-	for (i = 0; i < priv->child_count; i++) {
-		if(sources[i] || !local->child_up[i])
-			continue;
-
-		STACK_WIND_COOKIE (frame, afr_sh_data_open_cbk,
-				   (void *) (long) i,
-				   priv->children[i], 
-				   priv->children[i]->fops->open,
-				   &local->loc, 
-				   O_WRONLY|O_LARGEFILE, fd); 
-
-		if (!--call_count)
-			break;
-	}
+out:
+    afr_mark_active_sinks(this, sources, locked_on, healed_sinks);
+    source = afr_choose_source_by_policy(priv, sources, AFR_DATA_TRANSACTION);
 
-	return 0;
+    return source;
 }
 
-
+/*
+ * __afr_selfheal_data_prepare:
+ *
+ * This function inspects the on-disk xattrs and determines which subvols
+ * are sources and sinks.
+ *
+ * The return value is the index of the subvolume to be used as the source
+ * for self-healing, or an error code (-EIO if no source can be finalized).
+ */
 int
-afr_sh_data_sync_prepare (call_frame_t *frame, xlator_t *this)
+__afr_selfheal_data_prepare(call_frame_t *frame, xlator_t *this, inode_t *inode,
+                            unsigned char *locked_on, unsigned char *sources,
+                            unsigned char *sinks, unsigned char *healed_sinks,
+                            unsigned char *undid_pending,
+                            struct afr_reply *replies, unsigned char *pflag)
 {
-	afr_local_t     *local = NULL;
-	afr_self_heal_t *sh = NULL;
-	afr_private_t   *priv = NULL;
-	int              active_sinks = 0;
-	int              source = 0;
-	int              i = 0;
-
-	local = frame->local;
-	sh = &local->self_heal;
-	priv = this->private;
-
-	source = sh->source;
-
-	for (i = 0; i < priv->child_count; i++) {
-		if (sh->sources[i] == 0 && local->child_up[i] == 1) {
-			active_sinks++;
-			sh->success[i] = 1;
-		}
-	}
-	sh->success[source] = 1;
-
-	if (active_sinks == 0) {
-		gf_log (this->name, GF_LOG_DEBUG,
-			"no active sinks for performing self-heal on file %s",
-			local->loc.path);
-		afr_sh_data_finish (frame, this);
-		return 0;
-	}
-	sh->active_sinks = active_sinks;
-
-	gf_log (this->name, GF_LOG_DEBUG,
-		"syncing data of %s from subvolume %s to %d active sinks",
-		local->loc.path, priv->children[source]->name, active_sinks);
-
-	afr_sh_data_open (frame, this);
-
-	return 0;
+    int ret = -1;
+    int source = -1;
+    afr_private_t *priv = NULL;
+    uint64_t *witness = NULL;
+
+    priv = this->private;
+
+    ret = afr_selfheal_unlocked_discover(frame, inode, inode->gfid, replies);
+
+    if (ret)
+        return ret;
+
+    witness = alloca0(priv->child_count * sizeof(*witness));
+    ret = afr_selfheal_find_direction(frame, this, replies,
+                                      AFR_DATA_TRANSACTION, locked_on, sources,
+                                      sinks, witness, pflag);
+    if (ret)
+        return ret;
+
+    /* Initialize the healed_sinks[] array optimistically to
+       the intersection of to-be-healed (i.e sinks[]) and
+       the list of servers which are up (i.e locked_on[]).
+       As we encounter failures in the healing process, we
+       will unmark the respective servers in the healed_sinks[]
+       array.
+    */
+    AFR_INTERSECT(healed_sinks, sinks, locked_on, priv->child_count);
+
+    source = __afr_selfheal_data_finalize_source(
+        frame, this, inode, sources, sinks, healed_sinks, locked_on,
+        undid_pending, replies, witness);
+    if (source < 0)
+        return -EIO;
+
+    return source;
 }
 
-
-int
-afr_sh_data_fix (call_frame_t *frame, xlator_t *this)
+static int
+__afr_selfheal_data(call_frame_t *frame, xlator_t *this, fd_t *fd,
+                    unsigned char *locked_on)
 {
-	afr_local_t     *local = NULL;
-	afr_self_heal_t *sh = NULL;
-	afr_private_t   *priv = NULL;
-	int              nsources = 0;
-	int              source = 0;
-	int              i = 0;
-
-	local = frame->local;
-	sh = &local->self_heal;
-	priv = this->private;
-
-	afr_sh_build_pending_matrix (sh->pending_matrix, sh->xattr, 
-				     priv->child_count, AFR_DATA_PENDING);
-
-	afr_sh_print_pending_matrix (sh->pending_matrix, this);
-
-
-	afr_sh_mark_sources (sh->pending_matrix, sh->sources, 
-			     priv->child_count);
-
-	afr_sh_supress_empty_children (sh->sources, sh->xattr, sh->buf,
-				       priv->child_count, AFR_DATA_PENDING);
-
-	afr_sh_supress_errenous_children (sh->sources, sh->child_errno,
-					  priv->child_count);
-
-	nsources = afr_sh_source_count (sh->sources, priv->child_count);
-
-	if ((nsources == 0)
-	    && (priv->favorite_child != -1)
-	    && (sh->child_errno[priv->favorite_child] == 0)) {
-
-		gf_log (this->name, GF_LOG_WARNING,
-			"Picking favorite child %s as authentic source to resolve conflicting data of %s",
-			priv->children[priv->favorite_child]->name,
-			local->loc.path);
-
-		sh->sources[priv->favorite_child] = 1;
-
-		nsources = afr_sh_source_count (sh->sources,
-						priv->child_count);
-	}
-
-	if (nsources == 0) {
-		gf_log (this->name, GF_LOG_ERROR,
-			"Unable to resolve conflicting data of %s. "
-			"Please resolve manually by deleting the file %s "
-			"from all but the preferred subvolume. "
-			"Please consider 'option favorite-child <>'",
-			local->loc.path, local->loc.path);
-
-		local->govinda_gOvinda = 1;
-
-		afr_sh_data_finish (frame, this);
-		return 0;
-	}
-
-	source = afr_sh_select_source (sh->sources, priv->child_count);
-	sh->source = source;
-
-	/* detect changes not visible through pending flags -- JIC */
-	for (i = 0; i < priv->child_count; i++) {
-		if (i == source || sh->child_errno[i])
-			continue;
+    afr_private_t *priv = NULL;
+    int ret = -1;
+    unsigned char *sources = NULL;
+    unsigned char *sinks = NULL;
+    unsigned char *data_lock = NULL;
+    unsigned char *healed_sinks = NULL;
+    unsigned char *undid_pending = NULL;
+    struct afr_reply *locked_replies = NULL;
+    int source = -1;
+    gf_boolean_t did_sh = _gf_true;
+    gf_boolean_t is_arbiter_the_only_sink = _gf_false;
+    gf_boolean_t empty_file = _gf_false;
+
+    priv = this->private;
+
+    sources = alloca0(priv->child_count);
+    sinks = alloca0(priv->child_count);
+    healed_sinks = alloca0(priv->child_count);
+    data_lock = alloca0(priv->child_count);
+    undid_pending = alloca0(priv->child_count);
+
+    locked_replies = alloca0(sizeof(*locked_replies) * priv->child_count);
+
+    ret = afr_selfheal_inodelk(frame, this, fd->inode, this->name, 0, 0,
+                               data_lock);
+    {
+        if (ret < priv->child_count) {
+            gf_msg_debug(this->name, 0,
+                         "%s: Skipping "
+                         "self-heal as only %d number "
+                         "of subvolumes "
+                         "could be locked",
+                         uuid_utoa(fd->inode->gfid), ret);
+            ret = -ENOTCONN;
+            goto unlock;
+        }
+
+        ret = __afr_selfheal_data_prepare(frame, this, fd->inode, data_lock,
+                                          sources, sinks, healed_sinks,
+                                          undid_pending, locked_replies, NULL);
+        if (ret < 0)
+            goto unlock;
+
+        if (AFR_COUNT(healed_sinks, priv->child_count) == 0) {
+            did_sh = _gf_false;
+            goto unlock;
+        }
+
+        source = ret;
+
+        if (AFR_IS_ARBITER_BRICK(priv, source)) {
+            empty_file = afr_is_file_empty_on_all_children(priv,
+                                                           locked_replies);
+            if (empty_file)
+                goto restore_time;
+
+            did_sh = _gf_false;
+            goto unlock;
+        }
+
+        ret = __afr_selfheal_truncate_sinks(
+            frame, this, fd, healed_sinks,
+            locked_replies[source].poststat.ia_size);
+        if (ret < 0)
+            goto unlock;
+
+        if (priv->arbiter_count &&
+            AFR_COUNT(healed_sinks, priv->child_count) == 1 &&
+            healed_sinks[ARBITER_BRICK_INDEX]) {
+            is_arbiter_the_only_sink = _gf_true;
+            goto restore_time;
+        }
+        ret = 0;
+    }
+unlock:
+    afr_selfheal_uninodelk(frame, this, fd->inode, this->name, 0, 0, data_lock);
+    if (ret < 0)
+        goto out;
+
+    if (!did_sh)
+        goto out;
+
+    ret = afr_selfheal_data_do(frame, this, fd, source, healed_sinks,
+                               locked_replies);
+    if (ret)
+        goto out;
+restore_time:
+    afr_selfheal_restore_time(frame, this, fd->inode, source, healed_sinks,
+                              locked_replies);
+
+    if (!is_arbiter_the_only_sink && !empty_file) {
+        ret = afr_selfheal_inodelk(frame, this, fd->inode, this->name, 0, 0,
+                                   data_lock);
+        if (ret < priv->child_count) {
+            ret = -ENOTCONN;
+            did_sh = _gf_false;
+            goto skip_undo_pending;
+        }
+    }
+    ret = afr_selfheal_undo_pending(
+        frame, this, fd->inode, sources, sinks, healed_sinks, undid_pending,
+        AFR_DATA_TRANSACTION, locked_replies, data_lock);
+skip_undo_pending:
+    afr_selfheal_uninodelk(frame, this, fd->inode, this->name, 0, 0, data_lock);
+out:
 
-		if (SIZE_DIFFERS (&sh->buf[i], &sh->buf[source]))
-			sh->sources[i] = 0;
-	}
+    if (did_sh)
+        afr_log_selfheal(fd->inode->gfid, this, ret, "data", source, sources,
+                         healed_sinks);
+    else
+        ret = 1;
 
-	afr_sh_data_sync_prepare (frame, this);
+    if (locked_replies)
+        afr_replies_wipe(locked_replies, priv->child_count);
 
-	return 0;
+    return ret;
 }
 
-
 int
-afr_sh_data_lookup_cbk (call_frame_t *frame, void *cookie,
-			xlator_t *this, int32_t op_ret, int32_t op_errno,
-			inode_t *inode, struct stat *buf, dict_t *xattr)
+afr_selfheal_data_open_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+                           int32_t op_ret, int32_t op_errno, fd_t *fd,
+                           dict_t *xdata)
 {
-	afr_private_t   *priv  = NULL;
-	afr_local_t     *local = NULL;
-	afr_self_heal_t *sh = NULL;
-
-	int call_count  = -1;
-	int child_index = (long) cookie;
-
-	local = frame->local;
-	sh = &local->self_heal;
-	priv = this->private;
+    afr_local_t *local = NULL;
+    int i = (long)cookie;
 
-	LOCK (&frame->lock);
-	{
-		if (op_ret != -1) {
-			sh->xattr[child_index] = dict_ref (xattr);
-			sh->buf[child_index] = *buf;
-		}
-	}
-	UNLOCK (&frame->lock);
+    local = frame->local;
 
-	call_count = afr_frame_return (frame);
+    local->replies[i].valid = 1;
+    local->replies[i].op_ret = op_ret;
+    local->replies[i].op_errno = op_errno;
 
-	if (call_count == 0) {
-		afr_sh_data_fix (frame, this);
-	}
+    syncbarrier_wake(&local->barrier);
 
-	return 0;
+    return 0;
 }
 
-
 int
-afr_sh_data_lookup (call_frame_t *frame, xlator_t *this)
+afr_selfheal_data_open(xlator_t *this, inode_t *inode, fd_t **fd)
 {
-	afr_self_heal_t *sh    = NULL; 
-	afr_local_t     *local = NULL;
-	afr_private_t   *priv  = NULL;
-	dict_t          *xattr_req = NULL;
-
-	int call_count = 0;
-	int i = 0;
-	int ret = 0;
-
-	priv  = this->private;
-	local = frame->local;
-	sh    = &local->self_heal;
-
-	call_count = local->child_count;
-
-	local->call_count = call_count;
-	
-	xattr_req = dict_new();
-	if (xattr_req)
-		ret = dict_set_uint64 (xattr_req, AFR_DATA_PENDING,
-				       priv->child_count * sizeof(int32_t));
-
-	for (i = 0; i < priv->child_count; i++) {
-		if (local->child_up[i]) {
-			STACK_WIND_COOKIE (frame, afr_sh_data_lookup_cbk,
-					   (void *) (long) i,
-					   priv->children[i], 
-					   priv->children[i]->fops->lookup,
-					   &local->loc, xattr_req);
-			if (!--call_count)
-				break;
-		}
-	}
-	
-	if (xattr_req)
-		dict_unref (xattr_req);
-
-	return 0;
-}
-
-
-int
-afr_sh_data_lock_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
-		      int32_t op_ret, int32_t op_errno)
-{
-	afr_local_t     *local = NULL;
-	afr_self_heal_t *sh = NULL;
-	int              call_count = 0;
-	int              child_index = (long) cookie;
-
-	/* TODO: what if lock fails? */
-	
-	local = frame->local;
-	sh = &local->self_heal;
-
-	LOCK (&frame->lock);
-	{
-		if (op_ret == -1) {
-			sh->op_failed = 1;
-
-			gf_log (this->name,
-				(op_errno == EAGAIN ? GF_LOG_DEBUG : GF_LOG_ERROR),
-				"locking of %s on child %d failed: %s",
-				local->loc.path, child_index,
-				strerror (op_errno));
-		} else {
-			gf_log (this->name, GF_LOG_DEBUG,
-				"inode of %s on child %d locked",
-				local->loc.path, child_index);
-		}
-	}
-	UNLOCK (&frame->lock);
-
-	call_count = afr_frame_return (frame);
-
-	if (call_count == 0) {
-		if (sh->op_failed) {
-			afr_sh_data_finish (frame, this);
-			return 0;
-		}
-
-		afr_sh_data_lookup (frame, this);
-	}
-
-	return 0;
-}
-
-
-int
-afr_sh_data_lock (call_frame_t *frame, xlator_t *this)
-{
-	struct flock flock;			
-	int i = 0;				
-	int call_count = 0;		     
-
-	afr_local_t *   local = NULL;
-	afr_private_t * priv  = NULL;
-	afr_self_heal_t * sh  = NULL;
-
-
-	local = frame->local;
-	sh = &local->self_heal;
-	priv = this->private;
-
-	call_count = local->child_count;
-
-	local->call_count = call_count;		
-
-	flock.l_start = 0;
-	flock.l_len   = 0;
-	flock.l_type  = F_WRLCK;			
-
-	for (i = 0; i < priv->child_count; i++) {
-		if (local->child_up[i]) {
-			gf_log (this->name, GF_LOG_DEBUG,
-				"locking %s on subvolume %s",
-				local->loc.path, priv->children[i]->name);
-
-			STACK_WIND_COOKIE (frame, afr_sh_data_lock_cbk,
-					   (void *) (long) i,
-					   priv->children[i], 
-					   priv->children[i]->fops->inodelk,
-					   &local->loc, F_SETLK, &flock); 
-			if (!--call_count)
-				break;
-		}
-	}
-
-	return 0;
+    int ret = 0;
+    fd_t *fd_tmp = NULL;
+    loc_t loc = {
+        0,
+    };
+    call_frame_t *frame = NULL;
+    afr_local_t *local = NULL;
+    afr_private_t *priv = NULL;
+    int i = 0;
+
+    priv = this->private;
+
+    fd_tmp = fd_create(inode, 0);
+    if (!fd_tmp)
+        return -ENOMEM;
+
+    loc.inode = inode_ref(inode);
+    gf_uuid_copy(loc.gfid, inode->gfid);
+
+    frame = afr_frame_create(this, &ret);
+    if (!frame) {
+        ret = -ret;
+        fd_unref(fd_tmp);
+        goto out;
+    }
+    local = frame->local;
+
+    AFR_ONLIST(local->child_up, frame, afr_selfheal_data_open_cbk, open, &loc,
+               O_RDWR | O_LARGEFILE, fd_tmp, NULL);
+
+    ret = -ENOTCONN;
+    for (i = 0; i < priv->child_count; i++) {
+        if (!local->replies[i].valid)
+            continue;
+
+        if (local->replies[i].op_ret < 0) {
+            ret = -local->replies[i].op_errno;
+            continue;
+        }
+
+        ret = 0;
+        break;
+    }
+
+    if (ret < 0) {
+        fd_unref(fd_tmp);
+        goto out;
+    } else {
+        fd_bind(fd_tmp);
+    }
+
+    *fd = fd_tmp;
+out:
+    loc_wipe(&loc);
+    if (frame)
+        AFR_STACK_DESTROY(frame);
+    return ret;
 }
 
-
 int
-afr_self_heal_data (call_frame_t *frame, xlator_t *this)
+afr_selfheal_data(call_frame_t *frame, xlator_t *this, fd_t *fd)
 {
-	afr_local_t   *local = NULL;
-	afr_self_heal_t *sh = NULL;
-	afr_private_t *priv = this->private;
-
-
-	local = frame->local;
-	sh = &local->self_heal;
-
-	if (local->need_data_self_heal && priv->data_self_heal) {
-		afr_sh_data_lock (frame, this);
-	} else {
-		gf_log (this->name, GF_LOG_DEBUG,
-			"not doing data self heal on %s",
-			local->loc.path);
-		afr_sh_data_done (frame, this);
-	}
-
-	return 0;
+    afr_private_t *priv = NULL;
+    unsigned char *locked_on = NULL;
+    int ret = 0;
+    inode_t *inode = fd->inode;
+
+    priv = this->private;
+
+    locked_on = alloca0(priv->child_count);
+
+    ret = afr_selfheal_tie_breaker_inodelk(frame, this, inode, priv->sh_domain,
+                                           0, 0, locked_on);
+    {
+        if (ret < priv->child_count) {
+            gf_msg_debug(this->name, 0,
+                         "%s: Skipping "
+                         "self-heal as only %d number of "
+                         "subvolumes could be locked",
+                         uuid_utoa(fd->inode->gfid), ret);
+            /* Either fewer than two subvols are available, or another
+               selfheal (from another server) is in progress. Skip
+               for now; in either case there isn't anything to do.
+            */
+            ret = -ENOTCONN;
+            goto unlock;
+        }
+
+        ret = __afr_selfheal_data(frame, this, fd, locked_on);
+    }
+unlock:
+    afr_selfheal_uninodelk(frame, this, inode, priv->sh_domain, 0, 0,
+                           locked_on);
+
+    return ret;
 }
-
diff --git a/xlators/cluster/afr/src/afr-self-heal-entry.c b/xlators/cluster/afr/src/afr-self-heal-entry.c
index d5b6dff3cb4..64893f441e3 100644
--- a/xlators/cluster/afr/src/afr-self-heal-entry.c
+++ b/xlators/cluster/afr/src/afr-self-heal-entry.c
@@ -1,2038 +1,1276 @@
 /*
-  Copyright (c) 2008-2009 Z RESEARCH, Inc. <http://www.zresearch.com>
+  Copyright (c) 2013 Red Hat, Inc. <http://www.redhat.com>
   This file is part of GlusterFS.
 
-  GlusterFS is free software; you can redistribute it and/or modify
-  it under the terms of the GNU General Public License as published
-  by the Free Software Foundation; either version 3 of the License,
-  or (at your option) any later version.
-
-  GlusterFS is distributed in the hope that it will be useful, but
-  WITHOUT ANY WARRANTY; without even the implied warranty of
-  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
-  General Public License for more details.
-
-  You should have received a copy of the GNU General Public License
-  along with this program.  If not, see
-  <http://www.gnu.org/licenses/>.
+  This file is licensed to you under your choice of the GNU Lesser
+  General Public License, version 3 or any later version (LGPLv3 or
+  later), or the GNU General Public License, version 2 (GPLv2), in all
+  cases as published by the Free Software Foundation.
 */
 
-#include <libgen.h>
-#include <unistd.h>
-#include <fnmatch.h>
-#include <sys/time.h>
-#include <stdlib.h>
-#include <signal.h>
-
-#ifndef _CONFIG_H
-#define _CONFIG_H
-#include "config.h"
-#endif
-
-#include "glusterfs.h"
 #include "afr.h"
-#include "dict.h"
-#include "xlator.h"
-#include "hashfn.h"
-#include "logging.h"
-#include "stack.h"
-#include "list.h"
-#include "call-stub.h"
-#include "defaults.h"
-#include "common-utils.h"
-#include "compat-errno.h"
-#include "compat.h"
-#include "byte-order.h"
-
-#include "afr-transaction.h"
 #include "afr-self-heal.h"
-#include "afr-self-heal-common.h"
-
-
-
-int
-afr_sh_entry_done (call_frame_t *frame, xlator_t *this)
-{
-	afr_local_t     *local = NULL;
-	afr_self_heal_t *sh = NULL;
-	afr_private_t   *priv = NULL;
-
-	local = frame->local;
-	sh = &local->self_heal;
-	priv = this->private;
-
-	/* 
-	   TODO: cleanup sh->* 
-	*/
-
-	gf_log (this->name, GF_LOG_DEBUG,
-		"self heal of %s completed",
-		local->loc.path);
-
-	sh->completion_cbk (frame, this);
-
-	return 0;
-}
-
-
-int
-afr_sh_entry_unlck_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
-			int32_t op_ret, int32_t op_errno)
-{
-	afr_local_t     *local = NULL;
-	afr_self_heal_t *sh = NULL;
-	int           call_count = 0;
-	int           child_index = (long) cookie;
-
-	/* TODO: what if lock fails? */
-	
-	local = frame->local;
-	sh = &local->self_heal;
-
-	LOCK (&frame->lock);
-	{
-		if (op_ret == -1) {
-			gf_log (this->name, GF_LOG_ERROR, 
-				"unlocking inode of %s on child %d failed: %s",
-				local->loc.path, child_index,
-				strerror (op_errno));
-		} else {
-			gf_log (this->name, GF_LOG_DEBUG,
-				"unlocked inode of %s on child %d",
-				local->loc.path, child_index);
-		}
-	}
-	UNLOCK (&frame->lock);
-
-	call_count = afr_frame_return (frame);
-
-	if (call_count == 0) {
-		if (sh->healing_fd)
-			fd_unref (sh->healing_fd);
-		sh->healing_fd = NULL;
-		afr_sh_entry_done (frame, this);
-	}
-
-	return 0;
-}
-
-
-int
-afr_sh_entry_unlock (call_frame_t *frame, xlator_t *this)
-{
-	int i = 0;				
-	int call_count = 0;		     
-
-	afr_local_t *   local = NULL;
-	afr_private_t * priv  = NULL;
-	afr_self_heal_t * sh  = NULL;
-
-
-	local = frame->local;
-	sh = &local->self_heal;
-	priv = this->private;
-
-	call_count = local->child_count;
-
-	local->call_count = call_count;		
-
-	for (i = 0; i < priv->child_count; i++) {
-		if (local->child_up[i]) {
-			gf_log (this->name, GF_LOG_DEBUG,
-				"unlocking %s on subvolume %s",
-				local->loc.path, priv->children[i]->name);
-
-			STACK_WIND_COOKIE (frame, afr_sh_entry_unlck_cbk,
-					   (void *) (long) i,
-					   priv->children[i], 
-					   priv->children[i]->fops->entrylk,
-					   &local->loc, NULL,
-					   ENTRYLK_UNLOCK, ENTRYLK_WRLCK);
-			if (!--call_count)
-				break;
-		}
-	}
-
-	return 0;
-}
-
-
-int
-afr_sh_entry_finish (call_frame_t *frame, xlator_t *this)
-{
-	afr_local_t   *local = NULL;
-
-	local = frame->local;
-
-	gf_log (this->name, GF_LOG_DEBUG,
-		"finishing entry selfheal of %s", local->loc.path);
-
-	afr_sh_entry_unlock (frame, this);
-
-	return 0;
-}
-
-
-int
-afr_sh_entry_erase_pending_cbk (call_frame_t *frame, void *cookie,
-				xlator_t *this, int32_t op_ret,
-				int32_t op_errno, dict_t *xattr)
-{
-	afr_local_t     *local = NULL;
-	afr_self_heal_t *sh = NULL;
-	afr_private_t   *priv = NULL;
-	int             call_count = 0;
-
-	local = frame->local;
-	sh = &local->self_heal;
-	priv = this->private;
-
-	LOCK (&frame->lock);
-	{
-	}
-	UNLOCK (&frame->lock);
-
-	call_count = afr_frame_return (frame);
-
-	if (call_count == 0)
-		afr_sh_entry_finish (frame, this);
-
-	return 0;
-}
-
+#include <glusterfs/byte-order.h>
+#include "afr-transaction.h"
+#include "afr-messages.h"
+#include <glusterfs/syncop-utils.h>
+#include <glusterfs/events.h>
 
 int
-afr_sh_entry_erase_pending (call_frame_t *frame, xlator_t *this)
+afr_selfheal_entry_anon_inode(xlator_t *this, inode_t *dir, const char *name,
+                              inode_t *inode, int child,
+                              struct afr_reply *replies,
+                              gf_boolean_t *anon_inode)
 {
-	afr_local_t     *local = NULL;
-	afr_self_heal_t *sh = NULL;
-	afr_private_t   *priv = NULL;
-	int              call_count = 0;
-	int              i = 0;
-	dict_t          **erase_xattr = NULL;
-
-
-	local = frame->local;
-	sh = &local->self_heal;
-	priv = this->private;
-
-
-	afr_sh_pending_to_delta (sh->pending_matrix, sh->delta_matrix,
-				 sh->success, priv->child_count);
-
-	erase_xattr = CALLOC (sizeof (*erase_xattr), priv->child_count);
-
-	for (i = 0; i < priv->child_count; i++) {
-		if (sh->xattr[i]) {
-			call_count++;
-
-			erase_xattr[i] = get_new_dict();
-			dict_ref (erase_xattr[i]);
-		}
-	}
-
-	afr_sh_delta_to_xattr (sh->delta_matrix, erase_xattr,
-			       priv->child_count, AFR_ENTRY_PENDING);
-
-	local->call_count = call_count;
-	for (i = 0; i < priv->child_count; i++) {
-		if (!erase_xattr[i])
-			continue;
-
-		gf_log (this->name, GF_LOG_DEBUG,
-			"erasing pending flags from %s on %s",
-			local->loc.path, priv->children[i]->name);
-
-		STACK_WIND_COOKIE (frame, afr_sh_entry_erase_pending_cbk,
-				   (void *) (long) i,
-				   priv->children[i],
-				   priv->children[i]->fops->xattrop,
-				   &local->loc,
-				   GF_XATTROP_ADD_ARRAY, erase_xattr[i]);
-		if (!--call_count)
-			break;
-	}
-
-	for (i = 0; i < priv->child_count; i++) {
-		if (erase_xattr[i]) {
-			dict_unref (erase_xattr[i]);
-		}
-	}
-	FREE (erase_xattr);
-
-	return 0;
-}
-
-
+    afr_private_t *priv = NULL;
+    afr_local_t *local = NULL;
+    xlator_t *subvol = NULL;
+    int ret = 0;
+    int i = 0;
+    char g[64] = {0};
+    unsigned char *lookup_success = NULL;
+    call_frame_t *frame = NULL;
+    loc_t loc2 = {
+        0,
+    };
+    loc_t loc = {
+        0,
+    };
+
+    priv = this->private;
+    subvol = priv->children[child];
+    lookup_success = alloca0(priv->child_count);
+    uuid_utoa_r(replies[child].poststat.ia_gfid, g);
+    loc.inode = inode_new(inode->table);
+    if (!loc.inode) {
+        ret = -ENOMEM;
+        goto out;
+    }
+
+    if (replies[child].poststat.ia_type == IA_IFDIR) {
+        /* This directory may have a sub-directory hierarchy which may need
+         * to be preserved for subsequent heals, so unconditionally move the
+         * directory to the anonymous-inode directory. */
+        *anon_inode = _gf_true;
+        goto anon_inode;
+    }
+
+    frame = afr_frame_create(this, &ret);
+    if (!frame) {
+        ret = -ret;
+        goto out;
+    }
+    local = frame->local;
+    gf_uuid_copy(loc.gfid, replies[child].poststat.ia_gfid);
+    AFR_ONLIST(local->child_up, frame, afr_selfheal_discover_cbk, lookup, &loc,
+               NULL);
+    for (i = 0; i < priv->child_count; i++) {
+        if (local->replies[i].op_ret == 0) {
+            lookup_success[i] = 1;
+        } else if (local->replies[i].op_errno != ENOENT &&
+                   local->replies[i].op_errno != ESTALE) {
+            ret = -local->replies[i].op_errno;
+        }
+    }
+
+    if (priv->quorum_count) {
+        if (afr_has_quorum(lookup_success, this, NULL)) {
+            *anon_inode = _gf_true;
+        }
+    } else if (AFR_COUNT(lookup_success, priv->child_count) > 1) {
+        *anon_inode = _gf_true;
+    } else if (ret) {
+        goto out;
+    }
+
+anon_inode:
+    if (!*anon_inode) {
+        ret = 0;
+        goto out;
+    }
+
+    loc.parent = inode_ref(dir);
+    gf_uuid_copy(loc.pargfid, dir->gfid);
+    loc.name = name;
+
+    ret = afr_anon_inode_create(this, child, &loc2.parent);
+    if (ret < 0)
+        goto out;
+
+    loc2.name = g;
+    ret = syncop_rename(subvol, &loc, &loc2, NULL, NULL);
+    if (ret < 0) {
+        gf_msg(this->name, GF_LOG_WARNING, -ret, AFR_MSG_EXPUNGING_FILE_OR_DIR,
+               "Rename to %s dir %s/%s (%s) on %s failed",
+               priv->anon_inode_name, uuid_utoa(dir->gfid), name, g,
+               subvol->name);
+    } else {
+        gf_msg(this->name, GF_LOG_WARNING, 0, AFR_MSG_EXPUNGING_FILE_OR_DIR,
+               "Rename to %s dir %s/%s (%s) on %s successful",
+               priv->anon_inode_name, uuid_utoa(dir->gfid), name, g,
+               subvol->name);
+    }
 
-static int
-next_active_source (call_frame_t *frame, xlator_t *this,
-		    int current_active_source)
-{
-	afr_private_t   *priv = NULL;
-	afr_local_t     *local  = NULL;
-	afr_self_heal_t *sh  = NULL;
-	int              source = -1;
-	int              next_active_source = -1;
-	int              i = 0;
-
-	priv = this->private;
-	local = frame->local;
-	sh = &local->self_heal;
-
-	source = sh->source;
-
-	if (source != -1) {
-		if (current_active_source != source)
-			next_active_source = source;
-		goto out;
-	}
-
-	/*
-	  the next active sink becomes the source for the
-	  'conservative decision' of merging all entries
-	*/
-
-	for (i = 0; i < priv->child_count; i++) {
-		if ((sh->sources[i] == 0)
-		    && (local->child_up[i] == 1)
-		    && (i > current_active_source)) {
-
-			next_active_source = i;
-			break;
-		}
-	}
 out:
-	return next_active_source;
-}
-
-
+    loc_wipe(&loc);
+    loc_wipe(&loc2);
+    if (frame) {
+        AFR_STACK_DESTROY(frame);
+    }
 
-static int
-next_active_sink (call_frame_t *frame, xlator_t *this,
-		  int current_active_sink)
-{
-	afr_private_t   *priv = NULL;
-	afr_local_t     *local  = NULL;
-	afr_self_heal_t *sh  = NULL;
-	int              next_active_sink = -1;
-	int              i = 0;
-
-	priv = this->private;
-	local = frame->local;
-	sh = &local->self_heal;
-
-	/*
-	  the next active sink becomes the source for the
-	  'conservative decision' of merging all entries
-	*/
-
-	for (i = 0; i < priv->child_count; i++) {
-		if ((sh->sources[i] == 0)
-		    && (local->child_up[i] == 1)
-		    && (i > current_active_sink)) {
-
-			next_active_sink = i;
-			break;
-		}
-	}
-
-	return next_active_sink;
+    return ret;
 }
 
-
 int
-build_child_loc (xlator_t *this, loc_t *child, loc_t *parent, char *name)
+afr_selfheal_entry_delete(xlator_t *this, inode_t *dir, const char *name,
+                          inode_t *inode, int child, struct afr_reply *replies)
 {
-	int   ret = -1;
-
-	if (!child) {
-		goto out;
-	}
-
-	if (strcmp (parent->path, "/") == 0)
-		asprintf ((char **)&child->path, "/%s", name);
-	else
-		asprintf ((char **)&child->path, "%s/%s", parent->path, name);
-
-	if (!child->path) {
-		gf_log (this->name, GF_LOG_ERROR,
-			"out of memory :(");
-		goto out;
-	}
-
-	child->name = strrchr (child->path, '/');
-	if (child->name)
-		child->name++;
+    char g[64] = {0};
+    afr_private_t *priv = NULL;
+    xlator_t *subvol = NULL;
+    int ret = 0;
+    loc_t loc = {
+        0,
+    };
+    gf_boolean_t anon_inode = _gf_false;
+
+    priv = this->private;
+    subvol = priv->children[child];
+
+    if ((!replies[child].valid) || (replies[child].op_ret < 0)) {
+        /*Nothing to do*/
+        ret = 0;
+        goto out;
+    }
+
+    if (priv->use_anon_inode) {
+        ret = afr_selfheal_entry_anon_inode(this, dir, name, inode, child,
+                                            replies, &anon_inode);
+        if (ret < 0 || anon_inode)
+            goto out;
+    }
+
+    loc.parent = inode_ref(dir);
+    loc.inode = inode_new(inode->table);
+    if (!loc.inode) {
+        ret = -ENOMEM;
+        goto out;
+    }
+    loc.name = name;
+    switch (replies[child].poststat.ia_type) {
+        case IA_IFDIR:
+            gf_msg(this->name, GF_LOG_WARNING, 0, AFR_MSG_EXPUNGING_FILE_OR_DIR,
+                   "expunging dir %s/%s (%s) on %s", uuid_utoa(dir->gfid), name,
+                   uuid_utoa_r(replies[child].poststat.ia_gfid, g),
+                   subvol->name);
+            ret = syncop_rmdir(subvol, &loc, 1, NULL, NULL);
+            break;
+        default:
+            gf_msg(this->name, GF_LOG_WARNING, 0, AFR_MSG_EXPUNGING_FILE_OR_DIR,
+                   "expunging file %s/%s (%s) on %s", uuid_utoa(dir->gfid),
+                   name, uuid_utoa_r(replies[child].poststat.ia_gfid, g),
+                   subvol->name);
+            ret = syncop_unlink(subvol, &loc, NULL, NULL);
+            break;
+    }
 
-	child->parent = inode_ref (parent->inode);
-	child->inode = inode_new (parent->inode->table);
-
-	if (!child->inode) {
-		gf_log (this->name, GF_LOG_ERROR,
-			"out of memory :(");
-		goto out;
-	}
-
-	ret = 0;
 out:
-	if (ret == -1)
-		loc_wipe (child);
-
-	return ret;
+    loc_wipe(&loc);
+    return ret;
 }
 
-
-int
-afr_sh_entry_expunge_all (call_frame_t *frame, xlator_t *this);
-
-int
-afr_sh_entry_expunge_subvol (call_frame_t *frame, xlator_t *this,
-			     int active_src);
-
 int
-afr_sh_entry_expunge_entry_done (call_frame_t *frame, xlator_t *this,
-				 int active_src)
+afr_selfheal_recreate_entry(call_frame_t *frame, int dst, int source,
+                            unsigned char *sources, inode_t *dir,
+                            const char *name, inode_t *inode,
+                            struct afr_reply *replies)
 {
-	afr_private_t   *priv = NULL;
-	afr_local_t     *local  = NULL;
-	afr_self_heal_t *sh  = NULL;
-	int              call_count = 0;
-
-	priv = this->private;
-	local = frame->local;
-	sh = &local->self_heal;
-
-	LOCK (&frame->lock);
-	{
-	}
-	UNLOCK (&frame->lock);
+    int ret = 0;
+    loc_t loc = {
+        0,
+    };
+    loc_t srcloc = {
+        0,
+    };
+    loc_t anonloc = {
+        0,
+    };
+    xlator_t *this = frame->this;
+    afr_private_t *priv = NULL;
+    dict_t *xdata = NULL;
+    struct iatt *iatt = NULL;
+    char *linkname = NULL;
+    mode_t mode = 0;
+    struct iatt newent = {
+        0,
+    };
+    unsigned char *newentry = NULL;
+    char iatt_uuid_str[64] = {0};
+    char dir_uuid_str[64] = {0};
+
+    priv = this->private;
+    iatt = &replies[source].poststat;
+    uuid_utoa_r(iatt->ia_gfid, iatt_uuid_str);
+    if (iatt->ia_type == IA_INVAL || gf_uuid_is_null(iatt->ia_gfid)) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, AFR_MSG_SELF_HEAL_FAILED,
+               "Invalid ia_type (%d) or gfid(%s). source brick=%d, "
+               "pargfid=%s, name=%s",
+               iatt->ia_type, iatt_uuid_str, source,
+               uuid_utoa_r(dir->gfid, dir_uuid_str), name);
+        ret = -EINVAL;
+        goto out;
+    }
+
+    xdata = dict_new();
+    if (!xdata)
+        return -ENOMEM;
+    newentry = alloca0(priv->child_count);
+    loc.parent = inode_ref(dir);
+    gf_uuid_copy(loc.pargfid, dir->gfid);
+    loc.name = name;
+    loc.inode = inode_ref(inode);
+
+    ret = afr_selfheal_entry_delete(this, dir, name, inode, dst, replies);
+    if (ret)
+        goto out;
+
+    ret = dict_set_gfuuid(xdata, "gfid-req", replies[source].poststat.ia_gfid,
+                          true);
+    if (ret)
+        goto out;
+
+    srcloc.inode = inode_ref(inode);
+    gf_uuid_copy(srcloc.gfid, iatt->ia_gfid);
+    ret = syncop_lookup(priv->children[dst], &srcloc, 0, 0, 0, 0);
+    if (ret == -ENOENT || ret == -ESTALE) {
+        newentry[dst] = 1;
+        ret = afr_selfheal_newentry_mark(frame, this, inode, source, replies,
+                                         sources, newentry);
+        if (ret)
+            goto out;
+    } else if (ret == 0 && iatt->ia_type == IA_IFDIR && priv->use_anon_inode) {
+        // Try rename from hidden directory
+        ret = afr_anon_inode_create(this, dst, &anonloc.parent);
+        if (ret < 0)
+            goto out;
+        anonloc.inode = inode_ref(inode);
+        anonloc.name = iatt_uuid_str;
+        ret = syncop_rename(priv->children[dst], &anonloc, &loc, NULL, NULL);
+        if (ret == -ENOENT || ret == -ESTALE)
+            ret = -1; /*This sets 'mismatch' to true*/
+        goto out;
+    }
+
+    mode = st_mode_from_ia(iatt->ia_prot, iatt->ia_type);
+
+    switch (iatt->ia_type) {
+        case IA_IFDIR:
+            ret = syncop_mkdir(priv->children[dst], &loc, mode, 0, xdata, NULL);
+            break;
+        case IA_IFLNK:
+            if (!newentry[dst]) {
+                ret = syncop_link(priv->children[dst], &srcloc, &loc, &newent,
+                                  NULL, NULL);
+            } else {
+                ret = syncop_readlink(priv->children[source], &srcloc,
+                                      &linkname, 4096, NULL, NULL);
+                if (ret <= 0)
+                    goto out;
+                ret = syncop_symlink(priv->children[dst], &loc, linkname, NULL,
+                                     xdata, NULL);
+            }
+            break;
+        default:
+            ret = dict_set_int32_sizen(xdata, GLUSTERFS_INTERNAL_FOP_KEY, 1);
+            if (ret)
+                goto out;
+            ret = syncop_mknod(
+                priv->children[dst], &loc, mode,
+                makedev(ia_major(iatt->ia_rdev), ia_minor(iatt->ia_rdev)),
+                &newent, xdata, NULL);
+            break;
+    }
 
-	call_count = afr_frame_return (frame);
-
-	if (call_count == 0)
-		afr_sh_entry_expunge_subvol (frame, this, active_src);
-
-	return 0;
-}
-
-
-int
-afr_sh_entry_expunge_remove_cbk (call_frame_t *expunge_frame, void *cookie,
-				 xlator_t *this,
-				 int32_t op_ret, int32_t op_errno)
-{
-	afr_private_t   *priv = NULL;
-	afr_local_t     *expunge_local = NULL;
-	afr_self_heal_t *expunge_sh = NULL;
-	int              active_src = 0;
-	call_frame_t    *frame = NULL;
-
-
-	priv = this->private;
-	expunge_local = expunge_frame->local;
-	expunge_sh = &expunge_local->self_heal;
-	frame = expunge_sh->sh_frame;
-
-	active_src = (long) cookie;
-
-	if (op_ret == 0) {
-		gf_log (this->name, GF_LOG_DEBUG,
-			"removed %s on %s",
-			expunge_local->loc.path,
-			priv->children[active_src]->name);
-	} else {
-		gf_log (this->name, GF_LOG_ERROR,
-			"removing %s on %s failed (%s)",
-			expunge_local->loc.path,
-			priv->children[active_src]->name,
-			strerror (op_errno));
-	}
-
-	AFR_STACK_DESTROY (expunge_frame);
-	afr_sh_entry_expunge_entry_done (frame, this, active_src);
-
-	return 0;
-}
-
-
-int
-afr_sh_entry_expunge_rmdir (call_frame_t *expunge_frame, xlator_t *this,
-			     int active_src)
-{
-	afr_private_t   *priv = NULL;
-	afr_local_t     *expunge_local = NULL;
-
-	priv = this->private;
-	expunge_local = expunge_frame->local;
-
-	gf_log (this->name, GF_LOG_WARNING,
-		"removing directory %s on %s",
-		expunge_local->loc.path, priv->children[active_src]->name);
-
-	STACK_WIND_COOKIE (expunge_frame, afr_sh_entry_expunge_remove_cbk,
-			   (void *) (long) active_src,
-			   priv->children[active_src],
-			   priv->children[active_src]->fops->rmdir,
-			   &expunge_local->loc);
-
-	return 0;
-}
-
-
-int
-afr_sh_entry_expunge_unlink (call_frame_t *expunge_frame, xlator_t *this,
-			     int active_src)
-{
-	afr_private_t   *priv = NULL;
-	afr_local_t     *expunge_local = NULL;
-
-	priv = this->private;
-	expunge_local = expunge_frame->local;
-
-	gf_log (this->name, GF_LOG_WARNING,
-		"unlinking file %s on %s",
-		expunge_local->loc.path, priv->children[active_src]->name);
-	
-	STACK_WIND_COOKIE (expunge_frame, afr_sh_entry_expunge_remove_cbk,
-			   (void *) (long) active_src,
-			   priv->children[active_src],
-			   priv->children[active_src]->fops->unlink,
-			   &expunge_local->loc);
-
-	return 0;
-}
-
-
-int
-afr_sh_entry_expunge_remove (call_frame_t *expunge_frame, xlator_t *this,
-			     int active_src, struct stat *buf)
-{
-	afr_private_t   *priv = NULL;
-	afr_local_t     *expunge_local = NULL;
-	afr_self_heal_t *expunge_sh = NULL;
-	int              source = 0;
-	call_frame_t    *frame = NULL;
-	int              type = 0;
-
-	priv = this->private;
-	expunge_local = expunge_frame->local;
-	expunge_sh = &expunge_local->self_heal;
-	frame = expunge_sh->sh_frame;
-	source = expunge_sh->source;
-
-	type = (buf->st_mode & S_IFMT);
-
-	switch (type) {
-	case S_IFSOCK:
-	case S_IFREG:
-	case S_IFBLK:
-	case S_IFCHR:
-	case S_IFIFO:
-	case S_IFLNK:
-		afr_sh_entry_expunge_unlink (expunge_frame, this, active_src);
-
-		break;
-	case S_IFDIR:
-		afr_sh_entry_expunge_rmdir (expunge_frame, this, active_src);
-		break;
-	default:
-		gf_log (this->name, GF_LOG_ERROR,
-			"%s has unknown file type on %s: 0%o",
-			expunge_local->loc.path,
-			priv->children[source]->name, type);
-		goto out;
-		break;
-	}
-
-	return 0;
-out:
-	AFR_STACK_DESTROY (expunge_frame);
-	afr_sh_entry_expunge_entry_done (frame, this, active_src);
-
-	return 0;
-}
-
-
-int
-afr_sh_entry_expunge_lookup_cbk (call_frame_t *expunge_frame, void *cookie,
-				xlator_t *this,
-				int32_t op_ret,	int32_t op_errno,
-				inode_t *inode, struct stat *buf, dict_t *x)
-{
-	afr_private_t   *priv = NULL;
-	afr_local_t     *expunge_local = NULL;
-	afr_self_heal_t *expunge_sh = NULL;
-	call_frame_t    *frame = NULL;
-	int              active_src = 0;
-
-	priv = this->private;
-	expunge_local = expunge_frame->local;
-	expunge_sh = &expunge_local->self_heal;
-	frame = expunge_sh->sh_frame;
-	active_src = (long) cookie;
-
-	if (op_ret == -1) {
-		gf_log (this->name, GF_LOG_ERROR,
-			"lookup of %s on %s failed (%s)",
-			expunge_local->loc.path,
-			priv->children[active_src]->name,
-			strerror (op_errno));
-		goto out;
-	}
-
-	afr_sh_entry_expunge_remove (expunge_frame, this, active_src, buf);
-
-	return 0;
 out:
-	AFR_STACK_DESTROY (expunge_frame);
-	afr_sh_entry_expunge_entry_done (frame, this, active_src);
-
-	return 0;
+    if (xdata)
+        dict_unref(xdata);
+    GF_FREE(linkname);
+    loc_wipe(&loc);
+    loc_wipe(&srcloc);
+    loc_wipe(&anonloc);
+    return ret;
 }
 
-
-int
-afr_sh_entry_expunge_purge (call_frame_t *expunge_frame, xlator_t *this,
-			    int active_src)
+static int
+__afr_selfheal_heal_dirent(call_frame_t *frame, xlator_t *this, fd_t *fd,
+                           char *name, inode_t *inode, int source,
+                           unsigned char *sources, unsigned char *healed_sinks,
+                           unsigned char *locked_on, struct afr_reply *replies)
 {
-	afr_private_t   *priv = NULL;
-	afr_local_t     *expunge_local = NULL;
-
-	priv = this->private;
-	expunge_local = expunge_frame->local;
-
-	gf_log (this->name, GF_LOG_DEBUG,
-		"looking up %s on %s",
-		expunge_local->loc.path, priv->children[active_src]->name);
-	
-	STACK_WIND_COOKIE (expunge_frame, afr_sh_entry_expunge_lookup_cbk,
-			   (void *) (long) active_src,
-			   priv->children[active_src],
-			   priv->children[active_src]->fops->lookup,
-			   &expunge_local->loc, 0);
-
-	return 0;
+    int ret = 0;
+    afr_private_t *priv = NULL;
+    int i = 0;
+
+    priv = this->private;
+
+    if (!replies[source].valid)
+        return -EIO;
+
+    /* Skip healing this entry if the last lookup on it failed for reasons
+     * other than ENOENT.
+     */
+    if ((replies[source].op_ret < 0) && (replies[source].op_errno != ENOENT))
+        return -replies[source].op_errno;
+
+    if (replies[source].op_ret == 0) {
+        ret = afr_lookup_and_heal_gfid(this, fd->inode, name, inode, replies,
+                                       source, sources,
+                                       &replies[source].poststat.ia_gfid, NULL);
+        if (ret)
+            return ret;
+    }
+
+    for (i = 0; i < priv->child_count; i++) {
+        if (!healed_sinks[i])
+            continue;
+        if (replies[source].op_ret == -1 &&
+            replies[source].op_errno == ENOENT) {
+            ret = afr_selfheal_entry_delete(this, fd->inode, name, inode, i,
+                                            replies);
+        } else {
+            if (!gf_uuid_compare(replies[i].poststat.ia_gfid,
+                                 replies[source].poststat.ia_gfid))
+                continue;
+
+            ret = afr_selfheal_recreate_entry(frame, i, source, sources,
+                                              fd->inode, name, inode, replies);
+        }
+        if (ret < 0)
+            break;
+    }
+
+    return ret;
 }
 
-
-int
-afr_sh_entry_expunge_entry_cbk (call_frame_t *expunge_frame, void *cookie,
-				xlator_t *this,
-				int32_t op_ret,	int32_t op_errno,
-				inode_t *inode, struct stat *buf, dict_t *x)
+static int
+afr_selfheal_detect_gfid_and_type_mismatch(xlator_t *this,
+                                           struct afr_reply *replies,
+                                           inode_t *inode, uuid_t pargfid,
+                                           char *bname, int src_idx,
+                                           unsigned char *locked_on, int *src)
 {
-	afr_private_t   *priv = NULL;
-	afr_local_t     *expunge_local = NULL;
-	afr_self_heal_t *expunge_sh = NULL;
-	int              source = 0;
-	call_frame_t    *frame = NULL;
-	int              active_src = 0;
-
-
-	priv = this->private;
-	expunge_local = expunge_frame->local;
-	expunge_sh = &expunge_local->self_heal;
-	frame = expunge_sh->sh_frame;
-	active_src = expunge_sh->active_source;
-	source = (long) cookie;
-
-	if (op_ret == -1 && op_errno == ENOENT) {
-
-		gf_log (this->name, GF_LOG_DEBUG,
-			"missing entry %s on %s",
-			expunge_local->loc.path,
-			priv->children[source]->name);
-
-		afr_sh_entry_expunge_purge (expunge_frame, this, active_src);
-
-		return 0;
-	}
-
-	if (op_ret == 0) {
-		gf_log (this->name, GF_LOG_DEBUG,
-			"%s exists under %s",
-			expunge_local->loc.path,
-			priv->children[source]->name);
-	} else {
-		gf_log (this->name, GF_LOG_ERROR,
-			"looking up %s under %s failed (%s)",
-			expunge_local->loc.path,
-			priv->children[source]->name,
-			strerror (op_errno));
-	}
-
-	AFR_STACK_DESTROY (expunge_frame);
-	afr_sh_entry_expunge_entry_done (frame, this, active_src);
-
-	return 0;
+    int i = 0;
+    int ret = -1;
+    afr_private_t *priv = NULL;
+    void *gfid = NULL;
+    ia_type_t ia_type = IA_INVAL;
+
+    priv = this->private;
+    gfid = &replies[src_idx].poststat.ia_gfid;
+    ia_type = replies[src_idx].poststat.ia_type;
+
+    for (i = 0; i < priv->child_count; i++) {
+        if (i == src_idx)
+            continue;
+
+        if (!replies[i].valid)
+            continue;
+
+        if (replies[i].op_ret != 0)
+            continue;
+
+        if (gf_uuid_is_null(replies[i].poststat.ia_gfid))
+            continue;
+
+        if (replies[i].poststat.ia_type == IA_INVAL)
+            continue;
+
+        if (ia_type == IA_INVAL || gf_uuid_is_null(gfid)) {
+            src_idx = i;
+            ia_type = replies[src_idx].poststat.ia_type;
+            gfid = &replies[src_idx].poststat.ia_gfid;
+            continue;
+        }
+
+        if (gf_uuid_compare(gfid, replies[i].poststat.ia_gfid) &&
+            (ia_type == replies[i].poststat.ia_type)) {
+            ret = afr_gfid_split_brain_source(this, replies, inode, pargfid,
+                                              bname, src_idx, i, locked_on, src,
+                                              NULL);
+            if (ret)
+                gf_msg(this->name, GF_LOG_ERROR, 0, AFR_MSG_SPLIT_BRAIN,
+                       "Skipping conservative merge on the "
+                       "file.");
+            return ret;
+        }
+
+        if (ia_type != replies[i].poststat.ia_type) {
+            gf_msg(this->name, GF_LOG_ERROR, 0, AFR_MSG_SPLIT_BRAIN,
+                   "Type mismatch detected "
+                   "for <gfid:%s>/%s>, %s on %s and %s on %s. "
+                   "Skipping conservative merge on the file.",
+                   uuid_utoa(pargfid), bname,
+                   gf_inode_type_to_str(replies[i].poststat.ia_type),
+                   priv->children[i]->name,
+                   gf_inode_type_to_str(replies[src_idx].poststat.ia_type),
+                   priv->children[src_idx]->name);
+            gf_event(EVENT_AFR_SPLIT_BRAIN,
+                     "client-pid=%d;"
+                     "subvol=%s;type=file;"
+                     "file=<gfid:%s>/%s>;count=2;child-%d=%s;type-"
+                     "%d=%s;child-%d=%s;type-%d=%s",
+                     this->ctx->cmd_args.client_pid, this->name,
+                     uuid_utoa(pargfid), bname, i, priv->children[i]->name, i,
+                     gf_inode_type_to_str(replies[i].poststat.ia_type), src_idx,
+                     priv->children[src_idx]->name, src_idx,
+                     gf_inode_type_to_str(replies[src_idx].poststat.ia_type));
+            return -1;
+        }
+    }
+
+    return 0;
 }
 
-
-int
-afr_sh_entry_expunge_entry (call_frame_t *frame, xlator_t *this,
-			    char *name)
+static int
+__afr_selfheal_merge_dirent(call_frame_t *frame, xlator_t *this, fd_t *fd,
+                            char *name, inode_t *inode, unsigned char *sources,
+                            unsigned char *healed_sinks,
+                            unsigned char *locked_on, struct afr_reply *replies)
 {
-	afr_private_t   *priv = NULL;
-	afr_local_t     *local  = NULL;
-	afr_self_heal_t *sh  = NULL;
-	int              ret = -1;
-	call_frame_t    *expunge_frame = NULL;
-	afr_local_t     *expunge_local = NULL;
-	afr_self_heal_t *expunge_sh = NULL;
-	int              active_src = 0;
-	int              source = 0;
-	int              op_errno = 0;
-
-	priv = this->private;
-	local = frame->local;
-	sh = &local->self_heal;
-
-	active_src = sh->active_source;
-	source = sh->source;
-
-	if ((strcmp (name, ".") == 0)
-	    || (strcmp (name, "..") == 0)) {
-		gf_log (this->name, GF_LOG_DEBUG,
-			"skipping inspection of %s under %s",
-			name, local->loc.path);
-		goto out;
-	}
-
-	gf_log (this->name, GF_LOG_DEBUG,
-		"inspecting existance of %s under %s",
-		name, local->loc.path);
-
-	expunge_frame = copy_frame (frame);
-	if (!expunge_frame) {
-		gf_log (this->name, GF_LOG_ERROR,
-			"out of memory :(");
-		goto out;
-	}
-
-	ALLOC_OR_GOTO (expunge_local, afr_local_t, out);
-
-	expunge_frame->local = expunge_local;
-	expunge_sh = &expunge_local->self_heal;
-	expunge_sh->sh_frame = frame;
-	expunge_sh->active_source = active_src;
-
-	ret = build_child_loc (this, &expunge_local->loc, &local->loc, name);
-	if (ret != 0) {
-		goto out;
-	}
-
-	gf_log (this->name, GF_LOG_DEBUG,
-		"looking up %s on %s", expunge_local->loc.path,
-		priv->children[source]->name);
-
-	STACK_WIND_COOKIE (expunge_frame,
-			   afr_sh_entry_expunge_entry_cbk,
-			   (void *) (long) source,
-			   priv->children[source],
-			   priv->children[source]->fops->lookup,
-			   &expunge_local->loc, 0);
-
-	ret = 0;
-out:
-	if (ret == -1)
-		afr_sh_entry_expunge_entry_done (frame, this, active_src);
-
-	return 0;
+    int ret = 0;
+    int i = 0;
+    int source = -1;
+    int src = -1;
+    afr_private_t *priv = NULL;
+
+    priv = this->private;
+
+    for (i = 0; i < priv->child_count; i++) {
+        if (replies[i].valid && replies[i].op_ret == 0) {
+            source = i;
+            break;
+        }
+    }
+
+    if (source == -1) {
+        /* entry got deleted in the meantime? */
+        return 0;
+    }
+
+    /* Set all the sources as 1, otherwise newentry_mark won't be set */
+    for (i = 0; i < priv->child_count; i++) {
+        if (replies[i].valid && replies[i].op_ret == 0) {
+            sources[i] = 1;
+        }
+    }
+
+    ret = afr_lookup_and_heal_gfid(this, fd->inode, name, inode, replies,
+                                   source, sources,
+                                   &replies[source].poststat.ia_gfid, NULL);
+    if (ret)
+        return ret;
+
+    /* In case of type mismatch / unable to resolve gfid mismatch on the
+     * entry, return -1.*/
+    ret = afr_selfheal_detect_gfid_and_type_mismatch(
+        this, replies, inode, fd->inode->gfid, name, source, locked_on, &src);
+
+    if (ret < 0)
+        return ret;
+    if (src != -1) {
+        source = src;
+        for (i = 0; i < priv->child_count; i++) {
+            if (i != src && replies[i].valid &&
+                gf_uuid_compare(replies[src].poststat.ia_gfid,
+                                replies[i].poststat.ia_gfid)) {
+                sources[i] = 0;
+            }
+        }
+    }
+
+    for (i = 0; i < priv->child_count; i++) {
+        if (i == source || !healed_sinks[i])
+            continue;
+
+        if (src != -1) {
+            if (!gf_uuid_compare(replies[src].poststat.ia_gfid,
+                                 replies[i].poststat.ia_gfid))
+                continue;
+        } else if (replies[i].op_errno != ENOENT) {
+            continue;
+        }
+
+        ret |= afr_selfheal_recreate_entry(frame, i, source, sources, fd->inode,
+                                           name, inode, replies);
+    }
+
+    return ret;
 }
 
-
-int
-afr_sh_entry_expunge_readdir_cbk (call_frame_t *frame, void *cookie,
-				  xlator_t *this,
-				  int32_t op_ret, int32_t op_errno,
-				  gf_dirent_t *entries)
+static int
+__afr_selfheal_entry_dirent(call_frame_t *frame, xlator_t *this, fd_t *fd,
+                            char *name, inode_t *inode, int source,
+                            unsigned char *sources, unsigned char *healed_sinks,
+                            unsigned char *locked_on, struct afr_reply *replies)
 {
-	afr_private_t   *priv = NULL;
-	afr_local_t     *local  = NULL;
-	afr_self_heal_t *sh  = NULL;
-	gf_dirent_t     *entry = NULL;
-	off_t            last_offset = 0;
-	int              active_src = 0;
-	int              entry_count = 0;
-
-	priv = this->private;
-	local = frame->local;
-	sh = &local->self_heal;
-
-	active_src = sh->active_source;
-
-	if (op_ret <= 0) {
-		if (op_ret < 0) {
-			gf_log (this->name, GF_LOG_ERROR,
-				"readdir of %s on subvolume %s failed (%s)",
-				local->loc.path,
-				priv->children[active_src]->name,
-				strerror (op_errno));
-		} else {
-			gf_log (this->name, GF_LOG_DEBUG,
-				"readdir of %s on subvolume %s complete",
-				local->loc.path,
-				priv->children[active_src]->name);
-		}
-
-		afr_sh_entry_expunge_all (frame, this);
-		return 0;
-	}
-
-	list_for_each_entry (entry, &entries->list, list) {
-		last_offset = entry->d_off;
-		entry_count++;
-	}
-
-	gf_log (this->name, GF_LOG_DEBUG,
-		"readdir'ed %d entries from %s",
-		entry_count, priv->children[active_src]->name);
-
-	sh->offset = last_offset;
-	local->call_count = entry_count;
-
-	list_for_each_entry (entry, &entries->list, list) {
-		afr_sh_entry_expunge_entry (frame, this, entry->d_name);
-	}
-
-	return 0;
+    int ret = -1;
+
+    if (source < 0)
+        ret = __afr_selfheal_merge_dirent(frame, this, fd, name, inode, sources,
+                                          healed_sinks, locked_on, replies);
+    else
+        ret = __afr_selfheal_heal_dirent(frame, this, fd, name, inode, source,
+                                         sources, healed_sinks, locked_on,
+                                         replies);
+    return ret;
 }
 
-int
-afr_sh_entry_expunge_subvol (call_frame_t *frame, xlator_t *this,
-			     int active_src)
+static gf_boolean_t
+is_full_heal_marker_present(xlator_t *this, dict_t *xdata, int idx)
 {
-	afr_private_t   *priv = NULL;
-	afr_local_t     *local  = NULL;
-	afr_self_heal_t *sh  = NULL;
-
-	priv = this->private;
-	local = frame->local;
-	sh = &local->self_heal;
-
-	STACK_WIND (frame, afr_sh_entry_expunge_readdir_cbk,
-		    priv->children[active_src],
-		    priv->children[active_src]->fops->readdir,
-		    sh->healing_fd, sh->block_size, sh->offset);
-
-	return 0;
+    int i = 0;
+    int pending[3] = {
+        0,
+    };
+    void *pending_raw = NULL;
+    afr_private_t *priv = NULL;
+
+    priv = this->private;
+
+    if (!xdata)
+        return _gf_false;
+
+    /* Iterate over each of the priv->pending_keys[] elements and then
+     * see if any of them have data segment non-zero. If they do, return
+     * true. Else return false.
+     */
+    for (i = 0; i < priv->child_count; i++) {
+        if (dict_get_ptr(xdata, priv->pending_key[i], &pending_raw))
+            continue;
+
+        if (!pending_raw)
+            continue;
+
+        memcpy(pending, pending_raw, sizeof(pending));
+        if (ntoh32(pending[idx]))
+            return _gf_true;
+    }
+
+    return _gf_false;
 }
 
-
-int
-afr_sh_entry_expunge_all (call_frame_t *frame, xlator_t *this)
+static gf_boolean_t
+afr_need_full_heal(xlator_t *this, struct afr_reply *replies, int source,
+                   unsigned char *healed_sinks, afr_transaction_type type)
 {
-	afr_private_t   *priv = NULL;
-	afr_local_t     *local  = NULL;
-	afr_self_heal_t *sh  = NULL;
-	int              active_src = -1;
+    int i = 0;
+    int idx = 0;
+    afr_private_t *priv = NULL;
 
-	priv = this->private;
-	local = frame->local;
-	sh = &local->self_heal;
+    priv = this->private;
 
-	sh->offset = 0;
+    if (!priv->esh_granular)
+        return _gf_true;
 
-	if (sh->source == -1) {
-		gf_log (this->name, GF_LOG_DEBUG,
-			"no active sources for %s to expunge entries",
-			local->loc.path);
-		goto out;
-	}
+    if (type != AFR_ENTRY_TRANSACTION)
+        return _gf_true;
 
-	active_src = next_active_sink (frame, this, sh->active_source);
-	sh->active_source = active_src;
+    priv = this->private;
+    idx = afr_index_for_transaction_type(AFR_DATA_TRANSACTION);
 
-	if (sh->op_failed) {
-		goto out;
-	}
+    /* If there is a clear source, check whether the full-heal-indicator
+     * is present in its xdata. Otherwise, we need to examine all the
+     * participating bricks and then figure if *even* one of them has a
+     * full-heal-indicator.
+     */
 
-	if (active_src == -1) {
-		/* completed creating missing files on all subvolumes */
-		goto out;
-	}
+    if (source != -1) {
+        if (is_full_heal_marker_present(this, replies[source].xdata, idx))
+            return _gf_true;
+    }
 
-	gf_log (this->name, GF_LOG_DEBUG,
-		"expunging entries of %s on %s to other sinks",
-		local->loc.path, priv->children[active_src]->name);
+    /* No clear source, or its xdata has no marker: check the healed sinks. */
 
-	afr_sh_entry_expunge_subvol (frame, this, active_src);
+    for (i = 0; i < priv->child_count; i++) {
+        if (!healed_sinks[i])
+            continue;
 
-	return 0;
-out:
-	afr_sh_entry_erase_pending (frame, this);
-	return 0;
+        if (is_full_heal_marker_present(this, replies[i].xdata, idx))
+            return _gf_true;
+    }
 
+    return _gf_false;
 }
 
-
-int
-afr_sh_entry_impunge_all (call_frame_t *frame, xlator_t *this);
-
-int
-afr_sh_entry_impunge_subvol (call_frame_t *frame, xlator_t *this,
-			     int active_src);
-
-int
-afr_sh_entry_impunge_entry_done (call_frame_t *frame, xlator_t *this,
-				 int active_src)
+static int
+__afr_selfheal_entry_finalize_source(xlator_t *this, unsigned char *sources,
+                                     unsigned char *healed_sinks,
+                                     unsigned char *locked_on,
+                                     struct afr_reply *replies,
+                                     uint64_t *witness)
 {
-	afr_private_t   *priv = NULL;
-	afr_local_t     *local  = NULL;
-	afr_self_heal_t *sh  = NULL;
-	int              call_count = 0;
-
-	priv = this->private;
-	local = frame->local;
-	sh = &local->self_heal;
-
-	LOCK (&frame->lock);
-	{
-	}
-	UNLOCK (&frame->lock);
-
-	call_count = afr_frame_return (frame);
-
-	if (call_count == 0)
-		afr_sh_entry_impunge_subvol (frame, this, active_src);
-
-	return 0;
+    afr_private_t *priv = NULL;
+    int source = -1;
+    int sources_count = 0;
+    int i = 0;
+
+    priv = this->private;
+
+    sources_count = AFR_COUNT(sources, priv->child_count);
+
+    if ((AFR_CMP(locked_on, healed_sinks, priv->child_count) == 0) ||
+        !sources_count || afr_does_witness_exist(this, witness)) {
+        memset(sources, 0, sizeof(*sources) * priv->child_count);
+        afr_mark_active_sinks(this, sources, locked_on, healed_sinks);
+        return -1;
+    }
+
+    source = afr_choose_source_by_policy(priv, sources, AFR_ENTRY_TRANSACTION);
+
+    /*If the selected source does not blame any other brick, then mark
+     * everything as sink to trigger conservative merge.
+     */
+    if (source != -1 && !AFR_COUNT(healed_sinks, priv->child_count)) {
+        for (i = 0; i < priv->child_count; i++) {
+            if (locked_on[i]) {
+                sources[i] = 0;
+                healed_sinks[i] = 1;
+            }
+        }
+        return -1;
+    }
+
+    return source;
 }
 
-
 int
-afr_sh_entry_impunge_utimens_cbk (call_frame_t *impunge_frame, void *cookie,
-				  xlator_t *this, int32_t op_ret,
-				  int32_t op_errno, struct stat *stbuf)
+__afr_selfheal_entry_prepare(call_frame_t *frame, xlator_t *this,
+                             inode_t *inode, unsigned char *locked_on,
+                             unsigned char *sources, unsigned char *sinks,
+                             unsigned char *healed_sinks,
+                             struct afr_reply *replies, int *source_p,
+                             unsigned char *pflag)
 {
-	int              call_count = 0;
-	afr_private_t   *priv = NULL;
-	afr_local_t     *impunge_local = NULL;
-	afr_self_heal_t *impunge_sh = NULL;
-	call_frame_t    *frame = NULL;
-	int              active_src = 0;
-	int              child_index = 0;
-
-	priv = this->private;
-	impunge_local = impunge_frame->local;
-	impunge_sh = &impunge_local->self_heal;
-	frame = impunge_sh->sh_frame;
-	child_index = (long) cookie;
-
-	if (op_ret == 0) {
-		gf_log (this->name, GF_LOG_DEBUG,
-			"utimes set for %s on %s",
-			impunge_local->loc.path,
-			priv->children[child_index]->name);
-	} else {
-		gf_log (this->name, GF_LOG_ERROR,
-			"setting utimes of %s on %s failed (%s)",
-			impunge_local->loc.path,
-			priv->children[child_index]->name,
-			strerror (op_errno));
-	}
-
-	LOCK (&impunge_frame->lock);
-	{
-		call_count = --impunge_local->call_count;
-	}
-	UNLOCK (&impunge_frame->lock);
-
-	if (call_count == 0) {
-		AFR_STACK_DESTROY (impunge_frame);
-		afr_sh_entry_impunge_entry_done (frame, this, active_src);
-	}
-
-	return 0;
+    int ret = -1;
+    int source = -1;
+    afr_private_t *priv = NULL;
+    uint64_t *witness = NULL;
+
+    priv = this->private;
+
+    ret = afr_selfheal_unlocked_discover(frame, inode, inode->gfid, replies);
+    if (ret)
+        return ret;
+
+    witness = alloca0(sizeof(*witness) * priv->child_count);
+    ret = afr_selfheal_find_direction(frame, this, replies,
+                                      AFR_ENTRY_TRANSACTION, locked_on, sources,
+                                      sinks, witness, pflag);
+    if (ret)
+        return ret;
+
+    /* Initialize the healed_sinks[] array optimistically to
+       the intersection of to-be-healed (i.e sinks[]) and
+       the list of servers which are up (i.e locked_on[]).
+
+       As we encounter failures in the healing process, we
+       will unmark the respective servers in the healed_sinks[]
+       array.
+    */
+    AFR_INTERSECT(healed_sinks, sinks, locked_on, priv->child_count);
+
+    source = __afr_selfheal_entry_finalize_source(this, sources, healed_sinks,
+                                                  locked_on, replies, witness);
+
+    if (source < 0) {
+        /* If source is < 0 (typically split-brain), we perform a
+           conservative merge of entries rather than erroring out */
+    }
+    *source_p = source;
+
+    return ret;
 }
 
-
-int
-afr_sh_entry_impunge_chown_cbk (call_frame_t *impunge_frame, void *cookie,
-				xlator_t *this, int32_t op_ret,
-				int32_t op_errno, struct stat *stbuf)
+static int
+afr_selfheal_entry_dirent(call_frame_t *frame, xlator_t *this, fd_t *fd,
+                          char *name, inode_t *parent_idx_inode,
+                          xlator_t *subvol, gf_boolean_t full_crawl)
 {
-	int              call_count = 0;
-	afr_private_t   *priv = NULL;
-	afr_local_t     *impunge_local = NULL;
-	afr_self_heal_t *impunge_sh = NULL;
-	call_frame_t    *frame = NULL;
-	int              active_src = 0;
-	int              child_index = 0;
-	struct timespec  ts[2];
-
-
-	priv = this->private;
-	impunge_local = impunge_frame->local;
-	impunge_sh = &impunge_local->self_heal;
-	frame = impunge_sh->sh_frame;
-	child_index = (long) cookie;
-
-	if (op_ret == 0) {
-		gf_log (this->name, GF_LOG_DEBUG,
-			"ownership of %s on %s changed",
-			impunge_local->loc.path,
-			priv->children[child_index]->name);
-	} else {
-		gf_log (this->name, GF_LOG_ERROR,
-			"setting ownership of %s on %s failed (%s)",
-			impunge_local->loc.path,
-			priv->children[child_index]->name,
-			strerror (op_errno));
-		goto out;
-	}
-
-#ifdef HAVE_STRUCT_STAT_ST_ATIM_TV_NSEC
-	ts[0] = impunge_local->cont.lookup.buf.st_atim;
-	ts[1] = impunge_local->cont.lookup.buf.st_mtim;
-#elif HAVE_STRUCT_STAT_ST_ATIMESPEC_TV_NSEC
-	ts[0] = impunge_local->cont.lookup.buf.st_atimespec;
-	ts[1] = impunge_local->cont.lookup.buf.st_mtimespec;
-#else
-	ts[0].tv_sec = impunge_local->cont.lookup.buf.st_atime;
-	ts[1].tv_sec = impunge_local->cont.lookup.buf.st_mtime;
-#endif
-	STACK_WIND_COOKIE (impunge_frame,
-			   afr_sh_entry_impunge_utimens_cbk,
-			   (void *) (long) child_index,
-			   priv->children[child_index],
-			   priv->children[child_index]->fops->utimens,
-			   &impunge_local->loc, ts);
-
-	return 0;
-
-out:
-	LOCK (&impunge_frame->lock);
-	{
-		call_count = --impunge_local->call_count;
-	}
-	UNLOCK (&impunge_frame->lock);
-
-	if (call_count == 0) {
-		AFR_STACK_DESTROY (impunge_frame);
-		afr_sh_entry_impunge_entry_done (frame, this, active_src);
-	}
-
-	return 0;
+    int ret = 0;
+    int source = -1;
+    unsigned char *locked_on = NULL;
+    unsigned char *sources = NULL;
+    unsigned char *sinks = NULL;
+    unsigned char *healed_sinks = NULL;
+    inode_t *inode = NULL;
+    struct afr_reply *replies = NULL;
+    struct afr_reply *par_replies = NULL;
+    afr_private_t *priv = NULL;
+    dict_t *xattr = NULL;
+
+    priv = this->private;
+
+    if (afr_is_private_directory(priv, fd->inode->gfid, name,
+                                 GF_CLIENT_PID_SELF_HEALD)) {
+        return 0;
+    }
+
+    xattr = dict_new();
+    if (!xattr)
+        return -ENOMEM;
+    ret = dict_set_int32_sizen(xattr, GF_GFIDLESS_LOOKUP, 1);
+    if (ret) {
+        dict_unref(xattr);
+        return -1;
+    }
+
+    sources = alloca0(priv->child_count);
+    sinks = alloca0(priv->child_count);
+    healed_sinks = alloca0(priv->child_count);
+    locked_on = alloca0(priv->child_count);
+
+    replies = alloca0(priv->child_count * sizeof(*replies));
+    par_replies = alloca0(priv->child_count * sizeof(*par_replies));
+
+    ret = afr_selfheal_entrylk(frame, this, fd->inode, this->name, NULL,
+                               locked_on);
+    {
+        if (ret < priv->child_count) {
+            gf_msg_debug(this->name, 0,
+                         "%s: Skipping "
+                         "entry self-heal as only %d sub-volumes "
+                         " could be locked in %s domain",
+                         uuid_utoa(fd->inode->gfid), ret, this->name);
+            ret = -ENOTCONN;
+            goto unlock;
+        }
+
+        ret = __afr_selfheal_entry_prepare(frame, this, fd->inode, locked_on,
+                                           sources, sinks, healed_sinks,
+                                           par_replies, &source, NULL);
+        if (ret < 0)
+            goto unlock;
+
+        inode = afr_selfheal_unlocked_lookup_on(frame, fd->inode, name, replies,
+                                                locked_on, xattr);
+        if (!inode) {
+            ret = -ENOMEM;
+            goto unlock;
+        }
+
+        ret = __afr_selfheal_entry_dirent(frame, this, fd, name, inode, source,
+                                          sources, healed_sinks, locked_on,
+                                          replies);
+
+        if ((ret == 0) && (priv->esh_granular) && parent_idx_inode) {
+            ret = afr_shd_entry_purge(subvol, parent_idx_inode, name,
+                                      inode->ia_type);
+            /* Why is ret force-set to 0? We do not care about
+             * index purge failing for full heal as it is quite
+             * possible during replace-brick that not all files
+             * and directories have their name indices present in
+             * entry-changes/.
+             */
+            ret = 0;
+        }
+    }
+
+unlock:
+    afr_selfheal_unentrylk(frame, this, fd->inode, this->name, NULL, locked_on,
+                           NULL);
+    if (inode)
+        inode_unref(inode);
+    if (replies)
+        afr_replies_wipe(replies, priv->child_count);
+    if (par_replies)
+        afr_replies_wipe(par_replies, priv->child_count);
+    if (xattr)
+        dict_unref(xattr);
+
+    return ret;
 }
 
-
-int
-afr_sh_entry_impunge_newfile_cbk (call_frame_t *impunge_frame, void *cookie,
-				  xlator_t *this,
-				  int32_t op_ret, int32_t op_errno,
-				  inode_t *inode, struct stat *stbuf)
+static inode_t *
+afr_shd_entry_changes_index_inode(xlator_t *this, xlator_t *subvol,
+                                  uuid_t pargfid)
 {
-	int              call_count = 0;
-	afr_private_t   *priv = NULL;
-	afr_local_t     *impunge_local = NULL;
-	afr_self_heal_t *impunge_sh = NULL;
-	call_frame_t    *frame = NULL;
-	int              active_src = 0;
-	int              child_index = 0;
-
-	priv = this->private;
-	impunge_local = impunge_frame->local;
-	impunge_sh = &impunge_local->self_heal;
-	frame = impunge_sh->sh_frame;
-
-	child_index = (long) cookie;
-
-	if (op_ret == -1) {
-		gf_log (this->name, GF_LOG_ERROR,
-			"creation of %s on %s failed (%s)",
-			impunge_local->loc.path,
-			priv->children[child_index]->name,
-			strerror (op_errno));
-		goto out;
-	}
-
-	gf_log (this->name, GF_LOG_DEBUG,
-		"setting ownership of %s on %s to %d/%d",
-		impunge_local->loc.path,
-		priv->children[child_index]->name,
-		impunge_local->cont.lookup.buf.st_uid,
-		impunge_local->cont.lookup.buf.st_gid);
-
-	inode->st_mode = stbuf->st_mode;
-
-	STACK_WIND_COOKIE (impunge_frame, afr_sh_entry_impunge_chown_cbk,
-			   (void *) (long) child_index,
-			   priv->children[child_index],
-			   priv->children[child_index]->fops->chown,
-			   &impunge_local->loc,
-			   impunge_local->cont.lookup.buf.st_uid,
-			   impunge_local->cont.lookup.buf.st_gid);
-	return 0;
+    int ret = -1;
+    void *index_gfid = NULL;
+    loc_t rootloc = {
+        0,
+    };
+    loc_t loc = {
+        0,
+    };
+    dict_t *xattr = NULL;
+    inode_t *inode = NULL;
+    struct iatt iatt = {
+        0,
+    };
+
+    rootloc.inode = inode_ref(this->itable->root);
+    gf_uuid_copy(rootloc.gfid, rootloc.inode->gfid);
+
+    ret = syncop_getxattr(subvol, &rootloc, &xattr,
+                          GF_XATTROP_ENTRY_CHANGES_GFID, NULL, NULL);
+    if (ret || !xattr) {
+        errno = -ret;
+        goto out;
+    }
+
+    ret = dict_get_ptr(xattr, GF_XATTROP_ENTRY_CHANGES_GFID, &index_gfid);
+    if (ret) {
+        errno = EINVAL;
+        goto out;
+    }
+
+    loc.inode = inode_new(this->itable);
+    if (!loc.inode) {
+        errno = ENOMEM;
+        goto out;
+    }
+
+    gf_uuid_copy(loc.pargfid, index_gfid);
+    loc.name = gf_strdup(uuid_utoa(pargfid));
+
+    ret = syncop_lookup(subvol, &loc, &iatt, NULL, NULL, NULL);
+    if (ret < 0) {
+        errno = -ret;
+        goto out;
+    }
+
+    inode = inode_link(loc.inode, NULL, NULL, &iatt);
 
 out:
-	LOCK (&impunge_frame->lock);
-	{
-		call_count = --impunge_local->call_count;
-	}
-	UNLOCK (&impunge_frame->lock);
-
-	if (call_count == 0) {
-		AFR_STACK_DESTROY (impunge_frame);
-		afr_sh_entry_impunge_entry_done (frame, this, active_src);
-	}
-
-	return 0;
-}
-
-
-int
-afr_sh_entry_impunge_mknod (call_frame_t *impunge_frame, xlator_t *this,
-			    int child_index, struct stat *stbuf)
-{
-	afr_private_t   *priv = NULL;
-	afr_local_t     *impunge_local = NULL;
-	afr_self_heal_t *impunge_sh = NULL;
+    if (xattr)
+        dict_unref(xattr);
+    loc_wipe(&rootloc);
+    GF_FREE((char *)loc.name);
+    loc_wipe(&loc);
 
-
-	priv = this->private;
-	impunge_local = impunge_frame->local;
-	impunge_sh = &impunge_local->self_heal;
-
-	gf_log (this->name, GF_LOG_WARNING,
-		"creating file %s mode=0%o dev=0x%"GF_PRI_DEV" on %s",
-		impunge_local->loc.path,
-		stbuf->st_mode, stbuf->st_rdev,
-		priv->children[child_index]->name);
-
-	STACK_WIND_COOKIE (impunge_frame, afr_sh_entry_impunge_newfile_cbk,
-			   (void *) (long) child_index,
-			   priv->children[child_index],
-			   priv->children[child_index]->fops->mknod,
-			   &impunge_local->loc,
-			   stbuf->st_mode, stbuf->st_rdev);
-
-	return 0;
+    return inode;
 }
 
-
-
-int
-afr_sh_entry_impunge_mkdir (call_frame_t *impunge_frame, xlator_t *this,
-			    int child_index, struct stat *stbuf)
+static int
+afr_selfheal_entry_do_subvol(call_frame_t *frame, xlator_t *this, fd_t *fd,
+                             int child)
 {
-	afr_private_t   *priv = NULL;
-	afr_local_t     *impunge_local = NULL;
-	afr_self_heal_t *impunge_sh = NULL;
-
-
-	priv = this->private;
-	impunge_local = impunge_frame->local;
-	impunge_sh = &impunge_local->self_heal;
-
-	gf_log (this->name, GF_LOG_WARNING,
-		"creating directory %s mode=0%o on %s",
-		impunge_local->loc.path,
-		stbuf->st_mode,
-		priv->children[child_index]->name);
-
-	STACK_WIND_COOKIE (impunge_frame, afr_sh_entry_impunge_newfile_cbk,
-			   (void *) (long) child_index,
-			   priv->children[child_index],
-			   priv->children[child_index]->fops->mkdir,
-			   &impunge_local->loc, stbuf->st_mode);
-
-	return 0;
+    int ret = 0;
+    gf_dirent_t entries;
+    gf_dirent_t *entry = NULL;
+    off_t offset = 0;
+    call_frame_t *iter_frame = NULL;
+    xlator_t *subvol = NULL;
+    afr_private_t *priv = NULL;
+    gf_boolean_t mismatch = _gf_false;
+    afr_local_t *local = NULL;
+    loc_t loc = {
+        0,
+    };
+
+    priv = this->private;
+    subvol = priv->children[child];
+
+    INIT_LIST_HEAD(&entries.list);
+
+    local = frame->local;
+
+    iter_frame = afr_copy_frame(frame);
+    if (!iter_frame)
+        return -ENOMEM;
+
+    loc.inode = afr_shd_entry_changes_index_inode(this, subvol,
+                                                  fd->inode->gfid);
+
+    while ((ret = syncop_readdir(subvol, fd, 131072, offset, &entries, NULL,
+                                 NULL))) {
+        if (ret > 0)
+            ret = 0;
+        list_for_each_entry(entry, &entries.list, list)
+        {
+            offset = entry->d_off;
+
+            if (!strcmp(entry->d_name, ".") || !strcmp(entry->d_name, ".."))
+                continue;
+
+            ret = afr_selfheal_entry_dirent(iter_frame, this, fd, entry->d_name,
+                                            loc.inode, subvol,
+                                            local->need_full_crawl);
+            AFR_STACK_RESET(iter_frame);
+            if (iter_frame->local == NULL) {
+                ret = -ENOTCONN;
+                break;
+            }
+
+            if (ret == -1) {
+                /* gfid or type mismatch. */
+                mismatch = _gf_true;
+                ret = 0;
+            }
+            if (ret)
+                break;
+        }
+
+        gf_dirent_free(&entries);
+        if (ret)
+            break;
+    }
+
+    loc_wipe(&loc);
+
+    AFR_STACK_DESTROY(iter_frame);
+    if (mismatch == _gf_true)
+        /* undo pending will be skipped */
+        ret = -1;
+    return ret;
 }
 
-
-int
-afr_sh_entry_impunge_symlink (call_frame_t *impunge_frame, xlator_t *this,
-			      int child_index, const char *linkname)
-{
-	afr_private_t   *priv = NULL;
-	afr_local_t     *impunge_local = NULL;
-	afr_self_heal_t *impunge_sh = NULL;
-
-
-	priv = this->private;
-	impunge_local = impunge_frame->local;
-	impunge_sh = &impunge_local->self_heal;
-
-	gf_log (this->name, GF_LOG_WARNING,
-		"creating symlink %s -> %s on %s",
-		impunge_local->loc.path, linkname,
-		priv->children[child_index]->name);
-
-	STACK_WIND_COOKIE (impunge_frame, afr_sh_entry_impunge_newfile_cbk,
-			   (void *) (long) child_index,
-			   priv->children[child_index],
-			   priv->children[child_index]->fops->symlink,
-			   linkname, &impunge_local->loc);
-
-	return 0;
-}
-
-
-int
-afr_sh_entry_impunge_readlink_cbk (call_frame_t *impunge_frame, void *cookie,
-				   xlator_t *this,
-				   int32_t op_ret, int32_t op_errno,
-				   const char *linkname)
+static int
+afr_selfheal_entry_granular_dirent(xlator_t *subvol, gf_dirent_t *entry,
+                                   loc_t *parent, void *data)
 {
-	afr_private_t   *priv = NULL;
-	afr_local_t     *impunge_local = NULL;
-	afr_self_heal_t *impunge_sh = NULL;
-	int              child_index = -1;
-	call_frame_t    *frame = NULL;
-	int              call_count = -1;
-	int              active_src = -1;
-
-	priv = this->private;
-	impunge_local = impunge_frame->local;
-	impunge_sh = &impunge_local->self_heal;
-	frame = impunge_sh->sh_frame;
-	active_src = impunge_sh->active_source;
-
-	child_index = (long) cookie;
-
-	if (op_ret == -1) {
-		gf_log (this->name, GF_LOG_ERROR,
-			"readlink of %s on %s failed (%s)",
-			impunge_local->loc.path,
-			priv->children[active_src]->name,
-			strerror (op_errno));
-		goto out;
-	}
-
-	afr_sh_entry_impunge_symlink (impunge_frame, this, child_index,
-				      linkname);
-	return 0;
+    int ret = 0;
+    loc_t loc = {
+        0,
+    };
+    struct iatt iatt = {
+        0,
+    };
+    afr_granular_esh_args_t *args = data;
+
+    /* Look up the actual inode associated with entry. If the lookup returns
+     * ESTALE or ENOENT, then it means we have a stale index. Remove it.
+     * This is analogous to the check in afr_shd_index_heal() except that
+     * here it is achieved through LOOKUP and in afr_shd_index_heal() through
+     * a GETXATTR.
+     */
+
+    loc.inode = inode_new(args->xl->itable);
+    loc.parent = inode_ref(args->heal_fd->inode);
+    gf_uuid_copy(loc.pargfid, loc.parent->gfid);
+    loc.name = entry->d_name;
+
+    ret = syncop_lookup(args->xl, &loc, &iatt, NULL, NULL, NULL);
+    if ((ret == -ENOENT) || (ret == -ESTALE)) {
+        /* The name indices under the pgfid index dir are guaranteed
+         * to be regular files. Hence the hardcoding.
+         */
+        afr_shd_entry_purge(subvol, parent->inode, entry->d_name, IA_IFREG);
+        ret = 0;
+        goto out;
+    }
+    /* TBD: afr_shd_zero_xattrop? */
+
+    ret = afr_selfheal_entry_dirent(args->frame, args->xl, args->heal_fd,
+                                    entry->d_name, parent->inode, subvol,
+                                    _gf_false);
+    AFR_STACK_RESET(args->frame);
+    if (args->frame->local == NULL)
+        ret = -ENOTCONN;
+
+    if (ret == -1)
+        args->mismatch = _gf_true;
 
 out:
-	LOCK (&impunge_frame->lock);
-	{
-		call_count = --impunge_local->call_count;
-	}
-	UNLOCK (&impunge_frame->lock);
-
-	if (call_count == 0) {
-		AFR_STACK_DESTROY (impunge_frame);
-		afr_sh_entry_impunge_entry_done (frame, this, active_src);
-	}
-
-	return 0;
-}
-
-
-int
-afr_sh_entry_impunge_readlink (call_frame_t *impunge_frame, xlator_t *this,
-			       int child_index, struct stat *stbuf)
-{
-	afr_private_t   *priv = NULL;
-	afr_local_t     *impunge_local = NULL;
-	afr_self_heal_t *impunge_sh = NULL;
-	int              active_src = -1;
-
-	priv = this->private;
-	impunge_local = impunge_frame->local;
-	impunge_sh = &impunge_local->self_heal;
-	active_src = impunge_sh->active_source;
-
-	STACK_WIND_COOKIE (impunge_frame, afr_sh_entry_impunge_readlink_cbk,
-			   (void *) (long) child_index,
-			   priv->children[active_src],
-			   priv->children[active_src]->fops->readlink,
-			   &impunge_local->loc, 4096);
-
-	return 0;
+    loc_wipe(&loc);
+    return 0;
 }
 
-
-int
-afr_sh_entry_impunge_recreate_lookup_cbk (call_frame_t *impunge_frame,
-					  void *cookie, xlator_t *this,
-					  int32_t op_ret, int32_t op_errno,
-					  inode_t *inode, struct stat *buf,
-					  dict_t *xattr)
+/* Heal only the names indexed under indices/entry-changes/<pargfid> on
+ * children[subvol_idx]. Returns 0 on success, -1 if a dirent flagged a
+ * mismatch, else a negative errno or the syncop_dir_scan() result.
+ */
+static int
+afr_selfheal_entry_granular(call_frame_t *frame, xlator_t *this, fd_t *fd,
+                            int subvol_idx, gf_boolean_t is_src)
 {
-	afr_private_t   *priv = NULL;
-	afr_local_t     *impunge_local = NULL;
-	afr_self_heal_t *impunge_sh = NULL;
-	int              active_src = 0;
-	int              type = 0;
-	int              child_index = 0;
-	call_frame_t    *frame = NULL;
-	int              call_count = 0;
-
-	priv = this->private;
-	impunge_local = impunge_frame->local;
-	impunge_sh = &impunge_local->self_heal;
-	frame = impunge_sh->sh_frame;
-
-	child_index = (long) cookie;
-
-	active_src = impunge_sh->active_source;
-
-	if (op_ret != 0) {
-		gf_log (this->name, GF_LOG_ERROR,
-			"looking up %s on %s (for %s) failed (%s)",
-			impunge_local->loc.path,
-			priv->children[active_src]->name,
-			priv->children[child_index]->name,
-			strerror (op_errno));
-		goto out;
-	}
-
-	impunge_local->cont.lookup.buf = *buf;
-	type = (buf->st_mode & S_IFMT);
-
-	switch (type) {
-	case S_IFSOCK:
-	case S_IFREG:
-	case S_IFBLK:
-	case S_IFCHR:
-	case S_IFIFO:
-		afr_sh_entry_impunge_mknod (impunge_frame, this,
-					    child_index, buf);
-		break;
-	case S_IFLNK:
-		afr_sh_entry_impunge_readlink (impunge_frame, this,
-					       child_index, buf);
-		break;
-	case S_IFDIR:
-		afr_sh_entry_impunge_mkdir (impunge_frame, this,
-					    child_index, buf);
-		break;
-	default:
-		gf_log (this->name, GF_LOG_ERROR,
-			"%s has unknown file type on %s: 0%o",
-			impunge_local->loc.path,
-			priv->children[active_src]->name, type);
-		goto out;
-		break;
-	}
-
-	return 0;
-
+    int ret = 0;
+    loc_t loc = {0};
+    xlator_t *subvol = NULL;
+    afr_private_t *priv = NULL;
+    afr_granular_esh_args_t args = {0};
+
+    priv = this->private;
+    subvol = priv->children[subvol_idx];
+
+    args.frame = afr_copy_frame(frame);
+    if (!args.frame)
+        return -ENOMEM; /* 0 would be taken as a successful heal */
+    args.xl = this;
+    /* args.heal_fd represents the fd associated with the original directory
+     * on which entry heal is being attempted.
+     */
+    args.heal_fd = fd;
+
+    /* @subvol here represents the subvolume of AFR where
+     * indices/entry-changes/<pargfid> will be processed
+     */
+    loc.inode = afr_shd_entry_changes_index_inode(this, subvol,
+                                                  fd->inode->gfid);
+    if (!loc.inode) {
+        /* The granular changelogs mostly live on the source, so the
+         * sink's entry-changes index is often empty or absent. Failing
+         * to resolve it on a sink is therefore not treated as a heal
+         * failure; on the source the errno is propagated.
+         */
+        if (is_src)
+            ret = -errno;
+        else
+            ret = 0;
+        goto out;
+    }
+
+    ret = syncop_dir_scan(subvol, &loc, GF_CLIENT_PID_SELF_HEALD, &args,
+                          afr_selfheal_entry_granular_dirent);
+
+    loc_wipe(&loc);
+
+    if (args.mismatch == _gf_true)
+        ret = -1;
 out:
-	LOCK (&impunge_frame->lock);
-	{
-		call_count = --impunge_local->call_count;
-	}
-	UNLOCK (&impunge_frame->lock);
-
-	if (call_count == 0) {
-		AFR_STACK_DESTROY (impunge_frame);
-		afr_sh_entry_impunge_entry_done (frame, this, active_src);
-	}
-
-	return 0;
+    if (args.frame)
+        AFR_STACK_DESTROY(args.frame);
+    return ret;
 }
 
-
-int
-afr_sh_entry_impunge_recreate (call_frame_t *impunge_frame, xlator_t *this,
-			       int child_index)
-{
-	afr_private_t   *priv = NULL;
-	afr_local_t     *impunge_local = NULL;
-	afr_self_heal_t *impunge_sh = NULL;
-	int              active_src = 0;
-
-
-	priv = this->private;
-	impunge_local = impunge_frame->local;
-	impunge_sh = &impunge_local->self_heal;
-
-	active_src = impunge_sh->active_source;
-
-	STACK_WIND_COOKIE (impunge_frame,
-			   afr_sh_entry_impunge_recreate_lookup_cbk,
-			   (void *) (long) child_index,
-			   priv->children[active_src],
-			   priv->children[active_src]->fops->lookup,
-			   &impunge_local->loc, 0);
-
-	return 0;
-}
-
-
-int
-afr_sh_entry_impunge_entry_cbk (call_frame_t *impunge_frame, void *cookie,
-				xlator_t *this,
-				int32_t op_ret,	int32_t op_errno,
-				inode_t *inode, struct stat *buf, dict_t *x)
+static int
+afr_selfheal_entry_do(call_frame_t *frame, xlator_t *this, fd_t *fd, int source,
+                      unsigned char *sources, unsigned char *healed_sinks)
 {
-	afr_private_t   *priv = NULL;
-	afr_local_t     *impunge_local = NULL;
-	afr_self_heal_t *impunge_sh = NULL;
-	int              call_count = 0;
-	int              child_index = 0;
-	call_frame_t    *frame = NULL;
-	int              active_src = 0;
-
-	priv = this->private;
-	impunge_local = impunge_frame->local;
-	impunge_sh = &impunge_local->self_heal;
-	frame = impunge_sh->sh_frame;
-	child_index = (long) cookie;
-	active_src = impunge_sh->active_source;
-
-	if (op_ret == -1 && op_errno == ENOENT) {
-		/* decrease call_count in recreate-callback */
-		gf_log (this->name, GF_LOG_DEBUG,
-			"missing entry %s on %s",
-			impunge_local->loc.path,
-			priv->children[child_index]->name);
-
-		afr_sh_entry_impunge_recreate (impunge_frame, this,
-					       child_index);
-		return 0;
-	}
-
-	if (op_ret == 0) {
-		gf_log (this->name, GF_LOG_DEBUG,
-			"%s exists under %s",
-			impunge_local->loc.path,
-			priv->children[child_index]->name);
-	} else {
-		gf_log (this->name, GF_LOG_ERROR,
-			"looking up %s under %s failed (%s)",
-			impunge_local->loc.path,
-			priv->children[child_index]->name,
-			strerror (op_errno));
-	}
-
-	LOCK (&impunge_frame->lock);
-	{
-		call_count = --impunge_local->call_count;
-	}
-	UNLOCK (&impunge_frame->lock);
-
-	if (call_count == 0) {
-		AFR_STACK_DESTROY (impunge_frame);
-		afr_sh_entry_impunge_entry_done (frame, this, active_src);
-	}
-
-	return 0;
+    int i = 0;
+    int ret = 0;
+    gf_boolean_t mismatch = _gf_false;
+    afr_local_t *local = NULL;
+    afr_private_t *priv = NULL;
+
+    priv = this->private;
+    local = frame->local;
+
+    gf_msg(this->name, GF_LOG_INFO, 0, AFR_MSG_SELF_HEAL_INFO,
+           "performing entry selfheal on %s", uuid_utoa(fd->inode->gfid));
+
+    for (i = 0; i < priv->child_count; i++) {
+        /* Expunge */
+        if (!healed_sinks[i])
+            continue;
+
+        if (!local->need_full_crawl)
+            /* Why call afr_selfheal_entry_granular() on a "healed sink",
+             * given that it is the source that contains the granular
+             * indices?
+             * If the index for this directory is non-existent or empty on
+             * this subvol (=> clear sink), then it will return early
+             * without failure status.
+             * If the index is non-empty and it is yet a 'healed sink', then
+             * it is due to a split-brain in which case we anyway need to
+             * crawl the indices/entry-changes/pargfid directory.
+             */
+            ret = afr_selfheal_entry_granular(frame, this, fd, i, _gf_false);
+        else
+            ret = afr_selfheal_entry_do_subvol(frame, this, fd, i);
+
+        if (ret == -1) {
+            /* gfid or type mismatch. */
+            mismatch = _gf_true;
+            ret = 0;
+        }
+        if (ret)
+            break;
+    }
+
+    if (!ret && source != -1) {
+        /* Impunge */
+        if (local->need_full_crawl)
+            ret = afr_selfheal_entry_do_subvol(frame, this, fd, source);
+        else
+            ret = afr_selfheal_entry_granular(frame, this, fd, source,
+                                              _gf_true);
+    }
+
+    if (mismatch == _gf_true)
+        /* undo pending will be skipped */
+        ret = -1;
+    return ret;
 }
 
-
-int
-afr_sh_entry_impunge_entry (call_frame_t *frame, xlator_t *this,
-			    char *name)
+static int
+__afr_selfheal_entry(call_frame_t *frame, xlator_t *this, fd_t *fd,
+                     unsigned char *locked_on)
 {
-	afr_private_t   *priv = NULL;
-	afr_local_t     *local  = NULL;
-	afr_self_heal_t *sh  = NULL;
-	int              ret = -1;
-	call_frame_t    *impunge_frame = NULL;
-	afr_local_t     *impunge_local = NULL;
-	afr_self_heal_t *impunge_sh = NULL;
-	int              active_src = 0;
-	int              i = 0;
-	int              call_count = 0;
-	int              op_errno = 0;
-
-	priv = this->private;
-	local = frame->local;
-	sh = &local->self_heal;
-
-	active_src = sh->active_source;
-
-	if ((strcmp (name, ".") == 0)
-	    || (strcmp (name, "..") == 0)) {
-		gf_log (this->name, GF_LOG_DEBUG,
-			"skipping inspection of %s under %s",
-			name, local->loc.path);
-		goto out;
-	}
-
-	gf_log (this->name, GF_LOG_DEBUG,
-		"inspecting existance of %s under %s",
-		name, local->loc.path);
-
-	impunge_frame = copy_frame (frame);
-	if (!impunge_frame) {
-		gf_log (this->name, GF_LOG_ERROR,
-			"out of memory :(");
-		goto out;
-	}
-
-	ALLOC_OR_GOTO (impunge_local, afr_local_t, out);
-
-	impunge_frame->local = impunge_local;
-	impunge_sh = &impunge_local->self_heal;
-	impunge_sh->sh_frame = frame;
-	impunge_sh->active_source = active_src;
-
-	ret = build_child_loc (this, &impunge_local->loc, &local->loc, name);
-	if (ret != 0) {
-		goto out;
-	}
-
-	for (i = 0; i < priv->child_count; i++) {
-		if (i == active_src)
-			continue;
-		if (local->child_up[i] == 0)
-			continue;
-		if (sh->sources[i] == 1)
-			continue;
-		call_count++;
-	}
-
-	impunge_local->call_count = call_count;
-
-	for (i = 0; i < priv->child_count; i++) {
-		if (i == active_src)
-			continue;
-		if (local->child_up[i] == 0)
-			continue;
-		if (sh->sources[i] == 1)
-			continue;
-
-		gf_log (this->name, GF_LOG_DEBUG,
-			"looking up %s on %s", impunge_local->loc.path,
-			priv->children[i]->name);
-
-		STACK_WIND_COOKIE (impunge_frame,
-				   afr_sh_entry_impunge_entry_cbk,
-				   (void *) (long) i,
-				   priv->children[i],
-				   priv->children[i]->fops->lookup,
-				   &impunge_local->loc, 0);
-
-		if (!--call_count)
-			break;
-	}
-
-	ret = 0;
+    int ret = -1;
+    int source = -1;
+    unsigned char *sources = NULL;
+    unsigned char *sinks = NULL;
+    unsigned char *data_lock = NULL;
+    unsigned char *postop_lock = NULL;
+    unsigned char *healed_sinks = NULL;
+    unsigned char *undid_pending = NULL;
+    struct afr_reply *locked_replies = NULL;
+    afr_local_t *local = NULL;
+    afr_private_t *priv = NULL;
+    gf_boolean_t did_sh = _gf_true;
+
+    priv = this->private;
+    local = frame->local;
+
+    sources = alloca0(priv->child_count);
+    sinks = alloca0(priv->child_count);
+    healed_sinks = alloca0(priv->child_count);
+    undid_pending = alloca0(priv->child_count);
+    data_lock = alloca0(priv->child_count);
+    postop_lock = alloca0(priv->child_count);
+
+    locked_replies = alloca0(sizeof(*locked_replies) * priv->child_count);
+
+    ret = afr_selfheal_entrylk(frame, this, fd->inode, this->name, NULL,
+                               data_lock);
+    {
+        if (ret < priv->child_count) {
+            gf_msg_debug(this->name, 0,
+                         "%s: Skipping "
+                         "entry self-heal as only %d sub-volumes could "
+                         "be locked in %s domain",
+                         uuid_utoa(fd->inode->gfid), ret, this->name);
+            ret = -ENOTCONN;
+            goto unlock;
+        }
+
+        ret = __afr_selfheal_entry_prepare(frame, this, fd->inode, data_lock,
+                                           sources, sinks, healed_sinks,
+                                           locked_replies, &source, NULL);
+        if (AFR_COUNT(healed_sinks, priv->child_count) == 0) {
+            did_sh = _gf_false;
+            goto unlock;
+        }
+
+        local->need_full_crawl = afr_need_full_heal(
+            this, locked_replies, source, healed_sinks, AFR_ENTRY_TRANSACTION);
+    }
+unlock:
+    afr_selfheal_unentrylk(frame, this, fd->inode, this->name, NULL, data_lock,
+                           NULL);
+    if (ret < 0)
+        goto out;
+
+    if (!did_sh)
+        goto out;
+
+    ret = afr_selfheal_entry_do(frame, this, fd, source, sources, healed_sinks);
+    if (ret)
+        goto out;
+
+    /* Take entrylks in xlator domain before doing post-op (undo-pending) in
+     * entry self-heal. This is to prevent a parallel name self-heal on
+     * an entry under @fd->inode from reading pending xattrs while it is
+     * being modified by SHD after entry sh below, given that
+     * name self-heal takes locks ONLY in xlator domain and is free to read
+     * pending changelog in the absence of the following locking.
+     */
+    ret = afr_selfheal_entrylk(frame, this, fd->inode, this->name, NULL,
+                               postop_lock);
+    {
+        if (AFR_CMP(data_lock, postop_lock, priv->child_count) != 0) {
+            gf_msg_debug(this->name, 0,
+                         "%s: Skipping "
+                         "post-op after entry self-heal as %d "
+                         "sub-volumes, as opposed to %d, "
+                         "could be locked in %s domain",
+                         uuid_utoa(fd->inode->gfid), ret,
+                         AFR_COUNT(data_lock, priv->child_count), this->name);
+            ret = -ENOTCONN;
+            goto postop_unlock;
+        }
+
+        afr_selfheal_restore_time(frame, this, fd->inode, source, healed_sinks,
+                                  locked_replies);
+        ret = afr_selfheal_undo_pending(
+            frame, this, fd->inode, sources, sinks, healed_sinks, undid_pending,
+            AFR_ENTRY_TRANSACTION, locked_replies, postop_lock);
+    }
+postop_unlock:
+    afr_selfheal_unentrylk(frame, this, fd->inode, this->name, NULL,
+                           postop_lock, NULL);
 out:
-	if (ret == -1)
-		afr_sh_entry_impunge_entry_done (frame, this, active_src);
-	
-	return 0;
-}
-
-
-int
-afr_sh_entry_impunge_readdir_cbk (call_frame_t *frame, void *cookie,
-				  xlator_t *this,
-				  int32_t op_ret, int32_t op_errno,
-				  gf_dirent_t *entries)
-{
-	afr_private_t   *priv = NULL;
-	afr_local_t     *local  = NULL;
-	afr_self_heal_t *sh  = NULL;
-	gf_dirent_t     *entry = NULL;
-	off_t            last_offset = 0;
-	int              active_src = 0;
-	int              entry_count = 0;
-
-	priv = this->private;
-	local = frame->local;
-	sh = &local->self_heal;
-
-	active_src = sh->active_source;
-
-	if (op_ret <= 0) {
-		if (op_ret < 0) {
-			gf_log (this->name, GF_LOG_ERROR,
-				"readdir of %s on subvolume %s failed (%s)",
-				local->loc.path,
-				priv->children[active_src]->name,
-				strerror (op_errno));
-		} else {
-			gf_log (this->name, GF_LOG_DEBUG,
-				"readdir of %s on subvolume %s complete",
-				local->loc.path,
-				priv->children[active_src]->name);
-		}
-
-		afr_sh_entry_impunge_all (frame, this);
-		return 0;
-	}
-
-	list_for_each_entry (entry, &entries->list, list) {
-		last_offset = entry->d_off;
-		entry_count++;
-	}
-
-	gf_log (this->name, GF_LOG_DEBUG,
-		"readdir'ed %d entries from %s",
-		entry_count, priv->children[active_src]->name);
-
-	sh->offset = last_offset;
-	local->call_count = entry_count;
-
-	list_for_each_entry (entry, &entries->list, list) {
-		afr_sh_entry_impunge_entry (frame, this, entry->d_name);
-	}
-
-	return 0;
-}
-				  
-
-int
-afr_sh_entry_impunge_subvol (call_frame_t *frame, xlator_t *this,
-			     int active_src)
-{
-	afr_private_t   *priv = NULL;
-	afr_local_t     *local  = NULL;
-	afr_self_heal_t *sh  = NULL;
-
-	priv = this->private;
-	local = frame->local;
-	sh = &local->self_heal;
-
-	STACK_WIND (frame, afr_sh_entry_impunge_readdir_cbk,
-		    priv->children[active_src],
-		    priv->children[active_src]->fops->readdir,
-		    sh->healing_fd, sh->block_size, sh->offset);
-
-	return 0;
+    if (did_sh)
+        afr_log_selfheal(fd->inode->gfid, this, ret, "entry", source, sources,
+                         healed_sinks);
+    else
+        ret = 1;
+
+    if (locked_replies)
+        afr_replies_wipe(locked_replies, priv->child_count);
+    return ret;
 }
 
-
-int
-afr_sh_entry_impunge_all (call_frame_t *frame, xlator_t *this)
+static fd_t *
+afr_selfheal_data_opendir(xlator_t *this, inode_t *inode)
 {
-	afr_private_t   *priv = NULL;
-	afr_local_t     *local  = NULL;
-	afr_self_heal_t *sh  = NULL;
-	int              active_src = -1;
-
-	priv = this->private;
-	local = frame->local;
-	sh = &local->self_heal;
-
-	sh->offset = 0;
-
-	active_src = next_active_source (frame, this, sh->active_source);
-	sh->active_source = active_src;
-
-	if (sh->op_failed) {
-		afr_sh_entry_finish (frame, this);
-		return 0;
-	}
-
-	if (active_src == -1) {
-		/* completed creating missing files on all subvolumes */
-		afr_sh_entry_expunge_all (frame, this);
-		return 0;
-	}
-
-	gf_log (this->name, GF_LOG_DEBUG,
-		"impunging entries of %s on %s to other sinks",
-		local->loc.path, priv->children[active_src]->name);
-
-	afr_sh_entry_impunge_subvol (frame, this, active_src);
-
-	return 0;
+    loc_t loc = {
+        0,
+    };
+    int ret = 0;
+    fd_t *fd = NULL;
+
+    fd = fd_create(inode, 0);
+    if (!fd)
+        return NULL;
+
+    loc.inode = inode_ref(inode);
+    gf_uuid_copy(loc.gfid, inode->gfid);
+
+    ret = syncop_opendir(this, &loc, fd, NULL, NULL);
+    if (ret) {
+        fd_unref(fd);
+        fd = NULL;
+    } else {
+        fd_bind(fd);
+    }
+
+    loc_wipe(&loc);
+    return fd;
 }
 
-
 int
-afr_sh_entry_opendir_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
-			  int32_t op_ret, int32_t op_errno, fd_t *fd)
+afr_selfheal_entry(call_frame_t *frame, xlator_t *this, inode_t *inode)
 {
-	afr_local_t     *local = NULL;
-	afr_self_heal_t *sh = NULL;
-	afr_private_t   *priv = NULL;
-	int              call_count = 0;
-	int              child_index = 0;
-
-	local = frame->local;
-	sh = &local->self_heal;
-	priv = this->private;
-
-	child_index = (long) cookie;
-
-	/* TODO: some of the open's might fail.
-	   In that case, modify cleanup fn to send flush on those 
-	   fd's which are already open */
-
-	LOCK (&frame->lock);
-	{
-		if (op_ret == -1) {
-			gf_log (this->name, GF_LOG_ERROR,
-				"opendir of %s failed on child %s (%s)",
-				local->loc.path,
-				priv->children[child_index]->name,
-				strerror (op_errno));
-			sh->op_failed = 1;
-		}
-	}
-	UNLOCK (&frame->lock);
-
-	call_count = afr_frame_return (frame);
-
-	if (call_count == 0) {
-		if (sh->op_failed) {
-			afr_sh_entry_finish (frame, this);
-			return 0;
-		}
-		gf_log (this->name, GF_LOG_DEBUG,
-			"fd for %s opened, commencing sync",
-			local->loc.path);
-
-		sh->active_source = -1;
-		afr_sh_entry_impunge_all (frame, this);
-	}
-
-	return 0;
+    afr_private_t *priv = NULL;
+    unsigned char *locked_on = NULL;
+    fd_t *fd = NULL;
+    int ret = 0;
+
+    priv = this->private;
+
+    fd = afr_selfheal_data_opendir(this, inode);
+    if (!fd)
+        return -EIO;
+
+    locked_on = alloca0(priv->child_count);
+
+    ret = afr_selfheal_tie_breaker_entrylk(frame, this, inode, priv->sh_domain,
+                                           NULL, locked_on);
+    {
+        if (ret < priv->child_count) {
+            gf_msg_debug(this->name, 0,
+                         "%s: Skipping "
+                         "entry self-heal as only %d sub-volumes could "
+                         "be locked in %s domain",
+                         uuid_utoa(fd->inode->gfid), ret, priv->sh_domain);
+            /* Either fewer than two subvols are available, or another
+               selfheal (from another server) is in progress. Either
+               way there is nothing to do for now, so skip.
+            */
+            ret = -ENOTCONN;
+            goto unlock;
+        }
+
+        ret = __afr_selfheal_entry(frame, this, fd, locked_on);
+    }
+unlock:
+    afr_selfheal_unentrylk(frame, this, inode, priv->sh_domain, NULL, locked_on,
+                           NULL);
+
+    if (fd)
+        fd_unref(fd);
+
+    return ret;
 }
-
-
-int
-afr_sh_entry_open (call_frame_t *frame, xlator_t *this)
-{
-	int i = 0;				
-	int call_count = 0;		     
-
-	int source = -1;
-	int *sources = NULL;
-
-	fd_t *fd = NULL;
-
-	afr_local_t *   local = NULL;
-	afr_private_t * priv  = NULL;
-	afr_self_heal_t *sh = NULL;
-
-	local = frame->local;
-	sh = &local->self_heal;
-	priv = this->private;
-
-	source  = local->self_heal.source;
-	sources = local->self_heal.sources;
-
-	sh->block_size = 131072;
-	sh->offset = 0;
-
-	call_count = sh->active_sinks;
-	if (source != -1)
-		call_count++;
-
-	local->call_count = call_count;
-
-	fd = fd_create (local->loc.inode, frame->root->pid);
-	sh->healing_fd = fd;
-
-	if (source != -1) {
-		gf_log (this->name, GF_LOG_DEBUG,
-			"opening directory %s on subvolume %s (source)",
-			local->loc.path, priv->children[source]->name);
-
-		/* open source */
-		STACK_WIND_COOKIE (frame, afr_sh_entry_opendir_cbk,
-				   (void *) (long) source,
-				   priv->children[source],
-				   priv->children[source]->fops->opendir,
-				   &local->loc, fd);
-		call_count--;
-	}
-
-	/* open sinks */
-	for (i = 0; i < priv->child_count; i++) {
-		if (sources[i] || !local->child_up[i])
-			continue;
-
-		gf_log (this->name, GF_LOG_DEBUG,
-			"opening directory %s on subvolume %s (sink)",
-			local->loc.path, priv->children[i]->name);
-
-		STACK_WIND_COOKIE (frame, afr_sh_entry_opendir_cbk,
-				   (void *) (long) i,
-				   priv->children[i], 
-				   priv->children[i]->fops->opendir,
-				   &local->loc, fd);
-
-		if (!--call_count)
-			break;
-	}
-
-	return 0;
-}
-
-
-int
-afr_sh_entry_sync_prepare (call_frame_t *frame, xlator_t *this)
-{
-	afr_local_t     *local = NULL;
-	afr_self_heal_t *sh = NULL;
-	afr_private_t   *priv = NULL;
-	int              active_sinks = 0;
-	int              source = 0;
-	int              i = 0;
-
-	local = frame->local;
-	sh = &local->self_heal;
-	priv = this->private;
-
-	source = sh->source;
-
-	for (i = 0; i < priv->child_count; i++) {
-		if (sh->sources[i] == 0 && local->child_up[i] == 1) {
-			active_sinks++;
-			sh->success[i] = 1;
-		}
-	}
-	if (source != -1)
-		sh->success[source] = 1;
-
-	if (active_sinks == 0) {
-		gf_log (this->name, GF_LOG_DEBUG,
-			"no active sinks for self-heal on dir %s",
-			local->loc.path);
-		afr_sh_entry_finish (frame, this);
-		return 0;
-	}
-	if (source == -1 && active_sinks < 2) {
-		gf_log (this->name, GF_LOG_WARNING,
-			"cannot sync with 0 sources and 1 sink on dir %s",
-			local->loc.path);
-		afr_sh_entry_finish (frame, this);
-		return 0;
-	}
-	sh->active_sinks = active_sinks;
-
-	if (source != -1)
-		gf_log (this->name, GF_LOG_DEBUG,
-			"syncing %s from subvolume %s to %d active sinks",
-			local->loc.path, priv->children[source]->name,
-			active_sinks);
-	else
-		gf_log (this->name, GF_LOG_DEBUG,
-			"no active sources for %s found. "
-			"merging all entries as a conservative decision",
-			local->loc.path);
-
-	afr_sh_entry_open (frame, this);
-
-	return 0;
-}
-
-
-int
-afr_sh_entry_fix (call_frame_t *frame, xlator_t *this)
-{
-	afr_local_t     *local = NULL;
-	afr_self_heal_t *sh = NULL;
-	afr_private_t   *priv = NULL;
-	int              source = 0;
-
-	local = frame->local;
-	sh = &local->self_heal;
-	priv = this->private;
-
-	afr_sh_build_pending_matrix (sh->pending_matrix, sh->xattr, 
-				     priv->child_count, AFR_ENTRY_PENDING);
-
-	afr_sh_print_pending_matrix (sh->pending_matrix, this);
-
-
-	afr_sh_mark_sources (sh->pending_matrix, sh->sources, 
-			     priv->child_count);
-
-	afr_sh_supress_errenous_children (sh->sources, sh->child_errno,
-					  priv->child_count);
-
-	source = afr_sh_select_source (sh->sources, priv->child_count);
-	sh->source = source;
-
-	afr_sh_entry_sync_prepare (frame, this);
-
-	return 0;
-}
-
-
-
-int
-afr_sh_entry_lookup_cbk (call_frame_t *frame, void *cookie,
-			 xlator_t *this, int32_t op_ret, int32_t op_errno,
-			 inode_t *inode, struct stat *buf, dict_t *xattr)
-{
-	afr_private_t   *priv  = NULL;
-	afr_local_t     *local = NULL;
-	afr_self_heal_t *sh = NULL;
-
-	int call_count  = -1;
-	int child_index = (long) cookie;
-
-	local = frame->local;
-	sh = &local->self_heal;
-	priv = this->private;
-
-	LOCK (&frame->lock);
-	{
-		if (op_ret != -1) {
-			sh->xattr[child_index] = dict_ref (xattr);
-			sh->buf[child_index] = *buf;
-		}
-	}
-	UNLOCK (&frame->lock);
-
-	call_count = afr_frame_return (frame);
-
-	if (call_count == 0) {
-		afr_sh_entry_fix (frame, this);
-	}
-
-	return 0;
-}
-
-
-
-int
-afr_sh_entry_lookup (call_frame_t *frame, xlator_t *this)
-{
-	afr_self_heal_t * sh    = NULL; 
-	afr_local_t    *  local = NULL;
-	afr_private_t  *  priv  = NULL;
-	dict_t         *xattr_req = NULL;
-	int ret = 0;
-	int call_count = 0;
-	int i = 0;
-
-	priv  = this->private;
-	local = frame->local;
-	sh    = &local->self_heal;
-
-	call_count = local->child_count;
-
-	local->call_count = call_count;
-	
-	xattr_req = dict_new();
-	if (xattr_req)
-		ret = dict_set_uint64 (xattr_req, AFR_ENTRY_PENDING,
-				       priv->child_count * sizeof(int32_t));
-
-	for (i = 0; i < priv->child_count; i++) {
-		if (local->child_up[i]) {
-			STACK_WIND_COOKIE (frame,
-					   afr_sh_entry_lookup_cbk,
-					   (void *) (long) i,
-					   priv->children[i], 
-					   priv->children[i]->fops->lookup,
-					   &local->loc, xattr_req);
-			if (!--call_count)
-				break;
-		}
-	}
-	
-	if (xattr_req)
-		dict_unref (xattr_req);
-
-	return 0;
-}
-
-
-
-int
-afr_sh_entry_lock_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
-		       int32_t op_ret, int32_t op_errno)
-{
-	afr_local_t     *local = NULL;
-	afr_self_heal_t *sh = NULL;
-	int              call_count = 0;
-	int              child_index = (long) cookie;
-
-	/* TODO: what if lock fails? */
-	
-	local = frame->local;
-	sh    = &local->self_heal;
-
-	LOCK (&frame->lock);
-	{
-		if (op_ret == -1) {
-			sh->op_failed = 1;
-
-			gf_log (this->name,
-				(op_errno == EAGAIN ? GF_LOG_DEBUG : GF_LOG_ERROR),
-				"locking inode of %s on child %d failed: %s",
-				local->loc.path, child_index,
-				strerror (op_errno));
-		} else {
-			gf_log (this->name, GF_LOG_DEBUG,
-				"inode of %s on child %d locked",
-				local->loc.path, child_index);
-		}
-	}
-	UNLOCK (&frame->lock);
-
-	call_count = afr_frame_return (frame);
-
-	if (call_count == 0) {
-		if (sh->op_failed == 1) {
-			afr_sh_entry_finish (frame, this);
-			return 0;
-		}
-
-		afr_sh_entry_lookup (frame, this);
-	}
-
-	return 0;
-}
-
-
-int
-afr_sh_entry_lock (call_frame_t *frame, xlator_t *this)
-{
-	int i = 0;				
-	int call_count = 0;		     
-
-	afr_local_t *   local = NULL;
-	afr_private_t * priv  = NULL;
-	afr_self_heal_t * sh  = NULL;
-
-
-	local = frame->local;
-	sh = &local->self_heal;
-	priv = this->private;
-
-	call_count = local->child_count;
-
-	local->call_count = call_count;		
-
-	for (i = 0; i < priv->child_count; i++) {
-		if (local->child_up[i]) {
-			gf_log (this->name, GF_LOG_DEBUG,
-				"locking %s on subvolume %s",
-				local->loc.path, priv->children[i]->name);
-
-			STACK_WIND_COOKIE (frame, afr_sh_entry_lock_cbk,
-					   (void *) (long) i,
-					   priv->children[i], 
-					   priv->children[i]->fops->entrylk,
-					   &local->loc, NULL,
-					   ENTRYLK_LOCK_NB, ENTRYLK_WRLCK);
-			if (!--call_count)
-				break;
-		}
-	}
-
-	return 0;
-}
-
-
-int
-afr_self_heal_entry (call_frame_t *frame, xlator_t *this)
-{
-	afr_local_t   *local = NULL;
-	afr_self_heal_t *sh = NULL;
-	afr_private_t   *priv = NULL;
-
-
-	priv = this->private;
-	local = frame->local;
-	sh = &local->self_heal;
-
-	if (local->need_entry_self_heal && priv->entry_self_heal) {
-		afr_sh_entry_lock (frame, this);
-	} else {
-		gf_log (this->name, GF_LOG_DEBUG,
-			"proceeding to completion on %s",
-			local->loc.path);
-		afr_sh_entry_done (frame, this);
-	}
-
-	return 0;
-}
-
diff --git a/xlators/cluster/afr/src/afr-self-heal-metadata.c b/xlators/cluster/afr/src/afr-self-heal-metadata.c
index 6c66f8704b0..03f43bad16e 100644
--- a/xlators/cluster/afr/src/afr-self-heal-metadata.c
+++ b/xlators/cluster/afr/src/afr-self-heal-metadata.c
@@ -1,791 +1,546 @@
 /*
-   Copyright (c) 2008-2009 Z RESEARCH, Inc. <http://www.zresearch.com>
-   This file is part of GlusterFS.
-
-   GlusterFS is free software; you can redistribute it and/or modify
-   it under the terms of the GNU General Public License as published
-   by the Free Software Foundation; either version 3 of the License,
-   or (at your option) any later version.
-
-   GlusterFS is distributed in the hope that it will be useful, but
-   WITHOUT ANY WARRANTY; without even the implied warranty of
-   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
-   General Public License for more details.
-
-   You should have received a copy of the GNU General Public License
-   along with this program.  If not, see
-   <http://www.gnu.org/licenses/>.
-*/
-
-#include <libgen.h>
-#include <unistd.h>
-#include <fnmatch.h>
-#include <sys/time.h>
-#include <stdlib.h>
-#include <signal.h>
+  Copyright (c) 2013 Red Hat, Inc. <http://www.redhat.com>
+  This file is part of GlusterFS.
 
-#ifndef _CONFIG_H
-#define _CONFIG_H
-#include "config.h"
-#endif
+  This file is licensed to you under your choice of the GNU Lesser
+  General Public License, version 3 or any later version (LGPLv3 or
+  later), or the GNU General Public License, version 2 (GPLv2), in all
+  cases as published by the Free Software Foundation.
+*/
 
-#include "glusterfs.h"
 #include "afr.h"
-#include "dict.h"
-#include "xlator.h"
-#include "hashfn.h"
-#include "logging.h"
-#include "stack.h"
-#include "list.h"
-#include "call-stub.h"
-#include "defaults.h"
-#include "common-utils.h"
-#include "compat-errno.h"
-#include "compat.h"
-#include "byte-order.h"
-
-#include "afr-transaction.h"
 #include "afr-self-heal.h"
-#include "afr-self-heal-common.h"
-
-
-int
-afr_sh_metadata_done (call_frame_t *frame, xlator_t *this)
-{
-	afr_local_t     *local = NULL;
-	afr_self_heal_t *sh = NULL;
-	afr_private_t   *priv = NULL;
-	int              i = 0;
-
-	local = frame->local;
-	sh = &local->self_heal;
-	priv = this->private;
-
-//	memset (sh->child_errno, 0, sizeof (int) * priv->child_count);
-	memset (sh->buf, 0, sizeof (struct stat) * priv->child_count);
-	memset (sh->success, 0, sizeof (int) * priv->child_count);
-	
-	for (i = 0; i < priv->child_count; i++) {
-		if (sh->xattr[i])
-			dict_unref (sh->xattr[i]);
-		sh->xattr[i] = NULL;
-	}
-
-	if (local->govinda_gOvinda) {
-		gf_log (this->name, GF_LOG_WARNING,
-			"aborting selfheal of %s",
-			local->loc.path);
-		sh->completion_cbk (frame, this);
-	} else {
-		if (S_ISREG (local->cont.lookup.buf.st_mode)) {
-			gf_log (this->name, GF_LOG_DEBUG,
-				"proceeding to data check on %s",
-				local->loc.path);
-			afr_self_heal_data (frame, this);
-			return 0;
-		}
-
-		if (S_ISDIR (local->cont.lookup.buf.st_mode)) {
-			gf_log (this->name, GF_LOG_DEBUG,
-				"proceeding to entry check on %s",
-				local->loc.path);
-			afr_self_heal_entry (frame, this);
-			return 0;
-		}
-		gf_log (this->name, GF_LOG_DEBUG,
-			"completed self heal of %s",
-			local->loc.path);
-
-		sh->completion_cbk (frame, this);
-	}
-
-	return 0;
-}
-
-
-int
-afr_sh_metadata_unlck_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
-			   int32_t op_ret, int32_t op_errno)
-{
-	afr_local_t      *local = NULL;
-	int               call_count = 0;
-
-
-	local = frame->local;
-
-	LOCK (&frame->lock);
-	{
-	}
-	UNLOCK (&frame->lock);
-
-	call_count = afr_frame_return (frame);
-
-	if (call_count == 0)
-		afr_sh_metadata_done (frame, this);
-
-	return 0;
-}
-
-
-int
-afr_sh_metadata_finish (call_frame_t *frame, xlator_t *this)
-{
-	afr_local_t     *local = NULL;
-	afr_self_heal_t *sh = NULL;
-	afr_private_t   *priv = NULL;
-	int              i = 0;
-	int              call_count = 0;
-	struct flock     flock = {0, };
-
-
-	local = frame->local;
-	sh = &local->self_heal;
-	priv = this->private;
-
-	call_count = local->child_count;
-	local->call_count = call_count;
-
-	for (i = 0; i < priv->child_count; i++) {
-		flock.l_start   = 0;
-		flock.l_len     = 0;
-		flock.l_type    = F_UNLCK;
-
-		if (local->child_up[i]) {
-			gf_log (this->name, GF_LOG_DEBUG,
-				"unlocking %s on subvolume %s",
-				local->loc.path, priv->children[i]->name);
-
-			STACK_WIND (frame, afr_sh_metadata_unlck_cbk,
-				    priv->children[i],
-				    priv->children[i]->fops->inodelk,
-				    &local->loc, F_SETLK, &flock);
-
-			if (!--call_count)
-				break;
-		}
-	}
-
-	return 0;
-}
-
-
-int
-afr_sh_metadata_erase_pending_cbk (call_frame_t *frame, void *cookie,
-				   xlator_t *this, int32_t op_ret,
-				   int32_t op_errno, dict_t *xattr)
-{
-	afr_local_t     *local = NULL;
-	afr_self_heal_t *sh = NULL;
-	afr_private_t   *priv = NULL;
-	int             call_count = 0;
-
-	local = frame->local;
-	sh = &local->self_heal;
-	priv = this->private;
-
-	LOCK (&frame->lock);
-	{
-	}
-	UNLOCK (&frame->lock);
-
-	call_count = afr_frame_return (frame);
-
-	if (call_count == 0)
-		afr_sh_metadata_finish (frame, this);
-
-	return 0;
-}
-
-
-int
-afr_sh_metadata_erase_pending (call_frame_t *frame, xlator_t *this)
-{
-	afr_local_t     *local = NULL;
-	afr_self_heal_t *sh = NULL;
-	afr_private_t   *priv = NULL;
-	int              call_count = 0;
-	int              i = 0;
-	dict_t          **erase_xattr = NULL;
-
-
-	local = frame->local;
-	sh = &local->self_heal;
-	priv = this->private;
-
-
-	afr_sh_pending_to_delta (sh->pending_matrix, sh->delta_matrix,
-				 sh->success, priv->child_count);
-
-	erase_xattr = CALLOC (sizeof (*erase_xattr), priv->child_count);
-
-	for (i = 0; i < priv->child_count; i++) {
-		if (sh->xattr[i]) {
-			call_count++;
-
-			erase_xattr[i] = get_new_dict();
-			dict_ref (erase_xattr[i]);
-		}
-	}
-
-	afr_sh_delta_to_xattr (sh->delta_matrix, erase_xattr,
-			       priv->child_count, AFR_METADATA_PENDING);
-
-	local->call_count = call_count;
-
-	if (call_count == 0) {
-		gf_log (this->name, GF_LOG_WARNING,
-			"metadata of %s not healed on any subvolume",
-			local->loc.path);
-
-		afr_sh_metadata_finish (frame, this);
-	}
-
-	for (i = 0; i < priv->child_count; i++) {
-		if (!erase_xattr[i])
-			continue;
-
-		gf_log (this->name, GF_LOG_DEBUG,
-			"erasing pending flags from %s on %s",
-			local->loc.path, priv->children[i]->name);
-
-		STACK_WIND_COOKIE (frame, afr_sh_metadata_erase_pending_cbk,
-				   (void *) (long) i,
-				   priv->children[i],
-				   priv->children[i]->fops->xattrop,
-				   &local->loc,
-				   GF_XATTROP_ADD_ARRAY, erase_xattr[i]);
-		if (!--call_count)
-			break;
-	}
-
-	for (i = 0; i < priv->child_count; i++) {
-		if (erase_xattr[i]) {
-			dict_unref (erase_xattr[i]);
-		}
-	}
-	FREE (erase_xattr);
-
-	return 0;
-}
-
-
-int
-afr_sh_metadata_sync_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
-			  int32_t op_ret, int32_t op_errno)
-{
-	afr_local_t     *local = NULL;
-	afr_self_heal_t *sh = NULL;
-	afr_private_t   *priv = NULL;
-	int              call_count = 0;
-	int              child_index = 0;
-
-
-	local = frame->local;
-	sh = &local->self_heal;
-	priv = this->private;
-
-	child_index = (long) cookie;
-
-	LOCK (&frame->lock);
-	{
-		if (op_ret == -1) {
-			gf_log (this->name, GF_LOG_ERROR,
-				"setting attributes failed for %s on %s (%s)",
-				local->loc.path,
-				priv->children[child_index]->name,
-				strerror (op_errno));
-
-			sh->success[child_index] = 0;
-		}
-	}
-	UNLOCK (&frame->lock);
-
-	call_count = afr_frame_return (frame);
+#include <glusterfs/byte-order.h>
+#include "protocol-common.h"
+#include <glusterfs/events.h>
 
-	if (call_count == 0)
-		afr_sh_metadata_erase_pending (frame, this);
+#define AFR_HEAL_ATTR (GF_SET_ATTR_UID | GF_SET_ATTR_GID | GF_SET_ATTR_MODE)
 
-	return 0;
-}
-
-
-int
-afr_sh_metadata_attr_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
-			  int32_t op_ret, int32_t op_errno, struct stat *buf)
+static gf_boolean_t
+_afr_ignorable_key_match(dict_t *d, char *k, data_t *val, void *mdata)
 {
-	afr_sh_metadata_sync_cbk (frame, cookie, this, op_ret, op_errno);
-
-	return 0;
+    return afr_is_xattr_ignorable(k);
 }
 
-
-int
-afr_sh_metadata_xattr_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
-			   int32_t op_ret, int32_t op_errno)
+void
+afr_delete_ignorable_xattrs(dict_t *xattr)
 {
-	afr_sh_metadata_sync_cbk (frame, cookie, this, op_ret, op_errno);
-
-	return 0;
+    dict_foreach_match(xattr, _afr_ignorable_key_match, NULL,
+                       dict_remove_foreach_fn, NULL);
 }
 
-
 int
-afr_sh_metadata_sync (call_frame_t *frame, xlator_t *this, dict_t *xattr)
+__afr_selfheal_metadata_do(call_frame_t *frame, xlator_t *this, inode_t *inode,
+                           int source, unsigned char *healed_sinks,
+                           struct afr_reply *locked_replies)
 {
-	afr_local_t     *local = NULL;
-	afr_self_heal_t *sh = NULL;
-	afr_private_t   *priv = NULL;
-	int              source = 0;
-	int              active_sinks = 0;
-	int              call_count = 0;
-	int              i = 0;
-	struct timespec  ts[2];
-
-
-	local = frame->local;
-	sh = &local->self_heal;
-	priv = this->private;
-
-	source = sh->source;
-	active_sinks = sh->active_sinks;
-
-	/*
-	 * 4 calls per sink - chown, chmod, utimes, setxattr
-	 */
-	if (xattr)
-		call_count = active_sinks * 4;
-	else
-		call_count = active_sinks * 3;
-
-	local->call_count = call_count;
-
-#ifdef HAVE_STRUCT_STAT_ST_ATIM_TV_NSEC
-	ts[0] = sh->buf[source].st_atim;
-	ts[1] = sh->buf[source].st_mtim;
-#elif HAVE_STRUCT_STAT_ST_ATIMESPEC_TV_NSEC
-	ts[0] = sh->buf[source].st_atimespec;
-	ts[1] = sh->buf[source].st_mtimespec;
-#else
-	ts[0].tv_sec = sh->buf[source].st_atime;
-	ts[1].tv_sec = sh->buf[source].st_mtime;
-#endif
-
-	for (i = 0; i < priv->child_count; i++) {
-		if (call_count == 0) {
-			break;
-		}
-		if (sh->sources[i] || !local->child_up[i])
-			continue;
-
-		gf_log (this->name, GF_LOG_DEBUG,
-			"syncing metadata of %s from %s to %s",
-			local->loc.path, priv->children[source]->name,
-			priv->children[i]->name);
-
-		STACK_WIND_COOKIE (frame, afr_sh_metadata_attr_cbk,
-				   (void *) (long) i,
-				   priv->children[i],
-				   priv->children[i]->fops->chown,
-				   &local->loc,
-				   sh->buf[source].st_uid,
-				   sh->buf[source].st_gid);
-
-		STACK_WIND_COOKIE (frame, afr_sh_metadata_attr_cbk,
-				   (void *) (long) i,
-				   priv->children[i],
-				   priv->children[i]->fops->chmod,
-				   &local->loc, sh->buf[source].st_mode);
-
-		STACK_WIND_COOKIE (frame, afr_sh_metadata_attr_cbk,
-				   (void *) (long) i,
-				   priv->children[i],
-				   priv->children[i]->fops->utimens,
-				   &local->loc, ts);
-
-		call_count = call_count - 3;
-
-		if (!xattr)
-			continue;
-
-		STACK_WIND_COOKIE (frame, afr_sh_metadata_xattr_cbk,
-				   (void *) (long) i,
-				   priv->children[i],
-				   priv->children[i]->fops->setxattr,
-				   &local->loc, xattr, 0);
-		call_count--;
-	}
-
-	return 0;
+    int ret = -1;
+    loc_t loc = {
+        0,
+    };
+    dict_t *xattr = NULL;
+    dict_t *old_xattr = NULL;
+    afr_private_t *priv = NULL;
+    int i = 0;
+
+    priv = this->private;
+
+    loc.inode = inode_ref(inode);
+    gf_uuid_copy(loc.gfid, inode->gfid);
+
+    gf_msg(this->name, GF_LOG_INFO, 0, AFR_MSG_SELF_HEAL_INFO,
+           "performing metadata selfheal on %s", uuid_utoa(inode->gfid));
+
+    ret = syncop_getxattr(priv->children[source], &loc, &xattr, NULL, NULL,
+                          NULL);
+    if (ret < 0) {
+        ret = -EIO;
+        goto out;
+    }
+
+    afr_delete_ignorable_xattrs(xattr);
+
+    for (i = 0; i < priv->child_count; i++) {
+        if (old_xattr) {
+            dict_unref(old_xattr);
+            old_xattr = NULL;
+        }
+
+        if (!healed_sinks[i])
+            continue;
+
+        ret = syncop_setattr(priv->children[i], &loc,
+                             &locked_replies[source].poststat, AFR_HEAL_ATTR,
+                             NULL, NULL, NULL, NULL);
+        if (ret)
+            healed_sinks[i] = 0;
+
+        ret = syncop_getxattr(priv->children[i], &loc, &old_xattr, 0, NULL,
+                              NULL);
+        if (old_xattr) {
+            afr_delete_ignorable_xattrs(old_xattr);
+            ret = syncop_removexattr(priv->children[i], &loc, "", old_xattr,
+                                     NULL);
+            if (ret)
+                healed_sinks[i] = 0;
+        }
+
+        ret = syncop_setxattr(priv->children[i], &loc, xattr, 0, NULL, NULL);
+        if (ret)
+            healed_sinks[i] = 0;
+    }
+    ret = 0;
+
+out:
+    loc_wipe(&loc);
+    if (xattr)
+        dict_unref(xattr);
+    if (old_xattr)
+        dict_unref(old_xattr);
+
+    return ret;
 }
 
-
-int
-afr_sh_metadata_getxattr_cbk (call_frame_t *frame, void *cookie,
-			      xlator_t *this,
-			      int32_t op_ret, int32_t op_errno, dict_t *xattr)
+static uint64_t
+mtime_ns(struct iatt *ia)
 {
-	afr_local_t     *local = NULL;
-	afr_self_heal_t *sh = NULL;
-	afr_private_t   *priv = NULL;
-	int              source = 0;
-
-	local = frame->local;
-	sh = &local->self_heal;
-	priv = this->private;
-
-	source = sh->source;
-
-	if (op_ret == -1) {
-		gf_log (this->name, GF_LOG_ERROR,
-			"getxattr of %s failed on subvolume %s (%s). proceeding without xattr",
-			local->loc.path, priv->children[source]->name,
-			strerror (op_errno));
-
-		afr_sh_metadata_sync (frame, this, NULL);
-	} else {
-		dict_del (xattr, AFR_DATA_PENDING);
-		dict_del (xattr, AFR_METADATA_PENDING);
-		dict_del (xattr, AFR_ENTRY_PENDING);
-		afr_sh_metadata_sync (frame, this, xattr);
-	}
-
-	return 0;
-}
+    uint64_t ret;
 
+    ret = (((uint64_t)(ia->ia_mtime)) * 1000000000) +
+          (uint64_t)(ia->ia_mtime_nsec);
 
-int
-afr_sh_metadata_sync_prepare (call_frame_t *frame, xlator_t *this)
-{
-	afr_local_t     *local = NULL;
-	afr_self_heal_t *sh = NULL;
-	afr_private_t   *priv = NULL;
-	int              active_sinks = 0;
-	int              source = 0;
-	int              i = 0;
-
-	local = frame->local;
-	sh = &local->self_heal;
-	priv = this->private;
-
-	source = sh->source;
-
-	for (i = 0; i < priv->child_count; i++) {
-		if (sh->sources[i] == 0 && local->child_up[i] == 1) {
-			active_sinks++;
-			sh->success[i] = 1;
-		}
-	}
-	sh->success[source] = 1;
-
-	if (active_sinks == 0) {
-		gf_log (this->name, GF_LOG_DEBUG,
-			"no active sinks for performing self-heal on file %s",
-			local->loc.path);
-		afr_sh_metadata_finish (frame, this);
-		return 0;
-	}
-	sh->active_sinks = active_sinks;
-
-	gf_log (this->name, GF_LOG_DEBUG,
-		"syncing metadata of %s from subvolume %s to %d active sinks",
-		local->loc.path, priv->children[source]->name, active_sinks);
-
-	STACK_WIND (frame, afr_sh_metadata_getxattr_cbk,
-		    priv->children[source],
-		    priv->children[source]->fops->getxattr,
-		    &local->loc, NULL);
-
-	return 0;
+    return ret;
 }
 
-
-int
-afr_sh_metadata_fix (call_frame_t *frame, xlator_t *this)
+/*
+ * When directory content is modified, [mc]time is updated. On
+ * Linux, the filesystem does it, while at least on NetBSD, the
+ * kernel file-system independent code does it. This means that
+ * when entries are added while bricks are down, the kernel sends
+ * a SETATTR [mc]time which will cause metadata split brain for
+ * the directory. In this case, clear the split brain by finding
+ * the source with the most recent modification date.
+ */
+static int
+afr_dirtime_splitbrain_source(call_frame_t *frame, xlator_t *this,
+                              struct afr_reply *replies,
+                              unsigned char *locked_on)
 {
-	afr_local_t     *local = NULL;
-	afr_self_heal_t *sh = NULL;
-	afr_private_t   *priv = NULL;
-	int              nsources = 0;
-	int              source = 0;
-	int              i = 0;
-
-	local = frame->local;
-	sh = &local->self_heal;
-	priv = this->private;
-
-	afr_sh_build_pending_matrix (sh->pending_matrix, sh->xattr, 
-				     priv->child_count, AFR_METADATA_PENDING);
-
-	afr_sh_print_pending_matrix (sh->pending_matrix, this);
-
-	afr_sh_mark_sources (sh->pending_matrix, sh->sources, 
-			     priv->child_count);
-
-	afr_sh_supress_errenous_children (sh->sources, sh->child_errno,
-					  priv->child_count);
-
-	nsources = afr_sh_source_count (sh->sources, priv->child_count);
-
-	if ((nsources == 0)
-	    && (priv->favorite_child != -1)
-	    && (sh->child_errno[priv->favorite_child] == 0)) {
-
-		gf_log (this->name, GF_LOG_WARNING,
-			"Picking favorite child %s as authentic source to resolve conflicting metadata of %s",
-			priv->children[priv->favorite_child]->name,
-			local->loc.path);
-
-		sh->sources[priv->favorite_child] = 1;
-
-		nsources = afr_sh_source_count (sh->sources,
-						priv->child_count);
-	}
-
-	if (nsources == 0) {
-		gf_log (this->name, GF_LOG_ERROR,
-			"Unable to resolve conflicting metadata of %s. "
-			"Please resolve manually by fixing the "
-			"permissions/ownership of %s on your subvolumes. "
-			"You can also consider 'option favorite-child <>'",
-			local->loc.path, local->loc.path);
-
-		local->govinda_gOvinda = 1;
-
-		afr_sh_metadata_finish (frame, this);
-		return 0;
-	}
-
-	source = afr_sh_select_source (sh->sources, priv->child_count);
-	sh->source = source;
-
-	/* detect changes not visible through pending flags -- JIC */
-	for (i = 0; i < priv->child_count; i++) {
-		if (i == source || sh->child_errno[i])
-			continue;
-
-		if (PERMISSION_DIFFERS (&sh->buf[i], &sh->buf[source]))
-			sh->sources[i] = 0;
-
-		if (OWNERSHIP_DIFFERS (&sh->buf[i], &sh->buf[source]))
-			sh->sources[i] = 0;
-	}
-
-	afr_sh_metadata_sync_prepare (frame, this);
-
-	return 0;
+    afr_private_t *priv = NULL;
+    int source = -1;
+    struct iatt source_ia;
+    struct iatt child_ia;
+    uint64_t mtime = 0;
+    int i;
+    int ret = -1;
+
+    priv = this->private;
+
+    for (i = 0; i < priv->child_count; i++) {
+        if (!locked_on[i])
+            continue;
+
+        if (!replies[i].valid)
+            continue;
+
+        if (replies[i].op_ret != 0)
+            continue;
+
+        if (mtime_ns(&replies[i].poststat) <= mtime)
+            continue;
+
+        mtime = mtime_ns(&replies[i].poststat);
+        source = i;
+    }
+
+    if (source == -1)
+        goto out;
+
+    source_ia = replies[source].poststat;
+    if (source_ia.ia_type != IA_IFDIR)
+        goto out;
+
+    for (i = 0; i < priv->child_count; i++) {
+        if (i == source)
+            continue;
+
+        if (!replies[i].valid)
+            continue;
+
+        if (replies[i].op_ret != 0)
+            continue;
+
+        child_ia = replies[i].poststat;
+
+        if (!IA_EQUAL(source_ia, child_ia, gfid) ||
+            !IA_EQUAL(source_ia, child_ia, type) ||
+            !IA_EQUAL(source_ia, child_ia, prot) ||
+            !IA_EQUAL(source_ia, child_ia, uid) ||
+            !IA_EQUAL(source_ia, child_ia, gid) ||
+            !afr_xattrs_are_equal(replies[source].xdata, replies[i].xdata))
+            goto out;
+    }
+
+    /*
+     * Metadata split brain is just about [amc]time
+     * We return our source.
+     */
+    ret = source;
+out:
+    return ret;
 }
 
-
-int
-afr_sh_metadata_lookup_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
-			    int32_t op_ret, int32_t op_errno,
-			    inode_t *inode, struct stat *buf, dict_t *xattr)
+static int
+__afr_selfheal_metadata_mark_pending_xattrs(call_frame_t *frame, xlator_t *this,
+                                            inode_t *inode,
+                                            struct afr_reply *replies,
+                                            unsigned char *sources)
 {
-	afr_local_t     *local = NULL;
-	afr_self_heal_t *sh = NULL;
-	afr_private_t   *priv = NULL;
-	int              call_count = 0;
-	int              child_index = 0;
-
-
-	local = frame->local;
-	sh = &local->self_heal;
-	priv = this->private;
-
-	child_index = (long) cookie;
-
-	LOCK (&frame->lock);
-	{
-		if (op_ret == 0) {
-			gf_log (this->name, GF_LOG_DEBUG,
-				"path %s on subvolume %s is of mode 0%o",
-				local->loc.path,
-				priv->children[child_index]->name,
-				buf->st_mode);
-
-			sh->buf[child_index] = *buf;
-			if (xattr)
-				sh->xattr[child_index] = dict_ref (xattr);
-		} else {
-			gf_log (this->name, GF_LOG_DEBUG,
-				"path %s on subvolume %s => -1 (%s)",
-				local->loc.path,
-				priv->children[child_index]->name,
-				strerror (op_errno));
-
-			sh->child_errno[child_index] = op_errno;
-		}
-	}
-	UNLOCK (&frame->lock);
-
-	call_count = afr_frame_return (frame);
-
-	if (call_count == 0)
-		afr_sh_metadata_fix (frame, this);
-
-	return 0;
+    int ret = 0;
+    int i = 0;
+    int m_idx = 0;
+    afr_private_t *priv = NULL;
+    int raw[AFR_NUM_CHANGE_LOGS] = {0};
+    dict_t *xattr = NULL;
+
+    priv = this->private;
+    m_idx = afr_index_for_transaction_type(AFR_METADATA_TRANSACTION);
+    raw[m_idx] = 1;
+
+    xattr = dict_new();
+    if (!xattr)
+        return -ENOMEM;
+
+    for (i = 0; i < priv->child_count; i++) {
+        if (sources[i])
+            continue;
+        ret = dict_set_static_bin(xattr, priv->pending_key[i], raw,
+                                  sizeof(int) * AFR_NUM_CHANGE_LOGS);
+        if (ret) {
+            ret = -1;
+            goto out;
+        }
+    }
+
+    for (i = 0; i < priv->child_count; i++) {
+        if (!sources[i])
+            continue;
+        ret = afr_selfheal_post_op(frame, this, inode, i, xattr, NULL);
+        if (ret < 0) {
+            gf_msg(this->name, GF_LOG_INFO, -ret, AFR_MSG_SELF_HEAL_INFO,
+                   "Failed to set pending metadata xattr on child %d for %s", i,
+                   uuid_utoa(inode->gfid));
+            goto out;
+        }
+    }
+
+    afr_replies_wipe(replies, priv->child_count);
+    ret = afr_selfheal_unlocked_discover(frame, inode, inode->gfid, replies);
+
+out:
+    if (xattr)
+        dict_unref(xattr);
+    return ret;
 }
 
-
-int
-afr_sh_metadata_lookup (call_frame_t *frame, xlator_t *this)
+/*
+ * Look for mismatching uid/gid or mode or user xattrs even if
+ * AFR xattrs don't say so, and pick one arbitrarily as winner. */
+
+static int
+__afr_selfheal_metadata_finalize_source(call_frame_t *frame, xlator_t *this,
+                                        inode_t *inode, unsigned char *sources,
+                                        unsigned char *sinks,
+                                        unsigned char *healed_sinks,
+                                        unsigned char *undid_pending,
+                                        unsigned char *locked_on,
+                                        struct afr_reply *replies)
 {
-	afr_local_t     *local = NULL;
-	afr_self_heal_t *sh = NULL;
-	afr_private_t   *priv = NULL;
-	int              i = 0;
-	int              call_count = 0;
-	dict_t          *xattr_req = NULL;
-	int              ret = 0;
-
-	local = frame->local;
-	sh = &local->self_heal;
-	priv = this->private;
-
-	call_count = local->child_count;
-	local->call_count = call_count;
-	
-	xattr_req = dict_new();
-	
-	if (xattr_req)
-		ret = dict_set_uint64 (xattr_req, AFR_METADATA_PENDING,
-				       priv->child_count * sizeof(int32_t));
-
-	for (i = 0; i < priv->child_count; i++) {
-		if (local->child_up[i]) {
-			gf_log (this->name, GF_LOG_DEBUG,
-				"looking up %s on %s",
-				local->loc.path, priv->children[i]->name);
-
-			STACK_WIND_COOKIE (frame, afr_sh_metadata_lookup_cbk,
-					   (void *) (long) i,
-					   priv->children[i],
-					   priv->children[i]->fops->lookup,
-					   &local->loc, xattr_req);
-			if (!--call_count)
-				break;
-		}
-	}
-	
-	if (xattr_req)
-		dict_unref (xattr_req);
-
-	return 0;
+    int i = 0;
+    afr_private_t *priv = NULL;
+    struct iatt srcstat = {
+        0,
+    };
+    int source = -1;
+    int sources_count = 0;
+    int ret = 0;
+
+    priv = this->private;
+
+    sources_count = AFR_COUNT(sources, priv->child_count);
+
+    if ((AFR_CMP(locked_on, healed_sinks, priv->child_count) == 0) ||
+        !sources_count) {
+        source = afr_mark_split_brain_source_sinks(
+            frame, this, inode, sources, sinks, healed_sinks, locked_on,
+            replies, AFR_METADATA_TRANSACTION);
+        if (source >= 0) {
+            _afr_fav_child_reset_sink_xattrs(
+                frame, this, inode, source, healed_sinks, undid_pending,
+                AFR_METADATA_TRANSACTION, locked_on, replies);
+            goto out;
+        }
+
+        /* If this is a directory mtime/ctime only split brain
+           use the most recent */
+        source = afr_dirtime_splitbrain_source(frame, this, replies, locked_on);
+        if (source != -1) {
+            gf_msg(this->name, GF_LOG_INFO, 0, AFR_MSG_SPLIT_BRAIN,
+                   "clear time "
+                   "split brain on %s",
+                   uuid_utoa(replies[source].poststat.ia_gfid));
+            sources[source] = 1;
+            healed_sinks[source] = 0;
+            goto out;
+        }
+
+        if (!priv->metadata_splitbrain_forced_heal) {
+            gf_event(EVENT_AFR_SPLIT_BRAIN,
+                     "client-pid=%d;"
+                     "subvol=%s;"
+                     "type=metadata;file=%s",
+                     this->ctx->cmd_args.client_pid, this->name,
+                     uuid_utoa(inode->gfid));
+            return -EIO;
+        }
+
+        /* Metadata split brain, select one subvol
+           arbitrarily */
+        for (i = 0; i < priv->child_count; i++) {
+            if (locked_on[i] && healed_sinks[i]) {
+                sources[i] = 1;
+                healed_sinks[i] = 0;
+                break;
+            }
+        }
+    }
+
+    /* No split brain at this point. If we were called from
+     * afr_heal_splitbrain_file(), abort.*/
+    if (afr_dict_contains_heal_op(frame))
+        return -EIO;
+
+    source = afr_choose_source_by_policy(priv, sources,
+                                         AFR_METADATA_TRANSACTION);
+    srcstat = replies[source].poststat;
+
+    for (i = 0; i < priv->child_count; i++) {
+        if (!sources[i] || i == source)
+            continue;
+        if (!IA_EQUAL(srcstat, replies[i].poststat, type) ||
+            !IA_EQUAL(srcstat, replies[i].poststat, uid) ||
+            !IA_EQUAL(srcstat, replies[i].poststat, gid) ||
+            !IA_EQUAL(srcstat, replies[i].poststat, prot)) {
+            gf_msg_debug(this->name, 0,
+                         "%s: iatt mismatch "
+                         "for source(%d) vs (%d)",
+                         uuid_utoa(replies[source].poststat.ia_gfid), source,
+                         i);
+            sources[i] = 0;
+            healed_sinks[i] = 1;
+        }
+    }
+
+    for (i = 0; i < priv->child_count; i++) {
+        if (!sources[i] || i == source)
+            continue;
+        if (!afr_xattrs_are_equal(replies[source].xdata, replies[i].xdata)) {
+            gf_msg_debug(this->name, 0,
+                         "%s: xattr mismatch "
+                         "for source(%d) vs (%d)",
+                         uuid_utoa(replies[source].poststat.ia_gfid), source,
+                         i);
+            sources[i] = 0;
+            healed_sinks[i] = 1;
+        }
+    }
+    if ((sources_count == priv->child_count) && (source > -1) &&
+        (AFR_COUNT(healed_sinks, priv->child_count) != 0)) {
+        ret = __afr_selfheal_metadata_mark_pending_xattrs(frame, this, inode,
+                                                          replies, sources);
+        if (ret < 0)
+            return ret;
+    }
+out:
+    afr_mark_active_sinks(this, sources, locked_on, healed_sinks);
+    return source;
 }
 
-
 int
-afr_sh_metadata_lk_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
-			int32_t op_ret, int32_t op_errno)
+__afr_selfheal_metadata_prepare(call_frame_t *frame, xlator_t *this,
+                                inode_t *inode, unsigned char *locked_on,
+                                unsigned char *sources, unsigned char *sinks,
+                                unsigned char *healed_sinks,
+                                unsigned char *undid_pending,
+                                struct afr_reply *replies, unsigned char *pflag)
 {
-	afr_local_t     *local = NULL;
-	afr_self_heal_t *sh = NULL;
-	afr_private_t   *priv = NULL;
-	int              call_count = 0;
-	int              child_index = (long) cookie;
-
-	/* TODO: what if lock fails? */
-	
-	local = frame->local;
-	sh = &local->self_heal;
-	priv = this->private;
-
-	LOCK (&frame->lock);
-	{
-		if (op_ret == -1) {
-			sh->op_failed = 1;
-
-			gf_log (this->name,
-				(op_errno == EAGAIN ? GF_LOG_DEBUG : GF_LOG_ERROR),
-				"locking of %s on child %d failed: %s",
-				local->loc.path, child_index,
-				strerror (op_errno));
-		} else {
-			gf_log (this->name, GF_LOG_DEBUG,
-				"inode of %s on child %d locked",
-				local->loc.path, child_index);
-		}
-	}
-	UNLOCK (&frame->lock);
-
-	call_count = afr_frame_return (frame);
-
-	if (call_count == 0) {
-		if (sh->op_failed) {
-			afr_sh_metadata_finish (frame, this);
-			return 0;
-		}
-
-		afr_sh_metadata_lookup (frame, this);
-	}
-
-	return 0;
+    int ret = -1;
+    int source = -1;
+    afr_private_t *priv = NULL;
+    int i = 0;
+    uint64_t *witness = NULL;
+
+    priv = this->private;
+
+    ret = afr_selfheal_unlocked_discover(frame, inode, inode->gfid, replies);
+    if (ret)
+        return ret;
+
+    witness = alloca0(sizeof(*witness) * priv->child_count);
+    ret = afr_selfheal_find_direction(frame, this, replies,
+                                      AFR_METADATA_TRANSACTION, locked_on,
+                                      sources, sinks, witness, pflag);
+    if (ret)
+        return ret;
+
+    /* Initialize the healed_sinks[] array optimistically to
+       the intersection of to-be-healed (i.e sinks[]) and
+       the list of servers which are up (i.e locked_on[]).
+
+       As we encounter failures in the healing process, we
+       will unmark the respective servers in the healed_sinks[]
+       array.
+    */
+    AFR_INTERSECT(healed_sinks, sinks, locked_on, priv->child_count);
+
+    /* If any source has witness, pick first
+     * witness source and make everybody else sinks */
+    for (i = 0; i < priv->child_count; i++) {
+        if (sources[i] && witness[i]) {
+            source = i;
+            break;
+        }
+    }
+
+    if (source != -1) {
+        for (i = 0; i < priv->child_count; i++) {
+            if (i != source && sources[i]) {
+                sources[i] = 0;
+                healed_sinks[i] = 1;
+            }
+        }
+    }
+
+    source = __afr_selfheal_metadata_finalize_source(
+        frame, this, inode, sources, sinks, healed_sinks, undid_pending,
+        locked_on, replies);
+
+    if (source < 0)
+        return -EIO;
+
+    return source;
 }
 
-
 int
-afr_sh_metadata_lock (call_frame_t *frame, xlator_t *this)
+afr_selfheal_metadata(call_frame_t *frame, xlator_t *this, inode_t *inode)
 {
-	afr_local_t     *local = NULL;
-	afr_self_heal_t *sh = NULL;
-	afr_private_t   *priv = NULL;
-	int              i = 0;
-	int              call_count = 0;
-	struct flock     flock = {0, };
-
-
-	local = frame->local;
-	sh = &local->self_heal;
-	priv = this->private;
-
-	call_count = local->child_count;
-	local->call_count = call_count;
-
-	for (i = 0; i < priv->child_count; i++) {
-		flock.l_start   = 0;
-		flock.l_len     = 0;
-		flock.l_type    = F_WRLCK;
-
-		if (local->child_up[i]) {
-			gf_log (this->name, GF_LOG_DEBUG,
-				"locking %s on subvolume %s",
-				local->loc.path, priv->children[i]->name);
-
-			STACK_WIND_COOKIE (frame, afr_sh_metadata_lk_cbk,
-					   (void *) (long) i,
-					   priv->children[i],
-					   priv->children[i]->fops->inodelk,
-					   &local->loc, F_SETLK, &flock);
-
-			if (!--call_count)
-				break;
-		}
-	}
-
-	return 0;
+    afr_private_t *priv = NULL;
+    int ret = -1;
+    unsigned char *sources = NULL;
+    unsigned char *sinks = NULL;
+    unsigned char *data_lock = NULL;
+    unsigned char *healed_sinks = NULL;
+    unsigned char *undid_pending = NULL;
+    struct afr_reply *locked_replies = NULL;
+    gf_boolean_t did_sh = _gf_true;
+    int source = -1;
+
+    priv = this->private;
+
+    sources = alloca0(priv->child_count);
+    sinks = alloca0(priv->child_count);
+    healed_sinks = alloca0(priv->child_count);
+    undid_pending = alloca0(priv->child_count);
+    data_lock = alloca0(priv->child_count);
+
+    locked_replies = alloca0(sizeof(*locked_replies) * priv->child_count);
+
+    ret = afr_selfheal_inodelk(frame, this, inode, this->name, LLONG_MAX - 1, 0,
+                               data_lock);
+    {
+        if (ret < priv->child_count) {
+            ret = -ENOTCONN;
+            goto unlock;
+        }
+
+        ret = __afr_selfheal_metadata_prepare(
+            frame, this, inode, data_lock, sources, sinks, healed_sinks,
+            undid_pending, locked_replies, NULL);
+        if (ret < 0)
+            goto unlock;
+
+        source = ret;
+
+        if (AFR_COUNT(healed_sinks, priv->child_count) == 0) {
+            did_sh = _gf_false;
+            goto unlock;
+        }
+
+        ret = __afr_selfheal_metadata_do(frame, this, inode, source,
+                                         healed_sinks, locked_replies);
+        if (ret)
+            goto unlock;
+
+        afr_selfheal_restore_time(frame, this, inode, source, healed_sinks,
+                                  locked_replies);
+
+        ret = afr_selfheal_undo_pending(
+            frame, this, inode, sources, sinks, healed_sinks, undid_pending,
+            AFR_METADATA_TRANSACTION, locked_replies, data_lock);
+    }
+unlock:
+    afr_selfheal_uninodelk(frame, this, inode, this->name, LLONG_MAX - 1, 0,
+                           data_lock);
+
+    if (did_sh)
+        afr_log_selfheal(inode->gfid, this, ret, "metadata", source, sources,
+                         healed_sinks);
+    else
+        ret = 1;
+
+    if (locked_replies)
+        afr_replies_wipe(locked_replies, priv->child_count);
+    return ret;
 }
 
-
 int
-afr_self_heal_metadata (call_frame_t *frame, xlator_t *this)
+afr_selfheal_metadata_by_stbuf(xlator_t *this, struct iatt *stbuf)
 {
-	afr_local_t   *local = NULL;
-	afr_self_heal_t *sh = NULL;
-	afr_private_t *priv = this->private;
-
-
-	local = frame->local;
-	sh = &local->self_heal;
-
-	if (local->need_metadata_self_heal && priv->metadata_self_heal) {
-		afr_sh_metadata_lock (frame, this);
-	} else {
-		gf_log (this->name, GF_LOG_DEBUG,
-			"proceeding to data check on %s",
-			local->loc.path);
-		afr_sh_metadata_done (frame, this);
-	}
-
-	return 0;
+    inode_t *inode = NULL;
+    inode_t *link_inode = NULL;
+    call_frame_t *frame = NULL;
+    int ret = 0;
+
+    if (gf_uuid_is_null(stbuf->ia_gfid)) {
+        ret = -EINVAL;
+        goto out;
+    }
+
+    inode = inode_new(this->itable);
+    if (!inode) {
+        ret = -ENOMEM;
+        goto out;
+    }
+
+    link_inode = inode_link(inode, NULL, NULL, stbuf);
+    if (!link_inode) {
+        ret = -ENOMEM;
+        goto out;
+    }
+
+    frame = afr_frame_create(this, &ret);
+    if (!frame) {
+        ret = -ret;
+        goto out;
+    }
+
+    ret = afr_selfheal_metadata(frame, this, link_inode);
+out:
+    if (inode)
+        inode_unref(inode);
+    if (link_inode)
+        inode_unref(link_inode);
+    if (frame)
+        AFR_STACK_DESTROY(frame);
+    return ret;
 }
-
diff --git a/xlators/cluster/afr/src/afr-self-heal-name.c b/xlators/cluster/afr/src/afr-self-heal-name.c
new file mode 100644
index 00000000000..834aac86d48
--- /dev/null
+++ b/xlators/cluster/afr/src/afr-self-heal-name.c
@@ -0,0 +1,616 @@
+/*
+  Copyright (c) 2013 Red Hat, Inc. <http://www.redhat.com>
+  This file is part of GlusterFS.
+
+  This file is licensed to you under your choice of the GNU Lesser
+  General Public License, version 3 or any later version (LGPLv3 or
+  later), or the GNU General Public License, version 2 (GPLv2), in all
+  cases as published by the Free Software Foundation.
+*/
+
+#include <glusterfs/events.h>
+#include "afr.h"
+#include "afr-self-heal.h"
+#include "afr-messages.h"
+
+int
+__afr_selfheal_assign_gfid(xlator_t *this, inode_t *parent, uuid_t pargfid,
+                           const char *bname, inode_t *inode,
+                           struct afr_reply *replies, void *gfid,
+                           unsigned char *locked_on, int source,
+                           unsigned char *sources, gf_boolean_t is_gfid_absent,
+                           int *gfid_idx)
+{
+    int ret = 0;
+    int up_count = 0;
+    int locked_count = 0;
+    afr_private_t *priv = NULL;
+
+    priv = this->private;
+
+    gf_uuid_copy(parent->gfid, pargfid);
+
+    if (is_gfid_absent) {
+        /* Ensure all children of AFR are up before performing gfid heal, to
+         * guard against the possibility of gfid split brain. */
+
+        up_count = AFR_COUNT(priv->child_up, priv->child_count);
+        if (up_count != priv->child_count) {
+            ret = -EIO;
+            goto out;
+        }
+
+        locked_count = AFR_COUNT(locked_on, priv->child_count);
+        if (locked_count != priv->child_count) {
+            ret = -EIO;
+            goto out;
+        }
+    }
+
+    ret = afr_lookup_and_heal_gfid(this, parent, bname, inode, replies, source,
+                                   sources, gfid, gfid_idx);
+
+out:
+    return ret;
+}
+
+int
+__afr_selfheal_name_impunge(call_frame_t *frame, xlator_t *this,
+                            inode_t *parent, uuid_t pargfid, const char *bname,
+                            inode_t *inode, struct afr_reply *replies,
+                            int gfid_idx)
+{
+    int i = 0;
+    afr_private_t *priv = NULL;
+    int ret = 0;
+    unsigned char *sources = NULL;
+
+    priv = this->private;
+
+    sources = alloca0(priv->child_count);
+
+    gf_uuid_copy(parent->gfid, pargfid);
+
+    for (i = 0; i < priv->child_count; i++) {
+        if (!replies[i].valid || replies[i].op_ret != 0)
+            continue;
+
+        if (gf_uuid_compare(replies[i].poststat.ia_gfid,
+                            replies[gfid_idx].poststat.ia_gfid) == 0) {
+            sources[i] = 1;
+            continue;
+        }
+    }
+
+    for (i = 0; i < priv->child_count; i++) {
+        if (sources[i])
+            continue;
+
+        ret |= afr_selfheal_recreate_entry(frame, i, gfid_idx, sources, parent,
+                                           bname, inode, replies);
+    }
+
+    return ret;
+}
+
+int
+__afr_selfheal_name_expunge(xlator_t *this, inode_t *parent, uuid_t pargfid,
+                            const char *bname, inode_t *inode,
+                            struct afr_reply *replies)
+{
+    int i = 0;
+    afr_private_t *priv = NULL;
+    int ret = 0;
+
+    priv = this->private;
+
+    for (i = 0; i < priv->child_count; i++) {
+        if (!replies[i].valid)
+            continue;
+
+        if (replies[i].op_ret)
+            continue;
+
+        ret |= afr_selfheal_entry_delete(this, parent, bname, inode, i,
+                                         replies);
+    }
+
+    return ret;
+}
+
+static gf_boolean_t
+afr_selfheal_name_need_heal_check(xlator_t *this, struct afr_reply *replies)
+{
+    int i = 0;
+    int first_idx = -1;
+    gf_boolean_t need_heal = _gf_false;
+    afr_private_t *priv = NULL;
+
+    priv = this->private;
+
+    for (i = 0; i < priv->child_count; i++) {
+        if (!replies[i].valid)
+            continue;
+
+        if ((replies[i].op_ret == -1) && (replies[i].op_errno == ENODATA))
+            need_heal = _gf_true;
+
+        if (first_idx == -1) {
+            first_idx = i;
+            continue;
+        }
+
+        if (replies[i].op_ret != replies[first_idx].op_ret)
+            need_heal = _gf_true;
+
+        if (gf_uuid_compare(replies[i].poststat.ia_gfid,
+                            replies[first_idx].poststat.ia_gfid))
+            need_heal = _gf_true;
+
+        if ((replies[i].op_ret == 0) &&
+            (gf_uuid_is_null(replies[i].poststat.ia_gfid)))
+            need_heal = _gf_true;
+    }
+
+    return need_heal;
+}
+
+static int
+afr_selfheal_name_type_mismatch_check(xlator_t *this, struct afr_reply *replies,
+                                      int source, unsigned char *sources,
+                                      uuid_t pargfid, const char *bname)
+{
+    int i = 0;
+    int type_idx = -1;
+    ia_type_t inode_type = IA_INVAL;
+    ia_type_t inode_type1 = IA_INVAL;
+    afr_private_t *priv = NULL;
+
+    priv = this->private;
+
+    for (i = 0; i < priv->child_count; i++) {
+        if (!replies[i].valid || replies[i].op_ret != 0)
+            continue;
+
+        if (replies[i].poststat.ia_type == IA_INVAL)
+            continue;
+
+        if (inode_type == IA_INVAL) {
+            inode_type = replies[i].poststat.ia_type;
+            type_idx = i;
+            continue;
+        }
+        inode_type1 = replies[i].poststat.ia_type;
+        if (sources[i] || source == -1) {
+            if ((sources[type_idx] || source == -1) &&
+                (inode_type != inode_type1)) {
+                gf_msg(this->name, GF_LOG_WARNING, 0, AFR_MSG_SPLIT_BRAIN,
+                       "Type mismatch for <gfid:%s>/%s: "
+                       "%s on %s and %s on %s",
+                       uuid_utoa(pargfid), bname,
+                       gf_inode_type_to_str(inode_type1),
+                       priv->children[i]->name,
+                       gf_inode_type_to_str(inode_type),
+                       priv->children[type_idx]->name);
+                gf_event(EVENT_AFR_SPLIT_BRAIN,
+                         "client-pid=%d;"
+                         "subvol=%s;type=file;"
+                         "file=<gfid:%s>/%s;count=2;"
+                         "child-%d=%s;type-%d=%s;child-%d=%s;"
+                         "type-%d=%s",
+                         this->ctx->cmd_args.client_pid, this->name,
+                         uuid_utoa(pargfid), bname, i, priv->children[i]->name,
+                         i, gf_inode_type_to_str(inode_type1), type_idx,
+                         priv->children[type_idx]->name, type_idx,
+                         gf_inode_type_to_str(inode_type));
+                return -EIO;
+            }
+            inode_type = replies[i].poststat.ia_type;
+            type_idx = i;
+        }
+    }
+    return 0;
+}
+
+static int
+afr_selfheal_name_gfid_mismatch_check(xlator_t *this, struct afr_reply *replies,
+                                      int source, unsigned char *sources,
+                                      int *gfid_idx, uuid_t pargfid,
+                                      const char *bname, inode_t *inode,
+                                      unsigned char *locked_on, dict_t *xdata)
+{
+    int i = 0;
+    int gfid_idx_iter = -1;
+    int ret = -1;
+    void *gfid = NULL;
+    void *gfid1 = NULL;
+    afr_private_t *priv = NULL;
+
+    priv = this->private;
+
+    for (i = 0; i < priv->child_count; i++) {
+        if (!replies[i].valid || replies[i].op_ret != 0)
+            continue;
+
+        if (gf_uuid_is_null(replies[i].poststat.ia_gfid))
+            continue;
+
+        if (!gfid) {
+            gfid = &replies[i].poststat.ia_gfid;
+            gfid_idx_iter = i;
+            continue;
+        }
+
+        gfid1 = &replies[i].poststat.ia_gfid;
+        if (sources[i] || source == -1) {
+            if ((sources[gfid_idx_iter] || source == -1) &&
+                gf_uuid_compare(gfid, gfid1)) {
+                ret = afr_gfid_split_brain_source(this, replies, inode, pargfid,
+                                                  bname, gfid_idx_iter, i,
+                                                  locked_on, gfid_idx, xdata);
+                if (!ret && *gfid_idx >= 0) {
+                    ret = dict_set_sizen_str_sizen(xdata, "gfid-heal-msg",
+                                                   "GFID split-brain resolved");
+                    if (ret)
+                        gf_msg(this->name, GF_LOG_ERROR, 0,
+                               AFR_MSG_DICT_SET_FAILED,
+                               "Error setting gfid-"
+                               "heal-msg dict");
+                }
+                return ret;
+            }
+            gfid = &replies[i].poststat.ia_gfid;
+            gfid_idx_iter = i;
+        }
+    }
+
+    *gfid_idx = gfid_idx_iter;
+    return 0;
+}
+
+static gf_boolean_t
+afr_selfheal_name_source_empty_check(xlator_t *this, struct afr_reply *replies,
+                                     unsigned char *sources, int source)
+{
+    int i = 0;
+    afr_private_t *priv = NULL;
+    gf_boolean_t source_is_empty = _gf_true;
+
+    priv = this->private;
+
+    if (source == -1) {
+        source_is_empty = _gf_false;
+        goto out;
+    }
+
+    for (i = 0; i < priv->child_count; i++) {
+        if (!sources[i])
+            continue;
+
+        if (replies[i].op_ret == -1 && replies[i].op_errno == ENOENT)
+            continue;
+
+        source_is_empty = _gf_false;
+        break;
+    }
+out:
+    return source_is_empty;
+}
+
+int
+__afr_selfheal_name_do(call_frame_t *frame, xlator_t *this, inode_t *parent,
+                       uuid_t pargfid, const char *bname, inode_t *inode,
+                       unsigned char *sources, unsigned char *sinks,
+                       unsigned char *healed_sinks, int source,
+                       unsigned char *locked_on, struct afr_reply *replies,
+                       void *gfid_req, dict_t *xdata)
+{
+    int gfid_idx = -1;
+    int ret = -1;
+    void *gfid = NULL;
+    gf_boolean_t source_is_empty = _gf_true;
+    gf_boolean_t need_heal = _gf_false;
+    gf_boolean_t is_gfid_absent = _gf_false;
+
+    need_heal = afr_selfheal_name_need_heal_check(this, replies);
+    if (!need_heal)
+        return 0;
+
+    source_is_empty = afr_selfheal_name_source_empty_check(this, replies,
+                                                           sources, source);
+    if (source_is_empty) {
+        ret = __afr_selfheal_name_expunge(this, parent, pargfid, bname, inode,
+                                          replies);
+        if (ret == -EIO)
+            ret = -1;
+        return ret;
+    }
+
+    ret = afr_selfheal_name_type_mismatch_check(this, replies, source, sources,
+                                                pargfid, bname);
+    if (ret)
+        return ret;
+
+    ret = afr_selfheal_name_gfid_mismatch_check(this, replies, source, sources,
+                                                &gfid_idx, pargfid, bname,
+                                                inode, locked_on, xdata);
+    if (ret)
+        return ret;
+
+    if (gfid_idx == -1) {
+        if (!gfid_req || gf_uuid_is_null(gfid_req))
+            return -1;
+        gfid = gfid_req;
+    } else {
+        gfid = &replies[gfid_idx].poststat.ia_gfid;
+        if (source == -1)
+            /* Either entry split-brain or dirty xattrs are present on parent.*/
+            source = gfid_idx;
+    }
+
+    is_gfid_absent = (gfid_idx == -1) ? _gf_true : _gf_false;
+    ret = __afr_selfheal_assign_gfid(this, parent, pargfid, bname, inode,
+                                     replies, gfid, locked_on, source, sources,
+                                     is_gfid_absent, &gfid_idx);
+    if (ret || (gfid_idx < 0))
+        return ret;
+
+    ret = __afr_selfheal_name_impunge(frame, this, parent, pargfid, bname,
+                                      inode, replies, gfid_idx);
+    if (ret == -EIO)
+        ret = -1;
+
+    return ret;
+}
+
+int
+__afr_selfheal_name_finalize_source(xlator_t *this, unsigned char *sources,
+                                    unsigned char *healed_sinks,
+                                    unsigned char *locked_on, uint64_t *witness)
+{
+    int i = 0;
+    afr_private_t *priv = NULL;
+    int source = -1;
+    int sources_count = 0;
+
+    priv = this->private;
+
+    sources_count = AFR_COUNT(sources, priv->child_count);
+
+    if ((AFR_CMP(locked_on, healed_sinks, priv->child_count) == 0) ||
+        !sources_count || afr_does_witness_exist(this, witness)) {
+        memset(sources, 0, sizeof(*sources) * priv->child_count);
+        afr_mark_active_sinks(this, sources, locked_on, healed_sinks);
+        return -1;
+    }
+
+    for (i = 0; i < priv->child_count; i++) {
+        if (sources[i]) {
+            source = i;
+            break;
+        }
+    }
+
+    return source;
+}
+
+int
+__afr_selfheal_name_prepare(call_frame_t *frame, xlator_t *this,
+                            inode_t *parent, uuid_t pargfid,
+                            unsigned char *locked_on, unsigned char *sources,
+                            unsigned char *sinks, unsigned char *healed_sinks,
+                            int *source_p)
+{
+    int ret = -1;
+    int source = -1;
+    afr_private_t *priv = NULL;
+    struct afr_reply *replies = NULL;
+    uint64_t *witness = NULL;
+
+    priv = this->private;
+
+    replies = alloca0(priv->child_count * sizeof(*replies));
+
+    ret = afr_selfheal_unlocked_discover(frame, parent, pargfid, replies);
+    if (ret)
+        goto out;
+
+    witness = alloca0(sizeof(*witness) * priv->child_count);
+    ret = afr_selfheal_find_direction(frame, this, replies,
+                                      AFR_ENTRY_TRANSACTION, locked_on, sources,
+                                      sinks, witness, NULL);
+    if (ret)
+        goto out;
+
+    /* Initialize the healed_sinks[] array optimistically to
+       the intersection of to-be-healed (i.e sinks[]) and
+       the list of servers which are up (i.e locked_on[]).
+
+       As we encounter failures in the healing process, we
+       will unmark the respective servers in the healed_sinks[]
+       array.
+    */
+    AFR_INTERSECT(healed_sinks, sinks, locked_on, priv->child_count);
+
+    source = __afr_selfheal_name_finalize_source(this, sources, healed_sinks,
+                                                 locked_on, witness);
+    if (source < 0) {
+        /* If source is < 0 (typically split-brain), we perform a
+           conservative merge of entries rather than erroring out */
+    }
+    *source_p = source;
+
+out:
+    if (replies)
+        afr_replies_wipe(replies, priv->child_count);
+
+    return ret;
+}
+
+int
+afr_selfheal_name_do(call_frame_t *frame, xlator_t *this, inode_t *parent,
+                     uuid_t pargfid, const char *bname, void *gfid_req,
+                     dict_t *xdata)
+{
+    afr_private_t *priv = NULL;
+    unsigned char *sources = NULL;
+    unsigned char *sinks = NULL;
+    unsigned char *healed_sinks = NULL;
+    unsigned char *locked_on = NULL;
+    int source = -1;
+    struct afr_reply *replies = NULL;
+    int ret = -1;
+    inode_t *inode = NULL;
+    dict_t *xattr = NULL;
+
+    xattr = dict_new();
+    if (!xattr)
+        return -ENOMEM;
+
+    ret = dict_set_int32_sizen(xattr, GF_GFIDLESS_LOOKUP, 1);
+    if (ret) {
+        dict_unref(xattr);
+        return -1;
+    }
+
+    priv = this->private;
+
+    locked_on = alloca0(priv->child_count);
+    sources = alloca0(priv->child_count);
+    sinks = alloca0(priv->child_count);
+    healed_sinks = alloca0(priv->child_count);
+
+    replies = alloca0(priv->child_count * sizeof(*replies));
+
+    ret = afr_selfheal_entrylk(frame, this, parent, this->name, bname,
+                               locked_on);
+    {
+        if (ret < priv->child_count) {
+            ret = -ENOTCONN;
+            goto unlock;
+        }
+
+        ret = __afr_selfheal_name_prepare(frame, this, parent, pargfid,
+                                          locked_on, sources, sinks,
+                                          healed_sinks, &source);
+        if (ret)
+            goto unlock;
+
+        inode = afr_selfheal_unlocked_lookup_on(frame, parent, bname, replies,
+                                                locked_on, xattr);
+        if (!inode) {
+            ret = -ENOMEM;
+            goto unlock;
+        }
+
+        ret = __afr_selfheal_name_do(frame, this, parent, pargfid, bname, inode,
+                                     sources, sinks, healed_sinks, source,
+                                     locked_on, replies, gfid_req, xdata);
+    }
+unlock:
+    afr_selfheal_unentrylk(frame, this, parent, this->name, bname, locked_on,
+                           NULL);
+    if (inode)
+        inode_unref(inode);
+
+    if (replies)
+        afr_replies_wipe(replies, priv->child_count);
+    if (xattr)
+        dict_unref(xattr);
+
+    return ret;
+}
+
+int
+afr_selfheal_name_unlocked_inspect(call_frame_t *frame, xlator_t *this,
+                                   inode_t *parent, uuid_t pargfid,
+                                   const char *bname, gf_boolean_t *need_heal)
+{
+    afr_private_t *priv = NULL;
+    int i = 0;
+    struct afr_reply *replies = NULL;
+    inode_t *inode = NULL;
+    int first_idx = -1;
+    afr_local_t *local = NULL;
+
+    priv = this->private;
+    local = frame->local;
+
+    replies = alloca0(sizeof(*replies) * priv->child_count);
+
+    inode = afr_selfheal_unlocked_lookup_on(frame, parent, bname, replies,
+                                            local->child_up, NULL);
+    if (!inode)
+        return -ENOMEM;
+
+    for (i = 0; i < priv->child_count; i++) {
+        if (!replies[i].valid)
+            continue;
+
+        if ((replies[i].op_ret == -1) && (replies[i].op_errno == ENODATA)) {
+            *need_heal = _gf_true;
+            break;
+        }
+
+        if (first_idx == -1) {
+            first_idx = i;
+            continue;
+        }
+
+        if (replies[i].op_ret != replies[first_idx].op_ret) {
+            *need_heal = _gf_true;
+            break;
+        }
+
+        if (gf_uuid_compare(replies[i].poststat.ia_gfid,
+                            replies[first_idx].poststat.ia_gfid)) {
+            *need_heal = _gf_true;
+            break;
+        }
+    }
+
+    if (inode)
+        inode_unref(inode);
+    if (replies)
+        afr_replies_wipe(replies, priv->child_count);
+    return 0;
+}
+
+int
+afr_selfheal_name(xlator_t *this, uuid_t pargfid, const char *bname,
+                  void *gfid_req, dict_t *xdata)
+{
+    inode_t *parent = NULL;
+    call_frame_t *frame = NULL;
+    int ret = -1;
+    gf_boolean_t need_heal = _gf_false;
+
+    parent = afr_inode_find(this, pargfid);
+    if (!parent)
+        goto out;
+
+    frame = afr_frame_create(this, NULL);
+    if (!frame)
+        goto out;
+
+    ret = afr_selfheal_name_unlocked_inspect(frame, this, parent, pargfid,
+                                             bname, &need_heal);
+    if (ret)
+        goto out;
+
+    if (need_heal) {
+        ret = afr_selfheal_name_do(frame, this, parent, pargfid, bname,
+                                   gfid_req, xdata);
+        if (ret)
+            goto out;
+    }
+
+    ret = 0;
+out:
+    if (parent)
+        inode_unref(parent);
+    if (frame)
+        AFR_STACK_DESTROY(frame);
+
+    return ret;
+}
diff --git a/xlators/cluster/afr/src/afr-self-heal.h b/xlators/cluster/afr/src/afr-self-heal.h
index c98831b8bca..48e6dbcfb18 100644
--- a/xlators/cluster/afr/src/afr-self-heal.h
+++ b/xlators/cluster/afr/src/afr-self-heal.h
@@ -1,52 +1,377 @@
 /*
-   Copyright (c) 2008-2009 Z RESEARCH, Inc. <http://www.zresearch.com>
-   This file is part of GlusterFS.
-
-   GlusterFS is free software; you can redistribute it and/or modify
-   it under the terms of the GNU General Public License as published
-   by the Free Software Foundation; either version 3 of the License,
-   or (at your option) any later version.
-
-   GlusterFS is distributed in the hope that it will be useful, but
-   WITHOUT ANY WARRANTY; without even the implied warranty of
-   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
-   General Public License for more details.
-
-   You should have received a copy of the GNU General Public License
-   along with this program.  If not, see
-   <http://www.gnu.org/licenses/>.
+  Copyright (c) 2013 Red Hat, Inc. <http://www.redhat.com>
+  This file is part of GlusterFS.
+
+  This file is licensed to you under your choice of the GNU Lesser
+  General Public License, version 3 or any later version (LGPLv3 or
+  later), or the GNU General Public License, version 2 (GPLv2), in all
+  cases as published by the Free Software Foundation.
 */
 
-#ifndef __AFR_SELF_HEAL_H__
-#define __AFR_SELF_HEAL_H__
+#ifndef _AFR_SELFHEAL_H
+#define _AFR_SELFHEAL_H
+
+/* Perform fop on all UP subvolumes and wait for all callbacks to return */
+
+#define AFR_ONALL(frame, rfn, fop, args...)                                    \
+    do {                                                                       \
+        afr_local_t *__local = frame->local;                                   \
+        afr_private_t *__priv = frame->this->private;                          \
+        int __i = 0, __count = 0;                                              \
+        unsigned char *__child_up = alloca(__priv->child_count);               \
+                                                                               \
+        memcpy(__child_up, __priv->child_up,                                   \
+               sizeof(*__child_up) * __priv->child_count);                     \
+        __count = AFR_COUNT(__child_up, __priv->child_count);                  \
+                                                                               \
+        __local->barrier.waitfor = __count;                                    \
+        afr_local_replies_wipe(__local, __priv);                               \
+                                                                               \
+        for (__i = 0; __i < __priv->child_count; __i++) {                      \
+            if (!__child_up[__i])                                              \
+                continue;                                                      \
+            STACK_WIND_COOKIE(frame, rfn, (void *)(long)__i,                   \
+                              __priv->children[__i],                           \
+                              __priv->children[__i]->fops->fop, args);         \
+        }                                                                      \
+        syncbarrier_wait(&__local->barrier, __count);                          \
+    } while (0)
+
+/* Perform fop on all subvolumes represented by list[] array and wait
+   for all callbacks to return */
+
+#define AFR_ONLIST(list, frame, rfn, fop, args...)                             \
+    do {                                                                       \
+        afr_local_t *__local = frame->local;                                   \
+        afr_private_t *__priv = frame->this->private;                          \
+        int __i = 0;                                                           \
+        int __count = 0;                                                       \
+        unsigned char *__list = alloca(__priv->child_count);                   \
+                                                                               \
+        memcpy(__list, list, sizeof(*__list) * __priv->child_count);           \
+        __count = AFR_COUNT(__list, __priv->child_count);                      \
+        __local->barrier.waitfor = __count;                                    \
+        afr_local_replies_wipe(__local, __priv);                               \
+                                                                               \
+        for (__i = 0; __i < __priv->child_count; __i++) {                      \
+            if (!__list[__i])                                                  \
+                continue;                                                      \
+            STACK_WIND_COOKIE(frame, rfn, (void *)(long)__i,                   \
+                              __priv->children[__i],                           \
+                              __priv->children[__i]->fops->fop, args);         \
+        }                                                                      \
+        syncbarrier_wait(&__local->barrier, __count);                          \
+    } while (0)
+
+#define AFR_SEQ(frame, rfn, fop, args...)                                      \
+    do {                                                                       \
+        afr_local_t *__local = frame->local;                                   \
+        afr_private_t *__priv = frame->this->private;                          \
+        int __i = 0;                                                           \
+                                                                               \
+        afr_local_replies_wipe(__local, __priv);                               \
+                                                                               \
+        for (__i = 0; __i < __priv->child_count; __i++) {                      \
+            if (!__priv->child_up[__i])                                        \
+                continue;                                                      \
+            STACK_WIND_COOKIE(frame, rfn, (void *)(long)__i,                   \
+                              __priv->children[__i],                           \
+                              __priv->children[__i]->fops->fop, args);         \
+            syncbarrier_wait(&__local->barrier, 1);                            \
+        }                                                                      \
+    } while (0)
+
+#define ALLOC_MATRIX(n, type)                                                  \
+    ({                                                                         \
+        int __i;                                                               \
+        type **__ptr = alloca(n * sizeof(type *));                             \
+                                                                               \
+        for (__i = 0; __i < n; __i++)                                          \
+            __ptr[__i] = alloca0(n * sizeof(type));                            \
+        __ptr;                                                                 \
+    })
+
+#define IA_EQUAL(f, s, field)                                                  \
+    (memcmp(&(f.ia_##field), &(s.ia_##field), sizeof(s.ia_##field)) == 0)
+
+#define SBRAIN_HEAL_NO_GO_MSG                                                  \
+    "Failed to obtain replies from all bricks of "                             \
+    "the replica (are they up?). Cannot resolve split-brain."
+#define SFILE_NOT_IN_SPLIT_BRAIN "File not in split-brain"
+#define SNO_BIGGER_FILE "No bigger file"
+#define SNO_DIFF_IN_MTIME "No difference in mtime"
+#define SUSE_SOURCE_BRICK_TO_HEAL                                              \
+    "Use source-brick option to heal metadata"                                 \
+    " split-brain"
+#define SINVALID_BRICK_NAME "Invalid brick name"
+#define SBRICK_IS_NOT_UP "Brick is not up"
+#define SBRICK_NOT_CONNECTED "Brick is not connected"
+#define SLESS_THAN2_BRICKS_in_REP "< 2 bricks in replica are up"
+#define SBRICK_IS_REMOTE "Brick is remote"
+#define SSTARTED_SELF_HEAL "Started self-heal"
+#define SOP_NOT_SUPPORTED "Operation Not Supported"
+#define SFILE_NOT_UNDER_DATA                                                   \
+    "The file is not under data or metadata "                                  \
+    "split-brain"
+#define SFILE_NOT_IN_SPLIT_BRAIN "File not in split-brain"
+#define SALL_BRICKS_UP_TO_RESOLVE                                              \
+    "All the bricks should be up to resolve the"                               \
+    " gfid split brain"
+#define SERROR_GETTING_SRC_BRICK "Error getting the source brick"
+int
+afr_selfheal(xlator_t *this, uuid_t gfid);
+
+gf_boolean_t
+afr_throttled_selfheal(call_frame_t *frame, xlator_t *this);
 
-#include <sys/stat.h>
+int
+afr_selfheal_name(xlator_t *this, uuid_t gfid, const char *name, void *gfid_req,
+                  dict_t *xdata);
 
-#define FILETYPE_DIFFERS(buf1,buf2) ((S_IFMT & ((struct stat *)buf1)->st_mode) != (S_IFMT & ((struct stat *)buf2)->st_mode))
-#define PERMISSION_DIFFERS(buf1,buf2) ((((struct stat *)buf1)->st_mode) != (((struct stat *)buf2)->st_mode))
-#define OWNERSHIP_DIFFERS(buf1,buf2) ((((struct stat *)buf1)->st_uid) != (((struct stat *)buf2)->st_uid) || (((struct stat *)buf1)->st_gid != (((struct stat *)buf2)->st_gid)))
-#define SIZE_DIFFERS(buf1,buf2) ((((struct stat *)buf1)->st_size) != (((struct stat *)buf2)->st_size))
+int
+afr_selfheal_data(call_frame_t *frame, xlator_t *this, fd_t *fd);
 
+int
+afr_selfheal_metadata(call_frame_t *frame, xlator_t *this, inode_t *inode);
 
+int
+afr_selfheal_entry(call_frame_t *frame, xlator_t *this, inode_t *inode);
 
 int
-afr_sh_has_metadata_pending (dict_t *xattr, int child_count, xlator_t *this);
+afr_lookup_and_heal_gfid(xlator_t *this, inode_t *parent, const char *name,
+                         inode_t *inode, struct afr_reply *replies, int source,
+                         unsigned char *sources, void *gfid, int *gfid_idx);
+
 int
-afr_sh_has_entry_pending (dict_t *xattr, int child_count, xlator_t *this);
+afr_selfheal_inodelk(call_frame_t *frame, xlator_t *this, inode_t *inode,
+                     char *dom, off_t off, size_t size,
+                     unsigned char *locked_on);
+
 int
-afr_sh_has_data_pending (dict_t *xattr, int child_count, xlator_t *this);
+afr_selfheal_tryinodelk(call_frame_t *frame, xlator_t *this, inode_t *inode,
+                        char *dom, off_t off, size_t size,
+                        unsigned char *locked_on);
 
 int
-afr_self_heal_entry (call_frame_t *frame, xlator_t *this);
+afr_selfheal_tie_breaker_inodelk(call_frame_t *frame, xlator_t *this,
+                                 inode_t *inode, char *dom, off_t off,
+                                 size_t size, unsigned char *locked_on);
 
 int
-afr_self_heal_data (call_frame_t *frame, xlator_t *this);
+afr_selfheal_uninodelk(call_frame_t *frame, xlator_t *this, inode_t *inode,
+                       char *dom, off_t off, size_t size,
+                       const unsigned char *locked_on);
 
 int
-afr_self_heal_metadata (call_frame_t *frame, xlator_t *this);
+afr_selfheal_entrylk(call_frame_t *frame, xlator_t *this, inode_t *inode,
+                     char *dom, const char *name, unsigned char *locked_on);
+
+int
+afr_selfheal_tryentrylk(call_frame_t *frame, xlator_t *this, inode_t *inode,
+                        char *dom, const char *name, unsigned char *locked_on);
+
+int
+afr_selfheal_tie_breaker_entrylk(call_frame_t *frame, xlator_t *this,
+                                 inode_t *inode, char *dom, const char *name,
+                                 unsigned char *locked_on);
+
+int
+afr_selfheal_unentrylk(call_frame_t *frame, xlator_t *this, inode_t *inode,
+                       char *dom, const char *name, unsigned char *locked_on,
+                       dict_t *xdata);
+
+int
+afr_selfheal_unlocked_discover(call_frame_t *frame, inode_t *inode, uuid_t gfid,
+                               struct afr_reply *replies);
+
+int
+afr_selfheal_unlocked_discover_on(call_frame_t *frame, inode_t *inode,
+                                  uuid_t gfid, struct afr_reply *replies,
+                                  unsigned char *discover_on, dict_t *dict);
+inode_t *
+afr_selfheal_unlocked_lookup_on(call_frame_t *frame, inode_t *parent,
+                                const char *name, struct afr_reply *replies,
+                                unsigned char *lookup_on, dict_t *xattr);
+
+int
+afr_selfheal_find_direction(call_frame_t *frame, xlator_t *this,
+                            struct afr_reply *replies,
+                            afr_transaction_type type, unsigned char *locked_on,
+                            unsigned char *sources, unsigned char *sinks,
+                            uint64_t *witness, unsigned char *flag);
+int
+afr_selfheal_fill_matrix(xlator_t *this, int **matrix, int subvol, int idx,
+                         dict_t *xdata);
+
+int
+afr_selfheal_extract_xattr(xlator_t *this, struct afr_reply *replies,
+                           afr_transaction_type type, int *dirty, int **matrix);
+
+int
+afr_sh_generic_fop_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+                       int op_ret, int op_errno, struct iatt *pre,
+                       struct iatt *post, dict_t *xdata);
+
+int
+afr_selfheal_restore_time(call_frame_t *frame, xlator_t *this, inode_t *inode,
+                          int source, unsigned char *healed_sinks,
+                          struct afr_reply *replies);
+int
+afr_selfheal_undo_pending(call_frame_t *frame, xlator_t *this, inode_t *inode,
+                          unsigned char *sources, unsigned char *sinks,
+                          unsigned char *healed_sinks,
+                          unsigned char *undid_pending,
+                          afr_transaction_type type, struct afr_reply *replies,
+                          unsigned char *locked_on);
+
+int
+afr_selfheal_recreate_entry(call_frame_t *frame, int dst, int source,
+                            unsigned char *sources, inode_t *dir,
+                            const char *name, inode_t *inode,
+                            struct afr_reply *replies);
+
+int
+afr_selfheal_post_op(call_frame_t *frame, xlator_t *this, inode_t *inode,
+                     int subvol, dict_t *xattr, dict_t *xdata);
+
+call_frame_t *
+afr_frame_create(xlator_t *this, int32_t *op_errno);
+
+inode_t *
+afr_inode_find(xlator_t *this, uuid_t gfid);
 
 int
-afr_self_heal (call_frame_t *frame, xlator_t *this,
-	       int (*completion_cbk) (call_frame_t *, xlator_t *));
+afr_selfheal_discover_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+                          int op_ret, int op_errno, inode_t *inode,
+                          struct iatt *buf, dict_t *xdata, struct iatt *parbuf);
+void
+afr_reply_copy(struct afr_reply *dst, struct afr_reply *src);
 
-#endif /* __AFR_SELF_HEAL_H__ */
+void
+afr_replies_copy(struct afr_reply *dst, struct afr_reply *src, int count);
+
+int
+afr_selfheal_newentry_mark(call_frame_t *frame, xlator_t *this, inode_t *inode,
+                           int source, struct afr_reply *replies,
+                           unsigned char *sources, unsigned char *newentry);
+
+unsigned int
+afr_success_count(struct afr_reply *replies, unsigned int count);
+
+void
+afr_log_selfheal(uuid_t gfid, xlator_t *this, int ret, char *type, int source,
+                 unsigned char *sources, unsigned char *healed_sinks);
+
+void
+afr_mark_largest_file_as_source(xlator_t *this, unsigned char *sources,
+                                struct afr_reply *replies);
+void
+afr_mark_active_sinks(xlator_t *this, unsigned char *sources,
+                      unsigned char *locked_on, unsigned char *sinks);
+
+gf_boolean_t
+afr_dict_contains_heal_op(call_frame_t *frame);
+
+gf_boolean_t
+afr_can_decide_split_brain_source_sinks(struct afr_reply *replies,
+                                        int child_count);
+int
+afr_mark_split_brain_source_sinks(
+    call_frame_t *frame, xlator_t *this, inode_t *inode, unsigned char *sources,
+    unsigned char *sinks, unsigned char *healed_sinks, unsigned char *locked_on,
+    struct afr_reply *replies, afr_transaction_type type);
+
+int
+afr_sh_get_fav_by_policy(xlator_t *this, struct afr_reply *replies,
+                         inode_t *inode, char **policy_str);
+
+int
+_afr_fav_child_reset_sink_xattrs(call_frame_t *frame, xlator_t *this,
+                                 inode_t *inode, int source,
+                                 unsigned char *healed_sinks,
+                                 unsigned char *undid_pending,
+                                 afr_transaction_type type,
+                                 unsigned char *locked_on,
+                                 struct afr_reply *replies);
+
+int
+afr_get_child_index_from_name(xlator_t *this, char *name);
+
+gf_boolean_t
+afr_does_witness_exist(xlator_t *this, uint64_t *witness);
+
+int
+__afr_selfheal_data_prepare(call_frame_t *frame, xlator_t *this, inode_t *inode,
+                            unsigned char *locked_on, unsigned char *sources,
+                            unsigned char *sinks, unsigned char *healed_sinks,
+                            unsigned char *undid_pending,
+                            struct afr_reply *replies, unsigned char *flag);
+
+int
+__afr_selfheal_metadata_prepare(call_frame_t *frame, xlator_t *this,
+                                inode_t *inode, unsigned char *locked_on,
+                                unsigned char *sources, unsigned char *sinks,
+                                unsigned char *healed_sinks,
+                                unsigned char *undid_pending,
+                                struct afr_reply *replies, unsigned char *flag);
+int
+__afr_selfheal_entry_prepare(call_frame_t *frame, xlator_t *this,
+                             inode_t *inode, unsigned char *locked_on,
+                             unsigned char *sources, unsigned char *sinks,
+                             unsigned char *healed_sinks,
+                             struct afr_reply *replies, int *source_p,
+                             unsigned char *flag);
+
+int
+afr_selfheal_unlocked_inspect(call_frame_t *frame, xlator_t *this, uuid_t gfid,
+                              inode_t **link_inode, gf_boolean_t *data_selfheal,
+                              gf_boolean_t *metadata_selfheal,
+                              gf_boolean_t *entry_selfheal,
+                              struct afr_reply *replies);
+
+int
+afr_selfheal_do(call_frame_t *frame, xlator_t *this, uuid_t gfid);
+
+int
+afr_selfheal_lock_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+                      int op_ret, int op_errno, dict_t *xdata);
+
+int
+afr_locked_fill(call_frame_t *frame, xlator_t *this, unsigned char *locked_on);
+int
+afr_choose_source_by_policy(afr_private_t *priv, unsigned char *sources,
+                            afr_transaction_type type);
+
+int
+afr_selfheal_metadata_by_stbuf(xlator_t *this, struct iatt *stbuf);
+
+int
+afr_sh_fav_by_size(xlator_t *this, struct afr_reply *replies, inode_t *inode);
+int
+afr_sh_fav_by_mtime(xlator_t *this, struct afr_reply *replies, inode_t *inode);
+int
+afr_sh_fav_by_ctime(xlator_t *this, struct afr_reply *replies, inode_t *inode);
+
+int
+afr_gfid_split_brain_source(xlator_t *this, struct afr_reply *replies,
+                            inode_t *inode, uuid_t pargfid, const char *bname,
+                            int src_idx, int child_idx,
+                            unsigned char *locked_on, int *src, dict_t *xdata);
+int
+afr_mark_source_sinks_if_file_empty(xlator_t *this, unsigned char *sources,
+                                    unsigned char *sinks,
+                                    unsigned char *healed_sinks,
+                                    unsigned char *locked_on,
+                                    struct afr_reply *replies,
+                                    afr_transaction_type type);
+
+gf_boolean_t
+afr_is_file_empty_on_all_children(afr_private_t *priv,
+                                  struct afr_reply *replies);
+
+int
+afr_selfheal_entry_delete(xlator_t *this, inode_t *dir, const char *name,
+                          inode_t *inode, int child, struct afr_reply *replies);
+int
+afr_anon_inode_create(xlator_t *this, int child, inode_t **linked_inode);
+#endif /* !_AFR_SELFHEAL_H */
diff --git a/xlators/cluster/afr/src/afr-self-heald.c b/xlators/cluster/afr/src/afr-self-heald.c
new file mode 100644
index 00000000000..109fd4b7421
--- /dev/null
+++ b/xlators/cluster/afr/src/afr-self-heald.c
@@ -0,0 +1,1716 @@
+/*
+  Copyright (c) 2013 Red Hat, Inc. <http://www.redhat.com>
+  This file is part of GlusterFS.
+
+  This file is licensed to you under your choice of the GNU Lesser
+  General Public License, version 3 or any later version (LGPLv3 or
+  later), or the GNU General Public License, version 2 (GPLv2), in all
+  cases as published by the Free Software Foundation.
+*/
+
+#include "afr.h"
+#include "afr-self-heal.h"
+#include "afr-self-heald.h"
+#include "protocol-common.h"
+#include <glusterfs/syncop-utils.h>
+#include "afr-messages.h"
+#include <glusterfs/byte-order.h>
+
+#define AFR_EH_SPLIT_BRAIN_LIMIT 1024
+#define AFR_STATISTICS_HISTORY_SIZE 50
+/* Loop-body only: breaks or continues when healer->subvol is not local. */
+#define ASSERT_LOCAL(this, healer)                                             \
+    if (!afr_shd_is_subvol_local(this, healer->subvol)) {                      \
+        healer->local = _gf_false;                                             \
+        if (safe_break(healer)) {                                              \
+            break;                                                             \
+        } else {                                                               \
+            continue;                                                          \
+        }                                                                      \
+    } else {                                                                   \
+        healer->local = _gf_true;                                              \
+    }
+/* Address of the n-th index/full healer slot in this xlator's shd state. */
+#define NTH_INDEX_HEALER(this, n)                                              \
+    &((((afr_private_t *)this->private))->shd.index_healers[n])
+#define NTH_FULL_HEALER(this, n)                                               \
+    &((((afr_private_t *)this->private))->shd.full_healers[n])
+/* Return the name of child 'subvol', or NULL if the index is out of range. */
+char *
+afr_subvol_name(xlator_t *this, int subvol)
+{
+    afr_private_t *priv = NULL;
+
+    priv = this->private;
+    if (subvol < 0 || subvol >= priv->child_count) /* valid: 0..count-1 */
+        return NULL;
+
+    return priv->children[subvol]->name;
+}
+/* No-op: 'data' is left untouched and nothing is released here. */
+void
+afr_destroy_crawl_event_data(void *data)
+{
+    (void)data; /* intentionally unused */
+}
+/* Free the path string held by a shd_event_t; a NULL 'data' is ignored. */
+void
+afr_destroy_shd_event_data(void *data)
+{
+    shd_event_t *event = NULL;
+
+    event = data;
+    /* Only the path is freed here; the event itself is not freed. */
+    if (event != NULL) {
+        GF_FREE(event->path);
+    }
+}
+/* Ask syncop_is_subvol_local() whether child 'subvol' is local (root loc). */
+gf_boolean_t
+afr_shd_is_subvol_local(xlator_t *this, int subvol)
+{
+    afr_private_t *priv = this->private;
+    gf_boolean_t local = _gf_false;
+    loc_t rootloc = {0};
+
+    /* Probe with a loc describing the root inode of this xlator's table. */
+    rootloc.inode = this->itable->root;
+    gf_uuid_copy(rootloc.gfid, rootloc.inode->gfid);
+
+    /* The call's return value is ignored; 'local' starts out as false. */
+    syncop_is_subvol_local(priv->children[subvol], &rootloc, &local);
+    return local;
+}
+/* Wait (healer->mutex held by the caller) for a rerun or the shd timeout. */
+int
+__afr_shd_healer_wait(struct subvol_healer *healer)
+{
+    afr_private_t *priv = healer->this->private;
+    struct timespec deadline = {0};
+    int rerun = 0;
+    int rc = 0;
+
+    /*
+     * Returns the value healer->rerun had when the wait ended (0 after a
+     * timeout) and clears it. While the self-heal daemon is disabled the
+     * whole wait is repeated with a fresh deadline instead of returning.
+     */
+    do {
+        deadline.tv_sec = gf_time() + priv->shd.timeout;
+
+        while (!healer->rerun) {
+            rc = pthread_cond_timedwait(&healer->cond, &healer->mutex,
+                                        &deadline);
+            if (rc == ETIMEDOUT)
+                break;
+        }
+        rerun = healer->rerun;
+        healer->rerun = 0;
+    } while (!priv->shd.enabled);
+
+    return rerun;
+}
+/* Locked wrapper: take healer->mutex around __afr_shd_healer_wait(). */
+int
+afr_shd_healer_wait(struct subvol_healer *healer)
+{
+    int rerun = 0;
+
+    /* The helper waits on healer->cond together with this mutex. */
+    pthread_mutex_lock(&healer->mutex);
+    rerun = __afr_shd_healer_wait(healer);
+    pthread_mutex_unlock(&healer->mutex);
+
+    /* Whatever the unlocked helper reported is passed through untouched. */
+    return rerun;
+}
+/* Mark the healer as not running unless a rerun is pending. */
+gf_boolean_t
+safe_break(struct subvol_healer *healer)
+{
+    gf_boolean_t stopped = _gf_false;
+
+    /*
+     * Checked and updated under healer->mutex. Returns _gf_true when
+     * 'running' was cleared, _gf_false when a rerun request was pending.
+     */
+    pthread_mutex_lock(&healer->mutex);
+    if (!healer->rerun) {
+        healer->running = _gf_false;
+        stopped = _gf_true;
+    }
+    pthread_mutex_unlock(&healer->mutex);
+
+    return stopped;
+}
+/* Find the inode for 'gfid' via 'subvol'; NULL is returned on any failure. */
+inode_t *
+afr_shd_inode_find(xlator_t *this, xlator_t *subvol, uuid_t gfid)
+{
+    int ret = 0;
+    uint64_t val = IA_INVAL;
+    dict_t *xdata = NULL;
+    dict_t *rsp_dict = NULL;
+    inode_t *inode = NULL;
+
+    xdata = dict_new();
+    if (!xdata)
+        goto out;
+    /* Request flag; the matching response key is read from rsp_dict below. */
+    ret = dict_set_int8(xdata, GF_INDEX_IA_TYPE_GET_REQ, 1);
+    if (ret)
+        goto out;
+
+    ret = syncop_inode_find(this, subvol, gfid, &inode, xdata, &rsp_dict);
+    if (ret < 0)
+        goto out;
+
+    if (rsp_dict) {
+        ret = dict_get_uint64(rsp_dict, GF_INDEX_IA_TYPE_GET_RSP, &val);
+        if (ret)
+            goto out;
+    }
+    ret = inode_ctx_set2(inode, subvol, 0, &val); /* IA_INVAL if no rsp_dict */
+out:
+    if (ret && inode) { /* drop the reference when a later step failed */
+        inode_unref(inode);
+        inode = NULL;
+    }
+    if (xdata)
+        dict_unref(xdata);
+    if (rsp_dict)
+        dict_unref(rsp_dict);
+    return inode;
+}
+/* Resolve the index directory inode named by virtual xattr 'vgfid'. */
+inode_t *
+afr_shd_index_inode(xlator_t *this, xlator_t *subvol, char *vgfid)
+{
+    loc_t rootloc = {
+        0,
+    };
+    inode_t *inode = NULL;
+    int ret = 0;
+    dict_t *xattr = NULL;
+    void *index_gfid = NULL;
+
+    rootloc.inode = inode_ref(this->itable->root);
+    gf_uuid_copy(rootloc.gfid, rootloc.inode->gfid);
+    /* The value of xattr 'vgfid' on the root of 'subvol' is used as a gfid. */
+    ret = syncop_getxattr(subvol, &rootloc, &xattr, vgfid, NULL, NULL);
+    if (ret || !xattr) {
+        errno = -ret; /* afr_shd_index_sweep() reads errno on a NULL return */
+        goto out;
+    }
+
+    ret = dict_get_ptr(xattr, vgfid, &index_gfid);
+    if (ret)
+        goto out; /* NOTE(review): errno is not set on this path */
+
+    gf_msg_debug(this->name, 0, "%s dir gfid for %s: %s", vgfid, subvol->name,
+                 uuid_utoa(index_gfid));
+
+    inode = afr_shd_inode_find(this, subvol, index_gfid);
+
+out:
+    loc_wipe(&rootloc);
+
+    if (xattr)
+        dict_unref(xattr);
+
+    return inode;
+}
+/* Remove entry 'name' under directory 'inode' on 'subvol' (rmdir/unlink). */
+int
+afr_shd_entry_purge(xlator_t *subvol, inode_t *inode, char *name,
+                    ia_type_t type)
+{
+    loc_t entry_loc = {0};
+    int op_ret = 0;
+
+    /* Address the entry by parent inode + basename. */
+    entry_loc.name = name;
+    entry_loc.parent = inode_ref(inode);
+
+    /* Directories go through rmdir (flag 1), everything else is unlinked. */
+    if (!IA_ISDIR(type)) {
+        op_ret = syncop_unlink(subvol, &entry_loc, NULL, NULL);
+    } else {
+        op_ret = syncop_rmdir(subvol, &entry_loc, 1, NULL, NULL);
+    }
+    loc_wipe(&entry_loc);
+    return op_ret;
+}
+/* Post zeroed AFR_DIRTY/pending-key xattrs for 'gfid' to every child. */
+void
+afr_shd_zero_xattrop(xlator_t *this, uuid_t gfid)
+{
+    call_frame_t *frame = NULL;
+    inode_t *inode = NULL;
+    afr_private_t *priv = NULL;
+    dict_t *xattr = NULL;
+    int ret = 0;
+    int i = 0;
+    int raw[AFR_NUM_CHANGE_LOGS] = {0}; /* all-zero value used for every key */
+
+    priv = this->private;
+    frame = afr_frame_create(this, NULL);
+    if (!frame)
+        goto out;
+    inode = afr_inode_find(this, gfid);
+    if (!inode)
+        goto out;
+    xattr = dict_new();
+    if (!xattr)
+        goto out;
+    ret = dict_set_static_bin(xattr, AFR_DIRTY, raw,
+                              sizeof(int) * AFR_NUM_CHANGE_LOGS);
+    if (ret)
+        goto out;
+    for (i = 0; i < priv->child_count; i++) {
+        ret = dict_set_static_bin(xattr, priv->pending_key[i], raw,
+                                  sizeof(int) * AFR_NUM_CHANGE_LOGS);
+        if (ret)
+            goto out;
+    }
+
+    /* Send the xattrop to all bricks. Doing a lookup to see if bricks are up
+     * or have valid replies for this gfid seems a bit of an overkill. */
+    for (i = 0; i < priv->child_count; i++)
+        afr_selfheal_post_op(frame, this, inode, i, xattr, NULL);
+
+out:
+    if (frame)
+        AFR_STACK_DESTROY(frame);
+    if (inode)
+        inode_unref(inode);
+    if (xattr)
+        dict_unref(xattr);
+    return;
+}
+/* Name-heal 'bname' under 'parent'; 'healer' and 'child' are not used. */
+int
+afr_shd_selfheal_name(struct subvol_healer *healer, int child, uuid_t parent,
+                      const char *bname)
+{
+    (void)healer;
+    (void)child;
+
+    /* Runs against the current translator (THIS), not healer->this. */
+    return afr_selfheal_name(THIS, parent, bname, NULL, NULL);
+}
+/* Heal 'gfid', bump the healer's crawl counters, record split-brain paths. */
+int
+afr_shd_selfheal(struct subvol_healer *healer, int child, uuid_t gfid)
+{
+    int ret = 0;
+    eh_t *eh = NULL;
+    afr_private_t *priv = NULL;
+    afr_self_heald_t *shd = NULL;
+    shd_event_t *shd_event = NULL;
+    char *path = NULL;
+    xlator_t *subvol = NULL;
+    xlator_t *this = NULL;
+    crawl_event_t *crawl_event = NULL;
+
+    this = healer->this;
+    priv = this->private;
+    shd = &priv->shd;
+    crawl_event = &healer->crawl_event;
+
+    subvol = priv->children[child];
+
+    // If this fails with ENOENT/ESTALE index is stale
+    ret = syncop_gfid_to_path(this->itable, subvol, gfid, &path);
+    if (ret < 0)
+        return ret;
+
+    ret = afr_selfheal(this, gfid);
+    /* -EIO: split-brain; other negatives: failed; 0: healed; >0: uncounted. */
+    LOCK(&priv->lock);
+    {
+        if (ret == -EIO) {
+            eh = shd->split_brain;
+            crawl_event->split_brain_count++;
+        } else if (ret < 0) {
+            crawl_event->heal_failed_count++;
+        } else if (ret == 0) {
+            crawl_event->healed_count++;
+        }
+    }
+    UNLOCK(&priv->lock);
+
+    if (eh) { /* only set above for the split-brain case */
+        shd_event = GF_CALLOC(1, sizeof(*shd_event), gf_afr_mt_shd_event_t);
+        if (!shd_event)
+            goto out;
+
+        shd_event->child = child;
+        shd_event->path = path;
+
+        if (eh_save_history(eh, shd_event) < 0)
+            goto out;
+        /* Saved: clear the pointers so the frees at 'out' leave them alone. */
+        shd_event = NULL;
+        path = NULL;
+    }
+out:
+    GF_FREE(shd_event);
+    GF_FREE(path);
+    return ret; /* result of afr_selfheal(), not of the history bookkeeping */
+}
+/* Reset the healer's crawl statistics and stamp the sweep start time. */
+void
+afr_shd_sweep_prepare(struct subvol_healer *healer)
+{
+    crawl_event_t *stats = &healer->crawl_event;
+
+    stats->heal_failed_count = 0;
+    stats->split_brain_count = 0;
+    stats->healed_count = 0;
+
+    stats->end_time = 0;
+    stats->start_time = gf_time();
+
+    /* Unmasked again at the end of afr_shd_sweep_done(). */
+    _mask_cancellation();
+}
+/* Close a sweep: archive its statistics and unmask cancellation again. */
+void
+afr_shd_sweep_done(struct subvol_healer *healer)
+{
+    crawl_event_t *event = NULL;
+    crawl_event_t *history = NULL;
+    afr_self_heald_t *shd = NULL;
+
+    event = &healer->crawl_event;
+    shd = &(((afr_private_t *)healer->this->private)->shd);
+
+    event->end_time = gf_time();
+    history = gf_memdup(event, sizeof(*event));
+    event->start_time = 0;
+
+    /* If the copy failed, only this history record is lost. */
+    if (history &&
+        eh_save_history(shd->statistics[healer->subvol], history) < 0)
+        GF_FREE(history);
+    /* Always undo afr_shd_sweep_prepare()'s _mask_cancellation(). */
+    _unmask_cancellation();
+}
+/* Index-crawl callback: heal the gfid named by one index directory entry. */
+int
+afr_shd_index_heal(xlator_t *subvol, gf_dirent_t *entry, loc_t *parent,
+                   void *data)
+{
+    struct subvol_healer *healer = data;
+    afr_private_t *priv = NULL;
+    uuid_t gfid = {0};
+    int ret = 0;
+    uint64_t val = IA_INVAL;
+
+    priv = healer->this->private;
+    if (!priv->shd.enabled)
+        return -EBUSY;
+
+    gf_msg_debug(healer->this->name, 0, "got entry: %s from %s", entry->d_name,
+                 priv->children[healer->subvol]->name);
+
+    ret = gf_uuid_parse(entry->d_name, gfid);
+    if (ret)
+        return 0; /* the entry name is not a gfid: skip it */
+    /* Presumably the value stored on this inode by afr_shd_inode_find(). */
+    inode_ctx_get2(parent->inode, subvol, NULL, &val);
+
+    ret = afr_shd_selfheal(healer, healer->subvol, gfid);
+    /* A stale index entry is removed; 'val' is passed as the entry type. */
+    if (ret == -ENOENT || ret == -ESTALE)
+        afr_shd_entry_purge(subvol, parent->inode, entry->d_name, val);
+
+    if (ret == 2)
+        /* If bricks crashed in pre-op after creating indices/xattrop
+         * link but before setting afr changelogs, we end up with stale
+         * xattrop links but zero changelogs. Remove such entries by
+         * sending a post-op with zero changelogs.
+         */
+        afr_shd_zero_xattrop(healer->this, gfid);
+
+    return 0; /* heal failures never stop the directory scan */
+}
+/* Crawl one index directory ('vgfid') of the healer's child and heal it. */
+int
+afr_shd_index_sweep(struct subvol_healer *healer, char *vgfid)
+{
+    loc_t loc = {0};
+    afr_private_t *priv = NULL;
+    int ret = 0;
+    xlator_t *subvol = NULL;
+    dict_t *xdata = NULL;
+    call_frame_t *frame = NULL;
+
+    priv = healer->this->private;
+    subvol = priv->children[healer->subvol];
+
+    frame = afr_frame_create(healer->this, &ret);
+    if (!frame) {
+        ret = -ret; /* presumably a positive errno was stored in ret */
+        goto out;
+    }
+
+    loc.inode = afr_shd_index_inode(healer->this, subvol, vgfid);
+    if (!loc.inode) {
+        gf_msg(healer->this->name, GF_LOG_WARNING, 0,
+               AFR_MSG_INDEX_DIR_GET_FAILED, "unable to get index-dir on %s",
+               subvol->name);
+        ret = -errno; /* errno is set by afr_shd_index_inode() on getxattr failure */
+        goto out;
+    }
+
+    xdata = dict_new();
+    if (!xdata || dict_set_int32_sizen(xdata, "get-gfid-type", 1)) {
+        ret = -ENOMEM;
+        goto out;
+    }
+
+    ret = syncop_mt_dir_scan(frame, subvol, &loc, GF_CLIENT_PID_SELF_HEALD,
+                             healer, afr_shd_index_heal, xdata,
+                             priv->shd.max_threads, priv->shd.wait_qlength);
+    /* Success is reported as the healer's healed_count so far. */
+    if (ret == 0)
+        ret = healer->crawl_event.healed_count;
+
+out:
+    loc_wipe(&loc);
+
+    if (xdata)
+        dict_unref(xdata);
+    if (frame)
+        AFR_STACK_DESTROY(frame);
+    return ret;
+}
+/* Sweep the three xattrop index directories of this healer's subvolume. */
+int
+afr_shd_index_sweep_all(struct subvol_healer *healer)
+{
+    char *index_gfids[] = {
+        GF_XATTROP_INDEX_GFID,
+        GF_XATTROP_DIRTY_GFID,
+        GF_XATTROP_ENTRY_CHANGES_GFID,
+    };
+    int nr_indices = sizeof(index_gfids) / sizeof(index_gfids[0]);
+    int total = 0;
+    int rc = 0;
+    int idx = 0;
+
+    /*
+     * Sweeps run in the order listed above. The first negative result
+     * aborts the remaining sweeps and is returned as-is; otherwise the
+     * per-sweep return values are added up and the sum is returned.
+     */
+    for (idx = 0; idx < nr_indices; idx++) {
+        rc = afr_shd_index_sweep(healer, index_gfids[idx]);
+        if (rc < 0)
+            return rc;
+        total += rc;
+    }
+    return total;
+}
+/* Per-entry callback of afr_shd_full_sweep(): heal the name, then the gfid. */
+int
+afr_shd_full_heal(xlator_t *subvol, gf_dirent_t *entry, loc_t *parent,
+                  void *data)
+{
+    struct subvol_healer *healer = data;
+    xlator_t *this = healer->this;
+    afr_private_t *priv = this->private;
+
+    /* Give up once cleanup of this xlator has started ... */
+    if (this->cleanup_starting)
+        return -ENOTCONN;
+
+    /* ... and also while the self-heal daemon is disabled. */
+    if (!priv->shd.enabled)
+        return -EBUSY;
+
+    /* Neither heal result is inspected; 0 is returned regardless. */
+    afr_shd_selfheal_name(healer, healer->subvol, parent->inode->gfid,
+                          entry->d_name);
+
+    afr_shd_selfheal(healer, healer->subvol, entry->d_stat.ia_gfid);
+
+    return 0;
+}
+/* Run syncop_ftw() from 'inode' on the healer's child, healing each entry. */
+int
+afr_shd_full_sweep(struct subvol_healer *healer, inode_t *inode)
+{
+    afr_private_t *priv = healer->this->private;
+    xlator_t *child = priv->children[healer->subvol];
+    loc_t start = {0};
+
+    start.inode = inode; /* borrowed: no ref is taken, the loc is not wiped */
+    return syncop_ftw(child, &start, GF_CLIENT_PID_SELF_HEALD, healer,
+                      afr_shd_full_heal);
+}
+/* Build a loc for the thin-arbiter file; priv->ta_gfid caches its gfid. */
+int
+afr_shd_fill_ta_loc(xlator_t *this, loc_t *loc)
+{
+    afr_private_t *priv = NULL;
+    struct iatt stbuf = {
+        0,
+    };
+    int ret = -1;
+
+    priv = this->private;
+    loc->parent = inode_ref(this->itable->root);
+    gf_uuid_copy(loc->pargfid, loc->parent->gfid);
+    loc->name = priv->pending_key[THIN_ARBITER_BRICK_INDEX];
+    loc->inode = inode_new(loc->parent->table);
+    GF_CHECK_ALLOC(loc->inode, ret, out);
+    /* A cached gfid makes the lookup on the thin-arbiter child unnecessary. */
+    if (!gf_uuid_is_null(priv->ta_gfid))
+        goto assign_gfid;
+
+    ret = syncop_lookup(priv->children[THIN_ARBITER_BRICK_INDEX], loc, &stbuf,
+                        0, 0, 0);
+    if (ret) {
+        gf_msg(this->name, GF_LOG_ERROR, -ret, AFR_MSG_THIN_ARB,
+               "Failed lookup on file %s.", loc->name);
+        goto out;
+    }
+    /* Remember the gfid so that later calls can skip the lookup. */
+    gf_uuid_copy(priv->ta_gfid, stbuf.ia_gfid);
+
+assign_gfid:
+    gf_uuid_copy(loc->gfid, priv->ta_gfid);
+    ret = 0;
+
+out:
+    if (ret)
+        loc_wipe(loc); /* on failure the caller gets a wiped loc back */
+
+    return ret;
+}
+/* ADD_ARRAY xattrop of zeros for every pending key; reply goes to *xdata. */
+int
+_afr_shd_ta_get_xattrs(xlator_t *this, loc_t *loc, dict_t **xdata)
+{
+    afr_private_t *priv = NULL;
+    dict_t *xattr = NULL;
+    int raw[AFR_NUM_CHANGE_LOGS] = {
+        0,
+    };
+    int ret = -1;
+    int i = 0;
+
+    priv = this->private;
+
+    xattr = dict_new();
+    if (!xattr) {
+        gf_msg(this->name, GF_LOG_ERROR, ENOMEM, AFR_MSG_DICT_GET_FAILED,
+               "Failed to create dict.");
+        goto out;
+    }
+    for (i = 0; i < priv->child_count; i++) {
+        ret = dict_set_static_bin(xattr, priv->pending_key[i], &raw,
+                                  AFR_NUM_CHANGE_LOGS * sizeof(int));
+        if (ret)
+            goto out;
+    }
+    /* NOTE(review): ret == 0 with *xdata == NULL is logged, yet returns 0. */
+    ret = syncop_xattrop(priv->children[THIN_ARBITER_BRICK_INDEX], loc,
+                         GF_XATTROP_ADD_ARRAY, xattr, NULL, xdata, NULL);
+    if (ret || !(*xdata)) {
+        gf_msg(this->name, GF_LOG_ERROR, -ret, AFR_MSG_THIN_ARB,
+               "Xattrop failed on %s.", loc->name);
+    }
+
+out:
+    if (xattr)
+        dict_unref(xattr);
+
+    return ret;
+}
+/* Fill 'loc' for the thin-arbiter file and fetch its xattrs into *xdata. */
+void
+afr_shd_ta_get_xattrs(xlator_t *this, loc_t *loc, struct subvol_healer *healer,
+                      dict_t **xdata)
+{
+    afr_private_t *priv = this->private;
+    int ret = 0;
+
+    loc_wipe(loc);
+    ret = afr_shd_fill_ta_loc(this, loc);
+    if (ret) {
+        /* loc was wiped again on failure, so take the name from priv. */
+        gf_msg(this->name, GF_LOG_ERROR, -ret, AFR_MSG_THIN_ARB,
+               "Failed to populate thin-arbiter loc for: %s.",
+               priv->pending_key[THIN_ARBITER_BRICK_INDEX]);
+        goto out;
+    }
+
+    ret = afr_ta_post_op_lock(this, loc); /* released before 'out' below */
+    if (ret)
+        goto out;
+
+    ret = _afr_shd_ta_get_xattrs(this, loc, xdata);
+    if (ret && *xdata) { /* never hand back a dict from a failed xattrop */
+        dict_unref(*xdata);
+        *xdata = NULL;
+    }
+
+    afr_ta_post_op_unlock(this, loc);
+out:
+    if (ret) /* any failure asks the healer to run again */
+        healer->rerun = 1;
+}
+/* Undo the pending counts recorded in *xdata on the thin-arbiter file. */
+int
+afr_shd_ta_unset_xattrs(xlator_t *this, loc_t *loc, dict_t **xdata, int healer)
+{
+    afr_private_t *priv = NULL;
+    dict_t *xattr = NULL;
+    gf_boolean_t need_xattrop = _gf_false;
+    void *pending_raw = NULL;
+    int *raw = NULL;
+    int pending[AFR_NUM_CHANGE_LOGS] = {
+        0,
+    };
+    int i = 0;
+    int j = 0;
+    int val = 0;
+    int ret = -1;
+
+    priv = this->private;
+
+    xattr = dict_new();
+    if (!xattr) {
+        goto out;
+    }
+    /* Returns 0 when done, skipped or nothing to undo; non-zero on error. */
+    for (i = 0; i < priv->child_count; i++) {
+        raw = GF_CALLOC(AFR_NUM_CHANGE_LOGS, sizeof(int), gf_afr_mt_int32_t);
+        if (!raw) {
+            ret = -ENOMEM; /* ret may still be 0 from the previous pass */
+            goto out;
+        }
+        ret = dict_get_ptr(*xdata, priv->pending_key[i], &pending_raw);
+        if (ret) {
+            gf_msg(this->name, GF_LOG_ERROR, -ret, AFR_MSG_DICT_GET_FAILED,
+                   "Error getting value "
+                   "of pending key %s",
+                   priv->pending_key[i]);
+            GF_FREE(raw);
+            goto out;
+        }
+        /* Counts are converted with ntoh32/hton32; raw gets the negation. */
+        memcpy(pending, pending_raw, sizeof(pending));
+        for (j = 0; j < AFR_NUM_CHANGE_LOGS; j++) {
+            val = ntoh32(pending[j]);
+            if (val) {
+                if (i == healer) { /* non-zero count under this shd's own key */
+                    gf_msg(this->name, GF_LOG_INFO, 0, AFR_MSG_THIN_ARB,
+                           "I am "
+                           "not the good shd. Skipping. "
+                           "SHD = %d.",
+                           healer);
+                    ret = 0;
+                    GF_FREE(raw);
+                    goto out;
+                }
+                need_xattrop = _gf_true;
+                raw[j] = hton32(-val);
+            }
+        }
+        /* Presumably the dict owns 'raw' once dict_set_bin() succeeds. */
+        ret = dict_set_bin(xattr, priv->pending_key[i], raw,
+                           AFR_NUM_CHANGE_LOGS * sizeof(int));
+        if (ret) {
+            GF_FREE(raw);
+            goto out;
+        }
+
+        if (need_xattrop) /* stop at the first key that has counts to undo */
+            break;
+    }
+
+    if (!need_xattrop) {
+        ret = 0;
+        goto out;
+    }
+
+    ret = syncop_xattrop(priv->children[THIN_ARBITER_BRICK_INDEX], loc,
+                         GF_XATTROP_ADD_ARRAY, xattr, NULL, NULL, NULL);
+    if (ret)
+        gf_msg(this->name, GF_LOG_ERROR, -ret, AFR_MSG_THIN_ARB,
+               "Xattrop failed.");
+
+out:
+    if (xattr)
+        dict_unref(xattr);
+
+    return ret;
+}
+/* Unset thin-arbiter xattrs only if they still match the pre-crawl dict. */
+void
+afr_shd_ta_check_and_unset_xattrs(xlator_t *this, loc_t *loc,
+                                  struct subvol_healer *healer,
+                                  dict_t *pre_crawl_xdata)
+{
+    int ret_lock = 0;
+    int ret = 0;
+    dict_t *post_crawl_xdata = NULL;
+
+    ret_lock = afr_ta_post_op_lock(this, loc);
+    if (ret_lock)
+        goto unref;
+
+    ret = _afr_shd_ta_get_xattrs(this, loc, &post_crawl_xdata);
+    if (ret)
+        goto unref;
+    /* Xattrs that differ from the pre-crawl dict are left alone. */
+    if (!are_dicts_equal(pre_crawl_xdata, post_crawl_xdata, NULL, NULL)) {
+        ret = -1;
+        goto unref;
+    }
+
+    ret = afr_shd_ta_unset_xattrs(this, loc, &post_crawl_xdata, healer->subvol);
+
+unref:
+    if (post_crawl_xdata) {
+        dict_unref(post_crawl_xdata);
+        post_crawl_xdata = NULL;
+    }
+
+    if (ret || ret_lock) /* any failure asks the healer to run again */
+        healer->rerun = 1;
+    /* Only unlock when the lock above was actually taken. */
+    if (!ret_lock)
+        afr_ta_post_op_unlock(this, loc);
+}
+/* Report whether at least two children are currently counted as up. */
+gf_boolean_t
+afr_bricks_available_for_heal(afr_private_t *priv)
+{
+    int nr_up = __afr_get_up_children_count(priv);
+
+    /* NOTE(review): the __ prefix suggests the caller must hold a lock
+     * around this call -- confirm against the callers. */
+    if (nr_up >= 2)
+        return _gf_true;
+    return _gf_false;
+}
+/* Check the thin-arbiter file for a pending xattr against any child. */
+static gf_boolean_t
+afr_shd_ta_needs_heal(xlator_t *this, struct subvol_healer *healer)
+{
+    dict_t *xdata = NULL;
+    afr_private_t *priv = NULL;
+    loc_t loc = {
+        0,
+    };
+    int ret = -1;
+    int i = 0;
+    gf_boolean_t need_heal = _gf_false;
+
+    priv = this->private;
+    /* NOTE(review): fill_ta_loc wipes loc on failure, yet loc.name is logged. */
+    ret = afr_shd_fill_ta_loc(this, &loc);
+    if (ret) {
+        gf_msg(this->name, GF_LOG_ERROR, -ret, AFR_MSG_THIN_ARB,
+               "Failed to populate thin-arbiter loc for: %s.", loc.name);
+        healer->rerun = 1;
+        goto out;
+    }
+
+    if (_afr_shd_ta_get_xattrs(this, &loc, &xdata)) {
+        healer->rerun = 1; /* could not read the xattrs: ask for a rerun */
+        goto out;
+    }
+    /* NOTE(review): xdata may be NULL here if the xattrop returned 0. */
+    for (i = 0; i < priv->child_count; i++) {
+        if (afr_ta_dict_contains_pending_xattr(xdata, priv, i)) {
+            need_heal = _gf_true;
+            break;
+        }
+    }
+
+out:
+    if (xdata)
+        dict_unref(xdata);
+    loc_wipe(&loc);
+
+    return need_heal; /* stays _gf_false on every error path above */
+}
+
+static int
+afr_shd_anon_inode_cleaner(xlator_t *subvol, gf_dirent_t *entry, loc_t *parent,
+                           void *data)
+{
+    struct subvol_healer *healer = data;
+    afr_private_t *priv = healer->this->private;
+    call_frame_t *frame = NULL;
+    afr_local_t *local = NULL;
+    int ret = 0;
+    loc_t loc = {0};
+    int count = 0;
+    int i = 0;
+    int op_errno = 0;
+    struct iatt *iatt = NULL;
+    gf_boolean_t multiple_links = _gf_false;
+    unsigned char *gfid_present = alloca0(priv->child_count);
+    unsigned char *entry_present = alloca0(priv->child_count);
+    char *type = "file";
+
+    frame = afr_frame_create(healer->this, &ret);
+    if (!frame) {
+        ret = -ret;
+        goto out;
+    }
+    local = frame->local;
+    if (AFR_COUNT(local->child_up, priv->child_count) != priv->child_count) {
+        gf_msg_debug(healer->this->name, 0,
+                     "Not all bricks are up. Skipping "
+                     "cleanup of %s on %s",
+                     entry->d_name, subvol->name);
+        ret = 0;
+        goto out;
+    }
+
+    loc.inode = inode_new(parent->inode->table);
+    if (!loc.inode) {
+        ret = -ENOMEM;
+        goto out;
+    }
+    ret = gf_uuid_parse(entry->d_name, loc.gfid);
+    if (ret) {
+        ret = 0;
+        goto out;
+    }
+    AFR_ONLIST(local->child_up, frame, afr_selfheal_discover_cbk, lookup, &loc,
+               NULL);
+    for (i = 0; i < priv->child_count; i++) {
+        if (local->replies[i].op_ret == 0) {
+            count++;
+            gfid_present[i] = 1;
+            iatt = &local->replies[i].poststat;
+            if (iatt->ia_type == IA_IFDIR) {
+                type = "dir";
+            }
+
+            if (i == healer->subvol) {
+                if (local->replies[i].poststat.ia_nlink > 1) {
+                    multiple_links = _gf_true;
+                }
+            }
+        } else if (local->replies[i].op_errno != ENOENT &&
+                   local->replies[i].op_errno != ESTALE) {
+            /*We don't have complete view. Skip the entry*/
+            gf_msg_debug(healer->this->name, local->replies[i].op_errno,
+                         "Skipping cleanup of %s on %s", entry->d_name,
+                         subvol->name);
+            ret = 0;
+            goto out;
+        }
+    }
+
+    /*Inode is deleted from subvol*/
+    if (count == 1 || (iatt->ia_type != IA_IFDIR && multiple_links)) {
+        gf_msg(healer->this->name, GF_LOG_WARNING, 0,
+               AFR_MSG_EXPUNGING_FILE_OR_DIR, "expunging %s %s/%s on %s", type,
+               priv->anon_inode_name, entry->d_name, subvol->name);
+        ret = afr_shd_entry_purge(subvol, parent->inode, entry->d_name,
+                                  iatt->ia_type);
+        if (ret == -ENOENT || ret == -ESTALE)
+            ret = 0;
+    } else if (count > 1) {
+        loc_wipe(&loc);
+        loc.parent = inode_ref(parent->inode);
+        loc.name = entry->d_name;
+        loc.inode = inode_new(parent->inode->table);
+        if (!loc.inode) {
+            ret = -ENOMEM;
+            goto out;
+        }
+        AFR_ONLIST(local->child_up, frame, afr_selfheal_discover_cbk, lookup,
+                   &loc, NULL);
+        count = 0;
+        for (i = 0; i < priv->child_count; i++) {
+            if (local->replies[i].op_ret == 0) {
+                count++;
+                entry_present[i] = 1;
+                iatt = &local->replies[i].poststat;
+            } else if (local->replies[i].op_errno != ENOENT &&
+                       local->replies[i].op_errno != ESTALE) {
+                /*We don't have complete view. Skip the entry*/
+                gf_msg_debug(healer->this->name, local->replies[i].op_errno,
+                             "Skipping cleanup of %s on %s", entry->d_name,
+                             subvol->name);
+                ret = 0;
+                goto out;
+            }
+        }
+        for (i = 0; i < priv->child_count; i++) {
+            if (gfid_present[i] && !entry_present[i]) {
+                /*Entry is not anonymous on at least one subvol*/
+                gf_msg_debug(healer->this->name, 0,
+                             "Valid entry present on %s "
+                             "Skipping cleanup of %s on %s",
+                             priv->children[i]->name, entry->d_name,
+                             subvol->name);
+                ret = 0;
+                goto out;
+            }
+        }
+
+        gf_msg(healer->this->name, GF_LOG_WARNING, 0,
+               AFR_MSG_EXPUNGING_FILE_OR_DIR,
+               "expunging %s %s/%s on all subvols", type, priv->anon_inode_name,
+               entry->d_name);
+        ret = 0;
+        for (i = 0; i < priv->child_count; i++) {
+            op_errno = -afr_shd_entry_purge(priv->children[i], loc.parent,
+                                            entry->d_name, iatt->ia_type);
+            if (op_errno != ENOENT && op_errno != ESTALE) {
+                ret |= -op_errno;
+            }
+        }
+    }
+
+out:
+    if (frame)
+        AFR_STACK_DESTROY(frame);
+    loc_wipe(&loc);
+    return ret;
+}
+
+static void
+afr_cleanup_anon_inode_dir(struct subvol_healer *healer)
+{
+    xlator_t *xl = healer->this;
+    afr_private_t *conf = xl->private;
+    call_frame_t *scan_frame = NULL;
+    loc_t anon_loc = {0};
+    int status = 0;
+
+    /* Scan the directory whose inode afr_anon_inode_create() yields for
+     * this healer's subvolume, using afr_shd_anon_inode_cleaner as the scan
+     * callback. Nothing is returned, so any failure just ends the cleanup. */
+    status = afr_anon_inode_create(xl, healer->subvol, &anon_loc.inode);
+    if (status == 0)
+        scan_frame = afr_frame_create(xl, &status);
+
+    if (scan_frame) {
+        status = syncop_mt_dir_scan(
+            scan_frame, conf->children[healer->subvol], &anon_loc,
+            GF_CLIENT_PID_SELF_HEALD, healer, afr_shd_anon_inode_cleaner, NULL,
+            conf->shd.max_threads, conf->shd.wait_qlength);
+        AFR_STACK_DESTROY(scan_frame);
+    }
+
+    /* Runs on every path, including the early failures above. */
+    loc_wipe(&anon_loc);
+    return;
+}
+
+void *
+afr_shd_index_healer(void *data)
+{
+    struct subvol_healer *healer = NULL;
+    xlator_t *this = NULL;
+    int ret = 0;
+    afr_private_t *priv = NULL;
+    dict_t *pre_crawl_xdata = NULL;
+    loc_t loc = {
+        0,
+    };
+    /* Thread body of an index healer: waits, then sweeps the index, forever. */
+    healer = data;
+    THIS = this = healer->this;
+    priv = this->private;
+
+    for (;;) {
+        afr_shd_healer_wait(healer);
+
+        if (!afr_bricks_available_for_heal(priv))
+            continue;
+
+        ASSERT_LOCAL(this, healer);
+        priv->local[healer->subvol] = healer->local;
+        /* With a thin arbiter, fetch its xattrs before crawling if needed. */
+        if (priv->thin_arbiter_count) {
+            if (afr_shd_ta_needs_heal(this, healer))
+                afr_shd_ta_get_xattrs(this, &loc, healer, &pre_crawl_xdata);
+        }
+
+        do {
+            gf_msg_debug(this->name, 0, "starting index sweep on subvol %s",
+                         afr_subvol_name(this, healer->subvol));
+
+            afr_shd_sweep_prepare(healer);
+
+            ret = afr_shd_index_sweep_all(healer);
+
+            afr_shd_sweep_done(healer);
+            /*
+              As long as at least one gfid was
+              healed, keep retrying. We may have
+              just healed a directory and thereby
+              created entries for other gfids which
+              could not be healed thus far.
+            */
+
+            gf_msg_debug(this->name, 0, "finished index sweep on subvol %s",
+                         afr_subvol_name(this, healer->subvol));
+            /*
+              Give a pause before retrying to avoid a busy loop
+              in case the only entry in index is because of
+              an ongoing I/O.
+            */
+            sleep(1);
+        } while (ret > 0);
+        /* Only a final sweep result of 0 triggers the anon-inode cleanup. */
+        if (ret == 0) {
+            afr_cleanup_anon_inode_dir(healer);
+        }
+
+        if (ret == 0 && pre_crawl_xdata &&
+            !healer->crawl_event.heal_failed_count) {
+            afr_shd_ta_check_and_unset_xattrs(this, &loc, healer,
+                                              pre_crawl_xdata);
+        }
+        /* Release the thin-arbiter xattrs fetched for this round. */
+        if (pre_crawl_xdata) {
+            dict_unref(pre_crawl_xdata);
+            pre_crawl_xdata = NULL;
+        }
+    }
+
+    return NULL;
+}
+
+void *
+afr_shd_full_healer(void *data)
+{
+    struct subvol_healer *healer = NULL;
+    xlator_t *this = NULL;
+    int run = 0;
+    /* Thread body of a full healer; ends when __afr_shd_healer_wait gives 0. */
+    healer = data;
+    THIS = this = healer->this;
+
+    for (;;) {
+        pthread_mutex_lock(&healer->mutex);
+        {
+            run = __afr_shd_healer_wait(healer);
+            if (!run)
+                healer->running = _gf_false;
+        }
+        pthread_mutex_unlock(&healer->mutex);
+        /* running was cleared under the mutex before the thread leaves. */
+        if (!run)
+            break;
+
+        ASSERT_LOCAL(this, healer);
+
+        gf_msg(this->name, GF_LOG_INFO, 0, AFR_MSG_SELF_HEAL_INFO,
+               "starting full sweep on subvol %s",
+               afr_subvol_name(this, healer->subvol));
+
+        afr_shd_sweep_prepare(healer);
+        /* The sweep starts at the root inode of this xlator's inode table. */
+        afr_shd_full_sweep(healer, this->itable->root);
+
+        afr_shd_sweep_done(healer);
+
+        gf_msg(this->name, GF_LOG_INFO, 0, AFR_MSG_SELF_HEAL_INFO,
+               "finished full sweep on subvol %s",
+               afr_subvol_name(this, healer->subvol));
+    }
+
+    return NULL;
+}
+
+int
+afr_shd_healer_init(xlator_t *this, struct subvol_healer *healer)
+{
+    int err = 0;
+
+    /* Set up the mutex and the condition variable first; the first failing
+     * pthread call decides the return value and leaves the flags untouched. */
+    err = pthread_mutex_init(&healer->mutex, NULL);
+    if (err == 0)
+        err = pthread_cond_init(&healer->cond, NULL);
+
+    if (err == 0) {
+        healer->this = this;
+        healer->local = _gf_false;
+        healer->rerun = _gf_false;
+        healer->running = _gf_false;
+    }
+
+    return err;
+}
+
+int
+afr_shd_healer_spawn(xlator_t *this, struct subvol_healer *healer,
+                     void *(threadfn)(void *))
+{
+    int err = 0;
+
+    /* Wake the healer thread if one is running, otherwise start one with
+     * threadfn. A rerun is requested unless thread creation failed. */
+    pthread_mutex_lock(&healer->mutex);
+    if (healer->running) {
+        pthread_cond_signal(&healer->cond);
+    } else {
+        err = gf_thread_create(&healer->thread, NULL, threadfn, healer,
+                               "shdheal");
+        if (err == 0)
+            healer->running = 1;
+    }
+
+    if (err == 0)
+        healer->rerun = 1;
+    pthread_mutex_unlock(&healer->mutex);
+
+    /* 0 on success, otherwise whatever gf_thread_create() returned. */
+    return err;
+}
+
+int
+afr_shd_full_healer_spawn(xlator_t *this, int subvol)
+{
+    struct subvol_healer *full_healer = NTH_FULL_HEALER(this, subvol);
+    return afr_shd_healer_spawn(this, full_healer, afr_shd_full_healer);
+}
+
+int
+afr_shd_index_healer_spawn(xlator_t *this, int subvol)
+{
+    struct subvol_healer *index_healer = NTH_INDEX_HEALER(this, subvol);
+    return afr_shd_healer_spawn(this, index_healer, afr_shd_index_healer);
+}
+
+int
+afr_shd_dict_add_crawl_event(xlator_t *this, dict_t *output,
+                             crawl_event_t *crawl_event)
+{
+    int ret = 0;
+    uint64_t count = 0;
+    char key[128] = {0};
+    int keylen = 0;
+    char suffix[64] = {0};
+    int xl_id = 0;
+    uint64_t healed_count = 0;
+    uint64_t split_brain_count = 0;
+    uint64_t heal_failed_count = 0;
+    char *start_time_str = 0;
+    char *end_time_str = NULL;
+    char *crawl_type = NULL;
+    int progress = -1;
+    int child = -1;
+    /* Publish one crawl event into output under "statistics*" keys. */
+    child = crawl_event->child;
+    healed_count = crawl_event->healed_count;
+    split_brain_count = crawl_event->split_brain_count;
+    heal_failed_count = crawl_event->heal_failed_count;
+    crawl_type = crawl_event->crawl_type;
+    /* start_time == 0: nothing is reported and ret is still 0 here. */
+    if (!crawl_event->start_time)
+        goto out;
+    /* NOTE(review): ctime() uses a static buffer; confirm this is safe here. */
+    start_time_str = gf_strdup(ctime(&crawl_event->start_time));
+
+    if (crawl_event->end_time)
+        end_time_str = gf_strdup(ctime(&crawl_event->end_time));
+
+    ret = dict_get_int32(output, this->name, &xl_id);
+    if (ret) {
+        gf_msg(this->name, GF_LOG_ERROR, -ret, AFR_MSG_DICT_GET_FAILED,
+               "xl does not have id");
+        goto out;
+    }
+
+    snprintf(key, sizeof(key), "statistics-%d-%d-count", xl_id, child);
+    ret = dict_get_uint64(output, key, &count);
+    /* ^ result unused: ret is reassigned below; count was initialised to 0. */
+    snprintf(suffix, sizeof(suffix), "%d-%d-%" PRIu64, xl_id, child, count);
+    snprintf(key, sizeof(key), "statistics_healed_cnt-%s", suffix);
+    ret = dict_set_uint64(output, key, healed_count);
+    if (ret) {
+        gf_msg(this->name, GF_LOG_ERROR, -ret, AFR_MSG_DICT_SET_FAILED,
+               "Could not add statistics_healed_count to output");
+        goto out;
+    }
+
+    snprintf(key, sizeof(key), "statistics_sb_cnt-%s", suffix);
+    ret = dict_set_uint64(output, key, split_brain_count);
+    if (ret) {
+        gf_msg(this->name, GF_LOG_ERROR, -ret, AFR_MSG_DICT_SET_FAILED,
+               "Could not add statistics_split_brain_count to output");
+        goto out;
+    }
+
+    keylen = snprintf(key, sizeof(key), "statistics_crawl_type-%s", suffix);
+    ret = dict_set_strn(output, key, keylen, crawl_type);
+    if (ret) {
+        gf_msg(this->name, GF_LOG_ERROR, -ret, AFR_MSG_DICT_SET_FAILED,
+               "Could not add statistics_crawl_type to output");
+        goto out;
+    }
+
+    snprintf(key, sizeof(key), "statistics_heal_failed_cnt-%s", suffix);
+    ret = dict_set_uint64(output, key, heal_failed_count);
+    if (ret) {
+        gf_msg(this->name, GF_LOG_ERROR, -ret, AFR_MSG_DICT_SET_FAILED,
+               "Could not add statistics_healed_failed_count to output");
+        goto out;
+    }
+    /* After a successful set the pointer is cleared so out: won't free it. */
+    keylen = snprintf(key, sizeof(key), "statistics_strt_time-%s", suffix);
+    ret = dict_set_dynstrn(output, key, keylen, start_time_str);
+    if (ret) {
+        gf_msg(this->name, GF_LOG_ERROR, -ret, AFR_MSG_DICT_SET_FAILED,
+               "Could not add statistics_crawl_start_time to output");
+        goto out;
+    } else {
+        start_time_str = NULL;
+    }
+    /* No end time recorded: the crawl is reported as still in progress. */
+    if (!end_time_str)
+        progress = 1;
+    else
+        progress = 0;
+
+    keylen = snprintf(key, sizeof(key), "statistics_end_time-%s", suffix);
+    if (!end_time_str)
+        end_time_str = gf_strdup("Could not determine the end time");
+    ret = dict_set_dynstrn(output, key, keylen, end_time_str);
+    if (ret) {
+        gf_msg(this->name, GF_LOG_ERROR, -ret, AFR_MSG_DICT_SET_FAILED,
+               "Could not add statistics_crawl_end_time to output");
+        goto out;
+    } else {
+        end_time_str = NULL;
+    }
+
+    keylen = snprintf(key, sizeof(key), "statistics_inprogress-%s", suffix);
+
+    ret = dict_set_int32n(output, key, keylen, progress);
+    if (ret) {
+        gf_msg(this->name, GF_LOG_ERROR, -ret, AFR_MSG_DICT_SET_FAILED,
+               "Could not add statistics_inprogress to output");
+        goto out;
+    }
+    /* Bump the per-child counter that also forms the key suffix. */
+    snprintf(key, sizeof(key), "statistics-%d-%d-count", xl_id, child);
+    ret = dict_set_uint64(output, key, count + 1);
+    if (ret) {
+        gf_msg(this->name, GF_LOG_ERROR, -ret, AFR_MSG_DICT_SET_FAILED,
+               "Could not increment the counter.");
+        goto out;
+    }
+out:
+    GF_FREE(start_time_str);
+    GF_FREE(end_time_str);
+    return ret;
+}
+
+int
+afr_shd_dict_add_path(xlator_t *this, dict_t *output, int child, char *path,
+                      struct timeval *tv)
+{
+    int ret = -1;
+    uint64_t count = 0;
+    char key[64] = {0};
+    int keylen = 0;
+    char xl_id_child_str[32] = {0};
+    int xl_id = 0;
+    /* Store path (and tv's seconds, if given) under "<xl_id>-<child>-<n>". */
+    ret = dict_get_int32(output, this->name, &xl_id);
+    if (ret) {
+        gf_msg(this->name, GF_LOG_ERROR, -ret, AFR_MSG_DICT_GET_FAILED,
+               "xl does not have id");
+        goto out; /* NOTE(review): path is neither stored nor freed here */
+    }
+
+    snprintf(xl_id_child_str, sizeof(xl_id_child_str), "%d-%d", xl_id, child);
+    snprintf(key, sizeof(key), "%s-count", xl_id_child_str);
+    ret = dict_get_uint64(output, key, &count);
+    /* ^ result unused: ret is reassigned below; count was initialised to 0. */
+    keylen = snprintf(key, sizeof(key), "%s-%" PRIu64, xl_id_child_str, count);
+    ret = dict_set_dynstrn(output, key, keylen, path);
+
+    if (ret) {
+        gf_msg(this->name, GF_LOG_ERROR, -ret, AFR_MSG_DICT_SET_FAILED,
+               "%s: Could not add to output", path);
+        goto out;
+    }
+    /* tv is optional; when given, only tv_sec is stored under "-time". */
+    if (tv) {
+        snprintf(key, sizeof(key), "%s-%" PRIu64 "-time", xl_id_child_str,
+                 count);
+        ret = dict_set_uint32(output, key, tv->tv_sec);
+        if (ret) {
+            gf_msg(this->name, GF_LOG_ERROR, -ret, AFR_MSG_DICT_SET_FAILED,
+                   "%s: Could not set time", path);
+            goto out;
+        }
+    }
+    /* Advance the "<xl_id>-<child>-count" counter for the next path. */
+    snprintf(key, sizeof(key), "%s-count", xl_id_child_str);
+
+    ret = dict_set_uint64(output, key, count + 1);
+    if (ret) {
+        gf_msg(this->name, GF_LOG_ERROR, -ret, AFR_MSG_DICT_SET_FAILED,
+               "Could not increment count");
+        goto out;
+    }
+
+    ret = 0;
+out:
+    return ret;
+}
+
+int
+afr_add_shd_event(circular_buffer_t *cb, void *data)
+{
+    xlator_t *this = THIS;
+    afr_private_t *conf = this->private;
+    shd_event_t *event = cb->data;
+    dict_t *out_dict = data;
+    char *path_copy = NULL;
+
+    /* eh_dump() callback: adds a copy of one recorded path to the dict
+     * passed as data. Events of a child whose index healer is not flagged
+     * local are skipped and count as success. */
+    if (!conf->shd.index_healers[event->child].local)
+        return 0;
+
+    path_copy = gf_strdup(event->path);
+    if (path_copy == NULL)
+        return -ENOMEM;
+
+    /* The outcome of the insertion is not propagated to the caller. */
+    afr_shd_dict_add_path(this, out_dict, event->child, path_copy,
+                          &cb->tv);
+
+    return 0;
+}
+
+int
+afr_add_crawl_event(circular_buffer_t *cb, void *data)
+{
+    xlator_t *this = THIS;
+    afr_private_t *conf = this->private;
+    crawl_event_t *event = cb->data;
+    dict_t *out_dict = data;
+
+    /* eh_dump() callback: reports one stored crawl event into the dict
+     * passed as data.
+     *
+     * Events are skipped unless the index healer of the event's child is
+     * flagged local. */
+    if (conf->shd.index_healers[event->child].local) {
+        afr_shd_dict_add_crawl_event(this, out_dict, event);
+    }
+
+    /* Always 0: the result of the insertion is not propagated, and a
+     * skipped event counts as success. */
+    return 0;
+}
+
+int
+afr_selfheal_daemon_init(xlator_t *this)
+{
+    afr_private_t *priv = NULL;
+    afr_self_heald_t *shd = NULL;
+    int ret = -1;
+    int i = 0;
+
+    /* Allocate the per-child healers and event histories. ret stays -1 until
+     * the very end, so every failure path below returns an error. */
+    priv = this->private;
+    shd = &priv->shd;
+
+    shd->index_healers = GF_CALLOC(sizeof(*shd->index_healers),
+                                   priv->child_count,
+                                   gf_afr_mt_subvol_healer_t);
+    if (!shd->index_healers)
+        goto out;
+
+    for (i = 0; i < priv->child_count; i++) {
+        shd->index_healers[i].subvol = i;
+        if (afr_shd_healer_init(this, &shd->index_healers[i]))
+            goto out;
+    }
+
+    shd->full_healers = GF_CALLOC(sizeof(*shd->full_healers), priv->child_count,
+                                  gf_afr_mt_subvol_healer_t);
+    if (!shd->full_healers)
+        goto out;
+    for (i = 0; i < priv->child_count; i++) {
+        shd->full_healers[i].subvol = i;
+        if (afr_shd_healer_init(this, &shd->full_healers[i]))
+            goto out;
+    }
+
+    shd->split_brain = eh_new(AFR_EH_SPLIT_BRAIN_LIMIT, _gf_false,
+                              afr_destroy_shd_event_data);
+    if (!shd->split_brain)
+        goto out;
+
+    shd->statistics = GF_CALLOC(sizeof(eh_t *), priv->child_count,
+                                gf_common_mt_eh_t);
+    if (!shd->statistics)
+        goto out;
+
+    for (i = 0; i < priv->child_count; i++) {
+        shd->statistics[i] = eh_new(AFR_STATISTICS_HISTORY_SIZE, _gf_false,
+                                    afr_destroy_crawl_event_data);
+        if (!shd->statistics[i])
+            goto out;
+        shd->full_healers[i].crawl_event.child = i;
+        shd->full_healers[i].crawl_event.crawl_type = "FULL";
+        shd->index_healers[i].crawl_event.child = i;
+        shd->index_healers[i].crawl_event.crawl_type = "INDEX";
+    }
+
+    ret = 0;
+out:
+    return ret;
+}
+
+void
+afr_selfheal_childup(xlator_t *this, afr_private_t *priv)
+{
+    int idx = 0;
+
+    /* Does nothing unless priv->shd.iamshd is set; otherwise kicks the
+     * index healer of every child marked up in priv->child_up. */
+    if (priv->shd.iamshd) {
+        for (idx = 0; idx < priv->child_count; idx++)
+            if (priv->child_up[idx])
+                afr_shd_index_healer_spawn(this, idx);
+    }
+}
+
+int
+afr_shd_get_index_count(xlator_t *this, int i, uint64_t *count)
+{
+    afr_private_t *conf = this->private;
+    xlator_t *child_xl = conf->children[i];
+    dict_t *reply = NULL;
+    loc_t root = {
+        0,
+    };
+    int status = -1;
+
+    /* Ask child i for the GF_XATTROP_INDEX_COUNT xattr of the root inode
+     * and hand the value back through count.
+     *
+     * Returns the negative syncop_getxattr() result if that call fails,
+     * otherwise the result of the dict lookup (non-zero when the lookup
+     * fails, 0 when it succeeds). */
+    root.inode = inode_ref(this->itable->root);
+    gf_uuid_copy(root.gfid, root.inode->gfid);
+
+    status = syncop_getxattr(child_xl, &root, &reply, GF_XATTROP_INDEX_COUNT,
+                             NULL, NULL);
+    if (status >= 0) {
+        /* A non-negative getxattr result is not returned as such; the
+         * lookup result replaces it. */
+        status = dict_get_uint64(reply, GF_XATTROP_INDEX_COUNT, count);
+    }
+
+    /* Cleanup is common to the success and failure paths. */
+    if (reply)
+        dict_unref(reply);
+    loc_wipe(&root);
+
+    return status;
+}
+
+int
+afr_xl_op(xlator_t *this, dict_t *input, dict_t *output)
+{
+    gf_xl_afr_op_t op = GF_SHD_OP_INVALID;
+    int ret = 0;
+    int xl_id = 0;
+    afr_private_t *priv = NULL;
+    afr_self_heald_t *shd = NULL;
+    struct subvol_healer *healer = NULL;
+    int i = 0;
+    char key[64];
+    int keylen = 0;
+    int this_name_len = 0;
+    int op_ret = 0;
+    uint64_t cnt = 0;
+    /* Handle one "xl-op" request read from input; results go into output. */
+#define AFR_SET_DICT_AND_LOG(name, output, key, keylen, dict_str,              \
+                             dict_str_len)                                     \
+    {                                                                          \
+        int ret;                                                               \
+                                                                               \
+        ret = dict_set_nstrn(output, key, keylen, dict_str, dict_str_len);     \
+        if (ret) {                                                             \
+            gf_smsg(name, GF_LOG_ERROR, -ret, AFR_MSG_DICT_SET_FAILED,         \
+                    "key=%s", key, "value=%s", dict_str, NULL);                \
+        }                                                                      \
+    }
+    /* ^ Sets a string key; a failure is logged and otherwise ignored. */
+    priv = this->private;
+    shd = &priv->shd;
+
+    ret = dict_get_int32_sizen(input, "xl-op", (int32_t *)&op);
+    if (ret) {
+        gf_smsg(this->name, GF_LOG_ERROR, -ret, AFR_MSG_DICT_GET_FAILED,
+                "key=xl-op", NULL);
+        goto out;
+    }
+    this_name_len = strlen(this->name);
+    ret = dict_get_int32n(input, this->name, this_name_len, &xl_id);
+    if (ret) {
+        gf_smsg(this->name, GF_LOG_ERROR, -ret, AFR_MSG_DICT_GET_FAILED,
+                "key=%s", this->name, NULL);
+        goto out;
+    }
+    ret = dict_set_int32n(output, this->name, this_name_len, xl_id);
+    if (ret) {
+        gf_smsg(this->name, GF_LOG_ERROR, -ret, AFR_MSG_DICT_SET_FAILED,
+                "key=%s", this->name, NULL);
+        goto out;
+    }
+    switch (op) {
+        case GF_SHD_OP_HEAL_INDEX:
+            op_ret = 0;
+            /* op_ret becomes -1 if a child is down or fewer than two are up. */
+            for (i = 0; i < priv->child_count; i++) {
+                healer = &shd->index_healers[i];
+                keylen = snprintf(key, sizeof(key), "%d-%d-status", xl_id, i);
+
+                if (!priv->child_up[i]) {
+                    AFR_SET_DICT_AND_LOG(this->name, output, key, keylen,
+                                         SBRICK_NOT_CONNECTED,
+                                         SLEN(SBRICK_NOT_CONNECTED));
+                    op_ret = -1;
+                } else if (AFR_COUNT(priv->child_up, priv->child_count) < 2) {
+                    AFR_SET_DICT_AND_LOG(this->name, output, key, keylen,
+                                         SLESS_THAN2_BRICKS_in_REP,
+                                         SLEN(SLESS_THAN2_BRICKS_in_REP));
+                    op_ret = -1;
+                } else if (!afr_shd_is_subvol_local(this, healer->subvol)) {
+                    AFR_SET_DICT_AND_LOG(this->name, output, key, keylen,
+                                         SBRICK_IS_REMOTE,
+                                         SLEN(SBRICK_IS_REMOTE));
+                } else {
+                    AFR_SET_DICT_AND_LOG(this->name, output, key, keylen,
+                                         SSTARTED_SELF_HEAL,
+                                         SLEN(SSTARTED_SELF_HEAL));
+
+                    ret = afr_shd_index_healer_spawn(this, i);
+
+                    if (ret) {
+                        gf_smsg(this->name, GF_LOG_ERROR, -ret,
+                                AFR_MSG_HEALER_SPAWN_FAILED, NULL);
+                    }
+                }
+            }
+            break;
+        case GF_SHD_OP_HEAL_FULL:
+            op_ret = -1;
+            /* Starts at -1; reset to 0 once any child reaches the spawn branch. */
+            for (i = 0; i < priv->child_count; i++) {
+                healer = &shd->full_healers[i];
+                keylen = snprintf(key, sizeof(key), "%d-%d-status", xl_id, i);
+
+                if (!priv->child_up[i]) {
+                    AFR_SET_DICT_AND_LOG(this->name, output, key, keylen,
+                                         SBRICK_NOT_CONNECTED,
+                                         SLEN(SBRICK_NOT_CONNECTED));
+                } else if (AFR_COUNT(priv->child_up, priv->child_count) < 2) {
+                    AFR_SET_DICT_AND_LOG(this->name, output, key, keylen,
+                                         SLESS_THAN2_BRICKS_in_REP,
+                                         SLEN(SLESS_THAN2_BRICKS_in_REP));
+                } else if (!afr_shd_is_subvol_local(this, healer->subvol)) {
+                    AFR_SET_DICT_AND_LOG(this->name, output, key, keylen,
+                                         SBRICK_IS_REMOTE,
+                                         SLEN(SBRICK_IS_REMOTE));
+                } else {
+                    AFR_SET_DICT_AND_LOG(this->name, output, key, keylen,
+                                         SSTARTED_SELF_HEAL,
+                                         SLEN(SSTARTED_SELF_HEAL));
+
+                    ret = afr_shd_full_healer_spawn(this, i);
+
+                    if (ret) {
+                        gf_smsg(this->name, GF_LOG_ERROR, -ret,
+                                AFR_MSG_HEALER_SPAWN_FAILED, NULL);
+                    }
+                    op_ret = 0;
+                }
+            }
+            break;
+        case GF_SHD_OP_INDEX_SUMMARY:
+            /* this case has been handled in glfs-heal.c */
+            break;
+        case GF_SHD_OP_SPLIT_BRAIN_FILES:
+            eh_dump(shd->split_brain, output, afr_add_shd_event);
+            break;
+        case GF_SHD_OP_STATISTICS:
+            for (i = 0; i < priv->child_count; i++) {
+                eh_dump(shd->statistics[i], output, afr_add_crawl_event);
+                ret = afr_shd_dict_add_crawl_event(
+                    this, output, &shd->index_healers[i].crawl_event);
+                if (ret) {
+                    gf_smsg(this->name, GF_LOG_ERROR, -ret,
+                            AFR_MSG_ADD_CRAWL_EVENT_FAILED, NULL);
+                }
+
+                ret = afr_shd_dict_add_crawl_event(
+                    this, output, &shd->full_healers[i].crawl_event);
+                if (ret) {
+                    gf_smsg(this->name, GF_LOG_ERROR, -ret,
+                            AFR_MSG_ADD_CRAWL_EVENT_FAILED, NULL);
+                }
+            }
+            break;
+        case GF_SHD_OP_STATISTICS_HEAL_COUNT:
+        case GF_SHD_OP_STATISTICS_HEAL_COUNT_PER_REPLICA:
+            op_ret = -1;
+            /* Starts at -1; reset to 0 once any connected child was queried. */
+            for (i = 0; i < priv->child_count; i++) {
+                if (!priv->child_up[i]) {
+                    keylen = snprintf(key, sizeof(key), "%d-%d-status", xl_id,
+                                      i);
+                    AFR_SET_DICT_AND_LOG(this->name, output, key, keylen,
+                                         SBRICK_NOT_CONNECTED,
+                                         SLEN(SBRICK_NOT_CONNECTED));
+                } else {
+                    snprintf(key, sizeof(key), "%d-%d-hardlinks", xl_id, i);
+                    ret = afr_shd_get_index_count(this, i, &cnt);
+                    if (ret == 0) {
+                        ret = dict_set_uint64(output, key, cnt);
+                    }
+                    if (ret) {
+                        gf_smsg(this->name, GF_LOG_ERROR, -ret,
+                                AFR_MSG_DICT_SET_FAILED, NULL);
+                    }
+                    op_ret = 0;
+                }
+            }
+
+            break;
+
+        default:
+            gf_smsg(this->name, GF_LOG_ERROR, 0, AFR_MSG_INVALID_ARG, "op=%d",
+                    op, NULL);
+            break;
+    }
+out: /* NOTE(review): reached with op_ret == 0 when the lookups above fail */
+    dict_deln(output, this->name, this_name_len);
+    return op_ret;
+
+#undef AFR_SET_DICT_AND_LOG
+}
diff --git a/xlators/cluster/afr/src/afr-self-heald.h b/xlators/cluster/afr/src/afr-self-heald.h
new file mode 100644
index 00000000000..18db728ea7b
--- /dev/null
+++ b/xlators/cluster/afr/src/afr-self-heald.h
@@ -0,0 +1,75 @@
+/*
+  Copyright (c) 2013 Red Hat, Inc. <http://www.redhat.com>
+  This file is part of GlusterFS.
+
+  This file is licensed to you under your choice of the GNU Lesser
+  General Public License, version 3 or any later version (LGPLv3 or
+  later), or the GNU General Public License, version 2 (GPLv2), in all
+  cases as published by the Free Software Foundation.
+*/
+
+#ifndef _AFR_SELF_HEALD_H
+#define _AFR_SELF_HEALD_H
+
+#include <pthread.h>
+
+typedef struct {
+    char *path; /* path of the event; duplicated by afr_add_shd_event */
+    int child;  /* child index; used to index shd->index_healers */
+} shd_event_t;
+
+typedef struct {
+    uint64_t healed_count;      /* reported as statistics_healed_cnt-* */
+    uint64_t split_brain_count; /* reported as statistics_sb_cnt-* */
+    uint64_t heal_failed_count; /* reported as statistics_heal_failed_cnt-* */
+
+    /* If start_time is 0, it means crawler is not in progress
+       and stats are not valid */
+    time_t start_time;
+    /* If start_time is NOT 0 and end_time is 0, it means
+       crawler is in progress */
+    time_t end_time;
+    char *crawl_type; /* "FULL" or "INDEX", set by afr_selfheal_daemon_init */
+    int child;        /* child index; used to index shd->index_healers */
+} crawl_event_t;
+
+struct subvol_healer {
+    xlator_t *this;            /* set by afr_shd_healer_init */
+    crawl_event_t crawl_event; /* crawl statistics of this healer */
+    pthread_mutex_t mutex;     /* taken around running/rerun updates */
+    pthread_cond_t cond;       /* signalled when a running healer is re-kicked */
+    pthread_t thread;          /* created by afr_shd_healer_spawn */
+    int subvol;                /* index into priv->children */
+    gf_boolean_t local;        /* events are reported only when this is set */
+    gf_boolean_t running;      /* set once the healer thread is created */
+    gf_boolean_t rerun;        /* set on every successful spawn request */
+};
+
+typedef struct {
+    struct subvol_healer *index_healers; /* child_count entries */
+    struct subvol_healer *full_healers;  /* child_count entries */
+
+    eh_t *split_brain; /* dumped for GF_SHD_OP_SPLIT_BRAIN_FILES */
+    eh_t **statistics; /* one history per child, for GF_SHD_OP_STATISTICS */
+    int timeout;
+    uint32_t max_threads;  /* passed to syncop_mt_dir_scan */
+    uint32_t wait_qlength; /* passed to syncop_mt_dir_scan */
+    uint32_t halo_max_latency_msec;
+    gf_boolean_t iamshd; /* afr_selfheal_childup does nothing unless set */
+    gf_boolean_t enabled;
+} afr_self_heald_t;
+
+int
+afr_selfheal_daemon_init(xlator_t *this);
+
+int
+afr_xl_op(xlator_t *this, dict_t *input, dict_t *output);
+
+int
+afr_shd_gfid_to_path(xlator_t *this, xlator_t *subvol, uuid_t gfid,
+                     char **path_p);
+
+int
+afr_shd_entry_purge(xlator_t *subvol, inode_t *inode, char *name,
+                    ia_type_t type);
+#endif /* !_AFR_SELF_HEALD_H */
diff --git a/xlators/cluster/afr/src/afr-transaction.c b/xlators/cluster/afr/src/afr-transaction.c
index 82c2ee3406f..a51f79b1f43 100644
--- a/xlators/cluster/afr/src/afr-transaction.c
+++ b/xlators/cluster/afr/src/afr-transaction.c
@@ -1,995 +1,2927 @@
 /*
-  Copyright (c) 2007-2009 Z RESEARCH, Inc. <http://www.zresearch.com>
+  Copyright (c) 2008-2012 Red Hat, Inc. <http://www.redhat.com>
   This file is part of GlusterFS.
 
-  GlusterFS is free software; you can redistribute it and/or modify
-  it under the terms of the GNU General Public License as published
-  by the Free Software Foundation; either version 3 of the License,
-  or (at your option) any later version.
-
-  GlusterFS is distributed in the hope that it will be useful, but
-  WITHOUT ANY WARRANTY; without even the implied warranty of
-  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
-  General Public License for more details.
-
-  You should have received a copy of the GNU General Public License
-  along with this program.  If not, see
-  <http://www.gnu.org/licenses/>.
+  This file is licensed to you under your choice of the GNU Lesser
+  General Public License, version 3 or any later version (LGPLv3 or
+  later), or the GNU General Public License, version 2 (GPLv2), in all
+  cases as published by the Free Software Foundation.
 */
 
-#include "dict.h"
-#include "byte-order.h"
+#include <glusterfs/dict.h>
+#include <glusterfs/byte-order.h>
+#include <glusterfs/common-utils.h>
+#include <glusterfs/timer.h>
 
 #include "afr.h"
 #include "afr-transaction.h"
+#include "afr-self-heal.h"
+#include "afr-messages.h"
 
 #include <signal.h>
 
+typedef enum {
+    AFR_TRANSACTION_PRE_OP,
+    AFR_TRANSACTION_POST_OP,
+} afr_xattrop_type_t;
 
 static void
-__mark_all_pending (int32_t *pending, int child_count)
-{	
-	int i;
-	
-	for (i = 0; i < child_count; i++)
-		pending[i] = hton32 (1);
-}
+afr_lock_resume_shared(struct list_head *list);
+
+static void
+afr_post_op_handle_success(call_frame_t *frame, xlator_t *this);
+
+static void
+afr_post_op_handle_failure(call_frame_t *frame, xlator_t *this, int op_errno);
+
+void
+__afr_transaction_wake_shared(afr_local_t *local, struct list_head *shared);
+
+void
+afr_changelog_post_op_do(call_frame_t *frame, xlator_t *this);
+
+int
+afr_changelog_post_op_safe(call_frame_t *frame, xlator_t *this);
+
+gf_boolean_t
+afr_changelog_pre_op_uninherit(call_frame_t *frame, xlator_t *this);
+
+gf_boolean_t
+afr_changelog_pre_op_update(call_frame_t *frame, xlator_t *this);
+
+int
+afr_changelog_call_count(afr_transaction_type type,
+                         unsigned char *pre_op_subvols,
+                         unsigned char *failed_subvols,
+                         unsigned int child_count);
+int
+afr_changelog_do(call_frame_t *frame, xlator_t *this, dict_t *xattr,
+                 afr_changelog_resume_t changelog_resume,
+                 afr_xattrop_type_t op);
+
+static void
+afr_ta_decide_post_op_state(call_frame_t *frame, xlator_t *this);
+
+static int
+afr_ta_post_op_do(void *opaque);
+
+static int
+afr_ta_post_op_synctask(xlator_t *this, afr_local_t *local);
+
+static int
+afr_changelog_post_op_done(call_frame_t *frame, xlator_t *this);
 
+static void
+afr_changelog_post_op_fail(call_frame_t *frame, xlator_t *this, int op_errno);
+
+void
+afr_ta_locked_priv_invalidate(afr_private_t *priv)
+{
+    priv->ta_bad_child_index = AFR_CHILD_UNKNOWN;
+    priv->release_ta_notify_dom_lock = _gf_false;
+    priv->ta_notify_dom_lock_offset = 0;
+}
 
 static void
-__mark_child_dead (int32_t *pending, int child_count, int child)
+afr_ta_process_waitq(xlator_t *this)
 {
-	pending[child] = 0;
+    afr_local_t *entry = NULL;
+    afr_private_t *priv = this->private;
+    struct list_head waitq = {
+        0,
+    };
+
+    INIT_LIST_HEAD(&waitq);
+    LOCK(&priv->lock);
+    list_splice_init(&priv->ta_waitq, &waitq);
+    UNLOCK(&priv->lock);
+    list_for_each_entry(entry, &waitq, ta_waitq)
+    {
+        afr_ta_decide_post_op_state(entry->transaction.frame, this);
+    }
 }
 
+int
+afr_ta_lock_release_done(int ret, call_frame_t *ta_frame, void *opaque)
+{
+    afr_ta_process_waitq(ta_frame->this);
+    STACK_DESTROY(ta_frame->root);
+    return 0;
+}
+
+int
+afr_release_notify_lock_for_ta(void *opaque)
+{
+    xlator_t *this = NULL;
+    afr_private_t *priv = NULL;
+    loc_t loc = {
+        0,
+    };
+    struct gf_flock flock = {
+        0,
+    };
+    int ret = -1;
+
+    this = (xlator_t *)opaque;
+    priv = this->private;
+    ret = afr_fill_ta_loc(this, &loc, _gf_true);
+    if (ret) {
+        gf_msg(this->name, GF_LOG_ERROR, -ret, AFR_MSG_THIN_ARB,
+               "Failed to populate loc for thin-arbiter.");
+        goto out;
+    }
+    flock.l_type = F_UNLCK;
+    flock.l_start = priv->ta_notify_dom_lock_offset;
+    flock.l_len = 1;
+    ret = syncop_inodelk(priv->children[THIN_ARBITER_BRICK_INDEX],
+                         AFR_TA_DOM_NOTIFY, &loc, F_SETLK, &flock, NULL, NULL);
+    if (ret) {
+        gf_msg(this->name, GF_LOG_ERROR, -ret, AFR_MSG_THIN_ARB,
+               "Failed to unlock AFR_TA_DOM_NOTIFY lock.");
+    }
+
+    LOCK(&priv->lock);
+    {
+        afr_ta_locked_priv_invalidate(priv);
+    }
+    UNLOCK(&priv->lock);
+out:
+    loc_wipe(&loc);
+    return ret;
+}
+
+void
+afr_zero_fill_stat(afr_local_t *local)
+{
+    if (!local)
+        return;
+    if (local->transaction.type == AFR_DATA_TRANSACTION ||
+        local->transaction.type == AFR_METADATA_TRANSACTION) {
+        gf_zero_fill_stat(&local->cont.inode_wfop.prebuf);
+        gf_zero_fill_stat(&local->cont.inode_wfop.postbuf);
+    } else if (local->transaction.type == AFR_ENTRY_TRANSACTION ||
+               local->transaction.type == AFR_ENTRY_RENAME_TRANSACTION) {
+        gf_zero_fill_stat(&local->cont.dir_fop.buf);
+        gf_zero_fill_stat(&local->cont.dir_fop.preparent);
+        gf_zero_fill_stat(&local->cont.dir_fop.postparent);
+        if (local->transaction.type == AFR_ENTRY_TRANSACTION)
+            return;
+        gf_zero_fill_stat(&local->cont.dir_fop.prenewparent);
+        gf_zero_fill_stat(&local->cont.dir_fop.postnewparent);
+    }
+}
+
+/* In case of errors afr needs to choose which xdata from lower xlators it needs
+ * to unwind with. Failed replies whose errno is not ENOTCONN are considered
+ * first, preferring one from a readable (good) subvol; if there is none, the
+ * first failed reply that carries xdata is used, even an ENOTCONN one. */
+void
+afr_pick_error_xdata(afr_local_t *local, afr_private_t *priv, inode_t *inode1,
+                     unsigned char *readable1, inode_t *inode2,
+                     unsigned char *readable2)
+{
+    int s = -1; /*selection*/
+    int i = 0;
+    unsigned char *readable = NULL;
+
+    if (local->xdata_rsp) {
+        dict_unref(local->xdata_rsp);
+        local->xdata_rsp = NULL;
+    }
+
+    readable = alloca0(priv->child_count * sizeof(*readable));
+    if (inode2 && readable2) { /*rename fop*/
+        AFR_INTERSECT(readable, readable1, readable2, priv->child_count);
+    } else {
+        memcpy(readable, readable1, sizeof(*readable) * priv->child_count);
+    }
+
+    for (i = 0; i < priv->child_count; i++) {
+        if (!local->replies[i].valid)
+            continue;
+
+        if (local->replies[i].op_ret >= 0)
+            continue;
+
+        if (local->replies[i].op_errno == ENOTCONN)
+            continue;
+
+        /*Order is important in the following condition*/
+        if ((s < 0) || (!readable[s] && readable[i]))
+            s = i;
+    }
+
+    if (s != -1 && local->replies[s].xdata) {
+        local->xdata_rsp = dict_ref(local->replies[s].xdata);
+    } else if (s == -1) {
+        for (i = 0; i < priv->child_count; i++) {
+            if (!local->replies[i].valid)
+                continue;
+
+            if (local->replies[i].op_ret >= 0)
+                continue;
+
+            if (!local->replies[i].xdata)
+                continue;
+            local->xdata_rsp = dict_ref(local->replies[i].xdata);
+            break;
+        }
+    }
+}
+
+gf_boolean_t
+afr_needs_changelog_update(afr_local_t *local)
+{
+    if (local->transaction.type == AFR_DATA_TRANSACTION)
+        return _gf_true;
+    if (!local->optimistic_change_log)
+        return _gf_true;
+    return _gf_false;
+}
+
+gf_boolean_t
+afr_changelog_has_quorum(afr_local_t *local, xlator_t *this)
+{
+    afr_private_t *priv = NULL;
+    int i = 0;
+    unsigned char *success_children = NULL;
+
+    priv = this->private;
+    success_children = alloca0(priv->child_count);
+
+    for (i = 0; i < priv->child_count; i++) {
+        if (!local->transaction.failed_subvols[i]) {
+            success_children[i] = 1;
+        }
+    }
+
+    if (afr_has_quorum(success_children, this, NULL)) {
+        return _gf_true;
+    }
+
+    return _gf_false;
+}
+
+gf_boolean_t
+afr_is_write_subvol_valid(call_frame_t *frame, xlator_t *this)
+{
+    int i = 0;
+    afr_local_t *local = NULL;
+    afr_private_t *priv = NULL;
+    uint64_t write_subvol = 0;
+    unsigned char *writable = NULL;
+    uint16_t datamap = 0;
+
+    local = frame->local;
+    priv = this->private;
+    writable = alloca0(priv->child_count);
+
+    write_subvol = afr_write_subvol_get(frame, this);
+    datamap = (write_subvol & 0x00000000ffff0000) >> 16;
+    for (i = 0; i < priv->child_count; i++) {
+        if (datamap & (1 << i))
+            writable[i] = 1;
+
+        if (writable[i] && !local->transaction.failed_subvols[i])
+            return _gf_true;
+    }
+
+    return _gf_false;
+}
+
+int
+afr_transaction_fop(call_frame_t *frame, xlator_t *this)
+{
+    afr_local_t *local = NULL;
+    afr_private_t *priv = NULL;
+    int call_count = -1;
+    unsigned char *failed_subvols = NULL;
+    int i = 0;
+
+    local = frame->local;
+    priv = this->private;
+
+    failed_subvols = local->transaction.failed_subvols;
+    call_count = priv->child_count -
+                 AFR_COUNT(failed_subvols, priv->child_count);
+    /* Fail if pre-op did not succeed on quorum no. of bricks. */
+    if (!afr_changelog_has_quorum(local, this) || !call_count) {
+        local->op_ret = -1;
+        /* local->op_errno is already captured in changelog cbk. */
+        afr_transaction_resume(frame, this);
+        return 0;
+    }
+
+    /* Fail when none of the writable bricks is still usable. */
+    if (local->transaction.type == AFR_DATA_TRANSACTION &&
+        !afr_is_write_subvol_valid(frame, this)) {
+        local->op_ret = -1;
+        local->op_errno = EIO;
+        afr_transaction_resume(frame, this);
+        return 0;
+    }
+
+    local->call_count = call_count;
+    for (i = 0; i < priv->child_count; i++) {
+        if (local->transaction.pre_op[i] && !failed_subvols[i]) {
+            local->transaction.wind(frame, this, i);
+
+            if (!--call_count)
+                break;
+        }
+    }
+
+    return 0;
+}
+
+int
+afr_transaction_done(call_frame_t *frame, xlator_t *this)
+{
+    afr_local_t *local = NULL;
+    afr_private_t *priv = NULL;
+    gf_boolean_t unwind = _gf_false;
+    afr_lock_t *lock = NULL;
+    afr_local_t *lock_local = NULL;
+
+    priv = this->private;
+    local = frame->local;
+
+    if (priv->consistent_metadata) {
+        LOCK(&frame->lock);
+        {
+            unwind = (local->transaction.main_frame != NULL);
+        }
+        UNLOCK(&frame->lock);
+        if (unwind) /*It definitely did post-op*/
+            afr_zero_fill_stat(local);
+    }
+
+    if (local->transaction.do_eager_unlock) {
+        lock = &local->inode_ctx->lock[local->transaction.type];
+        LOCK(&local->inode->lock);
+        {
+            lock->acquired = _gf_false;
+            lock->release = _gf_false;
+            list_splice_init(&lock->frozen, &lock->waiting);
+            if (list_empty(&lock->waiting))
+                goto unlock;
+            lock_local = list_entry(lock->waiting.next, afr_local_t,
+                                    transaction.wait_list);
+            list_del_init(&lock_local->transaction.wait_list);
+            list_add(&lock_local->transaction.owner_list, &lock->owners);
+        }
+    unlock:
+        UNLOCK(&local->inode->lock);
+    }
+    if (lock_local) {
+        afr_lock(lock_local->transaction.frame,
+                 lock_local->transaction.frame->this);
+    }
+    local->transaction.unwind(frame, this);
+
+    GF_ASSERT(list_empty(&local->transaction.owner_list));
+    GF_ASSERT(list_empty(&local->transaction.wait_list));
+    AFR_STACK_DESTROY(frame);
+
+    return 0;
+}
+
+static void
+afr_lock_fail_shared(afr_local_t *local, struct list_head *list)
+{
+    afr_local_t *each = NULL;
+
+    while (!list_empty(list)) {
+        each = list_entry(list->next, afr_local_t, transaction.wait_list);
+        list_del_init(&each->transaction.wait_list);
+        each->op_ret = -1;
+        each->op_errno = local->op_errno;
+        afr_transaction_done(each->transaction.frame,
+                             each->transaction.frame->this);
+    }
+}
 
 static void
-__mark_down_children (int32_t *pending, int child_count, unsigned char *child_up)
+afr_handle_lock_acquire_failure(afr_local_t *local)
 {
-	int i;
-	
-	for (i = 0; i < child_count; i++)
-		if (!child_up[i])
-			pending[i] = 0;
+    struct list_head shared;
+    afr_lock_t *lock = NULL;
+
+    if (!local->transaction.eager_lock_on)
+        goto out;
+
+    lock = &local->inode_ctx->lock[local->transaction.type];
+
+    INIT_LIST_HEAD(&shared);
+    LOCK(&local->inode->lock);
+    {
+        lock->release = _gf_true;
+        list_splice_init(&lock->waiting, &shared);
+    }
+    UNLOCK(&local->inode->lock);
+
+    afr_lock_fail_shared(local, &shared);
+    local->transaction.do_eager_unlock = _gf_true;
+out:
+    local->internal_lock.lock_cbk = afr_transaction_done;
+    afr_unlock(local->transaction.frame, local->transaction.frame->this);
+}
+
+call_frame_t *
+afr_transaction_detach_fop_frame(call_frame_t *frame)
+{
+    afr_local_t *local = NULL;
+    call_frame_t *fop_frame = NULL;
+
+    local = frame->local;
+
+    afr_handle_inconsistent_fop(frame, &local->op_ret, &local->op_errno);
+    LOCK(&frame->lock);
+    {
+        fop_frame = local->transaction.main_frame;
+        local->transaction.main_frame = NULL;
+    }
+    UNLOCK(&frame->lock);
+
+    return fop_frame;
 }
 
+static void
+afr_save_lk_owner(call_frame_t *frame)
+{
+    afr_local_t *local = NULL;
+
+    local = frame->local;
+
+    local->saved_lk_owner = frame->root->lk_owner;
+}
 
 static void
-__mark_all_success (int32_t *pending, int child_count)
+afr_restore_lk_owner(call_frame_t *frame)
 {
-	int i;
-	
-	for (i = 0; i < child_count; i++)
-		pending[i] = hton32 (-1);
+    afr_local_t *local = NULL;
+
+    local = frame->local;
+
+    frame->root->lk_owner = local->saved_lk_owner;
 }
 
+void
+__mark_all_success(call_frame_t *frame, xlator_t *this)
+{
+    afr_private_t *priv = NULL;
+    afr_local_t *local = NULL;
+    int i;
+
+    local = frame->local;
+    priv = this->private;
 
-static int
-__is_first_write_on_fd (xlator_t *this, fd_t *fd)
+    for (i = 0; i < priv->child_count; i++) {
+        local->transaction.failed_subvols[i] = 0;
+    }
+}
+
+void
+afr_compute_pre_op_sources(call_frame_t *frame, xlator_t *this)
 {
-        int op_ret     = 0;
-        int _ret       = -1;
+    afr_local_t *local = NULL;
+    afr_private_t *priv = NULL;
+    afr_transaction_type type = -1;
+    dict_t *xdata = NULL;
+    int **matrix = NULL;
+    int idx = -1;
+    int i = 0;
+    int j = 0;
+
+    priv = this->private;
+    local = frame->local;
+    type = local->transaction.type;
+    idx = afr_index_for_transaction_type(type);
+    matrix = ALLOC_MATRIX(priv->child_count, int);
+
+    for (i = 0; i < priv->child_count; i++) {
+        if (!local->transaction.changelog_xdata[i])
+            continue;
+        xdata = local->transaction.changelog_xdata[i];
+        afr_selfheal_fill_matrix(this, matrix, i, idx, xdata);
+    }
+
+    memset(local->transaction.pre_op_sources, 1, priv->child_count);
+
+    /*If lock or pre-op failed on a brick, it is not a source. */
+    for (i = 0; i < priv->child_count; i++) {
+        if (local->transaction.failed_subvols[i])
+            local->transaction.pre_op_sources[i] = 0;
+    }
+
+    /* If brick is blamed by others, it is not a source. */
+    for (i = 0; i < priv->child_count; i++)
+        for (j = 0; j < priv->child_count; j++)
+            if (matrix[i][j] != 0)
+                local->transaction.pre_op_sources[j] = 0;
+}
+
+void
+afr_txn_arbitrate_fop(call_frame_t *frame, xlator_t *this)
+{
+    afr_local_t *local = NULL;
+    afr_private_t *priv = NULL;
+    int pre_op_sources_count = 0;
+    int i = 0;
+
+    priv = this->private;
+    local = frame->local;
+
+    afr_compute_pre_op_sources(frame, this);
+    pre_op_sources_count = AFR_COUNT(local->transaction.pre_op_sources,
+                                     priv->child_count);
+
+    /* If arbiter is the only source, do not proceed. */
+    if (pre_op_sources_count < 2 &&
+        local->transaction.pre_op_sources[ARBITER_BRICK_INDEX]) {
+        local->op_ret = -1;
+        local->op_errno = ENOTCONN;
+        for (i = 0; i < priv->child_count; i++)
+            local->transaction.failed_subvols[i] = 1;
+    }
+
+    afr_transaction_fop(frame, this);
+
+    return;
+}
 
-        LOCK (&fd->inode->lock);
+int
+afr_transaction_perform_fop(call_frame_t *frame, xlator_t *this)
+{
+    afr_local_t *local = NULL;
+    afr_private_t *priv = NULL;
+    int i = 0;
+    int ret = 0;
+    int failure_count = 0;
+    struct list_head shared;
+    afr_lock_t *lock = NULL;
+
+    local = frame->local;
+    priv = this->private;
+
+    INIT_LIST_HEAD(&shared);
+    if (local->transaction.type == AFR_DATA_TRANSACTION &&
+        !local->transaction.inherited) {
+        ret = afr_write_subvol_set(frame, this);
+        if (ret) {
+            /*act as if operation failed on all subvols*/
+            local->op_ret = -1;
+            local->op_errno = -ret;
+            for (i = 0; i < priv->child_count; i++)
+                local->transaction.failed_subvols[i] = 1;
+        }
+    }
+
+    if (local->pre_op_compat)
+        /* old mode, pre-op was done as afr_changelog_do()
+           just now, before OP */
+        afr_changelog_pre_op_update(frame, this);
+
+    if (!local->transaction.eager_lock_on || local->transaction.inherited)
+        goto fop;
+    failure_count = AFR_COUNT(local->transaction.failed_subvols,
+                              priv->child_count);
+    if (failure_count == priv->child_count) {
+        afr_handle_lock_acquire_failure(local);
+        return 0;
+    } else {
+        lock = &local->inode_ctx->lock[local->transaction.type];
+        LOCK(&local->inode->lock);
         {
-                _ret = fd_ctx_get (fd, this, NULL);
-                if (_ret < 0) {
-                        gf_log (this->name, GF_LOG_DEBUG,
-                                "first writev() on fd=%p, writing changelog",
-                                fd);
-
-                        _ret = fd_ctx_set (fd, this, 0xaf1);
-                        op_ret = 1;
-                }
+            lock->acquired = _gf_true;
+            __afr_transaction_wake_shared(local, &shared);
         }
-        UNLOCK (&fd->inode->lock);
+        UNLOCK(&local->inode->lock);
+    }
+
+fop:
+    /*  Perform fops with the lk-owner from top xlator.
+     *  E.g. the lk-owner of posix-lk and flush should be the same;
+     *  flush can't clear the posix-lks without that lk-owner.
+     */
+    afr_save_lk_owner(frame);
+    frame->root->lk_owner = local->transaction.main_frame->root->lk_owner;
+
+    if (priv->arbiter_count == 1) {
+        afr_txn_arbitrate_fop(frame, this);
+    } else {
+        afr_transaction_fop(frame, this);
+    }
+
+    afr_lock_resume_shared(&shared);
+    return 0;
+}
 
-        return op_ret;
+int
+afr_set_pending_dict(afr_private_t *priv, dict_t *xattr, int **pending)
+{
+    int i = 0;
+    int ret = 0;
+
+    for (i = 0; i < priv->child_count; i++) {
+        ret = dict_set_static_bin(xattr, priv->pending_key[i], pending[i],
+                                  AFR_NUM_CHANGE_LOGS * sizeof(int));
+        /* 3 = data+metadata+entry */
+
+        if (ret)
+            break;
+    }
+
+    return ret;
 }
 
+static void
+afr_ta_dom_lock_check_and_release(afr_ta_fop_state_t fop_state, xlator_t *this)
+{
+    afr_private_t *priv = this->private;
+    unsigned int inmem_count = 0;
+    unsigned int onwire_count = 0;
+    gf_boolean_t release = _gf_false;
+
+    LOCK(&priv->lock);
+    {
+        /*Once we get notify lock release upcall notification,
+         if any of the fop state counters are non-zero, we will
+         not release the lock.
+         */
+        onwire_count = priv->ta_on_wire_txn_count;
+        inmem_count = priv->ta_in_mem_txn_count;
+        switch (fop_state) {
+            case TA_GET_INFO_FROM_TA_FILE:
+                onwire_count = --priv->ta_on_wire_txn_count;
+                break;
+            case TA_INFO_IN_MEMORY_SUCCESS:
+            case TA_INFO_IN_MEMORY_FAILED:
+                inmem_count = --priv->ta_in_mem_txn_count;
+                break;
+            case TA_WAIT_FOR_NOTIFY_LOCK_REL:
+                GF_ASSERT(0);
+                break;
+            case TA_SUCCESS:
+                break;
+        }
+        release = priv->release_ta_notify_dom_lock;
+    }
+    UNLOCK(&priv->lock);
+
+    if (inmem_count != 0 || release == _gf_false || onwire_count != 0)
+        return;
 
-static int
-__unset_fd_ctx_if_set (xlator_t *this, fd_t *fd)
+    afr_ta_lock_release_synctask(this);
+}
+
+/* Resume the fops parked on priv->ta_onwireq. With no known bad child, one
+ * queued fop (if any) is wound on-wire again; otherwise every queued fop is
+ * passed or failed (EIO) by comparing its ta_failed_subvol to the bad child. */
+static void
+afr_ta_process_onwireq(afr_ta_fop_state_t fop_state, xlator_t *this)
 {
-        int op_ret = 0;
-        int _ret   = -1;
+    afr_private_t *priv = this->private;
+    afr_local_t *entry = NULL;
+    int bad_child = AFR_CHILD_UNKNOWN;
+    struct list_head onwireq;
+
+    INIT_LIST_HEAD(&onwireq);
+    LOCK(&priv->lock);
+    {
+        bad_child = priv->ta_bad_child_index;
+        if (bad_child != AFR_CHILD_UNKNOWN) {
+            /* Prepare to process all fops based on bad_child_index. */
+            list_splice_init(&priv->ta_onwireq, &onwireq);
+        } else if (!list_empty(&priv->ta_onwireq)) {
+            /*The previous on-wire ta_post_op was a failure. Just dequeue
+             *one element to wind on-wire again. */
+            entry = list_entry(priv->ta_onwireq.next, afr_local_t, ta_onwireq);
+            list_del_init(&entry->ta_onwireq);
+        }
+    }
+    UNLOCK(&priv->lock);
+
+    if (entry) {
+        afr_ta_post_op_synctask(this, entry);
+        return;
+    }
+
+    while (!list_empty(&onwireq)) {
+        entry = list_entry(onwireq.next, afr_local_t, ta_onwireq);
+        list_del_init(&entry->ta_onwireq);
+        if (entry->ta_failed_subvol == bad_child) {
+            afr_post_op_handle_success(entry->transaction.frame, this);
+        } else {
+            afr_post_op_handle_failure(entry->transaction.frame, this, EIO);
+        }
+    }
+}
 
-        LOCK (&fd->inode->lock);
-        {
-                _ret = fd_ctx_get (fd, this, NULL);
-                if (_ret == 0) {
-                        fd_ctx_del (fd, this, NULL);
-                        op_ret = 1;
-                }
+int
+afr_changelog_post_op_done(call_frame_t *frame, xlator_t *this)
+{
+    afr_local_t *local = NULL;
+    afr_internal_lock_t *int_lock = NULL;
+    afr_private_t *priv = NULL;
+
+    local = frame->local;
+    priv = this->private;
+    int_lock = &local->internal_lock;
+
+    if (priv->thin_arbiter_count) {
+        /*fop should not come here with TA_WAIT_FOR_NOTIFY_LOCK_REL state */
+        afr_ta_dom_lock_check_and_release(local->fop_state, this);
+    }
+
+    /* Fail the FOP if post-op did not succeed on quorum no. of bricks. */
+    if (!afr_changelog_has_quorum(local, this)) {
+        local->op_ret = -1;
+        /*local->op_errno is already captured in changelog cbk*/
+    }
+
+    if (local->transaction.resume_stub) {
+        call_resume(local->transaction.resume_stub);
+        local->transaction.resume_stub = NULL;
+    }
+
+    int_lock->lock_cbk = afr_transaction_done;
+    afr_unlock(frame, this);
+
+    return 0;
+}
+
+static void
+afr_changelog_post_op_fail(call_frame_t *frame, xlator_t *this, int op_errno)
+{
+    afr_local_t *local = frame->local;
+    local->op_ret = -1;
+    local->op_errno = op_errno;
+
+    gf_msg(this->name, GF_LOG_ERROR, op_errno, AFR_MSG_THIN_ARB,
+           "Failing %s for gfid %s. Fop state is:%d", gf_fop_list[local->op],
+           uuid_utoa(local->inode->gfid), local->fop_state);
+
+    afr_changelog_post_op_done(frame, this);
+}
+
+unsigned char *
+afr_locked_nodes_get(afr_transaction_type type, afr_internal_lock_t *int_lock)
+{
+    /*Because same set of subvols participate in all lockee
+     * entities*/
+    return int_lock->lockee[0].locked_nodes;
+}
+
+int
+afr_changelog_call_count(afr_transaction_type type,
+                         unsigned char *pre_op_subvols,
+                         unsigned char *failed_subvols,
+                         unsigned int child_count)
+{
+    int i = 0;
+    int call_count = 0;
+
+    for (i = 0; i < child_count; i++) {
+        if (pre_op_subvols[i] && !failed_subvols[i]) {
+            call_count++;
         }
-        UNLOCK (&fd->inode->lock);
+    }
+
+    if (type == AFR_ENTRY_RENAME_TRANSACTION)
+        call_count *= 2;
 
-        return op_ret;
+    return call_count;
 }
 
+gf_boolean_t
+afr_txn_nothing_failed(call_frame_t *frame, xlator_t *this)
+{
+    afr_private_t *priv = NULL;
+    afr_local_t *local = NULL;
+    int i = 0;
+
+    local = frame->local;
+    priv = this->private;
+
+    if (priv->thin_arbiter_count) {
+        /* We need to perform post-op even if 1 data brick was down
+         * before the txn started.*/
+        if (AFR_COUNT(local->transaction.failed_subvols, priv->child_count))
+            return _gf_false;
+    }
+
+    for (i = 0; i < priv->child_count; i++) {
+        if (local->transaction.pre_op[i] &&
+            local->transaction.failed_subvols[i])
+            return _gf_false;
+    }
+
+    return _gf_true;
+}
 
-static int
-__changelog_enabled (afr_private_t *priv, afr_transaction_type type)
+void
+afr_handle_symmetric_errors(call_frame_t *frame, xlator_t *this)
 {
-	int ret = 0;
+    if (afr_is_symmetric_error(frame, this))
+        __mark_all_success(frame, this);
+}
 
-	switch (type) {
-	case AFR_DATA_TRANSACTION:
-		if (priv->data_change_log)
-			ret = 1;
-		
-		break;
+gf_boolean_t
+afr_has_quorum(unsigned char *subvols, xlator_t *this, call_frame_t *frame)
+{
+    unsigned int quorum_count = 0;
+    afr_private_t *priv = NULL;
+    unsigned int up_children_count = 0;
+
+    priv = this->private;
+    up_children_count = AFR_COUNT(subvols, priv->child_count);
+
+    if (afr_lookup_has_quorum(frame, up_children_count))
+        return _gf_true;
+
+    if (priv->quorum_count == AFR_QUORUM_AUTO) {
+        /*
+         * Special case for auto-quorum with an even number of nodes.
+         *
+         * A replica set with even count N can only handle the same
+         * number of failures as odd N-1 before losing "vanilla"
+         * quorum, and the probability of more simultaneous failures is
+         * actually higher.  For example, with a 1% chance of failure
+         * we'd have a 0.03% chance of two simultaneous failures with
+         * N=3 but a 0.06% chance with N=4.  However, the special case
+         * is necessary for N=2 because there's no real quorum in that
+         * case (i.e. can't normally survive *any* failures).  In that
+         * case, we treat the first node as a tie-breaker, allowing
+         * quorum to be retained in some cases while still honoring the
+         * all-important constraint that there can not simultaneously
+         * be two partitioned sets of nodes each believing they have
+         * quorum.  Of two equally sized sets, the one without that
+         * first node will lose.
+         *
+         * It turns out that the special case is beneficial for higher
+         * values of N as well.  Continuing the example above, the
+         * probability of losing quorum with N=4 and this type of
+         * quorum is (very) slightly lower than with N=3 and vanilla
+         * quorum.  The difference becomes even more pronounced with
+         * higher N.  Therefore, even though such replica counts are
+         * unlikely to be seen in practice, we might as well use the
+         * "special" quorum then as well.
+         */
+        if ((up_children_count * 2) == priv->child_count) {
+            return subvols[0];
+        }
+    }
 
-	case AFR_METADATA_TRANSACTION:
-		if (priv->metadata_change_log)
-			ret = 1;
+    if (priv->quorum_count == AFR_QUORUM_AUTO) {
+        quorum_count = priv->child_count / 2 + 1;
+    } else {
+        quorum_count = priv->quorum_count;
+    }
 
-		break;
+    if (up_children_count >= quorum_count)
+        return _gf_true;
 
-	case AFR_ENTRY_TRANSACTION:
-	case AFR_ENTRY_RENAME_TRANSACTION:
-		if (priv->entry_change_log)
-			ret = 1;
+    return _gf_false;
+}
 
-		break;
-		
-	case AFR_FLUSH_TRANSACTION:
-		ret = 1;
-	}
+static gf_boolean_t
+afr_has_fop_quorum(call_frame_t *frame)
+{
+    xlator_t *this = frame->this;
+    afr_local_t *local = frame->local;
+    unsigned char *locked_nodes = NULL;
 
-	return ret;
+    locked_nodes = afr_locked_nodes_get(local->transaction.type,
+                                        &local->internal_lock);
+    return afr_has_quorum(locked_nodes, this, NULL);
 }
 
+static gf_boolean_t
+afr_has_fop_cbk_quorum(call_frame_t *frame)
+{
+    afr_local_t *local = frame->local;
+    xlator_t *this = frame->this;
+    afr_private_t *priv = this->private;
+    unsigned char *success = alloca0(priv->child_count);
+    int i = 0;
+
+    for (i = 0; i < priv->child_count; i++) {
+        if (local->transaction.pre_op[i])
+            if (!local->transaction.failed_subvols[i])
+                success[i] = 1;
+    }
+
+    return afr_has_quorum(success, this, NULL);
+}
+
+gf_boolean_t
+afr_need_dirty_marking(call_frame_t *frame, xlator_t *this)
+{
+    afr_private_t *priv = this->private;
+    afr_local_t *local = NULL;
+    gf_boolean_t need_dirty = _gf_false;
+
+    local = frame->local;
+
+    if (!priv->quorum_count || !local->optimistic_change_log)
+        return _gf_false;
+
+    if (local->transaction.type == AFR_DATA_TRANSACTION ||
+        local->transaction.type == AFR_METADATA_TRANSACTION)
+        return _gf_false;
+
+    if (AFR_COUNT(local->transaction.failed_subvols, priv->child_count) ==
+        priv->child_count)
+        return _gf_false;
+
+    if (!afr_has_fop_cbk_quorum(frame))
+        need_dirty = _gf_true;
+
+    return need_dirty;
+}
+
+void
+afr_handle_quorum(call_frame_t *frame, xlator_t *this)
+{
+    afr_local_t *local = NULL;
+    afr_private_t *priv = NULL;
+    int i = 0;
+    const char *file = NULL;
+    uuid_t gfid = {0};
+
+    local = frame->local;
+    priv = frame->this->private;
+
+    if (priv->quorum_count == 0)
+        return;
+
+    /* If the fop already failed return right away to preserve errno */
+    if (local->op_ret == -1)
+        return;
+
+    /*
+     * A network split may happen just after the fop is wound, so check
+     * whether the fop succeeded in a way that still satisfies quorum. If
+     * it did not, mark the fop as failed and mark the changelogs so that
+     * they reflect that failure.
+     *
+     * Scenario:
+     * There are 3 mounts on 3 machines (node1, node2, node3), all writing
+     * to a single file. A network split happens such that node1 cannot see
+     * node2 and node3, and node2 and node3 cannot see node1. At the time
+     * the write is sent all the bricks are up; the split happens just after
+     * the write fop is wound on node1. Node1 thinks the write failed on
+     * node2 and node3, so it marks pending changelog for those two in the
+     * extended attributes on node1. Node2 and node3 think the write failed
+     * on node1, so they mark pending changelog for node1. When the network
+     * is stable again the file is already in split-brain. These checks
+     * prevent marking pending changelog on other subvolumes if the fop did
+     * not succeed in a way that still satisfies quorum. With this fix,
+     * node1 ends up with all pending changelog (FOOL) because the write
+     * succeeded only on node1 and failed on node2 and node3; instead of
+     * marking pending changelogs for node2 and node3 it just treats the
+     * fop as a failure and goes into the DIRTY state, whereas node2 and
+     * node3 say they are sources and have pending changelog for node1. So
+     * there is no split-brain; the problem is eliminated completely.
+     */
+
+    if (afr_has_fop_cbk_quorum(frame))
+        return;
+
+    if (afr_need_dirty_marking(frame, this))
+        goto set_response;
+
+    for (i = 0; i < priv->child_count; i++) {
+        if (local->transaction.pre_op[i])
+            afr_transaction_fop_failed(frame, frame->this, i);
+    }
+
+set_response:
+    local->op_ret = -1;
+    local->op_errno = afr_final_errno(local, priv);
+    if (local->op_errno == 0)
+        local->op_errno = afr_quorum_errno(priv);
+
+    if (local->fd) {
+        gf_uuid_copy(gfid, local->fd->inode->gfid);
+        file = uuid_utoa(gfid);
+    } else {
+        loc_path(&local->loc, local->loc.name);
+        file = local->loc.path;
+    }
+
+    gf_msg(frame->this->name, GF_LOG_WARNING, local->op_errno,
+           AFR_MSG_QUORUM_FAIL, "%s: Failing %s as quorum is not met", file,
+           gf_fop_list[local->op]);
+
+    switch (local->transaction.type) {
+        case AFR_ENTRY_TRANSACTION:
+        case AFR_ENTRY_RENAME_TRANSACTION:
+            afr_pick_error_xdata(local, priv, local->parent, local->readable,
+                                 local->parent2, local->readable2);
+            break;
+        default:
+            afr_pick_error_xdata(local, priv, local->inode, local->readable,
+                                 NULL, NULL);
+            break;
+    }
+}
+
+int
+afr_fill_ta_loc(xlator_t *this, loc_t *loc, gf_boolean_t is_gfid_based_fop)
+{
+    afr_private_t *priv = NULL;
+
+    priv = this->private;
+    loc->parent = inode_ref(priv->root_inode);
+    gf_uuid_copy(loc->pargfid, loc->parent->gfid);
+    loc->name = priv->pending_key[THIN_ARBITER_BRICK_INDEX];
+    if (is_gfid_based_fop && gf_uuid_is_null(priv->ta_gfid)) {
+        /* Except for afr_ta_id_file_check(), which is path based, all other
+         * gluster FOPs need the gfid. */
+        return -EINVAL;
+    }
+    gf_uuid_copy(loc->gfid, priv->ta_gfid);
+    loc->inode = inode_new(loc->parent->table);
+    if (!loc->inode) {
+        loc_wipe(loc);
+        return -ENOMEM;
+    }
+    return 0;
+}
 
 static int
-__changelog_needed_pre_op (call_frame_t *frame, xlator_t *this)
+afr_ta_post_op_done(int ret, call_frame_t *frame, void *opaque)
 {
-	afr_private_t * priv  = NULL;
-	afr_local_t   * local = NULL;
-	fd_t *          fd    = NULL;
+    xlator_t *this = NULL;
+    afr_local_t *local = NULL;
+    call_frame_t *txn_frame = NULL;
+    afr_ta_fop_state_t fop_state;
+
+    local = (afr_local_t *)opaque;
+    fop_state = local->fop_state;
+    txn_frame = local->transaction.frame;
+    this = frame->this;
+
+    if (ret == 0) {
+        /*Mark pending xattrs on the up data brick.*/
+        afr_post_op_handle_success(txn_frame, this);
+    } else {
+        afr_post_op_handle_failure(txn_frame, this, -ret);
+    }
+
+    STACK_DESTROY(frame->root);
+    afr_ta_process_onwireq(fop_state, this);
+
+    return 0;
+}
+
+int **
+afr_set_changelog_xattr(afr_private_t *priv, unsigned char *pending,
+                        dict_t *xattr, afr_local_t *local)
+{
+    int **changelog = NULL;
+    int idx = 0;
+    int ret = 0;
+    int i;
+
+    if (local->is_new_entry == _gf_true) {
+        changelog = afr_mark_pending_changelog(priv, pending, xattr,
+                                               local->cont.dir_fop.buf.ia_type);
+    } else {
+        idx = afr_index_for_transaction_type(local->transaction.type);
+        changelog = afr_matrix_create(priv->child_count, AFR_NUM_CHANGE_LOGS);
+        if (!changelog) {
+            goto out;
+        }
+        for (i = 0; i < priv->child_count; i++) {
+            if (local->transaction.failed_subvols[i])
+                changelog[i][idx] = hton32(1);
+        }
+        ret = afr_set_pending_dict(priv, xattr, changelog);
+        if (ret < 0) {
+            afr_matrix_cleanup(changelog, priv->child_count);
+            return NULL;
+        }
+    }
 
-	int op_ret   = 0;
+out:
+    return changelog;
+}
 
-	priv  = this->private;
-	local = frame->local;
-	
-	if (__changelog_enabled (priv, local->transaction.type)) {
-		switch (local->op) {
+static void
+afr_ta_locked_xattrop_validate(afr_private_t *priv, afr_local_t *local,
+                               gf_boolean_t *valid)
+{
+    if (priv->ta_event_gen > local->ta_event_gen) {
+        /* We can't trust the ta's response anymore.*/
+        afr_ta_locked_priv_invalidate(priv);
+        *valid = _gf_false;
+        return;
+    }
+    return;
+}
 
-		case GF_FOP_WRITE:
-		case GF_FOP_FTRUNCATE:
-			/* 
-			   if it's a data transaction, we write the changelog
-			   only on the first write on an fd 
-			*/
-			
-			fd = local->fd;
-			if (!fd || __is_first_write_on_fd (this, fd))
-				op_ret = 1;
+static int
+afr_ta_post_op_do(void *opaque)
+{
+    afr_local_t *local = NULL;
+    afr_private_t *priv = NULL;
+    xlator_t *this = NULL;
+    dict_t *xattr = NULL;
+    unsigned char *pending = NULL;
+    int **changelog = NULL;
+    int failed_subvol = -1;
+    int success_subvol = -1;
+    loc_t loc = {
+        0,
+    };
+    int i = 0;
+    int ret = 0;
+    gf_boolean_t valid = _gf_true;
+
+    local = (afr_local_t *)opaque;
+    this = local->transaction.frame->this;
+    priv = this->private;
+
+    ret = afr_fill_ta_loc(this, &loc, _gf_true);
+    if (ret) {
+        gf_msg(this->name, GF_LOG_ERROR, -ret, AFR_MSG_THIN_ARB,
+               "Failed to populate loc for thin-arbiter.");
+        goto out;
+    }
+
+    xattr = dict_new();
+    if (!xattr) {
+        ret = -ENOMEM;
+        goto out;
+    }
+
+    pending = alloca0(priv->child_count);
+
+    for (i = 0; i < priv->child_count; i++) {
+        if (local->transaction.failed_subvols[i]) {
+            pending[i] = 1;
+            failed_subvol = i;
+        } else {
+            success_subvol = i;
+        }
+    }
+
+    changelog = afr_set_changelog_xattr(priv, pending, xattr, local);
+
+    if (!changelog) {
+        ret = -ENOMEM;
+        goto out;
+    }
+
+    ret = afr_ta_post_op_lock(this, &loc);
+    if (ret)
+        goto out;
+
+    ret = syncop_xattrop(priv->children[THIN_ARBITER_BRICK_INDEX], &loc,
+                         GF_XATTROP_ADD_ARRAY, xattr, NULL, NULL, NULL);
+    if (ret) {
+        gf_msg(this->name, GF_LOG_ERROR, -ret, AFR_MSG_THIN_ARB,
+               "Post-op on thin-arbiter id file %s failed for gfid %s.",
+               priv->pending_key[THIN_ARBITER_BRICK_INDEX],
+               uuid_utoa(local->inode->gfid));
+    }
+    LOCK(&priv->lock);
+    {
+        if (ret == 0) {
+            priv->ta_bad_child_index = failed_subvol;
+        } else if (ret == -EINVAL) {
+            priv->ta_bad_child_index = success_subvol;
+            ret = -EIO; /* TA failed the fop. Return EIO to application. */
+        }
 
-			break;
+        afr_ta_locked_xattrop_validate(priv, local, &valid);
+    }
+    UNLOCK(&priv->lock);
+    if (valid == _gf_false) {
+        gf_msg(this->name, GF_LOG_ERROR, EIO, AFR_MSG_THIN_ARB,
+               "Post-op on thin-arbiter id file %s for gfid %s invalidated due "
+               "to event-gen mismatch.",
+               priv->pending_key[THIN_ARBITER_BRICK_INDEX],
+               uuid_utoa(local->inode->gfid));
+        ret = -EIO;
+    }
+
+    afr_ta_post_op_unlock(this, &loc);
+out:
+    if (xattr)
+        dict_unref(xattr);
+
+    if (changelog)
+        afr_matrix_cleanup(changelog, priv->child_count);
+
+    loc_wipe(&loc);
+
+    return ret;
+}
 
-		case GF_FOP_FLUSH:
-			/* only do post-op on flush() */
+static int
+afr_ta_post_op_synctask(xlator_t *this, afr_local_t *local)
+{
+    call_frame_t *ta_frame = NULL;
+    int ret = 0;
+
+    ta_frame = afr_ta_frame_create(this);
+    if (!ta_frame) {
+        gf_msg(this->name, GF_LOG_ERROR, ENOMEM, AFR_MSG_THIN_ARB,
+               "Failed to create ta_frame");
+        goto err;
+    }
+    ret = synctask_new(this->ctx->env, afr_ta_post_op_do, afr_ta_post_op_done,
+                       ta_frame, local);
+    if (ret) {
+        gf_msg(this->name, GF_LOG_ERROR, ENOMEM, AFR_MSG_THIN_ARB,
+               "Failed to launch post-op on thin arbiter for gfid %s",
+               uuid_utoa(local->inode->gfid));
+        STACK_DESTROY(ta_frame->root);
+        goto err;
+    }
+
+    return ret;
+err:
+    afr_changelog_post_op_fail(local->transaction.frame, this, ENOMEM);
+    return ret;
+}
 
-			op_ret = 0;
-			break;
+static void
+afr_ta_set_fop_state(afr_private_t *priv, afr_local_t *local,
+                     int *on_wire_count)
+{
+    LOCK(&priv->lock);
+    {
+        if (priv->release_ta_notify_dom_lock == _gf_true) {
+            /* Put the fop in waitq until notify dom lock is released.*/
+            local->fop_state = TA_WAIT_FOR_NOTIFY_LOCK_REL;
+            list_add_tail(&local->ta_waitq, &priv->ta_waitq);
+        } else if (priv->ta_bad_child_index == AFR_CHILD_UNKNOWN) {
+            /* Post-op on thin-arbiter to decide success/failure. */
+            local->fop_state = TA_GET_INFO_FROM_TA_FILE;
+            *on_wire_count = ++priv->ta_on_wire_txn_count;
+            if (*on_wire_count > 1) {
+                /*Avoid sending multiple on-wire post-ops on TA*/
+                list_add_tail(&local->ta_onwireq, &priv->ta_onwireq);
+            }
+        } else if (local->ta_failed_subvol == priv->ta_bad_child_index) {
+            /* Post-op on TA not needed as the fop failed on the in-memory bad
+             * brick. Just mark pending xattrs on the good data brick.*/
+            local->fop_state = TA_INFO_IN_MEMORY_SUCCESS;
+            priv->ta_in_mem_txn_count++;
+        } else {
+            /* Post-op on TA not needed as the fop succeeded only on the
+             * in-memory bad data brick and not the good one. Fail the fop.*/
+            local->fop_state = TA_INFO_IN_MEMORY_FAILED;
+            priv->ta_in_mem_txn_count++;
+        }
+    }
+    UNLOCK(&priv->lock);
+}
 
-		default:
-			op_ret = 1;
-		}
-	}
+static void
+afr_ta_fill_failed_subvol(afr_private_t *priv, afr_local_t *local)
+{
+    int i = 0;
 
-	return op_ret;
+    for (i = 0; i < priv->child_count; i++) {
+        if (local->transaction.failed_subvols[i]) {
+            local->ta_failed_subvol = i;
+            break;
+        }
+    }
+}
+
+static void
+afr_post_op_handle_success(call_frame_t *frame, xlator_t *this)
+{
+    afr_local_t *local = NULL;
+
+    local = frame->local;
+    if (local->is_new_entry == _gf_true) {
+        afr_mark_new_entry_changelog(frame, this);
+    }
+    afr_changelog_post_op_do(frame, this);
+
+    return;
 }
 
+static void
+afr_post_op_handle_failure(call_frame_t *frame, xlator_t *this, int op_errno)
+{
+    afr_changelog_post_op_fail(frame, this, op_errno);
+
+    return;
+}
+
+static void
+afr_ta_decide_post_op_state(call_frame_t *frame, xlator_t *this)
+{
+    afr_private_t *priv = NULL;
+    afr_local_t *local = NULL;
+    int on_wire_count = 0;
+
+    priv = this->private;
+    local = frame->local;
+
+    afr_ta_set_fop_state(priv, local, &on_wire_count);
+
+    switch (local->fop_state) {
+        case TA_GET_INFO_FROM_TA_FILE:
+            if (on_wire_count == 1)
+                afr_ta_post_op_synctask(this, local);
+            /*else, fop is queued in ta_onwireq.*/
+            break;
+        case TA_WAIT_FOR_NOTIFY_LOCK_REL:
+            /* This queue is acted upon once the notify lock is released. */
+            break;
+        case TA_INFO_IN_MEMORY_SUCCESS:
+            afr_post_op_handle_success(frame, this);
+            break;
+        case TA_INFO_IN_MEMORY_FAILED:
+            afr_post_op_handle_failure(frame, this, EIO);
+            break;
+        default:
+            break;
+    }
+    return;
+}
+
+static void
+afr_handle_failure_using_thin_arbiter(call_frame_t *frame, xlator_t *this)
+{
+    afr_private_t *priv = this->private;
+    afr_local_t *local = frame->local;
+
+    afr_ta_fill_failed_subvol(priv, local);
+    gf_msg_debug(this->name, 0,
+                 "Fop failed on data brick (%s) for gfid=%s. "
+                 "ta info needed to decide fop result.",
+                 priv->children[local->ta_failed_subvol]->name,
+                 uuid_utoa(local->inode->gfid));
+    afr_ta_decide_post_op_state(frame, this);
+}
+
+void
+afr_changelog_post_op_do(call_frame_t *frame, xlator_t *this)
+{
+    afr_private_t *priv = this->private;
+    afr_local_t *local = NULL;
+    dict_t *xattr = NULL;
+    int i = 0;
+    int ret = 0;
+    int idx = 0;
+    int nothing_failed = 1;
+    gf_boolean_t need_undirty = _gf_false;
+
+    afr_handle_quorum(frame, this);
+    local = frame->local;
+    idx = afr_index_for_transaction_type(local->transaction.type);
+
+    xattr = dict_new();
+    if (!xattr) {
+        afr_changelog_post_op_fail(frame, this, ENOMEM);
+        goto out;
+    }
+
+    nothing_failed = afr_txn_nothing_failed(frame, this);
+
+    if (afr_changelog_pre_op_uninherit(frame, this))
+        need_undirty = _gf_false;
+    else
+        need_undirty = _gf_true;
+
+    if (local->op_ret < 0 && !nothing_failed) {
+        if (afr_need_dirty_marking(frame, this)) {
+            local->dirty[idx] = hton32(1);
+            goto set_dirty;
+        }
+
+        afr_changelog_post_op_done(frame, this);
+        goto out;
+    }
+
+    if (nothing_failed && !need_undirty) {
+        afr_changelog_post_op_done(frame, this);
+        goto out;
+    }
+
+    if (local->transaction.in_flight_sb) {
+        afr_changelog_post_op_fail(frame, this,
+                                   local->transaction.in_flight_sb_errno);
+        goto out;
+    }
+
+    for (i = 0; i < priv->child_count; i++) {
+        if (local->transaction.failed_subvols[i])
+            local->pending[i][idx] = hton32(1);
+    }
+
+    ret = afr_set_pending_dict(priv, xattr, local->pending);
+    if (ret < 0) {
+        afr_changelog_post_op_fail(frame, this, ENOMEM);
+        goto out;
+    }
+
+    if (need_undirty)
+        local->dirty[idx] = hton32(-1);
+    else
+        local->dirty[idx] = hton32(0);
+
+set_dirty:
+    ret = dict_set_static_bin(xattr, AFR_DIRTY, local->dirty,
+                              sizeof(int) * AFR_NUM_CHANGE_LOGS);
+    if (ret) {
+        afr_changelog_post_op_fail(frame, this, ENOMEM);
+        goto out;
+    }
+
+    afr_changelog_do(frame, this, xattr, afr_changelog_post_op_done,
+                     AFR_TRANSACTION_POST_OP);
+out:
+    if (xattr)
+        dict_unref(xattr);
+
+    return;
+}
 
 static int
-__changelog_needed_post_op (call_frame_t *frame, xlator_t *this)
+afr_changelog_post_op_now(call_frame_t *frame, xlator_t *this)
 {
-	afr_private_t * priv  = NULL;
-	afr_local_t   * local = NULL;
+    afr_private_t *priv = NULL;
+    afr_local_t *local = NULL;
+    int failed_count = 0;
+
+    priv = this->private;
+    local = frame->local;
+
+    if (priv->thin_arbiter_count) {
+        failed_count = AFR_COUNT(local->transaction.failed_subvols,
+                                 priv->child_count);
+        if (failed_count == 1) {
+            afr_handle_failure_using_thin_arbiter(frame, this);
+            return 0;
+        } else {
+            /* Txn either succeeded or failed on both data bricks. Let
+             * post_op_do handle it as the case might be. */
+        }
+    }
 
-	int op_ret = 0;
-	afr_transaction_type type = -1;
+    afr_changelog_post_op_do(frame, this);
+    return 0;
+}
 
-	priv  = this->private;
-	local = frame->local;
-	type  = local->transaction.type;
+gf_boolean_t
+afr_changelog_pre_op_uninherit(call_frame_t *frame, xlator_t *this)
+{
+    afr_local_t *local = NULL;
+    afr_private_t *priv = NULL;
+    afr_inode_ctx_t *ctx = NULL;
+    int i = 0;
+    gf_boolean_t ret = _gf_false;
+    int type = 0;
+
+    local = frame->local;
+    priv = this->private;
+    ctx = local->inode_ctx;
+
+    type = afr_index_for_transaction_type(local->transaction.type);
+    if (type != AFR_DATA_TRANSACTION)
+        return !local->transaction.dirtied;
+
+    if (local->transaction.no_uninherit)
+        return _gf_false;
+
+    /* This function must be idempotent. So check if we
+       were called before and return the same answer again.
+
+       It is important to keep this function idempotent for
+       the call in afr_changelog_post_op_safe() to not have
+       side effects on the call from afr_changelog_post_op_now()
+    */
+    if (local->transaction.uninherit_done)
+        return local->transaction.uninherit_value;
+
+    LOCK(&local->inode->lock);
+    {
+        for (i = 0; i < priv->child_count; i++) {
+            if (local->transaction.pre_op[i] != ctx->pre_op_done[type][i]) {
+                ret = !local->transaction.dirtied;
+                goto unlock;
+            }
+        }
 
-	if (__changelog_enabled (priv, type)) {
-                switch (local->op) {
+        if (ctx->inherited[type]) {
+            ret = _gf_true;
+            ctx->inherited[type]--;
+        } else if (ctx->on_disk[type]) {
+            ret = _gf_false;
+            ctx->on_disk[type]--;
+        } else {
+            /* ASSERT */
+            ret = _gf_false;
+        }
+
+        if (!ctx->inherited[type] && !ctx->on_disk[type]) {
+            for (i = 0; i < priv->child_count; i++)
+                ctx->pre_op_done[type][i] = 0;
+        }
+    }
+unlock:
+    UNLOCK(&local->inode->lock);
 
-                case GF_FOP_WRITE:
-                case GF_FOP_FTRUNCATE:
-                        op_ret = 0;
-                        break;
+    local->transaction.uninherit_done = _gf_true;
+    local->transaction.uninherit_value = ret;
 
-                case GF_FOP_FLUSH:
-                        op_ret = __unset_fd_ctx_if_set (this, local->fd);
-                        break;
+    return ret;
+}
+
+gf_boolean_t
+afr_changelog_pre_op_inherit(call_frame_t *frame, xlator_t *this)
+{
+    afr_local_t *local = NULL;
+    afr_private_t *priv = NULL;
+    int i = 0;
+    gf_boolean_t ret = _gf_false;
+    int type = 0;
+
+    local = frame->local;
+    priv = this->private;
+
+    if (local->transaction.type != AFR_DATA_TRANSACTION)
+        return _gf_false;
+
+    type = afr_index_for_transaction_type(local->transaction.type);
+
+    LOCK(&local->inode->lock);
+    {
+        if (!local->inode_ctx->on_disk[type]) {
+            /* nothing to inherit yet */
+            ret = _gf_false;
+            goto unlock;
+        }
+
+        for (i = 0; i < priv->child_count; i++) {
+            if (local->transaction.pre_op[i] !=
+                local->inode_ctx->pre_op_done[type][i]) {
+                /* either inherit exactly, or don't */
+                ret = _gf_false;
+                goto unlock;
+            }
+        }
 
-                default:
-                        op_ret = 1;
+        local->inode_ctx->inherited[type]++;
+
+        ret = _gf_true;
+
+        local->transaction.inherited = _gf_true;
+    }
+unlock:
+    UNLOCK(&local->inode->lock);
+
+    return ret;
+}
+
+gf_boolean_t
+afr_changelog_pre_op_update(call_frame_t *frame, xlator_t *this)
+{
+    afr_local_t *local = NULL;
+    afr_private_t *priv = NULL;
+    int i = 0;
+    gf_boolean_t ret = _gf_false;
+    int type = 0;
+
+    local = frame->local;
+    priv = this->private;
+
+    if (local->transaction.type == AFR_ENTRY_TRANSACTION ||
+        local->transaction.type == AFR_ENTRY_RENAME_TRANSACTION)
+        return _gf_false;
+
+    if (local->transaction.inherited)
+        /* was already inherited in afr_changelog_pre_op */
+        return _gf_false;
+
+    if (!local->transaction.dirtied)
+        return _gf_false;
+
+    if (!afr_txn_nothing_failed(frame, this))
+        return _gf_false;
+
+    type = afr_index_for_transaction_type(local->transaction.type);
+
+    ret = _gf_false;
+
+    LOCK(&local->inode->lock);
+    {
+        if (!local->inode_ctx->on_disk[type]) {
+            for (i = 0; i < priv->child_count; i++)
+                local->inode_ctx->pre_op_done[type][i] =
+                    (!local->transaction.failed_subvols[i]);
+        } else {
+            for (i = 0; i < priv->child_count; i++)
+                if (local->inode_ctx->pre_op_done[type][i] !=
+                    (!local->transaction.failed_subvols[i])) {
+                    local->transaction.no_uninherit = 1;
+                    goto unlock;
                 }
         }
+        local->inode_ctx->on_disk[type]++;
 
-	return op_ret;
+        ret = _gf_true;
+    }
+unlock:
+    UNLOCK(&local->inode->lock);
+
+    return ret;
 }
 
+int
+afr_changelog_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int op_ret,
+                  int op_errno, dict_t *xattr, dict_t *xdata)
+{
+    afr_local_t *local = NULL;
+    int call_count = -1;
+    int child_index = -1;
+
+    local = frame->local;
+    child_index = (long)cookie;
 
-static int
-afr_lock_server_count (afr_private_t *priv, afr_transaction_type type)
+    if (op_ret == -1) {
+        local->op_errno = op_errno;
+        afr_transaction_fop_failed(frame, this, child_index);
+    }
+
+    if (xattr)
+        local->transaction.changelog_xdata[child_index] = dict_ref(xattr);
+
+    call_count = afr_frame_return(frame);
+
+    if (call_count == 0) {
+        local->transaction.changelog_resume(frame, this);
+    }
+
+    return 0;
+}
+
+void
+afr_changelog_populate_xdata(call_frame_t *frame, afr_xattrop_type_t op,
+                             dict_t **xdata, dict_t **newloc_xdata)
 {
-	int ret = 0;
+    int i = 0;
+    int ret = 0;
+    char *key = NULL;
+    int keylen = 0;
+    const char *name = NULL;
+    dict_t *xdata1 = NULL;
+    dict_t *xdata2 = NULL;
+    xlator_t *this = NULL;
+    afr_local_t *local = NULL;
+    afr_private_t *priv = NULL;
+    gf_boolean_t need_entry_key_set = _gf_true;
+
+    local = frame->local;
+    this = THIS;
+    priv = this->private;
+
+    if (local->transaction.type == AFR_DATA_TRANSACTION ||
+        local->transaction.type == AFR_METADATA_TRANSACTION)
+        goto out;
+
+    if (!priv->esh_granular)
+        goto out;
+
+    xdata1 = dict_new();
+    if (!xdata1)
+        goto out;
+
+    name = local->loc.name;
+    if (local->op == GF_FOP_LINK)
+        name = local->newloc.name;
+
+    switch (op) {
+        case AFR_TRANSACTION_PRE_OP:
+            key = GF_XATTROP_ENTRY_IN_KEY;
+            break;
+        case AFR_TRANSACTION_POST_OP:
+            if (afr_txn_nothing_failed(frame, this)) {
+                key = GF_XATTROP_ENTRY_OUT_KEY;
+                for (i = 0; i < priv->child_count; i++) {
+                    if (!local->transaction.failed_subvols[i])
+                        continue;
+                    need_entry_key_set = _gf_false;
+                    break;
+                }
+                /* If the transaction itself did not fail and there
+                 * are no failed subvolumes, check whether the fop
+                 * failed due to a symmetric error. If it did, do
+                 * not set the ENTRY_OUT xattr, as that would end up
+                 * deleting a name index that was possibly created by
+                 * an earlier entry txn that may have failed on some
+                 * of the sub-volumes.
+                 */
+                if (local->op_ret)
+                    need_entry_key_set = _gf_false;
+            } else {
+                key = GF_XATTROP_ENTRY_IN_KEY;
+            }
+            break;
+    }
+
+    if (need_entry_key_set) {
+        keylen = strlen(key);
+        ret = dict_set_strn(xdata1, key, keylen, (char *)name);
+        if (ret)
+            gf_msg(THIS->name, GF_LOG_ERROR, 0, AFR_MSG_DICT_SET_FAILED,
+                   "%s/%s: Could not set %s key during xattrop",
+                   uuid_utoa(local->loc.pargfid), local->loc.name, key);
+        if (local->transaction.type == AFR_ENTRY_RENAME_TRANSACTION) {
+            xdata2 = dict_new();
+            if (!xdata2)
+                goto out;
+
+            ret = dict_set_strn(xdata2, key, keylen,
+                                (char *)local->newloc.name);
+            if (ret)
+                gf_msg(THIS->name, GF_LOG_ERROR, 0, AFR_MSG_DICT_SET_FAILED,
+                       "%s/%s: Could not set %s key during "
+                       "xattrop",
+                       uuid_utoa(local->newloc.pargfid), local->newloc.name,
+                       key);
+        }
+    }
+
+    *xdata = xdata1;
+    *newloc_xdata = xdata2;
+    xdata1 = xdata2 = NULL;
+out:
+    if (xdata1)
+        dict_unref(xdata1);
+    return;
+}
+
+int
+afr_changelog_prepare(xlator_t *this, call_frame_t *frame, int *call_count,
+                      afr_changelog_resume_t changelog_resume,
+                      afr_xattrop_type_t op, dict_t **xdata,
+                      dict_t **newloc_xdata)
+{
+    afr_private_t *priv = NULL;
+    afr_local_t *local = NULL;
+
+    local = frame->local;
+    priv = this->private;
 
-	switch (type) {
-	case AFR_FLUSH_TRANSACTION:
-	case AFR_DATA_TRANSACTION:
-		ret = priv->data_lock_server_count;
-		break;
+    *call_count = afr_changelog_call_count(
+        local->transaction.type, local->transaction.pre_op,
+        local->transaction.failed_subvols, priv->child_count);
 
-	case AFR_METADATA_TRANSACTION:
-		ret = priv->metadata_lock_server_count;
-		break;
+    if (*call_count == 0) {
+        changelog_resume(frame, this);
+        return -1;
+    }
 
-	case AFR_ENTRY_TRANSACTION:
-	case AFR_ENTRY_RENAME_TRANSACTION:
-		ret = priv->entry_lock_server_count;
-		break;
-	}
+    afr_changelog_populate_xdata(frame, op, xdata, newloc_xdata);
+    local->call_count = *call_count;
 
-	return ret;
+    local->transaction.changelog_resume = changelog_resume;
+    return 0;
 }
 
+int
+afr_changelog_do(call_frame_t *frame, xlator_t *this, dict_t *xattr,
+                 afr_changelog_resume_t changelog_resume, afr_xattrop_type_t op)
+{
+    afr_local_t *local = NULL;
+    afr_private_t *priv = NULL;
+    dict_t *xdata = NULL;
+    dict_t *newloc_xdata = NULL;
+    int i = 0;
+    int call_count = 0;
+    int ret = 0;
+
+    local = frame->local;
+    priv = this->private;
+
+    for (i = 0; i < priv->child_count; i++) {
+        if (local->transaction.changelog_xdata[i]) {
+            dict_unref(local->transaction.changelog_xdata[i]);
+            local->transaction.changelog_xdata[i] = NULL;
+        }
+    }
+
+    ret = afr_changelog_prepare(this, frame, &call_count, changelog_resume, op,
+                                &xdata, &newloc_xdata);
+
+    if (ret)
+        return 0;
+
+    for (i = 0; i < priv->child_count; i++) {
+        if (!local->transaction.pre_op[i] ||
+            local->transaction.failed_subvols[i])
+            continue;
+
+        switch (local->transaction.type) {
+            case AFR_DATA_TRANSACTION:
+            case AFR_METADATA_TRANSACTION:
+                if (!local->fd) {
+                    STACK_WIND_COOKIE(
+                        frame, afr_changelog_cbk, (void *)(long)i,
+                        priv->children[i], priv->children[i]->fops->xattrop,
+                        &local->loc, GF_XATTROP_ADD_ARRAY, xattr, xdata);
+                } else {
+                    STACK_WIND_COOKIE(
+                        frame, afr_changelog_cbk, (void *)(long)i,
+                        priv->children[i], priv->children[i]->fops->fxattrop,
+                        local->fd, GF_XATTROP_ADD_ARRAY, xattr, xdata);
+                }
+                break;
+            case AFR_ENTRY_RENAME_TRANSACTION:
+
+                STACK_WIND_COOKIE(frame, afr_changelog_cbk, (void *)(long)i,
+                                  priv->children[i],
+                                  priv->children[i]->fops->xattrop,
+                                  &local->transaction.new_parent_loc,
+                                  GF_XATTROP_ADD_ARRAY, xattr, newloc_xdata);
+                call_count--;
+
+                /* fall through */
+
+            case AFR_ENTRY_TRANSACTION:
+                if (local->fd)
+                    STACK_WIND_COOKIE(
+                        frame, afr_changelog_cbk, (void *)(long)i,
+                        priv->children[i], priv->children[i]->fops->fxattrop,
+                        local->fd, GF_XATTROP_ADD_ARRAY, xattr, xdata);
+                else
+                    STACK_WIND_COOKIE(frame, afr_changelog_cbk, (void *)(long)i,
+                                      priv->children[i],
+                                      priv->children[i]->fops->xattrop,
+                                      &local->transaction.parent_loc,
+                                      GF_XATTROP_ADD_ARRAY, xattr, xdata);
+                break;
+        }
+
+        if (!--call_count)
+            break;
+    }
 
-/* {{{ unlock */
+    if (xdata)
+        dict_unref(xdata);
+    if (newloc_xdata)
+        dict_unref(newloc_xdata);
+    return 0;
+}
 
-int32_t
-afr_unlock_common_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
-		       int32_t op_ret, int32_t op_errno)
+static void
+afr_init_optimistic_changelog_for_txn(xlator_t *this, afr_local_t *local)
 {
-	afr_local_t *local;
-	int call_count = 0;
+    int locked_count = 0;
+    afr_private_t *priv = NULL;
 
-	local = frame->local;
+    priv = this->private;
 
-	LOCK (&frame->lock);
-	{
-		call_count = --local->call_count;
-	}
-	UNLOCK (&frame->lock);
+    locked_count = AFR_COUNT(local->transaction.pre_op, priv->child_count);
+    if (priv->optimistic_change_log && locked_count == priv->child_count)
+        local->optimistic_change_log = 1;
 
-	if (call_count == 0) {
-		local->transaction.done (frame, this);
-	}
-	
-	return 0;
+    return;
 }
 
+int
+afr_changelog_pre_op(call_frame_t *frame, xlator_t *this)
+{
+    afr_private_t *priv = this->private;
+    int i = 0;
+    int ret = 0;
+    int call_count = 0;
+    int op_errno = 0;
+    afr_local_t *local = NULL;
+    afr_internal_lock_t *int_lock = NULL;
+    unsigned char *locked_nodes = NULL;
+    int idx = -1;
+    gf_boolean_t pre_nop = _gf_true;
+    dict_t *xdata_req = NULL;
+
+    local = frame->local;
+    int_lock = &local->internal_lock;
+    idx = afr_index_for_transaction_type(local->transaction.type);
+
+    locked_nodes = afr_locked_nodes_get(local->transaction.type, int_lock);
+
+    for (i = 0; i < priv->child_count; i++) {
+        if (locked_nodes[i]) {
+            local->transaction.pre_op[i] = 1;
+            call_count++;
+        } else {
+            local->transaction.failed_subvols[i] = 1;
+        }
+    }
+
+    afr_init_optimistic_changelog_for_txn(this, local);
+
+    if (afr_changelog_pre_op_inherit(frame, this))
+        goto next;
+
+    /* This condition should not be met with the present code, as
+     * transaction.done will be called if locks could not be acquired on
+     * even a single node.
+     */
+    if (call_count == 0) {
+        op_errno = ENOTCONN;
+        goto err;
+    }
+
+    /* Check if the fop can be performed on at least
+     * quorum number of nodes.
+     */
+    if (priv->quorum_count && !afr_has_fop_quorum(frame)) {
+        op_errno = int_lock->lock_op_errno;
+        if (op_errno == 0)
+            op_errno = afr_quorum_errno(priv);
+        goto err;
+    }
+
+    xdata_req = dict_new();
+    if (!xdata_req) {
+        op_errno = ENOMEM;
+        goto err;
+    }
+
+    if (call_count < priv->child_count)
+        pre_nop = _gf_false;
+
+    /* Set an all-zero pending changelog so that in the cbk, we can get the
+     * current on-disk values. In a replica 3 volume with arbiter enabled,
+     * these values are needed to make the go/no-go decision for the fop
+     * phase and so avoid ending up in split-brain. */
+
+    ret = afr_set_pending_dict(priv, xdata_req, local->pending);
+    if (ret < 0) {
+        op_errno = ENOMEM;
+        goto err;
+    }
+
+    if (afr_needs_changelog_update(local)) {
+        local->dirty[idx] = hton32(1);
+
+        ret = dict_set_static_bin(xdata_req, AFR_DIRTY, local->dirty,
+                                  sizeof(int) * AFR_NUM_CHANGE_LOGS);
+        if (ret) {
+            op_errno = ENOMEM;
+            goto err;
+        }
+
+        pre_nop = _gf_false;
+        local->transaction.dirtied = 1;
+    }
+
+    if (pre_nop)
+        goto next;
+
+    if (!local->pre_op_compat) {
+        dict_copy(xdata_req, local->xdata_req);
+        goto next;
+    }
+
+    afr_changelog_do(frame, this, xdata_req, afr_transaction_perform_fop,
+                     AFR_TRANSACTION_PRE_OP);
+
+    if (xdata_req)
+        dict_unref(xdata_req);
+
+    return 0;
+next:
+    afr_transaction_perform_fop(frame, this);
+
+    if (xdata_req)
+        dict_unref(xdata_req);
+
+    return 0;
+err:
+    local->internal_lock.lock_cbk = afr_transaction_done;
+    local->op_ret = -1;
+    local->op_errno = op_errno;
+
+    afr_handle_lock_acquire_failure(local);
+
+    if (xdata_req)
+        dict_unref(xdata_req);
+
+    return 0;
+}
 
 int
-afr_unlock (call_frame_t *frame, xlator_t *this)
-{
-	struct flock flock;			
-
-	int i = 0;				
-	int call_count = 0;		     
-
-	afr_local_t *local = NULL;
-	afr_private_t * priv = this->private;
-
-	local = frame->local;
-	
-	call_count = afr_locked_nodes_count (local->transaction.locked_nodes, 
-					     priv->child_count);
-	
-	if (call_count == 0) {
-		local->transaction.done (frame, this);
-		return 0;
-	}
-
-	if (local->transaction.type == AFR_ENTRY_RENAME_TRANSACTION) 
-		call_count *= 2;
-
-	local->call_count = call_count;		
-
-	for (i = 0; i < priv->child_count; i++) {				
-		flock.l_start = local->transaction.start;			
-		flock.l_len   = local->transaction.len;
-		flock.l_type  = F_UNLCK;			
-
-		if (local->transaction.locked_nodes[i]) {
-			switch (local->transaction.type) {
-			case AFR_DATA_TRANSACTION:
-			case AFR_METADATA_TRANSACTION:
-			case AFR_FLUSH_TRANSACTION:
-
-				if (local->fd) {
-					STACK_WIND (frame, afr_unlock_common_cbk,	
-						    priv->children[i], 
-						    priv->children[i]->fops->finodelk, 
-						    local->fd, F_SETLK, &flock); 
-				} else {
-					STACK_WIND (frame, afr_unlock_common_cbk,	
-						    priv->children[i], 
-						    priv->children[i]->fops->inodelk, 
-						    &local->loc,  F_SETLK, &flock); 
-				}
-				
-				break;
-
-			case AFR_ENTRY_RENAME_TRANSACTION:
-				
-				STACK_WIND (frame, afr_unlock_common_cbk,	
-					    priv->children[i], 
-					    priv->children[i]->fops->entrylk, 
-					    &local->transaction.new_parent_loc, 
-					    local->transaction.new_basename,
-					    ENTRYLK_UNLOCK, ENTRYLK_WRLCK);
-
-				call_count--;
-
-				/* fall through */
-
-			case AFR_ENTRY_TRANSACTION:
-				if (local->fd) {
-					STACK_WIND (frame, afr_unlock_common_cbk,	
-						    priv->children[i], 
-						    priv->children[i]->fops->fentrylk, 
-						    local->fd, 
-						    local->transaction.basename,
-						    ENTRYLK_UNLOCK, ENTRYLK_WRLCK);
-				} else {
-					STACK_WIND (frame, afr_unlock_common_cbk,	
-						    priv->children[i], 
-						    priv->children[i]->fops->entrylk, 
-						    &local->transaction.parent_loc, 
-						    local->transaction.basename,
-						    ENTRYLK_UNLOCK, ENTRYLK_WRLCK);
-
-				}
-				break;
-			}
-			
-			if (!--call_count)
-				break;
-		}
-	}
-
-	return 0;
+afr_post_nonblocking_lock_cbk(call_frame_t *frame, xlator_t *this)
+{
+    afr_internal_lock_t *int_lock = NULL;
+    afr_local_t *local = NULL;
+
+    local = frame->local;
+    int_lock = &local->internal_lock;
+
+    /* Initiate blocking locks if non-blocking has failed */
+    if (int_lock->lock_op_ret < 0) {
+        gf_msg_debug(this->name, 0,
+                     "Non blocking locks failed. Proceeding to blocking");
+        int_lock->lock_cbk = afr_internal_lock_finish;
+        afr_blocking_lock(frame, this);
+    } else {
+        gf_msg_debug(this->name, 0,
+                     "Non blocking locks done. Proceeding to FOP");
+
+        afr_internal_lock_finish(frame, this);
+    }
+
+    return 0;
 }
 
-/* }}} */
+int
+afr_post_blocking_rename_cbk(call_frame_t *frame, xlator_t *this)
+{
+    afr_internal_lock_t *int_lock = NULL;
+    afr_local_t *local = NULL;
+
+    local = frame->local;
+    int_lock = &local->internal_lock;
+
+    if (int_lock->lock_op_ret < 0) {
+        gf_msg(this->name, GF_LOG_INFO, 0, AFR_MSG_INTERNAL_LKS_FAILED,
+               "Blocking entrylks failed.");
 
+        afr_transaction_done(frame, this);
+    } else {
+        gf_msg_debug(this->name, 0,
+                     "Blocking entrylks done. Proceeding to FOP");
 
-/* {{{ pending */
-
-int32_t
-afr_changelog_post_op_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
-			   int32_t op_ret, int32_t op_errno, dict_t *xattr)
-{
-	afr_private_t * priv  = NULL;
-	afr_local_t *   local = NULL;
-	
-	int call_count = -1;
-
-	priv  = this->private;
-	local = frame->local;
-
-	LOCK (&frame->lock);
-	{
-		call_count = --local->call_count;
-	}
-	UNLOCK (&frame->lock);
-
-	if (call_count == 0) {
-		if (afr_lock_server_count (priv, local->transaction.type) == 0) {
-			local->transaction.done (frame, this);
-		} else {
-			afr_unlock (frame, this);
-		}
-	}
-
-	return 0;	
-}
-
-
-int 
-afr_changelog_post_op (call_frame_t *frame, xlator_t *this)
-{
-	afr_private_t * priv = this->private;
-
-	int ret        = 0;
-	int i          = 0;				
-	int call_count = 0;
-	
-	afr_local_t *  local = NULL;	
-	dict_t *       xattr = dict_ref (get_new_dict ());
-
-	local = frame->local;
-
-	__mark_all_success (local->pending_array, priv->child_count);
-	__mark_down_children (local->pending_array, priv->child_count, local->child_up);
-
-	call_count = afr_up_children_count (priv->child_count, local->child_up); 
-
-	if (local->transaction.type == AFR_ENTRY_RENAME_TRANSACTION) {
-		call_count *= 2;
-	}
-
-	local->call_count = call_count;		
-
-	if (call_count == 0) {
-		/* no child is up */
-		dict_unref (xattr);
-		afr_unlock (frame, this);
-		return 0;
-	}
-
-	for (i = 0; i < priv->child_count; i++) {					
-		if (local->child_up[i]) {
-			ret = dict_set_static_bin (xattr, local->transaction.pending, 
-						   local->pending_array, 
-						   priv->child_count * sizeof (int32_t));
-			if (ret < 0)
-				gf_log (this->name, GF_LOG_ERROR, 
-					"failed to set pending entry");
-
-
-			switch (local->transaction.type) {
-			case AFR_DATA_TRANSACTION:
-			case AFR_METADATA_TRANSACTION:
-			case AFR_FLUSH_TRANSACTION:
-			{
-				if (local->fd)
-					STACK_WIND (frame, afr_changelog_post_op_cbk,
-						    priv->children[i], 
-						    priv->children[i]->fops->fxattrop,
-						    local->fd, 
-						    GF_XATTROP_ADD_ARRAY, xattr);
-				else 
-					STACK_WIND (frame, afr_changelog_post_op_cbk,
-						    priv->children[i], 
-						    priv->children[i]->fops->xattrop,
-						    &local->loc, 
-						    GF_XATTROP_ADD_ARRAY, xattr);
-			}
-			break;
-
-			case AFR_ENTRY_RENAME_TRANSACTION:
-			{
-				STACK_WIND_COOKIE (frame, afr_changelog_post_op_cbk,
-						   (void *) (long) i,
-						   priv->children[i],
-						   priv->children[i]->fops->xattrop,
-						   &local->transaction.new_parent_loc,
-						   GF_XATTROP_ADD_ARRAY, xattr);
-				
-				call_count--;
-			}
-
-			/* 
-			   set it again because previous stack_wind
-			   might have already returned (think of case
-			   where subvolume is posix) and would have
-			   used the dict as placeholder for return
-			   value
-			*/
-			ret = dict_set_static_bin (xattr, local->transaction.pending, 
-						   local->pending_array, 
-						   priv->child_count * sizeof (int32_t));
-			if (ret < 0)
-				gf_log (this->name, GF_LOG_ERROR, 
-					"failed to set pending entry");
-
-			/* fall through */
-
-			case AFR_ENTRY_TRANSACTION:
-			{
-				if (local->fd)
-					STACK_WIND (frame, afr_changelog_post_op_cbk,
-						    priv->children[i], 
-						    priv->children[i]->fops->fxattrop,
-						    local->fd, 
-						    GF_XATTROP_ADD_ARRAY, xattr);
-				else 
-					STACK_WIND (frame, afr_changelog_post_op_cbk,
-						    priv->children[i], 
-						    priv->children[i]->fops->xattrop,
-						    &local->transaction.parent_loc, 
-						    GF_XATTROP_ADD_ARRAY, xattr);
-			}
-			break;
-			}
-
-			if (!--call_count)
-				break;
-		}
-	}
-	
-	dict_unref (xattr);
-	return 0;
-}
-
-
-int32_t
-afr_changelog_pre_op_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
-			      int32_t op_ret, int32_t op_errno, dict_t *xattr)
-{
-	afr_local_t *   local = NULL;
-	afr_private_t * priv  = this->private;
-	loc_t       *   loc   = NULL;
-
-	int call_count  = -1;
-	int child_index = (long) cookie;
-
-	local = frame->local;
-	loc   = &local->loc;
-
-	LOCK (&frame->lock);
-	{
-		if (op_ret == -1) {
-			local->child_up[child_index] = 0;
-			
-			if (op_errno == ENOTSUP) {
-				gf_log (this->name, GF_LOG_ERROR,
-					"xattrop not supported by %s",
-					priv->children[child_index]->name);
-				local->op_ret = -1;
-			} else if (!child_went_down (op_ret, op_errno)) {
-				gf_log (this->name, GF_LOG_ERROR,
-					"xattrop failed on child %s: %s",
-					priv->children[child_index]->name, 
-					strerror (op_errno));
-			}
-			local->op_errno = op_errno;
-		}
-
-		call_count = --local->call_count;
-	}
-	UNLOCK (&frame->lock);
-
-	if (call_count == 0) {
-		if ((local->op_ret == -1) && 
-		    (local->op_errno == ENOTSUP)) {
-			local->transaction.resume (frame, this);
-		} else {
-			local->transaction.fop (frame, this);
-		}
-	}
-
-	return 0;	
-}
-
-
-int 
-afr_changelog_pre_op (call_frame_t *frame, xlator_t *this)
-{
-	afr_private_t * priv = this->private;
-
-	int i = 0;				
-	int ret = 0;
-	int call_count = 0;		     
-	dict_t *xattr = NULL;
-
-	afr_local_t *local = NULL;
-
-	local = frame->local;
-	xattr = get_new_dict ();
-	dict_ref (xattr);
-
-	call_count = afr_up_children_count (priv->child_count, 
-					    local->child_up); 
-
-	if (local->transaction.type == AFR_ENTRY_RENAME_TRANSACTION) {
-		call_count *= 2;
-	}
-
-	if (call_count == 0) {
-		/* no child is up */
-		dict_unref (xattr);
-		afr_unlock (frame, this);
-		return 0;
-	}
-
-	local->call_count = call_count;		
-
-	__mark_all_pending (local->pending_array, priv->child_count);
-
-	for (i = 0; i < priv->child_count; i++) {
-		if (local->child_up[i]) {
-			ret = dict_set_static_bin (xattr, 
-						   local->transaction.pending, 
-						   local->pending_array, 
-						   (priv->child_count * 
-						    sizeof (int32_t)));
-			if (ret < 0)
-				gf_log (this->name, GF_LOG_ERROR, 
-					"failed to set pending entry");
-
-
-			switch (local->transaction.type) {
-			case AFR_DATA_TRANSACTION:
-			case AFR_METADATA_TRANSACTION:
-			case AFR_FLUSH_TRANSACTION:
-			{
-				if (local->fd)
-					STACK_WIND_COOKIE (frame, 
-							   afr_changelog_pre_op_cbk,
-							   (void *) (long) i,
-							   priv->children[i], 
-							   priv->children[i]->fops->fxattrop,
-							   local->fd,
-							   GF_XATTROP_ADD_ARRAY, xattr);
-				else
-					STACK_WIND_COOKIE (frame, 
-							   afr_changelog_pre_op_cbk,
-							   (void *) (long) i,
-							   priv->children[i], 
-							   priv->children[i]->fops->xattrop,
-							   &(local->loc), 
-							   GF_XATTROP_ADD_ARRAY, xattr);
-			}
-			break;
-				
-			case AFR_ENTRY_RENAME_TRANSACTION: 
-			{
-				STACK_WIND_COOKIE (frame, 
-						   afr_changelog_pre_op_cbk,
-						   (void *) (long) i,
-						   priv->children[i], 
-						   priv->children[i]->fops->xattrop,
-						   &local->transaction.new_parent_loc, 
-						   GF_XATTROP_ADD_ARRAY, xattr);
-
-				call_count--;
-			}
-
-
-			/* 
-			   set it again because previous stack_wind
-			   might have already returned (think of case
-			   where subvolume is posix) and would have
-			   used the dict as placeholder for return
-			   value
-			*/
-
-			ret = dict_set_static_bin (xattr, local->transaction.pending, 
-						   local->pending_array, 
-						   priv->child_count * sizeof (int32_t));
-			if (ret < 0)
-				gf_log (this->name, GF_LOG_ERROR, 
-					"failed to set pending entry");
-
-			/* fall through */
-				
-			case AFR_ENTRY_TRANSACTION:
-			{
-				if (local->fd)
-					STACK_WIND_COOKIE (frame, 
-							   afr_changelog_pre_op_cbk,
-							   (void *) (long) i,
-							   priv->children[i], 
-							   priv->children[i]->fops->fxattrop,
-							   local->fd, 
-							   GF_XATTROP_ADD_ARRAY, xattr);
-				else
-					STACK_WIND_COOKIE (frame, 
-							   afr_changelog_pre_op_cbk,
-							   (void *) (long) i,
-							   priv->children[i], 
-							   priv->children[i]->fops->xattrop,
-							   &local->transaction.parent_loc, 
-							   GF_XATTROP_ADD_ARRAY, xattr);
-			}
-
-			break;
-			}
-
-			if (!--call_count)
-				break;
-		}
-	}
-
-	dict_unref (xattr);
-	return 0;
+        afr_internal_lock_finish(frame, this);
+    }
+    return 0;
+}
+
+int
+afr_post_lower_unlock_cbk(call_frame_t *frame, xlator_t *this)
+{
+    afr_internal_lock_t *int_lock = NULL;
+    afr_local_t *local = NULL;
+
+    local = frame->local;
+    int_lock = &local->internal_lock;
+
+    GF_ASSERT(!int_lock->higher_locked);
+
+    int_lock->lock_cbk = afr_post_blocking_rename_cbk;
+    afr_blocking_lock(frame, this);
+
+    return 0;
+}
+
+int
+afr_set_transaction_flock(xlator_t *this, afr_local_t *local,
+                          afr_lockee_t *lockee)
+{
+    afr_private_t *priv = NULL;
+    struct gf_flock *flock = NULL;
+
+    priv = this->private;
+    flock = &lockee->flock;
+
+    if ((priv->arbiter_count || local->transaction.eager_lock_on ||
+         priv->full_lock) &&
+        local->transaction.type == AFR_DATA_TRANSACTION) {
+        /*Lock entire file to avoid network split brains.*/
+        flock->l_len = 0;
+        flock->l_start = 0;
+    } else {
+        flock->l_len = local->transaction.len;
+        flock->l_start = local->transaction.start;
+    }
+    flock->l_type = F_WRLCK;
+
+    return 0;
+}
+
+int
+afr_lock(call_frame_t *frame, xlator_t *this)
+{
+    afr_internal_lock_t *int_lock = NULL;
+    afr_local_t *local = NULL;
+    int i = 0;
+
+    local = frame->local;
+    int_lock = &local->internal_lock;
+
+    int_lock->lock_cbk = afr_post_nonblocking_lock_cbk;
+    int_lock->domain = this->name;
+
+    switch (local->transaction.type) {
+        case AFR_DATA_TRANSACTION:
+        case AFR_METADATA_TRANSACTION:
+            for (i = 0; i < int_lock->lockee_count; i++) {
+                afr_set_transaction_flock(this, local, &int_lock->lockee[i]);
+            }
+
+            break;
+
+        case AFR_ENTRY_TRANSACTION:
+            int_lock->lk_basename = local->transaction.basename;
+            if (local->transaction.parent_loc.path)
+                int_lock->lk_loc = &local->transaction.parent_loc;
+            else
+                GF_ASSERT(local->fd);
+            break;
+        case AFR_ENTRY_RENAME_TRANSACTION:
+            break;
+    }
+    afr_lock_nonblocking(frame, this);
+
+    return 0;
+}
+
+static gf_boolean_t
+afr_locals_overlap(afr_local_t *local1, afr_local_t *local2)
+{
+    uint64_t start1 = local1->transaction.start;
+    uint64_t start2 = local2->transaction.start;
+    uint64_t end1 = 0;
+    uint64_t end2 = 0;
+
+    if (local1->transaction.len)
+        end1 = start1 + local1->transaction.len - 1;
+    else
+        end1 = ULLONG_MAX;
+
+    if (local2->transaction.len)
+        end2 = start2 + local2->transaction.len - 1;
+    else
+        end2 = ULLONG_MAX;
+
+    return ((end1 >= start2) && (end2 >= start1));
+}
+
+gf_boolean_t
+afr_has_lock_conflict(afr_local_t *local, gf_boolean_t waitlist_check)
+{
+    afr_local_t *each = NULL;
+    afr_lock_t *lock = NULL;
+
+    lock = &local->inode_ctx->lock[local->transaction.type];
+    /*
+     * Once a full-file lock is acquired in the eager-lock phase, overlapping
+     * writes do not compete for inode-locks; instead the lock is transferred
+     * to the next writes. Because of this, overlapping writes are not ordered.
+     * This can cause inconsistencies in replication.
+     * Example:
+     * Two overlapping writes w1, w2 are sent in parallel on same fd
+     * in two threads t1, t2.
+     * Both threads can execute afr_writev_wind in the following manner.
+     * t1 winds w1 on brick-0
+     * t2 winds w2 on brick-0
+     * t2 winds w2 on brick-1
+     * t1 winds w1 on brick-1
+     *
+     * This check makes sure the locks are not transferred for
+     * overlapping writes.
+     */
+    list_for_each_entry(each, &lock->owners, transaction.owner_list)
+    {
+        if (afr_locals_overlap(each, local)) {
+            return _gf_true;
+        }
+    }
+
+    if (!waitlist_check)
+        return _gf_false;
+    list_for_each_entry(each, &lock->waiting, transaction.wait_list)
+    {
+        if (afr_locals_overlap(each, local)) {
+            return _gf_true;
+        }
+    }
+    return _gf_false;
 }
 
 /* }}} */
+static void
+afr_copy_inodelk_vars(afr_internal_lock_t *dst, afr_internal_lock_t *src,
+                      xlator_t *this, int lockee_num)
+{
+    afr_private_t *priv = this->private;
+    afr_lockee_t *sl = &src->lockee[lockee_num];
+    afr_lockee_t *dl = &dst->lockee[lockee_num];
+
+    dst->domain = src->domain;
+    dl->flock.l_len = sl->flock.l_len;
+    dl->flock.l_start = sl->flock.l_start;
+    dl->flock.l_type = sl->flock.l_type;
+    dl->locked_count = sl->locked_count;
+    memcpy(dl->locked_nodes, sl->locked_nodes,
+           priv->child_count * sizeof(*dl->locked_nodes));
+}
 
-/* {{{ lock */
+void
+__afr_transaction_wake_shared(afr_local_t *local, struct list_head *shared)
+{
+    gf_boolean_t conflict = _gf_false;
+    afr_local_t *each = NULL;
+    afr_lock_t *lock = &local->inode_ctx->lock[local->transaction.type];
+
+    while (!conflict) {
+        if (list_empty(&lock->waiting))
+            return;
+        each = list_entry(lock->waiting.next, afr_local_t,
+                          transaction.wait_list);
+        if (afr_has_lock_conflict(each, _gf_false)) {
+            conflict = _gf_true;
+        }
+        if (conflict && !list_empty(&lock->owners))
+            return;
+        afr_copy_inodelk_vars(&each->internal_lock, &local->internal_lock,
+                              each->transaction.frame->this, 0);
+        list_move_tail(&each->transaction.wait_list, shared);
+        list_add_tail(&each->transaction.owner_list, &lock->owners);
+    }
+}
 
-static
-int afr_lock_rec (call_frame_t *frame, xlator_t *this, int child_index);
+static void
+afr_lock_resume_shared(struct list_head *list)
+{
+    afr_local_t *each = NULL;
+
+    while (!list_empty(list)) {
+        each = list_entry(list->next, afr_local_t, transaction.wait_list);
+        list_del_init(&each->transaction.wait_list);
+        afr_changelog_pre_op(each->transaction.frame,
+                             each->transaction.frame->this);
+    }
+}
+
+int
+afr_internal_lock_finish(call_frame_t *frame, xlator_t *this)
+{
+    afr_local_t *local = frame->local;
+    afr_lock_t *lock = NULL;
+
+    local->internal_lock.lock_cbk = NULL;
+    if (!local->transaction.eager_lock_on) {
+        if (local->internal_lock.lock_op_ret < 0) {
+            afr_transaction_done(frame, this);
+            return 0;
+        }
+        afr_changelog_pre_op(frame, this);
+    } else {
+        lock = &local->inode_ctx->lock[local->transaction.type];
+        if (local->internal_lock.lock_op_ret < 0) {
+            afr_handle_lock_acquire_failure(local);
+        } else {
+            lock->event_generation = local->event_generation;
+            afr_changelog_pre_op(frame, this);
+        }
+    }
+
+    return 0;
+}
 
-int32_t
-afr_lock_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
-	      int32_t op_ret, int32_t op_errno)
+gf_boolean_t
+afr_are_conflicting_ops_waiting(afr_local_t *local, xlator_t *this)
 {
-	afr_local_t *   local = NULL;
-	afr_private_t * priv = NULL;
-	int done = 0;
-	int child_index = (long) cookie;
+    afr_lock_t *lock = NULL;
+    lock = &local->inode_ctx->lock[local->transaction.type];
+
+    /* Let's say mount1 holds the eager-lock (full-lock) and, after the
+     * eager-lock is taken, mount2 opens the same file: mount2 won't be able
+     * to perform any {meta,}data operations until mount1 releases eager-lock.
+     * To avoid such a scenario, do not enable eager-lock for this transaction
+     * if open-fd-count is > 1 for metadata transactions and if num-inodelks > 1
+     * for data transactions.
+     */
+
+    if (local->transaction.type == AFR_METADATA_TRANSACTION) {
+        if (local->inode_ctx->open_fd_count > 1) {
+            return _gf_true;
+        }
+    } else if (local->transaction.type == AFR_DATA_TRANSACTION) {
+        if (lock->num_inodelks > 1) {
+            return _gf_true;
+        }
+    }
 
-	int call_count = 0;
+    return _gf_false;
+}
 
-	local = frame->local;
-	priv  = this->private;
+gf_boolean_t
+afr_is_delayed_changelog_post_op_needed(call_frame_t *frame, xlator_t *this,
+                                        int delay)
+{
+    afr_local_t *local = NULL;
+    afr_lock_t *lock = NULL;
+    gf_boolean_t res = _gf_false;
+
+    local = frame->local;
+    lock = &local->inode_ctx->lock[local->transaction.type];
+
+    if (!afr_txn_nothing_failed(frame, this)) {
+        lock->release = _gf_true;
+        goto out;
+    }
+
+    if (afr_are_conflicting_ops_waiting(local, this)) {
+        lock->release = _gf_true;
+        goto out;
+    }
+
+    if (!list_empty(&lock->owners))
+        goto out;
+    else
+        GF_ASSERT(list_empty(&lock->waiting));
+
+    if (lock->release) {
+        goto out;
+    }
+
+    if (!delay) {
+        goto out;
+    }
+
+    if (local->transaction.disable_delayed_post_op) {
+        goto out;
+    }
+
+    if ((local->op != GF_FOP_WRITE) && (local->op != GF_FOP_FXATTROP) &&
+        (local->op != GF_FOP_FSYNC)) {
+        /* Only writes/fsyncs are allowed; shard does [f]xattrops on writes,
+         * so they are fine too. */
+        goto out;
+    }
+
+    res = _gf_true;
+out:
+    return res;
+}
 
-	LOCK (&frame->lock);
-	{
-		if (local->transaction.type == AFR_ENTRY_RENAME_TRANSACTION) {
-			/* wait for the other lock to return */
-			call_count = --local->call_count;
-		}
+void
+afr_delayed_changelog_wake_up_cbk(void *data)
+{
+    afr_lock_t *lock = NULL;
+    afr_local_t *local = data;
+    afr_local_t *timer_local = NULL;
+    struct list_head shared;
+
+    INIT_LIST_HEAD(&shared);
+    lock = &local->inode_ctx->lock[local->transaction.type];
+    LOCK(&local->inode->lock);
+    {
+        timer_local = list_entry(lock->post_op.next, afr_local_t,
+                                 transaction.owner_list);
+        if (list_empty(&lock->owners) && (local == timer_local)) {
+            GF_ASSERT(list_empty(&lock->waiting));
+            /*Last owner*/
+            lock->release = _gf_true;
+            lock->delay_timer = NULL;
+        }
+    }
+    UNLOCK(&local->inode->lock);
+    afr_changelog_post_op_now(local->transaction.frame,
+                              local->transaction.frame->this);
+}
 
-		if (op_ret == -1) {
-			if (op_errno == ENOSYS) {
-				/* return ENOTSUP */
-				gf_log (this->name, GF_LOG_ERROR,
-					"subvolume does not support locking. "
-					"please load features/posix-locks xlator on server");
-				local->op_ret   = op_ret;
-				done = 1;
-			}
+/* SET operation */
+int
+afr_fd_report_unstable_write(xlator_t *this, afr_local_t *local)
+{
+    LOCK(&local->inode->lock);
+    {
+        local->inode_ctx->witnessed_unstable_write = _gf_true;
+    }
+    UNLOCK(&local->inode->lock);
 
-			local->child_up[child_index] = 0;
-			local->op_errno = op_errno;
-		}
-	}
-	UNLOCK (&frame->lock);
-	
-	if (call_count == 0) {
-		if ((local->op_ret == -1) &&
-		    (local->op_errno == ENOSYS)) {
-			afr_unlock (frame, this);
-		} else {
-			local->transaction.locked_nodes[child_index] = 1;
-			local->transaction.lock_count++;
-			afr_lock_rec (frame, this, child_index + 1);
-		}
-	}
-
-	return 0;
-}
-
-
-static loc_t *
-lower_path (loc_t *l1, const char *b1, loc_t *l2, const char *b2)
-{
-	int ret = 0;
-
-	ret = strcmp (l1->path, l2->path);
-	
-	if (ret == 0) 
-		ret = strcmp (b1, b2);
-
-	if (ret <= 0)
-		return l1;
-	else
-		return l2;
-}
-
-
-static
-int afr_lock_rec (call_frame_t *frame, xlator_t *this, int child_index)
-{
-	afr_local_t *   local = NULL;
-	afr_private_t * priv  = NULL;
-
-	struct flock flock;
-
-	loc_t * lower  = NULL;
-	loc_t * higher = NULL;
-
-	const char *lower_name  = NULL;
-	const char *higher_name = NULL;
-
-	local = frame->local;
-	priv  = this->private;
-
-	flock.l_start = local->transaction.start;
-	flock.l_len   = local->transaction.len;
-	flock.l_type  = F_WRLCK;
-
-	/* skip over children that are down */
-	while ((child_index < priv->child_count)
-	       && !local->child_up[child_index])
-		child_index++;
-
-	if ((child_index == priv->child_count) &&
-	    local->transaction.lock_count == 0) {
-
-		gf_log (this->name, GF_LOG_DEBUG,
-			"unable to lock on even one child");
-
-		local->op_ret   = -1;
-		local->op_errno = EAGAIN;
-
-		local->transaction.done (frame, this);
-		
-		return 0;
-
-	}
-
-	if ((child_index == priv->child_count) 
-	    || (local->transaction.lock_count == 
-		afr_lock_server_count (priv, local->transaction.type))) {
-
-		/* we're done locking */
-
-		if (__changelog_needed_pre_op (frame, this)) {
-			afr_changelog_pre_op (frame, this);
-		} else {
-			local->transaction.fop (frame, this);
-		}
-
-		return 0;
-	}
-
-	switch (local->transaction.type) {
-	case AFR_DATA_TRANSACTION:		
-	case AFR_METADATA_TRANSACTION:
-	case AFR_FLUSH_TRANSACTION:
-
-		if (local->fd) {
-			STACK_WIND_COOKIE (frame, afr_lock_cbk,
-					   (void *) (long) child_index,
-					   priv->children[child_index], 
-					   priv->children[child_index]->fops->finodelk,
-					   local->fd, F_SETLKW, &flock);
-			
-		} else {
-			STACK_WIND_COOKIE (frame, afr_lock_cbk,
-					   (void *) (long) child_index,
-					   priv->children[child_index], 
-					   priv->children[child_index]->fops->inodelk,
-					   &local->loc, F_SETLKW, &flock);
-		}
-		
-		break;
-		
-	case AFR_ENTRY_RENAME_TRANSACTION:
-	{
-		local->call_count = 2;
+    return 0;
+}
 
-		lower = lower_path (&local->transaction.parent_loc, 
-				    local->transaction.basename,
-				    &local->transaction.new_parent_loc,
-				    local->transaction.new_basename);
-		
-		lower_name = (lower == &local->transaction.parent_loc ? 
-			      local->transaction.basename :
-			      local->transaction.new_basename);
+/* TEST and CLEAR operation */
+gf_boolean_t
+afr_fd_has_witnessed_unstable_write(xlator_t *this, inode_t *inode)
+{
+    afr_inode_ctx_t *ctx = NULL;
+    gf_boolean_t witness = _gf_false;
 
-		higher = (lower == &local->transaction.parent_loc ? 
-			  &local->transaction.new_parent_loc :
-			  &local->transaction.parent_loc);
+    LOCK(&inode->lock);
+    {
+        (void)__afr_inode_ctx_get(this, inode, &ctx);
 
-		higher_name = (higher == &local->transaction.parent_loc ? 
-			       local->transaction.basename :
-			       local->transaction.new_basename);
+        if (ctx->witnessed_unstable_write) {
+            witness = _gf_true;
+            ctx->witnessed_unstable_write = _gf_false;
+        }
+    }
+    UNLOCK(&inode->lock);
 
-
-		/* TODO: these locks should be blocking */
-
-		STACK_WIND_COOKIE (frame, afr_lock_cbk,
-				   (void *) (long) child_index,
-				   priv->children[child_index], 
-				   priv->children[child_index]->fops->entrylk, 
-				   lower, lower_name,
-				   ENTRYLK_LOCK, ENTRYLK_WRLCK);
-
-		STACK_WIND_COOKIE (frame, afr_lock_cbk,
-				   (void *) (long) child_index,
-				   priv->children[child_index], 
-				   priv->children[child_index]->fops->entrylk, 
-				   higher, higher_name,
-				   ENTRYLK_LOCK, ENTRYLK_WRLCK);
-
-		break;
-	}
-		
-	case AFR_ENTRY_TRANSACTION:
-		if (local->fd) {
-			STACK_WIND_COOKIE (frame, afr_lock_cbk,
-					   (void *) (long) child_index,	
-					   priv->children[child_index], 
-					   priv->children[child_index]->fops->fentrylk, 
-					   local->fd, 
-					   local->transaction.basename,
-					   ENTRYLK_LOCK, ENTRYLK_WRLCK);
-		} else {
-			STACK_WIND_COOKIE (frame, afr_lock_cbk,
-					   (void *) (long) child_index,	
-					   priv->children[child_index], 
-					   priv->children[child_index]->fops->entrylk, 
-					   &local->transaction.parent_loc, 
-					   local->transaction.basename,
-					   ENTRYLK_LOCK, ENTRYLK_WRLCK);
-		}
-
-		break;
-	}
-
-	return 0;
-}
-
-
-int32_t afr_lock (call_frame_t *frame, xlator_t *this)
-{
-        frame->root->pid = (long) frame->root;
-
-	return afr_lock_rec (frame, this, 0);
+    return witness;
 }
 
+int
+afr_changelog_fsync_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+                        int op_ret, int op_errno, struct iatt *pre,
+                        struct iatt *post, dict_t *xdata)
+{
+    afr_private_t *priv = NULL;
+    int child_index = (long)cookie;
+    int call_count = -1;
+    afr_local_t *local = NULL;
 
-/* }}} */
+    priv = this->private;
+    local = frame->local;
+
+    if (op_ret != 0) {
+        /* Failure of fsync() is as good as failure of previous
+           write(). So treat it like one.
+        */
+        gf_msg(this->name, GF_LOG_WARNING, op_errno, AFR_MSG_FSYNC_FAILED,
+               "fsync(%s) failed on subvolume %s. Transaction was %s",
+               uuid_utoa(local->fd->inode->gfid),
+               priv->children[child_index]->name, gf_fop_list[local->op]);
+
+        afr_transaction_fop_failed(frame, this, child_index);
+    }
+
+    call_count = afr_frame_return(frame);
+
+    if (call_count == 0)
+        afr_changelog_post_op_now(frame, this);
 
-int32_t
-afr_transaction_resume (call_frame_t *frame, xlator_t *this)
+    return 0;
+}
+
+int
+afr_changelog_fsync(call_frame_t *frame, xlator_t *this)
 {
-	afr_local_t *   local = NULL;
-	afr_private_t * priv  = NULL;
+    afr_local_t *local = NULL;
+    int i = 0;
+    int call_count = 0;
+    afr_private_t *priv = NULL;
+    dict_t *xdata = NULL;
+    GF_UNUSED int ret = -1;
+
+    local = frame->local;
+    priv = this->private;
+
+    call_count = AFR_COUNT(local->transaction.pre_op, priv->child_count);
+
+    if (!call_count) {
+        /* will go straight to unlock */
+        afr_changelog_post_op_now(frame, this);
+        return 0;
+    }
+
+    local->call_count = call_count;
+
+    xdata = dict_new();
+    if (xdata) {
+        ret = dict_set_int32_sizen(xdata, "batch-fsync", 1);
+        ret = dict_set_str(xdata, GLUSTERFS_INTERNAL_FOP_KEY, "yes");
+    }
+
+    for (i = 0; i < priv->child_count; i++) {
+        if (!local->transaction.pre_op[i])
+            continue;
+
+        STACK_WIND_COOKIE(frame, afr_changelog_fsync_cbk, (void *)(long)i,
+                          priv->children[i], priv->children[i]->fops->fsync,
+                          local->fd, 1, xdata);
+        if (!--call_count)
+            break;
+    }
+
+    if (xdata)
+        dict_unref(xdata);
+
+    return 0;
+}
 
-	local = frame->local;
-	priv  = this->private;
+int
+afr_changelog_post_op_safe(call_frame_t *frame, xlator_t *this)
+{
+    afr_local_t *local = NULL;
+    afr_private_t *priv = NULL;
+
+    local = frame->local;
+    priv = this->private;
+
+    if (!local->fd || local->transaction.type != AFR_DATA_TRANSACTION) {
+        afr_changelog_post_op_now(frame, this);
+        return 0;
+    }
+
+    if (afr_changelog_pre_op_uninherit(frame, this) &&
+        afr_txn_nothing_failed(frame, this)) {
+        /* just detected that this post-op is about to
+           be optimized away as a new write() has
+           already piggybacked on this frame's changelog.
+           */
+        afr_changelog_post_op_now(frame, this);
+        return 0;
+    }
+
+    /* Calling afr_changelog_post_op_now() now will result in
+       issuing ->[f]xattrop().
+
+       Performing a hard POST-OP (->[f]xattrop() FOP) is a more
+       responsible operation than what it might appear on the surface.
+
+       The changelog of a file (in the xattr of the file on the server)
+       stores information (pending count) about the state of the file
+       on the OTHER server. This changelog is blindly trusted, and must
+       therefore be updated in such a way it remains trustworthy. This
+       implies that decrementing the pending count (essentially "clearing
+       the dirty flag") must be done STRICTLY after we are sure that the
+       operation on the other server has reached stable storage.
+
+       While the backend filesystem on that server will eventually flush
+       it to stable storage, we (being in userspace) have no mechanism
+       to get notified when the write became "stable".
+
+       This means we need to take matters into our own hands and issue an
+       fsync() EVEN IF THE APPLICATION WAS PERFORMING UNSTABLE WRITES,
+       and get an acknowledgement for it. And we need to wait for the
+       fsync() acknowledgement before initiating the hard POST-OP.
+
+       However if the FD itself was opened in O_SYNC or O_DSYNC then
+       we are already guaranteed that the writes were made stable as
+       part of the FOP itself. The same holds true for NFS stable
+       writes which happen on an anonymous FD with O_DSYNC or O_SYNC
+       flag set in the writev() @flags param. For all other write types,
+       mark a flag in the fdctx whenever an unstable write is witnessed.
+       */
+
+    if (!afr_fd_has_witnessed_unstable_write(this, local->inode)) {
+        afr_changelog_post_op_now(frame, this);
+        return 0;
+    }
+
+    /* Check whether users want durability and perform fsync/post-op
+     * accordingly.
+     */
+    if (priv->ensure_durability) {
+        /* Time to fsync() */
+        afr_changelog_fsync(frame, this);
+    } else {
+        afr_changelog_post_op_now(frame, this);
+    }
+
+    return 0;
+}
 
-	if (__changelog_needed_post_op (frame, this)) {
-		afr_changelog_post_op (frame, this);
-	} else {
-		if (afr_lock_server_count (priv, local->transaction.type) == 0) {
-			local->transaction.done (frame, this);
-		} else {
-			afr_unlock (frame, this);
-		}
-	}
+void
+afr_changelog_post_op(call_frame_t *frame, xlator_t *this)
+{
+    struct timespec delta = {
+        0,
+    };
+    afr_private_t *priv = NULL;
+    afr_local_t *local = frame->local;
+    afr_lock_t *lock = NULL;
+    gf_boolean_t post_op = _gf_true;
+    struct list_head shared;
+
+    priv = this->private;
+    delta.tv_sec = priv->post_op_delay_secs;
+    delta.tv_nsec = 0;
+
+    INIT_LIST_HEAD(&shared);
+    if (!local->transaction.eager_lock_on)
+        goto out;
+
+    lock = &local->inode_ctx->lock[local->transaction.type];
+    LOCK(&local->inode->lock);
+    {
+        list_del_init(&local->transaction.owner_list);
+        list_add(&local->transaction.owner_list, &lock->post_op);
+        __afr_transaction_wake_shared(local, &shared);
+
+        if (!afr_is_delayed_changelog_post_op_needed(frame, this,
+                                                     delta.tv_sec)) {
+            if (list_empty(&lock->owners))
+                lock->release = _gf_true;
+            goto unlock;
+        }
 
-	return 0;
+        GF_ASSERT(lock->delay_timer == NULL);
+        lock->delay_timer = gf_timer_call_after(
+            this->ctx, delta, afr_delayed_changelog_wake_up_cbk, local);
+        if (!lock->delay_timer) {
+            lock->release = _gf_true;
+        } else {
+            post_op = _gf_false;
+        }
+    }
+unlock:
+    UNLOCK(&local->inode->lock);
+
+    if (!list_empty(&shared)) {
+        afr_lock_resume_shared(&shared);
+    }
+
+out:
+    if (post_op) {
+        if (!local->transaction.eager_lock_on || lock->release) {
+            afr_changelog_post_op_safe(frame, this);
+        } else {
+            afr_changelog_post_op_now(frame, this);
+        }
+    }
 }
 
+int
+afr_transaction_resume(call_frame_t *frame, xlator_t *this)
+{
+    afr_local_t *local = NULL;
+
+    local = frame->local;
+
+    afr_restore_lk_owner(frame);
+
+    afr_handle_symmetric_errors(frame, this);
+
+    if (!local->pre_op_compat)
+        /* new mode, pre-op was done along
+           with OP */
+        afr_changelog_pre_op_update(frame, this);
+
+    afr_changelog_post_op(frame, this);
+
+    return 0;
+}
 
 /**
- * afr_transaction_child_died - inform that a child died during an fop
+ * afr_transaction_fop_failed - inform that an fop failed
  */
 
 void
-afr_transaction_child_died (call_frame_t *frame, xlator_t *this, int child_index)
+afr_transaction_fop_failed(call_frame_t *frame, xlator_t *this, int child_index)
 {
-	afr_local_t *   local = NULL;
-	afr_private_t * priv  = NULL;
+    afr_local_t *local = NULL;
 
-	local = frame->local;
-	priv  = this->private;
+    local = frame->local;
 
-	__mark_child_dead (local->pending_array, priv->child_count, child_index);
+    local->transaction.failed_subvols[child_index] = 1;
 }
 
+static gf_boolean_t
+__need_previous_lock_unlocked(afr_local_t *local)
+{
+    afr_lock_t *lock = NULL;
+
+    lock = &local->inode_ctx->lock[local->transaction.type];
+    if (!lock->acquired)
+        return _gf_false;
+    if (lock->acquired && lock->event_generation != local->event_generation)
+        return _gf_true;
+    return _gf_false;
+}
 
-int32_t
-afr_transaction (call_frame_t *frame, xlator_t *this, afr_transaction_type type)
+void
+__afr_eager_lock_handle(afr_local_t *local, gf_boolean_t *take_lock,
+                        gf_boolean_t *do_pre_op, afr_local_t **timer_local)
 {
-	afr_local_t *   local = NULL;
-	afr_private_t * priv  = NULL;
+    afr_lock_t *lock = NULL;
+    afr_local_t *owner_local = NULL;
+    xlator_t *this = local->transaction.frame->this;
+
+    local->transaction.eager_lock_on = _gf_true;
+    afr_set_lk_owner(local->transaction.frame, this, local->inode);
+
+    lock = &local->inode_ctx->lock[local->transaction.type];
+    if (__need_previous_lock_unlocked(local)) {
+        if (!list_empty(&lock->owners)) {
+            lock->release = _gf_true;
+        } else if (lock->delay_timer) {
+            lock->release = _gf_true;
+            if (gf_timer_call_cancel(this->ctx, lock->delay_timer)) {
+                /* This local is added to lock->frozen by the
+                 * lock->release check further below */
+            } else {
+                *timer_local = list_entry(lock->post_op.next, afr_local_t,
+                                          transaction.owner_list);
+                lock->delay_timer = NULL;
+            }
+        }
+    }
+
+    if (lock->release) {
+        list_add_tail(&local->transaction.wait_list, &lock->frozen);
+        *take_lock = _gf_false;
+        goto out;
+    }
+
+    if (lock->delay_timer) {
+        *take_lock = _gf_false;
+        if (gf_timer_call_cancel(this->ctx, lock->delay_timer)) {
+            list_add_tail(&local->transaction.wait_list, &lock->frozen);
+        } else {
+            *timer_local = list_entry(lock->post_op.next, afr_local_t,
+                                      transaction.owner_list);
+            afr_copy_inodelk_vars(&local->internal_lock,
+                                  &(*timer_local)->internal_lock, this, 0);
+            lock->delay_timer = NULL;
+            *do_pre_op = _gf_true;
+            list_add_tail(&local->transaction.owner_list, &lock->owners);
+        }
+        goto out;
+    }
+
+    if (!list_empty(&lock->owners)) {
+        if (!lock->acquired || afr_has_lock_conflict(local, _gf_true)) {
+            list_add_tail(&local->transaction.wait_list, &lock->waiting);
+            *take_lock = _gf_false;
+            goto out;
+        }
+        owner_local = list_entry(lock->owners.next, afr_local_t,
+                                 transaction.owner_list);
+        afr_copy_inodelk_vars(&local->internal_lock,
+                              &owner_local->internal_lock, this, 0);
+        *take_lock = _gf_false;
+        *do_pre_op = _gf_true;
+    }
+
+    if (lock->acquired)
+        GF_ASSERT(!(*take_lock));
+    list_add_tail(&local->transaction.owner_list, &lock->owners);
+out:
+    return;
+}
 
-	local = frame->local;
-	priv  = this->private;
+void
+afr_transaction_start(afr_local_t *local, xlator_t *this)
+{
+    afr_private_t *priv = NULL;
+    gf_boolean_t take_lock = _gf_true;
+    gf_boolean_t do_pre_op = _gf_false;
+    afr_local_t *timer_local = NULL;
+
+    priv = this->private;
+
+    if (local->transaction.type != AFR_DATA_TRANSACTION &&
+        local->transaction.type != AFR_METADATA_TRANSACTION)
+        goto lock_phase;
+
+    if (!priv->eager_lock)
+        goto lock_phase;
+
+    LOCK(&local->inode->lock);
+    {
+        __afr_eager_lock_handle(local, &take_lock, &do_pre_op, &timer_local);
+    }
+    UNLOCK(&local->inode->lock);
+lock_phase:
+    if (!local->transaction.eager_lock_on) {
+        afr_set_lk_owner(local->transaction.frame, this,
+                         local->transaction.frame->root);
+    }
+
+    if (take_lock) {
+        afr_lock(local->transaction.frame, this);
+    } else if (do_pre_op) {
+        afr_changelog_pre_op(local->transaction.frame, this);
+    }
+    /* Always call afr_delayed_changelog_wake_up_cbk() after the pre-op above
+     * so that any inheriting can happen */
+    if (timer_local)
+        afr_delayed_changelog_wake_up_cbk(timer_local);
+}
 
-	afr_transaction_local_init (local, priv);
+int
+afr_write_txn_refresh_done(call_frame_t *frame, xlator_t *this, int err)
+{
+    afr_local_t *local = frame->local;
+
+    if (err) {
+        AFR_SET_ERROR_AND_CHECK_SPLIT_BRAIN(-1, err);
+        goto fail;
+    }
+
+    afr_transaction_start(local, this);
+    return 0;
+fail:
+    local->transaction.unwind(frame, this);
+    AFR_STACK_DESTROY(frame);
+    return 0;
+}
 
-	local->transaction.resume = afr_transaction_resume;
-	local->transaction.type   = type;
+int
+afr_transaction_lockee_init(call_frame_t *frame)
+{
+    afr_local_t *local = frame->local;
+    afr_internal_lock_t *int_lock = &local->internal_lock;
+    afr_private_t *priv = frame->this->private;
+    int ret = 0;
+
+    switch (local->transaction.type) {
+        case AFR_DATA_TRANSACTION:
+        case AFR_METADATA_TRANSACTION:
+            ret = afr_add_inode_lockee(local, priv->child_count);
+            break;
+
+        case AFR_ENTRY_TRANSACTION:
+        case AFR_ENTRY_RENAME_TRANSACTION:
+            ret = afr_add_entry_lockee(local, &local->transaction.parent_loc,
+                                       local->transaction.basename,
+                                       priv->child_count);
+            if (ret) {
+                goto out;
+            }
+            if (local->op == GF_FOP_RENAME) {
+                ret = afr_add_entry_lockee(
+                    local, &local->transaction.new_parent_loc,
+                    local->transaction.new_basename, priv->child_count);
+                if (ret) {
+                    goto out;
+                }
 
-	if (afr_lock_server_count (priv, local->transaction.type) == 0) {
-		if (__changelog_needed_pre_op (frame, this)) {
-			afr_changelog_pre_op (frame, this);
-		} else {
-			local->transaction.fop (frame, this);
-		}
-	} else {
-		afr_lock (frame, this);
-	}
+                if (local->newloc.inode &&
+                    IA_ISDIR(local->newloc.inode->ia_type)) {
+                    ret = afr_add_entry_lockee(local, &local->newloc, NULL,
+                                               priv->child_count);
+                    if (ret) {
+                        goto out;
+                    }
+                }
+            } else if (local->op == GF_FOP_RMDIR) {
+                ret = afr_add_entry_lockee(local, &local->loc, NULL,
+                                           priv->child_count);
+                if (ret) {
+                    goto out;
+                }
+            }
+
+            if (int_lock->lockee_count > 1) {
+                qsort(int_lock->lockee, int_lock->lockee_count,
+                      sizeof(*int_lock->lockee), afr_entry_lockee_cmp);
+            }
+            break;
+    }
+out:
+    return ret;
+}
 
-	return 0;
+int
+afr_transaction(call_frame_t *frame, xlator_t *this, afr_transaction_type type)
+{
+    afr_local_t *local = NULL;
+    afr_private_t *priv = NULL;
+    int ret = -1;
+    int event_generation = 0;
+
+    local = frame->local;
+    priv = this->private;
+    local->transaction.frame = frame;
+
+    local->transaction.type = type;
+
+    if (priv->quorum_count && !afr_has_quorum(local->child_up, this, NULL)) {
+        ret = -afr_quorum_errno(priv);
+        goto out;
+    }
+
+    if (!afr_is_consistent_io_possible(local, priv, &ret)) {
+        ret = -ret; /*op_errno to ret conversion*/
+        goto out;
+    }
+
+    if (priv->thin_arbiter_count && !afr_ta_has_quorum(priv, local)) {
+        ret = -afr_quorum_errno(priv);
+        goto out;
+    }
+
+    ret = afr_transaction_local_init(local, this);
+    if (ret < 0)
+        goto out;
+
+    ret = afr_transaction_lockee_init(frame);
+    if (ret)
+        goto out;
+
+    if (type != AFR_METADATA_TRANSACTION) {
+        goto txn_start;
+    }
+
+    ret = afr_inode_get_readable(frame, local->inode, this, local->readable,
+                                 &event_generation, type);
+    if (ret < 0 ||
+        afr_is_inode_refresh_reqd(local->inode, this, priv->event_generation,
+                                  event_generation)) {
+        afr_inode_refresh(frame, this, local->inode, local->loc.gfid,
+                          afr_write_txn_refresh_done);
+        ret = 0;
+        goto out;
+    }
+
+txn_start:
+    ret = 0;
+    afr_transaction_start(local, this);
+out:
+    return ret;
 }
diff --git a/xlators/cluster/afr/src/afr-transaction.h b/xlators/cluster/afr/src/afr-transaction.h
index 77d7a813c46..beefa26f4a6 100644
--- a/xlators/cluster/afr/src/afr-transaction.h
+++ b/xlators/cluster/afr/src/afr-transaction.h
@@ -1,36 +1,75 @@
 /*
-   Copyright (c) 2007-2009 Z RESEARCH, Inc. <http://www.zresearch.com>
-   This file is part of GlusterFS.
-
-   GlusterFS is free software; you can redistribute it and/or modify
-   it under the terms of the GNU General Public License as published
-   by the Free Software Foundation; either version 3 of the License,
-   or (at your option) any later version.
-
-   GlusterFS is distributed in the hope that it will be useful, but
-   WITHOUT ANY WARRANTY; without even the implied warranty of
-   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
-   General Public License for more details.
-
-   You should have received a copy of the GNU General Public License
-   along with this program.  If not, see
-   <http://www.gnu.org/licenses/>.
+  Copyright (c) 2008-2012 Red Hat, Inc. <http://www.redhat.com>
+  This file is part of GlusterFS.
+
+  This file is licensed to you under your choice of the GNU Lesser
+  General Public License, version 3 or any later version (LGPLv3 or
+  later), or the GNU General Public License, version 2 (GPLv2), in all
+  cases as published by the Free Software Foundation.
 */
 
 #ifndef __TRANSACTION_H__
 #define __TRANSACTION_H__
 
-#define AFR_METADATA_PENDING "trusted.glusterfs.afr.metadata-pending"
+#include "afr.h"
+
+void
+afr_transaction_fop_failed(call_frame_t *frame, xlator_t *this,
+                           int child_index);
 
-#define AFR_DATA_PENDING "trusted.glusterfs.afr.data-pending"
+int32_t
+afr_transaction(call_frame_t *frame, xlator_t *this, afr_transaction_type type);
 
-#define AFR_ENTRY_PENDING "trusted.glusterfs.afr.entry-pending"
+int
+afr_set_pending_dict(afr_private_t *priv, dict_t *xattr, int32_t **pending);
 
 void
-afr_transaction_child_died (call_frame_t *frame, xlator_t *this,
-			    int child_index);
+afr_delayed_changelog_wake_up(xlator_t *this, fd_t *fd);
 
-int32_t
-afr_transaction (call_frame_t *frame, xlator_t *this, afr_transaction_type type);
+void
+__mark_all_success(call_frame_t *frame, xlator_t *this);
+
+gf_boolean_t
+afr_txn_nothing_failed(call_frame_t *frame, xlator_t *this);
+
+int
+afr_read_txn(call_frame_t *frame, xlator_t *this, inode_t *inode,
+             afr_read_txn_wind_t readfn, afr_transaction_type type);
+
+int
+afr_read_txn_continue(call_frame_t *frame, xlator_t *this, int subvol);
+
+void
+afr_pending_read_increment(afr_private_t *priv, int child_index);
+
+void
+afr_pending_read_decrement(afr_private_t *priv, int child_index);
+
+call_frame_t *
+afr_transaction_detach_fop_frame(call_frame_t *frame);
+gf_boolean_t
+afr_has_quorum(unsigned char *subvols, xlator_t *this, call_frame_t *frame);
+gf_boolean_t
+afr_needs_changelog_update(afr_local_t *local);
+void
+afr_zero_fill_stat(afr_local_t *local);
+
+void
+afr_pick_error_xdata(afr_local_t *local, afr_private_t *priv, inode_t *inode1,
+                     unsigned char *readable1, inode_t *inode2,
+                     unsigned char *readable2);
+int
+afr_transaction_resume(call_frame_t *frame, xlator_t *this);
+
+int
+afr_lock(call_frame_t *frame, xlator_t *this);
+
+void
+afr_delayed_changelog_wake_up_cbk(void *data);
+
+int
+afr_release_notify_lock_for_ta(void *opaque);
 
+int
+afr_ta_lock_release_done(int ret, call_frame_t *ta_frame, void *opaque);
 #endif /* __TRANSACTION_H__ */
diff --git a/xlators/cluster/afr/src/afr.c b/xlators/cluster/afr/src/afr.c
index acd7f8d0236..df7366f0a65 100644
--- a/xlators/cluster/afr/src/afr.c
+++ b/xlators/cluster/afr/src/afr.c
@@ -1,20 +1,11 @@
 /*
-   Copyright (c) 2007-2009 Z RESEARCH, Inc. <http://www.zresearch.com>
-   This file is part of GlusterFS.
-
-   GlusterFS is free software; you can redistribute it and/or modify
-   it under the terms of the GNU General Public License as published
-   by the Free Software Foundation; either version 3 of the License,
-   or (at your option) any later version.
-
-   GlusterFS is distributed in the hope that it will be useful, but
-   WITHOUT ANY WARRANTY; without even the implied warranty of
-   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
-   General Public License for more details.
-
-   You should have received a copy of the GNU General Public License
-   along with this program.  If not, see
-   <http://www.gnu.org/licenses/>.
+  Copyright (c) 2008-2012 Red Hat, Inc. <http://www.redhat.com>
+  This file is part of GlusterFS.
+
+  This file is licensed to you under your choice of the GNU Lesser
+  General Public License, version 3 or any later version (LGPLv3 or
+  later), or the GNU General Public License, version 2 (GPLv2), in all
+  cases as published by the Free Software Foundation.
 */
 
 #include <libgen.h>
@@ -24,2250 +15,1330 @@
 #include <stdlib.h>
 #include <signal.h>
 
-#ifndef _CONFIG_H
-#define _CONFIG_H
-#include "config.h"
-#endif
-
-#include "glusterfs.h"
-#include "afr.h"
-#include "dict.h"
-#include "xlator.h"
-#include "hashfn.h"
-#include "logging.h"
-#include "stack.h"
-#include "list.h"
-#include "call-stub.h"
-#include "defaults.h"
-#include "common-utils.h"
-#include "compat-errno.h"
-#include "compat.h"
-#include "byte-order.h"
-
-#include "afr-inode-read.h"
-#include "afr-inode-write.h"
-#include "afr-dir-read.h"
-#include "afr-dir-write.h"
-#include "afr-transaction.h"
-
-#include "afr-self-heal.h"
-
-
-/**
- * afr_local_cleanup - cleanup everything in frame->local
- */
-
-void
-afr_local_sh_cleanup (afr_local_t *local, xlator_t *this)
-{
-	afr_self_heal_t *sh = NULL;
-	afr_private_t   *priv = NULL;
-	int              i = 0;
-
-
-	sh = &local->self_heal;
-	priv = this->private;
-
-	if (sh->buf)
-		FREE (sh->buf);
-
-	if (sh->xattr) {
-		for (i = 0; i < priv->child_count; i++) {
-			if (sh->xattr[i]) {
-				dict_unref (sh->xattr[i]);
-				sh->xattr[i] = NULL;
-			}
-		}
-		FREE (sh->xattr);
-	}
-
-	if (sh->child_errno)
-		FREE (sh->child_errno);
-
-	if (sh->pending_matrix) {
-		for (i = 0; i < priv->child_count; i++) {
-			FREE (sh->pending_matrix[i]);
-		}
-		FREE (sh->pending_matrix);
-	}
-
-	if (sh->delta_matrix) {
-		for (i = 0; i < priv->child_count; i++) {
-			FREE (sh->delta_matrix[i]);
-		}
-		FREE (sh->delta_matrix);
-	}
-
-	if (sh->sources)
-		FREE (sh->sources);
-
-	if (sh->success)
-		FREE (sh->success);
-
-	if (sh->healing_fd) {
-		fd_unref (sh->healing_fd);
-		sh->healing_fd = NULL;
-	}
-
-	loc_wipe (&sh->parent_loc);
-}
-
-
-void 
-afr_local_cleanup (afr_local_t *local, xlator_t *this)
-{
-	if (!local)
-		return;
-
-	afr_local_sh_cleanup (local, this);
-
-	FREE (local->child_errno);
-	FREE (local->pending_array);
-
-	loc_wipe (&local->loc);
-	loc_wipe (&local->newloc);
-
-	FREE (local->transaction.locked_nodes);
-	FREE (local->transaction.child_errno);
-
-	FREE (local->transaction.basename);
-	FREE (local->transaction.new_basename);
-
-	loc_wipe (&local->transaction.parent_loc);	
-	loc_wipe (&local->transaction.new_parent_loc);
-
-	if (local->fd)
-		fd_unref (local->fd);
-	
-	if (local->xattr_req)
-		dict_unref (local->xattr_req);
-
-	FREE (local->child_up);
-
-	{ /* lookup */
-		if (local->cont.lookup.xattr)
-			dict_unref (local->cont.lookup.xattr);
-	}
-
-	{ /* getxattr */
-		if (local->cont.getxattr.name)
-			FREE (local->cont.getxattr.name);
-	}
-
-	{ /* lk */
-		if (local->cont.lk.locked_nodes)
-			FREE (local->cont.lk.locked_nodes);
-	}
-
-	{ /* checksum */
-		if (local->cont.checksum.file_checksum)
-			FREE (local->cont.checksum.file_checksum);
-		if (local->cont.checksum.dir_checksum)
-			FREE (local->cont.checksum.dir_checksum);
-	}
-
-	{ /* create */
-		if (local->cont.create.fd)
-			fd_unref (local->cont.create.fd);
-	}
-
-	{ /* writev */
-		FREE (local->cont.writev.vector);
-	}
-
-	{ /* setxattr */
-		if (local->cont.setxattr.dict)
-			dict_unref (local->cont.setxattr.dict);
-	}
-
-	{ /* removexattr */
-		FREE (local->cont.removexattr.name);
-	}
-
-	{ /* symlink */
-		FREE (local->cont.symlink.linkpath);
-	}
-}
-
-
-int
-afr_frame_return (call_frame_t *frame)
-{
-	afr_local_t *local = NULL;
-	int          call_count = 0;
-
-	local = frame->local;
-
-	LOCK (&frame->lock);
-	{
-		call_count = --local->call_count;
-	}
-	UNLOCK (&frame->lock);
-
-	return call_count;
-}
-
-/**
- * first_up_child - return the index of the first child that is up
- */
-
-int
-afr_first_up_child (afr_private_t *priv)
-{
-	xlator_t ** children = NULL;
-	int         ret      = -1;
-	int         i        = 0;
-
-	LOCK (&priv->lock);
-	{
-		children = priv->children;
-		for (i = 0; i < priv->child_count; i++) {
-			if (priv->child_up[i]) {
-				ret = i;
-				break;
-			}
-		}
-	}
-	UNLOCK (&priv->lock);
-
-	return ret;
-}
-
-
-/**
- * up_children_count - return the number of children that are up
- */
-
-int
-afr_up_children_count (int child_count, unsigned char *child_up)
-{
-	int i   = 0;
-	int ret = 0;
-
-	for (i = 0; i < child_count; i++)
-		if (child_up[i])
-			ret++;
-	return ret;
-}
-
-
-int
-afr_locked_nodes_count (unsigned char *locked_nodes, int child_count)
-{
-	int ret = 0;
-	int i;
-
-	for (i = 0; i < child_count; i++)
-		if (locked_nodes[i])
-			ret++;
-
-	return ret;
-}
-
-
-ino64_t
-afr_itransform (ino64_t ino, int child_count, int child_index)
-{
-	ino64_t scaled_ino = -1;
-
-	if (ino == ((uint64_t) -1)) {
-		scaled_ino = ((uint64_t) -1);
-		goto out;
-	}
+#include "afr-common.c"
+#include "afr-messages.h"
 
-	scaled_ino = (ino * child_count) + child_index;
-
-out:
-	return scaled_ino;
-}
-
-
-int
-afr_deitransform_orig (ino64_t ino, int child_count)
-{
-	int index = -1;
-
-	index = ino % child_count;
-
-	return index;
-}
-
-
-int
-afr_deitransform (ino64_t ino, int child_count)
-{
-	return 0;
-}
+struct volume_options options[];
 
+static char *afr_favorite_child_policies[AFR_FAV_CHILD_POLICY_MAX + 1] = {
+    [AFR_FAV_CHILD_NONE] = "none",
+    [AFR_FAV_CHILD_BY_SIZE] = "size",
+    [AFR_FAV_CHILD_BY_CTIME] = "ctime",
+    [AFR_FAV_CHILD_BY_MTIME] = "mtime",
+    [AFR_FAV_CHILD_BY_MAJORITY] = "majority",
+    [AFR_FAV_CHILD_POLICY_MAX] = NULL,
+};
 
-int
-afr_self_heal_cbk (call_frame_t *frame, xlator_t *this)
+int32_t
+notify(xlator_t *this, int32_t event, void *data, ...)
 {
-	afr_local_t *local = NULL;
-	int ret = -1;
-
-	local = frame->local;
-
-	if (local->govinda_gOvinda) {
-		ret = inode_ctx_put (local->cont.lookup.inode, this, 1);
-
-		if (ret < 0) {
-			local->op_ret   = -1;
-			local->op_errno = -ret;
-		}
-	} else {
-		inode_ctx_del (local->cont.lookup.inode, this, NULL);
-	}
-
-	AFR_STACK_UNWIND (frame, local->op_ret, local->op_errno,
-			  local->cont.lookup.inode,
-			  &local->cont.lookup.buf,
-			  local->cont.lookup.xattr);
-
-	return 0;
-}
+    int ret = -1;
+    va_list ap;
+    void *data2 = NULL;
 
+    va_start(ap, data);
+    data2 = va_arg(ap, dict_t *);
+    va_end(ap);
+    ret = afr_notify(this, event, data, data2);
 
-int
-afr_lookup_cbk (call_frame_t *frame, void *cookie,
-		xlator_t *this,	int32_t op_ret,	int32_t op_errno,
-		inode_t *inode,	struct stat *buf, dict_t *xattr)
-{
-	afr_local_t *   local = NULL;
-	afr_private_t * priv  = NULL;
-	struct stat *   lookup_buf = NULL;
-	int             call_count = -1;
-	int             child_index = -1;
-	int             prev_child_index = -1;
-	uint32_t        open_fd_count = 0;
-	int             ret = 0;
-
-	child_index = (long) cookie;
-	priv = this->private;
-
-	LOCK (&frame->lock);
-	{
-		local = frame->local;
-
-		lookup_buf = &local->cont.lookup.buf;
-
-		if (op_ret == -1) {
-			if (op_errno == ENOENT)
-				local->enoent_count++;
-			
-			if (op_errno != ENOTCONN)
-				local->op_errno = op_errno;
-
-			goto unlock;
-		}
-
-		if (afr_sh_has_metadata_pending (xattr, child_index, this))
-			local->need_metadata_self_heal = 1;
-
-		if (afr_sh_has_entry_pending (xattr, child_index, this))
-			local->need_entry_self_heal = 1;
-
-		if (afr_sh_has_data_pending (xattr, child_index, this))
-			local->need_data_self_heal = 1;
-
-		ret = dict_get_uint32 (xattr, GLUSTERFS_OPEN_FD_COUNT,
-				       &open_fd_count);
-		local->open_fd_count += open_fd_count;
-
-		/* in case of revalidate, we need to send stat of the
-		 * child whose stat was sent during the first lookup.
-		 * (so that time stamp does not vary with revalidate.
-		 * in case it is down, stat of the fist success will
-		 * be replied */
-
-		/* inode number should be preserved across revalidates */
-
-		if (local->success_count == 0) {
-			local->op_ret   = op_ret;
-				
-			local->cont.lookup.inode = inode;
-			local->cont.lookup.xattr = dict_ref (xattr);
-
-			*lookup_buf = *buf;
-			lookup_buf->st_ino = afr_itransform (buf->st_ino,
-							     priv->child_count,
-							     child_index);
-		} else {
-			if (FILETYPE_DIFFERS (buf, lookup_buf)) {
-				/* mismatching filetypes with same name
-				   -- Govinda !! GOvinda !!!
-				*/
-				local->govinda_gOvinda = 1;
-			}
-
-			if (PERMISSION_DIFFERS (buf, lookup_buf)) {
-				/* mismatching permissions */
-				local->need_metadata_self_heal = 1;
-			}
-
-			if (OWNERSHIP_DIFFERS (buf, lookup_buf)) {
-				/* mismatching permissions */
-				local->need_metadata_self_heal = 1;
-			}
-
-			if (SIZE_DIFFERS (buf, lookup_buf)
-			    && S_ISREG (buf->st_mode)) {
-				local->need_data_self_heal = 1;
-			}
-
-			prev_child_index = afr_deitransform_orig (lookup_buf->st_ino, 
-								  priv->child_count);
-			if (child_index < prev_child_index) {
-				*lookup_buf = *buf;
-				lookup_buf->st_ino = afr_itransform (buf->st_ino,
-								     priv->child_count,
-								     child_index);
-			}
-		}
-
-		local->success_count++;
-	}
-unlock:
-	UNLOCK (&frame->lock);
-
-	call_count = afr_frame_return (frame);
-
-	if (call_count == 0) {
-		if (local->op_ret == 0) {
-			/* KLUDGE: assuming DHT will not itransform in 
-			   revalidate */
-			if (local->cont.lookup.inode->ino)
-				lookup_buf->st_ino = 
-					local->cont.lookup.inode->ino;
-		}
-
-		if (local->success_count && local->enoent_count) {
-			local->need_metadata_self_heal = 1;
-			local->need_data_self_heal = 1;
-			local->need_entry_self_heal = 1;
-		}
-
-		if (local->success_count) {
-			/* check for govinda_gOvinda case in previous lookup */
-			if (!inode_ctx_get (local->cont.lookup.inode, 
-					   this, NULL))
-				local->need_data_self_heal = 1;
-		}
-
-		if ((local->need_metadata_self_heal
-		     || local->need_data_self_heal
-		     || local->need_entry_self_heal)
-		    && (!local->open_fd_count)) {
-
-			if (!local->cont.lookup.inode->st_mode) {
-				/* fix for RT #602 */
-				local->cont.lookup.inode->st_mode =
-					lookup_buf->st_mode;
-			}
-
-			afr_self_heal (frame, this, afr_self_heal_cbk);
-		} else {
-			AFR_STACK_UNWIND (frame, local->op_ret,
-					  local->op_errno,
-					  local->cont.lookup.inode, 
-					  &local->cont.lookup.buf,
-					  local->cont.lookup.xattr);
-		}
-	}
-
-	return 0;
+    return ret;
 }
 
-
-int
-afr_lookup (call_frame_t *frame, xlator_t *this,
-	    loc_t *loc, dict_t *xattr_req)
-{
-	afr_private_t *priv = NULL;
-	afr_local_t   *local = NULL;
-	int            ret = -1;
-	int            i = 0;
-	int32_t        op_errno = 0;
-
-
-	priv = this->private;
-
-	ALLOC_OR_GOTO (local, afr_local_t, out);
-
-	local->op_ret = -1;
-
-	frame->local = local;
-
-	loc_copy (&local->loc, loc);
-
-	local->reval_child_index = 0;
-
-	local->call_count = priv->child_count;
-
-	local->child_up = memdup (priv->child_up, priv->child_count);
-	local->child_count = afr_up_children_count (priv->child_count,
-						    local->child_up);
-
-	/* By default assume ENOTCONN. On success it will be set to 0. */
-	local->op_errno = ENOTCONN;
-	
-	if ((xattr_req == NULL)
-	    && (priv->metadata_self_heal
-		|| priv->data_self_heal
-		|| priv->entry_self_heal))
-		local->xattr_req = dict_new ();
-	else
-		local->xattr_req = dict_ref (xattr_req);
-
-	if (priv->metadata_self_heal) {
-		ret = dict_set_uint64 (local->xattr_req, AFR_METADATA_PENDING,
-				       priv->child_count * sizeof(int32_t));
-	}
-	
-	if (priv->data_self_heal) {
-		ret = dict_set_uint64 (local->xattr_req, AFR_DATA_PENDING,
-				       priv->child_count * sizeof(int32_t));
-	}
-	
-	if (priv->entry_self_heal) {
-		ret = dict_set_uint64 (local->xattr_req, AFR_ENTRY_PENDING,
-				       priv->child_count * sizeof(int32_t));
-	}
-
-	ret = dict_set_uint64 (local->xattr_req, GLUSTERFS_OPEN_FD_COUNT, 0);
-
-	for (i = 0; i < priv->child_count; i++) {
-		STACK_WIND_COOKIE (frame, afr_lookup_cbk, (void *) (long) i,
-				   priv->children[i],
-				   priv->children[i]->fops->lookup,
-				   loc, local->xattr_req);
-	}
-
-	ret = 0;
-out:
-	if (ret == -1)
-		AFR_STACK_UNWIND (frame, -1, ENOMEM, NULL, NULL, NULL);
-
-	return 0;
-}
-
-
-/* {{{ open */
-
-int
-afr_open_ftruncate_cbk (call_frame_t *frame, void *cookie, xlator_t *this, 
-			int32_t op_ret, int32_t op_errno, struct stat *buf)
+int32_t
+mem_acct_init(xlator_t *this)
 {
-	afr_local_t * local = frame->local;
-
-	AFR_STACK_UNWIND (frame, local->op_ret, local->op_errno,
-			  local->fd);
-	return 0;
-}
-
+    int ret = -1;
 
-int
-afr_open_cbk (call_frame_t *frame, void *cookie,
-	      xlator_t *this, int32_t op_ret, int32_t op_errno,
-	      fd_t *fd)
-{
-	afr_local_t *  local = NULL;
-	afr_private_t * priv = NULL;
-
-	int call_count = -1;
-	
-	priv  = this->private;
-	local = frame->local;
-
-	LOCK (&frame->lock);
-	{
-		if (op_ret == -1) {
-			local->op_errno = op_errno;
-		}
-
-		if (op_ret >= 0) {
-			local->op_ret = op_ret;
-		}
-	}
-	UNLOCK (&frame->lock);
-
-	call_count = afr_frame_return (frame);
-
-	if (call_count == 0) {
-		if ((local->cont.open.flags & O_TRUNC)
-		    && (local->op_ret >= 0)) {
-			STACK_WIND (frame, afr_open_ftruncate_cbk,
-				    this, this->fops->ftruncate,
-				    fd, 0);
-		} else {
-			AFR_STACK_UNWIND (frame, local->op_ret,
-					  local->op_errno, local->fd);
-		}
-	}
-
-	return 0;
-}
+    if (!this)
+        return ret;
 
+    ret = xlator_mem_acct_init(this, gf_afr_mt_end + 1);
 
-int
-afr_open (call_frame_t *frame, xlator_t *this,
-	  loc_t *loc, int32_t flags, fd_t *fd)
-{
-	afr_private_t * priv  = NULL;
-	afr_local_t *   local = NULL;
-	
-	int     i = 0;
-	int   ret = -1;
-
-	int32_t call_count = 0;	
-	int32_t op_ret   = -1;
-	int32_t op_errno = 0;
-	int32_t wind_flags = flags & (~O_TRUNC);
-
-	VALIDATE_OR_GOTO (frame, out);
-	VALIDATE_OR_GOTO (this, out);
-	VALIDATE_OR_GOTO (this->private, out);
-	VALIDATE_OR_GOTO (loc, out);
-	
-	priv = this->private;
-
-	ret = inode_ctx_get (loc->inode, this, NULL);
-	if (ret == 0) {
-		/* if ctx is set it means self-heal failed */
-
-		gf_log (this->name, GF_LOG_WARNING, 
-			"returning EIO, file has to be manually corrected "
-			"in backend");
-		op_errno = EIO;
-		goto out;
-	}
-
-	ALLOC_OR_GOTO (local, afr_local_t, out);
-	
-	ret = AFR_LOCAL_INIT (local, priv);
-	if (ret < 0) {
-		op_errno = -ret;
-		goto out;
-	}
-
-	frame->local = local;
-	call_count   = local->call_count;
-
-	local->cont.open.flags = flags;
-	local->fd = fd_ref (fd);
-
-	for (i = 0; i < priv->child_count; i++) {
-		if (local->child_up[i]) {
-			STACK_WIND_COOKIE (frame, afr_open_cbk, (void *) (long) i,
-					   priv->children[i],
-					   priv->children[i]->fops->open,
-					   loc, wind_flags, fd);
-			
-			if (!--call_count)
-				break;
-		}
-	}
-
-	op_ret = 0;
-out:
-	if (op_ret == -1) {
-		AFR_STACK_UNWIND (frame, op_ret, op_errno, fd);
-	}
+    if (ret != 0) {
+        return ret;
+    }
 
-	return 0;
+    return ret;
 }
 
-/* }}} */
-
-/* {{{ flush */
-
 int
-afr_flush_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this, 
-		      int32_t op_ret, int32_t op_errno)
+xlator_subvolume_index(xlator_t *this, xlator_t *subvol)
 {
-	afr_local_t *   local = NULL;
-
-	int call_count  = -1;
-
-	local = frame->local;
-
-	LOCK (&frame->lock);
-	{
-		if (op_ret == 0)
-			local->op_ret = 0;
-
-		local->op_errno = op_errno;
-	}
-	UNLOCK (&frame->lock);
-
-	call_count = afr_frame_return (frame);
-
-	if (call_count == 0) {
-		local->transaction.resume (frame, this);
-	}
-	
-	return 0;
+    int index = -1;
+    int i = 0;
+    xlator_list_t *list = NULL;
+
+    list = this->children;
+
+    while (list) {
+        if (subvol == list->xlator ||
+            strcmp(subvol->name, list->xlator->name) == 0) {
+            index = i;
+            break;
+        }
+        list = list->next;
+        i++;
+    }
+
+    return index;
 }
 
-
-int
-afr_flush_wind (call_frame_t *frame, xlator_t *this)
+static void
+fix_quorum_options(xlator_t *this, afr_private_t *priv, char *qtype,
+                   dict_t *options)
 {
-	afr_local_t *local = NULL;
-	afr_private_t *priv = NULL;
-	
-	int i = 0;
-	int call_count = -1;
-
-	local = frame->local;
-	priv = this->private;
-
-	call_count = afr_up_children_count (priv->child_count, local->child_up);
-
-	if (call_count == 0) {
-		local->transaction.resume (frame, this);
-		return 0;
-	}
-
-	local->call_count = call_count;
-
-	for (i = 0; i < priv->child_count; i++) {				
-		if (local->child_up[i]) {
-			STACK_WIND_COOKIE (frame, afr_flush_wind_cbk, 
-					   (void *) (long) i,	
-					   priv->children[i], 
-					   priv->children[i]->fops->flush,
-					   local->fd);
-		
-			if (!--call_count)
-				break;
-		}
-	}
-	
-	return 0;
+    if (dict_get_sizen(options, "quorum-type") == NULL) {
+        /* If the user has not configured quorum-type, enable auto-quorum
+         * when the replica has more than two subvolumes. */
+        if (priv->child_count > 2)
+            qtype = "auto";
+    }
+
+    if (priv->quorum_count && strcmp(qtype, "fixed")) {
+        gf_msg(this->name, GF_LOG_WARNING, 0, AFR_MSG_QUORUM_OVERRIDE,
+               "quorum-type %s overriding quorum-count %u", qtype,
+               priv->quorum_count);
+    }
+
+    if (!strcmp(qtype, "none")) {
+        priv->quorum_count = 0;
+    } else if (!strcmp(qtype, "auto")) {
+        priv->quorum_count = AFR_QUORUM_AUTO;
+    }
 }
 
-
 int
-afr_flush_done (call_frame_t *frame, xlator_t *this)
+afr_set_favorite_child_policy(afr_private_t *priv, char *policy)
 {
-	afr_local_t *local = NULL;
+    int index = -1;
 
-	local = frame->local;
+    index = gf_get_index_by_elem(afr_favorite_child_policies, policy);
+    if (index < 0 || index >= AFR_FAV_CHILD_POLICY_MAX)
+        return -1;
 
-	AFR_STACK_UNWIND (frame, local->op_ret, local->op_errno);
+    priv->fav_child_policy = index;
 
-	return 0;
+    return 0;
 }
 
-
-int
-afr_flush (call_frame_t *frame, xlator_t *this, fd_t *fd)
+static void
+set_data_self_heal_algorithm(afr_private_t *priv, char *algo)
 {
-	afr_private_t * priv  = NULL;
-	afr_local_t   * local = NULL;
-
-	int ret        = -1;
-
-	int op_ret   = -1;
-	int op_errno = 0;
-
-	VALIDATE_OR_GOTO (frame, out);
-	VALIDATE_OR_GOTO (this, out);
-	VALIDATE_OR_GOTO (this->private, out);
-
-	priv = this->private;
-
-	ALLOC_OR_GOTO (local, afr_local_t, out);
-
-	ret = AFR_LOCAL_INIT (local, priv);
-	if (ret < 0) {
-		op_errno = -ret;
-		goto out;
-	}
-
-	frame->local = local;
-
-        local->op = GF_FOP_FLUSH;
-        local->transaction.fop    = afr_flush_wind;
-        local->transaction.done   = afr_flush_done;
-
-        local->fd                 = fd_ref (fd);
-
-        local->transaction.start  = 0;
-        local->transaction.len    = 0;
-
-        local->transaction.pending = AFR_DATA_PENDING;
-
-        afr_transaction (frame, this, AFR_FLUSH_TRANSACTION);
-
-	op_ret = 0;
-out:
-	if (op_ret == -1) {
-		AFR_STACK_UNWIND (frame, op_ret, op_errno, NULL);
-	}
-
-	return 0;
+    if (!algo) {
+        priv->data_self_heal_algorithm = AFR_SELFHEAL_DATA_DYNAMIC;
+    } else if (strcmp(algo, "full") == 0) {
+        priv->data_self_heal_algorithm = AFR_SELFHEAL_DATA_FULL;
+    } else if (strcmp(algo, "diff") == 0) {
+        priv->data_self_heal_algorithm = AFR_SELFHEAL_DATA_DIFF;
+    } else {
+        priv->data_self_heal_algorithm = AFR_SELFHEAL_DATA_DYNAMIC;
+    }
 }
 
-/* }}} */
-
-/* {{{ fsync */
-
-int
-afr_fsync_cbk (call_frame_t *frame, void *cookie,
-	       xlator_t *this, int32_t op_ret, int32_t op_errno)
+void
+afr_handle_anon_inode_options(afr_private_t *priv, dict_t *options)
 {
-	afr_local_t *local = NULL;
-	
-	int call_count = -1;
-
-	local = frame->local;
-
-	LOCK (&frame->lock);
-	{
-		if (op_ret == 0)
-			local->op_ret = 0;
-
-		local->op_errno = op_errno;
-	}
-	UNLOCK (&frame->lock);
-
-	call_count = afr_frame_return (frame);
-
-	if (call_count == 0)
-		AFR_STACK_UNWIND (frame, local->op_ret, local->op_errno);
-
-	return 0;
+    char *volfile_id_str = NULL;
+    uuid_t anon_inode_gfid = {0};
+
+    /* If the volume-id option is not present, don't enable anything. */
+    if (dict_get_str(options, "volume-id", &volfile_id_str))
+        return;
+    GF_ASSERT(strlen(AFR_ANON_DIR_PREFIX) + strlen(volfile_id_str) <= NAME_MAX);
+    /* anon_inode_name must not change once it has been assigned. */
+    if (!priv->anon_inode_name[0]) {
+        snprintf(priv->anon_inode_name, sizeof(priv->anon_inode_name), "%s-%s",
+                 AFR_ANON_DIR_PREFIX, volfile_id_str);
+        gf_uuid_parse(volfile_id_str, anon_inode_gfid);
+        /* Flip a bit so the volume-id and the anon-inode gfid differ. */
+        anon_inode_gfid[0] ^= 1;
+        uuid_utoa_r(anon_inode_gfid, priv->anon_gfid_str);
+    }
 }
 
-
 int
-afr_fsync (call_frame_t *frame, xlator_t *this, fd_t *fd,
-	   int32_t datasync)
-{
-	afr_private_t *priv = NULL;
-	afr_local_t *local = NULL;
-
-	int ret = -1;
-
-	int i = 0;
-	int32_t call_count = 0;
-	int32_t op_ret   = -1;
-	int32_t op_errno = 0;
-
-	VALIDATE_OR_GOTO (frame, out);
-	VALIDATE_OR_GOTO (this, out);
-	VALIDATE_OR_GOTO (this->private, out);
-
-	priv = this->private;
-
-	ALLOC_OR_GOTO (local, afr_local_t, out);
-
-	ret = AFR_LOCAL_INIT (local, priv);
-	if (ret < 0) {
-		op_errno = -ret;
-		goto out;
-	}
-
-	call_count = local->call_count;
-	frame->local = local;
-
-	for (i = 0; i < priv->child_count; i++) {
-		if (local->child_up[i]) {
-			STACK_WIND (frame, afr_fsync_cbk,
-				    priv->children[i],
-				    priv->children[i]->fops->fsync,
-				    fd, datasync);
-			if (!--call_count)
-				break;
-		}
-	}
-
-	op_ret = 0;
-out:
-	if (op_ret == -1) {
-		AFR_STACK_UNWIND (frame, op_ret, op_errno);
-	}
-	return 0;
-}
-
-/* }}} */
-
-/* {{{ fsync */
-
-int32_t
-afr_fsyncdir_cbk (call_frame_t *frame, void *cookie,
-		  xlator_t *this, int32_t op_ret, int32_t op_errno)
-{
-	afr_local_t *local = NULL;
-	
-	int call_count = -1;
-
-	local = frame->local;
-
-	LOCK (&frame->lock);
-	{
-		if (op_ret == 0)
-			local->op_ret = 0;
-
-		local->op_errno = op_errno;
-	}
-	UNLOCK (&frame->lock);
-
-	call_count = afr_frame_return (frame);
-
-	if (call_count == 0)
-		AFR_STACK_UNWIND (frame, local->op_ret, local->op_errno);
-
-	return 0;
-}
-
-
-int32_t
-afr_fsyncdir (call_frame_t *frame, xlator_t *this, fd_t *fd,
-	      int32_t datasync)
-{
-	afr_private_t *priv = NULL;
-	afr_local_t *local = NULL;
-
-	int ret = -1;
-
-	int i = 0;
-	int32_t call_count = 0;
-	int32_t op_ret   = -1;
-	int32_t op_errno = 0;
-
-	VALIDATE_OR_GOTO (frame, out);
-	VALIDATE_OR_GOTO (this, out);
-	VALIDATE_OR_GOTO (this->private, out);
-
-	priv = this->private;
-
-	ALLOC_OR_GOTO (local, afr_local_t, out);
-
-	ret = AFR_LOCAL_INIT (local, priv);
-	if (ret < 0) {
-		op_errno = -ret;
-		goto out;
-	}
-
-	call_count = local->call_count;
-	frame->local = local;
-
-	for (i = 0; i < priv->child_count; i++) {
-		if (local->child_up[i]) {
-			STACK_WIND (frame, afr_fsync_cbk,
-				    priv->children[i],
-				    priv->children[i]->fops->fsyncdir,
-				    fd, datasync);
-			if (!--call_count)
-				break;
-		}
-	}
-
-	op_ret = 0;
-out:
-	if (op_ret == -1) {
-		AFR_STACK_UNWIND (frame, op_ret, op_errno);
-	}
-	return 0;
-}
-
-/* }}} */
-
-/* {{{ xattrop */
-
-int32_t
-afr_xattrop_cbk (call_frame_t *frame, void *cookie,
-		 xlator_t *this, int32_t op_ret, int32_t op_errno,
-		 dict_t *xattr)
+reconfigure(xlator_t *this, dict_t *options)
 {
-	afr_local_t *local = NULL;
-	
-	int call_count = -1;
-
-	local = frame->local;
-
-	LOCK (&frame->lock);
-	{
-		if (op_ret == 0)
-			local->op_ret = 0;
-
-		local->op_errno = op_errno;
-	}
-	UNLOCK (&frame->lock);
-
-	call_count = afr_frame_return (frame);
-
-	if (call_count == 0)
-		AFR_STACK_UNWIND (frame, local->op_ret, local->op_errno, xattr);
-
-	return 0;
-}
-
-
-int32_t
-afr_xattrop (call_frame_t *frame, xlator_t *this, loc_t *loc,
-	     gf_xattrop_flags_t optype, dict_t *xattr)
-{
-	afr_private_t *priv = NULL;
-	afr_local_t *local  = NULL;
-
-	int ret = -1;
-
-	int i = 0;
-	int32_t call_count = 0;
-	int32_t op_ret   = -1;
-	int32_t op_errno = 0;
-
-	VALIDATE_OR_GOTO (frame, out);
-	VALIDATE_OR_GOTO (this, out);
-	VALIDATE_OR_GOTO (this->private, out);
-
-	priv = this->private;
-
-	ALLOC_OR_GOTO (local, afr_local_t, out);
-
-	ret = AFR_LOCAL_INIT (local, priv);
-	if (ret < 0) {
-		op_errno = -ret;
-		goto out;
-	}
-
-	call_count = local->call_count;
-	frame->local = local;
-
-	for (i = 0; i < priv->child_count; i++) {
-		if (local->child_up[i]) {
-			STACK_WIND (frame, afr_xattrop_cbk,
-				    priv->children[i],
-				    priv->children[i]->fops->xattrop,
-				    loc, optype, xattr);
-			if (!--call_count)
-				break;
-		}
-	}
-
-	op_ret = 0;
-out:
-	if (op_ret == -1) {
-		AFR_STACK_UNWIND (frame, op_ret, op_errno);
-	}
-	return 0;
-}
-
-/* }}} */
-
-/* {{{ fxattrop */
-
-int32_t
-afr_fxattrop_cbk (call_frame_t *frame, void *cookie,
-		  xlator_t *this, int32_t op_ret, int32_t op_errno,
-		  dict_t *xattr)
-{
-	afr_local_t *local = NULL;
-	
-	int call_count = -1;
-
-	local = frame->local;
-
-	LOCK (&frame->lock);
-	{
-		if (op_ret == 0)
-			local->op_ret = 0;
-
-		local->op_errno = op_errno;
-	}
-	UNLOCK (&frame->lock);
-
-	call_count = afr_frame_return (frame);
-
-	if (call_count == 0)
-		AFR_STACK_UNWIND (frame, local->op_ret, local->op_errno, xattr);
-
-	return 0;
-}
-
-
-int32_t
-afr_fxattrop (call_frame_t *frame, xlator_t *this, fd_t *fd,
-	      gf_xattrop_flags_t optype, dict_t *xattr)
-{
-	afr_private_t *priv = NULL;
-	afr_local_t *local  = NULL;
-
-	int ret = -1;
-
-	int i = 0;
-	int32_t call_count = 0;
-	int32_t op_ret   = -1;
-	int32_t op_errno = 0;
-
-	VALIDATE_OR_GOTO (frame, out);
-	VALIDATE_OR_GOTO (this, out);
-	VALIDATE_OR_GOTO (this->private, out);
-
-	priv = this->private;
-
-	ALLOC_OR_GOTO (local, afr_local_t, out);
+    afr_private_t *priv = NULL;
+    xlator_t *read_subvol = NULL;
+    int read_subvol_index = -1;
+    int timeout_old = 0;
+    int ret = -1;
+    int index = -1;
+    char *qtype = NULL;
+    char *fav_child_policy = NULL;
+    char *data_self_heal = NULL;
+    char *data_self_heal_algorithm = NULL;
+    char *locking_scheme = NULL;
+    gf_boolean_t consistent_io = _gf_false;
+    gf_boolean_t choose_local_old = _gf_false;
+    gf_boolean_t enabled_old = _gf_false;
+
+    priv = this->private;
+
+    GF_OPTION_RECONF("metadata-splitbrain-forced-heal",
+                     priv->metadata_splitbrain_forced_heal, options, bool, out);
+
+    GF_OPTION_RECONF("background-self-heal-count",
+                     priv->background_self_heal_count, options, uint32, out);
+
+    GF_OPTION_RECONF("heal-wait-queue-length", priv->heal_wait_qlen, options,
+                     uint32, out);
+
+    GF_OPTION_RECONF("metadata-self-heal", priv->metadata_self_heal, options,
+                     bool, out);
+
+    GF_OPTION_RECONF("data-self-heal", data_self_heal, options, str, out);
+    if (gf_string2boolean(data_self_heal, &priv->data_self_heal) == -1)
+        goto out;
+
+    GF_OPTION_RECONF("entry-self-heal", priv->entry_self_heal, options, bool,
+                     out);
+
+    GF_OPTION_RECONF("data-self-heal-window-size",
+                     priv->data_self_heal_window_size, options, uint32, out);
+
+    GF_OPTION_RECONF("data-self-heal-algorithm", data_self_heal_algorithm,
+                     options, str, out);
+    set_data_self_heal_algorithm(priv, data_self_heal_algorithm);
+
+    GF_OPTION_RECONF("halo-enabled", priv->halo_enabled, options, bool, out);
+
+    GF_OPTION_RECONF("halo-shd-max-latency", priv->shd.halo_max_latency_msec,
+                     options, uint32, out);
+
+    GF_OPTION_RECONF("halo-nfsd-max-latency", priv->nfsd.halo_max_latency_msec,
+                     options, uint32, out);
+
+    GF_OPTION_RECONF("halo-max-latency", priv->halo_max_latency_msec, options,
+                     uint32, out);
+
+    GF_OPTION_RECONF("halo-max-replicas", priv->halo_max_replicas, options,
+                     uint32, out);
+
+    GF_OPTION_RECONF("halo-min-replicas", priv->halo_min_replicas, options,
+                     uint32, out);
+
+    GF_OPTION_RECONF("read-subvolume", read_subvol, options, xlator, out);
+
+    choose_local_old = priv->choose_local;
+    GF_OPTION_RECONF("choose-local", priv->choose_local, options, bool, out);
+
+    if (choose_local_old != priv->choose_local) {
+        priv->read_child = -1;
+        if (choose_local_old == _gf_false)
+            priv->did_discovery = _gf_false;
+    }
+
+    GF_OPTION_RECONF("read-hash-mode", priv->hash_mode, options, uint32, out);
+
+    if (read_subvol) {
+        index = xlator_subvolume_index(this, read_subvol);
+        if (index == -1) {
+            gf_msg(this->name, GF_LOG_ERROR, 0, AFR_MSG_INVALID_SUBVOL,
+                   "%s not a subvolume", read_subvol->name);
+            goto out;
+        }
+        priv->read_child = index;
+    }
+
+    GF_OPTION_RECONF("read-subvolume-index", read_subvol_index, options, int32,
+                     out);
+
+    if (read_subvol_index > -1) {
+        index = read_subvol_index;
+        if (index >= priv->child_count) {
+            gf_msg(this->name, GF_LOG_ERROR, 0, AFR_MSG_INVALID_SUBVOL,
+                   "%d not a subvolume-index", index);
+            goto out;
+        }
+        priv->read_child = index;
+    }
+
+    GF_OPTION_RECONF("pre-op-compat", priv->pre_op_compat, options, bool, out);
+    GF_OPTION_RECONF("locking-scheme", locking_scheme, options, str, out);
+    priv->granular_locks = (strcmp(locking_scheme, "granular") == 0);
+    GF_OPTION_RECONF("full-lock", priv->full_lock, options, bool, out);
+    GF_OPTION_RECONF("granular-entry-heal", priv->esh_granular, options, bool,
+                     out);
+
+    GF_OPTION_RECONF("eager-lock", priv->eager_lock, options, bool, out);
+    GF_OPTION_RECONF("optimistic-change-log", priv->optimistic_change_log,
+                     options, bool, out);
+    GF_OPTION_RECONF("quorum-type", qtype, options, str, out);
+    GF_OPTION_RECONF("quorum-count", priv->quorum_count, options, uint32, out);
+    fix_quorum_options(this, priv, qtype, options);
+    if (priv->quorum_count && !afr_has_quorum(priv->child_up, this, NULL))
+        gf_msg(this->name, GF_LOG_WARNING, 0, AFR_MSG_QUORUM_FAIL,
+               "Client-quorum is not met");
 
-	ret = AFR_LOCAL_INIT (local, priv);
-	if (ret < 0) {
-		op_errno = -ret;
-		goto out;
-	}
+    GF_OPTION_RECONF("post-op-delay-secs", priv->post_op_delay_secs, options,
+                     uint32, out);
 
-	call_count = local->call_count;
-	frame->local = local;
+    GF_OPTION_RECONF(AFR_SH_READDIR_SIZE_KEY, priv->sh_readdir_size, options,
+                     size_uint64, out);
+    /* did_discovery is reset below so we re-discover on topology change. */
+    GF_OPTION_RECONF("ensure-durability", priv->ensure_durability, options,
+                     bool, out);
 
-	for (i = 0; i < priv->child_count; i++) {
-		if (local->child_up[i]) {
-			STACK_WIND (frame, afr_fxattrop_cbk,
-				    priv->children[i],
-				    priv->children[i]->fops->fxattrop,
-				    fd, optype, xattr);
-			if (!--call_count)
-				break;
-		}
-	}
+    enabled_old = priv->shd.enabled;
+    GF_OPTION_RECONF("self-heal-daemon", priv->shd.enabled, options, bool, out);
 
-	op_ret = 0;
+    GF_OPTION_RECONF("iam-self-heal-daemon", priv->shd.iamshd, options, bool,
+                     out);
+
+    timeout_old = priv->shd.timeout;
+    GF_OPTION_RECONF("heal-timeout", priv->shd.timeout, options, int32, out);
+
+    GF_OPTION_RECONF("consistent-metadata", priv->consistent_metadata, options,
+                     bool, out);
+
+    GF_OPTION_RECONF("shd-max-threads", priv->shd.max_threads, options, uint32,
+                     out);
+
+    GF_OPTION_RECONF("shd-wait-qlength", priv->shd.wait_qlength, options,
+                     uint32, out);
+
+    GF_OPTION_RECONF("favorite-child-policy", fav_child_policy, options, str,
+                     out);
+    if (afr_set_favorite_child_policy(priv, fav_child_policy) == -1)
+        goto out;
+
+    priv->did_discovery = _gf_false;
+
+    GF_OPTION_RECONF("consistent-io", consistent_io, options, bool, out);
+    if (priv->quorum_count != 0)
+        consistent_io = _gf_false;
+    priv->consistent_io = consistent_io;
+
+    afr_handle_anon_inode_options(priv, options);
+
+    GF_OPTION_RECONF("use-anonymous-inode", priv->use_anon_inode, options, bool,
+                     out);
+    if (priv->shd.enabled) {
+        if ((priv->shd.enabled != enabled_old) ||
+            (timeout_old != priv->shd.timeout))
+            afr_selfheal_childup(this, priv);
+    }
+
+    ret = 0;
 out:
-	if (op_ret == -1) {
-		AFR_STACK_UNWIND (frame, op_ret, op_errno);
-	}
-	return 0;
+    return ret;
 }
 
-/* }}} */
-
-
-int32_t
-afr_inodelk_cbk (call_frame_t *frame, void *cookie,
-		 xlator_t *this, int32_t op_ret, int32_t op_errno)
-		
-{
-	afr_local_t *local = NULL;
-	
-	int call_count = -1;
-
-	local = frame->local;
-
-	LOCK (&frame->lock);
-	{
-		if (op_ret == 0)
-			local->op_ret = 0;
-
-		local->op_errno = op_errno;
-	}
-	UNLOCK (&frame->lock);
-
-	call_count = afr_frame_return (frame);
-
-	if (call_count == 0)
-		AFR_STACK_UNWIND (frame, local->op_ret, local->op_errno);
-
-	return 0;
-}
-
-
-int32_t
-afr_inodelk (call_frame_t *frame, xlator_t *this, loc_t *loc,
-	     int32_t cmd, struct flock *flock)
-{
-	afr_private_t *priv = NULL;
-	afr_local_t *local  = NULL;
-
-	int ret = -1;
-
-	int i = 0;
-	int32_t call_count = 0;
-	int32_t op_ret   = -1;
-	int32_t op_errno = 0;
-
-	VALIDATE_OR_GOTO (frame, out);
-	VALIDATE_OR_GOTO (this, out);
-	VALIDATE_OR_GOTO (this->private, out);
-
-	priv = this->private;
-
-	ALLOC_OR_GOTO (local, afr_local_t, out);
-
-	ret = AFR_LOCAL_INIT (local, priv);
-	if (ret < 0) {
-		op_errno = -ret;
-		goto out;
-	}
-
-	call_count = local->call_count;
-	frame->local = local;
-
-	for (i = 0; i < priv->child_count; i++) {
-		if (local->child_up[i]) {
-			STACK_WIND (frame, afr_inodelk_cbk,
-				    priv->children[i],
-				    priv->children[i]->fops->inodelk,
-				    loc, cmd, flock);
-
-			if (!--call_count)
-				break;
-		}
-	}
-
-	op_ret = 0;
-out:
-	if (op_ret == -1) {
-		AFR_STACK_UNWIND (frame, op_ret, op_errno);
-	}
-	return 0;
-}
-
-
-int32_t
-afr_finodelk_cbk (call_frame_t *frame, void *cookie,
-		  xlator_t *this, int32_t op_ret, int32_t op_errno)
-		
-{
-	afr_local_t *local = NULL;
-	
-	int call_count = -1;
-
-	local = frame->local;
-
-	LOCK (&frame->lock);
-	{
-		if (op_ret == 0)
-			local->op_ret = 0;
-
-		local->op_errno = op_errno;
-	}
-	UNLOCK (&frame->lock);
-
-	call_count = afr_frame_return (frame);
-
-	if (call_count == 0)
-		AFR_STACK_UNWIND (frame, local->op_ret, local->op_errno);
-
-	return 0;
-}
-
-
-int32_t
-afr_finodelk (call_frame_t *frame, xlator_t *this, fd_t *fd,
-	      int32_t cmd, struct flock *flock)
-{
-	afr_private_t *priv = NULL;
-	afr_local_t *local  = NULL;
-
-	int ret = -1;
-
-	int i = 0;
-	int32_t call_count = 0;
-	int32_t op_ret   = -1;
-	int32_t op_errno = 0;
-
-	VALIDATE_OR_GOTO (frame, out);
-	VALIDATE_OR_GOTO (this, out);
-	VALIDATE_OR_GOTO (this->private, out);
-
-	priv = this->private;
-
-	ALLOC_OR_GOTO (local, afr_local_t, out);
-
-	ret = AFR_LOCAL_INIT (local, priv);
-	if (ret < 0) {
-		op_errno = -ret;
-		goto out;
-	}
-
-	call_count = local->call_count;
-	frame->local = local;
-
-	for (i = 0; i < priv->child_count; i++) {
-		if (local->child_up[i]) {
-			STACK_WIND (frame, afr_finodelk_cbk,
-				    priv->children[i],
-				    priv->children[i]->fops->finodelk,
-				    fd, cmd, flock);
-
-			if (!--call_count)
-				break;
-		}
-	}
-
-	op_ret = 0;
-out:
-	if (op_ret == -1) {
-		AFR_STACK_UNWIND (frame, op_ret, op_errno);
-	}
-	return 0;
-}
-
-
-int32_t
-afr_entrylk_cbk (call_frame_t *frame, void *cookie,
-		 xlator_t *this, int32_t op_ret, int32_t op_errno)
-		
-{
-	afr_local_t *local = NULL;
-	
-	int call_count = -1;
-
-	local = frame->local;
-
-	LOCK (&frame->lock);
-	{
-		if (op_ret == 0)
-			local->op_ret = 0;
-
-		local->op_errno = op_errno;
-	}
-	UNLOCK (&frame->lock);
-
-	call_count = afr_frame_return (frame);
-
-	if (call_count == 0)
-		AFR_STACK_UNWIND (frame, local->op_ret, local->op_errno);
-
-	return 0;
-}
-
-
-int32_t
-afr_entrylk (call_frame_t *frame, xlator_t *this, loc_t *loc,
-	     const char *basename, entrylk_cmd cmd, entrylk_type type)
-{
-	afr_private_t *priv = NULL;
-	afr_local_t *local  = NULL;
-
-	int ret = -1;
-
-	int i = 0;
-	int32_t call_count = 0;
-	int32_t op_ret   = -1;
-	int32_t op_errno = 0;
-
-	VALIDATE_OR_GOTO (frame, out);
-	VALIDATE_OR_GOTO (this, out);
-	VALIDATE_OR_GOTO (this->private, out);
-
-	priv = this->private;
-
-	ALLOC_OR_GOTO (local, afr_local_t, out);
-
-	ret = AFR_LOCAL_INIT (local, priv);
-	if (ret < 0) {
-		op_errno = -ret;
-		goto out;
-	}
-
-	call_count = local->call_count;
-	frame->local = local;
-
-	for (i = 0; i < priv->child_count; i++) {
-		if (local->child_up[i]) {
-			STACK_WIND (frame, afr_entrylk_cbk,
-				    priv->children[i],
-				    priv->children[i]->fops->entrylk,
-				    loc, basename, cmd, type);
-
-			if (!--call_count)
-				break;
-		}
-	}
-
-	op_ret = 0;
-out:
-	if (op_ret == -1) {
-		AFR_STACK_UNWIND (frame, op_ret, op_errno);
-	}
-	return 0;
-}
-
-
-
-int32_t
-afr_fentrylk_cbk (call_frame_t *frame, void *cookie,
-		 xlator_t *this, int32_t op_ret, int32_t op_errno)
-		
-{
-	afr_local_t *local = NULL;
-	
-	int call_count = -1;
-
-	local = frame->local;
-
-	LOCK (&frame->lock);
-	{
-		if (op_ret == 0)
-			local->op_ret = 0;
-
-		local->op_errno = op_errno;
-	}
-	UNLOCK (&frame->lock);
-
-	call_count = afr_frame_return (frame);
-
-	if (call_count == 0)
-		AFR_STACK_UNWIND (frame, local->op_ret, local->op_errno);
-
-	return 0;
-}
-
-
-int32_t
-afr_fentrylk (call_frame_t *frame, xlator_t *this, fd_t *fd,
-	      const char *basename, entrylk_cmd cmd, entrylk_type type)
+static int
+afr_pending_xattrs_init(afr_private_t *priv, xlator_t *this)
 {
-	afr_private_t *priv = NULL;
-	afr_local_t *local  = NULL;
-
-	int ret = -1;
-
-	int i = 0;
-	int32_t call_count = 0;
-	int32_t op_ret   = -1;
-	int32_t op_errno = 0;
-
-	VALIDATE_OR_GOTO (frame, out);
-	VALIDATE_OR_GOTO (this, out);
-	VALIDATE_OR_GOTO (this->private, out);
+    int ret = -1;
+    int i = 0;
+    char *ptr = NULL;
+    char *ptr1 = NULL;
+    char *xattrs_list = NULL;
+    xlator_list_t *trav = NULL;
+    int child_count = -1;
+
+    trav = this->children;
+    child_count = priv->child_count;
+    if (priv->thin_arbiter_count) {
+        /* priv->pending_key[THIN_ARBITER_BRICK_INDEX] is used as the
+         * name of the thin arbiter file for persistence across add/
+         * removal of DHT subvols.*/
+        child_count++;
+    }
+
+    GF_OPTION_INIT("afr-pending-xattr", xattrs_list, str, out);
+    priv->pending_key = GF_CALLOC(sizeof(*priv->pending_key), child_count,
+                                  gf_afr_mt_char);
+    if (!priv->pending_key) {
+        ret = -ENOMEM;
+        goto out;
+    }
+    if (!xattrs_list) {
+        gf_msg(this->name, GF_LOG_WARNING, 0, AFR_MSG_NO_CHANGELOG,
+               "Unable to fetch afr-pending-xattr option from volfile."
+               " Falling back to using client translator names. ");
+
+        while (i < child_count) {
+            ret = gf_asprintf(&priv->pending_key[i], "%s.%s", AFR_XATTR_PREFIX,
+                              trav->xlator->name);
+            if (ret == -1) {
+                ret = -ENOMEM;
+                goto out;
+            }
+            trav = trav->next;
+            i++;
+        }
+        ret = 0;
+        goto out;
+    }
+
+    ptr = ptr1 = gf_strdup(xattrs_list);
+    if (!ptr) {
+        ret = -ENOMEM;
+        goto out;
+    }
+    for (i = 0, ptr = strtok(ptr, ","); ptr; ptr = strtok(NULL, ",")) {
+        ret = gf_asprintf(&priv->pending_key[i], "%s.%s", AFR_XATTR_PREFIX,
+                          ptr);
+        if (ret == -1) {
+            ret = -ENOMEM;
+            goto out;
+        }
+        i++;
+    }
+    ret = 0;
 
-	priv = this->private;
-
-	ALLOC_OR_GOTO (local, afr_local_t, out);
-
-	ret = AFR_LOCAL_INIT (local, priv);
-	if (ret < 0) {
-		op_errno = -ret;
-		goto out;
-	}
-
-	call_count = local->call_count;
-	frame->local = local;
-
-	for (i = 0; i < priv->child_count; i++) {
-		if (local->child_up[i]) {
-			STACK_WIND (frame, afr_fentrylk_cbk,
-				    priv->children[i],
-				    priv->children[i]->fops->fentrylk,
-				    fd, basename, cmd, type);
-
-			if (!--call_count)
-				break;
-		}
-	}
-
-	op_ret = 0;
 out:
-	if (op_ret == -1) {
-		AFR_STACK_UNWIND (frame, op_ret, op_errno);
-	}
-	return 0;
+    GF_FREE(ptr1);
+    return ret;
 }
 
-
-int32_t
-afr_checksum_cbk (call_frame_t *frame, void *cookie,
-		  xlator_t *this, int32_t op_ret, int32_t op_errno,
-		  uint8_t *file_checksum, uint8_t *dir_checksum)
-		
-{
-	afr_local_t *local = NULL;
-	
-	int call_count = -1;
-
-	local = frame->local;
-
-	LOCK (&frame->lock);
-	{
-		if (op_ret == 0 && (local->op_ret != 0)) {
-			local->op_ret = 0;
-
-			local->cont.checksum.file_checksum = MALLOC (ZR_FILENAME_MAX);
-			memcpy (local->cont.checksum.file_checksum, file_checksum, 
-				ZR_FILENAME_MAX);
-
-			local->cont.checksum.dir_checksum = MALLOC (ZR_FILENAME_MAX);
-			memcpy (local->cont.checksum.dir_checksum, dir_checksum, 
-				ZR_FILENAME_MAX);
-
-		}
-
-		local->op_errno = op_errno;
-	}
-	UNLOCK (&frame->lock);
-
-	call_count = afr_frame_return (frame);
-
-	if (call_count == 0)
-		AFR_STACK_UNWIND (frame, local->op_ret, local->op_errno,
-				  local->cont.checksum.file_checksum, 
-				  local->cont.checksum.dir_checksum);
-
-	return 0;
-}
-
-
-int32_t
-afr_checksum (call_frame_t *frame, xlator_t *this, loc_t *loc,
-	      int32_t flag)
+void
+afr_ta_init(afr_private_t *priv)
 {
-	afr_private_t *priv = NULL;
-	afr_local_t *local  = NULL;
-
-	int ret = -1;
-
-	int i = 0;
-	int32_t call_count = 0;
-	int32_t op_ret   = -1;
-	int32_t op_errno = 0;
-
-	VALIDATE_OR_GOTO (frame, out);
-	VALIDATE_OR_GOTO (this, out);
-	VALIDATE_OR_GOTO (this->private, out);
-
-	priv = this->private;
-
-	ALLOC_OR_GOTO (local, afr_local_t, out);
-
-	ret = AFR_LOCAL_INIT (local, priv);
-	if (ret < 0) {
-		op_errno = -ret;
-		goto out;
-	}
-
-	call_count = local->call_count;
-	frame->local = local;
-
-	for (i = 0; i < priv->child_count; i++) {
-		if (local->child_up[i]) {
-			STACK_WIND (frame, afr_checksum_cbk,
-				    priv->children[i],
-				    priv->children[i]->fops->checksum,
-				    loc, flag);
-
-			if (!--call_count)
-				break;
-		}
-	}
-
-	op_ret = 0;
-out:
-	if (op_ret == -1) {
-		AFR_STACK_UNWIND (frame, op_ret, op_errno);
-	}
-	return 0;
+    priv->thin_arbiter_count = 1;
+    priv->child_count--;
+    priv->ta_child_up = 0;
+    priv->ta_bad_child_index = AFR_CHILD_UNKNOWN;
+    priv->ta_notify_dom_lock_offset = 0;
+    priv->ta_in_mem_txn_count = 0;
+    priv->ta_on_wire_txn_count = 0;
+    priv->release_ta_notify_dom_lock = _gf_false;
+    INIT_LIST_HEAD(&priv->ta_waitq);
+    INIT_LIST_HEAD(&priv->ta_onwireq);
+    gf_uuid_clear(priv->ta_gfid);
 }
 
-
 int32_t
-afr_statfs_cbk (call_frame_t *frame, void *cookie,
-		xlator_t *this, int32_t op_ret, int32_t op_errno,
-		struct statvfs *statvfs)
+init(xlator_t *this)
 {
-	afr_local_t *local = NULL;
-
-	int call_count = 0;
+    afr_private_t *priv = NULL;
+    int child_count = 0;
+    xlator_list_t *trav = NULL;
+    int i = 0;
+    int ret = -1;
+    GF_UNUSED int op_errno = 0;
+    xlator_t *read_subvol = NULL;
+    int read_subvol_index = -1;
+    char *qtype = NULL;
+    char *fav_child_policy = NULL;
+    char *thin_arbiter = NULL;
+    char *data_self_heal = NULL;
+    char *locking_scheme = NULL;
+    char *data_self_heal_algorithm = NULL;
+
+    if (!this->children) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, AFR_MSG_CHILD_MISCONFIGURED,
+               "replicate translator needs more than one "
+               "subvolume defined.");
+        return -1;
+    }
+
+    if (!this->parents) {
+        gf_msg(this->name, GF_LOG_WARNING, 0, AFR_MSG_VOL_MISCONFIGURED,
+               "Volume is dangling.");
+    }
+
+    this->private = GF_CALLOC(1, sizeof(afr_private_t),
+                              gf_afr_mt_afr_private_t);
+    if (!this->private)
+        goto out;
+
+    priv = this->private;
+    INIT_LIST_HEAD(&priv->saved_locks);
+    INIT_LIST_HEAD(&priv->lk_healq);
+    LOCK_INIT(&priv->lock);
+
+    child_count = xlator_subvolume_count(this);
+
+    priv->child_count = child_count;
+
+    priv->read_child = -1;
+
+    GF_OPTION_INIT("arbiter-count", priv->arbiter_count, uint32, out);
+    GF_OPTION_INIT("thin-arbiter", thin_arbiter, str, out);
+    if (thin_arbiter && strlen(thin_arbiter) > 0) {
+        afr_ta_init(priv);
+    }
+    INIT_LIST_HEAD(&priv->healing);
+    INIT_LIST_HEAD(&priv->heal_waiting);
+
+    priv->spb_choice_timeout = AFR_DEFAULT_SPB_CHOICE_TIMEOUT;
+
+    GF_OPTION_INIT("afr-dirty-xattr", priv->afr_dirty, str, out);
+
+    GF_OPTION_INIT("metadata-splitbrain-forced-heal",
+                   priv->metadata_splitbrain_forced_heal, bool, out);
+
+    GF_OPTION_INIT("read-subvolume", read_subvol, xlator, out);
+    if (read_subvol) {
+        priv->read_child = xlator_subvolume_index(this, read_subvol);
+        if (priv->read_child == -1) {
+            gf_msg(this->name, GF_LOG_ERROR, 0, AFR_MSG_INVALID_SUBVOL,
+                   "%s not a subvolume", read_subvol->name);
+            goto out;
+        }
+    }
+    GF_OPTION_INIT("read-subvolume-index", read_subvol_index, int32, out);
+    if (read_subvol_index > -1) {
+        if (read_subvol_index >= priv->child_count) {
+            gf_msg(this->name, GF_LOG_ERROR, 0, AFR_MSG_INVALID_SUBVOL,
+                   "%d not a subvolume-index", read_subvol_index);
+            goto out;
+        }
+        priv->read_child = read_subvol_index;
+    }
+    GF_OPTION_INIT("choose-local", priv->choose_local, bool, out);
+
+    priv->pending_reads = GF_CALLOC(sizeof(*priv->pending_reads),
+                                    priv->child_count, gf_afr_mt_atomic_t);
+
+    GF_OPTION_INIT("read-hash-mode", priv->hash_mode, uint32, out);
+
+    priv->favorite_child = -1;
+
+    GF_OPTION_INIT("favorite-child-policy", fav_child_policy, str, out);
+    if (afr_set_favorite_child_policy(priv, fav_child_policy) == -1)
+        goto out;
+
+    GF_OPTION_INIT("shd-max-threads", priv->shd.max_threads, uint32, out);
+
+    GF_OPTION_INIT("shd-wait-qlength", priv->shd.wait_qlength, uint32, out);
+
+    GF_OPTION_INIT("background-self-heal-count",
+                   priv->background_self_heal_count, uint32, out);
+
+    GF_OPTION_INIT("heal-wait-queue-length", priv->heal_wait_qlen, uint32, out);
+
+    GF_OPTION_INIT("data-self-heal", data_self_heal, str, out);
+    if (gf_string2boolean(data_self_heal, &priv->data_self_heal) == -1)
+        goto out;
+
+    GF_OPTION_INIT("data-self-heal-algorithm", data_self_heal_algorithm, str,
+                   out);
+    set_data_self_heal_algorithm(priv, data_self_heal_algorithm);
+
+    GF_OPTION_INIT("data-self-heal-window-size",
+                   priv->data_self_heal_window_size, uint32, out);
+
+    GF_OPTION_INIT("metadata-self-heal", priv->metadata_self_heal, bool, out);
+
+    GF_OPTION_INIT("entry-self-heal", priv->entry_self_heal, bool, out);
+
+    GF_OPTION_INIT("halo-shd-max-latency", priv->shd.halo_max_latency_msec,
+                   uint32, out);
+
+    GF_OPTION_INIT("halo-max-latency", priv->halo_max_latency_msec, uint32,
+                   out);
+    GF_OPTION_INIT("halo-max-replicas", priv->halo_max_replicas, uint32, out);
+    GF_OPTION_INIT("halo-min-replicas", priv->halo_min_replicas, uint32, out);
+
+    GF_OPTION_INIT("halo-enabled", priv->halo_enabled, bool, out);
+
+    GF_OPTION_INIT("halo-nfsd-max-latency", priv->nfsd.halo_max_latency_msec,
+                   uint32, out);
+
+    GF_OPTION_INIT("iam-nfs-daemon", priv->nfsd.iamnfsd, bool, out);
+
+    GF_OPTION_INIT("optimistic-change-log", priv->optimistic_change_log, bool,
+                   out);
+
+    GF_OPTION_INIT("pre-op-compat", priv->pre_op_compat, bool, out);
+    GF_OPTION_INIT("locking-scheme", locking_scheme, str, out);
+    priv->granular_locks = (strcmp(locking_scheme, "granular") == 0);
+    GF_OPTION_INIT("full-lock", priv->full_lock, bool, out);
+    GF_OPTION_INIT("granular-entry-heal", priv->esh_granular, bool, out);
+
+    GF_OPTION_INIT("eager-lock", priv->eager_lock, bool, out);
+    GF_OPTION_INIT("quorum-type", qtype, str, out);
+    GF_OPTION_INIT("quorum-count", priv->quorum_count, uint32, out);
+    GF_OPTION_INIT(AFR_SH_READDIR_SIZE_KEY, priv->sh_readdir_size, size_uint64,
+                   out);
+    fix_quorum_options(this, priv, qtype, this->options);
+
+    GF_OPTION_INIT("post-op-delay-secs", priv->post_op_delay_secs, uint32, out);
+    GF_OPTION_INIT("ensure-durability", priv->ensure_durability, bool, out);
+
+    GF_OPTION_INIT("self-heal-daemon", priv->shd.enabled, bool, out);
+
+    GF_OPTION_INIT("iam-self-heal-daemon", priv->shd.iamshd, bool, out);
+    GF_OPTION_INIT("heal-timeout", priv->shd.timeout, int32, out);
+
+    GF_OPTION_INIT("consistent-metadata", priv->consistent_metadata, bool, out);
+    GF_OPTION_INIT("consistent-io", priv->consistent_io, bool, out);
+    afr_handle_anon_inode_options(priv, this->options);
+
+    GF_OPTION_INIT("use-anonymous-inode", priv->use_anon_inode, bool, out);
+    if (priv->quorum_count != 0)
+        priv->consistent_io = _gf_false;
+
+    priv->wait_count = 1;
+
+    priv->local = GF_CALLOC(sizeof(unsigned char), child_count, gf_afr_mt_char);
+    if (!priv->local) {
+        ret = -ENOMEM;
+        goto out;
+    }
+
+    priv->anon_inode = GF_CALLOC(sizeof(unsigned char), child_count,
+                                 gf_afr_mt_char);
+
+    priv->child_up = GF_CALLOC(sizeof(unsigned char), child_count,
+                               gf_afr_mt_char);
+
+    priv->child_latency = GF_MALLOC(sizeof(*priv->child_latency) * child_count,
+                                    gf_afr_mt_child_latency_t);
+    priv->halo_child_up = GF_CALLOC(sizeof(unsigned char), child_count,
+                                    gf_afr_mt_char);
 
-	LOCK (&frame->lock);
-	{
-		local = frame->local;
-
-		if (op_ret == 0) {
-			local->op_ret   = op_ret;
-			
-			if (local->cont.statfs.buf_set) {
-				if (statvfs->f_bavail < local->cont.statfs.buf.f_bavail)
-					local->cont.statfs.buf = *statvfs;
-			} else {
-				local->cont.statfs.buf = *statvfs;
-				local->cont.statfs.buf_set = 1;
-			}
-		}
-
-		if (op_ret == -1)
-			local->op_errno = op_errno;
-
-	}
-	UNLOCK (&frame->lock);
-
-	call_count = afr_frame_return (frame);
-
-	if (call_count == 0)
-		AFR_STACK_UNWIND (frame, local->op_ret, local->op_errno, 
-				  &local->cont.statfs.buf);
-
-	return 0;
-}
-
-
-int32_t
-afr_statfs (call_frame_t *frame, xlator_t *this,
-	    loc_t *loc)
-{
-	afr_private_t *  priv        = NULL;
-	int              child_count = 0;
-	afr_local_t   *  local       = NULL;
-	int              i           = 0;
-
-	int ret = -1;
-	int              call_count = 0;
-	int32_t          op_ret      = -1;
-	int32_t          op_errno    = 0;
-
-	VALIDATE_OR_GOTO (this, out);
-	VALIDATE_OR_GOTO (this->private, out);
-	VALIDATE_OR_GOTO (loc, out);
-
-	priv = this->private;
-	child_count = priv->child_count;
-
-	ALLOC_OR_GOTO (local, afr_local_t, out);
-	
-	ret = AFR_LOCAL_INIT (local, priv);
-	if (ret < 0) {
-		op_errno = -ret;
-		goto out;
-	}
-
-	frame->local = local;
-	call_count = local->call_count;
-
-	for (i = 0; i < child_count; i++) {
-		if (local->child_up[i]) {
-			STACK_WIND (frame, afr_statfs_cbk,
-				    priv->children[i],
-				    priv->children[i]->fops->statfs, 
-				    loc);
-			if (!--call_count)
-				break;
-		}
-	}
-	
-	op_ret = 0;
+    if (!priv->child_up || !priv->child_latency || !priv->halo_child_up ||
+        !priv->anon_inode) {
+        ret = -ENOMEM;
+        goto out;
+    }
+    /* Initialize each child's latency to a negative value so that it is not
+     * considered in child-up events until its first ping event arrives. */
+    for (i = 0; i < child_count; i++)
+        priv->child_latency[i] = -1;
+
+    priv->children = GF_CALLOC(sizeof(xlator_t *), child_count,
+                               gf_afr_mt_xlator_t);
+    if (!priv->children) {
+        ret = -ENOMEM;
+        goto out;
+    }
+
+    ret = afr_pending_xattrs_init(priv, this);
+    if (ret)
+        goto out;
+
+    trav = this->children;
+    i = 0;
+    while (i < child_count) {
+        priv->children[i] = trav->xlator;
+        trav = trav->next;
+        i++;
+    }
+
+    ret = gf_asprintf(&priv->sh_domain, AFR_SH_DATA_DOMAIN_FMT, this->name);
+    if (-1 == ret) {
+        ret = -ENOMEM;
+        goto out;
+    }
+
+    priv->last_event = GF_CALLOC(child_count, sizeof(*priv->last_event),
+                                 gf_afr_mt_int32_t);
+    if (!priv->last_event) {
+        ret = -ENOMEM;
+        goto out;
+    }
+
+    this->itable = inode_table_new(SHD_INODE_LRU_LIMIT, this);
+    if (!this->itable) {
+        ret = -ENOMEM;
+        goto out;
+    }
+
+    if (priv->shd.iamshd) {
+        ret = afr_selfheal_daemon_init(this);
+        if (ret) {
+            ret = -ENOMEM;
+            goto out;
+        }
+    }
+
+    /* keep more locals in the pool, as we may need them for self-heal etc. */
+    this->local_pool = mem_pool_new(afr_local_t, 512);
+    if (!this->local_pool) {
+        ret = -1;
+        goto out;
+    }
+
+    priv->root_inode = NULL;
+
+    ret = 0;
 out:
-	if (op_ret == -1) {
-		AFR_STACK_UNWIND (frame, op_ret, op_errno, NULL);
-	}
-	return 0;
-}
-
-
-int32_t
-afr_lk_unlock_cbk (call_frame_t *frame, void *cookie, xlator_t *this, 
-		   int32_t op_ret, int32_t op_errno, struct flock *lock)
-{
-	afr_local_t * local = NULL;
-
-	int call_count = -1;
-
-	local = frame->local;
-	call_count = afr_frame_return (frame);
-
-	if (call_count == 0)
-		AFR_STACK_UNWIND (frame, local->op_ret, local->op_errno,
-				  lock);
-
-	return 0;
-}
-
-
-int32_t 
-afr_lk_unlock (call_frame_t *frame, xlator_t *this)
-{
-	afr_local_t   * local = NULL;
-	afr_private_t * priv  = NULL;
-
-	int i;
-	int call_count = 0;
-
-	local = frame->local;
-	priv  = this->private;
-
-	call_count = afr_locked_nodes_count (local->cont.lk.locked_nodes, 
-					     priv->child_count);
-
-	if (call_count == 0) {
-		AFR_STACK_UNWIND (frame, local->op_ret, local->op_errno,
-				  &local->cont.lk.flock);
-		return 0;
-	}
-
-	local->call_count = call_count;
-
-	local->cont.lk.flock.l_type = F_UNLCK;
-
-	for (i = 0; i < priv->child_count; i++) {
-		if (local->cont.lk.locked_nodes[i]) {
-			STACK_WIND (frame, afr_lk_unlock_cbk,
-				    priv->children[i],
-				    priv->children[i]->fops->lk,
-				    local->fd, F_SETLK, 
-				    &local->cont.lk.flock);
-
-			if (!--call_count)
-				break;
-		}
-	}
-
-	return 0;
+    return ret;
 }
-
-
-int32_t
-afr_lk_cbk (call_frame_t *frame, void *cookie, xlator_t *this, 
-	    int32_t op_ret, int32_t op_errno, struct flock *lock)
-{
-	afr_local_t *local = NULL;
-	afr_private_t *priv = NULL;
-
-	int call_count  = -1;
-	int child_index = -1;
-
-	local = frame->local;
-	priv  = this->private;
-
-	child_index = (long) cookie;
-
-	call_count = --local->call_count;
-
-	if (!child_went_down (op_ret, op_errno) && (op_ret == -1)) {
-		local->op_ret   = -1;
-		local->op_errno = op_errno;
-
-		afr_lk_unlock (frame, this);
-		return 0;
-	}
-
-	if (op_ret == 0) {
-		local->op_ret        = 0;
-		local->op_errno      = 0;
-		local->cont.lk.flock = *lock;
-		local->cont.lk.locked_nodes[child_index] = 1;
-	}
-
-	child_index++;
-
-	if (child_index < priv->child_count) {
-		STACK_WIND_COOKIE (frame, afr_lk_cbk, (void *) (long) child_index,
-				   priv->children[child_index],
-				   priv->children[child_index]->fops->lk,
-				   local->fd, local->cont.lk.cmd, 
-				   &local->cont.lk.flock);
-	} else if (local->op_ret == -1) {
-		/* all nodes have gone down */
-		
-		AFR_STACK_UNWIND (frame, -1, ENOTCONN, &local->cont.lk.flock);
-	} else {
-		/* locking has succeeded on all nodes that are up */
-		
-		AFR_STACK_UNWIND (frame, local->op_ret, local->op_errno,
-			      &local->cont.lk.flock);
-	}
-
-	return 0;
-}
-
-
-int
-afr_lk (call_frame_t *frame, xlator_t *this,
-	fd_t *fd, int32_t cmd,
-	struct flock *flock)
+void
+afr_destroy_healer_object(xlator_t *this, struct subvol_healer *healer)
 {
-	afr_private_t *priv = NULL;
-	afr_local_t *local = NULL;
-
-	int i = 0;
-
-	int32_t op_ret   = -1;
-	int32_t op_errno = 0;
-
-	VALIDATE_OR_GOTO (frame, out);
-	VALIDATE_OR_GOTO (this, out);
-	VALIDATE_OR_GOTO (this->private, out);
-
-	priv = this->private;
-
-	ALLOC_OR_GOTO (local, afr_local_t, out);
-	AFR_LOCAL_INIT (local, priv);
-
-	frame->local  = local;
-
-	local->cont.lk.locked_nodes = CALLOC (priv->child_count, 
-					      sizeof (*local->cont.lk.locked_nodes));
-	
-	if (!local->cont.lk.locked_nodes) {
-		gf_log (this->name, GF_LOG_ERROR, "out of memory :(");
-		op_errno = ENOMEM;
-		goto out;
-	}
-
-	local->fd            = fd_ref (fd);
-	local->cont.lk.cmd   = cmd;
-	local->cont.lk.flock = *flock;
-
-	STACK_WIND_COOKIE (frame, afr_lk_cbk, (void *) (long) 0,
-			   priv->children[i],
-			   priv->children[i]->fops->lk,
-			   fd, cmd, flock);
-
-	op_ret = 0;
-out:
-	if (op_ret == -1) {
-		AFR_STACK_UNWIND (frame, op_ret, op_errno, NULL);
-	}
-	return 0;
+    int ret = -1;
+
+    if (!healer)
+        return;
+
+    if (healer->running) {
+        /*
+         * If there are any resources to clean up, we need
+         * to do that gracefully using pthread_cleanup_push.
+         */
+        ret = gf_thread_cleanup_xint(healer->thread);
+        if (ret)
+            gf_msg(this->name, GF_LOG_WARNING, 0, AFR_MSG_SELF_HEAL_FAILED,
+                   "Failed to clean up healer threads.");
+        healer->thread = 0;
+    }
+    pthread_cond_destroy(&healer->cond);
+    pthread_mutex_destroy(&healer->mutex);
 }
 
-
-/**
- * find_child_index - find the child's index in the array of subvolumes
- * @this: AFR
- * @child: child
- */
-
-static int
-find_child_index (xlator_t *this, xlator_t *child)
+void
+afr_selfheal_daemon_fini(xlator_t *this)
 {
-	afr_private_t *priv = NULL;
-
-	int i = -1;
-
-	priv = this->private;
-
-	for (i = 0; i < priv->child_count; i++) {
-		if ((xlator_t *) child == priv->children[i])
-			break;
-	}
-
-	return i;
+    struct subvol_healer *healer = NULL;
+    afr_self_heald_t *shd = NULL;
+    afr_private_t *priv = NULL;
+    int i = 0;
+
+    priv = this->private;
+    if (!priv)
+        return;
+
+    shd = &priv->shd;
+    if (!shd->iamshd)
+        return;
+
+    for (i = 0; i < priv->child_count; i++) {
+        healer = &shd->index_healers[i];
+        afr_destroy_healer_object(this, healer);
+
+        healer = &shd->full_healers[i];
+        afr_destroy_healer_object(this, healer);
+
+        if (shd->statistics[i])
+            eh_destroy(shd->statistics[i]);
+    }
+    GF_FREE(shd->index_healers);
+    GF_FREE(shd->full_healers);
+    GF_FREE(shd->statistics);
+    if (shd->split_brain)
+        eh_destroy(shd->split_brain);
 }
-
-
-int32_t
-notify (xlator_t *this, int32_t event,
-	void *data, ...)
+void
+fini(xlator_t *this)
 {
-	afr_private_t *     priv     = NULL;
-	unsigned char *     child_up = NULL;
-
-	int i           = -1;
-	int up_children = 0;
-
-	priv = this->private;
-
-	if (!priv)
-		return 0;
-
-	child_up = priv->child_up;
-
-	switch (event) {
-	case GF_EVENT_CHILD_UP:
-		i = find_child_index (this, data);
-
-		child_up[i] = 1;
-
-		/* 
-		   if all the children were down, and one child came up, 
-		   send notify to parent
-		*/
-
-		for (i = 0; i < priv->child_count; i++)
-			if (child_up[i])
-				up_children++;
-
-		if (up_children == 1)
-			default_notify (this, event, data);
-
-		break;
-
-	case GF_EVENT_CHILD_DOWN:
-		i = find_child_index (this, data);
-
-		child_up[i] = 0;
-		
-		/* 
-		   if all children are down, and this was the last to go down,
-		   send notify to parent
-		*/
+    afr_private_t *priv = NULL;
 
-		for (i = 0; i < priv->child_count; i++)
-			if (child_up[i])
-				up_children++;
+    priv = this->private;
 
-		if (up_children == 0)
-			default_notify (this, event, data);
+    afr_selfheal_daemon_fini(this);
+    GF_ASSERT(list_empty(&priv->saved_locks));
 
-		break;
+    LOCK(&priv->lock);
+    if (priv->timer != NULL) {
+        gf_timer_call_cancel(this->ctx, priv->timer);
+        priv->timer = NULL;
+    }
+    UNLOCK(&priv->lock);
 
-	default:
-		default_notify (this, event, data);
-	}
-
-	return 0;
-}
-
-
-static const char *favorite_child_warning_str = "You have specified subvolume '%s' "
-	"as the 'favorite child'. This means that if a discrepancy in the content "
-	"or attributes (ownership, permission, etc.) of a file is detected among "
-	"the subvolumes, the file on '%s' will be considered the definitive "
-	"version and its contents will OVERWRITE the contents of the file on other "
-	"subvolumes. All versions of the file except that on '%s' "
-	"WILL BE LOST.";
-
-static const char *no_lock_servers_warning_str = "You have set lock-server-count = 0. "
-	"This means correctness is NO LONGER GUARANTEED in all cases. If two or more "
-	"applications write to the same region of a file, there is a possibility that "
-	"its copies will be INCONSISTENT. Set it to a value greater than 0 unless you "
-	"are ABSOLUTELY SURE of what you are doing and WILL NOT HOLD GlusterFS "
-	"RESPOSIBLE for inconsistent data. If you are in doubt, set it to a value "
-	"greater than 0.";
-
-int32_t 
-init (xlator_t *this)
-{
-	afr_private_t * priv        = NULL;
-	int             child_count = 0;
-	xlator_list_t * trav        = NULL;
-	int             i           = 0;
-	int             ret         = -1;
-	int             op_errno    = 0;
-
-	char * read_subvol = NULL;
-	char * fav_child   = NULL;
-	char * self_heal   = NULL;
-	char * change_log  = NULL;
-
-	int32_t lock_server_count = 1;
-
-	int    fav_ret       = -1;
-	int    read_ret      = -1;
-	int    dict_ret      = -1;
-
-	if (!this->children) {
-		gf_log (this->name, GF_LOG_ERROR,
-			"AFR needs more than one child defined");
-		return -1;
-	}
-  
-	if (!this->parents) {
-		gf_log (this->name, GF_LOG_WARNING,
-			"dangling volume. check volfile ");
-	}
-
-	ALLOC_OR_GOTO (this->private, afr_private_t, out);
-
-	priv = this->private;
-
-	read_ret = dict_get_str (this->options, "read-subvolume", &read_subvol);
-	priv->read_child = -1;
-
-	fav_ret = dict_get_str (this->options, "favorite-child", &fav_child);
-	priv->favorite_child = -1;
-
-	/* Default values */
-
-	priv->data_self_heal     = 1;
-	priv->metadata_self_heal = 1;
-	priv->entry_self_heal    = 1;
-
-	dict_ret = dict_get_str (this->options, "data-self-heal", &self_heal);
-	if (dict_ret == 0) {
-		ret = gf_string2boolean (self_heal, &priv->data_self_heal);
-		if (ret < 0) {
-			gf_log (this->name, GF_LOG_WARNING,
-				"invalid 'option data-self-heal %s' "
-				"defaulting to data-self-heal as 'on'",
-				self_heal);
-			priv->data_self_heal = 1;
-		} 
-	}
-
-	dict_ret = dict_get_str (this->options, "metadata-self-heal",
-				 &self_heal);
-	if (dict_ret == 0) {
-		ret = gf_string2boolean (self_heal, &priv->metadata_self_heal);
-		if (ret < 0) {
-			gf_log (this->name, GF_LOG_WARNING,
-				"invalid 'option metadata-self-heal %s' "
-				"defaulting to metadata-self-heal as 'on'", 
-				self_heal);
-			priv->metadata_self_heal = 1;
-		} 
-	}
-
-	dict_ret = dict_get_str (this->options, "entry-self-heal", &self_heal);
-	if (dict_ret == 0) {
-		ret = gf_string2boolean (self_heal, &priv->entry_self_heal);
-		if (ret < 0) {
-			gf_log (this->name, GF_LOG_WARNING,
-				"invalid 'option entry-self-heal %s' "
-				"defaulting to entry-self-heal as 'on'", 
-				self_heal);
-			priv->entry_self_heal = 1;
-		} 
-	}
-
-	/* Change log options */
-
-	priv->data_change_log     = 1;
-	priv->metadata_change_log = 0;
-	priv->entry_change_log    = 1;
-
-	dict_ret = dict_get_str (this->options, "data-change-log",
-				 &change_log);
-	if (dict_ret == 0) {
-		ret = gf_string2boolean (change_log, &priv->data_change_log);
-		if (ret < 0) {
-			gf_log (this->name, GF_LOG_WARNING,
-				"invalid 'option data-change-log %s'. "
-				"defaulting to data-change-log as 'on'", 
-				change_log);
-			priv->data_change_log = 1;
-		} 
-	}
-
-	dict_ret = dict_get_str (this->options, "metadata-change-log",
-				 &change_log);
-	if (dict_ret == 0) {
-		ret = gf_string2boolean (change_log,
-					 &priv->metadata_change_log);
-		if (ret < 0) {
-			gf_log (this->name, GF_LOG_WARNING,
-				"invalid 'option metadata-change-log %s'. "
-				"defaulting to metadata-change-log as 'off'",
-				change_log);
-			priv->metadata_change_log = 0;
-		} 
-	}
-
-	dict_ret = dict_get_str (this->options, "entry-change-log",
-				 &change_log);
-	if (dict_ret == 0) {
-		ret = gf_string2boolean (change_log, &priv->entry_change_log);
-		if (ret < 0) {
-			gf_log (this->name, GF_LOG_WARNING,
-				"invalid 'option entry-change-log %s'. "
-				"defaulting to entry-change-log as 'on'", 
-				change_log);
-			priv->entry_change_log = 1;
-		} 
-	}
-
-	/* Locking options */
-
-	priv->data_lock_server_count = 1;
-	priv->metadata_lock_server_count = 0;
-	priv->entry_lock_server_count = 1;
-
-	dict_ret = dict_get_int32 (this->options, "data-lock-server-count", 
-				   &lock_server_count);
-	if (dict_ret == 0) {
-		gf_log (this->name, GF_LOG_DEBUG,
-			"setting data lock server count to %d",
-			lock_server_count);
-
-		if (lock_server_count == 0) 
-			gf_log (this->name, GF_LOG_WARNING,
-				no_lock_servers_warning_str);
-
-		priv->data_lock_server_count = lock_server_count;
-	}
-
-
-	dict_ret = dict_get_int32 (this->options,
-				   "metadata-lock-server-count", 
-				   &lock_server_count);
-	if (dict_ret == 0) {
-		gf_log (this->name, GF_LOG_DEBUG,
-			"setting metadata lock server count to %d",
-			lock_server_count);
-		priv->metadata_lock_server_count = lock_server_count;
-	}
-
-
-	dict_ret = dict_get_int32 (this->options, "entry-lock-server-count", 
-				   &lock_server_count);
-	if (dict_ret == 0) {
-		gf_log (this->name, GF_LOG_DEBUG,
-			"setting entry lock server count to %d",
-			lock_server_count);
-
-		priv->entry_lock_server_count = lock_server_count;
-	}
-
-
-	trav = this->children;
-	while (trav) {
-		if (!read_ret && !strcmp (read_subvol, trav->xlator->name)) {
-			gf_log (this->name, GF_LOG_DEBUG,
-				"subvolume '%s' specified as read child",
-				trav->xlator->name);
-
-			priv->read_child = child_count;
-		}
-
-		if (fav_ret == 0 && !strcmp (fav_child, trav->xlator->name)) {
-			gf_log (this->name, GF_LOG_WARNING,
-				favorite_child_warning_str, trav->xlator->name,
-				trav->xlator->name, trav->xlator->name);
-			priv->favorite_child = child_count;
-		}
-
-		child_count++;
-		trav = trav->next;
-	}
-
-	/* XXX: return inode numbers from 1st subvolume till
-	   afr supports read-subvolume based on inode's ctx 
-	   (and not itransform) for this reason afr_deitransform() 
-	   returns 0 always
-	*/
-	priv->read_child = 0;
-
-	priv->wait_count = 1;
-
-	priv->child_count = child_count;
-	LOCK_INIT (&priv->lock);
-
-	priv->child_up = CALLOC (sizeof (unsigned char), child_count);
-	if (!priv->child_up) {
-		gf_log (this->name, GF_LOG_ERROR,	
-			"out of memory :(");		
-		op_errno = ENOMEM;			
-		goto out;
-	}
-
-	priv->children = CALLOC (sizeof (xlator_t *), child_count);
-	if (!priv->children) {
-		gf_log (this->name, GF_LOG_ERROR,	
-			"out of memory :(");		
-		op_errno = ENOMEM;			
-		goto out;
-	}
-
-	trav = this->children;
-	i = 0;
-	while (i < child_count) {
-		priv->children[i] = trav->xlator;
-
-		trav = trav->next;
-		i++;
-	}
-
-	ret = 0;
-out:
-	return ret;
-}
+    if (this->local_pool != NULL) {
+        mem_pool_destroy(this->local_pool);
+        this->local_pool = NULL;
+    }
 
+    this->private = NULL;
+    afr_priv_destroy(priv);
+    if (this->itable) {
+        inode_table_destroy(this->itable);
+        this->itable = NULL;
+    }
 
-int
-fini (xlator_t *this)
-{
-	return 0;
+    return;
 }
 
-
 struct xlator_fops fops = {
-	.lookup      = afr_lookup,
-	.open        = afr_open,
-	.lk          = afr_lk,
-	.flush       = afr_flush,
-	.statfs      = afr_statfs,
-	.fsync       = afr_fsync,
-	.fsyncdir    = afr_fsyncdir,
-	.xattrop     = afr_xattrop,
-	.fxattrop    = afr_fxattrop,
-	.inodelk     = afr_inodelk,
-	.finodelk    = afr_finodelk,
-	.entrylk     = afr_entrylk,
-	.fentrylk    = afr_fentrylk,
-	.checksum    = afr_checksum,
-
-	/* inode read */
-	.access      = afr_access,
-	.stat        = afr_stat,
-	.fstat       = afr_fstat,
-	.readlink    = afr_readlink,
-	.getxattr    = afr_getxattr,
-	.readv       = afr_readv,
-
-	/* inode write */
-	.chmod       = afr_chmod,
-	.chown       = afr_chown,
-	.fchmod      = afr_fchmod,
-	.fchown      = afr_fchown,
-	.writev      = afr_writev,
-	.truncate    = afr_truncate,
-	.ftruncate   = afr_ftruncate,
-	.utimens     = afr_utimens,
-	.setxattr    = afr_setxattr,
-	.removexattr = afr_removexattr,
-
-	/* dir read */
-	.opendir     = afr_opendir,
-	.readdir     = afr_readdir,
-	.getdents    = afr_getdents,
-
-	/* dir write */
-	.create      = afr_create,
-	.mknod       = afr_mknod,
-	.mkdir       = afr_mkdir,
-	.unlink      = afr_unlink,
-	.rmdir       = afr_rmdir,
-	.link        = afr_link,
-	.symlink     = afr_symlink,
-	.rename      = afr_rename,
-	.setdents    = afr_setdents,
+    .lookup = afr_lookup,
+    .lk = afr_lk,
+    .flush = afr_flush,
+    .statfs = afr_statfs,
+    .fsyncdir = afr_fsyncdir,
+    .inodelk = afr_inodelk,
+    .finodelk = afr_finodelk,
+    .entrylk = afr_entrylk,
+    .fentrylk = afr_fentrylk,
+    .ipc = afr_ipc,
+    .lease = afr_lease,
+
+    /* inode read */
+    .access = afr_access,
+    .stat = afr_stat,
+    .fstat = afr_fstat,
+    .readlink = afr_readlink,
+    .getxattr = afr_getxattr,
+    .fgetxattr = afr_fgetxattr,
+    .readv = afr_readv,
+    .seek = afr_seek,
+
+    /* inode write */
+    .writev = afr_writev,
+    .truncate = afr_truncate,
+    .ftruncate = afr_ftruncate,
+    .setxattr = afr_setxattr,
+    .fsetxattr = afr_fsetxattr,
+    .setattr = afr_setattr,
+    .fsetattr = afr_fsetattr,
+    .removexattr = afr_removexattr,
+    .fremovexattr = afr_fremovexattr,
+    .fallocate = afr_fallocate,
+    .discard = afr_discard,
+    .zerofill = afr_zerofill,
+    .xattrop = afr_xattrop,
+    .fxattrop = afr_fxattrop,
+    .fsync = afr_fsync,
+
+    /* inode open */
+    .opendir = afr_opendir,
+    .open = afr_open,
+
+    /* dir read */
+    .readdir = afr_readdir,
+    .readdirp = afr_readdirp,
+
+    /* dir write */
+    .create = afr_create,
+    .mknod = afr_mknod,
+    .mkdir = afr_mkdir,
+    .unlink = afr_unlink,
+    .rmdir = afr_rmdir,
+    .link = afr_link,
+    .symlink = afr_symlink,
+    .rename = afr_rename,
 };
 
-
-struct xlator_mops mops = {
+struct xlator_dumpops dumpops = {
+    .priv = afr_priv_dump,
 };
 
-
 struct xlator_cbks cbks = {
+    .release = afr_release,
+    .releasedir = afr_releasedir,
+    .forget = afr_forget,
 };
 
 struct volume_options options[] = {
-	{ .key  = {"read-subvolume" }, 
-	  .type = GF_OPTION_TYPE_XLATOR
-	},
-	{ .key  = {"favorite-child"}, 
-	  .type = GF_OPTION_TYPE_XLATOR
-	},
-	{ .key  = {"data-self-heal"},  
-	  .type = GF_OPTION_TYPE_BOOL 
-	},
-	{ .key  = {"metadata-self-heal"},  
-	  .type = GF_OPTION_TYPE_BOOL
-	},
-	{ .key  = {"entry-self-heal"},  
-	  .type = GF_OPTION_TYPE_BOOL 
-	},
-	{ .key  = {"data-change-log"},  
-	  .type = GF_OPTION_TYPE_BOOL 
-	},
-	{ .key  = {"metadata-change-log"},  
-	  .type = GF_OPTION_TYPE_BOOL
-	},
-	{ .key  = {"entry-change-log"},  
-	  .type = GF_OPTION_TYPE_BOOL
-	},
-	{ .key  = {"data-lock-server-count"},  
-	  .type = GF_OPTION_TYPE_INT, 
-	  .min  = 0
-	},
-	{ .key  = {"metadata-lock-server-count"},  
-	  .type = GF_OPTION_TYPE_INT, 
-	  .min  = 0
-	},
-	{ .key  = {"entry-lock-server-count"},  
-	  .type = GF_OPTION_TYPE_INT,
-	  .min  = 0
-	},
-	{ .key  = {NULL} },
+    {.key = {"read-subvolume"},
+     .type = GF_OPTION_TYPE_XLATOR,
+     .op_version = {1},
+     .flags = OPT_FLAG_CLIENT_OPT | OPT_FLAG_SETTABLE | OPT_FLAG_DOC,
+     .tags = {"replicate"},
+     .description = "inode-read fops happen only on one of the bricks in "
+                    "replicate. Afr will prefer the one specified using "
+                    "this option if it is not stale. Option value must be "
+                    "one of the xlator names of the children. "
+                    "Ex: <volname>-client-0 till "
+                    "<volname>-client-<number-of-bricks - 1>"},
+    {.key = {"read-subvolume-index"},
+     .type = GF_OPTION_TYPE_INT,
+     .default_value = "-1",
+     .op_version = {2},
+     .flags = OPT_FLAG_CLIENT_OPT | OPT_FLAG_SETTABLE | OPT_FLAG_DOC,
+     .tags = {"replicate"},
+     .description = "inode-read fops happen only on one of the bricks in "
+                    "replicate. AFR will prefer the one specified using "
+                    "this option if it is not stale. allowed options"
+                    " include -1 till replica-count - 1"},
+    {.key = {"read-hash-mode"},
+     .type = GF_OPTION_TYPE_INT,
+     .min = 0,
+     .max = 5,
+     .default_value = "1",
+     .op_version = {2},
+     .flags = OPT_FLAG_CLIENT_OPT | OPT_FLAG_SETTABLE | OPT_FLAG_DOC,
+     .tags = {"replicate"},
+     .description =
+         "inode-read fops happen only on one of the bricks in "
+         "replicate. AFR will prefer the one computed using "
+         "the method specified using this option.\n"
+         "0 = first readable child of AFR, starting from 1st child.\n"
+         "1 = hash by GFID of file (all clients use "
+         "same subvolume).\n"
+         "2 = hash by GFID of file and client PID.\n"
+         "3 = brick having the least outstanding read requests.\n"
+         "4 = brick having the least network ping latency.\n"
+         "5 = Hybrid mode between 3 and 4, ie least value among "
+         "network-latency multiplied by outstanding-read-requests."},
+    {
+        .key = {"choose-local"},
+        .type = GF_OPTION_TYPE_BOOL,
+        .default_value = "true",
+        .op_version = {2},
+        .flags = OPT_FLAG_CLIENT_OPT | OPT_FLAG_SETTABLE | OPT_FLAG_DOC,
+        .tags = {"replicate"},
+        .description = "Choose a local subvolume (i.e. Brick) to read from"
+                       " if read-subvolume is not explicitly set.",
+    },
+    {.key = {"background-self-heal-count"},
+     .type = GF_OPTION_TYPE_INT,
+     .min = 0,
+     .max = 256,
+     .default_value = "8",
+     .validate = GF_OPT_VALIDATE_MIN,
+     .op_version = {1},
+     .flags = OPT_FLAG_CLIENT_OPT | OPT_FLAG_SETTABLE | OPT_FLAG_DOC,
+     .tags = {"replicate"},
+     .description = "This specifies the number of per client self-heal "
+                    "jobs that can perform parallel heals in the "
+                    "background."},
+    {.key = {"halo-shd-max-latency"},
+     .type = GF_OPTION_TYPE_INT,
+     .min = 1,
+     .max = 99999,
+     .default_value = "99999",
+     .op_version = {GD_OP_VERSION_3_11_0},
+     .flags = OPT_FLAG_CLIENT_OPT | OPT_FLAG_SETTABLE | OPT_FLAG_DOC,
+     .tags = {"replicate", "halo"},
+     .description = "Maximum latency for shd halo replication in msec."},
+    {.key = {"halo-enabled"},
+     .type = GF_OPTION_TYPE_BOOL,
+     .default_value = "False",
+     .op_version = {GD_OP_VERSION_3_11_0},
+     .flags = OPT_FLAG_CLIENT_OPT | OPT_FLAG_SETTABLE | OPT_FLAG_DOC,
+     .tags = {"replicate", "halo"},
+     .description = "Enable Halo (geo) replication mode."},
+    {.key = {"halo-nfsd-max-latency"},
+     .type = GF_OPTION_TYPE_INT,
+     .min = 1,
+     .max = 99999,
+     .default_value = "5",
+     .op_version = {GD_OP_VERSION_3_11_0},
+     .flags = OPT_FLAG_CLIENT_OPT | OPT_FLAG_SETTABLE | OPT_FLAG_DOC,
+     .tags = {"replicate", "halo"},
+     .description = "Maximum latency for nfsd halo replication in msec."},
+    {.key = {"halo-max-latency"},
+     .type = GF_OPTION_TYPE_INT,
+     .min = 1,
+     .max = AFR_HALO_MAX_LATENCY,
+     .default_value = "5",
+     .op_version = {GD_OP_VERSION_3_11_0},
+     .flags = OPT_FLAG_CLIENT_OPT | OPT_FLAG_SETTABLE | OPT_FLAG_DOC,
+     .tags = {"replicate", "halo"},
+     .description = "Maximum latency for halo replication in msec."},
+    {.key = {"halo-max-replicas"},
+     .type = GF_OPTION_TYPE_INT,
+     .min = 1,
+     .max = 99999,
+     .default_value = "99999",
+     .op_version = {GD_OP_VERSION_3_11_0},
+     .flags = OPT_FLAG_CLIENT_OPT | OPT_FLAG_SETTABLE | OPT_FLAG_DOC,
+     .tags = {"replicate", "halo"},
+     .description = "The maximum number of halo replicas; replicas"
+                    " beyond this value will be written asynchronously"
+                    "via the SHD."},
+    {.key = {"halo-min-replicas"},
+     .type = GF_OPTION_TYPE_INT,
+     .min = 1,
+     .max = 99999,
+     .default_value = "2",
+     .op_version = {GD_OP_VERSION_3_11_0},
+     .flags = OPT_FLAG_CLIENT_OPT | OPT_FLAG_SETTABLE | OPT_FLAG_DOC,
+     .tags = {"replicate", "halo"},
+     .description = "The minimmum number of halo replicas, before adding "
+                    "out of region replicas."},
+    {.key = {"heal-wait-queue-length"},
+     .type = GF_OPTION_TYPE_INT,
+     .min = 0,
+     .max = 10000, /*Around 100MB with sizeof(afr_local_t)= 10496 bytes*/
+     .default_value = "128",
+     .validate = GF_OPT_VALIDATE_MIN,
+     .op_version = {GD_OP_VERSION_3_7_10},
+     .flags = OPT_FLAG_CLIENT_OPT | OPT_FLAG_SETTABLE | OPT_FLAG_DOC,
+     .tags = {"replicate"},
+     .description = "This specifies the number of heals that can be queued"
+                    " for the parallel background self heal jobs."},
+    {.key = {"data-self-heal"},
+     .type = GF_OPTION_TYPE_STR,
+     .value = {"1", "on", "yes", "true", "enable", "0", "off", "no", "false",
+               "disable", "open"},
+     .default_value = "off",
+     .op_version = {1},
+     .flags = OPT_FLAG_CLIENT_OPT | OPT_FLAG_SETTABLE | OPT_FLAG_DOC,
+     .tags = {"replicate"},
+     .description = "Using this option we can enable/disable data "
+                    "self-heal on the file. \"open\" means data "
+                    "self-heal action will only be triggered by file "
+                    "open operations."},
+    {.key = {"data-self-heal-algorithm"},
+     .type = GF_OPTION_TYPE_STR,
+     .op_version = {1},
+     .flags = OPT_FLAG_CLIENT_OPT | OPT_FLAG_SETTABLE | OPT_FLAG_DOC,
+     .tags = {"replicate"},
+     .description = "Select between \"full\", \"diff\". The "
+                    "\"full\" algorithm copies the entire file from "
+                    "source to sink. The \"diff\" algorithm copies to "
+                    "sink only those blocks whose checksums don't match "
+                    "with those of source. If no option is configured "
+                    "the option is chosen dynamically as follows: "
+                    "If the file does not exist on one of the sinks "
+                    "or empty file exists or if the source file size is "
+                    "about the same as page size the entire file will "
+                    "be read and written i.e \"full\" algo, "
+                    "otherwise \"diff\" algo is chosen.",
+     .value = {"diff", "full"}},
+    {.key = {"data-self-heal-window-size"},
+     .type = GF_OPTION_TYPE_INT,
+     .min = 1,
+     .max = 1024,
+     .default_value = "1",
+     .op_version = {1},
+     .flags = OPT_FLAG_CLIENT_OPT | OPT_FLAG_SETTABLE | OPT_FLAG_DOC,
+     .tags = {"replicate"},
+     .description = "Maximum number of blocks per file for which self-heal "
+                    "process would be applied simultaneously."},
+    {.key = {"metadata-self-heal"},
+     .type = GF_OPTION_TYPE_BOOL,
+     .default_value = "off",
+     .op_version = {1},
+     .flags = OPT_FLAG_CLIENT_OPT | OPT_FLAG_SETTABLE | OPT_FLAG_DOC,
+     .tags = {"replicate"},
+     /*.validate_fn = validate_replica*/
+     .description = "Using this option we can enable/disable metadata "
+                    "i.e. Permissions, ownerships, xattrs self-heal on "
+                    "the file/directory."},
+    {.key = {"entry-self-heal"},
+     .type = GF_OPTION_TYPE_BOOL,
+     .default_value = "off",
+     .op_version = {1},
+     .flags = OPT_FLAG_CLIENT_OPT | OPT_FLAG_SETTABLE | OPT_FLAG_DOC,
+     .tags = {"replicate"},
+     /*.validate_fn = validate_replica*/
+     .description = "Using this option we can enable/disable entry "
+                    "self-heal on the directory."},
+    {.key = {"data-change-log"},
+     .type = GF_OPTION_TYPE_BOOL,
+     .default_value = "on",
+     .op_version = {1},
+     .flags = OPT_FLAG_CLIENT_OPT | OPT_FLAG_SETTABLE | OPT_FLAG_DOC,
+     .tags = {"replicate"},
+     .description = "This option exists only for backward compatibility "
+                    "and configuring it doesn't have any effect"},
+    {.key = {"metadata-change-log"},
+     .type = GF_OPTION_TYPE_BOOL,
+     .default_value = "on",
+     .op_version = {1},
+     .flags = OPT_FLAG_CLIENT_OPT | OPT_FLAG_SETTABLE | OPT_FLAG_DOC,
+     .tags = {"replicate"},
+     .description = "This option exists only for backward compatibility "
+                    "and configuring it doesn't have any effect"},
+    {.key = {"entry-change-log"},
+     .type = GF_OPTION_TYPE_BOOL,
+     .default_value = "on",
+     .op_version = {1},
+     .flags = OPT_FLAG_CLIENT_OPT | OPT_FLAG_SETTABLE | OPT_FLAG_DOC,
+     .tags = {"replicate"},
+     .description = "This option exists only for backward compatibility "
+                    "and configuring it doesn't have any effect"},
+    {.key = {"optimistic-change-log"},
+     .type = GF_OPTION_TYPE_BOOL,
+     .default_value = "on",
+     .description = "Entry/Metadata fops will not perform "
+                    "pre fop changelog operations in afr transaction "
+                    "if this option is enabled."},
+    {.key = {"inodelk-trace"},
+     .type = GF_OPTION_TYPE_BOOL,
+     .default_value = "off",
+     .description = "Enabling this option logs inode lock/unlocks"},
+    {.key = {"entrylk-trace"},
+     .type = GF_OPTION_TYPE_BOOL,
+     .default_value = "off",
+     .description = "Enabling this option logs entry lock/unlocks"},
+    {.key = {"pre-op-compat"},
+     .type = GF_OPTION_TYPE_BOOL,
+     .default_value = "on",
+     .description = "Use separate pre-op xattrop() FOP rather than "
+                    "overloading xdata of the OP"},
+    {.key = {"eager-lock"},
+     .type = GF_OPTION_TYPE_BOOL,
+     .default_value = "on",
+     .op_version = {1},
+     .flags = OPT_FLAG_CLIENT_OPT | OPT_FLAG_SETTABLE | OPT_FLAG_DOC,
+     .tags = {"replicate"},
+     .description =
+         "Enable/Disable eager lock for replica volume. "
+         "Lock phase of a transaction has two sub-phases. "
+         "First is an attempt to acquire locks in parallel by "
+         "broadcasting non-blocking lock requests. If lock "
+         "acquisition fails on any server, then the held locks "
+         "are unlocked and we revert to a blocking locks mode "
+         "sequentially on one server after another.  If this "
+         "option is enabled the initial broadcasting lock "
+         "request attempts to acquire a full lock on the entire file. "
+         "If this fails, we revert back to the sequential "
+         "\"regional\" blocking locks as before. In the case "
+         "where such an \"eager\" lock is granted in the "
+         "non-blocking phase, it gives rise to an opportunity "
+         "for optimization. i.e, if the next write transaction "
+         "on the same FD arrives before the unlock phase of "
+         "the first transaction, it \"takes over\" the full "
+         "file lock. Similarly if yet another data transaction "
+         "arrives before the unlock phase of the \"optimized\" "
+         "transaction, that in turn \"takes over\" the lock as "
+         "well. The actual unlock now happens at the end of "
+         "the last \"optimized\" transaction."
+
+    },
+    {.key = {"self-heal-daemon"},
+     .type = GF_OPTION_TYPE_BOOL,
+     .default_value = "on",
+     .op_version = {1},
+     .flags = OPT_FLAG_CLIENT_OPT | OPT_FLAG_SETTABLE,
+     .tags = {"replicate"},
+     /*.validate_fn   = validate_replica_heal_enable_disable*/
+     .description = "This option applies to only self-heal-daemon. "
+                    "Index directory crawl and automatic healing of files "
+                    "will not be performed if this option is turned off."},
+    {.key = {"iam-self-heal-daemon"},
+     .type = GF_OPTION_TYPE_BOOL,
+     .default_value = "off",
+     .description = "This option differentiates if the replicate "
+                    "translator is running as part of self-heal-daemon "
+                    "or not."},
+    {.key = {"iam-nfs-daemon"},
+     .type = GF_OPTION_TYPE_BOOL,
+     .default_value = "off",
+     .description = "This option differentiates if the replicate "
+                    "translator is running as part of an NFS daemon "
+                    "or not."},
+    {
+        .key = {"quorum-type"},
+        .type = GF_OPTION_TYPE_STR,
+        .value = {"none", "auto", "fixed"},
+        .default_value = "none",
+        .op_version = {1},
+        .flags = OPT_FLAG_CLIENT_OPT | OPT_FLAG_SETTABLE | OPT_FLAG_DOC,
+        .tags = {"replicate"},
+        /*.option = quorum-type*/
+        .description = "If value is \"fixed\" only allow writes if "
+                       "quorum-count bricks are present.  If value is "
+                       "\"auto\" only allow writes if more than half of "
+                       "bricks, or exactly half including the first, are "
+                       "present.",
+    },
+    {
+        .key = {"quorum-count"},
+        .type = GF_OPTION_TYPE_INT,
+        .min = 1,
+        .max = INT_MAX,
+        .default_value = 0,
+        .op_version = {1},
+        .flags = OPT_FLAG_CLIENT_OPT | OPT_FLAG_SETTABLE | OPT_FLAG_DOC,
+        .tags = {"replicate"},
+        /*.option = quorum-count*/
+        /*.validate_fn = validate_quorum_count*/
+        .description = "If quorum-type is \"fixed\" only allow writes if "
+                       "this many bricks are present.  Other quorum types "
+                       "will OVERWRITE this value.",
+    },
+    {
+        .key = {"quorum-reads"},
+        .type = GF_OPTION_TYPE_BOOL,
+        .default_value = "no",
+        .op_version = {GD_OP_VERSION_3_7_0},
+        .flags = OPT_FLAG_CLIENT_OPT | OPT_FLAG_SETTABLE | OPT_FLAG_DOC,
+        .tags = {"replicate"},
+        .description = "This option has been removed. Reads are not allowed "
+                       "if quorum is not met.",
+    },
+    {
+        .key = {"node-uuid"},
+        .type = GF_OPTION_TYPE_STR,
+        .description = "Local glusterd uuid string, used in starting "
+                       "self-heal-daemon so that it can crawl only on "
+                       "local index directories.",
+    },
+    {
+        .key = {"post-op-delay-secs"},
+        .type = GF_OPTION_TYPE_INT,
+        .min = 0,
+        .max = INT_MAX,
+        .default_value = "1",
+        .op_version = {2},
+        .flags = OPT_FLAG_CLIENT_OPT | OPT_FLAG_SETTABLE | OPT_FLAG_DOC,
+        .tags = {"replicate"},
+        .description = "Time interval induced artificially before "
+                       "post-operation phase of the transaction to "
+                       "enhance overlap of adjacent write operations.",
+    },
+    {
+        .key = {AFR_SH_READDIR_SIZE_KEY},
+        .type = GF_OPTION_TYPE_SIZET,
+        .description = "readdirp size for performing entry self-heal",
+        .min = 1024,
+        .max = 131072,
+        .op_version = {2},
+        .flags = OPT_FLAG_CLIENT_OPT | OPT_FLAG_SETTABLE,
+        .tags = {"replicate"},
+        .default_value = "1KB",
+    },
+    {
+        .key = {"ensure-durability"},
+        .type = GF_OPTION_TYPE_BOOL,
+        .op_version = {3},
+        .flags = OPT_FLAG_CLIENT_OPT | OPT_FLAG_SETTABLE | OPT_FLAG_DOC,
+        .tags = {"replicate"},
+        .description = "Afr performs fsyncs for transactions if this "
+                       "option is on to make sure the changelogs/data is "
+                       "written to the disk",
+        .default_value = "on",
+    },
+    {
+        .key = {"afr-dirty-xattr"},
+        .type = GF_OPTION_TYPE_STR,
+        .default_value = AFR_DIRTY_DEFAULT,
+    },
+    {.key = {"afr-pending-xattr"},
+     .type = GF_OPTION_TYPE_STR,
+     .description = "Comma separated list of xattrs that are used to  "
+                    "capture information on pending heals."},
+    {
+        .key = {"metadata-splitbrain-forced-heal"},
+        .type = GF_OPTION_TYPE_BOOL,
+        .default_value = "off",
+    },
+    {.key = {"heal-timeout"},
+     .type = GF_OPTION_TYPE_INT,
+     .min = 5,
+     .max = INT_MAX,
+     .default_value = "600",
+     .op_version = {2},
+     .flags = OPT_FLAG_CLIENT_OPT | OPT_FLAG_SETTABLE | OPT_FLAG_DOC,
+     .tags = {"replicate"},
+     .description = "time interval for checking the need to self-heal "
+                    "in self-heal-daemon"},
+    {
+        .key = {"consistent-metadata"},
+        .type = GF_OPTION_TYPE_BOOL,
+        .default_value = "no",
+        .op_version = {GD_OP_VERSION_3_7_0},
+        .flags = OPT_FLAG_CLIENT_OPT | OPT_FLAG_SETTABLE | OPT_FLAG_DOC,
+        .tags = {"replicate"},
+        .description = "If this option is enabled, readdirp will force "
+                       "lookups on those entries read whose read child is "
+                       "not the same as that of the parent. This will "
+                       "guarantee that all read operations on a file serve "
+                       "attributes from the same subvol as long as it holds "
+                       " a good copy of the file/dir.",
+    },
+    {.key = {"arbiter-count"},
+     .type = GF_OPTION_TYPE_INT,
+     .description = "subset of child_count. Has to be 0 or 1."},
+    {
+        .key = {"thin-arbiter"},
+        .type = GF_OPTION_TYPE_STR,
+        .op_version = {GD_OP_VERSION_4_1_0},
+        .flags = OPT_FLAG_SETTABLE,
+        .tags = {"replicate"},
+        .description = "contains host:path of thin arbiter brick",
+    },
+    {.key = {"shd-max-threads"},
+     .type = GF_OPTION_TYPE_INT,
+     .min = 1,
+     .max = 64,
+     .default_value = "1",
+     .op_version = {GD_OP_VERSION_3_7_12},
+     .flags = OPT_FLAG_CLIENT_OPT | OPT_FLAG_SETTABLE | OPT_FLAG_DOC,
+     .tags = {"replicate"},
+     .description = "Maximum number of parallel heals SHD can do per "
+                    "local brick. This can substantially lower heal times"
+                    ", but can also crush your bricks if you don't have "
+                    "the storage hardware to support this."},
+    {
+        .key = {"shd-wait-qlength"},
+        .type = GF_OPTION_TYPE_INT,
+        .min = 1,
+        .max = 655536,
+        .default_value = "1024",
+        .op_version = {GD_OP_VERSION_3_7_12},
+        .flags = OPT_FLAG_CLIENT_OPT | OPT_FLAG_SETTABLE | OPT_FLAG_DOC,
+        .tags = {"replicate"},
+        .description = "This option can be used to control number of heals"
+                       " that can wait in SHD per subvolume",
+    },
+    {
+        .key = {"locking-scheme"},
+        .type = GF_OPTION_TYPE_STR,
+        .value = {"full", "granular"},
+        .default_value = "full",
+        .op_version = {GD_OP_VERSION_3_7_12},
+        .flags = OPT_FLAG_CLIENT_OPT | OPT_FLAG_SETTABLE | OPT_FLAG_DOC,
+        .tags = {"replicate"},
+        .description = "If this option is set to granular, self-heal will "
+                       "stop being compatible with afr-v1, which helps afr "
+                       "be more granular while self-healing",
+    },
+    {.key = {"full-lock"},
+     .type = GF_OPTION_TYPE_BOOL,
+     .default_value = "yes",
+     .op_version = {GD_OP_VERSION_3_13_2},
+     .flags = OPT_FLAG_CLIENT_OPT | OPT_FLAG_SETTABLE,
+     .tags = {"replicate"},
+     .description = "If this option is disabled, then the IOs will take "
+                    "range locks same as versions till 3.13.1."},
+    {
+        .key = {"granular-entry-heal"},
+        .type = GF_OPTION_TYPE_BOOL,
+        .default_value = "no",
+        .op_version = {GD_OP_VERSION_3_8_0},
+        .flags = OPT_FLAG_CLIENT_OPT | OPT_FLAG_SETTABLE | OPT_FLAG_DOC,
+        .tags = {"replicate"},
+        .description = "If this option is enabled, self-heal will resort to "
+                       "granular way of recording changelogs and doing entry "
+                       "self-heal.",
+    },
+    {
+        .key = {"favorite-child-policy"},
+        .type = GF_OPTION_TYPE_STR,
+        .value = {"none", "size", "ctime", "mtime", "majority"},
+        .default_value = "none",
+        .op_version = {GD_OP_VERSION_3_7_12},
+        .flags = OPT_FLAG_CLIENT_OPT | OPT_FLAG_SETTABLE | OPT_FLAG_DOC,
+        .tags = {"replicate"},
+        .description = "This option can be used to automatically resolve "
+                       "split-brains using various policies without user "
+                       "intervention. \"size\" picks the file with the "
+                       "biggest size as the source. \"ctime\" and \"mtime\" "
+                       "pick the file with the latest ctime and mtime "
+                       "respectively as the source. \"majority\" picks a file"
+                       " with identical mtime and size in more than half the "
+                       "number of bricks in the replica.",
+    },
+    {
+        .key = {"consistent-io"},
+        .type = GF_OPTION_TYPE_BOOL,
+        .default_value = "no",
+        .description = "If this option is enabled, i/o will fail even if "
+                       "one of the bricks is down in the replicas",
+    },
+    {.key = {"use-compound-fops"},
+     .type = GF_OPTION_TYPE_BOOL,
+     .default_value = "no",
+     .op_version = {GD_OP_VERSION_3_8_4},
+     .flags = OPT_FLAG_CLIENT_OPT | OPT_FLAG_SETTABLE | OPT_FLAG_DOC,
+     .tags = {"replicate"},
+     .description = "This option exists only for backward compatibility "
+                    "and configuring it doesn't have any effect"},
+    {.key = {"use-anonymous-inode"},
+     .type = GF_OPTION_TYPE_BOOL,
+     .default_value = "no",
+     .op_version = {GD_OP_VERSION_8_0},
+     .flags = OPT_FLAG_CLIENT_OPT | OPT_FLAG_SETTABLE,
+     .tags = {"replicate"},
+     .description = "Setting this option heals directory renames efficiently"},
+
+    {.key = {NULL}},
+};
+
+xlator_api_t xlator_api = {
+    .init = init,
+    .fini = fini,
+    .notify = notify,
+    .reconfigure = reconfigure,
+    .mem_acct_init = mem_acct_init,
+    .op_version = {1}, /* Present from the initial version */
+    .dumpops = &dumpops,
+    .fops = &fops,
+    .cbks = &cbks,
+    .options = options, /* presumably the table ending just above - confirm */
+    .identifier = "replicate",
+    .category = GF_MAINTAINED,
 };
diff --git a/xlators/cluster/afr/src/afr.h b/xlators/cluster/afr/src/afr.h
index 02c69597b0d..d62f9a9caf2 100644
--- a/xlators/cluster/afr/src/afr.h
+++ b/xlators/cluster/afr/src/afr.h
@@ -1,523 +1,1423 @@
 /*
-   Copyright (c) 2008-2009 Z RESEARCH, Inc. <http://www.zresearch.com>
-   This file is part of GlusterFS.
-
-   GlusterFS is free software; you can redistribute it and/or modify
-   it under the terms of the GNU General Public License as published
-   by the Free Software Foundation; either version 3 of the License,
-   or (at your option) any later version.
-
-   GlusterFS is distributed in the hope that it will be useful, but
-   WITHOUT ANY WARRANTY; without even the implied warranty of
-   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
-   General Public License for more details.
-
-   You should have received a copy of the GNU General Public License
-   along with this program.  If not, see
-   <http://www.gnu.org/licenses/>.
-*/
+  Copyright (c) 2008-2012 Red Hat, Inc. <http://www.redhat.com>
+  This file is part of GlusterFS.
 
+  This file is licensed to you under your choice of the GNU Lesser
+  General Public License, version 3 or any later version (LGPLv3 or
+  later), or the GNU General Public License, version 2 (GPLv2), in all
+  cases as published by the Free Software Foundation.
+*/
 
 #ifndef __AFR_H__
 #define __AFR_H__
 
-#ifndef _CONFIG_H
-#define _CONFIG_H
-#include "config.h"
-#endif
+#include <glusterfs/call-stub.h>
+#include <glusterfs/compat-errno.h>
+#include "afr-mem-types.h"
+
+#include "libxlator.h"
+#include <glusterfs/timer.h>
+#include <glusterfs/syncop.h>
+
+#include "afr-self-heald.h"
+#include "afr-messages.h"
+
+#define SHD_INODE_LRU_LIMIT 1
+#define AFR_PATHINFO_HEADER "REPLICATE:"
+#define AFR_SH_READDIR_SIZE_KEY "self-heal-readdir-size"
+#define AFR_SH_DATA_DOMAIN_FMT "%s:self-heal"
+#define AFR_DIRTY_DEFAULT AFR_XATTR_PREFIX ".dirty"
+#define AFR_DIRTY (((afr_private_t *)(THIS->private))->afr_dirty)
+
+#define AFR_LOCKEE_COUNT_MAX 3
+#define AFR_DOM_COUNT_MAX 3
+#define AFR_NUM_CHANGE_LOGS 3              /*data + metadata + entry*/
+#define AFR_DEFAULT_SPB_CHOICE_TIMEOUT 300 /*in seconds*/
+
+#define ARBITER_BRICK_INDEX 2
+#define THIN_ARBITER_BRICK_INDEX 2
+#define AFR_TA_DOM_NOTIFY "afr.ta.dom-notify"
+#define AFR_TA_DOM_MODIFY "afr.ta.dom-modify"
+
+#define AFR_LK_HEAL_DOM "afr.lock-heal.domain"
+
+#define AFR_HALO_MAX_LATENCY 99999
+#define AFR_ANON_DIR_PREFIX ".glusterfs-anonymous-inode"
+
+#define PFLAG_PENDING (1 << 0)
+#define PFLAG_SBRAIN (1 << 1)
+
+typedef int (*afr_lock_cbk_t)(call_frame_t *frame, xlator_t *this);
+
+typedef int (*afr_read_txn_wind_t)(call_frame_t *frame, xlator_t *this,
+                                   int subvol);
+
+typedef int (*afr_inode_refresh_cbk_t)(call_frame_t *frame, xlator_t *this,
+                                       int err);
+
+typedef int (*afr_changelog_resume_t)(call_frame_t *frame, xlator_t *this);
+
+#define AFR_COUNT(array, max)                                                  \
+    ({                                                                         \
+        int __i;                                                               \
+        int __res = 0;                                                         \
+        for (__i = 0; __i < (max); __i++)                                      \
+            if ((array)[__i])                                                  \
+                __res++;                                                       \
+        __res;                                                                 \
+    }) /* yields the number of non-zero entries in array[0 .. max-1] */
+#define AFR_INTERSECT(dst, src1, src2, max)                                    \
+    ({                                                                         \
+        int __i;                                                               \
+        for (__i = 0; __i < (max); __i++)                                      \
+            (dst)[__i] = (src1)[__i] && (src2)[__i];                           \
+    }) /* dst[i] = src1[i] && src2[i] for every i in [0, max) */
+#define AFR_CMP(a1, a2, len)                                                   \
+    ({                                                                         \
+        int __cmp = 0;                                                         \
+        int __i;                                                               \
+        for (__i = 0; __i < (len); __i++)                                      \
+            if ((a1)[__i] != (a2)[__i]) {                                      \
+                __cmp = 1;                                                     \
+                break;                                                         \
+            }                                                                  \
+        __cmp;                                                                 \
+    }) /* yields 0 when the first len elements are equal, 1 otherwise */
+#define AFR_IS_ARBITER_BRICK(priv, index)                                      \
+    (((priv)->arbiter_count == 1) && ((index) == ARBITER_BRICK_INDEX))
+
+#define AFR_SET_ERROR_AND_CHECK_SPLIT_BRAIN(ret, errnum)                       \
+    do {                                                                       \
+        local->op_ret = ret;                                                   \
+        local->op_errno = errnum;                                              \
+        if (local->op_errno == EIO)                                            \
+            gf_msg(this->name, GF_LOG_ERROR, local->op_errno,                  \
+                   AFR_MSG_SPLIT_BRAIN,                                        \
+                   "Failing %s on gfid %s: "                                   \
+                   "split-brain observed.",                                    \
+                   gf_fop_list[local->op], uuid_utoa(local->inode->gfid));     \
+    } while (0)
+
+#define AFR_ERROR_OUT_IF_FDCTX_INVALID(__fd, __this, __error, __label)         \
+    do {                                                                       \
+        afr_fd_ctx_t *__fd_ctx = NULL;                                         \
+        __fd_ctx = afr_fd_ctx_get(__fd, __this);                               \
+        if (__fd_ctx && __fd_ctx->is_fd_bad) {                                 \
+            __error = EBADF;                                                   \
+            goto __label;                                                      \
+        }                                                                      \
+    } while (0)
 
-#include "scheduler.h"
-#include "call-stub.h"
-#include "compat-errno.h"
+typedef enum {
+    AFR_READ_POLICY_FIRST_UP,
+    AFR_READ_POLICY_GFID_HASH,
+    AFR_READ_POLICY_GFID_PID_HASH,
+    AFR_READ_POLICY_LESS_LOAD,
+    AFR_READ_POLICY_LEAST_LATENCY,
+    AFR_READ_POLICY_LOAD_LATENCY_HYBRID,
+} afr_read_hash_mode_t;
 
+typedef enum {
+    AFR_FAV_CHILD_NONE,
+    AFR_FAV_CHILD_BY_SIZE,
+    AFR_FAV_CHILD_BY_CTIME,
+    AFR_FAV_CHILD_BY_MTIME,
+    AFR_FAV_CHILD_BY_MAJORITY,
+    AFR_FAV_CHILD_POLICY_MAX,
+} afr_favorite_child_policy;
 
-typedef struct _afr_private {
-	gf_lock_t lock;               /* to guard access to child_count, etc */
-	unsigned int child_count;     /* total number of children   */
+typedef enum {
+    AFR_SELFHEAL_DATA_FULL = 0,
+    AFR_SELFHEAL_DATA_DIFF,
+    AFR_SELFHEAL_DATA_DYNAMIC,
+} afr_data_self_heal_type_t;
+
+typedef enum {
+    AFR_CHILD_UNKNOWN = -1,
+    AFR_CHILD_ZERO,
+    AFR_CHILD_ONE,
+    AFR_CHILD_THIN_ARBITER,
+} afr_child_index;
 
-	xlator_t **children;
+typedef enum {
+    TA_WAIT_FOR_NOTIFY_LOCK_REL, /*FOP came after notify domain lock upcall
+                                   notification and waiting for its release.*/
+    TA_GET_INFO_FROM_TA_FILE,    /*FOP needs post-op on ta file to get
+                                  *info about which brick is bad.*/
+    TA_INFO_IN_MEMORY_SUCCESS,   /*Bad brick info is in memory and fop failed
+                                  *on BAD brick - Success*/
+    TA_INFO_IN_MEMORY_FAILED,    /*Bad brick info is in memory and fop failed
+                                  *on GOOD brick - Failed*/
+    TA_SUCCESS,                  /*FOP succeeded on both data bricks.*/
+} afr_ta_fop_state_t;
+
+struct afr_nfsd {
+    uint32_t halo_max_latency_msec;
+    gf_boolean_t iamnfsd;
+};
+
+typedef struct _afr_lk_heal_info {
+    fd_t *fd;
+    int32_t cmd;
+    struct gf_flock flock;
+    dict_t *xdata_req;
+    unsigned char *locked_nodes;
+    struct list_head pos;
+    gf_lkowner_t lk_owner;
+    pid_t pid;
+    int32_t *child_up_event_gen;
+    int32_t *child_down_event_gen;
+} afr_lk_heal_info_t;
 
-	unsigned char *child_up;
+typedef struct _afr_private {
+    gf_lock_t lock;             /* to guard access to child_count, etc */
+    unsigned int child_count;   /* total number of children   */
+    unsigned int arbiter_count; /*subset of child_count.
+                                  Has to be 0 or 1.*/
+
+    xlator_t **children;
+
+    inode_t *root_inode;
+
+    int favorite_child; /* subvolume to be preferred in resolving
+                                    split-brain cases */
+    /* For thin-arbiter. */
+    uuid_t ta_gfid;
+    unsigned int thin_arbiter_count; /* 0 or 1 at the moment.*/
+    int ta_bad_child_index;
+    int ta_event_gen;
+    unsigned int ta_in_mem_txn_count;
+    unsigned int ta_on_wire_txn_count;
+    struct list_head ta_waitq;
+    struct list_head ta_onwireq;
+
+    unsigned char *anon_inode;
+    unsigned char *child_up;
+    unsigned char *halo_child_up;
+    int64_t *child_latency;
+    unsigned char *local;
+
+    char **pending_key;
+
+    afr_data_self_heal_type_t data_self_heal_algorithm;
+    unsigned int data_self_heal_window_size; /* max number of pipelined
+                                                read/writes */
+
+    struct list_head heal_waiting; /*queue for files that need heal*/
+    uint32_t heal_wait_qlen; /*configurable queue length for heal_waiting*/
+    int32_t heal_waiters;    /* No. of elements currently in wait queue.*/
+
+    struct list_head healing;            /* queue for files that are undergoing
+                                            background heal*/
+    uint32_t background_self_heal_count; /*configurable queue length for
+                                           healing queue*/
+    int32_t healers; /* No. of elements currently undergoing background
+                      heal*/
+
+    gf_boolean_t release_ta_notify_dom_lock;
+
+    gf_boolean_t metadata_self_heal; /* on/off */
+    gf_boolean_t entry_self_heal;    /* on/off */
+
+    gf_boolean_t metadata_splitbrain_forced_heal; /* on/off */
+    int read_child;                               /* read-subvolume */
+    gf_atomic_t *pending_reads; /*No. of pending read cbks per child.*/
+
+    gf_timer_t *timer; /* launched when parent up is received */
+
+    unsigned int wait_count; /* # of servers to wait for success */
+
+    unsigned char ta_child_up;
+    gf_boolean_t optimistic_change_log;
+    gf_boolean_t eager_lock;
+    gf_boolean_t pre_op_compat; /* on/off */
+    uint32_t post_op_delay_secs;
+    unsigned int quorum_count;
+
+    off_t ta_notify_dom_lock_offset;
+    afr_favorite_child_policy fav_child_policy; /*Policy to use for automatic
+                                              resolution of split-brains.*/
+    afr_read_hash_mode_t hash_mode; /* for when read_child is not set */
+
+    int32_t *last_event;
+
+    /* @event_generation: Keeps count of number of events received which can
+       potentially impact consistency decisions. The events are CHILD_UP
+       and CHILD_DOWN, when we have to recalculate the freshness/staleness
+       of copies to detect if changes had happened while the other server
+       was down. CHILD_DOWN and CHILD_UP can also be received on network
+       disconnect/reconnects and not necessarily server going down/up.
+       Recalculating freshness/staleness on network events is equally
+       important as we might have had a network split brain.
+    */
+    uint32_t event_generation;
+    char vol_uuid[UUID_SIZE + 1];
+
+    gf_boolean_t choose_local;
+    gf_boolean_t did_discovery;
+    gf_boolean_t ensure_durability;
+    gf_boolean_t halo_enabled;
+    gf_boolean_t consistent_metadata;
+    gf_boolean_t need_heal;
+    gf_boolean_t granular_locks;
+    uint64_t sh_readdir_size;
+    char *sh_domain;
+    char *afr_dirty;
+
+    uint64_t spb_choice_timeout;
+
+    afr_self_heald_t shd;
+    struct afr_nfsd nfsd;
+
+    uint32_t halo_max_latency_msec;
+    uint32_t halo_max_replicas;
+    uint32_t halo_min_replicas;
+
+    gf_boolean_t full_lock;
+    gf_boolean_t esh_granular;
+    gf_boolean_t consistent_io;
+    gf_boolean_t data_self_heal; /* on/off */
+    gf_boolean_t use_anon_inode;
+
+    /*For lock healing.*/
+    struct list_head saved_locks;
+    struct list_head lk_healq;
+
+    /*For anon-inode handling */
+    char anon_inode_name[NAME_MAX + 1];
+    char anon_gfid_str[UUID_SIZE + 1];
+} afr_private_t;
+
+typedef enum {
+    AFR_DATA_TRANSACTION,         /* truncate, write, ... */
+    AFR_METADATA_TRANSACTION,     /* chmod, chown, ... */
+    AFR_ENTRY_TRANSACTION,        /* create, rmdir, ... */
+    AFR_ENTRY_RENAME_TRANSACTION, /* rename */
+} afr_transaction_type;
 
-	gf_boolean_t data_self_heal;       /* on/off */
-	gf_boolean_t metadata_self_heal;   /* on/off */
-	gf_boolean_t entry_self_heal;      /* on/off */
+/*
+  xattr format: trusted.afr.volume = [x y z]
+  x - data pending
+  y - metadata pending
+  z - entry pending
+*/
 
+static inline int
+afr_index_for_transaction_type(afr_transaction_type type)
+{
+    switch (type) {
+        case AFR_DATA_TRANSACTION:
+            return 0;
 
-	gf_boolean_t data_change_log;       /* on/off */
-	gf_boolean_t metadata_change_log;   /* on/off */
-	gf_boolean_t entry_change_log;      /* on/off */
+        case AFR_METADATA_TRANSACTION:
+            return 1;
 
-	unsigned int read_child;      /* read-subvolume */
-	unsigned int favorite_child;  /* subvolume to be preferred in resolving
-					 split-brain cases */
+        case AFR_ENTRY_TRANSACTION:
+        case AFR_ENTRY_RENAME_TRANSACTION:
+            return 2;
+    }
 
-	unsigned int data_lock_server_count;
-	unsigned int metadata_lock_server_count;
-	unsigned int entry_lock_server_count;
+    return -1; /* make gcc happy */
+}
 
-	unsigned int wait_count;      /* # of servers to wait for success */
-} afr_private_t;
+static inline int
+afr_index_from_ia_type(ia_type_t type)
+{
+    /* Directories use the entry index, regular files the data index. */
+    if (type == IA_IFDIR)
+        return afr_index_for_transaction_type(AFR_ENTRY_TRANSACTION);
+
+    if (type == IA_IFREG)
+        return afr_index_for_transaction_type(AFR_DATA_TRANSACTION);
+
+    return -1; /* every other inode type has no index here */
+}
 
 typedef struct {
-	/* array of stat's, one for each child */
-	struct stat *buf;
+    struct gf_flock flock;
+    loc_t loc;
+    fd_t *fd;
+    char *basename;
+    unsigned char *locked_nodes;
+    int locked_count;
 
-	/* array of xattr's, one for each child */
-	dict_t **xattr;
+} afr_lockee_t;
 
-	/* array of errno's, one for each child */
-	int *child_errno;
-
-	int32_t **pending_matrix;
-	int32_t **delta_matrix;
+int
+afr_entry_lockee_cmp(const void *l1, const void *l2);
 
-	int *sources;
-	int source;
-	int active_source;
-	int active_sinks;
-	int *success;
+typedef struct {
+    loc_t *lk_loc;
+
+    afr_lockee_t lockee[AFR_LOCKEE_COUNT_MAX];
+
+    const char *lk_basename;
+    const char *lower_basename;
+    const char *higher_basename;
+
+    unsigned char *lower_locked_nodes;
+
+    afr_lock_cbk_t lock_cbk;
+
+    int lockee_count;
+
+    int32_t lk_call_count;
+    int32_t lk_expected_count;
+    int32_t lk_attempted_count;
+
+    int32_t lock_op_ret;
+    int32_t lock_op_errno;
+    char *domain; /* Domain on which inode/entry lock/unlock in progress.*/
+    int32_t lock_count;
+    char lower_locked;
+    char higher_locked;
+} afr_internal_lock_t;
+
+struct afr_reply {
+    int valid;
+    int32_t op_ret;
+    dict_t *xattr; /*For xattrop*/
+    dict_t *xdata;
+    struct iatt poststat;
+    struct iatt postparent;
+    struct iatt prestat;
+    struct iatt preparent;
+    struct iatt preparent2;
+    struct iatt postparent2;
+    int32_t op_errno;
+    /* For rchecksum */
+    uint8_t checksum[SHA256_DIGEST_LENGTH];
+    gf_boolean_t buf_has_zeroes;
+    gf_boolean_t fips_mode_rchecksum;
+    /* For lookup */
+    int8_t need_heal;
+};
 
-	fd_t *healing_fd;
-	int   op_failed;
+typedef enum {
+    AFR_FD_NOT_OPENED,
+    AFR_FD_OPENED,
+    AFR_FD_OPENING
+} afr_fd_open_status_t;
 
-	int   file_has_holes;
-	blksize_t block_size;
-	off_t file_size;
-	off_t offset;
+typedef struct {
+    afr_fd_open_status_t *opened_on; /* which subvolumes the fd is open on */
+    int flags;
 
-	loc_t parent_loc;
-	int (*completion_cbk) (call_frame_t *frame, xlator_t *this);
-	call_frame_t *sh_frame;
-} afr_self_heal_t;
+    /* the subvolume on which the latest sequence of readdirs (starting
+       at offset 0) has begun. Till the next readdir request with 0 offset
+       arrives, we continue to read off this subvol.
+    */
+    int readdir_subvol;
+    /* lock-healing related members. */
+    gf_boolean_t is_fd_bad;
+    afr_lk_heal_info_t *lk_heal_info;
 
+} afr_fd_ctx_t;
 
 typedef enum {
-	AFR_DATA_TRANSACTION,          /* truncate, write, ... */
-	AFR_METADATA_TRANSACTION,      /* chmod, chown, ... */
-	AFR_ENTRY_TRANSACTION,         /* create, rmdir, ... */
-	AFR_ENTRY_RENAME_TRANSACTION,  /* rename */
-	AFR_FLUSH_TRANSACTION,         /* flush */
-} afr_transaction_type;
+    AFR_FOP_LOCK_PARALLEL,
+    AFR_FOP_LOCK_SERIAL,
+    AFR_FOP_LOCK_QUORUM_FAILED,
+} afr_fop_lock_state_t;
+
+typedef struct _afr_inode_lock_t {
+    /* @num_inodelks:
+       Number of inodelks queried from the server, as queried through
+       xdata in FOPs. Currently, used to decide if eager-locking must be
+       temporarily disabled.
+    */
+    int32_t num_inodelks;
+    unsigned int event_generation;
+    gf_timer_t *delay_timer;
+    struct list_head owners;  /*Transactions that are performing fop*/
+    struct list_head post_op; /*Transactions that are done with the fop
+                               *So can not conflict with the fops*/
+    struct list_head waiting; /*Transactions that are waiting for
+                               *conflicting transactions to complete*/
+    struct list_head frozen;  /*Transactions that need to go as part of
+                               * next batch of eager-lock*/
+    gf_boolean_t release;
+    gf_boolean_t acquired;
+} afr_lock_t;
+
+typedef struct _afr_inode_ctx {
+    uint64_t read_subvol;
+    uint64_t write_subvol;
+    int lock_count;
+    int spb_choice;
+    gf_timer_t *timer;
+    unsigned int *pre_op_done[AFR_NUM_CHANGE_LOGS];
+    int inherited[AFR_NUM_CHANGE_LOGS];
+    int on_disk[AFR_NUM_CHANGE_LOGS];
+    /*Only 2 types of transactions support eager-locks now. DATA/METADATA*/
+    afr_lock_t lock[2];
+
+    /* @open_fd_count:
+       Number of open FDs queried from the server, as queried through
+       xdata in FOPs. Currently, used to decide if eager-locking must be
+       temporarily disabled.
+    */
+    uint32_t open_fd_count;
+    gf_boolean_t need_refresh;
+
+    /* set if any write on this fd was a non stable write
+       (i.e, without O_SYNC or O_DSYNC)
+    */
+    gf_boolean_t witnessed_unstable_write;
+} afr_inode_ctx_t;
 
 typedef struct _afr_local {
-	unsigned int call_count;
-	unsigned int success_count;
-	unsigned int enoent_count;
-
-	unsigned int need_metadata_self_heal;
-	unsigned int need_entry_self_heal;
-	unsigned int need_data_self_heal;
-	unsigned int govinda_gOvinda;
-
-	unsigned int reval_child_index;
-	int32_t op_ret;
-	int32_t op_errno;
-
-	int32_t *pending_array;
-
-	loc_t loc;
-	loc_t newloc;
-
-	fd_t *fd;
-
-	glusterfs_fop_t fop;
-
-	unsigned char *child_up; 
-	int            child_count;
-
-	int32_t *child_errno;
-	
-	dict_t  *xattr_req;
-	int      open_fd_count;
-	/* 
-	   This struct contains the arguments for the "continuation"
-	   (scheme-like) of fops
-	*/
-
-	int   op;
-	struct {
-		struct {
-			unsigned char buf_set;
-			struct statvfs buf;
-		} statfs;
-
-		struct {
-			inode_t *inode;
-			struct stat buf;
-			dict_t *xattr;
-		} lookup;
-
-		struct {
-			int32_t flags;
-		} open;
-
-		struct {
-			int32_t cmd;
-			struct flock flock;
-			unsigned char *locked_nodes;
-		} lk;
-
-		struct {
-			uint8_t *file_checksum;
-			uint8_t *dir_checksum;
-		} checksum;
-
-		/* inode read */
-
-		struct {
-			int32_t mask;
-			int last_tried;  /* index of the child we tried previously */
-		} access;
-
-		struct {
-			int last_tried;
-			ino_t ino;
-		} stat;
-
-		struct {
-			int last_tried;
-			ino_t ino;
-		} fstat;
-
-		struct {
-			size_t size;
-			int last_tried;
-		} readlink;
-
-		struct {
-			const char *name;
-			int last_tried;
-		} getxattr;
-
-		struct {
-			size_t size;
-			off_t offset;
-			int last_tried;
-		} readv;
-
-		/* dir read */
-
-		struct {
-			int success_count;
-			int32_t op_ret;
-			int32_t op_errno;
-		} opendir;
-
-		struct {
-			int32_t op_ret;
-			int32_t op_errno;
-			size_t size;
-			off_t offset;
-
-			int last_tried;
-		} readdir;
-
-		struct {
-			int32_t op_ret;
-			int32_t op_errno;
-
-			size_t size;
-			off_t offset;
-			int32_t flag;
-
-			int last_tried;
-		} getdents;
-
-		/* inode write */
-
-		struct {
-			ino_t ino;
-			mode_t mode;
-			struct stat buf;
-		} chmod;
-
-		struct {
-			ino_t ino;
-			mode_t mode;
-			struct stat buf;
-		} fchmod;
-
-		struct {
-			ino_t ino;
-			uid_t uid;
-			gid_t gid;
-			struct stat buf;
-		} chown;
-
-		struct {
-			ino_t ino;
-			uid_t uid;
-			gid_t gid;
-			struct stat buf;
-		} fchown;
-		
-		struct {
-			ino_t ino;
-			struct stat buf;
-
-			int32_t op_ret;
-
-			struct iovec *vector;
-			dict_t *refs;
-			int32_t count;
-			off_t offset;
-		} writev;
-
-		struct {
-			ino_t ino;
-			off_t offset;
-			struct stat buf;
-		} truncate;
-
-		struct {
-			ino_t ino;
-			off_t offset;
-			struct stat buf;
-		} ftruncate;
-
-		struct {
-			ino_t ino;
-			struct timespec tv[2];
-			struct stat buf;
-		} utimens;
-
-		struct {
-			dict_t *dict;
-			int32_t flags;
-		} setxattr;
-
-		struct {
-			const char *name;
-		} removexattr;
-
-		/* dir write */
-		
-		struct {
-			ino_t ino;
-			fd_t *fd;
-			int32_t flags;
-			mode_t mode;
-			inode_t *inode;
-			struct stat buf;
-		} create;
-
-		struct {
-			ino_t ino;
-			dev_t dev;
-			mode_t mode;
-			inode_t *inode;
-			struct stat buf;
-		} mknod;
-
-		struct {
-			ino_t ino;
-			int32_t mode;
-			inode_t *inode;
-			struct stat buf;
-		} mkdir;
-
-		struct {
-			int32_t op_ret;
-			int32_t op_errno;
-		} unlink;
-
-		struct {
-			int32_t op_ret;
-			int32_t op_errno;
-		} rmdir;
-
-		struct {
-			ino_t ino;
-			struct stat buf;
-		} rename;
-
-		struct {
-			ino_t ino;
-			inode_t *inode;
-			struct stat buf;
-		} link;
-
-		struct {
-			ino_t ino;
-			inode_t *inode;
-			struct stat buf;
-			char *linkpath;
-		} symlink;
-
-		struct {
-			int32_t flags;
-			dir_entry_t *entries;
-			int32_t count;
-		} setdents;
-	} cont;
-	
-	struct {
-		off_t start, len;
-
-		unsigned char *locked_nodes;
-		int lock_count;
-
-		const char *basename;
-		const char *new_basename;
-
-		char *pending;
-
-		loc_t parent_loc;
-		loc_t new_parent_loc;
-
-		afr_transaction_type type;
-
-		int success_count;
-		int erase_pending;
-		int failure_count;
-
-		int last_tried;
-		int32_t *child_errno;
-
-		call_frame_t *main_frame;
-
-		int (*fop) (call_frame_t *frame, xlator_t *this);
-
-		int (*done) (call_frame_t *frame, xlator_t *this);
-
-		int (*resume) (call_frame_t *frame, xlator_t *this);
-
-		int (*unwind) (call_frame_t *frame, xlator_t *this);
-	} transaction;
-
-	afr_self_heal_t self_heal;
+    glusterfs_fop_t op;
+    unsigned int call_count;
+
+    /* @event_generation: copy of priv->event_generation taken at the
+       time of starting the transaction. The copy is made so that we
+       have a stable value through the various phases of the transaction.
+    */
+    unsigned int event_generation;
+
+    uint32_t open_fd_count;
+    int32_t num_inodelks;
+
+    int32_t op_ret;
+    int32_t op_errno;
+
+    int dirty[AFR_NUM_CHANGE_LOGS];
+
+    int32_t **pending;
+
+    loc_t loc;
+    loc_t newloc;
+
+    fd_t *fd;
+    afr_fd_ctx_t *fd_ctx;
+
+    /* @child_up: copy of priv->child_up taken at the time of transaction
+       start. The copy is taken so that we have a stable child_up array
+       through the phases of the transaction as priv->child_up[i] can keep
+       changing through time.
+    */
+    unsigned char *child_up;
+
+    /* @read_attempted:
+       array of flags representing subvolumes where read operations of
+       the read transaction have already been attempted. The array is
+       first pre-filled with down subvolumes, and as reads are performed
+       on other subvolumes, those are set as well. This way if the read
+       operation fails we do not retry on that subvolume again.
+    */
+    unsigned char *read_attempted;
+
+    /* @readfn:
+
+       pointer to function which will perform the read operation on a given
+       subvolume. Used in read transactions.
+    */
+
+    afr_read_txn_wind_t readfn;
+
+    /* @inode:
+
+       the inode on which the read txn is performed. ref'ed and copied
+       from either fd->inode or loc.inode
+    */
+
+    inode_t *inode;
+
+    /* @parent[2]:
+
+       parent inode[s] on which directory transactions are performed.
+    */
+
+    inode_t *parent;
+    inode_t *parent2;
+
+    /* @readable:
+
+       array of flags representing servers from which a read can be
+       performed. This is the output of afr_inode_refresh()
+    */
+    unsigned char *readable;
+    unsigned char *readable2; /*For rename transaction*/
+
+    afr_inode_refresh_cbk_t refreshfn;
+
+    /* @refreshinode:
+
+       Inode currently getting refreshed.
+    */
+    inode_t *refreshinode;
+
+    dict_t *xattr_req;
+
+    dict_t *dict;
+
+    int read_subvol; /* Current read subvolume */
+
+    int optimistic_change_log;
+
+    afr_internal_lock_t internal_lock;
+
+    /*To handle setattr/setxattr on yet to be linked inode from dht*/
+    uuid_t refreshgfid;
+
+    /* @refreshed:
+
+       the inode was "refreshed" (i.e, pending xattrs from all subvols
+       freshly inspected and inode ctx updated accordingly) as part of
+       this transaction already.
+    */
+    gf_boolean_t refreshed;
+
+    gf_boolean_t update_num_inodelks;
+    gf_boolean_t update_open_fd_count;
+
+    /*
+      @pre_op_compat:
+
+      compatibility mode of pre-op. send a separate pre-op and
+      op operations as part of transaction, rather than combining
+    */
+
+    gf_boolean_t pre_op_compat;
+
+    /* Is the current writev() going to perform a stable write?
+       i.e., does fd->flags or the @flags writev param have O_SYNC or
+       O_DSYNC?
+    */
+    gf_boolean_t stable_write;
+
+    /* This write appended to the file. Not necessarily O_APPEND,
+       just means the offset of write was at the end of file.
+    */
+    gf_boolean_t append_write;
+
+    /*
+      This struct contains the arguments for the "continuation"
+      (scheme-like) of fops
+    */
+
+    struct {
+        struct {
+            struct statvfs buf;
+            unsigned char buf_set;
+        } statfs;
+
+        struct {
+            fd_t *fd;
+            int32_t flags;
+        } open;
+
+        struct {
+            struct gf_flock user_flock;
+            struct gf_flock ret_flock;
+            unsigned char *locked_nodes;
+            int32_t cmd;
+            /*For lock healing only.*/
+            unsigned char *dom_locked_nodes;
+            int32_t *dom_lock_op_ret;
+            int32_t *dom_lock_op_errno;
+            struct gf_flock *getlk_rsp;
+        } lk;
+
+        /* inode read */
+
+        struct {
+            int32_t mask;
+            int last_index; /* index of the child we tried previously */
+        } access;
+
+        struct {
+            int last_index;
+        } stat;
+
+        struct {
+            int last_index;
+        } fstat;
+
+        struct {
+            size_t size;
+            int last_index;
+        } readlink;
+
+        struct {
+            char *name;
+            long xattr_len;
+            int last_index;
+        } getxattr;
+
+        struct {
+            size_t size;
+            off_t offset;
+            int last_index;
+            uint32_t flags;
+        } readv;
+
+        /* dir read */
+
+        struct {
+            uint32_t *checksum;
+            int success_count;
+            int32_t op_ret;
+            int32_t op_errno;
+        } opendir;
+
+        struct {
+            int32_t op_ret;
+            int32_t op_errno;
+            size_t size;
+            off_t offset;
+            dict_t *dict;
+            int last_index;
+            gf_boolean_t failed;
+        } readdir;
+        /* inode write */
+
+        struct {
+            struct iatt prebuf;
+            struct iatt postbuf;
+        } inode_wfop;  // common structure for all inode-write-fops
+
+        struct {
+            struct iovec *vector;
+            struct iobref *iobref;
+            off_t offset;
+            int32_t op_ret;
+            int32_t count;
+            uint32_t flags;
+        } writev;
+
+        struct {
+            off_t offset;
+        } truncate;
+
+        struct {
+            off_t offset;
+        } ftruncate;
+
+        struct {
+            struct iatt in_buf;
+            int32_t valid;
+        } setattr;
+
+        struct {
+            struct iatt in_buf;
+            int32_t valid;
+        } fsetattr;
+
+        struct {
+            dict_t *dict;
+            int32_t flags;
+        } setxattr;
+
+        struct {
+            dict_t *dict;
+            int32_t flags;
+        } fsetxattr;
+
+        struct {
+            char *name;
+        } removexattr;
+
+        struct {
+            dict_t *xattr;
+            gf_xattrop_flags_t optype;
+        } xattrop;
+
+        /* dir write */
+
+        struct {
+            inode_t *inode;
+            struct iatt buf;
+            struct iatt preparent;
+            struct iatt postparent;
+            struct iatt prenewparent;
+            struct iatt postnewparent;
+        } dir_fop;  // common structure for all dir fops
+
+        struct {
+            fd_t *fd;
+            dict_t *params;
+            int32_t flags;
+            mode_t mode;
+        } create;
+
+        struct {
+            dict_t *params;
+            dev_t dev;
+            mode_t mode;
+        } mknod;
+
+        struct {
+            dict_t *params;
+            int32_t mode;
+        } mkdir;
+
+        struct {
+            dict_t *params;
+            char *linkpath;
+        } symlink;
+
+        struct {
+            off_t offset;
+            size_t len;
+            int32_t mode;
+        } fallocate;
+
+        struct {
+            off_t offset;
+            size_t len;
+        } discard;
+
+        struct {
+            off_t offset;
+            off_t len;
+            struct iatt prebuf;
+            struct iatt postbuf;
+        } zerofill;
+
+        struct {
+            char *volume;
+            int32_t cmd;
+            int32_t in_cmd;
+            struct gf_flock in_flock;
+            struct gf_flock flock;
+            void *xdata;
+        } inodelk;
+
+        struct {
+            char *volume;
+            char *basename;
+            void *xdata;
+            entrylk_cmd in_cmd;
+            entrylk_cmd cmd;
+            entrylk_type type;
+        } entrylk;
+
+        struct {
+            off_t offset;
+            gf_seek_what_t what;
+        } seek;
+
+        struct {
+            struct gf_lease user_lease;
+            struct gf_lease ret_lease;
+            unsigned char *locked_nodes;
+        } lease;
+
+        struct {
+            int flags;
+        } rmdir;
+
+        struct {
+            int32_t datasync;
+        } fsync;
+
+        struct {
+            uuid_t gfid_req;
+            gf_boolean_t needs_fresh_lookup;
+        } lookup;
+
+    } cont;
+
+    struct {
+        char *basename;
+        char *new_basename;
+
+        loc_t parent_loc;
+        loc_t new_parent_loc;
+
+        /* stub to resume on destruction
+           of the transaction frame */
+        call_stub_t *resume_stub;
+
+        struct list_head owner_list;
+        struct list_head wait_list;
+
+        unsigned char *pre_op;
+
+        /* Changelog xattr dict for [f]xattrop*/
+        dict_t **changelog_xdata;
+        unsigned char *pre_op_sources;
+
+        /* @failed_subvols: subvolumes on which a pre-op or a
+            FOP failed. */
+        unsigned char *failed_subvols;
+
+        call_frame_t *main_frame; /*Fop frame*/
+        call_frame_t *frame;      /*Transaction frame*/
+
+        int (*wind)(call_frame_t *frame, xlator_t *this, int subvol);
+
+        int (*unwind)(call_frame_t *frame, xlator_t *this);
+
+        off_t start, len;
+
+        afr_transaction_type type;
+
+        int32_t in_flight_sb_errno; /* This is where the cause of the
+                                       failure on the last good copy of
+                                       the file is stored.
+                                       */
+
+        /* @changelog_resume: function to be called after changelogging
+           (either pre-op or post-op) is done
+        */
+        afr_changelog_resume_t changelog_resume;
+
+        gf_boolean_t eager_lock_on;
+        gf_boolean_t do_eager_unlock;
+
+        /* @dirtied: flag which indicates whether we set dirty flag
+           in the OP. Typically true when we are performing operation
+           on more than one subvol and optimistic changelog is disabled
+
+           A 'true' value set in @dirtied flag means an 'undirtying'
+           has to be done in POST-OP phase.
+        */
+        gf_boolean_t dirtied;
+
+        /* @inherited: flag which indicates that the dirty flags
+           of the previous transaction were inherited
+        */
+        gf_boolean_t inherited;
+
+        /*
+          @no_uninherit: flag which indicates that a pre_op_uninherit()
+          must _not_ be attempted (and returned as failure) always. This
+          flag is set when a hard pre-op is performed, but not accounted
+          for it in fd_ctx->on_disk[]. Such transactions are "isolated"
+          from the pre-op piggybacking entirely and therefore uninherit
+          must not be attempted.
+        */
+        gf_boolean_t no_uninherit;
+
+        gf_boolean_t in_flight_sb; /* Indicator for occurrence of
+                                      split-brain while in the middle of
+                                      a txn. */
+
+        /* @uninherit_done:
+           @uninherit_value:
+
+           The above pair of variables makes pre_op_uninherit() idempotent.
+           Both are FALSE initially. The first call to pre_op_uninherit
+           sets @uninherit_done to TRUE and the return value to
+           @uninherit_value. Further calls will check for @uninherit_done
+           to be TRUE and if so will simply return @uninherit_value.
+        */
+        gf_boolean_t uninherit_done;
+        gf_boolean_t uninherit_value;
+
+        gf_boolean_t disable_delayed_post_op;
+    } transaction;
+
+    syncbarrier_t barrier;
+
+    /* extra data for fops */
+    dict_t *xdata_req;
+    dict_t *xdata_rsp;
+
+    dict_t *xattr_rsp; /*for [f]xattrop*/
+
+    mode_t umask;
+    int xflag;
+    struct afr_reply *replies;
+
+    /* For  client side background heals. */
+    struct list_head healer;
+    call_frame_t *heal_frame;
+
+    afr_inode_ctx_t *inode_ctx;
+
+    /*For thin-arbiter transactions.*/
+    int ta_failed_subvol;
+    int ta_event_gen;
+    struct list_head ta_waitq;
+    struct list_head ta_onwireq;
+    afr_ta_fop_state_t fop_state;
+    afr_fop_lock_state_t fop_lock_state;
+    gf_lkowner_t saved_lk_owner;
+    unsigned char read_txn_query_child;
+    unsigned char ta_child_up;
+    gf_boolean_t do_discovery;
+    gf_boolean_t need_full_crawl;
+    gf_boolean_t is_read_txn;
+    gf_boolean_t is_new_entry;
 } afr_local_t;
 
-/* try alloc and if it fails, goto label */
-#define ALLOC_OR_GOTO(var, type, label) do {			\
-		var = CALLOC (sizeof (type), 1);		\
-		if (!var) {					\
-			gf_log (this->name, GF_LOG_ERROR,	\
-				"out of memory :(");		\
-			op_errno = ENOMEM;			\
-			goto label;				\
-		}						\
-	} while (0);
+typedef struct afr_spbc_timeout {
+    call_frame_t *frame;
+    loc_t *loc;
+    int spb_child_index;
+    gf_boolean_t d_spb;
+    gf_boolean_t m_spb;
+} afr_spbc_timeout_t;
+
+typedef struct afr_spb_status {
+    call_frame_t *frame;
+    loc_t *loc;
+} afr_spb_status_t;
+
+typedef struct afr_empty_brick_args {
+    call_frame_t *frame;
+    char *op_type;
+    loc_t loc;
+    int empty_index;
+} afr_empty_brick_args_t;
+
+typedef struct afr_read_subvol_args {
+    ia_type_t ia_type;
+    uuid_t gfid;
+} afr_read_subvol_args_t;
+
+typedef struct afr_granular_esh_args {
+    fd_t *heal_fd;
+    xlator_t *xl;
+    call_frame_t *frame;
+    gf_boolean_t mismatch; /* flag to represent occurrence of type/gfid
+                              mismatch */
+} afr_granular_esh_args_t;
+
+int
+afr_inode_get_readable(call_frame_t *frame, inode_t *inode, xlator_t *this,
+                       unsigned char *readable, int *event_p, int type);
+int
+afr_inode_read_subvol_get(inode_t *inode, xlator_t *this,
+                          unsigned char *data_subvols,
+                          unsigned char *metadata_subvols,
+                          int *event_generation);
+int
+__afr_inode_read_subvol_get(inode_t *inode, xlator_t *this,
+                            unsigned char *data_subvols,
+                            unsigned char *metadata_subvols,
+                            int *event_generation);
+
+int
+__afr_inode_read_subvol_set(inode_t *inode, xlator_t *this,
+                            unsigned char *data_subvols,
+                            unsigned char *metadata_subvol,
+                            int event_generation);
+int
+afr_inode_read_subvol_set(inode_t *inode, xlator_t *this,
+                          unsigned char *data_subvols,
+                          unsigned char *metadata_subvols,
+                          int event_generation);
+
+int
+__afr_inode_need_refresh_set(inode_t *inode, xlator_t *this);
+
+int
+afr_inode_need_refresh_set(inode_t *inode, xlator_t *this);
+
+int
+afr_read_subvol_select_by_policy(inode_t *inode, xlator_t *this,
+                                 unsigned char *readable,
+                                 afr_read_subvol_args_t *args);
+
+int
+afr_inode_read_subvol_type_get(inode_t *inode, xlator_t *this,
+                               unsigned char *readable, int *event_p, int type);
+int
+afr_read_subvol_get(inode_t *inode, xlator_t *this, int *subvol_p,
+                    unsigned char *readables, int *event_p,
+                    afr_transaction_type type, afr_read_subvol_args_t *args);
+
+#define afr_data_subvol_get(i, t, s, r, e, a)                                  \
+    afr_read_subvol_get(i, t, s, r, e, AFR_DATA_TRANSACTION, a)
 
+#define afr_metadata_subvol_get(i, t, s, r, e, a)                              \
+    afr_read_subvol_get(i, t, s, r, e, AFR_METADATA_TRANSACTION, a)
 
-/* did a call fail due to a child failing? */
-#define child_went_down(op_ret, op_errno) (((op_ret) < 0) &&	      \
-					   ((op_errno == ENOTCONN) || \
-					    (op_errno == EBADFD)))
+int
+afr_inode_refresh(call_frame_t *frame, xlator_t *this, inode_t *inode,
+                  uuid_t gfid, afr_inode_refresh_cbk_t cbk);
+
+int32_t
+afr_notify(xlator_t *this, int32_t event, void *data, void *data2);
+
+int
+xattr_is_equal(dict_t *this, char *key1, data_t *value1, void *data);
+
+int
+afr_add_entry_lockee(afr_local_t *local, loc_t *loc, char *basename,
+                     int child_count);
+
+int
+afr_add_inode_lockee(afr_local_t *local, int child_count);
+
+void
+afr_lockees_cleanup(afr_internal_lock_t *int_lock);
 
-/* have we tried all children? */
-#define all_tried(i, count)  ((i) == (count) - 1)
+int
+afr_attempt_lock_recovery(xlator_t *this, int32_t child_index);
+
+int
+afr_mark_locked_nodes(xlator_t *this, fd_t *fd, unsigned char *locked_nodes);
 
 void
-afr_build_parent_loc (loc_t *parent, loc_t *child);
+afr_set_lk_owner(call_frame_t *frame, xlator_t *this, void *lk_owner);
+
+int
+afr_set_lock_number(call_frame_t *frame, xlator_t *this);
+
+int32_t
+afr_unlock(call_frame_t *frame, xlator_t *this);
+
+int
+afr_lock_nonblocking(call_frame_t *frame, xlator_t *this);
+
+int
+afr_blocking_lock(call_frame_t *frame, xlator_t *this);
+
+int
+afr_internal_lock_finish(call_frame_t *frame, xlator_t *this);
+
+int
+__afr_fd_ctx_set(xlator_t *this, fd_t *fd);
+
+afr_fd_ctx_t *
+afr_fd_ctx_get(fd_t *fd, xlator_t *this);
 
 int
-afr_up_children_count (int child_count, unsigned char *child_up);
+afr_build_parent_loc(loc_t *parent, loc_t *child, int32_t *op_errno);
 
 int
-afr_locked_nodes_count (unsigned char *locked_nodes, int child_count);
+afr_locked_nodes_count(unsigned char *locked_nodes, int child_count);
 
 int
-afr_first_up_child (afr_private_t *priv);
+afr_replies_interpret(call_frame_t *frame, xlator_t *this, inode_t *inode,
+                      gf_boolean_t *start_heal);
+
+void
+afr_local_replies_wipe(afr_local_t *local, afr_private_t *priv);
+
+void
+afr_local_cleanup(afr_local_t *local, xlator_t *this);
 
-ino64_t
-afr_itransform (ino64_t ino, int child_count, int child_index);
+int
+afr_frame_return(call_frame_t *frame);
 
 int
-afr_deitransform (ino64_t ino, int child_count);
+afr_open(call_frame_t *frame, xlator_t *this, loc_t *loc, int32_t flags,
+         fd_t *fd, dict_t *xdata);
 
 void
-afr_local_cleanup (afr_local_t *local, xlator_t *this);
-
-int
-afr_frame_return (call_frame_t *frame);
-
-#define AFR_STACK_UNWIND(frame, params ...)		\
-	do {						\
-		afr_local_t *__local = NULL;		\
-		xlator_t    *__this = NULL;		\
-		__local = frame->local;			\
-		__this = frame->this;			\
-		frame->local = NULL;                    \
-		STACK_UNWIND (frame, params);		\
-		afr_local_cleanup (__local, __this);	\
-		free (__local);				\
-} while (0);					
-
-#define AFR_STACK_DESTROY(frame)			\
-	do {						\
-		afr_local_t *__local = NULL;		\
-		xlator_t    *__this = NULL;		\
-		__local = frame->local;			\
-		__this = frame->this;			\
-		frame->local = NULL;                    \
-		STACK_DESTROY (frame->root);		\
-		afr_local_cleanup (__local, __this);	\
-		free (__local);				\
-} while (0);					
+afr_local_transaction_cleanup(afr_local_t *local, xlator_t *this);
+
+int
+afr_cleanup_fd_ctx(xlator_t *this, fd_t *fd);
+
+#define AFR_STACK_UNWIND(fop, frame, op_ret, op_errno, params...)              \
+    do {                                                                       \
+        afr_local_t *__local = NULL;                                           \
+        xlator_t *__this = NULL;                                               \
+        int32_t __op_ret = 0;                                                  \
+        int32_t __op_errno = 0;                                                \
+                                                                               \
+        __op_ret = op_ret;                                                     \
+        __op_errno = op_errno;                                                 \
+        if (frame) {                                                           \
+            __local = frame->local;                                            \
+            __this = frame->this;                                              \
+            afr_handle_inconsistent_fop(frame, &__op_ret, &__op_errno);        \
+            if (__local && __local->is_read_txn)                               \
+                afr_pending_read_decrement(__this->private,                    \
+                                           __local->read_subvol);              \
+            if (__local && __local->xdata_req &&                               \
+                afr_is_lock_mode_mandatory(__local->xdata_req))                \
+                afr_dom_lock_release(frame);                                   \
+            frame->local = NULL;                                               \
+        }                                                                      \
+                                                                               \
+        STACK_UNWIND_STRICT(fop, frame, __op_ret, __op_errno, params);         \
+        if (__local) {                                                         \
+            afr_local_cleanup(__local, __this);                                \
+            mem_put(__local);                                                  \
+        }                                                                      \
+    } while (0)
+
+#define AFR_STACK_DESTROY(frame)                                               \
+    do {                                                                       \
+        afr_local_t *__local = NULL;                                           \
+        xlator_t *__this = NULL;                                               \
+        __local = frame->local;                                                \
+        __this = frame->this;                                                  \
+        frame->local = NULL; /* detach; released below via __local */          \
+        STACK_DESTROY(frame->root);                                            \
+        if (__local) {                                                         \
+            afr_local_cleanup(__local, __this);                                \
+            mem_put(__local);                                                  \
+        }                                                                      \
+    } while (0)
+
+#define AFR_FRAME_INIT(frame, op_errno)                                        \
+    ({ /* GNU statement expression: evaluates to the last expression */        \
+        frame->local = mem_get0(THIS->local_pool); /* not NULL-checked here */ \
+        if (afr_local_init(frame->local, frame->this->private, &op_errno)) {   \
+            afr_local_cleanup(frame->local, frame->this);                      \
+            mem_put(frame->local);                                             \
+            frame->local = NULL; /* init failed: macro yields NULL */          \
+        };                                                                     \
+        frame->local; /* result: new local, or NULL on failure */              \
+    })
+
+#define AFR_STACK_RESET(frame)                                                 \
+    do {                                                                       \
+        afr_local_t *__local = NULL;                                           \
+        xlator_t *__this = NULL;                                               \
+        __local = frame->local;                                                \
+        __this = frame->this;                                                  \
+        frame->local = NULL; /* old local is released below */                 \
+        int __opr; /* op_errno sink for AFR_FRAME_INIT; never read */          \
+        STACK_RESET(frame->root);                                              \
+        if (__local) {                                                         \
+            afr_local_cleanup(__local, __this);                                \
+            mem_put(__local);                                                  \
+        }                                                                      \
+        AFR_FRAME_INIT(frame, __opr); /* attach a fresh local; may be NULL */  \
+    } while (0)
 
 /* allocate and return a string that is the basename of argument */
-static inline char * 
-AFR_BASENAME (const char *str)						
+static inline char *
+AFR_BASENAME(const char *str)
 {
-	char *__tmp_str = NULL;				
-	char *__basename_str = NULL;			
-	__tmp_str = strdup (str);			
-	__basename_str = strdup (basename (__tmp_str));	
-	FREE (__tmp_str);
-	return __basename_str;
+    char *__tmp_str = NULL;
+    char *__basename_str = NULL;
+    __tmp_str = gf_strdup(str);
+    __basename_str = gf_strdup(basename(__tmp_str));
+    GF_FREE(__tmp_str);
+    return __basename_str;
 }
 
-/* initialize local_t */
-static inline int
-AFR_LOCAL_INIT (afr_local_t *local, afr_private_t *priv)
-{
-	local->child_up = CALLOC (sizeof (*local->child_up),
-				  priv->child_count);
-	if (!local->child_up) {
-		return -ENOMEM;
-	}
+call_frame_t *
+afr_copy_frame(call_frame_t *base);
 
-	memcpy (local->child_up, priv->child_up, 
-		sizeof (*local->child_up) * priv->child_count);
+int
+afr_transaction_local_init(afr_local_t *local, xlator_t *this);
 
+int32_t
+afr_marker_getxattr(call_frame_t *frame, xlator_t *this, loc_t *loc,
+                    const char *name, afr_local_t *local, afr_private_t *priv);
 
-	local->call_count = afr_up_children_count (priv->child_count, local->child_up);
-	if (local->call_count == 0)
-		return -ENOTCONN;
+int
+afr_local_init(afr_local_t *local, afr_private_t *priv, int32_t *op_errno);
 
-	local->transaction.erase_pending = 1;
+int
+afr_internal_lock_init(afr_internal_lock_t *lk, size_t child_count);
 
-	local->op_ret = -1;
-	local->op_errno = EUCLEAN;
+int
+afr_higher_errno(int32_t old_errno, int32_t new_errno);
 
-	return 0;
-}
+int
+afr_final_errno(afr_local_t *local, afr_private_t *priv);
 
+int
+afr_xattr_req_prepare(xlator_t *this, dict_t *xattr_req);
 
-static inline int
-afr_transaction_local_init (afr_local_t *local, afr_private_t *priv)
-{
-	local->child_errno = CALLOC (sizeof (*local->child_errno),
-				     priv->child_count);
-	if (!local->child_errno) {
-		return -ENOMEM;
-	}
+void
+afr_fix_open(fd_t *fd, xlator_t *this);
 
-	local->pending_array = CALLOC (sizeof (*local->pending_array),
-				       priv->child_count);
-	if (!local->pending_array) {
-		return -ENOMEM;
-	}
+afr_fd_ctx_t *
+afr_fd_ctx_get(fd_t *fd, xlator_t *this);
 
-	local->transaction.locked_nodes = CALLOC (sizeof (*local->transaction.locked_nodes),
-						  priv->child_count);
+void
+afr_set_low_priority(call_frame_t *frame);
+int
+afr_child_fd_ctx_set(xlator_t *this, fd_t *fd, int32_t child, int flags);
 
-	local->transaction.child_errno = CALLOC (sizeof (*local->transaction.child_errno),
-						  priv->child_count);
+void
+afr_matrix_cleanup(int32_t **pending, unsigned int m);
 
-	return 0;
-}
+int32_t **
+afr_matrix_create(unsigned int m, unsigned int n);
+
+int **
+afr_mark_pending_changelog(afr_private_t *priv, unsigned char *pending,
+                           dict_t *xattr, ia_type_t iat);
+
+void
+afr_filter_xattrs(dict_t *xattr);
+
+/*
+ * Special value indicating we should use the "auto" quorum method instead of
+ * a fixed value (including zero to turn off quorum enforcement).
+ */
+#define AFR_QUORUM_AUTO INT_MAX
+
+int
+afr_fd_report_unstable_write(xlator_t *this, afr_local_t *local);
+
+gf_boolean_t
+afr_fd_has_witnessed_unstable_write(xlator_t *this, inode_t *inode);
+
+void
+afr_reply_wipe(struct afr_reply *reply);
+
+void
+afr_replies_wipe(struct afr_reply *replies, int count);
+
+gf_boolean_t
+afr_xattrs_are_equal(dict_t *dict1, dict_t *dict2);
+
+gf_boolean_t
+afr_is_xattr_ignorable(char *key);
+
+int
+afr_get_heal_info(call_frame_t *frame, xlator_t *this, loc_t *loc);
+
+int
+afr_heal_splitbrain_file(call_frame_t *frame, xlator_t *this, loc_t *loc);
+
+int
+afr_get_split_brain_status(void *opaque);
+
+int
+afr_get_split_brain_status_cbk(int ret, call_frame_t *frame, void *opaque);
+
+int
+afr_inode_split_brain_choice_set(inode_t *inode, xlator_t *this,
+                                 int spb_choice);
+int
+afr_split_brain_read_subvol_get(inode_t *inode, xlator_t *this,
+                                call_frame_t *frame, int *spb_subvol);
+int
+afr_get_child_index_from_name(xlator_t *this, char *name);
+
+int
+afr_is_split_brain(call_frame_t *frame, xlator_t *this, inode_t *inode,
+                   uuid_t gfid, gf_boolean_t *d_spb, gf_boolean_t *m_spb);
+int
+afr_spb_choice_timeout_cancel(xlator_t *this, inode_t *inode);
+
+int
+afr_set_split_brain_choice(int ret, call_frame_t *frame, void *opaque);
+
+gf_boolean_t
+afr_get_need_heal(xlator_t *this);
+
+void
+afr_set_need_heal(xlator_t *this, afr_local_t *local);
+
+int
+afr_selfheal_data_open(xlator_t *this, inode_t *inode, fd_t **fd);
+
+int
+afr_get_msg_id(char *op_type);
+
+int
+afr_set_in_flight_sb_status(xlator_t *this, call_frame_t *frame,
+                            inode_t *inode);
+
+int32_t
+afr_quorum_errno(afr_private_t *priv);
+
+gf_boolean_t
+afr_is_consistent_io_possible(afr_local_t *local, afr_private_t *priv,
+                              int32_t *op_errno);
+void
+afr_handle_inconsistent_fop(call_frame_t *frame, int32_t *op_ret,
+                            int32_t *op_errno);
+
+void
+afr_inode_write_fill(call_frame_t *frame, xlator_t *this, int child_index,
+                     int32_t op_ret, int32_t op_errno, struct iatt *prebuf,
+                     struct iatt *postbuf, dict_t *xdata);
+void
+afr_process_post_writev(call_frame_t *frame, xlator_t *this);
+
+void
+afr_writev_unwind(call_frame_t *frame, xlator_t *this);
+
+void
+afr_writev_copy_outvars(call_frame_t *src_frame, call_frame_t *dst_frame);
+
+void
+afr_update_uninodelk(afr_local_t *local, afr_internal_lock_t *int_lock,
+                     int32_t child_index);
+afr_fd_ctx_t *
+__afr_fd_ctx_get(fd_t *fd, xlator_t *this);
+
+gf_boolean_t
+afr_is_inode_refresh_reqd(inode_t *inode, xlator_t *this, int event_gen1,
+                          int event_gen2);
+
+int
+afr_serialize_xattrs_with_delimiter(call_frame_t *frame, xlator_t *this,
+                                    char *buf, const char *default_str,
+                                    int32_t *serz_len, char delimiter);
+gf_boolean_t
+afr_is_symmetric_error(call_frame_t *frame, xlator_t *this);
+
+int
+__afr_inode_ctx_get(xlator_t *this, inode_t *inode, afr_inode_ctx_t **ctx);
+
+uint64_t
+afr_write_subvol_get(call_frame_t *frame, xlator_t *this);
+
+int
+afr_write_subvol_set(call_frame_t *frame, xlator_t *this);
+
+int
+afr_write_subvol_reset(call_frame_t *frame, xlator_t *this);
+
+int
+afr_set_inode_local(xlator_t *this, afr_local_t *local, inode_t *inode);
+
+int
+afr_fill_ta_loc(xlator_t *this, loc_t *loc, gf_boolean_t is_gfid_based_fop);
+
+int
+afr_ta_post_op_lock(xlator_t *this, loc_t *loc);
+
+int
+afr_ta_post_op_unlock(xlator_t *this, loc_t *loc);
+
+gf_boolean_t
+afr_is_pending_set(xlator_t *this, dict_t *xdata, int type);
+
+int
+__afr_get_up_children_count(afr_private_t *priv);
+
+call_frame_t *
+afr_ta_frame_create(xlator_t *this);
+
+gf_boolean_t
+afr_ta_has_quorum(afr_private_t *priv, afr_local_t *local);
+
+void
+afr_ta_lock_release_synctask(xlator_t *this);
+
+void
+afr_ta_locked_priv_invalidate(afr_private_t *priv);
+
+gf_boolean_t
+afr_lookup_has_quorum(call_frame_t *frame,
+                      const unsigned int up_children_count);
+
+void
+afr_mark_new_entry_changelog(call_frame_t *frame, xlator_t *this);
+
+void
+afr_handle_replies_quorum(call_frame_t *frame, xlator_t *this);
+
+gf_boolean_t
+afr_ta_dict_contains_pending_xattr(dict_t *dict, afr_private_t *priv,
+                                   int child);
+
+void
+afr_selfheal_childup(xlator_t *this, afr_private_t *priv);
+
+gf_boolean_t
+afr_is_lock_mode_mandatory(dict_t *xdata);
+
+void
+afr_dom_lock_release(call_frame_t *frame);
+
+void
+afr_fill_success_replies(afr_local_t *local, afr_private_t *priv,
+                         unsigned char *replies);
 
+gf_boolean_t
+afr_is_private_directory(afr_private_t *priv, uuid_t pargfid, const char *name,
+                         pid_t pid);
 #endif /* __AFR_H__ */
diff --git a/xlators/cluster/dht/src/Makefile.am b/xlators/cluster/dht/src/Makefile.am
index d4e0752a585..56f1f2ad7c8 100644
--- a/xlators/cluster/dht/src/Makefile.am
+++ b/xlators/cluster/dht/src/Makefile.am
@@ -1,30 +1,48 @@
+xlator_LTLIBRARIES = dht.la nufa.la switch.la
 
-xlator_LTLIBRARIES = dht.la nufa.la
-xlatordir = $(libdir)/glusterfs/$(PACKAGE_VERSION)/xlator/cluster
+AM_CFLAGS = -Wall $(GF_CFLAGS)
 
+xlatordir = $(libdir)/glusterfs/$(PACKAGE_VERSION)/xlator/cluster
 
-dht_common_source = dht-layout.c dht-helper.c dht-linkfile.c \
-		dht-selfheal.c dht-rename.c dht-hashfn.c
+dht_common_source = dht-layout.c dht-helper.c dht-linkfile.c dht-rebalance.c \
+	dht-selfheal.c dht-rename.c dht-hashfn.c dht-diskusage.c \
+	dht-common.c dht-inode-write.c dht-inode-read.c dht-shared.c \
+	dht-lock.c $(top_builddir)/xlators/lib/src/libxlator.c
 
-dht_la_SOURCES = $(dht_common_source) dht.c 
+dht_la_SOURCES = $(dht_common_source) dht.c
 
 nufa_la_SOURCES = $(dht_common_source) nufa.c
+switch_la_SOURCES = $(dht_common_source) switch.c
 
-dht_la_LDFLAGS = -module -avoidversion
+dht_la_LDFLAGS = -module $(GF_XLATOR_DEFAULT_LDFLAGS)
 dht_la_LIBADD = $(top_builddir)/libglusterfs/src/libglusterfs.la
 
-nufa_la_LDFLAGS = -module -avoidversion
+nufa_la_LDFLAGS = -module $(GF_XLATOR_DEFAULT_LDFLAGS)
 nufa_la_LIBADD = $(top_builddir)/libglusterfs/src/libglusterfs.la
 
-noinst_HEADERS = dht-common.h dht-common.c
+switch_la_LDFLAGS = -module $(GF_XLATOR_DEFAULT_LDFLAGS)
+switch_la_LIBADD = $(top_builddir)/libglusterfs/src/libglusterfs.la
 
-AM_CFLAGS = -fPIC -D_FILE_OFFSET_BITS=64 -D_GNU_SOURCE -Wall -D$(GF_HOST_OS) \
-	-I$(top_srcdir)/libglusterfs/src -shared -nostartfiles $(GF_CFLAGS)
+noinst_HEADERS = dht-common.h dht-mem-types.h dht-messages.h \
+	dht-lock.h $(top_builddir)/xlators/lib/src/libxlator.h
 
-CLEANFILES = 
+AM_CPPFLAGS = $(GF_CPPFLAGS) -I$(top_srcdir)/libglusterfs/src \
+	-I$(top_srcdir)/rpc/xdr/src -I$(top_builddir)/rpc/xdr/src \
+	-I$(top_srcdir)/rpc/rpc-lib/src \
+	-I$(top_srcdir)/xlators/lib/src \
+	-DDATADIR=\"$(localstatedir)\" \
+	-DLIBDIR=\"$(libdir)\"
+
+CLEANFILES =
 
 uninstall-local:
 	rm -f $(DESTDIR)$(xlatordir)/distribute.so
 
 install-data-hook:
-	ln -sf dht.so $(DESTDIR)$(xlatordir)/distribute.so
-\ No newline at end of file
+	ln -sf dht.so $(DESTDIR)$(xlatordir)/distribute.so
+
+if UNITTEST
+CLEANFILES += *.gcda *.gcno *_xunit.xml
+noinst_PROGRAMS =
+TESTS =
+endif
diff --git a/xlators/cluster/dht/src/dht-common.c b/xlators/cluster/dht/src/dht-common.c
index cd29786e7bf..8ba0cc4c732 100644
--- a/xlators/cluster/dht/src/dht-common.c
+++ b/xlators/cluster/dht/src/dht-common.c
@@ -1,35 +1,447 @@
 /*
-   Copyright (c) 2009-2009 Z RESEARCH, Inc. <http://www.zresearch.com>
-   This file is part of GlusterFS.
-
-   GlusterFS is free software; you can redistribute it and/or modify
-   it under the terms of the GNU General Public License as published
-   by the Free Software Foundation; either version 3 of the License,
-   or (at your option) any later version.
-
-   GlusterFS is distributed in the hope that it will be useful, but
-   WITHOUT ANY WARRANTY; without even the implied warranty of
-   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
-   General Public License for more details.
-
-   You should have received a copy of the GNU General Public License
-   along with this program.  If not, see
-   <http://www.gnu.org/licenses/>.
-*/
-
+  Copyright (c) 2008-2012 Red Hat, Inc. <http://www.redhat.com>
+  This file is part of GlusterFS.
 
-#ifndef _CONFIG_H
-#define _CONFIG_H
-#include "config.h"
-#endif
+  This file is licensed to you under your choice of the GNU Lesser
+  General Public License, version 3 or any later version (LGPLv3 or
+  later), or the GNU General Public License, version 2 (GPLv2), in all
+  cases as published by the Free Software Foundation.
+*/
 
 /* TODO: add NS locking */
 
-#include "glusterfs.h"
-#include "xlator.h"
+#include "libxlator.h"
 #include "dht-common.h"
-#include "defaults.h"
+#include "dht-lock.h"
+#include <glusterfs/byte-order.h>
+#include <glusterfs/quota-common-utils.h>
+#include <glusterfs/upcall-utils.h>
+#include "glusterfs/compat-errno.h"  // for ENODATA on BSD
+#include <glusterfs/common-utils.h>
+
+#include <sys/time.h>
+#include <libgen.h>
+#include <signal.h>
+
+static int
+dht_rmdir_readdirp_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+                       int op_ret, int op_errno, gf_dirent_t *entries,
+                       dict_t *xdata);
+
+static int
+dht_link2(xlator_t *this, xlator_t *subvol, call_frame_t *frame, int ret);
+
+static int
+dht_set_dir_xattr_req(xlator_t *this, loc_t *loc, dict_t *xattr_req);
+
+static int
+dht_lookup_everywhere_done(call_frame_t *frame, xlator_t *this);
+
+static int
+dht_common_mark_mdsxattr_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+                             int op_ret, int op_errno, dict_t *xdata);
+
+static int
+dht_rmdir_unlock(call_frame_t *frame, xlator_t *this);
+
+static const char *dht_dbg_vxattrs[] = {DHT_DBG_HASHED_SUBVOL_PATTERN, NULL};
+
+/* Return 1 on an EBADF/EBADFD failure when local->fd_checked is not yet set */
+int32_t
+dht_check_remote_fd_failed_error(dht_local_t *local, int op_ret, int op_errno)
+{
+    if (op_ret == -1 && (op_errno == EBADF || op_errno == EBADFD) &&
+        !(local->fd_checked)) {
+        return 1;
+    }
+    return 0;
+}
+
+/* Sets the blocks and size values to fixed values. This is to be called
+ * only for dirs. The caller is responsible for checking the type
+ */
+int32_t
+dht_set_fixed_dir_stat(struct iatt *stat)
+{
+    if (stat) {
+        stat->ia_blocks = DHT_DIR_STAT_BLOCKS;
+        stat->ia_size = DHT_DIR_STAT_SIZE;
+        return 0;
+    }
+    return -1;
+}
+
+/* Return true if key is one of the xattrs returned by get_xattrs_to_heal()
+ */
+static gf_boolean_t
+dht_match_xattr(const char *key)
+{
+    char **xattrs_to_heal = get_xattrs_to_heal();
+
+    return gf_get_index_by_elem(xattrs_to_heal, (char *)key) >= 0;
+}
+
+static int
+dht_aggregate_quota_xattr(dict_t *dst, char *key, data_t *value)
+{
+    int ret = -1;
+    quota_meta_t *meta_dst = NULL;
+    quota_meta_t *meta_src = NULL;
+    int64_t *size = NULL;
+    int64_t dst_dir_count = 0;
+    int64_t src_dir_count = 0;
+
+    if (value == NULL) {
+        gf_msg("dht", GF_LOG_WARNING, 0, DHT_MSG_DATA_NULL,
+               "data value is NULL");
+        ret = -1;
+        goto out;
+    }
+
+    ret = dict_get_bin(dst, key, (void **)&meta_dst);
+    if (ret < 0) {
+        meta_dst = GF_CALLOC(1, sizeof(quota_meta_t), gf_common_quota_meta_t);
+        if (meta_dst == NULL) {
+            gf_msg("dht", GF_LOG_WARNING, ENOMEM, DHT_MSG_NO_MEMORY,
+                   "Memory allocation failed");
+            ret = -1;
+            goto out;
+        }
+        ret = dict_set_bin(dst, key, meta_dst, sizeof(quota_meta_t));
+        if (ret < 0) {
+            gf_msg("dht", GF_LOG_WARNING, EINVAL, DHT_MSG_DICT_SET_FAILED,
+                   "dht aggregate dict set failed");
+            GF_FREE(meta_dst);
+            ret = -1;
+            goto out;
+        }
+    }
+
+    if (value->len > sizeof(int64_t)) {
+        meta_src = data_to_bin(value);
+
+        meta_dst->size = hton64(ntoh64(meta_dst->size) +
+                                ntoh64(meta_src->size));
+        meta_dst->file_count = hton64(ntoh64(meta_dst->file_count) +
+                                      ntoh64(meta_src->file_count));
+
+        if (value->len > (2 * sizeof(int64_t))) {
+            dst_dir_count = ntoh64(meta_dst->dir_count);
+            src_dir_count = ntoh64(meta_src->dir_count);
+
+            if (src_dir_count > dst_dir_count)
+                meta_dst->dir_count = meta_src->dir_count;
+        } else {
+            meta_dst->dir_count = 0;
+        }
+    } else {
+        size = data_to_bin(value);
+        meta_dst->size = hton64(ntoh64(meta_dst->size) + ntoh64(*size));
+    }
+
+    ret = 0;
+out:
+    return ret;
+}
+
+static int
+add_opt(char **optsp, const char *opt)
+{
+    char *newopts = NULL;
+    unsigned oldsize = 0;
+    unsigned newsize = 0;
+
+    if (*optsp == NULL) /* first option: the list starts as a copy of opt */
+        newopts = gf_strdup(opt);
+    else {
+        oldsize = strlen(*optsp);
+        newsize = oldsize + 1 + strlen(opt) + 1; /* old + ',' + opt + NUL */
+        newopts = GF_REALLOC(*optsp, newsize);
+        if (newopts)
+            sprintf(newopts + oldsize, ",%s", opt); /* append ",opt" at the end */
+    }
+    if (newopts == NULL) { /* allocation failed: *optsp is not modified */
+        gf_msg("dht", GF_LOG_WARNING, 0, DHT_MSG_NO_MEMORY,
+               "Error to add choices in buffer in add_opt");
+        return -1;
+    }
+    *optsp = newopts; /* hand the new buffer back to the caller */
+    return 0;
+}
 
+/* Return the choice list from a split-brain status string: a newly allocated
+ * copy of the text between "Choices:" and the next ':' (or end of string).
+ * Returns NULL if the marker is missing, nothing follows it, or allocation
+ * fails. The caller frees the result with GF_FREE().
+ */
+static char *
+getChoices(const char *value)
+{
+    char *ptr = NULL;
+    char *tok = NULL;
+    char *result = NULL;
+    char *newval = NULL;
+    char *saveptr = NULL;
+
+    ptr = strstr(value, "Choices:");
+    if (!ptr) {
+        goto out;
+    }
+
+    /* Tokenise a private copy: strtok_r() writes NULs into its input. */
+    newval = gf_strdup(ptr);
+    if (!newval) {
+        goto out;
+    }
+
+    /* strtok_r() rather than strtok(): strtok() keeps its position in hidden
+     * static state and need not be thread-safe. */
+    tok = strtok_r(newval, ":", &saveptr);
+    if (!tok) {
+        goto out;
+    }
+
+    /* First token is the "Choices" label itself; the second is the list. */
+    tok = strtok_r(NULL, ":", &saveptr);
+    if (tok)
+        result = gf_strdup(tok);
+
+out:
+    if (newval)
+        GF_FREE(newval);
+
+    return result;
+}
+
+/* This function prepares a list of choices for the key
+   (replica.split-brain-status), in the case of metadata split-brain
+   only, on the basis of the key-value pair passed to this function.
+   After preparing the list of choices it updates the same key in the
+   dict with this value, so that it is reflected in the
+   replica.split-brain-status attr for the file.
+
+*/
+
+static int
+dht_aggregate_split_brain_xattr(dict_t *dst, char *key, data_t *value)
+{
+    int ret = 0;
+    char *oldvalue = NULL;
+    char *old_choice = NULL;
+    char *new_choice = NULL;
+    char *full_choice = NULL;
+    char *status = NULL;
+
+    if (value == NULL) {
+        gf_msg("dht", GF_LOG_WARNING, 0, DHT_MSG_DATA_NULL,
+               "GF_AFR_SBRAIN_STATUS value is NULL");
+        ret = -1;
+        goto out;
+    }
+
+    ret = dict_get_str(dst, key, &oldvalue);
+    if (ret)
+        goto out;
+
+    /* skip code that is irrelevant if !oldvalue */
+    if (!oldvalue)
+        goto out;
+
+    if (strstr(oldvalue, "not")) {
+        gf_msg_debug("dht", 0, "Need to update split-brain status in dict");
+        ret = -1;
+        goto out;
+    }
+    if (strstr(oldvalue, "metadata-split-brain:yes") &&
+        (strstr(oldvalue, "data-split-brain:no"))) {
+        if (strstr(value->data, "not")) {
+            gf_msg_debug("dht", 0, "No need to update split-brain status");
+            ret = 0;
+            goto out;
+        }
+        if (strstr(value->data, "yes") &&
+            (strncmp(oldvalue, value->data, strlen(oldvalue)))) {
+            old_choice = getChoices(oldvalue);
+            if (!old_choice) {
+                gf_msg("dht", GF_LOG_WARNING, 0, DHT_MSG_NO_MEMORY,
+                       "Error to get choices");
+                ret = -1;
+                goto out;
+            }
+
+            ret = add_opt(&full_choice, old_choice);
+            if (ret) {
+                gf_msg("dht", GF_LOG_WARNING, 0, DHT_MSG_NO_MEMORY,
+                       "Error to add choices");
+                ret = -1;
+                goto out;
+            }
+
+            new_choice = getChoices(value->data);
+            if (!new_choice) {
+                gf_msg("dht", GF_LOG_WARNING, 0, DHT_MSG_NO_MEMORY,
+                       "Error to get choices");
+                ret = -1;
+                goto out;
+            }
+
+            ret = add_opt(&full_choice, new_choice);
+            if (ret) {
+                gf_msg("dht", GF_LOG_WARNING, 0, DHT_MSG_NO_MEMORY,
+                       "Error to add choices ");
+                ret = -1;
+                goto out;
+            }
+            ret = gf_asprintf(&status,
+                              "data-split-brain:%s    "
+                              "metadata-split-brain:%s   Choices:%s",
+                              "no", "yes", full_choice);
+
+            if (-1 == ret) {
+                gf_msg("dht", GF_LOG_WARNING, 0, DHT_MSG_NO_MEMORY,
+                       "Error to prepare status ");
+                goto out;
+            }
+            ret = dict_set_dynstr(dst, key, status);
+            if (ret) {
+                gf_msg("dht", GF_LOG_WARNING, 0, DHT_MSG_DICT_SET_FAILED,
+                       "Failed to set full choice");
+            }
+        }
+    }
+
+out:
+    if (old_choice)
+        GF_FREE(old_choice);
+    if (new_choice)
+        GF_FREE(new_choice);
+    if (full_choice)
+        GF_FREE(full_choice);
+
+    return ret;
+}
+
+static int
+dht_aggregate(dict_t *this, char *key, data_t *value, void *data)
+{
+    dict_t *dst = NULL;
+    int32_t ret = -1;
+    data_t *dict_data = NULL;
+
+    dst = data; /* dht_aggregate_xattr passes the destination dict as data */
+
+    /* compare split brain xattr only */
+    if (strcmp(key, GF_AFR_SBRAIN_STATUS) == 0) {
+        ret = dht_aggregate_split_brain_xattr(dst, key, value);
+        if (!ret) /* handled; a non-zero ret falls through to dict_set below */
+            goto out;
+    } else if (strcmp(key, QUOTA_SIZE_KEY) == 0) {
+        ret = dht_aggregate_quota_xattr(dst, key, value);
+        if (ret) {
+            gf_msg("dht", GF_LOG_WARNING, 0,
+                   DHT_MSG_AGGREGATE_QUOTA_XATTR_FAILED,
+                   "Failed to aggregate quota xattr");
+        }
+        goto out; /* quota key never reaches the dict_set below */
+    } else if (fnmatch(GF_XATTR_STIME_PATTERN, key, FNM_NOESCAPE) == 0) {
+        ret = gf_get_min_stime(THIS, dst, key, value);
+        goto out; /* stime key never reaches the dict_set below */
+    } else {
+        /* compare user xattrs only */
+        if (!strncmp(key, "user.", SLEN("user."))) {
+            ret = dict_lookup(dst, key, &dict_data);
+            if (!ret && dict_data && value) {
+                ret = is_data_equal(dict_data, value);
+                if (!ret) /* mismatch is only logged; value is still set below */
+                    gf_msg_debug("dht", 0, "xattr mismatch for %s", key);
+            }
+        }
+    }
+
+    ret = dict_set(dst, key, value); /* default: store src's value in dst */
+    if (ret) {
+        gf_msg("dht", GF_LOG_WARNING, 0, DHT_MSG_DICT_SET_FAILED,
+               "Failed to set dictionary value: key = %s", key);
+    }
+
+out:
+    return ret;
+}
+
+static void
+dht_aggregate_xattr(dict_t *dst, dict_t *src)
+{
+    if ((dst == NULL) || (src == NULL)) {
+        goto out;
+    }
+
+    dict_foreach(src, dht_aggregate, dst);
+out:
+    return;
+}
+
+/* Code to save hashed subvol on inode ctx as a mds subvol
+ */
+int
+dht_inode_ctx_mdsvol_set(inode_t *inode, xlator_t *this, xlator_t *mds_subvol)
+{
+    dht_inode_ctx_t *ctx = NULL;
+    int ret = -1;
+    uint64_t ctx_int = 0;
+    gf_boolean_t ctx_free = _gf_false;
+
+    LOCK(&inode->lock);
+    {
+        ret = __inode_ctx_get(inode, this, &ctx_int);
+        if (ctx_int) {
+            ctx = (dht_inode_ctx_t *)(uintptr_t)ctx_int;
+            ctx->mds_subvol = mds_subvol;
+        } else {
+            ctx = GF_CALLOC(1, sizeof(*ctx), gf_dht_mt_inode_ctx_t);
+            if (!ctx)
+                goto unlock;
+            ctx->mds_subvol = mds_subvol;
+            ctx_free = _gf_true;
+            ctx_int = (long)ctx;
+            ret = __inode_ctx_set(inode, this, &ctx_int);
+        }
+    }
+unlock:
+    UNLOCK(&inode->lock);
+    if (ret && ctx_free)
+        GF_FREE(ctx);
+    return ret;
+}
+
+/*Code to get mds subvol from inode ctx */
+
+int
+dht_inode_ctx_mdsvol_get(inode_t *inode, xlator_t *this, xlator_t **mdsvol)
+{
+    dht_inode_ctx_t *ctx = NULL;
+    int ret = -1;
+
+    if (!mdsvol)
+        return ret;
+
+    if (__is_root_gfid(inode->gfid)) {
+        (*mdsvol) = FIRST_CHILD(this);
+        return 0;
+    }
+
+    ret = dht_inode_ctx_get(inode, this, &ctx);
+
+    if (!ret && ctx) {
+        if (ctx->mds_subvol) {
+            *mdsvol = ctx->mds_subvol;
+            ret = 0;
+        } else {
+            ret = -1;
+        }
+    }
+
+    return ret;
+}
 
 /* TODO:
    - use volumename in xattr instead of "dht"
@@ -38,3432 +450,10942 @@
    - complete linkfile selfheal
 */
 
-int
-dht_lookup_selfheal_cbk (call_frame_t *frame, void *cookie,
-			 xlator_t *this,
-			 int op_ret, int op_errno)
+static int
+dht_lookup_selfheal_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+                        int op_ret, int op_errno, dict_t *xdata)
 {
-	dht_local_t  *local = NULL;
-	dht_layout_t *layout = NULL;
-	int           ret = 0;
+    dht_local_t *local = NULL;
+    dht_layout_t *layout = NULL;
+    dht_conf_t *conf = NULL;
+    int ret = -1;
 
-	local = frame->local;
-	ret = op_ret;
+    GF_VALIDATE_OR_GOTO("dht", frame, out);
+    GF_VALIDATE_OR_GOTO("dht", this, out);
+    GF_VALIDATE_OR_GOTO("dht", frame->local, out);
 
-	if (ret == 0) {
-		layout = local->selfheal.layout;
-		ret = inode_ctx_put (local->inode, this, (uint64_t)(long)layout);
+    local = frame->local;
+    conf = this->private;
+    ret = op_ret;
 
-		if (ret == 0)
-			local->selfheal.layout = NULL;
-		
-		if (local->st_ino) {
-			local->stbuf.st_ino = local->st_ino;
-		} else {
-			gf_log (this->name, GF_LOG_WARNING,
-				"could not find hashed subvolume for %s",
-				local->loc.path);
-		}
-	}
+    FRAME_SU_UNDO(frame, dht_local_t);
 
-	DHT_STACK_UNWIND (frame, ret, local->op_errno, local->inode,
-			  &local->stbuf, local->xattr);
+    if (ret == 0) {
+        layout = local->selfheal.layout;
+        ret = dht_layout_set(this, local->inode, layout);
+    }
 
-	return 0;
+    dht_inode_ctx_time_update(local->inode, this, &local->stbuf, 1);
+    if (local->loc.parent) {
+        dht_inode_ctx_time_update(local->loc.parent, this, &local->postparent,
+                                  1);
+    }
+
+    DHT_STRIP_PHASE1_FLAGS(&local->stbuf);
+    dht_set_fixed_dir_stat(&local->postparent);
+    /* Delete mds xattr at the time of STACK UNWIND */
+    GF_REMOVE_INTERNAL_XATTR(conf->mds_xattr_key, local->xattr);
+
+    DHT_STACK_UNWIND(lookup, frame, ret, local->op_errno, local->inode,
+                     &local->stbuf, local->xattr, &local->postparent);
+
+out:
+    return ret;
 }
 
+static int
+dht_discover_complete(xlator_t *this, call_frame_t *discover_frame)
+{
+    dht_local_t *local = NULL;
+    dht_local_t *heal_local = NULL;
+    call_frame_t *main_frame = NULL;
+    call_frame_t *heal_frame = NULL;
+    int op_errno = 0;
+    int ret = -1;
+    dht_layout_t *layout = NULL;
+    dht_conf_t *conf = NULL;
+    uint32_t vol_commit_hash = 0;
+    xlator_t *source = NULL;
+    int heal_path = 0;
+    int error_while_marking_mds = 0;
+    int i = 0;
+    loc_t loc = {0};
+    int8_t is_read_only = 0, layout_anomalies = 0;
+    char gfid_local[GF_UUID_BUF_SIZE] = {0};
+
+    local = discover_frame->local;
+    layout = local->layout;
+    conf = this->private;
+    gf_uuid_unparse(local->gfid, gfid_local);
+
+    LOCK(&discover_frame->lock);
+    {
+        main_frame = local->main_frame;
+        local->main_frame = NULL;
+    }
+    UNLOCK(&discover_frame->lock);
+
+    if (!main_frame)
+        return 0;
 
-int
-dht_lookup_dir_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
-                    int op_ret, int op_errno,
-                    inode_t *inode, struct stat *stbuf, dict_t *xattr)
-{
-	dht_conf_t   *conf          = NULL;
-        dht_local_t  *local         = NULL;
-        int           this_call_cnt = 0;
-        call_frame_t *prev          = NULL;
-	dht_layout_t *layout        = NULL;
-	int           ret           = 0;
-	int           is_dir        = 0;
-
-	conf  = this->private;
-        local = frame->local;
-        prev  = cookie;
+    /* Update local->xattr with all the extended attributes from the
+       subvol on which the internal (MDS) xattr was found
+    */
+    if (conf->subvolume_cnt == 1)
+        local->need_xattr_heal = 0;
+    if (local->need_xattr_heal && (local->mds_xattr)) {
+        dht_dir_set_heal_xattr(this, local, local->xattr, local->mds_xattr,
+                               NULL, NULL);
+        dict_unref(local->mds_xattr);
+        local->mds_xattr = NULL;
+    }
+
+    ret = dict_get_int8(local->xattr_req, QUOTA_READ_ONLY_KEY, &is_read_only);
+    if (ret < 0)
+        gf_msg_debug(this->name, 0, "key = %s not present in dict",
+                     QUOTA_READ_ONLY_KEY);
+
+    if (local->file_count && local->dir_count) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, DHT_MSG_FILE_TYPE_MISMATCH,
+               "path %s exists as a file on one subvolume "
+               "and directory on another. "
+               "Please fix it manually",
+               local->loc.path);
+        op_errno = EIO;
+        goto out;
+    }
+
+    if (local->cached_subvol) {
+        ret = dht_layout_preset(this, local->cached_subvol, local->inode);
+        if (ret < 0) {
+            gf_msg(this->name, GF_LOG_WARNING, 0, DHT_MSG_LAYOUT_SET_FAILED,
+                   "failed to set layout for subvolume %s",
+                   local->cached_subvol ? local->cached_subvol->name : "<nil>");
+            op_errno = EINVAL;
+            goto out;
+        }
+    } else {
+        ret = dht_layout_normalize(this, &local->loc, layout);
+        if ((ret < 0) || ((ret > 0) && (local->op_ret != 0))) {
+            /* either the layout is incorrect or the directory is
+             * not found even in one subvolume.
+             */
+            gf_msg_debug(this->name, 0,
+                         "normalizing failed on %s "
+                         "(overlaps/holes present: %s, "
+                         "ENOENT errors: %d)",
+                         local->loc.path, (ret < 0) ? "yes" : "no",
+                         (ret > 0) ? ret : 0);
+            layout_anomalies = 1;
+        } else if (local->inode) {
+            dht_layout_set(this, local->inode, layout);
+        }
+    }
 
-	layout = local->layout;
+    if (!conf->vch_forced) {
+        ret = dict_get_uint32(local->xattr, conf->commithash_xattr_name,
+                              &vol_commit_hash);
+        if (ret == 0) {
+            conf->vol_commit_hash = vol_commit_hash;
+        }
+    }
+
+    if (IA_ISDIR(local->stbuf.ia_type) && !is_read_only) {
+        for (i = 0; i < layout->cnt; i++) {
+            if (!source && !layout->list[i].err)
+                source = layout->list[i].xlator;
+            if (layout->list[i].err == ENOENT ||
+                layout->list[i].err == ESTALE) {
+                heal_path = 1;
+            }
+
+            if (source && heal_path)
+                break;
+        }
+    }
+
+    if (IA_ISDIR(local->stbuf.ia_type)) {
+        /* Call function to save hashed subvol on inode ctx if
+           internal mds xattr is not present and all subvols are up
+        */
+        if (!local->op_ret && !__is_root_gfid(local->stbuf.ia_gfid))
+            (void)dht_common_mark_mdsxattr(discover_frame,
+                                           &error_while_marking_mds, 1);
+
+        if (local->need_xattr_heal && !heal_path) {
+            local->need_xattr_heal = 0;
+            ret = dht_dir_xattr_heal(this, local, &op_errno);
+            if (ret) {
+                gf_msg(this->name, GF_LOG_ERROR, op_errno,
+                       DHT_MSG_DIR_XATTR_HEAL_FAILED,
+                       "xattr heal failed for "
+                       "directory  gfid is %s ",
+                       gfid_local);
+            }
+        }
+    }
 
-        LOCK (&frame->lock);
-        {
-                /* TODO: assert equal mode on stbuf->st_mode and
-		   local->stbuf->st_mode
+    if (source && (heal_path || layout_anomalies || error_while_marking_mds)) {
+        gf_uuid_copy(loc.gfid, local->gfid);
+        if (gf_uuid_is_null(loc.gfid)) {
+            goto done;
+        }
 
-		   else mkdir/chmod/chown and fix
-		*/
-		/* TODO: assert equal hash type in xattr, local->xattr */
+        if (local->inode)
+            loc.inode = inode_ref(local->inode);
+        else
+            goto done;
+
+        heal_frame = create_frame(this, this->ctx->pool);
+        if (heal_frame) {
+            heal_local = dht_local_init(heal_frame, &loc, NULL, 0);
+            if (!heal_local)
+                goto cleanup;
+
+            gf_uuid_copy(heal_local->gfid, local->gfid);
+            heal_frame->cookie = source;
+            heal_local->xattr = dict_ref(local->xattr);
+            heal_local->stbuf = local->stbuf;
+            heal_local->postparent = local->postparent;
+            heal_local->inode = inode_ref(loc.inode);
+            heal_local->main_frame = main_frame;
+            FRAME_SU_DO(heal_frame, dht_local_t);
+            ret = synctask_new(this->ctx->env, dht_heal_full_path,
+                               dht_heal_full_path_done, heal_frame, heal_frame);
+            if (!ret) {
+                loc_wipe(&loc);
+                return 0;
+            }
+            /*
+             * Failed to spawn the synctask. Returning
+             * with out doing heal.
+             */
+        cleanup:
+            loc_wipe(&loc);
+            DHT_STACK_DESTROY(heal_frame);
+        }
+    }
+done:
+    dht_set_fixed_dir_stat(&local->postparent);
+    /* Delete mds xattr at the time of STACK UNWIND */
+    if (local->xattr)
+        GF_REMOVE_INTERNAL_XATTR(conf->mds_xattr_key, local->xattr);
 
-		/* TODO: always ensure same subvolume is in layout->list[0] */
+    DHT_STACK_UNWIND(lookup, main_frame, local->op_ret, local->op_errno,
+                     local->inode, &local->stbuf, local->xattr,
+                     &local->postparent);
+    return 0;
 
-		ret = dht_layout_merge (this, layout, prev->this,
-					op_ret, op_errno, xattr);
+out:
+    DHT_STACK_UNWIND(lookup, main_frame, -1, op_errno, NULL, NULL, NULL, NULL);
 
-		if (op_ret == -1) {
-			local->op_errno = ENOENT;
-			gf_log (this->name, GF_LOG_WARNING,
-				"lookup of %s on %s returned error (%s)",
-				local->loc.path, prev->this->name,
-				strerror (op_errno));
+    return ret;
+}
 
-			goto unlock;
-		}
+/* setxattr callback for the MDS xattr wound by dht_common_mark_mdsxattr():
+ * saves the subvol passed as cookie as the MDS in the inode ctx on success.
+ */
+static int
+dht_common_mark_mdsxattr_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+                             int op_ret, int op_errno, dict_t *xdata)
+{
+    dht_local_t *local = NULL;
+    dht_conf_t *conf = NULL;
+    dht_layout_t *layout = NULL;
+    xlator_t *prev = cookie;
+    int32_t mds_heal_fresh_lookup = 0;
+
+    GF_VALIDATE_OR_GOTO(this->name, frame, out);
+    GF_VALIDATE_OR_GOTO(this->name, frame->local, out);
+
+    local = frame->local;
+    conf = this->private;
+    layout = local->selfheal.layout;
+    /* read up front so the out: path needs no access to local */
+    mds_heal_fresh_lookup = local->mds_heal_fresh_lookup;
+
+    if (op_ret) {
+        /* errnum must be op_errno; op_ret is only the call status */
+        gf_msg_debug(this->name, op_errno,
+                     "Failed to set %s on the MDS %s for path %s. ",
+                     conf->mds_xattr_key, prev->name, local->loc.path);
+    } else if (dht_inode_ctx_mdsvol_set(local->inode, this, prev)) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, DHT_MSG_SET_INODE_CTX_FAILED,
+               "Failed to set mds subvol on inode ctx"
+               " %s for %s ",
+               prev->name, local->loc.path);
+    }
+    if (!mds_heal_fresh_lookup && layout) {
+        dht_selfheal_dir_setattr(frame, &local->loc, &local->stbuf, 0xffffffff,
+                                 layout);
+    }
+out:
+    if (mds_heal_fresh_lookup)
+        DHT_STACK_DESTROY(frame);
+    return 0;
+}
 
- 		is_dir = check_is_dir (inode, stbuf, xattr);
- 		if (!is_dir) 
- 			goto unlock;
+static xlator_t *
+dht_inode_get_hashed_subvol(inode_t *inode, xlator_t *this, loc_t *loc)
+{
+    char *path = NULL;
+    loc_t populate_loc = {
+        0,
+    };
+    char *name = NULL;
+    xlator_t *hash_subvol = NULL;
+
+    if (!inode)
+        return hash_subvol;
+
+    if (loc && loc->parent && loc->path) {
+        if (!loc->name) {
+            name = strrchr(loc->path, '/');
+            if (name) {
+                loc->name = name + 1;
+            } else {
+                goto out;
+            }
+        }
+        hash_subvol = dht_subvol_get_hashed(this, loc);
+        goto out;
+    }
 
- 		local->op_ret = 0;
- 		if (local->xattr == NULL)
- 			local->xattr = dict_ref (xattr);
- 		if (local->inode == NULL)
- 			local->inode = inode_ref (inode);
+    if (!gf_uuid_is_null(inode->gfid)) {
+        populate_loc.inode = inode_ref(inode);
+        populate_loc.parent = inode_parent(populate_loc.inode, NULL, NULL);
+        inode_path(populate_loc.inode, NULL, &path);
 
-		dht_stat_merge (this, &local->stbuf, stbuf, prev->this);
+        if (!path)
+            goto out;
 
-		if (prev->this == local->hashed_subvol)
-			local->st_ino = local->stbuf.st_ino;
+        populate_loc.path = path;
+        if (!populate_loc.name && populate_loc.path) {
+            name = strrchr(populate_loc.path, '/');
+            if (name) {
+                populate_loc.name = name + 1;
 
+            } else {
+                goto out;
+            }
         }
-unlock:
-        UNLOCK (&frame->lock);
+        hash_subvol = dht_subvol_get_hashed(this, &populate_loc);
+    }
+out:
+    if (populate_loc.inode)
+        loc_wipe(&populate_loc);
+    return hash_subvol;
+}
 
+/* Common function called by the revalidate and selfheal code paths to set
+   the internal MDS xattr when it is not present yet. A non-zero
+   mark_during_fresh_lookup (as passed by dht_discover_complete) winds the
+   setxattr on a newly created frame, in parallel with the caller; zero
+   winds it serially on the caller's frame, and the callback then continues
+   with dht_selfheal_dir_setattr().
+   The call is wound serially during a fresh lookup because the directory is
+   not discovered yet at that point, whereas at revalidate time it already
+   is. So a revalidate can race with the setxattr code path and get into
+   spurious heals because of an ongoing setxattr.
+   This can slow down revalidates if healing happens in the foreground.
+   However, if healing happens in the background, there is no direct
+   performance penalty.
+*/
+int
+dht_common_mark_mdsxattr(call_frame_t *frame, int *errst,
+                         int mark_during_fresh_lookup)
+{
+    dht_local_t *local = NULL;
+    xlator_t *this = NULL;
+    xlator_t *hashed_subvol = NULL;
+    int ret = 0;
+    int i = 0;
+    dict_t *xattrs = NULL;
+    char gfid_local[GF_UUID_BUF_SIZE] = {
+        0,
+    };
+    int32_t zero[1] = {0};
+    dht_conf_t *conf = 0;
+    dht_layout_t *layout = NULL;
+    dht_local_t *copy_local = NULL;
+    call_frame_t *xattr_frame = NULL;
+    gf_boolean_t vol_down = _gf_false;
+
+    GF_VALIDATE_OR_GOTO("dht", frame, out);
+    this = frame->this;
+    GF_VALIDATE_OR_GOTO("dht", this, out);
+    GF_VALIDATE_OR_GOTO(this->name, frame->local, out);
+    GF_VALIDATE_OR_GOTO(this->name, this->private, out);
+
+    local = frame->local;
+    conf = this->private;
+    layout = local->selfheal.layout;
+    local->mds_heal_fresh_lookup = mark_during_fresh_lookup;
+
+    gf_uuid_unparse(local->gfid, gfid_local);
+
+    /* Code to update hashed subvol consider as a mds subvol
+       and wind a setxattr call on hashed subvol to update
+       internal xattr
+    */
+    if (!local->xattr || !dict_get(local->xattr, conf->mds_xattr_key)) {
+        /* It means no internal MDS xattr has been set yet
+         */
+        /* Check the status of all subvol are up while call
+           this function call by lookup code path
+        */
+        if (mark_during_fresh_lookup) {
+            for (i = 0; i < conf->subvolume_cnt; i++) {
+                if (!conf->subvolume_status[i]) {
+                    vol_down = _gf_true;
+                    break;
+                }
+            }
+            if (vol_down) {
+                gf_msg_debug(this->name, 0,
+                             "subvol %s is down. Unable to "
+                             " save mds subvol on inode for "
+                             " path %s gfid is %s ",
+                             conf->subvolumes[i]->name, local->loc.path,
+                             gfid_local);
+                goto out;
+            }
+        }
 
-        this_call_cnt = dht_frame_return (frame);
+        /* Calculate hashed subvol based on inode and parent node
+         */
+        hashed_subvol = dht_inode_get_hashed_subvol(local->inode, this,
+                                                    &local->loc);
+        if (!hashed_subvol) {
+            gf_msg(this->name, GF_LOG_DEBUG, 0,
+                   DHT_MSG_HASHED_SUBVOL_GET_FAILED,
+                   "Failed to get hashed subvol for path %s"
+                   "gfid is %s ",
+                   local->loc.path, gfid_local);
+            if (errst)
+                (*errst) = 1;
+            ret = -1;
+            goto out;
+        }
+        xattrs = dict_new();
+        if (!xattrs) {
+            gf_msg(this->name, GF_LOG_ERROR, ENOMEM, DHT_MSG_NO_MEMORY,
+                   "dict_new failed");
+            ret = -1;
+            goto out;
+        }
+        /* Add internal MDS xattr on disk for hashed subvol
+         */
+        ret = dht_dict_set_array(xattrs, conf->mds_xattr_key, zero, 1);
+        if (ret) {
+            gf_msg(this->name, GF_LOG_WARNING, ENOMEM, DHT_MSG_DICT_SET_FAILED,
+                   "Failed to set dictionary"
+                   "  value:key = %s for "
+                   "path %s",
+                   conf->mds_xattr_key, local->loc.path);
+            ret = -1;
+            goto out;
+        }
+        /* A new frame is created only when this function is called from
+           the revalidate_cbk code path: winding the call in parallel
+           needs a frame of its own
+        */
+        if (mark_during_fresh_lookup) {
+            xattr_frame = create_frame(this, this->ctx->pool);
+            if (!xattr_frame) {
+                ret = -1;
+                goto out;
+            }
+            copy_local = dht_local_init(xattr_frame, &(local->loc), NULL, 0);
+            if (!copy_local) {
+                ret = -1;
+                DHT_STACK_DESTROY(xattr_frame);
+                goto out;
+            }
+            copy_local->stbuf = local->stbuf;
+            copy_local->mds_heal_fresh_lookup = mark_during_fresh_lookup;
+            if (!copy_local->inode)
+                copy_local->inode = inode_ref(local->inode);
+            gf_uuid_copy(copy_local->loc.gfid, local->gfid);
+            FRAME_SU_DO(xattr_frame, dht_local_t);
+            STACK_WIND_COOKIE(xattr_frame, dht_common_mark_mdsxattr_cbk,
+                              hashed_subvol, hashed_subvol,
+                              hashed_subvol->fops->setxattr, &local->loc,
+                              xattrs, 0, NULL);
+        } else {
+            STACK_WIND_COOKIE(frame, dht_common_mark_mdsxattr_cbk,
+                              (void *)hashed_subvol, hashed_subvol,
+                              hashed_subvol->fops->setxattr, &local->loc,
+                              xattrs, 0, NULL);
+        }
+    } else {
+        gf_msg_debug(this->name, 0,
+                     "internal xattr %s is present on subvol"
+                     "on path %s gfid is %s ",
+                     conf->mds_xattr_key, local->loc.path, gfid_local);
+        if (!mark_during_fresh_lookup)
+            dht_selfheal_dir_setattr(frame, &local->loc, &local->stbuf,
+                                     0xffffffff, layout);
+    }
 
-        if (is_last_call (this_call_cnt)) {
-		if (local->op_ret == 0) {
-			ret = dht_layout_normalize (this, &local->loc, layout);
+out:
+    if (xattrs)
+        dict_unref(xattrs);
+    return ret;
+}
 
-			local->layout = NULL;
+/* Copy the `size` int32 values stored under key in dict into value[],
+ * converting each from network to host byte order.
+ * Returns 0, or -1 if any decoded value is negative. If dict is NULL, the
+ * key lookup fails, or the stored length is not size * sizeof(int32_t),
+ * *errst is set to -1 and -EINVAL or the lookup error is returned instead.
+ */
+static int32_t
+dht_dict_get_array(dict_t *dict, char *key, int32_t value[], int32_t size,
+                   int *errst)
+{
+    int32_t *stored = NULL;
+    void *ptr = NULL;
+    int32_t len = -1;
+    int32_t err = -EINVAL;
+    int32_t i = 0;
+    int ret = 0;
+
+    if (dict != NULL)
+        err = dict_get_ptr_and_len(dict, key, &ptr, &len);
+    if (err == 0 && len != (size * sizeof(int32_t)))
+        err = -EINVAL;
+    if (err != 0) {
+        (*errst) = -1;
+        return err;
+    }
+
+    stored = ptr;
+    /* every entry is decoded; a negative one only changes the return value */
+    for (i = 0; i < size; i++) {
+        value[i] = ntoh32(stored[i]);
+        if (value[i] < 0)
+            ret = -1;
+    }
+
+    return ret;
+}
 
-			if (ret != 0) {
-				layout->gen = conf->gen;
+static int
+dht_discover_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int op_ret,
+                 int op_errno, inode_t *inode, struct iatt *stbuf,
+                 dict_t *xattr, struct iatt *postparent)
+{
+    dht_local_t *local = NULL;
+    int this_call_cnt = 0;
+    xlator_t *prev = NULL;
+    dht_layout_t *layout = NULL;
+    int ret = -1;
+    int is_dir = 0;
+    int32_t check_mds = 0;
+    int is_linkfile = 0;
+    int attempt_unwind = 0;
+    dht_conf_t *conf = 0;
+    char gfid_local[GF_UUID_BUF_SIZE] = {0};
+    char gfid_node[GF_UUID_BUF_SIZE] = {0};
+    int32_t mds_xattr_val[1] = {0};
+    int errst = 0;
+
+    GF_VALIDATE_OR_GOTO("dht", frame, out);
+    GF_VALIDATE_OR_GOTO("dht", this, out);
+    GF_VALIDATE_OR_GOTO("dht", frame->local, out);
+    GF_VALIDATE_OR_GOTO("dht", this->private, out);
+    GF_VALIDATE_OR_GOTO("dht", cookie, out);
+
+    local = frame->local;
+    prev = cookie;
+    conf = this->private;
+
+    layout = local->layout;
+
+    /* Check if the gfid is different for file from other node */
+    if (!op_ret && gf_uuid_compare(local->gfid, stbuf->ia_gfid)) {
+        gf_uuid_unparse(stbuf->ia_gfid, gfid_node);
+        gf_uuid_unparse(local->gfid, gfid_local);
+
+        gf_msg(this->name, GF_LOG_WARNING, 0, DHT_MSG_GFID_MISMATCH,
+               "%s: gfid different on %s, gfid local = %s"
+               "gfid other = %s",
+               local->loc.path, prev->name, gfid_local, gfid_node);
+    }
+
+    LOCK(&frame->lock);
+    {
+        /* TODO: assert equal mode on stbuf->st_mode and
+           local->stbuf->st_mode
+
+           else mkdir/chmod/chown and fix
+        */
+
+        ret = dht_layout_merge(this, layout, prev, op_ret, op_errno, xattr);
+        if (ret)
+            gf_msg(this->name, GF_LOG_WARNING, 0, DHT_MSG_LAYOUT_MERGE_FAILED,
+                   "%s: failed to merge layouts for subvol %s", local->loc.path,
+                   prev->name);
 
-				gf_log (this->name, GF_LOG_WARNING,
-					"fixing assignment on %s",
-					local->loc.path);
-				goto selfheal;
-			}
-			
-			inode_ctx_put (local->inode, this, (uint64_t)(long)layout);
-			
-			if (local->st_ino) {
-				local->stbuf.st_ino = local->st_ino;
-			} else {
-				gf_log (this->name, GF_LOG_WARNING,
-					"could not find hashed subvolume for %s",
-					local->loc.path);
-			}
-		}
+        if (op_ret == -1) {
+            local->op_errno = op_errno;
+            gf_msg_debug(this->name, op_errno,
+                         "lookup of %s on %s returned error", local->loc.path,
+                         prev->name);
 
-		DHT_STACK_UNWIND (frame, local->op_ret, local->op_errno,
-				  local->inode, &local->stbuf, local->xattr);
+            goto unlock;
         }
 
-	return 0;
+        is_linkfile = check_is_linkfile(inode, stbuf, xattr,
+                                        conf->link_xattr_name);
+        is_dir = check_is_dir(inode, stbuf, xattr);
 
-selfheal:
-	ret = dht_selfheal_directory (frame, dht_lookup_selfheal_cbk,
-				      &local->loc, layout);
+        if (is_dir) {
+            local->dir_count++;
+        } else {
+            local->file_count++;
+
+            if (!is_linkfile && !local->cached_subvol) {
+                /* real file */
+                /* Ok, we somehow managed to find a file on
+                 * more than one subvol. ignore this or we
+                 * will end up overwriting information while a
+                 * a thread is potentially unwinding from
+                 * dht_discover_complete
+                 */
+                local->cached_subvol = prev;
+                attempt_unwind = 1;
+            } else {
+                goto unlock;
+            }
+        }
 
-	return 0;
-}
+        local->op_ret = 0;
 
-int
-dht_revalidate_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
-                    int op_ret, int op_errno,
-                    inode_t *inode, struct stat *stbuf, dict_t *xattr)
-{
-        dht_local_t  *local         = NULL;
-        int           this_call_cnt = 0;
-        call_frame_t *prev          = NULL;
-	dht_layout_t *layout        = NULL;
-	int           ret  = -1;
-	int           is_dir = 0;
-	int           is_linkfile = 0;
+        if (local->xattr == NULL) {
+            local->xattr = dict_ref(xattr);
+        } else {
+            /* Don't aggregate for files. See BZ#1484709 */
+            if (is_dir)
+                dht_aggregate_xattr(local->xattr, xattr);
+        }
 
-        local = frame->local;
-        prev  = cookie;
-
-        LOCK (&frame->lock);
-        {
-		if (op_ret == -1) {
-			local->op_errno = op_errno;
-
-			if (op_errno != ENOTCONN && op_errno != ENOENT) {
-				gf_log (this->name, GF_LOG_WARNING,
-					"subvolume %s returned -1 (%s)",
-					prev->this->name, strerror (op_errno));
-			}
-
-			goto unlock;
-		}
-
-		if (S_IFMT & (stbuf->st_mode ^ local->inode->st_mode)) {
-			gf_log (this->name, GF_LOG_WARNING,
-				"mismatching filetypes 0%o v/s 0%o for %s",
-				(stbuf->st_mode & S_IFMT),
-				(local->inode->st_mode & S_IFMT),
-				local->loc.path);
-
-			local->op_ret = -1;
-			local->op_errno = EINVAL;
-
-			goto unlock;
-		}
-
-		layout = dht_layout_get (this, inode);
-		
-		is_dir = check_is_dir (inode, stbuf, xattr);
-		is_linkfile = check_is_linkfile (inode, stbuf, xattr);
-		
-		if (is_linkfile) {
-			gf_log (this->name, GF_LOG_WARNING,
-				"linkfile found in revalidate for %s",
-				local->loc.path);
-			local->layout_mismatch = 1;
-
-			goto unlock;
-		}
-
-		if (is_dir) {
-			ret = dht_layout_dir_mismatch (this, layout,
-						       prev->this, &local->loc,
-						       xattr);
-			if (ret != 0) {
-				gf_log (this->name, GF_LOG_WARNING,
-					"mismatching layouts for %s", 
-					local->loc.path);
-			
-				local->layout_mismatch = 1;
-
-				goto unlock;
-			}
-		} 
-		
-		dht_stat_merge (this, &local->stbuf, stbuf, prev->this);
-		
-		local->op_ret = 0;
-		local->stbuf.st_ino = local->st_ino;
-
-		if (!local->xattr)
-			local->xattr = dict_ref (xattr);
-	}
+        if (local->inode == NULL)
+            local->inode = inode_ref(inode);
+
+        dht_iatt_merge(this, &local->stbuf, stbuf);
+        dht_iatt_merge(this, &local->postparent, postparent);
+
+        if (!dict_get(xattr, conf->mds_xattr_key)) {
+            goto unlock;
+        } else {
+            gf_msg_debug(this->name, 0,
+                         "internal xattr %s is present on subvol"
+                         "on path %s gfid is %s ",
+                         conf->mds_xattr_key, local->loc.path, gfid_local);
+        }
+        check_mds = dht_dict_get_array(xattr, conf->mds_xattr_key,
+                                       mds_xattr_val, 1, &errst);
+        /* save mds subvol on inode ctx */
+        ret = dht_inode_ctx_mdsvol_set(local->inode, this, prev);
+        if (ret) {
+            gf_msg(this->name, GF_LOG_ERROR, 0, DHT_MSG_SET_INODE_CTX_FAILED,
+                   "Failed to set hashed subvol for %s vol is %s",
+                   local->loc.path, prev->name);
+        }
+
+        if ((check_mds < 0) && !errst) {
+            local->mds_xattr = dict_ref(xattr);
+            gf_msg_debug(this->name, 0,
+                         "Value of %s is not zero on mds subvol"
+                         "so xattr needs to be healed on non mds"
+                         " path is %s and vol name is %s "
+                         " gfid is %s",
+                         conf->mds_xattr_key, local->loc.path, prev->name,
+                         gfid_local);
+            local->need_xattr_heal = 1;
+            local->mds_subvol = prev;
+        }
+    }
 unlock:
-	UNLOCK (&frame->lock);
-
-        this_call_cnt = dht_frame_return (frame);
-
-        if (is_last_call (this_call_cnt)) {
-		if (!S_ISDIR (local->stbuf.st_mode)
-		    && (local->hashed_subvol != local->cached_subvol)
-		    && (local->stbuf.st_nlink == 1))
-			local->stbuf.st_mode |= S_ISVTX;
-		
-		if (local->layout_mismatch) {
-			local->op_ret = -1;
-			local->op_errno = ESTALE;
-		}
-			
-		DHT_STACK_UNWIND (frame, local->op_ret, local->op_errno,
-				  local->inode, &local->stbuf, local->xattr);
-	}
+    UNLOCK(&frame->lock);
+out:
+    /* Make sure, the thread executing dht_discover_complete is the one
+     * which calls STACK_DESTROY (frame). In the case of "attempt_unwind",
+     * this makes sure that the thread don't call dht_frame_return, till
+     * call to dht_discover_complete is done.
+     */
+    if (attempt_unwind) {
+        dht_discover_complete(this, frame);
+    }
 
-        return 0;
+    this_call_cnt = dht_frame_return(frame);
+
+    if (is_last_call(this_call_cnt) && !attempt_unwind) {
+        dht_discover_complete(this, frame);
+    }
+
+    if (is_last_call(this_call_cnt))
+        DHT_STACK_DESTROY(frame);
+
+    return 0;
+}
+
+/* Add to xattr_req the keys requested when looking up a file: the linkto
+ * xattr (conf->link_xattr_name) and GLUSTERFS_OPEN_FD_COUNT.
+ *
+ * loc is only used for its path in the warning messages.
+ *
+ * Returns 0 on success, -EINVAL if this->private or xattr_req is NULL,
+ * otherwise the non-zero dict_set_uint32() result.
+ */
+static int
+dht_set_file_xattr_req(xlator_t *this, loc_t *loc, dict_t *xattr_req)
+{
+    dht_conf_t *conf = this->private;
+    int ret = 0;
+
+    if (!conf || !xattr_req)
+        return -EINVAL;
+
+    /* Used to check whether this is a linkto file.
+     */
+    ret = dict_set_uint32(xattr_req, conf->link_xattr_name, 256);
+    if (ret < 0) {
+        gf_msg(this->name, GF_LOG_WARNING, ENOMEM, DHT_MSG_DICT_SET_FAILED,
+               "Failed to set dictionary value:key = %s for "
+               "path %s",
+               conf->link_xattr_name, loc->path);
+        return ret;
+    }
+
+    /* This is used to make sure we don't unlink linkto files
+     * which are the target of an ongoing file migration.
+     */
+    ret = dict_set_uint32(xattr_req, GLUSTERFS_OPEN_FD_COUNT, 4);
+    if (ret) {
+        gf_msg(this->name, GF_LOG_WARNING, ENOMEM, DHT_MSG_DICT_SET_FAILED,
+               "Failed to set dictionary value:key = %s for "
+               "path %s",
+               GLUSTERFS_OPEN_FD_COUNT, loc->path);
+        return ret;
+    }
+
+    return 0;
 }
 
+/* This is a gfid based nameless lookup. Without a name, the hashed subvol
+ * cannot be calculated, so the lookup is wound to all subvols on a copy of
+ * frame (replies go to dht_discover_cbk); setup errors unwind frame. */
+static int
+dht_do_discover(call_frame_t *frame, xlator_t *this, loc_t *loc)
+{
+    int ret;
+    dht_local_t *local = NULL;
+    dht_conf_t *conf = NULL;
+    int call_cnt = 0;
+    int op_errno = EINVAL;
+    int i = 0;
+    call_frame_t *discover_frame = NULL;
+
+    conf = this->private;
+    local = frame->local;
+
+    /* As we do not know if this is a file or directory, request
+     * both file and directory xattrs
+     */
+    ret = dht_set_file_xattr_req(this, loc, local->xattr_req);
+    if (ret) {
+        goto err;
+    }
+
+    ret = dht_set_dir_xattr_req(this, loc, local->xattr_req);
+    if (ret) {
+        goto err;
+    }
+
+    if (loc_is_root(loc)) {
+        /* Request the DHT commit hash xattr (trusted.glusterfs.dht.commithash)
+         * set on the brick root. NOTE(review): ret is not checked here.
+         */
+        ret = dict_set_uint32(local->xattr_req, conf->commithash_xattr_name,
+                              sizeof(uint32_t));
+    }
+
+    call_cnt = conf->subvolume_cnt;
+    local->call_cnt = call_cnt;
+
+    local->layout = dht_layout_new(this, conf->subvolume_cnt);
+
+    if (!local->layout) {
+        op_errno = ENOMEM;
+        goto err;
+    }
+
+    gf_uuid_copy(local->gfid, loc->gfid);
+
+    discover_frame = copy_frame(frame);
+    if (!discover_frame) {
+        op_errno = ENOMEM;
+        goto err;
+    }
+
+    discover_frame->local = local;
+    frame->local = NULL; /* local now hangs off discover_frame only */
+    local->main_frame = frame;
+
+    for (i = 0; i < call_cnt; i++) {
+        STACK_WIND_COOKIE(discover_frame, dht_discover_cbk, conf->subvolumes[i],
+                          conf->subvolumes[i],
+                          conf->subvolumes[i]->fops->lookup, &local->loc,
+                          local->xattr_req);
+    }
+
+    return 0;
+
+err:
+    DHT_STACK_UNWIND(lookup, frame, -1, op_errno, NULL, NULL, NULL, NULL);
+
+    return 0;
+}
 
+/* Start a synctask (dht_dir_heal_xattrs, completion handler
+ * dht_dir_heal_xattrs_done) to heal custom xattrs from the hashed subvol
+ * to the non hashed subvols.
+ *
+ * The task runs on a newly created frame whose local is initialised from
+ * local->loc and carries a copy of local->stbuf, local->gfid and
+ * local->mds_subvol.
+ *
+ * Returns the result of synctask_new() (0 on success). On every failure a
+ * non-zero value is returned and *op_errno is set: EIO when local->gfid is
+ * null, ENOMEM when the frame, its local or the synctask cannot be created.
+ */
 int
-dht_lookup_linkfile_create_cbk (call_frame_t *frame, void *cookie,
-				xlator_t *this,
-				int32_t op_ret, int32_t op_errno,
-				inode_t *inode, struct stat *stbuf)
-{
-	dht_local_t  *local = NULL;
-	dht_layout_t *layout = NULL;
-	xlator_t     *cached_subvol = NULL;
-
-	local = frame->local;
-	cached_subvol = local->cached_subvol;
-
-	layout = dht_layout_for_subvol (this, local->cached_subvol);
-	if (!layout) {
-		gf_log (this->name, GF_LOG_ERROR,
-			"no pre-set layout for subvolume %s",
-			cached_subvol ? cached_subvol->name : "<nil>");
-		local->op_ret = -1;
-		local->op_errno = EINVAL;
-		goto unwind;
-	}
-
-	inode_ctx_put (local->inode, this, (uint64_t)(long)layout);
-	local->op_ret = 0;
-	if (local->stbuf.st_nlink == 1)
-		local->stbuf.st_mode |= S_ISVTX;
+dht_dir_xattr_heal(xlator_t *this, dht_local_t *local, int *op_errno)
+{
+    dht_local_t *copy_local = NULL;
+    call_frame_t *copy = NULL;
+    int ret = -1;
+    char gfid_local[GF_UUID_BUF_SIZE] = {0};
+
+    if (gf_uuid_is_null(local->gfid)) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, DHT_MSG_DIR_XATTR_HEAL_FAILED,
+               "No gfid exists for path %s "
+               "so healing xattr is not possible",
+               local->loc.path);
+        *op_errno = EIO;
+        goto out;
+    }
+
+    gf_uuid_unparse(local->gfid, gfid_local);
+
+    copy = create_frame(this, this->ctx->pool);
+    if (!copy) {
+        /* Previously this path returned -1 without touching *op_errno,
+         * leaving the caller with an unspecified error code.
+         */
+        gf_msg(this->name, GF_LOG_ERROR, ENOMEM,
+               DHT_MSG_DIR_XATTR_HEAL_FAILED,
+               "Frame creation failed "
+               "for path %s gfid %s ",
+               local->loc.path, gfid_local);
+        *op_errno = ENOMEM;
+        goto out;
+    }
+
+    copy_local = dht_local_init(copy, &(local->loc), NULL, 0);
+    if (!copy_local) {
+        gf_msg(this->name, GF_LOG_ERROR, ENOMEM,
+               DHT_MSG_DIR_XATTR_HEAL_FAILED,
+               "Memory allocation failed "
+               "for path %s gfid %s ",
+               local->loc.path, gfid_local);
+        *op_errno = ENOMEM;
+        DHT_STACK_DESTROY(copy);
+        goto out;
+    }
+
+    /* Give the heal task everything it needs from the caller's local. */
+    copy_local->stbuf = local->stbuf;
+    gf_uuid_copy(copy_local->loc.gfid, local->gfid);
+    copy_local->mds_subvol = local->mds_subvol;
+    FRAME_SU_DO(copy, dht_local_t);
+
+    ret = synctask_new(this->ctx->env, dht_dir_heal_xattrs,
+                       dht_dir_heal_xattrs_done, copy, copy);
+    if (ret) {
+        gf_msg(this->name, GF_LOG_ERROR, ENOMEM,
+               DHT_MSG_DIR_XATTR_HEAL_FAILED,
+               "Synctask creation failed to heal xattr "
+               "for path %s gfid %s ",
+               local->loc.path, gfid_local);
+        *op_errno = ENOMEM;
+        DHT_STACK_DESTROY(copy);
+    }
+out:
+    return ret;
+}
 
-unwind:
-	DHT_STACK_UNWIND (frame, local->op_ret, local->op_errno,
-			  local->inode, &local->stbuf, local->xattr);
-	return 0;
+/* Report whether the directory held in frame->local has to be self-healed.
+ *
+ * Returns 1 when any of the need_attrheal / need_xattr_heal / need_selfheal
+ * flags is set on the local, or when dht_layout_normalize() returns non-zero
+ * for the local's layout; returns 0 otherwise. The layout is normalized on
+ * every call, whatever the flags say.
+ */
+static int
+dht_needs_selfheal(call_frame_t *frame, xlator_t *this)
+{
+    dht_local_t *local = frame->local;
+    int heal_required = (local->need_attrheal || local->need_xattr_heal ||
+                         local->need_selfheal);
+
+    if (dht_layout_normalize(this, &local->loc, local->layout) != 0) {
+        gf_msg_debug(this->name, 0, "fixing assignment on %s", local->loc.path);
+        heal_required = 1;
+    }
+
+    return heal_required;
 }
 
+/* Compare two permission sets field by field.
+ *
+ * Returns 1 if any of the owner/group/other read, write or exec bits, or
+ * the suid, sgid or sticky bits differ between @prot1 and @prot2, and 0 if
+ * all of them match.
+ */
+static int
+is_permission_different(ia_prot_t *prot1, ia_prot_t *prot2)
+{
+    /* owner bits */
+    if ((prot1->owner.read != prot2->owner.read) ||
+        (prot1->owner.write != prot2->owner.write) ||
+        (prot1->owner.exec != prot2->owner.exec))
+        return 1;
+
+    /* group bits */
+    if ((prot1->group.read != prot2->group.read) ||
+        (prot1->group.write != prot2->group.write) ||
+        (prot1->group.exec != prot2->group.exec))
+        return 1;
+
+    /* other bits */
+    if ((prot1->other.read != prot2->other.read) ||
+        (prot1->other.write != prot2->other.write) ||
+        (prot1->other.exec != prot2->other.exec))
+        return 1;
+
+    /* special bits */
+    if ((prot1->suid != prot2->suid) || (prot1->sgid != prot2->sgid) ||
+        (prot1->sticky != prot2->sticky))
+        return 1;
+
+    return 0;
+}
 
 int
-dht_lookup_everywhere_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
-			   int32_t op_ret, int32_t op_errno,
-			   inode_t *inode, struct stat *buf, dict_t *xattr)
-{
-	dht_conf_t   *conf          = NULL;
-        dht_local_t  *local         = NULL;
-        int           this_call_cnt = 0;
-        call_frame_t *prev          = NULL;
-	int           is_linkfile   = 0;
-	int           is_dir        = 0;
-	xlator_t     *subvol        = NULL;
-	loc_t        *loc           = NULL;
-	xlator_t     *link_subvol   = NULL;
-	xlator_t     *hashed_subvol = NULL;
-	xlator_t     *cached_subvol = NULL;
-
-	conf   = this->private;
-
-	local  = frame->local;
-	loc    = &local->loc;
-
-	prev   = cookie;
-	subvol = prev->this;
-
-	LOCK (&frame->lock);
-	{
-		if (op_ret == -1) {
-			if (op_errno != ENOENT)
-				local->op_errno = op_errno;
-			goto unlock;
-		}
-
-		is_linkfile = check_is_linkfile (inode, buf, xattr);
-		is_dir = check_is_dir (inode, buf, xattr);
-
-		if (is_linkfile) {
-			link_subvol = dht_linkfile_subvol (this, inode, buf,
-							   xattr);
-			gf_log (this->name, GF_LOG_DEBUG,
-				"found on %s linkfile %s (-> %s)",
-				subvol->name, loc->path,
-				link_subvol ? link_subvol->name : "''");
-			goto unlock;
-		} else {
-			gf_log (this->name, GF_LOG_DEBUG,
-				"found on %s file %s",
-				subvol->name, loc->path);
-		}
-
-		if (!local->cached_subvol) {
-			/* found one file */
-			dht_stat_merge (this, &local->stbuf, buf, subvol);
-			local->xattr = dict_ref (xattr);
-			local->cached_subvol = subvol;
-		} else {
-			gf_log (this->name, GF_LOG_WARNING,
-				"multiple subvolumes (%s and %s atleast) have "
-				"file %s", local->cached_subvol->name,
-				subvol->name, local->loc.path);
-		}
-	}
-unlock:
-	UNLOCK (&frame->lock);
+dht_lookup_dir_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+                   int op_ret, int op_errno, inode_t *inode, struct iatt *stbuf,
+                   dict_t *xattr, struct iatt *postparent)
+{
+    dht_local_t *local = NULL;
+    dht_conf_t *conf = NULL;
+    int this_call_cnt = 0;
+    xlator_t *prev = NULL;
+    dht_layout_t *layout = NULL;
+    int ret = -1;
+    int is_dir = 0;
+    int32_t check_mds = 0;
+    int errst = 0;
+    char gfid_local[GF_UUID_BUF_SIZE] = {0};
+    char gfid_node[GF_UUID_BUF_SIZE] = {0};
+    int32_t mds_xattr_val[1] = {0};
+
+    GF_VALIDATE_OR_GOTO("dht", frame, out);
+    GF_VALIDATE_OR_GOTO("dht", this, out);
+    GF_VALIDATE_OR_GOTO("dht", frame->local, out);
+    GF_VALIDATE_OR_GOTO("dht", this->private, out);
+    GF_VALIDATE_OR_GOTO("dht", cookie, out);
+
+    local = frame->local;
+    prev = cookie;
+    conf = this->private;
+
+    layout = local->layout;
+    gf_msg_debug(this->name, op_errno,
+                 "%s: lookup on %s returned with op_ret = %d, op_errno = %d",
+                 local->loc.path, prev->name, op_ret, op_errno);
+
+    /* The first successful lookup*/
+    if (!op_ret && gf_uuid_is_null(local->gfid)) {
+        memcpy(local->gfid, stbuf->ia_gfid, 16);
+    }
+    if (!gf_uuid_is_null(local->gfid)) {
+        gf_uuid_unparse(local->gfid, gfid_local);
+    }
+
+    /* Check if the gfid is different for file from other node */
+    if (!op_ret && gf_uuid_compare(local->gfid, stbuf->ia_gfid)) {
+        gf_uuid_unparse(stbuf->ia_gfid, gfid_node);
+
+        gf_msg(this->name, GF_LOG_WARNING, 0, DHT_MSG_GFID_MISMATCH,
+               "%s: gfid different on %s."
+               " gfid local = %s, gfid subvol = %s",
+               local->loc.path, prev->name, gfid_local, gfid_node);
+    }
+
+    LOCK(&frame->lock);
+    {
+        /* TODO: assert equal mode on stbuf->st_mode and
+           local->stbuf->st_mode
+           else mkdir/chmod/chown and fix
+        */
+        ret = dht_layout_merge(this, layout, prev, op_ret, op_errno, xattr);
 
-	if (is_linkfile) {
-		gf_log (this->name, GF_LOG_WARNING,
-			"deleting stale linkfile %s on %s",
-			loc->path, subvol->name);
-		dht_linkfile_unlink (frame, this, subvol, loc);
-	}
+        if (op_ret == -1) {
+            local->op_errno = op_errno;
 
-	this_call_cnt = dht_frame_return (frame);
-	if (is_last_call (this_call_cnt)) {
-		hashed_subvol = local->hashed_subvol;
-		cached_subvol = local->cached_subvol;
+            /* The GFID is missing on this subvol. Force a heal. */
+            if (op_errno == ENODATA) {
+                local->need_lookup_everywhere = 1;
+            }
+            goto unlock;
+        }
 
-		if (!cached_subvol) {
-			DHT_STACK_UNWIND (frame, -1, ENOENT, NULL, NULL, NULL);
-			return 0;
-		}
+        is_dir = check_is_dir(inode, stbuf, xattr);
+        if (!is_dir) {
+            gf_msg_debug(this->name, 0,
+                         "%s: lookup on %s returned non dir 0%o"
+                         "calling lookup_everywhere",
+                         local->loc.path, prev->name, stbuf->ia_type);
 
-		gf_log (this->name, GF_LOG_WARNING,
-			"linking file %s existing on %s to %s (hash)",
-			loc->path, cached_subvol->name, hashed_subvol->name);
+            local->need_lookup_everywhere = 1;
+            goto unlock;
+        }
 
-		dht_linkfile_create (frame, dht_lookup_linkfile_create_cbk,
-				     cached_subvol, hashed_subvol, loc);
-	}
+        local->op_ret = 0;
+        if (local->xattr == NULL) {
+            local->xattr = dict_ref(xattr);
+        } else {
+            dht_aggregate_xattr(local->xattr, xattr);
+        }
 
-	return 0;
-}
+        if (__is_root_gfid(stbuf->ia_gfid)) {
+            ret = dht_dir_has_layout(xattr, conf->xattr_name);
+            if (ret >= 0) {
+                if (is_greater_time(local->prebuf.ia_ctime,
+                                    local->prebuf.ia_ctime_nsec,
+                                    stbuf->ia_ctime, stbuf->ia_ctime_nsec)) {
+                    /* Choose source */
+                    local->prebuf.ia_gid = stbuf->ia_gid;
+                    local->prebuf.ia_uid = stbuf->ia_uid;
+
+                    local->prebuf.ia_ctime = stbuf->ia_ctime;
+                    local->prebuf.ia_ctime_nsec = stbuf->ia_ctime_nsec;
+                    local->prebuf.ia_prot = stbuf->ia_prot;
+                }
+            }
+        }
 
+        if (local->stbuf.ia_type != IA_INVAL) {
+            /* This is not the first subvol to respond
+             * Compare values to see if attrs need to be healed
+             */
+            if ((local->stbuf.ia_gid != stbuf->ia_gid) ||
+                (local->stbuf.ia_uid != stbuf->ia_uid) ||
+                (is_permission_different(&local->stbuf.ia_prot,
+                                         &stbuf->ia_prot))) {
+                local->need_attrheal = 1;
+            }
+        }
 
-int
-dht_lookup_everywhere (call_frame_t *frame, xlator_t *this, loc_t *loc)
-{
-	dht_conf_t     *conf = NULL;
-	dht_local_t    *local = NULL;
-	int             i = 0;
-	int             call_cnt = 0;
+        if (local->inode == NULL)
+            local->inode = inode_ref(inode);
+
+        dht_iatt_merge(this, &local->stbuf, stbuf);
+        dht_iatt_merge(this, &local->postparent, postparent);
+
+        if (!dict_get(xattr, conf->mds_xattr_key)) {
+            gf_msg_debug(this->name, 0,
+                         "%s: mds xattr %s is not present "
+                         "on %s(gfid = %s)",
+                         local->loc.path, conf->mds_xattr_key, prev->name,
+                         gfid_local);
+            goto unlock;
+        }
+
+        /* Save the mds subvol info and stbuf. This is the value that will
+         * be used for healing
+         */
+        local->mds_subvol = prev;
+        local->mds_stbuf = *stbuf;
+
+        /* Save mds subvol on inode ctx */
+
+        ret = dht_inode_ctx_mdsvol_set(local->inode, this, prev);
+        if (ret) {
+            gf_msg(this->name, GF_LOG_ERROR, 0, DHT_MSG_SET_INODE_CTX_FAILED,
+                   "%s: Failed to set mds (%s)", local->loc.path, prev->name);
+        }
+        check_mds = dht_dict_get_array(xattr, conf->mds_xattr_key,
+                                       mds_xattr_val, 1, &errst);
+        if ((check_mds < 0) && !errst) {
+            /* Check if xattrs need to be healed on the directories */
+            local->mds_xattr = dict_ref(xattr);
+            gf_msg_debug(this->name, 0,
+                         "%s: %s is not zero on %s. Xattrs need to be healed."
+                         "(gfid = %s)",
+                         local->loc.path, conf->mds_xattr_key, prev->name,
+                         gfid_local);
+            local->need_xattr_heal = 1;
+        }
+    }
+
+unlock:
+    UNLOCK(&frame->lock);
+
+    this_call_cnt = dht_frame_return(frame);
+
+    if (is_last_call(this_call_cnt)) {
+        /* If the mds subvol is not set correctly*/
+        if (!__is_root_gfid(local->gfid) &&
+            (!dict_get(local->xattr, conf->mds_xattr_key))) {
+            local->need_selfheal = 1;
+        }
+
+        /* No need to call xattr heal code if volume count is 1
+         */
+        if (conf->subvolume_cnt == 1) {
+            local->need_xattr_heal = 0;
+        }
+
+        if (local->need_selfheal || local->need_lookup_everywhere) {
+            /* Set the gfid-req so posix will set the GFID*/
+            if (!gf_uuid_is_null(local->gfid)) {
+                /* Ok, this should _never_ happen */
+                ret = dict_set_static_bin(local->xattr_req, "gfid-req",
+                                          local->gfid, 16);
+            } else {
+                if (!gf_uuid_is_null(local->gfid_req))
+                    ret = dict_set_static_bin(local->xattr_req, "gfid-req",
+                                              local->gfid_req, 16);
+            }
+        }
+
+        if (local->need_lookup_everywhere) {
+            local->need_lookup_everywhere = 0;
+            dht_lookup_everywhere(frame, this, &local->loc);
+            return 0;
+        }
+
+        if (local->op_ret == 0) {
+            if (dht_needs_selfheal(frame, this)) {
+                goto selfheal;
+            }
+
+            dht_layout_set(this, local->inode, layout);
+            if (local->inode) {
+                dht_inode_ctx_time_update(local->inode, this, &local->stbuf, 1);
+            }
 
-	conf = this->private;
-	local = frame->local;
+            if (local->loc.parent) {
+                dht_inode_ctx_time_update(local->loc.parent, this,
+                                          &local->postparent, 1);
+            }
+        }
 
-	call_cnt = conf->subvolume_cnt;
-	local->call_cnt = call_cnt;
+        DHT_STRIP_PHASE1_FLAGS(&local->stbuf);
+        dht_set_fixed_dir_stat(&local->postparent);
+        /* Delete mds xattr at the time of STACK UNWIND */
+        if (local->xattr)
+            GF_REMOVE_INTERNAL_XATTR(conf->mds_xattr_key, local->xattr);
 
-	if (!local->inode)
-		local->inode = inode_ref (loc->inode);
+        DHT_STACK_UNWIND(lookup, frame, local->op_ret, local->op_errno,
+                         local->inode, &local->stbuf, local->xattr,
+                         &local->postparent);
+    }
 
-	for (i = 0; i < call_cnt; i++) {
-		STACK_WIND (frame, dht_lookup_everywhere_cbk,
-			    conf->subvolumes[i],
-			    conf->subvolumes[i]->fops->lookup,
-			    loc, local->xattr_req);
-	}
+    return 0;
 
-	return 0;
+selfheal:
+    FRAME_SU_DO(frame, dht_local_t);
+    ret = dht_selfheal_directory(frame, dht_lookup_selfheal_cbk, &local->loc,
+                                 layout);
+out:
+    return ret;
 }
 
+/* Wind a lookup on local->loc to every subvolume; the replies are handled
+ * by dht_lookup_dir_cbk.
+ *
+ * A fresh layout is allocated for the merge, any xattr dict left on the
+ * local is dropped, and a known gfid is passed down as "gfid-req" so that
+ * missing gfids can be healed.
+ *
+ * Always returns 0. On failure the lookup is unwound with op_ret -1 and
+ * op_errno EINVAL (invalid argument) or ENOMEM (layout allocation failed);
+ * if @frame itself is NULL nothing can be unwound.
+ */
+static int
+dht_lookup_directory(call_frame_t *frame, xlator_t *this, loc_t *loc)
+{
+    int call_cnt = 0;
+    int i = 0;
+    dht_conf_t *conf = NULL;
+    dht_local_t *local = NULL;
+    int ret = 0;
+    /* Validation failures are argument errors, not allocation failures. */
+    int op_errno = EINVAL;
+
+    GF_VALIDATE_OR_GOTO("dht", frame, out);
+    GF_VALIDATE_OR_GOTO("dht", this, unwind);
+    GF_VALIDATE_OR_GOTO("dht", frame->local, unwind);
+    GF_VALIDATE_OR_GOTO("dht", this->private, unwind);
+    GF_VALIDATE_OR_GOTO("dht", loc, unwind);
+
+    conf = this->private;
+    local = frame->local;
+
+    call_cnt = conf->subvolume_cnt;
+    local->call_cnt = call_cnt;
+
+    local->layout = dht_layout_new(this, conf->subvolume_cnt);
+    if (!local->layout) {
+        op_errno = ENOMEM;
+        goto unwind;
+    }
+
+    if (local->xattr != NULL) {
+        dict_unref(local->xattr);
+        local->xattr = NULL;
+    }
+
+    if (!gf_uuid_is_null(local->gfid)) {
+        /* use this gfid in order to heal any missing ones */
+        ret = dict_set_gfuuid(local->xattr_req, "gfid-req", local->gfid, true);
+        if (ret)
+            gf_msg(this->name, GF_LOG_WARNING, 0, DHT_MSG_DICT_SET_FAILED,
+                   "%s: Failed to set dictionary value:"
+                   " key = gfid-req",
+                   local->loc.path);
+    }
+
+    for (i = 0; i < call_cnt; i++) {
+        STACK_WIND_COOKIE(
+            frame, dht_lookup_dir_cbk, conf->subvolumes[i], conf->subvolumes[i],
+            conf->subvolumes[i]->fops->lookup, &local->loc, local->xattr_req);
+    }
+    return 0;
+unwind:
+    DHT_STACK_UNWIND(lookup, frame, -1, op_errno, NULL, NULL, NULL, NULL);
+out:
+    return 0;
+}
 
 int
-dht_lookup_linkfile_cbk (call_frame_t *frame, void *cookie,
-                         xlator_t *this, int op_ret, int op_errno,
-                         inode_t *inode, struct stat *stbuf, dict_t *xattr)
+dht_revalidate_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+                   int op_ret, int op_errno, inode_t *inode, struct iatt *stbuf,
+                   dict_t *xattr, struct iatt *postparent)
 {
-        call_frame_t *prev = NULL;
-	dht_local_t  *local = NULL;
-	dht_layout_t *layout = NULL;
-	xlator_t     *subvol = NULL;
-	loc_t        *loc = NULL;
+    dht_local_t *local = NULL;
+    int this_call_cnt = 0;
+    xlator_t *prev = NULL;
+    dht_layout_t *layout = NULL;
+    dht_conf_t *conf = NULL;
+    int ret = -1;
+    int is_dir = 0;
+    int is_linkfile = 0;
+    int follow_link = 0;
+    char gfid[GF_UUID_BUF_SIZE] = {0};
+    uint32_t vol_commit_hash = 0;
+    xlator_t *subvol = NULL;
+    int32_t check_mds = 0;
+    int errst = 0, i = 0;
+    int32_t mds_xattr_val[1] = {0};
+
+    GF_VALIDATE_OR_GOTO("dht", frame, err);
+    GF_VALIDATE_OR_GOTO("dht", this, err);
+    GF_VALIDATE_OR_GOTO("dht", frame->local, err);
+    GF_VALIDATE_OR_GOTO("dht", cookie, err);
+    GF_VALIDATE_OR_GOTO("dht", this->private, err);
+
+    local = frame->local;
+    prev = cookie;
+    conf = this->private;
+
+    if (!conf->vch_forced) {
+        /* Update the commithash value if available
+         */
+        ret = dict_get_uint32(xattr, conf->commithash_xattr_name,
+                              &vol_commit_hash);
+        if (ret == 0) {
+            conf->vol_commit_hash = vol_commit_hash;
+        }
+    }
 
-        prev   = cookie;
-	subvol = prev->this;
+    gf_uuid_unparse(local->loc.gfid, gfid);
 
-	local  = frame->local;
-	loc    = &local->loc;
+    gf_msg_debug(this->name, op_errno,
+                 "%s: revalidate lookup on %s returned op_ret %d",
+                 local->loc.path, prev->name, op_ret);
+
+    LOCK(&frame->lock);
+    {
+        if (gf_uuid_is_null(local->gfid)) {
+            memcpy(local->gfid, local->loc.gfid, 16);
+        }
 
         if (op_ret == -1) {
-		gf_log (this->name, GF_LOG_WARNING,
-			"lookup of %s on %s (following linkfile) failed (%s)",
-			local->loc.path, subvol->name, strerror (op_errno));
+            local->op_errno = op_errno;
+
+            if ((op_errno != ENOTCONN) && (op_errno != ENOENT) &&
+                (op_errno != ESTALE)) {
+                gf_msg(this->name, GF_LOG_INFO, op_errno,
+                       DHT_MSG_REVALIDATE_CBK_INFO,
+                       "Revalidate: subvolume %s for %s "
+                       "(gfid = %s) returned -1",
+                       prev->name, local->loc.path, gfid);
+            }
+            if (op_errno == ESTALE) {
+                /* propagate the ESTALE to parent.
+                 * setting local->return_estale would send
+                 * ESTALE to parent. */
+                local->return_estale = 1;
+            }
+
+            /* if it is ENOENT, we may have to do a
+             * 'lookup_everywhere()' to make sure
+             * the file is not migrated */
+            if (op_errno == ENOENT) {
+                if (IA_ISREG(local->loc.inode->ia_type)) {
+                    gf_msg_debug(this->name, 0,
+                                 "found ENOENT for %s. "
+                                 "Setting "
+                                 "need_lookup_everywhere"
+                                 " flag to 1",
+                                 local->loc.path);
+
+                    local->need_lookup_everywhere = 1;
+                } else if (IA_ISDIR(local->loc.inode->ia_type)) {
+                    layout = local->layout;
+                    for (i = 0; i < layout->cnt; i++) {
+                        if (layout->list[i].xlator == prev) {
+                            layout->list[i].err = op_errno;
+                            break;
+                        }
+                    }
+
+                    local->need_selfheal = 1;
+                }
+            }
+
+            /* The GFID is missing on this subvol. Lookup everywhere to force a
+             * gfid heal
+             */
+            if ((op_errno == ENODATA) &&
+                (IA_ISDIR(local->loc.inode->ia_type))) {
+                local->need_lookup_everywhere = 1;
+            }
+
+            goto unlock;
+        }
+
+        if ((!IA_ISINVAL(local->inode->ia_type)) &&
+            stbuf->ia_type != local->inode->ia_type) {
+            gf_msg(this->name, GF_LOG_WARNING, 0, DHT_MSG_FILE_TYPE_MISMATCH,
+                   "mismatching filetypes 0%o v/s 0%o for %s,"
+                   " gfid = %s",
+                   (stbuf->ia_type), (local->inode->ia_type), local->loc.path,
+                   gfid);
+
+            local->op_ret = -1;
+            local->op_errno = EINVAL;
+
+            goto unlock;
+        }
+
+        layout = local->layout;
+
+        is_dir = check_is_dir(inode, stbuf, xattr);
+        is_linkfile = check_is_linkfile(inode, stbuf, xattr,
+                                        conf->link_xattr_name);
+        if (is_linkfile) {
+            follow_link = 1;
+            goto unlock;
+        }
+        if (is_dir) {
+            ret = dht_dir_has_layout(xattr, conf->xattr_name);
+            if (ret >= 0) {
+                if (is_greater_time(local->prebuf.ia_ctime,
+                                    local->prebuf.ia_ctime_nsec,
+                                    stbuf->ia_ctime, stbuf->ia_ctime_nsec)) {
+                    /* Choose source */
+                    local->prebuf.ia_gid = stbuf->ia_gid;
+                    local->prebuf.ia_uid = stbuf->ia_uid;
+
+                    local->prebuf.ia_ctime = stbuf->ia_ctime;
+                    local->prebuf.ia_ctime_nsec = stbuf->ia_ctime_nsec;
+
+                    if (__is_root_gfid(stbuf->ia_gfid))
+                        local->prebuf.ia_prot = stbuf->ia_prot;
+                }
+            }
+
+            if (local->stbuf.ia_type != IA_INVAL) {
+                if ((local->stbuf.ia_gid != stbuf->ia_gid) ||
+                    (local->stbuf.ia_uid != stbuf->ia_uid) ||
+                    is_permission_different(&local->stbuf.ia_prot,
+                                            &stbuf->ia_prot)) {
+                    local->need_attrheal = 1;
+                }
+            }
+
+            if (!dict_get(xattr, conf->mds_xattr_key)) {
+                gf_msg_debug(this->name, 0,
+                             "%s: internal xattr %s is not present"
+                             " on subvol %s(gfid is %s)",
+                             local->loc.path, conf->mds_xattr_key, prev->name,
+                             gfid);
+            } else {
+                check_mds = dht_dict_get_array(xattr, conf->mds_xattr_key,
+                                               mds_xattr_val, 1, &errst);
+                local->mds_subvol = prev;
+                local->mds_stbuf.ia_gid = stbuf->ia_gid;
+                local->mds_stbuf.ia_uid = stbuf->ia_uid;
+                local->mds_stbuf.ia_prot = stbuf->ia_prot;
+
+                /* save mds subvol on inode ctx */
+                ret = dht_inode_ctx_mdsvol_set(local->inode, this, prev);
+                if (ret) {
+                    gf_msg(this->name, GF_LOG_ERROR, 0,
+                           DHT_MSG_SET_INODE_CTX_FAILED,
+                           "Failed to set MDS subvol for %s vol is %s",
+                           local->loc.path, prev->name);
+                }
+                if ((check_mds < 0) && !errst) {
+                    /* Check if xattrs need to be healed on the directory
+                     */
+                    local->mds_xattr = dict_ref(xattr);
+                    gf_msg_debug(this->name, 0,
+                                 "Value of %s is not zero on "
+                                 "hashed subvol so xattr needs to"
+                                 " be healed on non hashed"
+                                 " path is %s and vol name is %s "
+                                 " gfid is %s",
+                                 conf->mds_xattr_key, local->loc.path,
+                                 prev->name, gfid);
+                    local->need_xattr_heal = 1;
+                }
+            }
+            ret = dht_layout_dir_mismatch(this, layout, prev, &local->loc,
+                                          xattr);
+            if (ret != 0) {
+                /* In memory layout does not match on-disk layout.
+                 */
+                gf_msg(this->name, GF_LOG_INFO, 0, DHT_MSG_LAYOUT_MISMATCH,
+                       "Mismatching layouts for %s, gfid = %s", local->loc.path,
+                       gfid);
+
+                local->layout_mismatch = 1;
+
+                goto unlock;
+            }
+        }
+
+        gf_uuid_copy(local->stbuf.ia_gfid, stbuf->ia_gfid);
+        dht_iatt_merge(this, &local->stbuf, stbuf);
+        dht_iatt_merge(this, &local->postparent, postparent);
+
+        local->op_ret = 0;
+
+        if (!local->xattr) {
+            local->xattr = dict_ref(xattr);
+        } else if (is_dir) {
+            dht_aggregate_xattr(local->xattr, xattr);
+        }
+    }
+unlock:
+    UNLOCK(&frame->lock);
+
+    if (follow_link) {
+        /* Found a linkto file. Follow it to see if the target file exists
+         */
+        gf_uuid_copy(local->gfid, stbuf->ia_gfid);
+
+        subvol = dht_linkfile_subvol(this, inode, stbuf, xattr);
+        if (!subvol) {
+            op_errno = ESTALE;
+            local->op_ret = -1;
+        } else {
+            STACK_WIND_COOKIE(frame, dht_lookup_linkfile_cbk, subvol, subvol,
+                              subvol->fops->lookup, &local->loc,
+                              local->xattr_req);
+            return 0;
+        }
+    }
+
+    this_call_cnt = dht_frame_return(frame);
+
+    if (is_last_call(this_call_cnt)) {
+        if (!IA_ISDIR(local->stbuf.ia_type) &&
+            (local->hashed_subvol != local->cached_subvol) &&
+            (local->stbuf.ia_nlink == 1) &&
+            (conf && conf->unhashed_sticky_bit)) {
+            local->stbuf.ia_prot.sticky = 1;
+        }
+        /* No need to call heal code if volume count is 1
+         */
+        if (conf->subvolume_cnt == 1)
+            local->need_xattr_heal = 0;
+
+        if (IA_ISDIR(local->stbuf.ia_type)) {
+            /* No mds xattr found. Trigger a heal to set it */
+            if (!__is_root_gfid(local->loc.inode->gfid) &&
+                (!dict_get(local->xattr, conf->mds_xattr_key)))
+                local->need_selfheal = 1;
+
+            if (dht_needs_selfheal(frame, this)) {
+                if (!__is_root_gfid(local->loc.inode->gfid)) {
+                    if (local->mds_subvol) {
+                        local->stbuf.ia_gid = local->mds_stbuf.ia_gid;
+                        local->stbuf.ia_uid = local->mds_stbuf.ia_uid;
+                        local->stbuf.ia_prot = local->mds_stbuf.ia_prot;
+                    }
+                } else {
+                    local->stbuf.ia_gid = local->prebuf.ia_gid;
+                    local->stbuf.ia_uid = local->prebuf.ia_uid;
+                    local->stbuf.ia_prot = local->prebuf.ia_prot;
+                }
 
-		dht_lookup_everywhere (frame, this, loc);
-		return 0;
-	}
+                layout = local->layout;
+                dht_selfheal_directory(frame, dht_lookup_selfheal_cbk,
+                                       &local->loc, layout);
+                return 0;
+            }
+        }
 
-        /* TODO: assert type is non-dir and non-linkfile */
+        if (local->layout_mismatch) {
+            /* Found layout mismatch in the directory, need to
+               fix this in the inode context */
+            dht_layout_unref(this, local->layout);
+            local->layout = NULL;
+            dht_lookup_directory(frame, this, &local->loc);
+            return 0;
+        }
 
-	if (stbuf->st_nlink == 1)
-		stbuf->st_mode |= S_ISVTX;
-        dht_itransform (this, prev->this, stbuf->st_ino, &stbuf->st_ino);
+        if (local->need_lookup_everywhere) {
+            /* As the current layout gave ENOENT error, we would
+               need a new layout */
+            dht_layout_unref(this, local->layout);
+            local->layout = NULL;
+
+            /* We know that current cached subvol is no longer
+               valid, get the new one */
+            local->cached_subvol = NULL;
+            if (local->xattr_req) {
+                if (!gf_uuid_is_null(local->gfid)) {
+                    ret = dict_set_static_bin(local->xattr_req, "gfid-req",
+                                              local->gfid, 16);
+                }
+            }
 
-	layout = dht_layout_for_subvol (this, prev->this);
-	if (!layout) {
-		gf_log (this->name, GF_LOG_ERROR,
-			"no pre-set layout for subvolume %s",
-			prev->this->name);
-		op_ret   = -1;
-		op_errno = EINVAL;
-		goto out;
-	}
+            dht_lookup_everywhere(frame, this, &local->loc);
+            return 0;
+        }
+        if (local->return_estale) {
+            local->op_ret = -1;
+            local->op_errno = ESTALE;
+        }
 
-	inode_ctx_put (inode, this, (uint64_t)(long)layout);
+        if (local->loc.parent) {
+            dht_inode_ctx_time_update(local->loc.parent, this,
+                                      &local->postparent, 1);
+        }
 
+        DHT_STRIP_PHASE1_FLAGS(&local->stbuf);
+        dht_set_fixed_dir_stat(&local->postparent);
+
+        /* local->stbuf is updated only from subvols which have a layout
+         * The reason is to avoid choosing attr heal source from newly
+         * added bricks. In case e.g we have only one subvol and for
+         * some reason layout is not present on it, then local->stbuf
+         * will be EINVAL. This is an indication that the subvols
+         * active in the cluster do not have layouts on disk.
+         * Unwind with ESTALE to trigger a fresh lookup */
+        if (is_dir && local->stbuf.ia_type == IA_INVAL) {
+            local->op_ret = -1;
+            local->op_errno = ESTALE;
+        }
+        /* Delete mds xattr at the time of STACK UNWIND */
+        if (local->xattr)
+            GF_REMOVE_INTERNAL_XATTR(conf->mds_xattr_key, local->xattr);
+
+        DHT_STACK_UNWIND(lookup, frame, local->op_ret, local->op_errno,
+                         local->inode, &local->stbuf, local->xattr,
+                         &local->postparent);
+    }
+
+err:
+    return ret;
+}
+
+static int
+dht_lookup_linkfile_create_cbk(call_frame_t *frame, void *cooie, xlator_t *this,
+                               int32_t op_ret, int32_t op_errno, inode_t *inode,
+                               struct iatt *stbuf, struct iatt *preparent,
+                               struct iatt *postparent, dict_t *xdata)
+{
+    dht_local_t *local = NULL;
+    xlator_t *cached_subvol = NULL;
+    dht_conf_t *conf = NULL;
+    int ret = -1;
+    char gfid[GF_UUID_BUF_SIZE] = {0};
+
+    GF_VALIDATE_OR_GOTO("dht", frame, out);
+    GF_VALIDATE_OR_GOTO("dht", this, out);
+    GF_VALIDATE_OR_GOTO("dht", frame->local, out);
+    GF_VALIDATE_OR_GOTO("dht", this->private, out);
+
+    local = frame->local;
+    cached_subvol = local->cached_subvol;
+    conf = this->private;
+
+    gf_uuid_unparse(local->loc.gfid, gfid);
+
+    if (local->locked)
+        dht_unlock_namespace(frame, &local->lock[0]);
+
+    ret = dht_layout_preset(this, local->cached_subvol, local->loc.inode);
+    if (ret < 0) {
+        gf_msg_debug(this->name, EINVAL,
+                     "Failed to set layout for subvolume %s, "
+                     "(gfid = %s)",
+                     cached_subvol ? cached_subvol->name : "<nil>", gfid);
+        local->op_ret = -1;
+        local->op_errno = EINVAL;
+        goto unwind;
+    }
+
+    local->op_ret = 0;
+    if ((local->stbuf.ia_nlink == 1) && (conf && conf->unhashed_sticky_bit)) {
+        local->stbuf.ia_prot.sticky = 1;
+    }
+
+    if (local->loc.parent) {
+        dht_inode_ctx_time_update(local->loc.parent, this, postparent, 1);
+    }
+
+unwind:
+    gf_msg_debug(this->name, 0,
+                 "creation of linkto on hashed subvol:%s, "
+                 "returned with op_ret %d and op_errno %d: %s",
+                 local->hashed_subvol->name, op_ret, op_errno,
+                 uuid_utoa(local->loc.gfid));
+
+    if (local->linked == _gf_true)
+        dht_linkfile_attr_heal(frame, this);
+
+    dht_set_fixed_dir_stat(&local->postparent);
+
+    DHT_STRIP_PHASE1_FLAGS(&local->stbuf);
+    DHT_STACK_UNWIND(lookup, frame, local->op_ret, local->op_errno,
+                     local->inode, &local->stbuf, local->xattr,
+                     &local->postparent);
 out:
-        DHT_STACK_UNWIND (frame, op_ret, op_errno, inode, stbuf, xattr);
+    return ret;
+}
 
-        return 0;
+static int
+dht_lookup_unlink_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+                      int op_ret, int op_errno, struct iatt *preparent,
+                      struct iatt *postparent, dict_t *xdata)
+{
+    int this_call_cnt = 0;
+    dht_local_t *local = NULL;
+    const char *path = NULL;
+
+    local = (dht_local_t *)frame->local;
+    path = local->loc.path;
+    FRAME_SU_UNDO(frame, dht_local_t);
+
+    gf_msg(this->name, GF_LOG_INFO, 0, DHT_MSG_UNLINK_LOOKUP_INFO,
+           "lookup_unlink returned with "
+           "op_ret -> %d and op-errno -> %d for %s",
+           op_ret, op_errno, ((path == NULL) ? "null" : path));
+
+    this_call_cnt = dht_frame_return(frame);
+    if (is_last_call(this_call_cnt)) {
+        dht_lookup_everywhere_done(frame, this);
+    }
+
+    return 0;
 }
 
+static int
+dht_lookup_unlink_of_false_linkto_cbk(call_frame_t *frame, void *cookie,
+                                      xlator_t *this, int op_ret, int op_errno,
+                                      struct iatt *preparent,
+                                      struct iatt *postparent, dict_t *xdata)
+{
+    int this_call_cnt = 0;
+    dht_local_t *local = NULL;
+    const char *path = NULL;
+
+    local = (dht_local_t *)frame->local;
+    path = local->loc.path;
 
-int
-dht_lookup_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
-                int op_ret, int op_errno,
-                inode_t *inode, struct stat *stbuf, dict_t *xattr)
+    FRAME_SU_UNDO(frame, dht_local_t);
+
+    gf_msg(this->name, GF_LOG_INFO, 0, DHT_MSG_UNLINK_LOOKUP_INFO,
+           "lookup_unlink returned with "
+           "op_ret -> %d and op-errno -> %d for %s",
+           op_ret, op_errno, ((path == NULL) ? "null" : path));
+
+    this_call_cnt = dht_frame_return(frame);
+    if (is_last_call(this_call_cnt)) {
+        if ((op_ret == 0) || ((op_errno != EBUSY) && (op_errno != ENOTCONN))) {
+            dht_lookup_everywhere_done(frame, this);
+        } else {
+            /* When dht_lookup_everywhere was performed, one cached
+             * and one hashed file were found, and the hashed file does
+             * not point to the above-mentioned cached node. So it
+             * was considered stale and an unlink was performed.
+             * But the unlink failed, so a rebalance may be in progress.
+             * Now we effectively have two data files: one obtained during
+             * lookup_everywhere and one where the unlink failed. So
+             * at this point in time we cannot decide which one to
+             * choose, because there is a chance that the first cached
+             * file is truncated after rebalance, and if it is chosen
+             * as the cached node the application will fail. So return EIO. */
+
+            if (op_errno == EBUSY) {
+                gf_msg(this->name, GF_LOG_ERROR, op_errno,
+                       DHT_MSG_UNLINK_FAILED,
+                       "Could not unlink the linkto file as "
+                       "either fd is open and/or linkto xattr "
+                       "is set for %s",
+                       ((path == NULL) ? "null" : path));
+            }
+            DHT_STACK_UNWIND(lookup, frame, -1, EIO, NULL, NULL, NULL, NULL);
+        }
+    }
+
+    return 0;
+}
+
+static int
+dht_lookup_unlink_stale_linkto_cbk(call_frame_t *frame, void *cookie,
+                                   xlator_t *this, int op_ret, int op_errno,
+                                   struct iatt *preparent,
+                                   struct iatt *postparent, dict_t *xdata)
 {
-	dht_layout_t *layout      = NULL;
-        char          is_linkfile = 0;
-        char          is_dir      = 0;
-        xlator_t     *subvol      = NULL;
-        dht_conf_t   *conf        = NULL;
-        dht_local_t  *local       = NULL;
-        loc_t        *loc         = NULL;
-        int           i           = 0;
-        call_frame_t *prev        = NULL;
-	int           call_cnt    = 0;
+    dht_local_t *local = NULL;
+    const char *path = NULL;
 
+    /* NOTE:
+     * If the stale file unlink fails because either there is an open fd or
+     * it is not a dht linkto file, posix_unlink returns EBUSY, which is
+     * overwritten with ENOENT here.
+     */
 
-        conf  = this->private;
+    local = frame->local;
 
-        prev  = cookie;
-        local = frame->local;
-        loc   = &local->loc;
-
-	if (ENTRY_MISSING (op_ret, op_errno)) {
-		if (conf->search_unhashed) {
-			local->op_errno = ENOENT;
-			dht_lookup_everywhere (frame, this, loc);
-			return 0;
-		}
-	}
-
- 	if (op_ret == 0) {
- 		is_dir      = check_is_dir (inode, stbuf, xattr);
- 		if (is_dir) {
- 			local->inode = inode_ref (inode);
- 			local->xattr = dict_ref (xattr);
- 		}
- 	}
-
- 	if (is_dir || (op_ret == -1 && op_errno == ENOTCONN)) {
-		call_cnt        = conf->subvolume_cnt;
- 		local->call_cnt = call_cnt;
-		
- 		local->layout = dht_layout_new (this, conf->subvolume_cnt);
- 		if (!local->layout) {
- 			op_ret   = -1;
- 			op_errno = ENOMEM;
- 			gf_log (this->name, GF_LOG_ERROR,
- 				"memory allocation failed :(");
- 			goto out;
- 		}
-		
-		for (i = 0; i < call_cnt; i++) {
-			STACK_WIND (frame, dht_lookup_dir_cbk,
-				    conf->subvolumes[i],
-				    conf->subvolumes[i]->fops->lookup,
-				    &local->loc, local->xattr_req);
-		}
- 		return 0;
- 	}
- 
-        if (op_ret == -1)
-                goto out;
+    if (local) {
+        FRAME_SU_UNDO(frame, dht_local_t);
+        if (local->loc.path)
+            path = local->loc.path;
+    }
+
+    gf_msg(this->name, GF_LOG_INFO, 0, DHT_MSG_UNLINK_LOOKUP_INFO,
+           "Returned with op_ret %d and "
+           "op_errno %d for %s",
+           op_ret, op_errno, ((path == NULL) ? "null" : path));
 
-        is_linkfile = check_is_linkfile (inode, stbuf, xattr);
-        is_dir      = check_is_dir (inode, stbuf, xattr);
+    DHT_STACK_UNWIND(lookup, frame, -1, ENOENT, NULL, NULL, NULL, NULL);
 
-        if (!is_dir && !is_linkfile) {
-                /* non-directory and not a linkfile */
+    return 0;
+}
+
+static int
+dht_fill_dict_to_avoid_unlink_of_migrating_file(dict_t *dict)
+{
+    int ret = 0;
 
-		dht_itransform (this, prev->this, stbuf->st_ino,
-				&stbuf->st_ino);
+    ret = dict_set_int32_sizen(dict, DHT_SKIP_NON_LINKTO_UNLINK, 1);
 
-		layout = dht_layout_for_subvol (this, prev->this);
-		if (!layout) {
-			gf_log (this->name, GF_LOG_ERROR,
-				"no pre-set layout for subvolume %s",
-				prev->this->name);
-			op_ret   = -1;
-			op_errno = EINVAL;
-			goto out;
-		}
+    if (ret)
+        return -1;
 
-                inode_ctx_put (inode, this, (uint64_t)(long)layout);
-                goto out; 
-	}
+    ret = dict_set_int32_sizen(dict, DHT_SKIP_OPEN_FD_UNLINK, 1);
 
-        if (is_linkfile) {
-                subvol = dht_linkfile_subvol (this, inode, stbuf, xattr);
-
-                if (!subvol) {
-                        gf_log (this->name, GF_LOG_WARNING,
-                                "linkfile not having link subvolume. path=%s",
-                                loc->path);
-			dht_lookup_everywhere (frame, this, loc);
-			return 0;
-                }
+    if (ret)
+        return -1;
 
-		STACK_WIND (frame, dht_lookup_linkfile_cbk,
-			    subvol, subvol->fops->lookup,
-			    &local->loc, local->xattr_req);
+    return 0;
+}
+
+static int32_t
+dht_linkfile_create_lookup_cbk(call_frame_t *frame, void *cookie,
+                               xlator_t *this, int32_t op_ret, int32_t op_errno,
+                               inode_t *inode, struct iatt *buf, dict_t *xdata,
+                               struct iatt *postparent)
+{
+    dht_local_t *local = NULL;
+    int call_cnt = 0, ret = 0;
+    xlator_t *subvol = NULL;
+    uuid_t gfid = {
+        0,
+    };
+    char gfid_str[GF_UUID_BUF_SIZE] = {0};
+
+    subvol = cookie;
+    local = frame->local;
+
+    if (subvol == local->hashed_subvol) {
+        if ((op_ret == 0) || (op_errno != ENOENT))
+            local->dont_create_linkto = _gf_true;
+    } else {
+        if (gf_uuid_is_null(local->gfid))
+            gf_uuid_copy(gfid, local->loc.gfid);
+        else
+            gf_uuid_copy(gfid, local->gfid);
+
+        if ((op_ret == 0) && gf_uuid_compare(gfid, buf->ia_gfid)) {
+            gf_uuid_unparse(gfid, gfid_str);
+            gf_msg_debug(this->name, 0,
+                         "gfid (%s) different on cached subvol "
+                         "(%s) and looked up inode (%s), not "
+                         "creating linkto",
+                         uuid_utoa(buf->ia_gfid), subvol->name, gfid_str);
+            local->dont_create_linkto = _gf_true;
+        } else if (op_ret == -1) {
+            local->dont_create_linkto = _gf_true;
+        }
+    }
+
+    call_cnt = dht_frame_return(frame);
+    if (is_last_call(call_cnt)) {
+        if (local->dont_create_linkto)
+            goto no_linkto;
+        else {
+            gf_msg_debug(this->name, 0,
+                         "Creating linkto file on %s(hash) to "
+                         "%s on %s (gfid = %s)",
+                         local->hashed_subvol->name, local->loc.path,
+                         local->cached_subvol->name, gfid_str);
+
+            ret = dht_linkfile_create(frame, dht_lookup_linkfile_create_cbk,
+                                      this, local->cached_subvol,
+                                      local->hashed_subvol, &local->loc);
+
+            if (ret < 0)
+                goto no_linkto;
         }
+    }
+
+    return 0;
 
+no_linkto:
+    gf_msg_debug(this->name, 0,
+                 "skipped linkto creation (path:%s) (gfid:%s) "
+                 "(hashed-subvol:%s) (cached-subvol:%s)",
+                 local->loc.path, gfid_str, local->hashed_subvol->name,
+                 local->cached_subvol->name);
+
+    dht_lookup_linkfile_create_cbk(frame, NULL, this, 0, 0, local->loc.inode,
+                                   &local->stbuf, &local->preparent,
+                                   &local->postparent, local->xattr);
+    return 0;
+}
+
+static int32_t
+dht_call_lookup_linkfile_create(call_frame_t *frame, void *cookie,
+                                xlator_t *this, int32_t op_ret,
+                                int32_t op_errno, dict_t *xdata)
+{
+    dht_local_t *local = NULL;
+    char gfid[GF_UUID_BUF_SIZE] = {0};
+    int i = 0;
+    xlator_t *subvol = NULL;
+
+    local = frame->local;
+    if (gf_uuid_is_null(local->gfid))
+        gf_uuid_unparse(local->loc.gfid, gfid);
+    else
+        gf_uuid_unparse(local->gfid, gfid);
+
+    if (op_ret < 0) {
+        gf_log(this->name, GF_LOG_WARNING,
+               "protecting namespace failed, skipping linkto "
+               "creation (path:%s)(gfid:%s)(hashed-subvol:%s)"
+               "(cached-subvol:%s)",
+               local->loc.path, gfid, local->hashed_subvol->name,
+               local->cached_subvol->name);
+        goto err;
+    }
+
+    local->locked = _gf_true;
+
+    local->call_cnt = 2;
+
+    for (i = 0; i < 2; i++) {
+        subvol = (subvol == NULL) ? local->hashed_subvol : local->cached_subvol;
+
+        STACK_WIND_COOKIE(frame, dht_linkfile_create_lookup_cbk, subvol, subvol,
+                          subvol->fops->lookup, &local->loc, NULL);
+    }
+
+    return 0;
+
+err:
+    dht_lookup_linkfile_create_cbk(frame, NULL, this, 0, 0, local->loc.inode,
+                                   &local->stbuf, &local->preparent,
+                                   &local->postparent, local->xattr);
+    return 0;
+}
+
+/* Rebalance is performed from cached_node to hashed_node. Initially the
+ * cached_node contains a non-linkto file. After migration it is converted to
+ * a linkto file and then unlinked. On the hashed subvolume, a linkto file is
+ * present first; after migration it is converted to a non-linkto file.
+ *
+ * Let's assume a file is present on the cached subvolume and a new brick is
+ * added, and the new brick is the new hashed subvolume. A fresh lookup on the
+ * newly added hashed subvolume will fail and dht_lookup_everywhere gets
+ * called. If a rebalance is in progress just before that request is sent:
+ *
+ * from cached subvolume it may see: Nonlinkto or linkto or No file
+ * from hashed subvolume it may see: No file or linkto file or non-linkto file
+ *
+ * So this boils down to 9 cases:
+ *   at cached_subvol            at hashed_subvol
+ *   ----------------           -----------------
+ *
+ *a)   No file                     No file
+ *    [request reached after    [Request reached before
+ *       migration]                Migration]
+ *
+ *b)   No file                     Linkto File
+ *
+ *c)   No file                     Non-Linkto File
+ *
+ *d)   Linkto                      No-File
+ *
+ *e)   Linkto                      Linkto
+ *
+ *f)   Linkto                      Non-Linkto
+ *
+ *g)   NonLinkto                   No-File
+ *
+ *h)   NonLinkto                   Linkto
+ *
+ *i)   NonLinkto                   NonLinkto
+ *
+ * dht_lookup_everywhere_done decides based on which of the above cases holds
+ */
+
+static int
+dht_lookup_everywhere_done(call_frame_t *frame, xlator_t *this)
+{
+    int ret = 0;
+    dht_local_t *local = NULL;
+    xlator_t *hashed_subvol = NULL;
+    xlator_t *cached_subvol = NULL;
+    dht_layout_t *layout = NULL;
+    char gfid[GF_UUID_BUF_SIZE] = {0};
+    gf_boolean_t found_non_linkto_on_hashed = _gf_false;
+
+    local = frame->local;
+    hashed_subvol = local->hashed_subvol;
+    cached_subvol = local->cached_subvol;
+
+    gf_uuid_unparse(local->loc.gfid, gfid);
+
+    if (local->file_count && local->dir_count) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, DHT_MSG_FILE_TYPE_MISMATCH,
+               "path %s (gfid = %s)exists as a file on one "
+               "subvolume and directory on another. "
+               "Please fix it manually",
+               local->loc.path, gfid);
+        DHT_STACK_UNWIND(lookup, frame, -1, EIO, NULL, NULL, NULL, NULL);
+        return 0;
+    }
+    if (local->op_ret && local->gfid_missing) {
+        if (gf_uuid_is_null(local->gfid_req)) {
+            DHT_STACK_UNWIND(lookup, frame, -1, ENODATA, NULL, NULL, NULL,
+                             NULL);
+            return 0;
+        }
+        /* A hack */
+        dht_lookup_directory(frame, this, &local->loc);
         return 0;
+    }
 
-out:
-        DHT_STACK_UNWIND (frame, op_ret, op_errno, inode, stbuf, xattr);
+    if (local->dir_count) {
+        dht_lookup_directory(frame, this, &local->loc);
         return 0;
-}
+    }
+
+    gf_msg_debug(this->name, 0,
+                 "STATUS: hashed_subvol %s "
+                 "cached_subvol %s",
+                 (hashed_subvol == NULL) ? "null" : hashed_subvol->name,
+                 (cached_subvol == NULL) ? "null" : cached_subvol->name);
+
+    if (!cached_subvol) {
+        if (local->skip_unlink.handle_valid_link && hashed_subvol) {
+            /* Purpose of "DHT_SKIP_NON_LINKTO_UNLINK":
+             * Suppose this lookup is performed by rebalance, and this
+             * rebalance process detected the hashed file, but by
+             * the time it sent the lookup request to the cached node
+             * the file had been migrated, so the final migrated file
+             * is now present on the initial hashed_node. Without a safeguard,
+             * because this process fails to find the cached_node,
+             * it would unlink the file on that hashed_node.
+             *
+             * We avoid this by setting the key, so that posix_unlink
+             * unlinks the file only if it is a linkto file and not
+             * a migrated file.
+             */
+
+            ret = dht_fill_dict_to_avoid_unlink_of_migrating_file(
+                local->xattr_req);
+
+            if (ret) {
+                /* If for some reason setting the key in the dict
+                 * fails, return ENOENT, since with respect to
+                 * this process only a stale link file was
+                 * detected.
+                 *
+                 * The next lookup will delete it.
+                 *
+                 * Deleting the stale link file when setting
+                 * the key in the dict has failed may cause data
+                 * loss because of the above-mentioned race.
+                 */
+
+                DHT_STACK_UNWIND(lookup, frame, -1, ENOENT, NULL, NULL, NULL,
+                                 NULL);
+            } else {
+                local->skip_unlink.handle_valid_link = _gf_false;
+
+                gf_msg_debug(this->name, 0,
+                             "No Cached was found and "
+                             "unlink on hashed was skipped"
+                             " so performing now: %s",
+                             local->loc.path);
+                FRAME_SU_DO(frame, dht_local_t);
+                STACK_WIND(frame, dht_lookup_unlink_stale_linkto_cbk,
+                           hashed_subvol, hashed_subvol->fops->unlink,
+                           &local->loc, 0, local->xattr_req);
+            }
 
+        } else {
+            gf_msg_debug(this->name, 0,
+                         "There was no cached file and  "
+                         "unlink on hashed is not skipped %s",
+                         local->loc.path);
 
-int
-dht_lookup (call_frame_t *frame, xlator_t *this,
-            loc_t *loc, dict_t *xattr_req)
-{
-        xlator_t     *subvol = NULL;
-        xlator_t     *hashed_subvol = NULL;
-        xlator_t     *cached_subvol = NULL;
-        dht_local_t  *local  = NULL;
-	dht_conf_t   *conf = NULL;
-        int           ret    = -1;
-        int           op_errno = -1;
-	dht_layout_t *layout = NULL;
-	int           i = 0;
-	int           call_cnt = 0;
-
-
-        VALIDATE_OR_GOTO (frame, err);
-        VALIDATE_OR_GOTO (this, err);
-        VALIDATE_OR_GOTO (loc, err);
-        VALIDATE_OR_GOTO (loc->inode, err);
-        VALIDATE_OR_GOTO (loc->path, err);
-
-	conf = this->private;
-
-        local = dht_local_init (frame);
-	if (!local) {
-		op_errno = ENOMEM;
-		gf_log (this->name, GF_LOG_ERROR,
-			"memory allocation failed :(");
-		goto err;
-	}
-
-        ret = loc_dup (loc, &local->loc);
-        if (ret == -1) {
-                op_errno = errno;
-                gf_log (this->name, GF_LOG_ERROR,
-                        "copying location failed for path=%s",
-                        loc->path);
-                goto err;
+            DHT_STACK_UNWIND(lookup, frame, -1, ENOENT, NULL, NULL, NULL, NULL);
+        }
+        return 0;
+    }
+
+    /* At the time of dht_lookup, no file was found on the hashed subvolume,
+     * which is why dht_lookup_everywhere was called. But by the time
+     * dht_lookup_everywhere reached the server, the file might already
+     * have been migrated. In that case we will find a migrated file at
+     * the hashed_node; store the layout in context and return
+     * successfully.
+     */
+
+    if (hashed_subvol || local->need_lookup_everywhere) {
+        if (local->need_lookup_everywhere) {
+            found_non_linkto_on_hashed = _gf_true;
+
+        } else if ((local->file_count == 1) &&
+                   (hashed_subvol == cached_subvol)) {
+            gf_msg_debug(this->name, 0,
+                         "found cached file on hashed subvolume "
+                         "so store in context and return for %s",
+                         local->loc.path);
+
+            found_non_linkto_on_hashed = _gf_true;
         }
-	
-	if (xattr_req) {
-		local->xattr_req = dict_ref (xattr_req);
-	} else {
-		local->xattr_req = dict_new ();
-	}
 
-	hashed_subvol = dht_subvol_get_hashed (this, loc);
-	cached_subvol = dht_subvol_get_cached (this, loc->inode);
+        if (found_non_linkto_on_hashed)
+            goto preset_layout;
+    }
 
-	local->cached_subvol = cached_subvol;
-	local->hashed_subvol = hashed_subvol;
+    if (hashed_subvol) {
+        if (local->skip_unlink.handle_valid_link == _gf_true) {
+            if (cached_subvol == local->skip_unlink.hash_links_to) {
+                if (gf_uuid_compare(local->skip_unlink.cached_gfid,
+                                    local->skip_unlink.hashed_gfid)) {
+                    /*GFID different, return error*/
+                    DHT_STACK_UNWIND(lookup, frame, -1, ESTALE, NULL, NULL,
+                                     NULL, NULL);
 
-        if (is_revalidate (loc)) {
-		layout = dht_layout_get (this, loc->inode);
+                    return 0;
+                }
 
-                if (!layout) {
-                        gf_log (this->name, GF_LOG_ERROR,
-                                "revalidate without cache. path=%s",
-                                loc->path);
-                        op_errno = EINVAL;
-                        goto err;
+                ret = dht_layout_preset(this, cached_subvol, local->loc.inode);
+                if (ret) {
+                    gf_msg(this->name, GF_LOG_INFO, 0,
+                           DHT_MSG_LAYOUT_PRESET_FAILED,
+                           "Could not set pre-set layout "
+                           "for subvolume %s",
+                           cached_subvol->name);
                 }
 
-		if (layout->gen && (layout->gen < conf->gen)) {
-			gf_log (this->name, GF_LOG_WARNING,
-				"incomplete layout failure for path=%s",
-				loc->path);
-			op_errno = EAGAIN;
-			goto err;
-		}
-
-		local->inode    = inode_ref (loc->inode);
-		local->st_ino   = loc->inode->ino;
-		
-		local->call_cnt = layout->cnt;
-		call_cnt = local->call_cnt;
-		
-		/* NOTE: we don't require 'trusted.glusterfs.dht.linkto' attribute,
-		 *       revalidates directly go to the cached-subvolume.
-		 */
-		ret = dict_set_uint32 (local->xattr_req, 
-				       "trusted.glusterfs.dht", 4 * 4);
-
-		for (i = 0; i < layout->cnt; i++) {
-			subvol = layout->list[i].xlator;
-			
-			STACK_WIND (frame, dht_revalidate_cbk,
-				    subvol, subvol->fops->lookup,
-				    loc, local->xattr_req);
-
-			if (!--call_cnt)
-				break;
-		}
-        } else {
-		/* TODO: remove the hard-coding */
-		ret = dict_set_uint32 (local->xattr_req, 
-				       "trusted.glusterfs.dht", 4 * 4);
-
-		ret = dict_set_uint32 (local->xattr_req, 
-				       "trusted.glusterfs.dht.linkto", 256);
-
-                if (!hashed_subvol) {
-			gf_log (this->name, GF_LOG_ERROR,
-				"no subvolume in layout for path=%s, "
-				"checking on all the subvols to see if "
-				"it is a directory", loc->path);
- 			call_cnt        = conf->subvolume_cnt;
- 			local->call_cnt = call_cnt;
- 			
- 			local->layout = dht_layout_new (this, conf->subvolume_cnt);
- 			if (!local->layout) {
- 				op_errno = ENOMEM;
- 				gf_log (this->name, GF_LOG_ERROR,
- 					"memory allocation failed :(");
- 				goto err;
- 			}
-
-			for (i = 0; i < call_cnt; i++) {
- 				STACK_WIND (frame, dht_lookup_dir_cbk,
- 					    conf->subvolumes[i],
- 					    conf->subvolumes[i]->fops->lookup,
- 					    &local->loc, local->xattr_req);
- 			}
- 			return 0;
+                local->op_ret = (ret == 0) ? ret : -1;
+                local->op_errno = (ret == 0) ? ret : EINVAL;
+
+                /* Presence of local->cached_subvol validates
+                 * that lookup from cached node is successful
+                 */
+
+                if (!local->op_ret && local->loc.parent) {
+                    dht_inode_ctx_time_update(local->loc.parent, this,
+                                              &local->postparent, 1);
                 }
 
-                STACK_WIND (frame, dht_lookup_cbk,
-                            hashed_subvol, hashed_subvol->fops->lookup,
-                            loc, local->xattr_req);
+                gf_msg_debug(this->name, 0,
+                             "Skipped unlinking linkto file "
+                             "on the hashed subvolume. "
+                             "Returning success as it is a "
+                             "valid linkto file. Path:%s",
+                             local->loc.path);
+
+                goto unwind_hashed_and_cached;
+            } else {
+                local->skip_unlink.handle_valid_link = _gf_false;
+
+                gf_msg_debug(this->name, 0,
+                             "Linkto file found on hashed "
+                             "subvol "
+                             "and data file found on cached "
+                             "subvolume. But linkto points to "
+                             "different cached subvolume (%s) "
+                             "path %s",
+                             (local->skip_unlink.hash_links_to
+                                  ? local->skip_unlink.hash_links_to->name
+                                  : " <nil>"),
+                             local->loc.path);
+
+                if (local->skip_unlink.opend_fd_count == 0) {
+                    ret = dht_fill_dict_to_avoid_unlink_of_migrating_file(
+                        local->xattr_req);
+
+                    if (ret) {
+                        DHT_STACK_UNWIND(lookup, frame, -1, EIO, NULL, NULL,
+                                         NULL, NULL);
+                    } else {
+                        local->call_cnt = 1;
+                        FRAME_SU_DO(frame, dht_local_t);
+                        STACK_WIND(frame, dht_lookup_unlink_of_false_linkto_cbk,
+                                   hashed_subvol, hashed_subvol->fops->unlink,
+                                   &local->loc, 0, local->xattr_req);
+                    }
+
+                    return 0;
+                }
+            }
+        }
+    }
+
+preset_layout:
+
+    if (found_non_linkto_on_hashed) {
+        if (local->need_lookup_everywhere) {
+            if (gf_uuid_compare(local->gfid, local->inode->gfid)) {
+                /* GFID different, return error */
+                DHT_STACK_UNWIND(lookup, frame, -1, ENOENT, NULL, NULL, NULL,
+                                 NULL);
+                return 0;
+            }
         }
 
+        local->op_ret = 0;
+        local->op_errno = 0;
+        layout = dht_layout_for_subvol(this, cached_subvol);
+        if (!layout) {
+            gf_msg(this->name, GF_LOG_INFO, 0, DHT_MSG_SUBVOL_INFO,
+                   "%s: no pre-set layout for subvolume %s,"
+                   " gfid = %s",
+                   local->loc.path,
+                   (cached_subvol ? cached_subvol->name : "<nil>"), gfid);
+        }
+
+        ret = dht_layout_set(this, local->inode, layout);
+        if (ret < 0) {
+            gf_msg(this->name, GF_LOG_INFO, 0, DHT_MSG_SUBVOL_INFO,
+                   "%s: failed to set layout for subvol %s, "
+                   "gfid = %s",
+                   local->loc.path,
+                   (cached_subvol ? cached_subvol->name : "<nil>"), gfid);
+        }
+
+        if (local->loc.parent) {
+            dht_inode_ctx_time_update(local->loc.parent, this,
+                                      &local->postparent, 1);
+        }
+
+        DHT_STRIP_PHASE1_FLAGS(&local->stbuf);
+        dht_set_fixed_dir_stat(&local->postparent);
+        DHT_STACK_UNWIND(lookup, frame, local->op_ret, local->op_errno,
+                         local->inode, &local->stbuf, local->xattr,
+                         &local->postparent);
         return 0;
+    }
+
+    if (!hashed_subvol) {
+        gf_msg_debug(this->name, 0,
+                     "Cannot create linkfile for %s on %s: "
+                     "hashed subvolume cannot be found, gfid = %s.",
+                     local->loc.path, cached_subvol->name, gfid);
+
+        local->op_ret = 0;
+        local->op_errno = 0;
+
+        ret = dht_layout_preset(frame->this, cached_subvol, local->inode);
+        if (ret < 0) {
+            gf_msg(this->name, GF_LOG_INFO, 0, DHT_MSG_LAYOUT_PRESET_FAILED,
+                   "Failed to set layout for subvol %s"
+                   ", gfid = %s",
+                   cached_subvol ? cached_subvol->name : "<nil>", gfid);
+            local->op_ret = -1;
+            local->op_errno = EINVAL;
+        }
 
-err:
-	op_errno = (op_errno == -1) ? errno : op_errno;
-        DHT_STACK_UNWIND (frame, -1, op_errno, NULL, NULL, NULL);
+        if (local->loc.parent) {
+            dht_inode_ctx_time_update(local->loc.parent, this,
+                                      &local->postparent, 1);
+        }
+
+        DHT_STRIP_PHASE1_FLAGS(&local->stbuf);
+        dht_set_fixed_dir_stat(&local->postparent);
+        DHT_STACK_UNWIND(lookup, frame, local->op_ret, local->op_errno,
+                         local->inode, &local->stbuf, local->xattr,
+                         &local->postparent);
         return 0;
+    }
+
+    if (frame->root->op != GF_FOP_RENAME) {
+        local->current = &local->lock[0];
+        ret = dht_protect_namespace(frame, &local->loc, hashed_subvol,
+                                    &local->current->ns,
+                                    dht_call_lookup_linkfile_create);
+    } else {
+        gf_msg_debug(this->name, 0,
+                     "Creating linkto file on %s(hash) to %s on %s "
+                     "(gfid = %s)",
+                     hashed_subvol->name, local->loc.path, cached_subvol->name,
+                     gfid);
+
+        ret = dht_linkfile_create(frame, dht_lookup_linkfile_create_cbk, this,
+                                  cached_subvol, hashed_subvol, &local->loc);
+    }
+
+    return ret;
+
+unwind_hashed_and_cached:
+    DHT_STRIP_PHASE1_FLAGS(&local->stbuf);
+    dht_set_fixed_dir_stat(&local->postparent);
+    DHT_STACK_UNWIND(lookup, frame, local->op_ret, local->op_errno,
+                     local->inode, &local->stbuf, local->xattr,
+                     &local->postparent);
+    return 0;
 }
 
+static int
+dht_lookup_everywhere_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+                          int32_t op_ret, int32_t op_errno, inode_t *inode,
+                          struct iatt *buf, dict_t *xattr,
+                          struct iatt *postparent)
+{
+    dht_local_t *local = NULL;
+    int this_call_cnt = 0;
+    xlator_t *prev = NULL;
+    int is_linkfile = 0;
+    int is_dir = 0;
+    loc_t *loc = NULL;
+    xlator_t *link_subvol = NULL;
+    int ret = -1;
+    int32_t fd_count = 0;
+    dht_conf_t *conf = NULL;
+    char gfid[GF_UUID_BUF_SIZE] = {0};
+    dict_t *dict_req = {0};
+
+    GF_VALIDATE_OR_GOTO("dht", frame, out);
+    GF_VALIDATE_OR_GOTO("dht", this, out);
+    GF_VALIDATE_OR_GOTO("dht", frame->local, out);
+    GF_VALIDATE_OR_GOTO("dht", cookie, out);
+    GF_VALIDATE_OR_GOTO("dht", this->private, out);
+
+    local = frame->local;
+    loc = &local->loc;
+    conf = this->private;
+
+    prev = cookie;
+
+    gf_msg_debug(this->name, 0,
+                 "returned with op_ret %d and op_errno %d (%s) "
+                 "from subvol %s",
+                 op_ret, op_errno, loc->path, prev->name);
+
+    LOCK(&frame->lock);
+    {
+        if (op_ret == -1) {
+            if (op_errno != ENOENT)
+                local->op_errno = op_errno;
+            if (op_errno == ENODATA)
+                local->gfid_missing = _gf_true;
+            goto unlock;
+        }
 
-int
-dht_attr_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
-	      int op_ret, int op_errno, struct stat *stbuf)
-{
-	dht_local_t  *local = NULL;
-	int           this_call_cnt = 0;
-	call_frame_t *prev = NULL;
-
-
-	local = frame->local;
-	prev = cookie;
-
-	LOCK (&frame->lock);
-	{
-		if (op_ret == -1) {
-			local->op_errno = op_errno;
-			gf_log (this->name, GF_LOG_ERROR,
-				"subvolume %s returned -1 (%s)",
-				prev->this->name, strerror (op_errno));
-			goto unlock;
-		}
-
-		dht_stat_merge (this, &local->stbuf, stbuf, prev->this);
-		
-		if (local->inode)
-			local->stbuf.st_ino = local->inode->ino;
-		local->op_ret = 0;
-	}
+        if (gf_uuid_is_null(local->gfid))
+            gf_uuid_copy(local->gfid, buf->ia_gfid);
+
+        gf_uuid_unparse(local->gfid, gfid);
+
+        if (gf_uuid_compare(local->gfid, buf->ia_gfid)) {
+            gf_msg(this->name, GF_LOG_WARNING, 0, DHT_MSG_GFID_MISMATCH,
+                   "%s: gfid differs on subvolume %s,"
+                   " gfid local = %s, gfid node = %s",
+                   loc->path, prev->name, gfid, uuid_utoa(buf->ia_gfid));
+        }
+
+        is_linkfile = check_is_linkfile(inode, buf, xattr,
+                                        conf->link_xattr_name);
+
+        if (is_linkfile) {
+            link_subvol = dht_linkfile_subvol(this, inode, buf, xattr);
+            gf_msg_debug(this->name, 0, "found on %s linkfile %s (-> %s)",
+                         prev->name, loc->path,
+                         link_subvol ? link_subvol->name : "''");
+            goto unlock;
+        }
+
+        is_dir = check_is_dir(inode, buf, xattr);
+
+        /* A non-linkfile GFID takes precedence, but don't overwrite the
+         * gfid if we have already found a cached file. */
+        if (!local->cached_subvol)
+            gf_uuid_copy(local->gfid, buf->ia_gfid);
+
+        if (is_dir) {
+            local->dir_count++;
+
+            gf_msg_debug(this->name, 0, "found on %s directory %s", prev->name,
+                         loc->path);
+        } else {
+            local->file_count++;
+
+            gf_msg_debug(this->name, 0, "found cached file on %s for %s",
+                         prev->name, loc->path);
+
+            if (!local->cached_subvol) {
+                /* found one file */
+                dht_iatt_merge(this, &local->stbuf, buf);
+
+                local->xattr = dict_ref(xattr);
+                local->cached_subvol = prev;
+
+                gf_msg_debug(this->name, 0,
+                             "storing cached on %s file"
+                             " %s",
+                             prev->name, loc->path);
+
+                dht_iatt_merge(this, &local->postparent, postparent);
+
+                gf_uuid_copy(local->skip_unlink.cached_gfid, buf->ia_gfid);
+            } else {
+                /* This is where we need 'rename' both entries logic */
+                gf_msg(this->name, GF_LOG_WARNING, 0,
+                       DHT_MSG_FILE_ON_MULT_SUBVOL,
+                       "multiple subvolumes (%s and %s) have "
+                       "file %s (preferably rename the file "
+                       "in the backend,and do a fresh lookup)",
+                       local->cached_subvol->name, prev->name, local->loc.path);
+            }
+        }
+    }
 unlock:
-	UNLOCK (&frame->lock);
+    UNLOCK(&frame->lock);
+
+    if (is_linkfile) {
+        ret = dict_get_int32(xattr, GLUSTERFS_OPEN_FD_COUNT, &fd_count);
+
+        /*  Any linkto file found on the non-hashed subvolume should
+         *  be unlinked (performed in the "else if" block below)
+         *
+         *  But if a linkto file is found on hashed subvolume, it may be
+         *  pointing to valid cached node. So unlinking of linkto
+         *  file on hashed subvolume is skipped and inside
+         *  dht_lookup_everywhere_done, checks are performed. If this
+         *  linkto file is found as stale linkto file, it is deleted
+         *  otherwise unlink is skipped.
+         */
+
+        if (local->hashed_subvol && local->hashed_subvol == prev) {
+            local->skip_unlink.handle_valid_link = _gf_true;
+            local->skip_unlink.opend_fd_count = fd_count;
+            local->skip_unlink.hash_links_to = link_subvol;
+            gf_uuid_copy(local->skip_unlink.hashed_gfid, buf->ia_gfid);
+
+            gf_msg_debug(this->name, 0,
+                         "Found"
+                         " one linkto file on hashed subvol %s "
+                         "for %s: Skipping unlinking till "
+                         "everywhere_done",
+                         prev->name, loc->path);
+
+        } else if (!ret && (fd_count == 0)) {
+            dict_req = dict_new();
+
+            ret = dht_fill_dict_to_avoid_unlink_of_migrating_file(dict_req);
+
+            if (ret) {
+                /* Skip unlinking on dict failure.
+                 * The file was found as a linkto file on a non-hashed
+                 * subvolume. In the current implementation,
+                 * finding a linkto file on a non-hashed subvolume does
+                 * not always imply that it is stale. So the file
+                 * should be deleted only when both the fd is
+                 * closed and the linkto xattr is set. In case of
+                 * dict_set failure, skip unlinking the file.
+                 * NOTE: dht_frame_return should get called for
+                 *       this block.
+                 */
+
+                dict_unref(dict_req);
+
+            } else {
+                gf_msg(this->name, GF_LOG_INFO, 0, DHT_MSG_SUBVOL_INFO,
+                       "attempting deletion of stale linkfile "
+                       "%s on %s (hashed subvol is %s)",
+                       loc->path, prev->name,
+                       (local->hashed_subvol ? local->hashed_subvol->name
+                                             : "<null>"));
+                /* *
+                 * These stale files may be created using root
+                 * user. Hence deletion will work only with
+                 * root.
+                 */
+                FRAME_SU_DO(frame, dht_local_t);
+                STACK_WIND(frame, dht_lookup_unlink_cbk, prev,
+                           prev->fops->unlink, loc, 0, dict_req);
+
+                dict_unref(dict_req);
+
+                return 0;
+            }
+        }
+    }
 
-	this_call_cnt = dht_frame_return (frame);
-	if (is_last_call (this_call_cnt))
-		DHT_STACK_UNWIND (frame, local->op_ret, local->op_errno,
-				  &local->stbuf);
+    this_call_cnt = dht_frame_return(frame);
+    if (is_last_call(this_call_cnt)) {
+        dht_lookup_everywhere_done(frame, this);
+    }
 
-        return 0;
+out:
+    return ret;
 }
 
-
 int
-dht_stat (call_frame_t *frame, xlator_t *this,
-	  loc_t *loc)
-{
-	xlator_t     *subvol = NULL;
-        int           op_errno = -1;
-	dht_local_t  *local = NULL;
-	dht_layout_t *layout = NULL;
-	int           i = 0;
-
-
-        VALIDATE_OR_GOTO (frame, err);
-        VALIDATE_OR_GOTO (this, err);
-        VALIDATE_OR_GOTO (loc, err);
-        VALIDATE_OR_GOTO (loc->inode, err);
-        VALIDATE_OR_GOTO (loc->path, err);
-
-	layout = dht_layout_get (this, loc->inode);
-	if (!layout) {
-		gf_log (this->name, GF_LOG_ERROR,
-			"no layout for path=%s", loc->path);
-		op_errno = EINVAL;
-		goto err;
-	}
-
-	local = dht_local_init (frame);
-	if (!local) {
-		op_errno = ENOMEM;
-		gf_log (this->name, GF_LOG_ERROR,
-			"memory allocation failed :(");
-		goto err;
-	}
-
-	local->inode = inode_ref (loc->inode);
-	local->call_cnt = layout->cnt;
-
-	for (i = 0; i < layout->cnt; i++) {
-		subvol = layout->list[i].xlator;
-
-		STACK_WIND (frame, dht_attr_cbk,
-			    subvol, subvol->fops->stat,
-			    loc);
-	}
-
-	return 0;
+dht_lookup_everywhere(call_frame_t *frame, xlator_t *this, loc_t *loc)
+{
+    dht_conf_t *conf = NULL;
+    dht_local_t *local = NULL;
+    int i = 0;
+    int call_cnt = 0;
+
+    GF_VALIDATE_OR_GOTO("dht", frame, err);
+    GF_VALIDATE_OR_GOTO("dht", this, out);
+    GF_VALIDATE_OR_GOTO("dht", frame->local, out);
+    GF_VALIDATE_OR_GOTO("dht", this->private, out);
+    GF_VALIDATE_OR_GOTO("dht", loc, out);
+
+    conf = this->private;
+    local = frame->local;
+
+    call_cnt = conf->subvolume_cnt;
+    local->call_cnt = call_cnt;
 
+    if (!local->inode)
+        local->inode = inode_ref(loc->inode);
+
+    gf_msg_debug(this->name, 0, "winding lookup call to %d subvols", call_cnt);
+
+    for (i = 0; i < call_cnt; i++) {
+        STACK_WIND_COOKIE(frame, dht_lookup_everywhere_cbk, conf->subvolumes[i],
+                          conf->subvolumes[i],
+                          conf->subvolumes[i]->fops->lookup, loc,
+                          local->xattr_req);
+    }
+
+    return 0;
+out:
+    DHT_STACK_UNWIND(lookup, frame, -1, EINVAL, NULL, NULL, NULL, NULL);
 err:
-	op_errno = (op_errno == -1) ? errno : op_errno;
-	DHT_STACK_UNWIND (frame, -1, op_errno, NULL);
+    return -1;
+}
+
+int
+dht_lookup_linkfile_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+                        int op_ret, int op_errno, inode_t *inode,
+                        struct iatt *stbuf, dict_t *xattr,
+                        struct iatt *postparent)
+{
+    xlator_t *prev = NULL;
+    dht_local_t *local = NULL;
+    xlator_t *subvol = NULL;
+    loc_t *loc = NULL;
+    dht_conf_t *conf = NULL;
+    int ret = 0;
+    char gfid[GF_UUID_BUF_SIZE] = {0};
+
+    GF_VALIDATE_OR_GOTO("dht", frame, out);
+    GF_VALIDATE_OR_GOTO("dht", this, unwind);
+    GF_VALIDATE_OR_GOTO("dht", frame->local, unwind);
+    GF_VALIDATE_OR_GOTO("dht", this->private, unwind);
+    GF_VALIDATE_OR_GOTO("dht", cookie, unwind);
+
+    prev = cookie;
+    subvol = prev;
+    conf = this->private;
+    local = frame->local;
+    loc = &local->loc;
+
+    gf_uuid_unparse(loc->gfid, gfid);
+
+    if (op_ret == -1) {
+        gf_msg(this->name, GF_LOG_INFO, op_errno, DHT_MSG_LINK_FILE_LOOKUP_INFO,
+               "Lookup of %s on %s (following linkfile) failed "
+               ",gfid = %s",
+               local->loc.path, subvol->name, gfid);
+
+        /* If the cached subvol returned ENOTCONN, do not do
+         * lookup_everywhere. We need to make sure the linkfile does not
+         * get removed, which would take away the namespace, and the
+         * subvol is down anyway. */
+
+        local->cached_subvol = NULL;
+        if (op_errno != ENOTCONN)
+            goto err;
+        else
+            goto unwind;
+    }
+
+    if (check_is_dir(inode, stbuf, xattr)) {
+        gf_msg(this->name, GF_LOG_INFO, 0, DHT_MSG_LINK_FILE_LOOKUP_INFO,
+               "Lookup of %s on %s (following linkfile) reached dir,"
+               " gfid = %s",
+               local->loc.path, subvol->name, gfid);
+        goto err;
+    }
+
+    if (check_is_linkfile(inode, stbuf, xattr, conf->link_xattr_name)) {
+        gf_msg(this->name, GF_LOG_INFO, 0, DHT_MSG_LINK_FILE_LOOKUP_INFO,
+               "lookup of %s on %s (following linkfile) reached link,"
+               "gfid = %s",
+               local->loc.path, subvol->name, gfid);
+        goto err;
+    }
+
+    if (gf_uuid_compare(local->gfid, stbuf->ia_gfid)) {
+        gf_msg(this->name, GF_LOG_WARNING, 0, DHT_MSG_GFID_MISMATCH,
+               "%s: gfid different on data file on %s,"
+               " gfid local = %s, gfid node = %s ",
+               local->loc.path, subvol->name, gfid, uuid_utoa(stbuf->ia_gfid));
+        goto err;
+    }
+
+    if ((stbuf->ia_nlink == 1) && (conf && conf->unhashed_sticky_bit)) {
+        stbuf->ia_prot.sticky = 1;
+    }
+
+    ret = dht_layout_preset(this, prev, inode);
+    if (ret < 0) {
+        gf_msg(this->name, GF_LOG_INFO, 0, DHT_MSG_LAYOUT_PRESET_FAILED,
+               "Failed to set layout for subvolume %s,"
+               "gfid = %s",
+               prev->name, gfid);
+        op_ret = -1;
+        op_errno = EINVAL;
+    }
+
+    if (local->loc.parent) {
+        dht_inode_ctx_time_update(local->loc.parent, this, postparent, 1);
+    }
+
+unwind:
+    DHT_STRIP_PHASE1_FLAGS(stbuf);
+    dht_set_fixed_dir_stat(postparent);
+    DHT_STACK_UNWIND(lookup, frame, op_ret, op_errno, inode, stbuf, xattr,
+                     postparent);
+
+    return 0;
 
-	return 0;
+err:
+    dht_lookup_everywhere(frame, this, loc);
+out:
+    return 0;
 }
 
+/* Code to get hashed subvol based on inode and loc: it first checks that
+   loc->parent and loc->path exist, then gets the hashed subvol from loc.
+   NOTE(review): this does not describe the function directly below.
+*/
+
+static gf_boolean_t
+dht_should_lookup_everywhere(xlator_t *this, dht_conf_t *conf, loc_t *loc)
+{
+    dht_layout_t *parent_layout = NULL;
+    int ret = 0;
+    gf_boolean_t lookup_everywhere = _gf_true;
+
+    /* lookup-optimize supersedes lookup-unhashed settings.
+     * If it is set, do not process search_unhashed.
+     * If lookup-optimize is enabled, lookup everywhere if:
+     *  - this is the rebalance daemon.
+     *  - loc->parent is unavailable.
+     *  - parent_layout is unavailable
+     *  - parent_layout->commit_hash != conf->vol_commit_hash
+     */
+
+    if (conf->lookup_optimize) {
+        if (!conf->defrag && loc->parent) {
+            ret = dht_inode_ctx_layout_get(loc->parent, this, &parent_layout);
+            if (!ret && parent_layout &&
+                (parent_layout->commit_hash == conf->vol_commit_hash)) {
+                lookup_everywhere = _gf_false;
+            }
+        }
+        goto out;
+    } else {
+        if (conf->search_unhashed == GF_DHT_LOOKUP_UNHASHED_AUTO) {
+            if (loc->parent) {
+                ret = dht_inode_ctx_layout_get(loc->parent, this,
+                                               &parent_layout);
+                if (ret || !parent_layout ||
+                    (!parent_layout->search_unhashed)) {
+                    lookup_everywhere = _gf_false;
+                }
+            } else {
+                lookup_everywhere = _gf_false;
+            }
+
+            goto out;
+        }
+    }
+out:
+    return lookup_everywhere;
+}
 
 int
-dht_fstat (call_frame_t *frame, xlator_t *this,
-	   fd_t *fd)
-{
-	xlator_t     *subvol = NULL;
-        int           op_errno = -1;
-	dht_local_t  *local = NULL;
-	dht_layout_t *layout = NULL;
-	int           i = 0;
-
-
-        VALIDATE_OR_GOTO (frame, err);
-        VALIDATE_OR_GOTO (this, err);
-        VALIDATE_OR_GOTO (fd, err);
-
-	layout = dht_layout_get (this, fd->inode);
-	if (!layout) {
-		gf_log (this->name, GF_LOG_ERROR,
-			"no layout for fd=%p", fd);
-		op_errno = EINVAL;
-		goto err;
-	}
-
-	local = dht_local_init (frame);
-	if (!local) {
-		op_errno = ENOMEM;
-		gf_log (this->name, GF_LOG_ERROR,
-			"local allocation failed :(");
-		goto err;
-	}
-
-	local->inode    = inode_ref (fd->inode);
-	local->call_cnt = layout->cnt;;
-
-	for (i = 0; i < layout->cnt; i++) {
-		subvol = layout->list[i].xlator;
-		STACK_WIND (frame, dht_attr_cbk,
-			    subvol, subvol->fops->fstat,
-			    fd);
-	}
-
-	return 0;
+dht_lookup_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int op_ret,
+               int op_errno, inode_t *inode, struct iatt *stbuf, dict_t *xattr,
+               struct iatt *postparent)
+{
+    char is_linkfile = 0;
+    char is_dir = 0;
+    xlator_t *subvol = NULL;
+    dht_conf_t *conf = NULL;
+    dht_local_t *local = NULL;
+    loc_t *loc = NULL;
+    xlator_t *prev = NULL;
+    int ret = 0;
+    uint32_t vol_commit_hash = 0;
+
+    GF_VALIDATE_OR_GOTO("dht", frame, err);
+    GF_VALIDATE_OR_GOTO("dht", this, out);
+    GF_VALIDATE_OR_GOTO("dht", frame->local, out);
+    GF_VALIDATE_OR_GOTO("dht", cookie, out);
+    GF_VALIDATE_OR_GOTO("dht", this->private, out);
+
+    conf = this->private;
+
+    prev = cookie;
+    local = frame->local;
+    loc = &local->loc;
+
+    gf_msg_debug(this->name, op_errno,
+                 "%s: fresh_lookup on %s returned with op_ret %d", loc->path,
+                 prev->name, op_ret);
+
+    if (op_ret == -1) {
+        if (ENTRY_MISSING(op_ret, op_errno)) {
+            if (1 == conf->subvolume_cnt) {
+                /* No need to lookup again */
+                goto out;
+            }
 
+            gf_msg_debug(this->name, 0, "Entry %s missing on subvol %s",
+                         loc->path, prev->name);
+
+            if (dht_should_lookup_everywhere(this, conf, loc)) {
+                local->op_errno = ENOENT;
+                dht_lookup_everywhere(frame, this, loc);
+                return 0;
+            }
+
+        } else {
+            /* posix returns ENODATA if the gfid is not set but the client and
+             * server protocol layers do not send the stbuf. We need to
+             * heal this so check if this is a directory on the other subvols.
+             */
+            if ((op_errno == ENOTCONN) || (op_errno == ENODATA)) {
+                dht_lookup_directory(frame, this, &local->loc);
+                return 0;
+            }
+        }
+        gf_msg_debug(this->name, op_errno, "%s: Lookup on subvolume %s failed",
+                     loc->path, prev->name);
+        goto out;
+    }
+
+    /* Lookup succeeded - op_ret = 0 */
+
+    /* This is required for handling stale linkfile deletion,
+     * or any more call which happens from this 'loc'.
+     */
+    if (gf_uuid_is_null(local->gfid)) {
+        /*This is set from the first successful response*/
+        memcpy(local->gfid, stbuf->ia_gfid, 16);
+    }
+
+    if (!conf->vch_forced) {
+        /* Update the commit hash in conf if it is found */
+        ret = dict_get_uint32(xattr, conf->commithash_xattr_name,
+                              &vol_commit_hash);
+        if (ret == 0) {
+            conf->vol_commit_hash = vol_commit_hash;
+        }
+    }
+
+    is_dir = check_is_dir(inode, stbuf, xattr);
+    if (is_dir) {
+        /* A directory is present on all subvols, send the lookup to
+         * all subvols now */
+        local->inode = inode_ref(inode);
+        local->xattr = dict_ref(xattr);
+        dht_lookup_directory(frame, this, &local->loc);
+        return 0;
+    }
+
+    is_linkfile = check_is_linkfile(inode, stbuf, xattr, conf->link_xattr_name);
+
+    if (!is_linkfile) {
+        /* non-directory and not a linkto file. This is a data file
+         * Update the layout to point to the cached subvol
+         */
+
+        ret = dht_layout_preset(this, prev, inode);
+        if (ret < 0) {
+            gf_msg(this->name, GF_LOG_INFO, 0, DHT_MSG_LAYOUT_PRESET_FAILED,
+                   "%s: could not set pre-set layout for subvolume %s",
+                   loc->path, prev->name);
+            op_ret = -1;
+            op_errno = EINVAL;
+            goto out;
+        }
+        goto out;
+    }
+
+    /* This is a linkto file. Get the value of the target subvol from the
+     * linkto xattr and lookup there to see if the file exists
+     */
+    subvol = dht_linkfile_subvol(this, inode, stbuf, xattr);
+    if (!subvol) {
+        gf_msg(this->name, GF_LOG_INFO, 0, DHT_MSG_SUBVOL_INFO,
+               "%s: No link subvol for linkto", loc->path);
+        dht_lookup_everywhere(frame, this, loc);
+        return 0;
+    }
+
+    gf_msg_debug(this->name, 0, "%s: Calling lookup on linkto target %s",
+                 loc->path, subvol->name);
+
+    STACK_WIND_COOKIE(frame, dht_lookup_linkfile_cbk, subvol, subvol,
+                      subvol->fops->lookup, &local->loc, local->xattr_req);
+
+    return 0;
+
+out:
+    /*
+     * FIXME: postparent->ia_size and postparent->st_blocks do not have
+     * correct values. Since postparent corresponds to a directory, these
+     * two members should have values equal to the sum of the corresponding
+     * values from each of the subvolumes. See dht_iatt_merge for reference.
+     */
+
+    if (!op_ret && local && local->loc.parent) {
+        dht_inode_ctx_time_update(local->loc.parent, this, postparent, 1);
+    }
+
+    DHT_STRIP_PHASE1_FLAGS(stbuf);
+    dht_set_fixed_dir_stat(postparent);
+    DHT_STACK_UNWIND(lookup, frame, op_ret, op_errno, inode, stbuf, xattr,
+                     postparent);
 err:
-	op_errno = (op_errno == -1) ? errno : op_errno;
-	DHT_STACK_UNWIND (frame, -1, op_errno, NULL);
+    return 0;
+}
+
+/* For directories, check if acl xattrs have been requested (by the acl
+ * xlator); if not, request them. These xattrs are needed for dht dir
+ * self-heal to perform proper self-healing of dirs.
+ */
+static void
+dht_check_and_set_acl_xattr_req(xlator_t *this, dict_t *xattr_req)
+{
+    int ret = 0;
+
+    GF_ASSERT(xattr_req);
+
+    if (!dict_get(xattr_req, POSIX_ACL_ACCESS_XATTR)) {
+        ret = dict_set_int8(xattr_req, POSIX_ACL_ACCESS_XATTR, 0);
+        if (ret)
+            gf_msg(this->name, GF_LOG_WARNING, -ret, DHT_MSG_DICT_SET_FAILED,
+                   "Failed to set dictionary value:key = %s",
+                   POSIX_ACL_ACCESS_XATTR);
+    }
+
+    if (!dict_get(xattr_req, POSIX_ACL_DEFAULT_XATTR)) {
+        ret = dict_set_int8(xattr_req, POSIX_ACL_DEFAULT_XATTR, 0);
+        if (ret)
+            gf_msg(this->name, GF_LOG_WARNING, -ret, DHT_MSG_DICT_SET_FAILED,
+                   "Failed to set dictionary value:key = %s",
+                   POSIX_ACL_DEFAULT_XATTR);
+    }
+
+    return;
+}
 
-	return 0;
+/* for directories, we need the following info:
+ * the layout : trusted.glusterfs.dht
+ * the mds information : trusted.glusterfs.dht.mds
+ * the acl info: See above
+ */
+static int
+dht_set_dir_xattr_req(xlator_t *this, loc_t *loc, dict_t *xattr_req)
+{
+    int ret = -EINVAL;
+    dht_conf_t *conf = NULL;
+
+    conf = this->private;
+    if (!conf) {
+        goto err;
+    }
+
+    if (!xattr_req) {
+        goto err;
+    }
+
+    /* Xattr to get the layout for a directory
+     */
+    ret = dict_set_uint32(xattr_req, conf->xattr_name, 4 * 4);
+    if (ret) {
+        gf_msg(this->name, GF_LOG_WARNING, ENOMEM, DHT_MSG_DICT_SET_FAILED,
+               "Failed to set dictionary value:key = %s for "
+               "path %s",
+               conf->xattr_name, loc->path);
+        goto err;
+    }
+
+    /*Non-fatal failure */
+    ret = dict_set_uint32(xattr_req, conf->mds_xattr_key, 4);
+    if (ret) {
+        gf_msg(this->name, GF_LOG_WARNING, ENOMEM, DHT_MSG_DICT_SET_FAILED,
+               "Failed to set dictionary value:key = %s for "
+               "path %s",
+               conf->mds_xattr_key, loc->path);
+    }
+
+    dht_check_and_set_acl_xattr_req(this, xattr_req);
+    ret = 0;
+err:
+    return ret;
 }
 
+/* If the hashed subvol is present, send the lookup to only that subvol first.
+ * If no hashed subvol, send a lookup to all subvols and proceed based on the
+ * responses.
+ */
+static int
+dht_do_fresh_lookup(call_frame_t *frame, xlator_t *this, loc_t *loc)
+{
+    int ret = -1;
+    dht_conf_t *conf = NULL;
+    xlator_t *hashed_subvol = NULL;
+    dht_local_t *local = NULL;
+    int op_errno = -1;
+    int call_cnt = 0;
+    int i = 0;
+
+    conf = this->private;
+    if (!conf) {
+        op_errno = EINVAL;
+        goto err;
+    }
+
+    local = frame->local;
+    if (!local) {
+        op_errno = EINVAL;
+        goto err;
+    }
+
+    /* Since we don't know whether this is a file or a directory,
+     * request all xattrs*/
+    ret = dht_set_file_xattr_req(this, loc, local->xattr_req);
+    if (ret) {
+        op_errno = -ret;
+        goto err;
+    }
+
+    ret = dht_set_dir_xattr_req(this, loc, local->xattr_req);
+    if (ret) {
+        op_errno = -ret;
+        goto err;
+    }
+
+    /* Fuse sets a random value in gfid-req. If the gfid is missing
+     * on one or more subvols, posix will set the gfid to this value,
+     * causing GFID mismatches for directories. Remove the value fuse
+     * has sent before sending the lookup.
+     */
+    ret = dict_get_gfuuid(local->xattr_req, "gfid-req", &local->gfid_req);
+    if (ret) {
+        gf_msg_debug(this->name, 0, "%s: No gfid-req available", loc->path);
+    } else {
+        dict_del(local->xattr_req, "gfid-req");
+    }
+    /* This should have been set in dht_lookup */
+    hashed_subvol = local->hashed_subvol;
+
+    if (!hashed_subvol) {
+        gf_msg_debug(this->name, 0,
+                     "%s: no subvolume in layout for path, "
+                     "checking on all the subvols to see if "
+                     "it is a directory",
+                     loc->path);
+
+        call_cnt = conf->subvolume_cnt;
+        local->call_cnt = call_cnt;
+
+        /* Allocate a layout. This will be populated and saved in
+         * the dht inode_ctx on successful lookup
+         */
+        local->layout = dht_layout_new(this, conf->subvolume_cnt);
+        if (!local->layout) {
+            op_errno = ENOMEM;
+            goto err;
+        }
 
-int
-dht_chmod (call_frame_t *frame, xlator_t *this,
-	   loc_t *loc, mode_t mode)
-{
-	dht_layout_t *layout = NULL;
-	dht_local_t  *local  = NULL;
-        int           op_errno = -1;
-	int           i = -1;
-
-
-        VALIDATE_OR_GOTO (frame, err);
-        VALIDATE_OR_GOTO (this, err);
-        VALIDATE_OR_GOTO (loc, err);
-        VALIDATE_OR_GOTO (loc->inode, err);
-        VALIDATE_OR_GOTO (loc->path, err);
-
-	layout = dht_layout_get (this, loc->inode);
-
-	if (!layout) {
-		gf_log (this->name, GF_LOG_ERROR,
-			"no layout for path=%s", loc->path);
-		op_errno = EINVAL;
-		goto err;
-	}
-
-	if (!layout_is_sane (layout)) {
-		gf_log (this->name, GF_LOG_ERROR,
-			"layout is not sane for path=%s", loc->path);
-		op_errno = EINVAL;
-		goto err;
-	}
-
-	local = dht_local_init (frame);
-	if (!local) {
-		op_errno = ENOMEM;
-		gf_log (this->name, GF_LOG_ERROR,
-			"memory allocation failed :(");
-		goto err;
-	}
-
-	local->inode = inode_ref (loc->inode);
-	local->call_cnt = layout->cnt;
-
-	for (i = 0; i < layout->cnt; i++) {
-		STACK_WIND (frame, dht_attr_cbk,
-			    layout->list[i].xlator,
-			    layout->list[i].xlator->fops->chmod,
-			    loc, mode);
-	}
-
-	return 0;
+        gf_msg_debug(this->name, 0,
+                     "%s: Found null hashed subvol. Calling lookup"
+                     " on all nodes.",
+                     loc->path);
 
+        for (i = 0; i < call_cnt; i++) {
+            STACK_WIND_COOKIE(frame, dht_lookup_dir_cbk, conf->subvolumes[i],
+                              conf->subvolumes[i],
+                              conf->subvolumes[i]->fops->lookup, &local->loc,
+                              local->xattr_req);
+        }
+        return 0;
+    }
+
+    /* if the hashed_subvol is non-null, send the lookup there first so
+     * as to see whether we have a file or a directory */
+    gf_msg_debug(this->name, 0, "%s: Calling fresh lookup on %s", loc->path,
+                 hashed_subvol->name);
+
+    STACK_WIND_COOKIE(frame, dht_lookup_cbk, hashed_subvol, hashed_subvol,
+                      hashed_subvol->fops->lookup, loc, local->xattr_req);
+    return 0;
 err:
-	op_errno = (op_errno == -1) ? errno : op_errno;
-	DHT_STACK_UNWIND (frame, -1, op_errno, NULL);
+    op_errno = (op_errno == -1) ? errno : op_errno;
+    DHT_STACK_UNWIND(lookup, frame, -1, op_errno, NULL, NULL, NULL, NULL);
+    return 0;
+}
+
+static int
+dht_do_revalidate(call_frame_t *frame, xlator_t *this, loc_t *loc)
+{
+    xlator_t *subvol = NULL;
+    xlator_t *mds_subvol = NULL;
+    dht_local_t *local = NULL;
+    dht_conf_t *conf = NULL;
+    int ret = -1;
+    int op_errno = -1;
+    dht_layout_t *layout = NULL;
+    int i = 0;
+    int call_cnt = 0;
+    int gen = 0;
+
+    conf = this->private;
+    if (!conf) {
+        op_errno = EINVAL;
+        goto err;
+    }
+
+    local = frame->local;
+    if (!local) {
+        op_errno = EINVAL;
+        goto err;
+    }
+
+    layout = local->layout;
+    if (!layout) {
+        gf_msg_debug(this->name, 0,
+                     "path = %s. No layout found in the inode ctx.", loc->path);
+        op_errno = EINVAL;
+        goto err;
+    }
+
+    /* Generation number has changed. This layout may be stale. */
+    if (layout->gen && (layout->gen < conf->gen)) {
+        gen = layout->gen;
+        dht_layout_unref(this, local->layout);
+        local->layout = NULL;
+        local->cached_subvol = NULL;
+
+        gf_msg_debug(this->name, 0,
+                     "path = %s. In memory layout may be stale."
+                     "(layout->gen (%d) is less than "
+                     "conf->gen (%d)). Calling fresh lookup.",
+                     loc->path, gen, conf->gen);
+
+        dht_do_fresh_lookup(frame, this, loc);
+        return 0;
+    }
+
+    local->inode = inode_ref(loc->inode);
+
+    /* Since we don't know whether this has changed,
+     * request all xattrs*/
+    ret = dht_set_file_xattr_req(this, loc, local->xattr_req);
+    if (ret) {
+        op_errno = -ret;
+        goto err;
+    }
+
+    ret = dht_set_dir_xattr_req(this, loc, local->xattr_req);
+    if (ret) {
+        op_errno = -ret;
+        goto err;
+    }
+
+    if (IA_ISDIR(local->inode->ia_type)) {
+        ret = dht_inode_ctx_mdsvol_get(local->inode, this, &mds_subvol);
+        if (ret || !mds_subvol) {
+            gf_msg_debug(this->name, 0, "path = %s. No mds subvol in inode ctx",
+                         local->loc.path);
+        }
+        local->mds_subvol = mds_subvol;
+        local->call_cnt = conf->subvolume_cnt;
+
+        /* local->call_cnt will change as responses are processed. Always use a
+         * local copy to loop through the STACK_WIND calls
+         */
+
+        call_cnt = local->call_cnt;
 
-	return 0;
+        for (i = 0; i < call_cnt; i++) {
+            STACK_WIND_COOKIE(frame, dht_revalidate_cbk, conf->subvolumes[i],
+                              conf->subvolumes[i],
+                              conf->subvolumes[i]->fops->lookup, loc,
+                              local->xattr_req);
+        }
+        return 0;
+    }
+
+    /* If not a dir, this should be 1 */
+    local->call_cnt = layout->cnt;
+    call_cnt = local->call_cnt;
+
+    for (i = 0; i < call_cnt; i++) {
+        subvol = layout->list[i].xlator;
+
+        gf_msg_debug(this->name, 0,
+                     "path = %s. Calling "
+                     "revalidate lookup on %s",
+                     loc->path, subvol->name);
+
+        STACK_WIND_COOKIE(frame, dht_revalidate_cbk, subvol, subvol,
+                          subvol->fops->lookup, &local->loc, local->xattr_req);
+    }
+    return 0;
+err:
+    op_errno = (op_errno == -1) ? errno : op_errno;
+    DHT_STACK_UNWIND(lookup, frame, -1, op_errno, NULL, NULL, NULL, NULL);
+    return 0;
 }
 
+/* Depending on the input, decide if this is a:
+ * fresh-lookup: loc->name is provided but no dht inode ctx
+ * revalidation: loc->name is provided, dht inode ctx is present
+ * discover: gfid based nameless lookup.
+ */
 
 int
-dht_chown (call_frame_t *frame, xlator_t *this,
-	   loc_t *loc, uid_t uid, gid_t gid)
-{
-	dht_layout_t *layout = NULL;
-	dht_local_t  *local  = NULL;
-        int           op_errno = -1;
-	int           i = -1;
-
-
-        VALIDATE_OR_GOTO (frame, err);
-        VALIDATE_OR_GOTO (this, err);
-        VALIDATE_OR_GOTO (loc, err);
-        VALIDATE_OR_GOTO (loc->inode, err);
-        VALIDATE_OR_GOTO (loc->path, err);
-
-	layout = dht_layout_get (this, loc->inode);
-	if (!layout) {
-		gf_log (this->name, GF_LOG_ERROR,
-			"no layout for path=%s", loc->path);
-		op_errno = EINVAL;
-		goto err;
-	}
-
-	if (!layout_is_sane (layout)) {
-		gf_log (this->name, GF_LOG_ERROR,
-			"layout is not sane for path=%s", loc->path);
-		op_errno = EINVAL;
-		goto err;
-	}
-
-	local = dht_local_init (frame);
-	if (!local) {
-		op_errno = ENOMEM;
-		gf_log (this->name, GF_LOG_ERROR,
-			"memory allocation failed :(");
-		goto err;
-	}
-
-	local->inode = inode_ref (loc->inode);
-	local->call_cnt = layout->cnt;
-
-	for (i = 0; i < layout->cnt; i++) {
-		STACK_WIND (frame, dht_attr_cbk,
-			    layout->list[i].xlator,
-			    layout->list[i].xlator->fops->chown,
-			    loc, uid, gid);
-	}
-
-	return 0;
+dht_lookup(call_frame_t *frame, xlator_t *this, loc_t *loc, dict_t *xattr_req)
+{
+    xlator_t *hashed_subvol = NULL;
+    dht_local_t *local = NULL;
+    dht_conf_t *conf = NULL;
+    int ret = -1;
+    int op_errno = -1;
+    loc_t new_loc = {
+        0,
+    };
+
+    VALIDATE_OR_GOTO(frame, err);
+    VALIDATE_OR_GOTO(this, err);
+    VALIDATE_OR_GOTO(loc, err);
+    VALIDATE_OR_GOTO(loc->inode, err);
+
+    conf = this->private;
+    if (!conf)
+        goto err;
+
+    local = dht_local_init(frame, loc, NULL, GF_FOP_LOOKUP);
+    if (!local) {
+        op_errno = ENOMEM;
+        goto err;
+    }
+
+    ret = dht_filter_loc_subvol_key(this, loc, &new_loc, &hashed_subvol);
+    if (ret) {
+        loc_wipe(&local->loc);
+        ret = loc_dup(&new_loc, &local->loc);
+
+        /* we no longer need 'new_loc' entries */
+        loc_wipe(&new_loc);
+
+        /* check if loc_dup() is successful */
+        if (ret == -1) {
+            op_errno = errno;
+            gf_msg_debug(this->name, errno,
+                         "copying location failed for path=%s", loc->path);
+            goto err;
+        }
+    }
+
+    if (xattr_req) {
+        local->xattr_req = dict_ref(xattr_req);
+    } else {
+        local->xattr_req = dict_new();
+    }
+
+    /* Nameless lookup */
+
+    /* This is usually sent by NFS. Lookups are done based on the gfid and
+     * no name information is available. Without the name, dht cannot calculate
+     * the hash and has to send a lookup to all subvols.
+     */
+    if (gf_uuid_is_null(loc->pargfid) && !gf_uuid_is_null(loc->gfid) &&
+        !__is_root_gfid(loc->inode->gfid)) {
+        local->cached_subvol = NULL;
+        dht_do_discover(frame, this, loc);
+        return 0;
+    }
+
+    if (loc_is_root(loc)) {
+        /* Request the DHT commit hash xattr (trusted.glusterfs.dht.commithash)
+         * set on the brick root.
+         */
+        ret = dict_set_uint32(local->xattr_req, conf->commithash_xattr_name,
+                              sizeof(uint32_t));
+    }
+
+    if (!hashed_subvol)
+        hashed_subvol = dht_subvol_get_hashed(this, loc);
+    local->hashed_subvol = hashed_subvol;
+
+    if (is_revalidate(loc)) {
+        /* The entry has been looked up before and has a dht inode_ctx
+         */
+        dht_do_revalidate(frame, this, loc);
+        return 0;
+    } else {
+        /* Entry has not been looked up before
+         */
+        dht_do_fresh_lookup(frame, this, loc);
+        return 0;
+    }
 
+    return 0;
 err:
-	op_errno = (op_errno == -1) ? errno : op_errno;
-	DHT_STACK_UNWIND (frame, -1, op_errno, NULL);
+    op_errno = (op_errno == -1) ? errno : op_errno;
+    DHT_STACK_UNWIND(lookup, frame, -1, op_errno, NULL, NULL, NULL, NULL);
+    return 0;
+}
+
+static int
+dht_unlink_linkfile_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+                        int op_ret, int op_errno, struct iatt *preparent,
+                        struct iatt *postparent, dict_t *xdata)
+{
+    dht_local_t *local = NULL;
+    xlator_t *prev = NULL;
+
+    local = frame->local;
+    prev = cookie;
+
+    LOCK(&frame->lock);
+    {
+        if ((op_ret == -1) &&
+            !((op_errno == ENOENT) || (op_errno == ENOTCONN))) {
+            local->op_errno = op_errno;
+            UNLOCK(&frame->lock);
+            gf_msg_debug(this->name, op_errno,
+                         "Unlink link: subvolume %s returned -1", prev->name);
+            goto post_unlock;
+        }
 
-	return 0;
+        local->op_ret = 0;
+    }
+    UNLOCK(&frame->lock);
+post_unlock:
+    dht_set_fixed_dir_stat(&local->preparent);
+    dht_set_fixed_dir_stat(&local->postparent);
+    DHT_STACK_UNWIND(unlink, frame, local->op_ret, local->op_errno,
+                     &local->preparent, &local->postparent, xdata);
+
+    return 0;
 }
 
+static int
+dht_unlink_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int op_ret,
+               int op_errno, struct iatt *preparent, struct iatt *postparent,
+               dict_t *xdata)
+{
+    dht_local_t *local = NULL;
+    xlator_t *prev = NULL;
+    xlator_t *hashed_subvol = NULL;
 
-int
-dht_fchmod (call_frame_t *frame, xlator_t *this,
-	    fd_t *fd, mode_t mode)
-{
-	dht_layout_t *layout = NULL;
-	dht_local_t  *local  = NULL;
-        int           op_errno = -1;
-	int           i = -1;
-
-
-        VALIDATE_OR_GOTO (frame, err);
-        VALIDATE_OR_GOTO (this, err);
-        VALIDATE_OR_GOTO (fd, err);
-
-
-	layout = dht_layout_get (this, fd->inode);
-	if (!layout) {
-		gf_log (this->name, GF_LOG_ERROR,
-			"no layout for fd=%p", fd);
-		op_errno = EINVAL;
-		goto err;
-	}
-
-	if (!layout_is_sane (layout)) {
-		gf_log (this->name, GF_LOG_ERROR,
-			"layout is not sane for fd=%p", fd);
-		op_errno = EINVAL;
-		goto err;
-	}
-
-	local = dht_local_init (frame);
-	if (!local) {
-		op_errno = ENOMEM;
-		gf_log (this->name, GF_LOG_ERROR,
-			"memory allocation failed :(");
-		goto err;
-	}
-
-	local->inode = inode_ref (fd->inode);
-	local->call_cnt = layout->cnt;
-
-	for (i = 0; i < layout->cnt; i++) {
-		STACK_WIND (frame, dht_attr_cbk,
-			    layout->list[i].xlator,
-			    layout->list[i].xlator->fops->fchmod,
-			    fd, mode);
-	}
-
-	return 0;
+    local = frame->local;
+    prev = cookie;
 
-err:
-	op_errno = (op_errno == -1) ? errno : op_errno;
-	DHT_STACK_UNWIND (frame, -1, op_errno, NULL);
+    LOCK(&frame->lock);
+    {
+        if (op_ret == -1) {
+            if (op_errno != ENOENT) {
+                local->op_ret = -1;
+                local->op_errno = op_errno;
+            } else {
+                local->op_ret = 0;
+            }
+            UNLOCK(&frame->lock);
+            gf_msg_debug(this->name, op_errno,
+                         "Unlink: subvolume %s returned -1", prev->name);
+            goto post_unlock;
+        }
+
+        local->op_ret = 0;
+
+        local->postparent = *postparent;
+        local->preparent = *preparent;
+
+        if (local->loc.parent) {
+            dht_inode_ctx_time_update(local->loc.parent, this,
+                                      &local->preparent, 0);
+            dht_inode_ctx_time_update(local->loc.parent, this,
+                                      &local->postparent, 1);
+        }
+    }
+    UNLOCK(&frame->lock);
+post_unlock:
+    if (!local->op_ret) {
+        hashed_subvol = dht_subvol_get_hashed(this, &local->loc);
+        if (hashed_subvol && hashed_subvol != local->cached_subvol) {
+            /*
+             * If hashed and cached are different, then we need
+             * to unlink linkfile from hashed subvol if data
+             * file is deleted successfully
+             */
+            STACK_WIND_COOKIE(frame, dht_unlink_linkfile_cbk, hashed_subvol,
+                              hashed_subvol, hashed_subvol->fops->unlink,
+                              &local->loc, local->flags, xdata);
+            return 0;
+        }
+    }
 
-	return 0;
+    dht_set_fixed_dir_stat(&local->preparent);
+    dht_set_fixed_dir_stat(&local->postparent);
+    DHT_STACK_UNWIND(unlink, frame, local->op_ret, local->op_errno,
+                     &local->preparent, &local->postparent, xdata);
+
+    return 0;
 }
 
+static int
+dht_common_setxattr_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+                        int32_t op_ret, int32_t op_errno, dict_t *xdata)
+{
+    DHT_STACK_UNWIND(setxattr, frame, op_ret, op_errno, xdata);
+    return 0;
+}
 
-int
-dht_fchown (call_frame_t *frame, xlator_t *this,
-	    fd_t *fd, uid_t uid, gid_t gid)
-{
-	dht_layout_t *layout = NULL;
-	dht_local_t  *local  = NULL;
-        int           op_errno = -1;
-	int           i = -1;
-
-
-        VALIDATE_OR_GOTO (frame, err);
-        VALIDATE_OR_GOTO (this, err);
-        VALIDATE_OR_GOTO (fd, err);
-
-	layout = dht_layout_get (this, fd->inode);
-	if (!layout) {
-		gf_log (this->name, GF_LOG_ERROR,
-			"no layout for fd=%p", fd);
-		op_errno = EINVAL;
-		goto err;
-	}
-
-	if (!layout_is_sane (layout)) {
-		gf_log (this->name, GF_LOG_ERROR,
-			"layout is not sane for fd=%p", fd);
-		op_errno = EINVAL;
-		goto err;
-	}
-
-	local = dht_local_init (frame);
-	if (!local) {
-		op_errno = ENOMEM;
-		gf_log (this->name, GF_LOG_ERROR,
-			"memory allocation failed :(");
-		goto err;
-	}
-
-	local->inode = inode_ref (fd->inode);
-	local->call_cnt = layout->cnt;
-
-	for (i = 0; i < layout->cnt; i++) {
-		STACK_WIND (frame, dht_attr_cbk,
-			    layout->list[i].xlator,
-			    layout->list[i].xlator->fops->fchown,
-			    fd, uid, gid);
-	}
-
-	return 0;
+static int
+dht_fix_layout_setxattr_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+                            int32_t op_ret, int32_t op_errno, dict_t *xdata)
+{
+    dht_local_t *local = NULL;
+    dht_layout_t *layout = NULL;
 
-err:
-	op_errno = (op_errno == -1) ? errno : op_errno;
-	DHT_STACK_UNWIND (frame, -1, op_errno, NULL);
+    if (op_ret == 0) {
+        /* update the layout in the inode ctx */
+        local = frame->local;
+        layout = local->selfheal.layout;
 
-	return 0;
+        dht_layout_set(this, local->loc.inode, layout);
+    }
+
+    DHT_STACK_UNWIND(setxattr, frame, op_ret, op_errno, xdata);
+    return 0;
 }
 
+static int
+dht_err_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int op_ret,
+            int op_errno, dict_t *xdata)
+{
+    dht_local_t *local = NULL;
+    int this_call_cnt = 0;
+    xlator_t *prev = NULL;
+
+    local = frame->local;
+    prev = cookie;
 
-int
-dht_utimens (call_frame_t *frame, xlator_t *this,
-	     loc_t *loc, struct timespec tv[2])
-{
-	dht_layout_t *layout = NULL;
-	dht_local_t  *local  = NULL;
-        int           op_errno = -1;
-	int           i = -1;
-
-
-        VALIDATE_OR_GOTO (frame, err);
-        VALIDATE_OR_GOTO (this, err);
-        VALIDATE_OR_GOTO (loc, err);
-        VALIDATE_OR_GOTO (loc->inode, err);
-        VALIDATE_OR_GOTO (loc->path, err);
-
-	layout = dht_layout_get (this, loc->inode);
-	if (!layout) {
-		gf_log (this->name, GF_LOG_ERROR,
-			"no layout for path=%s", loc->path);
-		op_errno = EINVAL;
-		goto err;
-	}
-
-	if (!layout_is_sane (layout)) {
-		gf_log (this->name, GF_LOG_ERROR,
-			"layout is not sane for path=%s", loc->path);
-		op_errno = EINVAL;
-		goto err;
-	}
-
-	local = dht_local_init (frame);
-	if (!local) {
-		op_errno = ENOMEM;
-		gf_log (this->name, GF_LOG_ERROR,
-			"memory allocation failed :(");
-		goto err;
-	}
-
-	local->inode = inode_ref (loc->inode);
-	local->call_cnt = layout->cnt;
-
-	for (i = 0; i < layout->cnt; i++) {
-		STACK_WIND (frame, dht_attr_cbk,
-			    layout->list[i].xlator,
-			    layout->list[i].xlator->fops->utimens,
-			    loc, tv);
-	}
-
-	return 0;
+    LOCK(&frame->lock);
+    {
+        if (op_ret == -1) {
+            local->op_errno = op_errno;
+            UNLOCK(&frame->lock);
+            gf_msg_debug(this->name, op_errno, "subvolume %s returned -1",
+                         prev->name);
+            goto post_unlock;
+        }
 
-err:
-	op_errno = (op_errno == -1) ? errno : op_errno;
-	DHT_STACK_UNWIND (frame, -1, op_errno, NULL);
+        local->op_ret = 0;
+    }
+    UNLOCK(&frame->lock);
+post_unlock:
+    this_call_cnt = dht_frame_return(frame);
+    if (is_last_call(this_call_cnt)) {
+        if ((local->fop == GF_FOP_SETXATTR) ||
+            (local->fop == GF_FOP_FSETXATTR)) {
+            DHT_STACK_UNWIND(setxattr, frame, local->op_ret, local->op_errno,
+                             NULL);
+            /* 'local' itself may not be valid after this */
+            goto out;
+        }
+        if ((local->fop == GF_FOP_REMOVEXATTR) ||
+            (local->fop == GF_FOP_FREMOVEXATTR)) {
+            DHT_STACK_UNWIND(removexattr, frame, local->op_ret, local->op_errno,
+                             NULL);
+        }
+    }
 
-	return 0;
+out:
+    return 0;
 }
 
+/* Store value[0..size-1] under key in dict as a binary blob, converting each
+ * element from host to network byte order. Returns 0 on success, -EINVAL for
+ * a NULL value or non-positive size, -ENOMEM if the copy cannot be allocated,
+ * else the dict_set_bin() error (the copy is freed here in that case). */
+int32_t
+dht_dict_set_array(dict_t *dict, char *key, int32_t value[], int32_t size)
+{
+    int ret = -1;
+    int32_t *ptr = NULL;
+    int32_t vindex;
+
+    /* a negative size would wrap to a huge byte count in the sizeof math */
+    if ((value == NULL) || (size <= 0)) {
+        return -EINVAL;
+    }
+
+    ptr = GF_MALLOC(sizeof(int32_t) * size, gf_common_mt_char);
+    if (ptr == NULL)
+        return -ENOMEM;
+    for (vindex = 0; vindex < size; vindex++)
+        ptr[vindex] = hton32(value[vindex]);
+    ret = dict_set_bin(dict, key, ptr, sizeof(int32_t) * size);
+    if (ret)
+        GF_FREE(ptr);
+    return ret;
+}
 
-int
-dht_truncate (call_frame_t *frame, xlator_t *this,
-	      loc_t *loc, off_t offset)
+static int
+dht_common_mds_xattrop_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+                           int32_t op_ret, int32_t op_errno, dict_t *dict,
+                           dict_t *xdata)
 {
-	xlator_t     *subvol = NULL;
-        int           op_errno = -1;
-	dht_local_t  *local = NULL;
+    dht_local_t *local = NULL;
+    call_frame_t *prev = cookie;
 
+    local = frame->local;
 
-        VALIDATE_OR_GOTO (frame, err);
-        VALIDATE_OR_GOTO (this, err);
-        VALIDATE_OR_GOTO (loc, err);
-        VALIDATE_OR_GOTO (loc->inode, err);
-        VALIDATE_OR_GOTO (loc->path, err);
+    if (op_ret)
+        gf_msg_debug(this->name, op_errno, "subvolume %s returned -1",
+                     prev->this->name);
 
-	subvol = dht_subvol_get_cached (this, loc->inode);
-	if (!subvol) {
-		gf_log (this->name, GF_LOG_ERROR,
-			"no cached subvolume for path=%s", loc->path);
-		op_errno = EINVAL;
-		goto err;
-	}
+    if (local->fop == GF_FOP_SETXATTR) {
+        DHT_STACK_UNWIND(setxattr, frame, 0, op_errno, local->xdata);
+        /* 'local' itself may not be valid after this */
+        goto out;
+    }
 
-	local = dht_local_init (frame);
-	if (!local) {
-		op_errno = ENOMEM;
-		gf_log (this->name, GF_LOG_ERROR,
-			"memory allocation failed :(");
-		goto err;
-	}
+    if (local->fop == GF_FOP_FSETXATTR) {
+        DHT_STACK_UNWIND(fsetxattr, frame, 0, op_errno, local->xdata);
+        /* 'local' itself may not be valid after this */
+        goto out;
+    }
 
-	local->inode = inode_ref (loc->inode);
-	local->call_cnt = 1;
+    if (local->fop == GF_FOP_REMOVEXATTR) {
+        DHT_STACK_UNWIND(removexattr, frame, 0, op_errno, NULL);
+        /* 'local' itself may not be valid after this */
+        goto out;
+    }
 
-	STACK_WIND (frame, dht_attr_cbk,
-		    subvol, subvol->fops->truncate,
-		    loc, offset);
+    if (local->fop == GF_FOP_FREMOVEXATTR) {
+        DHT_STACK_UNWIND(fremovexattr, frame, 0, op_errno, NULL);
+    }
 
-	return 0;
+out:
+    return 0;
+}
 
-err:
-	op_errno = (op_errno == -1) ? errno : op_errno;
-	DHT_STACK_UNWIND (frame, -1, op_errno, NULL);
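+/* Callback for the non-MDS setxattr/removexattr winds: after the last reply,
+   wind an xattrop that adds 1 to the internal MDS xattr on the MDS subvol
+   if none of them failed; otherwise unwind without touching that xattr. */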
+static int
+dht_setxattr_non_mds_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+                         int op_ret, int op_errno, dict_t *xdata)
+{
+    dht_local_t *local = NULL;
+    int this_call_cnt = 0;
+    int ret = 0;
+    dict_t *xattrop = NULL;
+    int32_t addone[1] = {1};
+    call_frame_t *prev = NULL;
+    dht_conf_t *conf = NULL;
+
+    local = frame->local;
+    prev = cookie;
+    conf = this->private;
+
+    LOCK(&frame->lock);
+    {
+        if (op_ret && !local->op_ret) {
+            local->op_ret = op_ret;
+            local->op_errno = op_errno;
+            UNLOCK(&frame->lock);
+            gf_msg_debug(this->name, op_errno, "subvolume %s returned -1",
+                         prev->this->name);
+            goto post_unlock;
+        }
+    }
+    UNLOCK(&frame->lock);
+post_unlock:
+    this_call_cnt = dht_frame_return(frame);
+
+    if (is_last_call(this_call_cnt)) {
+        if (!local->op_ret) {
+            xattrop = dict_new();
+            if (!xattrop) {
+                gf_msg(this->name, GF_LOG_ERROR, DHT_MSG_NO_MEMORY, 0,
+                       "dictionary creation failed");
+                ret = -1;
+                goto out;
+            }
+            ret = dht_dict_set_array(xattrop, conf->mds_xattr_key, addone, 1);
+            if (ret != 0) {
+                gf_msg(this->name, GF_LOG_ERROR, 0, DHT_MSG_DICT_SET_FAILED,
+                       "dictionary set array failed ");
+                ret = -1;
+                goto out;
+            }
+            if ((local->fop == GF_FOP_SETXATTR) ||
+                (local->fop == GF_FOP_REMOVEXATTR)) {
+                STACK_WIND(frame, dht_common_mds_xattrop_cbk, local->mds_subvol,
+                           local->mds_subvol->fops->xattrop, &local->loc,
+                           GF_XATTROP_ADD_ARRAY, xattrop, NULL);
+            } else {
+                STACK_WIND(frame, dht_common_mds_xattrop_cbk, local->mds_subvol,
+                           local->mds_subvol->fops->fxattrop, local->fd,
+                           GF_XATTROP_ADD_ARRAY, xattrop, NULL);
+            }
+        } else {
+            if (local->fop == GF_FOP_SETXATTR) {
+                DHT_STACK_UNWIND(setxattr, frame, 0, 0, local->xdata);
+                /* 'local' itself may not be valid after this */
+                goto just_return;
+            }
+
+            if (local->fop == GF_FOP_FSETXATTR) {
+                DHT_STACK_UNWIND(fsetxattr, frame, 0, 0, local->xdata);
+                /* 'local' itself may not be valid after this */
+                goto just_return;
+            }
+
+            if (local->fop == GF_FOP_REMOVEXATTR) {
+                DHT_STACK_UNWIND(removexattr, frame, 0, 0, NULL);
+                /* 'local' itself may not be valid after this */
+                goto just_return;
+            }
+
+            if (local->fop == GF_FOP_FREMOVEXATTR) {
+                DHT_STACK_UNWIND(fremovexattr, frame, 0, 0, NULL);
+                /* 'local' itself may not be valid after this */
+                goto just_return;
+            }
+        }
+    }
+out:
+    if (ret) {
+        if (local->fop == GF_FOP_SETXATTR) {
+            DHT_STACK_UNWIND(setxattr, frame, 0, 0, local->xdata);
+            /* 'local' itself may not be valid after this */
+            goto just_return;
+        }
 
-	return 0;
-}
+        if (local->fop == GF_FOP_FSETXATTR) {
+            DHT_STACK_UNWIND(fsetxattr, frame, 0, 0, local->xdata);
+            /* 'local' itself may not be valid after this */
+            goto just_return;
+        }
+
+        if (local->fop == GF_FOP_REMOVEXATTR) {
+            DHT_STACK_UNWIND(removexattr, frame, 0, 0, NULL);
+            /* 'local' itself may not be valid after this */
+            goto just_return;
+        }
 
+        if (local->fop == GF_FOP_FREMOVEXATTR) {
+            DHT_STACK_UNWIND(fremovexattr, frame, 0, 0, NULL);
+        }
+    }
+just_return:
+    if (xattrop)
+        dict_unref(xattrop);
+    return 0;
+}
 
-int
-dht_ftruncate (call_frame_t *frame, xlator_t *this,
-	       fd_t *fd, off_t offset)
+static int
+dht_setxattr_mds_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+                     int op_ret, int op_errno, dict_t *xdata)
 {
-	xlator_t     *subvol = NULL;
-        int           op_errno = -1;
-	dht_local_t  *local = NULL;
+    dht_local_t *local = NULL;
+    dht_conf_t *conf = NULL;
+    call_frame_t *prev = NULL;
+    xlator_t *mds_subvol = NULL;
+    int i = 0;
+
+    local = frame->local;
+    prev = cookie;
+    conf = this->private;
+    mds_subvol = local->mds_subvol;
+
+    if (op_ret == -1) {
+        local->op_ret = op_ret;
+        local->op_errno = op_errno;
+        gf_msg_debug(this->name, op_errno, "subvolume %s returned -1",
+                     prev->this->name);
+        goto out;
+    }
+
+    local->op_ret = 0;
+    local->call_cnt = conf->subvolume_cnt - 1;
+    local->xdata = dict_ref(xdata);
+
+    for (i = 0; i < conf->subvolume_cnt; i++) {
+        if (mds_subvol && (mds_subvol == conf->subvolumes[i]))
+            continue;
+        if (local->fop == GF_FOP_SETXATTR) {
+            STACK_WIND(frame, dht_setxattr_non_mds_cbk, conf->subvolumes[i],
+                       conf->subvolumes[i]->fops->setxattr, &local->loc,
+                       local->xattr, local->flags, local->xattr_req);
+        }
 
+        if (local->fop == GF_FOP_FSETXATTR) {
+            STACK_WIND(frame, dht_setxattr_non_mds_cbk, conf->subvolumes[i],
+                       conf->subvolumes[i]->fops->fsetxattr, local->fd,
+                       local->xattr, local->flags, local->xattr_req);
+        }
 
-        VALIDATE_OR_GOTO (frame, err);
-        VALIDATE_OR_GOTO (this, err);
-        VALIDATE_OR_GOTO (fd, err);
+        if (local->fop == GF_FOP_REMOVEXATTR) {
+            STACK_WIND(frame, dht_setxattr_non_mds_cbk, conf->subvolumes[i],
+                       conf->subvolumes[i]->fops->removexattr, &local->loc,
+                       local->key, local->xattr_req);
+        }
 
-	subvol = dht_subvol_get_cached (this, fd->inode);
-	if (!subvol) {
-		gf_log (this->name, GF_LOG_ERROR,
-			"no cached subvolume for fd=%p", fd);
-		op_errno = EINVAL;
-		goto err;
-	}
+        if (local->fop == GF_FOP_FREMOVEXATTR) {
+            STACK_WIND(frame, dht_setxattr_non_mds_cbk, conf->subvolumes[i],
+                       conf->subvolumes[i]->fops->fremovexattr, local->fd,
+                       local->key, local->xattr_req);
+        }
+    }
 
-	local = dht_local_init (frame);
-	if (!local) {
-		op_errno = ENOMEM;
-		gf_log (this->name, GF_LOG_ERROR,
-			"memory allocation failed :(");
-		goto err;
-	}
+    return 0;
+out:
+    if (local->fop == GF_FOP_SETXATTR) {
+        DHT_STACK_UNWIND(setxattr, frame, local->op_ret, local->op_errno,
+                         xdata);
+        /* 'local' itself may not be valid after this */
+        goto just_return;
+    }
+
+    if (local->fop == GF_FOP_FSETXATTR) {
+        DHT_STACK_UNWIND(fsetxattr, frame, local->op_ret, local->op_errno,
+                         xdata);
+        /* 'local' itself may not be valid after this */
+        goto just_return;
+    }
+
+    if (local->fop == GF_FOP_REMOVEXATTR) {
+        DHT_STACK_UNWIND(removexattr, frame, local->op_ret, local->op_errno,
+                         NULL);
+        /* 'local' itself may not be valid after this */
+        goto just_return;
+    }
+
+    if (local->fop == GF_FOP_FREMOVEXATTR) {
+        DHT_STACK_UNWIND(fremovexattr, frame, local->op_ret, local->op_errno,
+                         NULL);
+    }
+
+just_return:
+    return 0;
+}
 
-	local->inode = inode_ref (fd->inode);
-	local->call_cnt = 1;
+static int
+dht_xattrop_mds_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+                    int op_ret, int op_errno, dict_t *dict, dict_t *xdata)
+{
+    dht_local_t *local = NULL;
+    call_frame_t *prev = NULL;
+
+    local = frame->local;
+    prev = cookie;
+
+    if (op_ret == -1) {
+        local->op_errno = op_errno;
+        local->op_ret = op_ret;
+        gf_msg_debug(this->name, op_errno, "subvolume %s returned -1",
+                     prev->this->name);
+        goto out;
+    }
+
+    if (local->fop == GF_FOP_SETXATTR) {
+        STACK_WIND(frame, dht_setxattr_mds_cbk, local->mds_subvol,
+                   local->mds_subvol->fops->setxattr, &local->loc, local->xattr,
+                   local->flags, local->xattr_req);
+    }
+
+    if (local->fop == GF_FOP_FSETXATTR) {
+        STACK_WIND(frame, dht_setxattr_mds_cbk, local->mds_subvol,
+                   local->mds_subvol->fops->fsetxattr, local->fd, local->xattr,
+                   local->flags, local->xattr_req);
+    }
+
+    if (local->fop == GF_FOP_REMOVEXATTR) {
+        STACK_WIND(frame, dht_setxattr_mds_cbk, local->mds_subvol,
+                   local->mds_subvol->fops->removexattr, &local->loc,
+                   local->key, local->xattr_req);
+    }
+
+    if (local->fop == GF_FOP_FREMOVEXATTR) {
+        STACK_WIND(frame, dht_setxattr_mds_cbk, local->mds_subvol,
+                   local->mds_subvol->fops->fremovexattr, local->fd, local->key,
+                   local->xattr_req);
+    }
+
+    return 0;
+out:
+    if (local->fop == GF_FOP_SETXATTR) {
+        DHT_STACK_UNWIND(setxattr, frame, local->op_ret, local->op_errno,
+                         xdata);
+        /* 'local' itself may not be valid after this */
+        goto just_return;
+    }
+
+    if (local->fop == GF_FOP_FSETXATTR) {
+        DHT_STACK_UNWIND(fsetxattr, frame, local->op_ret, local->op_errno,
+                         xdata);
+        /* 'local' itself may not be valid after this */
+        goto just_return;
+    }
+
+    if (local->fop == GF_FOP_REMOVEXATTR) {
+        DHT_STACK_UNWIND(removexattr, frame, local->op_ret, local->op_errno,
+                         NULL);
+        /* 'local' itself may not be valid after this */
+        goto just_return;
+    }
+
+    if (local->fop == GF_FOP_FREMOVEXATTR) {
+        DHT_STACK_UNWIND(fremovexattr, frame, local->op_ret, local->op_errno,
+                         NULL);
+    }
+
+just_return:
+    return 0;
+}
 
-	STACK_WIND (frame, dht_attr_cbk,
-		    subvol, subvol->fops->ftruncate,
-		    fd, offset);
+/* Append "(<subvol> <start> <stop>)" for every layout range to buf, space
+ * separated. buf must be NUL-terminated; its capacity is not checked here. */
+static void
+fill_layout_info(dht_layout_t *layout, char *buf)
+{
+    char entry[128] = {0};
+    int idx;
+
+    for (idx = 0; idx < layout->cnt; idx++) {
+        if (idx > 0)
+            strcat(buf, " ");
+        snprintf(entry, sizeof(entry), "(%s %u %u)",
+                 layout->list[idx].xlator->name, layout->list[idx].start,
+                 layout->list[idx].stop);
+        strcat(buf, entry);
+    }
+}
 
-	return 0;
+/* Format the pathinfo reply into xattr_buf (at most alloc_len bytes): the
+ * accumulated local->xattr_val if any, plus layout_buf when flag is set. */
+static void
+dht_fill_pathinfo_xattr(xlator_t *this, dht_local_t *local, char *xattr_buf,
+                        int32_t alloc_len, int flag, char *layout_buf)
+{
+    const char *val = local->xattr_val;
+
+    if (flag && val)
+        snprintf(xattr_buf, alloc_len,
+                 "((<" DHT_PATHINFO_HEADER "%s> %s) (%s-layout %s))",
+                 this->name, val, this->name, layout_buf);
+    else if (flag)
+        snprintf(xattr_buf, alloc_len, "(%s-layout %s)", this->name, layout_buf);
+    else if (val)
+        snprintf(xattr_buf, alloc_len, "(<" DHT_PATHINFO_HEADER "%s> %s)",
+                 this->name, val);
+    else
+        xattr_buf[0] = '\0';
+}
 
-err:
-	op_errno = (op_errno == -1) ? errno : op_errno;
-	DHT_STACK_UNWIND (frame, -1, op_errno, NULL);
+/* Append the string stored under local->xsel in xattr, plus a trailing space,
+ * to local->xattr_val, growing the buffer as needed. Returns 0 (and sets
+ * local->op_ret to 0) on success, -1 if the lookup or an allocation fails. */
+static int
+dht_vgetxattr_alloc_and_fill(dht_local_t *local, dict_t *xattr, xlator_t *this,
+                             int op_errno)
+{
+    int ret = -1;
+    char *value = NULL;
+    char *grown = NULL;
+
+    ret = dict_get_str(xattr, local->xsel, &value);
+    if (ret) {
+        gf_msg(this->name, GF_LOG_ERROR, op_errno, DHT_MSG_GET_XATTR_FAILED,
+               "Subvolume %s returned -1", this->name);
+        local->op_ret = -1;
+        local->op_errno = op_errno;
+        goto out;
+    }
+
+    ret = -1; /* every failure below is an allocation failure */
+    local->alloc_len += strlen(value);
+
+    if (!local->xattr_val) {
+        local->alloc_len += (SLEN(DHT_PATHINFO_HEADER) + 10);
+        local->xattr_val = GF_MALLOC(local->alloc_len, gf_common_mt_char);
+        if (!local->xattr_val)
+            goto out;
+        local->xattr_val[0] = '\0';
+    } else if (local->xattr_val[0] != '\0') {
+        /* extra byte(s) for \0 to be safe */
+        local->alloc_len += (strlen(local->xattr_val) + 2);
+        /* realloc into a temporary so the old buffer survives a failure */
+        grown = GF_REALLOC(local->xattr_val, local->alloc_len);
+        if (!grown)
+            goto out;
+        local->xattr_val = grown;
+    }
+
+    (void)strcat(local->xattr_val, value);
+    (void)strcat(local->xattr_val, " ");
+    local->op_ret = 0;
 
-	return 0;
+    ret = 0;
+
+out:
+    return ret;
 }
 
+/* Build the final value for local->xsel from the accumulated
+ * local->xattr_val (plus the layout description when flag is set) and store
+ * it in a newly created *dict. Returns 0 on success, non-zero on failure;
+ * local->xattr_val is freed once dict_set_dynstr() has been attempted. */
+static int
+dht_vgetxattr_fill_and_set(dht_local_t *local, dict_t **dict, xlator_t *this,
+                           gf_boolean_t flag)
+{
+    int ret = -1;
+    char *xattr_buf = NULL;
+    char layout_buf[8192] = {0};
+
+    if (flag)
+        fill_layout_info(local->layout, layout_buf);
+
+    *dict = dict_new();
+    if (!*dict)
+        goto out;
+
+    /* nothing was accumulated: there is no trailing separator to strip */
+    if (!local->xattr_val || (local->xattr_val[0] == '\0'))
+        goto out;
+    local->xattr_val[strlen(local->xattr_val) - 1] = '\0';
+
+    /* Upper bound for the formatted string: the extra 40 bytes are an
+     * estimate for the translator name, spaces, brackets etc. added when
+     * forming the pathinfo string. node-uuid needs no such formatting, but
+     * this routine is shared, so the same allowance is used for both. */
+    local->alloc_len += (2 * strlen(this->name)) + strlen(layout_buf) + 40;
+    xattr_buf = GF_MALLOC(local->alloc_len, gf_common_mt_char);
+    if (!xattr_buf)
+        goto out;
+
+    if (XATTR_IS_PATHINFO(local->xsel)) {
+        (void)dht_fill_pathinfo_xattr(this, local, xattr_buf, local->alloc_len,
+                                      flag, layout_buf);
+    } else if ((XATTR_IS_NODE_UUID(local->xsel)) ||
+               (XATTR_IS_NODE_UUID_LIST(local->xsel))) {
+        (void)snprintf(xattr_buf, local->alloc_len, "%s", local->xattr_val);
+    } else {
+        gf_msg(this->name, GF_LOG_WARNING, 0, DHT_MSG_GET_XATTR_FAILED,
+               "Unknown local->xsel (%s)", local->xsel);
+        GF_FREE(xattr_buf);
+        goto out;
+    }
+
+    ret = dict_set_dynstr(*dict, local->xsel, xattr_buf);
+    if (ret)
+        GF_FREE(xattr_buf);
+    GF_FREE(local->xattr_val);
+    local->xattr_val = NULL; /* do not leave a dangling pointer in local */
 
-int
-dht_err_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
-	     int op_ret, int op_errno)
+out:
+    return ret;
+}
+
+static int
+dht_find_local_subvol_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+                          int op_ret, int op_errno, dict_t *xattr,
+                          dict_t *xdata)
 {
-	dht_local_t  *local = NULL;
-	int           this_call_cnt = 0;
-	call_frame_t *prev = NULL;
+    dht_local_t *local = NULL;
+    dht_conf_t *conf = NULL;
+    xlator_t *prev = NULL;
+    int this_call_cnt = 0;
+    int ret = 0;
+    char *uuid_str = NULL;
+    char *uuid_list = NULL;
+    char *next_uuid_str = NULL;
+    char *saveptr = NULL;
+    uuid_t node_uuid = {
+        0,
+    };
+    char *uuid_list_copy = NULL;
+    int count = 0;
+    int i = 0;
+    int index = 0;
+    int found = 0; /* set once prev has been recorded in conf->local_subvols */
+    nodeuuid_info_t *tmp_ptr = NULL;
+    /* Reply to a node-uuid getxattr; cookie is the subvol that answered. */
+    VALIDATE_OR_GOTO(frame, out);
+    VALIDATE_OR_GOTO(frame->local, out);
+
+    local = frame->local;
+    prev = cookie;
+    conf = this->private;
+
+    VALIDATE_OR_GOTO(conf->defrag, out);
+
+    gf_msg_debug(this->name, 0, "subvol %s returned", prev->name);
+
+    LOCK(&frame->lock);
+    {
+        this_call_cnt = --local->call_cnt;
+        if (op_ret < 0) {
+            local->op_ret = -1;
+            local->op_errno = op_errno;
+            UNLOCK(&frame->lock);
+            if (op_errno == ENODATA)
+                gf_msg_debug(this->name, 0, "failed to get node-uuid");
+            else
+                gf_msg(this->name, GF_LOG_ERROR, op_errno,
+                       DHT_MSG_GET_XATTR_FAILED, "failed to get node-uuid");
+            goto post_unlock;
+        }
+
+        ret = dict_get_str(xattr, local->xsel, &uuid_list);
 
+        if (ret < 0) {
+            gf_msg(this->name, GF_LOG_ERROR, 0, DHT_MSG_DICT_GET_FAILED,
+                   "Failed to get %s", local->xsel);
+            local->op_ret = -1;
+            local->op_errno = EINVAL;
+            goto unlock;
+        }
+
+        /* As DHT will not know details of its child xlators
+         * we need to parse this twice to get the count first
+         * and allocate memory later.
+         */
+        count = 0;
+        index = conf->local_subvols_cnt;
+
+        uuid_list_copy = gf_strdup(uuid_list);
+        if (!uuid_list_copy)
+            goto unlock; /* NOTE(review): ENOMEM is not recorded in local->op_ret/op_errno */
+
+        for (uuid_str = strtok_r(uuid_list, " ", &saveptr); uuid_str;
+             uuid_str = next_uuid_str) {
+            next_uuid_str = strtok_r(NULL, " ", &saveptr);
+            if (gf_uuid_parse(uuid_str, node_uuid)) {
+                local->op_ret = -1;
+                local->op_errno = EINVAL;
+                UNLOCK(&frame->lock);
+                gf_msg(this->name, GF_LOG_ERROR, 0, DHT_MSG_UUID_PARSE_ERROR,
+                       "Failed to parse uuid for %s", prev->name);
+                goto post_unlock;
+            }
+
+            count++;
+            if (gf_uuid_compare(node_uuid, conf->defrag->node_uuid)) {
+                gf_msg_debug(this->name, 0,
+                             "subvol %s does not "
+                             "belong to this node",
+                             prev->name);
+            } else {
+                /* handle multiple bricks of the same replica
+                 * on the same node */
+                if (found)
+                    continue;
+                conf->local_subvols[(conf->local_subvols_cnt)++] = prev;
+                found = 1;
+                gf_msg_debug(this->name, 0,
+                             "subvol %s belongs to"
+                             " this node",
+                             prev->name);
+            }
+        }
 
-	local = frame->local;
-	prev = cookie;
+        if (!found) {
+            local->op_ret = 0;
+            goto unlock;
+        }
 
-	LOCK (&frame->lock);
-	{
-		if (op_ret == -1) {
-			local->op_errno = op_errno;
-			gf_log (this->name, GF_LOG_ERROR,
-				"subvolume %s returned -1 (%s)",
-				prev->this->name, strerror (op_errno));
-			goto unlock;
-		}
+        conf->local_nodeuuids[index].count = count;
+        conf->local_nodeuuids[index].elements = GF_CALLOC(
+            count, sizeof(nodeuuid_info_t), 1);
+        /* NOTE(review): the allocation above is not NULL-checked before use. */
+        /* The node-uuids are guaranteed to be returned in the same
+         * order as the bricks
+         * A null node-uuid is returned for a brick that is down.
+         */
+
+        saveptr = NULL;
+        i = 0;
+
+        for (uuid_str = strtok_r(uuid_list_copy, " ", &saveptr); uuid_str;
+             uuid_str = next_uuid_str) {
+            next_uuid_str = strtok_r(NULL, " ", &saveptr);
+            tmp_ptr = &(conf->local_nodeuuids[index].elements[i]);
+            gf_uuid_parse(uuid_str, tmp_ptr->uuid);
+
+            if (!gf_uuid_compare(tmp_ptr->uuid, conf->defrag->node_uuid)) {
+                tmp_ptr->info = REBAL_NODEUUID_MINE;
+            }
+            i++;
+            tmp_ptr = NULL;
+        }
+    }
 
-		local->op_ret = 0;
-	}
+    local->op_ret = 0;
 unlock:
-	UNLOCK (&frame->lock);
+    UNLOCK(&frame->lock);
+post_unlock:
+    if (!is_last_call(this_call_cnt))
+        goto out;
 
-	this_call_cnt = dht_frame_return (frame);
-	if (is_last_call (this_call_cnt))
-		DHT_STACK_UNWIND (frame, local->op_ret, local->op_errno);
+    if (local->op_ret == -1) {
+        goto unwind;
+    }
 
-        return 0;
-}
+    DHT_STACK_UNWIND(getxattr, frame, 0, 0, xattr, xdata);
+    goto out;
 
+unwind:
 
-int
-dht_access (call_frame_t *frame, xlator_t *this,
-	    loc_t *loc, int32_t mask)
+    GF_FREE(conf->local_nodeuuids[index].elements);
+    conf->local_nodeuuids[index].elements = NULL;
+
+    DHT_STACK_UNWIND(getxattr, frame, -1, local->op_errno, NULL, xdata);
+out:
+    GF_FREE(uuid_list_copy);
+    return 0;
+}
+
+static int
+dht_vgetxattr_dir_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+                      int op_ret, int op_errno, dict_t *xattr, dict_t *xdata)
 {
-	xlator_t     *subvol = NULL;
-        int           op_errno = -1;
-	dht_local_t  *local = NULL;
+    int ret = 0;
+    dht_local_t *local = NULL;
+    int this_call_cnt = 0;
+    dict_t *dict = NULL;
+    /* Per-subvol reply for a getxattr fanned out over a directory layout. */
+    VALIDATE_OR_GOTO(frame, out);
+    VALIDATE_OR_GOTO(frame->local, out);
+
+    local = frame->local;
+
+    LOCK(&frame->lock);
+    {
+        this_call_cnt = --local->call_cnt;
+        if (op_ret < 0) {
+            if (op_errno != ENOTCONN) {
+                local->op_ret = -1;
+                local->op_errno = op_errno;
+                UNLOCK(&frame->lock);
+                gf_msg(this->name, GF_LOG_ERROR, op_errno,
+                       DHT_MSG_GET_XATTR_FAILED, "getxattr err for dir");
+                goto post_unlock;
+            }
+
+            goto unlock; /* ENOTCONN: reply is skipped without marking a failure */
+        }
 
+        ret = dht_vgetxattr_alloc_and_fill(local, xattr, this, op_errno);
+        if (ret) {
+            UNLOCK(&frame->lock);
+            gf_msg(this->name, GF_LOG_ERROR, op_errno, DHT_MSG_DICT_SET_FAILED,
+                   "alloc or fill failure");
+            goto post_unlock;
+        }
+    }
+unlock:
+    UNLOCK(&frame->lock);
+post_unlock:
+    if (!is_last_call(this_call_cnt))
+        goto out;
 
-        VALIDATE_OR_GOTO (frame, err);
-        VALIDATE_OR_GOTO (this, err);
-        VALIDATE_OR_GOTO (loc, err);
-        VALIDATE_OR_GOTO (loc->inode, err);
-        VALIDATE_OR_GOTO (loc->path, err);
+    /* Last reply: build the combined result dict and unwind exactly once. */
 
-	subvol = dht_subvol_get_cached (this, loc->inode);
-	if (!subvol) {
-		gf_log (this->name, GF_LOG_ERROR,
-			"no cached subvolume for path=%s", loc->path);
-		op_errno = EINVAL;
-		goto err;
-	}
+    if (local->op_ret == -1) {
+        goto unwind;
+    }
 
-	local = dht_local_init (frame);
-	if (!local) {
-		op_errno = ENOMEM;
-		gf_log (this->name, GF_LOG_ERROR,
-			"memory allocation failed :(");
-		goto err;
-	}
+    ret = dht_vgetxattr_fill_and_set(local, &dict, this, _gf_true);
+    if (ret)
+        goto unwind;
 
-	local->call_cnt = 1;
+    DHT_STACK_UNWIND(getxattr, frame, 0, 0, dict, xdata);
+    goto cleanup;
 
-	STACK_WIND (frame, dht_err_cbk,
-		    subvol, subvol->fops->access,
-		    loc, mask);
+unwind:
+    DHT_STACK_UNWIND(getxattr, frame, -1, local->op_errno, NULL, NULL);
+cleanup:
+    if (dict)
+        dict_unref(dict);
+out:
+    return 0;
+}
 
-	return 0;
+static int
+dht_vgetxattr_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int op_ret,
+                  int op_errno, dict_t *xattr, dict_t *xdata)
+{
+    dht_local_t *local = NULL;
+    int ret = 0;
+    dict_t *dict = NULL;
+    xlator_t *prev = NULL;
+    gf_boolean_t flag = _gf_true;
+    /* Single-subvol reply: cookie is the subvol the getxattr was wound to. */
+    local = frame->local;
+    prev = cookie;
+
+    if (op_ret < 0) {
+        local->op_ret = -1;
+        local->op_errno = op_errno;
+        gf_msg(this->name, GF_LOG_ERROR, op_errno, DHT_MSG_GET_XATTR_FAILED,
+               "vgetxattr: Subvolume %s returned -1", prev->name);
+        goto unwind;
+    }
+
+    ret = dht_vgetxattr_alloc_and_fill(local, xattr, this, op_errno);
+    if (ret) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, DHT_MSG_NO_MEMORY,
+               "Allocation or fill failure");
+        goto unwind;
+    }
+    /* flag is true only when the layout has more than one entry. */
+    flag = (local->layout->cnt > 1) ? _gf_true : _gf_false;
+
+    ret = dht_vgetxattr_fill_and_set(local, &dict, this, flag);
+    if (ret)
+        goto unwind;
+
+    DHT_STACK_UNWIND(getxattr, frame, 0, 0, dict, xdata);
+    goto cleanup;
 
-err:
-	op_errno = (op_errno == -1) ? errno : op_errno;
-	DHT_STACK_UNWIND (frame, -1, op_errno);
+unwind:
+    DHT_STACK_UNWIND(getxattr, frame, -1, local->op_errno, NULL, NULL);
+cleanup:
+    if (dict)
+        dict_unref(dict);
 
-	return 0;
+    return 0;
 }
 
-
-int
-dht_readlink_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
-		  int op_ret, int op_errno, const char *path)
+static int
+dht_linkinfo_getxattr_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+                          int op_ret, int op_errno, dict_t *xattr,
+                          dict_t *xdata)
 {
-        DHT_STACK_UNWIND (frame, op_ret, op_errno, path);
+    int ret = 0;
+    char *value = NULL;
+    /* On success, republish the pathinfo value under the linkinfo key. */
+    if (op_ret != -1) {
+        ret = dict_get_str(xattr, GF_XATTR_PATHINFO_KEY, &value);
+        if (!ret) {
+            ret = dict_set_str(xattr, GF_XATTR_LINKINFO_KEY, value);
+            if (ret) /* non-zero: the set failed (best effort, only traced) */
+                gf_msg_trace(this->name, 0, "failed to set linkinfo");
+        }
+    }
 
-        return 0;
+    DHT_STACK_UNWIND(getxattr, frame, op_ret, op_errno, xattr, xdata);
+
+    return 0;
 }
 
+static int
+dht_mds_getxattr_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+                     int op_ret, int op_errno, dict_t *xattr, dict_t *xdata)
+{
+    dht_local_t *local = NULL;
+    dht_conf_t *conf = NULL;
+    /* Reply to a getxattr wound to a single subvol (local->mds_subvol). */
+    VALIDATE_OR_GOTO(frame, err);
+    VALIDATE_OR_GOTO(frame->local, err);
+    VALIDATE_OR_GOTO(this->private, err);
+
+    conf = this->private;
+    local = frame->local;
+
+    if (!xattr || (op_ret == -1)) {
+        local->op_ret = op_ret;
+        goto out;
+    }
+    dict_del(xattr, conf->xattr_name); /* drop this key before passing the dict up */
+    local->op_ret = 0;
+
+    if (!local->xattr) {
+        local->xattr = dict_copy_with_ref(xattr, NULL);
+    }
+
+out: /* unwinds with the callback's op_errno, not local->op_errno */
+    DHT_STACK_UNWIND(getxattr, frame, local->op_ret, op_errno, local->xattr,
+                     xdata);
+    return 0;
+err:
+    DHT_STACK_UNWIND(getxattr, frame, -1, EINVAL, NULL, NULL);
+    return 0;
+}
 
 int
-dht_readlink (call_frame_t *frame, xlator_t *this,
-	      loc_t *loc, size_t size)
+dht_getxattr_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int op_ret,
+                 int op_errno, dict_t *xattr, dict_t *xdata)
 {
-	xlator_t     *subvol = NULL;
-        int           op_errno = -1;
+    int this_call_cnt = 0;
+    dht_local_t *local = NULL;
+    dht_conf_t *conf = NULL;
+    int ret = 0;
+    /* Per-subvol getxattr reply: filter keys, merge into local, unwind on last. */
+    VALIDATE_OR_GOTO(frame, err);
+    VALIDATE_OR_GOTO(frame->local, err);
+    VALIDATE_OR_GOTO(this->private, err);
+
+    conf = this->private;
+    local = frame->local;
+
+    if (dht_check_remote_fd_failed_error(local, op_ret, op_errno)) {
+        ret = dht_check_and_open_fd_on_subvol(this, frame);
+        if (ret)
+            goto err;
+        return 0;
+    }
 
+    LOCK(&frame->lock);
+    {
+        if (!xattr || (op_ret == -1)) {
+            local->op_ret = op_ret;
+            goto unlock;
+        }
 
-        VALIDATE_OR_GOTO (frame, err);
-        VALIDATE_OR_GOTO (this, err);
-        VALIDATE_OR_GOTO (loc, err);
-        VALIDATE_OR_GOTO (loc->inode, err);
-        VALIDATE_OR_GOTO (loc->path, err);
+        dict_del(xattr, conf->xattr_name);
+        dict_del(xattr, conf->mds_xattr_key);
 
-	subvol = dht_subvol_get_cached (this, loc->inode);
-	if (!subvol) {
-		gf_log (this->name, GF_LOG_ERROR,
-			"no cached subvolume for path=%s", loc->path);
-		op_errno = EINVAL;
-		goto err;
-	}
+        dict_del(xattr, conf->commithash_xattr_name);
 
-	STACK_WIND (frame, dht_readlink_cbk,
-		    subvol, subvol->fops->readlink,
-		    loc, size);
+        if (frame->root->pid >= 0) { /* quota/pgfid keys are kept for negative pids */
+            GF_REMOVE_INTERNAL_XATTR("trusted.glusterfs.quota*", xattr);
+            GF_REMOVE_INTERNAL_XATTR("trusted.pgfid*", xattr);
+        }
 
-	return 0;
+        local->op_ret = 0;
 
+        if (!local->xattr) {
+            local->xattr = dict_copy_with_ref(xattr, NULL);
+        } else {
+            dht_aggregate_xattr(local->xattr, xattr);
+        }
+
+        if (!local->xdata) {
+            local->xdata = dict_ref(xdata);
+        } else if ((local->inode && IA_ISDIR(local->inode->ia_type)) ||
+                   (local->fd && IA_ISDIR(local->fd->inode->ia_type))) {
+            dht_aggregate_xattr(local->xdata, xdata);
+        }
+    }
+unlock:
+    UNLOCK(&frame->lock);
+
+    this_call_cnt = dht_frame_return(frame);
+    if (is_last_call(this_call_cnt)) {
+        /* If any subvolume gave us a usable xattr dict, report
+         * success even if other replies failed. */
+        if (local->xattr) {
+            local->op_ret = 0;
+        }
+
+        DHT_STACK_UNWIND(getxattr, frame, local->op_ret, op_errno, local->xattr,
+                         local->xdata);
+    }
+    return 0;
 err:
-	op_errno = (op_errno == -1) ? errno : op_errno;
-	DHT_STACK_UNWIND (frame, -1, op_errno, NULL);
+    DHT_STACK_UNWIND(getxattr, frame, -1, EINVAL, NULL, NULL);
+    return 0;
+}
 
-	return 0;
+static int32_t
+dht_getxattr_unwind(call_frame_t *frame, int op_ret, int op_errno, dict_t *dict,
+                    dict_t *xdata)
+{ /* function form of DHT_STACK_UNWIND(getxattr, ...) so it can be passed as a callback */
+    DHT_STACK_UNWIND(getxattr, frame, op_ret, op_errno, dict, xdata);
+    return 0;
 }
 
+static int
+dht_getxattr_get_real_filename_cbk(call_frame_t *frame, void *cookie,
+                                   xlator_t *this, int op_ret, int op_errno,
+                                   dict_t *xattr, dict_t *xdata)
+{
+    int this_call_cnt = 0;
+    dht_local_t *local = NULL;
+    /* Per-subvol reply; the combined result is unwound on the last reply. */
+    local = frame->local;
+
+    LOCK(&frame->lock);
+    {
+        if (local->op_errno == EOPNOTSUPP) {
+            /* An earlier reply already recorded EOPNOTSUPP, i.e.
+             * some subvol lacks the get_real_filename optimization.
+             * That result is kept, so this reply is ignored.
+             */
+            goto unlock;
+        }
 
-int
-dht_getxattr_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
-		  int op_ret, int op_errno, dict_t *xattr)
+        if (op_ret == -1) {
+            if (op_errno == EOPNOTSUPP) {
+                /* This subvol does not have the optimization.
+                 * Better let the user know we don't support it.
+                 * Remove previous results if any.
+                 */
+
+                if (local->xattr) {
+                    dict_unref(local->xattr);
+                    local->xattr = NULL;
+                }
+
+                if (local->xattr_req) {
+                    dict_unref(local->xattr_req);
+                    local->xattr_req = NULL;
+                }
+
+                local->op_ret = op_ret;
+                local->op_errno = op_errno;
+                UNLOCK(&frame->lock);
+                gf_msg(this->name, GF_LOG_WARNING, op_errno,
+                       DHT_MSG_UPGRADE_BRICKS,
+                       "At least "
+                       "one of the bricks does not support "
+                       "this operation. Please upgrade all "
+                       "bricks.");
+                goto post_unlock;
+            }
+
+            if (op_errno == ENOATTR) {
+                /* Do nothing, our defaults are set to this.
+                 */
+                goto unlock;
+            }
+
+            /* This is a place holder for every other error
+             * case. I am not sure of how to interpret
+             * ENOTCONN etc. As of now, choosing to ignore
+             * down subvol and return a good result(if any)
+             * from other subvol.
+             */
+            UNLOCK(&frame->lock);
+            gf_msg(this->name, GF_LOG_WARNING, op_errno,
+                   DHT_MSG_GET_XATTR_FAILED, "Failed to get real filename.");
+            goto post_unlock;
+        }
+
+        /* This subvol has the required file.
+         * There could be other subvols which have returned
+         * success already, choosing to return the latest good
+         * result.
+         */
+        if (local->xattr)
+            dict_unref(local->xattr);
+        local->xattr = dict_ref(xattr);
+
+        if (local->xattr_req) {
+            dict_unref(local->xattr_req);
+            local->xattr_req = NULL;
+        }
+        if (xdata)
+            local->xattr_req = dict_ref(xdata); /* carries reply xdata to the unwind below */
+
+        local->op_ret = op_ret;
+        local->op_errno = 0;
+        UNLOCK(&frame->lock);
+        gf_msg_debug(this->name, 0, "Found a matching file.");
+        goto post_unlock;
+    }
+unlock:
+    UNLOCK(&frame->lock);
+post_unlock:
+    this_call_cnt = dht_frame_return(frame);
+    if (is_last_call(this_call_cnt)) {
+        DHT_STACK_UNWIND(getxattr, frame, local->op_ret, local->op_errno,
+                         local->xattr, local->xattr_req);
+    }
+
+    return 0;
+}
+
+static int
+dht_getxattr_get_real_filename(call_frame_t *frame, xlator_t *this, loc_t *loc,
+                               const char *key, dict_t *xdata)
 {
-        DHT_STACK_UNWIND (frame, op_ret, op_errno, xattr);
+    dht_local_t *local = NULL;
+    int i = 0;
+    dht_layout_t *layout = NULL;
+    int cnt = 0;
+    xlator_t *subvol = NULL;
 
-        return 0;
+    local = frame->local;
+    layout = local->layout;
+    /* One reply is expected from every subvol in the layout. */
+    cnt = local->call_cnt = layout->cnt;
+    /* Default result (-1/ENOATTR) stands unless a reply overrides it. */
+    local->op_ret = -1;
+    local->op_errno = ENOATTR;
+
+    for (i = 0; i < cnt; i++) {
+        subvol = layout->list[i].xlator;
+        STACK_WIND(frame, dht_getxattr_get_real_filename_cbk, subvol,
+                   subvol->fops->getxattr, loc, key, xdata);
+    }
+
+    return 0;
 }
 
+static int
+dht_marker_populate_args(call_frame_t *frame, int type, int *gauge,
+                         xlator_t **subvols)
+{
+    dht_local_t *local = NULL;
+    int i = 0;
+    dht_layout_t *layout = NULL;
+    /* Copies the layout's subvols into subvols[]; type and gauge are unused. */
+    local = frame->local;
+    layout = local->layout;
 
-int
-dht_getxattr (call_frame_t *frame, xlator_t *this,
-	      loc_t *loc, const char *key)
+    for (i = 0; i < layout->cnt; i++)
+        subvols[i] = layout->list[i].xlator;
+    /* Returns the number of entries written. */
+    return layout->cnt;
+}
+
+static int
+dht_is_debug_xattr_key(const char **array, char *key)
 {
-	xlator_t     *subvol = NULL;
-        int           op_errno = -1;
+    int i = 0; /* index into the NULL-terminated pattern array */
 
+    for (i = 0; array[i]; i++) {
+        if (fnmatch(array[i], key, FNM_NOESCAPE) == 0)
+            return i; /* index of the first pattern that key matches */
+    }
 
-        VALIDATE_OR_GOTO (frame, err);
-        VALIDATE_OR_GOTO (this, err);
-        VALIDATE_OR_GOTO (loc, err);
-        VALIDATE_OR_GOTO (loc->inode, err);
-        VALIDATE_OR_GOTO (loc->path, err);
+    return -1; /* no pattern matched */
+}
 
-	subvol = dht_subvol_get_cached (this, loc->inode);
-	if (!subvol) {
-		gf_log (this->name, GF_LOG_ERROR,
-			"no cached subvolume for path=%s", loc->path);
-		op_errno = EINVAL;
-		goto err;
-	}
+/* Answers DHT debug virtual xattrs; frame->local is already initialised here. */
 
-	STACK_WIND (frame, dht_getxattr_cbk,
-		    subvol, subvol->fops->getxattr,
-		    loc, key);
+static int
+dht_handle_debug_getxattr(call_frame_t *frame, xlator_t *this, loc_t *loc,
+                          const char *key)
+{
+    dht_local_t *local = NULL;
+    int ret = -1;
+    int op_errno = ENODATA;
+    char *value = NULL;
+    loc_t file_loc = {0};
+    const char *name = NULL;
+
+    local = frame->local;
+
+    if (dht_is_debug_xattr_key(dht_dbg_vxattrs, (char *)key) == -1) {
+        goto out;
+    }
+
+    local->xattr = dict_new();
+    if (!local->xattr) {
+        op_errno = ENOMEM;
+        goto out;
+    }
+    /* Key is DHT_DBG_HASHED_SUBVOL_KEY + <name>: report <name>'s hashed subvol. */
+    if (strncmp(key, DHT_DBG_HASHED_SUBVOL_KEY,
+                SLEN(DHT_DBG_HASHED_SUBVOL_KEY)) == 0) {
+        name = key + strlen(DHT_DBG_HASHED_SUBVOL_KEY);
+        if (strlen(name) == 0) {
+            op_errno = EINVAL;
+            goto out;
+        }
 
-	return 0;
+        ret = dht_build_child_loc(this, &file_loc, loc, (char *)name);
+        if (ret) {
+            op_errno = ENOMEM;
+            goto out;
+        }
 
-err:
-	op_errno = (op_errno == -1) ? errno : op_errno;
-	DHT_STACK_UNWIND (frame, -1, op_errno, NULL);
+        local->hashed_subvol = dht_subvol_get_hashed(this, &file_loc);
+        if (local->hashed_subvol == NULL) {
+            op_errno = ENODATA;
+            goto out;
+        }
+
+        value = gf_strdup(local->hashed_subvol->name);
+        if (!value) {
+            op_errno = ENOMEM;
+            goto out;
+        }
+
+        ret = dict_set_dynstr(local->xattr, (char *)key, value);
+        if (ret < 0) {
+            op_errno = -ret;
+            ret = -1;
+            goto out;
+        }
+        ret = 0;
+        goto out;
+    }
 
-	return 0;
+out: /* single exit: every path, success or failure, unwinds here */
+    loc_wipe(&file_loc);
+    DHT_STACK_UNWIND(getxattr, frame, ret, op_errno, local->xattr, NULL);
+    return 0;
 }
 
+/* Virtual xattr whose value is 1 when all subvols are up and
+   0 when at least one is down. Geo-rep uses this virtual xattr
+   after a fresh mount and then starts the I/O.
+*/
+
+enum dht_vxattr_subvol {
+    DHT_VXATTR_SUBVOLS_UP = 1,
+    DHT_VXATTR_SUBVOLS_DOWN = 0,
+};
+
+int
+dht_vgetxattr_subvol_status(call_frame_t *frame, xlator_t *this,
+                            const char *key)
+{
+    dht_local_t *local = NULL;
+    int ret = -1;
+    int op_errno = ENODATA;
+    int value = DHT_VXATTR_SUBVOLS_UP;
+    int i = 0;
+    dht_conf_t *conf = NULL;
+
+    conf = this->private;
+    local = frame->local;
+
+    if (!key) {
+        op_errno = EINVAL;
+        goto out;
+    }
+    local->xattr = dict_new();
+    if (!local->xattr) {
+        op_errno = ENOMEM;
+        goto out;
+    }
+    for (i = 0; i < conf->subvolume_cnt; i++) {
+        if (!conf->subvolume_status[i]) {
+            value = DHT_VXATTR_SUBVOLS_DOWN;
+            gf_msg_debug(this->name, 0, "subvol %s is down ",
+                         conf->subvolumes[i]->name);
+            break; /* one down subvol decides the answer */
+        }
+    }
+    ret = dict_set_int8(local->xattr, (char *)key, value);
+    if (ret < 0) {
+        op_errno = -ret;
+        ret = -1;
+        goto out;
+    }
+    ret = 0;
+
+out: /* always unwinds the frame, on success and on failure */
+    DHT_STACK_UNWIND(getxattr, frame, ret, op_errno, local->xattr, NULL);
+    return 0;
+}
 
 int
-dht_setxattr (call_frame_t *frame, xlator_t *this,
-	      loc_t *loc, dict_t *xattr, int flags)
+dht_getxattr(call_frame_t *frame, xlator_t *this, loc_t *loc, const char *key,
+             dict_t *xdata)
+#define DHT_IS_DIR(layout) ((layout)->cnt > 1)
 {
-	xlator_t     *subvol = NULL;
-        int           op_errno = -1;
-	dht_local_t  *local = NULL;
+    xlator_t *subvol = NULL;
+    xlator_t *hashed_subvol = NULL;
+    xlator_t *mds_subvol = NULL;
+    xlator_t *cached_subvol = NULL;
+    dht_conf_t *conf = NULL;
+    dht_local_t *local = NULL;
+    dht_layout_t *layout = NULL;
+    int op_errno = -1;
+    int i = 0;
+    int cnt = 0;
+    char *node_uuid_key = NULL;
+    int ret = -1;
+    /* getxattr fop: special keys are handled first, the rest wind from no_key. */
+    GF_CHECK_XATTR_KEY_AND_GOTO(key, IO_THREADS_QUEUE_SIZE_KEY, op_errno, err);
+    VALIDATE_OR_GOTO(frame, err);
+    VALIDATE_OR_GOTO(this, err);
+    VALIDATE_OR_GOTO(loc, err);
+    VALIDATE_OR_GOTO(loc->inode, err);
+    VALIDATE_OR_GOTO(this->private, err);
+
+    conf = this->private;
+
+    local = dht_local_init(frame, loc, NULL, GF_FOP_GETXATTR);
+    if (!local) {
+        op_errno = ENOMEM;
+
+        goto err;
+    }
+
+    layout = local->layout;
+    if (!layout) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, DHT_MSG_LAYOUT_NULL,
+               "Layout is NULL");
+        op_errno = ENOENT;
+        goto err;
+    }
+
+    /* skip over code which is irrelevant without a valid key */
+    if (!key)
+        goto no_key;
+
+    local->key = gf_strdup(key);
+    if (!local->key) {
+        op_errno = ENOMEM;
+        goto err;
+    }
+
+    if (strncmp(key, conf->mds_xattr_key, strlen(key)) == 0) {
+        op_errno = ENOTSUP;
+        goto err;
+    }
+
+    if (strncmp(key, DHT_SUBVOL_STATUS_KEY, SLEN(DHT_SUBVOL_STATUS_KEY)) == 0) {
+        dht_vgetxattr_subvol_status(frame, this, key);
+        return 0;
+    }
+
+    /* skip over code which is irrelevant if !DHT_IS_DIR(layout) */
+    if (!DHT_IS_DIR(layout))
+        goto no_dht_is_dir;
+
+    if ((strncmp(key, GF_XATTR_GET_REAL_FILENAME_KEY,
+                 SLEN(GF_XATTR_GET_REAL_FILENAME_KEY)) == 0) &&
+        DHT_IS_DIR(layout)) {
+        dht_getxattr_get_real_filename(frame, this, loc, key, xdata);
+        return 0;
+    }
+
+    if (!strcmp(key, GF_REBAL_FIND_LOCAL_SUBVOL)) {
+        ret = gf_asprintf(&node_uuid_key, "%s", GF_XATTR_LIST_NODE_UUIDS_KEY);
+        if (ret == -1 || !node_uuid_key) {
+            gf_msg(this->name, GF_LOG_ERROR, 0, DHT_MSG_NO_MEMORY,
+                   "Failed to copy node uuid key");
+            op_errno = ENOMEM;
+            goto err;
+        }
+        (void)snprintf(local->xsel, sizeof(local->xsel), "%s", node_uuid_key);
+        cnt = local->call_cnt = conf->subvolume_cnt;
+        for (i = 0; i < cnt; i++) {
+            STACK_WIND_COOKIE(frame, dht_find_local_subvol_cbk,
+                              conf->subvolumes[i], conf->subvolumes[i],
+                              conf->subvolumes[i]->fops->getxattr, loc,
+                              node_uuid_key, xdata);
+        }
+        if (node_uuid_key)
+            GF_FREE(node_uuid_key);
+        return 0;
+    }
+
+    if (!strcmp(key, GF_REBAL_OLD_FIND_LOCAL_SUBVOL)) {
+        ret = gf_asprintf(&node_uuid_key, "%s", GF_XATTR_NODE_UUID_KEY);
+        if (ret == -1 || !node_uuid_key) {
+            gf_msg(this->name, GF_LOG_ERROR, 0, DHT_MSG_NO_MEMORY,
+                   "Failed to copy node uuid key");
+            op_errno = ENOMEM;
+            goto err;
+        }
+        (void)snprintf(local->xsel, sizeof(local->xsel), "%s", node_uuid_key);
+        cnt = local->call_cnt = conf->subvolume_cnt;
+        for (i = 0; i < cnt; i++) {
+            STACK_WIND_COOKIE(frame, dht_find_local_subvol_cbk,
+                              conf->subvolumes[i], conf->subvolumes[i],
+                              conf->subvolumes[i]->fops->getxattr, loc,
+                              node_uuid_key, xdata);
+        }
+        if (node_uuid_key)
+            GF_FREE(node_uuid_key);
+        return 0;
+    }
+
+    /* for file use cached subvolume (obviously!): see if {}
+     * below
+     * for directory:
+     *  wind to all subvolumes and exclude subvolumes which
+     *  return ENOTCONN (in callback)
+     *
+     * NOTE: Don't trust inode here, as that may not be valid
+     *       (until inode_link() happens)
+     */
+
+    if (XATTR_IS_PATHINFO(key) || (strcmp(key, GF_XATTR_NODE_UUID_KEY) == 0) ||
+        (strcmp(key, GF_XATTR_LIST_NODE_UUIDS_KEY) == 0)) {
+        (void)snprintf(local->xsel, sizeof(local->xsel), "%s", key);
+        cnt = local->call_cnt = layout->cnt;
+        for (i = 0; i < cnt; i++) {
+            subvol = layout->list[i].xlator;
+            STACK_WIND(frame, dht_vgetxattr_dir_cbk, subvol,
+                       subvol->fops->getxattr, loc, key, xdata);
+        }
+        return 0;
+    }
+
+no_dht_is_dir:
+    /* node-uuid or pathinfo for files */
+    if (XATTR_IS_PATHINFO(key) || (strcmp(key, GF_XATTR_NODE_UUID_KEY) == 0)) {
+        cached_subvol = local->cached_subvol;
+        (void)snprintf(local->xsel, sizeof(local->xsel), "%s", key);
+        local->call_cnt = 1;
+        STACK_WIND_COOKIE(frame, dht_vgetxattr_cbk, cached_subvol,
+                          cached_subvol, cached_subvol->fops->getxattr, loc,
+                          key, xdata);
+
+        return 0;
+    }
+
+    if (strcmp(key, GF_XATTR_LINKINFO_KEY) == 0) {
+        hashed_subvol = dht_subvol_get_hashed(this, loc);
+        if (!hashed_subvol) {
+            gf_msg(this->name, GF_LOG_ERROR, 0,
+                   DHT_MSG_HASHED_SUBVOL_GET_FAILED,
+                   "Failed to get hashed subvol for %s", loc->path);
+            op_errno = EINVAL;
+            goto err;
+        }
+
+        cached_subvol = dht_subvol_get_cached(this, loc->inode);
+        if (!cached_subvol) {
+            gf_msg(this->name, GF_LOG_ERROR, 0,
+                   DHT_MSG_CACHED_SUBVOL_GET_FAILED,
+                   "Failed to get cached subvol for %s", loc->path);
+            op_errno = EINVAL;
+            goto err;
+        }
+
+        if (hashed_subvol == cached_subvol) {
+            op_errno = ENODATA;
+            goto err;
+        }
 
+        STACK_WIND(frame, dht_linkinfo_getxattr_cbk, hashed_subvol,
+                   hashed_subvol->fops->getxattr, loc, GF_XATTR_PATHINFO_KEY,
+                   xdata);
+        return 0;
+    }
 
-        VALIDATE_OR_GOTO (frame, err);
-        VALIDATE_OR_GOTO (this, err);
-        VALIDATE_OR_GOTO (loc, err);
-        VALIDATE_OR_GOTO (loc->inode, err);
-        VALIDATE_OR_GOTO (loc->path, err);
+    if (dht_is_debug_xattr_key(dht_dbg_vxattrs, (char *)key) >= 0) {
+        dht_handle_debug_getxattr(frame, this, loc, key);
+        return 0;
+    }
 
-	subvol = dht_subvol_get_cached (this, loc->inode);
-	if (!subvol) {
-		gf_log (this->name, GF_LOG_ERROR,
-			"no cached subvolume for path=%s", loc->path);
-		op_errno = EINVAL;
-		goto err;
-	}
+no_key:
+    if (cluster_handle_marker_getxattr(frame, loc, key, conf->vol_uuid,
+                                       dht_getxattr_unwind,
+                                       dht_marker_populate_args) == 0)
+        return 0;
 
-	local = dht_local_init (frame);
-	if (!local) {
-		op_errno = ENOMEM;
-		gf_log (this->name, GF_LOG_ERROR,
-			"memory allocation failed :(");
-		goto err;
-	}
+    if (DHT_IS_DIR(layout)) {
+        local->call_cnt = conf->subvolume_cnt;
+        cnt = conf->subvolume_cnt;
+        ret = dht_inode_ctx_mdsvol_get(loc->inode, this, &mds_subvol);
+        if (!mds_subvol) {
+            gf_msg(this->name, GF_LOG_INFO, 0, DHT_MSG_HASHED_SUBVOL_GET_FAILED,
+                   "Cannot determine MDS, fetching xattr %s randomly"
+                   " from a subvol for path %s ",
+                   key, loc->path);
+        } else {
+            /* TODO need to handle it. As of now we are
+               choosing availability instead of choosing
+               consistency: in case mds_subvol is down we
+               wind a getxattr call on the other subvols
+               and return their xattr
+            */
+            local->mds_subvol = mds_subvol;
+            for (i = 0; i < cnt; i++) {
+                if (conf->subvolumes[i] == mds_subvol) {
+                    if (!conf->subvolume_status[i]) {
+                        gf_msg(this->name, GF_LOG_INFO, 0,
+                               DHT_MSG_HASHED_SUBVOL_DOWN,
+                               "MDS %s is down for"
+                               " path %s so fetching xattr "
+                               "%s randomly from a subvol ",
+                               local->mds_subvol->name, loc->path, key);
+                        ret = 1;
+                    }
+                }
+            }
+        }
 
-	local->call_cnt = 1;
+        if (!ret && key && local->mds_subvol && dht_match_xattr(key)) {
+            STACK_WIND(frame, dht_mds_getxattr_cbk, local->mds_subvol,
+                       local->mds_subvol->fops->getxattr, loc, key, xdata);
 
-	STACK_WIND (frame, dht_err_cbk,
-		    subvol, subvol->fops->setxattr,
-		    loc, xattr, flags);
+            return 0;
+        }
+    } else {
+        cnt = local->call_cnt = 1;
+    }
 
-	return 0;
+    for (i = 0; i < cnt; i++) { /* NOTE(review): assumes layout->cnt >= cnt for dirs */
+        subvol = layout->list[i].xlator;
+        STACK_WIND(frame, dht_getxattr_cbk, subvol, subvol->fops->getxattr, loc,
+                   key, xdata);
+    }
+    return 0;
 
 err:
-	op_errno = (op_errno == -1) ? errno : op_errno;
-	DHT_STACK_UNWIND (frame, -1, op_errno, NULL);
+    op_errno = (op_errno == -1) ? errno : op_errno;
+    DHT_STACK_UNWIND(getxattr, frame, -1, op_errno, NULL, NULL);
 
-	return 0;
+    return 0;
 }
-
+#undef DHT_IS_DIR
 
 int
-dht_removexattr (call_frame_t *frame, xlator_t *this,
-		 loc_t *loc, const char *key)
+dht_fgetxattr(call_frame_t *frame, xlator_t *this, fd_t *fd, const char *key,
+              dict_t *xdata)
 {
-	xlator_t     *subvol = NULL;
-        int           op_errno = -1;
-	dht_local_t  *local = NULL;
+    xlator_t *subvol = NULL;
+    dht_local_t *local = NULL;
+    dht_layout_t *layout = NULL;
+    int op_errno = -1;
+    int i = 0;
+    int cnt = 0;
+    xlator_t *mds_subvol = NULL;
+    int ret = -1;
+    dht_conf_t *conf = NULL;
+    char gfid[GF_UUID_BUF_SIZE] = {0};
+
+    VALIDATE_OR_GOTO(frame, err);
+    VALIDATE_OR_GOTO(this, err);
+    VALIDATE_OR_GOTO(fd, err);
+    VALIDATE_OR_GOTO(fd->inode, err);
+    VALIDATE_OR_GOTO(this->private, err);
+
+    conf = this->private;
+
+    local = dht_local_init(frame, NULL, fd, GF_FOP_FGETXATTR);
+    if (!local) {
+        op_errno = ENOMEM;
+
+        goto err;
+    }
+
+    layout = local->layout;
+    if (!layout) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, DHT_MSG_LAYOUT_NULL,
+               "Layout is NULL");
+        op_errno = ENOENT;
+        goto err;
+    }
+
+    if (key) {
+        local->key = gf_strdup(key);
+        if (!local->key) {
+            op_errno = ENOMEM;
+            goto err;
+        }
+    }
+
+    gf_uuid_unparse(fd->inode->gfid, gfid);
+
+    if ((fd->inode->ia_type == IA_IFDIR) && key &&
+        (strncmp(key, GF_XATTR_LOCKINFO_KEY, SLEN(GF_XATTR_LOCKINFO_KEY)) !=
+         0)) {
+        local->call_cnt = conf->subvolume_cnt;
+        cnt = conf->subvolume_cnt;
+        ret = dht_inode_ctx_mdsvol_get(fd->inode, this, &mds_subvol);
+
+        if (!mds_subvol) {
+            gf_msg(this->name, GF_LOG_ERROR, 0,
+                   DHT_MSG_HASHED_SUBVOL_GET_FAILED,
+                   "cannot determine MDS, fetching xattr %s "
+                   " randomly from a subvol for gfid %s ",
+                   key, gfid);
+        } else {
+            /* TODO: need to handle this. As of now we are
+               choosing availability instead of choosing
+               consistency: in case mds_subvol is down,
+               wind a getxattr call on another subvol
+               and return the xattr
+            */
+            local->mds_subvol = mds_subvol;
+            for (i = 0; i < cnt; i++) {
+                if (conf->subvolumes[i] == mds_subvol) {
+                    if (!conf->subvolume_status[i]) {
+                        gf_msg(this->name, GF_LOG_WARNING, 0,
+                               DHT_MSG_HASHED_SUBVOL_DOWN,
+                               "MDS subvolume %s is down"
+                               " for gfid %s so fetching xattr "
+                               " %s randomly from a subvol ",
+                               local->mds_subvol->name, gfid, key);
+                        ret = 1;
+                    }
+                }
+            }
+        }
 
+        if (!ret && key && local->mds_subvol && dht_match_xattr(key)) {
+            STACK_WIND(frame, dht_mds_getxattr_cbk, local->mds_subvol,
+                       local->mds_subvol->fops->fgetxattr, fd, key, NULL);
 
-        VALIDATE_OR_GOTO (frame, err);
-        VALIDATE_OR_GOTO (this, err);
-        VALIDATE_OR_GOTO (loc, err);
-        VALIDATE_OR_GOTO (loc->inode, err);
-        VALIDATE_OR_GOTO (loc->path, err);
+            return 0;
+        }
 
-	subvol = dht_subvol_get_cached (this, loc->inode);
-	if (!subvol) {
-		gf_log (this->name, GF_LOG_ERROR,
-			"no cached subvolume for path=%s", loc->path);
-		op_errno = EINVAL;
-		goto err;
-	}
+    } else {
+        cnt = local->call_cnt = 1;
+    }
 
-	local = dht_local_init (frame);
-	if (!local) {
-		op_errno = ENOMEM;
-		gf_log (this->name, GF_LOG_ERROR,
-			"memory allocation failed :(");
-		goto err;
-	}
+    for (i = 0; i < cnt; i++) {
+        subvol = layout->list[i].xlator;
+        STACK_WIND(frame, dht_getxattr_cbk, subvol, subvol->fops->fgetxattr, fd,
+                   key, NULL);
+    }
+    return 0;
 
-	local->call_cnt = 1;
+err:
+    op_errno = (op_errno == -1) ? errno : op_errno;
+    DHT_STACK_UNWIND(fgetxattr, frame, -1, op_errno, NULL, NULL);
 
-	STACK_WIND (frame, dht_err_cbk,
-		    subvol, subvol->fops->removexattr,
-		    loc, key);
+    return 0;
+}
 
-	return 0;
+static int
+dht_setxattr2(xlator_t *this, xlator_t *subvol, call_frame_t *frame, int ret)
+{
+    dht_local_t *local = NULL;
+    int op_errno = EINVAL;
+
+    if (!frame || !frame->local)
+        goto err;
+
+    local = frame->local;
+    op_errno = local->op_errno;
+
+    if (we_are_not_migrating(ret)) {
+        /* This dht xlator is not migrating the file. Unwind and
+         * pass on the original op_ret/op_errno and xdata so the
+         * higher DHT layer can handle this.
+         */
+        DHT_STACK_UNWIND(setxattr, frame, local->op_ret, local->op_errno,
+                         local->rebalance.xdata);
+        return 0;
+    }
 
-err:
-	op_errno = (op_errno == -1) ? errno : op_errno;
-	DHT_STACK_UNWIND (frame, -1, op_errno, NULL);
+    if (subvol == NULL)
+        goto err;
 
-	return 0;
-}
+    local->call_cnt = 2; /* This is the second attempt */
+
+    if (local->fop == GF_FOP_SETXATTR) {
+        STACK_WIND_COOKIE(frame, dht_file_setxattr_cbk, subvol, subvol,
+                          subvol->fops->setxattr, &local->loc,
+                          local->rebalance.xattr, local->rebalance.flags,
+                          local->xattr_req);
+    } else {
+        STACK_WIND_COOKIE(frame, dht_file_setxattr_cbk, subvol, subvol,
+                          subvol->fops->fsetxattr, local->fd,
+                          local->rebalance.xattr, local->rebalance.flags,
+                          local->xattr_req);
+    }
 
+    return 0;
+
+err:
+    DHT_STACK_UNWIND(setxattr, frame, (local ? local->op_ret : -1), op_errno,
+                     NULL);
+    return 0;
+}
 
 int
-dht_fd_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
-	      int op_ret, int op_errno, fd_t *fd)
+dht_file_setxattr_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+                      int op_ret, int op_errno, dict_t *xdata)
 {
-	dht_local_t  *local = NULL;
-	int           this_call_cnt = 0;
-	call_frame_t *prev = NULL;
+    int ret = -1;
+    dht_local_t *local = NULL;
+    xlator_t *prev = NULL;
+    struct iatt *stbuf = NULL;
+    inode_t *inode = NULL;
+    xlator_t *subvol1 = NULL, *subvol2 = NULL;
+
+    local = frame->local;
+    prev = cookie;
+
+    local->op_errno = op_errno;
+
+    if ((local->fop == GF_FOP_FSETXATTR) &&
+        dht_check_remote_fd_failed_error(local, op_ret, op_errno)) {
+        ret = dht_check_and_open_fd_on_subvol(this, frame);
+        if (ret)
+            goto out;
+        return 0;
+    }
+
+    if ((op_ret == -1) && !dht_inode_missing(op_errno)) {
+        gf_msg_debug(this->name, op_errno, "subvolume %s returned -1.",
+                     prev->name);
+        goto out;
+    }
+
+    if (local->call_cnt != 1)
+        goto out;
+
+    ret = dict_get_bin(xdata, DHT_IATT_IN_XDATA_KEY, (void **)&stbuf);
+
+    if ((!op_ret) && !stbuf) {
+        goto out;
+    }
+
+    local->op_ret = op_ret;
+    local->rebalance.target_op_fn = dht_setxattr2;
+    if (xdata)
+        local->rebalance.xdata = dict_ref(xdata);
+
+    /* Phase 2 of migration */
+    if ((op_ret == -1) || IS_DHT_MIGRATION_PHASE2(stbuf)) {
+        ret = dht_rebalance_complete_check(this, frame);
+        if (!ret)
+            return 0;
+    }
+
+    /* Phase 1 of migration */
+    if (IS_DHT_MIGRATION_PHASE1(stbuf)) {
+        inode = (local->fd) ? local->fd->inode : local->loc.inode;
+
+        ret = dht_inode_ctx_get_mig_info(this, inode, &subvol1, &subvol2);
+        if (!dht_mig_info_is_invalid(local->cached_subvol, subvol1, subvol2)) {
+            dht_setxattr2(this, subvol2, frame, 0);
+            return 0;
+        }
+
+        ret = dht_rebalance_in_progress_check(this, frame);
+        if (!ret)
+            return 0;
+    }
 
+out:
 
-	local = frame->local;
-	prev = cookie;
+    if (local->fop == GF_FOP_SETXATTR) {
+        DHT_STACK_UNWIND(setxattr, frame, op_ret, op_errno, xdata);
+    } else {
+        DHT_STACK_UNWIND(fsetxattr, frame, op_ret, op_errno, xdata);
+    }
 
-	LOCK (&frame->lock);
-	{
-		if (op_ret == -1) {
-			local->op_errno = op_errno;
-			gf_log (this->name, GF_LOG_ERROR,
-				"subvolume %s returned -1 (%s)",
-				prev->this->name, strerror (op_errno));
-			goto unlock;
-		}
+    return 0;
+}
 
-		local->op_ret = 0;
-	}
-unlock:
-	UNLOCK (&frame->lock);
+/* Callback invoked by dict_foreach_fnmatch for each key that matches
+   the "user*" pattern; sets the caller's boolean flag to true
+*/
+static int
+dht_is_user_xattr(dict_t *this, char *key, data_t *value, void *data)
+{
+    gf_boolean_t *user_xattr_found = data;
+    *user_xattr_found = _gf_true;
+    return 0;
+}
+
+/* Common code to wind a (f)(set|remove)xattr call to set xattr on directory
+ */
+static int
+dht_dir_common_set_remove_xattr(call_frame_t *frame, xlator_t *this, loc_t *loc,
+                                fd_t *fd, dict_t *xattr, int flags,
+                                dict_t *xdata, int *op_errno)
 
-	this_call_cnt = dht_frame_return (frame);
-	if (is_last_call (this_call_cnt))
-		DHT_STACK_UNWIND (frame, local->op_ret, local->op_errno,
-				  local->fd);
+{
+    dict_t *xattrop = NULL;
+    int32_t subone[1] = {-1};
+    gf_boolean_t uxattr_key_found = _gf_false;
+    xlator_t *mds_subvol = NULL;
+    xlator_t *travvol = NULL;
+    dht_conf_t *conf = NULL;
+    int ret = -1;
+    int i = 0;
+    int call_cnt = 0;
+    dht_local_t *local = NULL;
+    char gfid_local[GF_UUID_BUF_SIZE] = {0};
+    char **xattrs_to_heal;
+
+    conf = this->private;
+    local = frame->local;
+    call_cnt = conf->subvolume_cnt;
+    local->flags = flags;
+    xattrs_to_heal = get_xattrs_to_heal();
+
+    if (!gf_uuid_is_null(local->gfid)) {
+        gf_uuid_unparse(local->gfid, gfid_local);
+    }
+
+    if ((local->fop == GF_FOP_SETXATTR) || (local->fop == GF_FOP_FSETXATTR)) {
+        /* Check if any user xattr present in xattr
+         */
+        dict_foreach_fnmatch(xattr, "user*", dht_is_user_xattr,
+                             &uxattr_key_found);
+
+        /* Check whether any custom key xattr is present in the xattr
+           dict; start the index from 1 because the user xattr was
+           already checked in the previous statement
+        */
+        for (i = 1; xattrs_to_heal[i]; i++)
+            if (dict_get(xattr, xattrs_to_heal[i]))
+                uxattr_key_found = _gf_true;
+    }
+
+    if ((local->fop == GF_FOP_REMOVEXATTR) ||
+        (local->fop == GF_FOP_FREMOVEXATTR)) {
+        /* Check if any custom key xattr present in local->key
+         */
+        for (i = 0; xattrs_to_heal[i]; i++)
+            if (strstr(local->key, xattrs_to_heal[i]))
+                uxattr_key_found = _gf_true;
+    }
+
+    /* If there is no custom key xattr present, or the gfid is root, or
+       call_cnt is 1, then wind the (f)(set|remove)xattr call on all subvols
+    */
+    if (!uxattr_key_found || __is_root_gfid(local->gfid) || call_cnt == 1) {
+        for (i = 0; i < conf->subvolume_cnt; i++) {
+            travvol = conf->subvolumes[i];
+            if ((local->fop == GF_FOP_SETXATTR) ||
+                (local->fop == GF_FOP_FSETXATTR)) {
+                if (fd) {
+                    STACK_WIND_COOKIE(frame, dht_err_cbk, travvol, travvol,
+                                      travvol->fops->fsetxattr, fd, xattr,
+                                      flags, xdata);
+                } else {
+                    STACK_WIND_COOKIE(frame, dht_err_cbk, travvol, travvol,
+                                      travvol->fops->setxattr, loc, xattr,
+                                      flags, xdata);
+                }
+            }
+
+            if ((local->fop == GF_FOP_REMOVEXATTR) ||
+                (local->fop == GF_FOP_FREMOVEXATTR)) {
+                if (fd) {
+                    STACK_WIND_COOKIE(frame, dht_err_cbk, travvol, travvol,
+                                      travvol->fops->fremovexattr, fd,
+                                      local->key, local->xattr_req);
+                } else {
+                    STACK_WIND_COOKIE(frame, dht_err_cbk, travvol, travvol,
+                                      travvol->fops->removexattr, loc,
+                                      local->key, local->xattr_req);
+                }
+            }
+        }
 
         return 0;
-}
+    }
+
+    /* Fetch the MDS subvol for the inode (fd->inode or loc->inode)
+     */
+    if (fd) {
+        ret = dht_inode_ctx_mdsvol_get(fd->inode, this, &mds_subvol);
+    } else {
+        ret = dht_inode_ctx_mdsvol_get(loc->inode, this, &mds_subvol);
+    }
+    if (ret || !mds_subvol) {
+        if (fd) {
+            gf_msg(this->name, GF_LOG_ERROR, 0,
+                   DHT_MSG_HASHED_SUBVOL_GET_FAILED,
+                   "Failed to get mds subvol for fd %p"
+                   "gfid is %s ",
+                   fd, gfid_local);
+        } else {
+            gf_msg(this->name, GF_LOG_ERROR, 0,
+                   DHT_MSG_HASHED_SUBVOL_GET_FAILED,
+                   "%s: Failed to get mds subvol. (gfid is %s)", loc->path,
+                   gfid_local);
+        }
+        (*op_errno) = ENOENT;
+        goto err;
+    }
+
+    local->mds_subvol = mds_subvol;
+
+    for (i = 0; i < conf->subvolume_cnt; i++) {
+        if (conf->subvolumes[i] == mds_subvol) {
+            if (!conf->subvolume_status[i]) {
+                gf_msg(this->name, GF_LOG_WARNING, 0,
+                       DHT_MSG_HASHED_SUBVOL_DOWN,
+                       "MDS subvol is down for path "
+                       " %s gfid is %s Unable to set xattr ",
+                       local->loc.path, gfid_local);
+                (*op_errno) = ENOTCONN;
+                goto err;
+            }
+        }
+    }
+
+    if (uxattr_key_found) {
+        xattrop = dict_new();
+        if (!xattrop) {
+            gf_msg(this->name, GF_LOG_ERROR, DHT_MSG_NO_MEMORY, 0,
+                   "dictionary creation failed for path %s "
+                   "for gfid is %s ",
+                   local->loc.path, gfid_local);
+            (*op_errno) = ENOMEM;
+            goto err;
+        }
+        local->xattr = dict_ref(xattr);
+        /* Add -1 to the current MDS xattr value. The value of the MDS
+           xattr represents the no. of times xattr modification failed
+           on non-MDS subvols.
+        */
+        ret = dht_dict_set_array(xattrop, conf->mds_xattr_key, subone, 1);
+        if (ret != 0) {
+            gf_msg(this->name, GF_LOG_ERROR, 0, DHT_MSG_DICT_SET_FAILED,
+                   "dictionary set array failed for path %s "
+                   "for gfid is %s ",
+                   local->loc.path, gfid_local);
+            if (xattrop)
+                dict_unref(xattrop);
+            (*op_errno) = ret;
+            goto err;
+        }
+        /* Wind an xattrop call to use a ref-counting approach:
+           update the mds xattr by -1 before updating the xattr on
+           the hashed subvol, and update the mds xattr by +1 after
+           updating the xattr on all non-hashed subvols
+        */
+        if (fd) {
+            STACK_WIND(frame, dht_xattrop_mds_cbk, local->mds_subvol,
+                       local->mds_subvol->fops->fxattrop, fd,
+                       GF_XATTROP_ADD_ARRAY, xattrop, NULL);
+        } else {
+            STACK_WIND(frame, dht_xattrop_mds_cbk, local->mds_subvol,
+                       local->mds_subvol->fops->xattrop, loc,
+                       GF_XATTROP_ADD_ARRAY, xattrop, NULL);
+        }
+        if (xattrop)
+            dict_unref(xattrop);
+    }
 
+    return 0;
+err:
+    return -1;
+}
 
 int
-dht_open (call_frame_t *frame, xlator_t *this,
-	  loc_t *loc, int flags, fd_t *fd)
-{
-	xlator_t     *subvol = NULL;
-	int           ret = -1;
-        int           op_errno = -1;
-	dht_local_t  *local = NULL;
-
-
-        VALIDATE_OR_GOTO (frame, err);
-        VALIDATE_OR_GOTO (this, err);
-        VALIDATE_OR_GOTO (fd, err);
-
-	subvol = dht_subvol_get_cached (this, fd->inode);
-	if (!subvol) {
-		gf_log (this->name, GF_LOG_ERROR,
-			"no cached subvolume for fd=%p", fd);
-		op_errno = EINVAL;
-		goto err;
-	}
-
-	local = dht_local_init (frame);
-	if (!local) {
-		op_errno = ENOMEM;
-		gf_log (this->name, GF_LOG_ERROR,
-			"memory allocation failed :(");
-		goto err;
-	}
-
-	local->fd = fd_ref (fd);
-	ret = loc_dup (loc, &local->loc);
-	if (ret == -1) {
-		op_errno = ENOMEM;
-		gf_log (this->name, GF_LOG_ERROR,
-			"memory allocation failed :(");
-		goto err;
-	}
-
-	local->call_cnt = 1;
-
-	STACK_WIND (frame, dht_fd_cbk,
-		    subvol, subvol->fops->open,
-		    loc, flags, fd);
-
-	return 0;
+dht_fsetxattr(call_frame_t *frame, xlator_t *this, fd_t *fd, dict_t *xattr,
+              int flags, dict_t *xdata)
+{
+    xlator_t *subvol = NULL;
+    dht_local_t *local = NULL;
+    int op_errno = EINVAL;
+    dht_conf_t *conf = NULL;
+    dht_layout_t *layout = NULL;
+    int ret = -1;
+    int call_cnt = 0;
+
+    VALIDATE_OR_GOTO(frame, err);
+    VALIDATE_OR_GOTO(this, err);
+    VALIDATE_OR_GOTO(fd, err);
+    VALIDATE_OR_GOTO(fd->inode, err);
+    VALIDATE_OR_GOTO(this->private, err);
+
+    conf = this->private;
+
+    if (!conf->defrag)
+        GF_IF_INTERNAL_XATTR_GOTO(conf->wild_xattr_name, xattr, op_errno, err);
+
+    local = dht_local_init(frame, NULL, fd, GF_FOP_FSETXATTR);
+    if (!local) {
+        op_errno = ENOMEM;
+        goto err;
+    }
+
+    subvol = local->cached_subvol;
+    if (!subvol) {
+        gf_msg_debug(this->name, 0, "no cached subvolume for fd=%p", fd);
+        op_errno = EINVAL;
+        goto err;
+    }
+
+    layout = local->layout;
+    if (!layout) {
+        gf_msg_debug(this->name, 0, "no layout for fd=%p", fd);
+        op_errno = EINVAL;
+        goto err;
+    }
+
+    local->xattr_req = xdata ? dict_ref(xdata) : dict_new();
+    local->call_cnt = call_cnt = layout->cnt;
+
+    if (IA_ISDIR(fd->inode->ia_type)) {
+        local->hashed_subvol = NULL;
+        ret = dht_dir_common_set_remove_xattr(frame, this, NULL, fd, xattr,
+                                              flags, xdata, &op_errno);
+        if (ret)
+            goto err;
+    } else {
+        local->call_cnt = 1;
+        local->rebalance.xattr = dict_ref(xattr);
+        local->rebalance.flags = flags;
+
+        ret = dict_set_int8(local->xattr_req, DHT_IATT_IN_XDATA_KEY, 1);
+        if (ret) {
+            gf_msg_debug(this->name, 0,
+                         "Failed to set dictionary key %s for fd=%p",
+                         DHT_IATT_IN_XDATA_KEY, fd);
+        }
+
+        STACK_WIND_COOKIE(frame, dht_file_setxattr_cbk, subvol, subvol,
+                          subvol->fops->fsetxattr, fd, xattr, flags,
+                          local->xattr_req);
+    }
+    return 0;
 
 err:
-	op_errno = (op_errno == -1) ? errno : op_errno;
-	DHT_STACK_UNWIND (frame, -1, op_errno, NULL);
+    op_errno = (op_errno == -1) ? errno : op_errno;
+    DHT_STACK_UNWIND(fsetxattr, frame, -1, op_errno, NULL);
 
-	return 0;
+    return 0;
 }
 
+static int
+dht_checking_pathinfo_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+                          int op_ret, int op_errno, dict_t *xattr,
+                          dict_t *xdata)
+{
+    int i = -1;
+    int ret = -1;
+    char *value = NULL;
+    dht_local_t *local = NULL;
+    dht_conf_t *conf = NULL;
+    xlator_t *prev = NULL;
+    int this_call_cnt = 0;
+
+    local = frame->local;
+    prev = cookie;
+    conf = this->private;
+
+    if (op_ret == -1)
+        goto out;
+
+    ret = dict_get_str(xattr, GF_XATTR_PATHINFO_KEY, &value);
+    if (ret)
+        goto out;
+
+    if (!strcmp(value, local->key)) {
+        for (i = 0; i < conf->subvolume_cnt; i++) {
+            if (conf->subvolumes[i] == prev)
+                conf->decommissioned_bricks[i] = prev;
+        }
+    }
 
-int
-dht_readv_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
-	       int op_ret, int op_errno,
-	       struct iovec *vector, int count, struct stat *stbuf)
+out:
+    this_call_cnt = dht_frame_return(frame);
+    if (is_last_call(this_call_cnt)) {
+        DHT_STACK_UNWIND(setxattr, frame, local->op_ret, ENOTSUP, NULL);
+    }
+    return 0;
+}
+
+static int
+dht_nuke_dir_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+                 int32_t op_ret, int32_t op_errno, struct iatt *preparent,
+                 struct iatt *postparent, dict_t *xdata)
 {
-        DHT_STACK_UNWIND (frame, op_ret, op_errno, vector, count, stbuf);
+    STACK_UNWIND_STRICT(setxattr, frame, op_ret, op_errno, NULL);
+    return 0;
+}
 
+static int
+dht_nuke_dir(call_frame_t *frame, xlator_t *this, loc_t *loc, data_t *tmp)
+{
+    if (!IA_ISDIR(loc->inode->ia_type)) {
+        DHT_STACK_UNWIND(setxattr, frame, -1, ENOTSUP, NULL);
         return 0;
-}
+    }
+
+    /* Setxattr didn't need the parent, but rmdir does. */
+    loc->parent = inode_parent(loc->inode, NULL, NULL);
+    if (!loc->parent) {
+        DHT_STACK_UNWIND(setxattr, frame, -1, ENOENT, NULL);
+        return 0;
+    }
+    gf_uuid_copy(loc->pargfid, loc->parent->gfid);
 
+    if (!loc->name && loc->path) {
+        loc->name = strrchr(loc->path, '/');
+        if (loc->name) {
+            ++(loc->name);
+        }
+    }
+
+    /*
+     * We do this instead of calling dht_rmdir_do directly for two reasons.
+     * The first is that we want to reuse all of the initialization that
+     * dht_rmdir does, so if it ever changes we'll just follow along.  The
+     * second (i.e. why we don't use STACK_WIND_TAIL) is so that we don't
+     * obscure the fact that we came in via this path instead of a genuine
+     * rmdir.  That makes debugging just a tiny bit easier.
+     */
+    STACK_WIND(frame, dht_nuke_dir_cbk, this, this->fops->rmdir, loc, 1, NULL);
+
+    return 0;
+}
 
 int
-dht_readv (call_frame_t *frame, xlator_t *this,
-	   fd_t *fd, size_t size, off_t off)
+dht_setxattr(call_frame_t *frame, xlator_t *this, loc_t *loc, dict_t *xattr,
+             int flags, dict_t *xdata)
 {
-	xlator_t     *subvol = NULL;
-        int           op_errno = -1;
+    xlator_t *subvol = NULL;
+    dht_local_t *local = NULL;
+    dht_conf_t *conf = NULL;
+    dht_methods_t *methods = NULL;
+    dht_layout_t *layout = NULL;
+    int i = 0;
+    int op_errno = EINVAL;
+    int ret = -1;
+    data_t *tmp = NULL;
+    uint32_t dir_spread = 0;
+    char value[4096] = {
+        0,
+    };
+    gf_dht_migrate_data_type_t forced_rebalance = GF_DHT_MIGRATE_DATA;
+    int call_cnt = 0;
+    uint32_t new_hash = 0;
+
+    VALIDATE_OR_GOTO(frame, err);
+    VALIDATE_OR_GOTO(this, err);
+    VALIDATE_OR_GOTO(loc, err);
+    VALIDATE_OR_GOTO(loc->inode, err);
+
+    conf = this->private;
+    GF_VALIDATE_OR_GOTO(this->name, conf, err);
+
+    methods = &(conf->methods);
+
+    /* Rebalance daemon is allowed to set internal keys */
+    if (!conf->defrag)
+        GF_IF_INTERNAL_XATTR_GOTO(conf->wild_xattr_name, xattr, op_errno, err);
+
+    local = dht_local_init(frame, loc, NULL, GF_FOP_SETXATTR);
+    if (!local) {
+        op_errno = ENOMEM;
+        goto err;
+    }
+
+    subvol = local->cached_subvol;
+    if (!subvol) {
+        gf_msg_debug(this->name, 0, "no cached subvolume for path=%s",
+                     loc->path);
+        op_errno = EINVAL;
+        goto err;
+    }
+
+    layout = local->layout;
+    if (!layout) {
+        gf_msg_debug(this->name, 0, "no layout for path=%s", loc->path);
+        op_errno = EINVAL;
+        goto err;
+    }
+
+    local->call_cnt = call_cnt = layout->cnt;
+    tmp = dict_get(xattr, conf->mds_xattr_key);
+    if (tmp) {
+        op_errno = ENOTSUP;
+        goto err;
+    }
+
+    tmp = dict_get(xattr, GF_XATTR_FILE_MIGRATE_KEY);
+    if (tmp) {
+        if (IA_ISDIR(loc->inode->ia_type)) {
+            op_errno = ENOTSUP;
+            goto err;
+        }
 
+        /* TODO: need to interpret the 'value' for more meaning
+           (ie, 'target' subvolume given there, etc) */
+        memcpy(value, tmp->data, tmp->len);
+        if (strcmp(value, "force") == 0)
+            forced_rebalance = GF_DHT_MIGRATE_DATA_EVEN_IF_LINK_EXISTS;
 
-        VALIDATE_OR_GOTO (frame, err);
-        VALIDATE_OR_GOTO (this, err);
-        VALIDATE_OR_GOTO (fd, err);
+        if (conf->decommission_in_progress)
+            forced_rebalance = GF_DHT_MIGRATE_HARDLINK;
 
-	subvol = dht_subvol_get_cached (this, fd->inode);
-	if (!subvol) {
-		gf_log (this->name, GF_LOG_ERROR,
-			"no cached subvolume for fd=%p", fd);
-		op_errno = EINVAL;
-		goto err;
-	}
+        if (!loc->path) {
+            op_errno = EINVAL;
+            goto err;
+        }
 
-	STACK_WIND (frame, dht_readv_cbk,
-		    subvol, subvol->fops->readv,
-		    fd, size, off);
+        if (!local->loc.name)
+            local->loc.name = strrchr(local->loc.path, '/') + 1;
 
-	return 0;
+        if (!local->loc.parent)
+            local->loc.parent = inode_parent(local->loc.inode, NULL, NULL);
 
-err:
-	op_errno = (op_errno == -1) ? errno : op_errno;
-	DHT_STACK_UNWIND (frame, -1, op_errno, NULL, 0, NULL);
+        if ((!local->loc.name) || (!local->loc.parent)) {
+            op_errno = EINVAL;
+            goto err;
+        }
 
-	return 0;
-}
+        if (gf_uuid_is_null(local->loc.pargfid))
+            gf_uuid_copy(local->loc.pargfid, local->loc.parent->gfid);
 
+        methods->migration_get_dst_subvol(this, local);
 
-int
-dht_writev_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
-		int op_ret, int op_errno, struct stat *stbuf)
-{
-        DHT_STACK_UNWIND (frame, op_ret, op_errno, stbuf);
+        if (!local->rebalance.target_node) {
+            gf_msg(this->name, GF_LOG_ERROR, 0,
+                   DHT_MSG_HASHED_SUBVOL_GET_FAILED,
+                   "Failed to get hashed subvol for %s", loc->path);
+            op_errno = EINVAL;
+            goto err;
+        }
 
-        return 0;
-}
+        local->rebalance.from_subvol = local->cached_subvol;
 
+        if (local->rebalance.target_node == local->rebalance.from_subvol) {
+            op_errno = EEXIST;
+            goto err;
+        }
+        if (local->rebalance.target_node) {
+            local->flags = forced_rebalance;
 
-int
-dht_writev (call_frame_t *frame, xlator_t *this,
-	    fd_t *fd, struct iovec *vector, int count, off_t off)
-{
-	xlator_t     *subvol = NULL;
-        int           op_errno = -1;
+            frame->root->pid = GF_CLIENT_PID_DEFRAG;
 
+            ret = dht_start_rebalance_task(this, frame);
+            if (!ret)
+                return 0;
 
-        VALIDATE_OR_GOTO (frame, err);
-        VALIDATE_OR_GOTO (this, err);
-        VALIDATE_OR_GOTO (fd, err);
+            gf_msg(this->name, GF_LOG_ERROR, 0, DHT_MSG_REBALANCE_START_FAILED,
+                   "%s: failed to create a new rebalance synctask", loc->path);
+        }
+        op_errno = EINVAL;
+        goto err;
+    }
+
+    tmp = dict_get(xattr, "decommission-brick");
+    if (tmp) {
+        /* This operation should happen only on '/' */
+        if (!__is_root_gfid(loc->inode->gfid)) {
+            op_errno = ENOTSUP;
+            goto err;
+        }
+
+        memcpy(value, tmp->data, min(tmp->len, 4095));
+        local->key = gf_strdup(value);
+        local->call_cnt = conf->subvolume_cnt;
 
-	subvol = dht_subvol_get_cached (this, fd->inode);
-	if (!subvol) {
-		gf_log (this->name, GF_LOG_ERROR,
-			"no cached subvolume for fd=%p", fd);
-		op_errno = EINVAL;
-		goto err;
-	}
+        for (i = 0; i < conf->subvolume_cnt; i++) {
+            /* Get the pathinfo, and then compare */
+            STACK_WIND_COOKIE(frame, dht_checking_pathinfo_cbk,
+                              conf->subvolumes[i], conf->subvolumes[i],
+                              conf->subvolumes[i]->fops->getxattr, loc,
+                              GF_XATTR_PATHINFO_KEY, NULL);
+        }
+        return 0;
+    }
+
+    tmp = dict_get(xattr, GF_XATTR_FIX_LAYOUT_KEY);
+    if (tmp) {
+        ret = dict_get_uint32(xattr, "new-commit-hash", &new_hash);
+        if (ret == 0) {
+            gf_msg_debug(this->name, 0,
+                         "updating commit hash for %s from %u to %u",
+                         uuid_utoa(loc->gfid), layout->commit_hash, new_hash);
+            layout->commit_hash = new_hash;
+
+            ret = dht_update_commit_hash_for_layout(frame);
+            if (ret) {
+                op_errno = ENOTCONN;
+                goto err;
+            }
+            return ret;
+        }
 
-	STACK_WIND (frame, dht_writev_cbk,
-		    subvol, subvol->fops->writev,
-		    fd, vector, count, off);
+        gf_msg(this->name, GF_LOG_INFO, 0, DHT_MSG_FIX_LAYOUT_INFO,
+               "fixing the layout of %s", loc->path);
 
-	return 0;
+        ret = dht_fix_directory_layout(frame, dht_fix_layout_setxattr_cbk,
+                                       layout);
+        if (ret) {
+            op_errno = ENOTCONN;
+            goto err;
+        }
+        return ret;
+    }
+
+    tmp = dict_get(xattr, "distribute.directory-spread-count");
+    if (tmp) {
+        /* Setxattr value is packed as 'binary', not string */
+        memcpy(value, tmp->data, min(tmp->len, 4095));
+        ret = gf_string2uint32(value, &dir_spread);
+        if (!ret && ((dir_spread <= conf->subvolume_cnt) && (dir_spread > 0))) {
+            layout->spread_cnt = dir_spread;
+
+            ret = dht_fix_directory_layout(frame, dht_common_setxattr_cbk,
+                                           layout);
+            if (ret) {
+                op_errno = ENOTCONN;
+                goto err;
+            }
+            return ret;
+        }
+        gf_msg(this->name, GF_LOG_ERROR, 0, DHT_MSG_OPERATION_NOT_SUP,
+               "wrong 'directory-spread-count' value (%s)", value);
+        op_errno = ENOTSUP;
+        goto err;
+    }
+
+    tmp = dict_get(xattr, "glusterfs.dht.nuke");
+    if (tmp) {
+        return dht_nuke_dir(frame, this, loc, tmp);
+    }
+    local->xattr_req = xdata ? dict_ref(xdata) : dict_new();
+
+    if (IA_ISDIR(loc->inode->ia_type)) {
+        local->hashed_subvol = NULL;
+        ret = dht_dir_common_set_remove_xattr(frame, this, loc, NULL, xattr,
+                                              flags, xdata, &op_errno);
+        if (ret)
+            goto err;
+    } else {
+        local->rebalance.xattr = dict_ref(xattr);
+        local->rebalance.flags = flags;
+        local->call_cnt = 1;
+
+        ret = dict_set_int8(local->xattr_req, DHT_IATT_IN_XDATA_KEY, 1);
+
+        STACK_WIND_COOKIE(frame, dht_file_setxattr_cbk, subvol, subvol,
+                          subvol->fops->setxattr, loc, xattr, flags,
+                          local->xattr_req);
+    }
+
+    return 0;
 
 err:
-	op_errno = (op_errno == -1) ? errno : op_errno;
-	DHT_STACK_UNWIND (frame, -1, op_errno, NULL, 0);
+    op_errno = (op_errno == -1) ? errno : op_errno;
+    DHT_STACK_UNWIND(setxattr, frame, -1, op_errno, NULL);
 
-	return 0;
+    return 0;
 }
 
+static int
+dht_removexattr2(xlator_t *this, xlator_t *subvol, call_frame_t *frame, int ret)
+{
+    dht_local_t *local = NULL;
+    int op_errno = EINVAL;
+
+    if (!frame || !frame->local)
+        goto err;
+
+    local = frame->local;
+    op_errno = local->op_errno;
+
+    local->call_cnt = 2; /* This is the second attempt */
+
+    if (we_are_not_migrating(ret)) {
+        /* This dht xlator is not migrating the file. Unwind and
+         * pass on the original op_ret/op_errno and xdata so the
+         * higher DHT layer can handle this.
+         */
+        DHT_STACK_UNWIND(removexattr, frame, local->op_ret, local->op_errno,
+                         local->rebalance.xdata);
+        return 0;
+    }
+
+    if (subvol == NULL)
+        goto err;
+
+    if (local->fop == GF_FOP_REMOVEXATTR) {
+        STACK_WIND_COOKIE(frame, dht_file_removexattr_cbk, subvol, subvol,
+                          subvol->fops->removexattr, &local->loc, local->key,
+                          local->xattr_req);
+    } else {
+        STACK_WIND_COOKIE(frame, dht_file_removexattr_cbk, subvol, subvol,
+                          subvol->fops->fremovexattr, local->fd, local->key,
+                          local->xattr_req);
+    }
+
+    return 0;
+
+err:
+    DHT_STACK_UNWIND(removexattr, frame, -1, op_errno, NULL);
+    return 0;
+}
 
 int
-dht_flush (call_frame_t *frame, xlator_t *this, fd_t *fd)
+dht_file_removexattr_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+                         int op_ret, int op_errno, dict_t *xdata) /* cbk for file (f)removexattr winds */
 {
-	xlator_t     *subvol = NULL;
-        int           op_errno = -1;
-	dht_local_t  *local = NULL;
+    int ret = -1;
+    dht_local_t *local = NULL;
+    xlator_t *prev = NULL;
+    struct iatt *stbuf = NULL;
+    inode_t *inode = NULL;
+    xlator_t *subvol1 = NULL, *subvol2 = NULL;
+
+    local = frame->local;
+    prev = cookie; /* the winds pass the target subvol as the cookie */
+
+    local->op_errno = op_errno;
+
+    if ((local->fop == GF_FOP_FREMOVEXATTR) &&
+        dht_check_remote_fd_failed_error(local, op_ret, op_errno)) {
+        ret = dht_check_and_open_fd_on_subvol(this, frame);
+        if (ret)
+            goto out;
+        return 0;
+    }
 
+    if ((op_ret == -1) && !dht_inode_missing(op_errno)) {
+        gf_msg_debug(this->name, op_errno, "subvolume %s returned -1",
+                     prev->name);
+        goto out;
+    }
 
-        VALIDATE_OR_GOTO (frame, err);
-        VALIDATE_OR_GOTO (this, err);
-        VALIDATE_OR_GOTO (fd, err);
+    if (local->call_cnt != 1) /* dht_removexattr2 sets 2: this was already the retry */
+        goto out;
 
-	subvol = dht_subvol_get_cached (this, fd->inode);
-	if (!subvol) {
-		gf_log (this->name, GF_LOG_ERROR,
-			"no cached subvolume for fd=%p", fd);
-		op_errno = EINVAL;
-		goto err;
-	}
+    ret = dict_get_bin(xdata, DHT_IATT_IN_XDATA_KEY, (void **)&stbuf);
 
-	local = dht_local_init (frame);
-	if (!local) {
-		op_errno = ENOMEM;
-		gf_log (this->name, GF_LOG_ERROR,
-			"memory allocation failed :(");
-		goto err;
-	}
+    if ((!op_ret) && !stbuf) { /* success and no iatt in xdata: nothing to check */
+        goto out;
+    }
 
-	local->fd = fd_ref (fd);
-	local->call_cnt = 1;
+    local->op_ret = 0;
 
-	STACK_WIND (frame, dht_err_cbk,
-		    subvol, subvol->fops->flush, fd);
+    local->rebalance.target_op_fn = dht_removexattr2;
+    if (xdata)
+        local->rebalance.xdata = dict_ref(xdata);
 
-	return 0;
+    /* Phase 2 of migration, or a failure that passed dht_inode_missing() above */
+    if ((op_ret == -1) || IS_DHT_MIGRATION_PHASE2(stbuf)) {
+        ret = dht_rebalance_complete_check(this, frame);
+        if (!ret)
+            return 0;
+    }
 
-err:
-	op_errno = (op_errno == -1) ? errno : op_errno;
-	DHT_STACK_UNWIND (frame, -1, op_errno);
+    /* Phase 1 of migration: retry on subvol2 directly, else run the check */
+    if (IS_DHT_MIGRATION_PHASE1(stbuf)) {
+        inode = (local->fd) ? local->fd->inode : local->loc.inode;
 
-	return 0;
-}
+        ret = dht_inode_ctx_get_mig_info(this, inode, &subvol1, &subvol2);
+        if (!dht_mig_info_is_invalid(local->cached_subvol, subvol1, subvol2)) {
+            dht_removexattr2(this, subvol2, frame, 0);
+            return 0;
+        }
 
+        ret = dht_rebalance_in_progress_check(this, frame);
+        if (!ret)
+            return 0;
+    }
+
+out: /* unwind with this reply's own op_ret/op_errno/xdata */
+    if (local->fop == GF_FOP_REMOVEXATTR) {
+        DHT_STACK_UNWIND(removexattr, frame, op_ret, op_errno, xdata);
+    } else {
+        DHT_STACK_UNWIND(fremovexattr, frame, op_ret, op_errno, xdata);
+    }
+    return 0;
+}
 
 int
-dht_fsync (call_frame_t *frame, xlator_t *this,
-	   fd_t *fd, int datasync)
+dht_removexattr(call_frame_t *frame, xlator_t *this, loc_t *loc,
+                const char *key, dict_t *xdata) /* removexattr fop entry point */
 {
-	xlator_t     *subvol = NULL;
-        int           op_errno = -1;
-	dht_local_t  *local = NULL;
+    xlator_t *subvol = NULL;
+    int op_errno = -1;
+    dht_local_t *local = NULL;
+    dht_layout_t *layout = NULL;
+    int call_cnt = 0;
+    dht_conf_t *conf = NULL;
+    int ret = 0;
+
+    VALIDATE_OR_GOTO(this, err);
+    VALIDATE_OR_GOTO(this->private, err);
+
+    conf = this->private;
+
+    GF_IF_NATIVE_XATTR_GOTO(conf->wild_xattr_name, key, op_errno, err);
+
+    VALIDATE_OR_GOTO(frame, err);
+    VALIDATE_OR_GOTO(loc, err);
+    VALIDATE_OR_GOTO(loc->inode, err);
+
+    local = dht_local_init(frame, loc, NULL, GF_FOP_REMOVEXATTR);
+    if (!local) {
+        op_errno = ENOMEM;
+        goto err;
+    }
+
+    subvol = local->cached_subvol;
+    if (!subvol) {
+        gf_msg_debug(this->name, 0, "no cached subvolume for path=%s",
+                     loc->path);
+        op_errno = EINVAL;
+        goto err;
+    }
+
+    layout = local->layout;
+    if (!local->layout) {
+        gf_msg_debug(this->name, 0, "no layout for path=%s", loc->path);
+        op_errno = EINVAL;
+        goto err;
+    }
+    local->xattr_req = (xdata) ? dict_ref(xdata) : dict_new();
+
+    local->call_cnt = call_cnt = layout->cnt;
+    local->key = gf_strdup(key); /* NOTE(review): return value is not checked */
+
+    if (key && (strncmp(key, conf->mds_xattr_key, strlen(key)) == 0)) {
+        op_errno = ENOTSUP; /* compares strlen(key) bytes: any prefix of mds_xattr_key matches */
+        goto err;
+    }
+
+    if (IA_ISDIR(loc->inode->ia_type)) { /* directories take the common set/remove path */
+        local->hashed_subvol = NULL;
+        ret = dht_dir_common_set_remove_xattr(frame, this, loc, NULL, NULL, 0,
+                                              local->xattr_req, &op_errno);
+        if (ret)
+            goto err;
+
+    } else {
+        local->call_cnt = 1; /* files: a single wind to the cached subvol */
+        ret = dict_set_int8(local->xattr_req, DHT_IATT_IN_XDATA_KEY, 1);
+        if (ret) { /* logged only; the wind below still happens */
+            gf_msg(this->name, GF_LOG_ERROR, ENOMEM, DHT_MSG_DICT_SET_FAILED,
+                   "Failed to "
+                   "set dictionary key %s for %s",
+                   DHT_IATT_IN_XDATA_KEY, loc->path);
+        }
+
+        STACK_WIND_COOKIE(frame, dht_file_removexattr_cbk, subvol, subvol,
+                          subvol->fops->removexattr, loc, key,
+                          local->xattr_req);
+    }
 
+    return 0;
 
-        VALIDATE_OR_GOTO (frame, err);
-        VALIDATE_OR_GOTO (this, err);
-        VALIDATE_OR_GOTO (fd, err);
+err:
+    op_errno = (op_errno == -1) ? errno : op_errno; /* -1 means no branch set an errno */
+    DHT_STACK_UNWIND(removexattr, frame, -1, op_errno, NULL);
 
-	subvol = dht_subvol_get_cached (this, fd->inode);
-	if (!subvol) {
-		gf_log (this->name, GF_LOG_ERROR,
-			"no cached subvolume for fd=%p", fd);
-		op_errno = EINVAL;
-		goto err;
-	}
+    return 0;
+}
 
-	local = dht_local_init (frame);
-	if (!local) {
-		op_errno = ENOMEM;
-		gf_log (this->name, GF_LOG_ERROR,
-			"memory allocatoin failed :(");
-		goto err;
-	}
-	local->call_cnt = 1;
+int
+dht_fremovexattr(call_frame_t *frame, xlator_t *this, fd_t *fd, const char *key,
+                 dict_t *xdata) /* fd-based counterpart of dht_removexattr */
+{
+    xlator_t *subvol = NULL;
+    int op_errno = -1;
+    dht_local_t *local = NULL;
+    dht_layout_t *layout = NULL;
+    int call_cnt = 0;
+    dht_conf_t *conf = 0;
+    int ret = 0;
+
+    VALIDATE_OR_GOTO(this, err);
+    VALIDATE_OR_GOTO(this->private, err);
+
+    conf = this->private;
+
+    GF_IF_NATIVE_XATTR_GOTO(conf->wild_xattr_name, key, op_errno, err);
+
+    VALIDATE_OR_GOTO(frame, err); /* NOTE(review): fd is dereferenced below but never validated */
+
+    local = dht_local_init(frame, NULL, fd, GF_FOP_FREMOVEXATTR);
+    if (!local) {
+        op_errno = ENOMEM;
+        goto err;
+    }
+
+    subvol = local->cached_subvol;
+    if (!subvol) {
+        gf_msg_debug(this->name, 0, "no cached subvolume for inode=%s",
+                     uuid_utoa(fd->inode->gfid));
+        op_errno = EINVAL;
+        goto err;
+    }
+
+    layout = local->layout;
+    if (!local->layout) {
+        gf_msg_debug(this->name, 0, "no layout for inode=%s",
+                     uuid_utoa(fd->inode->gfid));
+        op_errno = EINVAL;
+        goto err;
+    }
+    local->xattr_req = xdata ? dict_ref(xdata) : dict_new();
+
+    local->call_cnt = call_cnt = layout->cnt;
+    local->key = gf_strdup(key); /* NOTE(review): unchecked; no mds_xattr_key test as in dht_removexattr */
+
+    if (IA_ISDIR(fd->inode->ia_type)) { /* directories take the common set/remove path */
+        local->hashed_subvol = NULL;
+        ret = dht_dir_common_set_remove_xattr(frame, this, NULL, fd, NULL, 0,
+                                              local->xattr_req, &op_errno);
+        if (ret)
+            goto err;
+
+    } else {
+        local->call_cnt = 1; /* files: a single wind to the cached subvol */
+        ret = dict_set_int8(local->xattr_req, DHT_IATT_IN_XDATA_KEY, 1);
+        if (ret) { /* logged only; the wind below still happens */
+            gf_msg(this->name, GF_LOG_ERROR, ENOMEM, DHT_MSG_DICT_SET_FAILED,
+                   "Failed to "
+                   "set dictionary key %s for fd=%p",
+                   DHT_IATT_IN_XDATA_KEY, fd);
+        }
 
-	STACK_WIND (frame, dht_err_cbk,
-		    subvol, subvol->fops->fsync,
-		    fd, datasync);
+        STACK_WIND_COOKIE(frame, dht_file_removexattr_cbk, subvol, subvol,
+                          subvol->fops->fremovexattr, fd, key,
+                          local->xattr_req);
+    }
 
-	return 0;
+    return 0;
 
 err:
-	op_errno = (op_errno == -1) ? errno : op_errno;
-	DHT_STACK_UNWIND (frame, -1, op_errno);
+    op_errno = (op_errno == -1) ? errno : op_errno; /* -1 means no branch set an errno */
+    DHT_STACK_UNWIND(fremovexattr, frame, -1, op_errno, NULL);
 
-	return 0;
+    return 0;
 }
 
-
 int
-dht_lk_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
-	    int op_ret, int op_errno, struct flock *flock)
+dht_fd_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int op_ret,
+           int op_errno, fd_t *fd, dict_t *xdata) /* cbk used by dht_opendir's winds */
 {
-        DHT_STACK_UNWIND (frame, op_ret, op_errno, flock);
+    dht_local_t *local = NULL;
+    int this_call_cnt = 0;
+    xlator_t *prev = NULL;
 
-        return 0;
+    local = frame->local;
+    prev = cookie; /* subvol this reply came from (passed as the wind cookie) */
+
+    LOCK(&frame->lock);
+    {
+        if (op_ret == -1) {
+            local->op_errno = op_errno;
+            UNLOCK(&frame->lock); /* drop the lock before logging */
+            gf_msg_debug(this->name, op_errno, "subvolume %s returned -1",
+                         prev->name);
+            goto post_unlock;
+        }
+
+        local->op_ret = 0; /* a later failed reply does not reset this here */
+    }
+    UNLOCK(&frame->lock);
+post_unlock:
+    this_call_cnt = dht_frame_return(frame);
+    if (is_last_call(this_call_cnt)) /* unwinds with local->fd, not the fd parameter */
+        DHT_STACK_UNWIND(open, frame, local->op_ret, local->op_errno, local->fd,
+                         NULL);
+
+    return 0;
+}
+
+/*
+ * dht_normalize_stats - set buf's f_bsize/f_frsize and rescale its block counts.
+ */
+static void
+dht_normalize_stats(struct statvfs *buf, unsigned long bsize,
+                    unsigned long frsize)
+{
+    double factor = 0;
+
+    if (buf->f_bsize != bsize) {
+        buf->f_bsize = bsize; /* only the field changes; no count is rescaled for it */
+    }
+
+    if (buf->f_frsize != frsize) {
+        factor = ((double)buf->f_frsize) / frsize; /* old fragment size / new one */
+        buf->f_frsize = frsize;
+        buf->f_blocks = (fsblkcnt_t)(factor * buf->f_blocks);
+        buf->f_bfree = (fsblkcnt_t)(factor * buf->f_bfree);
+        buf->f_bavail = (fsblkcnt_t)(factor * buf->f_bavail); /* casts truncate */
+    }
 }
 
+static int
+dht_statfs_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int op_ret,
+               int op_errno, struct statvfs *statvfs, dict_t *xdata) /* folds one subvol's statfs reply into local->statvfs */
+{
+    gf_boolean_t event = _gf_false;
+    qdstatfs_action_t action = qdstatfs_action_OFF;
+    dht_local_t *local = NULL;
+    int this_call_cnt = 0;
+    int bsize = 0;
+    int frsize = 0;
+    GF_UNUSED int ret = 0;
+    unsigned long new_usage = 0;
+    unsigned long cur_usage = 0;
+
+    local = frame->local;
+    GF_ASSERT(local);
+
+    if (xdata) /* NOTE(review): stores an int8 through a cast gf_boolean_t pointer - confirm sizes */
+        ret = dict_get_int8(xdata, "quota-deem-statfs", (int8_t *)&event);
+
+    LOCK(&frame->lock);
+    {
+        if (op_ret == -1) {
+            local->op_errno = op_errno;
+            goto unlock;
+        }
+        if (!statvfs) {
+            op_errno = EINVAL; /* NOTE(review): sets the parameter; the unwind uses local->op_errno */
+            local->op_ret = -1;
+            goto unlock;
+        }
+        local->op_ret = 0;
+
+        if (local->quota_deem_statfs) { /* presumably set by an earlier reply (REPLACE below) */
+            if (event == _gf_true) {
+                action = qdstatfs_action_COMPARE;
+            } else {
+                action = qdstatfs_action_NEGLECT;
+            }
+        } else {
+            if (event == _gf_true) {
+                action = qdstatfs_action_REPLACE;
+                local->quota_deem_statfs = _gf_true;
+            }
+        }
+
+        if (local->quota_deem_statfs) {
+            switch (action) {
+                case qdstatfs_action_NEGLECT:
+                    goto unlock; /* this reply lacks the quota key: ignore it */
+
+                case qdstatfs_action_REPLACE:
+                    local->statvfs = *statvfs; /* discards anything summed so far */
+                    goto unlock;
+
+                case qdstatfs_action_COMPARE:
+                    new_usage = statvfs->f_blocks - statvfs->f_bfree;
+                    cur_usage = local->statvfs.f_blocks -
+                                local->statvfs.f_bfree;
+
+                    /* Take the max of the usage from subvols */
+                    if (new_usage >= cur_usage)
+                        local->statvfs = *statvfs;
+                    goto unlock;
+
+                default:
+                    break;
+            }
+        }
+
+        if (local->statvfs.f_bsize != 0) { /* sizes already recorded from an earlier reply */
+            bsize = max(local->statvfs.f_bsize, statvfs->f_bsize);
+            frsize = max(local->statvfs.f_frsize, statvfs->f_frsize);
+            dht_normalize_stats(&local->statvfs, bsize, frsize);
+            dht_normalize_stats(statvfs, bsize, frsize);
+        } else {
+            local->statvfs.f_bsize = statvfs->f_bsize;
+            local->statvfs.f_frsize = statvfs->f_frsize;
+        }
+
+        local->statvfs.f_blocks += statvfs->f_blocks;
+        local->statvfs.f_bfree += statvfs->f_bfree;
+        local->statvfs.f_bavail += statvfs->f_bavail;
+        local->statvfs.f_files += statvfs->f_files;
+        local->statvfs.f_ffree += statvfs->f_ffree;
+        local->statvfs.f_favail += statvfs->f_favail;
+        local->statvfs.f_fsid = statvfs->f_fsid; /* fsid/flag/namemax: last reply wins */
+        local->statvfs.f_flag = statvfs->f_flag;
+        local->statvfs.f_namemax = statvfs->f_namemax;
+    }
+unlock:
+    UNLOCK(&frame->lock);
+
+    this_call_cnt = dht_frame_return(frame);
+    if (is_last_call(this_call_cnt)) /* xdata passed up is the one from this reply */
+        DHT_STACK_UNWIND(statfs, frame, local->op_ret, local->op_errno,
+                         &local->statvfs, xdata);
+
+    return 0;
+}
 
 int
-dht_lk (call_frame_t *frame, xlator_t *this,
-	fd_t *fd, int cmd, struct flock *flock)
+dht_statfs(call_frame_t *frame, xlator_t *this, loc_t *loc, dict_t *xdata) /* statfs fop: winds to every subvolume */
 {
-	xlator_t     *subvol = NULL;
-        int           op_errno = -1;
+    dht_local_t *local = NULL;
+    dht_conf_t *conf = NULL;
+    int op_errno = -1;
+    int i = -1;
+    inode_t *inode = NULL;
+    inode_table_t *itable = NULL;
+    static uuid_t root_gfid = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1};
+    loc_t newloc = {
+        0,
+    };
+
+    VALIDATE_OR_GOTO(frame, err);
+    VALIDATE_OR_GOTO(this, err);
+    VALIDATE_OR_GOTO(loc, err);
+    VALIDATE_OR_GOTO(this->private, err);
+
+    conf = this->private;
+
+    local = dht_local_init(frame, NULL, NULL, GF_FOP_STATFS);
+    if (!local) {
+        op_errno = ENOMEM;
+        goto err;
+    }
+
+    if (loc->inode && !IA_ISDIR(loc->inode->ia_type)) { /* non-directory: use a root loc instead */
+        itable = loc->inode->table;
+        if (!itable) {
+            op_errno = EINVAL;
+            goto err;
+        }
 
+        loc = &local->loc2; /* NOTE(review): overwritten by &newloc below before any use */
 
-        VALIDATE_OR_GOTO (frame, err);
-        VALIDATE_OR_GOTO (this, err);
-        VALIDATE_OR_GOTO (fd, err);
+        inode = inode_find(itable, root_gfid);
+        if (!inode) {
+            op_errno = EINVAL;
+            goto err;
+        }
 
-	subvol = dht_subvol_get_cached (this, fd->inode);
-	if (!subvol) {
-		gf_log (this->name, GF_LOG_ERROR,
-			"no cached subvolume for fd=%p", fd);
-		op_errno = EINVAL;
-		goto err;
-	}
+        dht_build_root_loc(inode, &newloc); /* NOTE(review): newloc is never loc_wipe'd here - check the inode ref */
+        loc = &newloc;
+    }
 
-	STACK_WIND (frame, dht_lk_cbk,
-		    subvol, subvol->fops->lk,
-		    fd, cmd, flock);
+    local->call_cnt = conf->subvolume_cnt; /* one dht_statfs_cbk reply expected per subvol */
 
-	return 0;
+    for (i = 0; i < conf->subvolume_cnt; i++) {
+        STACK_WIND(frame, dht_statfs_cbk, conf->subvolumes[i],
+                   conf->subvolumes[i]->fops->statfs, loc, xdata);
+    }
+    return 0;
 
 err:
-	op_errno = (op_errno == -1) ? errno : op_errno;
-	DHT_STACK_UNWIND (frame, -1, op_errno, NULL);
+    op_errno = (op_errno == -1) ? errno : op_errno; /* -1 means no branch set an errno */
+    DHT_STACK_UNWIND(statfs, frame, -1, op_errno, NULL, NULL);
 
-	return 0;
+    return 0;
 }
 
-/* gf_lk no longer exists 
 int
-dht_gf_lk_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
-	    int op_ret, int op_errno, struct flock *flock)
+dht_opendir(call_frame_t *frame, xlator_t *this, loc_t *loc, fd_t *fd,
+            dict_t *xdata) /* opendir fop: winds to every subvolume */
 {
-        DHT_STACK_UNWIND (frame, op_ret, op_errno, flock);
+    dht_local_t *local = NULL;
+    dht_conf_t *conf = NULL;
+    int op_errno = -1;
+    int i = -1;
+    int ret = 0;
+    gf_boolean_t new_xdata = _gf_false;
+    xlator_t **subvolumes = NULL;
+    int call_count = 0;
+
+    VALIDATE_OR_GOTO(frame, err);
+    VALIDATE_OR_GOTO(this, err);
+    VALIDATE_OR_GOTO(fd, err);
+    VALIDATE_OR_GOTO(this->private, err);
+
+    conf = this->private;
+
+    local = dht_local_init(frame, loc, fd, GF_FOP_OPENDIR);
+    if (!local) {
+        op_errno = ENOMEM;
+        goto err;
+    }
+    local->first_up_subvol = dht_first_up_subvol(this);
+
+    if (!xdata) {
+        xdata = dict_new();
+        if (!xdata) {
+            op_errno = ENOMEM;
+            goto err;
+        }
+        new_xdata = _gf_true; /* remember to drop our own reference after the winds */
+    }
+
+    ret = dict_set_uint32(xdata, conf->link_xattr_name, 256); /* NOTE(review): mutates a caller-supplied xdata */
+    if (ret)
+        gf_msg(this->name, GF_LOG_WARNING, 0, DHT_MSG_DICT_SET_FAILED,
+               "Failed to set dictionary value : key = %s",
+               conf->link_xattr_name);
+
+    /* dht_readdirp will wind to all subvols so open has to be sent to
+     * all subvols whether or not conf->local_subvols is set */
+
+    call_count = local->call_cnt = conf->subvolume_cnt;
+    subvolumes = conf->subvolumes;
+
+    /* In case of parallel-readdir, the readdir-ahead will be loaded
+     * below dht, in this case, if we want to enable or disable SKIP_DIRs
+     * it has to be done in opendir, so that prefetching logic in
+     * readdir-ahead, honors it */
+    for (i = 0; i < call_count; i++) {
+        if (conf->readdir_optimize == _gf_true) {
+            if (subvolumes[i] != local->first_up_subvol) { /* every subvol except the first up one */
+                ret = dict_set_int32(xdata, GF_READDIR_SKIP_DIRS, 1);
+                if (ret)
+                    gf_msg(this->name, GF_LOG_ERROR, 0, DHT_MSG_DICT_SET_FAILED,
+                           "Failed to set dictionary"
+                           " value :key = %s, ret:%d",
+                           GF_READDIR_SKIP_DIRS, ret);
+            }
+        }
 
-        return 0;
+        STACK_WIND_COOKIE(frame, dht_fd_cbk, subvolumes[i], subvolumes[i],
+                          subvolumes[i]->fops->opendir, loc, fd, xdata);
+        dict_del(xdata, GF_READDIR_SKIP_DIRS); /* cleared so the next iteration decides afresh */
+    }
+
+    if (new_xdata)
+        dict_unref(xdata);
+
+    return 0;
+
+err:
+    op_errno = (op_errno == -1) ? errno : op_errno; /* -1 means no branch set an errno */
+    DHT_STACK_UNWIND(opendir, frame, -1, op_errno, NULL, NULL);
+
+    return 0;
 }
 
+/* dht_readdirp_cbk creates a new dentry and dentry->inode is not assigned.
+   This function assigns an inode if all of the following conditions are
+   true:
 
-int
-dht_gf_lk (call_frame_t *frame, xlator_t *this,
-	   loc_t *loc, int cmd, struct flock *flock)
+   * DHT has only one child. In this case the entire layout is present on
+   this single child and hence we can set the complete layout in the inode.
+   * The backend has the complete layout with no anomalies in it, so the
+   layout can be constructed from this information and set in the inode.
+*/
+
+static void
+dht_populate_inode_for_dentry(xlator_t *this, xlator_t *subvol,
+                              gf_dirent_t *entry, gf_dirent_t *orig_entry)
+{
+    dht_layout_t *layout = NULL;
+    int ret = 0;
+    loc_t loc = {
+        0,
+    };
+
+    if (gf_uuid_is_null(orig_entry->d_stat.ia_gfid)) {
+        /* this skips the '..' entry for the root of the volume */
+        return;
+    }
+
+    gf_uuid_copy(loc.gfid, orig_entry->d_stat.ia_gfid);
+    loc.inode = inode_ref(orig_entry->inode); /* ref is released by loc_wipe() at 'out' */
+
+    if (is_revalidate(&loc)) { /* leave entry->inode unset in this case */
+        goto out;
+    }
+
+    layout = dht_layout_new(this, 1);
+    if (!layout)
+        goto out;
+
+    ret = dht_layout_merge(this, layout, subvol, 0, 0, orig_entry->dict);
+    if (!ret) {
+        ret = dht_layout_normalize(this, &loc, layout);
+        if (ret == 0) { /* merge and normalize both succeeded */
+            dht_layout_set(this, orig_entry->inode, layout);
+            entry->inode = inode_ref(orig_entry->inode);
+            layout = NULL; /* passed to dht_layout_set(); skip the unref below */
+        }
+    }
+
+    if (layout)
+        dht_layout_unref(this, layout);
+
+out:
+    loc_wipe(&loc);
+    return;
+}
+
+/* Posix returns op_errno = ENOENT to indicate that there are no more
+ * entries
+ */
+static int
+dht_readdirp_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int op_ret,
+                 int op_errno, gf_dirent_t *orig_entries, dict_t *xdata)
 {
-	xlator_t     *subvol = NULL;
-        int           op_errno = -1;
+    dht_local_t *local = NULL;
+    gf_dirent_t entries;
+    gf_dirent_t *orig_entry = NULL;
+    gf_dirent_t *entry = NULL;
+    xlator_t *prev = NULL;
+    xlator_t *next_subvol = NULL;
+    off_t next_offset = 0;
+    int count = 0;
+    dht_layout_t *layout = NULL;
+    dht_conf_t *conf = NULL;
+    dht_methods_t *methods = NULL;
+    xlator_t *subvol = 0;
+    xlator_t *hashed_subvol = 0;
+    int ret = 0;
+    int readdir_optimize = 0;
+    inode_table_t *itable = NULL;
+    inode_t *inode = NULL;
+    gf_boolean_t skip_hashed_check = _gf_false;
+
+    INIT_LIST_HEAD(&entries.list); /* filtered copies of orig_entries are collected here */
+
+    prev = cookie; /* subvol this reply came from (passed as the wind cookie) */
+    local = frame->local;
+    GF_VALIDATE_OR_GOTO(this->name, local->fd, unwind);
+
+    itable = local->fd->inode->table;
+
+    conf = this->private;
+    GF_VALIDATE_OR_GOTO(this->name, conf, unwind);
+
+    methods = &(conf->methods);
+
+    if (op_ret <= 0) { /* error or no entries: skip the filtering loop */
+        goto done;
+    }
+
+    /* Why aren't we skipping DHT entirely in case of a single subvol?
+     * Because if this was a larger volume earlier and all but one subvol
+     * was removed, there might be stale linkto files on the subvol.
+     */
+    if (conf->subvolume_cnt == 1) {
+        /* return all directory and file entries except
+         * linkto files for a single child DHT
+         */
+        skip_hashed_check = _gf_true;
+    }
+
+    if (!local->layout)
+        local->layout = dht_layout_get(this, local->fd->inode);
+
+    layout = local->layout;
+
+    /* This will skip the entries on the subvol without a layout,
+     * hence preventing the crash but rmdir might fail with
+     * "directory not empty" errors*/
+
+    if (layout == NULL)
+        goto done;
+
+    if (conf->readdir_optimize == _gf_true)
+        readdir_optimize = 1;
+
+    gf_msg_debug(this->name, 0, "Processing entries from %s", prev->name);
+
+    list_for_each_entry(orig_entry, (&orig_entries->list), list)
+    {
+        next_offset = orig_entry->d_off; /* ends up as the last seen entry's offset */
+
+        gf_msg_debug(this->name, 0, "%s: entry = %s, type = %d", prev->name,
+                     orig_entry->d_name, orig_entry->d_type);
+
+        if (IA_ISINVAL(orig_entry->d_stat.ia_type)) {
+            /*stat failed somewhere- display this entry but the data may
+             * be inaccurate.
+             */
+            gf_msg_debug(this->name, EINVAL, "Invalid stat for %s (gfid %s)",
+                         orig_entry->d_name,
+                         uuid_utoa(orig_entry->d_stat.ia_gfid));
+        }
+
+        if (check_is_linkfile(NULL, (&orig_entry->d_stat), orig_entry->dict,
+                              conf->link_xattr_name)) {
+            gf_msg_debug(this->name, 0, "%s: %s is a linkto file", prev->name,
+                         orig_entry->d_name);
+            continue; /* linkto files are never added to the result */
+        }
+
+        if (skip_hashed_check) {
+            goto list;
+        }
+
+        if (check_is_dir(NULL, (&orig_entry->d_stat), NULL)) {
+            /*Directory entries filtering :
+             * a) If rebalance is running, pick from first_up_subvol
+             * b) (rebalance not running)hashed subvolume is NULL or
+             * down then filter in first_up_subvolume. Otherwise the
+             * corresponding hashed subvolume will take care of the
+             * directory entry.
+             */
+            if (readdir_optimize) {
+                if (prev == local->first_up_subvol)
+                    goto list;
+                else
+                    continue;
+            }
+
+            hashed_subvol = methods->layout_search(this, layout,
+                                                   orig_entry->d_name);
+
+            if (prev == hashed_subvol)
+                goto list;
+            if ((hashed_subvol && dht_subvol_status(conf, hashed_subvol)) ||
+                (prev != local->first_up_subvol))
+                continue;
+
+            goto list;
+        }
+
+    list:
+        entry = gf_dirent_for_name(orig_entry->d_name);
+        if (!entry) {
+            goto unwind; /* NOTE(review): skips 'op_ret = count'; unwinds the subvol's op_ret */
+        }
+
+        /* Do this if conf->search_unhashed is set to "auto" */
+        if (conf->search_unhashed == GF_DHT_LOOKUP_UNHASHED_AUTO) {
+            subvol = methods->layout_search(this, layout, orig_entry->d_name);
+            if (!subvol || (subvol != prev)) {
+                /* TODO: Count the number of entries which need
+                   linkfile to prove its existence in fs */
+                layout->search_unhashed++;
+            }
+        }
+
+        entry->d_off = orig_entry->d_off;
+        entry->d_stat = orig_entry->d_stat;
+        entry->d_ino = orig_entry->d_ino;
+        entry->d_type = orig_entry->d_type;
+        entry->d_len = orig_entry->d_len;
+
+        if (orig_entry->dict)
+            entry->dict = dict_ref(orig_entry->dict);
+
+        /* making sure we set the inode ctx right with layout,
+           currently possible only for non-directories, so for
+           directories don't set entry inodes */
+        if (IA_ISDIR(entry->d_stat.ia_type)) {
+            entry->d_stat.ia_blocks = DHT_DIR_STAT_BLOCKS;
+            entry->d_stat.ia_size = DHT_DIR_STAT_SIZE;
+            if (orig_entry->inode) {
+                dht_inode_ctx_time_update(orig_entry->inode, this,
+                                          &entry->d_stat, 1);
+
+                if (conf->subvolume_cnt == 1) {
+                    dht_populate_inode_for_dentry(this, prev, entry,
+                                                  orig_entry);
+                }
+            }
+        } else {
+            if (orig_entry->dict &&
+                dict_get(orig_entry->dict, conf->link_xattr_name)) {
+                /* Strip out the S and T flags set by rebalance*/
+                DHT_STRIP_PHASE1_FLAGS(&entry->d_stat);
+            }
+
+            if (orig_entry->inode) {
+                ret = dht_layout_preset(this, prev, orig_entry->inode);
+                if (ret)
+                    gf_msg(this->name, GF_LOG_WARNING, 0,
+                           DHT_MSG_LAYOUT_SET_FAILED,
+                           "failed to link the layout "
+                           "in inode for %s",
+                           orig_entry->d_name);
+
+                entry->inode = inode_ref(orig_entry->inode);
+            } else if (itable) {
+                /*
+                 * orig_entry->inode might be null if any upper
+                 * layer xlators below client set to null, to
+                 * force a lookup on the inode even if the inode
+                 * is present in the inode table. In that case
+                 * we just update the ctx to make sure we didn't
+                 * miss anything.
+                 */
+                inode = inode_find(itable, orig_entry->d_stat.ia_gfid);
+                if (inode) {
+                    ret = dht_layout_preset(this, prev, inode);
+                    if (ret)
+                        gf_msg(this->name, GF_LOG_WARNING, 0,
+                               DHT_MSG_LAYOUT_SET_FAILED,
+                               "failed to link the layout"
+                               " in inode for %s",
+                               orig_entry->d_name);
+                    inode_unref(inode); /* entry->inode stays unset on this path */
+                    inode = NULL;
+                }
+            }
+        }
+
+        gf_msg_debug(this->name, 0, "%s: Adding entry = %s", prev->name,
+                     entry->d_name);
+
+        list_add_tail(&entry->list, &entries.list);
+        count++;
+    }
+
+done:
+
+    /* We need to ensure that only the last subvolume's end-of-directory
+     * notification is respected so that directory reading does not stop
+     * before all subvolumes have been read. That could happen because the
+     * posix for each subvolume sends a ENOENT on end-of-directory but in
+     * distribute we're not concerned only with a posix's view of the
+     * directory but the aggregated namespace's view of the directory.
+     * Possible values:
+     * op_ret == 0 and op_errno != 0
+     *   if op_errno != ENOENT : Error.Unwind.
+     *   if op_errno == ENOENT : There are no more entries on this subvol.
+     *                           Move to the next one.
+     * op_ret > 0 and count == 0 :
+     *    The subvol returned entries to dht but all were stripped out.
+     *    For example, if they were linkto files or dirs where
+     *    hashed_subvol != prev. Try to get some entries by winding
+     *    to the next subvol. This can be dangerous if parallel readdir
+     *    is enabled as it grows the stack.
+     *
+     * op_ret > 0 and count > 0:
+     *   We found some entries. Unwind even if the buffer is not full.
+     *
+     */
+
+    op_ret = count; /* report only the entries that survived filtering */
+    if (count == 0) {
+        /* non-zero next_offset means that
+         * EOF is not yet hit on the current subvol
+         */
+        if ((next_offset == 0) || (op_errno == ENOENT)) {
+            next_offset = 0;
+            next_subvol = dht_subvol_next(this, prev);
+        } else {
+            next_subvol = prev;
+        }
+
+        if (!next_subvol) {
+            goto unwind;
+        }
 
+        if (conf->readdir_optimize == _gf_true) {
+            if (next_subvol != local->first_up_subvol) {
+                ret = dict_set_int32(local->xattr, GF_READDIR_SKIP_DIRS, 1);
+                if (ret)
+                    gf_msg(this->name, GF_LOG_ERROR, 0, DHT_MSG_DICT_SET_FAILED,
+                           "Failed to set dictionary value"
+                           ":key = %s",
+                           GF_READDIR_SKIP_DIRS);
+            } else {
+                dict_del(local->xattr, GF_READDIR_SKIP_DIRS);
+            }
+        }
 
-        VALIDATE_OR_GOTO (frame, err);
-        VALIDATE_OR_GOTO (this, err);
-        VALIDATE_OR_GOTO (fd, err);
+        STACK_WIND_COOKIE(frame, dht_readdirp_cbk, next_subvol, next_subvol,
+                          next_subvol->fops->readdirp, local->fd, local->size,
+                          next_offset, local->xattr);
+        return 0;
+    }
 
-	subvol = dht_subvol_get_cached (this, fd->inode);
-	if (!subvol) {
-		gf_log (this->name, GF_LOG_ERROR,
-			"no cached subvolume for fd=%p", fd);
-		op_errno = EINVAL;
-		goto err;
-	}
+unwind:
+    /* We need to ensure that only the last subvolume's end-of-directory
+     * notification is respected so that directory reading does not stop
+     * before all subvolumes have been read. That could happen because the
+     * posix for each subvolume sends a ENOENT on end-of-directory but in
+     * distribute we're not concerned only with a posix's view of the
+     * directory but the aggregated namespace's view of the directory.
+     */
+    if (op_ret < 0)
+        op_ret = 0;
+
+    if (prev != dht_last_up_subvol(this)) /* errno is passed up only for the last up subvol */
+        op_errno = 0;
+
+    DHT_STACK_UNWIND(readdirp, frame, op_ret, op_errno, &entries, NULL);
+
+    gf_dirent_free(&entries);
+    return 0;
+}
 
-	STACK_WIND (frame, dht_gf_lk_cbk,
-		    subvol, subvol->fops->gf_lk,
-		    fd, cmd, flock);
+/*
+ * Callback for a readdir wound to a single subvolume.
+ *
+ * cookie carries the subvolume that answered. An entry is kept only when the
+ * layout search for its name returns no subvolume or returns that same
+ * subvolume. If nothing survives the filter, the readdir is wound again
+ * (same subvolume at the last seen offset, or the next subvolume from offset
+ * 0); otherwise the filtered list is unwound to the caller. With a single
+ * subvolume the original list is unwound unfiltered. Always returns 0.
+ */
+static int
+dht_readdir_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int op_ret,
+                int op_errno, gf_dirent_t *orig_entries, dict_t *xdata)
+{
+    dht_local_t *local = NULL;
+    gf_dirent_t entries;
+    gf_dirent_t *orig_entry = NULL;
+    gf_dirent_t *entry = NULL;
+    xlator_t *prev = NULL;
+    xlator_t *next_subvol = NULL;
+    off_t next_offset = 0;
+    int count = 0;
+    dht_layout_t *layout = 0;
+    xlator_t *subvol = 0;
+    dht_conf_t *conf = NULL;
+    dht_methods_t *methods = NULL;
+    gf_boolean_t skip_hashed_check = _gf_false;
+
+    INIT_LIST_HEAD(&entries.list);
+
+    prev = cookie;
+    local = frame->local;
+
+    conf = this->private;
+    GF_VALIDATE_OR_GOTO(this->name, conf, done);
+
+    methods = &(conf->methods);
+
+    /* error or empty reply: nothing to filter, decide where to read next */
+    if (op_ret <= 0)
+        goto done;
+
+    /* the layout is fetched once and cached in local for later callbacks */
+    if (!local->layout)
+        local->layout = dht_layout_get(this, local->fd->inode);
+
+    layout = local->layout;
+
+    gf_msg_debug(this->name, 0, "Processing entries from %s", prev->name);
+
+    if (conf->subvolume_cnt == 1) {
+        /*return everything*/
+        skip_hashed_check = _gf_true;
+        count = op_ret;
+        goto done;
+    }
+
+    list_for_each_entry(orig_entry, (&orig_entries->list), list)
+    {
+        /* remember the last offset seen, even for entries that are dropped */
+        next_offset = orig_entry->d_off;
+
+        gf_msg_debug(this->name, 0, "%s: entry = %s, type = %d", prev->name,
+                     orig_entry->d_name, orig_entry->d_type);
+
+        subvol = methods->layout_search(this, layout, orig_entry->d_name);
+
+        if (!subvol || (subvol == prev)) {
+            entry = gf_dirent_for_name(orig_entry->d_name);
+            if (!entry) {
+                gf_msg(this->name, GF_LOG_ERROR, ENOMEM, DHT_MSG_NO_MEMORY,
+                       "Memory allocation failed ");
+                /* NOTE(review): op_ret still holds the subvolume's return
+                 * value here, not the number of entries copied so far.
+                 */
+                goto unwind;
+            }
+
+            entry->d_off = orig_entry->d_off;
+            entry->d_ino = orig_entry->d_ino;
+            entry->d_type = orig_entry->d_type;
+            entry->d_len = orig_entry->d_len;
+
+            gf_msg_debug(this->name, 0, "%s: Adding = entry %s", prev->name,
+                         entry->d_name);
+
+            list_add_tail(&entry->list, &entries.list);
+            count++;
+        }
+    }
+done:
+    op_ret = count;
+    /* Nothing to return from this reply: keep reading instead of unwinding
+     * an empty result. Move to the next subvolume (from offset 0) when this
+     * one reported offset 0 or ENOENT, otherwise continue on the same
+     * subvolume from the last offset seen.
+     */
+    if (count == 0) {
+        if ((next_offset == 0) || (op_errno == ENOENT)) {
+            next_offset = 0;
+            next_subvol = dht_subvol_next(this, prev);
+        } else {
+            next_subvol = prev;
+        }
 
-	return 0;
+        if (!next_subvol) {
+            goto unwind;
+        }
+
+        STACK_WIND_COOKIE(frame, dht_readdir_cbk, next_subvol, next_subvol,
+                          next_subvol->fops->readdir, local->fd, local->size,
+                          next_offset, NULL);
+        return 0;
+    }
+
+unwind:
+    /* We need to ensure that only the last subvolume's end-of-directory
+     * notification is respected so that directory reading does not stop
+     * before all subvolumes have been read. That could happen because the
+     * posix for each subvolume sends an ENOENT on end-of-directory, but in
+     * distribute we are concerned not only with a posix's view of the
+     * directory but with the aggregated namespace's view of the directory.
+     */
+
+    if (prev != dht_last_up_subvol(this))
+        op_errno = 0;
+
+    if (!skip_hashed_check) {
+        DHT_STACK_UNWIND(readdir, frame, op_ret, op_errno, &entries, NULL);
+        gf_dirent_free(&entries);
+
+    } else {
+        /* single-subvolume case: hand back the subvolume's own list */
+        DHT_STACK_UNWIND(readdir, frame, op_ret, op_errno, orig_entries, NULL);
+    }
+    return 0;
+}
+
+/*
+ * Common implementation behind dht_readdir() and dht_readdirp().
+ *
+ * whichop selects the fop that is wound (GF_FOP_READDIRP winds readdirp,
+ * anything else winds readdir). dict is the caller's request dictionary and
+ * may be NULL. On validation or allocation failure the frame is unwound with
+ * -1 and an errno. Always returns 0.
+ */
+static int
+dht_do_readdir(call_frame_t *frame, xlator_t *this, fd_t *fd, size_t size,
+               off_t yoff, int whichop, dict_t *dict)
+{
+    dht_local_t *local = NULL;
+    int op_errno = -1;
+    xlator_t *xvol = NULL;
+    int ret = 0;
+    dht_conf_t *conf = NULL;
+
+    VALIDATE_OR_GOTO(frame, err);
+    VALIDATE_OR_GOTO(this, err);
+    VALIDATE_OR_GOTO(fd, err);
+    VALIDATE_OR_GOTO(this->private, err);
+
+    conf = this->private;
+
+    local = dht_local_init(frame, NULL, NULL, whichop);
+    if (!local) {
+        op_errno = ENOMEM;
+        goto err;
+    }
+
+    local->fd = fd_ref(fd);
+    local->size = size;
+    local->xattr_req = (dict) ? dict_ref(dict) : NULL;
+    local->first_up_subvol = dht_first_up_subvol(this);
+    local->op_ret = -1;
+
+    /* presumably maps the offset back to the subvolume it came from;
+     * xvol is used below without a NULL check - TODO confirm it is always set
+     */
+    dht_deitransform(this, yoff, &xvol);
+
+    /* TODO: do proper readdir */
+    if (whichop == GF_FOP_READDIRP) {
+        /* reuse the caller's dictionary when given, so the keys set below
+         * are added to that same dictionary
+         */
+        if (dict)
+            local->xattr = dict_ref(dict);
+        else
+            local->xattr = dict_new();
+
+        if (local->xattr) {
+            ret = dict_set_uint32(local->xattr, conf->link_xattr_name, 256);
+            if (ret)
+                gf_msg(this->name, GF_LOG_WARNING, 0, DHT_MSG_DICT_SET_FAILED,
+                       "Failed to set dictionary value"
+                       " : key = %s",
+                       conf->link_xattr_name);
+
+            /* with readdir-optimize on, only the first up subvolume is asked
+             * without the skip-dirs flag
+             */
+            if (conf->readdir_optimize == _gf_true) {
+                if (xvol != local->first_up_subvol) {
+                    ret = dict_set_int32(local->xattr, GF_READDIR_SKIP_DIRS, 1);
+                    if (ret)
+                        gf_msg(this->name, GF_LOG_ERROR, 0,
+                               DHT_MSG_DICT_SET_FAILED,
+                               "Failed to set "
+                               "dictionary value: "
+                               "key = %s",
+                               GF_READDIR_SKIP_DIRS);
+                } else {
+                    dict_del(local->xattr, GF_READDIR_SKIP_DIRS);
+                }
+            }
+
+            if (conf->subvolume_cnt == 1) {
+                ret = dict_set_uint32(local->xattr, conf->xattr_name, 4 * 4);
+                if (ret) {
+                    gf_msg(this->name, GF_LOG_WARNING, ENOMEM,
+                           DHT_MSG_DICT_SET_FAILED,
+                           "Failed to set dictionary "
+                           "value:key = %s ",
+                           conf->xattr_name);
+                }
+            }
+        }
+
+        STACK_WIND_COOKIE(frame, dht_readdirp_cbk, xvol, xvol,
+                          xvol->fops->readdirp, fd, size, yoff, local->xattr);
+    } else {
+        /* local->xattr is not assigned on this path */
+        STACK_WIND_COOKIE(frame, dht_readdir_cbk, xvol, xvol,
+                          xvol->fops->readdir, fd, size, yoff, local->xattr);
+    }
+
+    return 0;
 
 err:
-	op_errno = (op_errno == -1) ? errno : op_errno;
-	DHT_STACK_UNWIND (frame, -1, op_errno, NULL);
+    /* -1 means no specific error was recorded above; fall back to errno */
+    op_errno = (op_errno == -1) ? errno : op_errno;
+    DHT_STACK_UNWIND(readdir, frame, -1, op_errno, NULL, NULL);
 
-	return 0;
+    return 0;
 }
-*/
 
+/*
+ * readdir entry point.
+ *
+ * Defaults to a plain readdir, but switches to readdirp when any entry of
+ * conf->subvolume_status is unset or when conf->use_readdirp is set. The
+ * request is then handed to dht_do_readdir() with no dictionary; xdata is
+ * not forwarded. Always returns 0.
+ */
 int
-dht_statfs_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
-		int op_ret, int op_errno, struct statvfs *statvfs)
-{
-	dht_local_t  *local = NULL;
-	int           this_call_cnt = 0;
-
-
-	local = frame->local;
-
-	LOCK (&frame->lock);
-	{
-		if (op_ret == -1) {
-			local->op_errno = op_errno;
-			goto unlock;
-		}
-		local->op_ret = 0;
-
-		/* TODO: normalize sizes */
-		local->statvfs.f_bsize    = statvfs->f_bsize;
-		local->statvfs.f_frsize   = statvfs->f_frsize;
-
-		local->statvfs.f_blocks  += statvfs->f_blocks;
-		local->statvfs.f_bfree   += statvfs->f_bfree;
-		local->statvfs.f_bavail  += statvfs->f_bavail;
-		local->statvfs.f_files   += statvfs->f_files;
-		local->statvfs.f_ffree   += statvfs->f_ffree;
-		local->statvfs.f_favail  += statvfs->f_favail;
-		local->statvfs.f_fsid     = statvfs->f_fsid;
-		local->statvfs.f_flag     = statvfs->f_flag;
-		local->statvfs.f_namemax  = statvfs->f_namemax;
-
-	}
-unlock:
-	UNLOCK (&frame->lock);
+dht_readdir(call_frame_t *frame, xlator_t *this, fd_t *fd, size_t size,
+            off_t yoff, dict_t *xdata)
+{
+    int op = GF_FOP_READDIR;
+    dht_conf_t *conf = NULL;
+    int i = 0;
+
+    conf = this->private;
+    /* without a conf, fall through; dht_do_readdir() validates this->private */
+    if (!conf)
+        goto out;
+
+    /* presumably an unset status marks a subvolume that is down - TODO
+     * confirm; one such subvolume is enough to switch to readdirp
+     */
+    for (i = 0; i < conf->subvolume_cnt; i++) {
+        if (!conf->subvolume_status[i]) {
+            op = GF_FOP_READDIRP;
+            break;
+        }
+    }
 
+    if (conf->use_readdirp)
+        op = GF_FOP_READDIRP;
 
-	this_call_cnt = dht_frame_return (frame);
-	if (is_last_call (this_call_cnt))
-		DHT_STACK_UNWIND (frame, local->op_ret, local->op_errno,
-				  &local->statvfs);
+out:
+    dht_do_readdir(frame, this, fd, size, yoff, op, 0);
+    return 0;
+}
 
-        return 0;
+/*
+ * readdirp entry point: forwards the request unchanged to dht_do_readdir()
+ * with the fop fixed to GF_FOP_READDIRP. Always returns 0.
+ */
+int
+dht_readdirp(call_frame_t *frame, xlator_t *this, fd_t *fd, size_t size,
+             off_t yoff, dict_t *dict)
+{
+    /* dht_do_readdir() unwinds on failure itself and returns 0 on every path */
+    return dht_do_readdir(frame, this, fd, size, yoff, GF_FOP_READDIRP, dict);
 }
 
+/*
+ * Per-subvolume callback for fsyncdir.
+ *
+ * Records the outcome of this reply in the frame's local under the frame
+ * lock: a failure (-1) stores its errno, a success (0) marks the whole
+ * operation successful. The reply that completes the call count unwinds
+ * the frame with the accumulated result. Always returns 0.
+ */
+static int
+dht_fsyncdir_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int op_ret,
+                 int op_errno, dict_t *xdata)
+{
+    dht_local_t *state = frame->local;
+    int remaining = 0;
+
+    LOCK(&frame->lock);
+    {
+        if (op_ret == 0)
+            state->op_ret = 0;
+        else if (op_ret == -1)
+            state->op_errno = op_errno;
+    }
+    UNLOCK(&frame->lock);
+
+    remaining = dht_frame_return(frame);
+    if (!is_last_call(remaining))
+        return 0;
+
+    DHT_STACK_UNWIND(fsyncdir, frame, state->op_ret, state->op_errno, xdata);
+
+    return 0;
+}
 
+/*
+ * fsyncdir entry point: winds the fsyncdir to every subvolume in
+ * conf->subvolumes and lets dht_fsyncdir_cbk() collect the replies.
+ * On validation or allocation failure the frame is unwound with -1.
+ * Always returns 0.
+ */
 int
-dht_statfs (call_frame_t *frame, xlator_t *this, loc_t *loc)
+dht_fsyncdir(call_frame_t *frame, xlator_t *this, fd_t *fd, int datasync,
+             dict_t *xdata)
 {
-	dht_local_t  *local  = NULL;
-	dht_conf_t   *conf = NULL;
-        int           op_errno = -1;
-	int           i = -1;
+    dht_local_t *local = NULL;
+    dht_conf_t *conf = NULL;
+    int op_errno = -1;
+    int i = -1;
 
+    VALIDATE_OR_GOTO(frame, err);
+    VALIDATE_OR_GOTO(this, err);
+    VALIDATE_OR_GOTO(fd, err);
+    VALIDATE_OR_GOTO(this->private, err);
 
-        VALIDATE_OR_GOTO (frame, err);
-        VALIDATE_OR_GOTO (this, err);
-        VALIDATE_OR_GOTO (loc, err);
-        VALIDATE_OR_GOTO (loc->inode, err);
-        VALIDATE_OR_GOTO (loc->path, err);
+    conf = this->private;
 
-	conf = this->private;
+    local = dht_local_init(frame, NULL, NULL, GF_FOP_FSYNCDIR);
+    if (!local) {
+        op_errno = ENOMEM;
+        goto err;
+    }
 
-	local = dht_local_init (frame);
-	local->call_cnt = conf->subvolume_cnt;
+    local->fd = fd_ref(fd);
+    /* one reply is expected from each subvolume */
+    local->call_cnt = conf->subvolume_cnt;
 
-	for (i = 0; i < conf->subvolume_cnt; i++) {
-		STACK_WIND (frame, dht_statfs_cbk,
-			    conf->subvolumes[i],
-			    conf->subvolumes[i]->fops->statfs, loc);
-	}
+    for (i = 0; i < conf->subvolume_cnt; i++) {
+        STACK_WIND(frame, dht_fsyncdir_cbk, conf->subvolumes[i],
+                   conf->subvolumes[i]->fops->fsyncdir, fd, datasync, xdata);
+    }
 
-	return 0;
+    return 0;
 
 err:
-	op_errno = (op_errno == -1) ? errno : op_errno;
-	DHT_STACK_UNWIND (frame, -1, op_errno, NULL);
+    /* -1 means no specific error was recorded above; fall back to errno */
+    op_errno = (op_errno == -1) ? errno : op_errno;
+    DHT_STACK_UNWIND(fsyncdir, frame, -1, op_errno, NULL);
 
-	return 0;
+    return 0;
 }
 
-
 int
-dht_opendir (call_frame_t *frame, xlator_t *this, loc_t *loc, fd_t *fd)
+dht_newfile_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int op_ret,
+                int op_errno, inode_t *inode, struct iatt *stbuf,
+                struct iatt *preparent, struct iatt *postparent, dict_t *xdata)
 {
-	dht_local_t  *local  = NULL;
-	dht_conf_t   *conf = NULL;
-	int           ret = -1;
-        int           op_errno = -1;
-	int           i = -1;
+    xlator_t *prev = NULL;
+    int ret = -1;
+    dht_local_t *local = NULL;
+
+    if (op_ret == -1)
+        goto out;
+
+    local = frame->local;
+    if (!local) {
+        op_ret = -1;
+        op_errno = EINVAL;
+        goto out;
+    }
+
+    prev = cookie;
+
+    if (local->loc.parent) {
+        dht_inode_ctx_time_update(local->loc.parent, this, preparent, 0);
+        dht_inode_ctx_time_update(local->loc.parent, this, postparent, 1);
+    }
+
+    ret = dht_layout_preset(this, prev, inode);
+    if (ret < 0) {
+        gf_msg_debug(this->name, EINVAL,
+                     "could not set pre-set layout for subvolume %s",
+                     prev ? prev->name : NULL);
+        op_ret = -1;
+        op_errno = EINVAL;
+        goto out;
+    }
+    if (local->linked == _gf_true)
+        dht_linkfile_attr_heal(frame, this);
+out:
+    /*
+     * FIXME: ia_size and st_blocks of preparent and postparent do not have
+     * correct values. since, preparent and postparent buffers correspond
+     * to a directory these two members should have values equal to sum of
+     * corresponding values from each of the subvolume.
+     * See dht_iatt_merge for reference.
+     */
+    DHT_STRIP_PHASE1_FLAGS(stbuf);
+    dht_set_fixed_dir_stat(postparent);
+    dht_set_fixed_dir_stat(preparent);
+
+    if (local && local->lock[0].layout.parent_layout.locks) {
+        /* store op_errno for failure case*/
+        local->op_errno = op_errno;
+        local->refresh_layout_unlock(frame, this, op_ret, 1);
+
+        if (op_ret == 0) {
+            DHT_STACK_UNWIND(mknod, frame, op_ret, op_errno, inode, stbuf,
+                             preparent, postparent, xdata);
+        }
+    } else {
+        DHT_STACK_UNWIND(mknod, frame, op_ret, op_errno, inode, stbuf,
+                         preparent, postparent, xdata);
+    }
+
+    return 0;
+}
 
+/*
+ * Callback for the linkfile creation started by
+ * dht_mknod_wind_to_avail_subvol().
+ *
+ * On success the internal keys are removed from local->params and the real
+ * mknod is wound to local->cached_subvol. On failure the error is stored in
+ * local->op_errno and either the parent-layout lock is released through
+ * local->refresh_layout_unlock() (which unwinds with local->op_errno) or the
+ * frame is unwound directly. Always returns 0.
+ */
+static int
+dht_mknod_linkfile_create_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+                              int32_t op_ret, int32_t op_errno, inode_t *inode,
+                              struct iatt *stbuf, struct iatt *preparent,
+                              struct iatt *postparent, dict_t *xdata)
+{
+    dht_local_t *local = NULL;
+    xlator_t *cached_subvol = NULL;
+    dht_conf_t *conf = NULL;
 
-        VALIDATE_OR_GOTO (frame, err);
-        VALIDATE_OR_GOTO (this, err);
-        VALIDATE_OR_GOTO (fd, err);
+    local = frame->local;
 
-	conf = this->private;
+    if (!local || !local->cached_subvol) {
+        op_errno = EINVAL;
+        /* the locked error path unwinds with local->op_errno, so record
+         * the error there as well
+         */
+        if (local)
+            local->op_errno = EINVAL;
+        goto err;
+    }
 
-	local = dht_local_init (frame);
-	if (!local) {
-		op_errno = ENOMEM;
-		gf_log (this->name, GF_LOG_ERROR,
-			"memory allocation failed :(");
-		goto err;
-	}
+    if (op_ret == -1) {
+        local->op_errno = op_errno;
+        goto err;
+    }
 
-	local->fd = fd_ref (fd);
-	ret = loc_dup (loc, &local->loc);
-	if (ret == -1) {
-		op_errno = ENOMEM;
-		gf_log (this->name, GF_LOG_ERROR,
-			"memory allocation failed :(");
-		goto err;
-	}
+    conf = this->private;
+    if (!conf) {
+        local->op_errno = EINVAL;
+        op_errno = EINVAL;
+        goto err;
+    }
 
-	local->call_cnt = conf->subvolume_cnt;
+    cached_subvol = local->cached_subvol;
 
-	for (i = 0; i < conf->subvolume_cnt; i++) {
-		STACK_WIND (frame, dht_fd_cbk,
-			    conf->subvolumes[i],
-			    conf->subvolumes[i]->fops->opendir,
-			    loc, fd);
-	}
+    /* these keys are dropped before the real mknod is wound */
+    if (local->params) {
+        dict_del(local->params, conf->link_xattr_name);
+        dict_del(local->params, GLUSTERFS_INTERNAL_FOP_KEY);
+    }
 
-	return 0;
+    STACK_WIND_COOKIE(frame, dht_newfile_cbk, (void *)cached_subvol,
+                      cached_subvol, cached_subvol->fops->mknod, &local->loc,
+                      local->mode, local->rdev, local->umask, local->params);
 
+    return 0;
 err:
-	op_errno = (op_errno == -1) ? errno : op_errno;
-	DHT_STACK_UNWIND (frame, -1, op_errno, NULL);
+    if (local && local->lock[0].layout.parent_layout.locks) {
+        local->refresh_layout_unlock(frame, this, -1, 1);
+    } else {
+        DHT_STACK_UNWIND(mknod, frame, -1, op_errno, NULL, NULL, NULL, NULL,
+                         NULL);
+    }
+    return 0;
+}
+
+/*
+ * Wind the mknod to subvol, or - when subvol is filled and a different
+ * subvolume with free space is available - create a linkfile on subvol
+ * first and let dht_mknod_linkfile_create_cbk() wind the mknod to the
+ * subvolume that was picked. Always returns 0.
+ */
+static int
+dht_mknod_wind_to_avail_subvol(call_frame_t *frame, xlator_t *this,
+                               xlator_t *subvol, loc_t *loc, dev_t rdev,
+                               mode_t mode, mode_t umask, dict_t *params)
+{
+    dht_local_t *local = frame->local;
+    xlator_t *target = NULL;
+
+    if (dht_is_subvol_filled(this, subvol)) {
+        target = dht_free_disk_available_subvol(this, subvol, local);
+
+        if (target != subvol) {
+            /* stash the request so the linkfile callback can replay it */
+            local->params = dict_ref(params);
+            local->rdev = rdev;
+            local->mode = mode;
+            local->umask = umask;
+            local->cached_subvol = target;
+            local->hashed_subvol = subvol;
+
+            gf_msg_debug(this->name, 0, "creating %s on %s (link at %s)",
+                         loc->path, target->name, subvol->name);
+
+            dht_linkfile_create(frame, dht_mknod_linkfile_create_cbk, this,
+                                target, subvol, loc);
+
+            return 0;
+        }
+    }
 
-	return 0;
+    /* subvol has room, or no better subvolume was found: create it here */
+    gf_msg_debug(this->name, 0, "creating %s on %s", loc->path,
+                 subvol->name);
+
+    STACK_WIND_COOKIE(frame, dht_newfile_cbk, (void *)subvol, subvol,
+                      subvol->fops->mknod, loc, mode, rdev, umask, params);
+
+    return 0;
 }
 
+/*
+ * Continuation installed as local->refresh_layout_done by
+ * dht_mknod_lock_cbk().
+ *
+ * Restores the file's loc from local->loc2, looks the name up in the
+ * refreshed layout and winds the mknod through
+ * dht_mknod_wind_to_avail_subvol(). On failure the lock is released through
+ * local->refresh_layout_unlock(). Always returns 0.
+ */
+static int32_t
+dht_mknod_do(call_frame_t *frame)
+{
+    dht_local_t *local = NULL;
+    dht_layout_t *refreshed = NULL;
+    xlator_t *subvol = NULL;
+    xlator_t *this = NULL;
+    dht_conf_t *conf = NULL;
+    dht_methods_t *methods = NULL;
 
-int
-dht_readdir_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
-		 int op_ret, int op_errno, gf_dirent_t *orig_entries)
-{
-	dht_local_t  *local = NULL;
-	gf_dirent_t   entries;
-	gf_dirent_t  *orig_entry = NULL;
-	gf_dirent_t  *entry = NULL;
-	call_frame_t *prev = NULL;
-	xlator_t     *subvol = NULL;
-	xlator_t     *next = NULL;
-	dht_layout_t *layout = NULL;
-	int           count = 0;
-
-
-	INIT_LIST_HEAD (&entries.list);
-	prev = cookie;
-	local = frame->local;
-
-	if (op_ret < 0)
-		goto done;
-
-	layout = dht_layout_get (this, local->fd->inode);
-
-	list_for_each_entry (orig_entry, &orig_entries->list, list) {
-		subvol = dht_layout_search (this, layout, orig_entry->d_name);
-
-		if (!subvol || subvol == prev->this) {
-			entry = gf_dirent_for_name (orig_entry->d_name);
-			if (!entry) {
-				gf_log (this->name, GF_LOG_ERROR,
-					"memory allocation failed :(");
-				goto unwind;
-			}
-
-			dht_itransform (this, subvol, orig_entry->d_ino,
-					&entry->d_ino);
-			dht_itransform (this, subvol, orig_entry->d_off,
-					&entry->d_off);
-
-			entry->d_type = orig_entry->d_type;
-			entry->d_len  = orig_entry->d_len;
-
-			list_add_tail (&entry->list, &entries.list);
-			count++;
-		}
-	}
-	op_ret = count;
+    local = frame->local;
 
-done:
-	if (count == 0) {
-		next = dht_subvol_next (this, prev->this);
-		if (!next) {
-			goto unwind;
-		}
-
-		STACK_WIND (frame, dht_readdir_cbk,
-			    next, next->fops->readdir,
-			    local->fd, local->size, 0);
-		return 0;
-	}
+    this = THIS;
 
-unwind:
-	if (op_ret < 0)
-		op_ret = 0;
+    conf = this->private;
+
+    GF_VALIDATE_OR_GOTO(this->name, conf, err);
+
+    methods = &(conf->methods);
+
+    /* We don't need parent_loc anymore */
+    loc_wipe(&local->loc);
+
+    /* loc2 holds the loc of the file being created; make it the current loc */
+    loc_copy(&local->loc, &local->loc2);
+
+    loc_wipe(&local->loc2);
 
-	DHT_STACK_UNWIND (frame, op_ret, op_errno, &entries);
+    refreshed = local->selfheal.refreshed_layout;
+
+    subvol = methods->layout_search(this, refreshed, local->loc.name);
+
+    if (!subvol) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, DHT_MSG_HASHED_SUBVOL_GET_FAILED,
+               "no subvolume in "
+               "layout for path=%s",
+               local->loc.path);
+        local->op_errno = ENOENT;
+        goto err;
+    }
+
+    dht_mknod_wind_to_avail_subvol(frame, this, subvol, &local->loc,
+                                   local->rdev, local->mode, local->umask,
+                                   local->params);
+    return 0;
+err:
+    local->refresh_layout_unlock(frame, this, -1, 1);
+
+    return 0;
+}
+
+/*
+ * Callback for the unlock issued by dht_mknod_finish(): the result of the
+ * unlock is ignored and the frame it ran on is destroyed. Always returns 0.
+ */
+static int32_t
+dht_mknod_unlock_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+                     int32_t op_ret, int32_t op_errno, dict_t *xdata)
+{
+    DHT_STACK_DESTROY(frame);
+    return 0;
+}
+
+/*
+ * Release the parent-layout lock taken for mknod and, on failure, unwind.
+ *
+ * The lock array is handed over to a copy of the frame and unlocked from
+ * there. When op_ret is 0 nothing is unwound here; otherwise the mknod is
+ * unwound with op_ret and local->op_errno. invoke_cbk is not used by this
+ * function. Always returns 0.
+ */
+static int32_t
+dht_mknod_finish(call_frame_t *frame, xlator_t *this, int op_ret,
+                 int invoke_cbk)
+{
+    dht_local_t *local = NULL, *lock_local = NULL;
+    call_frame_t *lock_frame = NULL;
+    int lock_count = 0;
+
+    local = frame->local;
+    lock_count = dht_lock_count(local->lock[0].layout.parent_layout.locks,
+                                local->lock[0].layout.parent_layout.lk_count);
+    if (lock_count == 0)
+        goto done;
+
+    /* NOTE(review): if either allocation below fails, no unlock is sent */
+    lock_frame = copy_frame(frame);
+    if (lock_frame == NULL) {
+        goto done;
+    }
+
+    lock_local = dht_local_init(lock_frame, &local->loc, NULL,
+                                lock_frame->root->op);
+    if (lock_local == NULL) {
+        goto done;
+    }
+
+    /* move the lock array to the lock frame and clear it on the original */
+    lock_local->lock[0]
+        .layout.parent_layout.locks = local->lock[0].layout.parent_layout.locks;
+    lock_local->lock[0].layout.parent_layout.lk_count =
+        local->lock[0].layout.parent_layout.lk_count;
+
+    local->lock[0].layout.parent_layout.locks = NULL;
+    local->lock[0].layout.parent_layout.lk_count = 0;
+
+    dht_unlock_inodelk(lock_frame,
+                       lock_local->lock[0].layout.parent_layout.locks,
+                       lock_local->lock[0].layout.parent_layout.lk_count,
+                       dht_mknod_unlock_cbk);
+    /* the unlock callback destroys lock_frame; do not destroy it below */
+    lock_frame = NULL;
 
-	gf_dirent_free (&entries);
+done:
+    if (lock_frame != NULL) {
+        DHT_STACK_DESTROY(lock_frame);
+    }
 
+    if (op_ret == 0)
         return 0;
+
+    DHT_STACK_UNWIND(mknod, frame, op_ret, local->op_errno, NULL, NULL, NULL,
+                     NULL, NULL);
+    return 0;
 }
 
+/*
+ * Callback for the blocking lock requested by dht_mknod_lock().
+ *
+ * On success it installs dht_mknod_finish() and dht_mknod_do() as the
+ * unlock and done continuations and starts a layout refresh. On failure it
+ * records the errno and finishes through dht_mknod_finish(), or unwinds with
+ * EINVAL when the frame has no local. Always returns 0.
+ */
+static int32_t
+dht_mknod_lock_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+                   int32_t op_ret, int32_t op_errno, dict_t *xdata)
+{
+    dht_local_t *local = NULL;
 
-int
-dht_readdir (call_frame_t *frame, xlator_t *this,
-	     fd_t *fd, size_t size, off_t yoff)
+    local = frame->local;
+
+    if (!local) {
+        goto err;
+    }
+
+    if (op_ret < 0) {
+        gf_msg("DHT", GF_LOG_ERROR, 0, DHT_MSG_INODE_LK_ERROR,
+               "mknod lock failed for file: %s", local->loc2.name);
+
+        local->op_errno = op_errno;
+
+        goto err;
+    }
+
+    local->refresh_layout_unlock = dht_mknod_finish;
+
+    local->refresh_layout_done = dht_mknod_do;
+
+    dht_refresh_layout(frame);
+
+    return 0;
+err:
+    if (local)
+        dht_mknod_finish(frame, this, -1, 0);
+    else
+        DHT_STACK_UNWIND(mknod, frame, -1, EINVAL, NULL, NULL, NULL, NULL,
+                         NULL);
+    return 0;
+}
+
+/*
+ * Request a blocking F_RDLCK inodelk on local->loc in the
+ * DHT_LAYOUT_HEAL_DOMAIN on the given subvolume; dht_mknod_lock_cbk()
+ * continues the mknod once the lock request completes.
+ *
+ * Returns 0 when the lock request was issued, -1 otherwise. The frame is not
+ * unwound here on failure.
+ */
+static int32_t
+dht_mknod_lock(call_frame_t *frame, xlator_t *subvol)
 {
-	dht_local_t  *local  = NULL;
-	dht_conf_t   *conf = NULL;
-        int           op_errno = -1;
-	xlator_t     *xvol = NULL;
-	off_t         xoff = 0;
+    dht_local_t *local = NULL;
+    int count = 1, ret = -1;
+    dht_lock_t **lk_array = NULL;
 
+    GF_VALIDATE_OR_GOTO("dht", frame, err);
+    GF_VALIDATE_OR_GOTO(frame->this->name, frame->local, err);
 
-        VALIDATE_OR_GOTO (frame, err);
-        VALIDATE_OR_GOTO (this, err);
-        VALIDATE_OR_GOTO (fd, err);
+    local = frame->local;
 
-	conf = this->private;
+    lk_array = GF_CALLOC(count, sizeof(*lk_array), gf_common_mt_pointer);
 
-	local = dht_local_init (frame);
-	if (!local) {
-		gf_log (this->name, GF_LOG_ERROR,
-			"memory allocation failed :(");
-		op_errno = ENOMEM;
-		goto err;
-	}
+    if (lk_array == NULL)
+        goto err;
 
-	local->fd = fd_ref (fd);
-	local->size = size;
+    lk_array[0] = dht_lock_new(frame->this, subvol, &local->loc, F_RDLCK,
+                               DHT_LAYOUT_HEAL_DOMAIN, NULL,
+                               IGNORE_ENOENT_ESTALE);
 
-	dht_deitransform (this, yoff, &xvol, (uint64_t *)&xoff);
+    if (lk_array[0] == NULL)
+        goto err;
 
-	/* TODO: do proper readdir */
-	STACK_WIND (frame, dht_readdir_cbk,
-		    xvol, xvol->fops->readdir,
-		    fd, size, xoff);
+    /* publish the array in local so dht_mknod_finish() can release it */
+    local->lock[0].layout.parent_layout.locks = lk_array;
+    local->lock[0].layout.parent_layout.lk_count = count;
 
-	return 0;
+    ret = dht_blocking_inodelk(frame, lk_array, count, dht_mknod_lock_cbk);
 
+    if (ret < 0) {
+        /* detach the array from local before it is freed below */
+        local->lock[0].layout.parent_layout.locks = NULL;
+        local->lock[0].layout.parent_layout.lk_count = 0;
+        goto err;
+    }
+
+    return 0;
 err:
-	op_errno = (op_errno == -1) ? errno : op_errno;
-	DHT_STACK_UNWIND (frame, -1, op_errno, NULL);
+    if (lk_array != NULL) {
+        dht_lock_array_free(lk_array, count);
+        GF_FREE(lk_array);
+    }
 
-	return 0;
+    return -1;
 }
 
-
-int
-dht_fsyncdir_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
-		  int op_ret, int op_errno)
+/*
+ * Finish a parent-layout refresh started by
+ * dht_handle_parent_layout_change().
+ *
+ * Copies the outcome into the local of the frame that owns the stashed stub
+ * (op_ret -1 plus an errno, EIO if none was recorded, or op_ret 0), resumes
+ * the stub and destroys the refresh frame. invoke_cbk is not used by this
+ * function. Always returns 0.
+ */
+static int
+dht_refresh_parent_layout_resume(call_frame_t *frame, xlator_t *this, int ret,
+                                 int invoke_cbk)
 {
-	dht_local_t  *local = NULL;
-	int           this_call_cnt = 0;
+    dht_local_t *local = NULL, *parent_local = NULL;
+    call_stub_t *stub = NULL;
+    call_frame_t *parent_frame = NULL;
 
+    local = frame->local;
 
-	local = frame->local;
+    /* take the stub out of local before this frame is destroyed below */
+    stub = local->stub;
+    local->stub = NULL;
 
-	LOCK (&frame->lock);
-	{
-		if (op_ret == -1)
-			local->op_errno = op_errno;
+    parent_frame = stub->frame;
+    parent_local = parent_frame->local;
 
-		if (op_ret == 0)
-			local->op_ret = 0;
-	}
-	UNLOCK (&frame->lock);
+    if (ret < 0) {
+        parent_local->op_ret = -1;
+        parent_local->op_errno = local->op_errno ? local->op_errno : EIO;
+    } else {
+        parent_local->op_ret = 0;
+    }
 
-	this_call_cnt = dht_frame_return (frame);
-	if (is_last_call (this_call_cnt))
-		DHT_STACK_UNWIND (frame, local->op_ret, local->op_errno);
+    call_resume(stub);
 
-        return 0;
+    DHT_STACK_DESTROY(frame);
+
+    return 0;
 }
 
+/*
+ * Continuation run when the parent-layout refresh has completed.
+ *
+ * When the refresh succeeded, the refreshed layout is set on the inode in
+ * local->loc. Either way the outcome is passed on to
+ * dht_refresh_parent_layout_resume(). Always returns 0.
+ */
+static int
+dht_refresh_parent_layout_done(call_frame_t *frame)
+{
+    dht_local_t *local = NULL;
+    int ret = 0;
 
-int
-dht_fsyncdir (call_frame_t *frame, xlator_t *this, fd_t *fd, int datasync)
+    local = frame->local;
+
+    /* refresh failed: report it without touching the inode's layout */
+    if (local->op_ret < 0) {
+        ret = -1;
+        goto resume;
+    }
+
+    dht_layout_set(frame->this, local->loc.inode,
+                   local->selfheal.refreshed_layout);
+
+resume:
+    dht_refresh_parent_layout_resume(frame, frame->this, ret, 1);
+    return 0;
+}
+
+/*
+ * Start a refresh of the parent directory's layout on behalf of a stashed
+ * fop.
+ *
+ * A copy of the stub's frame is created, pointed at the parent inode of the
+ * stub frame's loc, and used to run dht_refresh_layout(). The stub is
+ * resumed from dht_refresh_parent_layout_resume() once the refresh is over.
+ *
+ * Returns 0 when the refresh was started, -1 on allocation failure (in
+ * which case the stub is left untouched for the caller).
+ */
+static int
+dht_handle_parent_layout_change(xlator_t *this, call_stub_t *stub)
 {
-	dht_local_t  *local  = NULL;
-	dht_conf_t   *conf = NULL;
-        int           op_errno = -1;
-	int           i = -1;
+    call_frame_t *refresh_frame = NULL, *frame = NULL;
+    dht_local_t *refresh_local = NULL, *local = NULL;
 
+    frame = stub->frame;
+    local = frame->local;
 
-        VALIDATE_OR_GOTO (frame, err);
-        VALIDATE_OR_GOTO (this, err);
-        VALIDATE_OR_GOTO (fd, err);
+    refresh_frame = copy_frame(frame);
+    if (!refresh_frame) {
+        gf_msg(this->name, GF_LOG_ERROR, ENOMEM, DHT_MSG_NO_MEMORY,
+               "mem allocation failed for refresh_frame");
+        return -1;
+    }
 
-	conf = this->private;
+    refresh_local = dht_local_init(refresh_frame, NULL, NULL, stub->fop);
+    if (!refresh_local) {
+        gf_msg(this->name, GF_LOG_ERROR, ENOMEM, DHT_MSG_NO_MEMORY,
+               "mem allocation failed for refresh_local");
+        /* the copied frame is owned here; release it instead of leaking */
+        DHT_STACK_DESTROY(refresh_frame);
+        return -1;
+    }
 
-	local = dht_local_init (frame);
-	if (!local) {
-		op_errno = ENOMEM;
-		gf_log (this->name, GF_LOG_ERROR,
-			"memory allocation failed :(");
-		goto err;
-	}
+    /* the refresh operates on the parent directory, identified by gfid */
+    refresh_local->loc.inode = inode_ref(local->loc.parent);
+    gf_uuid_copy(refresh_local->loc.gfid, local->loc.parent->gfid);
 
-	local->fd = fd_ref (fd);
-	local->call_cnt = conf->subvolume_cnt;
+    refresh_local->stub = stub;
 
-	for (i = 0; i < conf->subvolume_cnt; i++) {
-		STACK_WIND (frame, dht_fsyncdir_cbk,
-			    conf->subvolumes[i],
-			    conf->subvolumes[i]->fops->fsyncdir,
-			    fd, datasync);
-	}
+    refresh_local->refresh_layout_unlock = dht_refresh_parent_layout_resume;
+    refresh_local->refresh_layout_done = dht_refresh_parent_layout_done;
 
-	return 0;
+    dht_refresh_layout(refresh_frame);
+    return 0;
+}
 
-err:
-	op_errno = (op_errno == -1) ? errno : op_errno;
-	DHT_STACK_UNWIND (frame, -1, op_errno);
+/*
+ * Callback passed to dht_protect_namespace(): records the outcome in the
+ * frame's local (op_ret -1 plus errno on failure, op_ret 0 otherwise) and
+ * resumes the stub that was stashed in local->stub. Always returns 0.
+ */
+static int32_t
+dht_call_mkdir_stub(call_frame_t *frame, void *cookie, xlator_t *this,
+                    int32_t op_ret, int32_t op_errno, dict_t *xdata)
+{
+    dht_local_t *ctx = frame->local;
+    call_stub_t *pending = ctx->stub;
+
+    /* detach the stub from local before it is resumed */
+    ctx->stub = NULL;
+
+    ctx->op_ret = (op_ret < 0) ? -1 : 0;
+    if (op_ret < 0)
+        ctx->op_errno = op_errno;
 
-	return 0;
+    call_resume(pending);
+
+    return 0;
 }
 
+/*
+ * Prepare a stashed fop so it runs against a known parent layout with the
+ * namespace protected.
+ *
+ * The parent's in-memory layout for the hashed subvolume is extracted,
+ * copied into local->parent_disk_layout and added to local->params (under
+ * conf->xattr_name, with GF_PREOP_PARENT_KEY naming that key). Then
+ * dht_protect_namespace() is called with dht_call_mkdir_stub() as the
+ * callback that resumes the stub.
+ *
+ * Returns 0 when the namespace protection was started, -1 on failure with
+ * local->op_errno set for the failures detected in this function.
+ * NOTE(review): local->stub stays set on the -1 paths - confirm the caller
+ * accounts for that when it disposes of the stub.
+ */
+static int32_t
+dht_guard_parent_layout_and_namespace(xlator_t *subvol, call_stub_t *stub)
+{
+    dht_local_t *local = NULL;
+    int ret = -1;
+    loc_t *loc = NULL;
+    xlator_t *hashed_subvol = NULL, *this = NULL;
+    call_frame_t *frame = NULL;
+    char pgfid[GF_UUID_BUF_SIZE] = {0};
+    int32_t *parent_disk_layout = NULL;
+    dht_layout_t *parent_layout = NULL;
+    dht_conf_t *conf = NULL;
+
+    GF_VALIDATE_OR_GOTO("dht", stub, err);
+
+    frame = stub->frame;
+    this = frame->this;
+
+    conf = this->private;
+
+    local = frame->local;
+
+    local->stub = stub;
+
+    /* TODO: recheck whether we should lock on src or dst if we do similar
+     * stale layout checks for rename.
+     */
+    loc = &stub->args.loc;
+
+    /* textual parent gfid, used only in the log messages below */
+    gf_uuid_unparse(loc->parent->gfid, pgfid);
+
+    if (local->params == NULL) {
+        local->params = dict_new();
+        if (local->params == NULL) {
+            local->op_errno = ENOMEM;
+            gf_msg(this->name, GF_LOG_WARNING, local->op_errno,
+                   DHT_MSG_PARENT_LAYOUT_CHANGED,
+                   "%s (%s/%s) (path: %s): "
+                   "dict allocation failed",
+                   gf_fop_list[stub->fop], pgfid, loc->name, loc->path);
+            goto err;
+        }
+    }
+
+    hashed_subvol = dht_subvol_get_hashed(this, loc);
+    if (hashed_subvol == NULL) {
+        local->op_errno = EINVAL;
+
+        gf_msg(this->name, GF_LOG_WARNING, local->op_errno,
+               DHT_MSG_PARENT_LAYOUT_CHANGED,
+               "%s (%s/%s) (path: %s): "
+               "hashed subvolume not found",
+               gf_fop_list[stub->fop], pgfid, loc->name, loc->path);
+        goto err;
+    }
+
+    parent_layout = dht_layout_get(this, loc->parent);
+
+    ret = dht_disk_layout_extract_for_subvol(this, parent_layout, hashed_subvol,
+                                             &parent_disk_layout);
+    if (ret == -1) {
+        local->op_errno = EINVAL;
+        gf_msg(this->name, GF_LOG_WARNING, local->op_errno,
+               DHT_MSG_PARENT_LAYOUT_CHANGED,
+               "%s (%s/%s) (path: %s): "
+               "extracting in-memory layout of parent failed. ",
+               gf_fop_list[stub->fop], pgfid, loc->name, loc->path);
+        goto err;
+    }
+
+    /* keep a copy in local as well as the one that goes into params */
+    memcpy((void *)local->parent_disk_layout, (void *)parent_disk_layout,
+           sizeof(local->parent_disk_layout));
+
+    dht_layout_unref(this, parent_layout);
+    parent_layout = NULL;
+
+    ret = dict_set_str(local->params, GF_PREOP_PARENT_KEY, conf->xattr_name);
+    if (ret < 0) {
+        local->op_errno = -ret;
+        gf_msg(this->name, GF_LOG_WARNING, local->op_errno,
+               DHT_MSG_PARENT_LAYOUT_CHANGED,
+               "%s (%s/%s) (path: %s): "
+               "setting %s key in params dictionary failed. ",
+               gf_fop_list[stub->fop], pgfid, loc->name, loc->path,
+               GF_PREOP_PARENT_KEY);
+        goto err;
+    }
+
+    ret = dict_set_bin(local->params, conf->xattr_name, parent_disk_layout,
+                       4 * 4);
+    if (ret < 0) {
+        local->op_errno = -ret;
+        gf_msg(this->name, GF_LOG_WARNING, local->op_errno,
+               DHT_MSG_PARENT_LAYOUT_CHANGED,
+               "%s (%s/%s) (path: %s): "
+               "setting parent-layout in params dictionary failed. ",
+               gf_fop_list[stub->fop], pgfid, loc->name, loc->path);
+        goto err;
+    }
+
+    /* the buffer now belongs to the dictionary; do not free it in err */
+    parent_disk_layout = NULL;
+    local->hashed_subvol = hashed_subvol;
+
+    local->current = &local->lock[0];
+    ret = dht_protect_namespace(frame, loc, hashed_subvol, &local->current->ns,
+                                dht_call_mkdir_stub);
+    if (ret < 0)
+        goto err;
+
+    return 0;
+err:
+
+    if (parent_disk_layout != NULL)
+        GF_FREE(parent_disk_layout);
+
+    if (parent_layout != NULL)
+        dht_layout_unref(this, parent_layout);
+
+    return -1;
+}
 
 int
-dht_newfile_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
-		 int op_ret, int op_errno,
-		 inode_t *inode, struct stat *stbuf)
+dht_mknod(call_frame_t *frame, xlator_t *this, loc_t *loc, mode_t mode,
+          dev_t rdev, mode_t umask, dict_t *params)
 {
-	call_frame_t *prev = NULL;
-	dht_layout_t *layout = NULL;
-	int           ret = -1;
+    xlator_t *subvol = NULL;
+    int op_errno = -1;
+    int i = 0;
+    int ret = 0;
+    dht_local_t *local = NULL;
+    dht_conf_t *conf = NULL;
+
+    VALIDATE_OR_GOTO(frame, err);
+    VALIDATE_OR_GOTO(this, err);
+    VALIDATE_OR_GOTO(loc, err);
+
+    conf = this->private;
+
+    dht_get_du_info(frame, this, loc);
+
+    local = dht_local_init(frame, loc, NULL, GF_FOP_MKNOD);
+    if (!local) {
+        op_errno = ENOMEM;
+        goto err;
+    }
+
+    subvol = dht_subvol_get_hashed(this, loc);
+    if (!subvol) {
+        gf_msg_debug(this->name, 0, "no subvolume in layout for path=%s",
+                     loc->path);
+        op_errno = EIO;
+        goto err;
+    }
+
+    /* Post remove-brick, the client layout may not be in sync with
+     * disk layout because of lack of lookup. Hence,a mknod call
+     * may fall on the decommissioned brick.  Hence, if the
+     * hashed_subvol is part of decommissioned bricks  list, do a
+     * lookup on parent dir. If a fix-layout is already done by the
+     * remove-brick process, the parent directory layout will be in
+     * sync with that of the disk. If fix-layout is still ending
+     * on the parent directory, we can let the file get created on
+     * the decommissioned brick which will be eventually migrated to
+     * non-decommissioned brick based on the new layout.
+     */
+
+    if (conf->decommission_subvols_cnt) {
+        for (i = 0; i < conf->subvolume_cnt; i++) {
+            if (conf->decommissioned_bricks[i] &&
+                conf->decommissioned_bricks[i] == subvol) {
+                gf_msg_debug(this->name, 0,
+                             "hashed subvol:%s is "
+                             "part of decommission brick list for "
+                             "file: %s",
+                             subvol->name, loc->path);
+
+                /* dht_refresh_layout needs directory info in
+                 * local->loc. Hence, storing the parent_loc in
+                 * local->loc and storing the create context in
+                 * local->loc2. We will restore this information
+                 * in dht_creation do */
+
+                ret = loc_copy(&local->loc2, &local->loc);
+                if (ret) {
+                    gf_msg(this->name, GF_LOG_ERROR, ENOMEM, DHT_MSG_NO_MEMORY,
+                           "loc_copy failed %s", loc->path);
+
+                    goto err;
+                }
 
+                local->params = dict_ref(params);
+                local->rdev = rdev;
+                local->mode = mode;
+                local->umask = umask;
 
-	if (op_ret == -1)
-		goto out;
+                loc_wipe(&local->loc);
 
-	prev = cookie;
+                ret = dht_build_parent_loc(this, &local->loc, loc, &op_errno);
 
-	dht_itransform (this, prev->this, stbuf->st_ino, &stbuf->st_ino);
-	layout = dht_layout_for_subvol (this, prev->this);
+                if (ret) {
+                    gf_msg(this->name, GF_LOG_ERROR, ENOMEM, DHT_MSG_LOC_FAILED,
+                           "parent loc build failed");
+                    goto err;
+                }
 
-	if (!layout) {
-		gf_log (this->name, GF_LOG_ERROR,
-			"no pre-set layout for subvolume %s",
-			prev->this->name);
-		op_ret   = -1;
-		op_errno = EINVAL;
-		goto out;
-	}
+                ret = dht_mknod_lock(frame, subvol);
 
-	ret = inode_ctx_put (inode, this, (uint64_t)(long)layout);
-	if (ret != 0) {
-		gf_log (this->name, GF_LOG_ERROR,
-			"could not set inode context");
-		op_ret   = -1;
-		op_errno = EINVAL;
-		goto out;
-	}
+                if (ret < 0) {
+                    gf_msg(this->name, GF_LOG_ERROR, 0, DHT_MSG_INODE_LK_ERROR,
+                           "locking parent failed");
+                    goto err;
+                }
 
-out:
-	DHT_STACK_UNWIND (frame, op_ret, op_errno, inode, stbuf);
-	return 0;
-}
+                goto done;
+            }
+        }
+    }
 
+    dht_mknod_wind_to_avail_subvol(frame, this, subvol, loc, rdev, mode, umask,
+                                   params);
+
+done:
+    return 0;
+
+err:
+    op_errno = (op_errno == -1) ? errno : op_errno;
+    DHT_STACK_UNWIND(mknod, frame, -1, op_errno, NULL, NULL, NULL, NULL, NULL);
+
+    return 0;
+}
 
 int
-dht_mknod (call_frame_t *frame, xlator_t *this,
-	   loc_t *loc, mode_t mode, dev_t rdev)
+dht_symlink(call_frame_t *frame, xlator_t *this, const char *linkname,
+            loc_t *loc, mode_t umask, dict_t *params)
 {
-	xlator_t  *subvol = NULL;
-	int        op_errno = -1;
+    xlator_t *subvol = NULL;
+    int op_errno = -1;
+    dht_local_t *local = NULL;
 
+    VALIDATE_OR_GOTO(frame, err);
+    VALIDATE_OR_GOTO(this, err);
+    VALIDATE_OR_GOTO(loc, err);
 
-	VALIDATE_OR_GOTO (frame, err);
-	VALIDATE_OR_GOTO (this, err);
-	VALIDATE_OR_GOTO (loc, err);
+    local = dht_local_init(frame, loc, NULL, GF_FOP_SYMLINK);
+    if (!local) {
+        op_errno = ENOMEM;
+        goto err;
+    }
 
-	subvol = dht_subvol_get_hashed (this, loc);
-	if (!subvol) {
-		gf_log (this->name, GF_LOG_ERROR,
-			"no subvolume in layout for path=%s",
-			loc->path);
-		op_errno = ENOENT;
-		goto err;
-	}
+    subvol = dht_subvol_get_hashed(this, loc);
+    if (!subvol) {
+        gf_msg_debug(this->name, 0, "no subvolume in layout for path=%s",
+                     loc->path);
+        op_errno = EIO;
+        goto err;
+    }
 
-	gf_log (this->name, GF_LOG_DEBUG,
-		"creating %s on %s", loc->path, subvol->name);
+    gf_msg_trace(this->name, 0, "creating %s on %s", loc->path, subvol->name);
 
-	STACK_WIND (frame, dht_newfile_cbk,
-		    subvol, subvol->fops->mknod,
-		    loc, mode, rdev);
+    STACK_WIND_COOKIE(frame, dht_newfile_cbk, (void *)subvol, subvol,
+                      subvol->fops->symlink, linkname, loc, umask, params);
 
-	return 0;
+    return 0;
 
 err:
-	op_errno = (op_errno == -1) ? errno : op_errno;
-	DHT_STACK_UNWIND (frame, -1, op_errno, NULL, NULL);
+    op_errno = (op_errno == -1) ? errno : op_errno;
+    DHT_STACK_UNWIND(link, frame, -1, op_errno, NULL, NULL, NULL, NULL, NULL);
 
-	return 0;
+    return 0;
 }
 
-
+/*
+ * dht_unlink - unlink entry point of the distribute translator.
+ *
+ * Initialises the per-call dht_local_t and winds the unlink to the cached
+ * subvolume recorded in local->cached_subvol, with dht_unlink_cbk as the
+ * callback and the subvolume as cookie. @xflag is remembered in local->flags.
+ *
+ * Always returns 0; failures are reported by unwinding the frame with
+ * op_ret == -1 and an errno value.
+ */
 int
-dht_symlink (call_frame_t *frame, xlator_t *this,
-	     const char *linkname, loc_t *loc)
+dht_unlink(call_frame_t *frame, xlator_t *this, loc_t *loc, int xflag,
+           dict_t *xdata)
+{
+    xlator_t *cached_subvol = NULL;
+    int op_errno = -1;
+    dht_local_t *local = NULL;
+
+    VALIDATE_OR_GOTO(frame, err);
+    VALIDATE_OR_GOTO(this, err);
+    VALIDATE_OR_GOTO(loc, err);
+
+    local = dht_local_init(frame, loc, NULL, GF_FOP_UNLINK);
+    if (!local) {
+        op_errno = ENOMEM;
+
+        goto err;
+    }
+
+    cached_subvol = local->cached_subvol;
+    if (!cached_subvol) {
+        gf_msg_debug(this->name, 0, "no cached subvolume for path=%s",
+                     loc->path);
+        op_errno = EINVAL;
+        goto err;
+    }
+
+    local->flags = xflag;
+    STACK_WIND_COOKIE(frame, dht_unlink_cbk, cached_subvol, cached_subvol,
+                      cached_subvol->fops->unlink, loc, xflag, xdata);
+
+    return 0;
+err:
+    /* -1 means no explicit error was chosen above; use errno instead. */
+    op_errno = (op_errno == -1) ? errno : op_errno;
+    DHT_STACK_UNWIND(unlink, frame, -1, op_errno, NULL, NULL, NULL);
+
+    return 0;
+}
+
+/*
+ * Completion callback paired with dht_remove_stale_linkto() in the
+ * synctask_new() call made by dht_link_cbk. It only destroys the frame that
+ * was created for the cleanup task; @ret and @data are not used.
+ */
+static int
+dht_remove_stale_linkto_cbk(int ret, call_frame_t *sync_frame, void *data)
+{
+    DHT_STACK_DESTROY(sync_frame);
+    return 0;
+}
+
+/*
+ * Synctask body that removes a stale linkto file.
+ *
+ * @data is the cleanup call frame; its dht_local_t supplies the path
+ * (local->loc) and the subvolume holding the linkto (local->link_subvol).
+ * The unlink is issued through syncop_unlink() with an xdata dictionary
+ * filled by dht_fill_dict_to_avoid_unlink_of_migrating_file().
+ *
+ * Returns 0 on success (or when validation of the inputs fails), otherwise
+ * the negative error value of the step that failed.
+ */
+static int
+dht_remove_stale_linkto(void *data)
 {
-	xlator_t  *subvol = NULL;
-	int        op_errno = -1;
+    call_frame_t *frame = NULL;
+    dht_local_t *local = NULL;
+    xlator_t *this = NULL;
+    dict_t *xdata_in = NULL;
+    int ret = 0;
+
+    GF_VALIDATE_OR_GOTO("dht", data, out);
+
+    frame = data;
+    local = frame->local;
+    this = frame->this;
+    GF_VALIDATE_OR_GOTO("dht", this, out);
+    GF_VALIDATE_OR_GOTO("dht", local, out);
+    GF_VALIDATE_OR_GOTO("dht", local->link_subvol, out);
+
+    xdata_in = dict_new();
+    if (!xdata_in) {
+        /* Do not report success when nothing was attempted; errors in this
+         * function are negative errno values (see the -ret logging below). */
+        ret = -ENOMEM;
+        goto out;
+    }
+
+    ret = dht_fill_dict_to_avoid_unlink_of_migrating_file(xdata_in);
+    if (ret) {
+        gf_msg(this->name, GF_LOG_WARNING, -ret, 0,
+               "Failed to set keys for stale linkto "
+               "deletion on path %s",
+               local->loc.path);
+        goto out;
+    }
+
+    ret = syncop_unlink(local->link_subvol, &local->loc, xdata_in, NULL);
+    if (ret) {
+        gf_msg(this->name, GF_LOG_WARNING, -ret, 0,
+               "Removal of linkto failed"
+               " on path %s at subvol %s",
+               local->loc.path, local->link_subvol->name);
+    }
+out:
+    if (xdata_in)
+        dict_unref(xdata_in);
+    return ret;
+}
+
+/*
+ * Callback for the link fop wound by dht_link, dht_link_linkfile_cbk and
+ * dht_link2.
+ *
+ * On failure, if a linkto file was created for this request (local->linked),
+ * a separate frame is set up and a synctask is started to remove it, then
+ * the error is unwound. On success the parent's cached times are updated
+ * and, on the first call (local->call_cnt == 1), the results are saved in
+ * local so that a migration phase 1 / phase 2 check can re-issue the link
+ * through dht_link2. Returns 0 in all cases.
+ */
+static int
+dht_link_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int op_ret,
+             int op_errno, inode_t *inode, struct iatt *stbuf,
+             struct iatt *preparent, struct iatt *postparent, dict_t *xdata)
+{
+    dht_local_t *local = NULL;
+    int ret = -1;
+    gf_boolean_t stbuf_merged = _gf_false;
+    xlator_t *subvol = NULL;
+    call_frame_t *cleanup_frame = NULL;
+    dht_local_t *cleanup_local = NULL;
+
+    local = frame->local;
+
+    if (op_ret == -1) {
+        /* Remove the linkto if exists */
+        if (local->linked) {
+            cleanup_frame = create_frame(this, this->ctx->pool);
+            if (cleanup_frame) {
+                cleanup_local = dht_local_init(cleanup_frame, &local->loc2,
+                                               NULL, 0);
+                if (!cleanup_local || !local->link_subvol) {
+                    DHT_STACK_DESTROY(cleanup_frame);
+                    goto out;
+                }
+                cleanup_local->link_subvol = local->link_subvol;
+                FRAME_SU_DO(cleanup_frame, dht_local_t);
+                /* NOTE(review): the result of synctask_new is stored but
+                 * never checked; if it can fail without invoking the
+                 * callback, cleanup_frame is not destroyed - confirm. */
+                ret = synctask_new(this->ctx->env, dht_remove_stale_linkto,
+                                   dht_remove_stale_linkto_cbk, cleanup_frame,
+                                   cleanup_frame);
+            }
+        }
+        /* No continuation on DHT inode missing errors, as we should
+         * then have a good stbuf that states P2 happened. We would
+         * get inode missing if the file completed migration between
+         * the lookup and the link call */
+        goto out;
+    }
+
+    /* Update parent on success, even if P1/2 checks are positive.
+     * The second call on success will further update the parent */
+    if (local->loc.parent) {
+        dht_inode_ctx_time_update(local->loc.parent, this, preparent, 0);
+        dht_inode_ctx_time_update(local->loc.parent, this, postparent, 1);
+    }
+
+    /* Update linkto attrs, if this is the first call and non-P2,
+     * if we detect P2 then we need to trust the attrs from the
+     * second call, not the first */
+    if (local->linked == _gf_true &&
+        ((local->call_cnt == 1 && !IS_DHT_MIGRATION_PHASE2(stbuf)) ||
+         (local->call_cnt != 1 && IS_DHT_MIGRATION_PHASE2(&local->stbuf)))) {
+        dht_iatt_merge(this, &local->stbuf, stbuf);
+        stbuf_merged = _gf_true;
+        dht_linkfile_attr_heal(frame, this);
+    }
+
+    /* No further P1/2 checks if we are in the second iteration of
+     * the call */
+    if (local->call_cnt != 1) {
+        goto out;
+    } else {
+        /* Preserve the return values, in case the migration decides
+         * to recreate the link on the same subvol that the current
+         * hashed for the link was created on. */
+        dht_iatt_merge(this, &local->preparent, preparent);
+        dht_iatt_merge(this, &local->postparent, postparent);
+        if (!stbuf_merged) {
+            dht_iatt_merge(this, &local->stbuf, stbuf);
+            stbuf_merged = _gf_true;
+        }
+
+        local->inode = inode_ref(inode);
+    }
+
+    /* Save the outcome so dht_link2 can unwind or retry with it. */
+    local->op_ret = op_ret;
+    local->op_errno = op_errno;
+    local->rebalance.target_op_fn = dht_link2;
+    dht_set_local_rebalance(this, local, stbuf, preparent, postparent, xdata);
+
+    /* Check if the rebalance phase2 is true */
+    if (IS_DHT_MIGRATION_PHASE2(stbuf)) {
+        ret = dht_inode_ctx_get_mig_info(this, local->loc.inode, NULL, &subvol);
+        if (!subvol) {
+            /* Phase 2 of migration */
+            ret = dht_rebalance_complete_check(this, frame);
+            if (!ret)
+                return 0;
+        } else {
+            dht_link2(this, subvol, frame, 0);
+            return 0;
+        }
+    }
+
+    /* Check if the rebalance phase1 is true */
+    if (IS_DHT_MIGRATION_PHASE1(stbuf)) {
+        ret = dht_inode_ctx_get_mig_info(this, local->loc.inode, NULL, &subvol);
+        if (subvol) {
+            dht_link2(this, subvol, frame, 0);
+            return 0;
+        }
+        ret = dht_rebalance_in_progress_check(this, frame);
+        if (!ret)
+            return 0;
+    }
+out:
+    DHT_STRIP_PHASE1_FLAGS(stbuf);
 
+    dht_set_fixed_dir_stat(preparent);
+    dht_set_fixed_dir_stat(postparent);
+    /* xdata from the child is not propagated; NULL is unwound instead. */
+    DHT_STACK_UNWIND(link, frame, op_ret, op_errno, inode, stbuf, preparent,
+                     postparent, NULL);
 
-	VALIDATE_OR_GOTO (frame, err);
-	VALIDATE_OR_GOTO (this, err);
-	VALIDATE_OR_GOTO (loc, err);
+    return 0;
+}
 
-	subvol = dht_subvol_get_hashed (this, loc);
-	if (!subvol) {
-		gf_log (this->name, GF_LOG_ERROR,
-			"no subvolume in layout for path=%s",
-			loc->path);
-		op_errno = ENOENT;
-		goto err;
-	}
+/*
+ * Second stage of a link request, installed by dht_link_cbk as
+ * local->rebalance.target_op_fn and also called from it directly.
+ *
+ * @subvol is the subvolume the link should be repeated on and @ret is the
+ * value tested with we_are_not_migrating(). Depending on these the function
+ * either unwinds with the results saved in local, or winds a second link
+ * (local->call_cnt = 2) to @subvol. Returns 0 in all cases.
+ */
+static int
+dht_link2(xlator_t *this, xlator_t *subvol, call_frame_t *frame, int ret)
+{
+    dht_local_t *local = NULL;
+    int op_errno = EINVAL;
+
+    local = frame->local;
+    if (!local)
+        goto err;
+
+    op_errno = local->op_errno;
+
+    if (we_are_not_migrating(ret)) {
+        /* This dht xlator is not migrating the file. Unwind and
+         * pass on the original mode bits so the higher DHT layer
+         * can handle this.
+         */
+        dht_set_fixed_dir_stat(&local->preparent);
+        dht_set_fixed_dir_stat(&local->postparent);
+
+        DHT_STACK_UNWIND(link, frame, local->op_ret, op_errno, local->inode,
+                         &local->stbuf, &local->preparent, &local->postparent,
+                         NULL);
+        return 0;
+    }
+
+    if (subvol == NULL) {
+        op_errno = EINVAL;
+        goto err;
+    }
+
+    /* Second call to create link file could result in EEXIST as the
+     * first call created the linkto in the currently
+     * migrating subvol, which could be the new hashed subvol */
+    if (local->link_subvol == subvol) {
+        DHT_STRIP_PHASE1_FLAGS(&local->stbuf);
+        dht_set_fixed_dir_stat(&local->preparent);
+        dht_set_fixed_dir_stat(&local->postparent);
+        DHT_STACK_UNWIND(link, frame, 0, 0, local->inode, &local->stbuf,
+                         &local->preparent, &local->postparent, NULL);
 
-	gf_log (this->name, GF_LOG_DEBUG,
-		"creating %s on %s", loc->path, subvol->name);
+        return 0;
+    }
 
-	STACK_WIND (frame, dht_newfile_cbk,
-		    subvol, subvol->fops->symlink,
-		    linkname, loc);
+    /* Mark this as the second iteration so dht_link_cbk skips the
+     * migration checks when the reply arrives. */
+    local->call_cnt = 2;
 
-	return 0;
+    STACK_WIND(frame, dht_link_cbk, subvol, subvol->fops->link, &local->loc,
+               &local->loc2, local->xattr_req);
 
+    return 0;
 err:
-	op_errno = (op_errno == -1) ? errno : op_errno;
-	DHT_STACK_UNWIND (frame, -1, op_errno, NULL, NULL);
+    DHT_STACK_UNWIND(link, frame, -1, op_errno, NULL, NULL, NULL, NULL, NULL);
 
-	return 0;
+    return 0;
 }
 
+/*
+ * Callback passed to dht_linkfile_create() by dht_link.
+ *
+ * When the linkfile creation succeeded, the actual link fop is wound to
+ * local->linkfile.srcvol using the old and new locs saved in local
+ * (local->loc, local->loc2). When it failed, the error is unwound to the
+ * caller unchanged. Returns 0 in all cases.
+ */
+static int
+dht_link_linkfile_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+                      int op_ret, int op_errno, inode_t *inode,
+                      struct iatt *stbuf, struct iatt *preparent,
+                      struct iatt *postparent, dict_t *xdata)
+{
+    dht_local_t *local = NULL;
+    xlator_t *srcvol = NULL;
 
-int
-dht_unlink (call_frame_t *frame, xlator_t *this, loc_t *loc)
-{
-	xlator_t    *cached_subvol = NULL;
-	xlator_t    *hashed_subvol = NULL;
-	int          op_errno = -1;
-	dht_local_t *local = NULL;
-
-
-	VALIDATE_OR_GOTO (frame, err);
-	VALIDATE_OR_GOTO (this, err);
-	VALIDATE_OR_GOTO (loc, err);
-
-	cached_subvol = dht_subvol_get_cached (this, loc->inode);
-	if (!cached_subvol) {
-		gf_log (this->name, GF_LOG_ERROR,
-			"no cached subvolume for path=%s", loc->path);
-		op_errno = EINVAL;
-		goto err;
-	}
-
-	hashed_subvol = dht_subvol_get_hashed (this, loc);
-	if (!hashed_subvol) {
-		gf_log (this->name, GF_LOG_ERROR,
-			"no subvolume in layout for path=%s",
-			loc->path);
-		op_errno = EINVAL;
-		goto err;
-	}
-
-	local = dht_local_init (frame);
-	if (!local) {
-		op_errno = ENOMEM;
-		gf_log (this->name, GF_LOG_ERROR,
-			"memory allocation failed :(");
-		goto err;
-	}
-
-	local->call_cnt = 1;
-	if (hashed_subvol != cached_subvol)
-		local->call_cnt++;
-
-	STACK_WIND (frame, dht_err_cbk,
-		    cached_subvol, cached_subvol->fops->unlink, loc);
-
-	if (hashed_subvol != cached_subvol)
-		STACK_WIND (frame, dht_err_cbk,
-			    hashed_subvol, hashed_subvol->fops->unlink, loc);
-
-	return 0;
+    if (op_ret == -1)
+        goto err;
+
+    local = frame->local;
+    srcvol = local->linkfile.srcvol;
+
+    /* Linkfile is in place; now create the real hard link on srcvol. */
+    STACK_WIND(frame, dht_link_cbk, srcvol, srcvol->fops->link, &local->loc,
+               &local->loc2, local->xattr_req);
+
+    return 0;
 
 err:
-	op_errno = (op_errno == -1) ? errno : op_errno;
-	DHT_STACK_UNWIND (frame, -1, op_errno);
+    DHT_STRIP_PHASE1_FLAGS(stbuf);
+    dht_set_fixed_dir_stat(preparent);
+    dht_set_fixed_dir_stat(postparent);
+    DHT_STACK_UNWIND(link, frame, op_ret, op_errno, inode, stbuf, preparent,
+                     postparent, xdata);
 
-	return 0;
+    return 0;
 }
 
+/*
+ * dht_link - hard-link entry point of the distribute translator.
+ *
+ * The cached subvolume of @oldloc (from local->cached_subvol) is where the
+ * link is made; the hashed subvolume of @newloc is where the new name
+ * hashes. When both are the same the link fop is wound directly. When they
+ * differ, a linkfile is first created through dht_linkfile_create() and the
+ * link itself is issued from dht_link_linkfile_cbk.
+ *
+ * Always returns 0; failures are reported by unwinding the frame with
+ * op_ret == -1 and an errno value.
+ */
+int
+dht_link(call_frame_t *frame, xlator_t *this, loc_t *oldloc, loc_t *newloc,
+         dict_t *xdata)
+{
+    dht_local_t *local = NULL;
+    xlator_t *hashed_subvol = NULL;
+    xlator_t *cached_subvol = NULL;
+    int op_errno = -1;
+
+    VALIDATE_OR_GOTO(frame, err);
+    VALIDATE_OR_GOTO(this, err);
+    VALIDATE_OR_GOTO(oldloc, err);
+    VALIDATE_OR_GOTO(newloc, err);
+
+    local = dht_local_init(frame, oldloc, NULL, GF_FOP_LINK);
+    if (local == NULL) {
+        op_errno = ENOMEM;
+        goto err;
+    }
+    local->call_cnt = 1;
+
+    cached_subvol = local->cached_subvol;
+    if (cached_subvol == NULL) {
+        gf_msg_debug(this->name, 0, "no cached subvolume for path=%s",
+                     oldloc->path);
+        op_errno = ENOENT;
+        goto err;
+    }
+
+    hashed_subvol = dht_subvol_get_hashed(this, newloc);
+    if (hashed_subvol == NULL) {
+        gf_msg_debug(this->name, 0, "no subvolume in layout for path=%s",
+                     newloc->path);
+        op_errno = EIO;
+        goto err;
+    }
+
+    /* The callbacks re-issue the fop from local, so keep our own copies
+     * of the target loc and the request dictionary. */
+    if (loc_copy(&local->loc2, newloc) == -1) {
+        op_errno = ENOMEM;
+        goto err;
+    }
+
+    if (xdata != NULL)
+        local->xattr_req = dict_ref(xdata);
+
+    if (hashed_subvol == cached_subvol) {
+        /* New name hashes to where the file lives: link directly. */
+        STACK_WIND(frame, dht_link_cbk, cached_subvol,
+                   cached_subvol->fops->link, oldloc, newloc, xdata);
+        return 0;
+    }
+
+    /* Different subvolumes: create the linkfile first. */
+    gf_uuid_copy(local->gfid, oldloc->inode->gfid);
+    dht_linkfile_create(frame, dht_link_linkfile_cbk, this, cached_subvol,
+                        hashed_subvol, newloc);
+    return 0;
+
+err:
+    /* -1 means no explicit error was chosen above; use errno instead. */
+    if (op_errno == -1)
+        op_errno = errno;
+    DHT_STACK_UNWIND(link, frame, -1, op_errno, NULL, NULL, NULL, NULL, NULL);
+
+    return 0;
+}
 
 int
-dht_link_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
-	      int op_ret, int op_errno,
-	      inode_t *inode, struct stat *stbuf)
+dht_create_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int op_ret,
+               int op_errno, fd_t *fd, inode_t *inode, struct iatt *stbuf,
+               struct iatt *preparent, struct iatt *postparent, dict_t *xdata)
 {
-        call_frame_t *prev = NULL;
-	dht_layout_t *layout = NULL;
-	dht_local_t  *local = NULL;
+    xlator_t *prev = NULL;
+    int ret = -1;
+    dht_local_t *local = NULL;
+    gf_boolean_t parent_layout_changed = _gf_false;
+    char pgfid[GF_UUID_BUF_SIZE] = {0};
+    xlator_t *subvol = NULL;
+
+    local = frame->local;
+
+    local = frame->local;
+    if (!local) {
+        op_ret = -1;
+        op_errno = EINVAL;
+        goto out;
+    }
+
+    if (op_ret == -1) {
+        local->op_errno = op_errno;
+        parent_layout_changed = (xdata &&
+                                 dict_get(xdata, GF_PREOP_CHECK_FAILED))
+                                    ? _gf_true
+                                    : _gf_false;
+
+        if (parent_layout_changed) {
+            if (local && local->lock[0].layout.parent_layout.locks) {
+                /* Returning failure as the layout could not be fixed even under
+                 * the lock */
+                goto out;
+            }
 
-        prev = cookie;
-	local = frame->local;
+            gf_uuid_unparse(local->loc.parent->gfid, pgfid);
+            gf_msg(this->name, GF_LOG_INFO, 0, DHT_MSG_PARENT_LAYOUT_CHANGED,
+                   "create (%s/%s) (path: %s): parent layout "
+                   "changed. Attempting a layout refresh and then a "
+                   "retry",
+                   pgfid, local->loc.name, local->loc.path);
 
-        if (op_ret == -1)
+            /*
+              dht_refresh_layout needs directory info in local->loc.Hence,
+              storing the parent_loc in local->loc and storing the create
+              context in local->loc2. We will restore this information in
+              dht_creation_do.
+             */
+
+            loc_wipe(&local->loc2);
+
+            ret = loc_copy(&local->loc2, &local->loc);
+            if (ret) {
+                gf_msg(this->name, GF_LOG_ERROR, ENOMEM, DHT_MSG_NO_MEMORY,
+                       "loc_copy failed %s", local->loc.path);
+
+                goto out;
+            }
+
+            loc_wipe(&local->loc);
+
+            ret = dht_build_parent_loc(this, &local->loc, &local->loc2,
+                                       &op_errno);
+
+            if (ret) {
+                gf_msg(this->name, GF_LOG_ERROR, ENOMEM, DHT_MSG_LOC_FAILED,
+                       "parent loc build failed");
+                goto out;
+            }
+
+            subvol = dht_subvol_get_hashed(this, &local->loc2);
+
+            ret = dht_create_lock(frame, subvol);
+            if (ret < 0) {
+                gf_msg(this->name, GF_LOG_ERROR, 0, DHT_MSG_INODE_LK_ERROR,
+                       "locking parent failed");
                 goto out;
+            }
+
+            return 0;
+        }
+
+        goto out;
+    }
+
+    prev = cookie;
+
+    if (local->loc.parent) {
+        dht_inode_ctx_time_update(local->loc.parent, this, preparent, 0);
+
+        dht_inode_ctx_time_update(local->loc.parent, this, postparent, 1);
+    }
 
-	layout = dht_layout_for_subvol (this, prev->this);
-	if (!layout) {
-		gf_log (this->name, GF_LOG_ERROR,
-			"no pre-set layout for subvolume %s",
-			prev->this->name);
-		op_ret   = -1;
-		op_errno = EINVAL;
-		goto out;
-	}
+    ret = dht_fd_ctx_set(this, fd, prev);
+    if (ret != 0) {
+        gf_msg_debug(this->name, 0,
+                     "Possible fd leak. "
+                     "Could not set fd ctx for subvol %s",
+                     prev->name);
+    }
 
-	stbuf->st_ino = local->loc.inode->ino;
+    ret = dht_layout_preset(this, prev, inode);
+    if (ret != 0) {
+        gf_msg_debug(this->name, 0, "could not set preset layout for subvol %s",
+                     prev->name);
+        op_ret = -1;
+        op_errno = EINVAL;
+        goto out;
+    }
 
+    local->op_errno = op_errno;
+
+    if (local->linked == _gf_true) {
+        local->stbuf = *stbuf;
+        dht_linkfile_attr_heal(frame, this);
+    }
 out:
-        DHT_STACK_UNWIND (frame, op_ret, op_errno, inode, stbuf);
 
-	return 0;
+    DHT_STRIP_PHASE1_FLAGS(stbuf);
+    dht_set_fixed_dir_stat(preparent);
+    dht_set_fixed_dir_stat(postparent);
+
+    if (local && local->lock[0].layout.parent_layout.locks) {
+        /* store op_errno for failure case*/
+        local->op_errno = op_errno;
+        local->refresh_layout_unlock(frame, this, op_ret, 1);
+
+        if (op_ret == 0) {
+            DHT_STACK_UNWIND(create, frame, op_ret, op_errno, fd, inode, stbuf,
+                             preparent, postparent, xdata);
+        }
+    } else {
+        DHT_STACK_UNWIND(create, frame, op_ret, op_errno, fd, inode, stbuf,
+                         preparent, postparent, xdata);
+    }
+    return 0;
 }
 
+/*
+ * Callback passed to dht_linkfile_create() by
+ * dht_create_wind_to_avail_subvol.
+ *
+ * Once the linkfile exists, the linkfile-specific keys are dropped from
+ * local->params and the real create is wound to local->cached_subvol with
+ * dht_create_cbk as the callback. On any failure the request is finished
+ * either through local->refresh_layout_unlock() (when the parent layout lock
+ * is held) or by unwinding the create with op_ret == -1.
+ * Returns 0 in all cases.
+ */
+static int
+dht_create_linkfile_create_cbk(call_frame_t *frame, void *cookie,
+                               xlator_t *this, int32_t op_ret, int32_t op_errno,
+                               inode_t *inode, struct iatt *stbuf,
+                               struct iatt *preparent, struct iatt *postparent,
+                               dict_t *xdata)
+{
+    dht_local_t *local = frame->local;
+    dht_conf_t *conf = NULL;
+    xlator_t *cached_subvol = NULL;
+
+    if (local == NULL) {
+        op_errno = EINVAL;
+        goto err;
+    }
+
+    if (op_ret == -1) {
+        local->op_errno = op_errno;
+        goto err;
+    }
+
+    conf = this->private;
+    if (conf == NULL) {
+        op_errno = EINVAL;
+        local->op_errno = op_errno;
+        goto err;
+    }
+
+    /* These keys were only meant for the linkfile creation. */
+    if (local->params != NULL) {
+        dict_del(local->params, conf->link_xattr_name);
+        dict_del(local->params, GLUSTERFS_INTERNAL_FOP_KEY);
+    }
+
+    cached_subvol = local->cached_subvol;
+    STACK_WIND_COOKIE(frame, dht_create_cbk, cached_subvol, cached_subvol,
+                      cached_subvol->fops->create, &local->loc, local->flags,
+                      local->mode, local->umask, local->fd, local->params);
+    return 0;
+
+err:
+    if (local == NULL || !local->lock[0].layout.parent_layout.locks) {
+        DHT_STACK_UNWIND(create, frame, -1, op_errno, NULL, NULL, NULL, NULL,
+                         NULL, NULL);
+        return 0;
+    }
+
+    local->refresh_layout_unlock(frame, this, -1, 1);
+    return 0;
+}
 
-int
-dht_link_linkfile_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
-		       int op_ret, int op_errno,
-		       inode_t *inode, struct stat *stbuf)
+/*
+ * Wind a create to @subvol, or to another subvolume when @subvol is
+ * reported as filled by dht_is_subvol_filled().
+ *
+ * If @subvol is not filled, or dht_free_disk_available_subvol() returns
+ * @subvol itself, the parent layout is added to the request through
+ * dht_set_parent_layout_in_dict() and the create is wound to @subvol.
+ * Otherwise the returned subvolume becomes local->cached_subvol, @subvol
+ * becomes local->hashed_subvol, and a linkfile is created first; the create
+ * then continues in dht_create_linkfile_create_cbk. Always returns 0.
+ */
+static int
+dht_create_wind_to_avail_subvol(call_frame_t *frame, xlator_t *this,
+                                xlator_t *subvol, loc_t *loc, int32_t flags,
+                                mode_t mode, mode_t umask, fd_t *fd,
+                                dict_t *params)
 {
-	dht_local_t  *local = NULL;
-	xlator_t     *srcvol = NULL;
+    dht_local_t *local = NULL;
+    xlator_t *avail_subvol = NULL;
 
+    local = frame->local;
 
-	if (op_ret == -1)
-		goto err;
+    if (!dht_is_subvol_filled(this, subvol)) {
+        gf_msg_debug(this->name, 0, "creating %s on %s", loc->path,
+                     subvol->name);
 
-	local = frame->local;
-	srcvol = local->linkfile.srcvol;
+        dht_set_parent_layout_in_dict(loc, this, local);
 
-	STACK_WIND (frame, dht_link_cbk,
-		    srcvol, srcvol->fops->link,
-		    &local->loc, &local->loc2);
+        STACK_WIND_COOKIE(frame, dht_create_cbk, subvol, subvol,
+                          subvol->fops->create, loc, flags, mode, umask, fd,
+                          params);
 
-	return 0;
+    } else {
+        avail_subvol = dht_free_disk_available_subvol(this, subvol, local);
 
-err:
-	DHT_STACK_UNWIND (frame, op_ret, op_errno, inode, stbuf);
+        if (avail_subvol != subvol) {
+            /* Data goes to avail_subvol; subvol only gets a linkfile. */
+            local->cached_subvol = avail_subvol;
+            local->hashed_subvol = subvol;
 
-	return 0;
-}
+            gf_msg_debug(this->name, 0, "creating %s on %s (link at %s)",
+                         loc->path, avail_subvol->name, subvol->name);
 
+            dht_linkfile_create(frame, dht_create_linkfile_create_cbk, this,
+                                avail_subvol, subvol, loc);
 
-int
-dht_link (call_frame_t *frame, xlator_t *this,
-	  loc_t *oldloc, loc_t *newloc)
-{
-	xlator_t    *cached_subvol = NULL;
-	xlator_t    *hashed_subvol = NULL;
-	int          op_errno = -1;
-	int          ret = -1;
-	dht_local_t *local = NULL;
-
-
-	VALIDATE_OR_GOTO (frame, err);
-	VALIDATE_OR_GOTO (this, err);
-	VALIDATE_OR_GOTO (oldloc, err);
-	VALIDATE_OR_GOTO (newloc, err);
-
-	cached_subvol = dht_subvol_get_cached (this, oldloc->inode);
-	if (!cached_subvol) {
-		gf_log (this->name, GF_LOG_ERROR,
-			"no cached subvolume for path=%s", oldloc->path);
-		op_errno = EINVAL;
-		goto err;
-	}
-
-	hashed_subvol = dht_subvol_get_hashed (this, newloc);
-	if (!hashed_subvol) {
-		gf_log (this->name, GF_LOG_ERROR,
-			"no subvolume in layout for path=%s",
-			newloc->path);
-		op_errno = EINVAL;
-		goto err;
-	}
-
-	local = dht_local_init (frame);
-	if (!local) {
-		op_errno = ENOMEM;
-		gf_log (this->name, GF_LOG_ERROR,
-			"memory allocation failed :(");
-		goto err;
-	}
-
-	ret = loc_copy (&local->loc, oldloc);
-	if (ret == -1) {
-		op_errno = ENOMEM;
-		gf_log (this->name, GF_LOG_ERROR,
-			"memory allocation failed :(");
-		goto err;
-	}
-
-	ret = loc_copy (&local->loc2, newloc);
-	if (ret == -1) {
-		op_errno = ENOMEM;
-		gf_log (this->name, GF_LOG_ERROR,
-			"memory allocation failed :(");
-		goto err;
-	}
-
-	if (hashed_subvol != cached_subvol) {
-		dht_linkfile_create (frame, dht_link_linkfile_cbk,
-				     cached_subvol, hashed_subvol, newloc);
-	} else {
-		STACK_WIND (frame, dht_link_cbk,
-			    cached_subvol, cached_subvol->fops->link,
-			    oldloc, newloc);
-	}
-
-	return 0;
+            goto out;
+        }
 
-err:
-	op_errno = (op_errno == -1) ? errno : op_errno;
-	DHT_STACK_UNWIND (frame, -1, op_errno, NULL, NULL);
+        gf_msg_debug(this->name, 0, "creating %s on %s", loc->path,
+                     subvol->name);
 
-	return 0;
-}
+        dht_set_parent_layout_in_dict(loc, this, local);
 
+        STACK_WIND_COOKIE(frame, dht_create_cbk, subvol, subvol,
+                          subvol->fops->create, loc, flags, mode, umask, fd,
+                          params);
+    }
+out:
+    return 0;
+}
 
 int
-dht_create_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
-		 int op_ret, int op_errno,
-		 fd_t *fd, inode_t *inode, struct stat *stbuf)
+dht_build_parent_loc(xlator_t *this, loc_t *parent, loc_t *child,
+                     int32_t *op_errno)
 {
-	call_frame_t *prev = NULL;
-	dht_layout_t *layout = NULL;
-	int           ret = -1;
+    inode_table_t *table = NULL;
+    int ret = -1;
+    /* Fill parent's inode and gfid from child; returns 0, or -1 on failure. */
+    if (!parent || !child) {
+        if (op_errno)
+            *op_errno = EINVAL;
+        goto out;
+    }
+    /* Fast path: child already carries a pointer to its parent inode. */
+    if (child->parent) {
+        parent->inode = inode_ref(child->parent);
+        if (!parent->inode) {
+            if (op_errno)
+                *op_errno = EINVAL;
+            goto out;
+        }
 
+        gf_uuid_copy(parent->gfid, child->pargfid);
 
-	if (op_ret == -1)
-		goto out;
+        ret = 0;
 
-	prev = cookie;
+        goto out;
+    } else {
+        if (gf_uuid_is_null(child->pargfid)) {
+            if (op_errno)
+                *op_errno = EINVAL;
+            goto out;
+        }
 
-	dht_itransform (this, prev->this, stbuf->st_ino, &stbuf->st_ino);
-	layout = dht_layout_for_subvol (this, prev->this);
+        table = this->itable;
 
-	if (!layout) {
-		gf_log (this->name, GF_LOG_ERROR,
-			"no pre-set layout for subvolume %s",
-			prev->this->name);
-		op_ret   = -1;
-		op_errno = EINVAL;
-		goto out;
-	}
+        if (!table) {
+            /* Leave even when op_errno is NULL; never search a NULL table. */
+            if (op_errno)
+                *op_errno = EINVAL;
+            goto out;
+        }
 
-	ret = inode_ctx_put (inode, this, (uint64_t)(long)layout);
-	if (ret != 0) {
-		gf_log (this->name, GF_LOG_ERROR,
-			"could not set inode context");
-		op_ret   = -1;
-		op_errno = EINVAL;
-		goto out;
-	}
+        parent->inode = inode_find(table, child->pargfid);
+
+        if (!parent->inode) {
+            /* A lookup miss is a failure even when op_errno is NULL. */
+            if (op_errno)
+                *op_errno = EINVAL;
+            goto out;
+        }
+
+        gf_uuid_copy(parent->gfid, child->pargfid);
+
+        ret = 0;
+    }
 
 out:
-	DHT_STACK_UNWIND (frame, op_ret, op_errno, fd, inode, stbuf);
-	return 0;
+    return ret;
 }
 
-
-int
-dht_create (call_frame_t *frame, xlator_t *this,
-	    loc_t *loc, int32_t flags, mode_t mode, fd_t *fd)
+static int32_t /* installed as local->refresh_layout_done by the lock cbk */
+dht_create_do(call_frame_t *frame)
 {
-	xlator_t  *subvol = NULL;
-	int        op_errno = -1;
+    dht_local_t *local = NULL;
+    dht_layout_t *refreshed = NULL;
+    xlator_t *subvol = NULL;
+    xlator_t *this = NULL;
+    dht_conf_t *conf = NULL;
+    dht_methods_t *methods = NULL;
 
+    local = frame->local;
 
-	VALIDATE_OR_GOTO (frame, err);
-	VALIDATE_OR_GOTO (this, err);
-	VALIDATE_OR_GOTO (loc, err);
+    this = THIS;
 
-	subvol = dht_subvol_get_hashed (this, loc);
-	if (!subvol) {
-		gf_log (this->name, GF_LOG_ERROR,
-			"no subvolume in layout for path=%s",
-			loc->path);
-		op_errno = ENOENT;
-		goto err;
-	}
+    conf = this->private;
 
-	gf_log (this->name, GF_LOG_DEBUG,
-		"creating %s on %s", loc->path, subvol->name);
+    GF_VALIDATE_OR_GOTO(this->name, conf, err);
 
-	STACK_WIND (frame, dht_create_cbk,
-		    subvol, subvol->fops->create,
-		    loc, flags, mode, fd);
+    methods = &(conf->methods);
 
-	return 0;
+    /* We don't need parent_loc anymore */
+    loc_wipe(&local->loc);
 
+    /* Restore the create loc stashed in loc2; stop if the copy fails. */
+    if (loc_copy(&local->loc, &local->loc2)) {
+        local->op_errno = ENOMEM;
+        goto err;
+    }
+    loc_wipe(&local->loc2);
+    refreshed = local->selfheal.refreshed_layout;
+    subvol = methods->layout_search(this, refreshed, local->loc.name);
+    if (!subvol) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, DHT_MSG_HASHED_SUBVOL_GET_FAILED,
+               "no subvolume in "
+               "layout for path=%s",
+               local->loc.path);
+        local->op_errno = ENOENT;
+        goto err;
+    }
+    /* Wind the create to the subvol picked from the refreshed layout. */
+    dht_create_wind_to_avail_subvol(frame, this, subvol, &local->loc,
+                                    local->flags, local->mode, local->umask,
+                                    local->fd, local->params);
+    return 0;
 err:
-	op_errno = (op_errno == -1) ? errno : op_errno;
-	DHT_STACK_UNWIND (frame, -1, op_errno, NULL, NULL, NULL);
+    local->refresh_layout_unlock(frame, this, -1, 1);
 
-	return 0;
+    return 0;
 }
 
+static int32_t
+dht_create_unlock_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+                      int32_t op_ret, int32_t op_errno, dict_t *xdata)
+{
+    DHT_STACK_DESTROY(frame); /* unlock result is ignored; drop the frame */
+    return 0;
+}
 
-int
-dht_mkdir_selfheal_cbk (call_frame_t *frame, void *cookie,
-			xlator_t *this,
-			int32_t op_ret, int32_t op_errno)
+static int32_t
+dht_create_finish(call_frame_t *frame, xlator_t *this, int op_ret,
+                  int invoke_cbk)
+{
+    dht_local_t *local = NULL, *lock_local = NULL;
+    call_frame_t *lock_frame = NULL;
+    int lock_count = 0;
+    /* Release the parent-layout inodelk if held; unwind create on failure. */
+    local = frame->local;
+    lock_count = dht_lock_count(local->lock[0].layout.parent_layout.locks,
+                                local->lock[0].layout.parent_layout.lk_count);
+    if (lock_count == 0)
+        goto done;
+    /* The unlock runs on a copy of the frame; 'frame' is unwound below. */
+    lock_frame = copy_frame(frame);
+    if (lock_frame == NULL) {
+        goto done;
+    }
+
+    lock_local = dht_local_init(lock_frame, &local->loc, NULL,
+                                lock_frame->root->op);
+    if (lock_local == NULL) {
+        goto done;
+    }
+    /* Move the lock array from local to lock_local before unlocking. */
+    lock_local->lock[0]
+        .layout.parent_layout.locks = local->lock[0].layout.parent_layout.locks;
+    lock_local->lock[0].layout.parent_layout.lk_count =
+        local->lock[0].layout.parent_layout.lk_count;
+
+    local->lock[0].layout.parent_layout.locks = NULL;
+    local->lock[0].layout.parent_layout.lk_count = 0;
+
+    dht_unlock_inodelk(lock_frame,
+                       lock_local->lock[0].layout.parent_layout.locks,
+                       lock_local->lock[0].layout.parent_layout.lk_count,
+                       dht_create_unlock_cbk);
+    lock_frame = NULL;
+
+done:
+    if (lock_frame != NULL) {
+        DHT_STACK_DESTROY(lock_frame);
+    }
+    /* op_ret == 0 means nothing to unwind here; invoke_cbk is not read. */
+    if (op_ret == 0)
+        return 0;
+
+    DHT_STACK_UNWIND(create, frame, op_ret, local->op_errno, NULL, NULL, NULL,
+                     NULL, NULL, NULL);
+    return 0;
+}
+
+static int32_t
+dht_create_lock_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+                    int32_t op_ret, int32_t op_errno, dict_t *xdata)
 {
-	dht_local_t   *local = NULL;
-	dht_layout_t  *layout = NULL;
+    dht_local_t *local = NULL;
+    /* inodelk reply: on success start a layout refresh with create hooks. */
+    local = frame->local;
+
+    if (!local) {
+        goto err;
+    }
+
+    if (op_ret < 0) {
+        gf_msg("DHT", GF_LOG_ERROR, 0, DHT_MSG_INODE_LK_ERROR,
+               "Create lock failed for file: %s", local->loc2.name);
+        /* Keep the errno; dht_create_finish unwinds with local->op_errno. */
+        local->op_errno = op_errno;
 
+        goto err;
+    }
 
-	local = frame->local;
-	layout = local->selfheal.layout;
+    local->refresh_layout_unlock = dht_create_finish;
 
-	if (op_ret == 0) {
-		inode_ctx_put (local->inode, this, (uint64_t)(long)layout);
-		local->selfheal.layout = NULL;
-		local->stbuf.st_ino = local->st_ino;
-	}
+    local->refresh_layout_done = dht_create_do;
 
-	DHT_STACK_UNWIND (frame, op_ret, op_errno,
-			  local->inode, &local->stbuf);
+    dht_refresh_layout(frame);
 
-	return 0;
+    return 0;
+err:
+    if (local)
+        dht_create_finish(frame, this, -1, 0);
+    else
+        DHT_STACK_UNWIND(create, frame, -1, EINVAL, NULL, NULL, NULL, NULL,
+                         NULL, NULL);
+    return 0;
 }
 
-int
-dht_mkdir_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
-	       int op_ret, int op_errno, inode_t *inode, struct stat *stbuf)
-{
-	dht_local_t  *local = NULL;
-	int           this_call_cnt = 0;
-	int           ret = -1;
-	call_frame_t *prev = NULL;
-	dht_layout_t *layout = NULL;
-
-	local = frame->local;
-	prev  = cookie;
-	layout = local->layout;
-
-	LOCK (&frame->lock);
-	{
-		ret = dht_layout_merge (this, layout, prev->this,
-					op_ret, op_errno, NULL);
-
-		if (op_ret == -1) {
-			local->op_errno = op_errno;
-			goto unlock;
-		}
-		dht_stat_merge (this, &local->stbuf, stbuf, prev->this);
-	}
-unlock:
-	UNLOCK (&frame->lock);
+int32_t
+dht_create_lock(call_frame_t *frame, xlator_t *subvol)
+{
+    dht_local_t *local = NULL;
+    int count = 1, ret = -1;
+    dht_lock_t **lk_array = NULL;
 
-	this_call_cnt = dht_frame_return (frame);
-	if (is_last_call (this_call_cnt)) {
-		local->layout = NULL;
-		dht_selfheal_new_directory (frame, dht_mkdir_selfheal_cbk,
-					    layout);
-	}
+    GF_VALIDATE_OR_GOTO("dht", frame, err);
+    GF_VALIDATE_OR_GOTO(frame->this->name, frame->local, err);
 
-        return 0;
+    local = frame->local;
+    /* Request one F_RDLCK inodelk on local->loc; 0 if issued, -1 if not. */
+    lk_array = GF_CALLOC(count, sizeof(*lk_array), gf_common_mt_pointer);
+
+    if (lk_array == NULL)
+        goto err;
+
+    lk_array[0] = dht_lock_new(frame->this, subvol, &local->loc, F_RDLCK,
+                               DHT_LAYOUT_HEAL_DOMAIN, NULL,
+                               IGNORE_ENOENT_ESTALE);
+
+    if (lk_array[0] == NULL)
+        goto err;
+    /* Publish the array in local before locking; undone below on failure. */
+    local->lock[0].layout.parent_layout.locks = lk_array;
+    local->lock[0].layout.parent_layout.lk_count = count;
+
+    ret = dht_blocking_inodelk(frame, lk_array, count, dht_create_lock_cbk);
+
+    if (ret < 0) {
+        local->lock[0].layout.parent_layout.locks = NULL;
+        local->lock[0].layout.parent_layout.lk_count = 0;
+        goto err;
+    }
+
+    return 0;
+err:
+    if (lk_array != NULL) {
+        dht_lock_array_free(lk_array, count);
+        GF_FREE(lk_array);
+    }
+
+    return -1;
 }
 
 int
-dht_mkdir_hashed_cbk (call_frame_t *frame, void *cookie, 
-		      xlator_t *this, int op_ret, int op_errno, 
-		      inode_t *inode, struct stat *stbuf)
-{
-	dht_local_t  *local = NULL;
-	int           ret = -1;
-	call_frame_t *prev = NULL;
-	dht_layout_t *layout = NULL;
-	dht_conf_t   *conf = NULL;
-	int           i = 0;
-	xlator_t     *hashed_subvol = NULL;
-
-	local = frame->local;
-	prev  = cookie;
-	layout = local->layout;
-	conf = this->private;
-	hashed_subvol = local->hashed_subvol;
-
-	ret = dht_layout_merge (this, layout, prev->this,
-				op_ret, op_errno, NULL);
-
-	if (op_ret == -1) {
-		local->op_errno = op_errno;
-		goto err;
-	}
-	local->op_ret = 0;
-
-	dht_stat_merge (this, &local->stbuf, stbuf, prev->this);
-
-	local->st_ino = local->stbuf.st_ino;
-
-	local->call_cnt = conf->subvolume_cnt - 1;
-	
-	if (local->call_cnt == 0) {
-		local->layout = NULL;
-		dht_selfheal_directory (frame, dht_mkdir_selfheal_cbk,
-					&local->loc, layout);
-	}
-	for (i = 0; i < conf->subvolume_cnt; i++) {
-		if (conf->subvolumes[i] == hashed_subvol)
-			continue;
-		STACK_WIND (frame, dht_mkdir_cbk,
-			    conf->subvolumes[i],
-			    conf->subvolumes[i]->fops->mkdir,
-			    &local->loc, local->mode);
-	}
-	return 0;
+dht_set_parent_layout_in_dict(loc_t *loc, xlator_t *this, dht_local_t *local)
+{
+    dht_conf_t *conf = this->private;
+    dht_layout_t *parent_layout = NULL;
+    int *parent_disk_layout = NULL;
+    xlator_t *hashed_subvol = NULL;
+    char pgfid[GF_UUID_BUF_SIZE] = {0};
+    int ret = 0;
+    /* Store the parent's on-disk layout for the hashed subvol in params. */
+    gf_uuid_unparse(loc->parent->gfid, pgfid);
+    parent_layout = dht_layout_get(this, loc->parent);
+    hashed_subvol = dht_subvol_get_hashed(this, loc);
+
+    ret = dht_disk_layout_extract_for_subvol(this, parent_layout, hashed_subvol,
+                                             &parent_disk_layout);
+    if (ret == -1) {
+        gf_msg(this->name, GF_LOG_WARNING, local->op_errno,
+               DHT_MSG_PARENT_LAYOUT_CHANGED,
+               "%s (%s/%s) (path: %s): "
+               "extracting in-memory layout of parent failed. ",
+               gf_fop_list[local->fop], pgfid, loc->name, loc->path);
+        goto err;
+    }
+
+    ret = dict_set_str_sizen(local->params, GF_PREOP_PARENT_KEY,
+                             conf->xattr_name);
+    if (ret < 0) {
+        gf_msg(this->name, GF_LOG_WARNING, local->op_errno,
+               DHT_MSG_PARENT_LAYOUT_CHANGED,
+               "%s (%s/%s) (path: %s): "
+               "setting %s key in params dictionary failed. ",
+               gf_fop_list[local->fop], pgfid, loc->name, loc->path,
+               GF_PREOP_PARENT_KEY);
+        goto err;
+    }
+    /* Once stored, the buffer is left to the dict (see err: below). */
+    ret = dict_set_bin(local->params, conf->xattr_name, parent_disk_layout,
+                       4 * 4);
+    if (ret < 0) {
+        gf_msg(this->name, GF_LOG_WARNING, local->op_errno,
+               DHT_MSG_PARENT_LAYOUT_CHANGED,
+               "%s (%s/%s) (path: %s): "
+               "setting parent-layout in params dictionary failed. ",
+               gf_fop_list[local->fop], pgfid, loc->name, loc->path);
+        goto err;
+    }
 err:
-	DHT_STACK_UNWIND (frame, -1, op_errno, NULL, NULL);
-        return 0;
+    if (ret < 0 && parent_disk_layout != NULL) /* not stored: free it here */
+        GF_FREE(parent_disk_layout);
+    dht_layout_unref(this, parent_layout);
+    return ret;
 }
 
 int
-dht_mkdir (call_frame_t *frame, xlator_t *this,
-	   loc_t *loc, mode_t mode)
-{
-	dht_local_t  *local  = NULL;
-	dht_conf_t   *conf = NULL;
-        int           op_errno = -1;
-	int           ret = -1;
-	xlator_t     *hashed_subvol = NULL;
-
-
-        VALIDATE_OR_GOTO (frame, err);
-        VALIDATE_OR_GOTO (this, err);
-        VALIDATE_OR_GOTO (loc, err);
-        VALIDATE_OR_GOTO (loc->inode, err);
-        VALIDATE_OR_GOTO (loc->path, err);
-
-	conf = this->private;
-
-	local = dht_local_init (frame);
-	if (!local) {
-		gf_log (this->name, GF_LOG_ERROR,
-			"memory allocation failed :(");
-		op_errno = ENOMEM;
-		goto err;
-	}
-
-	hashed_subvol = dht_subvol_get_hashed (this, loc);
-
-	if (hashed_subvol == NULL) {
-		gf_log (this->name, GF_LOG_ERROR,
-			"hashed subvol not found");
-		op_errno = EINVAL;
-		goto err;
-	}
-
-	local->hashed_subvol = hashed_subvol;
-	local->inode = inode_ref (loc->inode);
-	ret = loc_copy (&local->loc, loc);
-	local->mode = mode;
-
-	if (ret == -1) {
-		gf_log (this->name, GF_LOG_ERROR,
-			"memory allocation failed :(");
-		op_errno = ENOMEM;
-		goto err;
-	}
-
-	local->layout = dht_layout_new (this, conf->subvolume_cnt);
-	if (!local->layout) {
-		gf_log (this->name, GF_LOG_ERROR,
-			"memory allocation failed :(");
-		op_errno = ENOMEM;
-		goto err;
-	}
-
-	STACK_WIND (frame, dht_mkdir_hashed_cbk,
-		    hashed_subvol,
-		    hashed_subvol->fops->mkdir,
-		    loc, mode);
-
-	return 0;
+dht_create(call_frame_t *frame, xlator_t *this, loc_t *loc, int32_t flags,
+           mode_t mode, mode_t umask, fd_t *fd, dict_t *params)
+{
+    int op_errno = -1;
+    xlator_t *subvol = NULL;
+    xlator_t *hashed_subvol = NULL;
+    dht_local_t *local = NULL;
+    int i = 0;
+    dht_conf_t *conf = NULL;
+    int ret = 0;
+    /* CREATE fop entry: choose the subvolume for loc and wind the create. */
+    VALIDATE_OR_GOTO(frame, err);
+    VALIDATE_OR_GOTO(this, err);
+    VALIDATE_OR_GOTO(loc, err);
+
+    conf = this->private;
+
+    dht_get_du_info(frame, this, loc);
+
+    local = dht_local_init(frame, loc, fd, GF_FOP_CREATE);
+    if (!local) {
+        op_errno = ENOMEM;
+        goto err;
+    }
+
+    local->params = dict_ref(params);
+    local->flags = flags;
+    local->mode = mode;
+    local->umask = umask;
+
+    if (dht_filter_loc_subvol_key(this, loc, &local->loc, &subvol)) {
+        gf_msg(this->name, GF_LOG_INFO, 0, DHT_MSG_SUBVOL_INFO,
+               "creating %s on %s (got create on %s)", local->loc.path,
+               subvol->name, loc->path);
+
+        /* Since lookup-optimize is enabled by default, we need
+         * to create the linkto file if required.
+         * Note this does not check for decommissioned bricks
+         * and min-free-disk limits as this is a debugging tool
+         * and not expected to be used in production.
+         */
+        hashed_subvol = dht_subvol_get_hashed(this, &local->loc);
+
+        if (hashed_subvol && (hashed_subvol != subvol)) {
+            /* Create the linkto file and then the data file */
+            local->cached_subvol = subvol;
+            local->hashed_subvol = hashed_subvol;
+
+            dht_linkfile_create(frame, dht_create_linkfile_create_cbk, this,
+                                subvol, hashed_subvol, &local->loc);
+            goto done;
+        }
+        /* We either don't have a hashed subvol or the hashed subvol is
+         * the same as the one specified. No need to create the linkto
+         * file as we expect a lookup everywhere if there are problems
+         * with the parent layout
+         */
+
+        dht_set_parent_layout_in_dict(loc, this, local);
+
+        STACK_WIND_COOKIE(frame, dht_create_cbk, subvol, subvol,
+                          subvol->fops->create, &local->loc, flags, mode, umask,
+                          fd, params);
+        goto done;
+    }
+
+    subvol = dht_subvol_get_hashed(this, loc);
+    if (!subvol) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, DHT_MSG_HASHED_SUBVOL_GET_FAILED,
+               "no subvolume in layout for path=%s", loc->path);
+
+        op_errno = EIO;
+        goto err;
+    }
+
+    /* Post remove-brick, the client layout may not be in sync with
+     * disk layout because of lack of lookup. Hence, a create call
+     * may fall on the decommissioned brick.  Hence, if the
+     * hashed_subvol is part of decommissioned bricks  list, do a
+     * lookup on parent dir. If a fix-layout is already done by the
+     * remove-brick process, the parent directory layout will be in
+     * sync with that of the disk. If fix-layout is still pending
+     * on the parent directory, we can let the file get created on
+     * the decommissioned brick which will be eventually migrated to
+     * non-decommissioned brick based on the new layout.
+     */
+
+    if (conf->decommission_subvols_cnt) {
+        for (i = 0; i < conf->subvolume_cnt; i++) {
+            if (conf->decommissioned_bricks[i] &&
+                conf->decommissioned_bricks[i] == subvol) {
+                gf_msg_debug(this->name, 0,
+                             "hashed subvol:%s is "
+                             "part of decommission brick list for "
+                             "file: %s",
+                             subvol->name, loc->path);
+
+                /* dht_refresh_layout needs directory info in
+                 * local->loc. Hence, storing the parent_loc in
+                 * local->loc and storing the create context in
+                 * local->loc2. We will restore this information
+                 * in dht_create_do */
+
+                ret = loc_copy(&local->loc2, &local->loc);
+                if (ret) {
+                    gf_msg(this->name, GF_LOG_ERROR, ENOMEM, DHT_MSG_NO_MEMORY,
+                           "loc_copy failed %s", loc->path);
+                    op_errno = ENOMEM; /* else err: falls back to errno */
+                    goto err;
+                }
+
+                loc_wipe(&local->loc);
+
+                ret = dht_build_parent_loc(this, &local->loc, loc, &op_errno);
+
+                if (ret) {
+                    gf_msg(this->name, GF_LOG_ERROR, ENOMEM, DHT_MSG_LOC_FAILED,
+                           "parent loc build failed");
+                    goto err;
+                }
+
+                ret = dht_create_lock(frame, subvol);
+                /* NOTE(review): op_errno is still -1 if this call fails. */
+                if (ret < 0) {
+                    gf_msg(this->name, GF_LOG_ERROR, 0, DHT_MSG_INODE_LK_ERROR,
+                           "locking parent failed");
+                    goto err;
+                }
+
+                goto done;
+            }
+        }
+    }
+
+    dht_create_wind_to_avail_subvol(frame, this, subvol, loc, flags, mode,
+                                    umask, fd, params);
+done:
+    return 0;
 
 err:
-	op_errno = (op_errno == -1) ? errno : op_errno;
-	DHT_STACK_UNWIND (frame, -1, op_errno, NULL, NULL);
 
-	return 0;
-}
+    op_errno = (op_errno == -1) ? errno : op_errno;
+    DHT_STACK_UNWIND(create, frame, -1, op_errno, NULL, NULL, NULL, NULL, NULL,
+                     NULL);
 
+    return 0;
+}
 
-int
-dht_rmdir_selfheal_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
-			int op_ret, int op_errno)
+static int
+dht_mkdir_selfheal_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+                       int32_t op_ret, int32_t op_errno, dict_t *xdata)
 {
-	dht_local_t  *local = NULL;
+    dht_local_t *local = NULL;
+    dht_layout_t *layout = NULL;
+    /* Self-heal completion for mkdir: record the layout, then unwind mkdir. */
+    local = frame->local;
+    layout = local->selfheal.layout;
+    /* Undo the FRAME_SU_DO issued just before the self-heal was started. */
+    FRAME_SU_UNDO(frame, dht_local_t);
+    dht_set_fixed_dir_stat(&local->preparent);
+    dht_set_fixed_dir_stat(&local->postparent);
 
-	local = frame->local;
-	local->layout = NULL;
+    if (op_ret == 0) {
+        dht_layout_set(this, local->inode, layout);
+        /* Refresh cached times on the new directory and on its parent. */
+        dht_inode_ctx_time_update(local->inode, this, &local->stbuf, 1);
+        if (local->loc.parent) {
+            dht_inode_ctx_time_update(local->loc.parent, this,
+                                      &local->preparent, 0);
+
+            dht_inode_ctx_time_update(local->loc.parent, this,
+                                      &local->postparent, 1);
+        }
+    }
 
-	DHT_STACK_UNWIND (frame, local->op_ret, local->op_errno);
+    DHT_STACK_UNWIND(mkdir, frame, op_ret, op_errno, local->inode,
+                     &local->stbuf, &local->preparent, &local->postparent,
+                     NULL);
 
-	return 0;
+    return 0;
 }
 
+static int
+dht_mkdir_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int op_ret,
+              int op_errno, inode_t *inode, struct iatt *stbuf,
+              struct iatt *preparent, struct iatt *postparent, dict_t *xdata)
+{
+    dht_local_t *local = NULL;
+    int this_call_cnt = 0;
+    int ret = -1;
+    gf_boolean_t subvol_filled = _gf_false;
+    gf_boolean_t dir_exists = _gf_false;
+    xlator_t *prev = NULL;
+    dht_layout_t *layout = NULL;
+    /* mkdir reply from a non-hashed subvol; the last reply starts self-heal. */
+    local = frame->local;
+    prev = cookie;
+    layout = local->layout;
+    /* 'cookie' is the subvolume this mkdir was wound to. */
+    subvol_filled = dht_is_subvol_filled(this, prev);
+
+    LOCK(&frame->lock);
+    {
+        if (subvol_filled && (op_ret != -1)) {
+            ret = dht_layout_merge(this, layout, prev, -1, ENOSPC, NULL);
+        } else {
+            if (op_ret == -1 && op_errno == EEXIST) {
+                /* Very likely just a race between mkdir and
+                   self-heal (from lookup of a concurrent mkdir
+                   attempt).
+                   Ignore error for now. layout setting will
+                   anyways fail if this was a different (old)
+                   pre-existing different directory.
+                */
+                op_ret = 0;
+                dir_exists = _gf_true;
+            }
+            ret = dht_layout_merge(this, layout, prev, op_ret, op_errno, NULL);
+        }
+        if (ret)
+            gf_msg(this->name, GF_LOG_WARNING, 0, DHT_MSG_LAYOUT_MERGE_FAILED,
+                   "%s: failed to merge layouts for subvol %s", local->loc.path,
+                   prev->name);
 
-int
-dht_rmdir_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
-	       int op_ret, int op_errno)
-{
-	uint64_t      tmp_layout = 0;
-	dht_local_t  *local = NULL;
-	int           this_call_cnt = 0;
-	call_frame_t *prev = NULL;
-	dht_layout_t *layout = NULL;
-
-	local = frame->local;
-	prev  = cookie;
-
-	LOCK (&frame->lock);
-	{
-		if (op_ret == -1) {
-			local->op_errno = op_errno;
-			local->op_ret   = -1;
-
-			if (op_errno != ENOENT)
-				local->need_selfheal = 1;
-
-			gf_log (this->name, GF_LOG_ERROR,
-				"rmdir on %s for %s failed (%s)",
-				prev->this->name, local->loc.path,
-				strerror (op_errno));
-			goto unlock;
-		}
-	}
+        if (op_ret == -1) {
+            local->op_errno = op_errno;
+            goto unlock;
+        }
+        /* Skip the iatt merge for a subvol that reported EEXIST. */
+        if (dir_exists)
+            goto unlock;
+
+        dht_iatt_merge(this, &local->stbuf, stbuf);
+        dht_iatt_merge(this, &local->preparent, preparent);
+        dht_iatt_merge(this, &local->postparent, postparent);
+    }
 unlock:
-	UNLOCK (&frame->lock);
+    UNLOCK(&frame->lock);
+
+    this_call_cnt = dht_frame_return(frame);
+    if (is_last_call(this_call_cnt)) {
+        /*Unlock entrylk and inodelk once mkdir is done on all subvols*/
+        dht_unlock_namespace(frame, &local->lock[0]);
+        FRAME_SU_DO(frame, dht_local_t);
+        dht_selfheal_new_directory(frame, dht_mkdir_selfheal_cbk, layout);
+    }
+
+    return 0;
+}
+
+static int
+dht_mkdir_hashed_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+                     int op_ret, int op_errno, inode_t *inode,
+                     struct iatt *stbuf, struct iatt *preparent,
+                     struct iatt *postparent, dict_t *xdata);
+
+static int
+dht_mkdir_helper(call_frame_t *frame, xlator_t *this, loc_t *loc, mode_t mode,
+                 mode_t umask, dict_t *params)
+{
+    dht_local_t *local = NULL;
+    dht_conf_t *conf = NULL;
+    int op_errno = -1, ret = -1;
+    xlator_t *hashed_subvol = NULL;
+    int32_t *parent_disk_layout = NULL;
+    dht_layout_t *parent_layout = NULL;
+    char pgfid[GF_UUID_BUF_SIZE] = {0};
+    /* Retry mkdir on the hashed subvol after a parent-layout refresh. */
+    VALIDATE_OR_GOTO(frame, err);
+    VALIDATE_OR_GOTO(this, err);
+    VALIDATE_OR_GOTO(loc, err);
+    VALIDATE_OR_GOTO(loc->inode, err);
+    VALIDATE_OR_GOTO(loc->path, err);
+    VALIDATE_OR_GOTO(this->private, err);
+
+    gf_uuid_unparse(loc->parent->gfid, pgfid);
+
+    conf = this->private;
+    local = frame->local;
+
+    if (local->op_ret == -1) {
+        gf_msg(this->name, GF_LOG_WARNING, local->op_errno,
+               DHT_MSG_PARENT_LAYOUT_CHANGED,
+               "mkdir (%s/%s) (path: %s): refreshing parent layout "
+               "failed.",
+               pgfid, loc->name, loc->path);
+
+        op_errno = local->op_errno;
+        goto err;
+    }
+
+    local->op_ret = -1;
+
+    hashed_subvol = dht_subvol_get_hashed(this, loc);
+    if (hashed_subvol == NULL) {
+        gf_msg_debug(this->name, 0,
+                     "mkdir (%s/%s) (path: %s): hashed subvol not "
+                     "found",
+                     pgfid, loc->name, loc->path);
+        op_errno = ENOENT;
+        goto err;
+    }
+
+    local->hashed_subvol = hashed_subvol;
+
+    parent_layout = dht_layout_get(this, loc->parent);
+
+    ret = dht_disk_layout_extract_for_subvol(this, parent_layout, hashed_subvol,
+                                             &parent_disk_layout);
+    if (ret == -1) {
+        gf_msg(this->name, GF_LOG_WARNING, EIO, DHT_MSG_PARENT_LAYOUT_CHANGED,
+               "mkdir (%s/%s) (path: %s): "
+               "extracting in-memory layout of parent failed. ",
+               pgfid, loc->name, loc->path);
+        goto err;
+    }
+    /* Same layout as the previous attempt: a retry would loop, so give up. */
+    if (memcmp(local->parent_disk_layout, parent_disk_layout,
+               sizeof(local->parent_disk_layout)) == 0) {
+        gf_msg(this->name, GF_LOG_WARNING, EIO, DHT_MSG_PARENT_LAYOUT_CHANGED,
+               "mkdir (%s/%s) (path: %s): loop detected. "
+               "parent layout didn't change even though "
+               "previous attempt of mkdir failed because of "
+               "in-memory layout not matching with that on disk.",
+               pgfid, loc->name, loc->path);
+        op_errno = EIO;
+        goto err;
+    }
+
+    memcpy((void *)local->parent_disk_layout, (void *)parent_disk_layout,
+           sizeof(local->parent_disk_layout));
+
+    dht_layout_unref(this, parent_layout);
+    parent_layout = NULL;
+
+    ret = dict_set_str(params, GF_PREOP_PARENT_KEY, conf->xattr_name);
+    if (ret < 0) {
+        local->op_errno = -ret;
+        gf_msg(this->name, GF_LOG_WARNING, local->op_errno,
+               DHT_MSG_PARENT_LAYOUT_CHANGED,
+               "mkdir (%s/%s) (path: %s): "
+               "setting %s key in params dictionary failed. ",
+               pgfid, loc->name, loc->path, GF_PREOP_PARENT_KEY);
+        goto err;
+    }
+
+    ret = dict_set_bin(params, conf->xattr_name, parent_disk_layout, 4 * 4);
+    if (ret < 0) {
+        local->op_errno = -ret;
+        gf_msg(this->name, GF_LOG_WARNING, local->op_errno,
+               DHT_MSG_PARENT_LAYOUT_CHANGED,
+               "setting parent-layout in params dictionary failed. "
+               "mkdir (%s/%s) (path: %s)",
+               pgfid, loc->name, loc->path);
+        goto err;
+    }
+    /* Stored in params; drop our pointer so it is not freed from here. */
+    parent_disk_layout = NULL;
+
+    STACK_WIND_COOKIE(frame, dht_mkdir_hashed_cbk, hashed_subvol, hashed_subvol,
+                      hashed_subvol->fops->mkdir, loc, mode, umask, params);
+
+    return 0;
+
+err:
+    /* local is still NULL if a VALIDATE_OR_GOTO tripped before it was set. */
+    if (local)
+        dht_unlock_namespace(frame, &local->lock[0]);
+    if (op_errno == -1) /* keep an errno chosen above (ENOENT, EIO, ...) */
+        op_errno = local ? local->op_errno : EINVAL;
+    DHT_STACK_UNWIND(mkdir, frame, -1, op_errno, NULL, NULL, NULL, NULL, NULL);
+    if (parent_disk_layout != NULL)
+        GF_FREE(parent_disk_layout);
+    if (parent_layout != NULL)
+        dht_layout_unref(this, parent_layout);
+
+    return 0;
+}
 
+static int
+dht_mkdir_hashed_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+                     int op_ret, int op_errno, inode_t *inode,
+                     struct iatt *stbuf, struct iatt *preparent,
+                     struct iatt *postparent, dict_t *xdata)
+{
+    dht_local_t *local = NULL;
+    int ret = -1;
+    xlator_t *prev = NULL;
+    dht_layout_t *layout = NULL;
+    dht_conf_t *conf = NULL;
+    int i = 0;
+    xlator_t *hashed_subvol = NULL;
+    char pgfid[GF_UUID_BUF_SIZE] = {0};
+    gf_boolean_t parent_layout_changed = _gf_false;
+    call_stub_t *stub = NULL;
+    /* mkdir reply from the hashed subvol; on success wind to the others. */
+    local = frame->local;
+    prev = cookie;
+    layout = local->layout;
+    conf = this->private;
+    hashed_subvol = local->hashed_subvol;
+
+    gf_uuid_unparse(local->loc.parent->gfid, pgfid);
+
+    if (gf_uuid_is_null(local->loc.gfid) && !op_ret)
+        gf_uuid_copy(local->loc.gfid, stbuf->ia_gfid);
+
+    if (op_ret == -1) {
+        local->op_errno = op_errno;
+        /* GF_PREOP_CHECK_FAILED in xdata is read as "parent layout changed". */
+        parent_layout_changed = (xdata &&
+                                 dict_get(xdata, GF_PREOP_CHECK_FAILED))
+                                    ? 1
+                                    : 0;
+        if (parent_layout_changed) {
+            gf_msg(this->name, GF_LOG_INFO, 0, DHT_MSG_PARENT_LAYOUT_CHANGED,
+                   "mkdir (%s/%s) (path: %s): parent layout "
+                   "changed. Attempting a refresh and then a "
+                   "retry",
+                   pgfid, local->loc.name, local->loc.path);
+
+            stub = fop_mkdir_stub(frame, dht_mkdir_helper, &local->loc,
+                                  local->mode, local->umask, local->params);
+            if (stub == NULL) {
+                goto err;
+            }
 
-	this_call_cnt = dht_frame_return (frame);
-	if (is_last_call (this_call_cnt)) {
-		if (local->need_selfheal) {
-			inode_ctx_get (local->loc.inode, this, 
-				       &tmp_layout);
-			layout = (dht_layout_t *)(long)tmp_layout;
+            ret = dht_handle_parent_layout_change(this, stub);
+            if (ret) {
+                goto err; /* NOTE(review): stub is not destroyed here - confirm */
+            }
 
-			/* TODO: neater interface needed below */
-			local->stbuf.st_mode = local->loc.inode->st_mode;
+            stub = NULL;
 
-			dht_selfheal_restore (frame, dht_rmdir_selfheal_cbk,
-					      &local->loc, layout);
-		} else {
-			DHT_STACK_UNWIND (frame, local->op_ret,
-					  local->op_errno);
-		}
-	}
+            return 0;
+        }
 
+        goto err;
+    }
+    /* Drop the pre-op check keys before mkdir goes to the other subvols. */
+    dict_del(local->params, GF_PREOP_PARENT_KEY);
+    dict_del(local->params, conf->xattr_name);
+
+    if (dht_is_subvol_filled(this, hashed_subvol))
+        ret = dht_layout_merge(this, layout, prev, -1, ENOSPC, NULL);
+    else
+        ret = dht_layout_merge(this, layout, prev, op_ret, op_errno, NULL);
+
+    /* TODO: we may have to return from the function
+       if layout merge fails. For now, lets just log an error */
+    if (ret)
+        gf_msg(this->name, GF_LOG_WARNING, 0, DHT_MSG_LAYOUT_MERGE_FAILED,
+               "%s: failed to merge layouts for subvol %s", local->loc.path,
+               prev->name);
+
+    local->op_ret = 0;
+
+    dht_iatt_merge(this, &local->stbuf, stbuf);
+    dht_iatt_merge(this, &local->preparent, preparent);
+    dht_iatt_merge(this, &local->postparent, postparent);
+    /* One reply is expected from every subvolume except the hashed one. */
+    local->call_cnt = conf->subvolume_cnt - 1;
+    /* Delete internal mds xattr from params dict to avoid store
+      internal mds xattr on other subvols
+    */
+    dict_del(local->params, conf->mds_xattr_key);
+
+    if (gf_uuid_is_null(local->loc.gfid))
+        gf_uuid_copy(local->loc.gfid, stbuf->ia_gfid);
+
+    /* Set hashed subvol as a mds subvol on inode ctx */
+    /*if (!local->inode)
+            local->inode  = inode_ref (inode);
+    */
+    ret = dht_inode_ctx_mdsvol_set(local->inode, this, hashed_subvol);
+    if (ret) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, DHT_MSG_SET_INODE_CTX_FAILED,
+               "Failed to set hashed subvol for %s on inode vol is %s",
+               local->loc.path, hashed_subvol->name);
+    }
+
+    if (local->call_cnt == 0) {
+        /*Unlock namespace lock once mkdir is done on all subvols*/
+        dht_unlock_namespace(frame, &local->lock[0]);
+        FRAME_SU_DO(frame, dht_local_t);
+        dht_selfheal_directory(frame, dht_mkdir_selfheal_cbk, &local->loc,
+                               layout);
         return 0;
+    }
+
+    for (i = 0; i < conf->subvolume_cnt; i++) {
+        if (conf->subvolumes[i] == hashed_subvol)
+            continue;
+        STACK_WIND_COOKIE(frame, dht_mkdir_cbk, conf->subvolumes[i],
+                          conf->subvolumes[i], conf->subvolumes[i]->fops->mkdir,
+                          &local->loc, local->mode, local->umask,
+                          local->params);
+    }
+
+    return 0;
+err:
+    if (local->op_ret != 0) {
+        dht_unlock_namespace(frame, &local->lock[0]);
+    }
+
+    DHT_STACK_UNWIND(mkdir, frame, -1, op_errno, NULL, NULL, NULL, NULL, NULL);
+
+    return 0;
 }
 
+/*
+ * Continuation of dht_mkdir(): this is the function dht_mkdir() wraps in a
+ * mkdir call stub and hands to dht_guard_parent_layout_and_namespace().
+ *
+ * On entry local->op_ret is checked; a negative value is treated as a
+ * failure to take the guarding lock (presumably set by the guard step --
+ * confirm against dht_guard_parent_layout_and_namespace) and the mkdir is
+ * unwound with local->op_errno. Otherwise the mkdir is wound to
+ * local->hashed_subvol with dht_mkdir_hashed_cbk() as the callback.
+ *
+ * Always returns 0; the outcome is delivered through the wind/unwind.
+ */
+static int
+dht_mkdir_guard_parent_layout_cbk(call_frame_t *frame, xlator_t *this,
+                                  loc_t *loc, mode_t mode, mode_t umask,
+                                  dict_t *params)
+{
+    dht_local_t *local = NULL;
+    dht_conf_t *conf = 0;
+    char pgfid[GF_UUID_BUF_SIZE] = {0};
+    int ret = -1;
+    int32_t zero[1] = {0};
+
+    local = frame->local;
+    conf = this->private;
+
+    /* Parent gfid in text form, used only for the log message below. */
+    gf_uuid_unparse(loc->parent->gfid, pgfid);
+
+    if (local->op_ret < 0) {
+        gf_msg(this->name, GF_LOG_WARNING, local->op_errno,
+               DHT_MSG_PARENT_LAYOUT_CHANGED,
+               "mkdir (%s/%s) (path: %s): "
+               "Acquiring lock on parent to guard against "
+               "layout-change failed.",
+               pgfid, loc->name, loc->path);
+        goto err;
+    }
+
+    /* Reset to "failed" before winding the actual mkdir.
+     * NOTE(review): presumably so that later error paths can tell whether
+     * the mkdir itself has succeeded yet -- confirm against the callback. */
+    local->op_ret = -1;
+    /* Add internal MDS xattr on disk for hashed subvol
+     */
+    ret = dht_dict_set_array(params, conf->mds_xattr_key, zero, 1);
+    if (ret) {
+        /* Failure to set the key is only logged; the mkdir still goes on. */
+        gf_msg(this->name, GF_LOG_WARNING, ENOMEM, DHT_MSG_DICT_SET_FAILED,
+               "Failed to set dictionary value:key = %s for "
+               "path %s",
+               conf->mds_xattr_key, loc->path);
+    }
+
+    /* The hashed subvolume itself is passed as the cookie. */
+    STACK_WIND_COOKIE(frame, dht_mkdir_hashed_cbk, local->hashed_subvol,
+                      local->hashed_subvol, local->hashed_subvol->fops->mkdir,
+                      loc, mode, umask, params);
+
+    return 0;
+err:
+    DHT_STACK_UNWIND(mkdir, frame, -1, local->op_errno, NULL, NULL, NULL, NULL,
+                     NULL);
+
+    return 0;
+}
 
 int
-dht_rmdir_do (call_frame_t *frame, xlator_t *this)
+dht_mkdir(call_frame_t *frame, xlator_t *this, loc_t *loc, mode_t mode,
+          mode_t umask, dict_t *params)
 {
-	dht_local_t  *local = NULL;
-	dht_conf_t   *conf = NULL;
-	int           i = 0;
+    dht_local_t *local = NULL;
+    dht_conf_t *conf = NULL;
+    int op_errno = EINVAL, ret = -1;
+    xlator_t *hashed_subvol = NULL;
+    char pgfid[GF_UUID_BUF_SIZE] = {0};
+    call_stub_t *stub = NULL;
+
+    VALIDATE_OR_GOTO(frame, err);
+    VALIDATE_OR_GOTO(this, err);
+    VALIDATE_OR_GOTO(loc, err);
+    VALIDATE_OR_GOTO(loc->inode, err);
+    VALIDATE_OR_GOTO(loc->path, err);
+    VALIDATE_OR_GOTO(this->private, err);
+
+    gf_uuid_unparse(loc->parent->gfid, pgfid);
+
+    conf = this->private;
+
+    if (!params || !dict_get(params, "gfid-req")) {
+        op_errno = EPERM;
+        gf_msg_callingfn(this->name, GF_LOG_WARNING, op_errno,
+                         DHT_MSG_GFID_NULL,
+                         "mkdir: %s is received "
+                         "without gfid-req %p",
+                         loc->path, params);
+        goto err;
+    }
+
+    dht_get_du_info(frame, this, loc);
+
+    local = dht_local_init(frame, loc, NULL, GF_FOP_MKDIR);
+    if (!local) {
+        op_errno = ENOMEM;
+        goto err;
+    }
+
+    hashed_subvol = dht_subvol_get_hashed(this, loc);
+    if (hashed_subvol == NULL) {
+        gf_msg_debug(this->name, 0, "hashed subvol not found for %s",
+                     loc->path);
+        local->op_errno = EIO;
+        goto err;
+    }
+
+    local->hashed_subvol = hashed_subvol;
+    local->mode = mode;
+    local->umask = umask;
+    if (params)
+        local->params = dict_ref(params);
+
+    local->inode = inode_ref(loc->inode);
+
+    local->layout = dht_layout_new(this, conf->subvolume_cnt);
+    if (!local->layout) {
+        op_errno = ENOMEM;
+        goto err;
+    }
+
+    /* set the newly created directory hash to the commit hash
+     * if the configuration option is set. If configuration option
+     * is not set, the older clients may still be connecting to the
+     * volume and hence we need to preserve the 1 in disk[0] part of the
+     * layout xattr */
+    if (conf->lookup_optimize)
+        local->layout->commit_hash = conf->vol_commit_hash;
+    else
+        local->layout->commit_hash = DHT_LAYOUT_HASH_INVALID;
+
+    stub = fop_mkdir_stub(frame, dht_mkdir_guard_parent_layout_cbk, loc, mode,
+                          umask, params);
+    if (stub == NULL) {
+        gf_msg(this->name, GF_LOG_WARNING, ENOMEM,
+               DHT_MSG_PARENT_LAYOUT_CHANGED,
+               "mkdir (%s/%s) (path: %s): "
+               "creating stub failed.",
+               pgfid, loc->name, loc->path);
+        local->op_errno = ENOMEM;
+        goto err;
+    }
+
+    ret = dht_guard_parent_layout_and_namespace(this, stub);
+    if (ret < 0) {
+        gf_msg(this->name, GF_LOG_WARNING, 0, DHT_MSG_PARENT_LAYOUT_CHANGED,
+               "mkdir (%s/%s) (path: %s) cannot wind lock request to "
+               "guard parent layout",
+               pgfid, loc->name, loc->path);
+        goto err;
+    }
+
+    return 0;
+
+err:
+    op_errno = local ? local->op_errno : op_errno;
+    DHT_STACK_UNWIND(mkdir, frame, -1, op_errno, NULL, NULL, NULL, NULL, NULL);
 
-	conf = this->private;
-	local = frame->local;
+    return 0;
+}
+
+/*
+ * Completion callback passed to dht_selfheal_restore() by the rmdir
+ * callbacks. It runs on the heal frame, whose local->main_frame points back
+ * at the original rmdir frame.
+ *
+ * The heal frame is destroyed and the original rmdir frame is unwound with
+ * the op_ret/op_errno stored on the rmdir frame's local; the selfheal's own
+ * op_ret/op_errno arguments are not used.
+ */
+static int
+dht_rmdir_selfheal_cbk(call_frame_t *heal_frame, void *cookie, xlator_t *this,
+                       int op_ret, int op_errno, dict_t *xdata)
+{
+    dht_local_t *local = NULL;
+    dht_local_t *heal_local = NULL;
+    call_frame_t *main_frame = NULL;
 
-	if (local->op_ret == -1)
-		goto err;
+    /* Fetch main_frame and its local before the heal frame goes away. */
+    heal_local = heal_frame->local;
+    main_frame = heal_local->main_frame;
+    local = main_frame->local;
 
-	local->call_cnt = conf->subvolume_cnt;
+    DHT_STACK_DESTROY(heal_frame);
+    dht_set_fixed_dir_stat(&local->preparent);
+    dht_set_fixed_dir_stat(&local->postparent);
 
-	for (i = 0; i < conf->subvolume_cnt; i++) {
-		STACK_WIND (frame, dht_rmdir_cbk,
-			    conf->subvolumes[i],
-			    conf->subvolumes[i]->fops->rmdir,
-			    &local->loc);
-	}
+    DHT_STACK_UNWIND(rmdir, main_frame, local->op_ret, local->op_errno,
+                     &local->preparent, &local->postparent, NULL);
 
-	return 0;
+    return 0;
+}
+
+/*
+ * Callback for the rmdir wound to the hashed subvolume by dht_rmdir_cbk(),
+ * and for the single rmdir wound by dht_rmdir_do() when DHT has only one
+ * child.
+ *
+ * On failure the error is recorded on local; with more than one subvolume,
+ * any errno other than ENOENT, EACCES or ESTALE also sets need_selfheal.
+ * On the last outstanding call the frame is either handed to a selfheal
+ * (via a copied heal frame) or unlocked and unwound.
+ *
+ * Always returns 0.
+ */
+static int
+dht_rmdir_hashed_subvol_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+                            int op_ret, int op_errno, struct iatt *preparent,
+                            struct iatt *postparent, dict_t *xdata)
+{
+    dht_local_t *local = NULL;
+    dht_local_t *heal_local = NULL;
+    call_frame_t *heal_frame = NULL;
+    dht_conf_t *conf = NULL;
+    int this_call_cnt = 0;
+    xlator_t *prev = NULL;
+    char gfid[GF_UUID_BUF_SIZE] = {0};
+
+    local = frame->local;
+    /* The cookie is the subvolume the rmdir was wound to. */
+    prev = cookie;
+    conf = this->private;
+
+    gf_uuid_unparse(local->loc.gfid, gfid);
+
+    LOCK(&frame->lock);
+    {
+        if (op_ret == -1) {
+            local->op_errno = op_errno;
+            local->op_ret = -1;
+            if (conf->subvolume_cnt != 1) {
+                if (op_errno != ENOENT && op_errno != EACCES &&
+                    op_errno != ESTALE) {
+                    local->need_selfheal = 1;
+                }
+            }
+
+            gf_msg_debug(this->name, op_errno,
+                         "rmdir on %s for %s failed "
+                         "(gfid = %s)",
+                         prev->name, local->loc.path, gfid);
+            goto unlock;
+        }
+
+        dht_iatt_merge(this, &local->preparent, preparent);
+        dht_iatt_merge(this, &local->postparent, postparent);
+    }
+unlock:
+    UNLOCK(&frame->lock);
+
+    this_call_cnt = dht_frame_return(frame);
+    if (is_last_call(this_call_cnt)) {
+        if (local->need_selfheal) {
+            /* Locks are released before the selfheal is started. */
+            dht_rmdir_unlock(frame, this);
+            local->layout = dht_layout_get(this, local->loc.inode);
+
+            /* TODO: neater interface needed below */
+            local->stbuf.ia_type = local->loc.inode->ia_type;
+
+            gf_uuid_copy(local->gfid, local->loc.inode->gfid);
+
+            /* Use a different frame or else the rmdir op_ret is
+             * overwritten by that of the selfheal */
+
+            heal_frame = copy_frame(frame);
+
+            if (heal_frame == NULL) {
+                goto err;
+            }
+
+            heal_local = dht_local_init(heal_frame, &local->loc, NULL, 0);
+            if (!heal_local) {
+                DHT_STACK_DESTROY(heal_frame);
+                goto err;
+            }
+
+            heal_local->inode = inode_ref(local->loc.inode);
+            /* Lets dht_rmdir_selfheal_cbk find the rmdir frame again. */
+            heal_local->main_frame = frame;
+            gf_uuid_copy(heal_local->gfid, local->loc.inode->gfid);
+
+            /* NOTE(review): the return value is ignored here, whereas
+             * dht_rmdir_cbk checks it and unwinds on failure -- confirm
+             * whether a failed restore can leave this frame un-unwound. */
+            dht_selfheal_restore(heal_frame, dht_rmdir_selfheal_cbk,
+                                 &heal_local->loc, heal_local->layout);
+            return 0;
+        } else {
+            if (local->loc.parent) {
+                dht_inode_ctx_time_update(local->loc.parent, this,
+                                          &local->preparent, 0);
+
+                dht_inode_ctx_time_update(local->loc.parent, this,
+                                          &local->postparent, 1);
+            }
+
+            dht_set_fixed_dir_stat(&local->preparent);
+            dht_set_fixed_dir_stat(&local->postparent);
+
+            dht_rmdir_unlock(frame, this);
+            DHT_STACK_UNWIND(rmdir, frame, local->op_ret, local->op_errno,
+                             &local->preparent, &local->postparent, NULL);
+        }
+    }
+
+    return 0;
 
 err:
-	DHT_STACK_UNWIND (frame, local->op_ret, local->op_errno);
-	return 0;
+    /* Reached only when the heal frame could not be set up. */
+    DHT_STACK_UNWIND(rmdir, frame, local->op_ret, local->op_errno, NULL, NULL,
+                     NULL);
+    return 0;
 }
 
+/*
+ * Callback passed to dht_unlock_inodelk() by dht_rmdir_unlock(). It only
+ * destroys the frame it is called on (the copied lock frame); op_ret and
+ * op_errno of the unlock are ignored.
+ */
+static int
+dht_rmdir_unlock_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+                     int32_t op_ret, int32_t op_errno, dict_t *xdata)
+{
+    DHT_STACK_DESTROY(frame);
+    return 0;
+}
 
-int
-dht_rmdir_readdir_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
-		       int op_ret, int op_errno, gf_dirent_t *entries)
+/*
+ * Release the locks recorded in local->lock[0].ns for an rmdir.
+ *
+ * The entrylk on directory_ns is released through
+ * dht_unlock_entrylk_wrapper(). The parent_layout inodelks, if any, are
+ * released on a copy of the frame: the lock array and count are moved from
+ * local to the copy's local so the rmdir frame no longer references them,
+ * and dht_rmdir_unlock_cbk() destroys the copy afterwards.
+ *
+ * If the frame copy or its local cannot be created, the inodelks are left
+ * untouched (nothing is unlocked on that path). Always returns 0.
+ */
+static int
+dht_rmdir_unlock(call_frame_t *frame, xlator_t *this)
 {
-	dht_local_t  *local = NULL;
-	int           this_call_cnt = -1;
-	call_frame_t *prev = NULL;
+    dht_local_t *local = NULL, *lock_local = NULL;
+    call_frame_t *lock_frame = NULL;
+    int lock_count = 0;
 
-	local = frame->local;
-	prev  = cookie;
+    local = frame->local;
 
-	if (op_ret > 2) {
-		gf_log (this->name, GF_LOG_DEBUG,
-			"readdir on %s for %s returned %d entries",
-			prev->this->name, local->loc.path, op_ret);
-		local->op_ret = -1;
-		local->op_errno = ENOTEMPTY;
-	}
+    /* Unlock entrylk */
+    dht_unlock_entrylk_wrapper(frame, &local->lock[0].ns.directory_ns);
 
-	this_call_cnt = dht_frame_return (frame);
+    /* Unlock inodelk */
+    lock_count = dht_lock_count(local->lock[0].ns.parent_layout.locks,
+                                local->lock[0].ns.parent_layout.lk_count);
 
-	if (is_last_call (this_call_cnt)) {
-		dht_rmdir_do (frame, this);
-	}
+    if (lock_count == 0)
+        goto done;
 
-	return 0;
-}
+    lock_frame = copy_frame(frame);
+    if (lock_frame == NULL)
+        goto done;
 
+    lock_local = dht_local_init(lock_frame, &local->loc, NULL,
+                                lock_frame->root->op);
+    if (lock_local == NULL)
+        goto done;
 
-int
-dht_rmdir_opendir_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
-		       int op_ret, int op_errno, fd_t *fd)
+    /* Hand the lock array over to the copied frame's local ... */
+    lock_local->lock[0].ns.parent_layout.locks = local->lock[0]
+                                                     .ns.parent_layout.locks;
+    lock_local->lock[0]
+        .ns.parent_layout.lk_count = local->lock[0].ns.parent_layout.lk_count;
+
+    /* ... and clear it on the original so it is referenced only once. */
+    local->lock[0].ns.parent_layout.locks = NULL;
+    local->lock[0].ns.parent_layout.lk_count = 0;
+    dht_unlock_inodelk(lock_frame, lock_local->lock[0].ns.parent_layout.locks,
+                       lock_local->lock[0].ns.parent_layout.lk_count,
+                       dht_rmdir_unlock_cbk);
+    /* Cleared so the done label does not destroy the frame now in use by
+     * the unlock; dht_rmdir_unlock_cbk destroys it instead. */
+    lock_frame = NULL;
+
+done:
+    if (lock_frame != NULL) {
+        DHT_STACK_DESTROY(lock_frame);
+    }
+
+    return 0;
+}
+
+/*
+ * Callback for the rmdir wound to each non-hashed subvolume by
+ * dht_rmdir_lock_cbk().
+ *
+ * Failures with ENOENT or ESTALE are ignored; any other failure is recorded
+ * on local and, unless it is EACCES, sets need_selfheal. A success sets
+ * fop_succeeded and merges the parent iatts.
+ *
+ * local->call_cnt was set to the total subvolume count by dht_rmdir_do(),
+ * but the hashed subvolume is not wound in the first round. So when
+ * local->hashed_subvol is set, a remaining count of 1 means every
+ * non-hashed subvolume has answered; when it is not set, a count of 0 does.
+ *
+ * Always returns 0.
+ */
+static int
+dht_rmdir_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int op_ret,
+              int op_errno, struct iatt *preparent, struct iatt *postparent,
+              dict_t *xdata)
 {
-	dht_local_t  *local = NULL;
-	int           this_call_cnt = -1;
-	call_frame_t *prev = NULL;
+    dht_local_t *local = NULL;
+    int this_call_cnt = 0;
+    xlator_t *prev = NULL;
+    int done = 0;
+    char gfid[GF_UUID_BUF_SIZE] = {0};
+    dht_local_t *heal_local = NULL;
+    call_frame_t *heal_frame = NULL;
+    int ret = -1;
+
+    local = frame->local;
+    /* The cookie is the subvolume the rmdir was wound to. */
+    prev = cookie;
+
+    LOCK(&frame->lock);
+    {
+        if (op_ret == -1) {
+            if ((op_errno != ENOENT) && (op_errno != ESTALE)) {
+                local->op_errno = op_errno;
+                local->op_ret = -1;
+
+                if (op_errno != EACCES)
+                    local->need_selfheal = 1;
+            }
+
+            gf_uuid_unparse(local->loc.gfid, gfid);
 
+            gf_msg_debug(this->name, op_errno,
+                         "rmdir on %s for %s failed."
+                         "(gfid = %s)",
+                         prev->name, local->loc.path, gfid);
+            goto unlock;
+        }
 
-	local = frame->local;
-	prev  = cookie;
+        /* Track if rmdir succeeded on at least one subvol*/
+        local->fop_succeeded = 1;
+        dht_iatt_merge(this, &local->preparent, preparent);
+        dht_iatt_merge(this, &local->postparent, postparent);
+    }
+unlock:
+    UNLOCK(&frame->lock);
 
-	if (op_ret == -1) {
-		gf_log (this->name, GF_LOG_ERROR,
-			"opendir on %s for %s failed (%s)",
-			prev->this->name, local->loc.path,
-			strerror (op_errno));
-		goto err;
-	}
+    this_call_cnt = dht_frame_return(frame);
 
-	STACK_WIND (frame, dht_rmdir_readdir_cbk,
-		    prev->this, prev->this->fops->readdir,
-		    local->fd, 4096, 0);
+    /* if local->hashed_subvol, we are yet to wind to hashed_subvol. */
+    if (local->hashed_subvol && (this_call_cnt == 1)) {
+        done = 1;
+    } else if (!local->hashed_subvol && !this_call_cnt) {
+        done = 1;
+    }
 
-	return 0;
+    if (done) {
+        /* Restore the directory only if it was removed somewhere and a
+         * failure elsewhere asked for a selfheal. */
+        if (local->need_selfheal && local->fop_succeeded) {
+            dht_rmdir_unlock(frame, this);
+            local->layout = dht_layout_get(this, local->loc.inode);
 
-err:
-	this_call_cnt = dht_frame_return (frame);
+            /* TODO: neater interface needed below */
+            local->stbuf.ia_type = local->loc.inode->ia_type;
+
+            gf_uuid_copy(local->gfid, local->loc.inode->gfid);
+            /* The heal runs on its own frame; dht_rmdir_selfheal_cbk
+             * unwinds this one through heal_local->main_frame. */
+            heal_frame = copy_frame(frame);
+            if (heal_frame == NULL) {
+                goto err;
+            }
+
+            heal_local = dht_local_init(heal_frame, &local->loc, NULL, 0);
+            if (!heal_local) {
+                DHT_STACK_DESTROY(heal_frame);
+                goto err;
+            }
+
+            heal_local->inode = inode_ref(local->loc.inode);
+            heal_local->main_frame = frame;
+            gf_uuid_copy(heal_local->gfid, local->loc.inode->gfid);
+            ret = dht_selfheal_restore(heal_frame, dht_rmdir_selfheal_cbk,
+                                       &heal_local->loc, heal_local->layout);
+            if (ret) {
+                DHT_STACK_DESTROY(heal_frame);
+                goto err;
+            }
+
+        } else if (this_call_cnt) {
+            /* If non-hashed subvol's have responded, proceed */
+            if (local->op_ret == 0) {
+                /* Delete the dir from the hashed subvol if:
+                 * The fop succeeded on at least one subvol
+                 *  and did not fail on any
+                 *  or
+                 *  The fop failed with ENOENT/ESTALE on
+                 *  all subvols */
+
+                STACK_WIND_COOKIE(frame, dht_rmdir_hashed_subvol_cbk,
+                                  local->hashed_subvol, local->hashed_subvol,
+                                  local->hashed_subvol->fops->rmdir,
+                                  &local->loc, local->flags, NULL);
+            } else {
+                /* hashed-subvol was non-NULL and rmdir failed on
+                 * all non hashed-subvols. Unwind rmdir with
+                 * local->op_ret and local->op_errno. */
+                dht_rmdir_unlock(frame, this);
+                DHT_STACK_UNWIND(rmdir, frame, local->op_ret, local->op_errno,
+                                 &local->preparent, &local->postparent, NULL);
+
+                return 0;
+            }
+        } else if (!this_call_cnt) {
+            /* All subvol's have responded, proceed */
+
+            if (local->loc.parent) {
+                dht_inode_ctx_time_update(local->loc.parent, this,
+                                          &local->preparent, 0);
+
+                dht_inode_ctx_time_update(local->loc.parent, this,
+                                          &local->postparent, 1);
+            }
+
+            dht_set_fixed_dir_stat(&local->preparent);
+            dht_set_fixed_dir_stat(&local->postparent);
+
+            dht_rmdir_unlock(frame, this);
+            DHT_STACK_UNWIND(rmdir, frame, local->op_ret, local->op_errno,
+                             &local->preparent, &local->postparent, NULL);
+        }
+    }
 
-	if (is_last_call (this_call_cnt)) {
-		dht_rmdir_do (frame, this);
-	}
+    return 0;
 
-	return 0;
+err:
+    /* Reached only from the selfheal setup above, after the locks have
+     * already been released by dht_rmdir_unlock(). */
+    DHT_STACK_UNWIND(rmdir, frame, -1, local->op_errno, NULL, NULL, NULL);
+    return 0;
 }
 
+/*
+ * Callback passed to dht_protect_namespace() by dht_rmdir_do().
+ *
+ * If taking the locks failed, the rmdir is unwound with the lock error.
+ * Otherwise rmdir is wound to every subvolume except the hashed one (when a
+ * hashed subvolume is known); dht_rmdir_cbk() winds to the hashed subvolume
+ * once the others have answered.
+ *
+ * Always returns 0.
+ */
+static int
+dht_rmdir_lock_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+                   int32_t op_ret, int32_t op_errno, dict_t *xdata)
+{
+    dht_local_t *local = NULL;
+    dht_conf_t *conf = NULL;
+    int i = 0;
+    xlator_t *hashed_subvol = NULL;
+
+    conf = this->private;
+    local = frame->local;
 
-int
-dht_rmdir (call_frame_t *frame, xlator_t *this, loc_t *loc)
-{
-	dht_local_t  *local  = NULL;
-	dht_conf_t   *conf = NULL;
-        int           op_errno = -1;
-	int           i = -1;
-	int           ret = -1;
-
-
-        VALIDATE_OR_GOTO (frame, err);
-        VALIDATE_OR_GOTO (this, err);
-        VALIDATE_OR_GOTO (loc, err);
-        VALIDATE_OR_GOTO (loc->inode, err);
-        VALIDATE_OR_GOTO (loc->path, err);
-
-	conf = this->private;
-
-	local = dht_local_init (frame);
-	if (!local) {
-		gf_log (this->name, GF_LOG_ERROR,
-			"memory allocation failed :(");
-		op_errno = ENOMEM;
-		goto err;
-	}
-
-	local->call_cnt = conf->subvolume_cnt;
-	local->op_ret   = 0;
-
-	ret = loc_copy (&local->loc, loc);
-	if (ret == -1) {
-		gf_log (this->name, GF_LOG_ERROR,
-			"memory allocation failed :(");
-		op_errno = ENOMEM;
-		goto err;
-	}
-
-	local->fd = fd_create (local->loc.inode, frame->root->pid);
-	if (!local->fd) {
-		gf_log (this->name, GF_LOG_ERROR,
-			"memory allocation failed :(");
-		op_errno = ENOMEM;
-		goto err;
-	}
-
-	for (i = 0; i < conf->subvolume_cnt; i++) {
-		STACK_WIND (frame, dht_rmdir_opendir_cbk,
-			    conf->subvolumes[i],
-			    conf->subvolumes[i]->fops->opendir,
-			    loc, local->fd);
-	}
-
-	return 0;
+    if (op_ret < 0) {
+        gf_msg(this->name, GF_LOG_WARNING, op_errno, DHT_MSG_INODE_LK_ERROR,
+               "acquiring entrylk after inodelk failed (rmdir for %s)",
+               local->loc.path);
+
+        local->op_ret = -1;
+        local->op_errno = op_errno;
+        goto err;
+    }
+
+    /* Read once into a local: callbacks may run while the loop is still
+     * winding, so the loop itself does not go back to frame->local. */
+    hashed_subvol = local->hashed_subvol;
+    for (i = 0; i < conf->subvolume_cnt; i++) {
+        /* The hashed subvolume is handled last, from dht_rmdir_cbk. */
+        if (hashed_subvol && (hashed_subvol == conf->subvolumes[i]))
+            continue;
+
+        STACK_WIND_COOKIE(frame, dht_rmdir_cbk, conf->subvolumes[i],
+                          conf->subvolumes[i], conf->subvolumes[i]->fops->rmdir,
+                          &local->loc, local->flags, NULL);
+    }
+
+    return 0;
 
 err:
-	op_errno = (op_errno == -1) ? errno : op_errno;
-	DHT_STACK_UNWIND (frame, -1, op_errno);
+    DHT_STACK_UNWIND(rmdir, frame, local->op_ret, local->op_errno,
+                     &local->preparent, &local->postparent, NULL);
 
-	return 0;
+    return 0;
 }
 
+/*
+ * Start the actual removal phase of an rmdir. Called from
+ * dht_rmdir_readdirp_done() once the last outstanding readdirp frame has
+ * reported back.
+ *
+ * If an error was already recorded (local->op_ret == -1) the rmdir is
+ * unwound with it. With a single child the rmdir is wound straight to that
+ * child. Otherwise the namespace is locked via dht_protect_namespace() and
+ * the removal continues in dht_rmdir_lock_cbk().
+ *
+ * Always returns 0.
+ */
+static int
+dht_rmdir_do(call_frame_t *frame, xlator_t *this)
+{
+    dht_local_t *local = NULL;
+    dht_conf_t *conf = NULL;
+    int ret = -1;
+    xlator_t *hashed_subvol = NULL;
+    char gfid[GF_UUID_BUF_SIZE] = {0};
+
+    /* Without a local there is nothing to read op_ret/op_errno from, hence
+     * the separate err label that unwinds with fixed values. */
+    VALIDATE_OR_GOTO(frame->local, err);
+    local = frame->local;
+    VALIDATE_OR_GOTO(this->private, out);
+    conf = this->private;
+
+    if (local->op_ret == -1)
+        goto out;
+
+    /* Counts every subvolume, including the hashed one that is only wound
+     * later from dht_rmdir_cbk. */
+    local->call_cnt = conf->subvolume_cnt;
+
+    /* first remove from non-hashed_subvol */
+    hashed_subvol = dht_subvol_get_hashed(this, &local->loc);
+
+    if (!hashed_subvol) {
+        /* Only logged; the rmdir carries on with local->hashed_subvol
+         * left as it was. */
+        gf_uuid_unparse(local->loc.gfid, gfid);
+
+        gf_msg(this->name, GF_LOG_WARNING, 0, DHT_MSG_HASHED_SUBVOL_GET_FAILED,
+               "Failed to get hashed subvol for %s (gfid = %s)",
+               local->loc.path, gfid);
+    } else {
+        local->hashed_subvol = hashed_subvol;
+    }
+
+    /* When DHT has only 1 child */
+    if (conf->subvolume_cnt == 1) {
+        STACK_WIND_COOKIE(frame, dht_rmdir_hashed_subvol_cbk,
+                          conf->subvolumes[0], conf->subvolumes[0],
+                          conf->subvolumes[0]->fops->rmdir, &local->loc,
+                          local->flags, NULL);
+        return 0;
+    }
+
+    local->current = &local->lock[0];
+    ret = dht_protect_namespace(frame, &local->loc, local->hashed_subvol,
+                                &local->current->ns, dht_rmdir_lock_cbk);
+    if (ret < 0) {
+        local->op_ret = -1;
+        /* NOTE(review): relies on errno having been set by the callee --
+         * confirm; EINVAL is the fallback when it is 0. */
+        local->op_errno = errno ? errno : EINVAL;
+        goto out;
+    }
 
-static int32_t
-dht_xattrop_cbk (call_frame_t *frame,
-		 void *cookie,
-		 xlator_t *this,
-		 int32_t op_ret,
-		 int32_t op_errno,
-		 dict_t *dict)
+    return 0;
+
+out:
+    dht_set_fixed_dir_stat(&local->preparent);
+    dht_set_fixed_dir_stat(&local->postparent);
+
+    DHT_STACK_UNWIND(rmdir, frame, local->op_ret, local->op_errno,
+                     &local->preparent, &local->postparent, NULL);
+    return 0;
+err:
+    DHT_STACK_UNWIND(rmdir, frame, -1, EINVAL, NULL, NULL, NULL);
+    return 0;
+}
+
+/*
+ * Finish one per-subvolume readdirp frame of an rmdir.
+ *
+ * Copies a failure recorded on the readdirp frame to the main rmdir frame,
+ * drops one call from the main frame's count and, when that was the last
+ * one, continues with dht_rmdir_do(). The readdirp frame is destroyed in
+ * all cases.
+ */
+static void
+dht_rmdir_readdirp_done(call_frame_t *readdirp_frame, xlator_t *this)
 {
-	DHT_STACK_UNWIND (frame, op_ret, op_errno, dict);
-	return 0;
+    call_frame_t *main_frame = NULL;
+    dht_local_t *main_local = NULL;
+    dht_local_t *local = NULL;
+    int this_call_cnt = 0;
+
+    local = readdirp_frame->local;
+    main_frame = local->main_frame;
+    main_local = main_frame->local;
+
+    /* At least one readdirp failed.
+     * This is a bit hit or miss - if readdirp failed on more than
+     * one subvol, we don't know which error is returned.
+     */
+    if (local->op_ret == -1) {
+        main_local->op_ret = local->op_ret;
+        main_local->op_errno = local->op_errno;
+    }
+
+    this_call_cnt = dht_frame_return(main_frame);
+
+    if (is_last_call(this_call_cnt))
+        dht_rmdir_do(main_frame, this);
+
+    DHT_STACK_DESTROY(readdirp_frame);
 }
 
-int32_t
-dht_xattrop (call_frame_t *frame,
-	     xlator_t *this,
-	     loc_t *loc,
-	     gf_xattrop_flags_t flags,
-	     dict_t *dict)
-{
-	xlator_t     *subvol = NULL;
-        int           op_errno = -1;
-	dht_local_t  *local = NULL;
-
-        VALIDATE_OR_GOTO (frame, err);
-        VALIDATE_OR_GOTO (this, err);
-        VALIDATE_OR_GOTO (loc, err);
-        VALIDATE_OR_GOTO (loc->inode, err);
-        VALIDATE_OR_GOTO (loc->path, err);
-
-	subvol = dht_subvol_get_cached (this, loc->inode);
-	if (!subvol) {
-		gf_log (this->name, GF_LOG_ERROR,
-			"no cached subvolume for path=%s", loc->path);
-		op_errno = EINVAL;
-		goto err;
-	}
-
-	local = dht_local_init (frame);
-	if (!local) {
-		op_errno = ENOMEM;
-		gf_log (this->name, GF_LOG_ERROR,
-			"memory allocation failed :(");
-		goto err;
-	}
-
-	local->inode = inode_ref (loc->inode);
-	local->call_cnt = 1;
-
-	STACK_WIND (frame,
-		    dht_xattrop_cbk,
-		    subvol, subvol->fops->xattrop,
-		    loc, flags, dict);
-
-	return 0;
+/* Keep sending readdirp on the subvol until it returns no more entries
+ * It is possible that not all entries will fit in a single readdirp in
+ * which case the rmdir will keep failing with ENOTEMPTY
+ *
+ * Called on a readdirp frame once all the lookups/unlinks started for the
+ * previous batch of entries have returned. If an error has been recorded
+ * on the frame, the frame is finished through dht_rmdir_readdirp_done();
+ * otherwise another readdirp (size 4096, offset 0) is wound to
+ * local->hashed_subvol with dht_rmdir_readdirp_cbk as the callback.
+ *
+ * Always returns 0.
+ */
 
-err:
-	op_errno = (op_errno == -1) ? errno : op_errno;
-	DHT_STACK_UNWIND (frame, -1, op_errno, NULL);
+static int
+dht_rmdir_readdirp_do(call_frame_t *readdirp_frame, xlator_t *this)
+{
+    dht_local_t *local = NULL;
 
-	return 0;
+    local = readdirp_frame->local;
+
+    if (local->op_ret == -1) {
+        /* there is no point doing another readdirp on this
+         * subvol . */
+        dht_rmdir_readdirp_done(readdirp_frame, this);
+        return 0;
+    }
+
+    /* NOTE(review): on a readdirp frame, hashed_subvol appears to hold the
+     * subvolume being scanned rather than the hashed one -- confirm against
+     * the code that creates the frame. */
+    STACK_WIND_COOKIE(readdirp_frame, dht_rmdir_readdirp_cbk,
+                      local->hashed_subvol, local->hashed_subvol,
+                      local->hashed_subvol->fops->readdirp, local->fd, 4096, 0,
+                      local->xattr);
+
+    return 0;
 }
 
-static int32_t
-dht_fxattrop_cbk (call_frame_t *frame,
-		  void *cookie,
-		  xlator_t *this,
-		  int32_t op_ret,
-		  int32_t op_errno,
-		  dict_t *dict)
+/*
+ * Callback for the unlink wound by dht_rmdir_lookup_cbk() to remove a
+ * linkfile found in the directory being removed.
+ *
+ * A failure other than ENOENT is recorded on the readdirp frame
+ * (local->main_frame). The readdirp frame's call count is then decremented
+ * and, on the last call, dht_rmdir_readdirp_do() continues the scan. The
+ * unlink's own frame is destroyed.
+ *
+ * Always returns 0.
+ */
+static int
+dht_rmdir_linkfile_unlink_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+                              int op_ret, int op_errno, struct iatt *preparent,
+                              struct iatt *postparent, dict_t *xdata)
 {
-	DHT_STACK_UNWIND (frame, op_ret, op_errno, dict);
-	return 0;
+    dht_local_t *local = NULL;
+    xlator_t *prev = NULL;
+    xlator_t *src = NULL;
+    call_frame_t *readdirp_frame = NULL;
+    dht_local_t *readdirp_local = NULL;
+    int this_call_cnt = 0;
+    char gfid[GF_UUID_BUF_SIZE] = {0};
+
+    local = frame->local;
+    /* The cookie is the subvolume the unlink was wound to. */
+    prev = cookie;
+    src = prev;
+
+    /* main_frame of this frame is the readdirp frame. */
+    readdirp_frame = local->main_frame;
+    readdirp_local = readdirp_frame->local;
+
+    gf_uuid_unparse(local->loc.gfid, gfid);
+
+    if (op_ret == 0) {
+        gf_msg_trace(this->name, 0, "Unlinked linkfile %s on %s, gfid = %s",
+                     local->loc.path, src->name, gfid);
+    } else {
+        /* ENOENT is not treated as a failure of the scan. */
+        if (op_errno != ENOENT) {
+            readdirp_local->op_ret = -1;
+            readdirp_local->op_errno = op_errno;
+        }
+        gf_msg_debug(this->name, op_errno,
+                     "Unlink of %s on %s failed. (gfid = %s)", local->loc.path,
+                     src->name, gfid);
+    }
+
+    this_call_cnt = dht_frame_return(readdirp_frame);
+
+    if (is_last_call(this_call_cnt))
+        dht_rmdir_readdirp_do(readdirp_frame, this);
+
+    DHT_STACK_DESTROY(frame);
+    return 0;
 }
 
-int32_t
-dht_fxattrop (call_frame_t *frame,
-	      xlator_t *this,
-	      fd_t *fd,
-	      gf_xattrop_flags_t flags,
-	      dict_t *dict)
-{
-	xlator_t     *subvol = NULL;
-        int           op_errno = -1;
-
-        VALIDATE_OR_GOTO (frame, err);
-        VALIDATE_OR_GOTO (this, err);
-        VALIDATE_OR_GOTO (fd, err);
-
-	subvol = dht_subvol_get_cached (this, fd->inode);
-	if (!subvol) {
-		gf_log (this->name, GF_LOG_ERROR,
-			"no cached subvolume for fd=%p", fd);
-		op_errno = EINVAL;
-		goto err;
-	}
-
-	STACK_WIND (frame,
-		    dht_fxattrop_cbk,
-		    subvol, subvol->fops->fxattrop,
-		    fd, flags, dict);
-
-	return 0;
+/*
+ * Callback for the lookup wound by dht_rmdir_cached_lookup_cbk() on the
+ * subvolume holding a presumed linkfile.
+ *
+ * If the lookup succeeds and the entry is a linkfile, it is unlinked on the
+ * same subvolume (continuing in dht_rmdir_linkfile_unlink_cbk). If the
+ * entry is not a linkfile, ENOTEMPTY is recorded on the readdirp frame. On
+ * any non-unlink path the readdirp frame's call count is decremented, the
+ * scan is continued on the last call, and this frame is destroyed.
+ *
+ * Always returns 0.
+ */
+static int
+dht_rmdir_lookup_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+                     int op_ret, int op_errno, inode_t *inode,
+                     struct iatt *stbuf, dict_t *xattr, struct iatt *parent)
+{
+    dht_local_t *local = NULL;
+    xlator_t *prev = NULL;
+    xlator_t *src = NULL;
+    call_frame_t *readdirp_frame = NULL;
+    dht_local_t *readdirp_local = NULL;
+    int this_call_cnt = 0;
+    dht_conf_t *conf = this->private;
+    char gfid[GF_UUID_BUF_SIZE] = {0};
+
+    local = frame->local;
+    /* The cookie is the subvolume the lookup was wound to. */
+    prev = cookie;
+    src = prev;
+
+    gf_msg_debug(this->name, 0, "dht_rmdir_lookup_cbk %s", local->loc.path);
+
+    readdirp_frame = local->main_frame;
+    readdirp_local = readdirp_frame->local;
+
+    if (op_ret != 0) {
+        /* Only logged: no error is recorded on the readdirp frame here. */
+        gf_msg(this->name, GF_LOG_WARNING, op_errno, DHT_MSG_FILE_LOOKUP_FAILED,
+               "lookup failed for %s on %s", local->loc.path, src->name);
+        goto err;
+    }
+
+    if (!check_is_linkfile(inode, stbuf, xattr, conf->link_xattr_name)) {
+        /* A real entry lives here, so the directory is not empty. */
+        readdirp_local->op_ret = -1;
+        readdirp_local->op_errno = ENOTEMPTY;
+
+        gf_uuid_unparse(local->loc.gfid, gfid);
+
+        gf_msg(this->name, GF_LOG_WARNING, 0, DHT_MSG_NOT_LINK_FILE_ERROR,
+               "%s on %s is not a linkfile (type=0%o, gfid = %s)",
+               local->loc.path, src->name, stbuf->ia_type, gfid);
+        goto err;
+    }
+
+    STACK_WIND_COOKIE(frame, dht_rmdir_linkfile_unlink_cbk, src, src,
+                      src->fops->unlink, &local->loc, 0, NULL);
+    return 0;
+err:
+
+    this_call_cnt = dht_frame_return(readdirp_frame);
+    if (is_last_call(this_call_cnt)) {
+        dht_rmdir_readdirp_do(readdirp_frame, this);
+    }
 
+    DHT_STACK_DESTROY(frame);
+    return 0;
+}
+
+/*
+ * Callback for the lookup that checks whether the data file behind a
+ * linkfile still exists on its cached subvolume.
+ *
+ *  - lookup succeeded: the file exists, so ENOTEMPTY is recorded on the
+ *    readdirp frame (local->main_frame);
+ *  - lookup failed with anything but ENOENT: that errno is recorded;
+ *  - lookup failed with ENOENT: the linkfile is stale, so it is looked up
+ *    on local->hashed_subvol (requesting the linkto xattr) and handled
+ *    further in dht_rmdir_lookup_cbk().
+ *
+ * If the request dict for that second lookup cannot be built, the failure
+ * is recorded on the readdirp frame as well; otherwise the entry would be
+ * skipped silently and the scan re-wound as if nothing had happened.
+ *
+ * Always returns 0.
+ */
+static int
+dht_rmdir_cached_lookup_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+                            int op_ret, int op_errno, inode_t *inode,
+                            struct iatt *stbuf, dict_t *xattr,
+                            struct iatt *parent)
+{
+    dht_local_t *local = NULL;
+    xlator_t *src = NULL;
+    call_frame_t *readdirp_frame = NULL;
+    dht_local_t *readdirp_local = NULL;
+    int this_call_cnt = 0;
+    dht_conf_t *conf = this->private;
+    dict_t *xattrs = NULL;
+    int ret = 0;
+
+    local = frame->local;
+    src = local->hashed_subvol;
+
+    /* main_frame here is the readdirp_frame */
+
+    readdirp_frame = local->main_frame;
+    readdirp_local = readdirp_frame->local;
+
+    gf_msg_debug(this->name, 0, "returning for %s ", local->loc.path);
+
+    if (op_ret == 0) {
+        readdirp_local->op_ret = -1;
+        readdirp_local->op_errno = ENOTEMPTY;
+
+        gf_msg(this->name, GF_LOG_WARNING, 0, DHT_MSG_SUBVOL_ERROR,
+               "%s found on cached subvol %s", local->loc.path, src->name);
+        goto err;
+    } else if (op_errno != ENOENT) {
+        readdirp_local->op_ret = -1;
+        readdirp_local->op_errno = op_errno;
+
+        gf_msg(this->name, GF_LOG_WARNING, op_errno, DHT_MSG_SUBVOL_ERROR,
+               "%s not found on cached subvol %s", local->loc.path, src->name);
+        goto err;
+    }
+
+    xattrs = dict_new();
+    if (!xattrs) {
+        /* Record the failure so the rmdir does not proceed as though this
+         * entry had been dealt with. */
+        readdirp_local->op_ret = -1;
+        readdirp_local->op_errno = ENOMEM;
+
+        gf_msg(this->name, GF_LOG_ERROR, ENOMEM, DHT_MSG_NO_MEMORY,
+               "dict_new failed");
+        goto err;
+    }
+
+    ret = dict_set_uint32(xattrs, conf->link_xattr_name, 256);
+    if (ret) {
+        /* NOTE(review): ENOMEM is assumed as the cause of a dict set
+         * failure here -- confirm against dict_set_uint32. */
+        readdirp_local->op_ret = -1;
+        readdirp_local->op_errno = ENOMEM;
+
+        gf_msg(this->name, GF_LOG_ERROR, 0, DHT_MSG_DICT_SET_FAILED,
+               "Failed to set dictionary value: key = %s",
+               conf->link_xattr_name);
+        dict_unref(xattrs);
+        goto err;
+    }
+    STACK_WIND_COOKIE(frame, dht_rmdir_lookup_cbk, src, src, src->fops->lookup,
+                      &local->loc, xattrs);
+    /* Drop our own reference; xattrs is known to be non-NULL here. */
+    dict_unref(xattrs);
+
+    return 0;
 err:
-	op_errno = (op_errno == -1) ? errno : op_errno;
-	DHT_STACK_UNWIND (frame, -1, op_errno, NULL);
 
-	return 0;
+    this_call_cnt = dht_frame_return(readdirp_frame);
+
+    /* Once all the lookups/unlinks etc have returned, proceed to wind
+     * readdirp on the subvol again until no entries are returned.
+     * This is required if there are more entries than can be returned
+     * in a single readdirp call.
+     */
+
+    if (is_last_call(this_call_cnt))
+        dht_rmdir_readdirp_do(readdirp_frame, this);
+
+    DHT_STACK_DESTROY(frame);
+    return 0;
 }
 
+static int
+dht_rmdir_is_subvol_empty(call_frame_t *frame, xlator_t *this,
+                          gf_dirent_t *entries, xlator_t *src)
+{
+    int ret = 0;
+    int build_ret = 0;
+    gf_dirent_t *trav = NULL;
+    call_frame_t *lookup_frame = NULL;
+    dht_local_t *lookup_local = NULL;
+    dht_local_t *local = NULL;
+    dict_t *xattrs = NULL;
+    dht_conf_t *conf = this->private;
+    xlator_t *subvol = NULL;
+    char gfid[GF_UUID_BUF_SIZE] = {0};
+    int count = 0;
+    gf_boolean_t unwind = _gf_false;
+    /* Returns 0: non-linkfile seen, -1: dict failure, else lookups wound. */
+    local = frame->local;
+    /* Pass 1: count linkfiles; stop at the first entry that is not one. */
+    list_for_each_entry(trav, &entries->list, list)
+    {
+        if (strcmp(trav->d_name, ".") == 0)
+            continue;
+        if (strcmp(trav->d_name, "..") == 0)
+            continue;
+        if (check_is_linkfile(NULL, (&trav->d_stat), trav->dict,
+                              conf->link_xattr_name)) {
+            count++;
+            continue;
+        }
 
-static int32_t
-dht_inodelk_cbk (call_frame_t *frame, void *cookie,
-		 xlator_t *this, int32_t op_ret, int32_t op_errno)
+        /* this entry is either a directory which is neither "." nor "..",
+           or a non directory which is not a linkfile. the directory is to
+           be treated as non-empty
+        */
+        return 0;
+    }
+
+    xattrs = dict_new();
+    if (!xattrs) {
+        gf_msg(this->name, GF_LOG_ERROR, ENOMEM, DHT_MSG_NO_MEMORY,
+               "dict_new failed");
+        return -1; /* nothing has been wound yet */
+    }
+
+    ret = dict_set_uint32(xattrs, conf->link_xattr_name, 256);
+    if (ret) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, DHT_MSG_DICT_SET_FAILED,
+               "Failed to set dictionary value: key = %s",
+               conf->link_xattr_name);
+
+        if (xattrs)
+            dict_unref(xattrs);
+        return -1; /* nothing has been wound yet */
+    }
+
+    local->call_cnt = count; /* one reply expected per linkfile counted */
+    ret = 0;
+    /* Pass 2: wind one lookup per entry, each on its own copy of frame. */
+    list_for_each_entry(trav, &entries->list, list)
+    {
+        if (strcmp(trav->d_name, ".") == 0)
+            continue;
+        if (strcmp(trav->d_name, "..") == 0)
+            continue;
+
+        lookup_frame = copy_frame(frame);
+
+        if (!lookup_frame) {
+            /* out of memory, let the rmdir fail
+               (as non-empty, unfortunately) */
+            goto err;
+        }
+
+        lookup_local = dht_local_init(lookup_frame, NULL, NULL, GF_FOP_LOOKUP);
+        if (!lookup_local) {
+            goto err;
+        }
+
+        lookup_frame->local = lookup_local;
+        lookup_local->main_frame = frame; /* callbacks report back to frame */
+        lookup_local->hashed_subvol = src;
+
+        build_ret = dht_build_child_loc(this, &lookup_local->loc, &local->loc,
+                                        trav->d_name);
+        if (build_ret != 0)
+            goto err;
+
+        gf_uuid_copy(lookup_local->loc.gfid, trav->d_stat.ia_gfid);
+
+        gf_uuid_unparse(lookup_local->loc.gfid, gfid);
+
+        gf_msg_trace(this->name, 0, "looking up %s on subvolume %s, gfid = %s",
+                     lookup_local->loc.path, src->name, gfid);
+
+        subvol = dht_linkfile_subvol(this, NULL, &trav->d_stat, trav->dict);
+        if (!subvol || (subvol == src)) {
+            /* we need to delete the linkto file if it does not have a
+             * valid subvol or it points to itself.
+             */
+            gf_msg(this->name, GF_LOG_INFO, 0, DHT_MSG_INVALID_LINKFILE,
+                   "Linkfile does not have link subvolume. "
+                   "path = %s, gfid = %s",
+                   lookup_local->loc.path, gfid);
+
+            gf_msg_debug(this->name, 0, "looking up %s on subvol %s, gfid = %s",
+                         lookup_local->loc.path, src->name, gfid);
+
+            STACK_WIND_COOKIE(lookup_frame, dht_rmdir_lookup_cbk, src, src,
+                              src->fops->lookup, &lookup_local->loc, xattrs);
+        } else {
+            gf_msg_debug(this->name, 0,
+                         "Looking up linkfile target %s on "
+                         " subvol %s, gfid = %s",
+                         lookup_local->loc.path, subvol->name, gfid);
+
+            STACK_WIND(lookup_frame, dht_rmdir_cached_lookup_cbk, subvol,
+                       subvol->fops->lookup, &lookup_local->loc, xattrs);
+        }
+        ret++; /* counts lookups actually wound */
+
+        lookup_frame = NULL;
+        lookup_local = NULL;
+    }
+
+    if (xattrs)
+        dict_unref(xattrs);
+
+    return ret;
+err:
+    if (xattrs)
+        dict_unref(xattrs);
+
+    if (lookup_frame)
+        DHT_STACK_DESTROY(lookup_frame);
+
+    /* Handle the case where the wound calls have unwound before the
+     * loop processing is done
+     */
+
+    LOCK(&frame->lock);
+    {
+        local->op_ret = -1;
+        local->op_errno = ENOTEMPTY;
+
+        local->call_cnt -= (count - ret); /* drop lookups never wound */
+        if (!local->call_cnt)
+            unwind = _gf_true;
+    }
+    UNLOCK(&frame->lock);
+    /* Replies still pending: return the number wound; otherwise return 0. */
+    if (!unwind) {
+        return ret;
+    }
+    return 0;
+}
+
+/*
+ * No more entries on this subvol. Proceed to the actual rmdir operation.
+ */
 
+static int
+dht_rmdir_readdirp_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+                       int op_ret, int op_errno, gf_dirent_t *entries,
+                       dict_t *xdata)
 {
-	DHT_STACK_UNWIND (frame, op_ret, op_errno);
-	return 0;
+    dht_local_t *local = NULL;
+    xlator_t *prev = NULL;
+    xlator_t *src = NULL;
+    int ret = 0;
+    char *path = NULL;
+    /* readdirp reply from one subvol (the cookie) while checking emptiness. */
+    local = frame->local;
+    prev = cookie;
+    src = prev;
+
+    if (op_ret > 2) {
+        /* dht_rmdir_is_subvol_empty() may free the frame,
+         * copy path for logging.
+         */
+        path = gf_strdup(local->loc.path);
+        ret = dht_rmdir_is_subvol_empty(frame, this, entries, src);
+        switch (ret) {
+            case -1: /* set-up failed before any lookup was wound */
+                local->op_ret = -1;
+                local->op_errno = ENOMEM;
+                ret = 0; /* frame is still ours: finish this subvol below */
+                break;
+            case 0: /* non linkfiles exist */
+                gf_msg_trace(this->name, 0,
+                             "readdir on %s for %s returned %d entries",
+                             prev->name, local->loc.path, op_ret);
+                local->op_ret = -1;
+                local->op_errno = ENOTEMPTY;
+                break;
+            default: /* @ret lookups were wound; their callbacks continue */
+                gf_msg_trace(this->name, 0,
+                             "readdir on %s for %s found %d linkfiles",
+                             prev->name, path, ret);
+                break;
+        }
+    }
+
+    /* readdirp failed, set-up failed, or no linkto files on this subvol */
+    if (!ret)
+        dht_rmdir_readdirp_done(frame, this);
+
+    GF_FREE(path);
+    return 0;
 }
 
+static int
+dht_rmdir_opendir_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+                      int op_ret, int op_errno, fd_t *fd, dict_t *xdata)
+{
+    dht_local_t *local = NULL;
+    int this_call_cnt = -1;
+    xlator_t *prev = NULL;
+    int ret = 0;
+    dht_conf_t *conf = this->private;
+    dict_t *dict = NULL;
+    int i = 0;
+    char gfid[GF_UUID_BUF_SIZE] = {0};
+    dht_local_t *readdirp_local = NULL;
+    call_frame_t *readdirp_frame = NULL;
+    int cnt = 0;
+    /* opendir reply from one subvol; the last reply fans out the readdirps. */
+    local = frame->local;
+    prev = cookie;
+
+    this_call_cnt = dht_frame_return(frame);
+    if (op_ret == -1) {
+        gf_uuid_unparse(local->loc.gfid, gfid);
+
+        gf_msg_debug(this->name, op_errno,
+                     "opendir on %s for %s failed, "
+                     "gfid = %s,",
+                     prev->name, local->loc.path, gfid);
+        if ((op_errno != ENOENT) && (op_errno != ESTALE)) {
+            local->op_ret = -1; /* ENOENT and ESTALE are not recorded */
+            local->op_errno = op_errno;
+        }
+        goto err;
+    }
+
+    if (!is_last_call(this_call_cnt))
+        return 0;
 
-int32_t
-dht_inodelk (call_frame_t *frame, xlator_t *this,
-	     loc_t *loc, int32_t cmd, struct flock *lock)
-{
-	xlator_t     *subvol = NULL;
-        int           op_errno = -1;
-	dht_local_t  *local = NULL;
-
-
-        VALIDATE_OR_GOTO (frame, err);
-        VALIDATE_OR_GOTO (this, err);
-        VALIDATE_OR_GOTO (loc, err);
-        VALIDATE_OR_GOTO (loc->inode, err);
-        VALIDATE_OR_GOTO (loc->path, err);
-
-	subvol = dht_subvol_get_cached (this, loc->inode);
-	if (!subvol) {
-		gf_log (this->name, GF_LOG_ERROR,
-			"no cached subvolume for path=%s", loc->path);
-		op_errno = EINVAL;
-		goto err;
-	}
-
-	local = dht_local_init (frame);
-	if (!local) {
-		op_errno = ENOMEM;
-		gf_log (this->name, GF_LOG_ERROR,
-			"memory allocation failed :(");
-		goto err;
-	}
-
-	local->inode = inode_ref (loc->inode);
-	local->call_cnt = 1;
-
-	STACK_WIND (frame,
-		    dht_inodelk_cbk,
-		    subvol, subvol->fops->inodelk,
-		    loc, cmd, lock);
-
-	return 0;
+    if (local->op_ret == -1) /* an earlier opendir reply recorded a failure */
+        goto err;
+
+    fd_bind(fd);
+
+    dict = dict_new();
+    if (!dict) {
+        local->op_ret = -1;
+        local->op_errno = ENOMEM;
+        goto err;
+    }
+
+    ret = dict_set_uint32(dict, conf->link_xattr_name, 256);
+    if (ret)
+        gf_msg(this->name, GF_LOG_WARNING, 0, DHT_MSG_DICT_SET_FAILED,
+               "%s: Failed to set dictionary value:key = %s", local->loc.path,
+               conf->link_xattr_name);
+    /* A dict_set failure is only logged; the readdirps are still wound. */
+    cnt = local->call_cnt = conf->subvolume_cnt; /* one readdirp per subvol */
+
+    /* Create a separate frame per subvol as we might need
+     * to resend readdirp multiple times to get all the
+     * entries.
+     */
+
+    for (i = 0; i < conf->subvolume_cnt; i++) {
+        readdirp_frame = copy_frame(frame);
+
+        if (!readdirp_frame) {
+            cnt--;
+            /* Reduce the local->call_cnt as well */
+            (void)dht_frame_return(frame);
+            continue;
+        }
+
+        readdirp_local = dht_local_init(readdirp_frame, &local->loc, local->fd,
+                                        0);
+
+        if (!readdirp_local) {
+            DHT_STACK_DESTROY(readdirp_frame);
+            cnt--;
+            /* Reduce the local->call_cnt as well */
+            dht_frame_return(frame);
+            continue;
+        }
+        readdirp_local->main_frame = frame;
+        readdirp_local->op_ret = 0;
+        readdirp_local->xattr = dict_ref(dict);
+        /* overload this field to save the subvol info */
+        readdirp_local->hashed_subvol = conf->subvolumes[i];
+
+        STACK_WIND_COOKIE(readdirp_frame, dht_rmdir_readdirp_cbk,
+                          conf->subvolumes[i], conf->subvolumes[i],
+                          conf->subvolumes[i]->fops->readdirp,
+                          readdirp_local->fd, 4096, 0, readdirp_local->xattr);
+    }
+
+    if (dict)
+        dict_unref(dict);
+
+    /* Could not wind readdirp to any subvol */
+
+    if (!cnt)
+        goto err;
+
+    return 0;
 
 err:
-	op_errno = (op_errno == -1) ? errno : op_errno;
-	DHT_STACK_UNWIND (frame, -1, op_errno);
+    if (is_last_call(this_call_cnt)) { /* only the final opendir reply */
+        dht_rmdir_do(frame, this);
+    }
 
-	return 0;
+    return 0;
 }
 
+int
+dht_rmdir(call_frame_t *frame, xlator_t *this, loc_t *loc, int flags,
+          dict_t *xdata)
+{
+    dht_local_t *local = NULL;
+    dht_conf_t *conf = NULL;
+    int op_errno = -1;
+    int i = -1;
+    int ret = -1;
+    dict_t *xattr_req = NULL;
+    /* rmdir fop entry: opendir on every subvol unless flags is non-zero. */
+    VALIDATE_OR_GOTO(frame, err);
+    VALIDATE_OR_GOTO(this, err);
+    VALIDATE_OR_GOTO(loc, err);
+    VALIDATE_OR_GOTO(loc->inode, err);
+    VALIDATE_OR_GOTO(loc->path, err);
+    VALIDATE_OR_GOTO(this->private, err);
+
+    conf = this->private;
+
+    local = dht_local_init(frame, loc, NULL, GF_FOP_RMDIR);
+    if (!local) {
+        op_errno = ENOMEM;
+        goto err;
+    }
+
+    local->call_cnt = conf->subvolume_cnt; /* one opendir reply per subvol */
+    local->op_ret = 0;
+    local->fop_succeeded = 0;
+
+    local->flags = flags;
+
+    local->fd = fd_create(local->loc.inode, frame->root->pid);
+    if (!local->fd) {
+        op_errno = ENOMEM;
+        goto err;
+    }
+
+    if (flags) { /* non-zero flags: go straight to dht_rmdir_do */
+        return dht_rmdir_do(frame, this);
+    }
+    if (xdata) {
+        xattr_req = dict_ref(xdata);
+    } else {
+        xattr_req = dict_new();
+    }
+    if (xattr_req) {
+        ret = dict_set_uint32(xattr_req, conf->link_xattr_name, 256);
+        /* If parallel-readdir is enabled, this is required
+         * to handle stale linkto files in the directory
+         * being deleted. If this fails, log an error but
+         * do not prevent the operation.
+         */
+        if (ret) {
+            gf_msg(this->name, GF_LOG_ERROR, 0, 0, "%s: failed to set key %s",
+                   loc->path, conf->link_xattr_name);
+        }
+    } else { /* dict_new() failed: log it and carry on without the key */
+        gf_msg(this->name, GF_LOG_ERROR, 0, 0, "%s: failed to set key %s",
+               loc->path, conf->link_xattr_name);
+    }
+    /* The cookie passed with each opendir is the subvol it was wound to. */
+    for (i = 0; i < conf->subvolume_cnt; i++) {
+        STACK_WIND_COOKIE(frame, dht_rmdir_opendir_cbk, conf->subvolumes[i],
+                          conf->subvolumes[i],
+                          conf->subvolumes[i]->fops->opendir, loc, local->fd,
+                          xattr_req);
+    }
+
+    if (xattr_req) {
+        dict_unref(xattr_req);
+    }
+    return 0;
 
-static int32_t
-dht_finodelk_cbk (call_frame_t *frame, void *cookie,
-		  xlator_t *this, int32_t op_ret, int32_t op_errno)
+err:
+    op_errno = (op_errno == -1) ? errno : op_errno; /* -1 means "not set" */
+    DHT_STACK_UNWIND(rmdir, frame, -1, op_errno, NULL, NULL, NULL);
 
-{
-	DHT_STACK_UNWIND (frame, op_ret, op_errno);
-	return 0;
+    return 0;
 }
 
+static int
+dht_entrylk_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+                int32_t op_ret, int32_t op_errno, dict_t *xdata)
 
-int32_t
-dht_finodelk (call_frame_t *frame, xlator_t *this,
-	      fd_t *fd, int32_t cmd, struct flock *lock)
 {
-	xlator_t     *subvol = NULL;
-        int           op_errno = -1;
+    DHT_STACK_UNWIND(entrylk, frame, op_ret, op_errno, xdata); /* as received */
+    return 0;
+}
+
+/* TODO
+ * Sending entrylk to the cached subvol can result in a stale lock,
+ * as described in bug 1311002.
+ */
+int
+dht_entrylk(call_frame_t *frame, xlator_t *this, const char *volume, loc_t *loc,
+            const char *basename, entrylk_cmd cmd, entrylk_type type,
+            dict_t *xdata)
+{
+    xlator_t *subvol = NULL;
+    int op_errno = -1;
+    dht_local_t *local = NULL;
+    char gfid[GF_UUID_BUF_SIZE] = {0};
 
-        VALIDATE_OR_GOTO (frame, err);
-        VALIDATE_OR_GOTO (this, err);
-        VALIDATE_OR_GOTO (fd, err);
+    VALIDATE_OR_GOTO(frame, err);
+    VALIDATE_OR_GOTO(this, err);
+    VALIDATE_OR_GOTO(loc, err);
+    VALIDATE_OR_GOTO(loc->inode, err);
 
-	subvol = dht_subvol_get_cached (this, fd->inode);
-	if (!subvol) {
-		gf_log (this->name, GF_LOG_ERROR,
-			"no cached subvolume for fd=%p", fd);
-		op_errno = EINVAL;
-		goto err;
-	}
+    local = dht_local_init(frame, loc, NULL, GF_FOP_ENTRYLK);
+    if (!local) {
+        op_errno = ENOMEM;
+        goto err;
+    }
 
+    subvol = local->cached_subvol; /* entrylk is sent to this subvol only */
+    if (!subvol) {
+        gf_uuid_unparse(loc->gfid, gfid);
 
-	STACK_WIND (frame,
-		    dht_finodelk_cbk,
-		    subvol, subvol->fops->finodelk,
-		    fd, cmd, lock);
+        gf_msg_debug(this->name, 0,
+                     "no cached subvolume for path=%s, "
+                     "gfid = %s",
+                     loc->path, gfid);
+        op_errno = EINVAL;
+        goto err;
+    }
 
-	return 0;
+    local->call_cnt = 1;
+    /* Single wind; dht_entrylk_cbk unwinds with the child's reply. */
+    STACK_WIND(frame, dht_entrylk_cbk, subvol, subvol->fops->entrylk, volume,
+               loc, basename, cmd, type, xdata);
+
+    return 0;
 
 err:
-	op_errno = (op_errno == -1) ? errno : op_errno;
-	DHT_STACK_UNWIND (frame, -1, op_errno);
+    op_errno = (op_errno == -1) ? errno : op_errno; /* -1 means "not set" */
+    DHT_STACK_UNWIND(entrylk, frame, -1, op_errno, NULL);
 
-	return 0;
+    return 0;
 }
 
-
-static int32_t
-dht_entrylk_cbk (call_frame_t *frame, void *cookie,
-		 xlator_t *this, int32_t op_ret, int32_t op_errno)
+static int
+dht_fentrylk_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+                 int32_t op_ret, int32_t op_errno, dict_t *xdata)
 
 {
-	DHT_STACK_UNWIND (frame, op_ret, op_errno);
-	return 0;
+    DHT_STACK_UNWIND(fentrylk, frame, op_ret, op_errno, xdata); /* as received */
+    return 0;
 }
 
-int32_t
-dht_entrylk (call_frame_t *frame, xlator_t *this,
-	     loc_t *loc, const char *basename,
-	     entrylk_cmd cmd, entrylk_type type)
-{
-	xlator_t     *subvol = NULL;
-        int           op_errno = -1;
-	dht_local_t  *local = NULL;
-
-        VALIDATE_OR_GOTO (frame, err);
-        VALIDATE_OR_GOTO (this, err);
-        VALIDATE_OR_GOTO (loc, err);
-        VALIDATE_OR_GOTO (loc->inode, err);
-        VALIDATE_OR_GOTO (loc->path, err);
-
-	subvol = dht_subvol_get_cached (this, loc->inode);
-	if (!subvol) {
-		gf_log (this->name, GF_LOG_ERROR,
-			"no cached subvolume for path=%s", loc->path);
-		op_errno = EINVAL;
-		goto err;
-	}
-
-	local = dht_local_init (frame);
-	if (!local) {
-		op_errno = ENOMEM;
-		gf_log (this->name, GF_LOG_ERROR,
-			"memory allocation failed :(");
-		goto err;
-	}
-
-	local->inode = inode_ref (loc->inode);
-	local->call_cnt = 1;
-
-	STACK_WIND (frame, dht_entrylk_cbk,
-		    subvol, subvol->fops->entrylk,
-		    loc, basename, cmd, type);
-
-	return 0;
+int
+dht_fentrylk(call_frame_t *frame, xlator_t *this, const char *volume, fd_t *fd,
+             const char *basename, entrylk_cmd cmd, entrylk_type type,
+             dict_t *xdata)
+{
+    xlator_t *subvol = NULL;
+    int op_errno = -1;
+    char gfid[GF_UUID_BUF_SIZE] = {0};
+    /* fd-based entrylk: sent to the cached subvol of fd->inode only. */
+    VALIDATE_OR_GOTO(frame, err);
+    VALIDATE_OR_GOTO(this, err);
+    VALIDATE_OR_GOTO(fd, err);
+    VALIDATE_OR_GOTO(fd->inode, err);
+
+    gf_uuid_unparse(fd->inode->gfid, gfid); /* only used in the debug log */
+
+    subvol = dht_subvol_get_cached(this, fd->inode);
+    if (!subvol) {
+        gf_msg_debug(this->name, 0,
+                     "No cached subvolume for fd=%p,"
+                     " gfid = %s",
+                     fd, gfid);
+        op_errno = EINVAL;
+        goto err;
+    }
+
+    STACK_WIND(frame, dht_fentrylk_cbk, subvol, subvol->fops->fentrylk, volume,
+               fd, basename, cmd, type, xdata);
+
+    return 0;
 
 err:
-	op_errno = (op_errno == -1) ? errno : op_errno;
-	DHT_STACK_UNWIND (frame, -1, op_errno);
+    op_errno = (op_errno == -1) ? errno : op_errno; /* -1 means "not set" */
+    DHT_STACK_UNWIND(fentrylk, frame, -1, op_errno, NULL);
 
-	return 0;
+    return 0;
 }
 
 static int32_t
-dht_fentrylk_cbk (call_frame_t *frame, void *cookie,
-		  xlator_t *this, int32_t op_ret, int32_t op_errno)
-
+dht_ipc_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret,
+            int32_t op_errno, dict_t *xdata)
 {
-	DHT_STACK_UNWIND (frame, op_ret, op_errno);
-	return 0;
+    dht_local_t *local = NULL;
+    int this_call_cnt = 0;
+    /* ipc reply from one subvol; the last reply unwinds the stored result. */
+    GF_VALIDATE_OR_GOTO("dht", frame, out);
+    GF_VALIDATE_OR_GOTO("dht", this, out);
+    GF_VALIDATE_OR_GOTO("dht", frame->local, out);
+
+    local = frame->local;
+
+    LOCK(&frame->lock);
+    {
+        if (op_ret < 0 && op_errno != ENOTCONN) {
+            local->op_errno = op_errno; /* op_ret is left as it was */
+            goto unlock;
+        }
+        local->op_ret = 0; /* reached on success and on ENOTCONN failures */
+    }
+unlock:
+    UNLOCK(&frame->lock);
+
+    this_call_cnt = dht_frame_return(frame);
+    if (is_last_call(this_call_cnt)) {
+        DHT_STACK_UNWIND(ipc, frame, local->op_ret, local->op_errno, NULL);
+    }
+
+out:
+    return 0;
 }
 
 int32_t
-dht_fentrylk (call_frame_t *frame, xlator_t *this,
-	      fd_t *fd, const char *basename,
-	      entrylk_cmd cmd, entrylk_type type)
+dht_ipc(call_frame_t *frame, xlator_t *this, int32_t op, dict_t *xdata)
 {
-	xlator_t     *subvol = NULL;
-        int           op_errno = -1;
+    dht_local_t *local = NULL;
+    int op_errno = EINVAL;
+    dht_conf_t *conf = NULL;
+    int call_cnt = 0;
+    int i = 0;
+    /* GF_IPC_TARGET_UPCALL goes to every subvol; other ops to FIRST_CHILD. */
+    VALIDATE_OR_GOTO(frame, err);
+    VALIDATE_OR_GOTO(this, err);
+
+    if (op != GF_IPC_TARGET_UPCALL)
+        goto wind_default;
 
-        VALIDATE_OR_GOTO (frame, err);
-        VALIDATE_OR_GOTO (this, err);
-        VALIDATE_OR_GOTO (fd, err);
+    VALIDATE_OR_GOTO(this->private, err);
+    conf = this->private;
 
-	subvol = dht_subvol_get_cached (this, fd->inode);
-	if (!subvol) {
-		gf_log (this->name, GF_LOG_ERROR,
-			"no cached subvolume for fd=%p", fd);
-		op_errno = EINVAL;
-		goto err;
-	}
+    local = dht_local_init(frame, NULL, NULL, GF_FOP_IPC);
+    if (!local) {
+        op_errno = ENOMEM;
+        goto err;
+    }
 
-	STACK_WIND (frame, dht_fentrylk_cbk,
-		    subvol, subvol->fops->fentrylk,
-		    fd, basename, cmd, type);
+    call_cnt = conf->subvolume_cnt;
+    local->call_cnt = call_cnt; /* one reply expected per subvol */
 
-	return 0;
+    if (xdata) {
+        if (dict_set_int8(xdata, conf->xattr_name, 0) < 0)
+            goto err; /* op_errno is still EINVAL here */
+    }
+    /* Replies are collected in dht_ipc_cbk. */
+    for (i = 0; i < call_cnt; i++) {
+        STACK_WIND(frame, dht_ipc_cbk, conf->subvolumes[i],
+                   conf->subvolumes[i]->fops->ipc, op, xdata);
+    }
+
+    return 0;
 
 err:
-	op_errno = (op_errno == -1) ? errno : op_errno;
-	DHT_STACK_UNWIND (frame, -1, op_errno);
+    DHT_STACK_UNWIND(ipc, frame, -1, op_errno, NULL);
+
+    return 0;
+
+wind_default:
+    STACK_WIND(frame, default_ipc_cbk, FIRST_CHILD(this),
+               FIRST_CHILD(this)->fops->ipc, op, xdata);
+    return 0;
+}
+
+int
+dht_forget(xlator_t *this, inode_t *inode)
+{
+    uint64_t ctx_int = 0;
+    dht_inode_ctx_t *ctx = NULL;
+    dht_layout_t *layout = NULL;
 
-	return 0;
+    inode_ctx_del(inode, this, &ctx_int); /* detach this xlator's inode ctx */
+
+    if (!ctx_int) /* still 0: presumably no ctx was stored for this inode */
+        return 0;
+
+    ctx = (dht_inode_ctx_t *)(long)ctx_int;
+    /* Drop the ctx's layout reference, then free the ctx itself. */
+    layout = ctx->layout;
+    ctx->layout = NULL;
+    dht_layout_unref(this, layout);
+    GF_FREE(ctx);
+
+    return 0;
 }
 
+int
+dht_notify(xlator_t *this, int event, void *data, ...)
+{
+    xlator_t *subvol = NULL;
+    int cnt = -1;
+    int i = -1;
+    dht_conf_t *conf = NULL;
+    int ret = -1;
+    int propagate = 0;
+    /* Records per-subvol events and decides what reaches default_notify(). */
+    int had_heard_from_all = 0;
+    int have_heard_from_all = 0;
+    gf_defrag_info_t *defrag = NULL;
+    dict_t *dict = NULL;
+    gf_defrag_type cmd = 0;
+    dict_t *output = NULL;
+    va_list ap;
+    struct gf_upcall *up_data = NULL;
+    struct gf_upcall_cache_invalidation *up_ci = NULL;
+
+    conf = this->private;
+    GF_VALIDATE_OR_GOTO(this->name, conf, out);
+
+    /* had all subvolumes reported status once till now? */
+    had_heard_from_all = 1;
+    for (i = 0; i < conf->subvolume_cnt; i++) {
+        if (!conf->last_event[i]) {
+            had_heard_from_all = 0;
+        }
+    }
+
+    switch (event) {
+        case GF_EVENT_CHILD_UP:
+            subvol = data;
+
+            conf->gen++;
+
+            for (i = 0; i < conf->subvolume_cnt; i++) {
+                if (subvol == conf->subvolumes[i]) {
+                    cnt = i;
+                    break;
+                }
+            }
+            /* cnt is still -1 when data is not one of conf->subvolumes. */
+            if (cnt == -1) {
+                gf_msg_debug(this->name, 0,
+                             "got GF_EVENT_CHILD_UP bad "
+                             "subvolume %s",
+                             subvol->name);
+                break;
+            }
+
+            LOCK(&conf->subvolume_lock);
+            {
+                conf->subvolume_status[cnt] = 1;
+                conf->last_event[cnt] = event;
+                conf->subvol_up_time[cnt] = gf_time();
+            }
+            UNLOCK(&conf->subvolume_lock);
+
+            /* one of the nodes came back up, do a stat update */
+            dht_get_du_info_for_subvol(this, cnt);
+
+            break;
+
+        case GF_EVENT_SOME_DESCENDENT_UP:
+            subvol = data;
+            conf->gen++;
+            propagate = 1;
+
+            break;
+
+        case GF_EVENT_SOME_DESCENDENT_DOWN:
+            subvol = data;
+            propagate = 1;
+
+            break;
+
+        case GF_EVENT_CHILD_DOWN:
+            subvol = data;
+
+            if (conf->assert_no_child_down) {
+                gf_msg(this->name, GF_LOG_WARNING, 0, DHT_MSG_CHILD_DOWN,
+                       "Received CHILD_DOWN. Exiting");
+                if (conf->defrag) {
+                    gf_defrag_stop(conf, GF_DEFRAG_STATUS_FAILED, NULL);
+                } else {
+                    kill(getpid(), SIGTERM);
+                }
+            }
+
+            for (i = 0; i < conf->subvolume_cnt; i++) {
+                if (subvol == conf->subvolumes[i]) {
+                    cnt = i;
+                    break;
+                }
+            }
+
+            if (cnt == -1) {
+                gf_msg_debug(this->name, 0,
+                             "got GF_EVENT_CHILD_DOWN bad "
+                             "subvolume %s",
+                             subvol->name);
+                break;
+            }
+
+            LOCK(&conf->subvolume_lock);
+            {
+                conf->subvolume_status[cnt] = 0;
+                conf->last_event[cnt] = event;
+                conf->subvol_up_time[cnt] = 0;
+            }
+            UNLOCK(&conf->subvolume_lock);
+            /* Stays CHILD_DOWN only if every subvol's last event matches. */
+            for (i = 0; i < conf->subvolume_cnt; i++)
+                if (conf->last_event[i] != event)
+                    event = GF_EVENT_SOME_DESCENDENT_DOWN;
+            break;
+
+        case GF_EVENT_CHILD_CONNECTING:
+            subvol = data;
+
+            for (i = 0; i < conf->subvolume_cnt; i++) {
+                if (subvol == conf->subvolumes[i]) {
+                    cnt = i;
+                    break;
+                }
+            }
+
+            if (cnt == -1) {
+                gf_msg_debug(this->name, 0,
+                             "got GF_EVENT_CHILD_CONNECTING"
+                             " bad subvolume %s",
+                             subvol->name);
+                break;
+            }
+
+            LOCK(&conf->subvolume_lock);
+            {
+                conf->last_event[cnt] = event;
+            }
+            UNLOCK(&conf->subvolume_lock);
+
+            break;
+        case GF_EVENT_VOLUME_DEFRAG: {
+            if (!conf->defrag) {
+                return ret; /* ret is still -1 here */
+            }
+            defrag = conf->defrag;
+
+            dict = data;
+            va_start(ap, data);
+            output = va_arg(ap, dict_t *); /* first variadic argument */
+
+            ret = dict_get_int32(dict, "rebalance-command", (int32_t *)&cmd);
+            if (ret) {
+                va_end(ap);
+                return ret;
+            }
+            LOCK(&defrag->lock);
+            {
+                if (defrag->is_exiting)
+                    goto unlock;
+                if ((cmd == GF_DEFRAG_CMD_STATUS) ||
+                    (cmd == GF_DEFRAG_CMD_DETACH_STATUS))
+                    gf_defrag_status_get(conf, output);
+                else if (cmd == GF_DEFRAG_CMD_DETACH_START)
+                    defrag->cmd = GF_DEFRAG_CMD_DETACH_START;
+                else if (cmd == GF_DEFRAG_CMD_STOP ||
+                         cmd == GF_DEFRAG_CMD_DETACH_STOP)
+                    gf_defrag_stop(conf, GF_DEFRAG_STATUS_STOPPED, output);
+            }
+        unlock:
+            UNLOCK(&defrag->lock);
+            va_end(ap);
+            return ret; /* defrag commands are never propagated */
+            break; /* not reached */
+        }
+        case GF_EVENT_UPCALL:
+            up_data = (struct gf_upcall *)data;
+            if (up_data->event_type != GF_UPCALL_CACHE_INVALIDATION)
+                break;
+            up_ci = (struct gf_upcall_cache_invalidation *)up_data->data;
+
+            /* Since md-cache will be aggressively filtering lookups,
+             * the stale layout issue will be more pronounced. Hence
+             * when a layout xattr is changed by the rebalance process
+             * notify all the md-cache clients to invalidate the existing
+             * stat cache and send the lookup next time*/
+            if (up_ci->dict && dict_get(up_ci->dict, conf->xattr_name))
+                up_ci->flags |= UP_EXPLICIT_LOOKUP;
+
+            /* TODO: Instead of invalidating iatt, update the new
+             * hashed/cached subvolume in dht inode_ctx */
+            if (IS_DHT_LINKFILE_MODE(&up_ci->stat))
+                up_ci->flags |= UP_EXPLICIT_LOOKUP;
+
+            propagate = 1;
+            break;
+        default:
+            propagate = 1;
+            break;
+    }
+
+    /* have all subvolumes reported status once by now? */
+    have_heard_from_all = 1;
+    for (i = 0; i < conf->subvolume_cnt; i++) {
+        if (!conf->last_event[i])
+            have_heard_from_all = 0;
+    }
+
+    /* if all subvols have reported status, no need to hide anything
+       or wait for anything else. Just propagate blindly */
+    if (have_heard_from_all) {
+        propagate = 1;
+    }
+
+    if (!had_heard_from_all && have_heard_from_all) {
+        static int run_defrag = 0;
+        /* This is the first event which completes aggregation
+           of events from all subvolumes. If at least one subvol
+           had come up, propagate CHILD_UP, but only this time
+        */
+        event = GF_EVENT_CHILD_DOWN;
+
+        for (i = 0; i < conf->subvolume_cnt; i++) {
+            if (conf->last_event[i] == GF_EVENT_CHILD_UP) {
+                event = GF_EVENT_CHILD_UP;
+                break;
+            }
+
+            if (conf->last_event[i] == GF_EVENT_CHILD_CONNECTING) {
+                event = GF_EVENT_CHILD_CONNECTING;
+                /* continue to check other events for CHILD_UP */
+            }
+        }
+
+        /* Rebalance is started with assert_no_child_down. So we do
+         * not need to handle CHILD_DOWN event here.
+         *
+         * If there is a graph switch, we should not restart the
+         * rebalance daemon. Use 'run_defrag' to indicate if the
+         * thread has already started.
+         */
+        if (conf->defrag && !run_defrag) {
+            run_defrag = 1;
+            ret = gf_thread_create(&conf->defrag->th, NULL, gf_defrag_start,
+                                   this, "dhtdg");
+            if (ret) {
+                GF_FREE(conf->defrag);
+                conf->defrag = NULL;
+                kill(getpid(), SIGTERM);
+            }
+        }
+    }
+    /* Events that are not propagated return 0. */
+    ret = 0;
+    if (propagate)
+        ret = default_notify(this, event, data);
+out:
+    return ret;
+}
 
 int
-dht_forget (xlator_t *this, inode_t *inode)
+dht_inode_ctx_layout_get(inode_t *inode, xlator_t *this, dht_layout_t **layout)
 {
-	uint64_t      tmp_layout = 0;
-	dht_layout_t *layout = NULL;
+    dht_inode_ctx_t *ctx = NULL;
+    int ret = -1;
 
-	inode_ctx_get (inode, this, &tmp_layout);
+    ret = dht_inode_ctx_get(inode, this, &ctx);
 
-	if (!layout)
-		return 0;
-	layout = (dht_layout_t *)(long)tmp_layout;
-	if (!layout->preset)
-		FREE (layout);
+    if (!ret && ctx) {
+        if (ctx->layout) {
+            if (layout) /* a NULL out-pointer turns this into a presence check */
+                *layout = ctx->layout; /* plain pointer copy, no ref taken here */
+            ret = 0;
+        } else {
+            ret = -1; /* the ctx exists but holds no layout */
+        }
+    }
 
-	return 0;
+    return ret;
 }
 
+void
+dht_log_new_layout_for_dir_selfheal(xlator_t *this, loc_t *loc,
+                                    dht_layout_t *layout)
+{
+    char string[2048] = {0};
+    char *output_string = NULL;
+    int len = 0;
+    int off = 0;
+    int i = 0;
+    gf_loglevel_t log_level = gf_log_get_loglevel();
+    int ret = 0;
 
+    if (log_level < GF_LOG_INFO)
+        return;
 
-static int
-dht_init_subvolumes (xlator_t *this, dht_conf_t *conf)
+    if (!layout)
+        return;
+
+    if (!layout->cnt)
+        return;
+
+    if (!loc)
+        return;
+
+    if (!loc->path)
+        return;
+
+    ret = snprintf(string, sizeof(string), "Setting layout of %s with ",
+                   loc->path);
+
+    if (ret < 0)
+        return;
+
+    len += ret;
+
+    /* Calculate the total length of the string required to allocate
+     * output_string. The log includes subvolume-name, start-range,
+     * end-range, commit-hash and err value.
+     *
+     * This log will help to debug cases where:
+     * a) Different processes set different layout of a directory.
+     * b) Error captured in lookup, which will be filled in layout->err
+     * (like ENOENT, ESTALE etc)
+     */
+
+    for (i = 0; i < layout->cnt; i++) {
+        ret = snprintf(string, sizeof(string),
+                       "[Subvol_name: %s, Err: %d , Start: "
+                       "0x%x, Stop: 0x%x, Hash: 0x%x], ",
+                       layout->list[i].xlator->name, layout->list[i].err,
+                       layout->list[i].start, layout->list[i].stop,
+                       layout->list[i].commit_hash);
+
+        if (ret < 0)
+            return;
+
+        len += ret;
+    }
+
+    len++;
+
+    output_string = GF_MALLOC(len + 1, gf_common_mt_char);
+
+    if (!output_string)
+        return;
+
+    ret = snprintf(output_string, len + 1, "Setting layout of %s with ",
+                   loc->path);
+
+    if (ret < 0)
+        goto err;
+
+    off += ret;
+
+    for (i = 0; i < layout->cnt; i++) {
+        ret = snprintf(output_string + off, len - off,
+                       "[Subvol_name: %s, Err: %d , Start: "
+                       "0x%x, Stop: 0x%x, Hash: 0x%x], ",
+                       layout->list[i].xlator->name, layout->list[i].err,
+                       layout->list[i].start, layout->list[i].stop,
+                       layout->list[i].commit_hash);
+
+        if (ret < 0)
+            goto err;
+
+        off += ret;
+    }
+
+    gf_msg(this->name, GF_LOG_DEBUG, 0, DHT_MSG_LOG_FIXED_LAYOUT, "%s",
+           output_string);
+
+err:
+    GF_FREE(output_string);
+}
+
+int32_t
+dht_migration_get_dst_subvol(xlator_t *this, dht_local_t *local)
 {
-        xlator_list_t *subvols = NULL;
-        int            cnt = 0;
+    int ret = -1;
 
+    if (!local)
+        goto out;
 
-        for (subvols = this->children; subvols; subvols = subvols->next)
-                cnt++;
+    local->rebalance.target_node = dht_subvol_get_hashed(this, &local->loc);
 
-        conf->subvolumes = CALLOC (cnt, sizeof (xlator_t *));
-        if (!conf->subvolumes) {
-                gf_log (this->name, GF_LOG_ERROR,
-                        "memory allocation failed :(");
-                return -1;
-        }
-        conf->subvolume_cnt = cnt;
+    if (local->rebalance.target_node)
+        ret = 0;
 
-        cnt = 0;
-        for (subvols = this->children; subvols; subvols = subvols->next)
-                conf->subvolumes[cnt++] = subvols->xlator;
+out:
+    return ret;
+}
 
-	conf->subvolume_status = CALLOC (cnt, sizeof (char));
-	if (!conf->subvolume_status) {
-		gf_log (this->name, GF_LOG_ERROR,
-			"memory allocation failed :(");
-		return -1;
-	}
+/*
+This function should not be called more than once during a FOP
+handling path. It is valid only for ops on files.
+*/
+int32_t
+dht_set_local_rebalance(xlator_t *this, dht_local_t *local, struct iatt *stbuf,
+                        struct iatt *prebuf, struct iatt *postbuf,
+                        dict_t *xdata)
+{
+    if (!local)
+        return -1;
 
-        return 0;
+    if (local->rebalance.set) {
+        gf_msg(this->name, GF_LOG_WARNING, 0, DHT_MSG_REBAL_STRUCT_SET,
+               "local->rebalance already set");
+    }
+
+    if (stbuf)
+        memcpy(&local->rebalance.stbuf, stbuf, sizeof(struct iatt));
+
+    if (prebuf)
+        memcpy(&local->rebalance.prebuf, prebuf, sizeof(struct iatt));
+
+    if (postbuf)
+        memcpy(&local->rebalance.postbuf, postbuf, sizeof(struct iatt));
+
+    if (xdata)
+        local->rebalance.xdata = dict_ref(xdata);
+
+    local->rebalance.set = 1;
+
+    return 0;
 }
 
+int32_t
+dht_release(xlator_t *this, fd_t *fd)
+{
+    return dht_fd_ctx_destroy(this, fd);
+}
 
-int
-dht_notify (xlator_t *this, int event, void *data, ...)
+static int
+dht_pt_mkdir_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int op_ret,
+                 int op_errno, inode_t *inode, struct iatt *stbuf,
+                 struct iatt *preparent, struct iatt *postparent, dict_t *xdata)
 {
-	xlator_t   *subvol = NULL;
-	int         cnt    = -1;
-	int         i      = -1;
-	dht_conf_t *conf   = NULL;
-	int         ret    = -1;
+    dht_local_t *local = NULL;
 
+    local = frame->local;
 
-	conf = this->private;
+    if (!op_ret) {
+        dht_layout_set(this, inode, local->layout);
+    }
 
-	switch (event) {
-	case GF_EVENT_CHILD_UP:
-		subvol = data;
+    DHT_STACK_UNWIND(mkdir, frame, op_ret, op_errno, inode, stbuf, preparent,
+                     postparent, NULL);
 
-		conf->gen++;
+    return 0;
+}
 
-		for (i = 0; i < conf->subvolume_cnt; i++) {
-			if (subvol == conf->subvolumes[i]) {
-				cnt = i;
-				break;
-			}
-		}
+int32_t
+dht_pt_mkdir(call_frame_t *frame, xlator_t *this, loc_t *loc, mode_t mode,
+             mode_t umask, dict_t *xdata)
+{
+    dht_layout_t *layout = NULL;
+    dht_conf_t *conf = NULL;
+    dht_local_t *local = NULL;
+    bool free_xdata = false;
+    int ret = 0;
+    int op_errno = 0;
+    int32_t *disk_layout_p = NULL;
+
+    conf = this->private;
+
+    local = dht_local_init(frame, loc, NULL, GF_FOP_MKDIR);
+    if (!local) {
+        op_errno = ENOMEM;
+        goto err;
+    }
+
+    layout = dht_layout_new(this, conf->subvolume_cnt);
+    if (!layout)
+        goto wind;
+
+    local->layout = layout;
+
+    if (!xdata) {
+        xdata = dict_new();
+        if (!xdata)
+            goto wind;
+        free_xdata = true;
+    }
+
+    /*Set the xlator or the following will crash*/
+    layout->list[0].xlator = conf->subvolumes[0];
+
+    dht_selfheal_layout_new_directory(frame, loc, layout);
+
+    dht_disk_layout_extract(this, layout, 0, &disk_layout_p);
+
+    ret = dict_set_bin(xdata, conf->xattr_name, disk_layout_p, 4 * 4);
+    if (ret) {
+        gf_msg("dht", GF_LOG_DEBUG, EINVAL, DHT_MSG_DICT_SET_FAILED,
+               "dht layout dict set failed");
+    }
+wind:
+    STACK_WIND(frame, dht_pt_mkdir_cbk, FIRST_CHILD(this),
+               FIRST_CHILD(this)->fops->mkdir, loc, mode, umask, xdata);
+    if (free_xdata)
+        dict_unref(xdata);
+    return 0;
 
-		if (cnt == -1) {
-			gf_log (this->name, GF_LOG_ERROR,
-				"got GF_EVENT_CHILD_UP bad subvolume %s",
-				subvol->name);
-			break;
-		}
+err:
+    op_errno = local ? local->op_errno : op_errno;
+    DHT_STACK_UNWIND(mkdir, frame, -1, op_errno, NULL, NULL, NULL, NULL, NULL);
 
-		LOCK (&conf->subvolume_lock);
-		{
-			conf->subvolume_status[cnt] = 1;
-		}
-		UNLOCK (&conf->subvolume_lock);
+    return 0;
+}
 
-		break;
+static int
+dht_pt_getxattr_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+                    int op_ret, int op_errno, dict_t *xattr, dict_t *xdata)
+{
+    dht_conf_t *conf = NULL;
 
-	case GF_EVENT_CHILD_DOWN:
-		subvol = data;
+    conf = this->private;
+    dict_del(xattr, conf->xattr_name);
+    dict_del(xattr, conf->mds_xattr_key);
+    dict_del(xattr, conf->commithash_xattr_name);
 
-		for (i = 0; i < conf->subvolume_cnt; i++) {
-			if (subvol == conf->subvolumes[i]) {
-				cnt = i;
-				break;
-			}
-		}
+    if (frame->root->pid >= 0) {
+        GF_REMOVE_INTERNAL_XATTR("trusted.glusterfs.quota*", xattr);
+        GF_REMOVE_INTERNAL_XATTR("trusted.pgfid*", xattr);
+    }
 
-		if (cnt == -1) {
-			gf_log (this->name, GF_LOG_ERROR,
-				"got GF_EVENT_CHILD_DOWN bad subvolume %s",
-				subvol->name);
-			break;
-		}
+    DHT_STACK_UNWIND(getxattr, frame, op_ret, op_errno, xattr, xdata);
+    return 0;
+}
 
-		LOCK (&conf->subvolume_lock);
-		{
-			conf->subvolume_status[cnt] = 0;
-		}
-		UNLOCK (&conf->subvolume_lock);
+int
+dht_pt_getxattr(call_frame_t *frame, xlator_t *this, loc_t *loc,
+                const char *key, dict_t *xdata)
+{
+    STACK_WIND(frame, dht_pt_getxattr_cbk, FIRST_CHILD(this),
+               FIRST_CHILD(this)->fops->getxattr, loc, key, xdata);
+    return 0;
+}
 
-		break;
-	}
+static int
+dht_pt_fgetxattr_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+                     int op_ret, int op_errno, dict_t *xattr, dict_t *xdata)
+{
+    dht_conf_t *conf = NULL;
 
-	ret = default_notify (this, event, data);
+    conf = this->private;
+    dict_del(xattr, conf->xattr_name);
 
-	return ret;
+    if (frame->root->pid >= 0) {
+        GF_REMOVE_INTERNAL_XATTR("trusted.glusterfs.quota*", xattr);
+        GF_REMOVE_INTERNAL_XATTR("trusted.pgfid*", xattr);
+    }
+
+    DHT_STACK_UNWIND(fgetxattr, frame, op_ret, op_errno, xattr, xdata);
+    return 0;
+}
+
+int
+dht_pt_fgetxattr(call_frame_t *frame, xlator_t *this, fd_t *fd, const char *key,
+                 dict_t *xdata)
+{
+    STACK_WIND(frame, dht_pt_fgetxattr_cbk, FIRST_CHILD(this),
+               FIRST_CHILD(this)->fops->fgetxattr, fd, key, xdata);
+    return 0;
 }
 
+/* Return 0 if any layout entry has err == 0; otherwise return list[0].err.
+ * NOTE(review): layout is used without a NULL check or unref - confirm. */
+int
+dht_dir_layout_error_check(xlator_t *this, inode_t *inode)
+{
+    dht_layout_t *layout = NULL;
+    int i = 0;
+
+    layout = dht_layout_get(this, inode);
+    for (i = 0; i < layout->cnt; i++) {
+        if (layout->list[i].err == 0) {
+            return 0;
+        }
+    }
+
+    /* Every entry has a non-zero err: report the first entry's err */
+    return layout->list[0].err;
+}
diff --git a/xlators/cluster/dht/src/dht-common.h b/xlators/cluster/dht/src/dht-common.h
index ed154dc1266..fe0dc3db34a 100644
--- a/xlators/cluster/dht/src/dht-common.h
+++ b/xlators/cluster/dht/src/dht-common.h
@@ -1,217 +1,1384 @@
 /*
-   Copyright (c) 2008-2009 Z RESEARCH, Inc. <http://www.zresearch.com>
-   This file is part of GlusterFS.
-
-   GlusterFS is free software; you can redistribute it and/or modify
-   it under the terms of the GNU General Public License as published
-   by the Free Software Foundation; either version 3 of the License,
-   or (at your option) any later version.
-
-   GlusterFS is distributed in the hope that it will be useful, but
-   WITHOUT ANY WARRANTY; without even the implied warranty of
-   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
-   General Public License for more details.
-
-   You should have received a copy of the GNU General Public License
-   along with this program.  If not, see
-   <http://www.gnu.org/licenses/>.
+  Copyright (c) 2008-2012 Red Hat, Inc. <http://www.redhat.com>
+  This file is part of GlusterFS.
+
+  This file is licensed to you under your choice of the GNU Lesser
+  General Public License, version 3 or any later version (LGPLv3 or
+  later), or the GNU General Public License, version 2 (GPLv2), in all
+  cases as published by the Free Software Foundation.
 */
 
-#ifndef _CONFIG_H
-#define _CONFIG_H
-#include "config.h"
-#endif
+#include <regex.h>
+
+#include "dht-mem-types.h"
+#include "dht-messages.h"
+#include <glusterfs/call-stub.h>
+#include "libxlator.h"
+#include <glusterfs/syncop.h>
+#include <glusterfs/refcount.h>
+#include <glusterfs/timer.h>
+#include "protocol-common.h"
+#include <glusterfs/glusterfs-acl.h>
 
 #ifndef _DHT_H
 #define _DHT_H
 
+#define GF_XATTR_FIX_LAYOUT_KEY "distribute.fix.layout"
+#define GF_XATTR_FILE_MIGRATE_KEY "trusted.distribute.migrate-data"
+#define DHT_MDS_STR "mds"
+#define GF_DHT_LOOKUP_UNHASHED_OFF 0
+#define GF_DHT_LOOKUP_UNHASHED_ON 1
+#define GF_DHT_LOOKUP_UNHASHED_AUTO 2
+#define DHT_PATHINFO_HEADER "DISTRIBUTE:"
+#define DHT_FILE_MIGRATE_DOMAIN "dht.file.migrate"
+/* Layout synchronization */
+#define DHT_LAYOUT_HEAL_DOMAIN "dht.layout.heal"
+/* Namespace synchronization */
+#define DHT_ENTRY_SYNC_DOMAIN "dht.entry.sync"
+#define DHT_LAYOUT_HASH_INVALID 1
+#define MAX_REBAL_THREADS sysconf(_SC_NPROCESSORS_ONLN)
+
+#define DHT_DIR_STAT_BLOCKS 8
+#define DHT_DIR_STAT_SIZE 4096
+
+/* Virtual xattr for subvols status */
+
+#define DHT_SUBVOL_STATUS_KEY "dht.subvol.status"
+
+/* Virtual xattrs for debugging */
 
-typedef int (*dht_selfheal_dir_cbk_t) (call_frame_t *frame, void *cookie,
-				       xlator_t *this,
-				       int32_t op_ret, int32_t op_errno);
+#define DHT_DBG_HASHED_SUBVOL_PATTERN "dht.file.hashed-subvol.*"
+#define DHT_DBG_HASHED_SUBVOL_KEY "dht.file.hashed-subvol."
 
+/* Rebalance nodeuuid flags */
+#define REBAL_NODEUUID_MINE 0x01
+
+typedef int (*dht_selfheal_dir_cbk_t)(call_frame_t *frame, void *cookie,
+                                      xlator_t *this, int32_t op_ret,
+                                      int32_t op_errno, dict_t *xdata);
+typedef int (*dht_defrag_cbk_fn_t)(xlator_t *this, xlator_t *dst_node,
+                                   call_frame_t *frame, int ret);
+
+typedef int (*dht_refresh_layout_unlock)(call_frame_t *frame, xlator_t *this,
+                                         int op_ret, int invoke_cbk);
+
+typedef int (*dht_refresh_layout_done_handle)(call_frame_t *frame);
 
 struct dht_layout {
-        int               cnt;
-	int               preset;
-        int               gen;
-	int               type;
-        struct {
-		int       err;   /* 0 = normal
-				   -1 = dir exists and no xattr
-				   >0 = dir lookup failed with errno
-				 */
-                uint32_t  start;
-                uint32_t  stop;
-                xlator_t *xlator;
-        } list[0];
+    int spread_cnt; /* layout spread count per directory,
+                       is controlled by 'setxattr()' with
+                       special key */
+    int cnt;
+    int preset;
+    /*
+     * The last *configuration* state for which this directory was known
+     * to be in balance.  The corresponding vol_commit_hash changes
+     * whenever bricks are added or removed.  This value changes when a
+     * (full) rebalance is complete.  If they match, it's safe to assume
+     * that every file is where it should be and there's no need to do
+     * lookups for files elsewhere.  If they don't, then we have to do a
+     * global lookup to be sure.
+     */
+    uint32_t commit_hash;
+    /*
+     * The *runtime* state of the volume, changes when connections to
+     * bricks are made or lost.
+     */
+    int gen;
+    int type;
+    gf_atomic_t ref; /* use with dht_conf_t->layout_lock */
+    uint32_t search_unhashed;
+    struct {
+        int err; /* 0 = normal
+                    -1 = dir exists and no xattr
+                    >0 = dir lookup failed with errno
+                 */
+        uint32_t start;
+        uint32_t stop;
+        uint32_t commit_hash;
+        xlator_t *xlator;
+    } list[];
 };
 typedef struct dht_layout dht_layout_t;
 
+struct dht_stat_time {
+    uint32_t atime;
+    uint32_t atime_nsec;
+    uint32_t ctime;
+    uint32_t ctime_nsec;
+    uint32_t mtime;
+    uint32_t mtime_nsec;
+};
+
+typedef struct dht_stat_time dht_stat_time_t;
+
+struct dht_inode_ctx {
+    dht_layout_t *layout;
+    dht_stat_time_t time;
+    xlator_t *lock_subvol;
+    xlator_t *mds_subvol; /* This is only used for directories */
+};
+
+typedef struct dht_inode_ctx dht_inode_ctx_t;
+
+typedef enum {
+    DHT_HASH_TYPE_DM,
+    DHT_HASH_TYPE_DM_USER,
+} dht_hashfn_type_t;
+
+typedef enum {
+    DHT_INODELK,
+    DHT_ENTRYLK,
+} dht_lock_type_t;
+
+/* rebalance related */
+struct dht_rebalance_ {
+    xlator_t *from_subvol;
+    xlator_t *target_node;
+    off_t offset;
+    size_t size;
+    int32_t flags;
+    int count;
+    struct iobref *iobref;
+    struct iovec *vector;
+    struct iatt stbuf;
+    struct iatt prebuf;
+    struct iatt postbuf;
+    dht_defrag_cbk_fn_t target_op_fn;
+    dict_t *xdata;
+    dict_t *xattr;
+    dict_t *dict;
+    struct gf_flock flock;
+    int32_t set;
+    int lock_cmd;
+};
+
+/**
+ * Enum to store decided action based on the qdstatfs (quota-deem-statfs)
+ * events
+ **/
+typedef enum {
+    qdstatfs_action_OFF = 0,
+    qdstatfs_action_REPLACE,
+    qdstatfs_action_NEGLECT,
+    qdstatfs_action_COMPARE,
+} qdstatfs_action_t;
+
+typedef enum {
+    REACTION_INVALID,
+    FAIL_ON_ANY_ERROR,
+    IGNORE_ENOENT_ESTALE,
+    IGNORE_ENOENT_ESTALE_EIO,
+} dht_reaction_type_t;
+
+struct dht_skip_linkto_unlink {
+    xlator_t *hash_links_to;
+    uuid_t cached_gfid;
+    uuid_t hashed_gfid;
+    int opend_fd_count;
+    gf_boolean_t handle_valid_link;
+};
+
+typedef struct {
+    xlator_t *xl;
+    loc_t loc;      /* contains/points to inode to lock on. */
+    char *domain;   /* Only locks within a single domain
+                     * contend with each other
+                     */
+    char *basename; /* Required for entrylk */
+    gf_boolean_t locked;
+    dht_reaction_type_t do_on_failure;
+    short type; /* read/write lock.                     */
+    gf_lkowner_t lk_owner;
+} dht_lock_t;
+
+/* The lock structure represents inodelk. */
+typedef struct {
+    fop_inodelk_cbk_t inodelk_cbk;
+    dht_lock_t **locks;
+    int lk_count;
+    dht_reaction_type_t reaction;
+
+    /* whether locking failed on _any_ of the "locks" above */
+    int op_ret;
+    int op_errno;
+} dht_ilock_wrap_t;
+
+/* The lock structure represents entrylk. */
+typedef struct {
+    fop_entrylk_cbk_t entrylk_cbk;
+    dht_lock_t **locks;
+    int lk_count;
+    dht_reaction_type_t reaction;
+
+    /* whether locking failed on _any_ of the "locks" above */
+    int op_ret;
+    int op_errno;
+} dht_elock_wrap_t;
+
+/* The first member of dht_dir_transaction_t should be of type dht_ilock_wrap_t.
+ * Otherwise it can result in subtle memory corruption issues as in most of the
+ * places we use lock[0].layout.my_layout or lock[0].layout.parent_layout and
+ * lock[0].ns.parent_layout (like in dht_local_wipe).
+ */
+typedef union {
+    union {
+        dht_ilock_wrap_t my_layout;
+        dht_ilock_wrap_t parent_layout;
+    } layout;
+    struct dht_namespace {
+        dht_ilock_wrap_t parent_layout;
+        dht_elock_wrap_t directory_ns;
+        fop_entrylk_cbk_t ns_cbk;
+    } ns;
+} dht_dir_transaction_t;
+
+typedef int (*dht_selfheal_layout_t)(call_frame_t *frame, loc_t *loc,
+                                     dht_layout_t *layout);
+
+typedef gf_boolean_t (*dht_need_heal_t)(call_frame_t *frame,
+                                        dht_layout_t **inmem,
+                                        dht_layout_t **ondisk);
 
 struct dht_local {
-	int                      call_cnt;
-	loc_t                    loc;
-	loc_t                    loc2;
-	int                      op_ret;
-	int                      op_errno;
-	int                      layout_mismatch;
-	struct stat              stbuf;
-	struct statvfs           statvfs;
-	fd_t                    *fd;
-	inode_t                 *inode;
-	dict_t                  *xattr;
-	dict_t                  *xattr_req;
-	dht_layout_t            *layout;
-	size_t                   size;
-	ino_t                    st_ino;
-	xlator_t                *src_hashed, *src_cached;
-	xlator_t                *dst_hashed, *dst_cached;
-	xlator_t                *cached_subvol;
-	xlator_t                *hashed_subvol;
-	char                     need_selfheal;
-	struct {
-		fop_mknod_cbk_t  linkfile_cbk;
-		struct stat      stbuf;
-		loc_t            loc;
-		inode_t         *inode;
-		dict_t          *xattr;
-		xlator_t        *srcvol;
-	} linkfile;
-	struct {
-		uint32_t         hole_cnt;
-		uint32_t         overlaps_cnt;
-		uint32_t         missing;
-		uint32_t         down;
-		uint32_t         misc;
-		dht_selfheal_dir_cbk_t   dir_cbk;
-		dht_layout_t    *layout;
-	} selfheal;
-
-	/* needed by nufa */
-	int32_t flags;
-	mode_t  mode;
-	dev_t   rdev;
+    loc_t loc;
+    loc_t loc2;
+    int call_cnt;
+    int op_ret;
+    int op_errno;
+    int layout_mismatch;
+    /* Use stbuf as the postbuf, when we require both
+     * pre and post attrs */
+    struct iatt stbuf;
+    struct iatt mds_stbuf;
+    struct iatt prebuf;
+    struct iatt preoldparent;
+    struct iatt postoldparent;
+    struct iatt preparent;
+    struct iatt postparent;
+    struct statvfs statvfs;
+    fd_t *fd;
+    inode_t *inode;
+    dict_t *params;
+    dict_t *xattr;
+    dict_t *mds_xattr;
+    dict_t *xdata; /* dict used to save xdata response by xattr fop */
+    dict_t *xattr_req;
+    dht_layout_t *layout;
+    size_t size;
+    ino_t ia_ino;
+    xlator_t *src_hashed, *src_cached;
+    xlator_t *dst_hashed, *dst_cached;
+    xlator_t *cached_subvol;
+    xlator_t *hashed_subvol;
+    xlator_t *mds_subvol; /* This is use for dir only */
+    int file_count;
+    int dir_count;
+    call_frame_t *main_frame;
+    int fop_succeeded;
+    struct {
+        fop_mknod_cbk_t linkfile_cbk;
+        struct iatt stbuf;
+        loc_t loc;
+        inode_t *inode;
+        dict_t *xattr;
+        xlator_t *srcvol;
+    } linkfile;
+    struct {
+        uint32_t hole_cnt;
+        uint32_t overlaps_cnt;
+        uint32_t down;
+        uint32_t misc;
+        dht_selfheal_dir_cbk_t dir_cbk;
+        dht_selfheal_layout_t healer;
+        dht_need_heal_t should_heal;
+        dht_layout_t *layout, *refreshed_layout;
+        uint32_t missing_cnt;
+        gf_boolean_t force_mkdir;
+    } selfheal;
+
+    dht_refresh_layout_unlock refresh_layout_unlock;
+    dht_refresh_layout_done_handle refresh_layout_done;
+
+    uint32_t uid;
+    uint32_t gid;
+    pid_t pid;
+
+    glusterfs_fop_t fop;
+
+    /* needed for file-info */
+    char *xattr_val;
+    char *key;
+
+    /* needed by nufa */
+    int32_t flags;
+    mode_t mode;
+    dev_t rdev;
+    mode_t umask;
+
+    /* which xattr request? */
+    char xsel[256];
+    int32_t alloc_len;
+
+    /* gfid related */
+    uuid_t gfid;
+    uuid_t gfid_req;
+
+    xlator_t *link_subvol;
+
+    struct dht_rebalance_ rebalance;
+    xlator_t *first_up_subvol;
+
+    struct dht_skip_linkto_unlink skip_unlink;
+
+    dht_dir_transaction_t lock[2], *current;
+
+    /* inodelks during filerename for backward compatibility */
+    dht_lock_t **rename_inodelk_backward_compatible;
+
+    call_stub_t *stub;
+    int32_t parent_disk_layout[4];
+
+    /* rename rollback */
+    int *ret_cache;
+
+    loc_t loc2_copy;
+
+    int rename_inodelk_bc_count;
+    /* This is used only for directory operations */
+    int32_t valid;
+    int32_t mds_heal_fresh_lookup;
+    short lock_type;
+    char need_selfheal;
+    char need_xattr_heal;
+    char need_attrheal;
+    /* flag used to make sure we need to return estale in
+       {lookup,revalidate}_cbk */
+    char return_estale;
+    char need_lookup_everywhere;
+    /* fd open check */
+    gf_boolean_t fd_checked;
+    gf_boolean_t linked;
+    gf_boolean_t added_link;
+    gf_boolean_t is_linkfile;
+    gf_boolean_t quota_deem_statfs;
+    gf_boolean_t heal_layout;
+    gf_boolean_t locked;
+    gf_boolean_t dont_create_linkto;
+    gf_boolean_t gfid_missing;
 };
 typedef struct dht_local dht_local_t;
 
+/* du - disk-usage */
+struct dht_du {
+    double avail_percent;
+    double avail_inodes;
+    uint64_t avail_space;
+    uint32_t log;
+    uint32_t chunks;
+    uint32_t total_blocks;
+    uint32_t avail_blocks;
+    uint32_t frsize; /*fragment size*/
+};
+typedef struct dht_du dht_du_t;
+
+enum gf_defrag_type {
+    GF_DEFRAG_CMD_NONE = 0,
+    GF_DEFRAG_CMD_START = 1,
+    GF_DEFRAG_CMD_STOP = 1 + 1,
+    GF_DEFRAG_CMD_STATUS = 1 + 2,
+    GF_DEFRAG_CMD_START_LAYOUT_FIX = 1 + 3,
+    GF_DEFRAG_CMD_START_FORCE = 1 + 4,
+    GF_DEFRAG_CMD_DETACH_STATUS = 1 + 11,
+    GF_DEFRAG_CMD_DETACH_START = 1 + 13,
+    GF_DEFRAG_CMD_DETACH_COMMIT = 1 + 14,
+    GF_DEFRAG_CMD_DETACH_COMMIT_FORCE = 1 + 15,
+    GF_DEFRAG_CMD_DETACH_STOP = 1 + 16,
+    /* New labels are used so that they can be easily differentiated
+     * when the old labels are removed.
+     * A few labels are added so that the count remains the same
+     * between this enum and the ones in the xdr file.
+     * Different values for the same enum cause errors and
+     * confusion.
+     */
+};
+typedef enum gf_defrag_type gf_defrag_type;
+
+enum gf_defrag_status_t {
+    GF_DEFRAG_STATUS_NOT_STARTED,
+    GF_DEFRAG_STATUS_STARTED,
+    GF_DEFRAG_STATUS_STOPPED,
+    GF_DEFRAG_STATUS_COMPLETE,
+    GF_DEFRAG_STATUS_FAILED,
+    GF_DEFRAG_STATUS_LAYOUT_FIX_STARTED,
+    GF_DEFRAG_STATUS_LAYOUT_FIX_STOPPED,
+    GF_DEFRAG_STATUS_LAYOUT_FIX_COMPLETE,
+    GF_DEFRAG_STATUS_LAYOUT_FIX_FAILED,
+};
+typedef enum gf_defrag_status_t gf_defrag_status_t;
+
+typedef struct gf_defrag_pattern_list gf_defrag_pattern_list_t;
+
+struct gf_defrag_pattern_list {
+    char path_pattern[256];
+    uint64_t size;
+    gf_defrag_pattern_list_t *next;
+};
+
+struct dht_container {
+    union {
+        struct list_head list;
+        struct {
+            struct _gf_dirent_t *next;
+            struct _gf_dirent_t *prev;
+        };
+    };
+    gf_dirent_t *df_entry;
+    xlator_t *this;
+    loc_t *parent_loc;
+    dict_t *migrate_data;
+    int local_subvol_index;
+};
+
+typedef struct nodeuuid_info {
+    char info;   /* Set to 1 if this is my node's uuid */
+    uuid_t uuid; /* Store the nodeuuid as well for debugging */
+} nodeuuid_info_t;
+
+typedef struct subvol_nodeuuids_info {
+    nodeuuid_info_t *elements;
+    int count;
+} subvol_nodeuuids_info_t;
+
+struct gf_defrag_info_ {
+    uint64_t total_files;
+    uint64_t total_data;
+    uint64_t num_files_lookedup;
+    uint64_t total_failures;
+    uint64_t skipped;
+    uint64_t num_dirs_processed;
+    uint64_t size_processed;
+    gf_lock_t lock;
+    pthread_t th;
+    struct rpc_clnt *rpc;
+    uint32_t connected;
+    uint32_t is_exiting;
+    pid_t pid;
+    int cmd;
+    inode_t *root_inode;
+    uuid_t node_uuid;
+    time_t start_time;
+    uint32_t new_commit_hash;
+    gf_defrag_status_t defrag_status;
+    gf_defrag_pattern_list_t *defrag_pattern;
+
+    pthread_cond_t parallel_migration_cond;
+    pthread_mutex_t dfq_mutex;
+    pthread_cond_t rebalance_crawler_alarm;
+    int32_t q_entry_count;
+    int32_t global_error;
+    struct dht_container *queue;
+    int32_t crawl_done;
+    int32_t abort;
+    int32_t wakeup_crawler;
+
+    /*Throttle params*/
+    /*stands for reconfigured thread count*/
+    int32_t recon_thread_count;
+    pthread_cond_t df_wakeup_thread;
+
+    /* backpointer to make it easier to write functions for rebalance */
+    xlator_t *this;
+
+    pthread_cond_t fc_wakeup_cond;
+    pthread_mutex_t fc_mutex;
+
+    /*stands for current running thread count*/
+    int32_t current_thread_count;
+
+    gf_boolean_t stats;
+    /* lock migration flag */
+    gf_boolean_t lock_migration_enabled;
+};
+
+typedef struct gf_defrag_info_ gf_defrag_info_t;
+
+struct dht_methods_s {
+    int32_t (*migration_get_dst_subvol)(xlator_t *this, dht_local_t *local);
+    int32_t (*migration_other)(xlator_t *this, gf_defrag_info_t *defrag);
+    xlator_t *(*layout_search)(xlator_t *this, dht_layout_t *layout,
+                               const char *name);
+};
+
+typedef struct dht_methods_s dht_methods_t;
 
 struct dht_conf {
-	gf_lock_t      subvolume_lock;
-        int            subvolume_cnt;
-        xlator_t     **subvolumes;
-	xlator_t      *local_volume;     /* Needed by NUFA */
-	char          *subvolume_status;
-	dht_layout_t **file_layouts;
-	dht_layout_t **dir_layouts;
-	dht_layout_t  *default_dir_layout;
-	gf_boolean_t   search_unhashed;
-	int            gen;
+    xlator_t **subvolumes;
+    char *subvolume_status;
+    int *last_event;
+    dht_layout_t **file_layouts;
+    dht_layout_t **dir_layouts;
+    unsigned int search_unhashed;
+    int gen;
+    dht_du_t *du_stats;
+    double min_free_disk;
+    double min_free_inodes;
+    int subvolume_cnt;
+    int32_t refresh_interval;
+    gf_lock_t subvolume_lock;
+    time_t last_stat_fetch;
+    gf_lock_t layout_lock;
+    dict_t *leaf_to_subvol;
+    void *private; /* Can be used by wrapper xlators over
+                      dht */
+    time_t *subvol_up_time;
+
+    /* to keep track of nodes which are decommissioned */
+    xlator_t **decommissioned_bricks;
+    int decommission_in_progress;
+    int decommission_subvols_cnt;
+
+    /* defrag related */
+    gf_defrag_info_t *defrag;
+
+    /* Support regex-based name reinterpretation. */
+    regex_t rsync_regex;
+    regex_t extra_regex;
+
+    /* Support variable xattr names. */
+    char *xattr_name;
+    char *mds_xattr_key;
+    char *link_xattr_name;
+    char *commithash_xattr_name;
+    char *wild_xattr_name;
+
+    dht_methods_t methods;
+
+    struct mem_pool *lock_pool;
+
+    /*local subvol storage for rebalance*/
+    xlator_t **local_subvols;
+    subvol_nodeuuids_info_t *local_nodeuuids;
+    int32_t local_subvols_cnt;
+
+    int dthrottle;
+
+    /* Hard link handle requirement for migration triggered from client*/
+    synclock_t link_lock;
+
+    /* lock migration */
+    gf_lock_t lock;
+
+    /* This is the count used as the distribute layout for a directory */
+    /* Will be a global flag to control the layout spread count */
+    uint32_t dir_spread_cnt;
+
+    /*
+     * "Commit hash" for this volume topology.  Changed whenever bricks
+     * are added or removed.
+     */
+    uint32_t vol_commit_hash;
+
+    char vol_uuid[UUID_SIZE + 1];
+
+    char disk_unit;
+
+    gf_boolean_t lock_migration_enabled;
+
+    gf_boolean_t vch_forced;
+
+    gf_boolean_t use_fallocate;
+
+    gf_boolean_t force_migration;
+
+    gf_boolean_t lookup_optimize;
+
+    gf_boolean_t unhashed_sticky_bit;
+
+    gf_boolean_t assert_no_child_down;
+
+    gf_boolean_t use_readdirp;
+
+    /* Request to filter directory entries in readdir request */
+    gf_boolean_t readdir_optimize;
+
+    gf_boolean_t rsync_regex_valid;
+
+    gf_boolean_t extra_regex_valid;
+
+    /* Support size-weighted rebalancing (heterogeneous bricks). */
+    gf_boolean_t do_weighting;
+
+    gf_boolean_t randomize_by_gfid;
 };
 typedef struct dht_conf dht_conf_t;
 
+struct dht_dfoffset_ctx {
+    xlator_t *this;
+    off_t offset;
+    int32_t readdir_done;
+};
+typedef struct dht_dfoffset_ctx dht_dfoffset_ctx_t;
 
 struct dht_disk_layout {
-	uint32_t           cnt;
-	uint32_t           type;
-	struct {
-		uint32_t   start;
-		uint32_t   stop;
-	} list[1];
+    uint32_t cnt;
+    uint32_t type;
+    struct {
+        uint32_t start;
+        uint32_t stop;
+    } list[1];
 };
 typedef struct dht_disk_layout dht_disk_layout_t;
- 
-#define ENTRY_MISSING(op_ret, op_errno) (op_ret == -1 && op_errno == ENOENT)
 
-#define is_fs_root(loc) (strcmp (loc->path, "/") == 0)
+typedef enum {
+    GF_DHT_MIGRATE_DATA,
+    GF_DHT_MIGRATE_DATA_EVEN_IF_LINK_EXISTS,
+    GF_DHT_MIGRATE_HARDLINK,
+    GF_DHT_MIGRATE_HARDLINK_IN_PROGRESS
+} gf_dht_migrate_data_type_t;
+
+typedef enum {
+    GF_DHT_EQUAL_DISTRIBUTION,
+    GF_DHT_WEIGHTED_DISTRIBUTION
+} dht_distribution_type_t;
+
+struct dir_dfmeta {
+    gf_dirent_t *equeue;
+    dht_dfoffset_ctx_t *offset_var;
+    struct list_head **head;
+    struct list_head **iterator;
+    int *fetch_entries;
+    /* fds corresponding to local subvols only */
+    fd_t **lfd;
+};
+
+typedef struct dht_migrate_info {
+    xlator_t *src_subvol;
+    xlator_t *dst_subvol;
+    GF_REF_DECL;
+} dht_migrate_info_t;
 
-#define is_revalidate(loc) (inode_ctx_get (loc->inode, this, NULL) == 0)
+typedef struct dht_fd_ctx {
+    uint64_t opened_on_dst;
+    GF_REF_DECL;
+} dht_fd_ctx_t;
+
+#define ENTRY_MISSING(op_ret, op_errno) (op_ret == -1 && op_errno == ENOENT)
+
+#define is_revalidate(loc)                                                     \
+    (dht_inode_ctx_layout_get((loc)->inode, this, NULL) == 0)
 
 #define is_last_call(cnt) (cnt == 0)
 
-#define DHT_LINKFILE_MODE (S_ISVTX)
-#define check_is_linkfile(i,s,x) ((s->st_mode & ~S_IFMT) == DHT_LINKFILE_MODE)
+#define DHT_MIGRATION_IN_PROGRESS 1
+#define DHT_MIGRATION_COMPLETED 2
+
+#define check_is_linkfile(i, s, x, n)                                          \
+    (IS_DHT_LINKFILE_MODE(s) && dict_get(x, n))
+
+#define IS_DHT_MIGRATION_PHASE2(buf)                                           \
+    (IA_ISREG((buf)->ia_type) &&                                               \
+     ((st_mode_from_ia((buf)->ia_prot, (buf)->ia_type) & ~S_IFMT) ==           \
+      DHT_LINKFILE_MODE))
+
+#define IS_DHT_MIGRATION_PHASE1(buf)                                           \
+    (IA_ISREG((buf)->ia_type) && ((buf)->ia_prot.sticky == 1) &&               \
+     ((buf)->ia_prot.sgid == 1))
 
-#define check_is_dir(i,s,x) (S_ISDIR(s->st_mode))
+#define DHT_STRIP_PHASE1_FLAGS(buf)                                            \
+    do {                                                                       \
+        if ((buf) && IS_DHT_MIGRATION_PHASE1(buf)) {                           \
+            (buf)->ia_prot.sticky = 0;                                         \
+            (buf)->ia_prot.sgid = 0;                                           \
+        }                                                                      \
+    } while (0)
+
+#define dht_inode_missing(err) ((err) == ENOENT || (err) == ESTALE)
+
+#define check_is_dir(i, s, x) (IA_ISDIR((s)->ia_type))
 
 #define layout_is_sane(layout) ((layout) && (layout->cnt > 0))
 
-#define DHT_STACK_UNWIND(frame, params ...) do {       \
-		dht_local_t *__local = NULL;           \
-		__local = frame->local;                \
-		frame->local = NULL;		       \
-		STACK_UNWIND (frame, params);          \
-		dht_local_wipe (__local);	       \
-	} while (0)
-
-#define DHT_STACK_DESTROY(frame) do {		       \
-		dht_local_t *__local = NULL;           \
-		__local = frame->local;                \
-		frame->local = NULL;		       \
-		STACK_DESTROY (frame->root);	       \
-		dht_local_wipe (__local);	       \
-	} while (0)
-
-dht_layout_t *dht_layout_new (xlator_t *this, int cnt);
-dht_layout_t *dht_layout_get (xlator_t *this, inode_t *inode);
-dht_layout_t *dht_layout_for_subvol (xlator_t *this, xlator_t *subvol);
-xlator_t *dht_layout_search (xlator_t *this, dht_layout_t *layout,
-			     const char *name);
-int dht_layout_normalize (xlator_t *this, loc_t *loc, dht_layout_t *layout);
-int dht_layout_anomalies (xlator_t *this, loc_t *loc, dht_layout_t *layout,
-			  uint32_t *holes_p, uint32_t *overlaps_p,
-			  uint32_t *missing_p, uint32_t *down_p,
-			  uint32_t *misc_p);
-int dht_layout_dir_mismatch (xlator_t *this, dht_layout_t *layout,
-			     xlator_t *subvol, loc_t *loc, dict_t *xattr);
-
-xlator_t *dht_linkfile_subvol (xlator_t *this, inode_t *inode,
-			       struct stat *buf, dict_t *xattr);
-int dht_linkfile_unlink (call_frame_t *frame, xlator_t *this,
-			 xlator_t *subvol, loc_t *loc);
-
-int dht_layouts_init (xlator_t *this, dht_conf_t *conf);
-int dht_layout_merge (xlator_t *this, dht_layout_t *layout, xlator_t *subvol,
-		      int op_ret, int op_errno, dict_t *xattr);
-
-int dht_disk_layout_extract (xlator_t *this, dht_layout_t *layout,
-			     int pos, int32_t **disk_layout_p);
-int dht_disk_layout_merge (xlator_t *this, dht_layout_t *layout,
-			   int pos, int32_t *disk_layout);
-
-
-int dht_frame_return (call_frame_t *frame);
-
-int dht_itransform (xlator_t *this, xlator_t *subvol, uint64_t x, uint64_t *y);
-int dht_deitransform (xlator_t *this, uint64_t y, xlator_t **subvol,
-		      uint64_t *x);
-
-void dht_local_wipe (dht_local_t *local);
-dht_local_t *dht_local_init (call_frame_t *frame);
-int dht_stat_merge (xlator_t *this, struct stat *to, struct stat *from,
-		    xlator_t *subvol);
-
-xlator_t *dht_subvol_get_hashed (xlator_t *this, loc_t *loc);
-xlator_t *dht_subvol_get_cached (xlator_t *this, inode_t *inode);
-xlator_t *dht_subvol_next (xlator_t *this, xlator_t *prev);
-int dht_subvol_cnt (xlator_t *this, xlator_t *subvol);
-
-int dht_hash_compute (int type, const char *name, uint32_t *hash_p);
-
-int dht_linkfile_create (call_frame_t *frame, fop_mknod_cbk_t linkfile_cbk,
-			 xlator_t *tovol, xlator_t *fromvol, loc_t *loc);
-int
-dht_selfheal_directory (call_frame_t *frame, dht_selfheal_dir_cbk_t cbk,
-			loc_t *loc, dht_layout_t *layout);
-int
-dht_selfheal_new_directory (call_frame_t *frame, dht_selfheal_dir_cbk_t cbk,
-			    dht_layout_t *layout);
-int
-dht_selfheal_restore (call_frame_t *frame, dht_selfheal_dir_cbk_t cbk,
-		      loc_t *loc, dht_layout_t *layout);
-int
-dht_layout_sort_volname (dht_layout_t *layout);
-
-int dht_rename (call_frame_t *frame, xlator_t *this,
-		loc_t *oldloc, loc_t *newloc);
+#define we_are_not_migrating(x) ((x) == 1)
+
+#define DHT_STACK_UNWIND(fop, frame, params...)                                \
+    do {                                                                       \
+        dht_local_t *__local = NULL;                                           \
+        xlator_t *__xl = NULL;                                                 \
+        if (frame) {                                                           \
+            __xl = (frame)->this;                                              \
+            __local = (frame)->local;                                          \
+            (frame)->local = NULL;                                             \
+        }                                                                      \
+        STACK_UNWIND_STRICT(fop, frame, params);                               \
+        dht_local_wipe(__xl, __local);                                         \
+    } while (0)
+
+#define DHT_STACK_DESTROY(frame)                                               \
+    do {                                                                       \
+        dht_local_t *__local = NULL;                                           \
+        xlator_t *__xl = NULL;                                                 \
+        __xl = (frame)->this;                                                  \
+        __local = (frame)->local;                                              \
+        (frame)->local = NULL;                                                 \
+        STACK_DESTROY((frame)->root);                                          \
+        dht_local_wipe(__xl, __local);                                         \
+    } while (0)
+
+#define DHT_UPDATE_TIME(ctx_sec, ctx_nsec, new_sec, new_nsec, post)            \
+    do {                                                                       \
+        if (ctx_sec == new_sec)                                                \
+            new_nsec = max(new_nsec, ctx_nsec);                                \
+        else if (ctx_sec > new_sec) {                                          \
+            new_sec = ctx_sec;                                                 \
+            new_nsec = ctx_nsec;                                               \
+        }                                                                      \
+        if (post) {                                                            \
+            ctx_sec = new_sec;                                                 \
+            ctx_nsec = new_nsec;                                               \
+        }                                                                      \
+    } while (0)
+
+#define is_greater_time(a, an, b, bn)                                          \
+    (((a) < (b)) || (((a) == (b)) && ((an) < (bn))))
+
+#define DHT_MARK_FOP_INTERNAL(xattr)                                           \
+    do {                                                                       \
+        int tmp = -1;                                                          \
+        if (!xattr) {                                                          \
+            xattr = dict_new();                                                \
+            if (!xattr)                                                        \
+                break;                                                         \
+        }                                                                      \
+        tmp = dict_set_str(xattr, GLUSTERFS_INTERNAL_FOP_KEY, "yes");          \
+        if (tmp) {                                                             \
+            gf_msg(this->name, GF_LOG_ERROR, 0, DHT_MSG_DICT_SET_FAILED,       \
+                   "Failed to set dictionary value: key = %s,"                 \
+                   " path = %s",                                               \
+                   GLUSTERFS_INTERNAL_FOP_KEY, local->loc.path);               \
+        }                                                                      \
+    } while (0)
+
+dht_layout_t *
+dht_layout_new(xlator_t *this, int cnt);
+dht_layout_t *
+dht_layout_get(xlator_t *this, inode_t *inode);
+dht_layout_t *
+dht_layout_for_subvol(xlator_t *this, xlator_t *subvol);
+xlator_t *
+dht_layout_search(xlator_t *this, dht_layout_t *layout, const char *name);
+int32_t
+dht_migration_get_dst_subvol(xlator_t *this, dht_local_t *local);
+int32_t
+dht_migration_needed(xlator_t *this);
+int
+dht_layout_normalize(xlator_t *this, loc_t *loc, dht_layout_t *layout);
+void
+dht_layout_anomalies(xlator_t *this, loc_t *loc, dht_layout_t *layout,
+                     uint32_t *holes_p, uint32_t *overlaps_p,
+                     uint32_t *missing_p, uint32_t *down_p, uint32_t *misc_p,
+                     uint32_t *no_space_p);
+int
+dht_layout_dir_mismatch(xlator_t *this, dht_layout_t *layout, xlator_t *subvol,
+                        loc_t *loc, dict_t *xattr);
+xlator_t *
+dht_linkfile_subvol(xlator_t *this, inode_t *inode, struct iatt *buf,
+                    dict_t *xattr);
+int
+dht_linkfile_unlink(call_frame_t *frame, xlator_t *this, xlator_t *subvol,
+                    loc_t *loc);
+
+int
+dht_layouts_init(xlator_t *this, dht_conf_t *conf);
+int
+dht_layout_merge(xlator_t *this, dht_layout_t *layout, xlator_t *subvol,
+                 int op_ret, int op_errno, dict_t *xattr);
+
+int
+dht_disk_layout_extract(xlator_t *this, dht_layout_t *layout, int pos,
+                        int32_t **disk_layout_p);
+int
+dht_disk_layout_extract_for_subvol(xlator_t *this, dht_layout_t *layout,
+                                   xlator_t *subvol, int32_t **disk_layout_p);
+
+int
+dht_frame_return(call_frame_t *frame);
+
+int
+dht_deitransform(xlator_t *this, uint64_t y, xlator_t **subvol);
+
+void
+dht_local_wipe(xlator_t *this, dht_local_t *local);
+dht_local_t *
+dht_local_init(call_frame_t *frame, loc_t *loc, fd_t *fd, glusterfs_fop_t fop);
+int
+dht_iatt_merge(xlator_t *this, struct iatt *to, struct iatt *from);
+
+xlator_t *
+dht_subvol_get_hashed(xlator_t *this, loc_t *loc);
+xlator_t *
+dht_subvol_get_cached(xlator_t *this, inode_t *inode);
+xlator_t *
+dht_subvol_next(xlator_t *this, xlator_t *prev);
+xlator_t *
+dht_subvol_next_available(xlator_t *this, xlator_t *prev);
+int
+dht_subvol_cnt(xlator_t *this, xlator_t *subvol);
+
+int
+dht_hash_compute(xlator_t *this, int type, const char *name, uint32_t *hash_p);
+
+int
+dht_linkfile_create(call_frame_t *frame, fop_mknod_cbk_t linkfile_cbk,
+                    xlator_t *this, xlator_t *tovol, xlator_t *fromvol,
+                    loc_t *loc);
+int
+dht_lookup_everywhere(call_frame_t *frame, xlator_t *this, loc_t *loc);
+int
+dht_selfheal_directory(call_frame_t *frame, dht_selfheal_dir_cbk_t cbk,
+                       loc_t *loc, dht_layout_t *layout);
+int
+dht_selfheal_new_directory(call_frame_t *frame, dht_selfheal_dir_cbk_t cbk,
+                           dht_layout_t *layout);
+int
+dht_selfheal_restore(call_frame_t *frame, dht_selfheal_dir_cbk_t cbk,
+                     loc_t *loc, dht_layout_t *layout);
+void
+dht_layout_sort_volname(dht_layout_t *layout);
+
+int
+dht_get_du_info(call_frame_t *frame, xlator_t *this, loc_t *loc);
+
+gf_boolean_t
+dht_is_subvol_filled(xlator_t *this, xlator_t *subvol);
+xlator_t *
+dht_free_disk_available_subvol(xlator_t *this, xlator_t *subvol,
+                               dht_local_t *layout);
+int
+dht_get_du_info_for_subvol(xlator_t *this, int subvol_idx);
+
+int
+dht_layout_preset(xlator_t *this, xlator_t *subvol, inode_t *inode);
+int
+dht_layout_set(xlator_t *this, inode_t *inode, dht_layout_t *layout);
+
+void
+dht_layout_unref(xlator_t *this, dht_layout_t *layout);
+dht_layout_t *
+dht_layout_ref(xlator_t *this, dht_layout_t *layout);
+int
+dht_layout_index_for_subvol(dht_layout_t *layout, xlator_t *subvol);
+xlator_t *
+dht_first_up_subvol(xlator_t *this);
+xlator_t *
+dht_last_up_subvol(xlator_t *this);
+
+int
+dht_build_child_loc(xlator_t *this, loc_t *child, loc_t *parent, char *name);
+
+int
+dht_filter_loc_subvol_key(xlator_t *this, loc_t *loc, loc_t *new_loc,
+                          xlator_t **subvol);
+
+int
+dht_rename_cleanup(call_frame_t *frame);
+int
+dht_rename_link_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+                    int32_t op_ret, int32_t op_errno, inode_t *inode,
+                    struct iatt *stbuf, struct iatt *preparent,
+                    struct iatt *postparent, dict_t *xdata);
+
+int
+dht_update_commit_hash_for_layout(call_frame_t *frame);
+int
+dht_fix_directory_layout(call_frame_t *frame, dht_selfheal_dir_cbk_t dir_cbk,
+                         dht_layout_t *layout);
+
+int
+dht_init_subvolumes(xlator_t *this, dht_conf_t *conf);
+
+/* migration/rebalance */
+int
+dht_start_rebalance_task(xlator_t *this, call_frame_t *frame);
+
+int
+dht_rebalance_in_progress_check(xlator_t *this, call_frame_t *frame);
+int
+dht_rebalance_complete_check(xlator_t *this, call_frame_t *frame);
+
+int
+dht_init_local_subvolumes(xlator_t *this, dht_conf_t *conf);
+
+/* FOPS */
+int32_t
+dht_lookup(call_frame_t *frame, xlator_t *this, loc_t *loc, dict_t *xattr_req);
+
+int32_t
+dht_stat(call_frame_t *frame, xlator_t *this, loc_t *loc, dict_t *xdata);
+
+int32_t
+dht_fstat(call_frame_t *frame, xlator_t *this, fd_t *fd, dict_t *xdata);
+
+int32_t
+dht_truncate(call_frame_t *frame, xlator_t *this, loc_t *loc, off_t offset,
+             dict_t *xdata);
+
+int32_t
+dht_ftruncate(call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset,
+              dict_t *xdata);
+
+int32_t
+dht_access(call_frame_t *frame, xlator_t *this, loc_t *loc, int32_t mask,
+           dict_t *xdata);
+
+int32_t
+dht_readlink(call_frame_t *frame, xlator_t *this, loc_t *loc, size_t size,
+             dict_t *xdata);
+
+int32_t
+dht_mknod(call_frame_t *frame, xlator_t *this, loc_t *loc, mode_t mode,
+          dev_t rdev, mode_t umask, dict_t *xdata);
+
+int32_t
+dht_mkdir(call_frame_t *frame, xlator_t *this, loc_t *loc, mode_t mode,
+          mode_t umask, dict_t *xdata);
+
+int32_t
+dht_unlink(call_frame_t *frame, xlator_t *this, loc_t *loc, int xflag,
+           dict_t *xdata);
+
+int32_t
+dht_rmdir(call_frame_t *frame, xlator_t *this, loc_t *loc, int flags,
+          dict_t *xdata);
+
+int32_t
+dht_symlink(call_frame_t *frame, xlator_t *this, const char *linkpath,
+            loc_t *loc, mode_t umask, dict_t *xdata);
+
+int32_t
+dht_rename(call_frame_t *frame, xlator_t *this, loc_t *oldloc, loc_t *newloc,
+           dict_t *xdata);
+
+int32_t
+dht_link(call_frame_t *frame, xlator_t *this, loc_t *oldloc, loc_t *newloc,
+         dict_t *xdata);
+
+int32_t
+dht_create(call_frame_t *frame, xlator_t *this, loc_t *loc, int32_t flags,
+           mode_t mode, mode_t umask, fd_t *fd, dict_t *params);
+
+int32_t
+dht_open(call_frame_t *frame, xlator_t *this, loc_t *loc, int32_t flags,
+         fd_t *fd, dict_t *xdata);
+
+int32_t
+dht_readv(call_frame_t *frame, xlator_t *this, fd_t *fd, size_t size,
+          off_t offset, uint32_t flags, dict_t *xdata);
+
+int32_t
+dht_writev(call_frame_t *frame, xlator_t *this, fd_t *fd, struct iovec *vector,
+           int32_t count, off_t offset, uint32_t flags, struct iobref *iobref,
+           dict_t *xdata);
+
+int32_t
+dht_flush(call_frame_t *frame, xlator_t *this, fd_t *fd, dict_t *xdata);
+
+int32_t
+dht_fsync(call_frame_t *frame, xlator_t *this, fd_t *fd, int32_t datasync,
+          dict_t *xdata);
+
+int32_t
+dht_opendir(call_frame_t *frame, xlator_t *this, loc_t *loc, fd_t *fd,
+            dict_t *xdata);
+
+int32_t
+dht_fsyncdir(call_frame_t *frame, xlator_t *this, fd_t *fd, int32_t datasync,
+             dict_t *xdata);
+
+int32_t
+dht_statfs(call_frame_t *frame, xlator_t *this, loc_t *loc, dict_t *xdata);
+
+int32_t
+dht_setxattr(call_frame_t *frame, xlator_t *this, loc_t *loc, dict_t *dict,
+             int32_t flags, dict_t *xdata);
+
+int32_t
+dht_getxattr(call_frame_t *frame, xlator_t *this, loc_t *loc, const char *name,
+             dict_t *xdata);
+
+int32_t
+dht_fsetxattr(call_frame_t *frame, xlator_t *this, fd_t *fd, dict_t *dict,
+              int32_t flags, dict_t *xdata);
+
+int32_t
+dht_fgetxattr(call_frame_t *frame, xlator_t *this, fd_t *fd, const char *name,
+              dict_t *xdata);
+
+int32_t
+dht_removexattr(call_frame_t *frame, xlator_t *this, loc_t *loc,
+                const char *name, dict_t *xdata);
+int32_t
+dht_fremovexattr(call_frame_t *frame, xlator_t *this, fd_t *fd,
+                 const char *name, dict_t *xdata);
+
+int32_t
+dht_lk(call_frame_t *frame, xlator_t *this, fd_t *fd, int32_t cmd,
+       struct gf_flock *flock, dict_t *xdata);
+
+int32_t
+dht_lease(call_frame_t *frame, xlator_t *this, loc_t *loc,
+          struct gf_lease *lease, dict_t *xdata);
+
+int32_t
+dht_inodelk(call_frame_t *frame, xlator_t *this, const char *volume, loc_t *loc,
+            int32_t cmd, struct gf_flock *flock, dict_t *xdata);
+
+int32_t
+dht_finodelk(call_frame_t *frame, xlator_t *this, const char *volume, fd_t *fd,
+             int32_t cmd, struct gf_flock *flock, dict_t *xdata);
+
+int32_t
+dht_entrylk(call_frame_t *frame, xlator_t *this, const char *volume, loc_t *loc,
+            const char *basename, entrylk_cmd cmd, entrylk_type type,
+            dict_t *xdata);
+
+int32_t
+dht_fentrylk(call_frame_t *frame, xlator_t *this, const char *volume, fd_t *fd,
+             const char *basename, entrylk_cmd cmd, entrylk_type type,
+             dict_t *xdata);
+
+int32_t
+dht_readdir(call_frame_t *frame, xlator_t *this, fd_t *fd, size_t size,
+            off_t off, dict_t *xdata);
+
+int32_t
+dht_readdirp(call_frame_t *frame, xlator_t *this, fd_t *fd, size_t size,
+             off_t off, dict_t *dict);
+
+int32_t
+dht_xattrop(call_frame_t *frame, xlator_t *this, loc_t *loc,
+            gf_xattrop_flags_t flags, dict_t *dict, dict_t *xdata);
+
+int32_t
+dht_fxattrop(call_frame_t *frame, xlator_t *this, fd_t *fd,
+             gf_xattrop_flags_t flags, dict_t *dict, dict_t *xdata);
+
+int32_t
+dht_forget(xlator_t *this, inode_t *inode);
+int32_t
+dht_setattr(call_frame_t *frame, xlator_t *this, loc_t *loc, struct iatt *stbuf,
+            int32_t valid, dict_t *xdata);
+int32_t
+dht_fsetattr(call_frame_t *frame, xlator_t *this, fd_t *fd, struct iatt *stbuf,
+             int32_t valid, dict_t *xdata);
+int32_t
+dht_fallocate(call_frame_t *frame, xlator_t *this, fd_t *fd, int32_t mode,
+              off_t offset, size_t len, dict_t *xdata);
+int32_t
+dht_discard(call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset,
+            size_t len, dict_t *xdata);
+int32_t
+dht_zerofill(call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset,
+             off_t len, dict_t *xdata);
+int32_t
+dht_ipc(call_frame_t *frame, xlator_t *this, int32_t op, dict_t *xdata);
+
+int
+dht_set_subvol_range(xlator_t *this);
+int32_t
+dht_init(xlator_t *this);
+void
+dht_fini(xlator_t *this);
+int
+dht_reconfigure(xlator_t *this, dict_t *options);
+int32_t
+dht_notify(xlator_t *this, int32_t event, void *data, ...);
+
+/* definitions for nufa/switch */
+int
+dht_revalidate_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+                   int op_ret, int op_errno, inode_t *inode, struct iatt *stbuf,
+                   dict_t *xattr, struct iatt *postparent);
+int
+dht_lookup_dir_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+                   int op_ret, int op_errno, inode_t *inode, struct iatt *stbuf,
+                   dict_t *xattr, struct iatt *postparent);
+int
+dht_lookup_linkfile_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+                        int op_ret, int op_errno, inode_t *inode,
+                        struct iatt *stbuf, dict_t *xattr,
+                        struct iatt *postparent);
+int
+dht_lookup_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int op_ret,
+               int op_errno, inode_t *inode, struct iatt *stbuf, dict_t *xattr,
+               struct iatt *postparent);
+int
+dht_create_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int op_ret,
+               int op_errno, fd_t *fd, inode_t *inode, struct iatt *stbuf,
+               struct iatt *preparent, struct iatt *postparent, dict_t *xdata);
+int
+dht_newfile_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int op_ret,
+                int op_errno, inode_t *inode, struct iatt *stbuf,
+                struct iatt *preparent, struct iatt *postparent, dict_t *xdata);
+
+int
+dht_finodelk_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+                 int32_t op_ret, int32_t op_errno, dict_t *xdata);
+
+int
+dht_getxattr_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int op_ret,
+                 int op_errno, dict_t *xattr, dict_t *xdata);
+
+int
+dht_common_xattrop_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+                       int32_t op_ret, int32_t op_errno, dict_t *dict,
+                       dict_t *xdata);
+int
+gf_defrag_status_get(dht_conf_t *conf, dict_t *dict);
+
+int
+gf_defrag_stop(dht_conf_t *conf, gf_defrag_status_t status, dict_t *output);
+
+void *
+gf_defrag_start(void *this);
+
+int32_t
+gf_defrag_handle_hardlink(xlator_t *this, loc_t *loc, int *fop_errno);
+int
+dht_migrate_file(xlator_t *this, loc_t *loc, xlator_t *from, xlator_t *to,
+                 int flag, int *fop_errno);
+int
+dht_inode_ctx_layout_get(inode_t *inode, xlator_t *this,
+                         dht_layout_t **layout_int);
+int
+dht_inode_ctx_layout_set(inode_t *inode, xlator_t *this,
+                         dht_layout_t *layout_int);
+int
+dht_inode_ctx_time_update(inode_t *inode, xlator_t *this, struct iatt *stat,
+                          int32_t update_ctx);
+void
+dht_inode_ctx_time_set(inode_t *inode, xlator_t *this, struct iatt *stat);
+
+int
+dht_inode_ctx_get(inode_t *inode, xlator_t *this, dht_inode_ctx_t **ctx);
+int
+dht_inode_ctx_set(inode_t *inode, xlator_t *this, dht_inode_ctx_t *ctx);
+int
+dht_dir_attr_heal(void *data);
+int
+dht_dir_attr_heal_done(int ret, call_frame_t *sync_frame, void *data);
+xlator_t *
+dht_subvol_with_free_space_inodes(xlator_t *this, xlator_t *subvol,
+                                  xlator_t *ignore, dht_layout_t *layout,
+                                  uint64_t filesize);
+xlator_t *
+dht_subvol_maxspace_nonzeroinode(xlator_t *this, xlator_t *subvol,
+                                 dht_layout_t *layout);
+int
+dht_dir_has_layout(dict_t *xattr, char *name);
+int
+dht_linkfile_attr_heal(call_frame_t *frame, xlator_t *this);
+
+int32_t
+dht_priv_dump(xlator_t *this);
+int32_t
+dht_inodectx_dump(xlator_t *this, inode_t *inode);
+
+gf_boolean_t
+dht_is_subvol_in_layout(dht_layout_t *layout, xlator_t *xlator);
+
+int
+dht_inode_ctx_get_mig_info(xlator_t *this, inode_t *inode,
+                           xlator_t **src_subvol, xlator_t **dst_subvol);
+gf_boolean_t
+dht_mig_info_is_invalid(xlator_t *current, xlator_t *src_subvol,
+                        xlator_t *dst_subvol);
+
+int
+dht_subvol_status(dht_conf_t *conf, xlator_t *subvol);
+
+void
+dht_log_new_layout_for_dir_selfheal(xlator_t *this, loc_t *loc,
+                                    dht_layout_t *layout);
+
+int
+dht_layout_sort(dht_layout_t *layout);
+
+int
+dht_heal_full_path(void *data);
+
+int
+dht_heal_full_path_done(int op_ret, call_frame_t *frame, void *data);
+
+int
+dht_layout_missing_dirs(dht_layout_t *layout);
+
+int
+dht_refresh_layout(call_frame_t *frame);
+
+int
+dht_build_parent_loc(xlator_t *this, loc_t *parent, loc_t *child,
+                     int32_t *op_errno);
+
+int32_t
+dht_set_local_rebalance(xlator_t *this, dht_local_t *local, struct iatt *stbuf,
+                        struct iatt *prebuf, struct iatt *postbuf,
+                        dict_t *xdata);
+void
+dht_build_root_loc(inode_t *inode, loc_t *loc);
+
+gf_boolean_t
+dht_fd_open_on_dst(xlator_t *this, fd_t *fd, xlator_t *dst);
+
+int32_t
+dht_fd_ctx_destroy(xlator_t *this, fd_t *fd);
+
+int32_t
+dht_release(xlator_t *this, fd_t *fd);
+
+int32_t
+dht_set_fixed_dir_stat(struct iatt *stat);
+
+xlator_t *
+dht_get_lock_subvolume(xlator_t *this, struct gf_flock *lock,
+                       dht_local_t *local);
+
+int
+dht_lk_inode_unref(call_frame_t *frame, int32_t op_ret);
+
+int
+dht_fd_ctx_set(xlator_t *this, fd_t *fd, xlator_t *subvol);
+
+int
+dht_check_and_open_fd_on_subvol(xlator_t *this, call_frame_t *frame);
+
+/* FD fop callbacks */
+
+int
+dht_writev_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int op_ret,
+               int op_errno, struct iatt *prebuf, struct iatt *postbuf,
+               dict_t *xdata);
+
+int
+dht_flush_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int op_ret,
+              int op_errno, dict_t *xdata);
+
+int
+dht_file_setattr_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+                     int op_ret, int op_errno, struct iatt *prebuf,
+                     struct iatt *postbuf, dict_t *xdata);
+
+int
+dht_zerofill_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int op_ret,
+                 int op_errno, struct iatt *prebuf, struct iatt *postbuf,
+                 dict_t *xdata);
+
+int
+dht_discard_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int op_ret,
+                int op_errno, struct iatt *prebuf, struct iatt *postbuf,
+                dict_t *xdata);
+
+int
+dht_fallocate_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int op_ret,
+                  int op_errno, struct iatt *prebuf, struct iatt *postbuf,
+                  dict_t *xdata);
+
+int
+dht_truncate_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int op_ret,
+                 int op_errno, struct iatt *prebuf, struct iatt *postbuf,
+                 dict_t *xdata);
+
+int
+dht_fsync_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int op_ret,
+              int op_errno, struct iatt *prebuf, struct iatt *postbuf,
+              dict_t *xdata);
+
+int
+dht_readv_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int op_ret,
+              int op_errno, struct iovec *vector, int count, struct iatt *stbuf,
+              struct iobref *iobref, dict_t *xdata);
+
+int
+dht_file_attr_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int op_ret,
+                  int op_errno, struct iatt *stbuf, dict_t *xdata);
+
+int
+dht_file_removexattr_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+                         int op_ret, int op_errno, dict_t *xdata);
+
+int
+dht_file_setxattr_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+                      int op_ret, int op_errno, dict_t *xdata);
+
+/* All custom xattr heal functions */
+int
+dht_dir_heal_xattrs(void *data);
+
+int
+dht_dir_heal_xattrs_done(int ret, call_frame_t *sync_frame, void *data);
+
+int32_t
+dht_dict_set_array(dict_t *dict, char *key, int32_t value[], int32_t size);
+
+int
+dht_set_user_xattr(dict_t *dict, char *k, data_t *v, void *data);
+
+void
+dht_dir_set_heal_xattr(xlator_t *this, dht_local_t *local, dict_t *dst,
+                       dict_t *src, int *uret, int *uflag);
+
+int
+dht_dir_xattr_heal(xlator_t *this, dht_local_t *local, int *op_errno);
+
+int
+dht_common_mark_mdsxattr(call_frame_t *frame, int *errst, int flag);
+
+int
+dht_inode_ctx_mdsvol_get(inode_t *inode, xlator_t *this, xlator_t **mdsvol);
+
+int
+dht_selfheal_dir_setattr(call_frame_t *frame, loc_t *loc, struct iatt *stbuf,
+                         int32_t valid, dht_layout_t *layout);
+
+/* Abstract out the DHT-IATT-IN-DICT */
+
+void
+dht_selfheal_layout_new_directory(call_frame_t *frame, loc_t *loc,
+                                  dht_layout_t *new_layout);
+
+int
+dht_pt_fgetxattr(call_frame_t *frame, xlator_t *this, fd_t *fd, const char *key,
+                 dict_t *xdata);
+
+int
+dht_pt_getxattr(call_frame_t *frame, xlator_t *this, loc_t *loc,
+                const char *key, dict_t *xdata);
+
+int32_t
+dht_pt_mkdir(call_frame_t *frame, xlator_t *this, loc_t *loc, mode_t mode,
+             mode_t umask, dict_t *xdata);
+
+int
+dht_pt_rename(call_frame_t *frame, xlator_t *this, loc_t *oldloc, loc_t *newloc,
+              dict_t *xdata);
+
+int32_t
+dht_check_remote_fd_failed_error(dht_local_t *local, int op_ret, int op_errno);
+
+int
+dht_common_xattrop_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+                       int32_t op_ret, int32_t op_errno, dict_t *dict,
+                       dict_t *xdata);
+
+int32_t
+dht_create_lock(call_frame_t *frame, xlator_t *subvol);
+
+int
+dht_set_parent_layout_in_dict(loc_t *loc, xlator_t *this, dht_local_t *local);
+
+int
+dht_dir_layout_error_check(xlator_t *this, inode_t *inode);
+
+int
+dht_inode_ctx_mdsvol_set(inode_t *inode, xlator_t *this, xlator_t *mds_subvol);
 #endif /* _DHT_H */
diff --git a/xlators/cluster/dht/src/dht-diskusage.c b/xlators/cluster/dht/src/dht-diskusage.c
new file mode 100644
index 00000000000..c0588828fdb
--- /dev/null
+++ b/xlators/cluster/dht/src/dht-diskusage.c
@@ -0,0 +1,487 @@
+/*
+  Copyright (c) 2008-2012 Red Hat, Inc. <http://www.redhat.com>
+  This file is part of GlusterFS.
+
+  This file is licensed to you under your choice of the GNU Lesser
+  General Public License, version 3 or any later version (LGPLv3 or
+  later), or the GNU General Public License, version 2 (GPLv2), in all
+  cases as published by the Free Software Foundation.
+*/
+
+/* TODO: add NS locking */
+
+#include "dht-common.h"
+
+#include <sys/time.h>
+#include <glusterfs/events.h>
+
+/*
+ * statfs callback: cache the usage reported by subvolume 'cookie' in its
+ * conf->du_stats slot. A failed reply is only logged. Always returns 0.
+ */
+int
+dht_du_info_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int op_ret,
+                int op_errno, struct statvfs *statvfs, dict_t *xdata)
+{
+    dht_conf_t *conf = this->private;
+    xlator_t *prev = cookie;
+    int this_call_cnt = 0;
+    int i = 0;
+    double percent = 0;
+    double percent_inodes = 0;
+    uint64_t bytes = 0;
+    uint32_t bpc = 1; /* blocks per chunk */
+    uint32_t chunks = 0;
+
+    if (op_ret == -1 || !statvfs) {
+        gf_msg(this->name, GF_LOG_WARNING, op_errno,
+               DHT_MSG_GET_DISK_INFO_ERROR, "failed to get disk info from %s",
+               prev->name);
+        goto out;
+    }
+
+    if (statvfs->f_blocks) {
+        percent = (statvfs->f_bavail * 100) / statvfs->f_blocks;
+        bytes = (statvfs->f_bavail * statvfs->f_frsize);
+        /*
+         * A 32-bit count of 1MB chunks caps a brick at ~4PB; the chunk is
+         * kept small so layout-calculation code using it can be tested on
+         * normal machines. bpc stays 1 if f_bsize is 0 or above 1MB, so
+         * the divisions below can never divide by zero.
+         */
+        if (statvfs->f_bsize && statvfs->f_bsize <= (1 << 20))
+            bpc = (1 << 20) / statvfs->f_bsize;
+        chunks = (statvfs->f_blocks + bpc - 1) / bpc;
+    }
+
+    if (statvfs->f_files) {
+        percent_inodes = (statvfs->f_ffree * 100) / statvfs->f_files;
+    } else {
+        /*
+         * Set percent inodes to 100 for dynamically allocated inode
+         * filesystems. The rationale is that distribute need not
+         * worry about total inodes; rather, let the 'create()' be
+         * scheduled on the hashed subvol regardless of the total
+         * inodes.
+         */
+        percent_inodes = 100;
+    }
+
+    LOCK(&conf->subvolume_lock);
+    {
+        for (i = 0; i < conf->subvolume_cnt; i++)
+            if (prev == conf->subvolumes[i]) {
+                conf->du_stats[i].avail_percent = percent;
+                conf->du_stats[i].avail_space = bytes;
+                conf->du_stats[i].avail_inodes = percent_inodes;
+                conf->du_stats[i].chunks = chunks;
+                conf->du_stats[i].total_blocks = statvfs->f_blocks;
+                conf->du_stats[i].avail_blocks = statvfs->f_bavail;
+                conf->du_stats[i].frsize = statvfs->f_frsize;
+
+                gf_msg_debug(this->name, 0,
+                             "subvolume '%s': avail_percent "
+                             "is: %.2f and avail_space "
+                             "is: %" PRIu64
+                             " and avail_inodes"
+                             " is: %.2f",
+                             prev->name, conf->du_stats[i].avail_percent,
+                             conf->du_stats[i].avail_space,
+                             conf->du_stats[i].avail_inodes);
+                break; /* no point in looping further */
+            }
+    }
+    UNLOCK(&conf->subvolume_lock);
+
+out:
+    this_call_cnt = dht_frame_return(frame);
+    if (is_last_call(this_call_cnt))
+        DHT_STACK_DESTROY(frame);
+
+    return 0;
+}
+
+/*
+ * Refresh the cached disk usage of a single subvolume.
+ *
+ * Winds a statfs to conf->subvolumes[subvol_idx] on a newly created frame;
+ * the reply is handled by dht_du_info_cbk(). Returns 0 once the request
+ * has been wound, or -1 when the frame or its local cannot be set up.
+ */
+int
+dht_get_du_info_for_subvol(xlator_t *this, int subvol_idx)
+{
+    dht_conf_t *conf = this->private;
+    call_frame_t *statfs_frame = NULL;
+    dht_local_t *statfs_local = NULL;
+    loc_t tmp_loc = {
+        0,
+    };
+
+    statfs_frame = create_frame(this, this->ctx->pool);
+    if (!statfs_frame) {
+        return -1;
+    }
+
+    /* local->fop value is not used in this case */
+    statfs_local = dht_local_init(statfs_frame, NULL, NULL, GF_FOP_MAXVALUE);
+    if (!statfs_local) {
+        DHT_STACK_DESTROY(statfs_frame);
+        return -1;
+    }
+
+    /* make it root gfid, should be enough to get the proper info back */
+    tmp_loc.gfid[15] = 1;
+
+    /* a single statfs is wound, so a single reply is awaited */
+    statfs_local->call_cnt = 1;
+    STACK_WIND_COOKIE(
+        statfs_frame, dht_du_info_cbk, conf->subvolumes[subvol_idx],
+        conf->subvolumes[subvol_idx],
+        conf->subvolumes[subvol_idx]->fops->statfs, &tmp_loc, NULL);
+
+    return 0;
+}
+
+/*
+ * Refresh the cached disk usage of all subvolumes, at most once per
+ * conf->refresh_interval: a statfs is wound to each one on a copy of frame
+ * and answered by dht_du_info_cbk(). Returns 0 if nothing was due or the
+ * requests were wound, -1 if setting up the copied frame failed.
+ */
+int
+dht_get_du_info(call_frame_t *frame, xlator_t *this, loc_t *loc)
+{
+    dht_conf_t *conf = this->private;
+    call_frame_t *statfs_frame = NULL;
+    dht_local_t *statfs_local = NULL;
+    time_t now = gf_time();
+    int i = 0;
+    loc_t tmp_loc = {
+        0,
+    };
+
+    /* nothing to do until the refresh interval has elapsed */
+    if (!(now > (conf->refresh_interval + conf->last_stat_fetch)))
+        return 0;
+
+    /* make it root gfid, should be enough to get the proper info back */
+    tmp_loc.gfid[15] = 1;
+
+    statfs_frame = copy_frame(frame);
+    if (!statfs_frame)
+        return -1;
+
+    /* In this case, 'local->fop' is not used */
+    statfs_local = dht_local_init(statfs_frame, loc, NULL, GF_FOP_MAXVALUE);
+    if (!statfs_local)
+        goto err;
+
+    statfs_local->params = dict_new();
+    if (!statfs_local->params)
+        goto err;
+
+    if (dict_set_int8(statfs_local->params, GF_INTERNAL_IGNORE_DEEM_STATFS,
+                      1)) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, DHT_MSG_DICT_SET_FAILED,
+               "Failed to set " GF_INTERNAL_IGNORE_DEEM_STATFS " in dict");
+        goto err;
+    }
+
+    statfs_local->call_cnt = conf->subvolume_cnt;
+    for (i = 0; i < conf->subvolume_cnt; i++) {
+        STACK_WIND_COOKIE(statfs_frame, dht_du_info_cbk,
+                          conf->subvolumes[i], conf->subvolumes[i],
+                          conf->subvolumes[i]->fops->statfs, &tmp_loc,
+                          statfs_local->params);
+    }
+
+    conf->last_stat_fetch = now;
+    return 0;
+err:
+    DHT_STACK_DESTROY(statfs_frame);
+
+    return -1;
+}
+
+/*
+ * Tell whether subvol is below the configured free-space or free-inode
+ * limit according to the cached conf->du_stats. Space is compared as a
+ * percentage when conf->disk_unit is 'p', as an absolute value otherwise.
+ * While conf->subvolume_status[i] is set, one in GF_UNIVERSAL_ANSWER * 10
+ * hits is logged and raised as an event.
+ */
+gf_boolean_t
+dht_is_subvol_filled(xlator_t *this, xlator_t *subvol)
+{
+    int i = 0;
+    char vol_name[256];
+    dht_conf_t *conf = this->private;
+    gf_boolean_t subvol_filled_inodes = _gf_false;
+    gf_boolean_t subvol_filled_space = _gf_false;
+    double usage = 0;
+    size_t name_len = strlen(this->name);
+    /* events carry this->name minus its last four characters, if any */
+    int vol_len = (name_len >= 4) ? (int)(name_len - 4) : (int)name_len;
+
+    /* Check for values above specified percent or free disk */
+    LOCK(&conf->subvolume_lock);
+    {
+        for (i = 0; i < conf->subvolume_cnt; i++) {
+            if (subvol != conf->subvolumes[i])
+                continue;
+            if (conf->disk_unit == 'p')
+                subvol_filled_space = (conf->du_stats[i].avail_percent <
+                                       conf->min_free_disk);
+            else
+                subvol_filled_space = (conf->du_stats[i].avail_space <
+                                       conf->min_free_disk);
+            /* inodes only matter while space is still sufficient */
+            if (!subvol_filled_space)
+                subvol_filled_inodes = (conf->du_stats[i].avail_inodes <
+                                        conf->min_free_inodes);
+            if (subvol_filled_space || subvol_filled_inodes)
+                break;
+        }
+    }
+    UNLOCK(&conf->subvolume_lock);
+
+    if (subvol_filled_space && conf->subvolume_status[i]) {
+        if (!(conf->du_stats[i].log++ % (GF_UNIVERSAL_ANSWER * 10))) {
+            usage = 100 - conf->du_stats[i].avail_percent;
+            gf_msg(this->name, GF_LOG_WARNING, 0, DHT_MSG_SUBVOL_INSUFF_SPACE,
+                   "disk space on subvolume '%s' is getting "
+                   "full (%.2f %%), consider adding more bricks",
+                   subvol->name, usage);
+
+            (void)snprintf(vol_name, sizeof(vol_name), "%.*s", vol_len,
+                           this->name);
+
+            gf_event(EVENT_DHT_DISK_USAGE, "volume=%s;subvol=%s;usage=%.2f %%",
+                     vol_name, subvol->name, usage);
+        }
+    }
+
+    if (subvol_filled_inodes && conf->subvolume_status[i]) {
+        if (!(conf->du_stats[i].log++ % (GF_UNIVERSAL_ANSWER * 10))) {
+            usage = 100 - conf->du_stats[i].avail_inodes;
+            gf_msg(this->name, GF_LOG_CRITICAL, 0, DHT_MSG_SUBVOL_INSUFF_INODES,
+                   "inodes on subvolume '%s' are at "
+                   "(%.2f %%), consider adding more bricks",
+                   subvol->name, usage);
+
+            (void)snprintf(vol_name, sizeof(vol_name), "%.*s", vol_len,
+                           this->name);
+
+            gf_event(EVENT_DHT_INODES_USAGE,
+                     "volume=%s;subvol=%s;usage=%.2f %%", vol_name,
+                     subvol->name, usage);
+        }
+    }
+
+    return (subvol_filled_space || subvol_filled_inodes);
+}
+
+/* Get the best subvolume to create the file in: one above both the space
+ * and the inode limit, else the one with the most space and a free inode,
+ * else (also when local or the layout is missing) subvol itself.
+ */
+xlator_t *
+dht_free_disk_available_subvol(xlator_t *this, xlator_t *subvol,
+                               dht_local_t *local)
+{
+    xlator_t *avail_subvol = NULL;
+    dht_conf_t *conf = this->private;
+    dht_layout_t *layout = NULL;
+    loc_t *loc = NULL;
+
+    if (!local)
+        goto out;
+    loc = &local->loc;
+    if (!local->layout) {
+        layout = dht_layout_get(this, loc->parent);
+        if (!layout) {
+            gf_msg_debug(this->name, 0,
+                         "Missing layout. path=%s,"
+                         " parent gfid = %s",
+                         loc->path, uuid_utoa(loc->parent->gfid));
+            goto out;
+        }
+    } else {
+        layout = dht_layout_ref(this, local->layout);
+    }
+
+    LOCK(&conf->subvolume_lock);
+    {
+        avail_subvol = dht_subvol_with_free_space_inodes(this, subvol, NULL,
+                                                         layout, 0);
+        if (!avail_subvol)
+            avail_subvol = dht_subvol_maxspace_nonzeroinode(this, subvol,
+                                                            layout);
+    }
+    UNLOCK(&conf->subvolume_lock);
+out:
+    if (!avail_subvol) {
+        gf_msg_debug(this->name, 0,
+                     "No subvolume has enough free space "
+                     "and/or inodes to create");
+        avail_subvol = subvol;
+    }
+
+    if (layout)
+        dht_layout_unref(this, layout);
+    return avail_subvol;
+}
+
+/*
+ * Decide whether subvolume 'this' must be skipped as a placement target.
+ *
+ * Returns -1 (skip) when this or layout is NULL, when this is the ignore
+ * subvolume, when the layout records an error for an entry with the same
+ * name, or when it is listed in conf->decommissioned_bricks. Returns 0
+ * when the subvolume may be used.
+ */
+static inline int32_t
+dht_subvol_has_err(dht_conf_t *conf, xlator_t *this, xlator_t *ignore,
+                   dht_layout_t *layout)
+{
+    int idx = 0;
+
+    if (!this || !layout)
+        return -1;
+
+    /* this check is meant for rebalance process. The source of the file
+     * should be ignored for space check */
+    if (this == ignore)
+        return -1;
+
+    /* check if subvol has layout errors, before selecting it */
+    for (idx = 0; idx < layout->cnt; idx++) {
+        if (!strcmp(layout->list[idx].xlator->name, this->name) &&
+            (layout->list[idx].err != 0))
+            return -1;
+    }
+
+    /* discard decommissioned subvol */
+    if (!conf->decommission_subvols_cnt)
+        return 0;
+
+    for (idx = 0; idx < conf->subvolume_cnt; idx++) {
+        if (conf->decommissioned_bricks[idx] == this)
+            return -1;
+    }
+
+    return 0;
+}
+
+/*
+ * Get a subvolume exceeding both conf->min_free_disk and
+ * conf->min_free_inodes; a candidate replaces the current pick when it has
+ * more free inodes or more free space. Subvolumes rejected by
+ * dht_subvol_has_err() are skipped. Returns NULL if none is left or the
+ * pick would fall below conf->min_free_disk after filesize more bytes.
+ */
+xlator_t *
+dht_subvol_with_free_space_inodes(xlator_t *this, xlator_t *subvol,
+                                  xlator_t *ignore, dht_layout_t *layout,
+                                  uint64_t filesize)
+{
+    int i = 0;
+    double max = 0;
+    double max_inodes = 0;
+    uint64_t total_blocks = 0;
+    uint64_t avail_blocks = 0;
+    uint64_t frsize = 0;
+    double post_availspace = 0;
+    double post_percent = 0;
+    xlator_t *avail_subvol = NULL;
+    dht_conf_t *conf = this->private;
+
+    for (i = 0; i < conf->subvolume_cnt; i++) {
+        /* skip subvols with layout errors and decommissioned bricks */
+        if (dht_subvol_has_err(conf, conf->subvolumes[i], ignore, layout))
+            continue;
+
+        if ((conf->disk_unit == 'p') &&
+            (conf->du_stats[i].avail_percent > conf->min_free_disk) &&
+            (conf->du_stats[i].avail_inodes > conf->min_free_inodes)) {
+            if ((conf->du_stats[i].avail_inodes > max_inodes) ||
+                (conf->du_stats[i].avail_percent > max)) {
+                max = conf->du_stats[i].avail_percent;
+                max_inodes = conf->du_stats[i].avail_inodes;
+                avail_subvol = conf->subvolumes[i];
+                total_blocks = conf->du_stats[i].total_blocks;
+                avail_blocks = conf->du_stats[i].avail_blocks;
+                frsize = conf->du_stats[i].frsize;
+            }
+        }
+
+        if ((conf->disk_unit != 'p') &&
+            (conf->du_stats[i].avail_space > conf->min_free_disk) &&
+            (conf->du_stats[i].avail_inodes > conf->min_free_inodes)) {
+            if ((conf->du_stats[i].avail_inodes > max_inodes) ||
+                (conf->du_stats[i].avail_space > max)) {
+                max = conf->du_stats[i].avail_space;
+                max_inodes = conf->du_stats[i].avail_inodes;
+                avail_subvol = conf->subvolumes[i];
+            }
+        }
+    }
+
+    if (avail_subvol) {
+        if (conf->disk_unit == 'p') {
+            /* subtract as doubles: in uint64_t a filesize larger than the
+             * free space would wrap around to a huge "remaining" value */
+            post_availspace = (double)(avail_blocks * frsize) -
+                              (double)filesize;
+            post_percent = (post_availspace * 100) / (total_blocks * frsize);
+            if (post_percent < conf->min_free_disk)
+                avail_subvol = NULL;
+        } else if ((max - filesize) < conf->min_free_disk) {
+            avail_subvol = NULL;
+        }
+    }
+
+    return avail_subvol;
+}
+
+/*
+ * Get the subvolume with the most free space that still has free inodes.
+ *
+ * Free space is the cached percentage when conf->disk_unit is 'p' and the
+ * cached absolute value otherwise. Subvolumes rejected by
+ * dht_subvol_has_err() are skipped, and so are those whose cached free
+ * space or free inodes are 0. Returns NULL when nothing qualifies.
+ * subvol is not used.
+ */
+xlator_t *
+dht_subvol_maxspace_nonzeroinode(xlator_t *this, xlator_t *subvol,
+                                 dht_layout_t *layout)
+{
+    dht_conf_t *conf = this->private;
+    xlator_t *best = NULL;
+    double best_space = 0;
+    double space = 0;
+    int idx = 0;
+
+    for (idx = 0; idx < conf->subvolume_cnt; idx++) {
+        /* check if subvol has layout errors and also it is not a
+         * decommissioned brick, before selecting it*/
+        if (dht_subvol_has_err(conf, conf->subvolumes[idx], NULL, layout))
+            continue;
+
+        if (conf->disk_unit == 'p') {
+            space = conf->du_stats[idx].avail_percent;
+        } else {
+            space = conf->du_stats[idx].avail_space;
+        }
+
+        /* strictly greater: on a tie the earlier subvolume is kept */
+        if ((space > best_space) && (conf->du_stats[idx].avail_inodes > 0)) {
+            best_space = space;
+            best = conf->subvolumes[idx];
+        }
+    }
+
+    return best;
+}
diff --git a/xlators/cluster/dht/src/dht-hashfn.c b/xlators/cluster/dht/src/dht-hashfn.c
index 31cb828a483..acda67c312a 100644
--- a/xlators/cluster/dht/src/dht-hashfn.c
+++ b/xlators/cluster/dht/src/dht-hashfn.c
@@ -1,86 +1,110 @@
 /*
-   Copyright (c) 2008-2009 Z RESEARCH, Inc. <http://www.zresearch.com>
-   This file is part of GlusterFS.
-
-   GlusterFS is free software; you can redistribute it and/or modify
-   it under the terms of the GNU General Public License as published
-   by the Free Software Foundation; either version 3 of the License,
-   or (at your option) any later version.
-
-   GlusterFS is distributed in the hope that it will be useful, but
-   WITHOUT ANY WARRANTY; without even the implied warranty of
-   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
-   General Public License for more details.
-
-   You should have received a copy of the GNU General Public License
-   along with this program.  If not, see
-   <http://www.gnu.org/licenses/>.
-*/
-
-#ifndef _CONFIG_H
-#define _CONFIG_H
-#include "config.h"
-#endif
+  Copyright (c) 2008-2012 Red Hat, Inc. <http://www.redhat.com>
+  This file is part of GlusterFS.
 
+  This file is licensed to you under your choice of the GNU Lesser
+  General Public License, version 3 or any later version (LGPLv3 or
+  later), or the GNU General Public License, version 2 (GPLv2), in all
+  cases as published by the Free Software Foundation.
+*/
 
-#include "glusterfs.h"
-#include "xlator.h"
 #include "dht-common.h"
-#include "hashfn.h"
-
+#include <glusterfs/hashfn.h>
 
-typedef enum {
-	DHT_HASH_TYPE_DM,
-} dht_hashfn_type_t;
-
-
-int
-dht_hash_compute_internal (int type, const char *name, uint32_t *hash_p)
+static int
+dht_hash_compute_internal(int type, const char *name, const int len,
+                          uint32_t *hash_p)
 {
-	int      ret = 0;
-	uint32_t hash = 0;
-
-	switch (type) {
-	case DHT_HASH_TYPE_DM:
-		hash = gf_dm_hashfn (name, strlen (name));
-		break;
-	default:
-		ret = -1;
-		break;
-	}
-
-	if (ret == 0) {
-		*hash_p = hash;
-	}
-
-	return ret;
+    int ret = 0;
+    uint32_t hash = 0;
+
+    switch (type) {
+        case DHT_HASH_TYPE_DM:
+        case DHT_HASH_TYPE_DM_USER:
+            hash = gf_dm_hashfn(name, len);
+            break;
+        default:
+            ret = -1;
+            break;
+    }
+
+    if (ret == 0) {
+        *hash_p = hash;
+    }
+
+    return ret;
 }
 
-
-#define MAKE_RSYNC_FRIENDLY_NAME(rsync_frndly_name, name) do {          \
-                rsync_frndly_name = (char *) name;			\
-                if (name[0] == '.') {                                   \
-                        char *dot   = 0;                                \
-                        int namelen = 0;                                \
-                                                                        \
-                        dot = strrchr (name, '.');                      \
-                        if (dot && dot > (name + 1) && *(dot + 1)) {    \
-                                namelen = (dot - name);                 \
-                                rsync_frndly_name = alloca (namelen);   \
-                                strncpy (rsync_frndly_name, name + 1,   \
-                                         namelen);                      \
-                                rsync_frndly_name[namelen - 1] = 0;     \
-                        }                                               \
-                }                                                       \
-        } while (0);
-
+/*
+ * Copy the first capture group of re, matched against original, into
+ * modified (which has room for len bytes). The function returns:
+ * 0  : in case no munge took place
+ * >0 : the length (inc. terminating NUL!) of the newly modified string,
+ *      if it was munged.
+ */
+static int
+dht_munge_name(const char *original, char *modified, size_t len, regex_t *re)
+{
+    regmatch_t matches[2] = {
+        {0},
+    };
+    size_t new_len = 0;
+
+    /* only 0 means "matched"; REG_NOMATCH and any other error leave
+     * matches[] unusable */
+    if ((regexec(re, original, 2, matches, 0) == 0) &&
+        (matches[1].rm_so != -1)) {
+        new_len = matches[1].rm_eo - matches[1].rm_so;
+        /* Equal would fail due to the NUL at the end. */
+        if (new_len < len) {
+            memcpy(modified, original + matches[1].rm_so, new_len);
+            modified[new_len] = '\0';
+            return new_len + 1; /* +1 for the terminating NUL */
+        }
+    }
+
+    /* This is guaranteed safe because of how the dest was allocated. */
+    strcpy(modified, original);
+    return 0;
+}
 
 int
-dht_hash_compute (int type, const char *name, uint32_t *hash_p)
+dht_hash_compute(xlator_t *this, int type, const char *name, uint32_t *hash_p)
 {
-	char     *rsync_friendly_name = NULL;
-
-	MAKE_RSYNC_FRIENDLY_NAME (rsync_friendly_name, name);
-
-	return dht_hash_compute_internal (type, rsync_friendly_name, hash_p);
+    char *rsync_friendly_name = NULL;
+    dht_conf_t *priv = NULL;
+    size_t len = 0;
+    int munged = 0;
+
+    priv = this->private;
+
+    if (name == NULL)
+        return -1;
+
+    len = strlen(name) + 1;
+    rsync_friendly_name = alloca(len);
+
+    LOCK(&priv->lock);
+    {
+        if (priv->extra_regex_valid) {
+            munged = dht_munge_name(name, rsync_friendly_name, len,
+                                    &priv->extra_regex);
+        }
+
+        if (!munged && priv->rsync_regex_valid) {
+            gf_msg_trace(this->name, 0, "trying regex for %s", name);
+            munged = dht_munge_name(name, rsync_friendly_name, len,
+                                    &priv->rsync_regex);
+        }
+    }
+    UNLOCK(&priv->lock);
+    if (munged) {
+        gf_msg_debug(this->name, 0, "munged down to %s", rsync_friendly_name);
+        len = munged;
+    } else {
+        rsync_friendly_name = (char *)name;
+    }
+
+    return dht_hash_compute_internal(type, rsync_friendly_name, len - 1,
+                                     hash_p);
 }
diff --git a/xlators/cluster/dht/src/dht-helper.c b/xlators/cluster/dht/src/dht-helper.c
index f7e89b2637a..3f2fe43d5f3 100644
--- a/xlators/cluster/dht/src/dht-helper.c
+++ b/xlators/cluster/dht/src/dht-helper.c
@@ -1,326 +1,2304 @@
 /*
-   Copyright (c) 2008-2009 Z RESEARCH, Inc. <http://www.zresearch.com>
-   This file is part of GlusterFS.
-
-   GlusterFS is free software; you can redistribute it and/or modify
-   it under the terms of the GNU General Public License as published
-   by the Free Software Foundation; either version 3 of the License,
-   or (at your option) any later version.
-
-   GlusterFS is distributed in the hope that it will be useful, but
-   WITHOUT ANY WARRANTY; without even the implied warranty of
-   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
-   General Public License for more details.
-
-   You should have received a copy of the GNU General Public License
-   along with this program.  If not, see
-   <http://www.gnu.org/licenses/>.
+  Copyright (c) 2008-2012 Red Hat, Inc. <http://www.redhat.com>
+  This file is part of GlusterFS.
+
+  This file is licensed to you under your choice of the GNU Lesser
+  General Public License, version 3 or any later version (LGPLv3 or
+  later), or the GNU General Public License, version 2 (GPLv2), in all
+  cases as published by the Free Software Foundation.
 */
 
-#ifndef _CONFIG_H
-#define _CONFIG_H
-#include "config.h"
-#endif
+#include "dht-common.h"
+#include "dht-lock.h"
+#include "glusterfs/compat-errno.h"  // for ENODATA on BSD
 
+static void
+dht_free_fd_ctx(dht_fd_ctx_t *fd_ctx)
+{
+    GF_FREE(fd_ctx); /* release callback handed to GF_REF_INIT for fd ctx */
+}
 
-#include "glusterfs.h"
-#include "xlator.h"
-#include "dht-common.h"
+/* Detach the dht context from fd and drop the reference it held; returns
+ * fd_ctx_del()'s result, or -1 if this or fd fails validation. */
+int32_t
+dht_fd_ctx_destroy(xlator_t *this, fd_t *fd)
+{
+    dht_fd_ctx_t *fd_ctx = NULL;
+    uint64_t ctx_val = 0;
+    int32_t ret = -1;
+
+    GF_VALIDATE_OR_GOTO("dht", this, out);
+    GF_VALIDATE_OR_GOTO(this->name, fd, out);
+
+    ret = fd_ctx_del(fd, this, &ctx_val);
+    if (!ret) {
+        fd_ctx = (dht_fd_ctx_t *)(uintptr_t)ctx_val;
+        if (fd_ctx) {
+            GF_REF_PUT(fd_ctx);
+        }
+    }
+out:
+    return ret;
+}
+
+static int
+__dht_fd_ctx_set(xlator_t *this, fd_t *fd, xlator_t *dst)
+{
+    dht_fd_ctx_t *fd_ctx = NULL;
+    uint64_t value = 0;
+    int ret = -1;
 
+    GF_VALIDATE_OR_GOTO("dht", this, out);
+    GF_VALIDATE_OR_GOTO(this->name, fd, out);
+
+    fd_ctx = GF_CALLOC(1, sizeof(*fd_ctx), gf_dht_mt_fd_ctx_t);
+
+    if (!fd_ctx) {
+        goto out;
+    }
+
+    fd_ctx->opened_on_dst = (uint64_t)(uintptr_t)dst;
+    GF_REF_INIT(fd_ctx, dht_free_fd_ctx);
+
+    value = (uint64_t)(uintptr_t)fd_ctx;
+
+    ret = __fd_ctx_set(fd, this, value);
+    if (ret < 0) {
+        gf_smsg(this->name, GF_LOG_WARNING, 0, DHT_MSG_FD_CTX_SET_FAILED,
+                "fd=0x%p", fd, NULL);
+        GF_REF_PUT(fd_ctx);
+    }
+out:
+    return ret;
+}
 
 int
-dht_frame_return (call_frame_t *frame)
+dht_fd_ctx_set(xlator_t *this, fd_t *fd, xlator_t *dst)
+{
+    /* Record that fd is open on dst. __fd_ctx_get() returns 0 when a
+     * context exists: one naming dst is kept, one naming another subvolume
+     * is overwritten with a warning; otherwise a new one is attached. */
+    dht_fd_ctx_t *fd_ctx = NULL;
+    uint64_t value = 0;
+    int ret = -1;
+
+    GF_VALIDATE_OR_GOTO("dht", this, out);
+    GF_VALIDATE_OR_GOTO(this->name, fd, out);
+
+    LOCK(&fd->lock);
+    {
+        ret = __fd_ctx_get(fd, this, &value);
+        if (!ret && value) {
+            fd_ctx = (dht_fd_ctx_t *)(uintptr_t)value;
+            if (fd_ctx->opened_on_dst == (uint64_t)(uintptr_t)dst) {
+                /* This could happen due to racing check_progress tasks */
+                goto unlock;
+            } else {
+                /* This would be a big problem: overwrite and hope for the best */
+                fd_ctx->opened_on_dst = (uint64_t)(uintptr_t)dst;
+                UNLOCK(&fd->lock);
+                gf_smsg(this->name, GF_LOG_WARNING, 0, DHT_MSG_INVALID_VALUE,
+                        NULL);
+                goto out;
+            }
+        }
+        ret = __dht_fd_ctx_set(this, fd, dst);
+    }
+unlock:
+    UNLOCK(&fd->lock);
+out:
+    return ret;
+}
+
+/*
+ * Look up the dht context stored on fd and take a reference on it.
+ * Returns NULL when this or fd fails validation or no context is set; the
+ * caller drops the reference with GF_REF_PUT().
+ */
+static dht_fd_ctx_t *
+dht_fd_ctx_get(xlator_t *this, fd_t *fd)
+{
+    dht_fd_ctx_t *ctx = NULL;
+    uint64_t raw = 0;
+
+    GF_VALIDATE_OR_GOTO("dht", this, out);
+    GF_VALIDATE_OR_GOTO(this->name, fd, out);
+
+    LOCK(&fd->lock);
+    {
+        /* take the reference before fd->lock is released */
+        if ((__fd_ctx_get(fd, this, &raw) >= 0) && (raw != 0)) {
+            ctx = (dht_fd_ctx_t *)(uintptr_t)raw;
+            GF_REF_GET(ctx);
+        }
+    }
+    UNLOCK(&fd->lock);
+
+out:
+    return ctx;
+}
+gf_boolean_t
+dht_fd_open_on_dst(xlator_t *this, fd_t *fd, xlator_t *dst)
 {
-	dht_local_t *local = NULL;
-	int          this_call_cnt = -1;
+    dht_fd_ctx_t *fd_ctx = NULL;
+    gf_boolean_t opened = _gf_false;
+
+    fd_ctx = dht_fd_ctx_get(this, fd);
 
-	if (!frame)
-		return -1;
+    if (fd_ctx) {
+        if (fd_ctx->opened_on_dst == (uint64_t)(uintptr_t)dst) {
+            opened = _gf_true;
+        }
+        GF_REF_PUT(fd_ctx);
+    }
+
+    return opened;
+}
 
-	local = frame->local;
+void
+dht_free_mig_info(void *data)
+{
+    dht_migrate_info_t *miginfo = NULL;
 
-	LOCK (&frame->lock);
-	{
-		this_call_cnt = --local->call_cnt;
-	}
-	UNLOCK (&frame->lock);
+    miginfo = data;
+    GF_FREE(miginfo);
 
-	return this_call_cnt;
+    return;
 }
 
+/* Store a new ref-counted record of (src_subvol, dst_subvol) on inode via
+ * inode_ctx_set1(); returns its result, or -1 if the allocation fails. */
+static int
+dht_inode_ctx_set_mig_info(xlator_t *this, inode_t *inode, xlator_t *src_subvol,
+                           xlator_t *dst_subvol)
+{
+    dht_migrate_info_t *info = NULL;
+    uint64_t ctx_val = 0;
+    int ret = -1;
+
+    info = GF_CALLOC(1, sizeof(*info), gf_dht_mt_miginfo_t);
+    if (!info)
+        return -1;
+
+    info->src_subvol = src_subvol;
+    info->dst_subvol = dst_subvol;
+    GF_REF_INIT(info, dht_free_mig_info);
+
+    ctx_val = (uint64_t)(uintptr_t)info;
+    ret = inode_ctx_set1(inode, this, &ctx_val);
+    if (ret < 0) {
+        GF_REF_PUT(info); /* not stored: drop our reference */
+    }
+
+    return ret;
+}
 
 int
-dht_itransform (xlator_t *this, xlator_t *subvol, uint64_t x, uint64_t *y_p)
+dht_inode_ctx_get_mig_info(xlator_t *this, inode_t *inode,
+                           xlator_t **src_subvol, xlator_t **dst_subvol)
 {
-	dht_conf_t *conf = NULL;
-	int         cnt = 0;
-	int         max = 0;
-	uint64_t    y = 0;
+    int ret = -1;
+    uint64_t tmp_miginfo = 0;
+    dht_migrate_info_t *miginfo = NULL;
+
+    LOCK(&inode->lock);
+    {
+        ret = __inode_ctx_get1(inode, this, &tmp_miginfo);
+        if ((ret < 0) || (tmp_miginfo == 0)) {
+            UNLOCK(&inode->lock);
+            goto out;
+        }
+
+        miginfo = (dht_migrate_info_t *)(uintptr_t)tmp_miginfo;
+        GF_REF_GET(miginfo);
+    }
+    UNLOCK(&inode->lock);
+
+    if (src_subvol)
+        *src_subvol = miginfo->src_subvol;
 
+    if (dst_subvol)
+        *dst_subvol = miginfo->dst_subvol;
 
-	if (x == ((uint64_t) -1)) {
-		y = (uint64_t) -1;
-		goto out;
-	}
+    GF_REF_PUT(miginfo);
+
+out:
+    return ret;
+}
 
-	conf = this->private;
+gf_boolean_t
+dht_mig_info_is_invalid(xlator_t *current, xlator_t *src_subvol,
+                        xlator_t *dst_subvol)
+{
+    /* Tell whether cached migration info (src_subvol -> dst_subvol) must
+     * not be used for a fop that was sent to current. */
+    /* Not set */
+    if (!(src_subvol && dst_subvol))
+        return _gf_true;
+
+    /* The info is only usable when the fop went to src_subvol and that is
+     * not also the destination:
+     * - current != src_subvol: the cached subvol has changed between the
+     *   last mig_info_set and now.
+     * - current == dst_subvol: the file was migrated without any FOP
+     *   detecting a P2, so the old dst is now the current subvol.
+     *
+     * Still undetected: a file that has undergone multiple migrations and
+     * ends up on the src_subvol on which the mig_info was first set. */
+    if ((current == src_subvol) && (current != dst_subvol))
+        return _gf_false;
+
+    return _gf_true;
+}
 
-	max = conf->subvolume_cnt;
-	cnt = dht_subvol_cnt (this, subvol);
+/* Used to check if fd fops have the fd opened on the cached subvol
+ * This is required when:
+ * 1. an fd is opened on FILE1 on subvol1
+ * 2. the file is migrated to subvol2
+ * 3. a lookup updates the cached subvol in the inode_ctx to subvol2
+ * 4. a write comes on the fd
+ * The write is sent to subvol2 on an fd which has been opened only on subvol1
+ * Since the migration phase checks don't kick in, the fop fails with EBADF
+ *
+ */
 
-	y = ((x * max) + cnt);
+int
+dht_check_and_open_fd_on_subvol_complete(int ret, call_frame_t *frame,
+                                         void *data)
+{
+    glusterfs_fop_t fop = 0;
+    dht_local_t *local = NULL;
+    xlator_t *subvol = NULL;
+    xlator_t *this = NULL;
+    fd_t *fd = NULL;
+    int op_errno = -1;
+
+    local = frame->local;
+    this = frame->this;
+    fop = local->fop;
+    subvol = local->cached_subvol;
+    fd = local->fd;
+
+    if (ret) {
+        op_errno = local->op_errno;
+        goto handle_err;
+    }
+
+    switch (fop) {
+        case GF_FOP_WRITE:
+            STACK_WIND_COOKIE(frame, dht_writev_cbk, subvol, subvol,
+                              subvol->fops->writev, fd, local->rebalance.vector,
+                              local->rebalance.count, local->rebalance.offset,
+                              local->rebalance.flags, local->rebalance.iobref,
+                              local->xattr_req);
+            break;
+
+        case GF_FOP_FLUSH:
+            STACK_WIND(frame, dht_flush_cbk, subvol, subvol->fops->flush, fd,
+                       local->xattr_req);
+            break;
+
+        case GF_FOP_FSETATTR:
+            STACK_WIND_COOKIE(frame, dht_file_setattr_cbk, subvol, subvol,
+                              subvol->fops->fsetattr, fd,
+                              &local->rebalance.stbuf, local->rebalance.flags,
+                              local->xattr_req);
+            break;
+
+        case GF_FOP_ZEROFILL:
+            STACK_WIND_COOKIE(frame, dht_zerofill_cbk, subvol, subvol,
+                              subvol->fops->zerofill, fd,
+                              local->rebalance.offset, local->rebalance.size,
+                              local->xattr_req);
+
+            break;
+
+        case GF_FOP_DISCARD:
+            STACK_WIND_COOKIE(frame, dht_discard_cbk, subvol, subvol,
+                              subvol->fops->discard, local->fd,
+                              local->rebalance.offset, local->rebalance.size,
+                              local->xattr_req);
+            break;
+
+        case GF_FOP_FALLOCATE:
+            STACK_WIND_COOKIE(frame, dht_fallocate_cbk, subvol, subvol,
+                              subvol->fops->fallocate, fd,
+                              local->rebalance.flags, local->rebalance.offset,
+                              local->rebalance.size, local->xattr_req);
+            break;
+
+        case GF_FOP_FTRUNCATE:
+            STACK_WIND_COOKIE(frame, dht_truncate_cbk, subvol, subvol,
+                              subvol->fops->ftruncate, fd,
+                              local->rebalance.offset, local->xattr_req);
+            break;
+
+        case GF_FOP_FSYNC:
+            STACK_WIND_COOKIE(frame, dht_fsync_cbk, subvol, subvol,
+                              subvol->fops->fsync, local->fd,
+                              local->rebalance.flags, local->xattr_req);
+            break;
+
+        case GF_FOP_READ:
+            STACK_WIND(frame, dht_readv_cbk, subvol, subvol->fops->readv,
+                       local->fd, local->rebalance.size,
+                       local->rebalance.offset, local->rebalance.flags,
+                       local->xattr_req);
+            break;
+
+        case GF_FOP_FSTAT:
+            STACK_WIND_COOKIE(frame, dht_file_attr_cbk, subvol, subvol,
+                              subvol->fops->fstat, fd, local->xattr_req);
+            break;
+
+        case GF_FOP_FSETXATTR:
+            STACK_WIND_COOKIE(frame, dht_file_setxattr_cbk, subvol, subvol,
+                              subvol->fops->fsetxattr, local->fd,
+                              local->rebalance.xattr, local->rebalance.flags,
+                              local->xattr_req);
+            break;
+
+        case GF_FOP_FREMOVEXATTR:
+            STACK_WIND_COOKIE(frame, dht_file_removexattr_cbk, subvol, subvol,
+                              subvol->fops->fremovexattr, local->fd, local->key,
+                              local->xattr_req);
+
+            break;
+
+        case GF_FOP_FXATTROP:
+            STACK_WIND(frame, dht_common_xattrop_cbk, subvol,
+                       subvol->fops->fxattrop, local->fd,
+                       local->rebalance.flags, local->rebalance.xattr,
+                       local->xattr_req);
+            break;
+
+        case GF_FOP_FGETXATTR:
+            STACK_WIND(frame, dht_getxattr_cbk, subvol, subvol->fops->fgetxattr,
+                       local->fd, local->key, NULL);
+            break;
+
+        case GF_FOP_FINODELK:
+            STACK_WIND(frame, dht_finodelk_cbk, subvol, subvol->fops->finodelk,
+                       local->key, local->fd, local->rebalance.lock_cmd,
+                       &local->rebalance.flock, local->xattr_req);
+            break;
+        default:
+            gf_smsg(this->name, GF_LOG_ERROR, 0, DHT_MSG_UNKNOWN_FOP, "fd=%p",
+                    fd, "gfid=%s", uuid_utoa(fd->inode->gfid), "name=%s",
+                    subvol->name, NULL);
+            break;
+    }
+
+    goto out;
+
+    /* Could not open the fd on the dst. Unwind */
+
+handle_err:
+
+    switch (fop) {
+        case GF_FOP_WRITE:
+            DHT_STACK_UNWIND(writev, frame, -1, op_errno, NULL, NULL, NULL);
+            break;
+
+        case GF_FOP_FLUSH:
+            DHT_STACK_UNWIND(flush, frame, -1, op_errno, NULL);
+            break;
+
+        case GF_FOP_FSETATTR:
+            DHT_STACK_UNWIND(fsetattr, frame, -1, op_errno, NULL, NULL, NULL);
+            break;
+
+        case GF_FOP_ZEROFILL:
+            DHT_STACK_UNWIND(zerofill, frame, -1, op_errno, NULL, NULL, NULL);
+            break;
+
+        case GF_FOP_DISCARD:
+            DHT_STACK_UNWIND(discard, frame, -1, op_errno, NULL, NULL, NULL);
+            break;
+
+        case GF_FOP_FALLOCATE:
+            DHT_STACK_UNWIND(fallocate, frame, -1, op_errno, NULL, NULL, NULL);
+            break;
+
+        case GF_FOP_FTRUNCATE:
+            DHT_STACK_UNWIND(ftruncate, frame, -1, op_errno, NULL, NULL, NULL);
+            break;
+
+        case GF_FOP_FSYNC:
+            DHT_STACK_UNWIND(fsync, frame, -1, op_errno, NULL, NULL, NULL);
+            break;
+
+        case GF_FOP_READ:
+            DHT_STACK_UNWIND(readv, frame, -1, op_errno, NULL, 0, NULL, NULL,
+                             NULL);
+            break;
+
+        case GF_FOP_FSTAT:
+            DHT_STACK_UNWIND(fstat, frame, -1, op_errno, NULL, NULL);
+            break;
+
+        case GF_FOP_FSETXATTR:
+            DHT_STACK_UNWIND(fsetxattr, frame, -1, op_errno, NULL);
+            break;
+
+        case GF_FOP_FREMOVEXATTR:
+            DHT_STACK_UNWIND(fremovexattr, frame, -1, op_errno, NULL);
+            break;
+
+        case GF_FOP_FXATTROP:
+            DHT_STACK_UNWIND(fxattrop, frame, -1, op_errno, NULL, NULL);
+            break;
+
+        case GF_FOP_FGETXATTR:
+            DHT_STACK_UNWIND(fgetxattr, frame, -1, op_errno, NULL, NULL);
+            break;
+
+        case GF_FOP_FINODELK:
+            DHT_STACK_UNWIND(finodelk, frame, -1, op_errno, NULL);
+            break;
+
+        default:
+            gf_smsg(this->name, GF_LOG_ERROR, 0, DHT_MSG_UNKNOWN_FOP, "fd=%p",
+                    fd, "gfid=%s", uuid_utoa(fd->inode->gfid), "name=%s",
+                    subvol->name, NULL);
+            break;
+    }
 
 out:
-	if (y_p)
-		*y_p = y;
 
-	return 0;
+    return 0;
 }
 
+/* Check once again if the fd has been opened on the cached subvol.
+ * If not, open and update the fd_ctx.
+ */
 
 int
-dht_deitransform (xlator_t *this, uint64_t y, xlator_t **subvol_p,
-		  uint64_t *x_p)
+dht_check_and_open_fd_on_subvol_task(void *data)
 {
-	dht_conf_t *conf = NULL;
-	int         cnt = 0;
-	int         max = 0;
-	uint64_t    x = 0;
-	xlator_t   *subvol = 0;
+    loc_t loc = {
+        0,
+    };
+    int ret = -1;
+    call_frame_t *frame = NULL;
+    dht_local_t *local = NULL;
+    fd_t *fd = NULL;
+    xlator_t *this = NULL;
+    xlator_t *subvol = NULL;
+
+    frame = data;
+    local = frame->local;
+    this = THIS;
+    fd = local->fd;
+    subvol = local->cached_subvol;
+
+    local->fd_checked = _gf_true;
+
+    if (fd_is_anonymous(fd) || dht_fd_open_on_dst(this, fd, subvol)) {
+        ret = 0;
+        goto out;
+    }
+
+    gf_msg_debug(this->name, 0, "Opening fd (%p, flags=0%o) on file %s @ %s",
+                 fd, fd->flags, uuid_utoa(fd->inode->gfid), subvol->name);
+
+    loc.inode = inode_ref(fd->inode);
+    gf_uuid_copy(loc.gfid, fd->inode->gfid);
+
+    /* Open this on the dst subvol */
+
+    SYNCTASK_SETID(0, 0);
+
+    ret = syncop_open(subvol, &loc, (fd->flags & ~(O_CREAT | O_EXCL | O_TRUNC)),
+                      fd, NULL, NULL);
+
+    if (ret < 0) {
+        gf_smsg(this->name, GF_LOG_ERROR, -ret, DHT_MSG_OPEN_FD_ON_DST_FAILED,
+                "fd=%p", fd, "flags=0%o", fd->flags, "gfid=%s",
+                uuid_utoa(fd->inode->gfid), "name=%s", subvol->name, NULL);
+        /* This can happen if the cached subvol was updated in the inode_ctx
+         * and the fd was opened on the new cached subvol after this fop was
+         * wound on the old one. As the fd on the old subvol is never closed
+         * (a leak), ENOENT/ESTALE are not treated as errors here; the
+         * phase1/phase2 checks handle them. NOTE(review): the unconditional
+         * "op_errno = -ret; ret = -1" after this if/else undoes that.
+         */
+
+        if ((-ret != ENOENT) && (-ret != ESTALE)) {
+            local->op_errno = -ret;
+            ret = -1;
+        } else {
+            ret = 0;
+        }
 
+        local->op_errno = -ret;
+        ret = -1;
 
-	conf = this->private;
-	max = conf->subvolume_cnt;
+    } else {
+        dht_fd_ctx_set(this, fd, subvol);
+    }
 
-	cnt = y % max;
-	x   = y / max;
+    SYNCTASK_SETID(frame->root->uid, frame->root->gid);
+out:
+    loc_wipe(&loc);
 
-	subvol = conf->subvolumes[cnt];
+    return ret;
+}
 
-	if (subvol_p)
-		*subvol_p = subvol;
+int
+dht_check_and_open_fd_on_subvol(xlator_t *this, call_frame_t *frame)
+{
+    int ret = -1;
+    dht_local_t *local = NULL;
 
-	if (x_p)
-		*x_p = x;
+    /*
+            if (dht_fd_open_on_dst (this, fd, subvol))
+                    goto out;
+    */
+    local = frame->local;
 
-	return 0;
+    ret = synctask_new(this->ctx->env, dht_check_and_open_fd_on_subvol_task,
+                       dht_check_and_open_fd_on_subvol_complete, frame, frame);
+
+    if (ret) {
+        gf_smsg(this->name, GF_LOG_ERROR, 0, DHT_MSG_SYNCTASK_CREATE_FAILED,
+                "to-check-and-open fd=%p", local->fd, NULL);
+    }
+
+    return ret;
 }
 
+int
+dht_frame_return(call_frame_t *frame)
+{
+    dht_local_t *local = NULL;
+    int this_call_cnt = -1;
 
-void
-dht_local_wipe (dht_local_t *local)
+    if (!frame)
+        return -1;
+
+    local = frame->local;
+
+    LOCK(&frame->lock);
+    {
+        this_call_cnt = --local->call_cnt;
+    }
+    UNLOCK(&frame->lock);
+
+    return this_call_cnt;
+}
+
+/*
+ * Use this function to specify which subvol you want the file created
+ * on - this need not be the hashed subvol.
+ * Format: <filename>@<this->name>:<subvol-name>
+ * Eg: file-1@vol1-dht:vol1-client-0
+ *     where vol1 is a pure distribute volume
+ *     will create file-1 on vol1-client-0
+ */
+
+int
+dht_filter_loc_subvol_key(xlator_t *this, loc_t *loc, loc_t *new_loc,
+                          xlator_t **subvol)
 {
-	if (!local)
-		return;
+    char *new_name = NULL;
+    char *new_path = NULL;
+    xlator_list_t *trav = NULL;
+    char key[1024] = {
+        0,
+    };
+    int ret = 0; /* not found */
+    int keylen = 0;
+    int name_len = 0;
+    int path_len = 0;
+
+    /* Nothing to do if the required '@' character is not in the name */
+    if (!new_loc || !loc || !loc->name || !strchr(loc->name, '@')) {
+        /* Skip the GF_FREE checks here */
+        return ret;
+    }
+
+    trav = this->children;
+    while (trav) {
+        keylen = snprintf(key, sizeof(key), "*@%s:%s", this->name,
+                          trav->xlator->name);
+        /* Ignore '*' */
+        keylen = keylen - 1;
+        if (fnmatch(key, loc->name, FNM_NOESCAPE) == 0) {
+            name_len = strlen(loc->name) - keylen;
+            new_name = GF_MALLOC(name_len + 1, gf_common_mt_char);
+            if (!new_name)
+                goto out;
+            if (fnmatch(key, loc->path, FNM_NOESCAPE) == 0) {
+                path_len = strlen(loc->path) - keylen;
+                new_path = GF_MALLOC(path_len + 1, gf_common_mt_char);
+                if (!new_path)
+                    goto out;
+                snprintf(new_path, path_len + 1, "%s", loc->path);
+            }
+            snprintf(new_name, name_len + 1, "%s", loc->name);
+
+            if (new_loc) {
+                new_loc->path = ((new_path) ? new_path : gf_strdup(loc->path));
+                new_loc->name = new_name;
+                new_loc->inode = inode_ref(loc->inode);
+                new_loc->parent = inode_ref(loc->parent);
+            }
+            *subvol = trav->xlator;
+            ret = 1; /* success */
+            goto out;
+        }
+        trav = trav->next;
+    }
+out:
+    if (!ret) {
+        /* !success */
+        GF_FREE(new_path);
+        GF_FREE(new_name);
+    }
+    return ret;
+}
 
-	loc_wipe (&local->loc);
-	loc_wipe (&local->loc2);
+static xlator_t *
+dht_get_subvol_from_id(xlator_t *this, int client_id)
+{
+    xlator_t *xl = NULL;
+    dht_conf_t *conf = NULL;
+    char *sid = NULL;
+    int32_t ret = -1;
 
-	if (local->xattr)
-		dict_unref (local->xattr);
+    conf = this->private;
 
-	if (local->inode)
-		inode_unref (local->inode);
+    ret = gf_asprintf(&sid, "%d", client_id);
+    if (ret == -1) {
+        gf_smsg(this->name, GF_LOG_ERROR, 0, DHT_MSG_ASPRINTF_FAILED, NULL);
+        goto out;
+    }
 
-	if (local->layout)
-		FREE (local->layout);
+    if (dict_get_ptr(conf->leaf_to_subvol, sid, (void **)&xl))
+        xl = NULL;
 
-	loc_wipe (&local->linkfile.loc);
+    GF_FREE(sid);
 
-	if (local->linkfile.xattr)
-		dict_unref (local->linkfile.xattr);
+out:
+    return xl;
+}
+
+int
+dht_deitransform(xlator_t *this, uint64_t y, xlator_t **subvol_p)
+{
+    int client_id = 0;
+    xlator_t *subvol = 0;
+    dht_conf_t *conf = NULL;
+
+    if (!this->private)
+        return -1;
 
-	if (local->linkfile.inode)
-		inode_unref (local->linkfile.inode);
+    conf = this->private;
 
-	if (local->fd) {
-		fd_unref (local->fd);
-		local->fd = NULL;
-	}
-	
-	if (local->xattr_req)
-		dict_unref (local->xattr_req);
+    client_id = gf_deitransform(this, y);
 
-	FREE (local);
+    subvol = dht_get_subvol_from_id(this, client_id);
+
+    if (!subvol)
+        subvol = conf->subvolumes[0];
+
+    if (subvol_p)
+        *subvol_p = subvol;
+
+    return 0;
 }
 
+void
+dht_local_wipe(xlator_t *this, dht_local_t *local)
+{
+    int i = 0;
+
+    if (!local)
+        return;
+
+    loc_wipe(&local->loc);
+    loc_wipe(&local->loc2);
+    loc_wipe(&local->loc2_copy);
+
+    if (local->xattr)
+        dict_unref(local->xattr);
+
+    if (local->inode)
+        inode_unref(local->inode);
+
+    if (local->layout) {
+        dht_layout_unref(this, local->layout);
+        local->layout = NULL;
+    }
+
+    loc_wipe(&local->linkfile.loc);
+
+    if (local->linkfile.xattr)
+        dict_unref(local->linkfile.xattr);
+
+    if (local->linkfile.inode)
+        inode_unref(local->linkfile.inode);
+
+    if (local->fd) {
+        fd_unref(local->fd);
+        local->fd = NULL;
+    }
+
+    if (local->params) {
+        dict_unref(local->params);
+        local->params = NULL;
+    }
+
+    if (local->xattr_req)
+        dict_unref(local->xattr_req);
+    if (local->mds_xattr)
+        dict_unref(local->mds_xattr);
+    if (local->xdata)
+        dict_unref(local->xdata);
+
+    if (local->selfheal.layout) {
+        dht_layout_unref(this, local->selfheal.layout);
+        local->selfheal.layout = NULL;
+    }
+
+    if (local->selfheal.refreshed_layout) {
+        dht_layout_unref(this, local->selfheal.refreshed_layout);
+        local->selfheal.refreshed_layout = NULL;
+    }
+
+    for (i = 0; i < 2; i++) {
+        dht_lock_array_free(local->lock[i].ns.parent_layout.locks,
+                            local->lock[i].ns.parent_layout.lk_count);
+
+        GF_FREE(local->lock[i].ns.parent_layout.locks);
+
+        dht_lock_array_free(local->lock[i].ns.directory_ns.locks,
+                            local->lock[i].ns.directory_ns.lk_count);
+        GF_FREE(local->lock[i].ns.directory_ns.locks);
+    }
+
+    GF_FREE(local->key);
+
+    if (local->rebalance.xdata)
+        dict_unref(local->rebalance.xdata);
+
+    if (local->rebalance.xattr)
+        dict_unref(local->rebalance.xattr);
+
+    if (local->rebalance.dict)
+        dict_unref(local->rebalance.dict);
+
+    GF_FREE(local->rebalance.vector);
+
+    if (local->rebalance.iobref)
+        iobref_unref(local->rebalance.iobref);
+
+    if (local->stub) {
+        call_stub_destroy(local->stub);
+        local->stub = NULL;
+    }
+
+    if (local->ret_cache)
+        GF_FREE(local->ret_cache);
+
+    mem_put(local);
+}
 
 dht_local_t *
-dht_local_init (call_frame_t *frame)
+dht_local_init(call_frame_t *frame, loc_t *loc, fd_t *fd, glusterfs_fop_t fop)
 {
-	dht_local_t *local = NULL;
+    dht_local_t *local = NULL;
+    inode_t *inode = NULL;
+    int ret = 0;
 
-	/* TODO: use mem-pool */
-	local = CALLOC (1, sizeof (*local));
+    local = mem_get0(THIS->local_pool);
+    if (!local)
+        goto out;
 
-	if (!local)
-		return NULL;
+    if (loc) {
+        ret = loc_copy(&local->loc, loc);
+        if (ret)
+            goto out;
 
-	local->op_ret = -1;
-	local->op_errno = EUCLEAN;
+        inode = loc->inode;
+    }
 
-	frame->local = local;
+    if (fd) {
+        local->fd = fd_ref(fd);
+        if (!inode)
+            inode = fd->inode;
+    }
 
-	return local;
-}
+    local->op_ret = -1;
+    local->op_errno = EUCLEAN;
+    local->fop = fop;
+
+    if (inode) {
+        local->layout = dht_layout_get(frame->this, inode);
+        local->cached_subvol = dht_subvol_get_cached(frame->this, inode);
+    }
+
+    frame->local = local;
 
+out:
+    if (ret) {
+        if (local)
+            mem_put(local);
+        local = NULL;
+    }
+    return local;
+}
 
-char *
-basestr (const char *str)
+xlator_t *
+dht_first_up_subvol(xlator_t *this)
 {
-        char *basestr = NULL;
+    dht_conf_t *conf = NULL;
+    xlator_t *child = NULL;
+    int i = 0;
+    time_t time = 0;
+
+    conf = this->private;
+    if (!conf)
+        goto out;
+
+    LOCK(&conf->subvolume_lock);
+    {
+        for (i = 0; i < conf->subvolume_cnt; i++) {
+            if (conf->subvol_up_time[i]) {
+                if (!time) {
+                    time = conf->subvol_up_time[i];
+                    child = conf->subvolumes[i];
+                } else if (time > conf->subvol_up_time[i]) {
+                    time = conf->subvol_up_time[i];
+                    child = conf->subvolumes[i];
+                }
+            }
+        }
+    }
+    UNLOCK(&conf->subvolume_lock);
+
+out:
+    return child;
+}
 
-        basestr = strrchr (str, '/');
-        if (basestr)
-                basestr ++;
+xlator_t *
+dht_last_up_subvol(xlator_t *this)
+{
+    dht_conf_t *conf = NULL;
+    xlator_t *child = NULL;
+    int i = 0;
+
+    conf = this->private;
+    if (!conf)
+        goto out;
+
+    LOCK(&conf->subvolume_lock);
+    {
+        for (i = conf->subvolume_cnt - 1; i >= 0; i--) {
+            if (conf->subvolume_status[i]) {
+                child = conf->subvolumes[i];
+                break;
+            }
+        }
+    }
+    UNLOCK(&conf->subvolume_lock);
 
-        return basestr;
+out:
+    return child;
 }
 
 xlator_t *
-dht_first_up_child (xlator_t *this)
-{
-	dht_conf_t *conf = NULL;
-	xlator_t   *child = NULL;
-	int         i = 0;
-
-	conf = this->private;
-	
-	LOCK (&conf->subvolume_lock);
-	{
-		for (i = 0; i < conf->subvolume_cnt; i++) {
-			if (conf->subvolume_status[i]) {
-				child = conf->subvolumes[i];
-				break;
-			}
-		}
-	}
-	UNLOCK (&conf->subvolume_lock);
-	
-	return child;
+dht_subvol_get_hashed(xlator_t *this, loc_t *loc)
+{
+    dht_layout_t *layout = NULL;
+    xlator_t *subvol = NULL;
+    dht_conf_t *conf = NULL;
+    dht_methods_t *methods = NULL;
+
+    GF_VALIDATE_OR_GOTO("dht", this, out);
+    GF_VALIDATE_OR_GOTO(this->name, loc, out);
+
+    conf = this->private;
+    GF_VALIDATE_OR_GOTO(this->name, conf, out);
+
+    methods = &(conf->methods);
+
+    if (__is_root_gfid(loc->gfid)) {
+        subvol = dht_first_up_subvol(this);
+        goto out;
+    }
+
+    GF_VALIDATE_OR_GOTO(this->name, loc->parent, out);
+    GF_VALIDATE_OR_GOTO(this->name, loc->name, out);
+
+    layout = dht_layout_get(this, loc->parent);
+
+    if (!layout) {
+        gf_msg_debug(this->name, 0, "Missing layout. path=%s, parent gfid =%s",
+                     loc->path, uuid_utoa(loc->parent->gfid));
+        goto out;
+    }
+
+    subvol = methods->layout_search(this, layout, loc->name);
+
+    if (!subvol) {
+        gf_msg_debug(this->name, 0, "No hashed subvolume for path=%s",
+                     loc->path);
+        goto out;
+    }
+
+out:
+    if (layout) {
+        dht_layout_unref(this, layout);
+    }
+
+    return subvol;
 }
 
 xlator_t *
-dht_subvol_get_hashed (xlator_t *this, loc_t *loc)
+dht_subvol_get_cached(xlator_t *this, inode_t *inode)
 {
-        dht_layout_t *layout = NULL;
-        xlator_t     *subvol = NULL;
+    dht_layout_t *layout = NULL;
+    xlator_t *subvol = NULL;
 
-        if (is_fs_root (loc)) {
-                subvol = dht_first_up_child (this);
-                goto out;
+    GF_VALIDATE_OR_GOTO("dht", this, out);
+    GF_VALIDATE_OR_GOTO(this->name, inode, out);
+
+    layout = dht_layout_get(this, inode);
+
+    if (!layout) {
+        goto out;
+    }
+
+    subvol = layout->list[0].xlator;
+
+out:
+    if (layout) {
+        dht_layout_unref(this, layout);
+    }
+
+    return subvol;
+}
+
+xlator_t *
+dht_subvol_next(xlator_t *this, xlator_t *prev)
+{
+    dht_conf_t *conf = NULL;
+    int i = 0;
+    xlator_t *next = NULL;
+
+    conf = this->private;
+    if (!conf)
+        goto out;
+
+    for (i = 0; i < conf->subvolume_cnt; i++) {
+        if (conf->subvolumes[i] == prev) {
+            if ((i + 1) < conf->subvolume_cnt)
+                next = conf->subvolumes[i + 1];
+            break;
         }
+    }
 
-        layout = dht_layout_get (this, loc->parent);
+out:
+    return next;
+}
 
-        if (!layout) {
-                gf_log (this->name, GF_LOG_ERROR,
-                        "layout missing path=%s parent=%"PRId64,
-                        loc->path, loc->parent->ino);
-                goto out;
+/* This func wraps around, if prev is actually the last subvol.
+ */
+xlator_t *
+dht_subvol_next_available(xlator_t *this, xlator_t *prev)
+{
+    dht_conf_t *conf = NULL;
+    int i = 0;
+    xlator_t *next = NULL;
+
+    conf = this->private;
+    if (!conf)
+        goto out;
+
+    for (i = 0; i < conf->subvolume_cnt; i++) {
+        if (conf->subvolumes[i] == prev) {
+            /* if prev is last in conf->subvolumes, then wrap
+             * around.
+             */
+            if ((i + 1) < conf->subvolume_cnt) {
+                next = conf->subvolumes[i + 1];
+            } else {
+                next = conf->subvolumes[0];
+            }
+            break;
+        }
+    }
+
+out:
+    return next;
+}
+int
+dht_subvol_cnt(xlator_t *this, xlator_t *subvol)
+{
+    int i = 0;
+    int ret = -1;
+    dht_conf_t *conf = NULL;
+
+    conf = this->private;
+    if (!conf)
+        goto out;
+
+    for (i = 0; i < conf->subvolume_cnt; i++) {
+        if (subvol == conf->subvolumes[i]) {
+            ret = i;
+            break;
         }
+    }
 
-        subvol = dht_layout_search (this, layout, loc->name);
+out:
+    return ret;
+}
 
-        if (!subvol) {
-                gf_log (this->name, GF_LOG_ERROR,
-                        "could not find subvolume for path=%s",
-                        loc->path);
-                goto out;
+#define set_if_greater(a, b)                                                   \
+    do {                                                                       \
+        if ((a) < (b))                                                         \
+            (a) = (b);                                                         \
+    } while (0)
+
+#define set_if_greater_time(a, an, b, bn)                                      \
+    do {                                                                       \
+        if (((a) < (b)) || (((a) == (b)) && ((an) < (bn)))) {                  \
+            (a) = (b);                                                         \
+            (an) = (bn);                                                       \
+        }                                                                      \
+    } while (0)
+
+int
+dht_iatt_merge(xlator_t *this, struct iatt *to, struct iatt *from)
+{
+    if (!from || !to)
+        return 0;
+
+    to->ia_dev = from->ia_dev;
+
+    gf_uuid_copy(to->ia_gfid, from->ia_gfid);
+
+    to->ia_ino = from->ia_ino;
+    to->ia_prot = from->ia_prot;
+    to->ia_type = from->ia_type;
+    to->ia_nlink = from->ia_nlink;
+    to->ia_rdev = from->ia_rdev;
+    to->ia_size += from->ia_size;
+    to->ia_blksize = from->ia_blksize;
+    to->ia_blocks += from->ia_blocks;
+
+    if (IA_ISDIR(from->ia_type)) {
+        to->ia_blocks = DHT_DIR_STAT_BLOCKS;
+        to->ia_size = DHT_DIR_STAT_SIZE;
+    }
+    set_if_greater(to->ia_uid, from->ia_uid);
+    set_if_greater(to->ia_gid, from->ia_gid);
+
+    set_if_greater_time(to->ia_atime, to->ia_atime_nsec, from->ia_atime,
+                        from->ia_atime_nsec);
+    set_if_greater_time(to->ia_mtime, to->ia_mtime_nsec, from->ia_mtime,
+                        from->ia_mtime_nsec);
+    set_if_greater_time(to->ia_ctime, to->ia_ctime_nsec, from->ia_ctime,
+                        from->ia_ctime_nsec);
+
+    return 0;
+}
+
+int
+dht_build_child_loc(xlator_t *this, loc_t *child, loc_t *parent, char *name)
+{
+    if (!child) {
+        goto err;
+    }
+
+    if (strcmp(parent->path, "/") == 0)
+        gf_asprintf((char **)&child->path, "/%s", name);
+    else
+        gf_asprintf((char **)&child->path, "%s/%s", parent->path, name);
+
+    if (!child->path) {
+        goto err;
+    }
+
+    child->name = strrchr(child->path, '/');
+    if (child->name)
+        child->name++;
+
+    child->parent = inode_ref(parent->inode);
+    child->inode = inode_new(parent->inode->table);
+
+    if (!child->inode) {
+        goto err;
+    }
+
+    return 0;
+err:
+    if (child) {
+        loc_wipe(child);
+    }
+    return -1;
+}
+
+int
+dht_init_local_subvolumes(xlator_t *this, dht_conf_t *conf)
+{
+    xlator_list_t *subvols = NULL;
+    int cnt = 0;
+
+    if (!conf)
+        return -1;
+
+    for (subvols = this->children; subvols; subvols = subvols->next)
+        cnt++;
+
+    conf->local_subvols = GF_CALLOC(cnt, sizeof(xlator_t *),
+                                    gf_dht_mt_xlator_t);
+
+    /* FIX FIX : do this dynamically*/
+    conf->local_nodeuuids = GF_CALLOC(cnt, sizeof(subvol_nodeuuids_info_t),
+                                      gf_dht_nodeuuids_t);
+
+    if (!conf->local_subvols || !conf->local_nodeuuids) {
+        return -1;
+    }
+
+    conf->local_subvols_cnt = 0;
+
+    return 0;
+}
+
+int
+dht_init_subvolumes(xlator_t *this, dht_conf_t *conf)
+{
+    xlator_list_t *subvols = NULL;
+    int cnt = 0;
+
+    if (!conf)
+        return -1;
+
+    for (subvols = this->children; subvols; subvols = subvols->next)
+        cnt++;
+
+    conf->subvolumes = GF_CALLOC(cnt, sizeof(xlator_t *), gf_dht_mt_xlator_t);
+    if (!conf->subvolumes) {
+        return -1;
+    }
+    conf->subvolume_cnt = cnt;
+    /* Doesn't make sense to do any dht layer tasks
+       if the subvol count is 1. Set it as pass_through */
+    if (cnt == 1)
+        this->pass_through = _gf_true;
+
+    conf->local_subvols_cnt = 0;
+
+    dht_set_subvol_range(this);
+
+    cnt = 0;
+    for (subvols = this->children; subvols; subvols = subvols->next)
+        conf->subvolumes[cnt++] = subvols->xlator;
+
+    conf->subvolume_status = GF_CALLOC(cnt, sizeof(char), gf_dht_mt_char);
+    if (!conf->subvolume_status) {
+        return -1;
+    }
+
+    conf->last_event = GF_CALLOC(cnt, sizeof(int), gf_dht_mt_char);
+    if (!conf->last_event) {
+        return -1;
+    }
+
+    conf->subvol_up_time = GF_CALLOC(cnt, sizeof(time_t),
+                                     gf_dht_mt_subvol_time);
+    if (!conf->subvol_up_time) {
+        return -1;
+    }
+
+    conf->du_stats = GF_CALLOC(conf->subvolume_cnt, sizeof(dht_du_t),
+                               gf_dht_mt_dht_du_t);
+    if (!conf->du_stats) {
+        return -1;
+    }
+
+    conf->decommissioned_bricks = GF_CALLOC(cnt, sizeof(xlator_t *),
+                                            gf_dht_mt_xlator_t);
+    if (!conf->decommissioned_bricks) {
+        return -1;
+    }
+
+    return 0;
+}
+
+/*
+ op_ret values :
+  0 : Success.
+ -1 : Failure.
+  1 : File is being migrated but not by this DHT layer.
+*/
+
+static int
+dht_migration_complete_check_done(int op_ret, call_frame_t *frame, void *data)
+{
+    dht_local_t *local = NULL;
+    xlator_t *subvol = NULL;
+
+    local = frame->local;
+
+    if (op_ret != 0)
+        goto out;
+
+    if (local->cached_subvol == NULL) {
+        local->op_errno = EINVAL;
+        goto out;
+    }
+
+    subvol = local->cached_subvol;
+
+out:
+    local->rebalance.target_op_fn(THIS, subvol, frame, op_ret);
+
+    return 0;
+}
+
+int
+dht_migration_complete_check_task(void *data)
+{
+    int ret = -1;
+    xlator_t *src_node = NULL;
+    xlator_t *dst_node = NULL, *linkto_target = NULL;
+    dht_local_t *local = NULL;
+    dict_t *dict = NULL;
+    struct iatt stbuf = {
+        0,
+    };
+    xlator_t *this = NULL;
+    call_frame_t *frame = NULL;
+    loc_t tmp_loc = {
+        0,
+    };
+    char *path = NULL;
+    dht_conf_t *conf = NULL;
+    inode_t *inode = NULL;
+    fd_t *iter_fd = NULL;
+    fd_t *tmp = NULL;
+    uint64_t tmp_miginfo = 0;
+    dht_migrate_info_t *miginfo = NULL;
+    gf_boolean_t skip_open = _gf_false;
+    int open_failed = 0;
+
+    this = THIS;
+    frame = data;
+    local = frame->local;
+    conf = this->private;
+
+    src_node = local->cached_subvol;
+
+    if (!local->loc.inode && !local->fd) {
+        local->op_errno = EINVAL;
+        goto out;
+    }
+
+    inode = (!local->fd) ? local->loc.inode : local->fd->inode;
+
+    /* getxattr on cached_subvol for 'linkto' value. Do path based getxattr
+     * as root:root. If a fd is already open, access check won't be done*/
+
+    if (!local->loc.inode) {
+        ret = syncop_fgetxattr(src_node, local->fd, &dict,
+                               conf->link_xattr_name, NULL, NULL);
+    } else {
+        SYNCTASK_SETID(0, 0);
+        ret = syncop_getxattr(src_node, &local->loc, &dict,
+                              conf->link_xattr_name, NULL, NULL);
+        SYNCTASK_SETID(frame->root->uid, frame->root->gid);
+    }
+
+    /*
+     * Each DHT xlator layer has its own name for the linkto xattr.
+     * If the file mode bits indicate that the file is being migrated but
+     * this layer's linkto xattr is not set, it means that another
+     * DHT layer is migrating the file. In this case, return 1 so
+     * the mode bits can be passed on to the higher layer for appropriate
+     * action.
+     */
+    if (-ret == ENODATA) {
+        /* This DHT translator is not migrating this file */
+
+        ret = inode_ctx_reset1(inode, this, &tmp_miginfo);
+        if (tmp_miginfo) {
+            /* This can be a problem if the file was
+             * migrated by two different layers. Raise
+             * a warning here.
+             */
+            gf_smsg(
+                this->name, GF_LOG_WARNING, 0, DHT_MSG_HAS_MIGINFO, "tmp=%s",
+                tmp_loc.path ? tmp_loc.path : uuid_utoa(tmp_loc.gfid), NULL);
+
+            miginfo = (void *)(uintptr_t)tmp_miginfo;
+            GF_REF_PUT(miginfo);
         }
+        ret = 1;
+        goto out;
+    }
+
+    if (!ret)
+        linkto_target = dht_linkfile_subvol(this, NULL, NULL, dict);
+
+    if (local->loc.inode) {
+        loc_copy(&tmp_loc, &local->loc);
+    } else {
+        tmp_loc.inode = inode_ref(inode);
+        gf_uuid_copy(tmp_loc.gfid, inode->gfid);
+    }
+
+    ret = syncop_lookup(this, &tmp_loc, &stbuf, 0, 0, 0);
+    if (ret) {
+        gf_smsg(this->name, GF_LOG_ERROR, -ret, DHT_MSG_FILE_LOOKUP_FAILED,
+                "tmp=%s", tmp_loc.path ? tmp_loc.path : uuid_utoa(tmp_loc.gfid),
+                "name=%s", this->name, NULL);
+        local->op_errno = -ret;
+        ret = -1;
+        goto out;
+    }
+
+    dst_node = dht_subvol_get_cached(this, tmp_loc.inode);
+    if (linkto_target && dst_node != linkto_target) {
+        gf_smsg(this->name, GF_LOG_WARNING, 0, DHT_MSG_INVALID_LINKFILE,
+                "linkto_target_name=%s", linkto_target->name, "dst_name=%s",
+                dst_node->name, NULL);
+    }
+
+    if (gf_uuid_compare(stbuf.ia_gfid, tmp_loc.inode->gfid)) {
+        gf_smsg(this->name, GF_LOG_ERROR, 0, DHT_MSG_GFID_MISMATCH, "tmp=%s",
+                tmp_loc.path ? tmp_loc.path : uuid_utoa(tmp_loc.gfid),
+                "dst_name=%s", dst_node->name, NULL);
+        ret = -1;
+        local->op_errno = EIO;
+        goto out;
+    }
+
+    /* update local. A layout is set in inode-ctx in lookup already */
+
+    dht_layout_unref(this, local->layout);
+
+    local->layout = dht_layout_get(frame->this, inode);
+    local->cached_subvol = dst_node;
+
+    ret = 0;
+
+    /* once we detect the migration complete, the inode-ctx2 is no more
+       required.. delete the ctx and also, it means, open() already
+       done on all the fd of inode */
+    ret = inode_ctx_reset1(inode, this, &tmp_miginfo);
+    if (tmp_miginfo) {
+        miginfo = (void *)(uintptr_t)tmp_miginfo;
+        GF_REF_PUT(miginfo);
+        goto out;
+    }
+
+    /* perform 'open()' on all the fd's present on the inode */
+    if (tmp_loc.path == NULL) {
+        inode_path(inode, NULL, &path);
+        if (path)
+            tmp_loc.path = path;
+    }
+
+    LOCK(&inode->lock);
+
+    if (list_empty(&inode->fd_list))
+        goto unlock;
+
+    /* perform open as root:root. There is window between linkfile
+     * creation(root:root) and setattr with the correct uid/gid
+     */
+    SYNCTASK_SETID(0, 0);
+
+    /* It's possible that we are the last user of iter_fd after each
+     * iteration. In this case the fd_unref() of iter_fd at the end of
+     * the loop will cause the destruction of the fd. So we need to
+     * iterate the list safely because iter_fd cannot be trusted.
+     */
+    iter_fd = list_entry((&inode->fd_list)->next, typeof(*iter_fd), inode_list);
+    while (&iter_fd->inode_list != (&inode->fd_list)) {
+        if (fd_is_anonymous(iter_fd) ||
+            (dht_fd_open_on_dst(this, iter_fd, dst_node))) {
+            if (!tmp) {
+                iter_fd = list_entry(iter_fd->inode_list.next, typeof(*iter_fd),
+                                     inode_list);
+                continue;
+            }
+            skip_open = _gf_true;
+        }
+        /* We need to release the inode->lock before calling
+         * syncop_open() to avoid possible deadlocks. However this
+         * can cause the iter_fd to be released by other threads.
+         * To avoid this, we take a reference before releasing the
+         * lock.
+         */
+        fd_ref(iter_fd);
+
+        UNLOCK(&inode->lock);
+
+        if (tmp) {
+            fd_unref(tmp);
+            tmp = NULL;
+        }
+        if (skip_open)
+            goto next;
+
+        /* flags for open are stripped down to allow following the
+         * new location of the file, otherwise we can get EEXIST or
+         * truncate the file again as rebalance is moving the data */
+        ret = syncop_open(dst_node, &tmp_loc,
+                          (iter_fd->flags & ~(O_CREAT | O_EXCL | O_TRUNC)),
+                          iter_fd, NULL, NULL);
+        if (ret < 0) {
+            gf_smsg(this->name, GF_LOG_ERROR, -ret,
+                    DHT_MSG_OPEN_FD_ON_DST_FAILED, "id=%p", iter_fd,
+                    "flags=0%o", iter_fd->flags, "path=%s", path, "name=%s",
+                    dst_node->name, NULL);
+
+            open_failed = 1;
+            local->op_errno = -ret;
+            ret = -1;
+        } else {
+            dht_fd_ctx_set(this, iter_fd, dst_node);
+        }
+
+    next:
+        LOCK(&inode->lock);
+        skip_open = _gf_false;
+        tmp = iter_fd;
+        iter_fd = list_entry(tmp->inode_list.next, typeof(*tmp), inode_list);
+    }
+
+    SYNCTASK_SETID(frame->root->uid, frame->root->gid);
+
+    if (open_failed) {
+        ret = -1;
+        goto unlock;
+    }
+    ret = 0;
+
+unlock:
+    UNLOCK(&inode->lock);
+    if (tmp) {
+        fd_unref(tmp);
+        tmp = NULL;
+    }
 
 out:
-        return subvol;
+    if (dict) {
+        dict_unref(dict);
+    }
+
+    loc_wipe(&tmp_loc);
+
+    return ret;
 }
 
+/* Schedule the "migration complete" check for @frame as a synctask.
+ *
+ * dht_migration_complete_check_task is queued on this->ctx->env with
+ * dht_migration_complete_check_done as its completion callback; @frame is
+ * supplied for both of synctask_new()'s trailing arguments.
+ *
+ * Returns the value returned by synctask_new().
+ */
+int
+dht_rebalance_complete_check(xlator_t *this, call_frame_t *frame)
+{
+    int ret = -1;
 
-xlator_t *
-dht_subvol_get_cached (xlator_t *this, inode_t *inode)
+    ret = synctask_new(this->ctx->env, dht_migration_complete_check_task,
+                       dht_migration_complete_check_done, frame, frame);
+    return ret;
+}
+
+/* Completion callback registered for dht_rebalance_inprogress_task.
+ *
+ * During 'in-progress' state, both nodes should have the file.
+ *
+ * op_ret values :
+ *   0 : Success
+ *  -1 : Failure.
+ *   1 : File is being migrated but not by this DHT layer.
+ *
+ * In every case the fop-specific continuation stored in
+ * local->rebalance.target_op_fn is invoked with op_ret passed through
+ * unchanged. dst_subvol is only resolved when op_ret is 0; otherwise the
+ * continuation receives a NULL subvolume. @data is not used here.
+ *
+ * Always returns 0.
+ */
+static int
+dht_inprogress_check_done(int op_ret, call_frame_t *frame, void *data)
 {
-        dht_layout_t *layout = NULL;
-        xlator_t     *subvol = NULL;
+    dht_local_t *local = NULL;
+    xlator_t *dst_subvol = NULL, *src_subvol = NULL;
+    inode_t *inode = NULL;
 
+    local = frame->local;
 
-        layout = dht_layout_get (this, inode);
+    if (op_ret != 0)
+        goto out;
 
-        if (!layout) {
-                goto out;
+    /* Prefer the inode from the loc; fall back to the fd's inode. */
+    inode = local->loc.inode ? local->loc.inode : local->fd->inode;
+
+    /* If the migration info stored on the inode is not usable for this
+     * request, fall back to the cached subvolume of the inode. */
+    dht_inode_ctx_get_mig_info(THIS, inode, &src_subvol, &dst_subvol);
+    if (dht_mig_info_is_invalid(local->cached_subvol, src_subvol, dst_subvol)) {
+        dst_subvol = dht_subvol_get_cached(THIS, inode);
+        if (!dst_subvol) {
+            /* NOTE(review): op_ret is still 0 here, so the continuation
+             * is called with a NULL subvolume and op_ret == 0; only
+             * local->op_errno signals the failure. */
+            local->op_errno = EINVAL;
+            goto out;
         }
+    }
 
-	subvol = layout->list[0].xlator;
+out:
+    local->rebalance.target_op_fn(THIS, dst_subvol, frame, op_ret);
 
+    return 0;
+}
+
+/* Synctask body run when a file is found to be under migration
+ * ("in progress").
+ *
+ * @data is the call frame; request state is read from frame->local.
+ *
+ * Steps, in order:
+ *   1. Read this layer's linkto xattr (conf->link_xattr_name) from
+ *      local->cached_subvol to find the migration destination.
+ *   2. Look the file up on the destination and compare gfids.
+ *   3. Open on the destination every fd of the inode that is neither
+ *      anonymous nor already open there.
+ *   4. Record the source/destination pair in the inode ctx.
+ *
+ * Return values:
+ *    0 : success
+ *   -1 : failure
+ *    1 : the file is being migrated, but not by this DHT layer
+ */
+static int
+dht_rebalance_inprogress_task(void *data)
+{
+    int ret = -1;
+    xlator_t *src_node = NULL;
+    xlator_t *dst_node = NULL;
+    dht_local_t *local = NULL;
+    dict_t *dict = NULL;
+    call_frame_t *frame = NULL;
+    xlator_t *this = NULL;
+    char *path = NULL;
+    struct iatt stbuf = {
+        0,
+    };
+    loc_t tmp_loc = {
+        0,
+    };
+    dht_conf_t *conf = NULL;
+    inode_t *inode = NULL;
+    fd_t *iter_fd = NULL;
+    fd_t *tmp = NULL;
+    int open_failed = 0;
+    uint64_t tmp_miginfo = 0;
+    dht_migrate_info_t *miginfo = NULL;
+    gf_boolean_t skip_open = _gf_false;
+
+    this = THIS;
+    frame = data;
+    local = frame->local;
+    conf = this->private;
+
+    src_node = local->cached_subvol;
+
+    if (!local->loc.inode && !local->fd)
+        goto out;
+
+    inode = (!local->fd) ? local->loc.inode : local->fd->inode;
+
+    /* getxattr on cached_subvol for 'linkto' value. Do path based getxattr
+     * as root:root. If a fd is already open, access check won't be done*/
+    if (local->loc.inode) {
+        SYNCTASK_SETID(0, 0);
+        ret = syncop_getxattr(src_node, &local->loc, &dict,
+                              conf->link_xattr_name, NULL, NULL);
+        SYNCTASK_SETID(frame->root->uid, frame->root->gid);
+    } else {
+        ret = syncop_fgetxattr(src_node, local->fd, &dict,
+                               conf->link_xattr_name, NULL, NULL);
+    }
+
+    /*
+     * Each DHT xlator layer has its own name for the linkto xattr.
+     * If the file mode bits indicate that the file is being migrated but
+     * this layer's linkto xattr is not present, it means that another
+     * DHT layer is migrating the file. In this case, return 1 so
+     * the mode bits can be passed on to the higher layer for appropriate
+     * action.
+     */
+
+    if (-ret == ENODATA) {
+        /* This DHT layer is not migrating this file */
+        ret = inode_ctx_reset1(inode, this, &tmp_miginfo);
+        if (tmp_miginfo) {
+            /* This can be a problem if the file was
+             * migrated by two different layers. Raise
+             * a warning here.
+             *
+             * tmp_loc has not been filled in yet at this point, so
+             * identify the file from the request loc or the inode gfid.
+             */
+            gf_smsg(
+                this->name, GF_LOG_WARNING, 0, DHT_MSG_HAS_MIGINFO, "tmp=%s",
+                local->loc.path ? local->loc.path : uuid_utoa(inode->gfid),
+                NULL);
+            miginfo = (void *)(uintptr_t)tmp_miginfo;
+            GF_REF_PUT(miginfo);
+        }
+        ret = 1;
+        goto out;
+    }
+
+    if (ret < 0) {
+        gf_smsg(this->name, GF_LOG_ERROR, -ret, DHT_MSG_GET_XATTR_FAILED,
+                "path=%s", local->loc.path, NULL);
+        ret = -1;
+        goto out;
+    }
+
+    dst_node = dht_linkfile_subvol(this, NULL, NULL, dict);
+    if (!dst_node) {
+        gf_smsg(this->name, GF_LOG_ERROR, 0, DHT_MSG_GET_XATTR_FAILED,
+                "path=%s", local->loc.path, NULL);
+        ret = -1;
+        goto out;
+    }
+
+    local->rebalance.target_node = dst_node;
+
+    if (local->loc.inode) {
+        loc_copy(&tmp_loc, &local->loc);
+    } else {
+        tmp_loc.inode = inode_ref(inode);
+        gf_uuid_copy(tmp_loc.gfid, inode->gfid);
+    }
+
+    /* lookup on dst */
+    ret = syncop_lookup(dst_node, &tmp_loc, &stbuf, NULL, NULL, NULL);
+    if (ret) {
+        gf_smsg(this->name, GF_LOG_ERROR, -ret, DHT_MSG_FILE_LOOKUP_FAILED,
+                "tmp=%s", tmp_loc.path ? tmp_loc.path : uuid_utoa(tmp_loc.gfid),
+                "name=%s", dst_node->name, NULL);
+        ret = -1;
+        goto out;
+    }
+
+    if (gf_uuid_compare(stbuf.ia_gfid, tmp_loc.inode->gfid)) {
+        gf_smsg(this->name, GF_LOG_ERROR, 0, DHT_MSG_GFID_MISMATCH, "tmp=%s",
+                tmp_loc.path ? tmp_loc.path : uuid_utoa(tmp_loc.gfid),
+                "name=%s", dst_node->name, NULL);
+        ret = -1;
+        goto out;
+    }
+    ret = 0;
+
+    /* 'path' is only populated when tmp_loc came without a path; it is
+     * then owned by tmp_loc and released by loc_wipe() below. */
+    if (tmp_loc.path == NULL) {
+        inode_path(inode, NULL, &path);
+        if (path)
+            tmp_loc.path = path;
+    }
+
+    LOCK(&inode->lock);
+
+    if (list_empty(&inode->fd_list))
+        goto unlock;
+
+    /* perform open as root:root. There is window between linkfile
+     * creation(root:root) and setattr with the correct uid/gid
+     */
+    SYNCTASK_SETID(0, 0);
+
+    /* It's possible that we are the last user of iter_fd after each
+     * iteration. In this case the fd_unref() of iter_fd at the end of
+     * the loop will cause the destruction of the fd. So we need to
+     * iterate the list safely because iter_fd cannot be trusted.
+     */
+    iter_fd = list_entry((&inode->fd_list)->next, typeof(*iter_fd), inode_list);
+    while (&iter_fd->inode_list != (&inode->fd_list)) {
+        /* We need to release the inode->lock before calling
+         * syncop_open() to avoid possible deadlocks. However this
+         * can cause the iter_fd to be released by other threads.
+         * To avoid this, we take a reference before releasing the
+         * lock.
+         */
+
+        if (fd_is_anonymous(iter_fd) ||
+            (dht_fd_open_on_dst(this, iter_fd, dst_node))) {
+            if (!tmp) {
+                /* Nothing to open and no fd from the previous
+                 * iteration to release: step forward under the lock. */
+                iter_fd = list_entry(iter_fd->inode_list.next, typeof(*iter_fd),
+                                     inode_list);
+                continue;
+            }
+            skip_open = _gf_true;
+        }
+
+        /* Yes, this is ugly but there isn't a cleaner way to do this
+         * the fd_ref is an atomic increment so not too bad. We want to
+         * reduce the number of inode locks and unlocks.
+         */
+
+        fd_ref(iter_fd);
+        UNLOCK(&inode->lock);
+
+        /* Drop the reference held on the previous fd now that the lock
+         * has been released. */
+        if (tmp) {
+            fd_unref(tmp);
+            tmp = NULL;
+        }
+        if (skip_open)
+            goto next;
+
+        /* flags for open are stripped down to allow following the
+         * new location of the file, otherwise we can get EEXIST or
+         * truncate the file again as rebalance is moving the data */
+        ret = syncop_open(dst_node, &tmp_loc,
+                          (iter_fd->flags & ~(O_CREAT | O_EXCL | O_TRUNC)),
+                          iter_fd, NULL, NULL);
+        if (ret < 0) {
+            /* Log the path actually used for the open; the local 'path'
+             * is NULL whenever tmp_loc.path was supplied by loc_copy(). */
+            gf_smsg(this->name, GF_LOG_ERROR, -ret,
+                    DHT_MSG_OPEN_FD_ON_DST_FAILED, "fd=%p", iter_fd,
+                    "flags=0%o", iter_fd->flags, "path=%s",
+                    tmp_loc.path ? tmp_loc.path : uuid_utoa(inode->gfid),
+                    "name=%s", dst_node->name, NULL);
+            ret = -1;
+            open_failed = 1;
+        } else {
+            /* Potential fd leak if this fails here as it will be
+               reopened at the next Phase1/2 check */
+            dht_fd_ctx_set(this, iter_fd, dst_node);
+        }
+
+    next:
+        LOCK(&inode->lock);
+        skip_open = _gf_false;
+        tmp = iter_fd;
+        iter_fd = list_entry(tmp->inode_list.next, typeof(*tmp), inode_list);
+    }
+
+    SYNCTASK_SETID(frame->root->uid, frame->root->gid);
+
+unlock:
+    UNLOCK(&inode->lock);
+
+    if (tmp) {
+        fd_unref(tmp);
+        tmp = NULL;
+    }
+    if (open_failed) {
+        ret = -1;
+        goto out;
+    }
+
+    ret = dht_inode_ctx_set_mig_info(this, inode, src_node, dst_node);
+    if (ret) {
+        gf_smsg(this->name, GF_LOG_ERROR, 0, DHT_MSG_SET_INODE_CTX_FAILED,
+                "path=%s", local->loc.path, "name=%s", dst_node->name, NULL);
+        goto out;
+    }
+
+    ret = 0;
 out:
-        return subvol;
+    if (dict) {
+        dict_unref(dict);
+    }
+
+    loc_wipe(&tmp_loc);
+    return ret;
 }
 
+/* Schedule the "migration in progress" check for @frame as a synctask.
+ *
+ * dht_rebalance_inprogress_task is queued on this->ctx->env with
+ * dht_inprogress_check_done as its completion callback; @frame is
+ * supplied for both of synctask_new()'s trailing arguments (the task
+ * reads the frame back from its data pointer).
+ *
+ * Returns the value returned by synctask_new().
+ */
+int
+dht_rebalance_in_progress_check(xlator_t *this, call_frame_t *frame)
+{
+    int ret = -1;
 
-xlator_t *
-dht_subvol_next (xlator_t *this, xlator_t *prev)
+    ret = synctask_new(this->ctx->env, dht_rebalance_inprogress_task,
+                       dht_inprogress_check_done, frame, frame);
+    return ret;
+}
+
+/* Store @layout_int as the layout in the DHT ctx of @inode.
+ *
+ * If the inode has no DHT ctx yet, a zeroed one is allocated and
+ * attached with dht_inode_ctx_set().
+ *
+ * Returns the result of dht_inode_ctx_set(), or -1 if a new ctx could
+ * not be allocated. A ctx allocated here is freed again if it could not
+ * be attached to the inode.
+ */
+int
+dht_inode_ctx_layout_set(inode_t *inode, xlator_t *this,
+                         dht_layout_t *layout_int)
+{
+    dht_inode_ctx_t *ctx = NULL;
+    gf_boolean_t ctx_is_new = _gf_false;
+    int ret = -1;
+
+    ret = dht_inode_ctx_get(inode, this, &ctx);
+    if (ret || !ctx) {
+        ctx = GF_CALLOC(1, sizeof(*ctx), gf_dht_mt_inode_ctx_t);
+        if (!ctx)
+            return -1;
+        ctx_is_new = _gf_true;
+    }
+
+    ctx->layout = layout_int;
+
+    ret = dht_inode_ctx_set(inode, this, ctx);
+    if (ret && ctx_is_new) {
+        /* The new ctx was never attached to the inode, so nothing else
+         * references it: release it instead of leaking it. */
+        GF_FREE(ctx);
+    }
+
+    return ret;
+}
+
+/* Overwrite the mtime/ctime/atime (seconds and nanoseconds) cached in the
+ * DHT ctx of @inode with the values from @stat.
+ *
+ * Does nothing when the inode has no DHT ctx. No lock is taken here.
+ */
+void
+dht_inode_ctx_time_set(inode_t *inode, xlator_t *this, struct iatt *stat)
 {
-	dht_conf_t *conf = NULL;
-	int         i = 0;
-	xlator_t   *next = NULL;
+    dht_inode_ctx_t *ctx = NULL;
+    dht_stat_time_t *time = NULL;
+    int ret = -1;
+
+    ret = dht_inode_ctx_get(inode, this, &ctx);
+
+    /* Guard against a successful lookup that yields no ctx pointer, as
+     * dht_inode_ctx_layout_set() does. */
+    if (ret || !ctx)
+        return;
+
+    time = &ctx->time;
 
-	conf = this->private;
+    time->mtime = stat->ia_mtime;
+    time->mtime_nsec = stat->ia_mtime_nsec;
 
-	for (i = 0; i < conf->subvolume_cnt; i++) {
-		if (conf->subvolumes[i] == prev) {
-			if ((i + 1) < conf->subvolume_cnt)
-				next = conf->subvolumes[i + 1];
-			break;
-		}
-	}
+    time->ctime = stat->ia_ctime;
+    time->ctime_nsec = stat->ia_ctime_nsec;
 
-	return next;
+    time->atime = stat->ia_atime;
+    time->atime_nsec = stat->ia_atime_nsec;
+
+    return;
 }
 
+/* Merge the timestamps of @stat into the times cached in the DHT ctx of
+ * @inode, using DHT_UPDATE_TIME for each of mtime, ctime and atime. @post
+ * is forwarded to DHT_UPDATE_TIME unchanged.
+ *
+ * A ctx is allocated and attached if the inode has none. The time fields
+ * are updated under inode->lock.
+ *
+ * Returns -1 if a new ctx could not be allocated, 0 otherwise.
+ * NOTE(review): a NULL @stat/@inode or a failing dht_inode_ctx_set() is
+ * still reported as 0, as before; confirm whether callers want these
+ * propagated.
+ */
+int
+dht_inode_ctx_time_update(inode_t *inode, xlator_t *this, struct iatt *stat,
+                          int32_t post)
+{
+    dht_inode_ctx_t *ctx = NULL;
+    dht_stat_time_t *time = NULL;
+    gf_boolean_t ctx_is_new = _gf_false;
+    int ret = -1;
+
+    GF_VALIDATE_OR_GOTO(this->name, stat, out);
+    GF_VALIDATE_OR_GOTO(this->name, inode, out);
+
+    ret = dht_inode_ctx_get(inode, this, &ctx);
+
+    /* Also allocate when the lookup succeeded but produced no ctx, so
+     * that &ctx->time below never works on a NULL pointer. */
+    if (ret || !ctx) {
+        ctx = GF_CALLOC(1, sizeof(*ctx), gf_dht_mt_inode_ctx_t);
+        if (!ctx)
+            return -1;
+        ctx_is_new = _gf_true;
+    }
+
+    time = &ctx->time;
+
+    LOCK(&inode->lock);
+    {
+        DHT_UPDATE_TIME(time->mtime, time->mtime_nsec, stat->ia_mtime,
+                        stat->ia_mtime_nsec, post);
+        DHT_UPDATE_TIME(time->ctime, time->ctime_nsec, stat->ia_ctime,
+                        stat->ia_ctime_nsec, post);
+        DHT_UPDATE_TIME(time->atime, time->atime_nsec, stat->ia_atime,
+                        stat->ia_atime_nsec, post);
+    }
+    UNLOCK(&inode->lock);
+
+    ret = dht_inode_ctx_set(inode, this, ctx);
+    if (ret && ctx_is_new) {
+        /* The new ctx was never attached to the inode: free it rather
+         * than leak it. */
+        GF_FREE(ctx);
+    }
+out:
+    return 0;
+}
 
 int
-dht_subvol_cnt (xlator_t *this, xlator_t *subvol)
+dht_inode_ctx_get(inode_t *inode, xlator_t *this, dht_inode_ctx_t **ctx)
+{
+    /* Fetch the DHT ctx stored on @inode for @this.
+     *
+     * Returns the result of inode_ctx_get(), or -1 when @this or @inode
+     * fails validation. *ctx is written only on success and only when
+     * @ctx is non-NULL.
+     */
+    uint64_t raw_ctx = 0;
+    int status = -1;
+
+    GF_VALIDATE_OR_GOTO("dht", this, done);
+    GF_VALIDATE_OR_GOTO(this->name, inode, done);
+
+    status = inode_ctx_get(inode, this, &raw_ctx);
+    if (!status && ctx)
+        *ctx = (dht_inode_ctx_t *)(uintptr_t)raw_ctx;
+
+done:
+    return status;
+}
+
+/* Attach @ctx to @inode as the DHT inode ctx of @this.
+ *
+ * Returns the result of inode_ctx_set(), or -1 when @this, @inode or
+ * @ctx fails validation.
+ */
+int
+dht_inode_ctx_set(inode_t *inode, xlator_t *this, dht_inode_ctx_t *ctx)
+{
+    int ret = -1;
+    uint64_t ctx_int = 0;
+
+    GF_VALIDATE_OR_GOTO("dht", this, out);
+    GF_VALIDATE_OR_GOTO(this->name, inode, out);
+    GF_VALIDATE_OR_GOTO(this->name, ctx, out);
+
+    /* Convert through uintptr_t (the inverse of the cast used in
+     * dht_inode_ctx_get) so the pointer is not truncated where long is
+     * narrower than a pointer. */
+    ctx_int = (uint64_t)(uintptr_t)ctx;
+    ret = inode_ctx_set(inode, this, &ctx_int);
+out:
+    return ret;
+}
+
+/* Return the conf->subvolume_status entry that corresponds to @subvol,
+ * or 0 when @subvol is not one of conf->subvolumes.
+ */
+int
+dht_subvol_status(dht_conf_t *conf, xlator_t *subvol)
+{
+    int idx = 0;
+
+    while (idx < conf->subvolume_cnt) {
+        if (conf->subvolumes[idx] == subvol)
+            return conf->subvolume_status[idx];
+        idx++;
+    }
+
+    return 0;
+}
+
+/* Walk @path one component at a time, starting at itable->root, and make
+ * sure every component has an inode linked in @itable.
+ *
+ * For each component, inode_grep() is tried first; when the inode is
+ * already in the table the lookup is skipped. Otherwise a new inode is
+ * created, looked up through @this with syncop_lookup(), and linked with
+ * inode_link().
+ *
+ * Returns the inode of the last component processed, after calling
+ * inode_ref() on it, or NULL when duplicating the path, creating an
+ * inode, the lookup or the link failed.
+ */
+inode_t *
+dht_heal_path(xlator_t *this, char *path, inode_table_t *itable)
 {
-	int i = 0;
-	int ret = -1;
-	dht_conf_t *conf = NULL;
+    int ret = -1;
+    struct iatt iatt = {
+        0,
+    };
+    inode_t *linked_inode = NULL;
+    loc_t loc = {
+        0,
+    };
+    char *bname = NULL;
+    char *save_ptr = NULL;
+    /* Parent gfid used for the first path component (presumably the
+     * root gfid -- confirm). */
+    static uuid_t gfid = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1};
+    char *tmp_path = NULL;
+
+    /* strtok_r() modifies its input, so tokenize a private copy. */
+    tmp_path = gf_strdup(path);
+    if (!tmp_path) {
+        goto out;
+    }
+
+    gf_uuid_copy(loc.pargfid, gfid);
+    loc.parent = inode_ref(itable->root);
+
+    bname = strtok_r(tmp_path, "/", &save_ptr);
+
+    /* sending a lookup on parent directory,
+     * Eg:  if  path is like /a/b/c/d/e/f/g/
+     * then we will send a lookup on a first and then b,c,d,etc
+     */
+
+    while (bname) {
+        /* Reset so that any failure in this iteration returns NULL. */
+        linked_inode = NULL;
+        loc.inode = inode_grep(itable, loc.parent, bname);
+        if (loc.inode == NULL) {
+            loc.inode = inode_new(itable);
+            if (loc.inode == NULL) {
+                ret = -ENOMEM;
+                goto out;
+            }
+        } else {
+            /*
+             * Inode is already populated in the inode table.
+             * Which means we already looked up the inode and
+             * linked with a dentry. So that we will skip
+             * lookup on this entry, and proceed to next.
+             */
+            linked_inode = loc.inode;
+            bname = strtok_r(NULL, "/", &save_ptr);
+            if (!bname) {
+                goto out;
+            }
+            /* Descend: the inode just found becomes the parent of the
+             * next component. */
+            inode_unref(loc.parent);
+            loc.parent = loc.inode;
+            gf_uuid_copy(loc.pargfid, loc.inode->gfid);
+            loc.inode = NULL;
+            continue;
+        }
 
+        loc.name = bname;
+        /* NOTE(review): the result of loc_path() is overwritten by the
+         * lookup below without being checked. */
+        ret = loc_path(&loc, bname);
 
-	conf = this->private;
+        ret = syncop_lookup(this, &loc, &iatt, NULL, NULL, NULL);
+        if (ret) {
+            gf_smsg(this->name, GF_LOG_INFO, -ret, DHT_MSG_DIR_SELFHEAL_FAILED,
+                    "path=%s", path, "subvolume=%s", this->name, "bname=%s",
+                    bname, NULL);
+            goto out;
+        }
+
+        linked_inode = inode_link(loc.inode, loc.parent, bname, &iatt);
+        if (!linked_inode)
+            goto out;
 
-	for (i = 0; i < conf->subvolume_cnt; i++) {
-		if (subvol == conf->subvolumes[i]) {
-			ret = i;
-			break;
-		}
-	}
+        loc_wipe(&loc);
+        gf_uuid_copy(loc.pargfid, linked_inode->gfid);
+        loc.inode = NULL;
 
-	return ret;
+        /* Only hand linked_inode over as the next parent when there is
+         * another component to process. */
+        bname = strtok_r(NULL, "/", &save_ptr);
+        if (bname)
+            loc.parent = linked_inode;
+    }
+out:
+    /* linked_inode may be NULL here; presumably inode_ref() tolerates
+     * a NULL argument -- confirm. */
+    inode_ref(linked_inode);
+    loc_wipe(&loc);
+    GF_FREE(tmp_path);
+
+    return linked_inode;
 }
 
+/* Heal the full ancestry path of the inode in heal_frame->local.
+ *
+ * @data is the heal frame. The source subvolume is taken from
+ * heal_frame->cookie, which is cleared afterwards. The ancestry path is
+ * read from the source with a getxattr on GET_ANCESTRY_PATH_KEY
+ * (addressed by local->gfid) and then walked with dht_heal_path().
+ *
+ * Always returns 0; failures are logged or silently skipped.
+ */
+int
+dht_heal_full_path(void *data)
+{
+    call_frame_t *heal_frame = data;
+    dht_local_t *local = NULL;
+    loc_t loc = {
+        0,
+    };
+    dict_t *dict = NULL;
+    char *path = NULL;
+    int ret = -1;
+    xlator_t *source = NULL;
+    xlator_t *this = NULL;
+    inode_table_t *itable = NULL;
+    inode_t *inode = NULL;
+    inode_t *tmp_inode = NULL;
+
+    GF_VALIDATE_OR_GOTO("DHT", heal_frame, out);
+
+    local = heal_frame->local;
+    this = heal_frame->this;
+    source = heal_frame->cookie;
+    heal_frame->cookie = NULL;
+    gf_uuid_copy(loc.gfid, local->gfid);
+
+    /* Nothing to heal without an inode in the request loc. */
+    if (local->loc.inode)
+        loc.inode = inode_ref(local->loc.inode);
+    else
+        goto out;
+
+    itable = loc.inode->table;
+    ret = syncop_getxattr(source, &loc, &dict, GET_ANCESTRY_PATH_KEY, NULL,
+                          NULL);
+    if (ret) {
+        gf_smsg(this->name, GF_LOG_INFO, -ret, DHT_MSG_DIR_HEAL_ABORT,
+                "subvol=%s", source->name, NULL);
+        goto out;
+    }
+
+    /* The return value is not checked; only a non-NULL path is acted on. */
+    ret = dict_get_str(dict, GET_ANCESTRY_PATH_KEY, &path);
+    if (path) {
+        inode = dht_heal_path(this, path, itable);
+        if (inode && inode != local->inode) {
+            /*
+             * if inode returned by heal function is different
+             * from what we passed, which means a racing thread
+             * already linked a different inode for dentry.
+             * So we will update our local->inode, so that we can
+             * return proper inode.
+             */
+            tmp_inode = local->inode;
+            local->inode = inode;
+            inode_unref(tmp_inode);
+            tmp_inode = NULL;
+        } else {
+            /* Same inode (or NULL): drop the reference returned by
+             * dht_heal_path(). */
+            inode_unref(inode);
+        }
+    }
 
-#define set_if_greater(a, b) do {		\
-		if ((a) < (b))			\
-			(a) = (b);		\
-	} while (0)
+out:
+    loc_wipe(&loc);
+    if (dict)
+        dict_unref(dict);
+    return 0;
+}
 
+/* Completion step for the full-path heal.
+ *
+ * Detaches the original lookup frame (local->main_frame) from the heal
+ * frame's local, applies dht_set_fixed_dir_stat() to the postparent,
+ * runs a directory xattr heal when local->need_xattr_heal is set (a
+ * failure is only logged), then unwinds the lookup on the main frame
+ * with op_ret 0 / op_errno 0 and destroys the heal frame.
+ *
+ * @op_ret and @data are not used. Always returns 0.
+ */
 int
-dht_stat_merge (xlator_t *this, struct stat *to,
-		struct stat *from, xlator_t *subvol)
+dht_heal_full_path_done(int op_ret, call_frame_t *heal_frame, void *data)
 {
-	to->st_dev      = from->st_dev;
+    call_frame_t *main_frame = NULL;
+    dht_local_t *local = NULL;
+    xlator_t *this = NULL;
+    int ret = -1;
+    int op_errno = 0;
+
+    local = heal_frame->local;
+    main_frame = local->main_frame;
+    local->main_frame = NULL;
+    this = heal_frame->this;
+
+    dht_set_fixed_dir_stat(&local->postparent);
+    if (local->need_xattr_heal) {
+        /* Clear the flag before healing. */
+        local->need_xattr_heal = 0;
+        ret = dht_dir_xattr_heal(this, local, &op_errno);
+        if (ret) {
+            gf_smsg(this->name, GF_LOG_ERROR, op_errno,
+                    DHT_MSG_DIR_XATTR_HEAL_FAILED, "path=%s", local->loc.path,
+                    NULL);
+        }
+    }
 
-	dht_itransform (this, subvol, from->st_ino, &to->st_ino);
+    DHT_STACK_UNWIND(lookup, main_frame, 0, 0, local->inode, &local->stbuf,
+                     local->xattr, &local->postparent);
+
+    DHT_STACK_DESTROY(heal_frame);
+    return 0;
+}
+
+/* Record @lock_subvol as the lock subvolume in the DHT ctx of @inode.
+ *
+ * This function must be called inside an inode lock.
+ *
+ * Returns 0 on success and -1 when @inode fails validation or the inode
+ * has no DHT ctx; no ctx is created here.
+ */
+int
+__dht_lock_subvol_set(inode_t *inode, xlator_t *this, xlator_t *lock_subvol)
+{
+    dht_inode_ctx_t *ctx = NULL;
+    int ret = -1;
+    uint64_t value = 0;
 
-	to->st_mode     = from->st_mode;
-	to->st_nlink    = from->st_nlink;
-	to->st_uid      = from->st_uid;
-	to->st_gid      = from->st_gid;
-	to->st_rdev     = from->st_rdev;
-	to->st_size    += from->st_size;
-	to->st_blksize  = from->st_blksize;
-	to->st_blocks  += from->st_blocks;
+    GF_VALIDATE_OR_GOTO(this->name, inode, out);
 
-	set_if_greater (to->st_atime, from->st_atime);
-	set_if_greater (to->st_mtime, from->st_mtime);
-	set_if_greater (to->st_ctime, from->st_ctime);
+    /* Fail when the lookup fails or when it yields a zero value. */
+    ret = __inode_ctx_get0(inode, this, &value);
+    if (ret || !value) {
+        return -1;
+    }
 
-	return 0;
+    ctx = (dht_inode_ctx_t *)(uintptr_t)value;
+    ctx->lock_subvol = lock_subvol;
+out:
+    return ret;
+}
+
+/* Choose the subvolume to which the lock request described by @lock
+ * should be sent for the inode of @local (local->loc.inode, else
+ * local->fd->inode).
+ *
+ * - For inodes that are neither directories nor of invalid type, this is
+ *   local->cached_subvol.
+ * - Otherwise the lock subvolume stored in the inode ctx is used. When
+ *   none is stored and the request is not an unlock, local->cached_subvol
+ *   is stored as the lock subvolume and returned.
+ *
+ * For non-unlock requests on the second path an inode reference is taken
+ * and kept when a subvolume is returned; it is dropped again here when
+ * NULL is returned.
+ *
+ * Returns NULL when @lock or @local fails validation, when no inode is
+ * available, or when no lock subvolume could be determined.
+ */
+xlator_t *
+dht_get_lock_subvolume(xlator_t *this, struct gf_flock *lock,
+                       dht_local_t *local)
+{
+    xlator_t *subvol = NULL;
+    inode_t *inode = NULL;
+    int32_t ret = -1;
+    uint64_t value = 0;
+    xlator_t *cached_subvol = NULL;
+    dht_inode_ctx_t *ctx = NULL;
+    char gfid[GF_UUID_BUF_SIZE] = {0};
+
+    GF_VALIDATE_OR_GOTO(this->name, lock, out);
+    GF_VALIDATE_OR_GOTO(this->name, local, out);
+
+    cached_subvol = local->cached_subvol;
+
+    if (local->loc.inode || local->fd) {
+        inode = local->loc.inode ? local->loc.inode : local->fd->inode;
+    }
+
+    if (!inode)
+        goto out;
+
+    if (!(IA_ISDIR(inode->ia_type) || IA_ISINVAL(inode->ia_type))) {
+        /*
+         * We may get non-linked inode for directories as part
+         * of the selfheal code path. So checking  for IA_INVAL
+         * type also. This will only happen for directory.
+         */
+        subvol = local->cached_subvol;
+        goto out;
+    }
+
+    if (lock->l_type != F_UNLCK) {
+        /*
+         * inode purging might happen on NFS between a lk
+         * and unlk. Due to this lk and unlk might be sent
+         * to different subvols.
+         * So during a lock request, taking a ref on inode
+         * to prevent inode purging. inode unref will happen
+         * in unlock cbk code path.
+         */
+        inode_ref(inode);
+    }
+
+    LOCK(&inode->lock);
+    /* Use the lock subvolume already stored on the inode, if any. */
+    ret = __inode_ctx_get0(inode, this, &value);
+    if (!ret && value) {
+        ctx = (dht_inode_ctx_t *)(uintptr_t)value;
+        subvol = ctx->lock_subvol;
+    }
+    if (!subvol && lock->l_type != F_UNLCK && cached_subvol) {
+        ret = __dht_lock_subvol_set(inode, this, cached_subvol);
+        if (ret) {
+            /* Release the inode lock before logging; the jump below
+             * skips the common UNLOCK. */
+            gf_uuid_unparse(inode->gfid, gfid);
+            UNLOCK(&inode->lock);
+            gf_smsg(this->name, GF_LOG_WARNING, 0, DHT_MSG_SET_INODE_CTX_FAILED,
+                    "lock_subvol gfid=%s", gfid, NULL);
+            goto post_unlock;
+        }
+        subvol = cached_subvol;
+    }
+    UNLOCK(&inode->lock);
+post_unlock:
+    /* No subvolume to return: undo the reference taken above. */
+    if (!subvol && inode && lock->l_type != F_UNLCK) {
+        inode_unref(inode);
+    }
+out:
+    return subvol;
+}
+
+/* Drop an inode reference after a lock fop on a directory (or
+ * invalid-type) inode has completed.
+ *
+ * - F_RDLCK / F_WRLCK: the inode is unref'ed only when the lock request
+ *   failed (@op_ret non-zero); -1 is returned in that case.
+ * - F_UNLCK: the inode is unref'ed only when the unlock succeeded;
+ *   otherwise a warning is logged and -1 is returned.
+ *
+ * Returns 0 otherwise, including for inodes that are neither directories
+ * nor of invalid type, and -1 when no inode can be found on the frame.
+ */
+int
+dht_lk_inode_unref(call_frame_t *frame, int32_t op_ret)
+{
+    char gfid_str[GF_UUID_BUF_SIZE] = {0};
+    dht_local_t *local = frame->local;
+    xlator_t *this = frame->this;
+    inode_t *inode = NULL;
+    int ret = -1;
+
+    if (local->loc.inode)
+        inode = local->loc.inode;
+    else if (local->fd)
+        inode = local->fd->inode;
+
+    if (!inode) {
+        gf_smsg(this->name, GF_LOG_WARNING, 0, DHT_MSG_LOCK_INODE_UNREF_FAILED,
+                NULL);
+        return ret;
+    }
+
+    if (!IA_ISDIR(inode->ia_type) && !IA_ISINVAL(inode->ia_type))
+        return 0;
+
+    if (local->lock_type == F_RDLCK || local->lock_type == F_WRLCK) {
+        if (op_ret) {
+            gf_uuid_unparse(inode->gfid, gfid_str);
+            gf_msg_debug(this->name, 0, "lock request failed for gfid %s",
+                         gfid_str);
+            inode_unref(inode);
+            return ret;
+        }
+    } else if (local->lock_type == F_UNLCK) {
+        if (op_ret) {
+            gf_uuid_unparse(inode->gfid, gfid_str);
+            gf_smsg(this->name, GF_LOG_WARNING, 0,
+                    DHT_MSG_LOCK_INODE_UNREF_FAILED, "gfid=%s", gfid_str, NULL);
+            return ret;
+        }
+        inode_unref(inode);
+    }
+
+    return 0;
+}
+
+/* Copy the extended attributes that take part in directory healing from
+ * @src into @dst.
+ *
+ * All "user.*" keys are handled through dht_set_user_xattr; the remaining
+ * keys come from get_xattrs_to_heal(), starting at index 1 because the
+ * user xattrs have already been covered.
+ *
+ * On return, *uret (if non-NULL) holds the result of the "user.*" pass
+ * and *uflag (if non-NULL) is 1 when at least one of the other keys was
+ * present in @src, -1 otherwise. Nothing is written when @src or @dst is
+ * NULL.
+ */
+void
+dht_dir_set_heal_xattr(xlator_t *this, dht_local_t *local, dict_t *dst,
+                       dict_t *src, int *uret, int *uflag)
+{
+    char **heal_keys = NULL;
+    char **key = NULL;
+    data_t *value = NULL;
+    int user_result = -1;
+    int custom_found = -1;
+
+    if (!dst || !src) {
+        gf_smsg(this->name, GF_LOG_WARNING, EINVAL, DHT_MSG_DST_NULL_SET_FAILED,
+                "path=%s", local->loc.path, NULL);
+        return;
+    }
+
+    /* Pass 1: every user xattr present in src. */
+    user_result = dict_foreach_fnmatch(src, "user.*", dht_set_user_xattr, dst);
+
+    /* Pass 2: the remaining heal keys, skipping the first list entry. */
+    heal_keys = get_xattrs_to_heal();
+    for (key = heal_keys + 1; *key; key++) {
+        value = dict_get(src, *key);
+        if (!value)
+            continue;
+
+        custom_found = 1;
+        if (dict_set(dst, *key, value))
+            gf_smsg(this->name, GF_LOG_WARNING, ENOMEM,
+                    DHT_MSG_DICT_SET_FAILED, "key=%s", *key, "path=%s",
+                    local->loc.path, NULL);
+    }
+
+    if (uret)
+        *uret = user_result;
+    if (uflag)
+        *uflag = custom_found;
 }
diff --git a/xlators/cluster/dht/src/dht-inode-read.c b/xlators/cluster/dht/src/dht-inode-read.c
new file mode 100644
index 00000000000..dbb8070b0da
--- /dev/null
+++ b/xlators/cluster/dht/src/dht-inode-read.c
@@ -0,0 +1,1658 @@
+/*
+  Copyright (c) 2008-2012 Red Hat, Inc. <http://www.redhat.com>
+  This file is part of GlusterFS.
+
+  This file is licensed to you under your choice of the GNU Lesser
+  General Public License, version 3 or any later version (LGPLv3 or
+  later), or the GNU General Public License, version 2 (GPLv2), in all
+  cases as published by the Free Software Foundation.
+*/
+
+#include "dht-common.h"
+/* Forward declarations of the static retry helpers defined in this file. */
+static int
+dht_access2(xlator_t *this, xlator_t *dst_node, call_frame_t *frame, int ret);
+static int
+dht_readv2(xlator_t *this, xlator_t *dst_node, call_frame_t *frame, int ret);
+static int
+dht_attr2(xlator_t *this, xlator_t *dst_node, call_frame_t *frame, int ret);
+static int
+dht_open2(xlator_t *this, xlator_t *dst_node, call_frame_t *frame, int ret);
+static int
+dht_flush2(xlator_t *this, xlator_t *dst_node, call_frame_t *frame, int ret);
+static int
+dht_lk2(xlator_t *this, xlator_t *dst_node, call_frame_t *frame, int ret);
+static int
+dht_fsync2(xlator_t *this, xlator_t *dst_node, call_frame_t *frame, int ret);
+static int
+dht_common_xattrop2(xlator_t *this, xlator_t *subvol, call_frame_t *frame,
+                    int ret);
+
+static int
+dht_open_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int op_ret,
+             int op_errno, fd_t *fd, dict_t *xdata)
+{
+    dht_local_t *local = NULL;
+    xlator_t *prev = NULL;
+    int ret = 0;
+    /* Callback for open(); cookie is the subvolume the call was wound to. */
+    local = frame->local;
+    prev = cookie;
+
+    local->op_errno = op_errno;
+    if ((op_ret == -1) && !dht_inode_missing(op_errno)) {
+        gf_msg_debug(this->name, op_errno, "subvolume %s returned -1",
+                     prev->name);
+        goto out;
+    }
+
+    /* First attempt (call_cnt == 1) succeeded: pass prev to the fd ctx */
+    if (!op_ret && (local->call_cnt == 1)) {
+        dht_fd_ctx_set(this, fd, prev);
+        goto out;
+    }
+    /* A reply to the retry (call_cnt != 1) is unwound as it came. */
+    if (!op_ret || (local->call_cnt != 1))
+        goto out;
+
+    /* first attempt failed, rebalance would have happened: retry via dht_open2 */
+    local->rebalance.target_op_fn = dht_open2;
+    ret = dht_rebalance_complete_check(this, frame);
+    if (!ret)
+        return 0;
+    /* non-zero ret: fall through and unwind the original reply here */
+out:
+    DHT_STACK_UNWIND(open, frame, op_ret, op_errno, local->fd, xdata);
+
+    return 0;
+}
+
+static int
+dht_open2(xlator_t *this, xlator_t *subvol, call_frame_t *frame, int ret)
+{
+    dht_local_t *state = NULL;
+    int saved_errno = EINVAL;
+
+    if ((frame == NULL) || (frame->local == NULL))
+        goto fail;
+
+    state = frame->local;
+    saved_errno = state->op_errno;
+
+    if (we_are_not_migrating(ret)) {
+        /* Not migrated by this DHT layer: fail with the stored errno. */
+        DHT_STACK_UNWIND(open, frame, -1, state->op_errno, NULL,
+                         state->rebalance.xdata);
+        return 0;
+    }
+
+    if (!subvol)
+        goto fail;
+
+    /* Second attempt: replay the saved open() arguments on subvol. */
+    state->call_cnt = 2;
+    STACK_WIND_COOKIE(frame, dht_open_cbk, subvol, subvol, subvol->fops->open,
+                      &state->loc, state->rebalance.flags, state->fd,
+                      state->xattr_req);
+    return 0;
+
+fail:
+    DHT_STACK_UNWIND(open, frame, -1, saved_errno, NULL, NULL);
+    return 0;
+}
+
+int
+dht_open(call_frame_t *frame, xlator_t *this, loc_t *loc, int flags, fd_t *fd,
+         dict_t *xdata)
+{
+    dht_local_t *state = NULL;
+    xlator_t *target = NULL;
+    int unwind_errno = -1; /* -1: take errno on the failure path */
+
+    VALIDATE_OR_GOTO(frame, fail);
+    VALIDATE_OR_GOTO(this, fail);
+    VALIDATE_OR_GOTO(fd, fail);
+
+    state = dht_local_init(frame, loc, fd, GF_FOP_OPEN);
+    if (state == NULL) {
+        unwind_errno = ENOMEM;
+        goto fail;
+    }
+
+    target = state->cached_subvol;
+    if (target == NULL) {
+        gf_msg_debug(this->name, 0, "no cached subvolume for fd=%p", fd);
+        unwind_errno = EINVAL;
+        goto fail;
+    }
+
+    /* Saved so that dht_open2 can replay the same request if needed. */
+    state->call_cnt = 1;
+    state->rebalance.flags = flags;
+    if (xdata != NULL)
+        state->xattr_req = dict_ref(xdata);
+
+    STACK_WIND_COOKIE(frame, dht_open_cbk, target, target, target->fops->open,
+                      loc, flags, fd, xdata);
+    return 0;
+
+fail:
+    if (unwind_errno == -1)
+        unwind_errno = errno;
+    DHT_STACK_UNWIND(open, frame, -1, unwind_errno, NULL, NULL);
+    return 0;
+}
+
+int
+dht_file_attr_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int op_ret,
+                  int op_errno, struct iatt *stbuf, dict_t *xdata)
+{
+    xlator_t *subvol1 = 0;
+    xlator_t *subvol2 = 0;
+    dht_local_t *local = NULL;
+    xlator_t *prev = NULL;
+    int ret = -1;
+    inode_t *inode = NULL;
+    /* Callback for a stat/fstat wound to one subvolume (passed as cookie). */
+    GF_VALIDATE_OR_GOTO("dht", frame, err);
+    GF_VALIDATE_OR_GOTO("dht", this, out);
+    GF_VALIDATE_OR_GOTO("dht", frame->local, out);
+    GF_VALIDATE_OR_GOTO("dht", cookie, out);
+
+    local = frame->local;
+    prev = cookie;
+    /* fstat only: on a remote-fd failure, a 0 from the helper ends this call */
+    if ((local->fop == GF_FOP_FSTAT) &&
+        dht_check_remote_fd_failed_error(local, op_ret, op_errno)) {
+        ret = dht_check_and_open_fd_on_subvol(this, frame);
+        if (ret)
+            goto out;
+        return 0;
+    }
+
+    if ((op_ret == -1) && !dht_inode_missing(op_errno)) {
+        local->op_errno = op_errno;
+        gf_msg_debug(this->name, op_errno, "subvolume %s returned -1",
+                     prev->name);
+        goto out;
+    }
+    /* The migration checks below apply to the first attempt only. */
+    if (local->call_cnt != 1)
+        goto out;
+
+    local->op_errno = op_errno;
+    local->op_ret = op_ret;
+
+    /* A failure or a phase-2 flagged iatt: the file may have been migrated */
+    if ((op_ret == -1) || IS_DHT_MIGRATION_PHASE2(stbuf)) {
+        local->rebalance.target_op_fn = dht_attr2;
+        dht_set_local_rebalance(this, local, NULL, NULL, stbuf, xdata);
+        inode = (local->fd) ? local->fd->inode : local->loc.inode;
+
+        dht_inode_ctx_get_mig_info(this, inode, &subvol1, &subvol2);
+        if (dht_mig_info_is_invalid(local->cached_subvol, subvol1, subvol2)) {
+            /* Phase 2 of migration */
+            ret = dht_rebalance_complete_check(this, frame);
+            if (!ret)
+                return 0;
+        } else {
+            /* it is a non-fd op or it is an fd based Fop and
+               opened on the dst.*/
+            if (local->fd && !dht_fd_open_on_dst(this, local->fd, subvol2)) {
+                ret = dht_rebalance_complete_check(this, frame);
+                if (!ret)
+                    return 0;
+            } else {
+                dht_attr2(this, subvol2, frame, 0);
+                return 0;
+            }
+        }
+    }
+
+out:
+    DHT_STRIP_PHASE1_FLAGS(stbuf);
+    DHT_STACK_UNWIND(stat, frame, op_ret, op_errno, stbuf, xdata);
+err: /* reached directly only when frame is NULL: nothing to unwind */
+    return 0;
+}
+
+static int
+dht_attr2(xlator_t *this, xlator_t *subvol, call_frame_t *frame, int ret)
+{
+    dht_local_t *state = frame->local;
+    int saved_errno = EINVAL;
+
+    if (state == NULL)
+        goto fail;
+
+    saved_errno = state->op_errno;
+
+    /* This DHT layer is not the one migrating the file: hand back the
+     * first reply (its op_ret and the saved post-op iatt, mode bits
+     * untouched) so that the higher DHT layer can act on it.
+     */
+    if (we_are_not_migrating(ret)) {
+        DHT_STACK_UNWIND(stat, frame, state->op_ret, saved_errno,
+                         &state->rebalance.postbuf, state->rebalance.xdata);
+        return 0;
+    }
+
+    if (!subvol)
+        goto fail;
+
+    /* Mark the frame as being on its second attempt before re-winding. */
+    state->call_cnt = 2;
+
+    if (state->fop != GF_FOP_FSTAT) {
+        STACK_WIND_COOKIE(frame, dht_file_attr_cbk, subvol, subvol,
+                          subvol->fops->stat, &state->loc, state->xattr_req);
+    } else {
+        STACK_WIND_COOKIE(frame, dht_file_attr_cbk, subvol, subvol,
+                          subvol->fops->fstat, state->fd, state->xattr_req);
+    }
+
+    return 0;
+
+fail:
+    DHT_STACK_UNWIND(stat, frame, -1, saved_errno, NULL, NULL);
+    return 0;
+}
+
+static int
+dht_attr_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int op_ret,
+             int op_errno, struct iatt *stbuf, dict_t *xdata)
+{
+    dht_local_t *local = NULL;
+    int this_call_cnt = 0;
+    xlator_t *prev = NULL;
+    local = frame->local;
+    prev = cookie;
+    /* Fan-out stat/fstat callback: replies are merged under frame->lock. */
+    LOCK(&frame->lock);
+    {
+        if (op_ret == -1) {
+            local->op_errno = op_errno;
+            UNLOCK(&frame->lock); /* this path jumps past the UNLOCK below */
+            gf_msg_debug(this->name, op_errno, "subvolume %s returned -1",
+                         prev->name);
+
+            goto post_unlock;
+        }
+        /* Fold this subvolume's iatt into the aggregate kept in local. */
+        dht_iatt_merge(this, &local->stbuf, stbuf);
+
+        local->op_ret = 0;
+    }
+    UNLOCK(&frame->lock);
+post_unlock:
+    this_call_cnt = dht_frame_return(frame);
+    if (is_last_call(this_call_cnt)) {
+        DHT_STACK_UNWIND(stat, frame, local->op_ret, local->op_errno,
+                         &local->stbuf, xdata);
+    }
+
+    return 0;
+}
+
+int
+dht_stat(call_frame_t *frame, xlator_t *this, loc_t *loc, dict_t *xdata)
+{
+    dht_local_t *state = NULL;
+    dht_layout_t *placement = NULL;
+    xlator_t *target = NULL;
+    int unwind_errno = -1; /* -1: take errno on the failure path */
+    int fanout = 0;
+    int idx = 0;
+
+    VALIDATE_OR_GOTO(frame, fail);
+    VALIDATE_OR_GOTO(this, fail);
+    VALIDATE_OR_GOTO(loc, fail);
+    VALIDATE_OR_GOTO(loc->inode, fail);
+    VALIDATE_OR_GOTO(loc->path, fail);
+
+    state = dht_local_init(frame, loc, NULL, GF_FOP_STAT);
+    if (state == NULL) {
+        unwind_errno = ENOMEM;
+        goto fail;
+    }
+
+    placement = state->layout;
+    if (placement == NULL) {
+        gf_msg_debug(this->name, 0, "no layout for path=%s", loc->path);
+        unwind_errno = EINVAL;
+        goto fail;
+    }
+    if (xdata != NULL)
+        state->xattr_req = dict_ref(xdata);
+
+    if (IA_ISREG(loc->inode->ia_type)) {
+        /* Regular file: one stat, sent to the cached subvolume. */
+        target = state->cached_subvol;
+        state->call_cnt = 1;
+        STACK_WIND_COOKIE(frame, dht_file_attr_cbk, target, target,
+                          target->fops->stat, loc, xdata);
+        return 0;
+    }
+
+    /* Anything else: stat every subvolume in the layout; dht_attr_cbk
+     * merges the replies and unwinds after the last one.
+     */
+    fanout = placement->cnt;
+    state->call_cnt = fanout;
+
+    for (idx = 0; idx < fanout; idx++) {
+        target = placement->list[idx].xlator;
+        STACK_WIND_COOKIE(frame, dht_attr_cbk, target, target,
+                          target->fops->stat, loc, xdata);
+    }
+    return 0;
+
+fail:
+    if (unwind_errno == -1)
+        unwind_errno = errno;
+    DHT_STACK_UNWIND(stat, frame, -1, unwind_errno, NULL, NULL);
+    return 0;
+}
+
+int
+dht_fstat(call_frame_t *frame, xlator_t *this, fd_t *fd, dict_t *xdata)
+{
+    dht_local_t *state = NULL;
+    dht_layout_t *placement = NULL;
+    xlator_t *target = NULL;
+    int unwind_errno = -1; /* -1: take errno on the failure path */
+    int fanout = 0;
+    int idx = 0;
+
+    VALIDATE_OR_GOTO(frame, fail);
+    VALIDATE_OR_GOTO(this, fail);
+    VALIDATE_OR_GOTO(fd, fail);
+
+    state = dht_local_init(frame, NULL, fd, GF_FOP_FSTAT);
+    if (state == NULL) {
+        unwind_errno = ENOMEM;
+        goto fail;
+    }
+
+    placement = state->layout;
+    if (placement == NULL) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, 0, "no layout for fd=%p", fd);
+        unwind_errno = EINVAL;
+        goto fail;
+    }
+    if (xdata != NULL)
+        state->xattr_req = dict_ref(xdata);
+
+    if (IA_ISREG(fd->inode->ia_type)) {
+        /* Regular file: one fstat, sent to the cached subvolume. */
+        target = state->cached_subvol;
+        state->call_cnt = 1;
+        STACK_WIND_COOKIE(frame, dht_file_attr_cbk, target, target,
+                          target->fops->fstat, fd, xdata);
+        return 0;
+    }
+
+    /* Anything else: fstat every layout subvolume; dht_attr_cbk merges. */
+    fanout = placement->cnt;
+    state->call_cnt = fanout;
+
+    for (idx = 0; idx < fanout; idx++) {
+        target = placement->list[idx].xlator;
+        STACK_WIND_COOKIE(frame, dht_attr_cbk, target, target,
+                          target->fops->fstat, fd, xdata);
+    }
+    return 0;
+
+fail:
+    if (unwind_errno == -1)
+        unwind_errno = errno;
+    DHT_STACK_UNWIND(fstat, frame, -1, unwind_errno, NULL, NULL);
+    return 0;
+}
+
+int
+dht_readv_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int op_ret,
+              int op_errno, struct iovec *vector, int count, struct iatt *stbuf,
+              struct iobref *iobref, dict_t *xdata)
+{
+    dht_local_t *local = NULL;
+    int ret = 0;
+    xlator_t *src_subvol = 0;
+    xlator_t *dst_subvol = 0;
+    /* Callback for readv; dht_readv and dht_readv2 both wind to it. */
+    local = frame->local;
+    if (!local) {
+        op_ret = -1;
+        op_errno = EINVAL;
+        goto out;
+    }
+
+    /* This is already second try, no need for re-check */
+    if (local->call_cnt != 1)
+        goto out;
+
+    if (dht_check_remote_fd_failed_error(local, op_ret, op_errno)) {
+        ret = dht_check_and_open_fd_on_subvol(this, frame);
+        if (ret)
+            goto out;
+        return 0;
+    }
+    /* Failures outside the dht_inode_missing() class are unwound as-is. */
+    if ((op_ret == -1) && !dht_inode_missing(op_errno))
+        goto out;
+
+    local->op_errno = op_errno;
+    if ((op_ret == -1) || IS_DHT_MIGRATION_PHASE2(stbuf)) {
+        local->op_ret = op_ret;
+        local->rebalance.target_op_fn = dht_readv2;
+        dht_set_local_rebalance(this, local, NULL, NULL, stbuf, xdata);
+        /* File would be migrated to other node */
+        ret = dht_inode_ctx_get_mig_info(this, local->fd->inode, &src_subvol,
+                                         &dst_subvol);
+
+        if (dht_mig_info_is_invalid(local->cached_subvol, src_subvol,
+                                    dst_subvol) ||
+            !dht_fd_open_on_dst(this, local->fd, dst_subvol)) {
+            ret = dht_rebalance_complete_check(this, frame);
+            if (!ret)
+                return 0;
+        } else {
+            /* value is already set in fd_ctx, that means no need
+               to check for whether its complete or not. */
+            dht_readv2(this, dst_subvol, frame, 0);
+            return 0;
+        }
+    }
+
+out:
+    DHT_STRIP_PHASE1_FLAGS(stbuf);
+
+    DHT_STACK_UNWIND(readv, frame, op_ret, op_errno, vector, count, stbuf,
+                     iobref, xdata);
+
+    return 0;
+}
+
+static int
+dht_readv2(xlator_t *this, xlator_t *subvol, call_frame_t *frame, int ret)
+{
+    dht_local_t *state = frame->local;
+    int saved_errno = EINVAL;
+
+    if (state == NULL)
+        goto fail;
+
+    saved_errno = state->op_errno;
+
+    /* This DHT layer is not the one migrating the file: hand back the
+     * first reply (its op_ret and the saved post-op iatt, mode bits
+     * untouched, no data) so that the higher DHT layer can act on it.
+     */
+    if (we_are_not_migrating(ret)) {
+        DHT_STACK_UNWIND(readv, frame, state->op_ret, saved_errno, NULL, 0,
+                         &state->rebalance.postbuf, NULL,
+                         state->rebalance.xdata);
+        return 0;
+    }
+
+    if (!subvol)
+        goto fail;
+
+    /* Second attempt: replay the saved size/offset/flags on subvol. */
+    state->call_cnt = 2;
+
+    STACK_WIND(frame, dht_readv_cbk, subvol, subvol->fops->readv, state->fd,
+               state->rebalance.size, state->rebalance.offset,
+               state->rebalance.flags, state->xattr_req);
+
+    return 0;
+
+fail:
+    DHT_STACK_UNWIND(readv, frame, -1, saved_errno, NULL, 0, NULL, NULL, NULL);
+    return 0;
+}
+
+int
+dht_readv(call_frame_t *frame, xlator_t *this, fd_t *fd, size_t size, off_t off,
+          uint32_t flags, dict_t *xdata)
+{
+    dht_local_t *state = NULL;
+    xlator_t *target = NULL;
+    int unwind_errno = -1; /* -1: take errno on the failure path */
+
+    VALIDATE_OR_GOTO(frame, fail);
+    VALIDATE_OR_GOTO(this, fail);
+    VALIDATE_OR_GOTO(fd, fail);
+
+    state = dht_local_init(frame, NULL, fd, GF_FOP_READ);
+    if (state == NULL) {
+        unwind_errno = ENOMEM;
+        goto fail;
+    }
+
+    target = state->cached_subvol;
+    if (target == NULL) {
+        gf_msg_debug(this->name, 0, "no cached subvolume for fd=%p", fd);
+        unwind_errno = EINVAL;
+        goto fail;
+    }
+
+    /* Keep the request arguments: dht_readv2 replays them from here. */
+    state->call_cnt = 1;
+    state->rebalance.size = size;
+    state->rebalance.offset = off;
+    state->rebalance.flags = flags;
+    if (xdata != NULL)
+        state->xattr_req = dict_ref(xdata);
+
+    STACK_WIND(frame, dht_readv_cbk, target, target->fops->readv, state->fd,
+               state->rebalance.size, state->rebalance.offset,
+               state->rebalance.flags, state->xattr_req);
+    return 0;
+
+fail:
+    if (unwind_errno == -1)
+        unwind_errno = errno;
+    DHT_STACK_UNWIND(readv, frame, -1, unwind_errno, NULL, 0, NULL, NULL, NULL);
+
+    return 0;
+}
+
+static int
+dht_access_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int op_ret,
+               int op_errno, dict_t *xdata)
+{
+    int ret = -1;
+    dht_local_t *local = NULL;
+    xlator_t *subvol = NULL;
+    xlator_t *prev = NULL;
+    /* Callback for access(); cookie is the subvolume that replied. */
+    local = frame->local;
+    prev = cookie;
+
+    if (!prev)
+        goto out;
+    if (local->call_cnt != 1)
+        goto out;
+    if ((op_ret == -1) &&
+        ((op_errno == ENOTCONN) || dht_inode_missing(op_errno)) &&
+        IA_ISDIR(local->loc.inode->ia_type)) {
+        subvol = dht_subvol_next_available(this, prev);
+        if (!subvol)
+            goto out;
+
+        /* check if we are done with visiting every node */
+        if (subvol == local->cached_subvol) {
+            goto out;
+        }
+        /* Retry with the caller's xdata (kept in xattr_req), like dht_access2 */
+        STACK_WIND_COOKIE(frame, dht_access_cbk, subvol, subvol,
+                          subvol->fops->access, &local->loc,
+                          local->rebalance.flags, local->xattr_req);
+        return 0;
+    }
+    if ((op_ret == -1) && dht_inode_missing(op_errno) &&
+        !(IA_ISDIR(local->loc.inode->ia_type))) {
+        /* File would be migrated to other node */
+        local->op_errno = op_errno;
+        local->rebalance.target_op_fn = dht_access2;
+        ret = dht_rebalance_complete_check(frame->this, frame);
+        if (!ret)
+            return 0;
+    }
+
+out:
+    DHT_STACK_UNWIND(access, frame, op_ret, op_errno, xdata);
+    return 0;
+}
+
+static int
+dht_access2(xlator_t *this, xlator_t *subvol, call_frame_t *frame, int ret)
+{
+    dht_local_t *state = frame->local;
+    int saved_errno = EINVAL;
+
+    if (state == NULL)
+        goto fail;
+
+    saved_errno = state->op_errno;
+
+    /* This DHT layer is not the one migrating the file: fail the call
+     * with the errno recorded from the first attempt so that the higher
+     * DHT layer can handle it.
+     */
+    if (we_are_not_migrating(ret)) {
+        DHT_STACK_UNWIND(access, frame, -1, saved_errno, NULL);
+        return 0;
+    }
+
+    if (!subvol)
+        goto fail;
+
+    /* Second attempt: the mask saved in rebalance.flags and the saved
+     * xdata are replayed on subvol. */
+    state->call_cnt = 2;
+
+    STACK_WIND_COOKIE(frame, dht_access_cbk, subvol, subvol,
+                      subvol->fops->access, &state->loc, state->rebalance.flags,
+                      state->xattr_req);
+
+    return 0;
+
+fail:
+    DHT_STACK_UNWIND(access, frame, -1, saved_errno, NULL);
+    return 0;
+}
+
+int
+dht_access(call_frame_t *frame, xlator_t *this, loc_t *loc, int32_t mask,
+           dict_t *xdata)
+{
+    dht_local_t *state = NULL;
+    xlator_t *target = NULL;
+    int unwind_errno = -1; /* -1: take errno on the failure path */
+
+    VALIDATE_OR_GOTO(frame, fail);
+    VALIDATE_OR_GOTO(this, fail);
+    VALIDATE_OR_GOTO(loc, fail);
+    VALIDATE_OR_GOTO(loc->inode, fail);
+    VALIDATE_OR_GOTO(loc->path, fail);
+
+    state = dht_local_init(frame, loc, NULL, GF_FOP_ACCESS);
+    if (state == NULL) {
+        unwind_errno = ENOMEM;
+        goto fail;
+    }
+
+    /* The mask travels in rebalance.flags so the retry paths can reuse it. */
+    state->rebalance.flags = mask;
+    state->call_cnt = 1;
+    target = state->cached_subvol;
+    if (target == NULL) {
+        gf_msg_debug(this->name, 0, "no cached subvolume for path=%s",
+                     loc->path);
+        unwind_errno = EINVAL;
+        goto fail;
+    }
+    if (xdata != NULL)
+        state->xattr_req = dict_ref(xdata);
+
+    STACK_WIND_COOKIE(frame, dht_access_cbk, target, target,
+                      target->fops->access, loc, mask, xdata);
+    return 0;
+
+fail:
+    if (unwind_errno == -1)
+        unwind_errno = errno;
+    DHT_STACK_UNWIND(access, frame, -1, unwind_errno, NULL);
+    return 0;
+}
+
+int
+dht_flush_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int op_ret,
+              int op_errno, dict_t *xdata)
+{
+    dht_local_t *local = NULL;
+    xlator_t *subvol = 0;
+    int ret = 0;
+    /* Callback for flush; dht_flush and dht_flush2 both wind to it. */
+    local = frame->local;
+
+    local->op_errno = op_errno;
+
+    if (local->call_cnt != 1)
+        goto out;
+
+    if (dht_check_remote_fd_failed_error(local, op_ret, op_errno)) {
+        ret = dht_check_and_open_fd_on_subvol(this, frame);
+        if (ret)
+            goto out;
+        return 0;
+    }
+
+    local->rebalance.target_op_fn = dht_flush2;
+
+    local->op_ret = op_ret;
+    local->op_errno = op_errno;
+
+    /* If migration info is set and the fd is open there, flush that subvol */
+    dht_inode_ctx_get_mig_info(this, local->fd->inode, NULL, &subvol);
+    if (subvol && dht_fd_open_on_dst(this, local->fd, subvol)) {
+        dht_flush2(this, subvol, frame, 0);
+        return 0;
+    }
+
+    if (op_errno == EREMOTE) {
+        ret = dht_rebalance_complete_check(this, frame);
+        if (!ret) {
+            return 0;
+        }
+    }
+
+out:
+    DHT_STACK_UNWIND(flush, frame, op_ret, op_errno, xdata);
+
+    return 0;
+}
+
+static int
+dht_flush2(xlator_t *this, xlator_t *subvol, call_frame_t *frame, int ret)
+{
+    dht_local_t *state = NULL;
+    int32_t saved_errno = EINVAL;
+
+    if (!frame || !frame->local)
+        goto fail;
+
+    state = frame->local;
+    saved_errno = state->op_errno;
+
+    if (!subvol)
+        goto fail;
+
+    /* Second attempt: flush the same fd on subvol. Unlike the other retry
+     * helpers, ret is not consulted here. */
+    state->call_cnt = 2;
+
+    STACK_WIND(frame, dht_flush_cbk, subvol, subvol->fops->flush, state->fd,
+               state->xattr_req);
+    return 0;
+
+fail:
+    DHT_STACK_UNWIND(flush, frame, -1, saved_errno, NULL);
+    return 0;
+}
+
+int
+dht_flush(call_frame_t *frame, xlator_t *this, fd_t *fd, dict_t *xdata)
+{
+    dht_local_t *state = NULL;
+    xlator_t *target = NULL;
+    int unwind_errno = -1; /* -1: take errno on the failure path */
+
+    VALIDATE_OR_GOTO(frame, fail);
+    VALIDATE_OR_GOTO(this, fail);
+    VALIDATE_OR_GOTO(fd, fail);
+
+    state = dht_local_init(frame, NULL, fd, GF_FOP_FLUSH);
+    if (state == NULL) {
+        unwind_errno = ENOMEM;
+        goto fail;
+    }
+
+    target = state->cached_subvol;
+    if (target == NULL) {
+        gf_msg_debug(this->name, 0, "no cached subvolume for fd=%p", fd);
+        unwind_errno = EINVAL;
+        goto fail;
+    }
+
+    state->call_cnt = 1; /* first attempt, as seen by dht_flush_cbk */
+    if (xdata != NULL)
+        state->xattr_req = dict_ref(xdata);
+
+    STACK_WIND(frame, dht_flush_cbk, target, target->fops->flush, fd,
+               state->xattr_req);
+    return 0;
+
+fail:
+    if (unwind_errno == -1)
+        unwind_errno = errno;
+    DHT_STACK_UNWIND(flush, frame, -1, unwind_errno, NULL);
+
+    return 0;
+}
+
+int
+dht_fsync_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int op_ret,
+              int op_errno, struct iatt *prebuf, struct iatt *postbuf,
+              dict_t *xdata)
+{
+    dht_local_t *local = NULL;
+    xlator_t *prev = NULL;
+    int ret = -1;
+    inode_t *inode = NULL;
+    xlator_t *src_subvol = 0;
+    xlator_t *dst_subvol = 0;
+    /* Callback for fsync; cookie is the subvolume the call was wound to. */
+    local = frame->local;
+    prev = cookie;
+
+    local->op_errno = op_errno;
+
+    if (dht_check_remote_fd_failed_error(local, op_ret, op_errno)) {
+        ret = dht_check_and_open_fd_on_subvol(this, frame);
+        if (ret)
+            goto out;
+        return 0;
+    }
+
+    if (op_ret == -1 && !dht_inode_missing(op_errno)) {
+        gf_msg_debug(this->name, op_errno, "subvolume %s returned -1",
+                     prev->name);
+        goto out;
+    }
+    /* Reply to the retry: fold in the iatts saved by the phase-1 branch. */
+    if (local->call_cnt != 1) {
+        if (local->stbuf.ia_blocks) {
+            dht_iatt_merge(this, postbuf, &local->stbuf);
+            dht_iatt_merge(this, prebuf, &local->prebuf);
+        }
+        goto out;
+    }
+
+    local->op_ret = op_ret;
+    inode = local->fd->inode;
+
+    local->rebalance.target_op_fn = dht_fsync2;
+    dht_set_local_rebalance(this, local, NULL, prebuf, postbuf, xdata);
+
+    if ((op_ret == -1) || IS_DHT_MIGRATION_PHASE2(postbuf)) {
+        ret = dht_rebalance_complete_check(this, frame);
+        if (!ret)
+            return 0;
+    }
+
+    /* Check if the rebalance phase1 is true */
+    if (IS_DHT_MIGRATION_PHASE1(postbuf)) {
+        dht_iatt_merge(this, &local->stbuf, postbuf);
+        dht_iatt_merge(this, &local->prebuf, prebuf);
+
+        dht_inode_ctx_get_mig_info(this, inode, &src_subvol, &dst_subvol);
+
+        if (dht_mig_info_is_invalid(local->cached_subvol, src_subvol,
+                                    dst_subvol) ||
+            !dht_fd_open_on_dst(this, local->fd, dst_subvol)) {
+            ret = dht_rebalance_in_progress_check(this, frame);
+            if (!ret)
+                return 0;
+        } else {
+            dht_fsync2(this, dst_subvol, frame, 0);
+            return 0;
+        }
+    }
+
+out:
+    DHT_STRIP_PHASE1_FLAGS(postbuf);
+    DHT_STRIP_PHASE1_FLAGS(prebuf);
+
+    DHT_STACK_UNWIND(fsync, frame, op_ret, op_errno, prebuf, postbuf, xdata);
+
+    return 0;
+}
+
+static int
+dht_fsync2(xlator_t *this, xlator_t *subvol, call_frame_t *frame, int ret)
+{
+    dht_local_t *state = NULL;
+    int32_t saved_errno = EINVAL;
+
+    if (!frame || !frame->local)
+        goto fail;
+
+    state = frame->local;
+    saved_errno = state->op_errno;
+
+    /* This DHT layer is not the one migrating the file: hand back the
+     * first reply (its op_ret and the saved pre/post iatts, mode bits
+     * untouched) so that the higher DHT layer can act on it.
+     */
+    if (we_are_not_migrating(ret)) {
+        DHT_STACK_UNWIND(fsync, frame, state->op_ret, saved_errno,
+                         &state->rebalance.prebuf, &state->rebalance.postbuf,
+                         state->rebalance.xdata);
+        return 0;
+    }
+
+    if (!subvol)
+        goto fail;
+
+    /* Second attempt: replay the saved datasync flag on subvol. */
+    state->call_cnt = 2;
+    STACK_WIND_COOKIE(frame, dht_fsync_cbk, subvol, subvol, subvol->fops->fsync,
+                      state->fd, state->rebalance.flags, state->xattr_req);
+
+    return 0;
+
+fail:
+    DHT_STACK_UNWIND(fsync, frame, -1, saved_errno, NULL, NULL, NULL);
+    return 0;
+}
+
+int
+dht_fsync(call_frame_t *frame, xlator_t *this, fd_t *fd, int datasync,
+          dict_t *xdata)
+{
+    xlator_t *subvol = NULL;
+    int op_errno = -1; /* -1: the err path falls back to errno */
+    dht_local_t *local = NULL;
+
+    VALIDATE_OR_GOTO(frame, err);
+    VALIDATE_OR_GOTO(this, err);
+    VALIDATE_OR_GOTO(fd, err);
+
+    local = dht_local_init(frame, NULL, fd, GF_FOP_FSYNC);
+    if (!local) {
+        op_errno = ENOMEM;
+        goto err;
+    }
+    subvol = local->cached_subvol;
+    if (!subvol) { /* same guard as dht_flush/dht_readv: never wind to NULL */
+        gf_msg_debug(this->name, 0, "no cached subvolume for fd=%p", fd);
+        op_errno = EINVAL;
+        goto err;
+    }
+    if (xdata)
+        local->xattr_req = dict_ref(xdata);
+    local->call_cnt = 1;               /* first attempt, see dht_fsync_cbk */
+    local->rebalance.flags = datasync; /* replayed by dht_fsync2 */
+    STACK_WIND_COOKIE(frame, dht_fsync_cbk, subvol, subvol, subvol->fops->fsync,
+                      local->fd, local->rebalance.flags, local->xattr_req);
+    return 0;
+
+err:
+    op_errno = (op_errno == -1) ? errno : op_errno;
+    DHT_STACK_UNWIND(fsync, frame, -1, op_errno, NULL, NULL, NULL);
+    return 0;
+}
+
+/* TODO: for 'lk()' call, we need some other special error, maybe ESTALE, to
+   indicate that lock migration happened on the fd, so we can consider it as
+   phase 2 of migration */
+static int
+dht_lk_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int op_ret,
+           int op_errno, struct gf_flock *flock, dict_t *xdata)
+{
+    dht_local_t *local = NULL;
+    int ret = -1;
+    xlator_t *subvol = NULL;
+    /* Callback for lk; dht_lk and dht_lk2 both wind to it. */
+    local = frame->local;
+
+    if (!local) {
+        op_ret = -1;
+        op_errno = EINVAL;
+        goto out;
+    }
+
+    if (local->call_cnt != 1)
+        goto out;
+
+    local->rebalance.target_op_fn = dht_lk2;
+
+    local->op_ret = op_ret;
+    local->op_errno = op_errno;
+
+    if (xdata)
+        local->rebalance.xdata = dict_ref(xdata);
+    /* EREMOTE: retry on the destination, directly or via the rebalance check */
+    if (op_errno == EREMOTE) {
+        dht_inode_ctx_get_mig_info(this, local->fd->inode, NULL, &subvol);
+        if (subvol && dht_fd_open_on_dst(this, local->fd, subvol)) {
+            dht_lk2(this, subvol, frame, 0);
+            return 0;
+        } else {
+            ret = dht_rebalance_complete_check(this, frame);
+            if (!ret) {
+                return 0;
+            }
+        }
+    }
+
+out:
+    dht_lk_inode_unref(frame, op_ret);
+    DHT_STACK_UNWIND(lk, frame, op_ret, op_errno, flock, xdata);
+
+    return 0;
+}
+
+static int
+dht_lk2(xlator_t *this, xlator_t *subvol, call_frame_t *frame, int ret)
+{
+    dht_local_t *state = NULL;
+    int32_t saved_errno = EINVAL;
+
+    if (!frame || !frame->local)
+        goto fail;
+
+    state = frame->local;
+    saved_errno = state->op_errno;
+
+    if (!subvol)
+        goto fail;
+
+    /* Second attempt: replay the saved lock command and flock on subvol.
+     * ret is not consulted here. */
+    state->call_cnt = 2;
+
+    STACK_WIND(frame, dht_lk_cbk, subvol, subvol->fops->lk, state->fd,
+               state->rebalance.lock_cmd, &state->rebalance.flock,
+               state->xattr_req);
+    return 0;
+
+fail:
+    DHT_STACK_UNWIND(lk, frame, -1, saved_errno, NULL, NULL);
+    return 0;
+}
+
+int
+dht_lk(call_frame_t *frame, xlator_t *this, fd_t *fd, int cmd,
+       struct gf_flock *flock, dict_t *xdata)
+{
+    xlator_t *lock_subvol = NULL;
+    int op_errno = -1; /* -1: the err path falls back to errno */
+    dht_local_t *local = NULL;
+
+    VALIDATE_OR_GOTO(frame, err);
+    VALIDATE_OR_GOTO(this, err);
+    VALIDATE_OR_GOTO(fd, err);
+    VALIDATE_OR_GOTO(flock, err); /* dereferenced below */
+
+    local = dht_local_init(frame, NULL, fd, GF_FOP_LK);
+    if (!local) {
+        op_errno = ENOMEM;
+        goto err;
+    }
+
+    local->lock_type = flock->l_type;
+    lock_subvol = dht_get_lock_subvolume(this, flock, local);
+    if (!lock_subvol) {
+        gf_msg_debug(this->name, 0, "no lock subvolume for fd=%p", fd);
+        op_errno = EINVAL;
+        goto err;
+    }
+
+    /*
+            local->cached_subvol = lock_subvol;
+            ret = dht_check_and_open_fd_on_subvol (this, frame);
+            if (ret)
+                    goto err;
+    */
+    if (xdata)
+        local->xattr_req = dict_ref(xdata);
+    /* Kept in local so that dht_lk2 can replay the same lock request. */
+    local->rebalance.flock = *flock;
+    local->rebalance.lock_cmd = cmd;
+    local->call_cnt = 1;
+
+    STACK_WIND(frame, dht_lk_cbk, lock_subvol, lock_subvol->fops->lk, fd, cmd,
+               flock, xdata);
+
+    return 0;
+
+err:
+    op_errno = (op_errno == -1) ? errno : op_errno;
+    DHT_STACK_UNWIND(lk, frame, -1, op_errno, NULL, NULL);
+
+    return 0;
+}
+
+static int
+dht_lease_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int op_ret,
+              int op_errno, struct gf_lease *lease, dict_t *xdata)
+{
+    /* Pure pass-through: the subvolume's reply is unwound unchanged. */
+    DHT_STACK_UNWIND(lease, frame, op_ret, op_errno, lease, xdata);
+    return 0;
+}
+
+int
+dht_lease(call_frame_t *frame, xlator_t *this, loc_t *loc,
+          struct gf_lease *lease, dict_t *xdata)
+{
+    xlator_t *target = NULL;
+    int unwind_errno = -1; /* -1: take errno on the failure path */
+
+    VALIDATE_OR_GOTO(frame, fail);
+    VALIDATE_OR_GOTO(this, fail);
+    VALIDATE_OR_GOTO(loc, fail);
+
+    target = dht_subvol_get_cached(this, loc->inode);
+    if (target == NULL) {
+        gf_msg_debug(this->name, 0, "no cached subvolume for path=%s",
+                     loc->path);
+        unwind_errno = EINVAL;
+        goto fail;
+    }
+
+    /* TODO: for rebalance, we need to preserve the fop arguments */
+    STACK_WIND(frame, dht_lease_cbk, target, target->fops->lease, loc, lease,
+               xdata);
+    return 0;
+
+fail:
+    if (unwind_errno == -1)
+        unwind_errno = errno;
+    DHT_STACK_UNWIND(lease, frame, -1, unwind_errno, NULL, NULL);
+
+    return 0;
+}
+
+/* Callback for readlink. Symlinks are currently not migrated, so none of
+ * the rebalance checks done by the other callbacks are needed here.
+ */
+static int
+dht_readlink_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int op_ret,
+                 int op_errno, const char *path, struct iatt *stbuf,
+                 dict_t *xdata)
+{
+    dht_local_t *state = frame->local;
+
+    /* A reply other than -1 on a frame without a local is reported as
+     * EINVAL; a -1 reply is passed through untouched.
+     */
+    if ((op_ret != -1) && (state == NULL)) {
+        op_ret = -1;
+        op_errno = EINVAL;
+    }
+
+    DHT_STRIP_PHASE1_FLAGS(stbuf);
+    DHT_STACK_UNWIND(readlink, frame, op_ret, op_errno, path, stbuf, xdata);
+
+    return 0;
+}
+
+int
+dht_readlink(call_frame_t *frame, xlator_t *this, loc_t *loc, size_t size,
+             dict_t *xdata)
+{
+    dht_local_t *state = NULL;
+    xlator_t *target = NULL;
+    int unwind_errno = -1; /* -1: take errno on the failure path */
+
+    VALIDATE_OR_GOTO(frame, fail);
+    VALIDATE_OR_GOTO(this, fail);
+    VALIDATE_OR_GOTO(loc, fail);
+    VALIDATE_OR_GOTO(loc->inode, fail);
+    VALIDATE_OR_GOTO(loc->path, fail);
+
+    state = dht_local_init(frame, loc, NULL, GF_FOP_READLINK);
+    if (state == NULL) {
+        unwind_errno = ENOMEM;
+        goto fail;
+    }
+
+    target = state->cached_subvol;
+    if (target == NULL) {
+        gf_msg_debug(this->name, 0, "no cached subvolume for path=%s",
+                     loc->path);
+        unwind_errno = EINVAL;
+        goto fail;
+    }
+
+    STACK_WIND(frame, dht_readlink_cbk, target, target->fops->readlink, loc,
+               size, xdata);
+    return 0;
+
+fail:
+    if (unwind_errno == -1)
+        unwind_errno = errno;
+    DHT_STACK_UNWIND(readlink, frame, -1, unwind_errno, NULL, NULL, NULL);
+
+    return 0;
+}
+
+/* Fill the caller's *stbuf (ia_prot and ia_type only: a dummy iatt) from
+ * DHT_MODE_IN_XDATA_KEY if present, else from DHT_IATT_IN_XDATA_KEY.
+ * Returns 0 on success, non-zero if neither key could be read. */
+static int
+dht_read_iatt_from_xdata(dict_t *xdata, struct iatt *stbuf)
+{
+    struct iatt *src = NULL;
+    int32_t mode = 0;
+    int ret = dict_get_int32(xdata, DHT_MODE_IN_XDATA_KEY, &mode);
+    if (ret) {
+        ret = dict_get_bin(xdata, DHT_IATT_IN_XDATA_KEY, (void **)&src);
+        if (!ret && src) {
+            /* copy out: fetching into &stbuf only rebinds our parameter */
+            stbuf->ia_prot = src->ia_prot;
+            stbuf->ia_type = src->ia_type;
+        }
+    } else {
+        stbuf->ia_prot = ia_prot_from_st_mode(mode);
+        stbuf->ia_type = ia_type_from_st_mode(mode);
+    }
+    return ret;
+}
+/* Common callback for the xattrop/fxattrop winds made for non-directories. */
+int
+dht_common_xattrop_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+                       int32_t op_ret, int32_t op_errno, dict_t *dict,
+                       dict_t *xdata)
+{
+    dht_local_t *local = NULL;
+    call_frame_t *call_frame = NULL;
+    xlator_t *prev = NULL;
+    xlator_t *src_subvol = NULL;
+    xlator_t *dst_subvol = NULL;
+    struct iatt stbuf = {
+        0,
+    };
+    int ret = -1;
+    inode_t *inode = NULL;
+    /* cookie is used as a call frame; its ->this is the replying xlator */
+    local = frame->local;
+    call_frame = cookie;
+    prev = call_frame->this;
+
+    local->op_errno = op_errno;
+
+    if ((op_ret == -1) && !dht_inode_missing(op_errno)) {
+        gf_msg_debug(this->name, op_errno, "subvolume %s returned -1.",
+                     prev->name);
+        goto out;
+    }
+    /* Only the reply to the first attempt (call_cnt == 1) is examined */
+    if (local->call_cnt != 1)
+        goto out;
+    /* On 0 the frame is left to the helper: nothing is unwound here */
+    if (dht_check_remote_fd_failed_error(local, op_ret, op_errno)) {
+        ret = dht_check_and_open_fd_on_subvol(this, frame);
+        if (ret)
+            goto out;
+        return 0;
+    }
+    /* stbuf is only filled for the IS_DHT_MIGRATION_PHASE1/2 tests below */
+    ret = dht_read_iatt_from_xdata(xdata, &stbuf);
+
+    if ((!op_ret) && (ret)) {
+        /* This is a potential problem and can cause corruption
+         * with sharding.
+         * Oh well. We tried.
+         */
+        goto out;
+    }
+    /* Save the reply; dht_common_xattrop2 unwinds it if we are not migrating */
+    local->op_ret = op_ret;
+    local->rebalance.target_op_fn = dht_common_xattrop2;
+    if (xdata)
+        local->rebalance.xdata = dict_ref(xdata);
+
+    if (dict)
+        local->rebalance.dict = dict_ref(dict);
+
+    /* Phase 2 of migration */
+    if ((op_ret == -1) || IS_DHT_MIGRATION_PHASE2(&stbuf)) {
+        ret = dht_rebalance_complete_check(this, frame);
+        if (!ret)
+            return 0;
+    }
+
+    /* Check if the rebalance phase1 is true */
+    if (IS_DHT_MIGRATION_PHASE1(&stbuf)) {
+        inode = local->loc.inode ? local->loc.inode : local->fd->inode;
+        dht_inode_ctx_get_mig_info(this, inode, &src_subvol, &dst_subvol);
+        /* NOTE(review): local->fd is passed below even for path-based xattrop */
+        if (dht_mig_info_is_invalid(local->cached_subvol, src_subvol,
+                                    dst_subvol) ||
+            !dht_fd_open_on_dst(this, local->fd, dst_subvol)) {
+            ret = dht_rebalance_in_progress_check(this, frame);
+            if (!ret)
+                return 0;
+        } else {
+            dht_common_xattrop2(this, dst_subvol, frame, 0);
+            return 0;
+        }
+    }
+
+out:
+    if (local->fop == GF_FOP_XATTROP) {
+        DHT_STACK_UNWIND(xattrop, frame, op_ret, op_errno, dict, xdata);
+    } else {
+        DHT_STACK_UNWIND(fxattrop, frame, op_ret, op_errno, dict, xdata);
+    }
+
+    return 0;
+}
+/* Retry step for xattrop/fxattrop (installed as rebalance.target_op_fn). */
+static int
+dht_common_xattrop2(xlator_t *this, xlator_t *subvol, call_frame_t *frame,
+                    int ret)
+{
+    dht_local_t *local = NULL;
+    int32_t op_errno = EINVAL; /* used if frame or frame->local is missing */
+
+    if ((frame == NULL) || (frame->local == NULL))
+        goto out;
+
+    local = frame->local;
+    op_errno = local->op_errno;
+
+    if (we_are_not_migrating(ret)) {
+        /* This dht xlator is not migrating the file. Unwind and
+         * pass on the original mode bits so the higher DHT layer
+         * can handle this.
+         */
+        if (local->fop == GF_FOP_XATTROP) {
+            DHT_STACK_UNWIND(xattrop, frame, local->op_ret, op_errno,
+                             local->rebalance.dict, local->rebalance.xdata);
+        } else {
+            DHT_STACK_UNWIND(fxattrop, frame, local->op_ret, op_errno,
+                             local->rebalance.dict, local->rebalance.xdata);
+        }
+
+        return 0;
+    }
+    /* No destination to wind to: fail with the errno saved in local */
+    if (subvol == NULL)
+        goto out;
+
+    local->call_cnt = 2; /* This is the second attempt */
+    /* Re-wind with the arguments saved in local by dht_xattrop/dht_fxattrop */
+    if (local->fop == GF_FOP_XATTROP) {
+        STACK_WIND(frame, dht_common_xattrop_cbk, subvol, subvol->fops->xattrop,
+                   &local->loc, local->rebalance.flags, local->rebalance.xattr,
+                   local->xattr_req);
+    } else {
+        STACK_WIND(frame, dht_common_xattrop_cbk, subvol,
+                   subvol->fops->fxattrop, local->fd, local->rebalance.flags,
+                   local->rebalance.xattr, local->xattr_req);
+    }
+
+    return 0;
+
+out:
+
+    /* If local is unavailable we could be unwinding the wrong
+     * function here */
+
+    if (local && (local->fop == GF_FOP_XATTROP)) {
+        DHT_STACK_UNWIND(xattrop, frame, -1, op_errno, NULL, NULL);
+    } else {
+        DHT_STACK_UNWIND(fxattrop, frame, -1, op_errno, NULL, NULL);
+    }
+    return 0;
+}
+/* xattrop callback used by dht_xattrop for directories: unwinds unchanged. */
+static int
+dht_xattrop_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+                int32_t op_ret, int32_t op_errno, dict_t *dict, dict_t *xdata)
+{
+    DHT_STACK_UNWIND(xattrop, frame, op_ret, op_errno, dict, xdata);
+    return 0;
+}
+
+/* Request the file's mode/iatt in the reply xdata by setting both
+ * DHT_MODE_IN_XDATA_KEY and DHT_IATT_IN_XDATA_KEY in xattr_req.
+ * Returns 0 if at least one key was set, else the second call's error.
+ */
+static int
+dht_request_iatt_in_xdata(dict_t *xattr_req)
+{
+    int mode_ret = dict_set_int8(xattr_req, DHT_MODE_IN_XDATA_KEY, 1);
+    int iatt_ret = dict_set_int8(xattr_req, DHT_IATT_IN_XDATA_KEY, 1);
+
+    /* dht_read_iatt_from_xdata() can work from either key alone, so the
+     * request only counts as failed when neither key could be set.
+     */
+    return (mode_ret && iatt_ret) ? iatt_ret : 0;
+}
+/* xattrop fop: winds to the cached subvolume recorded in the frame local. */
+int
+dht_xattrop(call_frame_t *frame, xlator_t *this, loc_t *loc,
+            gf_xattrop_flags_t flags, dict_t *dict, dict_t *xdata)
+{
+    xlator_t *subvol = NULL;
+    int op_errno = -1; /* -1: not set yet, errno is used at err: instead */
+    dht_local_t *local = NULL;
+    int ret = -1;
+
+    VALIDATE_OR_GOTO(frame, err);
+    VALIDATE_OR_GOTO(this, err);
+    VALIDATE_OR_GOTO(loc, err);
+    VALIDATE_OR_GOTO(loc->inode, err);
+
+    local = dht_local_init(frame, loc, NULL, GF_FOP_XATTROP);
+    if (!local) {
+        op_errno = ENOMEM;
+        goto err;
+    }
+
+    subvol = local->cached_subvol;
+    if (!subvol) {
+        gf_msg_debug(this->name, 0, "no cached subvolume for gfid=%s",
+                     uuid_utoa(loc->inode->gfid));
+        op_errno = EINVAL;
+        goto err;
+    }
+
+    /* Todo : Handle dirs as well. At the moment the only xlator above dht
+     * that uses xattrop is sharding and that is only for files */
+    /* Directories: plain pass-through, no migration handling in the cbk */
+    if (IA_ISDIR(loc->inode->ia_type)) {
+        STACK_WIND(frame, dht_xattrop_cbk, subvol, subvol->fops->xattrop, loc,
+                   flags, dict, xdata);
+
+    } else {
+        local->xattr_req = xdata ? dict_ref(xdata) : dict_new();
+        local->call_cnt = 1;
+        /* NOTE(review): the dict_new() result above is not checked */
+        local->rebalance.xattr = dict_ref(dict);
+        local->rebalance.flags = flags;
+        /* Ask for mode/iatt in the reply; a failure here is only logged */
+        ret = dht_request_iatt_in_xdata(local->xattr_req);
+
+        if (ret) {
+            gf_msg_debug(this->name, 0,
+                         "Failed to set dictionary key %s file=%s",
+                         DHT_IATT_IN_XDATA_KEY, loc->path);
+        }
+        /* Wind from the copies in local; dht_common_xattrop2 reuses them */
+        STACK_WIND(frame, dht_common_xattrop_cbk, subvol, subvol->fops->xattrop,
+                   loc, local->rebalance.flags, local->rebalance.xattr,
+                   local->xattr_req);
+    }
+
+    return 0;
+
+err:
+    op_errno = (op_errno == -1) ? errno : op_errno;
+    DHT_STACK_UNWIND(xattrop, frame, -1, op_errno, NULL, NULL);
+
+    return 0;
+}
+/* fxattrop callback used by dht_fxattrop for directories: unwinds unchanged. */
+static int
+dht_fxattrop_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+                 int32_t op_ret, int32_t op_errno, dict_t *dict, dict_t *xdata)
+{
+    DHT_STACK_UNWIND(fxattrop, frame, op_ret, op_errno, dict, xdata);
+    return 0;
+}
+/* fxattrop fop: winds to the subvolume cached for fd->inode. */
+int
+dht_fxattrop(call_frame_t *frame, xlator_t *this, fd_t *fd,
+             gf_xattrop_flags_t flags, dict_t *dict, dict_t *xdata)
+{
+    xlator_t *subvol = NULL;
+    int op_errno = -1; /* -1: not set yet, errno is used at err: instead */
+    dht_local_t *local = NULL;
+    int ret = -1;
+
+    VALIDATE_OR_GOTO(frame, err);
+    VALIDATE_OR_GOTO(this, err);
+    VALIDATE_OR_GOTO(fd, err);
+    /* Unlike dht_xattrop, the subvolume is looked up before local exists */
+    subvol = dht_subvol_get_cached(this, fd->inode);
+    if (!subvol) {
+        gf_msg_debug(this->name, 0, "no cached subvolume for fd=%p", fd);
+        op_errno = EINVAL;
+        goto err;
+    }
+
+    local = dht_local_init(frame, NULL, fd, GF_FOP_FXATTROP);
+    if (!local) {
+        op_errno = ENOMEM;
+        goto err;
+    }
+
+    /* Todo : Handle dirs as well. At the moment the only xlator above dht
+     * that uses xattrop is sharding and that is only for files */
+    /* Directories: plain pass-through, no migration handling in the cbk */
+    if (IA_ISDIR(fd->inode->ia_type)) {
+        STACK_WIND(frame, dht_fxattrop_cbk, subvol, subvol->fops->fxattrop, fd,
+                   flags, dict, xdata);
+
+    } else {
+        local->xattr_req = xdata ? dict_ref(xdata) : dict_new();
+        local->call_cnt = 1;
+        /* NOTE(review): the dict_new() result above is not checked */
+        local->rebalance.xattr = dict_ref(dict);
+        local->rebalance.flags = flags;
+        /* Ask for mode/iatt in the reply; a failure here is only logged */
+        ret = dht_request_iatt_in_xdata(local->xattr_req);
+
+        if (ret) {
+            gf_msg_debug(this->name, 0, "Failed to set dictionary key %s fd=%p",
+                         DHT_IATT_IN_XDATA_KEY, fd);
+        }
+        /* Wind from the copies in local; dht_common_xattrop2 reuses them */
+        STACK_WIND(frame, dht_common_xattrop_cbk, subvol,
+                   subvol->fops->fxattrop, fd, local->rebalance.flags,
+                   local->rebalance.xattr, local->xattr_req);
+    }
+
+    return 0;
+
+err:
+    op_errno = (op_errno == -1) ? errno : op_errno;
+    DHT_STACK_UNWIND(fxattrop, frame, -1, op_errno, NULL, NULL);
+
+    return 0;
+}
+
+/* Currently no translators on top of 'distribute' use the fops below,
+ * hence the 'migration' related checks are not implemented for them.
+ */
+/* inodelk callback: calls dht_lk_inode_unref(), then unwinds the reply. */
+static int
+dht_inodelk_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+                int32_t op_ret, int32_t op_errno, dict_t *xdata)
+
+{
+    dht_lk_inode_unref(frame, op_ret);
+    DHT_STACK_UNWIND(inodelk, frame, op_ret, op_errno, xdata);
+    return 0;
+}
+/* inodelk fop: winds to the subvolume chosen by dht_get_lock_subvolume(). */
+int32_t
+dht_inodelk(call_frame_t *frame, xlator_t *this, const char *volume, loc_t *loc,
+            int32_t cmd, struct gf_flock *lock, dict_t *xdata)
+{
+    xlator_t *lock_subvol = NULL;
+    int op_errno = -1; /* -1: not set yet, errno is used at err: instead */
+    dht_local_t *local = NULL;
+
+    VALIDATE_OR_GOTO(frame, err);
+    VALIDATE_OR_GOTO(this, err);
+    VALIDATE_OR_GOTO(loc, err);
+    VALIDATE_OR_GOTO(loc->inode, err);
+
+    local = dht_local_init(frame, loc, NULL, GF_FOP_INODELK);
+    if (!local) {
+        op_errno = ENOMEM;
+        goto err;
+    }
+    /* NOTE(review): lock is dereferenced without a NULL check - confirm */
+    local->lock_type = lock->l_type;
+    lock_subvol = dht_get_lock_subvolume(this, lock, local);
+    if (!lock_subvol) {
+        gf_msg_debug(this->name, 0, "no lock subvolume for path=%s", loc->path);
+        op_errno = EINVAL;
+        goto err;
+    }
+
+    local->call_cnt = 1;
+
+    STACK_WIND(frame, dht_inodelk_cbk, lock_subvol, lock_subvol->fops->inodelk,
+               volume, loc, cmd, lock, xdata);
+
+    return 0;
+
+err:
+    op_errno = (op_errno == -1) ? errno : op_errno;
+    DHT_STACK_UNWIND(inodelk, frame, -1, op_errno, NULL);
+
+    return 0;
+}
+/* finodelk callback: unwinds the reply unless the fd helper takes the frame. */
+int
+dht_finodelk_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+                 int32_t op_ret, int32_t op_errno, dict_t *xdata)
+
+{
+    dht_local_t *local = NULL;
+    int ret = 0;
+    /* NOTE(review): out: still uses frame when the frame check fails */
+    GF_VALIDATE_OR_GOTO("dht", frame, out);
+    GF_VALIDATE_OR_GOTO("dht", this, out);
+    GF_VALIDATE_OR_GOTO("dht", frame->local, out);
+
+    local = frame->local;
+    /* On 0 the frame is left to the helper: nothing is unwound here */
+    if (dht_check_remote_fd_failed_error(local, op_ret, op_errno)) {
+        ret = dht_check_and_open_fd_on_subvol(this, frame);
+        if (ret)
+            goto out;
+        return 0;
+    }
+
+out:
+    dht_lk_inode_unref(frame, op_ret);
+    DHT_STACK_UNWIND(finodelk, frame, op_ret, op_errno, xdata);
+
+    return 0;
+}
+/* finodelk fop: winds to the subvolume chosen by dht_get_lock_subvolume(). */
+int
+dht_finodelk(call_frame_t *frame, xlator_t *this, const char *volume, fd_t *fd,
+             int32_t cmd, struct gf_flock *lock, dict_t *xdata)
+{
+    xlator_t *lock_subvol = NULL;
+    dht_local_t *local = NULL;
+    int op_errno = -1; /* -1: not set yet, errno is used at err: instead */
+
+    VALIDATE_OR_GOTO(frame, err);
+    VALIDATE_OR_GOTO(this, err);
+    VALIDATE_OR_GOTO(fd, err);
+    /* NOTE(review): GF_FOP_INODELK (not FINODELK) is passed - confirm intent */
+    local = dht_local_init(frame, NULL, fd, GF_FOP_INODELK);
+    if (!local) {
+        op_errno = ENOMEM;
+        goto err;
+    }
+    /* NOTE(review): lock is dereferenced without a NULL check - confirm */
+    local->call_cnt = 1;
+    local->lock_type = lock->l_type;
+
+    lock_subvol = dht_get_lock_subvolume(this, lock, local);
+    if (!lock_subvol) {
+        gf_msg_debug(this->name, 0, "no lock subvolume for fd=%p", fd);
+        op_errno = EINVAL;
+        goto err;
+    }
+
+    /*
+            local->cached_subvol = lock_subvol;
+            ret = dht_check_and_open_fd_on_subvol (this, frame);
+            if (ret)
+                    goto err;
+    */
+    local->rebalance.flock = *lock;
+    local->rebalance.lock_cmd = cmd;
+    local->key = gf_strdup(volume); /* NOTE(review): result not checked */
+    /* The copies above are not read here; presumably for a later re-wind */
+    if (xdata)
+        local->xattr_req = dict_ref(xdata);
+
+    STACK_WIND(frame, dht_finodelk_cbk, lock_subvol,
+               lock_subvol->fops->finodelk, volume, fd, cmd, lock, xdata);
+
+    return 0;
+
+err:
+    op_errno = (op_errno == -1) ? errno : op_errno;
+    DHT_STACK_UNWIND(finodelk, frame, -1, op_errno, NULL);
+
+    return 0;
+}
diff --git a/xlators/cluster/dht/src/dht-inode-write.c b/xlators/cluster/dht/src/dht-inode-write.c
new file mode 100644
index 00000000000..2f23ce90fbd
--- /dev/null
+++ b/xlators/cluster/dht/src/dht-inode-write.c
@@ -0,0 +1,1404 @@
+/*
+  Copyright (c) 2008-2012 Red Hat, Inc. <http://www.redhat.com>
+  This file is part of GlusterFS.
+
+  This file is licensed to you under your choice of the GNU Lesser
+  General Public License, version 3 or any later version (LGPLv3 or
+  later), or the GNU General Public License, version 2 (GPLv2), in all
+  cases as published by the Free Software Foundation.
+*/
+
+#include "dht-common.h"
+
+static int
+dht_writev2(xlator_t *this, xlator_t *subvol, call_frame_t *frame, int ret);
+static int
+dht_truncate2(xlator_t *this, xlator_t *subvol, call_frame_t *frame, int ret);
+static int
+dht_setattr2(xlator_t *this, xlator_t *subvol, call_frame_t *frame, int ret);
+static int
+dht_fallocate2(xlator_t *this, xlator_t *subvol, call_frame_t *frame, int ret);
+static int
+dht_discard2(xlator_t *this, xlator_t *subvol, call_frame_t *frame, int ret);
+static int
+dht_zerofill2(xlator_t *this, xlator_t *subvol, call_frame_t *frame, int ret);
+/* writev callback: unwinds the reply or re-drives the write during migration */
+int
+dht_writev_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int op_ret,
+               int op_errno, struct iatt *prebuf, struct iatt *postbuf,
+               dict_t *xdata)
+{
+    dht_local_t *local = NULL;
+    xlator_t *prev = NULL;
+    int ret = -1;
+    xlator_t *subvol1 = NULL;
+    xlator_t *subvol2 = NULL;
+
+    local = frame->local;
+    prev = cookie; /* the winds pass the target subvolume as cookie */
+
+    if (!local) {
+        op_ret = -1;
+        op_errno = EINVAL;
+        goto out;
+    }
+
+    /* writev fails with EBADF if dht has not yet opened the fd
+     * on the cached subvol. This could happen if the file was migrated
+     * and a lookup updated the cached subvol in the inode ctx.
+     * We only check once as this could be a valid bad fd error.
+     */
+
+    if (dht_check_remote_fd_failed_error(local, op_ret, op_errno)) {
+        ret = dht_check_and_open_fd_on_subvol(this, frame);
+        if (ret)
+            goto out;
+        return 0;
+    }
+
+    if (op_ret == -1 && !dht_inode_missing(op_errno)) {
+        local->op_errno = op_errno;
+        local->op_ret = -1;
+        gf_msg_debug(this->name, 0, "subvolume %s returned -1 (%s)", prev->name,
+                     strerror(op_errno));
+        goto out;
+    }
+    /* Reply to the second attempt (dht_writev2 sets call_cnt to 2) */
+    if (local->call_cnt != 1) {
+        /* preserve the modes of source */
+        if (local->stbuf.ia_blocks) {
+            dht_iatt_merge(this, postbuf, &local->stbuf);
+            dht_iatt_merge(this, prebuf, &local->prebuf);
+        }
+        goto out;
+    }
+
+    local->rebalance.target_op_fn = dht_writev2;
+
+    local->op_ret = op_ret;
+    local->op_errno = op_errno;
+
+    /* We might need to pass the stbuf information to the higher DHT
+     * layer for appropriate handling.
+     */
+
+    dht_set_local_rebalance(this, local, NULL, prebuf, postbuf, xdata);
+
+    /* Phase 2 of migration */
+    if ((op_ret == -1) || IS_DHT_MIGRATION_PHASE2(postbuf)) {
+        ret = dht_rebalance_complete_check(this, frame);
+        if (!ret)
+            return 0;
+    }
+
+    /* Check if the rebalance phase1 is true */
+    if (IS_DHT_MIGRATION_PHASE1(postbuf)) {
+        if (!local->xattr_req) {
+            local->xattr_req = dict_new();
+            if (!local->xattr_req) {
+                gf_msg(this->name, GF_LOG_ERROR, DHT_MSG_NO_MEMORY, ENOMEM,
+                       "insufficient memory");
+                local->op_errno = ENOMEM;
+                local->op_ret = -1;
+                goto out;
+            }
+        }
+        /* NOTE(review): these error paths set local->op_*, out: unwinds params */
+        ret = dict_set_uint32(local->xattr_req, GF_PROTECT_FROM_EXTERNAL_WRITES,
+                              1);
+        if (ret) {
+            gf_msg(this->name, GF_LOG_ERROR, DHT_MSG_DICT_SET_FAILED, 0,
+                   "Failed to set key %s in dictionary",
+                   GF_PROTECT_FROM_EXTERNAL_WRITES);
+            local->op_errno = ENOMEM;
+            local->op_ret = -1;
+            goto out;
+        }
+
+        dht_iatt_merge(this, &local->stbuf, postbuf);
+        dht_iatt_merge(this, &local->prebuf, prebuf);
+        /* Valid migration info and fd open on subvol2: write there directly */
+        ret = dht_inode_ctx_get_mig_info(this, local->fd->inode, &subvol1,
+                                         &subvol2);
+        if (!dht_mig_info_is_invalid(local->cached_subvol, subvol1, subvol2)) {
+            if (dht_fd_open_on_dst(this, local->fd, subvol2)) {
+                dht_writev2(this, subvol2, frame, 0);
+                return 0;
+            }
+        }
+        ret = dht_rebalance_in_progress_check(this, frame);
+        if (!ret)
+            return 0;
+    }
+
+out:
+    DHT_STRIP_PHASE1_FLAGS(postbuf);
+    DHT_STRIP_PHASE1_FLAGS(prebuf);
+
+    DHT_STACK_UNWIND(writev, frame, op_ret, op_errno, prebuf, postbuf, xdata);
+
+    return 0;
+}
+/* writev retry step. dht_writev_cbk installs it as rebalance.target_op_fn
+ * and also calls it directly; it re-winds the saved write to subvol, or
+ * unwinds. */
+static int
+dht_writev2(xlator_t *this, xlator_t *subvol, call_frame_t *frame, int ret)
+{
+    int32_t op_errno = EINVAL; /* reported when frame or local is missing */
+    dht_local_t *local = (frame != NULL) ? frame->local : NULL;
+
+    if (local != NULL) {
+        op_errno = local->op_errno;
+
+        if (we_are_not_migrating(ret)) {
+            /* The file is not being migrated by this dht xlator.
+             * Unwind the saved reply, original mode bits included,
+             * so that the higher DHT layer can handle it.
+             */
+            DHT_STACK_UNWIND(writev, frame, local->op_ret, local->op_errno,
+                             &local->rebalance.prebuf,
+                             &local->rebalance.postbuf,
+                             local->rebalance.xdata);
+            return 0;
+        }
+
+        if (subvol != NULL) {
+            /* Second attempt: dht_writev_cbk only retries while
+             * call_cnt is 1. */
+            local->call_cnt = 2;
+
+            STACK_WIND_COOKIE(frame, dht_writev_cbk, subvol, subvol,
+                              subvol->fops->writev, local->fd,
+                              local->rebalance.vector, local->rebalance.count,
+                              local->rebalance.offset, local->rebalance.flags,
+                              local->rebalance.iobref, local->xattr_req);
+            return 0;
+        }
+    }
+
+    /* No usable frame/local, or no subvolume to wind to: fail the write */
+    DHT_STACK_UNWIND(writev, frame, -1, op_errno, NULL, NULL, NULL);
+    return 0;
+}
+/* writev fop: winds to the cached subvolume recorded in the frame local. */
+int
+dht_writev(call_frame_t *frame, xlator_t *this, fd_t *fd, struct iovec *vector,
+           int count, off_t off, uint32_t flags, struct iobref *iobref,
+           dict_t *xdata)
+{
+    xlator_t *subvol = NULL;
+    int op_errno = -1; /* -1: not set yet, errno is used at err: instead */
+    dht_local_t *local = NULL;
+
+    VALIDATE_OR_GOTO(frame, err);
+    VALIDATE_OR_GOTO(this, err);
+    VALIDATE_OR_GOTO(fd, err);
+
+    local = dht_local_init(frame, NULL, fd, GF_FOP_WRITE);
+    if (!local) {
+        op_errno = ENOMEM;
+        goto err;
+    }
+
+    subvol = local->cached_subvol;
+    if (!subvol) {
+        gf_msg_debug(this->name, 0, "no cached subvolume for fd=%p", fd);
+        op_errno = EINVAL;
+        goto err;
+    }
+
+    if (xdata)
+        local->xattr_req = dict_ref(xdata);
+    /* Keep the arguments in local; dht_writev2 re-winds from these copies */
+    local->rebalance.vector = iov_dup(vector, count); /* NOTE(review): unchecked */
+    local->rebalance.offset = off;
+    local->rebalance.count = count;
+    local->rebalance.flags = flags;
+    local->rebalance.iobref = iobref_ref(iobref);
+    local->call_cnt = 1;
+    /* The subvolume is passed as cookie; dht_writev_cbk reads it as prev */
+    STACK_WIND_COOKIE(frame, dht_writev_cbk, subvol, subvol,
+                      subvol->fops->writev, fd, local->rebalance.vector,
+                      local->rebalance.count, local->rebalance.offset,
+                      local->rebalance.flags, local->rebalance.iobref,
+                      local->xattr_req);
+
+    return 0;
+
+err:
+    op_errno = (op_errno == -1) ? errno : op_errno;
+    DHT_STACK_UNWIND(writev, frame, -1, op_errno, NULL, NULL, NULL);
+
+    return 0;
+}
+/* Callback shared by truncate and ftruncate; local->fop tells them apart. */
+int
+dht_truncate_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int op_ret,
+                 int op_errno, struct iatt *prebuf, struct iatt *postbuf,
+                 dict_t *xdata)
+{
+    dht_local_t *local = NULL;
+    xlator_t *prev = NULL;
+    int ret = -1;
+    xlator_t *src_subvol = NULL;
+    xlator_t *dst_subvol = NULL;
+    inode_t *inode = NULL;
+    /* A missing frame skips the unwind (err:); other failures unwind (out:) */
+    GF_VALIDATE_OR_GOTO("dht", frame, err);
+    GF_VALIDATE_OR_GOTO("dht", this, out);
+    GF_VALIDATE_OR_GOTO("dht", frame->local, out);
+    GF_VALIDATE_OR_GOTO("dht", cookie, out);
+
+    local = frame->local;
+    prev = cookie; /* the winds pass the target subvolume as cookie */
+
+    /* Needs to be checked only for ftruncate.
+     * ftruncate fails with EBADF/EINVAL if dht has not yet opened the fd
+     * on the cached subvol. This could happen if the file was migrated
+     * and a lookup updated the cached subvol in the inode ctx.
+     * We only check once as this could actually be a valid error.
+     */
+
+    if ((local->fop == GF_FOP_FTRUNCATE) &&
+        dht_check_remote_fd_failed_error(local, op_ret, op_errno)) {
+        ret = dht_check_and_open_fd_on_subvol(this, frame);
+        if (ret)
+            goto out;
+        return 0;
+    }
+
+    if ((op_ret == -1) && !dht_inode_missing(op_errno)) {
+        local->op_errno = op_errno;
+        local->op_ret = -1;
+        gf_msg_debug(this->name, op_errno, "subvolume %s returned -1",
+                     prev->name);
+
+        goto out;
+    }
+    /* Reply to the second attempt (dht_truncate2 sets call_cnt to 2) */
+    if (local->call_cnt != 1) {
+        if (local->stbuf.ia_blocks) {
+            dht_iatt_merge(this, postbuf, &local->stbuf);
+            dht_iatt_merge(this, prebuf, &local->prebuf);
+        }
+        goto out;
+    }
+
+    local->rebalance.target_op_fn = dht_truncate2;
+
+    local->op_ret = op_ret;
+    local->op_errno = op_errno;
+
+    /* We might need to pass the stbuf information to the higher DHT
+     * layer for appropriate handling.
+     */
+
+    dht_set_local_rebalance(this, local, NULL, prebuf, postbuf, xdata);
+
+    /* Phase 2 of migration */
+    if ((op_ret == -1) || IS_DHT_MIGRATION_PHASE2(postbuf)) {
+        ret = dht_rebalance_complete_check(this, frame);
+        if (!ret)
+            return 0;
+    }
+
+    /* Check if the rebalance phase1 is true */
+    if (IS_DHT_MIGRATION_PHASE1(postbuf)) {
+        dht_iatt_merge(this, &local->stbuf, postbuf);
+        dht_iatt_merge(this, &local->prebuf, prebuf);
+        /* fd-based requests take the inode from the fd, others from loc */
+        inode = (local->fd) ? local->fd->inode : local->loc.inode;
+
+        dht_inode_ctx_get_mig_info(this, inode, &src_subvol, &dst_subvol);
+        if (!dht_mig_info_is_invalid(local->cached_subvol, src_subvol,
+                                     dst_subvol)) {
+            if ((!local->fd) ||
+                ((local->fd) &&
+                 dht_fd_open_on_dst(this, local->fd, dst_subvol))) {
+                dht_truncate2(this, dst_subvol, frame, 0);
+                return 0;
+            }
+        }
+        ret = dht_rebalance_in_progress_check(this, frame);
+        if (!ret)
+            return 0;
+    }
+
+out:
+    DHT_STRIP_PHASE1_FLAGS(postbuf);
+    DHT_STRIP_PHASE1_FLAGS(prebuf);
+    /* NOTE(review): ftruncate replies are also unwound as "truncate" here */
+    DHT_STACK_UNWIND(truncate, frame, op_ret, op_errno, prebuf, postbuf, xdata);
+err:
+    return 0;
+}
+/* truncate/ftruncate retry step. dht_truncate_cbk installs it as
+ * rebalance.target_op_fn and also calls it directly. */
+static int
+dht_truncate2(xlator_t *this, xlator_t *subvol, call_frame_t *frame, int ret)
+{
+    int32_t op_errno = EINVAL; /* reported when frame or local is missing */
+    dht_local_t *local = (frame != NULL) ? frame->local : NULL;
+
+    if (local != NULL) {
+        op_errno = local->op_errno;
+
+        if (we_are_not_migrating(ret)) {
+            /* This dht xlator is not migrating the file */
+            DHT_STACK_UNWIND(truncate, frame, local->op_ret, local->op_errno,
+                             &local->rebalance.prebuf,
+                             &local->rebalance.postbuf,
+                             local->rebalance.xdata);
+            return 0;
+        }
+
+        if (subvol != NULL) {
+            local->call_cnt = 2; /* This is the second attempt */
+
+            if (local->fop != GF_FOP_TRUNCATE) {
+                /* fd-based request */
+                STACK_WIND_COOKIE(frame, dht_truncate_cbk, subvol, subvol,
+                                  subvol->fops->ftruncate, local->fd,
+                                  local->rebalance.offset, local->xattr_req);
+            } else {
+                STACK_WIND_COOKIE(frame, dht_truncate_cbk, subvol, subvol,
+                                  subvol->fops->truncate, &local->loc,
+                                  local->rebalance.offset, local->xattr_req);
+            }
+            return 0;
+        }
+    }
+
+    /* No usable frame/local, or no subvolume to wind to: fail the request */
+    DHT_STACK_UNWIND(truncate, frame, -1, op_errno, NULL, NULL, NULL);
+
+    return 0;
+}
+/* truncate fop: winds to the cached subvolume recorded in the frame local. */
+int
+dht_truncate(call_frame_t *frame, xlator_t *this, loc_t *loc, off_t offset,
+             dict_t *xdata)
+{
+    xlator_t *subvol = NULL;
+    int op_errno = -1; /* -1: not set yet, errno is used at err: instead */
+    dht_local_t *local = NULL;
+
+    VALIDATE_OR_GOTO(frame, err);
+    VALIDATE_OR_GOTO(this, err);
+    VALIDATE_OR_GOTO(loc, err);
+    VALIDATE_OR_GOTO(loc->inode, err);
+
+    local = dht_local_init(frame, loc, NULL, GF_FOP_TRUNCATE);
+    if (!local) {
+        op_errno = ENOMEM;
+        goto err;
+    }
+    /* offset is kept in local; dht_truncate2 re-winds with it */
+    local->rebalance.offset = offset;
+    local->call_cnt = 1;
+    subvol = local->cached_subvol;
+    if (!subvol) {
+        gf_msg_debug(this->name, 0, "no cached subvolume for gfid=%s",
+                     uuid_utoa(loc->inode->gfid));
+        op_errno = EINVAL;
+        goto err;
+    }
+
+    if (xdata)
+        local->xattr_req = dict_ref(xdata);
+    /* Winds the caller's offset/xdata; dht_ftruncate winds the local copies */
+    STACK_WIND_COOKIE(frame, dht_truncate_cbk, subvol, subvol,
+                      subvol->fops->truncate, loc, offset, xdata);
+
+    return 0;
+
+err:
+    op_errno = (op_errno == -1) ? errno : op_errno;
+    DHT_STACK_UNWIND(truncate, frame, -1, op_errno, NULL, NULL, NULL);
+
+    return 0;
+}
+/* ftruncate fop: winds to the cached subvolume; reply via dht_truncate_cbk. */
+int
+dht_ftruncate(call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset,
+              dict_t *xdata)
+{
+    xlator_t *subvol = NULL;
+    int op_errno = -1; /* -1: not set yet, errno is used at err: instead */
+    dht_local_t *local = NULL;
+
+    VALIDATE_OR_GOTO(frame, err);
+    VALIDATE_OR_GOTO(this, err);
+    VALIDATE_OR_GOTO(fd, err);
+
+    local = dht_local_init(frame, NULL, fd, GF_FOP_FTRUNCATE);
+    if (!local) {
+        op_errno = ENOMEM;
+        goto err;
+    }
+    /* offset is kept in local; dht_truncate2 re-winds with it */
+    local->rebalance.offset = offset;
+    local->call_cnt = 1;
+    subvol = local->cached_subvol;
+    if (!subvol) {
+        gf_msg_debug(this->name, 0, "no cached subvolume for fd=%p", fd);
+        op_errno = EINVAL;
+        goto err;
+    }
+
+    if (xdata)
+        local->xattr_req = dict_ref(xdata);
+    /* The subvolume is passed as cookie; dht_truncate_cbk reads it as prev */
+    STACK_WIND_COOKIE(frame, dht_truncate_cbk, subvol, subvol,
+                      subvol->fops->ftruncate, fd, local->rebalance.offset,
+                      local->xattr_req);
+    return 0;
+
+err:
+    op_errno = (op_errno == -1) ? errno : op_errno;
+    DHT_STACK_UNWIND(ftruncate, frame, -1, op_errno, NULL, NULL, NULL);
+
+    return 0;
+}
+/* fallocate callback: unwinds the reply or re-drives it during migration. */
+int
+dht_fallocate_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int op_ret,
+                  int op_errno, struct iatt *prebuf, struct iatt *postbuf,
+                  dict_t *xdata)
+{
+    dht_local_t *local = NULL;
+    xlator_t *prev = NULL;
+    int ret = -1;
+    xlator_t *src_subvol = NULL;
+    xlator_t *dst_subvol = NULL;
+    /* A missing frame skips the unwind (err:); other failures unwind (out:) */
+    GF_VALIDATE_OR_GOTO("dht", frame, err);
+    GF_VALIDATE_OR_GOTO("dht", this, out);
+    GF_VALIDATE_OR_GOTO("dht", frame->local, out);
+    GF_VALIDATE_OR_GOTO("dht", cookie, out);
+
+    local = frame->local;
+    prev = cookie; /* the winds pass the target subvolume as cookie */
+
+    /* fallocate fails with EBADF if dht has not yet opened the fd
+     * on the cached subvol. This could happen if the file was migrated
+     * and a lookup updated the cached subvol in the inode ctx.
+     * We only check once as this could actually be a valid error.
+     */
+
+    if (dht_check_remote_fd_failed_error(local, op_ret, op_errno)) {
+        ret = dht_check_and_open_fd_on_subvol(this, frame);
+        if (ret)
+            goto out;
+        return 0;
+    }
+
+    if ((op_ret == -1) && !dht_inode_missing(op_errno)) {
+        local->op_errno = op_errno;
+        local->op_ret = -1;
+        gf_msg_debug(this->name, op_errno, "subvolume %s returned -1",
+                     prev->name);
+
+        goto out;
+    }
+    /* Reply to the second attempt (dht_fallocate2 sets call_cnt to 2) */
+    if (local->call_cnt != 1) {
+        if (local->stbuf.ia_blocks) {
+            dht_iatt_merge(this, postbuf, &local->stbuf);
+            dht_iatt_merge(this, prebuf, &local->prebuf);
+        }
+        goto out;
+    }
+    /* Save the reply; dht_fallocate2 unwinds it if we are not migrating */
+    local->op_ret = op_ret;
+    local->op_errno = op_errno;
+    local->rebalance.target_op_fn = dht_fallocate2;
+
+    dht_set_local_rebalance(this, local, NULL, prebuf, postbuf, xdata);
+
+    /* Phase 2 of migration */
+    if ((op_ret == -1) || IS_DHT_MIGRATION_PHASE2(postbuf)) {
+        ret = dht_rebalance_complete_check(this, frame);
+        if (!ret)
+            return 0;
+    }
+
+    /* Check if the rebalance phase1 is true */
+    if (IS_DHT_MIGRATION_PHASE1(postbuf)) {
+        dht_iatt_merge(this, &local->stbuf, postbuf);
+        dht_iatt_merge(this, &local->prebuf, prebuf);
+        /* Valid migration info and fd open on dst: fallocate there directly */
+        dht_inode_ctx_get_mig_info(this, local->fd->inode, &src_subvol,
+                                   &dst_subvol);
+        if (!dht_mig_info_is_invalid(local->cached_subvol, src_subvol,
+                                     dst_subvol)) {
+            if (dht_fd_open_on_dst(this, local->fd, dst_subvol)) {
+                dht_fallocate2(this, dst_subvol, frame, 0);
+                return 0;
+            }
+        }
+        ret = dht_rebalance_in_progress_check(this, frame);
+        if (!ret)
+            return 0;
+    }
+
+out:
+    DHT_STRIP_PHASE1_FLAGS(postbuf);
+    DHT_STRIP_PHASE1_FLAGS(prebuf);
+
+    DHT_STACK_UNWIND(fallocate, frame, op_ret, op_errno, prebuf, postbuf,
+                     xdata);
+err:
+    return 0;
+}
+/* fallocate retry step. dht_fallocate_cbk installs it as
+ * rebalance.target_op_fn and also calls it directly; it re-winds the saved
+ * request to subvol, or unwinds. */
+static int
+dht_fallocate2(xlator_t *this, xlator_t *subvol, call_frame_t *frame, int ret)
+{
+    int32_t op_errno = EINVAL; /* reported when frame or local is missing */
+    dht_local_t *local = (frame != NULL) ? frame->local : NULL;
+
+    if (local != NULL) {
+        op_errno = local->op_errno;
+
+        if (we_are_not_migrating(ret)) {
+            /* The file is not being migrated by this dht xlator.
+             * Unwind the saved reply, original mode bits included,
+             * so that the higher DHT layer can handle it.
+             */
+            DHT_STACK_UNWIND(fallocate, frame, local->op_ret,
+                             local->op_errno, &local->rebalance.prebuf,
+                             &local->rebalance.postbuf,
+                             local->rebalance.xdata);
+            return 0;
+        }
+
+        if (subvol != NULL) {
+            local->call_cnt = 2; /* This is the second attempt */
+
+            STACK_WIND_COOKIE(frame, dht_fallocate_cbk, subvol, subvol,
+                              subvol->fops->fallocate, local->fd,
+                              local->rebalance.flags, local->rebalance.offset,
+                              local->rebalance.size, local->xattr_req);
+            return 0;
+        }
+    }
+
+    /* No usable frame/local, or no subvolume to wind to: fail the request */
+    DHT_STACK_UNWIND(fallocate, frame, -1, op_errno, NULL, NULL, NULL);
+
+    return 0;
+}
+
+/*
+ * fallocate entry point: remember the request in local->rebalance (so a
+ * second attempt can replay it) and wind it to the cached subvolume of @fd.
+ *
+ * Unwinds with -1 on invalid arguments, on allocation failure of the local
+ * (ENOMEM) or when no cached subvolume is known (EINVAL).  Always returns 0.
+ */
+int
+dht_fallocate(call_frame_t *frame, xlator_t *this, fd_t *fd, int32_t mode,
+              off_t offset, size_t len, dict_t *xdata)
+{
+    dht_local_t *local = NULL;
+    xlator_t *subvol = NULL;
+    int op_errno = -1;
+
+    VALIDATE_OR_GOTO(frame, err);
+    VALIDATE_OR_GOTO(this, err);
+    VALIDATE_OR_GOTO(fd, err);
+
+    local = dht_local_init(frame, NULL, fd, GF_FOP_FALLOCATE);
+    if (local == NULL) {
+        op_errno = ENOMEM;
+        goto err;
+    }
+
+    /* First attempt; the request parameters are kept for a possible retry. */
+    local->call_cnt = 1;
+    local->rebalance.size = len;
+    local->rebalance.offset = offset;
+    local->rebalance.flags = mode;
+
+    subvol = local->cached_subvol;
+    if (subvol == NULL) {
+        gf_msg_debug(this->name, 0, "no cached subvolume for fd=%p", fd);
+        op_errno = EINVAL;
+        goto err;
+    }
+
+    if (xdata != NULL)
+        local->xattr_req = dict_ref(xdata);
+
+    STACK_WIND_COOKIE(frame, dht_fallocate_cbk, subvol, subvol,
+                      subvol->fops->fallocate, fd, local->rebalance.flags,
+                      local->rebalance.offset, local->rebalance.size,
+                      local->xattr_req);
+
+    return 0;
+
+err:
+    /* op_errno still -1 means a validation macro bailed out; use errno. */
+    if (op_errno == -1)
+        op_errno = errno;
+    DHT_STACK_UNWIND(fallocate, frame, -1, op_errno, NULL, NULL, NULL);
+
+    return 0;
+}
+
+/*
+ * Callback for discard wound by dht_discard() (first attempt) or
+ * dht_discard2() (second attempt); @cookie is the subvolume that replied.
+ *
+ * Either unwinds the discard fop to the parent with the reply it was given,
+ * or returns without unwinding when one of the fd-reopen / rebalance helper
+ * calls below reports success (return value 0) -- presumably that helper
+ * then owns the frame and re-drives the fop; confirm against the helpers.
+ * A NULL frame returns immediately without unwinding.  Always returns 0.
+ */
+int
+dht_discard_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int op_ret,
+                int op_errno, struct iatt *prebuf, struct iatt *postbuf,
+                dict_t *xdata)
+{
+    dht_local_t *local = NULL;
+    xlator_t *prev = NULL;
+    int ret = -1;
+    xlator_t *src_subvol = NULL;
+    xlator_t *dst_subvol = NULL;
+
+    /* Without a frame nothing can be unwound, hence the separate label. */
+    GF_VALIDATE_OR_GOTO("dht", frame, err);
+    GF_VALIDATE_OR_GOTO("dht", this, out);
+    GF_VALIDATE_OR_GOTO("dht", frame->local, out);
+    GF_VALIDATE_OR_GOTO("dht", cookie, out);
+
+    local = frame->local;
+    prev = cookie;
+
+    /* discard fails with EBADF if dht has not yet opened the fd
+     * on the cached subvol. This could happen if the file was migrated
+     * and a lookup updated the cached subvol in the inode ctx.
+     * We only check once as this could actually be a valid error.
+     */
+    if (dht_check_remote_fd_failed_error(local, op_ret, op_errno)) {
+        ret = dht_check_and_open_fd_on_subvol(this, frame);
+        if (ret)
+            goto out;
+        return 0;
+    }
+
+    /* A failure that is not an "inode missing" errno is final: record it
+     * and unwind. */
+    if ((op_ret == -1) && !dht_inode_missing(op_errno)) {
+        local->op_errno = op_errno;
+        local->op_ret = -1;
+        gf_msg_debug(this->name, op_errno, "subvolume %s returned -1",
+                     prev->name);
+
+        goto out;
+    }
+
+    /* Reply to the second attempt (call_cnt is set to 2 by dht_discard2):
+     * fold the iatts saved from the first attempt into the reply, if any
+     * blocks were recorded, and unwind without further migration checks. */
+    if (local->call_cnt != 1) {
+        if (local->stbuf.ia_blocks) {
+            dht_iatt_merge(this, postbuf, &local->stbuf);
+            dht_iatt_merge(this, prebuf, &local->prebuf);
+        }
+        goto out;
+    }
+
+    /* First attempt: save the result and register dht_discard2 as the
+     * function to run on the target subvolume. */
+    local->rebalance.target_op_fn = dht_discard2;
+    local->op_ret = op_ret;
+    local->op_errno = op_errno;
+
+    dht_set_local_rebalance(this, local, NULL, prebuf, postbuf, xdata);
+
+    /* Phase 2 of migration */
+    if ((op_ret == -1) || IS_DHT_MIGRATION_PHASE2(postbuf)) {
+        ret = dht_rebalance_complete_check(this, frame);
+        if (!ret)
+            return 0;
+    }
+
+    /* Check if the rebalance phase1 is true */
+    if (IS_DHT_MIGRATION_PHASE1(postbuf)) {
+        dht_iatt_merge(this, &local->stbuf, postbuf);
+        dht_iatt_merge(this, &local->prebuf, prebuf);
+
+        dht_inode_ctx_get_mig_info(this, local->fd->inode, &src_subvol,
+                                   &dst_subvol);
+        /* With valid migration info and the fd already open on the
+         * destination, repeat the discard there right away. */
+        if (!dht_mig_info_is_invalid(local->cached_subvol, src_subvol,
+                                     dst_subvol)) {
+            if (dht_fd_open_on_dst(this, local->fd, dst_subvol)) {
+                dht_discard2(this, dst_subvol, frame, 0);
+                return 0;
+            }
+        }
+        ret = dht_rebalance_in_progress_check(this, frame);
+        if (!ret)
+            return 0;
+    }
+
+out:
+    /* The phase-1 marker bits are stripped from both iatts before they are
+     * handed to the parent. */
+    DHT_STRIP_PHASE1_FLAGS(postbuf);
+    DHT_STRIP_PHASE1_FLAGS(prebuf);
+
+    DHT_STACK_UNWIND(discard, frame, op_ret, op_errno, prebuf, postbuf, xdata);
+err:
+    return 0;
+}
+
+/*
+ * Second attempt of a discard, invoked with the subvolume to retry on.
+ *
+ * When we_are_not_migrating(ret) holds, the saved result of the first
+ * attempt is unwound unchanged; otherwise the discard is wound again to
+ * @subvol with call_cnt set to 2.  A missing frame, local or subvol unwinds
+ * with -1.  Always returns 0.
+ */
+static int
+dht_discard2(xlator_t *this, xlator_t *subvol, call_frame_t *frame, int ret)
+{
+    int32_t op_errno = EINVAL;
+    dht_local_t *local = frame ? frame->local : NULL;
+
+    if (local == NULL)
+        goto unwind_failure;
+
+    op_errno = local->op_errno;
+
+    if (we_are_not_migrating(ret)) {
+        /* Pass the first attempt's saved reply up so the parent can deal
+         * with it. */
+        DHT_STACK_UNWIND(discard, frame, local->op_ret, local->op_errno,
+                         &local->rebalance.prebuf, &local->rebalance.postbuf,
+                         local->rebalance.xdata);
+        return 0;
+    }
+
+    if (subvol == NULL)
+        goto unwind_failure;
+
+    local->call_cnt = 2; /* This is the second attempt */
+
+    STACK_WIND_COOKIE(frame, dht_discard_cbk, subvol, subvol,
+                      subvol->fops->discard, local->fd, local->rebalance.offset,
+                      local->rebalance.size, local->xattr_req);
+
+    return 0;
+
+unwind_failure:
+    DHT_STACK_UNWIND(discard, frame, -1, op_errno, NULL, NULL, NULL);
+    return 0;
+}
+
+/*
+ * discard entry point: store offset/length in local->rebalance for a
+ * possible second attempt and wind the fop to the cached subvolume of @fd.
+ *
+ * Unwinds with -1 on invalid arguments, ENOMEM when the local cannot be
+ * allocated, or EINVAL when there is no cached subvolume.  Always returns 0.
+ */
+int
+dht_discard(call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset,
+            size_t len, dict_t *xdata)
+{
+    dht_local_t *local = NULL;
+    xlator_t *subvol = NULL;
+    int op_errno = -1;
+
+    VALIDATE_OR_GOTO(frame, err);
+    VALIDATE_OR_GOTO(this, err);
+    VALIDATE_OR_GOTO(fd, err);
+
+    local = dht_local_init(frame, NULL, fd, GF_FOP_DISCARD);
+    if (local == NULL) {
+        op_errno = ENOMEM;
+        goto err;
+    }
+
+    /* First attempt; keep the range around so it can be replayed. */
+    local->call_cnt = 1;
+    local->rebalance.size = len;
+    local->rebalance.offset = offset;
+
+    subvol = local->cached_subvol;
+    if (subvol == NULL) {
+        gf_msg_debug(this->name, 0, "no cached subvolume for fd=%p", fd);
+        op_errno = EINVAL;
+        goto err;
+    }
+
+    if (xdata != NULL)
+        local->xattr_req = dict_ref(xdata);
+
+    STACK_WIND_COOKIE(frame, dht_discard_cbk, subvol, subvol,
+                      subvol->fops->discard, fd, local->rebalance.offset,
+                      local->rebalance.size, local->xattr_req);
+
+    return 0;
+
+err:
+    /* op_errno still -1 means a validation macro bailed out; use errno. */
+    if (op_errno == -1)
+        op_errno = errno;
+    DHT_STACK_UNWIND(discard, frame, -1, op_errno, NULL, NULL, NULL);
+
+    return 0;
+}
+
+/*
+ * Callback for zerofill wound by dht_zerofill() (first attempt) or
+ * dht_zerofill2() (second attempt); @cookie is the subvolume that replied.
+ *
+ * Either unwinds the zerofill fop to the parent with the reply it was
+ * given, or returns without unwinding when one of the fd-reopen / rebalance
+ * helper calls below reports success (return value 0) -- presumably that
+ * helper then owns the frame and re-drives the fop; confirm against the
+ * helpers.  A NULL frame returns immediately.  Always returns 0.
+ */
+int
+dht_zerofill_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int op_ret,
+                 int op_errno, struct iatt *prebuf, struct iatt *postbuf,
+                 dict_t *xdata)
+{
+    dht_local_t *local = NULL;
+    xlator_t *prev = NULL;
+    int ret = -1;
+    xlator_t *src_subvol = NULL;
+    xlator_t *dst_subvol = NULL;
+
+    /* Without a frame nothing can be unwound, hence the separate label. */
+    GF_VALIDATE_OR_GOTO("dht", frame, err);
+    GF_VALIDATE_OR_GOTO("dht", this, out);
+    GF_VALIDATE_OR_GOTO("dht", frame->local, out);
+    GF_VALIDATE_OR_GOTO("dht", cookie, out);
+
+    local = frame->local;
+    prev = cookie;
+
+    /* zerofill fails with EBADF if dht has not yet opened the fd
+     * on the cached subvol. This could happen if the file was migrated
+     * and a lookup updated the cached subvol in the inode ctx.
+     * We only check once as this could actually be a valid error.
+     */
+    if (dht_check_remote_fd_failed_error(local, op_ret, op_errno)) {
+        ret = dht_check_and_open_fd_on_subvol(this, frame);
+        if (ret)
+            goto out;
+        return 0;
+    }
+
+    /* A failure that is not an "inode missing" errno is final. */
+    if ((op_ret == -1) && !dht_inode_missing(op_errno)) {
+        local->op_errno = op_errno;
+        local->op_ret = -1;
+        gf_msg_debug(this->name, op_errno, "subvolume %s returned -1",
+                     prev->name);
+        goto out;
+    }
+
+    /* Reply to the second attempt (call_cnt is set to 2 by dht_zerofill2):
+     * fold in the iatts saved from the first attempt and unwind. */
+    if (local->call_cnt != 1) {
+        if (local->stbuf.ia_blocks) {
+            dht_iatt_merge(this, postbuf, &local->stbuf);
+            dht_iatt_merge(this, prebuf, &local->prebuf);
+        }
+        goto out;
+    }
+
+    /* First attempt: save the result and register the retry function. */
+    local->rebalance.target_op_fn = dht_zerofill2;
+    local->op_ret = op_ret;
+    local->op_errno = op_errno;
+
+    dht_set_local_rebalance(this, local, NULL, prebuf, postbuf, xdata);
+
+    /* Phase 2 of migration */
+    if ((op_ret == -1) || IS_DHT_MIGRATION_PHASE2(postbuf)) {
+        ret = dht_rebalance_complete_check(this, frame);
+        if (!ret)
+            return 0;
+    }
+
+    /* Check if the rebalance phase1 is true */
+    if (IS_DHT_MIGRATION_PHASE1(postbuf)) {
+        dht_iatt_merge(this, &local->stbuf, postbuf);
+        dht_iatt_merge(this, &local->prebuf, prebuf);
+
+        /* The return value is intentionally not stored: the subvolume
+         * pointers are validated by dht_mig_info_is_invalid() below, the
+         * same way dht_discard_cbk() does it. */
+        dht_inode_ctx_get_mig_info(this, local->fd->inode, &src_subvol,
+                                   &dst_subvol);
+        if (!dht_mig_info_is_invalid(local->cached_subvol, src_subvol,
+                                     dst_subvol)) {
+            if (dht_fd_open_on_dst(this, local->fd, dst_subvol)) {
+                dht_zerofill2(this, dst_subvol, frame, 0);
+                return 0;
+            }
+        }
+
+        ret = dht_rebalance_in_progress_check(this, frame);
+        if (!ret)
+            return 0;
+    }
+
+out:
+    /* The phase-1 marker bits are stripped from both iatts before they are
+     * handed to the parent. */
+    DHT_STRIP_PHASE1_FLAGS(postbuf);
+    DHT_STRIP_PHASE1_FLAGS(prebuf);
+
+    DHT_STACK_UNWIND(zerofill, frame, op_ret, op_errno, prebuf, postbuf, xdata);
+err:
+    return 0;
+}
+
+/*
+ * Second attempt of a zerofill, invoked with the subvolume to retry on.
+ *
+ * When we_are_not_migrating(ret) holds, the saved result of the first
+ * attempt is unwound unchanged; otherwise the zerofill is wound again to
+ * @subvol with call_cnt set to 2.  A missing frame, local or subvol unwinds
+ * with -1.  Always returns 0.
+ */
+static int
+dht_zerofill2(xlator_t *this, xlator_t *subvol, call_frame_t *frame, int ret)
+{
+    dht_local_t *local = NULL;
+    int32_t op_errno = EINVAL;
+
+    if ((frame == NULL) || (frame->local == NULL))
+        goto fail;
+
+    local = frame->local;
+    op_errno = local->op_errno;
+
+    if (!we_are_not_migrating(ret)) {
+        if (subvol == NULL)
+            goto fail;
+
+        local->call_cnt = 2; /* This is the second attempt */
+
+        STACK_WIND_COOKIE(frame, dht_zerofill_cbk, subvol, subvol,
+                          subvol->fops->zerofill, local->fd,
+                          local->rebalance.offset, local->rebalance.size,
+                          local->xattr_req);
+
+        return 0;
+    }
+
+    /* Pass the first attempt's saved reply up so the parent can deal
+     * with it. */
+    DHT_STACK_UNWIND(zerofill, frame, local->op_ret, local->op_errno,
+                     &local->rebalance.prebuf, &local->rebalance.postbuf,
+                     local->rebalance.xdata);
+
+    return 0;
+
+fail:
+    DHT_STACK_UNWIND(zerofill, frame, -1, op_errno, NULL, NULL, NULL);
+    return 0;
+}
+
+/*
+ * zerofill entry point: store offset/length in local->rebalance for a
+ * possible second attempt and wind the fop to the cached subvolume of @fd.
+ *
+ * Unwinds with -1 on invalid arguments, ENOMEM when the local cannot be
+ * allocated, or EINVAL when there is no cached subvolume.  Always returns 0.
+ */
+int
+dht_zerofill(call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset,
+             off_t len, dict_t *xdata)
+{
+    dht_local_t *local = NULL;
+    xlator_t *subvol = NULL;
+    int op_errno = -1;
+
+    VALIDATE_OR_GOTO(frame, err);
+    VALIDATE_OR_GOTO(this, err);
+    VALIDATE_OR_GOTO(fd, err);
+
+    local = dht_local_init(frame, NULL, fd, GF_FOP_ZEROFILL);
+    if (local == NULL) {
+        op_errno = ENOMEM;
+        goto err;
+    }
+
+    /* First attempt; keep the range around so it can be replayed. */
+    local->call_cnt = 1;
+    local->rebalance.size = len;
+    local->rebalance.offset = offset;
+
+    subvol = local->cached_subvol;
+    if (subvol == NULL) {
+        gf_msg_debug(this->name, 0, "no cached subvolume for fd=%p", fd);
+        op_errno = EINVAL;
+        goto err;
+    }
+
+    if (xdata != NULL)
+        local->xattr_req = dict_ref(xdata);
+
+    STACK_WIND_COOKIE(frame, dht_zerofill_cbk, subvol, subvol,
+                      subvol->fops->zerofill, fd, local->rebalance.offset,
+                      local->rebalance.size, local->xattr_req);
+
+    return 0;
+
+err:
+    /* op_errno still -1 means a validation macro bailed out; use errno. */
+    if (op_errno == -1)
+        op_errno = errno;
+    DHT_STACK_UNWIND(zerofill, frame, -1, op_errno, NULL, NULL, NULL);
+
+    return 0;
+}
+
+/* handle cases of migration here for 'setattr()' calls
+ *
+ * Callback for setattr/fsetattr on a regular file, wound by dht_setattr(),
+ * dht_fsetattr() or dht_setattr2(); @cookie is the subvolume that replied.
+ * Both fops are unwound through the setattr unwind.  Returns without
+ * unwinding only when the fd-reopen or rebalance-complete helper below
+ * reports success (return value 0) -- presumably the helper then re-drives
+ * the fop; confirm against the helpers.  Always returns 0.
+ */
+int
+dht_file_setattr_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+                     int op_ret, int op_errno, struct iatt *prebuf,
+                     struct iatt *postbuf, dict_t *xdata)
+{
+    dht_local_t *local = NULL;
+    xlator_t *prev = NULL;
+    int ret = -1;
+
+    local = frame->local;
+    prev = cookie;
+
+    local->op_errno = op_errno;
+
+    /* Only the fd-based fop can fail because the fd is not yet open on the
+     * subvolume, so the reopen path is limited to FSETATTR. */
+    if ((local->fop == GF_FOP_FSETATTR) &&
+        dht_check_remote_fd_failed_error(local, op_ret, op_errno)) {
+        ret = dht_check_and_open_fd_on_subvol(this, frame);
+        if (ret)
+            goto out;
+        return 0;
+    }
+
+    /* A failure that is not an "inode missing" errno is final. */
+    if ((op_ret == -1) && !dht_inode_missing(op_errno)) {
+        gf_msg_debug(this->name, op_errno, "subvolume %s returned -1",
+                     prev->name);
+        goto out;
+    }
+
+    /* Reply to the second attempt (call_cnt is set to 2 by dht_setattr2):
+     * unwind it as it is. */
+    if (local->call_cnt != 1)
+        goto out;
+
+    local->op_ret = op_ret;
+    local->op_errno = op_errno;
+
+    /* Function to run on the target subvolume for the second attempt. */
+    local->rebalance.target_op_fn = dht_setattr2;
+
+    /* Phase 2 of migration */
+    if ((op_ret == -1) || IS_DHT_MIGRATION_PHASE2(postbuf)) {
+        dht_set_local_rebalance(this, local, NULL, prebuf, postbuf, xdata);
+
+        ret = dht_rebalance_complete_check(this, frame);
+        if (!ret)
+            return 0;
+    }
+
+    /* At the end of the migration process, whatever 'attr' we
+       have on source file will be migrated to destination file
+       in one shot, hence we don't need to check for in progress
+       state here (ie, PHASE1) */
+out:
+    /* The phase-1 marker bits are stripped from both iatts before they are
+     * handed to the parent. */
+    DHT_STRIP_PHASE1_FLAGS(postbuf);
+    DHT_STRIP_PHASE1_FLAGS(prebuf);
+
+    DHT_STACK_UNWIND(setattr, frame, op_ret, op_errno, prebuf, postbuf, xdata);
+
+    return 0;
+}
+
+/*
+ * Second attempt of a setattr/fsetattr on a regular file, invoked with the
+ * subvolume to retry on.
+ *
+ * When we_are_not_migrating(ret) holds, the saved result of the first
+ * attempt is unwound unchanged.  Otherwise the fop recorded in local->fop
+ * is wound again to @subvol (setattr by path, anything else as fsetattr on
+ * local->fd) with call_cnt set to 2.  A missing frame, local or subvol
+ * unwinds with -1.  Always returns 0.
+ */
+static int
+dht_setattr2(xlator_t *this, xlator_t *subvol, call_frame_t *frame, int ret)
+{
+    int32_t op_errno = EINVAL;
+    dht_local_t *local = frame ? frame->local : NULL;
+
+    if (local == NULL)
+        goto fail;
+
+    op_errno = local->op_errno;
+
+    if (we_are_not_migrating(ret)) {
+        /* Pass the first attempt's saved reply up so the parent can deal
+         * with it. */
+        DHT_STACK_UNWIND(setattr, frame, local->op_ret, local->op_errno,
+                         &local->rebalance.prebuf, &local->rebalance.postbuf,
+                         local->rebalance.xdata);
+        return 0;
+    }
+
+    if (subvol == NULL)
+        goto fail;
+
+    local->call_cnt = 2; /* This is the second attempt */
+
+    if (local->fop != GF_FOP_SETATTR) {
+        STACK_WIND_COOKIE(frame, dht_file_setattr_cbk, subvol, subvol,
+                          subvol->fops->fsetattr, local->fd,
+                          &local->rebalance.stbuf, local->rebalance.flags,
+                          local->xattr_req);
+    } else {
+        STACK_WIND_COOKIE(frame, dht_file_setattr_cbk, subvol, subvol,
+                          subvol->fops->setattr, &local->loc,
+                          &local->rebalance.stbuf, local->rebalance.flags,
+                          local->xattr_req);
+    }
+
+    return 0;
+
+fail:
+    DHT_STACK_UNWIND(setattr, frame, -1, op_errno, NULL, NULL, NULL);
+    return 0;
+}
+
+/*
+ * Fan-in callback for setattr/fsetattr wound to every subvolume of the
+ * layout (the non-regular-file path).
+ *
+ * A failed reply only records its errno; a successful one merges the
+ * pre/post iatts into local and marks the overall result as success.  When
+ * the last reply arrives the accumulated result is unwound, after updating
+ * the inode ctx time if the overall result is success.  Always returns 0.
+ */
+int
+dht_setattr_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int op_ret,
+                int op_errno, struct iatt *statpre, struct iatt *statpost,
+                dict_t *xdata)
+{
+    dht_local_t *local = frame->local;
+    xlator_t *prev = cookie;
+    int failed = (op_ret == -1);
+    int pending = 0;
+
+    LOCK(&frame->lock);
+    if (failed) {
+        local->op_errno = op_errno;
+    } else {
+        dht_iatt_merge(this, &local->prebuf, statpre);
+        dht_iatt_merge(this, &local->stbuf, statpost);
+
+        local->op_ret = 0;
+        local->op_errno = 0;
+    }
+    UNLOCK(&frame->lock);
+
+    /* Logging happens outside the frame lock. */
+    if (failed) {
+        gf_msg_debug(this->name, op_errno, "subvolume %s returned -1",
+                     prev->name);
+    }
+
+    pending = dht_frame_return(frame);
+    if (!is_last_call(pending))
+        return 0;
+
+    if (local->op_ret == 0)
+        dht_inode_ctx_time_set(local->loc.inode, this, &local->stbuf);
+
+    DHT_STACK_UNWIND(setattr, frame, local->op_ret, local->op_errno,
+                     &local->prebuf, &local->stbuf, xdata);
+
+    return 0;
+}
+
+/*
+ * Fan-in callback for the directory setattr that dht_mds_setattr_cbk()
+ * winds to every subvolume other than the MDS subvolume.
+ *
+ * Failures on these subvolumes are only logged: they do not touch local's
+ * result, and the final unwind always reports success (0, 0) with the
+ * merged iatts.  Always returns 0.
+ */
+int
+dht_non_mds_setattr_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+                        int op_ret, int op_errno, struct iatt *statpre,
+                        struct iatt *statpost, dict_t *xdata)
+{
+    dht_local_t *local = NULL;
+    int this_call_cnt = 0;
+    xlator_t *prev = NULL;
+
+    local = frame->local;
+    prev = cookie;
+
+    if (op_ret == -1) {
+        /* Same debug log as the sibling setattr callbacks.  The previous
+         * gf_msg() call passed op_errno in the log-level position. */
+        gf_msg_debug(this->name, op_errno, "subvolume %s returned -1",
+                     prev->name);
+        goto post_unlock;
+    }
+
+    LOCK(&frame->lock);
+    {
+        dht_iatt_merge(this, &local->prebuf, statpre);
+        dht_iatt_merge(this, &local->stbuf, statpost);
+
+        local->op_ret = 0;
+        local->op_errno = 0;
+    }
+    UNLOCK(&frame->lock);
+post_unlock:
+    this_call_cnt = dht_frame_return(frame);
+    if (is_last_call(this_call_cnt)) {
+        dht_inode_ctx_time_set(local->loc.inode, this, &local->stbuf);
+        DHT_STACK_UNWIND(setattr, frame, 0, 0, &local->prebuf, &local->stbuf,
+                         xdata);
+    }
+
+    return 0;
+}
+
+/*
+ * Callback for the directory setattr wound to the MDS subvolume by
+ * dht_setattr().
+ *
+ * On failure the error is recorded and unwound straight away.  On success
+ * the reply iatts are merged into local and the same setattr request (the
+ * iatt saved in local->stbuf before the merge, and local->valid) is wound
+ * to every other subvolume, to be collected by dht_non_mds_setattr_cbk().
+ * Always returns 0.
+ */
+int
+dht_mds_setattr_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+                    int op_ret, int op_errno, struct iatt *statpre,
+                    struct iatt *statpost, dict_t *xdata)
+
+{
+    dht_local_t *local = frame->local;
+    dht_conf_t *conf = this->private;
+    xlator_t *prev = cookie;
+    xlator_t *mds_subvol = local->mds_subvol;
+    struct iatt requested = {
+        0,
+    };
+    int i = 0;
+
+    if (op_ret == -1) {
+        local->op_ret = op_ret;
+        local->op_errno = op_errno;
+        gf_msg_debug(this->name, op_errno, "subvolume %s returned -1",
+                     prev->name);
+
+        DHT_STACK_UNWIND(setattr, frame, local->op_ret, local->op_errno,
+                         &local->prebuf, &local->stbuf, xdata);
+        return 0;
+    }
+
+    local->op_ret = 0;
+
+    /* Copy the requested attributes before local->stbuf is overwritten by
+     * the merge of the MDS reply. */
+    requested = local->stbuf;
+    dht_iatt_merge(this, &local->prebuf, statpre);
+    dht_iatt_merge(this, &local->stbuf, statpost);
+
+    /* One reply is expected from every subvolume except the MDS. */
+    local->call_cnt = conf->subvolume_cnt - 1;
+    for (i = 0; i < conf->subvolume_cnt; i++) {
+        if (conf->subvolumes[i] != mds_subvol) {
+            STACK_WIND_COOKIE(frame, dht_non_mds_setattr_cbk,
+                              conf->subvolumes[i], conf->subvolumes[i],
+                              conf->subvolumes[i]->fops->setattr, &local->loc,
+                              &requested, local->valid, local->xattr_req);
+        }
+    }
+
+    return 0;
+}
+
+/*
+ * setattr entry point.
+ *
+ *  - Regular file: the request is saved in local->rebalance and wound to
+ *    the cached subvolume only; dht_file_setattr_cbk() handles migration.
+ *  - Non-root directory whose layout has more than one entry: wound to the
+ *    MDS subvolume first; dht_mds_setattr_cbk() then fans out to the rest.
+ *  - Everything else: wound to every subvolume in the layout and collected
+ *    by dht_setattr_cbk().
+ *
+ * Unwinds with -1 on invalid arguments, ENOMEM (no local), EINVAL (no or
+ * insane layout, no cached subvolume, no MDS subvolume) or ENOTCONN (MDS
+ * subvolume down).  Always returns 0.
+ */
+int
+dht_setattr(call_frame_t *frame, xlator_t *this, loc_t *loc, struct iatt *stbuf,
+            int32_t valid, dict_t *xdata)
+{
+    xlator_t *subvol = NULL;
+    xlator_t *mds_subvol = NULL;
+    dht_layout_t *layout = NULL;
+    dht_local_t *local = NULL;
+    int op_errno = -1;
+    int i = -1;
+    int ret = -1;
+    int call_cnt = 0;
+    dht_conf_t *conf = NULL;
+
+    VALIDATE_OR_GOTO(frame, err);
+    VALIDATE_OR_GOTO(this, err);
+    VALIDATE_OR_GOTO(loc, err);
+    VALIDATE_OR_GOTO(loc->inode, err);
+    VALIDATE_OR_GOTO(loc->path, err);
+
+    conf = this->private;
+    local = dht_local_init(frame, loc, NULL, GF_FOP_SETATTR);
+    if (!local) {
+        op_errno = ENOMEM;
+        goto err;
+    }
+
+    layout = local->layout;
+    if (!layout) {
+        gf_msg_debug(this->name, 0, "no layout for path=%s", loc->path);
+        op_errno = EINVAL;
+        goto err;
+    }
+
+    if (!layout_is_sane(layout)) {
+        gf_msg_debug(this->name, 0, "layout is not sane for path=%s",
+                     loc->path);
+        op_errno = EINVAL;
+        goto err;
+    }
+    if (xdata)
+        local->xattr_req = dict_ref(xdata);
+
+    if (IA_ISREG(loc->inode->ia_type)) {
+        /* in the regular file _cbk(), we need to check for
+           migration possibilities */
+        local->rebalance.stbuf = *stbuf;
+        local->rebalance.flags = valid;
+        local->call_cnt = 1;
+        subvol = local->cached_subvol;
+        if (!subvol) {
+            /* Same guard as the other single-subvolume fops in this file;
+             * without it the wind below dereferences NULL. */
+            gf_msg_debug(this->name, 0, "no cached subvolume for path=%s",
+                         loc->path);
+            op_errno = EINVAL;
+            goto err;
+        }
+
+        STACK_WIND_COOKIE(frame, dht_file_setattr_cbk, subvol, subvol,
+                          subvol->fops->setattr, loc, stbuf, valid, xdata);
+
+        return 0;
+    }
+
+    local->call_cnt = call_cnt = layout->cnt;
+
+    if (IA_ISDIR(loc->inode->ia_type) && !__is_root_gfid(loc->inode->gfid) &&
+        call_cnt != 1) {
+        ret = dht_inode_ctx_mdsvol_get(loc->inode, this, &mds_subvol);
+        if (ret || !mds_subvol) {
+            gf_msg(this->name, GF_LOG_ERROR, 0,
+                   DHT_MSG_HASHED_SUBVOL_GET_FAILED,
+                   "Failed to get mds subvol for path %s", local->loc.path);
+            op_errno = EINVAL;
+            goto err;
+        }
+
+        local->mds_subvol = mds_subvol;
+        /* Refuse the operation when the MDS subvolume is marked down. */
+        for (i = 0; i < conf->subvolume_cnt; i++) {
+            if (conf->subvolumes[i] == mds_subvol) {
+                if (!conf->subvolume_status[i]) {
+                    /* NOTE(review): i indexes conf->subvolumes here but is
+                     * also used to index layout->list for the logged errno;
+                     * confirm the two arrays are guaranteed to line up. */
+                    gf_msg(this->name, GF_LOG_WARNING, layout->list[i].err,
+                           DHT_MSG_HASHED_SUBVOL_DOWN,
+                           "MDS subvol is down for path "
+                           " %s Unable to set attr ",
+                           local->loc.path);
+                    op_errno = ENOTCONN;
+                    goto err;
+                }
+            }
+        }
+        /* Saved so dht_mds_setattr_cbk() can replay the request on the
+         * remaining subvolumes. */
+        local->valid = valid;
+        local->stbuf = *stbuf;
+
+        STACK_WIND_COOKIE(frame, dht_mds_setattr_cbk, local->mds_subvol,
+                          local->mds_subvol, local->mds_subvol->fops->setattr,
+                          loc, stbuf, valid, xdata);
+        return 0;
+    } else {
+        for (i = 0; i < call_cnt; i++) {
+            STACK_WIND_COOKIE(frame, dht_setattr_cbk, layout->list[i].xlator,
+                              layout->list[i].xlator,
+                              layout->list[i].xlator->fops->setattr, loc, stbuf,
+                              valid, xdata);
+        }
+    }
+
+    return 0;
+
+err:
+    /* op_errno still -1 means a validation macro bailed out; use errno. */
+    op_errno = (op_errno == -1) ? errno : op_errno;
+    DHT_STACK_UNWIND(setattr, frame, -1, op_errno, NULL, NULL, NULL);
+
+    return 0;
+}
+
+/*
+ * fsetattr entry point.
+ *
+ * For a regular file the request is saved in local->rebalance and wound to
+ * the cached subvolume only; dht_file_setattr_cbk() handles migration.  For
+ * any other inode type it is wound to every subvolume in the layout and
+ * collected by dht_setattr_cbk().
+ *
+ * Unwinds with -1 on invalid arguments, ENOMEM (no local) or EINVAL (no or
+ * insane layout, no cached subvolume).  Always returns 0.
+ */
+int
+dht_fsetattr(call_frame_t *frame, xlator_t *this, fd_t *fd, struct iatt *stbuf,
+             int32_t valid, dict_t *xdata)
+{
+    xlator_t *subvol = NULL;
+    dht_layout_t *layout = NULL;
+    dht_local_t *local = NULL;
+    int op_errno = -1;
+    int i = -1;
+    int call_cnt = 0;
+
+    VALIDATE_OR_GOTO(frame, err);
+    VALIDATE_OR_GOTO(this, err);
+    VALIDATE_OR_GOTO(fd, err);
+
+    local = dht_local_init(frame, NULL, fd, GF_FOP_FSETATTR);
+    if (!local) {
+        op_errno = ENOMEM;
+        goto err;
+    }
+
+    layout = local->layout;
+    if (!layout) {
+        gf_msg_debug(this->name, 0, "no layout for fd=%p", fd);
+        op_errno = EINVAL;
+        goto err;
+    }
+
+    if (!layout_is_sane(layout)) {
+        gf_msg_debug(this->name, 0, "layout is not sane for fd=%p", fd);
+        op_errno = EINVAL;
+        goto err;
+    }
+    if (xdata)
+        local->xattr_req = dict_ref(xdata);
+
+    if (IA_ISREG(fd->inode->ia_type)) {
+        /* in the regular file _cbk(), we need to check for
+           migration possibilities */
+        local->rebalance.stbuf = *stbuf;
+        local->rebalance.flags = valid;
+        local->call_cnt = 1;
+        subvol = local->cached_subvol;
+        if (!subvol) {
+            /* Same guard as the other single-subvolume fops in this file;
+             * without it the wind below dereferences NULL. */
+            gf_msg_debug(this->name, 0, "no cached subvolume for fd=%p", fd);
+            op_errno = EINVAL;
+            goto err;
+        }
+
+        STACK_WIND_COOKIE(frame, dht_file_setattr_cbk, subvol, subvol,
+                          subvol->fops->fsetattr, fd, &local->rebalance.stbuf,
+                          local->rebalance.flags, local->xattr_req);
+        return 0;
+    }
+
+    local->call_cnt = call_cnt = layout->cnt;
+
+    for (i = 0; i < call_cnt; i++) {
+        STACK_WIND_COOKIE(frame, dht_setattr_cbk, layout->list[i].xlator,
+                          layout->list[i].xlator,
+                          layout->list[i].xlator->fops->fsetattr, fd, stbuf,
+                          valid, xdata);
+    }
+
+    return 0;
+
+err:
+    /* op_errno still -1 means a validation macro bailed out; use errno. */
+    op_errno = (op_errno == -1) ? errno : op_errno;
+    DHT_STACK_UNWIND(fsetattr, frame, -1, op_errno, NULL, NULL, NULL);
+
+    return 0;
+}
diff --git a/xlators/cluster/dht/src/dht-layout.c b/xlators/cluster/dht/src/dht-layout.c
index 1515df69748..fda904c92c9 100644
--- a/xlators/cluster/dht/src/dht-layout.c
+++ b/xlators/cluster/dht/src/dht-layout.c
@@ -1,582 +1,808 @@
 /*
-   Copyright (c) 2008-2009 Z RESEARCH, Inc. <http://www.zresearch.com>
-   This file is part of GlusterFS.
-
-   GlusterFS is free software; you can redistribute it and/or modify
-   it under the terms of the GNU General Public License as published
-   by the Free Software Foundation; either version 3 of the License,
-   or (at your option) any later version.
-
-   GlusterFS is distributed in the hope that it will be useful, but
-   WITHOUT ANY WARRANTY; without even the implied warranty of
-   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
-   General Public License for more details.
-
-   You should have received a copy of the GNU General Public License
-   along with this program.  If not, see
-   <http://www.gnu.org/licenses/>.
-*/
-
-#ifndef _CONFIG_H
-#define _CONFIG_H
-#include "config.h"
-#endif
+  Copyright (c) 2008-2012 Red Hat, Inc. <http://www.redhat.com>
+  This file is part of GlusterFS.
 
+  This file is licensed to you under your choice of the GNU Lesser
+  General Public License, version 3 or any later version (LGPLv3 or
+  later), or the GNU General Public License, version 2 (GPLv2), in all
+  cases as published by the Free Software Foundation.
+*/
 
-#include "glusterfs.h"
-#include "xlator.h"
 #include "dht-common.h"
-#include "byte-order.h"
+#include <glusterfs/byte-order.h>
+#include "unittest/unittest.h"
 
-#define layout_base_size (sizeof (dht_layout_t))
+#define layout_base_size (sizeof(dht_layout_t))
 
-#define layout_entry_size (sizeof ((dht_layout_t *)NULL)->list[0])
+#define layout_entry_size (sizeof((dht_layout_t *)NULL)->list[0])
 
 #define layout_size(cnt) (layout_base_size + (cnt * layout_entry_size))
 
-
 dht_layout_t *
-dht_layout_new (xlator_t *this, int cnt)
+dht_layout_new(xlator_t *this, int cnt)
 {
-	dht_layout_t *layout = NULL;
+    dht_layout_t *layout = NULL;
+    dht_conf_t *conf = NULL;
+
+    REQUIRE(NULL != this);
+    REQUIRE(cnt >= 0);
 
+    conf = this->private;
 
-	layout = CALLOC (1, layout_size (cnt));
-	if (!layout) {
-		gf_log (this->name, GF_LOG_ERROR,
-			"memory allocation failed :(");
-		goto out;
-	}
+    layout = GF_CALLOC(1, layout_size(cnt), gf_dht_mt_dht_layout_t);
+    if (!layout) {
+        goto out;
+    }
 
-	layout->cnt = cnt;
+    layout->type = DHT_HASH_TYPE_DM;
+    layout->cnt = cnt;
 
+    if (conf) {
+        layout->spread_cnt = conf->dir_spread_cnt;
+        layout->gen = conf->gen;
+    }
+
+    GF_ATOMIC_INIT(layout->ref, 1);
+
+    ENSURE(NULL != layout);
+    ENSURE(layout->type == DHT_HASH_TYPE_DM);
+    ENSURE(layout->cnt == cnt);
+    ENSURE(GF_ATOMIC_GET(layout->ref) == 1);
 out:
-	return layout;
+    return layout;
 }
 
-
 dht_layout_t *
-dht_layout_get (xlator_t *this, inode_t *inode)
+dht_layout_get(xlator_t *this, inode_t *inode)
 {
-        uint64_t layout = 0;
-        int      ret    = -1;
+    dht_layout_t *layout = NULL;
+    int ret = 0;
+
+    ret = dht_inode_ctx_layout_get(inode, this, &layout);
+    if ((!ret) && layout) {
+        GF_ATOMIC_INC(layout->ref);
+    }
+    return layout;
+}
+
+/*
+ * Store @layout in the inode ctx of @inode, taking a reference on it.
+ *
+ * The lookup of the previously stored layout and the store of the new one
+ * happen under conf->layout_lock.  If a previous layout was found, the
+ * reference held on it is dropped afterwards; if the store fails, the
+ * reference just taken on @layout is given back.
+ *
+ * Returns the result of dht_inode_ctx_layout_set(), or -1 when the xlator
+ * has no private conf or @layout is NULL.
+ */
+int
+dht_layout_set(xlator_t *this, inode_t *inode, dht_layout_t *layout)
+{
+    dht_conf_t *conf = NULL;
+    int oldret = -1;
+    int ret = -1;
+    /* Initialised so it is never read as garbage, whatever the getter
+     * does on failure. */
+    dht_layout_t *old_layout = NULL;
+
+    conf = this->private;
+    if (!conf || !layout)
+        goto out;
+
+    LOCK(&conf->layout_lock);
+    {
+        oldret = dht_inode_ctx_layout_get(inode, this, &old_layout);
+        /* layout is known to be non-NULL here (checked above). */
+        GF_ATOMIC_INC(layout->ref);
+        ret = dht_inode_ctx_layout_set(inode, this, layout);
+    }
+    UNLOCK(&conf->layout_lock);
+
+    if (!oldret) {
+        dht_layout_unref(this, old_layout);
+    }
+    if (ret)
+        GF_ATOMIC_DEC(layout->ref);
+
+out:
+    return ret;
+}
+
+void
+dht_layout_unref(xlator_t *this, dht_layout_t *layout)
+{
+    int ref = 0;
+
+    if (!layout || layout->preset || !this->private)
+        return;
 
-        ret = inode_ctx_get (inode, this, &layout);
+    ref = GF_ATOMIC_DEC(layout->ref);
 
-        return (dht_layout_t *)(long)layout;
+    if (!ref)
+        GF_FREE(layout);
 }
 
+/*
+ * Take a reference on @layout and return it.
+ *
+ * Nothing is counted for a NULL layout, for a preset layout, or when the
+ * xlator has no private conf; @layout is returned unchanged in those cases
+ * (the same conditions under which dht_layout_unref() is a no-op).
+ */
+dht_layout_t *
+dht_layout_ref(xlator_t *this, dht_layout_t *layout)
+{
+    /* NULL guard added so a failed dht_layout_get() result can be passed
+     * through safely. */
+    if (!layout || layout->preset || !this->private)
+        return layout;
+
+    GF_ATOMIC_INC(layout->ref);
+
+    return layout;
+}
 
 xlator_t *
-dht_layout_search (xlator_t *this, dht_layout_t *layout, const char *name)
+dht_layout_search(xlator_t *this, dht_layout_t *layout, const char *name)
 {
-	uint32_t   hash = 0;
-        xlator_t  *subvol = NULL;
-	int        i = 0;
-	int        ret = 0;
-
-
-	ret = dht_hash_compute (layout->type, name, &hash);
-	if (ret != 0) {
-		gf_log (this->name, GF_LOG_ERROR,
-			"hash computation failed for type=%d name=%s",
-			layout->type, name);
-		goto out;
-	}
-
-	for (i = 0; i < layout->cnt; i++) {
-		if (layout->list[i].start <= hash
-		    && layout->list[i].stop >= hash) {
-			subvol = layout->list[i].xlator;
-			break;
-		}
-	}
-
-	if (!subvol) {
-		gf_log (this->name, GF_LOG_DEBUG,
-			"no subvolume for hash (value) = %u", hash);
-	}
+    uint32_t hash = 0;
+    xlator_t *subvol = NULL;
+    int i = 0;
+    int ret = 0;
+
+    ret = dht_hash_compute(this, layout->type, name, &hash);
+    if (ret != 0) {
+        gf_smsg(this->name, GF_LOG_WARNING, 0, DHT_MSG_COMPUTE_HASH_FAILED,
+                "type=%d", layout->type, "name=%s", name, NULL);
+        goto out;
+    }
+
+    for (i = 0; i < layout->cnt; i++) {
+        if (layout->list[i].start <= hash && layout->list[i].stop >= hash) {
+            subvol = layout->list[i].xlator;
+            break;
+        }
+    }
+
+    if (!subvol) {
+        gf_smsg(this->name, GF_LOG_WARNING, 0, DHT_MSG_HASHED_SUBVOL_GET_FAILED,
+                "hash-value=0x%x", hash, NULL);
+    }
 
 out:
-	return subvol;
+    return subvol;
 }
 
-
 dht_layout_t *
-dht_layout_for_subvol (xlator_t *this, xlator_t *subvol)
+dht_layout_for_subvol(xlator_t *this, xlator_t *subvol)
 {
-	dht_conf_t   *conf = NULL;
-	dht_layout_t *layout = NULL;
-	int           i = 0;
-
+    dht_conf_t *conf = NULL;
+    dht_layout_t *layout = NULL;
+    int i = 0;
 
-	conf = this->private;
+    conf = this->private;
+    if (!conf)
+        goto out;
 
-	for (i = 0; i < conf->subvolume_cnt; i++) {
-		if (conf->subvolumes[i] == subvol) {
-			layout = conf->file_layouts[i];
-			break;
-		}
-	}
+    for (i = 0; i < conf->subvolume_cnt; i++) {
+        if (conf->subvolumes[i] == subvol) {
+            layout = conf->file_layouts[i];
+            break;
+        }
+    }
 
-	return layout;
+out:
+    return layout;
 }
 
-
 int
-dht_layouts_init (xlator_t *this, dht_conf_t *conf)
+dht_layouts_init(xlator_t *this, dht_conf_t *conf)
 {
-	dht_layout_t *layout = NULL;
-	int           i = 0;
-	int           ret = -1;
-	
+    dht_layout_t *layout = NULL;
+    int i = 0;
+    int ret = -1;
+
+    if (!conf)
+        goto out;
 
-	conf->file_layouts = CALLOC (conf->subvolume_cnt,
-				     sizeof (dht_layout_t *));
-	if (!conf->file_layouts) {
-		gf_log (this->name, GF_LOG_ERROR,
-			"memory allocation failed :(");
-		goto out;
-	}
+    conf->file_layouts = GF_CALLOC(conf->subvolume_cnt, sizeof(dht_layout_t *),
+                                   gf_dht_mt_dht_layout_t);
+    if (!conf->file_layouts) {
+        goto out;
+    }
 
-	for (i = 0; i < conf->subvolume_cnt; i++) {
-		layout = dht_layout_new (this, 1);
+    for (i = 0; i < conf->subvolume_cnt; i++) {
+        layout = dht_layout_new(this, 1);
 
-		if (!layout) {
-			goto out;
-		}
+        if (!layout) {
+            goto out;
+        }
 
-		layout->preset = 1;
+        layout->preset = 1;
 
-		layout->list[0].xlator = conf->subvolumes[i];
+        layout->list[0].xlator = conf->subvolumes[i];
 
-		conf->file_layouts[i] = layout;
-	}
+        conf->file_layouts[i] = layout;
+    }
 
-	ret = 0;
+    ret = 0;
 out:
-	return ret;
+    return ret;
 }
 
-
 int
-dht_disk_layout_extract (xlator_t *this, dht_layout_t *layout,
-			 int pos, int32_t **disk_layout_p)
+dht_disk_layout_extract(xlator_t *this, dht_layout_t *layout, int pos,
+                        int32_t **disk_layout_p)
 {
-	int      ret = -1;
-	int32_t *disk_layout = NULL;
+    int ret = -1;
+    int32_t *disk_layout = NULL;
+
+    disk_layout = GF_CALLOC(5, sizeof(int), gf_dht_mt_int32_t);
+    if (!disk_layout) {
+        goto out;
+    }
 
-	disk_layout = CALLOC (5, sizeof (int));
-	if (!disk_layout) {
-		gf_log (this->name, GF_LOG_ERROR,
-			"memory allocation failed :(");
-		goto out;
-	}
+    disk_layout[0] = hton32(layout->list[pos].commit_hash);
+    disk_layout[1] = hton32(layout->type);
+    disk_layout[2] = hton32(layout->list[pos].start);
+    disk_layout[3] = hton32(layout->list[pos].stop);
 
-	disk_layout[0] = hton32 (1);
-	disk_layout[1] = hton32 (layout->type);
-	disk_layout[2] = hton32 (layout->list[pos].start);
-	disk_layout[3] = hton32 (layout->list[pos].stop);
+    if (disk_layout_p)
+        *disk_layout_p = disk_layout;
+    else
+        GF_FREE(disk_layout);
 
-	if (disk_layout_p)
-		*disk_layout_p = disk_layout;
-	ret = 0;
+    ret = 0;
 
 out:
-	return ret;
+    return ret;
 }
 
-
 int
-dht_disk_layout_merge (xlator_t *this, dht_layout_t *layout,
-		       int pos, int32_t *disk_layout)
+dht_disk_layout_extract_for_subvol(xlator_t *this, dht_layout_t *layout,
+                                   xlator_t *subvol, int32_t **disk_layout_p)
 {
-	int      cnt = 0;
-	int      type = 0;
-	int      start_off = 0;
-	int      stop_off = 0;
-
-	/* TODO: assert disk_layout_ptr is of required length */
-
-	cnt  = ntoh32 (disk_layout[0]);
-	if (cnt != 1) {
-		gf_log (this->name, GF_LOG_ERROR,
-			"disk layout has invalid count %d", cnt);
-		return -1;
-	}
-
-	/* TODO: assert type is compatible */
-	type      = ntoh32 (disk_layout[1]);
-	start_off = ntoh32 (disk_layout[2]);
-	stop_off  = ntoh32 (disk_layout[3]);
-
-	layout->list[pos].start = start_off;
-	layout->list[pos].stop  = stop_off;
-
-	gf_log (this->name, GF_LOG_DEBUG,
-		"merged to layout: %u - %u (type %d) from %s",
-		start_off, stop_off, type,
-		layout->list[pos].xlator->name);
-
-	return 0;
+    int i = 0;
+
+    for (i = 0; i < layout->cnt; i++) {
+        if (layout->list[i].xlator == subvol)
+            break;
+    }
+
+    if (i == layout->cnt)
+        return -1;
+
+    return dht_disk_layout_extract(this, layout, i, disk_layout_p);
 }
 
+/* Decode one subvolume's raw on-disk layout xattr into layout->list[pos].
+ * Returns -1 for a missing, wrongly sized or unknown-type xattr, else 0. */
+static int
+dht_disk_layout_merge(xlator_t *this, dht_layout_t *layout, int pos,
+                      void *disk_layout_raw, int disk_layout_len)
+{
+    int type = 0;
+    int start_off = 0;
+    int stop_off = 0;
+    int commit_hash = 0;
+    int disk_layout[4];
+
+    if (!disk_layout_raw || disk_layout_len != sizeof(disk_layout)) {
+        gf_smsg(this->name, GF_LOG_CRITICAL, 0, DHT_MSG_LAYOUT_MERGE_FAILED,
+                NULL);
+        return -1;
+    }
+
+    /* Size verified above; a longer xattr would overrun disk_layout. */
+    memcpy(disk_layout, disk_layout_raw, sizeof(disk_layout));
+    type = ntoh32(disk_layout[1]);
+    switch (type) {
+        case DHT_HASH_TYPE_DM_USER:
+            gf_msg_debug(this->name, 0, "found user-set layout");
+            layout->type = type;
+            /* Fall through. */
+        case DHT_HASH_TYPE_DM:
+            break;
+        default:
+            gf_smsg(this->name, GF_LOG_CRITICAL, 0, DHT_MSG_INVALID_DISK_LAYOUT,
+                    "layout=%d", type, NULL);
+            return -1;
+    }
+
+    commit_hash = ntoh32(disk_layout[0]);
+    start_off = ntoh32(disk_layout[2]);
+    stop_off = ntoh32(disk_layout[3]);
+
+    layout->list[pos].commit_hash = commit_hash;
+    layout->list[pos].start = start_off;
+    layout->list[pos].stop = stop_off;
+
+    gf_msg_trace(this->name, 0,
+                 "merged to layout: 0x%x - 0x%x (hash 0x%x, type %d) from %s",
+                 start_off, stop_off, commit_hash, type,
+                 layout->list[pos].xlator->name);
+
+    return 0;
+}
 
 int
-dht_layout_merge (xlator_t *this, dht_layout_t *layout, xlator_t *subvol,
-		  int op_ret, int op_errno, dict_t *xattr)
+dht_layout_merge(xlator_t *this, dht_layout_t *layout, xlator_t *subvol,
+                 int op_ret, int op_errno, dict_t *xattr)
 {
-	int      i     = 0;
-	int      ret   = -1;
-	int      err   = -1;
-	int32_t *disk_layout = NULL;
-
-
-	if (op_ret != 0) {
-		err = op_errno;
-	}
-
-	for (i = 0; i < layout->cnt; i++) {
-		if (layout->list[i].xlator == NULL) {
-			layout->list[i].err    = err;
-			layout->list[i].xlator = subvol;
-			break;
-		}
-	}
-
-	if (op_ret != 0) {
-		ret = 0;
-		goto out;
-	}
-
-	if (xattr) {
-		/* during lookup and not mkdir */
-		ret = dict_get_ptr (xattr, "trusted.glusterfs.dht",
-				    VOID(&disk_layout));
-	}
-
-	if (ret != 0) {
-		layout->list[i].err = -1;
-		gf_log (this->name, GF_LOG_DEBUG,
-			"missing disk layout on %s. err = %d",
-			subvol->name, err);
-		ret = 0;
-		goto out;
-	}
-
-	ret = dht_disk_layout_merge (this, layout, i, disk_layout);
-	if (ret != 0) {
-		gf_log (this->name, GF_LOG_ERROR,
-			"layout merge from subvolume %s failed",
-			subvol->name);
-		goto out;
-	}
-	layout->list[i].err = 0;
+    int i = 0;
+    int ret = -1;
+    int err = -1;
+    void *disk_layout_raw = NULL;
+    int disk_layout_len = 0;
+    dht_conf_t *conf = this->private;
+
+    if (op_ret != 0) {
+        err = op_errno;
+    }
+
+    if (!layout)
+        goto out;
+
+    for (i = 0; i < layout->cnt; i++) {
+        if (layout->list[i].xlator == NULL) {
+            layout->list[i].err = err;
+            layout->list[i].xlator = subvol;
+            break;
+        }
+    }
+
+    if (op_ret != 0) {
+        ret = 0;
+        goto out;
+    }
+
+    if (xattr) {
+        /* during lookup and not mkdir */
+        ret = dict_get_ptr_and_len(xattr, conf->xattr_name, &disk_layout_raw,
+                                   &disk_layout_len);
+    }
+
+    if (ret != 0) {
+        layout->list[i].err = 0;
+        gf_msg_trace(this->name, 0, "Missing disk layout on %s. err = %d",
+                     subvol->name, err);
+        ret = 0;
+        goto out;
+    }
+
+    ret = dht_disk_layout_merge(this, layout, i, disk_layout_raw,
+                                disk_layout_len);
+    if (ret != 0) {
+        gf_smsg(this->name, GF_LOG_WARNING, 0, DHT_MSG_LAYOUT_MERGE_FAILED,
+                "subvolume=%s", subvol->name, NULL);
+        goto out;
+    }
+
+    if (layout->commit_hash == 0) {
+        layout->commit_hash = layout->list[i].commit_hash;
+    } else if (layout->commit_hash != layout->list[i].commit_hash) {
+        layout->commit_hash = DHT_LAYOUT_HASH_INVALID;
+    }
+
+    layout->list[i].err = 0;
 
 out:
-	return ret;
+    return ret;
 }
 
+void
+dht_layout_entry_swap(dht_layout_t *layout, int i, int j)
+{
+    uint32_t start_swap = 0;
+    uint32_t stop_swap = 0;
+    uint32_t commit_hash_swap = 0;
+    xlator_t *xlator_swap = 0;
+    int err_swap = 0;
+
+    start_swap = layout->list[i].start;
+    stop_swap = layout->list[i].stop;
+    xlator_swap = layout->list[i].xlator;
+    err_swap = layout->list[i].err;
+    commit_hash_swap = layout->list[i].commit_hash;
+
+    layout->list[i].start = layout->list[j].start;
+    layout->list[i].stop = layout->list[j].stop;
+    layout->list[i].xlator = layout->list[j].xlator;
+    layout->list[i].err = layout->list[j].err;
+    layout->list[i].commit_hash = layout->list[j].commit_hash;
+
+    layout->list[j].start = start_swap;
+    layout->list[j].stop = stop_swap;
+    layout->list[j].xlator = xlator_swap;
+    layout->list[j].err = err_swap;
+    layout->list[j].commit_hash = commit_hash_swap;
+}
 
 void
-dht_layout_entry_swap (dht_layout_t *layout, int i, int j)
+dht_layout_range_swap(dht_layout_t *layout, int i, int j)
+{
+    uint32_t start_swap = 0;
+    uint32_t stop_swap = 0;
+
+    start_swap = layout->list[i].start;
+    stop_swap = layout->list[i].stop;
+
+    layout->list[i].start = layout->list[j].start;
+    layout->list[i].stop = layout->list[j].stop;
+
+    layout->list[j].start = start_swap;
+    layout->list[j].stop = stop_swap;
+}
+static int64_t
+dht_layout_entry_cmp_volname(dht_layout_t *layout, int i, int j)
 {
-	uint32_t  start_swap = 0;
-	uint32_t  stop_swap = 0;
-	xlator_t *xlator_swap = 0;
-	int       err_swap = 0;
-
-
-	start_swap  = layout->list[i].start;
-	stop_swap   = layout->list[i].stop;
-	xlator_swap = layout->list[i].xlator;
-	err_swap    = layout->list[i].err;
-
-	layout->list[i].start  = layout->list[j].start;
-	layout->list[i].stop   = layout->list[j].stop;
-	layout->list[i].xlator = layout->list[j].xlator;
-	layout->list[i].err    = layout->list[j].err;
-
-	layout->list[j].start  = start_swap;
-	layout->list[j].stop   = stop_swap;
-	layout->list[j].xlator = xlator_swap;
-	layout->list[j].err    = err_swap;
+    return (strcmp(layout->list[i].xlator->name, layout->list[j].xlator->name));
 }
 
-int64_t
-dht_layout_entry_cmp_volname (dht_layout_t *layout, int i, int j)
+gf_boolean_t
+dht_is_subvol_in_layout(dht_layout_t *layout, xlator_t *xlator)
 {
-	return (strcmp (layout->list[i].xlator->name, 
-			layout->list[j].xlator->name));
+    int i = 0;
+
+    for (i = 0; i < layout->cnt; i++) {
+        /* Check if xlator is already part of layout, and layout is
+         * non-zero. */
+        if (!strcmp(layout->list[i].xlator->name, xlator->name)) {
+            if (layout->list[i].start != layout->list[i].stop)
+                return _gf_true;
+            break;
+        }
+    }
+    return _gf_false;
 }
 
-int64_t
-dht_layout_entry_cmp (dht_layout_t *layout, int i, int j)
+static int64_t
+dht_layout_entry_cmp(dht_layout_t *layout, int i, int j)
 {
-	int64_t diff = 0;
+    int64_t diff = 0;
 
-	if (layout->list[i].err || layout->list[j].err)
-		diff = layout->list[i].err - layout->list[j].err;
-	else
-		diff = (int64_t) layout->list[i].start
-			- (int64_t) layout->list[j].start;
+    /* swap zero'ed out layouts to front, if needed */
+    if (!layout->list[j].start && !layout->list[j].stop) {
+        diff = (int64_t)layout->list[i].stop - (int64_t)layout->list[j].stop;
+        goto out;
+    }
+    diff = (int64_t)layout->list[i].start - (int64_t)layout->list[j].start;
 
-	return diff;
+out:
+    return diff;
 }
 
-
 int
-dht_layout_sort (dht_layout_t *layout)
+dht_layout_sort(dht_layout_t *layout)
 {
-	int       i = 0;
-	int       j = 0;
-	int64_t   ret = 0;
+    int i = 0;
+    int j = 0;
+    int64_t ret = 0;
 
-	/* TODO: O(n^2) -- bad bad */
+    /* TODO: O(n^2) -- bad bad */
 
-	for (i = 0; i < layout->cnt - 1; i++) {
-		for (j = i + 1; j < layout->cnt; j++) {
-			ret = dht_layout_entry_cmp (layout, i, j);
-			if (ret > 0)
-				dht_layout_entry_swap (layout, i, j);
-		}
-	}
+    for (i = 0; i < layout->cnt - 1; i++) {
+        for (j = i + 1; j < layout->cnt; j++) {
+            ret = dht_layout_entry_cmp(layout, i, j);
+            if (ret > 0)
+                dht_layout_entry_swap(layout, i, j);
+        }
+    }
 
-	return 0;
+    return 0;
+}
+
+void
+dht_layout_sort_volname(dht_layout_t *layout)
+{
+    int i = 0;
+    int j = 0;
+    int64_t ret = 0;
+
+    /* TODO: O(n^2) -- bad bad */
+
+    for (i = 0; i < layout->cnt - 1; i++) {
+        for (j = i + 1; j < layout->cnt; j++) {
+            ret = dht_layout_entry_cmp_volname(layout, i, j);
+            if (ret > 0)
+                dht_layout_entry_swap(layout, i, j);
+        }
+    }
+}
+
+void
+dht_layout_anomalies(xlator_t *this, loc_t *loc, dht_layout_t *layout,
+                     uint32_t *holes_p, uint32_t *overlaps_p,
+                     uint32_t *missing_p, uint32_t *down_p, uint32_t *misc_p,
+                     uint32_t *no_space_p)
+{
+    uint32_t overlaps = 0;
+    uint32_t missing = 0;
+    uint32_t down = 0;
+    uint32_t misc = 0;
+    uint32_t hole_cnt = 0;
+    uint32_t overlap_cnt = 0;
+    int i = 0;
+    uint32_t prev_stop = 0;
+    uint32_t last_stop = 0;
+    char is_virgin = 1;
+    uint32_t no_space = 0;
+
+    /* This function scans through the layout spread of a directory to
+       check if there are any anomalies. Prior to calling this function
+       the layout entries should be sorted in the ascending order.
+
+       If the layout entry has err != 0
+            then increment the corresponding anomaly.
+       else
+            if (start of the current layout entry > stop + 1 of previous
+               non erroneous layout entry)
+                    then it indicates a hole in the layout
+            if (start of the current layout entry < stop + 1 of previous
+                non erroneous layout entry)
+                     then it indicates an overlap in the layout
+    */
+    last_stop = layout->list[0].start - 1;
+    prev_stop = last_stop;
+
+    for (i = 0; i < layout->cnt; i++) {
+        switch (layout->list[i].err) {
+            case -1:
+            case ENOENT:
+            case ESTALE:
+                missing++;
+                continue;
+            case ENOTCONN:
+                down++;
+                continue;
+            case ENOSPC:
+                no_space++;
+                continue;
+            case 0:
+                /* if err == 0 and start == stop, then it is a
+                 * non-participating subvolume (spread-cnt): do not check
+                 * it for anomalies. If start != stop, break out to the
+                 * hole/overlap checks below. */
+                if (layout->list[i].start == layout->list[i].stop) {
+                    continue;
+                }
+                break;
+            default:
+                misc++;
+                continue;
+        }
+
+        is_virgin = 0;
+
+        if ((prev_stop + 1) < layout->list[i].start) {
+            hole_cnt++;
+        }
+
+        if ((prev_stop + 1) > layout->list[i].start) {
+            overlap_cnt++;
+            overlaps += ((prev_stop + 1) - layout->list[i].start);
+        }
+        prev_stop = layout->list[i].stop;
+    }
+
+    if ((last_stop - prev_stop) || is_virgin)
+        hole_cnt++;
+
+    if (holes_p)
+        *holes_p = hole_cnt;
+
+    if (overlaps_p)
+        *overlaps_p = overlap_cnt;
+
+    if (missing_p)
+        *missing_p = missing;
+
+    if (down_p)
+        *down_p = down;
+
+    if (misc_p)
+        *misc_p = misc;
+
+    if (no_space_p)
+        *no_space_p = no_space;
 }
 
 int
-dht_layout_sort_volname (dht_layout_t *layout)
+dht_layout_missing_dirs(dht_layout_t *layout)
 {
-	int       i = 0;
-	int       j = 0;
-	int64_t   ret = 0;
+    int i = 0, missing = 0;
 
-	/* TODO: O(n^2) -- bad bad */
+    if (layout == NULL)
+        goto out;
 
-	for (i = 0; i < layout->cnt - 1; i++) {
-		for (j = i + 1; j < layout->cnt; j++) {
-			ret = dht_layout_entry_cmp_volname (layout, i, j);
-			if (ret > 0)
-				dht_layout_entry_swap (layout, i, j);
-		}
-	}
+    for (i = 0; i < layout->cnt; i++) {
+        if ((layout->list[i].err == ENOENT) ||
+            ((layout->list[i].err == -1) && (layout->list[i].start == 0) &&
+             (layout->list[i].stop == 0))) {
+            missing++;
+        }
+    }
 
-	return 0;
+out:
+    return missing;
 }
 
-
 int
-dht_layout_anomalies (xlator_t *this, loc_t *loc, dht_layout_t *layout,
-		      uint32_t *holes_p, uint32_t *overlaps_p,
-		      uint32_t *missing_p, uint32_t *down_p, uint32_t *misc_p)
+dht_layout_normalize(xlator_t *this, loc_t *loc, dht_layout_t *layout)
 {
-	dht_conf_t *conf = NULL;
-	uint32_t    holes    = 0;
-	uint32_t    overlaps = 0;
-	uint32_t    missing  = 0;
-	uint32_t    down     = 0;
-	uint32_t    misc     = 0;
-	uint32_t    hole_cnt = 0;
-	uint32_t    overlap_cnt = 0;
-	int         i = 0;
-	int         ret = 0;
-	uint32_t    prev_stop = 0;
-	uint32_t    last_stop = 0;
-	char        is_virgin = 1;
-
-
-	conf = this->private;
-
-	/* TODO: explain WTF is happening */
-
-	last_stop = layout->list[0].start - 1;
-	prev_stop = last_stop;
-
-	for (i = 0; i < layout->cnt; i++) {
-		if (layout->list[i].err) {
-			switch (layout->list[i].err) {
-			case -1:
-			case ENOENT:
-				missing++;
-				break;
-			case ENOTCONN:
-				down++;
-				break;
-			default:
-				misc++;
-			}
-			continue;
-		}
-
-		is_virgin = 0;
-
-		if ((prev_stop + 1) < layout->list[i].start) {
-			hole_cnt++;
-			holes += (layout->list[i].start - (prev_stop + 1));
-		}
-
-		if ((prev_stop + 1) > layout->list[i].start) {
-			overlap_cnt++;
-			overlaps += ((prev_stop + 1) - layout->list[i].start);
-		}
-		prev_stop = layout->list[i].stop;
-	}
-
-	if ((last_stop - prev_stop) || is_virgin)
-	    hole_cnt++;
-	holes += (last_stop - prev_stop);
-
-	if (holes_p)
-		*holes_p = hole_cnt;
-
-	if (overlaps_p)
-		*overlaps_p = overlap_cnt;
-
-	if (missing_p)
-		*missing_p = missing;
-
-	if (down_p)
-		*down_p = down;
-
-	if (misc_p)
-		*misc_p = misc;
-
-	return ret;
-}
+    int ret = 0;
+    uint32_t holes = 0;
+    uint32_t overlaps = 0;
+    uint32_t missing = 0;
+    uint32_t down = 0;
+    uint32_t misc = 0, missing_dirs = 0;
+    char gfid[GF_UUID_BUF_SIZE] = {0};
+
+    ret = dht_layout_sort(layout);
+    if (ret == -1) {
+        gf_smsg(this->name, GF_LOG_WARNING, 0, DHT_MSG_LAYOUT_SORT_FAILED,
+                NULL);
+        goto out;
+    }
+
+    gf_uuid_unparse(loc->gfid, gfid);
+
+    dht_layout_anomalies(this, loc, layout, &holes, &overlaps, &missing, &down,
+                         &misc, NULL);
+
+    if (holes || overlaps) {
+        if (missing == layout->cnt) {
+            gf_msg_debug(this->name, 0,
+                         "Directory %s looked up first time"
+                         " gfid = %s",
+                         loc->path, gfid);
+        } else {
+            gf_smsg(this->name, GF_LOG_INFO, 0, DHT_MSG_ANOMALIES_INFO,
+                    "path=%s", loc->path, "gfid=%s", gfid, "holes=%d", holes,
+                    "overlaps=%d", overlaps, NULL);
+        }
+        ret = -1;
+    }
+
+    if (ret >= 0) {
+        missing_dirs = dht_layout_missing_dirs(layout);
+        /* TODO During DHT selfheal rewrite (almost) find a better place
+         * to detect this - probably in dht_layout_anomalies()
+         */
+        if (missing_dirs > 0)
+            ret += missing_dirs;
+    }
 
+out:
+    return ret;
+}
 
 int
-dht_layout_normalize (xlator_t *this, loc_t *loc, dht_layout_t *layout)
+dht_dir_has_layout(dict_t *xattr, char *name)
 {
-	int          ret   = 0;
-	int          i = 0;
-	uint32_t     holes = 0;
-	uint32_t     overlaps = 0;
-	uint32_t     missing = 0;
-	uint32_t     down = 0;
-	uint32_t     misc = 0;
-
-
-	ret = dht_layout_sort (layout);
-	if (ret == -1) {
-		gf_log (this->name, GF_LOG_ERROR,
-			"sort failed?! how the ....");
-		goto out;
-	}
-
-	ret = dht_layout_anomalies (this, loc, layout,
-				    &holes, &overlaps,
-				    &missing, &down, &misc);
-	if (ret == -1) {
-		gf_log (this->name, GF_LOG_ERROR,
-			"error while finding anomalies in %s -- not good news",
-			loc->path);
-		goto out;
-	}
-
-	if (holes || overlaps) {
-		if (missing == layout->cnt) {
-			gf_log (this->name, GF_LOG_WARNING,
-				"directory %s looked up first time",
-				loc->path);
-		} else {
-			gf_log (this->name, GF_LOG_ERROR,
-				"found anomalies in %s. holes=%d overlaps=%d",
-				loc->path, holes, overlaps);
-		}
-		ret = 1;
-	}
-
-	for (i = 0; i < layout->cnt; i++) {
-	/* TODO During DHT selfheal rewrite (almost) find a better place to 
-	 * detect this - probably in dht_layout_anomalies() 
-	 */
-		if (layout->list[i].err == ENOENT) {
-			gf_log (this->name, GF_LOG_WARNING,
-				"path=%s ENOENT - directory entry"
-				" should be created in selfheal", loc->path);
-			ret = 1;
-			break;
-		}
-	}
+    void *disk_layout_raw = NULL;
 
-out:
-	return ret;
+    return dict_get_ptr(xattr, name, &disk_layout_raw);
 }
 
+/* Check subvol's entry in layout against the on-disk layout held in xattr.
+ * Returns 1 on a mismatch or when subvol has no entry, -1 when the disk
+ * layout is missing or malformed for an entry that expects one, else 0. */
+int
+dht_layout_dir_mismatch(xlator_t *this, dht_layout_t *layout, xlator_t *subvol,
+                        loc_t *loc, dict_t *xattr)
+{
+    int idx = 0;
+    int pos = -1;
+    int ret = 0;
+    int err = 0;
+    int dict_ret = 0;
+    int32_t disk_layout[4];
+    void *disk_layout_raw = NULL;
+    int disk_layout_len = 0;
+    uint32_t start_off = -1;
+    uint32_t stop_off = -1;
+    uint32_t commit_hash = -1;
+    dht_conf_t *conf = this->private;
+    char gfid[GF_UUID_BUF_SIZE] = {0};
+
+    if (loc && loc->inode)
+        gf_uuid_unparse(loc->inode->gfid, gfid);
+
+    for (idx = 0; idx < layout->cnt; idx++) {
+        if (layout->list[idx].xlator == subvol) {
+            pos = idx;
+            break;
+        }
+    }
+
+    if (pos == -1) {
+        if (loc) {
+            gf_msg_debug(this->name, 0, "%s - no layout info for subvolume %s",
+                         loc->path, subvol->name);
+        }
+        ret = 1;
+        goto out;
+    }
+
+    err = layout->list[pos].err;
+
+    if (!xattr) {
+        if (err == 0) {
+            if (loc) {
+                gf_smsg(this->name, GF_LOG_INFO, 0, DHT_MSG_XATTR_DICT_NULL,
+                        "path=%s", loc->path, NULL);
+            } else {
+                gf_smsg(this->name, GF_LOG_INFO, 0, DHT_MSG_XATTR_DICT_NULL,
+                        "path not found", NULL);
+            }
+            ret = -1;
+        }
+        goto out;
+    }
+
+    /* A wrongly sized xattr is treated like a missing one, never copied. */
+    dict_ret = dict_get_ptr_and_len(xattr, conf->xattr_name, &disk_layout_raw,
+                                    &disk_layout_len);
+    if (dict_ret < 0 || disk_layout_len != sizeof(disk_layout)) {
+        if (err == 0 && layout->list[pos].stop) {
+            if (loc) {
+                gf_smsg(this->name, GF_LOG_INFO, 0, DHT_MSG_DISK_LAYOUT_MISSING,
+                        "path=%s", loc->path, "gfid=%s", gfid, NULL);
+            } else {
+                gf_smsg(this->name, GF_LOG_INFO, 0, DHT_MSG_DISK_LAYOUT_MISSING,
+                        "path not found, gfid=%s", gfid, NULL);
+            }
+            ret = -1;
+        }
+        goto out;
+    }
+
+    memcpy(disk_layout, disk_layout_raw, sizeof(disk_layout));
+    start_off = ntoh32(disk_layout[2]);
+    stop_off = ntoh32(disk_layout[3]);
+    commit_hash = ntoh32(disk_layout[0]);
+
+    if ((layout->list[pos].start != start_off) ||
+        (layout->list[pos].stop != stop_off) ||
+        (layout->list[pos].commit_hash != commit_hash)) {
+        gf_smsg(this->name, GF_LOG_INFO, 0, DHT_MSG_LAYOUT_INFO, "subvol=%s",
+                layout->list[pos].xlator->name, "inode-layout:start=0x%x",
+                layout->list[pos].start, "inode-layout:stop=0x%x",
+                layout->list[pos].stop, "layout-commit-hash=0x%x; ",
+                layout->list[pos].commit_hash, "disk-layout:start-off=0x%x",
+                start_off, "disk-layout:stop-off=0x%x", stop_off,
+                "commit-hash=0x%x", commit_hash, NULL);
+        ret = 1;
+    }
+out:
+    return ret;
+}
 
 int
-dht_layout_dir_mismatch (xlator_t *this, dht_layout_t *layout, xlator_t *subvol,
-			 loc_t *loc, dict_t *xattr)
+dht_layout_preset(xlator_t *this, xlator_t *subvol, inode_t *inode)
 {
-	int       idx = 0;
-	int       pos = -1;
-	int       ret = -1;
-	int32_t  *disk_layout = NULL;
-	int32_t   count = -1;
-	uint32_t  start_off = -1;
-	uint32_t  stop_off = -1;
-
-
-	for (idx = 0; idx < layout->cnt; idx++) {
-		if (layout->list[idx].xlator == subvol) {
-			pos = idx;
-			break;
-		}
-	}
-	
-	if (pos == -1) {
-		gf_log (this->name, GF_LOG_DEBUG,
-			"%s - no layout info for subvolume %s",
-			loc->path, subvol->name);
-		ret = 1;
-		goto out;
-	}
-	
-	if (xattr == NULL) {
-		gf_log (this->name, GF_LOG_ERROR,
-			"%s - xattr dictionary is NULL",
-			loc->path);
-		ret = -1;
-		goto out;
-	}
-
-	ret = dict_get_ptr (xattr, "trusted.glusterfs.dht",
-			    VOID(&disk_layout));
-	
-	if (ret < 0) {
-		gf_log (this->name, GF_LOG_ERROR,
-			"%s - disk layout missing", loc->path);
-		ret = -1;
-		goto out;
-	} 
-
-	count  = ntoh32 (disk_layout[0]);
-	if (count != 1) {
-		gf_log (this->name, GF_LOG_ERROR,
-			"%s - disk layout has invalid count %d",
-			loc->path, count);
-		ret = -1;
-		goto out;
-	}
-
-	start_off = ntoh32 (disk_layout[2]);
-	stop_off  = ntoh32 (disk_layout[3]);
-	
-	if ((layout->list[pos].start != start_off)
-	    || (layout->list[pos].stop != stop_off)) {
-		gf_log (this->name, GF_LOG_DEBUG,
-			"subvol: %s; inode layout - %"PRId32" - %"PRId32"; "
-			"disk layout - %"PRId32" - %"PRId32,
-			layout->list[pos].xlator->name,
-			layout->list[pos].start, layout->list[pos].stop,
-			start_off, stop_off);
-		ret = 1;
-	} else {
-		ret = 0;
-	}
+    dht_layout_t *layout = NULL;
+    int ret = -1;
+    dht_conf_t *conf = NULL;
+
+    conf = this->private;
+    if (!conf)
+        goto out;
+
+    layout = dht_layout_for_subvol(this, subvol);
+    if (!layout) {
+        gf_smsg(this->name, GF_LOG_INFO, 0, DHT_MSG_SUBVOL_NO_LAYOUT_INFO,
+                "subvolume=%s", subvol ? subvol->name : "<nil>", NULL);
+        ret = -1;
+        goto out;
+    }
+
+    gf_msg_debug(this->name, 0, "file = %s, subvol = %s",
+                 uuid_utoa(inode->gfid), subvol ? subvol->name : "<nil>");
+
+    LOCK(&conf->layout_lock);
+    {
+        dht_inode_ctx_layout_set(inode, this, layout);
+    }
+
+    UNLOCK(&conf->layout_lock);
+
+    ret = 0;
 out:
-	return ret;
+    return ret;
 }
 
+int
+dht_layout_index_for_subvol(dht_layout_t *layout, xlator_t *subvol)
+{
+    int i = 0, ret = -1;
+
+    for (i = 0; i < layout->cnt; i++) {
+        if (layout->list[i].xlator == subvol) {
+            ret = i;
+            break;
+        }
+    }
+
+    return ret;
+}
diff --git a/xlators/cluster/dht/src/dht-linkfile.c b/xlators/cluster/dht/src/dht-linkfile.c
index b7f73b40916..89ec6cca56e 100644
--- a/xlators/cluster/dht/src/dht-linkfile.c
+++ b/xlators/cluster/dht/src/dht-linkfile.c
@@ -1,224 +1,328 @@
 /*
-   Copyright (c) 2008-2009 Z RESEARCH, Inc. <http://www.zresearch.com>
-   This file is part of GlusterFS.
-
-   GlusterFS is free software; you can redistribute it and/or modify
-   it under the terms of the GNU General Public License as published
-   by the Free Software Foundation; either version 3 of the License,
-   or (at your option) any later version.
-
-   GlusterFS is distributed in the hope that it will be useful, but
-   WITHOUT ANY WARRANTY; without even the implied warranty of
-   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
-   General Public License for more details.
-
-   You should have received a copy of the GNU General Public License
-   along with this program.  If not, see
-   <http://www.gnu.org/licenses/>.
+  Copyright (c) 2008-2012 Red Hat, Inc. <http://www.redhat.com>
+  This file is part of GlusterFS.
+
+  This file is licensed to you under your choice of the GNU Lesser
+  General Public License, version 3 or any later version (LGPLv3 or
+  later), or the GNU General Public License, version 2 (GPLv2), in all
+  cases as published by the Free Software Foundation.
 */
 
-#ifndef _CONFIG_H
-#define _CONFIG_H
-#include "config.h"
-#endif
+#include <glusterfs/compat.h>
+#include "dht-common.h"
 
+static int
+dht_linkfile_lookup_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+                        int op_ret, int op_errno, inode_t *inode,
+                        struct iatt *stbuf, dict_t *xattr,
+                        struct iatt *postparent)
+{
+    char is_linkfile = 0;
+    dht_conf_t *conf = NULL;
+    dht_local_t *local = NULL;
+    xlator_t *prev = NULL;
+    char gfid[GF_UUID_BUF_SIZE] = {0};
+
+    local = frame->local;
+    prev = cookie;
+    conf = this->private;
+
+    if (op_ret)
+        goto out;
+
+    gf_uuid_unparse(local->loc.gfid, gfid);
+
+    is_linkfile = check_is_linkfile(inode, stbuf, xattr, conf->link_xattr_name);
+    if (!is_linkfile)
+        gf_smsg(this->name, GF_LOG_WARNING, 0, DHT_MSG_NOT_LINK_FILE_ERROR,
+                "name=%s", prev->name, "path=%s", local->loc.path, "gfid=%s",
+                gfid, NULL);
+out:
+    local->linkfile.linkfile_cbk(frame, cookie, this, op_ret, op_errno, inode,
+                                 stbuf, postparent, postparent, xattr);
+    return 0;
+}
 
-#include "glusterfs.h"
-#include "xlator.h"
-#include "compat.h"
-#include "dht-common.h"
+static int
+dht_linkfile_create_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+                        int op_ret, int op_errno, inode_t *inode,
+                        struct iatt *stbuf, struct iatt *preparent,
+                        struct iatt *postparent, dict_t *xdata)
+{
+    dht_local_t *local = NULL;
+    xlator_t *subvol = NULL;
+    dict_t *xattrs = NULL;
+    dht_conf_t *conf = NULL;
+    int ret = -1;
+
+    local = frame->local;
+
+    if (!op_ret)
+        local->linked = _gf_true; /* mknod of the linkfile succeeded */
+
+    FRAME_SU_UNDO(frame, dht_local_t); /* pairs with FRAME_SU_DO at wind */
+
+    if (op_ret && (op_errno == EEXIST)) { /* mknod hit an existing entry */
+        conf = this->private;
+        subvol = cookie; /* the subvolume the mknod was wound to */
+        if (!subvol)
+            goto out;
+        xattrs = dict_new();
+        if (!xattrs)
+            goto out;
+        ret = dict_set_uint32(xattrs, conf->link_xattr_name, 256);
+        if (ret) {
+            gf_smsg(this->name, GF_LOG_ERROR, 0, DHT_MSG_DICT_SET_FAILED,
+                    "name=%s", conf->link_xattr_name, NULL);
+            goto out;
+        }
+
+        STACK_WIND_COOKIE(frame, dht_linkfile_lookup_cbk, subvol, subvol,
+                          subvol->fops->lookup, &local->linkfile.loc, xattrs);
+        if (xattrs)
+            dict_unref(xattrs);
+        return 0; /* dht_linkfile_lookup_cbk now owns the unwind */
+    }
+out:
+    local->linkfile.linkfile_cbk(frame, cookie, this, op_ret, op_errno, inode,
+                                 stbuf, preparent, postparent, xdata);
+    if (xattrs)
+        dict_unref(xattrs);
+    return 0;
+}
 
+int
+dht_linkfile_create(call_frame_t *frame, fop_mknod_cbk_t linkfile_cbk,
+                    xlator_t *this, xlator_t *tovol, xlator_t *fromvol,
+                    loc_t *loc)
+{
+    dht_local_t *local = NULL;
+    dict_t *dict = NULL;
+    int need_unref = 0;
+    int ret = 0;
+    dht_conf_t *conf = this->private;
+    char gfid[GF_UUID_BUF_SIZE] = {0};
+
+    local = frame->local;
+    local->linkfile.linkfile_cbk = linkfile_cbk;
+    local->linkfile.srcvol = tovol;
+    loc_copy(&local->linkfile.loc, loc);
+
+    local->linked = _gf_false;
+
+    dict = local->params;
+    if (!dict) {
+        dict = dict_new();
+        if (!dict)
+            goto out;
+        need_unref = 1;
+    }
+
+    if (!gf_uuid_is_null(local->gfid)) {
+        gf_uuid_unparse(local->gfid, gfid);
+
+        ret = dict_set_gfuuid(dict, "gfid-req", local->gfid, true);
+        if (ret)
+            gf_smsg("dht-linkfile", GF_LOG_INFO, 0, DHT_MSG_DICT_SET_FAILED,
+                    "path=%s", loc->path, "gfid=%s", gfid, NULL);
+    } else {
+        gf_uuid_unparse(loc->gfid, gfid);
+    }
+
+    ret = dict_set_str(dict, GLUSTERFS_INTERNAL_FOP_KEY, "yes");
+    if (ret)
+        gf_smsg("dht-linkfile", GF_LOG_INFO, 0, DHT_MSG_DICT_SET_FAILED,
+                "path=%s", loc->path, "key=%s", GLUSTERFS_INTERNAL_FOP_KEY,
+                "gfid=%s", gfid, NULL);
+
+    ret = dict_set_str(dict, conf->link_xattr_name, tovol->name);
+
+    if (ret < 0) {
+        gf_smsg(frame->this->name, GF_LOG_INFO, 0, DHT_MSG_CREATE_LINK_FAILED,
+                "path=%s", loc->path, "gfid=%s", gfid, NULL);
+        goto out;
+    }
+
+    local->link_subvol = fromvol;
+    /* Always create as root:root. dht_linkfile_attr_heal fixes the
+     * ownership */
+    FRAME_SU_DO(frame, dht_local_t);
+    STACK_WIND_COOKIE(frame, dht_linkfile_create_cbk, fromvol, fromvol,
+                      fromvol->fops->mknod, loc, S_IFREG | DHT_LINKFILE_MODE, 0,
+                      0, dict);
+
+    if (need_unref && dict)
+        dict_unref(dict);
+
+    return 0;
+out:
+    local->linkfile.linkfile_cbk(frame, frame->this, frame->this, -1, ENOMEM,
+                                 loc->inode, NULL, NULL, NULL, NULL);
 
+    if (need_unref && dict)
+        dict_unref(dict);
+
+    return 0;
+}
 
 int
-dht_linkfile_xattr_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
-			int op_ret, int op_errno)
+dht_linkfile_unlink_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+                        int32_t op_ret, int32_t op_errno,
+                        struct iatt *preparent, struct iatt *postparent,
+                        dict_t *xdata)
 {
-	dht_local_t *local = NULL;
+    dht_local_t *local = NULL;
+    xlator_t *subvol = NULL;
+    char gfid[GF_UUID_BUF_SIZE] = {0};
 
+    local = frame->local;
+    subvol = cookie;
 
-	local = frame->local;
-	local->linkfile.linkfile_cbk (frame, cookie, this, op_ret, op_errno,
-				      local->linkfile.inode,
-				      &local->linkfile.stbuf);
+    if (op_ret == -1) {
+        gf_uuid_unparse(local->loc.gfid, gfid);
+        gf_smsg(this->name, GF_LOG_INFO, op_errno, DHT_MSG_UNLINK_FAILED,
+                "path=%s", local->loc.path, "gfid=%s", gfid, "subvolume=%s",
+                subvol->name, NULL);
+    }
 
-	return 0;
-}
+    DHT_STACK_DESTROY(frame);
 
+    return 0;
+}
 
 int
-dht_linkfile_create_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
-			 int op_ret, int op_errno,
-			 inode_t *inode, struct stat *stbuf)
+dht_linkfile_unlink(call_frame_t *frame, xlator_t *this, xlator_t *subvol,
+                    loc_t *loc)
 {
-	dht_local_t  *local = NULL;
-	call_frame_t *prev = NULL;
-	dict_t       *xattr = NULL;
-	data_t       *str_data = NULL;
-	int           ret = -1;
-
-	local = frame->local;
-	prev  = cookie;
-
-	if (op_ret == -1)
-		goto err;
-
-	xattr = get_new_dict ();
-	if (!xattr) {
-		gf_log (this->name, GF_LOG_ERROR,
-			"memory allocation failed :(");
-		op_errno = ENOMEM;
-		goto err;
-	}
-
-	local->linkfile.xattr = dict_ref (xattr);
-	local->linkfile.inode = inode_ref (inode);
-
-	str_data = str_to_data (local->linkfile.srcvol->name);
-	if (!str_data) {
-		gf_log (this->name, GF_LOG_ERROR,
-			"memory allocation failed :(");
-		op_errno = ENOMEM;
-		goto err;
-	}
-
-	ret = dict_set (xattr, "trusted.glusterfs.dht.linkto", str_data);
-	if (ret < 0) {
-		gf_log (this->name, GF_LOG_ERROR,
-			"failed to initialize linkfile data");
-		op_errno = EINVAL;
-	}
-	str_data = NULL;
-
-	local->linkfile.stbuf = *stbuf;
-
-	STACK_WIND (frame, dht_linkfile_xattr_cbk,
-		    prev->this, prev->this->fops->setxattr,
-		    &local->linkfile.loc, local->linkfile.xattr, 0);
-
-	return 0;
+    call_frame_t *unlink_frame = NULL;
+    dht_local_t *unlink_local = NULL;
+
+    unlink_frame = copy_frame(frame);
+    if (!unlink_frame) {
+        goto err;
+    }
+
+    /* Using non-fop value here, as anyways, 'local->fop' is not used in
+       this particular case */
+    unlink_local = dht_local_init(unlink_frame, loc, NULL, GF_FOP_MAXVALUE);
+    if (!unlink_local) {
+        goto err;
+    }
+
+    STACK_WIND_COOKIE(unlink_frame, dht_linkfile_unlink_cbk, subvol, subvol,
+                      subvol->fops->unlink, &unlink_local->loc, 0, NULL);
 
+    return 0;
 err:
-	if (str_data) {
-		data_destroy (str_data);
-		str_data = NULL;
-	}
-
-	local->linkfile.linkfile_cbk (frame, cookie, this,
-				      op_ret, op_errno, inode, stbuf);
-	return 0;
-}
+    if (unlink_frame)
+        DHT_STACK_DESTROY(unlink_frame);
 
+    return -1;
+}
 
-int
-dht_linkfile_create (call_frame_t *frame, fop_mknod_cbk_t linkfile_cbk,
-		     xlator_t *tovol, xlator_t *fromvol, loc_t *loc)
+xlator_t *
+dht_linkfile_subvol(xlator_t *this, inode_t *inode, struct iatt *stbuf,
+                    dict_t *xattr)
 {
-	dht_local_t *local = NULL;
+    dht_conf_t *conf = NULL;
+    xlator_t *subvol = NULL;
+    void *volname = NULL;
+    int i = 0, ret = 0;
 
+    conf = this->private;
 
-	local = frame->local;
-	local->linkfile.linkfile_cbk = linkfile_cbk;
-	local->linkfile.srcvol = tovol;
-	loc_copy (&local->linkfile.loc, loc);
+    if (!xattr)
+        goto out;
 
-	STACK_WIND (frame, dht_linkfile_create_cbk,
-		    fromvol, fromvol->fops->mknod, loc,
-		    S_IFREG | DHT_LINKFILE_MODE, 0);
+    ret = dict_get_ptr(xattr, conf->link_xattr_name, &volname);
 
-	return 0;
-}
+    if ((-1 == ret) || !volname)
+        goto out;
 
+    for (i = 0; i < conf->subvolume_cnt; i++) {
+        if (strcmp(conf->subvolumes[i]->name, (char *)volname) == 0) {
+            subvol = conf->subvolumes[i];
+            break;
+        }
+    }
 
-int
-dht_linkfile_unlink_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
-			 int32_t op_ret, int32_t op_errno)
+out:
+    return subvol;
+}
+
+static int
+dht_linkfile_setattr_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+                         int op_ret, int op_errno, struct iatt *statpre,
+                         struct iatt *statpost, dict_t *xdata)
 {
-	dht_local_t   *local = NULL;
-	call_frame_t  *prev = NULL;
-	xlator_t      *subvol = NULL;
+    dht_local_t *local = NULL;
+    loc_t *loc = NULL;
 
-	local = frame->local;
-	prev = cookie;
-	subvol = prev->this;
+    local = frame->local;
+    loc = &local->loc;
 
-	if (op_ret == -1) {
-		gf_log (this->name, GF_LOG_WARNING,
-			"unlinking linkfile %s on %s failed (%s)",
-			local->loc.path, subvol->name, strerror (op_errno));
-	}
+    if (op_ret)
+        gf_smsg(this->name, GF_LOG_ERROR, op_errno, DHT_MSG_SETATTR_FAILED,
+                "path=%s", (loc->path ? loc->path : "NULL"), "gfid=%s",
+                uuid_utoa(local->gfid), NULL);
 
-	DHT_STACK_DESTROY (frame);
+    DHT_STACK_DESTROY(frame);
 
-	return 0;
+    return 0;
 }
 
-
 int
-dht_linkfile_unlink (call_frame_t *frame, xlator_t *this,
-		     xlator_t *subvol, loc_t *loc)
+dht_linkfile_attr_heal(call_frame_t *frame, xlator_t *this)
 {
-	call_frame_t *unlink_frame = NULL;
-	dht_local_t  *unlink_local = NULL;
-
-	unlink_frame = copy_frame (frame);
-	if (!unlink_frame) {
-		gf_log (this->name, GF_LOG_ERROR,
-			"memory allocation failed :(");
-		goto err;
-	}
-
-	unlink_local = dht_local_init (unlink_frame);
-	if (!unlink_local) {
-		gf_log (this->name, GF_LOG_ERROR,
-			"memory allocation failed :(");
-		goto err;
-	}
-
-	loc_copy (&unlink_local->loc, loc);
-
-	STACK_WIND (unlink_frame, dht_linkfile_unlink_cbk,
-		    subvol, subvol->fops->unlink,
-		    &unlink_local->loc);
-
-	return 0;
-err:
-	if (unlink_frame)
-		DHT_STACK_DESTROY (unlink_frame);
+    int ret = -1;
+    call_frame_t *copy = NULL;
+    dht_local_t *local = NULL;
+    dht_local_t *copy_local = NULL;
+    xlator_t *subvol = NULL;
+    struct iatt stbuf = {
+        0,
+    };
+    dict_t *xattr = NULL;
 
-	return -1;
-}
+    local = frame->local;
 
+    GF_VALIDATE_OR_GOTO("dht", local, out);
+    GF_VALIDATE_OR_GOTO("dht", local->link_subvol, out);
 
-xlator_t *
-dht_linkfile_subvol (xlator_t *this, inode_t *inode, struct stat *stbuf,
-		     dict_t *xattr)
-{
-	dht_conf_t *conf = NULL;
-	xlator_t   *subvol = NULL;
-	void       *volname = NULL;
-	int         i = 0, ret = 0;
+    if (local->stbuf.ia_type == IA_INVAL)
+        return 0;
+
+    DHT_MARK_FOP_INTERNAL(xattr);
 
+    gf_uuid_copy(local->loc.gfid, local->stbuf.ia_gfid);
 
-	conf = this->private;
+    copy = copy_frame(frame);
 
-	if (!xattr)
-		goto out;
+    if (!copy)
+        goto out;
 
-	ret = dict_get_ptr (xattr, "trusted.glusterfs.dht.linkto", &volname);
+    copy_local = dht_local_init(copy, &local->loc, NULL, 0);
 
-	if ((-1 == ret) || !volname)
-		goto out;
+    if (!copy_local)
+        goto out;
 
-	for (i = 0; i < conf->subvolume_cnt; i++) {
-		if (strcmp (conf->subvolumes[i]->name, (char *)volname) == 0) {
-			subvol = conf->subvolumes[i];
-			break;
-		}
-	}
+    stbuf = local->stbuf;
+    subvol = local->link_subvol;
 
+    copy->local = copy_local;
+
+    FRAME_SU_DO(copy, dht_local_t);
+
+    STACK_WIND(copy, dht_linkfile_setattr_cbk, subvol, subvol->fops->setattr,
+               &copy_local->loc, &stbuf, (GF_SET_ATTR_UID | GF_SET_ATTR_GID),
+               xattr);
+    ret = 0;
 out:
-	return subvol;
-}
+    if ((ret < 0) && (copy))
+        DHT_STACK_DESTROY(copy);
 
+    if (xattr)
+        dict_unref(xattr);
 
+    return ret;
+}
diff --git a/xlators/cluster/dht/src/dht-lock.c b/xlators/cluster/dht/src/dht-lock.c
new file mode 100644
index 00000000000..638821ccee5
--- /dev/null
+++ b/xlators/cluster/dht/src/dht-lock.c
@@ -0,0 +1,1392 @@
+/*
+  Copyright (c) 2016 Red Hat, Inc. <http://www.redhat.com>
+  This file is part of GlusterFS.
+
+  This file is licensed to you under your choice of the GNU Lesser
+  General Public License, version 3 or any later version (LGPLv3 or
+  later), or the GNU General Public License, version 2 (GPLv2), in all
+  cases as published by the Free Software Foundation.
+*/
+
+#include "dht-lock.h"
+
+static char *
+dht_lock_asprintf(dht_lock_t *lock)
+{
+    char *lk_buf = NULL;
+    char gfid[GF_UUID_BUF_SIZE] = {
+        0,
+    };
+
+    if (lock == NULL)
+        goto out;
+
+    uuid_utoa_r(lock->loc.gfid, gfid);
+
+    gf_asprintf(&lk_buf, "%s:%s", lock->xl->name, gfid);
+
+out:
+    return lk_buf;
+}
+
+static void
+dht_log_lk_array(char *name, gf_loglevel_t log_level, dht_lock_t **lk_array,
+                 int count)
+{
+    int i = 0;
+    char *lk_buf = NULL;
+
+    if ((lk_array == NULL) || (count == 0))
+        goto out;
+
+    for (i = 0; i < count; i++) {
+        lk_buf = dht_lock_asprintf(lk_array[i]);
+        if (!lk_buf)
+            goto out;
+
+        gf_smsg(name, log_level, 0, DHT_MSG_LK_ARRAY_INFO, "index=%d", i,
+                "lk_buf=%s", lk_buf, NULL);
+        GF_FREE(lk_buf);
+    }
+
+out:
+    return;
+}
+
+static void
+dht_lock_stack_destroy(call_frame_t *lock_frame, dht_lock_type_t lk)
+{
+    dht_local_t *local = NULL;
+
+    local = lock_frame->local;
+
+    if (lk == DHT_INODELK) {
+        local->lock[0].layout.my_layout.locks = NULL;
+        local->lock[0].layout.my_layout.lk_count = 0;
+    } else {
+        local->lock[0].ns.directory_ns.locks = NULL;
+        local->lock[0].ns.directory_ns.lk_count = 0;
+    }
+
+    DHT_STACK_DESTROY(lock_frame);
+    return;
+}
+
+static void
+dht_lock_free(dht_lock_t *lock)
+{
+    if (lock == NULL)
+        goto out;
+
+    loc_wipe(&lock->loc);
+    GF_FREE(lock->domain);
+    GF_FREE(lock->basename);
+    mem_put(lock);
+
+out:
+    return;
+}
+
+static void
+dht_set_lkowner(dht_lock_t **lk_array, int count, gf_lkowner_t *lkowner)
+{
+    int i = 0;
+
+    if (!lk_array || !lkowner)
+        goto out;
+
+    for (i = 0; i < count; i++) {
+        lk_array[i]->lk_owner = *lkowner;
+    }
+
+out:
+    return;
+}
+
+static int
+dht_lock_request_cmp(const void *val1, const void *val2)
+{
+    dht_lock_t *lock1 = NULL;
+    dht_lock_t *lock2 = NULL;
+    int ret = -1; /* returned unchanged if validation jumps to out */
+
+    lock1 = *(dht_lock_t **)val1; /* qsort passes pointers to array slots */
+    lock2 = *(dht_lock_t **)val2;
+
+    GF_VALIDATE_OR_GOTO("dht-locks", lock1, out);
+    GF_VALIDATE_OR_GOTO("dht-locks", lock2, out);
+
+    ret = strcmp(lock1->xl->name, lock2->xl->name); /* primary key: xl name */
+
+    if (ret == 0) { /* same subvolume name: order by gfid */
+        ret = gf_uuid_compare(lock1->loc.gfid, lock2->loc.gfid);
+    }
+
+out:
+    return ret;
+}
+
+static int
+dht_lock_order_requests(dht_lock_t **locks, int count)
+{
+    int ret = -1;
+
+    if (!locks || !count)
+        goto out;
+
+    qsort(locks, count, sizeof(*locks), dht_lock_request_cmp);
+    ret = 0;
+
+out:
+    return ret;
+}
+
+void
+dht_lock_array_free(dht_lock_t **lk_array, int count)
+{
+    int i = 0;
+    dht_lock_t *lock = NULL;
+
+    if (lk_array == NULL)
+        goto out;
+
+    for (i = 0; i < count; i++) {
+        lock = lk_array[i];
+        lk_array[i] = NULL;
+        dht_lock_free(lock);
+    }
+
+out:
+    return;
+}
+
+int32_t
+dht_lock_count(dht_lock_t **lk_array, int lk_count)
+{
+    int i = 0, locked = 0;
+
+    if ((lk_array == NULL) || (lk_count == 0))
+        goto out;
+
+    for (i = 0; i < lk_count; i++) {
+        if (lk_array[i]->locked)
+            locked++;
+    }
+out:
+    return locked;
+}
+
+static call_frame_t *
+dht_lock_frame(call_frame_t *parent_frame)
+{
+    call_frame_t *lock_frame = NULL;
+
+    lock_frame = copy_frame(parent_frame);
+    if (lock_frame == NULL)
+        goto out;
+
+    set_lk_owner_from_ptr(&lock_frame->root->lk_owner, parent_frame->root);
+
+out:
+    return lock_frame;
+}
+
+dht_lock_t *
+dht_lock_new(xlator_t *this, xlator_t *xl, loc_t *loc, short type,
+             const char *domain, const char *basename,
+             dht_reaction_type_t do_on_failure)
+{
+    dht_conf_t *conf = NULL;
+    dht_lock_t *lock = NULL;
+
+    conf = this->private;
+
+    lock = mem_get0(conf->lock_pool);
+    if (lock == NULL)
+        goto out;
+
+    lock->xl = xl;
+    lock->type = type;
+    lock->do_on_failure = do_on_failure;
+
+    lock->domain = gf_strdup(domain);
+    if (lock->domain == NULL) {
+        dht_lock_free(lock);
+        lock = NULL;
+        goto out;
+    }
+
+    if (basename) {
+        lock->basename = gf_strdup(basename);
+        if (lock->basename == NULL) {
+            dht_lock_free(lock);
+            lock = NULL;
+            goto out;
+        }
+    }
+
+    /* Fill only inode and gfid.
+       posix and protocol/server give preference to pargfid/basename over
+       gfid/inode for resolution if all the three parameters of loc_t are
+       present. I want to avoid the following hypothetical situation:
+
+       1. rebalance did a lookup on a dentry and got a gfid.
+       2. rebalance acquires lock on loc_t which was filled with gfid and
+          path (pargfid/bname) from step 1.
+       3. somebody deleted and recreated the same file
+       4. rename on the same path acquires lock on loc_t which now points
+          to a different inode (and hence gets the lock).
+       5. rebalance continues to migrate file (note that not all fops done
+          by rebalance during migration are inode/gfid based Eg., unlink)
+       6. rename continues.
+    */
+    lock->loc.inode = inode_ref(loc->inode);
+    loc_gfid(loc, lock->loc.gfid);
+
+out:
+    return lock;
+}
+
+static int
+dht_local_entrylk_init(call_frame_t *frame, dht_lock_t **lk_array, int lk_count,
+                       fop_entrylk_cbk_t entrylk_cbk)
+{
+    int ret = -1;
+    dht_local_t *local = NULL;
+
+    local = frame->local;
+
+    if (local == NULL) {
+        local = dht_local_init(frame, NULL, NULL, 0);
+    }
+
+    if (local == NULL) {
+        goto out;
+    }
+
+    local->lock[0].ns.directory_ns.entrylk_cbk = entrylk_cbk;
+    local->lock[0].ns.directory_ns.locks = lk_array;
+    local->lock[0].ns.directory_ns.lk_count = lk_count;
+
+    ret = dht_lock_order_requests(local->lock[0].ns.directory_ns.locks,
+                                  local->lock[0].ns.directory_ns.lk_count);
+    if (ret < 0)
+        goto out;
+
+    ret = 0;
+out:
+    return ret;
+}
+
+static void
+dht_entrylk_done(call_frame_t *lock_frame)
+{
+    fop_entrylk_cbk_t entrylk_cbk = NULL;
+    call_frame_t *main_frame = NULL;
+    dht_local_t *local = NULL;
+
+    local = lock_frame->local;
+    main_frame = local->main_frame;
+
+    local->lock[0].ns.directory_ns.locks = NULL;
+    local->lock[0].ns.directory_ns.lk_count = 0;
+
+    entrylk_cbk = local->lock[0].ns.directory_ns.entrylk_cbk;
+    local->lock[0].ns.directory_ns.entrylk_cbk = NULL;
+
+    entrylk_cbk(main_frame, NULL, main_frame->this,
+                local->lock[0].ns.directory_ns.op_ret,
+                local->lock[0].ns.directory_ns.op_errno, NULL);
+
+    dht_lock_stack_destroy(lock_frame, DHT_ENTRYLK);
+    return;
+}
+
+static int32_t
+dht_unlock_entrylk_done(call_frame_t *frame, void *cookie, xlator_t *this,
+                        int32_t op_ret, int32_t op_errno, dict_t *xdata)
+{
+    dht_local_t *local = NULL;
+    char gfid[GF_UUID_BUF_SIZE] = {0};
+
+    local = frame->local;
+    gf_uuid_unparse(local->lock[0].ns.directory_ns.locks[0]->loc.inode->gfid,
+                    gfid);
+
+    if (op_ret < 0) {
+        gf_smsg(this->name, GF_LOG_WARNING, op_errno,
+                DHT_MSG_UNLOCK_GFID_FAILED, "gfid=%s", gfid,
+                "DHT_LAYOUT_HEAL_DOMAIN", NULL);
+    }
+
+    DHT_STACK_DESTROY(frame);
+    return 0;
+}
+
+static int32_t
+dht_unlock_entrylk_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+                       int32_t op_ret, int32_t op_errno, dict_t *xdata)
+{
+    dht_local_t *local = NULL;
+    int lk_index = 0, call_cnt = 0;
+    char gfid[GF_UUID_BUF_SIZE] = {0};
+
+    lk_index = (long)cookie; /* lock-array index passed as the wind cookie */
+
+    local = frame->local;
+
+    if (op_ret < 0) { /* gfid text is only needed for this warning */
+        uuid_utoa_r(local->lock[0].ns.directory_ns.locks[lk_index]->loc.gfid,
+                    gfid);
+        gf_smsg(this->name, GF_LOG_WARNING, op_errno, DHT_MSG_UNLOCKING_FAILED,
+                "name=%s",
+                local->lock[0].ns.directory_ns.locks[lk_index]->xl->name,
+                "gfid=%s", gfid, NULL);
+    } else {
+        local->lock[0].ns.directory_ns.locks[lk_index]->locked = 0;
+    }
+
+    call_cnt = dht_frame_return(frame);
+    if (is_last_call(call_cnt)) { /* every wound unlock has replied */
+        dht_entrylk_done(frame);
+    }
+
+    return 0;
+}
+
+static int32_t
+dht_unlock_entrylk(call_frame_t *frame, dht_lock_t **lk_array, int lk_count,
+                   fop_entrylk_cbk_t entrylk_cbk)
+{
+    dht_local_t *local = NULL;
+    int ret = -1, i = 0;
+    call_frame_t *lock_frame = NULL;
+    int call_cnt = 0;
+
+    GF_VALIDATE_OR_GOTO("dht-locks", frame, done);
+    GF_VALIDATE_OR_GOTO(frame->this->name, lk_array, done);
+    GF_VALIDATE_OR_GOTO(frame->this->name, entrylk_cbk, done);
+
+    call_cnt = dht_lock_count(lk_array, lk_count);
+    if (call_cnt == 0) {
+        ret = 0;
+        goto done;
+    }
+
+    lock_frame = dht_lock_frame(frame);
+    if (lock_frame == NULL) {
+        gf_smsg(frame->this->name, GF_LOG_WARNING, 0,
+                DHT_MSG_ALLOC_FRAME_FAILED_NOT_UNLOCKING_FOLLOWING_ENTRYLKS,
+                NULL);
+
+        dht_log_lk_array(frame->this->name, GF_LOG_WARNING, lk_array, lk_count);
+        goto done;
+    }
+
+    ret = dht_local_entrylk_init(lock_frame, lk_array, lk_count, entrylk_cbk);
+    if (ret < 0) {
+        gf_smsg(frame->this->name, GF_LOG_WARNING, 0,
+                DHT_MSG_LOCAL_LOCKS_STORE_FAILED_UNLOCKING_FOLLOWING_ENTRYLK,
+                NULL);
+
+        dht_log_lk_array(frame->this->name, GF_LOG_WARNING, lk_array, lk_count);
+
+        goto done;
+    }
+
+    local = lock_frame->local;
+    local->main_frame = frame;
+    local->call_cnt = call_cnt;
+
+    for (i = 0; i < local->lock[0].ns.directory_ns.lk_count; i++) {
+        if (!local->lock[0].ns.directory_ns.locks[i]->locked)
+            continue;
+
+        lock_frame->root
+            ->lk_owner = local->lock[0].ns.directory_ns.locks[i]->lk_owner;
+        STACK_WIND_COOKIE(
+            lock_frame, dht_unlock_entrylk_cbk, (void *)(long)i,
+            local->lock[0].ns.directory_ns.locks[i]->xl,
+            local->lock[0].ns.directory_ns.locks[i]->xl->fops->entrylk,
+            local->lock[0].ns.directory_ns.locks[i]->domain,
+            &local->lock[0].ns.directory_ns.locks[i]->loc,
+            local->lock[0].ns.directory_ns.locks[i]->basename, ENTRYLK_UNLOCK,
+            ENTRYLK_WRLCK, NULL);
+        if (!--call_cnt)
+            break;
+    }
+
+    return 0;
+
+done:
+    if (lock_frame)
+        dht_lock_stack_destroy(lock_frame, DHT_ENTRYLK);
+
+    /* no locks acquired, invoke entrylk_cbk */
+    if (ret == 0)
+        entrylk_cbk(frame, NULL, frame->this, 0, 0, NULL);
+
+    return ret;
+}
+
+int32_t
+dht_unlock_entrylk_wrapper(call_frame_t *frame, dht_elock_wrap_t *entrylk)
+{
+    dht_local_t *local = NULL, *lock_local = NULL;
+    call_frame_t *lock_frame = NULL;
+    char pgfid[GF_UUID_BUF_SIZE] = {0};
+    int ret = 0;
+
+    local = frame->local;
+
+    if (!entrylk || !entrylk->locks)
+        goto out;
+
+    gf_uuid_unparse(local->loc.parent->gfid, pgfid);
+
+    lock_frame = copy_frame(frame);
+    if (lock_frame == NULL) {
+        gf_smsg(frame->this->name, GF_LOG_WARNING, ENOMEM,
+                DHT_MSG_COPY_FRAME_FAILED, "pgfid=%s", pgfid, "name=%s",
+                local->loc.name, "path=%s", local->loc.path, NULL);
+        goto done;
+    }
+
+    lock_local = dht_local_init(lock_frame, NULL, NULL, 0);
+    if (lock_local == NULL) {
+        gf_smsg(frame->this->name, GF_LOG_WARNING, ENOMEM,
+                DHT_MSG_CREATE_FAILED, "local", "pgfid=%s", pgfid, "name=%s",
+                local->loc.name, "path=%s", local->loc.path, NULL);
+        goto done;
+    }
+
+    lock_frame->local = lock_local;
+
+    lock_local->lock[0].ns.directory_ns.locks = entrylk->locks;
+    lock_local->lock[0].ns.directory_ns.lk_count = entrylk->lk_count;
+    entrylk->locks = NULL;
+    entrylk->lk_count = 0;
+
+    ret = dht_unlock_entrylk(
+        lock_frame, lock_local->lock[0].ns.directory_ns.locks,
+        lock_local->lock[0].ns.directory_ns.lk_count, dht_unlock_entrylk_done);
+    if (ret)
+        goto done;
+
+    lock_frame = NULL;
+
+done:
+    if (lock_frame != NULL) {
+        DHT_STACK_DESTROY(lock_frame);
+    }
+
+out:
+    return 0;
+}
+
+static int
+dht_entrylk_cleanup_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+                        int32_t op_ret, int32_t op_errno, dict_t *xdata)
+{
+    dht_entrylk_done(frame);
+    return 0;
+}
+
+static void
+dht_entrylk_cleanup(call_frame_t *lock_frame)
+{
+    dht_lock_t **lk_array = NULL;
+    int lk_count = 0, lk_acquired = 0;
+    dht_local_t *local = NULL;
+
+    local = lock_frame->local;
+
+    lk_array = local->lock[0].ns.directory_ns.locks;
+    lk_count = local->lock[0].ns.directory_ns.lk_count;
+
+    lk_acquired = dht_lock_count(lk_array, lk_count);
+    if (lk_acquired != 0) {
+        dht_unlock_entrylk(lock_frame, lk_array, lk_count,
+                           dht_entrylk_cleanup_cbk);
+    } else {
+        dht_entrylk_done(lock_frame);
+    }
+
+    return;
+}
+
+static int32_t
+dht_blocking_entrylk_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+                         int32_t op_ret, int32_t op_errno, dict_t *xdata)
+{
+    int lk_index = 0;
+    int i = 0;
+    dht_local_t *local = NULL;
+
+    lk_index = (long)cookie;
+
+    local = frame->local;
+    if (op_ret == 0) {
+        local->lock[0].ns.directory_ns.locks[lk_index]->locked = _gf_true;
+    } else {
+        switch (op_errno) {
+            case ESTALE:
+            case ENOENT:
+                if (local->lock[0]
+                        .ns.directory_ns.locks[lk_index]
+                        ->do_on_failure != IGNORE_ENOENT_ESTALE) {
+                    local->lock[0].ns.directory_ns.op_ret = -1;
+                    local->lock[0].ns.directory_ns.op_errno = op_errno;
+                    goto cleanup;
+                }
+                break;
+            default:
+                local->lock[0].ns.directory_ns.op_ret = -1;
+                local->lock[0].ns.directory_ns.op_errno = op_errno;
+                goto cleanup;
+        }
+    }
+
+    if (lk_index == (local->lock[0].ns.directory_ns.lk_count - 1)) {
+        for (i = 0; (i < local->lock[0].ns.directory_ns.lk_count) &&
+                    (!local->lock[0].ns.directory_ns.locks[i]->locked);
+             i++)
+            ;
+
+        if (i == local->lock[0].ns.directory_ns.lk_count) {
+            local->lock[0].ns.directory_ns.op_ret = -1;
+            local->lock[0].ns.directory_ns.op_errno = op_errno;
+        }
+
+        dht_entrylk_done(frame);
+    } else {
+        dht_blocking_entrylk_rec(frame, ++lk_index);
+    }
+
+    return 0;
+
+cleanup:
+    dht_entrylk_cleanup(frame);
+
+    return 0;
+}
+
+void
+dht_blocking_entrylk_rec(call_frame_t *frame, int i)
+{
+    dht_local_t *local = NULL;
+
+    local = frame->local;
+
+    STACK_WIND_COOKIE(
+        frame, dht_blocking_entrylk_cbk, (void *)(long)i,
+        local->lock[0].ns.directory_ns.locks[i]->xl,
+        local->lock[0].ns.directory_ns.locks[i]->xl->fops->entrylk,
+        local->lock[0].ns.directory_ns.locks[i]->domain,
+        &local->lock[0].ns.directory_ns.locks[i]->loc,
+        local->lock[0].ns.directory_ns.locks[i]->basename, ENTRYLK_LOCK,
+        ENTRYLK_WRLCK, NULL);
+
+    return;
+}
+
+int
+dht_blocking_entrylk(call_frame_t *frame, dht_lock_t **lk_array, int lk_count,
+                     fop_entrylk_cbk_t entrylk_cbk)
+{
+    int ret = -1;
+    call_frame_t *lock_frame = NULL;
+    dht_local_t *local = NULL;
+
+    GF_VALIDATE_OR_GOTO("dht-locks", frame, out);
+    GF_VALIDATE_OR_GOTO(frame->this->name, lk_array, out);
+    GF_VALIDATE_OR_GOTO(frame->this->name, entrylk_cbk, out);
+
+    lock_frame = dht_lock_frame(frame);
+    if (lock_frame == NULL)
+        goto out;
+
+    ret = dht_local_entrylk_init(lock_frame, lk_array, lk_count, entrylk_cbk);
+    if (ret < 0) {
+        goto out;
+    }
+
+    dht_set_lkowner(lk_array, lk_count, &lock_frame->root->lk_owner);
+
+    local = lock_frame->local;
+    local->main_frame = frame;
+
+    dht_blocking_entrylk_rec(lock_frame, 0);
+
+    return 0;
+out:
+    if (lock_frame)
+        dht_lock_stack_destroy(lock_frame, DHT_ENTRYLK);
+
+    return -1;
+}
+
+static int
+dht_local_inodelk_init(call_frame_t *frame, dht_lock_t **lk_array, int lk_count,
+                       fop_inodelk_cbk_t inodelk_cbk)
+{
+    int ret = -1;
+    dht_local_t *local = NULL;
+    /* Store the lock request in frame->local, creating the local if unset. */
+    local = frame->local;
+
+    if (local == NULL) {
+        local = dht_local_init(frame, NULL, NULL, 0);
+    }
+
+    if (local == NULL) {
+        goto out;
+    }
+
+    local->lock[0].layout.my_layout.inodelk_cbk = inodelk_cbk;
+    local->lock[0].layout.my_layout.locks = lk_array;
+    local->lock[0].layout.my_layout.lk_count = lk_count;
+    /* Presumably puts the requests in a fixed order -- confirm in helper. */
+    ret = dht_lock_order_requests(local->lock[0].layout.my_layout.locks,
+                                  local->lock[0].layout.my_layout.lk_count);
+    if (ret < 0)
+        goto out;
+    /* Returns 0 on success, negative if the local or the ordering failed. */
+    ret = 0;
+out:
+    return ret;
+}
+
+static void
+dht_inodelk_done(call_frame_t *lock_frame)
+{
+    fop_inodelk_cbk_t inodelk_cbk = NULL;
+    call_frame_t *main_frame = NULL;
+    dht_local_t *local = NULL;
+    /* Report the recorded lock result on main_frame, then free lock_frame. */
+    local = lock_frame->local;
+    main_frame = local->main_frame;
+    /* Detach the array; presumably so the destroy below does not free it. */
+    local->lock[0].layout.my_layout.locks = NULL;
+    local->lock[0].layout.my_layout.lk_count = 0;
+
+    inodelk_cbk = local->lock[0].layout.my_layout.inodelk_cbk;
+    local->lock[0].layout.my_layout.inodelk_cbk = NULL;
+    /* The cookie and xdata handed to the stored callback are always NULL. */
+    inodelk_cbk(main_frame, NULL, main_frame->this,
+                local->lock[0].layout.my_layout.op_ret,
+                local->lock[0].layout.my_layout.op_errno, NULL);
+
+    dht_lock_stack_destroy(lock_frame, DHT_INODELK);
+    return;
+}
+
+static int32_t
+dht_unlock_inodelk_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+                       int32_t op_ret, int32_t op_errno, dict_t *xdata)
+{
+    dht_local_t *local = NULL;
+    int lk_index = 0, call_cnt = 0;
+    char gfid[GF_UUID_BUF_SIZE] = {0};
+    /* Reply to one unlock; cookie carries the index into the lock array. */
+    lk_index = (long)cookie;
+
+    local = frame->local;
+    if (op_ret < 0) {
+        uuid_utoa_r(local->lock[0].layout.my_layout.locks[lk_index]->loc.gfid,
+                    gfid);
+        /* A failed unlock is only logged; 'locked' is left set. */
+        gf_smsg(this->name, GF_LOG_WARNING, op_errno, DHT_MSG_UNLOCKING_FAILED,
+                "name=%s",
+                local->lock[0].layout.my_layout.locks[lk_index]->xl->name,
+                "gfid=%s", gfid, NULL);
+    } else {
+        local->lock[0].layout.my_layout.locks[lk_index]->locked = 0;
+    }
+    /* The last outstanding reply finishes the whole unlock operation. */
+    call_cnt = dht_frame_return(frame);
+    if (is_last_call(call_cnt)) {
+        dht_inodelk_done(frame);
+    }
+
+    return 0;
+}
+
+static int32_t
+dht_unlock_inodelk_done(call_frame_t *frame, void *cookie, xlator_t *this,
+                        int32_t op_ret, int32_t op_errno, dict_t *xdata)
+{
+    dht_local_t *local = NULL;
+    char gfid[GF_UUID_BUF_SIZE] = {0};
+    /* Completion of the wrapper's unlock: log a failure, destroy the frame. */
+    local = frame->local;
+    gf_uuid_unparse(local->lock[0].layout.my_layout.locks[0]->loc.inode->gfid,
+                    gfid);
+    /* NOTE(review): gfid is formatted even on success; assumes inode is set. */
+    if (op_ret < 0) {
+        gf_smsg(this->name, GF_LOG_WARNING, op_errno,
+                DHT_MSG_UNLOCK_GFID_FAILED, "DHT_LAYOUT_HEAL_DOMAIN gfid=%s",
+                gfid, NULL);
+    }
+
+    DHT_STACK_DESTROY(frame);
+    return 0;
+}
+
+int32_t
+dht_unlock_inodelk(call_frame_t *frame, dht_lock_t **lk_array, int lk_count,
+                   fop_inodelk_cbk_t inodelk_cbk)
+{
+    dht_local_t *local = NULL;
+    struct gf_flock flock = {
+        0,
+    };
+    int ret = -1, i = 0;
+    call_frame_t *lock_frame = NULL;
+    int call_cnt = 0;
+    /* Wind an F_UNLCK inodelk for every entry of lk_array marked locked. */
+    GF_VALIDATE_OR_GOTO("dht-locks", frame, done);
+    GF_VALIDATE_OR_GOTO(frame->this->name, lk_array, done);
+    GF_VALIDATE_OR_GOTO(frame->this->name, inodelk_cbk, done);
+    /* A zero count ends at 'done', which calls inodelk_cbk with success. */
+    call_cnt = dht_lock_count(lk_array, lk_count);
+    if (call_cnt == 0) {
+        ret = 0;
+        goto done;
+    }
+
+    lock_frame = dht_lock_frame(frame);
+    if (lock_frame == NULL) {
+        gf_smsg(frame->this->name, GF_LOG_WARNING, 0,
+                DHT_MSG_ALLOC_FRAME_FAILED_NOT_UNLOCKING_FOLLOWING_ENTRYLKS,
+                NULL);
+
+        dht_log_lk_array(frame->this->name, GF_LOG_WARNING, lk_array, lk_count);
+        goto done;
+    }
+
+    ret = dht_local_inodelk_init(lock_frame, lk_array, lk_count, inodelk_cbk);
+    if (ret < 0) {
+        gf_smsg(frame->this->name, GF_LOG_WARNING, 0,
+                DHT_MSG_LOCAL_LOCKS_STORE_FAILED_UNLOCKING_FOLLOWING_ENTRYLK,
+                NULL);
+
+        dht_log_lk_array(frame->this->name, GF_LOG_WARNING, lk_array, lk_count);
+
+        goto done;
+    }
+
+    local = lock_frame->local;
+    local->main_frame = frame;
+    local->call_cnt = call_cnt;
+    /* Only l_type is set; every other flock field stays zero. */
+    flock.l_type = F_UNLCK;
+
+    for (i = 0; i < local->lock[0].layout.my_layout.lk_count; i++) {
+        if (!local->lock[0].layout.my_layout.locks[i]->locked)
+            continue;
+        /* Unlock with the lk_owner stored in this lock entry. */
+        lock_frame->root
+            ->lk_owner = local->lock[0].layout.my_layout.locks[i]->lk_owner;
+        STACK_WIND_COOKIE(
+            lock_frame, dht_unlock_inodelk_cbk, (void *)(long)i,
+            local->lock[0].layout.my_layout.locks[i]->xl,
+            local->lock[0].layout.my_layout.locks[i]->xl->fops->inodelk,
+            local->lock[0].layout.my_layout.locks[i]->domain,
+            &local->lock[0].layout.my_layout.locks[i]->loc, F_SETLK, &flock,
+            NULL);
+        if (!--call_cnt)
+            break; /* all wound; the last reply may destroy lock_frame */
+    }
+
+    return 0;
+
+done:
+    if (lock_frame)
+        dht_lock_stack_destroy(lock_frame, DHT_INODELK);
+
+    /* no locks acquired, invoke inodelk_cbk */
+    if (ret == 0)
+        inodelk_cbk(frame, NULL, frame->this, 0, 0, NULL);
+    /* A non-zero ret means setup failed and inodelk_cbk was not called. */
+    return ret;
+}
+
+int32_t
+dht_unlock_inodelk_wrapper(call_frame_t *frame, dht_ilock_wrap_t *inodelk)
+{
+    dht_local_t *local = NULL, *lock_local = NULL;
+    call_frame_t *lock_frame = NULL;
+    char pgfid[GF_UUID_BUF_SIZE] = {0};
+    int ret = 0;
+    /* Unlock inodelk->locks on a copy of frame; frame gets no callback. */
+    local = frame->local;
+
+    if (!inodelk || !inodelk->locks)
+        goto out;
+
+    gf_uuid_unparse(local->loc.parent->gfid, pgfid);
+
+    lock_frame = copy_frame(frame);
+    if (lock_frame == NULL) {
+        gf_smsg(frame->this->name, GF_LOG_WARNING, ENOMEM,
+                DHT_MSG_COPY_FRAME_FAILED, "pgfid=%s", pgfid, "name=%s",
+                local->loc.name, "path=%s", local->loc.path, NULL);
+        goto done;
+    }
+
+    lock_local = dht_local_init(lock_frame, NULL, NULL, 0);
+    if (lock_local == NULL) {
+        gf_smsg(frame->this->name, GF_LOG_WARNING, ENOMEM,
+                DHT_MSG_CREATE_FAILED, "local", "gfid=%s", pgfid, "name=%s",
+                local->loc.name, "path=%s", local->loc.path, NULL);
+        goto done;
+    }
+
+    lock_frame->local = lock_local;
+    /* The lock array moves from *inodelk to lock_local; *inodelk is reset. */
+    lock_local->lock[0].layout.my_layout.locks = inodelk->locks;
+    lock_local->lock[0].layout.my_layout.lk_count = inodelk->lk_count;
+    inodelk->locks = NULL;
+    inodelk->lk_count = 0;
+
+    ret = dht_unlock_inodelk(
+        lock_frame, lock_local->lock[0].layout.my_layout.locks,
+        lock_local->lock[0].layout.my_layout.lk_count, dht_unlock_inodelk_done);
+
+    if (ret)
+        goto done;
+    /* On success dht_unlock_inodelk_done destroys the copied frame later. */
+    lock_frame = NULL;
+
+done:
+    if (lock_frame != NULL) {
+        DHT_STACK_DESTROY(lock_frame);
+    }
+out:
+    return 0; /* failures are only logged, never returned */
+}
+
+static int
+dht_inodelk_cleanup_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+                        int32_t op_ret, int32_t op_errno, dict_t *xdata)
+{
+    dht_inodelk_done(frame); /* cleanup unlock finished; result ignored */
+    return 0;
+}
+
+static void
+dht_inodelk_cleanup(call_frame_t *lock_frame)
+{
+    dht_lock_t **lk_array = NULL;
+    int lk_count = 0, lk_acquired = 0;
+    dht_local_t *local = NULL;
+    /* After a failed acquire: release whatever is held, then report. */
+    local = lock_frame->local;
+
+    lk_array = local->lock[0].layout.my_layout.locks;
+    lk_count = local->lock[0].layout.my_layout.lk_count;
+    /* NOTE(review): the return value of dht_unlock_inodelk is ignored. */
+    lk_acquired = dht_lock_count(lk_array, lk_count);
+    if (lk_acquired != 0) {
+        dht_unlock_inodelk(lock_frame, lk_array, lk_count,
+                           dht_inodelk_cleanup_cbk);
+    } else {
+        dht_inodelk_done(lock_frame);
+    }
+
+    return;
+}
+
+static int32_t
+dht_nonblocking_inodelk_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+                            int32_t op_ret, int32_t op_errno, dict_t *xdata)
+{
+    dht_local_t *local = NULL;
+    int lk_index = 0, call_cnt = 0;
+    char gfid[GF_UUID_BUF_SIZE] = {0};
+    /* Reply to one non-blocking inodelk; cookie is the lock's array index. */
+    local = frame->local;
+    lk_index = (long)cookie;
+
+    if (op_ret == -1) {
+        local->lock[0].layout.my_layout.op_ret = -1;
+        local->lock[0].layout.my_layout.op_errno = op_errno;
+        /* local is already dereferenced above; only the entry is checked. */
+        if (local->lock[0].layout.my_layout.locks[lk_index]) {
+            uuid_utoa_r(local->lock[0]
+                            .layout.my_layout.locks[lk_index]
+                            ->loc.inode->gfid,
+                        gfid);
+
+            gf_msg_debug(
+                this->name, op_errno,
+                "inodelk failed on gfid: %s "
+                "subvolume: %s",
+                gfid,
+                local->lock[0].layout.my_layout.locks[lk_index]->xl->name);
+        }
+
+        goto out;
+    }
+
+    local->lock[0].layout.my_layout.locks[lk_index]->locked = _gf_true;
+    /* On the last reply: any failure unlocks what was taken, else done. */
+out:
+    call_cnt = dht_frame_return(frame);
+    if (is_last_call(call_cnt)) {
+        if (local->lock[0].layout.my_layout.op_ret < 0) {
+            dht_inodelk_cleanup(frame);
+            return 0;
+        }
+
+        dht_inodelk_done(frame);
+    }
+
+    return 0;
+}
+
+int
+dht_nonblocking_inodelk(call_frame_t *frame, dht_lock_t **lk_array,
+                        int lk_count, fop_inodelk_cbk_t inodelk_cbk)
+{
+    struct gf_flock flock = {
+        0,
+    };
+    int i = 0, ret = 0;
+    dht_local_t *local = NULL;
+    call_frame_t *lock_frame = NULL;
+    /* Wind an F_SETLK inodelk for every entry of lk_array in one pass. */
+    GF_VALIDATE_OR_GOTO("dht-locks", frame, out);
+    GF_VALIDATE_OR_GOTO(frame->this->name, lk_array, out);
+    GF_VALIDATE_OR_GOTO(frame->this->name, inodelk_cbk, out);
+
+    lock_frame = dht_lock_frame(frame);
+    if (lock_frame == NULL)
+        goto out;
+
+    ret = dht_local_inodelk_init(lock_frame, lk_array, lk_count, inodelk_cbk);
+    if (ret < 0) {
+        goto out;
+    }
+
+    dht_set_lkowner(lk_array, lk_count, &lock_frame->root->lk_owner);
+
+    local = lock_frame->local;
+    local->main_frame = frame;
+    /* One reply is expected for each lock request. */
+    local->call_cnt = lk_count;
+
+    for (i = 0; i < lk_count; i++) {
+        flock.l_type = local->lock[0].layout.my_layout.locks[i]->type;
+
+        STACK_WIND_COOKIE(
+            lock_frame, dht_nonblocking_inodelk_cbk, (void *)(long)i,
+            local->lock[0].layout.my_layout.locks[i]->xl,
+            local->lock[0].layout.my_layout.locks[i]->xl->fops->inodelk,
+            local->lock[0].layout.my_layout.locks[i]->domain,
+            &local->lock[0].layout.my_layout.locks[i]->loc, F_SETLK, &flock,
+            NULL);
+    }
+    /* 0: all requests wound. -1 (below): setup failed, nothing was wound. */
+    return 0;
+
+out:
+    if (lock_frame)
+        dht_lock_stack_destroy(lock_frame, DHT_INODELK);
+
+    return -1;
+}
+
+static int32_t
+dht_blocking_inodelk_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+                         int32_t op_ret, int32_t op_errno, dict_t *xdata)
+{
+    int lk_index = 0;
+    int i = 0;
+    dht_local_t *local = NULL;
+    char gfid[GF_UUID_BUF_SIZE] = {
+        0,
+    };
+    dht_reaction_type_t reaction = 0;
+    /* Reply for lock lk_index: tolerate the error, fail, or take the next. */
+    lk_index = (long)cookie;
+
+    local = frame->local;
+    if (op_ret == 0) {
+        local->lock[0].layout.my_layout.locks[lk_index]->locked = _gf_true;
+    } else {
+        switch (op_errno) {
+            case ESTALE:
+            case ENOENT:
+                reaction = local->lock[0]
+                               .layout.my_layout.locks[lk_index]
+                               ->do_on_failure;
+                if ((reaction != IGNORE_ENOENT_ESTALE) &&
+                    (reaction != IGNORE_ENOENT_ESTALE_EIO)) {
+                    gf_uuid_unparse(local->lock[0]
+                                        .layout.my_layout.locks[lk_index]
+                                        ->loc.gfid,
+                                    gfid);
+                    local->lock[0].layout.my_layout.op_ret = -1;
+                    local->lock[0].layout.my_layout.op_errno = op_errno;
+                    gf_smsg(this->name, GF_LOG_ERROR, op_errno,
+                            DHT_MSG_INODELK_FAILED, "subvol=%s",
+                            local->lock[0]
+                                .layout.my_layout.locks[lk_index]
+                                ->xl->name,
+                            "gfid=%s", gfid, NULL);
+                    goto cleanup;
+                }
+                break; /* this lock's do_on_failure tolerates the error */
+            case EIO:
+                reaction = local->lock[0]
+                               .layout.my_layout.locks[lk_index]
+                               ->do_on_failure;
+                if (reaction != IGNORE_ENOENT_ESTALE_EIO) {
+                    gf_uuid_unparse(local->lock[0]
+                                        .layout.my_layout.locks[lk_index]
+                                        ->loc.gfid,
+                                    gfid);
+                    local->lock[0].layout.my_layout.op_ret = -1;
+                    local->lock[0].layout.my_layout.op_errno = op_errno;
+                    gf_smsg(this->name, GF_LOG_ERROR, op_errno,
+                            DHT_MSG_INODELK_FAILED, "subvol=%s",
+                            local->lock[0]
+                                .layout.my_layout.locks[lk_index]
+                                ->xl->name,
+                            "gfid=%s", gfid, NULL);
+                    goto cleanup;
+                }
+                break; /* this lock's do_on_failure tolerates EIO */
+
+            default:
+                gf_uuid_unparse(
+                    local->lock[0].layout.my_layout.locks[lk_index]->loc.gfid,
+                    gfid);
+                local->lock[0].layout.my_layout.op_ret = -1;
+                local->lock[0].layout.my_layout.op_errno = op_errno;
+                gf_smsg(
+                    this->name, GF_LOG_ERROR, op_errno, DHT_MSG_INODELK_FAILED,
+                    "subvol=%s",
+                    local->lock[0].layout.my_layout.locks[lk_index]->xl->name,
+                    "gfid=%s", gfid, NULL);
+                goto cleanup;
+        }
+    }
+    /* After the last lock: fail only if not a single lock was acquired. */
+    if (lk_index == (local->lock[0].layout.my_layout.lk_count - 1)) {
+        for (i = 0; (i < local->lock[0].layout.my_layout.lk_count) &&
+                    (!local->lock[0].layout.my_layout.locks[i]->locked);
+             i++)
+            ;
+
+        if (i == local->lock[0].layout.my_layout.lk_count) {
+            local->lock[0].layout.my_layout.op_ret = -1;
+            local->lock[0].layout.my_layout.op_errno = op_errno;
+        }
+
+        dht_inodelk_done(frame);
+    } else {
+        dht_blocking_inodelk_rec(frame, ++lk_index);
+    }
+
+    return 0;
+    /* Hard failure: unlock whatever is held, then report the error. */
+cleanup:
+    dht_inodelk_cleanup(frame);
+
+    return 0;
+}
+
+void
+dht_blocking_inodelk_rec(call_frame_t *frame, int i)
+{
+    dht_local_t *local = NULL;
+    struct gf_flock flock = {
+        0,
+    };
+    /* Wind a blocking (F_SETLKW) inodelk for entry i; cookie carries i. */
+    local = frame->local;
+
+    flock.l_type = local->lock[0].layout.my_layout.locks[i]->type;
+
+    STACK_WIND_COOKIE(
+        frame, dht_blocking_inodelk_cbk, (void *)(long)i,
+        local->lock[0].layout.my_layout.locks[i]->xl,
+        local->lock[0].layout.my_layout.locks[i]->xl->fops->inodelk,
+        local->lock[0].layout.my_layout.locks[i]->domain,
+        &local->lock[0].layout.my_layout.locks[i]->loc, F_SETLKW, &flock, NULL);
+
+    return;
+}
+
+int
+dht_blocking_inodelk(call_frame_t *frame, dht_lock_t **lk_array, int lk_count,
+                     fop_inodelk_cbk_t inodelk_cbk)
+{
+    int ret = -1;
+    call_frame_t *lock_frame = NULL;
+    dht_local_t *local = NULL;
+    dht_local_t *tmp_local = NULL;
+    char gfid[GF_UUID_BUF_SIZE] = {
+        0,
+    };
+    /* Set up a lock frame and take the inodelks in lk_array one by one. */
+    GF_VALIDATE_OR_GOTO("dht-locks", frame, out);
+    GF_VALIDATE_OR_GOTO(frame->this->name, lk_array, out);
+    GF_VALIDATE_OR_GOTO(frame->this->name, inodelk_cbk, out);
+    /* NOTE(review): tmp_local is dereferenced unchecked on error paths. */
+    tmp_local = frame->local;
+
+    lock_frame = dht_lock_frame(frame);
+    if (lock_frame == NULL) {
+        gf_uuid_unparse(tmp_local->loc.gfid, gfid);
+        gf_smsg("dht", GF_LOG_ERROR, ENOMEM, DHT_MSG_LOCK_FRAME_FAILED,
+                "gfid=%s", gfid, "path=%s", tmp_local->loc.path, NULL);
+        goto out;
+    }
+
+    ret = dht_local_inodelk_init(lock_frame, lk_array, lk_count, inodelk_cbk);
+    if (ret < 0) {
+        gf_uuid_unparse(tmp_local->loc.gfid, gfid);
+        gf_smsg("dht", GF_LOG_ERROR, ENOMEM, DHT_MSG_LOCAL_LOCK_INIT_FAILED,
+                "gfid=%s", gfid, "path=%s", tmp_local->loc.path, NULL);
+        goto out;
+    }
+
+    dht_set_lkowner(lk_array, lk_count, &lock_frame->root->lk_owner);
+
+    local = lock_frame->local;
+    local->main_frame = frame;
+    /* Start at index 0; the callback winds each following lock. */
+    dht_blocking_inodelk_rec(lock_frame, 0);
+
+    return 0;
+out:
+    if (lock_frame)
+        dht_lock_stack_destroy(lock_frame, DHT_INODELK);
+    /* -1: nothing was wound and inodelk_cbk was not called from here. */
+    return -1;
+}
+
+void
+dht_unlock_namespace(call_frame_t *frame, dht_dir_transaction_t *lock)
+{
+    GF_VALIDATE_OR_GOTO("dht-locks", frame, out);
+    GF_VALIDATE_OR_GOTO(frame->this->name, lock, out);
+    /* Release the entrylk first, then the parent-layout inodelk. */
+    dht_unlock_entrylk_wrapper(frame, &lock->ns.directory_ns);
+    dht_unlock_inodelk_wrapper(frame, &lock->ns.parent_layout);
+
+out:
+    return;
+}
+
+static int32_t
+dht_protect_namespace_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+                          int32_t op_ret, int32_t op_errno, dict_t *xdata)
+{
+    dht_local_t *local = NULL;
+    /* entrylk reply: on failure give the parent inodelk back as well. */
+    local = frame->local;
+    if (op_ret != 0)
+        dht_unlock_inodelk_wrapper(frame, &local->current->ns.parent_layout);
+    /* The stored ns_cbk gets the reply unchanged, success or failure. */
+    local->current->ns.ns_cbk(frame, cookie, this, op_ret, op_errno, xdata);
+    return 0;
+}
+
+int32_t
+dht_blocking_entrylk_after_inodelk(call_frame_t *frame, void *cookie,
+                                   xlator_t *this, int32_t op_ret,
+                                   int32_t op_errno, dict_t *xdata)
+{
+    dht_local_t *local = NULL;
+    int ret = -1;
+    loc_t *loc = NULL;
+    dht_lock_t **lk_array = NULL;
+    char pgfid[GF_UUID_BUF_SIZE] = {0};
+    int count = 0;
+    dht_elock_wrap_t *entrylk = NULL;
+    /* inodelk reply: on success take the entrylk next, else report. */
+    local = frame->local;
+    entrylk = &local->current->ns.directory_ns;
+    /* lk_array is still NULL here, so 'err' leaves entrylk->locks alone. */
+    if (op_ret < 0) {
+        local->op_ret = -1;
+        local->op_errno = op_errno;
+        goto err;
+    }
+
+    loc = &entrylk->locks[0]->loc;
+    gf_uuid_unparse(loc->gfid, pgfid);
+
+    local->op_ret = 0;
+    lk_array = entrylk->locks;
+    count = entrylk->lk_count;
+
+    ret = dht_blocking_entrylk(frame, lk_array, count,
+                               dht_protect_namespace_cbk);
+    /* A negative return is reported to ns_cbk as EIO. */
+    if (ret < 0) {
+        local->op_ret = -1;
+        local->op_errno = EIO;
+        gf_smsg(this->name, GF_LOG_WARNING, local->op_errno,
+                DHT_MSG_ENTRYLK_FAILED_AFT_INODELK, "fop=%s",
+                gf_fop_list[local->fop], "pgfid=%s", pgfid, "basename=%s",
+                entrylk->locks[0]->basename, NULL);
+        goto err;
+    }
+
+    return 0;
+
+err:
+    if (lk_array != NULL) {
+        dht_lock_array_free(lk_array, count);
+        GF_FREE(lk_array);
+        entrylk->locks = NULL;
+        entrylk->lk_count = 0;
+    }
+
+    /* Unlock inodelk. No harm calling unlock twice */
+    dht_unlock_inodelk_wrapper(frame, &local->current->ns.parent_layout);
+    /* Call ns_cbk. It will take care of unwinding */
+    local->current->ns.ns_cbk(frame, NULL, this, local->op_ret, local->op_errno,
+                              NULL);
+    return 0;
+}
+
+/* Given the loc and the subvol, this routine takes the inodelk on
+ * the parent inode and entrylk on (parent, loc->name). Only one subvol is
+ * supported: the inodelk is taken first and the entrylk from its callback.
+ * Returns 0 once the inodelk is wound, -1 if setup failed before that.
+ */
+int
+dht_protect_namespace(call_frame_t *frame, loc_t *loc, xlator_t *subvol,
+                      struct dht_namespace *ns, fop_entrylk_cbk_t ns_cbk)
+{
+    dht_ilock_wrap_t *inodelk = NULL;
+    dht_elock_wrap_t *entrylk = NULL;
+    dht_lock_t **lk_array = NULL;
+    dht_local_t *local = NULL;
+    xlator_t *this = NULL;
+    loc_t parent = {
+        0,
+    };
+    int ret = -1;
+    char pgfid[GF_UUID_BUF_SIZE] = {0};
+    int32_t op_errno = 0;
+    int count = 1;
+
+    GF_VALIDATE_OR_GOTO("dht-locks", frame, out);
+    GF_VALIDATE_OR_GOTO(frame->this->name, loc, out);
+    GF_VALIDATE_OR_GOTO(frame->this->name, loc->parent, out);
+    GF_VALIDATE_OR_GOTO(frame->this->name, subvol, out);
+
+    local = frame->local;
+    this = frame->this;
+
+    inodelk = &ns->parent_layout;
+    entrylk = &ns->directory_ns;
+
+    /* Initialize entrylk_cbk and parent loc */
+    ns->ns_cbk = ns_cbk;
+    /* loc->gfid is a binary uuid; it is made printable before logging. */
+    ret = dht_build_parent_loc(this, &parent, loc, &op_errno);
+    if (ret) {
+        gf_smsg(this->name, GF_LOG_ERROR, op_errno, DHT_MSG_LOC_FAILED,
+                "gfid=%s", uuid_utoa(loc->gfid), "name=%s", loc->name,
+                "path=%s", loc->path, NULL);
+        goto out;
+    }
+    gf_uuid_unparse(parent.gfid, pgfid);
+
+    /* Alloc inodelk */
+    inodelk->locks = GF_CALLOC(count, sizeof(*lk_array), gf_common_mt_pointer);
+    if (inodelk->locks == NULL) {
+        local->op_errno = ENOMEM;
+        gf_smsg(this->name, GF_LOG_WARNING, local->op_errno,
+                DHT_MSG_CALLOC_FAILED, "fop=%s", gf_fop_list[local->fop],
+                "pgfid=%s", pgfid, "name=%s", loc->name, "path=%s", loc->path,
+                NULL);
+        goto err; /* parent is built by now and must be wiped */
+    }
+
+    inodelk->locks[0] = dht_lock_new(this, subvol, &parent, F_RDLCK,
+                                     DHT_LAYOUT_HEAL_DOMAIN, NULL,
+                                     FAIL_ON_ANY_ERROR);
+    if (inodelk->locks[0] == NULL) {
+        local->op_errno = ENOMEM;
+        gf_smsg(this->name, GF_LOG_WARNING, local->op_errno,
+                DHT_MSG_LOCK_ALLOC_FAILED, "inodelk-fop=%s",
+                gf_fop_list[local->fop], "pgfid=%s", pgfid, "name=%s",
+                loc->name, "path=%s", loc->path, NULL);
+        goto err;
+    }
+    inodelk->lk_count = count;
+
+    /* Alloc entrylk */
+    entrylk->locks = GF_CALLOC(count, sizeof(*lk_array), gf_common_mt_pointer);
+    if (entrylk->locks == NULL) {
+        local->op_errno = ENOMEM;
+        gf_smsg(this->name, GF_LOG_WARNING, local->op_errno,
+                DHT_MSG_CALLOC_FAILED, "entrylk-fop=%s",
+                gf_fop_list[local->fop], "pgfid=%s", pgfid, "name=%s",
+                loc->name, "path=%s", loc->path, NULL);
+
+        goto err;
+    }
+
+    entrylk->locks[0] = dht_lock_new(this, subvol, &parent, F_WRLCK,
+                                     DHT_ENTRY_SYNC_DOMAIN, loc->name,
+                                     FAIL_ON_ANY_ERROR);
+    if (entrylk->locks[0] == NULL) {
+        local->op_errno = ENOMEM;
+        gf_smsg(this->name, GF_LOG_WARNING, local->op_errno,
+                DHT_MSG_LOCK_ALLOC_FAILED, "entrylk-fop=%s",
+                gf_fop_list[local->fop], "pgfid=%s", pgfid, "name=%s",
+                loc->name, "path=%s", loc->path, NULL);
+
+        goto err;
+    }
+    entrylk->lk_count = count;
+
+    /* Take read inodelk on parent. If it is successful, take write entrylk
+     * on name in cbk.
+     */
+    lk_array = inodelk->locks;
+    ret = dht_blocking_inodelk(frame, lk_array, count,
+                               dht_blocking_entrylk_after_inodelk);
+    if (ret < 0) {
+        local->op_errno = EIO;
+        gf_smsg(this->name, GF_LOG_WARNING, local->op_errno,
+                DHT_MSG_BLOCK_INODELK_FAILED, "fop=%s", gf_fop_list[local->fop],
+                "pgfid=%s", pgfid, "name=%s", loc->name, "path=%s", loc->path,
+                NULL);
+
+        goto err;
+    }
+
+    loc_wipe(&parent);
+
+    return 0;
+err:
+    if (entrylk->locks != NULL) {
+        dht_lock_array_free(entrylk->locks, count);
+        GF_FREE(entrylk->locks);
+        entrylk->locks = NULL;
+        entrylk->lk_count = 0;
+    }
+    /* NOTE(review): assumes *ns arrives zeroed, else stale arrays get freed */
+    if (inodelk->locks != NULL) {
+        dht_lock_array_free(inodelk->locks, count);
+        GF_FREE(inodelk->locks);
+        inodelk->locks = NULL;
+        inodelk->lk_count = 0;
+    }
+
+    loc_wipe(&parent);
+out:
+    return -1;
+}
diff --git a/xlators/cluster/dht/src/dht-lock.h b/xlators/cluster/dht/src/dht-lock.h
new file mode 100644
index 00000000000..6485c03fb6e
--- /dev/null
+++ b/xlators/cluster/dht/src/dht-lock.h
@@ -0,0 +1,91 @@
+/*
+  Copyright (c) 2016 Red Hat, Inc. <http://www.redhat.com>
+  This file is part of GlusterFS.
+
+  This file is licensed to you under your choice of the GNU Lesser
+  General Public License, version 3 or any later version (LGPLv3 or
+  later), or the GNU General Public License, version 2 (GPLv2), in all
+  cases as published by the Free Software Foundation.
+*/
+
+#ifndef _DHT_LOCK_H
+#define _DHT_LOCK_H
+
+#include "dht-common.h"
+
+void
+dht_lock_array_free(dht_lock_t **lk_array, int count);
+
+int32_t
+dht_lock_count(dht_lock_t **lk_array, int lk_count);
+
+dht_lock_t *
+dht_lock_new(xlator_t *this, xlator_t *xl, loc_t *loc, short type,
+             const char *domain, const char *basename,
+             dht_reaction_type_t do_on_failure);
+
+int32_t
+dht_unlock_entrylk_wrapper(call_frame_t *, dht_elock_wrap_t *);
+
+void
+dht_blocking_entrylk_rec(call_frame_t *frame, int i);
+/* Entrylk counterpart of dht_blocking_inodelk (documented below). */
+int
+dht_blocking_entrylk(call_frame_t *frame, dht_lock_t **lk_array, int lk_count,
+                     fop_entrylk_cbk_t entrylk_cbk);
+
+int32_t
+dht_unlock_inodelk(call_frame_t *frame, dht_lock_t **lk_array, int lk_count,
+                   fop_inodelk_cbk_t inodelk_cbk);
+
+int32_t
+dht_unlock_inodelk_wrapper(call_frame_t *, dht_ilock_wrap_t *);
+
+/* Acquire non-blocking inodelk on a list of xlators.
+ *
+ * @lk_array: array of lock requests to lock on.
+ *
+ * @lk_count: number of locks in @lk_array
+ *
+ * @inodelk_cbk: will be called after inodelk replies are received
+ *
+ * @retval: -1 if stack_winding inodelk fails. 0 otherwise.
+ *          inodelk_cbk is called with appropriate error on errors.
+ *          On failure to acquire lock on all members of list, successful
+ *          locks are unlocked before invoking cbk.
+ */
+
+int
+dht_nonblocking_inodelk(call_frame_t *frame, dht_lock_t **lk_array,
+                        int lk_count, fop_inodelk_cbk_t inodelk_cbk);
+
+void
+dht_blocking_inodelk_rec(call_frame_t *frame, int i);
+
+/* same as dht_nonblocking_inodelk, but issues sequential blocking locks on
+ * @lk_array directly. locks are issued on some order which remains same
+ * for a list of xlators (irrespective of order of xlators within list).
+ */
+
+int
+dht_blocking_inodelk(call_frame_t *frame, dht_lock_t **lk_array, int lk_count,
+                     fop_inodelk_cbk_t inodelk_cbk);
+
+int32_t
+dht_blocking_entrylk_after_inodelk(call_frame_t *frame, void *cookie,
+                                   xlator_t *this, int32_t op_ret,
+                                   int32_t op_errno, dict_t *xdata);
+
+int32_t
+dht_blocking_entrylk_after_inodelk_rename(call_frame_t *frame, void *cookie,
+                                          xlator_t *this, int32_t op_ret,
+                                          int32_t op_errno, dict_t *xdata);
+/* Releases the entrylk and the parent-layout inodelk of a transaction. */
+void
+dht_unlock_namespace(call_frame_t *, dht_dir_transaction_t *);
+
+int
+dht_protect_namespace(call_frame_t *frame, loc_t *loc, xlator_t *subvol,
+                      struct dht_namespace *ns, fop_entrylk_cbk_t ns_cbk);
+
+#endif /* _DHT_LOCK_H */
diff --git a/xlators/cluster/dht/src/dht-mem-types.h b/xlators/cluster/dht/src/dht-mem-types.h
new file mode 100644
index 00000000000..e3c4471334a
--- /dev/null
+++ b/xlators/cluster/dht/src/dht-mem-types.h
@@ -0,0 +1,38 @@
+/*
+  Copyright (c) 2008-2012 Red Hat, Inc. <http://www.redhat.com>
+  This file is part of GlusterFS.
+
+  This file is licensed to you under your choice of the GNU Lesser
+  General Public License, version 3 or any later version (LGPLv3 or
+  later), or the GNU General Public License, version 2 (GPLv2), in all
+  cases as published by the Free Software Foundation.
+*/
+
+#ifndef __DHT_MEM_TYPES_H__
+#define __DHT_MEM_TYPES_H__
+
+#include <glusterfs/mem-types.h>
+/* DHT memory type ids; numbering starts right after gf_common_mt_end. */
+enum gf_dht_mem_types_ {
+    gf_dht_mt_dht_du_t = gf_common_mt_end + 1,
+    gf_dht_mt_dht_conf_t,
+    gf_dht_mt_char,
+    gf_dht_mt_int32_t,
+    gf_dht_mt_xlator_t,
+    gf_dht_mt_dht_layout_t,
+    gf_switch_mt_switch_sched_array,
+    gf_switch_mt_switch_struct,
+    gf_dht_mt_subvol_time,
+    gf_dht_mt_loc_t,
+    gf_defrag_info_mt,
+    gf_dht_mt_inode_ctx_t,
+    gf_dht_mt_dirent_t,
+    gf_dht_mt_container_t,
+    gf_dht_mt_octx_t,
+    gf_dht_mt_miginfo_t,
+    gf_dht_mt_fd_ctx_t,
+    gf_dht_ret_cache_t,
+    gf_dht_nodeuuids_t,
+    gf_dht_mt_end /* presumably an end marker; keep it last -- confirm */
+};
+#endif
diff --git a/xlators/cluster/dht/src/dht-messages.h b/xlators/cluster/dht/src/dht-messages.h
new file mode 100644
index 00000000000..601f8dad78b
--- /dev/null
+++ b/xlators/cluster/dht/src/dht-messages.h
@@ -0,0 +1,386 @@
+/*Copyright (c) 2015 Red Hat, Inc. <http://www.redhat.com>
+  This file is part of GlusterFS.
+
+  This file is licensed to you under your choice of the GNU Lesser
+  General Public License, version 3 or any later version (LGPLv3 or
+  later), or the GNU General Public License, version 2 (GPLv2), in all
+  cases as published by the Free Software Foundation.
+*/
+
+#ifndef _DHT_MESSAGES_H_
+#define _DHT_MESSAGES_H_
+
+#include <glusterfs/glfs-message-id.h>
+
+/* To add new message IDs, append new identifiers at the end of the list.
+ *
+ * Never remove a message ID. If it's not used anymore, you can rename it or
+ * leave it as it is, but not delete it. This is to prevent reutilization of
+ * IDs by other messages.
+ *
+ * The component name must match one of the entries defined in
+ * glfs-message-id.h.
+ */
+
+GLFS_MSGID(
+    DHT, DHT_MSG_CACHED_SUBVOL_GET_FAILED, DHT_MSG_CREATE_LINK_FAILED,
+    DHT_MSG_DICT_SET_FAILED, DHT_MSG_DIR_ATTR_HEAL_FAILED,
+    DHT_MSG_DIR_SELFHEAL_FAILED, DHT_MSG_DIR_SELFHEAL_XATTR_FAILED,
+    DHT_MSG_FILE_ON_MULT_SUBVOL, DHT_MSG_FILE_TYPE_MISMATCH,
+    DHT_MSG_GFID_MISMATCH, DHT_MSG_GFID_NULL, DHT_MSG_HASHED_SUBVOL_GET_FAILED,
+    DHT_MSG_INIT_FAILED, DHT_MSG_INVALID_CONFIGURATION,
+    DHT_MSG_INVALID_DISK_LAYOUT, DHT_MSG_INVALID_OPTION,
+    DHT_MSG_LAYOUT_FIX_FAILED, DHT_MSG_LAYOUT_MERGE_FAILED,
+    DHT_MSG_LAYOUT_MISMATCH, DHT_MSG_LAYOUT_NULL, DHT_MSG_MIGRATE_DATA_COMPLETE,
+    DHT_MSG_MIGRATE_DATA_FAILED, DHT_MSG_MIGRATE_FILE_COMPLETE,
+    DHT_MSG_MIGRATE_FILE_FAILED, DHT_MSG_NO_MEMORY, DHT_MSG_OPENDIR_FAILED,
+    DHT_MSG_REBALANCE_FAILED, DHT_MSG_REBALANCE_START_FAILED,
+    DHT_MSG_REBALANCE_STATUS, DHT_MSG_REBALANCE_STOPPED, DHT_MSG_RENAME_FAILED,
+    DHT_MSG_SETATTR_FAILED, DHT_MSG_SUBVOL_INSUFF_INODES,
+    DHT_MSG_SUBVOL_INSUFF_SPACE, DHT_MSG_UNLINK_FAILED,
+    DHT_MSG_LAYOUT_SET_FAILED, DHT_MSG_LOG_FIXED_LAYOUT,
+    DHT_MSG_GET_XATTR_FAILED, DHT_MSG_FILE_LOOKUP_FAILED,
+    DHT_MSG_OPEN_FD_FAILED, DHT_MSG_SET_INODE_CTX_FAILED,
+    DHT_MSG_UNLOCKING_FAILED, DHT_MSG_DISK_LAYOUT_NULL, DHT_MSG_SUBVOL_INFO,
+    DHT_MSG_CHUNK_SIZE_INFO, DHT_MSG_LAYOUT_FORM_FAILED, DHT_MSG_SUBVOL_ERROR,
+    DHT_MSG_LAYOUT_SORT_FAILED, DHT_MSG_REGEX_INFO, DHT_MSG_FOPEN_FAILED,
+    DHT_MSG_SET_HOSTNAME_FAILED, DHT_MSG_BRICK_ERROR, DHT_MSG_SYNCOP_FAILED,
+    DHT_MSG_MIGRATE_INFO, DHT_MSG_SOCKET_ERROR, DHT_MSG_CREATE_FD_FAILED,
+    DHT_MSG_READDIR_ERROR, DHT_MSG_CHILD_LOC_BUILD_FAILED,
+    DHT_MSG_SET_SWITCH_PATTERN_ERROR, DHT_MSG_COMPUTE_HASH_FAILED,
+    DHT_MSG_FIND_LAYOUT_ANOMALIES_ERROR, DHT_MSG_ANOMALIES_INFO,
+    DHT_MSG_LAYOUT_INFO, DHT_MSG_INODE_LK_ERROR, DHT_MSG_RENAME_INFO,
+    DHT_MSG_DATA_NULL, DHT_MSG_AGGREGATE_QUOTA_XATTR_FAILED,
+    DHT_MSG_UNLINK_LOOKUP_INFO, DHT_MSG_LINK_FILE_LOOKUP_INFO,
+    DHT_MSG_OPERATION_NOT_SUP, DHT_MSG_NOT_LINK_FILE_ERROR, DHT_MSG_CHILD_DOWN,
+    DHT_MSG_UUID_PARSE_ERROR, DHT_MSG_GET_DISK_INFO_ERROR,
+    DHT_MSG_INVALID_VALUE, DHT_MSG_SWITCH_PATTERN_INFO,
+    DHT_MSG_SUBVOL_OP_FAILED, DHT_MSG_LAYOUT_PRESET_FAILED,
+    DHT_MSG_INVALID_LINKFILE, DHT_MSG_FIX_LAYOUT_INFO,
+    DHT_MSG_GET_HOSTNAME_FAILED, DHT_MSG_WRITE_FAILED,
+    DHT_MSG_MIGRATE_HARDLINK_FILE_FAILED, DHT_MSG_FSYNC_FAILED,
+    DHT_MSG_SUBVOL_DECOMMISSION_INFO, DHT_MSG_BRICK_QUERY_FAILED,
+    DHT_MSG_SUBVOL_NO_LAYOUT_INFO, DHT_MSG_OPEN_FD_ON_DST_FAILED,
+    DHT_MSG_SUBVOL_NOT_FOUND, DHT_MSG_FILE_LOOKUP_ON_DST_FAILED,
+    DHT_MSG_DISK_LAYOUT_MISSING, DHT_MSG_DICT_GET_FAILED,
+    DHT_MSG_REVALIDATE_CBK_INFO, DHT_MSG_UPGRADE_BRICKS, DHT_MSG_LK_ARRAY_INFO,
+    DHT_MSG_RENAME_NOT_LOCAL, DHT_MSG_RECONFIGURE_INFO,
+    DHT_MSG_INIT_LOCAL_SUBVOL_FAILED, DHT_MSG_SYS_CALL_GET_TIME_FAILED,
+    DHT_MSG_NO_DISK_USAGE_STATUS, DHT_MSG_SUBVOL_DOWN_ERROR,
+    DHT_MSG_REBAL_THROTTLE_INFO, DHT_MSG_COMMIT_HASH_INFO,
+    DHT_MSG_REBAL_STRUCT_SET, DHT_MSG_HAS_MIGINFO, DHT_MSG_SETTLE_HASH_FAILED,
+    DHT_MSG_DEFRAG_PROCESS_DIR_FAILED, DHT_MSG_FD_CTX_SET_FAILED,
+    DHT_MSG_STALE_LOOKUP, DHT_MSG_PARENT_LAYOUT_CHANGED,
+    DHT_MSG_LOCK_MIGRATION_FAILED, DHT_MSG_LOCK_INODE_UNREF_FAILED,
+    DHT_MSG_ASPRINTF_FAILED, DHT_MSG_DIR_LOOKUP_FAILED, DHT_MSG_INODELK_FAILED,
+    DHT_MSG_LOCK_FRAME_FAILED, DHT_MSG_LOCAL_LOCK_INIT_FAILED,
+    DHT_MSG_ENTRYLK_ERROR, DHT_MSG_INODELK_ERROR, DHT_MSG_LOC_FAILED,
+    DHT_MSG_UNKNOWN_FOP, DHT_MSG_MIGRATE_FILE_SKIPPED,
+    DHT_MSG_DIR_XATTR_HEAL_FAILED, DHT_MSG_HASHED_SUBVOL_DOWN,
+    DHT_MSG_NON_HASHED_SUBVOL_DOWN, DHT_MSG_SYNCTASK_CREATE_FAILED,
+    DHT_MSG_DIR_HEAL_ABORT, DHT_MSG_MIGRATE_SKIP, DHT_MSG_FD_CREATE_FAILED,
+    DHT_MSG_DICT_NEW_FAILED, DHT_MSG_FAILED_TO_OPEN, DHT_MSG_CREATE_FAILED,
+    DHT_MSG_FILE_NOT_EXIST, DHT_MSG_CHOWN_FAILED, DHT_MSG_FALLOCATE_FAILED,
+    DHT_MSG_FTRUNCATE_FAILED, DHT_MSG_STATFS_FAILED, DHT_MSG_WRITE_CROSS,
+    DHT_MSG_NEW_TARGET_FOUND, DHT_MSG_INSUFF_MEMORY, DHT_MSG_SET_XATTR_FAILED,
+    DHT_MSG_SET_MODE_FAILED, DHT_MSG_FILE_EXISTS_IN_DEST,
+    DHT_MSG_SYMLINK_FAILED, DHT_MSG_LINKFILE_DEL_FAILED, DHT_MSG_MKNOD_FAILED,
+    DHT_MSG_MIGRATE_CLEANUP_FAILED, DHT_MSG_LOCK_MIGRATE,
+    DHT_MSG_PARENT_BUILD_FAILED, DHT_MSG_HASHED_SUBVOL_NOT_FOUND,
+    DHT_MSG_ACQUIRE_ENTRYLK_FAILED, DHT_MSG_CREATE_DST_FAILED,
+    DHT_MSG_MIGRATION_EXIT, DHT_MSG_CHANGED_DST, DHT_MSG_TRACE_FAILED,
+    DHT_MSG_WRITE_LOCK_FAILED, DHT_MSG_GETACTIVELK_FAILED, DHT_MSG_STAT_FAILED,
+    DHT_MSG_UNLINK_PERFORM_FAILED, DHT_MSG_CLANUP_SOURCE_FILE_FAILED,
+    DHT_MSG_UNLOCK_FILE_FAILED, DHT_MSG_REMOVE_XATTR_FAILED,
+    DHT_MSG_DATA_MIGRATE_ABORT, DHT_MSG_DEFRAG_NULL, DHT_MSG_PARENT_NULL,
+    DHT_MSG_GFID_NOT_PRESENT, DHT_MSG_CHILD_LOC_FAILED,
+    DHT_MSG_SET_LOOKUP_FAILED, DHT_MSG_DIR_REMOVED, DHT_MSG_FIX_NOT_COMP,
+    DHT_MSG_SUBVOL_DETER_FAILED, DHT_MSG_LOCAL_SUBVOL, DHT_MSG_NODE_UUID,
+    DHT_MSG_SIZE_FILE, DHT_MSG_GET_DATA_SIZE_FAILED,
+    DHT_MSG_PTHREAD_JOIN_FAILED, DHT_MSG_COUNTER_THREAD_CREATE_FAILED,
+    DHT_MSG_MIGRATION_INIT_QUEUE_FAILED, DHT_MSG_PAUSED_TIMEOUT, DHT_MSG_WOKE,
+    DHT_MSG_ABORT_REBALANCE, DHT_MSG_CREATE_TASK_REBAL_FAILED,
+    DHT_MSG_REBAL_ESTIMATE_NOT_AVAIL, DHT_MSG_ADD_CHOICES_ERROR,
+    DHT_MSG_GET_CHOICES_ERROR, DHT_MSG_PREPARE_STATUS_ERROR,
+    DHT_MSG_SET_CHOICE_FAILED, DHT_MSG_SET_HASHED_SUBVOL_FAILED,
+    DHT_MSG_XATTR_HEAL_NOT_POSS, DHT_MSG_LINKTO_FILE_FAILED,
+    DHT_MSG_STALE_LINKFILE_DELETE, DHT_MSG_NO_SUBVOL_FOR_LINKTO,
+    DHT_MSG_SUBVOL_RETURNED, DHT_MSG_UNKNOWN_LOCAL_XSEL, DHT_MSG_GET_XATTR_ERR,
+    DHT_MSG_ALLOC_OR_FILL_FAILED, DHT_MSG_GET_REAL_NAME_FAILED,
+    DHT_MSG_COPY_UUID_FAILED, DHT_MSG_MDS_DETER_FAILED,
+    DHT_MSG_CREATE_REBAL_FAILED, DHT_MSG_LINK_LAYOUT_FAILED,
+    DHT_MSG_NO_SUBVOL_IN_LAYOUT, DHT_MSG_MEM_ALLOC_FAILED,
+    DHT_MSG_SET_IN_PARAMS_DICT_FAILED, DHT_MSG_LOC_COPY_FAILED,
+    DHT_MSG_PARENT_LOC_FAILED, DHT_MSG_CREATE_LOCK_FAILED,
+    DHT_MSG_PREV_ATTEMPT_FAILED, DHT_MSG_REFRESH_ATTEMPT,
+    DHT_MSG_ACQUIRE_LOCK_FAILED, DHT_MSG_CREATE_STUB_FAILED,
+    DHT_MSG_WIND_LOCK_REQ_FAILED, DHT_MSG_REFRESH_FAILED,
+    DHT_MSG_CACHED_SUBVOL_ERROR, DHT_MSG_NO_LINK_SUBVOL, DHT_MSG_SET_KEY_FAILED,
+    DHT_MSG_REMOVE_LINKTO_FAILED, DHT_MSG_LAYOUT_DICT_SET_FAILED,
+    DHT_MSG_XATTR_DICT_NULL, DHT_MSG_DUMMY_ALLOC_FAILED, DHT_MSG_DICT_IS_NULL,
+    DHT_MSG_LINK_INODE_FAILED, DHT_MSG_SELFHEAL_FAILED, DHT_MSG_NO_MDS_SUBVOL,
+    DHT_MSG_LIST_XATTRS_FAILED, DHT_MSG_RESET_INTER_XATTR_FAILED,
+    DHT_MSG_MDS_DOWN_UNABLE_TO_SET, DHT_MSG_WIND_UNLOCK_FAILED,
+    DHT_MSG_COMMIT_HASH_FAILED, DHT_MSG_UNLOCK_GFID_FAILED,
+    DHT_MSG_UNLOCK_FOLLOW_ENTRYLK, DHT_MSG_COPY_FRAME_FAILED,
+    DHT_MSG_UNLOCK_FOLLOW_LOCKS, DHT_MSG_ENTRYLK_FAILED_AFT_INODELK,
+    DHT_MSG_CALLOC_FAILED, DHT_MSG_LOCK_ALLOC_FAILED,
+    DHT_MSG_BLOCK_INODELK_FAILED,
+    DHT_MSG_LOCAL_LOCKS_STORE_FAILED_UNLOCKING_FOLLOWING_ENTRYLK,
+    DHT_MSG_ALLOC_FRAME_FAILED_NOT_UNLOCKING_FOLLOWING_ENTRYLKS,
+    DHT_MSG_DST_NULL_SET_FAILED);
+
+#define DHT_MSG_FD_CTX_SET_FAILED_STR "Failed to set fd ctx"
+#define DHT_MSG_INVALID_VALUE_STR "Different dst found in the fd ctx"
+#define DHT_MSG_UNKNOWN_FOP_STR "Unknown FOP on file"
+#define DHT_MSG_OPEN_FD_ON_DST_FAILED_STR "Failed to open the fd on file"
+#define DHT_MSG_SYNCTASK_CREATE_FAILED_STR "Failed to create synctask"
+#define DHT_MSG_ASPRINTF_FAILED_STR                                            \
+    "asprintf failed while fetching subvol from the id"
+#define DHT_MSG_HAS_MIGINFO_STR "Found miginfo in the inode ctx"
+#define DHT_MSG_FILE_LOOKUP_FAILED_STR "failed to lookup the file"
+#define DHT_MSG_INVALID_LINKFILE_STR                                           \
+    "linkto target is different from cached-subvol. treating as destination "  \
+    "subvol"
+#define DHT_MSG_GFID_MISMATCH_STR "gfid different on the target file"
+#define DHT_MSG_GET_XATTR_FAILED_STR "failed to get 'linkto' xattr"
+#define DHT_MSG_SET_INODE_CTX_FAILED_STR "failed to set inode-ctx target file"
+#define DHT_MSG_DIR_SELFHEAL_FAILED_STR "Healing of path failed"
+#define DHT_MSG_DIR_HEAL_ABORT_STR                                             \
+    "Failed to get path from subvol. Aborting directory healing"
+#define DHT_MSG_DIR_XATTR_HEAL_FAILED_STR "xattr heal failed for directory"
+#define DHT_MSG_LOCK_INODE_UNREF_FAILED_STR                                    \
+    "Found a NULL inode. Failed to unref the inode"
+#define DHT_MSG_DICT_SET_FAILED_STR "Failed to set dictionary value"
+#define DHT_MSG_NOT_LINK_FILE_ERROR_STR "got non-linkfile"
+#define DHT_MSG_CREATE_LINK_FAILED_STR "failed to initialize linkfile data"
+#define DHT_MSG_UNLINK_FAILED_STR "Unlinking linkfile on subvolume failed"
+#define DHT_MSG_MIGRATE_FILE_FAILED_STR "Migrate file failed"
+#define DHT_MSG_NO_MEMORY_STR "could not allocate memory for dict"
+#define DHT_MSG_SUBVOL_ERROR_STR "Failed to get linkto subvol"
+#define DHT_MSG_MIGRATE_HARDLINK_FILE_FAILED_STR "link failed on subvol"
+#define DHT_MSG_MIGRATE_FILE_SKIPPED_STR "Migration skipped"
+#define DHT_MSG_FD_CREATE_FAILED_STR "fd create failed"
+#define DHT_MSG_DICT_NEW_FAILED_STR "dict_new failed"
+#define DHT_MSG_FAILED_TO_OPEN_STR "failed to open"
+#define DHT_MSG_CREATE_FAILED_STR "failed to create"
+#define DHT_MSG_FILE_NOT_EXIST_STR "file does not exist"
+#define DHT_MSG_CHOWN_FAILED_STR "chown failed"
+#define DHT_MSG_FALLOCATE_FAILED_STR "fallocate failed"
+#define DHT_MSG_FTRUNCATE_FAILED_STR "ftruncate failed"
+#define DHT_MSG_STATFS_FAILED_STR "failed to get statfs"
+#define DHT_MSG_WRITE_CROSS_STR                                                \
+    "write will cross min-free-disk for file on subvol. looking for new subvol"
+#define DHT_MSG_SUBVOL_INSUFF_SPACE_STR                                        \
+    "Could not find any subvol with space accommodating the file. Consider "   \
+    "adding bricks"
+#define DHT_MSG_NEW_TARGET_FOUND_STR "New target found for file"
+#define DHT_MSG_INSUFF_MEMORY_STR "insufficient memory"
+#define DHT_MSG_SET_XATTR_FAILED_STR "failed to set xattr"
+#define DHT_MSG_SET_MODE_FAILED_STR "failed to set mode"
+#define DHT_MSG_FILE_EXISTS_IN_DEST_STR "file exists in destination"
+#define DHT_MSG_LINKFILE_DEL_FAILED_STR "failed to delete the linkfile"
+#define DHT_MSG_SYMLINK_FAILED_STR "symlink failed"
+#define DHT_MSG_MKNOD_FAILED_STR "mknod failed"
+#define DHT_MSG_SETATTR_FAILED_STR "failed to perform setattr"
+#define DHT_MSG_MIGRATE_CLEANUP_FAILED_STR                                     \
+    "Migrate file cleanup failed: failed to fstat file"
+#define DHT_MSG_LOCK_MIGRATE_STR "locks will be migrated for file"
+#define DHT_MSG_PARENT_BUILD_FAILED_STR                                        \
+    "failed to build parent loc, which is needed to acquire entrylk to "       \
+    "synchronize with renames on this path. Skipping migration"
+#define DHT_MSG_HASHED_SUBVOL_NOT_FOUND_STR                                    \
+    "cannot find hashed subvol which is needed to synchronize with renames "   \
+    "on this path. Skipping migration"
+#define DHT_MSG_ACQUIRE_ENTRYLK_FAILED_STR "failed to acquire entrylk on subvol"
+#define DHT_MSG_CREATE_DST_FAILED_STR "create dst failed for file"
+#define DHT_MSG_MIGRATION_EXIT_STR "Exiting migration"
+#define DHT_MSG_CHANGED_DST_STR "destination changed for file"
+#define DHT_MSG_TRACE_FAILED_STR "Trace failed"
+#define DHT_MSG_WRITE_LOCK_FAILED_STR "write lock failed"
+#define DHT_MSG_GETACTIVELK_FAILED_STR "getactivelk failed for file"
+#define DHT_MSG_STAT_FAILED_STR "failed to do a stat"
+#define DHT_MSG_UNLINK_PERFORM_FAILED_STR "failed to perform unlink"
+#define DHT_MSG_MIGRATE_FILE_COMPLETE_STR "completed migration"
+#define DHT_MSG_CLANUP_SOURCE_FILE_FAILED_STR "failed to cleanup source file"
+#define DHT_MSG_UNLOCK_FILE_FAILED_STR "failed to unlock file"
+#define DHT_MSG_REMOVE_XATTR_FAILED_STR "remove xattr failed"
+#define DHT_MSG_SOCKET_ERROR_STR "Failed to unlink listener socket"
+#define DHT_MSG_HASHED_SUBVOL_GET_FAILED_STR "Failed to get hashed subvolume"
+#define DHT_MSG_CACHED_SUBVOL_GET_FAILED_STR "Failed to get cached subvolume"
+#define DHT_MSG_MIGRATE_DATA_FAILED_STR "migrate-data failed"
+#define DHT_MSG_DEFRAG_NULL_STR "defrag is NULL"
+#define DHT_MSG_DATA_MIGRATE_ABORT_STR                                         \
+    "Readdirp failed. Aborting data migration for dict"
+#define DHT_MSG_LAYOUT_FIX_FAILED_STR "fix layout failed"
+#define DHT_MSG_PARENT_NULL_STR "parent is NULL"
+#define DHT_MSG_GFID_NOT_PRESENT_STR "gfid not present"
+#define DHT_MSG_CHILD_LOC_FAILED_STR "Child loc build failed"
+#define DHT_MSG_SET_LOOKUP_FAILED_STR "Failed to set lookup"
+#define DHT_MSG_DIR_LOOKUP_FAILED_STR "lookup failed"
+#define DHT_MSG_DIR_REMOVED_STR "Dir renamed or removed. Skipping"
+#define DHT_MSG_READDIR_ERROR_STR "readdir failed, Aborting fix-layout"
+#define DHT_MSG_SETTLE_HASH_FAILED_STR "Settle hash failed"
+#define DHT_MSG_DEFRAG_PROCESS_DIR_FAILED_STR "gf_defrag_process_dir failed"
+#define DHT_MSG_FIX_NOT_COMP_STR                                               \
+    "Unable to retrieve fixlayout xattr. Assume background fix layout not "    \
+    "complete"
+#define DHT_MSG_SUBVOL_DETER_FAILED_STR                                        \
+    "local subvolume determination failed with error"
+#define DHT_MSG_LOCAL_SUBVOL_STR "local subvol"
+#define DHT_MSG_NODE_UUID_STR "node uuid"
+#define DHT_MSG_SIZE_FILE_STR "Total size files"
+#define DHT_MSG_GET_DATA_SIZE_FAILED_STR                                       \
+    "Failed to get the total data size. Unable to estimate time to complete "  \
+    "rebalance"
+#define DHT_MSG_PTHREAD_JOIN_FAILED_STR                                        \
+    "file_counter_thread: pthread_join failed"
+#define DHT_MSG_COUNTER_THREAD_CREATE_FAILED_STR                               \
+    "Failed to create the file counter thread"
+#define DHT_MSG_MIGRATION_INIT_QUEUE_FAILED_STR                                \
+    "Failed to initialise migration queue"
+#define DHT_MSG_REBALANCE_STOPPED_STR "Received stop command on rebalance"
+#define DHT_MSG_PAUSED_TIMEOUT_STR "Request pause timer timeout"
+#define DHT_MSG_WOKE_STR "woken"
+#define DHT_MSG_ABORT_REBALANCE_STR "Aborting rebalance"
+#define DHT_MSG_REBALANCE_START_FAILED_STR                                     \
+    "Failed to start rebalance: look up on / failed"
+#define DHT_MSG_CREATE_TASK_REBAL_FAILED_STR                                   \
+    "Could not create task for rebalance"
+#define DHT_MSG_REBAL_ESTIMATE_NOT_AVAIL_STR                                   \
+    "Rebalance estimates will not be available"
+#define DHT_MSG_REBALANCE_STATUS_STR "Rebalance status"
+#define DHT_MSG_DATA_NULL_STR "data value is NULL"
+#define DHT_MSG_ADD_CHOICES_ERROR_STR "Error to add choices in buffer"
+#define DHT_MSG_GET_CHOICES_ERROR_STR "Error to get choices"
+#define DHT_MSG_PREPARE_STATUS_ERROR_STR "Error to prepare status"
+#define DHT_MSG_SET_CHOICE_FAILED_STR "Failed to set full choice"
+#define DHT_MSG_AGGREGATE_QUOTA_XATTR_FAILED_STR                               \
+    "Failed to aggregate quota xattr"
+#define DHT_MSG_FILE_TYPE_MISMATCH_STR                                         \
+    "path exists as a file on one subvolume and directory on another. Please " \
+    "fix it manually"
+#define DHT_MSG_LAYOUT_SET_FAILED_STR "failed to set layout for subvolume"
+#define DHT_MSG_LAYOUT_MERGE_FAILED_STR "failed to merge layouts for subvolume"
+#define DHT_MSG_SET_HASHED_SUBVOL_FAILED_STR "Failed to set hashed subvolume"
+#define DHT_MSG_XATTR_HEAL_NOT_POSS_STR                                        \
+    "No gfid exists for path. so healing xattr is not possible"
+#define DHT_MSG_REVALIDATE_CBK_INFO_STR "Revalidate: subvolume returned -1"
+#define DHT_MSG_LAYOUT_MISMATCH_STR "Mismatching layouts"
+#define DHT_MSG_UNLINK_LOOKUP_INFO_STR "lookup_unlink returned"
+#define DHT_MSG_LINKTO_FILE_FAILED_STR                                         \
+    "Could not unlink the linkto file as either fd is open and/or linkto "     \
+    "xattr is set"
+#define DHT_MSG_LAYOUT_PRESET_FAILED_STR                                       \
+    "Could not set pre-set layout for subvolume"
+#define DHT_MSG_FILE_ON_MULT_SUBVOL_STR                                        \
+    "multiple subvolumes have file (preferably rename the file in the "        \
+    "backend, and do a fresh lookup)"
+#define DHT_MSG_STALE_LINKFILE_DELETE_STR                                      \
+    "attempting deletion of stale linkfile"
+#define DHT_MSG_LINK_FILE_LOOKUP_INFO_STR "Lookup on following linkfile"
+#define DHT_MSG_NO_SUBVOL_FOR_LINKTO_STR "No link subvolume for linkto"
+#define DHT_MSG_SUBVOL_RETURNED_STR "Subvolume returned -1"
+#define DHT_MSG_UNKNOWN_LOCAL_XSEL_STR "Unknown local->xsel"
+#define DHT_MSG_DICT_GET_FAILED_STR "Failed to get"
+#define DHT_MSG_UUID_PARSE_ERROR_STR "Failed to parse uuid"
+#define DHT_MSG_GET_XATTR_ERR_STR "getxattr err for dir"
+#define DHT_MSG_ALLOC_OR_FILL_FAILED_STR "alloc or fill failed"
+#define DHT_MSG_UPGRADE_BRICKS_STR                                             \
+    "At least one of the bricks does not support this operation. Please "      \
+    "upgrade all bricks"
+#define DHT_MSG_GET_REAL_NAME_FAILED_STR "Failed to get real filename"
+#define DHT_MSG_LAYOUT_NULL_STR "Layout is NULL"
+#define DHT_MSG_COPY_UUID_FAILED_STR "Failed to copy node uuid key"
+#define DHT_MSG_MDS_DETER_FAILED_STR                                           \
+    "Cannot determine MDS, fetching xattr randomly from a subvol"
+#define DHT_MSG_HASHED_SUBVOL_DOWN_STR                                         \
+    "MDS is down for path, so fetching xattr randomly from subvol"
+#define DHT_MSG_CREATE_REBAL_FAILED_STR                                        \
+    "failed to create a new rebalance synctask"
+#define DHT_MSG_FIX_LAYOUT_INFO_STR "fixing the layout"
+#define DHT_MSG_OPERATION_NOT_SUP_STR "wrong directory-spread-count value"
+#define DHT_MSG_LINK_LAYOUT_FAILED_STR "failed to link the layout in inode"
+#define DHT_MSG_NO_SUBVOL_IN_LAYOUT_STR "no subvolume in layout for path"
+#define DHT_MSG_INODE_LK_ERROR_STR "mknod lock failed for file"
+#define DHT_MSG_MEM_ALLOC_FAILED_STR "mem allocation failed"
+#define DHT_MSG_PARENT_LAYOUT_CHANGED_STR                                      \
+    "extracting in-memory layout of parent failed"
+#define DHT_MSG_SET_IN_PARAMS_DICT_FAILED_STR                                  \
+    "setting in params dictionary failed"
+#define DHT_MSG_LOC_COPY_FAILED_STR "loc_copy failed"
+#define DHT_MSG_LOC_FAILED_STR "parent loc build failed"
+#define DHT_MSG_PARENT_LOC_FAILED_STR "locking parent failed"
+#define DHT_MSG_CREATE_LOCK_FAILED_STR "Create lock failed"
+#define DHT_MSG_PREV_ATTEMPT_FAILED_STR                                        \
+    "mkdir loop detected. parent layout didn't change even though previous "   \
+    "attempt of mkdir failed because of in-memory layout not matching with "   \
+    "that on disk."
+#define DHT_MSG_REFRESH_ATTEMPT_STR                                            \
+    "mkdir parent layout changed. Attempting a refresh and then a retry"
+#define DHT_MSG_ACQUIRE_LOCK_FAILED_STR                                        \
+    "Acquiring lock on parent to guard against layout-change failed"
+#define DHT_MSG_CREATE_STUB_FAILED_STR "creating stub failed"
+#define DHT_MSG_WIND_LOCK_REQ_FAILED_STR                                       \
+    "cannot wind lock request to guard parent layout"
+#define DHT_MSG_REFRESH_FAILED_STR "refreshing parent layout failed."
+#define DHT_MSG_CACHED_SUBVOL_ERROR_STR "On cached subvol"
+#define DHT_MSG_NO_LINK_SUBVOL_STR "Linkfile does not have link subvolume"
+#define DHT_MSG_SET_KEY_FAILED_STR "failed to set key"
+#define DHT_MSG_CHILD_DOWN_STR "Received CHILD_DOWN. Exiting"
+#define DHT_MSG_LOG_FIXED_LAYOUT_STR "log layout fixed"
+#define DHT_MSG_REBAL_STRUCT_SET_STR "local->rebalance already set"
+#define DHT_MSG_REMOVE_LINKTO_FAILED_STR "Removal of linkto failed at subvol"
+#define DHT_MSG_LAYOUT_DICT_SET_FAILED_STR "dht layout dict set failed"
+#define DHT_MSG_SUBVOL_INFO_STR "creating subvolume"
+#define DHT_MSG_COMPUTE_HASH_FAILED_STR "hash computation failed"
+#define DHT_MSG_INVALID_DISK_LAYOUT_STR                                        \
+    "Invalid disk layout: Catastrophic error layout with unknown type found"
+#define DHT_MSG_LAYOUT_SORT_FAILED_STR "layout sort failed"
+#define DHT_MSG_ANOMALIES_INFO_STR "Found anomalies"
+#define DHT_MSG_XATTR_DICT_NULL_STR "xattr dictionary is NULL"
+#define DHT_MSG_DISK_LAYOUT_MISSING_STR "Disk layout missing"
+#define DHT_MSG_LAYOUT_INFO_STR "layout info"
+#define DHT_MSG_SUBVOL_NO_LAYOUT_INFO_STR "no pre-set layout for subvol"
+#define DHT_MSG_SELFHEAL_XATTR_FAILED_STR "layout setxattr failed"
+#define DHT_MSG_DIR_SELFHEAL_XATTR_FAILED_STR "Directory self heal xattr failed"
+#define DHT_MSG_DUMMY_ALLOC_FAILED_STR "failed to allocate dummy layout"
+#define DHT_MSG_DICT_IS_NULL_STR                                               \
+    "dict is NULL, need to make sure gfids are same"
+#define DHT_MSG_ENTRYLK_ERROR_STR "acquiring entrylk after inodelk failed"
+#define DHT_MSG_NO_DISK_USAGE_STATUS_STR "no du stats"
+#define DHT_MSG_LINK_INODE_FAILED_STR "linking inode failed"
+#define DHT_MSG_SELFHEAL_FAILED_STR "Directory selfheal failed"
+#define DHT_MSG_NO_MDS_SUBVOL_STR "No mds subvol"
+#define DHT_MSG_LIST_XATTRS_FAILED_STR "failed to list xattrs"
+#define DHT_MSG_RESET_INTER_XATTR_FAILED_STR "Failed to reset internal xattr"
+#define DHT_MSG_MDS_DOWN_UNABLE_TO_SET_STR                                     \
+    "mds subvol is down, unable to set xattr"
+#define DHT_MSG_DIR_ATTR_HEAL_FAILED_STR                                       \
+    "Directory attr heal failed. Failed to set uid/gid"
+#define DHT_MSG_WIND_UNLOCK_FAILED_STR                                         \
+    "Winding unlock failed: stale locks left on brick"
+#define DHT_MSG_COMMIT_HASH_FAILED_STR "Directory commit hash update failed"
+#define DHT_MSG_LK_ARRAY_INFO_STR "lk info"
+#define DHT_MSG_UNLOCK_GFID_FAILED_STR                                         \
+    "unlock failed on gfid: stale lock might be left"
+#define DHT_MSG_UNLOCKING_FAILED_STR "unlocking failed"
+#define DHT_MSG_UNLOCK_FOLLOW_ENTRYLK_STR "not unlocking following entrylks"
+#define DHT_MSG_COPY_FRAME_FAILED_STR "copy frame failed"
+#define DHT_MSG_UNLOCK_FOLLOW_LOCKS_STR "not unlocking following locks"
+#define DHT_MSG_INODELK_FAILED_STR "inodelk failed on subvol"
+#define DHT_MSG_LOCK_FRAME_FAILED_STR "memory allocation failed for lock_frame"
+#define DHT_MSG_LOCAL_LOCK_INIT_FAILED_STR "dht_local_lock_init failed"
+#define DHT_MSG_ENTRYLK_FAILED_AFT_INODELK_STR                                 \
+    "dht_blocking_entrylk failed after taking inodelk"
+#define DHT_MSG_BLOCK_INODELK_FAILED_STR "dht_blocking_inodelk failed"
+#define DHT_MSG_CALLOC_FAILED_STR "calloc failed"
+#define DHT_MSG_LOCK_ALLOC_FAILED_STR "lock allocation failed"
+#define DHT_MSG_ALLOC_FRAME_FAILED_NOT_UNLOCKING_FOLLOWING_ENTRYLKS_STR        \
+    "cannot allocate a frame, not unlocking following entrylks"
+#define DHT_MSG_LOCAL_LOCKS_STORE_FAILED_UNLOCKING_FOLLOWING_ENTRYLK_STR       \
+    "storing locks in local failed, not unlocking following entrylks"
+#define DHT_MSG_DST_NULL_SET_FAILED_STR                                        \
+    "src or dst is NULL, Failed to set dictionary value"
+
+#endif /* _DHT_MESSAGES_H_ */
diff --git a/xlators/cluster/dht/src/dht-rebalance.c b/xlators/cluster/dht/src/dht-rebalance.c
new file mode 100644
index 00000000000..8ba8082bd86
--- /dev/null
+++ b/xlators/cluster/dht/src/dht-rebalance.c
@@ -0,0 +1,4702 @@
+/*
+  Copyright (c) 2008-2012 Red Hat, Inc. <http://www.redhat.com>
+  This file is part of GlusterFS.
+
+  This file is licensed to you under your choice of the GNU Lesser
+  General Public License, version 3 or any later version (LGPLv3 or
+  later), or the GNU General Public License, version 2 (GPLv2), in all
+  cases as published by the Free Software Foundation.
+*/
+
+#include "dht-common.h"
+#include <glusterfs/syscall.h>
+#include <fnmatch.h>
+#include <signal.h>
+#include <glusterfs/events.h>
+#include "glusterfs/compat-errno.h"  // for ENODATA on BSD
+
+#define GF_DISK_SECTOR_SIZE 512
+#define DHT_REBALANCE_PID 4242        /* Change it if required */
+#define DHT_REBALANCE_BLKSIZE 1048576 /* 1 MB */
+#define MAX_MIGRATE_QUEUE_COUNT 500
+#define MIN_MIGRATE_QUEUE_COUNT 200
+#define MAX_REBAL_TYPE_SIZE 16
+#define FILE_CNT_INTERVAL 600       /* 10 mins */
+#define ESTIMATE_START_INTERVAL 600 /* 10 mins */
+#define HARDLINK_MIG_INPROGRESS -2
+#define SKIP_MIGRATION_FD_POSITIVE -3
+#ifndef MAX
+#define MAX(a, b) (((a) > (b)) ? (a) : (b))
+#endif
+/* Step idx forward by one, wrapping modulo sv_cnt. idx is evaluated twice. */
+#define GF_CRAWL_INDEX_MOVE(idx, sv_cnt)                                       \
+    {                                                                          \
+        (idx)++;                                                               \
+        (idx) %= (sv_cnt);                                                     \
+    }
+
+uint64_t g_totalfiles = 0;
+uint64_t g_totalsize = 0;
+
+/* Release a dir_dfmeta: per-index dirent queues and fds, then its arrays. */
+void
+gf_defrag_free_dir_dfmeta(struct dir_dfmeta *meta, int local_subvols_cnt)
+{
+    int idx;
+
+    if (!meta)
+        return;
+    for (idx = 0; idx < local_subvols_cnt; idx++) {
+        if (meta->equeue)
+            gf_dirent_free(&meta->equeue[idx]);
+        if (meta->lfd && meta->lfd[idx])
+            fd_unref(meta->lfd[idx]);
+    }
+    GF_FREE(meta->equeue);
+    GF_FREE(meta->head);
+    GF_FREE(meta->iterator);
+    GF_FREE(meta->offset_var);
+    GF_FREE(meta->fetch_entries);
+    GF_FREE(meta->lfd);
+    GF_FREE(meta);
+}
+
+/* Free 'container' together with its dirent entry and its parent loc. */
+void
+gf_defrag_free_container(struct dht_container *container)
+{
+    if (!container)
+        return;
+
+    gf_dirent_entry_free(container->df_entry);
+
+    if (container->parent_loc != NULL)
+        loc_wipe(container->parent_loc);
+    GF_FREE(container->parent_loc);
+
+    GF_FREE(container);
+}
+
+/*
+ * Store 'ret' in defrag->global_error while holding defrag->lock.
+ */
+void
+dht_set_global_defrag_error(gf_defrag_info_t *defrag, int ret)
+{
+    LOCK(&defrag->lock);
+    defrag->global_error = ret;
+    UNLOCK(&defrag->lock);
+}
+
+/* Send the rebalance COMPLETE/FAILED/STOPPED event for this volume; other
+ * statuses send nothing. 'cmd' is not used. Always returns -1. */
+static int
+dht_send_rebalance_event(xlator_t *this, int cmd, gf_defrag_status_t status)
+{
+    int ret = -1;
+    char *volname = NULL;
+    char *tmpstr = NULL;
+    char *suffix = "-dht";
+    size_t name_len = strlen(this->name);
+    size_t suffix_len = strlen(suffix);
+    eventtypes_t event = EVENT_LAST;
+
+    switch (status) {
+        case GF_DEFRAG_STATUS_COMPLETE:
+            event = EVENT_VOLUME_REBALANCE_COMPLETE;
+            break;
+        case GF_DEFRAG_STATUS_FAILED:
+            event = EVENT_VOLUME_REBALANCE_FAILED;
+            break;
+        case GF_DEFRAG_STATUS_STOPPED:
+            event = EVENT_VOLUME_REBALANCE_STOP;
+            break;
+        default:
+            break;
+    }
+
+    /* The volume name is the xlator name without its "-dht" suffix. Compare
+     * lengths first so that a name shorter than the suffix never produces
+     * an offset in front of the buffer. */
+    tmpstr = gf_strdup(this->name);
+    if (tmpstr && name_len >= suffix_len &&
+        !strcmp(tmpstr + (name_len - suffix_len), suffix)) {
+        tmpstr[name_len - suffix_len] = '\0';
+        volname = tmpstr;
+    }
+
+    if (!volname) {
+        /* Better than nothing */
+        volname = this->name;
+    }
+
+    if (event != EVENT_LAST) {
+        gf_event(event, "volume=%s", volname);
+    }
+
+    GF_FREE(tmpstr);
+    return ret;
+}
+
+static void
+dht_strip_out_acls(dict_t *dict)
+{
+    if (!dict)
+        return; /* no dict, nothing to strip */
+    dict_del(dict, "trusted.SGI_ACL_FILE");
+    dict_del(dict, POSIX_ACL_ACCESS_XATTR);
+}
+
+/*
+   return values:
+   -1 : failure
+   -2 : success
+
+Hard link migration is carried out in three stages.
+
+(Say there are n hardlinks)
+Stage 1: Setting the new hashed subvol information on the 1st hardlink
+         encountered (linkto setxattr)
+
+Stage 2: Creating hardlinks on new hashed subvol for the 2nd to (n-1)th
+         hardlink
+
+Stage 3: Physical migration of the data file for nth hardlink
+
+Why "-2" is deemed success rather than "0":
+
+   dht_migrate_file expects return value "0" from _is_file_migratable if
+the file has to be migrated.
+
+   _is_file_migratable returns zero only when it is called with the
+flag "GF_DHT_MIGRATE_HARDLINK_IN_PROGRESS".
+
+   gf_defrag_handle_hardlink calls dht_migrate_file for physical migration
+of the data file with the flag "GF_DHT_MIGRATE_HARDLINK_IN_PROGRESS"
+
+Hence, if gf_defrag_handle_hardlink returned "0" for success, it would force
+"dht_migrate_file" to migrate each of the hardlinks, which is not intended.
+
+For each of the three stages mentioned above, "-2" is returned and is
+converted to "0" in dht_migrate_file.
+
+*/
+
+int32_t
+gf_defrag_handle_hardlink(xlator_t *this, loc_t *loc, int *fop_errno)
+{
+    int32_t ret = -1;
+    xlator_t *cached_subvol = NULL;
+    xlator_t *hashed_subvol = NULL;
+    xlator_t *linkto_subvol = NULL;
+    data_t *data = NULL;
+    struct iatt iatt = {
+        0,
+    };
+    int32_t op_errno = 0;
+    dht_conf_t *conf = NULL;
+    gf_loglevel_t loglevel = 0;
+    dict_t *link_xattr = NULL;
+    dict_t *dict = NULL;
+    dict_t *xattr_rsp = NULL;
+    struct iatt stbuf = {
+        0,
+    };
+
+    *fop_errno = EINVAL;
+
+    GF_VALIDATE_OR_GOTO("defrag", loc, out);
+    GF_VALIDATE_OR_GOTO("defrag", loc->name, out);
+    GF_VALIDATE_OR_GOTO("defrag", this, out);
+    GF_VALIDATE_OR_GOTO("defrag", this->private, out);
+
+    conf = this->private;
+
+    if (gf_uuid_is_null(loc->pargfid)) {
+        gf_msg("", GF_LOG_ERROR, 0, DHT_MSG_MIGRATE_FILE_FAILED,
+               "Migrate file failed :"
+               "loc->pargfid is NULL for %s",
+               loc->path);
+        *fop_errno = EINVAL;
+        ret = -1;
+        goto out;
+    }
+
+    if (gf_uuid_is_null(loc->gfid)) {
+        gf_msg("", GF_LOG_ERROR, 0, DHT_MSG_MIGRATE_FILE_FAILED,
+               "Migrate file failed :"
+               "loc->gfid is NULL for %s",
+               loc->path);
+        *fop_errno = EINVAL;
+        ret = -1;
+        goto out;
+    }
+
+    link_xattr = dict_new();
+    if (!link_xattr) {
+        ret = -1;
+        *fop_errno = ENOMEM;
+        goto out;
+    }
+
+    /*
+      Parallel migration can lead to migration of the hard link multiple
+      times which can lead to data loss. Hence, adding a fresh lookup to
+      decide whether migration is required or not.
+
+      Elaborating the scenario for let say 10 hardlinks [link{1..10}]:
+          Let say the first hard link "link1"  does the setxattr of the
+      new hashed subvolume info on the cached file. As there are multiple
+      threads working, we might have already all the links created on the
+      new hashed by the time we reach hardlink let say link5. Now the
+      number of links on hashed is equal to that of cached. Hence, file
+      migration will happen for link6.
+
+             Cached                                 Hashed
+      --------T link6                        rwxrwxrwx   link6
+
+      Now post above state all the link file on the cached will be zero
+      byte linkto files. Hence, if we still do migration for the following
+      files link{7..10}, we will end up migrating 0 data leading to data
+      loss.
+            Hence, a lookup can make sure whether we need to migrate the
+      file or not.
+    */
+
+    dict = dict_new();
+    if (!dict) {
+        ret = -1;
+        *fop_errno = ENOMEM;
+        gf_msg(this->name, GF_LOG_ERROR, ENOMEM, DHT_MSG_NO_MEMORY,
+               "could not allocate memory for dict");
+        goto out;
+    }
+
+    ret = dict_set_int32(dict, conf->link_xattr_name, 256);
+    if (ret) {
+        *fop_errno = ENOMEM;
+        ret = -1;
+        gf_msg(this->name, GF_LOG_ERROR, 0, DHT_MSG_MIGRATE_FILE_FAILED,
+               "Migrate file failed:"
+               "%s: failed to set 'linkto' key in dict",
+               loc->path);
+        goto out;
+    }
+
+    ret = syncop_lookup(this, loc, &stbuf, NULL, dict, &xattr_rsp);
+    if (ret) {
+        /*Ignore ENOENT and ESTALE as file might have been
+          migrated already*/
+        if (-ret == ENOENT || -ret == ESTALE) {
+            ret = -2;
+            goto out;
+        }
+        gf_msg(this->name, GF_LOG_ERROR, -ret, DHT_MSG_MIGRATE_FILE_FAILED,
+               "Migrate file failed:%s lookup failed with ret = %d", loc->path,
+               ret);
+        *fop_errno = -ret;
+        ret = -1;
+        goto out;
+    }
+
+    cached_subvol = dht_subvol_get_cached(this, loc->inode);
+    if (!cached_subvol) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, DHT_MSG_MIGRATE_FILE_FAILED,
+               "Migrate file failed :"
+               "Failed to get cached subvol"
+               " for %s on %s",
+               loc->name, this->name);
+        *fop_errno = EINVAL;
+        ret = -1;
+        goto out;
+    }
+
+    hashed_subvol = dht_subvol_get_hashed(this, loc);
+    if (!hashed_subvol) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, DHT_MSG_MIGRATE_FILE_FAILED,
+               "Migrate file failed :"
+               "Failed to get hashed subvol"
+               " for %s on %s",
+               loc->name, this->name);
+        *fop_errno = EINVAL;
+        ret = -1;
+        goto out;
+    }
+
+    /* Hardlink migration happens only with remove-brick. So this condition will
+     * be true only when the migration has happened. In case hardlinks are
+     * migrated for rebalance case, remove this check. Having this check here
+     * avoid redundant calls below*/
+    if (hashed_subvol == cached_subvol) {
+        ret = -2;
+        goto out;
+    }
+
+    gf_log(this->name, GF_LOG_INFO,
+           "Attempting to migrate hardlink %s "
+           "with gfid %s from %s -> %s",
+           loc->name, uuid_utoa(loc->gfid), cached_subvol->name,
+           hashed_subvol->name);
+
+    data = dict_get(xattr_rsp, conf->link_xattr_name);
+    /* set linkto on cached -> hashed if not present, else link it */
+    if (!data) {
+        ret = dict_set_str(link_xattr, conf->link_xattr_name,
+                           hashed_subvol->name);
+        if (ret) {
+            gf_msg(this->name, GF_LOG_ERROR, 0, DHT_MSG_MIGRATE_FILE_FAILED,
+                   "Migrate file failed :"
+                   "Failed to set dictionary value:"
+                   " key = %s for %s",
+                   conf->link_xattr_name, loc->name);
+            *fop_errno = ENOMEM;
+            ret = -1;
+            goto out;
+        }
+
+        ret = syncop_setxattr(cached_subvol, loc, link_xattr, 0, NULL, NULL);
+        if (ret) {
+            gf_msg(this->name, GF_LOG_ERROR, -ret, DHT_MSG_MIGRATE_FILE_FAILED,
+                   "Migrate file failed :"
+                   "Linkto setxattr failed %s -> %s",
+                   cached_subvol->name, loc->name);
+            *fop_errno = -ret;
+            ret = -1;
+            goto out;
+        }
+
+        gf_msg_debug(this->name, 0,
+                     "hardlink target subvol created on %s "
+                     ",cached %s, file %s",
+                     hashed_subvol->name, cached_subvol->name, loc->path);
+
+        ret = -2;
+        goto out;
+    } else {
+        linkto_subvol = dht_linkfile_subvol(this, NULL, NULL, xattr_rsp);
+        if (!linkto_subvol) {
+            gf_msg(this->name, GF_LOG_ERROR, 0, DHT_MSG_SUBVOL_ERROR,
+                   "Failed to get "
+                   "linkto subvol for %s",
+                   loc->name);
+        } else {
+            hashed_subvol = linkto_subvol;
+        }
+
+        ret = syncop_link(hashed_subvol, loc, loc, &iatt, NULL, NULL);
+        if (ret) {
+            op_errno = -ret;
+            ret = -1;
+
+            loglevel = (op_errno == EEXIST) ? GF_LOG_DEBUG : GF_LOG_ERROR;
+            gf_msg(this->name, loglevel, op_errno,
+                   DHT_MSG_MIGRATE_HARDLINK_FILE_FAILED,
+                   "link of %s -> %s"
+                   " failed on  subvol %s",
+                   loc->name, uuid_utoa(loc->gfid), hashed_subvol->name);
+            if (op_errno != EEXIST) {
+                *fop_errno = op_errno;
+                goto out;
+            }
+        } else {
+            gf_msg_debug(this->name, 0,
+                         "syncop_link successful for"
+                         " hardlink %s on subvol %s, cached %s",
+                         loc->path, hashed_subvol->name, cached_subvol->name);
+        }
+    }
+
+    ret = syncop_lookup(hashed_subvol, loc, &iatt, NULL, NULL, NULL);
+    if (ret) {
+        gf_msg(this->name, GF_LOG_ERROR, -ret, DHT_MSG_MIGRATE_FILE_FAILED,
+               "Migrate file failed :Failed lookup %s on %s ", loc->name,
+               hashed_subvol->name);
+
+        *fop_errno = -ret;
+        ret = -1;
+        goto out;
+    }
+
+    /* There is a race where on the target subvol for the hardlink
+     * (note: hash subvol for the hardlink might differ from this), some
+     * other client(non-rebalance) would have created a linkto file for that
+     * hardlink as part of lookup. So let say there are 10 hardlinks, on the
+     * 5th hardlink it self the hardlinks might have migrated. Now for
+     * (6..10th) hardlinks the cached and target would be same as the file
+     * has already migrated. Hence this check is needed  */
+    if (cached_subvol == hashed_subvol) {
+        gf_msg_debug(this->name, 0,
+                     "source %s and destination %s "
+                     "for hardlink %s are same",
+                     cached_subvol->name, hashed_subvol->name, loc->path);
+        ret = -2;
+        goto out;
+    }
+
+    if (iatt.ia_nlink == stbuf.ia_nlink) {
+        ret = dht_migrate_file(this, loc, cached_subvol, hashed_subvol,
+                               GF_DHT_MIGRATE_HARDLINK_IN_PROGRESS, fop_errno);
+        if (ret) {
+            goto out;
+        }
+    }
+    ret = -2;
+out:
+    if (link_xattr)
+        dict_unref(link_xattr);
+
+    if (xattr_rsp)
+        dict_unref(xattr_rsp);
+
+    if (dict)
+        dict_unref(dict);
+
+    return ret;
+}
+
+/*
+ * Hardlink gate used before migrating a file.
+ *
+ * Returns 0 when no hardlink handling is required: either the caller is
+ * already in the middle of a hardlink migration
+ * (GF_DHT_MIGRATE_HARDLINK_IN_PROGRESS) or the file has a single link.
+ *
+ * For a file with more than one link:
+ *   - with GF_DHT_MIGRATE_HARDLINK the work is delegated to
+ *     gf_defrag_handle_hardlink() under conf->link_lock and its return
+ *     value is passed straight back to the caller;
+ *   - with any other flag the file is skipped: *fop_errno is set to
+ *     ENOTSUP and 1 is returned.
+ */
+static int
+__check_file_has_hardlink(xlator_t *this, loc_t *loc, struct iatt *stbuf,
+                          dict_t *xattrs, int flags, gf_defrag_info_t *defrag,
+                          dht_conf_t *conf, int *fop_errno)
+{
+    int status = 0;
+
+    /* Nothing to decide: migration of the links is already under way, or
+     * the file simply has no additional links. */
+    if (flags == GF_DHT_MIGRATE_HARDLINK_IN_PROGRESS || stbuf->ia_nlink <= 1)
+        return 0;
+
+    if (flags != GF_DHT_MIGRATE_HARDLINK) {
+        gf_msg(this->name, GF_LOG_WARNING, 0, DHT_MSG_MIGRATE_FILE_FAILED,
+               "Migration skipped for:"
+               "%s: file has hardlinks",
+               loc->path);
+        *fop_errno = ENOTSUP;
+        return 1;
+    }
+
+    /* support for decommission: hardlink handling is serialised through
+     * link_lock */
+    synclock_lock(&conf->link_lock);
+    status = gf_defrag_handle_hardlink(this, loc, fop_errno);
+    synclock_unlock(&conf->link_lock);
+
+    /*
+     * Returning zero will force the file to be remigrated.
+     * Checkout gf_defrag_handle_hardlink for more information.
+     * -2 is not an error, so only the remaining non-zero values are logged.
+     */
+    if (status != 0 && status != -2) {
+        gf_msg(this->name, GF_LOG_WARNING, 0, DHT_MSG_MIGRATE_FILE_FAILED,
+               "Migrate file failed:"
+               "%s: failed to migrate file with link",
+               loc->path);
+    }
+
+    return status;
+}
+
+/*
+ * Decide whether the file described by @loc / @stbuf may be migrated.
+ *
+ *   return values
+ *     0 : File will be migrated
+ *     1 : File is skipped, *fop_errno is ENOTSUP (the file holds posix
+ *         locks while lock migration is disabled, or
+ *         __check_file_has_hardlink refused a file with hardlinks)
+ *    -2 : File will not be migrated
+ *         (This is the return value from gf_defrag_handle_hardlink. Checkout
+ *         gf_defrag_handle_hardlink for description of "returning -2")
+ *    -1 : failure (*fop_errno is EISDIR for a directory, EINVAL when the
+ *         lock count is missing from @xattrs)
+ */
+static int
+__is_file_migratable(xlator_t *this, loc_t *loc, struct iatt *stbuf,
+                     dict_t *xattrs, int flags, gf_defrag_info_t *defrag,
+                     dht_conf_t *conf, int *fop_errno)
+{
+    int lock_count = 0;
+
+    /* Directories are never migrated through this path. */
+    if (IA_ISDIR(stbuf->ia_type)) {
+        gf_msg(this->name, GF_LOG_WARNING, 0, DHT_MSG_MIGRATE_FILE_FAILED,
+               "Migrate file failed:"
+               "%s: migrate-file called on directory",
+               loc->path);
+        *fop_errno = EISDIR;
+        return -1;
+    }
+
+    /* The posix lock count only matters when locks cannot be migrated
+     * along with the file. */
+    if (!conf->lock_migration_enabled) {
+        if (dict_get_int32(xattrs, GLUSTERFS_POSIXLK_COUNT, &lock_count)) {
+            gf_msg(this->name, GF_LOG_WARNING, 0, DHT_MSG_MIGRATE_FILE_FAILED,
+                   "Migrate file failed:"
+                   "%s: Unable to get lock count for file",
+                   loc->path);
+            *fop_errno = EINVAL;
+            return -1;
+        }
+
+        if (lock_count != 0) {
+            gf_msg(this->name, GF_LOG_WARNING, 0, DHT_MSG_MIGRATE_FILE_FAILED,
+                   "Migrate file failed: %s: File has locks."
+                   " Skipping file migration",
+                   loc->path);
+            *fop_errno = ENOTSUP;
+            return 1;
+        }
+    }
+
+    /* The last gate is the hardlink check; its verdict is the final one. */
+    return __check_file_has_hardlink(this, loc, stbuf, xattrs, flags, defrag,
+                                     conf, fop_errno);
+}
+
+/*
+ * Create (or open, when it already exists) the migration target for @loc on
+ * subvolume @to and prepare it to receive the data of the source file.
+ *
+ * Steps performed:
+ *   1. Lookup @loc on @to (with GF_CLEAN_WRITE_PROTECTION set in xdata). An
+ *      existing file whose gfid differs from stbuf->ia_gfid is an error.
+ *   2. Open the existing file, or create it with DHT_LINKFILE_MODE and a
+ *      dict carrying "gfid-req" and the linkto xattr naming @from.
+ *   3. Bind the fd and lookup again to make sure the file is still there
+ *      with the expected gfid.
+ *   4. Copy uid/gid from @stbuf (a failure is logged but is not fatal).
+ *   5. For non-empty files, reserve the size with fallocate (when
+ *      conf->use_fallocate is set and the file has no holes) or set it with
+ *      ftruncate.
+ *
+ * @dst_fd receives the bound fd on success; ownership of that reference
+ * passes to the caller. If @dst_fd is NULL the fd is released here.
+ *
+ * Returns 0 on success and -1 on failure with *fop_errno set. Note that
+ * *fop_errno may also be written on the non-fatal failures of steps 4 and 5
+ * even though 0 is returned.
+ */
+static int
+__dht_rebalance_create_dst_file(xlator_t *this, xlator_t *to, xlator_t *from,
+                                loc_t *loc, struct iatt *stbuf, fd_t **dst_fd,
+                                int *fop_errno, int file_has_holes)
+{
+    int ret = -1;
+    int ret2 = -1;
+    fd_t *fd = NULL;
+    struct iatt new_stbuf = {
+        0,
+    };
+    struct iatt check_stbuf = {
+        0,
+    };
+    dht_conf_t *conf = NULL;
+    dict_t *dict = NULL;
+    dict_t *xdata = NULL;
+
+    conf = this->private;
+
+    /* dict carries the xattrs handed to create: gfid-req and linkto */
+    dict = dict_new();
+    if (!dict) {
+        *fop_errno = ENOMEM;
+        ret = -1;
+        gf_msg(this->name, GF_LOG_ERROR, ENOMEM, DHT_MSG_NO_MEMORY,
+               "dictionary allocation failed for path:%s", loc->path);
+        goto out;
+    }
+    ret = dict_set_gfuuid(dict, "gfid-req", stbuf->ia_gfid, true);
+    if (ret) {
+        *fop_errno = ENOMEM;
+        ret = -1;
+        gf_msg(this->name, GF_LOG_ERROR, 0, DHT_MSG_DICT_SET_FAILED,
+               "%s: failed to set dictionary value: key = gfid-req", loc->path);
+        goto out;
+    }
+
+    ret = dict_set_str(dict, conf->link_xattr_name, from->name);
+    if (ret) {
+        *fop_errno = ENOMEM;
+        ret = -1;
+        gf_msg(this->name, GF_LOG_ERROR, 0, DHT_MSG_DICT_SET_FAILED,
+               "%s: failed to set dictionary value: key = %s ", loc->path,
+               conf->link_xattr_name);
+        goto out;
+    }
+
+    fd = fd_create(loc->inode, DHT_REBALANCE_PID);
+    if (!fd) {
+        *fop_errno = ENOMEM;
+        ret = -1;
+        gf_msg(this->name, GF_LOG_ERROR, ENOMEM, DHT_MSG_MIGRATE_FILE_FAILED,
+               "%s: fd create failed (destination)", loc->path);
+        goto out;
+    }
+
+    /* xdata is only used for the first lookup on the destination */
+    xdata = dict_new();
+    if (!xdata) {
+        *fop_errno = ENOMEM;
+        ret = -1;
+        gf_msg(this->name, GF_LOG_ERROR, ENOMEM, DHT_MSG_MIGRATE_FILE_FAILED,
+               "%s: dict_new failed", loc->path);
+        goto out;
+    }
+
+    ret = dict_set_int32_sizen(xdata, GF_CLEAN_WRITE_PROTECTION, 1);
+    if (ret) {
+        *fop_errno = ENOMEM;
+        ret = -1;
+        gf_msg(this->name, GF_LOG_ERROR, 0, DHT_MSG_DICT_SET_FAILED,
+               "%s: failed to set dictionary value: key = %s ", loc->path,
+               GF_CLEAN_WRITE_PROTECTION);
+        goto out;
+    }
+
+    ret = syncop_lookup(to, loc, &new_stbuf, NULL, xdata, NULL);
+    if (!ret) {
+        /* File exists in the destination, check if gfid matches */
+        if (gf_uuid_compare(stbuf->ia_gfid, new_stbuf.ia_gfid) != 0) {
+            gf_msg(this->name, GF_LOG_ERROR, 0, DHT_MSG_GFID_MISMATCH,
+                   "file %s exists in %s with different gfid", loc->path,
+                   to->name);
+            *fop_errno = EINVAL;
+            ret = -1;
+            goto out;
+        }
+    }
+    if ((ret < 0) && (-ret != ENOENT)) {
+        /* File exists in destination, but not accessible */
+        gf_msg(this->name, GF_LOG_WARNING, -ret, DHT_MSG_MIGRATE_FILE_FAILED,
+               "%s: failed to lookup file", loc->path);
+        *fop_errno = -ret;
+        ret = -1;
+        goto out;
+    }
+
+    /* Create the destination with LINKFILE mode, and linkto xattr,
+       if the linkfile already exists, just open the file */
+    if (!ret) {
+        /*
+         * File already present, just open the file.
+         */
+        ret = syncop_open(to, loc, O_RDWR, fd, NULL, NULL);
+        if (ret < 0) {
+            gf_msg(this->name, GF_LOG_ERROR, -ret, DHT_MSG_MIGRATE_FILE_FAILED,
+                   "failed to open %s on %s", loc->path, to->name);
+            *fop_errno = -ret;
+            ret = -1;
+            goto out;
+        }
+    } else {
+        /* ret can only be -ENOENT here: the file has to be created */
+        ret = syncop_create(to, loc, O_RDWR, DHT_LINKFILE_MODE, fd, &new_stbuf,
+                            dict, NULL);
+        if (ret < 0) {
+            gf_msg(this->name, GF_LOG_ERROR, -ret, DHT_MSG_MIGRATE_FILE_FAILED,
+                   "failed to create %s on %s", loc->path, to->name);
+            *fop_errno = -ret;
+            ret = -1;
+            goto out;
+        }
+    }
+
+    fd_bind(fd);
+
+    /*Reason of doing lookup after create again:
+     *In the create, there is some time-gap between opening fd at the
+     *server (posix_layer) and binding it in server (incrementing fd count),
+     *so if in that time-gap, if other process sends unlink considering it
+     *as a linkto file, because inode->fd count will be 0, so file will be
+     *unlinked at the backend. And because further operations are performed
+     *on fd, so though migration will be done but will end with no file
+     *at  the backend.
+     */
+
+    ret = syncop_lookup(to, loc, &check_stbuf, NULL, NULL, NULL);
+    if (!ret) {
+        if (gf_uuid_compare(stbuf->ia_gfid, check_stbuf.ia_gfid) != 0) {
+            gf_msg(this->name, GF_LOG_ERROR, 0, DHT_MSG_GFID_MISMATCH,
+                   "file %s exists in %s with different gfid, "
+                   "found in lookup after create",
+                   loc->path, to->name);
+            *fop_errno = EINVAL;
+            ret = -1;
+            goto out;
+        }
+    }
+
+    /* Only ENOENT is fatal for the verification lookup; any other lookup
+     * error is not acted upon here and the function carries on. */
+    if (-ret == ENOENT) {
+        gf_msg(this->name, GF_LOG_ERROR, -ret, DHT_MSG_MIGRATE_FILE_FAILED,
+               "%s: file does not exist on %s", loc->path, to->name);
+        *fop_errno = -ret;
+        ret = -1;
+        goto out;
+    }
+
+    /* Best effort: a chown failure is logged but does not abort. */
+    ret = syncop_fsetattr(to, fd, stbuf, (GF_SET_ATTR_UID | GF_SET_ATTR_GID),
+                          NULL, NULL, NULL, NULL);
+    if (ret < 0) {
+        *fop_errno = -ret;
+        gf_msg(this->name, GF_LOG_ERROR, -ret, DHT_MSG_MIGRATE_FILE_FAILED,
+               "chown failed for %s on %s", loc->path, to->name);
+    }
+
+    /* No need to bother about 0 byte size files */
+    if (stbuf->ia_size > 0) {
+        if (conf->use_fallocate && !file_has_holes) {
+            ret = syncop_fallocate(to, fd, 0, 0, stbuf->ia_size, NULL, NULL);
+            if (ret < 0) {
+                if (ret == -EOPNOTSUPP || ret == -EINVAL || ret == -ENOSYS) {
+                    /* Destination cannot fallocate: stop trying for the
+                     * following files and carry on with this one. */
+                    conf->use_fallocate = _gf_false;
+                } else {
+                    gf_msg(this->name, GF_LOG_ERROR, -ret,
+                           DHT_MSG_MIGRATE_FILE_FAILED,
+                           "fallocate failed for %s on %s", loc->path,
+                           to->name);
+
+                    *fop_errno = -ret;
+
+                    /* fallocate does not release the space
+                     * in some cases
+                     */
+                    ret2 = syncop_ftruncate(to, fd, 0, NULL, NULL, NULL, NULL);
+                    if (ret2 < 0) {
+                        gf_msg(this->name, GF_LOG_WARNING, -ret2,
+                               DHT_MSG_MIGRATE_FILE_FAILED,
+                               "ftruncate failed for "
+                               "%s on %s",
+                               loc->path, to->name);
+                    }
+                    /* Report failure the same way every other error path
+                     * of this function does; the errno is in *fop_errno. */
+                    ret = -1;
+                    goto out;
+                }
+            }
+        } else {
+            /* Best effort: a failure to pre-size the file is only logged. */
+            ret = syncop_ftruncate(to, fd, stbuf->ia_size, NULL, NULL, NULL,
+                                   NULL);
+            if (ret < 0) {
+                *fop_errno = -ret;
+                gf_msg(this->name, GF_LOG_WARNING, -ret,
+                       DHT_MSG_MIGRATE_FILE_FAILED,
+                       "ftruncate failed for %s on %s", loc->path, to->name);
+            }
+        }
+    }
+
+    /* success */
+    ret = 0;
+
+    if (dst_fd) {
+        /* hand our reference over to the caller */
+        *dst_fd = fd;
+        fd = NULL;
+    }
+
+out:
+    /* fd is still ours on every failure path, and on success when the
+     * caller did not ask for it; drop the reference so it is not leaked. */
+    if (fd)
+        fd_unref(fd);
+
+    if (dict)
+        dict_unref(dict);
+
+    if (xdata)
+        dict_unref(xdata);
+
+    return ret;
+}
+
+/*
+ * Check whether subvolume @to has room for the file described by @loc and
+ * @stbuf, and pick a different target when it does not.
+ *
+ * statfs is fetched from both @from and @to (with
+ * GF_INTERNAL_IGNORE_DEEM_STATFS set in xdata) and converted to units of
+ * GF_DISK_SECTOR_SIZE.
+ *
+ * When @flag is GF_DHT_MIGRATE_DATA the post-migration free-space
+ * percentages of source and destination are compared first; if the
+ * destination would end up with a lower percentage than the source the file
+ * is skipped. For every other flag that comparison is bypassed.
+ *
+ * Afterwards conf->min_free_disk is enforced on the destination (as a
+ * percentage when conf->disk_unit is 'p', otherwise against the available
+ * blocks scaled by GF_DISK_SECTOR_SIZE). If the limit would be crossed, a
+ * replacement subvolume is requested from
+ * dht_subvol_with_free_space_inodes().
+ *
+ * Return values:
+ *    0 : migration may proceed. If *target_changed is _gf_true, *new_subvol
+ *        holds the replacement destination.
+ *    1 : file skipped, *fop_errno is ENOSPC (not a failure, not a success).
+ *   -1 : failure, *fop_errno is set.
+ */
+static int
+__dht_check_free_space(xlator_t *this, xlator_t *to, xlator_t *from, loc_t *loc,
+                       struct iatt *stbuf, int flag, dht_conf_t *conf,
+                       gf_boolean_t *target_changed, xlator_t **new_subvol,
+                       int *fop_errno)
+{
+    struct statvfs src_statfs = {
+        0,
+    };
+    struct statvfs dst_statfs = {
+        0,
+    };
+    int ret = -1;
+    dict_t *xdata = NULL;
+    dht_layout_t *layout = NULL;
+    uint64_t src_statfs_blocks = 1;
+    uint64_t dst_statfs_blocks = 1;
+    double dst_post_availspacepercent = 0;
+    double src_post_availspacepercent = 0;
+    uint64_t file_blocks = 0;
+    uint64_t src_total_blocks = 0;
+    uint64_t dst_total_blocks = 0;
+
+    xdata = dict_new();
+    if (!xdata) {
+        *fop_errno = ENOMEM;
+        ret = -1;
+        gf_msg(this->name, GF_LOG_ERROR, ENOMEM, DHT_MSG_NO_MEMORY,
+               "failed to allocate dictionary");
+        goto out;
+    }
+
+    ret = dict_set_int8(xdata, GF_INTERNAL_IGNORE_DEEM_STATFS, 1);
+    if (ret) {
+        gf_log(this->name, GF_LOG_ERROR,
+               "Failed to set " GF_INTERNAL_IGNORE_DEEM_STATFS " in dict");
+        ret = -1;
+        *fop_errno = ENOMEM;
+        goto out;
+    }
+
+    ret = syncop_statfs(from, loc, &src_statfs, xdata, NULL);
+    if (ret) {
+        gf_msg(this->name, GF_LOG_ERROR, -ret, DHT_MSG_MIGRATE_FILE_FAILED,
+               "failed to get statfs of %s on %s", loc->path, from->name);
+        *fop_errno = -ret;
+        ret = -1;
+        goto out;
+    }
+
+    ret = syncop_statfs(to, loc, &dst_statfs, xdata, NULL);
+    if (ret) {
+        gf_msg(this->name, GF_LOG_ERROR, -ret, DHT_MSG_MIGRATE_FILE_FAILED,
+               "failed to get statfs of %s on %s", loc->path, to->name);
+        *fop_errno = -ret;
+        ret = -1;
+        goto out;
+    }
+
+    gf_msg_debug(this->name, 0,
+                 "min_free_disk - %f , block available - %" PRId64
+                 ", block size - %lu",
+                 conf->min_free_disk, dst_statfs.f_bavail, dst_statfs.f_bsize);
+
+    /* Normalise available and total space to GF_DISK_SECTOR_SIZE units.
+     * The totals become 0 when statfs reports no blocks, or a fragment size
+     * smaller than GF_DISK_SECTOR_SIZE. */
+    dst_statfs_blocks = dst_statfs.f_bavail *
+                        (dst_statfs.f_frsize / GF_DISK_SECTOR_SIZE);
+
+    src_statfs_blocks = src_statfs.f_bavail *
+                        (src_statfs.f_frsize / GF_DISK_SECTOR_SIZE);
+
+    dst_total_blocks = dst_statfs.f_blocks *
+                       (dst_statfs.f_frsize / GF_DISK_SECTOR_SIZE);
+
+    src_total_blocks = src_statfs.f_blocks *
+                       (src_statfs.f_frsize / GF_DISK_SECTOR_SIZE);
+
+    /* if force option is given, do not check for space @ dst.
+     * Check only if space is avail for the file */
+    if (flag != GF_DHT_MIGRATE_DATA)
+        goto check_avail_space;
+
+    /* Check:
+       During rebalance `migrate-data` - Destination subvol experiences
+       a `reduction` in 'blocks' of free space, at the same time source
+       subvol gains certain 'blocks' of free space. A valid check is
+       necessary here to avoid erroneous move to destination where
+       the space could be scantily available.
+       With heterogeneous brick support, an actual space comparison could
+       prevent any files being migrated to newly added bricks if they are
+       smaller then the free space available on the existing bricks.
+     */
+    if (!conf->use_fallocate) {
+        /* round the file size up to whole sectors */
+        file_blocks = stbuf->ia_size + GF_DISK_SECTOR_SIZE - 1;
+        file_blocks /= GF_DISK_SECTOR_SIZE;
+
+        if (file_blocks >= dst_statfs_blocks) {
+            dst_statfs_blocks = 0;
+        } else {
+            dst_statfs_blocks -= file_blocks;
+        }
+    }
+
+    /* The percentages are computed with integer arithmetic (whole percent).
+     * A subvolume that reports zero total blocks keeps its percentage at 0
+     * instead of dividing by zero. */
+    if (src_total_blocks) {
+        src_post_availspacepercent = ((src_statfs_blocks + file_blocks) *
+                                      100) /
+                                     src_total_blocks;
+    }
+
+    if (dst_total_blocks) {
+        dst_post_availspacepercent = (dst_statfs_blocks * 100) /
+                                     dst_total_blocks;
+    }
+
+    if (dst_post_availspacepercent < src_post_availspacepercent) {
+        gf_msg(this->name, GF_LOG_WARNING, 0, DHT_MSG_MIGRATE_FILE_FAILED,
+               "data movement of file "
+               "{blocks:%" PRIu64
+               " name:(%s)} would result in "
+               "dst node (%s:%" PRIu64
+               ") having lower disk "
+               "space than the source node (%s:%" PRIu64
+               ")"
+               ".Skipping file.",
+               stbuf->ia_blocks, loc->path, to->name, dst_statfs_blocks,
+               from->name, src_statfs_blocks);
+
+        /* this is not a 'failure', but we don't want to
+           consider this as 'success' too :-/ */
+        *fop_errno = ENOSPC;
+        ret = 1;
+        goto out;
+    }
+
+check_avail_space:
+    /* Guard on the divisor itself: dst_total_blocks is zero whenever
+     * f_blocks is zero, and also when f_frsize is smaller than
+     * GF_DISK_SECTOR_SIZE. With no usable total the code falls through to
+     * find_new_subvol, as it already did for f_blocks == 0. */
+    if (conf->disk_unit == 'p' && dst_total_blocks) {
+        dst_post_availspacepercent = (dst_statfs_blocks * 100) /
+                                     dst_total_blocks;
+
+        gf_msg_debug(this->name, 0,
+                     "file : %s, post_availspacepercent"
+                     " : %lf f_bavail : %" PRIu64 " min-free-disk: %lf",
+                     loc->path, dst_post_availspacepercent, dst_statfs.f_bavail,
+                     conf->min_free_disk);
+
+        if (dst_post_availspacepercent < conf->min_free_disk) {
+            gf_msg(this->name, GF_LOG_WARNING, 0, 0,
+                   "Write will cross min-free-disk for "
+                   "file - %s on subvol - %s. Looking "
+                   "for new subvol",
+                   loc->path, to->name);
+
+            goto find_new_subvol;
+        } else {
+            ret = 0;
+            goto out;
+        }
+    }
+
+    if (conf->disk_unit != 'p') {
+        if ((dst_statfs_blocks * GF_DISK_SECTOR_SIZE) < conf->min_free_disk) {
+            gf_msg_debug(this->name, 0,
+                         "file : %s,  destination frsize: %lu "
+                         "f_bavail : %" PRIu64 " min-free-disk: %lf",
+                         loc->path, dst_statfs.f_frsize, dst_statfs.f_bavail,
+                         conf->min_free_disk);
+
+            gf_msg(this->name, GF_LOG_WARNING, 0, 0,
+                   "write will"
+                   " cross min-free-disk for file - %s on subvol -"
+                   " %s. looking for new subvol",
+                   loc->path, to->name);
+
+            goto find_new_subvol;
+
+        } else {
+            ret = 0;
+            goto out;
+        }
+    }
+
+find_new_subvol:
+    /* NOTE(review): the layout returned by dht_layout_get() is not released
+     * anywhere in this function - confirm whether it holds a reference that
+     * needs a matching unref. */
+    layout = dht_layout_get(this, loc->parent);
+    if (!layout) {
+        gf_log(this->name, GF_LOG_ERROR, "Layout is NULL");
+        *fop_errno = EINVAL;
+        ret = -1;
+        goto out;
+    }
+
+    *new_subvol = dht_subvol_with_free_space_inodes(this, to, from, layout,
+                                                    stbuf->ia_size);
+    if ((!(*new_subvol)) || (*new_subvol == from)) {
+        /* no candidate, or the only candidate is where the file already is */
+        gf_msg(this->name, GF_LOG_WARNING, 0, DHT_MSG_SUBVOL_INSUFF_SPACE,
+               "Could not find any subvol"
+               " with space accommodating the file - %s. Consider "
+               "adding bricks",
+               loc->path);
+
+        *target_changed = _gf_false;
+        *fop_errno = ENOSPC;
+        ret = -1;
+    } else {
+        gf_msg(this->name, GF_LOG_INFO, 0, 0,
+               "new target found - %s"
+               " for file - %s",
+               (*new_subvol)->name, loc->path);
+        *target_changed = _gf_true;
+        ret = 0;
+    }
+
+out:
+    if (xdata)
+        dict_unref(xdata);
+    return ret;
+}
+
+/*
+ * Copy the contents of the open source fd @src (on subvolume @from) to the
+ * open destination fd @dst (on subvolume @to).
+ *
+ * The copy runs in chunks of at most DHT_REBALANCE_BLKSIZE bytes until
+ * @ia_size bytes have been written. When @hole_exists is non-zero only the
+ * data segments are copied: they are located with GF_SEEK_DATA /
+ * GF_SEEK_HOLE, and the loop also ends when the seek reports that no data
+ * segment is left (-ENXIO).
+ *
+ * Unless conf->force_migration is set, every write carries an xdata dict
+ * with GF_AVOID_OVERWRITE set.
+ *
+ * @defrag is not referenced by this function.
+ *
+ * Returns 0 on success and -1 on failure; on failure *fop_errno holds the
+ * error (a read that returns 0 bytes is reported as ENOSPC).
+ */
+static int
+__dht_rebalance_migrate_data(xlator_t *this, gf_defrag_info_t *defrag,
+                             xlator_t *from, xlator_t *to, fd_t *src, fd_t *dst,
+                             uint64_t ia_size, int hole_exists, int *fop_errno)
+{
+    int ret = 0;
+    int count = 0;
+    off_t offset = 0;
+    off_t data_offset = 0;
+    off_t hole_offset = 0;
+    struct iovec *vector = NULL;
+    struct iobref *iobref = NULL;
+    uint64_t total = 0;
+    size_t read_size = 0;
+    size_t data_block_size = 0;
+    dict_t *xdata = NULL;
+    dht_conf_t *conf = NULL;
+
+    conf = this->private;
+
+    /* if file size is '0', no need to enter this loop */
+    while (total < ia_size) {
+        /* This is a regular file - read it sequentially */
+        if (!hole_exists) {
+            read_size = (((ia_size - total) > DHT_REBALANCE_BLKSIZE)
+                             ? DHT_REBALANCE_BLKSIZE
+                             : (ia_size - total));
+        } else {
+            /* This is a sparse file - read only the data segments in the file
+             */
+
+            /* If the previous data block is fully copied, find the next data
+             * segment
+             * starting at the offset of the last read and written byte,  */
+            /* data_block_size is a size_t, so this test is true exactly when
+             * it is zero */
+            if (data_block_size <= 0) {
+                ret = syncop_seek(from, src, offset, GF_SEEK_DATA, NULL,
+                                  &data_offset);
+                if (ret) {
+                    if (ret == -ENXIO)
+                        ret = 0; /* No more data segments */
+                    else
+                        *fop_errno = -ret; /* Error occurred */
+
+                    break;
+                }
+
+                /* If the position of the current data segment is greater than
+                 * the position of the next hole, find the next hole in order to
+                 * calculate the length of the new data segment */
+                if (data_offset > hole_offset) {
+                    /* Starting at the offset of the last data segment, find the
+                     * next hole */
+                    ret = syncop_seek(from, src, data_offset, GF_SEEK_HOLE,
+                                      NULL, &hole_offset);
+                    if (ret) {
+                        /* If an error occurred here it's a real error because
+                         * if the seek for a data segment was successful then
+                         * necessarily another hole must exist (EOF is a hole)
+                         */
+                        *fop_errno = -ret;
+                        break;
+                    }
+
+                    /* Calculate the total size of the current data block */
+                    data_block_size = hole_offset - data_offset;
+                }
+            } else {
+                /* There is still data in the current segment, move the
+                 * data_offset to the position of the last written byte */
+                data_offset = offset;
+            }
+
+            /* Calculate how much data needs to be read and written. If the data
+             * segment's length is bigger than DHT_REBALANCE_BLKSIZE, read and
+             * write DHT_REBALANCE_BLKSIZE data length and the rest in the
+             * next iteration(s) */
+            read_size = ((data_block_size > DHT_REBALANCE_BLKSIZE)
+                             ? DHT_REBALANCE_BLKSIZE
+                             : data_block_size);
+
+            /* Calculate the remaining size of the data block - maybe there's no
+             * need to seek for data in the next iteration */
+            data_block_size -= read_size;
+
+            /* Set offset to the offset of the data segment so read and write
+             * will have the correct position */
+            offset = data_offset;
+        }
+
+        ret = syncop_readv(from, src, read_size, offset, 0, &vector, &count,
+                           &iobref, NULL, NULL, NULL);
+
+        /* Both a failed read (ret < 0) and an empty read (ret == 0) end the
+         * copy with an error */
+        if (!ret || (ret < 0)) {
+            if (!ret) {
+                /* File was probably truncated*/
+                ret = -1;
+                *fop_errno = ENOSPC;
+            } else {
+                *fop_errno = -ret;
+            }
+            break;
+        }
+
+        if (!conf->force_migration) {
+            /* xdata is built on the first pass only and reused for every
+             * following write */
+            if (!xdata) {
+                xdata = dict_new();
+                if (!xdata) {
+                    gf_msg("dht", GF_LOG_ERROR, 0, DHT_MSG_MIGRATE_FILE_FAILED,
+                           "insufficient memory");
+                    ret = -1;
+                    *fop_errno = ENOMEM;
+                    break;
+                }
+
+                /* Fail this write and abort rebalance if we
+                 * detect a write from client since migration of
+                 * this file started. This is done to avoid
+                 * potential data corruption due to out of order
+                 * writes from rebalance and client to the same
+                 * region (as compared between src and dst
+                 * files). See
+                 * https://github.com/gluster/glusterfs/issues/308
+                 * for more details.
+                 */
+                ret = dict_set_int32_sizen(xdata, GF_AVOID_OVERWRITE, 1);
+                if (ret) {
+                    gf_msg("dht", GF_LOG_ERROR, 0, ENOMEM,
+                           "failed to set dict");
+                    ret = -1;
+                    *fop_errno = ENOMEM;
+                    break;
+                }
+            }
+        }
+
+        ret = syncop_writev(to, dst, vector, count, offset, iobref, 0, NULL,
+                            NULL, xdata, NULL);
+        if (ret < 0) {
+            *fop_errno = -ret;
+            break;
+        }
+
+        /* advance by the number of bytes the write reported */
+        offset += ret;
+        total += ret;
+
+        /* release this iteration's read buffers; the pointers are reset so
+         * the cleanup after the loop does not release them a second time */
+        GF_FREE(vector);
+        if (iobref)
+            iobref_unref(iobref);
+        iobref = NULL;
+        vector = NULL;
+    }
+    /* release whatever a 'break' out of the loop left behind */
+    if (iobref)
+        iobref_unref(iobref);
+    GF_FREE(vector);
+
+    /* collapse the last syncop result into 0 (success) or -1 (failure) */
+    if (ret >= 0)
+        ret = 0;
+    else
+        ret = -1;
+
+    if (xdata) {
+        dict_unref(xdata);
+    }
+
+    return ret;
+}
+
+/*
+ * Open the source file of a migration and mark it as being migrated.
+ *
+ * Steps, in order:
+ *   1. create an fd on loc->inode and open the file O_RDWR on 'from';
+ *   2. set the linkto xattr (conf->link_xattr_name) on the source, with
+ *      to->name as its value;
+ *   3. set the sticky and sgid mode bits on the source, keeping the rest of
+ *      the mode from 'stbuf'.
+ *
+ * Parameters:
+ *   this      - dht xlator; this->private supplies the linkto xattr name.
+ *   from, to  - source and destination subvolumes.
+ *   loc       - file being migrated.
+ *   stbuf     - supplies ia_prot/ia_type used as the base for the new mode.
+ *   src_fd    - out: receives the opened fd as soon as the open succeeded,
+ *               even if a later step fails. This function does not release
+ *               the fd after that point.
+ *   clean_src - out: _gf_false on entry, _gf_true once the linkto xattr has
+ *               been set on the source (i.e. the source needs reverting if
+ *               the migration fails).
+ *   fop_errno - out: positive errno describing the failure.
+ *
+ * Returns 0 on success, -1 on failure.
+ */
+static int
+__dht_rebalance_open_src_file(xlator_t *this, xlator_t *from, xlator_t *to,
+                              loc_t *loc, struct iatt *stbuf, fd_t **src_fd,
+                              gf_boolean_t *clean_src, int *fop_errno)
+{
+    int ret = 0;
+    fd_t *fd = NULL;
+    dict_t *dict = NULL;
+    struct iatt iatt = {
+        0,
+    };
+    dht_conf_t *conf = NULL;
+
+    conf = this->private;
+
+    *clean_src = _gf_false;
+
+    fd = fd_create(loc->inode, DHT_REBALANCE_PID);
+    if (!fd) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, DHT_MSG_MIGRATE_FILE_FAILED,
+               "%s: fd create failed (source)", loc->path);
+        *fop_errno = ENOMEM;
+        ret = -1;
+        goto out;
+    }
+
+    ret = syncop_open(from, loc, O_RDWR, fd, NULL, NULL);
+    if (ret < 0) {
+        gf_msg(this->name, GF_LOG_ERROR, -ret, DHT_MSG_MIGRATE_FILE_FAILED,
+               "failed to open file %s on %s", loc->path, from->name);
+        *fop_errno = -ret;
+        ret = -1;
+        /* The fd has not been handed to the caller yet, so nobody else
+         * can release it: drop the reference obtained from fd_create()
+         * here, otherwise it is leaked. */
+        fd_unref(fd);
+        fd = NULL;
+        goto out;
+    }
+
+    fd_bind(fd);
+
+    /* From here on the fd is reachable through *src_fd; it is not released
+     * by this function, including on the failure paths below. */
+    if (src_fd)
+        *src_fd = fd;
+
+    dict = dict_new();
+    if (!dict) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, DHT_MSG_MIGRATE_FILE_FAILED,
+               "%s: Could not allocate memory for dict", loc->path);
+        *fop_errno = ENOMEM;
+        ret = -1;
+        goto out;
+    }
+
+    ret = dict_set_str(dict, conf->link_xattr_name, to->name);
+    if (ret) {
+        gf_log(this->name, GF_LOG_ERROR,
+               "failed to set xattr in dict for %s (linkto:%s)", loc->path,
+               to->name);
+        *fop_errno = ENOMEM;
+        ret = -1;
+        goto out;
+    }
+
+    /* Once the migration starts, the source should have 'linkto' key set
+       to show which is the target, so other clients can work around it */
+    ret = syncop_setxattr(from, loc, dict, 0, NULL, NULL);
+    if (ret) {
+        gf_msg(this->name, GF_LOG_ERROR, -ret, DHT_MSG_MIGRATE_FILE_FAILED,
+               "failed to set xattr on %s in %s", loc->path, from->name);
+        *fop_errno = -ret;
+        ret = -1;
+        goto out;
+    }
+
+    /* The source now carries the linkto xattr: tell the caller that the
+     * source mode/xattr must be reset if the migration fails. */
+    *clean_src = _gf_true;
+
+    /* mode should be (+S+T) to indicate migration is in progress */
+    iatt.ia_prot = stbuf->ia_prot;
+    iatt.ia_type = stbuf->ia_type;
+    iatt.ia_prot.sticky = 1;
+    iatt.ia_prot.sgid = 1;
+
+    ret = syncop_setattr(from, loc, &iatt, GF_SET_ATTR_MODE, NULL, NULL, NULL,
+                         NULL);
+    if (ret) {
+        gf_msg(this->name, GF_LOG_ERROR, -ret, DHT_MSG_MIGRATE_FILE_FAILED,
+               "failed to set mode on %s in %s", loc->path, from->name);
+        *fop_errno = -ret;
+        ret = -1;
+        goto out;
+    }
+
+    /* success */
+    ret = 0;
+out:
+    if (dict)
+        dict_unref(dict);
+
+    return ret;
+}
+
+/*
+ * Recreate the file described by 'buf' on the 'to' subvolume and then remove
+ * it from the 'from' subvolume. Symlinks are recreated with
+ * readlink + symlink, every other type with mknod.
+ *
+ * If the file already exists on 'to' it must be a linkfile (it is then
+ * unlinked first); any other existing file aborts the migration with EINVAL.
+ *
+ * A failing setattr on the new file is logged and recorded in *fop_errno but
+ * does not stop the source from being unlinked.
+ *
+ * Returns 0 on success and -1 on failure, with a positive errno stored in
+ * *fop_errno.
+ */
+int
+migrate_special_files(xlator_t *this, xlator_t *from, xlator_t *to, loc_t *loc,
+                      struct iatt *buf, int *fop_errno)
+{
+    dht_conf_t *conf = this->private;
+    dict_t *xattr_req = NULL;
+    dict_t *xattr_rsp = NULL;
+    char *target = NULL;
+    struct iatt dst_iatt = {
+        0,
+    };
+    int attr_mask = (GF_SET_ATTR_MTIME | GF_SET_ATTR_UID | GF_SET_ATTR_GID |
+                     GF_SET_ATTR_MODE);
+    int ret = -1;
+
+    xattr_req = dict_new();
+    if (xattr_req == NULL) {
+        *fop_errno = ENOMEM;
+        ret = -1;
+        goto out;
+    }
+
+    /* Put the linkto key into the request dict handed to the lookup below */
+    if (dict_set_int32(xattr_req, conf->link_xattr_name, 256) != 0) {
+        gf_log(this->name, GF_LOG_ERROR,
+               "%s: failed to set 'linkto' key in dict", loc->path);
+        *fop_errno = ENOMEM;
+        ret = -1;
+        goto out;
+    }
+
+    /* Look the file up on the destination; ENOENT is the expected case */
+    ret = syncop_lookup(to, loc, &dst_iatt, NULL, xattr_req, &xattr_rsp);
+    if ((ret < 0) && (ret != -ENOENT)) {
+        gf_msg(this->name, GF_LOG_WARNING, -ret, DHT_MSG_MIGRATE_FILE_FAILED,
+               "%s: lookup failed", loc->path);
+        *fop_errno = -ret;
+        ret = -1;
+        goto out;
+    }
+
+    /* The linkto key is no longer needed in the request dict */
+    dict_del(xattr_req, conf->link_xattr_name);
+
+    if (ret == 0) {
+        /* Something already exists on the destination: it is acceptable
+           only if it is a linkfile, which is then removed */
+        if (!check_is_linkfile(loc->inode, &dst_iatt, xattr_rsp,
+                               conf->link_xattr_name)) {
+            gf_msg(this->name, GF_LOG_WARNING, 0, DHT_MSG_MIGRATE_FILE_FAILED,
+                   "%s: file exists in destination", loc->path);
+            *fop_errno = EINVAL;
+            ret = -1;
+            goto out;
+        }
+
+        ret = syncop_unlink(to, loc, NULL, NULL);
+        if (ret != 0) {
+            gf_msg(this->name, GF_LOG_WARNING, -ret,
+                   DHT_MSG_MIGRATE_FILE_FAILED,
+                   "%s: failed to delete the linkfile", loc->path);
+            *fop_errno = -ret;
+            ret = -1;
+            goto out;
+        }
+    }
+
+    /* Pass the gfid of the source file along with the create request */
+    ret = dict_set_gfuuid(xattr_req, "gfid-req", buf->ia_gfid, true);
+    if (ret != 0) {
+        gf_log(this->name, GF_LOG_ERROR,
+               "%s: failed to set gfid in dict for create", loc->path);
+        *fop_errno = ENOMEM;
+        ret = -1;
+        goto out;
+    }
+
+    /* Create the file on the target */
+    if (IA_ISLNK(buf->ia_type)) {
+        /* Symlink: read the link target from the source and recreate it */
+        ret = syncop_readlink(from, loc, &target, buf->ia_size, NULL, NULL);
+        if (ret < 0) {
+            gf_msg(this->name, GF_LOG_WARNING, -ret,
+                   DHT_MSG_MIGRATE_FILE_FAILED,
+                   "%s: readlink on symlink failed", loc->path);
+            *fop_errno = -ret;
+            ret = -1;
+            goto out;
+        }
+
+        ret = syncop_symlink(to, loc, target, 0, xattr_req, NULL);
+        if (ret != 0) {
+            gf_msg(this->name, GF_LOG_WARNING, -ret,
+                   DHT_MSG_MIGRATE_FILE_FAILED, "%s: creating symlink failed",
+                   loc->path);
+            *fop_errno = -ret;
+            ret = -1;
+            goto out;
+        }
+    } else {
+        /* Any other type: recreate the node with the source mode and rdev */
+        ret = syncop_mknod(
+            to, loc, st_mode_from_ia(buf->ia_prot, buf->ia_type),
+            makedev(ia_major(buf->ia_rdev), ia_minor(buf->ia_rdev)), 0,
+            xattr_req, NULL);
+        if (ret != 0) {
+            gf_msg(this->name, GF_LOG_WARNING, -ret,
+                   DHT_MSG_MIGRATE_FILE_FAILED, "%s: mknod failed", loc->path);
+            *fop_errno = -ret;
+            ret = -1;
+            goto out;
+        }
+    }
+
+    /* Copy mtime, ownership and mode to the new file. A failure here is
+       only recorded; the source is unlinked regardless */
+    ret = syncop_setattr(to, loc, buf, attr_mask, NULL, NULL, NULL, NULL);
+    if (ret != 0) {
+        gf_msg(this->name, GF_LOG_WARNING, -ret, DHT_MSG_MIGRATE_FILE_FAILED,
+               "%s: failed to perform setattr on %s", loc->path, to->name);
+        *fop_errno = -ret;
+    }
+
+    /* Finally remove the file from the source */
+    ret = syncop_unlink(from, loc, NULL, NULL);
+    if (ret != 0) {
+        gf_msg(this->name, GF_LOG_WARNING, -ret, DHT_MSG_MIGRATE_FILE_FAILED,
+               "%s: unlink failed", loc->path);
+        *fop_errno = -ret;
+        ret = -1;
+    }
+
+out:
+    GF_FREE(target);
+
+    if (xattr_req != NULL)
+        dict_unref(xattr_req);
+
+    if (xattr_rsp != NULL)
+        dict_unref(xattr_rsp);
+
+    return ret;
+}
+
+/*
+ * Undo the markers placed on the source file for a migration:
+ *   - clear the sticky and sgid bits, unless the corresponding bit is set in
+ *     'src_ia_prot';
+ *   - remove the linkto xattr (conf->link_xattr_name).
+ *
+ * All operations go through 'fd' on the 'from' subvolume; 'loc' is only used
+ * for log messages.
+ *
+ * Returns 0 on success, -1 if an argument is NULL or any step fails. The
+ * steps are attempted in order and the first failure ends the cleanup.
+ */
+static int
+__dht_migration_cleanup_src_file(xlator_t *this, loc_t *loc, fd_t *fd,
+                                 xlator_t *from, ia_prot_t *src_ia_prot)
+{
+    dht_conf_t *conf = NULL;
+    struct iatt cur_stat = {
+        0,
+    };
+    int op_ret = -1;
+
+    if ((this == NULL) || (fd == NULL) || (from == NULL) ||
+        (src_ia_prot == NULL)) {
+        return -1;
+    }
+
+    conf = this->private;
+
+    /* Fetch the current attributes of the source as the base to modify */
+    op_ret = syncop_fstat(from, fd, &cur_stat, NULL, NULL);
+    if (op_ret < 0) {
+        gf_msg(this->name, GF_LOG_ERROR, -op_ret, DHT_MSG_MIGRATE_FILE_FAILED,
+               "Migrate file cleanup failed: failed to fstat "
+               "file %s on %s ",
+               loc->path, from->name);
+        return -1;
+    }
+
+    /* Drop sgid/sticky only when 'src_ia_prot' does not have them set */
+    if (!src_ia_prot->sgid)
+        cur_stat.ia_prot.sgid = 0;
+
+    if (!src_ia_prot->sticky)
+        cur_stat.ia_prot.sticky = 0;
+
+    op_ret = syncop_fsetattr(from, fd, &cur_stat,
+                             (GF_SET_ATTR_GID | GF_SET_ATTR_MODE), NULL, NULL,
+                             NULL, NULL);
+    if (op_ret != 0) {
+        gf_msg(this->name, GF_LOG_WARNING, -op_ret,
+               DHT_MSG_MIGRATE_FILE_FAILED,
+               "Migrate file cleanup failed:"
+               "%s: failed to perform fsetattr on %s ",
+               loc->path, from->name);
+        return -1;
+    }
+
+    /* Remove the linkto xattr from the source */
+    op_ret = syncop_fremovexattr(from, fd, conf->link_xattr_name, 0, NULL);
+    if (op_ret != 0) {
+        gf_log(this->name, GF_LOG_WARNING,
+               "%s: failed to remove linkto xattr on %s (%s)", loc->path,
+               from->name, strerror(-op_ret));
+        return -1;
+    }
+
+    return 0;
+}
+
+/*
+  return values:
+
+   -1 : failure
+    0 : successfully migrated data
+    1 : not a failure, but we can't migrate data as of now
+*/
+int
+dht_migrate_file(xlator_t *this, loc_t *loc, xlator_t *from, xlator_t *to,
+                 int flag, int *fop_errno)
+{
+    int ret = -1;
+    struct iatt new_stbuf = {
+        0,
+    };
+    struct iatt stbuf = {
+        0,
+    };
+    struct iatt empty_iatt = {
+        0,
+    };
+    ia_prot_t src_ia_prot = {
+        0,
+    };
+    fd_t *src_fd = NULL;
+    fd_t *dst_fd = NULL;
+    dict_t *dict = NULL;
+    dict_t *xattr = NULL;
+    dict_t *xattr_rsp = NULL;
+    int file_has_holes = 0;
+    dht_conf_t *conf = this->private;
+    int rcvd_enoent_from_src = 0;
+    struct gf_flock flock = {
+        0,
+    };
+    struct gf_flock plock = {
+        0,
+    };
+    loc_t tmp_loc = {
+        0,
+    };
+    loc_t parent_loc = {
+        0,
+    };
+    gf_boolean_t inodelk_locked = _gf_false;
+    gf_boolean_t entrylk_locked = _gf_false;
+    gf_boolean_t p_locked = _gf_false;
+    int lk_ret = -1;
+    gf_defrag_info_t *defrag = NULL;
+    gf_boolean_t clean_src = _gf_false;
+    gf_boolean_t clean_dst = _gf_false;
+    int log_level = GF_LOG_INFO;
+    gf_boolean_t delete_src_linkto = _gf_true;
+    lock_migration_info_t locklist;
+    dict_t *meta_dict = NULL;
+    gf_boolean_t meta_locked = _gf_false;
+    gf_boolean_t target_changed = _gf_false;
+    xlator_t *new_target = NULL;
+    xlator_t *old_target = NULL;
+    xlator_t *hashed_subvol = NULL;
+    fd_t *linkto_fd = NULL;
+    dict_t *xdata = NULL;
+
+    if (from == to) {
+        gf_msg_debug(this->name, 0,
+                     "destination and source are same. file %s"
+                     " might have migrated already",
+                     loc->path);
+        ret = 0;
+        goto out;
+    }
+
+    gf_log(this->name, log_level, "%s: attempting to move from %s to %s",
+           loc->path, from->name, to->name);
+
+    dict = dict_new();
+    if (!dict) {
+        ret = -1;
+        *fop_errno = ENOMEM;
+        gf_msg(this->name, GF_LOG_ERROR, ENOMEM, DHT_MSG_NO_MEMORY,
+               "Could not allocate memory for dict");
+        goto out;
+    }
+    ret = dict_set_int32(dict, conf->link_xattr_name, 256);
+    if (ret) {
+        *fop_errno = ENOMEM;
+        ret = -1;
+        gf_msg(this->name, GF_LOG_ERROR, 0, DHT_MSG_MIGRATE_FILE_FAILED,
+               "Migrate file failed:"
+               "%s: failed to set 'linkto' key in dict",
+               loc->path);
+        goto out;
+    }
+
+    /* Do not migrate file in case lock migration is not enabled on the
+     * volume*/
+    if (!conf->lock_migration_enabled) {
+        ret = dict_set_int32(dict, GLUSTERFS_POSIXLK_COUNT, sizeof(int32_t));
+        if (ret) {
+            *fop_errno = ENOMEM;
+            ret = -1;
+            gf_msg(this->name, GF_LOG_ERROR, 0, DHT_MSG_MIGRATE_FILE_FAILED,
+                   "Migrate file failed: %s: failed to "
+                   "set " GLUSTERFS_POSIXLK_COUNT " key in dict",
+                   loc->path);
+            goto out;
+        }
+    } else {
+        gf_msg(this->name, GF_LOG_INFO, 0, 0,
+               "locks will be migrated"
+               " for file: %s",
+               loc->path);
+    }
+
+    /* The file is locked to prevent a rename during a migration. Renames
+     * and migrations on the file at the same time can lead to data loss.
+     */
+
+    ret = dht_build_parent_loc(this, &parent_loc, loc, fop_errno);
+    if (ret < 0) {
+        ret = -1;
+        gf_msg(this->name, GF_LOG_WARNING, *fop_errno,
+               DHT_MSG_MIGRATE_FILE_FAILED,
+               "%s: failed to build parent loc, which is needed to "
+               "acquire entrylk to synchronize with renames on this "
+               "path. Skipping migration",
+               loc->path);
+        goto out;
+    }
+
+    hashed_subvol = dht_subvol_get_hashed(this, loc);
+    if (hashed_subvol == NULL) {
+        ret = -1;
+        gf_msg(this->name, GF_LOG_WARNING, EINVAL, DHT_MSG_MIGRATE_FILE_FAILED,
+               "%s: cannot find hashed subvol which is needed to "
+               "synchronize with renames on this path. "
+               "Skipping migration",
+               loc->path);
+        goto out;
+    }
+
+    flock.l_type = F_WRLCK;
+
+    tmp_loc.inode = inode_ref(loc->inode);
+    gf_uuid_copy(tmp_loc.gfid, loc->gfid);
+    tmp_loc.path = gf_strdup(loc->path);
+
+    /* this inodelk happens with flock.owner being zero. But to synchronize
+     * hardlink migration we need to have different lkowner for each migration
+     * Filed a bug here: https://bugzilla.redhat.com/show_bug.cgi?id=1468202 to
+     * track the fix for this. Currently synclock takes care of synchronizing
+     * hardlink migration. Once this bug is fixed we can avoid taking synclock
+     */
+    ret = syncop_inodelk(from, DHT_FILE_MIGRATE_DOMAIN, &tmp_loc, F_SETLKW,
+                         &flock, NULL, NULL);
+    if (ret < 0) {
+        *fop_errno = -ret;
+        ret = -1;
+        gf_msg(this->name, GF_LOG_WARNING, *fop_errno,
+               DHT_MSG_MIGRATE_FILE_FAILED,
+               "migrate file failed: "
+               "%s: failed to lock file on %s",
+               loc->path, from->name);
+        goto out;
+    }
+
+    inodelk_locked = _gf_true;
+
+    /* dht_rename has changed to use entrylk on hashed subvol for
+     * synchronization. So, rebalance too has to acquire an entrylk on
+     * hashed subvol.
+     */
+    ret = syncop_entrylk(hashed_subvol, DHT_ENTRY_SYNC_DOMAIN, &parent_loc,
+                         loc->name, ENTRYLK_LOCK, ENTRYLK_WRLCK, NULL, NULL);
+    if (ret < 0) {
+        *fop_errno = -ret;
+        ret = -1;
+        gf_msg(this->name, GF_LOG_WARNING, *fop_errno,
+               DHT_MSG_MIGRATE_FILE_FAILED,
+               "%s: failed to acquire entrylk on subvol %s", loc->path,
+               hashed_subvol->name);
+        goto out;
+    }
+
+    entrylk_locked = _gf_true;
+
+    /* Phase 1 - Data migration is in progress from now on */
+    ret = syncop_lookup(from, loc, &stbuf, NULL, dict, &xattr_rsp);
+    if (ret) {
+        *fop_errno = -ret;
+        ret = -1;
+        gf_msg(this->name, GF_LOG_ERROR, *fop_errno,
+               DHT_MSG_MIGRATE_FILE_FAILED,
+               "Migrate file failed:"
+               "%s: lookup failed on %s",
+               loc->path, from->name);
+        goto out;
+    }
+
+    /* preserve source mode, so set the same to the destination */
+    src_ia_prot = stbuf.ia_prot;
+
+    /* Check if file can be migrated */
+    ret = __is_file_migratable(this, loc, &stbuf, xattr_rsp, flag, defrag, conf,
+                               fop_errno);
+    if (ret) {
+        if (ret == HARDLINK_MIG_INPROGRESS)
+            ret = 0;
+        goto out;
+    }
+
+    /* Take care of the special files */
+    if (!IA_ISREG(stbuf.ia_type)) {
+        /* Special files */
+        ret = migrate_special_files(this, from, to, loc, &stbuf, fop_errno);
+        goto out;
+    }
+
+    /* Try to preserve 'holes' while migrating data */
+    if (stbuf.ia_size > (stbuf.ia_blocks * GF_DISK_SECTOR_SIZE))
+        file_has_holes = 1;
+
+    /* create the destination, with required modes/xattr */
+    ret = __dht_rebalance_create_dst_file(this, to, from, loc, &stbuf, &dst_fd,
+                                          fop_errno, file_has_holes);
+    if (ret) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, 0,
+               "Create dst failed"
+               " on - %s for file - %s",
+               to->name, loc->path);
+        goto out;
+    }
+
+    clean_dst = _gf_true;
+
+    ret = __dht_check_free_space(this, to, from, loc, &stbuf, flag, conf,
+                                 &target_changed, &new_target, fop_errno);
+    if (target_changed) {
+        /* Can't handle for hardlinks. Marking this as failure */
+        if (flag == GF_DHT_MIGRATE_HARDLINK_IN_PROGRESS || stbuf.ia_nlink > 1) {
+            gf_msg(this->name, GF_LOG_ERROR, 0, DHT_MSG_SUBVOL_INSUFF_SPACE,
+                   "Exiting migration for"
+                   " file - %s. flag - %d, stbuf.ia_nlink - %d",
+                   loc->path, flag, stbuf.ia_nlink);
+            ret = -1;
+            goto out;
+        }
+
+        ret = syncop_ftruncate(to, dst_fd, 0, NULL, NULL, NULL, NULL);
+        if (ret) {
+            gf_log(this->name, GF_LOG_WARNING,
+                   "%s: failed to perform truncate on %s (%s)", loc->path,
+                   to->name, strerror(-ret));
+        }
+
+        syncop_close(dst_fd);
+        dst_fd = NULL;
+
+        old_target = to;
+        to = new_target;
+
+        clean_dst = _gf_false;
+
+        /* If the file migration to this new target is successful, then
+         * update the xattr on the old destination to point to the new
+         * destination. We need to update this only post migration,
+         * as in case of failure the linkto needs to point to the source
+         * subvol */
+        ret = __dht_rebalance_create_dst_file(
+            this, to, from, loc, &stbuf, &dst_fd, fop_errno, file_has_holes);
+        if (ret) {
+            gf_log(this->name, GF_LOG_ERROR,
+                   "Create dst failed"
+                   " on - %s for file - %s",
+                   to->name, loc->path);
+            goto out;
+        } else {
+            gf_msg(this->name, GF_LOG_INFO, 0, 0,
+                   "destination for file "
+                   "- %s is changed to - %s",
+                   loc->path, to->name);
+            clean_dst = _gf_true;
+        }
+    }
+
+    if (ret) {
+        goto out;
+    }
+
+    /* Open the source, and also update mode/xattr */
+    ret = __dht_rebalance_open_src_file(this, from, to, loc, &stbuf, &src_fd,
+                                        &clean_src, fop_errno);
+    if (ret) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, DHT_MSG_MIGRATE_FILE_FAILED,
+               "Migrate file failed: failed to open %s on %s", loc->path,
+               from->name);
+        goto out;
+    }
+
+    /* TODO: move all xattr related operations to fd based operations */
+    ret = syncop_listxattr(from, loc, &xattr, NULL, NULL);
+    if (ret < 0) {
+        *fop_errno = -ret;
+        gf_msg(this->name, GF_LOG_WARNING, *fop_errno,
+               DHT_MSG_MIGRATE_FILE_FAILED,
+               "Migrate file failed:"
+               "%s: failed to get xattr from %s",
+               loc->path, from->name);
+        ret = -1;
+        goto out;
+    }
+
+    /* Copying posix acls to the linkto file messes up the permissions*/
+    dht_strip_out_acls(xattr);
+
+    /* Remove the linkto xattr as we don't want to overwrite the value
+     * set on the dst.
+     */
+    dict_del(xattr, conf->link_xattr_name);
+
+    /* We need to error out if this fails as having the wrong shard xattrs
+     * set on the dst could cause data corruption
+     */
+    ret = syncop_fsetxattr(to, dst_fd, xattr, 0, NULL, NULL);
+    if (ret < 0) {
+        *fop_errno = -ret;
+        gf_msg(this->name, GF_LOG_WARNING, -ret, DHT_MSG_MIGRATE_FILE_FAILED,
+               "%s: failed to set xattr on %s", loc->path, to->name);
+        ret = -1;
+        goto out;
+    }
+
+    if (xattr_rsp) {
+        /* we no more require this key */
+        dict_del(dict, conf->link_xattr_name);
+        dict_unref(xattr_rsp);
+    }
+
+    ret = syncop_fstat(from, src_fd, &stbuf, dict, &xattr_rsp);
+    if (ret) {
+        gf_msg(this->name, GF_LOG_ERROR, -ret, DHT_MSG_MIGRATE_FILE_FAILED,
+               "Migrate file failed:failed to lookup %s on %s ", loc->path,
+               from->name);
+        *fop_errno = -ret;
+        ret = -1;
+        goto out;
+    }
+
+    /* Check again if file has hardlink */
+    ret = __check_file_has_hardlink(this, loc, &stbuf, xattr_rsp, flag, defrag,
+                                    conf, fop_errno);
+    if (ret) {
+        if (ret == HARDLINK_MIG_INPROGRESS)
+            ret = 0;
+        goto out;
+    }
+
+    ret = __dht_rebalance_migrate_data(this, defrag, from, to, src_fd, dst_fd,
+                                       stbuf.ia_size, file_has_holes,
+                                       fop_errno);
+    if (ret) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, DHT_MSG_MIGRATE_FILE_FAILED,
+               "Migrate file failed: %s: failed to migrate data", loc->path);
+
+        ret = -1;
+        goto out;
+    }
+
+    /* TODO: Sync the locks */
+
+    xdata = dict_new();
+    if (!xdata || dict_set_int8(xdata, "last-fsync", 1)) {
+        gf_log(this->name, GF_LOG_ERROR,
+               "%s: failed to set last-fsync flag on "
+               "%s (%s)",
+               loc->path, to->name, strerror(ENOMEM));
+    }
+
+    ret = syncop_fsync(to, dst_fd, 0, NULL, NULL, xdata, NULL);
+    if (ret) {
+        gf_log(this->name, GF_LOG_WARNING, "%s: failed to fsync on %s (%s)",
+               loc->path, to->name, strerror(-ret));
+        *fop_errno = -ret;
+    }
+
+    /* Phase 2 - Data-Migration Complete, Housekeeping updates pending */
+
+    ret = syncop_fstat(from, src_fd, &new_stbuf, NULL, NULL);
+    if (ret < 0) {
+        /* Failed to get the stat info */
+        gf_msg(this->name, GF_LOG_ERROR, -ret, DHT_MSG_MIGRATE_FILE_FAILED,
+               "Migrate file failed: failed to fstat file %s on %s ", loc->path,
+               from->name);
+        *fop_errno = -ret;
+        ret = -1;
+        goto out;
+    }
+
+    /* Lock the entire source file to prevent clients from taking a
+       lock on it as dht_lk does not handle file migration.
+
+       This still leaves a small window where conflicting locks can
+       be granted to different clients. If client1 requests a blocking
+       lock on the src file, it will be granted after the migrating
+       process releases its lock. If client2 requests a lock on the dst
+       data file, it will also be granted, but all FOPs will be redirected
+       to the dst data file.
+    */
+
+    /* Take meta lock  */
+
+    if (conf->lock_migration_enabled) {
+        meta_dict = dict_new();
+        if (!meta_dict) {
+            gf_msg(this->name, GF_LOG_ERROR, 0, DHT_MSG_MIGRATE_FILE_FAILED,
+                   "dict_new failed");
+
+            *fop_errno = ENOMEM;
+            ret = -1;
+            goto out;
+        }
+
+        ret = dict_set_str(meta_dict, GLUSTERFS_INTERNAL_FOP_KEY, "yes");
+        if (ret) {
+            gf_msg(this->name, GF_LOG_ERROR, 0, DHT_MSG_DICT_SET_FAILED,
+                   "Failed to set dictionary value: key = %s,"
+                   " path = %s",
+                   GLUSTERFS_INTERNAL_FOP_KEY, loc->path);
+            *fop_errno = ENOMEM;
+            ret = -1;
+            goto out;
+        }
+
+        ret = dict_set_int32(meta_dict, GF_META_LOCK_KEY, 1);
+        if (ret) {
+            gf_msg(this->name, GF_LOG_ERROR, 0, DHT_MSG_MIGRATE_FILE_FAILED,
+                   "Trace dict_set failed");
+            *fop_errno = ENOMEM;
+            ret = -1;
+            goto out;
+        }
+
+        ret = syncop_setxattr(from, loc, meta_dict, 0, NULL, NULL);
+        if (ret) {
+            gf_msg(this->name, GF_LOG_ERROR, -ret, DHT_MSG_MIGRATE_FILE_FAILED,
+                   "Trace syncop_setxattr metalock failed");
+
+            *fop_errno = -ret;
+            ret = -1;
+            goto out;
+        } else {
+            meta_locked = _gf_true;
+        }
+    }
+
+    if (!conf->lock_migration_enabled) {
+        plock.l_type = F_WRLCK;
+        plock.l_start = 0;
+        plock.l_len = 0;
+        plock.l_whence = SEEK_SET;
+
+        ret = syncop_lk(from, src_fd, F_SETLK, &plock, NULL, NULL);
+        if (ret) {
+            gf_msg(this->name, GF_LOG_ERROR, -ret, DHT_MSG_MIGRATE_FILE_FAILED,
+                   "Migrate file failed:"
+                   "%s: Failed to lock on %s",
+                   loc->path, from->name);
+            *fop_errno = -ret;
+            ret = -1;
+            goto out;
+        }
+
+        p_locked = _gf_true;
+
+    } else {
+        INIT_LIST_HEAD(&locklist.list);
+
+        ret = syncop_getactivelk(from, loc, &locklist, NULL, NULL);
+        if (ret == 0) {
+            gf_log(this->name, GF_LOG_INFO, "No active locks on:%s", loc->path);
+
+        } else if (ret > 0) {
+            ret = syncop_setactivelk(to, loc, &locklist, NULL, NULL);
+            if (ret) {
+                gf_msg(this->name, GF_LOG_ERROR, -ret,
+                       DHT_MSG_LOCK_MIGRATION_FAILED, "write lock failed on:%s",
+                       loc->path);
+
+                *fop_errno = -ret;
+                ret = -1;
+                goto metaunlock;
+            }
+        } else {
+            gf_msg(this->name, GF_LOG_ERROR, -ret,
+                   DHT_MSG_LOCK_MIGRATION_FAILED,
+                   "getactivelk failed for file: %s", loc->path);
+            *fop_errno = -ret;
+        }
+    }
+
+    /* source would have both sticky bit and sgid bit set, reset it to 0,
+       and set the source permission on destination, if it was not set
+       prior to setting rebalance-modes in source  */
+    if (!src_ia_prot.sticky)
+        new_stbuf.ia_prot.sticky = 0;
+
+    if (!src_ia_prot.sgid)
+        new_stbuf.ia_prot.sgid = 0;
+
+    /* TODO: if the source actually had sticky bit, or sgid bit set,
+       we are not handling it */
+
+    ret = syncop_fsetattr(
+        to, dst_fd, &new_stbuf,
+        (GF_SET_ATTR_UID | GF_SET_ATTR_GID | GF_SET_ATTR_MODE), NULL, NULL,
+        NULL, NULL);
+    if (ret) {
+        gf_msg(this->name, GF_LOG_WARNING, -ret, DHT_MSG_MIGRATE_FILE_FAILED,
+               "Migrate file failed:"
+               "%s: failed to perform setattr on %s ",
+               loc->path, to->name);
+        *fop_errno = -ret;
+        ret = -1;
+        goto metaunlock;
+    }
+
+    /* Because 'futimes' is not portable */
+    ret = syncop_setattr(to, loc, &new_stbuf,
+                         (GF_SET_ATTR_MTIME | GF_SET_ATTR_ATIME), NULL, NULL,
+                         NULL, NULL);
+    if (ret) {
+        gf_log(this->name, GF_LOG_WARNING,
+               "%s: failed to perform setattr on %s ", loc->path, to->name);
+        *fop_errno = -ret;
+    }
+
+    if (target_changed) {
+        dict_del(dict, GLUSTERFS_POSIXLK_COUNT);
+        ret = dict_set_str(dict, conf->link_xattr_name, to->name);
+        if (ret) {
+            gf_log(this->name, GF_LOG_ERROR,
+                   "failed to set xattr in dict for %s (linkto:%s)", loc->path,
+                   to->name);
+            *fop_errno = ENOMEM;
+            ret = -1;
+            goto out;
+        }
+
+        ret = syncop_setxattr(old_target, loc, dict, 0, NULL, NULL);
+        if (ret && -ret != ESTALE && -ret != ENOENT) {
+            gf_msg(this->name, GF_LOG_ERROR, -ret, DHT_MSG_MIGRATE_FILE_FAILED,
+                   "failed to set xattr on %s in %s", loc->path,
+                   old_target->name);
+            *fop_errno = -ret;
+            ret = -1;
+            goto out;
+        } else if (-ret == ESTALE || -ret == ENOENT) {
+            /* The failure ESTALE indicates that the linkto
+             * file on the hashed subvol might have been deleted.
+             * In this case will create a linkto file with new target
+             * as linkto xattr value*/
+            linkto_fd = fd_create(loc->inode, DHT_REBALANCE_PID);
+            if (!linkto_fd) {
+                gf_msg(this->name, GF_LOG_ERROR, errno,
+                       DHT_MSG_MIGRATE_FILE_FAILED, "%s: fd create failed",
+                       loc->path);
+                *fop_errno = ENOMEM;
+                ret = -1;
+                goto out;
+            }
+            ret = syncop_create(old_target, loc, O_RDWR, DHT_LINKFILE_MODE,
+                                linkto_fd, NULL, dict, NULL);
+            if (ret != 0 && -ret != EEXIST && -ret != ESTALE) {
+                *fop_errno = -ret;
+                ret = -1;
+                gf_msg(this->name, GF_LOG_ERROR, -ret,
+                       DHT_MSG_MIGRATE_FILE_FAILED,
+                       "failed to create linkto file on %s in %s", loc->path,
+                       old_target->name);
+                goto out;
+            } else if (ret == 0) {
+                ret = syncop_fsetattr(old_target, linkto_fd, &stbuf,
+                                      (GF_SET_ATTR_UID | GF_SET_ATTR_GID), NULL,
+                                      NULL, NULL, NULL);
+                if (ret < 0) {
+                    *fop_errno = -ret;
+                    gf_msg(this->name, GF_LOG_ERROR, -ret,
+                           DHT_MSG_MIGRATE_FILE_FAILED,
+                           "chown failed for %s on %s", loc->path,
+                           old_target->name);
+                }
+            }
+        }
+    }
+
+    clean_dst = _gf_false;
+
+    /* Posix acls are not set on DHT linkto files as part of the
+     * initial xattrs set on the dst file, so these need
+     * to be set on the dst file after the linkto attrs are removed.
+     * TODO: Optimize this.
+     */
+    if (xattr) {
+        dict_unref(xattr);
+        xattr = NULL;
+    }
+
+    /* Set only the Posix ACLs this time */
+    ret = syncop_getxattr(from, loc, &xattr, POSIX_ACL_ACCESS_XATTR, NULL,
+                          NULL);
+    if (ret < 0) {
+        if ((-ret != ENODATA) && (-ret != ENOATTR)) {
+            gf_msg(this->name, GF_LOG_WARNING, -ret,
+                   DHT_MSG_MIGRATE_FILE_FAILED,
+                   "Migrate file failed:"
+                   "%s: failed to get xattr from %s",
+                   loc->path, from->name);
+            *fop_errno = -ret;
+        }
+    } else {
+        ret = syncop_setxattr(to, loc, xattr, 0, NULL, NULL);
+        if (ret < 0) {
+            /* Potential problem here where Posix ACLs will
+             * not be set on the target file */
+
+            gf_msg(this->name, GF_LOG_WARNING, -ret,
+                   DHT_MSG_MIGRATE_FILE_FAILED,
+                   "Migrate file failed:"
+                   "%s: failed to set xattr on %s",
+                   loc->path, to->name);
+            *fop_errno = -ret;
+        }
+    }
+
+    /* The src file is being unlinked after this so we don't need
+       to clean it up */
+    clean_src = _gf_false;
+
+    /* Make the source as a linkfile first before deleting it */
+    empty_iatt.ia_prot.sticky = 1;
+    ret = syncop_fsetattr(from, src_fd, &empty_iatt, GF_SET_ATTR_MODE, NULL,
+                          NULL, NULL, NULL);
+    if (ret) {
+        gf_msg(this->name, GF_LOG_WARNING, -ret, DHT_MSG_MIGRATE_FILE_FAILED,
+               "Migrate file failed:"
+               "%s: failed to perform setattr on %s ",
+               loc->path, from->name);
+        *fop_errno = -ret;
+        ret = -1;
+        goto metaunlock;
+    }
+
+    /* Free up the data blocks on the source node, as the whole
+        file is migrated */
+    ret = syncop_ftruncate(from, src_fd, 0, NULL, NULL, NULL, NULL);
+    if (ret) {
+        gf_log(this->name, GF_LOG_WARNING,
+               "%s: failed to perform truncate on %s (%s)", loc->path,
+               from->name, strerror(-ret));
+        *fop_errno = -ret;
+    }
+
+    /* remove the 'linkto' xattr from the destination */
+    ret = syncop_fremovexattr(to, dst_fd, conf->link_xattr_name, 0, NULL);
+    if (ret) {
+        gf_log(this->name, GF_LOG_WARNING,
+               "%s: failed to perform removexattr on %s (%s)", loc->path,
+               to->name, strerror(-ret));
+        *fop_errno = -ret;
+    }
+
+    /* Do a stat and check the gfid before unlink */
+
+    /*
+     * Cached file changes its state from non-linkto to linkto file after
+     * migrating data. If lookup from any other mount-point is performed,
+     * converted-linkto-cached file will be treated as a stale and will be
+     * unlinked. But by this time, file is already migrated. So further
+     * failure because of ENOENT should  not be treated as error
+     */
+
+    ret = syncop_stat(from, loc, &empty_iatt, NULL, NULL);
+    if (ret) {
+        gf_msg(this->name, GF_LOG_WARNING, -ret, DHT_MSG_MIGRATE_FILE_FAILED,
+               "%s: failed to do a stat on %s", loc->path, from->name);
+
+        if (-ret != ENOENT) {
+            *fop_errno = -ret;
+            ret = -1;
+            goto metaunlock;
+        }
+
+        rcvd_enoent_from_src = 1;
+    }
+
+    if ((gf_uuid_compare(empty_iatt.ia_gfid, loc->gfid) == 0) &&
+        (!rcvd_enoent_from_src) && delete_src_linkto) {
+        /* take out the source from namespace */
+        ret = syncop_unlink(from, loc, NULL, NULL);
+        if (ret) {
+            gf_msg(this->name, GF_LOG_WARNING, -ret,
+                   DHT_MSG_MIGRATE_FILE_FAILED,
+                   "%s: failed to perform unlink on %s", loc->path, from->name);
+            *fop_errno = -ret;
+            ret = -1;
+            goto metaunlock;
+        }
+    }
+
+    ret = syncop_lookup(this, loc, NULL, NULL, NULL, NULL);
+    if (ret) {
+        gf_msg_debug(this->name, -ret,
+                     "%s: failed to lookup the file on subvolumes", loc->path);
+        *fop_errno = -ret;
+    }
+
+    gf_msg(this->name, log_level, 0, DHT_MSG_MIGRATE_FILE_COMPLETE,
+           "completed migration of %s from subvolume %s to %s", loc->path,
+           from->name, to->name);
+
+    ret = 0;
+
+metaunlock:
+
+    if (conf->lock_migration_enabled && meta_locked) {
+        dict_del(meta_dict, GF_META_LOCK_KEY);
+
+        ret = dict_set_int32(meta_dict, GF_META_UNLOCK_KEY, 1);
+        if (ret) {
+            gf_msg(this->name, GF_LOG_ERROR, 0, DHT_MSG_MIGRATE_FILE_FAILED,
+                   "Trace dict_set failed");
+
+            *fop_errno = ENOMEM;
+            ret = -1;
+            goto out;
+        }
+
+        if (clean_dst == _gf_false)
+            ret = dict_set_int32(meta_dict, "status", 1);
+        else
+            ret = dict_set_int32(meta_dict, "status", 0);
+
+        if (ret) {
+            gf_msg(this->name, GF_LOG_ERROR, 0, DHT_MSG_MIGRATE_FILE_FAILED,
+                   "Trace dict_set failed");
+
+            *fop_errno = ENOMEM;
+            ret = -1;
+            goto out;
+        }
+
+        ret = syncop_setxattr(from, loc, meta_dict, 0, NULL, NULL);
+        if (ret) {
+            gf_msg(this->name, GF_LOG_ERROR, -ret, DHT_MSG_MIGRATE_FILE_FAILED,
+                   "Trace syncop_setxattr meta unlock failed");
+
+            *fop_errno = -ret;
+            ret = -1;
+            goto out;
+        }
+    }
+
+out:
+    if (clean_src) {
+        /* Revert source mode and xattr changes*/
+        lk_ret = __dht_migration_cleanup_src_file(this, loc, src_fd, from,
+                                                  &src_ia_prot);
+        if (lk_ret) {
+            gf_msg(this->name, GF_LOG_WARNING, 0, DHT_MSG_MIGRATE_FILE_FAILED,
+                   "%s: failed to cleanup source file on %s", loc->path,
+                   from->name);
+        }
+    }
+
+    /* reset the destination back to 0 */
+    if (clean_dst) {
+        lk_ret = syncop_ftruncate(to, dst_fd, 0, NULL, NULL, NULL, NULL);
+        if (lk_ret) {
+            gf_msg(this->name, GF_LOG_ERROR, -lk_ret,
+                   DHT_MSG_MIGRATE_FILE_FAILED,
+                   "Migrate file failed: "
+                   "%s: failed to reset target size back to 0",
+                   loc->path);
+        }
+    }
+
+    if (inodelk_locked) {
+        flock.l_type = F_UNLCK;
+
+        lk_ret = syncop_inodelk(from, DHT_FILE_MIGRATE_DOMAIN, &tmp_loc,
+                                F_SETLK, &flock, NULL, NULL);
+        if (lk_ret < 0) {
+            gf_msg(this->name, GF_LOG_WARNING, -lk_ret,
+                   DHT_MSG_MIGRATE_FILE_FAILED,
+                   "%s: failed to unlock file on %s", loc->path, from->name);
+        }
+    }
+
+    if (entrylk_locked) {
+        lk_ret = syncop_entrylk(hashed_subvol, DHT_ENTRY_SYNC_DOMAIN,
+                                &parent_loc, loc->name, ENTRYLK_UNLOCK,
+                                ENTRYLK_UNLOCK, NULL, NULL);
+        if (lk_ret < 0) {
+            gf_msg(this->name, GF_LOG_WARNING, -lk_ret,
+                   DHT_MSG_MIGRATE_FILE_FAILED,
+                   "%s: failed to unlock entrylk on %s", loc->path,
+                   hashed_subvol->name);
+        }
+    }
+
+    if (p_locked) {
+        plock.l_type = F_UNLCK;
+        lk_ret = syncop_lk(from, src_fd, F_SETLK, &plock, NULL, NULL);
+
+        if (lk_ret < 0) {
+            gf_msg(this->name, GF_LOG_WARNING, -lk_ret,
+                   DHT_MSG_MIGRATE_FILE_FAILED,
+                   "%s: failed to unlock file on %s", loc->path, from->name);
+        }
+    }
+
+    lk_ret = syncop_removexattr(to, loc, GF_PROTECT_FROM_EXTERNAL_WRITES, NULL,
+                                NULL);
+    if (lk_ret && (lk_ret != -ENODATA) && (lk_ret != -ENOATTR)) {
+        gf_msg(this->name, GF_LOG_WARNING, -lk_ret, 0,
+               "%s: removexattr failed key %s", loc->path,
+               GF_PROTECT_FROM_EXTERNAL_WRITES);
+    }
+
+    if (dict)
+        dict_unref(dict);
+
+    if (xattr)
+        dict_unref(xattr);
+    if (xattr_rsp)
+        dict_unref(xattr_rsp);
+
+    if (dst_fd)
+        syncop_close(dst_fd);
+
+    if (src_fd)
+        syncop_close(src_fd);
+    if (linkto_fd)
+        syncop_close(linkto_fd);
+
+    if (xdata)
+        dict_unref(xdata);
+
+    loc_wipe(&tmp_loc);
+    loc_wipe(&parent_loc);
+
+    return ret;
+}
+
+/* Synctask body: migrate the file described by the frame's dht_local_t.
+ * 'data' is the call frame handed to synctask_new(). The value returned
+ * by dht_migrate_file() is returned unchanged. */
+static int
+rebalance_task(void *data)
+{
+    call_frame_t *task_frame = data;
+    dht_local_t *ctx = task_frame->local;
+    int migrate_errno = 0;
+
+    /* dht_migrate_file() is synchronous: once it returns, the task is
+       finished. The errno it reports is not propagated from here. */
+    return dht_migrate_file(THIS, &ctx->loc, ctx->rebalance.from_subvol,
+                            ctx->rebalance.target_node, ctx->flags,
+                            &migrate_errno);
+}
+
+/* Synctask completion callback: translate the task's return value into an
+ * (op_ret, op_errno) pair and unwind the pending setxattr with it.
+ * Always returns 0. */
+static int
+rebalance_task_completion(int op_ret, call_frame_t *sync_frame, void *data)
+{
+    int32_t op_errno = EINVAL;
+
+    switch (op_ret) {
+        case 0:
+            /* Success: op_ret stays 0, op_errno keeps its initial value. */
+            break;
+
+        case -1:
+            /* Migration failed, most likely while writing. The exact errno
+               is not available here, so report it as lack of space. */
+            op_errno = ENOSPC;
+            break;
+
+        case 1:
+            /* Migration was not attempted; that is not a failure of the
+               process, so tell the user it was not permitted. */
+            op_ret = -1;
+            op_errno = EPERM;
+            break;
+
+        default:
+            /* Any other value is treated as a negated errno. */
+            op_errno = -op_ret;
+            op_ret = -1;
+            break;
+    }
+
+    DHT_STACK_UNWIND(setxattr, sync_frame, op_ret, op_errno, NULL);
+    return 0;
+}
+
+/* Schedule rebalance_task() as a synctask on this xlator's sync environment.
+ * 'frame' is passed both as the synctask frame and as the opaque data handed
+ * to the task. Returns whatever synctask_new() returns. */
+int
+dht_start_rebalance_task(xlator_t *this, call_frame_t *frame)
+{
+    return synctask_new(this->ctx->env, rebalance_task,
+                        rebalance_task_completion, frame, frame);
+}
+
+/* Remove the listener socket file named in the process command-line args.
+ * Returns 0 when no socket file is configured, when the unlink succeeds, or
+ * when the file is already gone (ENOENT); otherwise logs the failure and
+ * returns the sys_unlink() result. */
+int
+gf_listener_stop(xlator_t *this)
+{
+    glusterfs_ctx_t *ctx = this->ctx;
+    cmd_args_t *args = NULL;
+    int ret = 0;
+
+    GF_ASSERT(ctx);
+    args = &ctx->cmd_args;
+
+    /* Nothing to clean up if no socket path was configured. */
+    if (!args->sock_file)
+        return 0;
+
+    ret = sys_unlink(args->sock_file);
+    /* A missing socket file is as good as a removed one. */
+    if (!ret || (ENOENT == errno))
+        return 0;
+
+    gf_msg(this->name, GF_LOG_ERROR, errno, DHT_MSG_SOCKET_ERROR,
+           "Failed to unlink listener "
+           "socket %s",
+           args->sock_file);
+    return ret;
+}
+
+/* Create a fresh inode table for 'this' and store in *inode the result of
+ * looking up the root gfid (00..01) in it. If the table cannot be created,
+ * *inode is left untouched. */
+void
+dht_build_root_inode(xlator_t *this, inode_t **inode)
+{
+    static uuid_t root_gfid = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1};
+    inode_table_t *table = inode_table_new(0, this);
+
+    if (table)
+        *inode = inode_find(table, root_gfid);
+}
+
+/* Fill 'loc' so that it describes the volume root: path "/", the supplied
+ * inode (marked as a directory) and the root gfid (all zero except the last
+ * byte, which is 1). */
+void
+dht_build_root_loc(inode_t *inode, loc_t *loc)
+{
+    inode->ia_type = IA_IFDIR;
+
+    loc->inode = inode;
+    loc->path = "/";
+
+    memset(loc->gfid, 0, 16);
+    loc->gfid[15] = 1;
+}
+
+/* Classify a file-migration failure.
+ * return values:  1 -> error, but ignore it and continue
+ *                -1 -> error, handle it (rebalance has been marked failed)
+ * This implementation never returns 0. */
+int32_t
+gf_defrag_handle_migrate_error(int32_t op_errno, gf_defrag_info_t *defrag)
+{
+    /* Anything other than ENOTCONN does not stop the rebalance process. */
+    if (op_errno != ENOTCONN)
+        return 1;
+
+    /* ENOTCONN: most probably the mount point went missing (mostly due to
+       a brick being down). Report the rebalance as failed and let the user
+       restart it once everything is fine. */
+    defrag->defrag_status = GF_DEFRAG_STATUS_FAILED;
+    return -1;
+}
+
+/* Return _gf_true when 'name' matches one of the configured defrag patterns
+ * (first match wins) and 'size' is at least that pattern's size threshold.
+ * Returns _gf_false otherwise, including when 'defrag' is NULL. */
+static gf_boolean_t
+gf_defrag_pattern_match(gf_defrag_info_t *defrag, char *name, uint64_t size)
+{
+    gf_defrag_pattern_list_t *pat = NULL;
+    gf_boolean_t matched = _gf_false;
+
+    GF_VALIDATE_OR_GOTO("dht", defrag, out);
+
+    /* Stop at the first pattern that matches; 'pat' is NULL if none does. */
+    for (pat = defrag->defrag_pattern; pat; pat = pat->next) {
+        if (fnmatch(pat->path_pattern, name, FNM_NOESCAPE) == 0)
+            break;
+    }
+
+    if (pat && (size >= pat->size))
+        matched = _gf_true;
+
+out:
+    return matched;
+}
+
+/* Return 1 when every one of the first 'cnt' offset contexts has its
+ * readdir_done flag set (also when cnt is not positive), 0 as soon as one
+ * is found that has not finished. */
+int
+dht_dfreaddirp_done(dht_dfoffset_ctx_t *offset_var, int cnt)
+{
+    int idx = 0;
+
+    while (idx < cnt) {
+        if (offset_var[idx].readdir_done == 0)
+            return 0;
+        idx++;
+    }
+
+    return 1;
+}
+
+/* Initialise one offset context per local subvolume: bind it to the
+ * subvolume, reset its readdir offset to 0 and clear its readdir_done flag.
+ * Returns 0 on success, -1 when the xlator has no private configuration. */
+static int
+gf_defrag_ctx_subvols_init(dht_dfoffset_ctx_t *offset_var, xlator_t *this)
+{
+    dht_conf_t *conf = this->private;
+    dht_dfoffset_ctx_t *slot = NULL;
+    int idx = 0;
+
+    if (!conf)
+        return -1;
+
+    for (idx = 0; idx < conf->local_subvols_cnt; idx++) {
+        slot = &offset_var[idx];
+
+        slot->this = conf->local_subvols[idx];
+        slot->offset = (off_t)0;
+        slot->readdir_done = 0;
+    }
+
+    return 0;
+}
+
+/* Return the index of the first element of 'entry' whose uuid is not null,
+ * or -1 when there is no such element (all uuids are null, or the entry
+ * holds no elements at all). */
+static int
+dht_get_first_non_null_index(subvol_nodeuuids_info_t *entry)
+{
+    int i = 0;
+
+    for (i = 0; i < entry->count; i++) {
+        if (!gf_uuid_is_null(entry->elements[i].uuid))
+            return i;
+    }
+
+    /* Falling out of the loop always means "nothing found". Returning -1
+       unconditionally avoids handing back index 0 as if it were valid
+       when count is not a positive number. */
+    return -1;
+}
+
+/* Decide whether this node is the one that migrates the file 'gfid' found
+ * on the local subvolume at 'local_subvol_index'.
+ *
+ * Return value
+ * _gf_false : this node does not migrate the file
+ * _gf_true  : this node migrates the file
+ *
+ * The hash of the gfid selects the responsible node. Using the gfid instead
+ * of the name also ensures that the same node handles all hardlinks.
+ */
+
+gf_boolean_t
+gf_defrag_should_i_migrate(xlator_t *this, int local_subvol_index, uuid_t gfid)
+{
+    dht_conf_t *conf = this->private;
+    subvol_nodeuuids_info_t *nodeuuids = NULL;
+    char gfid_str[UUID_CANONICAL_FORM_LEN + 1] = {
+        0,
+    };
+    uint32_t hash = 0;
+    int32_t slot = 0;
+
+    nodeuuids = &(conf->local_nodeuuids[local_subvol_index]);
+
+    /* Pure distribute: the subvol is handled by exactly one node. */
+    if (nodeuuids->count == 1)
+        return _gf_true;
+
+    if (dht_hash_compute(this, 0, uuid_utoa_r(gfid, gfid_str), &hash) != 0)
+        return _gf_false;
+
+    /* NOTE(review): a count of 0 would make this modulo divide by zero;
+       presumably count is always >= 1 here - confirm against the code
+       that fills local_nodeuuids. */
+    slot = (hash % nodeuuids->count);
+    if (nodeuuids->elements[slot].info == REBAL_NODEUUID_MINE) {
+        /* The hashed slot holds this node's nodeuuid. */
+        return _gf_true;
+    }
+
+    /* The hashed slot belongs to another node that is up: it migrates. */
+    if (!gf_uuid_is_null(nodeuuids->elements[slot].uuid))
+        return _gf_false;
+
+    /* Brick down - some other node has to migrate these files.
+       Fall back to the first non-null index. */
+    slot = dht_get_first_non_null_index(nodeuuids);
+    if (slot == -1) {
+        /* None of the bricks in the subvol are up.
+         * CHILD_DOWN will kill the process soon */
+        return _gf_false;
+    }
+
+    return (nodeuuids->elements[slot].info == REBAL_NODEUUID_MINE) ? _gf_true
+                                                                   : _gf_false;
+}
+
+/*
+ * Migrate one queued directory entry.
+ *
+ * 'opaque' is a struct dht_container describing the entry (its dirent, the
+ * parent loc, the migrate_data dict and the owning xlator).
+ *
+ * Returns 0 when the file was migrated, skipped, or hit a failure that does
+ * not stop the rebalance (such failures are only counted in the defrag
+ * statistics). Returns -1 when the container is NULL, when defrag is no
+ * longer in the STARTED state, when the statfs frame cannot be created, or
+ * when gf_defrag_handle_migrate_error() classifies the failure as fatal.
+ *
+ * On exit the size of the entry is added to defrag->size_processed.
+ */
+int
+gf_defrag_migrate_single_file(void *opaque)
+{
+    xlator_t *this = NULL;
+    dht_conf_t *conf = NULL;
+    gf_defrag_info_t *defrag = NULL;
+    int ret = 0;
+    gf_dirent_t *entry = NULL;
+    struct timeval start = {
+        0,
+    };
+    loc_t entry_loc = {
+        0,
+    };
+    loc_t *loc = NULL;
+    struct iatt iatt = {
+        0,
+    };
+    dict_t *migrate_data = NULL;
+    struct timeval end = {
+        0,
+    };
+    double elapsed = {
+        0,
+    };
+    struct dht_container *rebal_entry = NULL;
+    inode_t *inode = NULL;
+    xlator_t *hashed_subvol = NULL;
+    xlator_t *cached_subvol = NULL;
+    call_frame_t *statfs_frame = NULL;
+    xlator_t *old_THIS = NULL;
+    data_t *tmp = NULL;
+    int fop_errno = 0;
+    gf_dht_migrate_data_type_t rebal_type = GF_DHT_MIGRATE_DATA;
+    char value[MAX_REBAL_TYPE_SIZE] = {
+        0,
+    };
+    size_t copy_len = 0;
+    struct iatt *iatt_ptr = NULL;
+    gf_boolean_t update_skippedcount = _gf_true;
+    int i = 0;
+    gf_boolean_t should_i_migrate = 0;
+
+    rebal_entry = (struct dht_container *)opaque;
+    if (!rebal_entry) {
+        gf_log("DHT", GF_LOG_ERROR, "rebal_entry is NULL");
+        ret = -1;
+        goto out;
+    }
+
+    this = rebal_entry->this;
+
+    conf = this->private;
+
+    defrag = conf->defrag;
+
+    loc = rebal_entry->parent_loc;
+
+    migrate_data = rebal_entry->migrate_data;
+
+    entry = rebal_entry->df_entry;
+    /* Until the lookup succeeds, account the size reported by readdirp. */
+    iatt_ptr = &entry->d_stat;
+
+    if (defrag->defrag_status != GF_DEFRAG_STATUS_STARTED) {
+        ret = -1;
+        goto out;
+    }
+
+    if (defrag->stats == _gf_true) {
+        gettimeofday(&start, NULL);
+    }
+
+    if (defrag->defrag_pattern &&
+        (gf_defrag_pattern_match(defrag, entry->d_name,
+                                 entry->d_stat.ia_size) == _gf_false)) {
+        gf_log(this->name, GF_LOG_ERROR, "pattern_match failed");
+        goto out;
+    }
+
+    memset(&entry_loc, 0, sizeof(entry_loc));
+
+    ret = dht_build_child_loc(this, &entry_loc, loc, entry->d_name);
+    if (ret) {
+        LOCK(&defrag->lock);
+        {
+            defrag->total_failures += 1;
+        }
+        UNLOCK(&defrag->lock);
+
+        ret = 0;
+
+        gf_log(this->name, GF_LOG_ERROR, "Child loc build failed");
+
+        goto out;
+    }
+
+    should_i_migrate = gf_defrag_should_i_migrate(
+        this, rebal_entry->local_subvol_index, entry->d_stat.ia_gfid);
+
+    gf_uuid_copy(entry_loc.gfid, entry->d_stat.ia_gfid);
+
+    gf_uuid_copy(entry_loc.pargfid, loc->gfid);
+
+    ret = syncop_lookup(this, &entry_loc, &iatt, NULL, NULL, NULL);
+
+    if (!should_i_migrate) {
+        /* this node isn't supposed to migrate the file. suppressing any
+         * potential error from lookup as this file is under migration by
+         * another node */
+        if (ret) {
+            gf_msg_debug(this->name, -ret,
+                         "Ignoring lookup failure: node isn't migrating %s",
+                         entry_loc.path);
+            ret = 0;
+        }
+        gf_msg_debug(this->name, 0, "Don't migrate %s ", entry_loc.path);
+        goto out;
+    }
+
+    if (ret) {
+        gf_msg(this->name, GF_LOG_ERROR, -ret, DHT_MSG_MIGRATE_FILE_FAILED,
+               "Migrate file failed: %s lookup failed", entry_loc.path);
+
+        /* Increase failure count only for remove-brick op, so that
+         * user is warned to check the removed-brick for any files left
+         * unmigrated
+         */
+        if (conf->decommission_subvols_cnt) {
+            LOCK(&defrag->lock);
+            {
+                defrag->total_failures += 1;
+            }
+            UNLOCK(&defrag->lock);
+        }
+
+        ret = 0;
+        goto out;
+    }
+
+    /* The lookup succeeded: account the freshly looked-up size instead. */
+    iatt_ptr = &iatt;
+
+    hashed_subvol = dht_subvol_get_hashed(this, &entry_loc);
+    if (!hashed_subvol) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, DHT_MSG_HASHED_SUBVOL_GET_FAILED,
+               "Failed to get hashed subvol for %s", entry_loc.path);
+        ret = 0;
+        goto out;
+    }
+
+    cached_subvol = dht_subvol_get_cached(this, entry_loc.inode);
+    if (!cached_subvol) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, DHT_MSG_CACHED_SUBVOL_GET_FAILED,
+               "Failed to get cached subvol for %s", entry_loc.path);
+
+        ret = 0;
+        goto out;
+    }
+
+    /* Already on the subvolume it hashes to: nothing to move. */
+    if (hashed_subvol == cached_subvol) {
+        ret = 0;
+        goto out;
+    }
+
+    inode = inode_link(entry_loc.inode, entry_loc.parent, entry->d_name, &iatt);
+    inode_unref(entry_loc.inode);
+    /* use the inode returned by inode_link */
+    entry_loc.inode = inode;
+
+    old_THIS = THIS;
+    THIS = this;
+    statfs_frame = create_frame(this, this->ctx->pool);
+    if (!statfs_frame) {
+        /* gf_msg() takes the errno before the message id. */
+        gf_msg(this->name, GF_LOG_ERROR, ENOMEM, DHT_MSG_NO_MEMORY,
+               "Insufficient memory. Frame creation failed");
+        /* Do not leave THIS switched on the failure path. */
+        THIS = old_THIS;
+        ret = -1;
+        goto out;
+    }
+
+    /* async statfs information for honoring min-free-disk */
+    dht_get_du_info(statfs_frame, this, loc);
+    THIS = old_THIS;
+
+    tmp = dict_get(migrate_data, GF_XATTR_FILE_MIGRATE_KEY);
+    if (tmp) {
+        /* Never copy more than the buffer holds, and always leave room for
+           the terminating NUL (value is zero-initialised), so the strcmp()
+           below cannot read past the end of the buffer. */
+        if (tmp->data && (tmp->len > 0)) {
+            copy_len = (size_t)tmp->len;
+            if (copy_len > sizeof(value) - 1)
+                copy_len = sizeof(value) - 1;
+            memcpy(value, tmp->data, copy_len);
+        }
+
+        if (strcmp(value, "force") == 0)
+            rebal_type = GF_DHT_MIGRATE_DATA_EVEN_IF_LINK_EXISTS;
+
+        if (conf->decommission_in_progress)
+            rebal_type = GF_DHT_MIGRATE_HARDLINK;
+    }
+
+    ret = dht_migrate_file(this, &entry_loc, cached_subvol, hashed_subvol,
+                           rebal_type, &fop_errno);
+    if (ret == 1) {
+        /* Migration was skipped rather than failed. */
+        if (fop_errno == ENOSPC) {
+            gf_msg_debug(this->name, 0,
+                         "migrate-data skipped for"
+                         " %s due to space constraints",
+                         entry_loc.path);
+
+            /* For remove-brick case if the source is not one of the
+             * removed-brick, do not mark the error as failure */
+            if (conf->decommission_subvols_cnt) {
+                for (i = 0; i < conf->subvolume_cnt; i++) {
+                    if (conf->decommissioned_bricks[i] == cached_subvol) {
+                        LOCK(&defrag->lock);
+                        {
+                            defrag->total_failures += 1;
+                            update_skippedcount = _gf_false;
+                        }
+                        UNLOCK(&defrag->lock);
+
+                        break;
+                    }
+                }
+            }
+
+            if (update_skippedcount) {
+                LOCK(&defrag->lock);
+                {
+                    defrag->skipped += 1;
+                }
+                UNLOCK(&defrag->lock);
+
+                gf_msg(this->name, GF_LOG_INFO, 0, DHT_MSG_MIGRATE_FILE_SKIPPED,
+                       "File migration skipped for %s.", entry_loc.path);
+            }
+
+        } else if (fop_errno == ENOTSUP) {
+            gf_msg_debug(this->name, 0,
+                         "migrate-data skipped for"
+                         " hardlink %s ",
+                         entry_loc.path);
+            LOCK(&defrag->lock);
+            {
+                defrag->skipped += 1;
+            }
+            UNLOCK(&defrag->lock);
+
+            gf_msg(this->name, GF_LOG_INFO, 0, DHT_MSG_MIGRATE_FILE_SKIPPED,
+                   "File migration skipped for %s.", entry_loc.path);
+        }
+
+        ret = 0;
+        goto out;
+    } else if (ret < 0) {
+        /* EEXIST is neither logged nor counted as a failure. */
+        if (fop_errno != EEXIST) {
+            gf_msg(this->name, GF_LOG_ERROR, fop_errno,
+                   DHT_MSG_MIGRATE_FILE_FAILED, "migrate-data failed for %s",
+                   entry_loc.path);
+
+            LOCK(&defrag->lock);
+            {
+                defrag->total_failures += 1;
+            }
+            UNLOCK(&defrag->lock);
+        }
+
+        ret = gf_defrag_handle_migrate_error(fop_errno, defrag);
+
+        if (!ret) {
+            gf_msg(this->name, GF_LOG_ERROR, fop_errno,
+                   DHT_MSG_MIGRATE_FILE_FAILED,
+                   "migrate-data on %s failed:", entry_loc.path);
+        } else if (ret == 1) {
+            /* Non-fatal: keep the rebalance going. */
+            ret = 0;
+        }
+
+        goto out;
+    }
+
+    LOCK(&defrag->lock);
+    {
+        defrag->total_files += 1;
+        defrag->total_data += iatt.ia_size;
+    }
+    UNLOCK(&defrag->lock);
+
+    if (defrag->stats == _gf_true) {
+        gettimeofday(&end, NULL);
+        elapsed = gf_tvdiff(&start, &end);
+        gf_log(this->name, GF_LOG_INFO,
+               "Migration of "
+               "file:%s size:%" PRIu64
+               " bytes took %.2f"
+               "secs and ret: %d",
+               entry_loc.name, iatt.ia_size, elapsed / 1e6, ret);
+    }
+
+out:
+    if (statfs_frame) {
+        STACK_DESTROY(statfs_frame->root);
+    }
+
+    /* iatt_ptr is only set once defrag is known to be valid. */
+    if (iatt_ptr) {
+        LOCK(&defrag->lock);
+        {
+            defrag->size_processed += iatt_ptr->ia_size;
+        }
+        UNLOCK(&defrag->lock);
+    }
+    loc_wipe(&entry_loc);
+
+    return ret;
+}
+
+/*
+ * Migrator thread body. 'opaque' is the gf_defrag_info_t shared by the
+ * crawler and all migrator threads.
+ *
+ * The thread repeatedly dequeues one container from defrag->queue under
+ * dfq_mutex and migrates it with gf_defrag_migrate_single_file(). It exits
+ * when defrag is no longer in the STARTED state, when a migration reports a
+ * critical error, or when the crawl is done and the queue is empty.
+ * Always returns NULL.
+ */
+void *
+gf_defrag_task(void *opaque)
+{
+    struct list_head *q_head = NULL;
+    struct dht_container *iterator = NULL;
+    gf_defrag_info_t *defrag = NULL;
+    int ret = 0;
+    pid_t pid = GF_CLIENT_PID_DEFRAG;
+
+    defrag = (gf_defrag_info_t *)opaque;
+    if (!defrag) {
+        gf_msg("dht", GF_LOG_ERROR, 0, 0, "defrag is NULL");
+        goto out;
+    }
+
+    syncopctx_setfspid(&pid);
+
+    q_head = &(defrag->queue[0].list);
+
+    /* The following while loop will dequeue one entry from the defrag->queue
+       under lock. We will update the defrag->global_error only when there
+       is an error which is critical to stop the rebalance process. The stop
+       message will be intimated to other migrator threads by setting the
+       defrag->defrag_status to GF_DEFRAG_STATUS_FAILED.
+
+       In defrag->queue, a low watermark (MIN_MIGRATE_QUEUE_COUNT) is
+       maintained so that crawler does not starve the file migration
+       workers and a high watermark (MAX_MIGRATE_QUEUE_COUNT) so that
+       crawler does not go far ahead in filling up the queue.
+     */
+
+    while (_gf_true) {
+        if (defrag->defrag_status != GF_DEFRAG_STATUS_STARTED) {
+            /* Wake everybody up so that they notice the status change. */
+            pthread_cond_broadcast(&defrag->rebalance_crawler_alarm);
+            pthread_cond_broadcast(&defrag->parallel_migration_cond);
+            goto out;
+        }
+
+        pthread_mutex_lock(&defrag->dfq_mutex);
+        {
+            /*Throttle down:
+              If the reconfigured count is less than current thread
+              count, then the current thread will sleep */
+
+            /*TODO: Need to refactor the following block to work
+             *under defrag->lock. For now access
+             * defrag->current_thread_count and rthcount under
+             * dfq_mutex lock */
+            while (!defrag->crawl_done && (defrag->recon_thread_count <
+                                           defrag->current_thread_count)) {
+                defrag->current_thread_count--;
+                gf_msg_debug("DHT", 0,
+                             "Thread sleeping. "
+                             "current thread count: %d",
+                             defrag->current_thread_count);
+
+                pthread_cond_wait(&defrag->df_wakeup_thread,
+                                  &defrag->dfq_mutex);
+
+                defrag->current_thread_count++;
+                gf_msg_debug("DHT", 0,
+                             "Thread wokeup. "
+                             "current thread count: %d",
+                             defrag->current_thread_count);
+            }
+
+            if (defrag->q_entry_count) {
+                iterator = list_entry(q_head->next, typeof(*iterator), list);
+
+                gf_msg_debug("DHT", 0,
+                             "picking entry "
+                             "%s",
+                             iterator->df_entry->d_name);
+
+                list_del_init(&(iterator->list));
+
+                defrag->q_entry_count--;
+
+                /* Queue fell below the low watermark: let the crawler
+                   refill it if it asked to be woken up. */
+                if ((defrag->q_entry_count < MIN_MIGRATE_QUEUE_COUNT) &&
+                    defrag->wakeup_crawler) {
+                    pthread_cond_broadcast(&defrag->rebalance_crawler_alarm);
+                }
+                /* The migration itself runs without the queue mutex. */
+                pthread_mutex_unlock(&defrag->dfq_mutex);
+                ret = gf_defrag_migrate_single_file((void *)iterator);
+
+                /*Critical errors: ENOTCONN and ENOSPACE*/
+                if (ret) {
+                    dht_set_global_defrag_error(defrag, ret);
+
+                    defrag->defrag_status = GF_DEFRAG_STATUS_FAILED;
+
+                    pthread_cond_broadcast(&defrag->rebalance_crawler_alarm);
+
+                    pthread_cond_broadcast(&defrag->parallel_migration_cond);
+
+                    /* The container was already unlinked from the queue,
+                       so this thread owns it: release it here too,
+                       otherwise it leaks on the error path. */
+                    gf_defrag_free_container(iterator);
+
+                    goto out;
+                }
+
+                gf_defrag_free_container(iterator);
+
+                continue;
+            } else {
+                /* defrag->crawl_done flag is set means crawling
+                 file system is done and hence a list_empty when
+                 the above flag is set indicates there are no more
+                 entries to be added to the queue and rebalance is
+                 finished */
+
+                if (!defrag->crawl_done) {
+                    defrag->current_thread_count--;
+                    gf_msg_debug("DHT", 0,
+                                 "Thread "
+                                 "sleeping while  waiting "
+                                 "for migration entries. "
+                                 "current thread  count:%d",
+                                 defrag->current_thread_count);
+
+                    pthread_cond_wait(&defrag->parallel_migration_cond,
+                                      &defrag->dfq_mutex);
+                }
+
+                if (defrag->crawl_done && !defrag->q_entry_count) {
+                    defrag->current_thread_count++;
+                    gf_msg_debug("DHT", 0, "Exiting thread");
+
+                    pthread_cond_broadcast(&defrag->parallel_migration_cond);
+                    goto unlock;
+                } else {
+                    defrag->current_thread_count++;
+                    gf_msg_debug("DHT", 0,
+                                 "Thread woke up"
+                                 " as found migrating entries. "
+                                 "current thread count:%d",
+                                 defrag->current_thread_count);
+
+                    pthread_mutex_unlock(&defrag->dfq_mutex);
+                    continue;
+                }
+            }
+        }
+    unlock:
+        pthread_mutex_unlock(&defrag->dfq_mutex);
+        break;
+    }
+out:
+    return NULL;
+}
+
+/*
+ * Produce the next entry of local subvolume 'i' that is a candidate for
+ * migration, wrapped in a newly allocated dht_container.
+ *
+ * A batch of entries is read with syncop_readdirp() into
+ * dir_dfmeta->equeue[i] whenever dir_dfmeta->fetch_entries[i] is set, and is
+ * then walked through dir_dfmeta->iterator[i] across successive calls.
+ * ".", "..", directories, entries rejected by the defrag pattern and
+ * linkto files are skipped.
+ *
+ * Returns 0 on success; *container is then the new container, or NULL when
+ * this call produced no entry (readdir finished for this subvolume, 'fd' is
+ * NULL, or the current batch is exhausted and a new one must be fetched on
+ * the next call). Returns -1 when defrag is no longer in the STARTED state,
+ * when readdirp fails (*perrno is set to the errno) or when an allocation or
+ * loc_copy fails (*perrno is not touched); *container is not written then.
+ *
+ * NOTE(review): defrag->size_processed and defrag->num_files_lookedup are
+ * updated here without taking defrag->lock, whereas
+ * gf_defrag_migrate_single_file() updates size_processed under that lock -
+ * confirm whether the caller provides the required serialization.
+ */
+int static gf_defrag_get_entry(xlator_t *this, int i,
+                               struct dht_container **container, loc_t *loc,
+                               dht_conf_t *conf, gf_defrag_info_t *defrag,
+                               fd_t *fd, dict_t *migrate_data,
+                               struct dir_dfmeta *dir_dfmeta, dict_t *xattr_req,
+                               int *perrno)
+{
+    int ret = 0;
+    char is_linkfile = 0;
+    gf_dirent_t *df_entry = NULL;
+    struct dht_container *tmp_container = NULL;
+
+    if (defrag->defrag_status != GF_DEFRAG_STATUS_STARTED) {
+        ret = -1;
+        goto out;
+    }
+
+    /* This subvolume has already been read to the end. */
+    if (dir_dfmeta->offset_var[i].readdir_done == 1) {
+        ret = 0;
+        goto out;
+    }
+
+    /* The previous batch is used up (or none was read yet): fetch one. */
+    if (dir_dfmeta->fetch_entries[i] == 1) {
+        if (!fd) {
+            /* Without an fd nothing can be read: mark the subvol done. */
+            dir_dfmeta->fetch_entries[i] = 0;
+            dir_dfmeta->offset_var[i].readdir_done = 1;
+            ret = 0;
+            goto out;
+        }
+
+        ret = syncop_readdirp(conf->local_subvols[i], fd, 131072,
+                              dir_dfmeta->offset_var[i].offset,
+                              &(dir_dfmeta->equeue[i]), xattr_req, NULL);
+        /* A return of 0 is treated as end of directory. */
+        if (ret == 0) {
+            dir_dfmeta->offset_var[i].readdir_done = 1;
+            ret = 0;
+            goto out;
+        }
+
+        if (ret < 0) {
+            gf_msg(this->name, GF_LOG_WARNING, -ret,
+                   DHT_MSG_MIGRATE_DATA_FAILED,
+                   "Readdirp failed. Aborting data migration for "
+                   "directory: %s",
+                   loc->path);
+            *perrno = -ret;
+            ret = -1;
+            goto out;
+        }
+
+        if (list_empty(&(dir_dfmeta->equeue[i].list))) {
+            dir_dfmeta->offset_var[i].readdir_done = 1;
+            ret = 0;
+            goto out;
+        }
+
+        dir_dfmeta->fetch_entries[i] = 0;
+    }
+
+    /* Walk the current batch until a migratable entry is found. Every exit
+       from this loop sets 'ret' explicitly. */
+    while (1) {
+        if (defrag->defrag_status != GF_DEFRAG_STATUS_STARTED) {
+            ret = -1;
+            goto out;
+        }
+
+        df_entry = list_entry(dir_dfmeta->iterator[i]->next, typeof(*df_entry),
+                              list);
+
+        /* Wrapped around to the list head: the batch is exhausted. Free it
+           and request a new readdirp on the next call. */
+        if (&df_entry->list == dir_dfmeta->head[i]) {
+            gf_dirent_free(&(dir_dfmeta->equeue[i]));
+            INIT_LIST_HEAD(&(dir_dfmeta->equeue[i].list));
+            dir_dfmeta->fetch_entries[i] = 1;
+            dir_dfmeta->iterator[i] = dir_dfmeta->head[i];
+            ret = 0;
+            goto out;
+        }
+
+        dir_dfmeta->iterator[i] = dir_dfmeta->iterator[i]->next;
+
+        /* Remember where to resume the next readdirp from. */
+        dir_dfmeta->offset_var[i].offset = df_entry->d_off;
+        if (!strcmp(df_entry->d_name, ".") || !strcmp(df_entry->d_name, ".."))
+            continue;
+
+        /* Directories are not queued; only their size is accounted. */
+        if (IA_ISDIR(df_entry->d_stat.ia_type)) {
+            defrag->size_processed += df_entry->d_stat.ia_size;
+            continue;
+        }
+
+        defrag->num_files_lookedup++;
+
+        /* Entries rejected by the pattern are accounted as processed. */
+        if (defrag->defrag_pattern &&
+            (gf_defrag_pattern_match(defrag, df_entry->d_name,
+                                     df_entry->d_stat.ia_size) == _gf_false)) {
+            defrag->size_processed += df_entry->d_stat.ia_size;
+            continue;
+        }
+
+        is_linkfile = check_is_linkfile(NULL, &df_entry->d_stat, df_entry->dict,
+                                        conf->link_xattr_name);
+
+        if (is_linkfile) {
+            /* No need to add linkto file to the queue for
+               migration. Only the actual data file need to
+               be checked for migration criteria.
+            */
+
+            gf_msg_debug(this->name, 0,
+                         "Skipping linkfile"
+                         " %s on subvol: %s",
+                         df_entry->d_name, conf->local_subvols[i]->name);
+            continue;
+        }
+
+        /*Build Container Structure */
+
+        tmp_container = GF_CALLOC(1, sizeof(struct dht_container),
+                                  gf_dht_mt_container_t);
+        if (!tmp_container) {
+            gf_log(this->name, GF_LOG_ERROR,
+                   "Failed to allocate "
+                   "memory for container");
+            ret = -1;
+            goto out;
+        }
+        /* The container gets its own dirent, so it does not point into the
+           readdirp batch that is freed above once exhausted. */
+        tmp_container->df_entry = gf_dirent_for_name(df_entry->d_name);
+        if (!tmp_container->df_entry) {
+            gf_log(this->name, GF_LOG_ERROR,
+                   "Failed to allocate "
+                   "memory for df_entry");
+            ret = -1;
+            goto out;
+        }
+
+        tmp_container->local_subvol_index = i;
+
+        tmp_container->df_entry->d_stat = df_entry->d_stat;
+
+        tmp_container->df_entry->d_ino = df_entry->d_ino;
+
+        tmp_container->df_entry->d_type = df_entry->d_type;
+
+        tmp_container->df_entry->d_len = df_entry->d_len;
+
+        tmp_container->parent_loc = GF_CALLOC(1, sizeof(*loc), gf_dht_mt_loc_t);
+        if (!tmp_container->parent_loc) {
+            gf_log(this->name, GF_LOG_ERROR,
+                   "Failed to allocate "
+                   "memory for loc");
+            ret = -1;
+            goto out;
+        }
+
+        ret = loc_copy(tmp_container->parent_loc, loc);
+        if (ret) {
+            gf_log(this->name, GF_LOG_ERROR, "loc_copy failed");
+            ret = -1;
+            goto out;
+        }
+
+        /* migrate_data is stored as a plain pointer; no reference is taken
+           on it here. */
+        tmp_container->migrate_data = migrate_data;
+
+        tmp_container->this = this;
+
+        if (df_entry->dict)
+            tmp_container->df_entry->dict = dict_ref(df_entry->dict);
+
+        /*Build Container Structure >> END*/
+
+        ret = 0;
+        goto out;
+    }
+
+out:
+    /* On success hand over whatever was built (NULL when no entry was
+       produced); on failure release a partially built container. */
+    if (ret == 0) {
+        *container = tmp_container;
+    } else {
+        if (tmp_container) {
+            gf_defrag_free_container(tmp_container);
+        }
+    }
+
+    return ret;
+}
+
+/*
+ * Crawl one directory on every local subvolume and queue its entries for
+ * the migration threads.
+ *
+ * A directory fd is opened on each entry of conf->local_subvols and the
+ * per-subvolume bookkeeping (dir_dfmeta) is allocated. Entries are then
+ * pulled one at a time through gf_defrag_get_entry(), moving to the next
+ * subvolume index after each call, and every returned container is appended
+ * to defrag->queue[0].list under defrag->dfq_mutex.
+ *
+ * @this         dht xlator; also installed as THIS for the duration of the
+ *               crawl.
+ * @defrag       rebalance state (queue, counters, condition variables).
+ * @loc          directory to crawl.
+ * @migrate_data passed through to gf_defrag_get_entry().
+ * @perrno       out: set to ENOMEM when the fd array or an fd cannot be
+ *               created, and to the opendir errno when opendir fails while
+ *               conf->decommission_in_progress is set. Also handed to
+ *               gf_defrag_get_entry().
+ *
+ * Returns 0 when there are no local subvolumes or the crawl completes, -1 on
+ * failure. If defrag_status turns GF_DEFRAG_STATUS_STOPPED mid-crawl the
+ * value last returned by gf_defrag_get_entry() is returned as-is.
+ * defrag->num_dirs_processed is incremented on every exit path.
+ */
+int
+gf_defrag_process_dir(xlator_t *this, gf_defrag_info_t *defrag, loc_t *loc,
+                      dict_t *migrate_data, int *perrno)
+{
+    int ret = -1;
+    dht_conf_t *conf = NULL;
+    gf_dirent_t entries; /* only used for sizeof() below */
+    dict_t *xattr_req = NULL;
+    struct timeval dir_start = {
+        0,
+    };
+    struct timeval end = {
+        0,
+    };
+    double elapsed = {
+        0,
+    };
+    int local_subvols_cnt = 0;
+    int i = 0;
+    int j = 0;
+    struct dht_container *container = NULL;
+    int ldfq_count = 0;
+    int dfc_index = 0;
+    int throttle_up = 0;
+    struct dir_dfmeta *dir_dfmeta = NULL;
+    xlator_t *old_THIS = NULL;
+
+    /* Capture the caller's THIS before anything can jump to 'out': that
+     * label restores THIS unconditionally, so leaving old_THIS as NULL on
+     * the early-exit path below would clobber THIS with NULL. */
+    old_THIS = THIS;
+
+    gf_log(this->name, GF_LOG_INFO, "migrate data called on %s", loc->path);
+    gettimeofday(&dir_start, NULL);
+
+    conf = this->private;
+    local_subvols_cnt = conf->local_subvols_cnt;
+
+    /* Nothing to crawl from this node. */
+    if (!local_subvols_cnt) {
+        ret = 0;
+        goto out;
+    }
+
+    THIS = this;
+
+    dir_dfmeta = GF_CALLOC(1, sizeof(*dir_dfmeta), gf_common_mt_pointer);
+    if (!dir_dfmeta) {
+        gf_log(this->name, GF_LOG_ERROR, "dir_dfmeta is NULL");
+        ret = -1;
+        goto out;
+    }
+
+    dir_dfmeta->lfd = GF_CALLOC(local_subvols_cnt, sizeof(fd_t *),
+                                gf_common_mt_pointer);
+    if (!dir_dfmeta->lfd) {
+        gf_smsg(this->name, GF_LOG_ERROR, ENOMEM, DHT_MSG_INSUFF_MEMORY,
+                "for dir_dfmeta", NULL);
+        ret = -1;
+        *perrno = ENOMEM;
+        goto out;
+    }
+
+    /* Open the directory on each local subvolume. */
+    for (i = 0; i < local_subvols_cnt; i++) {
+        dir_dfmeta->lfd[i] = fd_create(loc->inode, defrag->pid);
+        if (!dir_dfmeta->lfd[i]) {
+            gf_smsg(this->name, GF_LOG_ERROR, ENOMEM, DHT_MSG_FD_CREATE_FAILED,
+                    NULL);
+            *perrno = ENOMEM;
+            ret = -1;
+            goto out;
+        }
+
+        ret = syncop_opendir(conf->local_subvols[i], loc, dir_dfmeta->lfd[i],
+                             NULL, NULL);
+        if (ret) {
+            fd_unref(dir_dfmeta->lfd[i]);
+            dir_dfmeta->lfd[i] = NULL;
+            gf_smsg(this->name, GF_LOG_WARNING, 0, DHT_MSG_FAILED_TO_OPEN,
+                    "dir: %s", loc->path, "subvol: %s",
+                    conf->local_subvols[i]->name, NULL);
+
+            /* An opendir failure is only fatal while decommissioning;
+             * otherwise the slot is left NULL and the crawl goes on. */
+            if (conf->decommission_in_progress) {
+                *perrno = -ret;
+                ret = -1;
+                goto out;
+            }
+        } else {
+            fd_bind(dir_dfmeta->lfd[i]);
+        }
+    }
+
+    dir_dfmeta->head = GF_CALLOC(local_subvols_cnt, sizeof(*(dir_dfmeta->head)),
+                                 gf_common_mt_pointer);
+    if (!dir_dfmeta->head) {
+        gf_log(this->name, GF_LOG_ERROR, "dir_dfmeta->head is NULL");
+        ret = -1;
+        goto out;
+    }
+
+    dir_dfmeta->iterator = GF_CALLOC(local_subvols_cnt,
+                                     sizeof(*(dir_dfmeta->iterator)),
+                                     gf_common_mt_pointer);
+    if (!dir_dfmeta->iterator) {
+        gf_log(this->name, GF_LOG_ERROR, "dir_dfmeta->iterator is NULL");
+        ret = -1;
+        goto out;
+    }
+
+    dir_dfmeta->equeue = GF_CALLOC(local_subvols_cnt, sizeof(entries),
+                                   gf_dht_mt_dirent_t);
+    if (!dir_dfmeta->equeue) {
+        gf_log(this->name, GF_LOG_ERROR, "dir_dfmeta->equeue is NULL");
+        ret = -1;
+        goto out;
+    }
+
+    dir_dfmeta->offset_var = GF_CALLOC(
+        local_subvols_cnt, sizeof(dht_dfoffset_ctx_t), gf_dht_mt_octx_t);
+    if (!dir_dfmeta->offset_var) {
+        gf_log(this->name, GF_LOG_ERROR, "dir_dfmeta->offset_var is NULL");
+        ret = -1;
+        goto out;
+    }
+
+    ret = gf_defrag_ctx_subvols_init(dir_dfmeta->offset_var, this);
+    if (ret) {
+        gf_log(this->name, GF_LOG_ERROR,
+               "dht_dfoffset_ctx_t "
+               "initialization failed");
+        ret = -1;
+        goto out;
+    }
+
+    dir_dfmeta->fetch_entries = GF_CALLOC(local_subvols_cnt, sizeof(int),
+                                          gf_common_mt_int);
+    if (!dir_dfmeta->fetch_entries) {
+        gf_smsg(this->name, GF_LOG_ERROR, ENOMEM, DHT_MSG_INSUFF_MEMORY,
+                "for dir_dfmeta->fetch_entries", NULL);
+        ret = -1;
+        goto out;
+    }
+
+    /* Every per-subvol entry queue starts empty, with its iterator parked
+     * on the list head and a fetch requested. */
+    for (i = 0; i < local_subvols_cnt; i++) {
+        INIT_LIST_HEAD(&(dir_dfmeta->equeue[i].list));
+        dir_dfmeta->head[i] = &(dir_dfmeta->equeue[i].list);
+        dir_dfmeta->iterator[i] = dir_dfmeta->head[i];
+        dir_dfmeta->fetch_entries[i] = 1;
+    }
+
+    xattr_req = dict_new();
+    if (!xattr_req) {
+        gf_log(this->name, GF_LOG_ERROR, "dict_new failed");
+        ret = -1;
+        goto out;
+    }
+
+    ret = dict_set_uint32(xattr_req, conf->link_xattr_name, 256);
+    if (ret) {
+        gf_log(this->name, GF_LOG_ERROR,
+               "failed to set dict for "
+               "key: %s",
+               conf->link_xattr_name);
+        ret = -1;
+        goto out;
+    }
+
+    /*
+     Job: Read entries from each local subvol and store the entries
+          in equeue array of linked list. Now pick one entry from the
+          equeue array in a round robin basis and add them to defrag Queue.
+    */
+
+    while (!dht_dfreaddirp_done(dir_dfmeta->offset_var, local_subvols_cnt)) {
+        pthread_mutex_lock(&defrag->dfq_mutex);
+        {
+            /*Throttle up: If reconfigured count is higher than
+              current thread count, wake up the sleeping threads
+              TODO: Need to refactor this. Instead of making the
+              thread sleep and wake, we should terminate and spawn
+              threads on-demand*/
+
+            if (defrag->recon_thread_count > defrag->current_thread_count) {
+                throttle_up = (defrag->recon_thread_count -
+                               defrag->current_thread_count);
+                for (j = 0; j < throttle_up; j++) {
+                    pthread_cond_signal(&defrag->df_wakeup_thread);
+                }
+            }
+
+            /* Back-pressure: sleep until the queue has drained below the
+             * limit; wakeup_crawler tells the consumers to signal us. */
+            while (defrag->q_entry_count > MAX_MIGRATE_QUEUE_COUNT) {
+                defrag->wakeup_crawler = 1;
+                pthread_cond_wait(&defrag->rebalance_crawler_alarm,
+                                  &defrag->dfq_mutex);
+            }
+
+            ldfq_count = defrag->q_entry_count;
+
+            if (defrag->wakeup_crawler) {
+                defrag->wakeup_crawler = 0;
+            }
+        }
+        pthread_mutex_unlock(&defrag->dfq_mutex);
+
+        /* Fill the queue using the locally cached count so the mutex is
+         * only taken when an entry is actually enqueued. */
+        while (
+            ldfq_count <= MAX_MIGRATE_QUEUE_COUNT &&
+            !dht_dfreaddirp_done(dir_dfmeta->offset_var, local_subvols_cnt)) {
+            ret = gf_defrag_get_entry(this, dfc_index, &container, loc, conf,
+                                      defrag, dir_dfmeta->lfd[dfc_index],
+                                      migrate_data, dir_dfmeta, xattr_req,
+                                      perrno);
+
+            if (defrag->defrag_status == GF_DEFRAG_STATUS_STOPPED) {
+                goto out;
+            }
+
+            if (ret) {
+                gf_log(this->name, GF_LOG_WARNING,
+                       "Found "
+                       "error from gf_defrag_get_entry");
+
+                ret = -1;
+                goto out;
+            }
+
+            /* Check if we got an entry, else we need to move the
+               index to the next subvol */
+            if (!container) {
+                GF_CRAWL_INDEX_MOVE(dfc_index, local_subvols_cnt);
+                continue;
+            }
+
+            /* Q this entry in the dfq */
+            pthread_mutex_lock(&defrag->dfq_mutex);
+            {
+                list_add_tail(&container->list, &(defrag->queue[0].list));
+                defrag->q_entry_count++;
+                ldfq_count = defrag->q_entry_count;
+
+                gf_msg_debug(this->name, 0,
+                             "added "
+                             "file:%s parent:%s to the queue ",
+                             container->df_entry->d_name,
+                             container->parent_loc->path);
+
+                pthread_cond_signal(&defrag->parallel_migration_cond);
+            }
+            pthread_mutex_unlock(&defrag->dfq_mutex);
+
+            GF_CRAWL_INDEX_MOVE(dfc_index, local_subvols_cnt);
+        }
+    }
+
+    gettimeofday(&end, NULL);
+    elapsed = gf_tvdiff(&dir_start, &end);
+    gf_log(this->name, GF_LOG_INFO,
+           "Migration operation on dir %s took "
+           "%.2f secs",
+           loc->path, elapsed / 1e6);
+    ret = 0;
+out:
+    /* old_THIS was captured on entry, so this is valid on every path. */
+    THIS = old_THIS;
+    gf_defrag_free_dir_dfmeta(dir_dfmeta, local_subvols_cnt);
+
+    if (xattr_req)
+        dict_unref(xattr_req);
+
+    /* It does not matter if it errored out - this number is
+     * used to calculate rebalance estimated time to complete.
+     * No locking required as dirs are processed by a single thread.
+     */
+    defrag->num_dirs_processed++;
+    return ret;
+}
+
+/*
+ * Write the new commit hash of the volume onto @loc.
+ *
+ * The hash is sent by temporarily adding "new-commit-hash" to the shared
+ * @fix_layout dict and issuing a setxattr with it. The key is removed again
+ * before returning, whether or not the setxattr succeeded, so that later
+ * plain fix-layout setxattrs using the same dict do not carry it.
+ *
+ * Returns 0 when the hash was written, when the step is deliberately
+ * skipped (layout-fix-only or detach commands, no local subvolumes,
+ * lookup-optimize off) or when the directory is gone (ENOENT/ESTALE);
+ * -1 on any other failure or when the xlator has no private conf.
+ */
+int
+gf_defrag_settle_hash(xlator_t *this, gf_defrag_info_t *defrag, loc_t *loc,
+                      dict_t *fix_layout)
+{
+    int ret;
+    dht_conf_t *conf = NULL;
+    /*
+     * Now we're ready to update the directory commit hash for the volume
+     * root, so that hash miscompares and broadcast lookups can stop.
+     * However, we want to skip that if fix-layout is all we did.  In
+     * that case, we want the miscompares etc. to continue until a real
+     * rebalance is complete.
+     */
+    if (defrag->cmd == GF_DEFRAG_CMD_START_LAYOUT_FIX ||
+        defrag->cmd == GF_DEFRAG_CMD_DETACH_START) {
+        return 0;
+    }
+
+    conf = this->private;
+    if (!conf) {
+        /*Uh oh
+         */
+        return -1;
+    }
+
+    if (conf->local_subvols_cnt == 0 || !conf->lookup_optimize) {
+        /* Commit hash updates are only done on local subvolumes and
+         * only when lookup optimization is needed (for older client
+         * support)
+         */
+        return 0;
+    }
+
+    ret = dict_set_uint32(fix_layout, "new-commit-hash",
+                          defrag->new_commit_hash);
+    if (ret) {
+        gf_log(this->name, GF_LOG_ERROR, "Failed to set new-commit-hash");
+        return -1;
+    }
+
+    ret = syncop_setxattr(this, loc, fix_layout, 0, NULL, NULL);
+
+    /* Drop the key before looking at the result: fix_layout is shared with
+     * the caller, and leaving the key behind on a failed setxattr would
+     * attach the commit hash to every later setxattr made with this dict.
+     * TBD: find more efficient solution than adding/deleting every time */
+    dict_del(fix_layout, "new-commit-hash");
+
+    if (ret) {
+        gf_msg(this->name, GF_LOG_ERROR, -ret, DHT_MSG_LAYOUT_FIX_FAILED,
+               "fix layout on %s failed", loc->path);
+
+        if (-ret == ENOENT || -ret == ESTALE) {
+            /* Dir most likely is deleted */
+            return 0;
+        }
+
+        return -1;
+    }
+
+    return 0;
+}
+
+/*
+ * Recursively fix the layout of @loc and everything below it, then (unless
+ * the command is layout-fix-only) crawl the directory for file migration
+ * and settle its commit hash.
+ *
+ * Order of work for one directory:
+ *   1. lookup + opendir on @loc;
+ *   2. readdirp in a loop, recursing into every sub-directory;
+ *   3. setxattr of @fix_layout on @loc itself (children first, see the
+ *      comment ahead of that call);
+ *   4. gf_defrag_process_dir() when defrag->cmd is not
+ *      GF_DEFRAG_CMD_START_LAYOUT_FIX;
+ *   5. gf_defrag_settle_hash().
+ *
+ * ENOENT/ESTALE at any step is treated as "directory was renamed or
+ * removed": the directory is skipped with a return of 0 (and counted in
+ * defrag->total_failures only when conf->decommission_subvols_cnt is set).
+ * Other failures bump defrag->total_failures; while
+ * conf->decommission_in_progress is set most of them also abort the walk,
+ * usually marking defrag_status as GF_DEFRAG_STATUS_FAILED.
+ *
+ * Returns 0 on success or skip, 1 when defrag_status stopped being
+ * GF_DEFRAG_STATUS_STARTED while scanning entries, and a negative value
+ * (-1 or a negative errno from the initial lookup) on failure.
+ */
+int
+gf_defrag_fix_layout(xlator_t *this, gf_defrag_info_t *defrag, loc_t *loc,
+                     dict_t *fix_layout, dict_t *migrate_data)
+{
+    int ret = -1;
+    loc_t entry_loc = {
+        0,
+    };
+    fd_t *fd = NULL;
+    gf_dirent_t entries;
+    gf_dirent_t *tmp = NULL;
+    gf_dirent_t *entry = NULL;
+    gf_boolean_t free_entries = _gf_false;
+    off_t offset = 0;
+    struct iatt iatt = {
+        0,
+    };
+    inode_t *linked_inode = NULL, *inode = NULL;
+    dht_conf_t *conf = NULL;
+    int perrno = 0;
+
+    conf = this->private;
+    if (!conf) {
+        ret = -1;
+        goto out;
+    }
+
+    ret = syncop_lookup(this, loc, &iatt, NULL, NULL, NULL);
+    if (ret) {
+        /* A failed lookup on the root is always fatal. */
+        if (strcmp(loc->path, "/") == 0) {
+            gf_msg(this->name, GF_LOG_ERROR, -ret, DHT_MSG_DIR_LOOKUP_FAILED,
+                   "lookup failed for:%s", loc->path);
+
+            defrag->total_failures++;
+            ret = -1;
+            goto out;
+        }
+
+        if (-ret == ENOENT || -ret == ESTALE) {
+            gf_msg(this->name, GF_LOG_INFO, -ret, DHT_MSG_DIR_LOOKUP_FAILED,
+                   "Dir:%s renamed or removed. Skipping", loc->path);
+            if (conf->decommission_subvols_cnt) {
+                defrag->total_failures++;
+            }
+            ret = 0;
+            goto out;
+        } else {
+            gf_msg(this->name, GF_LOG_ERROR, -ret, DHT_MSG_DIR_LOOKUP_FAILED,
+                   "lookup failed for:%s", loc->path);
+
+            defrag->total_failures++;
+            goto out;
+        }
+    }
+
+    fd = fd_create(loc->inode, defrag->pid);
+    if (!fd) {
+        gf_log(this->name, GF_LOG_ERROR, "Failed to create fd");
+        ret = -1;
+        goto out;
+    }
+
+    ret = syncop_opendir(this, loc, fd, NULL, NULL);
+    if (ret) {
+        if (-ret == ENOENT || -ret == ESTALE) {
+            if (conf->decommission_subvols_cnt) {
+                defrag->total_failures++;
+            }
+            ret = 0;
+            goto out;
+        }
+
+        gf_log(this->name, GF_LOG_ERROR,
+               "Failed to open dir %s, "
+               "err:%d",
+               loc->path, -ret);
+
+        ret = -1;
+        goto out;
+    }
+
+    fd_bind(fd);
+    INIT_LIST_HEAD(&entries.list);
+
+    /* Read the directory in batches; 'offset' is advanced per entry below
+     * so the next readdirp resumes after the last entry seen. */
+    while ((ret = syncop_readdirp(this, fd, 131072, offset, &entries, NULL,
+                                  NULL)) != 0) {
+        if (ret < 0) {
+            if (-ret == ENOENT || -ret == ESTALE) {
+                if (conf->decommission_subvols_cnt) {
+                    defrag->total_failures++;
+                }
+                ret = 0;
+                goto out;
+            }
+
+            gf_msg(this->name, GF_LOG_ERROR, -ret, DHT_MSG_READDIR_ERROR,
+                   "readdirp failed for "
+                   "path %s. Aborting fix-layout",
+                   loc->path);
+
+            ret = -1;
+            goto out;
+        }
+
+        if (list_empty(&entries.list))
+            break;
+
+        /* From here on 'entries' owns dirents that 'out' must release. */
+        free_entries = _gf_true;
+
+        list_for_each_entry_safe(entry, tmp, &entries.list, list)
+        {
+            if (defrag->defrag_status != GF_DEFRAG_STATUS_STARTED) {
+                ret = 1;
+                goto out;
+            }
+
+            offset = entry->d_off;
+
+            /* Only real sub-directories are recursed into. */
+            if (!strcmp(entry->d_name, ".") || !strcmp(entry->d_name, ".."))
+                continue;
+            if (!IA_ISDIR(entry->d_stat.ia_type)) {
+                continue;
+            }
+            loc_wipe(&entry_loc);
+
+            ret = dht_build_child_loc(this, &entry_loc, loc, entry->d_name);
+            if (ret) {
+                gf_log(this->name, GF_LOG_ERROR,
+                       "Child loc"
+                       " build failed for entry: %s",
+                       entry->d_name);
+
+                if (conf->decommission_in_progress) {
+                    defrag->defrag_status = GF_DEFRAG_STATUS_FAILED;
+
+                    goto out;
+                } else {
+                    continue;
+                }
+            }
+
+            if (gf_uuid_is_null(entry->d_stat.ia_gfid)) {
+                gf_log(this->name, GF_LOG_ERROR,
+                       "%s/%s"
+                       " gfid not present",
+                       loc->path, entry->d_name);
+                continue;
+            }
+
+            gf_uuid_copy(entry_loc.gfid, entry->d_stat.ia_gfid);
+
+            /*In case the gfid stored in the inode by inode_link
+             * and the gfid obtained in the lookup differs, then
+             * client3_3_lookup_cbk will return ESTALE and proper
+             * error will be captured
+             */
+
+            linked_inode = inode_link(entry_loc.inode, loc->inode,
+                                      entry->d_name, &entry->d_stat);
+
+            /* Swap in the linked inode and drop the reference held on the
+             * one dht_build_child_loc() put in entry_loc. */
+            inode = entry_loc.inode;
+            entry_loc.inode = linked_inode;
+            inode_unref(inode);
+
+            if (gf_uuid_is_null(loc->gfid)) {
+                gf_log(this->name, GF_LOG_ERROR,
+                       "%s/%s"
+                       " gfid not present",
+                       loc->path, entry->d_name);
+                continue;
+            }
+
+            gf_uuid_copy(entry_loc.pargfid, loc->gfid);
+
+            ret = syncop_lookup(this, &entry_loc, &iatt, NULL, NULL, NULL);
+            if (ret) {
+                if (-ret == ENOENT || -ret == ESTALE) {
+                    /* Report the child that vanished, not its parent. */
+                    gf_msg(this->name, GF_LOG_INFO, -ret,
+                           DHT_MSG_DIR_LOOKUP_FAILED,
+                           "Dir:%s renamed or removed. "
+                           "Skipping",
+                           entry_loc.path);
+                    ret = 0;
+                    if (conf->decommission_subvols_cnt) {
+                        defrag->total_failures++;
+                    }
+                    continue;
+                } else {
+                    gf_msg(this->name, GF_LOG_ERROR, -ret,
+                           DHT_MSG_DIR_LOOKUP_FAILED, "lookup failed for:%s",
+                           entry_loc.path);
+
+                    defrag->total_failures++;
+
+                    if (conf->decommission_in_progress) {
+                        defrag->defrag_status = GF_DEFRAG_STATUS_FAILED;
+                        ret = -1;
+                        goto out;
+                    } else {
+                        continue;
+                    }
+                }
+            }
+
+            /* A non-zero return value from the recursive call means the
+             * sub-directory could not be fully processed (or the walk was
+             * interrupted); see the handling just below. */
+
+            ret = gf_defrag_fix_layout(this, defrag, &entry_loc, fix_layout,
+                                       migrate_data);
+
+            if (defrag->defrag_status == GF_DEFRAG_STATUS_STOPPED ||
+                defrag->defrag_status == GF_DEFRAG_STATUS_FAILED) {
+                goto out;
+            }
+
+            if (ret) {
+                gf_msg(this->name, GF_LOG_ERROR, 0, DHT_MSG_LAYOUT_FIX_FAILED,
+                       "Fix layout failed for %s", entry_loc.path);
+
+                defrag->total_failures++;
+
+                if (conf->decommission_in_progress) {
+                    defrag->defrag_status = GF_DEFRAG_STATUS_FAILED;
+
+                    goto out;
+                } else {
+                    /* Let's not commit-hash if
+                     * gf_defrag_fix_layout failed*/
+                    continue;
+                }
+            }
+        }
+
+        /* Batch done: release it and reset the list for the next read. */
+        gf_dirent_free(&entries);
+        free_entries = _gf_false;
+        INIT_LIST_HEAD(&entries.list);
+    }
+
+    /* A directory layout is fixed only after its subdirs are healed to
+     * any newly added bricks. If the layout is fixed before subdirs are
+     * healed, the newly added brick will get a non-null layout.
+     * Any subdirs which hash to that layout will no longer show up
+     * in a directory listing until they are healed.
+     */
+
+    ret = syncop_setxattr(this, loc, fix_layout, 0, NULL, NULL);
+
+    /* In case of a race where the directory is deleted just before
+     * layout setxattr, the errors are updated in the layout structure.
+     * We can use this information to make a decision whether the directory
+     * is deleted entirely.
+     */
+    if (ret == 0) {
+        ret = dht_dir_layout_error_check(this, loc->inode);
+        /* Negated so the errno tests below work the same way as for a
+         * failed setxattr. */
+        ret = -ret;
+    }
+
+    if (ret) {
+        if (-ret == ENOENT || -ret == ESTALE) {
+            gf_msg(this->name, GF_LOG_INFO, -ret, DHT_MSG_LAYOUT_FIX_FAILED,
+                   "Setxattr failed. Dir %s "
+                   "renamed or removed",
+                   loc->path);
+            if (conf->decommission_subvols_cnt) {
+                defrag->total_failures++;
+            }
+            ret = 0;
+            goto out;
+        } else {
+            gf_msg(this->name, GF_LOG_ERROR, -ret, DHT_MSG_LAYOUT_FIX_FAILED,
+                   "Setxattr failed for %s", loc->path);
+
+            defrag->total_failures++;
+
+            if (conf->decommission_in_progress) {
+                defrag->defrag_status = GF_DEFRAG_STATUS_FAILED;
+                ret = -1;
+                goto out;
+            }
+        }
+    }
+
+    if (defrag->cmd != GF_DEFRAG_CMD_START_LAYOUT_FIX) {
+        ret = gf_defrag_process_dir(this, defrag, loc, migrate_data, &perrno);
+
+        if (ret) {
+            if (perrno == ENOENT || perrno == ESTALE) {
+                ret = 0;
+                goto out;
+            } else {
+                defrag->total_failures++;
+
+                gf_msg(this->name, GF_LOG_ERROR, 0,
+                       DHT_MSG_DEFRAG_PROCESS_DIR_FAILED,
+                       "gf_defrag_process_dir failed for "
+                       "directory: %s",
+                       loc->path);
+
+                if (conf->decommission_in_progress) {
+                    goto out;
+                }
+            }
+        }
+    }
+
+    gf_msg_trace(this->name, 0, "fix layout called on %s", loc->path);
+
+    if (gf_defrag_settle_hash(this, defrag, loc, fix_layout) != 0) {
+        defrag->total_failures++;
+
+        gf_msg(this->name, GF_LOG_ERROR, 0, DHT_MSG_SETTLE_HASH_FAILED,
+               "Settle hash failed for %s", loc->path);
+
+        ret = -1;
+
+        if (conf->decommission_in_progress) {
+            defrag->defrag_status = GF_DEFRAG_STATUS_FAILED;
+            goto out;
+        }
+    }
+
+    /* Outside of decommission, failures recorded above are not fatal for
+     * this directory: they were counted and the walk reports success. */
+    ret = 0;
+out:
+    if (free_entries)
+        gf_dirent_free(&entries);
+
+    loc_wipe(&entry_loc);
+
+    if (fd)
+        fd_unref(fd);
+
+    return ret;
+}
+
+/*
+ * Ask the volume which subvolumes are local to this node and log the
+ * result.
+ *
+ * The query is a getxattr for GF_REBAL_FIND_LOCAL_SUBVOL; when that reports
+ * ENODATA the older key GF_REBAL_OLD_FIND_LOCAL_SUBVOL is tried instead.
+ * The returned dict is not read here; only conf->local_subvols and
+ * conf->local_nodeuuids are consulted afterwards (presumably filled in as
+ * a side effect of the getxattr; confirm against the getxattr handler).
+ *
+ * Returns 0 on success, -1 when neither query succeeds.
+ */
+int
+dht_init_local_subvols_and_nodeuuids(xlator_t *this, dht_conf_t *conf,
+                                     loc_t *loc)
+{
+    dict_t *dict = NULL;
+    uuid_t *uuid_ptr = NULL;
+    int ret = -1;
+    int i = 0;
+    int j = 0;
+
+    /* Find local subvolumes */
+    ret = syncop_getxattr(this, loc, &dict, GF_REBAL_FIND_LOCAL_SUBVOL, NULL,
+                          NULL);
+    if (ret && (ret != -ENODATA)) {
+        gf_msg(this->name, GF_LOG_ERROR, -ret, 0,
+               "local "
+               "subvolume determination failed with error: %d",
+               -ret);
+        ret = -1;
+        goto out;
+    }
+
+    if (!ret)
+        goto out;
+
+    /* ENODATA: fall back to the old key. Release anything the first call
+     * may have handed back so the pointer is not overwritten and lost. */
+    if (dict) {
+        dict_unref(dict);
+        dict = NULL;
+    }
+
+    ret = syncop_getxattr(this, loc, &dict, GF_REBAL_OLD_FIND_LOCAL_SUBVOL,
+                          NULL, NULL);
+    if (ret) {
+        gf_msg(this->name, GF_LOG_ERROR, -ret, 0,
+               "local "
+               "subvolume determination failed with error: %d",
+               -ret);
+        ret = -1;
+        goto out;
+    }
+    ret = 0;
+
+out:
+    /* The reply dict is never inspected; drop the reference we were given
+     * so it is not leaked. */
+    if (dict)
+        dict_unref(dict);
+
+    if (ret) {
+        return ret;
+    }
+
+    for (i = 0; i < conf->local_subvols_cnt; i++) {
+        gf_msg(this->name, GF_LOG_INFO, 0, 0,
+               "local subvol: "
+               "%s",
+               conf->local_subvols[i]->name);
+
+        for (j = 0; j < conf->local_nodeuuids[i].count; j++) {
+            uuid_ptr = &(conf->local_nodeuuids[i].elements[j].uuid);
+            gf_msg(this->name, GF_LOG_INFO, 0, 0, "node uuid : %s",
+                   uuid_utoa(*uuid_ptr));
+        }
+    }
+
+    return ret;
+}
+
+/* Functions for the rebalance estimates feature */
+
+/*
+ * Return the number of bytes in use on one subvolume, computed from a
+ * statfs on @root_loc as (total blocks - free blocks) * fragment size.
+ * Returns 0 when the statfs call fails.
+ */
+uint64_t
+gf_defrag_subvol_file_size(xlator_t *this, loc_t *root_loc)
+{
+    struct statvfs fs_stats = {
+        0,
+    };
+    uint64_t used_bytes = 0;
+
+    if (syncop_statfs(this, root_loc, &fs_stats, NULL, NULL) == 0) {
+        used_bytes = ((fs_stats.f_blocks - fs_stats.f_bfree) *
+                      fs_stats.f_frsize);
+    }
+
+    /* Still 0 here if statfs failed. */
+    return used_bytes;
+}
+
+/*
+ * Sum the used space reported by gf_defrag_subvol_file_size() over every
+ * local subvolume, logging each contribution and the grand total.
+ * Returns 0 when the xlator has no private conf.
+ */
+uint64_t
+gf_defrag_total_file_size(xlator_t *this, loc_t *root_loc)
+{
+    dht_conf_t *conf = this->private;
+    uint64_t subvol_bytes = 0;
+    uint64_t grand_total = 0;
+    int idx = 0;
+
+    if (conf == NULL) {
+        return 0;
+    }
+
+    for (idx = 0; idx < conf->local_subvols_cnt; ++idx) {
+        xlator_t *subvol = conf->local_subvols[idx];
+
+        subvol_bytes = gf_defrag_subvol_file_size(subvol, root_loc);
+        grand_total += subvol_bytes;
+        gf_msg(this->name, GF_LOG_INFO, 0, 0,
+               "local subvol: %s,"
+               "cnt = %" PRIu64,
+               subvol->name, subvol_bytes);
+    }
+
+    gf_msg(this->name, GF_LOG_INFO, 0, 0, "Total size files = %" PRIu64,
+           grand_total);
+
+    return grand_total;
+}
+
+/*
+ * Thread body that refreshes g_totalsize while a rebalance is running.
+ *
+ * @args is the gf_defrag_info_t of the rebalance. Each round waits on
+ * defrag->fc_wakeup_cond with a deadline 600 seconds ahead, re-checks
+ * defrag_status, and then recomputes the total data size; a result of 0 is
+ * logged as an error and leaves g_totalsize untouched. The loop ends once
+ * defrag_status is no longer GF_DEFRAG_STATUS_STARTED. Always returns NULL.
+ */
+static void *
+dht_file_counter_thread(void *args)
+{
+    gf_defrag_info_t *defrag = (gf_defrag_info_t *)args;
+    loc_t root_loc = {
+        0,
+    };
+    struct timespec deadline = {
+        0,
+    };
+    uint64_t latest_total = 0;
+
+    if (defrag == NULL)
+        return NULL;
+
+    dht_build_root_loc(defrag->root_inode, &root_loc);
+
+    while (defrag->defrag_status == GF_DEFRAG_STATUS_STARTED) {
+        timespec_now(&deadline);
+        deadline.tv_sec += 600;
+
+        pthread_mutex_lock(&defrag->fc_mutex);
+        {
+            pthread_cond_timedwait(&defrag->fc_wakeup_cond, &defrag->fc_mutex,
+                                   &deadline);
+        }
+        pthread_mutex_unlock(&defrag->fc_mutex);
+
+        /* The status may have changed while we slept. */
+        if (defrag->defrag_status != GF_DEFRAG_STATUS_STARTED)
+            break;
+
+        latest_total = gf_defrag_total_file_size(defrag->this, &root_loc);
+
+        gf_log("dht", GF_LOG_INFO, "tmp data size =%" PRIu64, latest_total);
+
+        if (latest_total) {
+            g_totalsize = latest_total;
+            gf_msg_debug("dht", 0, "total data size =%" PRIu64, g_totalsize);
+        } else {
+            gf_msg("dht", GF_LOG_ERROR, 0, 0,
+                   "Failed to get "
+                   "the total data size. Unable to estimate "
+                   "time to complete rebalance.");
+        }
+    }
+
+    return NULL;
+}
+
+/*
+ * Stop the file counter thread: broadcast on defrag->fc_wakeup_cond so the
+ * thread wakes up and re-checks the defrag status, then join it.
+ *
+ * Returns 0 when the join succeeds, -1 (after logging) otherwise.
+ */
+int
+gf_defrag_estimates_cleanup(xlator_t *this, gf_defrag_info_t *defrag,
+                            pthread_t filecnt_thread)
+{
+    int join_err = 0;
+
+    /* By now the defrag status will no longer be
+     * GF_DEFRAG_STATUS_STARTED, so the woken thread leaves its loop.
+     */
+    pthread_mutex_lock(&defrag->fc_mutex);
+    pthread_cond_broadcast(&defrag->fc_wakeup_cond);
+    pthread_mutex_unlock(&defrag->fc_mutex);
+
+    join_err = pthread_join(filecnt_thread, NULL);
+    if (join_err == 0)
+        return 0;
+
+    gf_msg("dht", GF_LOG_ERROR, join_err, 0,
+           "file_counter_thread: pthread_join failed.");
+    return -1;
+}
+
+/*
+ * Seed g_totalsize with the current total data size and start the file
+ * counter thread, storing its id in @filecnt_thread.
+ *
+ * Returns 0 once the thread is running; -1 if the total size came back as
+ * 0 (no thread is started in that case) or thread creation failed.
+ */
+int
+gf_defrag_estimates_init(xlator_t *this, loc_t *loc, pthread_t *filecnt_thread)
+{
+    dht_conf_t *conf = this->private;
+    gf_defrag_info_t *defrag = conf->defrag;
+    int spawn_err = 0;
+
+    g_totalsize = gf_defrag_total_file_size(this, loc);
+    if (g_totalsize == 0) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, 0,
+               "Failed to get "
+               "the total data size. Unable to estimate "
+               "time to complete rebalance.");
+        return -1;
+    }
+
+    spawn_err = gf_thread_create(filecnt_thread, NULL, dht_file_counter_thread,
+                                 (void *)defrag, "dhtfcnt");
+    if (spawn_err != 0) {
+        gf_msg(this->name, GF_LOG_ERROR, spawn_err, 0,
+               "Failed to "
+               "create the file counter thread ");
+        return -1;
+    }
+
+    return 0;
+}
+
+/* Init and cleanup functions for parallel file migration*/
+/*
+ * Set up parallel file migration: allocate the global entry queue and
+ * spawn MAX(MAX_REBAL_THREADS, 4) gf_defrag_task threads.
+ *
+ * @tid_array    out: the thread id array (NULL if it was never allocated).
+ * @thread_index out: number of threads actually created, which is also the
+ *               index of the thread that failed when creation stops early.
+ *
+ * Both out parameters are written on every path, including failure.
+ * Returns 0 when every thread was created, -1 otherwise.
+ */
+int
+gf_defrag_parallel_migration_init(xlator_t *this, gf_defrag_info_t *defrag,
+                                  pthread_t **tid_array, int *thread_index)
+{
+    int ret = -1;
+    int spawn_total = 0;
+    int spawned = 0;
+    pthread_t *workers = NULL;
+
+    if (defrag == NULL)
+        goto out;
+
+    /* The queue is a single container whose list member is the head. */
+    defrag->queue = GF_CALLOC(1, sizeof(struct dht_container),
+                              gf_dht_mt_container_t);
+    if (defrag->queue == NULL) {
+        gf_msg(this->name, GF_LOG_ERROR, ENOMEM, 0,
+               "Failed to initialise migration queue");
+        ret = -1;
+        goto out;
+    }
+
+    INIT_LIST_HEAD(&(defrag->queue[0].list));
+
+    spawn_total = MAX(MAX_REBAL_THREADS, 4);
+
+    gf_msg_debug(this->name, 0, "thread_spawn_count: %d", spawn_total);
+
+    workers = GF_CALLOC(spawn_total, sizeof(pthread_t),
+                        gf_common_mt_pthread_t);
+    if (workers == NULL) {
+        gf_msg(this->name, GF_LOG_ERROR, ENOMEM, 0,
+               "Failed to create migration threads");
+        ret = -1;
+        goto out;
+    }
+    defrag->current_thread_count = spawn_total;
+
+    /* Spawn the workers; stop at the first failure so 'spawned' reflects
+     * how many are really running. */
+    for (spawned = 0; spawned < spawn_total; spawned++) {
+        ret = gf_thread_create(&(workers[spawned]), NULL, gf_defrag_task,
+                               (void *)defrag, "dhtmig%d",
+                               (spawned + 1) & 0x3ff);
+        if (ret != 0) {
+            gf_msg("DHT", GF_LOG_ERROR, ret, 0, "Thread[%d] creation failed. ",
+                   spawned);
+            ret = -1;
+            goto out;
+        }
+
+        gf_log("DHT", GF_LOG_INFO,
+               "Thread[%d] "
+               "creation successful",
+               spawned);
+    }
+
+    ret = 0;
+out:
+    *thread_index = spawned;
+    *tid_array = workers;
+
+    return ret;
+}
+
+/*
+ * Tear down parallel file migration.
+ *
+ * Marks the crawl as done and wakes every migration thread (both condition
+ * variables are broadcast under defrag->dfq_mutex), joins the first
+ * @thread_index threads of @tid_array, frees the array and finally releases
+ * the migration queue head.
+ *
+ * Returns 0 on success, -1 when @defrag is NULL (nothing is touched then).
+ */
+int
+gf_defrag_parallel_migration_cleanup(gf_defrag_info_t *defrag,
+                                     pthread_t *tid_array, int thread_index)
+{
+    int ret = -1;
+    int i = 0;
+
+    if (!defrag)
+        goto out;
+
+    /* Wake up all migration threads */
+    pthread_mutex_lock(&defrag->dfq_mutex);
+    {
+        defrag->crawl_done = 1;
+
+        pthread_cond_broadcast(&defrag->parallel_migration_cond);
+        pthread_cond_broadcast(&defrag->df_wakeup_thread);
+    }
+    pthread_mutex_unlock(&defrag->dfq_mutex);
+
+    /*Wait for all the threads to complete their task*/
+    for (i = 0; i < thread_index; i++) {
+        pthread_join(tid_array[i], NULL);
+    }
+
+    GF_FREE(tid_array);
+
+    /* Cleanup the migration queue */
+    if (defrag->queue) {
+        gf_dirent_free(defrag->queue[0].df_entry);
+        INIT_LIST_HEAD(&(defrag->queue[0].list));
+    }
+
+    GF_FREE(defrag->queue);
+    /* Do not leave a dangling pointer in the long-lived defrag state: a
+     * repeated cleanup or a late reader would otherwise touch freed
+     * memory. */
+    defrag->queue = NULL;
+
+    ret = 0;
+out:
+    return ret;
+}
+
+/* Synctask body for rebalance: fix the layout of '/', optionally start the
+ * parallel migration threads, crawl the tree, then publish the final status
+ * and free conf->defrag. 'data' is the dht xlator. Returns 0 on success and
+ * -1 on failure; the early 'exit' path skips all cleanup and notification.
+ */
+int
+gf_defrag_start_crawl(void *data)
+{
+    xlator_t *this = NULL;
+    dht_conf_t *conf = NULL;
+    gf_defrag_info_t *defrag = NULL;
+    dict_t *fix_layout = NULL;
+    dict_t *migrate_data = NULL;
+    dict_t *status = NULL;
+    glusterfs_ctx_t *ctx = NULL;
+    call_frame_t *statfs_frame = NULL;
+    xlator_t *old_THIS = NULL;
+    int ret = -1;
+    loc_t loc = {0};
+    struct iatt iatt = {0};
+    struct iatt parent = {0};
+    int thread_index = 0;
+    pthread_t *tid = NULL;
+    pthread_t filecnt_thread;
+    gf_boolean_t fc_thread_started = _gf_false;
+
+    this = data;
+    if (!this)
+        goto exit;
+
+    ctx = this->ctx;
+    if (!ctx)
+        goto exit;
+
+    conf = this->private;
+    if (!conf)
+        goto exit;
+
+    defrag = conf->defrag;
+    if (!defrag)
+        goto exit;
+
+    defrag->start_time = gf_time();
+
+    dht_build_root_inode(this, &defrag->root_inode);
+    if (!defrag->root_inode)
+        goto out;
+
+    dht_build_root_loc(defrag->root_inode, &loc);
+
+    /* fix-layout on '/' first */
+
+    ret = syncop_lookup(this, &loc, &iatt, &parent, NULL, NULL);
+
+    if (ret) {
+        gf_msg(this->name, GF_LOG_ERROR, -ret, DHT_MSG_REBALANCE_START_FAILED,
+               "Failed to start rebalance: look up on / failed");
+        ret = -1;
+        goto out;
+    }
+
+    old_THIS = THIS;
+    THIS = this;
+
+    statfs_frame = create_frame(this, this->ctx->pool);
+    if (!statfs_frame) {
+        gf_msg(this->name, GF_LOG_ERROR, ENOMEM, DHT_MSG_NO_MEMORY,
+               "Insufficient memory. Frame creation failed");
+        THIS = old_THIS; /* the restore below is skipped on this path */
+        ret = -1;
+        goto out;
+    }
+
+    /* async statfs update for honoring min-free-disk */
+    dht_get_du_info(statfs_frame, this, &loc);
+    THIS = old_THIS;
+
+    fix_layout = dict_new();
+    if (!fix_layout) {
+        ret = -1;
+        goto out;
+    }
+
+    /*
+     * Unfortunately, we can't do special xattrs (like fix.layout) and
+     * real ones in the same call currently, and changing it seems
+     * riskier than just doing two calls.
+     */
+
+    gf_log(this->name, GF_LOG_INFO, "%s using commit hash %u", __func__,
+           conf->vol_commit_hash);
+
+    ret = dict_set_uint32(fix_layout, conf->commithash_xattr_name,
+                          conf->vol_commit_hash);
+    if (ret) {
+        gf_log(this->name, GF_LOG_ERROR, "Failed to set %s",
+               conf->commithash_xattr_name);
+        defrag->total_failures++;
+        ret = -1;
+        goto out;
+    }
+
+    ret = syncop_setxattr(this, &loc, fix_layout, 0, NULL, NULL);
+    if (ret) {
+        gf_log(this->name, GF_LOG_ERROR,
+               "Failed to set commit hash on %s. "
+               "Rebalance cannot proceed.",
+               loc.path);
+        defrag->total_failures++;
+        ret = -1;
+        goto out;
+    }
+
+    /* We now return to our regularly scheduled program. */
+
+    ret = dict_set_str(fix_layout, GF_XATTR_FIX_LAYOUT_KEY, "yes");
+    if (ret) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, DHT_MSG_REBALANCE_START_FAILED,
+               "Failed to start rebalance:"
+               "Failed to set dictionary value: key = %s",
+               GF_XATTR_FIX_LAYOUT_KEY);
+        defrag->total_failures++;
+        ret = -1;
+        goto out;
+    }
+
+    defrag->new_commit_hash = conf->vol_commit_hash;
+
+    ret = syncop_setxattr(this, &loc, fix_layout, 0, NULL, NULL);
+    if (ret) {
+        gf_msg(this->name, GF_LOG_ERROR, -ret, DHT_MSG_REBALANCE_FAILED,
+               "fix layout on %s failed", loc.path);
+        defrag->total_failures++;
+        ret = -1;
+        goto out;
+    }
+
+    if (defrag->cmd != GF_DEFRAG_CMD_START_LAYOUT_FIX) {
+        /* We need to migrate files */
+
+        migrate_data = dict_new();
+        if (!migrate_data) {
+            defrag->total_failures++;
+            ret = -1;
+            goto out;
+        }
+        ret = dict_set_str(
+            migrate_data, GF_XATTR_FILE_MIGRATE_KEY,
+            (defrag->cmd == GF_DEFRAG_CMD_START_FORCE) ? "force" : "non-force");
+        if (ret) {
+            defrag->total_failures++;
+            ret = -1;
+            goto out;
+        }
+
+        ret = dht_init_local_subvols_and_nodeuuids(this, conf, &loc);
+        if (ret) {
+            ret = -1;
+            goto out;
+        }
+
+        /* Initialise the structures required for parallel migration */
+        ret = gf_defrag_parallel_migration_init(this, defrag, &tid,
+                                                &thread_index);
+        if (ret) {
+            gf_msg(this->name, GF_LOG_ERROR, 0, 0, "Aborting rebalance.");
+            goto out;
+        }
+
+        ret = gf_defrag_estimates_init(this, &loc, &filecnt_thread);
+        if (ret) {
+            /* Not a fatal error. Allow the rebalance to proceed*/
+            ret = 0;
+        } else {
+            fc_thread_started = _gf_true;
+        }
+    }
+
+    ret = gf_defrag_fix_layout(this, defrag, &loc, fix_layout, migrate_data);
+    if (ret) {
+        defrag->total_failures++;
+        ret = -1;
+        goto out;
+    }
+
+    if (gf_defrag_settle_hash(this, defrag, &loc, fix_layout) != 0) {
+        defrag->total_failures++;
+        ret = -1;
+        goto out;
+    }
+
+    gf_log("DHT", GF_LOG_INFO, "crawling file-system completed");
+out:
+
+    /* We are here means crawling the entire file system is done
+       or something failed. Set defrag->crawl_done flag to intimate
+       the migrator threads to exhaust the defrag->queue and terminate*/
+
+    if (ret) {
+        defrag->defrag_status = GF_DEFRAG_STATUS_FAILED;
+    }
+
+    gf_defrag_parallel_migration_cleanup(defrag, tid, thread_index);
+
+    if ((defrag->defrag_status != GF_DEFRAG_STATUS_STOPPED) &&
+        (defrag->defrag_status != GF_DEFRAG_STATUS_FAILED)) {
+        defrag->defrag_status = GF_DEFRAG_STATUS_COMPLETE;
+    }
+
+    if (fc_thread_started) {
+        gf_defrag_estimates_cleanup(this, defrag, filecnt_thread);
+    }
+
+    dht_send_rebalance_event(this, defrag->cmd, defrag->defrag_status);
+
+    status = dict_new();
+    LOCK(&defrag->lock);
+    {
+        gf_defrag_status_get(conf, status);
+        if (ctx && ctx->notify)
+            ctx->notify(GF_EN_DEFRAG_STATUS, status);
+        if (status)
+            dict_unref(status);
+        defrag->is_exiting = 1;
+    }
+    UNLOCK(&defrag->lock);
+
+    GF_FREE(defrag);
+    conf->defrag = NULL;
+
+    if (migrate_data)
+        dict_unref(migrate_data);
+
+    if (statfs_frame) {
+        STACK_DESTROY(statfs_frame->root);
+    }
+exit:
+    return ret;
+}
+
+static int
+gf_defrag_done(int ret, call_frame_t *sync_frame, void *data)
+{
+    gf_listener_stop(sync_frame->this);
+    /* Crawl synctask is over: drop its frame and SIGTERM this process. */
+    STACK_DESTROY(sync_frame->root);
+    kill(getpid(), SIGTERM);
+    return 0; /* 'ret' and 'data' are not used */
+}
+
+void *
+gf_defrag_start(void *data)
+{
+    int ret = -1;
+    call_frame_t *frame = NULL;
+    dht_conf_t *conf = NULL;
+    gf_defrag_info_t *defrag = NULL;
+    xlator_t *this = NULL;
+    xlator_t *old_THIS = NULL;
+    /* Launch gf_defrag_start_crawl as a synctask; 'data' is the dht xlator. */
+    this = data;
+    conf = this->private;
+    if (!conf)
+        goto out;
+
+    defrag = conf->defrag;
+    if (!defrag)
+        goto out;
+
+    frame = create_frame(this, this->ctx->pool);
+    if (!frame)
+        goto out;
+
+    frame->root->pid = GF_CLIENT_PID_DEFRAG;
+
+    defrag->pid = frame->root->pid;
+
+    defrag->defrag_status = GF_DEFRAG_STATUS_STARTED;
+
+    old_THIS = THIS;
+    THIS = this;
+    ret = synctask_new(this->ctx->env, gf_defrag_start_crawl, gf_defrag_done,
+                       frame, this);
+    /* NOTE(review): on failure 'frame' is not destroyed here - confirm. */
+    if (ret)
+        gf_msg(this->name, GF_LOG_ERROR, 0, DHT_MSG_REBALANCE_START_FAILED,
+               "Could not create task for rebalance");
+    THIS = old_THIS;
+out:
+    return NULL; /* always NULL, on success and on every failure path */
+}
+
+uint64_t
+gf_defrag_get_estimates_based_on_size(dht_conf_t *conf)
+{
+    gf_defrag_info_t *defrag = NULL;
+    double rate_processed = 0;
+    uint64_t total_processed = 0;
+    uint64_t tmp_count = 0;
+    uint64_t time_to_complete = 0;
+    double elapsed = 0;
+    /* Estimated total run time in seconds; 0 when no estimate is available. */
+    defrag = conf->defrag; /* NOTE(review): dereferenced below without a check */
+
+    if (!g_totalsize)
+        goto out;
+
+    elapsed = gf_time() - defrag->start_time;
+
+    /* Don't calculate the estimates for the first 10 minutes.
+     * It is unlikely to be accurate and estimates are not required
+     * if the process finishes in less than 10 mins.
+     */
+
+    if (elapsed < ESTIMATE_START_INTERVAL) {
+        gf_msg(THIS->name, GF_LOG_INFO, 0, 0,
+               "Rebalance estimates will not be available for the "
+               "first %d seconds.",
+               ESTIMATE_START_INTERVAL);
+
+        goto out;
+    }
+
+    total_processed = defrag->size_processed;
+
+    /* size processed per second of elapsed time */
+    rate_processed = (total_processed) / elapsed;
+
+    tmp_count = g_totalsize;
+
+    if (rate_processed) {
+        time_to_complete = (tmp_count) / rate_processed; /* total size / rate */
+
+    } else {
+        gf_msg(THIS->name, GF_LOG_ERROR, 0, 0,
+               "Unable to calculate estimated time for rebalance");
+    }
+
+    gf_log(THIS->name, GF_LOG_INFO,
+           "TIME: (size) total_processed=%" PRIu64 " tmp_cnt = %" PRIu64
+           ","
+           "rate_processed=%f, elapsed = %f",
+           total_processed, tmp_count, rate_processed, elapsed);
+
+out:
+    return time_to_complete;
+}
+
+int
+gf_defrag_status_get(dht_conf_t *conf, dict_t *dict)
+{
+    int ret = 0;
+    uint64_t files = 0;
+    uint64_t size = 0;
+    uint64_t lookup = 0;
+    uint64_t failures = 0;
+    uint64_t skipped = 0;
+    char *status = "";
+    double elapsed = 0;
+    uint64_t time_to_complete = 0;
+    uint64_t time_left = 0;
+    gf_defrag_info_t *defrag = conf->defrag;
+    /* Copy the rebalance counters into 'dict' (may be NULL) and log them. */
+    if (!defrag)
+        goto out;
+
+    ret = 0;
+    if (defrag->defrag_status == GF_DEFRAG_STATUS_NOT_STARTED)
+        goto out; /* nothing is reported or logged before the start */
+
+    files = defrag->total_files;
+    size = defrag->total_data;
+    lookup = defrag->num_files_lookedup;
+    failures = defrag->total_failures;
+    skipped = defrag->skipped;
+
+    elapsed = gf_time() - defrag->start_time;
+
+    /* The rebalance is still in progress */
+
+    if (defrag->defrag_status == GF_DEFRAG_STATUS_STARTED) {
+        time_to_complete = gf_defrag_get_estimates_based_on_size(conf);
+
+        if (time_to_complete && (time_to_complete > elapsed))
+            time_left = time_to_complete - elapsed; /* otherwise stays 0 */
+
+        gf_log(THIS->name, GF_LOG_INFO,
+               "TIME: Estimated total time to complete (size)= %" PRIu64
+               " seconds, seconds left = %" PRIu64 "",
+               time_to_complete, time_left);
+    }
+
+    if (!dict)
+        goto log; /* no dict given: only the log lines are produced */
+
+    ret = dict_set_uint64(dict, "files", files);
+    if (ret)
+        gf_log(THIS->name, GF_LOG_WARNING, "failed to set file count");
+
+    ret = dict_set_uint64(dict, "size", size);
+    if (ret)
+        gf_log(THIS->name, GF_LOG_WARNING, "failed to set size of xfer");
+
+    ret = dict_set_uint64(dict, "lookups", lookup);
+    if (ret)
+        gf_log(THIS->name, GF_LOG_WARNING, "failed to set lookedup file count");
+
+    ret = dict_set_int32(dict, "status", defrag->defrag_status);
+    if (ret)
+        gf_log(THIS->name, GF_LOG_WARNING, "failed to set status");
+
+    ret = dict_set_double(dict, "run-time", elapsed);
+    if (ret)
+        gf_log(THIS->name, GF_LOG_WARNING, "failed to set run-time");
+
+    ret = dict_set_uint64(dict, "failures", failures);
+    if (ret)
+        gf_log(THIS->name, GF_LOG_WARNING, "failed to set failure count");
+
+    ret = dict_set_uint64(dict, "skipped", skipped);
+    if (ret)
+        gf_log(THIS->name, GF_LOG_WARNING, "failed to set skipped file count");
+
+    ret = dict_set_uint64(dict, "time-left", time_left);
+    if (ret)
+        gf_log(THIS->name, GF_LOG_WARNING, "failed to set time-left");
+
+log:
+    switch (defrag->defrag_status) {
+        case GF_DEFRAG_STATUS_NOT_STARTED:
+            status = "not started";
+            break;
+        case GF_DEFRAG_STATUS_STARTED:
+            status = "in progress";
+            break;
+        case GF_DEFRAG_STATUS_STOPPED:
+            status = "stopped";
+            break;
+        case GF_DEFRAG_STATUS_COMPLETE:
+            status = "completed";
+            break;
+        case GF_DEFRAG_STATUS_FAILED:
+            status = "failed";
+            break;
+        default:
+            break;
+    }
+
+    gf_msg(THIS->name, GF_LOG_INFO, 0, DHT_MSG_REBALANCE_STATUS,
+           "Rebalance is %s. Time taken is %.2f secs", status, elapsed);
+    gf_msg(THIS->name, GF_LOG_INFO, 0, DHT_MSG_REBALANCE_STATUS,
+           "Files migrated: %" PRIu64 ", size: %" PRIu64 ", lookups: %" PRIu64
+           ", failures: %" PRIu64
+           ", skipped: "
+           "%" PRIu64,
+           files, size, lookup, failures, skipped);
+out:
+    return 0; /* always 0; dict_set failures above are only logged */
+}
+
+int
+gf_defrag_stop(dht_conf_t *conf, gf_defrag_status_t status, dict_t *output)
+{
+    /* TODO: set a variable 'stop_defrag' here, it should be checked
+       in defrag loop */
+    int ret = -1;
+    gf_defrag_info_t *defrag = conf->defrag;
+    /* Set defrag_status to 'status'; optionally fill 'output' with counters. */
+    GF_ASSERT(defrag);
+
+    if (defrag->defrag_status == GF_DEFRAG_STATUS_NOT_STARTED) {
+        goto out; /* returns -1: the status is left untouched */
+    }
+
+    gf_msg("", GF_LOG_INFO, 0, DHT_MSG_REBALANCE_STOPPED,
+           "Received stop command on rebalance");
+    defrag->defrag_status = status;
+
+    if (output)
+        gf_defrag_status_get(conf, output);
+    ret = 0;
+out:
+    gf_msg_debug("", 0, "Returning %d", ret);
+    return ret;
+}
diff --git a/xlators/cluster/dht/src/dht-rename.c b/xlators/cluster/dht/src/dht-rename.c
index f19c07ae492..d9dbf50492f 100644
--- a/xlators/cluster/dht/src/dht-rename.c
+++ b/xlators/cluster/dht/src/dht-rename.c
@@ -1,562 +1,1997 @@
 /*
-   Copyright (c) 2008-2009 Z RESEARCH, Inc. <http://www.zresearch.com>
-   This file is part of GlusterFS.
-
-   GlusterFS is free software; you can redistribute it and/or modify
-   it under the terms of the GNU General Public License as published
-   by the Free Software Foundation; either version 3 of the License,
-   or (at your option) any later version.
-
-   GlusterFS is distributed in the hope that it will be useful, but
-   WITHOUT ANY WARRANTY; without even the implied warranty of
-   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
-   General Public License for more details.
-
-   You should have received a copy of the GNU General Public License
-   along with this program.  If not, see
-   <http://www.gnu.org/licenses/>.
+  Copyright (c) 2008-2012 Red Hat, Inc. <http://www.redhat.com>
+  This file is part of GlusterFS.
+
+  This file is licensed to you under your choice of the GNU Lesser
+  General Public License, version 3 or any later version (LGPLv3 or
+  later), or the GNU General Public License, version 2 (GPLv2), in all
+  cases as published by the Free Software Foundation.
 */
 
 /* TODO: link(oldpath, newpath) fails if newpath already exists. DHT should
  *       delete the newpath if it gets EEXISTS from link() call.
  */
-#ifndef _CONFIG_H
-#define _CONFIG_H
-#include "config.h"
-#endif
-
-#include "glusterfs.h"
-#include "xlator.h"
 #include "dht-common.h"
-#include "defaults.h"
+#include "dht-lock.h"
+#include <glusterfs/defaults.h>
 
+int
+dht_rename_unlock(call_frame_t *frame, xlator_t *this);
+int32_t
+dht_rename_lock_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+                    int32_t op_ret, int32_t op_errno, dict_t *xdata);
 
 int
-dht_rename_dir_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
-		    int32_t op_ret, int32_t op_errno, struct stat *stbuf)
+dht_rename_unlock_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+                      int32_t op_ret, int32_t op_errno, dict_t *xdata)
 {
-	dht_local_t  *local = NULL;
-	int           this_call_cnt = 0;
-	call_frame_t *prev = NULL;
+    dht_local_t *local = NULL; /* carries the rename result to unwind */
 
+    local = frame->local;
 
-	local = frame->local;
-	prev = cookie;
+    dht_set_fixed_dir_stat(&local->preoldparent);
+    dht_set_fixed_dir_stat(&local->postoldparent);
+    dht_set_fixed_dir_stat(&local->preparent);
+    dht_set_fixed_dir_stat(&local->postparent);
 
-	if (op_ret == -1) {
-		/* TODO: undo the damage */
+    if (IA_ISREG(local->stbuf.ia_type))
+        DHT_STRIP_PHASE1_FLAGS(&local->stbuf);
 
-		gf_log (this->name, GF_LOG_ERROR,
-			"rename %s -> %s on %s failed (%s)",
-			local->loc.path, local->loc2.path,
-			prev->this->name, strerror (op_errno));
+    DHT_STACK_UNWIND(rename, frame, local->op_ret, local->op_errno,
+                     &local->stbuf, &local->preoldparent, &local->postoldparent,
+                     &local->preparent, &local->postparent, local->xattr);
+    return 0; /* the unlock's own op_ret/op_errno are never examined */
+}
 
-		local->op_ret   = op_ret;
-		local->op_errno = op_errno;
-	} else {
-		/* TODO: construct proper stbuf for dir */
-		local->stbuf = *stbuf;
-	}
+static void
+dht_rename_dir_unlock_src(call_frame_t *frame, xlator_t *this)
+{
+    dht_local_t *local = NULL;
 
-	this_call_cnt = dht_frame_return (frame);
-	if (is_last_call (this_call_cnt)) {
-		DHT_STACK_UNWIND (frame, local->op_ret, local->op_errno,
-				  &local->stbuf);
-	}
+    local = frame->local;
+    dht_unlock_namespace(frame, &local->lock[0]); /* presumably the src lock */
+    return; /* no completion callback is passed for this unlock */
+}
 
-	return 0;
+static void
+dht_rename_dir_unlock_dst(call_frame_t *frame, xlator_t *this)
+{
+    dht_local_t *local = NULL;
+    int op_ret = -1;
+    char src_gfid[GF_UUID_BUF_SIZE] = {0};
+    char dst_gfid[GF_UUID_BUF_SIZE] = {0};
+    /* Release lock[1] (entrylk, then inodelk); its cbk unwinds the rename. */
+    local = frame->local;
+
+    /* Unlock entrylk */
+    dht_unlock_entrylk_wrapper(frame, &local->lock[1].ns.directory_ns);
+
+    /* Unlock inodelk */
+    op_ret = dht_unlock_inodelk(frame, local->lock[1].ns.parent_layout.locks,
+                                local->lock[1].ns.parent_layout.lk_count,
+                                dht_rename_unlock_cbk);
+    if (op_ret < 0) {
+        uuid_utoa_r(local->loc.inode->gfid, src_gfid);
+
+        if (local->loc2.inode)
+            uuid_utoa_r(local->loc2.inode->gfid, dst_gfid);
+
+        if (IA_ISREG(local->stbuf.ia_type))
+            gf_msg(this->name, GF_LOG_WARNING, 0, DHT_MSG_UNLOCKING_FAILED,
+                   "winding unlock inodelk failed "
+                   "rename (%s:%s:%s %s:%s:%s), "
+                   "stale locks left on bricks",
+                   local->loc.path, src_gfid, local->src_cached->name,
+                   local->loc2.path, dst_gfid,
+                   local->dst_cached ? local->dst_cached->name : NULL);
+        else
+            gf_msg(this->name, GF_LOG_WARNING, 0, DHT_MSG_UNLOCKING_FAILED,
+                   "winding unlock inodelk failed "
+                   "rename (%s:%s %s:%s), "
+                   "stale locks left on bricks",
+                   local->loc.path, src_gfid, local->loc2.path, dst_gfid);
+        /* Presumably no cbk will arrive on this path, so call it to unwind. */
+        dht_rename_unlock_cbk(frame, NULL, this, 0, 0, NULL);
+    }
+
+    return;
 }
 
+static int
+dht_rename_dir_unlock(call_frame_t *frame, xlator_t *this)
+{
+    dht_rename_dir_unlock_src(frame, this); /* releases lock[0] */
+    dht_rename_dir_unlock_dst(frame, this); /* lock[1]; its cbk unwinds */
+    return 0;
+}
+/* Per-subvolume rename cbk; cookie is the subvolume. On a partial failure the
+ * renames that succeeded are reversed before unlocking and unwinding. */
+int
+dht_rename_dir_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+                   int32_t op_ret, int32_t op_errno, struct iatt *stbuf,
+                   struct iatt *preoldparent, struct iatt *postoldparent,
+                   struct iatt *prenewparent, struct iatt *postnewparent,
+                   dict_t *xdata)
+{
+    dht_conf_t *conf = NULL;
+    dht_local_t *local = NULL;
+    int this_call_cnt = 0;
+    xlator_t *prev = NULL;
+    int i = 0;
+    char gfid[GF_UUID_BUF_SIZE] = {0};
+    int subvol_cnt = -1;
+
+    conf = this->private;
+    local = frame->local;
+    prev = cookie;
+    subvol_cnt = dht_subvol_cnt(this, prev);
+    local->ret_cache[subvol_cnt] = op_ret;
+
+    if (op_ret == -1) {
+        gf_uuid_unparse(local->loc.inode->gfid, gfid);
+
+        gf_msg(this->name, GF_LOG_INFO, op_errno, DHT_MSG_RENAME_FAILED,
+               "Rename %s -> %s on %s failed, (gfid = %s)", local->loc.path,
+               local->loc2.path, prev->name, gfid);
+
+        local->op_ret = op_ret;
+        local->op_errno = op_errno;
+        goto unwind;
+    }
+    /* TODO: construct proper stbuf for dir.
+     * FIXME: is this the correct way to build stbuf and parent bufs? */
+    dht_iatt_merge(this, &local->stbuf, stbuf);
+    dht_iatt_merge(this, &local->preoldparent, preoldparent);
+    dht_iatt_merge(this, &local->postoldparent, postoldparent);
+    dht_iatt_merge(this, &local->preparent, prenewparent);
+    dht_iatt_merge(this, &local->postparent, postnewparent);
 
+unwind:
+    this_call_cnt = dht_frame_return(frame);
+    if (is_last_call(this_call_cnt)) {
+        /* We get here with local->call_cnt == 0. Which means
+         * we are the only one executing this code, there is
+         * no contention. Therefore it's safe to manipulate or
+         * deref local->call_cnt directly (without locking).
+         */
+        if (local->ret_cache[conf->subvolume_cnt] == 0) {
+            /* count errant subvols in last field of ret_cache */
+            for (i = 0; i < conf->subvolume_cnt; i++) {
+                if (local->ret_cache[i] != 0)
+                    ++local->ret_cache[conf->subvolume_cnt];
+            }
+            if (local->ret_cache[conf->subvolume_cnt]) {
+                /* undoing the damage: for all subvolumes where rename
+                 * succeeded, we perform the reverse operation. The cookie
+                 * must be the subvolume: this cbk indexes ret_cache by it.
+                 */
+                for (i = 0; i < conf->subvolume_cnt; i++) {
+                    if (local->ret_cache[i] == 0)
+                        ++local->call_cnt;
+                }
+                for (i = 0; i < conf->subvolume_cnt; i++) {
+                    if (local->ret_cache[i])
+                        continue;
+
+                    STACK_WIND_COOKIE(
+                        frame, dht_rename_dir_cbk, conf->subvolumes[i],
+                        conf->subvolumes[i], conf->subvolumes[i]->fops->rename,
+                        &local->loc2, &local->loc, NULL);
+                }
+
+                return 0;
+            }
+        }
+
+        WIPE(&local->preoldparent);
+        WIPE(&local->postoldparent);
+        WIPE(&local->preparent);
+        WIPE(&local->postparent);
+
+        dht_rename_dir_unlock(frame, this);
+    }
+
+    return 0;
+}
 
 int
-dht_rename_dir_do (call_frame_t *frame, xlator_t *this)
+dht_rename_hashed_dir_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+                          int32_t op_ret, int32_t op_errno, struct iatt *stbuf,
+                          struct iatt *preoldparent, struct iatt *postoldparent,
+                          struct iatt *prenewparent, struct iatt *postnewparent,
+                          dict_t *xdata)
 {
-	dht_local_t  *local = NULL;
-	dht_conf_t   *conf = NULL;
-	int           i = 0;
+    dht_conf_t *conf = NULL;
+    dht_local_t *local = NULL;
+    int call_cnt = 0;
+    xlator_t *prev = NULL;
+    int i = 0;
+    char gfid[GF_UUID_BUF_SIZE] = {0};
+    /* Rename on dst_hashed is done: repeat it on all the other subvolumes. */
+    conf = this->private;
+    local = frame->local;
+    prev = cookie;
+
+    if (op_ret == -1) {
+        gf_uuid_unparse(local->loc.inode->gfid, gfid);
+
+        gf_msg(this->name, GF_LOG_INFO, op_errno, DHT_MSG_RENAME_FAILED,
+               "rename %s -> %s on %s failed, (gfid = %s) ", local->loc.path,
+               local->loc2.path, prev->name, gfid);
+
+        local->op_ret = op_ret;
+        local->op_errno = op_errno;
+        goto unwind;
+    }
+    /* TODO: construct proper stbuf for dir */
+    /*
+     * FIXME: is this the correct way to build stbuf and
+     * parent bufs?
+     */
+    dht_iatt_merge(this, &local->stbuf, stbuf);
+    dht_iatt_merge(this, &local->preoldparent, preoldparent);
+    dht_iatt_merge(this, &local->postoldparent, postoldparent);
+    dht_iatt_merge(this, &local->preparent, prenewparent);
+    dht_iatt_merge(this, &local->postparent, postnewparent);
+
+    call_cnt = local->call_cnt = conf->subvolume_cnt - 1;
+    /* Only one subvolume: nothing else to rename, so unlock and unwind. */
+    if (!local->call_cnt)
+        goto unwind;
+
+    for (i = 0; i < conf->subvolume_cnt; i++) {
+        if (conf->subvolumes[i] == local->dst_hashed)
+            continue;
+        STACK_WIND_COOKIE(
+            frame, dht_rename_dir_cbk, conf->subvolumes[i], conf->subvolumes[i],
+            conf->subvolumes[i]->fops->rename, &local->loc, &local->loc2, NULL);
+        if (!--call_cnt)
+            break;
+    }
+
+    return 0;
+unwind:
+    WIPE(&local->preoldparent);
+    WIPE(&local->postoldparent);
+    WIPE(&local->preparent);
+    WIPE(&local->postparent);
 
-	conf = this->private;
-	local = frame->local;
+    dht_rename_dir_unlock(frame, this);
+    return 0;
+}
 
-	if (local->op_ret == -1)
-		goto err;
+int
+dht_rename_dir_do(call_frame_t *frame, xlator_t *this)
+{
+    dht_local_t *local = NULL;
 
-	local->call_cnt = conf->subvolume_cnt;
-	local->op_ret = 0;
+    local = frame->local;
 
-	for (i = 0; i < conf->subvolume_cnt; i++) {
-		STACK_WIND (frame, dht_rename_dir_cbk,
-			    conf->subvolumes[i],
-			    conf->subvolumes[i]->fops->rename,
-			    &local->loc, &local->loc2);
-	}
+    if (local->op_ret == -1)
+        goto err;
 
-	return 0;
+    local->op_ret = 0;
+    /* Rename on dst_hashed first; its cbk winds to the remaining subvols. */
+    STACK_WIND_COOKIE(frame, dht_rename_hashed_dir_cbk, local->dst_hashed,
+                      local->dst_hashed, local->dst_hashed->fops->rename,
+                      &local->loc, &local->loc2, NULL);
+    return 0;
 
 err:
-	DHT_STACK_UNWIND (frame, local->op_ret, local->op_errno);
-	return 0;
+    dht_rename_dir_unlock(frame, this); /* the unlock path also unwinds */
+    return 0;
 }
 
-
 int
-dht_rename_readdir_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
-			int op_ret, int op_errno, gf_dirent_t *entries)
+dht_rename_readdir_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+                       int op_ret, int op_errno, gf_dirent_t *entries,
+                       dict_t *xdata)
 {
-	dht_local_t  *local = NULL;
-	int           this_call_cnt = -1;
-	call_frame_t *prev = NULL;
+    dht_local_t *local = NULL;
+    int this_call_cnt = -1;
+    xlator_t *prev = NULL; /* subvolume passed as cookie; used for logging */
 
-	local = frame->local;
-	prev  = cookie;
+    local = frame->local;
+    prev = cookie;
 
-	if (op_ret > 2) {
-		gf_log (this->name, GF_LOG_DEBUG,
-			"readdir on %s for %s returned %d entries",
-			prev->this->name, local->loc.path, op_ret);
-		local->op_ret = -1;
-		local->op_errno = ENOTEMPTY;
-	}
+    if (op_ret > 2) { /* more than two entries: presumably beyond "." and ".." */
+        gf_msg_trace(this->name, 0, "readdir on %s for %s returned %d entries",
+                     prev->name, local->loc.path, op_ret);
+        local->op_ret = -1;
+        local->op_errno = ENOTEMPTY;
+    }
 
-	this_call_cnt = dht_frame_return (frame);
+    this_call_cnt = dht_frame_return(frame);
 
-	if (is_last_call (this_call_cnt)) {
-		dht_rename_dir_do (frame, this);
-	}
+    if (is_last_call(this_call_cnt)) {
+        dht_rename_dir_do(frame, this); /* goes to unlock if op_ret is -1 */
+    }
 
-	return 0;
+    return 0;
 }
 
-
 int
-dht_rename_opendir_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
-			int op_ret, int op_errno, fd_t *fd)
+dht_rename_opendir_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+                       int op_ret, int op_errno, fd_t *fd, dict_t *xdata)
 {
-	dht_local_t  *local = NULL;
-	int           this_call_cnt = -1;
-	call_frame_t *prev = NULL;
-
+    dht_local_t *local = NULL;
+    int this_call_cnt = -1;
+    xlator_t *prev = NULL; /* subvolume passed as cookie */
+    char gfid[GF_UUID_BUF_SIZE] = {0};
 
-	local = frame->local;
-	prev  = cookie;
+    local = frame->local;
+    prev = cookie;
 
-	if (op_ret == -1) {
-		gf_log (this->name, GF_LOG_ERROR,
-			"opendir on %s for %s failed (%s)",
-			prev->this->name, local->loc.path,
-			strerror (op_errno));
-		goto err;
-	}
+    if (op_ret == -1) {
+        gf_uuid_unparse(local->loc.inode->gfid, gfid);
+        gf_msg(this->name, GF_LOG_INFO, op_errno, DHT_MSG_OPENDIR_FAILED,
+               "opendir on %s for %s failed,(gfid = %s) ", prev->name,
+               local->loc.path, gfid);
+        goto err; /* only logged: local->op_ret is left unchanged */
+    }
 
-	STACK_WIND (frame, dht_rename_readdir_cbk,
-		    prev->this, prev->this->fops->readdir,
-		    local->fd, 4096, 0);
+    fd_bind(fd);
+    STACK_WIND_COOKIE(frame, dht_rename_readdir_cbk, prev, prev,
+                      prev->fops->readdir, local->fd, 4096, 0, NULL);
 
-	return 0;
+    return 0; /* the readdir cbk accounts for this subvolume's call */
 
 err:
-	this_call_cnt = dht_frame_return (frame);
+    this_call_cnt = dht_frame_return(frame);
 
-	if (is_last_call (this_call_cnt)) {
-		dht_rename_dir_do (frame, this);
-	}
+    if (is_last_call(this_call_cnt)) {
+        dht_rename_dir_do(frame, this);
+    }
 
-	return 0;
+    return 0;
 }
 
+int
+dht_rename_dir_lock2_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+                         int32_t op_ret, int32_t op_errno, dict_t *xdata)
+{
+    dht_local_t *local = NULL;
+    char src_gfid[GF_UUID_BUF_SIZE] = {0};
+    char dst_gfid[GF_UUID_BUF_SIZE] = {0};
+    dht_conf_t *conf = NULL;
+    int i = 0;
+    /* Second namespace lock done: opendir an existing dst, else rename now. */
+    local = frame->local;
+    conf = this->private;
+
+    if (op_ret < 0) {
+        uuid_utoa_r(local->loc.inode->gfid, src_gfid);
+        if (local->loc2.inode)
+            uuid_utoa_r(local->loc2.inode->gfid, dst_gfid);
+
+        gf_msg(this->name, GF_LOG_WARNING, op_errno, DHT_MSG_INODE_LK_ERROR,
+               "acquiring entrylk after inodelk failed "
+               "rename (%s:%s:%s %s:%s:%s)",
+               local->loc.path, src_gfid, local->src_cached->name,
+               local->loc2.path, dst_gfid,
+               local->dst_cached ? local->dst_cached->name : NULL);
+
+        local->op_ret = -1;
+        local->op_errno = op_errno;
+        goto err;
+    }
+
+    local->fd = fd_create(local->loc.inode, frame->root->pid);
+    if (!local->fd) {
+        /* The unwind reads local, so the error must be recorded there. */
+        local->op_ret = -1;
+        local->op_errno = ENOMEM;
+        goto err;
+    }
+    local->op_ret = 0;
+
+    if (!local->dst_cached) {
+        dht_rename_dir_do(frame, this);
+        return 0;
+    }
+
+    for (i = 0; i < conf->subvolume_cnt; i++) {
+        STACK_WIND_COOKIE(frame, dht_rename_opendir_cbk, conf->subvolumes[i],
+                          conf->subvolumes[i],
+                          conf->subvolumes[i]->fops->opendir, &local->loc2,
+                          local->fd, NULL);
+    }
+
+    return 0;
+
+err:
+    /* No harm in calling an extra unlock */
+    dht_rename_dir_unlock(frame, this);
+    return 0;
+}
 
 int
-dht_rename_dir (call_frame_t *frame, xlator_t *this)
+dht_rename_dir_lock1_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+                         int32_t op_ret, int32_t op_errno, dict_t *xdata)
 {
-	dht_conf_t  *conf = NULL;
-	dht_local_t *local = NULL;
-	int          i = 0;
-	int          op_errno = -1;
+    dht_local_t *local = NULL;
+    char src_gfid[GF_UUID_BUF_SIZE] = {0};
+    char dst_gfid[GF_UUID_BUF_SIZE] = {0};
+    int ret = 0;
+    loc_t *loc = NULL;
+    xlator_t *subvol = NULL;
+    /* First namespace lock done: take the other one, then go to lock2_cbk. */
+    local = frame->local;
+
+    if (op_ret < 0) {
+        uuid_utoa_r(local->loc.inode->gfid, src_gfid);
+        if (local->loc2.inode)
+            uuid_utoa_r(local->loc2.inode->gfid, dst_gfid);
+
+        gf_msg(this->name, GF_LOG_WARNING, op_errno, DHT_MSG_INODE_LK_ERROR,
+               "acquiring entrylk after inodelk failed "
+               "rename (%s:%s:%s %s:%s:%s)",
+               local->loc.path, src_gfid, local->src_cached->name,
+               local->loc2.path, dst_gfid,
+               local->dst_cached ? local->dst_cached->name : NULL);
+
+        local->op_ret = -1;
+        local->op_errno = op_errno;
+        goto err;
+    }
+
+    if (local->current == &local->lock[0]) {
+        loc = &local->loc2;
+        subvol = local->dst_hashed;
+        local->current = &local->lock[1];
+    } else {
+        loc = &local->loc;
+        subvol = local->src_hashed;
+        local->current = &local->lock[0];
+    }
+    ret = dht_protect_namespace(frame, loc, subvol, &local->current->ns,
+                                dht_rename_dir_lock2_cbk);
+    if (ret < 0) {
+        local->op_ret = -1; /* the unwind reads local, not the parameter */
+        local->op_errno = EINVAL;
+        goto err;
+    }
+
+    return 0;
+err:
+    /* No harm in calling an extra unlock */
+    dht_rename_dir_unlock(frame, this);
+    return 0;
+}
 
+/*
+ * Pick which of the two rename locks (src or dst) is taken first and
+ * point local->current at it.
+ *
+ * If the hashed subvolumes of source and dst are different,
+ * lock in dictionary order of hashed subvol->name. This is important
+ * in case the parent directory is the same for both src and dst to
+ * prevent inodelk deadlocks when racing with a fix-layout op on the parent.
+ *
+ * If the hashed subvols are the same, use the gfid/name to determine
+ * the order of taking locks to prevent entrylk deadlocks when the parent
+ * dirs are the same.
+ *
+ * frame:  call frame whose local is the dht_local_t of the rename
+ * loc:    out; set to &local->loc (src) or &local->loc2 (dst)
+ * subvol: out; set to the hashed subvolume that goes with *loc
+ *
+ * local->lock[0] is used for src and local->lock[1] for dst.
+ * Always returns 0: alloca() has no error return, so there is no failure
+ * path to report.
+ */
+static int
+dht_order_rename_lock(call_frame_t *frame, loc_t **loc, xlator_t **subvol)
+{
+    int ret = 0;
+    dht_local_t *local = NULL;
+    char *src = NULL;
+    char *dst = NULL;
+
+    local = frame->local;
+
+    /* Identical name pointers compare equal without calling strcmp(). */
+    if (local->src_hashed->name == local->dst_hashed->name) {
+        ret = 0;
+    } else {
+        ret = strcmp(local->src_hashed->name, local->dst_hashed->name);
+    }
+
+    if (ret == 0) {
+        /* hashed subvols are the same for src and dst */
+        /* Entrylks need to be ordered */
+
+        /* Build the src key "<parent gfid><basename>" on the stack.
+         * alloca() never reports failure, so there is nothing to check.
+         * Without a parent gfid the key is just the basename. */
+        src = alloca(GF_UUID_BNAME_BUF_SIZE + strlen(local->loc.name) + 1);
+
+        if (!gf_uuid_is_null(local->loc.pargfid))
+            uuid_utoa_r(local->loc.pargfid, src);
+        else if (local->loc.parent)
+            uuid_utoa_r(local->loc.parent->gfid, src);
+        else
+            src[0] = '\0';
+
+        strcat(src, local->loc.name);
+
+        /* Same construction for the dst key. */
+        dst = alloca(GF_UUID_BNAME_BUF_SIZE + strlen(local->loc2.name) + 1);
+
+        if (!gf_uuid_is_null(local->loc2.pargfid))
+            uuid_utoa_r(local->loc2.pargfid, dst);
+        else if (local->loc2.parent)
+            uuid_utoa_r(local->loc2.parent->gfid, dst);
+        else
+            dst[0] = '\0';
+
+        strcat(dst, local->loc2.name);
+        ret = strcmp(src, dst);
+    }
+
+    /* ret == 0 (identical keys) falls on the src side. */
+    if (ret <= 0) {
+        /*inodelk in dictionary order of hashed subvol names*/
+        /*entrylk in dictionary order of gfid/basename */
+        local->current = &local->lock[0];
+        *loc = &local->loc;
+        *subvol = local->src_hashed;
+
+    } else {
+        local->current = &local->lock[1];
+        *loc = &local->loc2;
+        *subvol = local->dst_hashed;
+    }
+
+    return 0;
+}
 
-	conf = frame->this->private;
-	local = frame->local;
+/* Start a directory rename: unwind with ENOTCONN if any subvolume is down,
+ * else take the first ordered namespace lock (dht_rename_dir_lock1_cbk). */
+int
+dht_rename_dir(call_frame_t *frame, xlator_t *this)
+{
+    dht_conf_t *conf = NULL;
+    dht_local_t *local = NULL;
+    loc_t *loc = NULL;
+    xlator_t *subvol = NULL;
+    int i = 0;
+    int ret = 0;
+    int op_errno = -1;
+
+    conf = frame->this->private;
+    local = frame->local;
+
+    local->ret_cache = GF_CALLOC(conf->subvolume_cnt + 1, sizeof(int),
+                                 gf_dht_ret_cache_t);
+    if (local->ret_cache == NULL) {
+        op_errno = ENOMEM;
+        goto err;
+    }
+
+    local->call_cnt = conf->subvolume_cnt;
+    for (i = 0; i < conf->subvolume_cnt; i++) {
+        if (!conf->subvolume_status[i]) {
+            gf_msg(this->name, GF_LOG_INFO, 0, DHT_MSG_RENAME_FAILED,
+                   "Rename dir failed: subvolume down (%s)",
+                   conf->subvolumes[i]->name);
+            op_errno = ENOTCONN;
+            goto err;
+        }
+    }
+
+    /* Locks on src and dst need to be ordered; otherwise deadlocks are
+     * possible when rename (src, dst) and rename (dst, src) are done from
+     * two different clients
+     */
+    ret = dht_order_rename_lock(frame, &loc, &subvol);
+    if (ret) {
+        op_errno = ENOMEM;
+        goto err;
+    }
+
+    /* Rename must take locks on src to avoid lookup selfheal from
+     * recreating src on those subvols where the rename was successful.
+     * The locks can't be issued in parallel as two different clients might
+     * attempt the same rename command and end up in a deadlock.
+     */
+    ret = dht_protect_namespace(frame, loc, subvol, &local->current->ns,
+                                dht_rename_dir_lock1_cbk);
+    if (ret < 0) {
+        op_errno = EINVAL;
+        goto err;
+    }
+
+    return 0;
 
-	local->call_cnt = conf->subvolume_cnt;
+err:
+    DHT_STACK_UNWIND(rename, frame, -1, op_errno, NULL, NULL, NULL, NULL, NULL,
+                     NULL);
+    return 0;
+}
+
+/* Pack the parent gfids and basenames of a rename into xattr under
+ * DHT_CHANGELOG_RENAME_OP_KEY; 0 on success, -1 or dict error otherwise. */
+static int
+dht_rename_track_for_changelog(xlator_t *this, dict_t *xattr, loc_t *oldloc,
+                               loc_t *newloc)
+{
+    int ret = -1;
+    dht_changelog_rename_info_t *info = NULL;
+    int len1 = 0;
+    int len2 = 0;
+    int size = 0;
+
+    if (!xattr || !oldloc || !newloc || !this || !oldloc->name ||
+        !newloc->name) /* strlen(NULL) below would crash */
+        return ret;
+
+    len1 = strlen(oldloc->name) + 1;
+    len2 = strlen(newloc->name) + 1;
+    size = sizeof(dht_changelog_rename_info_t) + len1 + len2;
+
+    info = GF_CALLOC(size, sizeof(char), gf_common_mt_char);
+    if (!info) {
+        gf_msg(this->name, GF_LOG_ERROR, ENOMEM, DHT_MSG_DICT_SET_FAILED,
+               "Failed to calloc memory");
+        return ret;
+    }
+
+    gf_uuid_copy(info->old_pargfid, oldloc->pargfid);
+    gf_uuid_copy(info->new_pargfid, newloc->pargfid);
+    info->oldname_len = len1;
+    info->newname_len = len2;
+    strncpy(info->buffer, oldloc->name, len1);
+    strncpy(info->buffer + len1, newloc->name, len2);
+
+    ret = dict_set_bin(xattr, DHT_CHANGELOG_RENAME_OP_KEY, info, size);
+    if (ret) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, DHT_MSG_DICT_SET_FAILED,
+               "Failed to set dictionary value: key = %s,"
+               " path = %s",
+               DHT_CHANGELOG_RENAME_OP_KEY, oldloc->name);
+        GF_FREE(info);
+    }
+
+    return ret;
+}
 
-	local->fd = fd_create (local->loc.inode, frame->root->pid);
-	if (!local->fd) {
-		gf_log (this->name, GF_LOG_ERROR,
-			"memory allocation failed :(");
-		op_errno = ENOMEM;
-		goto err;
-	}
+#define DHT_MARKER_DONT_ACCOUNT(xattr)                                         \
+    do { /* needs this and local in the expanding scope */                     \
+        int tmp = -1;                                                          \
+        if (!xattr) { /* allocate the dict on first use */                     \
+            xattr = dict_new();                                                \
+            if (!xattr)                                                        \
+                break; /* dict_new() failed: key not set, nothing logged */    \
+        }                                                                      \
+        tmp = dict_set_str(xattr, GLUSTERFS_MARKER_DONT_ACCOUNT_KEY, "yes");   \
+        if (tmp) {                                                             \
+            gf_msg(this->name, GF_LOG_ERROR, 0, DHT_MSG_DICT_SET_FAILED,       \
+                   "Failed to set dictionary value: key = %s,"                 \
+                   " path = %s",                                               \
+                   GLUSTERFS_MARKER_DONT_ACCOUNT_KEY, local->loc.path);        \
+        }                                                                      \
+    } while (0)
+
+#define DHT_CHANGELOG_TRACK_AS_RENAME(xattr, oldloc, newloc)                   \
+    do { /* needs this in the expanding scope */                               \
+        int tmp = -1;                                                          \
+        if (!xattr) {                                                          \
+            xattr = dict_new();                                                \
+            if (!xattr) {                                                      \
+                gf_msg(this->name, GF_LOG_ERROR, 0, DHT_MSG_DICT_SET_FAILED,   \
+                       "Failed to create dictionary to "                       \
+                       "track rename");                                        \
+                break;                                                         \
+            }                                                                  \
+        }                                                                      \
+        /* packs the rename under DHT_CHANGELOG_RENAME_OP_KEY */               \
+        tmp = dht_rename_track_for_changelog(this, xattr, oldloc, newloc);     \
+        /* a failure is only logged; no status is returned */                  \
+        if (tmp) {                                                             \
+            gf_msg(this->name, GF_LOG_ERROR, 0, DHT_MSG_DICT_SET_FAILED,       \
+                   "Failed to set dictionary value: key = %s,"                 \
+                   " path = %s",                                               \
+                   DHT_CHANGELOG_RENAME_OP_KEY, (oldloc)->path);               \
+        }                                                                      \
+    } while (0)
 
-	local->op_ret = 0;
+/* Drop the inodelks and both namespace locks held for a rename, then call
+ * dht_rename_unlock_cbk with local->op_ret/op_errno. Always returns 0. */
+int
+dht_rename_unlock(call_frame_t *frame, xlator_t *this)
+{
+    dht_local_t *local = NULL;
+    int op_ret = -1;
+    char src_gfid[GF_UUID_BUF_SIZE] = {0};
+    char dst_gfid[GF_UUID_BUF_SIZE] = {0};
+    dht_ilock_wrap_t inodelk_wrapper = {0};
+
+    local = frame->local;
+    inodelk_wrapper.locks = local->rename_inodelk_backward_compatible;
+    inodelk_wrapper.lk_count = local->rename_inodelk_bc_count;
+
+    op_ret = dht_unlock_inodelk_wrapper(frame, &inodelk_wrapper);
+    if (op_ret < 0) {
+        uuid_utoa_r(local->loc.inode->gfid, src_gfid);
+
+        if (local->loc2.inode)
+            uuid_utoa_r(local->loc2.inode->gfid, dst_gfid);
+        /* %s must never be handed NULL, hence "(null)" below. */
+        if (IA_ISREG(local->stbuf.ia_type))
+            gf_msg(this->name, GF_LOG_WARNING, 0, DHT_MSG_UNLOCKING_FAILED,
+                   "winding unlock inodelk failed "
+                   "rename (%s:%s:%s %s:%s:%s), "
+                   "stale locks left on bricks",
+                   local->loc.path, src_gfid, local->src_cached->name,
+                   local->loc2.path, dst_gfid,
+                   local->dst_cached ? local->dst_cached->name : "(null)");
+        else
+            gf_msg(this->name, GF_LOG_WARNING, 0, DHT_MSG_UNLOCKING_FAILED,
+                   "winding unlock inodelk failed "
+                   "rename (%s:%s %s:%s), "
+                   "stale locks left on bricks",
+                   local->loc.path, src_gfid, local->loc2.path, dst_gfid);
+    }
+    /* Logged only; the namespace locks are released regardless. */
+    dht_unlock_namespace(frame, &local->lock[0]);
+    dht_unlock_namespace(frame, &local->lock[1]);
+
+    dht_rename_unlock_cbk(frame, NULL, this, local->op_ret, local->op_errno,
+                          NULL);
+    return 0;
+}
 
-	if (!local->dst_cached) {
-		dht_rename_dir_do (frame, this);
-		return 0;
-	}
+int
+dht_rename_done(call_frame_t *frame, xlator_t *this) /* heal, then unlock */
+{
+    dht_local_t *local = NULL;
 
-	for (i = 0; i < conf->subvolume_cnt; i++) {
-		STACK_WIND (frame, dht_rename_opendir_cbk,
-			    conf->subvolumes[i],
-			    conf->subvolumes[i]->fops->opendir,
-			    &local->loc2, local->fd);
-	}
+    local = frame->local;
 
-	return 0;
+    if (local->linked == _gf_true) {
+        local->linked = _gf_false; /* consume the flag before healing */
+        dht_linkfile_attr_heal(frame, this);
+    }
 
-err:
-	op_errno = (op_errno == -1) ? errno : op_errno;
-	DHT_STACK_UNWIND (frame, -1, op_errno, NULL);
-	return 0;
+    dht_rename_unlock(frame, this); /* runs with or without a heal */
+    return 0;
 }
 
+/* Unlink callback shared by dht_rename_unlink() and dht_rename_cleanup():
+ * logs a failure and, on the last reply, calls dht_rename_done(). */
+int
+dht_rename_unlink_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+                      int32_t op_ret, int32_t op_errno, struct iatt *preparent,
+                      struct iatt *postparent, dict_t *xdata)
+{
+    dht_local_t *local = frame->local;
+    xlator_t *prev = cookie;
+    int this_call_cnt = 0;
+
+    /* Test local before FRAME_SU_UNDO(): the macro is given the local's
+     * type, so it presumably reads frame->local itself. */
+    if (!local) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, DHT_MSG_INVALID_VALUE,
+               "!local, should not happen");
+        goto out;
+    }
+    FRAME_SU_UNDO(frame, dht_local_t);
+
+    this_call_cnt = dht_frame_return(frame);
+    if (op_ret == -1) {
+        gf_msg(this->name, GF_LOG_WARNING, op_errno, DHT_MSG_UNLINK_FAILED,
+               "%s: Rename: unlink on %s failed ", local->loc.path, prev->name);
+    }
+
+    WIPE(&local->preoldparent);
+    WIPE(&local->postoldparent);
+    WIPE(&local->preparent);
+    WIPE(&local->postparent);
+
+    if (is_last_call(this_call_cnt)) {
+        dht_rename_done(frame, this);
+    }
+
+out:
+    return 0;
+}
 
+/* Undo a partially done rename: unlink the linkfile on dst_hashed (if
+ * local->linked) and the new hard link on src_cached (if local->added_link);
+ * dht_rename_unlink_cbk continues, or unlock directly if nothing to undo. */
 int
-dht_rename_unlink_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
-		       int32_t op_ret, int32_t op_errno)
+dht_rename_cleanup(call_frame_t *frame)
 {
-	dht_local_t  *local = NULL;
-	call_frame_t *prev = NULL;
-	int           this_call_cnt = 0;
+    dht_local_t *local = NULL;
+    xlator_t *this = NULL;
+    xlator_t *src_hashed = NULL;
+    xlator_t *src_cached = NULL;
+    xlator_t *dst_hashed = NULL;
+    xlator_t *dst_cached = NULL;
+    int call_cnt = 0;
+    dict_t *xattr = NULL;
+    char gfid[GF_UUID_BUF_SIZE] = {0};
 
-	local = frame->local;
-	prev  = cookie;
+    local = frame->local;
+    this = frame->this;
 
-	this_call_cnt = dht_frame_return (frame);
+    src_hashed = local->src_hashed;
+    src_cached = local->src_cached;
+    dst_hashed = local->dst_hashed;
+    dst_cached = local->dst_cached;
 
-	if (op_ret == -1) {
-		gf_log (this->name, GF_LOG_WARNING,
-			"unlink on %s failed (%s)",
-			prev->this->name, strerror (op_errno));
-	}
+    if (src_cached == dst_cached)
+        goto nolinks;
 
-	if (is_last_call (this_call_cnt))
-		DHT_STACK_UNWIND (frame, local->op_ret, local->op_errno,
-				  &local->stbuf);
+    if (local->linked && (dst_hashed != src_hashed) &&
+        (dst_hashed != src_cached)) {
+        call_cnt++;
+    }
 
-	return 0;
-}
+    if (local->added_link && (src_cached != dst_hashed)) {
+        call_cnt++;
+    }
+    /* call_cnt is final before any unlink is wound. */
+    local->call_cnt = call_cnt;
+
+    if (!call_cnt)
+        goto nolinks;
+
+    DHT_MARK_FOP_INTERNAL(xattr);
+    gf_uuid_unparse(local->loc.inode->gfid, gfid);
+
+    if (local->linked && (dst_hashed != src_hashed) &&
+        (dst_hashed != src_cached)) {
+        dict_t *xattr_new = NULL;
+
+        gf_msg_trace(this->name, 0,
+                     "unlinking linkfile %s @ %s => %s, (gfid = %s)",
+                     local->loc.path, dst_hashed->name, src_cached->name, gfid);
+        xattr_new = dict_copy_with_ref(xattr, NULL);
+
+        DHT_MARKER_DONT_ACCOUNT(xattr_new);
 
+        FRAME_SU_DO(frame, dht_local_t);
+        STACK_WIND_COOKIE(frame, dht_rename_unlink_cbk, dst_hashed, dst_hashed,
+                          dst_hashed->fops->unlink, &local->loc, 0, xattr_new);
+
+        dict_unref(xattr_new);
+        xattr_new = NULL;
+    }
+
+    if (local->added_link && (src_cached != dst_hashed)) {
+        dict_t *xattr_new = NULL;
+
+        gf_msg_trace(this->name, 0, "unlinking link %s => %s (%s), (gfid = %s)",
+                     local->loc.path, local->loc2.path, src_cached->name, gfid);
+        xattr_new = dict_copy_with_ref(xattr, NULL);
+
+        if (gf_uuid_compare(local->loc.pargfid, local->loc2.pargfid) == 0) {
+            DHT_MARKER_DONT_ACCOUNT(xattr_new);
+        }
+        /*
+         * The link to the file was created with root permission, so
+         * the unlink has to run as root as well; otherwise it will
+         * fail.
+         */
+        FRAME_SU_DO(frame, dht_local_t);
+        STACK_WIND_COOKIE(frame, dht_rename_unlink_cbk, src_cached, src_cached,
+                          src_cached->fops->unlink, &local->loc2, 0, xattr_new);
+
+        dict_unref(xattr_new);
+        xattr_new = NULL;
+    }
+
+    if (xattr)
+        dict_unref(xattr);
+
+    return 0;
+
+nolinks:
+    WIPE(&local->preoldparent);
+    WIPE(&local->postoldparent);
+    WIPE(&local->preparent);
+    WIPE(&local->postparent);
+
+    dht_rename_unlock(frame, this);
+    return 0;
+}
 
 int
-dht_rename_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
-		int32_t op_ret, int32_t op_errno, struct stat *stbuf)
-{
-	dht_local_t  *local = NULL;
-	call_frame_t *prev = NULL;
-	xlator_t     *src_hashed = NULL;
-	xlator_t     *src_cached = NULL;
-	xlator_t     *dst_hashed = NULL;
-	xlator_t     *dst_cached = NULL;
-	xlator_t     *rename_subvol = NULL;
-
-	local = frame->local;
-	prev = cookie;
-
-	src_hashed = local->src_hashed;
-	src_cached = local->src_cached;
-	dst_hashed = local->dst_hashed;
-	dst_cached = local->dst_cached;
-
-	if (op_ret == -1) {
-		gf_log (this->name, GF_LOG_DEBUG,
-			"rename on %s failed (%s)", prev->this->name,
-			strerror (op_errno));
-		local->op_ret   = op_ret;
-		local->op_errno = op_errno;
-		goto unwind;
-	}
-	
-	/* NOTE: rename_subvol is the same subvolume from which dht_rename_cbk
-	 *       is called. since rename has already happened on rename_subvol,
-	 *       unlink should not be sent for oldpath (either linkfile or cached-file)
-	 *       on rename_subvol. */
-	if (src_cached == dst_cached)
-		rename_subvol = src_cached;
-	else
-		rename_subvol = dst_hashed;
-
-	/* TODO: delete files in background */
-
-	if (src_cached != dst_hashed && src_cached != dst_cached)
-		local->call_cnt++;
-
-	if (src_hashed != rename_subvol && src_hashed != src_cached)
-		local->call_cnt++;
-
-	if (dst_cached && dst_cached != dst_hashed && dst_cached != src_cached)
-		local->call_cnt++;
-
-	if (local->call_cnt == 0)
-		goto unwind;
-
-	if (src_cached != dst_hashed && src_cached != dst_cached) {
-		gf_log (this->name, GF_LOG_DEBUG,
-			"deleting old src datafile %s @ %s",
-			local->loc.path, src_cached->name);
-
-		STACK_WIND (frame, dht_rename_unlink_cbk,
-			    src_cached, src_cached->fops->unlink,
-			    &local->loc);
-	}
-
-	if (src_hashed != rename_subvol && src_hashed != src_cached) {
-		gf_log (this->name, GF_LOG_DEBUG,
-			"deleting old src linkfile %s @ %s",
-			local->loc.path, src_hashed->name);
-
-		STACK_WIND (frame, dht_rename_unlink_cbk,
-			    src_hashed, src_hashed->fops->unlink,
-			    &local->loc);
-	}
-
-	if (dst_cached
-	    && (dst_cached != dst_hashed)
-	    && (dst_cached != src_cached)) {
-		gf_log (this->name, GF_LOG_DEBUG,
-			"deleting old dst datafile %s @ %s",
-			local->loc2.path, dst_cached->name);
-
-		STACK_WIND (frame, dht_rename_unlink_cbk,
-			    dst_cached, dst_cached->fops->unlink,
-			    &local->loc2);
-	}
-	return 0;
+dht_rename_unlink(call_frame_t *frame, xlator_t *this)
+{
+    /* Wind up to three unlinks after the rename fop (old src data file, old
+     * src linkfile, old dst data file); with nothing to unlink go straight
+     * to dht_rename_done(). Always returns 0. */
+    dht_local_t *local = NULL;
+    xlator_t *src_hashed = NULL;
+    xlator_t *src_cached = NULL;
+    xlator_t *dst_hashed = NULL;
+    xlator_t *dst_cached = NULL;
+    xlator_t *rename_subvol = NULL;
+    dict_t *xattr = NULL;
+
+    local = frame->local;
+    src_hashed = local->src_hashed;
+    src_cached = local->src_cached;
+    dst_hashed = local->dst_hashed;
+    dst_cached = local->dst_cached;
+
+    local->call_cnt = 0;
+
+    /* NOTE: rename_subvol is the same subvolume from which dht_rename_cbk
+     * is called. since rename has already happened on rename_subvol,
+     * unlink shouldn't be sent for oldpath (either linkfile or cached-file)
+     * on rename_subvol. */
+    if (src_cached == dst_cached)
+        rename_subvol = src_cached;
+    else
+        rename_subvol = dst_hashed;
+
+    /* TODO: delete files in background */
+    /* Count the unlinks first so call_cnt is final before any wind. */
+    if (src_cached != dst_hashed && src_cached != dst_cached)
+        local->call_cnt++;
+
+    if (src_hashed != rename_subvol && src_hashed != src_cached)
+        local->call_cnt++;
+
+    if (dst_cached && dst_cached != dst_hashed && dst_cached != src_cached)
+        local->call_cnt++;
+
+    if (local->call_cnt == 0)
+        goto unwind;
+
+    DHT_MARK_FOP_INTERNAL(xattr);
+
+    if (src_cached != dst_hashed && src_cached != dst_cached) {
+        dict_t *xattr_new = NULL;
+
+        xattr_new = dict_copy_with_ref(xattr, NULL);
+        gf_msg_trace(this->name, 0, "deleting old src datafile %s @ %s",
+                     local->loc.path, src_cached->name);
+
+        if (gf_uuid_compare(local->loc.pargfid, local->loc2.pargfid) == 0) {
+            DHT_MARKER_DONT_ACCOUNT(xattr_new);
+        }
+
+        DHT_CHANGELOG_TRACK_AS_RENAME(xattr_new, &local->loc, &local->loc2);
+        STACK_WIND_COOKIE(frame, dht_rename_unlink_cbk, src_cached, src_cached,
+                          src_cached->fops->unlink, &local->loc, 0, xattr_new);
+
+        dict_unref(xattr_new);
+        xattr_new = NULL;
+    }
+
+    if (src_hashed != rename_subvol && src_hashed != src_cached) {
+        dict_t *xattr_new = NULL;
+
+        xattr_new = dict_copy_with_ref(xattr, NULL);
+        gf_msg_trace(this->name, 0, "deleting old src linkfile %s @ %s",
+                     local->loc.path, src_hashed->name);
+
+        DHT_MARKER_DONT_ACCOUNT(xattr_new);
+
+        STACK_WIND_COOKIE(frame, dht_rename_unlink_cbk, src_hashed, src_hashed,
+                          src_hashed->fops->unlink, &local->loc, 0, xattr_new);
+
+        dict_unref(xattr_new);
+        xattr_new = NULL;
+    }
+
+    if (dst_cached && (dst_cached != dst_hashed) &&
+        (dst_cached != src_cached)) {
+        gf_msg_trace(this->name, 0, "deleting old dst datafile %s @ %s",
+                     local->loc2.path, dst_cached->name);
+
+        STACK_WIND_COOKIE(frame, dht_rename_unlink_cbk, dst_cached, dst_cached,
+                          dst_cached->fops->unlink, &local->loc2, 0, xattr);
+    }
+    if (xattr)
+        dict_unref(xattr);
+    return 0;
 
 unwind:
-	DHT_STACK_UNWIND (frame, local->op_ret, local->op_errno,
-			  &local->stbuf);
+    WIPE(&local->preoldparent);
+    WIPE(&local->postoldparent);
+    WIPE(&local->preparent);
+    WIPE(&local->postparent);
+
+    dht_rename_done(frame, this);
 
-	return 0;
+    return 0;
 }
 
+/* Callback of the linkfile creation started from dht_rename_cbk; frame is
+ * the copied link frame, the rename continues on local->main_frame. */
+int
+dht_rename_links_create_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+                            int32_t op_ret, int32_t op_errno, inode_t *inode,
+                            struct iatt *stbuf, struct iatt *preparent,
+                            struct iatt *postparent, dict_t *xdata)
+{
+    xlator_t *prev = NULL;
+    dht_local_t *local = NULL;
+    call_frame_t *main_frame = NULL;
+
+    prev = cookie;
+    local = frame->local;
+    main_frame = local->main_frame;
+    /* TODO: Handle this case in lookup-optimize */
+    if (op_ret == -1) {
+        gf_msg(this->name, GF_LOG_WARNING, op_errno, DHT_MSG_CREATE_LINK_FAILED,
+               "link/file %s on %s failed", local->loc.path, prev->name);
+    }
+    if (local->linked == _gf_true) {
+        local->linked = _gf_false;
+        dht_linkfile_attr_heal(frame, this);
+    }
+
+    dht_rename_unlink(main_frame, this);
+    DHT_STACK_DESTROY(frame);
+    return 0;
+}
 
+/* Callback of the rename fop wound by dht_do_rename(). A failure on
+ * src_cached aborts through dht_rename_cleanup(); otherwise the dst linkfile
+ * is created when needed and the old names are unlinked. Returns 0. */
 int
-dht_do_rename (call_frame_t *frame)
+dht_rename_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+               int32_t op_ret, int32_t op_errno, struct iatt *stbuf,
+               struct iatt *preoldparent, struct iatt *postoldparent,
+               struct iatt *prenewparent, struct iatt *postnewparent,
+               dict_t *xdata)
 {
-	dht_local_t *local = NULL;
-	xlator_t    *dst_hashed = NULL;
-	xlator_t    *src_cached = NULL;
-	xlator_t    *dst_cached = NULL;
-	xlator_t    *this = NULL;
-	xlator_t    *rename_subvol = NULL;
+    dht_local_t *local = NULL;
+    xlator_t *prev = NULL;
+    xlator_t *src_cached = NULL;
+    xlator_t *dst_hashed = NULL;
+    xlator_t *dst_cached = NULL;
+    call_frame_t *link_frame = NULL;
+    dht_local_t *link_local = NULL;
+
+    local = frame->local;
+    prev = cookie;
+
+    src_cached = local->src_cached;
+    dst_hashed = local->dst_hashed;
+    dst_cached = local->dst_cached;
+
+    if (local->linked == _gf_true)
+        FRAME_SU_UNDO(frame, dht_local_t);
+
+    /* It is a critical failure iff we fail to rename the cached file
+     * if the rename of the linkto failed, it is not a critical failure,
+     * and we do not want to lose the created hard link for the new
+     * name as that could have been read by other clients.
+     *
+     * NOTE: If another client is attempting the same oldname -> newname
+     * rename, and finds both file names as existing, and are hard links
+     * to each other, then FUSE would send in an unlink for oldname. In
+     * this time duration if we treat the linkto as a critical error and
+     * unlink the newname we created, we would have effectively lost the
+     * file to rename operations.
+     *
+     * Repercussions of treating this as a non-critical error is that
+     * we could leave behind a stale linkto file and/or not create the new
+     * linkto file, the second case would be rectified by a subsequent
+     * lookup, the first case by a rebalance, like for all stale linkto
+     * files */
+
+    if (op_ret == -1) {
+        /* Critical failure: unable to rename the cached file */
+        if (prev == src_cached) {
+            gf_msg(this->name, GF_LOG_WARNING, op_errno, DHT_MSG_RENAME_FAILED,
+                   "%s: Rename on %s failed, (gfid = %s) ", local->loc.path,
+                   prev->name,
+                   local->loc.inode ? uuid_utoa(local->loc.inode->gfid) : "");
+            local->op_ret = op_ret;
+            local->op_errno = op_errno;
+            goto cleanup;
+        } else {
+            /* Non-critical failure, unable to rename the linkto
+             * file
+             */
+            gf_msg(this->name, GF_LOG_INFO, op_errno, DHT_MSG_RENAME_FAILED,
+                   "%s: Rename (linkto file) on %s failed, "
+                   "(gfid = %s) ",
+                   local->loc.path, prev->name,
+                   local->loc.inode ? uuid_utoa(local->loc.inode->gfid) : "");
+        }
+    }
+    if (xdata) {
+        if (!local->xattr)
+            local->xattr = dict_ref(xdata);
+        else
+            local->xattr = dict_copy_with_ref(xdata, local->xattr);
+    }
+
+    /* Merge attrs only from src_cached. In case of src_cached !=
+     * dst_hashed, this ignores linkfile attrs. */
+    if (prev == src_cached) {
+        dht_iatt_merge(this, &local->stbuf, stbuf);
+        dht_iatt_merge(this, &local->preoldparent, preoldparent);
+        dht_iatt_merge(this, &local->postoldparent, postoldparent);
+        dht_iatt_merge(this, &local->preparent, prenewparent);
+        dht_iatt_merge(this, &local->postparent, postnewparent);
+    }
+
+    /* Create the linkto file for the dst file */
+    if ((src_cached == dst_cached) && (dst_hashed != dst_cached)) {
+        link_frame = copy_frame(frame);
+        if (!link_frame) {
+            goto unlink;
+        }
+
+        /* fop value sent as maxvalue because it is not used
+         * anywhere in this case */
+        link_local = dht_local_init(link_frame, &local->loc2, NULL,
+                                    GF_FOP_MAXVALUE);
+        if (!link_local) {
+            goto unlink;
+        }
+
+        if (link_local->loc.inode)
+            inode_unref(link_local->loc.inode);
+        link_local->loc.inode = inode_ref(local->loc.inode);
+        link_local->main_frame = frame;
+        link_local->stbuf = local->stbuf;
+        gf_uuid_copy(link_local->gfid, local->loc.inode->gfid);
+        dht_linkfile_create(link_frame, dht_rename_links_create_cbk, this,
+                            src_cached, dst_hashed, &link_local->loc);
+        return 0;
+    }
+
+unlink:
+    if (link_frame) {
+        DHT_STACK_DESTROY(link_frame);
+    }
+    dht_rename_unlink(frame, this);
+    return 0;
+
+cleanup:
+    dht_rename_cleanup(frame);
+    return 0;
+}
 
+/* Wind the rename fop to the subvolume chosen from the cached/hashed
+ * layout of src and dst; dht_rename_cbk handles the reply. Returns 0. */
+int
+dht_do_rename(call_frame_t *frame)
+{
+    dht_local_t *local = NULL;
+    xlator_t *dst_hashed = NULL;
+    xlator_t *src_cached = NULL;
+    xlator_t *dst_cached = NULL;
+    xlator_t *this = NULL;
+    xlator_t *rename_subvol = NULL;
+
+    local = frame->local;
+    this = frame->this;
+
+    dst_hashed = local->dst_hashed;
+    dst_cached = local->dst_cached;
+    src_cached = local->src_cached;
+    /* Same cached subvol: rename there; otherwise rename on dst_hashed. */
+    if (src_cached == dst_cached)
+        rename_subvol = src_cached;
+    else
+        rename_subvol = dst_hashed;
+
+    if ((src_cached != dst_hashed) && (rename_subvol == dst_hashed)) {
+        DHT_MARKER_DONT_ACCOUNT(local->xattr_req);
+    }
+    if (rename_subvol == src_cached) {
+        DHT_CHANGELOG_TRACK_AS_RENAME(local->xattr_req, &local->loc,
+                                      &local->loc2);
+    }
+
+    gf_msg_trace(this->name, 0, "renaming %s => %s (%s)", local->loc.path,
+                 local->loc2.path, rename_subvol->name);
+    if (local->linked == _gf_true)
+        FRAME_SU_DO(frame, dht_local_t);
+    STACK_WIND_COOKIE(frame, dht_rename_cbk, rename_subvol, rename_subvol,
+                      rename_subvol->fops->rename, &local->loc, &local->loc2,
+                      local->xattr_req);
+    return 0;
+}
 
-	local = frame->local;
-	this  = frame->this;
+/* Reply to the hard link wound by dht_rename_linkto_cbk: rename or undo. */
+int
+dht_rename_link_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+                    int32_t op_ret, int32_t op_errno, inode_t *inode,
+                    struct iatt *stbuf, struct iatt *preparent,
+                    struct iatt *postparent, dict_t *xdata)
+{
+    dht_local_t *local = NULL;
+    xlator_t *prev = NULL;
 
-	dst_hashed = local->dst_hashed;
-	dst_cached = local->dst_cached;
-	src_cached = local->src_cached;
+    local = frame->local;
+    prev = cookie;
 
-	if (src_cached == dst_cached)
-		rename_subvol = src_cached;
-	else
-		rename_subvol = dst_hashed;
+    if (op_ret == -1) {
+        gf_msg_debug(this->name, 0, "link/file on %s failed (%s)", prev->name,
+                     strerror(op_errno));
+        local->op_ret = -1;
+        local->op_errno = op_errno;
+        local->added_link = _gf_false; /* nothing to unlink in cleanup */
+    } else
+        dht_iatt_merge(this, &local->stbuf, stbuf);
 
-	gf_log (this->name, GF_LOG_DEBUG,
-		"renaming %s => %s (%s)",
-		local->loc.path, local->loc2.path, rename_subvol->name);
+    if (local->op_ret == -1)
+        goto cleanup;
 
-	STACK_WIND (frame, dht_rename_cbk,
-		    rename_subvol, rename_subvol->fops->rename,
-		    &local->loc, &local->loc2);
+    dht_do_rename(frame);
 
-	return 0;
-}
+    return 0;
 
+cleanup:
+    dht_rename_cleanup(frame);
+    return 0;
+}
 
 int
-dht_rename_links_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
-		      int32_t op_ret, int32_t op_errno,
-		      inode_t *inode, struct stat *stbuf)
-{
-	dht_local_t  *local = NULL;
-	call_frame_t *prev = NULL;
-	int           this_call_cnt  = 0;
-
-
-	local = frame->local;
-	prev = cookie;
-	
-	if (op_ret == -1) {
-		gf_log (this->name, GF_LOG_DEBUG,
-			"link/file on %s failed (%s)",
-			prev->this->name, strerror (op_errno));
-		local->op_ret   = -1;
-		local->op_errno = op_errno;
-	}
-
-	this_call_cnt = dht_frame_return (frame);
-	if (is_last_call (this_call_cnt)) {
-		if (local->op_ret == -1)
-			goto unwind;
-		
-		dht_do_rename (frame);
-	}
-
-	return 0;
+dht_rename_linkto_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+                      int32_t op_ret, int32_t op_errno, inode_t *inode,
+                      struct iatt *stbuf, struct iatt *preparent,
+                      struct iatt *postparent, dict_t *xdata)
+{
+    /* Reply to the linkto creation: next comes the hard link on src_cached. */
+    dht_local_t *local = NULL;
+    xlator_t *prev = NULL;
+    xlator_t *src_cached = NULL;
+    dict_t *xattr = NULL;
 
-unwind:
-	DHT_STACK_UNWIND (frame, local->op_ret, local->op_errno,
-			  &local->stbuf);
+    local = frame->local;
+    DHT_MARK_FOP_INTERNAL(xattr);
+    prev = cookie;
+    src_cached = local->src_cached;
 
-	return 0;
-}
+    if (op_ret == -1) {
+        gf_msg_debug(this->name, 0, "link/file on %s failed (%s)", prev->name,
+                     strerror(op_errno));
+        local->op_ret = -1;
+        local->op_errno = op_errno;
+    }
+
+    /* If linkto creation failed move to failure cleanup code,
+     * instead of continuing with creating the link file */
+    if (local->op_ret != 0) {
+        goto cleanup;
+    }
+
+    gf_msg_trace(this->name, 0, "link %s => %s (%s)", local->loc.path,
+                 local->loc2.path, src_cached->name);
+    if (gf_uuid_compare(local->loc.pargfid, local->loc2.pargfid) == 0) {
+        DHT_MARKER_DONT_ACCOUNT(xattr);
+    }
+    /* Set before the wind; dht_rename_link_cbk clears it if the link fails. */
+    local->added_link = _gf_true;
+    STACK_WIND_COOKIE(frame, dht_rename_link_cbk, src_cached, src_cached,
+                      src_cached->fops->link, &local->loc, &local->loc2, xattr);
 
+    if (xattr)
+        dict_unref(xattr);
+
+    return 0;
+
+cleanup:
+    dht_rename_cleanup(frame);
+
+    if (xattr)
+        dict_unref(xattr);
+
+    return 0;
+}
 
 int
-dht_rename_create_links (call_frame_t *frame)
+dht_rename_unlink_links_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+                            int32_t op_ret, int32_t op_errno,
+                            struct iatt *preparent, struct iatt *postparent,
+                            dict_t *xdata)
 {
-	dht_local_t *local = NULL;
-	xlator_t    *this = NULL;
-	xlator_t    *src_hashed = NULL;
-	xlator_t    *src_cached = NULL;
-	xlator_t    *dst_hashed = NULL;
-	xlator_t    *dst_cached = NULL;
-	int          call_cnt = 0;
+    dht_local_t *local = NULL;
+    xlator_t *prev = NULL;
 
+    local = frame->local;
+    prev = cookie; /* dst_hashed: the subvol the unlink was wound to */
 
-	local = frame->local;
-	this  = frame->this;
+    if ((op_ret == -1) && (op_errno != ENOENT)) {
+        gf_msg_debug(this->name, 0, "unlink of %s on %s failed (%s)",
+                     local->loc2.path, prev->name, strerror(op_errno));
+        local->op_ret = -1;
+        local->op_errno = op_errno;
+    } /* ENOENT is tolerated: nothing was left to unlink */
 
-	src_hashed = local->src_hashed;
-	src_cached = local->src_cached;
-	dst_hashed = local->dst_hashed;
-	dst_cached = local->dst_cached;
+    if (local->op_ret == -1)
+        goto cleanup;
 
-	if (src_cached == dst_cached)
-		goto nolinks;
+    dht_do_rename(frame); /* dst linkfile is out of the way */
 
-	if (dst_hashed != src_hashed && dst_hashed != src_cached)
-		call_cnt++;
+    return 0;
 
-	if (src_cached != dst_hashed)
-		call_cnt++;
+cleanup:
+    dht_rename_cleanup(frame);
 
-	local->call_cnt = call_cnt;
-
-	if (dst_hashed != src_hashed && dst_hashed != src_cached) {
-		gf_log (this->name, GF_LOG_DEBUG,
-			"linkfile %s @ %s => %s",
-			local->loc.path, dst_hashed->name, src_cached->name);
-		dht_linkfile_create (frame, dht_rename_links_cbk,
-				     src_cached, dst_hashed, &local->loc);
-	}
+    return 0;
+}
 
-	if (src_cached != dst_hashed) {
-		gf_log (this->name, GF_LOG_DEBUG,
-			"link %s => %s (%s)", local->loc.path,
-			local->loc2.path, src_cached->name);
-		STACK_WIND (frame, dht_rename_links_cbk,
-			    src_cached, src_cached->fops->link,
-			    &local->loc, &local->loc2);
-	}
+int
+dht_rename_create_links(call_frame_t *frame)
+{
+    dht_local_t *local = NULL;
+    xlator_t *this = NULL;
+    xlator_t *src_hashed = NULL;
+    xlator_t *src_cached = NULL;
+    xlator_t *dst_hashed = NULL;
+    xlator_t *dst_cached = NULL;
+    int call_cnt = 0;
+    dict_t *xattr = NULL;
+    /* Pre-rename step: unlink dst linkfile or create linkto/link if needed. */
+    local = frame->local;
+    this = frame->this;
+
+    src_hashed = local->src_hashed;
+    src_cached = local->src_cached;
+    dst_hashed = local->dst_hashed;
+    dst_cached = local->dst_cached;
+
+    DHT_MARK_FOP_INTERNAL(xattr); /* template dict; copied per fop below */
+
+    if (src_cached == dst_cached) { /* dst data is on the src data subvol */
+        dict_t *xattr_new = NULL;
+
+        if (dst_hashed == dst_cached)
+            goto nolinks; /* presumably no separate dst linkfile to remove */
+
+        xattr_new = dict_copy_with_ref(xattr, NULL);
+
+        gf_msg_trace(this->name, 0, "unlinking dst linkfile %s @ %s",
+                     local->loc2.path, dst_hashed->name);
+
+        DHT_MARKER_DONT_ACCOUNT(xattr_new);
+
+        STACK_WIND_COOKIE(frame, dht_rename_unlink_links_cbk, dst_hashed,
+                          dst_hashed, dst_hashed->fops->unlink, &local->loc2, 0,
+                          xattr_new);
+
+        dict_unref(xattr_new);
+        if (xattr)
+            dict_unref(xattr);
+
+        return 0; /* resumes in dht_rename_unlink_links_cbk */
+    }
+
+    if (src_cached != dst_hashed) {
+        /* needed to create the link file */
+        call_cnt++;
+        if (dst_hashed != src_hashed)
+            /* needed to create the linkto file */
+            call_cnt++;
+    }
+
+    /* We should not have any failures post the link creation, as this
+     * introduces the newname into the namespace. Clients could have cached
+     * the existence of the newname and may start taking actions based on
+     * the same. Hence create the linkto first, and then attempt the link.
+     *
+     * NOTE: If another client is attempting the same oldname -> newname
+     * rename, and finds both file names as existing, and are hard links
+     * to each other, then FUSE would send in an unlink for oldname. In
+     * this time duration if we treat the linkto as a critical error and
+     * unlink the newname we created, we would have effectively lost the
+     * file to rename operations. */
+    if (dst_hashed != src_hashed && src_cached != dst_hashed) {
+        gf_msg_trace(this->name, 0, "linkfile %s @ %s => %s", local->loc.path,
+                     dst_hashed->name, src_cached->name);
+
+        memcpy(local->gfid, local->loc.inode->gfid, 16);
+        dht_linkfile_create(frame, dht_rename_linkto_cbk, this, src_cached,
+                            dst_hashed, &local->loc);
+    } else if (src_cached != dst_hashed) { /* here dst_hashed == src_hashed */
+        dict_t *xattr_new = NULL;
+
+        xattr_new = dict_copy_with_ref(xattr, NULL);
+
+        gf_msg_trace(this->name, 0, "link %s => %s (%s)", local->loc.path,
+                     local->loc2.path, src_cached->name);
+        if (gf_uuid_compare(local->loc.pargfid, local->loc2.pargfid) == 0) {
+            DHT_MARKER_DONT_ACCOUNT(xattr_new);
+        }
+
+        local->added_link = _gf_true;
+
+        STACK_WIND_COOKIE(frame, dht_rename_link_cbk, src_cached, src_cached,
+                          src_cached->fops->link, &local->loc, &local->loc2,
+                          xattr_new);
+
+        dict_unref(xattr_new);
+    }
 
 nolinks:
-	if (!call_cnt) {
-		/* skip to next step */
-		dht_do_rename (frame);
-	}
+    if (!call_cnt) {
+        /* nothing was wound above: skip to next step */
+        dht_do_rename(frame);
+    }
+    if (xattr)
+        dict_unref(xattr);
+
+    return 0;
+}
+
+int
+dht_rename_lookup_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+                      int op_ret, int op_errno, inode_t *inode,
+                      struct iatt *stbuf, dict_t *xattr,
+                      struct iatt *postparent)
+{
+    dht_local_t *local = NULL;
+    int call_cnt = 0;
+    dht_conf_t *conf = NULL;
+    char gfid_local[GF_UUID_BUF_SIZE] = {0};
+    char gfid_server[GF_UUID_BUF_SIZE] = {0};
+    int child_index = -1;
+    gf_boolean_t is_src = _gf_false;
+    loc_t *loc = NULL;
+    /* Post-lock lookup reply: cookie 0 is the source, cookie 1 the dst. */
+    child_index = (long)cookie;
+
+    local = frame->local;
+    conf = this->private;
+
+    is_src = (child_index == 0);
+    if (is_src)
+        loc = &local->loc;
+    else
+        loc = &local->loc2;
+
+    if (op_ret >= 0) {
+        if (is_src)
+            local->src_cached = dht_subvol_get_cached(this, local->loc.inode);
+        else {
+            if (loc->inode)
+                gf_uuid_unparse(loc->inode->gfid, gfid_local);
+
+            /* arguments follow the order of the labels in the format */
+            gf_msg_debug(this->name, 0,
+                         "dst_cached before lookup: %s, "
+                         "(path:%s)(gfid:%s),",
+                         local->dst_cached ? local->dst_cached->name : "<nul>",
+                         local->loc2.path,
+                         local->dst_cached ? gfid_local : "<nul>");
+
+            local->dst_cached = dht_subvol_get_cached(this,
+                                                      local->loc2_copy.inode);
+            gf_uuid_unparse(stbuf->ia_gfid, gfid_local);
+
+            /* second argument of gf_msg_debug is an errnum, not a level */
+            gf_msg_debug(this->name, 0,
+                         "dst_cached after lookup: %s, "
+                         "(path:%s)(gfid:%s)",
+                         local->dst_cached ? local->dst_cached->name : "<nul>",
+                         local->loc2.path,
+                         local->dst_cached ? gfid_local : "<nul>");
+
+            if ((local->loc2.inode == NULL) ||
+                gf_uuid_compare(stbuf->ia_gfid, local->loc2.inode->gfid)) {
+                if (local->loc2.inode != NULL) {
+                    inode_unlink(local->loc2.inode, local->loc2.parent,
+                                 local->loc2.name);
+                    inode_unref(local->loc2.inode);
+                }
+
+                local->loc2.inode = inode_link(local->loc2_copy.inode,
+                                               local->loc2_copy.parent,
+                                               local->loc2_copy.name, stbuf);
+                gf_uuid_copy(local->loc2.gfid, stbuf->ia_gfid);
+            }
+        }
+    }
+
+    if (op_ret < 0) {
+        if (is_src) {
+            /* The meaning of is_linkfile is overloaded here. For locking
+             * to work properly both rebalance and rename should acquire
+             * lock on datafile. The reason for sending this lookup is to
+             * find out whether we've acquired a lock on data file.
+             * Between the lookup before rename and this rename, the
+             * file could be migrated by a rebalance process and now this
+             * file might be a linkto file. We verify that by sending
+             * this lookup. However, if this lookup fails we cannot really
+             * say whether we've acquired lock on a datafile or linkto file.
+             * So, we act conservatively and _assume_
+             * that this is a linkfile and fail the rename operation.
+             */
+            local->is_linkfile = _gf_true;
+            local->op_errno = op_errno;
+        } else {
+            if (local->dst_cached)
+                gf_msg_debug(this->name, op_errno,
+                             "file %s (gfid:%s) was present "
+                             "(hashed-subvol=%s, "
+                             "cached-subvol=%s) before rename,"
+                             " but lookup failed",
+                             local->loc2.path,
+                             uuid_utoa(local->loc2.inode->gfid),
+                             local->dst_hashed->name, local->dst_cached->name);
+            if (dht_inode_missing(op_errno))
+                local->dst_cached = NULL;
+        }
+    } else if (is_src && xattr &&
+               check_is_linkfile(inode, stbuf, xattr, conf->link_xattr_name)) {
+        local->is_linkfile = _gf_true;
+        /* Found linkto file instead of data file, passdown ENOENT
+         * based on the above comment */
+        local->op_errno = ENOENT;
+    }
+
+    if (!local->is_linkfile && (op_ret >= 0) &&
+        gf_uuid_compare(loc->gfid, stbuf->ia_gfid)) {
+        gf_uuid_unparse(loc->gfid, gfid_local);
+        gf_uuid_unparse(stbuf->ia_gfid, gfid_server);
+
+        /* log the path of the loc that was compared (src or dst) */
+        gf_msg(this->name, GF_LOG_WARNING, 0, DHT_MSG_GFID_MISMATCH,
+               "path:%s, received a different gfid, local_gfid= %s"
+               " server_gfid: %s",
+               loc->path, gfid_local, gfid_server);
+        /* Will passdown ENOENT anyway since the file we sent on
+         * rename is replaced with a different file */
+        local->op_errno = ENOENT;
+        /* Since local->is_linkfile is used here to detect failure,
+         * marking this to true */
+        local->is_linkfile = _gf_true;
+    }
+
+    call_cnt = dht_frame_return(frame);
+    if (is_last_call(call_cnt)) {
+        if (local->is_linkfile) {
+            local->op_ret = -1;
+            goto fail;
+        }
+
+        dht_rename_create_links(frame);
+    }
+
+    return 0;
+fail:
+    dht_rename_unlock(frame, this);
+    return 0;
+}
+
+int
+dht_rename_file_lock1_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+                          int32_t op_ret, int32_t op_errno, dict_t *xdata)
+{
+    dht_local_t *local = NULL;
+    char src_gfid[GF_UUID_BUF_SIZE] = {0};
+    char dst_gfid[GF_UUID_BUF_SIZE] = {0};
+    int ret = 0;
+    loc_t *loc = NULL;
+    xlator_t *subvol = NULL;
+    /* First namespace lock answered: on success request the other one. */
+    local = frame->local;
+
+    if (op_ret < 0) {
+        uuid_utoa_r(local->loc.inode->gfid, src_gfid);
+
+        if (local->loc2.inode)
+            uuid_utoa_r(local->loc2.inode->gfid, dst_gfid);
+
+        gf_msg(this->name, GF_LOG_WARNING, op_errno, DHT_MSG_INODE_LK_ERROR,
+               "protecting namespace of %s failed. "
+               "rename (%s:%s:%s %s:%s:%s)",
+               local->current == &local->lock[0] ? local->loc.path
+                                                 : local->loc2.path,
+               local->loc.path, src_gfid, local->src_hashed->name,
+               local->loc2.path, dst_gfid,
+               local->dst_hashed ? local->dst_hashed->name : "<nul>");
+
+        local->op_ret = -1;
+        local->op_errno = op_errno;
+        goto err;
+    }
+
+    if (local->current == &local->lock[0]) {
+        loc = &local->loc2;
+        subvol = local->dst_hashed;
+        local->current = &local->lock[1];
+    } else {
+        loc = &local->loc;
+        subvol = local->src_hashed;
+        local->current = &local->lock[0];
+    }
+
+    ret = dht_protect_namespace(frame, loc, subvol, &local->current->ns,
+                                dht_rename_lock_cbk);
+    if (ret < 0) {
+        local->op_ret = -1; /* was left at 0 (set in dht_rename) */
+        local->op_errno = EINVAL;
+        goto err;
+    }
+    return 0;
+err:
+    /* No harm in calling an extra unlock */
+    dht_rename_unlock(frame, this);
+    return 0;
+}
 
-	return 0;
+int32_t
+dht_rename_file_protect_namespace(call_frame_t *frame, void *cookie,
+                                  xlator_t *this, int32_t op_ret,
+                                  int32_t op_errno, dict_t *xdata)
+{
+    dht_local_t *local = NULL;
+    char src_gfid[GF_UUID_BUF_SIZE] = {0};
+    char dst_gfid[GF_UUID_BUF_SIZE] = {0};
+    int ret = 0;
+    loc_t *loc = NULL;
+    xlator_t *subvol = NULL;
+    /* Blocking-inodelk callback: on success start the namespace locks. */
+    local = frame->local;
+
+    if (op_ret < 0) {
+        uuid_utoa_r(local->loc.inode->gfid, src_gfid);
+
+        if (local->loc2.inode)
+            uuid_utoa_r(local->loc2.inode->gfid, dst_gfid);
+
+        gf_msg(this->name, GF_LOG_WARNING, op_errno, DHT_MSG_INODE_LK_ERROR,
+               "acquiring inodelk failed "
+               "rename (%s:%s:%s %s:%s:%s)",
+               local->loc.path, src_gfid, local->src_cached->name,
+               local->loc2.path, dst_gfid,
+               local->dst_cached ? local->dst_cached->name : "<nul>");
+
+        local->op_ret = -1;
+        local->op_errno = op_errno;
+        goto err;
+    }
+
+    /* Locks on src and dst need to be ordered, otherwise rename (src, dst)
+     * and rename (dst, src) issued from two different clients could
+     * deadlock
+     */
+    ret = dht_order_rename_lock(frame, &loc, &subvol);
+    if (ret) {
+        local->op_ret = -1; /* was left at 0 (set in dht_rename) */
+        local->op_errno = ENOMEM;
+        goto err;
+    }
+
+    ret = dht_protect_namespace(frame, loc, subvol, &local->current->ns,
+                                dht_rename_file_lock1_cbk);
+    if (ret < 0) {
+        local->op_ret = -1; /* the op_errno parameter is not read again */
+        local->op_errno = EINVAL;
+        goto err;
+    }
+
+    return 0;
+
+err:
+    /* It's fine to call unlock even when no locks are acquired, as we check
+     * for lock->locked before winding an unlock call.
+     */
+    dht_rename_unlock(frame, this);
+    return 0;
 }
 
+int32_t
+dht_rename_lock_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+                    int32_t op_ret, int32_t op_errno, dict_t *xdata)
+{
+    dht_local_t *local = NULL;
+    char src_gfid[GF_UUID_BUF_SIZE] = {0};
+    char dst_gfid[GF_UUID_BUF_SIZE] = {0};
+    dict_t *xattr_req = NULL;
+    dht_conf_t *conf = NULL;
+    int i = 0;
+    xlator_t *subvol = NULL;
+    dht_lock_t *lock = NULL;
+    /* Namespace lock answered: re-look up src and dst under the locks. */
+    local = frame->local;
+    conf = this->private;
+
+    if (op_ret < 0) {
+        uuid_utoa_r(local->loc.inode->gfid, src_gfid);
+
+        if (local->loc2.inode)
+            uuid_utoa_r(local->loc2.inode->gfid, dst_gfid);
+
+        gf_msg(this->name, GF_LOG_WARNING, op_errno, DHT_MSG_INODE_LK_ERROR,
+               "protecting namespace of %s failed. "
+               "rename (%s:%s:%s %s:%s:%s)",
+               local->current == &local->lock[0] ? local->loc.path
+                                                 : local->loc2.path,
+               local->loc.path, src_gfid, local->src_hashed->name,
+               local->loc2.path, dst_gfid,
+               local->dst_hashed ? local->dst_hashed->name : "<nul>");
+
+        local->op_ret = -1;
+        local->op_errno = op_errno;
+
+        goto done;
+    }
+
+    xattr_req = dict_new();
+    if (xattr_req == NULL) {
+        local->op_ret = -1;
+        local->op_errno = ENOMEM;
+        goto done;
+    }
+
+    op_ret = dict_set_uint32(xattr_req, conf->link_xattr_name, 256);
+    if (op_ret < 0) {
+        local->op_ret = -1;
+        local->op_errno = -op_ret;
+        goto done;
+    }
+
+    /* dst_cached might've changed. This normally happens for two reasons:
+     * 1. rebalance migrated dst
+     * 2. Another parallel rename was done overwriting dst
+     *
+     * Doing a lookup on local->loc2 when dst exists, but is associated
+     * with a different gfid will result in an ESTALE error. So, do a fresh
+     * lookup with a new inode on dst-path and handle change of dst-cached
+     * in the cbk. Also, to identify dst-cached changes we do a lookup on
+     * "this" rather than the subvol.
+     */
+    loc_copy(&local->loc2_copy, &local->loc2);
+    inode_unref(local->loc2_copy.inode);
+    local->loc2_copy.inode = inode_new(local->loc.inode->table);
+
+    /* Why not use local->lock.locks[?].loc for lookup post lock phase
+     * ---------------------------------------------------------------
+     * "layout.parent_layout.locks[?].loc" does not have the name and pargfid
+     * populated.
+     * Reason: If we had populated the name and pargfid, server might
+     * resolve to a successful lookup even if there is a file with same name
+     * with a different gfid(unlink & create) as server does name based
+     * resolution on first priority. And this can result in operating on a
+     * different inode entirely.
+     *
+     * Now consider a scenario where source file was renamed by some other
+     * client to a new name just before this lock was granted. So if a
+     * lookup would be done on local->lock[0].layout.parent_layout.locks[?].loc,
+     * server will send success even if the entry was renamed (since server will
+     * do a gfid based resolution). So once a lock is granted, make sure the
+     * file exists with the name that the client requested with.
+     * */
+    /* i == 0: src, on the subvol of its inodelk; i == 1: dst, on "this" */
+    local->call_cnt = 2;
+    for (i = 0; i < 2; i++) {
+        if (i == 0) {
+            lock = local->rename_inodelk_backward_compatible[0];
+            if (gf_uuid_compare(local->loc.gfid, lock->loc.gfid) == 0)
+                subvol = lock->xl;
+            else {
+                lock = local->rename_inodelk_backward_compatible[1];
+                subvol = lock->xl;
+            }
+        } else {
+            subvol = this;
+        }
+
+        STACK_WIND_COOKIE(frame, dht_rename_lookup_cbk, (void *)(long)i, subvol,
+                          subvol->fops->lookup,
+                          (i == 0) ? &local->loc : &local->loc2_copy,
+                          xattr_req);
+    }
+
+    dict_unref(xattr_req);
+    return 0;
+
+done:
+    /* It's fine to call unlock even when no locks are acquired, as we check
+     * for lock->locked before winding an unlock call.
+     */
+    dht_rename_unlock(frame, this);
+
+    if (xattr_req)
+        dict_unref(xattr_req);
+
+    return 0;
+}
 
 int
-dht_rename (call_frame_t *frame, xlator_t *this,
-	    loc_t *oldloc, loc_t *newloc)
-{
-	xlator_t    *src_cached = NULL;
-	xlator_t    *src_hashed = NULL;
-	xlator_t    *dst_cached = NULL;
-	xlator_t    *dst_hashed = NULL;
-	int          op_errno = -1;
-	int          ret = -1;
-	dht_local_t *local = NULL;
-
-
-	VALIDATE_OR_GOTO (frame, err);
-	VALIDATE_OR_GOTO (this, err);
-	VALIDATE_OR_GOTO (oldloc, err);
-	VALIDATE_OR_GOTO (newloc, err);
-
-	src_hashed = dht_subvol_get_hashed (this, oldloc);
-	if (!src_hashed) {
-		gf_log (this->name, GF_LOG_ERROR,
-			"no subvolume in layout for path=%s",
-			oldloc->path);
-		op_errno = EINVAL;
-		goto err;
-	}
-
-	src_cached = dht_subvol_get_cached (this, oldloc->inode);
-	if (!src_cached) {
-		gf_log (this->name, GF_LOG_ERROR,
-			"no cached subvolume for path=%s", oldloc->path);
-		op_errno = EINVAL;
-		goto err;
-	}
-
-	dst_hashed = dht_subvol_get_hashed (this, newloc);
-	if (!dst_hashed) {
-		gf_log (this->name, GF_LOG_ERROR,
-			"no subvolume in layout for path=%s",
-			newloc->path);
-		op_errno = EINVAL;
-		goto err;
-	}
-
-	if (newloc->inode)
-		dst_cached = dht_subvol_get_cached (this, newloc->inode);
-
-	local = dht_local_init (frame);
-	if (!local) {
-		op_errno = ENOMEM;
-		gf_log (this->name, GF_LOG_ERROR,
-			"memory allocation failed :(");
-		goto err;
-	}
-
-	ret = loc_copy (&local->loc, oldloc);
-	if (ret == -1) {
-		op_errno = ENOMEM;
-		gf_log (this->name, GF_LOG_ERROR,
-			"memory allocation failed :(");
-		goto err;
-	}
-
-	ret = loc_copy (&local->loc2, newloc);
-	if (ret == -1) {
-		op_errno = ENOMEM;
-		gf_log (this->name, GF_LOG_ERROR,
-			"memory allocation failed :(");
-		goto err;
-	}
-
-	local->src_hashed = src_hashed;
-	local->src_cached = src_cached;
-	local->dst_hashed = dst_hashed;
-	local->dst_cached = dst_cached;
-
-	gf_log (this->name, GF_LOG_DEBUG,
-		"renaming %s (hash=%s/cache=%s) => %s (hash=%s/cache=%s)",
-		oldloc->path, src_hashed->name, src_cached->name,
-		newloc->path, dst_hashed->name,
-		dst_cached ? dst_cached->name : "<nul>");
-
-	if (S_ISDIR (oldloc->inode->st_mode)) {
-		dht_rename_dir (frame, this);
-	} else {
-		local->op_ret = 0;
-		dht_rename_create_links (frame);
-	}
-
-	return 0;
+dht_rename_lock(call_frame_t *frame)
+{
+    dht_local_t *local = NULL;
+    int count = 1, ret = -1;
+    dht_lock_t **lk_array = NULL;
+    /* Take the DHT_FILE_MIGRATE_DOMAIN inodelks on src (and dst, if any). */
+    local = frame->local;
+
+    if (local->dst_cached)
+        count++; /* second lock, for dst */
+
+    lk_array = GF_CALLOC(count, sizeof(*lk_array), gf_common_mt_pointer);
+    if (lk_array == NULL)
+        goto err;
+
+    lk_array[0] = dht_lock_new(frame->this, local->src_cached, &local->loc,
+                               F_WRLCK, DHT_FILE_MIGRATE_DOMAIN, NULL,
+                               FAIL_ON_ANY_ERROR);
+    if (lk_array[0] == NULL)
+        goto err;
+
+    if (local->dst_cached) {
+        /* dst might be removed by the time inodelk reaches bricks,
+         * which can result in ESTALE errors. POSIX imposes no
+         * restriction for dst to be present for renames to be
+         * successful. So, we'll ignore ESTALE errors. As far as
+         * synchronization on dst goes, we'll achieve the same by
+         * holding entrylk on parent directory of dst in the namespace
+         * of basename(dst). Also, there might not be quorum in cluster
+         * xlators like EC/disperse on errno, in which case they return
+         * EIO. For eg., in a disperse (4 + 2), 3 might return success
+         * and 3 might return ESTALE. Disperse, having no quorum,
+         * unwinds inodelk with EIO. So, ignore EIO too.
+         */
+        lk_array[1] = dht_lock_new(frame->this, local->dst_cached, &local->loc2,
+                                   F_WRLCK, DHT_FILE_MIGRATE_DOMAIN, NULL,
+                                   IGNORE_ENOENT_ESTALE_EIO);
+        if (lk_array[1] == NULL)
+            goto err;
+    }
+
+    local->rename_inodelk_backward_compatible = lk_array;
+    local->rename_inodelk_bc_count = count;
+
+    /* retaining inodelks for the sake of backward compatibility. Please
+     * make sure to remove this inodelk once all of 3.10, 3.12 and 3.13
+     * reach EOL. Better way of getting synchronization would be to acquire
+     * entrylks on src and dst parent directories in the namespace of
+     * basenames of src and dst
+     */
+    ret = dht_blocking_inodelk(frame, lk_array, count,
+                               dht_rename_file_protect_namespace);
+    if (ret < 0) {
+        local->rename_inodelk_backward_compatible = NULL;
+        local->rename_inodelk_bc_count = 0;
+        goto err;
+    }
+
+    return 0;
+err:
+    if (lk_array != NULL) {
+        int tmp_count = 0, i = 0;
+
+        for (i = 0; (i < count) && (lk_array[i]); i++, tmp_count++)
+            ; /* count the leading non-NULL entries */
+
+        dht_lock_array_free(lk_array, tmp_count);
+        GF_FREE(lk_array);
+    }
+
+    return -1; /* dht_rename unwinds the fop on this result */
+}
+
+int
+dht_rename(call_frame_t *frame, xlator_t *this, loc_t *oldloc, loc_t *newloc,
+           dict_t *xdata)
+{
+    xlator_t *src_cached = NULL;
+    xlator_t *src_hashed = NULL;
+    xlator_t *dst_cached = NULL;
+    xlator_t *dst_hashed = NULL;
+    int op_errno = -1;
+    int ret = -1;
+    dht_local_t *local = NULL;
+    char gfid[GF_UUID_BUF_SIZE] = {0};
+    char newgfid[GF_UUID_BUF_SIZE] = {0};
+    /* Rename fop entry: resolve subvols, then rename dir or lock the file. */
+    VALIDATE_OR_GOTO(frame, err);
+    VALIDATE_OR_GOTO(this, err);
+    VALIDATE_OR_GOTO(oldloc, err);
+    VALIDATE_OR_GOTO(newloc, err);
+
+    gf_uuid_unparse(oldloc->inode->gfid, gfid);
+
+    src_hashed = dht_subvol_get_hashed(this, oldloc);
+    if (!src_hashed) {
+        gf_msg(this->name, GF_LOG_INFO, 0, DHT_MSG_RENAME_FAILED,
+               "No hashed subvolume in layout for path=%s,"
+               "(gfid = %s)",
+               oldloc->path, gfid);
+        op_errno = EINVAL;
+        goto err;
+    }
+
+    src_cached = dht_subvol_get_cached(this, oldloc->inode);
+    if (!src_cached) {
+        gf_msg(this->name, GF_LOG_INFO, 0, DHT_MSG_RENAME_FAILED,
+               "No cached subvolume for path = %s,"
+               "(gfid = %s)",
+               oldloc->path, gfid);
+
+        op_errno = EINVAL;
+        goto err;
+    }
+
+    dst_hashed = dht_subvol_get_hashed(this, newloc);
+    if (!dst_hashed) {
+        gf_msg(this->name, GF_LOG_INFO, 0, DHT_MSG_RENAME_FAILED,
+               "No hashed subvolume in layout for path=%s", newloc->path);
+        op_errno = EINVAL;
+        goto err;
+    }
+
+    if (newloc->inode) /* dst exists only when the caller supplied an inode */
+        dst_cached = dht_subvol_get_cached(this, newloc->inode);
+
+    local = dht_local_init(frame, oldloc, NULL, GF_FOP_RENAME);
+    if (!local) {
+        op_errno = ENOMEM;
+        goto err;
+    }
+    /* cached_subvol will be set from dht_local_init, reset it to NULL,
+       as the logic of handling rename is different  */
+    local->cached_subvol = NULL;
+
+    ret = loc_copy(&local->loc2, newloc);
+    if (ret == -1) {
+        op_errno = ENOMEM;
+        goto err;
+    }
+
+    local->src_hashed = src_hashed;
+    local->src_cached = src_cached;
+    local->dst_hashed = dst_hashed;
+    local->dst_cached = dst_cached;
+    if (xdata)
+        local->xattr_req = dict_ref(xdata);
+
+    if (newloc->inode)
+        gf_uuid_unparse(newloc->inode->gfid, newgfid);
+
+    gf_msg(this->name, GF_LOG_INFO, 0, DHT_MSG_RENAME_INFO,
+           "renaming %s (%s) (hash=%s/cache=%s) => %s (%s) "
+           "(hash=%s/cache=%s) ",
+           oldloc->path, gfid, src_hashed->name, src_cached->name, newloc->path,
+           newloc->inode ? newgfid : "<nul>", dst_hashed->name,
+           dst_cached ? dst_cached->name : "<nul>");
+
+    if (IA_ISDIR(oldloc->inode->ia_type)) {
+        dht_rename_dir(frame, this);
+    } else {
+        local->op_ret = 0; /* failure paths later flip this to -1 */
+        ret = dht_rename_lock(frame);
+        if (ret < 0) {
+            op_errno = ENOMEM;
+            goto err;
+        }
+    }
+
+    return 0;
 
 err:
-	op_errno = (op_errno == -1) ? errno : op_errno;
-	DHT_STACK_UNWIND (frame, -1, op_errno, NULL, NULL);
+    op_errno = (op_errno == -1) ? errno : op_errno;
+    DHT_STACK_UNWIND(rename, frame, -1, op_errno, NULL, NULL, NULL, NULL, NULL,
+                     NULL);
 
-	return 0;
+    return 0;
+}
+
+int
+dht_pt_rename(call_frame_t *frame, xlator_t *this, loc_t *oldloc, loc_t *newloc,
+              dict_t *xdata)
+{
+    gf_boolean_t free_xdata = _gf_false;
+    /* Just a pass through to default_rename(). Non-directory renames first
+     * go through DHT_CHANGELOG_TRACK_AS_RENAME, which may create xdata. */
+    if (!IA_ISDIR(oldloc->inode->ia_type)) {
+        if (!xdata) {
+            free_xdata = _gf_true; /* caller gave none: ours to unref */
+        }
+        DHT_CHANGELOG_TRACK_AS_RENAME(xdata, oldloc, newloc);
+    }
+    default_rename(frame, this, oldloc, newloc, xdata);
+    if (free_xdata && xdata) {
+        dict_unref(xdata);
+        xdata = NULL;
+    }
+    return 0;
 }
diff --git a/xlators/cluster/dht/src/dht-selfheal.c b/xlators/cluster/dht/src/dht-selfheal.c
index 8b6bdabd681..3e24065227c 100644
--- a/xlators/cluster/dht/src/dht-selfheal.c
+++ b/xlators/cluster/dht/src/dht-selfheal.c
@@ -1,473 +1,2600 @@
 /*
-   Copyright (c) 2008-2009 Z RESEARCH, Inc. <http://www.zresearch.com>
-   This file is part of GlusterFS.
-
-   GlusterFS is free software; you can redistribute it and/or modify
-   it under the terms of the GNU General Public License as published
-   by the Free Software Foundation; either version 3 of the License,
-   or (at your option) any later version.
-
-   GlusterFS is distributed in the hope that it will be useful, but
-   WITHOUT ANY WARRANTY; without even the implied warranty of
-   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
-   General Public License for more details.
-
-   You should have received a copy of the GNU General Public License
-   along with this program.  If not, see
-   <http://www.gnu.org/licenses/>.
-*/
+  Copyright (c) 2008-2012 Red Hat, Inc. <http://www.redhat.com>
+  This file is part of GlusterFS.
 
-#ifndef _CONFIG_H
-#define _CONFIG_H
-#include "config.h"
-#endif
+  This file is licensed to you under your choice of the GNU Lesser
+  General Public License, version 3 or any later version (LGPLv3 or
+  later), or the GNU General Public License, version 2 (GPLv2), in all
+  cases as published by the Free Software Foundation.
+*/
 
+#include "dht-lock.h"
+
+/* Set list[i] of layout to [srt, srt + chunk - 1]; expects 'this' in scope. */
+#define DHT_SET_LAYOUT_RANGE(layout, i, srt, chunk, path)                      \
+    do {                                                                       \
+        (layout)->list[(i)].start = (srt);                                     \
+        (layout)->list[(i)].stop = (srt) + (chunk) - 1;                        \
+        (layout)->list[(i)].commit_hash = (layout)->commit_hash;               \
+        gf_msg_trace(this->name, 0,                                            \
+                     "gave fix: 0x%x - 0x%x, with commit-hash 0x%x"            \
+                     " on %s for %s",                                          \
+                     (layout)->list[(i)].start, (layout)->list[(i)].stop,      \
+                     (layout)->list[(i)].commit_hash,                          \
+                     (layout)->list[(i)].xlator->name, (path));                \
+    } while (0)
+
+/* Zero the start/stop range of every entry in layout. */
+#define DHT_RESET_LAYOUT_RANGE(layout)                                         \
+    do {                                                                       \
+        int cnt = 0;                                                           \
+        for (cnt = 0; cnt < (layout)->cnt; cnt++) {                            \
+            (layout)->list[cnt].start = (layout)->list[cnt].stop = 0;          \
+        }                                                                      \
+    } while (0)
+
+static int
+dht_selfheal_layout_lock(call_frame_t *frame, dht_layout_t *layout,
+                         gf_boolean_t newdir, dht_selfheal_layout_t healer,
+                         dht_need_heal_t should_heal);
+
+static uint32_t
+dht_overlap_calc(dht_layout_t *old, int o, dht_layout_t *new, int n)
+{
+    if (o >= old->cnt || n >= new->cnt)
+        return 0;
 
-#include "glusterfs.h"
-#include "xlator.h"
-#include "dht-common.h"
+    if (old->list[o].err > 0 || new->list[n].err > 0)
+        return 0;
 
+    if (old->list[o].start == old->list[o].stop) {
+        return 0;
+    }
 
-int
-dht_selfheal_dir_finish (call_frame_t *frame, xlator_t *this, int ret)
-{
-	dht_local_t  *local = NULL;
+    if (new->list[n].start == new->list[n].stop) {
+        return 0;
+    }
 
+    if ((old->list[o].start > new->list[n].stop) ||
+        (old->list[o].stop < new->list[n].start))
+        return 0;
 
-	local = frame->local;
-	local->selfheal.dir_cbk (frame, NULL, frame->this, ret,
-				 local->op_errno);
+    return min(old->list[o].stop, new->list[n].stop) -
+           max(old->list[o].start, new->list[n].start) + 1;
+}
 
-	return 0;
+int
+dht_selfheal_unlock_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+                        int32_t op_ret, int32_t op_errno, dict_t *xdata)
+{
+    DHT_STACK_DESTROY(frame);
+    return 0;
 }
 
+/* Release local->lock[0].ns.directory_ns and any parent-layout inodelks and,
+ * when invoke_cbk is set, report 'ret' via local->selfheal.dir_cbk. */
+int
+dht_selfheal_dir_finish(call_frame_t *frame, xlator_t *this, int ret,
+                        int invoke_cbk)
+{
+    dht_local_t *local = frame->local, *unlock_local = NULL;
+    call_frame_t *unlock_frame = NULL;
+    int held = 0;
+
+    /* Unlock entrylk */
+    dht_unlock_entrylk_wrapper(frame, &local->lock[0].ns.directory_ns);
+
+    /* Unlock inodelk, unless dht_lock_count() reports none */
+    held = dht_lock_count(local->lock[0].ns.parent_layout.locks,
+                          local->lock[0].ns.parent_layout.lk_count);
+    if (held == 0)
+        goto done;
+
+    unlock_frame = copy_frame(frame);
+    if (unlock_frame == NULL)
+        goto done;
+
+    unlock_local = dht_local_init(unlock_frame, &local->loc, NULL,
+                                  unlock_frame->root->op);
+    if (unlock_local == NULL)
+        goto done;
+
+    /* Move the lock array to the copied frame's local, then clear ours */
+    unlock_local->lock[0].ns.parent_layout.lk_count =
+        local->lock[0].ns.parent_layout.lk_count;
+    unlock_local->lock[0].ns.parent_layout.locks =
+        local->lock[0].ns.parent_layout.locks;
+
+    local->lock[0].ns.parent_layout.lk_count = 0;
+    local->lock[0].ns.parent_layout.locks = NULL;
+
+    dht_unlock_inodelk(unlock_frame,
+                       unlock_local->lock[0].ns.parent_layout.locks,
+                       unlock_local->lock[0].ns.parent_layout.lk_count,
+                       dht_selfheal_unlock_cbk);
+    unlock_frame = NULL; /* dht_selfheal_unlock_cbk destroys it */
+
+done:
+    if (invoke_cbk)
+        local->selfheal.dir_cbk(frame, NULL, frame->this, ret, local->op_errno,
+                                NULL);
+    if (unlock_frame != NULL) {
+        DHT_STACK_DESTROY(unlock_frame);
+    }
+
+    return 0;
+}
 
 int
-dht_selfheal_dir_xattr_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
-			    int op_ret, int op_errno)
+dht_refresh_layout_done(call_frame_t *frame)
 {
-	dht_local_t  *local = NULL;
-	call_frame_t *prev = NULL;
-	xlator_t     *subvol = NULL;
-	int           i = 0;
-	dht_layout_t *layout = NULL;
-	int           err = 0;
-	int           this_call_cnt = 0;
+    int ret = -1;
+    dht_layout_t *refreshed = NULL, *heal = NULL;
+    dht_local_t *local = NULL;
+    dht_need_heal_t should_heal = NULL;
+    dht_selfheal_layout_t healer = NULL;
 
-	local = frame->local;
-	layout = local->selfheal.layout;
-	prev = cookie;
-	subvol = prev->this;
+    local = frame->local;
 
-	if (op_ret == 0)
-		err = 0;
-	else
-		err = op_errno;
+    refreshed = local->selfheal.refreshed_layout;
+    heal = local->selfheal.layout;
 
-	for (i = 0; i < layout->cnt; i++) {
-		if (layout->list[i].xlator == subvol) {
-			layout->list[i].err = err;
-			break;
-		}
-	}
+    healer = local->selfheal.healer;
+    should_heal = local->selfheal.should_heal;
 
-	this_call_cnt = dht_frame_return (frame);
+    ret = dht_layout_sort(refreshed);
+    if (ret == -1) {
+        gf_smsg(frame->this->name, GF_LOG_WARNING, 0,
+                DHT_MSG_LAYOUT_SORT_FAILED, NULL);
+        goto err;
+    }
 
-	if (is_last_call (this_call_cnt)) {
-		dht_selfheal_dir_finish (frame, this, 0);
-	}
+    if (should_heal(frame, &heal, &refreshed)) {
+        healer(frame, &local->loc, heal);
+    } else {
+        local->selfheal.layout = NULL;
+        local->selfheal.refreshed_layout = NULL;
+        local->selfheal.layout = refreshed;
 
-	return 0;
-}
+        dht_layout_unref(frame->this, heal);
+
+        dht_selfheal_dir_finish(frame, frame->this, 0, 1);
+    }
+
+    return 0;
 
+err:
+    dht_selfheal_dir_finish(frame, frame->this, -1, 1);
+    return 0;
+}
 
 int
-dht_selfheal_dir_xattr_persubvol (call_frame_t *frame, loc_t *loc,
-				  dht_layout_t *layout, int i)
+dht_refresh_layout_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+                       int op_ret, int op_errno, inode_t *inode,
+                       struct iatt *stbuf, dict_t *xattr,
+                       struct iatt *postparent)
 {
-	xlator_t          *subvol = NULL;
-	dict_t            *xattr = NULL;
-	int                ret = 0;
-	xlator_t          *this = NULL;
-	int32_t           *disk_layout = NULL;
+    dht_local_t *local = NULL;
+    int this_call_cnt = 0;
+    xlator_t *prev = NULL;
+    dht_layout_t *layout = NULL;
+    char gfid[GF_UUID_BUF_SIZE] = {
+        0,
+    };
+
+    GF_VALIDATE_OR_GOTO("dht", frame, err);
+    GF_VALIDATE_OR_GOTO("dht", this, err);
+    GF_VALIDATE_OR_GOTO("dht", frame->local, err);
+    GF_VALIDATE_OR_GOTO("dht", this->private, err);
 
+    local = frame->local;
+    prev = cookie;
 
-	subvol = layout->list[i].xlator;
-	this = frame->this;
+    layout = local->selfheal.refreshed_layout;
 
-	xattr = get_new_dict ();
-	if (!xattr) {
-		gf_log (this->name, GF_LOG_ERROR,
-			"memory allocation failed :(");
-		goto err;
-	}
+    LOCK(&frame->lock);
+    {
+        op_ret = dht_layout_merge(this, layout, prev, op_ret, op_errno, xattr);
 
-	ret = dht_disk_layout_extract (this, layout, i, &disk_layout);
-	if (ret == -1) {
-		gf_log (this->name, GF_LOG_ERROR,
-			"failed to extract disk layout");
-		goto err;
-	}
+        dht_iatt_merge(this, &local->stbuf, stbuf);
 
-	ret = dict_set_bin (xattr, "trusted.glusterfs.dht",
-			    disk_layout, 4 * 4);
-	if (ret == -1) {
-		gf_log (this->name, GF_LOG_ERROR,
-			"failed to set xattr dictionary");
-		goto err;
-	}
-	disk_layout = NULL;
+        if (op_ret == -1) {
+            gf_uuid_unparse(local->loc.gfid, gfid);
+            local->op_errno = op_errno;
+            gf_smsg(this->name, GF_LOG_ERROR, op_errno,
+                    DHT_MSG_FILE_LOOKUP_FAILED, "path=%s", local->loc.path,
+                    "name=%s", prev->name, "gfid=%s", gfid, NULL);
 
-	gf_log (this->name, GF_LOG_DEBUG,
-		"setting hash range %u - %u (type %d) on subvolume %s for %s",
-		layout->list[i].start, layout->list[i].stop,
-		layout->type, subvol->name, loc->path);
+            goto unlock;
+        }
 
-	dict_ref (xattr);
+        local->op_ret = 0;
+    }
+unlock:
+    UNLOCK(&frame->lock);
 
-	STACK_WIND (frame, dht_selfheal_dir_xattr_cbk,
-		    subvol, subvol->fops->setxattr,
-		    loc, xattr, 0);
+    this_call_cnt = dht_frame_return(frame);
 
-	dict_unref (xattr);
+    if (is_last_call(this_call_cnt)) {
+        if (local->op_ret == 0) {
+            local->refresh_layout_done(frame);
+        } else {
+            goto err;
+        }
+    }
 
-	return 0;
+    return 0;
 
 err:
-	if (xattr)
-		dict_destroy (xattr);
+    if (local) {
+        local->refresh_layout_unlock(frame, this, -1, 1);
+    }
+    return 0;
+}
+
+/* Re-read the directory layout: allocate a fresh selfheal.refreshed_layout,
+ * make sure the request dict asks for conf->xattr_name and wind a lookup to
+ * every subvolume; setup failures go to local->refresh_layout_unlock(). */
+int
+dht_refresh_layout(call_frame_t *frame)
+{
+    xlator_t *this = NULL;
+    dht_conf_t *conf = NULL;
+    dht_local_t *local = NULL;
+    int call_cnt = 0, i = 0, ret = -1;
+    char gfid[GF_UUID_BUF_SIZE] = {0};
+
+    GF_VALIDATE_OR_GOTO("dht", frame, out);
+    GF_VALIDATE_OR_GOTO("dht", frame->local, out);
+
+    local = frame->local;
+    this = frame->this;
+    conf = this->private;
+
+    call_cnt = conf->subvolume_cnt;
+    local->call_cnt = call_cnt;
+    local->op_ret = -1;
+
+    if (local->selfheal.refreshed_layout) {
+        dht_layout_unref(this, local->selfheal.refreshed_layout);
+        local->selfheal.refreshed_layout = NULL;
+    }
+
+    local->selfheal.refreshed_layout = dht_layout_new(this,
+                                                      conf->subvolume_cnt);
+    if (local->selfheal.refreshed_layout == NULL) {
+        gf_uuid_unparse(local->loc.gfid, gfid);
+        gf_smsg(this->name, GF_LOG_ERROR, ENOMEM, DHT_MSG_MEM_ALLOC_FAILED,
+                "path=%s", local->loc.path, "gfid=%s", gfid, NULL);
+        goto out;
+    }
+
+    if (local->xattr != NULL)
+        dict_del(local->xattr, conf->xattr_name);
+
+    if (local->xattr_req == NULL) {
+        local->xattr_req = dict_new();
+        if (local->xattr_req == NULL) {
+            gf_uuid_unparse(local->loc.gfid, gfid);
+            gf_smsg(this->name, GF_LOG_ERROR, ENOMEM, DHT_MSG_NO_MEMORY,
+                    "path=%s", local->loc.path, "gfid=%s", gfid, NULL);
+            goto out;
+        }
+    }
+
+    /* Request the layout xattr unless the dict already carries the key; a
+     * failure to set it is only logged and the lookups are still wound. */
+    if (!dict_get(local->xattr_req, conf->xattr_name)) {
+        ret = dict_set_uint32(local->xattr_req, conf->xattr_name, 4 * 4);
+        if (ret)
+            gf_smsg(this->name, GF_LOG_WARNING, 0, DHT_MSG_DICT_SET_FAILED,
+                    "path=%s", local->loc.path, "key=%s", conf->xattr_name,
+                    NULL);
+    }
+
+    for (i = 0; i < call_cnt; i++) {
+        STACK_WIND_COOKIE(frame, dht_refresh_layout_cbk, conf->subvolumes[i],
+                          conf->subvolumes[i],
+                          conf->subvolumes[i]->fops->lookup, &local->loc,
+                          local->xattr_req);
+    }
+
+    return 0;
+
+out:
+    if (local)
+        local->refresh_layout_unlock(frame, this, -1, 1);
+    return 0;
+}
 
-	if (disk_layout)
-		FREE (disk_layout);
+int32_t
+dht_selfheal_layout_lock_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+                             int32_t op_ret, int32_t op_errno, dict_t *xdata)
+{
+    dht_local_t *local = NULL;
+
+    local = frame->local;
+
+    if (!local) {
+        goto err;
+    }
+
+    if (op_ret < 0) {
+        local->op_errno = op_errno;
+        goto err;
+    }
+
+    local->refresh_layout_unlock = dht_selfheal_dir_finish;
+    local->refresh_layout_done = dht_refresh_layout_done;
+
+    dht_refresh_layout(frame);
+    return 0;
 
-	dht_selfheal_dir_xattr_cbk (frame, subvol, frame->this,
-				    -1, ENOMEM);
-	return 0;
+err:
+    dht_selfheal_dir_finish(frame, this, -1, 1);
+    return 0;
 }
 
+/* Decide whether the directory layout must be healed. Runs
+ * dht_layout_anomalies() on *ondisk: true when it counts holes or overlaps;
+ * otherwise true only if force_mkdir / dht_layout_missing_dirs(*heal) is
+ * non-zero, and then *heal and *ondisk are swapped. True on NULL arguments. */
+gf_boolean_t
+dht_should_heal_layout(call_frame_t *frame, dht_layout_t **heal,
+                       dht_layout_t **ondisk)
+{
+    dht_local_t *local = frame->local;
+    dht_layout_t *swap = NULL;
+    int heal_missing_dirs = 0;
+
+    if ((heal == NULL) || (*heal == NULL) || (ondisk == NULL) ||
+        (*ondisk == NULL))
+        return _gf_true;
+
+    dht_layout_anomalies(
+        frame->this, &local->loc, *ondisk, &local->selfheal.hole_cnt,
+        &local->selfheal.overlaps_cnt, &local->selfheal.missing_cnt,
+        &local->selfheal.down, &local->selfheal.misc, NULL);
+
+    /* Directories might've been created as part of this self-heal. We've to
+     * sync non-layout xattrs and set range 0-0 on new directories
+     */
+    if (local->selfheal.force_mkdir)
+        heal_missing_dirs = local->selfheal.force_mkdir;
+    else
+        heal_missing_dirs = dht_layout_missing_dirs(*heal);
+
+    if (local->selfheal.hole_cnt || local->selfheal.overlaps_cnt)
+        return _gf_true;
+
+    if (!heal_missing_dirs)
+        return _gf_false;
+
+    /* Just added a brick and need to set 0-0 range on this brick.
+     * But ondisk layout is well-formed. So, swap layouts "heal" and
+     * "ondisk". Now "ondisk" layout will be used for healing
+     * xattrs. If there are any non-participating subvols in
+     * "ondisk" layout, dht_selfheal_dir_xattr_persubvol will set
+     * 0-0 and non-layout xattrs. This way we won't end up in
+     * "corrupting" already set and well-formed "ondisk" layout.
+     */
+    swap = *heal;
+    *heal = *ondisk;
+    *ondisk = swap;
+
+    /* Current selfheal code, heals non-layout xattrs only after
+     * an add-brick. In fact non-layout xattrs are considered as
+     * secondary citizens which are healed only if layout xattrs
+     * need to be healed. This is wrong, since for eg., quota can be
+     * set when layout is well-formed, but a node is down. Also,
+     * just for healing non-layout xattrs, we don't need locking.
+     * This issue is _NOT FIXED_ by this patch.
+     */
+
+    return _gf_true;
+}
+
+/* Count the error-free layout entries whose start and stop differ. */
+int
+dht_layout_span(dht_layout_t *layout)
+{
+    int idx = 0;
+    int spanning = 0;
+
+    while (idx < layout->cnt) {
+        if (!layout->list[idx].err &&
+            layout->list[idx].start != layout->list[idx].stop)
+            spanning++;
+        idx++;
+    }
+    return spanning;
+}
 
 int
-dht_selfheal_dir_xattr (call_frame_t *frame, loc_t *loc, dht_layout_t *layout)
+dht_decommissioned_bricks_in_layout(xlator_t *this, dht_layout_t *layout)
+{
+    dht_conf_t *conf = NULL;
+    int count = 0, i = 0, j = 0;
+
+    if ((this == NULL) || (layout == NULL))
+        goto out;
+
+    conf = this->private;
+
+    for (i = 0; i < layout->cnt; i++) {
+        for (j = 0; j < conf->subvolume_cnt; j++) {
+            if (conf->decommissioned_bricks[j] &&
+                conf->decommissioned_bricks[j] == layout->list[i].xlator) {
+                count++;
+            }
+        }
+    }
+
+out:
+    return count;
+}
+
+/* Classify a layout: WEIGHTED once an entry of non-zero width differs from
+ * the reference width (the first non-zero width seen) by more than
+ * layout->cnt, EQUAL otherwise - including for NULL/empty arguments. */
+dht_distribution_type_t
+dht_distribution_type(xlator_t *this, dht_layout_t *layout)
+{
+    uint32_t reference = 0, width = 0, delta = 0;
+    int idx = 0;
+
+    if ((this == NULL) || (layout == NULL) || (layout->cnt < 1))
+        return GF_DHT_EQUAL_DISTRIBUTION;
+
+    for (idx = 0; idx < layout->cnt; idx++) {
+        width = layout->list[idx].stop - layout->list[idx].start;
+
+        /* Entries are taken as the reference until one is non-zero */
+        if (reference == 0) {
+            reference = width;
+            continue;
+        }
+
+        delta = (width >= reference) ? width - reference
+                                     : reference - width;
+
+        if ((width != 0) && (delta > layout->cnt))
+            return GF_DHT_WEIGHTED_DISTRIBUTION;
+    }
+
+    return GF_DHT_EQUAL_DISTRIBUTION;
+}
+
+gf_boolean_t
+dht_should_fix_layout(call_frame_t *frame, dht_layout_t **inmem,
+                      dht_layout_t **ondisk)
 {
-	dht_local_t *local = NULL;
-	int          missing_xattr = 0;
-	int          i = 0;
-	int          ret = 0;
-	xlator_t    *this = NULL;
+    gf_boolean_t fixit = _gf_true;
+
+    dht_local_t *local = NULL;
+    int layout_span = 0;
+    int decommissioned_bricks = 0;
+    dht_conf_t *conf = NULL;
+    dht_distribution_type_t inmem_dist_type = 0;
+    dht_distribution_type_t ondisk_dist_type = 0;
+
+    conf = frame->this->private;
+
+    local = frame->local;
+
+    if ((inmem == NULL) || (*inmem == NULL) || (ondisk == NULL) ||
+        (*ondisk == NULL))
+        goto out;
+
+    dht_layout_anomalies(frame->this, &local->loc, *ondisk,
+                         &local->selfheal.hole_cnt,
+                         &local->selfheal.overlaps_cnt, NULL,
+                         &local->selfheal.down, &local->selfheal.misc, NULL);
+
+    if (local->selfheal.down || local->selfheal.misc) {
+        fixit = _gf_false;
+        goto out;
+    }
+
+    if (local->selfheal.hole_cnt || local->selfheal.overlaps_cnt)
+        goto out;
 
-	local = frame->local;
-	this = frame->this;
+    /* If commit hashes are being updated, let it through */
+    if ((*inmem)->commit_hash != (*ondisk)->commit_hash)
+        goto out;
 
-	for (i = 0; i < layout->cnt; i++) {
-		if (layout->list[i].err != -1 || !layout->list[i].stop) {
-			/* err != -1 would mean xattr present on the directory
-			 * or the directory is itself non existant.
-			 * !layout->list[i].stop would mean layout absent
-			 */
-			continue;
-		}
-		missing_xattr++;
-	}
+    layout_span = dht_layout_span(*ondisk);
 
-	gf_log (this->name, GF_LOG_DEBUG,
-		"%d subvolumes missing xattr for %s",
-		missing_xattr, loc->path);
+    decommissioned_bricks = dht_decommissioned_bricks_in_layout(frame->this,
+                                                                *ondisk);
+    inmem_dist_type = dht_distribution_type(frame->this, *inmem);
+    ondisk_dist_type = dht_distribution_type(frame->this, *ondisk);
 
-	if (missing_xattr == 0) {
-		dht_selfheal_dir_finish (frame, this, 0);
-		return 0;
-	}
+    if ((decommissioned_bricks == 0) &&
+        (layout_span ==
+         (conf->subvolume_cnt - conf->decommission_subvols_cnt)) &&
+        (inmem_dist_type == ondisk_dist_type))
+        fixit = _gf_false;
 
-	local->call_cnt = missing_xattr;
+out:
 
-	for (i = 0; i < layout->cnt; i++) {
-		if (layout->list[i].err != -1 || !layout->list[i].stop)
-			continue;
+    return fixit;
+}
 
-		ret = dht_selfheal_dir_xattr_persubvol (frame, loc, layout, i);
+static int
+dht_selfheal_layout_lock(call_frame_t *frame, dht_layout_t *layout,
+                         gf_boolean_t newdir, dht_selfheal_layout_t healer,
+                         dht_need_heal_t should_heal)
+{
+    dht_local_t *local = NULL;
+    int count = 1, ret = -1, i = 0;
+    dht_lock_t **lk_array = NULL;
+    dht_conf_t *conf = NULL;
+    dht_layout_t *tmp = NULL;
+    char gfid[GF_UUID_BUF_SIZE] = {0};
+
+    GF_VALIDATE_OR_GOTO("dht", frame, err);
+    GF_VALIDATE_OR_GOTO(frame->this->name, frame->local, err);
+
+    local = frame->local;
+
+    conf = frame->this->private;
+
+    local->selfheal.healer = healer;
+    local->selfheal.should_heal = should_heal;
+
+    tmp = local->selfheal.layout;
+    local->selfheal.layout = dht_layout_ref(frame->this, layout);
+    dht_layout_unref(frame->this, tmp);
+
+    if (!newdir) {
+        count = conf->subvolume_cnt;
+
+        lk_array = GF_CALLOC(count, sizeof(*lk_array), gf_common_mt_char);
+        if (lk_array == NULL) {
+            gf_uuid_unparse(local->stbuf.ia_gfid, gfid);
+            gf_smsg("dht", GF_LOG_ERROR, ENOMEM, DHT_MSG_MEM_ALLOC_FAILED,
+                    "lk_array-gfid=%s", gfid, "path=%s", local->loc.path, NULL);
+            goto err;
+        }
+
+        for (i = 0; i < count; i++) {
+            lk_array[i] = dht_lock_new(
+                frame->this, conf->subvolumes[i], &local->loc, F_WRLCK,
+                DHT_LAYOUT_HEAL_DOMAIN, NULL, FAIL_ON_ANY_ERROR);
+            if (lk_array[i] == NULL) {
+                gf_uuid_unparse(local->stbuf.ia_gfid, gfid);
+                gf_smsg(THIS->name, GF_LOG_ERROR, ENOMEM,
+                        DHT_MSG_MEM_ALLOC_FAILED, "lk_array-gfid=%s", gfid,
+                        "path=%s", local->loc.path, NULL);
+                goto err;
+            }
+        }
+    } else {
+        count = 1;
+        lk_array = GF_CALLOC(count, sizeof(*lk_array), gf_common_mt_char);
+        if (lk_array == NULL) {
+            gf_uuid_unparse(local->stbuf.ia_gfid, gfid);
+            gf_smsg(THIS->name, GF_LOG_ERROR, ENOMEM, DHT_MSG_MEM_ALLOC_FAILED,
+                    "lk_array-gfid=%s", gfid, "path=%s", local->loc.path, NULL);
+            goto err;
+        }
+
+        lk_array[0] = dht_lock_new(frame->this, local->hashed_subvol,
+                                   &local->loc, F_WRLCK, DHT_LAYOUT_HEAL_DOMAIN,
+                                   NULL, FAIL_ON_ANY_ERROR);
+        if (lk_array[0] == NULL) {
+            gf_uuid_unparse(local->stbuf.ia_gfid, gfid);
+            gf_smsg(THIS->name, GF_LOG_ERROR, ENOMEM, DHT_MSG_MEM_ALLOC_FAILED,
+                    "lk_array-gfid=%s", gfid, "path=%s", local->loc.path, NULL);
+            goto err;
+        }
+    }
+
+    local->lock[0].layout.my_layout.locks = lk_array;
+    local->lock[0].layout.my_layout.lk_count = count;
+
+    ret = dht_blocking_inodelk(frame, lk_array, count,
+                               dht_selfheal_layout_lock_cbk);
+    if (ret < 0) {
+        local->lock[0].layout.my_layout.locks = NULL;
+        local->lock[0].layout.my_layout.lk_count = 0;
+        goto err;
+    }
+
+    return 0;
+err:
+    if (lk_array != NULL) {
+        dht_lock_array_free(lk_array, count);
+        GF_FREE(lk_array);
+    }
 
-		if (--missing_xattr == 0)
-			break;
-	}
-	return 0;
+    return -1;
 }
 
+/* setxattr callback of the layout self-heal (also invoked directly by
+ * dht_selfheal_dir_xattr_persubvol() on setup failure). Stores the result for
+ * the replying subvolume (passed as cookie) in its layout entry, merges the
+ * iatt read from xdata (if any) into local->stbuf and, on the last reply,
+ * calls dht_selfheal_dir_finish(frame, this, 0, 1). */
+static int
+dht_selfheal_dir_xattr_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+                           int op_ret, int op_errno, dict_t *xdata)
+{
+    dht_local_t *local = frame->local;
+    dht_layout_t *layout = local->selfheal.layout;
+    xlator_t *subvol = cookie;
+    struct iatt *stbuf = NULL;
+    int i = 0;
+    int ret = 0;
+    int err = 0;
+    int remaining = 0;
+    char gfid[GF_UUID_BUF_SIZE] = {0};
+
+    if (op_ret != 0) {
+        gf_uuid_unparse(local->loc.gfid, gfid);
+        gf_smsg(this->name, GF_LOG_ERROR, op_errno,
+                DHT_MSG_DIR_SELFHEAL_XATTR_FAILED, "name=%s", subvol->name,
+                "path=%s", local->loc.path, "gfid=%s", gfid, NULL);
+        err = op_errno;
+    }
+
+    ret = dict_get_bin(xdata, DHT_IATT_IN_XDATA_KEY, (void **)&stbuf);
+    if (ret < 0) {
+        gf_uuid_unparse(local->loc.gfid, gfid);
+        gf_msg_debug(this->name, 0,
+                     "key = %s not present in dict"
+                     ", path:%s gfid:%s",
+                     DHT_IATT_IN_XDATA_KEY, local->loc.path, gfid);
+    }
+
+    /* Record the outcome (0 or the errno) on this subvolume's entry */
+    for (i = 0; i < layout->cnt; i++) {
+        if (layout->list[i].xlator != subvol)
+            continue;
+        layout->list[i].err = err;
+        break;
+    }
+
+    LOCK(&frame->lock);
+    {
+        dht_iatt_merge(this, &local->stbuf, stbuf);
+    }
+    UNLOCK(&frame->lock);
+
+    remaining = dht_frame_return(frame);
+
+    if (is_last_call(remaining)) {
+        dht_selfheal_dir_finish(frame, this, 0, 1);
+    }
+
+    return 0;
+}
 
+/* Code is required to set user xattr to local->xattr
+ */
 int
-dht_selfheal_dir_mkdir_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
-			    int op_ret, int op_errno,
-			    inode_t *inode, struct stat *stbuf)
+dht_set_user_xattr(dict_t *dict, char *k, data_t *v, void *data)
 {
-	dht_local_t   *local = NULL;
-	dht_layout_t  *layout = NULL;
-	call_frame_t  *prev = NULL;
-	xlator_t      *subvol = NULL;
-	int            i = 0;
-	int            this_call_cnt = 0;
+    dict_t *set_xattr = data;
+    int ret = -1;
 
+    ret = dict_set(set_xattr, k, v);
+    return ret;
+}
 
-	local  = frame->local;
-	layout = local->selfheal.layout;
-	prev   = cookie;
-	subvol = prev->this;
+/* Wind a setxattr with the on-disk form of layout->list[i] (plus any quota
+ * limit keys held in local->xattr) to req_subvol or, when that is NULL, to
+ * layout->list[i].xlator. Failures before the wind are reported by calling
+ * dht_selfheal_dir_xattr_cbk() directly with -1/ENOMEM. */
+static int
+dht_selfheal_dir_xattr_persubvol(call_frame_t *frame, loc_t *loc,
+                                 dht_layout_t *layout, int i,
+                                 xlator_t *req_subvol)
+{
+    xlator_t *subvol = NULL;
+    dict_t *xattr = NULL;
+    dict_t *xdata = NULL;
+    int ret = 0;
+    xlator_t *this = NULL;
+    int32_t *disk_layout = NULL;
+    dht_local_t *local = NULL;
+    dht_conf_t *conf = NULL;
+    data_t *data = NULL;
+    char gfid[GF_UUID_BUF_SIZE] = {0};
+
+    local = frame->local;
+    subvol = req_subvol ? req_subvol : layout->list[i].xlator;
+    this = frame->this;
+
+    GF_VALIDATE_OR_GOTO("", this, err);
+    GF_VALIDATE_OR_GOTO(this->name, layout, err);
+    GF_VALIDATE_OR_GOTO(this->name, local, err);
+    GF_VALIDATE_OR_GOTO(this->name, subvol, err);
+    VALIDATE_OR_GOTO(this->private, err);
+
+    conf = this->private;
+    /* Unparse the gfid first: every failure message below logs it. */
+    gf_uuid_unparse(loc->inode->gfid, gfid);
+
+    xattr = dict_new();
+    if (!xattr)
+        goto err;
+
+    xdata = dict_new();
+    if (!xdata)
+        goto err;
+
+    ret = dict_set_str(xdata, GLUSTERFS_INTERNAL_FOP_KEY, "yes");
+    if (ret < 0) {
+        gf_smsg(this->name, GF_LOG_WARNING, 0, DHT_MSG_DICT_SET_FAILED,
+                "path=%s", loc->path, "key=%s", GLUSTERFS_INTERNAL_FOP_KEY,
+                "gfid=%s", gfid, NULL);
+        goto err;
+    }
+
+    ret = dict_set_int8(xdata, DHT_IATT_IN_XDATA_KEY, 1);
+    if (ret < 0) {
+        gf_smsg(this->name, GF_LOG_WARNING, 0, DHT_MSG_DICT_SET_FAILED,
+                "path=%s", loc->path, "key=%s", DHT_IATT_IN_XDATA_KEY,
+                "gfid=%s", gfid, NULL);
+        goto err;
+    }
+
+    ret = dht_disk_layout_extract(this, layout, i, &disk_layout);
+    if (ret == -1) {
+        gf_smsg(this->name, GF_LOG_WARNING, 0,
+                DHT_MSG_DIR_SELFHEAL_XATTR_FAILED,
+                "extract-disk-layout-failed, path=%s", loc->path, "subvol=%s",
+                subvol->name, "gfid=%s", gfid, NULL);
+        goto err;
+    }
+
+    ret = dict_set_bin(xattr, conf->xattr_name, disk_layout, 4 * 4);
+    if (ret == -1) {
+        gf_smsg(this->name, GF_LOG_WARNING, 0,
+                DHT_MSG_DIR_SELFHEAL_XATTR_FAILED, "path=%s", loc->path,
+                "subvol=%s", subvol->name,
+                "set-xattr-dictionary-failed, "
+                "gfid=%s",
+                gfid, NULL);
+        goto err;
+    }
+    disk_layout = NULL;
+
+    gf_msg_trace(this->name, 0,
+                 "setting hash range 0x%x - 0x%x (type %d) on subvolume %s"
+                 " for %s",
+                 layout->list[i].start, layout->list[i].stop, layout->type,
+                 subvol->name, loc->path);
+
+    if (local->xattr) {
+        data = dict_get(local->xattr, QUOTA_LIMIT_KEY);
+        if (data) {
+            ret = dict_add(xattr, QUOTA_LIMIT_KEY, data);
+            if (ret) {
+                gf_smsg(this->name, GF_LOG_ERROR, 0, DHT_MSG_DICT_SET_FAILED,
+                        "path=%s", loc->path, "key=%s", QUOTA_LIMIT_KEY, NULL);
+            }
+        }
+        data = dict_get(local->xattr, QUOTA_LIMIT_OBJECTS_KEY);
+        if (data) {
+            ret = dict_add(xattr, QUOTA_LIMIT_OBJECTS_KEY, data);
+            if (ret) {
+                gf_smsg(this->name, GF_LOG_ERROR, 0, DHT_MSG_DICT_SET_FAILED,
+                        "path=%s", loc->path, "key=%s", QUOTA_LIMIT_OBJECTS_KEY,
+                        NULL);
+            }
+        }
+    }
+
+    if (!gf_uuid_is_null(local->gfid))
+        gf_uuid_copy(loc->gfid, local->gfid);
+
+    STACK_WIND_COOKIE(frame, dht_selfheal_dir_xattr_cbk, (void *)subvol, subvol,
+                      subvol->fops->setxattr, loc, xattr, 0, xdata);
+
+    dict_unref(xattr);
+    dict_unref(xdata);
+
+    return 0;
 
-	if ((op_ret == 0) || (op_errno == EEXIST)) {
-		for (i = 0; i < layout->cnt; i++) {
-			if (layout->list[i].xlator == subvol) {
-				layout->list[i].err = -1;
-				break;
-			}
-		}
-	}
+err:
+    if (xattr)
+        dict_unref(xattr);
+    if (xdata)
+        dict_unref(xdata);
 
-	this_call_cnt = dht_frame_return (frame);
+    GF_FREE(disk_layout);
 
-	if (is_last_call (this_call_cnt)) {
-		dht_selfheal_dir_xattr (frame, &local->loc, layout);
-	}
+    dht_selfheal_dir_xattr_cbk(frame, (void *)subvol, frame->this, -1, ENOMEM,
+                               NULL);
+    return 0;
+}
+
+/* Write a layout xattr on every subvolume: entries of 'layout' get their own
+ * range, subvolumes absent from 'layout' get a dummy layout. local->call_cnt
+ * is conf->subvolume_cnt, so that many xattr callbacks must be triggered. */
+static int
+dht_fix_dir_xattr(call_frame_t *frame, loc_t *loc, dht_layout_t *layout)
+{
+    dht_local_t *local = frame->local;
+    xlator_t *this = frame->this;
+    dht_conf_t *conf = this->private;
+    dht_layout_t *dummy = NULL;
+    int i = 0, count = 0;
+
+    gf_msg_debug(this->name, 0, "%s: Writing the new range for all subvolumes",
+                 loc->path);
+
+    local->call_cnt = count = conf->subvolume_cnt;
+    if (gf_log_get_loglevel() >= GF_LOG_DEBUG)
+        dht_log_new_layout_for_dir_selfheal(this, loc, layout);
+
+    for (i = 0; i < layout->cnt; i++) {
+        dht_selfheal_dir_xattr_persubvol(frame, loc, layout, i, NULL);
+        if (--count == 0)
+            goto out;
+    }
+    /* if we are here, subvolcount > layout_count. subvols-per-directory
+     * option might be set here. We need to clear out layout from the
+     * non-participating subvolumes, else it will result in overlaps */
+    dummy = dht_layout_new(this, 1);
+    if (dummy)
+        dummy->commit_hash = layout->commit_hash;
+    for (i = 0; i < conf->subvolume_cnt; i++) {
+        if (dht_is_subvol_in_layout(layout, conf->subvolumes[i]))
+            continue;
+        /* Allocation failed: fail via the callback so call_cnt still drains */
+        if (dummy)
+            dht_selfheal_dir_xattr_persubvol(frame, loc, dummy, 0,
+                                             conf->subvolumes[i]);
+        else
+            dht_selfheal_dir_xattr_cbk(frame, conf->subvolumes[i], this, -1,
+                                       ENOMEM, NULL);
+        if (--count == 0)
+            break;
+    }
+    if (dummy)
+        dht_layout_unref(this, dummy);
+out:
+    return 0;
+}
 
-	return 0;
+static int
+dht_selfheal_dir_xattr(call_frame_t *frame, loc_t *loc, dht_layout_t *layout)
+{
+    dht_local_t *local = NULL;
+    int missing_xattr = 0;
+    int i = 0;
+    xlator_t *this = NULL;
+    dht_conf_t *conf = NULL;
+    dht_layout_t *dummy = NULL;
+    char gfid[GF_UUID_BUF_SIZE] = {
+        0,
+    };
+
+    local = frame->local;
+    this = frame->this;
+    conf = this->private;
+
+    for (i = 0; i < layout->cnt; i++) {
+        if (layout->list[i].err != -1 || !layout->list[i].stop) {
+            /* err != -1 would mean xattr present on the directory
+             * or the directory is non existent.
+             * !layout->list[i].stop would mean layout absent
+             */
+
+            continue;
+        }
+        missing_xattr++;
+    }
+    /* Also account for subvolumes with no-layout. Used for zero'ing out
+     * the layouts and for setting quota key's if present */
+    for (i = 0; i < conf->subvolume_cnt; i++) {
+        if (_gf_false == dht_is_subvol_in_layout(layout, conf->subvolumes[i])) {
+            missing_xattr++;
+        }
+    }
+    gf_msg_trace(this->name, 0, "%d subvolumes missing xattr for %s",
+                 missing_xattr, loc->path);
+
+    if (missing_xattr == 0) {
+        dht_selfheal_dir_finish(frame, this, 0, 1);
+        return 0;
+    }
+
+    local->call_cnt = missing_xattr;
+
+    if (gf_log_get_loglevel() >= GF_LOG_DEBUG)
+        dht_log_new_layout_for_dir_selfheal(this, loc, layout);
+
+    for (i = 0; i < layout->cnt; i++) {
+        if (layout->list[i].err != -1 || !layout->list[i].stop)
+            continue;
+
+        dht_selfheal_dir_xattr_persubvol(frame, loc, layout, i, NULL);
+
+        if (--missing_xattr == 0)
+            break;
+    }
+    dummy = dht_layout_new(this, 1);
+    if (!dummy) {
+        gf_uuid_unparse(loc->gfid, gfid);
+        gf_smsg(this->name, GF_LOG_ERROR, ENOMEM, DHT_MSG_DUMMY_ALLOC_FAILED,
+                "path=%s", loc->path, "gfid=%s", gfid, NULL);
+        goto out;
+    }
+    for (i = 0; i < conf->subvolume_cnt && missing_xattr; i++) {
+        if (_gf_false == dht_is_subvol_in_layout(layout, conf->subvolumes[i])) {
+            dht_selfheal_dir_xattr_persubvol(frame, loc, dummy, 0,
+                                             conf->subvolumes[i]);
+            missing_xattr--;
+        }
+    }
+
+    dht_layout_unref(this, dummy);
+out:
+    return 0;
 }
 
+int
+dht_selfheal_dir_setattr_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+                             int op_ret, int op_errno, struct iatt *statpre,
+                             struct iatt *statpost, dict_t *xdata)
+{
+    dht_local_t *local = NULL;
+    dht_layout_t *layout = NULL;
+    int this_call_cnt = 0, ret = -1;
+
+    local = frame->local;
+    layout = local->selfheal.layout;
+
+    this_call_cnt = dht_frame_return(frame);
+
+    if (is_last_call(this_call_cnt)) {
+        if (!local->heal_layout) {
+            gf_msg_trace(this->name, 0, "Skip heal layout for %s gfid = %s ",
+                         local->loc.path, uuid_utoa(local->gfid));
+
+            dht_selfheal_dir_finish(frame, this, 0, 1);
+            return 0;
+        }
+        ret = dht_selfheal_layout_lock(frame, layout, _gf_false,
+                                       dht_selfheal_dir_xattr,
+                                       dht_should_heal_layout);
+
+        if (ret < 0) {
+            dht_selfheal_dir_finish(frame, this, -1, 1);
+        }
+    }
+
+    return 0;
+}
 
 int
-dht_selfheal_dir_mkdir (call_frame_t *frame, loc_t *loc,
-			dht_layout_t *layout, int force)
+dht_selfheal_dir_setattr(call_frame_t *frame, loc_t *loc, struct iatt *stbuf,
+                         int32_t valid, dht_layout_t *layout)
 {
-	int           missing_dirs = 0;
-	int           i = 0;
-	dht_local_t  *local = NULL;
-	xlator_t     *this = NULL;
+    int missing_attr = 0;
+    int i = 0, ret = -1;
+    dht_local_t *local = NULL;
+    dht_conf_t *conf = NULL;
+    xlator_t *this = NULL;
+    int cnt = 0;
+
+    local = frame->local;
+    this = frame->this;
+    conf = this->private;
+
+    /* We need to heal the attrs if:
+     * 1. Any directories were missing - the newly created dirs will need
+     *    to have the correct attrs set
+     * 2. An existing dir does not have the correct permissions -they may
+     *    have been changed when a brick was down.
+     */
+
+    for (i = 0; i < layout->cnt; i++) {
+        if (layout->list[i].err == -1)
+            missing_attr++;
+    }
+
+    if ((missing_attr == 0) && (local->need_attrheal == 0)) {
+        if (!local->heal_layout) {
+            gf_msg_trace(this->name, 0, "Skip heal layout for %s gfid = %s ",
+                         loc->path, uuid_utoa(loc->gfid));
+            dht_selfheal_dir_finish(frame, this, 0, 1);
+            return 0;
+        }
+        ret = dht_selfheal_layout_lock(frame, layout, _gf_false,
+                                       dht_selfheal_dir_xattr,
+                                       dht_should_heal_layout);
+
+        if (ret < 0) {
+            dht_selfheal_dir_finish(frame, this, -1, 1);
+        }
+
+        return 0;
+    }
+
+    cnt = local->call_cnt = conf->subvolume_cnt;
+
+    for (i = 0; i < cnt; i++) {
+        STACK_WIND(frame, dht_selfheal_dir_setattr_cbk, layout->list[i].xlator,
+                   layout->list[i].xlator->fops->setattr, loc, stbuf, valid,
+                   NULL);
+    }
+
+    return 0;
+}
 
+static int
+dht_selfheal_dir_mkdir_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+                           int op_ret, int op_errno, inode_t *inode,
+                           struct iatt *stbuf, struct iatt *preparent,
+                           struct iatt *postparent, dict_t *xdata)
+{
+    dht_local_t *local = NULL;
+    dht_layout_t *layout = NULL;
+    xlator_t *prev = NULL;
+    xlator_t *subvol = NULL;
+    int i = 0, ret = -1;
+    int this_call_cnt = 0;
+    char gfid[GF_UUID_BUF_SIZE] = {0};
+
+    local = frame->local;
+    layout = local->selfheal.layout;
+    prev = cookie;
+    subvol = prev;
+    /* The directory now exists on subvol; err = -1 marks it for xattr heal. */
+    if ((op_ret == 0) || ((op_ret == -1) && (op_errno == EEXIST))) {
+        for (i = 0; i < layout->cnt; i++) {
+            if (layout->list[i].xlator == subvol) {
+                layout->list[i].err = -1;
+                break;
+            }
+        }
+    }
+
+    if (op_ret) {
+        gf_uuid_unparse(local->loc.gfid, gfid);
+        gf_smsg(this->name,
+                ((op_errno == EEXIST) ? GF_LOG_DEBUG : GF_LOG_WARNING),
+                op_errno, DHT_MSG_DIR_SELFHEAL_FAILED, "path=%s",
+                local->loc.path, "gfid=%s", gfid, NULL);
+        goto out;
+    }
+    dht_iatt_merge(this, &local->preparent, preparent);
+    dht_iatt_merge(this, &local->postparent, postparent);
+    ret = 0;
+
+out:
+    this_call_cnt = dht_frame_return(frame);
+    /* Only the last mkdir reply moves the heal on to the setattr stage. */
+    if (is_last_call(this_call_cnt)) {
+        dht_selfheal_dir_finish(frame, this, ret, 0);
+        dht_selfheal_dir_setattr(frame, &local->loc, &local->stbuf, 0xffffffff,
+                                 layout); /* same mask as the other callers */
+    }
+
+    return 0;
+}
 
-	local = frame->local;
-	this = frame->this;
+static int
+dht_selfheal_dir_mkdir_lookup_done(call_frame_t *frame, xlator_t *this)
+{
+    dht_local_t *local = NULL;
+    int i = 0;
+    dict_t *dict = NULL;
+    dht_layout_t *layout = NULL;
+    loc_t *loc = NULL;
+    int cnt = 0;
+    int ret = -1;
+
+    VALIDATE_OR_GOTO(this->private, err);
+
+    local = frame->local;
+    layout = local->layout;
+    loc = &local->loc;
+
+    if (!gf_uuid_is_null(local->gfid)) {
+        dict = dict_new();
+        if (!dict)
+            goto err; /* finish the heal; a bare return would strand frame */
+
+        ret = dict_set_gfuuid(dict, "gfid-req", local->gfid, true);
+        if (ret)
+            gf_smsg(this->name, GF_LOG_WARNING, 0, DHT_MSG_DICT_SET_FAILED,
+                    "path=%s", loc->path, "key=gfid-req", NULL);
+    } else if (local->params) {
+        /* Send the dictionary from higher layers directly */
+
+        dict = dict_ref(local->params);
+    }
+    /* Code to update all extended attributes from local->xattr
+       to dict
+    */
+    dht_dir_set_heal_xattr(this, local, dict, local->xattr, NULL, NULL);
+
+    if (!dict) {
+        gf_smsg(this->name, GF_LOG_WARNING, 0, DHT_MSG_DICT_IS_NULL, NULL);
+        dict = dict_new();
+        if (!dict)
+            goto err; /* finish the heal; a bare return would strand frame */
+    }
+    ret = dict_set_flag(dict, GF_INTERNAL_CTX_KEY, GF_DHT_HEAL_DIR);
+    if (ret) {
+        gf_smsg(this->name, GF_LOG_ERROR, 0, DHT_MSG_DICT_SET_FAILED, "key=%s",
+                GF_INTERNAL_CTX_KEY, "path=%s", loc->path, NULL);
+        /* We can still continue. As heal can still happen
+         * unless quota limits have reached for the dir.
+         */
+    }
+
+    cnt = layout->cnt;
+    for (i = 0; i < cnt; i++) {
+        if (layout->list[i].err == ESTALE || layout->list[i].err == ENOENT ||
+            local->selfheal.force_mkdir) {
+            gf_msg_debug(this->name, 0, "Creating directory %s on subvol %s",
+                         loc->path, layout->list[i].xlator->name);
+
+            STACK_WIND_COOKIE(
+                frame, dht_selfheal_dir_mkdir_cbk, layout->list[i].xlator,
+                layout->list[i].xlator, layout->list[i].xlator->fops->mkdir,
+                loc,
+                st_mode_from_ia(local->stbuf.ia_prot, local->stbuf.ia_type), 0,
+                dict);
+        }
+    }
+
+    if (dict)
+        dict_unref(dict);
+
+    return 0;
 
-	for (i = 0; i < layout->cnt; i++) {
-		if (layout->list[i].err == ENOENT || force)
-			missing_dirs++;
-	}
+err:
+    dht_selfheal_dir_finish(frame, this, -1, 1);
+    return 0;
+}
 
-	if (missing_dirs == 0) {
-		dht_selfheal_dir_xattr (frame, loc, layout);
-		return 0;
-	}
+static int
+dht_selfheal_dir_mkdir_lookup_cbk(call_frame_t *frame, void *cookie,
+                                  xlator_t *this, int op_ret, int op_errno,
+                                  inode_t *inode, struct iatt *stbuf,
+                                  dict_t *xattr, struct iatt *postparent)
+{
+    dht_local_t *local = NULL;
+    int i = 0;
+    int this_call_cnt = 0;
+    int missing_dirs = 0;
+    dht_layout_t *layout = NULL;
+    xlator_t *prev = 0;
+    loc_t *loc = NULL;
+    char gfid_local[GF_UUID_BUF_SIZE] = {0};
+    int index = -1;
+
+    VALIDATE_OR_GOTO(this->private, err);
+
+    local = frame->local;
+    layout = local->layout;
+    loc = &local->loc;
+    prev = cookie;
+
+    if (!gf_uuid_is_null(local->gfid))
+        gf_uuid_unparse(local->gfid, gfid_local);
+
+    LOCK(&frame->lock);
+    {
+        index = dht_layout_index_for_subvol(layout, prev);
+        if ((op_ret < 0) && (op_errno == ENOENT || op_errno == ESTALE)) {
+            local->selfheal.hole_cnt = !local->selfheal.hole_cnt
+                                           ? 1
+                                           : local->selfheal.hole_cnt + 1;
+            /* the status might have changed. Update the layout with the
+             * new status
+             */
+            if (index >= 0) {
+                layout->list[index].err = op_errno;
+            }
+        }
+
+        if (!op_ret) {
+            dht_iatt_merge(this, &local->stbuf, stbuf);
+            if (prev == local->mds_subvol) {
+                dict_unref(local->xattr);
+                local->xattr = dict_ref(xattr);
+            }
+            /* the status might have changed. Update the layout with the
+             * new status
+             */
+            if (index >= 0) {
+                layout->list[index].err = -1;
+            }
+        }
+    }
+    UNLOCK(&frame->lock);
+
+    this_call_cnt = dht_frame_return(frame);
+
+    if (is_last_call(this_call_cnt)) {
+        if (local->selfheal.hole_cnt == layout->cnt) {
+            gf_msg_debug(this->name, op_errno,
+                         "Lookup failed, an rmdir could have "
+                         "deleted this entry %s",
+                         loc->name);
+            local->op_errno = op_errno;
+            goto err;
+        } else {
+            for (i = 0; i < layout->cnt; i++) {
+                if (layout->list[i].err == ENOENT ||
+                    layout->list[i].err == ESTALE ||
+                    local->selfheal.force_mkdir)
+                    missing_dirs++;
+            }
+
+            if (missing_dirs == 0) {
+                dht_selfheal_dir_finish(frame, this, 0, 0);
+                dht_selfheal_dir_setattr(frame, loc, &local->stbuf, 0xffffffff,
+                                         layout);
+                return 0;
+            }
+
+            local->call_cnt = missing_dirs;
+            dht_selfheal_dir_mkdir_lookup_done(frame, this);
+        }
+    }
+
+    return 0;
 
-	local->call_cnt = missing_dirs;
-	for (i = 0; i < layout->cnt; i++) {
-		if (layout->list[i].err == ENOENT || force) {
-			gf_log (this->name, GF_LOG_DEBUG,
-				"creating directory %s on subvol %s",
-				loc->path, layout->list[i].xlator->name);
+err:
+    dht_selfheal_dir_finish(frame, this, -1, 1);
+    return 0;
+}
 
-			STACK_WIND (frame, dht_selfheal_dir_mkdir_cbk,
-				    layout->list[i].xlator,
-				    layout->list[i].xlator->fops->mkdir,
-				    loc, local->stbuf.st_mode);
-		}
-	}
+static int
+dht_selfheal_dir_mkdir_lock_cbk(call_frame_t *frame, void *cookie,
+                                xlator_t *this, int32_t op_ret,
+                                int32_t op_errno, dict_t *xdata)
+{
+    dht_local_t *local = NULL;
+    dht_conf_t *conf = NULL;
+    int i = 0;
+    int ret = -1;
+    xlator_t *mds_subvol = NULL;
+
+    VALIDATE_OR_GOTO(this->private, err);
+
+    conf = this->private;
+    local = frame->local;
+    mds_subvol = local->mds_subvol;
+
+    local->call_cnt = conf->subvolume_cnt;
+
+    if (op_ret < 0) {
+        if (op_errno == EINVAL) {
+            local->call_cnt = 1;
+            dht_selfheal_dir_mkdir_lookup_done(frame, this);
+            return 0;
+        }
+
+        gf_smsg(this->name, GF_LOG_WARNING, op_errno, DHT_MSG_ENTRYLK_ERROR,
+                "path=%s", local->loc.path, NULL);
+
+        local->op_errno = op_errno;
+        goto err;
+    }
+
+    /* After getting locks, perform lookup again to ensure that the
+       directory was not deleted by a racing rmdir
+    */
+    if (!local->xattr_req)
+        local->xattr_req = dict_new();
+
+    ret = dict_set_int32(local->xattr_req, "list-xattr", 1);
+    if (ret)
+        gf_smsg(this->name, GF_LOG_ERROR, 0, DHT_MSG_DICT_SET_FAILED, "path=%s",
+                local->loc.path, NULL);
+
+    for (i = 0; i < conf->subvolume_cnt; i++) {
+        if (mds_subvol && conf->subvolumes[i] == mds_subvol) {
+            STACK_WIND_COOKIE(frame, dht_selfheal_dir_mkdir_lookup_cbk,
+                              conf->subvolumes[i], conf->subvolumes[i],
+                              conf->subvolumes[i]->fops->lookup, &local->loc,
+                              local->xattr_req);
+        } else {
+            STACK_WIND_COOKIE(frame, dht_selfheal_dir_mkdir_lookup_cbk,
+                              conf->subvolumes[i], conf->subvolumes[i],
+                              conf->subvolumes[i]->fops->lookup, &local->loc,
+                              NULL);
+        }
+    }
+
+    return 0;
 
-	return 0;
+err:
+    dht_selfheal_dir_finish(frame, this, -1, 1);
+    return 0;
+}
+
+static int
+dht_selfheal_dir_mkdir(call_frame_t *frame, loc_t *loc, dht_layout_t *layout,
+                       int force)
+{
+    int missing_dirs = 0;
+    int i = 0;
+    int op_errno = 0;
+    int ret = -1;
+    dht_local_t *local = NULL;
+    xlator_t *this = NULL;
+    dht_conf_t *conf = NULL;
+    /* NOTE: gfids are binary uuid_t; log them via uuid_utoa(), never raw %s. */
+    local = frame->local;
+    this = frame->this;
+    conf = this->private;
+
+    local->selfheal.force_mkdir = force;
+    local->selfheal.hole_cnt = 0;
+
+    for (i = 0; i < layout->cnt; i++) {
+        if (layout->list[i].err == ENOENT || force)
+            missing_dirs++;
+    }
+
+    if (missing_dirs == 0) {
+        /* We don't need to create any directories. Proceed to heal the
+         * attrs and xattrs
+         */
+        if (!__is_root_gfid(local->stbuf.ia_gfid)) {
+            if (local->need_xattr_heal) {
+                local->need_xattr_heal = 0;
+                ret = dht_dir_xattr_heal(this, local, &op_errno);
+                if (ret) {
+                    gf_smsg(this->name, GF_LOG_ERROR, op_errno,
+                            DHT_MSG_DIR_XATTR_HEAL_FAILED, "path=%s",
+                            local->loc.path, "gfid=%s", uuid_utoa(local->gfid), NULL);
+                }
+            } else {
+                if (!gf_uuid_is_null(local->gfid))
+                    gf_uuid_copy(loc->gfid, local->gfid);
+
+                ret = dht_common_mark_mdsxattr(frame, NULL, 0);
+                if (!ret)
+                    return 0;
+
+                gf_smsg(this->name, GF_LOG_INFO, 0, DHT_MSG_SET_XATTR_FAILED,
+                        "path=%s", local->loc.path, "gfid=%s",
+                        uuid_utoa(local->gfid), NULL);
+            }
+        }
+        dht_selfheal_dir_setattr(frame, loc, &local->stbuf, 0xffffffff, layout);
+        return 0;
+    }
+
+    /* The MDS xattr is populated only when DHT has more than one
+     subvol. After a graph switch that adds more DHT subvols, treat the
+     hashed subvol as the MDS to avoid an MDS check failure when a fop
+     is run on the directory.
+    */
+    if (!dict_get(local->xattr, conf->mds_xattr_key) &&
+        (conf->subvolume_cnt > 1)) {
+        if (local->hashed_subvol == NULL) {
+            local->hashed_subvol = dht_subvol_get_hashed(this, loc);
+            if (local->hashed_subvol == NULL) {
+                local->op_errno = EINVAL;
+                gf_smsg(this->name, GF_LOG_WARNING, local->op_errno,
+                        DHT_MSG_HASHED_SUBVOL_GET_FAILED, "gfid=%s",
+                        uuid_utoa(loc->pargfid), "name=%s", loc->name,
+                        "path=%s", loc->path, NULL);
+                goto err;
+            }
+        }
+        ret = dht_inode_ctx_mdsvol_set(local->inode, this,
+                                       local->hashed_subvol);
+        if (ret) {
+            gf_msg(this->name, GF_LOG_ERROR, 0, DHT_MSG_SET_INODE_CTX_FAILED,
+                   "Failed to set hashed subvol for %s on inode vol is %s",
+                   local->loc.path,
+                   local->hashed_subvol ? local->hashed_subvol->name : "NULL");
+            goto err;
+        }
+    }
+
+    if (local->hashed_subvol == NULL) {
+        local->hashed_subvol = dht_subvol_get_hashed(this, loc);
+        if (local->hashed_subvol == NULL) {
+            local->op_errno = EINVAL;
+            gf_smsg(this->name, GF_LOG_WARNING, local->op_errno,
+                    DHT_MSG_HASHED_SUBVOL_GET_FAILED, "gfid=%s",
+                    uuid_utoa(loc->pargfid), "name=%s", loc->name, "path=%s", loc->path, NULL);
+            goto err;
+        }
+    }
+
+    local->current = &local->lock[0];
+    ret = dht_protect_namespace(frame, loc, local->hashed_subvol,
+                                &local->current->ns,
+                                dht_selfheal_dir_mkdir_lock_cbk);
+
+    if (ret < 0)
+        goto err;
+
+    return 0;
+err:
+    return -1;
+}
+
+static int
+dht_selfheal_layout_alloc_start(xlator_t *this, loc_t *loc,
+                                dht_layout_t *layout)
+{
+    int start = 0;
+    uint32_t hashval = 0;
+    int ret = 0;
+    const char *str = NULL;
+    dht_conf_t *conf = NULL;
+    char buf[UUID_CANONICAL_FORM_LEN + 1] = {
+        0,
+    };
+
+    conf = this->private;
+
+    if (conf->randomize_by_gfid) {
+        str = uuid_utoa_r(loc->gfid, buf);
+    } else {
+        str = loc->path;
+    }
+
+    ret = dht_hash_compute(this, layout->type, str, &hashval);
+    if (ret == 0) {
+        start = (hashval % layout->cnt);
+    }
+
+    return start;
+}
+
+static int
+dht_get_layout_count(xlator_t *this, dht_layout_t *layout, int new_layout)
+{
+    int i = 0;
+    int j = 0;
+    int err = 0;
+    int count = 0;
+    dht_conf_t *conf = NULL;
+
+    /* Gets in use only for replace-brick, remove-brick */
+    conf = this->private;
+    for (i = 0; i < layout->cnt; i++) {
+        for (j = 0; j < conf->subvolume_cnt; j++) {
+            if (conf->decommissioned_bricks[j] &&
+                conf->decommissioned_bricks[j] == layout->list[i].xlator) {
+                layout->list[i].err = EINVAL;
+                break;
+            }
+        }
+    }
+
+    for (i = 0; i < layout->cnt; i++) {
+        err = layout->list[i].err;
+        if (err == -1 || err == 0 || err == ENOENT) {
+            /* Take this with a pinch of salt. The behaviour seems
+             * to be slightly different when this function is
+             * invoked from mkdir codepath. For eg., err == 0 in
+             * mkdir codepath means directory created but xattr
+             * is not set yet.
+             */
+
+            /* Setting list[i].err = -1 is an indication for
+               dht_selfheal_layout_new_directory() to assign
+               a range. We set it to -1 based on any one of
+               the three criteria:
+
+               - err == -1 already, which means directory
+                 existed but layout was not set on it.
+
+               - err == 0, which means directory exists and
+                 has an old layout piece which will be
+                 overwritten now.
+
+               - err == ENOENT, which means directory does
+                 not exist (possibly racing with mkdir or
+                 finishing half done mkdir). The missing
+                 directory will be attempted to be recreated.
+            */
+            count++;
+            if (!err)
+                layout->list[i].err = -1;
+        }
+    }
+
+    /* no subvolume has enough space, but can't stop directory creation */
+    if (!count || !new_layout) {
+        for (i = 0; i < layout->cnt; i++) {
+            err = layout->list[i].err;
+            if (err == ENOSPC) {
+                layout->list[i].err = -1;
+                count++;
+            }
+        }
+    }
+
+    /* if layout->spread_cnt is set, check if it is <= available
+     * subvolumes (down brick and decommissioned bricks are considered
+     * un-available). Else return count (available up bricks) */
+    count = ((layout->spread_cnt && (layout->spread_cnt <= count))
+                 ? layout->spread_cnt
+                 : ((count) ? count : 1));
+
+    return count;
+}
+
+void
+dht_selfheal_layout_new_directory(call_frame_t *frame, loc_t *loc,
+                                  dht_layout_t *new_layout);
+
+void
+dht_layout_range_swap(dht_layout_t *layout, int i, int j);
+
+/*
+ * It's a bit icky using local variables in a macro, but it makes the rest
+ * of the code a lot clearer.
+ */
+#define OV_ENTRY(x, y) table[x * new->cnt + y]
+
+static void
+dht_selfheal_layout_maximize_overlap(call_frame_t *frame, loc_t *loc,
+                                     dht_layout_t *new, dht_layout_t *old)
+{
+    int i = 0;
+    int j = 0;
+    uint32_t curr_overlap = 0;
+    uint32_t max_overlap = 0;
+    int max_overlap_idx = -1;
+    uint32_t overlap = 0;
+    uint32_t *table = NULL;
+
+    dht_layout_sort_volname(old);
+    /* Now both old_layout->list[] and new_layout->list[]
+       match the same xlators/subvolumes, i.e.
+       old_layout->list[i] and new_layout->list[i] refer
+       to the same subvolume.
+    */
+
+    /* Build a table of overlaps between new[i] and old[j]. */
+    table = alloca(sizeof(overlap) * old->cnt * new->cnt);
+    if (!table) {
+        return;
+    }
+    memset(table, 0, sizeof(overlap) * old->cnt * new->cnt);
+    for (i = 0; i < new->cnt; ++i) {
+        for (j = 0; j < old->cnt; ++j) {
+            OV_ENTRY(i, j) = dht_overlap_calc(old, j, new, i);
+        }
+    }
+
+    for (i = 0; i < new->cnt; i++) {
+        if (new->list[i].err > 0) {
+            /* Subvol might be marked for decommission
+               with EINVAL, or some other serious error
+               marked with positive errno.
+            */
+            continue;
+        }
+
+        max_overlap = 0;
+        max_overlap_idx = i;
+        for (j = (i + 1); j < new->cnt; ++j) {
+            if (new->list[j].err > 0) {
+                /* Subvol might be marked for decommission
+                with EINVAL, or some other serious error
+                marked with positive errno.
+                */
+                continue;
+            }
+            /* Calculate the overlap now. */
+            curr_overlap = OV_ENTRY(i, i) + OV_ENTRY(j, j);
+            /* Calculate the overlap after the proposed swap. */
+            overlap = OV_ENTRY(i, j) + OV_ENTRY(j, i);
+            /* Are we better than status quo? */
+            if (overlap > curr_overlap) {
+                overlap -= curr_overlap;
+                /* Are we better than the previous choice? */
+                if (overlap > max_overlap) {
+                    max_overlap = overlap;
+                    max_overlap_idx = j;
+                }
+            }
+        }
+
+        if (max_overlap_idx != i) {
+            dht_layout_range_swap(new, i, max_overlap_idx);
+            /* Need to swap the table values too. */
+            for (j = 0; j < old->cnt; ++j) {
+                overlap = OV_ENTRY(i, j);
+                OV_ENTRY(i, j) = OV_ENTRY(max_overlap_idx, j);
+                OV_ENTRY(max_overlap_idx, j) = overlap;
+            }
+        }
+    }
+}
+
+static dht_layout_t *
+dht_fix_layout_of_directory(call_frame_t *frame, loc_t *loc,
+                            dht_layout_t *layout)
+{
+    int i = 0;
+    xlator_t *this = NULL;
+    dht_layout_t *new_layout = NULL;
+    dht_conf_t *priv = NULL;
+    dht_local_t *local = NULL;
+    uint32_t subvol_down = 0;
+    gf_boolean_t maximize_overlap = _gf_true;
+    char gfid[GF_UUID_BUF_SIZE] = {0};
+
+    this = frame->this;
+    priv = this->private;
+    local = frame->local;
+
+    if (layout->type == DHT_HASH_TYPE_DM_USER) {
+        gf_msg_debug(THIS->name, 0, "leaving %s alone", loc->path);
+        goto done;
+    }
+
+    new_layout = dht_layout_new(this, priv->subvolume_cnt);
+    if (!new_layout) {
+        gf_uuid_unparse(loc->gfid, gfid);
+        gf_smsg(this->name, GF_LOG_ERROR, ENOMEM, DHT_MSG_MEM_ALLOC_FAILED,
+                "new_layout, path=%s", loc->path, "gfid=%s", gfid, NULL);
+        goto done;
+    }
+
+    /* If a subvolume is down, do not re-write the layout. */
+    dht_layout_anomalies(this, loc, layout, NULL, NULL, NULL, &subvol_down,
+                         NULL, NULL);
+
+    if (subvol_down) {
+        gf_uuid_unparse(loc->gfid, gfid);
+        gf_smsg(this->name, GF_LOG_WARNING, 0, DHT_MSG_LAYOUT_FIX_FAILED,
+                "subvol-down=%u", subvol_down, "Skipping-fix-layout", "path=%s",
+                loc->path, "gfid=%s", gfid, NULL);
+        GF_FREE(new_layout);
+        return NULL;
+    }
+
+    for (i = 0; i < new_layout->cnt; i++) {
+        if (layout->list[i].err != ENOSPC)
+            new_layout->list[i].err = layout->list[i].err;
+        else
+            new_layout->list[i].err = -1;
+
+        new_layout->list[i].xlator = layout->list[i].xlator;
+    }
+
+    new_layout->commit_hash = layout->commit_hash;
+
+    if (priv->du_stats) {
+        for (i = 0; i < priv->subvolume_cnt; ++i) {
+            gf_smsg(this->name, GF_LOG_DEBUG, 0, DHT_MSG_SUBVOL_INFO,
+                    "index=%d", i, "name=%s", priv->subvolumes[i]->name,
+                    "chunks=%u", priv->du_stats[i].chunks, "path=%s", loc->path,
+                    NULL);
+
+            /* Maximize overlap if the bricks are all the same
+             *  size.
+             * This is probably not going to be very common on
+             * live setups but will benefit our regression tests
+             */
+            if (i && (priv->du_stats[i].chunks != priv->du_stats[0].chunks)) {
+                maximize_overlap = _gf_false;
+            }
+        }
+    } else {
+        gf_smsg(this->name, GF_LOG_WARNING, 0, DHT_MSG_NO_DISK_USAGE_STATUS,
+                NULL);
+    }
+
+    /* First give it a layout as though it is a new directory. This
+       ensures rotation to kick in */
+    dht_layout_sort_volname(new_layout);
+    dht_selfheal_layout_new_directory(frame, loc, new_layout);
+
+    /* Maximize overlap if weighted-rebalance is disabled */
+    if (!priv->do_weighting)
+        maximize_overlap = _gf_true;
+
+    /* Now selectively re-assign ranges only when it helps */
+    if (maximize_overlap) {
+        dht_selfheal_layout_maximize_overlap(frame, loc, new_layout, layout);
+    }
+done:
+    if (new_layout) {
+        /* Make sure the extra 'ref' for existing layout is removed */
+        dht_layout_unref(this, local->layout);
+
+        local->layout = new_layout;
+    }
+
+    return local->layout;
+}
+
+/*
+ * Having to call this 2x for each entry in the layout is pretty horrible, but
+ * that's what all of this layout-sorting nonsense gets us.
+ */
+static uint32_t
+dht_get_chunks_from_xl(xlator_t *parent, xlator_t *child)
+{
+    dht_conf_t *priv = parent->private;
+    xlator_list_t *trav;
+    uint32_t index = 0;
+
+    if (!priv->du_stats) {
+        return 0;
+    }
+
+    for (trav = parent->children; trav; trav = trav->next) {
+        if (trav->xlator == child) {
+            return priv->du_stats[index].chunks;
+        }
+        ++index;
+    }
+
+    return 0;
 }
 
 void
-dht_selfheal_layout_new_directory (call_frame_t *frame, loc_t *loc,
-				   dht_layout_t *layout)
-{
-	dht_conf_t  *conf = NULL;
-	xlator_t    *this = NULL;
-	uint32_t     chunk = 0;
-	int          i = 0;
-	uint32_t     start = 0;
-	int          cnt = 0;
-	int          err = 0;
-
-	this = frame->this;
-	conf = this->private;
-
-	for (i = 0; i < layout->cnt; i++) {
-		err = layout->list[i].err;
-		if (err == -1 || err == 0) {
-			layout->list[i].err = -1;
-			cnt++;
-		}
-	}
-
-	chunk = ((unsigned long) 0xffffffff) / cnt;
-
-	start = 0;
-	for (i = 0; i < layout->cnt; i++) {
-		err = layout->list[i].err;
-		if (err == -1) {
-			layout->list[i].start = start;
-			layout->list[i].stop  = start + chunk - 1;
-			
-			start = start + chunk;
-
-			gf_log (this->name, GF_LOG_DEBUG,
-				"gave fix: %u - %u on %s for %s",
-				layout->list[i].start, layout->list[i].stop,
-				layout->list[i].xlator->name, loc->path);
-			if (--cnt == 0) {
-				layout->list[i].stop = 0xffffffff;
-				break;
-			}
-		}
-	}
+dht_selfheal_layout_new_directory(call_frame_t *frame, loc_t *loc,
+                                  dht_layout_t *layout)
+{
+    xlator_t *this = NULL;
+    double chunk = 0;
+    int i = 0;
+    uint32_t start = 0;
+    int bricks_to_use = 0;
+    int err = 0;
+    int start_subvol = 0;
+    uint32_t curr_size;
+    uint32_t range_size;
+    uint64_t total_size = 0;
+    int real_i;
+    dht_conf_t *priv;
+    gf_boolean_t weight_by_size;
+    int bricks_used = 0;
+
+    this = frame->this;
+    priv = this->private;
+    weight_by_size = priv->do_weighting;
+
+    bricks_to_use = dht_get_layout_count(this, layout, 1);
+    GF_ASSERT(bricks_to_use > 0);
+
+    bricks_used = 0;
+    for (i = 0; i < layout->cnt; ++i) {
+        err = layout->list[i].err;
+        if ((err != -1) && (err != ENOENT)) {
+            continue;
+        }
+        curr_size = dht_get_chunks_from_xl(this, layout->list[i].xlator);
+        if (!curr_size) {
+            weight_by_size = _gf_false;
+            break;
+        }
+        total_size += curr_size;
+        if (++bricks_used >= bricks_to_use) {
+            break;
+        }
+    }
+
+    if (weight_by_size && total_size) {
+        /* We know total_size is not zero. */
+        chunk = ((double)0xffffffff) / ((double)total_size);
+        gf_msg_debug(this->name, 0,
+                     "chunk size = 0xffffffff / %" PRIu64 " = %f", total_size,
+                     chunk);
+    } else {
+        weight_by_size = _gf_false;
+        chunk = ((unsigned long)0xffffffff) / bricks_to_use;
+    }
+
+    start_subvol = dht_selfheal_layout_alloc_start(this, loc, layout);
+
+    /* clear out the range, as we are re-computing here */
+    DHT_RESET_LAYOUT_RANGE(layout);
+
+    /*
+     * OK, what's this "real_i" stuff about?  This used to be two loops -
+     * from start_subvol to layout->cnt-1, then from 0 to start_subvol-1.
+     * That way is practically an open invitation to bugs when only one
+     * of the loops is updated.  Using real_i and modulo operators to make
+     * it one loop avoids this problem.  Remember, folks: it's everyone's
+     * responsibility to help stamp out copy/paste abuse.
+     */
+    bricks_used = 0;
+    for (real_i = 0; real_i < layout->cnt; real_i++) {
+        i = (real_i + start_subvol) % layout->cnt;
+        err = layout->list[i].err;
+        if ((err != -1) && (err != ENOENT)) {
+            continue;
+        }
+        if (weight_by_size) {
+            curr_size = dht_get_chunks_from_xl(this, layout->list[i].xlator);
+            if (!curr_size) {
+                continue;
+            }
+        } else {
+            curr_size = 1;
+        }
+        range_size = chunk * curr_size;
+        gf_msg_debug(this->name, 0, "assigning range size 0x%x to %s",
+                     range_size, layout->list[i].xlator->name);
+        DHT_SET_LAYOUT_RANGE(layout, i, start, range_size, loc->path);
+        if (++bricks_used >= bricks_to_use) {
+            layout->list[i].stop = 0xffffffff;
+            goto done;
+        }
+        start += range_size;
+    }
+
+done:
+    return;
 }
 
+/*
+ * Recompute the directory layout when the anomaly counters ask for it.
+ *
+ * Reads hole_cnt and overlaps_cnt from the frame's local selfheal state.
+ * If either is non-zero, layout->commit_hash is set to
+ * DHT_LAYOUT_HASH_INVALID (the hash ranges are about to change) and a new
+ * layout is computed by dht_selfheal_layout_new_directory().
+ *
+ * Returns 0 when a new layout was computed, or when at least one layout
+ * entry reports ENOENT (directory not present there); -1 otherwise.
+ *
+ * TODO: a layout with neither holes nor overlaps is left as it is here.
+ */
+static int
+dht_selfheal_dir_getafix(call_frame_t *frame, loc_t *loc, dht_layout_t *layout)
+{
+    dht_local_t *local = frame->local;
+    int idx = 0;
+
+    if (local->selfheal.hole_cnt || local->selfheal.overlaps_cnt) {
+        /* The ranges are going to move, so whatever commit hash the
+         * directory carried can no longer be trusted. */
+        layout->commit_hash = DHT_LAYOUT_HASH_INVALID;
+        dht_selfheal_layout_new_directory(frame, loc, layout);
+        return 0;
+    }
+
+    /* No range anomalies: success only if some entry lacks the directory */
+    for (idx = 0; idx < layout->cnt; idx++) {
+        if (layout->list[idx].err == ENOENT) {
+            return 0;
+        }
+    }
+
+    return -1;
+}
 
 int
-dht_selfheal_dir_getafix (call_frame_t *frame, loc_t *loc,
-			  dht_layout_t *layout)
-{
-	dht_conf_t  *conf = NULL;
-	xlator_t    *this = NULL;
-	dht_local_t *local = NULL;
-	int          missing = -1;
-	int          down = -1;
-	int          holes = -1;
-	int          ret = -1;
-	int          i = -1;
-	int          overlaps = -1;
-
-	this = frame->this;
-	conf = this->private;
-	local = frame->local;
-
-	missing = local->selfheal.missing;
-	down = local->selfheal.down;
-	holes = local->selfheal.hole_cnt;
-	overlaps = local->selfheal.overlaps_cnt;
-
-	if ((missing + down) == conf->subvolume_cnt) {
-		dht_selfheal_layout_new_directory (frame, loc, layout);
-		ret = 0;
-	}
-
-	if (holes <= down) {
-		/* the down subvol might fill up the holes */
-		ret = 0;
-	}
-
-	if (holes || missing || overlaps) {
-		dht_selfheal_layout_new_directory (frame, loc, layout);
-		ret = 0;
-	}
-
-	for (i = 0; i < layout->cnt; i++) {
-		/* directory not present */
-		if (layout->list[i].err == ENOENT) {
-			ret = 0;
-			break;
-		}
-	}
-
-	/* TODO: give a fix to these non-virgins */
-
-	return ret;
+dht_selfheal_new_directory(call_frame_t *frame, dht_selfheal_dir_cbk_t dir_cbk,
+                           dht_layout_t *layout)
+{
+    dht_local_t *local = NULL;
+    int ret = 0;
+    inode_t *linked_inode = NULL, *inode = NULL;
+    loc_t *loc = NULL;
+    char pgfid[GF_UUID_BUF_SIZE] = {0};
+    char gfid[GF_UUID_BUF_SIZE] = {0};
+    int32_t op_errno = EIO;
+
+    local = frame->local;
+
+    loc = &local->loc;
+
+    gf_uuid_unparse(local->stbuf.ia_gfid, gfid);
+    gf_uuid_unparse(loc->parent->gfid, pgfid);
+
+    linked_inode = inode_link(loc->inode, loc->parent, loc->name,
+                              &local->stbuf);
+    if (!linked_inode) {
+        gf_smsg(frame->this->name, GF_LOG_WARNING, 0, DHT_MSG_LINK_INODE_FAILED,
+                "pgfid=%s", pgfid, "name=%s", loc->name, "gfid=%s", gfid, NULL);
+        ret = -1;
+        goto out;
+    }
+
+    inode = loc->inode;
+    loc->inode = linked_inode;
+    inode_unref(inode);
+
+    local->selfheal.dir_cbk = dir_cbk;
+    local->selfheal.layout = dht_layout_ref(frame->this, layout);
+
+    dht_layout_sort_volname(layout);
+    dht_selfheal_layout_new_directory(frame, &local->loc, layout);
+
+    op_errno = ENOMEM;
+    ret = dht_selfheal_layout_lock(frame, layout, _gf_true,
+                                   dht_selfheal_dir_xattr,
+                                   dht_should_heal_layout);
+
+out:
+    if (ret < 0) {
+        dir_cbk(frame, NULL, frame->this, -1, op_errno, NULL);
+    }
+
+    return 0;
 }
 
 int
-dht_selfheal_new_directory (call_frame_t *frame, 
-			    dht_selfheal_dir_cbk_t dir_cbk,
-			    dht_layout_t *layout)
+dht_fix_directory_layout(call_frame_t *frame, dht_selfheal_dir_cbk_t dir_cbk,
+                         dht_layout_t *layout)
 {
-	dht_local_t *local = NULL;
+    dht_local_t *local = NULL;
+    dht_layout_t *tmp_layout = NULL;
+    int ret = 0;
 
-	local = frame->local;
+    local = frame->local;
 
-	local->selfheal.dir_cbk = dir_cbk;
-	local->selfheal.layout = layout;
+    local->selfheal.dir_cbk = dir_cbk;
+    local->selfheal.layout = dht_layout_ref(frame->this, layout);
 
-	dht_layout_sort_volname (layout);
-	dht_selfheal_layout_new_directory (frame, &local->loc, layout);	
-	dht_selfheal_dir_xattr (frame, &local->loc, layout);
-	return 0;
-}
+    /* No layout sorting required here */
+    tmp_layout = dht_fix_layout_of_directory(frame, &local->loc, layout);
+    if (!tmp_layout) {
+        return -1;
+    }
 
+    ret = dht_selfheal_layout_lock(frame, tmp_layout, _gf_false,
+                                   dht_fix_dir_xattr, dht_should_fix_layout);
+
+    return ret;
+}
 
 int
-dht_selfheal_directory (call_frame_t *frame, dht_selfheal_dir_cbk_t dir_cbk,
-			loc_t *loc, dht_layout_t *layout)
-{
-	dht_local_t *local    = NULL;
-	uint32_t     holes    = 0;
-	uint32_t     overlaps = 0;
-	uint32_t     missing  = 0;
-	uint32_t     down     = 0;
-	uint32_t     misc     = 0;
-	int          ret      = 0;
-	xlator_t    *this     = NULL;
-
-	local = frame->local;
-	this = frame->this;
-
-	ret = dht_layout_anomalies (this, loc, layout,
-				    &local->selfheal.hole_cnt,
-				    &local->selfheal.overlaps_cnt,
-				    &local->selfheal.missing,
-				    &local->selfheal.down,
-				    &local->selfheal.misc);
-
-	holes    = local->selfheal.hole_cnt;
-	overlaps = local->selfheal.overlaps_cnt;
-	missing  = local->selfheal.missing;
-	down     = local->selfheal.down;
-	misc     = local->selfheal.misc;
-
-	local->selfheal.dir_cbk = dir_cbk;
-	local->selfheal.layout = layout;
-
-	if (down) {
-		gf_log (this->name, GF_LOG_ERROR,
-			"%d subvolumes down -- not fixing", down);
-		ret = 0;
-		goto sorry_no_fix;
-	}
-
-	if (misc) {
-		gf_log (this->name, GF_LOG_ERROR,
-			"%d subvolumes have unrecoverable errors", misc);
-		ret = 0;
-		goto sorry_no_fix;
-	}
-
-	dht_layout_sort_volname (layout);
-	ret = dht_selfheal_dir_getafix (frame, loc, layout);
-
-	if (ret == -1) {
-		gf_log (this->name, GF_LOG_ERROR,
-			"not able to form layout for the directory");
-		goto sorry_no_fix;
-	}
-
-	dht_selfheal_dir_mkdir (frame, loc, layout, 0);
-
-	return 0;
+dht_selfheal_directory(call_frame_t *frame, dht_selfheal_dir_cbk_t dir_cbk,
+                       loc_t *loc, dht_layout_t *layout)
+{
+    dht_local_t *local = NULL;
+    xlator_t *this = NULL;
+    uint32_t down = 0;
+    uint32_t misc = 0;
+    int ret = 0;
+    char pgfid[GF_UUID_BUF_SIZE] = {0};
+    char gfid[GF_UUID_BUF_SIZE] = {0};
+    inode_t *linked_inode = NULL, *inode = NULL;
+
+    local = frame->local;
+    this = frame->this;
+
+    local->selfheal.dir_cbk = dir_cbk;
+    local->selfheal.layout = dht_layout_ref(this, layout);
+
+    if (local->need_attrheal) {
+        if (__is_root_gfid(local->stbuf.ia_gfid)) {
+            local->stbuf.ia_gid = local->prebuf.ia_gid;
+            local->stbuf.ia_uid = local->prebuf.ia_uid;
+
+            local->stbuf.ia_ctime = local->prebuf.ia_ctime;
+            local->stbuf.ia_ctime_nsec = local->prebuf.ia_ctime_nsec;
+            local->stbuf.ia_prot = local->prebuf.ia_prot;
+
+        } else if (!IA_ISINVAL(local->mds_stbuf.ia_type)) {
+            local->stbuf = local->mds_stbuf;
+        }
+    }
+
+    if (!__is_root_gfid(local->stbuf.ia_gfid)) {
+        gf_uuid_unparse(local->stbuf.ia_gfid, gfid);
+        gf_uuid_unparse(loc->parent->gfid, pgfid);
+
+        linked_inode = inode_link(loc->inode, loc->parent, loc->name,
+                                  &local->stbuf);
+        if (!linked_inode) {
+            gf_smsg(this->name, GF_LOG_WARNING, 0, DHT_MSG_LINK_INODE_FAILED,
+                    "pgfid=%s", pgfid, "name=%s", loc->name, "gfid=%s", gfid,
+                    NULL);
+            ret = 0;
+            goto sorry_no_fix;
+        }
+
+        inode = loc->inode;
+        loc->inode = linked_inode;
+        inode_unref(inode);
+    }
+
+    if (local->need_xattr_heal && (local->mds_xattr)) {
+        dht_dir_set_heal_xattr(this, local, local->xattr, local->mds_xattr,
+                               NULL, NULL);
+        dict_unref(local->mds_xattr);
+        local->mds_xattr = NULL;
+    }
+
+    dht_layout_anomalies(this, loc, layout, &local->selfheal.hole_cnt,
+                         &local->selfheal.overlaps_cnt,
+                         &local->selfheal.missing_cnt, &local->selfheal.down,
+                         &local->selfheal.misc, NULL);
+
+    down = local->selfheal.down;
+    misc = local->selfheal.misc;
+
+    if (down) {
+        gf_smsg(this->name, GF_LOG_WARNING, 0, DHT_MSG_SELFHEAL_FAILED,
+                "path=%s", loc->path, "subvol-down=%d", down, "Not-fixing",
+                "gfid=%s", gfid, NULL);
+        ret = 0;
+        goto sorry_no_fix;
+    }
+
+    if (misc) {
+        gf_smsg(this->name, GF_LOG_WARNING, 0, DHT_MSG_SELFHEAL_FAILED,
+                "path=%s", loc->path, "misc=%d", misc, "unrecoverable-errors",
+                "gfid=%s", gfid, NULL);
+
+        ret = 0;
+        goto sorry_no_fix;
+    }
+
+    dht_layout_sort_volname(layout);
+    local->heal_layout = _gf_true;
+
+    /* Ignore return value as it can be inferred from result of
+     * dht_layout_anomalies
+     */
+    dht_selfheal_dir_getafix(frame, loc, layout);
+
+    if (!(local->selfheal.hole_cnt || local->selfheal.overlaps_cnt ||
+          local->selfheal.missing_cnt)) {
+        local->heal_layout = _gf_false;
+    }
+
+    ret = dht_selfheal_dir_mkdir(frame, loc, layout, 0);
+    if (ret < 0) {
+        ret = 0;
+        goto sorry_no_fix;
+    }
+
+    return 0;
 
 sorry_no_fix:
-	/* TODO: need to put appropriate local->op_errno */
-	dht_selfheal_dir_finish (frame, this, ret);
+    /* TODO: need to put appropriate local->op_errno */
+    dht_selfheal_dir_finish(frame, this, ret, 1);
 
-	return 0;
+    return 0;
 }
 
+/*
+ * Record the completion callback and a new reference on the layout in the
+ * frame's selfheal state, then hand over to dht_selfheal_dir_mkdir().
+ *
+ * The last argument passed to dht_selfheal_dir_mkdir() is 1 here, whereas
+ * dht_selfheal_directory() passes 0.
+ * NOTE(review): the meaning of that flag is defined by
+ * dht_selfheal_dir_mkdir() -- confirm there.
+ *
+ * Returns whatever dht_selfheal_dir_mkdir() returns.
+ */
+int
+dht_selfheal_restore(call_frame_t *frame, dht_selfheal_dir_cbk_t dir_cbk,
+                     loc_t *loc, dht_layout_t *layout)
+{
+    dht_local_t *local = frame->local;
+
+    local->selfheal.dir_cbk = dir_cbk;
+    local->selfheal.layout = dht_layout_ref(frame->this, layout);
+
+    return dht_selfheal_dir_mkdir(frame, loc, layout, 1);
+}
 
 int
-dht_selfheal_restore (call_frame_t *frame, dht_selfheal_dir_cbk_t dir_cbk,
-		      loc_t *loc, dht_layout_t *layout)
+dht_dir_heal_xattrs(void *data)
 {
-	int          ret = 0;
-	dht_local_t *local    = NULL;
+    call_frame_t *frame = NULL;
+    dht_local_t *local = NULL;
+    xlator_t *subvol = NULL;
+    xlator_t *mds_subvol = NULL;
+    xlator_t *this = NULL;
+    dht_conf_t *conf = NULL;
+    dict_t *user_xattr = NULL;
+    dict_t *internal_xattr = NULL;
+    dict_t *mds_xattr = NULL;
+    dict_t *xdata = NULL;
+    int call_cnt = 0;
+    int ret = -1;
+    int uret = 0;
+    int uflag = 0;
+    int i = 0;
+    int xattr_hashed = 0;
+    char gfid[GF_UUID_BUF_SIZE] = {0};
+    int32_t allzero[1] = {0};
+
+    GF_VALIDATE_OR_GOTO("dht", data, out);
+
+    frame = data;
+    local = frame->local;
+    this = frame->this;
+    GF_VALIDATE_OR_GOTO("dht", this, out);
+    GF_VALIDATE_OR_GOTO(this->name, local, out);
+    mds_subvol = local->mds_subvol;
+    conf = this->private;
+    GF_VALIDATE_OR_GOTO(this->name, conf, out);
+    gf_uuid_unparse(local->loc.gfid, gfid);
+
+    if (!mds_subvol) {
+        gf_smsg(this->name, GF_LOG_WARNING, 0, DHT_MSG_NO_MDS_SUBVOL, "path=%s",
+                local->loc.path, "gfid=%s", gfid, NULL);
+        goto out;
+    }
+
+    if ((local->loc.inode && gf_uuid_is_null(local->loc.inode->gfid)) ||
+        gf_uuid_is_null(local->loc.gfid)) {
+        gf_smsg(this->name, GF_LOG_WARNING, 0, DHT_MSG_GFID_NOT_PRESENT,
+                "skip-heal path=%s", local->loc.path, "gfid=%s", gfid, NULL);
+        goto out;
+    }
+
+    internal_xattr = dict_new();
+    if (!internal_xattr) {
+        gf_smsg(this->name, GF_LOG_ERROR, 0, DHT_MSG_CREATE_FAILED,
+                "dictionary", NULL);
+        goto out;
+    }
+    xdata = dict_new();
+    if (!xdata) {
+        gf_smsg(this->name, GF_LOG_ERROR, 0, DHT_MSG_CREATE_FAILED,
+                "dictionary", NULL);
+        goto out;
+    }
+
+    call_cnt = conf->subvolume_cnt;
+
+    user_xattr = dict_new();
+    if (!user_xattr) {
+        gf_smsg(this->name, GF_LOG_ERROR, 0, DHT_MSG_CREATE_FAILED,
+                "dictionary", NULL);
+        goto out;
+    }
+
+    ret = syncop_listxattr(local->mds_subvol, &local->loc, &mds_xattr, NULL,
+                           NULL);
+    if (ret < 0) {
+        gf_smsg(this->name, GF_LOG_ERROR, -ret, DHT_MSG_LIST_XATTRS_FAILED,
+                "path=%s", local->loc.path, "name=%s", local->mds_subvol->name,
+                NULL);
+    }
+
+    if (!mds_xattr)
+        goto out;
+
+    dht_dir_set_heal_xattr(this, local, user_xattr, mds_xattr, &uret, &uflag);
+
+    /* To set quota related xattr need to set GLUSTERFS_INTERNAL_FOP_KEY
+     * key value to 1
+     */
+    if (dict_get(user_xattr, QUOTA_LIMIT_KEY) ||
+        dict_get(user_xattr, QUOTA_LIMIT_OBJECTS_KEY)) {
+        ret = dict_set_int32(xdata, GLUSTERFS_INTERNAL_FOP_KEY, 1);
+        if (ret) {
+            gf_smsg(this->name, GF_LOG_ERROR, 0, DHT_MSG_DICT_SET_FAILED,
+                    "key=%s", GLUSTERFS_INTERNAL_FOP_KEY, "path=%s",
+                    local->loc.path, NULL);
+            goto out;
+        }
+    }
+    if (uret <= 0 && !uflag)
+        goto out;
+
+    for (i = 0; i < call_cnt; i++) {
+        subvol = conf->subvolumes[i];
+        if (subvol == mds_subvol)
+            continue;
+        if (uret || uflag) {
+            /* Custom xattr heal is required - let posix handle it */
+            ret = dict_set_int8(xdata, "sync_backend_xattrs", _gf_true);
+            if (ret) {
+                gf_smsg(this->name, GF_LOG_WARNING, 0, DHT_MSG_DICT_SET_FAILED,
+                        "path=%s", local->loc.path, "key=%s",
+                        "sync_backend_xattrs", NULL);
+                goto out;
+            }
+
+            ret = syncop_setxattr(subvol, &local->loc, user_xattr, 0, xdata,
+                                  NULL);
+            if (ret) {
+                xattr_hashed = 1;
+                gf_smsg(this->name, GF_LOG_ERROR, -ret,
+                        DHT_MSG_DIR_XATTR_HEAL_FAILED,
+                        "set-user-xattr-failed path=%s", local->loc.path,
+                        "subvol=%s", subvol->name, "gfid=%s", gfid, NULL);
+            } else {
+                dict_del(xdata, "sync_backend_xattrs");
+            }
+        }
+    }
+    /* After heal all custom xattr reset internal MDS xattr to 0 */
+    if (!xattr_hashed) {
+        ret = dht_dict_set_array(internal_xattr, conf->mds_xattr_key, allzero,
+                                 1);
+        if (ret) {
+            gf_smsg(this->name, GF_LOG_WARNING, ENOMEM, DHT_MSG_DICT_SET_FAILED,
+                    "key=%s", conf->mds_xattr_key, "path=%s", local->loc.path,
+                    NULL);
+            goto out;
+        }
+        ret = syncop_setxattr(mds_subvol, &local->loc, internal_xattr, 0, NULL,
+                              NULL);
+        if (ret) {
+            gf_smsg(this->name, GF_LOG_ERROR, -ret,
+                    DHT_MSG_DIR_XATTR_HEAL_FAILED, "path=%s", local->loc.path,
+                    "subvol=%s", mds_subvol->name, "gfid=%s", gfid, NULL);
+        }
+    }
+
+out:
+    if (user_xattr)
+        dict_unref(user_xattr);
+    if (mds_xattr)
+        dict_unref(mds_xattr);
+    if (internal_xattr)
+        dict_unref(internal_xattr);
+    if (xdata)
+        dict_unref(xdata);
+    return 0;
+}
 
+/*
+ * Completion hook that destroys sync_frame via DHT_STACK_DESTROY().
+ * ret and data are not used.  Always returns 0.
+ *
+ * NOTE(review): presumably registered as the "done" callback of the task
+ * that runs dht_dir_heal_xattrs(); the registration is not in this part of
+ * the file -- confirm at the call site.
+ */
+int
+dht_dir_heal_xattrs_done(int ret, call_frame_t *sync_frame, void *data)
+{
+    DHT_STACK_DESTROY(sync_frame);
+    return 0;
+}
 
-	local = frame->local;
+/*
+ * Apply owner and mode of a directory to the subvolumes other than its MDS
+ * subvolume.
+ *
+ * data is the call frame; its local supplies loc, stbuf, mds_stbuf and
+ * mds_subvol.  For the root gfid the attributes are taken from
+ * local->stbuf, for any other directory from local->mds_stbuf.  Every
+ * subvolume in conf->subvolumes except mds_subvol (and NULL entries) gets a
+ * syncop_setattr() with GF_SET_ATTR_UID, GF_SET_ATTR_GID and
+ * GF_SET_ATTR_MODE selected.
+ *
+ * Nothing is healed (a message is logged instead) when a non-root directory
+ * has no MDS subvolume, or when its MDS subvolume is flagged down in
+ * conf->subvolume_status.  A failing setattr is logged and the loop moves
+ * on to the next subvolume.
+ *
+ * Always returns 0.
+ */
+int
+dht_dir_attr_heal(void *data)
+{
+    call_frame_t *frame = NULL;
+    dht_local_t *local = NULL;
+    xlator_t *subvol = NULL;
+    xlator_t *mds_subvol = NULL;
+    xlator_t *this = NULL;
+    dht_conf_t *conf = NULL;
+    int call_cnt = 0;
+    int ret = -1;
+    int i = 0;
+    char gfid[GF_UUID_BUF_SIZE] = {0};
+
+    GF_VALIDATE_OR_GOTO("dht", data, out);
+
+    frame = data;
+    local = frame->local;
+    this = frame->this;
+    GF_VALIDATE_OR_GOTO("dht", this, out);
+    GF_VALIDATE_OR_GOTO("dht", local, out);
+    conf = this->private;
+    GF_VALIDATE_OR_GOTO("dht", conf, out);
+
+    /* Format the gfid once, up front: the early-exit messages below log it
+     * too, and would otherwise print an empty "gfid=" field. */
+    gf_uuid_unparse(local->loc.gfid, gfid);
+
+    mds_subvol = local->mds_subvol;
+    call_cnt = conf->subvolume_cnt;
+
+    if (!__is_root_gfid(local->stbuf.ia_gfid) && (!mds_subvol)) {
+        gf_smsg(this->name, GF_LOG_WARNING, 0, DHT_MSG_NO_MDS_SUBVOL, "path=%s",
+                local->loc.path, "gfid=%s", gfid, NULL);
+        goto out;
+    }
+
+    if (!__is_root_gfid(local->stbuf.ia_gfid)) {
+        /* Refuse to heal from an MDS subvolume that is marked down */
+        for (i = 0; i < conf->subvolume_cnt; i++) {
+            if (conf->subvolumes[i] == mds_subvol) {
+                if (!conf->subvolume_status[i]) {
+                    gf_smsg(this->name, GF_LOG_ERROR, 0,
+                            DHT_MSG_MDS_DOWN_UNABLE_TO_SET, "path=%s",
+                            local->loc.path, "gfid=%s", gfid, NULL);
+                    goto out;
+                }
+            }
+        }
+    }
+
+    for (i = 0; i < call_cnt; i++) {
+        subvol = conf->subvolumes[i];
+        if (!subvol || subvol == mds_subvol)
+            continue;
+        if (__is_root_gfid(local->stbuf.ia_gfid)) {
+            ret = syncop_setattr(
+                subvol, &local->loc, &local->stbuf,
+                (GF_SET_ATTR_UID | GF_SET_ATTR_GID | GF_SET_ATTR_MODE), NULL,
+                NULL, NULL, NULL);
+        } else {
+            ret = syncop_setattr(
+                subvol, &local->loc, &local->mds_stbuf,
+                (GF_SET_ATTR_UID | GF_SET_ATTR_GID | GF_SET_ATTR_MODE), NULL,
+                NULL, NULL, NULL);
+        }
+
+        if (ret) {
+            /* best effort: report and carry on with the next subvolume */
+            gf_smsg(this->name, GF_LOG_ERROR, -ret,
+                    DHT_MSG_DIR_ATTR_HEAL_FAILED, "path=%s", local->loc.path,
+                    "subvol=%s", subvol->name, "gfid=%s", gfid, NULL);
+        }
+    }
+out:
+    return 0;
+}
+
+/*
+ * Completion hook that destroys sync_frame via DHT_STACK_DESTROY().
+ * ret and data are not used.  Always returns 0.
+ *
+ * NOTE(review): presumably registered as the "done" callback of the task
+ * that runs dht_dir_attr_heal(); the registration is not in this part of
+ * the file -- confirm at the call site.
+ */
+int
+dht_dir_attr_heal_done(int ret, call_frame_t *sync_frame, void *data)
+{
+    DHT_STACK_DESTROY(sync_frame);
+    return 0;
+}
+
+/* EXIT: dht_update_commit_hash_for_layout */
+static int
+dht_update_commit_hash_for_layout_done(call_frame_t *frame, void *cookie,
+                                       xlator_t *this, int32_t op_ret,
+                                       int32_t op_errno, dict_t *xdata)
+{
+    dht_local_t *local = NULL;
+
+    local = frame->local;
+
+    /* preserve oldest error */
+    if (op_ret && !local->op_ret) {
+        local->op_ret = op_ret;
+        local->op_errno = op_errno;
+    }
 
-	local->selfheal.dir_cbk = dir_cbk;
-	local->selfheal.layout = layout;
+    DHT_STACK_UNWIND(setxattr, frame, local->op_ret, local->op_errno, NULL);
 
-	ret = dht_selfheal_dir_mkdir (frame, loc, layout, 1);
+    return 0;
+}
+
+/*
+ * Release the layout inodelks recorded in local->lock[0].layout.my_layout.
+ *
+ * dht_update_commit_hash_for_layout_done() is passed to
+ * dht_unlock_inodelk() as the callback.  If dht_unlock_inodelk() itself
+ * returns a negative value, the failure is logged, recorded in local (unless
+ * an earlier error is already there) and the done step is called directly.
+ *
+ * Always returns 0.
+ */
+static int
+dht_update_commit_hash_for_layout_unlock(call_frame_t *frame, xlator_t *this)
+{
+    dht_local_t *local = frame->local;
+
+    if (dht_unlock_inodelk(frame, local->lock[0].layout.my_layout.locks,
+                           local->lock[0].layout.my_layout.lk_count,
+                           dht_update_commit_hash_for_layout_done) >= 0) {
+        return 0;
+    }
+
+    /* an error recorded earlier takes precedence over this one */
+    if (local->op_ret == 0) {
+        local->op_errno = errno;
+        local->op_ret = -1;
+    }
+
+    gf_smsg(this->name, GF_LOG_WARNING, errno, DHT_MSG_WIND_UNLOCK_FAILED,
+            "path=%s", local->loc.path, NULL);
+
+    /* the unlock could not be wound, so finish the operation from here */
+    dht_update_commit_hash_for_layout_done(frame, NULL, this, 0, 0, NULL);
+
+    return 0;
+}
+
+/*
+ * Reply handler for the setxattr calls wound by
+ * dht_update_commit_hash_for_layout_resume().
+ *
+ * Under frame->lock, stores op_ret/op_errno in local when this reply failed
+ * and no failure has been stored yet.  Once is_last_call() accepts the value
+ * returned by dht_frame_return(), continues with
+ * dht_update_commit_hash_for_layout_unlock().
+ *
+ * Always returns 0.
+ */
+static int
+dht_update_commit_hash_for_layout_cbk(call_frame_t *frame, void *cookie,
+                                      xlator_t *this, int op_ret, int op_errno,
+                                      dict_t *xdata)
+{
+    dht_local_t *local = frame->local;
+    int pending = 0;
+
+    LOCK(&frame->lock);
+    {
+        /* only the first failure is kept */
+        if ((op_ret != 0) && (local->op_ret == 0)) {
+            local->op_errno = op_errno;
+            local->op_ret = op_ret;
+        }
+    }
+    UNLOCK(&frame->lock);
+
+    pending = dht_frame_return(frame);
+    if (!is_last_call(pending)) {
+        return 0;
+    }
+
+    dht_update_commit_hash_for_layout_unlock(frame, this);
+
+    return 0;
+}
+
+/*
+ * Continuation handed to dht_blocking_inodelk() by
+ * dht_update_commit_hash_for_layout(); a negative op_ret is treated as a
+ * failure of that step.
+ *
+ * For each entry of conf->local_subvols it copies layout->commit_hash into
+ * the matching layout entry, extracts the disk form of that entry and stores
+ * it in a new dict under conf->xattr_name.  Only when every dict is ready is
+ * a setxattr wound to each local subvolume;
+ * dht_update_commit_hash_for_layout_cbk() receives the replies.
+ *
+ * Failure handling:
+ *  - err:      preparation failed; frees what was built, sets local->op_ret
+ *              to -1 and goes through the unlock step.
+ *  - err_done: op_ret < 0 on entry; sets local->op_ret to -1 and calls the
+ *              done step directly, without the unlock step.
+ *
+ * Always returns 0.
+ */
+static int
+dht_update_commit_hash_for_layout_resume(call_frame_t *frame, void *cookie,
+                                         xlator_t *this, int32_t op_ret,
+                                         int32_t op_errno, dict_t *xdata)
+{
+    dht_local_t *local = NULL;
+    int count = 1, ret = -1, i = 0, j = 0;
+    dht_conf_t *conf = NULL;
+    dht_layout_t *layout = NULL;
+    int32_t *disk_layout = NULL;
+    dict_t **xattr = NULL;
+
+    local = frame->local;
+    conf = frame->this->private;
+    count = conf->local_subvols_cnt;
+    layout = local->layout;
+
+    if (op_ret < 0) {
+        goto err_done;
+    }
+
+    /* We precreate the xattr list as we cannot change call count post the
+     * first wind as we may never continue from there. So we finish prep
+     * work before winding the setxattrs */
+    xattr = GF_CALLOC(count, sizeof(*xattr), gf_common_mt_char);
+    if (!xattr) {
+        local->op_errno = errno;
+
+        gf_smsg(this->name, GF_LOG_WARNING, errno, DHT_MSG_COMMIT_HASH_FAILED,
+                "allocation-failed path=%s", local->loc.path, NULL);
+
+        goto err;
+    }
+
+    for (i = 0; i < count; i++) {
+        /* find the layout index for the subvolume */
+        ret = dht_layout_index_for_subvol(layout, conf->local_subvols[i]);
+        if (ret < 0) {
+            local->op_errno = ENOENT;
+
+            gf_smsg(this->name, GF_LOG_WARNING, 0, DHT_MSG_COMMIT_HASH_FAILED,
+                    "path=%s", local->loc.path, "subvol=%s",
+                    conf->local_subvols[i]->name, "find-disk-layout-failed",
+                    NULL);
+
+            goto err;
+        }
+        j = ret;
+
+        /* update the commit hash for the layout */
+        layout->list[j].commit_hash = layout->commit_hash;
+
+        /* extract the current layout */
+        ret = dht_disk_layout_extract(this, layout, j, &disk_layout);
+        if (ret == -1) {
+            local->op_errno = errno;
+
+            gf_smsg(this->name, GF_LOG_WARNING, errno,
+                    DHT_MSG_COMMIT_HASH_FAILED, "path=%s", local->loc.path,
+                    "subvol=%s", conf->local_subvols[i]->name,
+                    "extract-disk-layout-failed", NULL);
+
+            goto err;
+        }
+
+        xattr[i] = dict_new();
+        if (!xattr[i]) {
+            local->op_errno = errno;
+
+            gf_smsg(this->name, GF_LOG_WARNING, errno,
+                    DHT_MSG_COMMIT_HASH_FAILED, "path=%s Allocation-failed",
+                    local->loc.path, NULL);
+
+            goto err;
+        }
+
+        /* NOTE(review): 4 * 4 is presumably the byte size of the extracted
+         * disk layout (four 32-bit words) -- confirm against
+         * dht_disk_layout_extract() */
+        ret = dict_set_bin(xattr[i], conf->xattr_name, disk_layout, 4 * 4);
+        if (ret != 0) {
+            local->op_errno = ENOMEM;
+
+            gf_smsg(this->name, GF_LOG_WARNING, 0,
+                    DHT_MSG_DIR_SELFHEAL_XATTR_FAILED, "path=%s",
+                    local->loc.path, "subvol=%s", conf->local_subvols[i]->name,
+                    "set-xattr-failed", NULL);
+
+            goto err;
+        }
+        /* Clearing the pointer keeps the GF_FREE() under err from touching
+         * this buffer; presumably the dict owns it from here on -- TODO
+         * confirm against dict_set_bin() */
+        disk_layout = NULL;
+
+        gf_msg_trace(this->name, 0,
+                     "setting commit hash %u on subvolume %s"
+                     " for %s",
+                     layout->list[j].commit_hash, conf->local_subvols[i]->name,
+                     local->loc.path);
+    }
+
+    /* wind the setting of the commit hash across the local subvols */
+    local->call_cnt = count;
+    local->op_ret = 0;
+    local->op_errno = 0;
+    for (i = 0; i < count; i++) {
+        STACK_WIND(frame, dht_update_commit_hash_for_layout_cbk,
+                   conf->local_subvols[i],
+                   conf->local_subvols[i]->fops->setxattr, &local->loc,
+                   xattr[i], 0, NULL);
+    }
+    /* drop this function's references on the dicts, then the array itself */
+    for (i = 0; i < count; i++)
+        dict_unref(xattr[i]);
+    GF_FREE(xattr);
+
+    return 0;
+err:
+    /* some slots may still be NULL if the loop above stopped early */
+    if (xattr) {
+        for (i = 0; i < count; i++) {
+            if (xattr[i])
+                dict_unref(xattr[i]);
+        }
+
+        GF_FREE(xattr);
+    }
+
+    GF_FREE(disk_layout);
+
+    local->op_ret = -1;
+
+    dht_update_commit_hash_for_layout_unlock(frame, this);
+
+    return 0;
+err_done:
+    local->op_ret = -1;
+
+    dht_update_commit_hash_for_layout_done(frame, NULL, this, 0, 0, NULL);
+
+    return 0;
+}
+
+/* ENTER: dht_update_commit_hash_for_layout (see EXIT above)
+ * This function is invoked from rebalance only.
+ * As a result, the only check made here is whether defrag is present in
+ * the conf; if it is, the other data is expected to be populated already.
+ * If this is ever used from other code paths, the checks will need to
+ * change.
+ *
+ * Functional details:
+ *  - Lock the inodes on the subvols that we want the commit hash updated
+ *  - Update each layout with the inode layout, modified to take in the new
+ *    commit hash.
+ *  - Unlock and return.
+ */
+int
+dht_update_commit_hash_for_layout(call_frame_t *frame)
+{
+    dht_local_t *local = NULL;
+    int count = 1, ret = -1, i = 0;
+    dht_lock_t **lk_array = NULL;
+    dht_conf_t *conf = NULL;
+
+    GF_VALIDATE_OR_GOTO("dht", frame, err);
+    GF_VALIDATE_OR_GOTO(frame->this->name, frame->local, err);
+
+    local = frame->local;
+    conf = frame->this->private;
+
+    if (!conf->defrag)
+        goto err;
+
+    count = conf->local_subvols_cnt;
+    lk_array = GF_CALLOC(count, sizeof(*lk_array), gf_common_mt_char);
+    if (lk_array == NULL)
+        goto err;
+
+    for (i = 0; i < count; i++) {
+        lk_array[i] = dht_lock_new(frame->this, conf->local_subvols[i],
+                                   &local->loc, F_WRLCK, DHT_LAYOUT_HEAL_DOMAIN,
+                                   NULL, FAIL_ON_ANY_ERROR);
+        if (lk_array[i] == NULL)
+            goto err;
+    }
+
+    local->lock[0].layout.my_layout.locks = lk_array;
+    local->lock[0].layout.my_layout.lk_count = count;
+
+    ret = dht_blocking_inodelk(frame, lk_array, count,
+                               dht_update_commit_hash_for_layout_resume);
+    if (ret < 0) {
+        local->lock[0].layout.my_layout.locks = NULL;
+        local->lock[0].layout.my_layout.lk_count = 0;
+        goto err;
+    }
+
+    return 0;
+err:
+    if (lk_array != NULL) {
+        dht_lock_array_free(lk_array, count);
+        GF_FREE(lk_array);
+    }
 
-	return 0;
+    return -1;
 }
diff --git a/xlators/cluster/dht/src/dht-shared.c b/xlators/cluster/dht/src/dht-shared.c
new file mode 100644
index 00000000000..bb72b0ffbb5
--- /dev/null
+++ b/xlators/cluster/dht/src/dht-shared.c
@@ -0,0 +1,1104 @@
+/*
+  Copyright (c) 2008-2012 Red Hat, Inc. <http://www.redhat.com>
+  This file is part of GlusterFS.
+
+  This file is licensed to you under your choice of the GNU Lesser
+  General Public License, version 3 or any later version (LGPLv3 or
+  later), or the GNU General Public License, version 2 (GPLv2), in all
+  cases as published by the Free Software Foundation.
+*/
+
+/* TODO: add NS locking */
+#include <glusterfs/statedump.h>
+#include "dht-common.h"
+#include "dht-messages.h"
+
+#ifndef MAX
+#define MAX(a, b) (((a) > (b)) ? (a) : (b))
+#endif
+
+/* TODO:
+   - use volumename in xattr instead of "dht"
+   - use NS locks
+   - handle all cases in self heal layout reconstruction
+   - complete linkfile selfheal
+*/
+
+/*
+ * Write one layout into the statedump, every key prefixed with @prefix.
+ *
+ * The header fields are always written; the per-subvolume range list is
+ * written only for directory layouts. A NULL layout is ignored.
+ */
+static void
+dht_layout_dump(dht_layout_t *layout, const char *prefix)
+{
+    char dump_key[GF_DUMP_MAX_BUF_LEN];
+    int idx = 0;
+
+    if (layout == NULL)
+        return;
+
+    gf_proc_dump_build_key(dump_key, prefix, "cnt");
+    gf_proc_dump_write(dump_key, "%d", layout->cnt);
+    gf_proc_dump_build_key(dump_key, prefix, "preset");
+    gf_proc_dump_write(dump_key, "%d", layout->preset);
+    gf_proc_dump_build_key(dump_key, prefix, "gen");
+    gf_proc_dump_write(dump_key, "%d", layout->gen);
+
+    /* the inode type is only reported once it is known */
+    if (layout->type != IA_INVAL) {
+        gf_proc_dump_build_key(dump_key, prefix, "inode type");
+        gf_proc_dump_write(dump_key, "%d", layout->type);
+    }
+
+    /* only directory layouts get their range list dumped */
+    if (!IA_ISDIR(layout->type))
+        return;
+
+    for (idx = 0; idx < layout->cnt; idx++) {
+        gf_proc_dump_build_key(dump_key, prefix, "list[%d].err", idx);
+        gf_proc_dump_write(dump_key, "%d", layout->list[idx].err);
+        gf_proc_dump_build_key(dump_key, prefix, "list[%d].start", idx);
+        gf_proc_dump_write(dump_key, "0x%x", layout->list[idx].start);
+        gf_proc_dump_build_key(dump_key, prefix, "list[%d].stop", idx);
+        gf_proc_dump_write(dump_key, "0x%x", layout->list[idx].stop);
+
+        /* entries without an xlator have nothing more to report */
+        if (layout->list[idx].xlator == NULL)
+            continue;
+
+        gf_proc_dump_build_key(dump_key, prefix, "list[%d].xlator.type", idx);
+        gf_proc_dump_write(dump_key, "%s", layout->list[idx].xlator->type);
+        gf_proc_dump_build_key(dump_key, prefix, "list[%d].xlator.name", idx);
+        gf_proc_dump_write(dump_key, "%s", layout->list[idx].xlator->name);
+    }
+}
+
+/*
+ * Dump the private configuration of a DHT xlator into the statedump.
+ *
+ * Returns -1 when @this or its private conf is missing, the TRY_LOCK
+ * result when conf->subvolume_lock cannot be taken, and that same (zero)
+ * result once the dump has been written.
+ */
+int32_t
+dht_priv_dump(xlator_t *this)
+{
+    char section_key[GF_DUMP_MAX_BUF_LEN];
+    char entry_key[GF_DUMP_MAX_BUF_LEN];
+    dht_conf_t *conf = NULL;
+    int idx = 0;
+    int ret = -1;
+
+    if (!this)
+        return -1;
+
+    conf = this->private;
+    if (!conf)
+        return -1;
+
+    /* never block the dump on the subvolume lock */
+    ret = TRY_LOCK(&conf->subvolume_lock);
+    if (ret != 0)
+        return ret;
+
+    gf_proc_dump_add_section("xlator.cluster.dht.%s.priv", this->name);
+    gf_proc_dump_build_key(section_key, "xlator.cluster.dht", "%s.priv",
+                           this->name);
+    gf_proc_dump_write("subvol_cnt", "%d", conf->subvolume_cnt);
+
+    /* per-subvolume identity, layouts and status */
+    for (idx = 0; idx < conf->subvolume_cnt; idx++) {
+        snprintf(entry_key, sizeof(entry_key), "subvolumes[%d]", idx);
+        gf_proc_dump_write(entry_key, "%s.%s", conf->subvolumes[idx]->type,
+                           conf->subvolumes[idx]->name);
+
+        if (conf->file_layouts && conf->file_layouts[idx]) {
+            snprintf(entry_key, sizeof(entry_key), "file_layouts[%d]", idx);
+            dht_layout_dump(conf->file_layouts[idx], entry_key);
+        }
+
+        if (conf->dir_layouts && conf->dir_layouts[idx]) {
+            snprintf(entry_key, sizeof(entry_key), "dir_layouts[%d]", idx);
+            dht_layout_dump(conf->dir_layouts[idx], entry_key);
+        }
+
+        if (conf->subvolume_status) {
+            snprintf(entry_key, sizeof(entry_key), "subvolume_status[%d]",
+                     idx);
+            gf_proc_dump_write(entry_key, "%d",
+                               (int)conf->subvolume_status[idx]);
+        }
+    }
+
+    /* scalar options */
+    gf_proc_dump_write("search_unhashed", "%d", conf->search_unhashed);
+    gf_proc_dump_write("gen", "%d", conf->gen);
+    gf_proc_dump_write("min_free_disk", "%lf", conf->min_free_disk);
+    gf_proc_dump_write("min_free_inodes", "%lf", conf->min_free_inodes);
+    gf_proc_dump_write("disk_unit", "%c", conf->disk_unit);
+    gf_proc_dump_write("refresh_interval", "%d", conf->refresh_interval);
+    gf_proc_dump_write("unhashed_sticky_bit", "%d", conf->unhashed_sticky_bit);
+    gf_proc_dump_write("use-readdirp", "%d", conf->use_readdirp);
+
+    /* disk usage statistics, skipping subvolumes whose status is zero */
+    if (conf->du_stats && conf->subvolume_status) {
+        for (idx = 0; idx < conf->subvolume_cnt; idx++) {
+            if (!conf->subvolume_status[idx])
+                continue;
+
+            snprintf(entry_key, sizeof(entry_key), "subvolumes[%d]", idx);
+            gf_proc_dump_write(entry_key, "%s", conf->subvolumes[idx]->name);
+
+            snprintf(entry_key, sizeof(entry_key),
+                     "du_stats[%d].avail_percent", idx);
+            gf_proc_dump_write(entry_key, "%lf",
+                               conf->du_stats[idx].avail_percent);
+
+            snprintf(entry_key, sizeof(entry_key), "du_stats[%d].avail_space",
+                     idx);
+            gf_proc_dump_write(entry_key, "%" PRIu64,
+                               conf->du_stats[idx].avail_space);
+
+            snprintf(entry_key, sizeof(entry_key),
+                     "du_stats[%d].avail_inodes", idx);
+            gf_proc_dump_write(entry_key, "%lf",
+                               conf->du_stats[idx].avail_inodes);
+
+            snprintf(entry_key, sizeof(entry_key), "du_stats[%d].log", idx);
+            gf_proc_dump_write(entry_key, "%" PRIu32, conf->du_stats[idx].log);
+        }
+    }
+
+    if (conf->last_stat_fetch)
+        gf_proc_dump_write("last_stat_fetch", "%s",
+                           ctime(&conf->last_stat_fetch));
+
+    UNLOCK(&conf->subvolume_lock);
+
+    return ret;
+}
+
+/*
+ * Dump the DHT layout stored in an inode's context, if there is one.
+ *
+ * Returns -1 when @this or @inode is NULL, otherwise whatever
+ * dht_inode_ctx_layout_get() returned.
+ */
+int32_t
+dht_inodectx_dump(xlator_t *this, inode_t *inode)
+{
+    dht_layout_t *layout = NULL;
+    int ret = -1;
+
+    if (!this || !inode)
+        return -1;
+
+    ret = dht_inode_ctx_layout_get(inode, this, &layout);
+
+    /* only dump when the lookup succeeded and produced a layout */
+    if ((ret == 0) && layout) {
+        gf_proc_dump_add_section("xlator.cluster.dht.%s.inode", this->name);
+        dht_layout_dump(layout, "layout");
+    }
+
+    return ret;
+}
+
+/*
+ * Tear down the private configuration of a DHT xlator.
+ *
+ * this->private is cleared before anything is released; when it was
+ * already unset there is nothing else to do.
+ */
+void
+dht_fini(xlator_t *this)
+{
+    dht_conf_t *conf = NULL;
+    int idx = 0;
+
+    GF_VALIDATE_OR_GOTO("dht", this, out);
+
+    conf = this->private;
+    this->private = NULL;
+    if (conf == NULL)
+        goto out;
+
+    if (conf->file_layouts != NULL) {
+        for (idx = 0; idx < conf->subvolume_cnt; idx++) {
+            GF_FREE(conf->file_layouts[idx]);
+        }
+        GF_FREE(conf->file_layouts);
+    }
+
+    dict_unref(conf->leaf_to_subvol);
+
+    /* allocated in dht_init_subvolumes() */
+    GF_FREE(conf->subvolumes);
+    GF_FREE(conf->subvolume_status);
+    GF_FREE(conf->last_event);
+    GF_FREE(conf->subvol_up_time);
+    GF_FREE(conf->du_stats);
+    GF_FREE(conf->decommissioned_bricks);
+
+    /* allocated in dht_init() */
+    GF_FREE(conf->mds_xattr_key);
+    GF_FREE(conf->link_xattr_name);
+    GF_FREE(conf->commithash_xattr_name);
+    GF_FREE(conf->wild_xattr_name);
+
+    /* compiled in dht_init_regex(); only valid patterns need freeing */
+    if (conf->rsync_regex_valid)
+        regfree(&conf->rsync_regex);
+    if (conf->extra_regex_valid)
+        regfree(&conf->extra_regex);
+
+    synclock_destroy(&conf->link_lock);
+
+    if (conf->lock_pool)
+        mem_pool_destroy(conf->lock_pool);
+
+    GF_FREE(conf);
+
+out:
+    return;
+}
+
+/*
+ * Set up memory accounting for the DHT xlator, sized for every DHT
+ * memory type (gf_dht_mt_end + 1).
+ *
+ * Returns the xlator_mem_acct_init() result, or -1 when @this is NULL.
+ */
+int32_t
+mem_acct_init(xlator_t *this)
+{
+    int ret = -1;
+
+    GF_VALIDATE_OR_GOTO("dht", this, out);
+
+    ret = xlator_mem_acct_init(this, gf_dht_mt_end + 1);
+    if (ret != 0) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, DHT_MSG_NO_MEMORY,
+               "Memory accounting init failed");
+    }
+
+out:
+    return ret;
+}
+
+/*
+ * dht_parse_decommissioned_bricks - mark the subvolumes named in a
+ * comma-separated list as decommissioned.
+ *
+ * @this:   the DHT xlator (used for logging)
+ * @conf:   DHT configuration whose decommissioned_bricks[] is updated
+ * @bricks: comma-separated subvolume names; not modified, a private copy
+ *          is tokenized
+ *
+ * Returns 0 when every name matched a subvolume, in which case
+ * conf->decommission_in_progress is set. Returns -1 when @conf or @bricks
+ * is NULL, when the copy cannot be allocated, or when a name matches no
+ * subvolume; names processed before the unknown one stay marked.
+ */
+static int
+dht_parse_decommissioned_bricks(xlator_t *this, dht_conf_t *conf,
+                                const char *bricks)
+{
+    int i = 0;
+    int ret = -1;
+    char *tmpstr = NULL;
+    char *dup_brick = NULL;
+    char *node = NULL;
+
+    if (!conf || !bricks)
+        goto out;
+
+    /* strtok_r() writes into its input, so work on a copy */
+    dup_brick = gf_strdup(bricks);
+    if (dup_brick == NULL) {
+        goto out;
+    }
+
+    node = strtok_r(dup_brick, ",", &tmpstr);
+    while (node) {
+        for (i = 0; i < conf->subvolume_cnt; i++) {
+            if (!strcmp(conf->subvolumes[i]->name, node)) {
+                /*
+                 * Count a slot only the first time it is marked. The
+                 * same name can show up twice in the list, and this
+                 * function runs again on every reconfigure; counting
+                 * repeats would leave decommission_subvols_cnt out of
+                 * step with dht_decommissioned_remove(), which
+                 * decrements once per marked slot.
+                 */
+                if (!conf->decommissioned_bricks[i]) {
+                    conf->decommissioned_bricks[i] = conf->subvolumes[i];
+                    conf->decommission_subvols_cnt++;
+                }
+                gf_msg(this->name, GF_LOG_INFO, 0,
+                       DHT_MSG_SUBVOL_DECOMMISSION_INFO,
+                       "decommissioning subvolume %s",
+                       conf->subvolumes[i]->name);
+                break;
+            }
+        }
+        if (i == conf->subvolume_cnt) {
+            /* Wrong node given. */
+            goto out;
+        }
+        node = strtok_r(NULL, ",", &tmpstr);
+    }
+
+    ret = 0;
+    conf->decommission_in_progress = 1;
+out:
+    GF_FREE(dup_brick);
+
+    return ret;
+}
+
+/*
+ * Clear every decommissioned-brick slot in @conf, decrementing
+ * conf->decommission_subvols_cnt once per slot that was set.
+ */
+static void
+dht_decommissioned_remove(xlator_t *this, dht_conf_t *conf)
+{
+    int idx = 0;
+
+    for (idx = 0; idx < conf->subvolume_cnt; idx++) {
+        if (!conf->decommissioned_bricks[idx])
+            continue;
+
+        conf->decommissioned_bricks[idx] = NULL;
+        conf->decommission_subvols_cnt--;
+    }
+}
+
+/*
+ * (Re)compile one of the hash regexes from the option dictionary.
+ *
+ * @this:     the DHT xlator (used for logging)
+ * @odict:    dictionary the option @name is looked up in
+ * @name:     option name
+ * @re:       regex storage to compile into
+ * @re_valid: tracks whether @re currently holds a compiled pattern
+ * @conf:     DHT configuration; conf->lock guards @re and @re_valid
+ *
+ * When the option is absent only "rsync-hash-regex" falls back to a
+ * built-in pattern; any other option is left untouched. The value "none"
+ * discards the current pattern without compiling a new one.
+ */
+static void
+dht_init_regex(xlator_t *this, dict_t *odict, char *name, regex_t *re,
+               gf_boolean_t *re_valid, dht_conf_t *conf)
+{
+    char *pattern = NULL;
+
+    if (dict_get_str(odict, name, &pattern) != 0) {
+        if (strcmp(name, "rsync-hash-regex") != 0) {
+            return;
+        }
+        pattern = "^\\.(.+)\\.[^.]+$";
+    }
+
+    LOCK(&conf->lock);
+    {
+        /* drop the previously compiled pattern, if any */
+        if (*re_valid) {
+            regfree(re);
+            *re_valid = _gf_false;
+        }
+
+        if (strcmp(pattern, "none") != 0) {
+            if (regcomp(re, pattern, REG_EXTENDED) != 0) {
+                gf_msg(this->name, GF_LOG_WARNING, 0, DHT_MSG_REGEX_INFO,
+                       "compiling regex %s failed", pattern);
+            } else {
+                gf_msg_debug(this->name, 0, "using regex %s = %s", name,
+                             pattern);
+                *re_valid = _gf_true;
+            }
+        }
+    }
+    UNLOCK(&conf->lock);
+}
+
+/*
+ * Create conf->leaf_to_subvol and let glusterfs_reachable_leaves() fill it.
+ *
+ * Returns -1 when the xlator has no private conf or the dictionary cannot
+ * be created, otherwise the glusterfs_reachable_leaves() result.
+ */
+int
+dht_set_subvol_range(xlator_t *this)
+{
+    dht_conf_t *conf = this->private;
+
+    if (conf == NULL)
+        return -1;
+
+    conf->leaf_to_subvol = dict_new();
+    if (conf->leaf_to_subvol == NULL)
+        return -1;
+
+    return glusterfs_reachable_leaves(this, conf->leaf_to_subvol);
+}
+
+/*
+ * dht_configure_throttle - set conf->defrag->recon_thread_count from a
+ * throttle setting.
+ *
+ * @this:     the DHT xlator (used for logging)
+ * @conf:     DHT configuration; conf->defrag is dereferenced
+ *            unconditionally, so callers must only call this when it is set
+ * @temp_str: "lazy", "normal", "aggressive" (case-insensitive) or a
+ *            number in the range 1..MAX_REBAL_THREADS
+ *
+ * Returns 0 when the thread count was updated and -1 for an out-of-range
+ * number or an unrecognised string; in both failure cases
+ * recon_thread_count is left unchanged.
+ */
+static int
+dht_configure_throttle(xlator_t *this, dht_conf_t *conf, char *temp_str)
+{
+    int rebal_thread_count = 0;
+    int ret = 0;
+
+    /* recon_thread_count is only written while dfq_mutex is held */
+    pthread_mutex_lock(&conf->defrag->dfq_mutex);
+    {
+        if (!strcasecmp(temp_str, "lazy")) {
+            conf->defrag->recon_thread_count = 1;
+        } else if (!strcasecmp(temp_str, "normal")) {
+            conf->defrag->recon_thread_count = 2;
+        } else if (!strcasecmp(temp_str, "aggressive")) {
+            /* never fewer than 4 threads in aggressive mode */
+            conf->defrag->recon_thread_count = MAX(MAX_REBAL_THREADS - 4, 4);
+        } else if ((gf_string2int(temp_str, &rebal_thread_count) == 0)) {
+            /* numeric setting: both branches unlock before logging and
+             * jump past the common unlock below */
+            if ((rebal_thread_count > 0) &&
+                (rebal_thread_count <= MAX_REBAL_THREADS)) {
+                conf->defrag->recon_thread_count = rebal_thread_count;
+                pthread_mutex_unlock(&conf->defrag->dfq_mutex);
+                gf_msg(this->name, GF_LOG_INFO, 0, 0,
+                       "rebal thread count configured to %d",
+                       rebal_thread_count);
+                goto out;
+            } else {
+                pthread_mutex_unlock(&conf->defrag->dfq_mutex);
+                gf_msg(this->name, GF_LOG_ERROR, 0, DHT_MSG_INVALID_OPTION,
+                       "Invalid option: Reconfigure: "
+                       "rebal-throttle should be "
+                       "within range of 0 and maximum number of"
+                       " cores available");
+                ret = -1;
+                goto out;
+            }
+        } else {
+            /* neither a keyword nor a number; logged with the mutex still
+             * held, released by the common unlock below */
+            gf_msg(this->name, GF_LOG_ERROR, 0, DHT_MSG_INVALID_OPTION,
+                   "Invalid option: Reconfigure: "
+                   "rebal-throttle should be {lazy|normal|aggressive}"
+                   " or a number up to the number of cores available,"
+                   " not (%s), defaulting to (%d)",
+                   temp_str, conf->dthrottle);
+            ret = -1;
+        }
+    }
+    /* reached by the keyword branches and the unrecognised-string branch */
+    pthread_mutex_unlock(&conf->defrag->dfq_mutex);
+
+out:
+    return ret;
+}
+
+/*
+ * dht_reconfigure - apply changed volume options to a live DHT instance.
+ *
+ * @this:    the DHT xlator
+ * @options: dictionary holding the new option values
+ *
+ * Returns 0 on success (also when the xlator has no private conf) and -1
+ * as soon as one option fails to validate. There is no rollback: options
+ * applied before the failing one stay applied.
+ */
+int
+dht_reconfigure(xlator_t *this, dict_t *options)
+{
+    dht_conf_t *conf = NULL;
+    char *temp_str = NULL;
+    gf_boolean_t search_unhashed;
+    /*
+     * ret stays -1 until every option has been processed. Helper results
+     * below are tested directly rather than stored in ret, so a later
+     * failure that jumps to out can never be reported as success.
+     * NOTE(review): assumes GF_OPTION_RECONF jumps to its label without
+     * assigning ret -- confirm against the macro definition.
+     */
+    int ret = -1;
+
+    GF_VALIDATE_OR_GOTO("dht", this, out);
+    GF_VALIDATE_OR_GOTO("dht", options, out);
+
+    conf = this->private;
+    if (!conf)
+        return 0;
+
+    if (dict_get_str(options, "lookup-unhashed", &temp_str) == 0) {
+        /* If option is not "auto", other options _should_ be boolean*/
+        if (strcasecmp(temp_str, "auto")) {
+            if (!gf_string2boolean(temp_str, &search_unhashed)) {
+                gf_msg_debug(this->name, 0,
+                             "Reconfigure: "
+                             "lookup-unhashed reconfigured(%s)",
+                             temp_str);
+                conf->search_unhashed = search_unhashed;
+            } else {
+                gf_msg(this->name, GF_LOG_ERROR, 0, DHT_MSG_INVALID_OPTION,
+                       "Invalid option: Reconfigure: "
+                       "lookup-unhashed should be boolean,"
+                       " not (%s), defaulting to (%d)",
+                       temp_str, conf->search_unhashed);
+                goto out;
+            }
+        } else {
+            gf_msg_debug(this->name, 0,
+                         "Reconfigure:"
+                         " lookup-unhashed reconfigured auto ");
+            conf->search_unhashed = GF_DHT_LOOKUP_UNHASHED_AUTO;
+        }
+    }
+
+    GF_OPTION_RECONF("lookup-optimize", conf->lookup_optimize, options, bool,
+                     out);
+
+    GF_OPTION_RECONF("min-free-disk", conf->min_free_disk, options,
+                     percent_or_size, out);
+    /* option can be any one of percent or bytes */
+    conf->disk_unit = 0;
+    if (conf->min_free_disk < 100.0)
+        conf->disk_unit = 'p';
+
+    GF_OPTION_RECONF("min-free-inodes", conf->min_free_inodes, options, percent,
+                     out);
+
+    GF_OPTION_RECONF("directory-layout-spread", conf->dir_spread_cnt, options,
+                     uint32, out);
+
+    GF_OPTION_RECONF("readdir-optimize", conf->readdir_optimize, options, bool,
+                     out);
+    GF_OPTION_RECONF("randomize-hash-range-by-gfid", conf->randomize_by_gfid,
+                     options, bool, out);
+
+    GF_OPTION_RECONF("lock-migration", conf->lock_migration_enabled, options,
+                     bool, out);
+
+    GF_OPTION_RECONF("force-migration", conf->force_migration, options, bool,
+                     out);
+
+    /* options that only exist while defrag state is attached */
+    if (conf->defrag) {
+        if ((dict_get_str(options, "rebal-throttle", &temp_str) == 0) &&
+            (dht_configure_throttle(this, conf, temp_str) == -1))
+            goto out;
+
+        /* mirror the value reconfigured above into the defrag state */
+        conf->defrag->lock_migration_enabled = conf->lock_migration_enabled;
+
+        GF_OPTION_RECONF("rebalance-stats", conf->defrag->stats, options, bool,
+                         out);
+    }
+
+    /* an absent option means no brick is decommissioned any more */
+    if (dict_get_str(options, "decommissioned-bricks", &temp_str) == 0) {
+        if (dht_parse_decommissioned_bricks(this, conf, temp_str) == -1)
+            goto out;
+    } else {
+        dht_decommissioned_remove(this, conf);
+    }
+
+    dht_init_regex(this, options, "rsync-hash-regex", &conf->rsync_regex,
+                   &conf->rsync_regex_valid, conf);
+    dht_init_regex(this, options, "extra-hash-regex", &conf->extra_regex,
+                   &conf->extra_regex_valid, conf);
+
+    GF_OPTION_RECONF("weighted-rebalance", conf->do_weighting, options, bool,
+                     out);
+
+    GF_OPTION_RECONF("use-readdirp", conf->use_readdirp, options, bool, out);
+    ret = 0;
+out:
+    return ret;
+}
+
+/*
+ * gf_defrag_pattern_list_fill - parse a rebalance filter string into
+ * defrag->defrag_pattern.
+ *
+ * @this:   the DHT xlator (used for logging)
+ * @defrag: defrag state; each parsed entry is pushed onto the head of
+ *          defrag->defrag_pattern
+ * @data:   comma-separated "pattern[:size]" entries, for example
+ *          "*avi, *pdf:10MB, *:1TB"; tokenized in place, so the caller's
+ *          buffer is modified
+ *
+ * An entry consisting only of a size is stored with the pattern "*".
+ *
+ * Returns 0 when every entry was parsed and -1 on a NULL argument, an
+ * allocation failure, an empty pattern, an unparsable size or a pattern
+ * too long for path_pattern. Entries linked before a failure stay on the
+ * list.
+ */
+static int
+gf_defrag_pattern_list_fill(xlator_t *this, gf_defrag_info_t *defrag,
+                            char *data)
+{
+    int ret = -1;
+    size_t pattern_len = 0;
+    char *tmp_str = NULL;
+    char *tmp_str1 = NULL;
+    char *dup_str = NULL;
+    char *num = NULL;
+    char *pattern_str = NULL;
+    char *pattern = NULL;
+    gf_defrag_pattern_list_t *pattern_list = NULL;
+
+    if (!this || !defrag || !data)
+        goto out;
+
+    /* Get the pattern for pattern list. "pattern:<optional-size>"
+     * eg: *avi, *pdf:10MB, *:1TB
+     */
+    pattern_str = strtok_r(data, ",", &tmp_str);
+    while (pattern_str) {
+        /* the inner tokenizer needs its own writable copy of the entry */
+        dup_str = gf_strdup(pattern_str);
+        if (!dup_str)
+            goto out;
+        pattern_list = GF_CALLOC(1, sizeof(gf_defrag_pattern_list_t), 1);
+        if (!pattern_list) {
+            goto out;
+        }
+        pattern = strtok_r(dup_str, ":", &tmp_str1);
+        num = strtok_r(NULL, ":", &tmp_str1);
+        if (!pattern)
+            goto out;
+        if (!num) {
+            /* a lone size means "every file of at least this size" */
+            if (gf_string2bytesize_uint64(pattern, &pattern_list->size) == 0) {
+                pattern = "*";
+            }
+        } else if (gf_string2bytesize_uint64(num, &pattern_list->size) != 0) {
+            gf_msg(this->name, GF_LOG_ERROR, 0, DHT_MSG_INVALID_OPTION,
+                   "Invalid option. Defrag pattern:"
+                   " Invalid number format \"%s\"",
+                   num);
+            goto out;
+        }
+
+        /*
+         * Copy exactly the pattern, terminator included. The length must
+         * come from pattern itself: it may point at the "*" literal,
+         * which is shorter than dup_str, and it must fit the destination
+         * buffer.
+         */
+        pattern_len = strlen(pattern);
+        if (pattern_len >= sizeof(pattern_list->path_pattern)) {
+            gf_msg(this->name, GF_LOG_ERROR, 0, DHT_MSG_INVALID_OPTION,
+                   "Invalid option. Defrag pattern:"
+                   " pattern \"%s\" is too long",
+                   pattern);
+            goto out;
+        }
+        memcpy(pattern_list->path_pattern, pattern, pattern_len + 1);
+
+        /* push onto the head of the list; ownership moves to defrag */
+        pattern_list->next = defrag->defrag_pattern;
+        defrag->defrag_pattern = pattern_list;
+        pattern_list = NULL;
+
+        GF_FREE(dup_str);
+        dup_str = NULL;
+
+        pattern_str = strtok_r(NULL, ",", &tmp_str);
+    }
+
+    ret = 0;
+out:
+    /* only an entry that was never linked into the list is freed here */
+    if (ret)
+        GF_FREE(pattern_list);
+    GF_FREE(dup_str);
+
+    return ret;
+}
+
+/*
+ * Fill conf->methods with the DHT implementations of the pluggable
+ * operations; migration_other is left unset.
+ *
+ * Returns 0 on success, -1 when @this is NULL.
+ */
+static int
+dht_init_methods(xlator_t *this)
+{
+    dht_methods_t *methods = NULL;
+    dht_conf_t *conf = NULL;
+    int ret = -1;
+
+    GF_VALIDATE_OR_GOTO("dht", this, err);
+
+    conf = this->private;
+    methods = &conf->methods;
+
+    methods->layout_search = dht_layout_search;
+    methods->migration_other = NULL;
+    methods->migration_get_dst_subvol = dht_migration_get_dst_subvol;
+
+    ret = 0;
+err:
+    return ret;
+}
+
+/*
+ * dht_init - build the private configuration of a DHT xlator from
+ * this->options.
+ *
+ * @this: the DHT xlator being initialised
+ *
+ * Defrag state is created only when the "rebalance-cmd" option is
+ * non-zero. On success the configuration is published in this->private.
+ *
+ * Returns 0 on success and -1 on failure. On failure the memory listed in
+ * the err path is released and this->private is not left pointing at the
+ * freed configuration.
+ */
+int
+dht_init(xlator_t *this)
+{
+    dht_conf_t *conf = NULL;
+    char *temp_str = NULL;
+    int ret = -1;
+    int i = 0;
+    gf_defrag_info_t *defrag = NULL;
+    int cmd = 0;
+    char *node_uuid = NULL;
+    uint32_t commit_hash = 0;
+
+    GF_VALIDATE_OR_GOTO("dht", this, err);
+
+    if (!this->children) {
+        gf_msg(this->name, GF_LOG_CRITICAL, 0, DHT_MSG_INVALID_CONFIGURATION,
+               "Distribute needs more than one subvolume");
+        return -1;
+    }
+
+    if (!this->parents) {
+        gf_msg(this->name, GF_LOG_WARNING, 0, DHT_MSG_INVALID_CONFIGURATION,
+               "dangling volume. check volfile");
+    }
+
+    conf = GF_CALLOC(1, sizeof(*conf), gf_dht_mt_dht_conf_t);
+    if (!conf) {
+        goto err;
+    }
+
+    LOCK_INIT(&conf->subvolume_lock);
+    LOCK_INIT(&conf->layout_lock);
+    LOCK_INIT(&conf->lock);
+    synclock_init(&conf->link_lock, SYNC_LOCK_DEFAULT);
+
+    /* We get the commit-hash to set only for rebalance process */
+    if (dict_get_uint32(this->options, "commit-hash", &commit_hash) == 0) {
+        gf_msg(this->name, GF_LOG_INFO, 0, DHT_MSG_COMMIT_HASH_INFO,
+               "%s using commit hash %u", __func__, commit_hash);
+        conf->vol_commit_hash = commit_hash;
+        conf->vch_forced = _gf_true;
+    }
+
+    /* cmd stays 0 when the option is absent, so the result is not checked */
+    ret = dict_get_int32(this->options, "rebalance-cmd", &cmd);
+
+    if (cmd) {
+        defrag = GF_CALLOC(1, sizeof(gf_defrag_info_t), gf_defrag_info_mt);
+
+        GF_VALIDATE_OR_GOTO(this->name, defrag, err);
+
+        LOCK_INIT(&defrag->lock);
+
+        defrag->is_exiting = 0;
+
+        /* attach early so the err path frees it through conf->defrag */
+        conf->defrag = defrag;
+        defrag->this = this;
+
+        ret = dict_get_str(this->options, "node-uuid", &node_uuid);
+        if (ret) {
+            gf_msg(this->name, GF_LOG_ERROR, 0, DHT_MSG_INVALID_CONFIGURATION,
+                   "Invalid volume configuration: "
+                   "node-uuid not specified");
+            goto err;
+        }
+
+        if (gf_uuid_parse(node_uuid, defrag->node_uuid)) {
+            gf_msg(this->name, GF_LOG_ERROR, 0, DHT_MSG_INVALID_OPTION,
+                   "Invalid option:"
+                   " Cannot parse glusterd node uuid");
+            goto err;
+        }
+
+        defrag->cmd = cmd;
+
+        defrag->stats = _gf_false;
+
+        defrag->queue = NULL;
+
+        defrag->crawl_done = 0;
+
+        defrag->global_error = 0;
+
+        defrag->q_entry_count = 0;
+
+        defrag->wakeup_crawler = 0;
+
+        pthread_mutex_init(&defrag->dfq_mutex, 0);
+        pthread_cond_init(&defrag->parallel_migration_cond, 0);
+        pthread_cond_init(&defrag->rebalance_crawler_alarm, 0);
+        pthread_cond_init(&defrag->df_wakeup_thread, 0);
+
+        pthread_mutex_init(&defrag->fc_mutex, 0);
+        pthread_cond_init(&defrag->fc_wakeup_cond, 0);
+    }
+
+    conf->use_fallocate = 1;
+
+    conf->search_unhashed = GF_DHT_LOOKUP_UNHASHED_ON;
+    if (dict_get_str(this->options, "lookup-unhashed", &temp_str) == 0) {
+        /* If option is not "auto", other options _should_ be boolean */
+        if (strcasecmp(temp_str, "auto")) {
+            gf_boolean_t search_unhashed_bool;
+            ret = gf_string2boolean(temp_str, &search_unhashed_bool);
+            if (ret == -1) {
+                goto err;
+            }
+            conf->search_unhashed = search_unhashed_bool
+                                        ? GF_DHT_LOOKUP_UNHASHED_ON
+                                        : GF_DHT_LOOKUP_UNHASHED_OFF;
+        } else {
+            conf->search_unhashed = GF_DHT_LOOKUP_UNHASHED_AUTO;
+        }
+    }
+
+    GF_OPTION_INIT("lookup-optimize", conf->lookup_optimize, bool, err);
+
+    GF_OPTION_INIT("unhashed-sticky-bit", conf->unhashed_sticky_bit, bool, err);
+
+    GF_OPTION_INIT("use-readdirp", conf->use_readdirp, bool, err);
+
+    GF_OPTION_INIT("min-free-disk", conf->min_free_disk, percent_or_size, err);
+
+    GF_OPTION_INIT("min-free-inodes", conf->min_free_inodes, percent, err);
+
+    /* NOTE(review): subvolume_cnt is read before dht_init_subvolumes() is
+     * called further down -- confirm this default is intended */
+    conf->dir_spread_cnt = conf->subvolume_cnt;
+    GF_OPTION_INIT("directory-layout-spread", conf->dir_spread_cnt, uint32,
+                   err);
+
+    GF_OPTION_INIT("assert-no-child-down", conf->assert_no_child_down, bool,
+                   err);
+
+    GF_OPTION_INIT("readdir-optimize", conf->readdir_optimize, bool, err);
+
+    GF_OPTION_INIT("lock-migration", conf->lock_migration_enabled, bool, err);
+
+    GF_OPTION_INIT("force-migration", conf->force_migration, bool, err);
+
+    if (defrag) {
+        defrag->lock_migration_enabled = conf->lock_migration_enabled;
+
+        GF_OPTION_INIT("rebalance-stats", defrag->stats, bool, err);
+        if (dict_get_str(this->options, "rebalance-filter", &temp_str) == 0) {
+            if (gf_defrag_pattern_list_fill(this, defrag, temp_str) == -1) {
+                gf_msg(this->name, GF_LOG_ERROR, 0, DHT_MSG_INVALID_OPTION,
+                       "Invalid option:"
+                       " Cannot parse rebalance-filter (%s)",
+                       temp_str);
+
+                goto err;
+            }
+        }
+    }
+
+    /* option can be any one of percent or bytes */
+    conf->disk_unit = 0;
+    if (conf->min_free_disk < 100)
+        conf->disk_unit = 'p';
+
+    ret = dht_init_subvolumes(this, conf);
+    if (ret == -1) {
+        goto err;
+    }
+
+    if (cmd) {
+        ret = dht_init_local_subvolumes(this, conf);
+        if (ret) {
+            gf_msg(this->name, GF_LOG_ERROR, 0,
+                   DHT_MSG_INIT_LOCAL_SUBVOL_FAILED,
+                   "dht_init_local_subvolumes failed");
+            goto err;
+        }
+    }
+
+    if (dict_get_str(this->options, "decommissioned-bricks", &temp_str) == 0) {
+        ret = dht_parse_decommissioned_bricks(this, conf, temp_str);
+        if (ret == -1)
+            goto err;
+    }
+
+    dht_init_regex(this, this->options, "rsync-hash-regex", &conf->rsync_regex,
+                   &conf->rsync_regex_valid, conf);
+    dht_init_regex(this, this->options, "extra-hash-regex", &conf->extra_regex,
+                   &conf->extra_regex_valid, conf);
+
+    ret = dht_layouts_init(this, conf);
+    if (ret == -1) {
+        goto err;
+    }
+
+    conf->gen = 1;
+
+    this->local_pool = mem_pool_new(dht_local_t, 512);
+    if (!this->local_pool) {
+        gf_msg(this->name, GF_LOG_ERROR, ENOMEM, DHT_MSG_NO_MEMORY,
+               " DHT initialisation failed. "
+               "failed to create local_t's memory pool");
+        goto err;
+    }
+
+    GF_OPTION_INIT("randomize-hash-range-by-gfid", conf->randomize_by_gfid,
+                   bool, err);
+
+    if (defrag) {
+        GF_OPTION_INIT("rebal-throttle", temp_str, str, err);
+        if (temp_str) {
+            ret = dht_configure_throttle(this, conf, temp_str);
+            if (ret == -1)
+                goto err;
+        }
+    }
+
+    /* every derived xattr name is built from the configured base name */
+    GF_OPTION_INIT("xattr-name", conf->xattr_name, str, err);
+    gf_asprintf(&conf->mds_xattr_key, "%s." DHT_MDS_STR, conf->xattr_name);
+    gf_asprintf(&conf->link_xattr_name, "%s." DHT_LINKFILE_STR,
+                conf->xattr_name);
+    gf_asprintf(&conf->commithash_xattr_name, "%s." DHT_COMMITHASH_STR,
+                conf->xattr_name);
+    gf_asprintf(&conf->wild_xattr_name, "%s*", conf->xattr_name);
+    /* conf was zero-allocated, so a name that is still NULL was not built */
+    if (!conf->mds_xattr_key || !conf->link_xattr_name ||
+        !conf->commithash_xattr_name || !conf->wild_xattr_name) {
+        goto err;
+    }
+
+    GF_OPTION_INIT("weighted-rebalance", conf->do_weighting, bool, err);
+
+    conf->lock_pool = mem_pool_new(dht_lock_t, 512);
+    if (!conf->lock_pool) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, DHT_MSG_INIT_FAILED,
+               "failed to create lock mem_pool, failing "
+               "initialization");
+        goto err;
+    }
+
+    /* the two helpers below read the configuration from this->private */
+    this->private = conf;
+
+    if (dht_set_subvol_range(this))
+        goto err;
+
+    if (dht_init_methods(this))
+        goto err;
+
+    return 0;
+
+err:
+    if (conf) {
+        /* conf may already have been published above; never leave
+         * this->private pointing at memory that is freed below */
+        if (this->private == conf)
+            this->private = NULL;
+
+        if (conf->file_layouts) {
+            for (i = 0; i < conf->subvolume_cnt; i++) {
+                GF_FREE(conf->file_layouts[i]);
+            }
+            GF_FREE(conf->file_layouts);
+        }
+
+        /* created by dht_set_subvol_range() */
+        if (conf->leaf_to_subvol)
+            dict_unref(conf->leaf_to_subvol);
+
+        GF_FREE(conf->subvolumes);
+
+        GF_FREE(conf->subvolume_status);
+
+        GF_FREE(conf->du_stats);
+
+        GF_FREE(conf->defrag);
+
+        /* NOTE(review): xattr_name is filled by GF_OPTION_INIT(.., str, ..)
+         * and dht_fini() does not free it -- confirm it is heap memory
+         * owned by conf before relying on this free */
+        GF_FREE(conf->xattr_name);
+        GF_FREE(conf->link_xattr_name);
+        GF_FREE(conf->commithash_xattr_name);
+        GF_FREE(conf->wild_xattr_name);
+        GF_FREE(conf->mds_xattr_key);
+
+        if (conf->lock_pool)
+            mem_pool_destroy(conf->lock_pool);
+
+        GF_FREE(conf);
+    }
+
+    return -1;
+}
+
+struct volume_options dht_options[] = {
+    {
+        .key = {"lookup-unhashed"},
+        .value = {"auto", "yes", "no", "enable", "disable", "1", "0", "on",
+                  "off"},
+        .type = GF_OPTION_TYPE_STR,
+        .default_value = "on",
+        .description =
+            "This option if set to ON, does a lookup through "
+            "all the sub-volumes, in case a lookup didn't return any result "
+            "from the hash subvolume. If set to OFF, it does not do a lookup "
+            "on the remaining subvolumes.",
+        .op_version = {1},
+        .flags = OPT_FLAG_CLIENT_OPT | OPT_FLAG_SETTABLE,
+        .level = OPT_STATUS_BASIC,
+    },
+    {.key = {"lookup-optimize"},
+     .type = GF_OPTION_TYPE_BOOL,
+     .default_value = "on",
+     .description =
+         "This option if set to ON enables the optimization "
+         "of -ve lookups, by not doing a lookup on non-hashed subvolumes for "
+         "files, in case the hashed subvolume does not return any result. "
+         "This option disregards the lookup-unhashed setting, when enabled.",
+     .op_version = {GD_OP_VERSION_3_7_2},
+     .level = OPT_STATUS_ADVANCED,
+     .flags = OPT_FLAG_CLIENT_OPT | OPT_FLAG_SETTABLE | OPT_FLAG_DOC},
+    {.key = {"min-free-disk"},
+     .type = GF_OPTION_TYPE_PERCENT_OR_SIZET,
+     .default_value = "10%",
+     .description =
+         "Percentage/Size of disk space, after which the "
+         "process starts balancing out the cluster, and logs will appear "
+         "in log files",
+     .op_version = {1},
+     .level = OPT_STATUS_BASIC,
+     .flags = OPT_FLAG_CLIENT_OPT | OPT_FLAG_SETTABLE | OPT_FLAG_DOC},
+    {.key = {"min-free-inodes"},
+     .type = GF_OPTION_TYPE_PERCENT,
+     .default_value = "5%",
+     .description = "after system has only N% of inodes, warnings "
+                    "starts to appear in log files",
+     .op_version = {1},
+     .level = OPT_STATUS_BASIC,
+     .flags = OPT_FLAG_CLIENT_OPT | OPT_FLAG_SETTABLE | OPT_FLAG_DOC},
+    {
+        .key = {"unhashed-sticky-bit"},
+        .type = GF_OPTION_TYPE_BOOL,
+        .default_value = "off",
+    },
+    {.key = {"use-readdirp"},
+     .type = GF_OPTION_TYPE_BOOL,
+     .default_value = "on",
+     .description = "This option if set to ON, forces the use of "
+                    "readdirp, and hence also displays the stats of the files.",
+     .level = OPT_STATUS_ADVANCED,
+     .flags = OPT_FLAG_CLIENT_OPT | OPT_FLAG_SETTABLE | OPT_FLAG_DOC},
+    {.key = {"assert-no-child-down"},
+     .type = GF_OPTION_TYPE_BOOL,
+     .default_value = "off",
+     .description = "This option if set to ON, in the event of "
+                    "CHILD_DOWN, will call exit."},
+    {
+        .key = {"directory-layout-spread"},
+        .type = GF_OPTION_TYPE_INT,
+        .min = 1,
+        .validate = GF_OPT_VALIDATE_MIN,
+        .description = "Specifies the directory layout spread. Takes number "
+                       "of subvolumes as default value.",
+
+        .op_version = {2},
+    },
+    {
+        .key = {"decommissioned-bricks"},
+        .type = GF_OPTION_TYPE_ANY,
+        .description =
+            "This option if set to ON, decommissions "
+            "the brick, so that no new data is allowed to be created "
+            "on that brick.",
+        .level = OPT_STATUS_ADVANCED,
+    },
+    {
+        .key = {"rebalance-cmd"},
+        .type = GF_OPTION_TYPE_INT,
+    },
+    {
+        .key = {"commit-hash"},
+        .type = GF_OPTION_TYPE_INT,
+    },
+    {
+        .key = {"node-uuid"},
+        .type = GF_OPTION_TYPE_STR,
+    },
+    {
+        .key = {"rebalance-stats"},
+        .type = GF_OPTION_TYPE_BOOL,
+        .default_value = "off",
+        .description =
+            "This option if set to ON displays and logs the "
+            " time taken for migration of each file, during the rebalance "
+            "process. If set to OFF, the rebalance logs will only display the "
+            "time spent in each directory.",
+        .op_version = {2},
+        .level = OPT_STATUS_BASIC,
+    },
+    {.key = {"readdir-optimize"},
+     .type = GF_OPTION_TYPE_BOOL,
+     .default_value = "off",
+     .description =
+         "This option if set to ON enables the optimization "
+         "that allows DHT to requests non-first subvolumes to filter out "
+         "directory entries.",
+     .op_version = {1},
+     .level = OPT_STATUS_ADVANCED,
+     .flags = OPT_FLAG_CLIENT_OPT | OPT_FLAG_SETTABLE | OPT_FLAG_DOC},
+    {.key = {"rsync-hash-regex"},
+     .type = GF_OPTION_TYPE_STR,
+     /* Setting a default here doesn't work.  See dht_init_regex. */
+     .description =
+         "Regular expression for stripping temporary-file "
+         "suffix and prefix used by rsync, to prevent relocation when the "
+         "file is renamed.",
+     .op_version = {3},
+     .level = OPT_STATUS_BASIC,
+     .flags = OPT_FLAG_CLIENT_OPT | OPT_FLAG_SETTABLE | OPT_FLAG_DOC},
+    {.key = {"extra-hash-regex"},
+     .type = GF_OPTION_TYPE_STR,
+     /* Setting a default here doesn't work.  See dht_init_regex. */
+     .description =
+         "Regular expression for stripping temporary-file "
+         "suffix and prefix used by an application, to prevent relocation when "
+         "the file is renamed.",
+     .op_version = {3},
+     .level = OPT_STATUS_BASIC,
+     .flags = OPT_FLAG_CLIENT_OPT | OPT_FLAG_SETTABLE | OPT_FLAG_DOC},
+    {
+        .key = {"rebalance-filter"},
+        .type = GF_OPTION_TYPE_STR,
+    },
+
+    {
+        .key = {"xattr-name"},
+        .type = GF_OPTION_TYPE_STR,
+        .default_value = "trusted.glusterfs.dht",
+        .description =
+            "Base for extended attributes used by this "
+            "translator instance, to avoid conflicts with others above or "
+            "below it.",
+        .op_version = {3},
+    },
+
+    {.key = {"weighted-rebalance"},
+     .type = GF_OPTION_TYPE_BOOL,
+     .default_value = "on",
+     .description =
+         "When enabled, files will be allocated to bricks "
+         "with a probability proportional to their size.  Otherwise, all "
+         "bricks will have the same probability (legacy behavior).",
+     .op_version = {GD_OP_VERSION_3_6_0},
+     .level = OPT_STATUS_BASIC,
+     .flags = OPT_FLAG_CLIENT_OPT | OPT_FLAG_SETTABLE | OPT_FLAG_DOC},
+
+    /* NUFA option */
+    {.key = {"local-volume-name"}, .type = GF_OPTION_TYPE_XLATOR},
+
+    /* switch option */
+    {.key = {"pattern.switch.case"}, .type = GF_OPTION_TYPE_ANY},
+
+    {
+        .key = {"randomize-hash-range-by-gfid"},
+        .type = GF_OPTION_TYPE_BOOL,
+        .default_value = "off",
+        .description =
+            "Use gfid of directory to determine the subvolume "
+            "from which hash ranges are allocated starting with 0. "
+            "Note that we still use a directory/file's name to determine the "
+            "subvolume to which it hashes",
+        .op_version = {GD_OP_VERSION_3_6_0},
+    },
+
+    {.key = {"rebal-throttle"},
+     .type = GF_OPTION_TYPE_STR,
+     .default_value = "normal",
+     .description = " Sets the maximum number of parallel file migrations "
+                    "allowed on a node during the rebalance operation. The"
+                    " default value is normal and allows a max of "
+                    "[($(processing units) - 4) / 2), 2]  files to be "
+                    "migrated at a time. Lazy will allow only one file to "
+                    "be migrated at a time and aggressive will allow "
+                    "max of [($(processing units) - 4) / 2), 4]",
+     .op_version = {GD_OP_VERSION_3_7_0},
+     .level = OPT_STATUS_BASIC,
+     .flags = OPT_FLAG_CLIENT_OPT | OPT_FLAG_SETTABLE | OPT_FLAG_DOC
+
+    },
+
+    {.key = {"lock-migration"},
+     .type = GF_OPTION_TYPE_BOOL,
+     .default_value = "off",
+     .description = " If enabled this feature will migrate the posix locks"
+                    " associated with a file during rebalance",
+     .op_version = {GD_OP_VERSION_3_8_0},
+     .level = OPT_STATUS_ADVANCED,
+     .flags = OPT_FLAG_CLIENT_OPT | OPT_FLAG_SETTABLE | OPT_FLAG_DOC},
+
+    {.key = {"force-migration"},
+     .type = GF_OPTION_TYPE_BOOL,
+     .default_value = "off",
+     .description = "If disabled, rebalance will not migrate files that "
+                    "are being written to by an application",
+     .op_version = {GD_OP_VERSION_4_0_0},
+     .level = OPT_STATUS_ADVANCED,
+     .flags = OPT_FLAG_CLIENT_OPT | OPT_FLAG_SETTABLE | OPT_FLAG_DOC},
+
+    {.key = {NULL}},
+};
+
+#define NUM_DHT_OPTIONS (sizeof(dht_options) / sizeof(dht_options[0]))
+/* Entry count (terminator included); "options" below aliases dht_options. */
+extern struct volume_options options[NUM_DHT_OPTIONS]
+    __attribute__((alias("dht_options")));
diff --git a/xlators/cluster/dht/src/dht.c b/xlators/cluster/dht/src/dht.c
index 3be6312937c..53de8292704 100644
--- a/xlators/cluster/dht/src/dht.c
+++ b/xlators/cluster/dht/src/dht.c
@@ -1,222 +1,123 @@
 /*
-   Copyright (c) 2008-2009 Z RESEARCH, Inc. <http://www.zresearch.com>
-   This file is part of GlusterFS.
+  Copyright (c) 2008-2012 Red Hat, Inc. <http://www.redhat.com>
+  This file is part of GlusterFS.
 
-   GlusterFS is free software; you can redistribute it and/or modify
-   it under the terms of the GNU General Public License as published
-   by the Free Software Foundation; either version 3 of the License,
-   or (at your option) any later version.
-
-   GlusterFS is distributed in the hope that it will be useful, but
-   WITHOUT ANY WARRANTY; without even the implied warranty of
-   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
-   General Public License for more details.
-
-   You should have received a copy of the GNU General Public License
-   along with this program.  If not, see
-   <http://www.gnu.org/licenses/>.
+  This file is licensed to you under your choice of the GNU Lesser
+  General Public License, version 3 or any later version (LGPLv3 or
+  later), or the GNU General Public License, version 2 (GPLv2), in all
+  cases as published by the Free Software Foundation.
 */
 
-
-#ifndef _CONFIG_H
-#define _CONFIG_H
-#include "config.h"
-#endif
-
-/* TODO: add NS locking */
-
-#include "dht-common.c"
-
-/* TODO:
-   - use volumename in xattr instead of "dht"
-   - use NS locks
-   - handle all cases in self heal layout reconstruction
-   - complete linkfile selfheal
-*/
-
-
-
-int
-notify (xlator_t *this, int event, void *data, ...)
-{
-	int ret = -1;
-
-	ret = dht_notify (this, event, data);
-
-	return ret;
-}
-
-void
-fini (xlator_t *this)
-{
-        int         i = 0;
-        dht_conf_t *conf = NULL;
-
-	conf = this->private;
-
-        if (conf) {
-                if (conf->file_layouts) {
-                        for (i = 0; i < conf->subvolume_cnt; i++) {
-                                FREE (conf->file_layouts[i]);
-                        }
-                        FREE (conf->file_layouts);
-                }
-
-                if (conf->default_dir_layout)
-                        FREE (conf->default_dir_layout);
-
-                if (conf->subvolumes)
-                        FREE (conf->subvolumes);
-
-		if (conf->subvolume_status)
-			FREE (conf->subvolume_status);
-
-                FREE (conf);
-        }
-
-	return;
-}
-
-int
-init (xlator_t *this)
-{
-        dht_conf_t    *conf = NULL;
-	char          *lookup_unhashed_str = NULL;
-        int            ret = -1;
-        int            i = 0;
-
-	if (!this->children) {
-		gf_log (this->name, GF_LOG_ERROR,
-			"DHT needs more than one child defined");
-		return -1;
-	}
-  
-	if (!this->parents) {
-		gf_log (this->name, GF_LOG_WARNING,
-			"dangling volume. check volfile ");
-	}
-
-        conf = CALLOC (1, sizeof (*conf));
-        if (!conf) {
-                gf_log (this->name, GF_LOG_ERROR,
-                        "memory allocation failed :(");
-                goto err;
-        }
-
-	conf->search_unhashed = 0;
-
-	if (dict_get_str (this->options, "lookup-unhashed",
-			  &lookup_unhashed_str) == 0) {
-		gf_string2boolean (lookup_unhashed_str,
-				   &conf->search_unhashed);
-	}
-
-        ret = dht_init_subvolumes (this, conf);
-        if (ret == -1) {
-                goto err;
-        }
-
-        ret = dht_layouts_init (this, conf);
-        if (ret == -1) {
-                goto err;
-        }
-
-	LOCK_INIT (&conf->subvolume_lock);
-
-	conf->gen = 1;
-
-        this->private = conf;
-
-        return 0;
-
-err:
-        if (conf) {
-                if (conf->file_layouts) {
-                        for (i = 0; i < conf->subvolume_cnt; i++) {
-                                FREE (conf->file_layouts[i]);
-                        }
-                        FREE (conf->file_layouts);
-                }
-
-                if (conf->default_dir_layout)
-                        FREE (conf->default_dir_layout);
-
-                if (conf->subvolumes)
-                        FREE (conf->subvolumes);
-
-		if (conf->subvolume_status)
-			FREE (conf->subvolume_status);
-
-                FREE (conf);
-        }
-
-        return -1;
-}
-
+#include "dht-common.h"
+
+struct xlator_fops dht_pt_fops = {
+    /* Pass-through fop table (see .pass_through_fops in xlator_api).
+       mkdir is kept so that a new directory still gets a layout. */
+    .mkdir = dht_pt_mkdir,
+    .getxattr = dht_pt_getxattr,
+    .fgetxattr = dht_pt_fgetxattr,
+
+    /* required to trace fop properly in changelog */
+    .rename = dht_pt_rename,
+
+    /* FIXME: commenting out the '.lookup' entry below made
+       some of the failing tests pass. It could be removed,
+       but it is kept here as a reminder of what to check
+       if problems are found with the DHT pass-through
+       logic. */
+    /*
+      .lookup = dht_lookup,
+      .readdir = dht_readdir,
+      .readdirp = dht_readdirp,
+    */
+    /* The entries above stay commented out (rather than
+       deleted) mainly for the use case of a gluster volume
+       shrinking to 1x(anytype) after a remove-brick
+       exercise. In that case these fops would be needed
+       again to handle dangling linkto files (if any). */
+};
 
 struct xlator_fops fops = {
-	.lookup      = dht_lookup,
-	.mknod       = dht_mknod,
-	.create      = dht_create,
-
-	.stat        = dht_stat,
-	.chmod       = dht_chmod,
-	.chown       = dht_chown,
-	.fchown      = dht_fchown,
-	.fchmod      = dht_fchmod,
-	.fstat       = dht_fstat,
-	.utimens     = dht_utimens,
-	.truncate    = dht_truncate,
-	.ftruncate   = dht_ftruncate,
-	.access      = dht_access,
-	.readlink    = dht_readlink,
-	.setxattr    = dht_setxattr,
-	.getxattr    = dht_getxattr,
-	.removexattr = dht_removexattr,
-	.open        = dht_open,
-	.readv       = dht_readv,
-	.writev      = dht_writev,
-	.flush       = dht_flush,
-	.fsync       = dht_fsync,
-	.statfs      = dht_statfs,
-	.lk          = dht_lk,
-	.opendir     = dht_opendir,
-	.readdir     = dht_readdir,
-	.fsyncdir    = dht_fsyncdir,
-	.symlink     = dht_symlink,
-	.unlink      = dht_unlink,
-	.link        = dht_link,
-	.mkdir       = dht_mkdir,
-	.rmdir       = dht_rmdir,
-	.rename      = dht_rename,
-	.inodelk     = dht_inodelk,
-	.finodelk    = dht_finodelk,
-	.entrylk     = dht_entrylk,
-	.fentrylk    = dht_fentrylk,
-	.xattrop     = dht_xattrop,
-	.fxattrop    = dht_fxattrop,
-#if 0
-	.setdents    = dht_setdents,
-	.getdents    = dht_getdents,
-	.checksum    = dht_checksum,
-#endif
+    .ipc = dht_ipc,
+    .lookup = dht_lookup,
+    .mknod = dht_mknod,
+    .create = dht_create,
+
+    .open = dht_open,
+    .statfs = dht_statfs,
+    .opendir = dht_opendir,
+    .readdir = dht_readdir,
+    .readdirp = dht_readdirp,
+    .fsyncdir = dht_fsyncdir,
+    .symlink = dht_symlink,
+    .unlink = dht_unlink,
+    .link = dht_link,
+    .mkdir = dht_mkdir,
+    .rmdir = dht_rmdir,
+    .rename = dht_rename,
+    .entrylk = dht_entrylk,
+    .fentrylk = dht_fentrylk,
+
+    /* Inode read operations; flush, fsync and lock/lease fops live here too */
+    .stat = dht_stat,
+    .fstat = dht_fstat,
+    .access = dht_access,
+    .readlink = dht_readlink,
+    .getxattr = dht_getxattr,
+    .fgetxattr = dht_fgetxattr,
+    .readv = dht_readv,
+    .flush = dht_flush,
+    .fsync = dht_fsync,
+    .inodelk = dht_inodelk,
+    .finodelk = dht_finodelk,
+    .lk = dht_lk,
+    .lease = dht_lease,
+
+    /* Inode write operations */
+    .fremovexattr = dht_fremovexattr,
+    .removexattr = dht_removexattr,
+    .setxattr = dht_setxattr,
+    .fsetxattr = dht_fsetxattr,
+    .truncate = dht_truncate,
+    .ftruncate = dht_ftruncate,
+    .writev = dht_writev,
+    .xattrop = dht_xattrop,
+    .fxattrop = dht_fxattrop,
+    .setattr = dht_setattr,
+    .fsetattr = dht_fsetattr,
+    .fallocate = dht_fallocate,
+    .discard = dht_discard,
+    .zerofill = dht_zerofill,
 };
 
-
-struct xlator_mops mops = {
+struct xlator_dumpops dumpops = { /* registered through xlator_api.dumpops */
+    .priv = dht_priv_dump, /* NOTE(review): presumably private state - confirm */
+    .inodectx = dht_inodectx_dump, /* NOTE(review): presumably inode ctx - confirm */
 };
 
-
 struct xlator_cbks cbks = {
-//	.release    = dht_release,
-//      .releasedir = dht_releasedir,
-	.forget     = dht_forget
+    .release = dht_release,
+    /* NOTE(review): .releasedir = dht_releasedir is left disabled - confirm */
+    .forget = dht_forget,
 };
 
-
-struct volume_options options[] = {
-        { .key  = {"lookup-unhashed"}, 
-	  .type = GF_OPTION_TYPE_BOOL 
-	},
-	{ .key  = {NULL} },
+extern int32_t
+mem_acct_init(xlator_t *this); /* defined elsewhere; wired into xlator_api */
+
+extern struct volume_options dht_options[]; /* option table, defined elsewhere */
+
+xlator_api_t xlator_api = {
+    .init = dht_init,
+    .fini = dht_fini,
+    .notify = dht_notify,
+    .reconfigure = dht_reconfigure,
+    .mem_acct_init = mem_acct_init,
+    .op_version = {1}, /* Present from the initial version */
+    .dumpops = &dumpops, /* dump callbacks defined in this file */
+    .fops = &fops,       /* full fop table defined in this file */
+    .cbks = &cbks,       /* release/forget callbacks defined in this file */
+    .options = dht_options, /* extern option table declared in this file */
+    .identifier = "distribute",
+    .pass_through_fops = &dht_pt_fops, /* reduced table at the top of this file */
+    .category = GF_MAINTAINED,
 };
diff --git a/xlators/cluster/dht/src/nufa.c b/xlators/cluster/dht/src/nufa.c
index 544913411db..3648a564840 100644
--- a/xlators/cluster/dht/src/nufa.c
+++ b/xlators/cluster/dht/src/nufa.c
@@ -1,684 +1,657 @@
 /*
-   Copyright (c) 2008-2009 Z RESEARCH, Inc. <http://www.zresearch.com>
-   This file is part of GlusterFS.
-
-   GlusterFS is free software; you can redistribute it and/or modify
-   it under the terms of the GNU General Public License as published
-   by the Free Software Foundation; either version 3 of the License,
-   or (at your option) any later version.
-
-   GlusterFS is distributed in the hope that it will be useful, but
-   WITHOUT ANY WARRANTY; without even the implied warranty of
-   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
-   General Public License for more details.
-
-   You should have received a copy of the GNU General Public License
-   along with this program.  If not, see
-   <http://www.gnu.org/licenses/>.
-*/
-
+  Copyright (c) 2008-2012 Red Hat, Inc. <http://www.redhat.com>
+  This file is part of GlusterFS.
 
-#ifndef _CONFIG_H
-#define _CONFIG_H
-#include "config.h"
-#endif
+  This file is licensed to you under your choice of the GNU Lesser
+  General Public License, version 3 or any later version (LGPLv3 or
+  later), or the GNU General Public License, version 2 (GPLv2), in all
+  cases as published by the Free Software Foundation.
+*/
 
-#include "dht-common.c"
+#include "dht-common.h"
 
 /* TODO: all 'TODO's in dht.c holds good */
 
-int 
-nufa_local_lookup_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
-		       int op_ret, int op_errno,
-		       inode_t *inode, struct stat *stbuf, dict_t *xattr)
+extern struct volume_options dht_options[];
+
+int
+nufa_local_lookup_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+                      int op_ret, int op_errno, inode_t *inode,
+                      struct iatt *stbuf, dict_t *xattr,
+                      struct iatt *postparent)
 {
-	dht_layout_t *layout      = NULL;
-        xlator_t     *subvol      = NULL;
-        char          is_linkfile = 0;
-        char          is_dir      = 0;
-        dht_conf_t   *conf        = NULL;
-        dht_local_t  *local       = NULL;
-        loc_t        *loc         = NULL;
-        int           i           = 0;
-        call_frame_t *prev        = NULL;
-	int           call_cnt    = 0;
-
-
-        conf  = this->private;
-
-        prev  = cookie;
-        local = frame->local;
-        loc   = &local->loc;
-
-	if (ENTRY_MISSING (op_ret, op_errno)) {
-		if (conf->search_unhashed) {
-			local->op_errno = ENOENT;
-			dht_lookup_everywhere (frame, this, loc);
-			return 0;
-		}
-	}
-
-        if (op_ret == -1)
-                goto out;
-
-        is_linkfile = check_is_linkfile (inode, stbuf, xattr);
-        is_dir      = check_is_dir (inode, stbuf, xattr);
-
-        if (!is_dir && !is_linkfile) {
-                /* non-directory and not a linkfile */
-
-		dht_itransform (this, prev->this, stbuf->st_ino,
-				&stbuf->st_ino);
-
-		layout = dht_layout_for_subvol (this, prev->this);
-		if (!layout) {
-			gf_log (this->name, GF_LOG_ERROR,
-				"no pre-set layout for subvolume %s",
-				prev->this->name);
-			op_ret   = -1;
-			op_errno = EINVAL;
-			goto err;
-		}
-
-                inode_ctx_put (inode, this, (uint64_t)(long)layout);
-                goto out;
+    xlator_t *subvol = NULL;
+    char is_linkfile = 0;
+    char is_dir = 0;
+    dht_conf_t *conf = NULL;
+    dht_local_t *local = NULL;
+    loc_t *loc = NULL;
+    int i = 0;
+    xlator_t *prev = NULL;
+    int call_cnt = 0;
+    int ret = 0;
+
+    conf = this->private;
+
+    prev = cookie; /* subvolume this reply came from (the wind cookie) */
+    local = frame->local;
+    loc = &local->loc;
+
+    if (ENTRY_MISSING(op_ret, op_errno)) { /* not found on the local volume */
+        if (conf->search_unhashed) {
+            local->op_errno = ENOENT;
+            dht_lookup_everywhere(frame, this, loc);
+            return 0;
+        }
+    }
+
+    if (op_ret == -1)
+        goto out; /* other failures: retry via the hashed subvolume (out:) */
+
+    is_linkfile = check_is_linkfile(inode, stbuf, xattr, conf->link_xattr_name);
+    is_dir = check_is_dir(inode, stbuf, xattr);
+
+    if (!is_dir && !is_linkfile) {
+        /* non-directory and not a linkfile: preset the layout for prev */
+        ret = dht_layout_preset(this, prev, inode);
+        if (ret < 0) {
+            gf_msg_debug(this->name, 0,
+                         "could not set pre-set layout for subvol"
+                         " %s",
+                         prev->name);
+            op_ret = -1;
+            op_errno = EINVAL;
+            goto err; /* unwinds the lookup with EINVAL */
         }
 
-        if (is_dir) {
-                call_cnt        = conf->subvolume_cnt;
-		local->call_cnt = call_cnt;
-
-                local->inode = inode_ref (inode);
-                local->xattr = dict_ref (xattr);
-
-		local->op_ret = 0;
-		local->op_errno = 0;
-
-		local->layout = dht_layout_new (this, conf->subvolume_cnt);
-		if (!local->layout) {
-			op_ret   = -1;
-			op_errno = ENOMEM;
-			gf_log (this->name, GF_LOG_ERROR,
-				"memory allocation failed :(");
-			goto err;
-		}
-
-                for (i = 0; i < call_cnt; i++) {
-                        STACK_WIND (frame, dht_lookup_dir_cbk,
-                                    conf->subvolumes[i],
-                                    conf->subvolumes[i]->fops->lookup,
-                                    &local->loc, local->xattr_req);
-                }
+        goto out;
+    }
+
+    if (is_dir) {
+        call_cnt = conf->subvolume_cnt;
+        local->call_cnt = call_cnt;
+
+        local->inode = inode_ref(inode);
+        local->xattr = dict_ref(xattr);
+
+        local->op_ret = 0;
+        local->op_errno = 0;
+
+        local->layout = dht_layout_new(this, conf->subvolume_cnt);
+        if (!local->layout) {
+            op_ret = -1;
+            op_errno = ENOMEM;
+            goto err;
         }
 
-        if (is_linkfile) {
-                subvol = dht_linkfile_subvol (this, inode, stbuf, xattr);
+        for (i = 0; i < call_cnt; i++) {
+            STACK_WIND_COOKIE(frame, dht_lookup_dir_cbk, conf->subvolumes[i],
+                              conf->subvolumes[i],
+                              conf->subvolumes[i]->fops->lookup, &local->loc,
+                              local->xattr_req);
+        }
+    }
 
-                if (!subvol) {
-                        gf_log (this->name, GF_LOG_WARNING,
-                                "linkfile not having link subvolume. path=%s",
-                                loc->path);
-			dht_lookup_everywhere (frame, this, loc);
-			return 0;
-                }
+    if (is_linkfile) {
+        subvol = dht_linkfile_subvol(this, inode, stbuf, xattr);
 
-		STACK_WIND (frame, dht_lookup_linkfile_cbk,
-			    subvol, subvol->fops->lookup,
-			    &local->loc, local->xattr_req);
+        if (!subvol) {
+            gf_msg_debug(this->name, 0,
+                         "linkfile has no link subvolume. path=%s", loc->path);
+            dht_lookup_everywhere(frame, this, loc);
+            return 0;
         }
 
-        return 0;
+        STACK_WIND_COOKIE(frame, dht_lookup_linkfile_cbk, subvol, subvol,
+                          subvol->fops->lookup, &local->loc, local->xattr_req);
+    }
+
+    return 0;
 
 out:
-	if (!local->hashed_subvol) {
-		gf_log (this->name, GF_LOG_ERROR,
-			"no subvolume in layout for path=%s",
-			local->loc.path);
-		op_errno = EINVAL;
-		goto err;
-	}
-		
-	STACK_WIND (frame, dht_lookup_cbk,
-		    local->hashed_subvol, local->hashed_subvol->fops->lookup,
-		    &local->loc, local->xattr_req);
-
-	return 0;
-
- err:
-        DHT_STACK_UNWIND (frame, op_ret, op_errno, inode, stbuf, xattr);
+    if (!local->hashed_subvol) {
+        gf_msg_debug(this->name, 0, "no subvolume in layout for path=%s",
+                     local->loc.path);
+        local->op_errno = ENOENT;
+        dht_lookup_everywhere(frame, this, loc);
         return 0;
+    }
+
+    STACK_WIND_COOKIE(frame, dht_lookup_cbk, local->hashed_subvol,
+                      local->hashed_subvol, local->hashed_subvol->fops->lookup,
+                      &local->loc, local->xattr_req);
+
+    return 0;
+
+err:
+    DHT_STACK_UNWIND(lookup, frame, op_ret, op_errno, inode, stbuf, xattr,
+                     postparent);
+    return 0;
 }
 
 int
-nufa_lookup (call_frame_t *frame, xlator_t *this,
-	     loc_t *loc, dict_t *xattr_req)
+nufa_lookup(call_frame_t *frame, xlator_t *this, loc_t *loc, dict_t *xattr_req)
 {
-        xlator_t     *hashed_subvol = NULL;
-        xlator_t     *cached_subvol = NULL;
-        xlator_t     *subvol = NULL;
-        dht_local_t  *local  = NULL;
-	dht_conf_t   *conf = NULL;
-        int           ret    = -1;
-        int           op_errno = -1;
-	dht_layout_t *layout = NULL;
-	int           i = 0;
-	int           call_cnt = 0;
-
-
-        VALIDATE_OR_GOTO (frame, err);
-        VALIDATE_OR_GOTO (this, err);
-        VALIDATE_OR_GOTO (loc, err);
-        VALIDATE_OR_GOTO (loc->inode, err);
-        VALIDATE_OR_GOTO (loc->path, err);
-
-	conf = this->private;
-
-        local = dht_local_init (frame);
-	if (!local) {
-		op_errno = ENOMEM;
-		gf_log (this->name, GF_LOG_ERROR,
-			"memory allocation failed :(");
-		goto err;
-	}
-
-        ret = loc_dup (loc, &local->loc);
-        if (ret == -1) {
-                op_errno = errno;
-                gf_log (this->name, GF_LOG_ERROR,
-                        "copying location failed for path=%s",
-                        loc->path);
-                goto err;
+    xlator_t *hashed_subvol = NULL;
+    xlator_t *subvol = NULL;
+    dht_local_t *local = NULL;
+    dht_conf_t *conf = NULL;
+    int ret = -1;
+    int op_errno = -1;
+    dht_layout_t *layout = NULL;
+    int i = 0;
+    int call_cnt = 0;
+
+    VALIDATE_OR_GOTO(frame, err);
+    VALIDATE_OR_GOTO(this, err);
+    VALIDATE_OR_GOTO(loc, err);
+    VALIDATE_OR_GOTO(loc->inode, err);
+    VALIDATE_OR_GOTO(loc->path, err);
+
+    conf = this->private;
+
+    local = dht_local_init(frame, loc, NULL, GF_FOP_LOOKUP);
+    if (!local) {
+        op_errno = ENOMEM;
+        goto err;
+    }
+
+    if (xattr_req) {
+        local->xattr_req = dict_ref(xattr_req);
+    } else {
+        local->xattr_req = dict_new();
+    }
+
+    hashed_subvol = dht_subvol_get_hashed(this, &local->loc);
+
+    local->hashed_subvol = hashed_subvol;
+
+    if (is_revalidate(loc)) {
+        layout = local->layout;
+        if (!layout) {
+            gf_msg_debug(this->name, 0,
+                         "revalidate lookup without cache. "
+                         "path=%s",
+                         loc->path);
+            op_errno = EINVAL;
+            goto err;
         }
 
-	if (xattr_req) {
-		local->xattr_req = dict_ref (xattr_req);
-	} else {
-		local->xattr_req = dict_new ();
-	}
-
-	hashed_subvol = dht_subvol_get_hashed (this, &local->loc);
-	cached_subvol = dht_subvol_get_cached (this, local->loc.inode);
-	
-	local->cached_subvol = cached_subvol;
-	local->hashed_subvol = hashed_subvol;
-
-        if (is_revalidate (loc)) {
-		layout = dht_layout_get (this, loc->inode);
-
-                if (!layout) {
-                        gf_log (this->name, GF_LOG_ERROR,
-                                "revalidate without cache. path=%s",
-                                loc->path);
-                        op_errno = EINVAL;
-                        goto err;
-                }
-
-		if (layout->gen && (layout->gen < conf->gen)) {
-			gf_log (this->name, GF_LOG_WARNING,
-				"incomplete layout failure for path=%s",
-				loc->path);
-			op_errno = EAGAIN;
-			goto err;
-		}
-
-		local->inode    = inode_ref (loc->inode);
-		local->st_ino   = loc->inode->ino;
-
-		local->call_cnt = layout->cnt;
-		call_cnt = local->call_cnt;
-		
-		/* NOTE: we don't require 'trusted.glusterfs.dht.linkto' attribute,
-		 *       revalidates directly go to the cached-subvolume.
-		 */
-		ret = dict_set_uint32 (local->xattr_req, 
-				       "trusted.glusterfs.dht", 4 * 4);
-
-		for (i = 0; i < layout->cnt; i++) {
-			subvol = layout->list[i].xlator;
-			
-			STACK_WIND (frame, dht_revalidate_cbk,
-				    subvol, subvol->fops->lookup,
-				    loc, local->xattr_req);
-
-			if (!--call_cnt)
-				break;
-		}
-	} else {
-		ret = dict_set_uint32 (local->xattr_req, 
-				       "trusted.glusterfs.dht", 4 * 4);
-
-		ret = dict_set_uint32 (local->xattr_req, 
-				       "trusted.glusterfs.dht.linkto", 256);
-
-		/* Send it to only local volume */
-		STACK_WIND (frame, nufa_local_lookup_cbk,
-			    conf->local_volume, 
-			    conf->local_volume->fops->lookup,
-			    loc, local->xattr_req);
-	}
+        if (layout->gen && (layout->gen < conf->gen)) {
+            gf_msg_debug(this->name, 0, "incomplete layout failure for path=%s",
+                         loc->path);
+            dht_layout_unref(this, local->layout); /* stale generation */
+            local->layout = NULL; /* released above; must not dangle in local */
+            goto do_fresh_lookup;
+        }
-        return 0;
+        local->inode = inode_ref(loc->inode);
 
-err:
-	op_errno = (op_errno == -1) ? errno : op_errno;
-        DHT_STACK_UNWIND (frame, -1, op_errno, NULL, NULL, NULL);
-	return 0;
-}
+        local->call_cnt = layout->cnt;
+        call_cnt = local->call_cnt;
 
-int
-nufa_create_linkfile_create_cbk (call_frame_t *frame, void *cookie, 
-				 xlator_t *this, int op_ret, int op_errno,
-				 inode_t *inode, struct stat *stbuf)
-{
- 	dht_local_t  *local = NULL;
- 	call_frame_t *prev = NULL;
-	dht_conf_t   *conf  = NULL;
-	
- 	local = frame->local;
- 	prev  = cookie;
- 	conf  = this->private;
-	
- 	if (op_ret == -1)
- 		goto err;
-	
- 	STACK_WIND (frame, dht_create_cbk,
- 		    conf->local_volume, conf->local_volume->fops->create,
- 		    &local->loc, local->flags, local->mode, local->fd);
-	
- 	return 0;
-	
- err:
- 	DHT_STACK_UNWIND (frame, -1, op_errno, NULL, NULL, NULL);	
- 	return 0;
-}
+        /* NOTE: we don't require 'trusted.glusterfs.dht.linkto' attribute,
+         *       revalidates directly go to the cached-subvolume.
+         */
+        ret = dict_set_uint32(local->xattr_req, conf->xattr_name, 4 * 4);
+        if (ret < 0) {
+            gf_msg(this->name, GF_LOG_ERROR, 0, DHT_MSG_DICT_SET_FAILED,
+                   "Failed to set dict value.");
+            op_errno = -1;
+            goto err;
+        }
 
-int
-nufa_create (call_frame_t *frame, xlator_t *this,
-	     loc_t *loc, int32_t flags, mode_t mode, fd_t *fd)
-{
- 	dht_local_t *local = NULL;
-	dht_conf_t  *conf  = NULL;
-	xlator_t    *subvol = NULL;
-	int          op_errno = -1;
-	int          ret = -1;
-
-	VALIDATE_OR_GOTO (frame, err);
-	VALIDATE_OR_GOTO (this, err);
-	VALIDATE_OR_GOTO (loc, err);
-
- 	conf  = this->private; 	
-
-        local = dht_local_init (frame);
-	if (!local) {
-		op_errno = ENOMEM;
-		gf_log (this->name, GF_LOG_ERROR,
-			"memory allocation failed :(");
-		goto err;
-	}
-
-	subvol = dht_subvol_get_hashed (this, loc);
-	if (!subvol) {
-		gf_log (this->name, GF_LOG_ERROR,
-			"no subvolume in layout for path=%s",
-			loc->path);
-		op_errno = ENOENT;
-		goto err;
-	}
-
- 	if (subvol != conf->local_volume) {
- 		/* create a link file instead of actual file */
- 		ret = loc_copy (&local->loc, loc);
- 		if (ret == -1) {
- 			gf_log (this->name, GF_LOG_ERROR,
- 				"memory allocation failed :(");
- 			op_errno = ENOMEM;
- 			goto err;
- 		}
- 
- 		local->fd = fd_ref (fd);
- 		local->mode = mode;
- 		local->flags = flags;
- 		
- 		dht_linkfile_create (frame, nufa_create_linkfile_create_cbk,
- 				      conf->local_volume, subvol, loc);
- 		return 0;
- 	}
-
-	gf_log (this->name, GF_LOG_DEBUG,
-		"creating %s on %s", loc->path, subvol->name);
-
-	STACK_WIND (frame, dht_create_cbk,
-		    subvol, subvol->fops->create,
-		    loc, flags, mode, fd);
-
-	return 0;
+        for (i = 0; i < layout->cnt; i++) {
+            subvol = layout->list[i].xlator;
 
-err:
-	op_errno = (op_errno == -1) ? errno : op_errno;
-	DHT_STACK_UNWIND (frame, -1, op_errno, NULL, NULL, NULL);
+            STACK_WIND_COOKIE(frame, dht_revalidate_cbk, subvol, subvol,
+                              subvol->fops->lookup, loc, local->xattr_req);
 
-	return 0;
-}
+            if (!--call_cnt)
+                break;
+        }
+    } else {
+    do_fresh_lookup:
+        ret = dict_set_uint32(local->xattr_req, conf->xattr_name, 4 * 4);
+        if (ret < 0) {
+            gf_msg(this->name, GF_LOG_ERROR, 0, DHT_MSG_DICT_SET_FAILED,
+                   "Failed to set dict value.");
+            op_errno = -1;
+            goto err;
+        }
 
-int
-nufa_mknod_linkfile_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
-			 int op_ret, int op_errno,
-			 inode_t *inode, struct stat *stbuf)
-{
- 	dht_local_t  *local = NULL;
- 	call_frame_t *prev = NULL;
-	dht_conf_t   *conf  = NULL;
-	
- 	local = frame->local;
- 	prev  = cookie;
- 	conf  = this->private;
- 	
- 	if (op_ret >= 0) {
- 		STACK_WIND (frame, dht_newfile_cbk,
- 			    conf->local_volume, 
- 			    conf->local_volume->fops->mknod,
- 			    &local->loc, local->mode, local->rdev);
-		
- 		return 0;
- 	}
-	
- 	DHT_STACK_UNWIND (frame, op_ret, op_errno, inode, stbuf);
- 	return 0;
-}
+        ret = dict_set_uint32(local->xattr_req, conf->link_xattr_name, 256);
+        if (ret < 0) {
+            gf_msg(this->name, GF_LOG_ERROR, 0, DHT_MSG_DICT_SET_FAILED,
+                   "Failed to set dict value.");
+            op_errno = -1;
+            goto err;
+        }
 
+        /* Send it only to the local volume, which is kept in conf->private */
+        STACK_WIND_COOKIE(
+            frame, nufa_local_lookup_cbk, ((xlator_t *)conf->private),
+            ((xlator_t *)conf->private),
+            ((xlator_t *)conf->private)->fops->lookup, loc, local->xattr_req);
+    }
 
-int
-nufa_mknod (call_frame_t *frame, xlator_t *this,
-	    loc_t *loc, mode_t mode, dev_t rdev)
-{
- 	dht_local_t *local = NULL;
-	dht_conf_t  *conf  = NULL;
-	xlator_t    *subvol = NULL;
-	int          op_errno = -1;
-	int          ret = -1;
-
-	VALIDATE_OR_GOTO (frame, err);
-	VALIDATE_OR_GOTO (this, err);
-	VALIDATE_OR_GOTO (loc, err);
-
- 	conf  = this->private; 	
-
-
-        local = dht_local_init (frame);
-	if (!local) {
-		op_errno = ENOMEM;
-		gf_log (this->name, GF_LOG_ERROR,
-			"memory allocation failed :(");
-		goto err;
-	}
-
-	subvol = dht_subvol_get_hashed (this, loc);
-	if (!subvol) {
-		gf_log (this->name, GF_LOG_ERROR,
-			"no subvolume in layout for path=%s",
-			loc->path);
-		op_errno = ENOENT;
-		goto err;
-	}
-
-
- 	if (conf->local_volume != subvol) {
- 		/* Create linkfile first */
- 		ret = loc_copy (&local->loc, loc);
- 		if (ret == -1) {
- 			gf_log (this->name, GF_LOG_ERROR,
- 				"memory allocation failed :(");
- 			op_errno = ENOMEM;
- 			goto err;
- 		}
- 
-		local->mode = mode;
- 		local->rdev = rdev;
- 		
- 		dht_linkfile_create (frame, nufa_mknod_linkfile_cbk,
- 				      conf->local_volume, subvol, loc);
- 		return 0;
- 	}
-
-	gf_log (this->name, GF_LOG_DEBUG,
-		"creating %s on %s", loc->path, subvol->name);
-
-	STACK_WIND (frame, dht_newfile_cbk,
-		    subvol, subvol->fops->mknod,
-		    loc, mode, rdev);
-
-	return 0;
+    return 0;
 
 err:
-	op_errno = (op_errno == -1) ? errno : op_errno;
-	DHT_STACK_UNWIND (frame, -1, op_errno, NULL, NULL);
-
-	return 0;
+    op_errno = (op_errno == -1) ? errno : op_errno;
+    DHT_STACK_UNWIND(lookup, frame, -1, op_errno, NULL, NULL, NULL, NULL);
+    return 0;
 }
 
-
 int
-notify (xlator_t *this, int event, void *data, ...)
+nufa_create_linkfile_create_cbk(call_frame_t *frame, void *cookie,
+                                xlator_t *this, int op_ret, int op_errno,
+                                inode_t *inode, struct iatt *stbuf,
+                                struct iatt *preparent, struct iatt *postparent,
+                                dict_t *xdata)
 {
-	int ret = -1;
+    dht_local_t *local = NULL;
 
-	ret = dht_notify (this, event, data);
+    local = frame->local;
 
-	return ret;
-}
+    if (op_ret == -1)
+        goto err;
 
-void
-fini (xlator_t *this)
-{
-        int         i = 0;
-        dht_conf_t *conf = NULL;
+    STACK_WIND_COOKIE(frame, dht_create_cbk, local->cached_subvol,
+                      local->cached_subvol, local->cached_subvol->fops->create,
+                      &local->loc, local->flags, local->mode, local->umask,
+                      local->fd, local->params);
+
+    return 0;
 
-	conf = this->private;
+err:
+    DHT_STACK_UNWIND(create, frame, -1, op_errno, NULL, NULL, NULL, NULL, NULL,
+                     NULL);
+    return 0;
+}
 
-        if (conf) {
-                if (conf->file_layouts) {
-                        for (i = 0; i < conf->subvolume_cnt; i++) {
-                                FREE (conf->file_layouts[i]);
-                        }
-                        FREE (conf->file_layouts);
-                }
+int
+nufa_create(call_frame_t *frame, xlator_t *this, loc_t *loc, int32_t flags,
+            mode_t mode, mode_t umask, fd_t *fd, dict_t *params)
+{
+    dht_local_t *local = NULL;
+    dht_conf_t *conf = NULL;
+    xlator_t *subvol = NULL;
+    xlator_t *avail_subvol = NULL;
+    int op_errno = -1;
+
+    VALIDATE_OR_GOTO(frame, err);
+    VALIDATE_OR_GOTO(this, err);
+    VALIDATE_OR_GOTO(loc, err);
+
+    conf = this->private;
+
+    dht_get_du_info(frame, this, loc);
+
+    local = dht_local_init(frame, loc, fd, GF_FOP_CREATE);
+    if (!local) {
+        op_errno = ENOMEM;
+        goto err;
+    }
+
+    subvol = dht_subvol_get_hashed(this, loc);
+    if (!subvol) {
+        gf_msg_debug(this->name, 0, "no subvolume in layout for path=%s",
+                     loc->path);
+        op_errno = ENOENT;
+        goto err;
+    }
+
+    avail_subvol = conf->private;
+    if (dht_is_subvol_filled(this, (xlator_t *)conf->private)) {
+        avail_subvol = dht_free_disk_available_subvol(
+            this, (xlator_t *)conf->private, local);
+    }
+
+    if (subvol != avail_subvol) {
+        /* create a link file instead of actual file */
+        local->params = dict_ref(params);
+        local->mode = mode;
+        local->flags = flags;
+        local->umask = umask;
+        local->cached_subvol = avail_subvol;
+        dht_linkfile_create(frame, nufa_create_linkfile_create_cbk, this,
+                            avail_subvol, subvol, loc);
+        return 0;
+    }
 
-                if (conf->default_dir_layout)
-                        FREE (conf->default_dir_layout);
+    gf_msg_trace(this->name, 0, "creating %s on %s", loc->path, subvol->name);
 
-                if (conf->subvolumes)
-                        FREE (conf->subvolumes);
+    STACK_WIND_COOKIE(frame, dht_create_cbk, subvol, subvol,
+                      subvol->fops->create, loc, flags, mode, umask, fd,
+                      params);
 
-		if (conf->subvolume_status)
-			FREE (conf->subvolume_status);
+    return 0;
 
-                FREE (conf);
-        }
+err:
+    op_errno = (op_errno == -1) ? errno : op_errno;
+    DHT_STACK_UNWIND(create, frame, -1, op_errno, NULL, NULL, NULL, NULL, NULL,
+                     NULL);
 
-	return;
+    return 0;
 }
 
 int
-init (xlator_t *this)
+nufa_mknod_linkfile_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+                        int op_ret, int op_errno, inode_t *inode,
+                        struct iatt *stbuf, struct iatt *preparent,
+                        struct iatt *postparent, dict_t *xdata)
 {
-        dht_conf_t    *conf = NULL;
-	xlator_list_t *trav = NULL;
-	data_t        *data = NULL;
-	char          *local_volname = NULL;
-	char          *lookup_unhashed_str = NULL;
-        int            ret = -1;
-        int            i = 0;
-	char           my_hostname[256];
-
-	if (!this->children) {
-		gf_log (this->name, GF_LOG_ERROR,
-			"DHT needs more than one child defined");
-		return -1;
-	}
-  
-	if (!this->parents) {
-		gf_log (this->name, GF_LOG_WARNING,
-			"dangling volume. check volfile ");
-	}
-
-        conf = CALLOC (1, sizeof (*conf));
-        if (!conf) {
-                gf_log (this->name, GF_LOG_ERROR,
-                        "memory allocation failed :(");
-                goto err;
-        }
+    dht_local_t *local = NULL;
 
-	conf->search_unhashed = 0;
+    local = frame->local;
+    if (!local || !local->cached_subvol) {
+        op_errno = EINVAL;
+        op_ret = -1;
+        goto err;
+    }
 
-	if (dict_get_str (this->options, "lookup-unhashed",
-			  &lookup_unhashed_str) == 0) {
-		gf_string2boolean (lookup_unhashed_str,
-				   &conf->search_unhashed);
-	}
+    if (op_ret >= 0) {
+        STACK_WIND_COOKIE(
+            frame, dht_newfile_cbk, (void *)local->cached_subvol,
+            local->cached_subvol, local->cached_subvol->fops->mknod,
+            &local->loc, local->mode, local->rdev, local->umask, local->params);
 
-        ret = dht_init_subvolumes (this, conf);
-        if (ret == -1) {
-                goto err;
-        }
-
-        ret = dht_layouts_init (this, conf);
-        if (ret == -1) {
-                goto err;
-        }
+        return 0;
+    }
+err:
+    WIPE(postparent);
+    WIPE(preparent);
 
-	LOCK_INIT (&conf->subvolume_lock);
-
-	conf->gen = 1;
-
-	local_volname = "localhost";
-	ret = gethostname (my_hostname, 256);
-	if (ret < 0) {
-		gf_log (this->name, GF_LOG_WARNING,
-			"could not find hostname (%s)",
-			strerror (errno));
-	}
-
-	if (ret == 0)
-		local_volname = my_hostname;
-
-	data = dict_get (this->options, "local-volume-name");
-	if (data) {
-		local_volname = data->data;
-	}
-
-	trav = this->children;
-	while (trav) {
-		if (strcmp (trav->xlator->name, local_volname) == 0)
-			break;
-		trav = trav->next;
-	}
-
-	if (!trav) {
-		gf_log (this->name, GF_LOG_ERROR, 
-			"Could not find subvolume named '%s'. "
-			"Please define volume with the name as the hostname "
-			"or override it with 'option local-volume-name'",
-			local_volname);
-		goto err;
-	}
-	/* The volume specified exists */
-	conf->local_volume = trav->xlator;
-
-        this->private = conf;
+    DHT_STACK_UNWIND(link, frame, op_ret, op_errno, inode, stbuf, preparent,
+                     postparent, xdata);
+    return 0;
+}
 
+int
+nufa_mknod(call_frame_t *frame, xlator_t *this, loc_t *loc, mode_t mode,
+           dev_t rdev, mode_t umask, dict_t *params)
+{
+    dht_local_t *local = NULL;
+    dht_conf_t *conf = NULL;
+    xlator_t *subvol = NULL;
+    xlator_t *avail_subvol = NULL;
+    int op_errno = -1;
+
+    VALIDATE_OR_GOTO(frame, err);
+    VALIDATE_OR_GOTO(this, err);
+    VALIDATE_OR_GOTO(loc, err);
+
+    conf = this->private;
+
+    dht_get_du_info(frame, this, loc);
+
+    local = dht_local_init(frame, loc, NULL, GF_FOP_MKNOD);
+    if (!local) {
+        op_errno = ENOMEM;
+        goto err;
+    }
+
+    subvol = dht_subvol_get_hashed(this, loc);
+    if (!subvol) {
+        gf_msg_debug(this->name, 0, "no subvolume in layout for path=%s",
+                     loc->path);
+        op_errno = ENOENT;
+        goto err;
+    }
+
+    /* Take the disk usage of the local subvol into consideration */
+    avail_subvol = conf->private;
+    if (dht_is_subvol_filled(this, (xlator_t *)conf->private)) {
+        avail_subvol = dht_free_disk_available_subvol(
+            this, (xlator_t *)conf->private, local);
+    }
+
+    if (avail_subvol != subvol) {
+        /* Create linkfile first */
+
+        local->params = dict_ref(params);
+        local->mode = mode;
+        local->umask = umask;
+        local->rdev = rdev;
+        local->cached_subvol = avail_subvol;
+
+        dht_linkfile_create(frame, nufa_mknod_linkfile_cbk, this, avail_subvol,
+                            subvol, loc);
         return 0;
+    }
 
-err:
-        if (conf) {
-                if (conf->file_layouts) {
-                        for (i = 0; i < conf->subvolume_cnt; i++) {
-                                FREE (conf->file_layouts[i]);
-                        }
-                        FREE (conf->file_layouts);
-                }
+    gf_msg_trace(this->name, 0, "creating %s on %s", loc->path, subvol->name);
 
-                if (conf->default_dir_layout)
-                        FREE (conf->default_dir_layout);
+    STACK_WIND_COOKIE(frame, dht_newfile_cbk, (void *)subvol, subvol,
+                      subvol->fops->mknod, loc, mode, rdev, umask, params);
 
-                if (conf->subvolumes)
-                        FREE (conf->subvolumes);
+    return 0;
 
-		if (conf->subvolume_status)
-			FREE (conf->subvolume_status);
+err:
+    op_errno = (op_errno == -1) ? errno : op_errno;
+    DHT_STACK_UNWIND(mknod, frame, -1, op_errno, NULL, NULL, NULL, NULL, NULL);
 
-                FREE (conf);
+    return 0;
+}
+
+gf_boolean_t
+same_first_part(char *str1, char term1, char *str2, char term2)
+{
+    gf_boolean_t ended1;
+    gf_boolean_t ended2;
+
+    for (;;) {
+        ended1 = ((*str1 == '\0') || (*str1 == term1));
+        ended2 = ((*str2 == '\0') || (*str2 == term2));
+        if (ended1 && ended2) {
+            return _gf_true;
         }
+        if (ended1 || ended2 || (*str1 != *str2)) {
+            return _gf_false;
+        }
+        ++str1;
+        ++str2;
+    }
+}
 
-        return -1;
+typedef struct nufa_args {
+    xlator_t *this;
+    char *volname;
+    gf_boolean_t addr_match;
+} nufa_args_t;
+
+static void
+nufa_find_local_brick(xlator_t *xl, void *data)
+{
+    nufa_args_t *args = data;
+    xlator_t *this = args->this;
+    char *local_volname = args->volname;
+    gf_boolean_t addr_match = args->addr_match;
+    char *brick_host = NULL;
+    dht_conf_t *conf = this->private;
+    int ret = -1;
+
+    /* A local subvol was already found: only the first brick that is local
+     * is picked. */
+    if (conf->private)
+        return;
+
+    if (strcmp(xl->name, local_volname) == 0) {
+        conf->private = xl;
+        gf_msg(this->name, GF_LOG_INFO, 0, DHT_MSG_SUBVOL_INFO,
+               "Using specified subvol %s", local_volname);
+        return;
+    }
+
+    if (!addr_match)
+        return;
+
+    ret = dict_get_str(xl->options, "remote-host", &brick_host);
+    if ((ret == 0) && (gf_is_same_address(local_volname, brick_host) ||
+                       gf_is_local_addr(brick_host))) {
+        conf->private = xl;
+        gf_msg(this->name, GF_LOG_INFO, 0, DHT_MSG_SUBVOL_INFO,
+               "Using the first local "
+               "subvol %s",
+               xl->name);
+        return;
+    }
 }
 
+static void
+nufa_to_dht(xlator_t *this)
+{
+    GF_ASSERT(this);
+    GF_ASSERT(this->fops);
 
-struct xlator_fops fops = {
-	.lookup      = nufa_lookup,
-	.create      = nufa_create,
-	.mknod       = nufa_mknod,
-
-	.stat        = dht_stat,
-	.chmod       = dht_chmod,
-	.chown       = dht_chown,
-	.fchown      = dht_fchown,
-	.fchmod      = dht_fchmod,
-	.fstat       = dht_fstat,
-	.utimens     = dht_utimens,
-	.truncate    = dht_truncate,
-	.ftruncate   = dht_ftruncate,
-	.access      = dht_access,
-	.readlink    = dht_readlink,
-	.setxattr    = dht_setxattr,
-	.getxattr    = dht_getxattr,
-	.removexattr = dht_removexattr,
-	.open        = dht_open,
-	.readv       = dht_readv,
-	.writev      = dht_writev,
-	.flush       = dht_flush,
-	.fsync       = dht_fsync,
-	.statfs      = dht_statfs,
-	.lk          = dht_lk,
-	.opendir     = dht_opendir,
-	.readdir     = dht_readdir,
-	.fsyncdir    = dht_fsyncdir,
-	.symlink     = dht_symlink,
-	.unlink      = dht_unlink,
-	.link        = dht_link,
-	.mkdir       = dht_mkdir,
-	.rmdir       = dht_rmdir,
-	.rename      = dht_rename,
-	.inodelk     = dht_inodelk,
-	.finodelk    = dht_finodelk,
-	.entrylk     = dht_entrylk,
-	.fentrylk    = dht_fentrylk,
-	.xattrop     = dht_xattrop,
-	.fxattrop    = dht_fxattrop,
-#if 0
-	.setdents    = dht_setdents,
-	.getdents    = dht_getdents,
-	.checksum    = dht_checksum,
-#endif
-};
+    this->fops->lookup = dht_lookup;
+    this->fops->create = dht_create;
+    this->fops->mknod = dht_mknod;
+}
 
+int
+nufa_find_local_subvol(xlator_t *this, void (*fn)(xlator_t *each, void *data),
+                       void *data)
+{
+    int ret = -1;
+    dht_conf_t *conf = this->private;
+    xlator_list_t *trav = NULL;
+    xlator_t *parent = NULL;
+    xlator_t *candidate = NULL;
+
+    xlator_foreach_depth_first(this, fn, data);
+    if (!conf->private) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, DHT_MSG_BRICK_ERROR,
+               "Couldn't find a local "
+               "brick");
+        return -1;
+    }
+
+    candidate = conf->private;
+    trav = candidate->parents;
+    while (trav) {
+        parent = trav->xlator;
+        if (strcmp(parent->type, "cluster/nufa") == 0) {
+            gf_msg(this->name, GF_LOG_INFO, 0, DHT_MSG_SUBVOL_INFO,
+                   "Found local subvol, "
+                   "%s",
+                   candidate->name);
+            ret = 0;
+            conf->private = candidate;
+            break;
+        }
 
-struct xlator_mops mops = {
-};
+        candidate = parent;
+        trav = parent->parents;
+    }
+
+    return ret;
+}
 
+int
+nufa_init(xlator_t *this)
+{
+    data_t *data = NULL;
+    char *local_volname = NULL;
+    int ret = -1;
+    char my_hostname[256];
+    gf_boolean_t addr_match = _gf_false;
+    nufa_args_t args = {
+        0,
+    };
+
+    ret = dht_init(this);
+    if (ret) {
+        return ret;
+    }
+
+    if ((data = dict_get(this->options, "local-volume-name"))) {
+        local_volname = data->data;
+
+    } else {
+        addr_match = _gf_true;
+        local_volname = "localhost";
+        ret = gethostname(my_hostname, 256);
+        if (ret == 0)
+            local_volname = my_hostname;
+
+        else
+            gf_msg(this->name, GF_LOG_WARNING, errno,
+                   DHT_MSG_GET_HOSTNAME_FAILED, "could not find hostname");
+    }
+
+    args.this = this;
+    args.volname = local_volname;
+    args.addr_match = addr_match;
+    ret = nufa_find_local_subvol(this, nufa_find_local_brick, &args);
+    if (ret) {
+        gf_msg(this->name, GF_LOG_INFO, 0, DHT_MSG_SUBVOL_INFO,
+               "Unable to find local subvolume, switching "
+               "to dht mode");
+        nufa_to_dht(this);
+    }
+    return 0;
+}
 
-struct xlator_cbks cbks = {
-//	.release    = dht_release,
-//      .releasedir = dht_releasedir,
-	.forget     = dht_forget
+dht_methods_t dht_methods = {
+    .migration_get_dst_subvol = dht_migration_get_dst_subvol,
+    .layout_search = dht_layout_search,
 };
 
+struct xlator_fops fops = {
+    .lookup = nufa_lookup,
+    .create = nufa_create,
+    .mknod = nufa_mknod,
+
+    .stat = dht_stat,
+    .fstat = dht_fstat,
+    .truncate = dht_truncate,
+    .ftruncate = dht_ftruncate,
+    .access = dht_access,
+    .readlink = dht_readlink,
+    .setxattr = dht_setxattr,
+    .getxattr = dht_getxattr,
+    .removexattr = dht_removexattr,
+    .open = dht_open,
+    .readv = dht_readv,
+    .writev = dht_writev,
+    .flush = dht_flush,
+    .fsync = dht_fsync,
+    .statfs = dht_statfs,
+    .lk = dht_lk,
+    .opendir = dht_opendir,
+    .readdir = dht_readdir,
+    .readdirp = dht_readdirp,
+    .fsyncdir = dht_fsyncdir,
+    .symlink = dht_symlink,
+    .unlink = dht_unlink,
+    .link = dht_link,
+    .mkdir = dht_mkdir,
+    .rmdir = dht_rmdir,
+    .rename = dht_rename,
+    .inodelk = dht_inodelk,
+    .finodelk = dht_finodelk,
+    .entrylk = dht_entrylk,
+    .fentrylk = dht_fentrylk,
+    .xattrop = dht_xattrop,
+    .fxattrop = dht_fxattrop,
+    .setattr = dht_setattr,
+};
 
-struct volume_options options[] = {
-	{ .key  = {"local-volume-name"}, 
-	  .type = GF_OPTION_TYPE_XLATOR 
-	},
-        { .key  = {"lookup-unhashed"}, 
-	  .type = GF_OPTION_TYPE_BOOL 
-	},
-	{ .key  = {NULL} },
+struct xlator_cbks cbks = {.forget = dht_forget};
+extern int32_t
+mem_acct_init(xlator_t *this);
+
+xlator_api_t xlator_api = {
+    .init = nufa_init,
+    .fini = dht_fini,
+    .notify = dht_notify,
+    .reconfigure = dht_reconfigure,
+    .mem_acct_init = mem_acct_init,
+    .op_version = {1}, /* Present from the initial version */
+    .fops = &fops,
+    .cbks = &cbks,
+    .options = dht_options,
+    .identifier = "nufa",
+    .category = GF_TECH_PREVIEW,
 };
diff --git a/xlators/cluster/dht/src/switch.c b/xlators/cluster/dht/src/switch.c
new file mode 100644
index 00000000000..207d109a025
--- /dev/null
+++ b/xlators/cluster/dht/src/switch.c
@@ -0,0 +1,891 @@
+/*
+  Copyright (c) 2008-2012 Red Hat, Inc. <http://www.redhat.com>
+  This file is part of GlusterFS.
+
+  This file is licensed to you under your choice of the GNU Lesser
+  General Public License, version 3 or any later version (LGPLv3 or
+  later), or the GNU General Public License, version 2 (GPLv2), in all
+  cases as published by the Free Software Foundation.
+*/
+
+#include "dht-common.h"
+#include "dht-mem-types.h"
+
+#include <sys/time.h>
+#include <stdlib.h>
+#include <fnmatch.h>
+#include <string.h>
+
+extern struct volume_options dht_options[];
+
+struct switch_sched_array {
+    xlator_t *xl;
+    int32_t eligible;
+    int32_t considered;
+};
+
+/* Select one of this struct based on the path's pattern match */
+struct switch_struct {
+    struct switch_struct *next;
+    struct switch_sched_array *array;
+    int32_t node_index; /* Index of the node in
+                           this pattern. */
+    int32_t num_child;  /* Total num of child nodes
+                           with this pattern. */
+    char path_pattern[256];
+};
+
+/* TODO: all the 'TODO's in dht.c hold good here as well */
+/* This function should return child node as '*:subvolumes' is inserted */
+
+static int32_t
+gf_switch_valid_child(xlator_t *this, const char *child)
+{
+    xlator_list_t *children = NULL;
+    int32_t ret = 0;
+
+    children = this->children;
+    while (children) {
+        if (!strcmp(child, children->xlator->name)) {
+            ret = 1;
+            break;
+        }
+        children = children->next;
+    }
+
+    return ret;
+}
+
+static xlator_t *
+get_switch_matching_subvol(const char *path, dht_conf_t *conf,
+                           xlator_t *hashed_subvol)
+{
+    struct switch_struct *cond = NULL;
+    struct switch_struct *trav = NULL;
+    char *pathname = NULL;
+    int idx = 0;
+    xlator_t *subvol = NULL;
+
+    cond = conf->private;
+    subvol = hashed_subvol;
+    if (!cond)
+        goto out;
+
+    pathname = gf_strdup(path);
+    if (!pathname)
+        goto out;
+
+    trav = cond;
+    while (trav) {
+        if (fnmatch(trav->path_pattern, pathname, FNM_NOESCAPE) == 0) {
+            for (idx = 0; idx < trav->num_child; idx++) {
+                if (trav->array[idx].xl == hashed_subvol)
+                    goto out;
+            }
+            idx = trav->node_index++;
+            trav->node_index %= trav->num_child;
+            subvol = trav->array[idx].xl;
+            goto out;
+        }
+        trav = trav->next;
+    }
+out:
+    GF_FREE(pathname);
+
+    return subvol;
+}
+
+int
+switch_local_lookup_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+                        int op_ret, int op_errno, inode_t *inode,
+                        struct iatt *stbuf, dict_t *xattr,
+                        struct iatt *postparent)
+{
+    xlator_t *subvol = NULL;
+    char is_linkfile = 0;
+    char is_dir = 0;
+    dht_conf_t *conf = NULL;
+    dht_local_t *local = NULL;
+    loc_t *loc = NULL;
+    int i = 0;
+    xlator_t *prev = NULL;
+    int call_cnt = 0;
+    int ret = 0;
+
+    conf = this->private;
+
+    prev = cookie;
+    local = frame->local;
+    loc = &local->loc;
+
+    if (ENTRY_MISSING(op_ret, op_errno)) {
+        if (conf->search_unhashed) {
+            local->op_errno = ENOENT;
+            dht_lookup_everywhere(frame, this, loc);
+            return 0;
+        }
+    }
+
+    if (op_ret == -1)
+        goto out;
+
+    is_linkfile = check_is_linkfile(inode, stbuf, xattr, conf->link_xattr_name);
+    is_dir = check_is_dir(inode, stbuf, xattr);
+
+    if (!is_dir && !is_linkfile) {
+        /* non-directory and not a linkfile */
+
+        ret = dht_layout_preset(this, prev, inode);
+        if (ret < 0) {
+            gf_msg_debug(this->name, 0,
+                         "could not set pre-set layout "
+                         "for subvol %s",
+                         prev->name);
+            op_ret = -1;
+            op_errno = EINVAL;
+            goto err;
+        }
+
+        goto out;
+    }
+
+    if (is_dir) {
+        call_cnt = conf->subvolume_cnt;
+        local->call_cnt = call_cnt;
+
+        local->inode = inode_ref(inode);
+        local->xattr = dict_ref(xattr);
+
+        local->op_ret = 0;
+        local->op_errno = 0;
+
+        local->layout = dht_layout_new(this, conf->subvolume_cnt);
+        if (!local->layout) {
+            op_ret = -1;
+            op_errno = ENOMEM;
+            gf_msg_debug(this->name, 0, "memory allocation failed :(");
+            goto err;
+        }
+
+        for (i = 0; i < call_cnt; i++) {
+            STACK_WIND_COOKIE(frame, dht_lookup_dir_cbk, conf->subvolumes[i],
+                              conf->subvolumes[i],
+                              conf->subvolumes[i]->fops->lookup, &local->loc,
+                              local->xattr_req);
+        }
+    }
+
+    if (is_linkfile) {
+        subvol = dht_linkfile_subvol(this, inode, stbuf, xattr);
+
+        if (!subvol) {
+            gf_msg_debug(this->name, 0,
+                         "linkfile has no link subvolume.path=%s", loc->path);
+            dht_lookup_everywhere(frame, this, loc);
+            return 0;
+        }
+
+        STACK_WIND_COOKIE(frame, dht_lookup_linkfile_cbk, subvol, subvol,
+                          subvol->fops->lookup, &local->loc, local->xattr_req);
+    }
+
+    return 0;
+
+out:
+    if (!local->hashed_subvol) {
+        gf_msg_debug(this->name, 0, "no subvolume in layout for path=%s",
+                     local->loc.path);
+        local->op_errno = ENOENT;
+        dht_lookup_everywhere(frame, this, loc);
+        return 0;
+    }
+
+    STACK_WIND_COOKIE(frame, dht_lookup_cbk, local->hashed_subvol,
+                      local->hashed_subvol, local->hashed_subvol->fops->lookup,
+                      &local->loc, local->xattr_req);
+
+    return 0;
+
+err:
+    DHT_STACK_UNWIND(lookup, frame, op_ret, op_errno, inode, stbuf, xattr,
+                     NULL);
+    return 0;
+}
+
+int
+switch_lookup(call_frame_t *frame, xlator_t *this, loc_t *loc,
+              dict_t *xattr_req)
+{
+    xlator_t *hashed_subvol = NULL;
+    xlator_t *cached_subvol = NULL;
+    xlator_t *subvol = NULL;
+    dht_local_t *local = NULL;
+    dht_conf_t *conf = NULL;
+    int ret = -1;
+    int op_errno = -1;
+    dht_layout_t *layout = NULL;
+    int i = 0;
+    int call_cnt = 0;
+
+    VALIDATE_OR_GOTO(frame, err);
+    VALIDATE_OR_GOTO(this, err);
+    VALIDATE_OR_GOTO(loc, err);
+    VALIDATE_OR_GOTO(loc->inode, err);
+    VALIDATE_OR_GOTO(loc->path, err);
+
+    conf = this->private;
+
+    local = dht_local_init(frame, loc, NULL, GF_FOP_LOOKUP);
+    if (!local) {
+        op_errno = ENOMEM;
+        goto err;
+    }
+
+    if (xattr_req) {
+        local->xattr_req = dict_ref(xattr_req);
+    } else {
+        local->xattr_req = dict_new();
+    }
+
+    hashed_subvol = dht_subvol_get_hashed(this, &local->loc);
+    cached_subvol = local->cached_subvol;
+
+    local->hashed_subvol = hashed_subvol;
+
+    if (is_revalidate(loc)) {
+        layout = local->layout;
+        if (!layout) {
+            gf_msg_debug(this->name, 0,
+                         "revalidate lookup without cache. path=%s", loc->path);
+            op_errno = EINVAL;
+            goto err;
+        }
+
+        if (layout->gen && (layout->gen < conf->gen)) {
+            gf_msg_debug(this->name, 0, "incomplete layout failure for path=%s",
+                         loc->path);
+            dht_layout_unref(this, local->layout);
+            goto do_fresh_lookup;
+        }
+
+        local->inode = inode_ref(loc->inode);
+
+        local->call_cnt = layout->cnt;
+        call_cnt = local->call_cnt;
+
+        /* NOTE: we don't require 'trusted.glusterfs.dht.linkto'
+         * attribute, revalidates directly go to the cached-subvolume.
+         */
+        ret = dict_set_uint32(local->xattr_req, conf->xattr_name, 4 * 4);
+        if (ret < 0)
+            gf_msg(this->name, GF_LOG_WARNING, 0, DHT_MSG_DICT_SET_FAILED,
+                   "failed to set dict value for %s", conf->xattr_name);
+
+        for (i = 0; i < layout->cnt; i++) {
+            subvol = layout->list[i].xlator;
+
+            STACK_WIND_COOKIE(frame, dht_revalidate_cbk, subvol, subvol,
+                              subvol->fops->lookup, loc, local->xattr_req);
+
+            if (!--call_cnt)
+                break;
+        }
+    } else {
+    do_fresh_lookup:
+        ret = dict_set_uint32(local->xattr_req, conf->xattr_name, 4 * 4);
+        if (ret < 0)
+            gf_msg(this->name, GF_LOG_WARNING, 0, DHT_MSG_DICT_SET_FAILED,
+                   "failed to set dict value for %s", conf->xattr_name);
+
+        ret = dict_set_uint32(local->xattr_req, conf->link_xattr_name, 256);
+        if (ret < 0)
+            gf_msg(this->name, GF_LOG_WARNING, EINVAL, DHT_MSG_DICT_SET_FAILED,
+                   "failed to set dict value for %s", conf->link_xattr_name);
+
+        if (!hashed_subvol) {
+            gf_msg_debug(this->name, 0,
+                         "no subvolume in layout for path=%s, "
+                         "checking on all the subvols to see if "
+                         "it is a directory",
+                         loc->path);
+            call_cnt = conf->subvolume_cnt;
+            local->call_cnt = call_cnt;
+
+            local->layout = dht_layout_new(this, conf->subvolume_cnt);
+            if (!local->layout) {
+                op_errno = ENOMEM;
+                goto err;
+            }
+
+            for (i = 0; i < call_cnt; i++) {
+                STACK_WIND_COOKIE(frame, dht_lookup_dir_cbk,
+                                  conf->subvolumes[i], conf->subvolumes[i],
+                                  conf->subvolumes[i]->fops->lookup,
+                                  &local->loc, local->xattr_req);
+            }
+            return 0;
+        }
+
+        /* a matching path pattern may redirect the lookup away from hashed */
+        cached_subvol = get_switch_matching_subvol(loc->path, conf,
+                                                   hashed_subvol);
+        if (cached_subvol == hashed_subvol) {
+            STACK_WIND_COOKIE(frame, dht_lookup_cbk, hashed_subvol,
+                              hashed_subvol, hashed_subvol->fops->lookup, loc,
+                              local->xattr_req);
+        } else {
+            STACK_WIND_COOKIE(frame, switch_local_lookup_cbk, cached_subvol,
+                              cached_subvol, cached_subvol->fops->lookup, loc,
+                              local->xattr_req);
+        }
+    }
+
+    return 0;
+
+err:
+    op_errno = (op_errno == -1) ? errno : op_errno;
+    DHT_STACK_UNWIND(lookup, frame, -1, op_errno, NULL, NULL, NULL, NULL);
+    return 0;
+}
+
+int
+switch_create_linkfile_create_cbk(call_frame_t *frame, void *cookie,
+                                  xlator_t *this, int op_ret, int op_errno,
+                                  inode_t *inode, struct iatt *stbuf,
+                                  struct iatt *preparent,
+                                  struct iatt *postparent, dict_t *xdata)
+{
+    dht_local_t *local = NULL;
+
+    local = frame->local;
+
+    if (op_ret == -1)
+        goto err;
+
+    STACK_WIND_COOKIE(frame, dht_create_cbk, local->cached_subvol,
+                      local->cached_subvol, local->cached_subvol->fops->create,
+                      &local->loc, local->flags, local->mode, local->umask,
+                      local->fd, local->params);
+
+    return 0;
+
+err:
+    DHT_STACK_UNWIND(create, frame, -1, op_errno, NULL, NULL, NULL, NULL, NULL,
+                     NULL);
+    return 0;
+}
+
+/* create fop: the file is created on the pattern-matched subvol (or, if that
+ * is filled, on dht_free_disk_available_subvol()'s pick); a linkfile is made
+ * first when that target is not the hashed subvol. */
+int
+switch_create(call_frame_t *frame, xlator_t *this, loc_t *loc, int32_t flags,
+              mode_t mode, mode_t umask, fd_t *fd, dict_t *params)
+{
+    dht_local_t *local = NULL;
+    dht_conf_t *conf = NULL;
+    xlator_t *subvol = NULL;
+    xlator_t *avail_subvol = NULL;
+    int op_errno = -1;
+
+    VALIDATE_OR_GOTO(frame, err);
+    VALIDATE_OR_GOTO(this, err);
+    VALIDATE_OR_GOTO(loc, err);
+    conf = this->private;
+    dht_get_du_info(frame, this, loc);
+
+    local = dht_local_init(frame, loc, fd, GF_FOP_CREATE);
+    if (!local) {
+        op_errno = ENOMEM;
+        goto err;
+    }
+
+    subvol = dht_subvol_get_hashed(this, loc);
+    if (!subvol) {
+        gf_msg_debug(this->name, 0, "no subvolume in layout for path=%s",
+                     loc->path);
+        op_errno = ENOENT;
+        goto err;
+    }
+
+    avail_subvol = get_switch_matching_subvol(loc->path, conf, subvol);
+    if (dht_is_subvol_filled(this, avail_subvol)) {
+        avail_subvol = dht_free_disk_available_subvol(this, avail_subvol,
+                                                      local);
+    }
+
+    if (subvol != avail_subvol) {
+        /* create a link file instead of actual file; the linkfile callback
+         * winds the real create from local, so stash the arguments there */
+        local->params = dict_ref(params);
+        local->mode = mode;
+        local->flags = flags;
+        local->umask = umask;
+        local->cached_subvol = avail_subvol;
+        dht_linkfile_create(frame, switch_create_linkfile_create_cbk, this,
+                            avail_subvol, subvol, loc);
+        return 0;
+    }
+
+    gf_msg_trace(this->name, 0, "creating %s on %s", loc->path, subvol->name);
+    STACK_WIND_COOKIE(frame, dht_create_cbk, subvol, subvol,
+                      subvol->fops->create, loc, flags, mode, umask, fd,
+                      params);
+    return 0;
+
+err:
+    op_errno = (op_errno == -1) ? errno : op_errno;
+    DHT_STACK_UNWIND(create, frame, -1, op_errno, NULL, NULL, NULL, NULL, NULL,
+                     NULL);
+    return 0;
+}
+
+/* Linkfile-create callback for switch_mknod(): on success wind the real mknod
+ * to the cached subvol saved in local, otherwise unwind the error. */
+int
+switch_mknod_linkfile_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+                          int op_ret, int op_errno, inode_t *inode,
+                          struct iatt *stbuf, struct iatt *preparent,
+                          struct iatt *postparent, dict_t *xdata)
+{
+    dht_local_t *local = frame->local;
+
+    if (!local || !local->cached_subvol) {
+        op_errno = EINVAL;
+        op_ret = -1;
+        goto err;
+    }
+    if (op_ret >= 0) {
+        STACK_WIND_COOKIE(
+            frame, dht_newfile_cbk, (void *)local->cached_subvol,
+            local->cached_subvol, local->cached_subvol->fops->mknod,
+            &local->loc, local->mode, local->rdev, local->umask, local->params);
+        return 0;
+    }
+err:
+    /* the frame is serving a mknod, so reply with the mknod unwind */
+    DHT_STACK_UNWIND(mknod, frame, op_ret, op_errno, inode, stbuf, preparent,
+                     postparent, xdata);
+    return 0;
+}
+
+/*
+ * mknod entry point of the switch translator.
+ *
+ * The hashed subvolume of 'loc' is looked up first. The subvolume that will
+ * actually hold the node is then chosen by get_switch_matching_subvol(), and
+ * replaced through dht_free_disk_available_subvol() when
+ * dht_is_subvol_filled() reports it as filled. If that destination is the
+ * hashed subvolume, mknod is wound there directly; otherwise a linkfile is
+ * created first and switch_mknod_linkfile_cbk() continues the operation.
+ *
+ * Always returns 0; failures are unwound with op_ret = -1.
+ */
+int
+switch_mknod(call_frame_t *frame, xlator_t *this, loc_t *loc, mode_t mode,
+             dev_t rdev, mode_t umask, dict_t *params)
+{
+    int op_errno = -1;
+    dht_conf_t *conf = NULL;
+    dht_local_t *local = NULL;
+    xlator_t *subvol = NULL;
+    xlator_t *dest_subvol = NULL;
+
+    VALIDATE_OR_GOTO(frame, err);
+    VALIDATE_OR_GOTO(this, err);
+    VALIDATE_OR_GOTO(loc, err);
+
+    conf = this->private;
+
+    dht_get_du_info(frame, this, loc);
+
+    local = dht_local_init(frame, loc, NULL, GF_FOP_MKNOD);
+    if (local == NULL) {
+        op_errno = ENOMEM;
+        goto err;
+    }
+
+    subvol = dht_subvol_get_hashed(this, loc);
+    if (subvol == NULL) {
+        gf_msg_debug(this->name, 0, "no subvolume in layout for path=%s",
+                     loc->path);
+        op_errno = ENOENT;
+        goto err;
+    }
+
+    /* Pick the destination by pattern, then take disk usage into account */
+    dest_subvol = get_switch_matching_subvol(loc->path, conf, subvol);
+    if (dht_is_subvol_filled(this, dest_subvol)) {
+        dest_subvol = dht_free_disk_available_subvol(this, dest_subvol, local);
+    }
+
+    if (dest_subvol == subvol) {
+        /* Destination is the hashed subvolume: no linkfile needed */
+        gf_msg_trace(this->name, 0, "creating %s on %s", loc->path,
+                     subvol->name);
+
+        STACK_WIND_COOKIE(frame, dht_newfile_cbk, (void *)subvol, subvol,
+                          subvol->fops->mknod, loc, mode, rdev, umask, params);
+
+        return 0;
+    }
+
+    /* Remember the mknod arguments for the linkfile callback, then create
+     * the linkfile first. */
+    local->cached_subvol = dest_subvol;
+    local->rdev = rdev;
+    local->umask = umask;
+    local->mode = mode;
+    local->params = dict_ref(params);
+
+    dht_linkfile_create(frame, switch_mknod_linkfile_cbk, this, dest_subvol,
+                        subvol, loc);
+    return 0;
+
+err:
+    if (op_errno == -1)
+        op_errno = errno;
+    DHT_STACK_UNWIND(mknod, frame, -1, op_errno, NULL, NULL, NULL, NULL, NULL);
+
+    return 0;
+}
+
+/*
+ * Translator teardown: release the pattern list hanging off conf->private
+ * (each node together with its child array), then run the common dht_fini().
+ */
+void
+switch_fini(xlator_t *this)
+{
+    dht_conf_t *conf = this->private;
+    struct switch_struct *node = NULL;
+    struct switch_struct *next = NULL;
+
+    if (conf != NULL) {
+        node = (struct switch_struct *)conf->private;
+        /* Detach the list before freeing it */
+        conf->private = NULL;
+        for (; node != NULL; node = next) {
+            next = node->next;
+            GF_FREE(node->array);
+            GF_FREE(node);
+        }
+    }
+
+    dht_fini(this);
+}
+
+/* Append 'node' at the tail of the singly linked pattern list '*head'. */
+static void
+switch_pattern_list_append(struct switch_struct **head,
+                           struct switch_struct *node)
+{
+    struct switch_struct *trav = NULL;
+
+    if (!*head) {
+        /* First entry */
+        *head = node;
+        return;
+    }
+
+    trav = *head;
+    while (trav->next)
+        trav = trav->next;
+    trav->next = node;
+}
+
+/*
+ * Parse the switch pattern option and store the resulting pattern list in
+ * conf->private.
+ *
+ * pattern_str has the form
+ *     *jpg:child1,child2;*mpg:child3
+ * i.e. ';' separated entries, each one "<pattern>:<child>[,<child>...]".
+ * Every child must satisfy gf_switch_valid_child(). Entries whose pattern is
+ * exactly "*" are ignored: a final "*" entry is always generated from the
+ * children that no explicit pattern referenced, and it is an error if no
+ * such child is left.
+ *
+ * Returns 0 on success. Returns -1 on allocation failure or on a malformed
+ * option; in that case everything allocated here is released and
+ * conf->private is left untouched.
+ */
+int
+set_switch_pattern(xlator_t *this, dht_conf_t *conf, const char *pattern_str)
+{
+    int flag = 0;
+    int idx = 0;
+    int index = 0;
+    int child_count = 0;
+    char *tmp = NULL;
+    char *tmp1 = NULL;
+    char *child = NULL;
+    char *tmp_str = NULL;
+    char *tmp_str1 = NULL;
+    char *dup_str = NULL;
+    char *dup_childs = NULL;
+    char *switch_str = NULL;
+    char *pattern = NULL;
+    char *childs = NULL;
+    char *option_string = NULL;
+    size_t pattern_length;
+    struct switch_struct *switch_buf = NULL;
+    struct switch_struct *switch_opt = NULL;
+    struct switch_struct *trav = NULL;
+    struct switch_sched_array *switch_buf_array = NULL;
+    xlator_list_t *trav_xl = NULL;
+
+    /* Build a scratch table with one slot per child translator. It is only
+     * used here, to track which children an explicit pattern has claimed. */
+    trav_xl = this->children;
+    while (trav_xl) {
+        index++;
+        trav_xl = trav_xl->next;
+    }
+    child_count = index;
+    switch_buf_array = GF_CALLOC((index + 1), sizeof(struct switch_sched_array),
+                                 gf_switch_mt_switch_sched_array);
+    if (!switch_buf_array)
+        goto err;
+
+    trav_xl = this->children;
+    index = 0;
+
+    while (trav_xl) {
+        switch_buf_array[index].xl = trav_xl->xlator;
+        switch_buf_array[index].eligible = 1;
+        trav_xl = trav_xl->next;
+        index++;
+    }
+
+    /*  *jpg:child1,child2;*mpg:child3;*:child4,child5,child6 */
+
+    /* strtok_r() modifies its input, so work on a private copy */
+    option_string = gf_strdup(pattern_str);
+    if (option_string == NULL) {
+        goto err;
+    }
+    switch_str = strtok_r(option_string, ";", &tmp_str);
+    while (switch_str) {
+        /* 'pattern' and 'childs' point into dup_str, so it has to stay
+         * allocated until this entry is completely processed. */
+        dup_str = gf_strdup(switch_str);
+        if (dup_str == NULL) {
+            goto err;
+        }
+
+        pattern = strtok_r(dup_str, ":", &tmp_str1);
+        childs = strtok_r(NULL, ":", &tmp_str1);
+        if (!pattern) {
+            /* The entry consisted of ':' characters only */
+            gf_msg(this->name, GF_LOG_ERROR, 0,
+                   DHT_MSG_SET_SWITCH_PATTERN_ERROR,
+                   "Missing pattern in \"%s\"", switch_str);
+            goto err;
+        }
+        if (strncmp(pattern, "*", 2) == 0) {
+            gf_msg("switch", GF_LOG_INFO, 0, DHT_MSG_SWITCH_PATTERN_INFO,
+                   "'*' pattern will be taken by default "
+                   "for all the unconfigured child nodes,"
+                   " hence neglecting current option");
+            GF_FREE(dup_str);
+            dup_str = NULL;
+            switch_str = strtok_r(NULL, ";", &tmp_str);
+            continue;
+        }
+
+        switch_opt = GF_CALLOC(1, sizeof(struct switch_struct),
+                               gf_switch_mt_switch_struct);
+        if (!switch_opt)
+            goto err;
+
+        pattern_length = strlen(pattern);
+        if (pattern_length >= (sizeof(switch_opt->path_pattern))) {
+            gf_msg(this->name, GF_LOG_ERROR, 0,
+                   DHT_MSG_SET_SWITCH_PATTERN_ERROR, "Pattern (%s) too long",
+                   pattern);
+            goto err;
+        }
+        memcpy(switch_opt->path_pattern, pattern, pattern_length);
+        switch_opt->path_pattern[pattern_length] = '\0';
+
+        if (!childs) {
+            /* error */
+            gf_msg("switch", GF_LOG_ERROR, 0, DHT_MSG_SET_SWITCH_PATTERN_ERROR,
+                   "Check \"scheduler.switch.case\" "
+                   "option in unify volume. Exiting");
+            goto err;
+        }
+
+        /* First pass, on a throw-away copy: validate and count the children.
+         * The counter must restart for every entry, otherwise num_child
+         * would also include the children of the previous entries. */
+        dup_childs = gf_strdup(childs);
+        if (dup_childs == NULL) {
+            goto err;
+        }
+        idx = 0;
+        child = strtok_r(dup_childs, ",", &tmp);
+        while (child) {
+            if (!gf_switch_valid_child(this, child)) {
+                gf_msg(this->name, GF_LOG_ERROR, 0, DHT_MSG_SUBVOL_ERROR,
+                       "%s is not a subvolume of %s. "
+                       "pattern can only be scheduled "
+                       "only to a subvolume of %s",
+                       child, this->name, this->name);
+                GF_FREE(dup_childs);
+                goto err;
+            }
+            idx++;
+            child = strtok_r(NULL, ",", &tmp);
+        }
+        GF_FREE(dup_childs);
+        dup_childs = NULL;
+
+        if (idx == 0) {
+            /* A child list made only of ',' names no child at all; a
+             * pattern without children cannot be scheduled anywhere. */
+            gf_msg("switch", GF_LOG_ERROR, 0, DHT_MSG_SET_SWITCH_PATTERN_ERROR,
+                   "Check \"scheduler.switch.case\" "
+                   "option in unify volume. Exiting");
+            goto err;
+        }
+
+        switch_opt->num_child = idx;
+        switch_opt->array = GF_CALLOC(
+            1, (idx * sizeof(struct switch_sched_array)),
+            gf_switch_mt_switch_sched_array);
+        if (!switch_opt->array)
+            goto err;
+
+        /* Second pass: resolve every child name to its translator */
+        idx = 0;
+        child = strtok_r(childs, ",", &tmp1);
+        while (child) {
+            for (index = 0; index < child_count; index++) {
+                if (strcmp(switch_buf_array[index].xl->name, child) == 0) {
+                    gf_msg_debug("switch", 0,
+                                 "'%s' pattern will be "
+                                 "scheduled to \"%s\"",
+                                 switch_opt->path_pattern, child);
+                    switch_opt->array[idx].xl = switch_buf_array[index].xl;
+                    switch_buf_array[index].considered = 1;
+                    idx++;
+                    break;
+                }
+            }
+            child = strtok_r(NULL, ",", &tmp1);
+        }
+
+        /* Nothing points into dup_str any more */
+        GF_FREE(dup_str);
+        dup_str = NULL;
+
+        /* Link it to the main structure */
+        switch_pattern_list_append(&switch_buf, switch_opt);
+        switch_opt = NULL;
+        switch_str = strtok_r(NULL, ";", &tmp_str);
+    }
+
+    /* Now, all the pattern based considerations done, so for all the
+     * remaining pattern, '*' to all the remaining child nodes
+     */
+    {
+        for (index = 0; index < child_count; index++) {
+            /* check for considered flag */
+            if (switch_buf_array[index].considered)
+                continue;
+            flag++;
+        }
+        if (!flag) {
+            gf_msg("switch", GF_LOG_ERROR, 0, DHT_MSG_SET_SWITCH_PATTERN_ERROR,
+                   "No nodes left for pattern '*'. Exiting");
+            goto err;
+        }
+        switch_opt = GF_CALLOC(1, sizeof(struct switch_struct),
+                               gf_switch_mt_switch_struct);
+        if (!switch_opt)
+            goto err;
+
+        /* Add the '*' pattern to the array */
+        memcpy(switch_opt->path_pattern, "*", 2);
+        switch_opt->num_child = flag;
+        switch_opt->array = GF_CALLOC(1,
+                                      flag * sizeof(struct switch_sched_array),
+                                      gf_switch_mt_switch_sched_array);
+        if (!switch_opt->array)
+            goto err;
+        flag = 0;
+        for (index = 0; index < child_count; index++) {
+            /* check for considered flag */
+            if (switch_buf_array[index].considered)
+                continue;
+            gf_msg_debug("switch", 0,
+                         "'%s'"
+                         " pattern will be scheduled to \"%s\"",
+                         switch_opt->path_pattern,
+                         switch_buf_array[index].xl->name);
+
+            switch_opt->array[flag].xl = switch_buf_array[index].xl;
+            switch_buf_array[index].considered = 1;
+            flag++;
+        }
+        switch_pattern_list_append(&switch_buf, switch_opt);
+        switch_opt = NULL;
+    }
+
+    conf->private = switch_buf;
+
+    /* The pattern entries copied the xlator pointers out of the scratch
+     * table, so it is not needed any more. */
+    GF_FREE(switch_buf_array);
+    GF_FREE(option_string);
+    return 0;
+err:
+    GF_FREE(switch_buf_array);
+    if (switch_opt)
+        GF_FREE(switch_opt->array);
+    GF_FREE(switch_opt);
+    GF_FREE(dup_str);
+    GF_FREE(option_string);
+
+    /* Release the entries that were already linked into the list */
+    trav = switch_buf;
+    while (trav) {
+        GF_FREE(trav->array);
+        switch_opt = trav;
+        trav = trav->next;
+        GF_FREE(switch_opt);
+    }
+    return -1;
+}
+
+/*
+ * Translator init: run the common dht_init() and, when the
+ * "pattern.switch.case" option is present, parse it with
+ * set_switch_pattern().
+ *
+ * Returns 0 on success, the dht_init() error code if that fails, or -1
+ * (after calling dht_fini()) if the pattern cannot be parsed.
+ */
+int32_t
+switch_init(xlator_t *this)
+{
+    dht_conf_t *conf = NULL;
+    data_t *pattern_opt = NULL;
+    int ret = -1;
+
+    ret = dht_init(this);
+    if (ret != 0)
+        return ret;
+
+    conf = this->private;
+
+    pattern_opt = dict_get(this->options, "pattern.switch.case");
+    if (pattern_opt != NULL) {
+        /* TODO: */
+        ret = set_switch_pattern(this, conf, pattern_opt->data);
+        if (ret != 0) {
+            dht_fini(this);
+            return -1;
+        }
+    }
+
+    this->private = conf;
+    return 0;
+}
+
+/*
+ * File operation table of the switch translator. lookup, create and mknod
+ * use the switch_* implementations; every other operation listed here uses
+ * the corresponding dht_* handler.
+ */
+struct xlator_fops fops = {
+    .lookup = switch_lookup,
+    .create = switch_create,
+    .mknod = switch_mknod,
+
+    .stat = dht_stat,
+    .fstat = dht_fstat,
+    .truncate = dht_truncate,
+    .ftruncate = dht_ftruncate,
+    .access = dht_access,
+    .readlink = dht_readlink,
+    .setxattr = dht_setxattr,
+    .getxattr = dht_getxattr,
+    .removexattr = dht_removexattr,
+    .open = dht_open,
+    .readv = dht_readv,
+    .writev = dht_writev,
+    .flush = dht_flush,
+    .fsync = dht_fsync,
+    .statfs = dht_statfs,
+    .lk = dht_lk,
+    .opendir = dht_opendir,
+    .readdir = dht_readdir,
+    .readdirp = dht_readdirp,
+    .fsyncdir = dht_fsyncdir,
+    .symlink = dht_symlink,
+    .unlink = dht_unlink,
+    .link = dht_link,
+    .mkdir = dht_mkdir,
+    .rmdir = dht_rmdir,
+    .rename = dht_rename,
+    .inodelk = dht_inodelk,
+    .finodelk = dht_finodelk,
+    .entrylk = dht_entrylk,
+    .fentrylk = dht_fentrylk,
+    .xattrop = dht_xattrop,
+    .fxattrop = dht_fxattrop,
+    .setattr = dht_setattr,
+};
+
+/* Callback table: only 'forget' is set, and it uses the dht handler. */
+struct xlator_cbks cbks = {.forget = dht_forget};
+extern int32_t
+mem_acct_init(xlator_t *this);
+
+/*
+ * Translator descriptor for "switch". init/fini are the switch_* functions
+ * defined in this file; notify, reconfigure and the option table come from
+ * dht.
+ */
+xlator_api_t xlator_api = {
+    .init = switch_init,
+    .fini = switch_fini,
+    .notify = dht_notify,
+    .reconfigure = dht_reconfigure,
+    .mem_acct_init = mem_acct_init,
+    .op_version = {1}, /* Present from the initial version */
+    .fops = &fops,
+    .cbks = &cbks,
+    .options = dht_options,
+    .identifier = "switch",
+    .category = GF_TECH_PREVIEW,
+};
diff --git a/xlators/cluster/dht/src/unittest/dht_layout_mock.c b/xlators/cluster/dht/src/unittest/dht_layout_mock.c
new file mode 100644
index 00000000000..771452963d1
--- /dev/null
+++ b/xlators/cluster/dht/src/unittest/dht_layout_mock.c
@@ -0,0 +1,73 @@
+/*
+  Copyright (c) 2014 Red Hat, Inc. <http://www.redhat.com>
+  This file is part of GlusterFS.
+
+  This file is licensed to you under your choice of the GNU Lesser
+  General Public License, version 3 or any later version (LGPLv3 or
+  later), or the GNU General Public License, version 2 (GPLv2), in all
+  cases as published by the Free Software Foundation.
+*/
+#include <glusterfs/glusterfs.h>
+#include <glusterfs/xlator.h>
+#include "dht-common.h"
+#include <glusterfs/byte-order.h>
+
+/* Mock of dht_hash_compute(): ignores its arguments, does not write to
+ * *hash_p and always returns 0. */
+int
+dht_hash_compute(xlator_t *this, int type, const char *name, uint32_t *hash_p)
+{
+    return 0;
+}
+
+/* Mock of dht_inode_ctx_layout_get(): does not write to *layout and always
+ * returns 0. */
+int
+dht_inode_ctx_layout_get(inode_t *inode, xlator_t *this, dht_layout_t **layout)
+{
+    return 0;
+}
+
+/* Mock of dht_inode_ctx_layout_set(): stores nothing and always returns 0. */
+int
+dht_inode_ctx_layout_set(inode_t *inode, xlator_t *this,
+                         dht_layout_t *layout_int)
+{
+    return 0;
+}
+
+/* Mock of dict_get_ptr(): does not write to *ptr and always returns 0. */
+int
+dict_get_ptr(dict_t *this, char *key, void **ptr)
+{
+    return 0;
+}
+
+/* Mock of dict_get_ptr_and_len(): does not write to *ptr or *len and always
+ * returns 0. */
+int
+dict_get_ptr_and_len(dict_t *this, char *key, void **ptr, int *len)
+{
+    return 0;
+}
+
+/* Mock of _gf_log(): discards the message and always returns 0. */
+int
+_gf_log(const char *domain, const char *file, const char *function,
+        int32_t line, gf_loglevel_t level, const char *fmt, ...)
+{
+    return 0;
+}
+
+/* Mock of _gf_log_callingfn(): discards the message and always returns 0. */
+int
+_gf_log_callingfn(const char *domain, const char *file, const char *function,
+                  int32_t line, gf_loglevel_t level, const char *fmt, ...)
+{
+    return 0;
+}
+
+/* Mock of gf_uuid_unparse(): currently a no-op, 'out' is left unmodified. */
+void
+gf_uuid_unparse(const uuid_t uu, char *out)
+{
+    // could call a will-return function here
+    // to place the correct data in *out
+}
+
+/* Mock of _gf_msg(): discards the message and always returns 0. */
+int
+_gf_msg(const char *domain, const char *file, const char *function,
+        int32_t line, gf_loglevel_t level, int errnum, int trace,
+        uint64_t msgid, const char *fmt, ...)
+{
+    return 0;
+}
diff --git a/xlators/cluster/dht/src/unittest/dht_layout_unittest.c b/xlators/cluster/dht/src/unittest/dht_layout_unittest.c
new file mode 100644
index 00000000000..c94a1d0a2e1
--- /dev/null
+++ b/xlators/cluster/dht/src/unittest/dht_layout_unittest.c
@@ -0,0 +1,127 @@
+/*
+  Copyright (c) 2008-2014 Red Hat, Inc. <http://www.redhat.com>
+  This file is part of GlusterFS.
+
+  This file is licensed to you under your choice of the GNU Lesser
+  General Public License, version 3 or any later version (LGPLv3 or
+  later), or the GNU General Public License, version 2 (GPLv2), in all
+  cases as published by the Free Software Foundation.
+*/
+
+#include "dht-common.h"
+#include <glusterfs/logging.h>
+#include <glusterfs/xlator.h>
+
+#include <inttypes.h>
+#include <stdarg.h>
+#include <stddef.h>
+#include <setjmp.h>
+#include <cmocka_pbc.h>
+#include <cmocka.h>
+
+/*
+ * Helper functions
+ */
+
+/*
+ * Build a zero-initialised xlator_t for the tests.
+ *
+ * The translator gets a memory accounting block with room for 'num_types'
+ * records (each record's lock is initialised) and a zeroed glusterfs_ctx_t.
+ * Everything is allocated with test_calloc(), so it has to be released with
+ * helper_xlator_destroy().
+ *
+ * num_types must be greater than 0. Allocation or lock initialisation
+ * failures fail the running test through the cmocka assertions.
+ */
+static xlator_t *
+helper_xlator_init(uint32_t num_types)
+{
+    xlator_t *xl;
+    uint32_t i;
+    int ret;
+
+    REQUIRE(num_types > 0);
+
+    xl = test_calloc(1, sizeof(xlator_t));
+    assert_non_null(xl);
+
+    /* One allocation: the accounting header followed by num_types records.
+     * It must exist before any of its fields is written. */
+    xl->mem_acct = test_calloc(1, sizeof(struct mem_acct) +
+                                      sizeof(struct mem_acct_rec) * num_types);
+    assert_non_null(xl->mem_acct);
+    xl->mem_acct->num_types = num_types;
+
+    xl->ctx = test_calloc(1, sizeof(glusterfs_ctx_t));
+    assert_non_null(xl->ctx);
+
+    for (i = 0; i < num_types; i++) {
+        ret = LOCK_INIT(&(xl->mem_acct->rec[i].lock));
+        assert_false(ret);
+    }
+
+    ENSURE(num_types == xl->mem_acct->num_types);
+    ENSURE(NULL != xl);
+
+    return xl;
+}
+
+/*
+ * Release a translator created by helper_xlator_init(): destroy the lock of
+ * every accounting record, then free the accounting block, the context and
+ * the translator itself. Always returns 0; a failing LOCK_DESTROY fails the
+ * running test.
+ */
+static int
+helper_xlator_destroy(xlator_t *xl)
+{
+    uint32_t i;
+    int ret;
+
+    for (i = 0; i < xl->mem_acct->num_types; i++) {
+        ret = LOCK_DESTROY(&(xl->mem_acct->rec[i].lock));
+        assert_int_equal(ret, 0);
+    }
+
+    /* The records live inside the mem_acct allocation, so the block itself
+     * is what gets freed. test_calloc() memory is released with test_free(). */
+    test_free(xl->mem_acct);
+    test_free(xl->ctx);
+    test_free(xl);
+    return 0;
+}
+
+/*
+ * Unit tests
+ */
+/*
+ * dht_layout_new(): invalid arguments must trip an assertion; a valid call
+ * must return a layout of the requested size whose 'gen' and 'spread_cnt'
+ * are 0 when xl->private is NULL and are copied from the dht_conf_t
+ * otherwise.
+ */
+static void
+test_dht_layout_new(void **state)
+{
+    xlator_t *xl;
+    dht_layout_t *layout;
+    dht_conf_t *conf;
+    int cnt;
+
+    expect_assert_failure(dht_layout_new(NULL, 0));
+    expect_assert_failure(dht_layout_new((xlator_t *)0x12345, -1));
+    xl = helper_xlator_init(10);
+
+    // xl->private is NULL
+    assert_null(xl->private);
+    cnt = 100;
+    layout = dht_layout_new(xl, cnt);
+    assert_non_null(layout);
+    assert_int_equal(layout->type, DHT_HASH_TYPE_DM);
+    assert_int_equal(layout->cnt, cnt);
+    assert_int_equal(GF_ATOMIC_GET(layout->ref), 1);
+    assert_int_equal(layout->gen, 0);
+    assert_int_equal(layout->spread_cnt, 0);
+    // NOTE(review): layout is allocated inside dht_layout_new(); confirm that
+    // free() matches the allocator used there in the unit-test build
+    free(layout);
+
+    // xl->private is not NULL
+    cnt = 110;
+    conf = (dht_conf_t *)test_calloc(1, sizeof(dht_conf_t));
+    assert_non_null(conf);
+    conf->dir_spread_cnt = 12345;
+    conf->gen = -123;
+    xl->private = conf;
+
+    layout = dht_layout_new(xl, cnt);
+    assert_non_null(layout);
+    assert_int_equal(layout->type, DHT_HASH_TYPE_DM);
+    assert_int_equal(layout->cnt, cnt);
+    assert_int_equal(GF_ATOMIC_GET(layout->ref), 1);
+    assert_int_equal(layout->gen, conf->gen);
+    assert_int_equal(layout->spread_cnt, conf->dir_spread_cnt);
+    free(layout);
+
+    // conf came from test_calloc(), so it is released with test_free()
+    test_free(conf);
+    helper_xlator_destroy(xl);
+}
+
+/* Run the dht layout unit tests and return cmocka's result. */
+int
+main(void)
+{
+    /* cmocka_unit_test() is the initializer that matches struct CMUnitTest;
+     * the legacy unit_test() macro initialises the old UnitTest type. */
+    const struct CMUnitTest xlator_dht_layout_tests[] = {
+        cmocka_unit_test(test_dht_layout_new),
+    };
+
+    return cmocka_run_group_tests(xlator_dht_layout_tests, NULL, NULL);
+}
diff --git a/xlators/cluster/ha/Makefile.am b/xlators/cluster/ec/Makefile.am
index d471a3f9243..d471a3f9243 100644
--- a/xlators/cluster/ha/Makefile.am
+++ b/xlators/cluster/ec/Makefile.am
diff --git a/xlators/cluster/ec/src/Makefile.am b/xlators/cluster/ec/src/Makefile.am
new file mode 100644
index 00000000000..406a636bbc2
--- /dev/null
+++ b/xlators/cluster/ec/src/Makefile.am
@@ -0,0 +1,83 @@
+# Build rules for the ec (disperse) translator, installed as a loadable
+# module in the cluster xlator directory.
+xlator_LTLIBRARIES = ec.la
+xlatordir = $(libdir)/glusterfs/$(PACKAGE_VERSION)/xlator/cluster
+
+# Sources that are always built.
+ec_sources := ec.c
+ec_sources += ec-data.c
+ec_sources += ec-helpers.c
+ec_sources += ec-common.c
+ec_sources += ec-generic.c
+ec_sources += ec-locks.c
+ec_sources += ec-dir-read.c
+ec_sources += ec-dir-write.c
+ec_sources += ec-inode-read.c
+ec_sources += ec-inode-write.c
+ec_sources += ec-combine.c
+ec_sources += ec-method.c
+ec_sources += ec-galois.c
+ec_sources += ec-code.c
+ec_sources += ec-code-c.c
+ec_sources += ec-gf8.c
+ec_sources += ec-heal.c
+ec_sources += ec-heald.c
+
+# Headers that are always part of the module.
+ec_headers := ec.h
+ec_headers += ec-mem-types.h
+ec_headers += ec-helpers.h
+ec_headers += ec-data.h
+ec_headers += ec-fops.h
+ec_headers += ec-common.h
+ec_headers += ec-combine.h
+ec_headers += ec-method.h
+ec_headers += ec-galois.h
+ec_headers += ec-code.h
+ec_headers += ec-code-c.h
+ec_headers += ec-gf8.h
+ec_headers += ec-heald.h
+ec_headers += ec-messages.h
+ec_headers += ec-types.h
+
+# Optional code generators, each added only when its ENABLE_EC_DYNAMIC_*
+# conditional is set.
+if ENABLE_EC_DYNAMIC_INTEL
+  ec_sources += ec-code-intel.c
+  ec_headers += ec-code-intel.h
+endif
+
+if ENABLE_EC_DYNAMIC_X64
+  ec_sources += ec-code-x64.c
+  ec_headers += ec-code-x64.h
+endif
+
+if ENABLE_EC_DYNAMIC_SSE
+  ec_sources += ec-code-sse.c
+  ec_headers += ec-code-sse.h
+endif
+
+if ENABLE_EC_DYNAMIC_AVX
+  ec_sources += ec-code-avx.c
+  ec_headers += ec-code-avx.h
+endif
+
+# libxlator lives outside this directory and is compiled into the module.
+ec_ext_sources = $(top_builddir)/xlators/lib/src/libxlator.c
+
+ec_ext_headers = $(top_builddir)/xlators/lib/src/libxlator.h
+
+ec_la_LDFLAGS = -module $(GF_XLATOR_DEFAULT_LDFLAGS)
+ec_la_SOURCES = $(ec_sources) $(ec_headers) $(ec_ext_sources) $(ec_ext_headers)
+ec_la_LIBADD = $(top_builddir)/libglusterfs/src/libglusterfs.la
+
+AM_CPPFLAGS = $(GF_CPPFLAGS)
+AM_CPPFLAGS += -I$(top_srcdir)/libglusterfs/src
+AM_CPPFLAGS += -I$(top_srcdir)/xlators/lib/src
+AM_CPPFLAGS += -I$(top_srcdir)/rpc/rpc-lib/src
+AM_CPPFLAGS += -I$(top_srcdir)/rpc/xdr/src
+AM_CPPFLAGS += -I$(top_builddir)/rpc/xdr/src
+AM_CPPFLAGS += -DGLUSTERFS_LIBEXECDIR=\"$(GLUSTERFS_LIBEXECDIR)\"
+
+AM_CFLAGS = -Wall $(GF_CFLAGS)
+
+CLEANFILES =
+
+# The module is also reachable as "disperse.so": create that symlink on
+# install and remove it on uninstall.
+install-data-hook:
+	ln -sf ec.so $(DESTDIR)$(xlatordir)/disperse.so
+
+uninstall-local:
+	rm -f $(DESTDIR)$(xlatordir)/disperse.so
diff --git a/xlators/cluster/ec/src/ec-code-avx.c b/xlators/cluster/ec/src/ec-code-avx.c
new file mode 100644
index 00000000000..70afaa00f54
--- /dev/null
+++ b/xlators/cluster/ec/src/ec-code-avx.c
@@ -0,0 +1,109 @@
+/*
+  Copyright (c) 2015 DataLab, s.l. <http://www.datalab.es>
+  This file is part of GlusterFS.
+
+  This file is licensed to you under your choice of the GNU Lesser
+  General Public License, version 3 or any later version (LGPLv3 or
+  later), or the GNU General Public License, version 2 (GPLv2), in all
+  cases as published by the Free Software Foundation.
+*/
+
+#include <errno.h>
+
+#include "ec-code-intel.h"
+
+/* Prolog hook of the AVX generator: records the builder's current address as
+ * the loop start, which ec_code_avx_epilog() uses as its jump target. */
+static void
+ec_code_avx_prolog(ec_code_builder_t *builder)
+{
+    builder->loop = builder->address;
+}
+
+/*
+ * Epilog hook of the AVX generator. Through the ec_code_intel_op_* helpers it
+ * adds 32 to REG_DX and to REG_DI, tests REG_DX against (width - 1), jumps
+ * back to builder->loop on "not equal" and finally emits a return.
+ * NOTE(review): 32 matches the .width of ec_code_gen_avx; presumably it is
+ * the number of bytes handled per iteration - confirm.
+ */
+static void
+ec_code_avx_epilog(ec_code_builder_t *builder)
+{
+    ec_code_intel_op_add_i2r(builder, 32, REG_DX);
+    ec_code_intel_op_add_i2r(builder, 32, REG_DI);
+    ec_code_intel_op_test_i2r(builder, builder->width - 1, REG_DX);
+    ec_code_intel_op_jne(builder, builder->loop);
+
+    ec_code_intel_op_ret(builder, 0);
+}
+
+/*
+ * Load hook of the AVX generator: moves the data selected by (idx, bit) from
+ * memory into AVX register 'dst'.
+ *
+ * In linear mode the data is addressed from REG_SI with an offset computed
+ * from idx and bit. Otherwise REG_AX is first loaded from the slot at
+ * idx * 8 relative to REG_SI (skipped when builder->base already equals
+ * idx) and the data is addressed from REG_AX.
+ */
+static void
+ec_code_avx_load(ec_code_builder_t *builder, uint32_t dst, uint32_t idx,
+                 uint32_t bit)
+{
+    if (!builder->linear) {
+        /* Reload the base register only when the index changed */
+        if (builder->base != idx) {
+            ec_code_intel_op_mov_m2r(builder, REG_SI, REG_NULL, 0, idx * 8,
+                                     REG_AX);
+            builder->base = idx;
+        }
+        ec_code_intel_op_mov_m2avx(builder, REG_AX, REG_DX, 1,
+                                   bit * builder->width, dst);
+        return;
+    }
+
+    ec_code_intel_op_mov_m2avx(
+        builder, REG_SI, REG_DX, 1,
+        idx * builder->width * builder->bits + bit * builder->width, dst);
+}
+
+/* Store hook of the AVX generator: moves AVX register 'src' to memory at
+ * offset bit * width relative to REG_DI. */
+static void
+ec_code_avx_store(ec_code_builder_t *builder, uint32_t src, uint32_t bit)
+{
+    ec_code_intel_op_mov_avx2m(builder, src, REG_DI, REG_NULL, 0,
+                               bit * builder->width);
+}
+
+/* Copy hook of the AVX generator: moves AVX register 'src' into 'dst'. */
+static void
+ec_code_avx_copy(ec_code_builder_t *builder, uint32_t dst, uint32_t src)
+{
+    ec_code_intel_op_mov_avx2avx(builder, src, dst);
+}
+
+/* Two-operand xor hook of the AVX generator: xors AVX register 'src' into
+ * 'dst'. */
+static void
+ec_code_avx_xor2(ec_code_builder_t *builder, uint32_t dst, uint32_t src)
+{
+    ec_code_intel_op_xor_avx2avx(builder, src, dst);
+}
+
+/* Three-operand xor hook of the AVX generator, built from two operations:
+ * 'src1' is first moved into 'dst', then 'src2' is xored into 'dst'. */
+static void
+ec_code_avx_xor3(ec_code_builder_t *builder, uint32_t dst, uint32_t src1,
+                 uint32_t src2)
+{
+    ec_code_intel_op_mov_avx2avx(builder, src1, dst);
+    ec_code_intel_op_xor_avx2avx(builder, src2, dst);
+}
+
+/*
+ * Memory xor hook of the AVX generator: xors the data selected by (idx, bit)
+ * from memory into AVX register 'dst'.
+ *
+ * The source is addressed exactly as in ec_code_avx_load(): from REG_SI in
+ * linear mode, otherwise from REG_AX, which is reloaded from the slot at
+ * idx * 8 whenever builder->base differs from idx.
+ */
+static void
+ec_code_avx_xorm(ec_code_builder_t *builder, uint32_t dst, uint32_t idx,
+                 uint32_t bit)
+{
+    if (!builder->linear) {
+        /* Reload the base register only when the index changed */
+        if (builder->base != idx) {
+            ec_code_intel_op_mov_m2r(builder, REG_SI, REG_NULL, 0, idx * 8,
+                                     REG_AX);
+            builder->base = idx;
+        }
+        ec_code_intel_op_xor_m2avx(builder, REG_AX, REG_DX, 1,
+                                   bit * builder->width, dst);
+        return;
+    }
+
+    ec_code_intel_op_xor_m2avx(
+        builder, REG_SI, REG_DX, 1,
+        idx * builder->width * builder->bits + bit * builder->width, dst);
+}
+
+/* NULL-terminated list of flags attached to the AVX generator.
+ * NOTE(review): presumably CPU feature flags that must be present for this
+ * generator to be selected - confirm against the code that reads .flags. */
+static char *ec_code_avx_needed_flags[] = {"avx2", NULL};
+
+/* Descriptor of the "avx" code generator: a width of 32 plus the hooks
+ * defined above in this file. */
+ec_code_gen_t ec_code_gen_avx = {.name = "avx",
+                                 .flags = ec_code_avx_needed_flags,
+                                 .width = 32,
+                                 .prolog = ec_code_avx_prolog,
+                                 .epilog = ec_code_avx_epilog,
+                                 .load = ec_code_avx_load,
+                                 .store = ec_code_avx_store,
+                                 .copy = ec_code_avx_copy,
+                                 .xor2 = ec_code_avx_xor2,
+                                 .xor3 = ec_code_avx_xor3,
+                                 .xorm = ec_code_avx_xorm};
diff --git a/xlators/cluster/ec/src/ec-code-avx.h b/xlators/cluster/ec/src/ec-code-avx.h
new file mode 100644
index 00000000000..fdca4ad2c8f
--- /dev/null
+++ b/xlators/cluster/ec/src/ec-code-avx.h
@@ -0,0 +1,18 @@
+/*
+  Copyright (c) 2015 DataLab, s.l. <http://www.datalab.es>
+  This file is part of GlusterFS.
+
+  This file is licensed to you under your choice of the GNU Lesser
+  General Public License, version 3 or any later version (LGPLv3 or
+  later), or the GNU General Public License, version 2 (GPLv2), in all
+  cases as published by the Free Software Foundation.
+*/
+
+#ifndef __EC_CODE_AVX_H__
+#define __EC_CODE_AVX_H__
+
+#include "ec-code.h"
+
+extern ec_code_gen_t ec_code_gen_avx;
+
+#endif /* __EC_CODE_AVX_H__ */
diff --git a/xlators/cluster/ec/src/ec-code-c.c b/xlators/cluster/ec/src/ec-code-c.c
new file mode 100644
index 00000000000..acdc665c2cf
--- /dev/null
+++ b/xlators/cluster/ec/src/ec-code-c.c
@@ -0,0 +1,11679 @@
+/*
+  Copyright (c) 2015 DataLab, s.l. <http://www.datalab.es>
+  This file is part of GlusterFS.
+
+  This file is licensed to you under your choice of the GNU Lesser
+  General Public License, version 3 or any later version (LGPLv3 or
+  later), or the GNU General Public License, version 2 (GPLv2), in all
+  cases as published by the Free Software Foundation.
+*/
+
+#include <inttypes.h>
+#include <string.h>
+
+#include "ec-method.h"
+#include "ec-code-c.h"
+
+#define WIDTH (EC_METHOD_WORD_SIZE / sizeof(uint64_t)) /* in uint64_t units */
+
+static void
+gf8_muladd_00(void *out, void *in)
+{
+    memcpy(out, in, EC_METHOD_WORD_SIZE * 8); /* overwrite 'out' with 'in' */
+}
+
+static void
+gf8_muladd_01(void *out, void *in)
+{
+    /* XOR every word of 'in' into the matching word of 'out'. */
+    uint64_t *dst = (uint64_t *)out;
+    uint64_t *src = (uint64_t *)in;
+    unsigned int col;
+    unsigned int row;
+
+    /*
+     * Each buffer is accessed as 8 rows of WIDTH words. Columns are visited
+     * in the outer loop and rows in the inner one, so every word is updated
+     * in the same order as a fully unrolled per-column sequence.
+     */
+    for (col = 0; col < WIDTH; col++) {
+        for (row = 0; row < 8; row++) {
+            /* Word (row, col) sits at offset row * WIDTH + col. */
+            dst[row * WIDTH + col] ^= src[row * WIDTH + col];
+        }
+    }
+}
+
+static void
+gf8_muladd_02(void *out, void *in)
+{
+    unsigned int i;
+    uint64_t *in_ptr = (uint64_t *)in;
+    uint64_t *out_ptr = (uint64_t *)out;
+    /* Updates 'out' in place; each buffer is read as 8 rows of WIDTH words. */
+    for (i = 0; i < WIDTH; i++) {
+        uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+        /* inN holds row N of 'out' (the running value), not of 'in'. */
+        uint64_t in0 = out_ptr[0];
+        uint64_t in1 = out_ptr[WIDTH];
+        uint64_t in2 = out_ptr[WIDTH * 2];
+        uint64_t in3 = out_ptr[WIDTH * 3];
+        uint64_t in4 = out_ptr[WIDTH * 4];
+        uint64_t in5 = out_ptr[WIDTH * 5];
+        uint64_t in6 = out_ptr[WIDTH * 6];
+        uint64_t in7 = out_ptr[WIDTH * 7];
+        /* XOR network; per the name, presumably a GF(2^8) multiply by 0x02. */
+        out0 = in7;
+        out1 = in0;
+        out7 = in6;
+        out5 = in4;
+        out6 = in5;
+        out3 = in2 ^ in7;
+        out4 = in3 ^ in7;
+        out2 = in1 ^ in7;
+        /* XOR in the matching row of 'in' and store back into 'out'. */
+        out_ptr[0] = out0 ^ in_ptr[0];
+        out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH];
+        out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2];
+        out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3];
+        out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4];
+        out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5];
+        out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6];
+        out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7];
+        /* Step both cursors to the next word of every row. */
+        in_ptr++;
+        out_ptr++;
+    }
+}
+
+static void
+gf8_muladd_03(void *out, void *in)
+{
+    unsigned int i;
+    uint64_t *in_ptr = (uint64_t *)in;
+    uint64_t *out_ptr = (uint64_t *)out;
+    /* Updates 'out' in place; each buffer is read as 8 rows of WIDTH words. */
+    for (i = 0; i < WIDTH; i++) {
+        uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+        uint64_t tmp0;
+        /* inN holds row N of 'out' (the running value), not of 'in'. */
+        uint64_t in0 = out_ptr[0];
+        uint64_t in1 = out_ptr[WIDTH];
+        uint64_t in2 = out_ptr[WIDTH * 2];
+        uint64_t in3 = out_ptr[WIDTH * 3];
+        uint64_t in4 = out_ptr[WIDTH * 4];
+        uint64_t in5 = out_ptr[WIDTH * 5];
+        uint64_t in6 = out_ptr[WIDTH * 6];
+        uint64_t in7 = out_ptr[WIDTH * 7];
+        /* XOR network; per the name, presumably a GF(2^8) multiply by 0x03. */
+        out0 = in0 ^ in7;
+        tmp0 = in2 ^ in7;
+        out1 = in0 ^ in1;
+        out7 = in6 ^ in7;
+        out5 = in4 ^ in5;
+        out6 = in5 ^ in6;
+        out4 = in3 ^ in4 ^ in7;
+        out2 = tmp0 ^ in1;
+        out3 = tmp0 ^ in3;
+        /* XOR in the matching row of 'in' and store back into 'out'. */
+        out_ptr[0] = out0 ^ in_ptr[0];
+        out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH];
+        out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2];
+        out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3];
+        out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4];
+        out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5];
+        out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6];
+        out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7];
+        /* Step both cursors to the next word of every row. */
+        in_ptr++;
+        out_ptr++;
+    }
+}
+
+static void
+gf8_muladd_04(void *out, void *in)
+{
+    unsigned int i;
+    uint64_t *in_ptr = (uint64_t *)in;
+    uint64_t *out_ptr = (uint64_t *)out;
+    /* Updates 'out' in place; each buffer is read as 8 rows of WIDTH words. */
+    for (i = 0; i < WIDTH; i++) {
+        uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+        uint64_t tmp0;
+        /* inN holds row N of 'out' (the running value), not of 'in'. */
+        uint64_t in0 = out_ptr[0];
+        uint64_t in1 = out_ptr[WIDTH];
+        uint64_t in2 = out_ptr[WIDTH * 2];
+        uint64_t in3 = out_ptr[WIDTH * 3];
+        uint64_t in4 = out_ptr[WIDTH * 4];
+        uint64_t in5 = out_ptr[WIDTH * 5];
+        uint64_t in6 = out_ptr[WIDTH * 6];
+        uint64_t in7 = out_ptr[WIDTH * 7];
+        /* XOR network; per the name, presumably a GF(2^8) multiply by 0x04. */
+        out0 = in6;
+        out1 = in7;
+        out7 = in5;
+        out6 = in4;
+        tmp0 = in6 ^ in7;
+        out2 = in0 ^ in6;
+        out5 = in3 ^ in7;
+        out3 = tmp0 ^ in1;
+        out4 = tmp0 ^ in2;
+        /* XOR in the matching row of 'in' and store back into 'out'. */
+        out_ptr[0] = out0 ^ in_ptr[0];
+        out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH];
+        out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2];
+        out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3];
+        out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4];
+        out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5];
+        out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6];
+        out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7];
+        /* Step both cursors to the next word of every row. */
+        in_ptr++;
+        out_ptr++;
+    }
+}
+
+static void
+gf8_muladd_05(void *out, void *in)
+{
+    unsigned int i;
+    uint64_t *in_ptr = (uint64_t *)in;
+    uint64_t *out_ptr = (uint64_t *)out;
+    /* Updates 'out' in place; each buffer is read as 8 rows of WIDTH words. */
+    for (i = 0; i < WIDTH; i++) {
+        uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+        /* inN holds row N of 'out' (the running value), not of 'in'. */
+        uint64_t in0 = out_ptr[0];
+        uint64_t in1 = out_ptr[WIDTH];
+        uint64_t in2 = out_ptr[WIDTH * 2];
+        uint64_t in3 = out_ptr[WIDTH * 3];
+        uint64_t in4 = out_ptr[WIDTH * 4];
+        uint64_t in5 = out_ptr[WIDTH * 5];
+        uint64_t in6 = out_ptr[WIDTH * 6];
+        uint64_t in7 = out_ptr[WIDTH * 7];
+        /* XOR network; per the name, presumably a GF(2^8) multiply by 0x05. */
+        out0 = in0 ^ in6;
+        out1 = in1 ^ in7;
+        out7 = in5 ^ in7;
+        out6 = in4 ^ in6;
+        out2 = out0 ^ in2;
+        out3 = out1 ^ in3 ^ in6;
+        out5 = out7 ^ in3;
+        out4 = out6 ^ in2 ^ in7;
+        /* XOR in the matching row of 'in' and store back into 'out'. */
+        out_ptr[0] = out0 ^ in_ptr[0];
+        out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH];
+        out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2];
+        out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3];
+        out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4];
+        out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5];
+        out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6];
+        out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7];
+        /* Step both cursors to the next word of every row. */
+        in_ptr++;
+        out_ptr++;
+    }
+}
+
+static void
+gf8_muladd_06(void *out, void *in)
+{
+    unsigned int i;
+    uint64_t *in_ptr = (uint64_t *)in;
+    uint64_t *out_ptr = (uint64_t *)out;
+    /* Updates 'out' in place; each buffer is read as 8 rows of WIDTH words. */
+    for (i = 0; i < WIDTH; i++) {
+        uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+        uint64_t tmp0;
+        /* inN holds row N of 'out' (the running value), not of 'in'. */
+        uint64_t in0 = out_ptr[0];
+        uint64_t in1 = out_ptr[WIDTH];
+        uint64_t in2 = out_ptr[WIDTH * 2];
+        uint64_t in3 = out_ptr[WIDTH * 3];
+        uint64_t in4 = out_ptr[WIDTH * 4];
+        uint64_t in5 = out_ptr[WIDTH * 5];
+        uint64_t in6 = out_ptr[WIDTH * 6];
+        uint64_t in7 = out_ptr[WIDTH * 7];
+        /* XOR network; per the name, presumably a GF(2^8) multiply by 0x06. */
+        out0 = in6 ^ in7;
+        tmp0 = in1 ^ in6;
+        out1 = in0 ^ in7;
+        out7 = in5 ^ in6;
+        out6 = in4 ^ in5;
+        out4 = in2 ^ in3 ^ in6;
+        out5 = in3 ^ in4 ^ in7;
+        out3 = tmp0 ^ in2;
+        out2 = tmp0 ^ out1;
+        /* XOR in the matching row of 'in' and store back into 'out'. */
+        out_ptr[0] = out0 ^ in_ptr[0];
+        out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH];
+        out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2];
+        out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3];
+        out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4];
+        out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5];
+        out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6];
+        out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7];
+        /* Step both cursors to the next word of every row. */
+        in_ptr++;
+        out_ptr++;
+    }
+}
+
+static void
+gf8_muladd_07(void *out, void *in)
+{
+    unsigned int i;
+    uint64_t *in_ptr = (uint64_t *)in;
+    uint64_t *out_ptr = (uint64_t *)out;
+    /* Updates 'out' in place; each buffer is read as 8 rows of WIDTH words. */
+    for (i = 0; i < WIDTH; i++) {
+        uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+        uint64_t tmp0, tmp1, tmp2, tmp3;
+        /* inN holds row N of 'out' (the running value), not of 'in'. */
+        uint64_t in0 = out_ptr[0];
+        uint64_t in1 = out_ptr[WIDTH];
+        uint64_t in2 = out_ptr[WIDTH * 2];
+        uint64_t in3 = out_ptr[WIDTH * 3];
+        uint64_t in4 = out_ptr[WIDTH * 4];
+        uint64_t in5 = out_ptr[WIDTH * 5];
+        uint64_t in6 = out_ptr[WIDTH * 6];
+        uint64_t in7 = out_ptr[WIDTH * 7];
+        /* XOR network; per the name, presumably a GF(2^8) multiply by 0x07. */
+        tmp0 = in2 ^ in6;
+        tmp1 = in5 ^ in6;
+        tmp2 = in0 ^ in7;
+        tmp3 = tmp0 ^ in3;
+        out6 = tmp1 ^ in4;
+        out7 = tmp1 ^ in7;
+        out0 = tmp2 ^ in6;
+        out1 = tmp2 ^ in1;
+        out3 = tmp3 ^ in1;
+        out4 = tmp3 ^ in4;
+        out5 = out4 ^ out7 ^ in2;
+        out2 = tmp0 ^ out1;
+        /* XOR in the matching row of 'in' and store back into 'out'. */
+        out_ptr[0] = out0 ^ in_ptr[0];
+        out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH];
+        out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2];
+        out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3];
+        out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4];
+        out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5];
+        out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6];
+        out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7];
+        /* Step both cursors to the next word of every row. */
+        in_ptr++;
+        out_ptr++;
+    }
+}
+
+static void
+gf8_muladd_08(void *out, void *in)
+{
+    unsigned int i;
+    uint64_t *in_ptr = (uint64_t *)in;
+    uint64_t *out_ptr = (uint64_t *)out;
+    /* Updates 'out' in place; each buffer is read as 8 rows of WIDTH words. */
+    for (i = 0; i < WIDTH; i++) {
+        uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+        /* inN holds row N of 'out' (the running value), not of 'in'. */
+        uint64_t in0 = out_ptr[0];
+        uint64_t in1 = out_ptr[WIDTH];
+        uint64_t in2 = out_ptr[WIDTH * 2];
+        uint64_t in3 = out_ptr[WIDTH * 3];
+        uint64_t in4 = out_ptr[WIDTH * 4];
+        uint64_t in5 = out_ptr[WIDTH * 5];
+        uint64_t in6 = out_ptr[WIDTH * 6];
+        uint64_t in7 = out_ptr[WIDTH * 7];
+        /* XOR network; per the name, presumably a GF(2^8) multiply by 0x08. */
+        out0 = in5;
+        out1 = in6;
+        out7 = in4;
+        out6 = in3 ^ in7;
+        out3 = in0 ^ in5 ^ in6;
+        out5 = in2 ^ in6 ^ in7;
+        out2 = in5 ^ in7;
+        out4 = out2 ^ in1 ^ in6;
+        /* XOR in the matching row of 'in' and store back into 'out'. */
+        out_ptr[0] = out0 ^ in_ptr[0];
+        out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH];
+        out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2];
+        out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3];
+        out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4];
+        out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5];
+        out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6];
+        out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7];
+        /* Step both cursors to the next word of every row. */
+        in_ptr++;
+        out_ptr++;
+    }
+}
+
+static void
+gf8_muladd_09(void *out, void *in)
+{
+    unsigned int i;
+    uint64_t *in_ptr = (uint64_t *)in;
+    uint64_t *out_ptr = (uint64_t *)out;
+    /* Updates 'out' in place; each buffer is read as 8 rows of WIDTH words. */
+    for (i = 0; i < WIDTH; i++) {
+        uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+        uint64_t tmp0;
+        /* inN holds row N of 'out' (the running value), not of 'in'. */
+        uint64_t in0 = out_ptr[0];
+        uint64_t in1 = out_ptr[WIDTH];
+        uint64_t in2 = out_ptr[WIDTH * 2];
+        uint64_t in3 = out_ptr[WIDTH * 3];
+        uint64_t in4 = out_ptr[WIDTH * 4];
+        uint64_t in5 = out_ptr[WIDTH * 5];
+        uint64_t in6 = out_ptr[WIDTH * 6];
+        uint64_t in7 = out_ptr[WIDTH * 7];
+        /* XOR network; per the name, presumably a GF(2^8) multiply by 0x09. */
+        out0 = in0 ^ in5;
+        tmp0 = in3 ^ in6;
+        out1 = in1 ^ in6;
+        out7 = in4 ^ in7;
+        out2 = in2 ^ in5 ^ in7;
+        out3 = tmp0 ^ out0;
+        out6 = tmp0 ^ in7;
+        out4 = out1 ^ out7 ^ in5;
+        out5 = out2 ^ in6;
+        /* XOR in the matching row of 'in' and store back into 'out'. */
+        out_ptr[0] = out0 ^ in_ptr[0];
+        out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH];
+        out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2];
+        out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3];
+        out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4];
+        out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5];
+        out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6];
+        out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7];
+        /* Step both cursors to the next word of every row. */
+        in_ptr++;
+        out_ptr++;
+    }
+}
+
+static void
+gf8_muladd_0A(void *out, void *in)
+{
+    unsigned int i;
+    uint64_t *in_ptr = (uint64_t *)in;
+    uint64_t *out_ptr = (uint64_t *)out;
+    /* Updates 'out' in place; each buffer is read as 8 rows of WIDTH words. */
+    for (i = 0; i < WIDTH; i++) {
+        uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+        /* inN holds row N of 'out' (the running value), not of 'in'. */
+        uint64_t in0 = out_ptr[0];
+        uint64_t in1 = out_ptr[WIDTH];
+        uint64_t in2 = out_ptr[WIDTH * 2];
+        uint64_t in3 = out_ptr[WIDTH * 3];
+        uint64_t in4 = out_ptr[WIDTH * 4];
+        uint64_t in5 = out_ptr[WIDTH * 5];
+        uint64_t in6 = out_ptr[WIDTH * 6];
+        uint64_t in7 = out_ptr[WIDTH * 7];
+        /* XOR network; per the name, presumably a GF(2^8) multiply by 0x0A. */
+        out0 = in5 ^ in7;
+        out1 = in0 ^ in6;
+        out7 = in4 ^ in6;
+        out2 = in1 ^ in5;
+        out6 = out0 ^ in3;
+        out3 = out0 ^ out1 ^ in2;
+        out5 = out7 ^ in2 ^ in7;
+        out4 = out2 ^ in3 ^ in6;
+        /* XOR in the matching row of 'in' and store back into 'out'. */
+        out_ptr[0] = out0 ^ in_ptr[0];
+        out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH];
+        out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2];
+        out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3];
+        out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4];
+        out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5];
+        out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6];
+        out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7];
+        /* Step both cursors to the next word of every row. */
+        in_ptr++;
+        out_ptr++;
+    }
+}
+
+static void
+gf8_muladd_0B(void *out, void *in)
+{
+    unsigned int i;
+    uint64_t *in_ptr = (uint64_t *)in;
+    uint64_t *out_ptr = (uint64_t *)out;
+    /* Updates 'out' in place; each buffer is read as 8 rows of WIDTH words. */
+    for (i = 0; i < WIDTH; i++) {
+        uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+        uint64_t tmp0, tmp1, tmp2;
+        /* inN holds row N of 'out' (the running value), not of 'in'. */
+        uint64_t in0 = out_ptr[0];
+        uint64_t in1 = out_ptr[WIDTH];
+        uint64_t in2 = out_ptr[WIDTH * 2];
+        uint64_t in3 = out_ptr[WIDTH * 3];
+        uint64_t in4 = out_ptr[WIDTH * 4];
+        uint64_t in5 = out_ptr[WIDTH * 5];
+        uint64_t in6 = out_ptr[WIDTH * 6];
+        uint64_t in7 = out_ptr[WIDTH * 7];
+        /* XOR network; per the name, presumably a GF(2^8) multiply by 0x0B. */
+        tmp0 = in2 ^ in5;
+        tmp1 = in0 ^ in6;
+        tmp2 = in4 ^ in7;
+        out0 = in0 ^ in5 ^ in7;
+        out2 = tmp0 ^ in1;
+        out1 = tmp1 ^ in1;
+        out6 = tmp1 ^ out0 ^ in3;
+        out7 = tmp2 ^ in6;
+        out4 = tmp2 ^ out6 ^ in1;
+        out3 = out6 ^ in0 ^ in2;
+        out5 = tmp0 ^ out7;
+        /* XOR in the matching row of 'in' and store back into 'out'. */
+        out_ptr[0] = out0 ^ in_ptr[0];
+        out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH];
+        out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2];
+        out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3];
+        out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4];
+        out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5];
+        out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6];
+        out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7];
+        /* Step both cursors to the next word of every row. */
+        in_ptr++;
+        out_ptr++;
+    }
+}
+
+static void
+gf8_muladd_0C(void *out, void *in)
+{
+    unsigned int i;
+    uint64_t *in_ptr = (uint64_t *)in;
+    uint64_t *out_ptr = (uint64_t *)out;
+    /* Updates 'out' in place; each buffer is read as 8 rows of WIDTH words. */
+    for (i = 0; i < WIDTH; i++) {
+        uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+        uint64_t tmp0, tmp1;
+        /* inN holds row N of 'out' (the running value), not of 'in'. */
+        uint64_t in0 = out_ptr[0];
+        uint64_t in1 = out_ptr[WIDTH];
+        uint64_t in2 = out_ptr[WIDTH * 2];
+        uint64_t in3 = out_ptr[WIDTH * 3];
+        uint64_t in4 = out_ptr[WIDTH * 4];
+        uint64_t in5 = out_ptr[WIDTH * 5];
+        uint64_t in6 = out_ptr[WIDTH * 6];
+        uint64_t in7 = out_ptr[WIDTH * 7];
+        /* XOR network; per the name, presumably a GF(2^8) multiply by 0x0C. */
+        out0 = in5 ^ in6;
+        out1 = in6 ^ in7;
+        out7 = in4 ^ in5;
+        tmp0 = in1 ^ in5;
+        tmp1 = in0 ^ in7;
+        out5 = in2 ^ in3 ^ in6;
+        out6 = in3 ^ in4 ^ in7;
+        out2 = tmp1 ^ out0;
+        out4 = tmp0 ^ in2;
+        out3 = tmp0 ^ tmp1;
+        /* XOR in the matching row of 'in' and store back into 'out'. */
+        out_ptr[0] = out0 ^ in_ptr[0];
+        out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH];
+        out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2];
+        out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3];
+        out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4];
+        out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5];
+        out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6];
+        out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7];
+        /* Step both cursors to the next word of every row. */
+        in_ptr++;
+        out_ptr++;
+    }
+}
+
+static void
+gf8_muladd_0D(void *out, void *in)
+{
+    unsigned int i;
+    uint64_t *in_ptr = (uint64_t *)in;
+    uint64_t *out_ptr = (uint64_t *)out;
+    /* Updates 'out' in place; each buffer is read as 8 rows of WIDTH words. */
+    for (i = 0; i < WIDTH; i++) {
+        uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+        uint64_t tmp0, tmp1, tmp2;
+        /* inN holds row N of 'out' (the running value), not of 'in'. */
+        uint64_t in0 = out_ptr[0];
+        uint64_t in1 = out_ptr[WIDTH];
+        uint64_t in2 = out_ptr[WIDTH * 2];
+        uint64_t in3 = out_ptr[WIDTH * 3];
+        uint64_t in4 = out_ptr[WIDTH * 4];
+        uint64_t in5 = out_ptr[WIDTH * 5];
+        uint64_t in6 = out_ptr[WIDTH * 6];
+        uint64_t in7 = out_ptr[WIDTH * 7];
+        /* XOR network; per the name, presumably a GF(2^8) multiply by 0x0D. */
+        tmp0 = in4 ^ in5;
+        tmp1 = in5 ^ in6;
+        out1 = in1 ^ in6 ^ in7;
+        out7 = tmp0 ^ in7;
+        out4 = tmp0 ^ in1 ^ in2;
+        out0 = tmp1 ^ in0;
+        tmp2 = tmp1 ^ in3;
+        out6 = tmp2 ^ out7;
+        out2 = out0 ^ in2 ^ in7;
+        out3 = out0 ^ out1 ^ in3;
+        out5 = tmp2 ^ in2;
+        /* XOR in the matching row of 'in' and store back into 'out'. */
+        out_ptr[0] = out0 ^ in_ptr[0];
+        out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH];
+        out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2];
+        out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3];
+        out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4];
+        out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5];
+        out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6];
+        out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7];
+        /* Step both cursors to the next word of every row. */
+        in_ptr++;
+        out_ptr++;
+    }
+}
+
+static void
+gf8_muladd_0E(void *out, void *in)
+{
+    unsigned int i;
+    uint64_t *in_ptr = (uint64_t *)in;
+    uint64_t *out_ptr = (uint64_t *)out;
+    /* Updates 'out' in place; each buffer is read as 8 rows of WIDTH words. */
+    for (i = 0; i < WIDTH; i++) {
+        uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+        uint64_t tmp0, tmp1, tmp2, tmp3;
+        /* inN holds row N of 'out' (the running value), not of 'in'. */
+        uint64_t in0 = out_ptr[0];
+        uint64_t in1 = out_ptr[WIDTH];
+        uint64_t in2 = out_ptr[WIDTH * 2];
+        uint64_t in3 = out_ptr[WIDTH * 3];
+        uint64_t in4 = out_ptr[WIDTH * 4];
+        uint64_t in5 = out_ptr[WIDTH * 5];
+        uint64_t in6 = out_ptr[WIDTH * 6];
+        uint64_t in7 = out_ptr[WIDTH * 7];
+        /* XOR network; per the name, presumably a GF(2^8) multiply by 0x0E. */
+        tmp0 = in0 ^ in1;
+        tmp1 = in2 ^ in5;
+        tmp2 = in5 ^ in6;
+        out1 = in0 ^ in6 ^ in7;
+        out3 = tmp0 ^ tmp1;
+        out2 = tmp0 ^ tmp2;
+        tmp3 = tmp1 ^ in3;
+        out7 = tmp2 ^ in4;
+        out0 = tmp2 ^ in7;
+        out4 = tmp3 ^ in1 ^ in7;
+        out5 = tmp3 ^ out7;
+        out6 = out0 ^ out5 ^ in2;
+        /* XOR in the matching row of 'in' and store back into 'out'. */
+        out_ptr[0] = out0 ^ in_ptr[0];
+        out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH];
+        out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2];
+        out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3];
+        out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4];
+        out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5];
+        out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6];
+        out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7];
+        /* Step both cursors to the next word of every row. */
+        in_ptr++;
+        out_ptr++;
+    }
+}
+
+static void
+gf8_muladd_0F(void *out, void *in)
+{
+    unsigned int i;
+    uint64_t *in_ptr = (uint64_t *)in;
+    uint64_t *out_ptr = (uint64_t *)out;
+    /* Updates 'out' in place; each buffer is read as 8 rows of WIDTH words. */
+    for (i = 0; i < WIDTH; i++) {
+        uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+        uint64_t tmp0, tmp1, tmp2, tmp3;
+        /* inN holds row N of 'out' (the running value), not of 'in'. */
+        uint64_t in0 = out_ptr[0];
+        uint64_t in1 = out_ptr[WIDTH];
+        uint64_t in2 = out_ptr[WIDTH * 2];
+        uint64_t in3 = out_ptr[WIDTH * 3];
+        uint64_t in4 = out_ptr[WIDTH * 4];
+        uint64_t in5 = out_ptr[WIDTH * 5];
+        uint64_t in6 = out_ptr[WIDTH * 6];
+        uint64_t in7 = out_ptr[WIDTH * 7];
+        /* XOR network; per the name, presumably a GF(2^8) multiply by 0x0F. */
+        tmp0 = in6 ^ in7;
+        tmp1 = tmp0 ^ in1;
+        tmp2 = tmp0 ^ in5;
+        out1 = tmp1 ^ in0;
+        out7 = tmp2 ^ in4;
+        out0 = tmp2 ^ in0;
+        out6 = out7 ^ in3;
+        out5 = out6 ^ in2 ^ in7;
+        tmp3 = tmp1 ^ out0 ^ in2;
+        out4 = tmp1 ^ out5;
+        out2 = tmp3 ^ in6;
+        out3 = tmp3 ^ in3;
+        /* XOR in the matching row of 'in' and store back into 'out'. */
+        out_ptr[0] = out0 ^ in_ptr[0];
+        out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH];
+        out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2];
+        out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3];
+        out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4];
+        out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5];
+        out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6];
+        out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7];
+        /* Step both cursors to the next word of every row. */
+        in_ptr++;
+        out_ptr++;
+    }
+}
+
+static void
+gf8_muladd_10(void *out, void *in)
+{
+    unsigned int i;
+    uint64_t *in_ptr = (uint64_t *)in;
+    uint64_t *out_ptr = (uint64_t *)out;
+    /* Updates 'out' in place; each buffer is read as 8 rows of WIDTH words. */
+    for (i = 0; i < WIDTH; i++) {
+        uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+        uint64_t tmp0, tmp1;
+        /* inN holds row N of 'out' (the running value), not of 'in'. */
+        uint64_t in0 = out_ptr[0];
+        uint64_t in1 = out_ptr[WIDTH];
+        uint64_t in2 = out_ptr[WIDTH * 2];
+        uint64_t in3 = out_ptr[WIDTH * 3];
+        uint64_t in4 = out_ptr[WIDTH * 4];
+        uint64_t in5 = out_ptr[WIDTH * 5];
+        uint64_t in6 = out_ptr[WIDTH * 6];
+        uint64_t in7 = out_ptr[WIDTH * 7];
+        /* XOR network; per the name, presumably a GF(2^8) multiply by 0x10. */
+        out0 = in4;
+        out1 = in5;
+        out7 = in3 ^ in7;
+        tmp0 = in6 ^ in7;
+        out2 = in4 ^ in6;
+        tmp1 = out2 ^ in5;
+        out6 = tmp0 ^ in2;
+        out3 = tmp0 ^ tmp1;
+        out5 = out2 ^ out3 ^ in1;
+        out4 = tmp1 ^ in0;
+        /* XOR in the matching row of 'in' and store back into 'out'. */
+        out_ptr[0] = out0 ^ in_ptr[0];
+        out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH];
+        out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2];
+        out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3];
+        out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4];
+        out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5];
+        out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6];
+        out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7];
+        /* Step both cursors to the next word of every row. */
+        in_ptr++;
+        out_ptr++;
+    }
+}
+
+static void
+gf8_muladd_11(void *out, void *in)
+{
+    unsigned int i;
+    uint64_t *in_ptr = (uint64_t *)in;
+    uint64_t *out_ptr = (uint64_t *)out;
+    /* Updates 'out' in place; each buffer is read as 8 rows of WIDTH words. */
+    for (i = 0; i < WIDTH; i++) {
+        uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+        /* inN holds row N of 'out' (the running value), not of 'in'. */
+        uint64_t in0 = out_ptr[0];
+        uint64_t in1 = out_ptr[WIDTH];
+        uint64_t in2 = out_ptr[WIDTH * 2];
+        uint64_t in3 = out_ptr[WIDTH * 3];
+        uint64_t in4 = out_ptr[WIDTH * 4];
+        uint64_t in5 = out_ptr[WIDTH * 5];
+        uint64_t in6 = out_ptr[WIDTH * 6];
+        uint64_t in7 = out_ptr[WIDTH * 7];
+        /* XOR network; per the name, presumably a GF(2^8) multiply by 0x11. */
+        out7 = in3;
+        out0 = in0 ^ in4;
+        out1 = in1 ^ in5;
+        out6 = in2 ^ in7;
+        out4 = in0 ^ in5 ^ in6;
+        out5 = in1 ^ in6 ^ in7;
+        out2 = in2 ^ in4 ^ in6;
+        out3 = in3 ^ in4 ^ in5 ^ in7;
+        /* XOR in the matching row of 'in' and store back into 'out'. */
+        out_ptr[0] = out0 ^ in_ptr[0];
+        out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH];
+        out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2];
+        out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3];
+        out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4];
+        out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5];
+        out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6];
+        out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7];
+        /* Step both cursors to the next word of every row. */
+        in_ptr++;
+        out_ptr++;
+    }
+}
+
+static void
+gf8_muladd_12(void *out, void *in)
+{
+    unsigned int i;
+    uint64_t *in_ptr = (uint64_t *)in;
+    uint64_t *out_ptr = (uint64_t *)out;
+    /* Updates 'out' in place; each buffer is read as 8 rows of WIDTH words. */
+    for (i = 0; i < WIDTH; i++) {
+        uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+        uint64_t tmp0, tmp1;
+        /* inN holds row N of 'out' (the running value), not of 'in'. */
+        uint64_t in0 = out_ptr[0];
+        uint64_t in1 = out_ptr[WIDTH];
+        uint64_t in2 = out_ptr[WIDTH * 2];
+        uint64_t in3 = out_ptr[WIDTH * 3];
+        uint64_t in4 = out_ptr[WIDTH * 4];
+        uint64_t in5 = out_ptr[WIDTH * 5];
+        uint64_t in6 = out_ptr[WIDTH * 6];
+        uint64_t in7 = out_ptr[WIDTH * 7];
+        /* XOR network; per the name, presumably a GF(2^8) multiply by 0x12. */
+        out0 = in4 ^ in7;
+        out1 = in0 ^ in5;
+        out3 = in2 ^ in4 ^ in5;
+        tmp0 = out0 ^ in6;
+        out2 = tmp0 ^ in1;
+        tmp1 = tmp0 ^ in3;
+        out6 = tmp0 ^ out3;
+        out5 = out2 ^ in5;
+        out7 = tmp1 ^ in4;
+        out4 = tmp1 ^ out1;
+        /* XOR in the matching row of 'in' and store back into 'out'. */
+        out_ptr[0] = out0 ^ in_ptr[0];
+        out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH];
+        out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2];
+        out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3];
+        out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4];
+        out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5];
+        out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6];
+        out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7];
+        /* Step both cursors to the next word of every row. */
+        in_ptr++;
+        out_ptr++;
+    }
+}
+
+static void
+gf8_muladd_13(void *out, void *in)
+{
+    unsigned int i;
+    uint64_t *in_ptr = (uint64_t *)in;
+    uint64_t *out_ptr = (uint64_t *)out;
+    /* Updates 'out' in place; each buffer is read as 8 rows of WIDTH words. */
+    for (i = 0; i < WIDTH; i++) {
+        uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+        uint64_t tmp0, tmp1;
+        /* inN holds row N of 'out' (the running value), not of 'in'. */
+        uint64_t in0 = out_ptr[0];
+        uint64_t in1 = out_ptr[WIDTH];
+        uint64_t in2 = out_ptr[WIDTH * 2];
+        uint64_t in3 = out_ptr[WIDTH * 3];
+        uint64_t in4 = out_ptr[WIDTH * 4];
+        uint64_t in5 = out_ptr[WIDTH * 5];
+        uint64_t in6 = out_ptr[WIDTH * 6];
+        uint64_t in7 = out_ptr[WIDTH * 7];
+        /* XOR network; per the name, presumably a GF(2^8) multiply by 0x13. */
+        out7 = in3 ^ in6;
+        tmp0 = in0 ^ in5;
+        tmp1 = in4 ^ in7;
+        out6 = in2 ^ in5 ^ in7;
+        out4 = tmp0 ^ out7 ^ in7;
+        out1 = tmp0 ^ in1;
+        out0 = tmp1 ^ in0;
+        out5 = tmp1 ^ in1 ^ in6;
+        out3 = tmp1 ^ out6 ^ in3;
+        out2 = out5 ^ in2;
+        /* XOR in the matching row of 'in' and store back into 'out'. */
+        out_ptr[0] = out0 ^ in_ptr[0];
+        out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH];
+        out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2];
+        out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3];
+        out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4];
+        out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5];
+        out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6];
+        out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7];
+        /* Step both cursors to the next word of every row. */
+        in_ptr++;
+        out_ptr++;
+    }
+}
+
+static void
+gf8_muladd_14(void *out, void *in)
+{
+    unsigned int i;
+    uint64_t *in_ptr = (uint64_t *)in;
+    uint64_t *out_ptr = (uint64_t *)out;
+    /* Updates 'out' in place; each buffer is read as 8 rows of WIDTH words. */
+    for (i = 0; i < WIDTH; i++) {
+        uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+        uint64_t tmp0, tmp1;
+        /* inN holds row N of 'out' (the running value), not of 'in'. */
+        uint64_t in0 = out_ptr[0];
+        uint64_t in1 = out_ptr[WIDTH];
+        uint64_t in2 = out_ptr[WIDTH * 2];
+        uint64_t in3 = out_ptr[WIDTH * 3];
+        uint64_t in4 = out_ptr[WIDTH * 4];
+        uint64_t in5 = out_ptr[WIDTH * 5];
+        uint64_t in6 = out_ptr[WIDTH * 6];
+        uint64_t in7 = out_ptr[WIDTH * 7];
+        /* XOR network; per the name, presumably a GF(2^8) multiply by 0x14. */
+        out0 = in4 ^ in6;
+        out1 = in5 ^ in7;
+        out2 = in0 ^ in4;
+        tmp0 = out0 ^ in5;
+        out7 = out1 ^ in3;
+        tmp1 = out1 ^ in2;
+        out3 = tmp0 ^ in1;
+        out6 = tmp0 ^ tmp1;
+        out4 = tmp1 ^ out2;
+        out5 = out3 ^ in3 ^ in4;
+        /* XOR in the matching row of 'in' and store back into 'out'. */
+        out_ptr[0] = out0 ^ in_ptr[0];
+        out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH];
+        out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2];
+        out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3];
+        out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4];
+        out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5];
+        out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6];
+        out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7];
+        /* Step both cursors to the next word of every row. */
+        in_ptr++;
+        out_ptr++;
+    }
+}
+
+static void
+gf8_muladd_15(void *out, void *in)
+{
+    unsigned int i;
+    uint64_t *in_ptr = (uint64_t *)in;
+    uint64_t *out_ptr = (uint64_t *)out;
+    /* Updates 'out' in place; each buffer is read as 8 rows of WIDTH words. */
+    for (i = 0; i < WIDTH; i++) {
+        uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+        uint64_t tmp0;
+        /* inN holds row N of 'out' (the running value), not of 'in'. */
+        uint64_t in0 = out_ptr[0];
+        uint64_t in1 = out_ptr[WIDTH];
+        uint64_t in2 = out_ptr[WIDTH * 2];
+        uint64_t in3 = out_ptr[WIDTH * 3];
+        uint64_t in4 = out_ptr[WIDTH * 4];
+        uint64_t in5 = out_ptr[WIDTH * 5];
+        uint64_t in6 = out_ptr[WIDTH * 6];
+        uint64_t in7 = out_ptr[WIDTH * 7];
+        /* XOR network; per the name, presumably a GF(2^8) multiply by 0x15. */
+        out7 = in3 ^ in5;
+        tmp0 = in0 ^ in4;
+        out1 = in1 ^ in5 ^ in7;
+        out5 = in1 ^ in3 ^ in6;
+        out0 = tmp0 ^ in6;
+        out2 = tmp0 ^ in2;
+        out3 = out5 ^ in4 ^ in5;
+        out6 = out2 ^ in0 ^ in7;
+        out4 = tmp0 ^ out6 ^ in5;
+        /* XOR in the matching row of 'in' and store back into 'out'. */
+        out_ptr[0] = out0 ^ in_ptr[0];
+        out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH];
+        out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2];
+        out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3];
+        out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4];
+        out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5];
+        out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6];
+        out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7];
+        /* Step both cursors to the next word of every row. */
+        in_ptr++;
+        out_ptr++;
+    }
+}
+
+static void
+gf8_muladd_16(void *out, void *in)
+{
+    unsigned int i;
+    uint64_t *in_ptr = (uint64_t *)in;
+    uint64_t *out_ptr = (uint64_t *)out;
+    /* Updates 'out' in place; each buffer is read as 8 rows of WIDTH words. */
+    for (i = 0; i < WIDTH; i++) {
+        uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+        uint64_t tmp0, tmp1, tmp2, tmp3;
+        /* inN holds row N of 'out' (the running value), not of 'in'. */
+        uint64_t in0 = out_ptr[0];
+        uint64_t in1 = out_ptr[WIDTH];
+        uint64_t in2 = out_ptr[WIDTH * 2];
+        uint64_t in3 = out_ptr[WIDTH * 3];
+        uint64_t in4 = out_ptr[WIDTH * 4];
+        uint64_t in5 = out_ptr[WIDTH * 5];
+        uint64_t in6 = out_ptr[WIDTH * 6];
+        uint64_t in7 = out_ptr[WIDTH * 7];
+        /* XOR network; per the name, presumably a GF(2^8) multiply by 0x16. */
+        tmp0 = in0 ^ in5;
+        tmp1 = in4 ^ in7;
+        tmp2 = in2 ^ in3 ^ in4;
+        out1 = tmp0 ^ in7;
+        out4 = tmp0 ^ tmp2;
+        out0 = tmp1 ^ in6;
+        tmp3 = tmp1 ^ in1;
+        out6 = out0 ^ in2 ^ in5;
+        out2 = tmp3 ^ in0;
+        out3 = out6 ^ in1;
+        out7 = tmp2 ^ out6;
+        out5 = tmp3 ^ out7;
+        /* XOR in the matching row of 'in' and store back into 'out'. */
+        out_ptr[0] = out0 ^ in_ptr[0];
+        out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH];
+        out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2];
+        out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3];
+        out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4];
+        out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5];
+        out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6];
+        out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7];
+        /* Step both cursors to the next word of every row. */
+        in_ptr++;
+        out_ptr++;
+    }
+}
+
+static void
+gf8_muladd_17(void *out, void *in)
+{
+    unsigned int i;
+    uint64_t *in_ptr = (uint64_t *)in;
+    uint64_t *out_ptr = (uint64_t *)out;
+    /* Updates 'out' in place; each buffer is read as 8 rows of WIDTH words. */
+    for (i = 0; i < WIDTH; i++) {
+        uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+        uint64_t tmp0, tmp1, tmp2, tmp3;
+        /* inN holds row N of 'out' (the running value), not of 'in'. */
+        uint64_t in0 = out_ptr[0];
+        uint64_t in1 = out_ptr[WIDTH];
+        uint64_t in2 = out_ptr[WIDTH * 2];
+        uint64_t in3 = out_ptr[WIDTH * 3];
+        uint64_t in4 = out_ptr[WIDTH * 4];
+        uint64_t in5 = out_ptr[WIDTH * 5];
+        uint64_t in6 = out_ptr[WIDTH * 6];
+        uint64_t in7 = out_ptr[WIDTH * 7];
+        /* XOR network; per the name, presumably a GF(2^8) multiply by 0x17. */
+        tmp0 = in2 ^ in5;
+        tmp1 = in3 ^ in6;
+        tmp2 = tmp0 ^ in4;
+        out4 = tmp0 ^ in0 ^ in3;
+        out7 = tmp1 ^ in5;
+        tmp3 = tmp1 ^ in1;
+        out6 = tmp2 ^ in7;
+        out5 = tmp3 ^ in4;
+        out3 = tmp3 ^ out6;
+        out0 = out3 ^ out4 ^ in1;
+        out2 = out3 ^ out7 ^ in0;
+        out1 = tmp2 ^ out2;
+        /* XOR in the matching row of 'in' and store back into 'out'. */
+        out_ptr[0] = out0 ^ in_ptr[0];
+        out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH];
+        out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2];
+        out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3];
+        out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4];
+        out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5];
+        out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6];
+        out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7];
+        /* Step both cursors to the next word of every row. */
+        in_ptr++;
+        out_ptr++;
+    }
+}
+
+static void
+gf8_muladd_18(void *out, void *in)
+{
+    unsigned int i;
+    uint64_t *in_ptr = (uint64_t *)in;
+    uint64_t *out_ptr = (uint64_t *)out;
+    /* Updates 'out' in place; each buffer is read as 8 rows of WIDTH words. */
+    for (i = 0; i < WIDTH; i++) {
+        uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+        uint64_t tmp0, tmp1;
+        /* inN holds row N of 'out' (the running value), not of 'in'. */
+        uint64_t in0 = out_ptr[0];
+        uint64_t in1 = out_ptr[WIDTH];
+        uint64_t in2 = out_ptr[WIDTH * 2];
+        uint64_t in3 = out_ptr[WIDTH * 3];
+        uint64_t in4 = out_ptr[WIDTH * 4];
+        uint64_t in5 = out_ptr[WIDTH * 5];
+        uint64_t in6 = out_ptr[WIDTH * 6];
+        uint64_t in7 = out_ptr[WIDTH * 7];
+        /* XOR network; per the name, presumably a GF(2^8) multiply by 0x18. */
+        out0 = in4 ^ in5;
+        out1 = in5 ^ in6;
+        tmp0 = in4 ^ in7;
+        out5 = in1 ^ in2 ^ in5;
+        out6 = in2 ^ in3 ^ in6;
+        out2 = tmp0 ^ out1;
+        out7 = tmp0 ^ in3;
+        tmp1 = tmp0 ^ in0;
+        out3 = tmp1 ^ in6;
+        out4 = tmp1 ^ in1;
+        /* XOR in the matching row of 'in' and store back into 'out'. */
+        out_ptr[0] = out0 ^ in_ptr[0];
+        out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH];
+        out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2];
+        out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3];
+        out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4];
+        out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5];
+        out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6];
+        out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7];
+        /* Step both cursors to the next word of every row. */
+        in_ptr++;
+        out_ptr++;
+    }
+}
+
+static void
+gf8_muladd_19(void *out, void *in) /* in place: out = T(out) ^ in */
+{
+    unsigned int i;
+    uint64_t *in_ptr = (uint64_t *)in; /* source words; only read */
+    uint64_t *out_ptr = (uint64_t *)out; /* T's input; overwritten */
+
+    for (i = 0; i < WIDTH; i++) { /* each pass: one word from each of 8 rows */
+        uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+        uint64_t tmp0, tmp1; /* reused XOR sub-terms */
+
+        uint64_t in0 = out_ptr[0]; /* inN = word of out at offset N * WIDTH */
+        uint64_t in1 = out_ptr[WIDTH];
+        uint64_t in2 = out_ptr[WIDTH * 2];
+        uint64_t in3 = out_ptr[WIDTH * 3];
+        uint64_t in4 = out_ptr[WIDTH * 4];
+        uint64_t in5 = out_ptr[WIDTH * 5];
+        uint64_t in6 = out_ptr[WIDTH * 6];
+        uint64_t in7 = out_ptr[WIDTH * 7];
+
+        out5 = in1 ^ in2; /* T: presumably GF(2^8) multiply by 0x19 - TODO confirm */
+        out7 = in3 ^ in4; /* each outN is a fixed XOR of some in0..in7 */
+        tmp0 = in0 ^ in7; /* later lines reuse earlier results: keep order */
+        out6 = in2 ^ in3;
+        out1 = in1 ^ in5 ^ in6;
+        out0 = in0 ^ in4 ^ in5;
+        out4 = tmp0 ^ in1;
+        tmp1 = tmp0 ^ in6;
+        out2 = tmp1 ^ out0 ^ in2;
+        out3 = tmp1 ^ out7;
+
+        out_ptr[0] = out0 ^ in_ptr[0]; /* add (XOR) the matching word of in */
+        out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH];
+        out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2];
+        out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3];
+        out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4];
+        out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5];
+        out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6];
+        out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7];
+
+        in_ptr++; /* next word in both buffers */
+        out_ptr++;
+    }
+}
+
+static void
+gf8_muladd_1A(void *out, void *in) /* in place: out = T(out) ^ in */
+{
+    unsigned int i;
+    uint64_t *in_ptr = (uint64_t *)in; /* source words; only read */
+    uint64_t *out_ptr = (uint64_t *)out; /* T's input; overwritten */
+
+    for (i = 0; i < WIDTH; i++) { /* each pass: one word from each of 8 rows */
+        uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+        uint64_t tmp0, tmp1, tmp2, tmp3; /* reused XOR sub-terms */
+
+        uint64_t in0 = out_ptr[0]; /* inN = word of out at offset N * WIDTH */
+        uint64_t in1 = out_ptr[WIDTH];
+        uint64_t in2 = out_ptr[WIDTH * 2];
+        uint64_t in3 = out_ptr[WIDTH * 3];
+        uint64_t in4 = out_ptr[WIDTH * 4];
+        uint64_t in5 = out_ptr[WIDTH * 5];
+        uint64_t in6 = out_ptr[WIDTH * 6];
+        uint64_t in7 = out_ptr[WIDTH * 7];
+
+        tmp0 = in4 ^ in5; /* T: presumably GF(2^8) multiply by 0x1A - TODO confirm */
+        tmp1 = in5 ^ in6; /* each outN is a fixed XOR of some in0..in7 */
+        tmp2 = tmp0 ^ in1; /* later lines reuse earlier results: keep order */
+        out0 = tmp0 ^ in7;
+        out1 = tmp1 ^ in0;
+        tmp3 = tmp1 ^ in3;
+        out5 = tmp2 ^ in2;
+        out2 = tmp2 ^ in6;
+        out7 = tmp3 ^ out0;
+        out6 = tmp3 ^ in2;
+        out4 = tmp3 ^ out2 ^ in0;
+        out3 = tmp0 ^ out1 ^ in2;
+
+        out_ptr[0] = out0 ^ in_ptr[0]; /* add (XOR) the matching word of in */
+        out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH];
+        out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2];
+        out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3];
+        out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4];
+        out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5];
+        out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6];
+        out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7];
+
+        in_ptr++; /* next word in both buffers */
+        out_ptr++;
+    }
+}
+
+static void
+gf8_muladd_1B(void *out, void *in) /* in place: out = T(out) ^ in */
+{
+    unsigned int i;
+    uint64_t *in_ptr = (uint64_t *)in; /* source words; only read */
+    uint64_t *out_ptr = (uint64_t *)out; /* T's input; overwritten */
+
+    for (i = 0; i < WIDTH; i++) { /* each pass: one word from each of 8 rows */
+        uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+        uint64_t tmp0, tmp1, tmp2, tmp3, tmp4; /* reused XOR sub-terms */
+
+        uint64_t in0 = out_ptr[0]; /* inN = word of out at offset N * WIDTH */
+        uint64_t in1 = out_ptr[WIDTH];
+        uint64_t in2 = out_ptr[WIDTH * 2];
+        uint64_t in3 = out_ptr[WIDTH * 3];
+        uint64_t in4 = out_ptr[WIDTH * 4];
+        uint64_t in5 = out_ptr[WIDTH * 5];
+        uint64_t in6 = out_ptr[WIDTH * 6];
+        uint64_t in7 = out_ptr[WIDTH * 7];
+
+        tmp0 = in2 ^ in4; /* T: presumably GF(2^8) multiply by 0x1B - TODO confirm */
+        tmp1 = in2 ^ in5; /* each outN is a fixed XOR of some in0..in7 */
+        tmp2 = in3 ^ in6; /* later lines reuse earlier results: keep order */
+        out5 = tmp0 ^ in1;
+        tmp3 = tmp0 ^ in0;
+        out6 = tmp1 ^ in3;
+        out0 = tmp1 ^ tmp3 ^ in7;
+        out7 = tmp2 ^ in4;
+        tmp4 = out5 ^ in6;
+        out3 = tmp2 ^ tmp3;
+        out2 = tmp4 ^ in5;
+        out4 = tmp4 ^ out3;
+        out1 = tmp3 ^ out2;
+
+        out_ptr[0] = out0 ^ in_ptr[0]; /* add (XOR) the matching word of in */
+        out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH];
+        out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2];
+        out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3];
+        out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4];
+        out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5];
+        out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6];
+        out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7];
+
+        in_ptr++; /* next word in both buffers */
+        out_ptr++;
+    }
+}
+
+static void
+gf8_muladd_1C(void *out, void *in) /* in place: out = T(out) ^ in */
+{
+    unsigned int i;
+    uint64_t *in_ptr = (uint64_t *)in; /* source words; only read */
+    uint64_t *out_ptr = (uint64_t *)out; /* T's input; overwritten */
+
+    for (i = 0; i < WIDTH; i++) { /* each pass: one word from each of 8 rows */
+        uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+        uint64_t tmp0, tmp1, tmp2, tmp3, tmp4; /* reused XOR sub-terms */
+
+        uint64_t in0 = out_ptr[0]; /* inN = word of out at offset N * WIDTH */
+        uint64_t in1 = out_ptr[WIDTH];
+        uint64_t in2 = out_ptr[WIDTH * 2];
+        uint64_t in3 = out_ptr[WIDTH * 3];
+        uint64_t in4 = out_ptr[WIDTH * 4];
+        uint64_t in5 = out_ptr[WIDTH * 5];
+        uint64_t in6 = out_ptr[WIDTH * 6];
+        uint64_t in7 = out_ptr[WIDTH * 7];
+
+        tmp0 = in2 ^ in3; /* T: presumably GF(2^8) multiply by 0x1C - TODO confirm */
+        tmp1 = in4 ^ in6; /* each outN is a fixed XOR of some in0..in7 */
+        tmp2 = in5 ^ in7; /* later lines reuse earlier results: keep order */
+        out6 = tmp0 ^ tmp1;
+        out0 = tmp1 ^ in5;
+        out1 = tmp2 ^ in6;
+        tmp3 = tmp2 ^ in1;
+        tmp4 = tmp2 ^ in4;
+        out2 = tmp4 ^ in0;
+        out7 = tmp4 ^ in3;
+        out5 = tmp0 ^ tmp3;
+        out3 = tmp3 ^ out2;
+        out4 = out3 ^ in2 ^ in6;
+
+        out_ptr[0] = out0 ^ in_ptr[0]; /* add (XOR) the matching word of in */
+        out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH];
+        out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2];
+        out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3];
+        out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4];
+        out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5];
+        out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6];
+        out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7];
+
+        in_ptr++; /* next word in both buffers */
+        out_ptr++;
+    }
+}
+
+static void
+gf8_muladd_1D(void *out, void *in) /* in place: out = T(out) ^ in */
+{
+    unsigned int i;
+    uint64_t *in_ptr = (uint64_t *)in; /* source words; only read */
+    uint64_t *out_ptr = (uint64_t *)out; /* T's input; overwritten */
+
+    for (i = 0; i < WIDTH; i++) { /* each pass: one word from each of 8 rows */
+        uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+        uint64_t tmp0, tmp1, tmp2, tmp3, tmp4; /* reused XOR sub-terms */
+
+        uint64_t in0 = out_ptr[0]; /* inN = word of out at offset N * WIDTH */
+        uint64_t in1 = out_ptr[WIDTH];
+        uint64_t in2 = out_ptr[WIDTH * 2];
+        uint64_t in3 = out_ptr[WIDTH * 3];
+        uint64_t in4 = out_ptr[WIDTH * 4];
+        uint64_t in5 = out_ptr[WIDTH * 5];
+        uint64_t in6 = out_ptr[WIDTH * 6];
+        uint64_t in7 = out_ptr[WIDTH * 7];
+
+        tmp0 = in1 ^ in3; /* T: presumably GF(2^8) multiply by 0x1D - TODO confirm */
+        tmp1 = in0 ^ in4; /* each outN is a fixed XOR of some in0..in7 */
+        tmp2 = in3 ^ in4; /* later lines reuse earlier results: keep order */
+        tmp3 = in2 ^ in7;
+        out3 = tmp0 ^ tmp1;
+        out5 = tmp0 ^ tmp3;
+        tmp4 = tmp1 ^ in5;
+        out6 = tmp2 ^ in2;
+        out7 = tmp2 ^ in5;
+        out2 = tmp3 ^ tmp4;
+        out4 = out3 ^ out6 ^ in6;
+        out0 = tmp4 ^ in6;
+        out1 = out2 ^ out4 ^ in4;
+
+        out_ptr[0] = out0 ^ in_ptr[0]; /* add (XOR) the matching word of in */
+        out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH];
+        out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2];
+        out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3];
+        out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4];
+        out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5];
+        out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6];
+        out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7];
+
+        in_ptr++; /* next word in both buffers */
+        out_ptr++;
+    }
+}
+
+static void
+gf8_muladd_1E(void *out, void *in) /* in place: out = T(out) ^ in */
+{
+    unsigned int i;
+    uint64_t *in_ptr = (uint64_t *)in; /* source words; only read */
+    uint64_t *out_ptr = (uint64_t *)out; /* T's input; overwritten */
+
+    for (i = 0; i < WIDTH; i++) { /* each pass: one word from each of 8 rows */
+        uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+        uint64_t tmp0, tmp1, tmp2, tmp3; /* reused XOR sub-terms */
+
+        uint64_t in0 = out_ptr[0]; /* inN = word of out at offset N * WIDTH */
+        uint64_t in1 = out_ptr[WIDTH];
+        uint64_t in2 = out_ptr[WIDTH * 2];
+        uint64_t in3 = out_ptr[WIDTH * 3];
+        uint64_t in4 = out_ptr[WIDTH * 4];
+        uint64_t in5 = out_ptr[WIDTH * 5];
+        uint64_t in6 = out_ptr[WIDTH * 6];
+        uint64_t in7 = out_ptr[WIDTH * 7];
+
+        tmp0 = in0 ^ in4; /* T: presumably GF(2^8) multiply by 0x1E - TODO confirm */
+        tmp1 = in2 ^ in7; /* each outN is a fixed XOR of some in0..in7 */
+        tmp2 = tmp0 ^ in1; /* later lines reuse earlier results: keep order */
+        out3 = tmp1 ^ tmp2;
+        out2 = tmp2 ^ in5;
+        out4 = out3 ^ in3 ^ in6;
+        tmp3 = out4 ^ in7;
+        out6 = tmp3 ^ out2 ^ in4;
+        out7 = tmp1 ^ out6;
+        out0 = out7 ^ in3;
+        out1 = tmp0 ^ out0;
+        out5 = tmp3 ^ out1;
+
+        out_ptr[0] = out0 ^ in_ptr[0]; /* add (XOR) the matching word of in */
+        out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH];
+        out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2];
+        out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3];
+        out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4];
+        out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5];
+        out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6];
+        out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7];
+
+        in_ptr++; /* next word in both buffers */
+        out_ptr++;
+    }
+}
+
+static void
+gf8_muladd_1F(void *out, void *in) /* in place: out = T(out) ^ in */
+{
+    unsigned int i;
+    uint64_t *in_ptr = (uint64_t *)in; /* source words; only read */
+    uint64_t *out_ptr = (uint64_t *)out; /* T's input; overwritten */
+
+    for (i = 0; i < WIDTH; i++) { /* each pass: one word from each of 8 rows */
+        uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+        uint64_t tmp0, tmp1; /* reused XOR sub-terms */
+
+        uint64_t in0 = out_ptr[0]; /* inN = word of out at offset N * WIDTH */
+        uint64_t in1 = out_ptr[WIDTH];
+        uint64_t in2 = out_ptr[WIDTH * 2];
+        uint64_t in3 = out_ptr[WIDTH * 3];
+        uint64_t in4 = out_ptr[WIDTH * 4];
+        uint64_t in5 = out_ptr[WIDTH * 5];
+        uint64_t in6 = out_ptr[WIDTH * 6];
+        uint64_t in7 = out_ptr[WIDTH * 7];
+
+        tmp0 = in4 ^ in6; /* T: presumably GF(2^8) multiply by 0x1F - TODO confirm */
+        tmp1 = tmp0 ^ in5; /* each outN is a fixed XOR of some in0..in7 */
+        out7 = tmp1 ^ in3; /* later lines reuse earlier results: keep order */
+        out0 = tmp1 ^ in0 ^ in7;
+        out6 = out7 ^ in2 ^ in6;
+        out1 = out0 ^ in1 ^ in4;
+        out4 = out0 ^ out6 ^ in1;
+        out3 = tmp0 ^ out4;
+        out2 = out4 ^ out7 ^ in7;
+        out5 = out3 ^ in0;
+
+        out_ptr[0] = out0 ^ in_ptr[0]; /* add (XOR) the matching word of in */
+        out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH];
+        out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2];
+        out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3];
+        out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4];
+        out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5];
+        out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6];
+        out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7];
+
+        in_ptr++; /* next word in both buffers */
+        out_ptr++;
+    }
+}
+
+static void
+gf8_muladd_20(void *out, void *in) /* in place: out = T(out) ^ in */
+{
+    unsigned int i;
+    uint64_t *in_ptr = (uint64_t *)in; /* source words; only read */
+    uint64_t *out_ptr = (uint64_t *)out; /* T's input; overwritten */
+
+    for (i = 0; i < WIDTH; i++) { /* each pass: one word from each of 8 rows */
+        uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+        uint64_t tmp0, tmp1; /* reused XOR sub-terms */
+
+        uint64_t in0 = out_ptr[0]; /* inN = word of out at offset N * WIDTH */
+        uint64_t in1 = out_ptr[WIDTH];
+        uint64_t in2 = out_ptr[WIDTH * 2];
+        uint64_t in3 = out_ptr[WIDTH * 3];
+        uint64_t in4 = out_ptr[WIDTH * 4];
+        uint64_t in5 = out_ptr[WIDTH * 5];
+        uint64_t in6 = out_ptr[WIDTH * 6];
+        uint64_t in7 = out_ptr[WIDTH * 7];
+
+        out1 = in4; /* T: presumably GF(2^8) multiply by 0x20 - TODO confirm */
+        out0 = in3 ^ in7; /* each outN is a fixed XOR of some in0..in7 */
+        tmp0 = in3 ^ in4; /* later lines reuse earlier results: keep order */
+        tmp1 = in6 ^ in7;
+        out2 = out0 ^ in5;
+        out4 = tmp0 ^ in5;
+        out3 = tmp0 ^ tmp1;
+        out7 = tmp1 ^ in2;
+        out6 = tmp1 ^ in1 ^ in5;
+        out5 = out2 ^ out3 ^ in0;
+
+        out_ptr[0] = out0 ^ in_ptr[0]; /* add (XOR) the matching word of in */
+        out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH];
+        out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2];
+        out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3];
+        out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4];
+        out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5];
+        out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6];
+        out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7];
+
+        in_ptr++; /* next word in both buffers */
+        out_ptr++;
+    }
+}
+
+static void
+gf8_muladd_21(void *out, void *in) /* in place: out = T(out) ^ in */
+{
+    unsigned int i;
+    uint64_t *in_ptr = (uint64_t *)in; /* source words; only read */
+    uint64_t *out_ptr = (uint64_t *)out; /* T's input; overwritten */
+
+    for (i = 0; i < WIDTH; i++) { /* each pass: one word from each of 8 rows */
+        uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+        uint64_t tmp0; /* reused XOR sub-term */
+
+        uint64_t in0 = out_ptr[0]; /* inN = word of out at offset N * WIDTH */
+        uint64_t in1 = out_ptr[WIDTH];
+        uint64_t in2 = out_ptr[WIDTH * 2];
+        uint64_t in3 = out_ptr[WIDTH * 3];
+        uint64_t in4 = out_ptr[WIDTH * 4];
+        uint64_t in5 = out_ptr[WIDTH * 5];
+        uint64_t in6 = out_ptr[WIDTH * 6];
+        uint64_t in7 = out_ptr[WIDTH * 7];
+
+        out1 = in1 ^ in4; /* T: presumably GF(2^8) multiply by 0x21 - TODO confirm */
+        tmp0 = in4 ^ in6; /* each outN is a fixed XOR of some in0..in7 */
+        out4 = in3 ^ in5; /* later lines reuse earlier results: keep order */
+        out7 = in2 ^ in6;
+        out0 = in0 ^ in3 ^ in7;
+        out6 = in1 ^ in5 ^ in7;
+        out3 = tmp0 ^ in7;
+        out5 = tmp0 ^ in0;
+        out2 = out4 ^ in2 ^ in7;
+
+        out_ptr[0] = out0 ^ in_ptr[0]; /* add (XOR) the matching word of in */
+        out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH];
+        out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2];
+        out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3];
+        out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4];
+        out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5];
+        out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6];
+        out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7];
+
+        in_ptr++; /* next word in both buffers */
+        out_ptr++;
+    }
+}
+
+static void
+gf8_muladd_22(void *out, void *in) /* in place: out = T(out) ^ in */
+{
+    unsigned int i;
+    uint64_t *in_ptr = (uint64_t *)in; /* source words; only read */
+    uint64_t *out_ptr = (uint64_t *)out; /* T's input; overwritten */
+
+    for (i = 0; i < WIDTH; i++) { /* each pass: one word from each of 8 rows */
+        uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+
+        uint64_t in0 = out_ptr[0]; /* inN = word of out at offset N * WIDTH */
+        uint64_t in1 = out_ptr[WIDTH];
+        uint64_t in2 = out_ptr[WIDTH * 2];
+        uint64_t in3 = out_ptr[WIDTH * 3];
+        uint64_t in4 = out_ptr[WIDTH * 4];
+        uint64_t in5 = out_ptr[WIDTH * 5];
+        uint64_t in6 = out_ptr[WIDTH * 6];
+        uint64_t in7 = out_ptr[WIDTH * 7];
+
+        out0 = in3; /* T: presumably GF(2^8) multiply by 0x22 - TODO confirm */
+        out1 = in0 ^ in4; /* each outN is a fixed XOR of some in0..in7 */
+        out7 = in2 ^ in7; /* every outN here reads only in0..in7 */
+        out4 = in4 ^ in5 ^ in7;
+        out5 = in0 ^ in5 ^ in6;
+        out6 = in1 ^ in6 ^ in7;
+        out3 = in2 ^ in3 ^ in4 ^ in6;
+        out2 = in1 ^ in3 ^ in5;
+
+        out_ptr[0] = out0 ^ in_ptr[0]; /* add (XOR) the matching word of in */
+        out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH];
+        out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2];
+        out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3];
+        out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4];
+        out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5];
+        out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6];
+        out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7];
+
+        in_ptr++; /* next word in both buffers */
+        out_ptr++;
+    }
+}
+
+static void
+gf8_muladd_23(void *out, void *in) /* in place: out = T(out) ^ in */
+{
+    unsigned int i;
+    uint64_t *in_ptr = (uint64_t *)in; /* source words; only read */
+    uint64_t *out_ptr = (uint64_t *)out; /* T's input; overwritten */
+
+    for (i = 0; i < WIDTH; i++) { /* each pass: one word from each of 8 rows */
+        uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+
+        uint64_t in0 = out_ptr[0]; /* inN = word of out at offset N * WIDTH */
+        uint64_t in1 = out_ptr[WIDTH];
+        uint64_t in2 = out_ptr[WIDTH * 2];
+        uint64_t in3 = out_ptr[WIDTH * 3];
+        uint64_t in4 = out_ptr[WIDTH * 4];
+        uint64_t in5 = out_ptr[WIDTH * 5];
+        uint64_t in6 = out_ptr[WIDTH * 6];
+        uint64_t in7 = out_ptr[WIDTH * 7];
+
+        out7 = in2; /* T: presumably GF(2^8) multiply by 0x23 - TODO confirm */
+        out0 = in0 ^ in3; /* each outN is a fixed XOR of some in0..in7 */
+        out4 = in5 ^ in7; /* later lines reuse earlier results: keep order */
+        out5 = in0 ^ in6;
+        out6 = in1 ^ in7;
+        out3 = in2 ^ in4 ^ in6;
+        out1 = in0 ^ in1 ^ in4;
+        out2 = out4 ^ out6 ^ in2 ^ in3;
+
+        out_ptr[0] = out0 ^ in_ptr[0]; /* add (XOR) the matching word of in */
+        out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH];
+        out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2];
+        out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3];
+        out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4];
+        out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5];
+        out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6];
+        out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7];
+
+        in_ptr++; /* next word in both buffers */
+        out_ptr++;
+    }
+}
+
+static void
+gf8_muladd_24(void *out, void *in) /* in place: out = T(out) ^ in */
+{
+    unsigned int i;
+    uint64_t *in_ptr = (uint64_t *)in; /* source words; only read */
+    uint64_t *out_ptr = (uint64_t *)out; /* T's input; overwritten */
+
+    for (i = 0; i < WIDTH; i++) { /* each pass: one word from each of 8 rows */
+        uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+        uint64_t tmp0, tmp1; /* reused XOR sub-terms */
+
+        uint64_t in0 = out_ptr[0]; /* inN = word of out at offset N * WIDTH */
+        uint64_t in1 = out_ptr[WIDTH];
+        uint64_t in2 = out_ptr[WIDTH * 2];
+        uint64_t in3 = out_ptr[WIDTH * 3];
+        uint64_t in4 = out_ptr[WIDTH * 4];
+        uint64_t in5 = out_ptr[WIDTH * 5];
+        uint64_t in6 = out_ptr[WIDTH * 6];
+        uint64_t in7 = out_ptr[WIDTH * 7];
+
+        out1 = in4 ^ in7; /* T: presumably GF(2^8) multiply by 0x24 - TODO confirm */
+        tmp0 = in3 ^ in4; /* each outN is a fixed XOR of some in0..in7 */
+        out0 = in3 ^ in6 ^ in7; /* later lines reuse earlier results: keep order */
+        out3 = tmp0 ^ in1;
+        tmp1 = out0 ^ in5;
+        out6 = tmp1 ^ out3;
+        out2 = tmp1 ^ in0;
+        out7 = tmp1 ^ in2 ^ in3;
+        out5 = out2 ^ in4;
+        out4 = tmp0 ^ out7;
+
+        out_ptr[0] = out0 ^ in_ptr[0]; /* add (XOR) the matching word of in */
+        out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH];
+        out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2];
+        out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3];
+        out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4];
+        out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5];
+        out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6];
+        out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7];
+
+        in_ptr++; /* next word in both buffers */
+        out_ptr++;
+    }
+}
+
+static void
+gf8_muladd_25(void *out, void *in) /* in place: out = T(out) ^ in */
+{
+    unsigned int i;
+    uint64_t *in_ptr = (uint64_t *)in; /* source words; only read */
+    uint64_t *out_ptr = (uint64_t *)out; /* T's input; overwritten */
+
+    for (i = 0; i < WIDTH; i++) { /* each pass: one word from each of 8 rows */
+        uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+        uint64_t tmp0; /* reused XOR sub-term */
+
+        uint64_t in0 = out_ptr[0]; /* inN = word of out at offset N * WIDTH */
+        uint64_t in1 = out_ptr[WIDTH];
+        uint64_t in2 = out_ptr[WIDTH * 2];
+        uint64_t in3 = out_ptr[WIDTH * 3];
+        uint64_t in4 = out_ptr[WIDTH * 4];
+        uint64_t in5 = out_ptr[WIDTH * 5];
+        uint64_t in6 = out_ptr[WIDTH * 6];
+        uint64_t in7 = out_ptr[WIDTH * 7];
+
+        out3 = in1 ^ in4; /* T: presumably GF(2^8) multiply by 0x25 - TODO confirm */
+        tmp0 = in2 ^ in5; /* each outN is a fixed XOR of some in0..in7 */
+        out1 = out3 ^ in7; /* later lines reuse earlier results: keep order */
+        out7 = tmp0 ^ in6;
+        out6 = out1 ^ in5;
+        out4 = out7 ^ in3 ^ in7;
+        out2 = out4 ^ in0;
+        out0 = tmp0 ^ out2;
+        out5 = out0 ^ in4;
+
+        out_ptr[0] = out0 ^ in_ptr[0]; /* add (XOR) the matching word of in */
+        out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH];
+        out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2];
+        out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3];
+        out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4];
+        out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5];
+        out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6];
+        out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7];
+
+        in_ptr++; /* next word in both buffers */
+        out_ptr++;
+    }
+}
+
+static void
+gf8_muladd_26(void *out, void *in) /* in place: out = T(out) ^ in */
+{
+    unsigned int i;
+    uint64_t *in_ptr = (uint64_t *)in; /* source words; only read */
+    uint64_t *out_ptr = (uint64_t *)out; /* T's input; overwritten */
+
+    for (i = 0; i < WIDTH; i++) { /* each pass: one word from each of 8 rows */
+        uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+        uint64_t tmp0, tmp1, tmp2; /* reused XOR sub-terms */
+
+        uint64_t in0 = out_ptr[0]; /* inN = word of out at offset N * WIDTH */
+        uint64_t in1 = out_ptr[WIDTH];
+        uint64_t in2 = out_ptr[WIDTH * 2];
+        uint64_t in3 = out_ptr[WIDTH * 3];
+        uint64_t in4 = out_ptr[WIDTH * 4];
+        uint64_t in5 = out_ptr[WIDTH * 5];
+        uint64_t in6 = out_ptr[WIDTH * 6];
+        uint64_t in7 = out_ptr[WIDTH * 7];
+
+        out0 = in3 ^ in6; /* T: presumably GF(2^8) multiply by 0x26 - TODO confirm */
+        tmp0 = in4 ^ in7; /* each outN is a fixed XOR of some in0..in7 */
+        out7 = in2 ^ in5 ^ in7; /* later lines reuse earlier results: keep order */
+        tmp1 = out0 ^ in0 ^ in5;
+        out1 = tmp0 ^ in0;
+        tmp2 = tmp0 ^ in6;
+        out2 = tmp1 ^ in1;
+        out5 = tmp1 ^ in7;
+        out6 = tmp2 ^ in1;
+        out4 = tmp2 ^ out7;
+        out3 = out0 ^ out6 ^ in2;
+
+        out_ptr[0] = out0 ^ in_ptr[0]; /* add (XOR) the matching word of in */
+        out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH];
+        out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2];
+        out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3];
+        out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4];
+        out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5];
+        out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6];
+        out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7];
+
+        in_ptr++; /* next word in both buffers */
+        out_ptr++;
+    }
+}
+
+static void
+gf8_muladd_27(void *out, void *in) /* in place: out = T(out) ^ in */
+{
+    unsigned int i;
+    uint64_t *in_ptr = (uint64_t *)in; /* source words; only read */
+    uint64_t *out_ptr = (uint64_t *)out; /* T's input; overwritten */
+
+    for (i = 0; i < WIDTH; i++) { /* each pass: one word from each of 8 rows */
+        uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+
+        uint64_t in0 = out_ptr[0]; /* inN = word of out at offset N * WIDTH */
+        uint64_t in1 = out_ptr[WIDTH];
+        uint64_t in2 = out_ptr[WIDTH * 2];
+        uint64_t in3 = out_ptr[WIDTH * 3];
+        uint64_t in4 = out_ptr[WIDTH * 4];
+        uint64_t in5 = out_ptr[WIDTH * 5];
+        uint64_t in6 = out_ptr[WIDTH * 6];
+        uint64_t in7 = out_ptr[WIDTH * 7];
+
+        out7 = in2 ^ in5; /* T: presumably GF(2^8) multiply by 0x27 - TODO confirm */
+        out0 = in0 ^ in3 ^ in6; /* each outN is a fixed XOR of some in0..in7 */
+        out6 = in1 ^ in4 ^ in7; /* later lines reuse earlier results: keep order */
+        out4 = out7 ^ in6;
+        out2 = out0 ^ out7 ^ in1;
+        out5 = out0 ^ in7;
+        out1 = out6 ^ in0;
+        out3 = out6 ^ in2;
+
+        out_ptr[0] = out0 ^ in_ptr[0]; /* add (XOR) the matching word of in */
+        out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH];
+        out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2];
+        out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3];
+        out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4];
+        out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5];
+        out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6];
+        out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7];
+
+        in_ptr++; /* next word in both buffers */
+        out_ptr++;
+    }
+}
+
+static void
+gf8_muladd_28(void *out, void *in) /* in place: out = T(out) ^ in */
+{
+    unsigned int i;
+    uint64_t *in_ptr = (uint64_t *)in; /* source words; only read */
+    uint64_t *out_ptr = (uint64_t *)out; /* T's input; overwritten */
+
+    for (i = 0; i < WIDTH; i++) { /* each pass: one word from each of 8 rows */
+        uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+        uint64_t tmp0, tmp1, tmp2; /* reused XOR sub-terms */
+
+        uint64_t in0 = out_ptr[0]; /* inN = word of out at offset N * WIDTH */
+        uint64_t in1 = out_ptr[WIDTH];
+        uint64_t in2 = out_ptr[WIDTH * 2];
+        uint64_t in3 = out_ptr[WIDTH * 3];
+        uint64_t in4 = out_ptr[WIDTH * 4];
+        uint64_t in5 = out_ptr[WIDTH * 5];
+        uint64_t in6 = out_ptr[WIDTH * 6];
+        uint64_t in7 = out_ptr[WIDTH * 7];
+
+        out2 = in3; /* T: presumably GF(2^8) multiply by 0x28 - TODO confirm */
+        out1 = in4 ^ in6; /* each outN is a fixed XOR of some in0..in7 */
+        out0 = in3 ^ in5 ^ in7; /* later lines reuse earlier results: keep order */
+        tmp0 = out1 ^ in7;
+        tmp1 = out0 ^ in4;
+        out7 = tmp0 ^ in2;
+        tmp2 = tmp0 ^ in1;
+        out3 = tmp1 ^ in0;
+        out6 = tmp1 ^ tmp2;
+        out4 = tmp2 ^ in3;
+        out5 = out3 ^ in2 ^ in3;
+
+        out_ptr[0] = out0 ^ in_ptr[0]; /* add (XOR) the matching word of in */
+        out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH];
+        out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2];
+        out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3];
+        out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4];
+        out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5];
+        out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6];
+        out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7];
+
+        in_ptr++; /* next word in both buffers */
+        out_ptr++;
+    }
+}
+
+static void
+gf8_muladd_29(void *out, void *in) /* in place: out = T(out) ^ in */
+{
+    unsigned int i;
+    uint64_t *in_ptr = (uint64_t *)in; /* source words; only read */
+    uint64_t *out_ptr = (uint64_t *)out; /* T's input; overwritten */
+
+    for (i = 0; i < WIDTH; i++) { /* each pass: one word from each of 8 rows */
+        uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+        uint64_t tmp0, tmp1, tmp2; /* reused XOR sub-terms */
+
+        uint64_t in0 = out_ptr[0]; /* inN = word of out at offset N * WIDTH */
+        uint64_t in1 = out_ptr[WIDTH];
+        uint64_t in2 = out_ptr[WIDTH * 2];
+        uint64_t in3 = out_ptr[WIDTH * 3];
+        uint64_t in4 = out_ptr[WIDTH * 4];
+        uint64_t in5 = out_ptr[WIDTH * 5];
+        uint64_t in6 = out_ptr[WIDTH * 6];
+        uint64_t in7 = out_ptr[WIDTH * 7];
+
+        out2 = in2 ^ in3; /* T: presumably GF(2^8) multiply by 0x29 - TODO confirm */
+        tmp0 = in1 ^ in3; /* each outN is a fixed XOR of some in0..in7 */
+        tmp1 = in4 ^ in6; /* later lines reuse earlier results: keep order */
+        tmp2 = in0 ^ in4 ^ in7;
+        out6 = tmp0 ^ in5;
+        out4 = tmp0 ^ in6 ^ in7;
+        out1 = tmp1 ^ in1;
+        out7 = tmp1 ^ in2;
+        out3 = tmp2 ^ in5;
+        out5 = tmp2 ^ in2;
+        out0 = out3 ^ in3 ^ in4;
+
+        out_ptr[0] = out0 ^ in_ptr[0]; /* add (XOR) the matching word of in */
+        out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH];
+        out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2];
+        out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3];
+        out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4];
+        out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5];
+        out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6];
+        out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7];
+
+        in_ptr++; /* next word in both buffers */
+        out_ptr++;
+    }
+}
+
+static void
+gf8_muladd_2A(void *out, void *in) /* in place: out = T(out) ^ in */
+{
+    unsigned int i;
+    uint64_t *in_ptr = (uint64_t *)in; /* source words; only read */
+    uint64_t *out_ptr = (uint64_t *)out; /* T's input; overwritten */
+
+    for (i = 0; i < WIDTH; i++) { /* each pass: one word from each of 8 rows */
+        uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+        uint64_t tmp0, tmp1; /* reused XOR sub-terms */
+
+        uint64_t in0 = out_ptr[0]; /* inN = word of out at offset N * WIDTH */
+        uint64_t in1 = out_ptr[WIDTH];
+        uint64_t in2 = out_ptr[WIDTH * 2];
+        uint64_t in3 = out_ptr[WIDTH * 3];
+        uint64_t in4 = out_ptr[WIDTH * 4];
+        uint64_t in5 = out_ptr[WIDTH * 5];
+        uint64_t in6 = out_ptr[WIDTH * 6];
+        uint64_t in7 = out_ptr[WIDTH * 7];
+
+        out0 = in3 ^ in5; /* T: presumably GF(2^8) multiply by 0x2A - TODO confirm */
+        tmp0 = in1 ^ in3; /* each outN is a fixed XOR of some in0..in7 */
+        tmp1 = in0 ^ in4; /* later lines reuse earlier results: keep order */
+        out7 = in2 ^ in4 ^ in7;
+        out3 = tmp1 ^ out0 ^ in2;
+        out2 = tmp0 ^ in7;
+        out6 = tmp0 ^ in6;
+        out1 = tmp1 ^ in6;
+        out5 = tmp1 ^ out7 ^ in5;
+        out4 = out1 ^ in0 ^ in1;
+
+        out_ptr[0] = out0 ^ in_ptr[0]; /* add (XOR) the matching word of in */
+        out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH];
+        out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2];
+        out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3];
+        out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4];
+        out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5];
+        out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6];
+        out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7];
+
+        in_ptr++; /* next word in both buffers */
+        out_ptr++;
+    }
+}
+
+static void
+gf8_muladd_2B(void *out, void *in) /* in place: out = T(out) ^ in */
+{
+    unsigned int i;
+    uint64_t *in_ptr = (uint64_t *)in; /* source words; only read */
+    uint64_t *out_ptr = (uint64_t *)out; /* T's input; overwritten */
+
+    for (i = 0; i < WIDTH; i++) { /* each pass: one word from each of 8 rows */
+        uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+        uint64_t tmp0, tmp1; /* reused XOR sub-terms */
+
+        uint64_t in0 = out_ptr[0]; /* inN = word of out at offset N * WIDTH */
+        uint64_t in1 = out_ptr[WIDTH];
+        uint64_t in2 = out_ptr[WIDTH * 2];
+        uint64_t in3 = out_ptr[WIDTH * 3];
+        uint64_t in4 = out_ptr[WIDTH * 4];
+        uint64_t in5 = out_ptr[WIDTH * 5];
+        uint64_t in6 = out_ptr[WIDTH * 6];
+        uint64_t in7 = out_ptr[WIDTH * 7];
+
+        out4 = in1 ^ in6; /* T: presumably GF(2^8) multiply by 0x2B - TODO confirm */
+        out7 = in2 ^ in4; /* each outN is a fixed XOR of some in0..in7 */
+        tmp0 = in0 ^ in5; /* later lines reuse earlier results: keep order */
+        tmp1 = in2 ^ in7;
+        out6 = in1 ^ in3;
+        out1 = out4 ^ in0 ^ in4;
+        out3 = tmp0 ^ out7;
+        out0 = tmp0 ^ in3;
+        out5 = tmp1 ^ in0;
+        out2 = tmp1 ^ out6;
+
+        out_ptr[0] = out0 ^ in_ptr[0]; /* add (XOR) the matching word of in */
+        out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH];
+        out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2];
+        out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3];
+        out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4];
+        out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5];
+        out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6];
+        out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7];
+
+        in_ptr++; /* next word in both buffers */
+        out_ptr++;
+    }
+}
+
+static void
+gf8_muladd_2C(void *out, void *in) /* in place: out = T(out) ^ in */
+{
+    unsigned int i;
+    uint64_t *in_ptr = (uint64_t *)in; /* source words; only read */
+    uint64_t *out_ptr = (uint64_t *)out; /* T's input; overwritten */
+
+    for (i = 0; i < WIDTH; i++) { /* each pass: one word from each of 8 rows */
+        uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+        uint64_t tmp0, tmp1, tmp2, tmp3; /* reused XOR sub-terms */
+
+        uint64_t in0 = out_ptr[0]; /* inN = word of out at offset N * WIDTH */
+        uint64_t in1 = out_ptr[WIDTH];
+        uint64_t in2 = out_ptr[WIDTH * 2];
+        uint64_t in3 = out_ptr[WIDTH * 3];
+        uint64_t in4 = out_ptr[WIDTH * 4];
+        uint64_t in5 = out_ptr[WIDTH * 5];
+        uint64_t in6 = out_ptr[WIDTH * 6];
+        uint64_t in7 = out_ptr[WIDTH * 7];
+
+        tmp0 = in2 ^ in5; /* T: presumably GF(2^8) multiply by 0x2C - TODO confirm */
+        tmp1 = in2 ^ in3 ^ in4; /* each outN is a fixed XOR of some in0..in7 */
+        tmp2 = tmp0 ^ in6; /* later lines reuse earlier results: keep order */
+        out4 = tmp1 ^ in1;
+        out5 = tmp1 ^ in0 ^ in5;
+        tmp3 = tmp2 ^ in4;
+        out6 = tmp2 ^ out4;
+        out7 = tmp3 ^ in7;
+        out2 = tmp3 ^ out5;
+        out3 = out6 ^ in0;
+        out0 = tmp1 ^ out7;
+        out1 = tmp0 ^ out7;
+
+        out_ptr[0] = out0 ^ in_ptr[0]; /* add (XOR) the matching word of in */
+        out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH];
+        out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2];
+        out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3];
+        out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4];
+        out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5];
+        out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6];
+        out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7];
+
+        in_ptr++; /* next word in both buffers */
+        out_ptr++;
+    }
+}
+
+static void
+gf8_muladd_2D(void *out, void *in) /* in place: out = T(out) ^ in */
+{
+    unsigned int i;
+    uint64_t *in_ptr = (uint64_t *)in; /* source words; only read */
+    uint64_t *out_ptr = (uint64_t *)out; /* T's input; overwritten */
+
+    for (i = 0; i < WIDTH; i++) { /* each pass: one word from each of 8 rows */
+        uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+        uint64_t tmp0, tmp1, tmp2, tmp3; /* reused XOR sub-terms */
+
+        uint64_t in0 = out_ptr[0]; /* inN = word of out at offset N * WIDTH */
+        uint64_t in1 = out_ptr[WIDTH];
+        uint64_t in2 = out_ptr[WIDTH * 2];
+        uint64_t in3 = out_ptr[WIDTH * 3];
+        uint64_t in4 = out_ptr[WIDTH * 4];
+        uint64_t in5 = out_ptr[WIDTH * 5];
+        uint64_t in6 = out_ptr[WIDTH * 6];
+        uint64_t in7 = out_ptr[WIDTH * 7];
+
+        tmp0 = in2 ^ in3; /* T: presumably GF(2^8) multiply by 0x2D - TODO confirm */
+        out4 = tmp0 ^ in1; /* each outN is a fixed XOR of some in0..in7 */
+        tmp1 = tmp0 ^ in0; /* later lines reuse earlier results: keep order */
+        out2 = tmp1 ^ in6;
+        out5 = tmp1 ^ in4;
+        tmp2 = out2 ^ in2;
+        tmp3 = tmp2 ^ in5;
+        out0 = tmp3 ^ in7;
+        out7 = tmp3 ^ out5;
+        out6 = out4 ^ out7 ^ in6;
+        out3 = tmp2 ^ out6;
+        out1 = out0 ^ out6 ^ in0;
+
+        out_ptr[0] = out0 ^ in_ptr[0]; /* add (XOR) the matching word of in */
+        out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH];
+        out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2];
+        out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3];
+        out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4];
+        out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5];
+        out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6];
+        out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7];
+
+        in_ptr++; /* next word in both buffers */
+        out_ptr++;
+    }
+}
+
+static void
+gf8_muladd_2E(void *out, void *in) /* in place: out = T(out) ^ in */
+{
+    unsigned int i;
+    uint64_t *in_ptr = (uint64_t *)in; /* source words; only read */
+    uint64_t *out_ptr = (uint64_t *)out; /* T's input; overwritten */
+
+    for (i = 0; i < WIDTH; i++) { /* each pass: one word from each of 8 rows */
+        uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+        uint64_t tmp0, tmp1, tmp2; /* reused XOR sub-terms */
+
+        uint64_t in0 = out_ptr[0]; /* inN = word of out at offset N * WIDTH */
+        uint64_t in1 = out_ptr[WIDTH];
+        uint64_t in2 = out_ptr[WIDTH * 2];
+        uint64_t in3 = out_ptr[WIDTH * 3];
+        uint64_t in4 = out_ptr[WIDTH * 4];
+        uint64_t in5 = out_ptr[WIDTH * 5];
+        uint64_t in6 = out_ptr[WIDTH * 6];
+        uint64_t in7 = out_ptr[WIDTH * 7];
+
+        tmp0 = in4 ^ in7; /* T: presumably GF(2^8) multiply by 0x2E - TODO confirm */
+        out0 = in3 ^ in5 ^ in6; /* each outN is a fixed XOR of some in0..in7 */
+        tmp1 = tmp0 ^ in0; /* later lines reuse earlier results: keep order */
+        tmp2 = tmp0 ^ in2;
+        out1 = tmp1 ^ in6;
+        out4 = tmp2 ^ in1;
+        out7 = tmp2 ^ in5;
+        out3 = out0 ^ out4 ^ in0;
+        out2 = out3 ^ out7 ^ in7;
+        out6 = tmp1 ^ out2;
+        out5 = tmp1 ^ out7 ^ in3;
+
+        out_ptr[0] = out0 ^ in_ptr[0]; /* add (XOR) the matching word of in */
+        out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH];
+        out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2];
+        out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3];
+        out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4];
+        out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5];
+        out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6];
+        out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7];
+
+        in_ptr++; /* next word in both buffers */
+        out_ptr++;
+    }
+}
+
+static void /* In-place update: out <- X(out) ^ in, X = the fixed XOR network below. */
+gf8_muladd_2F(void *out, void *in) /* both accessed as uint64_t[8 * WIDTH]; 'in' is only read */
+{ /* NOTE(review): name suggests GF(2^8) "out * 0x2F + in" on bit-sliced data -- confirm */
+    unsigned int i;
+    uint64_t *in_ptr = (uint64_t *)in;
+    uint64_t *out_ptr = (uint64_t *)out;
+
+    for (i = 0; i < WIDTH; i++) { /* one word from each of the 8 rows per pass */
+        uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+        uint64_t tmp0, tmp1, tmp2;
+
+        uint64_t in0 = out_ptr[0]; /* inN = row N of 'out' (not of 'in'), read before any store */
+        uint64_t in1 = out_ptr[WIDTH];
+        uint64_t in2 = out_ptr[WIDTH * 2];
+        uint64_t in3 = out_ptr[WIDTH * 3];
+        uint64_t in4 = out_ptr[WIDTH * 4];
+        uint64_t in5 = out_ptr[WIDTH * 5];
+        uint64_t in6 = out_ptr[WIDTH * 6];
+        uint64_t in7 = out_ptr[WIDTH * 7];
+
+        tmp0 = in0 ^ in3; /* order matters: later lines reuse earlier results */
+        tmp1 = in2 ^ in5;
+        out4 = in1 ^ in2 ^ in7;
+        out6 = in1 ^ in3 ^ in4;
+        out5 = tmp0 ^ in2;
+        tmp2 = tmp0 ^ in6;
+        out7 = tmp1 ^ in4;
+        out0 = tmp2 ^ in5;
+        out2 = tmp2 ^ out4;
+        out1 = tmp2 ^ out6 ^ in7;
+        out3 = tmp1 ^ out1;
+
+        out_ptr[0] = out0 ^ in_ptr[0]; /* XOR in the matching word of 'in' and store back */
+        out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH];
+        out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2];
+        out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3];
+        out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4];
+        out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5];
+        out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6];
+        out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7];
+
+        in_ptr++; /* step both buffers to the next word of every row */
+        out_ptr++;
+    }
+}
+
+static void /* In-place update: out <- X(out) ^ in, X = the fixed XOR network below. */
+gf8_muladd_30(void *out, void *in) /* both accessed as uint64_t[8 * WIDTH]; 'in' is only read */
+{ /* NOTE(review): name suggests GF(2^8) "out * 0x30 + in" on bit-sliced data -- confirm */
+    unsigned int i;
+    uint64_t *in_ptr = (uint64_t *)in;
+    uint64_t *out_ptr = (uint64_t *)out;
+
+    for (i = 0; i < WIDTH; i++) { /* one word from each of the 8 rows per pass */
+        uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+        uint64_t tmp0, tmp1;
+
+        uint64_t in0 = out_ptr[0]; /* inN = row N of 'out' (not of 'in'), read before any store */
+        uint64_t in1 = out_ptr[WIDTH];
+        uint64_t in2 = out_ptr[WIDTH * 2];
+        uint64_t in3 = out_ptr[WIDTH * 3];
+        uint64_t in4 = out_ptr[WIDTH * 4];
+        uint64_t in5 = out_ptr[WIDTH * 5];
+        uint64_t in6 = out_ptr[WIDTH * 6];
+        uint64_t in7 = out_ptr[WIDTH * 7];
+
+        out1 = in4 ^ in5; /* order matters: later lines reuse earlier results */
+        tmp0 = in3 ^ in6;
+        tmp1 = in4 ^ in7;
+        out6 = in1 ^ in2 ^ in5;
+        out3 = tmp0 ^ in5;
+        out4 = tmp0 ^ in0;
+        out7 = tmp0 ^ in2;
+        out0 = tmp1 ^ in3;
+        out2 = tmp1 ^ out3;
+        out5 = tmp1 ^ in0 ^ in1;
+
+        out_ptr[0] = out0 ^ in_ptr[0]; /* XOR in the matching word of 'in' and store back */
+        out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH];
+        out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2];
+        out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3];
+        out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4];
+        out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5];
+        out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6];
+        out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7];
+
+        in_ptr++; /* step both buffers to the next word of every row */
+        out_ptr++;
+    }
+}
+
+static void /* In-place update: out <- X(out) ^ in, X = the fixed XOR network below. */
+gf8_muladd_31(void *out, void *in) /* both accessed as uint64_t[8 * WIDTH]; 'in' is only read */
+{ /* NOTE(review): name suggests GF(2^8) "out * 0x31 + in" on bit-sliced data -- confirm */
+    unsigned int i;
+    uint64_t *in_ptr = (uint64_t *)in;
+    uint64_t *out_ptr = (uint64_t *)out;
+
+    for (i = 0; i < WIDTH; i++) { /* one word from each of the 8 rows per pass */
+        uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+        uint64_t tmp0, tmp1, tmp2;
+
+        uint64_t in0 = out_ptr[0]; /* inN = row N of 'out' (not of 'in'), read before any store */
+        uint64_t in1 = out_ptr[WIDTH];
+        uint64_t in2 = out_ptr[WIDTH * 2];
+        uint64_t in3 = out_ptr[WIDTH * 3];
+        uint64_t in4 = out_ptr[WIDTH * 4];
+        uint64_t in5 = out_ptr[WIDTH * 5];
+        uint64_t in6 = out_ptr[WIDTH * 6];
+        uint64_t in7 = out_ptr[WIDTH * 7];
+
+        out3 = in5 ^ in6; /* order matters: later lines reuse earlier results */
+        tmp0 = in4 ^ in5;
+        tmp1 = in0 ^ in3 ^ in4;
+        tmp2 = out3 ^ in2;
+        out1 = tmp0 ^ in1;
+        out0 = tmp1 ^ in7;
+        out4 = tmp1 ^ in6;
+        out6 = tmp2 ^ in1;
+        out2 = tmp2 ^ out0 ^ in0;
+        out5 = out1 ^ in0 ^ in7;
+        out7 = tmp0 ^ out2;
+
+        out_ptr[0] = out0 ^ in_ptr[0]; /* XOR in the matching word of 'in' and store back */
+        out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH];
+        out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2];
+        out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3];
+        out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4];
+        out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5];
+        out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6];
+        out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7];
+
+        in_ptr++; /* step both buffers to the next word of every row */
+        out_ptr++;
+    }
+}
+
+static void /* In-place update: out <- X(out) ^ in, X = the fixed XOR network below. */
+gf8_muladd_32(void *out, void *in) /* both accessed as uint64_t[8 * WIDTH]; 'in' is only read */
+{ /* NOTE(review): name suggests GF(2^8) "out * 0x32 + in" on bit-sliced data -- confirm */
+    unsigned int i;
+    uint64_t *in_ptr = (uint64_t *)in;
+    uint64_t *out_ptr = (uint64_t *)out;
+
+    for (i = 0; i < WIDTH; i++) { /* one word from each of the 8 rows per pass */
+        uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+        uint64_t tmp0, tmp1;
+
+        uint64_t in0 = out_ptr[0]; /* inN = row N of 'out' (not of 'in'), read before any store */
+        uint64_t in1 = out_ptr[WIDTH];
+        uint64_t in2 = out_ptr[WIDTH * 2];
+        uint64_t in3 = out_ptr[WIDTH * 3];
+        uint64_t in4 = out_ptr[WIDTH * 4];
+        uint64_t in5 = out_ptr[WIDTH * 5];
+        uint64_t in6 = out_ptr[WIDTH * 6];
+        uint64_t in7 = out_ptr[WIDTH * 7];
+
+        out0 = in3 ^ in4; /* order matters: later lines reuse earlier results */
+        out7 = in2 ^ in3;
+        tmp0 = in5 ^ in6;
+        tmp1 = in0 ^ in7;
+        out6 = in1 ^ in2;
+        out1 = in0 ^ in4 ^ in5;
+        out2 = tmp0 ^ out0 ^ in1;
+        out3 = tmp0 ^ out7 ^ in7;
+        out4 = tmp1 ^ in6;
+        out5 = tmp1 ^ in1;
+
+        out_ptr[0] = out0 ^ in_ptr[0]; /* XOR in the matching word of 'in' and store back */
+        out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH];
+        out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2];
+        out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3];
+        out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4];
+        out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5];
+        out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6];
+        out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7];
+
+        in_ptr++; /* step both buffers to the next word of every row */
+        out_ptr++;
+    }
+}
+
+static void /* In-place update: out <- X(out) ^ in, X = the fixed XOR network below. */
+gf8_muladd_33(void *out, void *in) /* both accessed as uint64_t[8 * WIDTH]; 'in' is only read */
+{ /* NOTE(review): name suggests GF(2^8) "out * 0x33 + in" on bit-sliced data -- confirm */
+    unsigned int i;
+    uint64_t *in_ptr = (uint64_t *)in;
+    uint64_t *out_ptr = (uint64_t *)out;
+
+    for (i = 0; i < WIDTH; i++) { /* one word from each of the 8 rows per pass */
+        uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+        uint64_t tmp0, tmp1, tmp2, tmp3, tmp4;
+
+        uint64_t in0 = out_ptr[0]; /* inN = row N of 'out' (not of 'in'), read before any store */
+        uint64_t in1 = out_ptr[WIDTH];
+        uint64_t in2 = out_ptr[WIDTH * 2];
+        uint64_t in3 = out_ptr[WIDTH * 3];
+        uint64_t in4 = out_ptr[WIDTH * 4];
+        uint64_t in5 = out_ptr[WIDTH * 5];
+        uint64_t in6 = out_ptr[WIDTH * 6];
+        uint64_t in7 = out_ptr[WIDTH * 7];
+
+        tmp0 = in2 ^ in3; /* order matters: later lines reuse earlier results */
+        tmp1 = in0 ^ in4;
+        tmp2 = in1 ^ in5;
+        out6 = in1 ^ in2 ^ in6;
+        out7 = tmp0 ^ in7;
+        out0 = tmp1 ^ in3;
+        out1 = tmp1 ^ tmp2;
+        tmp3 = tmp2 ^ in7;
+        tmp4 = tmp2 ^ in4 ^ in6;
+        out5 = tmp3 ^ in0;
+        out3 = tmp3 ^ out6;
+        out4 = tmp4 ^ out5;
+        out2 = tmp0 ^ tmp4;
+
+        out_ptr[0] = out0 ^ in_ptr[0]; /* XOR in the matching word of 'in' and store back */
+        out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH];
+        out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2];
+        out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3];
+        out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4];
+        out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5];
+        out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6];
+        out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7];
+
+        in_ptr++; /* step both buffers to the next word of every row */
+        out_ptr++;
+    }
+}
+
+static void /* In-place update: out <- X(out) ^ in, X = the fixed XOR network below. */
+gf8_muladd_34(void *out, void *in) /* both accessed as uint64_t[8 * WIDTH]; 'in' is only read */
+{ /* NOTE(review): name suggests GF(2^8) "out * 0x34 + in" on bit-sliced data -- confirm */
+    unsigned int i;
+    uint64_t *in_ptr = (uint64_t *)in;
+    uint64_t *out_ptr = (uint64_t *)out;
+
+    for (i = 0; i < WIDTH; i++) { /* one word from each of the 8 rows per pass */
+        uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+        uint64_t tmp0, tmp1, tmp2, tmp3, tmp4;
+
+        uint64_t in0 = out_ptr[0]; /* inN = row N of 'out' (not of 'in'), read before any store */
+        uint64_t in1 = out_ptr[WIDTH];
+        uint64_t in2 = out_ptr[WIDTH * 2];
+        uint64_t in3 = out_ptr[WIDTH * 3];
+        uint64_t in4 = out_ptr[WIDTH * 4];
+        uint64_t in5 = out_ptr[WIDTH * 5];
+        uint64_t in6 = out_ptr[WIDTH * 6];
+        uint64_t in7 = out_ptr[WIDTH * 7];
+
+        tmp0 = in3 ^ in4; /* order matters: later lines reuse earlier results */
+        tmp1 = in4 ^ in5;
+        tmp2 = tmp0 ^ in1;
+        tmp3 = tmp0 ^ in6;
+        out1 = tmp1 ^ in7;
+        tmp4 = tmp1 ^ in2;
+        out5 = tmp2 ^ in0;
+        out3 = tmp2 ^ out1;
+        out0 = tmp3 ^ in7;
+        out7 = tmp3 ^ tmp4;
+        out6 = tmp4 ^ in1;
+        out2 = out3 ^ out5 ^ in3;
+        out4 = tmp4 ^ out2;
+
+        out_ptr[0] = out0 ^ in_ptr[0]; /* XOR in the matching word of 'in' and store back */
+        out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH];
+        out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2];
+        out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3];
+        out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4];
+        out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5];
+        out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6];
+        out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7];
+
+        in_ptr++; /* step both buffers to the next word of every row */
+        out_ptr++;
+    }
+}
+
+static void /* In-place update: out <- X(out) ^ in, X = the fixed XOR network below. */
+gf8_muladd_35(void *out, void *in) /* both accessed as uint64_t[8 * WIDTH]; 'in' is only read */
+{ /* NOTE(review): name suggests GF(2^8) "out * 0x35 + in" on bit-sliced data -- confirm */
+    unsigned int i;
+    uint64_t *in_ptr = (uint64_t *)in;
+    uint64_t *out_ptr = (uint64_t *)out;
+
+    for (i = 0; i < WIDTH; i++) { /* one word from each of the 8 rows per pass */
+        uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+        uint64_t tmp0, tmp1, tmp2;
+
+        uint64_t in0 = out_ptr[0]; /* inN = row N of 'out' (not of 'in'), read before any store */
+        uint64_t in1 = out_ptr[WIDTH];
+        uint64_t in2 = out_ptr[WIDTH * 2];
+        uint64_t in3 = out_ptr[WIDTH * 3];
+        uint64_t in4 = out_ptr[WIDTH * 4];
+        uint64_t in5 = out_ptr[WIDTH * 5];
+        uint64_t in6 = out_ptr[WIDTH * 6];
+        uint64_t in7 = out_ptr[WIDTH * 7];
+
+        tmp0 = in2 ^ in6; /* order matters: later lines reuse earlier results */
+        tmp1 = in5 ^ in7;
+        out7 = tmp0 ^ tmp1 ^ in3;
+        out3 = tmp1 ^ in1;
+        out1 = out3 ^ in4;
+        tmp2 = out1 ^ in7;
+        out5 = tmp2 ^ in0 ^ in3;
+        out6 = tmp0 ^ tmp2;
+        out0 = out3 ^ out5 ^ in6;
+        out4 = tmp0 ^ out0;
+        out2 = out4 ^ in5;
+
+        out_ptr[0] = out0 ^ in_ptr[0]; /* XOR in the matching word of 'in' and store back */
+        out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH];
+        out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2];
+        out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3];
+        out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4];
+        out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5];
+        out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6];
+        out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7];
+
+        in_ptr++; /* step both buffers to the next word of every row */
+        out_ptr++;
+    }
+}
+
+static void /* In-place update: out <- X(out) ^ in, X = the fixed XOR network below. */
+gf8_muladd_36(void *out, void *in) /* both accessed as uint64_t[8 * WIDTH]; 'in' is only read */
+{ /* NOTE(review): name suggests GF(2^8) "out * 0x36 + in" on bit-sliced data -- confirm */
+    unsigned int i;
+    uint64_t *in_ptr = (uint64_t *)in;
+    uint64_t *out_ptr = (uint64_t *)out;
+
+    for (i = 0; i < WIDTH; i++) { /* one word from each of the 8 rows per pass */
+        uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+        uint64_t tmp0, tmp1;
+
+        uint64_t in0 = out_ptr[0]; /* inN = row N of 'out' (not of 'in'), read before any store */
+        uint64_t in1 = out_ptr[WIDTH];
+        uint64_t in2 = out_ptr[WIDTH * 2];
+        uint64_t in3 = out_ptr[WIDTH * 3];
+        uint64_t in4 = out_ptr[WIDTH * 4];
+        uint64_t in5 = out_ptr[WIDTH * 5];
+        uint64_t in6 = out_ptr[WIDTH * 6];
+        uint64_t in7 = out_ptr[WIDTH * 7];
+
+        out4 = in0 ^ in2; /* order matters: later lines reuse earlier results */
+        tmp0 = in1 ^ in3;
+        out0 = in3 ^ in4 ^ in6;
+        out6 = in1 ^ in2 ^ in4;
+        out5 = tmp0 ^ in0;
+        tmp1 = out5 ^ in5;
+        out2 = tmp1 ^ in4;
+        out3 = tmp1 ^ out4;
+        out1 = tmp0 ^ out2 ^ in7;
+        out7 = out3 ^ in1;
+
+        out_ptr[0] = out0 ^ in_ptr[0]; /* XOR in the matching word of 'in' and store back */
+        out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH];
+        out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2];
+        out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3];
+        out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4];
+        out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5];
+        out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6];
+        out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7];
+
+        in_ptr++; /* step both buffers to the next word of every row */
+        out_ptr++;
+    }
+}
+
+static void /* In-place update: out <- X(out) ^ in, X = the fixed XOR network below. */
+gf8_muladd_37(void *out, void *in) /* both accessed as uint64_t[8 * WIDTH]; 'in' is only read */
+{ /* NOTE(review): name suggests GF(2^8) "out * 0x37 + in" on bit-sliced data -- confirm */
+    unsigned int i;
+    uint64_t *in_ptr = (uint64_t *)in;
+    uint64_t *out_ptr = (uint64_t *)out;
+
+    for (i = 0; i < WIDTH; i++) { /* one word from each of the 8 rows per pass */
+        uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+        uint64_t tmp0, tmp1, tmp2, tmp3;
+
+        uint64_t in0 = out_ptr[0]; /* inN = row N of 'out' (not of 'in'), read before any store */
+        uint64_t in1 = out_ptr[WIDTH];
+        uint64_t in2 = out_ptr[WIDTH * 2];
+        uint64_t in3 = out_ptr[WIDTH * 3];
+        uint64_t in4 = out_ptr[WIDTH * 4];
+        uint64_t in5 = out_ptr[WIDTH * 5];
+        uint64_t in6 = out_ptr[WIDTH * 6];
+        uint64_t in7 = out_ptr[WIDTH * 7];
+
+        tmp0 = in1 ^ in2; /* order matters: later lines reuse earlier results */
+        tmp1 = in2 ^ in4;
+        tmp2 = tmp0 ^ in6;
+        out3 = tmp0 ^ in5;
+        out4 = tmp1 ^ in0;
+        out6 = tmp2 ^ in4;
+        out1 = out3 ^ out4 ^ in7;
+        tmp3 = out4 ^ in1 ^ in3;
+        out7 = tmp3 ^ out1;
+        out2 = tmp3 ^ in5;
+        out5 = tmp1 ^ out2;
+        out0 = tmp2 ^ tmp3;
+
+        out_ptr[0] = out0 ^ in_ptr[0]; /* XOR in the matching word of 'in' and store back */
+        out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH];
+        out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2];
+        out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3];
+        out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4];
+        out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5];
+        out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6];
+        out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7];
+
+        in_ptr++; /* step both buffers to the next word of every row */
+        out_ptr++;
+    }
+}
+
+static void /* In-place update: out <- X(out) ^ in, X = the fixed XOR network below. */
+gf8_muladd_38(void *out, void *in) /* both accessed as uint64_t[8 * WIDTH]; 'in' is only read */
+{ /* NOTE(review): name suggests GF(2^8) "out * 0x38 + in" on bit-sliced data -- confirm */
+    unsigned int i;
+    uint64_t *in_ptr = (uint64_t *)in;
+    uint64_t *out_ptr = (uint64_t *)out;
+
+    for (i = 0; i < WIDTH; i++) { /* one word from each of the 8 rows per pass */
+        uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+        uint64_t tmp0, tmp1, tmp2;
+
+        uint64_t in0 = out_ptr[0]; /* inN = row N of 'out' (not of 'in'), read before any store */
+        uint64_t in1 = out_ptr[WIDTH];
+        uint64_t in2 = out_ptr[WIDTH * 2];
+        uint64_t in3 = out_ptr[WIDTH * 3];
+        uint64_t in4 = out_ptr[WIDTH * 4];
+        uint64_t in5 = out_ptr[WIDTH * 5];
+        uint64_t in6 = out_ptr[WIDTH * 6];
+        uint64_t in7 = out_ptr[WIDTH * 7];
+
+        out3 = in0 ^ in3; /* order matters: later lines reuse earlier results */
+        tmp0 = in3 ^ in4;
+        tmp1 = in5 ^ in7;
+        tmp2 = out3 ^ in1;
+        out2 = tmp0 ^ in6;
+        out0 = tmp0 ^ tmp1;
+        out4 = tmp1 ^ tmp2;
+        out7 = out2 ^ in2;
+        out1 = out2 ^ in3 ^ in5;
+        out6 = out4 ^ in0 ^ in2;
+        out5 = tmp2 ^ out7;
+
+        out_ptr[0] = out0 ^ in_ptr[0]; /* XOR in the matching word of 'in' and store back */
+        out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH];
+        out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2];
+        out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3];
+        out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4];
+        out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5];
+        out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6];
+        out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7];
+
+        in_ptr++; /* step both buffers to the next word of every row */
+        out_ptr++;
+    }
+}
+
+static void /* In-place update: out <- X(out) ^ in, X = the fixed XOR network below. */
+gf8_muladd_39(void *out, void *in) /* both accessed as uint64_t[8 * WIDTH]; 'in' is only read */
+{ /* NOTE(review): name suggests GF(2^8) "out * 0x39 + in" on bit-sliced data -- confirm */
+    unsigned int i;
+    uint64_t *in_ptr = (uint64_t *)in;
+    uint64_t *out_ptr = (uint64_t *)out;
+
+    for (i = 0; i < WIDTH; i++) { /* one word from each of the 8 rows per pass */
+        uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+        uint64_t tmp0, tmp1, tmp2;
+
+        uint64_t in0 = out_ptr[0]; /* inN = row N of 'out' (not of 'in'), read before any store */
+        uint64_t in1 = out_ptr[WIDTH];
+        uint64_t in2 = out_ptr[WIDTH * 2];
+        uint64_t in3 = out_ptr[WIDTH * 3];
+        uint64_t in4 = out_ptr[WIDTH * 4];
+        uint64_t in5 = out_ptr[WIDTH * 5];
+        uint64_t in6 = out_ptr[WIDTH * 6];
+        uint64_t in7 = out_ptr[WIDTH * 7];
+
+        out3 = in0; /* order matters: later lines reuse earlier results */
+        tmp0 = in1 ^ in5;
+        tmp1 = tmp0 ^ in4;
+        out1 = tmp1 ^ in6;
+        out5 = out1 ^ in0 ^ in2;
+        tmp2 = tmp0 ^ out5;
+        out2 = tmp2 ^ in0 ^ in3;
+        out7 = out2 ^ in7;
+        out6 = tmp1 ^ out7;
+        out4 = tmp2 ^ out6;
+        out0 = out4 ^ in1;
+
+        out_ptr[0] = out0 ^ in_ptr[0]; /* XOR in the matching word of 'in' and store back */
+        out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH];
+        out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2];
+        out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3];
+        out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4];
+        out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5];
+        out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6];
+        out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7];
+
+        in_ptr++; /* step both buffers to the next word of every row */
+        out_ptr++;
+    }
+}
+
+static void /* In-place update: out <- X(out) ^ in, X = the fixed XOR network below. */
+gf8_muladd_3A(void *out, void *in) /* both accessed as uint64_t[8 * WIDTH]; 'in' is only read */
+{ /* NOTE(review): name suggests GF(2^8) "out * 0x3A + in" on bit-sliced data -- confirm */
+    unsigned int i;
+    uint64_t *in_ptr = (uint64_t *)in;
+    uint64_t *out_ptr = (uint64_t *)out;
+
+    for (i = 0; i < WIDTH; i++) { /* one word from each of the 8 rows per pass */
+        uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+        uint64_t tmp0, tmp1, tmp2, tmp3, tmp4, tmp5;
+
+        uint64_t in0 = out_ptr[0]; /* inN = row N of 'out' (not of 'in'), read before any store */
+        uint64_t in1 = out_ptr[WIDTH];
+        uint64_t in2 = out_ptr[WIDTH * 2];
+        uint64_t in3 = out_ptr[WIDTH * 3];
+        uint64_t in4 = out_ptr[WIDTH * 4];
+        uint64_t in5 = out_ptr[WIDTH * 5];
+        uint64_t in6 = out_ptr[WIDTH * 6];
+        uint64_t in7 = out_ptr[WIDTH * 7];
+
+        tmp0 = in0 ^ in1; /* order matters: later lines reuse earlier results */
+        tmp1 = in0 ^ in2;
+        tmp2 = in3 ^ in4;
+        tmp3 = in1 ^ in6;
+        tmp4 = in3 ^ in7;
+        out4 = tmp0 ^ in5;
+        out5 = tmp1 ^ tmp3;
+        out3 = tmp1 ^ tmp4;
+        out0 = tmp2 ^ in5;
+        out7 = tmp2 ^ in2;
+        tmp5 = tmp3 ^ in4;
+        out2 = tmp4 ^ tmp5;
+        out1 = tmp5 ^ out4;
+        out6 = tmp0 ^ out3;
+
+        out_ptr[0] = out0 ^ in_ptr[0]; /* XOR in the matching word of 'in' and store back */
+        out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH];
+        out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2];
+        out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3];
+        out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4];
+        out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5];
+        out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6];
+        out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7];
+
+        in_ptr++; /* step both buffers to the next word of every row */
+        out_ptr++;
+    }
+}
+
+static void /* In-place update: out <- X(out) ^ in, X = the fixed XOR network below. */
+gf8_muladd_3B(void *out, void *in) /* both accessed as uint64_t[8 * WIDTH]; 'in' is only read */
+{ /* NOTE(review): name suggests GF(2^8) "out * 0x3B + in" on bit-sliced data -- confirm */
+    unsigned int i;
+    uint64_t *in_ptr = (uint64_t *)in;
+    uint64_t *out_ptr = (uint64_t *)out;
+
+    for (i = 0; i < WIDTH; i++) { /* one word from each of the 8 rows per pass */
+        uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+        uint64_t tmp0, tmp1, tmp2;
+
+        uint64_t in0 = out_ptr[0]; /* inN = row N of 'out' (not of 'in'), read before any store */
+        uint64_t in1 = out_ptr[WIDTH];
+        uint64_t in2 = out_ptr[WIDTH * 2];
+        uint64_t in3 = out_ptr[WIDTH * 3];
+        uint64_t in4 = out_ptr[WIDTH * 4];
+        uint64_t in5 = out_ptr[WIDTH * 5];
+        uint64_t in6 = out_ptr[WIDTH * 6];
+        uint64_t in7 = out_ptr[WIDTH * 7];
+
+        tmp0 = in1 ^ in6; /* order matters: later lines reuse earlier results */
+        tmp1 = in2 ^ in7;
+        tmp2 = tmp0 ^ in3;
+        out3 = tmp1 ^ in0;
+        out6 = tmp1 ^ tmp2;
+        out2 = out6 ^ in4;
+        out7 = tmp0 ^ out2;
+        out0 = out3 ^ out7 ^ in5;
+        out5 = out0 ^ out2 ^ in7;
+        out1 = tmp2 ^ out0;
+        out4 = out1 ^ in6;
+
+        out_ptr[0] = out0 ^ in_ptr[0]; /* XOR in the matching word of 'in' and store back */
+        out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH];
+        out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2];
+        out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3];
+        out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4];
+        out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5];
+        out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6];
+        out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7];
+
+        in_ptr++; /* step both buffers to the next word of every row */
+        out_ptr++;
+    }
+}
+
+static void /* In-place update: out <- X(out) ^ in, X = the fixed XOR network below. */
+gf8_muladd_3C(void *out, void *in) /* both accessed as uint64_t[8 * WIDTH]; 'in' is only read */
+{ /* NOTE(review): name suggests GF(2^8) "out * 0x3C + in" on bit-sliced data -- confirm */
+    unsigned int i;
+    uint64_t *in_ptr = (uint64_t *)in;
+    uint64_t *out_ptr = (uint64_t *)out;
+
+    for (i = 0; i < WIDTH; i++) { /* one word from each of the 8 rows per pass */
+        uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+        uint64_t tmp0, tmp1, tmp2;
+
+        uint64_t in0 = out_ptr[0]; /* inN = row N of 'out' (not of 'in'), read before any store */
+        uint64_t in1 = out_ptr[WIDTH];
+        uint64_t in2 = out_ptr[WIDTH * 2];
+        uint64_t in3 = out_ptr[WIDTH * 3];
+        uint64_t in4 = out_ptr[WIDTH * 4];
+        uint64_t in5 = out_ptr[WIDTH * 5];
+        uint64_t in6 = out_ptr[WIDTH * 6];
+        uint64_t in7 = out_ptr[WIDTH * 7];
+
+        tmp0 = in0 ^ in3; /* order matters: later lines reuse earlier results */
+        tmp1 = in2 ^ in7;
+        tmp2 = in1 ^ in6 ^ in7;
+        out2 = tmp0 ^ in4;
+        out3 = tmp0 ^ tmp2;
+        out4 = tmp1 ^ out3 ^ in5;
+        out5 = tmp2 ^ out2 ^ in2;
+        out1 = out4 ^ out5 ^ in6;
+        out0 = out1 ^ in3;
+        out7 = tmp1 ^ out0;
+        out6 = tmp2 ^ out7;
+
+        out_ptr[0] = out0 ^ in_ptr[0]; /* XOR in the matching word of 'in' and store back */
+        out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH];
+        out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2];
+        out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3];
+        out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4];
+        out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5];
+        out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6];
+        out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7];
+
+        in_ptr++; /* step both buffers to the next word of every row */
+        out_ptr++;
+    }
+}
+
+static void /* In-place update: out <- X(out) ^ in, X = the fixed XOR network below. */
+gf8_muladd_3D(void *out, void *in) /* both accessed as uint64_t[8 * WIDTH]; 'in' is only read */
+{ /* NOTE(review): name suggests GF(2^8) "out * 0x3D + in" on bit-sliced data -- confirm */
+    unsigned int i;
+    uint64_t *in_ptr = (uint64_t *)in;
+    uint64_t *out_ptr = (uint64_t *)out;
+
+    for (i = 0; i < WIDTH; i++) { /* one word from each of the 8 rows per pass */
+        uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+        uint64_t tmp0, tmp1, tmp2;
+
+        uint64_t in0 = out_ptr[0]; /* inN = row N of 'out' (not of 'in'), read before any store */
+        uint64_t in1 = out_ptr[WIDTH];
+        uint64_t in2 = out_ptr[WIDTH * 2];
+        uint64_t in3 = out_ptr[WIDTH * 3];
+        uint64_t in4 = out_ptr[WIDTH * 4];
+        uint64_t in5 = out_ptr[WIDTH * 5];
+        uint64_t in6 = out_ptr[WIDTH * 6];
+        uint64_t in7 = out_ptr[WIDTH * 7];
+
+        tmp0 = in0 ^ in2; /* order matters: later lines reuse earlier results */
+        tmp1 = tmp0 ^ in3;
+        out2 = tmp1 ^ in4;
+        tmp2 = out2 ^ in5;
+        out4 = tmp2 ^ in1 ^ in6;
+        out5 = out4 ^ in7;
+        out6 = out5 ^ in0;
+        out7 = out6 ^ in1;
+        out0 = tmp0 ^ out7;
+        out1 = tmp1 ^ out5;
+        out3 = tmp2 ^ out6;
+
+        out_ptr[0] = out0 ^ in_ptr[0]; /* XOR in the matching word of 'in' and store back */
+        out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH];
+        out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2];
+        out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3];
+        out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4];
+        out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5];
+        out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6];
+        out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7];
+
+        in_ptr++; /* step both buffers to the next word of every row */
+        out_ptr++;
+    }
+}
+
+static void /* In-place update: out <- X(out) ^ in, X = the fixed XOR network below. */
+gf8_muladd_3E(void *out, void *in) /* both accessed as uint64_t[8 * WIDTH]; 'in' is only read */
+{ /* NOTE(review): name suggests GF(2^8) "out * 0x3E + in" on bit-sliced data -- confirm */
+    unsigned int i;
+    uint64_t *in_ptr = (uint64_t *)in;
+    uint64_t *out_ptr = (uint64_t *)out;
+
+    for (i = 0; i < WIDTH; i++) { /* one word from each of the 8 rows per pass */
+        uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+        uint64_t tmp0, tmp1;
+
+        uint64_t in0 = out_ptr[0]; /* inN = row N of 'out' (not of 'in'), read before any store */
+        uint64_t in1 = out_ptr[WIDTH];
+        uint64_t in2 = out_ptr[WIDTH * 2];
+        uint64_t in3 = out_ptr[WIDTH * 3];
+        uint64_t in4 = out_ptr[WIDTH * 4];
+        uint64_t in5 = out_ptr[WIDTH * 5];
+        uint64_t in6 = out_ptr[WIDTH * 6];
+        uint64_t in7 = out_ptr[WIDTH * 7];
+
+        tmp0 = in3 ^ in5; /* order matters: later lines reuse earlier results */
+        tmp1 = tmp0 ^ in4;
+        out0 = tmp1 ^ in6;
+        out7 = tmp1 ^ in2;
+        out6 = out7 ^ in1 ^ in5 ^ in7;
+        out2 = out6 ^ in0 ^ in2;
+        out4 = out0 ^ out6 ^ in0;
+        out5 = tmp0 ^ out4;
+        out3 = out5 ^ in7;
+        out1 = out3 ^ out6 ^ in5;
+
+        out_ptr[0] = out0 ^ in_ptr[0]; /* XOR in the matching word of 'in' and store back */
+        out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH];
+        out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2];
+        out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3];
+        out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4];
+        out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5];
+        out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6];
+        out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7];
+
+        in_ptr++; /* step both buffers to the next word of every row */
+        out_ptr++;
+    }
+}
+
+static void /* In-place update: out <- X(out) ^ in, X = the fixed XOR network below. */
+gf8_muladd_3F(void *out, void *in) /* both accessed as uint64_t[8 * WIDTH]; 'in' is only read */
+{ /* NOTE(review): name suggests GF(2^8) "out * 0x3F + in" on bit-sliced data -- confirm */
+    unsigned int i;
+    uint64_t *in_ptr = (uint64_t *)in;
+    uint64_t *out_ptr = (uint64_t *)out;
+
+    for (i = 0; i < WIDTH; i++) { /* one word from each of the 8 rows per pass */
+        uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+        uint64_t tmp0, tmp1, tmp2;
+
+        uint64_t in0 = out_ptr[0]; /* inN = row N of 'out' (not of 'in'), read before any store */
+        uint64_t in1 = out_ptr[WIDTH];
+        uint64_t in2 = out_ptr[WIDTH * 2];
+        uint64_t in3 = out_ptr[WIDTH * 3];
+        uint64_t in4 = out_ptr[WIDTH * 4];
+        uint64_t in5 = out_ptr[WIDTH * 5];
+        uint64_t in6 = out_ptr[WIDTH * 6];
+        uint64_t in7 = out_ptr[WIDTH * 7];
+
+        tmp0 = in0 ^ in1; /* order matters: later lines reuse earlier results */
+        out3 = tmp0 ^ in2 ^ in6;
+        tmp1 = out3 ^ in5 ^ in7;
+        out4 = tmp1 ^ in4;
+        out5 = tmp1 ^ in3;
+        out1 = out4 ^ in2;
+        out7 = out1 ^ out3 ^ in3;
+        out2 = tmp0 ^ out7 ^ in5;
+        tmp2 = out2 ^ in0;
+        out6 = tmp2 ^ in6;
+        out0 = tmp1 ^ tmp2;
+
+        out_ptr[0] = out0 ^ in_ptr[0]; /* XOR in the matching word of 'in' and store back */
+        out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH];
+        out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2];
+        out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3];
+        out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4];
+        out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5];
+        out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6];
+        out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7];
+
+        in_ptr++; /* step both buffers to the next word of every row */
+        out_ptr++;
+    }
+}
+
+static void /* In-place update: out <- X(out) ^ in, X = the fixed XOR network below. */
+gf8_muladd_40(void *out, void *in) /* both accessed as uint64_t[8 * WIDTH]; 'in' is only read */
+{ /* NOTE(review): name suggests GF(2^8) "out * 0x40 + in" on bit-sliced data -- confirm */
+    unsigned int i;
+    uint64_t *in_ptr = (uint64_t *)in;
+    uint64_t *out_ptr = (uint64_t *)out;
+
+    for (i = 0; i < WIDTH; i++) { /* one word from each of the 8 rows per pass */
+        uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+        uint64_t tmp0, tmp1;
+
+        uint64_t in0 = out_ptr[0]; /* inN = row N of 'out' (not of 'in'), read before any store */
+        uint64_t in1 = out_ptr[WIDTH];
+        uint64_t in2 = out_ptr[WIDTH * 2];
+        uint64_t in3 = out_ptr[WIDTH * 3];
+        uint64_t in4 = out_ptr[WIDTH * 4];
+        uint64_t in5 = out_ptr[WIDTH * 5];
+        uint64_t in6 = out_ptr[WIDTH * 6];
+        uint64_t in7 = out_ptr[WIDTH * 7];
+
+        out1 = in3 ^ in7; /* order matters: later lines reuse earlier results */
+        tmp0 = in3 ^ in4;
+        tmp1 = in6 ^ in7;
+        out4 = tmp0 ^ in2;
+        out5 = tmp0 ^ in5;
+        out0 = tmp1 ^ in2;
+        out7 = tmp1 ^ in1 ^ in5;
+        out2 = out0 ^ in4;
+        out3 = out2 ^ out5 ^ in7;
+        out6 = out3 ^ out4 ^ in0;
+
+        out_ptr[0] = out0 ^ in_ptr[0]; /* XOR in the matching word of 'in' and store back */
+        out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH];
+        out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2];
+        out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3];
+        out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4];
+        out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5];
+        out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6];
+        out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7];
+
+        in_ptr++; /* step both buffers to the next word of every row */
+        out_ptr++;
+    }
+}
+
+static void /* In-place update: out <- X(out) ^ in, X = the fixed XOR network below. */
+gf8_muladd_41(void *out, void *in) /* both accessed as uint64_t[8 * WIDTH]; 'in' is only read */
+{ /* NOTE(review): name suggests GF(2^8) "out * 0x41 + in" on bit-sliced data -- confirm */
+    unsigned int i;
+    uint64_t *in_ptr = (uint64_t *)in;
+    uint64_t *out_ptr = (uint64_t *)out;
+
+    for (i = 0; i < WIDTH; i++) { /* one word from each of the 8 rows per pass */
+        uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+        uint64_t tmp0, tmp1;
+
+        uint64_t in0 = out_ptr[0]; /* inN = row N of 'out' (not of 'in'), read before any store */
+        uint64_t in1 = out_ptr[WIDTH];
+        uint64_t in2 = out_ptr[WIDTH * 2];
+        uint64_t in3 = out_ptr[WIDTH * 3];
+        uint64_t in4 = out_ptr[WIDTH * 4];
+        uint64_t in5 = out_ptr[WIDTH * 5];
+        uint64_t in6 = out_ptr[WIDTH * 6];
+        uint64_t in7 = out_ptr[WIDTH * 7];
+
+        out4 = in2 ^ in3; /* order matters: later lines reuse earlier results */
+        tmp0 = in5 ^ in6;
+        tmp1 = in6 ^ in7;
+        out5 = in3 ^ in4;
+        out1 = in1 ^ in3 ^ in7;
+        out6 = in0 ^ in4 ^ in5;
+        out3 = tmp0 ^ in2;
+        out7 = tmp0 ^ in1;
+        out2 = tmp1 ^ in4;
+        out0 = tmp1 ^ in0 ^ in2;
+
+        out_ptr[0] = out0 ^ in_ptr[0]; /* XOR in the matching word of 'in' and store back */
+        out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH];
+        out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2];
+        out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3];
+        out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4];
+        out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5];
+        out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6];
+        out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7];
+
+        in_ptr++; /* step both buffers to the next word of every row */
+        out_ptr++;
+    }
+}
+
+static void /* In-place update: out <- X(out) ^ in, X = the fixed XOR network below. */
+gf8_muladd_42(void *out, void *in) /* both accessed as uint64_t[8 * WIDTH]; 'in' is only read */
+{ /* NOTE(review): name suggests GF(2^8) "out * 0x42 + in" on bit-sliced data -- confirm */
+    unsigned int i;
+    uint64_t *in_ptr = (uint64_t *)in;
+    uint64_t *out_ptr = (uint64_t *)out;
+
+    for (i = 0; i < WIDTH; i++) { /* one word from each of the 8 rows per pass */
+        uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+
+        uint64_t in0 = out_ptr[0]; /* inN = row N of 'out' (not of 'in'), read before any store */
+        uint64_t in1 = out_ptr[WIDTH];
+        uint64_t in2 = out_ptr[WIDTH * 2];
+        uint64_t in3 = out_ptr[WIDTH * 3];
+        uint64_t in4 = out_ptr[WIDTH * 4];
+        uint64_t in5 = out_ptr[WIDTH * 5];
+        uint64_t in6 = out_ptr[WIDTH * 6];
+        uint64_t in7 = out_ptr[WIDTH * 7];
+
+        out0 = in2 ^ in6; /* order matters: out2 and out3 below reuse out0 and out5 */
+        out5 = in3 ^ in5;
+        out1 = in0 ^ in3 ^ in7;
+        out7 = in1 ^ in5 ^ in7;
+        out4 = in2 ^ in4 ^ in7;
+        out6 = in0 ^ in4 ^ in6;
+        out2 = out0 ^ in1 ^ in4;
+        out3 = out5 ^ in6 ^ in7;
+
+        out_ptr[0] = out0 ^ in_ptr[0]; /* XOR in the matching word of 'in' and store back */
+        out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH];
+        out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2];
+        out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3];
+        out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4];
+        out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5];
+        out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6];
+        out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7];
+
+        in_ptr++; /* step both buffers to the next word of every row */
+        out_ptr++;
+    }
+}
+
+static void /* In-place update: out <- X(out) ^ in, X = the fixed XOR network below. */
+gf8_muladd_43(void *out, void *in) /* both accessed as uint64_t[8 * WIDTH]; 'in' is only read */
+{ /* NOTE(review): name suggests GF(2^8) "out * 0x43 + in" on bit-sliced data -- confirm */
+    unsigned int i;
+    uint64_t *in_ptr = (uint64_t *)in;
+    uint64_t *out_ptr = (uint64_t *)out;
+
+    for (i = 0; i < WIDTH; i++) { /* one word from each of the 8 rows per pass */
+        uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+
+        uint64_t in0 = out_ptr[0]; /* inN = row N of 'out' (not of 'in'), read before any store */
+        uint64_t in1 = out_ptr[WIDTH];
+        uint64_t in2 = out_ptr[WIDTH * 2];
+        uint64_t in3 = out_ptr[WIDTH * 3];
+        uint64_t in4 = out_ptr[WIDTH * 4];
+        uint64_t in5 = out_ptr[WIDTH * 5];
+        uint64_t in6 = out_ptr[WIDTH * 6];
+        uint64_t in7 = out_ptr[WIDTH * 7];
+
+        out5 = in3; /* every outN here is built directly from inN values only */
+        out7 = in1 ^ in5;
+        out4 = in2 ^ in7;
+        out6 = in0 ^ in4;
+        out0 = in0 ^ in2 ^ in6;
+        out3 = in5 ^ in6 ^ in7;
+        out2 = in1 ^ in4 ^ in6;
+        out1 = in0 ^ in1 ^ in3 ^ in7;
+
+        out_ptr[0] = out0 ^ in_ptr[0]; /* XOR in the matching word of 'in' and store back */
+        out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH];
+        out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2];
+        out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3];
+        out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4];
+        out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5];
+        out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6];
+        out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7];
+
+        in_ptr++; /* step both buffers to the next word of every row */
+        out_ptr++;
+    }
+}
+
+static void /* In-place update: out <- X(out) ^ in, X = the fixed XOR network below. */
+gf8_muladd_44(void *out, void *in) /* both accessed as uint64_t[8 * WIDTH]; 'in' is only read */
+{ /* NOTE(review): name suggests GF(2^8) "out * 0x44 + in" on bit-sliced data -- confirm */
+    unsigned int i;
+    uint64_t *in_ptr = (uint64_t *)in;
+    uint64_t *out_ptr = (uint64_t *)out;
+
+    for (i = 0; i < WIDTH; i++) { /* one word from each of the 8 rows per pass */
+        uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+        uint64_t tmp0;
+
+        uint64_t in0 = out_ptr[0]; /* inN = row N of 'out' (not of 'in'), read before any store */
+        uint64_t in1 = out_ptr[WIDTH];
+        uint64_t in2 = out_ptr[WIDTH * 2];
+        uint64_t in3 = out_ptr[WIDTH * 3];
+        uint64_t in4 = out_ptr[WIDTH * 4];
+        uint64_t in5 = out_ptr[WIDTH * 5];
+        uint64_t in6 = out_ptr[WIDTH * 6];
+        uint64_t in7 = out_ptr[WIDTH * 7];
+
+        out1 = in3; /* order matters: later lines reuse earlier results */
+        out0 = in2 ^ in7;
+        tmp0 = in4 ^ in7;
+        out7 = in1 ^ in6 ^ in7;
+        out6 = in0 ^ in5 ^ in6;
+        out4 = tmp0 ^ in3 ^ in6;
+        out3 = out0 ^ in1 ^ in3 ^ in5;
+        out2 = out0 ^ in0 ^ in4;
+        out5 = tmp0 ^ in5;
+
+        out_ptr[0] = out0 ^ in_ptr[0]; /* XOR in the matching word of 'in' and store back */
+        out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH];
+        out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2];
+        out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3];
+        out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4];
+        out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5];
+        out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6];
+        out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7];
+
+        in_ptr++; /* step both buffers to the next word of every row */
+        out_ptr++;
+    }
+}
+
+static void /* In-place update: out <- X(out) ^ in, X = the fixed XOR network below. */
+gf8_muladd_45(void *out, void *in) /* both accessed as uint64_t[8 * WIDTH]; 'in' is only read */
+{ /* NOTE(review): name suggests GF(2^8) "out * 0x45 + in" on bit-sliced data -- confirm */
+    unsigned int i;
+    uint64_t *in_ptr = (uint64_t *)in;
+    uint64_t *out_ptr = (uint64_t *)out;
+
+    for (i = 0; i < WIDTH; i++) { /* one word from each of the 8 rows per pass */
+        uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+
+        uint64_t in0 = out_ptr[0]; /* inN = row N of 'out' (not of 'in'), read before any store */
+        uint64_t in1 = out_ptr[WIDTH];
+        uint64_t in2 = out_ptr[WIDTH * 2];
+        uint64_t in3 = out_ptr[WIDTH * 3];
+        uint64_t in4 = out_ptr[WIDTH * 4];
+        uint64_t in5 = out_ptr[WIDTH * 5];
+        uint64_t in6 = out_ptr[WIDTH * 6];
+        uint64_t in7 = out_ptr[WIDTH * 7];
+
+        out1 = in1 ^ in3; /* order matters: out2 and out3 below reuse out5, out0 and out6 */
+        out7 = in1 ^ in6;
+        out5 = in4 ^ in7;
+        out6 = in0 ^ in5;
+        out0 = in0 ^ in2 ^ in7;
+        out4 = in3 ^ in6 ^ in7;
+        out2 = out5 ^ in0;
+        out3 = out0 ^ out6 ^ in1;
+
+        out_ptr[0] = out0 ^ in_ptr[0]; /* XOR in the matching word of 'in' and store back */
+        out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH];
+        out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2];
+        out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3];
+        out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4];
+        out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5];
+        out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6];
+        out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7];
+
+        in_ptr++; /* step both buffers to the next word of every row */
+        out_ptr++;
+    }
+}
+
+/* For each col in [0, WIDTH): load the 8 words out[k*WIDTH + col], mix them
+ * with a fixed XOR network and store result k, XORed with in[k*WIDTH + col],
+ * back to out. Presumably GF(2^8) out = 0x46*out ^ in -- TODO confirm. */
+static void
+gf8_muladd_46(void *out, void *in)
+{
+    uint64_t *src = (uint64_t *)in;
+    uint64_t *dst = (uint64_t *)out;
+    unsigned int col;
+
+    for (col = 0; col < WIDTH; col++) {
+        uint64_t y0, y1, y2, y3, y4, y5, y6, y7;
+        /* Read all eight words of this column before any is overwritten. */
+        uint64_t x0 = dst[col];
+        uint64_t x1 = dst[WIDTH + col];
+        uint64_t x2 = dst[WIDTH * 2 + col];
+        uint64_t x3 = dst[WIDTH * 3 + col];
+        uint64_t x4 = dst[WIDTH * 4 + col];
+        uint64_t x5 = dst[WIDTH * 5 + col];
+        uint64_t x6 = dst[WIDTH * 6 + col];
+        uint64_t x7 = dst[WIDTH * 7 + col];
+        /* Later lines reuse earlier results, so statement order matters. */
+        y0 = x2;
+        y1 = x0 ^ x3;
+        y7 = x1 ^ x7;
+        y4 = x4 ^ x6;
+        y5 = x5 ^ x7;
+        y6 = x0 ^ x6;
+        y3 = x1 ^ x3 ^ x5;
+        y2 = y4 ^ y6 ^ x1 ^ x2;
+        /* Store each result XORed with the corresponding word of in. */
+        dst[col] = y0 ^ src[col];
+        dst[WIDTH + col] = y1 ^ src[WIDTH + col];
+        dst[WIDTH * 2 + col] = y2 ^ src[WIDTH * 2 + col];
+        dst[WIDTH * 3 + col] = y3 ^ src[WIDTH * 3 + col];
+        dst[WIDTH * 4 + col] = y4 ^ src[WIDTH * 4 + col];
+        dst[WIDTH * 5 + col] = y5 ^ src[WIDTH * 5 + col];
+        dst[WIDTH * 6 + col] = y6 ^ src[WIDTH * 6 + col];
+        dst[WIDTH * 7 + col] = y7 ^ src[WIDTH * 7 + col];
+    }
+}
+
+/* For each col in [0, WIDTH): load the 8 words out[k*WIDTH + col], mix them
+ * with a fixed XOR network and store result k, XORed with in[k*WIDTH + col],
+ * back to out. Presumably GF(2^8) out = 0x47*out ^ in -- TODO confirm. */
+static void
+gf8_muladd_47(void *out, void *in)
+{
+    uint64_t *src = (uint64_t *)in;
+    uint64_t *dst = (uint64_t *)out;
+    unsigned int col;
+
+    for (col = 0; col < WIDTH; col++) {
+        uint64_t y0, y1, y2, y3, y4, y5, y6, y7;
+        uint64_t t0;
+        /* Read all eight words of this column before any is overwritten. */
+        uint64_t x0 = dst[col];
+        uint64_t x1 = dst[WIDTH + col];
+        uint64_t x2 = dst[WIDTH * 2 + col];
+        uint64_t x3 = dst[WIDTH * 3 + col];
+        uint64_t x4 = dst[WIDTH * 4 + col];
+        uint64_t x5 = dst[WIDTH * 5 + col];
+        uint64_t x6 = dst[WIDTH * 6 + col];
+        uint64_t x7 = dst[WIDTH * 7 + col];
+        /* Later lines reuse earlier results, so statement order matters. */
+        y4 = x6;
+        y7 = x1;
+        y5 = x7;
+        y6 = x0;
+        t0 = x0 ^ x1;
+        y3 = x1 ^ x5;
+        y0 = x0 ^ x2;
+        y1 = t0 ^ x3;
+        y2 = t0 ^ x4;
+        /* Store each result XORed with the corresponding word of in. */
+        dst[col] = y0 ^ src[col];
+        dst[WIDTH + col] = y1 ^ src[WIDTH + col];
+        dst[WIDTH * 2 + col] = y2 ^ src[WIDTH * 2 + col];
+        dst[WIDTH * 3 + col] = y3 ^ src[WIDTH * 3 + col];
+        dst[WIDTH * 4 + col] = y4 ^ src[WIDTH * 4 + col];
+        dst[WIDTH * 5 + col] = y5 ^ src[WIDTH * 5 + col];
+        dst[WIDTH * 6 + col] = y6 ^ src[WIDTH * 6 + col];
+        dst[WIDTH * 7 + col] = y7 ^ src[WIDTH * 7 + col];
+    }
+}
+
+/* For each col in [0, WIDTH): load the 8 words out[k*WIDTH + col], mix them
+ * with a fixed XOR network and store result k, XORed with in[k*WIDTH + col],
+ * back to out. Presumably GF(2^8) out = 0x48*out ^ in -- TODO confirm. */
+static void
+gf8_muladd_48(void *out, void *in)
+{
+    uint64_t *src = (uint64_t *)in;
+    uint64_t *dst = (uint64_t *)out;
+    unsigned int col;
+
+    for (col = 0; col < WIDTH; col++) {
+        uint64_t y0, y1, y2, y3, y4, y5, y6, y7;
+        uint64_t t0, t1;
+        /* Read all eight words of this column before any is overwritten. */
+        uint64_t x0 = dst[col];
+        uint64_t x1 = dst[WIDTH + col];
+        uint64_t x2 = dst[WIDTH * 2 + col];
+        uint64_t x3 = dst[WIDTH * 3 + col];
+        uint64_t x4 = dst[WIDTH * 4 + col];
+        uint64_t x5 = dst[WIDTH * 5 + col];
+        uint64_t x6 = dst[WIDTH * 6 + col];
+        uint64_t x7 = dst[WIDTH * 7 + col];
+        /* Later lines reuse earlier results, so statement order matters. */
+        t0 = x2 ^ x3;
+        y1 = x3 ^ x6 ^ x7;
+        y3 = t0 ^ x0;
+        y0 = t0 ^ y1 ^ x5;
+        t1 = y0 ^ x4;
+        y2 = t1 ^ x7;
+        y5 = t1 ^ x3;
+        y4 = y5 ^ x1;
+        y7 = t0 ^ y4;
+        y6 = t1 ^ y3;
+        /* Store each result XORed with the corresponding word of in. */
+        dst[col] = y0 ^ src[col];
+        dst[WIDTH + col] = y1 ^ src[WIDTH + col];
+        dst[WIDTH * 2 + col] = y2 ^ src[WIDTH * 2 + col];
+        dst[WIDTH * 3 + col] = y3 ^ src[WIDTH * 3 + col];
+        dst[WIDTH * 4 + col] = y4 ^ src[WIDTH * 4 + col];
+        dst[WIDTH * 5 + col] = y5 ^ src[WIDTH * 5 + col];
+        dst[WIDTH * 6 + col] = y6 ^ src[WIDTH * 6 + col];
+        dst[WIDTH * 7 + col] = y7 ^ src[WIDTH * 7 + col];
+    }
+}
+
+/* For each col in [0, WIDTH): load the 8 words out[k*WIDTH + col], mix them
+ * with a fixed XOR network and store result k, XORed with in[k*WIDTH + col],
+ * back to out. Presumably GF(2^8) out = 0x49*out ^ in -- TODO confirm. */
+static void
+gf8_muladd_49(void *out, void *in)
+{
+    uint64_t *src = (uint64_t *)in;
+    uint64_t *dst = (uint64_t *)out;
+    unsigned int col;
+
+    for (col = 0; col < WIDTH; col++) {
+        uint64_t y0, y1, y2, y3, y4, y5, y6, y7;
+        uint64_t t0, t1;
+        /* Read all eight words of this column before any is overwritten. */
+        uint64_t x0 = dst[col];
+        uint64_t x1 = dst[WIDTH + col];
+        uint64_t x2 = dst[WIDTH * 2 + col];
+        uint64_t x3 = dst[WIDTH * 3 + col];
+        uint64_t x4 = dst[WIDTH * 4 + col];
+        uint64_t x5 = dst[WIDTH * 5 + col];
+        uint64_t x6 = dst[WIDTH * 6 + col];
+        uint64_t x7 = dst[WIDTH * 7 + col];
+        /* Later lines reuse earlier results, so statement order matters. */
+        y3 = x0 ^ x2;
+        t0 = x2 ^ x5;
+        y2 = x4 ^ x5 ^ x6;
+        t1 = t0 ^ y2 ^ x3;
+        y7 = y2 ^ x1;
+        y5 = t1 ^ x7;
+        y4 = y5 ^ y7 ^ x6;
+        y1 = t0 ^ y4;
+        y6 = y1 ^ y7 ^ x0;
+        y0 = t1 ^ y6;
+        /* Store each result XORed with the corresponding word of in. */
+        dst[col] = y0 ^ src[col];
+        dst[WIDTH + col] = y1 ^ src[WIDTH + col];
+        dst[WIDTH * 2 + col] = y2 ^ src[WIDTH * 2 + col];
+        dst[WIDTH * 3 + col] = y3 ^ src[WIDTH * 3 + col];
+        dst[WIDTH * 4 + col] = y4 ^ src[WIDTH * 4 + col];
+        dst[WIDTH * 5 + col] = y5 ^ src[WIDTH * 5 + col];
+        dst[WIDTH * 6 + col] = y6 ^ src[WIDTH * 6 + col];
+        dst[WIDTH * 7 + col] = y7 ^ src[WIDTH * 7 + col];
+    }
+}
+
+/* For each col in [0, WIDTH): load the 8 words out[k*WIDTH + col], mix them
+ * with a fixed XOR network and store result k, XORed with in[k*WIDTH + col],
+ * back to out. Presumably GF(2^8) out = 0x4A*out ^ in -- TODO confirm. */
+static void
+gf8_muladd_4A(void *out, void *in)
+{
+    uint64_t *src = (uint64_t *)in;
+    uint64_t *dst = (uint64_t *)out;
+    unsigned int col;
+
+    for (col = 0; col < WIDTH; col++) {
+        uint64_t y0, y1, y2, y3, y4, y5, y6, y7;
+        uint64_t t0, t1;
+        /* Read all eight words of this column before any is overwritten. */
+        uint64_t x0 = dst[col];
+        uint64_t x1 = dst[WIDTH + col];
+        uint64_t x2 = dst[WIDTH * 2 + col];
+        uint64_t x3 = dst[WIDTH * 3 + col];
+        uint64_t x4 = dst[WIDTH * 4 + col];
+        uint64_t x5 = dst[WIDTH * 5 + col];
+        uint64_t x6 = dst[WIDTH * 6 + col];
+        uint64_t x7 = dst[WIDTH * 7 + col];
+        /* Later lines reuse earlier results, so statement order matters. */
+        t0 = x2 ^ x6;
+        t1 = x3 ^ x7;
+        y0 = t0 ^ x5;
+        y3 = t1 ^ x0;
+        y5 = t1 ^ y0;
+        y4 = y0 ^ x1 ^ x4;
+        y1 = y3 ^ x6;
+        y2 = y4 ^ x7;
+        y6 = y1 ^ x4;
+        y7 = t0 ^ y2;
+        /* Store each result XORed with the corresponding word of in. */
+        dst[col] = y0 ^ src[col];
+        dst[WIDTH + col] = y1 ^ src[WIDTH + col];
+        dst[WIDTH * 2 + col] = y2 ^ src[WIDTH * 2 + col];
+        dst[WIDTH * 3 + col] = y3 ^ src[WIDTH * 3 + col];
+        dst[WIDTH * 4 + col] = y4 ^ src[WIDTH * 4 + col];
+        dst[WIDTH * 5 + col] = y5 ^ src[WIDTH * 5 + col];
+        dst[WIDTH * 6 + col] = y6 ^ src[WIDTH * 6 + col];
+        dst[WIDTH * 7 + col] = y7 ^ src[WIDTH * 7 + col];
+    }
+}
+
+/* For each col in [0, WIDTH): load the 8 words out[k*WIDTH + col], mix them
+ * with a fixed XOR network and store result k, XORed with in[k*WIDTH + col],
+ * back to out. Presumably GF(2^8) out = 0x4B*out ^ in -- TODO confirm. */
+static void
+gf8_muladd_4B(void *out, void *in)
+{
+    uint64_t *src = (uint64_t *)in;
+    uint64_t *dst = (uint64_t *)out;
+    unsigned int col;
+
+    for (col = 0; col < WIDTH; col++) {
+        uint64_t y0, y1, y2, y3, y4, y5, y6, y7;
+        uint64_t t0, t1, t2, t3;
+        /* Read all eight words of this column before any is overwritten. */
+        uint64_t x0 = dst[col];
+        uint64_t x1 = dst[WIDTH + col];
+        uint64_t x2 = dst[WIDTH * 2 + col];
+        uint64_t x3 = dst[WIDTH * 3 + col];
+        uint64_t x4 = dst[WIDTH * 4 + col];
+        uint64_t x5 = dst[WIDTH * 5 + col];
+        uint64_t x6 = dst[WIDTH * 6 + col];
+        uint64_t x7 = dst[WIDTH * 7 + col];
+        /* Later lines reuse earlier results, so statement order matters. */
+        y3 = x0 ^ x7;
+        t0 = x1 ^ x5;
+        t1 = x2 ^ x6;
+        t2 = y3 ^ x3;
+        y7 = t0 ^ x4;
+        y4 = t0 ^ t1;
+        t3 = t1 ^ x0;
+        y6 = t2 ^ x4;
+        y5 = t2 ^ t3;
+        y1 = t2 ^ x1 ^ x6;
+        y2 = y7 ^ x6 ^ x7;
+        y0 = t3 ^ x5;
+        /* Store each result XORed with the corresponding word of in. */
+        dst[col] = y0 ^ src[col];
+        dst[WIDTH + col] = y1 ^ src[WIDTH + col];
+        dst[WIDTH * 2 + col] = y2 ^ src[WIDTH * 2 + col];
+        dst[WIDTH * 3 + col] = y3 ^ src[WIDTH * 3 + col];
+        dst[WIDTH * 4 + col] = y4 ^ src[WIDTH * 4 + col];
+        dst[WIDTH * 5 + col] = y5 ^ src[WIDTH * 5 + col];
+        dst[WIDTH * 6 + col] = y6 ^ src[WIDTH * 6 + col];
+        dst[WIDTH * 7 + col] = y7 ^ src[WIDTH * 7 + col];
+    }
+}
+
+/* For each col in [0, WIDTH): load the 8 words out[k*WIDTH + col], mix them
+ * with a fixed XOR network and store result k, XORed with in[k*WIDTH + col],
+ * back to out. Presumably GF(2^8) out = 0x4C*out ^ in -- TODO confirm. */
+static void
+gf8_muladd_4C(void *out, void *in)
+{
+    uint64_t *src = (uint64_t *)in;
+    uint64_t *dst = (uint64_t *)out;
+    unsigned int col;
+
+    for (col = 0; col < WIDTH; col++) {
+        uint64_t y0, y1, y2, y3, y4, y5, y6, y7;
+        uint64_t t0, t1, t2;
+        /* Read all eight words of this column before any is overwritten. */
+        uint64_t x0 = dst[col];
+        uint64_t x1 = dst[WIDTH + col];
+        uint64_t x2 = dst[WIDTH * 2 + col];
+        uint64_t x3 = dst[WIDTH * 3 + col];
+        uint64_t x4 = dst[WIDTH * 4 + col];
+        uint64_t x5 = dst[WIDTH * 5 + col];
+        uint64_t x6 = dst[WIDTH * 6 + col];
+        uint64_t x7 = dst[WIDTH * 7 + col];
+        /* Later lines reuse earlier results, so statement order matters. */
+        y1 = x3 ^ x6;
+        t0 = x2 ^ x5;
+        t1 = y1 ^ x5 ^ x7;
+        y0 = t0 ^ x7;
+        t2 = t0 ^ x4;
+        y6 = t1 ^ x0;
+        y2 = t2 ^ x0;
+        y5 = t2 ^ x6;
+        y3 = t0 ^ y6 ^ x1;
+        y7 = y0 ^ y5 ^ x1;
+        y4 = t1 ^ y7;
+        /* Store each result XORed with the corresponding word of in. */
+        dst[col] = y0 ^ src[col];
+        dst[WIDTH + col] = y1 ^ src[WIDTH + col];
+        dst[WIDTH * 2 + col] = y2 ^ src[WIDTH * 2 + col];
+        dst[WIDTH * 3 + col] = y3 ^ src[WIDTH * 3 + col];
+        dst[WIDTH * 4 + col] = y4 ^ src[WIDTH * 4 + col];
+        dst[WIDTH * 5 + col] = y5 ^ src[WIDTH * 5 + col];
+        dst[WIDTH * 6 + col] = y6 ^ src[WIDTH * 6 + col];
+        dst[WIDTH * 7 + col] = y7 ^ src[WIDTH * 7 + col];
+    }
+}
+
+/* For each col in [0, WIDTH): load the 8 words out[k*WIDTH + col], mix them
+ * with a fixed XOR network and store result k, XORed with in[k*WIDTH + col],
+ * back to out. Presumably GF(2^8) out = 0x4D*out ^ in -- TODO confirm. */
+static void
+gf8_muladd_4D(void *out, void *in)
+{
+    uint64_t *src = (uint64_t *)in;
+    uint64_t *dst = (uint64_t *)out;
+    unsigned int col;
+
+    for (col = 0; col < WIDTH; col++) {
+        uint64_t y0, y1, y2, y3, y4, y5, y6, y7;
+        uint64_t t0, t1, t2;
+        /* Read all eight words of this column before any is overwritten. */
+        uint64_t x0 = dst[col];
+        uint64_t x1 = dst[WIDTH + col];
+        uint64_t x2 = dst[WIDTH * 2 + col];
+        uint64_t x3 = dst[WIDTH * 3 + col];
+        uint64_t x4 = dst[WIDTH * 4 + col];
+        uint64_t x5 = dst[WIDTH * 5 + col];
+        uint64_t x6 = dst[WIDTH * 6 + col];
+        uint64_t x7 = dst[WIDTH * 7 + col];
+        /* Later lines reuse earlier results, so statement order matters. */
+        t0 = x0 ^ x5;
+        t1 = x1 ^ x6;
+        y4 = x1 ^ x3 ^ x5;
+        t2 = t0 ^ x7;
+        y2 = t0 ^ x4;
+        y1 = t1 ^ x3;
+        y7 = t1 ^ x4;
+        y0 = t2 ^ x2;
+        y6 = t2 ^ x3;
+        y5 = y7 ^ x1 ^ x2;
+        y3 = t1 ^ y0 ^ x5;
+        /* Store each result XORed with the corresponding word of in. */
+        dst[col] = y0 ^ src[col];
+        dst[WIDTH + col] = y1 ^ src[WIDTH + col];
+        dst[WIDTH * 2 + col] = y2 ^ src[WIDTH * 2 + col];
+        dst[WIDTH * 3 + col] = y3 ^ src[WIDTH * 3 + col];
+        dst[WIDTH * 4 + col] = y4 ^ src[WIDTH * 4 + col];
+        dst[WIDTH * 5 + col] = y5 ^ src[WIDTH * 5 + col];
+        dst[WIDTH * 6 + col] = y6 ^ src[WIDTH * 6 + col];
+        dst[WIDTH * 7 + col] = y7 ^ src[WIDTH * 7 + col];
+    }
+}
+
+/* For each col in [0, WIDTH): load the 8 words out[k*WIDTH + col], mix them
+ * with a fixed XOR network and store result k, XORed with in[k*WIDTH + col],
+ * back to out. Presumably GF(2^8) out = 0x4E*out ^ in -- TODO confirm. */
+static void
+gf8_muladd_4E(void *out, void *in)
+{
+    uint64_t *src = (uint64_t *)in;
+    uint64_t *dst = (uint64_t *)out;
+    unsigned int col;
+
+    for (col = 0; col < WIDTH; col++) {
+        uint64_t y0, y1, y2, y3, y4, y5, y6, y7;
+        /* Read all eight words of this column before any is overwritten. */
+        uint64_t x0 = dst[col];
+        uint64_t x1 = dst[WIDTH + col];
+        uint64_t x2 = dst[WIDTH * 2 + col];
+        uint64_t x3 = dst[WIDTH * 3 + col];
+        uint64_t x4 = dst[WIDTH * 4 + col];
+        uint64_t x5 = dst[WIDTH * 5 + col];
+        uint64_t x6 = dst[WIDTH * 6 + col];
+        uint64_t x7 = dst[WIDTH * 7 + col];
+        /* Later lines reuse earlier results, so statement order matters. */
+        y0 = x2 ^ x5;
+        y7 = x1 ^ x4 ^ x7;
+        y1 = x0 ^ x3 ^ x6;
+        y5 = y0 ^ x6;
+        y4 = y7 ^ x5;
+        y3 = y1 ^ x1;
+        y6 = y1 ^ x7;
+        y2 = y4 ^ x0 ^ x2;
+        /* Store each result XORed with the corresponding word of in. */
+        dst[col] = y0 ^ src[col];
+        dst[WIDTH + col] = y1 ^ src[WIDTH + col];
+        dst[WIDTH * 2 + col] = y2 ^ src[WIDTH * 2 + col];
+        dst[WIDTH * 3 + col] = y3 ^ src[WIDTH * 3 + col];
+        dst[WIDTH * 4 + col] = y4 ^ src[WIDTH * 4 + col];
+        dst[WIDTH * 5 + col] = y5 ^ src[WIDTH * 5 + col];
+        dst[WIDTH * 6 + col] = y6 ^ src[WIDTH * 6 + col];
+        dst[WIDTH * 7 + col] = y7 ^ src[WIDTH * 7 + col];
+    }
+}
+
+/* For each col in [0, WIDTH): load the 8 words out[k*WIDTH + col], mix them
+ * with a fixed XOR network and store result k, XORed with in[k*WIDTH + col],
+ * back to out. Presumably GF(2^8) out = 0x4F*out ^ in -- TODO confirm. */
+static void
+gf8_muladd_4F(void *out, void *in)
+{
+    uint64_t *src = (uint64_t *)in;
+    uint64_t *dst = (uint64_t *)out;
+    unsigned int col;
+
+    for (col = 0; col < WIDTH; col++) {
+        uint64_t y0, y1, y2, y3, y4, y5, y6, y7;
+        /* Read all eight words of this column before any is overwritten. */
+        uint64_t x0 = dst[col];
+        uint64_t x1 = dst[WIDTH + col];
+        uint64_t x2 = dst[WIDTH * 2 + col];
+        uint64_t x3 = dst[WIDTH * 3 + col];
+        uint64_t x4 = dst[WIDTH * 4 + col];
+        uint64_t x5 = dst[WIDTH * 5 + col];
+        uint64_t x6 = dst[WIDTH * 6 + col];
+        uint64_t x7 = dst[WIDTH * 7 + col];
+        /* Later lines reuse earlier results, so statement order matters. */
+        y5 = x2 ^ x6;
+        y7 = x1 ^ x4;
+        y3 = x0 ^ x1 ^ x6;
+        y4 = x1 ^ x5 ^ x7;
+        y0 = x0 ^ x2 ^ x5;
+        y6 = x0 ^ x3 ^ x7;
+        y1 = y3 ^ x3;
+        y2 = y4 ^ x0 ^ x4;
+        /* Store each result XORed with the corresponding word of in. */
+        dst[col] = y0 ^ src[col];
+        dst[WIDTH + col] = y1 ^ src[WIDTH + col];
+        dst[WIDTH * 2 + col] = y2 ^ src[WIDTH * 2 + col];
+        dst[WIDTH * 3 + col] = y3 ^ src[WIDTH * 3 + col];
+        dst[WIDTH * 4 + col] = y4 ^ src[WIDTH * 4 + col];
+        dst[WIDTH * 5 + col] = y5 ^ src[WIDTH * 5 + col];
+        dst[WIDTH * 6 + col] = y6 ^ src[WIDTH * 6 + col];
+        dst[WIDTH * 7 + col] = y7 ^ src[WIDTH * 7 + col];
+    }
+}
+
+/* For each col in [0, WIDTH): load the 8 words out[k*WIDTH + col], mix them
+ * with a fixed XOR network and store result k, XORed with in[k*WIDTH + col],
+ * back to out. Presumably GF(2^8) out = 0x50*out ^ in -- TODO confirm. */
+static void
+gf8_muladd_50(void *out, void *in)
+{
+    uint64_t *src = (uint64_t *)in;
+    uint64_t *dst = (uint64_t *)out;
+    unsigned int col;
+
+    for (col = 0; col < WIDTH; col++) {
+        uint64_t y0, y1, y2, y3, y4, y5, y6, y7;
+        uint64_t t0, t1, t2;
+        /* Read all eight words of this column before any is overwritten. */
+        uint64_t x0 = dst[col];
+        uint64_t x1 = dst[WIDTH + col];
+        uint64_t x2 = dst[WIDTH * 2 + col];
+        uint64_t x3 = dst[WIDTH * 3 + col];
+        uint64_t x4 = dst[WIDTH * 4 + col];
+        uint64_t x5 = dst[WIDTH * 5 + col];
+        uint64_t x6 = dst[WIDTH * 6 + col];
+        uint64_t x7 = dst[WIDTH * 7 + col];
+        /* Later lines reuse earlier results, so statement order matters. */
+        y2 = x2 ^ x7;
+        t0 = x3 ^ x5;
+        y0 = y2 ^ x4 ^ x6;
+        y1 = t0 ^ x7;
+        t1 = t0 ^ x6;
+        y3 = y0 ^ x3;
+        y7 = t1 ^ x1;
+        t2 = t1 ^ x0;
+        y5 = y3 ^ x1 ^ x2;
+        y4 = t2 ^ x2;
+        y6 = t2 ^ y3;
+        /* Store each result XORed with the corresponding word of in. */
+        dst[col] = y0 ^ src[col];
+        dst[WIDTH + col] = y1 ^ src[WIDTH + col];
+        dst[WIDTH * 2 + col] = y2 ^ src[WIDTH * 2 + col];
+        dst[WIDTH * 3 + col] = y3 ^ src[WIDTH * 3 + col];
+        dst[WIDTH * 4 + col] = y4 ^ src[WIDTH * 4 + col];
+        dst[WIDTH * 5 + col] = y5 ^ src[WIDTH * 5 + col];
+        dst[WIDTH * 6 + col] = y6 ^ src[WIDTH * 6 + col];
+        dst[WIDTH * 7 + col] = y7 ^ src[WIDTH * 7 + col];
+    }
+}
+
+/* For each col in [0, WIDTH): load the 8 words out[k*WIDTH + col], mix them
+ * with a fixed XOR network and store result k, XORed with in[k*WIDTH + col],
+ * back to out. Presumably GF(2^8) out = 0x51*out ^ in -- TODO confirm. */
+static void
+gf8_muladd_51(void *out, void *in)
+{
+    uint64_t *src = (uint64_t *)in;
+    uint64_t *dst = (uint64_t *)out;
+    unsigned int col;
+
+    for (col = 0; col < WIDTH; col++) {
+        uint64_t y0, y1, y2, y3, y4, y5, y6, y7;
+        /* Read all eight words of this column before any is overwritten. */
+        uint64_t x0 = dst[col];
+        uint64_t x1 = dst[WIDTH + col];
+        uint64_t x2 = dst[WIDTH * 2 + col];
+        uint64_t x3 = dst[WIDTH * 3 + col];
+        uint64_t x4 = dst[WIDTH * 4 + col];
+        uint64_t x5 = dst[WIDTH * 5 + col];
+        uint64_t x6 = dst[WIDTH * 6 + col];
+        uint64_t x7 = dst[WIDTH * 7 + col];
+        /* Later lines reuse earlier results, so statement order matters. */
+        y2 = x7;
+        y3 = x2 ^ x4 ^ x6 ^ x7;
+        y0 = y3 ^ x0;
+        y6 = y0 ^ x5;
+        y4 = y6 ^ x3 ^ x7;
+        y1 = y0 ^ y4 ^ x1;
+        y7 = y1 ^ x6;
+        y5 = y7 ^ x4;
+        /* Store each result XORed with the corresponding word of in. */
+        dst[col] = y0 ^ src[col];
+        dst[WIDTH + col] = y1 ^ src[WIDTH + col];
+        dst[WIDTH * 2 + col] = y2 ^ src[WIDTH * 2 + col];
+        dst[WIDTH * 3 + col] = y3 ^ src[WIDTH * 3 + col];
+        dst[WIDTH * 4 + col] = y4 ^ src[WIDTH * 4 + col];
+        dst[WIDTH * 5 + col] = y5 ^ src[WIDTH * 5 + col];
+        dst[WIDTH * 6 + col] = y6 ^ src[WIDTH * 6 + col];
+        dst[WIDTH * 7 + col] = y7 ^ src[WIDTH * 7 + col];
+    }
+}
+
+/* For each col in [0, WIDTH): load the 8 words out[k*WIDTH + col], mix them
+ * with a fixed XOR network and store result k, XORed with in[k*WIDTH + col],
+ * back to out. Presumably GF(2^8) out = 0x52*out ^ in -- TODO confirm. */
+static void
+gf8_muladd_52(void *out, void *in)
+{
+    uint64_t *src = (uint64_t *)in;
+    uint64_t *dst = (uint64_t *)out;
+    unsigned int col;
+
+    for (col = 0; col < WIDTH; col++) {
+        uint64_t y0, y1, y2, y3, y4, y5, y6, y7;
+        uint64_t t0, t1, t2, t3;
+        /* Read all eight words of this column before any is overwritten. */
+        uint64_t x0 = dst[col];
+        uint64_t x1 = dst[WIDTH + col];
+        uint64_t x2 = dst[WIDTH * 2 + col];
+        uint64_t x3 = dst[WIDTH * 3 + col];
+        uint64_t x4 = dst[WIDTH * 4 + col];
+        uint64_t x5 = dst[WIDTH * 5 + col];
+        uint64_t x6 = dst[WIDTH * 6 + col];
+        uint64_t x7 = dst[WIDTH * 7 + col];
+        /* Later lines reuse earlier results, so statement order matters. */
+        y2 = x1 ^ x2;
+        t0 = x2 ^ x4;
+        t1 = x3 ^ x5;
+        t2 = x3 ^ x6;
+        t3 = x0 ^ x7;
+        y0 = t0 ^ x6;
+        y6 = t0 ^ t3;
+        y7 = t1 ^ x1;
+        y1 = t1 ^ t3;
+        y3 = t2 ^ x4;
+        y5 = t2 ^ x1 ^ x7;
+        y4 = t2 ^ y1 ^ x2;
+        /* Store each result XORed with the corresponding word of in. */
+        dst[col] = y0 ^ src[col];
+        dst[WIDTH + col] = y1 ^ src[WIDTH + col];
+        dst[WIDTH * 2 + col] = y2 ^ src[WIDTH * 2 + col];
+        dst[WIDTH * 3 + col] = y3 ^ src[WIDTH * 3 + col];
+        dst[WIDTH * 4 + col] = y4 ^ src[WIDTH * 4 + col];
+        dst[WIDTH * 5 + col] = y5 ^ src[WIDTH * 5 + col];
+        dst[WIDTH * 6 + col] = y6 ^ src[WIDTH * 6 + col];
+        dst[WIDTH * 7 + col] = y7 ^ src[WIDTH * 7 + col];
+    }
+}
+
+/* For each col in [0, WIDTH): load the 8 words out[k*WIDTH + col], mix them
+ * with a fixed XOR network and store result k, XORed with in[k*WIDTH + col],
+ * back to out. Presumably GF(2^8) out = 0x53*out ^ in -- TODO confirm. */
+static void
+gf8_muladd_53(void *out, void *in)
+{
+    uint64_t *src = (uint64_t *)in;
+    uint64_t *dst = (uint64_t *)out;
+    unsigned int col;
+
+    for (col = 0; col < WIDTH; col++) {
+        uint64_t y0, y1, y2, y3, y4, y5, y6, y7;
+        /* Read all eight words of this column before any is overwritten. */
+        uint64_t x0 = dst[col];
+        uint64_t x1 = dst[WIDTH + col];
+        uint64_t x2 = dst[WIDTH * 2 + col];
+        uint64_t x3 = dst[WIDTH * 3 + col];
+        uint64_t x4 = dst[WIDTH * 4 + col];
+        uint64_t x5 = dst[WIDTH * 5 + col];
+        uint64_t x6 = dst[WIDTH * 6 + col];
+        uint64_t x7 = dst[WIDTH * 7 + col];
+        /* Later lines reuse earlier results, so statement order matters. */
+        y2 = x1;
+        y3 = x4 ^ x6;
+        y0 = y3 ^ x0 ^ x2;
+        y6 = y0 ^ x7;
+        y4 = y6 ^ x5;
+        y7 = y0 ^ y4 ^ x1 ^ x3;
+        y1 = y7 ^ x0;
+        y5 = y7 ^ x6;
+        /* Store each result XORed with the corresponding word of in. */
+        dst[col] = y0 ^ src[col];
+        dst[WIDTH + col] = y1 ^ src[WIDTH + col];
+        dst[WIDTH * 2 + col] = y2 ^ src[WIDTH * 2 + col];
+        dst[WIDTH * 3 + col] = y3 ^ src[WIDTH * 3 + col];
+        dst[WIDTH * 4 + col] = y4 ^ src[WIDTH * 4 + col];
+        dst[WIDTH * 5 + col] = y5 ^ src[WIDTH * 5 + col];
+        dst[WIDTH * 6 + col] = y6 ^ src[WIDTH * 6 + col];
+        dst[WIDTH * 7 + col] = y7 ^ src[WIDTH * 7 + col];
+    }
+}
+
+/* For each col in [0, WIDTH): load the 8 words out[k*WIDTH + col], mix them
+ * with a fixed XOR network and store result k, XORed with in[k*WIDTH + col],
+ * back to out. Presumably GF(2^8) out = 0x54*out ^ in -- TODO confirm. */
+static void
+gf8_muladd_54(void *out, void *in)
+{
+    uint64_t *src = (uint64_t *)in;
+    uint64_t *dst = (uint64_t *)out;
+    unsigned int col;
+
+    for (col = 0; col < WIDTH; col++) {
+        uint64_t y0, y1, y2, y3, y4, y5, y6, y7;
+        uint64_t t0, t1, t2, t3;
+        /* Read all eight words of this column before any is overwritten. */
+        uint64_t x0 = dst[col];
+        uint64_t x1 = dst[WIDTH + col];
+        uint64_t x2 = dst[WIDTH * 2 + col];
+        uint64_t x3 = dst[WIDTH * 3 + col];
+        uint64_t x4 = dst[WIDTH * 4 + col];
+        uint64_t x5 = dst[WIDTH * 5 + col];
+        uint64_t x6 = dst[WIDTH * 6 + col];
+        uint64_t x7 = dst[WIDTH * 7 + col];
+        /* Later lines reuse earlier results, so statement order matters. */
+        y1 = x3 ^ x5;
+        t0 = x1 ^ x3;
+        t1 = x2 ^ x4;
+        t2 = x0 ^ x7;
+        y5 = x1 ^ x4 ^ x6;
+        y4 = t2 ^ y1;
+        y7 = t0 ^ x6;
+        y3 = t0 ^ t1;
+        y0 = t1 ^ x7;
+        t3 = t2 ^ x2;
+        y2 = t3 ^ x6;
+        y6 = t3 ^ x5;
+        /* Store each result XORed with the corresponding word of in. */
+        dst[col] = y0 ^ src[col];
+        dst[WIDTH + col] = y1 ^ src[WIDTH + col];
+        dst[WIDTH * 2 + col] = y2 ^ src[WIDTH * 2 + col];
+        dst[WIDTH * 3 + col] = y3 ^ src[WIDTH * 3 + col];
+        dst[WIDTH * 4 + col] = y4 ^ src[WIDTH * 4 + col];
+        dst[WIDTH * 5 + col] = y5 ^ src[WIDTH * 5 + col];
+        dst[WIDTH * 6 + col] = y6 ^ src[WIDTH * 6 + col];
+        dst[WIDTH * 7 + col] = y7 ^ src[WIDTH * 7 + col];
+    }
+}
+
+/* For each col in [0, WIDTH): load the 8 words out[k*WIDTH + col], mix them
+ * with a fixed XOR network and store result k, XORed with in[k*WIDTH + col],
+ * back to out. Presumably GF(2^8) out = 0x55*out ^ in -- TODO confirm. */
+static void
+gf8_muladd_55(void *out, void *in)
+{
+    uint64_t *src = (uint64_t *)in;
+    uint64_t *dst = (uint64_t *)out;
+    unsigned int col;
+
+    for (col = 0; col < WIDTH; col++) {
+        uint64_t y0, y1, y2, y3, y4, y5, y6, y7;
+        uint64_t t0, t1, t2;
+        /* Read all eight words of this column before any is overwritten. */
+        uint64_t x0 = dst[col];
+        uint64_t x1 = dst[WIDTH + col];
+        uint64_t x2 = dst[WIDTH * 2 + col];
+        uint64_t x3 = dst[WIDTH * 3 + col];
+        uint64_t x4 = dst[WIDTH * 4 + col];
+        uint64_t x5 = dst[WIDTH * 5 + col];
+        uint64_t x6 = dst[WIDTH * 6 + col];
+        uint64_t x7 = dst[WIDTH * 7 + col];
+        /* Later lines reuse earlier results, so statement order matters. */
+        t0 = x1 ^ x3;
+        t1 = x1 ^ x4;
+        t2 = x6 ^ x7;
+        y7 = t0 ^ t2;
+        y1 = t0 ^ x5;
+        y3 = t1 ^ x2;
+        y5 = t1 ^ x5 ^ x6;
+        y2 = t2 ^ x0;
+        y4 = y5 ^ y7 ^ x0;
+        y6 = y2 ^ x2 ^ x5;
+        y0 = y5 ^ y6 ^ x1;
+        /* Store each result XORed with the corresponding word of in. */
+        dst[col] = y0 ^ src[col];
+        dst[WIDTH + col] = y1 ^ src[WIDTH + col];
+        dst[WIDTH * 2 + col] = y2 ^ src[WIDTH * 2 + col];
+        dst[WIDTH * 3 + col] = y3 ^ src[WIDTH * 3 + col];
+        dst[WIDTH * 4 + col] = y4 ^ src[WIDTH * 4 + col];
+        dst[WIDTH * 5 + col] = y5 ^ src[WIDTH * 5 + col];
+        dst[WIDTH * 6 + col] = y6 ^ src[WIDTH * 6 + col];
+        dst[WIDTH * 7 + col] = y7 ^ src[WIDTH * 7 + col];
+    }
+}
+
+/* For each col in [0, WIDTH): load the 8 words out[k*WIDTH + col], mix them
+ * with a fixed XOR network and store result k, XORed with in[k*WIDTH + col],
+ * back to out. Presumably GF(2^8) out = 0x56*out ^ in -- TODO confirm. */
+static void
+gf8_muladd_56(void *out, void *in)
+{
+    uint64_t *src = (uint64_t *)in;
+    uint64_t *dst = (uint64_t *)out;
+    unsigned int col;
+
+    for (col = 0; col < WIDTH; col++) {
+        uint64_t y0, y1, y2, y3, y4, y5, y6, y7;
+        uint64_t t0;
+        /* Read all eight words of this column before any is overwritten. */
+        uint64_t x0 = dst[col];
+        uint64_t x1 = dst[WIDTH + col];
+        uint64_t x2 = dst[WIDTH * 2 + col];
+        uint64_t x3 = dst[WIDTH * 3 + col];
+        uint64_t x4 = dst[WIDTH * 4 + col];
+        uint64_t x5 = dst[WIDTH * 5 + col];
+        uint64_t x6 = dst[WIDTH * 6 + col];
+        uint64_t x7 = dst[WIDTH * 7 + col];
+        /* Later lines reuse earlier results, so statement order matters. */
+        y0 = x2 ^ x4;
+        t0 = x0 ^ x2;
+        y4 = x0 ^ x5;
+        y7 = x1 ^ x3;
+        y5 = x1 ^ x6;
+        y6 = t0 ^ x7;
+        y2 = t0 ^ y5;
+        y1 = y4 ^ x3;
+        y3 = y7 ^ x4 ^ x7;
+        /* Store each result XORed with the corresponding word of in. */
+        dst[col] = y0 ^ src[col];
+        dst[WIDTH + col] = y1 ^ src[WIDTH + col];
+        dst[WIDTH * 2 + col] = y2 ^ src[WIDTH * 2 + col];
+        dst[WIDTH * 3 + col] = y3 ^ src[WIDTH * 3 + col];
+        dst[WIDTH * 4 + col] = y4 ^ src[WIDTH * 4 + col];
+        dst[WIDTH * 5 + col] = y5 ^ src[WIDTH * 5 + col];
+        dst[WIDTH * 6 + col] = y6 ^ src[WIDTH * 6 + col];
+        dst[WIDTH * 7 + col] = y7 ^ src[WIDTH * 7 + col];
+    }
+}
+
+/* For each col in [0, WIDTH): load the 8 words out[k*WIDTH + col], mix them
+ * with a fixed XOR network and store result k, XORed with in[k*WIDTH + col],
+ * back to out. Presumably GF(2^8) out = 0x57*out ^ in -- TODO confirm. */
+static void
+gf8_muladd_57(void *out, void *in)
+{
+    uint64_t *src = (uint64_t *)in;
+    uint64_t *dst = (uint64_t *)out;
+    unsigned int col;
+
+    for (col = 0; col < WIDTH; col++) {
+        uint64_t y0, y1, y2, y3, y4, y5, y6, y7;
+        uint64_t t0, t1;
+        /* Read all eight words of this column before any is overwritten. */
+        uint64_t x0 = dst[col];
+        uint64_t x1 = dst[WIDTH + col];
+        uint64_t x2 = dst[WIDTH * 2 + col];
+        uint64_t x3 = dst[WIDTH * 3 + col];
+        uint64_t x4 = dst[WIDTH * 4 + col];
+        uint64_t x5 = dst[WIDTH * 5 + col];
+        uint64_t x6 = dst[WIDTH * 6 + col];
+        uint64_t x7 = dst[WIDTH * 7 + col];
+        /* Later lines reuse earlier results, so statement order matters. */
+        t0 = x0 ^ x5;
+        t1 = x1 ^ x7;
+        y0 = x0 ^ x2 ^ x4;
+        y5 = x1 ^ x5 ^ x6;
+        y4 = t0 ^ x4;
+        y1 = t0 ^ x1 ^ x3;
+        y2 = t0 ^ y5;
+        y3 = t1 ^ x4;
+        y7 = t1 ^ x3;
+        y6 = t1 ^ y2 ^ x2;
+        /* Store each result XORed with the corresponding word of in. */
+        dst[col] = y0 ^ src[col];
+        dst[WIDTH + col] = y1 ^ src[WIDTH + col];
+        dst[WIDTH * 2 + col] = y2 ^ src[WIDTH * 2 + col];
+        dst[WIDTH * 3 + col] = y3 ^ src[WIDTH * 3 + col];
+        dst[WIDTH * 4 + col] = y4 ^ src[WIDTH * 4 + col];
+        dst[WIDTH * 5 + col] = y5 ^ src[WIDTH * 5 + col];
+        dst[WIDTH * 6 + col] = y6 ^ src[WIDTH * 6 + col];
+        dst[WIDTH * 7 + col] = y7 ^ src[WIDTH * 7 + col];
+    }
+}
+
+/* For each col in [0, WIDTH): load the 8 words out[k*WIDTH + col], mix them
+ * with a fixed XOR network and store result k, XORed with in[k*WIDTH + col],
+ * back to out. Presumably GF(2^8) out = 0x58*out ^ in -- TODO confirm. */
+static void
+gf8_muladd_58(void *out, void *in)
+{
+    uint64_t *src = (uint64_t *)in;
+    uint64_t *dst = (uint64_t *)out;
+    unsigned int col;
+
+    for (col = 0; col < WIDTH; col++) {
+        uint64_t y0, y1, y2, y3, y4, y5, y6, y7;
+        uint64_t t0, t1;
+        /* Read all eight words of this column before any is overwritten. */
+        uint64_t x0 = dst[col];
+        uint64_t x1 = dst[WIDTH + col];
+        uint64_t x2 = dst[WIDTH * 2 + col];
+        uint64_t x3 = dst[WIDTH * 3 + col];
+        uint64_t x4 = dst[WIDTH * 4 + col];
+        uint64_t x5 = dst[WIDTH * 5 + col];
+        uint64_t x6 = dst[WIDTH * 6 + col];
+        uint64_t x7 = dst[WIDTH * 7 + col];
+        /* Later lines reuse earlier results, so statement order matters. */
+        y2 = x2 ^ x5;
+        t0 = x2 ^ x3 ^ x4;
+        y5 = t0 ^ x1;
+        y6 = t0 ^ x0 ^ x5;
+        y3 = y6 ^ x7;
+        t1 = y2 ^ y5;
+        y7 = t1 ^ x6;
+        y4 = t1 ^ y3 ^ x3;
+        y0 = y4 ^ y7 ^ x0;
+        y1 = t0 ^ y0;
+        /* Store each result XORed with the corresponding word of in. */
+        dst[col] = y0 ^ src[col];
+        dst[WIDTH + col] = y1 ^ src[WIDTH + col];
+        dst[WIDTH * 2 + col] = y2 ^ src[WIDTH * 2 + col];
+        dst[WIDTH * 3 + col] = y3 ^ src[WIDTH * 3 + col];
+        dst[WIDTH * 4 + col] = y4 ^ src[WIDTH * 4 + col];
+        dst[WIDTH * 5 + col] = y5 ^ src[WIDTH * 5 + col];
+        dst[WIDTH * 6 + col] = y6 ^ src[WIDTH * 6 + col];
+        dst[WIDTH * 7 + col] = y7 ^ src[WIDTH * 7 + col];
+    }
+}
+
+/* For each col in [0, WIDTH): load the 8 words out[k*WIDTH + col], mix them
+ * with a fixed XOR network and store result k, XORed with in[k*WIDTH + col],
+ * back to out. Presumably GF(2^8) out = 0x59*out ^ in -- TODO confirm. */
+static void
+gf8_muladd_59(void *out, void *in)
+{
+    uint64_t *src = (uint64_t *)in;
+    uint64_t *dst = (uint64_t *)out;
+    unsigned int col;
+
+    for (col = 0; col < WIDTH; col++) {
+        uint64_t y0, y1, y2, y3, y4, y5, y6, y7;
+        uint64_t t0, t1;
+        /* Read all eight words of this column before any is overwritten. */
+        uint64_t x0 = dst[col];
+        uint64_t x1 = dst[WIDTH + col];
+        uint64_t x2 = dst[WIDTH * 2 + col];
+        uint64_t x3 = dst[WIDTH * 3 + col];
+        uint64_t x4 = dst[WIDTH * 4 + col];
+        uint64_t x5 = dst[WIDTH * 5 + col];
+        uint64_t x6 = dst[WIDTH * 6 + col];
+        uint64_t x7 = dst[WIDTH * 7 + col];
+        /* Later lines reuse earlier results, so statement order matters. */
+        y2 = x5;
+        t0 = x0 ^ x5 ^ x7;
+        y3 = t0 ^ x2 ^ x4;
+        y0 = y3 ^ x6;
+        t1 = y0 ^ x7;
+        y6 = t1 ^ x3;
+        y5 = y6 ^ x0 ^ x1 ^ x6;
+        y4 = t0 ^ y5;
+        y1 = t1 ^ y4;
+        y7 = y1 ^ x4;
+        /* Store each result XORed with the corresponding word of in. */
+        dst[col] = y0 ^ src[col];
+        dst[WIDTH + col] = y1 ^ src[WIDTH + col];
+        dst[WIDTH * 2 + col] = y2 ^ src[WIDTH * 2 + col];
+        dst[WIDTH * 3 + col] = y3 ^ src[WIDTH * 3 + col];
+        dst[WIDTH * 4 + col] = y4 ^ src[WIDTH * 4 + col];
+        dst[WIDTH * 5 + col] = y5 ^ src[WIDTH * 5 + col];
+        dst[WIDTH * 6 + col] = y6 ^ src[WIDTH * 6 + col];
+        dst[WIDTH * 7 + col] = y7 ^ src[WIDTH * 7 + col];
+    }
+}
+
+/* For each col in [0, WIDTH): load the 8 words out[k*WIDTH + col], mix them
+ * with a fixed XOR network and store result k, XORed with in[k*WIDTH + col],
+ * back to out. Presumably GF(2^8) out = 0x5A*out ^ in -- TODO confirm. */
+static void
+gf8_muladd_5A(void *out, void *in)
+{
+    uint64_t *src = (uint64_t *)in;
+    uint64_t *dst = (uint64_t *)out;
+    unsigned int col;
+
+    for (col = 0; col < WIDTH; col++) {
+        uint64_t y0, y1, y2, y3, y4, y5, y6, y7;
+        uint64_t t0, t1, t2;
+        /* Read all eight words of this column before any is overwritten. */
+        uint64_t x0 = dst[col];
+        uint64_t x1 = dst[WIDTH + col];
+        uint64_t x2 = dst[WIDTH * 2 + col];
+        uint64_t x3 = dst[WIDTH * 3 + col];
+        uint64_t x4 = dst[WIDTH * 4 + col];
+        uint64_t x5 = dst[WIDTH * 5 + col];
+        uint64_t x6 = dst[WIDTH * 6 + col];
+        uint64_t x7 = dst[WIDTH * 7 + col];
+        /* Later lines reuse earlier results, so statement order matters. */
+        t0 = x1 ^ x2;
+        t1 = x2 ^ x5;
+        y5 = t0 ^ x3;
+        y4 = t0 ^ x0;
+        t2 = t1 ^ x4;
+        y2 = t1 ^ x1 ^ x7;
+        y7 = t2 ^ y5;
+        y6 = y4 ^ y7 ^ x5;
+        y0 = t2 ^ x6;
+        y1 = y0 ^ y6 ^ x7;
+        y3 = t1 ^ y6;
+        /* Store each result XORed with the corresponding word of in. */
+        dst[col] = y0 ^ src[col];
+        dst[WIDTH + col] = y1 ^ src[WIDTH + col];
+        dst[WIDTH * 2 + col] = y2 ^ src[WIDTH * 2 + col];
+        dst[WIDTH * 3 + col] = y3 ^ src[WIDTH * 3 + col];
+        dst[WIDTH * 4 + col] = y4 ^ src[WIDTH * 4 + col];
+        dst[WIDTH * 5 + col] = y5 ^ src[WIDTH * 5 + col];
+        dst[WIDTH * 6 + col] = y6 ^ src[WIDTH * 6 + col];
+        dst[WIDTH * 7 + col] = y7 ^ src[WIDTH * 7 + col];
+    }
+}
+
+/* For each col in [0, WIDTH): load the 8 words out[k*WIDTH + col], mix them
+ * with a fixed XOR network and store result k, XORed with in[k*WIDTH + col],
+ * back to out. Presumably GF(2^8) out = 0x5B*out ^ in -- TODO confirm. */
+static void
+gf8_muladd_5B(void *out, void *in)
+{
+    uint64_t *src = (uint64_t *)in;
+    uint64_t *dst = (uint64_t *)out;
+    unsigned int col;
+
+    for (col = 0; col < WIDTH; col++) {
+        uint64_t y0, y1, y2, y3, y4, y5, y6, y7;
+        uint64_t t0, t1, t2, t3, t4;
+        /* Read all eight words of this column before any is overwritten. */
+        uint64_t x0 = dst[col];
+        uint64_t x1 = dst[WIDTH + col];
+        uint64_t x2 = dst[WIDTH * 2 + col];
+        uint64_t x3 = dst[WIDTH * 3 + col];
+        uint64_t x4 = dst[WIDTH * 4 + col];
+        uint64_t x5 = dst[WIDTH * 5 + col];
+        uint64_t x6 = dst[WIDTH * 6 + col];
+        uint64_t x7 = dst[WIDTH * 7 + col];
+        /* Later lines reuse earlier results, so statement order matters. */
+        t0 = x2 ^ x3;
+        t1 = x0 ^ x4;
+        t2 = x1 ^ x5;
+        y5 = t0 ^ t2;
+        t3 = t1 ^ x6;
+        y3 = t1 ^ x5;
+        y2 = t2 ^ x7;
+        t4 = y3 ^ x2;
+        y7 = y2 ^ x3 ^ x4;
+        y0 = t4 ^ x6;
+        y6 = t0 ^ t3;
+        y4 = t2 ^ t4;
+        y1 = t3 ^ y7;
+        /* Store each result XORed with the corresponding word of in. */
+        dst[col] = y0 ^ src[col];
+        dst[WIDTH + col] = y1 ^ src[WIDTH + col];
+        dst[WIDTH * 2 + col] = y2 ^ src[WIDTH * 2 + col];
+        dst[WIDTH * 3 + col] = y3 ^ src[WIDTH * 3 + col];
+        dst[WIDTH * 4 + col] = y4 ^ src[WIDTH * 4 + col];
+        dst[WIDTH * 5 + col] = y5 ^ src[WIDTH * 5 + col];
+        dst[WIDTH * 6 + col] = y6 ^ src[WIDTH * 6 + col];
+        dst[WIDTH * 7 + col] = y7 ^ src[WIDTH * 7 + col];
+    }
+}
+
+static void /* In place: out = T(out) ^ in, T = the fixed XOR map below. */
+gf8_muladd_5C(void *out, void *in) /* NOTE(review): name suggests T is GF(2^8) multiply by 0x5C -- confirm. */
+{ /* Each buffer is indexed as 8 groups ("planes") of WIDTH uint64_t. */
+    unsigned int i;
+    uint64_t *in_ptr = (uint64_t *)in; /* addend; only read here */
+    uint64_t *out_ptr = (uint64_t *)out; /* operand on entry, result on exit */
+
+    for (i = 0; i < WIDTH; i++) { /* one word of every plane per pass */
+        uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+        uint64_t tmp0, tmp1, tmp2; /* intermediate XOR terms */
+
+        uint64_t in0 = out_ptr[0]; /* inN is plane N of `out`, not `in` */
+        uint64_t in1 = out_ptr[WIDTH];
+        uint64_t in2 = out_ptr[WIDTH * 2];
+        uint64_t in3 = out_ptr[WIDTH * 3];
+        uint64_t in4 = out_ptr[WIDTH * 4];
+        uint64_t in5 = out_ptr[WIDTH * 5];
+        uint64_t in6 = out_ptr[WIDTH * 6];
+        uint64_t in7 = out_ptr[WIDTH * 7];
+
+        tmp0 = in3 ^ in6; /* order matters: later lines reuse earlier results */
+        tmp1 = in0 ^ in2 ^ in5;
+        out1 = tmp0 ^ in5;
+        tmp2 = tmp0 ^ in1;
+        out2 = tmp1 ^ in6;
+        out6 = tmp1 ^ in3;
+        out4 = tmp2 ^ in0;
+        out7 = tmp2 ^ in4;
+        out3 = tmp1 ^ out7;
+        out0 = out3 ^ out4 ^ in7;
+        out5 = out0 ^ in1 ^ in5;
+
+        out_ptr[0] = out0 ^ in_ptr[0]; /* XOR in plane N of `in`, store back */
+        out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH];
+        out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2];
+        out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3];
+        out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4];
+        out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5];
+        out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6];
+        out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7];
+
+        in_ptr++; /* advance to the next word of every plane */
+        out_ptr++;
+    }
+}
+
+static void /* In place: out = T(out) ^ in, T = the fixed XOR map below. */
+gf8_muladd_5D(void *out, void *in) /* NOTE(review): name suggests T is GF(2^8) multiply by 0x5D -- confirm. */
+{ /* Each buffer is indexed as 8 groups ("planes") of WIDTH uint64_t. */
+    unsigned int i;
+    uint64_t *in_ptr = (uint64_t *)in; /* addend; only read here */
+    uint64_t *out_ptr = (uint64_t *)out; /* operand on entry, result on exit */
+
+    for (i = 0; i < WIDTH; i++) { /* one word of every plane per pass */
+        uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+        uint64_t tmp0, tmp1, tmp2, tmp3, tmp4; /* intermediate XOR terms */
+
+        uint64_t in0 = out_ptr[0]; /* inN is plane N of `out`, not `in` */
+        uint64_t in1 = out_ptr[WIDTH];
+        uint64_t in2 = out_ptr[WIDTH * 2];
+        uint64_t in3 = out_ptr[WIDTH * 3];
+        uint64_t in4 = out_ptr[WIDTH * 4];
+        uint64_t in5 = out_ptr[WIDTH * 5];
+        uint64_t in6 = out_ptr[WIDTH * 6];
+        uint64_t in7 = out_ptr[WIDTH * 7];
+
+        tmp0 = in0 ^ in1; /* order matters: later lines reuse earlier results */
+        tmp1 = in0 ^ in6;
+        out2 = tmp1 ^ in5;
+        tmp2 = out2 ^ in3;
+        out6 = tmp2 ^ in2;
+        out1 = tmp0 ^ tmp2;
+        tmp3 = out1 ^ in4 ^ in5;
+        out4 = tmp3 ^ in0;
+        out7 = tmp3 ^ in7;
+        tmp4 = out4 ^ out6;
+        out5 = tmp4 ^ in7;
+        out0 = tmp0 ^ out5;
+        out3 = tmp1 ^ tmp4;
+
+        out_ptr[0] = out0 ^ in_ptr[0]; /* XOR in plane N of `in`, store back */
+        out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH];
+        out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2];
+        out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3];
+        out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4];
+        out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5];
+        out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6];
+        out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7];
+
+        in_ptr++; /* advance to the next word of every plane */
+        out_ptr++;
+    }
+}
+
+static void /* In place: out = T(out) ^ in, T = the fixed XOR map below. */
+gf8_muladd_5E(void *out, void *in) /* NOTE(review): name suggests T is GF(2^8) multiply by 0x5E -- confirm. */
+{ /* Each buffer is indexed as 8 groups ("planes") of WIDTH uint64_t. */
+    unsigned int i;
+    uint64_t *in_ptr = (uint64_t *)in; /* addend; only read here */
+    uint64_t *out_ptr = (uint64_t *)out; /* operand on entry, result on exit */
+
+    for (i = 0; i < WIDTH; i++) { /* one word of every plane per pass */
+        uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+        uint64_t tmp0, tmp1, tmp2, tmp3, tmp4; /* intermediate XOR terms */
+
+        uint64_t in0 = out_ptr[0]; /* inN is plane N of `out`, not `in` */
+        uint64_t in1 = out_ptr[WIDTH];
+        uint64_t in2 = out_ptr[WIDTH * 2];
+        uint64_t in3 = out_ptr[WIDTH * 3];
+        uint64_t in4 = out_ptr[WIDTH * 4];
+        uint64_t in5 = out_ptr[WIDTH * 5];
+        uint64_t in6 = out_ptr[WIDTH * 6];
+        uint64_t in7 = out_ptr[WIDTH * 7];
+
+        tmp0 = in2 ^ in5; /* order matters: later lines reuse earlier results */
+        tmp1 = in3 ^ in5;
+        tmp2 = in1 ^ in7;
+        out7 = in1 ^ in3 ^ in4;
+        out0 = tmp0 ^ in4;
+        tmp3 = tmp1 ^ in0;
+        out5 = tmp2 ^ in2;
+        out1 = tmp3 ^ in6;
+        out6 = tmp0 ^ tmp3;
+        tmp4 = tmp2 ^ out1;
+        out3 = tmp4 ^ in4;
+        out4 = tmp1 ^ tmp4;
+        out2 = tmp0 ^ out4;
+
+        out_ptr[0] = out0 ^ in_ptr[0]; /* XOR in plane N of `in`, store back */
+        out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH];
+        out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2];
+        out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3];
+        out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4];
+        out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5];
+        out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6];
+        out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7];
+
+        in_ptr++; /* advance to the next word of every plane */
+        out_ptr++;
+    }
+}
+
+static void /* In place: out = T(out) ^ in, T = the fixed XOR map below. */
+gf8_muladd_5F(void *out, void *in) /* NOTE(review): name suggests T is GF(2^8) multiply by 0x5F -- confirm. */
+{ /* Each buffer is indexed as 8 groups ("planes") of WIDTH uint64_t. */
+    unsigned int i;
+    uint64_t *in_ptr = (uint64_t *)in; /* addend; only read here */
+    uint64_t *out_ptr = (uint64_t *)out; /* operand on entry, result on exit */
+
+    for (i = 0; i < WIDTH; i++) { /* one word of every plane per pass */
+        uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+        uint64_t tmp0, tmp1, tmp2, tmp3; /* intermediate XOR terms */
+
+        uint64_t in0 = out_ptr[0]; /* inN is plane N of `out`, not `in` */
+        uint64_t in1 = out_ptr[WIDTH];
+        uint64_t in2 = out_ptr[WIDTH * 2];
+        uint64_t in3 = out_ptr[WIDTH * 3];
+        uint64_t in4 = out_ptr[WIDTH * 4];
+        uint64_t in5 = out_ptr[WIDTH * 5];
+        uint64_t in6 = out_ptr[WIDTH * 6];
+        uint64_t in7 = out_ptr[WIDTH * 7];
+
+        tmp0 = in1 ^ in5; /* order matters: later lines reuse earlier results */
+        tmp1 = in0 ^ in6;
+        tmp2 = tmp0 ^ in7;
+        tmp3 = tmp1 ^ in3;
+        out2 = tmp1 ^ tmp2;
+        out5 = tmp2 ^ in2;
+        out6 = tmp3 ^ in2;
+        out3 = out2 ^ in4;
+        out4 = out3 ^ in5;
+        out1 = tmp0 ^ tmp3;
+        out7 = tmp3 ^ out4;
+        out0 = out4 ^ out5 ^ in6;
+
+        out_ptr[0] = out0 ^ in_ptr[0]; /* XOR in plane N of `in`, store back */
+        out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH];
+        out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2];
+        out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3];
+        out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4];
+        out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5];
+        out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6];
+        out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7];
+
+        in_ptr++; /* advance to the next word of every plane */
+        out_ptr++;
+    }
+}
+
+static void /* In place: out = T(out) ^ in, T = the fixed XOR map below. */
+gf8_muladd_60(void *out, void *in) /* NOTE(review): name suggests T is GF(2^8) multiply by 0x60 -- confirm. */
+{ /* Each buffer is indexed as 8 groups ("planes") of WIDTH uint64_t. */
+    unsigned int i;
+    uint64_t *in_ptr = (uint64_t *)in; /* addend; only read here */
+    uint64_t *out_ptr = (uint64_t *)out; /* operand on entry, result on exit */
+
+    for (i = 0; i < WIDTH; i++) { /* one word of every plane per pass */
+        uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+        uint64_t tmp0, tmp1; /* intermediate XOR terms */
+
+        uint64_t in0 = out_ptr[0]; /* inN is plane N of `out`, not `in` */
+        uint64_t in1 = out_ptr[WIDTH];
+        uint64_t in2 = out_ptr[WIDTH * 2];
+        uint64_t in3 = out_ptr[WIDTH * 3];
+        uint64_t in4 = out_ptr[WIDTH * 4];
+        uint64_t in5 = out_ptr[WIDTH * 5];
+        uint64_t in6 = out_ptr[WIDTH * 6];
+        uint64_t in7 = out_ptr[WIDTH * 7];
+
+        out4 = in2 ^ in5; /* order matters: later lines reuse earlier results */
+        tmp0 = in3 ^ in6;
+        out1 = in3 ^ in4 ^ in7;
+        out7 = out4 ^ in1;
+        tmp1 = out4 ^ in4;
+        out0 = tmp0 ^ in2;
+        out5 = tmp0 ^ in0;
+        out2 = tmp0 ^ tmp1;
+        out3 = tmp1 ^ in7;
+        out6 = out3 ^ out7 ^ in0;
+
+        out_ptr[0] = out0 ^ in_ptr[0]; /* XOR in plane N of `in`, store back */
+        out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH];
+        out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2];
+        out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3];
+        out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4];
+        out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5];
+        out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6];
+        out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7];
+
+        in_ptr++; /* advance to the next word of every plane */
+        out_ptr++;
+    }
+}
+
+static void /* In place: out = T(out) ^ in, T = the fixed XOR map below. */
+gf8_muladd_61(void *out, void *in) /* NOTE(review): name suggests T is GF(2^8) multiply by 0x61 -- confirm. */
+{ /* Each buffer is indexed as 8 groups ("planes") of WIDTH uint64_t. */
+    unsigned int i;
+    uint64_t *in_ptr = (uint64_t *)in; /* addend; only read here */
+    uint64_t *out_ptr = (uint64_t *)out; /* operand on entry, result on exit */
+
+    for (i = 0; i < WIDTH; i++) { /* one word of every plane per pass */
+        uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+        uint64_t tmp0, tmp1; /* intermediate XOR terms */
+
+        uint64_t in0 = out_ptr[0]; /* inN is plane N of `out`, not `in` */
+        uint64_t in1 = out_ptr[WIDTH];
+        uint64_t in2 = out_ptr[WIDTH * 2];
+        uint64_t in3 = out_ptr[WIDTH * 3];
+        uint64_t in4 = out_ptr[WIDTH * 4];
+        uint64_t in5 = out_ptr[WIDTH * 5];
+        uint64_t in6 = out_ptr[WIDTH * 6];
+        uint64_t in7 = out_ptr[WIDTH * 7];
+
+        tmp0 = in2 ^ in5; /* order matters: later lines reuse earlier results */
+        out4 = tmp0 ^ in4;
+        tmp1 = out4 ^ in3;
+        out3 = tmp1 ^ in7;
+        out2 = tmp1 ^ in2 ^ in6;
+        out1 = tmp0 ^ out3 ^ in1;
+        out0 = out2 ^ out4 ^ in0;
+        out7 = tmp1 ^ out1;
+        out6 = out0 ^ out1 ^ in2;
+        out5 = tmp0 ^ out0;
+
+        out_ptr[0] = out0 ^ in_ptr[0]; /* XOR in plane N of `in`, store back */
+        out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH];
+        out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2];
+        out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3];
+        out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4];
+        out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5];
+        out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6];
+        out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7];
+
+        in_ptr++; /* advance to the next word of every plane */
+        out_ptr++;
+    }
+}
+
+static void /* In place: out = T(out) ^ in, T = the fixed XOR map below. */
+gf8_muladd_62(void *out, void *in) /* NOTE(review): name suggests T is GF(2^8) multiply by 0x62 -- confirm. */
+{ /* Each buffer is indexed as 8 groups ("planes") of WIDTH uint64_t. */
+    unsigned int i;
+    uint64_t *in_ptr = (uint64_t *)in; /* addend; only read here */
+    uint64_t *out_ptr = (uint64_t *)out; /* operand on entry, result on exit */
+
+    for (i = 0; i < WIDTH; i++) { /* one word of every plane per pass */
+        uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+        uint64_t tmp0, tmp1, tmp2, tmp3; /* intermediate XOR terms */
+
+        uint64_t in0 = out_ptr[0]; /* inN is plane N of `out`, not `in` */
+        uint64_t in1 = out_ptr[WIDTH];
+        uint64_t in2 = out_ptr[WIDTH * 2];
+        uint64_t in3 = out_ptr[WIDTH * 3];
+        uint64_t in4 = out_ptr[WIDTH * 4];
+        uint64_t in5 = out_ptr[WIDTH * 5];
+        uint64_t in6 = out_ptr[WIDTH * 6];
+        uint64_t in7 = out_ptr[WIDTH * 7];
+
+        out3 = in4 ^ in5; /* order matters: later lines reuse earlier results */
+        tmp0 = in0 ^ in3 ^ in4;
+        out1 = tmp0 ^ in7;
+        out5 = tmp0 ^ in6;
+        tmp1 = out1 ^ in0;
+        tmp2 = tmp1 ^ out3;
+        out4 = tmp2 ^ in2;
+        tmp3 = tmp2 ^ in1;
+        out0 = out4 ^ in5 ^ in6;
+        out7 = tmp3 ^ out0;
+        out6 = tmp0 ^ tmp3;
+        out2 = tmp1 ^ out7;
+
+        out_ptr[0] = out0 ^ in_ptr[0]; /* XOR in plane N of `in`, store back */
+        out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH];
+        out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2];
+        out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3];
+        out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4];
+        out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5];
+        out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6];
+        out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7];
+
+        in_ptr++; /* advance to the next word of every plane */
+        out_ptr++;
+    }
+}
+
+static void /* In place: out = T(out) ^ in, T = the fixed XOR map below. */
+gf8_muladd_63(void *out, void *in) /* NOTE(review): name suggests T is GF(2^8) multiply by 0x63 -- confirm. */
+{ /* Each buffer is indexed as 8 groups ("planes") of WIDTH uint64_t. */
+    unsigned int i;
+    uint64_t *in_ptr = (uint64_t *)in; /* addend; only read here */
+    uint64_t *out_ptr = (uint64_t *)out; /* operand on entry, result on exit */
+
+    for (i = 0; i < WIDTH; i++) { /* one word of every plane per pass */
+        uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+        uint64_t tmp0, tmp1, tmp2, tmp3, tmp4; /* intermediate XOR terms */
+
+        uint64_t in0 = out_ptr[0]; /* inN is plane N of `out`, not `in` */
+        uint64_t in1 = out_ptr[WIDTH];
+        uint64_t in2 = out_ptr[WIDTH * 2];
+        uint64_t in3 = out_ptr[WIDTH * 3];
+        uint64_t in4 = out_ptr[WIDTH * 4];
+        uint64_t in5 = out_ptr[WIDTH * 5];
+        uint64_t in6 = out_ptr[WIDTH * 6];
+        uint64_t in7 = out_ptr[WIDTH * 7];
+
+        tmp0 = in3 ^ in4; /* order matters: later lines reuse earlier results */
+        tmp1 = in1 ^ in7;
+        out3 = tmp0 ^ in5;
+        tmp2 = out3 ^ in6;
+        out4 = out3 ^ in2 ^ in7;
+        out5 = tmp2 ^ in0;
+        tmp3 = out5 ^ in3;
+        out0 = tmp3 ^ out4;
+        out2 = tmp1 ^ tmp2;
+        out6 = tmp1 ^ tmp3;
+        tmp4 = tmp0 ^ out2;
+        out1 = tmp4 ^ out5;
+        out7 = tmp4 ^ in2;
+
+        out_ptr[0] = out0 ^ in_ptr[0]; /* XOR in plane N of `in`, store back */
+        out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH];
+        out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2];
+        out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3];
+        out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4];
+        out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5];
+        out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6];
+        out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7];
+
+        in_ptr++; /* advance to the next word of every plane */
+        out_ptr++;
+    }
+}
+
+static void /* In place: out = T(out) ^ in, T = the fixed XOR map below. */
+gf8_muladd_64(void *out, void *in) /* NOTE(review): name suggests T is GF(2^8) multiply by 0x64 -- confirm. */
+{ /* Each buffer is indexed as 8 groups ("planes") of WIDTH uint64_t. */
+    unsigned int i;
+    uint64_t *in_ptr = (uint64_t *)in; /* addend; only read here */
+    uint64_t *out_ptr = (uint64_t *)out; /* operand on entry, result on exit */
+
+    for (i = 0; i < WIDTH; i++) { /* one word of every plane per pass */
+        uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+        uint64_t tmp0, tmp1; /* intermediate XOR terms */
+
+        uint64_t in0 = out_ptr[0]; /* inN is plane N of `out`, not `in` */
+        uint64_t in1 = out_ptr[WIDTH];
+        uint64_t in2 = out_ptr[WIDTH * 2];
+        uint64_t in3 = out_ptr[WIDTH * 3];
+        uint64_t in4 = out_ptr[WIDTH * 4];
+        uint64_t in5 = out_ptr[WIDTH * 5];
+        uint64_t in6 = out_ptr[WIDTH * 6];
+        uint64_t in7 = out_ptr[WIDTH * 7];
+
+        out0 = in2 ^ in3; /* order matters: later lines reuse earlier results */
+        out1 = in3 ^ in4;
+        out7 = in1 ^ in2;
+        tmp0 = in4 ^ in5;
+        tmp1 = in0 ^ in7;
+        out4 = in5 ^ in6 ^ in7;
+        out2 = tmp0 ^ out0 ^ in0;
+        out3 = tmp0 ^ out7 ^ in6;
+        out5 = tmp1 ^ in6;
+        out6 = tmp1 ^ in1;
+
+        out_ptr[0] = out0 ^ in_ptr[0]; /* XOR in plane N of `in`, store back */
+        out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH];
+        out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2];
+        out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3];
+        out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4];
+        out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5];
+        out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6];
+        out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7];
+
+        in_ptr++; /* advance to the next word of every plane */
+        out_ptr++;
+    }
+}
+
+static void /* In place: out = T(out) ^ in, T = the fixed XOR map below. */
+gf8_muladd_65(void *out, void *in) /* NOTE(review): name suggests T is GF(2^8) multiply by 0x65 -- confirm. */
+{ /* Each buffer is indexed as 8 groups ("planes") of WIDTH uint64_t. */
+    unsigned int i;
+    uint64_t *in_ptr = (uint64_t *)in; /* addend; only read here */
+    uint64_t *out_ptr = (uint64_t *)out; /* operand on entry, result on exit */
+
+    for (i = 0; i < WIDTH; i++) { /* one word of every plane per pass */
+        uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+        uint64_t tmp0, tmp1, tmp2, tmp3; /* intermediate XOR terms */
+
+        uint64_t in0 = out_ptr[0]; /* inN is plane N of `out`, not `in` */
+        uint64_t in1 = out_ptr[WIDTH];
+        uint64_t in2 = out_ptr[WIDTH * 2];
+        uint64_t in3 = out_ptr[WIDTH * 3];
+        uint64_t in4 = out_ptr[WIDTH * 4];
+        uint64_t in5 = out_ptr[WIDTH * 5];
+        uint64_t in6 = out_ptr[WIDTH * 6];
+        uint64_t in7 = out_ptr[WIDTH * 7];
+
+        tmp0 = in0 ^ in3; /* order matters: later lines reuse earlier results */
+        tmp1 = in4 ^ in5;
+        tmp2 = in6 ^ in7;
+        out7 = in1 ^ in2 ^ in7;
+        out1 = in1 ^ in3 ^ in4;
+        out0 = tmp0 ^ in2;
+        out2 = tmp0 ^ tmp1;
+        out4 = tmp1 ^ tmp2;
+        tmp3 = tmp2 ^ in0;
+        out3 = out4 ^ out7 ^ in3;
+        out5 = tmp3 ^ in5;
+        out6 = tmp3 ^ in1;
+
+        out_ptr[0] = out0 ^ in_ptr[0]; /* XOR in plane N of `in`, store back */
+        out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH];
+        out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2];
+        out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3];
+        out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4];
+        out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5];
+        out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6];
+        out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7];
+
+        in_ptr++; /* advance to the next word of every plane */
+        out_ptr++;
+    }
+}
+
+static void /* In place: out = T(out) ^ in, T = the fixed XOR map below. */
+gf8_muladd_66(void *out, void *in) /* NOTE(review): name suggests T is GF(2^8) multiply by 0x66 -- confirm. */
+{ /* Each buffer is indexed as 8 groups ("planes") of WIDTH uint64_t. */
+    unsigned int i;
+    uint64_t *in_ptr = (uint64_t *)in; /* addend; only read here */
+    uint64_t *out_ptr = (uint64_t *)out; /* operand on entry, result on exit */
+
+    for (i = 0; i < WIDTH; i++) { /* one word of every plane per pass */
+        uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+        uint64_t tmp0, tmp1, tmp2, tmp3, tmp4; /* intermediate XOR terms */
+
+        uint64_t in0 = out_ptr[0]; /* inN is plane N of `out`, not `in` */
+        uint64_t in1 = out_ptr[WIDTH];
+        uint64_t in2 = out_ptr[WIDTH * 2];
+        uint64_t in3 = out_ptr[WIDTH * 3];
+        uint64_t in4 = out_ptr[WIDTH * 4];
+        uint64_t in5 = out_ptr[WIDTH * 5];
+        uint64_t in6 = out_ptr[WIDTH * 6];
+        uint64_t in7 = out_ptr[WIDTH * 7];
+
+        tmp0 = in1 ^ in2; /* order matters: later lines reuse earlier results */
+        tmp1 = in2 ^ in3;
+        tmp2 = in0 ^ in4;
+        out7 = tmp0 ^ in6;
+        out0 = tmp1 ^ in7;
+        out1 = tmp2 ^ in3;
+        tmp3 = tmp2 ^ in6;
+        tmp4 = out1 ^ in5;
+        out5 = tmp3 ^ in7;
+        out4 = tmp3 ^ tmp4;
+        out2 = tmp0 ^ tmp4 ^ in7;
+        out6 = tmp1 ^ out2 ^ in4;
+        out3 = tmp3 ^ out6;
+
+        out_ptr[0] = out0 ^ in_ptr[0]; /* XOR in plane N of `in`, store back */
+        out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH];
+        out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2];
+        out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3];
+        out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4];
+        out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5];
+        out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6];
+        out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7];
+
+        in_ptr++; /* advance to the next word of every plane */
+        out_ptr++;
+    }
+}
+
+static void /* In place: out = T(out) ^ in, T = the fixed XOR map below. */
+gf8_muladd_67(void *out, void *in) /* NOTE(review): name suggests T is GF(2^8) multiply by 0x67 -- confirm. */
+{ /* Each buffer is indexed as 8 groups ("planes") of WIDTH uint64_t. */
+    unsigned int i;
+    uint64_t *in_ptr = (uint64_t *)in; /* addend; only read here */
+    uint64_t *out_ptr = (uint64_t *)out; /* operand on entry, result on exit */
+
+    for (i = 0; i < WIDTH; i++) { /* one word of every plane per pass */
+        uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+        uint64_t tmp0, tmp1, tmp2, tmp3; /* intermediate XOR terms */
+
+        uint64_t in0 = out_ptr[0]; /* inN is plane N of `out`, not `in` */
+        uint64_t in1 = out_ptr[WIDTH];
+        uint64_t in2 = out_ptr[WIDTH * 2];
+        uint64_t in3 = out_ptr[WIDTH * 3];
+        uint64_t in4 = out_ptr[WIDTH * 4];
+        uint64_t in5 = out_ptr[WIDTH * 5];
+        uint64_t in6 = out_ptr[WIDTH * 6];
+        uint64_t in7 = out_ptr[WIDTH * 7];
+
+        tmp0 = in0 ^ in3; /* order matters: later lines reuse earlier results */
+        tmp1 = tmp0 ^ in1;
+        tmp2 = tmp0 ^ in7;
+        out1 = tmp1 ^ in4;
+        out0 = tmp2 ^ in2;
+        tmp3 = out1 ^ in7;
+        out2 = tmp3 ^ in5;
+        out3 = out2 ^ in0 ^ in6;
+        out7 = tmp1 ^ out0 ^ in6;
+        out5 = tmp1 ^ out3;
+        out4 = tmp2 ^ out5;
+        out6 = tmp3 ^ out4;
+
+        out_ptr[0] = out0 ^ in_ptr[0]; /* XOR in plane N of `in`, store back */
+        out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH];
+        out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2];
+        out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3];
+        out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4];
+        out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5];
+        out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6];
+        out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7];
+
+        in_ptr++; /* advance to the next word of every plane */
+        out_ptr++;
+    }
+}
+
+static void /* In place: out = T(out) ^ in, T = the fixed XOR map below. */
+gf8_muladd_68(void *out, void *in) /* NOTE(review): name suggests T is GF(2^8) multiply by 0x68 -- confirm. */
+{ /* Each buffer is indexed as 8 groups ("planes") of WIDTH uint64_t. */
+    unsigned int i;
+    uint64_t *in_ptr = (uint64_t *)in; /* addend; only read here */
+    uint64_t *out_ptr = (uint64_t *)out; /* operand on entry, result on exit */
+
+    for (i = 0; i < WIDTH; i++) { /* one word of every plane per pass */
+        uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+        uint64_t tmp0, tmp1, tmp2, tmp3; /* intermediate XOR terms */
+
+        uint64_t in0 = out_ptr[0]; /* inN is plane N of `out`, not `in` */
+        uint64_t in1 = out_ptr[WIDTH];
+        uint64_t in2 = out_ptr[WIDTH * 2];
+        uint64_t in3 = out_ptr[WIDTH * 3];
+        uint64_t in4 = out_ptr[WIDTH * 4];
+        uint64_t in5 = out_ptr[WIDTH * 5];
+        uint64_t in6 = out_ptr[WIDTH * 6];
+        uint64_t in7 = out_ptr[WIDTH * 7];
+
+        tmp0 = in3 ^ in4; /* order matters: later lines reuse earlier results */
+        tmp1 = in2 ^ in3 ^ in5;
+        tmp2 = tmp0 ^ in1;
+        tmp3 = tmp0 ^ in6;
+        out0 = tmp1 ^ in6;
+        out6 = tmp2 ^ in0;
+        out7 = tmp1 ^ tmp2;
+        out1 = tmp3 ^ in7;
+        out2 = out1 ^ in2;
+        out4 = tmp2 ^ out2;
+        out3 = out4 ^ out6 ^ in3;
+        out5 = tmp3 ^ out3;
+
+        out_ptr[0] = out0 ^ in_ptr[0]; /* XOR in plane N of `in`, store back */
+        out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH];
+        out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2];
+        out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3];
+        out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4];
+        out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5];
+        out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6];
+        out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7];
+
+        in_ptr++; /* advance to the next word of every plane */
+        out_ptr++;
+    }
+}
+
+static void /* In place: out = T(out) ^ in, T = the fixed XOR map below. */
+gf8_muladd_69(void *out, void *in) /* NOTE(review): name suggests T is GF(2^8) multiply by 0x69 -- confirm. */
+{ /* Each buffer is indexed as 8 groups ("planes") of WIDTH uint64_t. */
+    unsigned int i;
+    uint64_t *in_ptr = (uint64_t *)in; /* addend; only read here */
+    uint64_t *out_ptr = (uint64_t *)out; /* operand on entry, result on exit */
+
+    for (i = 0; i < WIDTH; i++) { /* one word of every plane per pass */
+        uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+        uint64_t tmp0; /* intermediate XOR term */
+
+        uint64_t in0 = out_ptr[0]; /* inN is plane N of `out`, not `in` */
+        uint64_t in1 = out_ptr[WIDTH];
+        uint64_t in2 = out_ptr[WIDTH * 2];
+        uint64_t in3 = out_ptr[WIDTH * 3];
+        uint64_t in4 = out_ptr[WIDTH * 4];
+        uint64_t in5 = out_ptr[WIDTH * 5];
+        uint64_t in6 = out_ptr[WIDTH * 6];
+        uint64_t in7 = out_ptr[WIDTH * 7];
+
+        tmp0 = in6 ^ in7; /* order matters: later lines reuse earlier results */
+        out2 = tmp0 ^ in3 ^ in4;
+        out1 = out2 ^ in1;
+        out3 = out2 ^ in0 ^ in2;
+        out4 = out1 ^ in2 ^ in3;
+        out6 = out1 ^ in0 ^ in7;
+        out7 = out4 ^ in5 ^ in6;
+        out5 = out4 ^ out6 ^ in5;
+        out0 = tmp0 ^ out5;
+
+        out_ptr[0] = out0 ^ in_ptr[0]; /* XOR in plane N of `in`, store back */
+        out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH];
+        out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2];
+        out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3];
+        out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4];
+        out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5];
+        out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6];
+        out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7];
+
+        in_ptr++; /* advance to the next word of every plane */
+        out_ptr++;
+    }
+}
+
+static void /* In place: out = T(out) ^ in, T = the fixed XOR map below. */
+gf8_muladd_6A(void *out, void *in) /* NOTE(review): name suggests T is GF(2^8) multiply by 0x6A -- confirm. */
+{ /* Each buffer is indexed as 8 groups ("planes") of WIDTH uint64_t. */
+    unsigned int i;
+    uint64_t *in_ptr = (uint64_t *)in; /* addend; only read here */
+    uint64_t *out_ptr = (uint64_t *)out; /* operand on entry, result on exit */
+
+    for (i = 0; i < WIDTH; i++) { /* one word of every plane per pass */
+        uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+        uint64_t tmp0, tmp1, tmp2; /* intermediate XOR terms */
+
+        uint64_t in0 = out_ptr[0]; /* inN is plane N of `out`, not `in` */
+        uint64_t in1 = out_ptr[WIDTH];
+        uint64_t in2 = out_ptr[WIDTH * 2];
+        uint64_t in3 = out_ptr[WIDTH * 3];
+        uint64_t in4 = out_ptr[WIDTH * 4];
+        uint64_t in5 = out_ptr[WIDTH * 5];
+        uint64_t in6 = out_ptr[WIDTH * 6];
+        uint64_t in7 = out_ptr[WIDTH * 7];
+
+        tmp0 = in2 ^ in6; /* order matters: later lines reuse earlier results */
+        out3 = in0 ^ in4 ^ in6;
+        tmp1 = tmp0 ^ in3;
+        out4 = tmp1 ^ in1;
+        tmp2 = tmp1 ^ in7;
+        out2 = out4 ^ in4;
+        out0 = tmp2 ^ in5;
+        out5 = tmp2 ^ out3;
+        out7 = out2 ^ in3 ^ in5;
+        out1 = tmp0 ^ out5;
+        out6 = tmp1 ^ out7 ^ in0;
+
+        out_ptr[0] = out0 ^ in_ptr[0]; /* XOR in plane N of `in`, store back */
+        out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH];
+        out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2];
+        out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3];
+        out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4];
+        out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5];
+        out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6];
+        out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7];
+
+        in_ptr++; /* advance to the next word of every plane */
+        out_ptr++;
+    }
+}
+
+static void /* In place: out = T(out) ^ in, T = the fixed XOR map below. */
+gf8_muladd_6B(void *out, void *in) /* NOTE(review): name suggests T is GF(2^8) multiply by 0x6B -- confirm. */
+{ /* Each buffer is indexed as 8 groups ("planes") of WIDTH uint64_t. */
+    unsigned int i;
+    uint64_t *in_ptr = (uint64_t *)in; /* addend; only read here */
+    uint64_t *out_ptr = (uint64_t *)out; /* operand on entry, result on exit */
+
+    for (i = 0; i < WIDTH; i++) { /* one word of every plane per pass */
+        uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+        uint64_t tmp0, tmp1; /* intermediate XOR terms */
+
+        uint64_t in0 = out_ptr[0]; /* inN is plane N of `out`, not `in` */
+        uint64_t in1 = out_ptr[WIDTH];
+        uint64_t in2 = out_ptr[WIDTH * 2];
+        uint64_t in3 = out_ptr[WIDTH * 3];
+        uint64_t in4 = out_ptr[WIDTH * 4];
+        uint64_t in5 = out_ptr[WIDTH * 5];
+        uint64_t in6 = out_ptr[WIDTH * 6];
+        uint64_t in7 = out_ptr[WIDTH * 7];
+
+        tmp0 = in4 ^ in6; /* order matters: later lines reuse earlier results */
+        out2 = tmp0 ^ in1 ^ in3;
+        out4 = out2 ^ in2;
+        tmp1 = out2 ^ in0;
+        out7 = out4 ^ in3 ^ in5 ^ in7;
+        out1 = tmp1 ^ in7;
+        out3 = tmp1 ^ in1;
+        out6 = tmp1 ^ in5;
+        out0 = tmp1 ^ out7 ^ in6;
+        out5 = tmp0 ^ out0;
+
+        out_ptr[0] = out0 ^ in_ptr[0]; /* XOR in plane N of `in`, store back */
+        out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH];
+        out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2];
+        out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3];
+        out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4];
+        out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5];
+        out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6];
+        out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7];
+
+        in_ptr++; /* advance to the next word of every plane */
+        out_ptr++;
+    }
+}
+
+static void /* In place: out = T(out) ^ in, T = the fixed XOR map below. */
+gf8_muladd_6C(void *out, void *in) /* NOTE(review): name suggests T is GF(2^8) multiply by 0x6C -- confirm. */
+{ /* Each buffer is indexed as 8 groups ("planes") of WIDTH uint64_t. */
+    unsigned int i;
+    uint64_t *in_ptr = (uint64_t *)in; /* addend; only read here */
+    uint64_t *out_ptr = (uint64_t *)out; /* operand on entry, result on exit */
+
+    for (i = 0; i < WIDTH; i++) { /* one word of every plane per pass */
+        uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+        uint64_t tmp0, tmp1; /* intermediate XOR terms */
+
+        uint64_t in0 = out_ptr[0]; /* inN is plane N of `out`, not `in` */
+        uint64_t in1 = out_ptr[WIDTH];
+        uint64_t in2 = out_ptr[WIDTH * 2];
+        uint64_t in3 = out_ptr[WIDTH * 3];
+        uint64_t in4 = out_ptr[WIDTH * 4];
+        uint64_t in5 = out_ptr[WIDTH * 5];
+        uint64_t in6 = out_ptr[WIDTH * 6];
+        uint64_t in7 = out_ptr[WIDTH * 7];
+
+        out4 = in1; /* order matters: later lines reuse earlier results */
+        tmp0 = in2 ^ in3;
+        out5 = in0 ^ in2;
+        out1 = in3 ^ in4 ^ in6;
+        tmp1 = out5 ^ in1;
+        out0 = tmp0 ^ in5;
+        out6 = tmp0 ^ tmp1;
+        out3 = tmp1 ^ in4;
+        out7 = out3 ^ in0;
+        out2 = out6 ^ out7 ^ in7;
+
+        out_ptr[0] = out0 ^ in_ptr[0]; /* XOR in plane N of `in`, store back */
+        out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH];
+        out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2];
+        out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3];
+        out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4];
+        out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5];
+        out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6];
+        out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7];
+
+        in_ptr++; /* advance to the next word of every plane */
+        out_ptr++;
+    }
+}
+
+static void /* In place: out = T(out) ^ in, T = the fixed XOR map below. */
+gf8_muladd_6D(void *out, void *in) /* NOTE(review): name suggests T is GF(2^8) multiply by 0x6D -- confirm. */
+{ /* Each buffer is indexed as 8 groups ("planes") of WIDTH uint64_t. */
+    unsigned int i;
+    uint64_t *in_ptr = (uint64_t *)in; /* addend; only read here */
+    uint64_t *out_ptr = (uint64_t *)out; /* operand on entry, result on exit */
+
+    for (i = 0; i < WIDTH; i++) { /* one word of every plane per pass */
+        uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+        uint64_t tmp0, tmp1; /* intermediate XOR terms */
+
+        uint64_t in0 = out_ptr[0]; /* inN is plane N of `out`, not `in` */
+        uint64_t in1 = out_ptr[WIDTH];
+        uint64_t in2 = out_ptr[WIDTH * 2];
+        uint64_t in3 = out_ptr[WIDTH * 3];
+        uint64_t in4 = out_ptr[WIDTH * 4];
+        uint64_t in5 = out_ptr[WIDTH * 5];
+        uint64_t in6 = out_ptr[WIDTH * 6];
+        uint64_t in7 = out_ptr[WIDTH * 7];
+
+        out4 = in1 ^ in4; /* order matters: later lines reuse earlier results */
+        tmp0 = in0 ^ in2;
+        tmp1 = out4 ^ in3;
+        out7 = out4 ^ in2 ^ in7;
+        out5 = tmp0 ^ in5;
+        out3 = tmp0 ^ tmp1;
+        out1 = tmp1 ^ in6;
+        out0 = out5 ^ in3;
+        out2 = out3 ^ out7 ^ in4;
+        out6 = out1 ^ in0 ^ in4;
+
+        out_ptr[0] = out0 ^ in_ptr[0]; /* XOR in plane N of `in`, store back */
+        out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH];
+        out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2];
+        out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3];
+        out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4];
+        out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5];
+        out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6];
+        out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7];
+
+        in_ptr++; /* advance to the next word of every plane */
+        out_ptr++;
+    }
+}
+
+static void /* In place: out = T(out) ^ in, T = the fixed XOR map below. */
+gf8_muladd_6E(void *out, void *in) /* NOTE(review): name suggests T is GF(2^8) multiply by 0x6E -- confirm. */
+{ /* Each buffer is indexed as 8 groups ("planes") of WIDTH uint64_t. */
+    unsigned int i;
+    uint64_t *in_ptr = (uint64_t *)in; /* addend; only read here */
+    uint64_t *out_ptr = (uint64_t *)out; /* operand on entry, result on exit */
+
+    for (i = 0; i < WIDTH; i++) { /* one word of every plane per pass */
+        uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+        uint64_t tmp0, tmp1, tmp2; /* intermediate XOR terms */
+
+        uint64_t in0 = out_ptr[0]; /* inN is plane N of `out`, not `in` */
+        uint64_t in1 = out_ptr[WIDTH];
+        uint64_t in2 = out_ptr[WIDTH * 2];
+        uint64_t in3 = out_ptr[WIDTH * 3];
+        uint64_t in4 = out_ptr[WIDTH * 4];
+        uint64_t in5 = out_ptr[WIDTH * 5];
+        uint64_t in6 = out_ptr[WIDTH * 6];
+        uint64_t in7 = out_ptr[WIDTH * 7];
+
+        tmp0 = in1 ^ in3; /* order matters: later lines reuse earlier results */
+        tmp1 = in0 ^ in4;
+        out4 = tmp0 ^ in7;
+        out6 = tmp0 ^ in0 ^ in5;
+        out5 = tmp1 ^ in2;
+        tmp2 = tmp1 ^ in3;
+        out3 = tmp2 ^ out4;
+        out1 = tmp2 ^ in6;
+        out2 = tmp0 ^ out5;
+        out0 = out2 ^ out3 ^ in5;
+        out7 = out1 ^ out2 ^ in4;
+
+        out_ptr[0] = out0 ^ in_ptr[0]; /* XOR in plane N of `in`, store back */
+        out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH];
+        out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2];
+        out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3];
+        out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4];
+        out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5];
+        out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6];
+        out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7];
+
+        in_ptr++; /* advance to the next word of every plane */
+        out_ptr++;
+    }
+}
+
+static void /* In place: out = T(out) ^ in, T = the fixed XOR map below. */
+gf8_muladd_6F(void *out, void *in) /* NOTE(review): name suggests T is GF(2^8) multiply by 0x6F -- confirm. */
+{ /* Each buffer is indexed as 8 groups ("planes") of WIDTH uint64_t. */
+    unsigned int i;
+    uint64_t *in_ptr = (uint64_t *)in; /* addend; only read here */
+    uint64_t *out_ptr = (uint64_t *)out; /* operand on entry, result on exit */
+
+    for (i = 0; i < WIDTH; i++) { /* one word of every plane per pass */
+        uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+        uint64_t tmp0, tmp1, tmp2; /* intermediate XOR terms */
+
+        uint64_t in0 = out_ptr[0]; /* inN is plane N of `out`, not `in` */
+        uint64_t in1 = out_ptr[WIDTH];
+        uint64_t in2 = out_ptr[WIDTH * 2];
+        uint64_t in3 = out_ptr[WIDTH * 3];
+        uint64_t in4 = out_ptr[WIDTH * 4];
+        uint64_t in5 = out_ptr[WIDTH * 5];
+        uint64_t in6 = out_ptr[WIDTH * 6];
+        uint64_t in7 = out_ptr[WIDTH * 7];
+
+        tmp0 = in3 ^ in7; /* order matters: later lines reuse earlier results */
+        tmp1 = tmp0 ^ in4;
+        tmp2 = tmp0 ^ in0 ^ in2;
+        out4 = tmp1 ^ in1;
+        out0 = tmp2 ^ in5;
+        out3 = out4 ^ in0;
+        out2 = out3 ^ in7;
+        out1 = out2 ^ in6;
+        out6 = out1 ^ in4 ^ in5;
+        out7 = tmp2 ^ out1;
+        out5 = tmp1 ^ out0;
+
+        out_ptr[0] = out0 ^ in_ptr[0]; /* XOR in plane N of `in`, store back */
+        out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH];
+        out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2];
+        out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3];
+        out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4];
+        out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5];
+        out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6];
+        out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7];
+
+        in_ptr++; /* advance to the next word of every plane */
+        out_ptr++;
+    }
+}
+
+static void /* In place: out = T(out) ^ in, T = the fixed XOR map below. */
+gf8_muladd_70(void *out, void *in) /* NOTE(review): name suggests T is GF(2^8) multiply by 0x70 -- confirm. */
+{ /* Each buffer is indexed as 8 groups ("planes") of WIDTH uint64_t. */
+    unsigned int i;
+    uint64_t *in_ptr = (uint64_t *)in; /* addend; only read here */
+    uint64_t *out_ptr = (uint64_t *)out; /* operand on entry, result on exit */
+
+    for (i = 0; i < WIDTH; i++) { /* one word of every plane per pass */
+        uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+        uint64_t tmp0, tmp1, tmp2; /* intermediate XOR terms */
+
+        uint64_t in0 = out_ptr[0]; /* inN is plane N of `out`, not `in` */
+        uint64_t in1 = out_ptr[WIDTH];
+        uint64_t in2 = out_ptr[WIDTH * 2];
+        uint64_t in3 = out_ptr[WIDTH * 3];
+        uint64_t in4 = out_ptr[WIDTH * 4];
+        uint64_t in5 = out_ptr[WIDTH * 5];
+        uint64_t in6 = out_ptr[WIDTH * 6];
+        uint64_t in7 = out_ptr[WIDTH * 7];
+
+        out3 = in2; /* order matters: later lines reuse earlier results */
+        tmp0 = in2 ^ in4;
+        out2 = in2 ^ in3 ^ in5;
+        tmp1 = tmp0 ^ in6;
+        tmp2 = out2 ^ in7;
+        out0 = tmp1 ^ in3;
+        out4 = tmp1 ^ in0;
+        out7 = tmp2 ^ in1;
+        out6 = out4 ^ in1;
+        out5 = out7 ^ in0 ^ in2;
+        out1 = tmp0 ^ tmp2;
+
+        out_ptr[0] = out0 ^ in_ptr[0]; /* XOR in plane N of `in`, store back */
+        out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH];
+        out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2];
+        out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3];
+        out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4];
+        out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5];
+        out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6];
+        out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7];
+
+        in_ptr++; /* advance to the next word of every plane */
+        out_ptr++;
+    }
+}
+
+static void /* In place: out = T(out) ^ in, T = the fixed XOR map below. */
+gf8_muladd_71(void *out, void *in) /* NOTE(review): name suggests T is GF(2^8) multiply by 0x71 -- confirm. */
+{ /* Each buffer is indexed as 8 groups ("planes") of WIDTH uint64_t. */
+    unsigned int i;
+    uint64_t *in_ptr = (uint64_t *)in; /* addend; only read here */
+    uint64_t *out_ptr = (uint64_t *)out; /* operand on entry, result on exit */
+
+    for (i = 0; i < WIDTH; i++) { /* one word of every plane per pass */
+        uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+        uint64_t tmp0, tmp1, tmp2; /* intermediate XOR terms */
+
+        uint64_t in0 = out_ptr[0]; /* inN is plane N of `out`, not `in` */
+        uint64_t in1 = out_ptr[WIDTH];
+        uint64_t in2 = out_ptr[WIDTH * 2];
+        uint64_t in3 = out_ptr[WIDTH * 3];
+        uint64_t in4 = out_ptr[WIDTH * 4];
+        uint64_t in5 = out_ptr[WIDTH * 5];
+        uint64_t in6 = out_ptr[WIDTH * 6];
+        uint64_t in7 = out_ptr[WIDTH * 7];
+
+        out2 = in3 ^ in5; /* order matters: later lines reuse earlier results */
+        out3 = in2 ^ in3;
+        tmp0 = in0 ^ in2;
+        tmp1 = out2 ^ in1;
+        out4 = tmp0 ^ in6;
+        tmp2 = tmp0 ^ in1;
+        out7 = tmp1 ^ in2;
+        out1 = tmp1 ^ in4 ^ in7;
+        out0 = out4 ^ in3 ^ in4;
+        out6 = tmp2 ^ in4;
+        out5 = tmp2 ^ out3 ^ in7;
+
+        out_ptr[0] = out0 ^ in_ptr[0]; /* XOR in plane N of `in`, store back */
+        out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH];
+        out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2];
+        out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3];
+        out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4];
+        out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5];
+        out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6];
+        out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7];
+
+        in_ptr++; /* advance to the next word of every plane */
+        out_ptr++;
+    }
+}
+
+static void /* In place: out = T(out) ^ in, T = the fixed XOR map below. */
+gf8_muladd_72(void *out, void *in) /* NOTE(review): name suggests T is GF(2^8) multiply by 0x72 -- confirm. */
+{ /* Each buffer is indexed as 8 groups ("planes") of WIDTH uint64_t. */
+    unsigned int i;
+    uint64_t *in_ptr = (uint64_t *)in; /* addend; only read here */
+    uint64_t *out_ptr = (uint64_t *)out; /* operand on entry, result on exit */
+
+    for (i = 0; i < WIDTH; i++) { /* one word of every plane per pass */
+        uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+        uint64_t tmp0, tmp1, tmp2; /* intermediate XOR terms */
+
+        uint64_t in0 = out_ptr[0]; /* inN is plane N of `out`, not `in` */
+        uint64_t in1 = out_ptr[WIDTH];
+        uint64_t in2 = out_ptr[WIDTH * 2];
+        uint64_t in3 = out_ptr[WIDTH * 3];
+        uint64_t in4 = out_ptr[WIDTH * 4];
+        uint64_t in5 = out_ptr[WIDTH * 5];
+        uint64_t in6 = out_ptr[WIDTH * 6];
+        uint64_t in7 = out_ptr[WIDTH * 7];
+
+        out3 = in7; /* order matters: later lines reuse earlier results */
+        tmp0 = in0 ^ in4;
+        tmp1 = tmp0 ^ in3 ^ in7;
+        out1 = tmp1 ^ in5;
+        out5 = out1 ^ in1;
+        tmp2 = tmp0 ^ out5;
+        out2 = tmp2 ^ in2;
+        out7 = out2 ^ in6;
+        out6 = tmp1 ^ out7;
+        out4 = tmp2 ^ out6;
+        out0 = out4 ^ in0;
+
+        out_ptr[0] = out0 ^ in_ptr[0]; /* XOR in plane N of `in`, store back */
+        out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH];
+        out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2];
+        out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3];
+        out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4];
+        out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5];
+        out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6];
+        out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7];
+
+        in_ptr++; /* advance to the next word of every plane */
+        out_ptr++;
+    }
+}
+
+static void /* Rewrites out in place: each row becomes a fixed XOR mix of out's rows, then is XORed with the same row of in. */
+gf8_muladd_73(void *out, void *in) /* out: 8 rows of WIDTH uint64_t words, read then overwritten; in: same layout, only read. */
+{ /* NOTE(review): the name suggests the mix is multiplication by 0x73 in GF(2^8) - not re-derived here, confirm. */
+    unsigned int i;
+    uint64_t *in_ptr = (uint64_t *)in;
+    uint64_t *out_ptr = (uint64_t *)out;
+
+    for (i = 0; i < WIDTH; i++) { /* one iteration per word column; row r lives at offset WIDTH * r */
+        uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+
+        uint64_t in0 = out_ptr[0]; /* in0..in7 are the current contents of out, not of in */
+        uint64_t in1 = out_ptr[WIDTH];
+        uint64_t in2 = out_ptr[WIDTH * 2];
+        uint64_t in3 = out_ptr[WIDTH * 3];
+        uint64_t in4 = out_ptr[WIDTH * 4];
+        uint64_t in5 = out_ptr[WIDTH * 5];
+        uint64_t in6 = out_ptr[WIDTH * 6];
+        uint64_t in7 = out_ptr[WIDTH * 7];
+
+        out3 = in3 ^ in7; /* order matters: later outputs are chained from earlier ones */
+        out2 = out3 ^ in1 ^ in5;
+        out1 = out2 ^ in0 ^ in4;
+        out5 = out1 ^ in5;
+        out6 = out1 ^ out3 ^ in2;
+        out0 = out2 ^ out6 ^ in6;
+        out7 = out0 ^ out1 ^ in3;
+        out4 = out0 ^ in4;
+
+        out_ptr[0] = out0 ^ in_ptr[0]; /* accumulate: mixed row XOR the matching row of in */
+        out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH];
+        out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2];
+        out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3];
+        out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4];
+        out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5];
+        out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6];
+        out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7];
+
+        in_ptr++; /* step both cursors to the next word column */
+        out_ptr++;
+    }
+}
+
+static void /* Rewrites out in place: each row becomes a fixed XOR mix of out's rows, then is XORed with the same row of in. */
+gf8_muladd_74(void *out, void *in) /* out: 8 rows of WIDTH uint64_t words, read then overwritten; in: same layout, only read. */
+{ /* NOTE(review): the name suggests the mix is multiplication by 0x74 in GF(2^8) - not re-derived here, confirm. */
+    unsigned int i;
+    uint64_t *in_ptr = (uint64_t *)in;
+    uint64_t *out_ptr = (uint64_t *)out;
+
+    for (i = 0; i < WIDTH; i++) { /* one iteration per word column; row r lives at offset WIDTH * r */
+        uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+        uint64_t tmp0, tmp1; /* shared sub-expressions reused by several outputs */
+
+        uint64_t in0 = out_ptr[0]; /* in0..in7 are the current contents of out, not of in */
+        uint64_t in1 = out_ptr[WIDTH];
+        uint64_t in2 = out_ptr[WIDTH * 2];
+        uint64_t in3 = out_ptr[WIDTH * 3];
+        uint64_t in4 = out_ptr[WIDTH * 4];
+        uint64_t in5 = out_ptr[WIDTH * 5];
+        uint64_t in6 = out_ptr[WIDTH * 6];
+        uint64_t in7 = out_ptr[WIDTH * 7];
+
+        tmp0 = in3 ^ in4; /* order matters: later lines reuse earlier outN/tmpN values */
+        tmp1 = in1 ^ in2 ^ in6;
+        out4 = in0 ^ in4 ^ in7;
+        out5 = in0 ^ in1 ^ in5;
+        out0 = tmp0 ^ in2;
+        out1 = tmp0 ^ in5;
+        out3 = tmp1 ^ in7;
+        out6 = tmp1 ^ in0;
+        out2 = tmp1 ^ out5 ^ in3;
+        out7 = out3 ^ in3 ^ in6;
+
+        out_ptr[0] = out0 ^ in_ptr[0]; /* accumulate: mixed row XOR the matching row of in */
+        out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH];
+        out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2];
+        out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3];
+        out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4];
+        out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5];
+        out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6];
+        out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7];
+
+        in_ptr++; /* step both cursors to the next word column */
+        out_ptr++;
+    }
+}
+
+static void /* Rewrites out in place: each row becomes a fixed XOR mix of out's rows, then is XORed with the same row of in. */
+gf8_muladd_75(void *out, void *in) /* out: 8 rows of WIDTH uint64_t words, read then overwritten; in: same layout, only read. */
+{ /* NOTE(review): the name suggests the mix is multiplication by 0x75 in GF(2^8) - not re-derived here, confirm. */
+    unsigned int i;
+    uint64_t *in_ptr = (uint64_t *)in;
+    uint64_t *out_ptr = (uint64_t *)out;
+
+    for (i = 0; i < WIDTH; i++) { /* one iteration per word column; row r lives at offset WIDTH * r */
+        uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+        uint64_t tmp0, tmp1, tmp2; /* shared sub-expressions reused by several outputs */
+
+        uint64_t in0 = out_ptr[0]; /* in0..in7 are the current contents of out, not of in */
+        uint64_t in1 = out_ptr[WIDTH];
+        uint64_t in2 = out_ptr[WIDTH * 2];
+        uint64_t in3 = out_ptr[WIDTH * 3];
+        uint64_t in4 = out_ptr[WIDTH * 4];
+        uint64_t in5 = out_ptr[WIDTH * 5];
+        uint64_t in6 = out_ptr[WIDTH * 6];
+        uint64_t in7 = out_ptr[WIDTH * 7];
+
+        out4 = in0 ^ in7; /* order matters: later lines reuse earlier outN/tmpN values */
+        tmp0 = in1 ^ in3;
+        out5 = in0 ^ in1;
+        out7 = tmp0 ^ in2;
+        tmp1 = tmp0 ^ in4;
+        out6 = out5 ^ in2;
+        tmp2 = out7 ^ in6;
+        out1 = tmp1 ^ in5;
+        out0 = tmp1 ^ out6;
+        out3 = tmp2 ^ in7;
+        out2 = tmp2 ^ out6 ^ in5;
+
+        out_ptr[0] = out0 ^ in_ptr[0]; /* accumulate: mixed row XOR the matching row of in */
+        out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH];
+        out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2];
+        out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3];
+        out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4];
+        out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5];
+        out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6];
+        out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7];
+
+        in_ptr++; /* step both cursors to the next word column */
+        out_ptr++;
+    }
+}
+
+static void /* Rewrites out in place: each row becomes a fixed XOR mix of out's rows, then is XORed with the same row of in. */
+gf8_muladd_76(void *out, void *in) /* out: 8 rows of WIDTH uint64_t words, read then overwritten; in: same layout, only read. */
+{ /* NOTE(review): the name suggests the mix is multiplication by 0x76 in GF(2^8) - not re-derived here, confirm. */
+    unsigned int i;
+    uint64_t *in_ptr = (uint64_t *)in;
+    uint64_t *out_ptr = (uint64_t *)out;
+
+    for (i = 0; i < WIDTH; i++) { /* one iteration per word column; row r lives at offset WIDTH * r */
+        uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+        uint64_t tmp0, tmp1, tmp2, tmp3; /* shared sub-expressions reused by several outputs */
+
+        uint64_t in0 = out_ptr[0]; /* in0..in7 are the current contents of out, not of in */
+        uint64_t in1 = out_ptr[WIDTH];
+        uint64_t in2 = out_ptr[WIDTH * 2];
+        uint64_t in3 = out_ptr[WIDTH * 3];
+        uint64_t in4 = out_ptr[WIDTH * 4];
+        uint64_t in5 = out_ptr[WIDTH * 5];
+        uint64_t in6 = out_ptr[WIDTH * 6];
+        uint64_t in7 = out_ptr[WIDTH * 7];
+
+        out3 = in1 ^ in6; /* order matters: later lines reuse earlier outN/tmpN values */
+        tmp0 = in0 ^ in5;
+        tmp1 = in3 ^ in7;
+        tmp2 = tmp0 ^ in4;
+        tmp3 = tmp1 ^ in2;
+        out5 = tmp2 ^ in1;
+        out1 = tmp2 ^ in3;
+        out0 = tmp3 ^ in4;
+        out4 = out1 ^ in5;
+        out7 = tmp3 ^ out3;
+        out2 = tmp0 ^ out7;
+        out6 = tmp1 ^ out2;
+
+        out_ptr[0] = out0 ^ in_ptr[0]; /* accumulate: mixed row XOR the matching row of in */
+        out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH];
+        out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2];
+        out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3];
+        out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4];
+        out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5];
+        out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6];
+        out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7];
+
+        in_ptr++; /* step both cursors to the next word column */
+        out_ptr++;
+    }
+}
+
+static void /* Rewrites out in place: each row becomes a fixed XOR mix of out's rows, then is XORed with the same row of in. */
+gf8_muladd_77(void *out, void *in) /* out: 8 rows of WIDTH uint64_t words, read then overwritten; in: same layout, only read. */
+{ /* NOTE(review): the name suggests the mix is multiplication by 0x77 in GF(2^8) - not re-derived here, confirm. */
+    unsigned int i;
+    uint64_t *in_ptr = (uint64_t *)in;
+    uint64_t *out_ptr = (uint64_t *)out;
+
+    for (i = 0; i < WIDTH; i++) { /* one iteration per word column; row r lives at offset WIDTH * r */
+        uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+        uint64_t tmp0, tmp1, tmp2, tmp3; /* shared sub-expressions reused by several outputs */
+
+        uint64_t in0 = out_ptr[0]; /* in0..in7 are the current contents of out, not of in */
+        uint64_t in1 = out_ptr[WIDTH];
+        uint64_t in2 = out_ptr[WIDTH * 2];
+        uint64_t in3 = out_ptr[WIDTH * 3];
+        uint64_t in4 = out_ptr[WIDTH * 4];
+        uint64_t in5 = out_ptr[WIDTH * 5];
+        uint64_t in6 = out_ptr[WIDTH * 6];
+        uint64_t in7 = out_ptr[WIDTH * 7];
+
+        out4 = in0 ^ in3; /* order matters: later lines reuse earlier outN/tmpN values */
+        tmp0 = in1 ^ in4;
+        tmp1 = in1 ^ in6;
+        tmp2 = out4 ^ in5;
+        out5 = tmp0 ^ in0;
+        out1 = tmp0 ^ tmp2;
+        out3 = tmp1 ^ in3;
+        out2 = tmp1 ^ tmp2 ^ in7;
+        out7 = out3 ^ in2;
+        tmp3 = out7 ^ in6;
+        out6 = tmp2 ^ tmp3;
+        out0 = tmp3 ^ out5 ^ in7;
+
+        out_ptr[0] = out0 ^ in_ptr[0]; /* accumulate: mixed row XOR the matching row of in */
+        out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH];
+        out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2];
+        out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3];
+        out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4];
+        out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5];
+        out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6];
+        out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7];
+
+        in_ptr++; /* step both cursors to the next word column */
+        out_ptr++;
+    }
+}
+
+static void /* Rewrites out in place: each row becomes a fixed XOR mix of out's rows, then is XORed with the same row of in. */
+gf8_muladd_78(void *out, void *in) /* out: 8 rows of WIDTH uint64_t words, read then overwritten; in: same layout, only read. */
+{ /* NOTE(review): the name suggests the mix is multiplication by 0x78 in GF(2^8) - not re-derived here, confirm. */
+    unsigned int i;
+    uint64_t *in_ptr = (uint64_t *)in;
+    uint64_t *out_ptr = (uint64_t *)out;
+
+    for (i = 0; i < WIDTH; i++) { /* one iteration per word column; row r lives at offset WIDTH * r */
+        uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+        uint64_t tmp0, tmp1, tmp2; /* shared sub-expressions reused by several outputs */
+
+        uint64_t in0 = out_ptr[0]; /* in0..in7 are the current contents of out, not of in */
+        uint64_t in1 = out_ptr[WIDTH];
+        uint64_t in2 = out_ptr[WIDTH * 2];
+        uint64_t in3 = out_ptr[WIDTH * 3];
+        uint64_t in4 = out_ptr[WIDTH * 4];
+        uint64_t in5 = out_ptr[WIDTH * 5];
+        uint64_t in6 = out_ptr[WIDTH * 6];
+        uint64_t in7 = out_ptr[WIDTH * 7];
+
+        tmp0 = in0 ^ in3; /* order matters: later lines reuse earlier outN/tmpN values */
+        tmp1 = in2 ^ in7;
+        tmp2 = in0 ^ in5 ^ in6;
+        out2 = tmp1 ^ in3;
+        out3 = tmp2 ^ in2;
+        out5 = out3 ^ in1 ^ in3;
+        out0 = tmp0 ^ out3 ^ in4;
+        out1 = tmp1 ^ out0;
+        out4 = out1 ^ out5 ^ in5;
+        out7 = tmp0 ^ out4;
+        out6 = tmp2 ^ out7;
+
+        out_ptr[0] = out0 ^ in_ptr[0]; /* accumulate: mixed row XOR the matching row of in */
+        out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH];
+        out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2];
+        out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3];
+        out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4];
+        out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5];
+        out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6];
+        out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7];
+
+        in_ptr++; /* step both cursors to the next word column */
+        out_ptr++;
+    }
+}
+
+static void /* Rewrites out in place: each row becomes a fixed XOR mix of out's rows, then is XORed with the same row of in. */
+gf8_muladd_79(void *out, void *in) /* out: 8 rows of WIDTH uint64_t words, read then overwritten; in: same layout, only read. */
+{ /* NOTE(review): the name suggests the mix is multiplication by 0x79 in GF(2^8) - not re-derived here, confirm. */
+    unsigned int i;
+    uint64_t *in_ptr = (uint64_t *)in;
+    uint64_t *out_ptr = (uint64_t *)out;
+
+    for (i = 0; i < WIDTH; i++) { /* one iteration per word column; row r lives at offset WIDTH * r */
+        uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+        uint64_t tmp0, tmp1, tmp2, tmp3; /* shared sub-expressions reused by several outputs */
+
+        uint64_t in0 = out_ptr[0]; /* in0..in7 are the current contents of out, not of in */
+        uint64_t in1 = out_ptr[WIDTH];
+        uint64_t in2 = out_ptr[WIDTH * 2];
+        uint64_t in3 = out_ptr[WIDTH * 3];
+        uint64_t in4 = out_ptr[WIDTH * 4];
+        uint64_t in5 = out_ptr[WIDTH * 5];
+        uint64_t in6 = out_ptr[WIDTH * 6];
+        uint64_t in7 = out_ptr[WIDTH * 7];
+
+        out2 = in3 ^ in7; /* order matters: later lines reuse earlier outN/tmpN values */
+        tmp0 = in3 ^ in4;
+        tmp1 = in1 ^ in5;
+        tmp2 = tmp1 ^ in2;
+        out4 = tmp2 ^ in0 ^ in7;
+        tmp3 = out4 ^ in5;
+        out5 = tmp3 ^ out2 ^ in6;
+        out7 = tmp0 ^ tmp2;
+        out6 = tmp0 ^ tmp3;
+        out3 = tmp1 ^ out5;
+        out0 = out3 ^ in4;
+        out1 = tmp3 ^ out0;
+
+        out_ptr[0] = out0 ^ in_ptr[0]; /* accumulate: mixed row XOR the matching row of in */
+        out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH];
+        out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2];
+        out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3];
+        out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4];
+        out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5];
+        out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6];
+        out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7];
+
+        in_ptr++; /* step both cursors to the next word column */
+        out_ptr++;
+    }
+}
+
+static void /* Rewrites out in place: each row becomes a fixed XOR mix of out's rows, then is XORed with the same row of in. */
+gf8_muladd_7A(void *out, void *in) /* out: 8 rows of WIDTH uint64_t words, read then overwritten; in: same layout, only read. */
+{ /* NOTE(review): the name suggests the mix is multiplication by 0x7A in GF(2^8) - not re-derived here, confirm. */
+    unsigned int i;
+    uint64_t *in_ptr = (uint64_t *)in;
+    uint64_t *out_ptr = (uint64_t *)out;
+
+    for (i = 0; i < WIDTH; i++) { /* one iteration per word column; row r lives at offset WIDTH * r */
+        uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+        uint64_t tmp0, tmp1; /* shared sub-expressions reused by several outputs */
+
+        uint64_t in0 = out_ptr[0]; /* in0..in7 are the current contents of out, not of in */
+        uint64_t in1 = out_ptr[WIDTH];
+        uint64_t in2 = out_ptr[WIDTH * 2];
+        uint64_t in3 = out_ptr[WIDTH * 3];
+        uint64_t in4 = out_ptr[WIDTH * 4];
+        uint64_t in5 = out_ptr[WIDTH * 5];
+        uint64_t in6 = out_ptr[WIDTH * 6];
+        uint64_t in7 = out_ptr[WIDTH * 7];
+
+        tmp0 = in1 ^ in2; /* order matters: later lines reuse earlier outN/tmpN values */
+        out2 = tmp0 ^ in3;
+        tmp1 = out2 ^ in4;
+        out4 = tmp1 ^ in0 ^ in5;
+        out5 = out4 ^ in6;
+        out6 = out5 ^ in7;
+        out7 = out6 ^ in0;
+        out0 = out7 ^ in1;
+        out1 = tmp0 ^ out6;
+        out3 = tmp1 ^ out6;
+
+        out_ptr[0] = out0 ^ in_ptr[0]; /* accumulate: mixed row XOR the matching row of in */
+        out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH];
+        out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2];
+        out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3];
+        out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4];
+        out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5];
+        out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6];
+        out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7];
+
+        in_ptr++; /* step both cursors to the next word column */
+        out_ptr++;
+    }
+}
+
+static void /* Rewrites out in place: each row becomes a fixed XOR mix of out's rows, then is XORed with the same row of in. */
+gf8_muladd_7B(void *out, void *in) /* out: 8 rows of WIDTH uint64_t words, read then overwritten; in: same layout, only read. */
+{ /* NOTE(review): the name suggests the mix is multiplication by 0x7B in GF(2^8) - not re-derived here, confirm. */
+    unsigned int i;
+    uint64_t *in_ptr = (uint64_t *)in;
+    uint64_t *out_ptr = (uint64_t *)out;
+
+    for (i = 0; i < WIDTH; i++) { /* one iteration per word column; row r lives at offset WIDTH * r */
+        uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+        uint64_t tmp0, tmp1, tmp2; /* shared sub-expressions reused by several outputs */
+
+        uint64_t in0 = out_ptr[0]; /* in0..in7 are the current contents of out, not of in */
+        uint64_t in1 = out_ptr[WIDTH];
+        uint64_t in2 = out_ptr[WIDTH * 2];
+        uint64_t in3 = out_ptr[WIDTH * 3];
+        uint64_t in4 = out_ptr[WIDTH * 4];
+        uint64_t in5 = out_ptr[WIDTH * 5];
+        uint64_t in6 = out_ptr[WIDTH * 6];
+        uint64_t in7 = out_ptr[WIDTH * 7];
+
+        out2 = in1 ^ in3; /* order matters: later lines reuse earlier outN/tmpN values */
+        tmp0 = in0 ^ in5;
+        out4 = tmp0 ^ out2 ^ in2;
+        tmp1 = out4 ^ in4;
+        out6 = tmp1 ^ in7;
+        out5 = tmp1 ^ in5 ^ in6;
+        out0 = out6 ^ in1 ^ in6;
+        tmp2 = out0 ^ in2;
+        out1 = tmp2 ^ in1;
+        out3 = tmp2 ^ in4;
+        out7 = tmp0 ^ out5;
+
+        out_ptr[0] = out0 ^ in_ptr[0]; /* accumulate: mixed row XOR the matching row of in */
+        out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH];
+        out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2];
+        out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3];
+        out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4];
+        out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5];
+        out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6];
+        out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7];
+
+        in_ptr++; /* step both cursors to the next word column */
+        out_ptr++;
+    }
+}
+
+static void /* Rewrites out in place: each row becomes a fixed XOR mix of out's rows, then is XORed with the same row of in. */
+gf8_muladd_7C(void *out, void *in) /* out: 8 rows of WIDTH uint64_t words, read then overwritten; in: same layout, only read. */
+{ /* NOTE(review): the name suggests the mix is multiplication by 0x7C in GF(2^8) - not re-derived here, confirm. */
+    unsigned int i;
+    uint64_t *in_ptr = (uint64_t *)in;
+    uint64_t *out_ptr = (uint64_t *)out;
+
+    for (i = 0; i < WIDTH; i++) { /* one iteration per word column; row r lives at offset WIDTH * r */
+        uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+        uint64_t tmp0, tmp1; /* shared sub-expressions reused by several outputs */
+
+        uint64_t in0 = out_ptr[0]; /* in0..in7 are the current contents of out, not of in */
+        uint64_t in1 = out_ptr[WIDTH];
+        uint64_t in2 = out_ptr[WIDTH * 2];
+        uint64_t in3 = out_ptr[WIDTH * 3];
+        uint64_t in4 = out_ptr[WIDTH * 4];
+        uint64_t in5 = out_ptr[WIDTH * 5];
+        uint64_t in6 = out_ptr[WIDTH * 6];
+        uint64_t in7 = out_ptr[WIDTH * 7];
+
+        tmp0 = in3 ^ in5; /* order matters: later lines reuse earlier outN/tmpN values */
+        tmp1 = tmp0 ^ in4;
+        out0 = tmp1 ^ in2;
+        out1 = tmp1 ^ in6;
+        out7 = out0 ^ in1 ^ in5 ^ in7;
+        out5 = out1 ^ out7 ^ in0;
+        out3 = out5 ^ in6;
+        out6 = tmp0 ^ out5;
+        out2 = out6 ^ in1;
+        out4 = out2 ^ out7 ^ in5;
+
+        out_ptr[0] = out0 ^ in_ptr[0]; /* accumulate: mixed row XOR the matching row of in */
+        out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH];
+        out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2];
+        out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3];
+        out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4];
+        out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5];
+        out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6];
+        out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7];
+
+        in_ptr++; /* step both cursors to the next word column */
+        out_ptr++;
+    }
+}
+
+static void /* Rewrites out in place: each row becomes a fixed XOR mix of out's rows, then is XORed with the same row of in. */
+gf8_muladd_7D(void *out, void *in) /* out: 8 rows of WIDTH uint64_t words, read then overwritten; in: same layout, only read. */
+{ /* NOTE(review): the name suggests the mix is multiplication by 0x7D in GF(2^8) - not re-derived here, confirm. */
+    unsigned int i;
+    uint64_t *in_ptr = (uint64_t *)in;
+    uint64_t *out_ptr = (uint64_t *)out;
+
+    for (i = 0; i < WIDTH; i++) { /* one iteration per word column; row r lives at offset WIDTH * r */
+        uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+        uint64_t tmp0, tmp1, tmp2, tmp3; /* shared sub-expressions reused by several outputs */
+
+        uint64_t in0 = out_ptr[0]; /* in0..in7 are the current contents of out, not of in */
+        uint64_t in1 = out_ptr[WIDTH];
+        uint64_t in2 = out_ptr[WIDTH * 2];
+        uint64_t in3 = out_ptr[WIDTH * 3];
+        uint64_t in4 = out_ptr[WIDTH * 4];
+        uint64_t in5 = out_ptr[WIDTH * 5];
+        uint64_t in6 = out_ptr[WIDTH * 6];
+        uint64_t in7 = out_ptr[WIDTH * 7];
+
+        tmp0 = in1 ^ in2; /* order matters: later lines reuse earlier outN/tmpN values */
+        tmp1 = tmp0 ^ in3;
+        tmp2 = tmp0 ^ in6;
+        out7 = tmp1 ^ in4;
+        tmp3 = tmp2 ^ in0;
+        out5 = tmp3 ^ in7;
+        out4 = tmp3 ^ in2 ^ in5;
+        out2 = tmp1 ^ out5;
+        out6 = tmp2 ^ out2;
+        out0 = out4 ^ out7 ^ in6;
+        out1 = tmp3 ^ out0;
+        out3 = out6 ^ in5;
+
+        out_ptr[0] = out0 ^ in_ptr[0]; /* accumulate: mixed row XOR the matching row of in */
+        out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH];
+        out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2];
+        out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3];
+        out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4];
+        out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5];
+        out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6];
+        out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7];
+
+        in_ptr++; /* step both cursors to the next word column */
+        out_ptr++;
+    }
+}
+
+static void /* Rewrites out in place: each row becomes a fixed XOR mix of out's rows, then is XORed with the same row of in. */
+gf8_muladd_7E(void *out, void *in) /* out: 8 rows of WIDTH uint64_t words, read then overwritten; in: same layout, only read. */
+{ /* NOTE(review): the name suggests the mix is multiplication by 0x7E in GF(2^8) - not re-derived here, confirm. */
+    unsigned int i;
+    uint64_t *in_ptr = (uint64_t *)in;
+    uint64_t *out_ptr = (uint64_t *)out;
+
+    for (i = 0; i < WIDTH; i++) { /* one iteration per word column; row r lives at offset WIDTH * r */
+        uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+        uint64_t tmp0, tmp1, tmp2; /* shared sub-expressions reused by several outputs */
+
+        uint64_t in0 = out_ptr[0]; /* in0..in7 are the current contents of out, not of in */
+        uint64_t in1 = out_ptr[WIDTH];
+        uint64_t in2 = out_ptr[WIDTH * 2];
+        uint64_t in3 = out_ptr[WIDTH * 3];
+        uint64_t in4 = out_ptr[WIDTH * 4];
+        uint64_t in5 = out_ptr[WIDTH * 5];
+        uint64_t in6 = out_ptr[WIDTH * 6];
+        uint64_t in7 = out_ptr[WIDTH * 7];
+
+        tmp0 = in3 ^ in4; /* order matters: later lines reuse earlier outN/tmpN values */
+        tmp1 = in0 ^ in5;
+        out1 = tmp0 ^ tmp1 ^ in6;
+        out3 = tmp1 ^ in1;
+        out4 = out1 ^ in1 ^ in7;
+        tmp2 = out4 ^ in3;
+        out5 = tmp2 ^ in2;
+        out6 = tmp0 ^ out5;
+        out7 = tmp1 ^ out4 ^ in2;
+        out2 = out6 ^ in5 ^ in7;
+        out0 = tmp2 ^ out2;
+
+        out_ptr[0] = out0 ^ in_ptr[0]; /* accumulate: mixed row XOR the matching row of in */
+        out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH];
+        out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2];
+        out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3];
+        out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4];
+        out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5];
+        out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6];
+        out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7];
+
+        in_ptr++; /* step both cursors to the next word column */
+        out_ptr++;
+    }
+}
+
+static void /* Rewrites out in place: each row becomes a fixed XOR mix of out's rows, then is XORed with the same row of in. */
+gf8_muladd_7F(void *out, void *in) /* out: 8 rows of WIDTH uint64_t words, read then overwritten; in: same layout, only read. */
+{ /* NOTE(review): the name suggests the mix is multiplication by 0x7F in GF(2^8) - not re-derived here, confirm. */
+    unsigned int i;
+    uint64_t *in_ptr = (uint64_t *)in;
+    uint64_t *out_ptr = (uint64_t *)out;
+
+    for (i = 0; i < WIDTH; i++) { /* one iteration per word column; row r lives at offset WIDTH * r */
+        uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+        uint64_t tmp0, tmp1, tmp2, tmp3; /* shared sub-expressions reused by several outputs */
+
+        uint64_t in0 = out_ptr[0]; /* in0..in7 are the current contents of out, not of in */
+        uint64_t in1 = out_ptr[WIDTH];
+        uint64_t in2 = out_ptr[WIDTH * 2];
+        uint64_t in3 = out_ptr[WIDTH * 3];
+        uint64_t in4 = out_ptr[WIDTH * 4];
+        uint64_t in5 = out_ptr[WIDTH * 5];
+        uint64_t in6 = out_ptr[WIDTH * 6];
+        uint64_t in7 = out_ptr[WIDTH * 7];
+
+        tmp0 = in2 ^ in7; /* order matters: later lines reuse earlier outN/tmpN values */
+        tmp1 = tmp0 ^ in3 ^ in5;
+        tmp2 = tmp1 ^ in0;
+        out0 = tmp2 ^ in4;
+        out6 = tmp2 ^ in1;
+        out3 = tmp0 ^ out6;
+        tmp3 = out3 ^ in6;
+        out1 = tmp3 ^ in4;
+        out2 = tmp3 ^ in5;
+        out4 = tmp3 ^ in7;
+        out5 = tmp1 ^ out1;
+        out7 = out0 ^ out4 ^ in3;
+
+        out_ptr[0] = out0 ^ in_ptr[0]; /* accumulate: mixed row XOR the matching row of in */
+        out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH];
+        out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2];
+        out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3];
+        out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4];
+        out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5];
+        out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6];
+        out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7];
+
+        in_ptr++; /* step both cursors to the next word column */
+        out_ptr++;
+    }
+}
+
+static void /* Rewrites out in place: each row becomes a fixed XOR mix of out's rows, then is XORed with the same row of in. */
+gf8_muladd_80(void *out, void *in) /* out: 8 rows of WIDTH uint64_t words, read then overwritten; in: same layout, only read. */
+{ /* NOTE(review): the name suggests the mix is multiplication by 0x80 in GF(2^8) - not re-derived here, confirm. */
+    unsigned int i;
+    uint64_t *in_ptr = (uint64_t *)in;
+    uint64_t *out_ptr = (uint64_t *)out;
+
+    for (i = 0; i < WIDTH; i++) { /* one iteration per word column; row r lives at offset WIDTH * r */
+        uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+        uint64_t tmp0, tmp1, tmp2; /* shared sub-expressions reused by several outputs */
+
+        uint64_t in0 = out_ptr[0]; /* in0..in7 are the current contents of out, not of in */
+        uint64_t in1 = out_ptr[WIDTH];
+        uint64_t in2 = out_ptr[WIDTH * 2];
+        uint64_t in3 = out_ptr[WIDTH * 3];
+        uint64_t in4 = out_ptr[WIDTH * 4];
+        uint64_t in5 = out_ptr[WIDTH * 5];
+        uint64_t in6 = out_ptr[WIDTH * 6];
+        uint64_t in7 = out_ptr[WIDTH * 7];
+
+        tmp0 = in2 ^ in3; /* order matters: later lines reuse earlier outN/tmpN values */
+        tmp1 = in4 ^ in5;
+        out1 = in2 ^ in6 ^ in7;
+        out5 = tmp0 ^ in4;
+        tmp2 = tmp0 ^ in1;
+        out6 = tmp1 ^ in3;
+        out7 = tmp1 ^ in0 ^ in6;
+        out4 = tmp2 ^ in7;
+        out3 = tmp2 ^ out6;
+        out2 = out3 ^ out5 ^ in6;
+        out0 = out2 ^ in3 ^ in7;
+
+        out_ptr[0] = out0 ^ in_ptr[0]; /* accumulate: mixed row XOR the matching row of in */
+        out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH];
+        out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2];
+        out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3];
+        out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4];
+        out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5];
+        out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6];
+        out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7];
+
+        in_ptr++; /* step both cursors to the next word column */
+        out_ptr++;
+    }
+}
+
+static void /* Rewrites out in place: each row becomes a fixed XOR mix of out's rows, then is XORed with the same row of in. */
+gf8_muladd_81(void *out, void *in) /* out: 8 rows of WIDTH uint64_t words, read then overwritten; in: same layout, only read. */
+{ /* NOTE(review): the name suggests the mix is multiplication by 0x81 in GF(2^8) - not re-derived here, confirm. */
+    unsigned int i;
+    uint64_t *in_ptr = (uint64_t *)in;
+    uint64_t *out_ptr = (uint64_t *)out;
+
+    for (i = 0; i < WIDTH; i++) { /* one iteration per word column; row r lives at offset WIDTH * r */
+        uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+        uint64_t tmp0, tmp1; /* shared sub-expressions reused by several outputs */
+
+        uint64_t in0 = out_ptr[0]; /* in0..in7 are the current contents of out, not of in */
+        uint64_t in1 = out_ptr[WIDTH];
+        uint64_t in2 = out_ptr[WIDTH * 2];
+        uint64_t in3 = out_ptr[WIDTH * 3];
+        uint64_t in4 = out_ptr[WIDTH * 4];
+        uint64_t in5 = out_ptr[WIDTH * 5];
+        uint64_t in6 = out_ptr[WIDTH * 6];
+        uint64_t in7 = out_ptr[WIDTH * 7];
+
+        tmp0 = in4 ^ in6; /* order matters: later lines reuse earlier outN/tmpN values */
+        tmp1 = tmp0 ^ in3;
+        out6 = tmp1 ^ in5;
+        out5 = out6 ^ in2 ^ in6;
+        out3 = out5 ^ in1;
+        out2 = tmp0 ^ out3;
+        out1 = out3 ^ out6 ^ in7;
+        out4 = tmp1 ^ out1;
+        out7 = out2 ^ out4 ^ in0;
+        out0 = out7 ^ in1 ^ in4;
+
+        out_ptr[0] = out0 ^ in_ptr[0]; /* accumulate: mixed row XOR the matching row of in */
+        out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH];
+        out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2];
+        out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3];
+        out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4];
+        out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5];
+        out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6];
+        out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7];
+
+        in_ptr++; /* step both cursors to the next word column */
+        out_ptr++;
+    }
+}
+
+static void /* Rewrites out in place: each row becomes a fixed XOR mix of out's rows, then is XORed with the same row of in. */
+gf8_muladd_82(void *out, void *in) /* out: 8 rows of WIDTH uint64_t words, read then overwritten; in: same layout, only read. */
+{ /* NOTE(review): the name suggests the mix is multiplication by 0x82 in GF(2^8) - not re-derived here, confirm. */
+    unsigned int i;
+    uint64_t *in_ptr = (uint64_t *)in;
+    uint64_t *out_ptr = (uint64_t *)out;
+
+    for (i = 0; i < WIDTH; i++) { /* one iteration per word column; row r lives at offset WIDTH * r */
+        uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+        uint64_t tmp0; /* shared sub-expression reused by out1, out2 and out3 */
+
+        uint64_t in0 = out_ptr[0]; /* in0..in7 are the current contents of out, not of in */
+        uint64_t in1 = out_ptr[WIDTH];
+        uint64_t in2 = out_ptr[WIDTH * 2];
+        uint64_t in3 = out_ptr[WIDTH * 3];
+        uint64_t in4 = out_ptr[WIDTH * 4];
+        uint64_t in5 = out_ptr[WIDTH * 5];
+        uint64_t in6 = out_ptr[WIDTH * 6];
+        uint64_t in7 = out_ptr[WIDTH * 7];
+
+        out4 = in1 ^ in2; /* order matters: out3 below reuses tmp0 and out0 */
+        tmp0 = in6 ^ in7;
+        out5 = in2 ^ in3;
+        out6 = in3 ^ in4;
+        out7 = in0 ^ in4 ^ in5;
+        out0 = in1 ^ in5 ^ in6;
+        out1 = tmp0 ^ in0 ^ in2;
+        out2 = tmp0 ^ in3 ^ in5;
+        out3 = tmp0 ^ out0 ^ in4;
+
+        out_ptr[0] = out0 ^ in_ptr[0]; /* accumulate: mixed row XOR the matching row of in */
+        out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH];
+        out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2];
+        out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3];
+        out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4];
+        out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5];
+        out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6];
+        out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7];
+
+        in_ptr++; /* step both cursors to the next word column */
+        out_ptr++;
+    }
+}
+
+static void /* Rewrites out in place: each row becomes a fixed XOR mix of out's rows, then is XORed with the same row of in. */
+gf8_muladd_83(void *out, void *in) /* out: 8 rows of WIDTH uint64_t words, read then overwritten; in: same layout, only read. */
+{ /* NOTE(review): the name suggests the mix is multiplication by 0x83 in GF(2^8) - not re-derived here, confirm. */
+    unsigned int i;
+    uint64_t *in_ptr = (uint64_t *)in;
+    uint64_t *out_ptr = (uint64_t *)out;
+
+    for (i = 0; i < WIDTH; i++) { /* one iteration per word column; row r lives at offset WIDTH * r */
+        uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+        uint64_t tmp0, tmp1, tmp2, tmp3, tmp4; /* shared sub-expressions reused by several outputs */
+
+        uint64_t in0 = out_ptr[0]; /* in0..in7 are the current contents of out, not of in */
+        uint64_t in1 = out_ptr[WIDTH];
+        uint64_t in2 = out_ptr[WIDTH * 2];
+        uint64_t in3 = out_ptr[WIDTH * 3];
+        uint64_t in4 = out_ptr[WIDTH * 4];
+        uint64_t in5 = out_ptr[WIDTH * 5];
+        uint64_t in6 = out_ptr[WIDTH * 6];
+        uint64_t in7 = out_ptr[WIDTH * 7];
+
+        tmp0 = in0 ^ in1; /* order matters: later lines reuse earlier outN/tmpN values */
+        tmp1 = in2 ^ in5;
+        tmp2 = in3 ^ in6;
+        out4 = in1 ^ in2 ^ in4;
+        out0 = tmp0 ^ in5 ^ in6;
+        out5 = tmp1 ^ in3;
+        tmp3 = tmp1 ^ in7;
+        out6 = tmp2 ^ in4;
+        out2 = tmp2 ^ tmp3;
+        tmp4 = tmp3 ^ out4;
+        out1 = tmp3 ^ out0;
+        out3 = tmp4 ^ in3;
+        out7 = tmp0 ^ tmp4;
+
+        out_ptr[0] = out0 ^ in_ptr[0]; /* accumulate: mixed row XOR the matching row of in */
+        out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH];
+        out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2];
+        out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3];
+        out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4];
+        out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5];
+        out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6];
+        out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7];
+
+        in_ptr++; /* step both cursors to the next word column */
+        out_ptr++;
+    }
+}
+
+static void /* Rewrites out in place: each row becomes a fixed XOR mix of out's rows, then is XORed with the same row of in. */
+gf8_muladd_84(void *out, void *in) /* out: 8 rows of WIDTH uint64_t words, read then overwritten; in: same layout, only read. */
+{ /* Expanding the XORs below matches multiplication by 0x84 in GF(2^8) modulo 0x11D, with row k carrying bit k of each element. */
+    unsigned int i;
+    uint64_t *in_ptr = (uint64_t *)in;
+    uint64_t *out_ptr = (uint64_t *)out;
+
+    for (i = 0; i < WIDTH; i++) { /* one iteration per word column; row r lives at offset WIDTH * r */
+        uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+
+        uint64_t in0 = out_ptr[0]; /* in0..in7 are the current contents of out, not of in */
+        uint64_t in1 = out_ptr[WIDTH];
+        uint64_t in2 = out_ptr[WIDTH * 2];
+        uint64_t in3 = out_ptr[WIDTH * 3];
+        uint64_t in4 = out_ptr[WIDTH * 4];
+        uint64_t in5 = out_ptr[WIDTH * 5];
+        uint64_t in6 = out_ptr[WIDTH * 6];
+        uint64_t in7 = out_ptr[WIDTH * 7];
+
+        out1 = in2 ^ in6; /* order matters: out2 reuses out6 and out3 reuses out5 */
+        out6 = in3 ^ in5;
+        out0 = in1 ^ in5 ^ in7;
+        out7 = in0 ^ in4 ^ in6;
+        out4 = in1 ^ in3 ^ in6;
+        out5 = in2 ^ in4 ^ in7;
+        out2 = out6 ^ in0 ^ in1;
+        out3 = out5 ^ in5 ^ in6;
+
+        out_ptr[0] = out0 ^ in_ptr[0]; /* accumulate: mixed row XOR the matching row of in */
+        out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH];
+        out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2];
+        out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3];
+        out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4];
+        out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5];
+        out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6];
+        out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7];
+
+        in_ptr++; /* step both cursors to the next word column */
+        out_ptr++;
+    }
+}
+
+static void /* Rewrites out in place: each row becomes a fixed XOR mix of out's rows, then is XORed with the same row of in. */
+gf8_muladd_85(void *out, void *in) /* out: 8 rows of WIDTH uint64_t words, read then overwritten; in: same layout, only read. */
+{ /* NOTE(review): the name suggests the mix is multiplication by 0x85 in GF(2^8) - not re-derived here, confirm. */
+    unsigned int i;
+    uint64_t *in_ptr = (uint64_t *)in;
+    uint64_t *out_ptr = (uint64_t *)out;
+
+    for (i = 0; i < WIDTH; i++) { /* one iteration per word column; row r lives at offset WIDTH * r */
+        uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+        uint64_t tmp0, tmp1, tmp2, tmp3; /* shared sub-expressions reused by several outputs */
+
+        uint64_t in0 = out_ptr[0]; /* in0..in7 are the current contents of out, not of in */
+        uint64_t in1 = out_ptr[WIDTH];
+        uint64_t in2 = out_ptr[WIDTH * 2];
+        uint64_t in3 = out_ptr[WIDTH * 3];
+        uint64_t in4 = out_ptr[WIDTH * 4];
+        uint64_t in5 = out_ptr[WIDTH * 5];
+        uint64_t in6 = out_ptr[WIDTH * 6];
+        uint64_t in7 = out_ptr[WIDTH * 7];
+
+        tmp0 = in1 ^ in6; /* order matters: later lines reuse earlier outN/tmpN values */
+        tmp1 = in3 ^ in6;
+        tmp2 = tmp0 ^ in4;
+        out1 = tmp0 ^ in2;
+        out6 = tmp1 ^ in5;
+        out4 = tmp2 ^ in3;
+        tmp3 = out1 ^ out6;
+        out2 = tmp3 ^ in0;
+        out3 = tmp2 ^ tmp3 ^ in7;
+        out7 = out2 ^ out3 ^ in1;
+        out5 = tmp1 ^ out3;
+        out0 = tmp2 ^ out7 ^ in5;
+
+        out_ptr[0] = out0 ^ in_ptr[0]; /* accumulate: mixed row XOR the matching row of in */
+        out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH];
+        out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2];
+        out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3];
+        out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4];
+        out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5];
+        out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6];
+        out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7];
+
+        in_ptr++; /* step both cursors to the next word column */
+        out_ptr++;
+    }
+}
+
+static void /* Rewrites out in place: each row becomes a fixed XOR mix of out's rows, then is XORed with the same row of in. */
+gf8_muladd_86(void *out, void *in) /* out: 8 rows of WIDTH uint64_t words, read then overwritten; in: same layout, only read. */
+{ /* The XORs below match multiplication by 0x86 in GF(2^8) modulo 0x11D, with row k carrying bit k of each element. */
+    unsigned int i;
+    uint64_t *in_ptr = (uint64_t *)in;
+    uint64_t *out_ptr = (uint64_t *)out;
+
+    for (i = 0; i < WIDTH; i++) { /* one iteration per word column; row r lives at offset WIDTH * r */
+        uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+
+        uint64_t in0 = out_ptr[0]; /* in0..in7 are the current contents of out, not of in */
+        uint64_t in1 = out_ptr[WIDTH];
+        uint64_t in2 = out_ptr[WIDTH * 2];
+        uint64_t in3 = out_ptr[WIDTH * 3];
+        uint64_t in4 = out_ptr[WIDTH * 4];
+        uint64_t in5 = out_ptr[WIDTH * 5];
+        uint64_t in6 = out_ptr[WIDTH * 6];
+        uint64_t in7 = out_ptr[WIDTH * 7];
+
+        out6 = in3; /* every output in this function is a direct XOR of the loaded inputs */
+        out7 = in0 ^ in4;
+        out0 = in1 ^ in5;
+        out5 = in2 ^ in7;
+        out3 = in4 ^ in5 ^ in6;
+        out1 = in0 ^ in2 ^ in6;
+        out4 = in1 ^ in6 ^ in7;
+        out2 = in0 ^ in3 ^ in5 ^ in7;
+
+        out_ptr[0] = out0 ^ in_ptr[0]; /* accumulate: mixed row XOR the matching row of in */
+        out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH];
+        out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2];
+        out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3];
+        out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4];
+        out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5];
+        out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6];
+        out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7];
+
+        in_ptr++; /* step both cursors to the next word column */
+        out_ptr++;
+    }
+}
+
+static void /* Rewrites out in place: each row becomes a fixed XOR mix of out's rows, then is XORed with the same row of in. */
+gf8_muladd_87(void *out, void *in) /* out: 8 rows of WIDTH uint64_t words, read then overwritten; in: same layout, only read. */
+{ /* NOTE(review): the name suggests the mix is multiplication by 0x87 in GF(2^8) - not re-derived here, confirm. */
+    unsigned int i;
+    uint64_t *in_ptr = (uint64_t *)in;
+    uint64_t *out_ptr = (uint64_t *)out;
+
+    for (i = 0; i < WIDTH; i++) { /* one iteration per word column; row r lives at offset WIDTH * r */
+        uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+        uint64_t tmp0, tmp1; /* shared sub-expressions reused by several outputs */
+
+        uint64_t in0 = out_ptr[0]; /* in0..in7 are the current contents of out, not of in */
+        uint64_t in1 = out_ptr[WIDTH];
+        uint64_t in2 = out_ptr[WIDTH * 2];
+        uint64_t in3 = out_ptr[WIDTH * 3];
+        uint64_t in4 = out_ptr[WIDTH * 4];
+        uint64_t in5 = out_ptr[WIDTH * 5];
+        uint64_t in6 = out_ptr[WIDTH * 6];
+        uint64_t in7 = out_ptr[WIDTH * 7];
+
+        out6 = in3 ^ in6; /* order matters: later lines reuse earlier outN/tmpN values */
+        tmp0 = in0 ^ in1;
+        out7 = in0 ^ in4 ^ in7;
+        out5 = in2 ^ in5 ^ in7;
+        out3 = out6 ^ in4 ^ in5;
+        out0 = tmp0 ^ in5;
+        tmp1 = tmp0 ^ in6;
+        out2 = out5 ^ in0 ^ in3;
+        out1 = tmp1 ^ in2;
+        out4 = tmp1 ^ out7;
+
+        out_ptr[0] = out0 ^ in_ptr[0]; /* accumulate: mixed row XOR the matching row of in */
+        out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH];
+        out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2];
+        out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3];
+        out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4];
+        out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5];
+        out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6];
+        out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7];
+
+        in_ptr++; /* step both cursors to the next word column */
+        out_ptr++;
+    }
+}
+
+static void /* Rewrites out in place: each row becomes a fixed XOR mix of out's rows, then is XORed with the same row of in. */
+gf8_muladd_88(void *out, void *in) /* out: 8 rows of WIDTH uint64_t words, read then overwritten; in: same layout, only read. */
+{ /* NOTE(review): the name suggests the mix is multiplication by 0x88 in GF(2^8) - not re-derived here, confirm. */
+    unsigned int i;
+    uint64_t *in_ptr = (uint64_t *)in;
+    uint64_t *out_ptr = (uint64_t *)out;
+
+    for (i = 0; i < WIDTH; i++) { /* one iteration per word column; row r lives at offset WIDTH * r */
+        uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+        uint64_t tmp0, tmp1; /* shared sub-expressions reused by several outputs */
+
+        uint64_t in0 = out_ptr[0]; /* in0..in7 are the current contents of out, not of in */
+        uint64_t in1 = out_ptr[WIDTH];
+        uint64_t in2 = out_ptr[WIDTH * 2];
+        uint64_t in3 = out_ptr[WIDTH * 3];
+        uint64_t in4 = out_ptr[WIDTH * 4];
+        uint64_t in5 = out_ptr[WIDTH * 5];
+        uint64_t in6 = out_ptr[WIDTH * 6];
+        uint64_t in7 = out_ptr[WIDTH * 7];
+
+        out1 = in2 ^ in7; /* order matters: later lines reuse earlier outN/tmpN values */
+        tmp0 = in5 ^ in6;
+        out0 = in1 ^ in6 ^ in7;
+        out6 = in4 ^ in5 ^ in7;
+        out3 = out0 ^ out1 ^ in0 ^ in4;
+        out7 = tmp0 ^ in0;
+        tmp1 = tmp0 ^ in3;
+        out2 = out0 ^ in3;
+        out4 = tmp1 ^ in2;
+        out5 = tmp1 ^ out6;
+
+        out_ptr[0] = out0 ^ in_ptr[0]; /* accumulate: mixed row XOR the matching row of in */
+        out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH];
+        out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2];
+        out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3];
+        out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4];
+        out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5];
+        out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6];
+        out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7];
+
+        in_ptr++; /* step both cursors to the next word column */
+        out_ptr++;
+    }
+}
+
+static void
+gf8_muladd_89(void *out, void *in)
+{
+    unsigned int i;
+    uint64_t *in_ptr = (uint64_t *)in;   /* only read */
+    uint64_t *out_ptr = (uint64_t *)out; /* read, then overwritten */
+    /* In place: out = mix(out) ^ in, on 8 word planes WIDTH apart. */
+    for (i = 0; i < WIDTH; i++) {
+        uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+        uint64_t tmp0, tmp1, tmp2;
+        /* inN = old value of out plane N, not data from the in buffer. */
+        uint64_t in0 = out_ptr[0];
+        uint64_t in1 = out_ptr[WIDTH];
+        uint64_t in2 = out_ptr[WIDTH * 2];
+        uint64_t in3 = out_ptr[WIDTH * 3];
+        uint64_t in4 = out_ptr[WIDTH * 4];
+        uint64_t in5 = out_ptr[WIDTH * 5];
+        uint64_t in6 = out_ptr[WIDTH * 6];
+        uint64_t in7 = out_ptr[WIDTH * 7];
+        /* XOR network, presumably GF(2^8) multiply by 0x89 (TODO confirm). */
+        tmp0 = in0 ^ in7;
+        tmp1 = in2 ^ in7;
+        tmp2 = tmp0 ^ in6;
+        out1 = tmp1 ^ in1;
+        out7 = tmp2 ^ in5;
+        out0 = tmp2 ^ in1;
+        out2 = out1 ^ in3 ^ in6;
+        out6 = out7 ^ in0 ^ in4;
+        out5 = out6 ^ in3;
+        out3 = tmp0 ^ out2 ^ in4;
+        out4 = tmp1 ^ out5;
+        /* XOR in the matching plane of in and store back into out. */
+        out_ptr[0] = out0 ^ in_ptr[0];
+        out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH];
+        out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2];
+        out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3];
+        out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4];
+        out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5];
+        out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6];
+        out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7];
+        /* Advance to the next word column of both buffers. */
+        in_ptr++;
+        out_ptr++;
+    }
+}
+
+static void
+gf8_muladd_8A(void *out, void *in)
+{
+    unsigned int i;
+    uint64_t *in_ptr = (uint64_t *)in;   /* only read */
+    uint64_t *out_ptr = (uint64_t *)out; /* read, then overwritten */
+    /* In place: out = mix(out) ^ in, on 8 word planes WIDTH apart. */
+    for (i = 0; i < WIDTH; i++) {
+        uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+        /* inN = old value of out plane N, not data from the in buffer. */
+        uint64_t in0 = out_ptr[0];
+        uint64_t in1 = out_ptr[WIDTH];
+        uint64_t in2 = out_ptr[WIDTH * 2];
+        uint64_t in3 = out_ptr[WIDTH * 3];
+        uint64_t in4 = out_ptr[WIDTH * 4];
+        uint64_t in5 = out_ptr[WIDTH * 5];
+        uint64_t in6 = out_ptr[WIDTH * 6];
+        uint64_t in7 = out_ptr[WIDTH * 7];
+        /* XOR network, presumably GF(2^8) multiply by 0x8A (TODO confirm). */
+        out0 = in1 ^ in6;
+        out7 = in0 ^ in5;
+        out2 = in3 ^ in6;
+        out6 = in4 ^ in7;
+        out1 = in0 ^ in2 ^ in7;
+        out3 = out0 ^ out6 ^ in0;
+        out4 = out1 ^ out7 ^ in6;
+        out5 = out2 ^ in7;
+        /* XOR in the matching plane of in and store back into out. */
+        out_ptr[0] = out0 ^ in_ptr[0];
+        out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH];
+        out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2];
+        out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3];
+        out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4];
+        out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5];
+        out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6];
+        out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7];
+        /* Advance to the next word column of both buffers. */
+        in_ptr++;
+        out_ptr++;
+    }
+}
+
+static void
+gf8_muladd_8B(void *out, void *in)
+{
+    unsigned int i;
+    uint64_t *in_ptr = (uint64_t *)in;   /* only read */
+    uint64_t *out_ptr = (uint64_t *)out; /* read, then overwritten */
+    /* In place: out = mix(out) ^ in, on 8 word planes WIDTH apart. */
+    for (i = 0; i < WIDTH; i++) {
+        uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+        uint64_t tmp0, tmp1, tmp2, tmp3, tmp4;
+        /* inN = old value of out plane N, not data from the in buffer. */
+        uint64_t in0 = out_ptr[0];
+        uint64_t in1 = out_ptr[WIDTH];
+        uint64_t in2 = out_ptr[WIDTH * 2];
+        uint64_t in3 = out_ptr[WIDTH * 3];
+        uint64_t in4 = out_ptr[WIDTH * 4];
+        uint64_t in5 = out_ptr[WIDTH * 5];
+        uint64_t in6 = out_ptr[WIDTH * 6];
+        uint64_t in7 = out_ptr[WIDTH * 7];
+        /* XOR network, presumably GF(2^8) multiply by 0x8B (TODO confirm). */
+        tmp0 = in0 ^ in1;
+        tmp1 = in3 ^ in6;
+        tmp2 = in5 ^ in7;
+        tmp3 = tmp0 ^ in7;
+        out0 = tmp0 ^ in6;
+        out2 = tmp1 ^ in2;
+        out5 = tmp1 ^ tmp2;
+        out7 = tmp2 ^ in0;
+        tmp4 = tmp3 ^ in4;
+        out1 = tmp3 ^ in2;
+        out6 = tmp4 ^ out0;
+        out4 = out6 ^ in2 ^ in5;
+        out3 = tmp1 ^ tmp4;
+        /* XOR in the matching plane of in and store back into out. */
+        out_ptr[0] = out0 ^ in_ptr[0];
+        out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH];
+        out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2];
+        out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3];
+        out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4];
+        out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5];
+        out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6];
+        out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7];
+        /* Advance to the next word column of both buffers. */
+        in_ptr++;
+        out_ptr++;
+    }
+}
+
+static void
+gf8_muladd_8C(void *out, void *in)
+{
+    unsigned int i;
+    uint64_t *in_ptr = (uint64_t *)in;   /* only read */
+    uint64_t *out_ptr = (uint64_t *)out; /* read, then overwritten */
+    /* In place: out = mix(out) ^ in, on 8 word planes WIDTH apart. */
+    for (i = 0; i < WIDTH; i++) {
+        uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+        /* inN = old value of out plane N, not data from the in buffer. */
+        uint64_t in0 = out_ptr[0];
+        uint64_t in1 = out_ptr[WIDTH];
+        uint64_t in2 = out_ptr[WIDTH * 2];
+        uint64_t in3 = out_ptr[WIDTH * 3];
+        uint64_t in4 = out_ptr[WIDTH * 4];
+        uint64_t in5 = out_ptr[WIDTH * 5];
+        uint64_t in6 = out_ptr[WIDTH * 6];
+        uint64_t in7 = out_ptr[WIDTH * 7];
+        /* XOR network, presumably GF(2^8) multiply by 0x8C (TODO confirm). */
+        out1 = in2;
+        out0 = in1 ^ in7;
+        out7 = in0 ^ in6;
+        out5 = in4 ^ in6;
+        out6 = in5 ^ in7;
+        out2 = out0 ^ in0 ^ in3;
+        out3 = out5 ^ out7 ^ in2 ^ in7;
+        out4 = out6 ^ in3;
+        /* XOR in the matching plane of in and store back into out. */
+        out_ptr[0] = out0 ^ in_ptr[0];
+        out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH];
+        out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2];
+        out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3];
+        out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4];
+        out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5];
+        out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6];
+        out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7];
+        /* Advance to the next word column of both buffers. */
+        in_ptr++;
+        out_ptr++;
+    }
+}
+
+static void
+gf8_muladd_8D(void *out, void *in)
+{
+    unsigned int i;
+    uint64_t *in_ptr = (uint64_t *)in;   /* only read */
+    uint64_t *out_ptr = (uint64_t *)out; /* read, then overwritten */
+    /* In place: out = mix(out) ^ in, on 8 word planes WIDTH apart. */
+    for (i = 0; i < WIDTH; i++) {
+        uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+        uint64_t tmp0;
+        /* inN = old value of out plane N, not data from the in buffer. */
+        uint64_t in0 = out_ptr[0];
+        uint64_t in1 = out_ptr[WIDTH];
+        uint64_t in2 = out_ptr[WIDTH * 2];
+        uint64_t in3 = out_ptr[WIDTH * 3];
+        uint64_t in4 = out_ptr[WIDTH * 4];
+        uint64_t in5 = out_ptr[WIDTH * 5];
+        uint64_t in6 = out_ptr[WIDTH * 6];
+        uint64_t in7 = out_ptr[WIDTH * 7];
+        /* XOR network, presumably GF(2^8) multiply by 0x8D (TODO confirm). */
+        out1 = in1 ^ in2;
+        tmp0 = in6 ^ in7;
+        out0 = in0 ^ in1 ^ in7;
+        out5 = in4 ^ in5 ^ in6;
+        out6 = tmp0 ^ in5;
+        out7 = tmp0 ^ in0;
+        out4 = tmp0 ^ out5 ^ in3;
+        out2 = out0 ^ in2 ^ in3;
+        out3 = out2 ^ in1 ^ in4;
+        /* XOR in the matching plane of in and store back into out. */
+        out_ptr[0] = out0 ^ in_ptr[0];
+        out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH];
+        out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2];
+        out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3];
+        out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4];
+        out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5];
+        out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6];
+        out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7];
+        /* Advance to the next word column of both buffers. */
+        in_ptr++;
+        out_ptr++;
+    }
+}
+
+static void
+gf8_muladd_8E(void *out, void *in)
+{
+    unsigned int i;
+    uint64_t *in_ptr = (uint64_t *)in;   /* only read */
+    uint64_t *out_ptr = (uint64_t *)out; /* read, then overwritten */
+    /* In place: out = mix(out) ^ in, on 8 word planes WIDTH apart. */
+    for (i = 0; i < WIDTH; i++) {
+        uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+        /* inN = old value of out plane N, not data from the in buffer. */
+        uint64_t in0 = out_ptr[0];
+        uint64_t in1 = out_ptr[WIDTH];
+        uint64_t in2 = out_ptr[WIDTH * 2];
+        uint64_t in3 = out_ptr[WIDTH * 3];
+        uint64_t in4 = out_ptr[WIDTH * 4];
+        uint64_t in5 = out_ptr[WIDTH * 5];
+        uint64_t in6 = out_ptr[WIDTH * 6];
+        uint64_t in7 = out_ptr[WIDTH * 7];
+        /* XOR network, presumably GF(2^8) multiply by 0x8E (TODO confirm). */
+        out0 = in1;
+        out4 = in5;
+        out7 = in0;
+        out5 = in6;
+        out6 = in7;
+        out3 = in0 ^ in4;
+        out1 = in0 ^ in2;
+        out2 = in0 ^ in3;
+        /* XOR in the matching plane of in and store back into out. */
+        out_ptr[0] = out0 ^ in_ptr[0];
+        out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH];
+        out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2];
+        out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3];
+        out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4];
+        out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5];
+        out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6];
+        out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7];
+        /* Advance to the next word column of both buffers. */
+        in_ptr++;
+        out_ptr++;
+    }
+}
+
+static void
+gf8_muladd_8F(void *out, void *in)
+{
+    unsigned int i;
+    uint64_t *in_ptr = (uint64_t *)in;   /* only read */
+    uint64_t *out_ptr = (uint64_t *)out; /* read, then overwritten */
+    /* In place: out = mix(out) ^ in, on 8 word planes WIDTH apart. */
+    for (i = 0; i < WIDTH; i++) {
+        uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+        uint64_t tmp0;
+        /* inN = old value of out plane N, not data from the in buffer. */
+        uint64_t in0 = out_ptr[0];
+        uint64_t in1 = out_ptr[WIDTH];
+        uint64_t in2 = out_ptr[WIDTH * 2];
+        uint64_t in3 = out_ptr[WIDTH * 3];
+        uint64_t in4 = out_ptr[WIDTH * 4];
+        uint64_t in5 = out_ptr[WIDTH * 5];
+        uint64_t in6 = out_ptr[WIDTH * 6];
+        uint64_t in7 = out_ptr[WIDTH * 7];
+        /* XOR network, presumably GF(2^8) multiply by 0x8F (TODO confirm). */
+        out0 = in0 ^ in1;
+        tmp0 = in0 ^ in3;
+        out4 = in4 ^ in5;
+        out7 = in0 ^ in7;
+        out5 = in5 ^ in6;
+        out6 = in6 ^ in7;
+        out1 = out0 ^ in2;
+        out2 = tmp0 ^ in2;
+        out3 = tmp0 ^ in4;
+        /* XOR in the matching plane of in and store back into out. */
+        out_ptr[0] = out0 ^ in_ptr[0];
+        out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH];
+        out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2];
+        out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3];
+        out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4];
+        out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5];
+        out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6];
+        out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7];
+        /* Advance to the next word column of both buffers. */
+        in_ptr++;
+        out_ptr++;
+    }
+}
+
+static void
+gf8_muladd_90(void *out, void *in)
+{
+    unsigned int i;
+    uint64_t *in_ptr = (uint64_t *)in;   /* only read */
+    uint64_t *out_ptr = (uint64_t *)out; /* read, then overwritten */
+    /* In place: out = mix(out) ^ in, on 8 word planes WIDTH apart. */
+    for (i = 0; i < WIDTH; i++) {
+        uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+        uint64_t tmp0, tmp1, tmp2;
+        /* inN = old value of out plane N, not data from the in buffer. */
+        uint64_t in0 = out_ptr[0];
+        uint64_t in1 = out_ptr[WIDTH];
+        uint64_t in2 = out_ptr[WIDTH * 2];
+        uint64_t in3 = out_ptr[WIDTH * 3];
+        uint64_t in4 = out_ptr[WIDTH * 4];
+        uint64_t in5 = out_ptr[WIDTH * 5];
+        uint64_t in6 = out_ptr[WIDTH * 6];
+        uint64_t in7 = out_ptr[WIDTH * 7];
+        /* XOR network, presumably GF(2^8) multiply by 0x90 (TODO confirm). */
+        tmp0 = in1 ^ in2;
+        tmp1 = in2 ^ in6 ^ in7;
+        out3 = tmp0 ^ in7;
+        out1 = tmp1 ^ in5;
+        tmp2 = out1 ^ in4;
+        out6 = tmp2 ^ in3;
+        out5 = out6 ^ in1;
+        out4 = out5 ^ in0;
+        out0 = tmp0 ^ tmp2;
+        out7 = tmp0 ^ out4;
+        out2 = tmp1 ^ out5;
+        /* XOR in the matching plane of in and store back into out. */
+        out_ptr[0] = out0 ^ in_ptr[0];
+        out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH];
+        out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2];
+        out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3];
+        out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4];
+        out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5];
+        out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6];
+        out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7];
+        /* Advance to the next word column of both buffers. */
+        in_ptr++;
+        out_ptr++;
+    }
+}
+
+static void
+gf8_muladd_91(void *out, void *in)
+{
+    unsigned int i;
+    uint64_t *in_ptr = (uint64_t *)in;   /* only read */
+    uint64_t *out_ptr = (uint64_t *)out; /* read, then overwritten */
+    /* In place: out = mix(out) ^ in, on 8 word planes WIDTH apart. */
+    for (i = 0; i < WIDTH; i++) {
+        uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+        uint64_t tmp0, tmp1, tmp2, tmp3;
+        /* inN = old value of out plane N, not data from the in buffer. */
+        uint64_t in0 = out_ptr[0];
+        uint64_t in1 = out_ptr[WIDTH];
+        uint64_t in2 = out_ptr[WIDTH * 2];
+        uint64_t in3 = out_ptr[WIDTH * 3];
+        uint64_t in4 = out_ptr[WIDTH * 4];
+        uint64_t in5 = out_ptr[WIDTH * 5];
+        uint64_t in6 = out_ptr[WIDTH * 6];
+        uint64_t in7 = out_ptr[WIDTH * 7];
+        /* XOR network, presumably GF(2^8) multiply by 0x91 (TODO confirm). */
+        tmp0 = in2 ^ in4;
+        tmp1 = tmp0 ^ in3 ^ in5;
+        out2 = tmp1 ^ in1;
+        out6 = tmp1 ^ in7;
+        tmp2 = out2 ^ in5 ^ in7;
+        out3 = tmp2 ^ in4;
+        out5 = tmp2 ^ in6;
+        out1 = tmp1 ^ out5 ^ in2;
+        tmp3 = out1 ^ in0;
+        out4 = tmp3 ^ in3;
+        out0 = tmp0 ^ tmp3;
+        out7 = tmp2 ^ tmp3;
+        /* XOR in the matching plane of in and store back into out. */
+        out_ptr[0] = out0 ^ in_ptr[0];
+        out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH];
+        out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2];
+        out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3];
+        out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4];
+        out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5];
+        out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6];
+        out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7];
+        /* Advance to the next word column of both buffers. */
+        in_ptr++;
+        out_ptr++;
+    }
+}
+
+static void
+gf8_muladd_92(void *out, void *in)
+{
+    unsigned int i;
+    uint64_t *in_ptr = (uint64_t *)in;   /* only read */
+    uint64_t *out_ptr = (uint64_t *)out; /* read, then overwritten */
+    /* In place: out = mix(out) ^ in, on 8 word planes WIDTH apart. */
+    for (i = 0; i < WIDTH; i++) {
+        uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+        uint64_t tmp0, tmp1;
+        /* inN = old value of out plane N, not data from the in buffer. */
+        uint64_t in0 = out_ptr[0];
+        uint64_t in1 = out_ptr[WIDTH];
+        uint64_t in2 = out_ptr[WIDTH * 2];
+        uint64_t in3 = out_ptr[WIDTH * 3];
+        uint64_t in4 = out_ptr[WIDTH * 4];
+        uint64_t in5 = out_ptr[WIDTH * 5];
+        uint64_t in6 = out_ptr[WIDTH * 6];
+        uint64_t in7 = out_ptr[WIDTH * 7];
+        /* XOR network, presumably GF(2^8) multiply by 0x92 (TODO confirm). */
+        out3 = in1;
+        tmp0 = in4 ^ in5;
+        tmp1 = tmp0 ^ in1;
+        out2 = tmp0 ^ in3 ^ in7;
+        out0 = tmp1 ^ in6;
+        out7 = out2 ^ in0;
+        out4 = out0 ^ in0 ^ in2;
+        out5 = out4 ^ out7 ^ in5;
+        out6 = tmp1 ^ out5;
+        out1 = out6 ^ out7 ^ in7;
+        /* XOR in the matching plane of in and store back into out. */
+        out_ptr[0] = out0 ^ in_ptr[0];
+        out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH];
+        out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2];
+        out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3];
+        out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4];
+        out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5];
+        out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6];
+        out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7];
+        /* Advance to the next word column of both buffers. */
+        in_ptr++;
+        out_ptr++;
+    }
+}
+
+static void
+gf8_muladd_93(void *out, void *in)
+{
+    unsigned int i;
+    uint64_t *in_ptr = (uint64_t *)in;   /* only read */
+    uint64_t *out_ptr = (uint64_t *)out; /* read, then overwritten */
+    /* In place: out = mix(out) ^ in, on 8 word planes WIDTH apart. */
+    for (i = 0; i < WIDTH; i++) {
+        uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+        uint64_t tmp0, tmp1, tmp2;
+        /* inN = old value of out plane N, not data from the in buffer. */
+        uint64_t in0 = out_ptr[0];
+        uint64_t in1 = out_ptr[WIDTH];
+        uint64_t in2 = out_ptr[WIDTH * 2];
+        uint64_t in3 = out_ptr[WIDTH * 3];
+        uint64_t in4 = out_ptr[WIDTH * 4];
+        uint64_t in5 = out_ptr[WIDTH * 5];
+        uint64_t in6 = out_ptr[WIDTH * 6];
+        uint64_t in7 = out_ptr[WIDTH * 7];
+        /* XOR network, presumably GF(2^8) multiply by 0x93 (TODO confirm). */
+        out3 = in1 ^ in3;
+        tmp0 = in2 ^ in7;
+        tmp1 = out3 ^ in6;
+        tmp2 = tmp0 ^ in4;
+        out5 = tmp0 ^ tmp1;
+        out6 = tmp2 ^ in3;
+        out2 = out6 ^ in5;
+        out0 = out2 ^ out5 ^ in0;
+        out7 = tmp1 ^ out0;
+        out1 = tmp2 ^ out0;
+        out4 = out1 ^ in7;
+        /* XOR in the matching plane of in and store back into out. */
+        out_ptr[0] = out0 ^ in_ptr[0];
+        out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH];
+        out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2];
+        out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3];
+        out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4];
+        out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5];
+        out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6];
+        out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7];
+        /* Advance to the next word column of both buffers. */
+        in_ptr++;
+        out_ptr++;
+    }
+}
+
+static void
+gf8_muladd_94(void *out, void *in)
+{
+    unsigned int i;
+    uint64_t *in_ptr = (uint64_t *)in;   /* only read */
+    uint64_t *out_ptr = (uint64_t *)out; /* read, then overwritten */
+    /* In place: out = mix(out) ^ in, on 8 word planes WIDTH apart. */
+    for (i = 0; i < WIDTH; i++) {
+        uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+        uint64_t tmp0;
+        /* inN = old value of out plane N, not data from the in buffer. */
+        uint64_t in0 = out_ptr[0];
+        uint64_t in1 = out_ptr[WIDTH];
+        uint64_t in2 = out_ptr[WIDTH * 2];
+        uint64_t in3 = out_ptr[WIDTH * 3];
+        uint64_t in4 = out_ptr[WIDTH * 4];
+        uint64_t in5 = out_ptr[WIDTH * 5];
+        uint64_t in6 = out_ptr[WIDTH * 6];
+        uint64_t in7 = out_ptr[WIDTH * 7];
+        /* XOR network, presumably GF(2^8) multiply by 0x94 (TODO confirm). */
+        out3 = in2 ^ in6;
+        tmp0 = in1 ^ in4 ^ in5;
+        out1 = out3 ^ in5;
+        out5 = tmp0 ^ out3;
+        out0 = tmp0 ^ in7;
+        out4 = tmp0 ^ in0 ^ in3;
+        out6 = out1 ^ in3 ^ in7;
+        out2 = out4 ^ in6;
+        out7 = out0 ^ out2 ^ in4;
+        /* XOR in the matching plane of in and store back into out. */
+        out_ptr[0] = out0 ^ in_ptr[0];
+        out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH];
+        out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2];
+        out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3];
+        out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4];
+        out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5];
+        out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6];
+        out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7];
+        /* Advance to the next word column of both buffers. */
+        in_ptr++;
+        out_ptr++;
+    }
+}
+
+static void
+gf8_muladd_95(void *out, void *in)
+{
+    unsigned int i;
+    uint64_t *in_ptr = (uint64_t *)in;   /* only read */
+    uint64_t *out_ptr = (uint64_t *)out; /* read, then overwritten */
+    /* In place: out = mix(out) ^ in, on 8 word planes WIDTH apart. */
+    for (i = 0; i < WIDTH; i++) {
+        uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+        uint64_t tmp0, tmp1, tmp2, tmp3, tmp4, tmp5;
+        /* inN = old value of out plane N, not data from the in buffer. */
+        uint64_t in0 = out_ptr[0];
+        uint64_t in1 = out_ptr[WIDTH];
+        uint64_t in2 = out_ptr[WIDTH * 2];
+        uint64_t in3 = out_ptr[WIDTH * 3];
+        uint64_t in4 = out_ptr[WIDTH * 4];
+        uint64_t in5 = out_ptr[WIDTH * 5];
+        uint64_t in6 = out_ptr[WIDTH * 6];
+        uint64_t in7 = out_ptr[WIDTH * 7];
+        /* XOR network, presumably GF(2^8) multiply by 0x95 (TODO confirm). */
+        tmp0 = in2 ^ in3;
+        out3 = tmp0 ^ in6;
+        tmp1 = tmp0 ^ in7;
+        tmp2 = out3 ^ in0;
+        out6 = tmp1 ^ in5;
+        tmp3 = tmp2 ^ in4;
+        out7 = tmp3 ^ in2;
+        tmp4 = tmp3 ^ in5;
+        out2 = tmp4 ^ in1;
+        tmp5 = out2 ^ in6;
+        out0 = tmp1 ^ tmp5;
+        out1 = tmp5 ^ out7;
+        out4 = tmp2 ^ out1;
+        out5 = tmp4 ^ out4;
+        /* XOR in the matching plane of in and store back into out. */
+        out_ptr[0] = out0 ^ in_ptr[0];
+        out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH];
+        out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2];
+        out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3];
+        out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4];
+        out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5];
+        out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6];
+        out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7];
+        /* Advance to the next word column of both buffers. */
+        in_ptr++;
+        out_ptr++;
+    }
+}
+
+static void
+gf8_muladd_96(void *out, void *in)
+{
+    unsigned int i;
+    uint64_t *in_ptr = (uint64_t *)in;   /* only read */
+    uint64_t *out_ptr = (uint64_t *)out; /* read, then overwritten */
+    /* In place: out = mix(out) ^ in, on 8 word planes WIDTH apart. */
+    for (i = 0; i < WIDTH; i++) {
+        uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+        uint64_t tmp0, tmp1, tmp2;
+        /* inN = old value of out plane N, not data from the in buffer. */
+        uint64_t in0 = out_ptr[0];
+        uint64_t in1 = out_ptr[WIDTH];
+        uint64_t in2 = out_ptr[WIDTH * 2];
+        uint64_t in3 = out_ptr[WIDTH * 3];
+        uint64_t in4 = out_ptr[WIDTH * 4];
+        uint64_t in5 = out_ptr[WIDTH * 5];
+        uint64_t in6 = out_ptr[WIDTH * 6];
+        uint64_t in7 = out_ptr[WIDTH * 7];
+        /* XOR network, presumably GF(2^8) multiply by 0x96 (TODO confirm). */
+        out3 = in6 ^ in7;
+        tmp0 = in1 ^ in5;
+        tmp1 = in5 ^ in6;
+        out6 = out3 ^ in2 ^ in3;
+        out0 = tmp0 ^ in4;
+        tmp2 = tmp1 ^ in2;
+        out4 = out0 ^ in0 ^ in7;
+        out1 = tmp2 ^ in0;
+        out5 = tmp2 ^ in1;
+        out7 = tmp0 ^ out4 ^ in3;
+        out2 = tmp1 ^ out7;
+        /* XOR in the matching plane of in and store back into out. */
+        out_ptr[0] = out0 ^ in_ptr[0];
+        out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH];
+        out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2];
+        out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3];
+        out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4];
+        out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5];
+        out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6];
+        out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7];
+        /* Advance to the next word column of both buffers. */
+        in_ptr++;
+        out_ptr++;
+    }
+}
+
+static void
+gf8_muladd_97(void *out, void *in)
+{
+    unsigned int i;
+    uint64_t *in_ptr = (uint64_t *)in;   /* only read */
+    uint64_t *out_ptr = (uint64_t *)out; /* read, then overwritten */
+    /* In place: out = mix(out) ^ in, on 8 word planes WIDTH apart. */
+    for (i = 0; i < WIDTH; i++) {
+        uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+        uint64_t tmp0, tmp1, tmp2, tmp3;
+        /* inN = old value of out plane N, not data from the in buffer. */
+        uint64_t in0 = out_ptr[0];
+        uint64_t in1 = out_ptr[WIDTH];
+        uint64_t in2 = out_ptr[WIDTH * 2];
+        uint64_t in3 = out_ptr[WIDTH * 3];
+        uint64_t in4 = out_ptr[WIDTH * 4];
+        uint64_t in5 = out_ptr[WIDTH * 5];
+        uint64_t in6 = out_ptr[WIDTH * 6];
+        uint64_t in7 = out_ptr[WIDTH * 7];
+        /* XOR network, presumably GF(2^8) multiply by 0x97 (TODO confirm). */
+        tmp0 = in0 ^ in4;
+        tmp1 = in2 ^ in6;
+        out3 = in3 ^ in6 ^ in7;
+        out7 = tmp0 ^ in3;
+        tmp2 = tmp0 ^ in5;
+        out5 = tmp1 ^ in1;
+        out6 = tmp1 ^ out3;
+        out0 = tmp2 ^ in1;
+        out2 = tmp2 ^ out3 ^ in2;
+        tmp3 = out0 ^ in4;
+        out4 = tmp3 ^ in7;
+        out1 = tmp1 ^ tmp3;
+        /* XOR in the matching plane of in and store back into out. */
+        out_ptr[0] = out0 ^ in_ptr[0];
+        out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH];
+        out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2];
+        out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3];
+        out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4];
+        out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5];
+        out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6];
+        out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7];
+        /* Advance to the next word column of both buffers. */
+        in_ptr++;
+        out_ptr++;
+    }
+}
+
+static void
+gf8_muladd_98(void *out, void *in)
+{
+    unsigned int i;
+    uint64_t *in_ptr = (uint64_t *)in;   /* only read */
+    uint64_t *out_ptr = (uint64_t *)out; /* read, then overwritten */
+    /* In place: out = mix(out) ^ in, on 8 word planes WIDTH apart. */
+    for (i = 0; i < WIDTH; i++) {
+        uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+        uint64_t tmp0, tmp1;
+        /* inN = old value of out plane N, not data from the in buffer. */
+        uint64_t in0 = out_ptr[0];
+        uint64_t in1 = out_ptr[WIDTH];
+        uint64_t in2 = out_ptr[WIDTH * 2];
+        uint64_t in3 = out_ptr[WIDTH * 3];
+        uint64_t in4 = out_ptr[WIDTH * 4];
+        uint64_t in5 = out_ptr[WIDTH * 5];
+        uint64_t in6 = out_ptr[WIDTH * 6];
+        uint64_t in7 = out_ptr[WIDTH * 7];
+        /* XOR network, presumably GF(2^8) multiply by 0x98 (TODO confirm). */
+        tmp0 = in5 ^ in7;
+        tmp1 = in1 ^ in4 ^ in7;
+        out1 = tmp0 ^ in2;
+        out0 = tmp1 ^ in6;
+        out2 = tmp1 ^ in3;
+        out6 = out0 ^ out1 ^ in1;
+        out5 = tmp0 ^ out2;
+        out3 = tmp1 ^ out6 ^ in0;
+        out7 = out0 ^ out5 ^ in0;
+        out4 = out6 ^ out7 ^ in7;
+        /* XOR in the matching plane of in and store back into out. */
+        out_ptr[0] = out0 ^ in_ptr[0];
+        out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH];
+        out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2];
+        out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3];
+        out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4];
+        out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5];
+        out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6];
+        out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7];
+        /* Advance to the next word column of both buffers. */
+        in_ptr++;
+        out_ptr++;
+    }
+}
+
+static void
+gf8_muladd_99(void *out, void *in)
+{
+    unsigned int i;
+    uint64_t *in_ptr = (uint64_t *)in;   /* only read */
+    uint64_t *out_ptr = (uint64_t *)out; /* read, then overwritten */
+    /* In place: out = mix(out) ^ in, on 8 word planes WIDTH apart. */
+    for (i = 0; i < WIDTH; i++) {
+        uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+        uint64_t tmp0, tmp1, tmp2;
+        /* inN = old value of out plane N, not data from the in buffer. */
+        uint64_t in0 = out_ptr[0];
+        uint64_t in1 = out_ptr[WIDTH];
+        uint64_t in2 = out_ptr[WIDTH * 2];
+        uint64_t in3 = out_ptr[WIDTH * 3];
+        uint64_t in4 = out_ptr[WIDTH * 4];
+        uint64_t in5 = out_ptr[WIDTH * 5];
+        uint64_t in6 = out_ptr[WIDTH * 6];
+        uint64_t in7 = out_ptr[WIDTH * 7];
+        /* XOR network, presumably GF(2^8) multiply by 0x99 (TODO confirm). */
+        tmp0 = in0 ^ in3;
+        out5 = in1 ^ in3 ^ in4;
+        out6 = in2 ^ in4 ^ in5;
+        out4 = tmp0 ^ in2;
+        tmp1 = tmp0 ^ in6;
+        tmp2 = out5 ^ in7;
+        out7 = tmp1 ^ in5;
+        out0 = tmp1 ^ tmp2;
+        out2 = tmp2 ^ in2;
+        out3 = out0 ^ out6 ^ in3;
+        out1 = tmp1 ^ out3;
+        /* XOR in the matching plane of in and store back into out. */
+        out_ptr[0] = out0 ^ in_ptr[0];
+        out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH];
+        out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2];
+        out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3];
+        out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4];
+        out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5];
+        out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6];
+        out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7];
+        /* Advance to the next word column of both buffers. */
+        in_ptr++;
+        out_ptr++;
+    }
+}
+
+static void
+gf8_muladd_9A(void *out, void *in)
+{
+    unsigned int i;
+    uint64_t *in_ptr = (uint64_t *)in;   /* only read */
+    uint64_t *out_ptr = (uint64_t *)out; /* read, then overwritten */
+    /* In place: out = mix(out) ^ in, on 8 word planes WIDTH apart. */
+    for (i = 0; i < WIDTH; i++) {
+        uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+        uint64_t tmp0, tmp1, tmp2;
+        /* inN = old value of out plane N, not data from the in buffer. */
+        uint64_t in0 = out_ptr[0];
+        uint64_t in1 = out_ptr[WIDTH];
+        uint64_t in2 = out_ptr[WIDTH * 2];
+        uint64_t in3 = out_ptr[WIDTH * 3];
+        uint64_t in4 = out_ptr[WIDTH * 4];
+        uint64_t in5 = out_ptr[WIDTH * 5];
+        uint64_t in6 = out_ptr[WIDTH * 6];
+        uint64_t in7 = out_ptr[WIDTH * 7];
+        /* XOR network, presumably GF(2^8) multiply by 0x9A (TODO confirm). */
+        out2 = in3 ^ in4;
+        tmp0 = in0 ^ in5;
+        tmp1 = in1 ^ in6;
+        out5 = in1 ^ in3 ^ in5;
+        tmp2 = tmp0 ^ in7;
+        out3 = tmp0 ^ tmp1;
+        out0 = tmp1 ^ in4;
+        out7 = tmp2 ^ in3;
+        out1 = tmp2 ^ in2;
+        out6 = out0 ^ in1 ^ in2;
+        out4 = out1 ^ in4 ^ in5;
+        /* XOR in the matching plane of in and store back into out. */
+        out_ptr[0] = out0 ^ in_ptr[0];
+        out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH];
+        out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2];
+        out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3];
+        out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4];
+        out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5];
+        out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6];
+        out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7];
+        /* Advance to the next word column of both buffers. */
+        in_ptr++;
+        out_ptr++;
+    }
+}
+
+static void
+gf8_muladd_9B(void *out, void *in)
+{
+    unsigned int i;
+    uint64_t *in_ptr = (uint64_t *)in;   /* only read */
+    uint64_t *out_ptr = (uint64_t *)out; /* read, then overwritten */
+    /* In place: out = mix(out) ^ in, on 8 word planes WIDTH apart. */
+    for (i = 0; i < WIDTH; i++) {
+        uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+        uint64_t tmp0;
+        /* inN = old value of out plane N, not data from the in buffer. */
+        uint64_t in0 = out_ptr[0];
+        uint64_t in1 = out_ptr[WIDTH];
+        uint64_t in2 = out_ptr[WIDTH * 2];
+        uint64_t in3 = out_ptr[WIDTH * 3];
+        uint64_t in4 = out_ptr[WIDTH * 4];
+        uint64_t in5 = out_ptr[WIDTH * 5];
+        uint64_t in6 = out_ptr[WIDTH * 6];
+        uint64_t in7 = out_ptr[WIDTH * 7];
+        /* XOR network, presumably GF(2^8) multiply by 0x9B (TODO confirm). */
+        out5 = in1 ^ in3;
+        tmp0 = in3 ^ in5;
+        out6 = in2 ^ in4;
+        out4 = in0 ^ in2 ^ in7;
+        out7 = tmp0 ^ in0;
+        out2 = out6 ^ in3;
+        out1 = out4 ^ in1 ^ in5;
+        out3 = out7 ^ in1 ^ in6;
+        out0 = tmp0 ^ out3 ^ in4;
+        /* XOR in the matching plane of in and store back into out. */
+        out_ptr[0] = out0 ^ in_ptr[0];
+        out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH];
+        out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2];
+        out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3];
+        out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4];
+        out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5];
+        out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6];
+        out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7];
+        /* Advance to the next word column of both buffers. */
+        in_ptr++;
+        out_ptr++;
+    }
+}
+
+static void
+gf8_muladd_9C(void *out, void *in)
+{
+    unsigned int i;
+    uint64_t *in_ptr = (uint64_t *)in;   /* only read */
+    uint64_t *out_ptr = (uint64_t *)out; /* read, then overwritten */
+    /* In place: out = mix(out) ^ in, on 8 word planes WIDTH apart. */
+    for (i = 0; i < WIDTH; i++) {
+        uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+        uint64_t tmp0;
+        /* inN = old value of out plane N, not data from the in buffer. */
+        uint64_t in0 = out_ptr[0];
+        uint64_t in1 = out_ptr[WIDTH];
+        uint64_t in2 = out_ptr[WIDTH * 2];
+        uint64_t in3 = out_ptr[WIDTH * 3];
+        uint64_t in4 = out_ptr[WIDTH * 4];
+        uint64_t in5 = out_ptr[WIDTH * 5];
+        uint64_t in6 = out_ptr[WIDTH * 6];
+        uint64_t in7 = out_ptr[WIDTH * 7];
+        /* XOR network, presumably GF(2^8) multiply by 0x9C (TODO confirm). */
+        out1 = in2 ^ in5;
+        tmp0 = in0 ^ in3 ^ in6;
+        out3 = out1 ^ in0;
+        out6 = out1 ^ in6;
+        out7 = tmp0 ^ in7;
+        out4 = out7 ^ in4;
+        out2 = out4 ^ in1;
+        out0 = tmp0 ^ out2;
+        out5 = out0 ^ in5;
+        /* XOR in the matching plane of in and store back into out. */
+        out_ptr[0] = out0 ^ in_ptr[0];
+        out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH];
+        out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2];
+        out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3];
+        out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4];
+        out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5];
+        out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6];
+        out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7];
+        /* Advance to the next word column of both buffers. */
+        in_ptr++;
+        out_ptr++;
+    }
+}
+
+static void
+gf8_muladd_9D(void *out, void *in)
+{
+    unsigned int i;
+    uint64_t *in_ptr = (uint64_t *)in;   /* only read */
+    uint64_t *out_ptr = (uint64_t *)out; /* read, then overwritten */
+    /* In place: out = mix(out) ^ in, on 8 word planes WIDTH apart. */
+    for (i = 0; i < WIDTH; i++) {
+        uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+        uint64_t tmp0;
+        /* inN = old value of out plane N, not data from the in buffer. */
+        uint64_t in0 = out_ptr[0];
+        uint64_t in1 = out_ptr[WIDTH];
+        uint64_t in2 = out_ptr[WIDTH * 2];
+        uint64_t in3 = out_ptr[WIDTH * 3];
+        uint64_t in4 = out_ptr[WIDTH * 4];
+        uint64_t in5 = out_ptr[WIDTH * 5];
+        uint64_t in6 = out_ptr[WIDTH * 6];
+        uint64_t in7 = out_ptr[WIDTH * 7];
+        /* XOR network, presumably GF(2^8) multiply by 0x9D (TODO confirm). */
+        out6 = in2 ^ in5;
+        tmp0 = in0 ^ in3;
+        out5 = in1 ^ in4 ^ in7;
+        out1 = out6 ^ in1;
+        out3 = tmp0 ^ out6;
+        out7 = tmp0 ^ in6;
+        out0 = out5 ^ in0;
+        out4 = out7 ^ in7;
+        out2 = out5 ^ out7 ^ in2;
+        /* XOR in the matching plane of in and store back into out. */
+        out_ptr[0] = out0 ^ in_ptr[0];
+        out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH];
+        out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2];
+        out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3];
+        out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4];
+        out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5];
+        out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6];
+        out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7];
+        /* Advance to the next word column of both buffers. */
+        in_ptr++;
+        out_ptr++;
+    }
+}
+
+static void
+gf8_muladd_9E(void *out, void *in)
+{
+    unsigned int i;
+    uint64_t *in_ptr = (uint64_t *)in;   /* only read */
+    uint64_t *out_ptr = (uint64_t *)out; /* read, then overwritten */
+    /* In place: out = mix(out) ^ in, on 8 word planes WIDTH apart. */
+    for (i = 0; i < WIDTH; i++) {
+        uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+        uint64_t tmp0;
+        /* inN = old value of out plane N, not data from the in buffer. */
+        uint64_t in0 = out_ptr[0];
+        uint64_t in1 = out_ptr[WIDTH];
+        uint64_t in2 = out_ptr[WIDTH * 2];
+        uint64_t in3 = out_ptr[WIDTH * 3];
+        uint64_t in4 = out_ptr[WIDTH * 4];
+        uint64_t in5 = out_ptr[WIDTH * 5];
+        uint64_t in6 = out_ptr[WIDTH * 6];
+        uint64_t in7 = out_ptr[WIDTH * 7];
+        /* XOR network, presumably GF(2^8) multiply by 0x9E (TODO confirm). */
+        out0 = in1 ^ in4;
+        tmp0 = in0 ^ in5;
+        out6 = in2 ^ in6;
+        out7 = in0 ^ in3 ^ in7;
+        out4 = in0 ^ in4 ^ in6;
+        out5 = in1 ^ in5 ^ in7;
+        out1 = tmp0 ^ in2;
+        out3 = tmp0 ^ in7;
+        out2 = out4 ^ in3;
+        /* XOR in the matching plane of in and store back into out. */
+        out_ptr[0] = out0 ^ in_ptr[0];
+        out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH];
+        out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2];
+        out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3];
+        out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4];
+        out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5];
+        out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6];
+        out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7];
+        /* Advance to the next word column of both buffers. */
+        in_ptr++;
+        out_ptr++;
+    }
+}
+
+static void
+gf8_muladd_9F(void *out, void *in)
+{
+    unsigned int i;
+    uint64_t *in_ptr = (uint64_t *)in;   /* only read */
+    uint64_t *out_ptr = (uint64_t *)out; /* read, then overwritten */
+    /* In place: out = mix(out) ^ in, on 8 word planes WIDTH apart. */
+    for (i = 0; i < WIDTH; i++) {
+        uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+        uint64_t tmp0;
+        /* inN = old value of out plane N, not data from the in buffer. */
+        uint64_t in0 = out_ptr[0];
+        uint64_t in1 = out_ptr[WIDTH];
+        uint64_t in2 = out_ptr[WIDTH * 2];
+        uint64_t in3 = out_ptr[WIDTH * 3];
+        uint64_t in4 = out_ptr[WIDTH * 4];
+        uint64_t in5 = out_ptr[WIDTH * 5];
+        uint64_t in6 = out_ptr[WIDTH * 6];
+        uint64_t in7 = out_ptr[WIDTH * 7];
+        /* XOR network, presumably GF(2^8) multiply by 0x9F (TODO confirm). */
+        out6 = in2;
+        out7 = in0 ^ in3;
+        tmp0 = in0 ^ in1;
+        out4 = in0 ^ in6;
+        out5 = in1 ^ in7;
+        out1 = tmp0 ^ in2 ^ in5;
+        out2 = out7 ^ in2 ^ in4 ^ in6;
+        out3 = out7 ^ in5 ^ in7;
+        out0 = tmp0 ^ in4;
+        /* XOR in the matching plane of in and store back into out. */
+        out_ptr[0] = out0 ^ in_ptr[0];
+        out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH];
+        out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2];
+        out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3];
+        out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4];
+        out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5];
+        out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6];
+        out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7];
+        /* Advance to the next word column of both buffers. */
+        in_ptr++;
+        out_ptr++;
+    }
+}
+
+static void /* In place, per 64-bit word: out = X(out) ^ in, X being the XOR network below. */
+gf8_muladd_A0(void *out, void *in)
+{
+    unsigned int i;
+    uint64_t *in_ptr = (uint64_t *)in;   /* only read */
+    uint64_t *out_ptr = (uint64_t *)out; /* read, then overwritten */
+    /* NOTE(review): name suggests a bit-sliced GF(2^8) "out * 0xA0 + in" - confirm. */
+    for (i = 0; i < WIDTH; i++) { /* one word from each of 8 rows, WIDTH words apart */
+        uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+        uint64_t tmp0, tmp1, tmp2, tmp3;
+        /* Inputs of the network are the current words of out (not in), one per row. */
+        uint64_t in0 = out_ptr[0];
+        uint64_t in1 = out_ptr[WIDTH];
+        uint64_t in2 = out_ptr[WIDTH * 2];
+        uint64_t in3 = out_ptr[WIDTH * 3];
+        uint64_t in4 = out_ptr[WIDTH * 4];
+        uint64_t in5 = out_ptr[WIDTH * 5];
+        uint64_t in6 = out_ptr[WIDTH * 6];
+        uint64_t in7 = out_ptr[WIDTH * 7];
+        /* Keep this statement order: later lines reuse values computed by earlier ones. */
+        tmp0 = in1 ^ in6;
+        out2 = tmp0 ^ in7;
+        tmp1 = tmp0 ^ in5;
+        out6 = out2 ^ in3 ^ in4;
+        out0 = tmp1 ^ in3;
+        tmp2 = out0 ^ in2;
+        out3 = tmp2 ^ in7;
+        tmp3 = tmp2 ^ in1;
+        out5 = tmp3 ^ in0;
+        out4 = tmp3 ^ out6;
+        out7 = out5 ^ out6 ^ in1;
+        out1 = tmp1 ^ out4;
+        /* Write every row back, XORed with the word of in at the same index. */
+        out_ptr[0] = out0 ^ in_ptr[0];
+        out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH];
+        out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2];
+        out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3];
+        out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4];
+        out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5];
+        out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6];
+        out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7];
+        /* Move both cursors to the next word of each row. */
+        in_ptr++;
+        out_ptr++;
+    }
+}
+
+static void /* In place, per 64-bit word: out = X(out) ^ in, X being the XOR network below. */
+gf8_muladd_A1(void *out, void *in)
+{
+    unsigned int i;
+    uint64_t *in_ptr = (uint64_t *)in;   /* only read */
+    uint64_t *out_ptr = (uint64_t *)out; /* read, then overwritten */
+    /* NOTE(review): name suggests a bit-sliced GF(2^8) "out * 0xA1 + in" - confirm. */
+    for (i = 0; i < WIDTH; i++) { /* one word from each of 8 rows, WIDTH words apart */
+        uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+        uint64_t tmp0, tmp1, tmp2;
+        /* Inputs of the network are the current words of out (not in), one per row. */
+        uint64_t in0 = out_ptr[0];
+        uint64_t in1 = out_ptr[WIDTH];
+        uint64_t in2 = out_ptr[WIDTH * 2];
+        uint64_t in3 = out_ptr[WIDTH * 3];
+        uint64_t in4 = out_ptr[WIDTH * 4];
+        uint64_t in5 = out_ptr[WIDTH * 5];
+        uint64_t in6 = out_ptr[WIDTH * 6];
+        uint64_t in7 = out_ptr[WIDTH * 7];
+        /* Keep this statement order: later lines reuse values computed by earlier ones. */
+        tmp0 = in2 ^ in5;
+        tmp1 = tmp0 ^ in1;
+        tmp2 = tmp0 ^ in4;
+        out4 = tmp1 ^ in7;
+        out7 = tmp2 ^ in0;
+        out6 = tmp2 ^ out4 ^ in3;
+        out3 = out4 ^ in6;
+        out2 = out3 ^ in5;
+        out1 = out2 ^ in4;
+        out5 = out1 ^ out6 ^ in0;
+        out0 = tmp1 ^ out5;
+        /* Write every row back, XORed with the word of in at the same index. */
+        out_ptr[0] = out0 ^ in_ptr[0];
+        out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH];
+        out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2];
+        out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3];
+        out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4];
+        out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5];
+        out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6];
+        out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7];
+        /* Move both cursors to the next word of each row. */
+        in_ptr++;
+        out_ptr++;
+    }
+}
+
+static void /* In place, per 64-bit word: out = X(out) ^ in, X being the XOR network below. */
+gf8_muladd_A2(void *out, void *in)
+{
+    unsigned int i;
+    uint64_t *in_ptr = (uint64_t *)in;   /* only read */
+    uint64_t *out_ptr = (uint64_t *)out; /* read, then overwritten */
+    /* NOTE(review): name suggests a bit-sliced GF(2^8) "out * 0xA2 + in" - confirm. */
+    for (i = 0; i < WIDTH; i++) { /* one word from each of 8 rows, WIDTH words apart */
+        uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+        uint64_t tmp0;
+        /* Inputs of the network are the current words of out (not in), one per row. */
+        uint64_t in0 = out_ptr[0];
+        uint64_t in1 = out_ptr[WIDTH];
+        uint64_t in2 = out_ptr[WIDTH * 2];
+        uint64_t in3 = out_ptr[WIDTH * 3];
+        uint64_t in4 = out_ptr[WIDTH * 4];
+        uint64_t in5 = out_ptr[WIDTH * 5];
+        uint64_t in6 = out_ptr[WIDTH * 6];
+        uint64_t in7 = out_ptr[WIDTH * 7];
+        /* Keep this statement order: later lines reuse values computed by earlier ones. */
+        out2 = in6;
+        tmp0 = in1 ^ in3 ^ in5;
+        out3 = tmp0 ^ in6;
+        out4 = tmp0 ^ in2 ^ in4;
+        out0 = out3 ^ in7;
+        out6 = out0 ^ in4;
+        out1 = out0 ^ out4 ^ in0;
+        out7 = out1 ^ in5;
+        out5 = out7 ^ in3 ^ in7;
+        /* Write every row back, XORed with the word of in at the same index. */
+        out_ptr[0] = out0 ^ in_ptr[0];
+        out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH];
+        out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2];
+        out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3];
+        out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4];
+        out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5];
+        out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6];
+        out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7];
+        /* Move both cursors to the next word of each row. */
+        in_ptr++;
+        out_ptr++;
+    }
+}
+
+static void /* In place, per 64-bit word: out = X(out) ^ in, X being the XOR network below. */
+gf8_muladd_A3(void *out, void *in)
+{
+    unsigned int i;
+    uint64_t *in_ptr = (uint64_t *)in;   /* only read */
+    uint64_t *out_ptr = (uint64_t *)out; /* read, then overwritten */
+    /* NOTE(review): name suggests a bit-sliced GF(2^8) "out * 0xA3 + in" - confirm. */
+    for (i = 0; i < WIDTH; i++) { /* one word from each of 8 rows, WIDTH words apart */
+        uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+        uint64_t tmp0, tmp1;
+        /* Inputs of the network are the current words of out (not in), one per row. */
+        uint64_t in0 = out_ptr[0];
+        uint64_t in1 = out_ptr[WIDTH];
+        uint64_t in2 = out_ptr[WIDTH * 2];
+        uint64_t in3 = out_ptr[WIDTH * 3];
+        uint64_t in4 = out_ptr[WIDTH * 4];
+        uint64_t in5 = out_ptr[WIDTH * 5];
+        uint64_t in6 = out_ptr[WIDTH * 6];
+        uint64_t in7 = out_ptr[WIDTH * 7];
+        /* Keep this statement order: later lines reuse values computed by earlier ones. */
+        out2 = in2 ^ in6;
+        out3 = in1 ^ in5 ^ in6;
+        tmp0 = out2 ^ in0;
+        out4 = out2 ^ out3 ^ in3;
+        tmp1 = tmp0 ^ in4;
+        out0 = tmp0 ^ out4 ^ in7;
+        out5 = tmp1 ^ in3;
+        out7 = tmp1 ^ in5;
+        out1 = tmp1 ^ in1 ^ in7;
+        out6 = tmp1 ^ out0 ^ in2;
+        /* Write every row back, XORed with the word of in at the same index. */
+        out_ptr[0] = out0 ^ in_ptr[0];
+        out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH];
+        out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2];
+        out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3];
+        out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4];
+        out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5];
+        out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6];
+        out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7];
+        /* Move both cursors to the next word of each row. */
+        in_ptr++;
+        out_ptr++;
+    }
+}
+
+static void /* In place, per 64-bit word: out = X(out) ^ in, X being the XOR network below. */
+gf8_muladd_A4(void *out, void *in)
+{
+    unsigned int i;
+    uint64_t *in_ptr = (uint64_t *)in;   /* only read */
+    uint64_t *out_ptr = (uint64_t *)out; /* read, then overwritten */
+    /* NOTE(review): name suggests a bit-sliced GF(2^8) "out * 0xA4 + in" - confirm. */
+    for (i = 0; i < WIDTH; i++) { /* one word from each of 8 rows, WIDTH words apart */
+        uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+        uint64_t tmp0, tmp1, tmp2, tmp3, tmp4;
+        /* Inputs of the network are the current words of out (not in), one per row. */
+        uint64_t in0 = out_ptr[0];
+        uint64_t in1 = out_ptr[WIDTH];
+        uint64_t in2 = out_ptr[WIDTH * 2];
+        uint64_t in3 = out_ptr[WIDTH * 3];
+        uint64_t in4 = out_ptr[WIDTH * 4];
+        uint64_t in5 = out_ptr[WIDTH * 5];
+        uint64_t in6 = out_ptr[WIDTH * 6];
+        uint64_t in7 = out_ptr[WIDTH * 7];
+        /* Keep this statement order: later lines reuse values computed by earlier ones. */
+        tmp0 = in1 ^ in3;
+        tmp1 = in2 ^ in4;
+        tmp2 = in2 ^ in5;
+        tmp3 = in0 ^ in7;
+        out0 = tmp0 ^ in5;
+        out6 = tmp0 ^ in6 ^ in7;
+        out1 = tmp1 ^ in6;
+        out7 = tmp1 ^ tmp3;
+        out3 = tmp2 ^ in3;
+        tmp4 = tmp2 ^ out1;
+        out2 = tmp3 ^ in1;
+        out5 = tmp4 ^ out7;
+        out4 = tmp4 ^ in1;
+        /* Write every row back, XORed with the word of in at the same index. */
+        out_ptr[0] = out0 ^ in_ptr[0];
+        out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH];
+        out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2];
+        out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3];
+        out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4];
+        out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5];
+        out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6];
+        out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7];
+        /* Move both cursors to the next word of each row. */
+        in_ptr++;
+        out_ptr++;
+    }
+}
+
+static void /* In place, per 64-bit word: out = X(out) ^ in, X being the XOR network below. */
+gf8_muladd_A5(void *out, void *in)
+{
+    unsigned int i;
+    uint64_t *in_ptr = (uint64_t *)in;   /* only read */
+    uint64_t *out_ptr = (uint64_t *)out; /* read, then overwritten */
+    /* NOTE(review): name suggests a bit-sliced GF(2^8) "out * 0xA5 + in" - confirm. */
+    for (i = 0; i < WIDTH; i++) { /* one word from each of 8 rows, WIDTH words apart */
+        uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+        uint64_t tmp0, tmp1, tmp2;
+        /* Inputs of the network are the current words of out (not in), one per row. */
+        uint64_t in0 = out_ptr[0];
+        uint64_t in1 = out_ptr[WIDTH];
+        uint64_t in2 = out_ptr[WIDTH * 2];
+        uint64_t in3 = out_ptr[WIDTH * 3];
+        uint64_t in4 = out_ptr[WIDTH * 4];
+        uint64_t in5 = out_ptr[WIDTH * 5];
+        uint64_t in6 = out_ptr[WIDTH * 6];
+        uint64_t in7 = out_ptr[WIDTH * 7];
+        /* Keep this statement order: later lines reuse values computed by earlier ones. */
+        out3 = in2 ^ in5;
+        tmp0 = in1 ^ in6;
+        tmp1 = in0 ^ in1;
+        tmp2 = in2 ^ in4;
+        out6 = in1 ^ in3 ^ in7;
+        out4 = tmp0 ^ in5;
+        out1 = tmp0 ^ tmp2;
+        out0 = tmp1 ^ in3 ^ in5;
+        out2 = tmp1 ^ in2 ^ in7;
+        out7 = tmp2 ^ in0;
+        out5 = tmp0 ^ out2;
+        /* Write every row back, XORed with the word of in at the same index. */
+        out_ptr[0] = out0 ^ in_ptr[0];
+        out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH];
+        out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2];
+        out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3];
+        out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4];
+        out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5];
+        out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6];
+        out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7];
+        /* Move both cursors to the next word of each row. */
+        in_ptr++;
+        out_ptr++;
+    }
+}
+
+static void /* In place, per 64-bit word: out = X(out) ^ in, X being the XOR network below. */
+gf8_muladd_A6(void *out, void *in)
+{
+    unsigned int i;
+    uint64_t *in_ptr = (uint64_t *)in;   /* only read */
+    uint64_t *out_ptr = (uint64_t *)out; /* read, then overwritten */
+    /* NOTE(review): name suggests a bit-sliced GF(2^8) "out * 0xA6 + in" - confirm. */
+    for (i = 0; i < WIDTH; i++) { /* one word from each of 8 rows, WIDTH words apart */
+        uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+        /* Inputs of the network are the current words of out (not in), one per row. */
+        uint64_t in0 = out_ptr[0];
+        uint64_t in1 = out_ptr[WIDTH];
+        uint64_t in2 = out_ptr[WIDTH * 2];
+        uint64_t in3 = out_ptr[WIDTH * 3];
+        uint64_t in4 = out_ptr[WIDTH * 4];
+        uint64_t in5 = out_ptr[WIDTH * 5];
+        uint64_t in6 = out_ptr[WIDTH * 6];
+        uint64_t in7 = out_ptr[WIDTH * 7];
+        /* Keep this statement order: later lines reuse values computed by earlier ones. */
+        out2 = in0;
+        out3 = in3 ^ in5 ^ in7;
+        out1 = in0 ^ in2 ^ in4 ^ in6;
+        out0 = out3 ^ in1;
+        out7 = out1 ^ in7;
+        out6 = out0 ^ in6;
+        out5 = out7 ^ in5;
+        out4 = out6 ^ in4;
+        /* Write every row back, XORed with the word of in at the same index. */
+        out_ptr[0] = out0 ^ in_ptr[0];
+        out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH];
+        out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2];
+        out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3];
+        out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4];
+        out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5];
+        out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6];
+        out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7];
+        /* Move both cursors to the next word of each row. */
+        in_ptr++;
+        out_ptr++;
+    }
+}
+
+static void /* In place, per 64-bit word: out = X(out) ^ in, X being the XOR network below. */
+gf8_muladd_A7(void *out, void *in)
+{
+    unsigned int i;
+    uint64_t *in_ptr = (uint64_t *)in;   /* only read */
+    uint64_t *out_ptr = (uint64_t *)out; /* read, then overwritten */
+    /* NOTE(review): name suggests a bit-sliced GF(2^8) "out * 0xA7 + in" - confirm. */
+    for (i = 0; i < WIDTH; i++) { /* one word from each of 8 rows, WIDTH words apart */
+        uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+        /* Inputs of the network are the current words of out (not in), one per row. */
+        uint64_t in0 = out_ptr[0];
+        uint64_t in1 = out_ptr[WIDTH];
+        uint64_t in2 = out_ptr[WIDTH * 2];
+        uint64_t in3 = out_ptr[WIDTH * 3];
+        uint64_t in4 = out_ptr[WIDTH * 4];
+        uint64_t in5 = out_ptr[WIDTH * 5];
+        uint64_t in6 = out_ptr[WIDTH * 6];
+        uint64_t in7 = out_ptr[WIDTH * 7];
+        /* Keep this statement order: later lines reuse values computed by earlier ones. */
+        out2 = in0 ^ in2;
+        out3 = in5 ^ in7;
+        out7 = out2 ^ in4 ^ in6;
+        out6 = out3 ^ in1 ^ in3;
+        out1 = out7 ^ in1;
+        out5 = out7 ^ in7;
+        out0 = out6 ^ in0;
+        out4 = out6 ^ in6;
+        /* Write every row back, XORed with the word of in at the same index. */
+        out_ptr[0] = out0 ^ in_ptr[0];
+        out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH];
+        out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2];
+        out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3];
+        out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4];
+        out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5];
+        out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6];
+        out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7];
+        /* Move both cursors to the next word of each row. */
+        in_ptr++;
+        out_ptr++;
+    }
+}
+
+static void /* In place, per 64-bit word: out = X(out) ^ in, X being the XOR network below. */
+gf8_muladd_A8(void *out, void *in)
+{
+    unsigned int i;
+    uint64_t *in_ptr = (uint64_t *)in;   /* only read */
+    uint64_t *out_ptr = (uint64_t *)out; /* read, then overwritten */
+    /* NOTE(review): name suggests a bit-sliced GF(2^8) "out * 0xA8 + in" - confirm. */
+    for (i = 0; i < WIDTH; i++) { /* one word from each of 8 rows, WIDTH words apart */
+        uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+        uint64_t tmp0, tmp1, tmp2;
+        /* Inputs of the network are the current words of out (not in), one per row. */
+        uint64_t in0 = out_ptr[0];
+        uint64_t in1 = out_ptr[WIDTH];
+        uint64_t in2 = out_ptr[WIDTH * 2];
+        uint64_t in3 = out_ptr[WIDTH * 3];
+        uint64_t in4 = out_ptr[WIDTH * 4];
+        uint64_t in5 = out_ptr[WIDTH * 5];
+        uint64_t in6 = out_ptr[WIDTH * 6];
+        uint64_t in7 = out_ptr[WIDTH * 7];
+        /* Keep this statement order: later lines reuse values computed by earlier ones. */
+        tmp0 = in2 ^ in4;
+        tmp1 = in1 ^ in6;
+        tmp2 = in0 ^ in2 ^ in7;
+        out1 = tmp0 ^ in7;
+        out4 = tmp0 ^ in6;
+        out0 = tmp1 ^ in3;
+        out2 = tmp1 ^ in5;
+        out6 = tmp1 ^ in4;
+        out7 = tmp2 ^ in5;
+        out3 = tmp2 ^ out0 ^ in6;
+        out5 = out7 ^ in2 ^ in3;
+        /* Write every row back, XORed with the word of in at the same index. */
+        out_ptr[0] = out0 ^ in_ptr[0];
+        out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH];
+        out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2];
+        out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3];
+        out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4];
+        out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5];
+        out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6];
+        out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7];
+        /* Move both cursors to the next word of each row. */
+        in_ptr++;
+        out_ptr++;
+    }
+}
+
+static void /* In place, per 64-bit word: out = X(out) ^ in, X being the XOR network below. */
+gf8_muladd_A9(void *out, void *in)
+{
+    unsigned int i;
+    uint64_t *in_ptr = (uint64_t *)in;   /* only read */
+    uint64_t *out_ptr = (uint64_t *)out; /* read, then overwritten */
+    /* NOTE(review): name suggests a bit-sliced GF(2^8) "out * 0xA9 + in" - confirm. */
+    for (i = 0; i < WIDTH; i++) { /* one word from each of 8 rows, WIDTH words apart */
+        uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+        /* Inputs of the network are the current words of out (not in), one per row. */
+        uint64_t in0 = out_ptr[0];
+        uint64_t in1 = out_ptr[WIDTH];
+        uint64_t in2 = out_ptr[WIDTH * 2];
+        uint64_t in3 = out_ptr[WIDTH * 3];
+        uint64_t in4 = out_ptr[WIDTH * 4];
+        uint64_t in5 = out_ptr[WIDTH * 5];
+        uint64_t in6 = out_ptr[WIDTH * 6];
+        uint64_t in7 = out_ptr[WIDTH * 7];
+        /* Keep this statement order: later lines reuse values computed by earlier ones. */
+        out4 = in2 ^ in6;
+        out6 = in1 ^ in4;
+        out7 = in0 ^ in2 ^ in5;
+        out5 = in0 ^ in3 ^ in7;
+        out2 = out4 ^ in1 ^ in5;
+        out1 = out6 ^ in2 ^ in7;
+        out0 = out2 ^ out7 ^ in3;
+        out3 = out1 ^ in0 ^ in4;
+        /* Write every row back, XORed with the word of in at the same index. */
+        out_ptr[0] = out0 ^ in_ptr[0];
+        out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH];
+        out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2];
+        out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3];
+        out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4];
+        out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5];
+        out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6];
+        out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7];
+        /* Move both cursors to the next word of each row. */
+        in_ptr++;
+        out_ptr++;
+    }
+}
+
+static void /* In place, per 64-bit word: out = X(out) ^ in, X being the XOR network below. */
+gf8_muladd_AA(void *out, void *in)
+{
+    unsigned int i;
+    uint64_t *in_ptr = (uint64_t *)in;   /* only read */
+    uint64_t *out_ptr = (uint64_t *)out; /* read, then overwritten */
+    /* NOTE(review): name suggests a bit-sliced GF(2^8) "out * 0xAA + in" - confirm. */
+    for (i = 0; i < WIDTH; i++) { /* one word from each of 8 rows, WIDTH words apart */
+        uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+        uint64_t tmp0, tmp1, tmp2;
+        /* Inputs of the network are the current words of out (not in), one per row. */
+        uint64_t in0 = out_ptr[0];
+        uint64_t in1 = out_ptr[WIDTH];
+        uint64_t in2 = out_ptr[WIDTH * 2];
+        uint64_t in3 = out_ptr[WIDTH * 3];
+        uint64_t in4 = out_ptr[WIDTH * 4];
+        uint64_t in5 = out_ptr[WIDTH * 5];
+        uint64_t in6 = out_ptr[WIDTH * 6];
+        uint64_t in7 = out_ptr[WIDTH * 7];
+        /* Keep this statement order: later lines reuse values computed by earlier ones. */
+        tmp0 = in0 ^ in2;
+        tmp1 = in1 ^ in3;
+        tmp2 = in6 ^ in7;
+        out1 = tmp0 ^ in4 ^ in7;
+        out3 = tmp1 ^ in0;
+        out0 = tmp1 ^ tmp2;
+        out2 = tmp2 ^ in5;
+        out7 = tmp0 ^ out2;
+        out6 = out1 ^ out7 ^ in1;
+        out5 = out0 ^ out6 ^ in0;
+        out4 = out5 ^ out7 ^ in7;
+        /* Write every row back, XORed with the word of in at the same index. */
+        out_ptr[0] = out0 ^ in_ptr[0];
+        out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH];
+        out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2];
+        out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3];
+        out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4];
+        out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5];
+        out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6];
+        out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7];
+        /* Move both cursors to the next word of each row. */
+        in_ptr++;
+        out_ptr++;
+    }
+}
+
+static void /* In place, per 64-bit word: out = X(out) ^ in, X being the XOR network below. */
+gf8_muladd_AB(void *out, void *in)
+{
+    unsigned int i;
+    uint64_t *in_ptr = (uint64_t *)in;   /* only read */
+    uint64_t *out_ptr = (uint64_t *)out; /* read, then overwritten */
+    /* NOTE(review): name suggests a bit-sliced GF(2^8) "out * 0xAB + in" - confirm. */
+    for (i = 0; i < WIDTH; i++) { /* one word from each of 8 rows, WIDTH words apart */
+        uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+        uint64_t tmp0, tmp1;
+        /* Inputs of the network are the current words of out (not in), one per row. */
+        uint64_t in0 = out_ptr[0];
+        uint64_t in1 = out_ptr[WIDTH];
+        uint64_t in2 = out_ptr[WIDTH * 2];
+        uint64_t in3 = out_ptr[WIDTH * 3];
+        uint64_t in4 = out_ptr[WIDTH * 4];
+        uint64_t in5 = out_ptr[WIDTH * 5];
+        uint64_t in6 = out_ptr[WIDTH * 6];
+        uint64_t in7 = out_ptr[WIDTH * 7];
+        /* Keep this statement order: later lines reuse values computed by earlier ones. */
+        out3 = in0 ^ in1;
+        tmp0 = in1 ^ in4;
+        tmp1 = in0 ^ in7;
+        out6 = tmp0 ^ in5;
+        out1 = tmp0 ^ tmp1 ^ in2;
+        out5 = tmp1 ^ in3 ^ in4;
+        out0 = tmp0 ^ out5 ^ in6;
+        out4 = out0 ^ out3 ^ in2;
+        out2 = out4 ^ in3 ^ in5;
+        out7 = tmp1 ^ out2;
+        /* Write every row back, XORed with the word of in at the same index. */
+        out_ptr[0] = out0 ^ in_ptr[0];
+        out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH];
+        out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2];
+        out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3];
+        out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4];
+        out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5];
+        out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6];
+        out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7];
+        /* Move both cursors to the next word of each row. */
+        in_ptr++;
+        out_ptr++;
+    }
+}
+
+static void /* In place, per 64-bit word: out = X(out) ^ in, X being the XOR network below. */
+gf8_muladd_AC(void *out, void *in)
+{
+    unsigned int i;
+    uint64_t *in_ptr = (uint64_t *)in;   /* only read */
+    uint64_t *out_ptr = (uint64_t *)out; /* read, then overwritten */
+    /* NOTE(review): name suggests a bit-sliced GF(2^8) "out * 0xAC + in" - confirm. */
+    for (i = 0; i < WIDTH; i++) { /* one word from each of 8 rows, WIDTH words apart */
+        uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+        uint64_t tmp0;
+        /* Inputs of the network are the current words of out (not in), one per row. */
+        uint64_t in0 = out_ptr[0];
+        uint64_t in1 = out_ptr[WIDTH];
+        uint64_t in2 = out_ptr[WIDTH * 2];
+        uint64_t in3 = out_ptr[WIDTH * 3];
+        uint64_t in4 = out_ptr[WIDTH * 4];
+        uint64_t in5 = out_ptr[WIDTH * 5];
+        uint64_t in6 = out_ptr[WIDTH * 6];
+        uint64_t in7 = out_ptr[WIDTH * 7];
+        /* Keep this statement order: later lines reuse values computed by earlier ones. */
+        out0 = in1 ^ in3;
+        out1 = in2 ^ in4;
+        tmp0 = in0 ^ in2;
+        out4 = in4 ^ in7;
+        out5 = in0 ^ in5;
+        out6 = in1 ^ in6;
+        out7 = tmp0 ^ in7;
+        out3 = tmp0 ^ in3 ^ in6;
+        out2 = out5 ^ in1;
+        /* Write every row back, XORed with the word of in at the same index. */
+        out_ptr[0] = out0 ^ in_ptr[0];
+        out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH];
+        out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2];
+        out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3];
+        out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4];
+        out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5];
+        out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6];
+        out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7];
+        /* Move both cursors to the next word of each row. */
+        in_ptr++;
+        out_ptr++;
+    }
+}
+
+static void /* In place, per 64-bit word: out = X(out) ^ in, X being the XOR network below. */
+gf8_muladd_AD(void *out, void *in)
+{
+    unsigned int i;
+    uint64_t *in_ptr = (uint64_t *)in;   /* only read */
+    uint64_t *out_ptr = (uint64_t *)out; /* read, then overwritten */
+    /* NOTE(review): name suggests a bit-sliced GF(2^8) "out * 0xAD + in" - confirm. */
+    for (i = 0; i < WIDTH; i++) { /* one word from each of 8 rows, WIDTH words apart */
+        uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+        /* Inputs of the network are the current words of out (not in), one per row. */
+        uint64_t in0 = out_ptr[0];
+        uint64_t in1 = out_ptr[WIDTH];
+        uint64_t in2 = out_ptr[WIDTH * 2];
+        uint64_t in3 = out_ptr[WIDTH * 3];
+        uint64_t in4 = out_ptr[WIDTH * 4];
+        uint64_t in5 = out_ptr[WIDTH * 5];
+        uint64_t in6 = out_ptr[WIDTH * 6];
+        uint64_t in7 = out_ptr[WIDTH * 7];
+        /* Keep this statement order: later lines reuse values computed by earlier ones. */
+        out4 = in7;
+        out5 = in0;
+        out6 = in1;
+        out7 = in0 ^ in2;
+        out0 = in0 ^ in1 ^ in3;
+        out2 = out7 ^ in1 ^ in5;
+        out1 = in1 ^ in2 ^ in4;
+        out3 = out7 ^ in6;
+        /* Write every row back, XORed with the word of in at the same index. */
+        out_ptr[0] = out0 ^ in_ptr[0];
+        out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH];
+        out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2];
+        out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3];
+        out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4];
+        out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5];
+        out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6];
+        out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7];
+        /* Move both cursors to the next word of each row. */
+        in_ptr++;
+        out_ptr++;
+    }
+}
+
+static void /* In place, per 64-bit word: out = X(out) ^ in, X being the XOR network below. */
+gf8_muladd_AE(void *out, void *in)
+{
+    unsigned int i;
+    uint64_t *in_ptr = (uint64_t *)in;   /* only read */
+    uint64_t *out_ptr = (uint64_t *)out; /* read, then overwritten */
+    /* NOTE(review): name suggests a bit-sliced GF(2^8) "out * 0xAE + in" - confirm. */
+    for (i = 0; i < WIDTH; i++) { /* one word from each of 8 rows, WIDTH words apart */
+        uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+        uint64_t tmp0, tmp1, tmp2;
+        /* Inputs of the network are the current words of out (not in), one per row. */
+        uint64_t in0 = out_ptr[0];
+        uint64_t in1 = out_ptr[WIDTH];
+        uint64_t in2 = out_ptr[WIDTH * 2];
+        uint64_t in3 = out_ptr[WIDTH * 3];
+        uint64_t in4 = out_ptr[WIDTH * 4];
+        uint64_t in5 = out_ptr[WIDTH * 5];
+        uint64_t in6 = out_ptr[WIDTH * 6];
+        uint64_t in7 = out_ptr[WIDTH * 7];
+        /* Keep this statement order: later lines reuse values computed by earlier ones. */
+        out4 = in3 ^ in4;
+        tmp0 = in0 ^ in4;
+        tmp1 = in0 ^ in7;
+        out0 = in1 ^ in3 ^ in7;
+        out1 = tmp0 ^ in2;
+        out5 = tmp0 ^ in5;
+        tmp2 = tmp1 ^ in6;
+        out2 = tmp1 ^ in5;
+        out3 = tmp2 ^ in3;
+        out7 = tmp2 ^ in2;
+        out6 = tmp2 ^ out2 ^ in1;
+        /* Write every row back, XORed with the word of in at the same index. */
+        out_ptr[0] = out0 ^ in_ptr[0];
+        out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH];
+        out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2];
+        out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3];
+        out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4];
+        out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5];
+        out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6];
+        out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7];
+        /* Move both cursors to the next word of each row. */
+        in_ptr++;
+        out_ptr++;
+    }
+}
+
+static void /* In place, per 64-bit word: out = X(out) ^ in, X being the XOR network below. */
+gf8_muladd_AF(void *out, void *in)
+{
+    unsigned int i;
+    uint64_t *in_ptr = (uint64_t *)in;   /* only read */
+    uint64_t *out_ptr = (uint64_t *)out; /* read, then overwritten */
+    /* NOTE(review): name suggests a bit-sliced GF(2^8) "out * 0xAF + in" - confirm. */
+    for (i = 0; i < WIDTH; i++) { /* one word from each of 8 rows, WIDTH words apart */
+        uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+        uint64_t tmp0;
+        /* Inputs of the network are the current words of out (not in), one per row. */
+        uint64_t in0 = out_ptr[0];
+        uint64_t in1 = out_ptr[WIDTH];
+        uint64_t in2 = out_ptr[WIDTH * 2];
+        uint64_t in3 = out_ptr[WIDTH * 3];
+        uint64_t in4 = out_ptr[WIDTH * 4];
+        uint64_t in5 = out_ptr[WIDTH * 5];
+        uint64_t in6 = out_ptr[WIDTH * 6];
+        uint64_t in7 = out_ptr[WIDTH * 7];
+        /* Keep this statement order: later lines reuse values computed by earlier ones. */
+        out4 = in3;
+        tmp0 = in0 ^ in7;
+        out5 = in0 ^ in4;
+        out6 = in1 ^ in5;
+        out7 = in0 ^ in2 ^ in6;
+        out0 = tmp0 ^ in1 ^ in3;
+        out3 = tmp0 ^ in6;
+        out2 = tmp0 ^ in2 ^ in5;
+        out1 = out5 ^ in1 ^ in2;
+        /* Write every row back, XORed with the word of in at the same index. */
+        out_ptr[0] = out0 ^ in_ptr[0];
+        out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH];
+        out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2];
+        out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3];
+        out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4];
+        out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5];
+        out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6];
+        out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7];
+        /* Move both cursors to the next word of each row. */
+        in_ptr++;
+        out_ptr++;
+    }
+}
+
+static void /* In place, per 64-bit word: out = X(out) ^ in, X being the XOR network below. */
+gf8_muladd_B0(void *out, void *in)
+{
+    unsigned int i;
+    uint64_t *in_ptr = (uint64_t *)in;   /* only read */
+    uint64_t *out_ptr = (uint64_t *)out; /* read, then overwritten */
+    /* NOTE(review): name suggests a bit-sliced GF(2^8) "out * 0xB0 + in" - confirm. */
+    for (i = 0; i < WIDTH; i++) { /* one word from each of 8 rows, WIDTH words apart */
+        uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+        uint64_t tmp0, tmp1, tmp2, tmp3;
+        /* Inputs of the network are the current words of out (not in), one per row. */
+        uint64_t in0 = out_ptr[0];
+        uint64_t in1 = out_ptr[WIDTH];
+        uint64_t in2 = out_ptr[WIDTH * 2];
+        uint64_t in3 = out_ptr[WIDTH * 3];
+        uint64_t in4 = out_ptr[WIDTH * 4];
+        uint64_t in5 = out_ptr[WIDTH * 5];
+        uint64_t in6 = out_ptr[WIDTH * 6];
+        uint64_t in7 = out_ptr[WIDTH * 7];
+        /* Keep this statement order: later lines reuse values computed by earlier ones. */
+        tmp0 = in1 ^ in4;
+        tmp1 = in3 ^ in6;
+        out2 = tmp0 ^ in7;
+        tmp2 = tmp0 ^ tmp1;
+        out0 = tmp2 ^ in5;
+        out3 = tmp2 ^ in2;
+        out6 = out3 ^ in6;
+        tmp3 = out6 ^ in0 ^ in1;
+        out7 = tmp3 ^ in5;
+        out5 = tmp3 ^ out2;
+        out1 = out0 ^ out5 ^ in0;
+        out4 = tmp1 ^ out5;
+        /* Write every row back, XORed with the word of in at the same index. */
+        out_ptr[0] = out0 ^ in_ptr[0];
+        out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH];
+        out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2];
+        out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3];
+        out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4];
+        out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5];
+        out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6];
+        out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7];
+        /* Move both cursors to the next word of each row. */
+        in_ptr++;
+        out_ptr++;
+    }
+}
+
+static void /* In place, per 64-bit word: out = X(out) ^ in, X being the XOR network below. */
+gf8_muladd_B1(void *out, void *in)
+{
+    unsigned int i;
+    uint64_t *in_ptr = (uint64_t *)in;   /* only read */
+    uint64_t *out_ptr = (uint64_t *)out; /* read, then overwritten */
+    /* NOTE(review): name suggests a bit-sliced GF(2^8) "out * 0xB1 + in" - confirm. */
+    for (i = 0; i < WIDTH; i++) { /* one word from each of 8 rows, WIDTH words apart */
+        uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+        uint64_t tmp0, tmp1;
+        /* Inputs of the network are the current words of out (not in), one per row. */
+        uint64_t in0 = out_ptr[0];
+        uint64_t in1 = out_ptr[WIDTH];
+        uint64_t in2 = out_ptr[WIDTH * 2];
+        uint64_t in3 = out_ptr[WIDTH * 3];
+        uint64_t in4 = out_ptr[WIDTH * 4];
+        uint64_t in5 = out_ptr[WIDTH * 5];
+        uint64_t in6 = out_ptr[WIDTH * 6];
+        uint64_t in7 = out_ptr[WIDTH * 7];
+        /* Keep this statement order: later lines reuse values computed by earlier ones. */
+        tmp0 = in1 ^ in4;
+        out2 = tmp0 ^ in2 ^ in7;
+        tmp1 = out2 ^ in6;
+        out1 = tmp1 ^ in5;
+        out3 = tmp1 ^ in7;
+        out4 = tmp1 ^ in0;
+        out6 = out3 ^ in3;
+        out0 = out6 ^ in0 ^ in2 ^ in5;
+        out5 = tmp1 ^ out0 ^ in1;
+        out7 = tmp0 ^ out5;
+        /* Write every row back, XORed with the word of in at the same index. */
+        out_ptr[0] = out0 ^ in_ptr[0];
+        out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH];
+        out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2];
+        out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3];
+        out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4];
+        out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5];
+        out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6];
+        out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7];
+        /* Move both cursors to the next word of each row. */
+        in_ptr++;
+        out_ptr++;
+    }
+}
+
+static void /* In place, per 64-bit word: out = X(out) ^ in, X being the XOR network below. */
+gf8_muladd_B2(void *out, void *in)
+{
+    unsigned int i;
+    uint64_t *in_ptr = (uint64_t *)in;   /* only read */
+    uint64_t *out_ptr = (uint64_t *)out; /* read, then overwritten */
+    /* NOTE(review): name suggests a bit-sliced GF(2^8) "out * 0xB2 + in" - confirm. */
+    for (i = 0; i < WIDTH; i++) { /* one word from each of 8 rows, WIDTH words apart */
+        uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+        uint64_t tmp0, tmp1, tmp2, tmp3, tmp4;
+        /* Inputs of the network are the current words of out (not in), one per row. */
+        uint64_t in0 = out_ptr[0];
+        uint64_t in1 = out_ptr[WIDTH];
+        uint64_t in2 = out_ptr[WIDTH * 2];
+        uint64_t in3 = out_ptr[WIDTH * 3];
+        uint64_t in4 = out_ptr[WIDTH * 4];
+        uint64_t in5 = out_ptr[WIDTH * 5];
+        uint64_t in6 = out_ptr[WIDTH * 6];
+        uint64_t in7 = out_ptr[WIDTH * 7];
+        /* Keep this statement order: later lines reuse values computed by earlier ones. */
+        out2 = in4;
+        tmp0 = in4 ^ in7;
+        tmp1 = in1 ^ in3 ^ in6;
+        out3 = tmp0 ^ tmp1;
+        tmp2 = tmp1 ^ in0;
+        out0 = out3 ^ in5;
+        out4 = tmp2 ^ in2;
+        tmp3 = out4 ^ in6;
+        out5 = tmp0 ^ tmp3;
+        out1 = tmp3 ^ out0;
+        tmp4 = out1 ^ in7;
+        out7 = tmp4 ^ in3;
+        out6 = tmp2 ^ tmp4;
+        /* Write every row back, XORed with the word of in at the same index. */
+        out_ptr[0] = out0 ^ in_ptr[0];
+        out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH];
+        out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2];
+        out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3];
+        out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4];
+        out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5];
+        out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6];
+        out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7];
+        /* Move both cursors to the next word of each row. */
+        in_ptr++;
+        out_ptr++;
+    }
+}
+
+static void /* In place, per 64-bit word: out = X(out) ^ in, X being the XOR network below. */
+gf8_muladd_B3(void *out, void *in)
+{
+    unsigned int i;
+    uint64_t *in_ptr = (uint64_t *)in;   /* only read */
+    uint64_t *out_ptr = (uint64_t *)out; /* read, then overwritten */
+    /* NOTE(review): name suggests a bit-sliced GF(2^8) "out * 0xB3 + in" - confirm. */
+    for (i = 0; i < WIDTH; i++) { /* one word from each of 8 rows, WIDTH words apart */
+        uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+        uint64_t tmp0, tmp1, tmp2;
+        /* Inputs of the network are the current words of out (not in), one per row. */
+        uint64_t in0 = out_ptr[0];
+        uint64_t in1 = out_ptr[WIDTH];
+        uint64_t in2 = out_ptr[WIDTH * 2];
+        uint64_t in3 = out_ptr[WIDTH * 3];
+        uint64_t in4 = out_ptr[WIDTH * 4];
+        uint64_t in5 = out_ptr[WIDTH * 5];
+        uint64_t in6 = out_ptr[WIDTH * 6];
+        uint64_t in7 = out_ptr[WIDTH * 7];
+        /* Keep this statement order: later lines reuse values computed by earlier ones. */
+        out2 = in2 ^ in4;
+        tmp0 = in0 ^ in5;
+        tmp1 = in1 ^ in6;
+        out3 = tmp1 ^ in4 ^ in7;
+        tmp2 = tmp0 ^ out3;
+        out0 = tmp2 ^ in3;
+        out1 = tmp2 ^ in2;
+        out5 = out0 ^ in2 ^ in6;
+        out7 = tmp1 ^ out5;
+        out4 = out7 ^ in1 ^ in5 ^ in7;
+        out6 = tmp0 ^ out4;
+        /* Write every row back, XORed with the word of in at the same index. */
+        out_ptr[0] = out0 ^ in_ptr[0];
+        out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH];
+        out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2];
+        out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3];
+        out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4];
+        out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5];
+        out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6];
+        out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7];
+        /* Move both cursors to the next word of each row. */
+        in_ptr++;
+        out_ptr++;
+    }
+}
+
+static void /* In place, per 64-bit word: out = X(out) ^ in, X being the XOR network below. */
+gf8_muladd_B4(void *out, void *in)
+{
+    unsigned int i;
+    uint64_t *in_ptr = (uint64_t *)in;   /* only read */
+    uint64_t *out_ptr = (uint64_t *)out; /* read, then overwritten */
+    /* NOTE(review): name suggests a bit-sliced GF(2^8) "out * 0xB4 + in" - confirm. */
+    for (i = 0; i < WIDTH; i++) { /* one word from each of 8 rows, WIDTH words apart */
+        uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+        uint64_t tmp0;
+        /* Inputs of the network are the current words of out (not in), one per row. */
+        uint64_t in0 = out_ptr[0];
+        uint64_t in1 = out_ptr[WIDTH];
+        uint64_t in2 = out_ptr[WIDTH * 2];
+        uint64_t in3 = out_ptr[WIDTH * 3];
+        uint64_t in4 = out_ptr[WIDTH * 4];
+        uint64_t in5 = out_ptr[WIDTH * 5];
+        uint64_t in6 = out_ptr[WIDTH * 6];
+        uint64_t in7 = out_ptr[WIDTH * 7];
+        /* Keep this statement order: later lines reuse values computed by earlier ones. */
+        out4 = in0 ^ in1;
+        out5 = out4 ^ in2;
+        tmp0 = out4 ^ in4;
+        out6 = out5 ^ in0 ^ in3;
+        out7 = tmp0 ^ out6;
+        out2 = tmp0 ^ in6 ^ in7;
+        out3 = out7 ^ in0 ^ in7;
+        out0 = out5 ^ out7 ^ in5;
+        out1 = out0 ^ out6 ^ in6;
+        /* Write every row back, XORed with the word of in at the same index. */
+        out_ptr[0] = out0 ^ in_ptr[0];
+        out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH];
+        out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2];
+        out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3];
+        out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4];
+        out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5];
+        out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6];
+        out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7];
+        /* Move both cursors to the next word of each row. */
+        in_ptr++;
+        out_ptr++;
+    }
+}
+
+static void /* In place, per 64-bit word: out = X(out) ^ in, X being the XOR network below. */
+gf8_muladd_B5(void *out, void *in)
+{
+    unsigned int i;
+    uint64_t *in_ptr = (uint64_t *)in;   /* only read */
+    uint64_t *out_ptr = (uint64_t *)out; /* read, then overwritten */
+    /* NOTE(review): name suggests a bit-sliced GF(2^8) "out * 0xB5 + in" - confirm. */
+    for (i = 0; i < WIDTH; i++) { /* one word from each of 8 rows, WIDTH words apart */
+        uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+        uint64_t tmp0, tmp1, tmp2;
+        /* Inputs of the network are the current words of out (not in), one per row. */
+        uint64_t in0 = out_ptr[0];
+        uint64_t in1 = out_ptr[WIDTH];
+        uint64_t in2 = out_ptr[WIDTH * 2];
+        uint64_t in3 = out_ptr[WIDTH * 3];
+        uint64_t in4 = out_ptr[WIDTH * 4];
+        uint64_t in5 = out_ptr[WIDTH * 5];
+        uint64_t in6 = out_ptr[WIDTH * 6];
+        uint64_t in7 = out_ptr[WIDTH * 7];
+        /* Keep this statement order: later lines reuse values computed by earlier ones. */
+        tmp0 = in0 ^ in1;
+        tmp1 = in2 ^ in4;
+        out4 = tmp0 ^ in4;
+        out3 = tmp1 ^ in7;
+        tmp2 = out4 ^ in5;
+        out7 = out3 ^ in0 ^ in3;
+        out0 = tmp2 ^ in3;
+        out2 = tmp0 ^ out3 ^ in6;
+        out5 = tmp1 ^ tmp2;
+        out6 = out2 ^ out7 ^ in2;
+        out1 = tmp0 ^ out0 ^ out6;
+        /* Write every row back, XORed with the word of in at the same index. */
+        out_ptr[0] = out0 ^ in_ptr[0];
+        out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH];
+        out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2];
+        out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3];
+        out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4];
+        out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5];
+        out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6];
+        out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7];
+        /* Move both cursors to the next word of each row. */
+        in_ptr++;
+        out_ptr++;
+    }
+}
+
+static void
+gf8_muladd_B6(void *out, void *in) /* NOTE(review): presumably out = out * 0xB6 ^ in in GF(2^8); confirm */
+{
+    unsigned int i;
+    uint64_t *in_ptr = (uint64_t *)in;
+    uint64_t *out_ptr = (uint64_t *)out;
+    /* Each pass rewrites 8 words of 'out' (WIDTH apart) and XORs in 'in'. */
+    for (i = 0; i < WIDTH; i++) {
+        uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+        uint64_t tmp0, tmp1, tmp2, tmp3;
+        /* inN are loaded from 'out' (not 'in'): the transform is in place. */
+        uint64_t in0 = out_ptr[0];
+        uint64_t in1 = out_ptr[WIDTH];
+        uint64_t in2 = out_ptr[WIDTH * 2];
+        uint64_t in3 = out_ptr[WIDTH * 3];
+        uint64_t in4 = out_ptr[WIDTH * 4];
+        uint64_t in5 = out_ptr[WIDTH * 5];
+        uint64_t in6 = out_ptr[WIDTH * 6];
+        uint64_t in7 = out_ptr[WIDTH * 7];
+        /* Fixed XOR network; order matters, later lines reuse earlier results. */
+        out3 = in3 ^ in4;
+        tmp0 = in1 ^ in2;
+        tmp1 = in0 ^ in4;
+        tmp2 = in3 ^ in5;
+        tmp3 = out3 ^ in1 ^ in7;
+        out5 = tmp0 ^ tmp1;
+        out6 = tmp0 ^ tmp2;
+        out2 = tmp1 ^ in6;
+        out4 = tmp1 ^ tmp3;
+        out0 = tmp3 ^ in5;
+        out1 = out2 ^ in2 ^ in5;
+        out7 = tmp2 ^ out1;
+        /* Store each result XORed with the same-position word of 'in'. */
+        out_ptr[0] = out0 ^ in_ptr[0];
+        out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH];
+        out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2];
+        out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3];
+        out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4];
+        out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5];
+        out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6];
+        out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7];
+        /* Step one word forward in both buffers. */
+        in_ptr++;
+        out_ptr++;
+    }
+}
+
+static void
+gf8_muladd_B7(void *out, void *in) /* NOTE(review): presumably out = out * 0xB7 ^ in in GF(2^8); confirm */
+{
+    unsigned int i;
+    uint64_t *in_ptr = (uint64_t *)in;
+    uint64_t *out_ptr = (uint64_t *)out;
+    /* Each pass rewrites 8 words of 'out' (WIDTH apart) and XORs in 'in'. */
+    for (i = 0; i < WIDTH; i++) {
+        uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+        uint64_t tmp0, tmp1;
+        /* inN are loaded from 'out' (not 'in'): the transform is in place. */
+        uint64_t in0 = out_ptr[0];
+        uint64_t in1 = out_ptr[WIDTH];
+        uint64_t in2 = out_ptr[WIDTH * 2];
+        uint64_t in3 = out_ptr[WIDTH * 3];
+        uint64_t in4 = out_ptr[WIDTH * 4];
+        uint64_t in5 = out_ptr[WIDTH * 5];
+        uint64_t in6 = out_ptr[WIDTH * 6];
+        uint64_t in7 = out_ptr[WIDTH * 7];
+        /* Fixed XOR network; order matters, later lines reuse earlier results. */
+        out3 = in4;
+        tmp0 = in0 ^ in4;
+        out2 = tmp0 ^ in2 ^ in6;
+        tmp1 = out2 ^ in7;
+        out1 = out2 ^ in1 ^ in5;
+        out7 = tmp1 ^ in3;
+        out5 = out1 ^ in6;
+        out6 = tmp0 ^ out1 ^ in3;
+        out0 = tmp1 ^ out6;
+        out4 = out0 ^ in5;
+        /* Store each result XORed with the same-position word of 'in'. */
+        out_ptr[0] = out0 ^ in_ptr[0];
+        out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH];
+        out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2];
+        out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3];
+        out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4];
+        out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5];
+        out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6];
+        out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7];
+        /* Step one word forward in both buffers. */
+        in_ptr++;
+        out_ptr++;
+    }
+}
+
+static void
+gf8_muladd_B8(void *out, void *in) /* NOTE(review): presumably out = out * 0xB8 ^ in in GF(2^8); confirm */
+{
+    unsigned int i;
+    uint64_t *in_ptr = (uint64_t *)in;
+    uint64_t *out_ptr = (uint64_t *)out;
+    /* Each pass rewrites 8 words of 'out' (WIDTH apart) and XORs in 'in'. */
+    for (i = 0; i < WIDTH; i++) {
+        uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+        uint64_t tmp0, tmp1, tmp2;
+        /* inN are loaded from 'out' (not 'in'): the transform is in place. */
+        uint64_t in0 = out_ptr[0];
+        uint64_t in1 = out_ptr[WIDTH];
+        uint64_t in2 = out_ptr[WIDTH * 2];
+        uint64_t in3 = out_ptr[WIDTH * 3];
+        uint64_t in4 = out_ptr[WIDTH * 4];
+        uint64_t in5 = out_ptr[WIDTH * 5];
+        uint64_t in6 = out_ptr[WIDTH * 6];
+        uint64_t in7 = out_ptr[WIDTH * 7];
+        /* Fixed XOR network; order matters, later lines reuse earlier results. */
+        tmp0 = in1 ^ in4;
+        tmp1 = in2 ^ in5;
+        out2 = tmp0 ^ in5;
+        out4 = tmp1 ^ in0;
+        tmp2 = tmp1 ^ in7;
+        out6 = tmp2 ^ out2;
+        out7 = out4 ^ in3;
+        out1 = tmp2 ^ in4;
+        out3 = tmp0 ^ out7;
+        out0 = out3 ^ out4 ^ in6;
+        out5 = out0 ^ in0 ^ in4;
+        /* Store each result XORed with the same-position word of 'in'. */
+        out_ptr[0] = out0 ^ in_ptr[0];
+        out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH];
+        out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2];
+        out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3];
+        out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4];
+        out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5];
+        out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6];
+        out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7];
+        /* Step one word forward in both buffers. */
+        in_ptr++;
+        out_ptr++;
+    }
+}
+
+static void
+gf8_muladd_B9(void *out, void *in) /* NOTE(review): presumably out = out * 0xB9 ^ in in GF(2^8); confirm */
+{
+    unsigned int i;
+    uint64_t *in_ptr = (uint64_t *)in;
+    uint64_t *out_ptr = (uint64_t *)out;
+    /* Each pass rewrites 8 words of 'out' (WIDTH apart) and XORs in 'in'. */
+    for (i = 0; i < WIDTH; i++) {
+        uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+        uint64_t tmp0, tmp1, tmp2;
+        /* inN are loaded from 'out' (not 'in'): the transform is in place. */
+        uint64_t in0 = out_ptr[0];
+        uint64_t in1 = out_ptr[WIDTH];
+        uint64_t in2 = out_ptr[WIDTH * 2];
+        uint64_t in3 = out_ptr[WIDTH * 3];
+        uint64_t in4 = out_ptr[WIDTH * 4];
+        uint64_t in5 = out_ptr[WIDTH * 5];
+        uint64_t in6 = out_ptr[WIDTH * 6];
+        uint64_t in7 = out_ptr[WIDTH * 7];
+        /* Fixed XOR network; order matters, later lines reuse earlier results. */
+        tmp0 = in0 ^ in2;
+        tmp1 = in4 ^ in5;
+        out4 = tmp0 ^ tmp1;
+        tmp2 = tmp0 ^ in3 ^ in7;
+        out3 = out4 ^ in1;
+        out7 = tmp2 ^ in5;
+        out2 = out3 ^ in0;
+        out1 = out2 ^ in7;
+        out6 = out1 ^ in5 ^ in6;
+        out0 = tmp2 ^ out6;
+        out5 = tmp1 ^ out0;
+        /* Store each result XORed with the same-position word of 'in'. */
+        out_ptr[0] = out0 ^ in_ptr[0];
+        out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH];
+        out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2];
+        out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3];
+        out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4];
+        out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5];
+        out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6];
+        out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7];
+        /* Step one word forward in both buffers. */
+        in_ptr++;
+        out_ptr++;
+    }
+}
+
+static void
+gf8_muladd_BA(void *out, void *in) /* NOTE(review): presumably out = out * 0xBA ^ in in GF(2^8); confirm */
+{
+    unsigned int i;
+    uint64_t *in_ptr = (uint64_t *)in;
+    uint64_t *out_ptr = (uint64_t *)out;
+    /* Each pass rewrites 8 words of 'out' (WIDTH apart) and XORs in 'in'. */
+    for (i = 0; i < WIDTH; i++) {
+        uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+        uint64_t tmp0, tmp1, tmp2;
+        /* inN are loaded from 'out' (not 'in'): the transform is in place. */
+        uint64_t in0 = out_ptr[0];
+        uint64_t in1 = out_ptr[WIDTH];
+        uint64_t in2 = out_ptr[WIDTH * 2];
+        uint64_t in3 = out_ptr[WIDTH * 3];
+        uint64_t in4 = out_ptr[WIDTH * 4];
+        uint64_t in5 = out_ptr[WIDTH * 5];
+        uint64_t in6 = out_ptr[WIDTH * 6];
+        uint64_t in7 = out_ptr[WIDTH * 7];
+        /* Fixed XOR network; order matters, later lines reuse earlier results. */
+        tmp0 = in5 ^ in7;
+        out2 = tmp0 ^ in4;
+        tmp1 = out2 ^ in2;
+        out1 = tmp1 ^ in0;
+        out6 = tmp1 ^ in1;
+        out4 = out1 ^ in3 ^ in4;
+        tmp2 = out4 ^ out6;
+        out7 = out4 ^ in6 ^ in7;
+        out5 = tmp2 ^ in6;
+        out3 = tmp0 ^ tmp2;
+        out0 = out6 ^ out7 ^ in0;
+        /* Store each result XORed with the same-position word of 'in'. */
+        out_ptr[0] = out0 ^ in_ptr[0];
+        out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH];
+        out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2];
+        out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3];
+        out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4];
+        out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5];
+        out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6];
+        out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7];
+        /* Step one word forward in both buffers. */
+        in_ptr++;
+        out_ptr++;
+    }
+}
+
+static void
+gf8_muladd_BB(void *out, void *in) /* NOTE(review): presumably out = out * 0xBB ^ in in GF(2^8); confirm */
+{
+    unsigned int i;
+    uint64_t *in_ptr = (uint64_t *)in;
+    uint64_t *out_ptr = (uint64_t *)out;
+    /* Each pass rewrites 8 words of 'out' (WIDTH apart) and XORs in 'in'. */
+    for (i = 0; i < WIDTH; i++) {
+        uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+        uint64_t tmp0, tmp1;
+        /* inN are loaded from 'out' (not 'in'): the transform is in place. */
+        uint64_t in0 = out_ptr[0];
+        uint64_t in1 = out_ptr[WIDTH];
+        uint64_t in2 = out_ptr[WIDTH * 2];
+        uint64_t in3 = out_ptr[WIDTH * 3];
+        uint64_t in4 = out_ptr[WIDTH * 4];
+        uint64_t in5 = out_ptr[WIDTH * 5];
+        uint64_t in6 = out_ptr[WIDTH * 6];
+        uint64_t in7 = out_ptr[WIDTH * 7];
+        /* Fixed XOR network; order matters, later lines reuse earlier results. */
+        out2 = in2 ^ in4 ^ in5 ^ in7;
+        tmp0 = out2 ^ in1;
+        out4 = out2 ^ in0 ^ in3;
+        out1 = tmp0 ^ in0;
+        out6 = tmp0 ^ in6;
+        out3 = out1 ^ in2;
+        tmp1 = out4 ^ out6 ^ in4;
+        out0 = tmp1 ^ in7;
+        out5 = tmp1 ^ in5;
+        out7 = tmp0 ^ tmp1;
+        /* Store each result XORed with the same-position word of 'in'. */
+        out_ptr[0] = out0 ^ in_ptr[0];
+        out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH];
+        out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2];
+        out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3];
+        out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4];
+        out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5];
+        out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6];
+        out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7];
+        /* Step one word forward in both buffers. */
+        in_ptr++;
+        out_ptr++;
+    }
+}
+
+static void
+gf8_muladd_BC(void *out, void *in) /* NOTE(review): presumably out = out * 0xBC ^ in in GF(2^8); confirm */
+{
+    unsigned int i;
+    uint64_t *in_ptr = (uint64_t *)in;
+    uint64_t *out_ptr = (uint64_t *)out;
+    /* Each pass rewrites 8 words of 'out' (WIDTH apart) and XORs in 'in'. */
+    for (i = 0; i < WIDTH; i++) {
+        uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+        uint64_t tmp0, tmp1, tmp2;
+        /* inN are loaded from 'out' (not 'in'): the transform is in place. */
+        uint64_t in0 = out_ptr[0];
+        uint64_t in1 = out_ptr[WIDTH];
+        uint64_t in2 = out_ptr[WIDTH * 2];
+        uint64_t in3 = out_ptr[WIDTH * 3];
+        uint64_t in4 = out_ptr[WIDTH * 4];
+        uint64_t in5 = out_ptr[WIDTH * 5];
+        uint64_t in6 = out_ptr[WIDTH * 6];
+        uint64_t in7 = out_ptr[WIDTH * 7];
+        /* Fixed XOR network; order matters, later lines reuse earlier results. */
+        tmp0 = in0 ^ in2;
+        tmp1 = in2 ^ in4;
+        out0 = in1 ^ in3 ^ in4;
+        out6 = in1 ^ in2 ^ in7;
+        out7 = tmp0 ^ in3;
+        out5 = tmp0 ^ out6 ^ in6;
+        out1 = tmp1 ^ in5;
+        tmp2 = out1 ^ out5 ^ in1;
+        out3 = tmp2 ^ in3;
+        out4 = tmp1 ^ tmp2;
+        out2 = tmp2 ^ out6;
+        /* Store each result XORed with the same-position word of 'in'. */
+        out_ptr[0] = out0 ^ in_ptr[0];
+        out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH];
+        out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2];
+        out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3];
+        out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4];
+        out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5];
+        out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6];
+        out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7];
+        /* Step one word forward in both buffers. */
+        in_ptr++;
+        out_ptr++;
+    }
+}
+
+static void
+gf8_muladd_BD(void *out, void *in) /* NOTE(review): presumably out = out * 0xBD ^ in in GF(2^8); confirm */
+{
+    unsigned int i;
+    uint64_t *in_ptr = (uint64_t *)in;
+    uint64_t *out_ptr = (uint64_t *)out;
+    /* Each pass rewrites 8 words of 'out' (WIDTH apart) and XORs in 'in'. */
+    for (i = 0; i < WIDTH; i++) {
+        uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+        uint64_t tmp0, tmp1, tmp2;
+        /* inN are loaded from 'out' (not 'in'): the transform is in place. */
+        uint64_t in0 = out_ptr[0];
+        uint64_t in1 = out_ptr[WIDTH];
+        uint64_t in2 = out_ptr[WIDTH * 2];
+        uint64_t in3 = out_ptr[WIDTH * 3];
+        uint64_t in4 = out_ptr[WIDTH * 4];
+        uint64_t in5 = out_ptr[WIDTH * 5];
+        uint64_t in6 = out_ptr[WIDTH * 6];
+        uint64_t in7 = out_ptr[WIDTH * 7];
+        /* Fixed XOR network; order matters, later lines reuse earlier results. */
+        tmp0 = in0 ^ in3;
+        tmp1 = in1 ^ in4;
+        out0 = tmp0 ^ tmp1;
+        out7 = tmp0 ^ in2 ^ in7;
+        out1 = tmp1 ^ in2 ^ in5;
+        tmp2 = out1 ^ in0;
+        out2 = tmp2 ^ in6;
+        out3 = out2 ^ in1 ^ in7;
+        out4 = out3 ^ in2;
+        out5 = tmp1 ^ out4;
+        out6 = tmp2 ^ out4;
+        /* Store each result XORed with the same-position word of 'in'. */
+        out_ptr[0] = out0 ^ in_ptr[0];
+        out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH];
+        out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2];
+        out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3];
+        out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4];
+        out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5];
+        out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6];
+        out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7];
+        /* Step one word forward in both buffers. */
+        in_ptr++;
+        out_ptr++;
+    }
+}
+
+static void
+gf8_muladd_BE(void *out, void *in) /* NOTE(review): presumably out = out * 0xBE ^ in in GF(2^8); confirm */
+{
+    unsigned int i;
+    uint64_t *in_ptr = (uint64_t *)in;
+    uint64_t *out_ptr = (uint64_t *)out;
+    /* Each pass rewrites 8 words of 'out' (WIDTH apart) and XORs in 'in'. */
+    for (i = 0; i < WIDTH; i++) {
+        uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+        uint64_t tmp0;
+        /* inN are loaded from 'out' (not 'in'): the transform is in place. */
+        uint64_t in0 = out_ptr[0];
+        uint64_t in1 = out_ptr[WIDTH];
+        uint64_t in2 = out_ptr[WIDTH * 2];
+        uint64_t in3 = out_ptr[WIDTH * 3];
+        uint64_t in4 = out_ptr[WIDTH * 4];
+        uint64_t in5 = out_ptr[WIDTH * 5];
+        uint64_t in6 = out_ptr[WIDTH * 6];
+        uint64_t in7 = out_ptr[WIDTH * 7];
+        /* Fixed XOR network; order matters, later lines reuse earlier results. */
+        tmp0 = in0 ^ in3 ^ in6;
+        out4 = tmp0 ^ in5;
+        out7 = tmp0 ^ in2;
+        out3 = out4 ^ in4;
+        out1 = out3 ^ out7 ^ in0;
+        out2 = out3 ^ in3 ^ in7;
+        out0 = out2 ^ out4 ^ in1;
+        out5 = tmp0 ^ out0;
+        out6 = out1 ^ out5 ^ in6;
+        /* Store each result XORed with the same-position word of 'in'. */
+        out_ptr[0] = out0 ^ in_ptr[0];
+        out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH];
+        out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2];
+        out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3];
+        out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4];
+        out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5];
+        out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6];
+        out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7];
+        /* Step one word forward in both buffers. */
+        in_ptr++;
+        out_ptr++;
+    }
+}
+
+static void
+gf8_muladd_BF(void *out, void *in) /* NOTE(review): presumably out = out * 0xBF ^ in in GF(2^8); confirm */
+{
+    unsigned int i;
+    uint64_t *in_ptr = (uint64_t *)in;
+    uint64_t *out_ptr = (uint64_t *)out;
+    /* Each pass rewrites 8 words of 'out' (WIDTH apart) and XORs in 'in'. */
+    for (i = 0; i < WIDTH; i++) {
+        uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+        uint64_t tmp0, tmp1, tmp2, tmp3;
+        /* inN are loaded from 'out' (not 'in'): the transform is in place. */
+        uint64_t in0 = out_ptr[0];
+        uint64_t in1 = out_ptr[WIDTH];
+        uint64_t in2 = out_ptr[WIDTH * 2];
+        uint64_t in3 = out_ptr[WIDTH * 3];
+        uint64_t in4 = out_ptr[WIDTH * 4];
+        uint64_t in5 = out_ptr[WIDTH * 5];
+        uint64_t in6 = out_ptr[WIDTH * 6];
+        uint64_t in7 = out_ptr[WIDTH * 7];
+        /* Fixed XOR network; order matters, later lines reuse earlier results. */
+        tmp0 = in0 ^ in4;
+        out3 = tmp0 ^ in5 ^ in6;
+        out4 = out3 ^ in3;
+        tmp1 = out3 ^ in7;
+        out2 = tmp1 ^ in2;
+        out5 = tmp1 ^ in1;
+        tmp2 = out2 ^ in5;
+        out7 = tmp2 ^ in3 ^ in4;
+        tmp3 = tmp0 ^ out5;
+        out0 = tmp3 ^ out4;
+        out1 = tmp2 ^ tmp3;
+        out6 = tmp3 ^ in2;
+        /* Store each result XORed with the same-position word of 'in'. */
+        out_ptr[0] = out0 ^ in_ptr[0];
+        out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH];
+        out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2];
+        out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3];
+        out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4];
+        out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5];
+        out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6];
+        out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7];
+        /* Step one word forward in both buffers. */
+        in_ptr++;
+        out_ptr++;
+    }
+}
+
+static void
+gf8_muladd_C0(void *out, void *in) /* NOTE(review): presumably out = out * 0xC0 ^ in in GF(2^8); confirm */
+{
+    unsigned int i;
+    uint64_t *in_ptr = (uint64_t *)in;
+    uint64_t *out_ptr = (uint64_t *)out;
+    /* Each pass rewrites 8 words of 'out' (WIDTH apart) and XORs in 'in'. */
+    for (i = 0; i < WIDTH; i++) {
+        uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+        uint64_t tmp0, tmp1;
+        /* inN are loaded from 'out' (not 'in'): the transform is in place. */
+        uint64_t in0 = out_ptr[0];
+        uint64_t in1 = out_ptr[WIDTH];
+        uint64_t in2 = out_ptr[WIDTH * 2];
+        uint64_t in3 = out_ptr[WIDTH * 3];
+        uint64_t in4 = out_ptr[WIDTH * 4];
+        uint64_t in5 = out_ptr[WIDTH * 5];
+        uint64_t in6 = out_ptr[WIDTH * 6];
+        uint64_t in7 = out_ptr[WIDTH * 7];
+        /* Fixed XOR network; order matters, later lines reuse earlier results. */
+        out5 = in2 ^ in5;
+        tmp0 = in1 ^ in4;
+        tmp1 = in3 ^ in6;
+        out0 = out5 ^ in1;
+        out4 = tmp0 ^ in7;
+        out3 = tmp0 ^ tmp1;
+        out1 = tmp1 ^ in2;
+        out6 = tmp1 ^ in0;
+        out7 = out4 ^ in0;
+        out2 = out4 ^ out5 ^ in3;
+        /* Store each result XORed with the same-position word of 'in'. */
+        out_ptr[0] = out0 ^ in_ptr[0];
+        out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH];
+        out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2];
+        out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3];
+        out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4];
+        out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5];
+        out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6];
+        out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7];
+        /* Step one word forward in both buffers. */
+        in_ptr++;
+        out_ptr++;
+    }
+}
+
+static void
+gf8_muladd_C1(void *out, void *in) /* NOTE(review): presumably out = out * 0xC1 ^ in in GF(2^8); confirm */
+{
+    unsigned int i;
+    uint64_t *in_ptr = (uint64_t *)in;
+    uint64_t *out_ptr = (uint64_t *)out;
+    /* Each pass rewrites 8 words of 'out' (WIDTH apart) and XORs in 'in'. */
+    for (i = 0; i < WIDTH; i++) {
+        uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+        uint64_t tmp0, tmp1;
+        /* inN are loaded from 'out' (not 'in'): the transform is in place. */
+        uint64_t in0 = out_ptr[0];
+        uint64_t in1 = out_ptr[WIDTH];
+        uint64_t in2 = out_ptr[WIDTH * 2];
+        uint64_t in3 = out_ptr[WIDTH * 3];
+        uint64_t in4 = out_ptr[WIDTH * 4];
+        uint64_t in5 = out_ptr[WIDTH * 5];
+        uint64_t in6 = out_ptr[WIDTH * 6];
+        uint64_t in7 = out_ptr[WIDTH * 7];
+        /* Fixed XOR network; order matters, later lines reuse earlier results. */
+        out5 = in2;
+        tmp0 = in0 ^ in1;
+        out4 = in1 ^ in7;
+        out6 = in0 ^ in3;
+        out3 = in1 ^ in4 ^ in6;
+        tmp1 = tmp0 ^ in2;
+        out7 = tmp0 ^ in4;
+        out0 = tmp1 ^ in5;
+        out1 = tmp1 ^ out6 ^ in6;
+        out2 = out6 ^ out7 ^ in5 ^ in7;
+        /* Store each result XORed with the same-position word of 'in'. */
+        out_ptr[0] = out0 ^ in_ptr[0];
+        out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH];
+        out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2];
+        out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3];
+        out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4];
+        out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5];
+        out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6];
+        out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7];
+        /* Step one word forward in both buffers. */
+        in_ptr++;
+        out_ptr++;
+    }
+}
+
+static void
+gf8_muladd_C2(void *out, void *in) /* NOTE(review): presumably out = out * 0xC2 ^ in in GF(2^8); confirm */
+{
+    unsigned int i;
+    uint64_t *in_ptr = (uint64_t *)in;
+    uint64_t *out_ptr = (uint64_t *)out;
+    /* Each pass rewrites 8 words of 'out' (WIDTH apart) and XORs in 'in'. */
+    for (i = 0; i < WIDTH; i++) {
+        uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+        uint64_t tmp0, tmp1;
+        /* inN are loaded from 'out' (not 'in'): the transform is in place. */
+        uint64_t in0 = out_ptr[0];
+        uint64_t in1 = out_ptr[WIDTH];
+        uint64_t in2 = out_ptr[WIDTH * 2];
+        uint64_t in3 = out_ptr[WIDTH * 3];
+        uint64_t in4 = out_ptr[WIDTH * 4];
+        uint64_t in5 = out_ptr[WIDTH * 5];
+        uint64_t in6 = out_ptr[WIDTH * 6];
+        uint64_t in7 = out_ptr[WIDTH * 7];
+        /* Fixed XOR network; order matters, later lines reuse earlier results. */
+        out4 = in1 ^ in3 ^ in4;
+        tmp0 = in0 ^ in3 ^ in6;
+        out5 = in2 ^ in4 ^ in5;
+        tmp1 = out4 ^ in7;
+        out1 = tmp0 ^ in2;
+        out6 = tmp0 ^ in5;
+        out2 = out5 ^ in3;
+        out7 = tmp0 ^ tmp1;
+        out3 = tmp1 ^ in2 ^ in6;
+        out0 = tmp1 ^ out2;
+        /* Store each result XORed with the same-position word of 'in'. */
+        out_ptr[0] = out0 ^ in_ptr[0];
+        out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH];
+        out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2];
+        out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3];
+        out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4];
+        out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5];
+        out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6];
+        out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7];
+        /* Step one word forward in both buffers. */
+        in_ptr++;
+        out_ptr++;
+    }
+}
+
+static void
+gf8_muladd_C3(void *out, void *in) /* NOTE(review): presumably out = out * 0xC3 ^ in in GF(2^8); confirm */
+{
+    unsigned int i;
+    uint64_t *in_ptr = (uint64_t *)in;
+    uint64_t *out_ptr = (uint64_t *)out;
+    /* Each pass rewrites 8 words of 'out' (WIDTH apart) and XORs in 'in'. */
+    for (i = 0; i < WIDTH; i++) {
+        uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+        uint64_t tmp0, tmp1, tmp2;
+        /* inN are loaded from 'out' (not 'in'): the transform is in place. */
+        uint64_t in0 = out_ptr[0];
+        uint64_t in1 = out_ptr[WIDTH];
+        uint64_t in2 = out_ptr[WIDTH * 2];
+        uint64_t in3 = out_ptr[WIDTH * 3];
+        uint64_t in4 = out_ptr[WIDTH * 4];
+        uint64_t in5 = out_ptr[WIDTH * 5];
+        uint64_t in6 = out_ptr[WIDTH * 6];
+        uint64_t in7 = out_ptr[WIDTH * 7];
+        /* Fixed XOR network; order matters, later lines reuse earlier results. */
+        out4 = in1 ^ in3;
+        tmp0 = in0 ^ in2;
+        tmp1 = in3 ^ in5;
+        out5 = in2 ^ in4;
+        tmp2 = tmp0 ^ out4;
+        out2 = tmp1 ^ in4;
+        out6 = tmp1 ^ in0;
+        out0 = tmp1 ^ tmp2 ^ in7;
+        out1 = tmp2 ^ in6;
+        out7 = out1 ^ out5 ^ in3;
+        out3 = tmp0 ^ out7 ^ in7;
+        /* Store each result XORed with the same-position word of 'in'. */
+        out_ptr[0] = out0 ^ in_ptr[0];
+        out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH];
+        out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2];
+        out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3];
+        out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4];
+        out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5];
+        out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6];
+        out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7];
+        /* Step one word forward in both buffers. */
+        in_ptr++;
+        out_ptr++;
+    }
+}
+
+static void
+gf8_muladd_C4(void *out, void *in) /* NOTE(review): presumably out = out * 0xC4 ^ in in GF(2^8); confirm */
+{
+    unsigned int i;
+    uint64_t *in_ptr = (uint64_t *)in;
+    uint64_t *out_ptr = (uint64_t *)out;
+    /* Each pass rewrites 8 words of 'out' (WIDTH apart) and XORs in 'in'. */
+    for (i = 0; i < WIDTH; i++) {
+        uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+        uint64_t tmp0, tmp1;
+        /* inN are loaded from 'out' (not 'in'): the transform is in place. */
+        uint64_t in0 = out_ptr[0];
+        uint64_t in1 = out_ptr[WIDTH];
+        uint64_t in2 = out_ptr[WIDTH * 2];
+        uint64_t in3 = out_ptr[WIDTH * 3];
+        uint64_t in4 = out_ptr[WIDTH * 4];
+        uint64_t in5 = out_ptr[WIDTH * 5];
+        uint64_t in6 = out_ptr[WIDTH * 6];
+        uint64_t in7 = out_ptr[WIDTH * 7];
+        /* Fixed XOR network; order matters, later lines reuse earlier results. */
+        tmp0 = in3 ^ in7;
+        out3 = tmp0 ^ in4;
+        tmp1 = tmp0 ^ in2;
+        out1 = tmp1 ^ in6;
+        out5 = tmp1 ^ in5;
+        out4 = out1 ^ out3 ^ in1;
+        out0 = out4 ^ in4 ^ in5;
+        out2 = out0 ^ out3 ^ in0;
+        out7 = out1 ^ out2 ^ in7;
+        out6 = tmp1 ^ out0 ^ out7;
+        /* Store each result XORed with the same-position word of 'in'. */
+        out_ptr[0] = out0 ^ in_ptr[0];
+        out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH];
+        out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2];
+        out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3];
+        out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4];
+        out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5];
+        out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6];
+        out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7];
+        /* Step one word forward in both buffers. */
+        in_ptr++;
+        out_ptr++;
+    }
+}
+
+static void
+gf8_muladd_C5(void *out, void *in) /* NOTE(review): presumably out = out * 0xC5 ^ in in GF(2^8); confirm */
+{
+    unsigned int i;
+    uint64_t *in_ptr = (uint64_t *)in;
+    uint64_t *out_ptr = (uint64_t *)out;
+    /* Each pass rewrites 8 words of 'out' (WIDTH apart) and XORs in 'in'. */
+    for (i = 0; i < WIDTH; i++) {
+        uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+        uint64_t tmp0;
+        /* inN are loaded from 'out' (not 'in'): the transform is in place. */
+        uint64_t in0 = out_ptr[0];
+        uint64_t in1 = out_ptr[WIDTH];
+        uint64_t in2 = out_ptr[WIDTH * 2];
+        uint64_t in3 = out_ptr[WIDTH * 3];
+        uint64_t in4 = out_ptr[WIDTH * 4];
+        uint64_t in5 = out_ptr[WIDTH * 5];
+        uint64_t in6 = out_ptr[WIDTH * 6];
+        uint64_t in7 = out_ptr[WIDTH * 7];
+        /* Fixed XOR network; order matters, later lines reuse earlier results. */
+        out3 = in4 ^ in7;
+        tmp0 = in3 ^ in7;
+        out4 = in1 ^ in2 ^ in6;
+        out6 = in0 ^ in3 ^ in4;
+        out5 = tmp0 ^ in2;
+        out1 = tmp0 ^ out4;
+        out0 = out4 ^ in0 ^ in5;
+        out2 = out0 ^ out5 ^ in4;
+        out7 = tmp0 ^ out2 ^ in6;
+        /* Store each result XORed with the same-position word of 'in'. */
+        out_ptr[0] = out0 ^ in_ptr[0];
+        out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH];
+        out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2];
+        out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3];
+        out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4];
+        out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5];
+        out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6];
+        out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7];
+        /* Step one word forward in both buffers. */
+        in_ptr++;
+        out_ptr++;
+    }
+}
+
+static void
+gf8_muladd_C6(void *out, void *in) /* NOTE(review): presumably out = out * 0xC6 ^ in in GF(2^8); confirm */
+{
+    unsigned int i;
+    uint64_t *in_ptr = (uint64_t *)in;
+    uint64_t *out_ptr = (uint64_t *)out;
+    /* Each pass rewrites 8 words of 'out' (WIDTH apart) and XORs in 'in'. */
+    for (i = 0; i < WIDTH; i++) {
+        uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+        uint64_t tmp0, tmp1, tmp2, tmp3, tmp4, tmp5;
+        /* inN are loaded from 'out' (not 'in'): the transform is in place. */
+        uint64_t in0 = out_ptr[0];
+        uint64_t in1 = out_ptr[WIDTH];
+        uint64_t in2 = out_ptr[WIDTH * 2];
+        uint64_t in3 = out_ptr[WIDTH * 3];
+        uint64_t in4 = out_ptr[WIDTH * 4];
+        uint64_t in5 = out_ptr[WIDTH * 5];
+        uint64_t in6 = out_ptr[WIDTH * 6];
+        uint64_t in7 = out_ptr[WIDTH * 7];
+        /* Fixed XOR network; order matters, later lines reuse earlier results. */
+        tmp0 = in5 ^ in6;
+        tmp1 = in1 ^ in7;
+        tmp2 = tmp0 ^ in0;
+        tmp3 = tmp0 ^ tmp1;
+        tmp4 = tmp2 ^ in4;
+        out0 = tmp3 ^ in2;
+        out6 = tmp4 ^ in3;
+        out2 = out6 ^ in2;
+        out7 = tmp1 ^ tmp4;
+        out3 = tmp2 ^ out2;
+        tmp5 = out3 ^ in5;
+        out5 = tmp5 ^ in7;
+        out4 = tmp3 ^ tmp5;
+        out1 = tmp4 ^ out5;
+        /* Store each result XORed with the same-position word of 'in'. */
+        out_ptr[0] = out0 ^ in_ptr[0];
+        out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH];
+        out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2];
+        out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3];
+        out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4];
+        out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5];
+        out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6];
+        out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7];
+        /* Step one word forward in both buffers. */
+        in_ptr++;
+        out_ptr++;
+    }
+}
+
+static void
+gf8_muladd_C7(void *out, void *in) /* NOTE(review): presumably out = out * 0xC7 ^ in in GF(2^8); confirm */
+{
+    unsigned int i;
+    uint64_t *in_ptr = (uint64_t *)in;
+    uint64_t *out_ptr = (uint64_t *)out;
+    /* Each pass rewrites 8 words of 'out' (WIDTH apart) and XORs in 'in'. */
+    for (i = 0; i < WIDTH; i++) {
+        uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+        uint64_t tmp0, tmp1;
+        /* inN are loaded from 'out' (not 'in'): the transform is in place. */
+        uint64_t in0 = out_ptr[0];
+        uint64_t in1 = out_ptr[WIDTH];
+        uint64_t in2 = out_ptr[WIDTH * 2];
+        uint64_t in3 = out_ptr[WIDTH * 3];
+        uint64_t in4 = out_ptr[WIDTH * 4];
+        uint64_t in5 = out_ptr[WIDTH * 5];
+        uint64_t in6 = out_ptr[WIDTH * 6];
+        uint64_t in7 = out_ptr[WIDTH * 7];
+        /* Fixed XOR network; order matters, later lines reuse earlier results. */
+        out3 = in2 ^ in4;
+        tmp0 = in3 ^ in5;
+        tmp1 = out3 ^ in7;
+        out6 = tmp0 ^ in0 ^ in4;
+        out5 = tmp1 ^ in3;
+        out2 = out6 ^ in6;
+        out7 = out2 ^ in1 ^ in3;
+        out0 = tmp1 ^ out7;
+        out1 = tmp0 ^ out0;
+        out4 = out1 ^ in0;
+        /* Store each result XORed with the same-position word of 'in'. */
+        out_ptr[0] = out0 ^ in_ptr[0];
+        out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH];
+        out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2];
+        out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3];
+        out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4];
+        out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5];
+        out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6];
+        out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7];
+        /* Step one word forward in both buffers. */
+        in_ptr++;
+        out_ptr++;
+    }
+}
+
+static void
+gf8_muladd_C8(void *out, void *in) /* NOTE(review): presumably out = out * 0xC8 ^ in in GF(2^8); confirm */
+{
+    unsigned int i;
+    uint64_t *in_ptr = (uint64_t *)in;
+    uint64_t *out_ptr = (uint64_t *)out;
+    /* Each pass rewrites 8 words of 'out' (WIDTH apart) and XORs in 'in'. */
+    for (i = 0; i < WIDTH; i++) {
+        uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+        uint64_t tmp0, tmp1;
+        /* inN are loaded from 'out' (not 'in'): the transform is in place. */
+        uint64_t in0 = out_ptr[0];
+        uint64_t in1 = out_ptr[WIDTH];
+        uint64_t in2 = out_ptr[WIDTH * 2];
+        uint64_t in3 = out_ptr[WIDTH * 3];
+        uint64_t in4 = out_ptr[WIDTH * 4];
+        uint64_t in5 = out_ptr[WIDTH * 5];
+        uint64_t in6 = out_ptr[WIDTH * 6];
+        uint64_t in7 = out_ptr[WIDTH * 7];
+        /* Fixed XOR network; order matters, later lines reuse earlier results. */
+        out0 = in1 ^ in2;
+        out1 = in2 ^ in3;
+        tmp0 = in5 ^ in6;
+        tmp1 = in0 ^ in7;
+        out2 = out1 ^ in1 ^ in4;
+        out4 = tmp0 ^ in4;
+        out5 = tmp0 ^ in7;
+        out6 = tmp1 ^ in6;
+        out7 = tmp1 ^ in1;
+        out3 = out2 ^ in0 ^ in2 ^ in5;
+        /* Store each result XORed with the same-position word of 'in'. */
+        out_ptr[0] = out0 ^ in_ptr[0];
+        out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH];
+        out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2];
+        out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3];
+        out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4];
+        out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5];
+        out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6];
+        out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7];
+        /* Step one word forward in both buffers. */
+        in_ptr++;
+        out_ptr++;
+    }
+}
+
+static void
+gf8_muladd_C9(void *out, void *in) /* NOTE(review): presumably out = out * 0xC9 ^ in in GF(2^8); confirm */
+{
+    unsigned int i;
+    uint64_t *in_ptr = (uint64_t *)in;
+    uint64_t *out_ptr = (uint64_t *)out;
+    /* Each pass rewrites 8 words of 'out' (WIDTH apart) and XORs in 'in'. */
+    for (i = 0; i < WIDTH; i++) {
+        uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+        uint64_t tmp0;
+        /* inN are loaded from 'out' (not 'in'): the transform is in place. */
+        uint64_t in0 = out_ptr[0];
+        uint64_t in1 = out_ptr[WIDTH];
+        uint64_t in2 = out_ptr[WIDTH * 2];
+        uint64_t in3 = out_ptr[WIDTH * 3];
+        uint64_t in4 = out_ptr[WIDTH * 4];
+        uint64_t in5 = out_ptr[WIDTH * 5];
+        uint64_t in6 = out_ptr[WIDTH * 6];
+        uint64_t in7 = out_ptr[WIDTH * 7];
+        /* Fixed XOR network; order matters, later lines reuse earlier results. */
+        out4 = in5 ^ in6;
+        out7 = in0 ^ in1;
+        tmp0 = in1 ^ in3;
+        out5 = in6 ^ in7;
+        out6 = in0 ^ in7;
+        out0 = out7 ^ in2;
+        out3 = out7 ^ in4 ^ in5;
+        out1 = tmp0 ^ in2;
+        out2 = tmp0 ^ in4;
+        /* Store each result XORed with the same-position word of 'in'. */
+        out_ptr[0] = out0 ^ in_ptr[0];
+        out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH];
+        out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2];
+        out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3];
+        out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4];
+        out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5];
+        out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6];
+        out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7];
+        /* Step one word forward in both buffers. */
+        in_ptr++;
+        out_ptr++;
+    }
+}
+
+static void
+gf8_muladd_CA(void *out, void *in) /* NOTE(review): presumably out = out * 0xCA ^ in in GF(2^8); confirm */
+{
+    unsigned int i;
+    uint64_t *in_ptr = (uint64_t *)in;
+    uint64_t *out_ptr = (uint64_t *)out;
+    /* Each pass rewrites 8 words of 'out' (WIDTH apart) and XORs in 'in'. */
+    for (i = 0; i < WIDTH; i++) {
+        uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+        uint64_t tmp0, tmp1, tmp2, tmp3;
+        /* inN are loaded from 'out' (not 'in'): the transform is in place. */
+        uint64_t in0 = out_ptr[0];
+        uint64_t in1 = out_ptr[WIDTH];
+        uint64_t in2 = out_ptr[WIDTH * 2];
+        uint64_t in3 = out_ptr[WIDTH * 3];
+        uint64_t in4 = out_ptr[WIDTH * 4];
+        uint64_t in5 = out_ptr[WIDTH * 5];
+        uint64_t in6 = out_ptr[WIDTH * 6];
+        uint64_t in7 = out_ptr[WIDTH * 7];
+        /* Fixed XOR network; order matters, later lines reuse earlier results. */
+        tmp0 = in0 ^ in7;
+        tmp1 = in2 ^ in7;
+        tmp2 = tmp0 ^ in6;
+        out0 = tmp1 ^ in1;
+        tmp3 = tmp1 ^ in3;
+        out6 = tmp2 ^ in5;
+        out7 = tmp2 ^ in1;
+        out2 = tmp3 ^ in4;
+        out5 = out6 ^ in0 ^ in4;
+        out4 = out5 ^ in3;
+        out1 = tmp0 ^ tmp3;
+        out3 = tmp3 ^ out5 ^ out7;
+        /* Store each result XORed with the same-position word of 'in'. */
+        out_ptr[0] = out0 ^ in_ptr[0];
+        out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH];
+        out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2];
+        out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3];
+        out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4];
+        out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5];
+        out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6];
+        out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7];
+        /* Step one word forward in both buffers. */
+        in_ptr++;
+        out_ptr++;
+    }
+}
+
+static void
+gf8_muladd_CB(void *out, void *in) /* NOTE(review): presumably out = out * 0xCB ^ in in GF(2^8); confirm */
+{
+    unsigned int i;
+    uint64_t *in_ptr = (uint64_t *)in;
+    uint64_t *out_ptr = (uint64_t *)out;
+    /* Each pass rewrites 8 words of 'out' (WIDTH apart) and XORs in 'in'. */
+    for (i = 0; i < WIDTH; i++) {
+        uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+        uint64_t tmp0, tmp1, tmp2;
+        /* inN are loaded from 'out' (not 'in'): the transform is in place. */
+        uint64_t in0 = out_ptr[0];
+        uint64_t in1 = out_ptr[WIDTH];
+        uint64_t in2 = out_ptr[WIDTH * 2];
+        uint64_t in3 = out_ptr[WIDTH * 3];
+        uint64_t in4 = out_ptr[WIDTH * 4];
+        uint64_t in5 = out_ptr[WIDTH * 5];
+        uint64_t in6 = out_ptr[WIDTH * 6];
+        uint64_t in7 = out_ptr[WIDTH * 7];
+        /* Fixed XOR network; order matters, later lines reuse earlier results. */
+        tmp0 = in4 ^ in7;
+        tmp1 = in5 ^ in7;
+        out7 = in0 ^ in1 ^ in6;
+        out5 = tmp0 ^ in6;
+        out2 = tmp0 ^ in3;
+        out6 = tmp1 ^ in0;
+        out4 = tmp1 ^ in3 ^ in6;
+        tmp2 = out5 ^ out7 ^ in2;
+        out1 = tmp2 ^ out2;
+        out0 = tmp2 ^ in4;
+        out3 = tmp2 ^ in5;
+        /* Store each result XORed with the same-position word of 'in'. */
+        out_ptr[0] = out0 ^ in_ptr[0];
+        out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH];
+        out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2];
+        out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3];
+        out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4];
+        out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5];
+        out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6];
+        out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7];
+        /* Step one word forward in both buffers. */
+        in_ptr++;
+        out_ptr++;
+    }
+}
+
+static void
+gf8_muladd_CC(void *out, void *in) /* NOTE(review): presumably out = out * 0xCC ^ in in GF(2^8); confirm */
+{
+    unsigned int i;
+    uint64_t *in_ptr = (uint64_t *)in;
+    uint64_t *out_ptr = (uint64_t *)out;
+    /* Each pass rewrites 8 words of 'out' (WIDTH apart) and XORs in 'in'. */
+    for (i = 0; i < WIDTH; i++) {
+        uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+        uint64_t tmp0, tmp1, tmp2, tmp3;
+        /* inN are loaded from 'out' (not 'in'): the transform is in place. */
+        uint64_t in0 = out_ptr[0];
+        uint64_t in1 = out_ptr[WIDTH];
+        uint64_t in2 = out_ptr[WIDTH * 2];
+        uint64_t in3 = out_ptr[WIDTH * 3];
+        uint64_t in4 = out_ptr[WIDTH * 4];
+        uint64_t in5 = out_ptr[WIDTH * 5];
+        uint64_t in6 = out_ptr[WIDTH * 6];
+        uint64_t in7 = out_ptr[WIDTH * 7];
+        /* Fixed XOR network; order matters, later lines reuse earlier results. */
+        tmp0 = in3 ^ in5;
+        tmp1 = in1 ^ in6;
+        out1 = in2 ^ in3 ^ in7;
+        out5 = tmp0 ^ in6;
+        out0 = tmp1 ^ in2;
+        tmp2 = out5 ^ in0 ^ in7;
+        out3 = tmp2 ^ in4;
+        out6 = tmp0 ^ out3;
+        out7 = tmp1 ^ tmp2 ^ in3;
+        tmp3 = out1 ^ out6;
+        out4 = tmp2 ^ tmp3;
+        out2 = tmp3 ^ in1;
+        /* Store each result XORed with the same-position word of 'in'. */
+        out_ptr[0] = out0 ^ in_ptr[0];
+        out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH];
+        out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2];
+        out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3];
+        out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4];
+        out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5];
+        out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6];
+        out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7];
+        /* Step one word forward in both buffers. */
+        in_ptr++;
+        out_ptr++;
+    }
+}
+
+static void
+gf8_muladd_CD(void *out, void *in)
+{
+    unsigned int i; /* index of the 64-bit word column being processed */
+    uint64_t *in_ptr = (uint64_t *)in;   /* only read in this function */
+    uint64_t *out_ptr = (uint64_t *)out; /* read, then overwritten in place */
+    /* Presumably out = 0xCD * out + in over GF(2^8), per the name; confirm. */
+    for (i = 0; i < WIDTH; i++) { /* covers 8 * WIDTH words of each buffer */
+        uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+        uint64_t tmp0, tmp1;
+        /* Inputs are taken from out, one word from each WIDTH-word row. */
+        uint64_t in0 = out_ptr[0];
+        uint64_t in1 = out_ptr[WIDTH];
+        uint64_t in2 = out_ptr[WIDTH * 2];
+        uint64_t in3 = out_ptr[WIDTH * 3];
+        uint64_t in4 = out_ptr[WIDTH * 4];
+        uint64_t in5 = out_ptr[WIDTH * 5];
+        uint64_t in6 = out_ptr[WIDTH * 6];
+        uint64_t in7 = out_ptr[WIDTH * 7];
+        /* XOR network; later lines reuse earlier results, so order matters. */
+        out5 = in3 ^ in6;
+        tmp0 = in0 ^ in1;
+        tmp1 = in2 ^ in7;
+        out6 = in0 ^ in4 ^ in7;
+        out2 = tmp0 ^ out5 ^ in4;
+        out7 = tmp0 ^ in5;
+        out0 = tmp0 ^ in2 ^ in6;
+        out4 = tmp1 ^ in5;
+        out1 = tmp1 ^ in1 ^ in3;
+        out3 = out6 ^ in5 ^ in6;
+        /* XOR each result with the matching word of in and store to out. */
+        out_ptr[0] = out0 ^ in_ptr[0];
+        out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH];
+        out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2];
+        out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3];
+        out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4];
+        out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5];
+        out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6];
+        out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7];
+        /* Advance both buffers by one word to the next column. */
+        in_ptr++;
+        out_ptr++;
+    }
+}
+
+static void
+gf8_muladd_CE(void *out, void *in)
+{
+    unsigned int i; /* index of the 64-bit word column being processed */
+    uint64_t *in_ptr = (uint64_t *)in;   /* only read in this function */
+    uint64_t *out_ptr = (uint64_t *)out; /* read, then overwritten in place */
+    /* Presumably out = 0xCE * out + in over GF(2^8), per the name; confirm. */
+    for (i = 0; i < WIDTH; i++) { /* covers 8 * WIDTH words of each buffer */
+        uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+        uint64_t tmp0, tmp1, tmp2;
+        /* Inputs are taken from out, one word from each WIDTH-word row. */
+        uint64_t in0 = out_ptr[0];
+        uint64_t in1 = out_ptr[WIDTH];
+        uint64_t in2 = out_ptr[WIDTH * 2];
+        uint64_t in3 = out_ptr[WIDTH * 3];
+        uint64_t in4 = out_ptr[WIDTH * 4];
+        uint64_t in5 = out_ptr[WIDTH * 5];
+        uint64_t in6 = out_ptr[WIDTH * 6];
+        uint64_t in7 = out_ptr[WIDTH * 7];
+        /* XOR network; later lines reuse earlier results, so order matters. */
+        tmp0 = in2 ^ in5;
+        tmp1 = tmp0 ^ in3;
+        out4 = tmp1 ^ in4;
+        tmp2 = out4 ^ in6;
+        out3 = tmp2 ^ in0;
+        out5 = tmp2 ^ in2;
+        out2 = out3 ^ in5 ^ in7;
+        out6 = tmp1 ^ out2;
+        out7 = out2 ^ out4 ^ in1;
+        out1 = tmp2 ^ out6;
+        out0 = tmp0 ^ out7 ^ in0;
+        /* XOR each result with the matching word of in and store to out. */
+        out_ptr[0] = out0 ^ in_ptr[0];
+        out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH];
+        out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2];
+        out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3];
+        out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4];
+        out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5];
+        out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6];
+        out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7];
+        /* Advance both buffers by one word to the next column. */
+        in_ptr++;
+        out_ptr++;
+    }
+}
+
+static void
+gf8_muladd_CF(void *out, void *in)
+{
+    unsigned int i; /* index of the 64-bit word column being processed */
+    uint64_t *in_ptr = (uint64_t *)in;   /* only read in this function */
+    uint64_t *out_ptr = (uint64_t *)out; /* read, then overwritten in place */
+    /* Presumably out = 0xCF * out + in over GF(2^8), per the name; confirm. */
+    for (i = 0; i < WIDTH; i++) { /* covers 8 * WIDTH words of each buffer */
+        uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+        uint64_t tmp0, tmp1, tmp2;
+        /* Inputs are taken from out, one word from each WIDTH-word row. */
+        uint64_t in0 = out_ptr[0];
+        uint64_t in1 = out_ptr[WIDTH];
+        uint64_t in2 = out_ptr[WIDTH * 2];
+        uint64_t in3 = out_ptr[WIDTH * 3];
+        uint64_t in4 = out_ptr[WIDTH * 4];
+        uint64_t in5 = out_ptr[WIDTH * 5];
+        uint64_t in6 = out_ptr[WIDTH * 6];
+        uint64_t in7 = out_ptr[WIDTH * 7];
+        /* XOR network; later lines reuse earlier results, so order matters. */
+        tmp0 = in3 ^ in6;
+        tmp1 = in0 ^ in1 ^ in5;
+        out4 = in2 ^ in3 ^ in5;
+        out5 = tmp0 ^ in4;
+        out7 = tmp1 ^ in6;
+        out1 = tmp1 ^ out4 ^ in7;
+        tmp2 = out5 ^ in0;
+        out2 = tmp2 ^ in7;
+        out3 = tmp2 ^ out4;
+        out6 = tmp0 ^ out2 ^ in5;
+        out0 = tmp0 ^ out1;
+        /* XOR each result with the matching word of in and store to out. */
+        out_ptr[0] = out0 ^ in_ptr[0];
+        out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH];
+        out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2];
+        out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3];
+        out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4];
+        out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5];
+        out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6];
+        out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7];
+        /* Advance both buffers by one word to the next column. */
+        in_ptr++;
+        out_ptr++;
+    }
+}
+
+static void
+gf8_muladd_D0(void *out, void *in)
+{
+    unsigned int i; /* index of the 64-bit word column being processed */
+    uint64_t *in_ptr = (uint64_t *)in;   /* only read in this function */
+    uint64_t *out_ptr = (uint64_t *)out; /* read, then overwritten in place */
+    /* Presumably out = 0xD0 * out + in over GF(2^8), per the name; confirm. */
+    for (i = 0; i < WIDTH; i++) { /* covers 8 * WIDTH words of each buffer */
+        uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+        uint64_t tmp0, tmp1, tmp2, tmp3, tmp4;
+        /* Inputs are taken from out, one word from each WIDTH-word row. */
+        uint64_t in0 = out_ptr[0];
+        uint64_t in1 = out_ptr[WIDTH];
+        uint64_t in2 = out_ptr[WIDTH * 2];
+        uint64_t in3 = out_ptr[WIDTH * 3];
+        uint64_t in4 = out_ptr[WIDTH * 4];
+        uint64_t in5 = out_ptr[WIDTH * 5];
+        uint64_t in6 = out_ptr[WIDTH * 6];
+        uint64_t in7 = out_ptr[WIDTH * 7];
+        /* XOR network; later lines reuse earlier results, so order matters. */
+        tmp0 = in0 ^ in3;
+        tmp1 = in1 ^ in4;
+        tmp2 = in2 ^ in5;
+        out7 = tmp0 ^ tmp1;
+        out0 = tmp1 ^ tmp2;
+        tmp3 = tmp2 ^ in3;
+        out1 = tmp3 ^ in6;
+        tmp4 = out1 ^ in1;
+        out2 = tmp4 ^ in7;
+        out3 = out2 ^ in2;
+        out4 = tmp0 ^ out3;
+        out5 = tmp3 ^ out3;
+        out6 = tmp4 ^ out4;
+        /* XOR each result with the matching word of in and store to out. */
+        out_ptr[0] = out0 ^ in_ptr[0];
+        out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH];
+        out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2];
+        out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3];
+        out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4];
+        out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5];
+        out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6];
+        out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7];
+        /* Advance both buffers by one word to the next column. */
+        in_ptr++;
+        out_ptr++;
+    }
+}
+
+static void
+gf8_muladd_D1(void *out, void *in)
+{
+    unsigned int i; /* index of the 64-bit word column being processed */
+    uint64_t *in_ptr = (uint64_t *)in;   /* only read in this function */
+    uint64_t *out_ptr = (uint64_t *)out; /* read, then overwritten in place */
+    /* Presumably out = 0xD1 * out + in over GF(2^8), per the name; confirm. */
+    for (i = 0; i < WIDTH; i++) { /* covers 8 * WIDTH words of each buffer */
+        uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+        uint64_t tmp0, tmp1, tmp2;
+        /* Inputs are taken from out, one word from each WIDTH-word row. */
+        uint64_t in0 = out_ptr[0];
+        uint64_t in1 = out_ptr[WIDTH];
+        uint64_t in2 = out_ptr[WIDTH * 2];
+        uint64_t in3 = out_ptr[WIDTH * 3];
+        uint64_t in4 = out_ptr[WIDTH * 4];
+        uint64_t in5 = out_ptr[WIDTH * 5];
+        uint64_t in6 = out_ptr[WIDTH * 6];
+        uint64_t in7 = out_ptr[WIDTH * 7];
+        /* XOR network; later lines reuse earlier results, so order matters. */
+        tmp0 = in3 ^ in5 ^ in6;
+        tmp1 = tmp0 ^ in1;
+        out1 = tmp1 ^ in2;
+        out2 = tmp1 ^ in7;
+        out3 = out2 ^ in3;
+        out5 = out3 ^ in2;
+        tmp2 = out3 ^ in0;
+        out4 = tmp2 ^ in4;
+        out7 = tmp0 ^ out4;
+        out6 = tmp2 ^ out1 ^ in6;
+        out0 = out2 ^ out6 ^ in4;
+        /* XOR each result with the matching word of in and store to out. */
+        out_ptr[0] = out0 ^ in_ptr[0];
+        out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH];
+        out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2];
+        out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3];
+        out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4];
+        out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5];
+        out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6];
+        out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7];
+        /* Advance both buffers by one word to the next column. */
+        in_ptr++;
+        out_ptr++;
+    }
+}
+
+static void
+gf8_muladd_D2(void *out, void *in)
+{
+    unsigned int i; /* index of the 64-bit word column being processed */
+    uint64_t *in_ptr = (uint64_t *)in;   /* only read in this function */
+    uint64_t *out_ptr = (uint64_t *)out; /* read, then overwritten in place */
+    /* Presumably out = 0xD2 * out + in over GF(2^8), per the name; confirm. */
+    for (i = 0; i < WIDTH; i++) { /* covers 8 * WIDTH words of each buffer */
+        uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+        uint64_t tmp0;
+        /* Inputs are taken from out, one word from each WIDTH-word row. */
+        uint64_t in0 = out_ptr[0];
+        uint64_t in1 = out_ptr[WIDTH];
+        uint64_t in2 = out_ptr[WIDTH * 2];
+        uint64_t in3 = out_ptr[WIDTH * 3];
+        uint64_t in4 = out_ptr[WIDTH * 4];
+        uint64_t in5 = out_ptr[WIDTH * 5];
+        uint64_t in6 = out_ptr[WIDTH * 6];
+        uint64_t in7 = out_ptr[WIDTH * 7];
+        /* XOR network; later lines reuse earlier results, so order matters. */
+        tmp0 = in5 ^ in6;
+        out2 = tmp0 ^ in2 ^ in3;
+        out1 = out2 ^ in0;
+        out3 = out2 ^ in1;
+        out4 = out1 ^ in1 ^ in2;
+        out6 = out1 ^ in6 ^ in7;
+        out7 = out4 ^ in4 ^ in5;
+        out5 = out4 ^ out6 ^ in4;
+        out0 = tmp0 ^ out5;
+        /* XOR each result with the matching word of in and store to out. */
+        out_ptr[0] = out0 ^ in_ptr[0];
+        out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH];
+        out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2];
+        out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3];
+        out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4];
+        out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5];
+        out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6];
+        out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7];
+        /* Advance both buffers by one word to the next column. */
+        in_ptr++;
+        out_ptr++;
+    }
+}
+
+static void
+gf8_muladd_D3(void *out, void *in)
+{
+    unsigned int i; /* index of the 64-bit word column being processed */
+    uint64_t *in_ptr = (uint64_t *)in;   /* only read in this function */
+    uint64_t *out_ptr = (uint64_t *)out; /* read, then overwritten in place */
+    /* Presumably out = 0xD3 * out + in over GF(2^8), per the name; confirm. */
+    for (i = 0; i < WIDTH; i++) { /* covers 8 * WIDTH words of each buffer */
+        uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+        uint64_t tmp0, tmp1, tmp2, tmp3;
+        /* Inputs are taken from out, one word from each WIDTH-word row. */
+        uint64_t in0 = out_ptr[0];
+        uint64_t in1 = out_ptr[WIDTH];
+        uint64_t in2 = out_ptr[WIDTH * 2];
+        uint64_t in3 = out_ptr[WIDTH * 3];
+        uint64_t in4 = out_ptr[WIDTH * 4];
+        uint64_t in5 = out_ptr[WIDTH * 5];
+        uint64_t in6 = out_ptr[WIDTH * 6];
+        uint64_t in7 = out_ptr[WIDTH * 7];
+        /* XOR network; later lines reuse earlier results, so order matters. */
+        out2 = in3 ^ in5 ^ in6;
+        tmp0 = out2 ^ in2;
+        tmp1 = tmp0 ^ in1;
+        out1 = tmp1 ^ in0;
+        out3 = tmp1 ^ in3;
+        out4 = out1 ^ in2 ^ in4;
+        tmp2 = out4 ^ in5;
+        out7 = tmp2 ^ in7;
+        out0 = tmp0 ^ out7;
+        tmp3 = out0 ^ in0;
+        out5 = tmp3 ^ in6;
+        out6 = tmp2 ^ tmp3;
+        /* XOR each result with the matching word of in and store to out. */
+        out_ptr[0] = out0 ^ in_ptr[0];
+        out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH];
+        out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2];
+        out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3];
+        out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4];
+        out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5];
+        out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6];
+        out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7];
+        /* Advance both buffers by one word to the next column. */
+        in_ptr++;
+        out_ptr++;
+    }
+}
+
+static void
+gf8_muladd_D4(void *out, void *in)
+{
+    unsigned int i; /* index of the 64-bit word column being processed */
+    uint64_t *in_ptr = (uint64_t *)in;   /* only read in this function */
+    uint64_t *out_ptr = (uint64_t *)out; /* read, then overwritten in place */
+    /* Presumably out = 0xD4 * out + in over GF(2^8), per the name; confirm. */
+    for (i = 0; i < WIDTH; i++) { /* covers 8 * WIDTH words of each buffer */
+        uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+        uint64_t tmp0, tmp1, tmp2;
+        /* Inputs are taken from out, one word from each WIDTH-word row. */
+        uint64_t in0 = out_ptr[0];
+        uint64_t in1 = out_ptr[WIDTH];
+        uint64_t in2 = out_ptr[WIDTH * 2];
+        uint64_t in3 = out_ptr[WIDTH * 3];
+        uint64_t in4 = out_ptr[WIDTH * 4];
+        uint64_t in5 = out_ptr[WIDTH * 5];
+        uint64_t in6 = out_ptr[WIDTH * 6];
+        uint64_t in7 = out_ptr[WIDTH * 7];
+        /* XOR network; later lines reuse earlier results, so order matters. */
+        out3 = in3 ^ in5;
+        tmp0 = in1 ^ in5;
+        tmp1 = tmp0 ^ in2;
+        out4 = tmp1 ^ in0;
+        tmp2 = tmp1 ^ in6;
+        out2 = out4 ^ in3 ^ in7;
+        out0 = tmp2 ^ in4;
+        out5 = tmp2 ^ out3;
+        out1 = tmp0 ^ out5 ^ in7;
+        out6 = tmp0 ^ out2 ^ in4;
+        out7 = tmp1 ^ out6 ^ in7;
+        /* XOR each result with the matching word of in and store to out. */
+        out_ptr[0] = out0 ^ in_ptr[0];
+        out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH];
+        out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2];
+        out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3];
+        out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4];
+        out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5];
+        out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6];
+        out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7];
+        /* Advance both buffers by one word to the next column. */
+        in_ptr++;
+        out_ptr++;
+    }
+}
+
+static void
+gf8_muladd_D5(void *out, void *in)
+{
+    unsigned int i; /* index of the 64-bit word column being processed */
+    uint64_t *in_ptr = (uint64_t *)in;   /* only read in this function */
+    uint64_t *out_ptr = (uint64_t *)out; /* read, then overwritten in place */
+    /* Presumably out = 0xD5 * out + in over GF(2^8), per the name; confirm. */
+    for (i = 0; i < WIDTH; i++) { /* covers 8 * WIDTH words of each buffer */
+        uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+        uint64_t tmp0, tmp1, tmp2;
+        /* Inputs are taken from out, one word from each WIDTH-word row. */
+        uint64_t in0 = out_ptr[0];
+        uint64_t in1 = out_ptr[WIDTH];
+        uint64_t in2 = out_ptr[WIDTH * 2];
+        uint64_t in3 = out_ptr[WIDTH * 3];
+        uint64_t in4 = out_ptr[WIDTH * 4];
+        uint64_t in5 = out_ptr[WIDTH * 5];
+        uint64_t in6 = out_ptr[WIDTH * 6];
+        uint64_t in7 = out_ptr[WIDTH * 7];
+        /* XOR network; later lines reuse earlier results, so order matters. */
+        out3 = in5; /* plain copy: this output needs no XOR of inputs */
+        tmp0 = in0 ^ in4;
+        tmp1 = tmp0 ^ in1 ^ in5;
+        out4 = tmp1 ^ in2;
+        out0 = out4 ^ in6;
+        tmp2 = tmp0 ^ out0;
+        out5 = tmp2 ^ in3;
+        out1 = out5 ^ in7;
+        out6 = tmp1 ^ out1;
+        out7 = tmp2 ^ out6;
+        out2 = out7 ^ in4;
+        /* XOR each result with the matching word of in and store to out. */
+        out_ptr[0] = out0 ^ in_ptr[0];
+        out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH];
+        out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2];
+        out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3];
+        out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4];
+        out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5];
+        out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6];
+        out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7];
+        /* Advance both buffers by one word to the next column. */
+        in_ptr++;
+        out_ptr++;
+    }
+}
+
+static void
+gf8_muladd_D6(void *out, void *in)
+{
+    unsigned int i; /* index of the 64-bit word column being processed */
+    uint64_t *in_ptr = (uint64_t *)in;   /* only read in this function */
+    uint64_t *out_ptr = (uint64_t *)out; /* read, then overwritten in place */
+    /* Presumably out = 0xD6 * out + in over GF(2^8), per the name; confirm. */
+    for (i = 0; i < WIDTH; i++) { /* covers 8 * WIDTH words of each buffer */
+        uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+        uint64_t tmp0, tmp1;
+        /* Inputs are taken from out, one word from each WIDTH-word row. */
+        uint64_t in0 = out_ptr[0];
+        uint64_t in1 = out_ptr[WIDTH];
+        uint64_t in2 = out_ptr[WIDTH * 2];
+        uint64_t in3 = out_ptr[WIDTH * 3];
+        uint64_t in4 = out_ptr[WIDTH * 4];
+        uint64_t in5 = out_ptr[WIDTH * 5];
+        uint64_t in6 = out_ptr[WIDTH * 6];
+        uint64_t in7 = out_ptr[WIDTH * 7];
+        /* XOR network; later lines reuse earlier results, so order matters. */
+        tmp0 = in1 ^ in2 ^ in4 ^ in6;
+        out5 = tmp0 ^ in3;
+        out0 = tmp0 ^ in5 ^ in7;
+        out3 = out0 ^ out5 ^ in2;
+        tmp1 = out3 ^ in0;
+        out1 = tmp1 ^ in6;
+        out2 = tmp1 ^ in7;
+        out4 = tmp1 ^ in1;
+        out6 = tmp1 ^ in4;
+        out7 = tmp0 ^ out2;
+        /* XOR each result with the matching word of in and store to out. */
+        out_ptr[0] = out0 ^ in_ptr[0];
+        out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH];
+        out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2];
+        out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3];
+        out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4];
+        out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5];
+        out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6];
+        out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7];
+        /* Advance both buffers by one word to the next column. */
+        in_ptr++;
+        out_ptr++;
+    }
+}
+
+static void
+gf8_muladd_D7(void *out, void *in)
+{
+    unsigned int i; /* index of the 64-bit word column being processed */
+    uint64_t *in_ptr = (uint64_t *)in;   /* only read in this function */
+    uint64_t *out_ptr = (uint64_t *)out; /* read, then overwritten in place */
+    /* Presumably out = 0xD7 * out + in over GF(2^8), per the name; confirm. */
+    for (i = 0; i < WIDTH; i++) { /* covers 8 * WIDTH words of each buffer */
+        uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+        uint64_t tmp0, tmp1, tmp2;
+        /* Inputs are taken from out, one word from each WIDTH-word row. */
+        uint64_t in0 = out_ptr[0];
+        uint64_t in1 = out_ptr[WIDTH];
+        uint64_t in2 = out_ptr[WIDTH * 2];
+        uint64_t in3 = out_ptr[WIDTH * 3];
+        uint64_t in4 = out_ptr[WIDTH * 4];
+        uint64_t in5 = out_ptr[WIDTH * 5];
+        uint64_t in6 = out_ptr[WIDTH * 6];
+        uint64_t in7 = out_ptr[WIDTH * 7];
+        /* XOR network; later lines reuse earlier results, so order matters. */
+        tmp0 = in0 ^ in3;
+        out3 = in2 ^ in5 ^ in7;
+        out2 = tmp0 ^ in5;
+        tmp1 = tmp0 ^ out3 ^ in1;
+        out1 = tmp1 ^ in6;
+        out4 = tmp1 ^ in4;
+        tmp2 = out1 ^ in4;
+        out6 = tmp2 ^ in1;
+        out7 = tmp2 ^ in2;
+        out0 = tmp2 ^ in3;
+        out5 = tmp2 ^ in0 ^ in7;
+        /* XOR each result with the matching word of in and store to out. */
+        out_ptr[0] = out0 ^ in_ptr[0];
+        out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH];
+        out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2];
+        out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3];
+        out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4];
+        out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5];
+        out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6];
+        out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7];
+        /* Advance both buffers by one word to the next column. */
+        in_ptr++;
+        out_ptr++;
+    }
+}
+
+static void
+gf8_muladd_D8(void *out, void *in)
+{
+    unsigned int i; /* index of the 64-bit word column being processed */
+    uint64_t *in_ptr = (uint64_t *)in;   /* only read in this function */
+    uint64_t *out_ptr = (uint64_t *)out; /* read, then overwritten in place */
+    /* Presumably out = 0xD8 * out + in over GF(2^8), per the name; confirm. */
+    for (i = 0; i < WIDTH; i++) { /* covers 8 * WIDTH words of each buffer */
+        uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+        uint64_t tmp0, tmp1;
+        /* Inputs are taken from out, one word from each WIDTH-word row. */
+        uint64_t in0 = out_ptr[0];
+        uint64_t in1 = out_ptr[WIDTH];
+        uint64_t in2 = out_ptr[WIDTH * 2];
+        uint64_t in3 = out_ptr[WIDTH * 3];
+        uint64_t in4 = out_ptr[WIDTH * 4];
+        uint64_t in5 = out_ptr[WIDTH * 5];
+        uint64_t in6 = out_ptr[WIDTH * 6];
+        uint64_t in7 = out_ptr[WIDTH * 7];
+        /* XOR network; later lines reuse earlier results, so order matters. */
+        out4 = in0; /* plain copy: this output needs no XOR of inputs */
+        out5 = in1; /* plain copy: this output needs no XOR of inputs */
+        tmp0 = in1 ^ in2;
+        out6 = in0 ^ in2;
+        out0 = tmp0 ^ in4;
+        tmp1 = tmp0 ^ in3;
+        out7 = tmp1 ^ out6;
+        out2 = tmp1 ^ in6;
+        out3 = out7 ^ in7;
+        out1 = tmp1 ^ in1 ^ in5;
+        /* XOR each result with the matching word of in and store to out. */
+        out_ptr[0] = out0 ^ in_ptr[0];
+        out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH];
+        out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2];
+        out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3];
+        out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4];
+        out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5];
+        out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6];
+        out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7];
+        /* Advance both buffers by one word to the next column. */
+        in_ptr++;
+        out_ptr++;
+    }
+}
+
+static void
+gf8_muladd_D9(void *out, void *in)
+{
+    unsigned int i; /* index of the 64-bit word column being processed */
+    uint64_t *in_ptr = (uint64_t *)in;   /* only read in this function */
+    uint64_t *out_ptr = (uint64_t *)out; /* read, then overwritten in place */
+    /* Presumably out = 0xD9 * out + in over GF(2^8), per the name; confirm. */
+    for (i = 0; i < WIDTH; i++) { /* covers 8 * WIDTH words of each buffer */
+        uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+        /* Inputs are taken from out, one word from each WIDTH-word row. */
+        uint64_t in0 = out_ptr[0];
+        uint64_t in1 = out_ptr[WIDTH];
+        uint64_t in2 = out_ptr[WIDTH * 2];
+        uint64_t in3 = out_ptr[WIDTH * 3];
+        uint64_t in4 = out_ptr[WIDTH * 4];
+        uint64_t in5 = out_ptr[WIDTH * 5];
+        uint64_t in6 = out_ptr[WIDTH * 6];
+        uint64_t in7 = out_ptr[WIDTH * 7];
+        /* XOR network; later lines reuse earlier results, so order matters. */
+        out4 = in0 ^ in4;
+        out5 = in1 ^ in5;
+        out2 = in1 ^ in3 ^ in6;
+        out3 = in0 ^ in1 ^ in7;
+        out6 = in0 ^ in2 ^ in6;
+        out0 = out4 ^ in1 ^ in2;
+        out1 = out5 ^ in2 ^ in3;
+        out7 = out3 ^ in3;
+        /* XOR each result with the matching word of in and store to out. */
+        out_ptr[0] = out0 ^ in_ptr[0];
+        out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH];
+        out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2];
+        out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3];
+        out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4];
+        out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5];
+        out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6];
+        out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7];
+        /* Advance both buffers by one word to the next column. */
+        in_ptr++;
+        out_ptr++;
+    }
+}
+
+static void
+gf8_muladd_DA(void *out, void *in)
+{
+    unsigned int i; /* index of the 64-bit word column being processed */
+    uint64_t *in_ptr = (uint64_t *)in;   /* only read in this function */
+    uint64_t *out_ptr = (uint64_t *)out; /* read, then overwritten in place */
+    /* Presumably out = 0xDA * out + in over GF(2^8), per the name; confirm. */
+    for (i = 0; i < WIDTH; i++) { /* covers 8 * WIDTH words of each buffer */
+        uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+        uint64_t tmp0, tmp1;
+        /* Inputs are taken from out, one word from each WIDTH-word row. */
+        uint64_t in0 = out_ptr[0];
+        uint64_t in1 = out_ptr[WIDTH];
+        uint64_t in2 = out_ptr[WIDTH * 2];
+        uint64_t in3 = out_ptr[WIDTH * 3];
+        uint64_t in4 = out_ptr[WIDTH * 4];
+        uint64_t in5 = out_ptr[WIDTH * 5];
+        uint64_t in6 = out_ptr[WIDTH * 6];
+        uint64_t in7 = out_ptr[WIDTH * 7];
+        /* XOR network; later lines reuse earlier results, so order matters. */
+        out5 = in1 ^ in4;
+        tmp0 = in2 ^ in7;
+        tmp1 = in0 ^ in2 ^ in3;
+        out0 = tmp0 ^ out5;
+        out4 = tmp0 ^ tmp1;
+        out2 = tmp0 ^ in3 ^ in6;
+        out1 = tmp1 ^ in5;
+        out3 = tmp1 ^ in1;
+        out6 = out1 ^ in3;
+        out7 = out3 ^ in2 ^ in6;
+        /* XOR each result with the matching word of in and store to out. */
+        out_ptr[0] = out0 ^ in_ptr[0];
+        out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH];
+        out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2];
+        out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3];
+        out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4];
+        out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5];
+        out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6];
+        out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7];
+        /* Advance both buffers by one word to the next column. */
+        in_ptr++;
+        out_ptr++;
+    }
+}
+
+static void
+gf8_muladd_DB(void *out, void *in)
+{
+    unsigned int i; /* index of the 64-bit word column being processed */
+    uint64_t *in_ptr = (uint64_t *)in;   /* only read in this function */
+    uint64_t *out_ptr = (uint64_t *)out; /* read, then overwritten in place */
+    /* Presumably out = 0xDB * out + in over GF(2^8), per the name; confirm. */
+    for (i = 0; i < WIDTH; i++) { /* covers 8 * WIDTH words of each buffer */
+        uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+        uint64_t tmp0, tmp1, tmp2, tmp3, tmp4;
+        /* Inputs are taken from out, one word from each WIDTH-word row. */
+        uint64_t in0 = out_ptr[0];
+        uint64_t in1 = out_ptr[WIDTH];
+        uint64_t in2 = out_ptr[WIDTH * 2];
+        uint64_t in3 = out_ptr[WIDTH * 3];
+        uint64_t in4 = out_ptr[WIDTH * 4];
+        uint64_t in5 = out_ptr[WIDTH * 5];
+        uint64_t in6 = out_ptr[WIDTH * 6];
+        uint64_t in7 = out_ptr[WIDTH * 7];
+        /* XOR network; later lines reuse earlier results, so order matters. */
+        tmp0 = in0 ^ in1;
+        tmp1 = in1 ^ in5;
+        tmp2 = in3 ^ in7;
+        out3 = tmp0 ^ in2;
+        out5 = tmp1 ^ in4;
+        out6 = tmp1 ^ out3 ^ in6;
+        out2 = tmp2 ^ in6;
+        tmp3 = tmp2 ^ in4;
+        tmp4 = out3 ^ in3;
+        out4 = tmp3 ^ in0;
+        out1 = tmp4 ^ in5;
+        out0 = tmp3 ^ tmp4;
+        out7 = tmp0 ^ out2;
+        /* XOR each result with the matching word of in and store to out. */
+        out_ptr[0] = out0 ^ in_ptr[0];
+        out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH];
+        out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2];
+        out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3];
+        out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4];
+        out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5];
+        out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6];
+        out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7];
+        /* Advance both buffers by one word to the next column. */
+        in_ptr++;
+        out_ptr++;
+    }
+}
+
+static void
+gf8_muladd_DC(void *out, void *in)
+{
+    unsigned int i; /* index of the 64-bit word column being processed */
+    uint64_t *in_ptr = (uint64_t *)in;   /* only read in this function */
+    uint64_t *out_ptr = (uint64_t *)out; /* read, then overwritten in place */
+    /* Presumably out = 0xDC * out + in over GF(2^8), per the name; confirm. */
+    for (i = 0; i < WIDTH; i++) { /* covers 8 * WIDTH words of each buffer */
+        uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+        uint64_t tmp0, tmp1, tmp2, tmp3;
+        /* Inputs are taken from out, one word from each WIDTH-word row. */
+        uint64_t in0 = out_ptr[0];
+        uint64_t in1 = out_ptr[WIDTH];
+        uint64_t in2 = out_ptr[WIDTH * 2];
+        uint64_t in3 = out_ptr[WIDTH * 3];
+        uint64_t in4 = out_ptr[WIDTH * 4];
+        uint64_t in5 = out_ptr[WIDTH * 5];
+        uint64_t in6 = out_ptr[WIDTH * 6];
+        uint64_t in7 = out_ptr[WIDTH * 7];
+        /* XOR network; later lines reuse earlier results, so order matters. */
+        tmp0 = in0 ^ in2;
+        tmp1 = in0 ^ in3;
+        out6 = tmp0 ^ in4;
+        tmp2 = tmp0 ^ in7;
+        out3 = tmp1 ^ in6;
+        tmp3 = tmp1 ^ in1;
+        out1 = tmp1 ^ tmp2 ^ in5;
+        out4 = tmp2 ^ in6;
+        out2 = tmp3 ^ in2;
+        out7 = tmp3 ^ in5;
+        out5 = tmp2 ^ out2;
+        out0 = out2 ^ out3 ^ in4;
+        /* XOR each result with the matching word of in and store to out. */
+        out_ptr[0] = out0 ^ in_ptr[0];
+        out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH];
+        out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2];
+        out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3];
+        out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4];
+        out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5];
+        out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6];
+        out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7];
+        /* Advance both buffers by one word to the next column. */
+        in_ptr++;
+        out_ptr++;
+    }
+}
+
+static void
+gf8_muladd_DD(void *out, void *in)
+{
+    unsigned int i; /* index of the 64-bit word column being processed */
+    uint64_t *in_ptr = (uint64_t *)in;   /* only read in this function */
+    uint64_t *out_ptr = (uint64_t *)out; /* read, then overwritten in place */
+    /* Presumably out = 0xDD * out + in over GF(2^8), per the name; confirm. */
+    for (i = 0; i < WIDTH; i++) { /* covers 8 * WIDTH words of each buffer */
+        uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+        /* Inputs are taken from out, one word from each WIDTH-word row. */
+        uint64_t in0 = out_ptr[0];
+        uint64_t in1 = out_ptr[WIDTH];
+        uint64_t in2 = out_ptr[WIDTH * 2];
+        uint64_t in3 = out_ptr[WIDTH * 3];
+        uint64_t in4 = out_ptr[WIDTH * 4];
+        uint64_t in5 = out_ptr[WIDTH * 5];
+        uint64_t in6 = out_ptr[WIDTH * 6];
+        uint64_t in7 = out_ptr[WIDTH * 7];
+        /* XOR network; later lines reuse earlier results, so order matters. */
+        out3 = in0 ^ in6;
+        out2 = in0 ^ in1 ^ in3;
+        out6 = out3 ^ in2 ^ in4;
+        out7 = out2 ^ in5 ^ in7;
+        out0 = out6 ^ in1;
+        out4 = out6 ^ in7;
+        out5 = out7 ^ in0;
+        out1 = out5 ^ in2;
+        /* XOR each result with the matching word of in and store to out. */
+        out_ptr[0] = out0 ^ in_ptr[0];
+        out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH];
+        out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2];
+        out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3];
+        out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4];
+        out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5];
+        out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6];
+        out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7];
+        /* Advance both buffers by one word to the next column. */
+        in_ptr++;
+        out_ptr++;
+    }
+}
+
+static void
+gf8_muladd_DE(void *out, void *in)
+{
+    unsigned int i; /* index of the 64-bit word column being processed */
+    uint64_t *in_ptr = (uint64_t *)in;   /* only read in this function */
+    uint64_t *out_ptr = (uint64_t *)out; /* read, then overwritten in place */
+    /* Presumably out = 0xDE * out + in over GF(2^8), per the name; confirm. */
+    for (i = 0; i < WIDTH; i++) { /* covers 8 * WIDTH words of each buffer */
+        uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+        uint64_t tmp0, tmp1;
+        /* Inputs are taken from out, one word from each WIDTH-word row. */
+        uint64_t in0 = out_ptr[0];
+        uint64_t in1 = out_ptr[WIDTH];
+        uint64_t in2 = out_ptr[WIDTH * 2];
+        uint64_t in3 = out_ptr[WIDTH * 3];
+        uint64_t in4 = out_ptr[WIDTH * 4];
+        uint64_t in5 = out_ptr[WIDTH * 5];
+        uint64_t in6 = out_ptr[WIDTH * 6];
+        uint64_t in7 = out_ptr[WIDTH * 7];
+        /* XOR network; later lines reuse earlier results, so order matters. */
+        tmp0 = in2 ^ in3 ^ in6;
+        tmp1 = in3 ^ in4 ^ in7;
+        out4 = tmp0 ^ in0;
+        out5 = tmp1 ^ in1;
+        out3 = out4 ^ in7;
+        out2 = out3 ^ in6;
+        out1 = out2 ^ in5;
+        out6 = tmp1 ^ out1;
+        out0 = tmp0 ^ out5;
+        out7 = out0 ^ out1 ^ in4;
+        /* XOR each result with the matching word of in and store to out. */
+        out_ptr[0] = out0 ^ in_ptr[0];
+        out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH];
+        out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2];
+        out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3];
+        out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4];
+        out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5];
+        out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6];
+        out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7];
+        /* Advance both buffers by one word to the next column. */
+        in_ptr++;
+        out_ptr++;
+    }
+}
+
+static void
+gf8_muladd_DF(void *out, void *in)
+{
+    unsigned int i; /* index of the 64-bit word column being processed */
+    uint64_t *in_ptr = (uint64_t *)in;   /* only read in this function */
+    uint64_t *out_ptr = (uint64_t *)out; /* read, then overwritten in place */
+    /* Presumably out = 0xDF * out + in over GF(2^8), per the name; confirm. */
+    for (i = 0; i < WIDTH; i++) { /* covers 8 * WIDTH words of each buffer */
+        uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+        uint64_t tmp0, tmp1, tmp2;
+        /* Inputs are taken from out, one word from each WIDTH-word row. */
+        uint64_t in0 = out_ptr[0];
+        uint64_t in1 = out_ptr[WIDTH];
+        uint64_t in2 = out_ptr[WIDTH * 2];
+        uint64_t in3 = out_ptr[WIDTH * 3];
+        uint64_t in4 = out_ptr[WIDTH * 4];
+        uint64_t in5 = out_ptr[WIDTH * 5];
+        uint64_t in6 = out_ptr[WIDTH * 6];
+        uint64_t in7 = out_ptr[WIDTH * 7];
+        /* XOR network; later lines reuse earlier results, so order matters. */
+        out2 = in0 ^ in3 ^ in7;
+        tmp0 = out2 ^ in1 ^ in5;
+        out1 = tmp0 ^ in2;
+        out7 = tmp0 ^ in6;
+        out5 = tmp0 ^ in0 ^ in4;
+        tmp1 = out1 ^ out5 ^ in6;
+        out4 = tmp1 ^ in3;
+        out6 = tmp1 ^ in5;
+        tmp2 = tmp1 ^ in7;
+        out0 = tmp2 ^ in1;
+        out3 = tmp2 ^ in4;
+        /* XOR each result with the matching word of in and store to out. */
+        out_ptr[0] = out0 ^ in_ptr[0];
+        out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH];
+        out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2];
+        out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3];
+        out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4];
+        out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5];
+        out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6];
+        out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7];
+        /* Advance both buffers by one word to the next column. */
+        in_ptr++;
+        out_ptr++;
+    }
+}
+
+static void
+gf8_muladd_E0(void *out, void *in)
+{
+    unsigned int i; /* index of the 64-bit word column being processed */
+    uint64_t *in_ptr = (uint64_t *)in;   /* only read in this function */
+    uint64_t *out_ptr = (uint64_t *)out; /* read, then overwritten in place */
+    /* Presumably out = 0xE0 * out + in over GF(2^8), per the name; confirm. */
+    for (i = 0; i < WIDTH; i++) { /* covers 8 * WIDTH words of each buffer */
+        uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+        uint64_t tmp0, tmp1;
+        /* Inputs are taken from out, one word from each WIDTH-word row. */
+        uint64_t in0 = out_ptr[0];
+        uint64_t in1 = out_ptr[WIDTH];
+        uint64_t in2 = out_ptr[WIDTH * 2];
+        uint64_t in3 = out_ptr[WIDTH * 3];
+        uint64_t in4 = out_ptr[WIDTH * 4];
+        uint64_t in5 = out_ptr[WIDTH * 5];
+        uint64_t in6 = out_ptr[WIDTH * 6];
+        uint64_t in7 = out_ptr[WIDTH * 7];
+        /* XOR network; later lines reuse earlier results, so order matters. */
+        out3 = in1 ^ in7;
+        tmp0 = in2 ^ in4;
+        out4 = out3 ^ in3 ^ in5;
+        out2 = tmp0 ^ in1;
+        tmp1 = tmp0 ^ in6;
+        out0 = out4 ^ in2;
+        out6 = out4 ^ in0;
+        out1 = tmp1 ^ in3;
+        out5 = tmp1 ^ in0;
+        out7 = out5 ^ in1;
+        /* XOR each result with the matching word of in and store to out. */
+        out_ptr[0] = out0 ^ in_ptr[0];
+        out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH];
+        out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2];
+        out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3];
+        out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4];
+        out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5];
+        out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6];
+        out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7];
+        /* Advance both buffers by one word to the next column. */
+        in_ptr++;
+        out_ptr++;
+    }
+}
+
+static void
+gf8_muladd_E1(void *out, void *in)
+{
+    unsigned int i; /* index of the 64-bit word column being processed */
+    uint64_t *in_ptr = (uint64_t *)in;   /* only read in this function */
+    uint64_t *out_ptr = (uint64_t *)out; /* read, then overwritten in place */
+    /* Presumably out = 0xE1 * out + in over GF(2^8), per the name; confirm. */
+    for (i = 0; i < WIDTH; i++) { /* covers 8 * WIDTH words of each buffer */
+        uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+        uint64_t tmp0, tmp1, tmp2, tmp3;
+        /* Inputs are taken from out, one word from each WIDTH-word row. */
+        uint64_t in0 = out_ptr[0];
+        uint64_t in1 = out_ptr[WIDTH];
+        uint64_t in2 = out_ptr[WIDTH * 2];
+        uint64_t in3 = out_ptr[WIDTH * 3];
+        uint64_t in4 = out_ptr[WIDTH * 4];
+        uint64_t in5 = out_ptr[WIDTH * 5];
+        uint64_t in6 = out_ptr[WIDTH * 6];
+        uint64_t in7 = out_ptr[WIDTH * 7];
+        /* XOR network; later lines reuse earlier results, so order matters. */
+        out2 = in1 ^ in4;
+        tmp0 = in1 ^ in7;
+        out3 = tmp0 ^ in3;
+        tmp1 = out3 ^ in5;
+        out4 = tmp1 ^ in4;
+        tmp2 = tmp1 ^ in0;
+        out0 = tmp2 ^ in2;
+        out6 = tmp2 ^ in6;
+        tmp3 = out0 ^ out4 ^ in6;
+        out5 = tmp3 ^ in5;
+        out7 = tmp0 ^ tmp3;
+        out1 = tmp2 ^ out5 ^ in7;
+        /* XOR each result with the matching word of in and store to out. */
+        out_ptr[0] = out0 ^ in_ptr[0];
+        out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH];
+        out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2];
+        out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3];
+        out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4];
+        out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5];
+        out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6];
+        out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7];
+        /* Advance both buffers by one word to the next column. */
+        in_ptr++;
+        out_ptr++;
+    }
+}
+
+static void
+gf8_muladd_E2(void *out, void *in)
+{
+    unsigned int i; /* index of the 64-bit word column being processed */
+    uint64_t *in_ptr = (uint64_t *)in;   /* only read in this function */
+    uint64_t *out_ptr = (uint64_t *)out; /* read, then overwritten in place */
+    /* Presumably out = 0xE2 * out + in over GF(2^8), per the name; confirm. */
+    for (i = 0; i < WIDTH; i++) { /* covers 8 * WIDTH words of each buffer */
+        uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+        /* Inputs are taken from out, one word from each WIDTH-word row. */
+        uint64_t in0 = out_ptr[0];
+        uint64_t in1 = out_ptr[WIDTH];
+        uint64_t in2 = out_ptr[WIDTH * 2];
+        uint64_t in3 = out_ptr[WIDTH * 3];
+        uint64_t in4 = out_ptr[WIDTH * 4];
+        uint64_t in5 = out_ptr[WIDTH * 5];
+        uint64_t in6 = out_ptr[WIDTH * 6];
+        uint64_t in7 = out_ptr[WIDTH * 7];
+        /* XOR network; later lines reuse earlier results, so order matters. */
+        out3 = in1 ^ in2;
+        out4 = in1 ^ in5;
+        out2 = in2 ^ in4 ^ in7;
+        out5 = in0 ^ in2 ^ in6;
+        out0 = out3 ^ in3 ^ in5;
+        out7 = out3 ^ in0 ^ in4;
+        out6 = out2 ^ out7 ^ in3;
+        out1 = out5 ^ in3 ^ in4;
+        /* XOR each result with the matching word of in and store to out. */
+        out_ptr[0] = out0 ^ in_ptr[0];
+        out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH];
+        out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2];
+        out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3];
+        out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4];
+        out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5];
+        out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6];
+        out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7];
+        /* Advance both buffers by one word to the next column. */
+        in_ptr++;
+        out_ptr++;
+    }
+}
+
+static void
+gf8_muladd_E3(void *out, void *in)
+{
+    unsigned int i;
+    uint64_t *in_ptr = (uint64_t *)in;   /* only read; 8 * WIDTH words */
+    uint64_t *out_ptr = (uint64_t *)out; /* read, then overwritten */
+    /* Updates out in place: out = N(out) ^ in, N = the XOR network below. */
+    for (i = 0; i < WIDTH; i++) {
+        uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+        uint64_t tmp0, tmp1, tmp2, tmp3, tmp4;
+        /* Inputs: the eight current out words, spaced WIDTH words apart. */
+        uint64_t in0 = out_ptr[0];
+        uint64_t in1 = out_ptr[WIDTH];
+        uint64_t in2 = out_ptr[WIDTH * 2];
+        uint64_t in3 = out_ptr[WIDTH * 3];
+        uint64_t in4 = out_ptr[WIDTH * 4];
+        uint64_t in5 = out_ptr[WIDTH * 5];
+        uint64_t in6 = out_ptr[WIDTH * 6];
+        uint64_t in7 = out_ptr[WIDTH * 7];
+        /* XOR network; presumably GF(2^8) multiply by 0xE3 (TODO confirm). */
+        out2 = in4 ^ in7;
+        tmp0 = in1 ^ in3;
+        out3 = tmp0 ^ in2;
+        tmp1 = out3 ^ in0;
+        out0 = tmp1 ^ in5;
+        tmp2 = tmp1 ^ in4;
+        out1 = tmp2 ^ in6;
+        tmp3 = tmp2 ^ in3;
+        out7 = tmp3 ^ in7;
+        out6 = out1 ^ out2 ^ in2;
+        tmp4 = tmp0 ^ out0;
+        out5 = tmp4 ^ in6;
+        out4 = tmp3 ^ tmp4;
+        /* Store each result XORed with the matching word of in. */
+        out_ptr[0] = out0 ^ in_ptr[0];
+        out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH];
+        out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2];
+        out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3];
+        out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4];
+        out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5];
+        out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6];
+        out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7];
+
+        in_ptr++;
+        out_ptr++;
+    }
+}
+
+static void
+gf8_muladd_E4(void *out, void *in)
+{
+    unsigned int i;
+    uint64_t *in_ptr = (uint64_t *)in;   /* only read; 8 * WIDTH words */
+    uint64_t *out_ptr = (uint64_t *)out; /* read, then overwritten */
+    /* Updates out in place: out = N(out) ^ in, N = the XOR network below. */
+    for (i = 0; i < WIDTH; i++) {
+        uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+        uint64_t tmp0, tmp1, tmp2;
+        /* Inputs: the eight current out words, spaced WIDTH words apart. */
+        uint64_t in0 = out_ptr[0];
+        uint64_t in1 = out_ptr[WIDTH];
+        uint64_t in2 = out_ptr[WIDTH * 2];
+        uint64_t in3 = out_ptr[WIDTH * 3];
+        uint64_t in4 = out_ptr[WIDTH * 4];
+        uint64_t in5 = out_ptr[WIDTH * 5];
+        uint64_t in6 = out_ptr[WIDTH * 6];
+        uint64_t in7 = out_ptr[WIDTH * 7];
+        /* XOR network; presumably GF(2^8) multiply by 0xE4 (TODO confirm). */
+        out3 = in6;
+        tmp0 = in0 ^ in4;
+        tmp1 = tmp0 ^ in2 ^ in6;
+        out2 = tmp1 ^ in1;
+        out7 = out2 ^ in5;
+        tmp2 = tmp0 ^ out7;
+        out4 = tmp2 ^ in3;
+        out0 = out4 ^ in7;
+        out6 = tmp1 ^ out0;
+        out5 = tmp2 ^ out6;
+        out1 = out5 ^ in0;
+        /* Store each result XORed with the matching word of in. */
+        out_ptr[0] = out0 ^ in_ptr[0];
+        out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH];
+        out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2];
+        out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3];
+        out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4];
+        out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5];
+        out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6];
+        out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7];
+
+        in_ptr++;
+        out_ptr++;
+    }
+}
+
+static void
+gf8_muladd_E5(void *out, void *in)
+{
+    unsigned int i;
+    uint64_t *in_ptr = (uint64_t *)in;   /* only read; 8 * WIDTH words */
+    uint64_t *out_ptr = (uint64_t *)out; /* read, then overwritten */
+    /* Updates out in place: out = N(out) ^ in, N = the XOR network below. */
+    for (i = 0; i < WIDTH; i++) {
+        uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+        uint64_t tmp0, tmp1, tmp2;
+        /* Inputs: the eight current out words, spaced WIDTH words apart. */
+        uint64_t in0 = out_ptr[0];
+        uint64_t in1 = out_ptr[WIDTH];
+        uint64_t in2 = out_ptr[WIDTH * 2];
+        uint64_t in3 = out_ptr[WIDTH * 3];
+        uint64_t in4 = out_ptr[WIDTH * 4];
+        uint64_t in5 = out_ptr[WIDTH * 5];
+        uint64_t in6 = out_ptr[WIDTH * 6];
+        uint64_t in7 = out_ptr[WIDTH * 7];
+        /* XOR network; presumably GF(2^8) multiply by 0xE5 (TODO confirm). */
+        out3 = in3 ^ in6;
+        tmp0 = in0 ^ in1;
+        tmp1 = in5 ^ in7;
+        out2 = tmp0 ^ in4 ^ in6;
+        tmp2 = tmp1 ^ out2;
+        out6 = tmp2 ^ in3;
+        out7 = tmp2 ^ in2;
+        out0 = out6 ^ in2 ^ in4;
+        out5 = out6 ^ in1 ^ in2;
+        out1 = tmp0 ^ out5 ^ in5;
+        out4 = tmp1 ^ out1;
+        /* Store each result XORed with the matching word of in. */
+        out_ptr[0] = out0 ^ in_ptr[0];
+        out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH];
+        out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2];
+        out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3];
+        out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4];
+        out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5];
+        out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6];
+        out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7];
+
+        in_ptr++;
+        out_ptr++;
+    }
+}
+
+static void
+gf8_muladd_E6(void *out, void *in)
+{
+    unsigned int i;
+    uint64_t *in_ptr = (uint64_t *)in;   /* only read; 8 * WIDTH words */
+    uint64_t *out_ptr = (uint64_t *)out; /* read, then overwritten */
+    /* Updates out in place: out = N(out) ^ in, N = the XOR network below. */
+    for (i = 0; i < WIDTH; i++) {
+        uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+        /* Inputs: the eight current out words, spaced WIDTH words apart. */
+        uint64_t in0 = out_ptr[0];
+        uint64_t in1 = out_ptr[WIDTH];
+        uint64_t in2 = out_ptr[WIDTH * 2];
+        uint64_t in3 = out_ptr[WIDTH * 3];
+        uint64_t in4 = out_ptr[WIDTH * 4];
+        uint64_t in5 = out_ptr[WIDTH * 5];
+        uint64_t in6 = out_ptr[WIDTH * 6];
+        uint64_t in7 = out_ptr[WIDTH * 7];
+        /* XOR network; presumably GF(2^8) multiply by 0xE6 (TODO confirm). */
+        out3 = in2 ^ in6 ^ in7;
+        out2 = out3 ^ in0 ^ in4;
+        out4 = out3 ^ in1 ^ in5;
+        out1 = out2 ^ in3;
+        out7 = out2 ^ out4 ^ in2;
+        out0 = out4 ^ in3 ^ in7;
+        out5 = out1 ^ in4;
+        out6 = out0 ^ out2 ^ in5;
+        /* Store each result XORed with the matching word of in. */
+        out_ptr[0] = out0 ^ in_ptr[0];
+        out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH];
+        out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2];
+        out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3];
+        out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4];
+        out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5];
+        out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6];
+        out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7];
+
+        in_ptr++;
+        out_ptr++;
+    }
+}
+
+static void
+gf8_muladd_E7(void *out, void *in)
+{
+    unsigned int i;
+    uint64_t *in_ptr = (uint64_t *)in;   /* only read; 8 * WIDTH words */
+    uint64_t *out_ptr = (uint64_t *)out; /* read, then overwritten */
+    /* Updates out in place: out = N(out) ^ in, N = the XOR network below. */
+    for (i = 0; i < WIDTH; i++) {
+        uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+        uint64_t tmp0, tmp1, tmp2, tmp3, tmp4;
+        /* Inputs: the eight current out words, spaced WIDTH words apart. */
+        uint64_t in0 = out_ptr[0];
+        uint64_t in1 = out_ptr[WIDTH];
+        uint64_t in2 = out_ptr[WIDTH * 2];
+        uint64_t in3 = out_ptr[WIDTH * 3];
+        uint64_t in4 = out_ptr[WIDTH * 4];
+        uint64_t in5 = out_ptr[WIDTH * 5];
+        uint64_t in6 = out_ptr[WIDTH * 6];
+        uint64_t in7 = out_ptr[WIDTH * 7];
+        /* XOR network; presumably GF(2^8) multiply by 0xE7 (TODO confirm). */
+        tmp0 = in2 ^ in3;
+        out3 = tmp0 ^ in6 ^ in7;
+        tmp1 = out3 ^ in0;
+        out5 = tmp1 ^ in5;
+        tmp2 = tmp1 ^ in4;
+        tmp3 = out5 ^ in7;
+        out1 = tmp2 ^ in1;
+        out0 = tmp3 ^ in1;
+        out6 = out1 ^ in2;
+        out2 = tmp0 ^ tmp2;
+        tmp4 = tmp3 ^ out6;
+        out4 = tmp4 ^ in6;
+        out7 = tmp4 ^ in0;
+        /* Store each result XORed with the matching word of in. */
+        out_ptr[0] = out0 ^ in_ptr[0];
+        out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH];
+        out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2];
+        out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3];
+        out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4];
+        out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5];
+        out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6];
+        out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7];
+
+        in_ptr++;
+        out_ptr++;
+    }
+}
+
+static void
+gf8_muladd_E8(void *out, void *in)
+{
+    unsigned int i;
+    uint64_t *in_ptr = (uint64_t *)in;   /* only read; 8 * WIDTH words */
+    uint64_t *out_ptr = (uint64_t *)out; /* read, then overwritten */
+    /* Updates out in place: out = N(out) ^ in, N = the XOR network below. */
+    for (i = 0; i < WIDTH; i++) {
+        uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+        uint64_t tmp0, tmp1, tmp2, tmp3;
+        /* Inputs: the eight current out words, spaced WIDTH words apart. */
+        uint64_t in0 = out_ptr[0];
+        uint64_t in1 = out_ptr[WIDTH];
+        uint64_t in2 = out_ptr[WIDTH * 2];
+        uint64_t in3 = out_ptr[WIDTH * 3];
+        uint64_t in4 = out_ptr[WIDTH * 4];
+        uint64_t in5 = out_ptr[WIDTH * 5];
+        uint64_t in6 = out_ptr[WIDTH * 6];
+        uint64_t in7 = out_ptr[WIDTH * 7];
+        /* XOR network; presumably GF(2^8) multiply by 0xE8 (TODO confirm). */
+        out4 = in3 ^ in6;
+        tmp0 = in4 ^ in7;
+        out1 = in2 ^ in3 ^ in4;
+        out5 = tmp0 ^ in0;
+        tmp1 = tmp0 ^ in1;
+        tmp2 = tmp1 ^ in5;
+        out0 = tmp1 ^ out1;
+        out2 = tmp2 ^ in2;
+        out6 = tmp2 ^ out5;
+        tmp3 = out6 ^ in6;
+        out3 = tmp3 ^ in7;
+        out7 = tmp3 ^ in2 ^ in5;
+        /* Store each result XORed with the matching word of in. */
+        out_ptr[0] = out0 ^ in_ptr[0];
+        out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH];
+        out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2];
+        out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3];
+        out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4];
+        out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5];
+        out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6];
+        out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7];
+
+        in_ptr++;
+        out_ptr++;
+    }
+}
+
+static void
+gf8_muladd_E9(void *out, void *in)
+{
+    unsigned int i;
+    uint64_t *in_ptr = (uint64_t *)in;   /* only read; 8 * WIDTH words */
+    uint64_t *out_ptr = (uint64_t *)out; /* read, then overwritten */
+    /* Updates out in place: out = N(out) ^ in, N = the XOR network below. */
+    for (i = 0; i < WIDTH; i++) {
+        uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+        uint64_t tmp0, tmp1, tmp2;
+        /* Inputs: the eight current out words, spaced WIDTH words apart. */
+        uint64_t in0 = out_ptr[0];
+        uint64_t in1 = out_ptr[WIDTH];
+        uint64_t in2 = out_ptr[WIDTH * 2];
+        uint64_t in3 = out_ptr[WIDTH * 3];
+        uint64_t in4 = out_ptr[WIDTH * 4];
+        uint64_t in5 = out_ptr[WIDTH * 5];
+        uint64_t in6 = out_ptr[WIDTH * 6];
+        uint64_t in7 = out_ptr[WIDTH * 7];
+        /* XOR network; presumably GF(2^8) multiply by 0xE9 (TODO confirm). */
+        tmp0 = in0 ^ in1;
+        tmp1 = in3 ^ in6;
+        tmp2 = tmp0 ^ in6;
+        out4 = tmp1 ^ in4;
+        out6 = tmp2 ^ in5;
+        out7 = tmp2 ^ in2 ^ in7;
+        out3 = out6 ^ in3 ^ in7;
+        out0 = tmp1 ^ out7;
+        out2 = out3 ^ out4 ^ in0;
+        out5 = tmp0 ^ out2;
+        out1 = out0 ^ out5 ^ in5;
+        /* Store each result XORed with the matching word of in. */
+        out_ptr[0] = out0 ^ in_ptr[0];
+        out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH];
+        out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2];
+        out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3];
+        out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4];
+        out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5];
+        out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6];
+        out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7];
+
+        in_ptr++;
+        out_ptr++;
+    }
+}
+
+static void
+gf8_muladd_EA(void *out, void *in)
+{
+    unsigned int i;
+    uint64_t *in_ptr = (uint64_t *)in;   /* only read; 8 * WIDTH words */
+    uint64_t *out_ptr = (uint64_t *)out; /* read, then overwritten */
+    /* Updates out in place: out = N(out) ^ in, N = the XOR network below. */
+    for (i = 0; i < WIDTH; i++) {
+        uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+        /* Inputs: the eight current out words, spaced WIDTH words apart. */
+        uint64_t in0 = out_ptr[0];
+        uint64_t in1 = out_ptr[WIDTH];
+        uint64_t in2 = out_ptr[WIDTH * 2];
+        uint64_t in3 = out_ptr[WIDTH * 3];
+        uint64_t in4 = out_ptr[WIDTH * 4];
+        uint64_t in5 = out_ptr[WIDTH * 5];
+        uint64_t in6 = out_ptr[WIDTH * 6];
+        uint64_t in7 = out_ptr[WIDTH * 7];
+        /* XOR network; presumably GF(2^8) multiply by 0xEA (TODO confirm). */
+        out4 = in6 ^ in7;
+        out5 = in0 ^ in7;
+        out6 = in0 ^ in1;
+        out0 = in1 ^ in2 ^ in3;
+        out2 = in2 ^ in4 ^ in5;
+        out7 = out6 ^ in2;
+        out1 = out0 ^ out6 ^ in4;
+        out3 = out7 ^ in5 ^ in6;
+        /* Store each result XORed with the matching word of in. */
+        out_ptr[0] = out0 ^ in_ptr[0];
+        out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH];
+        out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2];
+        out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3];
+        out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4];
+        out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5];
+        out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6];
+        out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7];
+
+        in_ptr++;
+        out_ptr++;
+    }
+}
+
+static void
+gf8_muladd_EB(void *out, void *in)
+{
+    unsigned int i;
+    uint64_t *in_ptr = (uint64_t *)in;   /* only read; 8 * WIDTH words */
+    uint64_t *out_ptr = (uint64_t *)out; /* read, then overwritten */
+    /* Updates out in place: out = N(out) ^ in, N = the XOR network below. */
+    for (i = 0; i < WIDTH; i++) {
+        uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+        uint64_t tmp0, tmp1;
+        /* Inputs: the eight current out words, spaced WIDTH words apart. */
+        uint64_t in0 = out_ptr[0];
+        uint64_t in1 = out_ptr[WIDTH];
+        uint64_t in2 = out_ptr[WIDTH * 2];
+        uint64_t in3 = out_ptr[WIDTH * 3];
+        uint64_t in4 = out_ptr[WIDTH * 4];
+        uint64_t in5 = out_ptr[WIDTH * 5];
+        uint64_t in6 = out_ptr[WIDTH * 6];
+        uint64_t in7 = out_ptr[WIDTH * 7];
+        /* XOR network; presumably GF(2^8) multiply by 0xEB (TODO confirm). */
+        out2 = in4 ^ in5;
+        tmp0 = in0 ^ in1;
+        out4 = in4 ^ in6 ^ in7;
+        out5 = in0 ^ in5 ^ in7;
+        out6 = tmp0 ^ in6;
+        tmp1 = tmp0 ^ in2;
+        out0 = tmp1 ^ in3;
+        out7 = tmp1 ^ in7;
+        out1 = out0 ^ in4;
+        out3 = out0 ^ in5 ^ in6;
+        /* Store each result XORed with the matching word of in. */
+        out_ptr[0] = out0 ^ in_ptr[0];
+        out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH];
+        out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2];
+        out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3];
+        out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4];
+        out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5];
+        out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6];
+        out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7];
+
+        in_ptr++;
+        out_ptr++;
+    }
+}
+
+static void
+gf8_muladd_EC(void *out, void *in)
+{
+    unsigned int i;
+    uint64_t *in_ptr = (uint64_t *)in;   /* only read; 8 * WIDTH words */
+    uint64_t *out_ptr = (uint64_t *)out; /* read, then overwritten */
+    /* Updates out in place: out = N(out) ^ in, N = the XOR network below. */
+    for (i = 0; i < WIDTH; i++) {
+        uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+        /* Inputs: the eight current out words, spaced WIDTH words apart. */
+        uint64_t in0 = out_ptr[0];
+        uint64_t in1 = out_ptr[WIDTH];
+        uint64_t in2 = out_ptr[WIDTH * 2];
+        uint64_t in3 = out_ptr[WIDTH * 3];
+        uint64_t in4 = out_ptr[WIDTH * 4];
+        uint64_t in5 = out_ptr[WIDTH * 5];
+        uint64_t in6 = out_ptr[WIDTH * 6];
+        uint64_t in7 = out_ptr[WIDTH * 7];
+        /* XOR network; presumably GF(2^8) multiply by 0xEC (TODO confirm). */
+        out3 = in0 ^ in5;
+        out4 = in2 ^ in3 ^ in7;
+        out5 = in0 ^ in3 ^ in4;
+        out6 = out3 ^ in1 ^ in4;
+        out1 = out4 ^ in4;
+        out0 = out4 ^ in1 ^ in6;
+        out2 = out0 ^ out5 ^ in5;
+        out7 = out2 ^ in4 ^ in7;
+        /* Store each result XORed with the matching word of in. */
+        out_ptr[0] = out0 ^ in_ptr[0];
+        out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH];
+        out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2];
+        out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3];
+        out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4];
+        out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5];
+        out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6];
+        out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7];
+
+        in_ptr++;
+        out_ptr++;
+    }
+}
+
+static void
+gf8_muladd_ED(void *out, void *in)
+{
+    unsigned int i;
+    uint64_t *in_ptr = (uint64_t *)in;   /* only read; 8 * WIDTH words */
+    uint64_t *out_ptr = (uint64_t *)out; /* read, then overwritten */
+    /* Updates out in place: out = N(out) ^ in, N = the XOR network below. */
+    for (i = 0; i < WIDTH; i++) {
+        uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+        uint64_t tmp0, tmp1;
+        /* Inputs: the eight current out words, spaced WIDTH words apart. */
+        uint64_t in0 = out_ptr[0];
+        uint64_t in1 = out_ptr[WIDTH];
+        uint64_t in2 = out_ptr[WIDTH * 2];
+        uint64_t in3 = out_ptr[WIDTH * 3];
+        uint64_t in4 = out_ptr[WIDTH * 4];
+        uint64_t in5 = out_ptr[WIDTH * 5];
+        uint64_t in6 = out_ptr[WIDTH * 6];
+        uint64_t in7 = out_ptr[WIDTH * 7];
+        /* XOR network; presumably GF(2^8) multiply by 0xED (TODO confirm). */
+        tmp0 = in2 ^ in4;
+        tmp1 = in3 ^ in5;
+        out4 = tmp0 ^ in3 ^ in7;
+        out3 = tmp1 ^ in0;
+        out1 = out4 ^ in1;
+        out5 = out3 ^ in4;
+        out7 = out1 ^ out5 ^ in6;
+        out2 = tmp0 ^ out7;
+        out0 = tmp1 ^ out7;
+        out6 = out2 ^ in7;
+        /* Store each result XORed with the matching word of in. */
+        out_ptr[0] = out0 ^ in_ptr[0];
+        out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH];
+        out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2];
+        out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3];
+        out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4];
+        out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5];
+        out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6];
+        out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7];
+
+        in_ptr++;
+        out_ptr++;
+    }
+}
+
+static void
+gf8_muladd_EE(void *out, void *in)
+{
+    unsigned int i;
+    uint64_t *in_ptr = (uint64_t *)in;   /* only read; 8 * WIDTH words */
+    uint64_t *out_ptr = (uint64_t *)out; /* read, then overwritten */
+    /* Updates out in place: out = N(out) ^ in, N = the XOR network below. */
+    for (i = 0; i < WIDTH; i++) {
+        uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+        uint64_t tmp0, tmp1, tmp2, tmp3;
+        /* Inputs: the eight current out words, spaced WIDTH words apart. */
+        uint64_t in0 = out_ptr[0];
+        uint64_t in1 = out_ptr[WIDTH];
+        uint64_t in2 = out_ptr[WIDTH * 2];
+        uint64_t in3 = out_ptr[WIDTH * 3];
+        uint64_t in4 = out_ptr[WIDTH * 4];
+        uint64_t in5 = out_ptr[WIDTH * 5];
+        uint64_t in6 = out_ptr[WIDTH * 6];
+        uint64_t in7 = out_ptr[WIDTH * 7];
+        /* XOR network; presumably GF(2^8) multiply by 0xEE (TODO confirm). */
+        out4 = in2;
+        tmp0 = in0 ^ in1;
+        out5 = in0 ^ in3;
+        tmp1 = tmp0 ^ in2;
+        out6 = tmp0 ^ in4;
+        tmp2 = tmp1 ^ out5;
+        out7 = tmp1 ^ in5;
+        out1 = tmp2 ^ out6 ^ in7;
+        out0 = tmp2 ^ in6;
+        tmp3 = out7 ^ in1;
+        out3 = tmp3 ^ in7;
+        out2 = tmp3 ^ in4 ^ in6;
+        /* Store each result XORed with the matching word of in. */
+        out_ptr[0] = out0 ^ in_ptr[0];
+        out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH];
+        out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2];
+        out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3];
+        out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4];
+        out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5];
+        out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6];
+        out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7];
+
+        in_ptr++;
+        out_ptr++;
+    }
+}
+
+static void
+gf8_muladd_EF(void *out, void *in)
+{
+    unsigned int i;
+    uint64_t *in_ptr = (uint64_t *)in;   /* only read; 8 * WIDTH words */
+    uint64_t *out_ptr = (uint64_t *)out; /* read, then overwritten */
+    /* Updates out in place: out = N(out) ^ in, N = the XOR network below. */
+    for (i = 0; i < WIDTH; i++) {
+        uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+        uint64_t tmp0, tmp1;
+        /* Inputs: the eight current out words, spaced WIDTH words apart. */
+        uint64_t in0 = out_ptr[0];
+        uint64_t in1 = out_ptr[WIDTH];
+        uint64_t in2 = out_ptr[WIDTH * 2];
+        uint64_t in3 = out_ptr[WIDTH * 3];
+        uint64_t in4 = out_ptr[WIDTH * 4];
+        uint64_t in5 = out_ptr[WIDTH * 5];
+        uint64_t in6 = out_ptr[WIDTH * 6];
+        uint64_t in7 = out_ptr[WIDTH * 7];
+        /* XOR network; presumably GF(2^8) multiply by 0xEF (TODO confirm). */
+        out4 = in2 ^ in4;
+        tmp0 = in0 ^ in5;
+        tmp1 = in4 ^ in6;
+        out5 = tmp0 ^ in3;
+        out2 = tmp0 ^ tmp1;
+        out6 = tmp1 ^ in0 ^ in1;
+        out3 = out5 ^ in2 ^ in7;
+        out7 = out3 ^ in1 ^ in3;
+        out0 = out4 ^ out6 ^ in3;
+        out1 = tmp1 ^ out0 ^ in7;
+        /* Store each result XORed with the matching word of in. */
+        out_ptr[0] = out0 ^ in_ptr[0];
+        out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH];
+        out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2];
+        out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3];
+        out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4];
+        out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5];
+        out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6];
+        out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7];
+
+        in_ptr++;
+        out_ptr++;
+    }
+}
+
+static void
+gf8_muladd_F0(void *out, void *in)
+{
+    unsigned int i;
+    uint64_t *in_ptr = (uint64_t *)in;   /* only read; 8 * WIDTH words */
+    uint64_t *out_ptr = (uint64_t *)out; /* read, then overwritten */
+    /* Updates out in place: out = N(out) ^ in, N = the XOR network below. */
+    for (i = 0; i < WIDTH; i++) {
+        uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+        uint64_t tmp0, tmp1, tmp2, tmp3;
+        /* Inputs: the eight current out words, spaced WIDTH words apart. */
+        uint64_t in0 = out_ptr[0];
+        uint64_t in1 = out_ptr[WIDTH];
+        uint64_t in2 = out_ptr[WIDTH * 2];
+        uint64_t in3 = out_ptr[WIDTH * 3];
+        uint64_t in4 = out_ptr[WIDTH * 4];
+        uint64_t in5 = out_ptr[WIDTH * 5];
+        uint64_t in6 = out_ptr[WIDTH * 6];
+        uint64_t in7 = out_ptr[WIDTH * 7];
+        /* XOR network; presumably GF(2^8) multiply by 0xF0 (TODO confirm). */
+        tmp0 = in1 ^ in2;
+        tmp1 = in4 ^ in5;
+        out2 = tmp0 ^ in6;
+        out3 = tmp1 ^ in1;
+        tmp2 = tmp1 ^ in7;
+        out1 = out2 ^ out3 ^ in3;
+        tmp3 = tmp0 ^ tmp2;
+        out0 = tmp3 ^ in3;
+        out5 = tmp3 ^ in0;
+        out4 = out1 ^ out5 ^ in4;
+        out7 = out4 ^ in2;
+        out6 = tmp2 ^ out7;
+        /* Store each result XORed with the matching word of in. */
+        out_ptr[0] = out0 ^ in_ptr[0];
+        out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH];
+        out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2];
+        out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3];
+        out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4];
+        out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5];
+        out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6];
+        out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7];
+
+        in_ptr++;
+        out_ptr++;
+    }
+}
+
+static void
+gf8_muladd_F1(void *out, void *in)
+{
+    unsigned int i;
+    uint64_t *in_ptr = (uint64_t *)in;   /* only read; 8 * WIDTH words */
+    uint64_t *out_ptr = (uint64_t *)out; /* read, then overwritten */
+    /* Updates out in place: out = N(out) ^ in, N = the XOR network below. */
+    for (i = 0; i < WIDTH; i++) {
+        uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+        uint64_t tmp0, tmp1, tmp2, tmp3;
+        /* Inputs: the eight current out words, spaced WIDTH words apart. */
+        uint64_t in0 = out_ptr[0];
+        uint64_t in1 = out_ptr[WIDTH];
+        uint64_t in2 = out_ptr[WIDTH * 2];
+        uint64_t in3 = out_ptr[WIDTH * 3];
+        uint64_t in4 = out_ptr[WIDTH * 4];
+        uint64_t in5 = out_ptr[WIDTH * 5];
+        uint64_t in6 = out_ptr[WIDTH * 6];
+        uint64_t in7 = out_ptr[WIDTH * 7];
+        /* XOR network; presumably GF(2^8) multiply by 0xF1 (TODO confirm). */
+        out2 = in1 ^ in6;
+        tmp0 = in3 ^ in5;
+        out3 = tmp0 ^ in1 ^ in4;
+        tmp1 = out3 ^ in2;
+        out1 = tmp1 ^ in6;
+        tmp2 = tmp1 ^ in0;
+        tmp3 = out1 ^ in5;
+        out0 = tmp2 ^ in7;
+        out6 = tmp2 ^ in4;
+        out7 = tmp3 ^ in0;
+        out5 = tmp0 ^ out0;
+        out4 = tmp3 ^ out5 ^ in1;
+        /* Store each result XORed with the matching word of in. */
+        out_ptr[0] = out0 ^ in_ptr[0];
+        out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH];
+        out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2];
+        out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3];
+        out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4];
+        out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5];
+        out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6];
+        out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7];
+
+        in_ptr++;
+        out_ptr++;
+    }
+}
+
+static void
+gf8_muladd_F2(void *out, void *in)
+{
+    unsigned int i;
+    uint64_t *in_ptr = (uint64_t *)in;   /* only read; 8 * WIDTH words */
+    uint64_t *out_ptr = (uint64_t *)out; /* read, then overwritten */
+    /* Updates out in place: out = N(out) ^ in, N = the XOR network below. */
+    for (i = 0; i < WIDTH; i++) {
+        uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+        uint64_t tmp0, tmp1, tmp2, tmp3;
+        /* Inputs: the eight current out words, spaced WIDTH words apart. */
+        uint64_t in0 = out_ptr[0];
+        uint64_t in1 = out_ptr[WIDTH];
+        uint64_t in2 = out_ptr[WIDTH * 2];
+        uint64_t in3 = out_ptr[WIDTH * 3];
+        uint64_t in4 = out_ptr[WIDTH * 4];
+        uint64_t in5 = out_ptr[WIDTH * 5];
+        uint64_t in6 = out_ptr[WIDTH * 6];
+        uint64_t in7 = out_ptr[WIDTH * 7];
+        /* XOR network; presumably GF(2^8) multiply by 0xF2 (TODO confirm). */
+        tmp0 = in4 ^ in5;
+        out2 = in2 ^ in6 ^ in7;
+        tmp1 = tmp0 ^ in1;
+        tmp2 = tmp1 ^ in2;
+        out0 = tmp2 ^ in3;
+        out3 = tmp2 ^ in7;
+        out5 = out3 ^ in0 ^ in4;
+        tmp3 = tmp0 ^ out5;
+        out7 = tmp3 ^ in3;
+        out4 = tmp3 ^ out2;
+        out1 = out0 ^ out4 ^ in4;
+        out6 = tmp1 ^ out1;
+        /* Store each result XORed with the matching word of in. */
+        out_ptr[0] = out0 ^ in_ptr[0];
+        out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH];
+        out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2];
+        out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3];
+        out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4];
+        out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5];
+        out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6];
+        out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7];
+
+        in_ptr++;
+        out_ptr++;
+    }
+}
+
+static void
+gf8_muladd_F3(void *out, void *in)
+{
+    unsigned int i;
+    uint64_t *in_ptr = (uint64_t *)in;   /* only read; 8 * WIDTH words */
+    uint64_t *out_ptr = (uint64_t *)out; /* read, then overwritten */
+    /* Updates out in place: out = N(out) ^ in, N = the XOR network below. */
+    for (i = 0; i < WIDTH; i++) {
+        uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+        uint64_t tmp0, tmp1;
+        /* Inputs: the eight current out words, spaced WIDTH words apart. */
+        uint64_t in0 = out_ptr[0];
+        uint64_t in1 = out_ptr[WIDTH];
+        uint64_t in2 = out_ptr[WIDTH * 2];
+        uint64_t in3 = out_ptr[WIDTH * 3];
+        uint64_t in4 = out_ptr[WIDTH * 4];
+        uint64_t in5 = out_ptr[WIDTH * 5];
+        uint64_t in6 = out_ptr[WIDTH * 6];
+        uint64_t in7 = out_ptr[WIDTH * 7];
+        /* XOR network; presumably GF(2^8) multiply by 0xF3 (TODO confirm). */
+        out2 = in6 ^ in7;
+        tmp0 = in0 ^ in1;
+        out4 = tmp0 ^ in6;
+        tmp1 = tmp0 ^ in2;
+        out5 = tmp1 ^ in7;
+        out6 = tmp1 ^ in3;
+        out7 = out6 ^ in4;
+        out0 = out7 ^ in5;
+        out1 = out0 ^ in6;
+        out3 = out0 ^ in0 ^ in7;
+        /* Store each result XORed with the matching word of in. */
+        out_ptr[0] = out0 ^ in_ptr[0];
+        out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH];
+        out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2];
+        out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3];
+        out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4];
+        out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5];
+        out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6];
+        out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7];
+
+        in_ptr++;
+        out_ptr++;
+    }
+}
+
+static void
+gf8_muladd_F4(void *out, void *in)
+{
+    unsigned int i;
+    uint64_t *in_ptr = (uint64_t *)in;   /* only read; 8 * WIDTH words */
+    uint64_t *out_ptr = (uint64_t *)out; /* read, then overwritten */
+    /* Updates out in place: out = N(out) ^ in, N = the XOR network below. */
+    for (i = 0; i < WIDTH; i++) {
+        uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+        uint64_t tmp0;
+        /* Inputs: the eight current out words, spaced WIDTH words apart. */
+        uint64_t in0 = out_ptr[0];
+        uint64_t in1 = out_ptr[WIDTH];
+        uint64_t in2 = out_ptr[WIDTH * 2];
+        uint64_t in3 = out_ptr[WIDTH * 3];
+        uint64_t in4 = out_ptr[WIDTH * 4];
+        uint64_t in5 = out_ptr[WIDTH * 5];
+        uint64_t in6 = out_ptr[WIDTH * 6];
+        uint64_t in7 = out_ptr[WIDTH * 7];
+        /* XOR network; presumably GF(2^8) multiply by 0xF4 (TODO confirm). */
+        out2 = in0 ^ in1 ^ in2;
+        tmp0 = out2 ^ in3;
+        out4 = tmp0 ^ in4;
+        out5 = out4 ^ in5;
+        out6 = out5 ^ in6;
+        out7 = out6 ^ in7;
+        out0 = out7 ^ in0;
+        out1 = out0 ^ in1;
+        out3 = tmp0 ^ out7;
+        /* Store each result XORed with the matching word of in. */
+        out_ptr[0] = out0 ^ in_ptr[0];
+        out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH];
+        out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2];
+        out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3];
+        out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4];
+        out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5];
+        out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6];
+        out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7];
+
+        in_ptr++;
+        out_ptr++;
+    }
+}
+
+static void
+gf8_muladd_F5(void *out, void *in)
+{
+    unsigned int i;
+    uint64_t *in_ptr = (uint64_t *)in;   /* only read; 8 * WIDTH words */
+    uint64_t *out_ptr = (uint64_t *)out; /* read, then overwritten */
+    /* Updates out in place: out = N(out) ^ in, N = the XOR network below. */
+    for (i = 0; i < WIDTH; i++) {
+        uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+        uint64_t tmp0;
+        /* Inputs: the eight current out words, spaced WIDTH words apart. */
+        uint64_t in0 = out_ptr[0];
+        uint64_t in1 = out_ptr[WIDTH];
+        uint64_t in2 = out_ptr[WIDTH * 2];
+        uint64_t in3 = out_ptr[WIDTH * 3];
+        uint64_t in4 = out_ptr[WIDTH * 4];
+        uint64_t in5 = out_ptr[WIDTH * 5];
+        uint64_t in6 = out_ptr[WIDTH * 6];
+        uint64_t in7 = out_ptr[WIDTH * 7];
+        /* XOR network; presumably GF(2^8) multiply by 0xF5 (TODO confirm). */
+        out2 = in0 ^ in1;
+        tmp0 = out2 ^ in2;
+        out4 = tmp0 ^ in3;
+        out5 = out4 ^ in4;
+        out6 = out5 ^ in5;
+        out7 = out6 ^ in6;
+        out0 = out7 ^ in7;
+        out1 = out0 ^ in0;
+        out3 = tmp0 ^ out0;
+        /* Store each result XORed with the matching word of in. */
+        out_ptr[0] = out0 ^ in_ptr[0];
+        out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH];
+        out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2];
+        out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3];
+        out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4];
+        out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5];
+        out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6];
+        out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7];
+
+        in_ptr++;
+        out_ptr++;
+    }
+}
+
+static void
+gf8_muladd_F6(void *out, void *in)
+{
+    unsigned int i;
+    uint64_t *in_ptr = (uint64_t *)in;   /* only read; 8 * WIDTH words */
+    uint64_t *out_ptr = (uint64_t *)out; /* read, then overwritten */
+    /* Updates out in place: out = N(out) ^ in, N = the XOR network below. */
+    for (i = 0; i < WIDTH; i++) {
+        uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+        uint64_t tmp0, tmp1;
+        /* Inputs: the eight current out words, spaced WIDTH words apart. */
+        uint64_t in0 = out_ptr[0];
+        uint64_t in1 = out_ptr[WIDTH];
+        uint64_t in2 = out_ptr[WIDTH * 2];
+        uint64_t in3 = out_ptr[WIDTH * 3];
+        uint64_t in4 = out_ptr[WIDTH * 4];
+        uint64_t in5 = out_ptr[WIDTH * 5];
+        uint64_t in6 = out_ptr[WIDTH * 6];
+        uint64_t in7 = out_ptr[WIDTH * 7];
+        /* XOR network; presumably GF(2^8) multiply by 0xF6 (TODO confirm). */
+        tmp0 = in0 ^ in7;
+        out2 = tmp0 ^ in2;
+        out4 = out2 ^ in1 ^ in4;
+        out7 = out4 ^ in3 ^ in5;
+        out5 = out7 ^ in4 ^ in7;
+        out0 = tmp0 ^ out7 ^ in6;
+        tmp1 = out0 ^ in1;
+        out6 = out0 ^ in0 ^ in5;
+        out3 = tmp1 ^ in3;
+        out1 = tmp0 ^ tmp1;
+        /* Store each result XORed with the matching word of in. */
+        out_ptr[0] = out0 ^ in_ptr[0];
+        out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH];
+        out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2];
+        out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3];
+        out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4];
+        out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5];
+        out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6];
+        out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7];
+
+        in_ptr++;
+        out_ptr++;
+    }
+}
+
+static void
+gf8_muladd_F7(void *out, void *in)
+{
+    unsigned int i;
+    uint64_t *in_ptr = (uint64_t *)in;   /* only read; 8 * WIDTH words */
+    uint64_t *out_ptr = (uint64_t *)out; /* read, then overwritten */
+    /* Updates out in place: out = N(out) ^ in, N = the XOR network below. */
+    for (i = 0; i < WIDTH; i++) {
+        uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+        uint64_t tmp0;
+        /* Inputs: the eight current out words, spaced WIDTH words apart. */
+        uint64_t in0 = out_ptr[0];
+        uint64_t in1 = out_ptr[WIDTH];
+        uint64_t in2 = out_ptr[WIDTH * 2];
+        uint64_t in3 = out_ptr[WIDTH * 3];
+        uint64_t in4 = out_ptr[WIDTH * 4];
+        uint64_t in5 = out_ptr[WIDTH * 5];
+        uint64_t in6 = out_ptr[WIDTH * 6];
+        uint64_t in7 = out_ptr[WIDTH * 7];
+        /* XOR network; presumably GF(2^8) multiply by 0xF7 (TODO confirm). */
+        out2 = in0 ^ in7;
+        tmp0 = out2 ^ in1;
+        out4 = tmp0 ^ in2;
+        out5 = out4 ^ in3 ^ in7;
+        out6 = out5 ^ in4;
+        out7 = out6 ^ in5;
+        out0 = out7 ^ in6;
+        out1 = out0 ^ in7;
+        out3 = tmp0 ^ out1;
+        /* Store each result XORed with the matching word of in. */
+        out_ptr[0] = out0 ^ in_ptr[0];
+        out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH];
+        out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2];
+        out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3];
+        out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4];
+        out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5];
+        out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6];
+        out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7];
+
+        in_ptr++;
+        out_ptr++;
+    }
+}
+
+static void
+gf8_muladd_F8(void *out, void *in)
+{
+    unsigned int i;
+    uint64_t *in_ptr = (uint64_t *)in;   /* only read; 8 * WIDTH words */
+    uint64_t *out_ptr = (uint64_t *)out; /* read, then overwritten */
+    /* Updates out in place: out = N(out) ^ in, N = the XOR network below. */
+    for (i = 0; i < WIDTH; i++) {
+        uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+        uint64_t tmp0, tmp1, tmp2;
+        /* Inputs: the eight current out words, spaced WIDTH words apart. */
+        uint64_t in0 = out_ptr[0];
+        uint64_t in1 = out_ptr[WIDTH];
+        uint64_t in2 = out_ptr[WIDTH * 2];
+        uint64_t in3 = out_ptr[WIDTH * 3];
+        uint64_t in4 = out_ptr[WIDTH * 4];
+        uint64_t in5 = out_ptr[WIDTH * 5];
+        uint64_t in6 = out_ptr[WIDTH * 6];
+        uint64_t in7 = out_ptr[WIDTH * 7];
+        /* XOR network; presumably GF(2^8) multiply by 0xF8 (TODO confirm). */
+        tmp0 = in0 ^ in4;
+        tmp1 = in3 ^ in5;
+        tmp2 = tmp0 ^ in6;
+        out4 = tmp0 ^ tmp1;
+        out1 = tmp1 ^ in2 ^ in4;
+        out3 = tmp2 ^ in1;
+        out5 = out3 ^ in5;
+        out7 = out1 ^ out5 ^ in7;
+        out6 = tmp1 ^ out7;
+        out0 = tmp2 ^ out7;
+        out2 = out6 ^ in0;
+        /* Store each result XORed with the matching word of in. */
+        out_ptr[0] = out0 ^ in_ptr[0];
+        out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH];
+        out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2];
+        out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3];
+        out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4];
+        out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5];
+        out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6];
+        out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7];
+
+        in_ptr++;
+        out_ptr++;
+    }
+}
+
+static void
+gf8_muladd_F9(void *out, void *in)
+{
+    unsigned int i;
+    uint64_t *in_ptr = (uint64_t *)in;   /* only read; 8 * WIDTH words */
+    uint64_t *out_ptr = (uint64_t *)out; /* read, then overwritten */
+    /* Updates out in place: out = N(out) ^ in, N = the XOR network below. */
+    for (i = 0; i < WIDTH; i++) {
+        uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+        uint64_t tmp0, tmp1, tmp2, tmp3, tmp4;
+        /* Inputs: the eight current out words, spaced WIDTH words apart. */
+        uint64_t in0 = out_ptr[0];
+        uint64_t in1 = out_ptr[WIDTH];
+        uint64_t in2 = out_ptr[WIDTH * 2];
+        uint64_t in3 = out_ptr[WIDTH * 3];
+        uint64_t in4 = out_ptr[WIDTH * 4];
+        uint64_t in5 = out_ptr[WIDTH * 5];
+        uint64_t in6 = out_ptr[WIDTH * 6];
+        uint64_t in7 = out_ptr[WIDTH * 7];
+        /* XOR network; presumably GF(2^8) multiply by 0xF9 (TODO confirm). */
+        tmp0 = in3 ^ in5;
+        tmp1 = in0 ^ in6;
+        out4 = tmp0 ^ in0;
+        tmp2 = tmp1 ^ in4;
+        tmp3 = tmp1 ^ in2;
+        out5 = tmp2 ^ in1;
+        out3 = out5 ^ in3;
+        tmp4 = tmp3 ^ out3;
+        out1 = tmp4 ^ in5;
+        out0 = tmp4 ^ in0 ^ in7;
+        out6 = tmp0 ^ out0 ^ in4;
+        out7 = tmp2 ^ tmp4;
+        out2 = tmp3 ^ out6;
+        /* Store each result XORed with the matching word of in. */
+        out_ptr[0] = out0 ^ in_ptr[0];
+        out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH];
+        out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2];
+        out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3];
+        out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4];
+        out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5];
+        out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6];
+        out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7];
+
+        in_ptr++;
+        out_ptr++;
+    }
+}
+
+static void
+gf8_muladd_FA(void *out, void *in)
+{
+    unsigned int i;
+    uint64_t *in_ptr = (uint64_t *)in;
+    uint64_t *out_ptr = (uint64_t *)out;
+    /* In place, word by word: out = T(out) ^ in (T = the XOR network below). */
+    for (i = 0; i < WIDTH; i++) {
+        uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+        uint64_t tmp0, tmp1, tmp2, tmp3;
+        /* in0..in7 are the OLD words of 'out' (8 rows, WIDTH words apart). */
+        uint64_t in0 = out_ptr[0];
+        uint64_t in1 = out_ptr[WIDTH];
+        uint64_t in2 = out_ptr[WIDTH * 2];
+        uint64_t in3 = out_ptr[WIDTH * 3];
+        uint64_t in4 = out_ptr[WIDTH * 4];
+        uint64_t in5 = out_ptr[WIDTH * 5];
+        uint64_t in6 = out_ptr[WIDTH * 6];
+        uint64_t in7 = out_ptr[WIDTH * 7];
+        /* T: presumably GF(2^8) multiplication by 0xFA, per the name - confirm. */
+        tmp0 = in0 ^ in1;
+        tmp1 = tmp0 ^ in2;
+        tmp2 = tmp0 ^ in5;
+        tmp3 = tmp1 ^ in7;
+        out5 = tmp2 ^ in6;
+        out6 = tmp3 ^ in6;
+        out7 = tmp3 ^ in3;
+        out3 = out6 ^ in4;
+        out2 = tmp1 ^ out5;
+        out4 = out2 ^ out3 ^ in1;
+        out0 = out4 ^ out7 ^ in5;
+        out1 = tmp2 ^ out0;
+        /* XOR in the matching word of each row of 'in' and store the result. */
+        out_ptr[0] = out0 ^ in_ptr[0];
+        out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH];
+        out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2];
+        out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3];
+        out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4];
+        out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5];
+        out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6];
+        out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7];
+        /* Step both cursors to the next word of every row. */
+        in_ptr++;
+        out_ptr++;
+    }
+}
+
+static void
+gf8_muladd_FB(void *out, void *in)
+{
+    unsigned int i;
+    uint64_t *in_ptr = (uint64_t *)in;
+    uint64_t *out_ptr = (uint64_t *)out;
+    /* In place, word by word: out = T(out) ^ in (T = the XOR network below). */
+    for (i = 0; i < WIDTH; i++) {
+        uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+        uint64_t tmp0, tmp1;
+        /* in0..in7 are the OLD words of 'out' (8 rows, WIDTH words apart). */
+        uint64_t in0 = out_ptr[0];
+        uint64_t in1 = out_ptr[WIDTH];
+        uint64_t in2 = out_ptr[WIDTH * 2];
+        uint64_t in3 = out_ptr[WIDTH * 3];
+        uint64_t in4 = out_ptr[WIDTH * 4];
+        uint64_t in5 = out_ptr[WIDTH * 5];
+        uint64_t in6 = out_ptr[WIDTH * 6];
+        uint64_t in7 = out_ptr[WIDTH * 7];
+        /* T: presumably GF(2^8) multiplication by 0xFB, per the name - confirm. */
+        out2 = in5 ^ in6;
+        tmp0 = in0 ^ in1;
+        out4 = in0 ^ in5 ^ in7;
+        out5 = tmp0 ^ in6;
+        tmp1 = tmp0 ^ in2;
+        out6 = tmp1 ^ in7;
+        out7 = tmp1 ^ in3;
+        out0 = out7 ^ in4;
+        out1 = out0 ^ in5;
+        out3 = out0 ^ in6 ^ in7;
+        /* XOR in the matching word of each row of 'in' and store the result. */
+        out_ptr[0] = out0 ^ in_ptr[0];
+        out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH];
+        out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2];
+        out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3];
+        out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4];
+        out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5];
+        out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6];
+        out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7];
+        /* Step both cursors to the next word of every row. */
+        in_ptr++;
+        out_ptr++;
+    }
+}
+
+static void
+gf8_muladd_FC(void *out, void *in)
+{
+    unsigned int i;
+    uint64_t *in_ptr = (uint64_t *)in;
+    uint64_t *out_ptr = (uint64_t *)out;
+    /* In place, word by word: out = T(out) ^ in (T = the XOR network below). */
+    for (i = 0; i < WIDTH; i++) {
+        uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+        uint64_t tmp0, tmp1, tmp2, tmp3;
+        /* in0..in7 are the OLD words of 'out' (8 rows, WIDTH words apart). */
+        uint64_t in0 = out_ptr[0];
+        uint64_t in1 = out_ptr[WIDTH];
+        uint64_t in2 = out_ptr[WIDTH * 2];
+        uint64_t in3 = out_ptr[WIDTH * 3];
+        uint64_t in4 = out_ptr[WIDTH * 4];
+        uint64_t in5 = out_ptr[WIDTH * 5];
+        uint64_t in6 = out_ptr[WIDTH * 6];
+        uint64_t in7 = out_ptr[WIDTH * 7];
+        /* T: presumably GF(2^8) multiplication by 0xFC, per the name - confirm. */
+        tmp0 = in1 ^ in2;
+        tmp1 = in0 ^ in7;
+        out2 = tmp0 ^ tmp1 ^ in5;
+        out3 = tmp1 ^ in4;
+        tmp2 = out2 ^ in6;
+        out6 = tmp2 ^ in4;
+        out7 = tmp2 ^ in3;
+        out4 = out6 ^ in1 ^ in3;
+        tmp3 = out4 ^ in0;
+        out1 = tmp3 ^ in6;
+        out0 = tmp3 ^ in1 ^ in5;
+        out5 = tmp0 ^ out4;
+        /* XOR in the matching word of each row of 'in' and store the result. */
+        out_ptr[0] = out0 ^ in_ptr[0];
+        out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH];
+        out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2];
+        out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3];
+        out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4];
+        out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5];
+        out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6];
+        out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7];
+        /* Step both cursors to the next word of every row. */
+        in_ptr++;
+        out_ptr++;
+    }
+}
+
+static void
+gf8_muladd_FD(void *out, void *in)
+{
+    unsigned int i;
+    uint64_t *in_ptr = (uint64_t *)in;
+    uint64_t *out_ptr = (uint64_t *)out;
+    /* In place, word by word: out = T(out) ^ in (T = the XOR network below). */
+    for (i = 0; i < WIDTH; i++) {
+        uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+        uint64_t tmp0, tmp1, tmp2, tmp3;
+        /* in0..in7 are the OLD words of 'out' (8 rows, WIDTH words apart). */
+        uint64_t in0 = out_ptr[0];
+        uint64_t in1 = out_ptr[WIDTH];
+        uint64_t in2 = out_ptr[WIDTH * 2];
+        uint64_t in3 = out_ptr[WIDTH * 3];
+        uint64_t in4 = out_ptr[WIDTH * 4];
+        uint64_t in5 = out_ptr[WIDTH * 5];
+        uint64_t in6 = out_ptr[WIDTH * 6];
+        uint64_t in7 = out_ptr[WIDTH * 7];
+        /* T: presumably GF(2^8) multiplication by 0xFD, per the name - confirm. */
+        tmp0 = in0 ^ in5;
+        tmp1 = in1 ^ in7;
+        out2 = tmp0 ^ tmp1;
+        out6 = out2 ^ in2 ^ in4;
+        tmp2 = out6 ^ in0;
+        out1 = tmp2 ^ in3;
+        out0 = tmp0 ^ out1 ^ in6;
+        out5 = out0 ^ in2;
+        tmp3 = out5 ^ in1;
+        out3 = tmp3 ^ in6;
+        out7 = tmp2 ^ tmp3;
+        out4 = tmp1 ^ out7;
+        /* XOR in the matching word of each row of 'in' and store the result. */
+        out_ptr[0] = out0 ^ in_ptr[0];
+        out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH];
+        out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2];
+        out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3];
+        out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4];
+        out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5];
+        out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6];
+        out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7];
+        /* Step both cursors to the next word of every row. */
+        in_ptr++;
+        out_ptr++;
+    }
+}
+
+static void
+gf8_muladd_FE(void *out, void *in)
+{
+    unsigned int i;
+    uint64_t *in_ptr = (uint64_t *)in;
+    uint64_t *out_ptr = (uint64_t *)out;
+    /* In place, word by word: out = T(out) ^ in (T = the XOR network below). */
+    for (i = 0; i < WIDTH; i++) {
+        uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+        uint64_t tmp0, tmp1, tmp2, tmp3, tmp4;
+        /* in0..in7 are the OLD words of 'out' (8 rows, WIDTH words apart). */
+        uint64_t in0 = out_ptr[0];
+        uint64_t in1 = out_ptr[WIDTH];
+        uint64_t in2 = out_ptr[WIDTH * 2];
+        uint64_t in3 = out_ptr[WIDTH * 3];
+        uint64_t in4 = out_ptr[WIDTH * 4];
+        uint64_t in5 = out_ptr[WIDTH * 5];
+        uint64_t in6 = out_ptr[WIDTH * 6];
+        uint64_t in7 = out_ptr[WIDTH * 7];
+        /* T: presumably GF(2^8) multiplication by 0xFE, per the name - confirm. */
+        tmp0 = in0 ^ in2;
+        out2 = tmp0 ^ in5;
+        out3 = tmp0 ^ in4;
+        tmp1 = out3 ^ in6;
+        out4 = tmp1 ^ in5;
+        tmp2 = tmp1 ^ in1;
+        out6 = tmp2 ^ in7;
+        tmp3 = tmp2 ^ in0;
+        out0 = tmp3 ^ in3;
+        tmp4 = out0 ^ out4 ^ in7;
+        out5 = tmp4 ^ in6;
+        out7 = tmp4 ^ in2;
+        out1 = tmp3 ^ out5;
+        /* XOR in the matching word of each row of 'in' and store the result. */
+        out_ptr[0] = out0 ^ in_ptr[0];
+        out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH];
+        out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2];
+        out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3];
+        out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4];
+        out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5];
+        out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6];
+        out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7];
+        /* Step both cursors to the next word of every row. */
+        in_ptr++;
+        out_ptr++;
+    }
+}
+
+static void
+gf8_muladd_FF(void *out, void *in)
+{
+    unsigned int i;
+    uint64_t *in_ptr = (uint64_t *)in;
+    uint64_t *out_ptr = (uint64_t *)out;
+    /* In place, word by word: out = T(out) ^ in (T = the XOR network below). */
+    for (i = 0; i < WIDTH; i++) {
+        uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+        uint64_t tmp0, tmp1, tmp2, tmp3;
+        /* in0..in7 are the OLD words of 'out' (8 rows, WIDTH words apart). */
+        uint64_t in0 = out_ptr[0];
+        uint64_t in1 = out_ptr[WIDTH];
+        uint64_t in2 = out_ptr[WIDTH * 2];
+        uint64_t in3 = out_ptr[WIDTH * 3];
+        uint64_t in4 = out_ptr[WIDTH * 4];
+        uint64_t in5 = out_ptr[WIDTH * 5];
+        uint64_t in6 = out_ptr[WIDTH * 6];
+        uint64_t in7 = out_ptr[WIDTH * 7];
+        /* T: presumably GF(2^8) multiplication by 0xFF, per the name - confirm. */
+        out2 = in0 ^ in5;
+        tmp0 = in4 ^ in7;
+        tmp1 = out2 ^ in2;
+        out4 = tmp1 ^ in6;
+        out7 = tmp1 ^ in1 ^ in3;
+        out1 = tmp0 ^ out7;
+        tmp2 = out1 ^ in5;
+        out6 = tmp2 ^ in3;
+        tmp3 = tmp2 ^ in7;
+        out0 = tmp3 ^ in6;
+        out3 = tmp3 ^ in1;
+        out5 = tmp0 ^ out0 ^ in2;
+        /* XOR in the matching word of each row of 'in' and store the result. */
+        out_ptr[0] = out0 ^ in_ptr[0];
+        out_ptr[WIDTH] = out1 ^ in_ptr[WIDTH];
+        out_ptr[WIDTH * 2] = out2 ^ in_ptr[WIDTH * 2];
+        out_ptr[WIDTH * 3] = out3 ^ in_ptr[WIDTH * 3];
+        out_ptr[WIDTH * 4] = out4 ^ in_ptr[WIDTH * 4];
+        out_ptr[WIDTH * 5] = out5 ^ in_ptr[WIDTH * 5];
+        out_ptr[WIDTH * 6] = out6 ^ in_ptr[WIDTH * 6];
+        out_ptr[WIDTH * 7] = out7 ^ in_ptr[WIDTH * 7];
+        /* Step both cursors to the next word of every row. */
+        in_ptr++;
+        out_ptr++;
+    }
+}
+
+static void (*gf8_muladd[])(void *out, void *in) = { /* entry N is gf8_muladd_NN */
+    gf8_muladd_00, gf8_muladd_01, gf8_muladd_02, gf8_muladd_03, gf8_muladd_04,
+    gf8_muladd_05, gf8_muladd_06, gf8_muladd_07, gf8_muladd_08, gf8_muladd_09,
+    gf8_muladd_0A, gf8_muladd_0B, gf8_muladd_0C, gf8_muladd_0D, gf8_muladd_0E,
+    gf8_muladd_0F, gf8_muladd_10, gf8_muladd_11, gf8_muladd_12, gf8_muladd_13,
+    gf8_muladd_14, gf8_muladd_15, gf8_muladd_16, gf8_muladd_17, gf8_muladd_18,
+    gf8_muladd_19, gf8_muladd_1A, gf8_muladd_1B, gf8_muladd_1C, gf8_muladd_1D,
+    gf8_muladd_1E, gf8_muladd_1F, gf8_muladd_20, gf8_muladd_21, gf8_muladd_22,
+    gf8_muladd_23, gf8_muladd_24, gf8_muladd_25, gf8_muladd_26, gf8_muladd_27,
+    gf8_muladd_28, gf8_muladd_29, gf8_muladd_2A, gf8_muladd_2B, gf8_muladd_2C,
+    gf8_muladd_2D, gf8_muladd_2E, gf8_muladd_2F, gf8_muladd_30, gf8_muladd_31,
+    gf8_muladd_32, gf8_muladd_33, gf8_muladd_34, gf8_muladd_35, gf8_muladd_36,
+    gf8_muladd_37, gf8_muladd_38, gf8_muladd_39, gf8_muladd_3A, gf8_muladd_3B,
+    gf8_muladd_3C, gf8_muladd_3D, gf8_muladd_3E, gf8_muladd_3F, gf8_muladd_40,
+    gf8_muladd_41, gf8_muladd_42, gf8_muladd_43, gf8_muladd_44, gf8_muladd_45,
+    gf8_muladd_46, gf8_muladd_47, gf8_muladd_48, gf8_muladd_49, gf8_muladd_4A,
+    gf8_muladd_4B, gf8_muladd_4C, gf8_muladd_4D, gf8_muladd_4E, gf8_muladd_4F,
+    gf8_muladd_50, gf8_muladd_51, gf8_muladd_52, gf8_muladd_53, gf8_muladd_54,
+    gf8_muladd_55, gf8_muladd_56, gf8_muladd_57, gf8_muladd_58, gf8_muladd_59,
+    gf8_muladd_5A, gf8_muladd_5B, gf8_muladd_5C, gf8_muladd_5D, gf8_muladd_5E,
+    gf8_muladd_5F, gf8_muladd_60, gf8_muladd_61, gf8_muladd_62, gf8_muladd_63,
+    gf8_muladd_64, gf8_muladd_65, gf8_muladd_66, gf8_muladd_67, gf8_muladd_68,
+    gf8_muladd_69, gf8_muladd_6A, gf8_muladd_6B, gf8_muladd_6C, gf8_muladd_6D,
+    gf8_muladd_6E, gf8_muladd_6F, gf8_muladd_70, gf8_muladd_71, gf8_muladd_72,
+    gf8_muladd_73, gf8_muladd_74, gf8_muladd_75, gf8_muladd_76, gf8_muladd_77,
+    gf8_muladd_78, gf8_muladd_79, gf8_muladd_7A, gf8_muladd_7B, gf8_muladd_7C,
+    gf8_muladd_7D, gf8_muladd_7E, gf8_muladd_7F, gf8_muladd_80, gf8_muladd_81,
+    gf8_muladd_82, gf8_muladd_83, gf8_muladd_84, gf8_muladd_85, gf8_muladd_86,
+    gf8_muladd_87, gf8_muladd_88, gf8_muladd_89, gf8_muladd_8A, gf8_muladd_8B,
+    gf8_muladd_8C, gf8_muladd_8D, gf8_muladd_8E, gf8_muladd_8F, gf8_muladd_90,
+    gf8_muladd_91, gf8_muladd_92, gf8_muladd_93, gf8_muladd_94, gf8_muladd_95,
+    gf8_muladd_96, gf8_muladd_97, gf8_muladd_98, gf8_muladd_99, gf8_muladd_9A,
+    gf8_muladd_9B, gf8_muladd_9C, gf8_muladd_9D, gf8_muladd_9E, gf8_muladd_9F,
+    gf8_muladd_A0, gf8_muladd_A1, gf8_muladd_A2, gf8_muladd_A3, gf8_muladd_A4,
+    gf8_muladd_A5, gf8_muladd_A6, gf8_muladd_A7, gf8_muladd_A8, gf8_muladd_A9,
+    gf8_muladd_AA, gf8_muladd_AB, gf8_muladd_AC, gf8_muladd_AD, gf8_muladd_AE,
+    gf8_muladd_AF, gf8_muladd_B0, gf8_muladd_B1, gf8_muladd_B2, gf8_muladd_B3,
+    gf8_muladd_B4, gf8_muladd_B5, gf8_muladd_B6, gf8_muladd_B7, gf8_muladd_B8,
+    gf8_muladd_B9, gf8_muladd_BA, gf8_muladd_BB, gf8_muladd_BC, gf8_muladd_BD,
+    gf8_muladd_BE, gf8_muladd_BF, gf8_muladd_C0, gf8_muladd_C1, gf8_muladd_C2,
+    gf8_muladd_C3, gf8_muladd_C4, gf8_muladd_C5, gf8_muladd_C6, gf8_muladd_C7,
+    gf8_muladd_C8, gf8_muladd_C9, gf8_muladd_CA, gf8_muladd_CB, gf8_muladd_CC,
+    gf8_muladd_CD, gf8_muladd_CE, gf8_muladd_CF, gf8_muladd_D0, gf8_muladd_D1,
+    gf8_muladd_D2, gf8_muladd_D3, gf8_muladd_D4, gf8_muladd_D5, gf8_muladd_D6,
+    gf8_muladd_D7, gf8_muladd_D8, gf8_muladd_D9, gf8_muladd_DA, gf8_muladd_DB,
+    gf8_muladd_DC, gf8_muladd_DD, gf8_muladd_DE, gf8_muladd_DF, gf8_muladd_E0,
+    gf8_muladd_E1, gf8_muladd_E2, gf8_muladd_E3, gf8_muladd_E4, gf8_muladd_E5,
+    gf8_muladd_E6, gf8_muladd_E7, gf8_muladd_E8, gf8_muladd_E9, gf8_muladd_EA,
+    gf8_muladd_EB, gf8_muladd_EC, gf8_muladd_ED, gf8_muladd_EE, gf8_muladd_EF,
+    gf8_muladd_F0, gf8_muladd_F1, gf8_muladd_F2, gf8_muladd_F3, gf8_muladd_F4,
+    gf8_muladd_F5, gf8_muladd_F6, gf8_muladd_F7, gf8_muladd_F8, gf8_muladd_F9,
+    gf8_muladd_FA, gf8_muladd_FB, gf8_muladd_FC, gf8_muladd_FD, gf8_muladd_FE,
+    gf8_muladd_FF}; /* 256 entries; callers index it with values[] without masking */
+
+static uint64_t zero[EC_METHOD_WORD_SIZE * 8] = { /* all-zero 'in' operand */
+    0, /* used by ec_code_c_interleaved() for its final, add-nothing step */
+};
+
+/* Walk 'values' backwards: each non-zero entry /= the next non-zero one. */
+void
+ec_code_c_prepare(ec_gf_t *gf, uint32_t *values, uint32_t count)
+{
+    uint32_t divisor = 1; /* the last non-zero entry is divided by 1 */
+    uint32_t pos = count;
+    while (pos-- > 0) {
+        uint32_t current = values[pos];
+        if (current != 0) {
+            values[pos] = ec_gf_div(gf, current, divisor);
+            divisor = current; /* original value, not the quotient */
+        }
+    }
+}
+
+/* dst = chunk 0, then per further chunk: gf8_muladd[*values++](dst, chunk). */
+void
+ec_code_c_linear(void *dst, void *src, uint64_t offset, uint32_t *values,
+                 uint32_t count)
+{
+    src += offset; /* chunks are EC_METHOD_CHUNK_SIZE bytes apart from here */
+    gf8_muladd_00(dst, src);
+    for (; count > 1; count--) { /* "--count > 0" wrapped for count == 0 */
+        src += EC_METHOD_CHUNK_SIZE;
+        gf8_muladd[*values++](dst, src);
+    }
+}
+
+void
+ec_code_c_interleaved(void *dst, void **src, uint64_t offset, uint32_t *values,
+                      uint32_t count)
+{
+    uint32_t i, last, tmp;
+    /* One source pointer per value; sources whose value is 0 are skipped. */
+    i = 0;
+    while ((last = *values++) == 0) { /* NOTE(review): not bounded by count */
+        i++;
+    }
+    gf8_muladd_00(dst, src[i++] + offset); /* first used source seeds dst */
+    while (i < count) {
+        tmp = *values++;
+        if (tmp != 0) {
+            gf8_muladd[last](dst, src[i] + offset); /* previous value's step */
+            last = tmp;
+        }
+        i++;
+    }
+    gf8_muladd[last](dst, zero); /* last value's step, XOR-ing in only zeros */
+}
diff --git a/xlators/cluster/ec/src/ec-code-c.h b/xlators/cluster/ec/src/ec-code-c.h
new file mode 100644
index 00000000000..42b5a064eb8
--- /dev/null
+++ b/xlators/cluster/ec/src/ec-code-c.h
@@ -0,0 +1,27 @@
+/*
+  Copyright (c) 2015 DataLab, s.l. <http://www.datalab.es>
+  This file is part of GlusterFS.
+
+  This file is licensed to you under your choice of the GNU Lesser
+  General Public License, version 3 or any later version (LGPLv3 or
+  later), or the GNU General Public License, version 2 (GPLv2), in all
+  cases as published by the Free Software Foundation.
+*/
+
+#ifndef __EC_CODE_C_H__
+#define __EC_CODE_C_H__
+
+#include "ec-types.h"
+/* In place: divide each non-zero value by the next non-zero one (ec_gf_div). */
+void
+ec_code_c_prepare(ec_gf_t *gf, uint32_t *values, uint32_t count);
+/* Fold chunks found EC_METHOD_CHUNK_SIZE apart from src + offset into dst. */
+void
+ec_code_c_linear(void *dst, void *src, uint64_t offset, uint32_t *values,
+                 uint32_t count);
+/* Fold src[i] + offset into dst for every i whose value is non-zero. */
+void
+ec_code_c_interleaved(void *dst, void **src, uint64_t offset, uint32_t *values,
+                      uint32_t count);
+
+#endif /* __EC_CODE_C_H__ */
diff --git a/xlators/cluster/ec/src/ec-code-intel.c b/xlators/cluster/ec/src/ec-code-intel.c
new file mode 100644
index 00000000000..f1c4e13e321
--- /dev/null
+++ b/xlators/cluster/ec/src/ec-code-intel.c
@@ -0,0 +1,594 @@
+/*
+  Copyright (c) 2015 DataLab, s.l. <http://www.datalab.es>
+  This file is part of GlusterFS.
+
+  This file is licensed to you under your choice of the GNU Lesser
+  General Public License, version 3 or any later version (LGPLv3 or
+  later), or the GNU General Public License, version 2 (GPLv2), in all
+  cases as published by the Free Software Foundation.
+*/
+
+#include <inttypes.h>
+#include <string.h>
+#include <errno.h>
+
+#include "ec-code-intel.h"
+
+static void
+ec_code_intel_init(ec_code_intel_t *intel)
+{
+    memset(intel, 0, sizeof(*intel)); /* start from an all-zero description */
+}
+
+static void
+ec_code_intel_prefix(ec_code_intel_t *intel, uint8_t prefix)
+{
+    intel->prefix.data[intel->prefix.bytes++] = prefix; /* append, unchecked */
+}
+
+static void
+ec_code_intel_rex(ec_code_intel_t *intel, gf_boolean_t w)
+{
+    gf_boolean_t present = _gf_false;
+    /* Move bit 3 of every register field into REX; reads ModRM/SIB/reg. */
+    if (w) {
+        intel->rex.w = 1;
+        present = _gf_true;
+    }
+    if (intel->modrm.present) {
+        if (intel->modrm.reg > 7) { /* ModRM.reg overflow goes to REX.R */
+            intel->modrm.reg &= 7;
+            intel->rex.r = 1;
+            present = _gf_true;
+        }
+        if (intel->sib.present) {
+            if (intel->sib.index > 7) { /* SIB.index overflow goes to REX.X */
+                intel->sib.index &= 7;
+                intel->rex.x = 1;
+                present = _gf_true;
+            }
+            if (intel->sib.base > 7) { /* SIB.base overflow goes to REX.B */
+                intel->sib.base &= 7;
+                intel->rex.b = 1;
+                present = _gf_true;
+            }
+        } else if (intel->modrm.rm > 7) { /* without SIB, REX.B extends rm */
+            intel->modrm.rm &= 7;
+            intel->rex.b = 1;
+            present = _gf_true;
+        }
+    } else if (intel->reg > 7) { /* no ModRM: register set by op_1/op_2 */
+        intel->reg &= 7;
+        intel->rex.b = 1;
+        present = _gf_true;
+    }
+    intel->rex.present = present; /* emit a REX byte only if some bit is set */
+}
+
+/* Build the VEX prefix: 2-byte (C5) form when it fits, 3-byte (C4) otherwise. */
+static void
+ec_code_intel_vex(ec_code_intel_t *intel, gf_boolean_t w, gf_boolean_t l,
+                  ec_code_vex_opcode_t opcode, ec_code_vex_prefix_t prefix,
+                  uint32_t reg)
+{
+    uint32_t tail = ((~reg & 0x0F) << 3) | (l ? 0x04 : 0x00) | prefix;
+
+    ec_code_intel_rex(intel, w);
+    intel->rex.present = _gf_false; /* VEX carries the REX bits itself */
+    /* C5 cannot carry W, X, B or a map other than 0F; R is inverted in both. */
+    if ((intel->rex.w != 0) || (intel->rex.x != 0) || (intel->rex.b != 0) ||
+        ((opcode != VEX_OPCODE_NONE) && (opcode != VEX_OPCODE_0F))) {
+        intel->vex.bytes = 3;
+        intel->vex.data[0] = 0xC4;
+        intel->vex.data[1] = ((intel->rex.r << 7) | (intel->rex.x << 6) |
+                              (intel->rex.b << 5) | opcode) ^ 0xE0;
+        intel->vex.data[2] = (intel->rex.w << 7) | tail;
+    } else {
+        intel->vex.bytes = 2;
+        intel->vex.data[0] = 0xC5;
+        intel->vex.data[1] = ((intel->rex.r << 7) ^ 0x80) | tail;
+    }
+}
+
+static void
+ec_code_intel_modrm_reg(ec_code_intel_t *intel, uint32_t rm, uint32_t reg)
+{
+    intel->modrm.reg = reg;
+    intel->modrm.rm = rm;
+    intel->modrm.mod = 3; /* mod 11b: r/m names a register, not memory */
+    intel->modrm.present = _gf_true;
+}
+
+/* Describe a [base + index * scale + offset] operand (ModRM, SIB, offset). */
+static void
+ec_code_intel_modrm_mem(ec_code_intel_t *intel, uint32_t reg,
+                        ec_code_intel_reg_t base, ec_code_intel_reg_t index,
+                        uint32_t scale, int32_t offset)
+{
+    /* SP cannot be an index; a real index needs a scale of 1, 2, 4 or 8. */
+    if ((index == REG_SP) ||
+        ((index != REG_NULL) && (scale != 1) && (scale != 2) && (scale != 4) &&
+         (scale != 8))) {
+        intel->invalid = _gf_true;
+        return;
+    }
+    scale >>= 1; /* 1, 2, 4, 8 -> 0, 1, 2, 4 */
+    if (scale == 4) {
+        scale = 3;
+    }
+
+    intel->modrm.present = _gf_true;
+    intel->modrm.reg = reg;
+
+    intel->offset.value = offset;
+    /* Low bits 101b with mod 0 mean "no base": BP and R13 need an offset. */
+    if ((offset == 0) && (base != REG_BP) && (base != REG_13)) {
+        intel->modrm.mod = 0;
+        intel->offset.bytes = 0;
+    } else if ((offset >= -128) && (offset <= 127)) {
+        intel->modrm.mod = 1;
+        intel->offset.bytes = 1;
+    } else {
+        intel->modrm.mod = 2;
+        intel->offset.bytes = 4;
+    }
+
+    intel->modrm.rm = base;
+    /* rm 100b announces a SIB byte, so SP and R12 bases always need one. */
+    if ((index != REG_NULL) || (base == REG_SP) || (base == REG_12)) {
+        intel->modrm.rm = 4;
+        intel->sib.present = _gf_true;
+        intel->sib.index = (index == REG_NULL) ? 4 : index; /* 4: no index */
+        intel->sib.scale = scale;
+        intel->sib.base = base;
+        if (base == REG_NULL) {
+            /* SIB base 101b with mod 0: no base, 4-byte offset follows. */
+            intel->sib.base = 5;
+            intel->modrm.mod = 0;
+            intel->offset.bytes = 4;
+        }
+    } else if (base == REG_NULL) {
+        /* mod 0 + rm 101b: offset-only form (RIP-relative in 64-bit mode). */
+        intel->modrm.mod = 0;
+        intel->modrm.rm = 5;
+        intel->offset.bytes = 4;
+    }
+}
+
+static void
+ec_code_intel_op_1(ec_code_intel_t *intel, uint8_t opcode, uint32_t reg)
+{
+    intel->opcode.data[0] = opcode;
+    intel->opcode.bytes = 1;
+    intel->reg = reg; /* read by ec_code_intel_rex() when there is no ModRM */
+}
+
+static void
+ec_code_intel_op_2(ec_code_intel_t *intel, uint8_t opcode1, uint8_t opcode2,
+                   uint32_t reg)
+{
+    intel->opcode.data[0] = opcode1;
+    intel->opcode.data[1] = opcode2;
+    intel->opcode.bytes = 2;
+    intel->reg = reg; /* read by ec_code_intel_rex() when there is no ModRM */
+}
+
+static void
+ec_code_intel_immediate_1(ec_code_intel_t *intel, uint32_t value)
+{
+    intel->immediate.value = value;
+    intel->immediate.bytes = 1; /* ec_code_intel_emit() copies 1 byte */
+}
+
+static void
+ec_code_intel_immediate_2(ec_code_intel_t *intel, uint32_t value)
+{
+    intel->immediate.value = value;
+    intel->immediate.bytes = 2; /* ec_code_intel_emit() copies 2 bytes */
+}
+
+static void
+ec_code_intel_immediate_4(ec_code_intel_t *intel, uint32_t value)
+{
+    intel->immediate.value = value;
+    intel->immediate.bytes = 4; /* ec_code_intel_emit() copies 4 bytes */
+}
+
+static void
+ec_code_intel_emit(ec_code_builder_t *builder, ec_code_intel_t *intel)
+{
+    uint8_t insn[15]; /* filled without bounds checks */
+    uint32_t i, count;
+    /* Serialize the description and hand the bytes to ec_code_emit(). */
+    if (intel->invalid) {
+        ec_code_error(builder, EINVAL);
+        return;
+    }
+    /* Order: prefixes, VEX, REX, opcode, ModRM, SIB, offset, immediate. */
+    count = 0;
+    for (i = 0; i < intel->prefix.bytes; i++) {
+        insn[count++] = intel->prefix.data[i];
+    }
+    for (i = 0; i < intel->vex.bytes; i++) {
+        insn[count++] = intel->vex.data[i];
+    }
+    if (intel->rex.present) {
+        insn[count++] = 0x40 | (intel->rex.w << 3) | (intel->rex.r << 2) |
+                        (intel->rex.x << 1) | (intel->rex.b << 0);
+    }
+    for (i = 0; i < intel->opcode.bytes; i++) {
+        insn[count++] = intel->opcode.data[i];
+    }
+    if (intel->modrm.present) {
+        insn[count++] = (intel->modrm.mod << 6) | (intel->modrm.reg << 3) |
+                        (intel->modrm.rm << 0);
+        if (intel->sib.present) {
+            insn[count++] = (intel->sib.scale << 6) | (intel->sib.index << 3) |
+                            (intel->sib.base << 0);
+        }
+    }
+    for (i = 0; i < intel->offset.bytes; i++) { /* set via offset.value */
+        insn[count++] = intel->offset.data[i];
+    }
+    for (i = 0; i < intel->immediate.bytes; i++) { /* set via immediate.value */
+        insn[count++] = intel->immediate.data[i];
+    }
+    /* NOTE(review): data[] presumably aliases value; low byte first on x86. */
+    ec_code_emit(builder, insn, count);
+}
+
+/* Emit "push reg": opcode 0x50 + low 3 bits of reg (REX.B for REG_8 and up). */
+void
+ec_code_intel_op_push_r(ec_code_builder_t *builder, ec_code_intel_reg_t reg)
+{
+    ec_code_intel_t insn;
+
+    ec_code_intel_init(&insn);
+    ec_code_intel_op_1(&insn, 0x50 | (reg & 7), reg);
+    ec_code_intel_rex(&insn, _gf_false);
+
+    ec_code_intel_emit(builder, &insn);
+}
+
+/* Emit "pop reg": opcode 0x58 + low 3 bits of reg (REX.B for REG_8 and up). */
+void
+ec_code_intel_op_pop_r(ec_code_builder_t *builder, ec_code_intel_reg_t reg)
+{
+    ec_code_intel_t insn;
+
+    ec_code_intel_init(&insn);
+    ec_code_intel_op_1(&insn, 0x58 | (reg & 7), reg);
+    ec_code_intel_rex(&insn, _gf_false);
+
+    ec_code_intel_emit(builder, &insn);
+}
+
+/* Emit "ret" (0xC3), or "ret imm16" (0xC2) when 'size' is non-zero. */
+void
+ec_code_intel_op_ret(ec_code_builder_t *builder, uint32_t size)
+{
+    ec_code_intel_t insn;
+
+    ec_code_intel_init(&insn);
+    if (size != 0) {
+        ec_code_intel_op_1(&insn, 0xC2, 0);
+        ec_code_intel_immediate_2(&insn, size);
+    } else {
+        ec_code_intel_op_1(&insn, 0xC3, 0);
+    }
+    ec_code_intel_rex(&insn, _gf_false);
+
+    ec_code_intel_emit(builder, &insn);
+}
+
+/* Emit 64-bit "mov dst, src" between general registers (opcode 0x89). */
+void
+ec_code_intel_op_mov_r2r(ec_code_builder_t *builder, ec_code_intel_reg_t src,
+                         ec_code_intel_reg_t dst)
+{
+    ec_code_intel_t insn;
+
+    ec_code_intel_init(&insn);
+    ec_code_intel_op_1(&insn, 0x89, 0);
+    ec_code_intel_modrm_reg(&insn, dst, src);
+    ec_code_intel_rex(&insn, _gf_true);
+
+    ec_code_intel_emit(builder, &insn);
+}
+
+/* Emit a 64-bit store of 'src' to [base + index * scale + offset] (0x89). */
+void
+ec_code_intel_op_mov_r2m(ec_code_builder_t *builder, ec_code_intel_reg_t src,
+                         ec_code_intel_reg_t base, ec_code_intel_reg_t index,
+                         uint32_t scale, int32_t offset)
+{
+    ec_code_intel_t insn;
+
+    ec_code_intel_init(&insn);
+    ec_code_intel_op_1(&insn, 0x89, 0);
+    ec_code_intel_modrm_mem(&insn, src, base, index, scale, offset);
+    ec_code_intel_rex(&insn, _gf_true);
+
+    ec_code_intel_emit(builder, &insn);
+}
+
+/* Emit a 64-bit load of [base + index * scale + offset] into 'dst' (0x8B). */
+void
+ec_code_intel_op_mov_m2r(ec_code_builder_t *builder, ec_code_intel_reg_t base,
+                         ec_code_intel_reg_t index, uint32_t scale,
+                         int32_t offset, ec_code_intel_reg_t dst)
+{
+    ec_code_intel_t insn;
+
+    ec_code_intel_init(&insn);
+    ec_code_intel_op_1(&insn, 0x8B, 0);
+    ec_code_intel_modrm_mem(&insn, dst, base, index, scale, offset);
+    ec_code_intel_rex(&insn, _gf_true);
+
+    ec_code_intel_emit(builder, &insn);
+}
+
+/* Emit 64-bit "xor dst, src" between general registers (opcode 0x31). */
+void
+ec_code_intel_op_xor_r2r(ec_code_builder_t *builder, ec_code_intel_reg_t src,
+                         ec_code_intel_reg_t dst)
+{
+    ec_code_intel_t insn;
+
+    ec_code_intel_init(&insn);
+    ec_code_intel_op_1(&insn, 0x31, 0);
+    ec_code_intel_modrm_reg(&insn, dst, src);
+    ec_code_intel_rex(&insn, _gf_true);
+
+    ec_code_intel_emit(builder, &insn);
+}
+
+/* Emit 64-bit "xor dst, [base + index * scale + offset]" (opcode 0x33). */
+void
+ec_code_intel_op_xor_m2r(ec_code_builder_t *builder, ec_code_intel_reg_t base,
+                         ec_code_intel_reg_t index, uint32_t scale,
+                         int32_t offset, ec_code_intel_reg_t dst)
+{
+    ec_code_intel_t insn;
+
+    ec_code_intel_init(&insn);
+    ec_code_intel_op_1(&insn, 0x33, 0);
+    ec_code_intel_modrm_mem(&insn, dst, base, index, scale, offset);
+    ec_code_intel_rex(&insn, _gf_true);
+
+    ec_code_intel_emit(builder, &insn);
+}
+
+/* Emit 64-bit "add reg, value" using the shortest immediate that fits. */
+void
+ec_code_intel_op_add_i2r(ec_code_builder_t *builder, int32_t value,
+                         ec_code_intel_reg_t reg)
+{
+    ec_code_intel_t insn;
+
+    ec_code_intel_init(&insn);
+    if ((value < -128) || (value > 127)) {
+        /* 4-byte immediate: REG_AX has its own opcode without ModRM. */
+        if (reg != REG_AX) {
+            ec_code_intel_op_1(&insn, 0x81, 0);
+            ec_code_intel_modrm_reg(&insn, reg, 0);
+        } else {
+            ec_code_intel_op_1(&insn, 0x05, reg);
+        }
+        ec_code_intel_immediate_4(&insn, value);
+    } else { /* 1-byte immediate form */
+        ec_code_intel_op_1(&insn, 0x83, 0);
+        ec_code_intel_modrm_reg(&insn, reg, 0);
+        ec_code_intel_immediate_1(&insn, value);
+    }
+    ec_code_intel_rex(&insn, _gf_true);
+    ec_code_intel_emit(builder, &insn);
+}
+
+/* Emit 64-bit "test reg, value" with a 4-byte immediate. */
+void
+ec_code_intel_op_test_i2r(ec_code_builder_t *builder, uint32_t value,
+                          ec_code_intel_reg_t reg)
+{
+    ec_code_intel_t insn;
+
+    ec_code_intel_init(&insn);
+    if (reg != REG_AX) {
+        ec_code_intel_op_1(&insn, 0xF7, 0);
+        ec_code_intel_modrm_reg(&insn, reg, 0);
+    } else {
+        ec_code_intel_op_1(&insn, 0xA9, reg); /* REG_AX form, no ModRM */
+    }
+    ec_code_intel_immediate_4(&insn, value);
+    ec_code_intel_rex(&insn, _gf_true);
+
+    ec_code_intel_emit(builder, &insn);
+}
+
+/* Emit "jne address": 1-byte displacement if it fits, 4-byte otherwise. */
+void
+ec_code_intel_op_jne(ec_code_builder_t *builder, uint32_t address)
+{
+    ec_code_intel_t insn;
+    int32_t delta;
+
+    ec_code_intel_init(&insn);
+    /* Measured from the end of the jump: 2 bytes short, 6 bytes long. */
+    delta = address - builder->address - 2;
+    if ((delta < -128) || (delta > 127)) {
+        ec_code_intel_op_2(&insn, 0x0F, 0x85, 0);
+        ec_code_intel_immediate_4(&insn, delta - 4);
+    } else {
+        ec_code_intel_op_1(&insn, 0x75, 0);
+        ec_code_intel_immediate_1(&insn, delta);
+    }
+    ec_code_intel_rex(&insn, _gf_false);
+
+    ec_code_intel_emit(builder, &insn);
+}
+
+/* Emit "movdqa dst, src" between SSE registers (66 0F 6F). */
+void
+ec_code_intel_op_mov_sse2sse(ec_code_builder_t *builder, uint32_t src,
+                             uint32_t dst)
+{
+    ec_code_intel_t insn;
+
+    ec_code_intel_init(&insn);
+    ec_code_intel_prefix(&insn, 0x66);
+    ec_code_intel_op_2(&insn, 0x0F, 0x6F, 0);
+    ec_code_intel_modrm_reg(&insn, src, dst);
+    ec_code_intel_rex(&insn, _gf_false);
+
+    ec_code_intel_emit(builder, &insn);
+}
+
+/* Emit "movdqa [base + index * scale + offset], src" (66 0F 7F). */
+void
+ec_code_intel_op_mov_sse2m(ec_code_builder_t *builder, uint32_t src,
+                           ec_code_intel_reg_t base, ec_code_intel_reg_t index,
+                           uint32_t scale, int32_t offset)
+{
+    ec_code_intel_t insn;
+
+    ec_code_intel_init(&insn);
+    ec_code_intel_prefix(&insn, 0x66);
+    ec_code_intel_op_2(&insn, 0x0F, 0x7F, 0);
+    ec_code_intel_modrm_mem(&insn, src, base, index, scale, offset);
+    ec_code_intel_rex(&insn, _gf_false);
+
+    ec_code_intel_emit(builder, &insn);
+}
+
+/* Emit "movdqa dst, [base + index * scale + offset]" (66 0F 6F). */
+void
+ec_code_intel_op_mov_m2sse(ec_code_builder_t *builder, ec_code_intel_reg_t base,
+                           ec_code_intel_reg_t index, uint32_t scale,
+                           int32_t offset, uint32_t dst)
+{
+    ec_code_intel_t insn;
+
+    ec_code_intel_init(&insn);
+    ec_code_intel_prefix(&insn, 0x66);
+    ec_code_intel_op_2(&insn, 0x0F, 0x6F, 0);
+    ec_code_intel_modrm_mem(&insn, dst, base, index, scale, offset);
+    ec_code_intel_rex(&insn, _gf_false);
+
+    ec_code_intel_emit(builder, &insn);
+}
+
+/* Emit "pxor dst, src" between SSE registers (66 0F EF). */
+void
+ec_code_intel_op_xor_sse2sse(ec_code_builder_t *builder, uint32_t src,
+                             uint32_t dst)
+{
+    ec_code_intel_t insn;
+
+    ec_code_intel_init(&insn);
+    ec_code_intel_prefix(&insn, 0x66);
+    ec_code_intel_op_2(&insn, 0x0F, 0xEF, 0);
+    ec_code_intel_modrm_reg(&insn, src, dst);
+    ec_code_intel_rex(&insn, _gf_false);
+
+    ec_code_intel_emit(builder, &insn);
+}
+
+/* Emit "pxor dst, [base + index * scale + offset]" (66 0F EF). */
+void
+ec_code_intel_op_xor_m2sse(ec_code_builder_t *builder, ec_code_intel_reg_t base,
+                           ec_code_intel_reg_t index, uint32_t scale,
+                           int32_t offset, uint32_t dst)
+{
+    ec_code_intel_t insn;
+
+    ec_code_intel_init(&insn);
+    ec_code_intel_prefix(&insn, 0x66);
+    ec_code_intel_op_2(&insn, 0x0F, 0xEF, 0);
+    ec_code_intel_modrm_mem(&insn, dst, base, index, scale, offset);
+    ec_code_intel_rex(&insn, _gf_false);
+
+    ec_code_intel_emit(builder, &insn);
+}
+
+/* Emit "vmovdqa dst, src" between 256-bit AVX registers (VEX 66 0F 6F). */
+void
+ec_code_intel_op_mov_avx2avx(ec_code_builder_t *builder, uint32_t src,
+                             uint32_t dst)
+{
+    ec_code_intel_t insn;
+
+    ec_code_intel_init(&insn);
+    ec_code_intel_op_1(&insn, 0x6F, 0);
+    ec_code_intel_modrm_reg(&insn, src, dst);
+    ec_code_intel_vex(&insn, _gf_false, _gf_true, VEX_OPCODE_0F, VEX_PREFIX_66,
+                      VEX_REG_NONE);
+
+    ec_code_intel_emit(builder, &insn);
+}
+
+/* Emit 256-bit "vmovdqa [base + index * scale + offset], src" (VEX 0F 7F). */
+void
+ec_code_intel_op_mov_avx2m(ec_code_builder_t *builder, uint32_t src,
+                           ec_code_intel_reg_t base, ec_code_intel_reg_t index,
+                           uint32_t scale, int32_t offset)
+{
+    ec_code_intel_t insn;
+
+    ec_code_intel_init(&insn);
+    ec_code_intel_op_1(&insn, 0x7F, 0);
+    ec_code_intel_modrm_mem(&insn, src, base, index, scale, offset);
+    ec_code_intel_vex(&insn, _gf_false, _gf_true, VEX_OPCODE_0F, VEX_PREFIX_66,
+                      VEX_REG_NONE);
+
+    ec_code_intel_emit(builder, &insn);
+}
+
+/* Emit 256-bit "vmovdqa dst, [base + index * scale + offset]" (VEX 0F 6F). */
+void
+ec_code_intel_op_mov_m2avx(ec_code_builder_t *builder, ec_code_intel_reg_t base,
+                           ec_code_intel_reg_t index, uint32_t scale,
+                           int32_t offset, uint32_t dst)
+{
+    ec_code_intel_t insn;
+
+    ec_code_intel_init(&insn);
+    ec_code_intel_op_1(&insn, 0x6F, 0);
+    ec_code_intel_modrm_mem(&insn, dst, base, index, scale, offset);
+    ec_code_intel_vex(&insn, _gf_false, _gf_true, VEX_OPCODE_0F, VEX_PREFIX_66,
+                      VEX_REG_NONE);
+
+    ec_code_intel_emit(builder, &insn);
+}
+
+/* Emit 256-bit "vpxor dst, dst, src" between AVX registers (VEX 66 0F EF). */
+void
+ec_code_intel_op_xor_avx2avx(ec_code_builder_t *builder, uint32_t src,
+                             uint32_t dst)
+{
+    ec_code_intel_t insn;
+
+    ec_code_intel_init(&insn);
+    ec_code_intel_op_1(&insn, 0xEF, 0);
+    ec_code_intel_modrm_reg(&insn, src, dst);
+    ec_code_intel_vex(&insn, _gf_false, _gf_true, VEX_OPCODE_0F, VEX_PREFIX_66,
+                      dst);
+
+    ec_code_intel_emit(builder, &insn);
+}
+
+/* Emit 256-bit "vpxor dst, dst, [base + index * scale + offset]" (VEX 0F EF). */
+void
+ec_code_intel_op_xor_m2avx(ec_code_builder_t *builder, ec_code_intel_reg_t base,
+                           ec_code_intel_reg_t index, uint32_t scale,
+                           int32_t offset, uint32_t dst)
+{
+    ec_code_intel_t insn;
+
+    ec_code_intel_init(&insn);
+    ec_code_intel_op_1(&insn, 0xEF, 0);
+    ec_code_intel_modrm_mem(&insn, dst, base, index, scale, offset);
+    ec_code_intel_vex(&insn, _gf_false, _gf_true, VEX_OPCODE_0F, VEX_PREFIX_66,
+                      dst);
+
+    ec_code_intel_emit(builder, &insn);
+}
diff --git a/xlators/cluster/ec/src/ec-code-intel.h b/xlators/cluster/ec/src/ec-code-intel.h
new file mode 100644
index 00000000000..3fa4a174765
--- /dev/null
+++ b/xlators/cluster/ec/src/ec-code-intel.h
@@ -0,0 +1,191 @@
+/*
+  Copyright (c) 2015 DataLab, s.l. <http://www.datalab.es>
+  This file is part of GlusterFS.
+
+  This file is licensed to you under your choice of the GNU Lesser
+  General Public License, version 3 or any later version (LGPLv3 or
+  later), or the GNU General Public License, version 2 (GPLv2), in all
+  cases as published by the Free Software Foundation.
+*/
+
+#ifndef __EC_CODE_INTEL_H__
+#define __EC_CODE_INTEL_H__
+
+#include "ec-code.h"
+
+#define VEX_REG_NONE 0
+
+/* Forward declarations and typedefs for the types defined below. */
+
+/* General purpose register identifiers. */
+enum _ec_code_intel_reg;
+typedef enum _ec_code_intel_reg ec_code_intel_reg_t;
+
+/* Mandatory prefix selector used when building a VEX prefix. */
+enum _ec_code_vex_prefix;
+typedef enum _ec_code_vex_prefix ec_code_vex_prefix_t;
+
+/* Opcode map selector used when building a VEX prefix. */
+enum _ec_code_vex_opcode;
+typedef enum _ec_code_vex_opcode ec_code_vex_opcode_t;
+
+/* Small byte buffer holding one component of an instruction. */
+struct _ec_code_intel_buffer;
+typedef struct _ec_code_intel_buffer ec_code_intel_buffer_t;
+
+/* Fields of the SIB byte. */
+struct _ec_code_intel_sib;
+typedef struct _ec_code_intel_sib ec_code_intel_sib_t;
+
+/* Fields of the ModRM byte. */
+struct _ec_code_intel_modrm;
+typedef struct _ec_code_intel_modrm ec_code_intel_modrm_t;
+
+/* Fields of the REX prefix. */
+struct _ec_code_intel_rex;
+typedef struct _ec_code_intel_rex ec_code_intel_rex_t;
+
+/* Complete description of one instruction being assembled. */
+struct _ec_code_intel;
+typedef struct _ec_code_intel ec_code_intel_t;
+
+/* General purpose registers.
+ *
+ * Values from REG_AX (0) to REG_15 (15) follow the x86-64 register encoding
+ * numbers. REG_NULL (-1) means "no register"; callers pass it as the index
+ * register when a memory operand has no index. */
+enum _ec_code_intel_reg {
+    REG_NULL = -1,
+    REG_AX,
+    REG_CX,
+    REG_DX,
+    REG_BX,
+    REG_SP,
+    REG_BP,
+    REG_SI,
+    REG_DI,
+    REG_8,
+    REG_9,
+    REG_10,
+    REG_11,
+    REG_12,
+    REG_13,
+    REG_14,
+    REG_15
+};
+
+/* Mandatory prefix implied by a VEX prefix.
+ *
+ * The numeric values (0: none, 1: 66, 2: F3, 3: F2) match the 'pp' field of
+ * the VEX prefix as described in the Intel manuals. */
+enum _ec_code_vex_prefix {
+    VEX_PREFIX_NONE = 0,
+    VEX_PREFIX_66,
+    VEX_PREFIX_F3,
+    VEX_PREFIX_F2
+};
+
+/* Opcode map selected by a VEX prefix.
+ *
+ * The numeric values (1: 0F, 2: 0F 38, 3: 0F 3A) match the 'mmmmm' field of
+ * the VEX prefix as described in the Intel manuals. */
+enum _ec_code_vex_opcode {
+    VEX_OPCODE_NONE = 0,
+    VEX_OPCODE_0F,
+    VEX_OPCODE_0F_38,
+    VEX_OPCODE_0F_3A
+};
+
+/* Buffer of up to 4 bytes holding one component of an instruction. The
+ * contents can be accessed byte by byte ('data') or as a single 32-bit
+ * integer ('value'). */
+struct _ec_code_intel_buffer {
+    /* Presumably the number of bytes of 'data' in use; confirm in
+     * ec-code-intel.c. */
+    uint32_t bytes;
+    union {
+        uint8_t data[4];
+        uint32_t value;
+    };
+};
+
+/* Components of the SIB (scale, index, base) byte of an instruction. */
+struct _ec_code_intel_sib {
+    /* Presumably tells whether the instruction has a SIB byte. */
+    gf_boolean_t present;
+    uint32_t base;
+    uint32_t index;
+    uint32_t scale;
+};
+
+/* Components of the ModRM byte of an instruction. */
+struct _ec_code_intel_modrm {
+    /* Presumably tells whether the instruction has a ModRM byte. */
+    gf_boolean_t present;
+    uint32_t mod;
+    uint32_t rm;
+    uint32_t reg;
+};
+
+/* Components of the REX prefix of an instruction. The field names follow
+ * the names of the REX bits (W, R, X and B) in the Intel manuals. */
+struct _ec_code_intel_rex {
+    /* Presumably tells whether the instruction needs a REX prefix. */
+    gf_boolean_t present;
+    uint32_t w;
+    uint32_t r;
+    uint32_t x;
+    uint32_t b;
+};
+
+/* Description of a single instruction while it's being assembled. Each
+ * ec_code_intel_op_*() function fills one of these structures and passes
+ * it to ec_code_intel_emit(). */
+struct _ec_code_intel {
+    /* NOTE(review): presumably set when the requested combination of
+     * operands cannot be encoded; confirm in ec-code-intel.c. */
+    gf_boolean_t invalid;
+    ec_code_intel_buffer_t prefix;
+    ec_code_intel_buffer_t opcode;
+    ec_code_intel_buffer_t offset;
+    ec_code_intel_buffer_t immediate;
+    ec_code_intel_buffer_t vex;
+    ec_code_intel_rex_t rex;
+    ec_code_intel_modrm_t modrm;
+    ec_code_intel_sib_t sib;
+    uint32_t reg;
+};
+
+/* Stack and flow control operations on general purpose registers. */
+void
+ec_code_intel_op_push_r(ec_code_builder_t *builder, ec_code_intel_reg_t reg);
+void
+ec_code_intel_op_pop_r(ec_code_builder_t *builder, ec_code_intel_reg_t reg);
+void
+ec_code_intel_op_ret(ec_code_builder_t *builder, uint32_t size);
+
+/* Operations on general purpose registers. In the names, 'r' stands for a
+ * register operand, 'm' for a memory operand (described by base, index,
+ * scale and offset) and 'i' for an immediate value. */
+void
+ec_code_intel_op_mov_r2r(ec_code_builder_t *builder, ec_code_intel_reg_t src,
+                         ec_code_intel_reg_t dst);
+void
+ec_code_intel_op_mov_r2m(ec_code_builder_t *builder, ec_code_intel_reg_t src,
+                         ec_code_intel_reg_t base, ec_code_intel_reg_t index,
+                         uint32_t scale, int32_t offset);
+void
+ec_code_intel_op_mov_m2r(ec_code_builder_t *builder, ec_code_intel_reg_t base,
+                         ec_code_intel_reg_t index, uint32_t scale,
+                         int32_t offset, ec_code_intel_reg_t dst);
+void
+ec_code_intel_op_xor_r2r(ec_code_builder_t *builder, ec_code_intel_reg_t src,
+                         ec_code_intel_reg_t dst);
+void
+ec_code_intel_op_xor_m2r(ec_code_builder_t *builder, ec_code_intel_reg_t base,
+                         ec_code_intel_reg_t index, uint32_t scale,
+                         int32_t offset, ec_code_intel_reg_t dst);
+void
+ec_code_intel_op_add_i2r(ec_code_builder_t *builder, int32_t value,
+                         ec_code_intel_reg_t reg);
+void
+ec_code_intel_op_test_i2r(ec_code_builder_t *builder, uint32_t value,
+                          ec_code_intel_reg_t reg);
+void
+ec_code_intel_op_jne(ec_code_builder_t *builder, uint32_t address);
+
+/* Operations on SSE registers. Registers are identified by number. */
+void
+ec_code_intel_op_mov_sse2sse(ec_code_builder_t *builder, uint32_t src,
+                             uint32_t dst);
+void
+ec_code_intel_op_mov_sse2m(ec_code_builder_t *builder, uint32_t src,
+                           ec_code_intel_reg_t base, ec_code_intel_reg_t index,
+                           uint32_t scale, int32_t offset);
+void
+ec_code_intel_op_mov_m2sse(ec_code_builder_t *builder, ec_code_intel_reg_t base,
+                           ec_code_intel_reg_t index, uint32_t scale,
+                           int32_t offset, uint32_t dst);
+void
+ec_code_intel_op_xor_sse2sse(ec_code_builder_t *builder, uint32_t src,
+                             uint32_t dst);
+void
+ec_code_intel_op_xor_m2sse(ec_code_builder_t *builder, ec_code_intel_reg_t base,
+                           ec_code_intel_reg_t index, uint32_t scale,
+                           int32_t offset, uint32_t dst);
+
+/* Operations on AVX registers. Registers are identified by number. */
+void
+ec_code_intel_op_mov_avx2avx(ec_code_builder_t *builder, uint32_t src,
+                             uint32_t dst);
+void
+ec_code_intel_op_mov_avx2m(ec_code_builder_t *builder, uint32_t src,
+                           ec_code_intel_reg_t base, ec_code_intel_reg_t index,
+                           uint32_t scale, int32_t offset);
+void
+ec_code_intel_op_mov_m2avx(ec_code_builder_t *builder, ec_code_intel_reg_t base,
+                           ec_code_intel_reg_t index, uint32_t scale,
+                           int32_t offset, uint32_t dst);
+void
+ec_code_intel_op_xor_avx2avx(ec_code_builder_t *builder, uint32_t src,
+                             uint32_t dst);
+void
+ec_code_intel_op_xor_m2avx(ec_code_builder_t *builder, ec_code_intel_reg_t base,
+                           ec_code_intel_reg_t index, uint32_t scale,
+                           int32_t offset, uint32_t dst);
+
+#endif /* __EC_CODE_INTEL_H__ */
diff --git a/xlators/cluster/ec/src/ec-code-sse.c b/xlators/cluster/ec/src/ec-code-sse.c
new file mode 100644
index 00000000000..e11e7ff8400
--- /dev/null
+++ b/xlators/cluster/ec/src/ec-code-sse.c
@@ -0,0 +1,101 @@
+/*
+  Copyright (c) 2015 DataLab, s.l. <http://www.datalab.es>
+  This file is part of GlusterFS.
+
+  This file is licensed to you under your choice of the GNU Lesser
+  General Public License, version 3 or any later version (LGPLv3 or
+  later), or the GNU General Public License, version 2 (GPLv2), in all
+  cases as published by the Free Software Foundation.
+*/
+
+#include <errno.h>
+
+#include "ec-code-intel.h"
+
+/* Prolog of the SSE code. No instructions are emitted: it only records the
+ * current address as the start of the loop, which is the target of the
+ * jump emitted by ec_code_sse_epilog(). */
+static void
+ec_code_sse_prolog(ec_code_builder_t *builder)
+{
+    builder->loop = builder->address;
+}
+
+/* Epilog of the SSE code. It closes the loop started in the prolog and
+ * emits the return instruction. */
+static void
+ec_code_sse_epilog(ec_code_builder_t *builder)
+{
+    /* Advance 16 bytes (the width of an SSE register): DX is the index
+     * used by loads and DI is the base used by stores. */
+    ec_code_intel_op_add_i2r(builder, 16, REG_DX);
+    ec_code_intel_op_add_i2r(builder, 16, REG_DI);
+    /* Jump back to the start of the loop while the bits of DX selected by
+     * 'width - 1' are not all zero. */
+    ec_code_intel_op_test_i2r(builder, builder->width - 1, REG_DX);
+    ec_code_intel_op_jne(builder, builder->loop);
+
+    ec_code_intel_op_ret(builder, 0);
+}
+
+/* Emits the load of one word of input into SSE register 'dst'.
+ *
+ * In linear mode all inputs are addressed relative to SI, using an offset
+ * computed from 'idx' and 'bit'. Otherwise SI points to an array of
+ * pointers: the pointer for 'idx' is loaded into AX (only if it's not the
+ * one already loaded, tracked in builder->base) and the data is addressed
+ * relative to it. In both cases DX is used as the index register. */
+static void
+ec_code_sse_load(ec_code_builder_t *builder, uint32_t dst, uint32_t idx,
+                 uint32_t bit)
+{
+    int32_t disp;
+
+    if (builder->linear) {
+        disp = idx * builder->width * builder->bits + bit * builder->width;
+        ec_code_intel_op_mov_m2sse(builder, REG_SI, REG_DX, 1, disp, dst);
+
+        return;
+    }
+
+    if (builder->base != idx) {
+        /* Each pointer of the array takes 8 bytes. */
+        ec_code_intel_op_mov_m2r(builder, REG_SI, REG_NULL, 0, idx * 8,
+                                 REG_AX);
+        builder->base = idx;
+    }
+    disp = bit * builder->width;
+    ec_code_intel_op_mov_m2sse(builder, REG_AX, REG_DX, 1, disp, dst);
+}
+
+/* Emits the store of SSE register 'src' to the output. The destination is
+ * addressed relative to DI, without index register, at an offset of 'bit'
+ * times the width of the generator. */
+static void
+ec_code_sse_store(ec_code_builder_t *builder, uint32_t src, uint32_t bit)
+{
+    ec_code_intel_op_mov_sse2m(builder, src, REG_DI, REG_NULL, 0,
+                               bit * builder->width);
+}
+
+/* Emits a copy of SSE register 'src' into SSE register 'dst'. Note that
+ * the low level function takes the arguments in the opposite order. */
+static void
+ec_code_sse_copy(ec_code_builder_t *builder, uint32_t dst, uint32_t src)
+{
+    ec_code_intel_op_mov_sse2sse(builder, src, dst);
+}
+
+/* Emits an XOR of SSE register 'src' into SSE register 'dst'. Note that
+ * the low level function takes the arguments in the opposite order. */
+static void
+ec_code_sse_xor2(ec_code_builder_t *builder, uint32_t dst, uint32_t src)
+{
+    ec_code_intel_op_xor_sse2sse(builder, src, dst);
+}
+
+/* Emits an XOR of one word of input into SSE register 'dst'.
+ *
+ * The memory operand is computed exactly as in ec_code_sse_load(): in
+ * linear mode it's relative to SI, otherwise it's relative to the pointer
+ * for 'idx', loaded into AX from the array pointed to by SI (only if it's
+ * not the one tracked in builder->base). DX is the index register. */
+static void
+ec_code_sse_xorm(ec_code_builder_t *builder, uint32_t dst, uint32_t idx,
+                 uint32_t bit)
+{
+    int32_t disp;
+
+    if (builder->linear) {
+        disp = idx * builder->width * builder->bits + bit * builder->width;
+        ec_code_intel_op_xor_m2sse(builder, REG_SI, REG_DX, 1, disp, dst);
+
+        return;
+    }
+
+    if (builder->base != idx) {
+        /* Each pointer of the array takes 8 bytes. */
+        ec_code_intel_op_mov_m2r(builder, REG_SI, REG_NULL, 0, idx * 8,
+                                 REG_AX);
+        builder->base = idx;
+    }
+    disp = bit * builder->width;
+    ec_code_intel_op_xor_m2sse(builder, REG_AX, REG_DX, 1, disp, dst);
+}
+
+/* NULL-terminated list of flags needed by this generator.
+ * NOTE(review): presumably matched against the cpu flags reported by the
+ * system; the code that checks them is not in this file. */
+static char *ec_code_sse_needed_flags[] = {"sse2", NULL};
+
+/* Code generator based on SSE instructions. It processes 16 bytes at a
+ * time. There's no three-operand XOR, so 'xor3' is NULL. */
+ec_code_gen_t ec_code_gen_sse = {.name = "sse",
+                                 .flags = ec_code_sse_needed_flags,
+                                 .width = 16,
+                                 .prolog = ec_code_sse_prolog,
+                                 .epilog = ec_code_sse_epilog,
+                                 .load = ec_code_sse_load,
+                                 .store = ec_code_sse_store,
+                                 .copy = ec_code_sse_copy,
+                                 .xor2 = ec_code_sse_xor2,
+                                 .xor3 = NULL,
+                                 .xorm = ec_code_sse_xorm};
diff --git a/xlators/cluster/ec/src/ec-code-sse.h b/xlators/cluster/ec/src/ec-code-sse.h
new file mode 100644
index 00000000000..f1acbcf894b
--- /dev/null
+++ b/xlators/cluster/ec/src/ec-code-sse.h
@@ -0,0 +1,18 @@
+/*
+  Copyright (c) 2015 DataLab, s.l. <http://www.datalab.es>
+  This file is part of GlusterFS.
+
+  This file is licensed to you under your choice of the GNU Lesser
+  General Public License, version 3 or any later version (LGPLv3 or
+  later), or the GNU General Public License, version 2 (GPLv2), in all
+  cases as published by the Free Software Foundation.
+*/
+
+#ifndef __EC_CODE_SSE_H__
+#define __EC_CODE_SSE_H__
+
+#include "ec-code.h"
+
+extern ec_code_gen_t ec_code_gen_sse;
+
+#endif /* __EC_CODE_SSE_H__ */
diff --git a/xlators/cluster/ec/src/ec-code-x64.c b/xlators/cluster/ec/src/ec-code-x64.c
new file mode 100644
index 00000000000..26565b4493f
--- /dev/null
+++ b/xlators/cluster/ec/src/ec-code-x64.c
@@ -0,0 +1,144 @@
+/*
+  Copyright (c) 2015 DataLab, s.l. <http://www.datalab.es>
+  This file is part of GlusterFS.
+
+  This file is licensed to you under your choice of the GNU Lesser
+  General Public License, version 3 or any later version (LGPLv3 or
+  later), or the GNU General Public License, version 2 (GPLv2), in all
+  cases as published by the Free Software Foundation.
+*/
+
+#include <errno.h>
+
+#include "ec-code-intel.h"
+
+/* Maps the register numbers used by the operations (0 to 10) to the real
+ * general purpose registers. DX, BX, SP, SI and DI are not in the table
+ * because the generated code uses them for other purposes: DX and DI are
+ * advanced on each iteration (see epilog), SI is the base of the inputs,
+ * BX holds the current input pointer in non-linear mode and SP is the
+ * stack pointer. Entries from index 7 (REG_12 to REG_15) are the ones
+ * saved and restored by the prolog and epilog when they are used. */
+static ec_code_intel_reg_t ec_code_x64_regmap[] = {
+    REG_AX, REG_CX, REG_BP, REG_8,  REG_9, REG_10,
+    REG_11, REG_12, REG_13, REG_14, REG_15};
+
+/* Prolog of the x64 code. It saves the registers that will be modified and
+ * records the start of the loop. Registers are restored in reverse order
+ * by ec_code_x64_epilog().
+ *
+ * Reports EINVAL through ec_code_error() if more registers are needed than
+ * entries exist in ec_code_x64_regmap. */
+static void
+ec_code_x64_prolog(ec_code_builder_t *builder)
+{
+    uint32_t i;
+
+    /* BP is always saved. BX is only used (and saved) in non-linear
+     * mode. */
+    ec_code_intel_op_push_r(builder, REG_BP);
+    if (!builder->linear) {
+        ec_code_intel_op_push_r(builder, REG_BX);
+    }
+    /* NOTE(review): when this check fails, the previous pushes have
+     * already been emitted. It seems harmless because an error is
+     * reported, but confirm that the partial code is discarded. */
+    if (builder->regs > 11) {
+        ec_code_error(builder, EINVAL);
+        return;
+    }
+    /* Save the registers mapped from index 7 onwards (REG_12 to REG_15),
+     * but only those that are really used. */
+    for (i = 7; i < builder->regs; i++) {
+        ec_code_intel_op_push_r(builder, ec_code_x64_regmap[i]);
+    }
+
+    builder->loop = builder->address;
+}
+
+/* Epilog of the x64 code. It closes the loop started in the prolog,
+ * restores the registers saved by ec_code_x64_prolog() in reverse order
+ * and emits the return instruction.
+ *
+ * Reports EINVAL through ec_code_error() if more registers are needed than
+ * entries exist in ec_code_x64_regmap. */
+static void
+ec_code_x64_epilog(ec_code_builder_t *builder)
+{
+    uint32_t i;
+
+    /* Advance 8 bytes (the width of a general purpose register): DX is
+     * the index used by loads and DI is the base used by stores. */
+    ec_code_intel_op_add_i2r(builder, 8, REG_DX);
+    ec_code_intel_op_add_i2r(builder, 8, REG_DI);
+    /* Jump back to the start of the loop while the bits of DX selected by
+     * 'width - 1' are not all zero. */
+    ec_code_intel_op_test_i2r(builder, builder->width - 1, REG_DX);
+    ec_code_intel_op_jne(builder, builder->loop);
+
+    if (builder->regs > 11) {
+        ec_code_error(builder, EINVAL);
+        return;
+    }
+    /* Pops must mirror the pushes of the prolog, from last to first. */
+    for (i = builder->regs; i > 7; i--) {
+        ec_code_intel_op_pop_r(builder, ec_code_x64_regmap[i - 1]);
+    }
+    if (!builder->linear) {
+        ec_code_intel_op_pop_r(builder, REG_BX);
+    }
+    ec_code_intel_op_pop_r(builder, REG_BP);
+    ec_code_intel_op_ret(builder, 0);
+}
+
+/* Emits the load of one word of input into the register mapped to 'dst'.
+ *
+ * In linear mode all inputs are addressed relative to SI, using an offset
+ * computed from 'idx' and 'bit'. Otherwise SI points to an array of
+ * pointers: the pointer for 'idx' is loaded into BX (only if it's not the
+ * one already loaded, tracked in builder->base) and the data is addressed
+ * relative to it. In both cases DX is used as the index register. */
+static void
+ec_code_x64_load(ec_code_builder_t *builder, uint32_t dst, uint32_t idx,
+                 uint32_t bit)
+{
+    ec_code_intel_reg_t reg = ec_code_x64_regmap[dst];
+    int32_t disp;
+
+    if (builder->linear) {
+        disp = idx * builder->width * builder->bits + bit * builder->width;
+        ec_code_intel_op_mov_m2r(builder, REG_SI, REG_DX, 1, disp, reg);
+
+        return;
+    }
+
+    if (builder->base != idx) {
+        /* Each pointer of the array takes 8 bytes. */
+        ec_code_intel_op_mov_m2r(builder, REG_SI, REG_NULL, 0, idx * 8,
+                                 REG_BX);
+        builder->base = idx;
+    }
+    disp = bit * builder->width;
+    ec_code_intel_op_mov_m2r(builder, REG_BX, REG_DX, 1, disp, reg);
+}
+
+/* Emits the store of the register mapped to 'src' to the output. The
+ * destination is addressed relative to DI, without index register, at an
+ * offset of 'bit' times the width of the generator. */
+static void
+ec_code_x64_store(ec_code_builder_t *builder, uint32_t src, uint32_t bit)
+{
+    ec_code_intel_reg_t reg = ec_code_x64_regmap[src];
+    int32_t disp = bit * builder->width;
+
+    ec_code_intel_op_mov_r2m(builder, reg, REG_DI, REG_NULL, 0, disp);
+}
+
+/* Emits a copy of the register mapped to 'src' into the register mapped
+ * to 'dst'. */
+static void
+ec_code_x64_copy(ec_code_builder_t *builder, uint32_t dst, uint32_t src)
+{
+    ec_code_intel_reg_t to = ec_code_x64_regmap[dst];
+    ec_code_intel_reg_t from = ec_code_x64_regmap[src];
+
+    ec_code_intel_op_mov_r2r(builder, from, to);
+}
+
+/* Emits an XOR of the register mapped to 'src' into the register mapped
+ * to 'dst'. */
+static void
+ec_code_x64_xor2(ec_code_builder_t *builder, uint32_t dst, uint32_t src)
+{
+    ec_code_intel_reg_t to = ec_code_x64_regmap[dst];
+    ec_code_intel_reg_t from = ec_code_x64_regmap[src];
+
+    ec_code_intel_op_xor_r2r(builder, from, to);
+}
+
+/* Emits an XOR of one word of input into the register mapped to 'dst'.
+ *
+ * The memory operand is computed exactly as in ec_code_x64_load(): in
+ * linear mode it's relative to SI, otherwise it's relative to the pointer
+ * for 'idx', loaded into BX from the array pointed to by SI (only if it's
+ * not the one tracked in builder->base). DX is the index register. */
+static void
+ec_code_x64_xorm(ec_code_builder_t *builder, uint32_t dst, uint32_t idx,
+                 uint32_t bit)
+{
+    ec_code_intel_reg_t reg = ec_code_x64_regmap[dst];
+    int32_t disp;
+
+    if (builder->linear) {
+        disp = idx * builder->width * builder->bits + bit * builder->width;
+        ec_code_intel_op_xor_m2r(builder, REG_SI, REG_DX, 1, disp, reg);
+
+        return;
+    }
+
+    if (builder->base != idx) {
+        /* Each pointer of the array takes 8 bytes. */
+        ec_code_intel_op_mov_m2r(builder, REG_SI, REG_NULL, 0, idx * 8,
+                                 REG_BX);
+        builder->base = idx;
+    }
+    disp = bit * builder->width;
+    ec_code_intel_op_xor_m2r(builder, REG_BX, REG_DX, 1, disp, reg);
+}
+
+/* NULL-terminated list of flags needed by this generator. It's empty: no
+ * specific flag is required. */
+static char *ec_code_x64_needed_flags[] = {NULL};
+
+/* Code generator based on x86-64 general purpose registers. It processes
+ * 8 bytes at a time. There's no three-operand XOR, so 'xor3' is NULL. */
+ec_code_gen_t ec_code_gen_x64 = {.name = "x64",
+                                 .flags = ec_code_x64_needed_flags,
+                                 .width = sizeof(uint64_t),
+                                 .prolog = ec_code_x64_prolog,
+                                 .epilog = ec_code_x64_epilog,
+                                 .load = ec_code_x64_load,
+                                 .store = ec_code_x64_store,
+                                 .copy = ec_code_x64_copy,
+                                 .xor2 = ec_code_x64_xor2,
+                                 .xor3 = NULL,
+                                 .xorm = ec_code_x64_xorm};
diff --git a/xlators/cluster/ec/src/ec-code-x64.h b/xlators/cluster/ec/src/ec-code-x64.h
new file mode 100644
index 00000000000..bd8174e4bf5
--- /dev/null
+++ b/xlators/cluster/ec/src/ec-code-x64.h
@@ -0,0 +1,18 @@
+/*
+  Copyright (c) 2015 DataLab, s.l. <http://www.datalab.es>
+  This file is part of GlusterFS.
+
+  This file is licensed to you under your choice of the GNU Lesser
+  General Public License, version 3 or any later version (LGPLv3 or
+  later), or the GNU General Public License, version 2 (GPLv2), in all
+  cases as published by the Free Software Foundation.
+*/
+
+#ifndef __EC_CODE_X64_H__
+#define __EC_CODE_X64_H__
+
+#include "ec-code.h"
+
+extern ec_code_gen_t ec_code_gen_x64;
+
+#endif /* __EC_CODE_X64_H__ */
diff --git a/xlators/cluster/ec/src/ec-code.c b/xlators/cluster/ec/src/ec-code.c
new file mode 100644
index 00000000000..03162ae05a9
--- /dev/null
+++ b/xlators/cluster/ec/src/ec-code.c
@@ -0,0 +1,1060 @@
+/*
+  Copyright (c) 2015 DataLab, s.l. <http://www.datalab.es>
+  This file is part of GlusterFS.
+
+  This file is licensed to you under your choice of the GNU Lesser
+  General Public License, version 3 or any later version (LGPLv3 or
+  later), or the GNU General Public License, version 2 (GPLv2), in all
+  cases as published by the Free Software Foundation.
+*/
+
+#include <string.h>
+#include <sys/mman.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <ctype.h>
+
+#include <glusterfs/syscall.h>
+
+#include "ec-mem-types.h"
+#include "ec-code.h"
+#include "ec-messages.h"
+#include "ec-code-c.h"
+#include "ec-helpers.h"
+
+#ifdef USE_EC_DYNAMIC_X64
+#include "ec-code-x64.h"
+#endif
+
+#ifdef USE_EC_DYNAMIC_SSE
+#include "ec-code-sse.h"
+#endif
+
+#ifdef USE_EC_DYNAMIC_AVX
+#include "ec-code-avx.h"
+#endif
+
+#define EC_CODE_SIZE (1024 * 64)
+#define EC_CODE_ALIGN 4096
+
+#define EC_CODE_CHUNK_MIN_SIZE 512
+
+#define EC_PROC_BUFFER_SIZE 4096
+
+#define PROC_CPUINFO "/proc/cpuinfo"
+
+struct _ec_code_proc;
+typedef struct _ec_code_proc ec_code_proc_t;
+
+/* State of a buffered reader over a file descriptor.
+ * NOTE(review): presumably used to parse PROC_CPUINFO; the functions that
+ * use it are not in this part of the file, so the meaning of each field
+ * should be confirmed there. */
+struct _ec_code_proc {
+    int32_t fd;
+    gf_boolean_t eof;
+    gf_boolean_t error;
+    gf_boolean_t skip;
+    ssize_t size;
+    ssize_t pos;
+    char buffer[EC_PROC_BUFFER_SIZE];
+};
+
+/* NULL-terminated table of the dynamic code generators enabled at build
+ * time. Entries are ordered AVX, SSE and x64.
+ * NOTE(review): presumably this is the order of preference used when a
+ * generator is selected; confirm where the table is traversed. */
+static ec_code_gen_t *ec_code_gen_table[] = {
+#ifdef USE_EC_DYNAMIC_AVX
+    &ec_code_gen_avx,
+#endif
+#ifdef USE_EC_DYNAMIC_SSE
+    &ec_code_gen_sse,
+#endif
+#ifdef USE_EC_DYNAMIC_X64
+    &ec_code_gen_x64,
+#endif
+    NULL};
+
+/* Stores a plain value (not a register) into an operation argument. */
+static void
+ec_code_arg_set(ec_code_arg_t *arg, uint32_t value)
+{
+    arg->value = value;
+}
+
+/* Stores into 'arg' a register that is written by the operation. It also
+ * keeps builder->regs above the highest register number seen, so that it
+ * reflects how many registers the code needs. 'op' is currently unused. */
+static void
+ec_code_arg_assign(ec_code_builder_t *builder, ec_code_op_t *op,
+                   ec_code_arg_t *arg, uint32_t reg)
+{
+    if (reg >= builder->regs) {
+        builder->regs = reg + 1;
+    }
+
+    arg->value = reg;
+}
+
+/* Stores into 'arg' a register that is only read by the operation.
+ * 'builder' and 'op' are currently unused. */
+static void
+ec_code_arg_use(ec_code_builder_t *builder, ec_code_op_t *op,
+                ec_code_arg_t *arg, uint32_t reg)
+{
+    arg->value = reg;
+}
+
+/* Stores into 'arg' a register that is both read and written by the
+ * operation. 'builder' and 'op' are currently unused. */
+static void
+ec_code_arg_update(ec_code_builder_t *builder, ec_code_op_t *op,
+                   ec_code_arg_t *arg, uint32_t reg)
+{
+    arg->value = reg;
+}
+
+/* Takes the next free entry of the list of operations of the builder and
+ * returns it zeroed. No bounds check is done here: the list is expected to
+ * have been sized by ec_code_prepare(). */
+static ec_code_op_t *
+ec_code_op_next(ec_code_builder_t *builder)
+{
+    ec_code_op_t *entry = builder->ops + builder->count;
+
+    builder->count++;
+    memset(entry, 0, sizeof(*entry));
+
+    return entry;
+}
+
+/* Appends an operation that loads bit 'bit' of the input identified by
+ * 'offset' into the register currently mapped to 'bit'. */
+static void
+ec_code_load(ec_code_builder_t *builder, uint32_t bit, uint32_t offset)
+{
+    ec_code_op_t *entry = ec_code_op_next(builder);
+
+    entry->op = EC_GF_OP_LOAD;
+    ec_code_arg_assign(builder, entry, &entry->arg1, builder->map[bit]);
+    ec_code_arg_set(&entry->arg2, offset);
+    ec_code_arg_set(&entry->arg3, bit);
+}
+
+/* Appends an operation that stores the register currently mapped to 'reg'
+ * as bit 'bit' of the output. */
+static void
+ec_code_store(ec_code_builder_t *builder, uint32_t reg, uint32_t bit)
+{
+    ec_code_op_t *entry = ec_code_op_next(builder);
+
+    entry->op = EC_GF_OP_STORE;
+    ec_code_arg_use(builder, entry, &entry->arg1, builder->map[reg]);
+    ec_code_arg_set(&entry->arg2, 0);
+    ec_code_arg_set(&entry->arg3, bit);
+}
+
+/* Appends an operation that copies the register mapped to 'src' into the
+ * register mapped to 'dst'. */
+static void
+ec_code_copy(ec_code_builder_t *builder, uint32_t dst, uint32_t src)
+{
+    ec_code_op_t *entry = ec_code_op_next(builder);
+
+    entry->op = EC_GF_OP_COPY;
+    ec_code_arg_assign(builder, entry, &entry->arg1, builder->map[dst]);
+    ec_code_arg_use(builder, entry, &entry->arg2, builder->map[src]);
+    ec_code_arg_set(&entry->arg3, 0);
+}
+
+/* Appends an operation that XORs the register mapped to 'src' into the
+ * register mapped to 'dst'. */
+static void
+ec_code_xor2(ec_code_builder_t *builder, uint32_t dst, uint32_t src)
+{
+    ec_code_op_t *entry = ec_code_op_next(builder);
+
+    entry->op = EC_GF_OP_XOR2;
+    ec_code_arg_update(builder, entry, &entry->arg1, builder->map[dst]);
+    ec_code_arg_use(builder, entry, &entry->arg2, builder->map[src]);
+    ec_code_arg_set(&entry->arg3, 0);
+}
+
+/* Appends the operations needed to compute dst = src1 ^ src2.
+ *
+ * If the generator implements a three-operand XOR, a single XOR3 operation
+ * is added. Otherwise it's replaced by a copy of 'src1' into 'dst' followed
+ * by an XOR of 'src2' into 'dst'. */
+static void
+ec_code_xor3(ec_code_builder_t *builder, uint32_t dst, uint32_t src1,
+             uint32_t src2)
+{
+    ec_code_op_t *entry;
+
+    if (builder->code->gen->xor3 != NULL) {
+        entry = ec_code_op_next(builder);
+
+        entry->op = EC_GF_OP_XOR3;
+        ec_code_arg_assign(builder, entry, &entry->arg1, builder->map[dst]);
+        ec_code_arg_use(builder, entry, &entry->arg2, builder->map[src1]);
+        ec_code_arg_use(builder, entry, &entry->arg3, builder->map[src2]);
+
+        return;
+    }
+
+    ec_code_copy(builder, dst, src1);
+    ec_code_xor2(builder, dst, src2);
+}
+
+/* Appends an operation that XORs bit 'bit' of the input identified by
+ * 'offset' into the register currently mapped to 'bit'. */
+static void
+ec_code_xorm(ec_code_builder_t *builder, uint32_t bit, uint32_t offset)
+{
+    ec_code_op_t *entry = ec_code_op_next(builder);
+
+    entry->op = EC_GF_OP_XORM;
+    ec_code_arg_update(builder, entry, &entry->arg1, builder->map[bit]);
+    ec_code_arg_set(&entry->arg2, offset);
+    ec_code_arg_set(&entry->arg3, bit);
+}
+
+/* Translates one operation of a precomputed multiplication into the
+ * equivalent operation of the builder. Only COPY, XOR2 and XOR3 are
+ * handled; any other operation is silently ignored. */
+static void
+ec_code_dup(ec_code_builder_t *builder, ec_gf_op_t *op)
+{
+    if (op->op == EC_GF_OP_COPY) {
+        ec_code_copy(builder, op->arg1, op->arg2);
+    } else if (op->op == EC_GF_OP_XOR2) {
+        ec_code_xor2(builder, op->arg1, op->arg2);
+    } else if (op->op == EC_GF_OP_XOR3) {
+        ec_code_xor3(builder, op->arg1, op->arg2, op->arg3);
+    }
+}
+
+/* Appends the loads of all the bits of the input identified by 'offset',
+ * one load per bit of the Galois Field. */
+static void
+ec_code_gf_load(ec_code_builder_t *builder, uint32_t offset)
+{
+    ec_gf_t *gf = builder->code->gf;
+    uint32_t bit;
+
+    for (bit = 0; bit < gf->bits; bit++) {
+        ec_code_load(builder, bit, offset);
+    }
+}
+
+/* Appends the XORs of all the bits of the input identified by 'offset'
+ * into the current registers, one XOR per bit of the Galois Field. */
+static void
+ec_code_gf_load_xor(ec_code_builder_t *builder, uint32_t offset)
+{
+    ec_gf_t *gf = builder->code->gf;
+    uint32_t bit;
+
+    for (bit = 0; bit < gf->bits; bit++) {
+        ec_code_xorm(builder, bit, offset);
+    }
+}
+
+/* Appends the stores of all the bits of the result, one store per bit of
+ * the Galois Field. Bit 'n' is taken from the register mapped to 'n'. */
+static void
+ec_code_gf_store(ec_code_builder_t *builder)
+{
+    ec_gf_t *gf = builder->code->gf;
+    uint32_t bit;
+
+    for (bit = 0; bit < gf->bits; bit++) {
+        ec_code_store(builder, bit, bit);
+    }
+}
+
+/* Appends the operations needed to write a result with all bits set to
+ * zero: register 0 is cleared by XORing it with itself and then it's
+ * stored once per bit of the Galois Field. */
+static void
+ec_code_gf_clear(ec_code_builder_t *builder)
+{
+    ec_gf_t *gf = builder->code->gf;
+    uint32_t bit;
+
+    ec_code_xor2(builder, 0, 0);
+    for (bit = 0; bit < gf->bits; bit++) {
+        ec_code_store(builder, 0, bit);
+    }
+}
+
+/* Appends the operations needed to multiply the current registers by
+ * 'value', using the precomputed sequence of operations found in the
+ * table of the Galois Field. */
+static void
+ec_code_gf_mul(ec_code_builder_t *builder, uint32_t value)
+{
+    ec_gf_mul_t *mul;
+    ec_gf_op_t *op;
+    uint32_t map[EC_GF_MAX_REGS];
+    int32_t i;
+
+    /* NOTE(review): 'value' is used as an index without any check. It's
+     * assumed to be a valid element of the field. */
+    mul = builder->code->gf->table[value];
+    for (op = mul->ops; op->op != EC_GF_OP_END; op++) {
+        ec_code_dup(builder, op);
+    }
+
+    /* The multiplication leaves the result in a permutation of the
+     * registers described by mul->map. The new mapping is composed with
+     * the current one in a temporary array because entries of
+     * builder->map are read while the new values are being computed. */
+    for (i = 0; i < mul->regs; i++) {
+        map[i] = builder->map[mul->map[i]];
+    }
+    memcpy(builder->map, map, sizeof(uint32_t) * mul->regs);
+}
+
+/* Allocates and initializes a builder for a function that combines 'count'
+ * values.
+ *
+ * Returns the new builder, or EC_ERR(ENOMEM) if memory cannot be
+ * allocated. */
+static ec_code_builder_t *
+ec_code_prepare(ec_code_t *code, uint32_t count, uint32_t width,
+                gf_boolean_t linear)
+{
+    ec_code_builder_t *builder;
+    uint32_t i;
+
+    /* Compute the size of the list of operations: for each value, 'bits'
+     * operations plus at most 'max_ops' operations, and 'bits' additional
+     * operations at the end.
+     * NOTE(review): presumably these are the loads, the multiplication and
+     * the final stores; confirm against the function that fills the
+     * builder. */
+    count *= code->gf->bits + code->gf->max_ops;
+    count += code->gf->bits;
+    builder = GF_MALLOC(
+        sizeof(ec_code_builder_t) + sizeof(ec_code_op_t) * count,
+        ec_mt_ec_code_builder_t);
+    if (builder == NULL) {
+        return EC_ERR(ENOMEM);
+    }
+
+    builder->address = 0;
+    builder->code = code;
+    builder->size = 0;
+    builder->count = 0;
+    builder->regs = 0;
+    builder->error = 0;
+    builder->bits = code->gf->bits;
+    builder->width = width;
+    builder->data = NULL;
+    builder->linear = linear;
+    /* No input pointer is loaded yet. */
+    builder->base = -1;
+
+    /* Initially each logical register is mapped to itself. */
+    for (i = 0; i < EC_GF_MAX_REGS; i++) {
+        builder->map[i] = i;
+    }
+
+    return builder;
+}
+
+/* Returns the size of the header of a space, rounded up to a multiple of
+ * 16 bytes. */
+static size_t
+ec_code_space_size(void)
+{
+    const size_t mask = 15;
+
+    return (sizeof(ec_code_space_t) + mask) & ~mask;
+}
+
+/* Returns the size of the header of a chunk, rounded up to a multiple of
+ * 16 bytes. */
+static size_t
+ec_code_chunk_size(void)
+{
+    const size_t mask = 15;
+
+    return (sizeof(ec_code_chunk_t) + mask) & ~mask;
+}
+
+/* Returns the first chunk of a space, which is located right after the
+ * header of the space. */
+static ec_code_chunk_t *
+ec_code_chunk_from_space(ec_code_space_t *space)
+{
+    uintptr_t first = (uintptr_t)space;
+
+    first += ec_code_space_size();
+
+    return (ec_code_chunk_t *)first;
+}
+
+/* Translates an address of the writable area of a space into the
+ * equivalent address of its executable area. */
+static void *
+ec_code_to_executable(ec_code_space_t *space, void *addr)
+{
+    uintptr_t delta = (uintptr_t)addr - (uintptr_t)space;
+
+    return (void *)((uintptr_t)space->exec + delta);
+}
+
+/* Translates an address of the executable area of a space into the
+ * equivalent address of its writable area. */
+static void *
+ec_code_from_executable(ec_code_space_t *space, void *addr)
+{
+    uintptr_t delta = (uintptr_t)addr - (uintptr_t)space->exec;
+
+    return (void *)((uintptr_t)space + delta);
+}
+
+/* Returns the writable address of the code stored in a chunk, which is
+ * located right after the header of the chunk. The equivalent address in
+ * the executable area is returned in 'exec'. */
+static void *
+ec_code_func_from_chunk(ec_code_chunk_t *chunk, void **exec)
+{
+    uintptr_t start = (uintptr_t)chunk + ec_code_chunk_size();
+    void *writable = (void *)start;
+
+    *exec = ec_code_to_executable(chunk->space, writable);
+
+    return writable;
+}
+
+/* Returns the chunk (in the writable area) that contains the function
+ * 'func', which is an address of the executable area. */
+static ec_code_chunk_t *
+ec_code_chunk_from_func(ec_code_func_linear_t func)
+{
+    ec_code_chunk_t *header;
+    uintptr_t addr = (uintptr_t)func;
+
+    /* The header of the chunk precedes the code. At this point it's
+     * accessed through the executable area, which is also readable. */
+    addr -= ec_code_chunk_size();
+    header = (ec_code_chunk_t *)addr;
+
+    return ec_code_from_executable(header->space, header);
+}
+
+/* Takes a free chunk out of the list of free chunks of its space, leaving
+ * only 'size' bytes in it. If there is enough room after those bytes for
+ * the header of another chunk and at least one byte more, the remaining
+ * space becomes a new free chunk that takes the place of the original one
+ * in the list.
+ *
+ * Returns the chunk received as argument. */
+static ec_code_chunk_t *
+ec_code_chunk_split(ec_code_chunk_t *chunk, size_t size)
+{
+    ec_code_chunk_t *extra;
+    ssize_t avail;
+
+    /* Usable bytes that would remain for a new chunk, once its header is
+     * discounted. */
+    avail = chunk->size - size - ec_code_chunk_size();
+    if (avail > 0) {
+        /* The new chunk ends where the original chunk ended. */
+        extra = (ec_code_chunk_t *)((uintptr_t)chunk + chunk->size - avail);
+        extra->space = chunk->space;
+        extra->size = avail;
+        list_add(&extra->list, &chunk->list);
+        chunk->size = size;
+    }
+    list_del_init(&chunk->list);
+
+    return chunk;
+}
+
+/* Checks if chunk 'next' starts exactly where chunk 'prev' ends, which
+ * means that they can be merged into a single chunk. */
+static gf_boolean_t
+ec_code_chunk_touch(ec_code_chunk_t *prev, ec_code_chunk_t *next)
+{
+    uintptr_t boundary = (uintptr_t)prev;
+
+    boundary += ec_code_chunk_size() + prev->size;
+
+    return ((uintptr_t)next == boundary);
+}
+
+/* Creates a new space of at least 'size' bytes to store dynamic code and
+ * adds it to the list of spaces of 'code'. The final size is rounded up to
+ * a multiple of EC_CODE_ALIGN and stored in the 'size' field of the space.
+ *
+ * Returns the writable area of the new space, or EC_ERR() with the errno
+ * of the operation that has failed. */
+static ec_code_space_t *
+ec_code_space_create(ec_code_t *code, size_t size)
+{
+    char path[] = GLUSTERFS_LIBEXECDIR "/ec-code-dynamic.XXXXXX";
+    ec_code_space_t *space;
+    void *exec;
+    int32_t fd, err;
+
+    /* We need to create memory areas to store the generated dynamic code.
+     * Obviously these areas need to be written to be able to create the
+     * code and they also need to be executable to execute it.
+     *
+     * However it's a bad practice to have a memory region that is both
+     * writable *and* executable. In fact, selinux forbids this and causes
+     * attempts to do so to fail (unless specifically configured).
+     *
+     * To solve the problem we'll use two distinct memory areas mapped to
+     * the same physical storage. One of the memory areas will have write
+     * permission, and the other will have execute permission. Both areas
+     * will have the same contents. The physical storage will be a regular
+     * file that will be mmapped to both areas.
+     */
+
+    /* We need to create a temporary file as the backend storage for the
+     * memory mapped areas. */
+    /* coverity[secure_temp] mkstemp uses 0600 as the mode and is safe */
+    fd = mkstemp(path);
+    if (fd < 0) {
+        err = errno;
+        gf_msg(THIS->name, GF_LOG_ERROR, err, EC_MSG_DYN_CREATE_FAILED,
+               "Unable to create a temporary file for the ec dynamic "
+               "code");
+        space = EC_ERR(err);
+        goto done;
+    }
+    /* Once created we don't need to keep it in the file system. It will
+     * still exist until we close the last file descriptor or unmap the
+     * memory areas bound to the file. */
+    sys_unlink(path);
+
+    /* Round the size up to a multiple of EC_CODE_ALIGN. */
+    size = (size + EC_CODE_ALIGN - 1) & ~(EC_CODE_ALIGN - 1);
+    if (sys_ftruncate(fd, size) < 0) {
+        err = errno;
+        gf_msg(THIS->name, GF_LOG_ERROR, err, EC_MSG_DYN_CREATE_FAILED,
+               "Unable to resize the file for the ec dynamic code");
+        space = EC_ERR(err);
+        goto done_close;
+    }
+
+    /* This creates an executable memory area to be able to run the
+     * generated fragments of code. */
+    exec = mmap(NULL, size, PROT_READ | PROT_EXEC, MAP_SHARED, fd, 0);
+    if (exec == MAP_FAILED) {
+        err = errno;
+        gf_msg(THIS->name, GF_LOG_ERROR, err, EC_MSG_DYN_CREATE_FAILED,
+               "Unable to map the executable area for the ec dynamic "
+               "code");
+        space = EC_ERR(err);
+        goto done_close;
+    }
+    /* It's not important to check the return value of mlock(). If it fails
+     * everything will continue to work normally. */
+    mlock(exec, size);
+
+    /* This maps a read/write memory area to be able to create the dynamic
+     * code. */
+    space = mmap(NULL, size, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
+    if (space == MAP_FAILED) {
+        err = errno;
+        gf_msg(THIS->name, GF_LOG_ERROR, err, EC_MSG_DYN_CREATE_FAILED,
+               "Unable to map the writable area for the ec dynamic "
+               "code");
+        space = EC_ERR(err);
+
+        /* The executable area is useless without the writable one. */
+        munmap(exec, size);
+
+        goto done_close;
+    }
+
+    /* The header of the space lives at the beginning of the writable
+     * area. */
+    space->exec = exec;
+    space->size = size;
+    space->code = code;
+    list_add_tail(&space->list, &code->spaces);
+    INIT_LIST_HEAD(&space->chunks);
+
+done_close:
+    /* If everything has succeeded, we already have the memory areas
+     * mapped. We don't need the file descriptor anymore because the
+     * backend storage will be there until the mmap()'d regions are
+     * unmapped. */
+    sys_close(fd);
+done:
+    return space;
+}
+
+/* Removes a space from the list of spaces and unmaps its two memory
+ * areas. */
+static void
+ec_code_space_destroy(ec_code_space_t *space)
+{
+    list_del_init(&space->list);
+
+    /* The writable area must be the last one to be unmapped because it
+     * contains the header of the space, from which 'exec' and 'size' are
+     * read. */
+    munmap(space->exec, space->size);
+    munmap(space, space->size);
+}
+
+/* Returns a chunk to the list of free chunks of its space, merging it with
+ * the adjacent free chunks if possible. If the resulting chunk has the
+ * size checked at the end of the function (the size of the space minus the
+ * headers of the space and one chunk), the space is destroyed.
+ *
+ * The list is traversed assuming that free chunks are sorted by
+ * address. */
+static void
+ec_code_chunk_merge(ec_code_chunk_t *chunk)
+{
+    ec_code_chunk_t *item, *tmp;
+
+    list_for_each_entry_safe(item, tmp, &chunk->space->chunks, list)
+    {
+        if ((uintptr_t)item > (uintptr_t)chunk) {
+            /* 'item' is the first free chunk located after 'chunk'. The
+             * chunk is inserted just before it and, if they are
+             * contiguous, 'item' is absorbed by 'chunk'. */
+            list_add_tail(&chunk->list, &item->list);
+            if (ec_code_chunk_touch(chunk, item)) {
+                chunk->size += item->size + ec_code_chunk_size();
+                list_del_init(&item->list);
+            }
+
+            goto check;
+        }
+        if (ec_code_chunk_touch(item, chunk)) {
+            /* 'item' is the free chunk located right before 'chunk'. It
+             * absorbs 'chunk' and it's temporarily removed from the list
+             * to be processed as the chunk being released. */
+            item->size += chunk->size + ec_code_chunk_size();
+            list_del_init(&item->list);
+            chunk = item;
+        }
+    }
+    /* There are no free chunks after this one. */
+    list_add_tail(&chunk->list, &chunk->space->chunks);
+
+check:
+    if (chunk->size ==
+        chunk->space->size - ec_code_space_size() - ec_code_chunk_size()) {
+        ec_code_space_destroy(chunk->space);
+    }
+}
+
+/* Allocates a chunk able to store at least 'size' bytes of code.
+ *
+ * The first free chunk that is big enough is used. If there isn't any, a
+ * new space is created. The caller must hold the lock of 'code'.
+ *
+ * Returns the allocated chunk, already removed from the list of free
+ * chunks, or the error returned by ec_code_space_create(). */
+static ec_code_chunk_t *
+ec_code_space_alloc(ec_code_t *code, size_t size)
+{
+    ec_code_space_t *space;
+    ec_code_chunk_t *chunk;
+    size_t map_size, needed;
+
+    /* To minimize fragmentation, we only allocate chunks of sizes multiples
+     * of EC_CODE_CHUNK_MIN_SIZE. */
+    size = ((size + ec_code_chunk_size() + EC_CODE_CHUNK_MIN_SIZE - 1) &
+            ~(EC_CODE_CHUNK_MIN_SIZE - 1)) -
+           ec_code_chunk_size();
+    list_for_each_entry(space, &code->spaces, list)
+    {
+        list_for_each_entry(chunk, &space->chunks, list)
+        {
+            if (chunk->size >= size) {
+                goto out;
+            }
+        }
+    }
+
+    /* 'map_size' is the total size of the new space, including the header
+     * of the space and the header of its first chunk. Both headers must be
+     * taken into account to guarantee that 'size' bytes really fit. */
+    map_size = EC_CODE_SIZE;
+    needed = size + ec_code_space_size() + ec_code_chunk_size();
+    if (map_size < needed) {
+        map_size = needed;
+    }
+    space = ec_code_space_create(code, map_size);
+    if (EC_IS_ERR(space)) {
+        return (ec_code_chunk_t *)space;
+    }
+
+    /* The initial chunk covers all the space not used by headers. The real
+     * size of the space is used because ec_code_space_create() can round
+     * it up. This is also the value that ec_code_chunk_merge() expects to
+     * find when all the chunks of a space have been released. */
+    chunk = ec_code_chunk_from_space(space);
+    chunk->size = space->size - ec_code_space_size() - ec_code_chunk_size();
+    list_add(&chunk->list, &space->chunks);
+
+out:
+    chunk->space = space;
+
+    return ec_code_chunk_split(chunk, size);
+}
+
+/* Allocates a chunk able to store at least 'size' bytes of code. It calls
+ * ec_code_space_alloc() with the lock of 'code' held.
+ *
+ * Returns the allocated chunk or an error that can be detected with
+ * EC_IS_ERR(). */
+static ec_code_chunk_t *
+ec_code_alloc(ec_code_t *code, uint32_t size)
+{
+    ec_code_chunk_t *chunk;
+
+    LOCK(&code->lock);
+
+    chunk = ec_code_space_alloc(code, size);
+
+    UNLOCK(&code->lock);
+
+    return chunk;
+}
+
+/* Releases a chunk. It calls ec_code_chunk_merge() with the lock of the
+ * owner of the chunk held. */
+static void
+ec_code_free(ec_code_chunk_t *chunk)
+{
+    gf_lock_t *lock;
+
+    /* The address of the lock is saved before releasing the chunk because
+     * ec_code_chunk_merge() can destroy the space, and then neither the
+     * chunk nor its space can be accessed to unlock. */
+    lock = &chunk->space->code->lock;
+    LOCK(lock);
+
+    ec_code_chunk_merge(chunk);
+
+    UNLOCK(lock);
+}
+
+/* Generates the code for the list of operations of the builder, calling
+ * the functions of the generator: the prolog, one function for each
+ * operation and the epilog.
+ *
+ * The state of the builder is reset at the beginning, so this function
+ * can be called more than once for the same builder.
+ *
+ * Returns the value of builder->error once all the code has been
+ * generated, which is 0 if nothing has reported an error. */
+static int32_t
+ec_code_write(ec_code_builder_t *builder)
+{
+    ec_code_gen_t *gen;
+    ec_code_op_t *op;
+    uint32_t i;
+
+    builder->error = 0;
+    builder->size = 0;
+    builder->address = 0;
+    builder->base = -1;
+
+    gen = builder->code->gen;
+    gen->prolog(builder);
+    for (i = 0; i < builder->count; i++) {
+        op = &builder->ops[i];
+        switch (op->op) {
+            case EC_GF_OP_LOAD:
+                gen->load(builder, op->arg1.value, op->arg2.value,
+                          op->arg3.value);
+                break;
+            case EC_GF_OP_STORE:
+                gen->store(builder, op->arg1.value, op->arg3.value);
+                break;
+            case EC_GF_OP_COPY:
+                gen->copy(builder, op->arg1.value, op->arg2.value);
+                break;
+            case EC_GF_OP_XOR2:
+                gen->xor2(builder, op->arg1.value, op->arg2.value);
+                break;
+            case EC_GF_OP_XOR3:
+                /* gen->xor3 can be NULL, but ec_code_xor3() doesn't add
+                 * operations of this type in that case. */
+                gen->xor3(builder, op->arg1.value, op->arg2.value,
+                          op->arg3.value);
+                break;
+            case EC_GF_OP_XORM:
+                gen->xorm(builder, op->arg1.value, op->arg2.value,
+                          op->arg3.value);
+                break;
+            default:
+                break;
+        }
+    }
+    gen->epilog(builder);
+
+    return builder->error;
+}
+
+/* Turns the operations stored in 'builder' into a callable function.
+ *
+ * The code is written twice: the first ec_code_write() is done before any
+ * destination buffer has been assigned here (presumably builder->data is
+ * still NULL at that point, so only the required size is computed - TODO
+ * confirm against ec_code_prepare()); then a chunk of builder->size bytes is
+ * allocated and the code is written again into it.
+ *
+ * The builder is consumed by this function: it is released on success and
+ * also on every failure, because the caller returns our result directly and
+ * has no other opportunity to free it.
+ *
+ * Returns the generated function, or an EC_ERR() encoded error. */
+static void *
+ec_code_compile(ec_code_builder_t *builder)
+{
+    ec_code_chunk_t *chunk;
+    void *func;
+    int32_t err;
+
+    err = ec_code_write(builder);
+    if (err != 0) {
+        func = EC_ERR(err);
+        goto out;
+    }
+
+    chunk = ec_code_alloc(builder->code, builder->size);
+    if (EC_IS_ERR(chunk)) {
+        /* Propagate the encoded allocation error as is. */
+        func = chunk;
+        goto out;
+    }
+    builder->data = ec_code_func_from_chunk(chunk, &func);
+
+    err = ec_code_write(builder);
+    if (err != 0) {
+        /* The chunk contains unusable code: give it back. */
+        ec_code_free(chunk);
+        func = EC_ERR(err);
+    }
+
+out:
+    GF_FREE(builder);
+
+    return func;
+}
+
+/* Creates a new, zero-initialized ec_code_t bound to the Galois field 'gf'
+ * and the code generator 'gen', with an empty list of spaces and its lock
+ * initialized.
+ *
+ * Returns the new object or EC_ERR(ENOMEM) if it cannot be allocated. */
+ec_code_t *
+ec_code_create(ec_gf_t *gf, ec_code_gen_t *gen)
+{
+    ec_code_t *instance = GF_MALLOC(sizeof(ec_code_t), ec_mt_ec_code_t);
+
+    if (instance == NULL) {
+        return EC_ERR(ENOMEM);
+    }
+
+    memset(instance, 0, sizeof(ec_code_t));
+
+    instance->gf = gf;
+    instance->gen = gen;
+
+    INIT_LIST_HEAD(&instance->spaces);
+    LOCK_INIT(&instance->lock);
+
+    return instance;
+}
+
+/* Destroys an ec_code_t created by ec_code_create(): its lock is destroyed
+ * and the structure itself is freed. */
+void
+ec_code_destroy(ec_code_t *code)
+{
+    /* NOTE(review): this check has an empty body, so nothing is done here
+     * when spaces are still linked to 'code'. It looks like a placeholder
+     * for releasing (or reporting) leftover spaces - TODO confirm whether
+     * they can be leaked. */
+    if (!list_empty(&code->spaces)) {
+    }
+
+    LOCK_DESTROY(&code->lock);
+
+    GF_FREE(code);
+}
+
+/* Searches 'values' for the next non-zero entry after position '*offset'.
+ *
+ * On return '*offset' is the index of the entry found, or the position where
+ * the search stopped when there are no more non-zero entries. Callers start a
+ * scan by passing (uint32_t)-1, which wraps to index 0.
+ *
+ * Returns the value found, or 0 if nothing else is left. */
+static uint32_t
+ec_code_value_next(uint32_t *values, uint32_t count, uint32_t *offset)
+{
+    uint32_t pos = *offset + 1;
+
+    /* Skip all zero entries. */
+    while ((pos < count) && (values[pos] == 0)) {
+        pos++;
+    }
+    *offset = pos;
+
+    return (pos < count) ? values[pos] : 0;
+}
+
+/* Records the operations needed to combine the inputs using the coefficients
+ * in 'values' and compiles them.
+ *
+ * Zero coefficients are skipped. The first non-zero input is loaded and, for
+ * each following non-zero coefficient, the accumulated result is multiplied
+ * by (previous / next) before xoring the next input in; a final
+ * multiplication by the last coefficient is issued before storing. If every
+ * coefficient is zero, a clear operation is recorded instead.
+ *
+ * Returns the result of ec_code_compile(), or the encoded error returned by
+ * ec_code_prepare(). */
+static void *
+ec_code_build_dynamic(ec_code_t *code, uint32_t width, uint32_t *values,
+                      uint32_t count, gf_boolean_t linear)
+{
+    ec_code_builder_t *builder;
+    uint32_t offset, current, upcoming;
+
+    builder = ec_code_prepare(code, count, width, linear);
+    if (EC_IS_ERR(builder)) {
+        return builder;
+    }
+
+    /* -1 wraps around so that the scan starts at index 0. */
+    offset = -1;
+    current = ec_code_value_next(values, count, &offset);
+    if (current == 0) {
+        ec_code_gf_clear(builder);
+
+        return ec_code_compile(builder);
+    }
+
+    ec_code_gf_load(builder, offset);
+    for (;;) {
+        upcoming = ec_code_value_next(values, count, &offset);
+        if (upcoming == 0) {
+            break;
+        }
+        ec_code_gf_mul(builder, ec_gf_div(code->gf, current, upcoming));
+        ec_code_gf_load_xor(builder, offset);
+        current = upcoming;
+    }
+    ec_code_gf_mul(builder, current);
+    ec_code_gf_store(builder);
+
+    return ec_code_compile(builder);
+}
+
+/* Builds the function used to process data with the coefficients in
+ * 'values'.
+ *
+ * When a code generator is available, dynamic code is tried first. If that
+ * fails, the generator is disabled for this ec_code_t and the precompiled C
+ * implementation is prepared and returned instead.
+ *
+ * Returns the dynamic function, or ec_code_c_linear/ec_code_c_interleaved
+ * depending on 'linear'. */
+static void *
+ec_code_build(ec_code_t *code, uint32_t width, uint32_t *values, uint32_t count,
+              gf_boolean_t linear)
+{
+    void *func;
+
+    if (code->gen != NULL) {
+        func = ec_code_build_dynamic(code, width, values, count, linear);
+        if (!EC_IS_ERR(func)) {
+            return func;
+        }
+
+        /* The second argument of gf_msg_debug() is an errno value, not a
+         * log level (the level is implied by the macro), so pass 0 as other
+         * callers do. */
+        gf_msg_debug(THIS->name, 0,
+                     "Unable to generate dynamic code. Falling back "
+                     "to precompiled code");
+
+        /* The dynamic code generation shouldn't fail in normal
+         * conditions, but if it fails at some point, it's very
+         * probable that it will fail again, so we completely disable
+         * dynamic code generation. */
+        code->gen = NULL;
+    }
+
+    ec_code_c_prepare(code->gf, values, count);
+
+    if (linear) {
+        return ec_code_c_linear;
+    }
+
+    return ec_code_c_interleaved;
+}
+
+/* Builds the function for the linear layout: a typed wrapper around
+ * ec_code_build() called with 'linear' set to true. */
+ec_code_func_linear_t
+ec_code_build_linear(ec_code_t *code, uint32_t width, uint32_t *values,
+                     uint32_t count)
+{
+    void *func = ec_code_build(code, width, values, count, _gf_true);
+
+    return (ec_code_func_linear_t)func;
+}
+
+/* Builds the function for the interleaved layout: a typed wrapper around
+ * ec_code_build() called with 'linear' set to false. */
+ec_code_func_interleaved_t
+ec_code_build_interleaved(ec_code_t *code, uint32_t width, uint32_t *values,
+                          uint32_t count)
+{
+    void *func = ec_code_build(code, width, values, count, _gf_false);
+
+    return (ec_code_func_interleaved_t)func;
+}
+
+/* Releases a function obtained from ec_code_build_linear() or
+ * ec_code_build_interleaved(). The precompiled C implementations are not
+ * backed by any chunk, so there is nothing to free for them. 'code' is not
+ * used. */
+void
+ec_code_release(ec_code_t *code, ec_code_func_t *func)
+{
+    if ((func->linear == ec_code_c_linear) ||
+        (func->interleaved == ec_code_c_interleaved)) {
+        return;
+    }
+
+    ec_code_free(ec_code_chunk_from_func(func->linear));
+}
+
+void
+ec_code_error(ec_code_builder_t *builder, int32_t error)
+{
+    if (builder->error == 0) {
+        gf_msg(THIS->name, GF_LOG_ERROR, error, EC_MSG_DYN_CODEGEN_FAILED,
+               "Failed to generate dynamic code");
+        builder->error = error;
+    }
+}
+
+/* Appends 'count' bytes of generated code to 'builder'.
+ *
+ * Nothing is done once an error has been recorded. When the builder has no
+ * destination buffer, the bytes are not copied but the size and the address
+ * still advance, which lets a first pass measure the code. */
+void
+ec_code_emit(ec_code_builder_t *builder, uint8_t *bytes, uint32_t count)
+{
+    if (builder->error == 0) {
+        if (builder->data != NULL) {
+            memcpy(builder->data + builder->size, bytes, count);
+        }
+
+        builder->size += count;
+        builder->address += count;
+    }
+}
+
+/* Skips leading whitespace of a text of '*length' bytes.
+ *
+ * Returns a pointer to the first non-space character (or to the end of the
+ * text if it only contains spaces) and updates '*length' with the number of
+ * remaining bytes. */
+static char *
+ec_code_proc_trim_left(char *text, ssize_t *length)
+{
+    ssize_t len;
+
+    /* isspace() is only defined for values representable as unsigned char
+     * (or EOF), so plain char must be converted before the call. */
+    for (len = *length; (len > 0) && isspace((unsigned char)*text); len--) {
+        text++;
+    }
+    *length = len;
+
+    return text;
+}
+
+/* Scans a text of '*length' bytes up to the separator 'sep' and terminates
+ * the scanned token right after its last non-space character.
+ *
+ * A NUL byte is always written: after the last non-space character found
+ * before the separator, or at the start of the text if there is none. This
+ * may be the byte just past the scanned data when no separator is found and
+ * the text doesn't end with a space.
+ *
+ * Returns a pointer to the separator, or to the end of the scanned data if
+ * the separator was not found. '*length' is updated with the number of bytes
+ * left from the returned position (0 means the separator is missing). */
+static char *
+ec_code_proc_trim_right(char *text, ssize_t *length, char sep)
+{
+    char *last;
+    ssize_t len;
+
+    last = text;
+    for (len = *length; (len > 0) && (*text != sep); len--) {
+        /* isspace() requires a value representable as unsigned char. */
+        if (!isspace((unsigned char)*text)) {
+            last = text + 1;
+        }
+        text++;
+    }
+    *last = 0;
+    *length = len;
+
+    return text;
+}
+
+/* Tries to extract the next line from the buffered contents of 'file'.
+ *
+ * Returns a pointer to the trimmed, NUL-terminated line and stores in
+ * '*length' the distance from its start to the position where the scan
+ * stopped. Returns NULL when more data had to be read from the file (the
+ * caller is expected to call again) or when the line is being skipped. */
+static char *
+ec_code_proc_line_parse(ec_code_proc_t *file, ssize_t *length)
+{
+    char *text, *end;
+    ssize_t len;
+
+    /* Look at the data that has not been consumed yet. */
+    len = file->size - file->pos;
+    text = ec_code_proc_trim_left(file->buffer + file->pos, &len);
+    end = ec_code_proc_trim_right(text, &len, '\n');
+    if (len == 0) {
+        /* No end of line found in the remaining data. */
+        if (!file->eof) {
+            if (text == file->buffer) {
+                /* The pending text starts at the beginning of the buffer:
+                 * drop it and flag the next parsed line to be skipped.
+                 * NOTE(review): this also happens on the very first call,
+                 * when the buffer is still empty - confirm that skipping
+                 * the first line is intended. */
+                file->size = file->pos = 0;
+                file->skip = _gf_true;
+            } else {
+                /* Move the pending text (and the NUL that follows it) to the
+                 * beginning of the buffer to make room for more data. */
+                file->size = file->pos = end - text;
+                memmove(file->buffer, text, file->pos + 1);
+            }
+            /* One byte is always kept free at the end of the buffer. */
+            len = sys_read(file->fd, file->buffer + file->pos,
+                           sizeof(file->buffer) - file->pos - 1);
+            if (len > 0) {
+                file->size += len;
+            }
+            file->error = len < 0;
+            file->eof = len <= 0;
+
+            /* NOTE(review): file->pos still points past the preserved text,
+             * so the next call starts parsing at the newly read data and not
+             * at the beginning of the preserved text - confirm whether this
+             * is intended. */
+            return NULL;
+        }
+        /* End of file: whatever is left is the last line. */
+        file->size = file->pos = 0;
+    } else {
+        /* Continue after the end of line on the next call. */
+        file->pos = end - file->buffer + 1;
+    }
+
+    *length = end - text;
+
+    if (file->skip) {
+        /* This line was flagged to be discarded. */
+        file->skip = _gf_false;
+        text = NULL;
+    }
+
+    return text;
+}
+
+/* Returns the next line available from 'file', calling the parser as many
+ * times as needed. NULL is returned once file->eof is set without having
+ * obtained a line. */
+static char *
+ec_code_proc_line(ec_code_proc_t *file, ssize_t *length)
+{
+    char *line = NULL;
+
+    while ((line == NULL) && !file->eof) {
+        line = ec_code_proc_line_parse(file, length);
+    }
+
+    return line;
+}
+
+/* Splits 'text' at the first occurrence of 'sep'.
+ *
+ * The first part is trimmed and NUL-terminated in place. Returns a pointer to
+ * the second part with its leading spaces removed, or NULL if the separator
+ * is not present. '*length' is updated to the size of the returned part. */
+static char *
+ec_code_proc_split(char *text, ssize_t *length, char sep)
+{
+    char *mark = ec_code_proc_trim_right(text, length, sep);
+
+    if (*length == 0) {
+        return NULL;
+    }
+
+    /* Jump over the separator itself. */
+    *length -= 1;
+
+    return ec_code_proc_trim_left(mark + 1, length);
+}
+
+/* Finds the first code generator, starting at position 'idx' of
+ * ec_code_gen_table, whose required flags are all present in 'list'.
+ *
+ * 'list' must contain 'count' consecutive NUL-terminated strings.
+ *
+ * Returns the index of the generator found, or the index of the NULL entry
+ * that terminates the table if none can be used. */
+static uint32_t
+ec_code_cpu_check(uint32_t idx, char *list, uint32_t count)
+{
+    ec_code_gen_t *candidate;
+    char *available[count + 1];
+    char **wanted;
+    uint32_t pos;
+    gf_boolean_t usable;
+
+    /* Build an index of the strings packed in 'list'. */
+    for (pos = 0; pos < count; pos++) {
+        available[pos] = list;
+        list += strlen(list) + 1;
+    }
+
+    for (candidate = ec_code_gen_table[idx]; candidate != NULL;
+         candidate = ec_code_gen_table[++idx]) {
+        usable = _gf_true;
+        for (wanted = candidate->flags; *wanted != NULL; wanted++) {
+            for (pos = 0; pos < count; pos++) {
+                if (strcmp(*wanted, available[pos]) == 0) {
+                    break;
+                }
+            }
+            if (pos >= count) {
+                /* A required flag is missing: try the next generator. */
+                usable = _gf_false;
+                break;
+            }
+        }
+        if (usable) {
+            break;
+        }
+    }
+
+    return idx;
+}
+
+/* Selects the code generator to use based on the requested extension 'def'
+ * and the flags reported for each cpu in PROC_CPUINFO.
+ *
+ * 'def' can be "none" (don't use any extension), "auto" (start from the first
+ * entry of ec_code_gen_table) or the name of an entry of that table. Every
+ * "flags" line found is checked, so the selection can only move forward in
+ * the table when a cpu doesn't support the current candidate.
+ *
+ * Returns the selected generator, or NULL if no extension will be used
+ * (requested, unknown extension, nothing supported or error while reading
+ * the cpu information). */
+ec_code_gen_t *
+ec_code_detect(xlator_t *xl, const char *def)
+{
+    ec_code_proc_t file;
+    ec_code_gen_t *gen = NULL;
+    char *line, *data, *list;
+    ssize_t length;
+    uint32_t count, base, select;
+
+    if (strcmp(def, "none") == 0) {
+        gf_msg(xl->name, GF_LOG_INFO, 0, EC_MSG_EXTENSION_NONE,
+               "Not using any cpu extensions");
+
+        return NULL;
+    }
+
+    file.fd = sys_open(PROC_CPUINFO, O_RDONLY, 0);
+    if (file.fd < 0) {
+        goto out;
+    }
+    file.size = file.pos = 0;
+    file.eof = file.error = file.skip = _gf_false;
+
+    select = 0;
+    if (strcmp(def, "auto") != 0) {
+        while (ec_code_gen_table[select] != NULL) {
+            if (strcmp(ec_code_gen_table[select]->name, def) == 0) {
+                break;
+            }
+            select++;
+        }
+        if (ec_code_gen_table[select] == NULL) {
+            gf_msg(xl->name, GF_LOG_WARNING, EINVAL, EC_MSG_EXTENSION_UNKNOWN,
+                   "CPU extension '%s' is not known. Not using any cpu "
+                   "extensions",
+                   def);
+
+            /* 'gen' is still NULL. Leave through the common exit path so
+             * that the already opened descriptor gets closed. */
+            goto done;
+        }
+    } else {
+        /* A NULL 'def' means that no specific extension was requested. */
+        def = NULL;
+    }
+
+    while ((line = ec_code_proc_line(&file, &length)) != NULL) {
+        data = ec_code_proc_split(line, &length, ':');
+        if ((data != NULL) && (strcmp(line, "flags") == 0)) {
+            /* Split the flags in place and count them. */
+            list = data;
+            count = 0;
+            while ((data != NULL) && (*data != 0)) {
+                count++;
+                data = ec_code_proc_split(data, &length, ' ');
+            }
+            base = select;
+            select = ec_code_cpu_check(select, list, count);
+            if ((base != select) && (def != NULL)) {
+                /* Only warn once about the explicitly requested one. */
+                gf_msg(xl->name, GF_LOG_WARNING, ENOTSUP,
+                       EC_MSG_EXTENSION_UNSUPPORTED,
+                       "CPU extension '%s' is not supported", def);
+                def = NULL;
+            }
+        }
+    }
+
+    if (file.error) {
+        gf_msg(xl->name, GF_LOG_WARNING, 0, EC_MSG_EXTENSION_FAILED,
+               "Unable to determine supported CPU extensions. Not using any "
+               "cpu extensions");
+
+        gen = NULL;
+    } else {
+        gen = ec_code_gen_table[select];
+        if (gen == NULL) {
+            gf_msg(xl->name, GF_LOG_INFO, 0, EC_MSG_EXTENSION_NONE,
+                   "Not using any cpu extensions");
+        } else {
+            gf_msg(xl->name, GF_LOG_INFO, 0, EC_MSG_EXTENSION,
+                   "Using '%s' CPU extensions", gen->name);
+        }
+    }
+
+done:
+    sys_close(file.fd);
+
+out:
+    return gen;
+}
diff --git a/xlators/cluster/ec/src/ec-code.h b/xlators/cluster/ec/src/ec-code.h
new file mode 100644
index 00000000000..75fb35d93e3
--- /dev/null
+++ b/xlators/cluster/ec/src/ec-code.h
@@ -0,0 +1,44 @@
+/*
+  Copyright (c) 2015 DataLab, s.l. <http://www.datalab.es>
+  This file is part of GlusterFS.
+
+  This file is licensed to you under your choice of the GNU Lesser
+  General Public License, version 3 or any later version (LGPLv3 or
+  later), or the GNU General Public License, version 2 (GPLv2), in all
+  cases as published by the Free Software Foundation.
+*/
+
+#ifndef __EC_CODE_H__
+#define __EC_CODE_H__
+
+#include <glusterfs/xlator.h>
+#include <glusterfs/list.h>
+
+#include "ec-types.h"
+#include "ec-galois.h"
+
+ec_code_gen_t *
+ec_code_detect(xlator_t *xl, const char *def);
+
+ec_code_t *
+ec_code_create(ec_gf_t *gf, ec_code_gen_t *gen);
+
+void
+ec_code_destroy(ec_code_t *code);
+
+ec_code_func_linear_t
+ec_code_build_linear(ec_code_t *code, uint32_t width, uint32_t *values,
+                     uint32_t count);
+ec_code_func_interleaved_t
+ec_code_build_interleaved(ec_code_t *code, uint32_t width, uint32_t *values,
+                          uint32_t count);
+void
+ec_code_release(ec_code_t *code, ec_code_func_t *func);
+
+void
+ec_code_error(ec_code_builder_t *builder, int32_t error);
+
+void
+ec_code_emit(ec_code_builder_t *builder, uint8_t *bytes, uint32_t count);
+
+#endif /* __EC_CODE_H__ */
diff --git a/xlators/cluster/ec/src/ec-combine.c b/xlators/cluster/ec/src/ec-combine.c
new file mode 100644
index 00000000000..703a30e2485
--- /dev/null
+++ b/xlators/cluster/ec/src/ec-combine.c
@@ -0,0 +1,995 @@
+/*
+  Copyright (c) 2012-2014 DataLab, s.l. <http://www.datalab.es>
+  This file is part of GlusterFS.
+
+  This file is licensed to you under your choice of the GNU Lesser
+  General Public License, version 3 or any later version (LGPLv3 or
+  later), or the GNU General Public License, version 2 (GPLv2), in all
+  cases as published by the Free Software Foundation.
+*/
+
+#include <fnmatch.h>
+
+#include "libxlator.h"
+#include <glusterfs/byte-order.h>
+
+#include "ec-types.h"
+#include "ec-helpers.h"
+#include "ec-common.h"
+#include "ec-combine.h"
+#include "ec-messages.h"
+#include <glusterfs/quota-common-utils.h>
+
+#define EC_QUOTA_PREFIX "trusted.glusterfs.quota."
+
+#define EC_MISSING_DATA ((data_t *)1ULL)
+
+struct _ec_dict_info;
+typedef struct _ec_dict_info ec_dict_info_t;
+
+struct _ec_dict_combine;
+typedef struct _ec_dict_combine ec_dict_combine_t;
+
+/* Pairs a dictionary with a counter.
+ * NOTE(review): not referenced in the visible part of this file; the exact
+ * meaning of 'count' should be confirmed against its users. */
+struct _ec_dict_info {
+    dict_t *dict;
+    int32_t count;
+};
+
+/* Argument handed to ec_dict_data_combine() through its 'arg' pointer. */
+struct _ec_dict_combine {
+    ec_cbk_data_t *cbk; /* Answer passed on to the ec_dict_data_*() helpers. */
+    int32_t which;      /* Forwarded as 'which'; the helpers compare it with
+                         * EC_COMBINE_XDATA to choose xdata or dict. */
+};
+
+/* Checks whether two answers of a modification fop can be combined.
+ *
+ * xattr modification fops are always combinable. For the other supported
+ * fops, the number of iatt structures carried by the answer is determined
+ * from the fop type and they are merged with ec_iatt_combine().
+ *
+ * Returns 1 if the answers are compatible, 0 otherwise (including NULL
+ * arguments and unsupported fops). */
+int32_t
+ec_combine_write(ec_fop_data_t *fop, ec_cbk_data_t *dst, ec_cbk_data_t *src)
+{
+    int iatt_count = 0;
+
+    if ((fop == NULL) || (dst == NULL) || (src == NULL)) {
+        return 0;
+    }
+
+    switch (fop->id) {
+        case GF_FOP_REMOVEXATTR:
+        case GF_FOP_FREMOVEXATTR:
+        case GF_FOP_SETXATTR:
+        case GF_FOP_FSETXATTR:
+            /* No iatt to compare for these fops. */
+            return 1;
+
+        case GF_FOP_SYMLINK:
+        case GF_FOP_LINK:
+        case GF_FOP_CREATE:
+        case GF_FOP_MKNOD:
+        case GF_FOP_MKDIR:
+            iatt_count = 3;
+            break;
+
+        case GF_FOP_UNLINK:
+        case GF_FOP_RMDIR:
+        case GF_FOP_SETATTR:
+        case GF_FOP_FSETATTR:
+        case GF_FOP_TRUNCATE:
+        case GF_FOP_FTRUNCATE:
+        case GF_FOP_WRITE:
+        case GF_FOP_FALLOCATE:
+        case GF_FOP_DISCARD:
+        case GF_FOP_ZEROFILL:
+            iatt_count = 2;
+            break;
+
+        case GF_FOP_RENAME:
+            iatt_count = 5;
+            break;
+
+        default:
+            gf_msg_callingfn(fop->xl->name, GF_LOG_WARNING, EINVAL,
+                             EC_MSG_INVALID_FOP, "Invalid fop %d", fop->id);
+            return 0;
+    }
+
+    if (ec_iatt_combine(fop, dst->iatt, src->iatt, iatt_count)) {
+        return 1;
+    }
+
+    gf_msg(fop->xl->name, GF_LOG_NOTICE, 0, EC_MSG_IATT_MISMATCH,
+           "Mismatching iatt in "
+           "answers of '%s'",
+           gf_fop_list[fop->id]);
+
+    return 0;
+}
+
+/* Keeps in (*dst_sec, *dst_nsec) the most recent of the destination and
+ * source timestamps. The destination is only overwritten when the source is
+ * strictly newer. */
+void
+ec_iatt_time_merge(int64_t *dst_sec, uint32_t *dst_nsec, int64_t src_sec,
+                   uint32_t src_nsec)
+{
+    if (*dst_sec > src_sec) {
+        return;
+    }
+    if ((*dst_sec == src_sec) && (*dst_nsec >= src_nsec)) {
+        return;
+    }
+
+    *dst_sec = src_sec;
+    *dst_nsec = src_nsec;
+}
+
+/* Tells whether the contents of 'iatt' can be trusted for the given fop.
+ *
+ * An iatt is trusted when the top level fop is a lookup or when it refers to
+ * one of the inodes locked by the top level fop. */
+static gf_boolean_t
+ec_iatt_is_trusted(ec_fop_data_t *fop, struct iatt *iatt)
+{
+    ec_fop_data_t *top;
+    int32_t idx;
+
+    /* Only the top level fop will have fop->locks filled. */
+    for (top = fop; top->parent != NULL; top = top->parent) {
+    }
+
+    /* Lookups are special requests always done without locks taken but they
+     * require to be able to identify differences between bricks. Special
+     * handling of these differences is already done in lookup specific code
+     * so we shouldn't ignore any difference here and consider all iatt
+     * structures as trusted. */
+    if (top->id == GF_FOP_LOOKUP) {
+        return _gf_true;
+    }
+
+    /* Check if the iatt references an inode locked by the current fop */
+    for (idx = 0; idx < top->lock_count; idx++) {
+        if (gfid_to_ino(top->locks[idx].lock->loc.inode->gfid) ==
+            iatt->ia_ino) {
+            return _gf_true;
+        }
+    }
+
+    return _gf_false;
+}
+
+/* Combines 'count' iatt structures from 'src' into 'dst'.
+ *
+ * All pairs are first checked for compatibility. If any of them doesn't
+ * match, a warning is logged and nothing is modified. Otherwise, for each
+ * pair, the number of blocks is accumulated, the biggest block size is kept
+ * and the most recent atime, mtime and ctime are taken.
+ *
+ * Returns 1 on success, 0 if the structures cannot be combined. */
+int32_t
+ec_iatt_combine(ec_fop_data_t *fop, struct iatt *dst, struct iatt *src,
+                int32_t count)
+{
+    int32_t i;
+    gf_boolean_t failed = _gf_false;
+
+    for (i = 0; i < count; i++) {
+        /* Check for basic fields. These fields must be equal always, even if
+         * the inode is not locked because in these cases the parent inode
+         * will be locked and differences in these fields require changes in
+         * the parent directory. */
+        if ((dst[i].ia_ino != src[i].ia_ino) ||
+            (((dst[i].ia_type == IA_IFBLK) || (dst[i].ia_type == IA_IFCHR)) &&
+             (dst[i].ia_rdev != src[i].ia_rdev)) ||
+            (gf_uuid_compare(dst[i].ia_gfid, src[i].ia_gfid) != 0)) {
+            failed = _gf_true;
+        }
+        /* Check for not so stable fields. These fields can change if the
+         * inode is not locked. */
+        if (!failed && ((dst[i].ia_uid != src[i].ia_uid) ||
+                        (dst[i].ia_gid != src[i].ia_gid) ||
+                        (st_mode_from_ia(dst[i].ia_prot, dst[i].ia_type) !=
+                         st_mode_from_ia(src[i].ia_prot, src[i].ia_type)))) {
+            /* NOTE(review): the trust check always receives 'dst' (the first
+             * element) and not &dst[i]. This only matters when count > 1;
+             * confirm whether it is intended. */
+            if (ec_iatt_is_trusted(fop, dst)) {
+                /* If the iatt contains information from an inode that is
+                 * locked, these differences are real problems, so we need to
+                 * report them. Otherwise we ignore them and don't care which
+                 * data is returned. */
+                failed = _gf_true;
+            } else {
+                gf_msg_debug(fop->xl->name, 0,
+                             "Ignoring iatt differences because inode is not "
+                             "locked");
+            }
+        }
+        if (failed) {
+            /* The mode of each side is built from its own protection bits
+             * and type, exactly as it was compared above. */
+            gf_msg(fop->xl->name, GF_LOG_WARNING, 0, EC_MSG_IATT_COMBINE_FAIL,
+                   "Failed to combine iatt (inode: %" PRIu64 "-%" PRIu64
+                   ", "
+                   "links: %u-%u, uid: %u-%u, gid: %u-%u, "
+                   "rdev: %" PRIu64 "-%" PRIu64 ", size: %" PRIu64 "-%" PRIu64
+                   ", "
+                   "mode: %o-%o), %s",
+                   dst[i].ia_ino, src[i].ia_ino, dst[i].ia_nlink,
+                   src[i].ia_nlink, dst[i].ia_uid, src[i].ia_uid, dst[i].ia_gid,
+                   src[i].ia_gid, dst[i].ia_rdev, src[i].ia_rdev,
+                   dst[i].ia_size, src[i].ia_size,
+                   st_mode_from_ia(dst[i].ia_prot, dst[i].ia_type),
+                   st_mode_from_ia(src[i].ia_prot, src[i].ia_type),
+                   ec_msg_str(fop));
+
+            return 0;
+        }
+    }
+
+    /* Everything matches: merge the fields that are allowed to differ. */
+    while (count-- > 0) {
+        dst[count].ia_blocks += src[count].ia_blocks;
+        if (dst[count].ia_blksize < src[count].ia_blksize) {
+            dst[count].ia_blksize = src[count].ia_blksize;
+        }
+
+        ec_iatt_time_merge(&dst[count].ia_atime, &dst[count].ia_atime_nsec,
+                           src[count].ia_atime, src[count].ia_atime_nsec);
+        ec_iatt_time_merge(&dst[count].ia_mtime, &dst[count].ia_mtime_nsec,
+                           src[count].ia_mtime, src[count].ia_mtime_nsec);
+        ec_iatt_time_merge(&dst[count].ia_ctime, &dst[count].ia_ctime_nsec,
+                           src[count].ia_ctime, src[count].ia_ctime_nsec);
+    }
+
+    return 1;
+}
+
+/* Rescales the accumulated number of blocks of 'count' iatt structures.
+ *
+ * Each ia_blocks is replaced by ia_blocks * ec->fragments / answers, rounded
+ * up. */
+void
+ec_iatt_rebuild(ec_t *ec, struct iatt *iatt, int32_t count, int32_t answers)
+{
+    uint64_t blocks;
+    int32_t idx;
+
+    for (idx = 0; idx < count; idx++) {
+        /* Adding (answers - 1) before dividing rounds the result up. */
+        blocks = iatt[idx].ia_blocks * ec->fragments + answers - 1;
+        blocks /= answers;
+        iatt[idx].ia_blocks = blocks;
+    }
+}
+
+/* Filter passed to are_dicts_equal(): returns false for the keys that must
+ * not take part in the comparison of two dictionaries (stime xattrs and the
+ * link, inodelk, entrylk and open fd counters) and true for any other key.
+ * Only 'key' is inspected. */
+gf_boolean_t
+ec_xattr_match(dict_t *dict, char *key, data_t *value, void *arg)
+{
+    if (fnmatch(GF_XATTR_STIME_PATTERN, key, 0) == 0) {
+        return _gf_false;
+    }
+    if (strcmp(key, GET_LINK_COUNT) == 0) {
+        return _gf_false;
+    }
+    if (strcmp(key, GLUSTERFS_INODELK_COUNT) == 0) {
+        return _gf_false;
+    }
+    if (strcmp(key, GLUSTERFS_ENTRYLK_COUNT) == 0) {
+        return _gf_false;
+    }
+    if (strcmp(key, GLUSTERFS_OPEN_FD_COUNT) == 0) {
+        return _gf_false;
+    }
+
+    return _gf_true;
+}
+
+/* Callback passed to are_dicts_equal(): returns true for the keys it lists
+ * (exact names, prefixes and patterns) and false for any other key.
+ * Judging by its name, the value of a listed key is not compared - confirm
+ * against are_dicts_equal(). */
+gf_boolean_t
+ec_value_ignore(char *key)
+{
+    const char *exact[] = {
+        GF_CONTENT_KEY,
+        GF_XATTR_PATHINFO_KEY,
+        GF_XATTR_USER_PATHINFO_KEY,
+        GF_XATTR_LOCKINFO_KEY,
+        GLUSTERFS_OPEN_FD_COUNT,
+        GLUSTERFS_INODELK_COUNT,
+        GLUSTERFS_ENTRYLK_COUNT,
+        DHT_IATT_IN_XDATA_KEY,
+    };
+    size_t idx;
+
+    /* Keys that must match exactly. */
+    for (idx = 0; idx < sizeof(exact) / sizeof(exact[0]); idx++) {
+        if (strcmp(key, exact[idx]) == 0) {
+            return _gf_true;
+        }
+    }
+
+    /* Keys identified by a prefix. */
+    if (strncmp(key, GF_XATTR_CLRLK_CMD, SLEN(GF_XATTR_CLRLK_CMD)) == 0) {
+        return _gf_true;
+    }
+    if (strncmp(key, EC_QUOTA_PREFIX, SLEN(EC_QUOTA_PREFIX)) == 0) {
+        return _gf_true;
+    }
+
+    /* Keys identified by a pattern. */
+    if (fnmatch(MARKER_XATTR_PREFIX ".*." XTIME, key, 0) == 0) {
+        return _gf_true;
+    }
+    if (fnmatch(GF_XATTR_MARKER_KEY ".*", key, 0) == 0) {
+        return _gf_true;
+    }
+
+    if (XATTR_IS_NODE_UUID(key)) {
+        return _gf_true;
+    }
+
+    return _gf_false;
+}
+
+/* Compares two dictionaries using the ec specific filters (ec_xattr_match
+ * and ec_value_ignore). Returns 1 if they are considered equal, 0 if not. */
+int32_t
+ec_dict_compare(dict_t *dict1, dict_t *dict2)
+{
+    return are_dicts_equal(dict1, dict2, ec_xattr_match, ec_value_ignore) ? 1
+                                                                           : 0;
+}
+
+/* Collects the value of 'key' from the answers of the fop of 'cbk'.
+ *
+ * 'list' must have room for ec->nodes entries. On return each entry contains
+ * the data found in the answer of that subvolume, EC_MISSING_DATA if the
+ * subvolume is considered but no value was found, or NULL if the subvolume
+ * is not considered. When 'global' is false, only the subvolumes belonging
+ * to the group of 'cbk' are considered.
+ *
+ * Returns the number of values found. */
+static uint32_t
+ec_dict_list(data_t **list, ec_cbk_data_t *cbk, int32_t which, char *key,
+             gf_boolean_t global)
+{
+    ec_t *ec = cbk->fop->xl->private;
+    ec_cbk_data_t *answer = NULL;
+    dict_t *source = NULL;
+    data_t *value;
+    uint32_t found = 0;
+    int32_t node;
+
+    /* We initialize the list with EC_MISSING_DATA if we are
+     * returning a global list or the current subvolume belongs
+     * to the group of the accepted answer. Note that if some
+     * subvolume is known to be down before issuing the request,
+     * we won't have any answer from it, so we set here the
+     * appropriate default value. */
+    for (node = 0; node < ec->nodes; node++) {
+        list[node] = (global || ((cbk->mask & (1ULL << node)) != 0))
+                         ? EC_MISSING_DATA
+                         : NULL;
+    }
+
+    list_for_each_entry(answer, &cbk->fop->answer_list, answer_list)
+    {
+        if (!global && ((cbk->mask & answer->mask) == 0)) {
+            continue;
+        }
+
+        if (which == EC_COMBINE_XDATA) {
+            source = answer->xdata;
+        } else {
+            source = answer->dict;
+        }
+        value = dict_get(source, key);
+        if (value != NULL) {
+            list[answer->idx] = value;
+            found++;
+        }
+    }
+
+    return found;
+}
+
+/* Formats a concat template and splits it into its three components.
+ *
+ * The formatted string must have the form "<prefix>{<separator>}<suffix>".
+ * On success '*str' points to the prefix (and to the memory to be released
+ * by the caller with GF_FREE), '*sep' to the separator and '*post' to the
+ * suffix, all of them NUL-terminated inside the same buffer.
+ *
+ * Returns 0 on success, -ENOMEM if the string cannot be formatted or -EINVAL
+ * if the braces are missing (the buffer is already freed in this case). */
+int32_t
+ec_concat_prepare(xlator_t *xl, char **str, char **sep, char **post,
+                  const char *fmt, va_list args)
+{
+    char *open, *close;
+
+    if (gf_vasprintf(str, fmt, args) < 0) {
+        return -ENOMEM;
+    }
+
+    open = strchr(*str, '{');
+    if (open != NULL) {
+        /* Terminate the prefix; the separator starts right after '{'. */
+        *open = 0;
+        *sep = open + 1;
+
+        close = strchr(open + 1, '}');
+        if (close != NULL) {
+            /* Terminate the separator; the suffix starts after '}'. */
+            *close = 0;
+            *post = close + 1;
+
+            return 0;
+        }
+    }
+
+    gf_msg(xl->name, GF_LOG_ERROR, EINVAL, EC_MSG_INVALID_FORMAT,
+           "Invalid concat format");
+
+    GF_FREE(*str);
+
+    return -EINVAL;
+}
+
+/* Builds a single string from the values of 'key' found in the answers and
+ * stores it in the dictionary of 'cbk'.
+ *
+ * The result has the form "<prefix><v1><sep><v2>...<sep><vN><suffix>", where
+ * prefix, separator and suffix come from the template 'fmt' (see
+ * ec_concat_prepare()). Values are taken in subvolume order. Subvolumes
+ * without a value contribute 'def' if it is a non-empty string and are
+ * skipped otherwise.
+ *
+ * The result is stored under 'new_key' if it's not NULL, or 'key' otherwise.
+ *
+ * Returns 0 on success or a negative error code. */
+static int32_t
+ec_dict_data_concat(ec_cbk_data_t *cbk, int32_t which, char *key, char *new_key,
+                    const char *def, gf_boolean_t global, const char *fmt, ...)
+{
+    ec_t *ec = cbk->fop->xl->private;
+    data_t *data[ec->nodes];
+    char *str = NULL, *pre = NULL, *sep, *post;
+    const char *src;
+    dict_t *dict;
+    va_list args;
+    int32_t i, num, len, deflen, prelen, postlen, seplen, tmp;
+    int32_t err;
+
+    ec_dict_list(data, cbk, which, key, global);
+
+    va_start(args, fmt);
+    err = ec_concat_prepare(cbk->fop->xl, &pre, &sep, &post, fmt, args);
+    va_end(args);
+
+    if (err != 0) {
+        return err;
+    }
+
+    prelen = strlen(pre);
+    seplen = strlen(sep);
+    postlen = strlen(post);
+
+    deflen = 0;
+    if (def != NULL) {
+        deflen = strlen(def);
+    }
+
+    /* First pass: compute the size of the final string. Both passes must
+     * apply exactly the same rules to decide which entries are included. */
+    len = prelen + postlen + 1;
+    num = 0;
+    for (i = 0; i < ec->nodes; i++) {
+        if (data[i] == NULL) {
+            continue;
+        }
+        if (data[i] == EC_MISSING_DATA) {
+            if (deflen == 0) {
+                continue;
+            }
+            len += deflen;
+        } else {
+            /* The stored length includes the terminating NUL. */
+            len += data[i]->len - 1;
+        }
+        if (num > 0) {
+            len += seplen;
+        }
+        num++;
+    }
+
+    err = -ENOMEM;
+
+    str = GF_MALLOC(len, gf_common_mt_char);
+    if (str == NULL) {
+        goto out;
+    }
+
+    /* Second pass: build the string. The separator is placed before every
+     * entry but the first one. This depends on the number of entries already
+     * written and not on the subvolume index, which can have holes. */
+    memcpy(str, pre, prelen);
+    len = prelen;
+    num = 0;
+    for (i = 0; i < ec->nodes; i++) {
+        if (data[i] == NULL) {
+            continue;
+        }
+        if (data[i] == EC_MISSING_DATA) {
+            if (deflen == 0) {
+                continue;
+            }
+            src = def;
+            tmp = deflen;
+        } else {
+            src = data[i]->data;
+            tmp = data[i]->len - 1;
+        }
+        if (num > 0) {
+            memcpy(str + len, sep, seplen);
+            len += seplen;
+        }
+        memcpy(str + len, src, tmp);
+        len += tmp;
+        num++;
+    }
+    /* Copy the suffix including its terminating NUL. */
+    memcpy(str + len, post, postlen + 1);
+
+    dict = (which == EC_COMBINE_XDATA) ? cbk->xdata : cbk->dict;
+    if (new_key) {
+        key = new_key;
+    }
+    err = dict_set_dynstr(dict, key, str);
+    if (err != 0) {
+        goto out;
+    }
+
+    /* The dictionary owns the string now. */
+    str = NULL;
+
+out:
+    GF_FREE(str);
+    GF_FREE(pre);
+
+    return err;
+}
+
+/* Merges the serialized dictionaries found under 'key' in the answers of
+ * the group of 'cbk'.
+ *
+ * Each value is unserialized and copied into a single dictionary, which is
+ * then serialized again and stored under 'key' in the dictionary of 'cbk'.
+ *
+ * Returns 0 on success or a negative error code. */
+int32_t
+ec_dict_data_merge(ec_cbk_data_t *cbk, int32_t which, char *key)
+{
+    ec_t *ec = cbk->fop->xl->private;
+    data_t *data[ec->nodes];
+    dict_t *target, *merged, *piece = NULL;
+    char *blob = NULL;
+    int32_t node, size;
+    int32_t err;
+
+    ec_dict_list(data, cbk, which, key, _gf_false);
+
+    merged = dict_new();
+    if (merged == NULL) {
+        return -ENOMEM;
+    }
+
+    for (node = 0; node < ec->nodes; node++) {
+        if ((data[node] == NULL) || (data[node] == EC_MISSING_DATA)) {
+            continue;
+        }
+
+        piece = dict_new();
+        if (piece == NULL) {
+            err = -ENOMEM;
+            goto cleanup;
+        }
+
+        err = dict_unserialize(data[node]->data, data[node]->len, &piece);
+        if (err != 0) {
+            goto cleanup;
+        }
+
+        if (dict_copy(piece, merged) == NULL) {
+            err = -ENOMEM;
+            goto cleanup;
+        }
+
+        /* Done with this one. Forget it so that it's not released again. */
+        dict_unref(piece);
+        piece = NULL;
+    }
+
+    err = dict_allocate_and_serialize(merged, (char **)&blob,
+                                      (unsigned int *)&size);
+    if (err != 0) {
+        goto cleanup;
+    }
+
+    if (which == EC_COMBINE_XDATA) {
+        target = cbk->xdata;
+    } else {
+        target = cbk->dict;
+    }
+    err = dict_set_dynptr(target, key, blob, size);
+    if (err != 0) {
+        goto cleanup;
+    }
+
+    /* The dictionary owns the serialized buffer now. */
+    blob = NULL;
+
+cleanup:
+    GF_FREE(blob);
+    dict_unref(merged);
+    if (piece != NULL) {
+        dict_unref(piece);
+    }
+
+    return err;
+}
+
+/* Makes 'cbk' use the value of 'key' reported by the answer with the lowest
+ * subvolume index among 'cbk' and the answers chained through cbk->next.
+ *
+ * Returns 0 on success (nothing to do if 'cbk' already has the lowest
+ * index), -ENOENT if the chosen answer doesn't have the key or -ENOMEM if it
+ * cannot be stored. */
+int32_t
+ec_dict_data_uuid(ec_cbk_data_t *cbk, int32_t which, char *key)
+{
+    ec_cbk_data_t *cur, *lowest = cbk;
+    dict_t *from, *to;
+    data_t *value;
+
+    for (cur = cbk->next; cur != NULL; cur = cur->next) {
+        if (cur->idx < lowest->idx) {
+            lowest = cur;
+        }
+    }
+
+    if (lowest == cbk) {
+        return 0;
+    }
+
+    if (which == EC_COMBINE_XDATA) {
+        from = lowest->xdata;
+        to = cbk->xdata;
+    } else {
+        from = lowest->dict;
+        to = cbk->dict;
+    }
+
+    value = dict_get(from, key);
+    if (value == NULL) {
+        return -ENOENT;
+    }
+    if (dict_set(to, key, value) != 0) {
+        return -ENOMEM;
+    }
+
+    return 0;
+}
+
+/* Combines the iatt structures stored under 'key' in the answers of the
+ * group of 'cbk' and stores the result in the dictionary of 'cbk'.
+ *
+ * The first iatt found is copied and the rest are merged into it with
+ * ec_iatt_combine(). For regular files the number of blocks is rebuilt and
+ * the size is taken from the inode of the first lock of the fop.
+ *
+ * Returns the result of dict_set_iatt() or a negative error code. */
+int32_t
+ec_dict_data_iatt(ec_cbk_data_t *cbk, int32_t which, char *key)
+{
+    ec_t *ec = cbk->fop->xl->private;
+    data_t *data[ec->nodes];
+    dict_t *target;
+    struct iatt *combined = NULL, *current;
+    int32_t node, ret;
+
+    ec_dict_list(data, cbk, which, key, _gf_false);
+
+    for (node = 0; node < ec->nodes; node++) {
+        if ((data[node] == NULL) || (data[node] == EC_MISSING_DATA)) {
+            continue;
+        }
+
+        current = data_to_iatt(data[node], key);
+        if (current == NULL) {
+            ret = -EINVAL;
+            goto out;
+        }
+
+        if (combined != NULL) {
+            if (!ec_iatt_combine(cbk->fop, combined, current, 1)) {
+                ret = -EINVAL;
+                goto out;
+            }
+            continue;
+        }
+
+        /* First iatt found: use a private copy as the base. */
+        combined = GF_MALLOC(sizeof(struct iatt), gf_common_mt_char);
+        if (combined == NULL) {
+            ret = -ENOMEM;
+            goto out;
+        }
+        *combined = *current;
+    }
+
+    if ((combined != NULL) && (combined->ia_type == IA_IFREG)) {
+        ec_iatt_rebuild(ec, combined, 1, cbk->count);
+        /* TODO: not sure if an iatt could come in xdata from a fop that takes
+         *       no locks. */
+        if (!ec_get_inode_size(cbk->fop, cbk->fop->locks[0].lock->loc.inode,
+                               &combined->ia_size)) {
+            ret = -EINVAL;
+            goto out;
+        }
+    }
+
+    if (which == EC_COMBINE_XDATA) {
+        target = cbk->xdata;
+    } else {
+        target = cbk->dict;
+    }
+    ret = dict_set_iatt(target, key, combined, false);
+    if (ret >= 0) {
+        /* The dictionary owns the iatt now. */
+        combined = NULL;
+    }
+
+out:
+    GF_FREE(combined);
+
+    return ret;
+}
+
+/* Stores under 'key', in the dictionary of 'cbk', the maximum of the 32 bits
+ * values found for that key in the answers of its group (0 if none has it).
+ *
+ * Returns the result of dict_set_uint32(). */
+int32_t
+ec_dict_data_max32(ec_cbk_data_t *cbk, int32_t which, char *key)
+{
+    ec_t *ec = cbk->fop->xl->private;
+    data_t *data[ec->nodes];
+    dict_t *target;
+    uint32_t highest = 0, value;
+    int32_t node;
+
+    ec_dict_list(data, cbk, which, key, _gf_false);
+
+    for (node = 0; node < ec->nodes; node++) {
+        if ((data[node] != NULL) && (data[node] != EC_MISSING_DATA)) {
+            value = data_to_uint32(data[node]);
+            if (value > highest) {
+                highest = value;
+            }
+        }
+    }
+
+    if (which == EC_COMBINE_XDATA) {
+        target = cbk->xdata;
+    } else {
+        target = cbk->dict;
+    }
+
+    return dict_set_uint32(target, key, highest);
+}
+
+/* Stores under 'key', in the dictionary of 'cbk', the maximum of the 64 bits
+ * values found for that key in the answers of its group (0 if none has it).
+ *
+ * Returns the result of dict_set_uint64(). */
+int32_t
+ec_dict_data_max64(ec_cbk_data_t *cbk, int32_t which, char *key)
+{
+    ec_t *ec = cbk->fop->xl->private;
+    data_t *data[ec->nodes];
+    dict_t *target;
+    uint64_t highest = 0, value;
+    int32_t node;
+
+    ec_dict_list(data, cbk, which, key, _gf_false);
+
+    for (node = 0; node < ec->nodes; node++) {
+        if ((data[node] != NULL) && (data[node] != EC_MISSING_DATA)) {
+            value = data_to_uint64(data[node]);
+            if (value > highest) {
+                highest = value;
+            }
+        }
+    }
+
+    if (which == EC_COMBINE_XDATA) {
+        target = cbk->xdata;
+    } else {
+        target = cbk->dict;
+    }
+
+    return dict_set_uint64(target, key, highest);
+}
+
+/* Combines the quota metadata stored under 'key' in the answers of the group
+ * of 'cbk'.
+ *
+ * The maximum of each field (size, file count and directory count) is taken
+ * and the size is multiplied by the number of fragments before storing the
+ * result in the dictionary of 'cbk'.
+ *
+ * Returns 0 if no answer contains the key, or the result of
+ * quota_dict_set_meta() otherwise. */
+int32_t
+ec_dict_data_quota(ec_cbk_data_t *cbk, int32_t which, char *key)
+{
+    ec_t *ec = cbk->fop->xl->private;
+    data_t *data[ec->nodes];
+    dict_t *target = NULL;
+    int32_t node = 0;
+    quota_meta_t current = {
+        0,
+    };
+    quota_meta_t highest = {
+        0,
+    };
+
+    if (ec_dict_list(data, cbk, which, key, _gf_false) == 0) {
+        return 0;
+    }
+
+    /* Quota size xattr is managed outside of the control of the ec xlator.
+     * This means that it might not be updated at the same time on all
+     * bricks and we can receive slightly different values. If that's the
+     * case, we take the maximum of all received values.
+     */
+    for (node = 0; node < ec->nodes; node++) {
+        if ((data[node] == NULL) || (data[node] == EC_MISSING_DATA)) {
+            continue;
+        }
+        if (quota_data_to_meta(data[node], &current) < 0) {
+            continue;
+        }
+
+        if (current.size > highest.size) {
+            highest.size = current.size;
+        }
+        if (current.file_count > highest.file_count) {
+            highest.file_count = current.file_count;
+        }
+        if (current.dir_count > highest.dir_count) {
+            highest.dir_count = current.dir_count;
+        }
+    }
+
+    highest.size *= ec->fragments;
+
+    if (which == EC_COMBINE_XDATA) {
+        target = cbk->xdata;
+    } else {
+        target = cbk->dict;
+    }
+
+    return quota_dict_set_meta(target, key, &highest, IA_IFDIR);
+}
+
+int32_t
+ec_dict_data_stime(ec_cbk_data_t *cbk, int32_t which, char *key)
+{ /* Pass each collected value of 'key' to gf_get_max_stime() on the dict. */
+    ec_t *ec = cbk->fop->xl->private;
+    data_t *data[ec->nodes];
+    dict_t *dict;
+    int32_t i, err;
+
+    ec_dict_list(data, cbk, which, key, _gf_false);
+
+    dict = (which == EC_COMBINE_XDATA) ? cbk->xdata : cbk->dict;
+    for (i = 0; i < ec->nodes; i++) {
+        if ((data[i] == NULL) || (data[i] == EC_MISSING_DATA)) {
+            continue;
+        }
+        err = gf_get_max_stime(cbk->fop->xl, dict, key, data[i]);
+        if (err != 0) {
+            gf_msg(cbk->fop->xl->name, GF_LOG_ERROR, -err,
+                   EC_MSG_STIME_COMBINE_FAIL, "STIME combination failed");
+
+            return err; /* the first failure aborts the combination */
+        }
+    }
+
+    return 0;
+}
+
+int32_t
+ec_dict_data_combine(dict_t *dict, char *key, data_t *value, void *arg)
+{ /* dict_foreach() callback: combine the answers for one key, by key type. */
+    ec_dict_combine_t *data = arg; /* carries the cbk and the 'which' selector */
+
+    if ((strcmp(key, GF_XATTR_PATHINFO_KEY) == 0) ||
+        (strcmp(key, GF_XATTR_USER_PATHINFO_KEY) == 0)) {
+        return ec_dict_data_concat(data->cbk, data->which, key, NULL, NULL,
+                                   _gf_false, "(<EC:%s> { })", /* fmt + arg */
+                                   data->cbk->fop->xl->name);
+    }
+
+    if (strncmp(key, GF_XATTR_CLRLK_CMD, SLEN(GF_XATTR_CLRLK_CMD)) == 0) {
+        return ec_dict_data_concat(data->cbk, data->which, key, NULL, NULL,
+                                   _gf_false, "{\n}");
+    }
+
+    if (strncmp(key, GF_XATTR_LOCKINFO_KEY, SLEN(GF_XATTR_LOCKINFO_KEY)) == 0) {
+        return ec_dict_data_merge(data->cbk, data->which, key);
+    }
+
+    if (strcmp(key, GET_LINK_COUNT) == 0) { /* counters: take the maximum */
+        return ec_dict_data_max32(data->cbk, data->which, key);
+    }
+
+    if (strcmp(key, GLUSTERFS_OPEN_FD_COUNT) == 0) {
+        return ec_dict_data_max32(data->cbk, data->which, key);
+    }
+    if ((strcmp(key, GLUSTERFS_INODELK_COUNT) == 0) ||
+        (strcmp(key, GLUSTERFS_ENTRYLK_COUNT) == 0)) {
+        return ec_dict_data_max32(data->cbk, data->which, key);
+    }
+
+    if (strcmp(key, QUOTA_SIZE_KEY) == 0) {
+        return ec_dict_data_quota(data->cbk, data->which, key);
+    }
+    /* Ignore all other quota attributes */
+    if (strncmp(key, EC_QUOTA_PREFIX, SLEN(EC_QUOTA_PREFIX)) == 0) {
+        return 0;
+    }
+
+    if (XATTR_IS_NODE_UUID(key)) {
+        if (data->cbk->fop->int32) {
+            /* List of node uuid is requested */
+            return ec_dict_data_concat(data->cbk, data->which, key,
+                                       GF_XATTR_LIST_NODE_UUIDS_KEY, UUID0_STR,
+                                       _gf_true, "{ }");
+        } else {
+            return ec_dict_data_uuid(data->cbk, data->which, key);
+        }
+    }
+
+    if (fnmatch(GF_XATTR_STIME_PATTERN, key, FNM_NOESCAPE) == 0) {
+        return ec_dict_data_stime(data->cbk, data->which, key);
+    }
+
+    if (fnmatch(MARKER_XATTR_PREFIX ".*." XTIME, key, FNM_NOESCAPE) == 0) {
+        return ec_dict_data_max64(data->cbk, data->which, key);
+    }
+
+    if (strcmp(key, GF_PRESTAT) == 0 || strcmp(key, GF_POSTSTAT) == 0) {
+        return ec_dict_data_iatt(data->cbk, data->which, key);
+    }
+
+    return 0; /* any other key is left untouched */
+}
+
+int32_t
+ec_dict_combine(ec_cbk_data_t *cbk, int32_t which)
+{ /* Run ec_dict_data_combine() on every key of the selected dictionary. */
+    dict_t *dict = NULL;
+    ec_dict_combine_t data;
+    int32_t err = 0;
+
+    data.cbk = cbk;
+    data.which = which;
+
+    dict = (which == EC_COMBINE_XDATA) ? cbk->xdata : cbk->dict;
+    if (dict != NULL) { /* a missing dictionary is not an error */
+        err = dict_foreach(dict, ec_dict_data_combine, &data);
+        if (err != 0) {
+            gf_msg(cbk->fop->xl->name, GF_LOG_ERROR, -err,
+                   EC_MSG_DICT_COMBINE_FAIL, "Dictionary combination failed");
+
+            return err;
+        }
+    }
+
+    return 0;
+}
+
+int32_t
+ec_vector_compare(struct iovec *dst_vector, int32_t dst_count,
+                  struct iovec *src_vector, int32_t src_count)
+{ /* Returns 1 if both vectors have the same iov_length(), 0 otherwise. */
+    int32_t dst_size = 0, src_size = 0; /* an empty vector counts as 0 */
+
+    if (dst_count > 0) {
+        dst_size = iov_length(dst_vector, dst_count);
+    }
+    if (src_count > 0) {
+        src_size = iov_length(src_vector, src_count);
+    }
+
+    return (dst_size == src_size); /* sizes only; contents are not compared */
+}
+
+int32_t
+ec_flock_compare(struct gf_flock *dst, struct gf_flock *src)
+{ /* Returns 1 only if every field, the owner included, is identical. */
+    if ((dst->l_type == src->l_type) && (dst->l_whence == src->l_whence) &&
+        (dst->l_start == src->l_start) && (dst->l_len == src->l_len) &&
+        (dst->l_pid == src->l_pid) &&
+        is_same_lkowner(&dst->l_owner, &src->l_owner)) {
+        return 1;
+    }
+
+    return 0;
+}
+
+void
+ec_statvfs_combine(struct statvfs *dst, struct statvfs *src)
+{ /* Merge 'src' into 'dst'; 'src' itself may get rescaled in the process. */
+    if (dst->f_bsize < src->f_bsize) { /* keep the larger block size */
+        dst->f_bsize = src->f_bsize;
+    }
+
+    if (dst->f_frsize < src->f_frsize) { /* express dst in src's frsize */
+        dst->f_blocks *= dst->f_frsize;
+        dst->f_blocks /= src->f_frsize;
+
+        dst->f_bfree *= dst->f_frsize;
+        dst->f_bfree /= src->f_frsize;
+
+        dst->f_bavail *= dst->f_frsize;
+        dst->f_bavail /= src->f_frsize;
+
+        dst->f_frsize = src->f_frsize;
+    } else if (dst->f_frsize > src->f_frsize) { /* express src in dst's */
+        src->f_blocks *= src->f_frsize;
+        src->f_blocks /= dst->f_frsize;
+
+        src->f_bfree *= src->f_frsize;
+        src->f_bfree /= dst->f_frsize;
+
+        src->f_bavail *= src->f_frsize;
+        src->f_bavail /= dst->f_frsize;
+    }
+    if (dst->f_blocks > src->f_blocks) { /* block counts: keep the smaller */
+        dst->f_blocks = src->f_blocks;
+    }
+    if (dst->f_bfree > src->f_bfree) {
+        dst->f_bfree = src->f_bfree;
+    }
+    if (dst->f_bavail > src->f_bavail) {
+        dst->f_bavail = src->f_bavail;
+    }
+
+    if (dst->f_files < src->f_files) { /* f_files: keep the larger */
+        dst->f_files = src->f_files;
+    }
+    if (dst->f_ffree > src->f_ffree) { /* remaining fields: the smaller */
+        dst->f_ffree = src->f_ffree;
+    }
+    if (dst->f_favail > src->f_favail) {
+        dst->f_favail = src->f_favail;
+    }
+    if (dst->f_namemax > src->f_namemax) {
+        dst->f_namemax = src->f_namemax;
+    }
+
+    if (dst->f_flag != src->f_flag) { /* a mismatch is only logged */
+        gf_msg_debug(THIS->name, 0,
+                     "Mismatching file system flags "
+                     "(%lX, %lX)",
+                     dst->f_flag, src->f_flag);
+    }
+    dst->f_flag &= src->f_flag; /* keep only the flags set on both */
+}
+
+int32_t
+ec_combine_check(ec_cbk_data_t *dst, ec_cbk_data_t *src, ec_combine_f combine)
+{ /* 0 = the answers differ; otherwise 1 or whatever 'combine' returns. */
+    ec_fop_data_t *fop = dst->fop;
+
+    if (dst->op_ret != src->op_ret) {
+        gf_msg_debug(fop->xl->name, 0,
+                     "Mismatching return code in "
+                     "answers of '%s': %d <-> %d",
+                     ec_fop_name(fop->id), dst->op_ret, src->op_ret);
+
+        return 0;
+    }
+    if (dst->op_ret < 0) { /* failed answers must also agree on the errno */
+        if (dst->op_errno != src->op_errno) {
+            gf_msg_debug(fop->xl->name, 0,
+                         "Mismatching errno code in "
+                         "answers of '%s': %d <-> %d",
+                         ec_fop_name(fop->id), dst->op_errno, src->op_errno);
+
+            return 0;
+        }
+    }
+
+    if (!ec_dict_compare(dst->xdata, src->xdata)) {
+        gf_msg(fop->xl->name, GF_LOG_DEBUG, 0, EC_MSG_XDATA_MISMATCH,
+               "Mismatching xdata in answers "
+               "of '%s'",
+               ec_fop_name(fop->id));
+
+        return 0;
+    }
+
+    if ((dst->op_ret >= 0) && (combine != NULL)) { /* fop-specific check */
+        return combine(fop, dst, src);
+    }
+
+    return 1;
+}
+
+void
+ec_combine(ec_cbk_data_t *newcbk, ec_combine_f combine)
+{ /* Add 'newcbk' to fop->cbk_list, merging it with a matching group. */
+    ec_fop_data_t *fop = newcbk->fop;
+    ec_cbk_data_t *cbk = NULL, *tmp = NULL;
+    struct list_head *item = NULL;
+    int32_t needed = 0;
+    char str[32];
+
+    LOCK(&fop->lock);
+
+    fop->received |= newcbk->mask;
+
+    item = fop->cbk_list.prev; /* default anchor: the current last entry */
+    list_for_each_entry(cbk, &fop->cbk_list, list)
+    {
+        if (ec_combine_check(newcbk, cbk, combine)) {
+            newcbk->count += cbk->count; /* absorb the matching group */
+            newcbk->mask |= cbk->mask;
+
+            item = cbk->list.prev;
+            while (item != &fop->cbk_list) { /* skip back over smaller counts */
+                tmp = list_entry(item, ec_cbk_data_t, list);
+                if (tmp->count >= newcbk->count) {
+                    break;
+                }
+                item = item->prev;
+            }
+            list_del(&cbk->list);
+
+            newcbk->next = cbk; /* absorbed group stays reachable via next */
+
+            break;
+        }
+    }
+    list_add(&newcbk->list, item);
+
+    ec_trace("ANSWER", fop, "combine=%s[%d]",
+             ec_bin(str, sizeof(str), newcbk->mask, 0), newcbk->count);
+
+    cbk = list_entry(fop->cbk_list.next, ec_cbk_data_t, list); /* first group */
+    if ((fop->mask ^ fop->remaining) == fop->received) {
+        needed = fop->minimum - cbk->count; /* answers the first group lacks */
+    }
+
+    UNLOCK(&fop->lock);
+
+    if (needed > 0) { /* not enough matching answers yet: try another child */
+        ec_dispatch_next(fop, newcbk->idx);
+    }
+}
diff --git a/xlators/cluster/ec/src/ec-combine.h b/xlators/cluster/ec/src/ec-combine.h
new file mode 100644
index 00000000000..1010cc3be26
--- /dev/null
+++ b/xlators/cluster/ec/src/ec-combine.h
@@ -0,0 +1,44 @@
+/*
+  Copyright (c) 2012-2014 DataLab, s.l. <http://www.datalab.es>
+  This file is part of GlusterFS.
+
+  This file is licensed to you under your choice of the GNU Lesser
+  General Public License, version 3 or any later version (LGPLv3 or
+  later), or the GNU General Public License, version 2 (GPLv2), in all
+  cases as published by the Free Software Foundation.
+*/
+
+#ifndef __EC_COMBINE_H__
+#define __EC_COMBINE_H__
+
+#define EC_COMBINE_DICT 0  /* 'which' value: operate on cbk->dict */
+#define EC_COMBINE_XDATA 1 /* 'which' value: operate on cbk->xdata */
+
+typedef int32_t (*ec_combine_f)(ec_fop_data_t *fop, ec_cbk_data_t *dst,
+                                ec_cbk_data_t *src);
+
+void
+ec_iatt_rebuild(ec_t *ec, struct iatt *iatt, int32_t count, int32_t answers);
+
+int32_t
+ec_iatt_combine(ec_fop_data_t *fop, struct iatt *dst, struct iatt *src,
+                int32_t count);
+int32_t
+ec_dict_compare(dict_t *dict1, dict_t *dict2);
+int32_t
+ec_vector_compare(struct iovec *dst_vector, int32_t dst_count,
+                  struct iovec *src_vector, int32_t src_count);
+int32_t
+ec_flock_compare(struct gf_flock *dst, struct gf_flock *src);
+void
+ec_statvfs_combine(struct statvfs *dst, struct statvfs *src);
+
+int32_t
+ec_dict_combine(ec_cbk_data_t *cbk, int32_t which);
+
+void
+ec_combine(ec_cbk_data_t *cbk, ec_combine_f combine);
+
+int32_t
+ec_combine_write(ec_fop_data_t *fop, ec_cbk_data_t *dst, ec_cbk_data_t *src);
+#endif /* __EC_COMBINE_H__ */
diff --git a/xlators/cluster/ec/src/ec-common.c b/xlators/cluster/ec/src/ec-common.c
new file mode 100644
index 00000000000..b955efd8c2d
--- /dev/null
+++ b/xlators/cluster/ec/src/ec-common.c
@@ -0,0 +1,3042 @@
+/*
+  Copyright (c) 2012-2014 DataLab, s.l. <http://www.datalab.es>
+  This file is part of GlusterFS.
+
+  This file is licensed to you under your choice of the GNU Lesser
+  General Public License, version 3 or any later version (LGPLv3 or
+  later), or the GNU General Public License, version 2 (GPLv2), in all
+  cases as published by the Free Software Foundation.
+*/
+
+#include <glusterfs/byte-order.h>
+#include <glusterfs/hashfn.h>
+
+#include "ec-mem-types.h"
+#include "ec-types.h"
+#include "ec-helpers.h"
+#include "ec-combine.h"
+#include "ec-common.h"
+#include "ec-fops.h"
+#include "ec-method.h"
+#include "ec.h"
+#include "ec-messages.h"
+
+#define EC_INVALID_INDEX UINT32_MAX
+
+void
+ec_update_fd_status(fd_t *fd, xlator_t *xl, int idx, int32_t ret_status)
+{ /* Record in the fd context whether 'fd' is open on child 'idx'. */
+    ec_fd_t *fd_ctx;
+
+    if (fd == NULL)
+        return;
+
+    LOCK(&fd->lock);
+    {
+        fd_ctx = __ec_fd_get(fd, xl);
+        if (fd_ctx) { /* without a context there is nothing to update */
+            if (ret_status >= 0) /* non-negative status marks it opened */
+                fd_ctx->fd_status[idx] = EC_FD_OPENED;
+            else
+                fd_ctx->fd_status[idx] = EC_FD_NOT_OPENED;
+        }
+    }
+    UNLOCK(&fd->lock);
+}
+
+static uintptr_t
+ec_fd_ctx_need_open(fd_t *fd, xlator_t *this, uintptr_t mask)
+{ /* Returns the mask of children on which 'fd' still has to be opened. */
+    int i = 0;
+    int count = 0;
+    ec_t *ec = this->private;
+    ec_fd_t *fd_ctx = NULL;
+    uintptr_t need_open = 0;
+    uintptr_t bit = 0;
+
+    fd_ctx = ec_fd_get(fd, this);
+    if (!fd_ctx)
+        return 0; /* no fd context: nothing can be marked for opening */
+
+    LOCK(&fd->lock);
+    {
+        for (i = 0; i < ec->nodes; i++) {
+            bit = (uintptr_t)1 << i; /* shift in the width of the masks */
+            if ((fd_ctx->fd_status[i] == EC_FD_NOT_OPENED) &&
+                ((ec->xl_up & bit) != 0) && ((mask & bit) != 0)) {
+                fd_ctx->fd_status[i] = EC_FD_OPENING;
+                need_open |= bit;
+                count++;
+            }
+        }
+    }
+    UNLOCK(&fd->lock);
+
+    /* If fd needs to open on minimum number of nodes
+     * then ignore fixing the fd as it has been
+     * requested from heal operation.
+     */
+    if (count >= ec->fragments) {
+        need_open = 0;
+    }
+
+    return need_open;
+}
+
+static gf_boolean_t
+ec_is_fd_fixable(fd_t *fd)
+{
+    /* Only a non-anonymous fd that has an inode with a non-null gfid can
+     * be fixed. */
+    if (!fd || !fd->inode || fd_is_anonymous(fd) ||
+        gf_uuid_is_null(fd->inode->gfid)) {
+        return _gf_false;
+    }
+
+    return _gf_true;
+}
+
+static void
+ec_fix_open(ec_fop_data_t *fop, uintptr_t mask)
+{ /* Open fop->fd on the children of 'mask' where it is not opened yet. */
+    uintptr_t need_open = 0;
+    int ret = 0;
+    int32_t flags = 0;
+    loc_t loc = {
+        0,
+    };
+
+    if (!ec_is_fd_fixable(fop->fd))
+        goto out;
+
+    /* Evaluate how many remote fd's to be opened */
+    need_open = ec_fd_ctx_need_open(fop->fd, fop->xl, mask);
+    if (need_open == 0) {
+        goto out;
+    }
+
+    loc.inode = inode_ref(fop->fd->inode); /* reference owned by 'loc' */
+    gf_uuid_copy(loc.gfid, fop->fd->inode->gfid);
+    ret = loc_path(&loc, NULL);
+    if (ret < 0) {
+        goto out;
+    }
+
+    flags = fop->fd->flags & (~(O_TRUNC | O_APPEND | O_CREAT | O_EXCL));
+    if (IA_IFDIR == fop->fd->inode->ia_type) { /* directories use opendir */
+        ec_opendir(fop->frame, fop->xl, need_open,
+                   EC_MINIMUM_ONE | EC_FOP_NO_PROPAGATE_ERROR, NULL, NULL,
+                   &fop->loc[0], fop->fd, NULL);
+    } else {
+        ec_open(fop->frame, fop->xl, need_open,
+                EC_MINIMUM_ONE | EC_FOP_NO_PROPAGATE_ERROR, NULL, NULL, &loc,
+                flags, fop->fd, NULL);
+    }
+
+out:
+    loc_wipe(&loc); /* also reached while 'loc' is still all zeroes */
+}
+
+static off_t
+ec_range_end_get(off_t fl_start, uint64_t fl_size)
+{ /* Inclusive end offset of a range; a zero size returns fl_start itself. */
+    if (fl_size > 0) {
+        if (fl_size >= EC_RANGE_FULL) {
+            /* Infinity */
+            fl_start = LLONG_MAX;
+        } else {
+            fl_start += fl_size - 1; /* last byte covered by the range */
+            if (fl_start < 0) {
+                /* Overflow */
+                fl_start = LLONG_MAX;
+            }
+        }
+    }
+
+    return fl_start;
+}
+
+static gf_boolean_t
+ec_is_range_conflict(ec_lock_link_t *l1, ec_lock_link_t *l2)
+{ /* Ranges overlap unless one ends before the other one starts. */
+    return !((l1->fl_end < l2->fl_start) || (l2->fl_end < l1->fl_start));
+}
+
+static gf_boolean_t
+ec_lock_conflict(ec_lock_link_t *l1, ec_lock_link_t *l2)
+{ /* Tells whether the lock requests 'l1' and 'l2' conflict. */
+    ec_t *ec = l1->fop->xl->private;
+
+    /* Fops like access/stat don't need to worry about what other fops are
+     * modifying, because they are wound to only one brick, so they can be
+     * executed in parallel. */
+    if (l1->fop->minimum == EC_MINIMUM_ONE ||
+        l2->fop->minimum == EC_MINIMUM_ONE)
+        return _gf_false;
+
+    if ((l1->fop->flags & EC_FLAG_LOCK_SHARED) &&
+        (l2->fop->flags & EC_FLAG_LOCK_SHARED))
+        return _gf_false; /* two shared requests never conflict */
+
+    if (!ec->parallel_writes) { /* ranges only matter with parallel writes */
+        return _gf_true;
+    }
+
+    return ec_is_range_conflict(l1, l2);
+}
+
+uint32_t
+ec_select_first_by_read_policy(ec_t *ec, ec_fop_data_t *fop)
+{ /* Pick the starting child index according to ec->read_policy. */
+    if (ec->read_policy == EC_ROUND_ROBIN) {
+        return ec->idx;
+    } else if (ec->read_policy == EC_GFID_HASH) {
+        if (fop->use_fd) { /* hash the gfid of the fd's inode */
+            return SuperFastHash((char *)fop->fd->inode->gfid,
+                                 sizeof(fop->fd->inode->gfid)) %
+                   ec->nodes;
+        } else { /* hash the gfid stored in the first loc */
+            if (gf_uuid_is_null(fop->loc[0].gfid))
+                loc_gfid(&fop->loc[0], fop->loc[0].gfid);
+            return SuperFastHash((char *)fop->loc[0].gfid,
+                                 sizeof(fop->loc[0].gfid)) %
+                   ec->nodes;
+        }
+    }
+    return 0; /* any other policy starts at child 0 */
+}
+
+static gf_boolean_t
+ec_child_valid(ec_t *ec, ec_fop_data_t *fop, uint32_t idx)
+{ /* True if 'idx' is in range and its bit is set in fop->remaining. */
+    return (idx < ec->nodes) && (((fop->remaining >> idx) & 1) == 1);
+}
+
+static uint32_t
+ec_child_next(ec_t *ec, ec_fop_data_t *fop, uint32_t idx)
+{ /* Next valid child from 'idx' (wrapping), or EC_INVALID_INDEX if none. */
+    while (!ec_child_valid(ec, fop, idx)) {
+        if (++idx >= ec->nodes) {
+            idx = 0; /* wrap around to the first child */
+        }
+        if (idx == fop->first) { /* reached fop->first: stop searching */
+            return EC_INVALID_INDEX;
+        }
+    }
+
+    return idx;
+}
+
+int32_t
+ec_heal_report(call_frame_t *frame, void *cookie, xlator_t *this,
+               int32_t op_ret, int32_t op_errno, uintptr_t mask, uintptr_t good,
+               uintptr_t bad, uint32_t pending, dict_t *xdata)
+{ /* Heal callback: only logs the outcome at debug level; returns 0. */
+    if (op_ret < 0) {
+        gf_msg(this->name, GF_LOG_DEBUG, op_errno, EC_MSG_HEAL_FAIL,
+               "Heal failed");
+    } else {
+        if ((mask & ~good) != 0) { /* some child in 'mask' was not good */
+            gf_msg(this->name, GF_LOG_DEBUG, 0, EC_MSG_HEAL_SUCCESS,
+                   "Heal succeeded on %d/%d "
+                   "subvolumes",
+                   gf_bits_count(mask & ~(good | bad)),
+                   gf_bits_count(mask & ~good));
+        }
+    }
+
+    return 0;
+}
+
+static uintptr_t
+ec_fop_needs_name_heal(ec_fop_data_t *fop)
+{ /* For named lookups: up children outside the first ENOENT answer group. */
+    ec_t *ec = NULL;
+    ec_cbk_data_t *cbk = NULL;
+    ec_cbk_data_t *enoent_cbk = NULL;
+
+    ec = fop->xl->private;
+    if (fop->id != GF_FOP_LOOKUP) /* only lookups are considered */
+        return 0;
+
+    if (!fop->loc[0].name || strlen(fop->loc[0].name) == 0)
+        return 0; /* no entry name in the lookup */
+
+    list_for_each_entry(cbk, &fop->cbk_list, list)
+    {
+        if (cbk->op_ret < 0 && cbk->op_errno == ENOENT) {
+            enoent_cbk = cbk; /* first answer group that failed with ENOENT */
+            break;
+        }
+    }
+
+    if (!enoent_cbk)
+        return 0;
+
+    return ec->xl_up & ~enoent_cbk->mask;
+}
+
+int32_t
+ec_fop_needs_heal(ec_fop_data_t *fop)
+{ /* Non-zero if an up child is neither in fop->remaining nor fop->good. */
+    ec_t *ec = fop->xl->private;
+
+    if (fop->lock_count == 0) {
+        /*
+         * If fop->lock_count is zero, the version mismatch was observed
+         * without holding any lock, so it can't be trusted. Launching a
+         * heal based on it would lead to INODELKs, which affect I/O
+         * performance. Since the self-heal daemon, and client operations
+         * on the inode that do take locks, can still trigger the heal,
+         * we choose not to attempt a heal when fop->lock_count
+         * is zero.
+         */
+        return 0;
+    }
+    return (ec->xl_up & ~(fop->remaining | fop->good)) != 0;
+}
+
+void
+ec_check_status(ec_fop_data_t *fop)
+{ /* If the fop needs a heal, log a warning and launch the heal(s). */
+    ec_t *ec = fop->xl->private;
+    int32_t partial = 0;
+    char str1[32], str2[32], str3[32], str4[32], str5[32];
+
+    if (!ec_fop_needs_name_heal(fop) && !ec_fop_needs_heal(fop)) {
+        return; /* neither kind of heal is needed */
+    }
+
+    if (fop->answer && fop->answer->op_ret >= 0) {
+        if ((fop->id == GF_FOP_LOOKUP) || (fop->id == GF_FOP_STAT) ||
+            (fop->id == GF_FOP_FSTAT)) {
+            partial = fop->answer->iatt[0].ia_type == IA_IFDIR;
+        } else if (fop->id == GF_FOP_OPENDIR) {
+            partial = 1;
+        }
+    }
+
+    gf_msg(
+        fop->xl->name, GF_LOG_WARNING, 0, EC_MSG_OP_FAIL_ON_SUBVOLS,
+        "Operation failed on %d of %d subvolumes.(up=%s, mask=%s, "
+        "remaining=%s, good=%s, bad=%s,"
+        "(Least significant bit represents first client/brick of subvol), %s)",
+        gf_bits_count(ec->xl_up & ~(fop->remaining | fop->good)), ec->nodes,
+        ec_bin(str1, sizeof(str1), ec->xl_up, ec->nodes),
+        ec_bin(str2, sizeof(str2), fop->mask, ec->nodes),
+        ec_bin(str3, sizeof(str3), fop->remaining, ec->nodes),
+        ec_bin(str4, sizeof(str4), fop->good, ec->nodes),
+        ec_bin(str5, sizeof(str5), ec->xl_up & ~(fop->remaining | fop->good),
+               ec->nodes),
+        ec_msg_str(fop));
+    if (fop->use_fd) { /* fd based fop: heal through the fd */
+        if (fop->fd != NULL) {
+            ec_fheal(NULL, fop->xl, -1, EC_MINIMUM_ONE, ec_heal_report, NULL,
+                     fop->fd, partial, NULL);
+        }
+    } else { /* path based fop: heal each location in use */
+        ec_heal(NULL, fop->xl, -1, EC_MINIMUM_ONE, ec_heal_report, NULL,
+                &fop->loc[0], partial, NULL);
+
+        if (fop->loc[1].inode != NULL) {
+            ec_heal(NULL, fop->xl, -1, EC_MINIMUM_ONE, ec_heal_report, NULL,
+                    &fop->loc[1], partial, NULL);
+        }
+    }
+}
+
+void
+ec_update_good(ec_fop_data_t *fop, uintptr_t good)
+{ /* Store the mask of good children and check whether a heal is needed. */
+    fop->good = good;
+
+    /* Fops that are executed only on one brick do not have enough information
+     * to decide if healing is needed or not. */
+    if ((fop->expected != 1) && (fop->parent == NULL)) { /* top-level only */
+        ec_check_status(fop);
+    }
+}
+
+void
+ec_lock_update_good(ec_lock_t *lock, ec_fop_data_t *fop)
+{ /* Clear from lock->good_mask the bits neither good nor remaining. */
+    /* Fops that are executed only on one brick do not have enough information
+     * to update the global mask of good bricks. */
+    if (fop->expected == 1) {
+        return;
+    }
+
+    /* When updating the good mask of the lock, we only take into consideration
+     * those bits corresponding to the bricks where the fop has been executed.
+     * Bad bricks are removed from good_mask, but once marked as bad it's never
+     * set to good until the lock is released and reacquired */
+
+    lock->good_mask &= fop->good | fop->remaining; /* bits are only cleared */
+}
+
+void
+__ec_fop_set_error(ec_fop_data_t *fop, int32_t error)
+{
+    /* The first error wins: one already recorded is never overwritten. */
+    if ((fop->error == 0) && (error != 0))
+        fop->error = error;
+}
+
+void
+ec_fop_set_error(ec_fop_data_t *fop, int32_t error)
+{ /* Same as __ec_fop_set_error(), but taking fop->lock around the call. */
+    LOCK(&fop->lock);
+
+    __ec_fop_set_error(fop, error);
+
+    UNLOCK(&fop->lock);
+}
+
+gf_boolean_t
+ec_cbk_set_error(ec_cbk_data_t *cbk, int32_t error, gf_boolean_t ro)
+{ /* Fail a so-far successful answer on 'error'; true if 'cbk' is failed. */
+    if ((error != 0) && (cbk->op_ret >= 0)) {
+        /* If cbk->op_errno was 0, it means that the fop succeeded and this
+         * error has happened while processing the answer. If the operation was
+         * read-only, there's no problem (i.e. we simply return the generated
+         * error code). However if it caused a modification, we must return EIO
+         * to indicate that the operation has been partially executed. */
+        cbk->op_errno = ro ? error : EIO;
+        cbk->op_ret = -1;
+
+        ec_fop_set_error(cbk->fop, cbk->op_errno); /* also flag the fop */
+    }
+
+    return (cbk->op_ret < 0); /* also true if it had already failed */
+}
+
+ec_cbk_data_t *
+ec_fop_prepare_answer(ec_fop_data_t *fop, gf_boolean_t ro)
+{ /* Returns fop->answer with its xdata combined; NULL if absent or failed. */
+    ec_cbk_data_t *cbk;
+    int32_t err;
+
+    cbk = fop->answer;
+    if (cbk == NULL) {
+        ec_fop_set_error(fop, EIO); /* no answer at all is reported as EIO */
+
+        return NULL;
+    }
+
+    if (cbk->op_ret < 0) {
+        ec_fop_set_error(fop, cbk->op_errno);
+    }
+
+    err = ec_dict_combine(cbk, EC_COMBINE_XDATA);
+    if (ec_cbk_set_error(cbk, -err, ro)) { /* true for any failed answer */
+        return NULL;
+    }
+
+    return cbk;
+}
+
+void
+ec_sleep(ec_fop_data_t *fop)
+{ /* Add one reference and one pending job to 'fop', under fop->lock. */
+    LOCK(&fop->lock);
+
+    GF_ASSERT(fop->refs > 0); /* the caller must already hold a reference */
+    fop->refs++;
+    fop->jobs++;
+
+    UNLOCK(&fop->lock);
+}
+
+int32_t
+ec_check_complete(ec_fop_data_t *fop, ec_resume_f resume)
+{ /* Drop one job: -1 if others are still pending, else the fop's error. */
+    int32_t error = -1;
+
+    LOCK(&fop->lock);
+
+    GF_ASSERT(fop->resume == NULL);
+
+    if (--fop->jobs != 0) {
+        ec_trace("WAIT", fop, "resume=%p", resume);
+
+        fop->resume = resume; /* ec_resume() calls it after the last job */
+    } else {
+        error = fop->error;
+        fop->error = 0; /* the error is handed over to the caller */
+    }
+
+    UNLOCK(&fop->lock);
+
+    return error;
+}
+
+void
+ec_resume(ec_fop_data_t *fop, int32_t error)
+{ /* Finish one job of 'fop'; the last one runs the stored resume callback. */
+    ec_resume_f resume = NULL;
+
+    LOCK(&fop->lock);
+
+    __ec_fop_set_error(fop, error);
+
+    if (--fop->jobs == 0) {
+        resume = fop->resume;
+        fop->resume = NULL;
+        if (resume != NULL) {
+            ec_trace("RESUME", fop, "error=%d", error);
+
+            if (fop->error != 0) { /* a recorded error takes precedence */
+                error = fop->error;
+            }
+            fop->error = 0;
+        }
+    }
+
+    UNLOCK(&fop->lock);
+
+    if (resume != NULL) { /* invoked without holding fop->lock */
+        resume(fop, error);
+    }
+
+    ec_fop_data_release(fop);
+}
+
+void
+ec_resume_parent(ec_fop_data_t *fop)
+{ /* Detach 'fop' from its parent and resume the parent, if there is one. */
+    ec_fop_data_t *parent;
+    int32_t error = 0;
+
+    parent = fop->parent;
+    if (parent != NULL) {
+        if ((fop->fop_flags & EC_FOP_NO_PROPAGATE_ERROR) == 0) {
+            error = fop->error; /* hand the child's error to the parent */
+        }
+        ec_trace("RESUME_PARENT", fop, "error=%d", error); /* signed value */
+        fop->parent = NULL;
+        ec_resume(parent, error);
+    }
+}
+
+gf_boolean_t
+ec_is_recoverable_error(int32_t op_errno)
+{
+    /* Errors that ec_dispatch_one_retry() accepts as a reason to retry. */
+    if ((op_errno == ENOTCONN) || (op_errno == ESTALE) ||
+        (op_errno == ENOENT) ||
+        (op_errno == EBADFD) || /* Opened fd but brick is disconnected */
+        (op_errno == EIO)) {    /* Backend-fs crash like XFS/ext4 etc */
+        return _gf_true;
+    }
+
+    return _gf_false;
+}
+
+void
+ec_complete(ec_fop_data_t *fop)
+{ /* Account for one finished wind; the last one selects the answer. */
+    ec_cbk_data_t *cbk = NULL;
+    int32_t resume = 0, update = 0;
+    int healing_count = 0;
+
+    LOCK(&fop->lock);
+
+    ec_trace("COMPLETE", fop, "");
+
+    if (--fop->winds == 0) {
+        if (fop->answer == NULL) {
+            if (!list_empty(&fop->cbk_list)) {
+                cbk = list_entry(fop->cbk_list.next, ec_cbk_data_t, list);
+                healing_count = gf_bits_count(cbk->mask & fop->healing);
+                /* fop shouldn't be treated as success if it is not
+                 * successful on at least fop->minimum good copies*/
+                if ((cbk->count - healing_count) >= fop->minimum) {
+                    fop->answer = cbk;
+
+                    update = 1;
+                }
+            }
+
+            resume = 1; /* resumed even when no answer could be selected */
+        }
+    }
+
+    UNLOCK(&fop->lock);
+
+    /* ec_update_good() locks inode->lock. This may cause deadlocks with
+       fop->lock when used in another order. Since ec_update_good() will not
+       be called more than once for each fop, it can be called from outside
+       the fop->lock locked region. */
+    if (update) {
+        ec_update_good(fop, cbk->mask);
+    }
+
+    if (resume) {
+        ec_resume(fop, 0);
+    }
+
+    ec_fop_data_release(fop);
+}
+
+/* There could be already granted locks sitting on the bricks; the unlock for
+ * them must be wound at all costs. */
+static gf_boolean_t
+ec_must_wind(ec_fop_data_t *fop)
+{ /* True only for unlock requests of the lock fops checked below. */
+    if ((fop->id == GF_FOP_INODELK) || (fop->id == GF_FOP_FINODELK) ||
+        (fop->id == GF_FOP_LK)) {
+        if (fop->flock.l_type == F_UNLCK)
+            return _gf_true;
+    } else if ((fop->id == GF_FOP_ENTRYLK) || (fop->id == GF_FOP_FENTRYLK)) {
+        if (fop->entrylk_cmd == ENTRYLK_UNLOCK)
+            return _gf_true;
+    }
+
+    return _gf_false;
+}
+
+static gf_boolean_t
+ec_internal_op(ec_fop_data_t *fop)
+{
+    /* Unlocks (see ec_must_wind()), xattrop, fxattrop and open are the
+     * internal operations; ec_child_select() does not restrict them to
+     * the parent's children. */
+    if (ec_must_wind(fop) || (fop->id == GF_FOP_XATTROP) ||
+        (fop->id == GF_FOP_FXATTROP) || (fop->id == GF_FOP_OPEN)) {
+        return _gf_true;
+    }
+
+    return _gf_false;
+}
+
+char *
+ec_msg_str(ec_fop_data_t *fop)
+{ /* Build once, cache in fop->errstr and return a description of the fop. */
+    loc_t *loc1 = NULL;
+    loc_t *loc2 = NULL;
+    char gfid1[64] = {0};
+    char gfid2[64] = {0};
+    ec_fop_data_t *parent = fop->parent;
+
+    if (fop->errstr)
+        return fop->errstr; /* already built by a previous call */
+    if (!fop->use_fd) {
+        loc1 = &fop->loc[0];
+        loc2 = &fop->loc[1];
+
+        if (fop->id == GF_FOP_RENAME) { /* rename reports both locations */
+            gf_asprintf(&fop->errstr,
+                        "FOP : '%s' failed on '%s' and '%s' with gfids "
+                        "%s and %s respectively. Parent FOP: %s",
+                        ec_fop_name(fop->id), loc1->path, loc2->path,
+                        uuid_utoa_r(loc1->gfid, gfid1),
+                        uuid_utoa_r(loc2->gfid, gfid2),
+                        parent ? ec_fop_name(parent->id) : "No Parent");
+        } else {
+            gf_asprintf(
+                &fop->errstr,
+                "FOP : '%s' failed on '%s' with gfid %s. Parent FOP: %s",
+                ec_fop_name(fop->id), loc1->path,
+                uuid_utoa_r(loc1->gfid, gfid1),
+                parent ? ec_fop_name(parent->id) : "No Parent");
+        }
+    } else { /* fd based fop: only the gfid of the fd's inode is reported */
+        gf_asprintf(
+            &fop->errstr, "FOP : '%s' failed on gfid %s. Parent FOP: %s",
+            ec_fop_name(fop->id), uuid_utoa_r(fop->fd->inode->gfid, gfid1),
+            parent ? ec_fop_name(parent->id) : "No Parent");
+    }
+    return fop->errstr; /* NOTE(review): gf_asprintf() result is unchecked */
+}
+
+static void
+ec_log_insufficient_vol(ec_fop_data_t *fop, int32_t have, uint32_t need,
+                        int32_t loglevel)
+{ /* Log at 'loglevel' that 'have' children are fewer than the 'need'ed. */
+    ec_t *ec = fop->xl->private;
+    char str1[32], str2[32], str3[32]; /* buffers for the ec_bin() output */
+
+    gf_msg(ec->xl->name, loglevel, 0, EC_MSG_CHILDS_INSUFFICIENT,
+           "Insufficient available children for this request: "
+           "Have : %d, Need : %u : Child UP : %s "
+           "Mask: %s, Healing : %s : %s ",
+           have, need, ec_bin(str1, sizeof(str1), ec->xl_up, ec->nodes),
+           ec_bin(str2, sizeof(str2), fop->mask, ec->nodes),
+           ec_bin(str3, sizeof(str3), fop->healing, ec->nodes),
+           ec_msg_str(fop));
+}
+
+static int32_t
+ec_child_select(ec_fop_data_t *fop)
+{ /* Compute the children 'fop' is sent to; returns 0 if there are too few. */
+    ec_t *ec = fop->xl->private;
+    int32_t first = 0, num = 0;
+
+    ec_fop_cleanup(fop);
+
+    fop->mask &= ec->node_mask;
+    /* Wind the fop on same subvols as parent for any internal extra fops like
+     * head/tail read in case of writev fop. Unlocks shouldn't do this because
+     * unlock should go on all subvols where lock is performed*/
+    if (fop->parent && !ec_internal_op(fop)) {
+        fop->mask &= (fop->parent->mask & ~fop->parent->healing);
+        if (ec_is_data_fop(fop->id)) {
+            fop->healing |= fop->parent->healing;
+        }
+    }
+
+    if ((fop->mask & ~ec->xl_up) != 0) { /* some requested children are down */
+        gf_msg(fop->xl->name, GF_LOG_WARNING, 0, EC_MSG_OP_EXEC_UNAVAIL,
+               "Executing operation with "
+               "some subvolumes unavailable. (%" PRIXPTR "). %s ",
+               fop->mask & ~ec->xl_up, ec_msg_str(fop));
+        fop->mask &= ec->xl_up;
+    }
+
+    switch (fop->minimum) { /* turn the symbolic minimum into a real count */
+        case EC_MINIMUM_ALL:
+            fop->minimum = gf_bits_count(fop->mask);
+            if (fop->minimum >= ec->fragments) {
+                break;
+            } /* else fall through: never require less than ec->fragments */
+        case EC_MINIMUM_MIN:
+            fop->minimum = ec->fragments;
+            break;
+        case EC_MINIMUM_ONE:
+            fop->minimum = 1;
+    }
+
+    if (ec->read_policy == EC_ROUND_ROBIN) { /* advance the shared index */
+        first = ec->idx;
+        if (++first >= ec->nodes) {
+            first = 0;
+        }
+        ec->idx = first;
+    }
+
+    num = gf_bits_count(fop->mask); /* counted before adding healing ones */
+    /*Unconditionally wind on healing subvolumes*/
+    fop->mask |= fop->healing;
+    fop->remaining = fop->mask;
+    fop->received = 0;
+
+    ec_trace("SELECT", fop, "");
+
+    if ((num < fop->minimum) && (num < ec->fragments)) {
+        ec_log_insufficient_vol(fop, num, fop->minimum, GF_LOG_ERROR);
+        return 0;
+    }
+
+    if (!fop->parent && fop->lock_count &&
+        (fop->locks[0].update[EC_DATA_TXN] ||
+         fop->locks[0].update[EC_METADATA_TXN])) {
+        if (ec->quorum_count && (num < ec->quorum_count)) {
+            ec_log_insufficient_vol(fop, num, ec->quorum_count, GF_LOG_ERROR);
+            return 0;
+        }
+    }
+
+    return 1;
+}
+
+void
+ec_dispatch_next(ec_fop_data_t *fop, uint32_t idx)
+{ /* Wind 'fop' to the next valid child, searching from 'idx'. */
+    uint32_t i = EC_INVALID_INDEX;
+    ec_t *ec = fop->xl->private;
+
+    LOCK(&fop->lock);
+
+    i = ec_child_next(ec, fop, idx);
+    if (i < EC_MAX_NODES) { /* a valid child was found */
+        idx = i;
+
+        fop->remaining ^= 1ULL << idx; /* clears the bit of the chosen child */
+
+        ec_trace("EXECUTE", fop, "idx=%d", idx);
+
+        fop->winds++;
+        fop->refs++;
+    }
+
+    UNLOCK(&fop->lock);
+
+    if (i < EC_MAX_NODES) {
+        fop->wind(ec, fop, idx); /* called without holding fop->lock */
+    }
+}
+
+/*
+ * Wind the fop to every brick whose bit is set in 'mask'.
+ *
+ * Under fop->lock the bits of 'mask' are toggled in fop->remaining and both
+ * fop->winds and fop->refs are increased by the number of bits set. After the
+ * lock is released, fop->wind() is called once per set bit, in ascending
+ * index order.
+ */
+void
+ec_dispatch_mask(ec_fop_data_t *fop, uintptr_t mask)
+{
+    ec_t *ec = fop->xl->private;
+    int32_t count, idx;
+
+    count = gf_bits_count(mask);
+
+    LOCK(&fop->lock);
+
+    /* 'mask' is a uintptr_t: PRIXPTR is the matching conversion on every
+     * data model, whereas "%lX" is only correct where uintptr_t happens to
+     * be an unsigned long. */
+    ec_trace("EXECUTE", fop, "mask=%" PRIXPTR, mask);
+
+    fop->remaining ^= mask;
+
+    fop->winds += count;
+    fop->refs += count;
+
+    UNLOCK(&fop->lock);
+
+    /* Walk the mask from the least significant bit; 'idx' tracks the brick
+     * index of the bit currently in position 0. */
+    idx = 0;
+    while (mask != 0) {
+        if ((mask & 1) != 0) {
+            fop->wind(ec, fop, idx);
+        }
+        idx++;
+        mask >>= 1;
+    }
+}
+
+/*
+ * Reset the per-dispatch answer state of 'fop' (answer, good mask and the
+ * callback list). When the fop has at least one lock attached, the lock
+ * owner of the request frame is copied into the fop frame.
+ */
+void
+ec_dispatch_start(ec_fop_data_t *fop)
+{
+    INIT_LIST_HEAD(&fop->cbk_list);
+    fop->good = 0;
+    fop->answer = NULL;
+
+    if (fop->lock_count <= 0) {
+        return;
+    }
+
+    ec_owner_copy(fop->frame, &fop->req_frame->root->lk_owner);
+}
+
+/*
+ * Dispatch the fop expecting a single answer. The starting brick is chosen
+ * by ec_select_first_by_read_policy() and the request is wound through
+ * ec_dispatch_next(). Nothing is wound when ec_child_select() rejects the
+ * fop.
+ */
+void
+ec_dispatch_one(ec_fop_data_t *fop)
+{
+    ec_dispatch_start(fop);
+
+    if (!ec_child_select(fop)) {
+        return;
+    }
+
+    ec_sleep(fop);
+
+    fop->expected = 1;
+    fop->first = ec_select_first_by_read_policy(fop->xl->private, fop);
+
+    ec_dispatch_next(fop, fop->first);
+}
+
+/*
+ * Decide whether a fop dispatched to a single brick should be retried on
+ * another one.
+ *
+ * The prepared answer is stored in '*cbk' when 'cbk' is not NULL. If that
+ * answer failed with an error accepted by ec_is_recoverable_error(), the
+ * answering brick is removed from fop->mask.
+ *
+ * Returns _gf_true only when such a brick was removed and fop->mask still
+ * has bits set; _gf_false in every other case.
+ */
+gf_boolean_t
+ec_dispatch_one_retry(ec_fop_data_t *fop, ec_cbk_data_t **cbk)
+{
+    ec_cbk_data_t *answer = ec_fop_prepare_answer(fop, _gf_true);
+
+    if (cbk != NULL) {
+        *cbk = answer;
+    }
+
+    if (answer == NULL) {
+        return _gf_false;
+    }
+    if (answer->op_ret >= 0) {
+        return _gf_false;
+    }
+    if (!ec_is_recoverable_error(answer->op_errno)) {
+        return _gf_false;
+    }
+
+    /* The brick that answered must still be part of the mask. */
+    GF_ASSERT(fop->mask & (1ULL << answer->idx));
+    fop->mask ^= (1ULL << answer->idx);
+
+    return fop->mask ? _gf_true : _gf_false;
+}
+
+/*
+ * Dispatch the fop incrementally: one answer is expected per bit set in
+ * fop->remaining, but only the first brick is wound here (through
+ * ec_dispatch_next(), starting the search at index 0).
+ */
+void
+ec_dispatch_inc(ec_fop_data_t *fop)
+{
+    ec_dispatch_start(fop);
+
+    if (!ec_child_select(fop)) {
+        return;
+    }
+
+    ec_sleep(fop);
+
+    fop->first = 0;
+    fop->expected = gf_bits_count(fop->remaining);
+
+    ec_dispatch_next(fop, 0);
+}
+
+/*
+ * Dispatch the fop to every brick present in fop->remaining at once. One
+ * answer is expected per selected brick.
+ */
+void
+ec_dispatch_all(ec_fop_data_t *fop)
+{
+    ec_dispatch_start(fop);
+
+    if (!ec_child_select(fop)) {
+        return;
+    }
+
+    ec_sleep(fop);
+
+    fop->first = 0;
+    fop->expected = gf_bits_count(fop->remaining);
+
+    ec_dispatch_mask(fop, fop->remaining);
+}
+
+/*
+ * Dispatch the fop to ec->fragments bricks.
+ *
+ * The first brick is chosen by ec_select_first_by_read_policy(); the rest
+ * are obtained by calling ec_child_next() repeatedly. Every index below
+ * EC_MAX_NODES that it returns is added to the mask finally handed to
+ * ec_dispatch_mask().
+ */
+void
+ec_dispatch_min(ec_fop_data_t *fop)
+{
+    ec_t *ec = fop->xl->private;
+    uintptr_t targets = 0;
+    uint32_t idx;
+    int32_t count;
+
+    ec_dispatch_start(fop);
+
+    if (!ec_child_select(fop)) {
+        return;
+    }
+
+    ec_sleep(fop);
+
+    count = ec->fragments;
+    fop->expected = count;
+    fop->first = ec_select_first_by_read_policy(ec, fop);
+
+    /* Start one position before the first brick: the loop always searches
+     * from idx + 1. */
+    idx = fop->first - 1;
+    for (; count > 0; count--) {
+        idx = ec_child_next(ec, fop, idx + 1);
+        if (idx < EC_MAX_NODES) {
+            targets |= 1ULL << idx;
+        }
+    }
+
+    ec_dispatch_mask(fop, targets);
+}
+
+/*
+ * Pretend the fop has been executed successfully on all selected bricks
+ * without winding anything: every brick in fop->remaining is moved to
+ * fop->good.
+ */
+void
+ec_succeed_all(ec_fop_data_t *fop)
+{
+    ec_dispatch_start(fop);
+
+    if (!ec_child_select(fop)) {
+        return;
+    }
+
+    fop->first = 0;
+    fop->expected = gf_bits_count(fop->remaining);
+
+    /* Simulate a successful execution on all bricks */
+    ec_trace("SUCCEED", fop, "");
+
+    fop->good = fop->remaining;
+    fop->remaining = 0;
+}
+
+/*
+ * Allocate and initialize a lock object for the inode referenced by 'loc'.
+ *
+ * fop   operation requesting the lock; it receives the error on failure
+ * loc   location to lock. It must have an inode and a non-null gfid, either
+ *       in the loc itself or in its inode.
+ *
+ * Returns the new lock, or NULL on failure. On every failure path an error
+ * is recorded in 'fop': EINVAL for an invalid location, ENOMEM when the lock
+ * cannot be allocated, or the error reported by ec_loc_from_loc().
+ */
+ec_lock_t *
+ec_lock_allocate(ec_fop_data_t *fop, loc_t *loc)
+{
+    ec_t *ec = fop->xl->private;
+    ec_lock_t *lock;
+    int32_t err;
+
+    if ((loc->inode == NULL) ||
+        (gf_uuid_is_null(loc->gfid) && gf_uuid_is_null(loc->inode->gfid))) {
+        gf_msg(fop->xl->name, GF_LOG_ERROR, EINVAL, EC_MSG_INVALID_INODE,
+               "Trying to lock based on an invalid "
+               "inode");
+
+        __ec_fop_set_error(fop, EINVAL);
+
+        return NULL;
+    }
+
+    lock = mem_get0(ec->lock_pool);
+    if (lock == NULL) {
+        /* Without this the caller would see a NULL lock with no error set
+         * and the fop would continue without the lock it asked for. */
+        __ec_fop_set_error(fop, ENOMEM);
+
+        return NULL;
+    }
+
+    lock->good_mask = UINTPTR_MAX;
+    INIT_LIST_HEAD(&lock->owners);
+    INIT_LIST_HEAD(&lock->waiting);
+    INIT_LIST_HEAD(&lock->frozen);
+
+    /* ec_loc_from_loc() returns a negative errno value on failure. */
+    err = ec_loc_from_loc(fop->xl, &lock->loc, loc);
+    if (err != 0) {
+        mem_put(lock);
+        lock = NULL;
+
+        __ec_fop_set_error(fop, -err);
+    }
+
+    return lock;
+}
+
+/*
+ * Release everything owned by 'lock' (its loc and, if present, the fd
+ * reference) and give the lock object back with mem_put().
+ */
+void
+ec_lock_destroy(ec_lock_t *lock)
+{
+    loc_wipe(&lock->loc);
+
+    if (lock->fd) {
+        fd_unref(lock->fd);
+    }
+
+    mem_put(lock);
+}
+
+/*
+ * Order two locks by the gfid stored in their locs. The result is whatever
+ * gf_uuid_compare() returns for (lock1, lock2).
+ */
+int32_t
+ec_lock_compare(ec_lock_t *lock1, ec_lock_t *lock2)
+{
+    int32_t order;
+
+    order = gf_uuid_compare(lock1->loc.gfid, lock2->loc.gfid);
+
+    return order;
+}
+
+/*
+ * Append 'lock' to the list of locks of 'fop' and fill the new link.
+ *
+ * If the fop already has a lock and that lock compares lower than the new
+ * one, fop->first_lock is set to the position of the new link. Otherwise the
+ * fop xdata (created on demand) gets the GLUSTERFS_INODELK_DOM_COUNT key; a
+ * failure there sets ENOMEM on the fop and the lock is not inserted.
+ *
+ * The link records the fop, the data/metadata update flags taken from
+ * 'flags', the base loc and the [fl_start, fl_end] range.
+ */
+static void
+ec_lock_insert(ec_fop_data_t *fop, ec_lock_t *lock, uint32_t flags, loc_t *base,
+               off_t fl_start, uint64_t fl_size)
+{
+    ec_lock_link_t *slot;
+
+    /* This check is only prepared for up to 2 locks per fop. If more locks
+     * are needed this must be changed. */
+    if ((fop->lock_count <= 0) ||
+        (ec_lock_compare(fop->locks[0].lock, lock) >= 0)) {
+        /* Request lock counts from the locks xlator to be able to determine
+         * if there is contention and release the lock sooner. */
+        if ((fop->xdata == NULL) && ((fop->xdata = dict_new()) == NULL)) {
+            ec_fop_set_error(fop, ENOMEM);
+            return;
+        }
+        if (dict_set_str(fop->xdata, GLUSTERFS_INODELK_DOM_COUNT,
+                         fop->xl->name) != 0) {
+            ec_fop_set_error(fop, ENOMEM);
+            return;
+        }
+    } else {
+        fop->first_lock = fop->lock_count;
+    }
+
+    slot = &fop->locks[fop->lock_count];
+    fop->lock_count++;
+
+    slot->fop = fop;
+    slot->lock = lock;
+    slot->base = base;
+    slot->update[EC_DATA_TXN] = (flags & EC_UPDATE_DATA) != 0;
+    slot->update[EC_METADATA_TXN] = (flags & EC_UPDATE_META) != 0;
+    slot->fl_start = fl_start;
+    slot->fl_end = ec_range_end_get(fl_start, fl_size);
+
+    lock->refs_pending++;
+}
+
+/*
+ * Attach the inode lock of 'loc' to 'fop', creating the lock object when the
+ * inode context does not have one yet.
+ *
+ * Nothing is done for fops that have a parent, for fops that already carry
+ * an error, or when loc->inode is NULL. Everything else runs with
+ * loc->inode->lock held.
+ *
+ * fop        operation that will reference the lock
+ * loc        location whose inode is being locked
+ * flags      EC_UPDATE_DATA / EC_UPDATE_META / EC_QUERY_INFO bits
+ * base       loc stored in the lock link
+ * fl_start,
+ * fl_size    range passed on to ec_lock_insert()
+ *
+ * ENOMEM is set on the fop when the inode context cannot be obtained. A NULL
+ * return from ec_lock_allocate() just skips the insertion.
+ */
+static void
+ec_lock_prepare_inode_internal(ec_fop_data_t *fop, loc_t *loc, uint32_t flags,
+                               loc_t *base, off_t fl_start, uint64_t fl_size)
+{
+    ec_lock_t *lock = NULL;
+    ec_inode_t *ctx;
+
+    if ((fop->parent != NULL) || (fop->error != 0) || (loc->inode == NULL)) {
+        return;
+    }
+
+    LOCK(&loc->inode->lock);
+
+    ctx = __ec_inode_get(loc->inode, fop->xl);
+    if (ctx == NULL) {
+        __ec_fop_set_error(fop, ENOMEM);
+
+        goto unlock;
+    }
+
+    /* The inode already has a lock object: reuse it. */
+    if (ctx->inode_lock != NULL) {
+        lock = ctx->inode_lock;
+
+        /* If there's another lock, make sure that it's not the same. Otherwise
+         * do not insert it.
+         *
+         * This can only happen on renames where source and target names are
+         * in the same directory. */
+        if ((fop->lock_count > 0) && (fop->locks[0].lock == lock)) {
+            /* Combine data/meta updates */
+            fop->locks[0].update[EC_DATA_TXN] |= (flags & EC_UPDATE_DATA) != 0;
+            fop->locks[0].update[EC_METADATA_TXN] |= (flags & EC_UPDATE_META) !=
+                                                     0;
+
+            /* Only one base inode is allowed per fop, so there shouldn't be
+             * overwrites here. */
+            if (base != NULL) {
+                fop->locks[0].base = base;
+            }
+
+            /* The link already exists: only the query flag needs updating. */
+            goto update_query;
+        }
+
+        ec_trace("LOCK_INODELK", fop,
+                 "lock=%p, inode=%p. Lock already "
+                 "acquired",
+                 lock, loc->inode);
+
+        goto insert;
+    }
+
+    /* First lock for this inode: allocate it and publish it in the inode
+     * context. */
+    lock = ec_lock_allocate(fop, loc);
+    if (lock == NULL) {
+        goto unlock;
+    }
+
+    ec_trace("LOCK_CREATE", fop, "lock=%p", lock);
+
+    lock->flock.l_type = F_WRLCK;
+    lock->flock.l_whence = SEEK_SET;
+
+    lock->ctx = ctx;
+    ctx->inode_lock = lock;
+
+insert:
+    ec_lock_insert(fop, lock, flags, base, fl_start, fl_size);
+update_query:
+    /* Remember that at least one user of the lock asked for EC_QUERY_INFO. */
+    lock->query |= (flags & EC_QUERY_INFO) != 0;
+unlock:
+    UNLOCK(&loc->inode->lock);
+}
+
+/*
+ * Prepare a lock on the inode of 'loc' for 'fop', covering the range given
+ * by 'fl_start' and 'fl_size'. No base loc is associated with the lock link.
+ */
+void
+ec_lock_prepare_inode(ec_fop_data_t *fop, loc_t *loc, uint32_t flags,
+                      off_t fl_start, uint64_t fl_size)
+{
+    loc_t *no_base = NULL;
+
+    ec_lock_prepare_inode_internal(fop, loc, flags, no_base, fl_start,
+                                   fl_size);
+}
+
+/*
+ * Prepare a full-range lock on the parent directory of 'loc'.
+ *
+ * 'base' is only kept when 'flags' contains EC_INODE_SIZE; that flag is
+ * consumed here and not forwarded. Errors building the parent loc are set on
+ * the fop. Nothing is done if the fop already has an error.
+ */
+void
+ec_lock_prepare_parent_inode(ec_fop_data_t *fop, loc_t *loc, loc_t *base,
+                             uint32_t flags)
+{
+    loc_t parent;
+    int32_t ret;
+
+    if (fop->error != 0) {
+        return;
+    }
+
+    ret = ec_loc_parent(fop->xl, loc, &parent);
+    if (ret != 0) {
+        ec_fop_set_error(fop, -ret);
+
+        return;
+    }
+
+    if ((flags & EC_INODE_SIZE) == 0) {
+        base = NULL;
+    } else {
+        flags ^= EC_INODE_SIZE;
+    }
+
+    ec_lock_prepare_inode_internal(fop, &parent, flags, base, 0,
+                                   EC_RANGE_FULL);
+
+    loc_wipe(&parent);
+}
+
+/*
+ * Prepare a lock on the inode behind 'fd' for the given range. A temporary
+ * loc is built from the fd and wiped afterwards. If the loc cannot be built,
+ * the error is set on the fop. Nothing is done if the fop already has an
+ * error.
+ */
+void
+ec_lock_prepare_fd(ec_fop_data_t *fop, fd_t *fd, uint32_t flags, off_t fl_start,
+                   uint64_t fl_size)
+{
+    loc_t target;
+    int32_t ret;
+
+    if (fop->error == 0) {
+        ret = ec_loc_from_fd(fop->xl, &target, fd);
+        if (ret == 0) {
+            ec_lock_prepare_inode_internal(fop, &target, flags, NULL,
+                                           fl_start, fl_size);
+
+            loc_wipe(&target);
+        } else {
+            ec_fop_set_error(fop, -ret);
+        }
+    }
+}
+
+/*
+ * Validate a config read from disk against the configuration of the
+ * translator.
+ *
+ * Returns _gf_true only when every field matches the expected values. On any
+ * mismatch an error is logged and _gf_false is returned. The log message
+ * distinguishes a config that looks corrupted from one that is coherent but
+ * not supported.
+ */
+gf_boolean_t
+ec_config_check(xlator_t *xl, ec_config_t *config)
+{
+    ec_t *ec = xl->private;
+    uint32_t data_bricks;
+
+    if ((config->version == EC_CONFIG_VERSION) &&
+        (config->algorithm == EC_CONFIG_ALGORITHM) &&
+        (config->gf_word_size == EC_GF_BITS) && (config->bricks == ec->nodes) &&
+        (config->redundancy == ec->redundancy) &&
+        (config->chunk_size == EC_METHOD_CHUNK_SIZE)) {
+        return _gf_true;
+    }
+
+    /* This combination of version/algorithm requires the following
+       values. Incorrect values for these fields are a sign of
+       corruption:
+
+         redundancy > 0
+         redundancy * 2 < bricks
+         gf_word_size must be a power of 2
+         chunk_size (in bits) must be a multiple of gf_word_size *
+             (bricks - redundancy) */
+
+    data_bricks = config->bricks - config->redundancy;
+    if ((config->redundancy >= 1) &&
+        (config->redundancy * 2 < config->bricks) &&
+        ec_is_power_of_2(config->gf_word_size) &&
+        ((config->chunk_size * 8) % (config->gf_word_size * data_bricks) ==
+         0)) {
+        gf_msg(xl->name, GF_LOG_ERROR, EINVAL, EC_MSG_INVALID_CONFIG,
+               "Unsupported config "
+               "(V=%u, A=%u, W=%u, "
+               "N=%u, R=%u, S=%u)",
+               config->version, config->algorithm, config->gf_word_size,
+               config->bricks, config->redundancy, config->chunk_size);
+    } else {
+        gf_msg(xl->name, GF_LOG_ERROR, EINVAL, EC_MSG_INVALID_CONFIG,
+               "Invalid or corrupted config");
+    }
+
+    return _gf_false;
+}
+
+/*
+ * Compute which dirty counters must be set for the transaction of 'link'.
+ *
+ * For each transaction type that the link updates and that is not yet dirty
+ * in the inode context, the matching entry of 'dirty' is set to 1, unless
+ * the link uses optimistic changelog. Entries of 'dirty' are never cleared
+ * here.
+ *
+ * Returns _gf_true if any entry of 'dirty' is non-zero on exit.
+ */
+gf_boolean_t
+ec_set_dirty_flag(ec_lock_link_t *link, ec_inode_t *ctx, uint64_t *dirty)
+{
+    if (!link->optimistic_changelog) {
+        if (link->update[EC_DATA_TXN] && !ctx->dirty[EC_DATA_TXN]) {
+            dirty[EC_DATA_TXN] = 1;
+        }
+
+        if (link->update[EC_METADATA_TXN] && !ctx->dirty[EC_METADATA_TXN]) {
+            dirty[EC_METADATA_TXN] = 1;
+        }
+    }
+
+    return (dirty[EC_METADATA_TXN] || dirty[EC_DATA_TXN]) ? _gf_true
+                                                          : _gf_false;
+}
+
+/*
+ * Completion callback for the [f]xattrop sent by ec_get_size_version().
+ *
+ * 'cookie' is the xattrop fop; its ->data is the lock link that issued the
+ * request. The flags that link committed to provide are computed with
+ * EC_PROVIDED_FLAGS() and then:
+ *
+ *  - under the inode lock, every owner link waiting for any of them gets
+ *    those flags cleared; links left with no needed flags have their fop
+ *    queued on a local list;
+ *  - on success, version/size/config are extracted from 'dict' into the
+ *    inode context (when EC_FLAG_XATTROP was provided) and the dirty state
+ *    of the context is updated;
+ *  - the provided flags are toggled in lock->waiting_flags and, on success,
+ *    the release/good/healing state of the lock is updated;
+ *  - after dropping the inode lock, every queued fop is resumed with the
+ *    final op_errno (0 on success).
+ *
+ * Always returns 0.
+ */
+int32_t
+ec_prepare_update_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+                      int32_t op_ret, int32_t op_errno, dict_t *dict,
+                      dict_t *xdata)
+{
+    struct list_head list;
+    ec_fop_data_t *fop = cookie, *parent, *tmp;
+    ec_lock_link_t *parent_link = fop->data;
+    ec_lock_link_t *link = NULL;
+    ec_lock_t *lock = NULL;
+    ec_inode_t *ctx;
+    gf_boolean_t release = _gf_false;
+    uint64_t provided_flags = 0;
+    uint64_t dirty[EC_VERSION_SIZE] = {0, 0};
+    lock = parent_link->lock;
+    parent = parent_link->fop;
+    ctx = lock->ctx;
+
+    INIT_LIST_HEAD(&list);
+    provided_flags = EC_PROVIDED_FLAGS(parent_link->waiting_flags);
+
+    LOCK(&lock->loc.inode->lock);
+
+    /* Collect the fops whose pending flags are fully satisfied by this
+     * answer. They are resumed once the inode lock has been released. */
+    list_for_each_entry(link, &lock->owners, owner_list)
+    {
+        if ((link->waiting_flags & provided_flags) != 0) {
+            link->waiting_flags ^= (link->waiting_flags & provided_flags);
+            if (EC_NEEDED_FLAGS(link->waiting_flags) == 0)
+                list_add_tail(&link->fop->cbk_list, &list);
+        }
+    }
+    if (op_ret < 0) {
+        gf_msg(this->name, GF_LOG_WARNING, op_errno, EC_MSG_SIZE_VERS_GET_FAIL,
+               "Failed to get size and version :  %s", ec_msg_str(fop));
+
+        goto unlock;
+    }
+
+    if (EC_FLAGS_HAVE(provided_flags, EC_FLAG_XATTROP)) {
+        /* The ec_dict_del_*() helpers return a negative errno value; it is
+         * negated to keep op_errno positive. */
+        op_errno = -ec_dict_del_array(dict, EC_XATTR_VERSION, ctx->pre_version,
+                                      EC_VERSION_SIZE);
+        if (op_errno != 0) {
+            gf_msg(this->name, GF_LOG_ERROR, op_errno,
+                   EC_MSG_VER_XATTR_GET_FAIL, "Unable to get version xattr. %s",
+                   ec_msg_str(fop));
+            goto unlock;
+        }
+        ctx->post_version[0] += ctx->pre_version[0];
+        ctx->post_version[1] += ctx->pre_version[1];
+
+        ctx->have_version = _gf_true;
+
+        /* Size and config are only looked up for regular files and for
+         * inodes whose type is not known yet (IA_INVAL). */
+        if (lock->loc.inode->ia_type == IA_IFREG ||
+            lock->loc.inode->ia_type == IA_INVAL) {
+            op_errno = -ec_dict_del_number(dict, EC_XATTR_SIZE, &ctx->pre_size);
+            if (op_errno != 0) {
+                /* A missing size is only fatal for regular files. */
+                if (lock->loc.inode->ia_type == IA_IFREG) {
+                    gf_msg(this->name, GF_LOG_ERROR, op_errno,
+                           EC_MSG_SIZE_XATTR_GET_FAIL,
+                           "Unable to get size xattr. %s", ec_msg_str(fop));
+                    goto unlock;
+                }
+            } else {
+                ctx->post_size = ctx->pre_size;
+
+                ctx->have_size = _gf_true;
+            }
+
+            op_errno = -ec_dict_del_config(dict, EC_XATTR_CONFIG, &ctx->config);
+            if (op_errno != 0) {
+                /* For IA_INVAL inodes only ENODATA is tolerated. */
+                if ((lock->loc.inode->ia_type == IA_IFREG) ||
+                    (op_errno != ENODATA)) {
+                    gf_msg(this->name, GF_LOG_ERROR, op_errno,
+                           EC_MSG_CONFIG_XATTR_GET_FAIL,
+                           "Unable to get config xattr. %s", ec_msg_str(fop));
+
+                    goto unlock;
+                }
+            } else {
+                if (!ec_config_check(parent->xl, &ctx->config)) {
+                    gf_msg(this->name, GF_LOG_ERROR, EINVAL,
+                           EC_MSG_CONFIG_XATTR_INVALID, "Invalid config xattr");
+
+                    op_errno = EINVAL;
+
+                    goto unlock;
+                }
+                ctx->have_config = _gf_true;
+            }
+        }
+        ctx->have_info = _gf_true;
+    }
+
+    /* Record in the inode context the dirty flags that this xattrop was
+     * responsible for setting. */
+    ec_set_dirty_flag(fop->data, ctx, dirty);
+    if (dirty[EC_METADATA_TXN] &&
+        (EC_FLAGS_HAVE(provided_flags, EC_FLAG_METADATA_DIRTY))) {
+        GF_ASSERT(!ctx->dirty[EC_METADATA_TXN]);
+        ctx->dirty[EC_METADATA_TXN] = 1;
+    }
+
+    if (dirty[EC_DATA_TXN] &&
+        (EC_FLAGS_HAVE(provided_flags, EC_FLAG_DATA_DIRTY))) {
+        GF_ASSERT(!ctx->dirty[EC_DATA_TXN]);
+        ctx->dirty[EC_DATA_TXN] = 1;
+    }
+    /* Reaching this point means success: any tolerated error is cleared. */
+    op_errno = 0;
+unlock:
+
+    /* The provided flags are no longer pending on the lock, whatever the
+     * outcome was. */
+    lock->waiting_flags ^= provided_flags;
+
+    if (op_errno == 0) {
+        /* If the fop fails on any of the good bricks, it is important to mark
+         * it dirty and update versions right away if dirty was not set before.
+         */
+        if (lock->good_mask & ~(fop->good | fop->remaining)) {
+            release = _gf_true;
+        }
+
+        if (parent_link->update[0] && !parent_link->dirty[0]) {
+            lock->release |= release;
+        }
+
+        if (parent_link->update[1] && !parent_link->dirty[1]) {
+            lock->release |= release;
+        }
+
+        /* We don't allow the main fop to be executed on bricks that have not
+         * succeeded the initial xattrop. */
+        ec_lock_update_good(lock, fop);
+
+        /*As of now only data healing marks bricks as healing*/
+        lock->healing |= fop->healing;
+    }
+
+    UNLOCK(&lock->loc.inode->lock);
+
+    /* Resume the collected fops outside the inode lock. */
+    while (!list_empty(&list)) {
+        tmp = list_entry(list.next, ec_fop_data_t, cbk_list);
+        list_del_init(&tmp->cbk_list);
+
+        if (op_errno == 0) {
+            /* Restrict each fop to the bricks where the xattrop succeeded. */
+            tmp->mask &= fop->good;
+
+            /*As of now only data healing marks bricks as healing*/
+            if (ec_is_data_fop(tmp->id)) {
+                tmp->healing |= fop->healing;
+            }
+        }
+
+        ec_resume(tmp, op_errno);
+    }
+
+    return 0;
+}
+
+/*
+ * Mark 'flag' as needed by 'link'.
+ *
+ * If the lock did not have the flag marked as needed yet, it is marked on
+ * the lock and the link also becomes its provider.
+ *
+ * Returns _gf_true when 'link' has become the provider of the flag, and
+ * _gf_false when the flag was already needed on the lock.
+ */
+static gf_boolean_t
+ec_set_needed_flag(ec_lock_t *lock, ec_lock_link_t *link, uint64_t flag)
+{
+    uint64_t already_needed;
+
+    link->waiting_flags |= EC_FLAG_NEEDS(flag);
+
+    already_needed = EC_NEEDED_FLAGS(lock->waiting_flags);
+    if (EC_FLAGS_HAVE(already_needed, flag)) {
+        return _gf_false;
+    }
+
+    lock->waiting_flags |= EC_FLAG_NEEDS(flag);
+    link->waiting_flags |= EC_FLAG_PROVIDES(flag);
+
+    return _gf_true;
+}
+
+/*
+ * Register on 'lock' and 'link' the flags required by the current fop and
+ * work out which of them this link is responsible for.
+ *
+ * Entries of 'dirty' whose flag was already needed on the lock are reset to
+ * 0, so only the dirty markers owned by this link remain set.
+ *
+ * Returns the set of needed flags that were added to the lock by this call.
+ */
+static uint64_t
+ec_set_xattrop_flags_and_params(ec_lock_t *lock, ec_lock_link_t *link,
+                                uint64_t *dirty)
+{
+    ec_inode_t *ctx = lock->ctx;
+    uint64_t before = EC_NEEDED_FLAGS(lock->waiting_flags);
+    uint64_t after;
+
+    if (lock->query && !ctx->have_info) {
+        ec_set_needed_flag(lock, link, EC_FLAG_XATTROP);
+    }
+
+    if (dirty[EC_DATA_TXN] &&
+        !ec_set_needed_flag(lock, link, EC_FLAG_DATA_DIRTY)) {
+        dirty[EC_DATA_TXN] = 0;
+    }
+
+    if (dirty[EC_METADATA_TXN] &&
+        !ec_set_needed_flag(lock, link, EC_FLAG_METADATA_DIRTY)) {
+        dirty[EC_METADATA_TXN] = 0;
+    }
+
+    after = EC_NEEDED_FLAGS(lock->waiting_flags);
+
+    return before ^ after;
+}
+
+/*
+ * Make sure the information and dirty markers required by the fop of 'link'
+ * are available, sending an [f]xattrop when this link is the one that has to
+ * request them.
+ *
+ * Under the inode lock the function decides which flags are needed
+ * (ec_set_xattrop_flags_and_params()). If the link ends up waiting for any
+ * flag, the fop is put to sleep. It is resumed from ec_prepare_update_cbk().
+ * Only when this call added new needed flags to the lock is a request built
+ * and sent:
+ *
+ *  - EC_FLAG_XATTROP: version is requested and, for regular files or inodes
+ *    of unknown type, also size and config;
+ *  - any non-zero entry in 'dirty' is sent as EC_XATTR_DIRTY.
+ *
+ * The request is issued with uid/gid 0; the original values are restored
+ * from fop->uid / fop->gid before returning. Errors are set on the fop.
+ */
+void
+ec_get_size_version(ec_lock_link_t *link)
+{
+    loc_t loc;
+    ec_lock_t *lock;
+    ec_inode_t *ctx;
+    ec_fop_data_t *fop;
+    dict_t *dict = NULL;
+    dict_t *xdata = NULL;
+    ec_t *ec = NULL;
+    int32_t error = 0;
+    gf_boolean_t set_dirty = _gf_false;
+    uint64_t allzero[EC_VERSION_SIZE] = {0, 0};
+    uint64_t dirty[EC_VERSION_SIZE] = {0, 0};
+    lock = link->lock;
+    ctx = lock->ctx;
+    fop = link->fop;
+    ec = fop->xl->private;
+    uint64_t changed_flags = 0;
+
+    /* Optimistic changelog is only used for non-data fops, when the option
+     * is enabled and every brick in ec->node_mask is in the lock's good
+     * mask. */
+    if (ec->optimistic_changelog && !(ec->node_mask & ~link->lock->good_mask) &&
+        !ec_is_data_fop(fop->id))
+        link->optimistic_changelog = _gf_true;
+
+    /* 'loc' is wiped unconditionally at 'out', so it must start zeroed. */
+    memset(&loc, 0, sizeof(loc));
+
+    LOCK(&lock->loc.inode->lock);
+
+    set_dirty = ec_set_dirty_flag(link, ctx, dirty);
+
+    /* If ec metadata has already been retrieved, do not try again. */
+    if (ctx->have_info) {
+        if (ec_is_data_fop(fop->id)) {
+            fop->healing |= lock->healing;
+        }
+        if (!set_dirty)
+            goto unlock;
+    }
+
+    /* Determine if there's something we need to retrieve for the current
+     * operation. */
+    if (!set_dirty && !lock->query && (lock->loc.inode->ia_type != IA_IFREG) &&
+        (lock->loc.inode->ia_type != IA_INVAL)) {
+        goto unlock;
+    }
+
+    changed_flags = ec_set_xattrop_flags_and_params(lock, link, dirty);
+    if (link->waiting_flags) {
+        /* This fop needs to wait until all its flags are cleared which
+         * potentially can be cleared by other xattrops that are already
+         * wound*/
+        ec_sleep(fop);
+    } else {
+        GF_ASSERT(!changed_flags);
+    }
+
+unlock:
+    UNLOCK(&lock->loc.inode->lock);
+
+    /* No new flags were added to the lock by this link: nothing to send. */
+    if (!changed_flags)
+        goto out;
+
+    dict = dict_new();
+    if (dict == NULL) {
+        error = -ENOMEM;
+        goto out;
+    }
+
+    if (EC_FLAGS_HAVE(changed_flags, EC_FLAG_XATTROP)) {
+        /* Once we know that an xattrop will be needed,
+         * we try to get all available information in a
+         * single call. */
+        error = ec_dict_set_array(dict, EC_XATTR_VERSION, allzero,
+                                  EC_VERSION_SIZE);
+        if (error != 0) {
+            goto out;
+        }
+
+        if (lock->loc.inode->ia_type == IA_IFREG ||
+            lock->loc.inode->ia_type == IA_INVAL) {
+            error = ec_dict_set_number(dict, EC_XATTR_SIZE, 0);
+            if (error == 0) {
+                error = ec_dict_set_number(dict, EC_XATTR_CONFIG, 0);
+            }
+            if (error != 0) {
+                goto out;
+            }
+
+            xdata = dict_new();
+            if (xdata == NULL || dict_set_int32(xdata, GF_GET_SIZE, 1)) {
+                error = -ENOMEM;
+                goto out;
+            }
+        }
+    }
+
+    /* Only send the dirty xattr when at least one counter is non-zero. */
+    if (memcmp(allzero, dirty, sizeof(allzero))) {
+        error = ec_dict_set_array(dict, EC_XATTR_DIRTY, dirty, EC_VERSION_SIZE);
+        if (error != 0) {
+            goto out;
+        }
+    }
+
+    /* Issue the request with uid/gid 0; both are restored at 'out'. */
+    fop->frame->root->uid = 0;
+    fop->frame->root->gid = 0;
+
+    /* For normal fops, ec_[f]xattrop() must succeed on at least
+     * EC_MINIMUM_MIN bricks, however when this is called as part of a
+     * self-heal operation the mask of target bricks (fop->mask) could
+     * contain less than EC_MINIMUM_MIN bricks, causing the xattrop to
+     * always fail. Thus we always use the same minimum used for the main
+     * fop.
+     */
+    if (lock->fd == NULL) {
+        error = ec_loc_from_loc(fop->xl, &loc, &lock->loc);
+        if (error != 0) {
+            goto out;
+        }
+        /* Without a parent gfid the parent/path/name fields of the copy are
+         * dropped before sending the request. */
+        if (gf_uuid_is_null(loc.pargfid)) {
+            if (loc.parent != NULL) {
+                inode_unref(loc.parent);
+                loc.parent = NULL;
+            }
+            GF_FREE((char *)loc.path);
+            loc.path = NULL;
+            loc.name = NULL;
+        }
+
+        ec_xattrop(fop->frame, fop->xl, fop->mask, fop->minimum,
+                   ec_prepare_update_cbk, link, &loc, GF_XATTROP_ADD_ARRAY64,
+                   dict, xdata);
+    } else {
+        ec_fxattrop(fop->frame, fop->xl, fop->mask, fop->minimum,
+                    ec_prepare_update_cbk, link, lock->fd,
+                    GF_XATTROP_ADD_ARRAY64, dict, xdata);
+    }
+
+    error = 0;
+
+out:
+    fop->frame->root->uid = fop->uid;
+    fop->frame->root->gid = fop->gid;
+
+    loc_wipe(&loc);
+
+    if (dict != NULL) {
+        dict_unref(dict);
+    }
+
+    if (xdata != NULL) {
+        dict_unref(xdata);
+    }
+
+    /* 'error' holds a negative errno value at this point. */
+    if (error != 0) {
+        ec_fop_set_error(fop, -error);
+    }
+}
+
+/*
+ * Read the cached file size of 'inode' from its ec inode context.
+ *
+ * Returns _gf_true and stores the size in '*size' when the context exists
+ * and has a size; otherwise returns _gf_false and leaves '*size' untouched.
+ * ec_get_inode_size() is the wrapper that calls this with inode->lock held.
+ */
+gf_boolean_t
+__ec_get_inode_size(ec_fop_data_t *fop, inode_t *inode, uint64_t *size)
+{
+    ec_inode_t *ctx = __ec_inode_get(inode, fop->xl);
+
+    if ((ctx == NULL) || !ctx->have_size) {
+        return _gf_false;
+    }
+
+    *size = ctx->post_size;
+
+    return _gf_true;
+}
+
+/*
+ * Locked wrapper around __ec_get_inode_size(): takes inode->lock for the
+ * duration of the query and returns its result.
+ */
+gf_boolean_t
+ec_get_inode_size(ec_fop_data_t *fop, inode_t *inode, uint64_t *size)
+{
+    gf_boolean_t have_size;
+
+    LOCK(&inode->lock);
+    have_size = __ec_get_inode_size(fop, inode, size);
+    UNLOCK(&inode->lock);
+
+    return have_size;
+}
+
+/*
+ * Store 'size' as the current (post) size in the ec inode context of
+ * 'inode'.
+ *
+ * Returns _gf_false if the inode context cannot be obtained, _gf_true
+ * otherwise. ec_set_inode_size() is the wrapper that calls this with
+ * inode->lock held.
+ */
+gf_boolean_t
+__ec_set_inode_size(ec_fop_data_t *fop, inode_t *inode, uint64_t size)
+{
+    ec_inode_t *ctx = __ec_inode_get(inode, fop->xl);
+
+    if (ctx == NULL) {
+        return _gf_false;
+    }
+
+    /* Normal fops always have ctx->have_size set. Self-heal, however, calls
+     * this to prepare the inode while have_size is still false: in that case
+     * the pre size is initialized too and the context is flagged as having
+     * both size and info. */
+    if (!ctx->have_size) {
+        ctx->pre_size = size;
+        ctx->have_size = ctx->have_info = _gf_true;
+    }
+    ctx->post_size = size;
+
+    return _gf_true;
+}
+
+/*
+ * Locked wrapper around __ec_set_inode_size(): takes inode->lock while the
+ * size is updated and returns its result.
+ */
+gf_boolean_t
+ec_set_inode_size(ec_fop_data_t *fop, inode_t *inode, uint64_t size)
+{
+    gf_boolean_t updated;
+
+    LOCK(&inode->lock);
+    updated = __ec_set_inode_size(fop, inode, size);
+    UNLOCK(&inode->lock);
+
+    return updated;
+}
+
+/*
+ * Free every stripe held in the LRU list of the inode context's stripe cache
+ * and reset its counters.
+ */
+static void
+ec_release_stripe_cache(ec_inode_t *ctx)
+{
+    ec_stripe_list_t *cache = &ctx->stripe_cache;
+    ec_stripe_t *victim;
+
+    while (!list_empty(&cache->lru)) {
+        /* Always detach and free the head of the list. */
+        victim = list_first_entry(&cache->lru, ec_stripe_t, lru);
+        list_del(&victim->lru);
+        GF_FREE(victim);
+    }
+
+    cache->count = 0;
+    cache->max = 0;
+}
+
+/*
+ * Forget everything cached in the ec inode context of 'inode': the stripe
+ * cache, the have_* flags, config, versions, sizes and dirty counters.
+ * Runs with inode->lock held. Nothing is done if the context cannot be
+ * obtained.
+ */
+void
+ec_clear_inode_info(ec_fop_data_t *fop, inode_t *inode)
+{
+    ec_inode_t *ctx;
+
+    LOCK(&inode->lock);
+
+    ctx = __ec_inode_get(inode, fop->xl);
+    if (ctx != NULL) {
+        ec_release_stripe_cache(ctx);
+
+        ctx->have_info = _gf_false;
+        ctx->have_config = _gf_false;
+        ctx->have_version = _gf_false;
+        ctx->have_size = _gf_false;
+
+        memset(&ctx->config, 0, sizeof(ctx->config));
+        memset(ctx->pre_version, 0, sizeof(ctx->pre_version));
+        memset(ctx->post_version, 0, sizeof(ctx->post_version));
+        ctx->pre_size = ctx->post_size = 0;
+        memset(ctx->dirty, 0, sizeof(ctx->dirty));
+    }
+
+    UNLOCK(&inode->lock);
+}
+
+/*
+ * Callback of the lookup sent by ec_get_real_size().
+ *
+ * On success the size reported in 'buf' is stored in the lock link found in
+ * fop->data. On failure the error of the lookup fop is cleared so that it
+ * does not make the parent fop fail. Always returns 0.
+ */
+int32_t
+ec_get_real_size_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+                     int32_t op_ret, int32_t op_errno, inode_t *inode,
+                     struct iatt *buf, dict_t *xdata, struct iatt *postparent)
+{
+    ec_fop_data_t *fop = cookie;
+    ec_lock_link_t *link;
+
+    if (op_ret < 0) {
+        /* Prevent failure of parent fop. */
+        fop->error = 0;
+
+        return 0;
+    }
+
+    link = fop->data;
+    link->size = buf->ia_size;
+
+    return 0;
+}
+
+/* Retrieve the trusted.ec.size xattr of the base inode of a lock link when
+ * that inode itself is not locked. The value is only needed to keep iatt
+ * structs coherent on fops that manipulate directory entries without
+ * operating directly on the inode (link, rename, ...).
+ *
+ * Nothing is done unless the link has a base loc with a regular file inode.
+ * If the size is already cached in the inode context it is taken from there;
+ * otherwise a lookup is sent.
+ *
+ * Any error processing this request is ignored. In the worst case, an invalid
+ * or not up to date value in the iatt could cause some cache invalidation.
+ */
+void
+ec_get_real_size(ec_lock_link_t *link)
+{
+    ec_fop_data_t *fop;
+    dict_t *request;
+
+    if ((link->base == NULL) || (link->base->inode == NULL)) {
+        return;
+    }
+
+    if (link->base->inode->ia_type != IA_IFREG) {
+        return;
+    }
+
+    fop = link->fop;
+
+    if (ec_get_inode_size(fop, link->base->inode, &link->size)) {
+        return;
+    }
+
+    request = dict_new();
+    if (request == NULL) {
+        return;
+    }
+
+    if (ec_dict_set_number(request, EC_XATTR_SIZE, 0) == 0) {
+        /* Send a simple lookup. A single answer is considered ok since this
+         * value is only used to return an iatt struct related to an inode
+         * that is not locked and have not suffered any operation. */
+        ec_lookup(fop->frame, fop->xl, fop->mask, 1, ec_get_real_size_cbk,
+                  link, link->base, request);
+    }
+
+    dict_unref(request);
+}
+
+/*
+ * Attach the fd of 'fop' to 'lock' when the fop uses an fd and the lock does
+ * not have one yet. This makes it possible to do fxattrop calls instead of
+ * xattrop. The fd reference is taken with __fd_ref().
+ */
+static void
+ec_lock_update_fd(ec_lock_t *lock, ec_fop_data_t *fop)
+{
+    if (!fop->use_fd) {
+        return;
+    }
+
+    if (lock->fd != NULL) {
+        return;
+    }
+
+    lock->fd = __fd_ref(fop->fd);
+}
+
+/*
+ * Check whether 'link' conflicts, according to ec_lock_conflict(), with any
+ * link that currently owns the lock. When 'waitlist_check' is true, the
+ * links in the waiting list are checked as well.
+ *
+ * Returns _gf_true on the first conflict found, _gf_false if there is none.
+ */
+static gf_boolean_t
+ec_link_has_lock_conflict(ec_lock_link_t *link, gf_boolean_t waitlist_check)
+{
+    ec_lock_link_t *other = NULL;
+
+    list_for_each_entry(other, &link->lock->owners, owner_list)
+    {
+        if (ec_lock_conflict(other, link)) {
+            return _gf_true;
+        }
+    }
+
+    if (waitlist_check) {
+        list_for_each_entry(other, &link->lock->waiting, wait_list)
+        {
+            if (ec_lock_conflict(other, link)) {
+                return _gf_true;
+            }
+        }
+    }
+
+    return _gf_false;
+}
+
+/*
+ * Promote fops from the head of lock->waiting to owners of the lock for as
+ * long as they can share it.
+ *
+ * Each promoted link is moved (through its wait_list member) to the caller
+ * supplied 'list' so that it can be processed later, is appended to
+ * lock->owners, and lock->refs_owners is incremented. The fd of the fop is
+ * offered to the lock with ec_lock_update_fd().
+ *
+ * The loop stops at the first link that cannot be promoted, or right after
+ * promoting a link that must be the only owner.
+ *
+ * ec_lock_acquired() calls this with lock->loc.inode->lock held.
+ */
+static void
+ec_lock_wake_shared(ec_lock_t *lock, struct list_head *list)
+{
+    ec_fop_data_t *fop;
+    ec_lock_link_t *link;
+    gf_boolean_t conflict = _gf_false;
+
+    while (!conflict && !list_empty(&lock->waiting)) {
+        link = list_entry(lock->waiting.next, ec_lock_link_t, wait_list);
+        fop = link->fop;
+
+        /* If lock is not acquired, at most one fop can be assigned as owner.
+         * The following fops will need to wait in the lock->waiting queue
+         * until the lock has been fully acquired. */
+        conflict = !lock->acquired;
+
+        /* If the fop is not shareable, only this fop can be assigned as owner.
+         * Other fops will need to wait until this one finishes. */
+        if (ec_link_has_lock_conflict(link, _gf_false)) {
+            conflict = _gf_true;
+        }
+
+        /* If only one fop is allowed, it can be assigned as the owner of the
+         * lock only if there weren't any other owner. */
+        if (conflict && !list_empty(&lock->owners)) {
+            break;
+        }
+
+        /* Hand the link over to the caller and register it as an owner. */
+        list_move_tail(&link->wait_list, list);
+
+        list_add_tail(&link->owner_list, &lock->owners);
+        lock->refs_owners++;
+
+        ec_lock_update_fd(lock, fop);
+    }
+}
+
+/*
+ * Account for a lock that is now usable by the fop of 'link': the fop mask
+ * is restricted to the good bricks of the lock, the locked counter is
+ * incremented, and the size/version and real size information is requested.
+ */
+static void
+ec_lock_apply(ec_lock_link_t *link)
+{
+    ec_fop_data_t *owner = link->fop;
+
+    owner->locked++;
+    owner->mask &= link->lock->good_mask;
+
+    ec_get_size_version(link);
+    ec_get_real_size(link);
+}
+
+gf_boolean_t
+ec_lock_acquire(ec_lock_link_t *link);
+
+/*
+ * Process the links collected by ec_lock_wake_shared().
+ *
+ * Each link is removed from 'list'. If its lock is already acquired, the
+ * lock is applied to the fop and ec_lock() is called for it. Otherwise the
+ * link must be the last one in the list and the acquisition of the lock is
+ * started with ec_lock_acquire(). In both cases the fop is then resumed with
+ * no error.
+ */
+static void
+ec_lock_resume_shared(struct list_head *list)
+{
+    ec_lock_link_t *link;
+
+    while (!list_empty(list)) {
+        link = list_entry(list->next, ec_lock_link_t, wait_list);
+        list_del_init(&link->wait_list);
+
+        if (link->lock->acquired) {
+            ec_lock_apply(link);
+            ec_lock(link->fop);
+        } else {
+            /* A link for a lock that is not acquired is expected to be the
+             * only one left. */
+            GF_ASSERT(list_empty(list));
+
+            ec_lock_acquire(link);
+        }
+
+        ec_resume(link->fop, 0);
+    }
+}
+
+/*
+ * Handle a lock that has just become usable by the fop of 'link'.
+ *
+ * Under the inode lock the lock is flagged as acquired, a pending
+ * contention notification is converted into a release request, the fd of
+ * the fop is offered to the lock and waiting fops that can share the lock
+ * are collected. After unlocking, the lock is applied to the fop and the
+ * collected fops are resumed.
+ */
+void
+ec_lock_acquired(ec_lock_link_t *link)
+{
+    struct list_head list;
+    ec_lock_t *lock;
+    ec_fop_data_t *fop;
+
+    lock = link->lock;
+    fop = link->fop;
+
+    ec_trace("LOCKED", fop, "lock=%p", lock);
+
+    INIT_LIST_HEAD(&list);
+
+    LOCK(&lock->loc.inode->lock);
+
+    lock->acquired = _gf_true;
+    /* Contention seen before the lock was acquired becomes a request to
+     * release it. */
+    if (lock->contention) {
+        lock->release = _gf_true;
+        lock->contention = _gf_false;
+    }
+
+    ec_lock_update_fd(lock, fop);
+    ec_lock_wake_shared(lock, &list);
+
+    UNLOCK(&lock->loc.inode->lock);
+
+    ec_lock_apply(link);
+
+    if (fop->use_fd &&
+        (link->update[EC_DATA_TXN] || link->update[EC_METADATA_TXN])) {
+        /* Try to reopen closed fd's only if lock has succeeded. */
+        ec_fix_open(fop, lock->mask);
+    }
+
+    /* The woken fops are processed outside the inode lock. */
+    ec_lock_resume_shared(&list);
+}
+
+/*
+ * Callback of the inodelk sent by ec_lock_acquire().
+ *
+ * On success the bricks where the inodelk succeeded become both the mask and
+ * the good mask of the lock, the healing mask is cleared, the lock is
+ * processed as acquired and ec_lock() is called for the parent fop. On
+ * failure the contention flag of the lock is cleared and a warning is
+ * logged. Always returns 0.
+ */
+int32_t
+ec_locked(call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret,
+          int32_t op_errno, dict_t *xdata)
+{
+    ec_fop_data_t *fop = cookie;
+    ec_lock_link_t *link = fop->data;
+    ec_lock_t *lock = link->lock;
+
+    if (op_ret < 0) {
+        LOCK(&lock->loc.inode->lock);
+        lock->contention = _gf_false;
+        UNLOCK(&lock->loc.inode->lock);
+
+        gf_msg(this->name, GF_LOG_WARNING, op_errno, EC_MSG_PREOP_LOCK_FAILED,
+               "Failed to complete preop lock");
+
+        return 0;
+    }
+
+    lock->good_mask = fop->good;
+    lock->mask = lock->good_mask;
+    lock->healing = 0;
+
+    ec_lock_acquired(link);
+    ec_lock(fop->parent);
+
+    return 0;
+}
+
+/*
+ * Acquire the lock referenced by 'link' for its fop.
+ *
+ * If the lock is already acquired it is reused immediately and _gf_true is
+ * returned. Otherwise a blocking write inodelk (F_SETLKW / F_WRLCK) is sent,
+ * using the lock pointer as lock owner and ec_locked() as callback, and
+ * _gf_false is returned.
+ */
+gf_boolean_t
+ec_lock_acquire(ec_lock_link_t *link)
+{
+    ec_lock_t *lock = link->lock;
+    ec_fop_data_t *fop = link->fop;
+    gf_lkowner_t owner;
+
+    if (lock->acquired) {
+        ec_trace("LOCK_REUSE", fop, "lock=%p", lock);
+
+        ec_lock_acquired(link);
+
+        return _gf_true;
+    }
+
+    set_lk_owner_from_ptr(&owner, lock);
+
+    ec_trace("LOCK_ACQUIRE", fop, "lock=%p, inode=%p", lock,
+             lock->loc.inode);
+
+    lock->flock.l_type = F_WRLCK;
+    ec_inodelk(fop->frame, fop->xl, &owner, -1, EC_MINIMUM_ALL, ec_locked,
+               link, fop->xl->name, &lock->loc, F_SETLKW, &lock->flock, NULL);
+
+    return _gf_false;
+}
+
+/* Cancels the delayed unlock timer of 'lock', if any.
+ *
+ * The callers in this file invoke it while holding the inode mutex of
+ * 'lock->loc.inode'. On return 'lock->timer' is always NULL.
+ *
+ * Returns the link attached to the timer if the timer has been successfully
+ * cancelled, or NULL if there was no timer or it was too late to cancel
+ * it. */
+static ec_lock_link_t *
+ec_lock_timer_cancel(xlator_t *xl, ec_lock_t *lock)
+{
+    ec_lock_link_t *timer_link;
+
+    /* If we don't have any timer, there's nothing to cancel. */
+    if (lock->timer == NULL) {
+        return NULL;
+    }
+
+    /* We are trying to access a lock that has an unlock timer active.
+     * This means that the lock must be idle, i.e. no fop can be in the
+     * owner, waiting or frozen lists. It also means that the lock cannot
+     * have been marked as being released (this is done without timers).
+     * There should only be one owner reference, but it's possible that
+     * some fops are being prepared to use this lock. */
+    GF_ASSERT((lock->refs_owners == 1) && list_empty(&lock->owners) &&
+              list_empty(&lock->waiting));
+
+    /* We take the timer_link before cancelling the timer, since a
+     * successful cancellation will destroy it. It must not be NULL
+     * because it references the fop responsible for the delayed unlock
+     * that we are currently trying to cancel. */
+    timer_link = lock->timer->data;
+    GF_ASSERT(timer_link != NULL);
+
+    if (gf_timer_call_cancel(xl->ctx, lock->timer) < 0) {
+        /* It's too late to avoid the execution of the timer callback.
+         * Since we need to be sure that the callback has access to all
+         * needed resources, we cannot resume the execution of the
+         * timer fop now. This will be done in the callback. */
+        timer_link = NULL;
+    } else {
+        /* The timer has been cancelled. The fop referenced by
+         * timer_link holds the last reference. The caller is
+         * responsible to release it when not needed anymore. */
+        ec_trace("UNLOCK_CANCELLED", timer_link->fop, "lock=%p", lock);
+    }
+
+    /* We have two options here:
+     *
+     * 1. The timer has been successfully cancelled.
+     *
+     *    This is the easiest case and we can continue with the currently
+     *    acquired lock.
+     *
+     * 2. The timer callback has already been fired.
+     *
+     *    In this case we have not been able to cancel the timer before
+     *    the timer callback has been fired, but we also know that
+     *    lock->timer != NULL. This means that the timer callback is still
+     *    trying to acquire the inode mutex that we currently own. We are
+     *    safe until we release it. In this case we can safely clear
+     *    lock->timer. This will cause that the timer callback does nothing
+     *    once it acquires the mutex.
+     */
+    lock->timer = NULL;
+
+    return timer_link;
+}
+
+/* Tries to make 'link->fop' an owner of 'link->lock'.
+ *
+ * Returns _gf_true if the link has been added to the owners list of the
+ * lock. Returns _gf_false if the link has been queued in the waiting or
+ * frozen list instead; in that case the fop has been put to sleep with
+ * ec_sleep(). */
+static gf_boolean_t
+ec_lock_assign_owner(ec_lock_link_t *link)
+{
+    ec_fop_data_t *fop;
+    ec_lock_t *lock;
+    ec_lock_link_t *timer_link = NULL;
+    gf_boolean_t assigned = _gf_false;
+
+    /* The link cannot be in any list because we have just finished preparing
+     * it. */
+    GF_ASSERT(list_empty(&link->wait_list));
+
+    fop = link->fop;
+    lock = link->lock;
+
+    LOCK(&lock->loc.inode->lock);
+
+    /* Since the link has just been prepared but it's not active yet, the
+     * refs_pending must be one at least (the ref owned by this link). */
+    GF_ASSERT(lock->refs_pending > 0);
+    /* The link is not pending any more. It will be assigned to the owner,
+     * waiting or frozen list. */
+    lock->refs_pending--;
+
+    if (lock->release) {
+        ec_trace("LOCK_QUEUE_FREEZE", fop, "lock=%p", lock);
+
+        /* When lock->release is set, we'll unlock the lock as soon as
+         * possible, meaning that we won't use a timer. */
+        GF_ASSERT(lock->timer == NULL);
+
+        /* The lock is marked to be released. We can still have owners and fops
+         * in the waiting list if they have been added before the lock has been
+         * marked to be released. However new fops are put into the frozen list
+         * to wait for the next unlock/lock cycle. */
+        list_add_tail(&link->wait_list, &lock->frozen);
+
+        goto unlock;
+    }
+
+    /* The lock is not marked to be released, so the frozen list should be
+     * empty. */
+    GF_ASSERT(list_empty(&lock->frozen));
+
+    timer_link = ec_lock_timer_cancel(fop->xl, lock);
+
+    if (!list_empty(&lock->owners)) {
+        /* There are other owners of this lock. We can only take ownership if
+         * the lock is already acquired and doesn't have conflict with existing
+         * owners, or waiters(to prevent starvation).
+         * Otherwise we need to wait.
+         */
+        if (!lock->acquired || ec_link_has_lock_conflict(link, _gf_true)) {
+            ec_trace("LOCK_QUEUE_WAIT", fop, "lock=%p", lock);
+
+            list_add_tail(&link->wait_list, &lock->waiting);
+
+            goto unlock;
+        }
+    }
+
+    list_add_tail(&link->owner_list, &lock->owners);
+
+    /* If timer_link is not NULL, it means that we have inherited the owner
+     * reference assigned to the timer fop. In this case we simply reuse it.
+     * Otherwise we need to increase the number of owners. */
+    if (timer_link == NULL) {
+        lock->refs_owners++;
+    }
+
+    assigned = _gf_true;
+
+unlock:
+    if (!assigned) {
+        /* We have not been able to take ownership of this lock. The fop must
+         * be put to sleep. */
+        ec_sleep(fop);
+    }
+
+    UNLOCK(&lock->loc.inode->lock);
+
+    /* If we have cancelled the timer, we need to resume the fop that was
+     * waiting for it. */
+    if (timer_link != NULL) {
+        ec_resume(timer_link->fop, 0);
+    }
+
+    return assigned;
+}
+
+/* Removes 'link' from the owners list of its lock once its fop has finished
+ * using it, and updates the state of the lock accordingly.
+ *
+ * 'cbk' is the answer of the fop (it can be NULL) and 'release' tells if
+ * the lock must be marked to be released. It's called from ec_lock_reuse()
+ * for each lock of the fop. */
+static void
+ec_lock_next_owner(ec_lock_link_t *link, ec_cbk_data_t *cbk,
+                   gf_boolean_t release)
+{
+    struct list_head list;
+    ec_lock_t *lock = link->lock;
+    ec_fop_data_t *fop = link->fop;
+    ec_inode_t *ctx = lock->ctx;
+
+    INIT_LIST_HEAD(&list);
+
+    LOCK(&lock->loc.inode->lock);
+
+    ec_trace("LOCK_DONE", fop, "lock=%p", lock);
+
+    /* Current link must belong to the owner list of the lock. We don't
+     * decrement lock->refs_owners here because the inode mutex is released
+     * before ec_unlock() is called and we need to know when the last owner
+     * unlocks the lock to do proper cleanup. lock->refs_owners is used for
+     * this task. */
+    GF_ASSERT((lock->refs_owners > 0) && !list_empty(&link->owner_list));
+    list_del_init(&link->owner_list);
+
+    lock->release |= release;
+
+    /* Versions are only incremented if the fop has succeeded. Indexes 0 and
+     * 1 are the values of EC_DATA_TXN and EC_METADATA_TXN respectively. */
+    if ((fop->error == 0) && (cbk != NULL) && (cbk->op_ret >= 0)) {
+        if (link->update[0]) {
+            ctx->post_version[0]++;
+        }
+        if (link->update[1]) {
+            ctx->post_version[1]++;
+        }
+        /* If the fop fails on any of the good bricks, it is important to mark
+         * it dirty and update versions right away. */
+        if (link->update[0] || link->update[1]) {
+            if (lock->good_mask & ~(fop->good | fop->remaining)) {
+                lock->release = _gf_true;
+            }
+        }
+    }
+
+    if (fop->healing) {
+        lock->healing = fop->healing & (fop->good | fop->remaining);
+    }
+    ec_lock_update_good(lock, fop);
+
+    /* NOTE(review): presumably this promotes waiting fops that can now use
+     * the lock and collects them into 'list' - confirm against
+     * ec_lock_wake_shared(). They are processed after releasing the inode
+     * mutex. */
+    ec_lock_wake_shared(lock, &list);
+
+    UNLOCK(&lock->loc.inode->lock);
+
+    ec_lock_resume_shared(&list);
+}
+
+/* Tries to take ownership of, and acquire, the locks prepared for 'fop'.
+ *
+ * The loop stops as soon as one lock cannot be owned right now
+ * (ec_lock_assign_owner() returns _gf_false) or needs an inodelk request
+ * to be acquired (ec_lock_acquire() returns _gf_false). In the latter case
+ * ec_locked() calls ec_lock() again for this fop when the request
+ * succeeds. */
+void
+ec_lock(ec_fop_data_t *fop)
+{
+    ec_lock_link_t *link;
+
+    /* There is a chance that ec_resume is called on fop even before ec_sleep.
+     * Which can result in refs == 0 for fop leading to use after free in this
+     * function when it calls ec_sleep so do ec_sleep at start and ec_resume at
+     * the end of this function.*/
+    ec_sleep(fop);
+
+    /* NOTE(review): 'fop->locked' is not modified in this function; it is
+     * presumably advanced by ec_lock_apply() when a lock gets acquired -
+     * confirm. */
+    while (fop->locked < fop->lock_count) {
+        /* Since there are only up to 2 locks per fop, this xor will change
+         * the order of the locks if fop->first_lock is 1. */
+        link = &fop->locks[fop->locked ^ fop->first_lock];
+
+        if (!ec_lock_assign_owner(link) || !ec_lock_acquire(link)) {
+            break;
+        }
+    }
+
+    ec_resume(fop, 0);
+}
+
+/* Finishes an unlock cycle of 'link->lock'.
+ *
+ * The lock is reset to the "not acquired, not being released" state. Fops
+ * that were queued in the frozen list are moved to the waiting list. If
+ * nothing is waiting for the lock and no fop is being prepared to use it,
+ * the lock is detached from the inode context and destroyed. */
+void
+ec_lock_unfreeze(ec_lock_link_t *link)
+{
+    struct list_head list;
+    ec_lock_t *lock;
+    gf_boolean_t destroy = _gf_false;
+
+    lock = link->lock;
+
+    INIT_LIST_HEAD(&list);
+
+    LOCK(&lock->loc.inode->lock);
+
+    /* The lock must be marked to be released here, since we have just released
+     * it and any attempt to assign it to more fops must have added them to the
+     * frozen list. We can only have one active reference here: the one that
+     * is processing this unfreeze. */
+    GF_ASSERT(lock->release && (lock->refs_owners == 1));
+    lock->release = _gf_false;
+    lock->refs_owners = 0;
+
+    lock->acquired = _gf_false;
+
+    /* We are unfreezing a lock. This means that the lock has already been
+     * released. In this state it shouldn't have a pending timer nor have any
+     * owner, and the waiting list should be empty. Only the frozen list can
+     * contain some fop. */
+    GF_ASSERT((lock->timer == NULL) && list_empty(&lock->waiting) &&
+              list_empty(&lock->owners));
+
+    /* We move all frozen fops to the waiting list. */
+    list_splice_init(&lock->frozen, &lock->waiting);
+
+    /* If we don't have any fop waiting nor there are any prepared fops using
+     * this lock, we can finally dispose it. */
+    destroy = list_empty(&lock->waiting) && (lock->refs_pending == 0);
+    if (destroy) {
+        ec_trace("LOCK_DESTROY", link->fop, "lock=%p", lock);
+
+        /* Detach the lock from the inode context while still holding the
+         * inode mutex. The lock itself is destroyed after releasing it. */
+        lock->ctx->inode_lock = NULL;
+    } else {
+        ec_trace("LOCK_UNFREEZE", link->fop, "lock=%p", lock);
+
+        ec_lock_wake_shared(lock, &list);
+    }
+
+    UNLOCK(&lock->loc.inode->lock);
+
+    /* 'list' is still empty here if the lock is going to be destroyed. */
+    ec_lock_resume_shared(&list);
+
+    if (destroy) {
+        ec_lock_destroy(lock);
+    }
+}
+
+/* Callback of the unlock request sent by ec_unlock_lock().
+ *
+ * 'cookie' is the fop of the unlock request, whose 'data' field references
+ * the link being unlocked. A failure is only logged: the lock is unfrozen
+ * in both cases. Always returns 0. */
+int32_t
+ec_unlocked(call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret,
+            int32_t op_errno, dict_t *xdata)
+{
+    ec_fop_data_t *unlock_fop = cookie;
+    ec_lock_link_t *unlocked = unlock_fop->data;
+
+    if (op_ret >= 0) {
+        ec_trace("UNLOCKED", unlocked->fop, "lock=%p", unlocked->lock);
+    } else {
+        gf_msg(this->name, GF_LOG_WARNING, op_errno, EC_MSG_UNLOCK_FAILED,
+               "entry/inode unlocking failed :(%s)",
+               ec_msg_str(unlocked->fop));
+    }
+
+    /* The unlock cycle is completed regardless of the result. */
+    ec_lock_unfreeze(unlocked);
+
+    return 0;
+}
+
+/* Releases the lock referenced by 'link'.
+ *
+ * If the lock is acquired and has a non-empty brick mask, an unlock
+ * request is sent and ec_unlocked() will complete the cycle. Otherwise
+ * there's nothing to unlock and the lock is unfrozen immediately. */
+void
+ec_unlock_lock(ec_lock_link_t *link)
+{
+    ec_lock_t *lock;
+    ec_fop_data_t *fop;
+    gf_lkowner_t lk_owner;
+
+    lock = link->lock;
+    fop = link->fop;
+
+    /* The pending unlock request (see ec_unlock_now()) is being served. */
+    lock->unlock_now = _gf_false;
+    ec_clear_inode_info(fop, lock->loc.inode);
+
+    if ((lock->mask != 0) && lock->acquired) {
+        /* Same lock owner that was used by ec_lock_acquire(). */
+        set_lk_owner_from_ptr(&lk_owner, lock);
+        lock->flock.l_type = F_UNLCK;
+        ec_trace("UNLOCK_INODELK", fop, "lock=%p, inode=%p", lock,
+                 lock->loc.inode);
+
+        ec_inodelk(fop->frame, fop->xl, &lk_owner, lock->mask, EC_MINIMUM_ONE,
+                   ec_unlocked, link, fop->xl->name, &lock->loc, F_SETLK,
+                   &lock->flock, NULL);
+    } else {
+        ec_lock_unfreeze(link);
+    }
+}
+
+/* Increments the 'bad_version' counter of the ec context of 'inode'.
+ *
+ * The context is obtained with __ec_inode_get() while holding the inode
+ * mutex. Nothing is modified if it returns NULL. */
+void
+ec_inode_bad_inc(inode_t *inode, xlator_t *xl)
+{
+    ec_inode_t *inode_ctx;
+
+    LOCK(&inode->lock);
+
+    inode_ctx = __ec_inode_get(inode, xl);
+    if (inode_ctx != NULL) {
+        inode_ctx->bad_version++;
+    }
+
+    UNLOCK(&inode->lock);
+}
+
+/* Callback of the xattrop/fxattrop sent by ec_update_size_version().
+ *
+ * 'cookie' is the fop of the xattrop request, whose 'data' field references
+ * the link that was passed to ec_xattrop()/ec_fxattrop(). On success the
+ * values returned are stored in the inode context as the new reference
+ * version and size. If an unlock was waiting for this update
+ * (lock->unlock_now), the lock is released now. Always returns 0. */
+int32_t
+ec_update_size_version_done(call_frame_t *frame, void *cookie, xlator_t *this,
+                            int32_t op_ret, int32_t op_errno, dict_t *xattr,
+                            dict_t *xdata)
+{
+    ec_fop_data_t *fop = cookie;
+    ec_lock_link_t *link;
+    ec_lock_t *lock;
+    ec_inode_t *ctx;
+
+    link = fop->data;
+    lock = link->lock;
+    ctx = lock->ctx;
+
+    if (op_ret < 0) {
+        /* Use the same inode that was the target of the request: the one
+         * from the loc for xattrop and the one from the fd for fxattrop. */
+        if (link->lock->fd == NULL) {
+            ec_inode_bad_inc(link->lock->loc.inode, this);
+        } else {
+            ec_inode_bad_inc(link->lock->fd->inode, this);
+        }
+
+        gf_msg(fop->xl->name, fop_log_level(fop->id, op_errno), op_errno,
+               EC_MSG_SIZE_VERS_UPDATE_FAIL,
+               "Failed to update version and size. %s", ec_msg_str(fop));
+    } else {
+        /* Bricks where the update has failed are not good for the parent
+         * fop either. */
+        fop->parent->good &= fop->good;
+
+        ec_lock_update_good(lock, fop);
+
+        /* The values read become both the 'pre' and 'post' values, so there
+         * are no pending differences to send. */
+        if (ec_dict_del_array(xattr, EC_XATTR_VERSION, ctx->post_version,
+                              EC_VERSION_SIZE) == 0) {
+            ctx->pre_version[0] = ctx->post_version[0];
+            ctx->pre_version[1] = ctx->post_version[1];
+
+            ctx->have_version = _gf_true;
+        }
+        if (ec_dict_del_number(xattr, EC_XATTR_SIZE, &ctx->post_size) == 0) {
+            ctx->pre_size = ctx->post_size;
+
+            ctx->have_size = _gf_true;
+        }
+        /* NOTE(review): the config is read from 'xdata' whereas version and
+         * size are read from 'xattr' - confirm this is intended. */
+        if ((ec_dict_del_config(xdata, EC_XATTR_CONFIG, &ctx->config) == 0) &&
+            ec_config_check(fop->xl, &ctx->config)) {
+            ctx->have_config = _gf_true;
+        }
+
+        ctx->have_info = _gf_true;
+    }
+    /* If we got here because of a fop other than the unlock request, we are
+     * still holding the lock, which guarantees that lock->unlock_now cannot
+     * be modified.
+     */
+    if (lock->unlock_now) {
+        ec_unlock_lock(fop->data);
+    }
+
+    return 0;
+}
+
+/* Sends an xattrop (or fxattrop if the lock has an fd) to update the
+ * version, size and dirty xattrs of the inode protected by 'link->lock'.
+ *
+ * 'version' and 'dirty' are arrays of two elements each. 'version', 'size'
+ * and 'dirty' are sent using GF_XATTROP_ADD_ARRAY64. The answer is processed
+ * by ec_update_size_version_done().
+ *
+ * If the request cannot be built, the error is recorded in the fop and, if
+ * an unlock was waiting for this update (lock->unlock_now), the lock is
+ * released immediately. */
+void
+ec_update_size_version(ec_lock_link_t *link, uint64_t *version, uint64_t size,
+                       uint64_t *dirty)
+{
+    ec_fop_data_t *fop;
+    ec_lock_t *lock;
+    ec_inode_t *ctx;
+    dict_t *dict = NULL;
+    uintptr_t update_on = 0;
+    /* Error reported if the allocation of the dictionary fails. */
+    int32_t err = -ENOMEM;
+
+    fop = link->fop;
+    lock = link->lock;
+    ctx = lock->ctx;
+
+    ec_trace("UPDATE", fop, "version=%ld/%ld, size=%ld, dirty=%ld/%ld",
+             version[0], version[1], size, dirty[0], dirty[1]);
+
+    dict = dict_new();
+    if (dict == NULL) {
+        goto out;
+    }
+
+    /* If we don't have version information or it has been modified, we
+     * update it. */
+    if (!ctx->have_version || (version[0] != 0) || (version[1] != 0)) {
+        err = ec_dict_set_array(dict, EC_XATTR_VERSION, version,
+                                EC_VERSION_SIZE);
+        if (err != 0) {
+            goto out;
+        }
+    }
+
+    if (size != 0) {
+        /* If size has been changed, we should already
+         * know the previous size of the file. */
+        GF_ASSERT(ctx->have_size);
+
+        err = ec_dict_set_number(dict, EC_XATTR_SIZE, size);
+        if (err != 0) {
+            goto out;
+        }
+    }
+
+    if (dirty[0] || dirty[1]) {
+        err = ec_dict_set_array(dict, EC_XATTR_DIRTY, dirty, EC_VERSION_SIZE);
+        if (err != 0) {
+            goto out;
+        }
+    }
+
+    /* If config information is not known, we request it now. */
+    if ((lock->loc.inode->ia_type == IA_IFREG) && !ctx->have_config) {
+        /* A failure requesting this xattr is ignored because it's not
+         * absolutely required right now. */
+        (void)ec_dict_set_number(dict, EC_XATTR_CONFIG, 0);
+    }
+
+    /* The request is sent with uid/gid 0. The credentials of the fop are
+     * restored right after winding it. */
+    fop->frame->root->uid = 0;
+    fop->frame->root->gid = 0;
+
+    update_on = lock->good_mask | lock->healing;
+
+    if (link->lock->fd == NULL) {
+        ec_xattrop(fop->frame, fop->xl, update_on, EC_MINIMUM_MIN,
+                   ec_update_size_version_done, link, &link->lock->loc,
+                   GF_XATTROP_ADD_ARRAY64, dict, NULL);
+    } else {
+        ec_fxattrop(fop->frame, fop->xl, update_on, EC_MINIMUM_MIN,
+                    ec_update_size_version_done, link, link->lock->fd,
+                    GF_XATTROP_ADD_ARRAY64, dict, NULL);
+    }
+
+    fop->frame->root->uid = fop->uid;
+    fop->frame->root->gid = fop->gid;
+
+    dict_unref(dict);
+
+    return;
+
+out:
+    if (dict != NULL) {
+        dict_unref(dict);
+    }
+
+    /* 'err' contains a negative error code at this point. */
+    ec_fop_set_error(fop, -err);
+
+    gf_msg(fop->xl->name, GF_LOG_ERROR, -err, EC_MSG_SIZE_VERS_UPDATE_FAIL,
+           "Unable to update version and size. %s", ec_msg_str(fop));
+
+    if (lock->unlock_now) {
+        /* We must unlock through 'link' itself. Here 'fop' is link->fop (the
+         * fop that owns the link), not the fop of an xattrop request, so
+         * nothing guarantees that fop->data references this link (as it
+         * does in ec_update_size_version_done()). */
+        ec_unlock_lock(link);
+    }
+}
+
+/* Computes the pending version, size and dirty changes of the inode
+ * protected by 'link->lock' and sends them if there is anything to update.
+ *
+ * Returns _gf_true if ec_update_size_version() has been called, in which
+ * case the update is completed in its callback. Returns _gf_false if there
+ * was nothing to update. */
+gf_boolean_t
+ec_update_info(ec_lock_link_t *link)
+{
+    ec_lock_t *lock;
+    ec_inode_t *ctx;
+    uint64_t version[2] = {0, 0};
+    uint64_t dirty[2] = {0, 0};
+    uint64_t size;
+    ec_t *ec = NULL;
+    uintptr_t mask;
+
+    lock = link->lock;
+    ctx = lock->ctx;
+    ec = link->fop->xl->private;
+
+    /* pre_version[*] will be 0 if have_version is false */
+    version[EC_DATA_TXN] = ctx->post_version[EC_DATA_TXN] -
+                           ctx->pre_version[EC_DATA_TXN];
+    version[EC_METADATA_TXN] = ctx->post_version[EC_METADATA_TXN] -
+                               ctx->pre_version[EC_METADATA_TXN];
+
+    /* Only the differences are sent: ec_update_size_version() uses an
+     * additive xattrop (GF_XATTROP_ADD_ARRAY64). */
+    size = ctx->post_size - ctx->pre_size;
+    /* If we set the dirty flag for update fop, we have to unset it.
+     * If fop has failed on some bricks, leave the dirty as marked. */
+
+    if (lock->unlock_now) {
+        if (version[EC_DATA_TXN]) {
+            /* A data fop will have a difference between post and pre version,
+             * and for data fops we send writes to healing bricks also. */
+            mask = lock->good_mask | lock->healing;
+        } else {
+            mask = lock->good_mask;
+        }
+        /* Ensure that nodes are up while doing final
+         * metadata update.*/
+        if (!(ec->node_mask & ~(mask)) && !(ec->node_mask & ~ec->xl_up)) {
+            /* All bricks are included in 'mask' and all of them are up. A
+             * value of -1 is added to each dirty counter that was set. */
+            if (ctx->dirty[EC_DATA_TXN] != 0) {
+                dirty[EC_DATA_TXN] = -1;
+            }
+            if (ctx->dirty[EC_METADATA_TXN] != 0) {
+                dirty[EC_METADATA_TXN] = -1;
+            }
+            /* If everything is fine and we already have the version xattr
+             * set on entry, there is no need to update the version again. */
+            if (ctx->pre_version[EC_DATA_TXN]) {
+                version[EC_DATA_TXN] = 0;
+            }
+            if (ctx->pre_version[EC_METADATA_TXN]) {
+                version[EC_METADATA_TXN] = 0;
+            }
+        } else {
+            link->optimistic_changelog = _gf_false;
+            ec_set_dirty_flag(link, ctx, dirty);
+        }
+        memset(ctx->dirty, 0, sizeof(ctx->dirty));
+    }
+
+    if ((version[EC_DATA_TXN] != 0) || (version[EC_METADATA_TXN] != 0) ||
+        (dirty[EC_DATA_TXN] != 0) || (dirty[EC_METADATA_TXN] != 0)) {
+        ec_update_size_version(link, version, size, dirty);
+        return _gf_true;
+    }
+
+    return _gf_false;
+}
+
+/* Starts the release of the lock referenced by 'link' without any delay.
+ *
+ * Pending version/size/dirty changes are sent first. In that case the
+ * unlock itself is issued when that update finishes (see
+ * ec_update_size_version_done() and ec_update_size_version()). If there's
+ * nothing to update, the lock is released right away. Finally the fop of
+ * the link is resumed. */
+void
+ec_unlock_now(ec_lock_link_t *link)
+{
+    ec_lock_t *lock;
+    lock = link->lock;
+
+    ec_trace("UNLOCK_NOW", link->fop, "lock=%p", link->lock);
+    /* At this point, the lock is not being used by any fop and cannot be
+     * reused by any fop as it is going to be released.
+     * lock->unlock_now cannot be modified at any other place.
+     */
+    lock->unlock_now = _gf_true;
+
+    if (!ec_update_info(link)) {
+        ec_unlock_lock(link);
+    }
+
+    ec_resume(link->fop, 0);
+}
+
+/* Requests the release of the lock currently associated with 'inode', if
+ * any, because contention has been detected on it.
+ *
+ * Nothing is done if the inode has no ec context, has no lock, or the lock
+ * is already marked to be released. If the lock is still being acquired,
+ * the request is only recorded in lock->contention. Otherwise the lock is
+ * marked to be released and, if a delayed unlock timer could be cancelled,
+ * the unlock is started immediately. */
+void
+ec_lock_release(ec_t *ec, inode_t *inode)
+{
+    ec_lock_t *lock;
+    ec_inode_t *ctx;
+    ec_lock_link_t *timer_link = NULL;
+
+    LOCK(&inode->lock);
+
+    ctx = __ec_inode_get(inode, ec->xl);
+    if (ctx == NULL) {
+        goto done;
+    }
+    lock = ctx->inode_lock;
+    if ((lock == NULL) || lock->release) {
+        goto done;
+    }
+
+    gf_msg_debug(ec->xl->name, 0, "Releasing inode %p due to lock contention",
+                 inode);
+
+    if (!lock->acquired) {
+        /* This happens if some bricks already got the lock while inodelk is in
+         * progress. 'release' will be set to true once the lock is acquired
+         * (see ec_lock_acquired()). */
+        lock->contention = _gf_true;
+        goto done;
+    }
+
+    /* The lock is not marked to be released, so the frozen list should be
+     * empty. */
+    GF_ASSERT(list_empty(&lock->frozen));
+
+    timer_link = ec_lock_timer_cancel(ec->xl, lock);
+
+    /* We mark the lock to be released as soon as possible. */
+    lock->release = _gf_true;
+
+done:
+    UNLOCK(&inode->lock);
+
+    /* If we have cancelled the timer, we need to start the unlock of the
+     * inode. If there was a timer but we have been unable to cancel it
+     * because it was just triggered, the timer callback will take care
+     * of releasing the inode. */
+    if (timer_link != NULL) {
+        ec_unlock_now(timer_link);
+    }
+}
+
+void
+ec_unlock_timer_add(ec_lock_link_t *link);
+
+/* Processes the expiration of the delayed unlock timer attached to 'link'.
+ *
+ * If the timer is still registered in the lock, the lock is marked to be
+ * released and the unlock is started. Otherwise the timer was cancelled
+ * after having been fired: the owner reference held on behalf of the timer
+ * is dropped through ec_unlock_timer_add() and the fop is resumed. */
+void
+ec_unlock_timer_del(ec_lock_link_t *link)
+{
+    ec_lock_t *lock;
+    inode_t *inode;
+    gf_boolean_t now = _gf_false;
+
+    /* If we are here, it means that the timer has expired before having
+     * been cancelled. This guarantees that 'link' is still valid because
+     * the fop that contains it must be pending (if timer cancellation in
+     * ec_lock_assign_owner() fails, the fop is left sleeping).
+     *
+     * At the same time, the fop still has a reference to the lock, so
+     * it must also be valid.
+     */
+    lock = link->lock;
+
+    /* 'lock' must have a valid inode since it can only be destroyed
+     * when the lock itself is destroyed, but we have a reference to the
+     * lock to avoid this.
+     */
+    inode = lock->loc.inode;
+
+    LOCK(&inode->lock);
+
+    if (lock->timer != NULL) {
+        ec_trace("UNLOCK_DELAYED", link->fop, "lock=%p", lock);
+
+        /* The unlock timer has expired without anyone cancelling it.
+         * This means that it shouldn't have any owner, and the waiting
+         * and frozen lists should be empty.  It must have only one
+         * owner reference, but there can be fops being prepared
+         * though.
+         * */
+        GF_ASSERT(!lock->release && (lock->refs_owners == 1) &&
+                  list_empty(&lock->owners) && list_empty(&lock->waiting) &&
+                  list_empty(&lock->frozen));
+
+        gf_timer_call_cancel(link->fop->xl->ctx, lock->timer);
+        lock->timer = NULL;
+
+        /* Any fop being processed from now on, will need to wait
+         * until the next unlock/lock cycle. */
+        lock->release = now = _gf_true;
+    }
+
+    UNLOCK(&inode->lock);
+
+    if (now) {
+        ec_unlock_now(link);
+    } else {
+        /* The timer has been cancelled just after firing it but before
+         * getting here. This means that another fop has used the lock
+         * and everything should be handled as if this callback had not
+         * been executed. However we still have an owner reference.
+         *
+         * We need to release our reference. If this is not the last
+         * reference (the most common case because another fop has
+         * taken another ref) we only need to decrement the counter.
+         * Otherwise we have been delayed enough so that the other fop
+         * has had time to acquire the reference, do its operation and
+         * release it. At the time of releasing it, the fop found
+         * that the ref counter was > 1 (our reference), so the delayed
+         * unlock timer wasn't started. We need to start it again if we
+         * are the last reference.
+         *
+         * ec_unlock_timer_add() handles both cases.
+         */
+        ec_unlock_timer_add(link);
+
+        /* We need to resume the fop that was waiting for the delayed
+         * unlock.
+         */
+        ec_resume(link->fop, 0);
+    }
+}
+
+/* Timer callback registered by ec_lock_delay_create(). 'data' is the
+ * ec_lock_link_t that was passed to gf_timer_call_after(). */
+void
+ec_unlock_timer_cbk(void *data)
+{
+    ec_unlock_timer_del(data);
+}
+
+/* Tells whether eager locking applies to 'fop'.
+ *
+ * The answer depends on the type of the inode referenced by the first lock
+ * of the fop: 'ec->eager_lock' is used for regular files and
+ * 'ec->other_eager_lock' for anything else. */
+static gf_boolean_t
+ec_eager_lock_used(ec_t *ec, ec_fop_data_t *fop)
+{
+    /* A fop without locks at this point has been sent as a sub-fop of
+     * another higher level fop. We simply assume that the parent fop will
+     * take correct care of the eager lock. */
+    gf_boolean_t used = _gf_true;
+
+    if (fop->lock_count != 0) {
+        /* Only the rename fop can have more than one lock, and both of them
+         * reference an inode of the same type (a directory in that case),
+         * so checking the first lock is enough. */
+        if (fop->locks[0].lock->loc.inode->ia_type == IA_IFREG) {
+            used = ec->eager_lock;
+        } else {
+            used = ec->other_eager_lock;
+        }
+    }
+
+    return used;
+}
+
+/* Returns the delay, in seconds, to apply to the unlock of 'lock': the
+ * value configured for regular files or the one configured for any other
+ * type of inode. */
+static uint32_t
+ec_eager_lock_timeout(ec_t *ec, ec_lock_t *lock)
+{
+    uint32_t secs;
+
+    if (lock->loc.inode->ia_type != IA_IFREG) {
+        secs = ec->other_eager_lock_timeout;
+    } else {
+        secs = ec->eager_lock_timeout;
+    }
+
+    return secs;
+}
+
+/* Creates the timer that will delay the unlock of 'link->lock'.
+ *
+ * The timer is stored in lock->timer and calls ec_unlock_timer_cbk() with
+ * 'link' as its argument when it expires. Returns _gf_true on success or
+ * _gf_false (after logging a warning) if the timer cannot be created. */
+static gf_boolean_t
+ec_lock_delay_create(ec_lock_link_t *link)
+{
+    struct timespec timeout;
+    ec_lock_t *lock = link->lock;
+    ec_fop_data_t *owner = link->fop;
+
+    /* The configured timeout has a resolution of seconds. */
+    timeout.tv_nsec = 0;
+    timeout.tv_sec = ec_eager_lock_timeout(owner->xl->private, lock);
+
+    lock->timer = gf_timer_call_after(owner->xl->ctx, timeout,
+                                      ec_unlock_timer_cbk, link);
+    if (lock->timer != NULL) {
+        return _gf_true;
+    }
+
+    gf_msg(owner->xl->name, GF_LOG_WARNING, ENOMEM, EC_MSG_UNLOCK_DELAY_FAILED,
+           "Unable to delay an unlock");
+
+    return _gf_false;
+}
+
+/* Drops the owner reference that 'link->fop' holds on 'link->lock'.
+ *
+ * Three cases are handled:
+ *
+ * - There are more owner references: only the counter is decremented.
+ * - This is the last reference and the lock is acquired: the unlock is
+ *   either delayed with a timer or started immediately.
+ * - This is the last reference and the lock is not acquired: the lock is
+ *   unfrozen directly. */
+void
+ec_unlock_timer_add(ec_lock_link_t *link)
+{
+    ec_fop_data_t *fop = link->fop;
+    ec_lock_t *lock = link->lock;
+    gf_boolean_t now = _gf_false;
+
+    LOCK(&lock->loc.inode->lock);
+
+    /* We are trying to unlock the lock. We can have multiple scenarios here,
+     * but all of them need to have lock->timer == NULL:
+     *
+     * 1. There are other owners currently running that can call ec_unlock().
+     *
+     *    None of them can have started the timer until the last one. But this
+     *    call should be the consequence of this last one.
+     *
+     * 2. There are fops in the waiting or frozen lists.
+     *
+     *    These fops cannot call ec_unlock(). So we should be here.
+     *
+     * We must reach here with at least one owner reference.
+     */
+    GF_ASSERT((lock->timer == NULL) && (lock->refs_owners > 0));
+
+    /* If the fop detects that a heal is needed, we mark the lock to be
+     * released as soon as possible. */
+    lock->release |= ec_fop_needs_heal(fop);
+
+    if (lock->refs_owners > 1) {
+        ec_trace("UNLOCK_SKIP", fop, "lock=%p", lock);
+
+        /* If there are other owners we cannot do anything else with the lock.
+         * Note that the current fop has already been removed from the owners
+         * list in ec_lock_reuse(). */
+        lock->refs_owners--;
+
+        UNLOCK(&lock->loc.inode->lock);
+    } else if (lock->acquired) {
+        /* There are no other owners and the lock is acquired. If there were
+         * fops waiting, at least one of them should have been promoted to an
+         * owner, so the waiting list should be empty. */
+        GF_ASSERT(list_empty(&lock->owners) && list_empty(&lock->waiting));
+
+        ec_t *ec = fop->xl->private;
+
+        /* If everything goes as expected this fop will be put to sleep until
+         * the timer callback is executed. */
+        ec_sleep(fop);
+
+        /* If the lock needs to be released, or ec is shutting down, do not
+         * delay lock release. */
+        if (!lock->release && !ec->shutdown) {
+            ec_trace("UNLOCK_DELAY", fop, "lock=%p, release=%d", lock,
+                     lock->release);
+
+            if (!ec_lock_delay_create(link)) {
+                /* We are unable to create a new timer. We immediately release
+                 * the lock. */
+                lock->release = now = _gf_true;
+            }
+
+        } else {
+            ec_trace("UNLOCK_FORCE", fop, "lock=%p, release=%d", lock,
+                     lock->release);
+            lock->release = now = _gf_true;
+        }
+
+        UNLOCK(&lock->loc.inode->lock);
+
+        /* The unlock is started after releasing the inode mutex. */
+        if (now) {
+            ec_unlock_now(link);
+        }
+    } else {
+        /* There are no owners and the lock is not acquired. This can only
+         * happen if a lock attempt has failed and we get to the unlock step
+         * of the fop. As in the previous case, the waiting list must be
+         * empty. */
+        GF_ASSERT(list_empty(&lock->owners) && list_empty(&lock->waiting));
+
+        /* We need to mark the lock to be released to correctly handle fops
+         * that may get in after we release the inode mutex but before
+         * ec_lock_unfreeze() is processed. */
+        lock->release = _gf_true;
+
+        UNLOCK(&lock->loc.inode->lock);
+
+        ec_lock_unfreeze(link);
+    }
+}
+
+/* Drops the owner reference of 'fop' on each one of its locks, in the order
+ * in which they are stored in fop->locks. */
+void
+ec_unlock(ec_fop_data_t *fop)
+{
+    int32_t idx = 0;
+
+    while (idx < fop->lock_count) {
+        ec_unlock_timer_add(&fop->locks[idx]);
+        idx++;
+    }
+}
+
+/* Sends the pending version/size/dirty changes of the only lock of 'fop',
+ * if there are any. The fop must have exactly one lock. The result of
+ * ec_update_info() is ignored. */
+void
+ec_flush_size_version(ec_fop_data_t *fop)
+{
+    GF_ASSERT(fop->lock_count == 1);
+    ec_update_info(&fop->locks[0]);
+}
+
+/* Refreshes or invalidates a cached stripe that falls inside the range
+ * touched by 'fop'.
+ *
+ * For a successful write, the stripe is refreshed with the data written if
+ * it has really been modified. In any other case the stripe is invalidated.
+ * The caller in this file (ec_update_cached_stripes()) holds the inode
+ * mutex while calling it. */
+static void
+ec_update_stripe(ec_t *ec, ec_stripe_list_t *stripe_cache, ec_stripe_t *stripe,
+                 ec_fop_data_t *fop)
+{
+    off_t base;
+
+    /* On write fops, we only update existing fragments if the write has
+     * succeeded. Otherwise, we remove them from the cache. */
+    if ((fop->id == GF_FOP_WRITE) && (fop->answer != NULL) &&
+        (fop->answer->op_ret >= 0)) {
+        /* 'base' is the offset of the stripe inside the buffer of the write:
+         * the distance in fragments from the first fragment touched,
+         * multiplied by ec->fragments. */
+        base = stripe->frag_offset - fop->frag_range.first;
+        base *= ec->fragments;
+
+        /* We check if the stripe offset falls inside the real region
+         * modified by the write fop (a write request is allowed,
+         * though uncommon, to write less bytes than requested). The
+         * current write fop implementation doesn't allow partial
+         * writes of fragments, so if there's no error, we are sure
+         * that a full stripe has been completely modified or not
+         * touched at all. The value of op_ret may not be a multiple
+         * of the stripe size because it depends on the requested
+         * size by the user, so we update the stripe if the write has
+         * modified at least one byte (meaning ec has written the full
+         * stripe). */
+        if (base < fop->answer->op_ret + fop->head) {
+            memcpy(stripe->data, fop->vector[0].iov_base + base,
+                   ec->stripe_size);
+            list_move_tail(&stripe->lru, &stripe_cache->lru);
+
+            GF_ATOMIC_INC(ec->stats.stripe_cache.updates);
+        }
+    } else {
+        /* The cached stripe is invalidated by setting its offset to -1.
+         * Note that list_move() is used here instead of the
+         * list_move_tail() used for refreshed stripes. */
+        stripe->frag_offset = -1;
+        list_move(&stripe->lru, &stripe_cache->lru);
+
+        GF_ATOMIC_INC(ec->stats.stripe_cache.invals);
+    }
+}
+
+/* Updates all cached stripes of the inode referenced by 'fop' whose
+ * fragment offset falls inside the range [first, last) touched by the fop.
+ *
+ * The inode is taken from fop->fd if the fop uses an fd, or from
+ * fop->loc[0] otherwise. The cache is processed while holding the inode
+ * mutex. */
+static void
+ec_update_cached_stripes(ec_fop_data_t *fop)
+{
+    uint64_t first;
+    uint64_t last;
+    ec_stripe_t *stripe = NULL;
+    ec_inode_t *ctx = NULL;
+    ec_stripe_list_t *stripe_cache = NULL;
+    inode_t *inode = NULL;
+    struct list_head *temp;
+    struct list_head sentinel;
+
+    first = fop->frag_range.first;
+    /* 'last' represents the first stripe not touched by the operation */
+    last = fop->frag_range.last;
+
+    /* If there are no modified stripes, we don't need to do anything
+     * else. */
+    if (last <= first) {
+        return;
+    }
+
+    if (!fop->use_fd) {
+        inode = fop->loc[0].inode;
+    } else {
+        inode = fop->fd->inode;
+    }
+
+    LOCK(&inode->lock);
+
+    ctx = __ec_inode_get(inode, fop->xl);
+    if (ctx == NULL) {
+        goto out;
+    }
+    stripe_cache = &ctx->stripe_cache;
+
+    /* Since we'll be moving elements of the list to the tail, we might
+     * end in an infinite loop. To avoid it, we insert a sentinel element
+     * into the list, so that it will be used to detect when we have
+     * traversed all existing elements once. */
+    list_add_tail(&sentinel, &stripe_cache->lru);
+    temp = stripe_cache->lru.next;
+    while (temp != &sentinel) {
+        stripe = list_entry(temp, ec_stripe_t, lru);
+        /* Advance before processing the stripe, because
+         * ec_update_stripe() can move it to another position of the
+         * list. */
+        temp = temp->next;
+        if ((first <= stripe->frag_offset) && (stripe->frag_offset < last)) {
+            ec_update_stripe(fop->xl->private, stripe_cache, stripe, fop);
+        }
+    }
+    list_del(&sentinel);
+
+out:
+    UNLOCK(&inode->lock);
+}
+
+/* Decides whether the locks of 'fop' can be kept for other fops or must be
+ * released, updates the cached stripes and hands each lock over through
+ * ec_lock_next_owner().
+ *
+ * The locks are marked to be released when eager lock is not used for this
+ * fop, when there is no answer, or when the answer reports more than one
+ * inodelk on the inode (GLUSTERFS_INODELK_COUNT > 1). */
+void
+ec_lock_reuse(ec_fop_data_t *fop)
+{
+    ec_t *ec = fop->xl->private;
+    ec_cbk_data_t *answer = fop->answer;
+    gf_boolean_t release = _gf_false;
+    int32_t idx;
+    int32_t lockers;
+
+    if (!ec_eager_lock_used(ec, fop) || (answer == NULL)) {
+        /* If eager lock is disabled or if we haven't got an answer with
+         * enough quorum, we always release the lock. */
+        release = _gf_true;
+    } else if ((answer->xdata != NULL) &&
+               (dict_get_int32(answer->xdata, GLUSTERFS_INODELK_COUNT,
+                               &lockers) == 0) &&
+               (lockers > 1)) {
+        /* Someone else is also interested in this inode. */
+        release = _gf_true;
+        gf_msg_debug(fop->xl->name, 0, "Lock contention detected");
+    }
+
+    ec_update_cached_stripes(fop);
+
+    idx = 0;
+    while (idx < fop->lock_count) {
+        ec_lock_next_owner(&fop->locks[idx], answer, release);
+        idx++;
+    }
+}
+
+/* Runs the state machine of 'fop'.
+ *
+ * On each iteration the handler of the fop is called with the current
+ * state and it returns the next one. If 'error' is not 0, it's recorded in
+ * the fop and the state is negated before calling the handler. The loop
+ * finishes when the END state is reached (the fop is released) or when
+ * ec_check_complete() returns a negative value. */
+void
+__ec_manager(ec_fop_data_t *fop, int32_t error)
+{
+    ec_t *ec = fop->xl->private;
+
+    do {
+        ec_trace("MANAGER", fop, "error=%d", error);
+
+        /* Fail with ENOTCONN if there are fewer bricks up than
+         * ec->fragments, unless ec_must_wind() says that the fop has to be
+         * sent anyway. */
+        if (!ec_must_wind(fop)) {
+            if (ec->xl_up_count < ec->fragments) {
+                error = ENOTCONN;
+            }
+        }
+
+        if (error != 0) {
+            fop->error = error;
+            fop->state = -fop->state;
+        }
+
+        if ((fop->state == EC_STATE_END) || (fop->state == -EC_STATE_END)) {
+            ec_fop_data_release(fop);
+
+            break;
+        }
+
+        /* At each state, fop must not be used anywhere else and there
+         * shouldn't be any pending subfop going on. */
+        GF_ASSERT(fop->jobs == 0);
+
+        /* While the manager is running we need to avoid that subfops launched
+         * from it could finish and call ec_resume() before the fop->handler
+         * has completed. This could lead to the same manager being executed
+         * by two threads concurrently. ec_check_complete() will take care of
+         * this reference. */
+        fop->jobs = 1;
+
+        fop->state = fop->handler(fop, fop->state);
+        GF_ASSERT(fop->state >= 0);
+
+        /* NOTE(review): a negative value presumably means that there are
+         * still pending jobs and __ec_manager() will be called again once
+         * they finish - confirm against ec_check_complete(). */
+        error = ec_check_complete(fop, __ec_manager);
+    } while (error >= 0);
+}
+
+/* Entry point to start processing 'fop'.
+ *
+ * The fop must not have pending jobs, pending winds or a recorded error.
+ * A fop in the START state is moved to the INIT state before running its
+ * state machine with the given 'error'. */
+void
+ec_manager(ec_fop_data_t *fop, int32_t error)
+{
+    GF_ASSERT(fop->jobs == 0);
+    GF_ASSERT(fop->winds == 0);
+    GF_ASSERT(fop->error == 0);
+
+    if (fop->state == EC_STATE_START) {
+        fop->state = EC_STATE_INIT;
+    }
+
+    __ec_manager(fop, error);
+}
+
+/* Returns _gf_true when there are no pending fops in 'ec' and the counter
+ * of asynchronous fops is 0.
+ *
+ * NOTE(review): the '__' prefix suggests that the caller must already hold
+ * the lock that protects ec->pending_fops - confirm against the callers. */
+gf_boolean_t
+__ec_is_last_fop(ec_t *ec)
+{
+    gf_boolean_t last = _gf_false;
+
+    if (list_empty(&ec->pending_fops)) {
+        if (GF_ATOMIC_GET(ec->async_fop_count) == 0) {
+            last = _gf_true;
+        }
+    }
+
+    return last;
+}
diff --git a/xlators/cluster/ec/src/ec-common.h b/xlators/cluster/ec/src/ec-common.h
new file mode 100644
index 00000000000..51493612ac6
--- /dev/null
+++ b/xlators/cluster/ec/src/ec-common.h
@@ -0,0 +1,234 @@
+/*
+  Copyright (c) 2012-2014 DataLab, s.l. <http://www.datalab.es>
+  This file is part of GlusterFS.
+
+  This file is licensed to you under your choice of the GNU Lesser
+  General Public License, version 3 or any later version (LGPLv3 or
+  later), or the GNU General Public License, version 2 (GPLv2), in all
+  cases as published by the Free Software Foundation.
+*/
+
+#ifndef __EC_COMMON_H__
+#define __EC_COMMON_H__
+
+#include "glusterfs/compat-errno.h"  // for ENODATA on BSD
+#include "ec-data.h"
+
+typedef enum { EC_DATA_TXN, EC_METADATA_TXN } ec_txn_t;
+
+#define EC_FOP_HEAL -1
+#define EC_FOP_FHEAL -2
+
+#define EC_CONFIG_VERSION 0
+
+#define EC_CONFIG_ALGORITHM 0
+
+#define EC_FLAG_LOCK_SHARED 0x0001
+
+#define QUORUM_CBK(fn, fop, frame, cookie, this, op_ret, op_errno, params...)  \
+    do {                                                                       \
+        ec_t *__ec = fop->xl->private;                                         \
+        int32_t __op_ret = 0;                                                  \
+        int32_t __op_errno = 0;                                                \
+        int32_t __success_count = gf_bits_count(fop->good);                    \
+        /* Top-level, non-self-heal success below quorum becomes EIO. */       \
+        __op_ret = op_ret;                                                     \
+        __op_errno = op_errno;                                                 \
+        if (!fop->parent && frame &&                                           \
+            (GF_CLIENT_PID_SELF_HEALD != frame->root->pid) &&                  \
+            __ec->quorum_count && (__success_count < __ec->quorum_count) &&    \
+            op_ret >= 0) {                                                     \
+            __op_ret = -1;                                                     \
+            __op_errno = EIO;                                                  \
+            gf_msg(__ec->xl->name, GF_LOG_ERROR, 0,                            \
+                   EC_MSG_CHILDS_INSUFFICIENT,                                 \
+                   "Insufficient available children for this request "         \
+                   "(have %d, need %d). %s",                                   \
+                   __success_count, __ec->quorum_count, ec_msg_str(fop));      \
+        }                                                                      \
+        fn(frame, cookie, this, __op_ret, __op_errno, params);                 \
+    } while (0)
+
+enum _ec_xattrop_flags {
+    EC_FLAG_XATTROP,
+    EC_FLAG_DATA_DIRTY,
+    EC_FLAG_METADATA_DIRTY,
+    /* Values are bit positions, see EC_FLAG_NEEDS()/EC_FLAG_PROVIDES(). */
+    /* Add any new flag here, before EC_FLAG_MAX. The maximum number of
+     * flags that can be defined is 16. */
+
+    EC_FLAG_MAX /* number of flags; also the bit offset of the second set */
+};
+
+/* We keep two sets of flags. One to determine what's really providing the
+ * current xattrop and the other to know what the parent fop of the xattrop
+ * needs to proceed. It might happen that a fop needs some information that
+ * is already being requested by a previous fop. The two sets are stored
+ * contiguously. */
+
+#define EC_FLAG_NEEDS(_flag) (1 << (_flag)) /* low set: "needs" */
+#define EC_FLAG_PROVIDES(_flag) (1 << ((_flag) + EC_FLAG_MAX)) /* high set */
+/* Extract the low ("needs") set of flags. */
+#define EC_NEEDED_FLAGS(_flags) ((_flags) & ((1 << EC_FLAG_MAX) - 1))
+/* Extract the high ("provides") set, shifted down to plain flag bits. */
+#define EC_PROVIDED_FLAGS(_flags) EC_NEEDED_FLAGS((_flags) >> EC_FLAG_MAX)
+/* Non-zero when bit number _flag is set in _flags. */
+#define EC_FLAGS_HAVE(_flags, _flag) (((_flags) & (1 << (_flag))) != 0)
+
+#define EC_SELFHEAL_BIT 62
+
+#define EC_MINIMUM_ONE (1 << 6)
+#define EC_MINIMUM_MIN (2 << 6)
+#define EC_MINIMUM_ALL (3 << 6)
+#define EC_FOP_NO_PROPAGATE_ERROR (1 << 8)
+#define EC_FOP_MINIMUM(_flags) ((_flags)&255)
+#define EC_FOP_FLAGS(_flags) ((_flags) & ~255)
+
+#define EC_UPDATE_DATA 1
+#define EC_UPDATE_META 2
+#define EC_QUERY_INFO 4
+#define EC_INODE_SIZE 8
+
+#define EC_STATE_START 0 /* ec_manager() turns START into INIT */
+#define EC_STATE_END 0 /* same value as EC_STATE_START */
+#define EC_STATE_INIT 1
+#define EC_STATE_LOCK 2
+#define EC_STATE_DISPATCH 3
+#define EC_STATE_PREPARE_ANSWER 4
+#define EC_STATE_REPORT 5
+#define EC_STATE_LOCK_REUSE 6
+#define EC_STATE_UNLOCK 7
+
+#define EC_STATE_DELAYED_START 100
+
+#define EC_STATE_HEAL_ENTRY_LOOKUP 200
+#define EC_STATE_HEAL_ENTRY_PREPARE 201
+#define EC_STATE_HEAL_PRE_INODELK_LOCK 202
+#define EC_STATE_HEAL_PRE_INODE_LOOKUP 203
+#define EC_STATE_HEAL_XATTRIBUTES_REMOVE 204
+#define EC_STATE_HEAL_XATTRIBUTES_SET 205
+#define EC_STATE_HEAL_ATTRIBUTES 206
+#define EC_STATE_HEAL_OPEN 207
+#define EC_STATE_HEAL_REOPEN_FD 208
+#define EC_STATE_HEAL_UNLOCK 209
+#define EC_STATE_HEAL_UNLOCK_ENTRY 210
+#define EC_STATE_HEAL_DATA_LOCK 211
+#define EC_STATE_HEAL_DATA_COPY 212
+#define EC_STATE_HEAL_DATA_UNLOCK 213
+#define EC_STATE_HEAL_POST_INODELK_LOCK 214
+#define EC_STATE_HEAL_POST_INODE_LOOKUP 215
+#define EC_STATE_HEAL_SETATTR 216
+#define EC_STATE_HEAL_POST_INODELK_UNLOCK 217
+#define EC_STATE_HEAL_DISPATCH 218
+
+/* Value to cover the full range of a file */
+#define EC_RANGE_FULL ((uint64_t)LLONG_MAX + 1)
+
+gf_boolean_t
+ec_dispatch_one_retry(ec_fop_data_t *fop, ec_cbk_data_t **cbk);
+void
+ec_dispatch_next(ec_fop_data_t *fop, uint32_t idx);
+
+void
+ec_complete(ec_fop_data_t *fop);
+
+void
+ec_update_good(ec_fop_data_t *fop, uintptr_t good);
+
+void
+ec_fop_set_error(ec_fop_data_t *fop, int32_t error);
+
+void
+__ec_fop_set_error(ec_fop_data_t *fop, int32_t error);
+
+ec_cbk_data_t *
+ec_fop_prepare_answer(ec_fop_data_t *fop, gf_boolean_t ro);
+
+gf_boolean_t
+ec_cbk_set_error(ec_cbk_data_t *cbk, int32_t error, gf_boolean_t ro);
+
+void
+ec_lock_prepare_inode(ec_fop_data_t *fop, loc_t *loc, uint32_t flags,
+                      off_t fl_start, uint64_t fl_size);
+void
+ec_lock_prepare_parent_inode(ec_fop_data_t *fop, loc_t *loc, loc_t *base,
+                             uint32_t flags);
+void
+ec_lock_prepare_fd(ec_fop_data_t *fop, fd_t *fd, uint32_t flags, off_t fl_start,
+                   uint64_t fl_size);
+void
+ec_lock(ec_fop_data_t *fop);
+void
+ec_lock_reuse(ec_fop_data_t *fop);
+void
+ec_unlock(ec_fop_data_t *fop);
+void
+ec_lock_release(ec_t *ec, inode_t *inode);
+
+gf_boolean_t
+ec_get_inode_size(ec_fop_data_t *fop, inode_t *inode, uint64_t *size);
+gf_boolean_t
+__ec_get_inode_size(ec_fop_data_t *fop, inode_t *inode, uint64_t *size);
+gf_boolean_t
+ec_set_inode_size(ec_fop_data_t *fop, inode_t *inode, uint64_t size);
+gf_boolean_t
+__ec_set_inode_size(ec_fop_data_t *fop, inode_t *inode, uint64_t size);
+void
+ec_clear_inode_info(ec_fop_data_t *fop, inode_t *inode);
+
+void
+ec_flush_size_version(ec_fop_data_t *fop);
+
+void
+ec_dispatch_all(ec_fop_data_t *fop);
+void
+ec_dispatch_inc(ec_fop_data_t *fop);
+void
+ec_dispatch_min(ec_fop_data_t *fop);
+void
+ec_dispatch_one(ec_fop_data_t *fop);
+
+void
+ec_succeed_all(ec_fop_data_t *fop);
+
+void
+ec_sleep(ec_fop_data_t *fop);
+void
+ec_resume(ec_fop_data_t *fop, int32_t error);
+void
+ec_resume_parent(ec_fop_data_t *fop);
+
+void
+ec_manager(ec_fop_data_t *fop, int32_t error);
+gf_boolean_t
+ec_is_recoverable_error(int32_t op_errno);
+void
+ec_handle_healers_done(ec_fop_data_t *fop);
+
+int32_t
+ec_heal_inspect(call_frame_t *frame, ec_t *ec, inode_t *inode,
+                unsigned char *locked_on, gf_boolean_t self_locked,
+                gf_boolean_t thorough, ec_heal_need_t *need_heal);
+int32_t
+ec_get_heal_info(xlator_t *this, loc_t *loc, dict_t **dict);
+
+int32_t
+ec_lock_unlocked(call_frame_t *frame, void *cookie, xlator_t *this,
+                 int32_t op_ret, int32_t op_errno, dict_t *xdata);
+
+void
+ec_update_fd_status(fd_t *fd, xlator_t *xl, int child_index,
+                    int32_t ret_status);
+gf_boolean_t
+ec_is_entry_healing(ec_fop_data_t *fop);
+void
+ec_set_entry_healing(ec_fop_data_t *fop);
+void
+ec_reset_entry_healing(ec_fop_data_t *fop);
+char *
+ec_msg_str(ec_fop_data_t *fop);
+gf_boolean_t
+__ec_is_last_fop(ec_t *ec);
+void
+ec_lock_update_good(ec_lock_t *lock, ec_fop_data_t *fop);
+#endif /* __EC_COMMON_H__ */
diff --git a/xlators/cluster/ec/src/ec-data.c b/xlators/cluster/ec/src/ec-data.c
new file mode 100644
index 00000000000..06388833546
--- /dev/null
+++ b/xlators/cluster/ec/src/ec-data.c
@@ -0,0 +1,288 @@
+/*
+  Copyright (c) 2012-2014 DataLab, s.l. <http://www.datalab.es>
+  This file is part of GlusterFS.
+
+  This file is licensed to you under your choice of the GNU Lesser
+  General Public License, version 3 or any later version (LGPLv3 or
+  later), or the GNU General Public License, version 2 (GPLv2), in all
+  cases as published by the Free Software Foundation.
+*/
+
+#include "ec-helpers.h"
+#include "ec-common.h"
+#include "ec-data.h"
+#include "ec-messages.h"
+
+ec_cbk_data_t *
+ec_cbk_data_allocate(call_frame_t *frame, xlator_t *this, ec_fop_data_t *fop,
+                     int32_t id, int32_t idx, int32_t op_ret, int32_t op_errno)
+{
+    ec_t *priv = this->private;
+    ec_cbk_data_t *answer = NULL;
+
+    /* An answer is only accepted when it comes through the same xlator, the
+     * same frame and the same fop id as the request it belongs to. */
+    if (fop->xl != this) {
+        gf_msg(this->name, GF_LOG_ERROR, EINVAL, EC_MSG_XLATOR_MISMATCH,
+               "Mismatching xlators between request and answer "
+               "(req=%s, ans=%s).",
+               fop->xl->name, this->name);
+        return NULL;
+    }
+    if (fop->frame != frame) {
+        gf_msg(this->name, GF_LOG_ERROR, EINVAL, EC_MSG_FRAME_MISMATCH,
+               "Mismatching frames between request and answer "
+               "(req=%p, ans=%p).",
+               fop->frame, frame);
+        return NULL;
+    }
+    if (fop->id != id) {
+        gf_msg(this->name, GF_LOG_ERROR, EINVAL, EC_MSG_FOP_MISMATCH,
+               "Mismatching fops between request and answer "
+               "(req=%d, ans=%d).",
+               fop->id, id);
+        return NULL;
+    }
+
+    answer = mem_get0(priv->cbk_pool);
+    if (answer == NULL) {
+        gf_msg(this->name, GF_LOG_ERROR, ENOMEM, EC_MSG_NO_MEMORY,
+               "Failed to allocate memory for an answer.");
+        return NULL;
+    }
+
+    /* Record which child answered (idx and its bit in mask) and the
+     * result it returned. */
+    answer->fop = fop;
+    answer->idx = idx;
+    answer->mask = 1ULL << idx;
+    answer->count = 1;
+    answer->op_ret = op_ret;
+    answer->op_errno = op_errno;
+    INIT_LIST_HEAD(&answer->entries.list);
+
+    /* Every answer is queued on fop->answer_list, which ec_fop_cleanup()
+     * later walks to destroy them. */
+    LOCK(&fop->lock);
+    list_add_tail(&answer->answer_list, &fop->answer_list);
+    UNLOCK(&fop->lock);
+
+    return answer;
+}
+
+void
+ec_cbk_data_destroy(ec_cbk_data_t *cbk)
+{
+    /* Release whatever the answer still references. Every reference is
+     * optional, so each one is checked before being dropped. */
+    if (cbk->xdata != NULL)
+        dict_unref(cbk->xdata);
+    if (cbk->dict != NULL)
+        dict_unref(cbk->dict);
+    if (cbk->inode != NULL)
+        inode_unref(cbk->inode);
+    if (cbk->fd != NULL)
+        fd_unref(cbk->fd);
+    if (cbk->buffers != NULL)
+        iobref_unref(cbk->buffers);
+
+    /* Buffers owned by the answer are freed unconditionally. */
+    GF_FREE(cbk->vector);
+    gf_dirent_free(&cbk->entries);
+    GF_FREE(cbk->str);
+
+    /* Finally hand the answer structure itself back with mem_put(). */
+    mem_put(cbk);
+}
+
+ec_fop_data_t *
+ec_fop_data_allocate(call_frame_t *frame, xlator_t *this, int32_t id,
+                     uint32_t flags, uintptr_t target, uint32_t fop_flags,
+                     ec_wind_f wind, ec_handler_f handler, ec_cbk_t cbks,
+                     void *data)
+{
+    ec_fop_data_t *fop, *parent;
+    ec_t *ec = this->private;
+    /* Build a new fop for request 'id'; returns NULL if allocation fails. */
+    fop = mem_get0(ec->fop_pool);
+    if (fop == NULL) {
+        gf_msg(this->name, GF_LOG_ERROR, ENOMEM, EC_MSG_NO_MEMORY,
+               "Failed to allocate memory for a "
+               "request.");
+
+        return NULL;
+    }
+
+    INIT_LIST_HEAD(&fop->cbk_list);
+    INIT_LIST_HEAD(&fop->healer);
+    INIT_LIST_HEAD(&fop->answer_list);
+    INIT_LIST_HEAD(&fop->pending_list);
+    INIT_LIST_HEAD(&fop->locks[0].owner_list);
+    INIT_LIST_HEAD(&fop->locks[0].wait_list);
+    INIT_LIST_HEAD(&fop->locks[1].owner_list);
+    INIT_LIST_HEAD(&fop->locks[1].wait_list);
+
+    fop->xl = this;
+    fop->req_frame = frame;
+
+    /* fops need a private frame to be able to execute some postop operations
+     * even if the original fop has completed and reported back to the upper
+     * xlator and it has destroyed the base frame.
+     *
+     * TODO: minimize usage of private frames. Reuse req_frame as much as
+     *       possible.
+     */
+    if (frame != NULL) {
+        fop->frame = copy_frame(frame);
+    } else {
+        fop->frame = create_frame(this, this->ctx->pool);
+    }
+    if (fop->frame == NULL) {
+        gf_msg(this->name, GF_LOG_ERROR, ENOMEM, EC_MSG_NO_MEMORY,
+               "Failed to create a private frame "
+               "for a request");
+
+        mem_put(fop);
+
+        return NULL;
+    }
+    fop->id = id;
+    fop->refs = 1;
+    /* fop_flags packs the minimum (low 8 bits) with the remaining flags. */
+    fop->flags = flags;
+    fop->minimum = EC_FOP_MINIMUM(fop_flags);
+    fop->fop_flags = EC_FOP_FLAGS(fop_flags);
+    fop->mask = target;
+
+    fop->wind = wind;
+    fop->handler = handler;
+    fop->cbks = cbks;
+    fop->data = data;
+
+    fop->uid = fop->frame->root->uid;
+    fop->gid = fop->frame->root->gid;
+
+    LOCK_INIT(&fop->lock);
+    /* Callbacks find the fop through the private frame's local. */
+    fop->frame->local = fop;
+    /* If the request frame belongs to another fop, it becomes the parent. */
+    if (frame != NULL) {
+        parent = frame->local;
+        if (parent != NULL) {
+            ec_sleep(parent); /* presumably undone by ec_resume_parent() */
+        }
+
+        fop->parent = parent;
+    }
+    /* Track the fop in ec->pending_fops until it is finally released. */
+    LOCK(&ec->lock);
+
+    list_add_tail(&fop->pending_list, &ec->pending_fops);
+
+    UNLOCK(&ec->lock);
+
+    return fop;
+}
+
+void
+ec_fop_data_acquire(ec_fop_data_t *fop)
+{
+    /* Take one more reference on the fop, under its own lock. */
+    LOCK(&fop->lock);
+    {
+        ec_trace("ACQUIRE", fop, "");
+        fop->refs += 1;
+    }
+    UNLOCK(&fop->lock);
+}
+
+static void
+ec_handle_last_pending_fop_completion(ec_fop_data_t *fop, gf_boolean_t *notify)
+{
+    ec_t *priv = fop->xl->private;
+
+    /* Unlink the fop from ec->pending_fops and report through *notify
+     * whether no fop remains (see __ec_is_last_fop()). */
+    *notify = _gf_false;
+    if (list_empty(&fop->pending_list)) {
+        return;
+    }
+    LOCK(&priv->lock);
+    list_del_init(&fop->pending_list);
+    *notify = __ec_is_last_fop(priv);
+    UNLOCK(&priv->lock);
+}
+
+void
+ec_fop_cleanup(ec_fop_data_t *fop)
+{
+    ec_cbk_data_t *answer, *next;
+    /* Unlink and destroy every answer queued on the fop so far. */
+    list_for_each_entry_safe(answer, next, &fop->answer_list, answer_list)
+    {
+        list_del_init(&answer->answer_list);
+        ec_cbk_data_destroy(answer);
+    }
+
+    /* Reset cbk_list in place and forget the answer chosen for reporting. */
+    INIT_LIST_HEAD(&fop->cbk_list);
+    fop->answer = NULL;
+}
+
+void
+ec_fop_data_release(ec_fop_data_t *fop)
+{
+    gf_boolean_t last_fop = _gf_false;
+    ec_t *priv = NULL;
+    int32_t remaining;
+
+    /* Drop one reference under the fop lock. */
+    LOCK(&fop->lock);
+    {
+        ec_trace("RELEASE", fop, "");
+        GF_ASSERT(fop->refs > 0);
+        remaining = --fop->refs;
+    }
+    UNLOCK(&fop->lock);
+
+    /* Only the holder of the last reference tears the fop down. */
+    if (remaining != 0) {
+        return;
+    }
+
+    /* The private frame goes first, detached from the fop. */
+    fop->frame->local = NULL;
+    STACK_DESTROY(fop->frame->root);
+
+    LOCK_DESTROY(&fop->lock);
+
+    if (fop->xdata != NULL)
+        dict_unref(fop->xdata);
+    if (fop->dict != NULL)
+        dict_unref(fop->dict);
+    if (fop->inode != NULL)
+        inode_unref(fop->inode);
+    if (fop->fd != NULL)
+        fd_unref(fop->fd);
+    if (fop->buffers != NULL)
+        iobref_unref(fop->buffers);
+    GF_FREE(fop->vector);
+    GF_FREE(fop->str[0]);
+    GF_FREE(fop->str[1]);
+    loc_wipe(&fop->loc[0]);
+    loc_wipe(&fop->loc[1]);
+    GF_FREE(fop->errstr);
+
+    ec_resume_parent(fop);
+    ec_fop_cleanup(fop);
+
+    /* 'priv' is fetched before the fop is handed to mem_put() because the
+     * completion notification below must not touch the fop any more. */
+    priv = fop->xl->private;
+    ec_handle_last_pending_fop_completion(fop, &last_fop);
+    ec_handle_healers_done(fop);
+    mem_put(fop);
+    if (last_fop) {
+        ec_pending_fops_completed(priv);
+    }
+}
diff --git a/xlators/cluster/ec/src/ec-data.h b/xlators/cluster/ec/src/ec-data.h
new file mode 100644
index 00000000000..c8a74ffe1ed
--- /dev/null
+++ b/xlators/cluster/ec/src/ec-data.h
@@ -0,0 +1,35 @@
+/*
+  Copyright (c) 2012-2014 DataLab, s.l. <http://www.datalab.es>
+  This file is part of GlusterFS.
+
+  This file is licensed to you under your choice of the GNU Lesser
+  General Public License, version 3 or any later version (LGPLv3 or
+  later), or the GNU General Public License, version 2 (GPLv2), in all
+  cases as published by the Free Software Foundation.
+*/
+
+#ifndef __EC_DATA_H__
+#define __EC_DATA_H__
+
+#include "ec-types.h"
+
+ec_cbk_data_t *
+ec_cbk_data_allocate(call_frame_t *frame, xlator_t *this, ec_fop_data_t *fop,
+                     int32_t id, int32_t idx, int32_t op_ret, int32_t op_errno);
+ec_fop_data_t *
+ec_fop_data_allocate(call_frame_t *frame, xlator_t *this, int32_t id,
+                     uint32_t flags, uintptr_t target, uint32_t fop_flags,
+                     ec_wind_f wind, ec_handler_f handler, ec_cbk_t cbks,
+                     void *data);
+void
+ec_fop_data_acquire(ec_fop_data_t *fop);
+void
+ec_fop_data_release(ec_fop_data_t *fop);
+
+void
+ec_fop_cleanup(ec_fop_data_t *fop);
+
+void
+ec_pending_fops_completed(ec_t *ec);
+
+#endif /* __EC_DATA_H__ */
diff --git a/xlators/cluster/ec/src/ec-dir-read.c b/xlators/cluster/ec/src/ec-dir-read.c
new file mode 100644
index 00000000000..f71dcfac293
--- /dev/null
+++ b/xlators/cluster/ec/src/ec-dir-read.c
@@ -0,0 +1,647 @@
+/*
+  Copyright (c) 2012-2014 DataLab, s.l. <http://www.datalab.es>
+  This file is part of GlusterFS.
+
+  This file is licensed to you under your choice of the GNU Lesser
+  General Public License, version 3 or any later version (LGPLv3 or
+  later), or the GNU General Public License, version 2 (GPLv2), in all
+  cases as published by the Free Software Foundation.
+*/
+
+#include "ec.h"
+#include "ec-messages.h"
+#include "ec-helpers.h"
+#include "ec-common.h"
+#include "ec-combine.h"
+#include "ec-fops.h"
+
+/****************************************************************
+ *
+ * File Operation: opendir
+ *
+ ***************************************************************/
+
+int32_t
+ec_combine_opendir(ec_fop_data_t *fop, ec_cbk_data_t *dst, ec_cbk_data_t *src)
+{
+    /* Two opendir answers match (1) only when they carry the same fd. */
+    if (dst->fd == src->fd) {
+        return 1;
+    }
+
+    gf_msg(fop->xl->name, GF_LOG_NOTICE, 0, EC_MSG_FD_MISMATCH,
+           "Mismatching fd in answers of 'GF_FOP_OPENDIR': %p <-> %p",
+           dst->fd, src->fd);
+
+    return 0;
+}
+
+int32_t
+ec_opendir_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+               int32_t op_ret, int32_t op_errno, fd_t *fd, dict_t *xdata)
+{
+    int32_t idx = (int32_t)(uintptr_t)cookie;
+    ec_cbk_data_t *answer = NULL;
+    ec_fop_data_t *fop = NULL;
+
+    VALIDATE_OR_GOTO(this, out);
+    GF_VALIDATE_OR_GOTO(this->name, frame, out);
+    GF_VALIDATE_OR_GOTO(this->name, frame->local, out);
+    GF_VALIDATE_OR_GOTO(this->name, this->private, out);
+
+    fop = frame->local;
+
+    ec_trace("CBK", fop, "idx=%d, frame=%p, op_ret=%d, op_errno=%d", idx, frame,
+             op_ret, op_errno);
+
+    answer = ec_cbk_data_allocate(frame, this, fop, GF_FOP_OPENDIR, idx,
+                                  op_ret, op_errno);
+    if (answer == NULL) {
+        goto out;
+    }
+
+    /* The answer keeps its own references to the fd (successful answers
+     * only) and to xdata; ec_cbk_data_destroy() drops them. */
+    if ((op_ret >= 0) && (fd != NULL)) {
+        answer->fd = fd_ref(fd);
+        if (answer->fd == NULL) {
+            gf_msg(this->name, GF_LOG_ERROR, 0, EC_MSG_FILE_DESC_REF_FAIL,
+                   "Failed to reference a file descriptor.");
+
+            goto out;
+        }
+    }
+    if (xdata != NULL) {
+        answer->xdata = dict_ref(xdata);
+        if (answer->xdata == NULL) {
+            gf_msg(this->name, GF_LOG_ERROR, 0, EC_MSG_DICT_REF_FAIL,
+                   "Failed to reference a dictionary.");
+
+            goto out;
+        }
+    }
+
+    /* ec_combine_opendir() decides whether two answers are equivalent. */
+    ec_combine(answer, ec_combine_opendir);
+
+    ec_update_fd_status(fd, this, idx, op_ret);
+
+out:
+    if (fop != NULL) {
+        ec_complete(fop);
+    }
+
+    return 0;
+}
+
+void
+ec_wind_opendir(ec_t *ec, ec_fop_data_t *fop, int32_t idx)
+{
+    ec_trace("WIND", fop, "idx=%d", idx);
+    /* Wind opendir to child idx; idx travels as the cookie for the cbk. */
+    STACK_WIND_COOKIE(fop->frame, ec_opendir_cbk, (void *)(uintptr_t)idx,
+                      ec->xl_list[idx], ec->xl_list[idx]->fops->opendir,
+                      &fop->loc[0], fop->fd, fop->xdata);
+}
+
+int32_t
+ec_manager_opendir(ec_fop_data_t *fop, int32_t state)
+{
+    ec_cbk_data_t *cbk;
+    ec_fd_t *ctx;
+    int32_t err;
+    /* State handler for opendir: runs 'state' and returns the next one. */
+    switch (state) {
+        case EC_STATE_INIT:
+            LOCK(&fop->fd->lock);
+            /* Get the fd context (ENOMEM if unavailable), fill its loc once. */
+            ctx = __ec_fd_get(fop->fd, fop->xl);
+            if (ctx == NULL) {
+                UNLOCK(&fop->fd->lock);
+
+                fop->error = ENOMEM;
+
+                return EC_STATE_REPORT;
+            }
+            if (!ctx->loc.inode) {
+                err = ec_loc_from_loc(fop->xl, &ctx->loc, &fop->loc[0]);
+                if (err != 0) {
+                    UNLOCK(&fop->fd->lock);
+
+                    fop->error = -err; /* err is presumably -errno */
+
+                    return EC_STATE_REPORT;
+                }
+            }
+
+            UNLOCK(&fop->fd->lock);
+
+            /* Fall through */
+
+        case EC_STATE_LOCK:
+            ec_lock_prepare_inode(fop, &fop->loc[0], EC_QUERY_INFO, 0,
+                                  EC_RANGE_FULL);
+            ec_lock(fop);
+
+            return EC_STATE_DISPATCH;
+
+        case EC_STATE_DISPATCH:
+            ec_dispatch_all(fop);
+
+            return EC_STATE_PREPARE_ANSWER;
+
+        case EC_STATE_PREPARE_ANSWER:
+            cbk = ec_fop_prepare_answer(fop, _gf_true);
+            if (cbk != NULL) {
+                /* Save which subvolumes successfully opened the directory.
+                 * If ctx->open is 0, it means that readdir cannot be
+                 * processed in this directory.
+                 */
+                LOCK(&fop->fd->lock);
+
+                ctx = __ec_fd_get(fop->fd, fop->xl);
+                if (ctx != NULL) {
+                    ctx->open |= cbk->mask;
+                }
+
+                UNLOCK(&fop->fd->lock);
+            }
+
+            return EC_STATE_REPORT;
+
+        case EC_STATE_REPORT:
+            cbk = fop->answer;
+
+            GF_ASSERT(cbk != NULL);
+            /* Report the selected answer; the fop itself is the cookie. */
+            if (fop->cbks.opendir != NULL) {
+                fop->cbks.opendir(fop->req_frame, fop, fop->xl, cbk->op_ret,
+                                  cbk->op_errno, cbk->fd, cbk->xdata);
+            }
+
+            return EC_STATE_LOCK_REUSE;
+
+        case -EC_STATE_INIT:
+        case -EC_STATE_LOCK:
+        case -EC_STATE_DISPATCH:
+        case -EC_STATE_PREPARE_ANSWER:
+        case -EC_STATE_REPORT:
+            GF_ASSERT(fop->error != 0);
+            /* Report the failure: op_ret -1 with fop->error as op_errno. */
+            if (fop->cbks.opendir != NULL) {
+                fop->cbks.opendir(fop->req_frame, fop, fop->xl, -1, fop->error,
+                                  NULL, NULL);
+            }
+
+            return EC_STATE_LOCK_REUSE;
+
+        case -EC_STATE_LOCK_REUSE:
+        case EC_STATE_LOCK_REUSE:
+            ec_lock_reuse(fop);
+
+            return EC_STATE_UNLOCK;
+
+        case -EC_STATE_UNLOCK:
+        case EC_STATE_UNLOCK:
+            ec_unlock(fop);
+
+            return EC_STATE_END;
+
+        default:
+            gf_msg(fop->xl->name, GF_LOG_ERROR, EINVAL, EC_MSG_UNHANDLED_STATE,
+                   "Unhandled state %d for %s", state, ec_fop_name(fop->id));
+
+            return EC_STATE_END;
+    }
+}
+
+void
+ec_opendir(call_frame_t *frame, xlator_t *this, uintptr_t target,
+           uint32_t fop_flags, fop_opendir_cbk_t func, void *data, loc_t *loc,
+           fd_t *fd, dict_t *xdata)
+{
+    ec_fop_data_t *fop = NULL;
+    ec_cbk_t answer_cbk = {.opendir = func};
+    int32_t status = ENOMEM;
+
+    gf_msg_trace("ec", 0, "EC(OPENDIR) %p", frame);
+
+    VALIDATE_OR_GOTO(this, out);
+    GF_VALIDATE_OR_GOTO(this->name, frame, out);
+    GF_VALIDATE_OR_GOTO(this->name, this->private, out);
+
+    fop = ec_fop_data_allocate(frame, this, GF_FOP_OPENDIR, EC_FLAG_LOCK_SHARED,
+                               target, fop_flags, ec_wind_opendir,
+                               ec_manager_opendir, answer_cbk, data);
+    if (fop == NULL) {
+        goto out;
+    }
+
+    /* Keep private copies/references of every argument in the fop. Any
+     * failure leaves 'status' at ENOMEM, which is handed to ec_manager(). */
+    if ((loc != NULL) && (loc_copy(&fop->loc[0], loc) != 0)) {
+        gf_msg(this->name, GF_LOG_ERROR, ENOMEM, EC_MSG_LOC_COPY_FAIL,
+               "Failed to copy a location.");
+
+        goto out;
+    }
+    if (fd != NULL) {
+        fop->fd = fd_ref(fd);
+        if (fop->fd == NULL) {
+            gf_msg(this->name, GF_LOG_ERROR, 0, EC_MSG_FILE_DESC_REF_FAIL,
+                   "Failed to reference a file descriptor.");
+
+            goto out;
+        }
+    }
+    if (xdata != NULL) {
+        fop->xdata = dict_ref(xdata);
+        if (fop->xdata == NULL) {
+            gf_msg(this->name, GF_LOG_ERROR, 0, EC_MSG_DICT_REF_FAIL,
+                   "Failed to reference a dictionary.");
+
+            goto out;
+        }
+    }
+
+    status = 0;
+
+out:
+    /* Without a fop there is no state machine to run: answer directly. */
+    if (fop == NULL) {
+        func(frame, NULL, this, -1, status, NULL, NULL);
+        return;
+    }
+
+    ec_manager(fop, status);
+}
+
+/* Translate the client id that gf_deitransform() extracts from a directory
+ * offset into the index of the matching child in ec->xl_list.
+ * Returns the index (>= 0) on success or -EINVAL if the id is unknown. */
+int
+ec_deitransform(xlator_t *this, off_t offset)
+{
+    ec_t *ec = this->private;
+    char id[32] = {0};
+    int client_id;
+    int idx = -1;
+
+    client_id = gf_deitransform(this, offset);
+
+    /* ec->leaf_to_subvolid is keyed by the decimal form of the client id.
+     * snprintf() keeps the write bounded by the size of 'id'. */
+    snprintf(id, sizeof(id), "%d", client_id);
+
+    /* A failed lookup and a negative stored index are both invalid
+     * requests; callers negate the result to obtain a positive errno. */
+    if ((dict_get_int32(ec->leaf_to_subvolid, id, &idx) < 0) || (idx < 0)) {
+        gf_msg(this->name, GF_LOG_ERROR, EINVAL, EC_MSG_INVALID_REQUEST,
+               "Invalid index %d in readdirp request", client_id);
+        idx = -EINVAL;
+    }
+    return idx;
+}
+
+/* FOP: readdir */
+
+void
+ec_adjust_readdirp(ec_t *ec, int32_t idx, gf_dirent_t *entries)
+{
+    gf_dirent_t *dirent;
+
+    /* Only regular files that carry an inode are touched: ia_size is taken
+     * from EC_XATTR_SIZE in the entry dict, else the inode is dropped. */
+    list_for_each_entry(dirent, &entries->list, list)
+    {
+        if ((dirent->inode == NULL) || (dirent->d_stat.ia_type != IA_IFREG))
+            continue;
+
+        if ((dirent->dict != NULL) &&
+            (ec_dict_del_number(dirent->dict, EC_XATTR_SIZE,
+                                &dirent->d_stat.ia_size) == 0)) {
+            ec_iatt_rebuild(ec, &dirent->d_stat, 1, 1);
+        } else {
+            inode_unref(dirent->inode);
+            dirent->inode = NULL;
+        }
+    }
+}
+
+int32_t
+ec_common_readdir_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+                      int32_t op_ret, int32_t op_errno, gf_dirent_t *entries,
+                      dict_t *xdata)
+{
+    int32_t idx = (int32_t)(uintptr_t)cookie;
+    ec_cbk_data_t *answer = NULL;
+    ec_fop_data_t *fop = NULL;
+
+    VALIDATE_OR_GOTO(this, out);
+    GF_VALIDATE_OR_GOTO(this->name, frame, out);
+    GF_VALIDATE_OR_GOTO(this->name, frame->local, out);
+    GF_VALIDATE_OR_GOTO(this->name, this->private, out);
+
+    fop = frame->local;
+    ec_trace("CBK", fop, "idx=%d, frame=%p, op_ret=%d, op_errno=%d", idx, frame,
+             op_ret, op_errno);
+
+    /* The expected id is fop->id itself, so no particular fop is assumed. */
+    answer = ec_cbk_data_allocate(frame, this, fop, fop->id, idx, op_ret,
+                                  op_errno);
+    if (answer == NULL)
+        goto out;
+    if (xdata != NULL)
+        answer->xdata = dict_ref(xdata);
+    if (answer->op_ret >= 0) /* entries are moved into the answer */
+        list_splice_init(&entries->list, &answer->entries.list);
+    ec_combine(answer, NULL);
+
+out:
+    if (fop != NULL) {
+        ec_complete(fop);
+    }
+
+    return 0;
+}
+
+void
+ec_wind_readdir(ec_t *ec, ec_fop_data_t *fop, int32_t idx)
+{
+    ec_trace("WIND", fop, "idx=%d", idx);
+    /* Wind readdir to child idx; idx travels as the cookie for the cbk. */
+    STACK_WIND_COOKIE(fop->frame, ec_common_readdir_cbk, (void *)(uintptr_t)idx,
+                      ec->xl_list[idx], ec->xl_list[idx]->fops->readdir,
+                      fop->fd, fop->size, fop->offset, fop->xdata);
+}
+
+int32_t
+ec_manager_readdir(ec_fop_data_t *fop, int32_t state)
+{
+    ec_fd_t *ctx = NULL;
+    ec_cbk_data_t *cbk = NULL;
+    /* State handler for readdir and readdirp (told apart by fop->id). */
+    switch (state) {
+        case EC_STATE_INIT:
+            /* Return error if opendir has not been successfully called on
+             * any subvolume. */
+            ctx = ec_fd_get(fop->fd, fop->xl);
+            if (ctx == NULL) {
+                fop->error = ENOMEM;
+            } else if (ctx->open == 0) {
+                fop->error = EBADFD;
+            }
+
+            if (fop->error) {
+                gf_msg(fop->xl->name, GF_LOG_ERROR, fop->error,
+                       EC_MSG_INVALID_REQUEST, "EC is not winding readdir: %s",
+                       ec_msg_str(fop));
+                return EC_STATE_REPORT;
+            }
+            /* For readdirp, EC_XATTR_SIZE is added to the request xdata. */
+            if (fop->id == GF_FOP_READDIRP) {
+                int32_t err;
+
+                if (fop->xdata == NULL) {
+                    fop->xdata = dict_new();
+                    if (fop->xdata == NULL) {
+                        fop->error = ENOMEM;
+
+                        return EC_STATE_REPORT;
+                    }
+                }
+
+                err = dict_set_uint64(fop->xdata, EC_XATTR_SIZE, 0);
+                if (err != 0) {
+                    fop->error = -err; /* err is presumably -errno */
+
+                    return EC_STATE_REPORT;
+                }
+            }
+
+            if (fop->offset != 0) {
+                /* A non-zero offset may not be valid on other bricks, so the
+                 * request can only go to the brick that produced the offset. */
+                int32_t idx = -1;
+
+                idx = ec_deitransform(fop->xl, fop->offset);
+
+                if (idx < 0) {
+                    fop->error = -idx;
+                    return EC_STATE_REPORT;
+                }
+                fop->mask &= 1ULL << idx; /* keep only that brick */
+            } else { /* only a read from offset 0 takes the lock */
+                ec_lock_prepare_fd(fop, fop->fd, EC_QUERY_INFO, 0,
+                                   EC_RANGE_FULL);
+                ec_lock(fop);
+            }
+
+            return EC_STATE_DISPATCH;
+
+        case EC_STATE_DISPATCH:
+            ec_dispatch_one(fop);
+
+            return EC_STATE_PREPARE_ANSWER;
+
+        case EC_STATE_PREPARE_ANSWER:
+            if (ec_dispatch_one_retry(fop, &cbk)) {
+                return EC_STATE_DISPATCH;
+            }
+            /* Non-empty readdirp answers get their entries adjusted. */
+            if ((cbk != NULL) && (cbk->op_ret > 0) &&
+                (fop->id == GF_FOP_READDIRP)) {
+                ec_adjust_readdirp(fop->xl->private, cbk->idx, &cbk->entries);
+            }
+
+            return EC_STATE_REPORT;
+
+        case EC_STATE_REPORT:
+            cbk = fop->answer;
+            GF_ASSERT(cbk);
+            if (fop->id == GF_FOP_READDIR) {
+                if (fop->cbks.readdir != NULL) {
+                    fop->cbks.readdir(fop->req_frame, fop, fop->xl, cbk->op_ret,
+                                      cbk->op_errno, &cbk->entries, cbk->xdata);
+                }
+            } else {
+                if (fop->cbks.readdirp != NULL) {
+                    fop->cbks.readdirp(fop->req_frame, fop, fop->xl,
+                                       cbk->op_ret, cbk->op_errno,
+                                       &cbk->entries, cbk->xdata);
+                }
+            }
+            if (fop->offset == 0)
+                return EC_STATE_LOCK_REUSE;
+            else
+                return EC_STATE_END; /* no lock to reuse or release */
+
+        case -EC_STATE_INIT:
+        case -EC_STATE_LOCK:
+        case -EC_STATE_DISPATCH:
+        case -EC_STATE_PREPARE_ANSWER:
+        case -EC_STATE_REPORT:
+            if (fop->id == GF_FOP_READDIR) {
+                if (fop->cbks.readdir != NULL) {
+                    fop->cbks.readdir(fop->req_frame, fop, fop->xl, -1,
+                                      fop->error, NULL, NULL);
+                }
+            } else {
+                if (fop->cbks.readdirp != NULL) {
+                    fop->cbks.readdirp(fop->req_frame, fop, fop->xl, -1,
+                                       fop->error, NULL, NULL);
+                }
+            }
+            if (fop->offset == 0)
+                return EC_STATE_LOCK_REUSE;
+            else
+                return EC_STATE_END; /* no lock to reuse or release */
+
+        case -EC_STATE_LOCK_REUSE:
+        case EC_STATE_LOCK_REUSE:
+            GF_ASSERT(fop->offset == 0);
+            ec_lock_reuse(fop);
+
+            return EC_STATE_UNLOCK;
+
+        case -EC_STATE_UNLOCK:
+        case EC_STATE_UNLOCK:
+            GF_ASSERT(fop->offset == 0);
+            ec_unlock(fop);
+
+            return EC_STATE_END;
+        default:
+            gf_msg(fop->xl->name, GF_LOG_ERROR, EINVAL, EC_MSG_UNHANDLED_STATE,
+                   "Unhandled state %d for %s", state, ec_fop_name(fop->id));
+
+            return EC_STATE_END;
+    }
+}
+
+void /* READDIR entry point: allocate a fop, capture the arguments, start it. */
+ec_readdir(call_frame_t *frame, xlator_t *this, uintptr_t target,
+           uint32_t fop_flags, fop_readdir_cbk_t func, void *data, fd_t *fd,
+           size_t size, off_t offset, dict_t *xdata)
+{
+    ec_cbk_t callback = {.readdir = func};
+    ec_fop_data_t *fop = NULL;
+    int32_t error = ENOMEM; /* stays ENOMEM on every path reaching `out` early */
+
+    gf_msg_trace("ec", 0, "EC(READDIR) %p", frame);
+
+    VALIDATE_OR_GOTO(this, out);
+    GF_VALIDATE_OR_GOTO(this->name, frame, out);
+    GF_VALIDATE_OR_GOTO(this->name, this->private, out);
+
+    fop = ec_fop_data_allocate(frame, this, GF_FOP_READDIR, EC_FLAG_LOCK_SHARED,
+                               target, fop_flags, ec_wind_readdir,
+                               ec_manager_readdir, callback, data);
+    if (fop == NULL) {
+        goto out;
+    }
+
+    fop->use_fd = 1;
+
+    fop->size = size;
+    fop->offset = offset;
+    /* The fop takes its own references on fd and xdata. */
+    if (fd != NULL) {
+        fop->fd = fd_ref(fd);
+        if (fop->fd == NULL) {
+            gf_msg(this->name, GF_LOG_ERROR, 0, EC_MSG_FILE_DESC_REF_FAIL,
+                   "Failed to reference a "
+                   "file descriptor.");
+
+            goto out;
+        }
+    }
+    if (xdata != NULL) {
+        fop->xdata = dict_ref(xdata);
+        if (fop->xdata == NULL) {
+            gf_msg(this->name, GF_LOG_ERROR, 0, EC_MSG_DICT_REF_FAIL,
+                   "Failed to reference a "
+                   "dictionary.");
+
+            goto out;
+        }
+    }
+
+    error = 0; /* everything was captured; the manager starts without error */
+
+out:
+    if (fop != NULL) {
+        ec_manager(fop, error); /* the manager receives `error` (0 on success) */
+    } else if (func != NULL) { /* no fop: report directly, unless no callback was given */
+        func(frame, NULL, this, -1, error, NULL, NULL);
+    }
+}
+
+/* FOP: readdirp */
+
+void /* Wind READDIRP to subvolume `idx`, with ec_common_readdir_cbk as callback. */
+ec_wind_readdirp(ec_t *ec, ec_fop_data_t *fop, int32_t idx)
+{
+    ec_trace("WIND", fop, "idx=%d", idx);
+
+    STACK_WIND_COOKIE(fop->frame, ec_common_readdir_cbk, (void *)(uintptr_t)idx, /* idx is the cookie */
+                      ec->xl_list[idx], ec->xl_list[idx]->fops->readdirp,
+                      fop->fd, fop->size, fop->offset, fop->xdata);
+}
+
+void /* READDIRP entry point: allocate a fop, capture the arguments, start it. */
+ec_readdirp(call_frame_t *frame, xlator_t *this, uintptr_t target,
+            uint32_t fop_flags, fop_readdirp_cbk_t func, void *data, fd_t *fd,
+            size_t size, off_t offset, dict_t *xdata)
+{
+    ec_cbk_t callback = {.readdirp = func};
+    ec_fop_data_t *fop = NULL;
+    int32_t error = ENOMEM; /* stays ENOMEM on every path reaching `out` early */
+
+    gf_msg_trace("ec", 0, "EC(READDIRP) %p", frame);
+
+    VALIDATE_OR_GOTO(this, out);
+    GF_VALIDATE_OR_GOTO(this->name, frame, out);
+    GF_VALIDATE_OR_GOTO(this->name, this->private, out);
+    /* Same manager as READDIR (ec_manager_readdir); only the wind function differs. */
+    fop = ec_fop_data_allocate(
+        frame, this, GF_FOP_READDIRP, EC_FLAG_LOCK_SHARED, target, fop_flags,
+        ec_wind_readdirp, ec_manager_readdir, callback, data);
+    if (fop == NULL) {
+        goto out;
+    }
+
+    fop->use_fd = 1;
+
+    fop->size = size;
+    fop->offset = offset;
+    /* The fop takes its own references on fd and xdata. */
+    if (fd != NULL) {
+        fop->fd = fd_ref(fd);
+        if (fop->fd == NULL) {
+            gf_msg(this->name, GF_LOG_ERROR, 0, EC_MSG_FILE_DESC_REF_FAIL,
+                   "Failed to reference a "
+                   "file descriptor.");
+
+            goto out;
+        }
+    }
+    if (xdata != NULL) {
+        fop->xdata = dict_ref(xdata);
+        if (fop->xdata == NULL) {
+            gf_msg(this->name, GF_LOG_ERROR, 0, EC_MSG_DICT_REF_FAIL,
+                   "Failed to reference a "
+                   "dictionary.");
+
+            goto out;
+        }
+    }
+
+    error = 0; /* everything was captured; the manager starts without error */
+
+out:
+    if (fop != NULL) {
+        ec_manager(fop, error); /* the manager receives `error` (0 on success) */
+    } else if (func != NULL) { /* no fop: report directly, unless no callback was given */
+        func(frame, NULL, this, -1, error, NULL, NULL);
+    }
+}
diff --git a/xlators/cluster/ec/src/ec-dir-write.c b/xlators/cluster/ec/src/ec-dir-write.c
new file mode 100644
index 00000000000..53d27d895c3
--- /dev/null
+++ b/xlators/cluster/ec/src/ec-dir-write.c
@@ -0,0 +1,1487 @@
+/*
+  Copyright (c) 2012-2014 DataLab, s.l. <http://www.datalab.es>
+  This file is part of GlusterFS.
+
+  This file is licensed to you under your choice of the GNU Lesser
+  General Public License, version 3 or any later version (LGPLv3 or
+  later), or the GNU General Public License, version 2 (GPLv2), in all
+  cases as published by the Free Software Foundation.
+*/
+
+#include "ec.h"
+#include "ec-messages.h"
+#include "ec-helpers.h"
+#include "ec-common.h"
+#include "ec-combine.h"
+#include "ec-method.h"
+#include "ec-fops.h"
+
+int /* Shared reply handler behind the create/link/mkdir/mknod/rename callbacks. */
+ec_dir_write_cbk(call_frame_t *frame, xlator_t *this, void *cookie, int op_ret,
+                 int op_errno, struct iatt *poststat, struct iatt *preparent,
+                 struct iatt *postparent, struct iatt *preparent2,
+                 struct iatt *postparent2, dict_t *xdata)
+{
+    ec_fop_data_t *fop = NULL;
+    ec_cbk_data_t *cbk = NULL;
+    int i = 0; /* next free slot in cbk->iatt[] */
+    int idx = 0;
+
+    VALIDATE_OR_GOTO(this, out);
+    GF_VALIDATE_OR_GOTO(this->name, frame, out);
+    GF_VALIDATE_OR_GOTO(this->name, frame->local, out);
+    GF_VALIDATE_OR_GOTO(this->name, this->private, out);
+
+    fop = frame->local;
+    idx = (long)cookie; /* the wind functions pass the subvolume index as cookie */
+
+    ec_trace("CBK", fop, "idx=%d, frame=%p, op_ret=%d, op_errno=%d", idx, frame,
+             op_ret, op_errno);
+
+    cbk = ec_cbk_data_allocate(frame, this, fop, fop->id, idx, op_ret,
+                               op_errno);
+    if (!cbk)
+        goto out; /* nothing to combine, but the fop is still completed below */
+
+    if (xdata)
+        cbk->xdata = dict_ref(xdata);
+
+    if (op_ret < 0)
+        goto out; /* failed reply: no iatts are copied, the cbk is still combined */
+    /* Non-NULL iatts are packed in argument order; NULL ones take no slot. */
+    if (poststat)
+        cbk->iatt[i++] = *poststat;
+
+    if (preparent)
+        cbk->iatt[i++] = *preparent;
+
+    if (postparent)
+        cbk->iatt[i++] = *postparent;
+
+    if (preparent2)
+        cbk->iatt[i++] = *preparent2;
+
+    if (postparent2)
+        cbk->iatt[i++] = *postparent2;
+
+out:
+    if (cbk)
+        ec_combine(cbk, ec_combine_write);
+
+    if (fop) /* fop is still NULL if `out` was reached before `fop = frame->local` */
+        ec_complete(fop);
+    return 0;
+}
+
+/* FOP: create */
+
+int32_t /* CREATE reply adapter: forwards to ec_dir_write_cbk; fd and inode are not passed on. */
+ec_create_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret,
+              int32_t op_errno, fd_t *fd, inode_t *inode, struct iatt *buf,
+              struct iatt *preparent, struct iatt *postparent, dict_t *xdata)
+{
+    return ec_dir_write_cbk(frame, this, cookie, op_ret, op_errno, buf,
+                            preparent, postparent, NULL, NULL, xdata); /* no second parent pair */
+}
+
+void /* Wind CREATE to subvolume `idx`, with ec_create_cbk as the callback. */
+ec_wind_create(ec_t *ec, ec_fop_data_t *fop, int32_t idx)
+{
+    ec_trace("WIND", fop, "idx=%d", idx);
+
+    STACK_WIND_COOKIE(fop->frame, ec_create_cbk, (void *)(uintptr_t)idx, /* idx is the cookie */
+                      ec->xl_list[idx], ec->xl_list[idx]->fops->create,
+                      &fop->loc[0], fop->int32, fop->mode[0], fop->mode[1], /* flags, mode, umask */
+                      fop->fd, fop->xdata);
+}
+
+int32_t /* One step of the CREATE state machine: runs the case for `state`, returns the following EC_STATE_*. */
+ec_manager_create(ec_fop_data_t *fop, int32_t state)
+{
+    ec_config_t config;
+    ec_t *ec;
+    ec_cbk_data_t *cbk;
+    ec_fd_t *ctx;
+    uint64_t version[2] = {0, 0}; /* initial value stored under EC_XATTR_VERSION */
+    int32_t err;
+
+    switch (state) {
+        case EC_STATE_INIT:
+            LOCK(&fop->fd->lock); /* NOTE(review): fop->fd is dereferenced unconditionally, but ec_create() only sets it when fd != NULL */
+
+            ctx = __ec_fd_get(fop->fd, fop->xl);
+            if (ctx == NULL) {
+                UNLOCK(&fop->fd->lock);
+
+                fop->error = ENOMEM;
+
+                return EC_STATE_REPORT;
+            }
+            err = ec_loc_from_loc(fop->xl, &ctx->loc, &fop->loc[0]);
+            if (err != 0) {
+                UNLOCK(&fop->fd->lock);
+
+                fop->error = -err; /* err is negated before being stored as fop->error */
+
+                return EC_STATE_REPORT;
+            }
+
+            ctx->flags = fop->int32; /* saved before O_APPEND is stripped further down */
+
+            UNLOCK(&fop->fd->lock);
+
+            if (fop->xdata == NULL) {
+                fop->xdata = dict_new();
+                if (fop->xdata == NULL) {
+                    fop->error = ENOMEM;
+
+                    return EC_STATE_REPORT;
+                }
+            }
+
+            ec = fop->xl->private;
+            /* Config, version and size (0) are added to the xdata sent with the create. */
+            config.version = EC_CONFIG_VERSION;
+            config.algorithm = EC_CONFIG_ALGORITHM;
+            config.gf_word_size = EC_GF_BITS;
+            config.bricks = ec->nodes;
+            config.redundancy = ec->redundancy;
+            config.chunk_size = EC_METHOD_CHUNK_SIZE;
+
+            err = ec_dict_set_config(fop->xdata, EC_XATTR_CONFIG, &config);
+            if (err != 0) {
+                fop->error = -err;
+
+                return EC_STATE_REPORT;
+            }
+            err = ec_dict_set_array(fop->xdata, EC_XATTR_VERSION, version,
+                                    EC_VERSION_SIZE);
+            if (err != 0) {
+                fop->error = -err;
+
+                return EC_STATE_REPORT;
+            }
+            err = ec_dict_set_number(fop->xdata, EC_XATTR_SIZE, 0);
+            if (err != 0) {
+                fop->error = -err;
+
+                return EC_STATE_REPORT;
+            }
+
+            /* We need to write to specific offsets on the bricks, so we
+             * need to remove O_APPEND from flags (if present) */
+            fop->int32 &= ~O_APPEND;
+
+            /* Fall through to EC_STATE_LOCK */
+
+        case EC_STATE_LOCK:
+            ec_lock_prepare_parent_inode(fop, &fop->loc[0], NULL,
+                                         EC_UPDATE_DATA | EC_UPDATE_META);
+            ec_lock(fop);
+
+            return EC_STATE_DISPATCH;
+
+        case EC_STATE_DISPATCH:
+            ec_dispatch_all(fop);
+
+            return EC_STATE_PREPARE_ANSWER;
+
+        case EC_STATE_PREPARE_ANSWER:
+            cbk = ec_fop_prepare_answer(fop, _gf_false);
+            if (cbk != NULL) {
+                int32_t err; /* NOTE(review): shadows the function-level `err` */
+
+                ec_iatt_rebuild(fop->xl->private, cbk->iatt, 3, cbk->count);
+
+                err = ec_loc_update(fop->xl, &fop->loc[0], cbk->inode,
+                                    &cbk->iatt[0]);
+                if (!ec_cbk_set_error(cbk, -err, _gf_false)) { /* zero result: extend the fd's open mask */
+                    LOCK(&fop->fd->lock);
+
+                    ctx = __ec_fd_get(fop->fd, fop->xl);
+                    if (ctx != NULL) {
+                        ctx->open |= cbk->mask;
+                    }
+
+                    UNLOCK(&fop->fd->lock);
+                }
+            }
+
+            return EC_STATE_REPORT;
+
+        case EC_STATE_REPORT:
+            cbk = fop->answer;
+
+            GF_ASSERT(cbk != NULL);
+
+            if (fop->cbks.create != NULL) { /* the caller's callback may be NULL */
+                QUORUM_CBK(fop->cbks.create, fop, fop->req_frame, fop, fop->xl,
+                           cbk->op_ret, cbk->op_errno, fop->fd,
+                           fop->loc[0].inode, &cbk->iatt[0], &cbk->iatt[1],
+                           &cbk->iatt[2], cbk->xdata);
+            }
+
+            return EC_STATE_LOCK_REUSE;
+        /* Negated states: failure report, op_ret -1 with fop->error and no payload. */
+        case -EC_STATE_INIT:
+        case -EC_STATE_LOCK:
+        case -EC_STATE_DISPATCH:
+        case -EC_STATE_PREPARE_ANSWER:
+        case -EC_STATE_REPORT:
+            GF_ASSERT(fop->error != 0);
+
+            if (fop->cbks.create != NULL) {
+                fop->cbks.create(fop->req_frame, fop, fop->xl, -1, fop->error,
+                                 NULL, NULL, NULL, NULL, NULL, NULL);
+            }
+
+            return EC_STATE_LOCK_REUSE;
+        /* Lock reuse and unlock run the same way for both signs of the state. */
+        case -EC_STATE_LOCK_REUSE:
+        case EC_STATE_LOCK_REUSE:
+            ec_lock_reuse(fop);
+
+            return EC_STATE_UNLOCK;
+
+        case -EC_STATE_UNLOCK:
+        case EC_STATE_UNLOCK:
+            ec_unlock(fop);
+
+            return EC_STATE_END;
+
+        default: /* unknown state: log it and end the fop */
+            gf_msg(fop->xl->name, GF_LOG_ERROR, EINVAL, EC_MSG_UNHANDLED_STATE,
+                   "Unhandled state %d for %s", state, ec_fop_name(fop->id));
+
+            return EC_STATE_END;
+    }
+}
+
+void /* CREATE entry point: allocate a fop, capture the arguments, start it. */
+ec_create(call_frame_t *frame, xlator_t *this, uintptr_t target,
+          uint32_t fop_flags, fop_create_cbk_t func, void *data, loc_t *loc,
+          int32_t flags, mode_t mode, mode_t umask, fd_t *fd, dict_t *xdata)
+{
+    ec_cbk_t callback = {.create = func};
+    ec_fop_data_t *fop = NULL;
+    int32_t error = ENOMEM; /* stays ENOMEM on every path reaching `out` early */
+
+    gf_msg_trace("ec", 0, "EC(CREATE) %p", frame);
+
+    VALIDATE_OR_GOTO(this, out);
+    GF_VALIDATE_OR_GOTO(this->name, frame, out);
+    GF_VALIDATE_OR_GOTO(this->name, this->private, out);
+
+    fop = ec_fop_data_allocate(frame, this, GF_FOP_CREATE, 0, target, fop_flags,
+                               ec_wind_create, ec_manager_create, callback,
+                               data);
+    if (fop == NULL) {
+        goto out;
+    }
+
+    fop->int32 = flags; /* open flags */
+    fop->mode[0] = mode;
+    fop->mode[1] = umask;
+    /* The fop keeps its own copy of loc and xdata and its own reference on fd. */
+    if (loc != NULL) {
+        if (loc_copy(&fop->loc[0], loc) != 0) {
+            gf_msg(this->name, GF_LOG_ERROR, ENOMEM, EC_MSG_LOC_COPY_FAIL,
+                   "Failed to copy a location.");
+
+            goto out;
+        }
+    }
+    if (fd != NULL) {
+        fop->fd = fd_ref(fd);
+        if (fop->fd == NULL) {
+            gf_msg(this->name, GF_LOG_ERROR, 0, EC_MSG_FILE_DESC_REF_FAIL,
+                   "Failed to reference a "
+                   "file descriptor.");
+
+            goto out;
+        }
+    }
+    if (xdata != NULL) {
+        fop->xdata = dict_copy_with_ref(xdata, NULL); /* a copy: the manager adds keys to it */
+        if (fop->xdata == NULL) {
+            gf_msg(this->name, GF_LOG_ERROR, 0, EC_MSG_DICT_REF_FAIL,
+                   "Failed to reference a "
+                   "dictionary.");
+
+            goto out;
+        }
+    }
+
+    error = 0; /* everything was captured; the manager starts without error */
+
+out:
+    if (fop != NULL) {
+        ec_manager(fop, error); /* the manager receives `error` (0 on success) */
+    } else if (func != NULL) { /* no fop: report directly, unless no callback was given */
+        func(frame, NULL, this, -1, error, NULL, NULL, NULL, NULL, NULL, NULL);
+    }
+}
+
+/* FOP: link */
+
+int32_t /* LINK reply adapter: forwards to ec_dir_write_cbk; inode is not passed on. */
+ec_link_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret,
+            int32_t op_errno, inode_t *inode, struct iatt *buf,
+            struct iatt *preparent, struct iatt *postparent, dict_t *xdata)
+{
+    return ec_dir_write_cbk(frame, this, cookie, op_ret, op_errno, buf,
+                            preparent, postparent, NULL, NULL, xdata); /* no second parent pair */
+}
+
+void /* Wind LINK to subvolume `idx`, with ec_link_cbk as the callback. */
+ec_wind_link(ec_t *ec, ec_fop_data_t *fop, int32_t idx)
+{
+    ec_trace("WIND", fop, "idx=%d", idx);
+
+    STACK_WIND_COOKIE(fop->frame, ec_link_cbk, (void *)(uintptr_t)idx, /* idx is the cookie */
+                      ec->xl_list[idx], ec->xl_list[idx]->fops->link,
+                      &fop->loc[0], &fop->loc[1], fop->xdata); /* oldloc, newloc */
+}
+
+int32_t /* One step of the LINK state machine: runs the case for `state`, returns the following EC_STATE_*. */
+ec_manager_link(ec_fop_data_t *fop, int32_t state)
+{
+    ec_cbk_data_t *cbk;
+
+    switch (state) {
+        case EC_STATE_INIT: /* no setup of its own: shares the EC_STATE_LOCK body */
+        case EC_STATE_LOCK:
+            ec_lock_prepare_parent_inode( /* loc[1] is the new name, loc[0] the existing file */
+                fop, &fop->loc[1], &fop->loc[0],
+                EC_UPDATE_DATA | EC_UPDATE_META | EC_INODE_SIZE);
+            ec_lock(fop);
+
+            return EC_STATE_DISPATCH;
+
+        case EC_STATE_DISPATCH:
+            ec_dispatch_all(fop);
+
+            return EC_STATE_PREPARE_ANSWER;
+
+        case EC_STATE_PREPARE_ANSWER:
+            cbk = ec_fop_prepare_answer(fop, _gf_false);
+            if (cbk != NULL) {
+                int32_t err;
+
+                ec_iatt_rebuild(fop->xl->private, cbk->iatt, 3, cbk->count);
+
+                if (cbk->iatt[0].ia_type == IA_IFREG) { /* regular files: ia_size is taken from fop->locks[0] */
+                    cbk->iatt[0].ia_size = fop->locks[0].size;
+                }
+
+                err = ec_loc_update(fop->xl, &fop->loc[0], cbk->inode,
+                                    &cbk->iatt[0]);
+                ec_cbk_set_error(cbk, -err, _gf_false); /* err is passed negated */
+            }
+
+            return EC_STATE_REPORT;
+
+        case EC_STATE_REPORT:
+            cbk = fop->answer;
+
+            GF_ASSERT(cbk != NULL);
+
+            if (fop->cbks.link != NULL) { /* the caller's callback may be NULL */
+                QUORUM_CBK(fop->cbks.link, fop, fop->req_frame, fop, fop->xl,
+                           cbk->op_ret, cbk->op_errno, fop->loc[0].inode,
+                           &cbk->iatt[0], &cbk->iatt[1], &cbk->iatt[2],
+                           cbk->xdata);
+            }
+
+            return EC_STATE_LOCK_REUSE;
+        /* Negated states: failure report, op_ret -1 with fop->error and no payload. */
+        case -EC_STATE_INIT:
+        case -EC_STATE_LOCK:
+        case -EC_STATE_DISPATCH:
+        case -EC_STATE_PREPARE_ANSWER:
+        case -EC_STATE_REPORT:
+            GF_ASSERT(fop->error != 0);
+
+            if (fop->cbks.link != NULL) {
+                fop->cbks.link(fop->req_frame, fop, fop->xl, -1, fop->error,
+                               NULL, NULL, NULL, NULL, NULL);
+            }
+
+            return EC_STATE_LOCK_REUSE;
+        /* Lock reuse and unlock run the same way for both signs of the state. */
+        case -EC_STATE_LOCK_REUSE:
+        case EC_STATE_LOCK_REUSE:
+            ec_lock_reuse(fop);
+
+            return EC_STATE_UNLOCK;
+
+        case -EC_STATE_UNLOCK:
+        case EC_STATE_UNLOCK:
+            ec_unlock(fop);
+
+            return EC_STATE_END;
+
+        default: /* unknown state: log it and end the fop */
+            gf_msg(fop->xl->name, GF_LOG_ERROR, EINVAL, EC_MSG_UNHANDLED_STATE,
+                   "Unhandled state %d for %s", state, ec_fop_name(fop->id));
+
+            return EC_STATE_END;
+    }
+}
+
+void /* LINK entry point: allocate a fop, capture the arguments, start it. */
+ec_link(call_frame_t *frame, xlator_t *this, uintptr_t target,
+        uint32_t fop_flags, fop_link_cbk_t func, void *data, loc_t *oldloc,
+        loc_t *newloc, dict_t *xdata)
+{
+    ec_cbk_t callback = {.link = func};
+    ec_fop_data_t *fop = NULL;
+    int32_t error = ENOMEM; /* stays ENOMEM on every path reaching `out` early */
+
+    gf_msg_trace("ec", 0, "EC(LINK) %p", frame);
+
+    VALIDATE_OR_GOTO(this, out);
+    GF_VALIDATE_OR_GOTO(this->name, frame, out);
+    GF_VALIDATE_OR_GOTO(this->name, this->private, out);
+
+    fop = ec_fop_data_allocate(frame, this, GF_FOP_LINK, 0, target, fop_flags,
+                               ec_wind_link, ec_manager_link, callback, data);
+    if (fop == NULL) {
+        goto out;
+    }
+    /* loc[0] holds a copy of oldloc, loc[1] a copy of newloc. */
+    if (oldloc != NULL) {
+        if (loc_copy(&fop->loc[0], oldloc) != 0) {
+            gf_msg(this->name, GF_LOG_ERROR, ENOMEM, EC_MSG_LOC_COPY_FAIL,
+                   "Failed to copy a location.");
+
+            goto out;
+        }
+    }
+    if (newloc != NULL) {
+        if (loc_copy(&fop->loc[1], newloc) != 0) {
+            gf_msg(this->name, GF_LOG_ERROR, ENOMEM, EC_MSG_LOC_COPY_FAIL,
+                   "Failed to copy a location.");
+
+            goto out;
+        }
+    }
+    if (xdata != NULL) {
+        fop->xdata = dict_copy_with_ref(xdata, NULL);
+        if (fop->xdata == NULL) {
+            gf_msg(this->name, GF_LOG_ERROR, 0, EC_MSG_DICT_REF_FAIL,
+                   "Failed to reference a "
+                   "dictionary.");
+
+            goto out;
+        }
+    }
+
+    error = 0; /* everything was captured; the manager starts without error */
+
+out:
+    if (fop != NULL) {
+        ec_manager(fop, error); /* the manager receives `error` (0 on success) */
+    } else if (func != NULL) { /* no fop: report directly, unless no callback was given */
+        func(frame, NULL, this, -1, error, NULL, NULL, NULL, NULL, NULL);
+    }
+}
+
+/* FOP: mkdir */
+
+int32_t /* MKDIR reply adapter: forwards to ec_dir_write_cbk; inode is not passed on. */
+ec_mkdir_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret,
+             int32_t op_errno, inode_t *inode, struct iatt *buf,
+             struct iatt *preparent, struct iatt *postparent, dict_t *xdata)
+{
+    return ec_dir_write_cbk(frame, this, cookie, op_ret, op_errno, buf,
+                            preparent, postparent, NULL, NULL, xdata); /* no second parent pair */
+}
+
+void /* Wind MKDIR to subvolume `idx`, with ec_mkdir_cbk as the callback. */
+ec_wind_mkdir(ec_t *ec, ec_fop_data_t *fop, int32_t idx)
+{
+    ec_trace("WIND", fop, "idx=%d", idx);
+
+    STACK_WIND_COOKIE(fop->frame, ec_mkdir_cbk, (void *)(uintptr_t)idx, /* idx is the cookie */
+                      ec->xl_list[idx], ec->xl_list[idx]->fops->mkdir,
+                      &fop->loc[0], fop->mode[0], fop->mode[1], fop->xdata); /* loc, mode, umask */
+}
+
+int32_t /* One step of the MKDIR state machine: runs the case for `state`, returns the following EC_STATE_*. */
+ec_manager_mkdir(ec_fop_data_t *fop, int32_t state)
+{
+    ec_cbk_data_t *cbk;
+    uint64_t version[2] = {0, 0}; /* initial value stored under EC_XATTR_VERSION */
+    int32_t err;
+
+    switch (state) {
+        case EC_STATE_INIT:
+            if (fop->xdata == NULL) { /* an xdata dict is needed to carry the version */
+                fop->xdata = dict_new();
+                if (fop->xdata == NULL) {
+                    fop->error = ENOMEM;
+
+                    return EC_STATE_REPORT;
+                }
+            }
+
+            err = ec_dict_set_array(fop->xdata, EC_XATTR_VERSION, version,
+                                    EC_VERSION_SIZE);
+            if (err != 0) {
+                fop->error = -err; /* err is negated before being stored as fop->error */
+                return EC_STATE_REPORT;
+            }
+
+            /* Fall through to EC_STATE_LOCK */
+
+        case EC_STATE_LOCK:
+            ec_lock_prepare_parent_inode(fop, &fop->loc[0], NULL,
+                                         EC_UPDATE_DATA | EC_UPDATE_META);
+            ec_lock(fop);
+
+            return EC_STATE_DISPATCH;
+
+        case EC_STATE_DISPATCH:
+            ec_dispatch_all(fop);
+
+            return EC_STATE_PREPARE_ANSWER;
+
+        case EC_STATE_PREPARE_ANSWER:
+            cbk = ec_fop_prepare_answer(fop, _gf_false);
+            if (cbk != NULL) {
+                int32_t err; /* NOTE(review): shadows the function-level `err` */
+
+                ec_iatt_rebuild(fop->xl->private, cbk->iatt, 3, cbk->count);
+
+                err = ec_loc_update(fop->xl, &fop->loc[0], cbk->inode,
+                                    &cbk->iatt[0]);
+                ec_cbk_set_error(cbk, -err, _gf_false); /* err is passed negated */
+            }
+
+            return EC_STATE_REPORT;
+
+        case EC_STATE_REPORT:
+            cbk = fop->answer;
+
+            GF_ASSERT(cbk != NULL);
+
+            if (fop->cbks.mkdir != NULL) { /* the caller's callback may be NULL */
+                QUORUM_CBK(fop->cbks.mkdir, fop, fop->req_frame, fop, fop->xl,
+                           cbk->op_ret, cbk->op_errno, fop->loc[0].inode,
+                           &cbk->iatt[0], &cbk->iatt[1], &cbk->iatt[2],
+                           cbk->xdata);
+            }
+
+            return EC_STATE_LOCK_REUSE;
+        /* Negated states: failure report with op_ret -1 and fop->error. */
+        case -EC_STATE_INIT:
+        case -EC_STATE_LOCK:
+        case -EC_STATE_DISPATCH:
+        case -EC_STATE_PREPARE_ANSWER:
+        case -EC_STATE_REPORT:
+            cbk = fop->answer; /* may be NULL here; checked before use below */
+            GF_ASSERT(fop->error != 0);
+
+            if (fop->cbks.mkdir != NULL) {
+                fop->cbks.mkdir(fop->req_frame, fop, fop->xl, -1, fop->error,
+                                NULL, NULL, NULL, NULL,
+                                ((cbk) ? cbk->xdata : NULL)); /* the answer's xdata is passed on when there is one */
+            }
+
+            return EC_STATE_LOCK_REUSE;
+        /* Lock reuse and unlock run the same way for both signs of the state. */
+        case -EC_STATE_LOCK_REUSE:
+        case EC_STATE_LOCK_REUSE:
+            ec_lock_reuse(fop);
+
+            return EC_STATE_UNLOCK;
+
+        case -EC_STATE_UNLOCK:
+        case EC_STATE_UNLOCK:
+            ec_unlock(fop);
+
+            return EC_STATE_END;
+
+        default: /* unknown state: log it and end the fop */
+            gf_msg(fop->xl->name, GF_LOG_ERROR, EINVAL, EC_MSG_UNHANDLED_STATE,
+                   "Unhandled state %d for %s", state, ec_fop_name(fop->id));
+
+            return EC_STATE_END;
+    }
+}
+
+void /* MKDIR entry point: allocate a fop, capture the arguments, start it. */
+ec_mkdir(call_frame_t *frame, xlator_t *this, uintptr_t target,
+         uint32_t fop_flags, fop_mkdir_cbk_t func, void *data, loc_t *loc,
+         mode_t mode, mode_t umask, dict_t *xdata)
+{
+    ec_cbk_t callback = {.mkdir = func};
+    ec_fop_data_t *fop = NULL;
+    int32_t error = ENOMEM; /* stays ENOMEM on every path reaching `out` early */
+
+    gf_msg_trace("ec", 0, "EC(MKDIR) %p", frame);
+
+    VALIDATE_OR_GOTO(this, out);
+    GF_VALIDATE_OR_GOTO(this->name, frame, out);
+    GF_VALIDATE_OR_GOTO(this->name, this->private, out);
+
+    fop = ec_fop_data_allocate(frame, this, GF_FOP_MKDIR, 0, target, fop_flags,
+                               ec_wind_mkdir, ec_manager_mkdir, callback, data);
+    if (fop == NULL) {
+        goto out;
+    }
+
+    fop->mode[0] = mode;
+    fop->mode[1] = umask;
+    /* The fop keeps its own copy of loc and xdata. */
+    if (loc != NULL) {
+        if (loc_copy(&fop->loc[0], loc) != 0) {
+            gf_msg(this->name, GF_LOG_ERROR, ENOMEM, EC_MSG_LOC_COPY_FAIL,
+                   "Failed to copy a location.");
+
+            goto out;
+        }
+    }
+    if (xdata != NULL) {
+        fop->xdata = dict_copy_with_ref(xdata, NULL); /* a copy: the manager adds a key to it */
+        if (fop->xdata == NULL) {
+            gf_msg(this->name, GF_LOG_ERROR, 0, EC_MSG_DICT_REF_FAIL,
+                   "Failed to reference a "
+                   "dictionary.");
+
+            goto out;
+        }
+    }
+
+    error = 0; /* everything was captured; the manager starts without error */
+
+out:
+    if (fop != NULL) {
+        ec_manager(fop, error); /* the manager receives `error` (0 on success) */
+    } else if (func != NULL) { /* no fop: report directly, unless no callback was given */
+        func(frame, NULL, this, -1, error, NULL, NULL, NULL, NULL, NULL);
+    }
+}
+
+/* FOP: mknod */
+
+int32_t /* MKNOD reply adapter: forwards to ec_dir_write_cbk; inode is not passed on. */
+ec_mknod_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret,
+             int32_t op_errno, inode_t *inode, struct iatt *buf,
+             struct iatt *preparent, struct iatt *postparent, dict_t *xdata)
+{
+    return ec_dir_write_cbk(frame, this, cookie, op_ret, op_errno, buf,
+                            preparent, postparent, NULL, NULL, xdata); /* no second parent pair */
+}
+
+void /* Wind MKNOD to subvolume `idx`, with ec_mknod_cbk as the callback. */
+ec_wind_mknod(ec_t *ec, ec_fop_data_t *fop, int32_t idx)
+{
+    ec_trace("WIND", fop, "idx=%d", idx);
+
+    STACK_WIND_COOKIE(fop->frame, ec_mknod_cbk, (void *)(uintptr_t)idx, /* idx is the cookie */
+                      ec->xl_list[idx], ec->xl_list[idx]->fops->mknod,
+                      &fop->loc[0], fop->mode[0], fop->dev, fop->mode[1], /* loc, mode, rdev, umask */
+                      fop->xdata);
+}
+
+int32_t /* One step of the MKNOD state machine: runs the case for `state`, returns the following EC_STATE_*. */
+ec_manager_mknod(ec_fop_data_t *fop, int32_t state)
+{
+    ec_config_t config;
+    ec_t *ec;
+    ec_cbk_data_t *cbk;
+    uint64_t version[2] = {0, 0}; /* initial value stored under EC_XATTR_VERSION */
+
+    switch (state) {
+        case EC_STATE_INIT:
+            if (S_ISREG(fop->mode[0])) { /* config/version/size are only added for regular files */
+                int32_t err;
+
+                if (fop->xdata == NULL) {
+                    fop->xdata = dict_new();
+                    if (fop->xdata == NULL) {
+                        fop->error = ENOMEM;
+
+                        return EC_STATE_REPORT;
+                    }
+                }
+
+                ec = fop->xl->private;
+
+                config.version = EC_CONFIG_VERSION;
+                config.algorithm = EC_CONFIG_ALGORITHM;
+                config.gf_word_size = EC_GF_BITS;
+                config.bricks = ec->nodes;
+                config.redundancy = ec->redundancy;
+                config.chunk_size = EC_METHOD_CHUNK_SIZE;
+
+                err = ec_dict_set_config(fop->xdata, EC_XATTR_CONFIG, &config);
+                if (err != 0) {
+                    fop->error = -err; /* err is negated before being stored as fop->error */
+
+                    return EC_STATE_REPORT;
+                }
+                err = ec_dict_set_array(fop->xdata, EC_XATTR_VERSION, version,
+                                        EC_VERSION_SIZE);
+                if (err != 0) {
+                    fop->error = -err;
+
+                    return EC_STATE_REPORT;
+                }
+                err = ec_dict_set_number(fop->xdata, EC_XATTR_SIZE, 0);
+                if (err != 0) {
+                    fop->error = -err;
+
+                    return EC_STATE_REPORT;
+                }
+            }
+
+            /* Fall through to EC_STATE_LOCK */
+
+        case EC_STATE_LOCK:
+            ec_lock_prepare_parent_inode(fop, &fop->loc[0], NULL,
+                                         EC_UPDATE_DATA | EC_UPDATE_META);
+            ec_lock(fop);
+
+            return EC_STATE_DISPATCH;
+
+        case EC_STATE_DISPATCH:
+            ec_dispatch_all(fop);
+
+            return EC_STATE_PREPARE_ANSWER;
+
+        case EC_STATE_PREPARE_ANSWER:
+            cbk = ec_fop_prepare_answer(fop, _gf_false);
+            if (cbk != NULL) {
+                int32_t err;
+
+                ec_iatt_rebuild(fop->xl->private, cbk->iatt, 3, cbk->count);
+
+                err = ec_loc_update(fop->xl, &fop->loc[0], cbk->inode,
+                                    &cbk->iatt[0]);
+                ec_cbk_set_error(cbk, -err, _gf_false); /* err is passed negated */
+            }
+
+            return EC_STATE_REPORT;
+
+        case EC_STATE_REPORT:
+            cbk = fop->answer;
+
+            GF_ASSERT(cbk != NULL);
+
+            if (fop->cbks.mknod != NULL) { /* the caller's callback may be NULL */
+                QUORUM_CBK(fop->cbks.mknod, fop, fop->req_frame, fop, fop->xl,
+                           cbk->op_ret, cbk->op_errno, fop->loc[0].inode,
+                           &cbk->iatt[0], &cbk->iatt[1], &cbk->iatt[2],
+                           cbk->xdata);
+            }
+
+            return EC_STATE_LOCK_REUSE;
+        /* Negated states: failure report, op_ret -1 with fop->error and no payload. */
+        case -EC_STATE_INIT:
+        case -EC_STATE_LOCK:
+        case -EC_STATE_DISPATCH:
+        case -EC_STATE_PREPARE_ANSWER:
+        case -EC_STATE_REPORT:
+            GF_ASSERT(fop->error != 0);
+
+            if (fop->cbks.mknod != NULL) {
+                fop->cbks.mknod(fop->req_frame, fop, fop->xl, -1, fop->error,
+                                NULL, NULL, NULL, NULL, NULL);
+            }
+
+            return EC_STATE_LOCK_REUSE;
+        /* Lock reuse and unlock run the same way for both signs of the state. */
+        case -EC_STATE_LOCK_REUSE:
+        case EC_STATE_LOCK_REUSE:
+            ec_lock_reuse(fop);
+
+            return EC_STATE_UNLOCK;
+
+        case -EC_STATE_UNLOCK:
+        case EC_STATE_UNLOCK:
+            ec_unlock(fop);
+
+            return EC_STATE_END;
+
+        default: /* unknown state: log it and end the fop */
+            gf_msg(fop->xl->name, GF_LOG_ERROR, EINVAL, EC_MSG_UNHANDLED_STATE,
+                   "Unhandled state %d for %s", state, ec_fop_name(fop->id));
+
+            return EC_STATE_END;
+    }
+}
+
+void /* MKNOD entry point: allocate a fop, capture the arguments, start it. */
+ec_mknod(call_frame_t *frame, xlator_t *this, uintptr_t target,
+         uint32_t fop_flags, fop_mknod_cbk_t func, void *data, loc_t *loc,
+         mode_t mode, dev_t rdev, mode_t umask, dict_t *xdata)
+{
+    ec_cbk_t callback = {.mknod = func};
+    ec_fop_data_t *fop = NULL;
+    int32_t error = ENOMEM; /* stays ENOMEM on every path reaching `out` early */
+
+    gf_msg_trace("ec", 0, "EC(MKNOD) %p", frame);
+
+    VALIDATE_OR_GOTO(this, out);
+    GF_VALIDATE_OR_GOTO(this->name, frame, out);
+    GF_VALIDATE_OR_GOTO(this->name, this->private, out);
+
+    fop = ec_fop_data_allocate(frame, this, GF_FOP_MKNOD, 0, target, fop_flags,
+                               ec_wind_mknod, ec_manager_mknod, callback, data);
+    if (fop == NULL) {
+        goto out;
+    }
+
+    fop->mode[0] = mode;
+    fop->dev = rdev;
+    fop->mode[1] = umask;
+    /* The fop keeps its own copy of loc and xdata. */
+    if (loc != NULL) {
+        if (loc_copy(&fop->loc[0], loc) != 0) {
+            gf_msg(this->name, GF_LOG_ERROR, ENOMEM, EC_MSG_LOC_COPY_FAIL,
+                   "Failed to copy a location.");
+
+            goto out;
+        }
+    }
+    if (xdata != NULL) {
+        fop->xdata = dict_copy_with_ref(xdata, NULL); /* a copy: the manager may add keys to it */
+        if (fop->xdata == NULL) {
+            gf_msg(this->name, GF_LOG_ERROR, 0, EC_MSG_DICT_REF_FAIL,
+                   "Failed to reference a "
+                   "dictionary.");
+
+            goto out;
+        }
+    }
+
+    error = 0; /* everything was captured; the manager starts without error */
+
+out:
+    if (fop != NULL) {
+        ec_manager(fop, error); /* the manager receives `error` (0 on success) */
+    } else if (func != NULL) { /* no fop: report directly, unless no callback was given */
+        func(frame, NULL, this, -1, error, NULL, NULL, NULL, NULL, NULL);
+    }
+}
+
+/* FOP: rename */
+
+int32_t /* RENAME reply adapter: forwards all five iatts to ec_dir_write_cbk. */
+ec_rename_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret,
+              int32_t op_errno, struct iatt *buf, struct iatt *preoldparent,
+              struct iatt *postoldparent, struct iatt *prenewparent,
+              struct iatt *postnewparent, dict_t *xdata)
+{
+    return ec_dir_write_cbk(frame, this, cookie, op_ret, op_errno, buf,
+                            preoldparent, postoldparent, prenewparent, /* old parent pair first */
+                            postnewparent, xdata);
+}
+
+void /* Wind RENAME to subvolume `idx`, with ec_rename_cbk as the callback. */
+ec_wind_rename(ec_t *ec, ec_fop_data_t *fop, int32_t idx)
+{
+    ec_trace("WIND", fop, "idx=%d", idx);
+
+    STACK_WIND_COOKIE(fop->frame, ec_rename_cbk, (void *)(uintptr_t)idx, /* idx is the cookie */
+                      ec->xl_list[idx], ec->xl_list[idx]->fops->rename,
+                      &fop->loc[0], &fop->loc[1], fop->xdata);
+}
+
+int32_t /* One step of the RENAME state machine: runs the case for `state`, returns the following EC_STATE_*. */
+ec_manager_rename(ec_fop_data_t *fop, int32_t state)
+{
+    ec_cbk_data_t *cbk;
+
+    switch (state) {
+        case EC_STATE_INIT: /* no setup of its own: shares the EC_STATE_LOCK body */
+        case EC_STATE_LOCK:
+            ec_lock_prepare_parent_inode( /* two lock preparations: one for loc[0], one for loc[1] */
+                fop, &fop->loc[0], &fop->loc[0],
+                EC_UPDATE_DATA | EC_UPDATE_META | EC_INODE_SIZE);
+            ec_lock_prepare_parent_inode(fop, &fop->loc[1], NULL,
+                                         EC_UPDATE_DATA | EC_UPDATE_META);
+            ec_lock(fop);
+
+            return EC_STATE_DISPATCH;
+
+        case EC_STATE_DISPATCH:
+            ec_dispatch_all(fop);
+
+            return EC_STATE_PREPARE_ANSWER;
+
+        case EC_STATE_PREPARE_ANSWER:
+            cbk = ec_fop_prepare_answer(fop, _gf_false);
+            if (cbk != NULL) {
+                ec_iatt_rebuild(fop->xl->private, cbk->iatt, 5, cbk->count); /* 5 iatts: buf plus two parent pairs */
+
+                if (cbk->iatt[0].ia_type == IA_IFREG) { /* regular files: ia_size is taken from fop->locks[0] */
+                    cbk->iatt[0].ia_size = fop->locks[0].size;
+                }
+            }
+
+            return EC_STATE_REPORT;
+
+        case EC_STATE_REPORT:
+            cbk = fop->answer;
+
+            GF_ASSERT(cbk != NULL);
+
+            if (fop->cbks.rename != NULL) { /* the caller's callback may be NULL */
+                QUORUM_CBK(fop->cbks.rename, fop, fop->req_frame, fop, fop->xl,
+                           cbk->op_ret, cbk->op_errno, &cbk->iatt[0],
+                           &cbk->iatt[1], &cbk->iatt[2], &cbk->iatt[3],
+                           &cbk->iatt[4], cbk->xdata);
+            }
+
+            return EC_STATE_LOCK_REUSE;
+        /* Negated states: failure report, op_ret -1 with fop->error and no payload. */
+        case -EC_STATE_INIT:
+        case -EC_STATE_LOCK:
+        case -EC_STATE_DISPATCH:
+        case -EC_STATE_PREPARE_ANSWER:
+        case -EC_STATE_REPORT:
+            GF_ASSERT(fop->error != 0);
+
+            if (fop->cbks.rename != NULL) {
+                fop->cbks.rename(fop->req_frame, fop, fop->xl, -1, fop->error,
+                                 NULL, NULL, NULL, NULL, NULL, NULL);
+            }
+
+            return EC_STATE_LOCK_REUSE;
+        /* Lock reuse and unlock run the same way for both signs of the state. */
+        case -EC_STATE_LOCK_REUSE:
+        case EC_STATE_LOCK_REUSE:
+            ec_lock_reuse(fop);
+
+            return EC_STATE_UNLOCK;
+
+        case -EC_STATE_UNLOCK:
+        case EC_STATE_UNLOCK:
+            ec_unlock(fop);
+
+            return EC_STATE_END;
+
+        default: /* unknown state: log it and end the fop */
+            gf_msg(fop->xl->name, GF_LOG_ERROR, EINVAL, EC_MSG_UNHANDLED_STATE,
+                   "Unhandled state %d for %s", state, ec_fop_name(fop->id));
+
+            return EC_STATE_END;
+    }
+}
+
+void
+ec_rename(call_frame_t *frame, xlator_t *this, uintptr_t target,
+          uint32_t fop_flags, fop_rename_cbk_t func, void *data, loc_t *oldloc,
+          loc_t *newloc, dict_t *xdata)
+{
+    /* Entry point of the rename fop. Allocates the fop descriptor, copies
+     * oldloc/newloc into loc[0]/loc[1] and takes xdata through
+     * dict_copy_with_ref(), then hands control to ec_manager(). When no
+     * descriptor exists (validation or allocation failed), func is invoked
+     * directly with -1/ENOMEM. */
+    ec_fop_data_t *fop = NULL;
+    ec_cbk_t cbks = {.rename = func};
+    int32_t err = ENOMEM;
+
+    gf_msg_trace("ec", 0, "EC(RENAME) %p", frame);
+
+    VALIDATE_OR_GOTO(this, out);
+    GF_VALIDATE_OR_GOTO(this->name, frame, out);
+    GF_VALIDATE_OR_GOTO(this->name, this->private, out);
+
+    fop = ec_fop_data_allocate(frame, this, GF_FOP_RENAME, 0, target, fop_flags,
+                               ec_wind_rename, ec_manager_rename, cbks, data);
+    if (fop == NULL) {
+        goto out;
+    }
+
+    if ((oldloc != NULL) && (loc_copy(&fop->loc[0], oldloc) != 0)) {
+        gf_msg(this->name, GF_LOG_ERROR, ENOMEM, EC_MSG_LOC_COPY_FAIL,
+               "Failed to copy a location.");
+
+        goto out;
+    }
+    if ((newloc != NULL) && (loc_copy(&fop->loc[1], newloc) != 0)) {
+        gf_msg(this->name, GF_LOG_ERROR, ENOMEM, EC_MSG_LOC_COPY_FAIL,
+               "Failed to copy a location.");
+
+        goto out;
+    }
+    if (xdata != NULL) {
+        fop->xdata = dict_copy_with_ref(xdata, NULL);
+        if (fop->xdata == NULL) {
+            gf_msg(this->name, GF_LOG_ERROR, 0, EC_MSG_DICT_REF_FAIL,
+                   "Failed to reference a "
+                   "dictionary.");
+
+            goto out;
+        }
+    }
+
+    err = 0;
+
+out:
+    if (fop == NULL) {
+        func(frame, NULL, this, -1, err, NULL, NULL, NULL, NULL, NULL, NULL);
+    } else {
+        ec_manager(fop, err);
+    }
+}
+
+/* FOP: rmdir */
+
+int32_t
+ec_rmdir_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret,
+             int32_t op_errno, struct iatt *preparent, struct iatt *postparent,
+             dict_t *xdata)
+{ /* Reply to ec_wind_rmdir(); unused iatt slots are passed as NULL. */
+    return ec_dir_write_cbk(frame, this, cookie, op_ret, op_errno, NULL,
+                            preparent, postparent, NULL, NULL, xdata);
+}
+
+void
+ec_wind_rmdir(ec_t *ec, ec_fop_data_t *fop, int32_t idx)
+{
+    ec_trace("WIND", fop, "idx=%d", idx);
+    /* Wind rmdir to ec->xl_list[idx]; fop->int32 holds ec_rmdir()'s xflags. */
+    STACK_WIND_COOKIE(fop->frame, ec_rmdir_cbk, (void *)(uintptr_t)idx,
+                      ec->xl_list[idx], ec->xl_list[idx]->fops->rmdir,
+                      &fop->loc[0], fop->int32, fop->xdata);
+}
+
+int32_t
+ec_manager_rmdir(ec_fop_data_t *fop, int32_t state)
+{
+    ec_cbk_data_t *cbk;
+    /* One step of the rmdir state machine; returns the next state. */
+    switch (state) {
+        case EC_STATE_INIT:
+        case EC_STATE_LOCK:
+            ec_lock_prepare_parent_inode(fop, &fop->loc[0], NULL,
+                                         EC_UPDATE_DATA | EC_UPDATE_META);
+            ec_lock(fop);
+
+            return EC_STATE_DISPATCH;
+
+        case EC_STATE_DISPATCH:
+            ec_dispatch_all(fop);
+
+            return EC_STATE_PREPARE_ANSWER;
+        /* Return value unused here; REPORT reads fop->answer instead. */
+        case EC_STATE_PREPARE_ANSWER:
+            ec_fop_prepare_answer(fop, _gf_false);
+
+            return EC_STATE_REPORT;
+        /* Report fop->answer through QUORUM_CBK if a callback was given. */
+        case EC_STATE_REPORT:
+            cbk = fop->answer;
+
+            GF_ASSERT(cbk != NULL);
+
+            if (fop->cbks.rmdir != NULL) {
+                QUORUM_CBK(fop->cbks.rmdir, fop, fop->req_frame, fop, fop->xl,
+                           cbk->op_ret, cbk->op_errno, &cbk->iatt[0],
+                           &cbk->iatt[1], cbk->xdata);
+            }
+
+            return EC_STATE_LOCK_REUSE;
+        /* Negated states: failure path, report -1 and fop->error. */
+        case -EC_STATE_INIT:
+        case -EC_STATE_LOCK:
+        case -EC_STATE_DISPATCH:
+        case -EC_STATE_PREPARE_ANSWER:
+        case -EC_STATE_REPORT:
+            GF_ASSERT(fop->error != 0);
+
+            if (fop->cbks.rmdir != NULL) {
+                fop->cbks.rmdir(fop->req_frame, fop, fop->xl, -1, fop->error,
+                                NULL, NULL, NULL);
+            }
+
+            return EC_STATE_LOCK_REUSE;
+        /* Both the normal and the failure path end with reuse + unlock. */
+        case -EC_STATE_LOCK_REUSE:
+        case EC_STATE_LOCK_REUSE:
+            ec_lock_reuse(fop);
+
+            return EC_STATE_UNLOCK;
+
+        case -EC_STATE_UNLOCK:
+        case EC_STATE_UNLOCK:
+            ec_unlock(fop);
+
+            return EC_STATE_END;
+
+        default:
+            gf_msg(fop->xl->name, GF_LOG_ERROR, EINVAL, EC_MSG_UNHANDLED_STATE,
+                   "Unhandled state %d for %s", state, ec_fop_name(fop->id));
+
+            return EC_STATE_END;
+    }
+}
+
+void
+ec_rmdir(call_frame_t *frame, xlator_t *this, uintptr_t target,
+         uint32_t fop_flags, fop_rmdir_cbk_t func, void *data, loc_t *loc,
+         int xflags, dict_t *xdata)
+{
+    /* Entry point of the rmdir fop: fills a fop descriptor and runs it via
+     * ec_manager(); without a descriptor func gets -1/ENOMEM directly. */
+    ec_fop_data_t *fop = NULL;
+    ec_cbk_t cbks = {.rmdir = func};
+    int32_t err = ENOMEM;
+
+    gf_msg_trace("ec", 0, "EC(RMDIR) %p", frame);
+
+    VALIDATE_OR_GOTO(this, out);
+    GF_VALIDATE_OR_GOTO(this->name, frame, out);
+    GF_VALIDATE_OR_GOTO(this->name, this->private, out);
+
+    fop = ec_fop_data_allocate(frame, this, GF_FOP_RMDIR, 0, target, fop_flags,
+                               ec_wind_rmdir, ec_manager_rmdir, cbks, data);
+    if (fop == NULL) {
+        goto out;
+    }
+
+    fop->int32 = xflags;
+
+    if ((loc != NULL) && (loc_copy(&fop->loc[0], loc) != 0)) {
+        gf_msg(this->name, GF_LOG_ERROR, ENOMEM, EC_MSG_LOC_COPY_FAIL,
+               "Failed to copy a location.");
+
+        goto out;
+    }
+    if (xdata != NULL) {
+        fop->xdata = dict_copy_with_ref(xdata, NULL);
+        if (fop->xdata == NULL) {
+            gf_msg(this->name, GF_LOG_ERROR, 0, EC_MSG_DICT_REF_FAIL,
+                   "Failed to reference a "
+                   "dictionary.");
+
+            goto out;
+        }
+    }
+
+    err = 0;
+
+out:
+    if (fop == NULL) {
+        func(frame, NULL, this, -1, err, NULL, NULL, NULL);
+    } else {
+        ec_manager(fop, err);
+    }
+}
+
+/* FOP: symlink */
+
+int32_t
+ec_symlink_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+               int32_t op_ret, int32_t op_errno, inode_t *inode,
+               struct iatt *buf, struct iatt *preparent,
+               struct iatt *postparent, dict_t *xdata)
+{ /* Reply to ec_wind_symlink(); note that inode is not forwarded. */
+    return ec_dir_write_cbk(frame, this, cookie, op_ret, op_errno, buf,
+                            preparent, postparent, NULL, NULL, xdata);
+}
+
+void
+ec_wind_symlink(ec_t *ec, ec_fop_data_t *fop, int32_t idx)
+{
+    ec_trace("WIND", fop, "idx=%d", idx);
+    /* str[0] and mode[0] are the linkname and umask saved by ec_symlink(). */
+    STACK_WIND_COOKIE(fop->frame, ec_symlink_cbk, (void *)(uintptr_t)idx,
+                      ec->xl_list[idx], ec->xl_list[idx]->fops->symlink,
+                      fop->str[0], &fop->loc[0], fop->mode[0], fop->xdata);
+}
+
+int32_t
+ec_manager_symlink(ec_fop_data_t *fop, int32_t state)
+{
+    ec_cbk_data_t *cbk;
+    /* One step of the symlink state machine; returns the next state. */
+    switch (state) {
+        case EC_STATE_INIT:
+        case EC_STATE_LOCK:
+            ec_lock_prepare_parent_inode(fop, &fop->loc[0], NULL,
+                                         EC_UPDATE_DATA | EC_UPDATE_META);
+            ec_lock(fop);
+
+            return EC_STATE_DISPATCH;
+
+        case EC_STATE_DISPATCH:
+            ec_dispatch_all(fop);
+
+            return EC_STATE_PREPARE_ANSWER;
+        /* On a non-NULL answer: rebuild 3 iatts and update loc[0] from it. */
+        case EC_STATE_PREPARE_ANSWER:
+            cbk = ec_fop_prepare_answer(fop, _gf_false);
+            if (cbk != NULL) {
+                int32_t err;
+
+                ec_iatt_rebuild(fop->xl->private, cbk->iatt, 3, cbk->count);
+                /* The result is negated and given to ec_cbk_set_error(). */
+                err = ec_loc_update(fop->xl, &fop->loc[0], cbk->inode,
+                                    &cbk->iatt[0]);
+                ec_cbk_set_error(cbk, -err, _gf_false);
+            }
+
+            return EC_STATE_REPORT;
+        /* Report fop->answer; the inode passed on is fop->loc[0].inode. */
+        case EC_STATE_REPORT:
+            cbk = fop->answer;
+
+            GF_ASSERT(cbk != NULL);
+
+            if (fop->cbks.symlink != NULL) {
+                QUORUM_CBK(fop->cbks.symlink, fop, fop->req_frame, fop, fop->xl,
+                           cbk->op_ret, cbk->op_errno, fop->loc[0].inode,
+                           &cbk->iatt[0], &cbk->iatt[1], &cbk->iatt[2],
+                           cbk->xdata);
+            }
+
+            return EC_STATE_LOCK_REUSE;
+        /* Negated states: failure path, report -1 and fop->error. */
+        case -EC_STATE_INIT:
+        case -EC_STATE_LOCK:
+        case -EC_STATE_DISPATCH:
+        case -EC_STATE_PREPARE_ANSWER:
+        case -EC_STATE_REPORT:
+            GF_ASSERT(fop->error != 0);
+
+            if (fop->cbks.symlink != NULL) {
+                fop->cbks.symlink(fop->req_frame, fop, fop->xl, -1, fop->error,
+                                  NULL, NULL, NULL, NULL, NULL);
+            }
+
+            return EC_STATE_LOCK_REUSE;
+        /* Both the normal and the failure path end with reuse + unlock. */
+        case -EC_STATE_LOCK_REUSE:
+        case EC_STATE_LOCK_REUSE:
+            ec_lock_reuse(fop);
+
+            return EC_STATE_UNLOCK;
+
+        case -EC_STATE_UNLOCK:
+        case EC_STATE_UNLOCK:
+            ec_unlock(fop);
+
+            return EC_STATE_END;
+
+        default:
+            gf_msg(fop->xl->name, GF_LOG_ERROR, EINVAL, EC_MSG_UNHANDLED_STATE,
+                   "Unhandled state %d for %s", state, ec_fop_name(fop->id));
+
+            return EC_STATE_END;
+    }
+}
+
+void
+ec_symlink(call_frame_t *frame, xlator_t *this, uintptr_t target,
+           uint32_t fop_flags, fop_symlink_cbk_t func, void *data,
+           const char *linkname, loc_t *loc, mode_t umask, dict_t *xdata)
+{
+    /* Entry point of the symlink fop: fills a fop descriptor and runs it via
+     * ec_manager(); without a descriptor func gets -1/ENOMEM directly. */
+    ec_fop_data_t *fop = NULL;
+    ec_cbk_t cbks = {.symlink = func};
+    int32_t err = ENOMEM;
+
+    gf_msg_trace("ec", 0, "EC(SYMLINK) %p", frame);
+
+    VALIDATE_OR_GOTO(this, out);
+    GF_VALIDATE_OR_GOTO(this->name, frame, out);
+    GF_VALIDATE_OR_GOTO(this->name, this->private, out);
+
+    fop = ec_fop_data_allocate(frame, this, GF_FOP_SYMLINK, 0, target,
+                               fop_flags, ec_wind_symlink, ec_manager_symlink,
+                               cbks, data);
+    if (fop == NULL) {
+        goto out;
+    }
+
+    fop->mode[0] = umask;
+
+    if (linkname != NULL) {
+        fop->str[0] = gf_strdup(linkname);
+        if (fop->str[0] == NULL) {
+            gf_msg(this->name, GF_LOG_ERROR, ENOMEM, EC_MSG_NO_MEMORY,
+                   "Failed to duplicate a string.");
+
+            goto out;
+        }
+    }
+    if ((loc != NULL) && (loc_copy(&fop->loc[0], loc) != 0)) {
+        gf_msg(this->name, GF_LOG_ERROR, ENOMEM, EC_MSG_LOC_COPY_FAIL,
+               "Failed to copy a location.");
+
+        goto out;
+    }
+    if (xdata != NULL) {
+        fop->xdata = dict_copy_with_ref(xdata, NULL);
+        if (fop->xdata == NULL) {
+            gf_msg(this->name, GF_LOG_ERROR, 0, EC_MSG_DICT_REF_FAIL,
+                   "Failed to reference a "
+                   "dictionary.");
+
+            goto out;
+        }
+    }
+
+    err = 0;
+
+out:
+    if (fop == NULL) {
+        func(frame, NULL, this, -1, err, NULL, NULL, NULL, NULL, NULL);
+    } else {
+        ec_manager(fop, err);
+    }
+}
+
+/* FOP: unlink */
+
+int32_t
+ec_unlink_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret,
+              int32_t op_errno, struct iatt *preparent, struct iatt *postparent,
+              dict_t *xdata)
+{ /* Reply to ec_wind_unlink(); unused iatt slots are passed as NULL. */
+    return ec_dir_write_cbk(frame, this, cookie, op_ret, op_errno, NULL,
+                            preparent, postparent, NULL, NULL, xdata);
+}
+
+void
+ec_wind_unlink(ec_t *ec, ec_fop_data_t *fop, int32_t idx)
+{
+    ec_trace("WIND", fop, "idx=%d", idx);
+    /* Wind unlink to ec->xl_list[idx]; fop->int32 holds ec_unlink()'s xflags. */
+    STACK_WIND_COOKIE(fop->frame, ec_unlink_cbk, (void *)(uintptr_t)idx,
+                      ec->xl_list[idx], ec->xl_list[idx]->fops->unlink,
+                      &fop->loc[0], fop->int32, fop->xdata);
+}
+
+int32_t
+ec_manager_unlink(ec_fop_data_t *fop, int32_t state)
+{
+    ec_cbk_data_t *cbk;
+    /* One step of the unlink state machine; returns the next state. */
+    switch (state) {
+        case EC_STATE_INIT:
+        case EC_STATE_LOCK:
+            ec_lock_prepare_parent_inode(fop, &fop->loc[0], NULL,
+                                         EC_UPDATE_DATA | EC_UPDATE_META);
+            ec_lock(fop);
+
+            return EC_STATE_DISPATCH;
+
+        case EC_STATE_DISPATCH:
+            ec_dispatch_all(fop);
+
+            return EC_STATE_PREPARE_ANSWER;
+        /* Return value unused here; REPORT reads fop->answer instead. */
+        case EC_STATE_PREPARE_ANSWER:
+            ec_fop_prepare_answer(fop, _gf_false);
+
+            return EC_STATE_REPORT;
+        /* Report fop->answer through QUORUM_CBK if a callback was given. */
+        case EC_STATE_REPORT:
+            cbk = fop->answer;
+
+            GF_ASSERT(cbk != NULL);
+
+            if (fop->cbks.unlink != NULL) {
+                QUORUM_CBK(fop->cbks.unlink, fop, fop->req_frame, fop, fop->xl,
+                           cbk->op_ret, cbk->op_errno, &cbk->iatt[0],
+                           &cbk->iatt[1], cbk->xdata);
+            }
+
+            return EC_STATE_LOCK_REUSE;
+        /* Negated states: failure path, report -1 and fop->error. */
+        case -EC_STATE_INIT:
+        case -EC_STATE_LOCK:
+        case -EC_STATE_DISPATCH:
+        case -EC_STATE_PREPARE_ANSWER:
+        case -EC_STATE_REPORT:
+            GF_ASSERT(fop->error != 0);
+
+            if (fop->cbks.unlink != NULL) {
+                fop->cbks.unlink(fop->req_frame, fop, fop->xl, -1, fop->error,
+                                 NULL, NULL, NULL);
+            }
+
+            return EC_STATE_LOCK_REUSE;
+        /* Both the normal and the failure path end with reuse + unlock. */
+        case -EC_STATE_LOCK_REUSE:
+        case EC_STATE_LOCK_REUSE:
+            ec_lock_reuse(fop);
+
+            return EC_STATE_UNLOCK;
+
+        case -EC_STATE_UNLOCK:
+        case EC_STATE_UNLOCK:
+            ec_unlock(fop);
+
+            return EC_STATE_END;
+
+        default:
+            gf_msg(fop->xl->name, GF_LOG_ERROR, EINVAL, EC_MSG_UNHANDLED_STATE,
+                   "Unhandled state %d for %s", state, ec_fop_name(fop->id));
+
+            return EC_STATE_END;
+    }
+}
+
+void
+ec_unlink(call_frame_t *frame, xlator_t *this, uintptr_t target,
+          uint32_t fop_flags, fop_unlink_cbk_t func, void *data, loc_t *loc,
+          int xflags, dict_t *xdata)
+{
+    /* Entry point of the unlink fop: fills a fop descriptor (xflags, loc,
+     * xdata) and runs it via ec_manager(); without a descriptor func gets
+     * -1/ENOMEM directly. */
+    ec_fop_data_t *fop = NULL;
+    ec_cbk_t cbks = {.unlink = func};
+    int32_t err = ENOMEM;
+
+    gf_msg_trace("ec", 0, "EC(UNLINK) %p", frame);
+
+    VALIDATE_OR_GOTO(this, out);
+    GF_VALIDATE_OR_GOTO(this->name, frame, out);
+    GF_VALIDATE_OR_GOTO(this->name, this->private, out);
+
+    fop = ec_fop_data_allocate(frame, this, GF_FOP_UNLINK, 0, target, fop_flags,
+                               ec_wind_unlink, ec_manager_unlink, cbks, data);
+    if (fop == NULL) {
+        goto out;
+    }
+
+    fop->int32 = xflags;
+
+    if ((loc != NULL) && (loc_copy(&fop->loc[0], loc) != 0)) {
+        gf_msg(this->name, GF_LOG_ERROR, ENOMEM, EC_MSG_LOC_COPY_FAIL,
+               "Failed to copy a location.");
+
+        goto out;
+    }
+    if (xdata != NULL) {
+        fop->xdata = dict_copy_with_ref(xdata, NULL);
+        if (fop->xdata == NULL) {
+            gf_msg(this->name, GF_LOG_ERROR, 0, EC_MSG_DICT_REF_FAIL,
+                   "Failed to reference a "
+                   "dictionary.");
+
+            goto out;
+        }
+    }
+
+    err = 0;
+
+out:
+    if (fop == NULL) {
+        func(frame, NULL, this, -1, err, NULL, NULL, NULL);
+    } else {
+        ec_manager(fop, err);
+    }
+}
diff --git a/xlators/cluster/ec/src/ec-fops.h b/xlators/cluster/ec/src/ec-fops.h
new file mode 100644
index 00000000000..07edf8a7fec
--- /dev/null
+++ b/xlators/cluster/ec/src/ec-fops.h
@@ -0,0 +1,254 @@
+/*
+  Copyright (c) 2012-2014 DataLab, s.l. <http://www.datalab.es>
+  This file is part of GlusterFS.
+
+  This file is licensed to you under your choice of the GNU Lesser
+  General Public License, version 3 or any later version (LGPLv3 or
+  later), or the GNU General Public License, version 2 (GPLv2), in all
+  cases as published by the Free Software Foundation.
+*/
+
+#ifndef __EC_FOPS_H__
+#define __EC_FOPS_H__
+
+#include <glusterfs/xlator.h>
+
+#include "ec-types.h"
+#include "ec-common.h"
+
+void
+ec_access(call_frame_t *frame, xlator_t *this, uintptr_t target,
+          uint32_t fop_flags, fop_access_cbk_t func, void *data, loc_t *loc,
+          int32_t mask, dict_t *xdata);
+
+void
+ec_create(call_frame_t *frame, xlator_t *this, uintptr_t target,
+          uint32_t fop_flags, fop_create_cbk_t func, void *data, loc_t *loc,
+          int32_t flags, mode_t mode, mode_t umask, fd_t *fd, dict_t *xdata);
+
+void
+ec_entrylk(call_frame_t *frame, xlator_t *this, uintptr_t target,
+           uint32_t fop_flags, fop_entrylk_cbk_t func, void *data,
+           const char *volume, loc_t *loc, const char *basename,
+           entrylk_cmd cmd, entrylk_type type, dict_t *xdata);
+
+void
+ec_fentrylk(call_frame_t *frame, xlator_t *this, uintptr_t target,
+            uint32_t fop_flags, fop_fentrylk_cbk_t func, void *data,
+            const char *volume, fd_t *fd, const char *basename, entrylk_cmd cmd,
+            entrylk_type type, dict_t *xdata);
+
+void
+ec_flush(call_frame_t *frame, xlator_t *this, uintptr_t target,
+         uint32_t fop_flags, fop_flush_cbk_t func, void *data, fd_t *fd,
+         dict_t *xdata);
+
+void
+ec_fsync(call_frame_t *frame, xlator_t *this, uintptr_t target,
+         uint32_t fop_flags, fop_fsync_cbk_t func, void *data, fd_t *fd,
+         int32_t datasync, dict_t *xdata);
+
+void
+ec_fsyncdir(call_frame_t *frame, xlator_t *this, uintptr_t target,
+            uint32_t fop_flags, fop_fsyncdir_cbk_t func, void *data, fd_t *fd,
+            int32_t datasync, dict_t *xdata);
+
+void
+ec_getxattr(call_frame_t *frame, xlator_t *this, uintptr_t target,
+            uint32_t fop_flags, fop_getxattr_cbk_t func, void *data, loc_t *loc,
+            const char *name, dict_t *xdata);
+
+void
+ec_fgetxattr(call_frame_t *frame, xlator_t *this, uintptr_t target,
+             uint32_t fop_flags, fop_fgetxattr_cbk_t func, void *data, fd_t *fd,
+             const char *name, dict_t *xdata);
+
+void
+ec_heal(call_frame_t *frame, xlator_t *this, uintptr_t target,
+        uint32_t fop_flags, fop_heal_cbk_t func, void *data, loc_t *loc,
+        int32_t partial, dict_t *xdata);
+
+void
+ec_fheal(call_frame_t *frame, xlator_t *this, uintptr_t target,
+         uint32_t fop_flags, fop_fheal_cbk_t func, void *data, fd_t *fd,
+         int32_t partial, dict_t *xdata);
+
+void
+ec_inodelk(call_frame_t *frame, xlator_t *this, gf_lkowner_t *owner,
+           uintptr_t target, uint32_t fop_flags, fop_inodelk_cbk_t func,
+           void *data, const char *volume, loc_t *loc, int32_t cmd,
+           struct gf_flock *flock, dict_t *xdata);
+
+void
+ec_finodelk(call_frame_t *frame, xlator_t *this, gf_lkowner_t *owner,
+            uintptr_t target, uint32_t fop_flags, fop_finodelk_cbk_t func,
+            void *data, const char *volume, fd_t *fd, int32_t cmd,
+            struct gf_flock *flock, dict_t *xdata);
+
+void
+ec_link(call_frame_t *frame, xlator_t *this, uintptr_t target,
+        uint32_t fop_flags, fop_link_cbk_t func, void *data, loc_t *oldloc,
+        loc_t *newloc, dict_t *xdata);
+
+void
+ec_lk(call_frame_t *frame, xlator_t *this, uintptr_t target, uint32_t fop_flags,
+      fop_lk_cbk_t func, void *data, fd_t *fd, int32_t cmd,
+      struct gf_flock *flock, dict_t *xdata);
+
+void
+ec_lookup(call_frame_t *frame, xlator_t *this, uintptr_t target,
+          uint32_t fop_flags, fop_lookup_cbk_t func, void *data, loc_t *loc,
+          dict_t *xdata);
+
+void
+ec_mkdir(call_frame_t *frame, xlator_t *this, uintptr_t target,
+         uint32_t fop_flags, fop_mkdir_cbk_t func, void *data, loc_t *loc,
+         mode_t mode, mode_t umask, dict_t *xdata);
+
+void
+ec_mknod(call_frame_t *frame, xlator_t *this, uintptr_t target,
+         uint32_t fop_flags, fop_mknod_cbk_t func, void *data, loc_t *loc,
+         mode_t mode, dev_t rdev, mode_t umask, dict_t *xdata);
+
+void
+ec_open(call_frame_t *frame, xlator_t *this, uintptr_t target,
+        uint32_t fop_flags, fop_open_cbk_t func, void *data, loc_t *loc,
+        int32_t flags, fd_t *fd, dict_t *xdata);
+
+void
+ec_opendir(call_frame_t *frame, xlator_t *this, uintptr_t target,
+           uint32_t fop_flags, fop_opendir_cbk_t func, void *data, loc_t *loc,
+           fd_t *fd, dict_t *xdata);
+
+void
+ec_readdir(call_frame_t *frame, xlator_t *this, uintptr_t target,
+           uint32_t fop_flags, fop_readdir_cbk_t func, void *data, fd_t *fd,
+           size_t size, off_t offset, dict_t *xdata);
+
+void
+ec_readdirp(call_frame_t *frame, xlator_t *this, uintptr_t target,
+            uint32_t fop_flags, fop_readdirp_cbk_t func, void *data, fd_t *fd,
+            size_t size, off_t offset, dict_t *xdata);
+
+void
+ec_readlink(call_frame_t *frame, xlator_t *this, uintptr_t target,
+            uint32_t fop_flags, fop_readlink_cbk_t func, void *data, loc_t *loc,
+            size_t size, dict_t *xdata);
+
+void
+ec_readv(call_frame_t *frame, xlator_t *this, uintptr_t target,
+         uint32_t fop_flags, fop_readv_cbk_t func, void *data, fd_t *fd,
+         size_t size, off_t offset, uint32_t flags, dict_t *xdata);
+
+void
+ec_removexattr(call_frame_t *frame, xlator_t *this, uintptr_t target,
+               uint32_t fop_flags, fop_removexattr_cbk_t func, void *data,
+               loc_t *loc, const char *name, dict_t *xdata);
+
+void
+ec_fremovexattr(call_frame_t *frame, xlator_t *this, uintptr_t target,
+                uint32_t fop_flags, fop_fremovexattr_cbk_t func, void *data,
+                fd_t *fd, const char *name, dict_t *xdata);
+
+void
+ec_rename(call_frame_t *frame, xlator_t *this, uintptr_t target,
+          uint32_t fop_flags, fop_rename_cbk_t func, void *data, loc_t *oldloc,
+          loc_t *newloc, dict_t *xdata);
+
+void
+ec_rmdir(call_frame_t *frame, xlator_t *this, uintptr_t target,
+         uint32_t fop_flags, fop_rmdir_cbk_t func, void *data, loc_t *loc,
+         int xflags, dict_t *xdata);
+
+void
+ec_setattr(call_frame_t *frame, xlator_t *this, uintptr_t target,
+           uint32_t fop_flags, fop_setattr_cbk_t func, void *data, loc_t *loc,
+           struct iatt *stbuf, int32_t valid, dict_t *xdata);
+
+void
+ec_fsetattr(call_frame_t *frame, xlator_t *this, uintptr_t target,
+            uint32_t fop_flags, fop_fsetattr_cbk_t func, void *data, fd_t *fd,
+            struct iatt *stbuf, int32_t valid, dict_t *xdata);
+
+void
+ec_setxattr(call_frame_t *frame, xlator_t *this, uintptr_t target,
+            uint32_t fop_flags, fop_setxattr_cbk_t func, void *data, loc_t *loc,
+            dict_t *dict, int32_t flags, dict_t *xdata);
+
+void
+ec_fsetxattr(call_frame_t *frame, xlator_t *this, uintptr_t target,
+             uint32_t fop_flags, fop_fsetxattr_cbk_t func, void *data, fd_t *fd,
+             dict_t *dict, int32_t flags, dict_t *xdata);
+
+void
+ec_stat(call_frame_t *frame, xlator_t *this, uintptr_t target,
+        uint32_t fop_flags, fop_stat_cbk_t func, void *data, loc_t *loc,
+        dict_t *xdata);
+
+void
+ec_fstat(call_frame_t *frame, xlator_t *this, uintptr_t target,
+         uint32_t fop_flags, fop_fstat_cbk_t func, void *data, fd_t *fd,
+         dict_t *xdata);
+
+void
+ec_statfs(call_frame_t *frame, xlator_t *this, uintptr_t target,
+          uint32_t fop_flags, fop_statfs_cbk_t func, void *data, loc_t *loc,
+          dict_t *xdata);
+
+void
+ec_symlink(call_frame_t *frame, xlator_t *this, uintptr_t target,
+           uint32_t fop_flags, fop_symlink_cbk_t func, void *data,
+           const char *linkname, loc_t *loc, mode_t umask, dict_t *xdata);
+
+void
+ec_fallocate(call_frame_t *frame, xlator_t *this, uintptr_t target,
+             uint32_t fop_flags, fop_fallocate_cbk_t func, void *data, fd_t *fd,
+             int32_t mode, off_t offset, size_t len, dict_t *xdata);
+
+void
+ec_discard(call_frame_t *frame, xlator_t *this, uintptr_t target,
+           uint32_t fop_flags, fop_discard_cbk_t func, void *data, fd_t *fd,
+           off_t offset, size_t len, dict_t *xdata);
+
+void
+ec_truncate(call_frame_t *frame, xlator_t *this, uintptr_t target,
+            uint32_t fop_flags, fop_truncate_cbk_t func, void *data, loc_t *loc,
+            off_t offset, dict_t *xdata);
+
+void
+ec_ftruncate(call_frame_t *frame, xlator_t *this, uintptr_t target,
+             uint32_t fop_flags, fop_ftruncate_cbk_t func, void *data, fd_t *fd,
+             off_t offset, dict_t *xdata);
+
+void
+ec_unlink(call_frame_t *frame, xlator_t *this, uintptr_t target,
+          uint32_t fop_flags, fop_unlink_cbk_t func, void *data, loc_t *loc,
+          int xflags, dict_t *xdata);
+
+void
+ec_writev(call_frame_t *frame, xlator_t *this, uintptr_t target,
+          uint32_t fop_flags, fop_writev_cbk_t func, void *data, fd_t *fd,
+          struct iovec *vector, int32_t count, off_t offset, uint32_t flags,
+          struct iobref *iobref, dict_t *xdata);
+
+void
+ec_xattrop(call_frame_t *frame, xlator_t *this, uintptr_t target,
+           uint32_t fop_flags, fop_xattrop_cbk_t func, void *data, loc_t *loc,
+           gf_xattrop_flags_t optype, dict_t *xattr, dict_t *xdata);
+
+void
+ec_fxattrop(call_frame_t *frame, xlator_t *this, uintptr_t target,
+            uint32_t fop_flags, fop_fxattrop_cbk_t func, void *data, fd_t *fd,
+            gf_xattrop_flags_t optype, dict_t *xattr, dict_t *xdata);
+
+void
+ec_seek(call_frame_t *frame, xlator_t *this, uintptr_t target,
+        uint32_t fop_flags, fop_seek_cbk_t func, void *data, fd_t *fd,
+        off_t offset, gf_seek_what_t what, dict_t *xdata);
+
+void
+ec_ipc(call_frame_t *frame, xlator_t *this, uintptr_t target,
+       uint32_t fop_flags, fop_ipc_cbk_t func, void *data, int32_t op,
+       dict_t *xdata);
+
+#endif /* __EC_FOPS_H__ */
diff --git a/xlators/cluster/ec/src/ec-galois.c b/xlators/cluster/ec/src/ec-galois.c
new file mode 100644
index 00000000000..6e4990c71f5
--- /dev/null
+++ b/xlators/cluster/ec/src/ec-galois.c
@@ -0,0 +1,183 @@
+/*
+  Copyright (c) 2015 DataLab, s.l. <http://www.datalab.es>
+  This file is part of GlusterFS.
+
+  This file is licensed to you under your choice of the GNU Lesser
+  General Public License, version 3 or any later version (LGPLv3 or
+  later), or the GNU General Public License, version 2 (GPLv2), in all
+  cases as published by the Free Software Foundation.
+*/
+
+#include <string.h>
+
+#include "ec-mem-types.h"
+#include "ec-gf8.h"
+#include "ec-helpers.h"
+
+static ec_gf_t *
+ec_gf_alloc(uint32_t bits, uint32_t mod)
+{
+    /* Allocates a field descriptor plus its log/pow tables, each holding
+     * (2 * size - 1) uint32_t entries. Tables are left uninitialized.
+     * Returns EC_ERR(ENOMEM) if any allocation fails, releasing whatever
+     * had already been obtained. */
+    ec_gf_t *field;
+    size_t table_bytes;
+
+    field = GF_MALLOC(sizeof(ec_gf_t), ec_mt_ec_gf_t);
+    if (field == NULL) {
+        return EC_ERR(ENOMEM);
+    }
+
+    field->bits = bits;
+    field->size = 1 << bits;
+    field->mod = mod;
+    table_bytes = sizeof(uint32_t) * (field->size * 2 - 1);
+
+    field->log = GF_MALLOC(table_bytes, gf_common_mt_int);
+    if (field->log == NULL) {
+        GF_FREE(field);
+        return EC_ERR(ENOMEM);
+    }
+    field->pow = GF_MALLOC(table_bytes, gf_common_mt_int);
+    if (field->pow == NULL) {
+        GF_FREE(field->log);
+        GF_FREE(field);
+        return EC_ERR(ENOMEM);
+    }
+
+    return field;
+}
+
+static void
+ec_gf_init_tables(ec_gf_t *gf)
+{
+    uint32_t power, val;
+
+    /* pow[] receives successive doublings of 1, reduced with gf->mod when
+     * they leave the field; log[] receives the inverse mapping. Every entry
+     * written in the loop is duplicated (size - 1) slots further up. */
+    memset(gf->log, -1, sizeof(uint32_t) * gf->size);
+    gf->pow[0] = 1;
+    gf->log[0] = gf->size;
+    gf->log[1] = 0;
+    for (power = 1; power < gf->size; power++) {
+        val = gf->pow[power - 1] << 1;
+        val ^= (val >= gf->size) ? gf->mod : 0;
+        gf->pow[power] = gf->pow[power + gf->size - 1] = val;
+        gf->log[val] = gf->log[val + gf->size - 1] = power;
+    }
+}
+
+ec_gf_t *
+ec_gf_prepare(uint32_t bits, uint32_t mod)
+{
+    /* Builds a usable field descriptor. Only bits == 8 is accepted
+     * (EC_ERR(EINVAL) otherwise); mod == 0 selects 0x11d. Also records the
+     * min/max operation-list length in ec_gf8_mul for factors 1 .. size - 1.
+     * NOTE(review): avg_ops is the sum divided by size, not by size - 1. */
+    ec_gf_mul_t **mul_tbl = ec_gf8_mul;
+    ec_gf_t *field;
+    uint32_t factor, count;
+
+    if (bits != 8) {
+        return EC_ERR(EINVAL);
+    }
+
+    field = ec_gf_alloc(bits, (mod != 0) ? mod : 0x11d);
+    if (EC_IS_ERR(field)) {
+        return field;
+    }
+    ec_gf_init_tables(field);
+    field->table = mul_tbl;
+    field->min_ops = bits * bits;
+    field->max_ops = 0;
+    field->avg_ops = 0;
+    for (factor = 1; factor < field->size; factor++) {
+        count = 0;
+        while (mul_tbl[factor]->ops[count].op != EC_GF_OP_END) {
+            count++;
+        }
+        if (count > field->max_ops) {
+            field->max_ops = count;
+        }
+        if (count < field->min_ops) {
+            field->min_ops = count;
+        }
+        field->avg_ops += count;
+    }
+    field->avg_ops /= field->size;
+
+    return field;
+}
+
+void
+ec_gf_destroy(ec_gf_t *gf)
+{
+    GF_FREE(gf->log); /* release both tables before the descriptor */
+    GF_FREE(gf->pow);
+    GF_FREE(gf); /* gf is dereferenced above, so it must not be NULL */
+}
+
+uint32_t
+ec_gf_add(ec_gf_t *gf, uint32_t a, uint32_t b)
+{
+    /* Field addition is XOR; out-of-range operands yield gf->size. */
+    if ((a < gf->size) && (b < gf->size)) {
+        return a ^ b;
+    }
+    return gf->size;
+}
+
+uint32_t
+ec_gf_mul(ec_gf_t *gf, uint32_t a, uint32_t b)
+{
+    /* Product via the log/pow tables; gf->size flags an invalid operand. */
+    if ((a >= gf->size) || (b >= gf->size)) {
+        return gf->size;
+    }
+    if ((a == 0) || (b == 0)) {
+        return 0;
+    }
+
+    return gf->pow[gf->log[a] + gf->log[b]];
+}
+
+uint32_t
+ec_gf_div(ec_gf_t *gf, uint32_t a, uint32_t b)
+{
+    /* Quotient a / b through the log/pow tables. gf->size is returned for
+     * operands outside the field and for division by zero. */
+    if ((a >= gf->size) || (b >= gf->size) || (b == 0)) {
+        return gf->size;
+    }
+
+    if (a == 0) {
+        return 0;
+    }
+
+    /* The (size - 1) bias is added before log[b] is subtracted, which
+     * presumably keeps the unsigned index from wrapping. */
+    return gf->pow[gf->size - 1 + gf->log[a] - gf->log[b]];
+}
+
+uint32_t
+ec_gf_exp(ec_gf_t *gf, uint32_t a, uint32_t b)
+{
+    /* Raises a to the power b by square-and-multiply. gf->size is returned
+     * when a is outside the field or when both a and b are zero. */
+    uint32_t result = 1;
+
+    if ((a >= gf->size) || ((a == 0) && (b == 0))) {
+        return gf->size;
+    }
+
+    for (; b != 0; b >>= 1) {
+        if ((b & 1) != 0) {
+            result = ec_gf_mul(gf, result, a);
+        }
+        a = ec_gf_mul(gf, a, a);
+    }
+
+    return result;
+}
diff --git a/xlators/cluster/ec/src/ec-galois.h b/xlators/cluster/ec/src/ec-galois.h
new file mode 100644
index 00000000000..ed55d53e419
--- /dev/null
+++ b/xlators/cluster/ec/src/ec-galois.h
@@ -0,0 +1,32 @@
+/*
+  Copyright (c) 2015 DataLab, s.l. <http://www.datalab.es>
+  This file is part of GlusterFS.
+
+  This file is licensed to you under your choice of the GNU Lesser
+  General Public License, version 3 or any later version (LGPLv3 or
+  later), or the GNU General Public License, version 2 (GPLv2), in all
+  cases as published by the Free Software Foundation.
+*/
+
+#ifndef __EC_GALOIS_H__
+#define __EC_GALOIS_H__
+
+#include <inttypes.h>
+
+#include "ec-types.h"
+
+ec_gf_t *
+ec_gf_prepare(uint32_t bits, uint32_t mod); /* bits must be 8 */
+void
+ec_gf_destroy(ec_gf_t *gf);
+/* Arithmetic helpers: each returns gf->size to flag an invalid request. */
+uint32_t
+ec_gf_add(ec_gf_t *gf, uint32_t a, uint32_t b);
+uint32_t
+ec_gf_mul(ec_gf_t *gf, uint32_t a, uint32_t b);
+uint32_t
+ec_gf_div(ec_gf_t *gf, uint32_t a, uint32_t b);
+uint32_t
+ec_gf_exp(ec_gf_t *gf, uint32_t a, uint32_t b);
+
+#endif /* __EC_GALOIS_H__ */
diff --git a/xlators/cluster/ec/src/ec-generic.c b/xlators/cluster/ec/src/ec-generic.c
new file mode 100644
index 00000000000..884deb93669
--- /dev/null
+++ b/xlators/cluster/ec/src/ec-generic.c
@@ -0,0 +1,1591 @@
+/*
+  Copyright (c) 2012-2014 DataLab, s.l. <http://www.datalab.es>
+  This file is part of GlusterFS.
+
+  This file is licensed to you under your choice of the GNU Lesser
+  General Public License, version 3 or any later version (LGPLv3 or
+  later), or the GNU General Public License, version 2 (GPLv2), in all
+  cases as published by the Free Software Foundation.
+*/
+
+#include <glusterfs/byte-order.h>
+
+#include "ec.h"
+#include "ec-messages.h"
+#include "ec-helpers.h"
+#include "ec-common.h"
+#include "ec-combine.h"
+#include "ec-fops.h"
+
+/* FOP: flush */
+
+/*
+ * Callback run for each flush answer. 'cookie' carries the index that was
+ * passed when the request was wound. The answer is recorded in a newly
+ * allocated ec_cbk_data_t (together with a reference to 'xdata', if any) and
+ * handed to ec_combine(). ec_complete() is called whenever the fop could be
+ * recovered from frame->local. Always returns 0.
+ */
+int32_t
+ec_flush_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret,
+             int32_t op_errno, dict_t *xdata)
+{
+    int32_t child = (int32_t)(uintptr_t)cookie;
+    ec_fop_data_t *fop = NULL;
+    ec_cbk_data_t *answer = NULL;
+
+    VALIDATE_OR_GOTO(this, out);
+    GF_VALIDATE_OR_GOTO(this->name, frame, out);
+    GF_VALIDATE_OR_GOTO(this->name, frame->local, out);
+    GF_VALIDATE_OR_GOTO(this->name, this->private, out);
+
+    fop = frame->local;
+
+    ec_trace("CBK", fop, "idx=%d, frame=%p, op_ret=%d, op_errno=%d", child,
+             frame, op_ret, op_errno);
+
+    answer = ec_cbk_data_allocate(frame, this, fop, GF_FOP_FLUSH, child,
+                                  op_ret, op_errno);
+    if (answer == NULL) {
+        goto out;
+    }
+
+    if (xdata != NULL) {
+        answer->xdata = dict_ref(xdata);
+        if (answer->xdata == NULL) {
+            gf_msg(this->name, GF_LOG_ERROR, 0, EC_MSG_DICT_REF_FAIL,
+                   "Failed to reference a "
+                   "dictionary.");
+
+            goto out;
+        }
+    }
+
+    /* No per-fop combine function is needed for flush. */
+    ec_combine(answer, NULL);
+
+out:
+    if (fop != NULL) {
+        ec_complete(fop);
+    }
+
+    return 0;
+}
+
+/*
+ * Winds the flush request of 'fop' to the subvolume stored at
+ * ec->xl_list[idx]. The index is passed as the wind cookie, which is how
+ * ec_flush_cbk learns which subvolume produced each answer.
+ */
+void
+ec_wind_flush(ec_t *ec, ec_fop_data_t *fop, int32_t idx)
+{
+    ec_trace("WIND", fop, "idx=%d", idx);
+
+    STACK_WIND_COOKIE(fop->frame, ec_flush_cbk, (void *)(uintptr_t)idx,
+                      ec->xl_list[idx], ec->xl_list[idx]->fops->flush, fop->fd,
+                      fop->xdata);
+}
+
+/*
+ * State handler for flush: performs the work of 'state' and returns the next
+ * state.
+ *
+ *   INIT / LOCK      prepare a lock on fop->fd (EC_RANGE_FULL) and request it
+ *   DISPATCH         ec_flush_size_version()
+ *   DELAYED_START    ec_dispatch_all()
+ *   PREPARE_ANSWER   ec_fop_prepare_answer()
+ *   REPORT           pass fop->answer to the caller's flush callback
+ *   LOCK_REUSE       ec_lock_reuse()
+ *   UNLOCK           ec_unlock(), then EC_STATE_END
+ *
+ * Negated states up to REPORT invoke the callback with op_ret -1 and
+ * fop->error. NOTE(review): presumably the caller of this handler negates
+ * the state once the fop has failed -- confirm in ec_manager.
+ */
+int32_t
+ec_manager_flush(ec_fop_data_t *fop, int32_t state)
+{
+    ec_cbk_data_t *cbk;
+
+    switch (state) {
+        case EC_STATE_INIT:
+        case EC_STATE_LOCK:
+            ec_lock_prepare_fd(fop, fop->fd, 0, 0, EC_RANGE_FULL);
+            ec_lock(fop);
+
+            return EC_STATE_DISPATCH;
+
+        case EC_STATE_DISPATCH:
+            ec_flush_size_version(fop);
+
+            return EC_STATE_DELAYED_START;
+
+        case EC_STATE_DELAYED_START:
+            ec_dispatch_all(fop);
+
+            return EC_STATE_PREPARE_ANSWER;
+
+        case EC_STATE_PREPARE_ANSWER:
+            ec_fop_prepare_answer(fop, _gf_false);
+
+            return EC_STATE_REPORT;
+
+        case EC_STATE_REPORT:
+            cbk = fop->answer;
+
+            GF_ASSERT(cbk != NULL);
+
+            if (fop->cbks.flush != NULL) {
+                fop->cbks.flush(fop->req_frame, fop, fop->xl, cbk->op_ret,
+                                cbk->op_errno, cbk->xdata);
+            }
+
+            return EC_STATE_LOCK_REUSE;
+
+        /* Failure path: report fop->error instead of an answer. */
+        case -EC_STATE_INIT:
+        case -EC_STATE_LOCK:
+        case -EC_STATE_DELAYED_START:
+        case -EC_STATE_DISPATCH:
+        case -EC_STATE_PREPARE_ANSWER:
+        case -EC_STATE_REPORT:
+            GF_ASSERT(fop->error != 0);
+
+            if (fop->cbks.flush != NULL) {
+                fop->cbks.flush(fop->req_frame, fop, fop->xl, -1, fop->error,
+                                NULL);
+            }
+
+            return EC_STATE_LOCK_REUSE;
+
+        /* Lock handling is identical on the success and failure paths. */
+        case -EC_STATE_LOCK_REUSE:
+        case EC_STATE_LOCK_REUSE:
+            ec_lock_reuse(fop);
+
+            return EC_STATE_UNLOCK;
+
+        case -EC_STATE_UNLOCK:
+        case EC_STATE_UNLOCK:
+            ec_unlock(fop);
+
+            return EC_STATE_END;
+
+        default:
+            gf_msg(fop->xl->name, GF_LOG_ERROR, EINVAL, EC_MSG_UNHANDLED_STATE,
+                   "Unhandled state %d for %s", state, ec_fop_name(fop->id));
+
+            return EC_STATE_END;
+    }
+}
+
+/*
+ * Compares the bad_version stored in the fd context with the one stored in
+ * the context of the fd's inode. A missing context counts as version 0.
+ *
+ * Returns EBADF when the fd's value is lower than the inode's, 0 otherwise.
+ */
+static int32_t
+ec_validate_fd(fd_t *fd, xlator_t *xl)
+{
+    ec_fd_t *fctx = NULL;
+    ec_inode_t *ictx = NULL;
+    uint64_t fd_bad = 0;
+    uint64_t inode_bad = 0;
+
+    /* Each value is sampled under its own lock; the locks are never held
+     * at the same time. */
+    LOCK(&fd->lock);
+    fctx = __ec_fd_get(fd, xl);
+    if (fctx != NULL) {
+        fd_bad = fctx->bad_version;
+    }
+    UNLOCK(&fd->lock);
+
+    LOCK(&fd->inode->lock);
+    ictx = __ec_inode_get(fd->inode, xl);
+    if (ictx != NULL) {
+        inode_bad = ictx->bad_version;
+    }
+    UNLOCK(&fd->inode->lock);
+
+    return (fd_bad < inode_bad) ? EBADF : 0;
+}
+
+/*
+ * Entry point of the flush operation.
+ *
+ * When 'fd' is given it is first checked with ec_validate_fd(); a failure is
+ * logged and reported through 'func' without creating a fop. Otherwise a fop
+ * is allocated, references to 'fd' and 'xdata' are stored in it, and it is
+ * handed to ec_manager(). If no fop could be created, 'func' is called
+ * directly with op_ret -1 and the error code.
+ */
+void
+ec_flush(call_frame_t *frame, xlator_t *this, uintptr_t target,
+         uint32_t fop_flags, fop_flush_cbk_t func, void *data, fd_t *fd,
+         dict_t *xdata)
+{
+    ec_cbk_t callback = {.flush = func};
+    ec_fop_data_t *fop = NULL;
+    int32_t error = ENOMEM;
+
+    gf_msg_trace("ec", 0, "EC(FLUSH) %p", frame);
+
+    VALIDATE_OR_GOTO(this, out);
+    GF_VALIDATE_OR_GOTO(this->name, frame, out);
+    GF_VALIDATE_OR_GOTO(this->name, this->private, out);
+
+    if (fd) {
+        error = ec_validate_fd(fd, this);
+        if (error) {
+            gf_msg(this->name, GF_LOG_ERROR, EBADF, EC_MSG_FD_BAD,
+                   "Failing %s on %s", gf_fop_list[GF_FOP_FLUSH],
+                   fd->inode ? uuid_utoa(fd->inode->gfid) : "");
+            goto out;
+        }
+
+        /* A successful validation left 'error' at 0. Restore the default so
+         * that the allocation/reference failures below are not reported as
+         * "failed with errno 0" (or treated as success by ec_manager). */
+        error = ENOMEM;
+    }
+
+    fop = ec_fop_data_allocate(frame, this, GF_FOP_FLUSH, 0, target, fop_flags,
+                               ec_wind_flush, ec_manager_flush, callback, data);
+    if (fop == NULL) {
+        goto out;
+    }
+
+    fop->use_fd = 1;
+
+    if (fd != NULL) {
+        fop->fd = fd_ref(fd);
+        if (fop->fd == NULL) {
+            gf_msg(this->name, GF_LOG_ERROR, 0, EC_MSG_FILE_DESC_REF_FAIL,
+                   "Failed to reference a "
+                   "file descriptor.");
+
+            goto out;
+        }
+    }
+    if (xdata != NULL) {
+        fop->xdata = dict_ref(xdata);
+        if (fop->xdata == NULL) {
+            gf_msg(this->name, GF_LOG_ERROR, 0, EC_MSG_DICT_REF_FAIL,
+                   "Failed to reference a "
+                   "dictionary.");
+
+            goto out;
+        }
+    }
+
+    error = 0;
+
+out:
+    if (fop != NULL) {
+        ec_manager(fop, error);
+    } else {
+        func(frame, NULL, this, -1, error, NULL);
+    }
+}
+
+/* FOP: fsync */
+
+/*
+ * Combine function for fsync answers: merges the two iatt entries of 'src'
+ * into 'dst' with ec_iatt_combine(). Returns 1 when they could be combined
+ * and 0 (after logging a notice) when they mismatch.
+ */
+int32_t
+ec_combine_fsync(ec_fop_data_t *fop, ec_cbk_data_t *dst, ec_cbk_data_t *src)
+{
+    if (ec_iatt_combine(fop, dst->iatt, src->iatt, 2)) {
+        return 1;
+    }
+
+    gf_msg(fop->xl->name, GF_LOG_NOTICE, 0, EC_MSG_IATT_MISMATCH,
+           "Mismatching iatt in "
+           "answers of 'GF_FOP_FSYNC'");
+
+    return 0;
+}
+
+/*
+ * Callback run for each fsync answer. 'cookie' carries the index that was
+ * passed when the request was wound. On success (op_ret >= 0) the pre and
+ * post iatt are copied into the answer; a reference to 'xdata' is kept when
+ * present. The answer is then handed to ec_combine() with ec_combine_fsync.
+ * ec_complete() is called whenever the fop could be recovered from
+ * frame->local. Always returns 0.
+ */
+int32_t
+ec_fsync_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret,
+             int32_t op_errno, struct iatt *prebuf, struct iatt *postbuf,
+             dict_t *xdata)
+{
+    int32_t child = (int32_t)(uintptr_t)cookie;
+    ec_fop_data_t *fop = NULL;
+    ec_cbk_data_t *answer = NULL;
+
+    VALIDATE_OR_GOTO(this, out);
+    GF_VALIDATE_OR_GOTO(this->name, frame, out);
+    GF_VALIDATE_OR_GOTO(this->name, frame->local, out);
+    GF_VALIDATE_OR_GOTO(this->name, this->private, out);
+
+    fop = frame->local;
+
+    ec_trace("CBK", fop, "idx=%d, frame=%p, op_ret=%d, op_errno=%d", child,
+             frame, op_ret, op_errno);
+
+    answer = ec_cbk_data_allocate(frame, this, fop, GF_FOP_FSYNC, child,
+                                  op_ret, op_errno);
+    if (answer == NULL) {
+        goto out;
+    }
+
+    /* The iatt buffers are only copied from successful answers. */
+    if ((op_ret >= 0) && (prebuf != NULL)) {
+        answer->iatt[0] = *prebuf;
+    }
+    if ((op_ret >= 0) && (postbuf != NULL)) {
+        answer->iatt[1] = *postbuf;
+    }
+
+    if (xdata != NULL) {
+        answer->xdata = dict_ref(xdata);
+        if (answer->xdata == NULL) {
+            gf_msg(this->name, GF_LOG_ERROR, 0, EC_MSG_DICT_REF_FAIL,
+                   "Failed to reference a "
+                   "dictionary.");
+
+            goto out;
+        }
+    }
+
+    ec_combine(answer, ec_combine_fsync);
+
+out:
+    if (fop != NULL) {
+        ec_complete(fop);
+    }
+
+    return 0;
+}
+
+/*
+ * Winds the fsync request of 'fop' to the subvolume stored at
+ * ec->xl_list[idx], passing fop->fd, fop->int32 (where ec_fsync stored the
+ * datasync flag) and fop->xdata. The index is passed as the wind cookie so
+ * ec_fsync_cbk can tell the answers apart.
+ */
+void
+ec_wind_fsync(ec_t *ec, ec_fop_data_t *fop, int32_t idx)
+{
+    ec_trace("WIND", fop, "idx=%d", idx);
+
+    STACK_WIND_COOKIE(fop->frame, ec_fsync_cbk, (void *)(uintptr_t)idx,
+                      ec->xl_list[idx], ec->xl_list[idx]->fops->fsync, fop->fd,
+                      fop->int32, fop->xdata);
+}
+
+/*
+ * State handler for fsync: performs the work of 'state' and returns the next
+ * state.
+ *
+ *   INIT / LOCK      prepare a lock on fop->fd (EC_QUERY_INFO, EC_RANGE_FULL)
+ *                    and request it
+ *   DISPATCH         ec_flush_size_version()
+ *   DELAYED_START    ec_dispatch_all()
+ *   PREPARE_ANSWER   rebuild the iatt pair and overwrite both sizes with the
+ *                    value from ec_get_inode_size()
+ *   REPORT           pass fop->answer to the caller's fsync callback
+ *   LOCK_REUSE       ec_lock_reuse()
+ *   UNLOCK           ec_unlock(), then EC_STATE_END
+ *
+ * Negated states up to REPORT invoke the callback with op_ret -1 and
+ * fop->error. NOTE(review): presumably the caller of this handler negates
+ * the state once the fop has failed -- confirm in ec_manager.
+ */
+int32_t
+ec_manager_fsync(ec_fop_data_t *fop, int32_t state)
+{
+    ec_cbk_data_t *cbk;
+
+    switch (state) {
+        case EC_STATE_INIT:
+        case EC_STATE_LOCK:
+            ec_lock_prepare_fd(fop, fop->fd, EC_QUERY_INFO, 0, EC_RANGE_FULL);
+            ec_lock(fop);
+
+            return EC_STATE_DISPATCH;
+
+        case EC_STATE_DISPATCH:
+            ec_flush_size_version(fop);
+
+            return EC_STATE_DELAYED_START;
+
+        case EC_STATE_DELAYED_START:
+            ec_dispatch_all(fop);
+
+            return EC_STATE_PREPARE_ANSWER;
+
+        case EC_STATE_PREPARE_ANSWER:
+            cbk = ec_fop_prepare_answer(fop, _gf_false);
+            if (cbk != NULL) {
+                ec_iatt_rebuild(fop->xl->private, cbk->iatt, 2, cbk->count);
+
+                /* This shouldn't fail because we have the inode locked. */
+                /* NOTE(review): the call that fills iatt[0].ia_size is the
+                 * operand of GF_ASSERT; if that macro can ever expand to
+                 * nothing the size would not be fetched -- confirm. */
+                GF_ASSERT(ec_get_inode_size(fop, fop->fd->inode,
+                                            &cbk->iatt[0].ia_size));
+                cbk->iatt[1].ia_size = cbk->iatt[0].ia_size;
+            }
+
+            return EC_STATE_REPORT;
+
+        case EC_STATE_REPORT:
+            cbk = fop->answer;
+
+            GF_ASSERT(cbk != NULL);
+
+            if (fop->cbks.fsync != NULL) {
+                fop->cbks.fsync(fop->req_frame, fop, fop->xl, cbk->op_ret,
+                                cbk->op_errno, &cbk->iatt[0], &cbk->iatt[1],
+                                cbk->xdata);
+            }
+
+            return EC_STATE_LOCK_REUSE;
+
+        /* Failure path: report fop->error instead of an answer. */
+        case -EC_STATE_INIT:
+        case -EC_STATE_LOCK:
+        case -EC_STATE_DISPATCH:
+        case -EC_STATE_PREPARE_ANSWER:
+        case -EC_STATE_REPORT:
+        case -EC_STATE_DELAYED_START:
+            GF_ASSERT(fop->error != 0);
+
+            if (fop->cbks.fsync != NULL) {
+                fop->cbks.fsync(fop->req_frame, fop, fop->xl, -1, fop->error,
+                                NULL, NULL, NULL);
+            }
+
+            return EC_STATE_LOCK_REUSE;
+
+        /* Lock handling is identical on the success and failure paths. */
+        case -EC_STATE_LOCK_REUSE:
+        case EC_STATE_LOCK_REUSE:
+            ec_lock_reuse(fop);
+
+            return EC_STATE_UNLOCK;
+
+        case -EC_STATE_UNLOCK:
+        case EC_STATE_UNLOCK:
+            ec_unlock(fop);
+
+            return EC_STATE_END;
+
+        default:
+            gf_msg(fop->xl->name, GF_LOG_ERROR, EINVAL, EC_MSG_UNHANDLED_STATE,
+                   "Unhandled state %d for %s", state, ec_fop_name(fop->id));
+
+            return EC_STATE_END;
+    }
+}
+
+/*
+ * Entry point of the fsync operation.
+ *
+ * When 'fd' is given it is first checked with ec_validate_fd(); a failure is
+ * logged and reported through 'func' without creating a fop. Otherwise a fop
+ * is allocated, 'datasync' is stored in fop->int32, references to 'fd' and
+ * 'xdata' are stored in the fop, and it is handed to ec_manager(). If no fop
+ * could be created, 'func' is called directly with op_ret -1 and the error
+ * code.
+ */
+void
+ec_fsync(call_frame_t *frame, xlator_t *this, uintptr_t target,
+         uint32_t fop_flags, fop_fsync_cbk_t func, void *data, fd_t *fd,
+         int32_t datasync, dict_t *xdata)
+{
+    ec_cbk_t callback = {.fsync = func};
+    ec_fop_data_t *fop = NULL;
+    int32_t error = ENOMEM;
+
+    gf_msg_trace("ec", 0, "EC(FSYNC) %p", frame);
+
+    VALIDATE_OR_GOTO(this, out);
+    GF_VALIDATE_OR_GOTO(this->name, frame, out);
+    GF_VALIDATE_OR_GOTO(this->name, this->private, out);
+
+    if (fd) {
+        error = ec_validate_fd(fd, this);
+        if (error) {
+            gf_msg(this->name, GF_LOG_ERROR, EBADF, EC_MSG_FD_BAD,
+                   "Failing %s on %s", gf_fop_list[GF_FOP_FSYNC],
+                   fd->inode ? uuid_utoa(fd->inode->gfid) : "");
+            goto out;
+        }
+
+        /* A successful validation left 'error' at 0. Restore the default so
+         * that the allocation/reference failures below are not reported as
+         * "failed with errno 0" (or treated as success by ec_manager). */
+        error = ENOMEM;
+    }
+
+    fop = ec_fop_data_allocate(frame, this, GF_FOP_FSYNC, 0, target, fop_flags,
+                               ec_wind_fsync, ec_manager_fsync, callback, data);
+    if (fop == NULL) {
+        goto out;
+    }
+
+    fop->use_fd = 1;
+
+    fop->int32 = datasync;
+
+    if (fd != NULL) {
+        fop->fd = fd_ref(fd);
+        if (fop->fd == NULL) {
+            gf_msg(this->name, GF_LOG_ERROR, 0, EC_MSG_FILE_DESC_REF_FAIL,
+                   "Failed to reference a "
+                   "file descriptor.");
+
+            goto out;
+        }
+    }
+    if (xdata != NULL) {
+        fop->xdata = dict_ref(xdata);
+        if (fop->xdata == NULL) {
+            gf_msg(this->name, GF_LOG_ERROR, 0, EC_MSG_DICT_REF_FAIL,
+                   "Failed to reference a "
+                   "dictionary.");
+
+            goto out;
+        }
+    }
+
+    error = 0;
+
+out:
+    if (fop != NULL) {
+        ec_manager(fop, error);
+    } else {
+        func(frame, NULL, this, -1, error, NULL, NULL, NULL);
+    }
+}
+
+/* FOP: fsyncdir */
+
+/*
+ * Callback run for each fsyncdir answer. 'cookie' carries the index that was
+ * passed when the request was wound. The answer is recorded in a newly
+ * allocated ec_cbk_data_t (together with a reference to 'xdata', if any) and
+ * handed to ec_combine(). ec_complete() is called whenever the fop could be
+ * recovered from frame->local. Always returns 0.
+ */
+int32_t
+ec_fsyncdir_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+                int32_t op_ret, int32_t op_errno, dict_t *xdata)
+{
+    int32_t child = (int32_t)(uintptr_t)cookie;
+    ec_fop_data_t *fop = NULL;
+    ec_cbk_data_t *answer = NULL;
+
+    VALIDATE_OR_GOTO(this, out);
+    GF_VALIDATE_OR_GOTO(this->name, frame, out);
+    GF_VALIDATE_OR_GOTO(this->name, frame->local, out);
+    GF_VALIDATE_OR_GOTO(this->name, this->private, out);
+
+    fop = frame->local;
+
+    ec_trace("CBK", fop, "idx=%d, frame=%p, op_ret=%d, op_errno=%d", child,
+             frame, op_ret, op_errno);
+
+    answer = ec_cbk_data_allocate(frame, this, fop, GF_FOP_FSYNCDIR, child,
+                                  op_ret, op_errno);
+    if (answer == NULL) {
+        goto out;
+    }
+
+    if (xdata != NULL) {
+        answer->xdata = dict_ref(xdata);
+        if (answer->xdata == NULL) {
+            gf_msg(this->name, GF_LOG_ERROR, 0, EC_MSG_DICT_REF_FAIL,
+                   "Failed to reference a "
+                   "dictionary.");
+
+            goto out;
+        }
+    }
+
+    /* No per-fop combine function is needed for fsyncdir. */
+    ec_combine(answer, NULL);
+
+out:
+    if (fop != NULL) {
+        ec_complete(fop);
+    }
+
+    return 0;
+}
+
+/*
+ * Winds the fsyncdir request of 'fop' to the subvolume stored at
+ * ec->xl_list[idx], passing fop->fd, fop->int32 (where ec_fsyncdir stored
+ * the datasync flag) and fop->xdata. The index is passed as the wind cookie
+ * so ec_fsyncdir_cbk can tell the answers apart.
+ */
+void
+ec_wind_fsyncdir(ec_t *ec, ec_fop_data_t *fop, int32_t idx)
+{
+    ec_trace("WIND", fop, "idx=%d", idx);
+
+    STACK_WIND_COOKIE(fop->frame, ec_fsyncdir_cbk, (void *)(uintptr_t)idx,
+                      ec->xl_list[idx], ec->xl_list[idx]->fops->fsyncdir,
+                      fop->fd, fop->int32, fop->xdata);
+}
+
+/*
+ * State handler for fsyncdir: performs the work of 'state' and returns the
+ * next state.
+ *
+ *   INIT / LOCK      prepare a lock on fop->fd (EC_RANGE_FULL) and request it
+ *   DISPATCH         ec_flush_size_version()
+ *   DELAYED_START    ec_dispatch_all()
+ *   PREPARE_ANSWER   ec_fop_prepare_answer()
+ *   REPORT           pass fop->answer to the caller's fsyncdir callback
+ *   LOCK_REUSE       ec_lock_reuse()
+ *   UNLOCK           ec_unlock(), then EC_STATE_END
+ *
+ * Negated states up to REPORT invoke the callback with op_ret -1 and
+ * fop->error. NOTE(review): presumably the caller of this handler negates
+ * the state once the fop has failed -- confirm in ec_manager.
+ */
+int32_t
+ec_manager_fsyncdir(ec_fop_data_t *fop, int32_t state)
+{
+    ec_cbk_data_t *cbk;
+
+    switch (state) {
+        case EC_STATE_INIT:
+        case EC_STATE_LOCK:
+            ec_lock_prepare_fd(fop, fop->fd, 0, 0, EC_RANGE_FULL);
+            ec_lock(fop);
+
+            return EC_STATE_DISPATCH;
+
+        case EC_STATE_DISPATCH:
+            ec_flush_size_version(fop);
+
+            return EC_STATE_DELAYED_START;
+
+        case EC_STATE_DELAYED_START:
+            ec_dispatch_all(fop);
+
+            return EC_STATE_PREPARE_ANSWER;
+
+        case EC_STATE_PREPARE_ANSWER:
+            ec_fop_prepare_answer(fop, _gf_false);
+
+            return EC_STATE_REPORT;
+
+        case EC_STATE_REPORT:
+            cbk = fop->answer;
+
+            GF_ASSERT(cbk != NULL);
+
+            if (fop->cbks.fsyncdir != NULL) {
+                fop->cbks.fsyncdir(fop->req_frame, fop, fop->xl, cbk->op_ret,
+                                   cbk->op_errno, cbk->xdata);
+            }
+
+            return EC_STATE_LOCK_REUSE;
+
+        /* Failure path: report fop->error instead of an answer. */
+        case -EC_STATE_INIT:
+        case -EC_STATE_LOCK:
+        case -EC_STATE_DISPATCH:
+        case -EC_STATE_PREPARE_ANSWER:
+        case -EC_STATE_REPORT:
+        case -EC_STATE_DELAYED_START:
+            GF_ASSERT(fop->error != 0);
+
+            if (fop->cbks.fsyncdir != NULL) {
+                fop->cbks.fsyncdir(fop->req_frame, fop, fop->xl, -1, fop->error,
+                                   NULL);
+            }
+
+            return EC_STATE_LOCK_REUSE;
+
+        /* Lock handling is identical on the success and failure paths. */
+        case -EC_STATE_LOCK_REUSE:
+        case EC_STATE_LOCK_REUSE:
+            ec_lock_reuse(fop);
+
+            return EC_STATE_UNLOCK;
+
+        case -EC_STATE_UNLOCK:
+        case EC_STATE_UNLOCK:
+            ec_unlock(fop);
+
+            return EC_STATE_END;
+
+        default:
+            gf_msg(fop->xl->name, GF_LOG_ERROR, EINVAL, EC_MSG_UNHANDLED_STATE,
+                   "Unhandled state %d for %s", state, ec_fop_name(fop->id));
+
+            return EC_STATE_END;
+    }
+}
+
+/*
+ * Entry point of the fsyncdir operation.
+ *
+ * Allocates a fop, stores 'datasync' in fop->int32 and references to 'fd'
+ * and 'xdata' in the fop, then hands it to ec_manager(). If no fop could be
+ * created, 'func' is called directly with op_ret -1 and ENOMEM.
+ */
+void
+ec_fsyncdir(call_frame_t *frame, xlator_t *this, uintptr_t target,
+            uint32_t fop_flags, fop_fsyncdir_cbk_t func, void *data, fd_t *fd,
+            int32_t datasync, dict_t *xdata)
+{
+    ec_cbk_t cbks = {.fsyncdir = func};
+    ec_fop_data_t *fop = NULL;
+    int32_t err = ENOMEM;
+
+    gf_msg_trace("ec", 0, "EC(FSYNCDIR) %p", frame);
+
+    VALIDATE_OR_GOTO(this, out);
+    GF_VALIDATE_OR_GOTO(this->name, frame, out);
+    GF_VALIDATE_OR_GOTO(this->name, this->private, out);
+
+    fop = ec_fop_data_allocate(frame, this, GF_FOP_FSYNCDIR, 0, target,
+                               fop_flags, ec_wind_fsyncdir, ec_manager_fsyncdir,
+                               cbks, data);
+    if (fop == NULL) {
+        goto out;
+    }
+
+    fop->use_fd = 1;
+    fop->int32 = datasync;
+
+    if (fd != NULL) {
+        fop->fd = fd_ref(fd);
+    }
+    if ((fd != NULL) && (fop->fd == NULL)) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, EC_MSG_FILE_DESC_REF_FAIL,
+               "Failed to reference a "
+               "file descriptor.");
+
+        goto out;
+    }
+
+    if (xdata != NULL) {
+        fop->xdata = dict_ref(xdata);
+    }
+    if ((xdata != NULL) && (fop->xdata == NULL)) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, EC_MSG_DICT_REF_FAIL,
+               "Failed to reference a "
+               "dictionary.");
+
+        goto out;
+    }
+
+    /* Everything is in place: let the manager start the operation. */
+    err = 0;
+
+out:
+    if (fop == NULL) {
+        func(frame, NULL, this, -1, err, NULL);
+    } else {
+        ec_manager(fop, err);
+    }
+}
+
+/* FOP: lookup */
+
+/*
+ * Post-processes a successful lookup answer:
+ *  - removes EC_XATTR_VERSION from the answer's xdata into cbk->version,
+ *  - updates fop->loc[0] with the answer's inode and iatt,
+ *  - replaces the version with the one cached in the inode context, if any,
+ *  - for regular files, saves the received ia_size in cbk->size, removes
+ *    EC_XATTR_SIZE from xdata into ia_size and finally prefers the size
+ *    cached in the inode context when there is one.
+ * Nothing is done for failed answers (op_ret < 0).
+ */
+void
+ec_lookup_rebuild(ec_t *ec, ec_fop_data_t *fop, ec_cbk_data_t *cbk)
+{
+    ec_inode_t *inode_ctx = NULL;
+    uint64_t cached_size = 0;
+    int32_t size_known = 0;
+    int32_t ret;
+
+    if (cbk->op_ret < 0) {
+        return;
+    }
+
+    ec_dict_del_array(cbk->xdata, EC_XATTR_VERSION, cbk->version,
+                      EC_VERSION_SIZE);
+
+    ret = ec_loc_update(fop->xl, &fop->loc[0], cbk->inode, &cbk->iatt[0]);
+    if (ec_cbk_set_error(cbk, -ret, _gf_true)) {
+        return;
+    }
+
+    /* Take a snapshot of the cached values while holding the inode lock. */
+    LOCK(&cbk->inode->lock);
+
+    inode_ctx = __ec_inode_get(cbk->inode, fop->xl);
+    if ((inode_ctx != NULL) && inode_ctx->have_version) {
+        cbk->version[0] = inode_ctx->post_version[0];
+        cbk->version[1] = inode_ctx->post_version[1];
+    }
+    if ((inode_ctx != NULL) && inode_ctx->have_size) {
+        cached_size = inode_ctx->post_size;
+        size_known = 1;
+    }
+
+    UNLOCK(&cbk->inode->lock);
+
+    if (cbk->iatt[0].ia_type != IA_IFREG) {
+        return;
+    }
+
+    cbk->size = cbk->iatt[0].ia_size;
+    ec_dict_del_number(cbk->xdata, EC_XATTR_SIZE, &cbk->iatt[0].ia_size);
+    if (size_known) {
+        cbk->iatt[0].ia_size = cached_size;
+    }
+}
+
+/*
+ * Combine function for lookup answers: merges the two iatt entries of 'src'
+ * into 'dst' with ec_iatt_combine(). Returns 1 when they could be combined
+ * and 0 (after a debug log) when they mismatch.
+ */
+int32_t
+ec_combine_lookup(ec_fop_data_t *fop, ec_cbk_data_t *dst, ec_cbk_data_t *src)
+{
+    if (ec_iatt_combine(fop, dst->iatt, src->iatt, 2)) {
+        return 1;
+    }
+
+    gf_msg(fop->xl->name, GF_LOG_DEBUG, 0, EC_MSG_IATT_MISMATCH,
+           "Mismatching iatt in "
+           "answers of 'GF_FOP_LOOKUP'");
+
+    return 0;
+}
+
+/*
+ * Callback run for each lookup answer. 'cookie' carries the index that was
+ * passed when the request was wound. On success (op_ret >= 0) a reference to
+ * the inode and copies of both iatt buffers are stored in the answer. When
+ * 'xdata' is present a reference is kept and EC_XATTR_DIRTY is removed from
+ * it. The answer is then handed to ec_combine() with ec_combine_lookup.
+ * ec_complete() is called whenever the fop could be recovered from
+ * frame->local. Always returns 0.
+ */
+int32_t
+ec_lookup_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret,
+              int32_t op_errno, inode_t *inode, struct iatt *buf, dict_t *xdata,
+              struct iatt *postparent)
+{
+    int32_t child = (int32_t)(uintptr_t)cookie;
+    uint64_t dirty[2] = {0};
+    ec_fop_data_t *fop = NULL;
+    ec_cbk_data_t *answer = NULL;
+
+    VALIDATE_OR_GOTO(this, out);
+    GF_VALIDATE_OR_GOTO(this->name, frame, out);
+    GF_VALIDATE_OR_GOTO(this->name, frame->local, out);
+    GF_VALIDATE_OR_GOTO(this->name, this->private, out);
+
+    fop = frame->local;
+
+    ec_trace("CBK", fop, "idx=%d, frame=%p, op_ret=%d, op_errno=%d", child,
+             frame, op_ret, op_errno);
+
+    answer = ec_cbk_data_allocate(frame, this, fop, GF_FOP_LOOKUP, child,
+                                  op_ret, op_errno);
+    if (answer == NULL) {
+        goto out;
+    }
+
+    if (op_ret >= 0) {
+        if (inode != NULL) {
+            answer->inode = inode_ref(inode);
+            if (answer->inode == NULL) {
+                gf_msg(this->name, GF_LOG_ERROR, 0, EC_MSG_INODE_REF_FAIL,
+                       "Failed to reference an inode.");
+
+                goto out;
+            }
+        }
+        if (buf != NULL) {
+            answer->iatt[0] = *buf;
+        }
+        if (postparent != NULL) {
+            answer->iatt[1] = *postparent;
+        }
+    }
+
+    if (xdata != NULL) {
+        answer->xdata = dict_ref(xdata);
+        if (answer->xdata == NULL) {
+            gf_msg(this->name, GF_LOG_ERROR, 0, EC_MSG_DICT_REF_FAIL,
+                   "Failed to reference a "
+                   "dictionary.");
+
+            goto out;
+        }
+        /* The value read into 'dirty' is not used afterwards. */
+        ec_dict_del_array(xdata, EC_XATTR_DIRTY, dirty, EC_VERSION_SIZE);
+    }
+
+    ec_combine(answer, ec_combine_lookup);
+
+out:
+    if (fop != NULL) {
+        ec_complete(fop);
+    }
+
+    return 0;
+}
+
+/*
+ * Winds the lookup request of 'fop' to the subvolume stored at
+ * ec->xl_list[idx], passing fop->loc[0] and fop->xdata. The index is passed
+ * as the wind cookie so ec_lookup_cbk can tell the answers apart.
+ */
+void
+ec_wind_lookup(ec_t *ec, ec_fop_data_t *fop, int32_t idx)
+{
+    ec_trace("WIND", fop, "idx=%d", idx);
+
+    STACK_WIND_COOKIE(fop->frame, ec_lookup_cbk, (void *)(uintptr_t)idx,
+                      ec->xl_list[idx], ec->xl_list[idx]->fops->lookup,
+                      &fop->loc[0], fop->xdata);
+}
+
+/*
+ * State handler for lookup: performs the work of 'state' and returns the
+ * next state. No lock states are used by this handler.
+ *
+ *   INIT             make sure fop->xdata exists and request EC_XATTR_SIZE,
+ *                    EC_XATTR_VERSION and EC_XATTR_DIRTY in it; on failure
+ *                    set fop->error and go to REPORT; otherwise fall through
+ *   DISPATCH         ec_dispatch_all()
+ *   PREPARE_ANSWER   pick an answer and rebuild its iatt/version/size data
+ *   REPORT           pass fop->answer to the caller's lookup callback
+ *
+ * Negated states invoke the callback with op_ret -1 and fop->error.
+ * NOTE(review): presumably the caller of this handler negates the state once
+ * fop->error is set -- confirm in ec_manager.
+ */
+int32_t
+ec_manager_lookup(ec_fop_data_t *fop, int32_t state)
+{
+    ec_cbk_data_t *cbk;
+    int32_t err;
+
+    switch (state) {
+        case EC_STATE_INIT:
+            if (fop->xdata == NULL) {
+                fop->xdata = dict_new();
+                if (fop->xdata == NULL) {
+                    gf_msg(fop->xl->name, GF_LOG_ERROR, ENOMEM,
+                           EC_MSG_LOOKUP_REQ_PREP_FAIL,
+                           "Unable to prepare "
+                           "lookup request");
+
+                    fop->error = ENOMEM;
+
+                    return EC_STATE_REPORT;
+                }
+            } else {
+                /*TODO: To be handled once we have 'syndromes' */
+                dict_del(fop->xdata, GF_CONTENT_KEY);
+            }
+            /* Stop at the first dict_set_uint64() failure. */
+            err = dict_set_uint64(fop->xdata, EC_XATTR_SIZE, 0);
+            if (err == 0) {
+                err = dict_set_uint64(fop->xdata, EC_XATTR_VERSION, 0);
+            }
+            if (err == 0) {
+                err = dict_set_uint64(fop->xdata, EC_XATTR_DIRTY, 0);
+            }
+            if (err != 0) {
+                gf_msg(fop->xl->name, GF_LOG_ERROR, -err,
+                       EC_MSG_LOOKUP_REQ_PREP_FAIL,
+                       "Unable to prepare lookup "
+                       "request");
+
+                fop->error = -err;
+
+                return EC_STATE_REPORT;
+            }
+
+            /* Fall through */
+
+        case EC_STATE_DISPATCH:
+            ec_dispatch_all(fop);
+
+            return EC_STATE_PREPARE_ANSWER;
+
+        case EC_STATE_PREPARE_ANSWER:
+            /*
+             * Lookup happens without any lock, so there is a chance that it
+             * will have answers before modification happened and after
+             * modification happened in the same response. So choose the next
+             * best answer when the answers don't match for EC_MINIMUM_MIN
+             */
+
+            /* With no answer selected, take the first entry of cbk_list. */
+            if (!fop->answer && !list_empty(&fop->cbk_list)) {
+                fop->answer = list_entry(fop->cbk_list.next, ec_cbk_data_t,
+                                         list);
+            }
+
+            cbk = ec_fop_prepare_answer(fop, _gf_true);
+            if (cbk != NULL) {
+                ec_iatt_rebuild(fop->xl->private, cbk->iatt, 2, cbk->count);
+
+                ec_lookup_rebuild(fop->xl->private, fop, cbk);
+            }
+
+            return EC_STATE_REPORT;
+
+        case EC_STATE_REPORT:
+            cbk = fop->answer;
+
+            GF_ASSERT(cbk != NULL);
+
+            if (fop->cbks.lookup != NULL) {
+                fop->cbks.lookup(fop->req_frame, fop, fop->xl, cbk->op_ret,
+                                 cbk->op_errno, cbk->inode, &cbk->iatt[0],
+                                 cbk->xdata, &cbk->iatt[1]);
+            }
+
+            return EC_STATE_END;
+
+        /* Failure path: report fop->error instead of an answer. */
+        case -EC_STATE_INIT:
+        case -EC_STATE_DISPATCH:
+        case -EC_STATE_PREPARE_ANSWER:
+        case -EC_STATE_REPORT:
+            GF_ASSERT(fop->error != 0);
+
+            if (fop->cbks.lookup != NULL) {
+                fop->cbks.lookup(fop->req_frame, fop, fop->xl, -1, fop->error,
+                                 NULL, NULL, NULL, NULL);
+            }
+
+            return EC_STATE_END;
+
+        default:
+            gf_msg(fop->xl->name, GF_LOG_ERROR, EINVAL, EC_MSG_UNHANDLED_STATE,
+                   "Unhandled state %d for %s", state, ec_fop_name(fop->id));
+
+            return EC_STATE_END;
+    }
+}
+
+/*
+ * Entry point of the lookup operation.
+ *
+ * Allocates a fop with EC_FLAG_LOCK_SHARED, copies 'loc' into fop->loc[0]
+ * and stores the result of dict_copy_with_ref() on 'xdata' in the fop, then
+ * hands it to ec_manager(). If no fop could be created, 'func' is called
+ * directly with op_ret -1 and ENOMEM.
+ */
+void
+ec_lookup(call_frame_t *frame, xlator_t *this, uintptr_t target,
+          uint32_t fop_flags, fop_lookup_cbk_t func, void *data, loc_t *loc,
+          dict_t *xdata)
+{
+    ec_cbk_t cbks = {.lookup = func};
+    ec_fop_data_t *fop = NULL;
+    int32_t err = ENOMEM;
+
+    gf_msg_trace("ec", 0, "EC(LOOKUP) %p", frame);
+
+    VALIDATE_OR_GOTO(this, out);
+    GF_VALIDATE_OR_GOTO(this->name, frame, out);
+    GF_VALIDATE_OR_GOTO(this->name, this->private, out);
+
+    fop = ec_fop_data_allocate(frame, this, GF_FOP_LOOKUP, EC_FLAG_LOCK_SHARED,
+                               target, fop_flags, ec_wind_lookup,
+                               ec_manager_lookup, cbks, data);
+    if (fop == NULL) {
+        goto out;
+    }
+
+    if ((loc != NULL) && (loc_copy(&fop->loc[0], loc) != 0)) {
+        gf_msg(this->name, GF_LOG_ERROR, ENOMEM, EC_MSG_LOC_COPY_FAIL,
+               "Failed to copy a location.");
+
+        goto out;
+    }
+
+    if (xdata != NULL) {
+        fop->xdata = dict_copy_with_ref(xdata, NULL);
+        /* A failure here is not logged: the allocation functions involved
+         * are expected to have reported the memory problem already. */
+        if (fop->xdata == NULL) {
+            goto out;
+        }
+    }
+
+    err = 0;
+
+out:
+    if (fop == NULL) {
+        func(frame, NULL, this, -1, err, NULL, NULL, NULL, NULL);
+    } else {
+        ec_manager(fop, err);
+    }
+}
+
+/* FOP: statfs */
+
+/*
+ * Combine function for statfs answers: folds the statvfs data of 'src' into
+ * 'dst' through ec_statvfs_combine(). Always returns 1, so statfs answers
+ * are never reported as mismatching by this function.
+ */
+int32_t
+ec_combine_statfs(ec_fop_data_t *fop, ec_cbk_data_t *dst, ec_cbk_data_t *src)
+{
+    ec_statvfs_combine(&dst->statvfs, &src->statvfs);
+
+    return 1;
+}
+
+/*
+ * Callback run for each statfs answer. 'cookie' carries the index that was
+ * passed when the request was wound. On success (op_ret >= 0) the statvfs
+ * buffer is copied into the answer; a reference to 'xdata' is kept when
+ * present. The answer is then handed to ec_combine() with ec_combine_statfs.
+ * ec_complete() is called whenever the fop could be recovered from
+ * frame->local. Always returns 0.
+ */
+int32_t
+ec_statfs_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret,
+              int32_t op_errno, struct statvfs *buf, dict_t *xdata)
+{
+    int32_t child = (int32_t)(uintptr_t)cookie;
+    ec_fop_data_t *fop = NULL;
+    ec_cbk_data_t *answer = NULL;
+
+    VALIDATE_OR_GOTO(this, out);
+    GF_VALIDATE_OR_GOTO(this->name, frame, out);
+    GF_VALIDATE_OR_GOTO(this->name, frame->local, out);
+    GF_VALIDATE_OR_GOTO(this->name, this->private, out);
+
+    fop = frame->local;
+
+    ec_trace("CBK", fop, "idx=%d, frame=%p, op_ret=%d, op_errno=%d", child,
+             frame, op_ret, op_errno);
+
+    answer = ec_cbk_data_allocate(frame, this, fop, GF_FOP_STATFS, child,
+                                  op_ret, op_errno);
+    if (answer == NULL) {
+        goto out;
+    }
+
+    /* The buffer is only copied from successful answers. */
+    if ((op_ret >= 0) && (buf != NULL)) {
+        answer->statvfs = *buf;
+    }
+
+    if (xdata != NULL) {
+        answer->xdata = dict_ref(xdata);
+        if (answer->xdata == NULL) {
+            gf_msg(this->name, GF_LOG_ERROR, 0, EC_MSG_DICT_REF_FAIL,
+                   "Failed to reference a "
+                   "dictionary.");
+
+            goto out;
+        }
+    }
+
+    ec_combine(answer, ec_combine_statfs);
+
+out:
+    if (fop != NULL) {
+        ec_complete(fop);
+    }
+
+    return 0;
+}
+
+/*
+ * Winds the statfs request of 'fop' to the subvolume stored at
+ * ec->xl_list[idx], passing fop->loc[0] and fop->xdata. The index is passed
+ * as the wind cookie so ec_statfs_cbk can tell the answers apart.
+ */
+void
+ec_wind_statfs(ec_t *ec, ec_fop_data_t *fop, int32_t idx)
+{
+    ec_trace("WIND", fop, "idx=%d", idx);
+
+    STACK_WIND_COOKIE(fop->frame, ec_statfs_cbk, (void *)(uintptr_t)idx,
+                      ec->xl_list[idx], ec->xl_list[idx]->fops->statfs,
+                      &fop->loc[0], fop->xdata);
+}
+
+/*
+ * State handler for statfs: performs the work of 'state' and returns the
+ * next state. No lock states are used by this handler.
+ *
+ *   INIT / DISPATCH  ec_dispatch_all()
+ *   PREPARE_ANSWER   pick an answer and, unless "quota-deem-statfs" was read
+ *                    successfully as true from its xdata, multiply the block
+ *                    counters by ec->fragments
+ *   REPORT           pass fop->answer to the caller's statfs callback
+ *
+ * Negated states invoke the callback with op_ret -1 and fop->error.
+ * NOTE(review): presumably the caller of this handler negates the state once
+ * the fop has failed -- confirm in ec_manager.
+ */
+int32_t
+ec_manager_statfs(ec_fop_data_t *fop, int32_t state)
+{
+    ec_cbk_data_t *cbk = NULL;
+    gf_boolean_t deem_statfs_enabled = _gf_false;
+    int32_t err = 0;
+
+    switch (state) {
+        case EC_STATE_INIT:
+        case EC_STATE_DISPATCH:
+            ec_dispatch_all(fop);
+
+            return EC_STATE_PREPARE_ANSWER;
+
+        case EC_STATE_PREPARE_ANSWER:
+            cbk = ec_fop_prepare_answer(fop, _gf_true);
+            if (cbk != NULL) {
+                ec_t *ec = fop->xl->private;
+
+                /* Without xdata, err stays 0 and the flag stays false. */
+                if (cbk->xdata) {
+                    err = dict_get_int8(cbk->xdata, "quota-deem-statfs",
+                                        (int8_t *)&deem_statfs_enabled);
+                    /* A missing key (-ENOENT) is not recorded as an error. */
+                    if (err != -ENOENT) {
+                        ec_cbk_set_error(cbk, -err, _gf_true);
+                    }
+                }
+
+                if (err != 0 || deem_statfs_enabled == _gf_false) {
+                    cbk->statvfs.f_blocks *= ec->fragments;
+                    cbk->statvfs.f_bfree *= ec->fragments;
+                    cbk->statvfs.f_bavail *= ec->fragments;
+                }
+            }
+
+            return EC_STATE_REPORT;
+
+        case EC_STATE_REPORT:
+            cbk = fop->answer;
+
+            GF_ASSERT(cbk != NULL);
+
+            if (fop->cbks.statfs != NULL) {
+                fop->cbks.statfs(fop->req_frame, fop, fop->xl, cbk->op_ret,
+                                 cbk->op_errno, &cbk->statvfs, cbk->xdata);
+            }
+
+            return EC_STATE_END;
+
+        /* Failure path: report fop->error instead of an answer. */
+        case -EC_STATE_INIT:
+        case -EC_STATE_DISPATCH:
+        case -EC_STATE_PREPARE_ANSWER:
+        case -EC_STATE_REPORT:
+            GF_ASSERT(fop->error != 0);
+
+            if (fop->cbks.statfs != NULL) {
+                fop->cbks.statfs(fop->req_frame, fop, fop->xl, -1, fop->error,
+                                 NULL, NULL);
+            }
+
+            return EC_STATE_END;
+
+        default:
+            gf_msg(fop->xl->name, GF_LOG_ERROR, EINVAL, EC_MSG_UNHANDLED_STATE,
+                   "Unhandled state %d for %s", state, ec_fop_name(fop->id));
+
+            return EC_STATE_END;
+    }
+}
+
+/*
+ * Entry point of the statfs operation.
+ *
+ * Allocates a fop with EC_FLAG_LOCK_SHARED, copies 'loc' into fop->loc[0]
+ * and stores a reference to 'xdata' in the fop, then hands it to
+ * ec_manager(). If no fop could be created, 'func' is called directly with
+ * op_ret -1 and ENOMEM.
+ */
+void
+ec_statfs(call_frame_t *frame, xlator_t *this, uintptr_t target,
+          uint32_t fop_flags, fop_statfs_cbk_t func, void *data, loc_t *loc,
+          dict_t *xdata)
+{
+    ec_cbk_t cbks = {.statfs = func};
+    ec_fop_data_t *fop = NULL;
+    int32_t err = ENOMEM;
+
+    gf_msg_trace("ec", 0, "EC(STATFS) %p", frame);
+
+    VALIDATE_OR_GOTO(this, out);
+    GF_VALIDATE_OR_GOTO(this->name, frame, out);
+    GF_VALIDATE_OR_GOTO(this->name, this->private, out);
+
+    fop = ec_fop_data_allocate(frame, this, GF_FOP_STATFS, EC_FLAG_LOCK_SHARED,
+                               target, fop_flags, ec_wind_statfs,
+                               ec_manager_statfs, cbks, data);
+    if (fop == NULL) {
+        goto out;
+    }
+
+    if ((loc != NULL) && (loc_copy(&fop->loc[0], loc) != 0)) {
+        gf_msg(this->name, GF_LOG_ERROR, ENOMEM, EC_MSG_LOC_COPY_FAIL,
+               "Failed to copy a location.");
+
+        goto out;
+    }
+
+    if (xdata != NULL) {
+        fop->xdata = dict_ref(xdata);
+    }
+    if ((xdata != NULL) && (fop->xdata == NULL)) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, EC_MSG_DICT_REF_FAIL,
+               "Failed to reference a "
+               "dictionary.");
+
+        goto out;
+    }
+
+    err = 0;
+
+out:
+    if (fop == NULL) {
+        func(frame, NULL, this, -1, err, NULL, NULL);
+    } else {
+        ec_manager(fop, err);
+    }
+}
+
+/* FOP: xattrop */
+
+/* Returns 1 when ec_dict_compare() accepts both answer dictionaries, else 0. */
+int32_t
+ec_combine_xattrop(ec_fop_data_t *fop, ec_cbk_data_t *dst, ec_cbk_data_t *src)
+{
+    if (ec_dict_compare(dst->dict, src->dict)) {
+        return 1;
+    }
+
+    gf_msg(fop->xl->name, GF_LOG_DEBUG, 0, EC_MSG_DICT_MISMATCH,
+           "Mismatching dictionary in "
+           "answers of 'GF_FOP_XATTROP'");
+    return 0;
+}
+/* Answer callback shared by xattrop and fxattrop; 'cookie' carries the index. */
+int32_t
+ec_xattrop_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+               int32_t op_ret, int32_t op_errno, dict_t *xattr, dict_t *xdata)
+{
+    ec_fop_data_t *fop = NULL;
+    ec_lock_link_t *link = NULL;
+    ec_cbk_data_t *cbk = NULL;
+    uint64_t dirty[2] = {0};
+    data_t *data;
+    uint64_t *version;
+    int32_t idx = (int32_t)(uintptr_t)cookie;
+
+    VALIDATE_OR_GOTO(this, out);
+    GF_VALIDATE_OR_GOTO(this->name, frame, out);
+    GF_VALIDATE_OR_GOTO(this->name, frame->local, out);
+    GF_VALIDATE_OR_GOTO(this->name, this->private, out);
+
+    fop = frame->local;
+
+    ec_trace("CBK", fop, "idx=%d, frame=%p, op_ret=%d, op_errno=%d", idx, frame,
+             op_ret, op_errno);
+
+    cbk = ec_cbk_data_allocate(frame, this, fop, fop->id, idx, op_ret,
+                               op_errno);
+    if (!cbk)
+        goto out;
+    /* Non-negative op_ret: keep a reference to 'xattr' and examine two keys. */
+    if (op_ret >= 0) {
+        cbk->dict = dict_ref(xattr);
+        /* EC_XATTR_VERSION is only read if it holds at least one uint64_t. */
+        data = dict_get(cbk->dict, EC_XATTR_VERSION);
+        if ((data != NULL) && (data->len >= sizeof(uint64_t))) {
+            version = (uint64_t *)data->data;
+            /* Self-heal bit set in version[0]: flag 'idx' in fop->healing. */
+            if (((ntoh64(version[0]) >> EC_SELFHEAL_BIT) & 1) != 0) {
+                LOCK(&fop->lock);
+
+                fop->healing |= 1ULL << idx;
+
+                UNLOCK(&fop->lock);
+            }
+        }
+        /* NOTE(review): presumably moves EC_XATTR_DIRTY values into 'dirty'. */
+        ec_dict_del_array(xattr, EC_XATTR_DIRTY, dirty, EC_VERSION_SIZE);
+        link = fop->data;
+        if (link) {
+            /* Remember whether dirty[0] / dirty[1] were already set. */
+            link->dirty[0] |= (dirty[0] != 0);
+            link->dirty[1] |= (dirty[1] != 0);
+        }
+    }
+
+    if (xdata)
+        cbk->xdata = dict_ref(xdata);
+
+    ec_combine(cbk, ec_combine_xattrop);
+    /* Every path ends here; 'fop' is still NULL if validation failed. */
+out:
+    if (fop)
+        ec_complete(fop);
+
+    return 0;
+}
+/* Winds xattrop to child ec->xl_list[idx]; 'idx' is handed over as the cookie. */
+void
+ec_wind_xattrop(ec_t *ec, ec_fop_data_t *fop, int32_t idx)
+{
+    ec_trace("WIND", fop, "idx=%d", idx);
+
+    STACK_WIND_COOKIE(fop->frame, ec_xattrop_cbk, (void *)(uintptr_t)idx,
+                      ec->xl_list[idx], ec->xl_list[idx]->fops->xattrop,
+                      &fop->loc[0], fop->xattrop_flags, fop->dict, fop->xdata);
+}
+/* Manager for XATTROP and FXATTROP; every case returns an EC_STATE_* value. */
+int32_t
+ec_manager_xattrop(ec_fop_data_t *fop, int32_t state)
+{
+    ec_cbk_data_t *cbk;
+    /* INIT/LOCK: lock through loc[0] when no fd is set, else through the fd. */
+    switch (state) {
+        case EC_STATE_INIT:
+        case EC_STATE_LOCK:
+            if (fop->fd == NULL) {
+                ec_lock_prepare_inode(fop, &fop->loc[0], EC_UPDATE_META, 0,
+                                      EC_RANGE_FULL);
+            } else {
+                ec_lock_prepare_fd(fop, fop->fd, EC_UPDATE_META, 0,
+                                   EC_RANGE_FULL);
+            }
+            ec_lock(fop);
+
+            return EC_STATE_DISPATCH;
+
+        case EC_STATE_DISPATCH:
+            ec_dispatch_all(fop);
+
+            return EC_STATE_PREPARE_ANSWER;
+        /* ec_dict_combine()'s result is negated before ec_cbk_set_error(). */
+        case EC_STATE_PREPARE_ANSWER:
+            cbk = ec_fop_prepare_answer(fop, _gf_false);
+            if (cbk != NULL) {
+                int32_t err;
+
+                err = ec_dict_combine(cbk, EC_COMBINE_DICT);
+                ec_cbk_set_error(cbk, -err, _gf_false);
+            }
+
+            return EC_STATE_REPORT;
+        /* fop->id selects between the xattrop and the fxattrop callback. */
+        case EC_STATE_REPORT:
+            cbk = fop->answer;
+
+            GF_ASSERT(cbk != NULL);
+
+            if (fop->id == GF_FOP_XATTROP) {
+                if (fop->cbks.xattrop != NULL) {
+                    fop->cbks.xattrop(fop->req_frame, fop, fop->xl, cbk->op_ret,
+                                      cbk->op_errno, cbk->dict, cbk->xdata);
+                }
+            } else {
+                if (fop->cbks.fxattrop != NULL) {
+                    fop->cbks.fxattrop(fop->req_frame, fop, fop->xl,
+                                       cbk->op_ret, cbk->op_errno, cbk->dict,
+                                       cbk->xdata);
+                }
+            }
+
+            return EC_STATE_LOCK_REUSE;
+        /* Negative states: fop->error must be set; the callback gets -1 with it. */
+        case -EC_STATE_INIT:
+        case -EC_STATE_LOCK:
+        case -EC_STATE_DISPATCH:
+        case -EC_STATE_PREPARE_ANSWER:
+        case -EC_STATE_REPORT:
+            GF_ASSERT(fop->error != 0);
+
+            if (fop->id == GF_FOP_XATTROP) {
+                if (fop->cbks.xattrop != NULL) {
+                    fop->cbks.xattrop(fop->req_frame, fop, fop->xl, -1,
+                                      fop->error, NULL, NULL);
+                }
+            } else {
+                if (fop->cbks.fxattrop != NULL) {
+                    fop->cbks.fxattrop(fop->req_frame, fop, fop->xl, -1,
+                                       fop->error, NULL, NULL);
+                }
+            }
+
+            return EC_STATE_LOCK_REUSE;
+        /* Success and error reporting both continue with lock reuse, then unlock. */
+        case -EC_STATE_LOCK_REUSE:
+        case EC_STATE_LOCK_REUSE:
+            ec_lock_reuse(fop);
+
+            return EC_STATE_UNLOCK;
+
+        case -EC_STATE_UNLOCK:
+        case EC_STATE_UNLOCK:
+            ec_unlock(fop);
+
+            return EC_STATE_END;
+
+        default:
+            gf_msg(fop->xl->name, GF_LOG_ERROR, EINVAL, EC_MSG_UNHANDLED_STATE,
+                   "Unhandled state %d for %s", state, ec_fop_name(fop->id));
+
+            return EC_STATE_END;
+    }
+}
+
+/* XATTROP entry point: builds the fop (copying 'loc', referencing 'xattr' and
+ * 'xdata') and starts ec_manager(); 'error' stays ENOMEM until all of that
+ * has succeeded. If no fop exists, 'func' is invoked directly with -1. */
+void
+ec_xattrop(call_frame_t *frame, xlator_t *this, uintptr_t target,
+           uint32_t fop_flags, fop_xattrop_cbk_t func, void *data, loc_t *loc,
+           gf_xattrop_flags_t optype, dict_t *xattr, dict_t *xdata)
+{
+    ec_fop_data_t *fop = NULL;
+    int32_t error = ENOMEM;
+    ec_cbk_t callback = {.xattrop = func};
+
+    gf_msg_trace("ec", 0, "EC(XATTROP) %p", frame);
+
+    VALIDATE_OR_GOTO(this, out);
+    GF_VALIDATE_OR_GOTO(this->name, frame, out);
+    GF_VALIDATE_OR_GOTO(this->name, this->private, out);
+
+    fop = ec_fop_data_allocate(frame, this, GF_FOP_XATTROP, 0, target,
+                               fop_flags, ec_wind_xattrop, ec_manager_xattrop,
+                               callback, data);
+    if (!fop) {
+        goto out;
+    }
+
+    fop->xattrop_flags = optype;
+
+    if ((loc != NULL) && (loc_copy(&fop->loc[0], loc) != 0)) {
+        gf_msg(this->name, GF_LOG_ERROR, ENOMEM, EC_MSG_LOC_COPY_FAIL,
+               "Failed to copy a location.");
+        goto out;
+    }
+
+    if (xattr != NULL) {
+        fop->dict = dict_ref(xattr);
+        if (!fop->dict) {
+            gf_msg(this->name, GF_LOG_ERROR, 0, EC_MSG_DICT_REF_FAIL,
+                   "Failed to reference a "
+                   "dictionary.");
+            goto out;
+        }
+    }
+
+    if (xdata != NULL) {
+        fop->xdata = dict_ref(xdata);
+        if (!fop->xdata) {
+            gf_msg(this->name, GF_LOG_ERROR, 0, EC_MSG_DICT_REF_FAIL,
+                   "Failed to reference a "
+                   "dictionary.");
+            goto out;
+        }
+    }
+
+    error = 0;
+
+out:
+    if (fop == NULL) {
+        func(frame, NULL, this, -1, error, NULL, NULL);
+        return;
+    }
+    ec_manager(fop, error);
+}
+/* Winds fxattrop to child ec->xl_list[idx] using fop->fd; 'idx' is the cookie. */
+void
+ec_wind_fxattrop(ec_t *ec, ec_fop_data_t *fop, int32_t idx)
+{
+    ec_trace("WIND", fop, "idx=%d", idx);
+
+    STACK_WIND_COOKIE(fop->frame, ec_xattrop_cbk, (void *)(uintptr_t)idx,
+                      ec->xl_list[idx], ec->xl_list[idx]->fops->fxattrop,
+                      fop->fd, fop->xattrop_flags, fop->dict, fop->xdata);
+}
+
+/* FXATTROP entry point: builds a fop with use_fd set and references to 'fd',
+ * 'xattr' and 'xdata'. If no fop exists, 'func' is invoked directly with -1. */
+void
+ec_fxattrop(call_frame_t *frame, xlator_t *this, uintptr_t target,
+            uint32_t fop_flags, fop_fxattrop_cbk_t func, void *data, fd_t *fd,
+            gf_xattrop_flags_t optype, dict_t *xattr, dict_t *xdata)
+{
+    ec_fop_data_t *fop = NULL;
+    int32_t error = ENOMEM;
+    ec_cbk_t callback = {.fxattrop = func};
+
+    gf_msg_trace("ec", 0, "EC(FXATTROP) %p", frame);
+
+    VALIDATE_OR_GOTO(this, out);
+    GF_VALIDATE_OR_GOTO(this->name, frame, out);
+    GF_VALIDATE_OR_GOTO(this->name, this->private, out);
+
+    fop = ec_fop_data_allocate(frame, this, GF_FOP_FXATTROP, 0, target,
+                               fop_flags, ec_wind_fxattrop, ec_manager_xattrop,
+                               callback, data);
+    if (!fop) {
+        goto out;
+    }
+
+    fop->use_fd = 1;
+    fop->xattrop_flags = optype;
+
+    if (fd != NULL) {
+        fop->fd = fd_ref(fd);
+        if (!fop->fd) {
+            gf_msg(this->name, GF_LOG_ERROR, 0, EC_MSG_FILE_DESC_REF_FAIL,
+                   "Failed to reference a "
+                   "file descriptor.");
+            goto out;
+        }
+    }
+
+    if (xattr != NULL) {
+        fop->dict = dict_ref(xattr);
+        if (!fop->dict) {
+            gf_msg(this->name, GF_LOG_ERROR, 0, EC_MSG_DICT_REF_FAIL,
+                   "Failed to reference a "
+                   "dictionary.");
+            goto out;
+        }
+    }
+
+    if (xdata != NULL) {
+        fop->xdata = dict_ref(xdata);
+        if (!fop->xdata) {
+            gf_msg(this->name, GF_LOG_ERROR, 0, EC_MSG_DICT_REF_FAIL,
+                   "Failed to reference a "
+                   "dictionary.");
+            goto out;
+        }
+    }
+
+    error = 0;
+
+out:
+    if (fop == NULL) {
+        func(frame, NULL, this, -1, error, NULL, NULL);
+        return;
+    }
+    ec_manager(fop, error);
+}
+
+/* FOP: IPC */
+
+/* IPC answer callback: wraps the reply for index 'idx' in an ec_cbk_data_t,
+ * hands it to ec_combine() and calls ec_complete() if the fop is known. */
+int32_t
+ec_ipc_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret,
+           int32_t op_errno, dict_t *xdata)
+{
+    int32_t idx = (int32_t)(uintptr_t)cookie;
+    ec_cbk_data_t *cbk = NULL;
+    ec_fop_data_t *fop = NULL;
+
+    VALIDATE_OR_GOTO(this, out);
+    GF_VALIDATE_OR_GOTO(this->name, frame, out);
+    GF_VALIDATE_OR_GOTO(this->name, frame->local, out);
+    GF_VALIDATE_OR_GOTO(this->name, this->private, out);
+
+    fop = frame->local;
+    ec_trace("CBK", fop, "idx=%d, frame=%p, op_ret=%d, op_errno=%d", idx, frame,
+             op_ret, op_errno);
+
+    cbk = ec_cbk_data_allocate(frame, this, fop, GF_FOP_IPC, idx, op_ret,
+                               op_errno);
+    if (cbk == NULL) {
+        goto out;
+    }
+
+    if (xdata != NULL) {
+        cbk->xdata = dict_ref(xdata);
+    }
+    ec_combine(cbk, NULL);
+
+out:
+    if (fop != NULL) {
+        ec_complete(fop);
+    }
+    return 0;
+}
+/* Winds ipc to child ec->xl_list[idx] with fop->int32 and fop->xdata. */
+void
+ec_wind_ipc(ec_t *ec, ec_fop_data_t *fop, int32_t idx)
+{
+    ec_trace("WIND", fop, "idx=%d", idx);
+
+    STACK_WIND_COOKIE(fop->frame, ec_ipc_cbk, (void *)(uintptr_t)idx,
+                      ec->xl_list[idx], ec->xl_list[idx]->fops->ipc, fop->int32,
+                      fop->xdata);
+}
+/* Manager for IPC: no lock states; every case returns an EC_STATE_* value. */
+int32_t
+ec_manager_ipc(ec_fop_data_t *fop, int32_t state)
+{
+    ec_cbk_data_t *cbk;
+
+    switch (state) {
+        case EC_STATE_INIT:
+        case EC_STATE_DISPATCH:
+            ec_dispatch_all(fop);
+
+            return EC_STATE_PREPARE_ANSWER;
+
+        case EC_STATE_PREPARE_ANSWER:
+            ec_fop_prepare_answer(fop, _gf_true);
+
+            return EC_STATE_REPORT;
+        /* The selected answer goes to the ipc callback, if one was registered. */
+        case EC_STATE_REPORT:
+            cbk = fop->answer;
+
+            GF_ASSERT(cbk != NULL);
+            if (fop->cbks.ipc != NULL) {
+                fop->cbks.ipc(fop->req_frame, fop, fop->xl, cbk->op_ret,
+                              cbk->op_errno, cbk->xdata);
+            }
+
+            return EC_STATE_END;
+        /* Negative states: fop->error must be set; the callback gets -1 with it. */
+        case -EC_STATE_INIT:
+        case -EC_STATE_DISPATCH:
+        case -EC_STATE_PREPARE_ANSWER:
+        case -EC_STATE_REPORT:
+            GF_ASSERT(fop->error != 0);
+
+            if (fop->cbks.ipc != NULL) {
+                fop->cbks.ipc(fop->req_frame, fop, fop->xl, -1, fop->error,
+                              NULL);
+            }
+
+            return EC_STATE_END;
+
+        default:
+            gf_msg(fop->xl->name, GF_LOG_ERROR, EINVAL, EC_MSG_UNHANDLED_STATE,
+                   "Unhandled state %d for %s", state, ec_fop_name(fop->id));
+
+            return EC_STATE_END;
+    }
+}
+
+/* IPC entry point: stores 'op' and a reference to 'xdata' in a new fop and
+ * starts ec_manager(). Without a fop, 'func' is called with -1 and ENOMEM. */
+void
+ec_ipc(call_frame_t *frame, xlator_t *this, uintptr_t target,
+       uint32_t fop_flags, fop_ipc_cbk_t func, void *data, int32_t op,
+       dict_t *xdata)
+{
+    ec_fop_data_t *fop = NULL;
+    int32_t error = ENOMEM;
+    ec_cbk_t callback = {.ipc = func};
+
+    gf_msg_trace("ec", 0, "EC(IPC) %p", frame);
+    VALIDATE_OR_GOTO(this, out);
+    GF_VALIDATE_OR_GOTO(this->name, frame, out);
+    GF_VALIDATE_OR_GOTO(this->name, this->private, out);
+
+    fop = ec_fop_data_allocate(frame, this, GF_FOP_IPC, 0, target, fop_flags,
+                               ec_wind_ipc, ec_manager_ipc, callback, data);
+    if (!fop) {
+        goto out;
+    }
+    fop->int32 = op;
+    if (xdata != NULL) {
+        fop->xdata = dict_ref(xdata);
+    }
+    error = 0;
+
+out:
+    if (fop == NULL) {
+        func(frame, NULL, this, -1, error, NULL);
+        return;
+    }
+    ec_manager(fop, error);
+}
diff --git a/xlators/cluster/ec/src/ec-gf8.c b/xlators/cluster/ec/src/ec-gf8.c
new file mode 100644
index 00000000000..039adae5929
--- /dev/null
+++ b/xlators/cluster/ec/src/ec-gf8.c
@@ -0,0 +1,5882 @@
+/*
+  Copyright (c) 2015 DataLab, s.l. <http://www.datalab.es>
+  This file is part of GlusterFS.
+
+  This file is licensed to you under your choice of the GNU Lesser
+  General Public License, version 3 or any later version (LGPLv3 or
+  later), or the GNU General Public License, version 2 (GPLv2), in all
+  cases as published by the Free Software Foundation.
+*/
+
+#include "ec-gf8.h"
+
+static ec_gf_op_t ec_gf8_mul_00_ops[] = {{EC_GF_OP_END, 0, 0, 0}};
+/* Factor 0x00 (by its name): leading 0, single-zero list, no ops besides END. */
+static ec_gf_mul_t ec_gf8_mul_00 = {0,
+                                    {
+                                        0,
+                                    },
+                                    ec_gf8_mul_00_ops};
+
+static ec_gf_op_t ec_gf8_mul_01_ops[] = {{EC_GF_OP_END, 0, 0, 0}};
+/* Factor 0x01 (by its name): no ops besides END; the 8-entry list is 0..7. */
+static ec_gf_mul_t ec_gf8_mul_01 = {8,
+                                    {
+                                        0,
+                                        1,
+                                        2,
+                                        3,
+                                        4,
+                                        5,
+                                        6,
+                                        7,
+                                    },
+                                    ec_gf8_mul_01_ops};
+
+static ec_gf_op_t ec_gf8_mul_02_ops[] = {{EC_GF_OP_XOR2, 1, 7, 0},
+                                         {EC_GF_OP_XOR2, 2, 7, 0},
+                                         {EC_GF_OP_XOR2, 3, 7, 0},
+                                         {EC_GF_OP_END, 0, 0, 0}};
+/* Factor 0x02 (by its name): 8 list entries; ops touch indices 0-7 only. */
+static ec_gf_mul_t ec_gf8_mul_02 = {8,
+                                    {
+                                        7,
+                                        0,
+                                        1,
+                                        2,
+                                        3,
+                                        4,
+                                        5,
+                                        6,
+                                    },
+                                    ec_gf8_mul_02_ops};
+
+static ec_gf_op_t ec_gf8_mul_03_ops[] = {
+    {EC_GF_OP_XOR2, 3, 7, 0}, {EC_GF_OP_COPY, 8, 3, 0},
+    {EC_GF_OP_XOR2, 3, 2, 0}, {EC_GF_OP_XOR2, 2, 1, 0},
+    {EC_GF_OP_XOR2, 1, 0, 0}, {EC_GF_OP_XOR2, 2, 7, 0},
+    {EC_GF_OP_XOR2, 0, 7, 0}, {EC_GF_OP_XOR2, 7, 6, 0},
+    {EC_GF_OP_XOR2, 6, 5, 0}, {EC_GF_OP_XOR2, 5, 4, 0},
+    {EC_GF_OP_XOR2, 4, 8, 0}, {EC_GF_OP_END, 0, 0, 0}};
+/* Factor 0x03 (by its name): 9 list entries; ops also use index 8. */
+static ec_gf_mul_t ec_gf8_mul_03 = {9,
+                                    {
+                                        0,
+                                        1,
+                                        2,
+                                        3,
+                                        4,
+                                        5,
+                                        6,
+                                        7,
+                                        8,
+                                    },
+                                    ec_gf8_mul_03_ops};
+
+static ec_gf_op_t ec_gf8_mul_04_ops[] = {
+    {EC_GF_OP_XOR3, 8, 6, 7}, {EC_GF_OP_XOR2, 2, 8, 0},
+    {EC_GF_OP_XOR2, 3, 7, 0}, {EC_GF_OP_XOR2, 1, 8, 0},
+    {EC_GF_OP_XOR2, 0, 6, 0}, {EC_GF_OP_END, 0, 0, 0}};
+/* Factor 0x04 (by its name): 9 list entries; ops also use index 8. */
+static ec_gf_mul_t ec_gf8_mul_04 = {9,
+                                    {
+                                        6,
+                                        7,
+                                        0,
+                                        1,
+                                        2,
+                                        3,
+                                        4,
+                                        5,
+                                        8,
+                                    },
+                                    ec_gf8_mul_04_ops};
+
+static ec_gf_op_t ec_gf8_mul_05_ops[] = {
+    {EC_GF_OP_XOR2, 4, 6, 0}, {EC_GF_OP_XOR2, 1, 7, 0},
+    {EC_GF_OP_XOR2, 5, 7, 0}, {EC_GF_OP_XOR2, 0, 6, 0},
+    {EC_GF_OP_XOR2, 7, 4, 0}, {EC_GF_OP_XOR2, 6, 3, 0},
+    {EC_GF_OP_XOR2, 7, 2, 0}, {EC_GF_OP_XOR2, 6, 1, 0},
+    {EC_GF_OP_XOR2, 3, 5, 0}, {EC_GF_OP_XOR2, 2, 0, 0},
+    {EC_GF_OP_END, 0, 0, 0}};
+/* Factor 0x05 (by its name): 8 list entries; ops touch indices 0-7 only. */
+static ec_gf_mul_t ec_gf8_mul_05 = {8,
+                                    {
+                                        0,
+                                        1,
+                                        2,
+                                        6,
+                                        7,
+                                        3,
+                                        4,
+                                        5,
+                                    },
+                                    ec_gf8_mul_05_ops};
+
+static ec_gf_op_t ec_gf8_mul_06_ops[] = {
+    {EC_GF_OP_XOR2, 2, 6, 0}, {EC_GF_OP_COPY, 8, 2, 0},
+    {EC_GF_OP_XOR2, 8, 3, 0}, {EC_GF_OP_XOR2, 2, 1, 0},
+    {EC_GF_OP_XOR2, 3, 4, 0}, {EC_GF_OP_XOR2, 1, 0, 0},
+    {EC_GF_OP_XOR2, 3, 7, 0}, {EC_GF_OP_XOR2, 0, 7, 0},
+    {EC_GF_OP_XOR2, 7, 6, 0}, {EC_GF_OP_XOR2, 4, 5, 0},
+    {EC_GF_OP_XOR2, 1, 7, 0}, {EC_GF_OP_XOR2, 5, 6, 0},
+    {EC_GF_OP_END, 0, 0, 0}};
+/* Factor 0x06 (by its name): 9 list entries; ops also use index 8. */
+static ec_gf_mul_t ec_gf8_mul_06 = {9,
+                                    {
+                                        7,
+                                        0,
+                                        1,
+                                        2,
+                                        8,
+                                        3,
+                                        4,
+                                        5,
+                                        6,
+                                    },
+                                    ec_gf8_mul_06_ops};
+
+static ec_gf_op_t ec_gf8_mul_07_ops[] = {
+    {EC_GF_OP_XOR2, 5, 6, 0}, {EC_GF_OP_XOR2, 0, 7, 0},
+    {EC_GF_OP_XOR2, 3, 6, 0}, {EC_GF_OP_XOR2, 7, 5, 0},
+    {EC_GF_OP_XOR2, 6, 0, 0}, {EC_GF_OP_XOR2, 0, 1, 0},
+    {EC_GF_OP_XOR2, 5, 4, 0}, {EC_GF_OP_XOR2, 4, 3, 0},
+    {EC_GF_OP_XOR2, 1, 2, 0}, {EC_GF_OP_XOR2, 2, 4, 0},
+    {EC_GF_OP_XOR2, 3, 1, 0}, {EC_GF_OP_XOR2, 4, 7, 0},
+    {EC_GF_OP_XOR2, 1, 6, 0}, {EC_GF_OP_END, 0, 0, 0}};
+/* Factor 0x07 (by its name): 8 list entries; ops touch indices 0-7 only. */
+static ec_gf_mul_t ec_gf8_mul_07 = {8,
+                                    {
+                                        6,
+                                        0,
+                                        1,
+                                        3,
+                                        2,
+                                        4,
+                                        5,
+                                        7,
+                                    },
+                                    ec_gf8_mul_07_ops};
+
+static ec_gf_op_t ec_gf8_mul_08_ops[] = {
+    {EC_GF_OP_XOR2, 0, 5, 0}, {EC_GF_OP_XOR2, 1, 5, 0},
+    {EC_GF_OP_XOR2, 0, 6, 0}, {EC_GF_OP_XOR3, 8, 6, 7},
+    {EC_GF_OP_XOR2, 1, 8, 0}, {EC_GF_OP_XOR2, 3, 7, 0},
+    {EC_GF_OP_XOR2, 7, 5, 0}, {EC_GF_OP_XOR2, 2, 8, 0},
+    {EC_GF_OP_END, 0, 0, 0}};
+/* Factor 0x08 (by its name): 9 list entries; ops also use index 8. */
+static ec_gf_mul_t ec_gf8_mul_08 = {9,
+                                    {
+                                        5,
+                                        6,
+                                        7,
+                                        0,
+                                        1,
+                                        2,
+                                        3,
+                                        4,
+                                        8,
+                                    },
+                                    ec_gf8_mul_08_ops};
+
+static ec_gf_op_t ec_gf8_mul_09_ops[] = {
+    {EC_GF_OP_XOR2, 2, 4, 0}, {EC_GF_OP_XOR2, 4, 7, 0},
+    {EC_GF_OP_XOR2, 0, 5, 0}, {EC_GF_OP_XOR2, 5, 4, 0},
+    {EC_GF_OP_XOR2, 3, 6, 0}, {EC_GF_OP_XOR2, 2, 5, 0},
+    {EC_GF_OP_XOR2, 1, 6, 0}, {EC_GF_OP_XOR2, 7, 3, 0},
+    {EC_GF_OP_XOR2, 6, 2, 0}, {EC_GF_OP_XOR2, 5, 1, 0},
+    {EC_GF_OP_XOR2, 3, 0, 0}, {EC_GF_OP_END, 0, 0, 0}};
+/* Factor 0x09 (by its name): 8 list entries; ops touch indices 0-7 only. */
+static ec_gf_mul_t ec_gf8_mul_09 = {8,
+                                    {
+                                        0,
+                                        1,
+                                        2,
+                                        3,
+                                        5,
+                                        6,
+                                        7,
+                                        4,
+                                    },
+                                    ec_gf8_mul_09_ops};
+
+static ec_gf_op_t ec_gf8_mul_0A_ops[] = {
+    {EC_GF_OP_XOR2, 1, 5, 0}, {EC_GF_OP_XOR2, 4, 6, 0},
+    {EC_GF_OP_XOR2, 5, 7, 0}, {EC_GF_OP_XOR2, 7, 4, 0},
+    {EC_GF_OP_XOR2, 0, 6, 0}, {EC_GF_OP_XOR2, 7, 2, 0},
+    {EC_GF_OP_XOR2, 6, 1, 0}, {EC_GF_OP_XOR2, 2, 0, 0},
+    {EC_GF_OP_XOR2, 6, 3, 0}, {EC_GF_OP_XOR2, 2, 5, 0},
+    {EC_GF_OP_XOR2, 3, 5, 0}, {EC_GF_OP_END, 0, 0, 0}};
+/* Factor 0x0A (by its name): 8 list entries; ops touch indices 0-7 only. */
+static ec_gf_mul_t ec_gf8_mul_0A = {8,
+                                    {
+                                        5,
+                                        0,
+                                        1,
+                                        2,
+                                        6,
+                                        7,
+                                        3,
+                                        4,
+                                    },
+                                    ec_gf8_mul_0A_ops};
+
+static ec_gf_op_t ec_gf8_mul_0B_ops[] = {
+    {EC_GF_OP_XOR2, 4, 7, 0}, {EC_GF_OP_XOR2, 7, 0, 0},
+    {EC_GF_OP_XOR2, 7, 5, 0}, {EC_GF_OP_XOR2, 3, 7, 0},
+    {EC_GF_OP_COPY, 9, 3, 0}, {EC_GF_OP_XOR2, 5, 2, 0},
+    {EC_GF_OP_XOR2, 3, 6, 0}, {EC_GF_OP_COPY, 8, 5, 0},
+    {EC_GF_OP_XOR2, 0, 3, 0}, {EC_GF_OP_XOR2, 5, 1, 0},
+    {EC_GF_OP_XOR2, 6, 4, 0}, {EC_GF_OP_XOR2, 1, 0, 0},
+    {EC_GF_OP_XOR2, 2, 3, 0}, {EC_GF_OP_XOR2, 4, 1, 0},
+    {EC_GF_OP_XOR3, 3, 8, 6}, {EC_GF_OP_XOR2, 1, 9, 0},
+    {EC_GF_OP_END, 0, 0, 0}};
+/* Factor 0x0B (by its name): 10 list entries; ops also use indices 8 and 9. */
+static ec_gf_mul_t ec_gf8_mul_0B = {10,
+                                    {
+                                        7,
+                                        1,
+                                        5,
+                                        2,
+                                        4,
+                                        3,
+                                        0,
+                                        6,
+                                        8,
+                                        9,
+                                    },
+                                    ec_gf8_mul_0B_ops};
+
+static ec_gf_op_t ec_gf8_mul_0C_ops[] = {
+    {EC_GF_OP_XOR2, 1, 5, 0}, {EC_GF_OP_COPY, 8, 1, 0},
+    {EC_GF_OP_XOR2, 8, 2, 0}, {EC_GF_OP_XOR2, 2, 3, 0},
+    {EC_GF_OP_XOR2, 3, 4, 0}, {EC_GF_OP_XOR2, 0, 7, 0},
+    {EC_GF_OP_XOR2, 4, 5, 0}, {EC_GF_OP_XOR2, 3, 7, 0},
+    {EC_GF_OP_XOR2, 5, 6, 0}, {EC_GF_OP_XOR2, 1, 0, 0},
+    {EC_GF_OP_XOR2, 2, 6, 0}, {EC_GF_OP_XOR2, 7, 6, 0},
+    {EC_GF_OP_XOR2, 0, 5, 0}, {EC_GF_OP_END, 0, 0, 0}};
+/* Factor 0x0C (by its name): 9 list entries; ops also use index 8. */
+static ec_gf_mul_t ec_gf8_mul_0C = {9,
+                                    {
+                                        5,
+                                        7,
+                                        0,
+                                        1,
+                                        8,
+                                        2,
+                                        3,
+                                        4,
+                                        6,
+                                    },
+                                    ec_gf8_mul_0C_ops};
+
+static ec_gf_op_t ec_gf8_mul_0D_ops[] = {
+    {EC_GF_OP_XOR2, 5, 7, 0}, {EC_GF_OP_XOR2, 4, 5, 0},
+    {EC_GF_OP_XOR2, 5, 0, 0}, {EC_GF_OP_XOR2, 5, 1, 0},
+    {EC_GF_OP_XOR2, 1, 7, 0}, {EC_GF_OP_XOR2, 0, 3, 0},
+    {EC_GF_OP_XOR2, 6, 1, 0}, {EC_GF_OP_XOR2, 3, 5, 0},
+    {EC_GF_OP_XOR3, 8, 2, 4}, {EC_GF_OP_XOR2, 5, 6, 0},
+    {EC_GF_OP_XOR2, 2, 5, 0}, {EC_GF_OP_XOR2, 1, 8, 0},
+    {EC_GF_OP_XOR2, 0, 2, 0}, {EC_GF_OP_XOR2, 7, 2, 0},
+    {EC_GF_OP_XOR3, 2, 8, 0}, {EC_GF_OP_END, 0, 0, 0}};
+/* Factor 0x0D (by its name): 9 list entries; ops also use index 8. */
+static ec_gf_mul_t ec_gf8_mul_0D = {9,
+                                    {
+                                        5,
+                                        6,
+                                        7,
+                                        3,
+                                        1,
+                                        0,
+                                        2,
+                                        4,
+                                        8,
+                                    },
+                                    ec_gf8_mul_0D_ops};
+
+static ec_gf_op_t ec_gf8_mul_0E_ops[] = {
+    {EC_GF_OP_XOR2, 0, 5, 0}, {EC_GF_OP_XOR2, 5, 6, 0},
+    {EC_GF_OP_XOR2, 1, 0, 0}, {EC_GF_OP_XOR2, 7, 5, 0},
+    {EC_GF_OP_XOR2, 3, 6, 0}, {EC_GF_OP_XOR2, 0, 7, 0},
+    {EC_GF_OP_XOR2, 5, 4, 0}, {EC_GF_OP_XOR2, 6, 1, 0},
+    {EC_GF_OP_XOR2, 4, 3, 0}, {EC_GF_OP_XOR2, 1, 2, 0},
+    {EC_GF_OP_XOR2, 3, 0, 0}, {EC_GF_OP_XOR2, 2, 4, 0},
+    {EC_GF_OP_XOR2, 3, 1, 0}, {EC_GF_OP_XOR2, 4, 7, 0},
+    {EC_GF_OP_END, 0, 0, 0}};
+/* Factor 0x0E (by its name): 8 list entries; ops touch indices 0-7 only. */
+static ec_gf_mul_t ec_gf8_mul_0E = {8,
+                                    {
+                                        7,
+                                        0,
+                                        6,
+                                        1,
+                                        3,
+                                        2,
+                                        4,
+                                        5,
+                                    },
+                                    ec_gf8_mul_0E_ops};
+
+static ec_gf_op_t ec_gf8_mul_0F_ops[] = {
+    {EC_GF_OP_XOR2, 2, 7, 0}, {EC_GF_OP_XOR2, 7, 1, 0},
+    {EC_GF_OP_XOR2, 7, 6, 0}, {EC_GF_OP_XOR2, 4, 0, 0},
+    {EC_GF_OP_XOR2, 0, 7, 0}, {EC_GF_OP_XOR2, 5, 0, 0},
+    {EC_GF_OP_XOR2, 6, 3, 0}, {EC_GF_OP_XOR2, 1, 5, 0},
+    {EC_GF_OP_XOR2, 5, 2, 0}, {EC_GF_OP_XOR2, 4, 1, 0},
+    {EC_GF_OP_XOR2, 3, 4, 0}, {EC_GF_OP_XOR2, 6, 5, 0},
+    {EC_GF_OP_XOR2, 2, 3, 0}, {EC_GF_OP_XOR2, 7, 2, 0},
+    {EC_GF_OP_END, 0, 0, 0}};
+/* Factor 0x0F (by its name): 8 list entries; ops touch indices 0-7 only. */
+static ec_gf_mul_t ec_gf8_mul_0F = {8,
+                                    {
+                                        1,
+                                        0,
+                                        5,
+                                        6,
+                                        7,
+                                        2,
+                                        3,
+                                        4,
+                                    },
+                                    ec_gf8_mul_0F_ops};
+
+static ec_gf_op_t ec_gf8_mul_10_ops[] = {
+    {EC_GF_OP_XOR2, 3, 7, 0}, {EC_GF_OP_XOR2, 7, 6, 0},
+    {EC_GF_OP_XOR2, 2, 7, 0}, {EC_GF_OP_XOR2, 7, 5, 0},
+    {EC_GF_OP_XOR2, 6, 4, 0}, {EC_GF_OP_XOR2, 0, 5, 0},
+    {EC_GF_OP_XOR2, 1, 7, 0}, {EC_GF_OP_XOR2, 0, 6, 0},
+    {EC_GF_OP_XOR2, 7, 6, 0}, {EC_GF_OP_END, 0, 0, 0}};
+/* Factor 0x10 (by its name): 8 list entries; ops touch indices 0-7 only. */
+static ec_gf_mul_t ec_gf8_mul_10 = {8,
+                                    {
+                                        4,
+                                        5,
+                                        6,
+                                        7,
+                                        0,
+                                        1,
+                                        2,
+                                        3,
+                                    },
+                                    ec_gf8_mul_10_ops};
+
+static ec_gf_op_t ec_gf8_mul_11_ops[] = {
+    {EC_GF_OP_XOR2, 1, 5, 0}, {EC_GF_OP_XOR2, 5, 6, 0},
+    {EC_GF_OP_XOR2, 6, 4, 0}, {EC_GF_OP_XOR2, 4, 0, 0},
+    {EC_GF_OP_XOR2, 0, 5, 0}, {EC_GF_OP_XOR2, 5, 7, 0},
+    {EC_GF_OP_XOR2, 7, 2, 0}, {EC_GF_OP_XOR2, 2, 6, 0},
+    {EC_GF_OP_XOR2, 6, 3, 0}, {EC_GF_OP_XOR2, 6, 5, 0},
+    {EC_GF_OP_XOR2, 5, 1, 0}, {EC_GF_OP_END, 0, 0, 0}};
+/* Factor 0x11 (by its name): 8 list entries; ops touch indices 0-7 only. */
+static ec_gf_mul_t ec_gf8_mul_11 = {8,
+                                    {
+                                        4,
+                                        1,
+                                        2,
+                                        6,
+                                        0,
+                                        5,
+                                        7,
+                                        3,
+                                    },
+                                    ec_gf8_mul_11_ops};
+
+static ec_gf_op_t ec_gf8_mul_12_ops[] = {
+    {EC_GF_OP_XOR2, 7, 4, 0}, {EC_GF_OP_XOR2, 6, 7, 0},
+    {EC_GF_OP_XOR2, 2, 4, 0}, {EC_GF_OP_XOR2, 3, 6, 0},
+    {EC_GF_OP_XOR2, 4, 3, 0}, {EC_GF_OP_XOR2, 1, 6, 0},
+    {EC_GF_OP_XOR2, 2, 5, 0}, {EC_GF_OP_XOR2, 0, 5, 0},
+    {EC_GF_OP_XOR2, 6, 2, 0}, {EC_GF_OP_XOR2, 5, 1, 0},
+    {EC_GF_OP_XOR2, 3, 0, 0}, {EC_GF_OP_END, 0, 0, 0}};
+/* Factor 0x12 (by its name): 8 list entries; ops touch indices 0-7 only. */
+static ec_gf_mul_t ec_gf8_mul_12 = {8,
+                                    {
+                                        7,
+                                        0,
+                                        1,
+                                        2,
+                                        3,
+                                        5,
+                                        6,
+                                        4,
+                                    },
+                                    ec_gf8_mul_12_ops};
+
+static ec_gf_op_t ec_gf8_mul_13_ops[] = {
+    {EC_GF_OP_XOR2, 4, 7, 0}, {EC_GF_OP_XOR2, 7, 5, 0},
+    {EC_GF_OP_XOR2, 3, 6, 0}, {EC_GF_OP_XOR2, 5, 1, 0},
+    {EC_GF_OP_XOR2, 6, 4, 0}, {EC_GF_OP_XOR2, 7, 2, 0},
+    {EC_GF_OP_XOR2, 4, 0, 0}, {EC_GF_OP_XOR2, 5, 0, 0},
+    {EC_GF_OP_XOR2, 1, 6, 0}, {EC_GF_OP_XOR3, 8, 3, 7},
+    {EC_GF_OP_XOR2, 0, 2, 0}, {EC_GF_OP_XOR2, 6, 8, 0},
+    {EC_GF_OP_XOR2, 2, 1, 0}, {EC_GF_OP_XOR2, 0, 8, 0},
+    {EC_GF_OP_END, 0, 0, 0}};
+/* Factor 0x13 (by its name): 9 list entries; ops also use index 8. */
+static ec_gf_mul_t ec_gf8_mul_13 = {9,
+                                    {
+                                        4,
+                                        5,
+                                        2,
+                                        6,
+                                        0,
+                                        1,
+                                        7,
+                                        3,
+                                        8,
+                                    },
+                                    ec_gf8_mul_13_ops};
+
+static ec_gf_op_t ec_gf8_mul_14_ops[] = {
+    {EC_GF_OP_XOR2, 6, 4, 0}, {EC_GF_OP_XOR2, 7, 5, 0},
+    {EC_GF_OP_XOR2, 5, 6, 0}, {EC_GF_OP_XOR2, 2, 7, 0},
+    {EC_GF_OP_XOR2, 0, 4, 0}, {EC_GF_OP_XOR2, 1, 5, 0},
+    {EC_GF_OP_XOR2, 5, 2, 0}, {EC_GF_OP_XOR2, 4, 3, 0},
+    {EC_GF_OP_XOR2, 2, 0, 0}, {EC_GF_OP_XOR2, 3, 7, 0},
+    {EC_GF_OP_XOR2, 4, 1, 0}, {EC_GF_OP_END, 0, 0, 0}};
+
+static ec_gf_mul_t ec_gf8_mul_14 = {8,
+                                    {
+                                        6,
+                                        7,
+                                        0,
+                                        1,
+                                        2,
+                                        4,
+                                        5,
+                                        3,
+                                    },
+                                    ec_gf8_mul_14_ops};
+
+static ec_gf_op_t ec_gf8_mul_15_ops[] = {
+    {EC_GF_OP_COPY, 8, 0, 0}, {EC_GF_OP_XOR2, 0, 4, 0},
+    {EC_GF_OP_XOR2, 2, 0, 0}, {EC_GF_OP_XOR2, 0, 6, 0},
+    {EC_GF_OP_XOR2, 6, 1, 0}, {EC_GF_OP_XOR2, 1, 5, 0},
+    {EC_GF_OP_XOR2, 4, 5, 0}, {EC_GF_OP_XOR2, 1, 7, 0},
+    {EC_GF_OP_XOR2, 6, 3, 0}, {EC_GF_OP_XOR2, 7, 2, 0},
+    {EC_GF_OP_XOR2, 3, 5, 0}, {EC_GF_OP_XOR3, 5, 8, 7},
+    {EC_GF_OP_XOR2, 7, 4, 0}, {EC_GF_OP_XOR2, 4, 6, 0},
+    {EC_GF_OP_END, 0, 0, 0}};
+
+static ec_gf_mul_t ec_gf8_mul_15 = {9,
+                                    {
+                                        0,
+                                        1,
+                                        2,
+                                        4,
+                                        7,
+                                        6,
+                                        5,
+                                        3,
+                                        8,
+                                    },
+                                    ec_gf8_mul_15_ops};
+
+static ec_gf_op_t ec_gf8_mul_16_ops[] = {
+    {EC_GF_OP_XOR2, 4, 7, 0}, {EC_GF_OP_XOR2, 3, 6, 0},
+    {EC_GF_OP_XOR2, 7, 5, 0}, {EC_GF_OP_XOR2, 6, 4, 0},
+    {EC_GF_OP_XOR2, 7, 0, 0}, {EC_GF_OP_XOR2, 2, 6, 0},
+    {EC_GF_OP_XOR2, 3, 7, 0}, {EC_GF_OP_XOR2, 4, 1, 0},
+    {EC_GF_OP_XOR2, 5, 2, 0}, {EC_GF_OP_XOR2, 4, 0, 0},
+    {EC_GF_OP_XOR2, 2, 3, 0}, {EC_GF_OP_XOR2, 0, 3, 0},
+    {EC_GF_OP_XOR2, 1, 5, 0}, {EC_GF_OP_XOR2, 3, 4, 0},
+    {EC_GF_OP_END, 0, 0, 0}};
+
+static ec_gf_mul_t ec_gf8_mul_16 = {8,
+                                    {
+                                        6,
+                                        7,
+                                        4,
+                                        1,
+                                        2,
+                                        3,
+                                        5,
+                                        0,
+                                    },
+                                    ec_gf8_mul_16_ops};
+
+static ec_gf_op_t ec_gf8_mul_17_ops[] = {
+    {EC_GF_OP_XOR2, 3, 5, 0}, {EC_GF_OP_XOR2, 6, 3, 0},
+    {EC_GF_OP_XOR2, 3, 0, 0}, {EC_GF_OP_XOR2, 7, 5, 0},
+    {EC_GF_OP_XOR2, 3, 2, 0}, {EC_GF_OP_XOR2, 2, 7, 0},
+    {EC_GF_OP_XOR2, 4, 2, 0}, {EC_GF_OP_XOR2, 0, 1, 0},
+    {EC_GF_OP_XOR2, 5, 4, 0}, {EC_GF_OP_XOR2, 7, 0, 0},
+    {EC_GF_OP_XOR2, 0, 5, 0}, {EC_GF_OP_XOR2, 5, 6, 0},
+    {EC_GF_OP_XOR2, 1, 5, 0}, {EC_GF_OP_XOR2, 5, 3, 0},
+    {EC_GF_OP_XOR2, 2, 1, 0}, {EC_GF_OP_END, 0, 0, 0}};
+
+static ec_gf_mul_t ec_gf8_mul_17 = {8,
+                                    {
+                                        5,
+                                        7,
+                                        0,
+                                        1,
+                                        3,
+                                        2,
+                                        4,
+                                        6,
+                                    },
+                                    ec_gf8_mul_17_ops};
+
+static ec_gf_op_t ec_gf8_mul_18_ops[] = {
+    {EC_GF_OP_XOR2, 7, 4, 0}, {EC_GF_OP_XOR2, 0, 7, 0},
+    {EC_GF_OP_COPY, 8, 0, 0}, {EC_GF_OP_XOR2, 0, 1, 0},
+    {EC_GF_OP_XOR2, 1, 2, 0}, {EC_GF_OP_XOR2, 4, 5, 0},
+    {EC_GF_OP_XOR2, 2, 3, 0}, {EC_GF_OP_XOR2, 1, 5, 0},
+    {EC_GF_OP_XOR2, 3, 7, 0}, {EC_GF_OP_XOR2, 5, 6, 0},
+    {EC_GF_OP_XOR2, 2, 6, 0}, {EC_GF_OP_XOR2, 7, 5, 0},
+    {EC_GF_OP_XOR2, 6, 8, 0}, {EC_GF_OP_END, 0, 0, 0}};
+
+static ec_gf_mul_t ec_gf8_mul_18 = {9,
+                                    {
+                                        4,
+                                        5,
+                                        7,
+                                        6,
+                                        0,
+                                        1,
+                                        2,
+                                        3,
+                                        8,
+                                    },
+                                    ec_gf8_mul_18_ops};
+
+static ec_gf_op_t ec_gf8_mul_19_ops[] = {
+    {EC_GF_OP_XOR2, 7, 0, 0}, {EC_GF_OP_XOR2, 0, 4, 0},
+    {EC_GF_OP_XOR2, 6, 1, 0}, {EC_GF_OP_XOR2, 7, 1, 0},
+    {EC_GF_OP_XOR2, 4, 3, 0}, {EC_GF_OP_XOR2, 0, 5, 0},
+    {EC_GF_OP_XOR2, 5, 6, 0}, {EC_GF_OP_XOR2, 3, 2, 0},
+    {EC_GF_OP_XOR2, 1, 2, 0}, {EC_GF_OP_XOR2, 6, 7, 0},
+    {EC_GF_OP_XOR2, 2, 0, 0}, {EC_GF_OP_XOR2, 2, 6, 0},
+    {EC_GF_OP_XOR2, 6, 4, 0}, {EC_GF_OP_END, 0, 0, 0}};
+
+static ec_gf_mul_t ec_gf8_mul_19 = {8,
+                                    {
+                                        0,
+                                        5,
+                                        2,
+                                        6,
+                                        7,
+                                        1,
+                                        3,
+                                        4,
+                                    },
+                                    ec_gf8_mul_19_ops};
+
+static ec_gf_op_t ec_gf8_mul_1A_ops[] = {
+    {EC_GF_OP_XOR2, 6, 5, 0}, {EC_GF_OP_XOR2, 5, 4, 0},
+    {EC_GF_OP_XOR2, 4, 0, 0}, {EC_GF_OP_XOR2, 7, 5, 0},
+    {EC_GF_OP_XOR2, 0, 6, 0}, {EC_GF_OP_XOR2, 4, 1, 0},
+    {EC_GF_OP_XOR2, 6, 3, 0}, {EC_GF_OP_XOR2, 3, 4, 0},
+    {EC_GF_OP_XOR2, 5, 2, 0}, {EC_GF_OP_XOR2, 4, 0, 0},
+    {EC_GF_OP_XOR2, 2, 6, 0}, {EC_GF_OP_XOR2, 1, 5, 0},
+    {EC_GF_OP_XOR2, 6, 7, 0}, {EC_GF_OP_XOR2, 5, 0, 0},
+    {EC_GF_OP_END, 0, 0, 0}};
+
+static ec_gf_mul_t ec_gf8_mul_1A = {8,
+                                    {
+                                        7,
+                                        0,
+                                        4,
+                                        5,
+                                        3,
+                                        1,
+                                        2,
+                                        6,
+                                    },
+                                    ec_gf8_mul_1A_ops};
+
+static ec_gf_op_t ec_gf8_mul_1B_ops[] = {
+    {EC_GF_OP_XOR2, 1, 0, 0}, {EC_GF_OP_XOR2, 0, 2, 0},
+    {EC_GF_OP_XOR2, 2, 5, 0}, {EC_GF_OP_XOR2, 7, 2, 0},
+    {EC_GF_OP_XOR2, 2, 3, 0}, {EC_GF_OP_XOR2, 4, 0, 0},
+    {EC_GF_OP_XOR2, 3, 1, 0}, {EC_GF_OP_XOR2, 1, 4, 0},
+    {EC_GF_OP_XOR2, 7, 4, 0}, {EC_GF_OP_XOR2, 6, 1, 0},
+    {EC_GF_OP_XOR2, 5, 6, 0}, {EC_GF_OP_XOR2, 6, 3, 0},
+    {EC_GF_OP_XOR2, 4, 5, 0}, {EC_GF_OP_XOR2, 0, 6, 0},
+    {EC_GF_OP_END, 0, 0, 0}};
+
+static ec_gf_mul_t ec_gf8_mul_1B = {8,
+                                    {
+                                        7,
+                                        4,
+                                        5,
+                                        6,
+                                        3,
+                                        1,
+                                        2,
+                                        0,
+                                    },
+                                    ec_gf8_mul_1B_ops};
+
+static ec_gf_op_t ec_gf8_mul_1C_ops[] = {
+    {EC_GF_OP_XOR2, 6, 4, 0}, {EC_GF_OP_XOR2, 5, 6, 0},
+    {EC_GF_OP_XOR2, 6, 3, 0}, {EC_GF_OP_XOR2, 3, 0, 0},
+    {EC_GF_OP_XOR2, 0, 1, 0}, {EC_GF_OP_XOR2, 2, 6, 0},
+    {EC_GF_OP_XOR2, 0, 4, 0}, {EC_GF_OP_XOR2, 7, 5, 0},
+    {EC_GF_OP_XOR2, 1, 2, 0}, {EC_GF_OP_XOR2, 4, 7, 0},
+    {EC_GF_OP_XOR2, 7, 1, 0}, {EC_GF_OP_XOR2, 6, 4, 0},
+    {EC_GF_OP_XOR2, 1, 3, 0}, {EC_GF_OP_XOR2, 3, 6, 0},
+    {EC_GF_OP_END, 0, 0, 0}};
+
+static ec_gf_mul_t ec_gf8_mul_1C = {8,
+                                    {
+                                        5,
+                                        4,
+                                        3,
+                                        0,
+                                        1,
+                                        7,
+                                        2,
+                                        6,
+                                    },
+                                    ec_gf8_mul_1C_ops};
+
+static ec_gf_op_t ec_gf8_mul_1D_ops[] = {
+    {EC_GF_OP_XOR2, 7, 1, 0}, {EC_GF_OP_XOR2, 1, 0, 0},
+    {EC_GF_OP_XOR2, 3, 2, 0}, {EC_GF_OP_XOR2, 0, 4, 0},
+    {EC_GF_OP_XOR2, 2, 1, 0}, {EC_GF_OP_XOR2, 4, 3, 0},
+    {EC_GF_OP_XOR2, 3, 7, 0}, {EC_GF_OP_XOR3, 8, 4, 2},
+    {EC_GF_OP_XOR2, 2, 6, 0}, {EC_GF_OP_XOR2, 6, 5, 0},
+    {EC_GF_OP_XOR2, 5, 8, 0}, {EC_GF_OP_XOR2, 7, 6, 0},
+    {EC_GF_OP_XOR2, 0, 6, 0}, {EC_GF_OP_XOR2, 1, 5, 0},
+    {EC_GF_OP_XOR2, 5, 3, 0}, {EC_GF_OP_END, 0, 0, 0}};
+
+static ec_gf_mul_t ec_gf8_mul_1D = {9,
+                                    {
+                                        0,
+                                        7,
+                                        5,
+                                        8,
+                                        2,
+                                        3,
+                                        4,
+                                        1,
+                                        6,
+                                    },
+                                    ec_gf8_mul_1D_ops};
+
+static ec_gf_op_t ec_gf8_mul_1E_ops[] = {
+    {EC_GF_OP_XOR2, 4, 0, 0}, {EC_GF_OP_XOR2, 0, 5, 0},
+    {EC_GF_OP_XOR2, 0, 6, 0}, {EC_GF_OP_XOR2, 1, 4, 0},
+    {EC_GF_OP_XOR2, 5, 1, 0}, {EC_GF_OP_XOR2, 2, 7, 0},
+    {EC_GF_OP_XOR2, 7, 0, 0}, {EC_GF_OP_XOR2, 1, 2, 0},
+    {EC_GF_OP_XOR2, 6, 3, 0}, {EC_GF_OP_XOR2, 4, 7, 0},
+    {EC_GF_OP_XOR2, 6, 1, 0}, {EC_GF_OP_XOR2, 3, 4, 0},
+    {EC_GF_OP_XOR2, 0, 6, 0}, {EC_GF_OP_XOR2, 2, 3, 0},
+    {EC_GF_OP_END, 0, 0, 0}};
+
+static ec_gf_mul_t ec_gf8_mul_1E = {8,
+                                    {
+                                        4,
+                                        7,
+                                        5,
+                                        1,
+                                        6,
+                                        0,
+                                        2,
+                                        3,
+                                    },
+                                    ec_gf8_mul_1E_ops};
+
+static ec_gf_op_t ec_gf8_mul_1F_ops[] = {
+    {EC_GF_OP_XOR2, 5, 2, 0}, {EC_GF_OP_XOR2, 5, 4, 0},
+    {EC_GF_OP_XOR2, 3, 5, 0}, {EC_GF_OP_XOR2, 5, 0, 0},
+    {EC_GF_OP_XOR2, 5, 1, 0}, {EC_GF_OP_XOR2, 6, 3, 0},
+    {EC_GF_OP_XOR2, 7, 5, 0}, {EC_GF_OP_XOR2, 2, 6, 0},
+    {EC_GF_OP_XOR2, 7, 2, 0}, {EC_GF_OP_XOR3, 8, 3, 7},
+    {EC_GF_OP_XOR2, 4, 8, 0}, {EC_GF_OP_XOR2, 1, 8, 0},
+    {EC_GF_OP_XOR2, 6, 4, 0}, {EC_GF_OP_XOR2, 0, 6, 0},
+    {EC_GF_OP_END, 0, 0, 0}};
+
+static ec_gf_mul_t ec_gf8_mul_1F = {9,
+                                    {
+                                        1,
+                                        4,
+                                        5,
+                                        6,
+                                        7,
+                                        0,
+                                        3,
+                                        2,
+                                        8,
+                                    },
+                                    ec_gf8_mul_1F_ops};
+
+static ec_gf_op_t ec_gf8_mul_20_ops[] = {
+    {EC_GF_OP_XOR2, 6, 7, 0}, {EC_GF_OP_XOR2, 1, 5, 0},
+    {EC_GF_OP_XOR2, 7, 3, 0}, {EC_GF_OP_XOR2, 1, 6, 0},
+    {EC_GF_OP_XOR2, 3, 4, 0}, {EC_GF_OP_XOR2, 2, 6, 0},
+    {EC_GF_OP_XOR2, 6, 3, 0}, {EC_GF_OP_XOR2, 3, 5, 0},
+    {EC_GF_OP_XOR2, 5, 7, 0}, {EC_GF_OP_XOR2, 0, 5, 0},
+    {EC_GF_OP_XOR2, 0, 6, 0}, {EC_GF_OP_END, 0, 0, 0}};
+
+static ec_gf_mul_t ec_gf8_mul_20 = {8,
+                                    {
+                                        7,
+                                        4,
+                                        5,
+                                        6,
+                                        3,
+                                        0,
+                                        1,
+                                        2,
+                                    },
+                                    ec_gf8_mul_20_ops};
+
+static ec_gf_op_t ec_gf8_mul_21_ops[] = {
+    {EC_GF_OP_COPY, 9, 0, 0}, {EC_GF_OP_XOR2, 0, 3, 0},
+    {EC_GF_OP_XOR2, 5, 3, 0}, {EC_GF_OP_XOR2, 3, 1, 0},
+    {EC_GF_OP_XOR2, 1, 4, 0}, {EC_GF_OP_XOR2, 4, 6, 0},
+    {EC_GF_OP_XOR3, 8, 7, 5}, {EC_GF_OP_XOR2, 0, 7, 0},
+    {EC_GF_OP_XOR2, 6, 2, 0}, {EC_GF_OP_XOR2, 7, 4, 0},
+    {EC_GF_OP_XOR2, 3, 8, 0}, {EC_GF_OP_XOR2, 2, 8, 0},
+    {EC_GF_OP_XOR2, 4, 9, 0}, {EC_GF_OP_END, 0, 0, 0}};
+
+static ec_gf_mul_t ec_gf8_mul_21 = {10,
+                                    {
+                                        0,
+                                        1,
+                                        2,
+                                        7,
+                                        5,
+                                        4,
+                                        3,
+                                        6,
+                                        8,
+                                        9,
+                                    },
+                                    ec_gf8_mul_21_ops};
+
+static ec_gf_op_t ec_gf8_mul_22_ops[] = {
+    {EC_GF_OP_XOR2, 0, 4, 0}, {EC_GF_OP_XOR2, 4, 6, 0},
+    {EC_GF_OP_XOR2, 6, 7, 0}, {EC_GF_OP_XOR2, 7, 2, 0},
+    {EC_GF_OP_XOR2, 2, 3, 0}, {EC_GF_OP_XOR2, 2, 4, 0},
+    {EC_GF_OP_XOR2, 4, 5, 0}, {EC_GF_OP_XOR2, 5, 3, 0},
+    {EC_GF_OP_XOR2, 5, 1, 0}, {EC_GF_OP_XOR2, 1, 6, 0},
+    {EC_GF_OP_XOR2, 6, 4, 0}, {EC_GF_OP_XOR2, 4, 0, 0},
+    {EC_GF_OP_END, 0, 0, 0}};
+
+static ec_gf_mul_t ec_gf8_mul_22 = {8,
+                                    {
+                                        3,
+                                        0,
+                                        5,
+                                        2,
+                                        6,
+                                        4,
+                                        1,
+                                        7,
+                                    },
+                                    ec_gf8_mul_22_ops};
+
+static ec_gf_op_t ec_gf8_mul_23_ops[] = {
+    {EC_GF_OP_COPY, 8, 2, 0}, {EC_GF_OP_XOR2, 2, 4, 0},
+    {EC_GF_OP_XOR2, 2, 6, 0}, {EC_GF_OP_XOR2, 4, 0, 0},
+    {EC_GF_OP_XOR2, 6, 0, 0}, {EC_GF_OP_XOR2, 0, 3, 0},
+    {EC_GF_OP_XOR2, 3, 5, 0}, {EC_GF_OP_XOR2, 3, 8, 0},
+    {EC_GF_OP_XOR2, 4, 1, 0}, {EC_GF_OP_XOR2, 5, 7, 0},
+    {EC_GF_OP_XOR2, 3, 1, 0}, {EC_GF_OP_XOR2, 1, 7, 0},
+    {EC_GF_OP_END, 0, 0, 0}};
+
+static ec_gf_mul_t ec_gf8_mul_23 = {9,
+                                    {
+                                        0,
+                                        4,
+                                        3,
+                                        2,
+                                        5,
+                                        6,
+                                        1,
+                                        8,
+                                        7,
+                                    },
+                                    ec_gf8_mul_23_ops};
+
+static ec_gf_op_t ec_gf8_mul_24_ops[] = {
+    {EC_GF_OP_XOR2, 6, 3, 0}, {EC_GF_OP_XOR2, 6, 7, 0},
+    {EC_GF_OP_XOR2, 5, 6, 0}, {EC_GF_OP_XOR2, 2, 0, 0},
+    {EC_GF_OP_XOR2, 0, 5, 0}, {EC_GF_OP_XOR2, 7, 4, 0},
+    {EC_GF_OP_XOR2, 3, 4, 0}, {EC_GF_OP_XOR2, 4, 0, 0},
+    {EC_GF_OP_XOR2, 1, 3, 0}, {EC_GF_OP_XOR2, 2, 4, 0},
+    {EC_GF_OP_XOR2, 5, 1, 0}, {EC_GF_OP_XOR2, 3, 2, 0},
+    {EC_GF_OP_END, 0, 0, 0}};
+
+static ec_gf_mul_t ec_gf8_mul_24 = {8,
+                                    {
+                                        6,
+                                        7,
+                                        0,
+                                        1,
+                                        2,
+                                        4,
+                                        5,
+                                        3,
+                                    },
+                                    ec_gf8_mul_24_ops};
+
+static ec_gf_op_t ec_gf8_mul_25_ops[] = {
+    {EC_GF_OP_XOR2, 2, 5, 0}, {EC_GF_OP_XOR2, 6, 2, 0},
+    {EC_GF_OP_XOR2, 3, 7, 0}, {EC_GF_OP_XOR2, 3, 6, 0},
+    {EC_GF_OP_XOR2, 0, 3, 0}, {EC_GF_OP_XOR2, 1, 4, 0},
+    {EC_GF_OP_XOR2, 2, 0, 0}, {EC_GF_OP_XOR2, 7, 1, 0},
+    {EC_GF_OP_XOR2, 4, 2, 0}, {EC_GF_OP_XOR2, 5, 7, 0},
+    {EC_GF_OP_END, 0, 0, 0}};
+
+static ec_gf_mul_t ec_gf8_mul_25 = {8,
+                                    {
+                                        2,
+                                        7,
+                                        0,
+                                        1,
+                                        3,
+                                        4,
+                                        5,
+                                        6,
+                                    },
+                                    ec_gf8_mul_25_ops};
+
+static ec_gf_op_t ec_gf8_mul_26_ops[] = {
+    {EC_GF_OP_XOR2, 1, 0, 0}, {EC_GF_OP_XOR2, 0, 7, 0},
+    {EC_GF_OP_XOR2, 4, 0, 0}, {EC_GF_OP_XOR2, 3, 6, 0},
+    {EC_GF_OP_XOR2, 6, 4, 0}, {EC_GF_OP_XOR2, 2, 5, 0},
+    {EC_GF_OP_XOR2, 7, 2, 0}, {EC_GF_OP_XOR2, 5, 3, 0},
+    {EC_GF_OP_XOR2, 2, 6, 0}, {EC_GF_OP_XOR2, 6, 1, 0},
+    {EC_GF_OP_XOR2, 1, 5, 0}, {EC_GF_OP_XOR2, 5, 0, 0},
+    {EC_GF_OP_XOR2, 0, 2, 0}, {EC_GF_OP_XOR2, 2, 1, 0},
+    {EC_GF_OP_END, 0, 0, 0}};
+
+static ec_gf_mul_t ec_gf8_mul_26 = {8,
+                                    {
+                                        3,
+                                        4,
+                                        1,
+                                        2,
+                                        0,
+                                        5,
+                                        6,
+                                        7,
+                                    },
+                                    ec_gf8_mul_26_ops};
+
+static ec_gf_op_t ec_gf8_mul_27_ops[] = {
+    {EC_GF_OP_XOR2, 3, 0, 0}, {EC_GF_OP_XOR2, 4, 1, 0},
+    {EC_GF_OP_XOR2, 5, 2, 0}, {EC_GF_OP_XOR2, 4, 7, 0},
+    {EC_GF_OP_XOR2, 1, 5, 0}, {EC_GF_OP_XOR2, 3, 6, 0},
+    {EC_GF_OP_XOR2, 2, 4, 0}, {EC_GF_OP_XOR2, 0, 4, 0},
+    {EC_GF_OP_XOR2, 6, 5, 0}, {EC_GF_OP_XOR2, 1, 3, 0},
+    {EC_GF_OP_XOR2, 7, 3, 0}, {EC_GF_OP_END, 0, 0, 0}};
+
+static ec_gf_mul_t ec_gf8_mul_27 = {8,
+                                    {
+                                        3,
+                                        0,
+                                        1,
+                                        2,
+                                        6,
+                                        7,
+                                        4,
+                                        5,
+                                    },
+                                    ec_gf8_mul_27_ops};
+
+static ec_gf_op_t ec_gf8_mul_28_ops[] = {
+    {EC_GF_OP_XOR2, 6, 4, 0}, {EC_GF_OP_XOR2, 5, 7, 0},
+    {EC_GF_OP_XOR2, 7, 6, 0}, {EC_GF_OP_XOR2, 4, 5, 0},
+    {EC_GF_OP_XOR2, 1, 3, 0}, {EC_GF_OP_XOR2, 5, 3, 0},
+    {EC_GF_OP_XOR2, 1, 7, 0}, {EC_GF_OP_XOR2, 0, 4, 0},
+    {EC_GF_OP_XOR2, 4, 1, 0}, {EC_GF_OP_XOR2, 7, 2, 0},
+    {EC_GF_OP_XOR2, 2, 0, 0}, {EC_GF_OP_XOR2, 0, 3, 0},
+    {EC_GF_OP_END, 0, 0, 0}};
+
+static ec_gf_mul_t ec_gf8_mul_28 = {8,
+                                    {
+                                        5,
+                                        6,
+                                        3,
+                                        0,
+                                        1,
+                                        2,
+                                        4,
+                                        7,
+                                    },
+                                    ec_gf8_mul_28_ops};
+
+static ec_gf_op_t ec_gf8_mul_29_ops[] = {
+    {EC_GF_OP_XOR2, 6, 4, 0}, {EC_GF_OP_XOR2, 5, 3, 0},
+    {EC_GF_OP_XOR2, 4, 3, 0}, {EC_GF_OP_XOR2, 3, 2, 0},
+    {EC_GF_OP_XOR2, 7, 4, 0}, {EC_GF_OP_XOR2, 2, 6, 0},
+    {EC_GF_OP_XOR2, 6, 1, 0}, {EC_GF_OP_XOR2, 1, 5, 0},
+    {EC_GF_OP_XOR2, 0, 7, 0}, {EC_GF_OP_XOR2, 5, 0, 0},
+    {EC_GF_OP_XOR2, 7, 6, 0}, {EC_GF_OP_XOR2, 0, 3, 0},
+    {EC_GF_OP_XOR2, 4, 5, 0}, {EC_GF_OP_END, 0, 0, 0}};
+
+static ec_gf_mul_t ec_gf8_mul_29 = {8,
+                                    {
+                                        4,
+                                        6,
+                                        3,
+                                        5,
+                                        7,
+                                        0,
+                                        1,
+                                        2,
+                                    },
+                                    ec_gf8_mul_29_ops};
+
+static ec_gf_op_t ec_gf8_mul_2A_ops[] = {
+    {EC_GF_OP_COPY, 8, 1, 0}, {EC_GF_OP_XOR2, 8, 0, 0},
+    {EC_GF_OP_XOR2, 4, 0, 0}, {EC_GF_OP_XOR2, 0, 2, 0},
+    {EC_GF_OP_XOR2, 1, 3, 0}, {EC_GF_OP_XOR2, 3, 5, 0},
+    {EC_GF_OP_XOR2, 0, 7, 0}, {EC_GF_OP_XOR2, 2, 3, 0},
+    {EC_GF_OP_XOR2, 7, 1, 0}, {EC_GF_OP_XOR2, 5, 0, 0},
+    {EC_GF_OP_XOR2, 2, 4, 0}, {EC_GF_OP_XOR2, 0, 4, 0},
+    {EC_GF_OP_XOR2, 1, 6, 0}, {EC_GF_OP_XOR2, 4, 6, 0},
+    {EC_GF_OP_XOR3, 6, 8, 4}, {EC_GF_OP_END, 0, 0, 0}};
+
+static ec_gf_mul_t ec_gf8_mul_2A = {9,
+                                    {
+                                        3,
+                                        4,
+                                        7,
+                                        2,
+                                        6,
+                                        5,
+                                        1,
+                                        0,
+                                        8,
+                                    },
+                                    ec_gf8_mul_2A_ops};
+
+static ec_gf_op_t ec_gf8_mul_2B_ops[] = {
+    {EC_GF_OP_XOR2, 7, 2, 0}, {EC_GF_OP_XOR2, 2, 4, 0},
+    {EC_GF_OP_XOR2, 6, 1, 0}, {EC_GF_OP_XOR2, 4, 0, 0},
+    {EC_GF_OP_XOR2, 5, 0, 0}, {EC_GF_OP_XOR2, 4, 6, 0},
+    {EC_GF_OP_XOR2, 1, 3, 0}, {EC_GF_OP_XOR2, 0, 7, 0},
+    {EC_GF_OP_XOR2, 3, 5, 0}, {EC_GF_OP_XOR2, 7, 1, 0},
+    {EC_GF_OP_XOR2, 5, 2, 0}, {EC_GF_OP_END, 0, 0, 0}};
+
+static ec_gf_mul_t ec_gf8_mul_2B = {8,
+                                    {
+                                        3,
+                                        4,
+                                        7,
+                                        5,
+                                        6,
+                                        0,
+                                        1,
+                                        2,
+                                    },
+                                    ec_gf8_mul_2B_ops};
+
+static ec_gf_op_t ec_gf8_mul_2C_ops[] = {
+    {EC_GF_OP_XOR2, 3, 4, 0}, {EC_GF_OP_XOR2, 4, 7, 0},
+    {EC_GF_OP_XOR2, 2, 1, 0}, {EC_GF_OP_XOR2, 6, 4, 0},
+    {EC_GF_OP_XOR2, 2, 3, 0}, {EC_GF_OP_XOR2, 3, 6, 0},
+    {EC_GF_OP_XOR2, 5, 3, 0}, {EC_GF_OP_XOR2, 1, 5, 0},
+    {EC_GF_OP_XOR2, 3, 0, 0}, {EC_GF_OP_XOR2, 4, 1, 0},
+    {EC_GF_OP_XOR2, 7, 3, 0}, {EC_GF_OP_XOR2, 1, 2, 0},
+    {EC_GF_OP_XOR2, 0, 4, 0}, {EC_GF_OP_XOR2, 3, 1, 0},
+    {EC_GF_OP_END, 0, 0, 0}};
+
+static ec_gf_mul_t ec_gf8_mul_2C = {8,
+                                    {
+                                        5,
+                                        6,
+                                        7,
+                                        0,
+                                        2,
+                                        3,
+                                        4,
+                                        1,
+                                    },
+                                    ec_gf8_mul_2C_ops};
+
+static ec_gf_op_t ec_gf8_mul_2D_ops[] = {
+    {EC_GF_OP_XOR2, 3, 2, 0}, {EC_GF_OP_XOR2, 1, 3, 0},
+    {EC_GF_OP_XOR2, 3, 0, 0}, {EC_GF_OP_XOR2, 4, 3, 0},
+    {EC_GF_OP_XOR2, 3, 6, 0}, {EC_GF_OP_XOR2, 6, 1, 0},
+    {EC_GF_OP_XOR2, 2, 3, 0}, {EC_GF_OP_XOR3, 8, 4, 6},
+    {EC_GF_OP_XOR2, 5, 8, 0}, {EC_GF_OP_XOR2, 7, 8, 0},
+    {EC_GF_OP_XOR2, 2, 5, 0}, {EC_GF_OP_XOR2, 0, 7, 0},
+    {EC_GF_OP_XOR2, 6, 2, 0}, {EC_GF_OP_XOR2, 7, 2, 0},
+    {EC_GF_OP_END, 0, 0, 0}};
+
+static ec_gf_mul_t ec_gf8_mul_2D = {9,
+                                    {
+                                        7,
+                                        0,
+                                        3,
+                                        5,
+                                        1,
+                                        4,
+                                        2,
+                                        6,
+                                        8,
+                                    },
+                                    ec_gf8_mul_2D_ops};
+
+static ec_gf_op_t ec_gf8_mul_2E_ops[] = {
+    {EC_GF_OP_XOR2, 4, 1, 0}, {EC_GF_OP_XOR2, 0, 1, 0},
+    {EC_GF_OP_COPY, 8, 4, 0}, {EC_GF_OP_XOR2, 3, 6, 0},
+    {EC_GF_OP_XOR2, 1, 5, 0}, {EC_GF_OP_XOR2, 4, 3, 0},
+    {EC_GF_OP_XOR2, 8, 7, 0}, {EC_GF_OP_XOR2, 5, 3, 0},
+    {EC_GF_OP_XOR2, 2, 8, 0}, {EC_GF_OP_XOR2, 3, 0, 0},
+    {EC_GF_OP_XOR2, 6, 8, 0}, {EC_GF_OP_XOR2, 1, 2, 0},
+    {EC_GF_OP_XOR2, 7, 3, 0}, {EC_GF_OP_XOR2, 0, 6, 0},
+    {EC_GF_OP_XOR2, 3, 1, 0}, {EC_GF_OP_XOR2, 6, 3, 0},
+    {EC_GF_OP_END, 0, 0, 0}};
+
+static ec_gf_mul_t ec_gf8_mul_2E = {9,
+                                    {
+                                        5,
+                                        0,
+                                        7,
+                                        3,
+                                        2,
+                                        6,
+                                        4,
+                                        1,
+                                        8,
+                                    },
+                                    ec_gf8_mul_2E_ops};
+
+static ec_gf_op_t ec_gf8_mul_2F_ops[] = {
+    {EC_GF_OP_XOR2, 0, 2, 0}, {EC_GF_OP_XOR2, 0, 3, 0},
+    {EC_GF_OP_XOR2, 7, 1, 0}, {EC_GF_OP_XOR2, 6, 0, 0},
+    {EC_GF_OP_XOR2, 7, 2, 0}, {EC_GF_OP_XOR2, 3, 4, 0},
+    {EC_GF_OP_XOR3, 8, 7, 6}, {EC_GF_OP_XOR2, 1, 3, 0},
+    {EC_GF_OP_XOR2, 5, 2, 0}, {EC_GF_OP_XOR2, 3, 8, 0},
+    {EC_GF_OP_XOR2, 2, 8, 0}, {EC_GF_OP_XOR2, 4, 5, 0},
+    {EC_GF_OP_XOR2, 6, 5, 0}, {EC_GF_OP_XOR2, 5, 3, 0},
+    {EC_GF_OP_END, 0, 0, 0}};
+
+static ec_gf_mul_t ec_gf8_mul_2F = {9,
+                                    {
+                                        6,
+                                        3,
+                                        2,
+                                        5,
+                                        7,
+                                        0,
+                                        1,
+                                        4,
+                                        8,
+                                    },
+                                    ec_gf8_mul_2F_ops};
+
+static ec_gf_op_t ec_gf8_mul_30_ops[] = {
+    {EC_GF_OP_COPY, 8, 0, 0}, {EC_GF_OP_XOR2, 8, 1, 0},
+    {EC_GF_OP_XOR2, 1, 2, 0}, {EC_GF_OP_XOR2, 7, 4, 0},
+    {EC_GF_OP_XOR2, 6, 3, 0}, {EC_GF_OP_XOR2, 3, 7, 0},
+    {EC_GF_OP_XOR2, 0, 6, 0}, {EC_GF_OP_XOR2, 1, 5, 0},
+    {EC_GF_OP_XOR2, 4, 5, 0}, {EC_GF_OP_XOR2, 2, 6, 0},
+    {EC_GF_OP_XOR2, 5, 6, 0}, {EC_GF_OP_XOR3, 6, 8, 7},
+    {EC_GF_OP_XOR2, 7, 5, 0}, {EC_GF_OP_END, 0, 0, 0}};
+
+static ec_gf_mul_t ec_gf8_mul_30 = {9,
+                                    {
+                                        3,
+                                        4,
+                                        7,
+                                        5,
+                                        0,
+                                        6,
+                                        1,
+                                        2,
+                                        8,
+                                    },
+                                    ec_gf8_mul_30_ops};
+
+static ec_gf_op_t ec_gf8_mul_31_ops[] = {
+    {EC_GF_OP_XOR2, 3, 0, 0}, {EC_GF_OP_XOR2, 3, 4, 0},
+    {EC_GF_OP_XOR2, 4, 5, 0}, {EC_GF_OP_XOR2, 2, 1, 0},
+    {EC_GF_OP_XOR2, 1, 4, 0}, {EC_GF_OP_XOR2, 5, 6, 0},
+    {EC_GF_OP_XOR2, 6, 3, 0}, {EC_GF_OP_XOR2, 0, 7, 0},
+    {EC_GF_OP_XOR2, 2, 5, 0}, {EC_GF_OP_XOR2, 7, 3, 0},
+    {EC_GF_OP_XOR2, 0, 1, 0}, {EC_GF_OP_XOR2, 3, 2, 0},
+    {EC_GF_OP_XOR2, 3, 0, 0}, {EC_GF_OP_XOR2, 4, 3, 0},
+    {EC_GF_OP_END, 0, 0, 0}};
+
+static ec_gf_mul_t ec_gf8_mul_31 = {8,
+                                    {
+                                        7,
+                                        1,
+                                        4,
+                                        5,
+                                        6,
+                                        0,
+                                        2,
+                                        3,
+                                    },
+                                    ec_gf8_mul_31_ops};
+
+static ec_gf_op_t ec_gf8_mul_32_ops[] = {
+    {EC_GF_OP_XOR2, 6, 5, 0}, {EC_GF_OP_XOR2, 5, 0, 0},
+    {EC_GF_OP_XOR2, 0, 1, 0}, {EC_GF_OP_XOR2, 0, 7, 0},
+    {EC_GF_OP_XOR2, 7, 6, 0}, {EC_GF_OP_XOR2, 6, 1, 0},
+    {EC_GF_OP_XOR2, 1, 2, 0}, {EC_GF_OP_XOR2, 2, 3, 0},
+    {EC_GF_OP_XOR2, 3, 4, 0}, {EC_GF_OP_XOR2, 4, 5, 0},
+    {EC_GF_OP_XOR2, 6, 3, 0}, {EC_GF_OP_XOR2, 5, 7, 0},
+    {EC_GF_OP_XOR2, 7, 2, 0}, {EC_GF_OP_END, 0, 0, 0}};
+
+static ec_gf_mul_t ec_gf8_mul_32 = {8,
+                                    {
+                                        3,
+                                        4,
+                                        6,
+                                        7,
+                                        5,
+                                        0,
+                                        1,
+                                        2,
+                                    },
+                                    ec_gf8_mul_32_ops};
+
+static ec_gf_op_t ec_gf8_mul_33_ops[] = {
+    {EC_GF_OP_XOR2, 3, 2, 0}, {EC_GF_OP_XOR2, 2, 1, 0},
+    {EC_GF_OP_XOR2, 1, 0, 0}, {EC_GF_OP_XOR2, 1, 5, 0},
+    {EC_GF_OP_XOR2, 4, 1, 0}, {EC_GF_OP_XOR2, 6, 2, 0},
+    {EC_GF_OP_XOR2, 1, 7, 0}, {EC_GF_OP_XOR2, 2, 4, 0},
+    {EC_GF_OP_XOR2, 7, 3, 0}, {EC_GF_OP_XOR2, 0, 6, 0},
+    {EC_GF_OP_XOR2, 3, 2, 0}, {EC_GF_OP_XOR2, 5, 3, 0},
+    {EC_GF_OP_XOR2, 3, 0, 0}, {EC_GF_OP_XOR2, 0, 1, 0},
+    {EC_GF_OP_XOR2, 2, 0, 0}, {EC_GF_OP_END, 0, 0, 0}};
+
+static ec_gf_mul_t ec_gf8_mul_33 = {8,
+                                    {
+                                        5,
+                                        4,
+                                        3,
+                                        0,
+                                        2,
+                                        1,
+                                        6,
+                                        7,
+                                    },
+                                    ec_gf8_mul_33_ops};
+
+static ec_gf_op_t ec_gf8_mul_34_ops[] = {
+    {EC_GF_OP_XOR2, 5, 4, 0}, {EC_GF_OP_XOR2, 4, 3, 0},
+    {EC_GF_OP_XOR2, 2, 5, 0}, {EC_GF_OP_XOR2, 6, 4, 0},
+    {EC_GF_OP_XOR2, 5, 7, 0}, {EC_GF_OP_XOR2, 4, 0, 0},
+    {EC_GF_OP_XOR2, 7, 6, 0}, {EC_GF_OP_XOR2, 0, 5, 0},
+    {EC_GF_OP_XOR2, 6, 2, 0}, {EC_GF_OP_XOR2, 4, 1, 0},
+    {EC_GF_OP_XOR2, 3, 0, 0}, {EC_GF_OP_XOR2, 1, 2, 0},
+    {EC_GF_OP_XOR2, 0, 4, 0}, {EC_GF_OP_XOR2, 2, 3, 0},
+    {EC_GF_OP_END, 0, 0, 0}};
+
+static ec_gf_mul_t ec_gf8_mul_34 = {8,
+                                    {
+                                        7,
+                                        5,
+                                        3,
+                                        0,
+                                        2,
+                                        4,
+                                        1,
+                                        6,
+                                    },
+                                    ec_gf8_mul_34_ops};
+
+static ec_gf_op_t ec_gf8_mul_35_ops[] = {
+    {EC_GF_OP_XOR2, 0, 1, 0}, {EC_GF_OP_XOR2, 1, 4, 0},
+    {EC_GF_OP_XOR2, 7, 1, 0}, {EC_GF_OP_XOR2, 3, 7, 0},
+    {EC_GF_OP_XOR2, 0, 3, 0}, {EC_GF_OP_XOR2, 7, 5, 0},
+    {EC_GF_OP_XOR2, 2, 0, 0}, {EC_GF_OP_XOR2, 4, 7, 0},
+    {EC_GF_OP_XOR2, 5, 2, 0}, {EC_GF_OP_XOR2, 6, 0, 0},
+    {EC_GF_OP_XOR2, 1, 5, 0}, {EC_GF_OP_XOR2, 0, 4, 0},
+    {EC_GF_OP_XOR2, 1, 6, 0}, {EC_GF_OP_XOR2, 3, 1, 0},
+    {EC_GF_OP_END, 0, 0, 0}};
+
+static ec_gf_mul_t ec_gf8_mul_35 = {8,
+                                    {
+                                        6,
+                                        7,
+                                        5,
+                                        4,
+                                        2,
+                                        0,
+                                        1,
+                                        3,
+                                    },
+                                    ec_gf8_mul_35_ops};
+
+static ec_gf_op_t ec_gf8_mul_36_ops[] = {
+    {EC_GF_OP_XOR2, 5, 2, 0}, {EC_GF_OP_XOR2, 2, 0, 0},
+    {EC_GF_OP_XOR2, 6, 4, 0}, {EC_GF_OP_XOR2, 4, 2, 0},
+    {EC_GF_OP_XOR2, 7, 4, 0}, {EC_GF_OP_XOR2, 0, 1, 0},
+    {EC_GF_OP_XOR2, 6, 3, 0}, {EC_GF_OP_XOR2, 7, 5, 0},
+    {EC_GF_OP_XOR2, 5, 3, 0}, {EC_GF_OP_XOR2, 3, 0, 0},
+    {EC_GF_OP_XOR2, 1, 5, 0}, {EC_GF_OP_XOR2, 0, 4, 0},
+    {EC_GF_OP_XOR2, 4, 1, 0}, {EC_GF_OP_END, 0, 0, 0}};
+
+static ec_gf_mul_t ec_gf8_mul_36 = {8,
+                                    {
+                                        6,
+                                        7,
+                                        4,
+                                        1,
+                                        2,
+                                        3,
+                                        0,
+                                        5,
+                                    },
+                                    ec_gf8_mul_36_ops};
+
+static ec_gf_op_t ec_gf8_mul_37_ops[] = {
+    {EC_GF_OP_XOR2, 1, 2, 0}, {EC_GF_OP_XOR2, 0, 2, 0},
+    {EC_GF_OP_XOR2, 6, 1, 0}, {EC_GF_OP_XOR2, 0, 4, 0},
+    {EC_GF_OP_XOR2, 1, 5, 0}, {EC_GF_OP_XOR3, 8, 0, 1},
+    {EC_GF_OP_XOR2, 3, 8, 0}, {EC_GF_OP_XOR2, 7, 8, 0},
+    {EC_GF_OP_XOR2, 2, 3, 0}, {EC_GF_OP_XOR2, 3, 4, 0},
+    {EC_GF_OP_XOR2, 5, 2, 0}, {EC_GF_OP_XOR2, 4, 6, 0},
+    {EC_GF_OP_XOR2, 6, 5, 0}, {EC_GF_OP_XOR2, 5, 7, 0},
+    {EC_GF_OP_END, 0, 0, 0}};
+
+static ec_gf_mul_t ec_gf8_mul_37 = {9,
+                                    {
+                                        6,
+                                        7,
+                                        2,
+                                        1,
+                                        0,
+                                        3,
+                                        4,
+                                        5,
+                                        8,
+                                    },
+                                    ec_gf8_mul_37_ops};
+
+static ec_gf_op_t ec_gf8_mul_38_ops[] = {
+    {EC_GF_OP_XOR2, 6, 4, 0}, {EC_GF_OP_XOR2, 5, 6, 0},
+    {EC_GF_OP_XOR2, 6, 3, 0}, {EC_GF_OP_XOR2, 3, 0, 0},
+    {EC_GF_OP_XOR2, 2, 6, 0}, {EC_GF_OP_XOR2, 7, 5, 0},
+    {EC_GF_OP_XOR2, 0, 1, 0}, {EC_GF_OP_XOR3, 8, 6, 7},
+    {EC_GF_OP_XOR2, 1, 2, 0}, {EC_GF_OP_XOR2, 7, 1, 0},
+    {EC_GF_OP_XOR2, 0, 8, 0}, {EC_GF_OP_XOR2, 4, 8, 0},
+    {EC_GF_OP_XOR2, 1, 3, 0}, {EC_GF_OP_END, 0, 0, 0}};
+
+static ec_gf_mul_t ec_gf8_mul_38 = {9,
+                                    {
+                                        4,
+                                        5,
+                                        6,
+                                        3,
+                                        0,
+                                        1,
+                                        7,
+                                        2,
+                                        8,
+                                    },
+                                    ec_gf8_mul_38_ops};
+
+static ec_gf_op_t ec_gf8_mul_39_ops[] = {
+    {EC_GF_OP_XOR2, 5, 1, 0}, {EC_GF_OP_XOR2, 4, 5, 0},
+    {EC_GF_OP_XOR2, 6, 4, 0}, {EC_GF_OP_XOR2, 2, 0, 0},
+    {EC_GF_OP_XOR2, 2, 6, 0}, {EC_GF_OP_XOR2, 3, 0, 0},
+    {EC_GF_OP_XOR2, 5, 2, 0}, {EC_GF_OP_XOR2, 3, 5, 0},
+    {EC_GF_OP_XOR2, 7, 3, 0}, {EC_GF_OP_XOR2, 4, 7, 0},
+    {EC_GF_OP_XOR2, 5, 4, 0}, {EC_GF_OP_XOR2, 1, 5, 0},
+    {EC_GF_OP_END, 0, 0, 0}};
+
+static ec_gf_mul_t ec_gf8_mul_39 = {8,
+                                    {
+                                        1,
+                                        6,
+                                        3,
+                                        0,
+                                        5,
+                                        2,
+                                        4,
+                                        7,
+                                    },
+                                    ec_gf8_mul_39_ops};
+
+static ec_gf_op_t ec_gf8_mul_3A_ops[] = {
+    {EC_GF_OP_XOR2, 6, 1, 0}, {EC_GF_OP_XOR2, 7, 3, 0},
+    {EC_GF_OP_XOR2, 1, 0, 0}, {EC_GF_OP_XOR2, 3, 4, 0},
+    {EC_GF_OP_XOR2, 0, 2, 0}, {EC_GF_OP_XOR2, 4, 6, 0},
+    {EC_GF_OP_XOR2, 2, 3, 0}, {EC_GF_OP_XOR2, 6, 0, 0},
+    {EC_GF_OP_XOR2, 3, 5, 0}, {EC_GF_OP_XOR2, 0, 7, 0},
+    {EC_GF_OP_XOR2, 5, 1, 0}, {EC_GF_OP_XOR2, 7, 4, 0},
+    {EC_GF_OP_XOR2, 1, 0, 0}, {EC_GF_OP_XOR2, 4, 5, 0},
+    {EC_GF_OP_END, 0, 0, 0}};
+
+static ec_gf_mul_t ec_gf8_mul_3A = {8,
+                                    {
+                                        3,
+                                        4,
+                                        7,
+                                        0,
+                                        5,
+                                        6,
+                                        1,
+                                        2,
+                                    },
+                                    ec_gf8_mul_3A_ops};
+
+static ec_gf_op_t ec_gf8_mul_3B_ops[] = {
+    {EC_GF_OP_XOR2, 2, 0, 0}, {EC_GF_OP_XOR2, 0, 4, 0},
+    {EC_GF_OP_XOR2, 3, 0, 0}, {EC_GF_OP_XOR2, 7, 2, 0},
+    {EC_GF_OP_XOR3, 8, 7, 3}, {EC_GF_OP_XOR2, 1, 6, 0},
+    {EC_GF_OP_XOR2, 3, 5, 0}, {EC_GF_OP_XOR2, 5, 1, 0},
+    {EC_GF_OP_XOR2, 1, 8, 0}, {EC_GF_OP_XOR2, 0, 5, 0},
+    {EC_GF_OP_XOR2, 2, 5, 0}, {EC_GF_OP_XOR2, 4, 1, 0},
+    {EC_GF_OP_XOR2, 6, 0, 0}, {EC_GF_OP_END, 0, 0, 0}};
+
+static ec_gf_mul_t ec_gf8_mul_3B = {9,
+                                    {
+                                        3,
+                                        0,
+                                        1,
+                                        7,
+                                        6,
+                                        2,
+                                        4,
+                                        8,
+                                        5,
+                                    },
+                                    ec_gf8_mul_3B_ops};
+
+static ec_gf_op_t ec_gf8_mul_3C_ops[] = {
+    {EC_GF_OP_XOR2, 7, 5, 0}, {EC_GF_OP_XOR2, 7, 4, 0},
+    {EC_GF_OP_XOR2, 1, 0, 0}, {EC_GF_OP_XOR2, 6, 7, 0},
+    {EC_GF_OP_XOR2, 0, 3, 0}, {EC_GF_OP_XOR2, 3, 6, 0},
+    {EC_GF_OP_XOR2, 5, 3, 0}, {EC_GF_OP_XOR2, 1, 5, 0},
+    {EC_GF_OP_XOR2, 2, 1, 0}, {EC_GF_OP_XOR2, 1, 4, 0},
+    {EC_GF_OP_XOR2, 7, 2, 0}, {EC_GF_OP_XOR2, 4, 0, 0},
+    {EC_GF_OP_XOR2, 5, 7, 0}, {EC_GF_OP_XOR2, 0, 5, 0},
+    {EC_GF_OP_XOR2, 5, 1, 0}, {EC_GF_OP_END, 0, 0, 0}};
+
+static ec_gf_mul_t ec_gf8_mul_3C = {8,
+                                    {
+                                        3,
+                                        6,
+                                        4,
+                                        1,
+                                        7,
+                                        2,
+                                        0,
+                                        5,
+                                    },
+                                    ec_gf8_mul_3C_ops};
+
+static ec_gf_op_t ec_gf8_mul_3D_ops[] = {
+    {EC_GF_OP_XOR2, 2, 0, 0}, {EC_GF_OP_XOR2, 3, 2, 0},
+    {EC_GF_OP_XOR2, 5, 1, 0}, {EC_GF_OP_XOR2, 4, 3, 0},
+    {EC_GF_OP_XOR2, 5, 4, 0}, {EC_GF_OP_XOR2, 6, 5, 0},
+    {EC_GF_OP_XOR2, 7, 6, 0}, {EC_GF_OP_XOR2, 0, 7, 0},
+    {EC_GF_OP_XOR2, 3, 7, 0}, {EC_GF_OP_XOR2, 1, 0, 0},
+    {EC_GF_OP_XOR2, 2, 1, 0}, {EC_GF_OP_XOR2, 5, 1, 0},
+    {EC_GF_OP_END, 0, 0, 0}};
+
+static ec_gf_mul_t ec_gf8_mul_3D = {8,
+                                    {
+                                        2,
+                                        3,
+                                        4,
+                                        5,
+                                        6,
+                                        7,
+                                        0,
+                                        1,
+                                    },
+                                    ec_gf8_mul_3D_ops};
+
+static ec_gf_op_t ec_gf8_mul_3E_ops[] = {
+    {EC_GF_OP_XOR2, 3, 5, 0}, {EC_GF_OP_XOR2, 4, 3, 0},
+    {EC_GF_OP_XOR2, 6, 4, 0}, {EC_GF_OP_XOR2, 4, 2, 0},
+    {EC_GF_OP_XOR2, 1, 7, 0}, {EC_GF_OP_XOR2, 1, 4, 0},
+    {EC_GF_OP_XOR2, 5, 1, 0}, {EC_GF_OP_XOR2, 0, 5, 0},
+    {EC_GF_OP_XOR2, 2, 0, 0}, {EC_GF_OP_XOR2, 0, 6, 0},
+    {EC_GF_OP_XOR2, 3, 0, 0}, {EC_GF_OP_XOR2, 7, 3, 0},
+    {EC_GF_OP_XOR2, 1, 7, 0}, {EC_GF_OP_END, 0, 0, 0}};
+
+static ec_gf_mul_t ec_gf8_mul_3E = {8,
+                                    {
+                                        6,
+                                        1,
+                                        2,
+                                        7,
+                                        0,
+                                        3,
+                                        5,
+                                        4,
+                                    },
+                                    ec_gf8_mul_3E_ops};
+
+static ec_gf_op_t ec_gf8_mul_3F_ops[] = {
+    {EC_GF_OP_COPY, 8, 0, 0},  {EC_GF_OP_XOR2, 0, 1, 0},
+    {EC_GF_OP_COPY, 10, 4, 0}, {EC_GF_OP_XOR2, 0, 6, 0},
+    {EC_GF_OP_XOR2, 4, 5, 0},  {EC_GF_OP_XOR2, 4, 0, 0},
+    {EC_GF_OP_XOR2, 1, 3, 0},  {EC_GF_OP_COPY, 9, 2, 0},
+    {EC_GF_OP_XOR2, 1, 4, 0},  {EC_GF_OP_XOR2, 2, 0, 0},
+    {EC_GF_OP_XOR2, 7, 4, 0},  {EC_GF_OP_XOR3, 4, 9, 7},
+    {EC_GF_OP_XOR2, 3, 4, 0},  {EC_GF_OP_XOR2, 5, 3, 0},
+    {EC_GF_OP_XOR2, 0, 3, 0},  {EC_GF_OP_XOR2, 6, 5, 0},
+    {EC_GF_OP_XOR2, 3, 10, 0}, {EC_GF_OP_XOR2, 5, 8, 0},
+    {EC_GF_OP_END, 0, 0, 0}};
+
+static ec_gf_mul_t ec_gf8_mul_3F = {11,
+                                    {
+                                        1,
+                                        7,
+                                        6,
+                                        2,
+                                        4,
+                                        3,
+                                        5,
+                                        0,
+                                        8,
+                                        9,
+                                        10,
+                                    },
+                                    ec_gf8_mul_3F_ops};
+
+static ec_gf_op_t ec_gf8_mul_40_ops[] = {
+    {EC_GF_OP_XOR2, 1, 2, 0}, {EC_GF_OP_XOR2, 2, 3, 0},
+    {EC_GF_OP_XOR2, 2, 4, 0}, {EC_GF_OP_XOR2, 4, 5, 0},
+    {EC_GF_OP_XOR2, 6, 4, 0}, {EC_GF_OP_XOR2, 0, 6, 0},
+    {EC_GF_OP_XOR2, 7, 3, 0}, {EC_GF_OP_XOR2, 6, 2, 0},
+    {EC_GF_OP_XOR2, 3, 4, 0}, {EC_GF_OP_XOR3, 8, 7, 6},
+    {EC_GF_OP_XOR2, 1, 8, 0}, {EC_GF_OP_XOR2, 5, 8, 0},
+    {EC_GF_OP_XOR2, 4, 8, 0}, {EC_GF_OP_END, 0, 0, 0}};
+
+static ec_gf_mul_t ec_gf8_mul_40 = {9,
+                                    {
+                                        5,
+                                        7,
+                                        4,
+                                        6,
+                                        2,
+                                        3,
+                                        0,
+                                        1,
+                                        8,
+                                    },
+                                    ec_gf8_mul_40_ops};
+
+static ec_gf_op_t ec_gf8_mul_41_ops[] = {
+    {EC_GF_OP_COPY, 8, 0, 0}, {EC_GF_OP_XOR2, 8, 4, 0},
+    {EC_GF_OP_XOR2, 8, 5, 0}, {EC_GF_OP_XOR2, 5, 6, 0},
+    {EC_GF_OP_XOR2, 0, 2, 0}, {EC_GF_OP_XOR2, 6, 7, 0},
+    {EC_GF_OP_XOR2, 0, 6, 0}, {EC_GF_OP_XOR2, 7, 1, 0},
+    {EC_GF_OP_XOR2, 6, 4, 0}, {EC_GF_OP_XOR2, 1, 5, 0},
+    {EC_GF_OP_XOR2, 7, 3, 0}, {EC_GF_OP_XOR2, 4, 3, 0},
+    {EC_GF_OP_XOR2, 5, 2, 0}, {EC_GF_OP_XOR2, 3, 2, 0},
+    {EC_GF_OP_END, 0, 0, 0}};
+
+static ec_gf_mul_t ec_gf8_mul_41 = {9,
+                                    {
+                                        0,
+                                        7,
+                                        6,
+                                        5,
+                                        3,
+                                        4,
+                                        8,
+                                        1,
+                                        2,
+                                    },
+                                    ec_gf8_mul_41_ops};
+
+static ec_gf_op_t ec_gf8_mul_42_ops[] = {
+    {EC_GF_OP_COPY, 8, 0, 0}, {EC_GF_OP_XOR2, 0, 2, 0},
+    {EC_GF_OP_XOR2, 8, 3, 0}, {EC_GF_OP_XOR2, 2, 6, 0},
+    {EC_GF_OP_XOR2, 3, 5, 0}, {EC_GF_OP_XOR2, 4, 2, 0},
+    {EC_GF_OP_XOR2, 5, 1, 0}, {EC_GF_OP_XOR2, 0, 4, 0},
+    {EC_GF_OP_XOR2, 6, 7, 0}, {EC_GF_OP_XOR2, 1, 4, 0},
+    {EC_GF_OP_XOR2, 5, 7, 0}, {EC_GF_OP_XOR2, 4, 6, 0},
+    {EC_GF_OP_XOR2, 7, 8, 0}, {EC_GF_OP_XOR2, 6, 3, 0},
+    {EC_GF_OP_END, 0, 0, 0}};
+
+static ec_gf_mul_t ec_gf8_mul_42 = {9,
+                                    {
+                                        2,
+                                        7,
+                                        1,
+                                        6,
+                                        4,
+                                        3,
+                                        0,
+                                        5,
+                                        8,
+                                    },
+                                    ec_gf8_mul_42_ops};
+
+static ec_gf_op_t ec_gf8_mul_43_ops[] = {
+    {EC_GF_OP_XOR2, 5, 1, 0}, {EC_GF_OP_XOR2, 1, 6, 0},
+    {EC_GF_OP_XOR2, 6, 0, 0}, {EC_GF_OP_XOR2, 0, 4, 0},
+    {EC_GF_OP_XOR2, 4, 1, 0}, {EC_GF_OP_XOR2, 1, 7, 0},
+    {EC_GF_OP_XOR2, 7, 2, 0}, {EC_GF_OP_XOR2, 2, 6, 0},
+    {EC_GF_OP_XOR2, 6, 3, 0}, {EC_GF_OP_XOR2, 6, 1, 0},
+    {EC_GF_OP_XOR2, 1, 5, 0}, {EC_GF_OP_END, 0, 0, 0}};
+
+static ec_gf_mul_t ec_gf8_mul_43 = {8,
+                                    {
+                                        2,
+                                        6,
+                                        4,
+                                        1,
+                                        7,
+                                        3,
+                                        0,
+                                        5,
+                                    },
+                                    ec_gf8_mul_43_ops};
+
+static ec_gf_op_t ec_gf8_mul_44_ops[] = {
+    {EC_GF_OP_XOR2, 1, 6, 0}, {EC_GF_OP_XOR2, 4, 7, 0},
+    {EC_GF_OP_XOR2, 6, 5, 0}, {EC_GF_OP_XOR2, 5, 4, 0},
+    {EC_GF_OP_XOR2, 4, 0, 0}, {EC_GF_OP_XOR2, 4, 2, 0},
+    {EC_GF_OP_XOR2, 0, 6, 0}, {EC_GF_OP_XOR2, 2, 7, 0},
+    {EC_GF_OP_XOR2, 6, 3, 0}, {EC_GF_OP_XOR2, 7, 1, 0},
+    {EC_GF_OP_XOR2, 1, 2, 0}, {EC_GF_OP_XOR2, 1, 6, 0},
+    {EC_GF_OP_XOR2, 6, 5, 0}, {EC_GF_OP_END, 0, 0, 0}};
+
+static ec_gf_mul_t ec_gf8_mul_44 = {8,
+                                    {
+                                        2,
+                                        3,
+                                        4,
+                                        1,
+                                        6,
+                                        5,
+                                        0,
+                                        7,
+                                    },
+                                    ec_gf8_mul_44_ops};
+
+static ec_gf_op_t ec_gf8_mul_45_ops[] = {
+    {EC_GF_OP_XOR2, 2, 0, 0}, {EC_GF_OP_XOR2, 4, 7, 0},
+    {EC_GF_OP_XOR2, 2, 7, 0}, {EC_GF_OP_XOR2, 7, 3, 0},
+    {EC_GF_OP_XOR2, 7, 6, 0}, {EC_GF_OP_XOR2, 5, 0, 0},
+    {EC_GF_OP_XOR2, 6, 1, 0}, {EC_GF_OP_XOR2, 3, 1, 0},
+    {EC_GF_OP_XOR2, 0, 4, 0}, {EC_GF_OP_XOR2, 1, 5, 0},
+    {EC_GF_OP_XOR2, 1, 2, 0}, {EC_GF_OP_END, 0, 0, 0}};
+
+static ec_gf_mul_t ec_gf8_mul_45 = {8,
+                                    {
+                                        2,
+                                        3,
+                                        0,
+                                        1,
+                                        7,
+                                        4,
+                                        5,
+                                        6,
+                                    },
+                                    ec_gf8_mul_45_ops};
+
+static ec_gf_op_t ec_gf8_mul_46_ops[] = {
+    {EC_GF_OP_XOR3, 8, 2, 4}, {EC_GF_OP_XOR2, 4, 6, 0},
+    {EC_GF_OP_XOR2, 8, 0, 0}, {EC_GF_OP_XOR2, 6, 0, 0},
+    {EC_GF_OP_XOR2, 0, 3, 0}, {EC_GF_OP_XOR2, 3, 1, 0},
+    {EC_GF_OP_XOR2, 3, 5, 0}, {EC_GF_OP_XOR2, 5, 7, 0},
+    {EC_GF_OP_XOR2, 7, 1, 0}, {EC_GF_OP_XOR2, 1, 8, 0},
+    {EC_GF_OP_END, 0, 0, 0}};
+
+static ec_gf_mul_t ec_gf8_mul_46 = {9,
+                                    {
+                                        2,
+                                        0,
+                                        1,
+                                        3,
+                                        4,
+                                        5,
+                                        6,
+                                        7,
+                                        8,
+                                    },
+                                    ec_gf8_mul_46_ops};
+
+static ec_gf_op_t ec_gf8_mul_47_ops[] = {
+    {EC_GF_OP_XOR3, 8, 0, 1}, {EC_GF_OP_XOR2, 2, 0, 0},
+    {EC_GF_OP_XOR2, 3, 8, 0}, {EC_GF_OP_XOR2, 5, 1, 0},
+    {EC_GF_OP_XOR2, 4, 8, 0}, {EC_GF_OP_END, 0, 0, 0}};
+
+static ec_gf_mul_t ec_gf8_mul_47 = {9,
+                                    {
+                                        2,
+                                        3,
+                                        4,
+                                        5,
+                                        6,
+                                        7,
+                                        0,
+                                        1,
+                                        8,
+                                    },
+                                    ec_gf8_mul_47_ops};
+
+static ec_gf_op_t ec_gf8_mul_48_ops[] = {
+    {EC_GF_OP_XOR2, 5, 2, 0}, {EC_GF_OP_XOR2, 5, 4, 0},
+    {EC_GF_OP_XOR2, 6, 5, 0}, {EC_GF_OP_XOR2, 7, 6, 0},
+    {EC_GF_OP_XOR2, 2, 3, 0}, {EC_GF_OP_XOR2, 4, 7, 0},
+    {EC_GF_OP_XOR2, 3, 7, 0}, {EC_GF_OP_XOR2, 1, 3, 0},
+    {EC_GF_OP_XOR2, 0, 2, 0}, {EC_GF_OP_XOR2, 5, 3, 0},
+    {EC_GF_OP_XOR2, 2, 1, 0}, {EC_GF_OP_XOR2, 7, 0, 0},
+    {EC_GF_OP_END, 0, 0, 0}};
+
+static ec_gf_mul_t ec_gf8_mul_48 = {8,
+                                    {
+                                        4,
+                                        5,
+                                        6,
+                                        0,
+                                        1,
+                                        3,
+                                        7,
+                                        2,
+                                    },
+                                    ec_gf8_mul_48_ops};
+
+static ec_gf_op_t ec_gf8_mul_49_ops[] = {
+    {EC_GF_OP_XOR2, 3, 0, 0}, {EC_GF_OP_XOR2, 0, 2, 0},
+    {EC_GF_OP_XOR2, 6, 5, 0}, {EC_GF_OP_XOR3, 8, 0, 6},
+    {EC_GF_OP_XOR2, 3, 1, 0}, {EC_GF_OP_XOR2, 7, 8, 0},
+    {EC_GF_OP_XOR2, 1, 4, 0}, {EC_GF_OP_XOR2, 3, 7, 0},
+    {EC_GF_OP_XOR2, 4, 6, 0}, {EC_GF_OP_XOR2, 5, 3, 0},
+    {EC_GF_OP_XOR2, 6, 1, 0}, {EC_GF_OP_XOR2, 2, 5, 0},
+    {EC_GF_OP_XOR2, 5, 1, 0}, {EC_GF_OP_XOR3, 1, 8, 5},
+    {EC_GF_OP_END, 0, 0, 0}};
+
+static ec_gf_mul_t ec_gf8_mul_49 = {9,
+                                    {
+                                        7,
+                                        2,
+                                        4,
+                                        0,
+                                        3,
+                                        5,
+                                        1,
+                                        6,
+                                        8,
+                                    },
+                                    ec_gf8_mul_49_ops};
+
+static ec_gf_op_t ec_gf8_mul_4A_ops[] = {
+    {EC_GF_OP_XOR2, 2, 6, 0}, {EC_GF_OP_XOR2, 1, 4, 0},
+    {EC_GF_OP_XOR2, 3, 7, 0}, {EC_GF_OP_XOR2, 5, 2, 0},
+    {EC_GF_OP_XOR2, 1, 5, 0}, {EC_GF_OP_XOR2, 0, 3, 0},
+    {EC_GF_OP_XOR2, 7, 1, 0}, {EC_GF_OP_XOR2, 6, 0, 0},
+    {EC_GF_OP_XOR2, 3, 5, 0}, {EC_GF_OP_XOR2, 2, 7, 0},
+    {EC_GF_OP_XOR2, 4, 6, 0}, {EC_GF_OP_END, 0, 0, 0}};
+
+static ec_gf_mul_t ec_gf8_mul_4A = {8,
+                                    {
+                                        5,
+                                        6,
+                                        7,
+                                        0,
+                                        1,
+                                        3,
+                                        4,
+                                        2,
+                                    },
+                                    ec_gf8_mul_4A_ops};
+
+static ec_gf_op_t ec_gf8_mul_4B_ops[] = {
+    {EC_GF_OP_XOR2, 6, 7, 0}, {EC_GF_OP_XOR2, 7, 0, 0},
+    {EC_GF_OP_XOR2, 0, 1, 0}, {EC_GF_OP_XOR2, 1, 4, 0},
+    {EC_GF_OP_XOR3, 8, 3, 7}, {EC_GF_OP_XOR2, 3, 6, 0},
+    {EC_GF_OP_XOR2, 1, 5, 0}, {EC_GF_OP_XOR2, 4, 8, 0},
+    {EC_GF_OP_XOR2, 2, 3, 0}, {EC_GF_OP_XOR2, 5, 8, 0},
+    {EC_GF_OP_XOR2, 3, 0, 0}, {EC_GF_OP_XOR2, 6, 1, 0},
+    {EC_GF_OP_XOR2, 5, 2, 0}, {EC_GF_OP_XOR2, 0, 5, 0},
+    {EC_GF_OP_END, 0, 0, 0}};
+
+static ec_gf_mul_t ec_gf8_mul_4B = {9,
+                                    {
+                                        5,
+                                        3,
+                                        6,
+                                        7,
+                                        0,
+                                        2,
+                                        4,
+                                        1,
+                                        8,
+                                    },
+                                    ec_gf8_mul_4B_ops};
+
+static ec_gf_op_t ec_gf8_mul_4C_ops[] = {
+    {EC_GF_OP_XOR2, 5, 2, 0}, {EC_GF_OP_XOR2, 2, 0, 0},
+    {EC_GF_OP_XOR2, 3, 6, 0}, {EC_GF_OP_XOR2, 2, 3, 0},
+    {EC_GF_OP_XOR2, 4, 5, 0}, {EC_GF_OP_XOR2, 1, 2, 0},
+    {EC_GF_OP_XOR2, 5, 7, 0}, {EC_GF_OP_XOR2, 6, 4, 0},
+    {EC_GF_OP_XOR2, 7, 1, 0}, {EC_GF_OP_XOR2, 4, 0, 0},
+    {EC_GF_OP_XOR2, 1, 6, 0}, {EC_GF_OP_XOR2, 2, 5, 0},
+    {EC_GF_OP_XOR2, 0, 1, 0}, {EC_GF_OP_XOR2, 1, 2, 0},
+    {EC_GF_OP_END, 0, 0, 0}};
+
+static ec_gf_mul_t ec_gf8_mul_4C = {8,
+                                    {
+                                        5,
+                                        3,
+                                        4,
+                                        7,
+                                        0,
+                                        6,
+                                        2,
+                                        1,
+                                    },
+                                    ec_gf8_mul_4C_ops};
+
+static ec_gf_op_t ec_gf8_mul_4D_ops[] = {
+    {EC_GF_OP_COPY, 8, 3, 0}, {EC_GF_OP_XOR2, 3, 4, 0},
+    {EC_GF_OP_XOR2, 4, 6, 0}, {EC_GF_OP_XOR2, 0, 1, 0},
+    {EC_GF_OP_XOR2, 1, 4, 0}, {EC_GF_OP_XOR3, 9, 3, 1},
+    {EC_GF_OP_XOR2, 5, 9, 0}, {EC_GF_OP_XOR2, 4, 2, 0},
+    {EC_GF_OP_XOR2, 6, 5, 0}, {EC_GF_OP_XOR2, 0, 6, 0},
+    {EC_GF_OP_XOR2, 7, 0, 0}, {EC_GF_OP_XOR2, 3, 0, 0},
+    {EC_GF_OP_XOR2, 2, 7, 0}, {EC_GF_OP_XOR3, 0, 8, 2},
+    {EC_GF_OP_XOR2, 5, 2, 0}, {EC_GF_OP_END, 0, 0, 0}};
+
+static ec_gf_mul_t ec_gf8_mul_4D = {10,
+                                    {
+                                        0,
+                                        9,
+                                        3,
+                                        5,
+                                        6,
+                                        4,
+                                        7,
+                                        1,
+                                        2,
+                                        8,
+                                    },
+                                    ec_gf8_mul_4D_ops};
+
+static ec_gf_op_t ec_gf8_mul_4E_ops[] = {
+    {EC_GF_OP_XOR2, 3, 0, 0}, {EC_GF_OP_XOR2, 4, 1, 0},
+    {EC_GF_OP_XOR2, 0, 2, 0}, {EC_GF_OP_XOR2, 4, 7, 0},
+    {EC_GF_OP_XOR2, 2, 5, 0}, {EC_GF_OP_XOR2, 5, 4, 0},
+    {EC_GF_OP_XOR2, 3, 6, 0}, {EC_GF_OP_XOR2, 0, 5, 0},
+    {EC_GF_OP_XOR2, 6, 2, 0}, {EC_GF_OP_XOR2, 7, 3, 0},
+    {EC_GF_OP_XOR2, 1, 3, 0}, {EC_GF_OP_END, 0, 0, 0}};
+
+static ec_gf_mul_t ec_gf8_mul_4E = {8,
+                                    {
+                                        2,
+                                        3,
+                                        0,
+                                        1,
+                                        5,
+                                        6,
+                                        7,
+                                        4,
+                                    },
+                                    ec_gf8_mul_4E_ops};
+
+static ec_gf_op_t ec_gf8_mul_4F_ops[] = {
+    {EC_GF_OP_XOR2, 4, 1, 0}, {EC_GF_OP_XOR2, 1, 0, 0},
+    {EC_GF_OP_XOR2, 7, 0, 0}, {EC_GF_OP_XOR2, 0, 2, 0},
+    {EC_GF_OP_XOR2, 0, 5, 0}, {EC_GF_OP_XOR2, 2, 6, 0},
+    {EC_GF_OP_XOR2, 5, 7, 0}, {EC_GF_OP_XOR2, 6, 1, 0},
+    {EC_GF_OP_XOR2, 7, 3, 0}, {EC_GF_OP_XOR2, 1, 5, 0},
+    {EC_GF_OP_XOR2, 3, 6, 0}, {EC_GF_OP_XOR2, 5, 4, 0},
+    {EC_GF_OP_END, 0, 0, 0}};
+
+static ec_gf_mul_t ec_gf8_mul_4F = {8,
+                                    {
+                                        0,
+                                        3,
+                                        5,
+                                        6,
+                                        1,
+                                        2,
+                                        7,
+                                        4,
+                                    },
+                                    ec_gf8_mul_4F_ops};
+
+static ec_gf_op_t ec_gf8_mul_50_ops[] = {
+    {EC_GF_OP_XOR2, 5, 3, 0}, {EC_GF_OP_XOR2, 4, 6, 0},
+    {EC_GF_OP_XOR2, 6, 5, 0}, {EC_GF_OP_XOR2, 5, 7, 0},
+    {EC_GF_OP_XOR2, 7, 2, 0}, {EC_GF_OP_XOR2, 0, 2, 0},
+    {EC_GF_OP_XOR2, 4, 7, 0}, {EC_GF_OP_XOR2, 0, 6, 0},
+    {EC_GF_OP_XOR2, 3, 4, 0}, {EC_GF_OP_XOR2, 6, 1, 0},
+    {EC_GF_OP_XOR2, 2, 3, 0}, {EC_GF_OP_XOR2, 1, 2, 0},
+    {EC_GF_OP_XOR2, 2, 0, 0}, {EC_GF_OP_END, 0, 0, 0}};
+
+static ec_gf_mul_t ec_gf8_mul_50 = {8,
+                                    {
+                                        4,
+                                        5,
+                                        7,
+                                        3,
+                                        0,
+                                        1,
+                                        2,
+                                        6,
+                                    },
+                                    ec_gf8_mul_50_ops};
+
+static ec_gf_op_t ec_gf8_mul_51_ops[] = {
+    {EC_GF_OP_XOR2, 2, 1, 0}, {EC_GF_OP_XOR2, 3, 5, 0},
+    {EC_GF_OP_XOR2, 2, 3, 0}, {EC_GF_OP_XOR2, 3, 7, 0},
+    {EC_GF_OP_XOR2, 1, 3, 0}, {EC_GF_OP_XOR2, 6, 1, 0},
+    {EC_GF_OP_XOR2, 4, 6, 0}, {EC_GF_OP_XOR2, 2, 4, 0},
+    {EC_GF_OP_XOR2, 0, 2, 0}, {EC_GF_OP_XOR2, 5, 0, 0},
+    {EC_GF_OP_XOR2, 3, 0, 0}, {EC_GF_OP_END, 0, 0, 0}};
+
+static ec_gf_mul_t ec_gf8_mul_51 = {8,
+                                    {
+                                        0,
+                                        1,
+                                        7,
+                                        2,
+                                        3,
+                                        4,
+                                        5,
+                                        6,
+                                    },
+                                    ec_gf8_mul_51_ops};
+
+static ec_gf_op_t ec_gf8_mul_52_ops[] = {
+    {EC_GF_OP_XOR2, 0, 2, 0}, {EC_GF_OP_XOR2, 0, 7, 0},
+    {EC_GF_OP_COPY, 8, 0, 0}, {EC_GF_OP_XOR2, 0, 4, 0},
+    {EC_GF_OP_XOR2, 4, 6, 0}, {EC_GF_OP_COPY, 9, 4, 0},
+    {EC_GF_OP_XOR2, 4, 3, 0}, {EC_GF_OP_XOR2, 3, 1, 0},
+    {EC_GF_OP_XOR2, 5, 3, 0}, {EC_GF_OP_XOR2, 6, 3, 0},
+    {EC_GF_OP_XOR2, 1, 2, 0}, {EC_GF_OP_XOR3, 3, 5, 8},
+    {EC_GF_OP_XOR2, 7, 6, 0}, {EC_GF_OP_XOR2, 2, 9, 0},
+    {EC_GF_OP_XOR2, 6, 3, 0}, {EC_GF_OP_XOR2, 3, 1, 0},
+    {EC_GF_OP_END, 0, 0, 0}};
+
+static ec_gf_mul_t ec_gf8_mul_52 = {10,
+                                    {
+                                        2,
+                                        3,
+                                        1,
+                                        4,
+                                        6,
+                                        7,
+                                        0,
+                                        5,
+                                        8,
+                                        9,
+                                    },
+                                    ec_gf8_mul_52_ops};
+
+static ec_gf_op_t ec_gf8_mul_53_ops[] = {
+    {EC_GF_OP_XOR2, 2, 0, 0}, {EC_GF_OP_XOR2, 3, 1, 0},
+    {EC_GF_OP_XOR2, 4, 6, 0}, {EC_GF_OP_XOR2, 2, 4, 0},
+    {EC_GF_OP_XOR2, 5, 7, 0}, {EC_GF_OP_XOR2, 3, 5, 0},
+    {EC_GF_OP_XOR2, 7, 2, 0}, {EC_GF_OP_XOR2, 5, 2, 0},
+    {EC_GF_OP_XOR2, 6, 3, 0}, {EC_GF_OP_XOR2, 0, 3, 0},
+    {EC_GF_OP_END, 0, 0, 0}};
+
+static ec_gf_mul_t ec_gf8_mul_53 = {8,
+                                    {
+                                        2,
+                                        0,
+                                        1,
+                                        4,
+                                        5,
+                                        6,
+                                        7,
+                                        3,
+                                    },
+                                    ec_gf8_mul_53_ops};
+
+static ec_gf_op_t ec_gf8_mul_54_ops[] = {
+    {EC_GF_OP_XOR2, 7, 2, 0}, {EC_GF_OP_XOR2, 0, 7, 0},
+    {EC_GF_OP_XOR2, 7, 4, 0}, {EC_GF_OP_XOR2, 4, 1, 0},
+    {EC_GF_OP_XOR2, 2, 3, 0}, {EC_GF_OP_XOR2, 1, 3, 0},
+    {EC_GF_OP_XOR2, 3, 5, 0}, {EC_GF_OP_XOR2, 1, 6, 0},
+    {EC_GF_OP_XOR2, 5, 0, 0}, {EC_GF_OP_XOR2, 0, 6, 0},
+    {EC_GF_OP_XOR2, 6, 4, 0}, {EC_GF_OP_XOR2, 4, 2, 0},
+    {EC_GF_OP_XOR2, 2, 5, 0}, {EC_GF_OP_END, 0, 0, 0}};
+
+static ec_gf_mul_t ec_gf8_mul_54 = {8,
+                                    {
+                                        7,
+                                        3,
+                                        0,
+                                        4,
+                                        2,
+                                        6,
+                                        5,
+                                        1,
+                                    },
+                                    ec_gf8_mul_54_ops};
+
+static ec_gf_op_t ec_gf8_mul_55_ops[] = {
+    {EC_GF_OP_XOR2, 4, 2, 0}, {EC_GF_OP_XOR2, 7, 0, 0},
+    {EC_GF_OP_XOR2, 4, 1, 0}, {EC_GF_OP_XOR2, 3, 1, 0},
+    {EC_GF_OP_XOR2, 6, 7, 0}, {EC_GF_OP_XOR2, 2, 5, 0},
+    {EC_GF_OP_XOR2, 7, 4, 0}, {EC_GF_OP_XOR2, 5, 3, 0},
+    {EC_GF_OP_XOR2, 2, 6, 0}, {EC_GF_OP_XOR2, 1, 7, 0},
+    {EC_GF_OP_XOR2, 3, 6, 0}, {EC_GF_OP_XOR2, 7, 2, 0},
+    {EC_GF_OP_XOR2, 0, 3, 0}, {EC_GF_OP_XOR2, 3, 7, 0},
+    {EC_GF_OP_END, 0, 0, 0}};
+
+static ec_gf_mul_t ec_gf8_mul_55 = {8,
+                                    {
+                                        1,
+                                        5,
+                                        6,
+                                        4,
+                                        3,
+                                        7,
+                                        2,
+                                        0,
+                                    },
+                                    ec_gf8_mul_55_ops};
+
+static ec_gf_op_t ec_gf8_mul_56_ops[] = {
+    {EC_GF_OP_XOR2, 5, 0, 0}, {EC_GF_OP_XOR2, 6, 1, 0},
+    {EC_GF_OP_XOR2, 0, 2, 0}, {EC_GF_OP_XOR2, 1, 3, 0},
+    {EC_GF_OP_XOR2, 2, 4, 0}, {EC_GF_OP_XOR2, 3, 5, 0},
+    {EC_GF_OP_XOR2, 4, 1, 0}, {EC_GF_OP_XOR2, 4, 7, 0},
+    {EC_GF_OP_XOR2, 7, 0, 0}, {EC_GF_OP_XOR2, 0, 6, 0},
+    {EC_GF_OP_END, 0, 0, 0}};
+
+static ec_gf_mul_t ec_gf8_mul_56 = {8,
+                                    {
+                                        2,
+                                        3,
+                                        0,
+                                        4,
+                                        5,
+                                        6,
+                                        7,
+                                        1,
+                                    },
+                                    ec_gf8_mul_56_ops};
+
+static ec_gf_op_t ec_gf8_mul_57_ops[] = {
+    {EC_GF_OP_XOR2, 2, 0, 0}, {EC_GF_OP_XOR2, 5, 0, 0},
+    {EC_GF_OP_XOR2, 0, 1, 0}, {EC_GF_OP_XOR2, 0, 6, 0},
+    {EC_GF_OP_XOR2, 3, 1, 0}, {EC_GF_OP_XOR2, 6, 7, 0},
+    {EC_GF_OP_XOR2, 1, 4, 0}, {EC_GF_OP_XOR2, 6, 2, 0},
+    {EC_GF_OP_XOR2, 1, 7, 0}, {EC_GF_OP_XOR2, 2, 4, 0},
+    {EC_GF_OP_XOR2, 7, 3, 0}, {EC_GF_OP_XOR2, 4, 5, 0},
+    {EC_GF_OP_XOR2, 3, 5, 0}, {EC_GF_OP_XOR2, 5, 0, 0},
+    {EC_GF_OP_END, 0, 0, 0}};
+
+static ec_gf_mul_t ec_gf8_mul_57 = {8,
+                                    {
+                                        2,
+                                        3,
+                                        0,
+                                        1,
+                                        4,
+                                        5,
+                                        6,
+                                        7,
+                                    },
+                                    ec_gf8_mul_57_ops};
+
+static ec_gf_op_t ec_gf8_mul_58_ops[] = {
+    {EC_GF_OP_XOR2, 3, 2, 0}, {EC_GF_OP_XOR2, 2, 5, 0},
+    {EC_GF_OP_XOR2, 4, 3, 0}, {EC_GF_OP_XOR2, 5, 0, 0},
+    {EC_GF_OP_XOR2, 0, 1, 0}, {EC_GF_OP_XOR2, 5, 4, 0},
+    {EC_GF_OP_XOR2, 3, 7, 0}, {EC_GF_OP_XOR2, 1, 4, 0},
+    {EC_GF_OP_XOR2, 7, 5, 0}, {EC_GF_OP_XOR2, 0, 3, 0},
+    {EC_GF_OP_XOR2, 6, 2, 0}, {EC_GF_OP_XOR2, 3, 6, 0},
+    {EC_GF_OP_XOR2, 6, 1, 0}, {EC_GF_OP_XOR2, 4, 3, 0},
+    {EC_GF_OP_END, 0, 0, 0}};
+
+static ec_gf_mul_t ec_gf8_mul_58 = {8,
+                                    {
+                                        4,
+                                        3,
+                                        2,
+                                        7,
+                                        0,
+                                        1,
+                                        5,
+                                        6,
+                                    },
+                                    ec_gf8_mul_58_ops};
+
+static ec_gf_op_t ec_gf8_mul_59_ops[] = {
+    {EC_GF_OP_XOR2, 0, 1, 0}, {EC_GF_OP_XOR2, 1, 3, 0},
+    {EC_GF_OP_XOR2, 1, 5, 0}, {EC_GF_OP_XOR2, 2, 4, 0},
+    {EC_GF_OP_XOR2, 0, 6, 0}, {EC_GF_OP_XOR2, 2, 1, 0},
+    {EC_GF_OP_XOR2, 0, 2, 0}, {EC_GF_OP_XOR2, 3, 0, 0},
+    {EC_GF_OP_XOR2, 7, 3, 0}, {EC_GF_OP_XOR2, 6, 7, 0},
+    {EC_GF_OP_XOR2, 1, 6, 0}, {EC_GF_OP_XOR2, 3, 1, 0},
+    {EC_GF_OP_XOR2, 4, 3, 0}, {EC_GF_OP_END, 0, 0, 0}};
+
+static ec_gf_mul_t ec_gf8_mul_59 = {8,
+                                    {
+                                        7,
+                                        3,
+                                        5,
+                                        6,
+                                        1,
+                                        2,
+                                        0,
+                                        4,
+                                    },
+                                    ec_gf8_mul_59_ops};
+
+static ec_gf_op_t ec_gf8_mul_5A_ops[] = {
+    {EC_GF_OP_XOR2, 5, 2, 0}, {EC_GF_OP_XOR2, 2, 1, 0},
+    {EC_GF_OP_XOR2, 4, 5, 0}, {EC_GF_OP_XOR2, 3, 2, 0},
+    {EC_GF_OP_XOR2, 6, 4, 0}, {EC_GF_OP_XOR2, 4, 3, 0},
+    {EC_GF_OP_XOR2, 2, 0, 0}, {EC_GF_OP_XOR2, 0, 4, 0},
+    {EC_GF_OP_XOR2, 1, 0, 0}, {EC_GF_OP_XOR2, 5, 1, 0},
+    {EC_GF_OP_XOR2, 7, 5, 0}, {EC_GF_OP_XOR2, 0, 7, 0},
+    {EC_GF_OP_XOR2, 7, 6, 0}, {EC_GF_OP_END, 0, 0, 0}};
+
+static ec_gf_mul_t ec_gf8_mul_5A = {8,
+                                    {
+                                        6,
+                                        7,
+                                        0,
+                                        1,
+                                        2,
+                                        3,
+                                        5,
+                                        4,
+                                    },
+                                    ec_gf8_mul_5A_ops};
+
+static ec_gf_op_t ec_gf8_mul_5B_ops[] = {
+    {EC_GF_OP_XOR2, 0, 4, 0}, {EC_GF_OP_XOR2, 1, 5, 0},
+    {EC_GF_OP_XOR2, 4, 3, 0}, {EC_GF_OP_XOR2, 5, 0, 0},
+    {EC_GF_OP_XOR2, 3, 2, 0}, {EC_GF_OP_XOR2, 2, 5, 0},
+    {EC_GF_OP_XOR2, 0, 6, 0}, {EC_GF_OP_XOR2, 6, 2, 0},
+    {EC_GF_OP_XOR2, 7, 1, 0}, {EC_GF_OP_XOR2, 2, 1, 0},
+    {EC_GF_OP_XOR2, 1, 3, 0}, {EC_GF_OP_XOR2, 4, 7, 0},
+    {EC_GF_OP_XOR2, 3, 0, 0}, {EC_GF_OP_XOR2, 0, 4, 0},
+    {EC_GF_OP_END, 0, 0, 0}};
+
+static ec_gf_mul_t ec_gf8_mul_5B = {8,
+                                    {
+                                        6,
+                                        0,
+                                        7,
+                                        5,
+                                        2,
+                                        1,
+                                        3,
+                                        4,
+                                    },
+                                    ec_gf8_mul_5B_ops};
+
+static ec_gf_op_t ec_gf8_mul_5C_ops[] = {
+    {EC_GF_OP_COPY, 8, 3, 0}, {EC_GF_OP_XOR2, 3, 6, 0},
+    {EC_GF_OP_XOR2, 4, 1, 0}, {EC_GF_OP_XOR2, 5, 3, 0},
+    {EC_GF_OP_XOR2, 1, 0, 0}, {EC_GF_OP_XOR2, 0, 5, 0},
+    {EC_GF_OP_XOR2, 1, 3, 0}, {EC_GF_OP_XOR2, 2, 0, 0},
+    {EC_GF_OP_XOR2, 3, 4, 0}, {EC_GF_OP_XOR2, 4, 2, 0},
+    {EC_GF_OP_XOR2, 6, 2, 0}, {EC_GF_OP_XOR2, 7, 4, 0},
+    {EC_GF_OP_XOR2, 2, 8, 0}, {EC_GF_OP_XOR2, 0, 7, 0},
+    {EC_GF_OP_XOR2, 7, 1, 0}, {EC_GF_OP_END, 0, 0, 0}};
+
+static ec_gf_mul_t ec_gf8_mul_5C = {9,
+                                    {
+                                        7,
+                                        5,
+                                        2,
+                                        4,
+                                        1,
+                                        0,
+                                        6,
+                                        3,
+                                        8,
+                                    },
+                                    ec_gf8_mul_5C_ops};
+
+static ec_gf_op_t ec_gf8_mul_5D_ops[] = {
+    {EC_GF_OP_XOR2, 4, 1, 0}, {EC_GF_OP_XOR2, 1, 0, 0},
+    {EC_GF_OP_XOR2, 6, 0, 0}, {EC_GF_OP_XOR2, 4, 3, 0},
+    {EC_GF_OP_XOR2, 5, 6, 0}, {EC_GF_OP_XOR2, 6, 4, 0},
+    {EC_GF_OP_XOR2, 3, 5, 0}, {EC_GF_OP_XOR2, 7, 6, 0},
+    {EC_GF_OP_XOR2, 2, 3, 0}, {EC_GF_OP_XOR2, 0, 7, 0},
+    {EC_GF_OP_XOR2, 3, 1, 0}, {EC_GF_OP_XOR2, 7, 2, 0},
+    {EC_GF_OP_XOR2, 4, 2, 0}, {EC_GF_OP_XOR2, 1, 7, 0},
+    {EC_GF_OP_END, 0, 0, 0}};
+
+static ec_gf_mul_t ec_gf8_mul_5D = {8,
+                                    {
+                                        1,
+                                        3,
+                                        5,
+                                        4,
+                                        6,
+                                        7,
+                                        2,
+                                        0,
+                                    },
+                                    ec_gf8_mul_5D_ops};
+
+static ec_gf_op_t ec_gf8_mul_5E_ops[] = {
+    {EC_GF_OP_XOR2, 6, 0, 0}, {EC_GF_OP_XOR2, 6, 5, 0},
+    {EC_GF_OP_XOR2, 1, 2, 0}, {EC_GF_OP_XOR2, 5, 2, 0},
+    {EC_GF_OP_XOR2, 2, 3, 0}, {EC_GF_OP_XOR2, 7, 1, 0},
+    {EC_GF_OP_XOR2, 3, 6, 0}, {EC_GF_OP_XOR2, 0, 2, 0},
+    {EC_GF_OP_XOR2, 6, 7, 0}, {EC_GF_OP_XOR2, 2, 4, 0},
+    {EC_GF_OP_XOR2, 4, 5, 0}, {EC_GF_OP_XOR2, 1, 2, 0},
+    {EC_GF_OP_XOR2, 5, 6, 0}, {EC_GF_OP_XOR2, 2, 6, 0},
+    {EC_GF_OP_END, 0, 0, 0}};
+
+static ec_gf_mul_t ec_gf8_mul_5E = {8,
+                                    {
+                                        4,
+                                        3,
+                                        6,
+                                        2,
+                                        5,
+                                        7,
+                                        0,
+                                        1,
+                                    },
+                                    ec_gf8_mul_5E_ops};
+
+static ec_gf_op_t ec_gf8_mul_5F_ops[] = {
+    {EC_GF_OP_XOR2, 0, 3, 0}, {EC_GF_OP_XOR2, 1, 5, 0},
+    {EC_GF_OP_XOR2, 0, 6, 0}, {EC_GF_OP_XOR2, 1, 0, 0},
+    {EC_GF_OP_XOR2, 7, 1, 0}, {EC_GF_OP_XOR2, 3, 7, 0},
+    {EC_GF_OP_XOR2, 2, 0, 0}, {EC_GF_OP_XOR2, 4, 3, 0},
+    {EC_GF_OP_XOR2, 7, 2, 0}, {EC_GF_OP_XOR2, 5, 4, 0},
+    {EC_GF_OP_XOR2, 6, 5, 0}, {EC_GF_OP_XOR2, 0, 5, 0},
+    {EC_GF_OP_XOR2, 6, 7, 0}, {EC_GF_OP_END, 0, 0, 0}};
+
+static ec_gf_mul_t ec_gf8_mul_5F = {8,
+                                    {
+                                        6,
+                                        1,
+                                        3,
+                                        4,
+                                        5,
+                                        7,
+                                        2,
+                                        0,
+                                    },
+                                    ec_gf8_mul_5F_ops};
+
+static ec_gf_op_t ec_gf8_mul_60_ops[] = {
+    {EC_GF_OP_XOR2, 5, 2, 0}, {EC_GF_OP_XOR2, 7, 4, 0},
+    {EC_GF_OP_XOR2, 4, 5, 0}, {EC_GF_OP_XOR2, 6, 3, 0},
+    {EC_GF_OP_XOR2, 2, 6, 0}, {EC_GF_OP_XOR2, 4, 6, 0},
+    {EC_GF_OP_XOR2, 3, 7, 0}, {EC_GF_OP_XOR2, 6, 0, 0},
+    {EC_GF_OP_XOR2, 0, 1, 0}, {EC_GF_OP_XOR2, 0, 7, 0},
+    {EC_GF_OP_XOR2, 1, 5, 0}, {EC_GF_OP_XOR2, 7, 5, 0},
+    {EC_GF_OP_END, 0, 0, 0}};
+
+static ec_gf_mul_t ec_gf8_mul_60 = {8,
+                                    {
+                                        2,
+                                        3,
+                                        4,
+                                        7,
+                                        5,
+                                        6,
+                                        0,
+                                        1,
+                                    },
+                                    ec_gf8_mul_60_ops};
+
+static ec_gf_op_t ec_gf8_mul_61_ops[] = {
+    {EC_GF_OP_XOR2, 1, 2, 0}, {EC_GF_OP_XOR2, 6, 2, 0},
+    {EC_GF_OP_XOR2, 2, 5, 0}, {EC_GF_OP_XOR2, 4, 2, 0},
+    {EC_GF_OP_XOR2, 0, 3, 0}, {EC_GF_OP_XOR2, 3, 4, 0},
+    {EC_GF_OP_XOR2, 0, 6, 0}, {EC_GF_OP_XOR2, 7, 3, 0},
+    {EC_GF_OP_XOR2, 2, 0, 0}, {EC_GF_OP_XOR2, 6, 3, 0},
+    {EC_GF_OP_XOR2, 1, 7, 0}, {EC_GF_OP_XOR2, 5, 1, 0},
+    {EC_GF_OP_XOR2, 1, 2, 0}, {EC_GF_OP_XOR2, 3, 5, 0},
+    {EC_GF_OP_END, 0, 0, 0}};
+
+static ec_gf_mul_t ec_gf8_mul_61 = {8,
+                                    {
+                                        0,
+                                        5,
+                                        6,
+                                        7,
+                                        4,
+                                        2,
+                                        1,
+                                        3,
+                                    },
+                                    ec_gf8_mul_61_ops};
+
+static ec_gf_op_t ec_gf8_mul_62_ops[] = {
+    {EC_GF_OP_XOR2, 3, 7, 0}, {EC_GF_OP_XOR2, 1, 2, 0},
+    {EC_GF_OP_XOR2, 2, 3, 0}, {EC_GF_OP_XOR2, 3, 4, 0},
+    {EC_GF_OP_XOR2, 4, 5, 0}, {EC_GF_OP_XOR2, 0, 3, 0},
+    {EC_GF_OP_XOR2, 5, 2, 0}, {EC_GF_OP_XOR2, 7, 0, 0},
+    {EC_GF_OP_XOR2, 2, 6, 0}, {EC_GF_OP_XOR2, 1, 5, 0},
+    {EC_GF_OP_XOR2, 6, 7, 0}, {EC_GF_OP_XOR2, 7, 1, 0},
+    {EC_GF_OP_XOR2, 1, 2, 0}, {EC_GF_OP_XOR2, 3, 1, 0},
+    {EC_GF_OP_END, 0, 0, 0}};
+
+static ec_gf_mul_t ec_gf8_mul_62 = {8,
+                                    {
+                                        2,
+                                        0,
+                                        3,
+                                        4,
+                                        5,
+                                        6,
+                                        7,
+                                        1,
+                                    },
+                                    ec_gf8_mul_62_ops};
+
+static ec_gf_op_t ec_gf8_mul_63_ops[] = {
+    {EC_GF_OP_XOR2, 4, 3, 0}, {EC_GF_OP_XOR2, 5, 4, 0},
+    {EC_GF_OP_XOR2, 6, 5, 0}, {EC_GF_OP_XOR2, 1, 7, 0},
+    {EC_GF_OP_XOR2, 0, 6, 0}, {EC_GF_OP_XOR2, 6, 1, 0},
+    {EC_GF_OP_XOR2, 7, 2, 0}, {EC_GF_OP_XOR2, 3, 0, 0},
+    {EC_GF_OP_XOR2, 4, 6, 0}, {EC_GF_OP_XOR2, 7, 5, 0},
+    {EC_GF_OP_XOR2, 1, 3, 0}, {EC_GF_OP_XOR2, 2, 4, 0},
+    {EC_GF_OP_XOR2, 3, 7, 0}, {EC_GF_OP_XOR2, 4, 0, 0},
+    {EC_GF_OP_END, 0, 0, 0}};
+
+static ec_gf_mul_t ec_gf8_mul_63 = {8,
+                                    {
+                                        3,
+                                        4,
+                                        6,
+                                        5,
+                                        7,
+                                        0,
+                                        1,
+                                        2,
+                                    },
+                                    ec_gf8_mul_63_ops};
+
+static ec_gf_op_t ec_gf8_mul_64_ops[] = {
+    {EC_GF_OP_COPY, 8, 1, 0}, {EC_GF_OP_XOR2, 8, 0, 0},
+    {EC_GF_OP_XOR2, 1, 2, 0}, {EC_GF_OP_XOR2, 8, 7, 0},
+    {EC_GF_OP_XOR2, 2, 3, 0}, {EC_GF_OP_XOR2, 3, 4, 0},
+    {EC_GF_OP_XOR2, 7, 6, 0}, {EC_GF_OP_XOR2, 4, 5, 0},
+    {EC_GF_OP_XOR2, 6, 1, 0}, {EC_GF_OP_XOR2, 5, 7, 0},
+    {EC_GF_OP_XOR2, 6, 4, 0}, {EC_GF_OP_XOR2, 7, 0, 0},
+    {EC_GF_OP_XOR2, 4, 2, 0}, {EC_GF_OP_XOR2, 4, 0, 0},
+    {EC_GF_OP_END, 0, 0, 0}};
+
+static ec_gf_mul_t ec_gf8_mul_64 = {9,
+                                    {
+                                        2,
+                                        3,
+                                        4,
+                                        6,
+                                        5,
+                                        7,
+                                        8,
+                                        1,
+                                        0,
+                                    },
+                                    ec_gf8_mul_64_ops};
+
+static ec_gf_op_t ec_gf8_mul_65_ops[] = {
+    {EC_GF_OP_XOR2, 6, 7, 0}, {EC_GF_OP_XOR2, 7, 2, 0},
+    {EC_GF_OP_XOR2, 5, 6, 0}, {EC_GF_OP_XOR2, 2, 0, 0},
+    {EC_GF_OP_XOR2, 4, 5, 0}, {EC_GF_OP_XOR2, 2, 3, 0},
+    {EC_GF_OP_XOR2, 6, 1, 0}, {EC_GF_OP_XOR2, 3, 4, 0},
+    {EC_GF_OP_XOR2, 7, 1, 0}, {EC_GF_OP_XOR2, 6, 0, 0},
+    {EC_GF_OP_XOR2, 1, 3, 0}, {EC_GF_OP_XOR2, 0, 5, 0},
+    {EC_GF_OP_XOR2, 3, 7, 0}, {EC_GF_OP_XOR2, 5, 1, 0},
+    {EC_GF_OP_XOR2, 1, 6, 0}, {EC_GF_OP_END, 0, 0, 0}};
+
+static ec_gf_mul_t ec_gf8_mul_65 = {8,
+                                    {
+                                        2,
+                                        5,
+                                        1,
+                                        3,
+                                        4,
+                                        0,
+                                        6,
+                                        7,
+                                    },
+                                    ec_gf8_mul_65_ops};
+
+static ec_gf_op_t ec_gf8_mul_66_ops[] = {
+    {EC_GF_OP_XOR2, 4, 0, 0}, {EC_GF_OP_XOR2, 0, 1, 0},
+    {EC_GF_OP_XOR2, 1, 2, 0}, {EC_GF_OP_XOR2, 2, 3, 0},
+    {EC_GF_OP_XOR2, 3, 4, 0}, {EC_GF_OP_XOR2, 5, 7, 0},
+    {EC_GF_OP_XOR2, 2, 7, 0}, {EC_GF_OP_XOR2, 0, 5, 0},
+    {EC_GF_OP_XOR2, 4, 6, 0}, {EC_GF_OP_XOR2, 5, 3, 0},
+    {EC_GF_OP_XOR2, 6, 1, 0}, {EC_GF_OP_XOR2, 7, 4, 0},
+    {EC_GF_OP_XOR2, 1, 5, 0}, {EC_GF_OP_XOR2, 4, 0, 0},
+    {EC_GF_OP_XOR2, 5, 7, 0}, {EC_GF_OP_END, 0, 0, 0}};
+
+static ec_gf_mul_t ec_gf8_mul_66 = {8,
+                                    {
+                                        2,
+                                        3,
+                                        1,
+                                        4,
+                                        5,
+                                        7,
+                                        0,
+                                        6,
+                                    },
+                                    ec_gf8_mul_66_ops};
+
+static ec_gf_op_t ec_gf8_mul_67_ops[] = {
+    {EC_GF_OP_XOR2, 3, 0, 0}, {EC_GF_OP_XOR2, 3, 1, 0},
+    {EC_GF_OP_XOR2, 1, 4, 0}, {EC_GF_OP_XOR2, 4, 3, 0},
+    {EC_GF_OP_XOR2, 7, 4, 0}, {EC_GF_OP_XOR2, 5, 7, 0},
+    {EC_GF_OP_XOR2, 0, 5, 0}, {EC_GF_OP_XOR2, 6, 0, 0},
+    {EC_GF_OP_XOR2, 3, 6, 0}, {EC_GF_OP_XOR2, 1, 3, 0},
+    {EC_GF_OP_XOR2, 7, 1, 0}, {EC_GF_OP_XOR2, 2, 7, 0},
+    {EC_GF_OP_XOR2, 0, 2, 0}, {EC_GF_OP_XOR2, 2, 3, 0},
+    {EC_GF_OP_END, 0, 0, 0}};
+
+static ec_gf_mul_t ec_gf8_mul_67 = {8,
+                                    {
+                                        2,
+                                        4,
+                                        5,
+                                        6,
+                                        7,
+                                        3,
+                                        1,
+                                        0,
+                                    },
+                                    ec_gf8_mul_67_ops};
+
+static ec_gf_op_t ec_gf8_mul_68_ops[] = {
+    {EC_GF_OP_XOR2, 5, 2, 0}, {EC_GF_OP_XOR2, 5, 4, 0},
+    {EC_GF_OP_XOR2, 4, 3, 0}, {EC_GF_OP_XOR2, 6, 4, 0},
+    {EC_GF_OP_XOR2, 7, 6, 0}, {EC_GF_OP_XOR2, 4, 0, 0},
+    {EC_GF_OP_XOR2, 2, 7, 0}, {EC_GF_OP_XOR2, 4, 1, 0},
+    {EC_GF_OP_XOR2, 0, 2, 0}, {EC_GF_OP_XOR2, 1, 5, 0},
+    {EC_GF_OP_XOR2, 3, 0, 0}, {EC_GF_OP_XOR2, 5, 6, 0},
+    {EC_GF_OP_XOR2, 0, 4, 0}, {EC_GF_OP_XOR2, 6, 3, 0},
+    {EC_GF_OP_END, 0, 0, 0}};
+
+static ec_gf_mul_t ec_gf8_mul_68 = {8,
+                                    {
+                                        5,
+                                        7,
+                                        2,
+                                        3,
+                                        0,
+                                        6,
+                                        4,
+                                        1,
+                                    },
+                                    ec_gf8_mul_68_ops};
+
+static ec_gf_op_t ec_gf8_mul_69_ops[] = {
+    {EC_GF_OP_XOR2, 4, 6, 0}, {EC_GF_OP_XOR2, 4, 7, 0},
+    {EC_GF_OP_XOR2, 3, 4, 0}, {EC_GF_OP_XOR2, 2, 1, 0},
+    {EC_GF_OP_XOR2, 1, 3, 0}, {EC_GF_OP_XOR2, 4, 2, 0},
+    {EC_GF_OP_XOR2, 0, 1, 0}, {EC_GF_OP_XOR2, 5, 4, 0},
+    {EC_GF_OP_XOR2, 7, 0, 0}, {EC_GF_OP_XOR2, 6, 5, 0},
+    {EC_GF_OP_XOR2, 2, 0, 0}, {EC_GF_OP_XOR2, 5, 7, 0},
+    {EC_GF_OP_XOR2, 0, 6, 0}, {EC_GF_OP_END, 0, 0, 0}};
+
+static ec_gf_mul_t ec_gf8_mul_69 = {8,
+                                    {
+                                        0,
+                                        1,
+                                        3,
+                                        2,
+                                        4,
+                                        5,
+                                        7,
+                                        6,
+                                    },
+                                    ec_gf8_mul_69_ops};
+
+static ec_gf_op_t ec_gf8_mul_6A_ops[] = {
+    {EC_GF_OP_XOR2, 7, 3, 0}, {EC_GF_OP_XOR2, 3, 5, 0},
+    {EC_GF_OP_XOR2, 0, 4, 0}, {EC_GF_OP_XOR2, 2, 6, 0},
+    {EC_GF_OP_XOR2, 6, 0, 0}, {EC_GF_OP_XOR2, 5, 2, 0},
+    {EC_GF_OP_XOR2, 1, 3, 0}, {EC_GF_OP_XOR2, 0, 1, 0},
+    {EC_GF_OP_XOR2, 1, 5, 0}, {EC_GF_OP_XOR2, 5, 7, 0},
+    {EC_GF_OP_XOR2, 4, 1, 0}, {EC_GF_OP_XOR2, 7, 6, 0},
+    {EC_GF_OP_XOR2, 3, 4, 0}, {EC_GF_OP_XOR2, 2, 7, 0},
+    {EC_GF_OP_END, 0, 0, 0}};
+
+static ec_gf_mul_t ec_gf8_mul_6A = {8,
+                                    {
+                                        5,
+                                        7,
+                                        4,
+                                        6,
+                                        1,
+                                        2,
+                                        0,
+                                        3,
+                                    },
+                                    ec_gf8_mul_6A_ops};
+
+static ec_gf_op_t ec_gf8_mul_6B_ops[] = {
+    {EC_GF_OP_COPY, 8, 1, 0}, {EC_GF_OP_XOR2, 1, 2, 0},
+    {EC_GF_OP_XOR2, 1, 6, 0}, {EC_GF_OP_XOR2, 3, 4, 0},
+    {EC_GF_OP_XOR2, 3, 1, 0}, {EC_GF_OP_XOR2, 2, 3, 0},
+    {EC_GF_OP_XOR2, 0, 2, 0}, {EC_GF_OP_XOR2, 1, 5, 0},
+    {EC_GF_OP_XOR2, 7, 0, 0}, {EC_GF_OP_XOR2, 5, 0, 0},
+    {EC_GF_OP_XOR2, 1, 7, 0}, {EC_GF_OP_XOR2, 4, 1, 0},
+    {EC_GF_OP_XOR2, 6, 4, 0}, {EC_GF_OP_XOR2, 4, 0, 0},
+    {EC_GF_OP_XOR2, 0, 8, 0}, {EC_GF_OP_END, 0, 0, 0}};
+
+static ec_gf_mul_t ec_gf8_mul_6B = {9,
+                                    {
+                                        6,
+                                        7,
+                                        2,
+                                        0,
+                                        3,
+                                        1,
+                                        5,
+                                        4,
+                                        8,
+                                    },
+                                    ec_gf8_mul_6B_ops};
+
+static ec_gf_op_t ec_gf8_mul_6C_ops[] = {
+    {EC_GF_OP_XOR2, 5, 2, 0}, {EC_GF_OP_XOR2, 4, 3, 0},
+    {EC_GF_OP_XOR2, 5, 3, 0}, {EC_GF_OP_XOR2, 2, 0, 0},
+    {EC_GF_OP_XOR2, 6, 4, 0}, {EC_GF_OP_XOR2, 4, 2, 0},
+    {EC_GF_OP_XOR2, 3, 1, 0}, {EC_GF_OP_XOR2, 3, 0, 0},
+    {EC_GF_OP_XOR2, 7, 4, 0}, {EC_GF_OP_XOR2, 4, 3, 0},
+    {EC_GF_OP_XOR2, 0, 4, 0}, {EC_GF_OP_END, 0, 0, 0}};
+
+static ec_gf_mul_t ec_gf8_mul_6C = {8,
+                                    {
+                                        5,
+                                        6,
+                                        7,
+                                        0,
+                                        1,
+                                        2,
+                                        3,
+                                        4,
+                                    },
+                                    ec_gf8_mul_6C_ops};
+
+static ec_gf_op_t ec_gf8_mul_6D_ops[] = {
+    {EC_GF_OP_XOR2, 4, 1, 0}, {EC_GF_OP_XOR2, 1, 0, 0},
+    {EC_GF_OP_XOR2, 7, 4, 0}, {EC_GF_OP_XOR2, 0, 2, 0},
+    {EC_GF_OP_XOR2, 1, 3, 0}, {EC_GF_OP_XOR2, 2, 7, 0},
+    {EC_GF_OP_XOR3, 8, 3, 4}, {EC_GF_OP_XOR2, 7, 1, 0},
+    {EC_GF_OP_XOR2, 5, 0, 0}, {EC_GF_OP_XOR2, 1, 6, 0},
+    {EC_GF_OP_XOR2, 0, 8, 0}, {EC_GF_OP_XOR2, 3, 5, 0},
+    {EC_GF_OP_XOR2, 6, 8, 0}, {EC_GF_OP_END, 0, 0, 0}};
+
+static ec_gf_mul_t ec_gf8_mul_6D = {9,
+                                    {
+                                        3,
+                                        6,
+                                        7,
+                                        0,
+                                        4,
+                                        5,
+                                        1,
+                                        2,
+                                        8,
+                                    },
+                                    ec_gf8_mul_6D_ops};
+
+static ec_gf_op_t ec_gf8_mul_6E_ops[] = {
+    {EC_GF_OP_XOR2, 3, 1, 0}, {EC_GF_OP_XOR2, 1, 2, 0},
+    {EC_GF_OP_XOR2, 2, 0, 0}, {EC_GF_OP_XOR2, 7, 3, 0},
+    {EC_GF_OP_XOR2, 0, 5, 0}, {EC_GF_OP_XOR2, 6, 1, 0},
+    {EC_GF_OP_XOR2, 0, 3, 0}, {EC_GF_OP_XOR2, 2, 4, 0},
+    {EC_GF_OP_XOR2, 1, 7, 0}, {EC_GF_OP_XOR2, 4, 6, 0},
+    {EC_GF_OP_XOR2, 3, 2, 0}, {EC_GF_OP_XOR2, 5, 1, 0},
+    {EC_GF_OP_XOR2, 6, 3, 0}, {EC_GF_OP_XOR2, 1, 3, 0},
+    {EC_GF_OP_END, 0, 0, 0}};
+
+static ec_gf_mul_t ec_gf8_mul_6E = {8,
+                                    {
+                                        5,
+                                        6,
+                                        3,
+                                        1,
+                                        7,
+                                        2,
+                                        0,
+                                        4,
+                                    },
+                                    ec_gf8_mul_6E_ops};
+
+static ec_gf_op_t ec_gf8_mul_6F_ops[] = {
+    {EC_GF_OP_COPY, 8, 0, 0}, {EC_GF_OP_XOR2, 0, 1, 0},
+    {EC_GF_OP_XOR2, 0, 4, 0}, {EC_GF_OP_XOR2, 3, 0, 0},
+    {EC_GF_OP_XOR2, 2, 5, 0}, {EC_GF_OP_XOR2, 7, 3, 0},
+    {EC_GF_OP_XOR2, 2, 0, 0}, {EC_GF_OP_XOR2, 6, 3, 0},
+    {EC_GF_OP_XOR3, 0, 8, 7}, {EC_GF_OP_XOR2, 1, 2, 0},
+    {EC_GF_OP_XOR2, 5, 6, 0}, {EC_GF_OP_XOR2, 2, 0, 0},
+    {EC_GF_OP_XOR2, 4, 5, 0}, {EC_GF_OP_XOR2, 5, 2, 0},
+    {EC_GF_OP_END, 0, 0, 0}};
+
+static ec_gf_mul_t ec_gf8_mul_6F = {9,
+                                    {
+                                        2,
+                                        6,
+                                        3,
+                                        7,
+                                        0,
+                                        1,
+                                        4,
+                                        5,
+                                        8,
+                                    },
+                                    ec_gf8_mul_6F_ops};
+
+static ec_gf_op_t ec_gf8_mul_70_ops[] = {
+    {EC_GF_OP_XOR2, 3, 2, 0}, {EC_GF_OP_XOR2, 5, 3, 0},
+    {EC_GF_OP_XOR2, 6, 4, 0}, {EC_GF_OP_XOR2, 4, 2, 0},
+    {EC_GF_OP_XOR2, 7, 5, 0}, {EC_GF_OP_XOR2, 0, 2, 0},
+    {EC_GF_OP_XOR2, 3, 6, 0}, {EC_GF_OP_XOR2, 4, 7, 0},
+    {EC_GF_OP_XOR2, 6, 0, 0}, {EC_GF_OP_XOR2, 7, 1, 0},
+    {EC_GF_OP_XOR2, 1, 6, 0}, {EC_GF_OP_XOR2, 0, 7, 0},
+    {EC_GF_OP_END, 0, 0, 0}};
+
+static ec_gf_mul_t ec_gf8_mul_70 = {8,
+                                    {
+                                        3,
+                                        4,
+                                        5,
+                                        2,
+                                        6,
+                                        0,
+                                        1,
+                                        7,
+                                    },
+                                    ec_gf8_mul_70_ops};
+
+static ec_gf_op_t ec_gf8_mul_71_ops[] = {
+    {EC_GF_OP_XOR2, 6, 0, 0}, {EC_GF_OP_XOR2, 0, 1, 0},
+    {EC_GF_OP_XOR2, 1, 5, 0}, {EC_GF_OP_XOR2, 0, 4, 0},
+    {EC_GF_OP_XOR2, 5, 3, 0}, {EC_GF_OP_XOR2, 4, 3, 0},
+    {EC_GF_OP_XOR2, 6, 2, 0}, {EC_GF_OP_XOR2, 3, 2, 0},
+    {EC_GF_OP_XOR2, 7, 4, 0}, {EC_GF_OP_XOR2, 2, 0, 0},
+    {EC_GF_OP_XOR2, 4, 6, 0}, {EC_GF_OP_XOR2, 0, 7, 0},
+    {EC_GF_OP_XOR2, 7, 1, 0}, {EC_GF_OP_XOR2, 1, 3, 0},
+    {EC_GF_OP_END, 0, 0, 0}};
+
+static ec_gf_mul_t ec_gf8_mul_71 = {8,
+                                    {
+                                        4,
+                                        7,
+                                        5,
+                                        3,
+                                        6,
+                                        0,
+                                        2,
+                                        1,
+                                    },
+                                    ec_gf8_mul_71_ops};
+
+static ec_gf_op_t ec_gf8_mul_72_ops[] = {
+    {EC_GF_OP_XOR2, 4, 0, 0}, {EC_GF_OP_XOR2, 3, 7, 0},
+    {EC_GF_OP_XOR2, 3, 4, 0}, {EC_GF_OP_XOR2, 5, 3, 0},
+    {EC_GF_OP_XOR2, 1, 5, 0}, {EC_GF_OP_XOR2, 4, 1, 0},
+    {EC_GF_OP_XOR2, 2, 4, 0}, {EC_GF_OP_XOR2, 6, 2, 0},
+    {EC_GF_OP_XOR2, 3, 6, 0}, {EC_GF_OP_XOR2, 4, 3, 0},
+    {EC_GF_OP_XOR2, 0, 4, 0}, {EC_GF_OP_END, 0, 0, 0}};
+
+static ec_gf_mul_t ec_gf8_mul_72 = {8,
+                                    {
+                                        0,
+                                        5,
+                                        2,
+                                        7,
+                                        4,
+                                        1,
+                                        3,
+                                        6,
+                                    },
+                                    ec_gf8_mul_72_ops};
+
+static ec_gf_op_t ec_gf8_mul_73_ops[] = {
+    {EC_GF_OP_XOR2, 1, 5, 0}, {EC_GF_OP_XOR2, 2, 1, 0},
+    {EC_GF_OP_XOR2, 7, 3, 0}, {EC_GF_OP_XOR2, 1, 7, 0},
+    {EC_GF_OP_XOR2, 0, 4, 0}, {EC_GF_OP_XOR2, 6, 2, 0},
+    {EC_GF_OP_XOR2, 2, 0, 0}, {EC_GF_OP_XOR2, 3, 6, 0},
+    {EC_GF_OP_XOR2, 0, 1, 0}, {EC_GF_OP_XOR2, 6, 0, 0},
+    {EC_GF_OP_XOR2, 5, 0, 0}, {EC_GF_OP_XOR2, 4, 6, 0},
+    {EC_GF_OP_END, 0, 0, 0}};
+
+static ec_gf_mul_t ec_gf8_mul_73 = {8,
+                                    {
+                                        6,
+                                        0,
+                                        1,
+                                        7,
+                                        4,
+                                        5,
+                                        2,
+                                        3,
+                                    },
+                                    ec_gf8_mul_73_ops};
+
+static ec_gf_op_t ec_gf8_mul_74_ops[] = {
+    {EC_GF_OP_XOR2, 3, 2, 0}, {EC_GF_OP_XOR2, 2, 5, 0},
+    {EC_GF_OP_XOR2, 5, 0, 0}, {EC_GF_OP_XOR2, 5, 1, 0},
+    {EC_GF_OP_XOR2, 1, 3, 0}, {EC_GF_OP_XOR2, 0, 7, 0},
+    {EC_GF_OP_XOR2, 6, 5, 0}, {EC_GF_OP_XOR2, 7, 1, 0},
+    {EC_GF_OP_XOR2, 1, 6, 0}, {EC_GF_OP_XOR2, 3, 4, 0},
+    {EC_GF_OP_XOR2, 6, 2, 0}, {EC_GF_OP_XOR2, 4, 0, 0},
+    {EC_GF_OP_XOR2, 2, 3, 0}, {EC_GF_OP_XOR2, 0, 6, 0},
+    {EC_GF_OP_END, 0, 0, 0}};
+
+static ec_gf_mul_t ec_gf8_mul_74 = {8,
+                                    {
+                                        3,
+                                        2,
+                                        1,
+                                        0,
+                                        4,
+                                        5,
+                                        6,
+                                        7,
+                                    },
+                                    ec_gf8_mul_74_ops};
+
+static ec_gf_op_t ec_gf8_mul_75_ops[] = {
+    {EC_GF_OP_XOR2, 2, 0, 0}, {EC_GF_OP_XOR2, 3, 2, 0},
+    {EC_GF_OP_XOR2, 2, 1, 0}, {EC_GF_OP_XOR2, 1, 0, 0},
+    {EC_GF_OP_XOR2, 4, 3, 0}, {EC_GF_OP_XOR2, 3, 1, 0},
+    {EC_GF_OP_XOR2, 0, 7, 0}, {EC_GF_OP_XOR2, 6, 3, 0},
+    {EC_GF_OP_XOR2, 5, 2, 0}, {EC_GF_OP_XOR2, 7, 6, 0},
+    {EC_GF_OP_XOR2, 6, 5, 0}, {EC_GF_OP_XOR2, 5, 4, 0},
+    {EC_GF_OP_END, 0, 0, 0}};
+
+static ec_gf_mul_t ec_gf8_mul_75 = {8,
+                                    {
+                                        4,
+                                        5,
+                                        6,
+                                        7,
+                                        0,
+                                        1,
+                                        2,
+                                        3,
+                                    },
+                                    ec_gf8_mul_75_ops};
+
+static ec_gf_op_t ec_gf8_mul_76_ops[] = {
+    {EC_GF_OP_XOR2, 7, 3, 0}, {EC_GF_OP_XOR2, 6, 1, 0},
+    {EC_GF_OP_XOR2, 2, 7, 0}, {EC_GF_OP_XOR3, 8, 6, 2},
+    {EC_GF_OP_XOR2, 0, 5, 0}, {EC_GF_OP_XOR2, 2, 4, 0},
+    {EC_GF_OP_XOR2, 4, 0, 0}, {EC_GF_OP_XOR2, 0, 8, 0},
+    {EC_GF_OP_XOR2, 3, 4, 0}, {EC_GF_OP_XOR2, 1, 4, 0},
+    {EC_GF_OP_XOR2, 7, 0, 0}, {EC_GF_OP_XOR2, 5, 3, 0},
+    {EC_GF_OP_END, 0, 0, 0}};
+
+static ec_gf_mul_t ec_gf8_mul_76 = {9,
+                                    {
+                                        2,
+                                        3,
+                                        0,
+                                        6,
+                                        5,
+                                        1,
+                                        7,
+                                        8,
+                                        4,
+                                    },
+                                    ec_gf8_mul_76_ops};
+
+static ec_gf_op_t ec_gf8_mul_77_ops[] = {
+    {EC_GF_OP_XOR2, 7, 6, 0}, {EC_GF_OP_XOR2, 6, 1, 0},
+    {EC_GF_OP_XOR2, 1, 0, 0}, {EC_GF_OP_XOR2, 5, 1, 0},
+    {EC_GF_OP_XOR2, 6, 3, 0}, {EC_GF_OP_XOR2, 0, 3, 0},
+    {EC_GF_OP_XOR2, 1, 4, 0}, {EC_GF_OP_XOR2, 3, 5, 0},
+    {EC_GF_OP_XOR2, 4, 3, 0}, {EC_GF_OP_XOR2, 5, 2, 0},
+    {EC_GF_OP_XOR2, 3, 7, 0}, {EC_GF_OP_XOR2, 2, 6, 0},
+    {EC_GF_OP_XOR2, 7, 1, 0}, {EC_GF_OP_XOR2, 7, 2, 0},
+    {EC_GF_OP_END, 0, 0, 0}};
+
+static ec_gf_mul_t ec_gf8_mul_77 = {8,
+                                    {
+                                        7,
+                                        4,
+                                        3,
+                                        6,
+                                        0,
+                                        1,
+                                        5,
+                                        2,
+                                    },
+                                    ec_gf8_mul_77_ops};
+
+static ec_gf_op_t ec_gf8_mul_78_ops[] = {
+    {EC_GF_OP_XOR2, 6, 5, 0}, {EC_GF_OP_XOR2, 6, 0, 0},
+    {EC_GF_OP_XOR2, 1, 0, 0}, {EC_GF_OP_XOR2, 7, 2, 0},
+    {EC_GF_OP_XOR2, 2, 6, 0}, {EC_GF_OP_XOR2, 0, 3, 0},
+    {EC_GF_OP_XOR2, 3, 7, 0}, {EC_GF_OP_XOR3, 8, 0, 2},
+    {EC_GF_OP_XOR2, 4, 8, 0}, {EC_GF_OP_XOR2, 1, 8, 0},
+    {EC_GF_OP_XOR2, 7, 4, 0}, {EC_GF_OP_XOR2, 5, 7, 0},
+    {EC_GF_OP_XOR2, 5, 1, 0}, {EC_GF_OP_XOR2, 0, 5, 0},
+    {EC_GF_OP_XOR2, 6, 0, 0}, {EC_GF_OP_END, 0, 0, 0}};
+
+static ec_gf_mul_t ec_gf8_mul_78 = {9,
+                                    {
+                                        4,
+                                        7,
+                                        3,
+                                        2,
+                                        5,
+                                        1,
+                                        6,
+                                        0,
+                                        8,
+                                    },
+                                    ec_gf8_mul_78_ops};
+
+static ec_gf_op_t ec_gf8_mul_79_ops[] = {
+    {EC_GF_OP_XOR2, 7, 3, 0}, {EC_GF_OP_XOR3, 8, 4, 7},
+    {EC_GF_OP_XOR2, 0, 8, 0}, {EC_GF_OP_XOR2, 2, 1, 0},
+    {EC_GF_OP_XOR2, 6, 8, 0}, {EC_GF_OP_XOR2, 2, 0, 0},
+    {EC_GF_OP_XOR2, 1, 6, 0}, {EC_GF_OP_XOR2, 3, 2, 0},
+    {EC_GF_OP_XOR2, 5, 1, 0}, {EC_GF_OP_XOR2, 6, 3, 0},
+    {EC_GF_OP_XOR2, 3, 5, 0}, {EC_GF_OP_XOR2, 4, 3, 0},
+    {EC_GF_OP_XOR2, 1, 4, 0}, {EC_GF_OP_XOR2, 0, 1, 0},
+    {EC_GF_OP_END, 0, 0, 0}};
+
+static ec_gf_mul_t ec_gf8_mul_79 = {9,
+                                    {
+                                        4,
+                                        5,
+                                        7,
+                                        3,
+                                        1,
+                                        6,
+                                        2,
+                                        0,
+                                        8,
+                                    },
+                                    ec_gf8_mul_79_ops};
+
+static ec_gf_op_t ec_gf8_mul_7A_ops[] = {
+    {EC_GF_OP_XOR2, 2, 1, 0}, {EC_GF_OP_XOR2, 3, 2, 0},
+    {EC_GF_OP_XOR2, 4, 0, 0}, {EC_GF_OP_XOR2, 4, 3, 0},
+    {EC_GF_OP_XOR2, 5, 4, 0}, {EC_GF_OP_XOR2, 6, 5, 0},
+    {EC_GF_OP_XOR2, 7, 6, 0}, {EC_GF_OP_XOR2, 0, 7, 0},
+    {EC_GF_OP_XOR2, 2, 7, 0}, {EC_GF_OP_XOR2, 1, 0, 0},
+    {EC_GF_OP_XOR2, 4, 0, 0}, {EC_GF_OP_END, 0, 0, 0}};
+
+static ec_gf_mul_t ec_gf8_mul_7A = {8,
+                                    {
+                                        1,
+                                        2,
+                                        3,
+                                        4,
+                                        5,
+                                        6,
+                                        7,
+                                        0,
+                                    },
+                                    ec_gf8_mul_7A_ops};
+
+static ec_gf_op_t ec_gf8_mul_7B_ops[] = {
+    {EC_GF_OP_XOR2, 3, 1, 0}, {EC_GF_OP_XOR3, 8, 5, 3},
+    {EC_GF_OP_XOR2, 8, 0, 0}, {EC_GF_OP_COPY, 9, 4, 0},
+    {EC_GF_OP_XOR2, 8, 2, 0}, {EC_GF_OP_XOR2, 4, 6, 0},
+    {EC_GF_OP_XOR2, 4, 8, 0}, {EC_GF_OP_XOR2, 7, 4, 0},
+    {EC_GF_OP_XOR2, 5, 4, 0}, {EC_GF_OP_XOR2, 0, 4, 0},
+    {EC_GF_OP_XOR2, 2, 7, 0}, {EC_GF_OP_XOR3, 4, 1, 9},
+    {EC_GF_OP_XOR2, 6, 7, 0}, {EC_GF_OP_XOR2, 1, 7, 0},
+    {EC_GF_OP_XOR2, 4, 2, 0}, {EC_GF_OP_END, 0, 0, 0}};
+
+static ec_gf_mul_t ec_gf8_mul_7B = {10,
+                                    {
+                                        1,
+                                        2,
+                                        3,
+                                        4,
+                                        8,
+                                        5,
+                                        6,
+                                        0,
+                                        7,
+                                        9,
+                                    },
+                                    ec_gf8_mul_7B_ops};
+
+static ec_gf_op_t ec_gf8_mul_7C_ops[] = {
+    {EC_GF_OP_XOR2, 5, 3, 0}, {EC_GF_OP_XOR2, 4, 5, 0},
+    {EC_GF_OP_XOR2, 2, 4, 0}, {EC_GF_OP_XOR2, 4, 6, 0},
+    {EC_GF_OP_XOR2, 3, 1, 0}, {EC_GF_OP_XOR2, 0, 4, 0},
+    {EC_GF_OP_XOR2, 7, 2, 0}, {EC_GF_OP_XOR2, 3, 0, 0},
+    {EC_GF_OP_XOR2, 7, 3, 0}, {EC_GF_OP_XOR2, 5, 7, 0},
+    {EC_GF_OP_XOR2, 1, 7, 0}, {EC_GF_OP_XOR2, 6, 5, 0},
+    {EC_GF_OP_XOR2, 0, 5, 0}, {EC_GF_OP_END, 0, 0, 0}};
+
+static ec_gf_mul_t ec_gf8_mul_7C = {8,
+                                    {
+                                        2,
+                                        4,
+                                        1,
+                                        6,
+                                        3,
+                                        5,
+                                        7,
+                                        0,
+                                    },
+                                    ec_gf8_mul_7C_ops};
+
+static ec_gf_op_t ec_gf8_mul_7D_ops[] = {
+    {EC_GF_OP_XOR2, 2, 1, 0}, {EC_GF_OP_XOR2, 3, 2, 0},
+    {EC_GF_OP_XOR2, 2, 6, 0}, {EC_GF_OP_XOR2, 1, 0, 0},
+    {EC_GF_OP_XOR2, 0, 2, 0}, {EC_GF_OP_XOR2, 1, 5, 0},
+    {EC_GF_OP_XOR2, 7, 0, 0}, {EC_GF_OP_XOR2, 6, 1, 0},
+    {EC_GF_OP_XOR2, 4, 3, 0}, {EC_GF_OP_XOR2, 3, 7, 0},
+    {EC_GF_OP_XOR2, 1, 4, 0}, {EC_GF_OP_XOR2, 2, 3, 0},
+    {EC_GF_OP_XOR2, 0, 1, 0}, {EC_GF_OP_XOR2, 5, 2, 0},
+    {EC_GF_OP_END, 0, 0, 0}};
+
+static ec_gf_mul_t ec_gf8_mul_7D = {8,
+                                    {
+                                        1,
+                                        0,
+                                        3,
+                                        5,
+                                        6,
+                                        7,
+                                        2,
+                                        4,
+                                    },
+                                    ec_gf8_mul_7D_ops};
+
+static ec_gf_op_t ec_gf8_mul_7E_ops[] = {
+    {EC_GF_OP_XOR2, 0, 5, 0}, {EC_GF_OP_COPY, 8, 0, 0},
+    {EC_GF_OP_XOR2, 0, 1, 0}, {EC_GF_OP_XOR2, 6, 3, 0},
+    {EC_GF_OP_XOR2, 6, 0, 0}, {EC_GF_OP_XOR2, 5, 6, 0},
+    {EC_GF_OP_XOR2, 6, 4, 0}, {EC_GF_OP_XOR2, 7, 6, 0},
+    {EC_GF_OP_XOR2, 1, 6, 0}, {EC_GF_OP_XOR3, 6, 2, 7},
+    {EC_GF_OP_XOR2, 3, 6, 0}, {EC_GF_OP_XOR2, 2, 5, 0},
+    {EC_GF_OP_XOR2, 4, 6, 0}, {EC_GF_OP_XOR2, 5, 3, 0},
+    {EC_GF_OP_XOR2, 6, 8, 0}, {EC_GF_OP_END, 0, 0, 0}};
+
+static ec_gf_mul_t ec_gf8_mul_7E = {9,
+                                    {
+                                        5,
+                                        1,
+                                        2,
+                                        0,
+                                        7,
+                                        3,
+                                        4,
+                                        6,
+                                        8,
+                                    },
+                                    ec_gf8_mul_7E_ops};
+
+static ec_gf_op_t ec_gf8_mul_7F_ops[] = {
+    {EC_GF_OP_COPY, 8, 0, 0}, {EC_GF_OP_XOR2, 0, 1, 0},
+    {EC_GF_OP_XOR2, 0, 3, 0}, {EC_GF_OP_XOR2, 5, 0, 0},
+    {EC_GF_OP_XOR3, 9, 7, 5}, {EC_GF_OP_XOR2, 2, 9, 0},
+    {EC_GF_OP_XOR2, 1, 2, 0}, {EC_GF_OP_XOR2, 0, 6, 0},
+    {EC_GF_OP_XOR2, 4, 1, 0}, {EC_GF_OP_XOR2, 6, 9, 0},
+    {EC_GF_OP_XOR3, 9, 6, 4}, {EC_GF_OP_XOR2, 7, 9, 0},
+    {EC_GF_OP_XOR2, 3, 9, 0}, {EC_GF_OP_XOR2, 1, 7, 0},
+    {EC_GF_OP_XOR2, 7, 8, 0}, {EC_GF_OP_END, 0, 0, 0}};
+
+static ec_gf_mul_t ec_gf8_mul_7F = {10,
+                                    {
+                                        4,
+                                        1,
+                                        0,
+                                        5,
+                                        6,
+                                        7,
+                                        2,
+                                        3,
+                                        8,
+                                        9,
+                                    },
+                                    ec_gf8_mul_7F_ops};
+
+static ec_gf_op_t ec_gf8_mul_80_ops[] = {
+    {EC_GF_OP_XOR2, 7, 3, 0}, {EC_GF_OP_XOR2, 3, 4, 0},
+    {EC_GF_OP_XOR2, 1, 2, 0}, {EC_GF_OP_XOR2, 2, 3, 0},
+    {EC_GF_OP_XOR2, 3, 5, 0}, {EC_GF_OP_XOR2, 5, 1, 0},
+    {EC_GF_OP_XOR2, 0, 1, 0}, {EC_GF_OP_XOR2, 4, 5, 0},
+    {EC_GF_OP_XOR2, 1, 7, 0}, {EC_GF_OP_XOR2, 6, 4, 0},
+    {EC_GF_OP_XOR2, 0, 6, 0}, {EC_GF_OP_XOR2, 6, 2, 0},
+    {EC_GF_OP_XOR2, 7, 6, 0}, {EC_GF_OP_XOR2, 5, 7, 0},
+    {EC_GF_OP_END, 0, 0, 0}};
+
+static ec_gf_mul_t ec_gf8_mul_80 = {8,
+                                    {
+                                        7,
+                                        5,
+                                        6,
+                                        4,
+                                        1,
+                                        2,
+                                        3,
+                                        0,
+                                    },
+                                    ec_gf8_mul_80_ops};
+
+static ec_gf_op_t ec_gf8_mul_81_ops[] = {
+    {EC_GF_OP_XOR2, 3, 5, 0}, {EC_GF_OP_XOR2, 4, 6, 0},
+    {EC_GF_OP_XOR2, 3, 4, 0}, {EC_GF_OP_XOR2, 2, 3, 0},
+    {EC_GF_OP_XOR2, 6, 2, 0}, {EC_GF_OP_XOR2, 1, 6, 0},
+    {EC_GF_OP_XOR2, 7, 1, 0}, {EC_GF_OP_XOR2, 4, 1, 0},
+    {EC_GF_OP_XOR2, 5, 7, 0}, {EC_GF_OP_XOR2, 0, 5, 0},
+    {EC_GF_OP_XOR2, 7, 3, 0}, {EC_GF_OP_XOR2, 2, 0, 0},
+    {EC_GF_OP_XOR2, 0, 4, 0}, {EC_GF_OP_END, 0, 0, 0}};
+
+static ec_gf_mul_t ec_gf8_mul_81 = {8,
+                                    {
+                                        2,
+                                        7,
+                                        4,
+                                        1,
+                                        5,
+                                        6,
+                                        3,
+                                        0,
+                                    },
+                                    ec_gf8_mul_81_ops};
+
+static ec_gf_op_t ec_gf8_mul_82_ops[] = {
+    {EC_GF_OP_XOR2, 7, 6, 0}, {EC_GF_OP_XOR2, 6, 1, 0},
+    {EC_GF_OP_COPY, 8, 6, 0}, {EC_GF_OP_XOR2, 6, 5, 0},
+    {EC_GF_OP_XOR2, 5, 4, 0}, {EC_GF_OP_XOR2, 4, 3, 0},
+    {EC_GF_OP_XOR2, 1, 2, 0}, {EC_GF_OP_XOR2, 3, 2, 0},
+    {EC_GF_OP_XOR2, 2, 0, 0}, {EC_GF_OP_XOR2, 2, 7, 0},
+    {EC_GF_OP_XOR2, 0, 5, 0}, {EC_GF_OP_XOR2, 7, 5, 0},
+    {EC_GF_OP_XOR3, 5, 8, 7}, {EC_GF_OP_XOR2, 7, 4, 0},
+    {EC_GF_OP_END, 0, 0, 0}};
+
+static ec_gf_mul_t ec_gf8_mul_82 = {9,
+                                    {
+                                        6,
+                                        2,
+                                        7,
+                                        5,
+                                        1,
+                                        3,
+                                        4,
+                                        0,
+                                        8,
+                                    },
+                                    ec_gf8_mul_82_ops};
+
+static ec_gf_op_t ec_gf8_mul_83_ops[] = {
+    {EC_GF_OP_XOR2, 0, 1, 0}, {EC_GF_OP_XOR2, 1, 2, 0},
+    {EC_GF_OP_XOR2, 2, 3, 0}, {EC_GF_OP_XOR2, 0, 3, 0},
+    {EC_GF_OP_XOR2, 2, 5, 0}, {EC_GF_OP_XOR2, 7, 2, 0},
+    {EC_GF_OP_XOR2, 5, 0, 0}, {EC_GF_OP_XOR2, 3, 6, 0},
+    {EC_GF_OP_XOR2, 1, 4, 0}, {EC_GF_OP_XOR2, 6, 7, 0},
+    {EC_GF_OP_XOR2, 4, 3, 0}, {EC_GF_OP_XOR2, 7, 1, 0},
+    {EC_GF_OP_XOR2, 3, 5, 0}, {EC_GF_OP_XOR2, 0, 7, 0},
+    {EC_GF_OP_XOR2, 5, 6, 0}, {EC_GF_OP_END, 0, 0, 0}};
+
+static ec_gf_mul_t ec_gf8_mul_83 = {8,
+                                    {
+                                        3,
+                                        5,
+                                        6,
+                                        7,
+                                        1,
+                                        2,
+                                        4,
+                                        0,
+                                    },
+                                    ec_gf8_mul_83_ops};
+
+static ec_gf_op_t ec_gf8_mul_84_ops[] = {
+    {EC_GF_OP_XOR2, 3, 5, 0}, {EC_GF_OP_XOR2, 7, 5, 0},
+    {EC_GF_OP_XOR2, 5, 6, 0}, {EC_GF_OP_XOR2, 6, 2, 0},
+    {EC_GF_OP_XOR2, 4, 6, 0}, {EC_GF_OP_XOR2, 2, 0, 0},
+    {EC_GF_OP_XOR2, 2, 4, 0}, {EC_GF_OP_XOR2, 4, 7, 0},
+    {EC_GF_OP_XOR2, 7, 1, 0}, {EC_GF_OP_XOR2, 1, 3, 0},
+    {EC_GF_OP_XOR2, 0, 1, 0}, {EC_GF_OP_XOR2, 1, 5, 0},
+    {EC_GF_OP_XOR2, 5, 4, 0}, {EC_GF_OP_END, 0, 0, 0}};
+
+static ec_gf_mul_t ec_gf8_mul_84 = {8,
+                                    {
+                                        7,
+                                        6,
+                                        0,
+                                        4,
+                                        1,
+                                        5,
+                                        3,
+                                        2,
+                                    },
+                                    ec_gf8_mul_84_ops};
+
+static ec_gf_op_t ec_gf8_mul_85_ops[] = {
+    {EC_GF_OP_XOR2, 3, 6, 0}, {EC_GF_OP_XOR2, 7, 5, 0},
+    {EC_GF_OP_XOR2, 6, 2, 0}, {EC_GF_OP_XOR2, 5, 3, 0},
+    {EC_GF_OP_XOR2, 2, 7, 0}, {EC_GF_OP_XOR2, 4, 2, 0},
+    {EC_GF_OP_XOR2, 7, 0, 0}, {EC_GF_OP_XOR2, 3, 4, 0},
+    {EC_GF_OP_XOR2, 6, 1, 0}, {EC_GF_OP_XOR2, 7, 1, 0},
+    {EC_GF_OP_XOR2, 0, 6, 0}, {EC_GF_OP_XOR2, 1, 3, 0},
+    {EC_GF_OP_XOR2, 0, 5, 0}, {EC_GF_OP_XOR2, 2, 1, 0},
+    {EC_GF_OP_XOR2, 1, 0, 0}, {EC_GF_OP_END, 0, 0, 0}};
+
+static ec_gf_mul_t ec_gf8_mul_85 = {8,
+                                    {
+                                        7,
+                                        6,
+                                        0,
+                                        3,
+                                        2,
+                                        4,
+                                        5,
+                                        1,
+                                    },
+                                    ec_gf8_mul_85_ops};
+
+static ec_gf_op_t ec_gf8_mul_86_ops[] = {
+    {EC_GF_OP_XOR2, 1, 5, 0}, {EC_GF_OP_XOR2, 5, 6, 0},
+    {EC_GF_OP_XOR2, 6, 0, 0}, {EC_GF_OP_XOR2, 0, 4, 0},
+    {EC_GF_OP_XOR2, 4, 5, 0}, {EC_GF_OP_XOR2, 5, 7, 0},
+    {EC_GF_OP_XOR2, 7, 2, 0}, {EC_GF_OP_XOR2, 2, 6, 0},
+    {EC_GF_OP_XOR2, 6, 3, 0}, {EC_GF_OP_XOR2, 6, 5, 0},
+    {EC_GF_OP_XOR2, 5, 1, 0}, {EC_GF_OP_END, 0, 0, 0}};
+
+static ec_gf_mul_t ec_gf8_mul_86 = {8,
+                                    {
+                                        1,
+                                        2,
+                                        6,
+                                        4,
+                                        5,
+                                        7,
+                                        3,
+                                        0,
+                                    },
+                                    ec_gf8_mul_86_ops};
+
+static ec_gf_op_t ec_gf8_mul_87_ops[] = {
+    {EC_GF_OP_XOR2, 1, 0, 0}, {EC_GF_OP_COPY, 8, 1, 0},
+    {EC_GF_OP_XOR2, 8, 6, 0}, {EC_GF_OP_XOR2, 6, 3, 0},
+    {EC_GF_OP_XOR2, 3, 0, 0}, {EC_GF_OP_XOR2, 0, 4, 0},
+    {EC_GF_OP_XOR2, 1, 5, 0}, {EC_GF_OP_XOR2, 4, 5, 0},
+    {EC_GF_OP_XOR2, 0, 7, 0}, {EC_GF_OP_XOR2, 7, 5, 0},
+    {EC_GF_OP_XOR2, 4, 6, 0}, {EC_GF_OP_XOR3, 5, 8, 0},
+    {EC_GF_OP_XOR2, 7, 2, 0}, {EC_GF_OP_XOR2, 2, 8, 0},
+    {EC_GF_OP_XOR2, 3, 7, 0}, {EC_GF_OP_END, 0, 0, 0}};
+
+static ec_gf_mul_t ec_gf8_mul_87 = {9,
+                                    {
+                                        1,
+                                        2,
+                                        3,
+                                        4,
+                                        5,
+                                        7,
+                                        6,
+                                        0,
+                                        8,
+                                    },
+                                    ec_gf8_mul_87_ops};
+
+static ec_gf_op_t ec_gf8_mul_88_ops[] = {
+    {EC_GF_OP_XOR2, 5, 6, 0}, {EC_GF_OP_XOR2, 6, 7, 0},
+    {EC_GF_OP_XOR2, 4, 6, 0}, {EC_GF_OP_XOR2, 6, 1, 0},
+    {EC_GF_OP_XOR2, 7, 2, 0}, {EC_GF_OP_XOR2, 1, 0, 0},
+    {EC_GF_OP_XOR2, 2, 3, 0}, {EC_GF_OP_XOR2, 1, 7, 0},
+    {EC_GF_OP_XOR2, 0, 5, 0}, {EC_GF_OP_XOR2, 2, 5, 0},
+    {EC_GF_OP_XOR2, 1, 4, 0}, {EC_GF_OP_XOR2, 5, 4, 0},
+    {EC_GF_OP_XOR2, 4, 3, 0}, {EC_GF_OP_XOR2, 3, 6, 0},
+    {EC_GF_OP_END, 0, 0, 0}};
+
+static ec_gf_mul_t ec_gf8_mul_88 = {8,
+                                    {
+                                        6,
+                                        7,
+                                        3,
+                                        1,
+                                        2,
+                                        4,
+                                        5,
+                                        0,
+                                    },
+                                    ec_gf8_mul_88_ops};
+
+static ec_gf_op_t ec_gf8_mul_89_ops[] = {
+    {EC_GF_OP_XOR2, 7, 2, 0}, {EC_GF_OP_XOR2, 5, 1, 0},
+    {EC_GF_OP_XOR2, 1, 7, 0}, {EC_GF_OP_XOR2, 2, 0, 0},
+    {EC_GF_OP_XOR2, 5, 0, 0}, {EC_GF_OP_XOR2, 6, 1, 0},
+    {EC_GF_OP_XOR2, 2, 6, 0}, {EC_GF_OP_XOR3, 8, 5, 2},
+    {EC_GF_OP_XOR2, 4, 8, 0}, {EC_GF_OP_XOR2, 6, 3, 0},
+    {EC_GF_OP_XOR2, 0, 8, 0}, {EC_GF_OP_XOR2, 3, 4, 0},
+    {EC_GF_OP_XOR2, 7, 3, 0}, {EC_GF_OP_XOR2, 5, 7, 0},
+    {EC_GF_OP_END, 0, 0, 0}};
+
+static ec_gf_mul_t ec_gf8_mul_89 = {9,
+                                    {
+                                        2,
+                                        1,
+                                        6,
+                                        5,
+                                        7,
+                                        3,
+                                        4,
+                                        0,
+                                        8,
+                                    },
+                                    ec_gf8_mul_89_ops};
+
+static ec_gf_op_t ec_gf8_mul_8A_ops[] = {
+    {EC_GF_OP_XOR2, 2, 0, 0}, {EC_GF_OP_XOR2, 5, 0, 0},
+    {EC_GF_OP_XOR2, 1, 6, 0}, {EC_GF_OP_XOR2, 3, 6, 0},
+    {EC_GF_OP_XOR2, 0, 1, 0}, {EC_GF_OP_XOR2, 6, 5, 0},
+    {EC_GF_OP_XOR2, 2, 7, 0}, {EC_GF_OP_XOR2, 4, 7, 0},
+    {EC_GF_OP_XOR2, 6, 2, 0}, {EC_GF_OP_XOR2, 7, 3, 0},
+    {EC_GF_OP_XOR2, 0, 4, 0}, {EC_GF_OP_END, 0, 0, 0}};
+
+static ec_gf_mul_t ec_gf8_mul_8A = {8,
+                                    {
+                                        1,
+                                        2,
+                                        3,
+                                        0,
+                                        6,
+                                        7,
+                                        4,
+                                        5,
+                                    },
+                                    ec_gf8_mul_8A_ops};
+
+static ec_gf_op_t ec_gf8_mul_8B_ops[] = {
+    {EC_GF_OP_XOR2, 1, 0, 0}, {EC_GF_OP_XOR2, 3, 6, 0},
+    {EC_GF_OP_XOR2, 6, 1, 0}, {EC_GF_OP_XOR2, 1, 7, 0},
+    {EC_GF_OP_XOR2, 7, 5, 0}, {EC_GF_OP_XOR2, 4, 1, 0},
+    {EC_GF_OP_XOR2, 5, 2, 0}, {EC_GF_OP_XOR2, 0, 7, 0},
+    {EC_GF_OP_XOR2, 1, 2, 0}, {EC_GF_OP_XOR2, 7, 3, 0},
+    {EC_GF_OP_XOR2, 2, 3, 0}, {EC_GF_OP_XOR2, 3, 4, 0},
+    {EC_GF_OP_XOR2, 4, 6, 0}, {EC_GF_OP_XOR2, 5, 4, 0},
+    {EC_GF_OP_END, 0, 0, 0}};
+
+static ec_gf_mul_t ec_gf8_mul_8B = {8,
+                                    {
+                                        6,
+                                        1,
+                                        2,
+                                        3,
+                                        5,
+                                        7,
+                                        4,
+                                        0,
+                                    },
+                                    ec_gf8_mul_8B_ops};
+
+static ec_gf_op_t ec_gf8_mul_8C_ops[] = {
+    {EC_GF_OP_XOR2, 1, 7, 0}, {EC_GF_OP_XOR2, 5, 7, 0},
+    {EC_GF_OP_XOR2, 7, 4, 0}, {EC_GF_OP_XOR2, 7, 2, 0},
+    {EC_GF_OP_XOR2, 4, 6, 0}, {EC_GF_OP_XOR2, 7, 0, 0},
+    {EC_GF_OP_XOR2, 6, 0, 0}, {EC_GF_OP_XOR2, 0, 3, 0},
+    {EC_GF_OP_XOR2, 3, 5, 0}, {EC_GF_OP_XOR2, 0, 1, 0},
+    {EC_GF_OP_END, 0, 0, 0}};
+
+static ec_gf_mul_t ec_gf8_mul_8C = {8,
+                                    {
+                                        1,
+                                        2,
+                                        0,
+                                        7,
+                                        3,
+                                        4,
+                                        5,
+                                        6,
+                                    },
+                                    ec_gf8_mul_8C_ops};
+
+static ec_gf_op_t ec_gf8_mul_8D_ops[] = {
+    {EC_GF_OP_XOR2, 7, 0, 0}, {EC_GF_OP_XOR2, 0, 5, 0},
+    {EC_GF_OP_XOR2, 5, 4, 0}, {EC_GF_OP_XOR2, 5, 6, 0},
+    {EC_GF_OP_XOR2, 3, 7, 0}, {EC_GF_OP_XOR2, 6, 7, 0},
+    {EC_GF_OP_XOR2, 7, 1, 0}, {EC_GF_OP_XOR2, 4, 3, 0},
+    {EC_GF_OP_XOR2, 1, 2, 0}, {EC_GF_OP_XOR2, 2, 4, 0},
+    {EC_GF_OP_XOR2, 3, 1, 0}, {EC_GF_OP_XOR2, 4, 0, 0},
+    {EC_GF_OP_XOR2, 0, 6, 0}, {EC_GF_OP_END, 0, 0, 0}};
+
+static ec_gf_mul_t ec_gf8_mul_8D = {8,
+                                    {
+                                        7,
+                                        1,
+                                        3,
+                                        2,
+                                        4,
+                                        5,
+                                        0,
+                                        6,
+                                    },
+                                    ec_gf8_mul_8D_ops};
+
+static ec_gf_op_t ec_gf8_mul_8E_ops[] = {{EC_GF_OP_XOR2, 2, 0, 0},
+                                         {EC_GF_OP_XOR2, 3, 0, 0},
+                                         {EC_GF_OP_XOR2, 4, 0, 0},
+                                         {EC_GF_OP_END, 0, 0, 0}};
+
+static ec_gf_mul_t ec_gf8_mul_8E = {8,
+                                    {
+                                        1,
+                                        2,
+                                        3,
+                                        4,
+                                        5,
+                                        6,
+                                        7,
+                                        0,
+                                    },
+                                    ec_gf8_mul_8E_ops};
+
+static ec_gf_op_t ec_gf8_mul_8F_ops[] = {
+    {EC_GF_OP_XOR2, 1, 0, 0}, {EC_GF_OP_XOR2, 3, 0, 0},
+    {EC_GF_OP_XOR2, 0, 7, 0}, {EC_GF_OP_XOR2, 7, 6, 0},
+    {EC_GF_OP_XOR2, 6, 5, 0}, {EC_GF_OP_XOR2, 5, 4, 0},
+    {EC_GF_OP_XOR2, 4, 3, 0}, {EC_GF_OP_XOR2, 3, 2, 0},
+    {EC_GF_OP_XOR2, 2, 1, 0}, {EC_GF_OP_END, 0, 0, 0}};
+
+static ec_gf_mul_t ec_gf8_mul_8F = {8,
+                                    {
+                                        1,
+                                        2,
+                                        3,
+                                        4,
+                                        5,
+                                        6,
+                                        7,
+                                        0,
+                                    },
+                                    ec_gf8_mul_8F_ops};
+
+static ec_gf_op_t ec_gf8_mul_90_ops[] = {
+    {EC_GF_OP_XOR2, 2, 1, 0}, {EC_GF_OP_XOR2, 7, 2, 0},
+    {EC_GF_OP_XOR2, 6, 7, 0}, {EC_GF_OP_XOR2, 5, 1, 0},
+    {EC_GF_OP_XOR2, 5, 6, 0}, {EC_GF_OP_XOR2, 4, 5, 0},
+    {EC_GF_OP_XOR2, 3, 4, 0}, {EC_GF_OP_XOR2, 4, 2, 0},
+    {EC_GF_OP_XOR2, 1, 3, 0}, {EC_GF_OP_XOR2, 6, 3, 0},
+    {EC_GF_OP_XOR2, 0, 1, 0}, {EC_GF_OP_XOR2, 2, 0, 0},
+    {EC_GF_OP_END, 0, 0, 0}};
+
+static ec_gf_mul_t ec_gf8_mul_90 = {8,
+                                    {
+                                        4,
+                                        5,
+                                        6,
+                                        7,
+                                        0,
+                                        1,
+                                        3,
+                                        2,
+                                    },
+                                    ec_gf8_mul_90_ops};
+
+static ec_gf_op_t ec_gf8_mul_91_ops[] = {
+    {EC_GF_OP_XOR2, 0, 3, 0}, {EC_GF_OP_XOR2, 3, 4, 0},
+    {EC_GF_OP_COPY, 9, 1, 0}, {EC_GF_OP_COPY, 8, 3, 0},
+    {EC_GF_OP_XOR2, 3, 5, 0}, {EC_GF_OP_XOR2, 1, 2, 0},
+    {EC_GF_OP_XOR2, 1, 3, 0}, {EC_GF_OP_XOR2, 7, 1, 0},
+    {EC_GF_OP_XOR2, 5, 7, 0}, {EC_GF_OP_XOR2, 7, 9, 0},
+    {EC_GF_OP_XOR2, 6, 5, 0}, {EC_GF_OP_XOR2, 4, 5, 0},
+    {EC_GF_OP_XOR2, 3, 6, 0}, {EC_GF_OP_XOR2, 0, 3, 0},
+    {EC_GF_OP_XOR3, 5, 8, 0}, {EC_GF_OP_XOR2, 2, 5, 0},
+    {EC_GF_OP_XOR2, 5, 4, 0}, {EC_GF_OP_END, 0, 0, 0}};
+
+static ec_gf_mul_t ec_gf8_mul_91 = {10,
+                                    {
+                                        2,
+                                        3,
+                                        1,
+                                        4,
+                                        0,
+                                        6,
+                                        7,
+                                        5,
+                                        8,
+                                        9,
+                                    },
+                                    ec_gf8_mul_91_ops};
+
+static ec_gf_op_t ec_gf8_mul_92_ops[] = {
+    {EC_GF_OP_XOR2, 4, 1, 0}, {EC_GF_OP_XOR2, 5, 4, 0},
+    {EC_GF_OP_XOR2, 6, 5, 0}, {EC_GF_OP_XOR2, 2, 0, 0},
+    {EC_GF_OP_XOR2, 2, 6, 0}, {EC_GF_OP_XOR2, 3, 1, 0},
+    {EC_GF_OP_XOR2, 4, 2, 0}, {EC_GF_OP_XOR2, 3, 0, 0},
+    {EC_GF_OP_XOR2, 7, 4, 0}, {EC_GF_OP_XOR2, 3, 7, 0},
+    {EC_GF_OP_XOR2, 5, 3, 0}, {EC_GF_OP_XOR2, 4, 5, 0},
+    {EC_GF_OP_XOR2, 0, 4, 0}, {EC_GF_OP_END, 0, 0, 0}};
+
+static ec_gf_mul_t ec_gf8_mul_92 = {8,
+                                    {
+                                        6,
+                                        7,
+                                        0,
+                                        1,
+                                        2,
+                                        3,
+                                        5,
+                                        4,
+                                    },
+                                    ec_gf8_mul_92_ops};
+
+static ec_gf_op_t ec_gf8_mul_93_ops[] = {
+    {EC_GF_OP_XOR2, 2, 7, 0}, {EC_GF_OP_XOR2, 4, 2, 0},
+    {EC_GF_OP_XOR2, 1, 3, 0}, {EC_GF_OP_XOR2, 3, 4, 0},
+    {EC_GF_OP_XOR2, 0, 2, 0}, {EC_GF_OP_XOR2, 5, 3, 0},
+    {EC_GF_OP_XOR2, 6, 1, 0}, {EC_GF_OP_XOR2, 0, 5, 0},
+    {EC_GF_OP_XOR2, 2, 6, 0}, {EC_GF_OP_XOR2, 6, 0, 0},
+    {EC_GF_OP_XOR2, 4, 6, 0}, {EC_GF_OP_XOR2, 7, 4, 0},
+    {EC_GF_OP_END, 0, 0, 0}};
+
+static ec_gf_mul_t ec_gf8_mul_93 = {8,
+                                    {
+                                        6,
+                                        4,
+                                        5,
+                                        1,
+                                        7,
+                                        2,
+                                        3,
+                                        0,
+                                    },
+                                    ec_gf8_mul_93_ops};
+
+static ec_gf_op_t ec_gf8_mul_94_ops[] = {
+    {EC_GF_OP_XOR2, 0, 2, 0}, {EC_GF_OP_XOR2, 2, 6, 0},
+    {EC_GF_OP_XOR2, 7, 2, 0}, {EC_GF_OP_XOR2, 3, 7, 0},
+    {EC_GF_OP_XOR2, 0, 3, 0}, {EC_GF_OP_XOR2, 3, 5, 0},
+    {EC_GF_OP_XOR2, 1, 4, 0}, {EC_GF_OP_XOR2, 5, 2, 0},
+    {EC_GF_OP_XOR2, 4, 0, 0}, {EC_GF_OP_XOR2, 1, 5, 0},
+    {EC_GF_OP_XOR2, 7, 1, 0}, {EC_GF_OP_XOR2, 0, 7, 0},
+    {EC_GF_OP_XOR2, 6, 0, 0}, {EC_GF_OP_END, 0, 0, 0}};
+
+static ec_gf_mul_t ec_gf8_mul_94 = {8,
+                                    {
+                                        7,
+                                        5,
+                                        0,
+                                        2,
+                                        6,
+                                        1,
+                                        3,
+                                        4,
+                                    },
+                                    ec_gf8_mul_94_ops};
+
+static ec_gf_op_t ec_gf8_mul_95_ops[] = {
+    {EC_GF_OP_XOR2, 3, 2, 0}, {EC_GF_OP_XOR2, 7, 3, 0},
+    {EC_GF_OP_XOR2, 3, 6, 0}, {EC_GF_OP_XOR2, 0, 3, 0},
+    {EC_GF_OP_XOR2, 4, 0, 0}, {EC_GF_OP_XOR2, 2, 4, 0},
+    {EC_GF_OP_XOR2, 4, 5, 0}, {EC_GF_OP_XOR2, 1, 4, 0},
+    {EC_GF_OP_XOR2, 5, 7, 0}, {EC_GF_OP_XOR2, 6, 1, 0},
+    {EC_GF_OP_XOR2, 7, 6, 0}, {EC_GF_OP_XOR2, 6, 2, 0},
+    {EC_GF_OP_XOR2, 0, 6, 0}, {EC_GF_OP_XOR2, 4, 0, 0},
+    {EC_GF_OP_END, 0, 0, 0}};
+
+static ec_gf_mul_t ec_gf8_mul_95 = {8,
+                                    {
+                                        7,
+                                        6,
+                                        1,
+                                        3,
+                                        0,
+                                        4,
+                                        5,
+                                        2,
+                                    },
+                                    ec_gf8_mul_95_ops};
+
+static ec_gf_op_t ec_gf8_mul_96_ops[] = {
+    {EC_GF_OP_XOR2, 5, 1, 0}, {EC_GF_OP_XOR2, 4, 5, 0},
+    {EC_GF_OP_XOR2, 5, 6, 0}, {EC_GF_OP_XOR2, 6, 7, 0},
+    {EC_GF_OP_XOR3, 8, 0, 4}, {EC_GF_OP_XOR2, 3, 6, 0},
+    {EC_GF_OP_XOR2, 7, 8, 0}, {EC_GF_OP_XOR2, 0, 1, 0},
+    {EC_GF_OP_XOR2, 8, 3, 0}, {EC_GF_OP_XOR2, 3, 2, 0},
+    {EC_GF_OP_XOR2, 1, 8, 0}, {EC_GF_OP_XOR2, 2, 5, 0},
+    {EC_GF_OP_XOR2, 5, 8, 0}, {EC_GF_OP_XOR2, 0, 2, 0},
+    {EC_GF_OP_END, 0, 0, 0}};
+
+static ec_gf_mul_t ec_gf8_mul_96 = {9,
+                                    {
+                                        4,
+                                        0,
+                                        1,
+                                        6,
+                                        7,
+                                        2,
+                                        3,
+                                        5,
+                                        8,
+                                    },
+                                    ec_gf8_mul_96_ops};
+
+static ec_gf_op_t ec_gf8_mul_97_ops[] = {
+    {EC_GF_OP_XOR2, 5, 0, 0}, {EC_GF_OP_COPY, 8, 2, 0},
+    {EC_GF_OP_XOR2, 0, 3, 0}, {EC_GF_OP_XOR2, 8, 6, 0},
+    {EC_GF_OP_XOR2, 5, 1, 0}, {EC_GF_OP_XOR2, 3, 7, 0},
+    {EC_GF_OP_XOR2, 1, 8, 0}, {EC_GF_OP_XOR2, 7, 5, 0},
+    {EC_GF_OP_XOR2, 2, 3, 0}, {EC_GF_OP_XOR2, 6, 3, 0},
+    {EC_GF_OP_XOR2, 0, 4, 0}, {EC_GF_OP_XOR2, 3, 1, 0},
+    {EC_GF_OP_XOR2, 4, 5, 0}, {EC_GF_OP_XOR2, 5, 8, 0},
+    {EC_GF_OP_XOR2, 3, 4, 0}, {EC_GF_OP_END, 0, 0, 0}};
+
+static ec_gf_mul_t ec_gf8_mul_97 = {9,
+                                    {
+                                        4,
+                                        5,
+                                        3,
+                                        6,
+                                        7,
+                                        1,
+                                        2,
+                                        0,
+                                        8,
+                                    },
+                                    ec_gf8_mul_97_ops};
+
+static ec_gf_op_t ec_gf8_mul_98_ops[] = {
+    {EC_GF_OP_XOR2, 5, 7, 0}, {EC_GF_OP_XOR2, 4, 1, 0},
+    {EC_GF_OP_XOR2, 2, 5, 0}, {EC_GF_OP_XOR2, 4, 7, 0},
+    {EC_GF_OP_XOR2, 1, 2, 0}, {EC_GF_OP_XOR2, 3, 4, 0},
+    {EC_GF_OP_XOR2, 0, 1, 0}, {EC_GF_OP_XOR2, 4, 6, 0},
+    {EC_GF_OP_XOR2, 5, 3, 0}, {EC_GF_OP_XOR2, 6, 0, 0},
+    {EC_GF_OP_XOR2, 1, 4, 0}, {EC_GF_OP_XOR2, 0, 5, 0},
+    {EC_GF_OP_XOR2, 7, 0, 0}, {EC_GF_OP_XOR2, 0, 1, 0},
+    {EC_GF_OP_END, 0, 0, 0}};
+
+static ec_gf_mul_t ec_gf8_mul_98 = {8,
+                                    {
+                                        4,
+                                        2,
+                                        3,
+                                        6,
+                                        7,
+                                        5,
+                                        1,
+                                        0,
+                                    },
+                                    ec_gf8_mul_98_ops};
+
+static ec_gf_op_t ec_gf8_mul_99_ops[] = {
+    {EC_GF_OP_XOR2, 0, 2, 0}, {EC_GF_OP_XOR2, 7, 1, 0},
+    {EC_GF_OP_XOR2, 0, 3, 0}, {EC_GF_OP_XOR2, 7, 2, 0},
+    {EC_GF_OP_XOR2, 3, 4, 0}, {EC_GF_OP_XOR2, 2, 5, 0},
+    {EC_GF_OP_XOR2, 1, 3, 0}, {EC_GF_OP_XOR2, 5, 7, 0},
+    {EC_GF_OP_XOR2, 4, 2, 0}, {EC_GF_OP_XOR2, 3, 7, 0},
+    {EC_GF_OP_XOR2, 6, 0, 0}, {EC_GF_OP_XOR2, 2, 6, 0},
+    {EC_GF_OP_XOR2, 6, 3, 0}, {EC_GF_OP_XOR2, 7, 2, 0},
+    {EC_GF_OP_END, 0, 0, 0}};
+
+static ec_gf_mul_t ec_gf8_mul_99 = {8,
+                                    {
+                                        6,
+                                        5,
+                                        3,
+                                        7,
+                                        0,
+                                        1,
+                                        4,
+                                        2,
+                                    },
+                                    ec_gf8_mul_99_ops};
+
+static ec_gf_op_t ec_gf8_mul_9A_ops[] = {
+    {EC_GF_OP_XOR2, 5, 4, 0}, {EC_GF_OP_XOR2, 6, 4, 0},
+    {EC_GF_OP_XOR2, 4, 3, 0}, {EC_GF_OP_XOR2, 3, 2, 0},
+    {EC_GF_OP_XOR2, 2, 6, 0}, {EC_GF_OP_XOR2, 6, 1, 0},
+    {EC_GF_OP_XOR2, 1, 0, 0}, {EC_GF_OP_XOR2, 0, 5, 0},
+    {EC_GF_OP_XOR3, 8, 4, 0}, {EC_GF_OP_XOR2, 0, 6, 0},
+    {EC_GF_OP_XOR2, 7, 8, 0}, {EC_GF_OP_XOR2, 1, 8, 0},
+    {EC_GF_OP_XOR2, 3, 7, 0}, {EC_GF_OP_XOR2, 5, 3, 0},
+    {EC_GF_OP_END, 0, 0, 0}};
+
+static ec_gf_mul_t ec_gf8_mul_9A = {9,
+                                    {
+                                        6,
+                                        3,
+                                        4,
+                                        0,
+                                        5,
+                                        1,
+                                        2,
+                                        7,
+                                        8,
+                                    },
+                                    ec_gf8_mul_9A_ops};
+
+static ec_gf_op_t ec_gf8_mul_9B_ops[] = {
+    {EC_GF_OP_XOR2, 6, 0, 0}, {EC_GF_OP_XOR2, 7, 2, 0},
+    {EC_GF_OP_COPY, 9, 5, 0}, {EC_GF_OP_XOR2, 7, 0, 0},
+    {EC_GF_OP_XOR2, 2, 4, 0}, {EC_GF_OP_XOR2, 6, 1, 0},
+    {EC_GF_OP_XOR2, 5, 1, 0}, {EC_GF_OP_XOR2, 4, 6, 0},
+    {EC_GF_OP_XOR3, 8, 3, 2}, {EC_GF_OP_XOR2, 1, 3, 0},
+    {EC_GF_OP_XOR2, 5, 7, 0}, {EC_GF_OP_XOR2, 3, 9, 0},
+    {EC_GF_OP_XOR2, 0, 3, 0}, {EC_GF_OP_XOR2, 6, 3, 0},
+    {EC_GF_OP_END, 0, 0, 0}};
+
+static ec_gf_mul_t ec_gf8_mul_9B = {10,
+                                    {
+                                        4,
+                                        5,
+                                        8,
+                                        6,
+                                        7,
+                                        1,
+                                        2,
+                                        0,
+                                        3,
+                                        9,
+                                    },
+                                    ec_gf8_mul_9B_ops};
+
+static ec_gf_op_t ec_gf8_mul_9C_ops[] = {
+    {EC_GF_OP_XOR2, 3, 0, 0}, {EC_GF_OP_XOR2, 3, 6, 0},
+    {EC_GF_OP_XOR2, 7, 3, 0}, {EC_GF_OP_XOR2, 4, 7, 0},
+    {EC_GF_OP_XOR2, 1, 4, 0}, {EC_GF_OP_XOR2, 3, 1, 0},
+    {EC_GF_OP_XOR2, 2, 5, 0}, {EC_GF_OP_XOR2, 5, 3, 0},
+    {EC_GF_OP_XOR2, 6, 2, 0}, {EC_GF_OP_XOR2, 0, 2, 0},
+    {EC_GF_OP_END, 0, 0, 0}};
+
+static ec_gf_mul_t ec_gf8_mul_9C = {8,
+                                    {
+                                        3,
+                                        2,
+                                        1,
+                                        0,
+                                        4,
+                                        5,
+                                        6,
+                                        7,
+                                    },
+                                    ec_gf8_mul_9C_ops};
+
+static ec_gf_op_t ec_gf8_mul_9D_ops[] = {
+    {EC_GF_OP_XOR2, 3, 0, 0}, {EC_GF_OP_XOR2, 4, 1, 0},
+    {EC_GF_OP_XOR2, 6, 3, 0}, {EC_GF_OP_XOR2, 5, 2, 0},
+    {EC_GF_OP_XOR2, 2, 6, 0}, {EC_GF_OP_XOR2, 4, 7, 0},
+    {EC_GF_OP_XOR2, 1, 5, 0}, {EC_GF_OP_XOR2, 3, 5, 0},
+    {EC_GF_OP_XOR2, 7, 6, 0}, {EC_GF_OP_XOR2, 0, 4, 0},
+    {EC_GF_OP_XOR2, 2, 4, 0}, {EC_GF_OP_END, 0, 0, 0}};
+
+static ec_gf_mul_t ec_gf8_mul_9D = {8,
+                                    {
+                                        0,
+                                        1,
+                                        2,
+                                        3,
+                                        7,
+                                        4,
+                                        5,
+                                        6,
+                                    },
+                                    ec_gf8_mul_9D_ops};
+
+static ec_gf_op_t ec_gf8_mul_9E_ops[] = {
+    {EC_GF_OP_XOR2, 7, 0, 0}, {EC_GF_OP_COPY, 8, 7, 0},
+    {EC_GF_OP_XOR2, 8, 5, 0}, {EC_GF_OP_XOR2, 5, 2, 0},
+    {EC_GF_OP_XOR2, 2, 6, 0}, {EC_GF_OP_XOR2, 5, 0, 0},
+    {EC_GF_OP_XOR2, 6, 4, 0}, {EC_GF_OP_XOR2, 6, 0, 0},
+    {EC_GF_OP_XOR2, 7, 3, 0}, {EC_GF_OP_XOR2, 0, 1, 0},
+    {EC_GF_OP_XOR2, 4, 1, 0}, {EC_GF_OP_XOR2, 3, 6, 0},
+    {EC_GF_OP_XOR2, 0, 8, 0}, {EC_GF_OP_END, 0, 0, 0}};
+
+static ec_gf_mul_t ec_gf8_mul_9E = {9,
+                                    {
+                                        4,
+                                        5,
+                                        3,
+                                        8,
+                                        6,
+                                        0,
+                                        2,
+                                        7,
+                                        1,
+                                    },
+                                    ec_gf8_mul_9E_ops};
+
+static ec_gf_op_t ec_gf8_mul_9F_ops[] = {
+    {EC_GF_OP_XOR3, 8, 1, 2}, {EC_GF_OP_XOR2, 8, 3, 0},
+    {EC_GF_OP_XOR2, 3, 0, 0}, {EC_GF_OP_XOR2, 4, 0, 0},
+    {EC_GF_OP_XOR2, 4, 1, 0}, {EC_GF_OP_XOR2, 5, 3, 0},
+    {EC_GF_OP_XOR2, 0, 6, 0}, {EC_GF_OP_XOR2, 1, 7, 0},
+    {EC_GF_OP_XOR2, 6, 4, 0}, {EC_GF_OP_XOR2, 7, 5, 0},
+    {EC_GF_OP_XOR2, 6, 8, 0}, {EC_GF_OP_XOR2, 5, 8, 0},
+    {EC_GF_OP_END, 0, 0, 0}};
+
+static ec_gf_mul_t ec_gf8_mul_9F = {9,
+                                    {
+                                        4,
+                                        5,
+                                        6,
+                                        7,
+                                        0,
+                                        1,
+                                        2,
+                                        3,
+                                        8,
+                                    },
+                                    ec_gf8_mul_9F_ops};
+
+static ec_gf_op_t ec_gf8_mul_A0_ops[] = {
+    {EC_GF_OP_XOR2, 6, 1, 0}, {EC_GF_OP_XOR2, 5, 6, 0},
+    {EC_GF_OP_XOR2, 6, 7, 0}, {EC_GF_OP_XOR2, 0, 1, 0},
+    {EC_GF_OP_XOR2, 4, 6, 0}, {EC_GF_OP_XOR2, 1, 2, 0},
+    {EC_GF_OP_XOR2, 1, 4, 0}, {EC_GF_OP_XOR2, 4, 3, 0},
+    {EC_GF_OP_XOR2, 3, 5, 0}, {EC_GF_OP_XOR2, 2, 3, 0},
+    {EC_GF_OP_XOR2, 5, 1, 0}, {EC_GF_OP_XOR2, 7, 2, 0},
+    {EC_GF_OP_XOR2, 2, 0, 0}, {EC_GF_OP_XOR2, 0, 5, 0},
+    {EC_GF_OP_END, 0, 0, 0}};
+
+static ec_gf_mul_t ec_gf8_mul_A0 = {8,
+                                    {
+                                        3,
+                                        1,
+                                        6,
+                                        7,
+                                        5,
+                                        2,
+                                        4,
+                                        0,
+                                    },
+                                    ec_gf8_mul_A0_ops};
+
+static ec_gf_op_t ec_gf8_mul_A1_ops[] = {
+    {EC_GF_OP_XOR2, 2, 6, 0}, {EC_GF_OP_XOR2, 1, 7, 0},
+    {EC_GF_OP_XOR2, 1, 2, 0}, {EC_GF_OP_XOR2, 5, 1, 0},
+    {EC_GF_OP_XOR2, 0, 3, 0}, {EC_GF_OP_XOR2, 6, 5, 0},
+    {EC_GF_OP_XOR2, 4, 1, 0}, {EC_GF_OP_XOR2, 0, 2, 0},
+    {EC_GF_OP_XOR2, 3, 4, 0}, {EC_GF_OP_XOR3, 8, 0, 6},
+    {EC_GF_OP_XOR2, 2, 3, 0}, {EC_GF_OP_XOR2, 7, 8, 0},
+    {EC_GF_OP_XOR2, 3, 8, 0}, {EC_GF_OP_END, 0, 0, 0}};
+
+static ec_gf_mul_t ec_gf8_mul_A1 = {9,
+                                    {
+                                        7,
+                                        4,
+                                        1,
+                                        5,
+                                        6,
+                                        0,
+                                        2,
+                                        3,
+                                        8,
+                                    },
+                                    ec_gf8_mul_A1_ops};
+
+static ec_gf_op_t ec_gf8_mul_A2_ops[] = {
+    {EC_GF_OP_XOR2, 2, 1, 0}, {EC_GF_OP_XOR2, 2, 4, 0},
+    {EC_GF_OP_XOR2, 3, 5, 0}, {EC_GF_OP_XOR2, 1, 6, 0},
+    {EC_GF_OP_XOR2, 2, 3, 0}, {EC_GF_OP_XOR2, 3, 1, 0},
+    {EC_GF_OP_XOR2, 0, 2, 0}, {EC_GF_OP_XOR2, 7, 3, 0},
+    {EC_GF_OP_XOR2, 1, 0, 0}, {EC_GF_OP_XOR2, 0, 7, 0},
+    {EC_GF_OP_XOR2, 4, 7, 0}, {EC_GF_OP_XOR2, 5, 0, 0},
+    {EC_GF_OP_END, 0, 0, 0}};
+
+static ec_gf_mul_t ec_gf8_mul_A2 = {8,
+                                    {
+                                        7,
+                                        0,
+                                        6,
+                                        3,
+                                        2,
+                                        1,
+                                        4,
+                                        5,
+                                    },
+                                    ec_gf8_mul_A2_ops};
+
+static ec_gf_op_t ec_gf8_mul_A3_ops[] = {
+    {EC_GF_OP_COPY, 8, 2, 0}, {EC_GF_OP_XOR2, 2, 6, 0},
+    {EC_GF_OP_XOR2, 0, 2, 0}, {EC_GF_OP_XOR2, 4, 0, 0},
+    {EC_GF_OP_XOR2, 1, 5, 0}, {EC_GF_OP_XOR2, 5, 4, 0},
+    {EC_GF_OP_XOR2, 6, 1, 0}, {EC_GF_OP_XOR2, 4, 3, 0},
+    {EC_GF_OP_XOR2, 7, 1, 0}, {EC_GF_OP_XOR2, 3, 8, 0},
+    {EC_GF_OP_XOR2, 0, 7, 0}, {EC_GF_OP_XOR2, 1, 3, 0},
+    {EC_GF_OP_XOR2, 7, 5, 0}, {EC_GF_OP_XOR2, 3, 0, 0},
+    {EC_GF_OP_XOR2, 0, 4, 0}, {EC_GF_OP_END, 0, 0, 0}};
+
+static ec_gf_mul_t ec_gf8_mul_A3 = {9,
+                                    {
+                                        3,
+                                        7,
+                                        2,
+                                        6,
+                                        1,
+                                        4,
+                                        0,
+                                        5,
+                                        8,
+                                    },
+                                    ec_gf8_mul_A3_ops};
+
+static ec_gf_op_t ec_gf8_mul_A4_ops[] = {
+    {EC_GF_OP_XOR2, 0, 1, 0}, {EC_GF_OP_XOR2, 4, 2, 0},
+    {EC_GF_OP_XOR2, 5, 3, 0}, {EC_GF_OP_XOR2, 7, 0, 0},
+    {EC_GF_OP_XOR2, 2, 5, 0}, {EC_GF_OP_XOR2, 6, 4, 0},
+    {EC_GF_OP_XOR2, 5, 1, 0}, {EC_GF_OP_XOR2, 4, 7, 0},
+    {EC_GF_OP_XOR2, 3, 6, 0}, {EC_GF_OP_XOR2, 1, 4, 0},
+    {EC_GF_OP_XOR2, 3, 1, 0}, {EC_GF_OP_XOR2, 0, 3, 0},
+    {EC_GF_OP_XOR2, 3, 2, 0}, {EC_GF_OP_XOR2, 4, 3, 0},
+    {EC_GF_OP_END, 0, 0, 0}};
+
+static ec_gf_mul_t ec_gf8_mul_A4 = {8,
+                                    {
+                                        5,
+                                        6,
+                                        7,
+                                        2,
+                                        4,
+                                        3,
+                                        0,
+                                        1,
+                                    },
+                                    ec_gf8_mul_A4_ops};
+
+static ec_gf_op_t ec_gf8_mul_A5_ops[] = {
+    {EC_GF_OP_XOR2, 7, 1, 0}, {EC_GF_OP_XOR2, 1, 5, 0},
+    {EC_GF_OP_XOR2, 7, 0, 0}, {EC_GF_OP_XOR2, 3, 0, 0},
+    {EC_GF_OP_XOR2, 6, 1, 0}, {EC_GF_OP_XOR2, 5, 2, 0},
+    {EC_GF_OP_XOR2, 0, 2, 0}, {EC_GF_OP_XOR2, 1, 3, 0},
+    {EC_GF_OP_XOR3, 8, 5, 6}, {EC_GF_OP_XOR2, 2, 7, 0},
+    {EC_GF_OP_XOR2, 0, 4, 0}, {EC_GF_OP_XOR2, 3, 7, 0},
+    {EC_GF_OP_XOR2, 4, 8, 0}, {EC_GF_OP_XOR2, 7, 8, 0},
+    {EC_GF_OP_END, 0, 0, 0}};
+
+static ec_gf_mul_t ec_gf8_mul_A5 = {9,
+                                    {
+                                        1,
+                                        4,
+                                        2,
+                                        5,
+                                        6,
+                                        7,
+                                        3,
+                                        0,
+                                        8,
+                                    },
+                                    ec_gf8_mul_A5_ops};
+
+static ec_gf_op_t ec_gf8_mul_A6_ops[] = {
+    {EC_GF_OP_XOR2, 2, 0, 0}, {EC_GF_OP_XOR2, 4, 6, 0},
+    {EC_GF_OP_XOR2, 3, 5, 0}, {EC_GF_OP_XOR2, 2, 4, 0},
+    {EC_GF_OP_XOR2, 3, 7, 0}, {EC_GF_OP_XOR2, 7, 2, 0},
+    {EC_GF_OP_XOR2, 1, 3, 0}, {EC_GF_OP_XOR2, 5, 7, 0},
+    {EC_GF_OP_XOR2, 6, 1, 0}, {EC_GF_OP_XOR2, 4, 1, 0},
+    {EC_GF_OP_END, 0, 0, 0}};
+
+static ec_gf_mul_t ec_gf8_mul_A6 = {8,
+                                    {
+                                        1,
+                                        2,
+                                        0,
+                                        3,
+                                        4,
+                                        5,
+                                        6,
+                                        7,
+                                    },
+                                    ec_gf8_mul_A6_ops};
+
+static ec_gf_op_t ec_gf8_mul_A7_ops[] = {
+    {EC_GF_OP_XOR2, 2, 0, 0}, {EC_GF_OP_XOR2, 3, 1, 0},
+    {EC_GF_OP_XOR2, 5, 7, 0}, {EC_GF_OP_XOR2, 4, 2, 0},
+    {EC_GF_OP_XOR2, 3, 5, 0}, {EC_GF_OP_XOR2, 4, 6, 0},
+    {EC_GF_OP_XOR2, 0, 3, 0}, {EC_GF_OP_XOR2, 6, 3, 0},
+    {EC_GF_OP_XOR2, 1, 4, 0}, {EC_GF_OP_XOR2, 7, 4, 0},
+    {EC_GF_OP_END, 0, 0, 0}};
+
+static ec_gf_mul_t ec_gf8_mul_A7 = {8,
+                                    {
+                                        0,
+                                        1,
+                                        2,
+                                        5,
+                                        6,
+                                        7,
+                                        3,
+                                        4,
+                                    },
+                                    ec_gf8_mul_A7_ops};
+
+static ec_gf_op_t ec_gf8_mul_A8_ops[] = {
+    {EC_GF_OP_XOR2, 0, 2, 0}, {EC_GF_OP_XOR2, 0, 7, 0},
+    {EC_GF_OP_COPY, 8, 0, 0}, {EC_GF_OP_XOR2, 8, 1, 0},
+    {EC_GF_OP_XOR2, 1, 6, 0}, {EC_GF_OP_COPY, 9, 4, 0},
+    {EC_GF_OP_XOR2, 0, 5, 0}, {EC_GF_OP_XOR2, 4, 1, 0},
+    {EC_GF_OP_XOR2, 5, 1, 0}, {EC_GF_OP_XOR2, 8, 3, 0},
+    {EC_GF_OP_XOR2, 1, 3, 0}, {EC_GF_OP_XOR2, 3, 2, 0},
+    {EC_GF_OP_XOR2, 2, 9, 0}, {EC_GF_OP_XOR2, 3, 0, 0},
+    {EC_GF_OP_XOR2, 7, 2, 0}, {EC_GF_OP_XOR2, 6, 2, 0},
+    {EC_GF_OP_END, 0, 0, 0}};
+
+static ec_gf_mul_t ec_gf8_mul_A8 = {10,
+                                    {
+                                        1,
+                                        7,
+                                        5,
+                                        8,
+                                        6,
+                                        3,
+                                        4,
+                                        0,
+                                        2,
+                                        9,
+                                    },
+                                    ec_gf8_mul_A8_ops};
+
+static ec_gf_op_t ec_gf8_mul_A9_ops[] = {
+    {EC_GF_OP_XOR2, 4, 1, 0}, {EC_GF_OP_XOR2, 1, 0, 0},
+    {EC_GF_OP_XOR2, 5, 0, 0}, {EC_GF_OP_XOR2, 0, 3, 0},
+    {EC_GF_OP_XOR2, 0, 7, 0}, {EC_GF_OP_XOR2, 5, 2, 0},
+    {EC_GF_OP_XOR2, 7, 2, 0}, {EC_GF_OP_XOR2, 2, 6, 0},
+    {EC_GF_OP_XOR2, 6, 1, 0}, {EC_GF_OP_XOR2, 1, 7, 0},
+    {EC_GF_OP_XOR2, 3, 6, 0}, {EC_GF_OP_XOR2, 7, 4, 0},
+    {EC_GF_OP_XOR2, 6, 5, 0}, {EC_GF_OP_END, 0, 0, 0}};
+
+static ec_gf_mul_t ec_gf8_mul_A9 = {8,
+                                    {
+                                        3,
+                                        7,
+                                        6,
+                                        1,
+                                        2,
+                                        0,
+                                        4,
+                                        5,
+                                    },
+                                    ec_gf8_mul_A9_ops};
+
+static ec_gf_op_t ec_gf8_mul_AA_ops[] = {
+    {EC_GF_OP_XOR2, 5, 6, 0}, {EC_GF_OP_XOR2, 3, 0, 0},
+    {EC_GF_OP_XOR2, 4, 5, 0}, {EC_GF_OP_XOR2, 6, 3, 0},
+    {EC_GF_OP_XOR2, 5, 7, 0}, {EC_GF_OP_XOR2, 2, 0, 0},
+    {EC_GF_OP_XOR2, 7, 6, 0}, {EC_GF_OP_XOR2, 2, 5, 0},
+    {EC_GF_OP_XOR2, 0, 1, 0}, {EC_GF_OP_XOR2, 3, 1, 0},
+    {EC_GF_OP_XOR2, 0, 7, 0}, {EC_GF_OP_XOR2, 1, 4, 0},
+    {EC_GF_OP_XOR2, 7, 4, 0}, {EC_GF_OP_XOR2, 4, 2, 0},
+    {EC_GF_OP_XOR2, 6, 4, 0}, {EC_GF_OP_END, 0, 0, 0}};
+
+static ec_gf_mul_t ec_gf8_mul_AA = {8,
+                                    {
+                                        0,
+                                        4,
+                                        5,
+                                        3,
+                                        6,
+                                        7,
+                                        1,
+                                        2,
+                                    },
+                                    ec_gf8_mul_AA_ops};
+
+static ec_gf_op_t ec_gf8_mul_AB_ops[] = {
+    {EC_GF_OP_COPY, 8, 0, 0}, {EC_GF_OP_XOR2, 0, 1, 0},
+    {EC_GF_OP_XOR2, 1, 4, 0}, {EC_GF_OP_XOR2, 1, 5, 0},
+    {EC_GF_OP_XOR2, 6, 1, 0}, {EC_GF_OP_XOR2, 2, 0, 0},
+    {EC_GF_OP_COPY, 9, 6, 0}, {EC_GF_OP_XOR2, 6, 2, 0},
+    {EC_GF_OP_XOR2, 4, 6, 0}, {EC_GF_OP_XOR2, 7, 4, 0},
+    {EC_GF_OP_XOR2, 8, 7, 0}, {EC_GF_OP_XOR2, 3, 8, 0},
+    {EC_GF_OP_XOR2, 5, 3, 0}, {EC_GF_OP_XOR2, 6, 3, 0},
+    {EC_GF_OP_XOR2, 2, 5, 0}, {EC_GF_OP_XOR3, 3, 9, 7},
+    {EC_GF_OP_END, 0, 0, 0}};
+
+static ec_gf_mul_t ec_gf8_mul_AB = {10,
+                                    {
+                                        2,
+                                        3,
+                                        8,
+                                        0,
+                                        5,
+                                        6,
+                                        1,
+                                        4,
+                                        7,
+                                        9,
+                                    },
+                                    ec_gf8_mul_AB_ops};
+
+static ec_gf_op_t ec_gf8_mul_AC_ops[] = {
+    {EC_GF_OP_XOR2, 5, 0, 0}, {EC_GF_OP_XOR2, 0, 2, 0},
+    {EC_GF_OP_XOR2, 2, 4, 0}, {EC_GF_OP_XOR2, 4, 7, 0},
+    {EC_GF_OP_XOR2, 7, 0, 0}, {EC_GF_OP_XOR2, 0, 3, 0},
+    {EC_GF_OP_XOR2, 0, 6, 0}, {EC_GF_OP_XOR2, 3, 1, 0},
+    {EC_GF_OP_XOR2, 6, 1, 0}, {EC_GF_OP_XOR2, 1, 5, 0},
+    {EC_GF_OP_END, 0, 0, 0}};
+
+static ec_gf_mul_t ec_gf8_mul_AC = {8,
+                                    {
+                                        3,
+                                        2,
+                                        1,
+                                        0,
+                                        4,
+                                        5,
+                                        6,
+                                        7,
+                                    },
+                                    ec_gf8_mul_AC_ops};
+
+static ec_gf_op_t ec_gf8_mul_AD_ops[] = {
+    {EC_GF_OP_XOR3, 8, 1, 2}, {EC_GF_OP_XOR2, 2, 0, 0},
+    {EC_GF_OP_XOR2, 3, 1, 0}, {EC_GF_OP_XOR2, 5, 0, 0},
+    {EC_GF_OP_XOR2, 3, 0, 0}, {EC_GF_OP_XOR2, 4, 8, 0},
+    {EC_GF_OP_XOR2, 5, 8, 0}, {EC_GF_OP_XOR2, 6, 2, 0},
+    {EC_GF_OP_END, 0, 0, 0}};
+
+static ec_gf_mul_t ec_gf8_mul_AD = {9,
+                                    {
+                                        3,
+                                        4,
+                                        5,
+                                        6,
+                                        7,
+                                        0,
+                                        1,
+                                        2,
+                                        8,
+                                    },
+                                    ec_gf8_mul_AD_ops};
+
+static ec_gf_op_t ec_gf8_mul_AE_ops[] = {
+    {EC_GF_OP_XOR2, 6, 5, 0}, {EC_GF_OP_XOR2, 5, 0, 0},
+    {EC_GF_OP_COPY, 8, 5, 0}, {EC_GF_OP_XOR2, 5, 7, 0},
+    {EC_GF_OP_XOR2, 7, 1, 0}, {EC_GF_OP_XOR2, 0, 2, 0},
+    {EC_GF_OP_XOR2, 1, 6, 0}, {EC_GF_OP_XOR2, 7, 3, 0},
+    {EC_GF_OP_XOR2, 6, 5, 0}, {EC_GF_OP_XOR2, 2, 6, 0},
+    {EC_GF_OP_XOR2, 0, 4, 0}, {EC_GF_OP_XOR2, 6, 3, 0},
+    {EC_GF_OP_XOR2, 3, 4, 0}, {EC_GF_OP_XOR2, 4, 8, 0},
+    {EC_GF_OP_END, 0, 0, 0}};
+
+static ec_gf_mul_t ec_gf8_mul_AE = {9,
+                                    {
+                                        7,
+                                        0,
+                                        5,
+                                        6,
+                                        3,
+                                        4,
+                                        1,
+                                        2,
+                                        8,
+                                    },
+                                    ec_gf8_mul_AE_ops};
+
+static ec_gf_op_t ec_gf8_mul_AF_ops[] = {
+    {EC_GF_OP_XOR2, 4, 0, 0}, {EC_GF_OP_XOR2, 6, 0, 0},
+    {EC_GF_OP_XOR2, 0, 1, 0}, {EC_GF_OP_XOR2, 0, 7, 0},
+    {EC_GF_OP_XOR2, 5, 1, 0}, {EC_GF_OP_XOR2, 7, 6, 0},
+    {EC_GF_OP_XOR2, 1, 2, 0}, {EC_GF_OP_XOR2, 6, 2, 0},
+    {EC_GF_OP_XOR2, 2, 5, 0}, {EC_GF_OP_XOR2, 1, 4, 0},
+    {EC_GF_OP_XOR2, 2, 0, 0}, {EC_GF_OP_XOR2, 0, 3, 0},
+    {EC_GF_OP_END, 0, 0, 0}};
+
+static ec_gf_mul_t ec_gf8_mul_AF = {8,
+                                    {
+                                        0,
+                                        1,
+                                        2,
+                                        7,
+                                        3,
+                                        4,
+                                        5,
+                                        6,
+                                    },
+                                    ec_gf8_mul_AF_ops};
+
+static ec_gf_op_t ec_gf8_mul_B0_ops[] = {
+    {EC_GF_OP_XOR2, 4, 1, 0}, {EC_GF_OP_XOR2, 3, 6, 0},
+    {EC_GF_OP_XOR2, 7, 4, 0}, {EC_GF_OP_XOR2, 4, 3, 0},
+    {EC_GF_OP_XOR2, 2, 4, 0}, {EC_GF_OP_XOR2, 1, 0, 0},
+    {EC_GF_OP_XOR2, 4, 5, 0}, {EC_GF_OP_XOR2, 6, 2, 0},
+    {EC_GF_OP_XOR2, 1, 6, 0}, {EC_GF_OP_XOR2, 0, 4, 0},
+    {EC_GF_OP_XOR2, 5, 1, 0}, {EC_GF_OP_XOR2, 1, 7, 0},
+    {EC_GF_OP_XOR2, 0, 1, 0}, {EC_GF_OP_XOR2, 3, 1, 0},
+    {EC_GF_OP_END, 0, 0, 0}};
+
+static ec_gf_mul_t ec_gf8_mul_B0 = {8,
+                                    {
+                                        4,
+                                        0,
+                                        7,
+                                        2,
+                                        3,
+                                        1,
+                                        6,
+                                        5,
+                                    },
+                                    ec_gf8_mul_B0_ops};
+
+static ec_gf_op_t ec_gf8_mul_B1_ops[] = {
+    {EC_GF_OP_XOR2, 4, 1, 0}, {EC_GF_OP_COPY, 8, 4, 0},
+    {EC_GF_OP_XOR2, 2, 7, 0}, {EC_GF_OP_XOR2, 4, 2, 0},
+    {EC_GF_OP_XOR2, 2, 3, 0}, {EC_GF_OP_XOR2, 6, 4, 0},
+    {EC_GF_OP_XOR2, 1, 0, 0}, {EC_GF_OP_XOR2, 2, 5, 0},
+    {EC_GF_OP_XOR2, 0, 6, 0}, {EC_GF_OP_XOR2, 7, 6, 0},
+    {EC_GF_OP_XOR2, 1, 2, 0}, {EC_GF_OP_XOR2, 6, 5, 0},
+    {EC_GF_OP_XOR2, 3, 7, 0}, {EC_GF_OP_XOR2, 2, 0, 0},
+    {EC_GF_OP_XOR3, 5, 8, 1}, {EC_GF_OP_END, 0, 0, 0}};
+
+static ec_gf_mul_t ec_gf8_mul_B1 = {9,
+                                    {
+                                        2,
+                                        6,
+                                        4,
+                                        7,
+                                        0,
+                                        1,
+                                        3,
+                                        5,
+                                        8,
+                                    },
+                                    ec_gf8_mul_B1_ops};
+
+static ec_gf_op_t ec_gf8_mul_B2_ops[] = {
+    {EC_GF_OP_XOR2, 1, 3, 0}, {EC_GF_OP_XOR2, 2, 1, 0},
+    {EC_GF_OP_XOR2, 0, 2, 0}, {EC_GF_OP_XOR2, 6, 0, 0},
+    {EC_GF_OP_XOR2, 1, 6, 0}, {EC_GF_OP_XOR3, 8, 4, 5},
+    {EC_GF_OP_XOR2, 2, 8, 0}, {EC_GF_OP_XOR2, 8, 1, 0},
+    {EC_GF_OP_XOR2, 7, 8, 0}, {EC_GF_OP_XOR2, 3, 8, 0},
+    {EC_GF_OP_XOR2, 0, 7, 0}, {EC_GF_OP_XOR2, 5, 0, 0},
+    {EC_GF_OP_XOR2, 1, 5, 0}, {EC_GF_OP_END, 0, 0, 0}};
+
+static ec_gf_mul_t ec_gf8_mul_B2 = {9,
+                                    {
+                                        0,
+                                        7,
+                                        4,
+                                        5,
+                                        6,
+                                        1,
+                                        2,
+                                        3,
+                                        8,
+                                    },
+                                    ec_gf8_mul_B2_ops};
+
+static ec_gf_op_t ec_gf8_mul_B3_ops[] = {
+    {EC_GF_OP_XOR2, 5, 0, 0}, {EC_GF_OP_COPY, 9, 5, 0},
+    {EC_GF_OP_XOR2, 4, 2, 0}, {EC_GF_OP_XOR3, 8, 6, 4},
+    {EC_GF_OP_XOR2, 5, 3, 0}, {EC_GF_OP_XOR2, 8, 5, 0},
+    {EC_GF_OP_XOR2, 0, 1, 0}, {EC_GF_OP_XOR2, 7, 8, 0},
+    {EC_GF_OP_XOR2, 0, 8, 0}, {EC_GF_OP_XOR2, 1, 7, 0},
+    {EC_GF_OP_XOR2, 2, 1, 0}, {EC_GF_OP_XOR2, 6, 1, 0},
+    {EC_GF_OP_XOR2, 3, 1, 0}, {EC_GF_OP_XOR2, 5, 2, 0},
+    {EC_GF_OP_XOR3, 1, 9, 0}, {EC_GF_OP_END, 0, 0, 0}};
+
+static ec_gf_mul_t ec_gf8_mul_B3 = {10,
+                                    {
+                                        2,
+                                        3,
+                                        4,
+                                        5,
+                                        1,
+                                        6,
+                                        0,
+                                        7,
+                                        8,
+                                        9,
+                                    },
+                                    ec_gf8_mul_B3_ops};
+
+static ec_gf_op_t ec_gf8_mul_B4_ops[] = {
+    {EC_GF_OP_XOR2, 3, 1, 0}, {EC_GF_OP_XOR2, 4, 3, 0},
+    {EC_GF_OP_XOR2, 1, 0, 0}, {EC_GF_OP_XOR2, 3, 2, 0},
+    {EC_GF_OP_XOR2, 2, 1, 0}, {EC_GF_OP_XOR2, 5, 4, 0},
+    {EC_GF_OP_XOR2, 4, 2, 0}, {EC_GF_OP_XOR2, 6, 3, 0},
+    {EC_GF_OP_XOR2, 7, 4, 0}, {EC_GF_OP_XOR2, 0, 7, 0},
+    {EC_GF_OP_XOR2, 7, 6, 0}, {EC_GF_OP_XOR2, 6, 5, 0},
+    {EC_GF_OP_END, 0, 0, 0}};
+
+static ec_gf_mul_t ec_gf8_mul_B4 = {8,
+                                    {
+                                        5,
+                                        6,
+                                        7,
+                                        0,
+                                        1,
+                                        2,
+                                        3,
+                                        4,
+                                    },
+                                    ec_gf8_mul_B4_ops};
+
+static ec_gf_op_t ec_gf8_mul_B5_ops[] = {
+    {EC_GF_OP_XOR2, 1, 0, 0}, {EC_GF_OP_XOR2, 0, 2, 0},
+    {EC_GF_OP_XOR2, 0, 3, 0}, {EC_GF_OP_XOR2, 6, 0, 0},
+    {EC_GF_OP_COPY, 8, 6, 0}, {EC_GF_OP_XOR2, 6, 1, 0},
+    {EC_GF_OP_XOR2, 1, 4, 0}, {EC_GF_OP_XOR2, 4, 2, 0},
+    {EC_GF_OP_XOR2, 5, 1, 0}, {EC_GF_OP_XOR2, 7, 4, 0},
+    {EC_GF_OP_XOR2, 3, 5, 0}, {EC_GF_OP_XOR2, 0, 7, 0},
+    {EC_GF_OP_XOR2, 5, 4, 0}, {EC_GF_OP_XOR2, 2, 0, 0},
+    {EC_GF_OP_XOR3, 4, 8, 3}, {EC_GF_OP_XOR2, 0, 6, 0},
+    {EC_GF_OP_END, 0, 0, 0}};
+
+static ec_gf_mul_t ec_gf8_mul_B5 = {9,
+                                    {
+                                        3,
+                                        4,
+                                        0,
+                                        7,
+                                        1,
+                                        5,
+                                        6,
+                                        2,
+                                        8,
+                                    },
+                                    ec_gf8_mul_B5_ops};
+
+static ec_gf_op_t ec_gf8_mul_B6_ops[] = {
+    {EC_GF_OP_XOR2, 7, 1, 0}, {EC_GF_OP_XOR2, 0, 4, 0},
+    {EC_GF_OP_XOR2, 1, 2, 0}, {EC_GF_OP_XOR2, 4, 3, 0},
+    {EC_GF_OP_XOR2, 2, 3, 0}, {EC_GF_OP_XOR2, 7, 4, 0},
+    {EC_GF_OP_XOR2, 3, 5, 0}, {EC_GF_OP_XOR2, 5, 7, 0},
+    {EC_GF_OP_XOR2, 6, 0, 0}, {EC_GF_OP_XOR2, 7, 0, 0},
+    {EC_GF_OP_XOR2, 0, 1, 0}, {EC_GF_OP_XOR2, 2, 6, 0},
+    {EC_GF_OP_XOR2, 1, 3, 0}, {EC_GF_OP_XOR2, 3, 2, 0},
+    {EC_GF_OP_END, 0, 0, 0}};
+
+static ec_gf_mul_t ec_gf8_mul_B6 = {8,
+                                    {
+                                        5,
+                                        3,
+                                        6,
+                                        4,
+                                        7,
+                                        0,
+                                        1,
+                                        2,
+                                    },
+                                    ec_gf8_mul_B6_ops};
+
+static ec_gf_op_t ec_gf8_mul_B7_ops[] = {
+    {EC_GF_OP_XOR2, 2, 1, 0}, {EC_GF_OP_XOR2, 0, 4, 0},
+    {EC_GF_OP_XOR2, 2, 6, 0}, {EC_GF_OP_XOR2, 0, 2, 0},
+    {EC_GF_OP_XOR2, 1, 0, 0}, {EC_GF_OP_XOR2, 0, 5, 0},
+    {EC_GF_OP_XOR2, 7, 1, 0}, {EC_GF_OP_XOR2, 3, 7, 0},
+    {EC_GF_OP_XOR2, 6, 0, 0}, {EC_GF_OP_XOR2, 2, 3, 0},
+    {EC_GF_OP_XOR2, 5, 2, 0}, {EC_GF_OP_XOR2, 7, 5, 0},
+    {EC_GF_OP_END, 0, 0, 0}};
+
+static ec_gf_mul_t ec_gf8_mul_B7 = {8,
+                                    {
+                                        5,
+                                        0,
+                                        1,
+                                        4,
+                                        2,
+                                        6,
+                                        7,
+                                        3,
+                                    },
+                                    ec_gf8_mul_B7_ops};
+
+static ec_gf_op_t ec_gf8_mul_B8_ops[] = {
+    {EC_GF_OP_XOR2, 2, 5, 0}, {EC_GF_OP_XOR2, 7, 2, 0},
+    {EC_GF_OP_XOR2, 2, 0, 0}, {EC_GF_OP_XOR2, 0, 1, 0},
+    {EC_GF_OP_XOR2, 6, 3, 0}, {EC_GF_OP_XOR2, 3, 2, 0},
+    {EC_GF_OP_XOR2, 1, 4, 0}, {EC_GF_OP_XOR2, 0, 6, 0},
+    {EC_GF_OP_XOR2, 4, 7, 0}, {EC_GF_OP_XOR2, 5, 1, 0},
+    {EC_GF_OP_XOR2, 6, 1, 0}, {EC_GF_OP_XOR2, 7, 5, 0},
+    {EC_GF_OP_XOR2, 1, 3, 0}, {EC_GF_OP_END, 0, 0, 0}};
+
+static ec_gf_mul_t ec_gf8_mul_B8 = {8,
+                                    {
+                                        6,
+                                        4,
+                                        5,
+                                        1,
+                                        2,
+                                        0,
+                                        7,
+                                        3,
+                                    },
+                                    ec_gf8_mul_B8_ops};
+
+static ec_gf_op_t ec_gf8_mul_B9_ops[] = {
+    {EC_GF_OP_COPY, 8, 0, 0}, {EC_GF_OP_XOR2, 0, 1, 0},
+    {EC_GF_OP_XOR2, 2, 5, 0}, {EC_GF_OP_XOR2, 0, 4, 0},
+    {EC_GF_OP_XOR2, 2, 0, 0}, {EC_GF_OP_XOR2, 3, 0, 0},
+    {EC_GF_OP_XOR3, 0, 8, 2}, {EC_GF_OP_XOR2, 6, 3, 0},
+    {EC_GF_OP_XOR2, 1, 2, 0}, {EC_GF_OP_XOR2, 7, 0, 0},
+    {EC_GF_OP_XOR2, 5, 6, 0}, {EC_GF_OP_XOR2, 3, 7, 0},
+    {EC_GF_OP_XOR2, 4, 5, 0}, {EC_GF_OP_XOR2, 5, 3, 0},
+    {EC_GF_OP_END, 0, 0, 0}};
+
+static ec_gf_mul_t ec_gf8_mul_B9 = {9,
+                                    {
+                                        6,
+                                        7,
+                                        0,
+                                        2,
+                                        1,
+                                        4,
+                                        5,
+                                        3,
+                                        8,
+                                    },
+                                    ec_gf8_mul_B9_ops};
+
+static ec_gf_op_t ec_gf8_mul_BA_ops[] = {
+    {EC_GF_OP_XOR2, 2, 0, 0}, {EC_GF_OP_XOR2, 5, 7, 0},
+    {EC_GF_OP_XOR2, 4, 5, 0}, {EC_GF_OP_XOR2, 3, 2, 0},
+    {EC_GF_OP_XOR2, 2, 4, 0}, {EC_GF_OP_XOR2, 5, 3, 0},
+    {EC_GF_OP_XOR2, 1, 2, 0}, {EC_GF_OP_XOR2, 6, 5, 0},
+    {EC_GF_OP_XOR2, 0, 1, 0}, {EC_GF_OP_XOR2, 7, 6, 0},
+    {EC_GF_OP_XOR2, 3, 0, 0}, {EC_GF_OP_XOR2, 6, 0, 0},
+    {EC_GF_OP_XOR2, 1, 7, 0}, {EC_GF_OP_END, 0, 0, 0}};
+
+static ec_gf_mul_t ec_gf8_mul_BA = {8,
+                                    {
+                                        1,
+                                        2,
+                                        4,
+                                        3,
+                                        5,
+                                        6,
+                                        0,
+                                        7,
+                                    },
+                                    ec_gf8_mul_BA_ops};
+
+static ec_gf_op_t ec_gf8_mul_BB_ops[] = {
+    {EC_GF_OP_XOR2, 3, 6, 0}, {EC_GF_OP_COPY, 8, 3, 0},
+    {EC_GF_OP_XOR2, 1, 0, 0}, {EC_GF_OP_XOR2, 3, 1, 0},
+    {EC_GF_OP_XOR2, 4, 3, 0}, {EC_GF_OP_XOR2, 8, 5, 0},
+    {EC_GF_OP_XOR2, 7, 4, 0}, {EC_GF_OP_XOR2, 5, 4, 0},
+    {EC_GF_OP_XOR2, 8, 7, 0}, {EC_GF_OP_XOR2, 2, 8, 0},
+    {EC_GF_OP_XOR2, 0, 2, 0}, {EC_GF_OP_XOR2, 1, 2, 0},
+    {EC_GF_OP_XOR2, 6, 0, 0}, {EC_GF_OP_XOR2, 4, 0, 0},
+    {EC_GF_OP_XOR2, 3, 6, 0}, {EC_GF_OP_END, 0, 0, 0}};
+
+static ec_gf_mul_t ec_gf8_mul_BB = {9,
+                                    {
+                                        7,
+                                        2,
+                                        1,
+                                        8,
+                                        3,
+                                        5,
+                                        6,
+                                        4,
+                                        0,
+                                    },
+                                    ec_gf8_mul_BB_ops};
+
+static ec_gf_op_t ec_gf8_mul_BC_ops[] = {
+    {EC_GF_OP_COPY, 8, 1, 0}, {EC_GF_OP_XOR2, 8, 2, 0},
+    {EC_GF_OP_XOR2, 6, 0, 0}, {EC_GF_OP_XOR2, 2, 3, 0},
+    {EC_GF_OP_XOR2, 6, 7, 0}, {EC_GF_OP_XOR2, 0, 2, 0},
+    {EC_GF_OP_XOR2, 4, 2, 0}, {EC_GF_OP_XOR2, 1, 6, 0},
+    {EC_GF_OP_XOR2, 7, 8, 0}, {EC_GF_OP_XOR3, 2, 8, 4},
+    {EC_GF_OP_XOR2, 5, 6, 0}, {EC_GF_OP_XOR2, 4, 5, 0},
+    {EC_GF_OP_XOR2, 3, 4, 0}, {EC_GF_OP_XOR2, 6, 3, 0},
+    {EC_GF_OP_XOR2, 3, 7, 0}, {EC_GF_OP_END, 0, 0, 0}};
+
+static ec_gf_mul_t ec_gf8_mul_BC = {9,
+                                    {
+                                        2,
+                                        6,
+                                        3,
+                                        4,
+                                        5,
+                                        1,
+                                        7,
+                                        0,
+                                        8,
+                                    },
+                                    ec_gf8_mul_BC_ops};
+
+static ec_gf_op_t ec_gf8_mul_BD_ops[] = {
+    {EC_GF_OP_XOR2, 3, 0, 0}, {EC_GF_OP_XOR2, 3, 1, 0},
+    {EC_GF_OP_XOR2, 1, 2, 0}, {EC_GF_OP_XOR2, 7, 1, 0},
+    {EC_GF_OP_XOR2, 1, 4, 0}, {EC_GF_OP_XOR2, 4, 3, 0},
+    {EC_GF_OP_XOR2, 5, 1, 0}, {EC_GF_OP_XOR2, 3, 7, 0},
+    {EC_GF_OP_XOR2, 0, 6, 0}, {EC_GF_OP_XOR2, 0, 5, 0},
+    {EC_GF_OP_XOR2, 6, 7, 0}, {EC_GF_OP_XOR2, 7, 0, 0},
+    {EC_GF_OP_XOR2, 2, 7, 0}, {EC_GF_OP_XOR2, 1, 2, 0},
+    {EC_GF_OP_END, 0, 0, 0}};
+
+static ec_gf_mul_t ec_gf8_mul_BD = {8,
+                                    {
+                                        4,
+                                        5,
+                                        0,
+                                        2,
+                                        7,
+                                        1,
+                                        6,
+                                        3,
+                                    },
+                                    ec_gf8_mul_BD_ops};
+
+static ec_gf_op_t ec_gf8_mul_BE_ops[] = {
+    {EC_GF_OP_XOR2, 0, 3, 0}, {EC_GF_OP_XOR2, 0, 6, 0},
+    {EC_GF_OP_XOR2, 1, 5, 0}, {EC_GF_OP_XOR2, 5, 0, 0},
+    {EC_GF_OP_XOR2, 4, 5, 0}, {EC_GF_OP_XOR2, 3, 4, 0},
+    {EC_GF_OP_XOR2, 7, 3, 0}, {EC_GF_OP_XOR2, 3, 2, 0},
+    {EC_GF_OP_XOR2, 1, 7, 0}, {EC_GF_OP_XOR2, 2, 0, 0},
+    {EC_GF_OP_XOR2, 6, 3, 0}, {EC_GF_OP_XOR2, 0, 1, 0},
+    {EC_GF_OP_XOR2, 3, 1, 0}, {EC_GF_OP_END, 0, 0, 0}};
+
+static ec_gf_mul_t ec_gf8_mul_BE = {8,
+                                    {
+                                        0,
+                                        6,
+                                        7,
+                                        4,
+                                        5,
+                                        1,
+                                        3,
+                                        2,
+                                    },
+                                    ec_gf8_mul_BE_ops};
+
+static ec_gf_op_t ec_gf8_mul_BF_ops[] = {
+    {EC_GF_OP_XOR2, 7, 1, 0}, {EC_GF_OP_XOR2, 6, 7, 0},
+    {EC_GF_OP_XOR2, 5, 6, 0}, {EC_GF_OP_XOR2, 4, 5, 0},
+    {EC_GF_OP_XOR2, 0, 4, 0}, {EC_GF_OP_XOR2, 1, 2, 0},
+    {EC_GF_OP_XOR2, 7, 0, 0}, {EC_GF_OP_XOR2, 2, 5, 0},
+    {EC_GF_OP_XOR2, 1, 0, 0}, {EC_GF_OP_XOR2, 6, 1, 0},
+    {EC_GF_OP_XOR2, 4, 6, 0}, {EC_GF_OP_XOR2, 4, 3, 0},
+    {EC_GF_OP_XOR2, 3, 7, 0}, {EC_GF_OP_XOR2, 5, 3, 0},
+    {EC_GF_OP_END, 0, 0, 0}};
+
+static ec_gf_mul_t ec_gf8_mul_BF = {8,
+                                    {
+                                        5,
+                                        6,
+                                        1,
+                                        7,
+                                        3,
+                                        0,
+                                        2,
+                                        4,
+                                    },
+                                    ec_gf8_mul_BF_ops};
+
+static ec_gf_op_t ec_gf8_mul_C0_ops[] = {
+    {EC_GF_OP_XOR2, 4, 1, 0}, {EC_GF_OP_XOR2, 6, 3, 0},
+    {EC_GF_OP_XOR2, 7, 4, 0}, {EC_GF_OP_XOR2, 4, 6, 0},
+    {EC_GF_OP_XOR2, 5, 2, 0}, {EC_GF_OP_XOR2, 2, 6, 0},
+    {EC_GF_OP_XOR2, 3, 5, 0}, {EC_GF_OP_XOR2, 6, 0, 0},
+    {EC_GF_OP_XOR2, 1, 5, 0}, {EC_GF_OP_XOR2, 3, 7, 0},
+    {EC_GF_OP_XOR2, 0, 7, 0}, {EC_GF_OP_END, 0, 0, 0}};
+
+static ec_gf_mul_t ec_gf8_mul_C0 = {8,
+                                    {
+                                        1,
+                                        2,
+                                        3,
+                                        4,
+                                        7,
+                                        5,
+                                        6,
+                                        0,
+                                    },
+                                    ec_gf8_mul_C0_ops};
+
+static ec_gf_op_t ec_gf8_mul_C1_ops[] = {
+    {EC_GF_OP_XOR3, 8, 1, 2}, {EC_GF_OP_XOR2, 8, 3, 0},
+    {EC_GF_OP_XOR2, 4, 1, 0}, {EC_GF_OP_XOR2, 3, 0, 0},
+    {EC_GF_OP_XOR2, 0, 4, 0}, {EC_GF_OP_XOR2, 1, 7, 0},
+    {EC_GF_OP_XOR2, 5, 3, 0}, {EC_GF_OP_XOR2, 7, 0, 0},
+    {EC_GF_OP_XOR2, 4, 6, 0}, {EC_GF_OP_XOR2, 7, 5, 0},
+    {EC_GF_OP_XOR2, 6, 8, 0}, {EC_GF_OP_XOR2, 5, 8, 0},
+    {EC_GF_OP_END, 0, 0, 0}};
+
+static ec_gf_mul_t ec_gf8_mul_C1 = {9,
+                                    {
+                                        5,
+                                        6,
+                                        7,
+                                        4,
+                                        1,
+                                        2,
+                                        3,
+                                        0,
+                                        8,
+                                    },
+                                    ec_gf8_mul_C1_ops};
+
+static ec_gf_op_t ec_gf8_mul_C2_ops[] = {
+    {EC_GF_OP_XOR2, 0, 2, 0}, {EC_GF_OP_XOR2, 1, 3, 0},
+    {EC_GF_OP_XOR2, 0, 3, 0}, {EC_GF_OP_XOR2, 1, 4, 0},
+    {EC_GF_OP_XOR2, 6, 0, 0}, {EC_GF_OP_XOR2, 4, 2, 0},
+    {EC_GF_OP_XOR2, 2, 6, 0}, {EC_GF_OP_XOR2, 4, 5, 0},
+    {EC_GF_OP_XOR2, 5, 2, 0}, {EC_GF_OP_XOR2, 7, 1, 0},
+    {EC_GF_OP_XOR2, 3, 4, 0}, {EC_GF_OP_XOR2, 2, 7, 0},
+    {EC_GF_OP_XOR2, 7, 3, 0}, {EC_GF_OP_XOR2, 0, 2, 0},
+    {EC_GF_OP_END, 0, 0, 0}};
+
+static ec_gf_mul_t ec_gf8_mul_C2 = {8,
+                                    {
+                                        7,
+                                        6,
+                                        3,
+                                        0,
+                                        1,
+                                        4,
+                                        5,
+                                        2,
+                                    },
+                                    ec_gf8_mul_C2_ops};
+
+static ec_gf_op_t ec_gf8_mul_C3_ops[] = {
+    {EC_GF_OP_COPY, 8, 0, 0}, {EC_GF_OP_XOR2, 0, 1, 0},
+    {EC_GF_OP_XOR2, 0, 2, 0}, {EC_GF_OP_XOR2, 6, 0, 0},
+    {EC_GF_OP_XOR2, 2, 4, 0}, {EC_GF_OP_XOR2, 7, 0, 0},
+    {EC_GF_OP_XOR3, 0, 2, 6}, {EC_GF_OP_XOR2, 6, 3, 0},
+    {EC_GF_OP_XOR3, 9, 1, 0}, {EC_GF_OP_XOR2, 1, 3, 0},
+    {EC_GF_OP_XOR2, 3, 5, 0}, {EC_GF_OP_XOR2, 5, 7, 0},
+    {EC_GF_OP_XOR2, 4, 3, 0}, {EC_GF_OP_XOR2, 7, 9, 0},
+    {EC_GF_OP_XOR2, 3, 8, 0}, {EC_GF_OP_END, 0, 0, 0}};
+
+static ec_gf_mul_t ec_gf8_mul_C3 = {10,
+                                    {
+                                        5,
+                                        6,
+                                        4,
+                                        7,
+                                        1,
+                                        2,
+                                        3,
+                                        0,
+                                        8,
+                                        9,
+                                    },
+                                    ec_gf8_mul_C3_ops};
+
+static ec_gf_op_t ec_gf8_mul_C4_ops[] = {
+    {EC_GF_OP_XOR2, 3, 7, 0}, {EC_GF_OP_XOR2, 2, 3, 0},
+    {EC_GF_OP_XOR2, 3, 4, 0}, {EC_GF_OP_XOR2, 4, 5, 0},
+    {EC_GF_OP_XOR2, 5, 2, 0}, {EC_GF_OP_XOR2, 1, 0, 0},
+    {EC_GF_OP_XOR2, 2, 6, 0}, {EC_GF_OP_XOR2, 1, 4, 0},
+    {EC_GF_OP_XOR2, 6, 7, 0}, {EC_GF_OP_XOR2, 0, 3, 0},
+    {EC_GF_OP_XOR2, 7, 1, 0}, {EC_GF_OP_XOR2, 1, 2, 0},
+    {EC_GF_OP_XOR2, 6, 0, 0}, {EC_GF_OP_XOR2, 0, 1, 0},
+    {EC_GF_OP_XOR2, 4, 0, 0}, {EC_GF_OP_END, 0, 0, 0}};
+
+static ec_gf_mul_t ec_gf8_mul_C4 = {8,
+                                    {
+                                        0,
+                                        2,
+                                        1,
+                                        3,
+                                        4,
+                                        5,
+                                        6,
+                                        7,
+                                    },
+                                    ec_gf8_mul_C4_ops};
+
+static ec_gf_op_t ec_gf8_mul_C5_ops[] = {
+    {EC_GF_OP_XOR2, 0, 4, 0}, {EC_GF_OP_XOR2, 5, 0, 0},
+    {EC_GF_OP_XOR2, 6, 1, 0}, {EC_GF_OP_XOR2, 0, 3, 0},
+    {EC_GF_OP_XOR2, 1, 5, 0}, {EC_GF_OP_XOR2, 6, 2, 0},
+    {EC_GF_OP_XOR2, 3, 7, 0}, {EC_GF_OP_XOR2, 5, 6, 0},
+    {EC_GF_OP_XOR2, 7, 4, 0}, {EC_GF_OP_XOR2, 2, 3, 0},
+    {EC_GF_OP_XOR2, 4, 5, 0}, {EC_GF_OP_XOR2, 3, 6, 0},
+    {EC_GF_OP_XOR2, 5, 2, 0}, {EC_GF_OP_END, 0, 0, 0}};
+
+static ec_gf_mul_t ec_gf8_mul_C5 = {8,
+                                    {
+                                        4,
+                                        3,
+                                        5,
+                                        7,
+                                        6,
+                                        2,
+                                        0,
+                                        1,
+                                    },
+                                    ec_gf8_mul_C5_ops};
+
+static ec_gf_op_t ec_gf8_mul_C6_ops[] = {
+    {EC_GF_OP_XOR2, 4, 3, 0}, {EC_GF_OP_COPY, 8, 4, 0},
+    {EC_GF_OP_XOR2, 4, 2, 0}, {EC_GF_OP_XOR3, 9, 5, 4},
+    {EC_GF_OP_XOR2, 6, 9, 0}, {EC_GF_OP_XOR2, 0, 6, 0},
+    {EC_GF_OP_XOR2, 1, 7, 0}, {EC_GF_OP_XOR2, 2, 0, 0},
+    {EC_GF_OP_XOR2, 7, 9, 0}, {EC_GF_OP_XOR2, 6, 1, 0},
+    {EC_GF_OP_XOR2, 3, 2, 0}, {EC_GF_OP_XOR2, 5, 6, 0},
+    {EC_GF_OP_XOR2, 1, 3, 0}, {EC_GF_OP_XOR2, 6, 8, 0},
+    {EC_GF_OP_XOR2, 3, 7, 0}, {EC_GF_OP_END, 0, 0, 0}};
+
+static ec_gf_mul_t ec_gf8_mul_C6 = {10,
+                                    {
+                                        6,
+                                        3,
+                                        0,
+                                        4,
+                                        5,
+                                        7,
+                                        2,
+                                        1,
+                                        8,
+                                        9,
+                                    },
+                                    ec_gf8_mul_C6_ops};
+
+static ec_gf_op_t ec_gf8_mul_C7_ops[] = {
+    {EC_GF_OP_XOR2, 5, 0, 0}, {EC_GF_OP_XOR2, 5, 3, 0},
+    {EC_GF_OP_XOR2, 2, 4, 0}, {EC_GF_OP_XOR2, 4, 5, 0},
+    {EC_GF_OP_XOR2, 1, 3, 0}, {EC_GF_OP_XOR2, 6, 4, 0},
+    {EC_GF_OP_XOR2, 7, 2, 0}, {EC_GF_OP_XOR2, 1, 6, 0},
+    {EC_GF_OP_XOR2, 3, 7, 0}, {EC_GF_OP_XOR2, 7, 1, 0},
+    {EC_GF_OP_XOR2, 5, 7, 0}, {EC_GF_OP_XOR2, 0, 5, 0},
+    {EC_GF_OP_END, 0, 0, 0}};
+
+static ec_gf_mul_t ec_gf8_mul_C7 = {8,
+                                    {
+                                        7,
+                                        0,
+                                        6,
+                                        2,
+                                        5,
+                                        3,
+                                        4,
+                                        1,
+                                    },
+                                    ec_gf8_mul_C7_ops};
+
+static ec_gf_op_t ec_gf8_mul_C8_ops[] = {
+    {EC_GF_OP_XOR2, 6, 5, 0}, {EC_GF_OP_XOR2, 5, 0, 0},
+    {EC_GF_OP_XOR2, 0, 1, 0}, {EC_GF_OP_XOR2, 0, 7, 0},
+    {EC_GF_OP_XOR2, 7, 6, 0}, {EC_GF_OP_XOR2, 6, 4, 0},
+    {EC_GF_OP_XOR2, 4, 3, 0}, {EC_GF_OP_XOR2, 4, 1, 0},
+    {EC_GF_OP_XOR2, 3, 2, 0}, {EC_GF_OP_XOR2, 1, 2, 0},
+    {EC_GF_OP_XOR2, 2, 4, 0}, {EC_GF_OP_XOR2, 4, 5, 0},
+    {EC_GF_OP_XOR2, 5, 7, 0}, {EC_GF_OP_END, 0, 0, 0}};
+
+static ec_gf_mul_t ec_gf8_mul_C8 = {8,
+                                    {
+                                        1,
+                                        3,
+                                        2,
+                                        4,
+                                        6,
+                                        7,
+                                        5,
+                                        0,
+                                    },
+                                    ec_gf8_mul_C8_ops};
+
+static ec_gf_op_t ec_gf8_mul_C9_ops[] = {
+    {EC_GF_OP_XOR2, 1, 0, 0}, {EC_GF_OP_XOR2, 3, 0, 0},
+    {EC_GF_OP_XOR2, 0, 7, 0}, {EC_GF_OP_XOR2, 7, 6, 0},
+    {EC_GF_OP_XOR2, 4, 1, 0}, {EC_GF_OP_XOR2, 6, 5, 0},
+    {EC_GF_OP_XOR2, 5, 4, 0}, {EC_GF_OP_XOR2, 2, 1, 0},
+    {EC_GF_OP_XOR2, 4, 3, 0}, {EC_GF_OP_XOR2, 3, 2, 0},
+    {EC_GF_OP_END, 0, 0, 0}};
+
+static ec_gf_mul_t ec_gf8_mul_C9 = {8,
+                                    {
+                                        2,
+                                        3,
+                                        4,
+                                        5,
+                                        6,
+                                        7,
+                                        0,
+                                        1,
+                                    },
+                                    ec_gf8_mul_C9_ops};
+
+static ec_gf_op_t ec_gf8_mul_CA_ops[] = {
+    {EC_GF_OP_XOR2, 6, 7, 0}, {EC_GF_OP_XOR2, 7, 2, 0},
+    {EC_GF_OP_XOR2, 5, 6, 0}, {EC_GF_OP_XOR2, 2, 0, 0},
+    {EC_GF_OP_XOR2, 4, 5, 0}, {EC_GF_OP_XOR2, 2, 3, 0},
+    {EC_GF_OP_XOR2, 6, 1, 0}, {EC_GF_OP_XOR2, 3, 4, 0},
+    {EC_GF_OP_XOR2, 1, 7, 0}, {EC_GF_OP_XOR2, 6, 0, 0},
+    {EC_GF_OP_XOR2, 7, 3, 0}, {EC_GF_OP_XOR2, 0, 5, 0},
+    {EC_GF_OP_XOR2, 5, 7, 0}, {EC_GF_OP_XOR2, 7, 6, 0},
+    {EC_GF_OP_END, 0, 0, 0}};
+
+static ec_gf_mul_t ec_gf8_mul_CA = {8,
+                                    {
+                                        1,
+                                        2,
+                                        5,
+                                        7,
+                                        3,
+                                        4,
+                                        0,
+                                        6,
+                                    },
+                                    ec_gf8_mul_CA_ops};
+
+static ec_gf_op_t ec_gf8_mul_CB_ops[] = {
+    {EC_GF_OP_XOR2, 1, 0, 0}, {EC_GF_OP_XOR2, 2, 1, 0},
+    {EC_GF_OP_XOR2, 1, 6, 0}, {EC_GF_OP_XOR2, 6, 3, 0},
+    {EC_GF_OP_XOR2, 3, 2, 0}, {EC_GF_OP_XOR2, 2, 7, 0},
+    {EC_GF_OP_XOR2, 4, 2, 0}, {EC_GF_OP_XOR2, 7, 5, 0},
+    {EC_GF_OP_XOR2, 5, 4, 0}, {EC_GF_OP_XOR2, 0, 7, 0},
+    {EC_GF_OP_XOR2, 4, 3, 0}, {EC_GF_OP_XOR2, 7, 6, 0},
+    {EC_GF_OP_XOR2, 6, 4, 0}, {EC_GF_OP_END, 0, 0, 0}};
+
+static ec_gf_mul_t ec_gf8_mul_CB = {8,
+                                    {
+                                        2,
+                                        3,
+                                        4,
+                                        5,
+                                        7,
+                                        6,
+                                        0,
+                                        1,
+                                    },
+                                    ec_gf8_mul_CB_ops};
+
+static ec_gf_op_t ec_gf8_mul_CC_ops[] = {
+    {EC_GF_OP_XOR2, 7, 2, 0}, {EC_GF_OP_XOR2, 4, 7, 0},
+    {EC_GF_OP_XOR2, 0, 2, 0}, {EC_GF_OP_XOR2, 7, 3, 0},
+    {EC_GF_OP_XOR2, 2, 1, 0}, {EC_GF_OP_XOR2, 3, 5, 0},
+    {EC_GF_OP_XOR2, 1, 7, 0}, {EC_GF_OP_XOR2, 2, 6, 0},
+    {EC_GF_OP_XOR2, 5, 4, 0}, {EC_GF_OP_XOR2, 6, 3, 0},
+    {EC_GF_OP_XOR2, 4, 6, 0}, {EC_GF_OP_XOR2, 0, 4, 0},
+    {EC_GF_OP_XOR2, 3, 0, 0}, {EC_GF_OP_XOR2, 1, 3, 0},
+    {EC_GF_OP_XOR2, 4, 1, 0}, {EC_GF_OP_END, 0, 0, 0}};
+
+static ec_gf_mul_t ec_gf8_mul_CC = {8,
+                                    {
+                                        2,
+                                        7,
+                                        1,
+                                        0,
+                                        5,
+                                        6,
+                                        3,
+                                        4,
+                                    },
+                                    ec_gf8_mul_CC_ops};
+
+static ec_gf_op_t ec_gf8_mul_CD_ops[] = {
+    {EC_GF_OP_XOR2, 4, 0, 0}, {EC_GF_OP_XOR2, 3, 6, 0},
+    {EC_GF_OP_XOR2, 0, 1, 0}, {EC_GF_OP_XOR2, 6, 2, 0},
+    {EC_GF_OP_XOR2, 1, 3, 0}, {EC_GF_OP_XOR2, 2, 5, 0},
+    {EC_GF_OP_XOR2, 1, 4, 0}, {EC_GF_OP_XOR2, 5, 0, 0},
+    {EC_GF_OP_XOR2, 4, 7, 0}, {EC_GF_OP_XOR2, 0, 6, 0},
+    {EC_GF_OP_XOR2, 7, 2, 0}, {EC_GF_OP_XOR2, 6, 4, 0},
+    {EC_GF_OP_XOR2, 2, 6, 0}, {EC_GF_OP_XOR2, 6, 1, 0},
+    {EC_GF_OP_END, 0, 0, 0}};
+
+static ec_gf_mul_t ec_gf8_mul_CD = {8,
+                                    {
+                                        0,
+                                        6,
+                                        1,
+                                        2,
+                                        7,
+                                        3,
+                                        4,
+                                        5,
+                                    },
+                                    ec_gf8_mul_CD_ops};
+
+static ec_gf_op_t ec_gf8_mul_CE_ops[] = {
+    {EC_GF_OP_XOR2, 0, 2, 0}, {EC_GF_OP_XOR2, 3, 5, 0},
+    {EC_GF_OP_XOR2, 5, 0, 0}, {EC_GF_OP_XOR2, 7, 5, 0},
+    {EC_GF_OP_COPY, 8, 7, 0}, {EC_GF_OP_XOR2, 7, 3, 0},
+    {EC_GF_OP_XOR2, 3, 4, 0}, {EC_GF_OP_XOR2, 6, 3, 0},
+    {EC_GF_OP_XOR2, 2, 3, 0}, {EC_GF_OP_XOR3, 3, 6, 8},
+    {EC_GF_OP_XOR2, 0, 6, 0}, {EC_GF_OP_XOR3, 8, 2, 3},
+    {EC_GF_OP_XOR2, 1, 8, 0}, {EC_GF_OP_XOR2, 4, 8, 0},
+    {EC_GF_OP_XOR2, 5, 1, 0}, {EC_GF_OP_END, 0, 0, 0}};
+
+static ec_gf_mul_t ec_gf8_mul_CE = {9,
+                                    {
+                                        5,
+                                        7,
+                                        3,
+                                        0,
+                                        2,
+                                        6,
+                                        4,
+                                        1,
+                                        8,
+                                    },
+                                    ec_gf8_mul_CE_ops};
+
+static ec_gf_op_t ec_gf8_mul_CF_ops[] = {
+    {EC_GF_OP_XOR2, 1, 2, 0}, {EC_GF_OP_XOR2, 5, 3, 0},
+    {EC_GF_OP_XOR2, 3, 6, 0}, {EC_GF_OP_XOR2, 2, 5, 0},
+    {EC_GF_OP_XOR2, 1, 4, 0}, {EC_GF_OP_XOR2, 4, 3, 0},
+    {EC_GF_OP_XOR2, 0, 4, 0}, {EC_GF_OP_XOR2, 7, 0, 0},
+    {EC_GF_OP_XOR2, 0, 2, 0}, {EC_GF_OP_XOR2, 6, 7, 0},
+    {EC_GF_OP_XOR2, 5, 6, 0}, {EC_GF_OP_XOR2, 6, 1, 0},
+    {EC_GF_OP_XOR2, 1, 0, 0}, {EC_GF_OP_XOR2, 3, 6, 0},
+    {EC_GF_OP_END, 0, 0, 0}};
+
+static ec_gf_mul_t ec_gf8_mul_CF = {8,
+                                    {
+                                        3,
+                                        6,
+                                        7,
+                                        0,
+                                        2,
+                                        4,
+                                        5,
+                                        1,
+                                    },
+                                    ec_gf8_mul_CF_ops};
+
+static ec_gf_op_t ec_gf8_mul_D0_ops[] = {
+    {EC_GF_OP_XOR2, 5, 2, 0}, {EC_GF_OP_XOR2, 0, 3, 0},
+    {EC_GF_OP_XOR2, 3, 5, 0}, {EC_GF_OP_XOR2, 6, 3, 0},
+    {EC_GF_OP_XOR2, 4, 1, 0}, {EC_GF_OP_XOR2, 1, 6, 0},
+    {EC_GF_OP_XOR2, 5, 4, 0}, {EC_GF_OP_XOR2, 7, 1, 0},
+    {EC_GF_OP_XOR2, 4, 0, 0}, {EC_GF_OP_XOR2, 2, 7, 0},
+    {EC_GF_OP_XOR2, 0, 2, 0}, {EC_GF_OP_XOR2, 3, 2, 0},
+    {EC_GF_OP_XOR2, 1, 0, 0}, {EC_GF_OP_END, 0, 0, 0}};
+
+static ec_gf_mul_t ec_gf8_mul_D0 = {8,
+                                    {
+                                        5,
+                                        6,
+                                        7,
+                                        2,
+                                        0,
+                                        3,
+                                        1,
+                                        4,
+                                    },
+                                    ec_gf8_mul_D0_ops};
+
+static ec_gf_op_t ec_gf8_mul_D1_ops[] = {
+    {EC_GF_OP_XOR2, 1, 2, 0}, {EC_GF_OP_XOR2, 0, 4, 0},
+    {EC_GF_OP_XOR2, 5, 1, 0}, {EC_GF_OP_XOR2, 4, 6, 0},
+    {EC_GF_OP_XOR2, 6, 5, 0}, {EC_GF_OP_XOR2, 5, 0, 0},
+    {EC_GF_OP_XOR2, 7, 6, 0}, {EC_GF_OP_XOR2, 6, 3, 0},
+    {EC_GF_OP_XOR2, 2, 7, 0}, {EC_GF_OP_XOR2, 0, 2, 0},
+    {EC_GF_OP_XOR2, 3, 2, 0}, {EC_GF_OP_XOR3, 8, 6, 0},
+    {EC_GF_OP_XOR2, 4, 8, 0}, {EC_GF_OP_XOR2, 1, 8, 0},
+    {EC_GF_OP_END, 0, 0, 0}};
+
+static ec_gf_mul_t ec_gf8_mul_D1 = {9,
+                                    {
+                                        5,
+                                        6,
+                                        3,
+                                        2,
+                                        0,
+                                        7,
+                                        4,
+                                        1,
+                                        8,
+                                    },
+                                    ec_gf8_mul_D1_ops};
+
+static ec_gf_op_t ec_gf8_mul_D2_ops[] = {
+    {EC_GF_OP_XOR2, 3, 5, 0}, {EC_GF_OP_XOR2, 3, 6, 0},
+    {EC_GF_OP_XOR2, 2, 3, 0}, {EC_GF_OP_XOR2, 3, 0, 0},
+    {EC_GF_OP_XOR2, 0, 2, 0}, {EC_GF_OP_XOR2, 3, 1, 0},
+    {EC_GF_OP_XOR2, 7, 0, 0}, {EC_GF_OP_XOR2, 4, 3, 0},
+    {EC_GF_OP_XOR2, 1, 2, 0}, {EC_GF_OP_XOR2, 6, 7, 0},
+    {EC_GF_OP_XOR2, 5, 4, 0}, {EC_GF_OP_XOR2, 4, 6, 0},
+    {EC_GF_OP_XOR2, 7, 5, 0}, {EC_GF_OP_END, 0, 0, 0}};
+
+static ec_gf_mul_t ec_gf8_mul_D2 = {8,
+                                    {
+                                        7,
+                                        0,
+                                        2,
+                                        1,
+                                        3,
+                                        4,
+                                        6,
+                                        5,
+                                    },
+                                    ec_gf8_mul_D2_ops};
+
+static ec_gf_op_t ec_gf8_mul_D3_ops[] = {
+    {EC_GF_OP_XOR2, 4, 7, 0}, {EC_GF_OP_COPY, 8, 4, 0},
+    {EC_GF_OP_XOR2, 2, 1, 0}, {EC_GF_OP_XOR2, 4, 2, 0},
+    {EC_GF_OP_XOR2, 5, 4, 0}, {EC_GF_OP_XOR2, 6, 5, 0},
+    {EC_GF_OP_XOR2, 8, 6, 0}, {EC_GF_OP_XOR2, 3, 8, 0},
+    {EC_GF_OP_XOR2, 2, 3, 0}, {EC_GF_OP_XOR2, 3, 0, 0},
+    {EC_GF_OP_XOR2, 1, 3, 0}, {EC_GF_OP_XOR2, 0, 5, 0},
+    {EC_GF_OP_XOR2, 7, 1, 0}, {EC_GF_OP_XOR2, 1, 5, 0},
+    {EC_GF_OP_XOR2, 4, 7, 0}, {EC_GF_OP_END, 0, 0, 0}};
+
+static ec_gf_mul_t ec_gf8_mul_D3 = {9,
+                                    {
+                                        0,
+                                        3,
+                                        2,
+                                        8,
+                                        4,
+                                        6,
+                                        7,
+                                        1,
+                                        5,
+                                    },
+                                    ec_gf8_mul_D3_ops};
+
+static ec_gf_op_t ec_gf8_mul_D4_ops[] = {
+    {EC_GF_OP_XOR2, 1, 5, 0}, {EC_GF_OP_COPY, 8, 1, 0},
+    {EC_GF_OP_XOR2, 1, 2, 0}, {EC_GF_OP_XOR2, 0, 1, 0},
+    {EC_GF_OP_XOR2, 5, 3, 0}, {EC_GF_OP_XOR2, 6, 1, 0},
+    {EC_GF_OP_XOR2, 3, 0, 0}, {EC_GF_OP_XOR3, 1, 7, 8},
+    {EC_GF_OP_XOR2, 7, 3, 0}, {EC_GF_OP_XOR2, 3, 4, 0},
+    {EC_GF_OP_XOR2, 4, 6, 0}, {EC_GF_OP_XOR2, 2, 3, 0},
+    {EC_GF_OP_XOR2, 6, 5, 0}, {EC_GF_OP_XOR2, 3, 1, 0},
+    {EC_GF_OP_XOR2, 1, 6, 0}, {EC_GF_OP_END, 0, 0, 0}};
+
+static ec_gf_mul_t ec_gf8_mul_D4 = {9,
+                                    {
+                                        4,
+                                        1,
+                                        7,
+                                        5,
+                                        0,
+                                        6,
+                                        3,
+                                        2,
+                                        8,
+                                    },
+                                    ec_gf8_mul_D4_ops};
+
+static ec_gf_op_t ec_gf8_mul_D5_ops[] = {
+    {EC_GF_OP_XOR2, 0, 4, 0}, {EC_GF_OP_XOR2, 1, 5, 0},
+    {EC_GF_OP_XOR2, 1, 0, 0}, {EC_GF_OP_XOR2, 2, 1, 0},
+    {EC_GF_OP_XOR2, 6, 2, 0}, {EC_GF_OP_XOR2, 0, 6, 0},
+    {EC_GF_OP_XOR2, 3, 0, 0}, {EC_GF_OP_XOR2, 7, 3, 0},
+    {EC_GF_OP_XOR2, 1, 7, 0}, {EC_GF_OP_XOR2, 0, 1, 0},
+    {EC_GF_OP_XOR2, 4, 0, 0}, {EC_GF_OP_END, 0, 0, 0}};
+
+static ec_gf_mul_t ec_gf8_mul_D5 = {8,
+                                    {
+                                        6,
+                                        7,
+                                        4,
+                                        5,
+                                        2,
+                                        3,
+                                        1,
+                                        0,
+                                    },
+                                    ec_gf8_mul_D5_ops};
+
+static ec_gf_op_t ec_gf8_mul_D6_ops[] = {
+    {EC_GF_OP_COPY, 8, 0, 0}, {EC_GF_OP_XOR2, 0, 2, 0},
+    {EC_GF_OP_XOR2, 2, 1, 0}, {EC_GF_OP_XOR2, 2, 4, 0},
+    {EC_GF_OP_XOR2, 2, 6, 0}, {EC_GF_OP_XOR2, 3, 2, 0},
+    {EC_GF_OP_XOR2, 0, 3, 0}, {EC_GF_OP_XOR2, 5, 0, 0},
+    {EC_GF_OP_XOR2, 2, 5, 0}, {EC_GF_OP_XOR2, 7, 2, 0},
+    {EC_GF_OP_XOR2, 1, 7, 0}, {EC_GF_OP_XOR2, 4, 7, 0},
+    {EC_GF_OP_XOR2, 6, 7, 0}, {EC_GF_OP_XOR2, 0, 7, 0},
+    {EC_GF_OP_XOR2, 7, 8, 0}, {EC_GF_OP_END, 0, 0, 0}};
+
+static ec_gf_mul_t ec_gf8_mul_D6 = {9,
+                                    {
+                                        0,
+                                        6,
+                                        2,
+                                        7,
+                                        1,
+                                        3,
+                                        4,
+                                        5,
+                                        8,
+                                    },
+                                    ec_gf8_mul_D6_ops};
+
+static ec_gf_op_t ec_gf8_mul_D7_ops[] = {
+    {EC_GF_OP_XOR2, 7, 2, 0}, {EC_GF_OP_XOR2, 5, 7, 0},
+    {EC_GF_OP_XOR2, 7, 0, 0}, {EC_GF_OP_XOR2, 0, 1, 0},
+    {EC_GF_OP_XOR3, 8, 3, 5}, {EC_GF_OP_XOR2, 0, 4, 0},
+    {EC_GF_OP_XOR2, 0, 8, 0}, {EC_GF_OP_XOR2, 6, 0, 0},
+    {EC_GF_OP_XOR2, 2, 6, 0}, {EC_GF_OP_XOR2, 1, 6, 0},
+    {EC_GF_OP_XOR2, 4, 6, 0}, {EC_GF_OP_XOR2, 3, 6, 0},
+    {EC_GF_OP_XOR3, 6, 7, 8}, {EC_GF_OP_XOR2, 7, 2, 0},
+    {EC_GF_OP_END, 0, 0, 0}};
+
+static ec_gf_mul_t ec_gf8_mul_D7 = {9,
+                                    {
+                                        3,
+                                        4,
+                                        6,
+                                        5,
+                                        0,
+                                        7,
+                                        1,
+                                        2,
+                                        8,
+                                    },
+                                    ec_gf8_mul_D7_ops};
+
+static ec_gf_op_t ec_gf8_mul_D8_ops[] = {
+    {EC_GF_OP_XOR2, 3, 2, 0}, {EC_GF_OP_XOR2, 4, 1, 0},
+    {EC_GF_OP_XOR2, 5, 3, 0}, {EC_GF_OP_XOR2, 4, 2, 0},
+    {EC_GF_OP_XOR2, 3, 1, 0}, {EC_GF_OP_XOR2, 2, 0, 0},
+    {EC_GF_OP_XOR2, 6, 3, 0}, {EC_GF_OP_XOR2, 3, 2, 0},
+    {EC_GF_OP_XOR2, 7, 3, 0}, {EC_GF_OP_END, 0, 0, 0}};
+
+static ec_gf_mul_t ec_gf8_mul_D8 = {8,
+                                    {
+                                        4,
+                                        5,
+                                        6,
+                                        7,
+                                        0,
+                                        1,
+                                        2,
+                                        3,
+                                    },
+                                    ec_gf8_mul_D8_ops};
+
+static ec_gf_op_t ec_gf8_mul_D9_ops[] = {
+    {EC_GF_OP_XOR2, 4, 0, 0}, {EC_GF_OP_XOR2, 0, 1, 0},
+    {EC_GF_OP_XOR2, 5, 1, 0}, {EC_GF_OP_XOR2, 1, 2, 0},
+    {EC_GF_OP_XOR2, 6, 1, 0}, {EC_GF_OP_XOR2, 7, 0, 0},
+    {EC_GF_OP_XOR2, 1, 4, 0}, {EC_GF_OP_XOR2, 2, 3, 0},
+    {EC_GF_OP_XOR2, 0, 6, 0}, {EC_GF_OP_XOR2, 3, 7, 0},
+    {EC_GF_OP_XOR2, 6, 2, 0}, {EC_GF_OP_XOR2, 2, 5, 0},
+    {EC_GF_OP_END, 0, 0, 0}};
+
+static ec_gf_mul_t ec_gf8_mul_D9 = {8,
+                                    {
+                                        1,
+                                        2,
+                                        6,
+                                        7,
+                                        4,
+                                        5,
+                                        0,
+                                        3,
+                                    },
+                                    ec_gf8_mul_D9_ops};
+
+static ec_gf_op_t ec_gf8_mul_DA_ops[] = {
+    {EC_GF_OP_XOR2, 6, 2, 0}, {EC_GF_OP_XOR2, 2, 7, 0},
+    {EC_GF_OP_XOR2, 7, 3, 0}, {EC_GF_OP_XOR2, 0, 7, 0},
+    {EC_GF_OP_XOR3, 8, 2, 0}, {EC_GF_OP_XOR2, 7, 6, 0},
+    {EC_GF_OP_XOR2, 4, 1, 0}, {EC_GF_OP_XOR2, 1, 8, 0},
+    {EC_GF_OP_XOR2, 5, 8, 0}, {EC_GF_OP_XOR2, 2, 4, 0},
+    {EC_GF_OP_XOR2, 6, 1, 0}, {EC_GF_OP_XOR2, 3, 5, 0},
+    {EC_GF_OP_END, 0, 0, 0}};
+
+static ec_gf_mul_t ec_gf8_mul_DA = {9,
+                                    {
+                                        2,
+                                        5,
+                                        7,
+                                        1,
+                                        0,
+                                        4,
+                                        3,
+                                        6,
+                                        8,
+                                    },
+                                    ec_gf8_mul_DA_ops};
+
+static ec_gf_op_t ec_gf8_mul_DB_ops[] = {
+    {EC_GF_OP_COPY, 8, 0, 0}, {EC_GF_OP_XOR2, 0, 1, 0},
+    {EC_GF_OP_XOR2, 2, 0, 0}, {EC_GF_OP_XOR2, 5, 2, 0},
+    {EC_GF_OP_XOR2, 1, 5, 0}, {EC_GF_OP_XOR2, 8, 4, 0},
+    {EC_GF_OP_XOR2, 5, 3, 0}, {EC_GF_OP_XOR2, 4, 2, 0},
+    {EC_GF_OP_XOR2, 3, 7, 0}, {EC_GF_OP_XOR2, 7, 4, 0},
+    {EC_GF_OP_XOR2, 4, 1, 0}, {EC_GF_OP_XOR2, 1, 6, 0},
+    {EC_GF_OP_XOR2, 6, 3, 0}, {EC_GF_OP_XOR2, 3, 8, 0},
+    {EC_GF_OP_XOR2, 0, 6, 0}, {EC_GF_OP_END, 0, 0, 0}};
+
+static ec_gf_mul_t ec_gf8_mul_DB = {9,
+                                    {
+                                        7,
+                                        5,
+                                        6,
+                                        2,
+                                        3,
+                                        4,
+                                        1,
+                                        0,
+                                        8,
+                                    },
+                                    ec_gf8_mul_DB_ops};
+
+static ec_gf_op_t ec_gf8_mul_DC_ops[] = {
+    {EC_GF_OP_XOR2, 7, 3, 0}, {EC_GF_OP_XOR2, 3, 0, 0},
+    {EC_GF_OP_XOR2, 4, 2, 0}, {EC_GF_OP_XOR2, 0, 4, 0},
+    {EC_GF_OP_XOR2, 3, 1, 0}, {EC_GF_OP_XOR2, 6, 1, 0},
+    {EC_GF_OP_XOR2, 1, 7, 0}, {EC_GF_OP_XOR2, 4, 6, 0},
+    {EC_GF_OP_XOR2, 7, 2, 0}, {EC_GF_OP_XOR2, 6, 3, 0},
+    {EC_GF_OP_XOR2, 2, 3, 0}, {EC_GF_OP_XOR2, 3, 5, 0},
+    {EC_GF_OP_XOR2, 5, 7, 0}, {EC_GF_OP_XOR2, 7, 6, 0},
+    {EC_GF_OP_END, 0, 0, 0}};
+
+static ec_gf_mul_t ec_gf8_mul_DC = {8,
+                                    {
+                                        4,
+                                        5,
+                                        2,
+                                        6,
+                                        7,
+                                        1,
+                                        0,
+                                        3,
+                                    },
+                                    ec_gf8_mul_DC_ops};
+
+static ec_gf_op_t ec_gf8_mul_DD_ops[] = {
+    {EC_GF_OP_XOR2, 3, 1, 0}, {EC_GF_OP_XOR2, 3, 0, 0},
+    {EC_GF_OP_XOR2, 5, 7, 0}, {EC_GF_OP_XOR2, 5, 3, 0},
+    {EC_GF_OP_XOR2, 4, 2, 0}, {EC_GF_OP_XOR2, 6, 0, 0},
+    {EC_GF_OP_XOR2, 0, 5, 0}, {EC_GF_OP_XOR2, 4, 6, 0},
+    {EC_GF_OP_XOR2, 2, 0, 0}, {EC_GF_OP_XOR2, 1, 4, 0},
+    {EC_GF_OP_XOR2, 7, 4, 0}, {EC_GF_OP_END, 0, 0, 0}};
+
+static ec_gf_mul_t ec_gf8_mul_DD = {8,
+                                    {
+                                        1,
+                                        2,
+                                        3,
+                                        6,
+                                        7,
+                                        0,
+                                        4,
+                                        5,
+                                    },
+                                    ec_gf8_mul_DD_ops};
+
+static ec_gf_op_t ec_gf8_mul_DE_ops[] = {
+    {EC_GF_OP_XOR2, 2, 0, 0}, {EC_GF_OP_XOR2, 3, 7, 0},
+    {EC_GF_OP_XOR2, 2, 3, 0}, {EC_GF_OP_XOR2, 6, 2, 0},
+    {EC_GF_OP_XOR2, 1, 4, 0}, {EC_GF_OP_XOR2, 7, 6, 0},
+    {EC_GF_OP_XOR2, 5, 2, 0}, {EC_GF_OP_XOR2, 1, 3, 0},
+    {EC_GF_OP_XOR2, 0, 7, 0}, {EC_GF_OP_XOR2, 4, 5, 0},
+    {EC_GF_OP_XOR2, 0, 1, 0}, {EC_GF_OP_XOR2, 3, 4, 0},
+    {EC_GF_OP_XOR2, 4, 0, 0}, {EC_GF_OP_END, 0, 0, 0}};
+
+static ec_gf_mul_t ec_gf8_mul_DE = {8,
+                                    {
+                                        0,
+                                        5,
+                                        2,
+                                        6,
+                                        7,
+                                        1,
+                                        3,
+                                        4,
+                                    },
+                                    ec_gf8_mul_DE_ops};
+
+static ec_gf_op_t ec_gf8_mul_DF_ops[] = {
+    {EC_GF_OP_COPY, 8, 0, 0}, {EC_GF_OP_XOR2, 0, 2, 0},
+    {EC_GF_OP_XOR2, 0, 4, 0}, {EC_GF_OP_XOR2, 8, 3, 0},
+    {EC_GF_OP_COPY, 9, 0, 0}, {EC_GF_OP_XOR2, 0, 6, 0},
+    {EC_GF_OP_XOR2, 8, 7, 0}, {EC_GF_OP_XOR2, 3, 0, 0},
+    {EC_GF_OP_XOR2, 7, 0, 0}, {EC_GF_OP_XOR2, 0, 5, 0},
+    {EC_GF_OP_XOR2, 4, 7, 0}, {EC_GF_OP_XOR2, 5, 1, 0},
+    {EC_GF_OP_XOR2, 7, 1, 0}, {EC_GF_OP_XOR2, 5, 8, 0},
+    {EC_GF_OP_XOR2, 2, 5, 0}, {EC_GF_OP_XOR2, 6, 5, 0},
+    {EC_GF_OP_XOR3, 1, 9, 2}, {EC_GF_OP_END, 0, 0, 0}};
+
+static ec_gf_mul_t ec_gf8_mul_DF = {10,
+                                    {
+                                        7,
+                                        2,
+                                        8,
+                                        4,
+                                        3,
+                                        1,
+                                        0,
+                                        6,
+                                        5,
+                                        9,
+                                    },
+                                    ec_gf8_mul_DF_ops};
+
+static ec_gf_op_t ec_gf8_mul_E0_ops[] = {
+    {EC_GF_OP_XOR2, 4, 2, 0}, {EC_GF_OP_XOR2, 6, 4, 0},
+    {EC_GF_OP_XOR2, 5, 3, 0}, {EC_GF_OP_XOR2, 3, 6, 0},
+    {EC_GF_OP_XOR2, 4, 1, 0}, {EC_GF_OP_XOR2, 7, 1, 0},
+    {EC_GF_OP_XOR2, 5, 7, 0}, {EC_GF_OP_XOR2, 6, 0, 0},
+    {EC_GF_OP_XOR2, 2, 5, 0}, {EC_GF_OP_XOR2, 0, 5, 0},
+    {EC_GF_OP_XOR2, 1, 6, 0}, {EC_GF_OP_END, 0, 0, 0}};
+
+static ec_gf_mul_t ec_gf8_mul_E0 = {8,
+                                    {
+                                        2,
+                                        3,
+                                        4,
+                                        7,
+                                        5,
+                                        6,
+                                        0,
+                                        1,
+                                    },
+                                    ec_gf8_mul_E0_ops};
+
+static ec_gf_op_t ec_gf8_mul_E1_ops[] = {
+    {EC_GF_OP_COPY, 8, 1, 0}, {EC_GF_OP_XOR2, 8, 7, 0},
+    {EC_GF_OP_XOR2, 3, 8, 0}, {EC_GF_OP_XOR3, 9, 5, 3},
+    {EC_GF_OP_XOR2, 0, 9, 0}, {EC_GF_OP_XOR2, 1, 4, 0},
+    {EC_GF_OP_XOR2, 7, 0, 0}, {EC_GF_OP_XOR2, 6, 0, 0},
+    {EC_GF_OP_XOR2, 4, 9, 0}, {EC_GF_OP_XOR2, 0, 2, 0},
+    {EC_GF_OP_XOR2, 2, 4, 0}, {EC_GF_OP_XOR2, 2, 6, 0},
+    {EC_GF_OP_XOR2, 5, 2, 0}, {EC_GF_OP_XOR2, 2, 8, 0},
+    {EC_GF_OP_XOR2, 7, 5, 0}, {EC_GF_OP_END, 0, 0, 0}};
+
+static ec_gf_mul_t ec_gf8_mul_E1 = {10,
+                                    {
+                                        0,
+                                        7,
+                                        1,
+                                        3,
+                                        4,
+                                        5,
+                                        6,
+                                        2,
+                                        8,
+                                        9,
+                                    },
+                                    ec_gf8_mul_E1_ops};
+
+static ec_gf_op_t ec_gf8_mul_E2_ops[] = {
+    {EC_GF_OP_XOR2, 6, 0, 0}, {EC_GF_OP_XOR2, 5, 1, 0},
+    {EC_GF_OP_XOR2, 6, 2, 0}, {EC_GF_OP_XOR2, 1, 2, 0},
+    {EC_GF_OP_XOR2, 2, 3, 0}, {EC_GF_OP_XOR2, 0, 1, 0},
+    {EC_GF_OP_XOR2, 3, 4, 0}, {EC_GF_OP_XOR2, 7, 2, 0},
+    {EC_GF_OP_XOR2, 4, 0, 0}, {EC_GF_OP_XOR2, 2, 5, 0},
+    {EC_GF_OP_XOR2, 0, 7, 0}, {EC_GF_OP_XOR2, 7, 3, 0},
+    {EC_GF_OP_XOR2, 3, 6, 0}, {EC_GF_OP_END, 0, 0, 0}};
+
+static ec_gf_mul_t ec_gf8_mul_E2 = {8,
+                                    {
+                                        2,
+                                        3,
+                                        7,
+                                        1,
+                                        5,
+                                        6,
+                                        0,
+                                        4,
+                                    },
+                                    ec_gf8_mul_E2_ops};
+
+static ec_gf_op_t ec_gf8_mul_E3_ops[] = {
+    {EC_GF_OP_XOR2, 7, 4, 0}, {EC_GF_OP_XOR2, 3, 1, 0},
+    {EC_GF_OP_XOR3, 8, 2, 7}, {EC_GF_OP_XOR2, 4, 5, 0},
+    {EC_GF_OP_XOR2, 2, 3, 0}, {EC_GF_OP_XOR2, 5, 0, 0},
+    {EC_GF_OP_XOR2, 5, 2, 0}, {EC_GF_OP_XOR2, 0, 1, 0},
+    {EC_GF_OP_XOR2, 6, 5, 0}, {EC_GF_OP_XOR2, 1, 4, 0},
+    {EC_GF_OP_XOR2, 0, 8, 0}, {EC_GF_OP_XOR2, 4, 6, 0},
+    {EC_GF_OP_XOR2, 3, 6, 0}, {EC_GF_OP_XOR3, 6, 8, 4},
+    {EC_GF_OP_END, 0, 0, 0}};
+
+static ec_gf_mul_t ec_gf8_mul_E3 = {9,
+                                    {
+                                        5,
+                                        4,
+                                        7,
+                                        2,
+                                        1,
+                                        3,
+                                        6,
+                                        0,
+                                        8,
+                                    },
+                                    ec_gf8_mul_E3_ops};
+
+static ec_gf_op_t ec_gf8_mul_E4_ops[] = {
+    {EC_GF_OP_XOR2, 4, 0, 0}, {EC_GF_OP_XOR2, 2, 6, 0},
+    {EC_GF_OP_XOR2, 2, 4, 0}, {EC_GF_OP_XOR2, 1, 2, 0},
+    {EC_GF_OP_XOR2, 5, 1, 0}, {EC_GF_OP_XOR2, 4, 5, 0},
+    {EC_GF_OP_XOR2, 3, 4, 0}, {EC_GF_OP_XOR2, 7, 3, 0},
+    {EC_GF_OP_XOR2, 2, 7, 0}, {EC_GF_OP_XOR2, 4, 2, 0},
+    {EC_GF_OP_XOR2, 0, 4, 0}, {EC_GF_OP_END, 0, 0, 0}};
+
+static ec_gf_mul_t ec_gf8_mul_E4 = {8,
+                                    {
+                                        7,
+                                        0,
+                                        1,
+                                        6,
+                                        3,
+                                        4,
+                                        2,
+                                        5,
+                                    },
+                                    ec_gf8_mul_E4_ops};
+
+static ec_gf_op_t ec_gf8_mul_E5_ops[] = {
+    {EC_GF_OP_XOR2, 7, 5, 0}, {EC_GF_OP_XOR2, 5, 0, 0},
+    {EC_GF_OP_XOR2, 0, 1, 0}, {EC_GF_OP_XOR2, 6, 3, 0},
+    {EC_GF_OP_XOR2, 0, 6, 0}, {EC_GF_OP_XOR2, 0, 4, 0},
+    {EC_GF_OP_COPY, 8, 0, 0}, {EC_GF_OP_XOR2, 0, 7, 0},
+    {EC_GF_OP_XOR2, 2, 0, 0}, {EC_GF_OP_XOR2, 5, 2, 0},
+    {EC_GF_OP_XOR2, 1, 2, 0}, {EC_GF_OP_XOR2, 4, 2, 0},
+    {EC_GF_OP_XOR2, 7, 5, 0}, {EC_GF_OP_XOR2, 2, 3, 0},
+    {EC_GF_OP_XOR2, 3, 8, 0}, {EC_GF_OP_END, 0, 0, 0}};
+
+static ec_gf_mul_t ec_gf8_mul_E5 = {9,
+                                    {
+                                        4,
+                                        5,
+                                        3,
+                                        6,
+                                        7,
+                                        1,
+                                        0,
+                                        2,
+                                        8,
+                                    },
+                                    ec_gf8_mul_E5_ops};
+
+static ec_gf_op_t ec_gf8_mul_E6_ops[] = {
+    {EC_GF_OP_XOR2, 6, 2, 0}, {EC_GF_OP_XOR2, 1, 6, 0},
+    {EC_GF_OP_XOR2, 6, 7, 0}, {EC_GF_OP_XOR2, 0, 3, 0},
+    {EC_GF_OP_XOR2, 5, 1, 0}, {EC_GF_OP_XOR2, 0, 6, 0},
+    {EC_GF_OP_XOR2, 7, 5, 0}, {EC_GF_OP_XOR2, 4, 0, 0},
+    {EC_GF_OP_XOR2, 5, 3, 0}, {EC_GF_OP_XOR2, 3, 4, 0},
+    {EC_GF_OP_XOR2, 1, 4, 0}, {EC_GF_OP_XOR2, 2, 3, 0},
+    {EC_GF_OP_XOR2, 2, 7, 0}, {EC_GF_OP_END, 0, 0, 0}};
+
+static ec_gf_mul_t ec_gf8_mul_E6 = {8,
+                                    {
+                                        5,
+                                        4,
+                                        3,
+                                        6,
+                                        7,
+                                        0,
+                                        1,
+                                        2,
+                                    },
+                                    ec_gf8_mul_E6_ops};
+
+static ec_gf_op_t ec_gf8_mul_E7_ops[] = {
+    {EC_GF_OP_COPY, 8, 6, 0}, {EC_GF_OP_XOR2, 3, 2, 0},
+    {EC_GF_OP_XOR2, 6, 7, 0}, {EC_GF_OP_XOR2, 6, 3, 0},
+    {EC_GF_OP_XOR3, 9, 0, 6}, {EC_GF_OP_XOR2, 4, 9, 0},
+    {EC_GF_OP_XOR2, 5, 9, 0}, {EC_GF_OP_XOR2, 3, 4, 0},
+    {EC_GF_OP_XOR2, 7, 5, 0}, {EC_GF_OP_XOR2, 4, 1, 0},
+    {EC_GF_OP_XOR2, 2, 4, 0}, {EC_GF_OP_XOR2, 1, 7, 0},
+    {EC_GF_OP_XOR2, 7, 2, 0}, {EC_GF_OP_XOR2, 0, 7, 0},
+    {EC_GF_OP_XOR2, 7, 8, 0}, {EC_GF_OP_END, 0, 0, 0}};
+
+static ec_gf_mul_t ec_gf8_mul_E7 = {10,
+                                    {
+                                        1,
+                                        4,
+                                        3,
+                                        6,
+                                        7,
+                                        5,
+                                        2,
+                                        0,
+                                        8,
+                                        9,
+                                    },
+                                    ec_gf8_mul_E7_ops};
+
+static ec_gf_op_t ec_gf8_mul_E8_ops[] = {
+    {EC_GF_OP_XOR2, 1, 0, 0}, {EC_GF_OP_XOR2, 0, 4, 0},
+    {EC_GF_OP_XOR2, 4, 2, 0}, {EC_GF_OP_XOR2, 2, 5, 0},
+    {EC_GF_OP_XOR2, 4, 3, 0}, {EC_GF_OP_XOR2, 5, 1, 0},
+    {EC_GF_OP_XOR2, 3, 6, 0}, {EC_GF_OP_XOR2, 6, 5, 0},
+    {EC_GF_OP_XOR2, 0, 7, 0}, {EC_GF_OP_XOR2, 7, 6, 0},
+    {EC_GF_OP_XOR2, 1, 0, 0}, {EC_GF_OP_XOR2, 6, 2, 0},
+    {EC_GF_OP_XOR2, 2, 1, 0}, {EC_GF_OP_XOR2, 1, 4, 0},
+    {EC_GF_OP_END, 0, 0, 0}};
+
+static ec_gf_mul_t ec_gf8_mul_E8 = {8,
+                                    {
+                                        1,
+                                        4,
+                                        2,
+                                        7,
+                                        3,
+                                        0,
+                                        5,
+                                        6,
+                                    },
+                                    ec_gf8_mul_E8_ops};
+
+static ec_gf_op_t ec_gf8_mul_E9_ops[] = {
+    {EC_GF_OP_XOR2, 1, 0, 0}, {EC_GF_OP_COPY, 8, 1, 0},
+    {EC_GF_OP_XOR2, 1, 6, 0}, {EC_GF_OP_XOR2, 6, 3, 0},
+    {EC_GF_OP_XOR2, 4, 6, 0}, {EC_GF_OP_XOR2, 2, 1, 0},
+    {EC_GF_OP_XOR2, 3, 7, 0}, {EC_GF_OP_XOR2, 0, 4, 0},
+    {EC_GF_OP_XOR2, 7, 2, 0}, {EC_GF_OP_XOR2, 5, 1, 0},
+    {EC_GF_OP_XOR2, 2, 0, 0}, {EC_GF_OP_XOR2, 6, 7, 0},
+    {EC_GF_OP_XOR2, 3, 5, 0}, {EC_GF_OP_XOR2, 0, 3, 0},
+    {EC_GF_OP_XOR3, 1, 8, 0}, {EC_GF_OP_END, 0, 0, 0}};
+
+static ec_gf_mul_t ec_gf8_mul_E9 = {9,
+                                    {
+                                        6,
+                                        2,
+                                        0,
+                                        3,
+                                        4,
+                                        1,
+                                        5,
+                                        7,
+                                        8,
+                                    },
+                                    ec_gf8_mul_E9_ops};
+
+static ec_gf_op_t ec_gf8_mul_EA_ops[] = {
+    {EC_GF_OP_XOR2, 2, 1, 0}, {EC_GF_OP_XOR2, 3, 2, 0},
+    {EC_GF_OP_XOR2, 1, 0, 0}, {EC_GF_OP_XOR2, 2, 0, 0},
+    {EC_GF_OP_XOR2, 0, 7, 0}, {EC_GF_OP_XOR2, 5, 2, 0},
+    {EC_GF_OP_XOR2, 7, 6, 0}, {EC_GF_OP_XOR2, 4, 1, 0},
+    {EC_GF_OP_XOR2, 6, 5, 0}, {EC_GF_OP_XOR2, 5, 4, 0},
+    {EC_GF_OP_XOR2, 4, 3, 0}, {EC_GF_OP_END, 0, 0, 0}};
+
+static ec_gf_mul_t ec_gf8_mul_EA = {8,
+                                    {
+                                        3,
+                                        4,
+                                        5,
+                                        6,
+                                        7,
+                                        0,
+                                        1,
+                                        2,
+                                    },
+                                    ec_gf8_mul_EA_ops};
+
+static ec_gf_op_t ec_gf8_mul_EB_ops[] = {
+    {EC_GF_OP_XOR2, 1, 0, 0}, {EC_GF_OP_XOR2, 2, 1, 0},
+    {EC_GF_OP_XOR2, 3, 2, 0}, {EC_GF_OP_XOR2, 2, 7, 0},
+    {EC_GF_OP_XOR2, 7, 5, 0}, {EC_GF_OP_XOR2, 5, 4, 0},
+    {EC_GF_OP_XOR2, 0, 7, 0}, {EC_GF_OP_XOR2, 1, 6, 0},
+    {EC_GF_OP_XOR2, 4, 3, 0}, {EC_GF_OP_XOR2, 6, 5, 0},
+    {EC_GF_OP_XOR2, 7, 6, 0}, {EC_GF_OP_XOR2, 6, 4, 0},
+    {EC_GF_OP_END, 0, 0, 0}};
+
+static ec_gf_mul_t ec_gf8_mul_EB = {8,
+                                    {
+                                        3,
+                                        4,
+                                        5,
+                                        6,
+                                        7,
+                                        0,
+                                        1,
+                                        2,
+                                    },
+                                    ec_gf8_mul_EB_ops};
+
+static ec_gf_op_t ec_gf8_mul_EC_ops[] = {
+    {EC_GF_OP_XOR2, 0, 5, 0}, {EC_GF_OP_XOR2, 6, 1, 0},
+    {EC_GF_OP_XOR3, 8, 4, 0}, {EC_GF_OP_XOR2, 1, 8, 0},
+    {EC_GF_OP_XOR2, 7, 3, 0}, {EC_GF_OP_XOR2, 6, 2, 0},
+    {EC_GF_OP_XOR2, 3, 8, 0}, {EC_GF_OP_XOR2, 2, 7, 0},
+    {EC_GF_OP_XOR2, 7, 6, 0}, {EC_GF_OP_XOR2, 5, 3, 0},
+    {EC_GF_OP_XOR2, 4, 2, 0}, {EC_GF_OP_XOR2, 6, 0, 0},
+    {EC_GF_OP_XOR2, 3, 7, 0}, {EC_GF_OP_END, 0, 0, 0}};
+
+static ec_gf_mul_t ec_gf8_mul_EC = {9,
+                                    {
+                                        7,
+                                        4,
+                                        3,
+                                        0,
+                                        2,
+                                        5,
+                                        1,
+                                        6,
+                                        8,
+                                    },
+                                    ec_gf8_mul_EC_ops};
+
+static ec_gf_op_t ec_gf8_mul_ED_ops[] = {
+    {EC_GF_OP_XOR2, 5, 3, 0}, {EC_GF_OP_XOR2, 0, 5, 0},
+    {EC_GF_OP_XOR2, 2, 4, 0}, {EC_GF_OP_XOR2, 4, 0, 0},
+    {EC_GF_OP_XOR2, 3, 1, 0}, {EC_GF_OP_XOR2, 6, 4, 0},
+    {EC_GF_OP_XOR2, 3, 6, 0}, {EC_GF_OP_XOR2, 7, 3, 0},
+    {EC_GF_OP_XOR2, 2, 7, 0}, {EC_GF_OP_XOR2, 6, 2, 0},
+    {EC_GF_OP_XOR2, 5, 2, 0}, {EC_GF_OP_XOR2, 1, 6, 0},
+    {EC_GF_OP_END, 0, 0, 0}};
+
+static ec_gf_mul_t ec_gf8_mul_ED = {8,
+                                    {
+                                        5,
+                                        6,
+                                        7,
+                                        0,
+                                        1,
+                                        4,
+                                        3,
+                                        2,
+                                    },
+                                    ec_gf8_mul_ED_ops};
+
+static ec_gf_op_t ec_gf8_mul_EE_ops[] = {
+    {EC_GF_OP_XOR2, 5, 3, 0}, {EC_GF_OP_XOR2, 3, 0, 0},
+    {EC_GF_OP_XOR2, 0, 1, 0}, {EC_GF_OP_XOR3, 8, 2, 3},
+    {EC_GF_OP_XOR2, 0, 4, 0}, {EC_GF_OP_XOR2, 4, 8, 0},
+    {EC_GF_OP_XOR2, 6, 4, 0}, {EC_GF_OP_XOR2, 8, 5, 0},
+    {EC_GF_OP_XOR2, 4, 7, 0}, {EC_GF_OP_XOR2, 5, 6, 0},
+    {EC_GF_OP_XOR2, 1, 8, 0}, {EC_GF_OP_XOR2, 7, 8, 0},
+    {EC_GF_OP_XOR2, 6, 0, 0}, {EC_GF_OP_END, 0, 0, 0}};
+
+static ec_gf_mul_t ec_gf8_mul_EE = {9,
+                                    {
+                                        6,
+                                        4,
+                                        5,
+                                        7,
+                                        2,
+                                        3,
+                                        0,
+                                        1,
+                                        8,
+                                    },
+                                    ec_gf8_mul_EE_ops};
+
+static ec_gf_op_t ec_gf8_mul_EF_ops[] = {
+    {EC_GF_OP_XOR2, 5, 1, 0}, {EC_GF_OP_XOR2, 1, 3, 0},
+    {EC_GF_OP_XOR2, 0, 1, 0}, {EC_GF_OP_COPY, 8, 0, 0},
+    {EC_GF_OP_XOR2, 8, 2, 0}, {EC_GF_OP_XOR2, 0, 5, 0},
+    {EC_GF_OP_XOR2, 2, 4, 0}, {EC_GF_OP_XOR2, 7, 8, 0},
+    {EC_GF_OP_XOR2, 3, 2, 0}, {EC_GF_OP_XOR2, 6, 8, 0},
+    {EC_GF_OP_XOR2, 4, 7, 0}, {EC_GF_OP_XOR2, 3, 6, 0},
+    {EC_GF_OP_XOR2, 7, 5, 0}, {EC_GF_OP_XOR2, 5, 3, 0},
+    {EC_GF_OP_XOR2, 1, 7, 0}, {EC_GF_OP_END, 0, 0, 0}};
+
+static ec_gf_mul_t ec_gf8_mul_EF = {9,
+                                    {
+                                        6,
+                                        4,
+                                        5,
+                                        7,
+                                        2,
+                                        0,
+                                        3,
+                                        1,
+                                        8,
+                                    },
+                                    ec_gf8_mul_EF_ops};
+
+static ec_gf_op_t ec_gf8_mul_F0_ops[] = {
+    {EC_GF_OP_XOR2, 6, 1, 0}, {EC_GF_OP_XOR2, 6, 2, 0},
+    {EC_GF_OP_XOR2, 3, 0, 0}, {EC_GF_OP_XOR3, 8, 3, 6},
+    {EC_GF_OP_XOR2, 5, 8, 0}, {EC_GF_OP_XOR2, 8, 4, 0},
+    {EC_GF_OP_XOR2, 1, 5, 0}, {EC_GF_OP_XOR2, 7, 8, 0},
+    {EC_GF_OP_XOR2, 0, 1, 0}, {EC_GF_OP_XOR2, 2, 7, 0},
+    {EC_GF_OP_XOR2, 4, 0, 0}, {EC_GF_OP_XOR2, 1, 8, 0},
+    {EC_GF_OP_XOR2, 0, 2, 0}, {EC_GF_OP_XOR2, 3, 0, 0},
+    {EC_GF_OP_END, 0, 0, 0}};
+
+static ec_gf_mul_t ec_gf8_mul_F0 = {9,
+                                    {
+                                        3,
+                                        4,
+                                        6,
+                                        1,
+                                        2,
+                                        0,
+                                        5,
+                                        7,
+                                        8,
+                                    },
+                                    ec_gf8_mul_F0_ops};
+
+static ec_gf_op_t ec_gf8_mul_F1_ops[] = {
+    {EC_GF_OP_XOR2, 3, 1, 0}, {EC_GF_OP_XOR2, 3, 5, 0},
+    {EC_GF_OP_COPY, 8, 3, 0}, {EC_GF_OP_XOR2, 3, 4, 0},
+    {EC_GF_OP_XOR2, 2, 3, 0}, {EC_GF_OP_COPY, 9, 2, 0},
+    {EC_GF_OP_XOR2, 2, 6, 0}, {EC_GF_OP_XOR2, 9, 0, 0},
+    {EC_GF_OP_XOR2, 6, 1, 0}, {EC_GF_OP_XOR2, 5, 2, 0},
+    {EC_GF_OP_XOR2, 7, 9, 0}, {EC_GF_OP_XOR2, 4, 9, 0},
+    {EC_GF_OP_XOR2, 0, 5, 0}, {EC_GF_OP_XOR3, 9, 8, 7},
+    {EC_GF_OP_XOR2, 1, 9, 0}, {EC_GF_OP_XOR2, 5, 9, 0},
+    {EC_GF_OP_END, 0, 0, 0}};
+
+static ec_gf_mul_t ec_gf8_mul_F1 = {10,
+                                    {
+                                        7,
+                                        2,
+                                        6,
+                                        3,
+                                        5,
+                                        1,
+                                        4,
+                                        0,
+                                        8,
+                                        9,
+                                    },
+                                    ec_gf8_mul_F1_ops};
+
+static ec_gf_op_t ec_gf8_mul_F2_ops[] = {
+    {EC_GF_OP_XOR2, 0, 1, 0}, {EC_GF_OP_XOR2, 5, 4, 0},
+    {EC_GF_OP_XOR2, 1, 5, 0}, {EC_GF_OP_XOR2, 7, 2, 0},
+    {EC_GF_OP_XOR2, 0, 6, 0}, {EC_GF_OP_XOR2, 6, 7, 0},
+    {EC_GF_OP_XOR2, 4, 0, 0}, {EC_GF_OP_XOR2, 2, 3, 0},
+    {EC_GF_OP_XOR2, 7, 1, 0}, {EC_GF_OP_XOR3, 8, 6, 4},
+    {EC_GF_OP_XOR2, 1, 2, 0}, {EC_GF_OP_XOR2, 2, 0, 0},
+    {EC_GF_OP_XOR2, 3, 8, 0}, {EC_GF_OP_XOR2, 5, 8, 0},
+    {EC_GF_OP_XOR2, 0, 1, 0}, {EC_GF_OP_END, 0, 0, 0}};
+
+static ec_gf_mul_t ec_gf8_mul_F2 = {9,
+                                    {
+                                        1,
+                                        0,
+                                        6,
+                                        7,
+                                        4,
+                                        5,
+                                        2,
+                                        3,
+                                        8,
+                                    },
+                                    ec_gf8_mul_F2_ops};
+
+static ec_gf_op_t ec_gf8_mul_F3_ops[] = { /* presumably GF(2^8) mul by 0xF3 */
+    {EC_GF_OP_XOR2, 1, 0, 0}, {EC_GF_OP_XOR2, 2, 1, 0},
+    {EC_GF_OP_XOR2, 3, 2, 0}, {EC_GF_OP_XOR2, 4, 3, 0},
+    {EC_GF_OP_XOR2, 2, 7, 0}, {EC_GF_OP_XOR2, 0, 7, 0},
+    {EC_GF_OP_XOR2, 5, 4, 0}, {EC_GF_OP_XOR2, 1, 6, 0},
+    {EC_GF_OP_XOR2, 7, 6, 0}, {EC_GF_OP_XOR2, 0, 5, 0},
+    {EC_GF_OP_XOR2, 6, 5, 0}, {EC_GF_OP_END, 0, 0, 0}};
+
+static ec_gf_mul_t ec_gf8_mul_F3 = {8, /* ec_gf8_mul[0xF3]; 8 entries below */
+                                    {
+                                        5,
+                                        6,
+                                        7,
+                                        0,
+                                        1,
+                                        2,
+                                        3,
+                                        4,
+                                    },
+                                    ec_gf8_mul_F3_ops};
+
+static ec_gf_op_t ec_gf8_mul_F4_ops[] = { /* presumably GF(2^8) mul by 0xF4 */
+    {EC_GF_OP_XOR2, 1, 0, 0}, {EC_GF_OP_XOR2, 2, 1, 0},
+    {EC_GF_OP_XOR2, 3, 2, 0}, {EC_GF_OP_XOR2, 4, 3, 0},
+    {EC_GF_OP_XOR2, 5, 4, 0}, {EC_GF_OP_XOR2, 6, 5, 0},
+    {EC_GF_OP_XOR2, 7, 6, 0}, {EC_GF_OP_XOR2, 0, 7, 0},
+    {EC_GF_OP_XOR2, 1, 7, 0}, {EC_GF_OP_XOR2, 3, 7, 0},
+    {EC_GF_OP_END, 0, 0, 0}};
+
+static ec_gf_mul_t ec_gf8_mul_F4 = {8, /* ec_gf8_mul[0xF4]; 8 entries below */
+                                    {
+                                        0,
+                                        1,
+                                        2,
+                                        3,
+                                        4,
+                                        5,
+                                        6,
+                                        7,
+                                    },
+                                    ec_gf8_mul_F4_ops};
+
+static ec_gf_op_t ec_gf8_mul_F5_ops[] = { /* presumably GF(2^8) mul by 0xF5 */
+    {EC_GF_OP_XOR2, 1, 0, 0}, {EC_GF_OP_XOR2, 2, 1, 0},
+    {EC_GF_OP_XOR2, 3, 2, 0}, {EC_GF_OP_XOR2, 4, 3, 0},
+    {EC_GF_OP_XOR2, 5, 4, 0}, {EC_GF_OP_XOR2, 6, 5, 0},
+    {EC_GF_OP_XOR2, 7, 6, 0}, {EC_GF_OP_XOR2, 0, 7, 0},
+    {EC_GF_OP_XOR2, 2, 7, 0}, {EC_GF_OP_END, 0, 0, 0}};
+
+static ec_gf_mul_t ec_gf8_mul_F5 = {8, /* ec_gf8_mul[0xF5]; 8 entries below */
+                                    {
+                                        7,
+                                        0,
+                                        1,
+                                        2,
+                                        3,
+                                        4,
+                                        5,
+                                        6,
+                                    },
+                                    ec_gf8_mul_F5_ops};
+
+static ec_gf_op_t ec_gf8_mul_F6_ops[] = { /* presumably GF(2^8) mul by 0xF6 */
+    {EC_GF_OP_XOR2, 3, 1, 0}, {EC_GF_OP_COPY, 8, 3, 0},
+    {EC_GF_OP_XOR2, 3, 5, 0}, {EC_GF_OP_XOR2, 2, 0, 0},
+    {EC_GF_OP_COPY, 9, 3, 0}, {EC_GF_OP_XOR2, 3, 2, 0},
+    {EC_GF_OP_XOR2, 2, 7, 0}, {EC_GF_OP_XOR2, 4, 2, 0},
+    {EC_GF_OP_XOR2, 9, 4, 0}, {EC_GF_OP_XOR2, 4, 1, 0},
+    {EC_GF_OP_XOR2, 6, 9, 0}, {EC_GF_OP_XOR2, 7, 6, 0},
+    {EC_GF_OP_XOR2, 0, 7, 0}, {EC_GF_OP_XOR2, 5, 7, 0},
+    {EC_GF_OP_XOR2, 6, 1, 0}, {EC_GF_OP_XOR3, 7, 8, 0},
+    {EC_GF_OP_END, 0, 0, 0}};
+
+static ec_gf_mul_t ec_gf8_mul_F6 = {10, /* ec_gf8_mul[0xF6]; 10 entries below */
+                                    {
+                                        0,
+                                        6,
+                                        2,
+                                        7,
+                                        4,
+                                        3,
+                                        5,
+                                        9,
+                                        1,
+                                        8,
+                                    },
+                                    ec_gf8_mul_F6_ops};
+
+static ec_gf_op_t ec_gf8_mul_F7_ops[] = { /* presumably GF(2^8) mul by 0xF7 */
+    {EC_GF_OP_XOR2, 1, 0, 0}, {EC_GF_OP_XOR2, 2, 1, 0},
+    {EC_GF_OP_XOR2, 3, 2, 0}, {EC_GF_OP_XOR2, 4, 3, 0},
+    {EC_GF_OP_XOR2, 5, 4, 0}, {EC_GF_OP_XOR2, 6, 5, 0},
+    {EC_GF_OP_XOR2, 0, 7, 0}, {EC_GF_OP_XOR2, 2, 7, 0},
+    {EC_GF_OP_XOR2, 1, 6, 0}, {EC_GF_OP_XOR2, 7, 6, 0},
+    {EC_GF_OP_END, 0, 0, 0}};
+
+static ec_gf_mul_t ec_gf8_mul_F7 = {8, /* ec_gf8_mul[0xF7]; 8 entries below */
+                                    {
+                                        6,
+                                        7,
+                                        0,
+                                        1,
+                                        2,
+                                        3,
+                                        4,
+                                        5,
+                                    },
+                                    ec_gf8_mul_F7_ops};
+
+static ec_gf_op_t ec_gf8_mul_F8_ops[] = { /* presumably GF(2^8) mul by 0xF8 */
+    {EC_GF_OP_XOR2, 4, 0, 0}, {EC_GF_OP_XOR2, 3, 5, 0},
+    {EC_GF_OP_XOR2, 6, 4, 0}, {EC_GF_OP_XOR2, 2, 0, 0},
+    {EC_GF_OP_XOR2, 4, 3, 0}, {EC_GF_OP_XOR2, 1, 6, 0},
+    {EC_GF_OP_XOR2, 2, 4, 0}, {EC_GF_OP_XOR2, 5, 1, 0},
+    {EC_GF_OP_XOR2, 7, 2, 0}, {EC_GF_OP_XOR2, 7, 5, 0},
+    {EC_GF_OP_XOR2, 3, 7, 0}, {EC_GF_OP_XOR2, 6, 7, 0},
+    {EC_GF_OP_XOR2, 0, 3, 0}, {EC_GF_OP_END, 0, 0, 0}};
+
+static ec_gf_mul_t ec_gf8_mul_F8 = {8, /* ec_gf8_mul[0xF8]; 8 entries below */
+                                    {
+                                        6,
+                                        2,
+                                        0,
+                                        1,
+                                        4,
+                                        5,
+                                        3,
+                                        7,
+                                    },
+                                    ec_gf8_mul_F8_ops};
+
+static ec_gf_op_t ec_gf8_mul_F9_ops[] = { /* presumably GF(2^8) mul by 0xF9 */
+    {EC_GF_OP_XOR2, 1, 5, 0}, {EC_GF_OP_XOR2, 5, 3, 0},
+    {EC_GF_OP_XOR2, 6, 1, 0}, {EC_GF_OP_XOR2, 0, 5, 0},
+    {EC_GF_OP_XOR2, 7, 6, 0}, {EC_GF_OP_XOR2, 6, 0, 0},
+    {EC_GF_OP_XOR2, 2, 6, 0}, {EC_GF_OP_XOR2, 6, 4, 0},
+    {EC_GF_OP_XOR2, 1, 2, 0}, {EC_GF_OP_XOR2, 3, 6, 0},
+    {EC_GF_OP_XOR3, 8, 7, 1}, {EC_GF_OP_XOR2, 1, 3, 0},
+    {EC_GF_OP_XOR2, 4, 8, 0}, {EC_GF_OP_XOR2, 5, 8, 0},
+    {EC_GF_OP_END, 0, 0, 0}};
+
+static ec_gf_mul_t ec_gf8_mul_F9 = {9, /* ec_gf8_mul[0xF9]; 9 entries below */
+                                    {
+                                        4,
+                                        1,
+                                        7,
+                                        6,
+                                        0,
+                                        3,
+                                        5,
+                                        2,
+                                        8,
+                                    },
+                                    ec_gf8_mul_F9_ops};
+
+static ec_gf_op_t ec_gf8_mul_FA_ops[] = { /* presumably GF(2^8) mul by 0xFA */
+    {EC_GF_OP_XOR2, 1, 0, 0}, {EC_GF_OP_XOR2, 0, 4, 0},
+    {EC_GF_OP_XOR2, 2, 1, 0}, {EC_GF_OP_XOR2, 0, 7, 0},
+    {EC_GF_OP_XOR2, 7, 2, 0}, {EC_GF_OP_XOR2, 1, 5, 0},
+    {EC_GF_OP_XOR2, 3, 7, 0}, {EC_GF_OP_XOR2, 5, 0, 0},
+    {EC_GF_OP_XOR2, 7, 6, 0}, {EC_GF_OP_XOR2, 0, 3, 0},
+    {EC_GF_OP_XOR2, 6, 1, 0}, {EC_GF_OP_XOR2, 4, 7, 0},
+    {EC_GF_OP_XOR2, 1, 0, 0}, {EC_GF_OP_XOR2, 2, 6, 0},
+    {EC_GF_OP_END, 0, 0, 0}};
+
+static ec_gf_mul_t ec_gf8_mul_FA = {8, /* ec_gf8_mul[0xFA]; 8 entries below */
+                                    {
+                                        0,
+                                        1,
+                                        2,
+                                        4,
+                                        5,
+                                        6,
+                                        7,
+                                        3,
+                                    },
+                                    ec_gf8_mul_FA_ops};
+
+static ec_gf_op_t ec_gf8_mul_FB_ops[] = { /* presumably GF(2^8) mul by 0xFB */
+    {EC_GF_OP_XOR2, 1, 0, 0}, {EC_GF_OP_XOR2, 2, 1, 0},
+    {EC_GF_OP_XOR2, 0, 5, 0}, {EC_GF_OP_XOR2, 3, 2, 0},
+    {EC_GF_OP_XOR2, 0, 7, 0}, {EC_GF_OP_XOR2, 2, 7, 0},
+    {EC_GF_OP_XOR2, 1, 6, 0}, {EC_GF_OP_XOR2, 7, 6, 0},
+    {EC_GF_OP_XOR2, 4, 3, 0}, {EC_GF_OP_XOR2, 6, 5, 0},
+    {EC_GF_OP_XOR2, 7, 4, 0}, {EC_GF_OP_XOR2, 5, 4, 0},
+    {EC_GF_OP_END, 0, 0, 0}};
+
+static ec_gf_mul_t ec_gf8_mul_FB = {8, /* ec_gf8_mul[0xFB]; 8 entries below */
+                                    {
+                                        4,
+                                        5,
+                                        6,
+                                        7,
+                                        0,
+                                        1,
+                                        2,
+                                        3,
+                                    },
+                                    ec_gf8_mul_FB_ops};
+
+static ec_gf_op_t ec_gf8_mul_FC_ops[] = { /* presumably GF(2^8) mul by 0xFC */
+    {EC_GF_OP_XOR2, 7, 0, 0}, {EC_GF_OP_XOR2, 7, 4, 0},
+    {EC_GF_OP_XOR2, 5, 1, 0}, {EC_GF_OP_COPY, 9, 3, 0},
+    {EC_GF_OP_XOR3, 8, 5, 7}, {EC_GF_OP_XOR2, 3, 6, 0},
+    {EC_GF_OP_XOR2, 8, 3, 0}, {EC_GF_OP_XOR2, 2, 8, 0},
+    {EC_GF_OP_XOR2, 1, 2, 0}, {EC_GF_OP_XOR2, 4, 2, 0},
+    {EC_GF_OP_XOR2, 0, 1, 0}, {EC_GF_OP_XOR2, 3, 4, 0},
+    {EC_GF_OP_XOR2, 5, 0, 0}, {EC_GF_OP_XOR2, 6, 0, 0},
+    {EC_GF_OP_XOR3, 0, 9, 2}, {EC_GF_OP_END, 0, 0, 0}};
+
+static ec_gf_mul_t ec_gf8_mul_FC = {10, /* ec_gf8_mul[0xFC]; 10 entries below */
+                                    {
+                                        5,
+                                        6,
+                                        3,
+                                        7,
+                                        1,
+                                        8,
+                                        0,
+                                        4,
+                                        2,
+                                        9,
+                                    },
+                                    ec_gf8_mul_FC_ops};
+
+static ec_gf_op_t ec_gf8_mul_FD_ops[] = { /* presumably GF(2^8) mul by 0xFD */
+    {EC_GF_OP_XOR2, 7, 1, 0}, {EC_GF_OP_COPY, 8, 7, 0},
+    {EC_GF_OP_XOR2, 5, 0, 0}, {EC_GF_OP_XOR2, 7, 5, 0},
+    {EC_GF_OP_XOR2, 4, 2, 0}, {EC_GF_OP_XOR2, 4, 7, 0},
+    {EC_GF_OP_XOR2, 5, 6, 0}, {EC_GF_OP_XOR2, 0, 4, 0},
+    {EC_GF_OP_XOR2, 3, 0, 0}, {EC_GF_OP_XOR2, 5, 3, 0},
+    {EC_GF_OP_XOR2, 2, 5, 0}, {EC_GF_OP_XOR2, 1, 2, 0},
+    {EC_GF_OP_XOR2, 0, 1, 0}, {EC_GF_OP_XOR2, 6, 1, 0},
+    {EC_GF_OP_XOR3, 1, 8, 0}, {EC_GF_OP_END, 0, 0, 0}};
+
+static ec_gf_mul_t ec_gf8_mul_FD = {9, /* ec_gf8_mul[0xFD]; 9 entries below */
+                                    {
+                                        5,
+                                        3,
+                                        7,
+                                        6,
+                                        1,
+                                        2,
+                                        4,
+                                        0,
+                                        8,
+                                    },
+                                    ec_gf8_mul_FD_ops};
+
+static ec_gf_op_t ec_gf8_mul_FE_ops[] = { /* presumably GF(2^8) mul by 0xFE */
+    {EC_GF_OP_XOR2, 2, 0, 0}, {EC_GF_OP_COPY, 8, 2, 0},
+    {EC_GF_OP_XOR2, 2, 4, 0}, {EC_GF_OP_XOR2, 6, 2, 0},
+    {EC_GF_OP_XOR2, 8, 5, 0}, {EC_GF_OP_XOR2, 5, 6, 0},
+    {EC_GF_OP_XOR2, 6, 1, 0}, {EC_GF_OP_XOR2, 0, 6, 0},
+    {EC_GF_OP_XOR2, 6, 7, 0}, {EC_GF_OP_XOR2, 7, 3, 0},
+    {EC_GF_OP_XOR2, 7, 8, 0}, {EC_GF_OP_XOR2, 3, 0, 0},
+    {EC_GF_OP_XOR2, 4, 7, 0}, {EC_GF_OP_XOR2, 1, 7, 0},
+    {EC_GF_OP_XOR2, 0, 4, 0}, {EC_GF_OP_END, 0, 0, 0}};
+
+static ec_gf_mul_t ec_gf8_mul_FE = {9, /* ec_gf8_mul[0xFE]; 9 entries below */
+                                    {
+                                        3,
+                                        4,
+                                        8,
+                                        2,
+                                        5,
+                                        0,
+                                        6,
+                                        1,
+                                        7,
+                                    },
+                                    ec_gf8_mul_FE_ops};
+
+static ec_gf_op_t ec_gf8_mul_FF_ops[] = { /* presumably GF(2^8) mul by 0xFF */
+    {EC_GF_OP_XOR2, 4, 7, 0}, {EC_GF_OP_COPY, 9, 0, 0},
+    {EC_GF_OP_COPY, 8, 4, 0}, {EC_GF_OP_XOR2, 9, 1, 0},
+    {EC_GF_OP_XOR2, 4, 2, 0}, {EC_GF_OP_XOR2, 9, 4, 0},
+    {EC_GF_OP_XOR2, 0, 5, 0}, {EC_GF_OP_XOR2, 2, 0, 0},
+    {EC_GF_OP_XOR2, 3, 9, 0}, {EC_GF_OP_XOR2, 7, 3, 0},
+    {EC_GF_OP_XOR2, 2, 6, 0}, {EC_GF_OP_XOR2, 5, 3, 0},
+    {EC_GF_OP_XOR2, 6, 7, 0}, {EC_GF_OP_XOR2, 1, 7, 0},
+    {EC_GF_OP_XOR3, 3, 8, 5}, {EC_GF_OP_XOR2, 4, 6, 0},
+    {EC_GF_OP_END, 0, 0, 0}};
+
+static ec_gf_mul_t ec_gf8_mul_FF = {10, /* ec_gf8_mul[0xFF]; 10 entries below */
+                                    {
+                                        6,
+                                        5,
+                                        0,
+                                        1,
+                                        2,
+                                        4,
+                                        9,
+                                        3,
+                                        7,
+                                        8,
+                                    },
+                                    ec_gf8_mul_FF_ops};
+
+ec_gf_mul_t *ec_gf8_mul[] = { /* 256 entries; index == hex suffix 00..FF */
+    &ec_gf8_mul_00, &ec_gf8_mul_01, &ec_gf8_mul_02, &ec_gf8_mul_03,
+    &ec_gf8_mul_04, &ec_gf8_mul_05, &ec_gf8_mul_06, &ec_gf8_mul_07,
+    &ec_gf8_mul_08, &ec_gf8_mul_09, &ec_gf8_mul_0A, &ec_gf8_mul_0B,
+    &ec_gf8_mul_0C, &ec_gf8_mul_0D, &ec_gf8_mul_0E, &ec_gf8_mul_0F,
+    &ec_gf8_mul_10, &ec_gf8_mul_11, &ec_gf8_mul_12, &ec_gf8_mul_13,
+    &ec_gf8_mul_14, &ec_gf8_mul_15, &ec_gf8_mul_16, &ec_gf8_mul_17,
+    &ec_gf8_mul_18, &ec_gf8_mul_19, &ec_gf8_mul_1A, &ec_gf8_mul_1B,
+    &ec_gf8_mul_1C, &ec_gf8_mul_1D, &ec_gf8_mul_1E, &ec_gf8_mul_1F,
+    &ec_gf8_mul_20, &ec_gf8_mul_21, &ec_gf8_mul_22, &ec_gf8_mul_23,
+    &ec_gf8_mul_24, &ec_gf8_mul_25, &ec_gf8_mul_26, &ec_gf8_mul_27,
+    &ec_gf8_mul_28, &ec_gf8_mul_29, &ec_gf8_mul_2A, &ec_gf8_mul_2B,
+    &ec_gf8_mul_2C, &ec_gf8_mul_2D, &ec_gf8_mul_2E, &ec_gf8_mul_2F,
+    &ec_gf8_mul_30, &ec_gf8_mul_31, &ec_gf8_mul_32, &ec_gf8_mul_33,
+    &ec_gf8_mul_34, &ec_gf8_mul_35, &ec_gf8_mul_36, &ec_gf8_mul_37,
+    &ec_gf8_mul_38, &ec_gf8_mul_39, &ec_gf8_mul_3A, &ec_gf8_mul_3B,
+    &ec_gf8_mul_3C, &ec_gf8_mul_3D, &ec_gf8_mul_3E, &ec_gf8_mul_3F,
+    &ec_gf8_mul_40, &ec_gf8_mul_41, &ec_gf8_mul_42, &ec_gf8_mul_43,
+    &ec_gf8_mul_44, &ec_gf8_mul_45, &ec_gf8_mul_46, &ec_gf8_mul_47,
+    &ec_gf8_mul_48, &ec_gf8_mul_49, &ec_gf8_mul_4A, &ec_gf8_mul_4B,
+    &ec_gf8_mul_4C, &ec_gf8_mul_4D, &ec_gf8_mul_4E, &ec_gf8_mul_4F,
+    &ec_gf8_mul_50, &ec_gf8_mul_51, &ec_gf8_mul_52, &ec_gf8_mul_53,
+    &ec_gf8_mul_54, &ec_gf8_mul_55, &ec_gf8_mul_56, &ec_gf8_mul_57,
+    &ec_gf8_mul_58, &ec_gf8_mul_59, &ec_gf8_mul_5A, &ec_gf8_mul_5B,
+    &ec_gf8_mul_5C, &ec_gf8_mul_5D, &ec_gf8_mul_5E, &ec_gf8_mul_5F,
+    &ec_gf8_mul_60, &ec_gf8_mul_61, &ec_gf8_mul_62, &ec_gf8_mul_63,
+    &ec_gf8_mul_64, &ec_gf8_mul_65, &ec_gf8_mul_66, &ec_gf8_mul_67,
+    &ec_gf8_mul_68, &ec_gf8_mul_69, &ec_gf8_mul_6A, &ec_gf8_mul_6B,
+    &ec_gf8_mul_6C, &ec_gf8_mul_6D, &ec_gf8_mul_6E, &ec_gf8_mul_6F,
+    &ec_gf8_mul_70, &ec_gf8_mul_71, &ec_gf8_mul_72, &ec_gf8_mul_73,
+    &ec_gf8_mul_74, &ec_gf8_mul_75, &ec_gf8_mul_76, &ec_gf8_mul_77,
+    &ec_gf8_mul_78, &ec_gf8_mul_79, &ec_gf8_mul_7A, &ec_gf8_mul_7B,
+    &ec_gf8_mul_7C, &ec_gf8_mul_7D, &ec_gf8_mul_7E, &ec_gf8_mul_7F,
+    &ec_gf8_mul_80, &ec_gf8_mul_81, &ec_gf8_mul_82, &ec_gf8_mul_83,
+    &ec_gf8_mul_84, &ec_gf8_mul_85, &ec_gf8_mul_86, &ec_gf8_mul_87,
+    &ec_gf8_mul_88, &ec_gf8_mul_89, &ec_gf8_mul_8A, &ec_gf8_mul_8B,
+    &ec_gf8_mul_8C, &ec_gf8_mul_8D, &ec_gf8_mul_8E, &ec_gf8_mul_8F,
+    &ec_gf8_mul_90, &ec_gf8_mul_91, &ec_gf8_mul_92, &ec_gf8_mul_93,
+    &ec_gf8_mul_94, &ec_gf8_mul_95, &ec_gf8_mul_96, &ec_gf8_mul_97,
+    &ec_gf8_mul_98, &ec_gf8_mul_99, &ec_gf8_mul_9A, &ec_gf8_mul_9B,
+    &ec_gf8_mul_9C, &ec_gf8_mul_9D, &ec_gf8_mul_9E, &ec_gf8_mul_9F,
+    &ec_gf8_mul_A0, &ec_gf8_mul_A1, &ec_gf8_mul_A2, &ec_gf8_mul_A3,
+    &ec_gf8_mul_A4, &ec_gf8_mul_A5, &ec_gf8_mul_A6, &ec_gf8_mul_A7,
+    &ec_gf8_mul_A8, &ec_gf8_mul_A9, &ec_gf8_mul_AA, &ec_gf8_mul_AB,
+    &ec_gf8_mul_AC, &ec_gf8_mul_AD, &ec_gf8_mul_AE, &ec_gf8_mul_AF,
+    &ec_gf8_mul_B0, &ec_gf8_mul_B1, &ec_gf8_mul_B2, &ec_gf8_mul_B3,
+    &ec_gf8_mul_B4, &ec_gf8_mul_B5, &ec_gf8_mul_B6, &ec_gf8_mul_B7,
+    &ec_gf8_mul_B8, &ec_gf8_mul_B9, &ec_gf8_mul_BA, &ec_gf8_mul_BB,
+    &ec_gf8_mul_BC, &ec_gf8_mul_BD, &ec_gf8_mul_BE, &ec_gf8_mul_BF,
+    &ec_gf8_mul_C0, &ec_gf8_mul_C1, &ec_gf8_mul_C2, &ec_gf8_mul_C3,
+    &ec_gf8_mul_C4, &ec_gf8_mul_C5, &ec_gf8_mul_C6, &ec_gf8_mul_C7,
+    &ec_gf8_mul_C8, &ec_gf8_mul_C9, &ec_gf8_mul_CA, &ec_gf8_mul_CB,
+    &ec_gf8_mul_CC, &ec_gf8_mul_CD, &ec_gf8_mul_CE, &ec_gf8_mul_CF,
+    &ec_gf8_mul_D0, &ec_gf8_mul_D1, &ec_gf8_mul_D2, &ec_gf8_mul_D3,
+    &ec_gf8_mul_D4, &ec_gf8_mul_D5, &ec_gf8_mul_D6, &ec_gf8_mul_D7,
+    &ec_gf8_mul_D8, &ec_gf8_mul_D9, &ec_gf8_mul_DA, &ec_gf8_mul_DB,
+    &ec_gf8_mul_DC, &ec_gf8_mul_DD, &ec_gf8_mul_DE, &ec_gf8_mul_DF,
+    &ec_gf8_mul_E0, &ec_gf8_mul_E1, &ec_gf8_mul_E2, &ec_gf8_mul_E3,
+    &ec_gf8_mul_E4, &ec_gf8_mul_E5, &ec_gf8_mul_E6, &ec_gf8_mul_E7,
+    &ec_gf8_mul_E8, &ec_gf8_mul_E9, &ec_gf8_mul_EA, &ec_gf8_mul_EB,
+    &ec_gf8_mul_EC, &ec_gf8_mul_ED, &ec_gf8_mul_EE, &ec_gf8_mul_EF,
+    &ec_gf8_mul_F0, &ec_gf8_mul_F1, &ec_gf8_mul_F2, &ec_gf8_mul_F3,
+    &ec_gf8_mul_F4, &ec_gf8_mul_F5, &ec_gf8_mul_F6, &ec_gf8_mul_F7,
+    &ec_gf8_mul_F8, &ec_gf8_mul_F9, &ec_gf8_mul_FA, &ec_gf8_mul_FB,
+    &ec_gf8_mul_FC, &ec_gf8_mul_FD, &ec_gf8_mul_FE, &ec_gf8_mul_FF};
diff --git a/xlators/cluster/ec/src/ec-gf8.h b/xlators/cluster/ec/src/ec-gf8.h
new file mode 100644
index 00000000000..4aca91127fc
--- /dev/null
+++ b/xlators/cluster/ec/src/ec-gf8.h
@@ -0,0 +1,18 @@
+/*
+  Copyright (c) 2015 DataLab, s.l. <http://www.datalab.es>
+  This file is part of GlusterFS.
+
+  This file is licensed to you under your choice of the GNU Lesser
+  General Public License, version 3 or any later version (LGPLv3 or
+  later), or the GNU General Public License, version 2 (GPLv2), in all
+  cases as published by the Free Software Foundation.
+*/
+
+#ifndef __EC_GF8_H__
+#define __EC_GF8_H__
+
+#include "ec-galois.h"
+
+extern ec_gf_mul_t *ec_gf8_mul[]; /* one entry per ec_gf8_mul_XX, XX = 00..FF */
+
+#endif /* __EC_GF8_H__ */
diff --git a/xlators/cluster/ec/src/ec-heal.c b/xlators/cluster/ec/src/ec-heal.c
new file mode 100644
index 00000000000..7d991f04aac
--- /dev/null
+++ b/xlators/cluster/ec/src/ec-heal.c
@@ -0,0 +1,3367 @@
+/*
+  Copyright (c) 2012-2014 DataLab, s.l. <http://www.datalab.es>
+  This file is part of GlusterFS.
+
+  This file is licensed to you under your choice of the GNU Lesser
+  General Public License, version 3 or any later version (LGPLv3 or
+  later), or the GNU General Public License, version 2 (GPLv2), in all
+  cases as published by the Free Software Foundation.
+*/
+
+#include <glusterfs/defaults.h>
+#include <glusterfs/compat-errno.h>
+#include <glusterfs/byte-order.h>
+#include <glusterfs/syncop.h>
+#include <glusterfs/syncop-utils.h>
+#include <glusterfs/cluster-syncop.h>
+
+#include "ec.h"
+#include "ec-types.h"
+#include "ec-messages.h"
+#include "ec-helpers.h"
+#include "ec-common.h"
+#include "ec-combine.h"
+#include "ec-method.h"
+#include "ec-fops.h"
+#include "ec-heald.h"
+
+/* Number of non-zero entries among the first 'max' elements of 'array'. */
+#define EC_COUNT(array, max)                                                   \
+    ({                                                                         \
+        int __i, __res = 0;                                                    \
+        for (__i = 0; __i < (max); __i++)                                      \
+            if ((array)[__i])                                                  \
+                __res++;                                                       \
+        __res;                                                                 \
+    })
+#define EC_INTERSECT(dst, src1, src2, max) /* dst[i] = src1[i] && src2[i] */   \
+    ({                                                                         \
+        int __i;                                                               \
+        for (__i = 0; __i < (max); __i++)                                      \
+            (dst)[__i] = (src1)[__i] && (src2)[__i];                           \
+    })
+/* When sources[source] is 0, re-pick 'source' as the highest i < max with a
+ * non-zero sources[i], or -1 when there is none. 'source' is assigned to. */
+#define EC_ADJUST_SOURCE(source, sources, max)                                 \
+    ({                                                                         \
+        int __i;                                                               \
+        if ((sources)[(source)] == 0)                                          \
+            for ((source) = -1, __i = 0; __i < (max); __i++)                   \
+                if ((sources)[__i])                                            \
+                    (source) = __i;                                            \
+    })
+#define IA_EQUAL(f, s, field) /* bytewise compare of the ia_<field> members */ \
+    (memcmp(&((f).ia_##field), &((s).ia_##field), sizeof((s).ia_##field)) == 0)
+#define EC_REPLIES_ALLOC(replies, numsubvols) /* alloca0 + init list heads */  \
+    do {                                                                       \
+        int __i = 0;                                                           \
+        (replies) = alloca0((numsubvols) * sizeof(*(replies)));                \
+        for (__i = 0; __i < (numsubvols); __i++)                               \
+            INIT_LIST_HEAD(&(replies)[__i].entries.list);                      \
+    } while (0)
+
+struct ec_name_data { /* presumably name-heal bookkeeping - TODO confirm */
+    call_frame_t *frame;
+    unsigned char *participants;
+    unsigned char *failed_on;
+    unsigned char *gfidless;
+    unsigned char *enoent;
+    unsigned char *same;
+    char *name;
+    inode_t *parent;
+    default_args_cbk_t *replies;
+    uint32_t heal_pending;
+};
+
+static char *ec_ignore_xattrs[] = {GF_SELINUX_XATTR_KEY, QUOTA_SIZE_KEY, NULL};
+
+/* Tell whether 'key' starts with EC_XATTR_PREFIX or equals an entry of
+ * ec_ignore_xattrs[]. A NULL key never matches; dict/val/mdata are unused. */
+static gf_boolean_t
+ec_ignorable_key_match(dict_t *dict, char *key, data_t *val, void *mdata)
+{
+    char **name = ec_ignore_xattrs;
+
+    if (key == NULL)
+        return _gf_false;
+    if (strncmp(key, EC_XATTR_PREFIX, SLEN(EC_XATTR_PREFIX)) == 0)
+        return _gf_true;
+
+    for (; *name != NULL; name++) {
+        if (strcmp(key, *name) == 0)
+            return _gf_true;
+    }
+
+    return _gf_false;
+}
+
+static gf_boolean_t
+ec_sh_key_match(dict_t *dict, char *key, data_t *val, void *mdata)
+{ /* Logical negation of ec_ignorable_key_match() for the same arguments. */
+    return !ec_ignorable_key_match(dict, key, val, mdata);
+}
+/* FOP: heal */
+
+/* Add one to heal_count in the EC inode context of fop->loc[0].inode while
+ * holding that inode's lock. Nothing happens when fop is NULL or when
+ * __ec_inode_get() returns no context. */
+void
+ec_set_entry_healing(ec_fop_data_t *fop)
+{
+    ec_inode_t *ec_ctx = NULL;
+    inode_t *inode = NULL;
+
+    if (fop == NULL)
+        return;
+
+    inode = fop->loc[0].inode;
+    LOCK(&inode->lock);
+    ec_ctx = __ec_inode_get(inode, fop->xl);
+    if (ec_ctx != NULL)
+        ec_ctx->heal_count++;
+    UNLOCK(&inode->lock);
+}
+
+/* Subtract one from heal_count in the EC inode context of fop->loc[0].inode
+ * under the inode lock, then assert that the new count is not negative. */
+void
+ec_reset_entry_healing(ec_fop_data_t *fop)
+{
+    ec_inode_t *ec_ctx = NULL;
+    inode_t *inode = NULL;
+    int32_t remaining = 0;
+
+    if (fop == NULL)
+        return;
+
+    inode = fop->loc[0].inode;
+    LOCK(&inode->lock);
+    ec_ctx = __ec_inode_get(inode, fop->xl);
+    if (ec_ctx != NULL)
+        remaining = --ec_ctx->heal_count;
+    UNLOCK(&inode->lock);
+
+    GF_ASSERT(remaining >= 0);
+}
+
+/* OR together the masks of the answers in fop->cbk_list, split by outcome:
+ * returns the op_ret < 0 set; the op_ret >= 0 set goes to *pgood if not NULL. */
+uintptr_t
+ec_heal_check(ec_fop_data_t *fop, uintptr_t *pgood)
+{
+    ec_cbk_data_t *ans;
+    uintptr_t failed = 0, succeeded = 0;
+
+    list_for_each_entry(ans, &fop->cbk_list, list)
+        if (ans->op_ret >= 0)
+            succeeded |= ans->mask;
+        else
+            failed |= ans->mask;
+    if (pgood != NULL)
+        *pgood = succeeded;
+    return failed;
+}
+
+/* Account for the answers to 'fop' in its heal (fop->data), under heal->lock:
+ * bits that failed are removed from heal->bad and, when is_open is non-zero,
+ * bits that succeeded are added to heal->open. fop->error is reset to 0. */
+void
+ec_heal_update(ec_fop_data_t *fop, int32_t is_open)
+{
+    ec_heal_t *heal = fop->data;
+    uintptr_t succeeded = 0, failed = 0;
+
+    failed = ec_heal_check(fop, &succeeded);
+
+    LOCK(&heal->lock);
+    heal->bad &= ~failed;
+    if (is_open)
+        heal->open |= succeeded;
+    UNLOCK(&heal->lock);
+
+    fop->error = 0;
+}
+
+/* Remove from heal->good, under heal->lock, every mask bit that
+ * ec_heal_check() reports as failed for 'fop'. */
+void
+ec_heal_avoid(ec_fop_data_t *fop)
+{
+    ec_heal_t *heal = fop->data;
+    uintptr_t failed = 0;
+
+    failed = ec_heal_check(fop, NULL);
+
+    LOCK(&heal->lock);
+    heal->good &= ~failed;
+    UNLOCK(&heal->lock);
+}
+
+/* Lock callback: on success (op_ret >= 0) hand heal->total_size to
+ * ec_set_inode_size() for heal->fd->inode. Always returns 0. */
+int32_t
+ec_heal_lock_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+                 int32_t op_ret, int32_t op_errno, dict_t *xdata)
+{
+    ec_heal_t *heal = ((ec_fop_data_t *)cookie)->data;
+    int ok = 1; /* result kept out of GF_ASSERT so the call always runs */
+
+    if (op_ret >= 0)
+        ok = !!ec_set_inode_size(heal->fop, heal->fd->inode, heal->total_size);
+    GF_ASSERT(ok);
+    return 0;
+}
+
+/* Send an inodelk request of 'type' (command F_SETLKW) with l_start = offset
+ * and l_len = size to heal->fop->mask: ec_finodelk() on 'fd' when it is not
+ * NULL, ec_inodelk() on 'loc' otherwise. An F_UNLCK request first clears the
+ * inode info and completes in ec_lock_unlocked; any other type completes in
+ * ec_heal_lock_cbk. */
+void
+ec_heal_lock(ec_heal_t *heal, int32_t type, fd_t *fd, loc_t *loc, off_t offset,
+             size_t size)
+{
+    struct gf_flock flock;
+    fop_inodelk_cbk_t done = ec_heal_lock_cbk;
+    call_frame_t *frame = NULL;
+    inode_t *inode = NULL;
+
+    flock.l_type = type;
+    flock.l_whence = SEEK_SET;
+    flock.l_start = offset;
+    flock.l_len = size;
+    flock.l_pid = 0;
+    flock.l_owner.len = 0;
+
+    if (type == F_UNLCK) {
+        /* Remove inode size information before unlocking it. */
+        inode = (fd == NULL) ? heal->loc.inode : heal->fd->inode;
+        ec_clear_inode_info(heal->fop, inode);
+        done = ec_lock_unlocked;
+    }
+
+    frame = heal->fop->frame;
+    if (fd == NULL) {
+        ec_inodelk(frame, heal->xl, &frame->root->lk_owner, heal->fop->mask,
+                   EC_MINIMUM_ALL, done, heal, heal->xl->name, loc, F_SETLKW,
+                   &flock, NULL);
+    } else {
+        ec_finodelk(frame, heal->xl, &frame->root->lk_owner, heal->fop->mask,
+                    EC_MINIMUM_ALL, done, heal, heal->xl->name, fd, F_SETLKW,
+                    &flock, NULL);
+    }
+}
+
+void
+ec_heal_inodelk(ec_heal_t *heal, int32_t type, int32_t use_fd, off_t offset,
+                size_t size)
+{ /* Lock via heal->fd when use_fd is non-zero, via heal->loc otherwise. */
+    fd_t *fd = use_fd ? heal->fd : NULL;
+    ec_heal_lock(heal, type, fd, &heal->loc, offset, size);
+}
+
+/* Remove 'key' from 'dict' when it is an ignorable xattr or when the dict
+ * passed in 'arg' also holds it. 'data' is unused; always returns 0 (the
+ * signature looks like a dict_foreach()-style callback - TODO confirm). */
+int32_t
+ec_heal_xattr_clean(dict_t *dict, char *key, data_t *data, void *arg)
+{
+    dict_t *base = arg;
+
+    /* dict_get() is only consulted for keys that are not ignorable. */
+    if (ec_ignorable_key_match(NULL, key, NULL, NULL) ||
+        (dict_get(base, key) != NULL))
+        dict_del(dict, key);
+
+    return 0;
+}
+
+/********************************************************************
+ * ec_wind_xattrop_parallel:
+ *     Winds a single xattrop to 'subvol' with dict[child_index] as its
+ *     dict; child_index is the cookie, cluster_xattrop_cbk the callback.
+ *     NOTE(review): presumably run once per child to update in parallel.
+ *******************************************************************/
+void
+ec_wind_xattrop_parallel(call_frame_t *frame, xlator_t *subvol, int child_index,
+                         loc_t *loc, gf_xattrop_flags_t flags, dict_t **dict,
+                         dict_t *xdata)
+{
+    gf_msg_debug("EC", 0, "WIND: on child %d ", child_index);
+    STACK_WIND_COOKIE(
+        frame, cluster_xattrop_cbk, (void *)(uintptr_t)child_index, subvol,
+        subvol->fops->xattrop, loc, flags, dict[child_index], xdata);
+}
+
+/* Write callback of a block heal: traces and debug-logs op_ret/op_errno with
+ * the gfid and heal->offset, then calls ec_heal_update(). Always returns 0. */
+int32_t
+ec_heal_writev_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+                   int32_t op_ret, int32_t op_errno, struct iatt *prebuf,
+                   struct iatt *postbuf, dict_t *xdata)
+{
+    ec_fop_data_t *fop = cookie;
+    ec_heal_t *heal = fop->data;
+
+    ec_trace("WRITE_CBK", fop, "ret=%d, errno=%d", op_ret, op_errno);
+    gf_msg_debug(fop->xl->name, 0,
+                 "%s: write op_ret %d, op_errno %s"
+                 " at %" PRIu64,
+                 uuid_utoa(heal->fd->inode->gfid), op_ret, strerror(op_errno),
+                 heal->offset);
+
+    ec_heal_update(fop, 0);
+    return 0;
+}
+
+/* Read callback of a block heal. With op_ret > 0 the data just read is
+ * written to heal->bad through ec_writev(); otherwise heal->done is set and a
+ * failed read (op_ret < 0) also zeroes heal->bad. Always returns 0. */
+int32_t
+ec_heal_readv_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+                  int32_t op_ret, int32_t op_errno, struct iovec *vector,
+                  int32_t count, struct iatt *stbuf, struct iobref *iobref,
+                  dict_t *xdata)
+{
+    ec_fop_data_t *fop = cookie;
+    ec_heal_t *heal = fop->data;
+
+    ec_trace("READ_CBK", fop, "ret=%d, errno=%d", op_ret, op_errno);
+    ec_heal_avoid(fop);
+    if (op_ret > 0) {
+        gf_msg_debug(fop->xl->name, 0,
+                     "%s: read succeeded, proceeding "
+                     "to write at %" PRIu64,
+                     uuid_utoa(heal->fd->inode->gfid), heal->offset);
+        ec_writev(heal->fop->frame, heal->xl, heal->bad, EC_MINIMUM_ONE,
+                  ec_heal_writev_cbk, heal, heal->fd, vector, count,
+                  heal->offset, 0, iobref, NULL);
+        return 0;
+    }
+    if (op_ret < 0) {
+        gf_msg_debug(fop->xl->name, 0,
+                     "%s: read failed %s, failing "
+                     "to heal block at %" PRIu64,
+                     uuid_utoa(heal->fd->inode->gfid), strerror(op_errno),
+                     heal->offset);
+        heal->bad = 0;
+    }
+    heal->done = 1;
+    return 0;
+}
+
+/* Read heal->size bytes at heal->offset from heal->good to heal one block.
+ * No-op unless good and bad are both non-zero and ia_type is IA_IFREG. */
+void
+ec_heal_data_block(ec_heal_t *heal)
+{
+    ec_trace("DATA", heal->fop, "good=%lX, bad=%lX", heal->good, heal->bad);
+    if (!heal->good || !heal->bad || (heal->iatt.ia_type != IA_IFREG))
+        return;
+    ec_readv(heal->fop->frame, heal->xl, heal->good, EC_MINIMUM_MIN,
+             ec_heal_readv_cbk, heal, heal->fd, heal->size, heal->offset, 0,
+             NULL);
+}
+
+/* FOP: fheal */
+
+/* fd flavour of ec_heal(): heals the loc stored in fd's EC context, if any. */
+void
+ec_fheal(call_frame_t *frame, xlator_t *this, uintptr_t target,
+         uint32_t fop_flags, fop_fheal_cbk_t func, void *data, fd_t *fd,
+         int32_t partial, dict_t *xdata)
+{
+    ec_fd_t *fd_ctx = ec_fd_get(fd, this);
+    if (fd_ctx == NULL)
+        return; /* NOTE(review): 'func' is never called here - confirm */
+    gf_msg_trace("ec", 0, "FHEAL ctx: flags=%X, open=%" PRIXPTR,
+                 fd_ctx->flags, fd_ctx->open);
+    ec_heal(frame, this, target, fop_flags, func, data, &fd_ctx->loc,
+            partial, xdata);
+}
+
+/* Common heal code */
+/* Expand 'mask': array[i] receives bit i of mask (0 or 1), i < numsubvols. */
+void
+ec_mask_to_char_array(uintptr_t mask, unsigned char *array, int numsubvols)
+{
+    int idx = numsubvols;
+    while (idx-- > 0)
+        array[idx] = (unsigned char)((mask >> idx) & 1);
+}
+
+/* Build a mask whose bit i is set when array[i] is non-zero, for
+ * i in [0, numsubvols). A NULL array gives an empty mask (0). */
+uintptr_t
+ec_char_array_to_mask(unsigned char *array, int numsubvols)
+{
+    uintptr_t bits = 0;
+    int idx = 0;
+
+    if (array == NULL)
+        return 0;
+
+    for (idx = 0; idx < numsubvols; idx++)
+        bits |= (uintptr_t)((array[idx] != 0) ? (1ULL << idx) : 0);
+    return bits;
+}
+
+/* Choose the heal source among replies that are valid and did not fail
+ * (op_ret != -1): the index holding the highest EC_XATTR_VERSION[EC_DATA_TXN],
+ * or the first usable index when no version read exceeds 0. versions[i] and
+ * dirty[i] are filled from the reply xattrs when present. Usable indexes whose
+ * version equals the source's are flagged in sources[], the remaining usable
+ * ones in healed_sinks[]. Returns the source index, or -1 if none. */
+int
+ec_heal_entry_find_direction(ec_t *ec, default_args_cbk_t *replies,
+                             uint64_t *versions, uint64_t *dirty,
+                             unsigned char *sources,
+                             unsigned char *healed_sinks)
+{
+    uint64_t xattr[EC_VERSION_SIZE] = {0};
+    uint64_t best = 0;
+    int source = -1;
+    int i = 0;
+
+    for (i = 0; i < ec->nodes; i++) {
+        /* Skip replies that are missing or failed. */
+        if (!replies[i].valid || (replies[i].op_ret == -1))
+            continue;
+
+        /* The first usable reply is the fallback source. */
+        if (source == -1)
+            source = i;
+
+        if (ec_dict_get_array(replies[i].xdata, EC_XATTR_VERSION, xattr,
+                              EC_VERSION_SIZE) == 0) {
+            versions[i] = xattr[EC_DATA_TXN];
+            /* Only a strictly higher version replaces the source. */
+            if (versions[i] > best) {
+                best = versions[i];
+                source = i;
+            }
+        }
+
+        memset(xattr, 0, sizeof(xattr));
+        if (ec_dict_get_array(replies[i].xdata, EC_XATTR_DIRTY, xattr,
+                              EC_VERSION_SIZE) == 0)
+            dirty[i] = xattr[EC_DATA_TXN];
+    }
+
+    /* No usable reply at all. */
+    if (source < 0)
+        return source;
+
+    /* Classify every usable reply against the chosen source. */
+    for (i = 0; i < ec->nodes; i++) {
+        if (!replies[i].valid || (replies[i].op_ret == -1))
+            continue;
+
+        if (versions[i] == versions[source])
+            sources[i] = 1;
+        else
+            healed_sinks[i] = 1;
+    }
+
+    return source;
+}
+
+/*
+ * Bring the version (and, when possible, dirty) xattrs of the healed
+ * bricks in line with the source after a heal.
+ *
+ * For every brick flagged in sources[] or healed_sinks[] an xattrop
+ * request is built that adds (versions[source] - versions[i]) to slot
+ * 'type' of EC_XATTR_VERSION. When sources + healed_sinks cover all
+ * ec->nodes bricks, -dirty[i] is additionally added to slot 'type' of
+ * EC_XATTR_DIRTY so the dirty marker is cleared. Bricks whose deltas are
+ * all zero are not contacted. The requests are sent in parallel with
+ * GF_XATTROP_ADD_ARRAY64.
+ *
+ * Returns 0 on success, -ENOMEM on allocation/dict failures and
+ * -ENOTCONN when not every brick took part in the heal or when the
+ * xattrop failed on some of the contacted bricks.
+ */
+int
+ec_adjust_versions(call_frame_t *frame, ec_t *ec, ec_txn_t type, inode_t *inode,
+                   int source, unsigned char *sources,
+                   unsigned char *healed_sinks, uint64_t *versions,
+                   uint64_t *dirty)
+{
+    int i = 0;
+    int ret = 0;
+    int call_count = 0;
+    dict_t **xattr = NULL;
+    int op_ret = 0;
+    loc_t loc = {0};
+    gf_boolean_t erase_dirty = _gf_false;
+    uint64_t *versions_xattr = NULL;
+    uint64_t *dirty_xattr = NULL;
+    /* Sized like the xattr arrays it is memcmp()'d against below. */
+    uint64_t allzero[EC_VERSION_SIZE] = {0};
+    unsigned char *on = NULL;
+    unsigned char *output = NULL;
+    default_args_cbk_t *replies = NULL;
+
+    /* Allocate the required memory */
+    loc.inode = inode_ref(inode);
+    gf_uuid_copy(loc.gfid, inode->gfid);
+    on = alloca0(ec->nodes);
+    output = alloca0(ec->nodes);
+    EC_REPLIES_ALLOC(replies, ec->nodes);
+    xattr = GF_CALLOC(ec->nodes, sizeof(*xattr), gf_common_mt_pointer);
+    if (!xattr) {
+        op_ret = -ENOMEM;
+        goto out;
+    }
+    for (i = 0; i < ec->nodes; i++) {
+        xattr[i] = dict_new();
+        if (!xattr[i]) {
+            op_ret = -ENOMEM;
+            goto out;
+        }
+    }
+
+    /* dirty xattr represents if the file/dir needs heal. Unless all the
+     * copies are healed, don't erase it */
+    if (EC_COUNT(sources, ec->nodes) + EC_COUNT(healed_sinks, ec->nodes) ==
+        ec->nodes)
+        erase_dirty = _gf_true;
+    else
+        op_ret = -ENOTCONN;
+
+    /* Populate the xattr array */
+    for (i = 0; i < ec->nodes; i++) {
+        if (!sources[i] && !healed_sinks[i])
+            continue;
+        versions_xattr = GF_CALLOC(EC_VERSION_SIZE, sizeof(*versions_xattr),
+                                   gf_common_mt_pointer);
+        if (!versions_xattr) {
+            op_ret = -ENOMEM;
+            continue;
+        }
+
+        /* Delta that lifts brick i's version up to the source's. */
+        versions_xattr[type] = hton64(versions[source] - versions[i]);
+        /* NOTE(review): on dict_set_bin() failure the buffer is not
+         * released here - confirm who owns it in that case. */
+        ret = dict_set_bin(xattr[i], EC_XATTR_VERSION, versions_xattr,
+                           (sizeof(*versions_xattr) * EC_VERSION_SIZE));
+        if (ret < 0) {
+            op_ret = -ENOMEM;
+            continue;
+        }
+
+        if (erase_dirty) {
+            dirty_xattr = GF_CALLOC(EC_VERSION_SIZE, sizeof(*dirty_xattr),
+                                    gf_common_mt_pointer);
+            if (!dirty_xattr) {
+                op_ret = -ENOMEM;
+                continue;
+            }
+
+            /* Adding the negated counter brings dirty back to zero. */
+            dirty_xattr[type] = hton64(-dirty[i]);
+            ret = dict_set_bin(xattr[i], EC_XATTR_DIRTY, dirty_xattr,
+                               (sizeof(*dirty_xattr) * EC_VERSION_SIZE));
+            if (ret < 0) {
+                op_ret = -ENOMEM;
+                continue;
+            }
+        }
+
+        /* Nothing to add on this brick: skip the network round trip. */
+        if (memcmp(versions_xattr, allzero,
+                   (sizeof(*versions_xattr) * EC_VERSION_SIZE)) == 0) {
+            if (!erase_dirty) {
+                continue;
+            }
+
+            if (memcmp(dirty_xattr, allzero,
+                       (sizeof(*dirty_xattr) * EC_VERSION_SIZE)) == 0) {
+                continue;
+            }
+        }
+
+        on[i] = 1;
+        call_count++;
+    }
+
+    /* Update the bricks with xattr */
+    if (call_count) {
+        PARALLEL_FOP_ONLIST(ec->xl_list, on, ec->nodes, replies, frame,
+                            ec_wind_xattrop_parallel, &loc,
+                            GF_XATTROP_ADD_ARRAY64, xattr, NULL);
+        ret = cluster_fop_success_fill(replies, ec->nodes, output);
+        /* Only judge the xattrop when one was actually sent; outside this
+         * branch 'ret' still holds the last dict_set_bin() status and
+         * would otherwise turn an -ENOMEM into -ENOTCONN. */
+        if (ret < call_count)
+            op_ret = -ENOTCONN;
+    }
+
+out:
+    /* Cleanup */
+    if (xattr) {
+        for (i = 0; i < ec->nodes; i++) {
+            if (xattr[i])
+                dict_unref(xattr[i]);
+        }
+        GF_FREE(xattr);
+    }
+    cluster_replies_wipe(replies, ec->nodes);
+    loc_wipe(&loc);
+    return op_ret;
+}
+
+/*
+ * Work out the direction of a metadata heal from the per-brick replies.
+ *
+ * For every valid, successful reply the EC_METADATA_TXN slot of the
+ * EC_XATTR_VERSION and EC_XATTR_DIRTY arrays (when they can be read from
+ * the reply's xdata) is stored in versions[i] / dirty[i]. Bricks are then
+ * grouped: a later brick joins brick i's group when their iatt gfid, type,
+ * prot, uid and gid are equal and are_dicts_equal() (with
+ * ec_sh_key_match) reports their xdata as equal. Members of the largest
+ * group are flagged in sources[]; every other valid, successful brick is
+ * flagged in healed_sinks[].
+ *
+ * Returns the index of the brick flagged in sources[] with the highest
+ * version (the first member of the largest group when no such brick has
+ * a version greater than zero), or -EIO when the largest group has fewer
+ * than ec->fragments members.
+ */
+int
+ec_heal_metadata_find_direction(ec_t *ec, default_args_cbk_t *replies,
+                                uint64_t *versions, uint64_t *dirty,
+                                unsigned char *sources,
+                                unsigned char *healed_sinks)
+{
+    uint64_t xattr[EC_VERSION_SIZE] = {0};
+    uint64_t max_version = 0;
+    int same_count = 0;
+    int max_same_count = 0;
+    int same_source = -1;
+    int ret = 0;
+    int i = 0;
+    int j = 0;
+    int *groups = NULL;
+    struct iatt source_ia = {0};
+    struct iatt child_ia = {0};
+
+    /* groups[i] is the index of the brick that opened i's group; -1 means
+     * brick i has not been assigned to any group (yet). */
+    groups = alloca0(ec->nodes * sizeof(*groups));
+    for (i = 0; i < ec->nodes; i++)
+        groups[i] = -1;
+
+    for (i = 0; i < ec->nodes; i++) {
+        if (!replies[i].valid)
+            continue;
+        if (replies[i].op_ret < 0)
+            continue;
+        ret = ec_dict_get_array(replies[i].xdata, EC_XATTR_VERSION, xattr,
+                                EC_VERSION_SIZE);
+        if (ret == 0) {
+            versions[i] = xattr[EC_METADATA_TXN];
+        }
+
+        memset(xattr, 0, sizeof(xattr));
+        ret = ec_dict_get_array(replies[i].xdata, EC_XATTR_DIRTY, xattr,
+                                EC_VERSION_SIZE);
+        if (ret == 0) {
+            dirty[i] = xattr[EC_METADATA_TXN];
+        }
+        if (groups[i] >= 0) /*Already part of group*/
+            continue;
+        /* Brick i opens a new group; look for matching later bricks. */
+        groups[i] = i;
+        same_count = 1;
+        source_ia = replies[i].stat;
+        for (j = i + 1; j < ec->nodes; j++) {
+            if (!replies[j].valid || replies[j].op_ret < 0)
+                continue;
+            child_ia = replies[j].stat;
+            if (!IA_EQUAL(source_ia, child_ia, gfid) ||
+                !IA_EQUAL(source_ia, child_ia, type) ||
+                !IA_EQUAL(source_ia, child_ia, prot) ||
+                !IA_EQUAL(source_ia, child_ia, uid) ||
+                !IA_EQUAL(source_ia, child_ia, gid))
+                continue;
+            if (!are_dicts_equal(replies[i].xdata, replies[j].xdata,
+                                 ec_sh_key_match, NULL))
+                continue;
+            groups[j] = i;
+            same_count++;
+        }
+
+        /* Strict comparison: on a tie the earliest group is kept. */
+        if (max_same_count < same_count) {
+            max_same_count = same_count;
+            same_source = i;
+        }
+    }
+
+    /* The largest group must hold at least ec->fragments bricks. */
+    if (max_same_count < ec->fragments) {
+        ret = -EIO;
+        goto out;
+    }
+
+    for (i = 0; i < ec->nodes; i++) {
+        if (groups[i] == groups[same_source])
+            sources[i] = 1;
+        else if (replies[i].valid && replies[i].op_ret >= 0)
+            healed_sinks[i] = 1;
+    }
+    /* Among the sources prefer the one with the highest version. */
+    for (i = 0; i < ec->nodes; i++) {
+        if (sources[i] && (versions[i] > max_version)) {
+            same_source = i;
+            max_version = versions[i];
+        }
+    }
+    ret = same_source;
+out:
+    return ret;
+}
+
+/*
+ * Gather the replies needed for a metadata heal and select the source.
+ *
+ * Looks the inode up on the bricks flagged in locked_on (results go to the
+ * caller-provided replies[]), then issues a getxattr on the bricks where
+ * the lookup succeeded and swaps each lookup reply's xdata for the
+ * getxattr result before calling ec_heal_metadata_find_direction().
+ *
+ * Returns the source index on success, -ENOTCONN when the lookup result
+ * is not greater than ec->fragments, or -EIO when no source can be
+ * determined.
+ */
+int
+__ec_heal_metadata_prepare(call_frame_t *frame, ec_t *ec, inode_t *inode,
+                           unsigned char *locked_on,
+                           default_args_cbk_t *replies, uint64_t *versions,
+                           uint64_t *dirty, unsigned char *sources,
+                           unsigned char *healed_sinks)
+{
+    loc_t loc = {0};
+    unsigned char *output = NULL;
+    unsigned char *lookup_on = NULL;
+    int ret = 0;
+    int source = 0;
+    default_args_cbk_t *greplies = NULL;
+    int i = 0;
+    EC_REPLIES_ALLOC(greplies, ec->nodes);
+
+    loc.inode = inode_ref(inode);
+    gf_uuid_copy(loc.gfid, inode->gfid);
+    output = alloca0(ec->nodes);
+    lookup_on = alloca0(ec->nodes);
+    /* ret is compared with ec->fragments, so it is presumably the number
+     * of bricks on which the lookup succeeded - TODO confirm. */
+    ret = cluster_lookup(ec->xl_list, locked_on, ec->nodes, replies, output,
+                         frame, ec->xl, &loc, NULL);
+    if (ret <= ec->fragments) {
+        ret = -ENOTCONN;
+        goto out;
+    }
+
+    /* Remember where the lookup worked; output is reused by getxattr. */
+    memcpy(lookup_on, output, ec->nodes);
+    /*Use getxattr to get the filtered xattrs which filter internal xattrs*/
+    ret = cluster_getxattr(ec->xl_list, lookup_on, ec->nodes, greplies, output,
+                           frame, ec->xl, &loc, NULL, NULL);
+    for (i = 0; i < ec->nodes; i++) {
+        /* Lookup succeeded but getxattr did not: drop the brick. */
+        if (lookup_on[i] && !output[i]) {
+            replies[i].valid = 0;
+            continue;
+        }
+        /* Replace the lookup xdata by the getxattr dictionary. A reply
+         * that had no xdata is left untouched; a reply whose getxattr
+         * returned no dictionary ends up with xdata == NULL. */
+        if (replies[i].xdata) {
+            dict_unref(replies[i].xdata);
+            replies[i].xdata = NULL;
+            if (greplies[i].xattr)
+                replies[i].xdata = dict_ref(greplies[i].xattr);
+        }
+    }
+
+    source = ec_heal_metadata_find_direction(ec, replies, versions, dirty,
+                                             sources, healed_sinks);
+    if (source < 0) {
+        ret = -EIO;
+        goto out;
+    }
+    ret = source;
+out:
+    /* replies[] belongs to the caller; only the local getxattr replies
+     * are wiped here. */
+    cluster_replies_wipe(greplies, ec->nodes);
+    loc_wipe(&loc);
+    return ret;
+}
+
+/* Metadata heal */
+/*
+ * Remove from every non-source participant the xattrs left in its reply
+ * dictionary after ec_heal_xattr_clean() has processed it against the
+ * source's dictionary.
+ *
+ * A brick on which the cleaning callback fails is dropped from both
+ * sources[] and healed_sinks[]. A brick flagged as source that still has
+ * xattrs to remove is demoted to healed sink. A brick on which the
+ * removexattr fails is dropped from healed_sinks[].
+ *
+ * Returns 0 when at least one healed sink remains, -ENOTCONN otherwise.
+ */
+int
+__ec_removexattr_sinks(call_frame_t *frame, ec_t *ec, inode_t *inode,
+                       int source, unsigned char *sources,
+                       unsigned char *healed_sinks, default_args_cbk_t *replies)
+{
+    loc_t target = {0};
+    dict_t *leftover = NULL;
+    int idx;
+
+    target.inode = inode_ref(inode);
+    gf_uuid_copy(target.gfid, inode->gfid);
+
+    for (idx = 0; idx < ec->nodes; idx++) {
+        if (idx == source || (!sources[idx] && !healed_sinks[idx]))
+            continue;
+
+        leftover = replies[idx].xdata;
+        if (dict_foreach(leftover, ec_heal_xattr_clean,
+                         replies[source].xdata) < 0) {
+            sources[idx] = 0;
+            healed_sinks[idx] = 0;
+            continue;
+        }
+
+        /* Nothing left to remove on this brick. */
+        if (leftover->count == 0)
+            continue;
+
+        if (sources[idx]) {
+            /* This can happen if setxattr/removexattr succeeds on
+             * the bricks but fails to update the version. This
+             * will make sure that the xattrs are made equal after
+             * heal*/
+            sources[idx] = 0;
+            healed_sinks[idx] = 1;
+        }
+
+        if (syncop_removexattr(ec->xl_list[idx], &target, "", leftover,
+                               NULL) < 0)
+            healed_sinks[idx] = 0;
+    }
+
+    loc_wipe(&target);
+
+    return (EC_COUNT(healed_sinks, ec->nodes) == 0) ? -ENOTCONN : 0;
+}
+
+/*
+ * Heal the metadata of 'inode' on the bricks flagged in locked_on.
+ * (The name suggests the caller already holds the locks - see
+ * ec_heal_metadata(); confirm for other callers.)
+ *
+ * Steps: determine source and sinks, copy mode/uid/gid from the source to
+ * the sinks, remove stale xattrs from the sinks, set the source's xattrs
+ * (minus the keys matched by ec_ignorable_key_match) on the sinks and
+ * finally adjust the version/dirty xattrs. sources[] and healed_sinks[]
+ * are outputs and shrink as individual steps fail on a brick.
+ *
+ * Returns the result of ec_adjust_versions() when that step is reached,
+ * otherwise a negative errno (-EIO, -ENOTCONN, -ENOMEM or the error from
+ * __ec_removexattr_sinks()).
+ */
+int
+__ec_heal_metadata(call_frame_t *frame, ec_t *ec, inode_t *inode,
+                   unsigned char *locked_on, unsigned char *sources,
+                   unsigned char *healed_sinks)
+{
+    loc_t loc = {0};
+    int ret = 0;
+    int source = 0;
+    default_args_cbk_t *replies = NULL;
+    default_args_cbk_t *sreplies = NULL;
+    uint64_t *versions = NULL;
+    uint64_t *dirty = NULL;
+    unsigned char *output = NULL;
+    dict_t *source_dict = NULL;
+    struct iatt source_buf = {0};
+
+    EC_REPLIES_ALLOC(replies, ec->nodes);
+    EC_REPLIES_ALLOC(sreplies, ec->nodes);
+
+    loc.inode = inode_ref(inode);
+    gf_uuid_copy(loc.gfid, inode->gfid);
+    output = alloca0(ec->nodes);
+    versions = alloca0(ec->nodes * sizeof(*versions));
+    dirty = alloca0(ec->nodes * sizeof(*dirty));
+    source = __ec_heal_metadata_prepare(frame, ec, inode, locked_on, replies,
+                                        versions, dirty, sources, healed_sinks);
+    if (source < 0) {
+        ret = -EIO;
+        goto out;
+    }
+
+    /* Every brick is a source, or there is nothing to heal onto: only
+     * the version/dirty xattrs may need adjusting. */
+    if ((EC_COUNT(sources, ec->nodes) == ec->nodes) ||
+        (EC_COUNT(healed_sinks, ec->nodes) == 0)) {
+        ret = 0;
+        goto erase_dirty;
+    }
+
+    /* Copy mode, uid and gid of the source onto the sinks. */
+    source_buf = replies[source].stat;
+    ret = cluster_setattr(ec->xl_list, healed_sinks, ec->nodes, sreplies,
+                          output, frame, ec->xl, &loc, &source_buf,
+                          GF_SET_ATTR_MODE | GF_SET_ATTR_UID | GF_SET_ATTR_GID,
+                          NULL);
+    /*In case the operation fails on some of the subvols*/
+    memcpy(healed_sinks, output, ec->nodes);
+    if (EC_COUNT(healed_sinks, ec->nodes) == 0) {
+        ret = -ENOTCONN;
+        goto out;
+    }
+
+    ret = __ec_removexattr_sinks(frame, ec, inode, source, sources,
+                                 healed_sinks, replies);
+    if (ret < 0)
+        goto out;
+
+    /* Keep our own reference: replies[] is handed to cluster_setxattr()
+     * below as its reply buffer. */
+    source_dict = dict_ref(replies[source].xdata);
+    /* Drop the keys ec_ignorable_key_match selects before replaying the
+     * source's xattrs on the sinks. */
+    if (dict_foreach_match(source_dict, ec_ignorable_key_match, NULL,
+                           dict_remove_foreach_fn, NULL) == -1) {
+        ret = -ENOMEM;
+        goto out;
+    }
+
+    ret = cluster_setxattr(ec->xl_list, healed_sinks, ec->nodes, replies,
+                           output, frame, ec->xl, &loc, source_dict, 0, NULL);
+
+    /* Only bricks on which the setxattr succeeded stay healed sinks. */
+    EC_INTERSECT(healed_sinks, healed_sinks, output, ec->nodes);
+    if (EC_COUNT(healed_sinks, ec->nodes) == 0) {
+        ret = -ENOTCONN;
+        goto out;
+    }
+
+erase_dirty:
+    ret = ec_adjust_versions(frame, ec, EC_METADATA_TXN, inode, source, sources,
+                             healed_sinks, versions, dirty);
+out:
+    if (source_dict)
+        dict_unref(source_dict);
+
+    loc_wipe(&loc);
+    cluster_replies_wipe(replies, ec->nodes);
+    cluster_replies_wipe(sreplies, ec->nodes);
+    return ret;
+}
+
+/*
+ * Lock 'inode' on all bricks that are up and run the metadata heal.
+ *
+ * The heal only runs when the inodelk result is greater than
+ * ec->fragments; otherwise -ENOTCONN is returned. The locks are released
+ * on every path. Returns the result of __ec_heal_metadata() or
+ * -ENOTCONN.
+ */
+int
+ec_heal_metadata(call_frame_t *frame, ec_t *ec, inode_t *inode,
+                 unsigned char *sources, unsigned char *healed_sinks)
+{
+    default_args_cbk_t *lk_replies = NULL;
+    unsigned char *up = NULL;
+    unsigned char *locked = NULL;
+    unsigned char *unlocked = NULL;
+    int status = 0;
+
+    EC_REPLIES_ALLOC(lk_replies, ec->nodes);
+    locked = alloca0(ec->nodes);
+    unlocked = alloca0(ec->nodes);
+    up = alloca0(ec->nodes);
+    ec_mask_to_char_array(ec->xl_up, up, ec->nodes);
+
+    status = cluster_inodelk(ec->xl_list, up, ec->nodes, lk_replies, locked,
+                             frame, ec->xl, ec->xl->name, inode, 0, 0);
+    if (status > ec->fragments) {
+        status = __ec_heal_metadata(frame, ec, inode, locked, sources,
+                                    healed_sinks);
+    } else {
+        gf_msg_debug(ec->xl->name, 0,
+                     "%s: Skipping heal "
+                     "as only %d number of subvolumes could "
+                     "be locked",
+                     uuid_utoa(inode->gfid), status);
+        status = -ENOTCONN;
+    }
+
+    /* Release whatever was locked, whether or not the heal ran. */
+    cluster_uninodelk(ec->xl_list, locked, ec->nodes, lk_replies, unlocked,
+                      frame, ec->xl, ec->xl->name, inode, 0, 0);
+    cluster_replies_wipe(lk_replies, ec->nodes);
+
+    return status;
+}
+
+/*entry heal*/
+/*
+ * Look 'inode' up on the bricks flagged in locked_on, asking for the
+ * version and dirty xattrs, and derive the entry-heal direction from the
+ * replies via ec_heal_entry_find_direction().
+ *
+ * Returns the source index on success, -ENOMEM when the request
+ * dictionary cannot be built, -ENOTCONN when the lookup result is not
+ * greater than ec->fragments, or -EIO when no source is found.
+ */
+int
+__ec_heal_entry_prepare(call_frame_t *frame, ec_t *ec, inode_t *inode,
+                        unsigned char *locked_on, uint64_t *versions,
+                        uint64_t *dirty, unsigned char *sources,
+                        unsigned char *healed_sinks)
+{
+    default_args_cbk_t *lookup_replies = NULL;
+    unsigned char *succeeded = NULL;
+    dict_t *request = NULL;
+    loc_t target = {0};
+    int result = 0;
+
+    EC_REPLIES_ALLOC(lookup_replies, ec->nodes);
+
+    target.inode = inode_ref(inode);
+    gf_uuid_copy(target.gfid, inode->gfid);
+
+    request = dict_new();
+    if (request == NULL) {
+        result = -ENOMEM;
+        goto cleanup;
+    }
+
+    if (dict_set_uint64(request, EC_XATTR_VERSION, 0) != 0 ||
+        dict_set_uint64(request, EC_XATTR_DIRTY, 0) != 0) {
+        result = -ENOMEM;
+        goto cleanup;
+    }
+
+    succeeded = alloca0(ec->nodes);
+    result = cluster_lookup(ec->xl_list, locked_on, ec->nodes, lookup_replies,
+                            succeeded, frame, ec->xl, &target, request);
+    if (result <= ec->fragments) {
+        result = -ENOTCONN;
+        goto cleanup;
+    }
+
+    result = ec_heal_entry_find_direction(ec, lookup_replies, versions, dirty,
+                                          sources, healed_sinks);
+    if (result < 0)
+        result = -EIO;
+
+cleanup:
+    if (request != NULL)
+        dict_unref(request);
+    loc_wipe(&target);
+    cluster_replies_wipe(lookup_replies, ec->nodes);
+
+    return result;
+}
+/*
+ * Mark the entry identified by ia->ia_gfid as dirty on the bricks flagged
+ * in 'on' by adding 1 to both slots of EC_XATTR_DIRTY through an xattrop
+ * (GF_XATTROP_ADD_ARRAY64). For symlinks the EC_DATA_TXN slot is left
+ * untouched because they have no data to heal.
+ *
+ * Returns the cluster_xattrop() result when it is at least ec->fragments,
+ * -ENOTCONN when it is lower, -ENOMEM when the request dictionary cannot
+ * be created, or the ec_dict_set_array() error.
+ */
+int32_t
+ec_set_new_entry_dirty(ec_t *ec, loc_t *loc, struct iatt *ia,
+                       call_frame_t *frame, xlator_t *this, unsigned char *on)
+{
+    uint64_t marks[EC_VERSION_SIZE] = {1, 1};
+    default_args_cbk_t *xattrop_replies = NULL;
+    unsigned char *done = NULL;
+    dict_t *request = NULL;
+    loc_t target = {0};
+    int32_t status = -1;
+
+    /*Symlinks don't have any data to be healed*/
+    if (ia->ia_type == IA_IFLNK)
+        marks[EC_DATA_TXN] = 0;
+
+    target.inode = inode_ref(loc->inode);
+    gf_uuid_copy(target.gfid, ia->ia_gfid);
+    EC_REPLIES_ALLOC(xattrop_replies, ec->nodes);
+    done = alloca0(ec->nodes);
+
+    request = dict_new();
+    if (request == NULL) {
+        status = -ENOMEM;
+        goto cleanup;
+    }
+
+    status = ec_dict_set_array(request, EC_XATTR_DIRTY, marks,
+                               EC_VERSION_SIZE);
+    if (status != 0)
+        goto cleanup;
+
+    status = cluster_xattrop(ec->xl_list, on, ec->nodes, xattrop_replies, done,
+                             frame, ec->xl, &target, GF_XATTROP_ADD_ARRAY64,
+                             request, NULL);
+    if (status < ec->fragments)
+        status = -ENOTCONN;
+
+cleanup:
+    if (request != NULL)
+        dict_unref(request);
+    cluster_replies_wipe(xattrop_replies, ec->nodes);
+    loc_wipe(&target);
+
+    return status;
+}
+
+/*Name heal*/
+/*
+ * dict_foreach() callback used by ec_delete_stale_names().
+ *
+ * 'key' is a gfid in string form and 'd' holds a per-brick byte array
+ * ('same') flagging the bricks associated with that gfid; 'data' is the
+ * struct ec_name_data describing the name being healed.
+ *
+ * When fewer than ec->fragments bricks are flagged, the name is looked up
+ * again on the participants. If more than ec->redundancy bricks answer
+ * ENOENT/ESTALE or report a different gfid, the name is removed (rmdir
+ * for directories, unlink otherwise) from the flagged bricks and the key
+ * is deleted from gfid_db.
+ *
+ * Returns 0 when nothing had to be done or after the removal was
+ * attempted, -ENOMEM or -ENOTCONN on failure.
+ */
+int
+ec_delete_stale_name(dict_t *gfid_db, char *key, data_t *d, void *data)
+{
+    struct ec_name_data *name_data = data;
+    struct iatt *ia = NULL;
+    ec_t *ec = NULL;
+    loc_t loc = {0};
+    unsigned char *same = data_to_bin(d);
+    default_args_cbk_t *replies = NULL;
+    unsigned char *output = NULL;
+    int ret = 0;
+    int estale_count = 0;
+    int i = 0;
+    call_frame_t *frame = name_data->frame;
+    uuid_t gfid;
+
+    ec = name_data->frame->this->private;
+    EC_REPLIES_ALLOC(replies, ec->nodes);
+    /* Enough bricks agree on this gfid: the name is not stale. */
+    if (EC_COUNT(same, ec->nodes) >= ec->fragments) {
+        ret = 0;
+        goto out;
+    }
+
+    loc.parent = inode_ref(name_data->parent);
+    loc.inode = inode_new(name_data->parent->table);
+    if (!loc.inode) {
+        ret = -ENOMEM;
+        goto out;
+    }
+
+    /* The lookup is done by parent + name; the parsed gfid is only used
+     * to compare against the gfid returned by each brick. */
+    gf_uuid_parse(key, gfid);
+    gf_uuid_copy(loc.pargfid, name_data->parent->gfid);
+    loc.name = name_data->name;
+    output = alloca0(ec->nodes);
+    ret = cluster_lookup(ec->xl_list, name_data->participants, ec->nodes,
+                         replies, output, name_data->frame, ec->xl, &loc, NULL);
+
+    for (i = 0; i < ec->nodes; i++) {
+        if (!replies[i].valid)
+            continue;
+        if (replies[i].op_ret == -1) {
+            /* Missing name counts as stale; any other error removes the
+             * brick from the participants. */
+            if (replies[i].op_errno == ESTALE || replies[i].op_errno == ENOENT)
+                estale_count++;
+            else
+                name_data->participants[i] = 0;
+        } else if (gf_uuid_compare(gfid, replies[i].stat.ia_gfid)) {
+            estale_count++;
+            gf_msg_debug(ec->xl->name, 0, "%s/%s: different gfid as %s",
+                         uuid_utoa(name_data->parent->gfid), name_data->name,
+                         key);
+        }
+    }
+
+    if (estale_count <= ec->redundancy) {
+        /* We have at least ec->fragments number of fragments, so the
+         * file is recoverable, so don't delete it*/
+
+        /* Please note that the lookup call above could fail with
+         * ENOTCONN on all subvolumes and still this branch will be
+         * true, but in those cases conservatively we decide to not
+         * delete the file until we are sure*/
+        ret = 0;
+        goto out;
+    }
+
+    /*No way to recover, delete the name*/
+    /* Rebuild loc without an inode: only parent and name are set. */
+    loc_wipe(&loc);
+    loc.parent = inode_ref(name_data->parent);
+    gf_uuid_copy(loc.pargfid, loc.parent->gfid);
+    loc.name = name_data->name;
+    /* Take the file type from the first flagged brick that answered the
+     * lookup successfully. */
+    for (i = 0; i < ec->nodes; i++) {
+        if (same[i] && replies[i].valid && (replies[i].op_ret == 0)) {
+            ia = &replies[i].stat;
+            break;
+        }
+    }
+
+    if (!ia) {
+        ret = -ENOTCONN;
+        goto out;
+    }
+
+    /* 'ia' points into replies[], which is also passed as the reply
+     * buffer of the calls below; it is not read after them. */
+    if (IA_ISDIR(ia->ia_type)) {
+        ret = cluster_rmdir(ec->xl_list, same, ec->nodes, replies, output,
+                            frame, ec->xl, &loc, 1, NULL);
+        gf_msg_debug(ec->xl->name, 0,
+                     "cluster rmdir succeeded on %d "
+                     "nodes",
+                     ret);
+    } else {
+        ret = cluster_unlink(ec->xl_list, same, ec->nodes, replies, output,
+                             frame, ec->xl, &loc, 0, NULL);
+        gf_msg_debug(ec->xl->name, 0,
+                     "cluster unlink succeeded on %d "
+                     "nodes",
+                     ret);
+    }
+
+    /* Bricks where the removal worked now lack the name (enoent); flagged
+     * bricks where it failed stop participating. */
+    for (i = 0; i < ec->nodes; i++) {
+        if (output[i]) {
+            same[i] = 0;
+            name_data->enoent[i] = 1;
+        } else {
+            /*op failed*/
+            if (same[i])
+                name_data->participants[i] = 0;
+        }
+    }
+    ret = 0;
+    /*This will help in making decisions about creating names*/
+    /* NOTE(review): the key is deleted while gfid_db is being walked by
+     * dict_foreach() - confirm that dict_foreach() tolerates removal of
+     * the entry currently being visited. */
+    dict_del(gfid_db, key);
+out:
+    if (ret < 0) {
+        gf_msg_debug(ec->xl->name, 0, "%s/%s: heal failed %s",
+                     uuid_utoa(name_data->parent->gfid), name_data->name,
+                     strerror(-ret));
+    }
+    cluster_replies_wipe(replies, ec->nodes);
+    loc_wipe(&loc);
+    return ret;
+}
+
+/*
+ * Run ec_delete_stale_name() on every gfid recorded in gfid_db for the
+ * entry 'name' under 'parent'. The arguments are bundled in a struct
+ * ec_name_data handed to the callback. Returns what dict_foreach()
+ * returns.
+ */
+int
+ec_delete_stale_names(call_frame_t *frame, ec_t *ec, inode_t *parent,
+                      char *name, default_args_cbk_t *replies, dict_t *gfid_db,
+                      unsigned char *enoent, unsigned char *gfidless,
+                      unsigned char *participants)
+{
+    /* Members not listed here are zero-initialised. */
+    struct ec_name_data ctx = {
+        .frame = frame,
+        .parent = parent,
+        .name = name,
+        .replies = replies,
+        .enoent = enoent,
+        .gfidless = gfidless,
+        .participants = participants,
+    };
+
+    return dict_foreach(gfid_db, ec_delete_stale_name, &ctx);
+}
+
+/* dict_foreach() callback: stores the binary value of the visited entry
+ * in the 'same' member of the struct ec_name_data passed as 'data'.
+ * Always returns 0. */
+int
+_assign_same(dict_t *dict, char *key, data_t *value, void *data)
+{
+    ((struct ec_name_data *)data)->same = data_to_bin(value);
+
+    return 0;
+}
+
+/*
+ * Create the entry 'name' under 'parent' on the bricks flagged in
+ * enoent[], using the type, mode and gfid reported by a brick that has
+ * the entry.
+ *
+ * gfid_db must contain exactly one gfid; its value is the per-brick
+ * 'same' array and the first brick flagged there provides the iatt from
+ * lookup_replies[]. Depending on the file type the entry is created with
+ * mkdir, link/symlink or mknod. Bricks flagged in enoent[] that are not
+ * flagged in the final output[] are cleared from participants[].
+ *
+ * Returns 0 once the creation has been attempted, or a negative errno
+ * (-EINVAL, -ENOMEM, -ENOTCONN or a dict error) when the preparation
+ * fails.
+ */
+int
+ec_create_name(call_frame_t *frame, ec_t *ec, inode_t *parent, char *name,
+               default_args_cbk_t *lookup_replies, dict_t *gfid_db,
+               unsigned char *enoent, unsigned char *participants)
+{
+    int ret = 0;
+    int i = 0;
+    struct ec_name_data name_data = {0};
+    struct iatt *ia = NULL;
+    unsigned char *output = 0;
+    unsigned char *output1 = 0;
+    unsigned char *on = NULL;
+    default_args_cbk_t *replies = NULL;
+    loc_t loc = {0};
+    loc_t srcloc = {0};
+    unsigned char *link = NULL;
+    unsigned char *create = NULL;
+    dict_t *xdata = NULL;
+    char *linkname = NULL;
+    ec_config_t config;
+
+    /* There should be just one gfid key */
+    EC_REPLIES_ALLOC(replies, ec->nodes);
+    if (gfid_db->count != 1) {
+        ret = -EINVAL;
+        goto out;
+    }
+
+    /* Fetch the 'same' array of the single gfid into name_data.same. */
+    ret = dict_foreach(gfid_db, _assign_same, &name_data);
+    if (ret < 0)
+        goto out;
+    /*There should at least be one valid success reply with gfid*/
+    for (i = 0; i < ec->nodes; i++)
+        if (name_data.same[i])
+            break;
+
+    if (i == ec->nodes) {
+        ret = -EINVAL;
+        goto out;
+    }
+
+    /* loc addresses the new name (parent + name); srcloc addresses the
+     * existing object by gfid and shares loc's inode. */
+    ia = &lookup_replies[i].stat;
+    xdata = dict_new();
+    loc.parent = inode_ref(parent);
+    gf_uuid_copy(loc.pargfid, parent->gfid);
+    loc.inode = inode_new(parent->table);
+    if (loc.inode)
+        srcloc.inode = inode_ref(loc.inode);
+    gf_uuid_copy(srcloc.gfid, ia->ia_gfid);
+    if (!loc.inode || !xdata ||
+        dict_set_static_bin(xdata, "gfid-req", ia->ia_gfid,
+                            sizeof(ia->ia_gfid))) {
+        ret = -ENOMEM;
+        goto out;
+    }
+    loc.name = name;
+    link = alloca0(ec->nodes);
+    create = alloca0(ec->nodes);
+    on = alloca0(ec->nodes);
+    output = alloca0(ec->nodes);
+    output1 = alloca0(ec->nodes);
+
+    /* on[]: bricks whose original lookup of the name succeeded. */
+    for (i = 0; i < ec->nodes; i++) {
+        if (!lookup_replies[i].valid)
+            continue;
+        if (lookup_replies[i].op_ret)
+            continue;
+        on[i] = 1;
+    }
+    switch (ia->ia_type) {
+        case IA_IFDIR:
+            /* The result of ec_set_new_entry_dirty() is not checked. */
+            ec_set_new_entry_dirty(ec, &loc, ia, frame, ec->xl, on);
+            (void)cluster_mkdir(
+                ec->xl_list, enoent, ec->nodes, replies, output, frame, ec->xl,
+                &loc, st_mode_from_ia(ia->ia_prot, ia->ia_type), 0, xdata);
+            break;
+
+        case IA_IFLNK:
+            /*Check for hard links and create/link*/
+            /* Look the gfid up on the bricks missing the name: where it
+             * exists a hard link is made, where it is reported as
+             * ENOENT/ESTALE the symlink is created from scratch. */
+            ret = cluster_lookup(ec->xl_list, enoent, ec->nodes, replies,
+                                 output, frame, ec->xl, &srcloc, NULL);
+            for (i = 0; i < ec->nodes; i++) {
+                if (output[i]) {
+                    link[i] = 1;
+                } else {
+                    if (replies[i].op_errno == ENOENT ||
+                        replies[i].op_errno == ESTALE) {
+                        create[i] = 1;
+                    }
+                }
+            }
+
+            if (EC_COUNT(link, ec->nodes)) {
+                cluster_link(ec->xl_list, link, ec->nodes, replies, output1,
+                             frame, ec->xl, &srcloc, &loc, NULL);
+            }
+
+            if (EC_COUNT(create, ec->nodes)) {
+                /* Read the link target from the bricks that have it. */
+                cluster_readlink(ec->xl_list, name_data.same, ec->nodes,
+                                 replies, output, frame, ec->xl, &srcloc, 4096,
+                                 NULL);
+                if (EC_COUNT(output, ec->nodes) == 0) {
+                    ret = -ENOTCONN;
+                    goto out;
+                }
+
+                for (i = 0; i < ec->nodes; i++) {
+                    if (output[i])
+                        break;
+                }
+                /* Copy the target: replies[] is reused by the symlink
+                 * call below. */
+                linkname = alloca0(strlen(replies[i].buf) + 1);
+                strcpy(linkname, replies[i].buf);
+                ec_set_new_entry_dirty(ec, &loc, ia, frame, ec->xl, on);
+                cluster_symlink(ec->xl_list, create, ec->nodes, replies, output,
+                                frame, ec->xl, linkname, &loc, 0, xdata);
+            }
+            /* Merge the bricks where the hard link succeeded into output.
+             * NOTE(review): when no symlink had to be created, output
+             * still holds the gfid lookup result from above - confirm
+             * that this is intended. */
+            for (i = 0; i < ec->nodes; i++)
+                if (output1[i])
+                    output[i] = 1;
+            break;
+        case IA_IFREG:
+            ec_set_new_entry_dirty(ec, &loc, ia, frame, ec->xl, on);
+            /* Regular files additionally carry the EC configuration
+             * xattr in the creation request. */
+            config.version = EC_CONFIG_VERSION;
+            config.algorithm = EC_CONFIG_ALGORITHM;
+            config.gf_word_size = EC_GF_BITS;
+            config.bricks = ec->nodes;
+            config.redundancy = ec->redundancy;
+            config.chunk_size = EC_METHOD_CHUNK_SIZE;
+
+            ret = ec_dict_set_config(xdata, EC_XATTR_CONFIG, &config);
+            if (ret != 0) {
+                goto out;
+            }
+
+            /* Fall through */
+
+        default:
+            /* Regular files and every remaining type are created with
+             * mknod, flagged with GLUSTERFS_INTERNAL_FOP_KEY. */
+            ret = dict_set_int32(xdata, GLUSTERFS_INTERNAL_FOP_KEY, 1);
+            if (ret)
+                goto out;
+            ret = cluster_mknod(
+                ec->xl_list, enoent, ec->nodes, replies, output, frame, ec->xl,
+                &loc, st_mode_from_ia(ia->ia_prot, ia->ia_type),
+                makedev(ia_major(ia->ia_rdev), ia_minor(ia->ia_rdev)), 0,
+                xdata);
+            break;
+    }
+
+    /* A brick that needed the name but did not get it stops
+     * participating. */
+    for (i = 0; i < ec->nodes; i++) {
+        if (enoent[i] && !output[i])
+            participants[i] = 0;
+    }
+
+    /* Per-brick failures are reported through participants[], not
+     * through the return value. */
+    ret = 0;
+out:
+    if (ret < 0)
+        gf_msg_debug(ec->xl->name, 0, "%s/%s: heal failed %s",
+                     uuid_utoa(parent->gfid), name, strerror(-ret));
+    cluster_replies_wipe(replies, ec->nodes);
+    loc_wipe(&loc);
+    loc_wipe(&srcloc);
+    if (xdata)
+        dict_unref(xdata);
+    return ret;
+}
+
+/*
+ * Heal a single directory entry 'name' under 'parent' on the bricks
+ * flagged in participants[].
+ *
+ * The name is looked up on every participant and the answers are sorted
+ * into: enoent[] (name missing), gfidless[] (entry without gfid that is a
+ * directory or has size 0) and gfid_db, a dictionary mapping each gfid
+ * seen to a per-brick byte array of the bricks reporting it. Stale names
+ * are then removed with ec_delete_stale_names(); if exactly one gfid
+ * remains, the name is created on the participants that miss it.
+ * participants[] is updated in place: bricks that cannot take part are
+ * cleared.
+ *
+ * Returns 1 when ec_create_name() succeeded (a new entry exists and needs
+ * healing), a negative errno on failure, and otherwise the value left in
+ * 'ret' by the last step executed.
+ */
+int
+__ec_heal_name(call_frame_t *frame, ec_t *ec, inode_t *parent, char *name,
+               unsigned char *participants)
+{
+    unsigned char *output = NULL;
+    unsigned char *enoent = NULL;
+    default_args_cbk_t *replies = NULL;
+    dict_t *xdata = NULL;
+    dict_t *gfid_db = NULL;
+    int ret = 0;
+    loc_t loc = {0};
+    int i = 0;
+    struct iatt *ia = NULL;
+    char gfid[64] = {0};
+    unsigned char *same = NULL;
+    unsigned char *gfidless = NULL;
+
+    EC_REPLIES_ALLOC(replies, ec->nodes);
+    loc.parent = inode_ref(parent);
+    loc.inode = inode_new(parent->table);
+    gf_uuid_copy(loc.pargfid, parent->gfid);
+    loc.name = name;
+    xdata = dict_new();
+    gfid_db = dict_new();
+    if (!xdata || !gfid_db || !loc.inode) {
+        ret = -ENOMEM;
+        goto out;
+    }
+
+    ret = dict_set_int32(xdata, GF_GFIDLESS_LOOKUP, 1);
+    if (ret) {
+        ret = -ENOMEM;
+        goto out;
+    }
+
+    output = alloca0(ec->nodes);
+    gfidless = alloca0(ec->nodes);
+    enoent = alloca0(ec->nodes);
+    /* NOTE(review): xdata was prepared with GF_GFIDLESS_LOOKUP above but
+     * NULL is passed as the lookup's xdata - confirm this is intended. */
+    ret = cluster_lookup(ec->xl_list, participants, ec->nodes, replies, output,
+                         frame, ec->xl, &loc, NULL);
+    for (i = 0; i < ec->nodes; i++) {
+        if (!replies[i].valid)
+            continue;
+
+        if (replies[i].op_ret == -1) {
+            /*If ESTALE comes here, that means parent dir is not
+             * present, nothing to do there, so reset participants
+             * for that brick*/
+            if (replies[i].op_errno == ENOENT)
+                enoent[i] = 1;
+            else
+                participants[i] = 0;
+            continue;
+        }
+        ia = &replies[i].stat;
+        if (gf_uuid_is_null(ia->ia_gfid)) {
+            /* Entries without gfid only take part when they are
+             * directories or empty. */
+            if (IA_ISDIR(ia->ia_type) || ia->ia_size == 0)
+                gfidless[i] = 1;
+            else
+                participants[i] = 0;
+        } else {
+            /* Flag brick i in the byte array stored under its gfid,
+             * creating the array on first sight of that gfid. The array
+             * is alloca()'d and stored as a static bin, so it must not
+             * outlive this function; gfid_db is released before
+             * returning. */
+            uuid_utoa_r(ia->ia_gfid, gfid);
+            ret = dict_get_bin(gfid_db, gfid, (void **)&same);
+            if (ret < 0) {
+                same = alloca0(ec->nodes);
+            }
+            same[i] = 1;
+            if (ret < 0) {
+                ret = dict_set_static_bin(gfid_db, gfid, same, ec->nodes);
+            }
+            if (ret < 0)
+                goto out;
+        }
+    }
+
+    ret = ec_delete_stale_names(frame, ec, parent, name, replies, gfid_db,
+                                enoent, gfidless, participants);
+
+    if (gfid_db->count == 0) {
+        /* All entries seem to be stale entries and deleted,
+         * nothing more to do.*/
+        goto out;
+    }
+
+    /* More than one gfid survived for the same name: give up and clear
+     * all participants. */
+    if (gfid_db->count > 1) {
+        gf_msg(ec->xl->name, GF_LOG_INFO, 0, EC_MSG_HEAL_FAIL,
+               "%s/%s: Not able to heal", uuid_utoa(parent->gfid), name);
+        memset(participants, 0, ec->nodes);
+        goto out;
+    }
+
+    /* Only create the name on bricks that still participate. */
+    EC_INTERSECT(enoent, enoent, participants, ec->nodes);
+    if (EC_COUNT(enoent, ec->nodes) == 0) {
+        ret = 0;
+        goto out;
+    }
+
+    ret = ec_create_name(frame, ec, parent, name, replies, gfid_db, enoent,
+                         participants);
+    if (ret >= 0) {
+        /* If ec_create_name() succeeded we return 1 to indicate that a new
+         * file has been created and it will need to be healed. */
+        ret = 1;
+    }
+out:
+    cluster_replies_wipe(replies, ec->nodes);
+    loc_wipe(&loc);
+    if (xdata)
+        dict_unref(xdata);
+    if (gfid_db)
+        dict_unref(gfid_db);
+    return ret;
+}
+
+/*
+ * Lock 'parent' on the bricks flagged in participants[] and heal the
+ * entry 'name' under it.
+ *
+ * The heal only runs when the inodelk result is greater than
+ * ec->fragments; participants[] is then narrowed to the bricks that were
+ * locked and further updated by __ec_heal_name(). The locks are released
+ * afterwards.
+ *
+ * Returns the result of __ec_heal_name(), -ENOTCONN when too few bricks
+ * could be locked, or -ENOMEM when no inode could be allocated.
+ */
+int
+ec_heal_name(call_frame_t *frame, ec_t *ec, inode_t *parent, char *name,
+             unsigned char *participants)
+{
+    int ret = 0;
+    default_args_cbk_t *replies = NULL;
+    unsigned char *output = NULL;
+    unsigned char *locked_on = NULL;
+    loc_t loc = {0};
+
+    /* Allocate the reply array before anything can jump to 'out', so the
+     * cleanup never sees a NULL array (same order as the other heal
+     * helpers in this file). */
+    EC_REPLIES_ALLOC(replies, ec->nodes);
+
+    loc.parent = inode_ref(parent);
+    loc.name = name;
+    loc.inode = inode_new(parent->table);
+    if (!loc.inode) {
+        ret = -ENOMEM;
+        goto out;
+    }
+
+    output = alloca0(ec->nodes);
+    locked_on = alloca0(ec->nodes);
+    ret = cluster_inodelk(ec->xl_list, participants, ec->nodes, replies,
+                          locked_on, frame, ec->xl, ec->xl->name, parent, 0, 0);
+    {
+        if (ret <= ec->fragments) {
+            gf_msg_debug(ec->xl->name, 0,
+                         "%s/%s: Skipping "
+                         "heal as only %d number of subvolumes could "
+                         "be locked",
+                         uuid_utoa(parent->gfid), name, ret);
+            ret = -ENOTCONN;
+            goto unlock;
+        }
+        /* Only bricks that are actually locked may take part. */
+        EC_INTERSECT(participants, participants, locked_on, ec->nodes);
+        ret = __ec_heal_name(frame, ec, parent, name, participants);
+    }
+unlock:
+    cluster_uninodelk(ec->xl_list, locked_on, ec->nodes, replies, output, frame,
+                      ec->xl, ec->xl->name, parent, 0, 0);
+out:
+    cluster_replies_wipe(replies, ec->nodes);
+    loc_wipe(&loc);
+    return ret;
+}
+
+/* Per-entry callback handed to syncop_dir_scan() by ec_heal_names().
+ *
+ * 'data' is a struct ec_name_data. The entry is healed through
+ * ec_heal_name() on a private copy of the participants array; every brick
+ * that was a participant but did not survive the heal is flagged in
+ * failed_on. A non-negative result from ec_heal_name() is added to
+ * heal_pending.
+ *
+ * Returns -ENOTCONN when EC is shutting down, 0 otherwise (a failure to heal
+ * one entry is recorded in failed_on, it is not returned). */
+int
+ec_name_heal_handler(xlator_t *subvol, gf_dirent_t *entry, loc_t *parent,
+                     void *data)
+{
+    xlator_t *this = THIS;
+    ec_t *ec = this->private;
+    struct ec_name_data *ctx = data;
+    unsigned char *healed_on = alloca0(ec->nodes);
+    int idx = 0;
+    int rc = 0;
+
+    if (ec->shutdown) {
+        gf_msg_debug(this->name, 0,
+                     "Cancelling directory heal "
+                     "because EC is stopping.");
+        return -ENOTCONN;
+    }
+
+    /* Work on a copy so the shared participants array is left untouched. */
+    memcpy(healed_on, ctx->participants, ec->nodes);
+    rc = ec_heal_name(ctx->frame, ec, parent->inode, entry->d_name, healed_on);
+    if (rc >= 0) {
+        ctx->heal_pending += rc;
+    } else {
+        /* The heal failed: treat every participant as failed. */
+        memset(healed_on, 0, ec->nodes);
+    }
+
+    for (idx = 0; idx < ec->nodes; idx++) {
+        if (!ctx->participants[idx] || healed_on[idx])
+            continue;
+        ctx->failed_on[idx] = 1;
+    }
+
+    return 0;
+}
+
+/* Heal every name found in directory 'inode'.
+ *
+ * The directory is scanned on each participating brick in turn with
+ * syncop_dir_scan(), using ec_name_heal_handler() as the per-entry
+ * callback. After each scan, bricks recorded in failed_on are dropped from
+ * 'participants' (modified in place). The count accumulated by the handler
+ * is added to '*pending'.
+ *
+ * Returns the last syncop_dir_scan() result, or -ENOTCONN once the number
+ * of remaining participants is not greater than ec->fragments. */
+int
+ec_heal_names(call_frame_t *frame, ec_t *ec, inode_t *inode,
+              unsigned char *participants, uint32_t *pending)
+{
+    struct ec_name_data scan = {0};
+    loc_t dir_loc = {0};
+    int brick = 0;
+    int k = 0;
+    int ret = 0;
+
+    dir_loc.inode = inode_ref(inode);
+    gf_uuid_copy(dir_loc.gfid, inode->gfid);
+
+    scan.frame = frame;
+    scan.participants = participants;
+    scan.failed_on = alloca0(ec->nodes);
+    scan.heal_pending = 0;
+
+    for (brick = 0; brick < ec->nodes; brick++) {
+        if (participants[brick]) {
+            ret = syncop_dir_scan(ec->xl_list[brick], &dir_loc,
+                                  GF_CLIENT_PID_SELF_HEALD, &scan,
+                                  ec_name_heal_handler);
+            if (ret < 0) {
+                break;
+            }
+
+            /* Drop the bricks on which some entry could not be healed. */
+            for (k = 0; k < ec->nodes; k++) {
+                if (scan.failed_on[k])
+                    participants[k] = 0;
+            }
+
+            if (EC_COUNT(participants, ec->nodes) <= ec->fragments) {
+                ret = -ENOTCONN;
+                break;
+            }
+        }
+    }
+
+    *pending += scan.heal_pending;
+    loc_wipe(&dir_loc);
+
+    return ret;
+}
+
+/* Entry heal of directory 'inode'.
+ *
+ * Phase 1 (under cluster_inodelk on 'inode', domain ec->xl->name, bricks in
+ * 'heal_on'): __ec_heal_entry_prepare() fills 'versions', 'dirty', 'sources'
+ * and 'healed_sinks'; its return value is kept as 'source'.
+ * Phase 2 (lock already released): ec_heal_names() heals the individual
+ * names on the union of sources and sinks, then bricks that dropped out are
+ * cleared from 'sources'/'healed_sinks' and ec_adjust_versions() is called.
+ *
+ * Returns -ENOTCONN if too few bricks could be locked, a negative value from
+ * the prepare step, or otherwise the result of ec_heal_names(). */
+int
+__ec_heal_entry(call_frame_t *frame, ec_t *ec, inode_t *inode,
+                unsigned char *heal_on, unsigned char *sources,
+                unsigned char *healed_sinks, uint32_t *pending)
+{
+    unsigned char *locked_on = NULL;
+    unsigned char *output = NULL;
+    uint64_t *versions = NULL;
+    uint64_t *dirty = NULL;
+    unsigned char *participants = NULL;
+    default_args_cbk_t *replies = NULL;
+    int ret = 0;
+    int source = 0;
+    int i = 0;
+
+    locked_on = alloca0(ec->nodes);
+    output = alloca0(ec->nodes);
+    versions = alloca0(ec->nodes * sizeof(*versions));
+    dirty = alloca0(ec->nodes * sizeof(*dirty));
+
+    EC_REPLIES_ALLOC(replies, ec->nodes);
+    ret = cluster_inodelk(ec->xl_list, heal_on, ec->nodes, replies, locked_on,
+                          frame, ec->xl, ec->xl->name, inode, 0, 0);
+    {
+        if (ret <= ec->fragments) {
+            gf_msg_debug(ec->xl->name, 0,
+                         "%s: Skipping heal "
+                         "as only %d number of subvolumes could "
+                         "be locked",
+                         uuid_utoa(inode->gfid), ret);
+            ret = -ENOTCONN;
+            goto unlock;
+        }
+        ret = __ec_heal_entry_prepare(frame, ec, inode, locked_on, versions,
+                                      dirty, sources, healed_sinks);
+        /* Remembered for ec_adjust_versions(); only used when ret >= 0. */
+        source = ret;
+    }
+unlock:
+    cluster_uninodelk(ec->xl_list, locked_on, ec->nodes, replies, output, frame,
+                      ec->xl, ec->xl->name, inode, 0, 0);
+    if (ret < 0)
+        goto out;
+
+    /* Every source or sink takes part in the per-name heal. */
+    participants = alloca0(ec->nodes);
+    for (i = 0; i < ec->nodes; i++) {
+        if (sources[i] || healed_sinks[i])
+            participants[i] = 1;
+    }
+    ret = ec_heal_names(frame, ec, inode, participants, pending);
+
+    /* Too few bricks left: skip the version adjustment and return the
+     * ec_heal_names() result as is. */
+    if (EC_COUNT(participants, ec->nodes) <= ec->fragments)
+        goto out;
+
+    /* Bricks dropped by ec_heal_names() are neither sources nor sinks. */
+    for (i = 0; i < ec->nodes; i++) {
+        if (!participants[i]) {
+            sources[i] = 0;
+            healed_sinks[i] = 0;
+        }
+    }
+
+    ec_adjust_versions(frame, ec, EC_DATA_TXN, inode, source, sources,
+                       healed_sinks, versions, dirty);
+out:
+    cluster_replies_wipe(replies, ec->nodes);
+    return ret;
+}
+
+/* Top-level entry heal of directory 'inode'.
+ *
+ * Takes a cluster_tiebreaker_inodelk() on 'inode' in the
+ * "<xlator-name>:self-heal" domain on all bricks currently up (ec->xl_up),
+ * runs __ec_heal_entry() on the bricks that were locked and releases the
+ * lock again.
+ *
+ * 'sources' and 'healed_sinks' are filled by __ec_heal_entry(); '*pending'
+ * is passed through to it.
+ *
+ * Returns -ENOTCONN if no more than ec->fragments bricks could be locked,
+ * otherwise the result of __ec_heal_entry(). */
+int
+ec_heal_entry(call_frame_t *frame, ec_t *ec, inode_t *inode,
+              unsigned char *sources, unsigned char *healed_sinks,
+              uint32_t *pending)
+{
+    unsigned char *locked_on = NULL;
+    unsigned char *up_subvols = NULL;
+    unsigned char *output = NULL;
+    char selfheal_domain[1024] = {0};
+    int ret = 0;
+    default_args_cbk_t *replies = NULL;
+
+    EC_REPLIES_ALLOC(replies, ec->nodes);
+    locked_on = alloca0(ec->nodes);
+    output = alloca0(ec->nodes);
+    up_subvols = alloca0(ec->nodes);
+
+    /* Bounded formatting: the xlator name has no length guarantee visible
+     * here, so never write past the fixed-size domain buffer. The same
+     * buffer is used for lock and unlock, so they always agree. */
+    snprintf(selfheal_domain, sizeof(selfheal_domain), "%s:self-heal",
+             ec->xl->name);
+    ec_mask_to_char_array(ec->xl_up, up_subvols, ec->nodes);
+    /*If other processes are already doing the heal, don't block*/
+    ret = cluster_tiebreaker_inodelk(ec->xl_list, up_subvols, ec->nodes,
+                                     replies, locked_on, frame, ec->xl,
+                                     selfheal_domain, inode, 0, 0);
+    {
+        if (ret <= ec->fragments) {
+            gf_msg_debug(ec->xl->name, 0,
+                         "%s: Skipping heal "
+                         "as only %d number of subvolumes could "
+                         "be locked",
+                         uuid_utoa(inode->gfid), ret);
+            ret = -ENOTCONN;
+            goto unlock;
+        }
+        ret = __ec_heal_entry(frame, ec, inode, locked_on, sources,
+                              healed_sinks, pending);
+    }
+unlock:
+    cluster_uninodelk(ec->xl_list, locked_on, ec->nodes, replies, output, frame,
+                      ec->xl, selfheal_domain, inode, 0, 0);
+    cluster_replies_wipe(replies, ec->nodes);
+    return ret;
+}
+
+/* Find direction for data heal and heal info.
+ *
+ * For every valid, successful reply the data version, dirty counter and
+ * size are read from the reply's dict ('which' selects replies[i].xdata for
+ * EC_COMBINE_XDATA, replies[i].xattr otherwise). Bricks are grouped by the
+ * "<version>-<size>" key; the largest group becomes 'sources' and every
+ * other brick with a successful reply is flagged in 'healed_sinks'.
+ *
+ * When 'check_ondisksize' is set, sources whose replies[i].stat.ia_size
+ * differs from the (adjusted) source size are demoted to sinks.
+ *
+ * Outputs: data_versions[], dirty[], size[], sources[], healed_sinks[].
+ * Returns the index of a source brick, -ENOMEM on allocation failure, -EIO
+ * when fewer than ec->fragments bricks agree, or a negative dict lookup
+ * result. */
+int
+ec_heal_data_find_direction(ec_t *ec, default_args_cbk_t *replies,
+                            uint64_t *data_versions, uint64_t *dirty,
+                            uint64_t *size, unsigned char *sources,
+                            unsigned char *healed_sinks,
+                            gf_boolean_t check_ondisksize, int which)
+{
+    uint64_t xattr[EC_VERSION_SIZE] = {0};
+    char version_size[128] = {0};
+    dict_t *version_size_db = NULL;
+    unsigned char *same = NULL;
+    int max_same_count = 0;
+    int source = 0;
+    int i = 0;
+    int ret = 0;
+    dict_t *dict = NULL;
+    uint64_t source_size = 0;
+
+    version_size_db = dict_new();
+    if (!version_size_db) {
+        ret = -ENOMEM;
+        goto out;
+    }
+
+    for (i = 0; i < ec->nodes; i++) {
+        if (!replies[i].valid)
+            continue;
+        if (replies[i].op_ret < 0)
+            continue;
+        dict = (which == EC_COMBINE_XDATA) ? replies[i].xdata
+                                           : replies[i].xattr;
+
+        /* A missing version/dirty xattr leaves the caller's value as is. */
+        ret = ec_dict_get_array(dict, EC_XATTR_VERSION, xattr, EC_VERSION_SIZE);
+        if (ret == 0) {
+            data_versions[i] = xattr[EC_DATA_TXN];
+        }
+
+        memset(xattr, 0, sizeof(xattr));
+        ret = ec_dict_get_array(dict, EC_XATTR_DIRTY, xattr, EC_VERSION_SIZE);
+        if (ret == 0) {
+            dirty[i] = xattr[EC_DATA_TXN];
+        }
+        /* The result of this call is not checked; 'ret' is overwritten
+         * below. */
+        ret = ec_dict_del_number(dict, EC_XATTR_SIZE, &size[i]);
+        /*Build a db of same metadata and data version and size*/
+        snprintf(version_size, sizeof(version_size), "%" PRIu64 "-%" PRIu64,
+                 data_versions[i], size[i]);
+
+        ret = dict_get_bin(version_size_db, version_size, (void **)&same);
+        if (ret < 0) {
+            /* First brick with this key: start a new membership array. It
+             * lives on this function's stack and is stored in the dict as
+             * a static bin below. */
+            same = alloca0(ec->nodes);
+        }
+
+        same[i] = 1;
+        /* Strict comparison: 'source' only moves when a group becomes
+         * larger than any seen so far. */
+        if (max_same_count < EC_COUNT(same, ec->nodes)) {
+            max_same_count = EC_COUNT(same, ec->nodes);
+            source = i;
+        }
+
+        if (ret < 0) {
+            ret = dict_set_static_bin(version_size_db, version_size, same,
+                                      ec->nodes);
+        }
+
+        if (ret < 0) {
+            ret = -ENOMEM;
+            goto out;
+        }
+    }
+    /* If we don't have ec->fragments number of same version,size it is not
+     * recoverable*/
+    if (max_same_count < ec->fragments) {
+        ret = -EIO;
+        goto out;
+    } else {
+        /* Fetch the membership array of the winning group. */
+        snprintf(version_size, sizeof(version_size), "%" PRIu64 "-%" PRIu64,
+                 data_versions[source], size[source]);
+
+        ret = dict_get_bin(version_size_db, version_size, (void **)&same);
+        if (ret < 0)
+            goto out;
+        memcpy(sources, same, ec->nodes);
+        for (i = 0; i < ec->nodes; i++) {
+            if (replies[i].valid && (replies[i].op_ret == 0) && !sources[i])
+                healed_sinks[i] = 1;
+        }
+    }
+
+    /* There could be files with versions, size same but on disk ia_size
+     * could be different because of disk crashes, mark them as sinks as
+     * well*/
+
+    if (check_ondisksize) {
+        source_size = size[source];
+        ec_adjust_size_up(ec, &source_size, _gf_true);
+
+        for (i = 0; i < ec->nodes; i++) {
+            if (sources[i]) {
+                if (replies[i].stat.ia_size != source_size) {
+                    sources[i] = 0;
+                    healed_sinks[i] = 1;
+                    max_same_count--;
+                } else {
+                    /* Ends up as the last brick that is still a source. */
+                    source = i;
+                }
+            }
+        }
+        if (max_same_count < ec->fragments) {
+            ret = -EIO;
+            goto out;
+        }
+    }
+
+    ret = source;
+out:
+    if (version_size_db)
+        dict_unref(version_size_db);
+    return ret;
+}
+
+/* Collect the information needed to decide the data heal direction for 'fd'.
+ *
+ * Issues cluster_fxattrop(GF_XATTROP_ADD_ARRAY64) with all-zero values for
+ * the version, dirty and size xattrs, plus cluster_fstat(), on the bricks in
+ * 'locked_on'. Only bricks that answered both are considered. The replies
+ * are then handed to ec_heal_data_find_direction() with the on-disk size
+ * check enabled.
+ *
+ * Outputs: versions[], dirty[], size[], sources[], healed_sinks[]; trim[i]
+ * is set for each sink whose stat reports a non-zero ia_size; '*stbuf' (when
+ * non-NULL) receives the stat of the chosen source.
+ *
+ * Returns the source index, or -ENOMEM / -ENOTCONN / a negative value from
+ * ec_heal_data_find_direction(). */
+int
+__ec_heal_data_prepare(call_frame_t *frame, ec_t *ec, fd_t *fd,
+                       unsigned char *locked_on, uint64_t *versions,
+                       uint64_t *dirty, uint64_t *size, unsigned char *sources,
+                       unsigned char *healed_sinks, unsigned char *trim,
+                       struct iatt *stbuf)
+{
+    default_args_cbk_t *replies = NULL;
+    default_args_cbk_t *fstat_replies = NULL;
+    unsigned char *output = NULL;
+    unsigned char *fstat_output = NULL;
+    dict_t *xattrs = NULL;
+    uint64_t zero_array[2] = {0};
+    int source = 0;
+    int ret = 0;
+    uint64_t zero_value = 0;
+    int i = 0;
+
+    EC_REPLIES_ALLOC(replies, ec->nodes);
+    EC_REPLIES_ALLOC(fstat_replies, ec->nodes);
+    output = alloca0(ec->nodes);
+    fstat_output = alloca0(ec->nodes);
+    xattrs = dict_new();
+    if (!xattrs ||
+        dict_set_static_bin(xattrs, EC_XATTR_VERSION, zero_array,
+                            sizeof(zero_array)) ||
+        dict_set_static_bin(xattrs, EC_XATTR_DIRTY, zero_array,
+                            sizeof(zero_array)) ||
+        dict_set_static_bin(xattrs, EC_XATTR_SIZE, &zero_value,
+                            sizeof(zero_value))) {
+        ret = -ENOMEM;
+        goto out;
+    }
+
+    /* The return values of both cluster calls are not inspected; the
+     * per-brick 'output' arrays are used instead. */
+    ret = cluster_fxattrop(ec->xl_list, locked_on, ec->nodes, replies, output,
+                           frame, ec->xl, fd, GF_XATTROP_ADD_ARRAY64, xattrs,
+                           NULL);
+
+    ret = cluster_fstat(ec->xl_list, locked_on, ec->nodes, fstat_replies,
+                        fstat_output, frame, ec->xl, fd, NULL);
+
+    /* Keep only bricks that answered both calls and merge the stat into
+     * the fxattrop reply so one array carries everything. */
+    for (i = 0; i < ec->nodes; i++) {
+        output[i] = output[i] && fstat_output[i];
+        replies[i].valid = output[i];
+        if (output[i])
+            replies[i].stat = fstat_replies[i].stat;
+    }
+
+    if (EC_COUNT(output, ec->nodes) <= ec->fragments) {
+        ret = -ENOTCONN;
+        goto out;
+    }
+
+    source = ec_heal_data_find_direction(ec, replies, versions, dirty, size,
+                                         sources, healed_sinks, _gf_true,
+                                         EC_COMBINE_DICT);
+    ret = source;
+    if (ret < 0)
+        goto out;
+
+    if (stbuf)
+        *stbuf = replies[source].stat;
+
+    /* Sinks that currently hold data are flagged for truncation. */
+    for (i = 0; i < ec->nodes; i++) {
+        if (healed_sinks[i]) {
+            if (replies[i].stat.ia_size)
+                trim[i] = 1;
+        }
+    }
+
+    if (EC_COUNT(sources, ec->nodes) < ec->fragments) {
+        ret = -ENOTCONN;
+        goto out;
+    }
+
+    ret = source;
+out:
+    if (xattrs)
+        dict_unref(xattrs);
+    cluster_replies_wipe(replies, ec->nodes);
+    cluster_replies_wipe(fstat_replies, ec->nodes);
+    if (ret < 0) {
+        gf_msg_debug(ec->xl->name, 0, "%s: heal failed %s",
+                     uuid_utoa(fd->inode->gfid), strerror(-ret));
+    } else {
+        gf_msg_debug(ec->xl->name, 0,
+                     "%s: sources: %d, sinks: "
+                     "%d",
+                     uuid_utoa(fd->inode->gfid), EC_COUNT(sources, ec->nodes),
+                     EC_COUNT(healed_sinks, ec->nodes));
+    }
+    return ret;
+}
+
+/* Flag the sinks of 'fd' as being under self-heal.
+ *
+ * For every brick in 'healed_sinks' whose versions[i] does not yet have
+ * EC_SELFHEAL_BIT set, an fxattrop (GF_XATTROP_ADD_ARRAY64) adding
+ * 1 << EC_SELFHEAL_BIT to the data version xattr is issued. Bricks on which
+ * that fails are removed from 'healed_sinks'; for the others the bit is also
+ * set in the in-memory versions[i].
+ *
+ * Returns 0 on success (including when nothing needs marking), -ENOMEM on
+ * allocation failure, -ENOTCONN when no sink is left afterwards. */
+int
+__ec_heal_mark_sinks(call_frame_t *frame, ec_t *ec, fd_t *fd,
+                     uint64_t *versions, unsigned char *healed_sinks)
+{
+    int i = 0;
+    int ret = 0;
+    unsigned char *mark = NULL;
+    dict_t *xattrs = NULL;
+    default_args_cbk_t *replies = NULL;
+    unsigned char *output = NULL;
+    uint64_t versions_xattr[2] = {0};
+
+    EC_REPLIES_ALLOC(replies, ec->nodes);
+    xattrs = dict_new();
+    if (!xattrs) {
+        ret = -ENOMEM;
+        goto out;
+    }
+
+    mark = alloca0(ec->nodes);
+    for (i = 0; i < ec->nodes; i++) {
+        if (!healed_sinks[i])
+            continue;
+        /* Already marked by an earlier heal attempt. */
+        if ((versions[i] >> EC_SELFHEAL_BIT) & 1)
+            continue;
+        mark[i] = 1;
+    }
+
+    /* Nothing to mark. Leave through 'out' (ret is still 0) so the xattrs
+     * dict allocated above is released instead of leaked. */
+    if (EC_COUNT(mark, ec->nodes) == 0)
+        goto out;
+
+    versions_xattr[EC_DATA_TXN] = hton64(1ULL << EC_SELFHEAL_BIT);
+    if (dict_set_static_bin(xattrs, EC_XATTR_VERSION, versions_xattr,
+                            sizeof(versions_xattr))) {
+        ret = -ENOMEM;
+        goto out;
+    }
+
+    output = alloca0(ec->nodes);
+    ret = cluster_fxattrop(ec->xl_list, mark, ec->nodes, replies, output, frame,
+                           ec->xl, fd, GF_XATTROP_ADD_ARRAY64, xattrs, NULL);
+    for (i = 0; i < ec->nodes; i++) {
+        if (!output[i]) {
+            /* Marking failed on this brick: it cannot be healed now. */
+            if (mark[i])
+                healed_sinks[i] = 0;
+            continue;
+        }
+        versions[i] |= (1ULL << EC_SELFHEAL_BIT);
+    }
+
+    if (EC_COUNT(healed_sinks, ec->nodes) == 0) {
+        ret = -ENOTCONN;
+        goto out;
+    }
+    ret = 0;
+
+out:
+    cluster_replies_wipe(replies, ec->nodes);
+    if (xattrs)
+        dict_unref(xattrs);
+    if (ret < 0)
+        gf_msg_debug(ec->xl->name, 0, "%s: heal failed %s",
+                     uuid_utoa(fd->inode->gfid), strerror(-ret));
+    return ret;
+}
+
+/* State machine for healing one block of data (fop->data is the ec_heal_t).
+ *
+ * INIT             -> set lock owner, request F_WRLCK via ec_heal_inodelk()
+ * HEAL_DATA_COPY   -> ec_heal_data_block()
+ * HEAL_DATA_UNLOCK -> request F_UNLCK via ec_heal_inodelk()
+ * REPORT           -> invoke the heal callback with good/bad masks
+ * -REPORT          -> invoke the heal callback with -1 and fop->error
+ *
+ * Returns the next state to run.
+ * NOTE(review): negative state values look like the error-path counterparts
+ * of the positive ones - confirm against ec_manager(). */
+int32_t
+ec_manager_heal_block(ec_fop_data_t *fop, int32_t state)
+{
+    ec_heal_t *heal = fop->data;
+    /* Back-reference; cleared again in ec_heal_block_done(). */
+    heal->fop = fop;
+
+    switch (state) {
+        case EC_STATE_INIT:
+            ec_owner_set(fop->frame, fop->frame->root);
+
+            ec_heal_inodelk(heal, F_WRLCK, 1, 0, 0);
+
+            return EC_STATE_HEAL_DATA_COPY;
+
+        case EC_STATE_HEAL_DATA_COPY:
+            gf_msg_debug(fop->xl->name, 0, "%s: read/write starting",
+                         uuid_utoa(heal->fd->inode->gfid));
+            ec_heal_data_block(heal);
+
+            return EC_STATE_HEAL_DATA_UNLOCK;
+
+        /* A failed copy or unlock state still goes through the unlock. */
+        case -EC_STATE_HEAL_DATA_COPY:
+        case -EC_STATE_HEAL_DATA_UNLOCK:
+        case EC_STATE_HEAL_DATA_UNLOCK:
+            ec_heal_inodelk(heal, F_UNLCK, 1, 0, 0);
+
+            return EC_STATE_REPORT;
+
+        case EC_STATE_REPORT:
+            if (fop->cbks.heal) {
+                fop->cbks.heal(fop->req_frame, fop->data, fop->xl, 0, 0,
+                               (heal->good | heal->bad), heal->good, heal->bad,
+                               0, NULL);
+            }
+
+            return EC_STATE_END;
+        case -EC_STATE_REPORT:
+            if (fop->cbks.heal) {
+                fop->cbks.heal(fop->req_frame, fop->data, fop->xl, -1,
+                               fop->error, 0, 0, 0, 0, NULL);
+            }
+
+            return EC_STATE_END;
+        default:
+            gf_msg(fop->xl->name, GF_LOG_ERROR, 0, EC_MSG_UNHANDLED_STATE,
+                   "Unhandled state %d for %s", state, ec_fop_name(fop->id));
+
+            return EC_STATE_END;
+    }
+}
+
+/* Start healing one block; the lock is taken by the fop's state machine
+ * (ec_manager_heal_block).
+ *
+ * Allocates an EC_FOP_HEAL fop for 'target' with 'heal' as its data and
+ * hands it to ec_manager(). If validation or allocation fails, 'func' is
+ * invoked directly with op_ret -1 and ENOMEM. */
+void
+ec_heal_block(call_frame_t *frame, xlator_t *this, uintptr_t target,
+              uint32_t fop_flags, fop_heal_cbk_t func, ec_heal_t *heal)
+{
+    int32_t error = ENOMEM;
+    ec_fop_data_t *fop = NULL;
+    ec_cbk_t callback = {.heal = func};
+
+    gf_msg_trace("ec", 0, "EC(HEAL) %p", frame);
+
+    VALIDATE_OR_GOTO(this, out);
+    GF_VALIDATE_OR_GOTO(this->name, this->private, out);
+
+    fop = ec_fop_data_allocate(frame, this, EC_FOP_HEAL, 0, target, fop_flags,
+                               NULL, ec_manager_heal_block, callback, heal);
+    if (fop != NULL)
+        error = 0;
+
+out:
+    if (fop == NULL) {
+        /* No fop to drive: report the failure to the caller right away. */
+        func(frame, heal, this, -1, error, 0, 0, 0, 0, NULL);
+        return;
+    }
+
+    ec_manager(fop, error);
+}
+
+/* Completion callback for ec_heal_block(); 'cookie' is the ec_heal_t.
+ *
+ * Detaches the heal from its fop, stores op_errno in heal->error when
+ * op_ret is negative (0 otherwise) and wakes the barrier in heal->data.
+ * Always returns 0. */
+int32_t
+ec_heal_block_done(call_frame_t *frame, void *cookie, xlator_t *this,
+                   int32_t op_ret, int32_t op_errno, uintptr_t mask,
+                   uintptr_t good, uintptr_t bad, uint32_t pending,
+                   dict_t *xdata)
+{
+    ec_heal_t *state = cookie;
+
+    /* Break the link in both directions. */
+    if (state->fop != NULL)
+        state->fop->heal = NULL;
+    state->fop = NULL;
+
+    if (op_ret < 0)
+        state->error = op_errno;
+    else
+        state->error = 0;
+
+    syncbarrier_wake(state->data);
+
+    return 0;
+}
+
+/* Heal one block and wait for it to finish.
+ *
+ * Launches ec_heal_block() on the union of good and bad bricks with
+ * ec_heal_block_done() as callback, then waits on the barrier in heal->data.
+ *
+ * Returns -heal->error if the heal reported an error, -ENOTCONN if no bad
+ * brick is left in heal->bad, 0 otherwise. */
+int
+ec_sync_heal_block(call_frame_t *frame, xlator_t *this, ec_heal_t *heal)
+{
+    int ret = 0;
+
+    ec_heal_block(frame, this, heal->bad | heal->good, EC_MINIMUM_ONE,
+                  ec_heal_block_done, heal);
+    syncbarrier_wait(heal->data, 1);
+
+    if (heal->error != 0)
+        ret = -heal->error;
+    else if (heal->bad == 0)
+        ret = -ENOTCONN;
+
+    return ret;
+}
+
+/* Rebuild the data of 'fd' on the sinks, one window at a time.
+ *
+ * A stack-allocated ec_heal_t is set up with good = 'sources' and
+ * bad = 'healed_sinks'. The window is 128 KB * ec->self_heal_window_size,
+ * rounded down to a multiple of ec->stripe_size. ec_sync_heal_block() is
+ * called for successive offsets until 'size' (after ec_adjust_size_up()) is
+ * covered, heal->done is set, an error occurs or EC is shutting down.
+ *
+ * On return 'healed_sinks' is rewritten from heal->bad.
+ * Returns -ENOMEM if the barrier cannot be initialised, -ENOTCONN on
+ * shutdown, a negative ec_sync_heal_block() result, or the last
+ * non-negative result. */
+int
+ec_rebuild_data(call_frame_t *frame, ec_t *ec, fd_t *fd, uint64_t size,
+                unsigned char *sources, unsigned char *healed_sinks)
+{
+    ec_heal_t *heal = NULL;
+    int ret = 0;
+    syncbarrier_t barrier;
+
+    if (syncbarrier_init(&barrier))
+        return -ENOMEM;
+
+    heal = alloca0(sizeof(*heal));
+    heal->fd = fd_ref(fd);
+    heal->xl = ec->xl;
+    heal->data = &barrier;
+    ec_adjust_size_up(ec, &size, _gf_false);
+    heal->total_size = size;
+    heal->size = (128 * GF_UNIT_KB * (ec->self_heal_window_size));
+    /* We need to adjust the size to a multiple of the stripe size of the
+     * volume. Otherwise writes would need to fill gaps (head and/or tail)
+     * with existent data from the bad bricks. This could be garbage on a
+     * damaged file or it could fail if there aren't enough bricks. */
+    heal->size -= heal->size % ec->stripe_size;
+    heal->bad = ec_char_array_to_mask(healed_sinks, ec->nodes);
+    heal->good = ec_char_array_to_mask(sources, ec->nodes);
+    heal->iatt.ia_type = IA_IFREG;
+    LOCK_INIT(&heal->lock);
+
+    /* heal->done is not written in this function; presumably the block
+     * heal sets it - confirm in ec_heal_data_block(). */
+    for (heal->offset = 0; (heal->offset < size) && !heal->done;
+         heal->offset += heal->size) {
+        /* We immediately abort any heal if a shutdown request has been
+         * received to avoid delays. The healing of this file will be
+         * restarted by another SHD or other client that accesses the
+         * file. */
+        if (ec->shutdown) {
+            gf_msg_debug(ec->xl->name, 0,
+                         "Cancelling heal because "
+                         "EC is stopping.");
+            ret = -ENOTCONN;
+            break;
+        }
+
+        gf_msg_debug(ec->xl->name, 0,
+                     "%s: sources: %d, sinks: "
+                     "%d, offset: %" PRIu64 " bsize: %" PRIu64,
+                     uuid_utoa(fd->inode->gfid), EC_COUNT(sources, ec->nodes),
+                     EC_COUNT(healed_sinks, ec->nodes), heal->offset,
+                     heal->size);
+        ret = ec_sync_heal_block(frame, ec->xl, heal);
+        if (ret < 0)
+            break;
+    }
+    /* Report back which sinks are still considered bad after the heal. */
+    memset(healed_sinks, 0, ec->nodes);
+    ec_mask_to_char_array(heal->bad, healed_sinks, ec->nodes);
+    fd_unref(heal->fd);
+    LOCK_DESTROY(&heal->lock);
+    syncbarrier_destroy(heal->data);
+    if (ret < 0)
+        gf_msg_debug(ec->xl->name, 0, "%s: heal failed %s",
+                     uuid_utoa(fd->inode->gfid), strerror(-ret));
+    return ret;
+}
+
+/* Truncate the sinks flagged in 'trim' before their data is rebuilt.
+ *
+ * 'size' is rounded with ec_adjust_offset_up() and used as the offset for
+ * cluster_ftruncate() on the 'trim' bricks. A brick on which the truncate
+ * did not succeed is removed from 'healed_sinks'.
+ *
+ * Returns 0 when nothing needs trimming, -ENOTCONN when no sink is left,
+ * otherwise the value returned by cluster_ftruncate(). */
+int
+__ec_heal_trim_sinks(call_frame_t *frame, ec_t *ec, fd_t *fd,
+                     unsigned char *healed_sinks, unsigned char *trim,
+                     uint64_t size)
+{
+    default_args_cbk_t *replies = NULL;
+    unsigned char *done_on = NULL;
+    off_t truncate_at = 0;
+    int brick = 0;
+    int ret = 0;
+
+    EC_REPLIES_ALLOC(replies, ec->nodes);
+    done_on = alloca0(ec->nodes);
+
+    if (EC_COUNT(trim, ec->nodes) != 0) {
+        truncate_at = size;
+        ec_adjust_offset_up(ec, &truncate_at, _gf_true);
+        ret = cluster_ftruncate(ec->xl_list, trim, ec->nodes, replies, done_on,
+                                frame, ec->xl, fd, truncate_at, NULL);
+
+        /* A sink that could not be truncated cannot be healed. */
+        for (brick = 0; brick < ec->nodes; brick++) {
+            if (trim[brick] && !done_on[brick])
+                healed_sinks[brick] = 0;
+        }
+
+        if (EC_COUNT(healed_sinks, ec->nodes) == 0)
+            ret = -ENOTCONN;
+    }
+
+    cluster_replies_wipe(replies, ec->nodes);
+    if (ret < 0)
+        gf_msg_debug(ec->xl->name, 0, "%s: heal failed %s",
+                     uuid_utoa(fd->inode->gfid), strerror(-ret));
+    return ret;
+}
+
+/* Bring the version, size and (optionally) dirty xattrs of brick 'idx' in
+ * line with those of brick 'source'.
+ *
+ * The differences are stored in 'xattr' in network byte order and applied
+ * with syncop_fxattrop(GF_XATTROP_ADD_ARRAY64) on brick 'idx'. When
+ * 'erase_dirty' is set, -dirty[idx] is added as well. If every delta is
+ * zero no fop is sent.
+ *
+ * Returns a negative dict_set_static_bin() result, 0 when there is nothing
+ * to change, otherwise the result of syncop_fxattrop(). */
+int
+ec_data_undo_pending(call_frame_t *frame, ec_t *ec, fd_t *fd, dict_t *xattr,
+                     uint64_t *versions, uint64_t *dirty, uint64_t *size,
+                     int source, gf_boolean_t erase_dirty, int idx)
+{
+    uint64_t allzero[2] = {0};
+    uint64_t version_delta[2] = {0};
+    uint64_t dirty_delta[2] = {0};
+    uint64_t size_delta = 0;
+    int unchanged = 0;
+    int ret = 0;
+
+    version_delta[EC_DATA_TXN] = hton64(versions[source] - versions[idx]);
+    ret = dict_set_static_bin(xattr, EC_XATTR_VERSION, version_delta,
+                              sizeof(version_delta));
+    if (ret < 0)
+        return ret;
+
+    size_delta = hton64(size[source] - size[idx]);
+    ret = dict_set_static_bin(xattr, EC_XATTR_SIZE, &size_delta,
+                              sizeof(size_delta));
+    if (ret < 0)
+        return ret;
+
+    if (erase_dirty) {
+        dirty_delta[EC_DATA_TXN] = hton64(-dirty[idx]);
+        ret = dict_set_static_bin(xattr, EC_XATTR_DIRTY, dirty_delta,
+                                  sizeof(dirty_delta));
+        if (ret < 0)
+            return ret;
+    }
+
+    /* Skip the network round trip when nothing would change. */
+    unchanged = (memcmp(version_delta, allzero, sizeof(allzero)) == 0) &&
+                (memcmp(dirty_delta, allzero, sizeof(allzero)) == 0) &&
+                (size_delta == 0);
+    if (unchanged)
+        return 0;
+
+    return syncop_fxattrop(ec->xl_list[idx], fd, GF_XATTROP_ADD_ARRAY64, xattr,
+                           NULL, NULL, NULL);
+}
+
+/* Fix up the version/size/dirty xattrs after a data heal of 'fd'.
+ *
+ * The first brick set in 'sources' is the reference. Each brick in
+ * 'healed_sinks' is adjusted to it with ec_data_undo_pending(). When
+ * sources + healed sinks cover all ec->nodes bricks, dirty is erased too and
+ * the same adjustment is applied to every source.
+ *
+ * Returns -ENOMEM if the dict cannot be allocated, -ENOTCONN if there is no
+ * source, 0 otherwise.
+ * NOTE(review): failures of ec_data_undo_pending() are stored in 'ret' but
+ * the function returns 'op_ret', so they are not reported to the caller -
+ * confirm whether this is intended. */
+int
+__ec_fd_data_adjust_versions(call_frame_t *frame, ec_t *ec, fd_t *fd,
+                             unsigned char *sources,
+                             unsigned char *healed_sinks, uint64_t *versions,
+                             uint64_t *dirty, uint64_t *size)
+{
+    dict_t *xattr = NULL;
+    int i = 0;
+    int ret = 0;
+    int op_ret = 0;
+    int source = -1;
+    gf_boolean_t erase_dirty = _gf_false;
+
+    xattr = dict_new();
+    if (!xattr) {
+        op_ret = -ENOMEM;
+        goto out;
+    }
+
+    /* dirty xattr represents if the file needs heal. Unless all the
+     * copies are healed, don't erase it */
+    if (EC_COUNT(sources, ec->nodes) + EC_COUNT(healed_sinks, ec->nodes) ==
+        ec->nodes)
+        erase_dirty = _gf_true;
+
+    /* Pick the first source as the reference brick. */
+    for (i = 0; i < ec->nodes; i++) {
+        if (sources[i]) {
+            source = i;
+            break;
+        }
+    }
+
+    if (source == -1) {
+        op_ret = -ENOTCONN;
+        goto out;
+    }
+
+    for (i = 0; i < ec->nodes; i++) {
+        if (healed_sinks[i]) {
+            ret = ec_data_undo_pending(frame, ec, fd, xattr, versions, dirty,
+                                       size, source, erase_dirty, i);
+            /* Stops at the first failing sink; op_ret stays 0. */
+            if (ret < 0)
+                goto out;
+        }
+    }
+
+    if (!erase_dirty)
+        goto out;
+
+    /* All bricks are healthy: clear dirty on the sources as well. A
+     * failure on one source does not stop the others. */
+    for (i = 0; i < ec->nodes; i++) {
+        if (sources[i]) {
+            ret = ec_data_undo_pending(frame, ec, fd, xattr, versions, dirty,
+                                       size, source, erase_dirty, i);
+            if (ret < 0)
+                continue;
+        }
+    }
+out:
+    if (xattr)
+        dict_unref(xattr);
+    return op_ret;
+}
+
+/* Final step of a data heal: restore timestamps on the healed sinks and
+ * adjust the version xattrs.
+ *
+ * Under cluster_inodelk() on fd->inode (domain ec->xl->name, bricks that
+ * are sources or healed sinks):
+ *   1. __ec_heal_data_prepare() is run again into separate post-heal
+ *      arrays to obtain the stat of a source ('source_buf');
+ *   2. cluster_setattr() copies atime/mtime/ctime from that stat to the
+ *      healed sinks; sinks on which it fails are dropped from
+ *      'healed_sinks';
+ *   3. __ec_fd_data_adjust_versions() is called with the caller's (pre-heal)
+ *      versions, dirty and size arrays.
+ *
+ * Returns -ENOTCONN if too few bricks could be locked or no sink is left, a
+ * negative value from the prepare step, or the result of
+ * __ec_fd_data_adjust_versions(). */
+int
+ec_restore_time_and_adjust_versions(call_frame_t *frame, ec_t *ec, fd_t *fd,
+                                    unsigned char *sources,
+                                    unsigned char *healed_sinks,
+                                    uint64_t *versions, uint64_t *dirty,
+                                    uint64_t *size)
+{
+    unsigned char *locked_on = NULL;
+    unsigned char *participants = NULL;
+    unsigned char *output = NULL;
+    default_args_cbk_t *replies = NULL;
+    unsigned char *postsh_sources = NULL;
+    unsigned char *postsh_healed_sinks = NULL;
+    unsigned char *postsh_trim = NULL;
+    uint64_t *postsh_versions = NULL;
+    uint64_t *postsh_dirty = NULL;
+    uint64_t *postsh_size = NULL;
+    int ret = 0;
+    int i = 0;
+    struct iatt source_buf = {0};
+    loc_t loc = {0};
+
+    locked_on = alloca0(ec->nodes);
+    output = alloca0(ec->nodes);
+    participants = alloca0(ec->nodes);
+    postsh_sources = alloca0(ec->nodes);
+    postsh_healed_sinks = alloca0(ec->nodes);
+    postsh_trim = alloca0(ec->nodes);
+    postsh_versions = alloca0(ec->nodes * sizeof(*postsh_versions));
+    postsh_dirty = alloca0(ec->nodes * sizeof(*postsh_dirty));
+    postsh_size = alloca0(ec->nodes * sizeof(*postsh_size));
+
+    for (i = 0; i < ec->nodes; i++) {
+        if (healed_sinks[i] || sources[i])
+            participants[i] = 1;
+    }
+
+    EC_REPLIES_ALLOC(replies, ec->nodes);
+    ret = cluster_inodelk(ec->xl_list, participants, ec->nodes, replies,
+                          locked_on, frame, ec->xl, ec->xl->name, fd->inode, 0,
+                          0);
+    {
+        if (ret <= ec->fragments) {
+            gf_msg_debug(ec->xl->name, 0,
+                         "%s: Skipping heal "
+                         "as only %d number of subvolumes could "
+                         "be locked",
+                         uuid_utoa(fd->inode->gfid), ret);
+            ret = -ENOTCONN;
+            goto unlock;
+        }
+
+        /* The postsh_* outputs are not read afterwards in this function;
+         * only source_buf is used. */
+        ret = __ec_heal_data_prepare(frame, ec, fd, locked_on, postsh_versions,
+                                     postsh_dirty, postsh_size, postsh_sources,
+                                     postsh_healed_sinks, postsh_trim,
+                                     &source_buf);
+        if (ret < 0)
+            goto unlock;
+
+        loc.inode = inode_ref(fd->inode);
+        gf_uuid_copy(loc.gfid, fd->inode->gfid);
+        ret = cluster_setattr(
+            ec->xl_list, healed_sinks, ec->nodes, replies, output, frame,
+            ec->xl, &loc, &source_buf,
+            GF_SET_ATTR_ATIME | GF_SET_ATTR_MTIME | GF_SET_ATTR_CTIME, NULL);
+        /* Keep only the sinks on which the setattr succeeded. */
+        EC_INTERSECT(healed_sinks, healed_sinks, output, ec->nodes);
+        if (EC_COUNT(healed_sinks, ec->nodes) == 0) {
+            ret = -ENOTCONN;
+            goto unlock;
+        }
+        ret = __ec_fd_data_adjust_versions(frame, ec, fd, sources, healed_sinks,
+                                           versions, dirty, size);
+    }
+unlock:
+    cluster_uninodelk(ec->xl_list, locked_on, ec->nodes, replies, output, frame,
+                      ec->xl, ec->xl->name, fd->inode, 0, 0);
+    cluster_replies_wipe(replies, ec->nodes);
+    loc_wipe(&loc);
+    return ret;
+}
+
+/* Data heal of 'fd' on the bricks in 'heal_on'.
+ *
+ * Under cluster_inodelk() on fd->inode (domain ec->xl->name):
+ *   - __ec_heal_data_prepare() determines sources, sinks and the bricks to
+ *     trim;
+ *   - with no sinks, only __ec_fd_data_adjust_versions() is run;
+ *   - otherwise the sinks are marked (__ec_heal_mark_sinks) and truncated
+ *     (__ec_heal_trim_sinks).
+ * After the lock is released, ec_rebuild_data() copies the data and
+ * ec_restore_time_and_adjust_versions() finishes the heal.
+ *
+ * 'sources' and 'healed_sinks' are outputs.
+ * Returns a negative errno on failure, otherwise the result of the last
+ * step executed. */
+int
+__ec_heal_data(call_frame_t *frame, ec_t *ec, fd_t *fd, unsigned char *heal_on,
+               unsigned char *sources, unsigned char *healed_sinks)
+{
+    unsigned char *locked_on = NULL;
+    unsigned char *output = NULL;
+    uint64_t *versions = NULL;
+    uint64_t *dirty = NULL;
+    uint64_t *size = NULL;
+    unsigned char *trim = NULL;
+    default_args_cbk_t *replies = NULL;
+    int ret = 0;
+    int source = 0;
+
+    locked_on = alloca0(ec->nodes);
+    output = alloca0(ec->nodes);
+    trim = alloca0(ec->nodes);
+    versions = alloca0(ec->nodes * sizeof(*versions));
+    dirty = alloca0(ec->nodes * sizeof(*dirty));
+    size = alloca0(ec->nodes * sizeof(*size));
+
+    EC_REPLIES_ALLOC(replies, ec->nodes);
+    ret = cluster_inodelk(ec->xl_list, heal_on, ec->nodes, replies, locked_on,
+                          frame, ec->xl, ec->xl->name, fd->inode, 0, 0);
+    {
+        if (ret <= ec->fragments) {
+            gf_msg_debug(ec->xl->name, 0,
+                         "%s: Skipping heal "
+                         "as only %d number of subvolumes could "
+                         "be locked",
+                         uuid_utoa(fd->inode->gfid), ret);
+            ret = -ENOTCONN;
+            goto unlock;
+        }
+
+        ret = __ec_heal_data_prepare(frame, ec, fd, locked_on, versions, dirty,
+                                     size, sources, healed_sinks, trim, NULL);
+        if (ret < 0)
+            goto unlock;
+
+        /* Nothing to rebuild: just fix up the xattrs while still locked. */
+        if (EC_COUNT(healed_sinks, ec->nodes) == 0) {
+            ret = __ec_fd_data_adjust_versions(
+                frame, ec, fd, sources, healed_sinks, versions, dirty, size);
+            goto unlock;
+        }
+
+        /* On success the prepare step returned the source index. */
+        source = ret;
+        ret = __ec_heal_mark_sinks(frame, ec, fd, versions, healed_sinks);
+        if (ret < 0)
+            goto unlock;
+
+        ret = __ec_heal_trim_sinks(frame, ec, fd, healed_sinks, trim,
+                                   size[source]);
+    }
+unlock:
+    cluster_uninodelk(ec->xl_list, locked_on, ec->nodes, replies, output, frame,
+                      ec->xl, ec->xl->name, fd->inode, 0, 0);
+    if (ret < 0)
+        goto out;
+
+    /* Also reached from the "no sinks" branch above. */
+    if (EC_COUNT(healed_sinks, ec->nodes) == 0)
+        goto out;
+
+    gf_msg_debug(ec->xl->name, 0,
+                 "%s: sources: %d, sinks: "
+                 "%d",
+                 uuid_utoa(fd->inode->gfid), EC_COUNT(sources, ec->nodes),
+                 EC_COUNT(healed_sinks, ec->nodes));
+
+    /* The data copy runs without the lock taken above; each block takes
+     * its own lock (see ec_manager_heal_block). */
+    ret = ec_rebuild_data(frame, ec, fd, size[source], sources, healed_sinks);
+    if (ret < 0)
+        goto out;
+
+    ret = ec_restore_time_and_adjust_versions(
+        frame, ec, fd, sources, healed_sinks, versions, dirty, size);
+out:
+    cluster_replies_wipe(replies, ec->nodes);
+    return ret;
+}
+
+/* Top-level data heal of 'inode'.
+ *
+ * Opens an fd (O_RDWR | O_LARGEFILE) on all bricks currently up, takes a
+ * lock on 'inode' in the "<xlator-name>:self-heal" domain on the bricks
+ * where the open succeeded, runs __ec_heal_data() on the locked bricks and
+ * releases the lock again.
+ *
+ * 'block' selects cluster_inodelk() when true and
+ * cluster_tiebreaker_inodelk() when false. 'sources' and 'healed_sinks' are
+ * filled by __ec_heal_data().
+ *
+ * Returns -ENOMEM if the fd cannot be created, -ENOTCONN if the open or the
+ * lock succeeded on no more than ec->fragments bricks, otherwise the result
+ * of __ec_heal_data(). */
+int
+ec_heal_data(call_frame_t *frame, ec_t *ec, gf_boolean_t block, inode_t *inode,
+             unsigned char *sources, unsigned char *healed_sinks)
+{
+    unsigned char *locked_on = NULL;
+    unsigned char *up_subvols = NULL;
+    unsigned char *output = NULL;
+    default_args_cbk_t *replies = NULL;
+    fd_t *fd = NULL;
+    loc_t loc = {0};
+    char selfheal_domain[1024] = {0};
+    int ret = 0;
+
+    EC_REPLIES_ALLOC(replies, ec->nodes);
+
+    locked_on = alloca0(ec->nodes);
+    output = alloca0(ec->nodes);
+    up_subvols = alloca0(ec->nodes);
+    loc.inode = inode_ref(inode);
+    gf_uuid_copy(loc.gfid, inode->gfid);
+
+    fd = fd_create(inode, 0);
+    if (!fd) {
+        ret = -ENOMEM;
+        goto out;
+    }
+
+    ec_mask_to_char_array(ec->xl_up, up_subvols, ec->nodes);
+
+    ret = cluster_open(ec->xl_list, up_subvols, ec->nodes, replies, output,
+                       frame, ec->xl, &loc, O_RDWR | O_LARGEFILE, fd, NULL);
+    if (ret <= ec->fragments) {
+        ret = -ENOTCONN;
+        goto out;
+    }
+
+    fd_bind(fd);
+    /* Bounded formatting: the xlator name has no length guarantee visible
+     * here, so never write past the fixed-size domain buffer. The same
+     * buffer is used for lock and unlock, so they always agree. */
+    snprintf(selfheal_domain, sizeof(selfheal_domain), "%s:self-heal",
+             ec->xl->name);
+    /*If other processes are already doing the heal, don't block*/
+    if (block) {
+        ret = cluster_inodelk(ec->xl_list, output, ec->nodes, replies,
+                              locked_on, frame, ec->xl, selfheal_domain, inode,
+                              0, 0);
+    } else {
+        ret = cluster_tiebreaker_inodelk(ec->xl_list, output, ec->nodes,
+                                         replies, locked_on, frame, ec->xl,
+                                         selfheal_domain, inode, 0, 0);
+    }
+    {
+        if (ret <= ec->fragments) {
+            gf_msg_debug(ec->xl->name, 0,
+                         "%s: Skipping heal "
+                         "as only %d number of subvolumes could "
+                         "be locked",
+                         uuid_utoa(inode->gfid), ret);
+            ret = -ENOTCONN;
+            goto unlock;
+        }
+        ret = __ec_heal_data(frame, ec, fd, locked_on, sources, healed_sinks);
+    }
+unlock:
+    cluster_uninodelk(ec->xl_list, locked_on, ec->nodes, replies, output, frame,
+                      ec->xl, selfheal_domain, inode, 0, 0);
+out:
+    if (fd)
+        fd_unref(fd);
+    loc_wipe(&loc);
+    cluster_replies_wipe(replies, ec->nodes);
+    return ret;
+}
+
+/*
+ * Wind an xattrop (GF_XATTROP_ADD_ARRAY64) carrying an all-zero
+ * EC_XATTR_DIRTY value to every subvolume for the given inode. The caller
+ * uses this "zero-xattrop" so that a stale index entry can be removed.
+ *
+ * Returns 0 once the request has been issued (the replies are not
+ * inspected) or -ENOMEM when the request could not be built.
+ */
+int
+ec_heal_purge_stale_index(call_frame_t *frame, ec_t *ec, inode_t *inode)
+{
+    uint64_t zero_dirty[EC_VERSION_SIZE] = {0};
+    default_args_cbk_t *replies = NULL;
+    unsigned char *targets = NULL;
+    dict_t **xattr_list = NULL;
+    dict_t *request = NULL;
+    loc_t loc = {0};
+    int idx = 0;
+    int ret = 0;
+
+    /* Stack-backed helpers and the location of the inode. */
+    EC_REPLIES_ALLOC(replies, ec->nodes);
+    targets = alloca0(ec->nodes);
+    loc.inode = inode_ref(inode);
+    gf_uuid_copy(loc.gfid, inode->gfid);
+
+    xattr_list = GF_CALLOC(ec->nodes, sizeof(*xattr_list),
+                           gf_common_mt_pointer);
+    if (xattr_list == NULL) {
+        ret = -ENOMEM;
+        goto out;
+    }
+
+    request = dict_new();
+    if (request == NULL) {
+        ret = -ENOMEM;
+        goto out;
+    }
+
+    ret = dict_set_static_bin(request, EC_XATTR_DIRTY, zero_dirty,
+                              sizeof(zero_dirty));
+    if (ret < 0) {
+        ret = -ENOMEM;
+        goto out;
+    }
+
+    /* Every subvolume receives the very same dictionary. */
+    while (idx < ec->nodes) {
+        xattr_list[idx] = request;
+        targets[idx] = 1;
+        idx++;
+    }
+
+    PARALLEL_FOP_ONLIST(ec->xl_list, targets, ec->nodes, replies, frame,
+                        ec_wind_xattrop_parallel, &loc, GF_XATTROP_ADD_ARRAY64,
+                        xattr_list, NULL);
+out:
+    if (request != NULL)
+        dict_unref(request);
+    if (xattr_list != NULL)
+        GF_FREE(xattr_list);
+    cluster_replies_wipe(replies, ec->nodes);
+    loc_wipe(&loc);
+    return ret;
+}
+
+/*
+ * ec_heal_do - run one complete heal pass for the entry described by @loc.
+ *
+ * @this:    disperse xlator.
+ * @data:    the heal fop (ec_fop_data_t *) that requested the heal.
+ * @loc:     location to heal; when loc->name is non-empty a name heal is
+ *           attempted on loc->parent first.
+ * @partial: when non-zero, directories are not entry-healed.
+ *
+ * Steps: optional name heal, an inspection shortcut when running inside the
+ * self-heal daemon, data heal (regular files) or entry heal (directories),
+ * and finally metadata heal. The outcome is delivered through
+ * fop->cbks.heal, if set; nothing is returned.
+ */
+void
+ec_heal_do(xlator_t *this, void *data, loc_t *loc, int32_t partial)
+{
+    call_frame_t *frame = NULL;
+    unsigned char *participants = NULL;
+    unsigned char *msources = NULL;
+    unsigned char *mhealed_sinks = NULL;
+    unsigned char *sources = NULL;
+    unsigned char *healed_sinks = NULL;
+    ec_t *ec = NULL;
+    int ret = 0;
+    int op_ret = 0;
+    int op_errno = 0;
+    intptr_t mgood = 0;
+    intptr_t mbad = 0;
+    intptr_t good = 0;
+    intptr_t bad = 0;
+    uint32_t pending = 0;
+    ec_fop_data_t *fop = data;
+    gf_boolean_t blocking = _gf_false;
+    ec_heal_need_t need_heal = EC_HEAL_NONEED;
+    unsigned char *up_subvols = NULL;
+    char up_bricks[32];
+
+    ec = this->private;
+
+    /* Build the subvolume arrays before anything can fail, so that the
+     * 'out' path never hands a NULL array to ec_char_array_to_mask(). */
+    participants = alloca0(ec->nodes);
+    ec_mask_to_char_array(ec->xl_up, participants, ec->nodes);
+
+    up_subvols = alloca0(ec->nodes);
+    ec_mask_to_char_array(ec->xl_up, up_subvols, ec->nodes);
+
+    /* If it is heal request from getxattr, complete the heal and then
+     * unwind, if it is ec_heal with NULL as frame then no need to block
+     * the heal as the caller doesn't care about its completion. In case
+     * of heald whichever gets tiebreaking inodelk will take care of the
+     * heal, so no need to block*/
+    if (fop->req_frame && !ec->shd.iamshd)
+        blocking = _gf_true;
+
+    frame = create_frame(this, this->ctx->pool);
+    if (!frame) {
+        /* Nothing was healed: report the failure instead of a silent
+         * success (same errno ec_launch_heal() uses for this case). */
+        op_ret = -1;
+        op_errno = ENOMEM;
+        goto out;
+    }
+
+    ec_owner_set(frame, frame->root);
+    /*Do heal as root*/
+    frame->root->uid = 0;
+    frame->root->gid = 0;
+    /*Mark the fops as internal*/
+    frame->root->pid = GF_CLIENT_PID_SELF_HEALD;
+
+    if (loc->name && strlen(loc->name)) {
+        ret = ec_heal_name(frame, ec, loc->parent, (char *)loc->name,
+                           participants);
+        if (ret >= 0) {
+            gf_msg_debug(this->name, 0,
+                         "%s: name heal "
+                         "successful on %" PRIXPTR,
+                         loc->path,
+                         ec_char_array_to_mask(participants, ec->nodes));
+        } else {
+            gf_msg_debug(
+                this->name, 0,
+                "%s: name heal "
+                "failed. ret = %d, subvolumes up = %s",
+                loc->path, ret,
+                ec_bin(up_bricks, sizeof(up_bricks), ec->xl_up, ec->nodes));
+        }
+    }
+
+    /* Mount triggers heal only when it detects that it must need heal, shd
+     * triggers heals periodically which need not be thorough*/
+    if (ec->shd.iamshd && (ret <= 0)) {
+        ec_heal_inspect(frame, ec, loc->inode, up_subvols, _gf_false, _gf_false,
+                        &need_heal);
+
+        if (need_heal == EC_HEAL_PURGE_INDEX) {
+            gf_msg(ec->xl->name, GF_LOG_INFO, 0, EC_MSG_HEAL_FAIL,
+                   "Index entry needs to be purged for: %s ",
+                   uuid_utoa(loc->gfid));
+            /* We need to send zero-xattrop so that stale index entry could be
+             * removed. We need not take lock on this entry to do so as
+             * xattrop on a brick is atomic. */
+            ec_heal_purge_stale_index(frame, ec, loc->inode);
+            goto out;
+        } else if (need_heal == EC_HEAL_NONEED) {
+            gf_msg(ec->xl->name, GF_LOG_DEBUG, 0, EC_MSG_HEAL_FAIL,
+                   "Heal is not required for : %s ", uuid_utoa(loc->gfid));
+            goto out;
+        }
+    }
+
+    sources = alloca0(ec->nodes);
+    healed_sinks = alloca0(ec->nodes);
+    if (IA_ISREG(loc->inode->ia_type)) {
+        ret = ec_heal_data(frame, ec, blocking, loc->inode, sources,
+                           healed_sinks);
+    } else if (IA_ISDIR(loc->inode->ia_type) && !partial) {
+        ret = ec_heal_entry(frame, ec, loc->inode, sources, healed_sinks,
+                            &pending);
+    } else {
+        /* Nothing to heal at this level: every participant counts as both
+         * source and healed sink, leaving the metadata result to decide. */
+        ret = 0;
+        memcpy(sources, participants, ec->nodes);
+        memcpy(healed_sinks, participants, ec->nodes);
+    }
+
+    if (ret == 0) {
+        good = ec_char_array_to_mask(sources, ec->nodes);
+        bad = ec_char_array_to_mask(healed_sinks, ec->nodes);
+    } else {
+        op_ret = -1;
+        op_errno = -ret;
+    }
+    msources = alloca0(ec->nodes);
+    mhealed_sinks = alloca0(ec->nodes);
+    ret = ec_heal_metadata(frame, ec, loc->inode, msources, mhealed_sinks);
+    if (ret == 0) {
+        mgood = ec_char_array_to_mask(msources, ec->nodes);
+        mbad = ec_char_array_to_mask(mhealed_sinks, ec->nodes);
+    } else {
+        op_ret = -1;
+        op_errno = -ret;
+    }
+
+out:
+    ec_reset_entry_healing(fop);
+    if (fop->cbks.heal) {
+        /* Only subvolumes good (or healed) for both passes are reported. */
+        fop->cbks.heal(fop->req_frame, fop->data, fop->xl, op_ret, op_errno,
+                       ec_char_array_to_mask(participants, ec->nodes),
+                       mgood & good, mbad & bad, pending, NULL);
+    }
+    if (frame)
+        STACK_DESTROY(frame->root);
+
+    return;
+}
+
+/* Synctask body: unpack the heal fop passed as @opaque and run ec_heal_do()
+ * on its first location. Always returns 0. */
+int
+ec_synctask_heal_wrap(void *opaque)
+{
+    ec_fop_data_t *heal_fop = (ec_fop_data_t *)opaque;
+    loc_t *target = &heal_fop->loc[0];
+
+    ec_heal_do(heal_fop->xl, heal_fop, target, heal_fop->int32);
+
+    return 0;
+}
+
+/* Synctask completion callback: release the heal fop passed as @opaque, if
+ * any. @ret and @heal are not used. Always returns 0. */
+int
+ec_heal_done(int ret, call_frame_t *heal, void *opaque)
+{
+    ec_fop_data_t *heal_fop = opaque;
+
+    if (heal_fop != NULL) {
+        ec_fop_data_release(heal_fop);
+    }
+
+    return 0;
+}
+
+/*
+ * Take the first fop off ec->heal_waiting, move it to ec->healing and
+ * account for it in the waiter/healer counters.
+ *
+ * Returns the dequeued fop, or NULL (after a debug log) when nothing is
+ * waiting or when background_heals is positive and at least that many
+ * healers are already counted.
+ *
+ * NOTE(review): callers in this file invoke it with ec->lock held; the
+ * function itself takes no lock.
+ */
+ec_fop_data_t *
+__ec_dequeue_heals(ec_t *ec)
+{
+    ec_fop_data_t *next = NULL;
+    gf_boolean_t limit_hit = _gf_false;
+
+    if ((ec->background_heals > 0) && (ec->healers >= ec->background_heals)) {
+        limit_hit = _gf_true;
+    }
+
+    if (list_empty(&ec->heal_waiting) || limit_hit) {
+        gf_msg_debug(ec->xl->name, 0, "Num healers: %d, Num Waiters: %d",
+                     ec->healers, ec->heal_waiters);
+        return NULL;
+    }
+
+    next = list_entry(ec->heal_waiting.next, ec_fop_data_t, healer);
+
+    /* waiting -> healing */
+    list_del_init(&next->healer);
+    list_add(&next->healer, &ec->healing);
+    ec->heal_waiters--;
+    ec->healers++;
+
+    return next;
+}
+
+void
+ec_heal_fail(ec_t *ec, ec_fop_data_t *fop)
+{
+    if (fop->cbks.heal) {
+        fop->cbks.heal(fop->req_frame, fop->data, ec->xl, -1, fop->error, 0, 0,
+                       0, 0, NULL);
+    }
+    ec_fop_data_release(fop);
+}
+
+/*
+ * Start a synctask that runs ec_synctask_heal_wrap() on @fop and
+ * ec_heal_done() on completion. The frame handed to synctask_new() is owned
+ * by frame->root, runs as uid/gid 0 and is tagged with the self-heal daemon
+ * pid. If the frame or the task cannot be created the fop is failed with
+ * ENOMEM. The local frame is destroyed before returning in every case.
+ */
+void
+ec_launch_heal(ec_t *ec, ec_fop_data_t *fop)
+{
+    call_frame_t *heal_frame = NULL;
+    int rc = -1;
+
+    heal_frame = create_frame(ec->xl, ec->xl->ctx->pool);
+    if (heal_frame != NULL) {
+        ec_owner_set(heal_frame, heal_frame->root);
+        /* Heal runs as root ... */
+        heal_frame->root->uid = 0;
+        heal_frame->root->gid = 0;
+        /* ... and its fops are marked as internal. */
+        heal_frame->root->pid = GF_CLIENT_PID_SELF_HEALD;
+
+        rc = synctask_new(ec->xl->ctx->env, ec_synctask_heal_wrap,
+                          ec_heal_done, heal_frame, fop);
+    }
+
+    if (rc < 0) {
+        ec_fop_set_error(fop, ENOMEM);
+        ec_heal_fail(ec, fop);
+    }
+
+    if (heal_frame != NULL) {
+        STACK_DESTROY(heal_frame->root);
+    }
+}
+
+/*
+ * Called for a heal fop that has finished: take it off the healer list,
+ * give its healer slot back and pick the next waiting heal, if any.
+ *
+ * While the xlator is shutting down, every heal that gets dequeued is failed
+ * with ENOTCONN right here instead of being launched. Otherwise the dequeued
+ * heal (if any) is launched after ec->lock has been dropped.
+ *
+ * Does nothing when @fop is not linked on any healer list.
+ */
+void
+ec_handle_healers_done(ec_fop_data_t *fop)
+{
+    ec_t *ec = fop->xl->private;
+    ec_fop_data_t *heal_fop = NULL;
+
+    if (list_empty(&fop->healer))
+        return;
+
+    LOCK(&ec->lock);
+
+    list_del_init(&fop->healer);
+
+    do {
+        /* First pass: the slot of the finished fop. Later passes: the slot
+         * __ec_dequeue_heals() counted for the heal that was just failed. */
+        ec->healers--;
+        heal_fop = __ec_dequeue_heals(ec);
+
+        if ((heal_fop != NULL) && ec->shutdown) {
+            /* This will prevent ec_handle_healers_done() to be
+             * called recursively. That would be problematic if
+             * the queue is too big. */
+            list_del_init(&heal_fop->healer);
+
+            UNLOCK(&ec->lock);
+
+            /* The error must be recorded on the heal being failed:
+             * ec_heal_fail() reports heal_fop->error to the callback. */
+            ec_fop_set_error(heal_fop, ENOTCONN);
+            ec_heal_fail(ec, heal_fop);
+
+            LOCK(&ec->lock);
+        }
+    } while ((heal_fop != NULL) && ec->shutdown);
+
+    UNLOCK(&ec->lock);
+
+    if (heal_fop)
+        ec_launch_heal(ec, heal_fop);
+}
+
+/*
+ * Tell whether a heal is currently registered on the inode of the fop's
+ * first location.
+ *
+ * Reads ctx->heal_count under the inode lock. Returns _gf_true when the
+ * count is non-zero, and _gf_false when it is zero or the inode has no ec
+ * context.
+ */
+gf_boolean_t
+ec_is_entry_healing(ec_fop_data_t *fop)
+{
+    ec_inode_t *ctx = NULL;
+    int32_t heal_count = 0;
+    loc_t *loc = NULL;
+
+    loc = &fop->loc[0];
+
+    LOCK(&loc->inode->lock);
+    {
+        ctx = __ec_inode_get(loc->inode, fop->xl);
+        if (ctx) {
+            heal_count = ctx->heal_count;
+        }
+    }
+    UNLOCK(&loc->inode->lock);
+    GF_ASSERT(heal_count >= 0);
+    /* Normalize: the raw counter is not a valid gf_boolean_t value once it
+     * exceeds 1. Truthiness is the same as before. */
+    return (heal_count != 0) ? _gf_true : _gf_false;
+}
+
+/*
+ * Decide whether the heal described by @fop is launched now, queued, or
+ * rejected.
+ *
+ * Heals with a request frame (fop->req_frame != NULL) skip the queue: they
+ * are marked as entry-healing and launched directly.
+ *
+ * Heals without a request frame go through the wait queue under ec->lock:
+ *  - if background heals are enabled and there is still room
+ *    (heal_wait_qlen + background_heals > heal_waiters + healers), the fop
+ *    is appended to ec->heal_waiting unless a heal is already registered on
+ *    its inode, in which case it is only remembered for release;
+ *    then whatever __ec_dequeue_heals() returns (possibly NULL) is launched;
+ *  - otherwise the heal is failed with EBUSY.
+ */
+void
+ec_heal_throttle(xlator_t *this, ec_fop_data_t *fop)
+{
+    gf_boolean_t can_heal = _gf_true;
+    ec_t *ec = this->private;
+    ec_fop_data_t *fop_rel = NULL;
+
+    if (fop->req_frame == NULL) {
+        LOCK(&ec->lock);
+        {
+            if ((ec->background_heals > 0) &&
+                (ec->heal_wait_qlen + ec->background_heals) >
+                    (ec->heal_waiters + ec->healers)) {
+                if (!ec_is_entry_healing(fop)) {
+                    list_add_tail(&fop->healer, &ec->heal_waiting);
+                    ec->heal_waiters++;
+                    ec_set_entry_healing(fop);
+                } else {
+                    /* Already being healed: release this request once the
+                     * lock has been dropped. */
+                    fop_rel = fop;
+                }
+                /* From here on 'fop' is the next heal to launch, which may
+                 * be a different fop or NULL. */
+                fop = __ec_dequeue_heals(ec);
+            } else {
+                can_heal = _gf_false;
+            }
+        }
+        UNLOCK(&ec->lock);
+    }
+
+    if (can_heal) {
+        if (fop) {
+            if (fop->req_frame != NULL) {
+                ec_set_entry_healing(fop);
+            }
+            ec_launch_heal(ec, fop);
+        }
+    } else {
+        gf_msg_debug(this->name, 0,
+                     "Max number of heals are "
+                     "pending, background self-heal rejected");
+        ec_fop_set_error(fop, EBUSY);
+        ec_heal_fail(ec, fop);
+    }
+    if (fop_rel) {
+        ec_heal_done(0, NULL, fop_rel);
+    }
+}
+
+/*
+ * Entry point of the HEAL fop.
+ *
+ * Validates the arguments, allocates an EC_FOP_HEAL fop carrying a copy of
+ * @loc, the @partial flag and a reference to @xdata, and hands it to
+ * ec_heal_throttle().
+ *
+ * On failure @func (if set) is called with op_ret -1 and EINVAL for invalid
+ * arguments (NULL xlator or private data, missing inode, null gfid, frame
+ * that already has a local) or ENOMEM for allocation/copy failures.
+ */
+void
+ec_heal(call_frame_t *frame, xlator_t *this, uintptr_t target,
+        uint32_t fop_flags, fop_heal_cbk_t func, void *data, loc_t *loc,
+        int32_t partial, dict_t *xdata)
+{
+    ec_fop_data_t *heal_fop = NULL;
+    ec_cbk_t cbks = {.heal = func};
+    int32_t op_errno = EINVAL;
+
+    gf_msg_trace("ec", 0, "EC(HEAL) %p", frame);
+
+    VALIDATE_OR_GOTO(this, fail);
+    GF_VALIDATE_OR_GOTO(this->name, this->private, fail);
+
+    if ((loc == NULL) || (loc->inode == NULL) ||
+        gf_uuid_is_null(loc->inode->gfid)) {
+        goto fail;
+    }
+
+    if ((frame != NULL) && (frame->local != NULL)) {
+        goto fail;
+    }
+
+    heal_fop = ec_fop_data_allocate(frame, this, EC_FOP_HEAL, 0, target,
+                                    fop_flags, NULL, NULL, cbks, data);
+
+    /* Argument checks are over: whatever fails from now on is reported as
+     * an allocation failure. */
+    op_errno = ENOMEM;
+
+    if (heal_fop == NULL) {
+        goto fail;
+    }
+
+    heal_fop->int32 = partial;
+
+    /* loc was verified to be non-NULL above. */
+    if (loc_copy(&heal_fop->loc[0], loc) != 0) {
+        goto fail;
+    }
+
+    if (xdata != NULL) {
+        heal_fop->xdata = dict_ref(xdata);
+    }
+
+    ec_heal_throttle(this, heal_fop);
+
+    return;
+
+fail:
+    if (heal_fop != NULL) {
+        ec_fop_data_release(heal_fop);
+    }
+    if (func != NULL) {
+        func(frame, data, this, -1, op_errno, 0, 0, 0, 0, NULL);
+    }
+}
+
+/*
+ * Completion callback of the replace-brick heal task. Decrements
+ * ec->async_fop_count; when it reaches zero, checks under ec->lock whether
+ * this was the last fop and, if so, calls ec_pending_fops_completed().
+ * @heal is not used. Always returns 0.
+ */
+int
+ec_replace_heal_done(int ret, call_frame_t *heal, void *opaque)
+{
+    ec_t *ec = (ec_t *)opaque;
+    gf_boolean_t is_last = _gf_false;
+
+    if (GF_ATOMIC_DEC(ec->async_fop_count) == 0) {
+        LOCK(&ec->lock);
+        is_last = __ec_is_last_fop(ec);
+        UNLOCK(&ec->lock);
+    }
+
+    gf_msg_debug(ec->xl->name, 0, "getxattr on bricks is done ret %d", ret);
+
+    if (!is_last) {
+        return 0;
+    }
+
+    ec_pending_fops_completed(ec);
+
+    return 0;
+}
+
+/*
+ * Issue a getxattr for EC_XATTR_HEAL on @inode through the ec xlator, then
+ * wake the index healers regardless of the outcome.
+ *
+ * Returns the result of syncop_getxattr().
+ */
+int32_t
+ec_replace_heal(ec_t *ec, inode_t *inode)
+{
+    loc_t target = {0};
+    int rc = 0;
+
+    gf_uuid_copy(target.gfid, inode->gfid);
+    target.inode = inode_ref(inode);
+
+    rc = syncop_getxattr(ec->xl, &target, NULL, EC_XATTR_HEAL, NULL, NULL);
+    if (rc < 0) {
+        gf_msg_debug(ec->xl->name, 0, "Heal failed for replace brick ret = %d",
+                     rc);
+    }
+
+    /* Checking the root inode may have started a self-heal on it (after a
+     * replace brick, or for any other reason), and the index may already
+     * contain damaged files even when healing the root failed. Either way
+     * the index healers must be woken up so that the remaining dirty
+     * entries keep being healed. */
+    ec_shd_index_healer_wake(ec);
+
+    loc_wipe(&target);
+
+    return rc;
+}
+
+/*
+ * Synctask body of the replace-brick heal: run ec_replace_heal() on the
+ * root inode of the xlator's inode table.
+ *
+ * Returns -1 without doing anything when the xlator has no inode table or
+ * its cleanup has started; otherwise the result of ec_replace_heal().
+ */
+int32_t
+ec_replace_brick_heal_wrap(void *opaque)
+{
+    ec_t *ec = (ec_t *)opaque;
+    inode_table_t *table = ec->xl->itable;
+
+    if (table == NULL) {
+        return -1;
+    }
+
+    if (xlator_is_cleanup_starting(ec->xl)) {
+        return -1;
+    }
+
+    return ec_replace_heal(ec, table->root);
+}
+
+/*
+ * Spawn the replace-brick heal as a synctask. If the task cannot be created
+ * the completion callback is invoked directly with -1.
+ *
+ * Returns the result of synctask_new().
+ */
+int32_t
+ec_launch_replace_heal(ec_t *ec)
+{
+    int rc = synctask_new(ec->xl->ctx->env, ec_replace_brick_heal_wrap,
+                          ec_replace_heal_done, NULL, ec);
+
+    if (rc >= 0) {
+        return rc;
+    }
+
+    gf_msg_debug(ec->xl->name, 0, "Heal failed for replace brick ret = %d",
+                 rc);
+    ec_replace_heal_done(-1, NULL, ec);
+
+    return rc;
+}
+
+/*
+ * Build a response dictionary containing "heal-info" = @status.
+ *
+ * On success *dict_rsp receives the new dictionary and 0 is returned.
+ * If the key cannot be set, *dict_rsp is set to NULL and the error from
+ * dict_set_str() is returned. If the dictionary cannot be allocated,
+ * -ENOMEM is returned and *dict_rsp is left untouched.
+ */
+int32_t
+ec_set_heal_info(dict_t **dict_rsp, char *status)
+{
+    dict_t *info = dict_new();
+    int rc = 0;
+
+    if (info == NULL) {
+        return -ENOMEM;
+    }
+
+    rc = dict_set_str(info, "heal-info", status);
+    if (rc != 0) {
+        gf_msg(THIS->name, GF_LOG_WARNING, -rc, EC_MSG_HEAL_FAIL,
+               "Failed to set heal-info key to "
+               "%s",
+               status);
+        dict_unref(info);
+        info = NULL;
+    }
+
+    *dict_rsp = info;
+
+    return rc;
+}
+
+/*
+ * Derive the heal verdict from the per-subvolume dirty counters and
+ * versions.
+ *
+ * @dirty, @versions: arrays with one entry per subvolume (ec->nodes).
+ * @sources:          per-subvolume flags; the number of set entries is the
+ *                    return value.
+ * @self_locked:      the caller itself holds the inode lock.
+ * @lock_count:       number of locks reported on the inode.
+ * @need_heal:        output verdict.
+ *
+ * Verdict:
+ *  - not every subvolume is a source                  -> EC_HEAL_MUST
+ *  - all are sources, self-locked or no lock present:
+ *      any dirty flag or version mismatch             -> EC_HEAL_MUST
+ *      everything clean                               -> EC_HEAL_PURGE_INDEX
+ *  - all are sources, somebody else holds a lock:
+ *      any dirty counter > 1                          -> EC_HEAL_MUST
+ *      dirty or version differs from subvolume 0      -> EC_HEAL_MAYBE
+ *      otherwise                                      -> EC_HEAL_NONEED
+ *
+ * Returns the number of sources.
+ */
+static int32_t
+_need_heal_calculate(ec_t *ec, uint64_t *dirty, unsigned char *sources,
+                     gf_boolean_t self_locked, int32_t lock_count,
+                     ec_heal_need_t *need_heal, uint64_t *versions)
+{
+    int i = 0;
+    int source_count = 0;
+
+    source_count = EC_COUNT(sources, ec->nodes);
+    if (source_count == ec->nodes) {
+        *need_heal = EC_HEAL_NONEED;
+        if (self_locked || lock_count == 0) {
+            for (i = 0; i < ec->nodes; i++) {
+                if (dirty[i] || (versions[i] != versions[0])) {
+                    *need_heal = EC_HEAL_MUST;
+                    goto out;
+                }
+            }
+            /* If lock count is 0, all dirty flags are 0 and all the
+             * versions are matching then why are we here. It looks
+             * like something went wrong while removing the index entries
+             * after completing a successful heal or fop. In this case
+             * we need to remove this index entry to avoid triggering heal
+             * in a loop and causing lookups again and again*/
+            *need_heal = EC_HEAL_PURGE_INDEX;
+        } else {
+            for (i = 0; i < ec->nodes; i++) {
+                /* Since each lock can only increment the dirty
+                 * count once, if dirty is > 1 it means that
+                 * another operation has left the dirty count
+                 * set and this indicates a problem in the
+                 * inode.*/
+                if (dirty[i] > 1) {
+                    *need_heal = EC_HEAL_MUST;
+                    goto out;
+                }
+                /* No early exit here: a later subvolume may still turn the
+                 * verdict into EC_HEAL_MUST. */
+                if (dirty[i] != dirty[0] || (versions[i] != versions[0])) {
+                    *need_heal = EC_HEAL_MAYBE;
+                }
+            }
+        }
+    } else {
+        *need_heal = EC_HEAL_MUST;
+    }
+
+out:
+    return source_count;
+}
+
+/*
+ * Work out whether the metadata of an inode needs heal, based on the lookup
+ * @replies. @inode and @thorough are not used here.
+ *
+ * Returns the negative error of ec_heal_metadata_find_direction() (other
+ * than -EIO, which is tolerated), otherwise the source count computed by
+ * _need_heal_calculate(), which also sets *need_heal.
+ */
+static int32_t
+ec_need_metadata_heal(ec_t *ec, inode_t *inode, default_args_cbk_t *replies,
+                      int32_t lock_count, gf_boolean_t self_locked,
+                      gf_boolean_t thorough, ec_heal_need_t *need_heal)
+{
+    uint64_t *versions = NULL;
+    uint64_t *dirty = NULL;
+    unsigned char *sinks = NULL;
+    unsigned char *sources = NULL;
+    int rc = 0;
+
+    /* Scratch arrays, one slot per subvolume. */
+    versions = alloca0(ec->nodes * sizeof(*versions));
+    dirty = alloca0(ec->nodes * sizeof(*dirty));
+    sinks = alloca0(ec->nodes);
+    sources = alloca0(ec->nodes);
+
+    rc = ec_heal_metadata_find_direction(ec, replies, versions, dirty, sources,
+                                         sinks);
+    if ((rc >= 0) || (rc == -EIO)) {
+        rc = _need_heal_calculate(ec, dirty, sources, self_locked, lock_count,
+                                  need_heal, versions);
+    }
+
+    return rc;
+}
+
+/*
+ * Work out whether the data of a regular file needs heal, based on the
+ * lookup @replies. @inode is not used here.
+ *
+ * Returns the negative error of ec_heal_data_find_direction() (other than
+ * -EIO, which is tolerated), otherwise the source count computed by
+ * _need_heal_calculate(), which also sets *need_heal.
+ */
+static int32_t
+ec_need_data_heal(ec_t *ec, inode_t *inode, default_args_cbk_t *replies,
+                  int32_t lock_count, gf_boolean_t self_locked,
+                  gf_boolean_t thorough, ec_heal_need_t *need_heal)
+{
+    uint64_t *versions = NULL;
+    uint64_t *sizes = NULL;
+    uint64_t *dirty = NULL;
+    unsigned char *sinks = NULL;
+    unsigned char *sources = NULL;
+    gf_boolean_t check_ondisk = _gf_false;
+    int rc = 0;
+
+    /* Scratch arrays, one slot per subvolume. */
+    versions = alloca0(ec->nodes * sizeof(*versions));
+    sizes = alloca0(ec->nodes * sizeof(*sizes));
+    dirty = alloca0(ec->nodes * sizeof(*dirty));
+    sinks = alloca0(ec->nodes);
+    sources = alloca0(ec->nodes);
+
+    /* While something like dd is writing and heal info is requested, the
+     * on-disk sizes can easily differ although nothing is wrong, so the
+     * on-disk size check is skipped then. It is only performed when the
+     * file is self-locked or the caller asked for a thorough check. */
+    check_ondisk = self_locked || thorough;
+
+    rc = ec_heal_data_find_direction(ec, replies, versions, dirty, sizes,
+                                     sources, sinks, check_ondisk,
+                                     EC_COMBINE_XDATA);
+    if ((rc >= 0) || (rc == -EIO)) {
+        rc = _need_heal_calculate(ec, dirty, sources, self_locked, lock_count,
+                                  need_heal, versions);
+    }
+
+    return rc;
+}
+
+/*
+ * Work out whether the entries of a directory need heal, based on the
+ * lookup @replies. @inode and @thorough are not used here.
+ *
+ * Returns the negative error of ec_heal_entry_find_direction() (other than
+ * -EIO, which is tolerated), otherwise the source count computed by
+ * _need_heal_calculate(), which also sets *need_heal.
+ */
+static int32_t
+ec_need_entry_heal(ec_t *ec, inode_t *inode, default_args_cbk_t *replies,
+                   int32_t lock_count, gf_boolean_t self_locked,
+                   gf_boolean_t thorough, ec_heal_need_t *need_heal)
+{
+    uint64_t *versions = NULL;
+    uint64_t *dirty = NULL;
+    unsigned char *sinks = NULL;
+    unsigned char *sources = NULL;
+    int rc = 0;
+
+    /* Scratch arrays, one slot per subvolume. */
+    versions = alloca0(ec->nodes * sizeof(*versions));
+    dirty = alloca0(ec->nodes * sizeof(*dirty));
+    sinks = alloca0(ec->nodes);
+    sources = alloca0(ec->nodes);
+
+    rc = ec_heal_entry_find_direction(ec, replies, versions, dirty, sources,
+                                      sinks);
+    if ((rc >= 0) || (rc == -EIO)) {
+        rc = _need_heal_calculate(ec, dirty, sources, self_locked, lock_count,
+                                  need_heal, versions);
+    }
+
+    return rc;
+}
+
+/*
+ * Combined heal check: metadata first, then data (regular files) or
+ * entries (directories). The second check is skipped when the metadata
+ * check failed or already concluded EC_HEAL_MUST. Other inode types only
+ * get the metadata check.
+ *
+ * Returns the result of the last check that ran.
+ */
+static int32_t
+ec_need_heal(ec_t *ec, inode_t *inode, default_args_cbk_t *replies,
+             int32_t lock_count, gf_boolean_t self_locked,
+             gf_boolean_t thorough, ec_heal_need_t *need_heal)
+{
+    int rc = ec_need_metadata_heal(ec, inode, replies, lock_count, self_locked,
+                                   thorough, need_heal);
+
+    if ((rc < 0) || (*need_heal == EC_HEAL_MUST)) {
+        return rc;
+    }
+
+    switch (inode->ia_type) {
+        case IA_IFREG:
+            rc = ec_need_data_heal(ec, inode, replies, lock_count, self_locked,
+                                   thorough, need_heal);
+            break;
+        case IA_IFDIR:
+            rc = ec_need_entry_heal(ec, inode, replies, lock_count,
+                                    self_locked, thorough, need_heal);
+            break;
+        default:
+            break;
+    }
+
+    return rc;
+}
+
+/*
+ * Look up @inode on the subvolumes flagged in @locked_on and decide whether
+ * it needs heal.
+ *
+ * @self_locked: the caller already holds the inode lock; the lock count is
+ *               then not requested from the bricks nor read from the
+ *               replies.
+ * @thorough:    forwarded to ec_need_heal().
+ * @need_heal:   output verdict.
+ *
+ * The lookup asks for EC_XATTR_VERSION, EC_XATTR_DIRTY and EC_XATTR_SIZE
+ * (and GLUSTERFS_INODELK_DOM_COUNT for this xlator's name when not
+ * self-locked).
+ *
+ * Returns -ENOMEM if the request dictionary cannot be built, ec->nodes with
+ * *need_heal = EC_HEAL_MUST if the lookup did not succeed on every
+ * subvolume, otherwise the result of ec_need_heal().
+ */
+int32_t
+ec_heal_inspect(call_frame_t *frame, ec_t *ec, inode_t *inode,
+                unsigned char *locked_on, gf_boolean_t self_locked,
+                gf_boolean_t thorough, ec_heal_need_t *need_heal)
+{
+    loc_t loc = {0};
+    int i = 0;
+    int ret = 0;
+    dict_t *xdata = NULL;
+    uint64_t zero_array[2] = {0};
+    uint64_t zero_value = 0;
+    unsigned char *output = NULL;
+    default_args_cbk_t *replies = NULL;
+    int32_t lock_count = 0;
+
+    EC_REPLIES_ALLOC(replies, ec->nodes);
+    output = alloca0(ec->nodes);
+
+    loc.inode = inode_ref(inode);
+    gf_uuid_copy(loc.gfid, inode->gfid);
+
+    /* The zero buffers live on this stack frame; xdata is unreferenced
+     * before returning. */
+    xdata = dict_new();
+    if (!xdata ||
+        dict_set_static_bin(xdata, EC_XATTR_VERSION, zero_array,
+                            sizeof(zero_array)) ||
+        dict_set_static_bin(xdata, EC_XATTR_DIRTY, zero_array,
+                            sizeof(zero_array)) ||
+        dict_set_static_bin(xdata, EC_XATTR_SIZE, &zero_value,
+                            sizeof(zero_value))) {
+        ret = -ENOMEM;
+        goto out;
+    }
+
+    if (!self_locked) {
+        ret = dict_set_str(xdata, GLUSTERFS_INODELK_DOM_COUNT, ec->xl->name);
+        if (ret) {
+            ret = -ENOMEM;
+            goto out;
+        }
+    }
+
+    ret = cluster_lookup(ec->xl_list, locked_on, ec->nodes, replies, output,
+                         frame, ec->xl, &loc, xdata);
+
+    /* Anything short of a reply from every subvolume means heal is needed;
+     * ec->nodes is returned so callers see a non-negative value. */
+    if (ret != ec->nodes) {
+        ret = ec->nodes;
+        *need_heal = EC_HEAL_MUST;
+        goto out;
+    }
+
+    if (self_locked)
+        goto need_heal;
+
+    /* Stop at the first reply reporting a positive inodelk count; if none
+     * does, lock_count holds whatever the last successful read stored
+     * (initially 0). */
+    for (i = 0; i < ec->nodes; i++) {
+        if (!output[i] || !replies[i].xdata) {
+            continue;
+        }
+        if ((dict_get_int32(replies[i].xdata, GLUSTERFS_INODELK_COUNT,
+                            &lock_count) == 0) &&
+            lock_count > 0) {
+            break;
+        }
+    }
+need_heal:
+    ret = ec_need_heal(ec, inode, replies, lock_count, self_locked, thorough,
+                       need_heal);
+out:
+    cluster_replies_wipe(replies, ec->nodes);
+    loc_wipe(&loc);
+    if (xdata) {
+        dict_unref(xdata);
+    }
+    return ret;
+}
+
+/*
+ * Same inspection as ec_heal_inspect(), but performed while holding a
+ * full-range inodelk (domain ec->xl->name) on the inode, with both the
+ * self_locked and thorough flags set.
+ *
+ * If the lock cannot be taken on every subvolume, *need_heal is set to
+ * EC_HEAL_MUST and the lock result is returned; otherwise the result of
+ * ec_heal_inspect() is returned. The lock is released in both cases.
+ */
+int32_t
+ec_heal_locked_inspect(call_frame_t *frame, ec_t *ec, inode_t *inode,
+                       ec_heal_need_t *need_heal)
+{
+    unsigned char *locked_on = NULL;
+    unsigned char *up_subvols = NULL;
+    unsigned char *output = NULL;
+    default_args_cbk_t *replies = NULL;
+    int ret = 0;
+
+    EC_REPLIES_ALLOC(replies, ec->nodes);
+    locked_on = alloca0(ec->nodes);
+    output = alloca0(ec->nodes);
+    up_subvols = alloca0(ec->nodes);
+    ec_mask_to_char_array(ec->xl_up, up_subvols, ec->nodes);
+
+    ret = cluster_inodelk(ec->xl_list, up_subvols, ec->nodes, replies,
+                          locked_on, frame, ec->xl, ec->xl->name, inode, 0, 0);
+    if (ret != ec->nodes) {
+        *need_heal = EC_HEAL_MUST;
+        goto unlock;
+    }
+    ret = ec_heal_inspect(frame, ec, inode, locked_on, _gf_true, _gf_true,
+                          need_heal);
+unlock:
+    /* Only the subvolumes recorded in locked_on are unlocked. */
+    cluster_uninodelk(ec->xl_list, locked_on, ec->nodes, replies, output, frame,
+                      ec->xl, ec->xl->name, inode, 0, 0);
+    cluster_replies_wipe(replies, ec->nodes);
+    return ret;
+}
+
+/*
+ * Produce the "heal-info" answer ("heal" or "no-heal") for @entry_loc in a
+ * new dictionary stored in *dict_rsp.
+ *
+ * Decision path:
+ *  1. Some subvolume is down                      -> "heal".
+ *  2. Lock-free inspection gives a definite answer (all subvolumes replied
+ *     and the verdict is not EC_HEAL_MAYBE)       -> use that verdict.
+ *  3. Otherwise repeat the inspection under lock and use its verdict.
+ * Only EC_HEAL_MUST maps to "heal"; every other verdict maps to "no-heal".
+ *
+ * Returns the result of ec_set_heal_info(), or a negative value when
+ * validation, frame creation, loc copy, inode lookup or the locked
+ * inspection fails (the initial value -ENOMEM is kept for the first three).
+ */
+int32_t
+ec_get_heal_info(xlator_t *this, loc_t *entry_loc, dict_t **dict_rsp)
+{
+    int ret = -ENOMEM;
+    ec_heal_need_t need_heal = EC_HEAL_NONEED;
+    call_frame_t *frame = NULL;
+    ec_t *ec = NULL;
+    unsigned char *up_subvols = NULL;
+    loc_t loc = {
+        0,
+    };
+
+    VALIDATE_OR_GOTO(this, out);
+    GF_VALIDATE_OR_GOTO(this->name, entry_loc, out);
+
+    ec = this->private;
+    up_subvols = alloca0(ec->nodes);
+    ec_mask_to_char_array(ec->xl_up, up_subvols, ec->nodes);
+
+    /* With a subvolume down there is no need to look any further. */
+    if (EC_COUNT(up_subvols, ec->nodes) != ec->nodes) {
+        need_heal = EC_HEAL_MUST;
+        goto set_heal;
+    }
+    frame = create_frame(this, this->ctx->pool);
+    if (!frame) {
+        goto out;
+    }
+    /* Same identity as the heal frames: root, self-heal daemon pid. */
+    ec_owner_set(frame, frame->root);
+    frame->root->uid = 0;
+    frame->root->gid = 0;
+    frame->root->pid = GF_CLIENT_PID_SELF_HEALD;
+
+    if (loc_copy(&loc, entry_loc) != 0) {
+        gf_msg(this->name, GF_LOG_ERROR, ENOMEM, EC_MSG_LOC_COPY_FAIL,
+               "Failed to copy a location.");
+        goto out;
+    }
+    if (!loc.inode) {
+        ret = syncop_inode_find(this, this, loc.gfid, &loc.inode, NULL, NULL);
+        if (ret < 0)
+            goto out;
+    }
+
+    /* First attempt without taking any lock. */
+    ret = ec_heal_inspect(frame, ec, loc.inode, up_subvols, _gf_false,
+                          _gf_false, &need_heal);
+    if (ret == ec->nodes && need_heal != EC_HEAL_MAYBE) {
+        goto set_heal;
+    }
+    /* Inconclusive: start over and inspect again under lock. */
+    need_heal = EC_HEAL_NONEED;
+    ret = ec_heal_locked_inspect(frame, ec, loc.inode, &need_heal);
+    if (ret < 0)
+        goto out;
+set_heal:
+    if (need_heal == EC_HEAL_MUST) {
+        ret = ec_set_heal_info(dict_rsp, "heal");
+    } else {
+        ret = ec_set_heal_info(dict_rsp, "no-heal");
+    }
+out:
+    if (frame) {
+        STACK_DESTROY(frame->root);
+    }
+    loc_wipe(&loc);
+    return ret;
+}
diff --git a/xlators/cluster/ec/src/ec-heald.c b/xlators/cluster/ec/src/ec-heald.c
new file mode 100644
index 00000000000..5c1586bc9c5
--- /dev/null
+++ b/xlators/cluster/ec/src/ec-heald.c
@@ -0,0 +1,681 @@
+/*
+  Copyright (c) 2015 Red Hat, Inc. <http://www.redhat.com>
+  This file is part of GlusterFS.
+
+  This file is licensed to you under your choice of the GNU Lesser
+  General Public License, version 3 or any later version (LGPLv3 or
+  later), or the GNU General Public License, version 2 (GPLv2), in all
+  cases as published by the Free Software Foundation.
+*/
+
+#include <glusterfs/defaults.h>
+#include <glusterfs/compat-errno.h>
+#include "ec.h"
+#include "ec-messages.h"
+#include "ec-heald.h"
+#include "ec-mem-types.h"
+#include <glusterfs/syncop.h>
+#include <glusterfs/syncop-utils.h>
+#include "protocol-common.h"
+
+/* Address of the n-th entry of shd.index_healers in this xlator's private
+ * ec_t. */
+#define NTH_INDEX_HEALER(this, n)                                              \
+    (&((((ec_t *)this->private))->shd.index_healers[n]))
+/* Address of the n-th entry of shd.full_healers in this xlator's private
+ * ec_t. */
+#define NTH_FULL_HEALER(this, n)                                               \
+    (&((((ec_t *)this->private))->shd.full_healers[n]))
+
+/* Report whether child 'subvol' is local, as answered by
+ * syncop_is_subvol_local() when probed with the root inode. The return
+ * value of that call is ignored; if it leaves the flag untouched the
+ * result is _gf_false. */
+gf_boolean_t
+ec_shd_is_subvol_local(xlator_t *this, int subvol)
+{
+    ec_t *priv = this->private;
+    gf_boolean_t local = _gf_false;
+    loc_t rootloc = {
+        0,
+    };
+
+    /* The root inode is used without taking a reference, so the loc is
+     * not wiped afterwards. */
+    rootloc.inode = this->itable->root;
+    syncop_is_subvol_local(priv->xl_list[subvol], &rootloc, &local);
+
+    return local;
+}
+
+/* Return the name of the child xlator at index 'subvol', or NULL when the
+ * index is outside [0, ec->nodes). */
+char *
+ec_subvol_name(xlator_t *this, int subvol)
+{
+    ec_t *ec = NULL;
+
+    ec = this->private;
+    /* Children are indexed 0 .. nodes - 1 everywhere else in this file, so
+     * 'subvol == nodes' is already one past the last usable entry. */
+    if (subvol < 0 || subvol >= ec->nodes)
+        return NULL;
+
+    return ec->xl_list[subvol]->name;
+}
+
+/*
+ * Sleep until this healer is asked to run again or the wait times out.
+ *
+ * ec_shd_healer_wait() calls this with healer->mutex locked; the mutex is
+ * handed to pthread_cond_timedwait() below.
+ *
+ * Returns -1 (after clearing healer->running) when ec->shutdown is set.
+ * Otherwise returns the value healer->rerun had when the wait ended (0 when
+ * the wait timed out without a request) and resets healer->rerun to 0. The
+ * function only returns a non-negative value while the self-heal daemon is
+ * enabled and the ec xlator is up; otherwise it goes back to waiting.
+ */
+int
+__ec_shd_healer_wait(struct subvol_healer *healer)
+{
+    ec_t *ec = NULL;
+    struct timespec wait_till = {
+        0,
+    };
+    int ret = 0;
+
+    ec = healer->this->private;
+
+disabled_loop:
+    /* Absolute deadline: now plus the configured shd timeout (seconds). */
+    wait_till.tv_sec = gf_time() + ec->shd.timeout;
+
+    /* Keep waiting across wakeups that did not set 'rerun'; only a timeout
+     * ends the wait without a rerun request. */
+    while (!healer->rerun) {
+        ret = pthread_cond_timedwait(&healer->cond, &healer->mutex, &wait_till);
+        if (ret == ETIMEDOUT)
+            break;
+    }
+
+    if (ec->shutdown) {
+        healer->running = _gf_false;
+        return -1;
+    }
+
+    /* Consume the rerun request before deciding whether to go back to sleep. */
+    ret = healer->rerun;
+    healer->rerun = 0;
+
+    if (!ec->shd.enabled || !ec->up)
+        goto disabled_loop;
+
+    return ret;
+}
+
+/* Locked wrapper around __ec_shd_healer_wait(): takes healer->mutex for the
+ * duration of the wait and passes the result through unchanged. */
+int
+ec_shd_healer_wait(struct subvol_healer *healer)
+{
+    int status;
+
+    pthread_mutex_lock(&healer->mutex);
+    status = __ec_shd_healer_wait(healer);
+    pthread_mutex_unlock(&healer->mutex);
+
+    return status;
+}
+
+/*
+ * Look up the index directory inode of 'subvol'.
+ *
+ * The gfid of the index directory is read from the GF_XATTROP_INDEX_GFID
+ * xattr of the root and then resolved with syncop_inode_find(). '*inode' is
+ * reset to NULL first. Returns the result of the last call made, or
+ * -EINVAL when the getxattr succeeds without returning a dictionary.
+ */
+int
+ec_shd_index_inode(xlator_t *this, xlator_t *subvol, inode_t **inode)
+{
+    void *gfid = NULL;
+    dict_t *rsp = NULL;
+    loc_t root = {
+        0,
+    };
+    int err = 0;
+
+    *inode = NULL;
+    root.inode = inode_ref(this->itable->root);
+    gf_uuid_copy(root.gfid, root.inode->gfid);
+
+    err = syncop_getxattr(subvol, &root, &rsp, GF_XATTROP_INDEX_GFID, NULL,
+                          NULL);
+    if ((err >= 0) && (rsp == NULL)) {
+        /* A successful reply without a dictionary is unusable. */
+        err = -EINVAL;
+    }
+    if (err < 0) {
+        goto done;
+    }
+
+    err = dict_get_ptr(rsp, GF_XATTROP_INDEX_GFID, &gfid);
+    if (err == 0) {
+        gf_msg_debug(this->name, 0, "index-dir gfid for %s: %s", subvol->name,
+                     uuid_utoa(gfid));
+
+        err = syncop_inode_find(this, subvol, gfid, inode, NULL, NULL);
+    }
+
+done:
+    loc_wipe(&root);
+
+    if (rsp != NULL) {
+        dict_unref(rsp);
+    }
+
+    return err;
+}
+
+/* Unlink the entry 'name' under directory 'inode' on 'subvol' and return the
+ * result of syncop_unlink(). */
+int
+ec_shd_index_purge(xlator_t *subvol, inode_t *inode, char *name)
+{
+    int status;
+    loc_t entry = {
+        0,
+    };
+
+    entry.name = name;
+    entry.parent = inode_ref(inode);
+
+    status = syncop_unlink(subvol, &entry, NULL, NULL);
+    loc_wipe(&entry);
+
+    return status;
+}
+
+/*
+ * Decide from a heal status string whether the heal completed.
+ *
+ * The status has the form "Good: <binary>, Bad: <binary>". The heal is
+ * considered complete when the first '0' in the string appears after the
+ * last ':' (the one following "Bad"), i.e. no '0' shows up before it.
+ * A NULL status, or one lacking a '0' or a ':', is reported as not complete.
+ */
+static gf_boolean_t
+ec_is_heal_completed(char *status)
+{
+    char *first_zero;
+    char *last_colon;
+
+    if (status == NULL) {
+        return _gf_false;
+    }
+
+    first_zero = strchr(status, '0');
+    last_colon = strrchr(status, ':');
+
+    /* Malformed status: one of the two markers is missing. */
+    if ((first_zero == NULL) || (last_colon == NULL)) {
+        return _gf_false;
+    }
+
+    return (first_zero > last_colon) ? _gf_true : _gf_false;
+}
+
+/*
+ * Trigger a heal of 'loc' by requesting the EC_XATTR_HEAL xattr through the
+ * ec xlator itself, and update the shd statistics.
+ *
+ * 'attempted' is always incremented; 'completed' only when the returned
+ * status string passes ec_is_heal_completed(). For index heals (!full) of a
+ * directory, a positive EC_XATTR_HEAL_NEW count in the reply xdata flags the
+ * healer for another run. 'child' is not used. Returns the getxattr result.
+ */
+int
+ec_shd_selfheal(struct subvol_healer *healer, int child, loc_t *loc,
+                gf_boolean_t full)
+{
+    ec_t *ec = healer->this->private;
+    char *status = NULL;
+    dict_t *rsp = NULL;
+    dict_t *rsp_xdata = NULL;
+    uint32_t pending;
+    int32_t ret;
+
+    GF_ATOMIC_INC(ec->stats.shd.attempted);
+    ret = syncop_getxattr(healer->this, loc, &rsp, EC_XATTR_HEAL, NULL,
+                          &rsp_xdata);
+    if ((ret == 0) && (rsp != NULL) &&
+        (dict_get_str(rsp, EC_XATTR_HEAL, &status) == 0) &&
+        ec_is_heal_completed(status)) {
+        GF_ATOMIC_INC(ec->stats.shd.completed);
+    }
+
+    /* Healing a directory may have produced new index entries; if so,
+     * force a rerun of the index healer. */
+    if (!full && (loc->inode->ia_type == IA_IFDIR) && (rsp_xdata != NULL) &&
+        (dict_get_uint32(rsp_xdata, EC_XATTR_HEAL_NEW, &pending) == 0) &&
+        (pending > 0)) {
+        gf_msg_debug(healer->this->name, 0, "%d more entries to heal",
+                     pending);
+
+        healer->rerun = _gf_true;
+    }
+
+    if (rsp_xdata != NULL) {
+        dict_unref(rsp_xdata);
+    }
+
+    if (rsp != NULL) {
+        dict_unref(rsp);
+    }
+
+    return ret;
+}
+
+/*
+ * Per-entry callback of the index sweep (passed to syncop_mt_dir_scan() by
+ * ec_shd_index_sweep()); 'data' is the subvol_healer.
+ *
+ * Returns -ENOTCONN when no more than 'fragments' children are up, -EBUSY
+ * when the self-heal daemon is disabled, and 0 in every other case,
+ * including lookup failures.
+ */
+int
+ec_shd_index_heal(xlator_t *subvol, gf_dirent_t *entry, loc_t *parent,
+                  void *data)
+{
+    struct subvol_healer *healer = data;
+    ec_t *ec = NULL;
+    loc_t loc = {0};
+    int ret = 0;
+
+    ec = healer->this->private;
+    if (ec->xl_up_count <= ec->fragments) {
+        return -ENOTCONN;
+    }
+    if (!ec->shd.enabled)
+        return -EBUSY;
+
+    gf_msg_debug(healer->this->name, 0, "got entry: %s", entry->d_name);
+
+    /* Entries whose name does not parse as a gfid are silently skipped. */
+    ret = gf_uuid_parse(entry->d_name, loc.gfid);
+    if (ret)
+        return 0;
+
+    /* If this fails with ENOENT/ESTALE index is stale */
+    ret = syncop_gfid_to_path(healer->this->itable, subvol, loc.gfid,
+                              (char **)&loc.path);
+    if (ret < 0)
+        goto out;
+
+    ret = syncop_inode_find(healer->this, healer->this, loc.gfid, &loc.inode,
+                            NULL, NULL);
+    if (ret < 0)
+        goto out;
+
+    /* The heal result is not stored in 'ret', so it never triggers the
+     * purge below. */
+    ec_shd_selfheal(healer, healer->subvol, &loc, _gf_false);
+out:
+    /* Only a failed path/inode lookup with ENOENT or ESTALE removes the
+     * index entry. */
+    if (ret == -ENOENT || ret == -ESTALE) {
+        gf_msg(healer->this->name, GF_LOG_DEBUG, 0, EC_MSG_HEAL_FAIL,
+               "Purging index for gfid %s:", uuid_utoa(loc.gfid));
+        ec_shd_index_purge(subvol, parent->inode, entry->d_name);
+    }
+    loc_wipe(&loc);
+
+    return 0;
+}
+
+/*
+ * Run one index sweep for the healer's subvolume.
+ *
+ * Resolves the index directory of the subvolume, then scans it with
+ * syncop_mt_dir_scan(), calling ec_shd_index_heal() for each entry using
+ * the configured thread count and queue length. Thread cancellation is
+ * masked for the duration of the scan.
+ *
+ * Returns the scan result, the (negative) error from resolving the index
+ * directory, or -ENOMEM when the request dictionary cannot be prepared.
+ */
+int
+ec_shd_index_sweep(struct subvol_healer *healer)
+{
+    loc_t loc = {0};
+    ec_t *ec = NULL;
+    int ret = 0;
+    xlator_t *subvol = NULL;
+    dict_t *xdata = NULL;
+
+    ec = healer->this->private;
+    subvol = ec->xl_list[healer->subvol];
+
+    ret = ec_shd_index_inode(healer->this, subvol, &loc.inode);
+    if (ret < 0) {
+        /* The failure reason travels in 'ret' as a negative errno; the
+         * global errno is not a reliable source for it here. */
+        gf_msg(healer->this->name, GF_LOG_WARNING, -ret,
+               EC_MSG_INDEX_DIR_GET_FAIL, "unable to get index-dir on %s",
+               subvol->name);
+        goto out;
+    }
+
+    xdata = dict_new();
+    if (!xdata || dict_set_int32(xdata, "get-gfid-type", 1)) {
+        ret = -ENOMEM;
+        goto out;
+    }
+
+    _mask_cancellation();
+    ret = syncop_mt_dir_scan(NULL, subvol, &loc, GF_CLIENT_PID_SELF_HEALD,
+                             healer, ec_shd_index_heal, xdata,
+                             ec->shd.max_threads, ec->shd.wait_qlength);
+    _unmask_cancellation();
+out:
+    if (xdata)
+        dict_unref(xdata);
+    loc_wipe(&loc);
+
+    return ret;
+}
+
+/*
+ * Per-entry callback of the full sweep (passed to syncop_ftw() by
+ * ec_shd_full_sweep()); 'data' is the subvol_healer.
+ *
+ * Returns -ENOTCONN when cleanup has started or no more than 'fragments'
+ * children are up, -EBUSY when the self-heal daemon is disabled, a negative
+ * value when the path or inode lookup fails, and 0 otherwise (the heal
+ * result itself is not propagated).
+ */
+int
+ec_shd_full_heal(xlator_t *subvol, gf_dirent_t *entry, loc_t *parent,
+                 void *data)
+{
+    struct subvol_healer *healer = data;
+    xlator_t *this = healer->this;
+    ec_t *ec = this->private;
+    loc_t target = {0};
+    int ret = 0;
+
+    if (this->cleanup_starting || (ec->xl_up_count <= ec->fragments)) {
+        return -ENOTCONN;
+    }
+    if (!ec->shd.enabled) {
+        return -EBUSY;
+    }
+
+    /* An entry removed between being listed and being stat'ed shows up
+     * with a null gfid. The file no longer exists, so there is nothing to
+     * heal. */
+    if (gf_uuid_is_null(entry->d_stat.ia_gfid)) {
+        return 0;
+    }
+
+    target.parent = inode_ref(parent->inode);
+    target.name = entry->d_name;
+    gf_uuid_copy(target.gfid, entry->d_stat.ia_gfid);
+
+    /* If this fails with ENOENT/ESTALE index is stale */
+    ret = syncop_gfid_to_path(this->itable, subvol, target.gfid,
+                              (char **)&target.path);
+    if (ret >= 0) {
+        ret = syncop_inode_find(this, this, target.gfid, &target.inode, NULL,
+                                NULL);
+    }
+    if (ret >= 0) {
+        ec_shd_selfheal(healer, healer->subvol, &target, _gf_true);
+        ret = 0;
+    }
+
+    loc_wipe(&target);
+    return ret;
+}
+
+/* Walk the tree rooted at 'inode' on the healer's subvolume with
+ * syncop_ftw(), calling ec_shd_full_heal() per entry. Thread cancellation is
+ * masked while the walk runs. Returns the result of the walk. */
+int
+ec_shd_full_sweep(struct subvol_healer *healer, inode_t *inode)
+{
+    ec_t *ec = healer->this->private;
+    loc_t start = {0};
+    int status;
+
+    start.inode = inode;
+
+    _mask_cancellation();
+    status = syncop_ftw(ec->xl_list[healer->subvol], &start,
+                        GF_CLIENT_PID_SELF_HEALD, healer, ec_shd_full_heal);
+    _unmask_cancellation();
+
+    return status;
+}
+
+/*
+ * Thread body of an index healer; 'data' is its subvol_healer.
+ *
+ * Loops until ec_shd_healer_wait() returns -1. Each wakeup (rerun request or
+ * timeout) performs an index sweep, provided more than 'fragments' children
+ * are up. Always returns NULL.
+ */
+void *
+ec_shd_index_healer(void *data)
+{
+    struct subvol_healer *healer = data;
+    xlator_t *this = healer->this;
+    ec_t *ec = this->private;
+
+    THIS = this;
+
+    while (ec_shd_healer_wait(healer) != -1) {
+        if (ec->xl_up_count > ec->fragments) {
+            gf_msg_debug(this->name, 0, "starting index sweep on subvol %s",
+                         ec_subvol_name(this, healer->subvol));
+            ec_shd_index_sweep(healer);
+        }
+        gf_msg_debug(this->name, 0, "finished index sweep on subvol %s",
+                     ec_subvol_name(this, healer->subvol));
+    }
+
+    return NULL;
+}
+
+/*
+ * Thread body of a full healer; 'data' is its subvol_healer.
+ *
+ * Loops until ec_shd_healer_wait() returns a negative value. Unlike the
+ * index healer, a plain timeout (wait result 0) does nothing; only an
+ * explicit rerun request starts a sweep. A sweep first heals the root and
+ * then walks the whole tree, provided more than 'fragments' children are up.
+ * Always returns NULL.
+ */
+void *
+ec_shd_full_healer(void *data)
+{
+    struct subvol_healer *healer = data;
+    xlator_t *this = healer->this;
+    ec_t *ec = this->private;
+    loc_t rootloc = {0};
+    int requested = 0;
+
+    THIS = this;
+
+    rootloc.inode = this->itable->root;
+
+    while ((requested = ec_shd_healer_wait(healer)) >= 0) {
+        if (requested == 0) {
+            continue;
+        }
+
+        if (ec->xl_up_count > ec->fragments) {
+            gf_msg(this->name, GF_LOG_INFO, 0, EC_MSG_FULL_SWEEP_START,
+                   "starting full sweep on subvol %s",
+                   ec_subvol_name(this, healer->subvol));
+
+            ec_shd_selfheal(healer, healer->subvol, &rootloc, _gf_true);
+            ec_shd_full_sweep(healer, this->itable->root);
+        }
+
+        gf_msg(this->name, GF_LOG_INFO, 0, EC_MSG_FULL_SWEEP_STOP,
+               "finished full sweep on subvol %s",
+               ec_subvol_name(this, healer->subvol));
+    }
+
+    return NULL;
+}
+
+/* Initialise the mutex and condition variable of 'healer' and reset its
+ * state (owner xlator, not running, no rerun pending). Returns 0 on success
+ * or the error code of the failing pthread_*_init() call, in which case the
+ * state fields are left untouched. */
+int
+ec_shd_healer_init(xlator_t *this, struct subvol_healer *healer)
+{
+    int err;
+
+    err = pthread_mutex_init(&healer->mutex, NULL);
+    if (err != 0) {
+        return err;
+    }
+
+    err = pthread_cond_init(&healer->cond, NULL);
+    if (err != 0) {
+        return err;
+    }
+
+    healer->rerun = _gf_false;
+    healer->running = _gf_false;
+    healer->this = this;
+
+    return 0;
+}
+
+/*
+ * Make sure the healer thread is running and request a run.
+ *
+ * Under healer->mutex: an already running healer is signalled through its
+ * condition variable; otherwise a new thread executing 'threadfn' is
+ * created. 'rerun' is set in both cases unless thread creation failed.
+ * Returns 0 on success or the gf_thread_create() error.
+ */
+int
+ec_shd_healer_spawn(xlator_t *this, struct subvol_healer *healer,
+                    void *(threadfn)(void *))
+{
+    int err = 0;
+
+    pthread_mutex_lock(&healer->mutex);
+
+    if (healer->running) {
+        pthread_cond_signal(&healer->cond);
+    } else {
+        err = gf_thread_create(&healer->thread, NULL, threadfn, healer,
+                               "ecshd");
+        if (err == 0) {
+            healer->running = 1;
+        }
+    }
+
+    if (err == 0) {
+        healer->rerun = 1;
+    }
+
+    pthread_mutex_unlock(&healer->mutex);
+
+    return err;
+}
+
+/* Start or wake the full healer of child 'subvol'. Returns -1 without doing
+ * anything once xlator cleanup has started. */
+int
+ec_shd_full_healer_spawn(xlator_t *this, int subvol)
+{
+    struct subvol_healer *healer = NULL;
+
+    if (xlator_is_cleanup_starting(this)) {
+        return -1;
+    }
+
+    healer = NTH_FULL_HEALER(this, subvol);
+
+    return ec_shd_healer_spawn(this, healer, ec_shd_full_healer);
+}
+
+/* Start or wake the index healer of child 'subvol'. Returns -1 without doing
+ * anything once xlator cleanup has started. */
+int
+ec_shd_index_healer_spawn(xlator_t *this, int subvol)
+{
+    struct subvol_healer *healer = NULL;
+
+    if (xlator_is_cleanup_starting(this)) {
+        return -1;
+    }
+
+    healer = NTH_INDEX_HEALER(this, subvol);
+
+    return ec_shd_healer_spawn(this, healer, ec_shd_index_healer);
+}
+
+/* Start or wake the index healer of every child whose bit is set in
+ * ec->xl_up. Spawn failures are ignored. */
+void
+ec_shd_index_healer_wake(ec_t *ec)
+{
+    int32_t idx;
+
+    for (idx = 0; idx < ec->nodes; idx++) {
+        if (((ec->xl_up >> idx) & 1) == 0) {
+            continue;
+        }
+
+        ec_shd_index_healer_spawn(ec->xl, idx);
+    }
+}
+
+/*
+ * Allocate and initialise the per-child index and full healer objects of
+ * the self-heal daemon (one of each per ec->nodes).
+ *
+ * Returns 0 on success, -1 when an allocation fails, or the non-zero result
+ * of ec_shd_healer_init() when initialising a healer fails. Nothing already
+ * allocated is released here on failure.
+ */
+int
+ec_selfheal_daemon_init(xlator_t *this)
+{
+    ec_t *ec = NULL;
+    ec_self_heald_t *shd = NULL;
+    int ret = -1;
+    int i = 0;
+
+    ec = this->private;
+    shd = &ec->shd;
+
+    shd->index_healers = GF_CALLOC(sizeof(*shd->index_healers), ec->nodes,
+                                   ec_mt_subvol_healer_t);
+    if (!shd->index_healers)
+        goto out;
+
+    for (i = 0; i < ec->nodes; i++) {
+        shd->index_healers[i].subvol = i;
+        ret = ec_shd_healer_init(this, &shd->index_healers[i]);
+        if (ret)
+            goto out;
+    }
+
+    shd->full_healers = GF_CALLOC(sizeof(*shd->full_healers), ec->nodes,
+                                  ec_mt_subvol_healer_t);
+    if (!shd->full_healers) {
+        /* 'ret' still holds 0 from the loop above; without resetting it an
+         * allocation failure would be reported as success. */
+        ret = -1;
+        goto out;
+    }
+
+    for (i = 0; i < ec->nodes; i++) {
+        shd->full_healers[i].subvol = i;
+        ret = ec_shd_healer_init(this, &shd->full_healers[i]);
+        if (ret)
+            goto out;
+    }
+
+    ret = 0;
+out:
+    return ret;
+}
+
+/*
+ * Start the healer selected by 'op' on every eligible child and record a
+ * per-child status string in 'output' under the key "<xl_id>-<child>-status".
+ *
+ * A child is eligible when its brick is connected, the disperse subvolume is
+ * up and the brick is local. Returns 0 when at least one child was eligible,
+ * -1 otherwise. Failures to store a status string are ignored.
+ */
+int
+ec_heal_op(xlator_t *this, dict_t *output, gf_xl_afr_op_t op, int xl_id)
+{
+    char key[64] = {0};
+    ec_t *ec = this->private;
+    int op_ret = -1;
+    int i = 0;
+    GF_UNUSED int ret = 0;
+
+    for (i = 0; i < ec->nodes; i++) {
+        gf_boolean_t eligible = _gf_false;
+        char *status = NULL;
+
+        snprintf(key, sizeof(key), "%d-%d-status", xl_id, i);
+
+        if (((ec->xl_up >> i) & 1) == 0) {
+            status = "Brick is not connected";
+        } else if (!ec->up) {
+            status = "Disperse subvolume is not up";
+        } else if (!ec_shd_is_subvol_local(this, i)) {
+            status = "Brick is remote";
+        } else {
+            status = "Started self-heal";
+            eligible = _gf_true;
+        }
+
+        ret = dict_set_str(output, key, status);
+        if (!eligible) {
+            continue;
+        }
+
+        if (op == GF_SHD_OP_HEAL_FULL) {
+            ec_shd_full_healer_spawn(this, i);
+        } else if (op == GF_SHD_OP_HEAL_INDEX) {
+            ec_shd_index_healer_spawn(this, i);
+        }
+        op_ret = 0;
+    }
+
+    return op_ret;
+}
+
+/*
+ * Entry point for self-heal daemon operations.
+ *
+ * Reads the operation ("xl-op") and this xlator's id (keyed by this->name)
+ * from 'input', then runs ec_heal_op() for full and index heal requests;
+ * any other operation yields -1. The xlator id is stored in 'output' under
+ * this->name while the operation runs and is removed again before returning,
+ * on every path.
+ */
+int
+ec_xl_op(xlator_t *this, dict_t *input, dict_t *output)
+{
+    gf_xl_afr_op_t op = GF_SHD_OP_INVALID;
+    int xl_id = 0;
+    int ret = 0;
+
+    ret = dict_get_int32(input, "xl-op", (int32_t *)&op);
+    if (ret == 0) {
+        ret = dict_get_int32(input, this->name, &xl_id);
+    }
+    if (ret == 0) {
+        ret = dict_set_int32(output, this->name, xl_id);
+    }
+
+    if (ret == 0) {
+        switch (op) {
+            case GF_SHD_OP_HEAL_FULL:
+            case GF_SHD_OP_HEAL_INDEX:
+                ret = ec_heal_op(this, output, op, xl_id);
+                break;
+
+            default:
+                ret = -1;
+                break;
+        }
+    }
+
+    dict_del(output, this->name);
+    return ret;
+}
+
+/* Destroy the condition variable and mutex of 'healer'; a NULL healer is
+ * ignored. */
+void
+ec_destroy_healer_object(xlator_t *this, struct subvol_healer *healer)
+{
+    if (healer != NULL) {
+        pthread_cond_destroy(&healer->cond);
+        pthread_mutex_destroy(&healer->mutex);
+    }
+}
+
+/* Tear down the healer objects of the self-heal daemon and free both healer
+ * arrays. Does nothing when the xlator has no private data or shd->iamshd is
+ * not set. */
+void
+ec_selfheal_daemon_fini(xlator_t *this)
+{
+    ec_t *ec = this->private;
+    ec_self_heald_t *shd = NULL;
+    int idx = 0;
+
+    if (ec == NULL) {
+        return;
+    }
+
+    shd = &ec->shd;
+    if (!shd->iamshd) {
+        return;
+    }
+
+    for (idx = 0; idx < ec->nodes; idx++) {
+        ec_destroy_healer_object(this, &shd->index_healers[idx]);
+        ec_destroy_healer_object(this, &shd->full_healers[idx]);
+    }
+
+    GF_FREE(shd->index_healers);
+    GF_FREE(shd->full_healers);
+}
diff --git a/xlators/cluster/ec/src/ec-heald.h b/xlators/cluster/ec/src/ec-heald.h
new file mode 100644
index 00000000000..6c7da4edc10
--- /dev/null
+++ b/xlators/cluster/ec/src/ec-heald.h
@@ -0,0 +1,30 @@
+/*
+  Copyright (c) 2015 Red Hat, Inc. <http://www.redhat.com>
+  This file is part of GlusterFS.
+
+  This file is licensed to you under your choice of the GNU Lesser
+  General Public License, version 3 or any later version (LGPLv3 or
+  later), or the GNU General Public License, version 2 (GPLv2), in all
+  cases as published by the Free Software Foundation.
+*/
+
+#ifndef __EC_HEALD_H__
+#define __EC_HEALD_H__
+
+#include "ec-types.h"           // for ec_t
+#include "glusterfs/dict.h"     // for dict_t
+#include "glusterfs/globals.h"  // for xlator_t
+
+/* Handle a self-heal daemon request: reads "xl-op" and this xlator's id from
+ * 'input', starts the requested healers and reports per-child status strings
+ * in 'output'. */
+int
+ec_xl_op(xlator_t *this, dict_t *input, dict_t *output);
+
+/* Allocate and initialise the per-child index and full healer objects.
+ * Returns 0 on success. */
+int
+ec_selfheal_daemon_init(xlator_t *this);
+
+/* Start or wake the index healer of every child that is currently up. */
+void
+ec_shd_index_healer_wake(ec_t *ec);
+
+/* Destroy the healer objects and free the healer arrays. */
+void
+ec_selfheal_daemon_fini(xlator_t *this);
+
+#endif /* __EC_HEALD_H__ */
diff --git a/xlators/cluster/ec/src/ec-helpers.c b/xlators/cluster/ec/src/ec-helpers.c
new file mode 100644
index 00000000000..48f54475e01
--- /dev/null
+++ b/xlators/cluster/ec/src/ec-helpers.c
@@ -0,0 +1,867 @@
+/*
+  Copyright (c) 2012-2014 DataLab, s.l. <http://www.datalab.es>
+  This file is part of GlusterFS.
+
+  This file is licensed to you under your choice of the GNU Lesser
+  General Public License, version 3 or any later version (LGPLv3 or
+  later), or the GNU General Public License, version 2 (GPLv2), in all
+  cases as published by the Free Software Foundation.
+*/
+
+#include <libgen.h>
+
+#include <glusterfs/byte-order.h>
+
+#include "ec.h"
+#include "ec-mem-types.h"
+#include "ec-messages.h"
+#include "ec-fops.h"
+#include "ec-method.h"
+#include "ec-helpers.h"
+
+/* Names of the ec-internal fops, which have negative ids; the table is
+ * indexed by the negated id (see ec_fop_name()). */
+static const char *ec_fop_list[] = {[-EC_FOP_HEAL] = "HEAL"};
+
+/*
+ * Format 'value' as a binary string into the buffer 'str' of 'size' bytes.
+ *
+ * At least 'digits' digits are produced (zero padded on the left); more are
+ * used when 'value' needs them. The text is written right-aligned at the end
+ * of the buffer and a pointer to its first character is returned. When the
+ * buffer cannot hold the digits plus the terminating NUL, the literal
+ * "<buffer too small>" is returned instead.
+ */
+const char *
+ec_bin(char *str, size_t size, uint64_t value, int32_t digits)
+{
+    char *cursor = str + size;
+
+    /* Room for the terminator. */
+    if (size == 0) {
+        return "<buffer too small>";
+    }
+    size--;
+    *--cursor = 0;
+
+    /* Emit digits from least to most significant, moving backwards. */
+    while ((value != 0) || (digits > 0)) {
+        if (size == 0) {
+            return "<buffer too small>";
+        }
+        size--;
+        *--cursor = '0' + (value & 1);
+        value >>= 1;
+        digits--;
+    }
+
+    return cursor;
+}
+
+/* Map a fop id to its name: non-negative ids index gf_fop_list, negative
+ * ids index ec_fop_list by their negated value. No range check is done. */
+const char *
+ec_fop_name(int32_t id)
+{
+    return (id >= 0) ? gf_fop_list[id] : ec_fop_list[-id];
+}
+
+/*
+ * Emit a trace-level log line describing the state of 'fop'.
+ *
+ * 'event' labels the line; 'fmt' and the variadic arguments form a trailing
+ * free-text message. The mask, remaining and good bitmaps are printed in
+ * binary, padded to ec->nodes digits. If the message cannot be allocated, a
+ * fixed placeholder text is logged in its place.
+ */
+void
+ec_trace(const char *event, ec_fop_data_t *fop, const char *fmt, ...)
+{
+    char mask_buf[32], remaining_buf[32], good_buf[32];
+    ec_t *ec = fop->xl->private;
+    char *text;
+    va_list ap;
+    int32_t len;
+
+    va_start(ap, fmt);
+    len = vasprintf(&text, fmt, ap);
+    va_end(ap);
+
+    if (len < 0) {
+        text = "<memory allocation error>";
+    }
+
+    gf_msg_trace("ec", 0,
+                 "%s(%s) %p(%p) [refs=%d, winds=%d, jobs=%d] "
+                 "frame=%p/%p, min/exp=%d/%d, err=%d state=%d "
+                 "{%s:%s:%s} %s",
+                 event, ec_fop_name(fop->id), fop, fop->parent, fop->refs,
+                 fop->winds, fop->jobs, fop->req_frame, fop->frame,
+                 fop->minimum, fop->expected, fop->error, fop->state,
+                 ec_bin(mask_buf, sizeof(mask_buf), fop->mask, ec->nodes),
+                 ec_bin(remaining_buf, sizeof(remaining_buf), fop->remaining,
+                        ec->nodes),
+                 ec_bin(good_buf, sizeof(good_buf), fop->good, ec->nodes),
+                 text);
+
+    /* Only a successfully formatted message was heap-allocated. */
+    if (len >= 0) {
+        free(text);
+    }
+}
+
+/* Clear the lowest set bit of '*n' and return gf_bits_index() of the
+ * isolated bit. */
+int32_t
+ec_bits_consume(uint64_t *n)
+{
+    /* x & -x isolates the lowest set bit of an unsigned value. */
+    uint64_t lowest = *n & -(*n);
+
+    *n ^= lowest;
+
+    return gf_bits_index(lowest);
+}
+
+/*
+ * Copy up to 'size' bytes into 'dst' from the data described by the 'count'
+ * entries of 'vector', starting 'offset' bytes into their concatenation.
+ *
+ * Returns the number of bytes copied, which is less than 'size' when the
+ * vectors run out of data first (0 when 'offset' lies past the end).
+ */
+size_t
+ec_iov_copy_to(void *dst, struct iovec *vector, int32_t count, off_t offset,
+               size_t size)
+{
+    int32_t i = 0;
+    size_t total = 0, len = 0;
+
+    /* Outer loop: skip whole vectors until the one containing 'offset'. */
+    while (i < count) {
+        if (offset < vector[i].iov_len) {
+            /* Inner loop: copy from this vector onwards; only the first
+             * vector is read from a non-zero offset. */
+            while ((i < count) && (size > 0)) {
+                len = size;
+                if (len > vector[i].iov_len - offset) {
+                    len = vector[i].iov_len - offset;
+                }
+                memcpy(dst, vector[i++].iov_base + offset, len);
+                offset = 0;
+                dst += len;
+                total += len;
+                size -= len;
+            }
+
+            break;
+        }
+
+        offset -= vector[i].iov_len;
+        i++;
+    }
+
+    return total;
+}
+
+/*
+ * Get a page-aligned iobuf of 'size' bytes and attach it to an iobref.
+ *
+ * '*piobref' may hold an existing iobref to add the buffer to; when it is
+ * NULL a new iobref is created and stored there. On success '*ptr' is set to
+ * the buffer memory and 0 is returned. Returns -ENOMEM when the iobuf or the
+ * iobref cannot be obtained, or the non-zero result of iobref_add().
+ */
+int32_t
+ec_buffer_alloc(xlator_t *xl, size_t size, struct iobref **piobref, void **ptr)
+{
+    struct iobref *iobref = NULL;
+    struct iobuf *iobuf = NULL;
+    int32_t ret = -ENOMEM;
+
+    iobuf = iobuf_get_page_aligned(xl->ctx->iobuf_pool, size,
+                                   EC_METHOD_WORD_SIZE);
+    if (iobuf == NULL) {
+        goto out;
+    }
+
+    iobref = *piobref;
+    if (iobref == NULL) {
+        iobref = iobref_new();
+        if (iobref == NULL) {
+            goto out;
+        }
+    }
+
+    ret = iobref_add(iobref, iobuf);
+    if (ret != 0) {
+        /* Drop the iobref only if it was created here; a caller-supplied
+         * one is left alone. Clearing the local keeps '*piobref' unchanged
+         * at 'out'. */
+        if (iobref != *piobref) {
+            iobref_unref(iobref);
+        }
+        iobref = NULL;
+
+        goto out;
+    }
+
+    GF_ASSERT(EC_ALIGN_CHECK(iobuf->ptr, EC_METHOD_WORD_SIZE));
+
+    *ptr = iobuf->ptr;
+
+out:
+    /* The local iobuf reference is released on every path; presumably the
+     * iobref keeps its own reference after iobref_add() - TODO confirm. */
+    if (iobuf != NULL) {
+        iobuf_unref(iobuf);
+    }
+
+    if (iobref != NULL) {
+        *piobref = iobref;
+    }
+
+    return ret;
+}
+
+/*
+ * Store 'size' 64-bit values in 'dict' under 'key' as a binary blob, each
+ * value converted with hton64().
+ *
+ * Returns -EINVAL when 'value' is NULL, -ENOMEM when the copy cannot be
+ * allocated, otherwise the result of dict_set_bin(). The copy is freed here
+ * when dict_set_bin() fails.
+ */
+int32_t
+ec_dict_set_array(dict_t *dict, char *key, uint64_t value[], int32_t size)
+{
+    uint64_t *packed = NULL;
+    int32_t idx;
+    int ret = -1;
+
+    if (value == NULL) {
+        return -EINVAL;
+    }
+
+    packed = GF_MALLOC(sizeof(uint64_t) * size, gf_common_mt_char);
+    if (packed == NULL) {
+        return -ENOMEM;
+    }
+
+    for (idx = 0; idx < size; idx++) {
+        packed[idx] = hton64(value[idx]);
+    }
+
+    ret = dict_set_bin(dict, key, packed, sizeof(uint64_t) * size);
+    if (ret != 0) {
+        GF_FREE(packed);
+    }
+
+    return ret;
+}
+
+/*
+ * Read an array of 64-bit values stored under 'key' in 'dict' into 'value',
+ * converting each element with ntoh64().
+ *
+ * The stored blob may hold fewer than 'size' elements; the remaining slots
+ * of 'value' are then filled with the last element read.
+ *
+ * Returns 0 on success, -EINVAL when 'dict' is NULL or the stored length is
+ * empty, larger than 'size' elements or not a multiple of 8 bytes, or the
+ * error returned by dict_get_ptr_and_len().
+ */
+int32_t
+ec_dict_get_array(dict_t *dict, char *key, uint64_t value[], int32_t size)
+{
+    void *ptr;
+    int32_t len;
+    int32_t vindex;
+    int32_t old_size = 0;
+    int32_t err;
+
+    if (dict == NULL) {
+        return -EINVAL;
+    }
+    err = dict_get_ptr_and_len(dict, key, &ptr, &len);
+    if (err != 0) {
+        return err;
+    }
+
+    /* An empty blob has no element to replicate into the missing slots;
+     * accepting it would make the fill loop below read value[-1]. */
+    if ((len <= 0) || len > (size * sizeof(uint64_t)) ||
+        (len % sizeof(uint64_t))) {
+        return -EINVAL;
+    }
+
+    /* 3.6 version ec would have stored version in 64 bit. In that case treat
+     * metadata versions same as data*/
+    old_size = min(size, len / sizeof(uint64_t));
+    for (vindex = 0; vindex < old_size; vindex++) {
+        value[vindex] = ntoh64(*((uint64_t *)ptr + vindex));
+    }
+
+    if (old_size < size) {
+        for (vindex = old_size; vindex < size; vindex++) {
+            value[vindex] = value[old_size - 1];
+        }
+    }
+
+    return 0;
+}
+
+/* Read the array stored under 'key' with ec_dict_get_array() and, only if
+ * that succeeds, remove the key from 'dict'. Returns the result of the
+ * read. */
+int32_t
+ec_dict_del_array(dict_t *dict, char *key, uint64_t value[], int32_t size)
+{
+    int status = ec_dict_get_array(dict, key, value, size);
+
+    if (status == 0) {
+        dict_del(dict, key);
+    }
+
+    return status;
+}
+
+/* Store a single 64-bit value in 'dict' under 'key' as an 8-byte blob
+ * converted with hton64(). Returns -ENOMEM when the copy cannot be
+ * allocated, otherwise the result of dict_set_bin(); the copy is freed here
+ * when dict_set_bin() fails. */
+int32_t
+ec_dict_set_number(dict_t *dict, char *key, uint64_t value)
+{
+    uint64_t *packed;
+    int status = -1;
+
+    packed = GF_MALLOC(sizeof(value), gf_common_mt_char);
+    if (packed == NULL) {
+        return -ENOMEM;
+    }
+
+    *packed = hton64(value);
+
+    status = dict_set_bin(dict, key, packed, sizeof(value));
+    if (status != 0) {
+        GF_FREE(packed);
+    }
+
+    return status;
+}
+
+/*
+ * Read the 64-bit value stored under 'key' into '*value' (converted with
+ * ntoh64()) and remove the key from 'dict'.
+ *
+ * Returns 0 on success, -EINVAL when 'dict' is NULL or the stored blob is
+ * not exactly 8 bytes, or the error of dict_get_ptr_and_len(). The key is
+ * left in place on failure.
+ */
+int32_t
+ec_dict_del_number(dict_t *dict, char *key, uint64_t *value)
+{
+    void *raw;
+    int32_t raw_len;
+    int32_t err;
+
+    if (dict == NULL) {
+        return -EINVAL;
+    }
+
+    err = dict_get_ptr_and_len(dict, key, &raw, &raw_len);
+    if (err != 0) {
+        return err;
+    }
+
+    if (raw_len != sizeof(uint64_t)) {
+        return -EINVAL;
+    }
+
+    *value = ntoh64(*(uint64_t *)raw);
+    dict_del(dict, key);
+
+    return 0;
+}
+
+/*
+ * Pack 'config' into a single 64-bit word and store it in 'dict' under
+ * 'key', converted with hton64().
+ *
+ * Layout, from the most significant byte: version, algorithm, gf_word_size,
+ * bricks, redundancy, then chunk_size OR-ed into the low bits.
+ *
+ * Returns -EINVAL for a version above EC_CONFIG_VERSION, -ENOMEM when the
+ * blob cannot be allocated, otherwise the result of dict_set_bin().
+ */
+int32_t
+ec_dict_set_config(dict_t *dict, char *key, ec_config_t *config)
+{
+    uint64_t *blob;
+    uint64_t packed;
+    int status = -1;
+
+    if (config->version > EC_CONFIG_VERSION) {
+        gf_msg("ec", GF_LOG_ERROR, EINVAL, EC_MSG_UNSUPPORTED_VERSION,
+               "Trying to store an unsupported config "
+               "version (%u)",
+               config->version);
+
+        return -EINVAL;
+    }
+
+    blob = GF_MALLOC(sizeof(uint64_t), gf_common_mt_char);
+    if (blob == NULL) {
+        return -ENOMEM;
+    }
+
+    packed = (((uint64_t)config->version) << 56) |
+             (((uint64_t)config->algorithm) << 48) |
+             (((uint64_t)config->gf_word_size) << 40) |
+             (((uint64_t)config->bricks) << 32) |
+             (((uint64_t)config->redundancy) << 24) | config->chunk_size;
+
+    *blob = hton64(packed);
+
+    status = dict_set_bin(dict, key, blob, sizeof(uint64_t));
+    if (status != 0) {
+        GF_FREE(blob);
+    }
+
+    return status;
+}
+
+/*
+ * Read the packed configuration stored under 'key', unpack it into 'config'
+ * and remove the key from 'dict'.
+ *
+ * Returns 0 on success; -EINVAL when 'dict' is NULL, the blob is not 8 bytes
+ * long or the version is above EC_CONFIG_VERSION; -ENODATA when the stored
+ * word is 0; or the error of dict_get_ptr_and_len(). On an unsupported
+ * version, config->version has already been overwritten. The key is only
+ * removed on success.
+ */
+int32_t
+ec_dict_del_config(dict_t *dict, char *key, ec_config_t *config)
+{
+    void *raw;
+    uint64_t packed;
+    int32_t raw_len;
+    int32_t err;
+
+    if (dict == NULL) {
+        return -EINVAL;
+    }
+
+    err = dict_get_ptr_and_len(dict, key, &raw, &raw_len);
+    if (err != 0) {
+        return err;
+    }
+
+    if (raw_len != sizeof(uint64_t)) {
+        return -EINVAL;
+    }
+
+    packed = ntoh64(*(uint64_t *)raw);
+
+    /* The config xattr is also requested for entries of type IA_INVAL, which
+     * may later turn out to be directories (after inode_link()) and those
+     * have no config xattr. Because the request is made with xattrop(), a
+     * missing xattr comes back as all zeroes rather than as an error.
+     *
+     * A value of 0 is therefore treated exactly like a missing xattr;
+     * otherwise ec_config_check() would fail. */
+    if (packed == 0) {
+        return -ENODATA;
+    }
+
+    config->version = (packed >> 56) & 0xff;
+    if (config->version > EC_CONFIG_VERSION) {
+        gf_msg("ec", GF_LOG_ERROR, EINVAL, EC_MSG_UNSUPPORTED_VERSION,
+               "Found an unsupported config version (%u)", config->version);
+
+        return -EINVAL;
+    }
+
+    config->algorithm = (packed >> 48) & 0xff;
+    config->gf_word_size = (packed >> 40) & 0xff;
+    config->bricks = (packed >> 32) & 0xff;
+    config->redundancy = (packed >> 24) & 0xff;
+    config->chunk_size = packed & 0xffffff;
+
+    dict_del(dict, key);
+
+    return 0;
+}
+
+/*
+ * Reconcile the gfid 'dst' with 'src'.
+ *
+ * A null 'src' is always accepted. A null 'dst' is filled in from 'src'.
+ * When both are set they must be equal; a mismatch is logged as a warning
+ * and _gf_false is returned.
+ */
+gf_boolean_t
+ec_loc_gfid_check(xlator_t *xl, uuid_t dst, uuid_t src)
+{
+    if (gf_uuid_is_null(src)) {
+        return _gf_true;
+    }
+
+    if (gf_uuid_is_null(dst)) {
+        gf_uuid_copy(dst, src);
+
+        return _gf_true;
+    }
+
+    if (gf_uuid_compare(dst, src) == 0) {
+        return _gf_true;
+    }
+
+    gf_msg(xl->name, GF_LOG_WARNING, 0, EC_MSG_GFID_MISMATCH,
+           "Mismatching GFID's in loc");
+
+    return _gf_false;
+}
+
+/*
+ * Make loc->inode and loc->gfid consistent.
+ *
+ * With an inode already present, its gfid is checked against (or copied
+ * into) loc->gfid; a mismatch yields -EINVAL. Without one, and with a usable
+ * 'table', the inode is looked up by gfid or, failing that, resolved from a
+ * path containing '/'. A lookup that finds nothing is not an error.
+ * Returns 0 in all other cases.
+ */
+int32_t
+ec_loc_setup_inode(xlator_t *xl, inode_table_t *table, loc_t *loc)
+{
+    if (loc->inode != NULL) {
+        return ec_loc_gfid_check(xl, loc->gfid, loc->inode->gfid) ? 0
+                                                                  : -EINVAL;
+    }
+
+    if (table != NULL) {
+        if (!gf_uuid_is_null(loc->gfid)) {
+            loc->inode = inode_find(table, loc->gfid);
+        } else if (loc->path && strchr(loc->path, '/')) {
+            loc->inode = inode_resolve(table, (char *)loc->path);
+        }
+    }
+
+    return 0;
+}
+
+/*
+ * Make loc->parent and loc->pargfid consistent.
+ *
+ * With a parent inode already present, its gfid is checked against (or
+ * copied into) loc->pargfid; a mismatch yields -EINVAL. Without one, and
+ * with a usable 'table', the parent is looked up by pargfid or, failing
+ * that, resolved from the directory part of loc->path. If pargfid is still
+ * null afterwards, loc->name is cleared.
+ *
+ * Returns 0 on success, -EINVAL on gfid mismatch, -ENOMEM when the path
+ * cannot be duplicated.
+ */
+int32_t
+ec_loc_setup_parent(xlator_t *xl, inode_table_t *table, loc_t *loc)
+{
+    char *path, *parent;
+    int32_t ret = -EINVAL;
+
+    if (loc->parent != NULL) {
+        if (!ec_loc_gfid_check(xl, loc->pargfid, loc->parent->gfid)) {
+            goto out;
+        }
+    } else if (table != NULL) {
+        if (!gf_uuid_is_null(loc->pargfid)) {
+            loc->parent = inode_find(table, loc->pargfid);
+        } else if (loc->path && strchr(loc->path, '/')) {
+            /* dirname() may modify its argument, so it works on a copy. */
+            path = gf_strdup(loc->path);
+            if (path == NULL) {
+                gf_msg(xl->name, GF_LOG_ERROR, ENOMEM, EC_MSG_NO_MEMORY,
+                       "Unable to duplicate path '%s'", loc->path);
+
+                ret = -ENOMEM;
+
+                goto out;
+            }
+            parent = dirname(path);
+            loc->parent = inode_resolve(table, parent);
+            if (loc->parent != NULL) {
+                gf_uuid_copy(loc->pargfid, loc->parent->gfid);
+            }
+            GF_FREE(path);
+        }
+    }
+
+    /* If 'pargfid' has not been determined, clear 'name' to avoid resolutions
+       based on <gfid:pargfid>/name. */
+    if (gf_uuid_is_null(loc->pargfid)) {
+        loc->name = NULL;
+    }
+
+    ret = 0;
+
+out:
+    return ret;
+}
+
+/*
+ * Validate loc->path and derive loc->name from it.
+ *
+ * A NULL path is accepted as is. A path without '/' is only accepted when it
+ * starts with "<gfid:". For the path "/" the loc gfid must be (or becomes)
+ * the root gfid; for a path whose only '/' is the leading one the parent
+ * gfid must be (or becomes) the root gfid. loc->name is set to the last path
+ * component when unset, otherwise it must match that component.
+ *
+ * Returns 0 on success and -EINVAL on any inconsistency.
+ */
+int32_t
+ec_loc_setup_path(xlator_t *xl, loc_t *loc)
+{
+    /* gfid used for the root directory: all zeroes except a final 1. */
+    static uuid_t root = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1};
+    char *name;
+    int32_t ret = -EINVAL;
+
+    if (loc->path != NULL) {
+        name = strrchr(loc->path, '/');
+        if (name == NULL) {
+            /* Allow gfid paths: <gfid:...> */
+            if (strncmp(loc->path, "<gfid:", 6) == 0) {
+                ret = 0;
+            }
+            goto out;
+        }
+        /* The last '/' is the first character: the entry is either the root
+         * itself or a direct child of the root. */
+        if (name == loc->path) {
+            if (name[1] == 0) {
+                if (!ec_loc_gfid_check(xl, loc->gfid, root)) {
+                    goto out;
+                }
+            } else {
+                if (!ec_loc_gfid_check(xl, loc->pargfid, root)) {
+                    goto out;
+                }
+            }
+        }
+        /* Step past the '/' so 'name' points at the last component. */
+        name++;
+
+        if (loc->name != NULL) {
+            if (strcmp(loc->name, name) != 0) {
+                gf_msg(xl->name, GF_LOG_ERROR, EINVAL, EC_MSG_INVALID_LOC_NAME,
+                       "Invalid name '%s' in loc", loc->name);
+
+                goto out;
+            }
+        } else {
+            /* Points into loc->path; no separate allocation is made. */
+            loc->name = name;
+        }
+    }
+
+    ret = 0;
+
+out:
+    return ret;
+}
+
+/*
+ * Build in 'parent' a loc_t describing the parent directory of 'loc'.
+ *
+ * 'parent' is zeroed first, then filled from loc->parent (inode),
+ * loc->pargfid (gfid) and the directory part of loc->path (path), and
+ * finally completed with ec_loc_setup_path/inode/parent().
+ *
+ * Returns 0 on success. On failure 'parent' is wiped and a negative errno is
+ * returned: -ENOMEM when a path cannot be duplicated, -EINVAL when the
+ * result has neither inode, path nor gfid, or the error of a setup helper.
+ */
+int32_t
+ec_loc_parent(xlator_t *xl, loc_t *loc, loc_t *parent)
+{
+    inode_table_t *table = NULL;
+    char *str = NULL;
+    int32_t ret = -ENOMEM;
+
+    memset(parent, 0, sizeof(loc_t));
+
+    if (loc->parent != NULL) {
+        table = loc->parent->table;
+        parent->inode = inode_ref(loc->parent);
+    } else if (loc->inode != NULL) {
+        table = loc->inode->table;
+    }
+    if (!gf_uuid_is_null(loc->pargfid)) {
+        gf_uuid_copy(parent->gfid, loc->pargfid);
+    }
+    if (loc->path && strchr(loc->path, '/')) {
+        /* dirname() may modify its argument, so it works on a copy that is
+         * released at 'out'. */
+        str = gf_strdup(loc->path);
+        if (str == NULL) {
+            gf_msg(xl->name, GF_LOG_ERROR, ENOMEM, EC_MSG_NO_MEMORY,
+                   "Unable to duplicate path '%s'", loc->path);
+
+            goto out;
+        }
+        parent->path = gf_strdup(dirname(str));
+        if (parent->path == NULL) {
+            gf_msg(xl->name, GF_LOG_ERROR, ENOMEM, EC_MSG_NO_MEMORY,
+                   "Unable to duplicate path '%s'", dirname(str));
+
+            goto out;
+        }
+    }
+
+    ret = ec_loc_setup_path(xl, parent);
+    if (ret == 0) {
+        ret = ec_loc_setup_inode(xl, table, parent);
+    }
+    if (ret == 0) {
+        ret = ec_loc_setup_parent(xl, table, parent);
+    }
+    if (ret != 0) {
+        goto out;
+    }
+
+    /* A parent loc with no way to identify the directory is useless. */
+    if ((parent->inode == NULL) && (parent->path == NULL) &&
+        gf_uuid_is_null(parent->gfid)) {
+        gf_msg(xl->name, GF_LOG_ERROR, EINVAL, EC_MSG_LOC_PARENT_INODE_MISSING,
+               "Parent inode missing for loc_t");
+
+        ret = -EINVAL;
+
+        goto out;
+    }
+
+    ret = 0;
+
+out:
+    GF_FREE(str);
+
+    if (ret != 0) {
+        loc_wipe(parent);
+    }
+
+    return ret;
+}
+
+/*
+ * Refresh 'loc' using an optional 'inode' and an optional 'iatt'.
+ *
+ * When 'inode' is given and differs from loc->inode, the location is
+ * re-pointed to it (taking a new reference and copying its gfid). When
+ * 'iatt' is given, loc->gfid is validated against iatt->ia_gfid through
+ * ec_loc_gfid_check(). Finally the path, inode and parent of the location
+ * are completed by the ec_loc_setup_*() helpers.
+ *
+ * Returns 0 on success, -EINVAL if the gfid check fails, or the value
+ * returned by the first setup helper that fails.
+ */
+int32_t
+ec_loc_update(xlator_t *xl, loc_t *loc, inode_t *inode, struct iatt *iatt)
+{
+    inode_table_t *itable = NULL;
+    int32_t err;
+
+    /* Select the inode table from the first available source. */
+    if (inode == NULL) {
+        if (loc->inode != NULL) {
+            itable = loc->inode->table;
+        } else if (loc->parent != NULL) {
+            itable = loc->parent->table;
+        }
+    } else {
+        itable = inode->table;
+        if (loc->inode != inode) {
+            if (loc->inode != NULL) {
+                inode_unref(loc->inode);
+            }
+            loc->inode = inode_ref(inode);
+            gf_uuid_copy(loc->gfid, inode->gfid);
+        }
+    }
+
+    if ((iatt != NULL) && !ec_loc_gfid_check(xl, loc->gfid, iatt->ia_gfid)) {
+        return -EINVAL;
+    }
+
+    err = ec_loc_setup_path(xl, loc);
+    if (err != 0) {
+        return err;
+    }
+
+    err = ec_loc_setup_inode(xl, itable, loc);
+    if (err != 0) {
+        return err;
+    }
+
+    err = ec_loc_setup_parent(xl, itable, loc);
+
+    return err;
+}
+
+/*
+ * Build 'loc' from an open file descriptor.
+ *
+ * 'loc' is zeroed, then initialized with a copy of the location stored in
+ * the fd context (when one is available) and updated with fd->inode.
+ *
+ * Returns 0 on success. On failure (-ENOMEM if the copy fails, or the value
+ * returned by ec_loc_update()) 'loc' is wiped.
+ */
+int32_t
+ec_loc_from_fd(xlator_t *xl, loc_t *loc, fd_t *fd)
+{
+    ec_fd_t *fd_ctx;
+    int32_t err;
+
+    memset(loc, 0, sizeof(*loc));
+
+    fd_ctx = ec_fd_get(fd, xl);
+    if ((fd_ctx != NULL) && (loc_copy(loc, &fd_ctx->loc) != 0)) {
+        err = -ENOMEM;
+    } else {
+        err = ec_loc_update(xl, loc, fd->inode, NULL);
+    }
+
+    if (err != 0) {
+        loc_wipe(loc);
+    }
+
+    return err;
+}
+
+/*
+ * Build 'dst' as a copy of 'src' and complete it with ec_loc_update().
+ *
+ * Returns 0 on success. On failure (-ENOMEM if the copy fails, or the value
+ * returned by ec_loc_update()) 'dst' is wiped.
+ */
+int32_t
+ec_loc_from_loc(xlator_t *xl, loc_t *dst, loc_t *src)
+{
+    int32_t err;
+
+    memset(dst, 0, sizeof(*dst));
+
+    if (loc_copy(dst, src) != 0) {
+        err = -ENOMEM;
+    } else {
+        err = ec_loc_update(xl, dst, NULL, NULL);
+    }
+
+    if (err != 0) {
+        loc_wipe(dst);
+    }
+
+    return err;
+}
+
+/* Set the lock owner of the root of 'frame' from the pointer 'owner'. */
+void
+ec_owner_set(call_frame_t *frame, void *owner)
+{
+    gf_lkowner_t *lk_owner = &frame->root->lk_owner;
+
+    set_lk_owner_from_ptr(lk_owner, owner);
+}
+
+/* Copy the lock owner 'owner' into the root of 'frame'. */
+void
+ec_owner_copy(call_frame_t *frame, gf_lkowner_t *owner)
+{
+    gf_lkowner_t *dst = &frame->root->lk_owner;
+
+    lk_owner_copy(dst, owner);
+}
+
+/* Set the maximum size of the stripe cache of an inode context from the
+ * xlator configuration, unless it already has a non-zero value. */
+static void
+ec_stripe_cache_init(ec_t *ec, ec_inode_t *ctx)
+{
+    if (ctx->stripe_cache.max != 0) {
+        return;
+    }
+
+    ctx->stripe_cache.max = ec->stripe_cache;
+}
+
+/*
+ * Return the ec context attached to 'inode', creating and attaching a new
+ * zero-initialized one if none exists yet.
+ *
+ * This is the variant that does not take inode->lock itself (ec_inode_get()
+ * is the locking wrapper).
+ *
+ * Returns NULL if the context cannot be allocated or attached.
+ */
+ec_inode_t *
+__ec_inode_get(inode_t *inode, xlator_t *xl)
+{
+    ec_inode_t *ictx = NULL;
+    uint64_t raw = 0;
+
+    if ((__inode_ctx_get(inode, xl, &raw) == 0) && (raw != 0)) {
+        /* A context is already attached to the inode. */
+        ictx = (ec_inode_t *)(uintptr_t)raw;
+    } else {
+        ictx = GF_MALLOC(sizeof(*ictx), ec_mt_ec_inode_t);
+        if (ictx == NULL) {
+            return NULL;
+        }
+
+        memset(ictx, 0, sizeof(*ictx));
+        INIT_LIST_HEAD(&ictx->heal);
+        INIT_LIST_HEAD(&ictx->stripe_cache.lru);
+        ictx->heal_count = 0;
+
+        raw = (uint64_t)(uintptr_t)ictx;
+        if (__inode_ctx_set(inode, xl, &raw) != 0) {
+            GF_FREE(ictx);
+
+            return NULL;
+        }
+    }
+
+    ec_stripe_cache_init(xl->private, ictx);
+
+    return ictx;
+}
+
+/* Locked wrapper of __ec_inode_get(): the lookup/creation of the inode
+ * context is done while holding inode->lock. */
+ec_inode_t *
+ec_inode_get(inode_t *inode, xlator_t *xl)
+{
+    ec_inode_t *ictx;
+
+    LOCK(&inode->lock);
+    {
+        ictx = __ec_inode_get(inode, xl);
+    }
+    UNLOCK(&inode->lock);
+
+    return ictx;
+}
+
+/*
+ * Return the ec context attached to 'fd', creating and attaching a new one
+ * if none exists yet.
+ *
+ * A new context is allocated with room for one ec_fd_status_t per subvolume
+ * (ec->nodes). This is the variant that does not take fd->lock itself
+ * (ec_fd_get() is the locking wrapper).
+ *
+ * Returns NULL if the context cannot be allocated or attached.
+ */
+ec_fd_t *
+__ec_fd_get(fd_t *fd, xlator_t *xl)
+{
+    ec_t *ec = xl->private;
+    ec_inode_t *ictx = NULL;
+    ec_fd_t *fctx = NULL;
+    uint64_t raw = 0;
+    int idx = 0;
+
+    if ((__fd_ctx_get(fd, xl, &raw) == 0) && (raw != 0)) {
+        /* A context is already attached to the fd. */
+        fctx = (ec_fd_t *)(uintptr_t)raw;
+    } else {
+        fctx = GF_MALLOC(sizeof(*fctx) + (sizeof(ec_fd_status_t) * ec->nodes),
+                         ec_mt_ec_fd_t);
+        if (fctx != NULL) {
+            /* Only the fixed part is cleared. The trailing status array is
+             * fully initialized by the loop below. */
+            memset(fctx, 0, sizeof(*fctx));
+
+            for (idx = 0; idx < ec->nodes; idx++) {
+                fctx->fd_status[idx] = fd_is_anonymous(fd) ? EC_FD_OPENED
+                                                           : EC_FD_NOT_OPENED;
+            }
+
+            raw = (uint64_t)(uintptr_t)fctx;
+            if (__fd_ctx_set(fd, xl, raw) != 0) {
+                GF_FREE(fctx);
+                return NULL;
+            }
+            /* Only referring to bad_version, so no need for lock. */
+            ictx = __ec_inode_get(fd->inode, xl);
+            if (ictx) {
+                fctx->bad_version = ictx->bad_version;
+            }
+        }
+    }
+
+    /* Treat anonymous fd specially */
+    if (fd->anonymous && fctx) {
+        /* Mark the fd open for all subvolumes. */
+        fctx->open = -1;
+        /* Try to populate ctx->loc with fd->inode information. The result
+         * is intentionally not checked. */
+        ec_loc_update(xl, &fctx->loc, fd->inode, NULL);
+    }
+
+    return fctx;
+}
+
+/* Locked wrapper of __ec_fd_get(): the lookup/creation of the fd context is
+ * done while holding fd->lock. */
+ec_fd_t *
+ec_fd_get(fd_t *fd, xlator_t *xl)
+{
+    ec_fd_t *fctx;
+
+    LOCK(&fd->lock);
+    {
+        fctx = __ec_fd_get(fd, xl);
+    }
+    UNLOCK(&fd->lock);
+
+    return fctx;
+}
+
+/* Dictionary match callback: returns _gf_true when 'key' starts with
+ * EC_XATTR_PREFIX. 'dict', 'value' and 'data' are not used. */
+gf_boolean_t
+ec_is_internal_xattr(dict_t *dict, char *key, data_t *value, void *data)
+{
+    if (key == NULL) {
+        return _gf_false;
+    }
+
+    if (strncmp(key, EC_XATTR_PREFIX, SLEN(EC_XATTR_PREFIX)) != 0) {
+        return _gf_false;
+    }
+
+    return _gf_true;
+}
+
+/* Remove from 'xattr' every entry selected by ec_is_internal_xattr(). The
+ * result of the traversal is not checked. */
+void
+ec_filter_internal_xattrs(dict_t *xattr)
+{
+    (void)dict_foreach_match(xattr, ec_is_internal_xattr, NULL,
+                             dict_remove_foreach_fn, NULL);
+}
+
+/* Returns _gf_true for the fops listed below (write, truncate, ftruncate,
+ * fallocate, discard and zerofill) and _gf_false for any other fop. */
+gf_boolean_t
+ec_is_data_fop(glusterfs_fop_t fop)
+{
+    gf_boolean_t is_data = _gf_false;
+
+    switch (fop) {
+        case GF_FOP_WRITE:
+        case GF_FOP_TRUNCATE:
+        case GF_FOP_FTRUNCATE:
+        case GF_FOP_FALLOCATE:
+        case GF_FOP_DISCARD:
+        case GF_FOP_ZEROFILL:
+            is_data = _gf_true;
+            break;
+        default:
+            break;
+    }
+
+    return is_data;
+}
+/*
+gf_boolean_t
+ec_is_metadata_fop (int32_t lock_kind, glusterfs_fop_t fop)
+{
+        if (lock_kind == EC_LOCK_ENTRY) {
+                return _gf_false;
+        }
+
+        switch (fop) {
+        case GF_FOP_SETATTR:
+        case GF_FOP_FSETATTR:
+        case GF_FOP_SETXATTR:
+        case GF_FOP_FSETXATTR:
+        case GF_FOP_REMOVEXATTR:
+        case GF_FOP_FREMOVEXATTR:
+                return _gf_true;
+        default:
+                return _gf_false;
+        }
+        return _gf_false;
+}*/
diff --git a/xlators/cluster/ec/src/ec-helpers.h b/xlators/cluster/ec/src/ec-helpers.h
new file mode 100644
index 00000000000..015db514e05
--- /dev/null
+++ b/xlators/cluster/ec/src/ec-helpers.h
@@ -0,0 +1,200 @@
+/*
+  Copyright (c) 2012-2014 DataLab, s.l. <http://www.datalab.es>
+  This file is part of GlusterFS.
+
+  This file is licensed to you under your choice of the GNU Lesser
+  General Public License, version 3 or any later version (LGPLv3 or
+  later), or the GNU General Public License, version 2 (GPLv2), in all
+  cases as published by the Free Software Foundation.
+*/
+
+#ifndef __EC_HELPERS_H__
+#define __EC_HELPERS_H__
+
+#include "ec-types.h"
+
+/* Helpers to carry an error code inside a pointer value:
+ *   EC_ERR(_x)     negates _x and casts the result to a pointer.
+ *   EC_IS_ERR(_x)  is true when all the bits selected by ~0xfffULL are set
+ *                  in _x, i.e. only the low 12 bits may differ from an
+ *                  all-ones value.
+ *   EC_GET_ERR(_x) casts the pointer back to an int32_t, which gives the
+ *                  negated form of the value passed to EC_ERR(). */
+#define EC_ERR(_x) ((void *)-(intptr_t)(_x))
+#define EC_IS_ERR(_x) (((uintptr_t)(_x) & ~0xfffULL) == ~0xfffULL)
+#define EC_GET_ERR(_x) ((int32_t)(intptr_t)(_x))
+
+/* True when the low bits of _ptr selected by (_align - 1) are all zero. This
+ * is an alignment test only if _align is a power of 2. */
+#define EC_ALIGN_CHECK(_ptr, _align) ((((uintptr_t)(_ptr)) & ((_align)-1)) == 0)
+
+const char *
+ec_bin(char *str, size_t size, uint64_t value, int32_t digits);
+const char *
+ec_fop_name(int32_t id);
+void
+ec_trace(const char *event, ec_fop_data_t *fop, const char *fmt, ...);
+int32_t
+ec_bits_consume(uint64_t *n);
+size_t
+ec_iov_copy_to(void *dst, struct iovec *vector, int32_t count, off_t offset,
+               size_t size);
+int32_t
+ec_buffer_alloc(xlator_t *xl, size_t size, struct iobref **piobref, void **ptr);
+int32_t
+ec_dict_set_array(dict_t *dict, char *key, uint64_t *value, int32_t size);
+int32_t
+ec_dict_get_array(dict_t *dict, char *key, uint64_t value[], int32_t size);
+
+int32_t
+ec_dict_del_array(dict_t *dict, char *key, uint64_t *value, int32_t size);
+int32_t
+ec_dict_set_number(dict_t *dict, char *key, uint64_t value);
+int32_t
+ec_dict_del_number(dict_t *dict, char *key, uint64_t *value);
+int32_t
+ec_dict_set_config(dict_t *dict, char *key, ec_config_t *config);
+int32_t
+ec_dict_del_config(dict_t *dict, char *key, ec_config_t *config);
+
+int32_t
+ec_loc_parent(xlator_t *xl, loc_t *loc, loc_t *parent);
+int32_t
+ec_loc_update(xlator_t *xl, loc_t *loc, inode_t *inode, struct iatt *iatt);
+
+int32_t
+ec_loc_from_fd(xlator_t *xl, loc_t *loc, fd_t *fd);
+int32_t
+ec_loc_from_loc(xlator_t *xl, loc_t *dst, loc_t *src);
+
+void
+ec_owner_set(call_frame_t *frame, void *owner);
+void
+ec_owner_copy(call_frame_t *frame, gf_lkowner_t *owner);
+
+ec_inode_t *
+__ec_inode_get(inode_t *inode, xlator_t *xl);
+ec_inode_t *
+ec_inode_get(inode_t *inode, xlator_t *xl);
+ec_fd_t *
+__ec_fd_get(fd_t *fd, xlator_t *xl);
+ec_fd_t *
+ec_fd_get(fd_t *fd, xlator_t *xl);
+
+/* Round *value down to a multiple of ec->stripe_size. If 'scale' is set, the
+ * rounded value is additionally divided by ec->fragments. Returns the amount
+ * that was subtracted to reach the stripe boundary. */
+static inline uint32_t
+ec_adjust_size_down(ec_t *ec, uint64_t *value, gf_boolean_t scale)
+{
+    uint64_t aligned = *value;
+    uint64_t remainder = aligned % ec->stripe_size;
+
+    aligned -= remainder;
+    if (scale) {
+        aligned /= ec->fragments;
+    }
+
+    *value = aligned;
+
+    return (uint32_t)remainder;
+}
+
+/* Round *value up to a multiple of ec->stripe_size. If 'scale' is set, the
+ * result is expressed in scaled units: the rounded down value is divided by
+ * ec->fragments and one ec->fragment_size is added for the partial stripe
+ * (presumably stripe_size == fragment_size * fragments; confirm against the
+ * ec_t initialization).
+ *
+ * Returns the amount that had to be added to reach the next stripe boundary
+ * (0 if the value was already aligned).
+ *
+ * This function can cause an overflow if the passed value is too near to the
+ * uint64_t limit. If this happens, it returns the tail in negative form and
+ * the value is set to UINT64_MAX. */
+static inline int32_t
+ec_adjust_size_up(ec_t *ec, uint64_t *value, gf_boolean_t scale)
+{
+    uint64_t tmp;
+    int32_t tail;
+
+    tmp = *value;
+    /* We first adjust the value down. This never causes overflow. */
+    tail = ec_adjust_size_down(ec, &tmp, scale);
+
+    /* If the value was already aligned, tail will be 0 and nothing else
+     * needs to be done. */
+    if (tail != 0) {
+        /* Otherwise, we need to compute the real tail and adjust the
+         * returned value to the next stripe. */
+        tail = ec->stripe_size - tail;
+        if (scale) {
+            tmp += ec->fragment_size;
+        } else {
+            tmp += ec->stripe_size;
+            /* If no scaling is requested there's a possibility of
+             * overflow. An unsigned addition that wraps around gives a
+             * result smaller than the added operand. */
+            if (tmp < ec->stripe_size) {
+                tmp = UINT64_MAX;
+                tail = -tail;
+            }
+        }
+    }
+
+    *value = tmp;
+
+    return tail;
+}
+
+/* Same operation as ec_adjust_size_down(), but working on an off_t instead
+ * of a uint64_t: *value is rounded down to a multiple of ec->stripe_size
+ * and, if 'scale' is set, divided by ec->fragments. Returns the amount that
+ * was subtracted. */
+static inline uint32_t
+ec_adjust_offset_down(ec_t *ec, off_t *value, gf_boolean_t scale)
+{
+    off_t aligned = *value;
+    off_t remainder = aligned % ec->stripe_size;
+
+    aligned -= remainder;
+    if (scale) {
+        aligned /= ec->fragments;
+    }
+
+    *value = aligned;
+
+    return (uint32_t)remainder;
+}
+
+/* This function is equivalent to ec_adjust_size_up() but with a potentially
+ * different parameter size (off_t vs uint64_t).
+ *
+ * *value is rounded up to a multiple of ec->stripe_size and, if 'scale' is
+ * set, divided by ec->fragments. Returns the amount that was added to reach
+ * the stripe boundary. If the rounded value does not fit into an off_t (only
+ * checked when 'scale' is not set), *value is set to GF_OFF_MAX and the
+ * amount is returned in negative form. */
+static inline int32_t
+ec_adjust_offset_up(ec_t *ec, off_t *value, gf_boolean_t scale)
+{
+    uint64_t tail, tmp;
+
+    /* An offset is a signed type that can only have positive values, so
+     * we take advantage of this to avoid overflows. We simply convert it
+     * to an unsigned integer and operate normally. This won't cause an
+     * overflow. Overflow is only checked when converting back to an
+     * off_t. */
+    tmp = *value;
+    tail = ec->stripe_size;
+    /* Distance from tmp to the next multiple of the stripe size. The
+     * '- 1 ... + 1' form makes the result 0 (instead of a full stripe) when
+     * tmp is already aligned. */
+    tail -= (tmp + tail - 1) % tail + 1;
+    tmp += tail;
+    if (scale) {
+        /* If we are scaling, we'll never get an overflow. */
+        tmp /= ec->fragments;
+    } else {
+        /* Check if there has been an overflow. */
+        if ((off_t)tmp < 0) {
+            tmp = GF_OFF_MAX;
+            tail = -tail;
+        }
+    }
+
+    *value = (off_t)tmp;
+
+    return (int32_t)tail;
+}
+
+/* Returns 1 if 'value' has exactly one bit set (i.e. it is a power of 2),
+ * 0 otherwise. 0 is not considered a power of 2. */
+static inline int32_t
+ec_is_power_of_2(uint32_t value)
+{
+    if (value == 0) {
+        return 0;
+    }
+
+    /* Clearing the lowest set bit leaves nothing only for powers of 2. */
+    return (value & (value - 1)) == 0;
+}
+
+gf_boolean_t
+ec_is_internal_xattr(dict_t *dict, char *key, data_t *value, void *data);
+
+void
+ec_filter_internal_xattrs(dict_t *xattr);
+
+gf_boolean_t
+ec_is_data_fop(glusterfs_fop_t fop);
+
+int32_t
+ec_launch_replace_heal(ec_t *ec);
+
+#endif /* __EC_HELPERS_H__ */
diff --git a/xlators/cluster/ec/src/ec-inode-read.c b/xlators/cluster/ec/src/ec-inode-read.c
new file mode 100644
index 00000000000..dad5f4d7018
--- /dev/null
+++ b/xlators/cluster/ec/src/ec-inode-read.c
@@ -0,0 +1,2046 @@
+/*
+  Copyright (c) 2012-2014 DataLab, s.l. <http://www.datalab.es>
+  This file is part of GlusterFS.
+
+  This file is licensed to you under your choice of the GNU Lesser
+  General Public License, version 3 or any later version (LGPLv3 or
+  later), or the GNU General Public License, version 2 (GPLv2), in all
+  cases as published by the Free Software Foundation.
+*/
+
+#include "ec.h"
+#include "ec-messages.h"
+#include "ec-helpers.h"
+#include "ec-common.h"
+#include "ec-combine.h"
+#include "ec-method.h"
+#include "ec-fops.h"
+
+/* FOP: access */
+
+/* Callback of an 'access' request sent to one subvolume. 'cookie' carries
+ * the index of the subvolume. The answer is recorded and combined with the
+ * other answers of the fop, and the fop is notified of the completion.
+ * Always returns 0. */
+int32_t
+ec_access_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret,
+              int32_t op_errno, dict_t *xdata)
+{
+    ec_fop_data_t *fop = NULL;
+    ec_cbk_data_t *cbk = NULL;
+    int32_t idx = (int32_t)(uintptr_t)cookie;
+
+    VALIDATE_OR_GOTO(this, out);
+    GF_VALIDATE_OR_GOTO(this->name, frame, out);
+    GF_VALIDATE_OR_GOTO(this->name, frame->local, out);
+    GF_VALIDATE_OR_GOTO(this->name, this->private, out);
+
+    fop = frame->local;
+
+    ec_trace("CBK", fop, "idx=%d, frame=%p, op_ret=%d, op_errno=%d", idx, frame,
+             op_ret, op_errno);
+
+    cbk = ec_cbk_data_allocate(frame, this, fop, GF_FOP_ACCESS, idx, op_ret,
+                               op_errno);
+    if (cbk == NULL) {
+        goto out;
+    }
+
+    if (xdata != NULL) {
+        cbk->xdata = dict_ref(xdata);
+    }
+    ec_combine(cbk, NULL);
+
+out:
+    /* 'fop' is only set once all the validations have passed. */
+    if (fop != NULL) {
+        ec_complete(fop);
+    }
+
+    return 0;
+}
+
+/* Send the 'access' request of 'fop' to the subvolume with index 'idx'.
+ * The index is passed as the cookie, which is how ec_access_cbk() gets it
+ * back when the answer arrives. */
+void
+ec_wind_access(ec_t *ec, ec_fop_data_t *fop, int32_t idx)
+{
+    ec_trace("WIND", fop, "idx=%d", idx);
+
+    STACK_WIND_COOKIE(fop->frame, ec_access_cbk, (void *)(uintptr_t)idx,
+                      ec->xl_list[idx], ec->xl_list[idx]->fops->access,
+                      &fop->loc[0], fop->int32, fop->xdata);
+}
+
+/* State machine of the 'access' fop. It executes the actions of 'state' and
+ * returns the next state to process:
+ *
+ *   INIT/LOCK      -> prepare and request the inode lock -> DISPATCH
+ *   DISPATCH       -> send the request with ec_dispatch_one()
+ *                     -> PREPARE_ANSWER
+ *   PREPARE_ANSWER -> back to DISPATCH if ec_dispatch_one_retry() asks for
+ *                     it, REPORT otherwise
+ *   REPORT         -> call the fop callback with the answer -> LOCK_REUSE
+ *   LOCK_REUSE     -> ec_lock_reuse() -> UNLOCK
+ *   UNLOCK         -> ec_unlock() -> END
+ *
+ * The negative states up to -REPORT call the fop callback with a failure
+ * (op_ret -1 and fop->error) and continue with LOCK_REUSE. */
+int32_t
+ec_manager_access(ec_fop_data_t *fop, int32_t state)
+{
+    ec_cbk_data_t *cbk = NULL;
+
+    switch (state) {
+        case EC_STATE_INIT:
+        case EC_STATE_LOCK:
+            ec_lock_prepare_inode(fop, &fop->loc[0], EC_QUERY_INFO, 0,
+                                  EC_RANGE_FULL);
+            ec_lock(fop);
+
+            return EC_STATE_DISPATCH;
+
+        case EC_STATE_DISPATCH:
+            ec_dispatch_one(fop);
+
+            return EC_STATE_PREPARE_ANSWER;
+
+        case EC_STATE_PREPARE_ANSWER:
+            if (ec_dispatch_one_retry(fop, NULL)) {
+                return EC_STATE_DISPATCH;
+            }
+
+            return EC_STATE_REPORT;
+
+        case EC_STATE_REPORT:
+            cbk = fop->answer;
+            GF_ASSERT(cbk);
+            /* The callback is skipped if there's no answer, even though the
+             * assertion above expects one. */
+            if (fop->cbks.access != NULL) {
+                if (cbk) {
+                    fop->cbks.access(fop->req_frame, fop, fop->xl, cbk->op_ret,
+                                     cbk->op_errno, cbk->xdata);
+                }
+            }
+            return EC_STATE_LOCK_REUSE;
+
+        case -EC_STATE_INIT:
+        case -EC_STATE_LOCK:
+        case -EC_STATE_DISPATCH:
+        case -EC_STATE_PREPARE_ANSWER:
+        case -EC_STATE_REPORT:
+            if (fop->cbks.access != NULL) {
+                fop->cbks.access(fop->req_frame, fop, fop->xl, -1, fop->error,
+                                 NULL);
+            }
+            return EC_STATE_LOCK_REUSE;
+
+        case -EC_STATE_LOCK_REUSE:
+        case EC_STATE_LOCK_REUSE:
+            ec_lock_reuse(fop);
+
+            return EC_STATE_UNLOCK;
+
+        case -EC_STATE_UNLOCK:
+        case EC_STATE_UNLOCK:
+            ec_unlock(fop);
+
+            return EC_STATE_END;
+
+        default:
+            gf_msg(fop->xl->name, GF_LOG_ERROR, EINVAL, EC_MSG_UNHANDLED_STATE,
+                   "Unhandled state %d for %s", state, ec_fop_name(fop->id));
+
+            return EC_STATE_END;
+    }
+}
+
+/* Entry point of the 'access' fop.
+ *
+ * A fop object is allocated and filled with copies/references of 'loc',
+ * 'mask' and 'xdata', and then handed to ec_manager() together with the
+ * error found while preparing it (0 if none). If the fop object cannot be
+ * created, 'func' is called directly with a failure. */
+void
+ec_access(call_frame_t *frame, xlator_t *this, uintptr_t target,
+          uint32_t fop_flags, fop_access_cbk_t func, void *data, loc_t *loc,
+          int32_t mask, dict_t *xdata)
+{
+    ec_cbk_t callback = {.access = func};
+    ec_fop_data_t *fop = NULL;
+    int32_t error = ENOMEM;
+
+    gf_msg_trace("ec", 0, "EC(ACCESS) %p", frame);
+
+    VALIDATE_OR_GOTO(this, out);
+    GF_VALIDATE_OR_GOTO(this->name, frame, out);
+    GF_VALIDATE_OR_GOTO(this->name, this->private, out);
+
+    fop = ec_fop_data_allocate(frame, this, GF_FOP_ACCESS, EC_FLAG_LOCK_SHARED,
+                               target, fop_flags, ec_wind_access,
+                               ec_manager_access, callback, data);
+    if (fop == NULL) {
+        goto out;
+    }
+
+    fop->int32 = mask;
+
+    if ((loc != NULL) && (loc_copy(&fop->loc[0], loc) != 0)) {
+        gf_msg(this->name, GF_LOG_ERROR, ENOMEM, EC_MSG_LOC_COPY_FAIL,
+               "Failed to copy a location.");
+
+        goto out;
+    }
+
+    if (xdata != NULL) {
+        fop->xdata = dict_ref(xdata);
+        if (fop->xdata == NULL) {
+            gf_msg(this->name, GF_LOG_ERROR, 0, EC_MSG_DICT_REF_FAIL,
+                   "Failed to reference a "
+                   "dictionary.");
+
+            goto out;
+        }
+    }
+
+    error = 0;
+
+out:
+    if (fop == NULL) {
+        func(frame, NULL, this, -1, error, NULL);
+
+        return;
+    }
+
+    ec_manager(fop, error);
+}
+
+/* FOP: getxattr */
+
+/* Combine function for getxattr answers: two answers can be combined only
+ * if their dictionaries compare equal. Returns 1 if they can be combined,
+ * 0 (after logging a notice) otherwise. */
+int32_t
+ec_combine_getxattr(ec_fop_data_t *fop, ec_cbk_data_t *dst, ec_cbk_data_t *src)
+{
+    if (ec_dict_compare(dst->dict, src->dict)) {
+        return 1;
+    }
+
+    gf_msg(fop->xl->name, GF_LOG_NOTICE, 0, EC_MSG_DICT_MISMATCH,
+           "Mismatching dictionary in "
+           "answers of 'GF_FOP_GETXATTR'");
+
+    return 0;
+}
+
+/* Callback of a 'getxattr' request sent to one subvolume. 'cookie' carries
+ * the index of the subvolume. The answer (with references to 'dict' and
+ * 'xdata') is recorded and combined with the other answers of the fop, and
+ * the fop is notified of the completion. Always returns 0. */
+int32_t
+ec_getxattr_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+                int32_t op_ret, int32_t op_errno, dict_t *dict, dict_t *xdata)
+{
+    ec_fop_data_t *fop = NULL;
+    ec_cbk_data_t *cbk = NULL;
+    int32_t idx = (int32_t)(uintptr_t)cookie;
+
+    VALIDATE_OR_GOTO(this, out);
+    GF_VALIDATE_OR_GOTO(this->name, frame, out);
+    GF_VALIDATE_OR_GOTO(this->name, frame->local, out);
+    GF_VALIDATE_OR_GOTO(this->name, this->private, out);
+
+    fop = frame->local;
+
+    ec_trace("CBK", fop, "idx=%d, frame=%p, op_ret=%d, op_errno=%d", idx, frame,
+             op_ret, op_errno);
+
+    cbk = ec_cbk_data_allocate(frame, this, fop, GF_FOP_GETXATTR, idx, op_ret,
+                               op_errno);
+    if (cbk == NULL) {
+        goto out;
+    }
+
+    /* The returned dictionary is only kept for successful answers. */
+    if ((op_ret >= 0) && (dict != NULL)) {
+        cbk->dict = dict_ref(dict);
+        if (cbk->dict == NULL) {
+            gf_msg(this->name, GF_LOG_ERROR, 0, EC_MSG_DICT_REF_FAIL,
+                   "Failed to reference a "
+                   "dictionary.");
+
+            goto out;
+        }
+    }
+
+    if (xdata != NULL) {
+        cbk->xdata = dict_ref(xdata);
+        if (cbk->xdata == NULL) {
+            gf_msg(this->name, GF_LOG_ERROR, 0, EC_MSG_DICT_REF_FAIL,
+                   "Failed to reference a "
+                   "dictionary.");
+
+            goto out;
+        }
+    }
+
+    ec_combine(cbk, ec_combine_getxattr);
+
+out:
+    /* 'fop' is only set once all the validations have passed. */
+    if (fop != NULL) {
+        ec_complete(fop);
+    }
+
+    return 0;
+}
+
+/* Send the 'getxattr' request of 'fop' (location fop->loc[0], xattr name
+ * fop->str[0]) to the subvolume with index 'idx'. The index is passed as the
+ * cookie, which is how ec_getxattr_cbk() gets it back when the answer
+ * arrives. */
+void
+ec_wind_getxattr(ec_t *ec, ec_fop_data_t *fop, int32_t idx)
+{
+    ec_trace("WIND", fop, "idx=%d", idx);
+
+    STACK_WIND_COOKIE(fop->frame, ec_getxattr_cbk, (void *)(uintptr_t)idx,
+                      ec->xl_list[idx], ec->xl_list[idx]->fops->getxattr,
+                      &fop->loc[0], fop->str[0], fop->xdata);
+}
+
+/* Special handling of xattrs matching GF_XATTR_STIME_PATTERN.
+ *
+ * Stime may not be available on all the bricks, so if the fop has no
+ * successful answer, the first successful answer found in the list of
+ * received answers (if any) is used as the answer of the fop. */
+void
+ec_handle_special_xattrs(ec_fop_data_t *fop)
+{
+    ec_cbk_data_t *cbk = NULL;
+
+    /* Nothing to do for unnamed requests or other xattrs. */
+    if ((fop->str[0] == NULL) ||
+        (fnmatch(GF_XATTR_STIME_PATTERN, fop->str[0], 0) != 0)) {
+        return;
+    }
+
+    /* A successful answer is already selected. */
+    if ((fop->answer != NULL) && (fop->answer->op_ret >= 0)) {
+        return;
+    }
+
+    list_for_each_entry(cbk, &fop->cbk_list, list)
+    {
+        if (cbk->op_ret >= 0) {
+            fop->answer = cbk;
+            break;
+        }
+    }
+}
+
+/* State machine of the 'getxattr' and 'fgetxattr' fops (both entry points
+ * register this manager). It executes the actions of 'state' and returns the
+ * next state to process:
+ *
+ *   INIT/LOCK      -> request the inode/fd lock (except for clear-locks
+ *                     commands) -> DISPATCH
+ *   DISPATCH       -> send the request to all subvolumes or to only one,
+ *                     depending on fop->minimum -> PREPARE_ANSWER
+ *   PREPARE_ANSWER -> select the answer, combine its dictionaries and remove
+ *                     the internal xattrs -> REPORT (or back to DISPATCH if
+ *                     ec_dispatch_one_retry() asks for it)
+ *   REPORT         -> call the fop callback with the answer -> LOCK_REUSE
+ *   LOCK_REUSE     -> ec_lock_reuse() -> UNLOCK
+ *   UNLOCK         -> ec_unlock() -> END
+ *
+ * The negative states up to -REPORT call the fop callback with a failure
+ * (op_ret -1 and fop->error) and continue with LOCK_REUSE. */
+int32_t
+ec_manager_getxattr(ec_fop_data_t *fop, int32_t state)
+{
+    ec_cbk_data_t *cbk;
+
+    switch (state) {
+        case EC_STATE_INIT:
+        case EC_STATE_LOCK:
+            /* clear-locks commands must be done without any locks acquired
+               to avoid interferences. */
+            if ((fop->str[0] == NULL) ||
+                (strncmp(fop->str[0], GF_XATTR_CLRLK_CMD,
+                         SLEN(GF_XATTR_CLRLK_CMD)) != 0)) {
+                if (fop->fd == NULL) {
+                    ec_lock_prepare_inode(fop, &fop->loc[0], EC_QUERY_INFO, 0,
+                                          EC_RANGE_FULL);
+                } else {
+                    ec_lock_prepare_fd(fop, fop->fd, EC_QUERY_INFO, 0,
+                                       EC_RANGE_FULL);
+                }
+                ec_lock(fop);
+            }
+
+            return EC_STATE_DISPATCH;
+
+        case EC_STATE_DISPATCH:
+            if (fop->minimum == EC_MINIMUM_ALL) {
+                ec_dispatch_all(fop);
+            } else {
+                ec_dispatch_one(fop);
+            }
+
+            return EC_STATE_PREPARE_ANSWER;
+
+        case EC_STATE_PREPARE_ANSWER:
+            ec_handle_special_xattrs(fop);
+            /* Both branches are expected to assign 'cbk', which has no
+             * initial value (NOTE(review): confirm ec_dispatch_one_retry()
+             * always sets it when it returns false). */
+            if (fop->minimum == EC_MINIMUM_ALL) {
+                cbk = ec_fop_prepare_answer(fop, _gf_true);
+            } else {
+                if (ec_dispatch_one_retry(fop, &cbk)) {
+                    return EC_STATE_DISPATCH;
+                }
+            }
+            if (cbk != NULL) {
+                int32_t err;
+
+                err = ec_dict_combine(cbk, EC_COMBINE_DICT);
+                /* Internal xattrs are only filtered when no error has been
+                 * set on the answer. */
+                if (!ec_cbk_set_error(cbk, -err, _gf_true)) {
+                    if (cbk->xdata != NULL)
+                        ec_filter_internal_xattrs(cbk->xdata);
+
+                    if (cbk->dict != NULL)
+                        ec_filter_internal_xattrs(cbk->dict);
+                }
+            }
+
+            return EC_STATE_REPORT;
+
+        case EC_STATE_REPORT:
+            cbk = fop->answer;
+
+            GF_ASSERT(cbk != NULL);
+
+            if (fop->cbks.getxattr != NULL) {
+                fop->cbks.getxattr(fop->req_frame, fop, fop->xl, cbk->op_ret,
+                                   cbk->op_errno, cbk->dict, cbk->xdata);
+            }
+
+            return EC_STATE_LOCK_REUSE;
+
+        case -EC_STATE_INIT:
+        case -EC_STATE_LOCK:
+        case -EC_STATE_DISPATCH:
+        case -EC_STATE_PREPARE_ANSWER:
+        case -EC_STATE_REPORT:
+            GF_ASSERT(fop->error != 0);
+
+            if (fop->cbks.getxattr != NULL) {
+                fop->cbks.getxattr(fop->req_frame, fop, fop->xl, -1, fop->error,
+                                   NULL, NULL);
+            }
+
+            return EC_STATE_LOCK_REUSE;
+
+        case -EC_STATE_LOCK_REUSE:
+        case EC_STATE_LOCK_REUSE:
+            ec_lock_reuse(fop);
+
+            return EC_STATE_UNLOCK;
+
+        case -EC_STATE_UNLOCK:
+        case EC_STATE_UNLOCK:
+            ec_unlock(fop);
+
+            return EC_STATE_END;
+
+        default:
+            gf_msg(fop->xl->name, GF_LOG_ERROR, EINVAL, EC_MSG_UNHANDLED_STATE,
+                   "Unhandled state %d for %s", state, ec_fop_name(fop->id));
+
+            return EC_STATE_END;
+    }
+}
+
+/* Callback of the heal operation started by a getxattr request of
+ * EC_XATTR_HEAL. 'cookie' is the getxattr callback of the original caller.
+ *
+ * On success, the result of the heal is reported to that callback as a
+ * dictionary containing a single EC_XATTR_HEAL entry with a textual
+ * description of the good and bad subvolume masks. 'pending' is returned
+ * through xdata (EC_XATTR_HEAL_NEW) on a best-effort basis. Always returns
+ * 0. */
+int32_t
+ec_getxattr_heal_cbk(call_frame_t *frame, void *cookie, xlator_t *xl,
+                     int32_t op_ret, int32_t op_errno, uintptr_t mask,
+                     uintptr_t good, uintptr_t bad, uint32_t pending,
+                     dict_t *xdata)
+{
+    fop_getxattr_cbk_t func = cookie;
+    ec_t *ec = xl->private;
+    dict_t *dict = NULL;
+    char *str;
+    char bin1[65], bin2[65];
+
+    /* We try to return the 'pending' information in xdata, but if this cannot
+     * be set, we will ignore it silently. We prefer to report the success or
+     * failure of the heal itself. */
+    if (xdata != NULL) {
+        dict_ref(xdata);
+    } else {
+        xdata = dict_new();
+    }
+    if (xdata != NULL) {
+        if (dict_set_uint32(xdata, EC_XATTR_HEAL_NEW, pending) != 0) {
+            /* dict_set_uint32() is marked as 'warn_unused_result' and gcc
+             * enforces to check the result in this case. However we don't
+             * really care if it succeeded or not. We'll just do the same.
+             *
+             * This empty 'if' avoids the warning, and it will be removed by
+             * the optimizer. */
+        }
+    }
+
+    if (op_ret >= 0) {
+        gf_boolean_t failed = _gf_true;
+
+        dict = dict_new();
+        if (dict != NULL) {
+            if (gf_asprintf(&str, "Good: %s, Bad: %s",
+                            ec_bin(bin1, sizeof(bin1), good, ec->nodes),
+                            ec_bin(bin2, sizeof(bin2), mask & ~(good | bad),
+                                   ec->nodes)) >= 0) {
+                if (dict_set_dynstr(dict, EC_XATTR_HEAL, str) == 0) {
+                    failed = _gf_false;
+                } else {
+                    GF_FREE(str);
+                }
+            }
+            if (failed) {
+                dict_unref(dict);
+                dict = NULL;
+            }
+        }
+
+        /* Any failure while building the answer is reported as ENOMEM. */
+        if (failed) {
+            op_ret = -1;
+            op_errno = ENOMEM;
+        }
+    }
+
+    func(frame, NULL, xl, op_ret, op_errno, dict, xdata);
+
+    if (dict != NULL) {
+        dict_unref(dict);
+    }
+    if (xdata != NULL) {
+        dict_unref(xdata);
+    }
+
+    return 0;
+}
+
+/* Entry point of the 'getxattr' fop.
+ *
+ * A request of the EC_XATTR_HEAL xattr is converted into an explicit heal
+ * (see ec_getxattr_heal_cbk()). For any other request a fop object is
+ * allocated and filled with copies/references of 'loc', 'name' and 'xdata',
+ * and then handed to ec_manager() together with the error found while
+ * preparing it (0 if none). If the fop object cannot be created, 'func' is
+ * called directly with a failure. */
+void
+ec_getxattr(call_frame_t *frame, xlator_t *this, uintptr_t target,
+            uint32_t fop_flags, fop_getxattr_cbk_t func, void *data, loc_t *loc,
+            const char *name, dict_t *xdata)
+{
+    ec_cbk_t callback = {.getxattr = func};
+    ec_fop_data_t *fop = NULL;
+    int32_t error = ENOMEM;
+
+    gf_msg_trace("ec", 0, "EC(GETXATTR) %p", frame);
+
+    VALIDATE_OR_GOTO(this, out);
+    GF_VALIDATE_OR_GOTO(this->name, frame, out);
+    GF_VALIDATE_OR_GOTO(this->name, this->private, out);
+
+    /* Special handling of an explicit self-heal request */
+    if ((name != NULL) && (strcmp(name, EC_XATTR_HEAL) == 0)) {
+        ec_heal(frame, this, target, EC_MINIMUM_ONE, ec_getxattr_heal_cbk, func,
+                loc, 0, NULL);
+
+        return;
+    }
+
+    fop = ec_fop_data_allocate(
+        frame, this, GF_FOP_GETXATTR, EC_FLAG_LOCK_SHARED, target, fop_flags,
+        ec_wind_getxattr, ec_manager_getxattr, callback, data);
+    if (fop == NULL) {
+        goto out;
+    }
+
+    if ((loc != NULL) && (loc_copy(&fop->loc[0], loc) != 0)) {
+        gf_msg(this->name, GF_LOG_ERROR, ENOMEM, EC_MSG_LOC_COPY_FAIL,
+               "Failed to copy a location.");
+
+        goto out;
+    }
+
+    if (name != NULL) {
+        const char *wind_name = name;
+
+        /* In case of list-node-uuids xattr, set flag to indicate
+         * the same and use node-uuid xattr for winding fop */
+        if (XATTR_IS_NODE_UUID_LIST(name)) {
+            fop->int32 = 1;
+            wind_name = GF_XATTR_NODE_UUID_KEY;
+        }
+
+        fop->str[0] = gf_strdup(wind_name);
+        if (fop->str[0] == NULL) {
+            gf_msg(this->name, GF_LOG_ERROR, ENOMEM, EC_MSG_NO_MEMORY,
+                   "Failed to duplicate a string.");
+
+            goto out;
+        }
+    }
+
+    if (xdata != NULL) {
+        fop->xdata = dict_ref(xdata);
+        if (fop->xdata == NULL) {
+            gf_msg(this->name, GF_LOG_ERROR, 0, EC_MSG_DICT_REF_FAIL,
+                   "Failed to reference a "
+                   "dictionary.");
+
+            goto out;
+        }
+    }
+
+    error = 0;
+
+out:
+    if (fop == NULL) {
+        func(frame, NULL, this, -1, error, NULL, NULL);
+
+        return;
+    }
+
+    ec_manager(fop, error);
+}
+
+/* FOP: fgetxattr */
+
+/* Callback of an 'fgetxattr' request sent to one subvolume. 'cookie'
+ * carries the index of the subvolume. The answer (with references to 'dict'
+ * and 'xdata') is recorded and combined with the other answers of the fop,
+ * and the fop is notified of the completion. Always returns 0. */
+int32_t
+ec_fgetxattr_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+                 int32_t op_ret, int32_t op_errno, dict_t *dict, dict_t *xdata)
+{
+    ec_fop_data_t *fop = NULL;
+    ec_cbk_data_t *cbk = NULL;
+    int32_t idx = (int32_t)(uintptr_t)cookie;
+
+    VALIDATE_OR_GOTO(this, out);
+    GF_VALIDATE_OR_GOTO(this->name, frame, out);
+    GF_VALIDATE_OR_GOTO(this->name, frame->local, out);
+    GF_VALIDATE_OR_GOTO(this->name, this->private, out);
+
+    fop = frame->local;
+
+    ec_trace("CBK", fop, "idx=%d, frame=%p, op_ret=%d, op_errno=%d", idx, frame,
+             op_ret, op_errno);
+
+    cbk = ec_cbk_data_allocate(frame, this, fop, GF_FOP_FGETXATTR, idx, op_ret,
+                               op_errno);
+    if (cbk == NULL) {
+        goto out;
+    }
+
+    /* The returned dictionary is only kept for successful answers. */
+    if ((op_ret >= 0) && (dict != NULL)) {
+        cbk->dict = dict_ref(dict);
+        if (cbk->dict == NULL) {
+            gf_msg(this->name, GF_LOG_ERROR, 0, EC_MSG_DICT_REF_FAIL,
+                   "Failed to reference a "
+                   "dictionary.");
+
+            goto out;
+        }
+    }
+
+    if (xdata != NULL) {
+        cbk->xdata = dict_ref(xdata);
+        if (cbk->xdata == NULL) {
+            gf_msg(this->name, GF_LOG_ERROR, 0, EC_MSG_DICT_REF_FAIL,
+                   "Failed to reference a "
+                   "dictionary.");
+
+            goto out;
+        }
+    }
+
+    ec_combine(cbk, ec_combine_getxattr);
+
+out:
+    /* 'fop' is only set once all the validations have passed. */
+    if (fop != NULL) {
+        ec_complete(fop);
+    }
+
+    return 0;
+}
+
+/* Send the 'fgetxattr' request of 'fop' (descriptor fop->fd, xattr name
+ * fop->str[0]) to the subvolume with index 'idx'. The index is passed as the
+ * cookie, which is how ec_fgetxattr_cbk() gets it back when the answer
+ * arrives. */
+void
+ec_wind_fgetxattr(ec_t *ec, ec_fop_data_t *fop, int32_t idx)
+{
+    ec_trace("WIND", fop, "idx=%d", idx);
+
+    STACK_WIND_COOKIE(fop->frame, ec_fgetxattr_cbk, (void *)(uintptr_t)idx,
+                      ec->xl_list[idx], ec->xl_list[idx]->fops->fgetxattr,
+                      fop->fd, fop->str[0], fop->xdata);
+}
+
+/* Entry point of the 'fgetxattr' fop.
+ *
+ * A fop object is allocated and filled with references/copies of 'fd',
+ * 'name' and 'xdata', and then handed to ec_manager() together with the
+ * error found while preparing it (0 if none). If the fop object cannot be
+ * created, 'func' is called directly with a failure. */
+void
+ec_fgetxattr(call_frame_t *frame, xlator_t *this, uintptr_t target,
+             uint32_t fop_flags, fop_fgetxattr_cbk_t func, void *data, fd_t *fd,
+             const char *name, dict_t *xdata)
+{
+    ec_cbk_t callback = {.fgetxattr = func};
+    ec_fop_data_t *fop = NULL;
+    int32_t error = ENOMEM;
+
+    gf_msg_trace("ec", 0, "EC(FGETXATTR) %p", frame);
+
+    VALIDATE_OR_GOTO(this, out);
+    GF_VALIDATE_OR_GOTO(this->name, frame, out);
+    GF_VALIDATE_OR_GOTO(this->name, this->private, out);
+
+    fop = ec_fop_data_allocate(
+        frame, this, GF_FOP_FGETXATTR, EC_FLAG_LOCK_SHARED, target, fop_flags,
+        ec_wind_fgetxattr, ec_manager_getxattr, callback, data);
+    if (fop == NULL) {
+        goto out;
+    }
+
+    fop->use_fd = 1;
+
+    if (fd != NULL) {
+        fop->fd = fd_ref(fd);
+        if (fop->fd == NULL) {
+            gf_msg(this->name, GF_LOG_ERROR, 0, EC_MSG_FILE_DESC_REF_FAIL,
+                   "Failed to reference a "
+                   "file descriptor.");
+
+            goto out;
+        }
+    }
+
+    if (name != NULL) {
+        fop->str[0] = gf_strdup(name);
+        if (fop->str[0] == NULL) {
+            gf_msg(this->name, GF_LOG_ERROR, ENOMEM, EC_MSG_NO_MEMORY,
+                   "Failed to duplicate a string.");
+
+            goto out;
+        }
+    }
+
+    if (xdata != NULL) {
+        fop->xdata = dict_ref(xdata);
+        if (fop->xdata == NULL) {
+            gf_msg(this->name, GF_LOG_ERROR, 0, EC_MSG_DICT_REF_FAIL,
+                   "Failed to reference a "
+                   "dictionary.");
+
+            goto out;
+        }
+    }
+
+    error = 0;
+
+out:
+    if (fop == NULL) {
+        func(frame, NULL, this, -1, error, NULL, NULL);
+
+        return;
+    }
+
+    ec_manager(fop, error);
+}
+
+/* FOP: open */
+
+/*
+ * Combine function used for OPEN answers (see ec_open_cbk).
+ *
+ * Two answers are considered equivalent only when both carry the very same
+ * fd pointer. Returns 1 when they match; otherwise logs the mismatch and
+ * returns 0.
+ */
+int32_t
+ec_combine_open(ec_fop_data_t *fop, ec_cbk_data_t *dst, ec_cbk_data_t *src)
+{
+    if (dst->fd == src->fd) {
+        return 1;
+    }
+
+    gf_msg(fop->xl->name, GF_LOG_NOTICE, 0, EC_MSG_FD_MISMATCH,
+           "Mismatching fd in answers "
+           "of 'GF_FOP_OPEN': %p <-> %p",
+           dst->fd, src->fd);
+
+    return 0;
+}
+
+/*
+ * Callback of the OPEN requests wound by ec_wind_open.
+ *
+ * 'cookie' carries the index of the answering subvolume. The answer is
+ * stored in a newly allocated ec_cbk_data_t (with references to 'fd' on
+ * success and to 'xdata' when present), combined with the other answers
+ * using ec_combine_open, and the fd status is updated for that subvolume.
+ *
+ * ec_complete() is called on the fop whenever the fop could be recovered
+ * from the frame, even if the answer could not be recorded. Always
+ * returns 0.
+ */
+int32_t
+ec_open_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret,
+            int32_t op_errno, fd_t *fd, dict_t *xdata)
+{
+    int32_t idx = (int32_t)(uintptr_t)cookie;
+    ec_fop_data_t *fop = NULL;
+    ec_cbk_data_t *cbk = NULL;
+
+    VALIDATE_OR_GOTO(this, out);
+    GF_VALIDATE_OR_GOTO(this->name, frame, out);
+    GF_VALIDATE_OR_GOTO(this->name, frame->local, out);
+    GF_VALIDATE_OR_GOTO(this->name, this->private, out);
+
+    fop = frame->local;
+
+    ec_trace("CBK", fop, "idx=%d, frame=%p, op_ret=%d, op_errno=%d", idx, frame,
+             op_ret, op_errno);
+
+    cbk = ec_cbk_data_allocate(frame, this, fop, GF_FOP_OPEN, idx, op_ret,
+                               op_errno);
+    if (cbk == NULL) {
+        goto out;
+    }
+
+    /* The fd is only meaningful for successful answers. */
+    if ((op_ret >= 0) && (fd != NULL)) {
+        cbk->fd = fd_ref(fd);
+        if (cbk->fd == NULL) {
+            gf_msg(this->name, GF_LOG_ERROR, 0, EC_MSG_FILE_DESC_REF_FAIL,
+                   "Failed to reference a "
+                   "file descriptor.");
+
+            goto out;
+        }
+    }
+
+    if (xdata != NULL) {
+        cbk->xdata = dict_ref(xdata);
+        if (cbk->xdata == NULL) {
+            gf_msg(this->name, GF_LOG_ERROR, 0, EC_MSG_DICT_REF_FAIL,
+                   "Failed to reference a "
+                   "dictionary.");
+
+            goto out;
+        }
+    }
+
+    ec_combine(cbk, ec_combine_open);
+
+    ec_update_fd_status(fd, this, idx, op_ret);
+
+out:
+    if (fop != NULL) {
+        ec_complete(fop);
+    }
+
+    return 0;
+}
+
+/*
+ * Wind function of the OPEN fop: sends the open request described by 'fop'
+ * (location fop->loc[0], flags fop->int32, fd fop->fd) to the subvolume
+ * stored at position 'idx' of ec->xl_list.
+ *
+ * The subvolume index travels in the cookie, packed into a pointer through a
+ * uintptr_t cast; ec_open_cbk unpacks it with the inverse cast.
+ */
+void
+ec_wind_open(ec_t *ec, ec_fop_data_t *fop, int32_t idx)
+{
+    ec_trace("WIND", fop, "idx=%d", idx);
+
+    STACK_WIND_COOKIE(fop->frame, ec_open_cbk, (void *)(uintptr_t)idx,
+                      ec->xl_list[idx], ec->xl_list[idx]->fops->open,
+                      &fop->loc[0], fop->int32, fop->fd, fop->xdata);
+}
+
+/*
+ * Completion callback of the ftruncate that ec_manager_open issues when the
+ * open request carried O_TRUNC.
+ *
+ * 'cookie' is an ec_fop_data_t whose 'data' field points to the open fop
+ * (ec_manager_open passes the open fop as the 'data' argument of
+ * ec_ftruncate). On success the post-operation iatt is copied into the
+ * answer of the open fop; on failure op_errno is used as the error. In both
+ * cases the open fop is resumed with ec_resume(). Always returns 0.
+ */
+int32_t
+ec_open_truncate_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+                     int32_t op_ret, int32_t op_errno, struct iatt *prebuf,
+                     struct iatt *postbuf, dict_t *xdata)
+{
+    ec_fop_data_t *truncate_fop = cookie;
+    ec_fop_data_t *open_fop = truncate_fop->data;
+    int32_t error;
+
+    if (op_ret < 0) {
+        error = op_errno;
+    } else {
+        open_fop->answer->iatt[0] = *postbuf;
+        error = 0;
+    }
+
+    ec_resume(open_fop, error);
+
+    return 0;
+}
+
+/*
+ * State machine of the OPEN fop.
+ *
+ * Receives the state to execute and returns the state to execute next.
+ * Positive states implement the normal path; the negated states share a
+ * single handler that reports fop->error to the caller.
+ */
+int32_t
+ec_manager_open(ec_fop_data_t *fop, int32_t state)
+{
+    ec_cbk_data_t *cbk;
+    ec_fd_t *ctx;
+    int32_t err;
+
+    switch (state) {
+        case EC_STATE_INIT:
+            /* The fd context is read and modified under the fd lock. Every
+             * exit from this section must release it. */
+            LOCK(&fop->fd->lock);
+
+            ctx = __ec_fd_get(fop->fd, fop->xl);
+            if (ctx == NULL) {
+                UNLOCK(&fop->fd->lock);
+
+                fop->error = ENOMEM;
+
+                return EC_STATE_REPORT;
+            }
+            /* Fill the location of the fd context only if it has no inode
+             * yet. */
+            if (!ctx->loc.inode) {
+                err = ec_loc_from_loc(fop->xl, &ctx->loc, &fop->loc[0]);
+                if (err != 0) {
+                    UNLOCK(&fop->fd->lock);
+
+                    /* 'err' is negated before being stored as the fop
+                     * error. */
+                    fop->error = -err;
+
+                    return EC_STATE_REPORT;
+                }
+            }
+
+            /* Remember the flags as requested by the caller, before they are
+             * filtered below. */
+            ctx->flags = fop->int32;
+
+            UNLOCK(&fop->fd->lock);
+
+            /* We need to write to specific offsets on the bricks, so we
+               need to remove O_APPEND from flags (if present).
+               If O_TRUNC is specified, we remove it from open and an
+               ftruncate will be executed later, which will correctly update
+               the file size taking appropriate locks. O_TRUNC flag is saved
+               into fop->uint32 to use it later.*/
+            fop->uint32 = fop->int32 & O_TRUNC;
+            fop->int32 &= ~(O_APPEND | O_TRUNC);
+
+            /* Fall through */
+
+        case EC_STATE_DISPATCH:
+            ec_dispatch_all(fop);
+
+            return EC_STATE_PREPARE_ANSWER;
+
+        case EC_STATE_PREPARE_ANSWER:
+            cbk = ec_fop_prepare_answer(fop, _gf_true);
+            if (cbk != NULL) {
+                /* NOTE(review): this declaration shadows the function-level
+                 * 'err'. */
+                int32_t err;
+
+                err = ec_loc_update(fop->xl, &fop->loc[0], cbk->fd->inode,
+                                    NULL);
+                /* The block below only runs when ec_cbk_set_error() returns
+                 * zero. */
+                if (!ec_cbk_set_error(cbk, -err, _gf_true)) {
+                    LOCK(&fop->fd->lock);
+
+                    /* Record in the fd context the mask of the answer. */
+                    ctx = __ec_fd_get(fop->fd, fop->xl);
+                    if (ctx != NULL) {
+                        ctx->open |= cbk->mask;
+                    }
+
+                    UNLOCK(&fop->fd->lock);
+
+                    /* If O_TRUNC was specified, call ftruncate to
+                       effectively trunc the file with appropriate locks
+                       acquired. We don't use ctx->flags because self-heal
+                       can use the same fd with different flags. */
+                    if (fop->uint32 != 0) {
+                        /* ec_open_truncate_cbk calls ec_resume() on this fop
+                         * when the ftruncate completes. */
+                        ec_sleep(fop);
+                        ec_ftruncate(fop->req_frame, fop->xl, cbk->mask,
+                                     fop->minimum, ec_open_truncate_cbk, fop,
+                                     cbk->fd, 0, NULL);
+                    }
+                }
+            }
+
+            return EC_STATE_REPORT;
+
+        case EC_STATE_REPORT:
+            cbk = fop->answer;
+
+            GF_ASSERT(cbk != NULL);
+
+            /* The callback is optional. */
+            if (fop->cbks.open != NULL) {
+                fop->cbks.open(fop->req_frame, fop, fop->xl, cbk->op_ret,
+                               cbk->op_errno, cbk->fd, cbk->xdata);
+            }
+
+            return EC_STATE_END;
+
+        /* Failure path: report fop->error without fd nor xdata. */
+        case -EC_STATE_INIT:
+        case -EC_STATE_DISPATCH:
+        case -EC_STATE_PREPARE_ANSWER:
+        case -EC_STATE_REPORT:
+            GF_ASSERT(fop->error != 0);
+
+            if (fop->cbks.open != NULL) {
+                fop->cbks.open(fop->req_frame, fop, fop->xl, -1, fop->error,
+                               NULL, NULL);
+            }
+
+            return EC_STATE_END;
+
+        default:
+            gf_msg(fop->xl->name, GF_LOG_ERROR, EINVAL, EC_MSG_UNHANDLED_STATE,
+                   "Unhandled state %d for %s", state, ec_fop_name(fop->id));
+
+            return EC_STATE_END;
+    }
+}
+
+/*
+ * Entry point of the OPEN fop.
+ *
+ * Allocates the fop descriptor (wind function ec_wind_open, manager
+ * ec_manager_open), stores the open flags, a copy of 'loc' and references to
+ * 'fd' and 'xdata' in it, and hands it to ec_manager().
+ *
+ *   frame     - caller's frame, passed back to 'func'
+ *   this      - the translator instance; must have 'private' set
+ *   target    - forwarded as is to ec_fop_data_allocate()
+ *   fop_flags - forwarded as is to ec_fop_data_allocate()
+ *   func      - completion callback; may be NULL
+ *   data      - opaque pointer forwarded to ec_fop_data_allocate()
+ *   loc, flags, fd, xdata - arguments of the request; the pointers are only
+ *               copied/referenced when they are not NULL
+ *
+ * Error reporting: if validation or the fop allocation fails, 'func' is
+ * called directly with op_ret = -1 and op_errno = ENOMEM. If a later step
+ * fails, ec_manager() is called with ENOMEM instead of 0.
+ */
+void
+ec_open(call_frame_t *frame, xlator_t *this, uintptr_t target,
+        uint32_t fop_flags, fop_open_cbk_t func, void *data, loc_t *loc,
+        int32_t flags, fd_t *fd, dict_t *xdata)
+{
+    ec_cbk_t callback = {.open = func};
+    ec_fop_data_t *fop = NULL;
+    int32_t error = ENOMEM;
+
+    gf_msg_trace("ec", 0, "EC(OPEN) %p", frame);
+
+    VALIDATE_OR_GOTO(this, out);
+    GF_VALIDATE_OR_GOTO(this->name, frame, out);
+    GF_VALIDATE_OR_GOTO(this->name, this->private, out);
+
+    fop = ec_fop_data_allocate(frame, this, GF_FOP_OPEN, EC_FLAG_LOCK_SHARED,
+                               target, fop_flags, ec_wind_open, ec_manager_open,
+                               callback, data);
+    if (fop == NULL) {
+        goto out;
+    }
+
+    fop->int32 = flags;
+
+    if (loc != NULL) {
+        if (loc_copy(&fop->loc[0], loc) != 0) {
+            gf_msg(this->name, GF_LOG_ERROR, ENOMEM, EC_MSG_LOC_COPY_FAIL,
+                   "Failed to copy a location.");
+
+            goto out;
+        }
+    }
+    if (fd != NULL) {
+        fop->fd = fd_ref(fd);
+        if (fop->fd == NULL) {
+            gf_msg(this->name, GF_LOG_ERROR, 0, EC_MSG_FILE_DESC_REF_FAIL,
+                   "Failed to reference a "
+                   "file descriptor.");
+
+            goto out;
+        }
+    }
+    if (xdata != NULL) {
+        fop->xdata = dict_ref(xdata);
+        if (fop->xdata == NULL) {
+            gf_msg(this->name, GF_LOG_ERROR, 0, EC_MSG_DICT_REF_FAIL,
+                   "Failed to reference a "
+                   "dictionary.");
+
+            goto out;
+        }
+    }
+
+    error = 0;
+
+out:
+    if (fop != NULL) {
+        ec_manager(fop, error);
+    } else if (func != NULL) {
+        /* ec_manager_open only invokes the callback when it is not NULL;
+         * apply the same rule on this early failure path. */
+        func(frame, NULL, this, -1, error, NULL, NULL);
+    }
+}
+
+/* FOP: readlink */
+
+/*
+ * Combine function for READLINK answers: two answers are equivalent when
+ * ec_iatt_combine() accepts their first iatt.
+ *
+ * Returns 1 when they can be combined; otherwise logs the mismatch and
+ * returns 0.
+ */
+int32_t
+ec_combine_readlink(ec_fop_data_t *fop, ec_cbk_data_t *dst, ec_cbk_data_t *src)
+{
+    if (ec_iatt_combine(fop, dst->iatt, src->iatt, 1)) {
+        return 1;
+    }
+
+    gf_msg(fop->xl->name, GF_LOG_NOTICE, 0, EC_MSG_IATT_MISMATCH,
+           "Mismatching iatt in "
+           "answers of 'GF_FOP_READLINK'");
+
+    return 0;
+}
+
+/*
+ * Callback of the READLINK requests wound by ec_wind_readlink.
+ *
+ * 'cookie' carries the index of the answering subvolume. The answer is
+ * stored in a newly allocated ec_cbk_data_t: a reference to 'xdata' when
+ * present and, for successful answers, the iatt and a private copy of the
+ * link target. If the copy fails the answer is turned into an ENOMEM error.
+ * The answer is then passed to ec_combine() without a combine function.
+ *
+ * ec_complete() is called on the fop whenever the fop could be recovered
+ * from the frame. Always returns 0.
+ */
+int32_t
+ec_readlink_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+                int32_t op_ret, int32_t op_errno, const char *path,
+                struct iatt *buf, dict_t *xdata)
+{
+    int32_t idx = (int32_t)(uintptr_t)cookie;
+    ec_fop_data_t *fop = NULL;
+    ec_cbk_data_t *cbk = NULL;
+
+    VALIDATE_OR_GOTO(this, out);
+    GF_VALIDATE_OR_GOTO(this->name, frame, out);
+    GF_VALIDATE_OR_GOTO(this->name, frame->local, out);
+    GF_VALIDATE_OR_GOTO(this->name, this->private, out);
+
+    fop = frame->local;
+
+    ec_trace("CBK", fop, "idx=%d, frame=%p, op_ret=%d, op_errno=%d", idx, frame,
+             op_ret, op_errno);
+
+    cbk = ec_cbk_data_allocate(frame, this, fop, fop->id, idx, op_ret,
+                               op_errno);
+    if (cbk == NULL) {
+        goto out;
+    }
+
+    if (xdata != NULL) {
+        cbk->xdata = dict_ref(xdata);
+    }
+
+    if (cbk->op_ret >= 0) {
+        cbk->iatt[0] = *buf;
+        cbk->str = gf_strdup(path);
+        if (cbk->str == NULL) {
+            ec_cbk_set_error(cbk, ENOMEM, _gf_true);
+        }
+    }
+
+    ec_combine(cbk, NULL);
+
+out:
+    if (fop != NULL) {
+        ec_complete(fop);
+    }
+
+    return 0;
+}
+
+/*
+ * Wind function of the READLINK fop: sends the request described by 'fop'
+ * (location fop->loc[0], buffer size fop->size) to the subvolume stored at
+ * position 'idx' of ec->xl_list.
+ *
+ * The subvolume index travels in the cookie, packed into a pointer through a
+ * uintptr_t cast; ec_readlink_cbk unpacks it with the inverse cast.
+ */
+void
+ec_wind_readlink(ec_t *ec, ec_fop_data_t *fop, int32_t idx)
+{
+    ec_trace("WIND", fop, "idx=%d", idx);
+
+    STACK_WIND_COOKIE(fop->frame, ec_readlink_cbk, (void *)(uintptr_t)idx,
+                      ec->xl_list[idx], ec->xl_list[idx]->fops->readlink,
+                      &fop->loc[0], fop->size, fop->xdata);
+}
+
+/*
+ * State machine of the READLINK fop.
+ *
+ * Receives the state to execute and returns the state to execute next.
+ * Positive states implement the normal path; the negated states up to
+ * REPORT share a handler that reports fop->error. Both paths end with the
+ * LOCK_REUSE and UNLOCK states.
+ */
+int32_t
+ec_manager_readlink(ec_fop_data_t *fop, int32_t state)
+{
+    ec_cbk_data_t *cbk = NULL;
+
+    switch (state) {
+        case EC_STATE_INIT:
+        case EC_STATE_LOCK:
+            /* Prepare a lock on the inode of fop->loc[0] over the full
+             * range and request it. */
+            ec_lock_prepare_inode(fop, &fop->loc[0], EC_QUERY_INFO, 0,
+                                  EC_RANGE_FULL);
+            ec_lock(fop);
+            return EC_STATE_DISPATCH;
+
+        case EC_STATE_DISPATCH:
+            ec_dispatch_one(fop);
+
+            return EC_STATE_PREPARE_ANSWER;
+
+        case EC_STATE_PREPARE_ANSWER:
+            /* Go back to DISPATCH when ec_dispatch_one_retry() asks for a
+             * retry; otherwise 'cbk' holds the answer to use (may be NULL). */
+            if (ec_dispatch_one_retry(fop, &cbk)) {
+                return EC_STATE_DISPATCH;
+            }
+
+            if ((cbk != NULL) && (cbk->op_ret >= 0)) {
+                ec_iatt_rebuild(fop->xl->private, &cbk->iatt[0], 1, 1);
+            }
+
+            return EC_STATE_REPORT;
+
+        case EC_STATE_REPORT:
+            cbk = fop->answer;
+            GF_ASSERT(cbk);
+            /* The callback is optional. */
+            if (fop->cbks.readlink != NULL) {
+                fop->cbks.readlink(fop->req_frame, fop, fop->xl, cbk->op_ret,
+                                   cbk->op_errno, cbk->str, &cbk->iatt[0],
+                                   cbk->xdata);
+            }
+
+            return EC_STATE_LOCK_REUSE;
+
+        /* Failure path: report fop->error without any payload. */
+        case -EC_STATE_INIT:
+        case -EC_STATE_LOCK:
+        case -EC_STATE_DISPATCH:
+        case -EC_STATE_PREPARE_ANSWER:
+        case -EC_STATE_REPORT:
+            if (fop->cbks.readlink != NULL) {
+                fop->cbks.readlink(fop->req_frame, fop, fop->xl, -1, fop->error,
+                                   NULL, NULL, NULL);
+            }
+            return EC_STATE_LOCK_REUSE;
+
+        case -EC_STATE_LOCK_REUSE:
+        case EC_STATE_LOCK_REUSE:
+            ec_lock_reuse(fop);
+
+            return EC_STATE_UNLOCK;
+
+        case -EC_STATE_UNLOCK:
+        case EC_STATE_UNLOCK:
+            ec_unlock(fop);
+
+            return EC_STATE_END;
+        default:
+            gf_msg(fop->xl->name, GF_LOG_ERROR, EINVAL, EC_MSG_UNHANDLED_STATE,
+                   "Unhandled state %d for %s", state, ec_fop_name(fop->id));
+
+            return EC_STATE_END;
+    }
+}
+
+/*
+ * Entry point of the READLINK fop.
+ *
+ * Allocates the fop descriptor (wind function ec_wind_readlink, manager
+ * ec_manager_readlink), stores the requested size, a copy of 'loc' and a
+ * reference to 'xdata' in it, and hands it to ec_manager().
+ *
+ *   frame     - caller's frame, passed back to 'func'
+ *   this      - the translator instance; must have 'private' set
+ *   target    - forwarded as is to ec_fop_data_allocate()
+ *   fop_flags - forwarded as is to ec_fop_data_allocate()
+ *   func      - completion callback; may be NULL
+ *   data      - opaque pointer forwarded to ec_fop_data_allocate()
+ *   loc, size, xdata - arguments of the request; the pointers are only
+ *               copied/referenced when they are not NULL
+ *
+ * Error reporting: if validation or the fop allocation fails, 'func' is
+ * called directly with op_ret = -1 and op_errno = ENOMEM. If a later step
+ * fails, ec_manager() is called with ENOMEM instead of 0.
+ */
+void
+ec_readlink(call_frame_t *frame, xlator_t *this, uintptr_t target,
+            uint32_t fop_flags, fop_readlink_cbk_t func, void *data, loc_t *loc,
+            size_t size, dict_t *xdata)
+{
+    ec_cbk_t callback = {.readlink = func};
+    ec_fop_data_t *fop = NULL;
+    int32_t error = ENOMEM;
+
+    gf_msg_trace("ec", 0, "EC(READLINK) %p", frame);
+
+    VALIDATE_OR_GOTO(this, out);
+    GF_VALIDATE_OR_GOTO(this->name, frame, out);
+    GF_VALIDATE_OR_GOTO(this->name, this->private, out);
+
+    fop = ec_fop_data_allocate(
+        frame, this, GF_FOP_READLINK, EC_FLAG_LOCK_SHARED, target, fop_flags,
+        ec_wind_readlink, ec_manager_readlink, callback, data);
+    if (fop == NULL) {
+        goto out;
+    }
+
+    fop->size = size;
+
+    if (loc != NULL) {
+        if (loc_copy(&fop->loc[0], loc) != 0) {
+            gf_msg(this->name, GF_LOG_ERROR, ENOMEM, EC_MSG_LOC_COPY_FAIL,
+                   "Failed to copy a location.");
+
+            goto out;
+        }
+    }
+    if (xdata != NULL) {
+        fop->xdata = dict_ref(xdata);
+        if (fop->xdata == NULL) {
+            gf_msg(this->name, GF_LOG_ERROR, 0, EC_MSG_DICT_REF_FAIL,
+                   "Failed to reference a "
+                   "dictionary.");
+
+            goto out;
+        }
+    }
+
+    error = 0;
+
+out:
+    if (fop != NULL) {
+        ec_manager(fop, error);
+    } else if (func != NULL) {
+        /* ec_manager_readlink only invokes the callback when it is not NULL;
+         * apply the same rule on this early failure path. */
+        func(frame, NULL, this, -1, error, NULL, NULL, NULL);
+    }
+}
+
+/* FOP: readv */
+
+/*
+ * Rebuilds the data returned to the user for a READ from the fragments
+ * contained in the group of answers headed by 'cbk'.
+ *
+ * On success the vector, buffers, int32 (number of iovecs) and op_ret of
+ * 'cbk' are replaced by the decoded data, trimmed to what the user asked
+ * for and to the file size.
+ *
+ * Returns 0 on success or a negative errno value. A failed answer
+ * (op_ret < 0) is reported as -op_errno; an answer with op_ret == 0 only
+ * gets its file size refreshed.
+ */
+int32_t
+ec_readv_rebuild(ec_t *ec, ec_fop_data_t *fop, ec_cbk_data_t *cbk)
+{
+    struct iovec vector[1];
+    ec_cbk_data_t *ans = NULL;
+    struct iobref *iobref = NULL;
+    void *ptr;
+    uint64_t fsize = 0, size = 0, max = 0;
+    int32_t pos, err = -ENOMEM;
+
+    if (cbk->op_ret < 0) {
+        err = -cbk->op_errno;
+
+        goto out;
+    }
+
+    /* This shouldn't fail because we have the inode locked. */
+    /* NOTE(review): the call that fills ia_size lives inside GF_ASSERT();
+     * confirm the macro always evaluates its argument in every build. */
+    GF_ASSERT(ec_get_inode_size(fop, fop->fd->inode, &cbk->iatt[0].ia_size));
+
+    if (cbk->op_ret > 0) {
+        void *blocks[cbk->count];
+        uint32_t values[cbk->count];
+
+        /* fsize: bytes returned by each subvolume.
+         * size:  bytes obtained after decoding all fragments. */
+        fsize = cbk->op_ret;
+        size = fsize * ec->fragments;
+        for (ans = cbk; ans != NULL; ans = ans->next) {
+            /* Position of this answer: number of bits of the combined mask
+             * that are set below the index of the answering subvolume. */
+            pos = gf_bits_count(cbk->mask & ((1 << ans->idx) - 1));
+            values[pos] = ans->idx + 1;
+            blocks[pos] = ans->vector[0].iov_base;
+            /* A fragment split in several iovecs, or one that fails the
+             * alignment check, is copied into a contiguous buffer. The
+             * buffer is allocated once (size bytes) and shared by all the
+             * fragments that need a copy, fsize bytes each. */
+            if ((ans->int32 != 1) ||
+                !EC_ALIGN_CHECK(blocks[pos], EC_METHOD_WORD_SIZE)) {
+                if (iobref == NULL) {
+                    err = ec_buffer_alloc(ec->xl, size, &iobref, &ptr);
+                    if (err != 0) {
+                        goto out;
+                    }
+                }
+                ec_iov_copy_to(ptr, ans->vector, ans->int32, 0, fsize);
+                blocks[pos] = ptr;
+                ptr += fsize;
+            }
+        }
+
+        /* Destination buffer of the decoded data. */
+        err = ec_buffer_alloc(ec->xl, size, &iobref, &ptr);
+        if (err != 0) {
+            goto out;
+        }
+
+        err = ec_method_decode(&ec->matrix, fsize, cbk->mask, values, blocks,
+                               ptr);
+        if (err != 0) {
+            goto out;
+        }
+
+        /* Skip the fop->head bytes that precede the offset requested by
+         * the user. */
+        vector[0].iov_base = ptr + fop->head;
+        vector[0].iov_len = size - fop->head;
+
+        /* 'max' becomes the number of bytes that may be returned: limited
+         * by the file size and by the size requested by the user. */
+        /* NOTE(review): 'max' is unsigned; if ia_size were smaller than
+         * fop->offset * ec->fragments + fop->head the subtraction below
+         * would wrap around. Confirm this cannot happen at this point. */
+        max = fop->offset * ec->fragments + size;
+        if (max > cbk->iatt[0].ia_size) {
+            max = cbk->iatt[0].ia_size;
+        }
+        max -= fop->offset * ec->fragments + fop->head;
+        if (max > fop->user_size) {
+            max = fop->user_size;
+        }
+        size -= fop->head;
+        if (size > max) {
+            vector[0].iov_len -= size - max;
+            size = max;
+        }
+
+        cbk->op_ret = size;
+        cbk->int32 = 1;
+
+        /* The answer now owns the new buffers instead of the original
+         * ones. */
+        iobref_unref(cbk->buffers);
+        cbk->buffers = iobref;
+
+        GF_FREE(cbk->vector);
+        cbk->vector = iov_dup(vector, 1);
+        if (cbk->vector == NULL) {
+            /* 'iobref' is already referenced by cbk->buffers, so it is not
+             * released through the 'out' label. */
+            return -ENOMEM;
+        }
+    }
+
+    return 0;
+
+out:
+    if (iobref != NULL) {
+        iobref_unref(iobref);
+    }
+
+    return err;
+}
+
+/*
+ * Combine function used for READ answers (see ec_readv_cbk).
+ *
+ * Two answers are equivalent when ec_vector_compare() accepts their vectors
+ * and, only if that check passes, ec_iatt_combine() accepts their first
+ * iatt. Returns 1 when both checks pass; otherwise logs the first mismatch
+ * found and returns 0.
+ */
+int32_t
+ec_combine_readv(ec_fop_data_t *fop, ec_cbk_data_t *dst, ec_cbk_data_t *src)
+{
+    int32_t matched = 0;
+
+    if (!ec_vector_compare(dst->vector, dst->int32, src->vector, src->int32)) {
+        gf_msg(fop->xl->name, GF_LOG_NOTICE, 0, EC_MSG_VECTOR_MISMATCH,
+               "Mismatching vector in "
+               "answers of 'GF_FOP_READ'");
+    } else if (!ec_iatt_combine(fop, dst->iatt, src->iatt, 1)) {
+        gf_msg(fop->xl->name, GF_LOG_NOTICE, 0, EC_MSG_IATT_MISMATCH,
+               "Mismatching iatt in "
+               "answers of 'GF_FOP_READ'");
+    } else {
+        matched = 1;
+    }
+
+    return matched;
+}
+
+/*
+ * Callback of the READ requests wound by ec_wind_readv.
+ *
+ * 'cookie' carries the index of the answering subvolume. The answer is
+ * stored in a newly allocated ec_cbk_data_t: for successful answers a copy
+ * of the iovec array, the iatt and a reference to the buffers; in all cases
+ * a reference to 'xdata' when present. An answer whose length is not a
+ * multiple of the fragment size is turned into an EIO error. The answer is
+ * then combined with the other ones using ec_combine_readv.
+ *
+ * ec_complete() is called on the fop whenever the fop could be recovered
+ * from the frame, even if the answer could not be recorded. Always
+ * returns 0.
+ */
+int32_t
+ec_readv_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret,
+             int32_t op_errno, struct iovec *vector, int32_t count,
+             struct iatt *stbuf, struct iobref *iobref, dict_t *xdata)
+{
+    ec_fop_data_t *fop = NULL;
+    ec_cbk_data_t *cbk = NULL;
+    ec_t *ec = NULL;
+    int32_t idx = (int32_t)(uintptr_t)cookie;
+
+    VALIDATE_OR_GOTO(this, out);
+    GF_VALIDATE_OR_GOTO(this->name, frame, out);
+    GF_VALIDATE_OR_GOTO(this->name, frame->local, out);
+    GF_VALIDATE_OR_GOTO(this->name, this->private, out);
+
+    /* Read this->private only after 'this' has been validated; doing it in
+     * the declaration would dereference 'this' before the NULL check. */
+    ec = this->private;
+    fop = frame->local;
+
+    ec_trace("CBK", fop, "idx=%d, frame=%p, op_ret=%d, op_errno=%d", idx, frame,
+             op_ret, op_errno);
+
+    cbk = ec_cbk_data_allocate(frame, this, fop, GF_FOP_READ, idx, op_ret,
+                               op_errno);
+    if (cbk != NULL) {
+        if (op_ret >= 0) {
+            /* cbk->int32 holds the number of iovecs of the answer. */
+            cbk->int32 = count;
+
+            if (count > 0) {
+                cbk->vector = iov_dup(vector, count);
+                if (cbk->vector == NULL) {
+                    gf_msg(this->name, GF_LOG_ERROR, ENOMEM, EC_MSG_NO_MEMORY,
+                           "Failed to duplicate a "
+                           "vector list.");
+
+                    goto out;
+                }
+            }
+            if (stbuf != NULL) {
+                cbk->iatt[0] = *stbuf;
+            }
+            if (iobref != NULL) {
+                cbk->buffers = iobref_ref(iobref);
+                if (cbk->buffers == NULL) {
+                    gf_msg(this->name, GF_LOG_ERROR, 0, EC_MSG_BUF_REF_FAIL,
+                           "Failed to reference a "
+                           "buffer.");
+
+                    goto out;
+                }
+            }
+        }
+        if (xdata != NULL) {
+            cbk->xdata = dict_ref(xdata);
+            if (cbk->xdata == NULL) {
+                gf_msg(this->name, GF_LOG_ERROR, 0, EC_MSG_DICT_REF_FAIL,
+                       "Failed to reference a "
+                       "dictionary.");
+
+                goto out;
+            }
+        }
+
+        /* A partial fragment cannot be decoded. */
+        if ((op_ret > 0) && ((op_ret % ec->fragment_size) != 0)) {
+            ec_cbk_set_error(cbk, EIO, _gf_true);
+        }
+
+        ec_combine(cbk, ec_combine_readv);
+    }
+
+out:
+    if (fop != NULL) {
+        ec_complete(fop);
+    }
+
+    return 0;
+}
+
+/*
+ * Wind function of the READ fop: sends the request described by 'fop'
+ * (fd fop->fd, size fop->size, offset fop->offset, flags fop->uint32) to
+ * the subvolume stored at position 'idx' of ec->xl_list.
+ *
+ * The subvolume index travels in the cookie, packed into a pointer through a
+ * uintptr_t cast; ec_readv_cbk unpacks it with the inverse cast.
+ */
+void
+ec_wind_readv(ec_t *ec, ec_fop_data_t *fop, int32_t idx)
+{
+    ec_trace("WIND", fop, "idx=%d", idx);
+
+    STACK_WIND_COOKIE(fop->frame, ec_readv_cbk, (void *)(uintptr_t)idx,
+                      ec->xl_list[idx], ec->xl_list[idx]->fops->readv, fop->fd,
+                      fop->size, fop->offset, fop->uint32, fop->xdata);
+}
+
+/*
+ * State machine of the READ fop.
+ *
+ * Receives the state to execute and returns the state to execute next.
+ * Positive states implement the normal path; the negated states up to
+ * REPORT share a handler that reports fop->error. Both paths end with the
+ * LOCK_REUSE and UNLOCK states.
+ */
+int32_t
+ec_manager_readv(ec_fop_data_t *fop, int32_t state)
+{
+    ec_cbk_data_t *cbk;
+    ec_t *ec = fop->xl->private;
+
+    switch (state) {
+        case EC_STATE_INIT:
+            /* Keep the size requested by the user, then widen the request:
+             * the offset is adjusted down (the value returned is kept in
+             * fop->head and added to the size) and the size is adjusted
+             * up. ec_readv_rebuild() uses head and user_size to trim the
+             * decoded data. */
+            fop->user_size = fop->size;
+            fop->head = ec_adjust_offset_down(fop->xl->private, &fop->offset,
+                                              _gf_true);
+            fop->size += fop->head;
+            ec_adjust_size_up(fop->xl->private, &fop->size, _gf_true);
+
+            /* Fall through */
+
+        case EC_STATE_LOCK:
+            ec_lock_prepare_fd(fop, fop->fd, EC_QUERY_INFO, fop->offset,
+                               fop->size);
+            ec_lock(fop);
+
+            return EC_STATE_DISPATCH;
+
+        case EC_STATE_DISPATCH:
+            /* A non-zero read_mask restricts the subvolumes that can be
+             * used for reading. */
+            if (ec->read_mask) {
+                fop->mask &= ec->read_mask;
+            }
+            ec_dispatch_min(fop);
+
+            return EC_STATE_PREPARE_ANSWER;
+
+        case EC_STATE_PREPARE_ANSWER:
+            cbk = ec_fop_prepare_answer(fop, _gf_true);
+            if (cbk != NULL) {
+                int32_t err;
+
+                ec_iatt_rebuild(fop->xl->private, cbk->iatt, 1, cbk->count);
+
+                /* ec_readv_rebuild() returns a negative errno on failure,
+                 * hence the sign change. */
+                err = ec_readv_rebuild(fop->xl->private, fop, cbk);
+                if (err != 0) {
+                    ec_cbk_set_error(cbk, -err, _gf_true);
+                }
+            }
+
+            return EC_STATE_REPORT;
+
+        case EC_STATE_REPORT:
+            cbk = fop->answer;
+
+            GF_ASSERT(cbk != NULL);
+
+            /* The callback is optional. */
+            if (fop->cbks.readv != NULL) {
+                fop->cbks.readv(fop->req_frame, fop, fop->xl, cbk->op_ret,
+                                cbk->op_errno, cbk->vector, cbk->int32,
+                                &cbk->iatt[0], cbk->buffers, cbk->xdata);
+            }
+
+            return EC_STATE_LOCK_REUSE;
+
+        /* Failure path: report fop->error without any payload. */
+        case -EC_STATE_INIT:
+        case -EC_STATE_LOCK:
+        case -EC_STATE_DISPATCH:
+        case -EC_STATE_PREPARE_ANSWER:
+        case -EC_STATE_REPORT:
+            GF_ASSERT(fop->error != 0);
+
+            if (fop->cbks.readv != NULL) {
+                fop->cbks.readv(fop->req_frame, fop, fop->xl, -1, fop->error,
+                                NULL, 0, NULL, NULL, NULL);
+            }
+
+            return EC_STATE_LOCK_REUSE;
+
+        case -EC_STATE_LOCK_REUSE:
+        case EC_STATE_LOCK_REUSE:
+            ec_lock_reuse(fop);
+
+            return EC_STATE_UNLOCK;
+
+        case -EC_STATE_UNLOCK:
+        case EC_STATE_UNLOCK:
+            ec_unlock(fop);
+
+            return EC_STATE_END;
+
+        default:
+            gf_msg(fop->xl->name, GF_LOG_ERROR, EINVAL, EC_MSG_UNHANDLED_STATE,
+                   "Unhandled state %d for %s", state, ec_fop_name(fop->id));
+
+            return EC_STATE_END;
+    }
+}
+
+/*
+ * Entry point of the READ fop.
+ *
+ * Allocates the fop descriptor (wind function ec_wind_readv, manager
+ * ec_manager_readv), stores the size, offset and flags of the request and
+ * references to 'fd' and 'xdata' in it, and hands it to ec_manager().
+ *
+ *   frame     - caller's frame, passed back to 'func'
+ *   this      - the translator instance; must have 'private' set
+ *   target    - forwarded as is to ec_fop_data_allocate()
+ *   fop_flags - forwarded as is to ec_fop_data_allocate()
+ *   func      - completion callback; may be NULL
+ *   data      - opaque pointer forwarded to ec_fop_data_allocate()
+ *   fd, size, offset, flags, xdata - arguments of the request; the pointers
+ *               are only referenced when they are not NULL
+ *
+ * Error reporting: if validation or the fop allocation fails, 'func' is
+ * called directly with op_ret = -1 and op_errno = ENOMEM. If a later step
+ * fails, ec_manager() is called with ENOMEM instead of 0.
+ */
+void
+ec_readv(call_frame_t *frame, xlator_t *this, uintptr_t target,
+         uint32_t fop_flags, fop_readv_cbk_t func, void *data, fd_t *fd,
+         size_t size, off_t offset, uint32_t flags, dict_t *xdata)
+{
+    ec_cbk_t callback = {.readv = func};
+    ec_fop_data_t *fop = NULL;
+    int32_t error = ENOMEM;
+
+    gf_msg_trace("ec", 0, "EC(READ) %p", frame);
+
+    VALIDATE_OR_GOTO(this, out);
+    GF_VALIDATE_OR_GOTO(this->name, frame, out);
+    GF_VALIDATE_OR_GOTO(this->name, this->private, out);
+
+    fop = ec_fop_data_allocate(frame, this, GF_FOP_READ, EC_FLAG_LOCK_SHARED,
+                               target, fop_flags, ec_wind_readv,
+                               ec_manager_readv, callback, data);
+    if (fop == NULL) {
+        goto out;
+    }
+
+    fop->use_fd = 1;
+
+    fop->size = size;
+    fop->offset = offset;
+    fop->uint32 = flags;
+
+    if (fd != NULL) {
+        fop->fd = fd_ref(fd);
+        if (fop->fd == NULL) {
+            gf_msg(this->name, GF_LOG_ERROR, 0, EC_MSG_FILE_DESC_REF_FAIL,
+                   "Failed to reference a "
+                   "file descriptor.");
+
+            goto out;
+        }
+    }
+    if (xdata != NULL) {
+        fop->xdata = dict_ref(xdata);
+        if (fop->xdata == NULL) {
+            gf_msg(this->name, GF_LOG_ERROR, 0, EC_MSG_DICT_REF_FAIL,
+                   "Failed to reference a "
+                   "dictionary.");
+
+            goto out;
+        }
+    }
+
+    error = 0;
+
+out:
+    if (fop != NULL) {
+        ec_manager(fop, error);
+    } else if (func != NULL) {
+        /* ec_manager_readv only invokes the callback when it is not NULL;
+         * apply the same rule on this early failure path. */
+        func(frame, NULL, this, -1, error, NULL, 0, NULL, NULL, NULL);
+    }
+}
+
+/* FOP: seek */
+
+/*
+ * Callback of the SEEK requests wound by ec_wind_seek.
+ *
+ * 'cookie' carries the index of the answering subvolume. The answer is
+ * stored in a newly allocated ec_cbk_data_t: the offset for successful
+ * answers and a reference to 'xdata' when present. The answer is then
+ * passed to ec_combine() without a combine function.
+ *
+ * ec_complete() is called on the fop whenever the fop could be recovered
+ * from the frame. Always returns 0.
+ */
+int32_t
+ec_seek_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret,
+            int32_t op_errno, off_t offset, dict_t *xdata)
+{
+    ec_fop_data_t *fop = NULL;
+    ec_cbk_data_t *cbk = NULL;
+    ec_t *ec = NULL;
+    int32_t idx = (int32_t)(uintptr_t)cookie;
+
+    VALIDATE_OR_GOTO(this, out);
+    GF_VALIDATE_OR_GOTO(this->name, frame, out);
+    GF_VALIDATE_OR_GOTO(this->name, frame->local, out);
+    GF_VALIDATE_OR_GOTO(this->name, this->private, out);
+
+    /* Read this->private only after 'this' has been validated; doing it in
+     * the declaration would dereference 'this' before the NULL check. */
+    ec = this->private;
+    fop = frame->local;
+
+    ec_trace("CBK", fop, "idx=%d, frame=%p, op_ret=%d, op_errno=%d", idx, frame,
+             op_ret, op_errno);
+
+    cbk = ec_cbk_data_allocate(frame, this, fop, GF_FOP_SEEK, idx, op_ret,
+                               op_errno);
+    if (cbk != NULL) {
+        if (op_ret >= 0) {
+            cbk->offset = offset;
+        }
+        if (xdata != NULL) {
+            cbk->xdata = dict_ref(xdata);
+        }
+
+        /* NOTE(review): this check only applies to answers with a strictly
+         * positive op_ret; confirm whether successful seek answers ever
+         * carry one before relying on it. */
+        if ((op_ret > 0) && ((cbk->offset % ec->fragment_size) != 0)) {
+            cbk->op_ret = -1;
+            cbk->op_errno = EIO;
+        }
+
+        ec_combine(cbk, NULL);
+    }
+
+out:
+    if (fop != NULL) {
+        ec_complete(fop);
+    }
+
+    return 0;
+}
+
+/*
+ * Wind function of the SEEK fop: sends the request described by 'fop'
+ * (fd fop->fd, offset fop->offset, seek type fop->seek) to the subvolume
+ * stored at position 'idx' of ec->xl_list.
+ *
+ * The subvolume index travels in the cookie, packed into a pointer through a
+ * uintptr_t cast; ec_seek_cbk unpacks it with the inverse cast.
+ */
+void
+ec_wind_seek(ec_t *ec, ec_fop_data_t *fop, int32_t idx)
+{
+    ec_trace("WIND", fop, "idx=%d", idx);
+
+    STACK_WIND_COOKIE(fop->frame, ec_seek_cbk, (void *)(uintptr_t)idx,
+                      ec->xl_list[idx], ec->xl_list[idx]->fops->seek, fop->fd,
+                      fop->offset, fop->seek, fop->xdata);
+}
+
+/*
+ * State machine of the SEEK fop.
+ *
+ * Receives the state to execute and returns the state to execute next.
+ * Positive states implement the normal path; the negated states up to
+ * REPORT share a handler that reports fop->error. Both paths end with the
+ * LOCK_REUSE and UNLOCK states.
+ */
+int32_t
+ec_manager_seek(ec_fop_data_t *fop, int32_t state)
+{
+    ec_cbk_data_t *cbk;
+    /* Initialized so that the comparisons below never read an indeterminate
+     * value if ec_get_inode_size() does not fill it. */
+    uint64_t size = 0;
+
+    switch (state) {
+        case EC_STATE_INIT:
+            /* Keep the offset requested by the user in user_size, then
+             * adjust the offset sent to the subvolumes down. */
+            fop->user_size = fop->offset;
+            fop->head = ec_adjust_offset_down(fop->xl->private, &fop->offset,
+                                              _gf_true);
+
+            /* Fall through */
+
+        case EC_STATE_LOCK:
+            ec_lock_prepare_fd(fop, fop->fd, EC_QUERY_INFO, fop->offset,
+                               EC_RANGE_FULL);
+            ec_lock(fop);
+
+            return EC_STATE_DISPATCH;
+
+        case EC_STATE_DISPATCH:
+            /* This shouldn't fail because we have the inode locked. */
+            GF_ASSERT(
+                ec_get_inode_size(fop, fop->locks[0].lock->loc.inode, &size));
+
+            /* Seeking at or beyond the end of the file fails with ENXIO
+             * without contacting any subvolume. */
+            if (fop->user_size >= size) {
+                ec_fop_set_error(fop, ENXIO);
+
+                return EC_STATE_REPORT;
+            }
+
+            ec_dispatch_one(fop);
+
+            return EC_STATE_PREPARE_ANSWER;
+
+        case EC_STATE_PREPARE_ANSWER:
+            /* Go back to DISPATCH when ec_dispatch_one_retry() asks for a
+             * retry; otherwise 'cbk' holds the answer to use (may be NULL). */
+            if (ec_dispatch_one_retry(fop, &cbk)) {
+                return EC_STATE_DISPATCH;
+            }
+            if ((cbk != NULL) && (cbk->op_ret >= 0)) {
+                ec_t *ec = fop->xl->private;
+
+                /* This shouldn't fail because we have the inode locked. */
+                GF_ASSERT(ec_get_inode_size(fop, fop->locks[0].lock->loc.inode,
+                                            &size));
+
+                /* Scale the offset returned by the subvolume by the number
+                 * of fragments and clamp the result to the range
+                 * [offset requested by the user, file size]. */
+                cbk->offset *= ec->fragments;
+                if (cbk->offset < fop->user_size) {
+                    cbk->offset = fop->user_size;
+                }
+                if (cbk->offset > size) {
+                    cbk->offset = size;
+                }
+            }
+
+            return EC_STATE_REPORT;
+
+        case EC_STATE_REPORT:
+            cbk = fop->answer;
+
+            GF_ASSERT(cbk != NULL);
+
+            /* The callback is optional. */
+            if (fop->cbks.seek != NULL) {
+                fop->cbks.seek(fop->req_frame, fop, fop->xl, cbk->op_ret,
+                               cbk->op_errno, cbk->offset, cbk->xdata);
+            }
+
+            return EC_STATE_LOCK_REUSE;
+
+        /* Failure path: report fop->error without any payload. */
+        case -EC_STATE_INIT:
+        case -EC_STATE_LOCK:
+        case -EC_STATE_DISPATCH:
+        case -EC_STATE_PREPARE_ANSWER:
+        case -EC_STATE_REPORT:
+            GF_ASSERT(fop->error != 0);
+
+            if (fop->cbks.seek != NULL) {
+                fop->cbks.seek(fop->req_frame, fop, fop->xl, -1, fop->error, 0,
+                               NULL);
+            }
+
+            return EC_STATE_LOCK_REUSE;
+
+        case -EC_STATE_LOCK_REUSE:
+        case EC_STATE_LOCK_REUSE:
+            ec_lock_reuse(fop);
+
+            return EC_STATE_UNLOCK;
+
+        case -EC_STATE_UNLOCK:
+        case EC_STATE_UNLOCK:
+            ec_unlock(fop);
+
+            return EC_STATE_END;
+
+        default:
+            gf_msg(fop->xl->name, GF_LOG_ERROR, 0, EC_MSG_UNHANDLED_STATE,
+                   "Unhandled state %d for %s", state, ec_fop_name(fop->id));
+
+            return EC_STATE_END;
+    }
+}
+
+/*
+ * Entry point of the SEEK fop.
+ *
+ * Allocates the fop descriptor (wind function ec_wind_seek, manager
+ * ec_manager_seek), stores the offset and seek type of the request and
+ * references to 'fd' and 'xdata' in it, and hands it to ec_manager().
+ *
+ *   frame     - caller's frame, passed back to 'func'
+ *   this      - the translator instance; must have 'private' set
+ *   target    - forwarded as is to ec_fop_data_allocate()
+ *   fop_flags - forwarded as is to ec_fop_data_allocate()
+ *   func      - completion callback; may be NULL
+ *   data      - opaque pointer forwarded to ec_fop_data_allocate()
+ *   fd, offset, what, xdata - arguments of the request; the pointers are
+ *               only referenced when they are not NULL
+ *
+ * Error reporting: if validation or the fop allocation fails, 'func' is
+ * called directly with op_ret = -1 and op_errno = EIO. Unlike the other
+ * entry points of this file, the results of fd_ref() and dict_ref() are not
+ * checked here.
+ */
+void
+ec_seek(call_frame_t *frame, xlator_t *this, uintptr_t target,
+        uint32_t fop_flags, fop_seek_cbk_t func, void *data, fd_t *fd,
+        off_t offset, gf_seek_what_t what, dict_t *xdata)
+{
+    ec_cbk_t callback = {.seek = func};
+    ec_fop_data_t *fop = NULL;
+    int32_t error = EIO;
+
+    gf_msg_trace("ec", 0, "EC(SEEK) %p", frame);
+
+    VALIDATE_OR_GOTO(this, out);
+    GF_VALIDATE_OR_GOTO(this->name, frame, out);
+    GF_VALIDATE_OR_GOTO(this->name, this->private, out);
+
+    fop = ec_fop_data_allocate(frame, this, GF_FOP_SEEK, EC_FLAG_LOCK_SHARED,
+                               target, fop_flags, ec_wind_seek, ec_manager_seek,
+                               callback, data);
+    if (fop == NULL) {
+        goto out;
+    }
+
+    fop->use_fd = 1;
+
+    fop->offset = offset;
+    fop->seek = what;
+
+    if (fd != NULL) {
+        fop->fd = fd_ref(fd);
+    }
+    if (xdata != NULL) {
+        fop->xdata = dict_ref(xdata);
+    }
+
+    error = 0;
+
+out:
+    if (fop != NULL) {
+        ec_manager(fop, error);
+    } else if (func != NULL) {
+        /* ec_manager_seek only invokes the callback when it is not NULL;
+         * apply the same rule on this early failure path. 'error' still
+         * holds its initial value (EIO) whenever this branch is reached. */
+        func(frame, NULL, this, -1, error, 0, NULL);
+    }
+}
+
+/* FOP: stat */
+
+/*
+ * Combine function used for STAT answers (see ec_stat_cbk): two answers are
+ * equivalent when ec_iatt_combine() accepts their first iatt.
+ *
+ * Returns 1 when they can be combined; otherwise logs the mismatch and
+ * returns 0.
+ */
+int32_t
+ec_combine_stat(ec_fop_data_t *fop, ec_cbk_data_t *dst, ec_cbk_data_t *src)
+{
+    if (ec_iatt_combine(fop, dst->iatt, src->iatt, 1)) {
+        return 1;
+    }
+
+    gf_msg(fop->xl->name, GF_LOG_NOTICE, 0, EC_MSG_IATT_MISMATCH,
+           "Mismatching iatt in "
+           "answers of 'GF_FOP_STAT'");
+
+    return 0;
+}
+
+/*
+ * Callback of the STAT requests wound by ec_wind_stat.
+ *
+ * 'cookie' carries the index of the answering subvolume. The answer is
+ * stored in a newly allocated ec_cbk_data_t (the iatt for successful
+ * answers and a reference to 'xdata' when present) and combined with the
+ * other answers using ec_combine_stat.
+ *
+ * ec_complete() is called on the fop whenever the fop could be recovered
+ * from the frame, even if the answer could not be recorded. Always
+ * returns 0.
+ */
+int32_t
+ec_stat_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret,
+            int32_t op_errno, struct iatt *buf, dict_t *xdata)
+{
+    int32_t idx = (int32_t)(uintptr_t)cookie;
+    ec_fop_data_t *fop = NULL;
+    ec_cbk_data_t *cbk = NULL;
+
+    VALIDATE_OR_GOTO(this, out);
+    GF_VALIDATE_OR_GOTO(this->name, frame, out);
+    GF_VALIDATE_OR_GOTO(this->name, frame->local, out);
+    GF_VALIDATE_OR_GOTO(this->name, this->private, out);
+
+    fop = frame->local;
+
+    ec_trace("CBK", fop, "idx=%d, frame=%p, op_ret=%d, op_errno=%d", idx, frame,
+             op_ret, op_errno);
+
+    cbk = ec_cbk_data_allocate(frame, this, fop, GF_FOP_STAT, idx, op_ret,
+                               op_errno);
+    if (cbk == NULL) {
+        goto out;
+    }
+
+    /* The iatt is only meaningful for successful answers. */
+    if ((op_ret >= 0) && (buf != NULL)) {
+        cbk->iatt[0] = *buf;
+    }
+
+    if (xdata != NULL) {
+        cbk->xdata = dict_ref(xdata);
+        if (cbk->xdata == NULL) {
+            gf_msg(this->name, GF_LOG_ERROR, 0, EC_MSG_DICT_REF_FAIL,
+                   "Failed to reference a "
+                   "dictionary.");
+
+            goto out;
+        }
+    }
+
+    ec_combine(cbk, ec_combine_stat);
+
+out:
+    if (fop != NULL) {
+        ec_complete(fop);
+    }
+
+    return 0;
+}
+
+void
+ec_wind_stat(ec_t *ec, ec_fop_data_t *fop, int32_t idx)
+{
+    ec_trace("WIND", fop, "idx=%d", idx);
+    /* Wind stat to ec->xl_list[idx]; idx is passed as the cookie. */
+    STACK_WIND_COOKIE(fop->frame, ec_stat_cbk, (void *)(uintptr_t)idx,
+                      ec->xl_list[idx], ec->xl_list[idx]->fops->stat,
+                      &fop->loc[0], fop->xdata);
+}
+
+int32_t
+ec_manager_stat(ec_fop_data_t *fop, int32_t state)
+{
+    ec_cbk_data_t *cbk;
+    /* Shared by ec_stat() and ec_fstat(); returns the next state. */
+    switch (state) {
+        case EC_STATE_INIT:
+        case EC_STATE_LOCK:
+            if (fop->fd == NULL) {
+                ec_lock_prepare_inode(fop, &fop->loc[0], EC_QUERY_INFO, 0,
+                                      EC_RANGE_FULL);
+            } else {
+                ec_lock_prepare_fd(fop, fop->fd, EC_QUERY_INFO, 0,
+                                   EC_RANGE_FULL);
+            }
+            ec_lock(fop);
+
+            return EC_STATE_DISPATCH;
+
+        case EC_STATE_DISPATCH:
+            ec_dispatch_all(fop);
+
+            return EC_STATE_PREPARE_ANSWER;
+
+        case EC_STATE_PREPARE_ANSWER:
+            cbk = ec_fop_prepare_answer(fop, _gf_true);
+            /* Regular files get a rebuilt iatt and the inode size. */
+            if (cbk != NULL) {
+                if (cbk->iatt[0].ia_type == IA_IFREG) {
+                    ec_iatt_rebuild(fop->xl->private, cbk->iatt, 1, cbk->count);
+                    /* NOTE(review): GF_ASSERT() must evaluate its argument. */
+                    /* This shouldn't fail because we have the inode locked. */
+                    GF_ASSERT(ec_get_inode_size(fop,
+                                                fop->locks[0].lock->loc.inode,
+                                                &cbk->iatt[0].ia_size));
+                }
+            }
+
+            return EC_STATE_REPORT;
+        /* Hand the answer to the stat or fstat callback, chosen by fop->id. */
+        case EC_STATE_REPORT:
+            cbk = fop->answer;
+
+            GF_ASSERT(cbk != NULL);
+
+            if (fop->id == GF_FOP_STAT) {
+                if (fop->cbks.stat != NULL) {
+                    fop->cbks.stat(fop->req_frame, fop, fop->xl, cbk->op_ret,
+                                   cbk->op_errno, &cbk->iatt[0], cbk->xdata);
+                }
+            } else {
+                if (fop->cbks.fstat != NULL) {
+                    fop->cbks.fstat(fop->req_frame, fop, fop->xl, cbk->op_ret,
+                                    cbk->op_errno, &cbk->iatt[0], cbk->xdata);
+                }
+            }
+
+            return EC_STATE_LOCK_REUSE;
+        /* Failure: report fop->error with no iatt and no xdata. */
+        case -EC_STATE_INIT:
+        case -EC_STATE_LOCK:
+        case -EC_STATE_DISPATCH:
+        case -EC_STATE_PREPARE_ANSWER:
+        case -EC_STATE_REPORT:
+            GF_ASSERT(fop->error != 0);
+
+            if (fop->id == GF_FOP_STAT) {
+                if (fop->cbks.stat != NULL) {
+                    fop->cbks.stat(fop->req_frame, fop, fop->xl, -1, fop->error,
+                                   NULL, NULL);
+                }
+            } else {
+                if (fop->cbks.fstat != NULL) {
+                    fop->cbks.fstat(fop->req_frame, fop, fop->xl, -1,
+                                    fop->error, NULL, NULL);
+                }
+            }
+
+            return EC_STATE_LOCK_REUSE;
+        /* Success and failure paths converge from here on. */
+        case -EC_STATE_LOCK_REUSE:
+        case EC_STATE_LOCK_REUSE:
+            ec_lock_reuse(fop);
+
+            return EC_STATE_UNLOCK;
+
+        case -EC_STATE_UNLOCK:
+        case EC_STATE_UNLOCK:
+            ec_unlock(fop);
+
+            return EC_STATE_END;
+
+        default:
+            gf_msg(fop->xl->name, GF_LOG_ERROR, EINVAL, EC_MSG_UNHANDLED_STATE,
+                   "Unhandled state %d for %s", state, ec_fop_name(fop->id));
+
+            return EC_STATE_END;
+    }
+}
+
+/* Start a stat fop on 'loc' and report the result through 'func'. When the
+ * fop cannot be allocated the error is reported directly, provided 'func'
+ * is not NULL (ec_manager_stat() already tolerates a NULL callback). */
+void
+ec_stat(call_frame_t *frame, xlator_t *this, uintptr_t target,
+        uint32_t fop_flags, fop_stat_cbk_t func, void *data, loc_t *loc,
+        dict_t *xdata)
+{
+    ec_cbk_t callback = {.stat = func};
+    ec_fop_data_t *fop = NULL;
+    int32_t error = ENOMEM;
+
+    gf_msg_trace("ec", 0, "EC(STAT) %p", frame);
+
+    VALIDATE_OR_GOTO(this, out);
+    GF_VALIDATE_OR_GOTO(this->name, frame, out);
+    GF_VALIDATE_OR_GOTO(this->name, this->private, out);
+
+    fop = ec_fop_data_allocate(frame, this, GF_FOP_STAT, EC_FLAG_LOCK_SHARED,
+                               target, fop_flags, ec_wind_stat, ec_manager_stat,
+                               callback, data);
+    if (fop == NULL) {
+        goto out;
+    }
+
+    if (loc != NULL) {
+        if (loc_copy(&fop->loc[0], loc) != 0) {
+            gf_msg(this->name, GF_LOG_ERROR, ENOMEM, EC_MSG_LOC_COPY_FAIL,
+                   "Failed to copy a location.");
+            goto out;
+        }
+    }
+    if (xdata != NULL) {
+        fop->xdata = dict_ref(xdata);
+        if (fop->xdata == NULL) {
+            gf_msg(this->name, GF_LOG_ERROR, 0, EC_MSG_DICT_REF_FAIL,
+                   "Failed to reference a dictionary.");
+            goto out;
+        }
+    }
+
+    error = 0;
+
+out:
+    if (fop != NULL) {
+        ec_manager(fop, error);
+    } else if (func != NULL) {
+        func(frame, NULL, this, -1, error, NULL, NULL);
+    }
+}
+
+/* FOP: fstat */
+
+int32_t
+ec_fstat_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret,
+             int32_t op_errno, struct iatt *buf, dict_t *xdata)
+{
+    /* Per-subvolume fstat reply: stores it in a new cbk, offers the cbk to
+     * ec_combine() and calls ec_complete() whenever the fop is known. */
+    int32_t idx = (int32_t)(uintptr_t)cookie; /* set by ec_wind_fstat() */
+    ec_cbk_data_t *answer = NULL;
+    ec_fop_data_t *fop = NULL;
+
+    VALIDATE_OR_GOTO(this, out);
+    GF_VALIDATE_OR_GOTO(this->name, frame, out);
+    GF_VALIDATE_OR_GOTO(this->name, frame->local, out);
+    GF_VALIDATE_OR_GOTO(this->name, this->private, out);
+
+    fop = frame->local;
+
+    ec_trace("CBK", fop, "idx=%d, frame=%p, op_ret=%d, op_errno=%d", idx, frame,
+             op_ret, op_errno);
+
+    answer = ec_cbk_data_allocate(frame, this, fop, GF_FOP_FSTAT, idx, op_ret,
+                                  op_errno);
+    if (answer == NULL) {
+        goto out;
+    }
+    if ((op_ret >= 0) && (buf != NULL)) {
+        answer->iatt[0] = *buf;
+    }
+    if (xdata != NULL) {
+        answer->xdata = dict_ref(xdata);
+        if (answer->xdata == NULL) {
+            gf_msg(this->name, GF_LOG_ERROR, 0, EC_MSG_DICT_REF_FAIL,
+                   "Failed to reference a dictionary.");
+            /* The answer is not offered to ec_combine() in this case. */
+            goto out;
+        }
+    }
+
+    ec_combine(answer, ec_combine_stat);
+
+out:
+    if (fop != NULL) {
+        ec_complete(fop);
+    }
+
+    return 0;
+}
+
+void
+ec_wind_fstat(ec_t *ec, ec_fop_data_t *fop, int32_t idx)
+{
+    ec_trace("WIND", fop, "idx=%d", idx);
+    /* Wind fstat to ec->xl_list[idx]; idx is passed as the cookie. */
+    STACK_WIND_COOKIE(fop->frame, ec_fstat_cbk, (void *)(uintptr_t)idx,
+                      ec->xl_list[idx], ec->xl_list[idx]->fops->fstat, fop->fd,
+                      fop->xdata);
+}
+
+/* Start an fstat fop on 'fd'; it is driven by ec_manager_stat() and the
+ * result is reported through 'func'. When the fop cannot be allocated the
+ * error is reported directly, provided 'func' is not NULL (the manager
+ * already tolerates a NULL callback). */
+void
+ec_fstat(call_frame_t *frame, xlator_t *this, uintptr_t target,
+         uint32_t fop_flags, fop_fstat_cbk_t func, void *data, fd_t *fd,
+         dict_t *xdata)
+{
+    ec_cbk_t callback = {.fstat = func};
+    ec_fop_data_t *fop = NULL;
+    int32_t error = ENOMEM;
+
+    gf_msg_trace("ec", 0, "EC(FSTAT) %p", frame);
+
+    VALIDATE_OR_GOTO(this, out);
+    GF_VALIDATE_OR_GOTO(this->name, frame, out);
+    GF_VALIDATE_OR_GOTO(this->name, this->private, out);
+
+    fop = ec_fop_data_allocate(frame, this, GF_FOP_FSTAT, EC_FLAG_LOCK_SHARED,
+                               target, fop_flags, ec_wind_fstat,
+                               ec_manager_stat, callback, data);
+    if (fop == NULL) {
+        goto out;
+    }
+
+    fop->use_fd = 1;
+
+    if (fd != NULL) {
+        fop->fd = fd_ref(fd);
+        if (fop->fd == NULL) {
+            gf_msg(this->name, GF_LOG_ERROR, 0, EC_MSG_FILE_DESC_REF_FAIL,
+                   "Failed to reference a file descriptor.");
+            goto out;
+        }
+    }
+    if (xdata != NULL) {
+        fop->xdata = dict_ref(xdata);
+        if (fop->xdata == NULL) {
+            gf_msg(this->name, GF_LOG_ERROR, 0, EC_MSG_DICT_REF_FAIL,
+                   "Failed to reference a dictionary.");
+            goto out;
+        }
+    }
+
+    error = 0;
+
+out:
+    if (fop != NULL) {
+        ec_manager(fop, error);
+    } else if (func != NULL) {
+        func(frame, NULL, this, -1, error, NULL, NULL);
+    }
+}
diff --git a/xlators/cluster/ec/src/ec-inode-write.c b/xlators/cluster/ec/src/ec-inode-write.c
new file mode 100644
index 00000000000..9b5fe2a7fdc
--- /dev/null
+++ b/xlators/cluster/ec/src/ec-inode-write.c
@@ -0,0 +1,2369 @@
+/*
+  Copyright (c) 2012-2014 DataLab, s.l. <http://www.datalab.es>
+  This file is part of GlusterFS.
+
+  This file is licensed to you under your choice of the GNU Lesser
+  General Public License, version 3 or any later version (LGPLv3 or
+  later), or the GNU General Public License, version 2 (GPLv2), in all
+  cases as published by the Free Software Foundation.
+*/
+
+#include "ec-messages.h"
+#include "ec-helpers.h"
+#include "ec-common.h"
+#include "ec-combine.h"
+#include "ec-method.h"
+#include "ec-fops.h"
+#include "ec-mem-types.h"
+
+int32_t
+ec_update_writev_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+                     int32_t op_ret, int32_t op_errno, struct iatt *prebuf,
+                     struct iatt *postbuf, dict_t *xdata)
+{
+    /* Completion of a writev whose fop arrives as the cookie; its outcome is
+     * folded into fop->parent (error, good mask and, at most once, answer). */
+    ec_fop_data_t *fop = cookie;
+    ec_fop_data_t *parent = fop->parent;
+    ec_cbk_data_t *answer = NULL;
+    int filled = 0;
+
+    ec_trace("UPDATE_WRITEV_CBK", cookie, "ret=%d, errno=%d, parent-fop=%s",
+             op_ret, op_errno, ec_fop_name(parent->id));
+
+    if (op_ret < 0) {
+        ec_fop_set_error(parent, op_errno);
+        return 0;
+    }
+    answer = ec_cbk_data_allocate(parent->frame, this, parent, parent->id, 0,
+                                  op_ret, op_errno);
+    if (answer == NULL) {
+        ec_fop_set_error(parent, ENOMEM);
+        return 0;
+    }
+    if (xdata != NULL) {
+        answer->xdata = dict_ref(xdata);
+    }
+    /* Whichever iatts were supplied are packed at the front of the array. */
+    if (prebuf != NULL) {
+        answer->iatt[filled++] = *prebuf;
+    }
+    if (postbuf != NULL) {
+        answer->iatt[filled++] = *postbuf;
+    }
+    LOCK(&parent->lock);
+    {
+        parent->good &= fop->good;
+        if (gf_bits_count(parent->good) < parent->minimum) {
+            __ec_fop_set_error(parent, EIO);
+        } else if ((fop->error == 0) && (parent->answer == NULL)) {
+            parent->answer = answer;
+        }
+    }
+    UNLOCK(&parent->lock);
+    return 0;
+}
+
+/* Issue a writev of 'size' zero bytes at 'offset', passing 'mask' as the
+ * writev target. Returns 0 once ec_writev() has been called, otherwise
+ * -ENOMEM or the non-zero result of iobref_add(). */
+static int32_t
+ec_update_write(ec_fop_data_t *fop, uintptr_t mask, off_t offset, uint64_t size)
+{
+    struct iobref *iobref = NULL;
+    struct iobuf *iobuf = NULL;
+    struct iovec vector;
+    int32_t err = -ENOMEM;
+
+    iobref = iobref_new();
+    if (iobref == NULL) {
+        goto out;
+    }
+    /* Ask for a buffer of at least 'size' bytes: the memset() below would
+     * overrun a default-sized page if 'size' were ever larger than it. */
+    iobuf = iobuf_get2(fop->xl->ctx->iobuf_pool, size);
+    if (iobuf == NULL) {
+        goto out;
+    }
+    err = iobref_add(iobref, iobuf);
+    if (err != 0) {
+        goto out;
+    }
+    if (fop->locks[0].lock)
+        ec_lock_update_good(fop->locks[0].lock, fop);
+    vector.iov_base = iobuf->ptr;
+    vector.iov_len = size;
+    memset(vector.iov_base, 0, vector.iov_len);
+    ec_writev(fop->frame, fop->xl, mask, fop->minimum, ec_update_writev_cbk,
+              NULL, fop->fd, &vector, 1, offset, 0, iobref, NULL);
+    err = 0;
+out:
+    if (iobuf != NULL) {
+        iobuf_unref(iobuf);
+    }
+    if (iobref != NULL) {
+        iobref_unref(iobref);
+    }
+    return err;
+}
+
+int
+ec_inode_write_cbk(call_frame_t *frame, xlator_t *this, void *cookie,
+                   int op_ret, int op_errno, struct iatt *prestat,
+                   struct iatt *poststat, dict_t *xdata)
+{
+    /* Common per-subvolume reply handler behind the fop-specific write
+     * callbacks; note the (frame, this, cookie) parameter order. */
+    int idx = (int32_t)(uintptr_t)cookie;
+    ec_cbk_data_t *answer = NULL;
+    ec_fop_data_t *fop = NULL;
+    int used = 0;
+
+    VALIDATE_OR_GOTO(this, out);
+    GF_VALIDATE_OR_GOTO(this->name, frame, out);
+    GF_VALIDATE_OR_GOTO(this->name, frame->local, out);
+    GF_VALIDATE_OR_GOTO(this->name, this->private, out);
+
+    fop = frame->local;
+
+    ec_trace("CBK", fop, "idx=%d, frame=%p, op_ret=%d, op_errno=%d", idx, frame,
+             op_ret, op_errno);
+
+    answer = ec_cbk_data_allocate(frame, this, fop, fop->id, idx, op_ret,
+                                  op_errno);
+    if (answer != NULL) {
+        /* Failed answers are combined too, just without xdata or iatts. */
+        if (op_ret >= 0) {
+            if (xdata != NULL) {
+                answer->xdata = dict_ref(xdata);
+            }
+            if (prestat != NULL) {
+                answer->iatt[used++] = *prestat;
+            }
+            if (poststat != NULL) {
+                answer->iatt[used++] = *poststat;
+            }
+        }
+        ec_combine(answer, ec_combine_write);
+    }
+
+out:
+    if (fop != NULL) {
+        ec_complete(fop);
+    }
+    return 0;
+}
+/* FOP: removexattr */
+/* Reply handler wound by ec_wind_removexattr(); passes NULL for both iatts. */
+int32_t
+ec_removexattr_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+                   int32_t op_ret, int32_t op_errno, dict_t *xdata)
+{
+    return ec_inode_write_cbk(frame, this, cookie, op_ret, op_errno, NULL, NULL,
+                              xdata);
+}
+
+void
+ec_wind_removexattr(ec_t *ec, ec_fop_data_t *fop, int32_t idx)
+{
+    ec_trace("WIND", fop, "idx=%d", idx);
+    /* Wind removexattr to ec->xl_list[idx]; idx is passed as the cookie. */
+    STACK_WIND_COOKIE(fop->frame, ec_removexattr_cbk, (void *)(uintptr_t)idx,
+                      ec->xl_list[idx], ec->xl_list[idx]->fops->removexattr,
+                      &fop->loc[0], fop->str[0], fop->xdata);
+}
+
+void
+ec_xattr_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret,
+             int32_t op_errno, dict_t *xdata)
+{
+    ec_fop_data_t *fop = cookie; /* ec_manager_xattr() passes the fop here */
+    switch (fop->id) { /* report through the callback matching the fop */
+        case GF_FOP_SETXATTR:
+            if (fop->cbks.setxattr) {
+                QUORUM_CBK(fop->cbks.setxattr, fop, frame, cookie, this, op_ret,
+                           op_errno, xdata);
+            }
+            break;
+        case GF_FOP_REMOVEXATTR:
+            if (fop->cbks.removexattr) {
+                QUORUM_CBK(fop->cbks.removexattr, fop, frame, cookie, this,
+                           op_ret, op_errno, xdata);
+            }
+            break;
+        case GF_FOP_FSETXATTR:
+            if (fop->cbks.fsetxattr) {
+                QUORUM_CBK(fop->cbks.fsetxattr, fop, frame, cookie, this,
+                           op_ret, op_errno, xdata);
+            }
+            break;
+        case GF_FOP_FREMOVEXATTR:
+            if (fop->cbks.fremovexattr) {
+                QUORUM_CBK(fop->cbks.fremovexattr, fop, frame, cookie, this,
+                           op_ret, op_errno, xdata);
+            }
+            break;
+    } /* no default case: any other fop id reports nothing */
+}
+
+int32_t
+ec_manager_xattr(ec_fop_data_t *fop, int32_t state)
+{
+    ec_cbk_data_t *cbk;
+    /* States for the fops that report through ec_xattr_cbk(). */
+    switch (state) {
+        case EC_STATE_INIT:
+        case EC_STATE_LOCK:
+            if (fop->fd == NULL) {
+                ec_lock_prepare_inode(fop, &fop->loc[0],
+                                      EC_UPDATE_META | EC_QUERY_INFO, 0,
+                                      EC_RANGE_FULL);
+            } else {
+                ec_lock_prepare_fd(fop, fop->fd, EC_UPDATE_META | EC_QUERY_INFO,
+                                   0, EC_RANGE_FULL);
+            }
+            ec_lock(fop);
+
+            return EC_STATE_DISPATCH;
+
+        case EC_STATE_DISPATCH:
+            ec_dispatch_all(fop);
+
+            return EC_STATE_PREPARE_ANSWER;
+
+        case EC_STATE_PREPARE_ANSWER:
+            ec_fop_prepare_answer(fop, _gf_false);
+            /* The return value is unused; REPORT reads fop->answer instead. */
+            return EC_STATE_REPORT;
+
+        case EC_STATE_REPORT:
+            cbk = fop->answer;
+
+            GF_ASSERT(cbk != NULL);
+
+            ec_xattr_cbk(fop->req_frame, fop, fop->xl, cbk->op_ret,
+                         cbk->op_errno, cbk->xdata);
+
+            return EC_STATE_LOCK_REUSE;
+        /* Failure: report fop->error without xdata. */
+        case -EC_STATE_INIT:
+        case -EC_STATE_LOCK:
+        case -EC_STATE_DISPATCH:
+        case -EC_STATE_PREPARE_ANSWER:
+        case -EC_STATE_REPORT:
+            GF_ASSERT(fop->error != 0);
+
+            ec_xattr_cbk(fop->req_frame, fop, fop->xl, -1, fop->error, NULL);
+
+            return EC_STATE_LOCK_REUSE;
+        /* Success and failure paths converge from here on. */
+        case -EC_STATE_LOCK_REUSE:
+        case EC_STATE_LOCK_REUSE:
+            ec_lock_reuse(fop);
+
+            return EC_STATE_UNLOCK;
+
+        case -EC_STATE_UNLOCK:
+        case EC_STATE_UNLOCK:
+            ec_unlock(fop);
+
+            return EC_STATE_END;
+
+        default:
+            gf_msg(fop->xl->name, GF_LOG_ERROR, EINVAL, EC_MSG_UNHANDLED_STATE,
+                   "Unhandled state %d for %s", state, ec_fop_name(fop->id));
+
+            return EC_STATE_END;
+    }
+}
+
+/* Start a removexattr fop for attribute 'name' on 'loc'. The name is
+ * duplicated with gf_strdup() and xdata goes through dict_copy_with_ref().
+ * When the fop cannot be allocated the error is reported directly, provided
+ * 'func' is not NULL (ec_xattr_cbk() applies the same check). */
+void
+ec_removexattr(call_frame_t *frame, xlator_t *this, uintptr_t target,
+               uint32_t fop_flags, fop_removexattr_cbk_t func, void *data,
+               loc_t *loc, const char *name, dict_t *xdata)
+{
+    ec_cbk_t callback = {.removexattr = func};
+    ec_fop_data_t *fop = NULL;
+    int32_t error = ENOMEM;
+
+    gf_msg_trace("ec", 0, "EC(REMOVEXATTR) %p", frame);
+
+    VALIDATE_OR_GOTO(this, out);
+    GF_VALIDATE_OR_GOTO(this->name, frame, out);
+    GF_VALIDATE_OR_GOTO(this->name, this->private, out);
+
+    fop = ec_fop_data_allocate(frame, this, GF_FOP_REMOVEXATTR, 0, target,
+                               fop_flags, ec_wind_removexattr, ec_manager_xattr,
+                               callback, data);
+    if (fop == NULL) {
+        goto out;
+    }
+
+    if (loc != NULL) {
+        if (loc_copy(&fop->loc[0], loc) != 0) {
+            gf_msg(this->name, GF_LOG_ERROR, ENOMEM, EC_MSG_LOC_COPY_FAIL,
+                   "Failed to copy a location.");
+            goto out;
+        }
+    }
+    if (name != NULL) {
+        fop->str[0] = gf_strdup(name);
+        if (fop->str[0] == NULL) {
+            gf_msg(this->name, GF_LOG_ERROR, ENOMEM, EC_MSG_NO_MEMORY,
+                   "Failed to duplicate a string.");
+            goto out;
+        }
+    }
+    if (xdata != NULL) {
+        fop->xdata = dict_copy_with_ref(xdata, NULL);
+        if (fop->xdata == NULL) {
+            gf_msg(this->name, GF_LOG_ERROR, 0, EC_MSG_DICT_REF_FAIL,
+                   "Failed to reference a dictionary.");
+            goto out;
+        }
+    }
+
+    error = 0;
+
+out:
+    if (fop != NULL) {
+        ec_manager(fop, error);
+    } else if (func != NULL) {
+        func(frame, NULL, this, -1, error, NULL);
+    }
+}
+
+/* FOP: fremovexattr */
+/* Reply handler wound by ec_wind_fremovexattr(); passes NULL for both iatts. */
+int32_t
+ec_fremovexattr_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+                    int32_t op_ret, int32_t op_errno, dict_t *xdata)
+{
+    return ec_inode_write_cbk(frame, this, cookie, op_ret, op_errno, NULL, NULL,
+                              xdata);
+}
+
+void
+ec_wind_fremovexattr(ec_t *ec, ec_fop_data_t *fop, int32_t idx)
+{
+    ec_trace("WIND", fop, "idx=%d", idx);
+    /* Wind fremovexattr to ec->xl_list[idx]; idx is passed as the cookie. */
+    STACK_WIND_COOKIE(fop->frame, ec_fremovexattr_cbk, (void *)(uintptr_t)idx,
+                      ec->xl_list[idx], ec->xl_list[idx]->fops->fremovexattr,
+                      fop->fd, fop->str[0], fop->xdata);
+}
+
+/* Start an fremovexattr fop for attribute 'name' on 'fd'. The fd is
+ * referenced with fd_ref(), the name is duplicated with gf_strdup() and
+ * xdata goes through dict_copy_with_ref(). When the fop cannot be
+ * allocated the error is reported directly, provided 'func' is not NULL
+ * (ec_xattr_cbk() applies the same check). */
+void
+ec_fremovexattr(call_frame_t *frame, xlator_t *this, uintptr_t target,
+                uint32_t fop_flags, fop_fremovexattr_cbk_t func, void *data,
+                fd_t *fd, const char *name, dict_t *xdata)
+{
+    ec_cbk_t callback = {.fremovexattr = func};
+    ec_fop_data_t *fop = NULL;
+    int32_t error = ENOMEM;
+
+    gf_msg_trace("ec", 0, "EC(FREMOVEXATTR) %p", frame);
+
+    VALIDATE_OR_GOTO(this, out);
+    GF_VALIDATE_OR_GOTO(this->name, frame, out);
+    GF_VALIDATE_OR_GOTO(this->name, this->private, out);
+
+    fop = ec_fop_data_allocate(frame, this, GF_FOP_FREMOVEXATTR, 0, target,
+                               fop_flags, ec_wind_fremovexattr,
+                               ec_manager_xattr, callback, data);
+    if (fop == NULL) {
+        goto out;
+    }
+
+    fop->use_fd = 1;
+
+    if (fd != NULL) {
+        fop->fd = fd_ref(fd);
+        if (fop->fd == NULL) {
+            gf_msg(this->name, GF_LOG_ERROR, 0, EC_MSG_FILE_DESC_REF_FAIL,
+                   "Failed to reference a file descriptor.");
+            goto out;
+        }
+    }
+    if (name != NULL) {
+        fop->str[0] = gf_strdup(name);
+        if (fop->str[0] == NULL) {
+            gf_msg(this->name, GF_LOG_ERROR, ENOMEM, EC_MSG_NO_MEMORY,
+                   "Failed to duplicate a string.");
+            goto out;
+        }
+    }
+    if (xdata != NULL) {
+        fop->xdata = dict_copy_with_ref(xdata, NULL);
+        if (fop->xdata == NULL) {
+            gf_msg(this->name, GF_LOG_ERROR, 0, EC_MSG_DICT_REF_FAIL,
+                   "Failed to reference a dictionary.");
+            goto out;
+        }
+    }
+
+    error = 0;
+
+out:
+    if (fop != NULL) {
+        ec_manager(fop, error);
+    } else if (func != NULL) {
+        func(frame, NULL, this, -1, error, NULL);
+    }
+}
+
+/* FOP: setattr */
+/* Reply handler wound by ec_wind_setattr(); forwards both iatts unchanged. */
+int32_t
+ec_setattr_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+               int32_t op_ret, int32_t op_errno, struct iatt *prestat,
+               struct iatt *poststat, dict_t *xdata)
+{
+    return ec_inode_write_cbk(frame, this, cookie, op_ret, op_errno, prestat,
+                              poststat, xdata);
+}
+
+void
+ec_wind_setattr(ec_t *ec, ec_fop_data_t *fop, int32_t idx)
+{
+    ec_trace("WIND", fop, "idx=%d", idx);
+    /* Wind setattr to ec->xl_list[idx]; idx is passed as the cookie. */
+    STACK_WIND_COOKIE(fop->frame, ec_setattr_cbk, (void *)(uintptr_t)idx,
+                      ec->xl_list[idx], ec->xl_list[idx]->fops->setattr,
+                      &fop->loc[0], &fop->iatt, fop->int32, fop->xdata);
+}
+
+int32_t
+ec_manager_setattr(ec_fop_data_t *fop, int32_t state)
+{
+    ec_cbk_data_t *cbk;
+    /* Shared by ec_setattr() and ec_fsetattr(); returns the next state. */
+    switch (state) {
+        case EC_STATE_INIT:
+        case EC_STATE_LOCK:
+            if (fop->fd == NULL) {
+                ec_lock_prepare_inode(fop, &fop->loc[0],
+                                      EC_UPDATE_META | EC_QUERY_INFO, 0,
+                                      EC_RANGE_FULL);
+            } else {
+                ec_lock_prepare_fd(fop, fop->fd, EC_UPDATE_META | EC_QUERY_INFO,
+                                   0, EC_RANGE_FULL);
+            }
+            ec_lock(fop);
+
+            return EC_STATE_DISPATCH;
+
+        case EC_STATE_DISPATCH:
+            ec_dispatch_all(fop);
+
+            return EC_STATE_PREPARE_ANSWER;
+
+        case EC_STATE_PREPARE_ANSWER:
+            cbk = ec_fop_prepare_answer(fop, _gf_false);
+            if (cbk != NULL) {
+                if (cbk->iatt[0].ia_type == IA_IFREG) {
+                    ec_iatt_rebuild(fop->xl->private, cbk->iatt, 2, cbk->count);
+                    /* NOTE(review): GF_ASSERT() must evaluate its argument. */
+                    /* This shouldn't fail because we have the inode locked. */
+                    GF_ASSERT(ec_get_inode_size(fop,
+                                                fop->locks[0].lock->loc.inode,
+                                                &cbk->iatt[0].ia_size));
+                    cbk->iatt[1].ia_size = cbk->iatt[0].ia_size;
+                }
+            }
+
+            return EC_STATE_REPORT;
+
+        case EC_STATE_REPORT:
+            cbk = fop->answer;
+
+            GF_ASSERT(cbk != NULL);
+
+            if (fop->id == GF_FOP_SETATTR) {
+                if (fop->cbks.setattr != NULL) {
+                    QUORUM_CBK(fop->cbks.setattr, fop, fop->req_frame, fop,
+                               fop->xl, cbk->op_ret, cbk->op_errno,
+                               &cbk->iatt[0], &cbk->iatt[1], cbk->xdata);
+                }
+            } else {
+                if (fop->cbks.fsetattr != NULL) {
+                    QUORUM_CBK(fop->cbks.fsetattr, fop, fop->req_frame, fop,
+                               fop->xl, cbk->op_ret, cbk->op_errno,
+                               &cbk->iatt[0], &cbk->iatt[1], cbk->xdata);
+                }
+            }
+
+            return EC_STATE_LOCK_REUSE;
+        /* Failure: callbacks are called directly, not through QUORUM_CBK. */
+        case -EC_STATE_INIT:
+        case -EC_STATE_LOCK:
+        case -EC_STATE_DISPATCH:
+        case -EC_STATE_PREPARE_ANSWER:
+        case -EC_STATE_REPORT:
+            GF_ASSERT(fop->error != 0);
+
+            if (fop->id == GF_FOP_SETATTR) {
+                if (fop->cbks.setattr != NULL) {
+                    fop->cbks.setattr(fop->req_frame, fop, fop->xl, -1,
+                                      fop->error, NULL, NULL, NULL);
+                }
+            } else {
+                if (fop->cbks.fsetattr != NULL) {
+                    fop->cbks.fsetattr(fop->req_frame, fop, fop->xl, -1,
+                                       fop->error, NULL, NULL, NULL);
+                }
+            }
+
+            return EC_STATE_LOCK_REUSE;
+        /* Success and failure paths converge from here on. */
+        case -EC_STATE_LOCK_REUSE:
+        case EC_STATE_LOCK_REUSE:
+            ec_lock_reuse(fop);
+
+            return EC_STATE_UNLOCK;
+
+        case -EC_STATE_UNLOCK:
+        case EC_STATE_UNLOCK:
+            ec_unlock(fop);
+
+            return EC_STATE_END;
+
+        default:
+            gf_msg(fop->xl->name, GF_LOG_ERROR, EINVAL, EC_MSG_UNHANDLED_STATE,
+                   "Unhandled state %d for %s", state, ec_fop_name(fop->id));
+
+            return EC_STATE_END;
+    }
+}
+
+/* Start a setattr fop on 'loc' with attributes 'stbuf' and mask 'valid'.
+ * When the fop cannot be allocated the error is reported directly, provided
+ * 'func' is not NULL (ec_manager_setattr() tolerates a NULL callback). */
+void
+ec_setattr(call_frame_t *frame, xlator_t *this, uintptr_t target,
+           uint32_t fop_flags, fop_setattr_cbk_t func, void *data, loc_t *loc,
+           struct iatt *stbuf, int32_t valid, dict_t *xdata)
+{
+    ec_cbk_t callback = {.setattr = func};
+    ec_fop_data_t *fop = NULL;
+    int32_t error = ENOMEM;
+
+    gf_msg_trace("ec", 0, "EC(SETATTR) %p", frame);
+
+    VALIDATE_OR_GOTO(this, out);
+    GF_VALIDATE_OR_GOTO(this->name, frame, out);
+    GF_VALIDATE_OR_GOTO(this->name, this->private, out);
+
+    fop = ec_fop_data_allocate(frame, this, GF_FOP_SETATTR, 0, target,
+                               fop_flags, ec_wind_setattr, ec_manager_setattr,
+                               callback, data);
+    if (fop == NULL) {
+        goto out;
+    }
+
+    fop->int32 = valid;
+
+    if (loc != NULL) {
+        if (loc_copy(&fop->loc[0], loc) != 0) {
+            gf_msg(this->name, GF_LOG_ERROR, ENOMEM, EC_MSG_LOC_COPY_FAIL,
+                   "Failed to copy a location.");
+            goto out;
+        }
+    }
+    if (stbuf != NULL) {
+        fop->iatt = *stbuf;
+    }
+    if (xdata != NULL) {
+        fop->xdata = dict_copy_with_ref(xdata, NULL);
+        if (fop->xdata == NULL) {
+            gf_msg(this->name, GF_LOG_ERROR, 0, EC_MSG_DICT_REF_FAIL,
+                   "Failed to reference a dictionary.");
+            goto out;
+        }
+    }
+
+    error = 0;
+
+out:
+    if (fop != NULL) {
+        ec_manager(fop, error);
+    } else if (func != NULL) {
+        func(frame, NULL, this, -1, error, NULL, NULL, NULL);
+    }
+}
+
+/* FOP: fsetattr */
+/* Reply handler wound by ec_wind_fsetattr(); forwards both iatts unchanged. */
+int32_t
+ec_fsetattr_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+                int32_t op_ret, int32_t op_errno, struct iatt *prestat,
+                struct iatt *poststat, dict_t *xdata)
+{
+    return ec_inode_write_cbk(frame, this, cookie, op_ret, op_errno, prestat,
+                              poststat, xdata);
+}
+
+void
+ec_wind_fsetattr(ec_t *ec, ec_fop_data_t *fop, int32_t idx)
+{
+    ec_trace("WIND", fop, "idx=%d", idx);
+    /* Wind fsetattr to ec->xl_list[idx]; idx is passed as the cookie. */
+    STACK_WIND_COOKIE(fop->frame, ec_fsetattr_cbk, (void *)(uintptr_t)idx,
+                      ec->xl_list[idx], ec->xl_list[idx]->fops->fsetattr,
+                      fop->fd, &fop->iatt, fop->int32, fop->xdata);
+}
+
+/* Start an fsetattr fop on 'fd' with attributes 'stbuf' and mask 'valid';
+ * it is driven by ec_manager_setattr(). When the fop cannot be allocated
+ * the error is reported directly, provided 'func' is not NULL (the manager
+ * already tolerates a NULL callback). */
+void
+ec_fsetattr(call_frame_t *frame, xlator_t *this, uintptr_t target,
+            uint32_t fop_flags, fop_fsetattr_cbk_t func, void *data, fd_t *fd,
+            struct iatt *stbuf, int32_t valid, dict_t *xdata)
+{
+    ec_cbk_t callback = {.fsetattr = func};
+    ec_fop_data_t *fop = NULL;
+    int32_t error = ENOMEM;
+
+    gf_msg_trace("ec", 0, "EC(FSETATTR) %p", frame);
+
+    VALIDATE_OR_GOTO(this, out);
+    GF_VALIDATE_OR_GOTO(this->name, frame, out);
+    GF_VALIDATE_OR_GOTO(this->name, this->private, out);
+
+    fop = ec_fop_data_allocate(frame, this, GF_FOP_FSETATTR, 0, target,
+                               fop_flags, ec_wind_fsetattr, ec_manager_setattr,
+                               callback, data);
+    if (fop == NULL) {
+        goto out;
+    }
+
+    fop->use_fd = 1;
+
+    fop->int32 = valid;
+
+    if (fd != NULL) {
+        fop->fd = fd_ref(fd);
+        if (fop->fd == NULL) {
+            gf_msg(this->name, GF_LOG_ERROR, 0, EC_MSG_FILE_DESC_REF_FAIL,
+                   "Failed to reference a file descriptor.");
+            goto out;
+        }
+    }
+    if (stbuf != NULL) {
+        fop->iatt = *stbuf;
+    }
+    if (xdata != NULL) {
+        fop->xdata = dict_copy_with_ref(xdata, NULL);
+        if (fop->xdata == NULL) {
+            gf_msg(this->name, GF_LOG_ERROR, 0, EC_MSG_DICT_REF_FAIL,
+                   "Failed to reference a dictionary.");
+            goto out;
+        }
+    }
+
+    error = 0;
+
+out:
+    if (fop != NULL) {
+        ec_manager(fop, error);
+    } else if (func != NULL) {
+        func(frame, NULL, this, -1, error, NULL, NULL, NULL);
+    }
+}
+
+/* FOP: setxattr */
+/* Reply handler wound by ec_wind_setxattr(); passes NULL for both iatts. */
+int32_t
+ec_setxattr_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+                int32_t op_ret, int32_t op_errno, dict_t *xdata)
+{
+    return ec_inode_write_cbk(frame, this, cookie, op_ret, op_errno, NULL, NULL,
+                              xdata);
+}
+
+void
+ec_wind_setxattr(ec_t *ec, ec_fop_data_t *fop, int32_t idx)
+{
+    ec_trace("WIND", fop, "idx=%d", idx);
+    /* Wind setxattr to ec->xl_list[idx]; idx is passed as the cookie. */
+    STACK_WIND_COOKIE(fop->frame, ec_setxattr_cbk, (void *)(uintptr_t)idx,
+                      ec->xl_list[idx], ec->xl_list[idx]->fops->setxattr,
+                      &fop->loc[0], fop->dict, fop->int32, fop->xdata);
+}
+
+/* Start a setxattr fop on 'loc' that applies 'dict' with 'flags'. Both
+ * 'dict' and 'xdata' go through dict_copy_with_ref() before being stored
+ * in the fop. When the fop cannot be allocated the error is reported
+ * directly, provided 'func' is not NULL (ec_xattr_cbk() applies the same
+ * check). */
+void
+ec_setxattr(call_frame_t *frame, xlator_t *this, uintptr_t target,
+            uint32_t fop_flags, fop_setxattr_cbk_t func, void *data, loc_t *loc,
+            dict_t *dict, int32_t flags, dict_t *xdata)
+{
+    ec_cbk_t callback = {.setxattr = func};
+    ec_fop_data_t *fop = NULL;
+    int32_t error = ENOMEM;
+
+    gf_msg_trace("ec", 0, "EC(SETXATTR) %p", frame);
+
+    VALIDATE_OR_GOTO(this, out);
+    GF_VALIDATE_OR_GOTO(this->name, frame, out);
+    GF_VALIDATE_OR_GOTO(this->name, this->private, out);
+
+    fop = ec_fop_data_allocate(frame, this, GF_FOP_SETXATTR, 0, target,
+                               fop_flags, ec_wind_setxattr, ec_manager_xattr,
+                               callback, data);
+    if (fop == NULL) {
+        goto out;
+    }
+
+    fop->int32 = flags;
+
+    if (loc != NULL) {
+        if (loc_copy(&fop->loc[0], loc) != 0) {
+            gf_msg(this->name, GF_LOG_ERROR, ENOMEM, EC_MSG_LOC_COPY_FAIL,
+                   "Failed to copy a location.");
+            goto out;
+        }
+    }
+    if (dict != NULL) {
+        fop->dict = dict_copy_with_ref(dict, NULL);
+        if (fop->dict == NULL) {
+            gf_msg(this->name, GF_LOG_ERROR, 0, EC_MSG_DICT_REF_FAIL,
+                   "Failed to reference a dictionary.");
+            goto out;
+        }
+    }
+    if (xdata != NULL) {
+        fop->xdata = dict_copy_with_ref(xdata, NULL);
+        if (fop->xdata == NULL) {
+            gf_msg(this->name, GF_LOG_ERROR, 0, EC_MSG_DICT_REF_FAIL,
+                   "Failed to reference a dictionary.");
+            goto out;
+        }
+    }
+
+    error = 0;
+
+out:
+    if (fop != NULL) {
+        ec_manager(fop, error);
+    } else if (func != NULL) {
+        func(frame, NULL, this, -1, error, NULL);
+    }
+}
+
+/* FOP: fsetxattr */
+
+int32_t
+ec_fsetxattr_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+                 int32_t op_ret, int32_t op_errno, dict_t *xdata)
+{
+    /* Per-subvolume fsetxattr reply. It does not use ec_inode_write_cbk():
+     * the answer is offered to ec_combine() with a NULL combine function. */
+    int32_t idx = (int32_t)(uintptr_t)cookie;
+    ec_cbk_data_t *answer = NULL;
+    ec_fop_data_t *fop = NULL;
+
+    VALIDATE_OR_GOTO(this, out);
+    GF_VALIDATE_OR_GOTO(this->name, frame, out);
+    GF_VALIDATE_OR_GOTO(this->name, frame->local, out);
+    GF_VALIDATE_OR_GOTO(this->name, this->private, out);
+
+    fop = frame->local;
+
+    ec_trace("CBK", fop, "idx=%d, frame=%p, op_ret=%d, op_errno=%d", idx, frame,
+             op_ret, op_errno);
+
+    answer = ec_cbk_data_allocate(frame, this, fop, GF_FOP_FSETXATTR, idx,
+                                  op_ret, op_errno);
+    if (answer == NULL) {
+        goto out;
+    }
+    if (xdata != NULL) {
+        answer->xdata = dict_ref(xdata);
+        if (answer->xdata == NULL) {
+            gf_msg(this->name, GF_LOG_ERROR, 0, EC_MSG_DICT_REF_FAIL,
+                   "Failed to reference a dictionary.");
+            goto out;
+        }
+    }
+    ec_combine(answer, NULL);
+
+out:
+    if (fop != NULL) {
+        ec_complete(fop);
+    }
+
+    return 0;
+}
+
+/* Send the FSETXATTR request described by 'fop' to the subvolume stored at
+ * position 'idx' of ec->xl_list. The index is passed as the cookie, which is
+ * how ec_fsetxattr_cbk() identifies the subvolume that answered. */
+void
+ec_wind_fsetxattr(ec_t *ec, ec_fop_data_t *fop, int32_t idx)
+{
+    ec_trace("WIND", fop, "idx=%d", idx);
+
+    STACK_WIND_COOKIE(fop->frame, ec_fsetxattr_cbk, (void *)(uintptr_t)idx,
+                      ec->xl_list[idx], ec->xl_list[idx]->fops->fsetxattr,
+                      fop->fd, fop->dict, fop->int32, fop->xdata);
+}
+
+/* Start an FSETXATTR operation on the open file 'fd'.
+ *
+ * A fop descriptor is allocated, marked as fd based and filled with 'flags',
+ * a reference to 'fd' and copies of 'dict' and 'xdata' (each one only when
+ * it is not NULL). The descriptor is then handed to ec_manager() together
+ * with 0 when everything was prepared, or ENOMEM otherwise. If no descriptor
+ * could be obtained, 'func' is called directly with op_ret -1 and op_errno
+ * ENOMEM. */
+void
+ec_fsetxattr(call_frame_t *frame, xlator_t *this, uintptr_t target,
+             uint32_t fop_flags, fop_fsetxattr_cbk_t func, void *data, fd_t *fd,
+             dict_t *dict, int32_t flags, dict_t *xdata)
+{
+    ec_fop_data_t *op = NULL;
+    ec_cbk_t cbks = {.fsetxattr = func};
+    int32_t err = ENOMEM;
+
+    gf_msg_trace("ec", 0, "EC(FSETXATTR) %p", frame);
+
+    VALIDATE_OR_GOTO(this, out);
+    GF_VALIDATE_OR_GOTO(this->name, frame, out);
+    GF_VALIDATE_OR_GOTO(this->name, this->private, out);
+
+    op = ec_fop_data_allocate(frame, this, GF_FOP_FSETXATTR, 0, target,
+                              fop_flags, ec_wind_fsetxattr, ec_manager_xattr,
+                              cbks, data);
+    if (op == NULL) {
+        goto out;
+    }
+
+    op->use_fd = 1;
+    op->int32 = flags;
+
+    /* Each optional argument is referenced/copied only when supplied. */
+    if (fd != NULL) {
+        op->fd = fd_ref(fd);
+        if (!op->fd) {
+            gf_msg(this->name, GF_LOG_ERROR, 0, EC_MSG_FILE_DESC_REF_FAIL,
+                   "Failed to reference a "
+                   "file descriptor.");
+
+            goto out;
+        }
+    }
+    if (dict != NULL) {
+        op->dict = dict_copy_with_ref(dict, NULL);
+        if (!op->dict) {
+            gf_msg(this->name, GF_LOG_ERROR, 0, EC_MSG_DICT_REF_FAIL,
+                   "Failed to reference a "
+                   "dictionary.");
+
+            goto out;
+        }
+    }
+    if (xdata != NULL) {
+        op->xdata = dict_copy_with_ref(xdata, NULL);
+        if (!op->xdata) {
+            gf_msg(this->name, GF_LOG_ERROR, 0, EC_MSG_DICT_REF_FAIL,
+                   "Failed to reference a "
+                   "dictionary.");
+
+            goto out;
+        }
+    }
+
+    err = 0;
+
+out:
+    if (op == NULL) {
+        /* Nothing was allocated: report the failure straight to the caller. */
+        func(frame, NULL, this, -1, err, NULL);
+        return;
+    }
+
+    ec_manager(op, err);
+}
+
+/*********************************************************************
+ *
+ * File Operation : fallocate
+ *
+ *********************************************************************/
+
+/* Callback for the FALLOCATE request sent to a single subvolume. All the
+ * arguments are forwarded unchanged to ec_inode_write_cbk(), whose result is
+ * returned. */
+int32_t
+ec_fallocate_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+                 int32_t op_ret, int32_t op_errno, struct iatt *prebuf,
+                 struct iatt *postbuf, dict_t *xdata)
+{
+    int32_t ret;
+
+    ret = ec_inode_write_cbk(frame, this, cookie, op_ret, op_errno, prebuf,
+                             postbuf, xdata);
+
+    return ret;
+}
+
+/* Send the FALLOCATE request described by 'fop' to the subvolume stored at
+ * position 'idx' of ec->xl_list. The mode is taken from fop->int32 and the
+ * range from fop->offset / fop->size; 'idx' is passed as the cookie that
+ * ec_fallocate_cbk() will receive. */
+void
+ec_wind_fallocate(ec_t *ec, ec_fop_data_t *fop, int32_t idx)
+{
+    ec_trace("WIND", fop, "idx=%d", idx);
+
+    STACK_WIND_COOKIE(fop->frame, ec_fallocate_cbk, (void *)(uintptr_t)idx,
+                      ec->xl_list[idx], ec->xl_list[idx]->fops->fallocate,
+                      fop->fd, fop->int32, fop->offset, fop->size, fop->xdata);
+}
+
+/* State machine of the FALLOCATE operation.
+ *
+ * 'state' is the state to process; the returned value is the next state.
+ * Positive states form the normal path (INIT -> LOCK -> DISPATCH ->
+ * PREPARE_ANSWER -> REPORT -> LOCK_REUSE -> UNLOCK -> END). The negative
+ * counterparts of INIT..REPORT form the error path: they report fop->error
+ * to the caller and then continue with the same lock release sequence. */
+int32_t
+ec_manager_fallocate(ec_fop_data_t *fop, int32_t state)
+{
+    ec_cbk_data_t *cbk = NULL;
+
+    switch (state) {
+        case EC_STATE_INIT:
+            /* A zero length request is rejected. */
+            if (fop->size == 0) {
+                ec_fop_set_error(fop, EINVAL);
+                return EC_STATE_REPORT;
+            }
+            /* fop->int32 holds the fallocate mode. These mode flags are
+             * rejected with ENOTSUP. */
+            if (fop->int32 &
+                (FALLOC_FL_COLLAPSE_RANGE | FALLOC_FL_INSERT_RANGE |
+                 FALLOC_FL_ZERO_RANGE | FALLOC_FL_PUNCH_HOLE)) {
+                ec_fop_set_error(fop, ENOTSUP);
+                return EC_STATE_REPORT;
+            }
+            /* Remember the end of the range requested by the user before
+             * offset and size are adjusted (offset down, size up). */
+            fop->user_size = fop->offset + fop->size;
+            fop->head = ec_adjust_offset_down(fop->xl->private, &fop->offset,
+                                              _gf_true);
+            fop->size += fop->head;
+            ec_adjust_size_up(fop->xl->private, &fop->size, _gf_true);
+
+            /* Fall through */
+
+        case EC_STATE_LOCK:
+            /* Lock the adjusted range [offset, offset + size) on the fd. */
+            ec_lock_prepare_fd(fop, fop->fd,
+                               EC_UPDATE_DATA | EC_UPDATE_META | EC_QUERY_INFO,
+                               fop->offset, fop->size);
+            ec_lock(fop);
+
+            return EC_STATE_DISPATCH;
+
+        case EC_STATE_DISPATCH:
+
+            ec_dispatch_all(fop);
+
+            return EC_STATE_PREPARE_ANSWER;
+
+        case EC_STATE_PREPARE_ANSWER:
+            cbk = ec_fop_prepare_answer(fop, _gf_false);
+            if (cbk != NULL) {
+                ec_iatt_rebuild(fop->xl->private, cbk->iatt, 2, cbk->count);
+
+                /* This shouldn't fail because we have the inode locked. */
+                LOCK(&fop->locks[0].lock->loc.inode->lock);
+                {
+                    /* iatt[0] (pre-op) gets the size currently recorded for
+                     * the inode. */
+                    GF_ASSERT(__ec_get_inode_size(fop,
+                                                  fop->locks[0].lock->loc.inode,
+                                                  &cbk->iatt[0].ia_size));
+
+                    /* With FALLOC_FL_KEEP_SIZE the post-op size equals the
+                     * pre-op size. */
+                    if (fop->int32 & FALLOC_FL_KEEP_SIZE) {
+                        cbk->iatt[1].ia_size = cbk->iatt[0].ia_size;
+                    } else if (fop->user_size > cbk->iatt[0].ia_size) {
+                        /* The requested range ends beyond the current size:
+                         * the file grows up to the requested end. */
+                        cbk->iatt[1].ia_size = fop->user_size;
+
+                        /* This shouldn't fail because we have the inode
+                         * locked. */
+                        GF_ASSERT(__ec_set_inode_size(
+                            fop, fop->locks[0].lock->loc.inode,
+                            cbk->iatt[1].ia_size));
+                    } else {
+                        /* The range is inside the file: size is unchanged. */
+                        cbk->iatt[1].ia_size = cbk->iatt[0].ia_size;
+                    }
+                }
+                UNLOCK(&fop->locks[0].lock->loc.inode->lock);
+            }
+
+            return EC_STATE_REPORT;
+
+        case EC_STATE_REPORT:
+            /* Success path: deliver the combined answer to the caller. */
+            cbk = fop->answer;
+
+            GF_ASSERT(cbk != NULL);
+
+            if (fop->cbks.fallocate != NULL) {
+                QUORUM_CBK(fop->cbks.fallocate, fop, fop->req_frame, fop,
+                           fop->xl, cbk->op_ret, cbk->op_errno, &cbk->iatt[0],
+                           &cbk->iatt[1], cbk->xdata);
+            }
+
+            return EC_STATE_LOCK_REUSE;
+
+        case -EC_STATE_INIT:
+        case -EC_STATE_LOCK:
+        case -EC_STATE_DISPATCH:
+        case -EC_STATE_PREPARE_ANSWER:
+        case -EC_STATE_REPORT:
+            /* Error path: report fop->error with op_ret -1 and no data. */
+            GF_ASSERT(fop->error != 0);
+
+            if (fop->cbks.fallocate != NULL) {
+                fop->cbks.fallocate(fop->req_frame, fop, fop->xl, -1,
+                                    fop->error, NULL, NULL, NULL);
+            }
+
+            return EC_STATE_LOCK_REUSE;
+
+        case -EC_STATE_LOCK_REUSE:
+        case EC_STATE_LOCK_REUSE:
+            ec_lock_reuse(fop);
+
+            return EC_STATE_UNLOCK;
+
+        case -EC_STATE_UNLOCK:
+        case EC_STATE_UNLOCK:
+            ec_unlock(fop);
+
+            return EC_STATE_END;
+
+        default:
+            gf_msg(fop->xl->name, GF_LOG_ERROR, EINVAL, EC_MSG_UNHANDLED_STATE,
+                   "Unhandled state %d for %s", state, ec_fop_name(fop->id));
+
+            return EC_STATE_END;
+    }
+}
+
+/* Start a FALLOCATE operation on the open file 'fd'.
+ *
+ * A fop descriptor is allocated, marked as fd based and filled with 'mode',
+ * 'offset' and 'len', plus references to 'fd' and 'xdata' (each one only
+ * when it is not NULL). The descriptor is then handed to ec_manager()
+ * together with 0 when everything was prepared, or ENOMEM otherwise. If no
+ * descriptor could be obtained, 'func' is called directly with op_ret -1 and
+ * op_errno ENOMEM. */
+void
+ec_fallocate(call_frame_t *frame, xlator_t *this, uintptr_t target,
+             uint32_t fop_flags, fop_fallocate_cbk_t func, void *data, fd_t *fd,
+             int32_t mode, off_t offset, size_t len, dict_t *xdata)
+{
+    ec_fop_data_t *op = NULL;
+    ec_cbk_t cbks = {.fallocate = func};
+    int32_t err = ENOMEM;
+
+    gf_msg_trace("ec", 0, "EC(FALLOCATE) %p", frame);
+
+    VALIDATE_OR_GOTO(this, out);
+    GF_VALIDATE_OR_GOTO(this->name, frame, out);
+    GF_VALIDATE_OR_GOTO(this->name, this->private, out);
+
+    op = ec_fop_data_allocate(frame, this, GF_FOP_FALLOCATE, 0, target,
+                              fop_flags, ec_wind_fallocate,
+                              ec_manager_fallocate, cbks, data);
+    if (op == NULL) {
+        goto out;
+    }
+
+    op->use_fd = 1;
+    op->size = len;
+    op->offset = offset;
+    op->int32 = mode;
+
+    if (fd != NULL) {
+        op->fd = fd_ref(fd);
+        if (!op->fd) {
+            gf_msg(this->name, GF_LOG_ERROR, 0, EC_MSG_FILE_DESC_REF_FAIL,
+                   "Failed to reference a "
+                   "file descriptor.");
+            goto out;
+        }
+    }
+
+    if (xdata != NULL) {
+        op->xdata = dict_ref(xdata);
+        if (!op->xdata) {
+            gf_msg(this->name, GF_LOG_ERROR, 0, EC_MSG_DICT_REF_FAIL,
+                   "Failed to reference a "
+                   "dictionary.");
+            goto out;
+        }
+    }
+
+    err = 0;
+
+out:
+    if (op == NULL) {
+        /* Nothing was allocated: report the failure straight to the caller. */
+        func(frame, NULL, this, -1, err, NULL, NULL, NULL);
+        return;
+    }
+
+    ec_manager(op, err);
+}
+
+/*********************************************************************
+ *
+ * File Operation : Discard
+ *
+ *********************************************************************/
+/* Issue, through ec_update_write(), the writes that complement a discard on
+ * the subvolumes selected by 'mask'.
+ *
+ * The start of the user range is rebuilt as
+ * fop->offset * ec->fragments - fop->int32. When fop->size is 0 a single
+ * write covers the whole user range (fop->user_size bytes). Otherwise up to
+ * two writes are issued: one for the head (fop->int32 bytes at the start)
+ * and one for the tail (the remainder of the range end modulo the stripe
+ * size). The tail write is skipped if the head write failed. Any failure is
+ * recorded in the fop with ec_fop_set_error(). */
+void
+ec_update_discard_write(ec_fop_data_t *fop, uintptr_t mask)
+{
+    ec_t *ec = fop->xl->private;
+    off_t head_off = fop->offset * ec->fragments - fop->int32;
+    off_t tail_off = 0;
+    uint64_t head_len = 0;
+    uint64_t tail_len = 0;
+    int err = 0;
+
+    if (fop->size == 0) {
+        err = ec_update_write(fop, mask, head_off, fop->user_size);
+    } else {
+        head_len = fop->int32;
+        tail_len = (head_off + fop->user_size) % ec->stripe_size;
+        tail_off = head_off + fop->user_size - tail_len;
+
+        if (head_len != 0) {
+            err = ec_update_write(fop, mask, head_off, head_len);
+        }
+        if ((err == 0) && (tail_len != 0)) {
+            err = ec_update_write(fop, mask, tail_off, tail_len);
+        }
+    }
+
+    if (err != 0) {
+        ec_fop_set_error(fop, -err);
+    }
+}
+
+/* Convert the user range of a discard into the range sent to the bricks.
+ *
+ * The original length is saved in fop->user_size, the offset is adjusted up
+ * and the value returned by that adjustment is kept in fop->int32. If the
+ * requested length reaches beyond that value, the remaining length is
+ * adjusted down and a real discard will be performed (fop->size != 0);
+ * otherwise fop->size becomes 0 and only zeros will be written.
+ * fop->frag_range is set to [offset, offset + size] using the adjusted
+ * values. */
+void
+ec_discard_adjust_offset_size(ec_fop_data_t *fop)
+{
+    ec_t *ec = fop->xl->private;
+
+    /* Keep the length requested by the caller before modifying it. */
+    fop->user_size = fop->size;
+
+    fop->int32 = ec_adjust_offset_up(ec, &fop->offset, _gf_true);
+    fop->frag_range.first = fop->offset;
+
+    if (fop->size >= fop->int32) {
+        fop->size -= fop->int32;
+        ec_adjust_size_down(ec, &fop->size, _gf_true);
+    } else {
+        /* Nothing left after the head: no discard will be sent. */
+        fop->size = 0;
+    }
+
+    fop->frag_range.last = fop->offset + fop->size;
+}
+
+/* Callback for the DISCARD request sent to a single subvolume. All the
+ * arguments are forwarded unchanged to ec_inode_write_cbk(), whose result is
+ * returned. */
+int32_t
+ec_discard_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+               int32_t op_ret, int32_t op_errno, struct iatt *prebuf,
+               struct iatt *postbuf, dict_t *xdata)
+{
+    int32_t ret;
+
+    ret = ec_inode_write_cbk(frame, this, cookie, op_ret, op_errno, prebuf,
+                             postbuf, xdata);
+
+    return ret;
+}
+
+/* Send the DISCARD request described by 'fop' (fop->fd, fop->offset,
+ * fop->size) to the subvolume stored at position 'idx' of ec->xl_list. 'idx'
+ * is passed as the cookie that ec_discard_cbk() will receive. */
+void
+ec_wind_discard(ec_t *ec, ec_fop_data_t *fop, int32_t idx)
+{
+    ec_trace("WIND", fop, "idx=%d", idx);
+
+    STACK_WIND_COOKIE(fop->frame, ec_discard_cbk, (void *)(uintptr_t)idx,
+                      ec->xl_list[idx], ec->xl_list[idx]->fops->discard,
+                      fop->fd, fop->offset, fop->size, fop->xdata);
+}
+
+/* State machine of the DISCARD operation.
+ *
+ * 'state' is the state to process; the returned value is the next state.
+ * Positive states form the normal path (INIT -> LOCK -> DISPATCH ->
+ * DELAYED_START -> PREPARE_ANSWER -> REPORT -> LOCK_REUSE -> UNLOCK -> END).
+ * The negative counterparts of INIT..REPORT form the error path: they report
+ * fop->error to the caller and then continue with the same lock release
+ * sequence. */
+int32_t
+ec_manager_discard(ec_fop_data_t *fop, int32_t state)
+{
+    ec_cbk_data_t *cbk = NULL;
+    /* Range to lock. NOTE: these locals are only given real values in
+     * EC_STATE_INIT, so EC_STATE_LOCK relies on being reached through the
+     * fall-through from INIT. */
+    off_t fl_start = 0;
+    uint64_t fl_size = 0;
+
+    switch (state) {
+        case EC_STATE_INIT:
+            /* Reject empty requests and negative offsets. */
+            if ((fop->size <= 0) || (fop->offset < 0)) {
+                ec_fop_set_error(fop, EINVAL);
+                return EC_STATE_REPORT;
+            }
+            /* The discard itself only covers the region left after the
+             * head/tail writes, but the lock must cover the whole region,
+             * head and tail included, so the lock range is computed
+             * separately from the discard range. */
+            fl_start = fop->offset;
+            fl_size = fop->size;
+            fl_size += ec_adjust_offset_down(fop->xl->private, &fl_start,
+                                             _gf_true);
+            ec_adjust_size_up(fop->xl->private, &fl_size, _gf_true);
+
+            /* Compute the range actually sent to the bricks. */
+            ec_discard_adjust_offset_size(fop);
+
+            /* Fall through */
+
+        case EC_STATE_LOCK:
+            ec_lock_prepare_fd(fop, fop->fd,
+                               EC_UPDATE_DATA | EC_UPDATE_META | EC_QUERY_INFO,
+                               fl_start, fl_size);
+            ec_lock(fop);
+
+            return EC_STATE_DISPATCH;
+
+        case EC_STATE_DISPATCH:
+
+            /* Dispatch discard fop only if we have whole fragment
+             * to deallocate */
+            if (fop->size) {
+                ec_dispatch_all(fop);
+                return EC_STATE_DELAYED_START;
+            } else {
+                /* Assume discard to have succeeded on all bricks */
+                ec_succeed_all(fop);
+            }
+
+            /* Fall through */
+
+        case EC_STATE_DELAYED_START:
+
+            /* Issue the head/tail writes: on the subvolumes of the combined
+             * answer when a discard was sent and succeeded, or on fop->mask
+             * when no discard was sent at all. */
+            if (fop->size) {
+                if (fop->answer && fop->answer->op_ret == 0)
+                    ec_update_discard_write(fop, fop->answer->mask);
+            } else {
+                ec_update_discard_write(fop, fop->mask);
+            }
+
+            return EC_STATE_PREPARE_ANSWER;
+
+        case EC_STATE_PREPARE_ANSWER:
+            cbk = ec_fop_prepare_answer(fop, _gf_false);
+            if (cbk != NULL) {
+                ec_iatt_rebuild(fop->xl->private, cbk->iatt, 2, cbk->count);
+
+                /* This shouldn't fail because we have the inode locked. */
+                GF_ASSERT(ec_get_inode_size(fop, fop->locks[0].lock->loc.inode,
+                                            &cbk->iatt[0].ia_size));
+
+                /* The reported post-op size equals the pre-op size. */
+                cbk->iatt[1].ia_size = cbk->iatt[0].ia_size;
+            }
+            return EC_STATE_REPORT;
+
+        case EC_STATE_REPORT:
+            /* Success path: deliver the combined answer to the caller. */
+            cbk = fop->answer;
+
+            GF_ASSERT(cbk != NULL);
+
+            if (fop->cbks.discard != NULL) {
+                QUORUM_CBK(fop->cbks.discard, fop, fop->req_frame, fop, fop->xl,
+                           cbk->op_ret, cbk->op_errno, &cbk->iatt[0],
+                           &cbk->iatt[1], cbk->xdata);
+            }
+
+            return EC_STATE_LOCK_REUSE;
+
+        case -EC_STATE_INIT:
+        case -EC_STATE_LOCK:
+        case -EC_STATE_DISPATCH:
+        case -EC_STATE_DELAYED_START:
+        case -EC_STATE_PREPARE_ANSWER:
+        case -EC_STATE_REPORT:
+            /* Error path: report fop->error with op_ret -1 and no data. */
+            GF_ASSERT(fop->error != 0);
+
+            if (fop->cbks.discard != NULL) {
+                fop->cbks.discard(fop->req_frame, fop, fop->xl, -1, fop->error,
+                                  NULL, NULL, NULL);
+            }
+
+            return EC_STATE_LOCK_REUSE;
+
+        case -EC_STATE_LOCK_REUSE:
+        case EC_STATE_LOCK_REUSE:
+            ec_lock_reuse(fop);
+
+            return EC_STATE_UNLOCK;
+
+        case -EC_STATE_UNLOCK:
+        case EC_STATE_UNLOCK:
+            ec_unlock(fop);
+
+            return EC_STATE_END;
+
+        default:
+            gf_msg(fop->xl->name, GF_LOG_ERROR, EINVAL, EC_MSG_UNHANDLED_STATE,
+                   "Unhandled state %d for %s", state, ec_fop_name(fop->id));
+
+            return EC_STATE_END;
+    }
+}
+
+/* Start a DISCARD operation on the open file 'fd'.
+ *
+ * A fop descriptor is allocated, marked as fd based and filled with 'offset'
+ * and 'len', plus references to 'fd' and 'xdata' (each one only when it is
+ * not NULL). The descriptor is then handed to ec_manager() with error 0. If
+ * no descriptor could be obtained (argument validation or allocation
+ * failure), 'func' is called directly with op_ret -1 and op_errno ENOMEM. */
+void
+ec_discard(call_frame_t *frame, xlator_t *this, uintptr_t target,
+           uint32_t fop_flags, fop_discard_cbk_t func, void *data, fd_t *fd,
+           off_t offset, size_t len, dict_t *xdata)
+{
+    ec_fop_data_t *op = NULL;
+    ec_cbk_t cbks = {.discard = func};
+    int32_t err = ENOMEM;
+
+    gf_msg_trace("ec", 0, "EC(DISCARD) %p", frame);
+
+    VALIDATE_OR_GOTO(this, out);
+    GF_VALIDATE_OR_GOTO(this->name, frame, out);
+    GF_VALIDATE_OR_GOTO(this->name, this->private, out);
+
+    op = ec_fop_data_allocate(frame, this, GF_FOP_DISCARD, 0, target,
+                              fop_flags, ec_wind_discard, ec_manager_discard,
+                              cbks, data);
+    if (op == NULL) {
+        goto out;
+    }
+
+    op->use_fd = 1;
+    op->size = len;
+    op->offset = offset;
+
+    /* The results of the reference calls are stored without being checked. */
+    if (fd != NULL) {
+        op->fd = fd_ref(fd);
+    }
+
+    if (xdata != NULL) {
+        op->xdata = dict_ref(xdata);
+    }
+
+    err = 0;
+
+out:
+    if (op == NULL) {
+        /* Nothing was allocated: report the failure straight to the caller. */
+        func(frame, NULL, this, -1, err, NULL, NULL, NULL);
+        return;
+    }
+
+    ec_manager(op, err);
+}
+
+/*********************************************************************
+ *
+ * File Operation : truncate
+ *
+ *********************************************************************/
+
+/* Call ec_update_write() on the subvolumes selected by 'mask' for the region
+ * that starts at the size requested by the user (fop->user_size) and ends at
+ * fop->offset * ec->fragments. Returns whatever ec_update_write() returns. */
+int32_t
+ec_update_truncate_write(ec_fop_data_t *fop, uintptr_t mask)
+{
+    ec_t *ec = fop->xl->private;
+    uint64_t length;
+
+    length = fop->offset * ec->fragments - fop->user_size;
+
+    return ec_update_write(fop, mask, fop->user_size, length);
+}
+
+/* Callback of the open launched by ec_truncate_clean().
+ *
+ * 'cookie' is used as the fop that completed; its 'good' mask is merged into
+ * the parent fop. When the open succeeded, the fd is bound and the pending
+ * truncate write is issued on behalf of the parent; a failure of that write
+ * is recorded in the parent with ec_fop_set_error(). Always returns 0. */
+int32_t
+ec_truncate_open_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+                     int32_t op_ret, int32_t op_errno, fd_t *fd, dict_t *xdata)
+{
+    ec_fop_data_t *child = cookie;
+    int32_t ret;
+
+    child->parent->good &= child->good;
+
+    if (op_ret < 0) {
+        return 0;
+    }
+
+    fd_bind(fd);
+
+    ret = ec_update_truncate_write(child->parent, child->answer->mask);
+    if (ret != 0) {
+        ec_fop_set_error(child->parent, -ret);
+    }
+
+    return 0;
+}
+
+/* Trigger the write that completes a truncate.
+ *
+ * When the fop already has an fd, the write is issued immediately and its
+ * result is returned. Otherwise an fd is created for the inode and an open
+ * is launched with ec_truncate_open_cbk() as callback; in that case 0 is
+ * returned, or -ENOMEM if the fd could not be created. */
+int32_t
+ec_truncate_clean(ec_fop_data_t *fop)
+{
+    if (fop->fd != NULL) {
+        return ec_update_truncate_write(fop, fop->answer->mask);
+    }
+
+    fop->fd = fd_create(fop->loc[0].inode, fop->frame->root->pid);
+    if (fop->fd == NULL) {
+        return -ENOMEM;
+    }
+
+    ec_open(fop->frame, fop->xl, fop->answer->mask, fop->minimum,
+            ec_truncate_open_cbk, fop, &fop->loc[0], O_RDWR, fop->fd, NULL);
+
+    return 0;
+}
+
+/* Callback for the TRUNCATE request sent to a single subvolume. All the
+ * arguments are forwarded unchanged to ec_inode_write_cbk(), whose result is
+ * returned. */
+int32_t
+ec_truncate_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+                int32_t op_ret, int32_t op_errno, struct iatt *prestat,
+                struct iatt *poststat, dict_t *xdata)
+{
+    int32_t ret;
+
+    ret = ec_inode_write_cbk(frame, this, cookie, op_ret, op_errno, prestat,
+                             poststat, xdata);
+
+    return ret;
+}
+
+/* Send the TRUNCATE request described by 'fop' (fop->loc[0], fop->offset) to
+ * the subvolume stored at position 'idx' of ec->xl_list. 'idx' is passed as
+ * the cookie that ec_truncate_cbk() will receive. */
+void
+ec_wind_truncate(ec_t *ec, ec_fop_data_t *fop, int32_t idx)
+{
+    ec_trace("WIND", fop, "idx=%d", idx);
+
+    STACK_WIND_COOKIE(fop->frame, ec_truncate_cbk, (void *)(uintptr_t)idx,
+                      ec->xl_list[idx], ec->xl_list[idx]->fops->truncate,
+                      &fop->loc[0], fop->offset, fop->xdata);
+}
+
+/* State machine shared by the TRUNCATE and FTRUNCATE operations; fop->id
+ * tells them apart where their handling differs (lock target and callback).
+ *
+ * 'state' is the state to process; the returned value is the next state.
+ * Positive states form the normal path (INIT -> LOCK -> DISPATCH ->
+ * PREPARE_ANSWER -> REPORT -> LOCK_REUSE -> UNLOCK -> END). The negative
+ * counterparts of INIT..REPORT form the error path: they report fop->error
+ * to the caller and then continue with the same lock release sequence. */
+int32_t
+ec_manager_truncate(ec_fop_data_t *fop, int32_t state)
+{
+    ec_cbk_data_t *cbk;
+    off_t offset_down;
+
+    switch (state) {
+        case EC_STATE_INIT:
+            /* Keep the size requested by the user and adjust the offset sent
+             * to the bricks upwards. */
+            fop->user_size = fop->offset;
+            ec_adjust_offset_up(fop->xl->private, &fop->offset, _gf_true);
+            fop->frag_range.first = fop->offset;
+            fop->frag_range.last = UINT64_MAX;
+
+            /* Fall through */
+
+        case EC_STATE_LOCK:
+            /* The lock starts at the requested size adjusted downwards and
+             * extends over EC_RANGE_FULL. */
+            offset_down = fop->user_size;
+            ec_adjust_offset_down(fop->xl->private, &offset_down, _gf_true);
+
+            if (fop->id == GF_FOP_TRUNCATE) {
+                ec_lock_prepare_inode(
+                    fop, &fop->loc[0],
+                    EC_UPDATE_DATA | EC_UPDATE_META | EC_QUERY_INFO,
+                    offset_down, EC_RANGE_FULL);
+            } else {
+                ec_lock_prepare_fd(
+                    fop, fop->fd,
+                    EC_UPDATE_DATA | EC_UPDATE_META | EC_QUERY_INFO,
+                    offset_down, EC_RANGE_FULL);
+            }
+            ec_lock(fop);
+
+            return EC_STATE_DISPATCH;
+
+        case EC_STATE_DISPATCH:
+            ec_dispatch_all(fop);
+
+            return EC_STATE_PREPARE_ANSWER;
+
+        case EC_STATE_PREPARE_ANSWER:
+            cbk = ec_fop_prepare_answer(fop, _gf_false);
+            if (cbk != NULL) {
+                int32_t err;
+
+                ec_iatt_rebuild(fop->xl->private, cbk->iatt, 2, cbk->count);
+
+                /* This shouldn't fail because we have the inode locked. */
+                /* Inode size doesn't need to be updated under locks, because
+                 * conflicting operations won't be in-flight
+                 */
+                GF_ASSERT(ec_get_inode_size(fop, fop->locks[0].lock->loc.inode,
+                                            &cbk->iatt[0].ia_size));
+                /* The post-op size is the size requested by the user. */
+                cbk->iatt[1].ia_size = fop->user_size;
+                /* This shouldn't fail because we have the inode locked. */
+                GF_ASSERT(ec_set_inode_size(fop, fop->locks[0].lock->loc.inode,
+                                            fop->user_size));
+                /* When the file shrank and the requested size differs from
+                 * the adjusted offset sent to the bricks, an extra write
+                 * over the difference is triggered. */
+                if ((cbk->iatt[0].ia_size > cbk->iatt[1].ia_size) &&
+                    (fop->user_size != fop->offset)) {
+                    err = ec_truncate_clean(fop);
+                    if (err != 0) {
+                        ec_cbk_set_error(cbk, -err, _gf_false);
+                    }
+                }
+            }
+
+            return EC_STATE_REPORT;
+
+        case EC_STATE_REPORT:
+            /* Success path: deliver the combined answer to the caller. */
+            cbk = fop->answer;
+
+            GF_ASSERT(cbk != NULL);
+
+            if (fop->id == GF_FOP_TRUNCATE) {
+                if (fop->cbks.truncate != NULL) {
+                    QUORUM_CBK(fop->cbks.truncate, fop, fop->req_frame, fop,
+                               fop->xl, cbk->op_ret, cbk->op_errno,
+                               &cbk->iatt[0], &cbk->iatt[1], cbk->xdata);
+                }
+            } else {
+                if (fop->cbks.ftruncate != NULL) {
+                    QUORUM_CBK(fop->cbks.ftruncate, fop, fop->req_frame, fop,
+                               fop->xl, cbk->op_ret, cbk->op_errno,
+                               &cbk->iatt[0], &cbk->iatt[1], cbk->xdata);
+                }
+            }
+
+            return EC_STATE_LOCK_REUSE;
+
+        case -EC_STATE_INIT:
+        case -EC_STATE_LOCK:
+        case -EC_STATE_DISPATCH:
+        case -EC_STATE_PREPARE_ANSWER:
+        case -EC_STATE_REPORT:
+            /* Error path: report fop->error with op_ret -1 and no data. */
+            GF_ASSERT(fop->error != 0);
+
+            if (fop->id == GF_FOP_TRUNCATE) {
+                if (fop->cbks.truncate != NULL) {
+                    fop->cbks.truncate(fop->req_frame, fop, fop->xl, -1,
+                                       fop->error, NULL, NULL, NULL);
+                }
+            } else {
+                if (fop->cbks.ftruncate != NULL) {
+                    fop->cbks.ftruncate(fop->req_frame, fop, fop->xl, -1,
+                                        fop->error, NULL, NULL, NULL);
+                }
+            }
+
+            return EC_STATE_LOCK_REUSE;
+
+        case -EC_STATE_LOCK_REUSE:
+        case EC_STATE_LOCK_REUSE:
+            ec_lock_reuse(fop);
+
+            return EC_STATE_UNLOCK;
+
+        case -EC_STATE_UNLOCK:
+        case EC_STATE_UNLOCK:
+            ec_unlock(fop);
+
+            return EC_STATE_END;
+
+        default:
+            gf_msg(fop->xl->name, GF_LOG_ERROR, EINVAL, EC_MSG_UNHANDLED_STATE,
+                   "Unhandled state %d for %s", state, ec_fop_name(fop->id));
+
+            return EC_STATE_END;
+    }
+}
+
+/* Start a TRUNCATE operation on the object described by 'loc'.
+ *
+ * A fop descriptor is allocated and filled with 'offset' plus copies of
+ * 'loc' and 'xdata' (each one only when it is not NULL). The descriptor is
+ * then handed to ec_manager() together with 0 when everything was prepared,
+ * or ENOMEM when one of the copies failed. If no descriptor could be
+ * obtained, 'func' is called directly with op_ret -1 and op_errno ENOMEM. */
+void
+ec_truncate(call_frame_t *frame, xlator_t *this, uintptr_t target,
+            uint32_t fop_flags, fop_truncate_cbk_t func, void *data, loc_t *loc,
+            off_t offset, dict_t *xdata)
+{
+    ec_fop_data_t *op = NULL;
+    ec_cbk_t cbks = {.truncate = func};
+    int32_t err = ENOMEM;
+
+    gf_msg_trace("ec", 0, "EC(TRUNCATE) %p", frame);
+
+    VALIDATE_OR_GOTO(this, out);
+    GF_VALIDATE_OR_GOTO(this->name, frame, out);
+    GF_VALIDATE_OR_GOTO(this->name, this->private, out);
+
+    op = ec_fop_data_allocate(frame, this, GF_FOP_TRUNCATE, 0, target,
+                              fop_flags, ec_wind_truncate, ec_manager_truncate,
+                              cbks, data);
+    if (op == NULL) {
+        goto out;
+    }
+
+    op->offset = offset;
+
+    /* Each optional argument is copied only when the caller supplied it. */
+    if ((loc != NULL) && (loc_copy(&op->loc[0], loc) != 0)) {
+        gf_msg(this->name, GF_LOG_ERROR, ENOMEM, EC_MSG_LOC_COPY_FAIL,
+               "Failed to copy a location.");
+
+        goto out;
+    }
+    if (xdata != NULL) {
+        op->xdata = dict_copy_with_ref(xdata, NULL);
+        if (!op->xdata) {
+            gf_msg(this->name, GF_LOG_ERROR, 0, EC_MSG_DICT_REF_FAIL,
+                   "Failed to reference a "
+                   "dictionary.");
+
+            goto out;
+        }
+    }
+
+    err = 0;
+
+out:
+    if (op == NULL) {
+        /* Nothing was allocated: report the failure straight to the caller. */
+        func(frame, NULL, this, -1, err, NULL, NULL, NULL);
+        return;
+    }
+
+    ec_manager(op, err);
+}
+
+/* FOP: ftruncate */
+
+/* Callback for the FTRUNCATE request sent to a single subvolume. All the
+ * arguments are forwarded unchanged to ec_inode_write_cbk(), whose result is
+ * returned. */
+int32_t
+ec_ftruncate_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+                 int32_t op_ret, int32_t op_errno, struct iatt *prestat,
+                 struct iatt *poststat, dict_t *xdata)
+{
+    int32_t ret;
+
+    ret = ec_inode_write_cbk(frame, this, cookie, op_ret, op_errno, prestat,
+                             poststat, xdata);
+
+    return ret;
+}
+
+/* Send the FTRUNCATE request described by 'fop' (fop->fd, fop->offset) to
+ * the subvolume stored at position 'idx' of ec->xl_list. 'idx' is passed as
+ * the cookie that ec_ftruncate_cbk() will receive. */
+void
+ec_wind_ftruncate(ec_t *ec, ec_fop_data_t *fop, int32_t idx)
+{
+    ec_trace("WIND", fop, "idx=%d", idx);
+
+    STACK_WIND_COOKIE(fop->frame, ec_ftruncate_cbk, (void *)(uintptr_t)idx,
+                      ec->xl_list[idx], ec->xl_list[idx]->fops->ftruncate,
+                      fop->fd, fop->offset, fop->xdata);
+}
+
+/* Start an FTRUNCATE operation on the open file 'fd'.
+ *
+ * A fop descriptor is allocated (it uses ec_manager_truncate() as its state
+ * machine), marked as fd based and filled with 'offset', a reference to 'fd'
+ * and a copy of 'xdata' (each one only when it is not NULL). The descriptor
+ * is then handed to ec_manager() together with 0 when everything was
+ * prepared, or ENOMEM otherwise. If no descriptor could be obtained, 'func'
+ * is called directly with op_ret -1 and op_errno ENOMEM. */
+void
+ec_ftruncate(call_frame_t *frame, xlator_t *this, uintptr_t target,
+             uint32_t fop_flags, fop_ftruncate_cbk_t func, void *data, fd_t *fd,
+             off_t offset, dict_t *xdata)
+{
+    ec_fop_data_t *op = NULL;
+    ec_cbk_t cbks = {.ftruncate = func};
+    int32_t err = ENOMEM;
+
+    gf_msg_trace("ec", 0, "EC(FTRUNCATE) %p", frame);
+
+    VALIDATE_OR_GOTO(this, out);
+    GF_VALIDATE_OR_GOTO(this->name, frame, out);
+    GF_VALIDATE_OR_GOTO(this->name, this->private, out);
+
+    op = ec_fop_data_allocate(frame, this, GF_FOP_FTRUNCATE, 0, target,
+                              fop_flags, ec_wind_ftruncate,
+                              ec_manager_truncate, cbks, data);
+    if (op == NULL) {
+        goto out;
+    }
+
+    op->use_fd = 1;
+    op->offset = offset;
+
+    /* Each optional argument is referenced/copied only when supplied. */
+    if (fd != NULL) {
+        op->fd = fd_ref(fd);
+        if (!op->fd) {
+            gf_msg(this->name, GF_LOG_ERROR, 0, EC_MSG_FILE_DESC_REF_FAIL,
+                   "Failed to reference a "
+                   "file descriptor.");
+
+            goto out;
+        }
+    }
+    if (xdata != NULL) {
+        op->xdata = dict_copy_with_ref(xdata, NULL);
+        if (!op->xdata) {
+            gf_msg(this->name, GF_LOG_ERROR, 0, EC_MSG_DICT_REF_FAIL,
+                   "Failed to reference a "
+                   "dictionary.");
+
+            goto out;
+        }
+    }
+
+    err = 0;
+
+out:
+    if (op == NULL) {
+        /* Nothing was allocated: report the failure straight to the caller. */
+        func(frame, NULL, this, -1, err, NULL, NULL, NULL);
+        return;
+    }
+
+    ec_manager(op, err);
+}
+
+/* FOP: writev */
+/* Obtain a stripe entry from 'stripe_cache'.
+ *
+ * While the cache holds fewer than 'max' entries a new one is allocated
+ * (with room for ec->stripe_size bytes of data) and appended to the LRU
+ * list. Once the limit is reached, the first entry of the LRU list is
+ * recycled by moving it to the tail. The matching statistics counter
+ * (allocs, errors or evicts) is incremented. Returns the entry, or NULL when
+ * the allocation failed. */
+static ec_stripe_t *
+ec_allocate_stripe(ec_t *ec, ec_stripe_list_t *stripe_cache)
+{
+    ec_stripe_t *entry = NULL;
+
+    if (stripe_cache->count < stripe_cache->max) {
+        entry = GF_MALLOC(sizeof(ec_stripe_t) + ec->stripe_size,
+                          ec_mt_ec_stripe_t);
+        if (entry == NULL) {
+            GF_ATOMIC_INC(ec->stats.stripe_cache.errors);
+
+            return NULL;
+        }
+
+        stripe_cache->count++;
+        list_add_tail(&entry->lru, &stripe_cache->lru);
+        GF_ATOMIC_INC(ec->stats.stripe_cache.allocs);
+
+        return entry;
+    }
+
+    /* The cache is full: reuse the entry at the head of the LRU list. */
+    GF_ASSERT(!list_empty(&stripe_cache->lru));
+    entry = list_first_entry(&stripe_cache->lru, ec_stripe_t, lru);
+    list_move_tail(&entry->lru, &stripe_cache->lru);
+    GF_ATOMIC_INC(ec->stats.stripe_cache.evicts);
+
+    return entry;
+}
+
+/* Fill 'stripe' with the last ec->stripe_size bytes of the write buffer
+ * (fop->vector[0]) and record the fragment offset they correspond to, which
+ * is one fragment size below fop->frag_range.last. */
+static void
+ec_write_stripe_data(ec_t *ec, ec_fop_data_t *fop, ec_stripe_t *stripe)
+{
+    off_t last_stripe = fop->size - ec->stripe_size;
+
+    stripe->frag_offset = fop->frag_range.last - ec->fragment_size;
+    memcpy(stripe->data, fop->vector[0].iov_base + last_stripe,
+           ec->stripe_size);
+}
+
+/* Store the last stripe written by 'fop' in the stripe cache of its inode.
+ *
+ * The work is done with the inode lock held. Nothing is stored when the
+ * cache limit of the inode is not positive. A debug message is logged when
+ * the inode context cannot be obtained or no stripe entry is available. */
+static void
+ec_add_stripe_in_cache(ec_t *ec, ec_fop_data_t *fop)
+{
+    ec_stripe_list_t *cache = NULL;
+    ec_stripe_t *entry = NULL;
+    ec_inode_t *ctx = NULL;
+    gf_boolean_t done = _gf_false;
+
+    LOCK(&fop->fd->inode->lock);
+    {
+        ctx = __ec_inode_get(fop->fd->inode, fop->xl);
+        if (ctx != NULL) {
+            cache = &ctx->stripe_cache;
+            if (cache->max > 0) {
+                entry = ec_allocate_stripe(ec, cache);
+                if (entry != NULL) {
+                    ec_write_stripe_data(ec, fop, entry);
+                    done = _gf_true;
+                }
+            } else {
+                /* No entries allowed for this inode: not a failure. */
+                done = _gf_true;
+            }
+        }
+    }
+    UNLOCK(&fop->fd->inode->lock);
+
+    if (!done) {
+        gf_msg(ec->xl->name, GF_LOG_DEBUG, ENOMEM, EC_MSG_FILE_DESC_REF_FAIL,
+               "Failed to create and add stripe in cache");
+    }
+}
+
+/* Read callback that completes the tail of the write buffer.
+ *
+ * The tail is the part of fop->vector[0] located after the head and the
+ * user data (fop->size - fop->user_size - fop->head bytes). On a successful
+ * read, the bytes of the read stripe that fall in that region are copied
+ * into the buffer and whatever the read did not provide is zero filled.
+ * When the stripe cache is enabled the resulting last stripe is also
+ * cached. Nothing is done when the read failed. Always returns 0. */
+int32_t
+ec_writev_merge_tail(call_frame_t *frame, void *cookie, xlator_t *this,
+                     int32_t op_ret, int32_t op_errno, struct iovec *vector,
+                     int32_t count, struct iatt *stbuf, struct iobref *iobref,
+                     dict_t *xdata)
+{
+    ec_t *ec = this->private;
+    ec_fop_data_t *fop = frame->local;
+    uint64_t pending;
+    uint64_t start;
+    uint64_t copied;
+
+    if (op_ret < 0) {
+        return 0;
+    }
+
+    /* 'pending' bytes are still missing at the end of the buffer; they begin
+     * at position 'start' inside the stripe that was read. */
+    pending = fop->size - fop->user_size - fop->head;
+    start = ec->stripe_size - pending;
+    if (op_ret > start) {
+        copied = min(op_ret - start, pending);
+        ec_iov_copy_to(fop->vector[0].iov_base + fop->size - pending, vector,
+                       count, start, copied);
+
+        pending -= copied;
+    }
+
+    if (pending > 0) {
+        memset(fop->vector[0].iov_base + fop->size - pending, 0, pending);
+    }
+
+    if (ec->stripe_cache) {
+        ec_add_stripe_in_cache(ec, fop);
+    }
+
+    return 0;
+}
+
+/* Read callback that completes the head of the write buffer.
+ *
+ * The head is the first fop->head bytes of fop->vector[0]. On a successful
+ * read, as many of those bytes as the read returned are copied from
+ * 'vector' and the rest of the head is zero filled. If the buffer also has
+ * a tail and the whole write fits in a single stripe, the same read data is
+ * used to complete the tail through ec_writev_merge_tail(). Nothing is done
+ * when the read failed. Always returns 0. */
+int32_t
+ec_writev_merge_head(call_frame_t *frame, void *cookie, xlator_t *this,
+                     int32_t op_ret, int32_t op_errno, struct iovec *vector,
+                     int32_t count, struct iatt *stbuf, struct iobref *iobref,
+                     dict_t *xdata)
+{
+    ec_t *ec = this->private;
+    ec_fop_data_t *fop = frame->local;
+    uint64_t remaining;
+    uint64_t filled;
+
+    if (op_ret < 0) {
+        return 0;
+    }
+
+    remaining = fop->head;
+    filled = 0;
+
+    if (op_ret > 0) {
+        filled = min(op_ret, remaining);
+        ec_iov_copy_to(fop->vector[0].iov_base, vector, count, 0, filled);
+
+        remaining -= filled;
+    }
+
+    if (remaining > 0) {
+        memset(fop->vector[0].iov_base + filled, 0, remaining);
+    }
+
+    /* Size of the tail region, if any. */
+    remaining = fop->size - fop->user_size - fop->head;
+    if ((remaining > 0) && (fop->size == ec->stripe_size)) {
+        ec_writev_merge_tail(frame, cookie, this, op_ret, op_errno, vector,
+                             count, stbuf, iobref, xdata);
+    }
+
+    return 0;
+}
+
+/*
+ * Make sure '*xdata' refers to a dictionary for an internal fop.
+ *
+ * An existing dictionary is left untouched. Otherwise a new one carrying
+ * GLUSTERFS_INTERNAL_FOP_KEY = "yes" is created and stored in '*xdata'.
+ *
+ * Returns 0 on success and -1 if the dictionary could not be built.
+ */
+static int
+ec_make_internal_fop_xdata(dict_t **xdata)
+{
+    dict_t *internal;
+
+    if (*xdata != NULL) {
+        return 0;
+    }
+
+    internal = dict_new();
+    if (internal == NULL) {
+        return -1;
+    }
+
+    if (dict_set_str(internal, GLUSTERFS_INTERNAL_FOP_KEY, "yes") != 0) {
+        dict_unref(internal);
+        return -1;
+    }
+
+    *xdata = internal;
+
+    return 0;
+}
+
+/*
+ * Prepare the data and fragment buffers needed to encode a write.
+ *
+ * Fills fop->user_size, fop->head, fop->size and fop->frag_range, makes
+ * fop->vector[0] a single buffer covering the whole (padded) write with the
+ * user data placed at offset fop->head, and allocates in fop->vector[1] room
+ * for ec->nodes fragments of fop->size / ec->fragments bytes each.
+ *
+ * Returns 0 on success, the error reported by ec_buffer_alloc(), or -ENOMEM
+ * if the two-entry vector array cannot be allocated.
+ */
+static int32_t
+ec_writev_prepare_buffers(ec_t *ec, ec_fop_data_t *fop)
+{
+    struct iobref *iobref = NULL;
+    struct iovec *iov;
+    void *ptr;
+    int32_t err;
+
+    fop->user_size = iov_length(fop->vector, fop->int32);
+    /* ec_adjust_offset_down() may modify fop->offset; its result is kept as
+     * the head size. NOTE(review): presumably the distance from the aligned
+     * offset to the user offset - confirm against ec_adjust_offset_down(). */
+    fop->head = ec_adjust_offset_down(ec, &fop->offset, _gf_false);
+    fop->frag_range.first = fop->offset / ec->fragments;
+    fop->size = fop->user_size + fop->head;
+    ec_adjust_size_up(ec, &fop->size, _gf_false);
+    fop->frag_range.last = fop->frag_range.first + fop->size / ec->fragments;
+
+    /* The user's buffer is used directly only when it is a single vector
+     * with no head, no extra size and a suitably aligned base address.
+     * Otherwise the data is copied into a private buffer of fop->size. */
+    if ((fop->int32 != 1) || (fop->head != 0) || (fop->size > fop->user_size) ||
+        !EC_ALIGN_CHECK(fop->vector[0].iov_base, EC_METHOD_WORD_SIZE)) {
+        err = ec_buffer_alloc(ec->xl, fop->size, &iobref, &ptr);
+        if (err != 0) {
+            goto out;
+        }
+
+        /* User data goes after the head; head and tail are filled later. */
+        ec_iov_copy_to(ptr + fop->head, fop->vector, fop->int32, 0,
+                       fop->user_size);
+
+        fop->vector[0].iov_base = ptr;
+        fop->vector[0].iov_len = fop->size;
+
+        /* The private buffer's iobref replaces the one held so far. */
+        iobref_unref(fop->buffers);
+        fop->buffers = iobref;
+    }
+
+    /* vector[1] is filled below, so the array needs two entries. Only
+     * entry 0 is carried over to the new array. */
+    if (fop->int32 != 2) {
+        iov = GF_MALLOC(VECTORSIZE(2), gf_common_mt_iovec);
+        if (iov == NULL) {
+            err = -ENOMEM;
+
+            goto out;
+        }
+        iov[0].iov_base = fop->vector[0].iov_base;
+        iov[0].iov_len = fop->vector[0].iov_len;
+
+        GF_FREE(fop->vector);
+        fop->vector = iov;
+    }
+
+    /* One fragment per node, each fop->size / ec->fragments bytes long.
+     * NOTE(review): fop->buffers is passed in non-NULL here; presumably
+     * ec_buffer_alloc() adds the new buffer to it - confirm. */
+    fop->vector[1].iov_len = fop->size / ec->fragments;
+    err = ec_buffer_alloc(ec->xl, fop->vector[1].iov_len * ec->nodes,
+                          &fop->buffers, &fop->vector[1].iov_base);
+    if (err != 0) {
+        goto out;
+    }
+
+    err = 0;
+
+out:
+    return err;
+}
+
+/*
+ * Fill the parts of a single-stripe write buffer that the user data does
+ * not cover, taking them from a cached stripe.
+ *
+ * The first fop->head bytes are always copied. If the user data ends before
+ * the end of the stripe, the bytes after it are copied as well.
+ */
+static void
+ec_merge_stripe_head_locked(ec_t *ec, ec_fop_data_t *fop, ec_stripe_t *stripe)
+{
+    uint32_t prefix, rest;
+
+    prefix = fop->head;
+    memcpy(fop->vector[0].iov_base, stripe->data, prefix);
+
+    rest = ec->stripe_size - prefix;
+    if (rest <= fop->user_size) {
+        /* The user data reaches the end of the stripe. */
+        return;
+    }
+
+    /* Skip the user data and copy what follows it. */
+    prefix += fop->user_size;
+    rest = ec->stripe_size - prefix;
+    memcpy(fop->vector[0].iov_base + prefix, stripe->data + prefix, rest);
+}
+
+/*
+ * Fill the bytes that follow the user data in the write buffer with the
+ * last bytes of a cached stripe.
+ */
+static void
+ec_merge_stripe_tail_locked(ec_t *ec, ec_fop_data_t *fop, ec_stripe_t *stripe)
+{
+    off_t data_end;
+    uint32_t tail_len, stripe_off;
+
+    /* End of the user data inside fop->vector[0]. */
+    data_end = fop->user_size + fop->head;
+    tail_len = fop->size - data_end;
+    /* The tail occupies the last tail_len bytes of the stripe. */
+    stripe_off = ec->stripe_size - tail_len;
+
+    memcpy(fop->vector[0].iov_base + data_end, stripe->data + stripe_off,
+           tail_len);
+}
+
+/*
+ * Look up the cached stripe whose frag_offset equals 'frag_offset' in the
+ * stripe cache of the fop's inode.
+ *
+ * A matching stripe is moved to the tail of the LRU list and returned.
+ * NULL is returned when the inode context is not available or no stripe
+ * matches. The errors/hits/misses counters of ec->stats.stripe_cache are
+ * updated accordingly.
+ */
+static ec_stripe_t *
+ec_get_stripe_from_cache_locked(ec_t *ec, ec_fop_data_t *fop,
+                                uint64_t frag_offset)
+{
+    ec_inode_t *ictx;
+    ec_stripe_list_t *cache;
+    ec_stripe_t *cur = NULL;
+    ec_stripe_t *match = NULL;
+
+    ictx = __ec_inode_get(fop->fd->inode, fop->xl);
+    if (ictx == NULL) {
+        GF_ATOMIC_INC(ec->stats.stripe_cache.errors);
+        return NULL;
+    }
+
+    cache = &ictx->stripe_cache;
+    list_for_each_entry(cur, &cache->lru, lru)
+    {
+        if (cur->frag_offset == frag_offset) {
+            match = cur;
+            break;
+        }
+    }
+
+    if (match == NULL) {
+        GF_ATOMIC_INC(ec->stats.stripe_cache.misses);
+
+        return NULL;
+    }
+
+    list_move_tail(&match->lru, &cache->lru);
+    GF_ATOMIC_INC(ec->stats.stripe_cache.hits);
+
+    return match;
+}
+
+/*
+ * Try to complete the head or the tail of a write from the stripe cache.
+ *
+ * The lookup and the merge are done with the inode lock held. Returns
+ * _gf_true when a cached stripe was found and merged, _gf_false when the
+ * stripe cache is disabled or no stripe was found.
+ */
+static gf_boolean_t
+ec_get_and_merge_stripe(ec_t *ec, ec_fop_data_t *fop, ec_stripe_part_t which)
+{
+    ec_stripe_t *cached = NULL;
+    gf_boolean_t merged = _gf_false;
+
+    if (!ec->stripe_cache) {
+        return _gf_false;
+    }
+
+    LOCK(&fop->fd->inode->lock);
+    {
+        if (which == EC_STRIPE_HEAD) {
+            cached = ec_get_stripe_from_cache_locked(ec, fop,
+                                                     fop->frag_range.first);
+            if (cached != NULL) {
+                ec_merge_stripe_head_locked(ec, fop, cached);
+                merged = _gf_true;
+            }
+        } else if (which == EC_STRIPE_TAIL) {
+            /* The tail belongs to the last stripe covered by the write. */
+            cached = ec_get_stripe_from_cache_locked(
+                ec, fop, fop->frag_range.last - ec->fragment_size);
+            if (cached != NULL) {
+                ec_merge_stripe_tail_locked(ec, fop, cached);
+                merged = _gf_true;
+            }
+        }
+    }
+    UNLOCK(&fop->fd->inode->lock);
+
+    return merged;
+}
+
+/*
+ * Return the good_mask of the lock attached to the inode context, or 0 when
+ * the inode has no context or no lock.
+ *
+ * Only the lookup of the lock pointer is done under the inode lock; the
+ * mask itself is read after releasing it.
+ */
+static uintptr_t
+ec_get_lock_good_mask(inode_t *inode, xlator_t *xl)
+{
+    ec_inode_t *ictx;
+    ec_lock_t *held = NULL;
+
+    LOCK(&inode->lock);
+    {
+        ictx = __ec_inode_get(inode, xl);
+        if (ictx != NULL) {
+            held = ictx->inode_lock;
+        }
+    }
+    UNLOCK(&inode->lock);
+
+    return (held != NULL) ? held->good_mask : 0;
+}
+
+/*
+ * Start a write: prepare the buffers and obtain the contents of the
+ * partially overwritten stripes.
+ *
+ * For an unaligned head and/or tail that overlaps existing file data, the
+ * missing bytes are taken from the stripe cache or, failing that, requested
+ * with ec_readv() (completed by ec_writev_merge_head() or
+ * ec_writev_merge_tail()). Regions at or beyond the current file size are
+ * zero-filled instead.
+ *
+ * The outcome is reported through ec_fop_set_error(): 0 on success or a
+ * positive errno on failure.
+ */
+void
+ec_writev_start(ec_fop_data_t *fop)
+{
+    ec_t *ec = fop->xl->private;
+    ec_fd_t *ctx;
+    fd_t *fd;
+    dict_t *xdata = NULL;
+    uint64_t tail, current;
+    int32_t err = -ENOMEM;
+    gf_boolean_t found_stripe = _gf_false;
+
+    /* This shouldn't fail because we have the inode locked. */
+    GF_ASSERT(ec_get_inode_size(fop, fop->fd->inode, &current));
+
+    /* The partial reads are issued on an anonymous fd of the same inode. */
+    fd = fd_anonymous(fop->fd->inode);
+    if (fd == NULL) {
+        goto failed;
+    }
+
+    /* Internal reads run as uid/gid 0; ec_manager_writev() restores the
+     * original values from fop->uid and fop->gid. */
+    fop->frame->root->uid = 0;
+    fop->frame->root->gid = 0;
+
+    ctx = ec_fd_get(fop->fd, fop->xl);
+    if (ctx != NULL) {
+        if ((ctx->flags & O_APPEND) != 0) {
+            /* Appending writes take full locks so size won't change because
+             * of any parallel operations
+             */
+            fop->offset = current;
+        }
+    }
+
+    err = ec_writev_prepare_buffers(ec, fop);
+    if (err != 0) {
+        goto failed_fd;
+    }
+    /* Bytes of the write buffer that follow the user data. */
+    tail = fop->size - fop->user_size - fop->head;
+    if (fop->head > 0) {
+        if (current > fop->offset) {
+            /* The first stripe holds existing data: use the cached stripe
+             * or read it. */
+            found_stripe = ec_get_and_merge_stripe(ec, fop, EC_STRIPE_HEAD);
+            if (!found_stripe) {
+                if (ec_make_internal_fop_xdata(&xdata)) {
+                    err = -ENOMEM;
+                    goto failed_xdata;
+                }
+                ec_readv(fop->frame, fop->xl,
+                         ec_get_lock_good_mask(fop->fd->inode, fop->xl),
+                         EC_MINIMUM_MIN, ec_writev_merge_head, NULL, fd,
+                         ec->stripe_size, fop->offset, 0, xdata);
+            }
+        } else {
+            /* The write starts at or past the end of the file: there is
+             * nothing to read, so head and tail are zero-filled. */
+            memset(fop->vector[0].iov_base, 0, fop->head);
+            memset(fop->vector[0].iov_base + fop->size - tail, 0, tail);
+            if (ec->stripe_cache && (fop->size <= ec->stripe_size)) {
+                ec_add_stripe_in_cache(ec, fop);
+            }
+        }
+    }
+
+    /* The tail is handled here unless the write has a head and fits in a
+     * single stripe; that case is covered by the head handling above. */
+    if ((tail > 0) && ((fop->head == 0) || (fop->size > ec->stripe_size))) {
+        /* Current locking scheme will make sure the 'current' below will
+         * never decrease while the fop is in progress, so the checks will
+         * work as expected
+         */
+        if (current > fop->offset + fop->head + fop->user_size) {
+            found_stripe = ec_get_and_merge_stripe(ec, fop, EC_STRIPE_TAIL);
+            if (!found_stripe) {
+                if (ec_make_internal_fop_xdata(&xdata)) {
+                    err = -ENOMEM;
+                    goto failed_xdata;
+                }
+                /* Read the last stripe covered by the write. */
+                ec_readv(fop->frame, fop->xl,
+                         ec_get_lock_good_mask(fop->fd->inode, fop->xl),
+                         EC_MINIMUM_MIN, ec_writev_merge_tail, NULL, fd,
+                         ec->stripe_size,
+                         fop->offset + fop->size - ec->stripe_size, 0, xdata);
+            }
+        } else {
+            memset(fop->vector[0].iov_base + fop->size - tail, 0, tail);
+            if (ec->stripe_cache) {
+                ec_add_stripe_in_cache(ec, fop);
+            }
+        }
+    }
+
+    err = 0;
+
+    /* The success path also falls through the cleanup labels below, with
+     * err == 0. */
+failed_xdata:
+    if (xdata) {
+        dict_unref(xdata);
+    }
+failed_fd:
+    fd_unref(fd);
+failed:
+    /* err holds 0 or a negative errno; the fop error is stored positive. */
+    ec_fop_set_error(fop, -err);
+}
+
+/*
+ * Callback for the writev sent to a single subvolume.
+ *
+ * A positive result that is not a multiple of ec->fragment_size is turned
+ * into a failure with EIO before the answer is passed on to
+ * ec_inode_write_cbk().
+ */
+int32_t
+ec_writev_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret,
+              int32_t op_errno, struct iatt *prestat, struct iatt *poststat,
+              dict_t *xdata)
+{
+    ec_t *ec;
+
+    if ((this != NULL) && (this->private != NULL) && (op_ret > 0)) {
+        ec = this->private;
+        if ((op_ret % ec->fragment_size) != 0) {
+            op_ret = -1;
+            op_errno = EIO;
+        }
+    }
+
+    return ec_inode_write_cbk(frame, this, cookie, op_ret, op_errno, prestat,
+                              poststat, xdata);
+}
+
+/*
+ * Send to subvolume 'idx' its fragment of the encoded data.
+ *
+ * fop->vector[1] holds the fragments back to back, each iov_len bytes long;
+ * the write is wound at offset fop->offset / ec->fragments.
+ */
+void
+ec_wind_writev(ec_t *ec, ec_fop_data_t *fop, int32_t idx)
+{
+    size_t frag_len = fop->vector[1].iov_len;
+    struct iovec frag = {
+        .iov_base = fop->vector[1].iov_base + idx * frag_len,
+        .iov_len = frag_len,
+    };
+
+    ec_trace("WIND", fop, "idx=%d", idx);
+
+    STACK_WIND_COOKIE(fop->frame, ec_writev_cbk, (void *)(uintptr_t)idx,
+                      ec->xl_list[idx], ec->xl_list[idx]->fops->writev, fop->fd,
+                      &frag, 1, fop->offset / ec->fragments, fop->uint32,
+                      fop->buffers, fop->xdata);
+}
+
+/*
+ * Encode the data in fop->vector[0] into ec->nodes fragments stored back to
+ * back in fop->vector[1], each fop->vector[1].iov_len bytes long.
+ */
+static void
+ec_writev_encode(ec_fop_data_t *fop)
+{
+    ec_t *ec = fop->xl->private;
+    void *blocks[ec->nodes];
+    void *cursor;
+    uint32_t i;
+
+    /* Fragment 0 starts at the base of the buffer. */
+    cursor = fop->vector[1].iov_base;
+    blocks[0] = cursor;
+    for (i = 1; i < ec->nodes; i++) {
+        cursor += fop->vector[1].iov_len;
+        blocks[i] = cursor;
+    }
+
+    ec_method_encode(&ec->matrix, fop->vector[0].iov_len,
+                     fop->vector[0].iov_base, blocks);
+}
+
+/*
+ * State machine of the writev fop.
+ *
+ * 'state' is the state to process; negative values are the failure
+ * variants of the corresponding states. Returns the next state.
+ */
+int32_t
+ec_manager_writev(ec_fop_data_t *fop, int32_t state)
+{
+    ec_cbk_data_t *cbk;
+    ec_fd_t *ctx = NULL;
+    ec_t *ec = fop->xl->private;
+    off_t fl_start = 0;
+    uint64_t fl_size = LONG_MAX;
+
+    switch (state) {
+        case EC_STATE_INIT:
+        case EC_STATE_LOCK:
+            /* Writes without O_APPEND lock only the range they touch,
+             * computed from the write offset and length. Otherwise the
+             * defaults (start 0, size LONG_MAX) are used. */
+            ctx = ec_fd_get(fop->fd, fop->xl);
+            if (ctx != NULL) {
+                if ((ctx->flags & O_APPEND) == 0) {
+                    off_t user_size = 0;
+                    off_t head = 0;
+
+                    fl_start = fop->offset;
+                    user_size = iov_length(fop->vector, fop->int32);
+                    head = ec_adjust_offset_down(ec, &fl_start, _gf_true);
+                    fl_size = user_size + head;
+                    ec_adjust_size_up(ec, &fl_size, _gf_true);
+                }
+            }
+            ec_lock_prepare_fd(fop, fop->fd,
+                               EC_UPDATE_DATA | EC_UPDATE_META | EC_QUERY_INFO,
+                               fl_start, fl_size);
+            ec_lock(fop);
+
+            return EC_STATE_DISPATCH;
+
+        case EC_STATE_DISPATCH:
+            ec_writev_start(fop);
+
+            return EC_STATE_DELAYED_START;
+
+        case EC_STATE_DELAYED_START:
+            /* Restore uid, gid if they were changed to do some partial
+             * reads. */
+            fop->frame->root->uid = fop->uid;
+            fop->frame->root->gid = fop->gid;
+
+            ec_writev_encode(fop);
+
+            ec_dispatch_all(fop);
+
+            return EC_STATE_PREPARE_ANSWER;
+
+        case EC_STATE_PREPARE_ANSWER:
+            cbk = ec_fop_prepare_answer(fop, _gf_false);
+            if (cbk != NULL) {
+                uint64_t size;
+
+                ec_iatt_rebuild(ec, cbk->iatt, 2, cbk->count);
+
+                /* This shouldn't fail because we have the inode locked. */
+                LOCK(&fop->fd->inode->lock);
+                {
+                    GF_ASSERT(__ec_get_inode_size(fop, fop->fd->inode,
+                                                  &cbk->iatt[0].ia_size));
+                    cbk->iatt[1].ia_size = cbk->iatt[0].ia_size;
+                    /* End offset of the user data that was written. */
+                    size = fop->offset + fop->head + fop->user_size;
+                    if (size > cbk->iatt[0].ia_size) {
+                        /* Only update inode size if this is a top level fop.
+                         * Otherwise this is an internal write and the top
+                         * level fop should take care of the real inode size.
+                         */
+                        if (fop->parent == NULL) {
+                            /* This shouldn't fail because we have the inode
+                             * locked. */
+                            GF_ASSERT(
+                                __ec_set_inode_size(fop, fop->fd->inode, size));
+                        }
+                        cbk->iatt[1].ia_size = size;
+                    }
+                }
+                UNLOCK(&fop->fd->inode->lock);
+
+                if (fop->error == 0) {
+                    /* Convert the per-fragment result into user bytes:
+                     * scale it up, drop the head and cap it at the number
+                     * of bytes the user asked to write. */
+                    cbk->op_ret *= ec->fragments;
+                    if (cbk->op_ret < fop->head) {
+                        cbk->op_ret = 0;
+                    } else {
+                        cbk->op_ret -= fop->head;
+                    }
+                    if (cbk->op_ret > fop->user_size) {
+                        cbk->op_ret = fop->user_size;
+                    }
+                }
+            }
+
+            return EC_STATE_REPORT;
+
+        case EC_STATE_REPORT:
+            cbk = fop->answer;
+
+            GF_ASSERT(cbk != NULL);
+
+            if (fop->cbks.writev != NULL) {
+                QUORUM_CBK(fop->cbks.writev, fop, fop->req_frame, fop, fop->xl,
+                           cbk->op_ret, cbk->op_errno, &cbk->iatt[0],
+                           &cbk->iatt[1], cbk->xdata);
+            }
+
+            return EC_STATE_LOCK_REUSE;
+
+        case -EC_STATE_DELAYED_START:
+            /* We have failed while doing partial reads. We need to restore
+             * original uid, gid. */
+            fop->frame->root->uid = fop->uid;
+            fop->frame->root->gid = fop->gid;
+
+            /* Fall through */
+
+        case -EC_STATE_INIT:
+        case -EC_STATE_LOCK:
+        case -EC_STATE_DISPATCH:
+        case -EC_STATE_PREPARE_ANSWER:
+        case -EC_STATE_REPORT:
+            GF_ASSERT(fop->error != 0);
+
+            if (fop->cbks.writev != NULL) {
+                fop->cbks.writev(fop->req_frame, fop, fop->xl, -1, fop->error,
+                                 NULL, NULL, NULL);
+            }
+
+            return EC_STATE_LOCK_REUSE;
+
+        case -EC_STATE_LOCK_REUSE:
+        case EC_STATE_LOCK_REUSE:
+            ec_lock_reuse(fop);
+
+            return EC_STATE_UNLOCK;
+
+        case -EC_STATE_UNLOCK:
+        case EC_STATE_UNLOCK:
+            ec_unlock(fop);
+
+            return EC_STATE_END;
+
+        default:
+            gf_msg(fop->xl->name, GF_LOG_ERROR, EINVAL, EC_MSG_UNHANDLED_STATE,
+                   "Unhandled state %d for %s", state, ec_fop_name(fop->id));
+
+            return EC_STATE_END;
+    }
+}
+
+/*
+ * Entry point of the writev fop.
+ *
+ * Allocates the fop, takes its own references/copies of the fd, the vector
+ * list, the iobref and the xdata, and hands the fop to ec_manager(). If any
+ * of that fails after the fop exists, ec_manager() is started with the
+ * error. If the fop itself could not be created, 'func' (when provided) is
+ * called directly with op_ret -1.
+ */
+void
+ec_writev(call_frame_t *frame, xlator_t *this, uintptr_t target,
+          uint32_t fop_flags, fop_writev_cbk_t func, void *data, fd_t *fd,
+          struct iovec *vector, int32_t count, off_t offset, uint32_t flags,
+          struct iobref *iobref, dict_t *xdata)
+{
+    ec_cbk_t callback = {.writev = func};
+    ec_fop_data_t *fop = NULL;
+    int32_t error = ENOMEM;
+
+    gf_msg_trace("ec", 0, "EC(WRITE) %p", frame);
+
+    VALIDATE_OR_GOTO(this, out);
+    GF_VALIDATE_OR_GOTO(this->name, frame, out);
+    GF_VALIDATE_OR_GOTO(this->name, this->private, out);
+
+    fop = ec_fop_data_allocate(frame, this, GF_FOP_WRITE, 0, target, fop_flags,
+                               ec_wind_writev, ec_manager_writev, callback,
+                               data);
+    if (fop == NULL) {
+        goto out;
+    }
+
+    fop->int32 = count;
+    fop->offset = offset;
+    fop->uint32 = flags;
+
+    fop->use_fd = 1;
+
+    if (fd != NULL) {
+        fop->fd = fd_ref(fd);
+        if (fop->fd == NULL) {
+            gf_msg(this->name, GF_LOG_ERROR, 0, EC_MSG_FILE_DESC_REF_FAIL,
+                   "Failed to reference a "
+                   "file descriptor.");
+
+            goto out;
+        }
+    }
+    if (count > 0) {
+        /* The fop works on its own copy of the vector list; fop->int32
+         * already holds the number of entries. */
+        fop->vector = iov_dup(vector, count);
+        if (fop->vector == NULL) {
+            gf_msg(this->name, GF_LOG_ERROR, ENOMEM, EC_MSG_NO_MEMORY,
+                   "Failed to duplicate a "
+                   "vector list.");
+
+            goto out;
+        }
+    }
+    if (iobref != NULL) {
+        fop->buffers = iobref_ref(iobref);
+        if (fop->buffers == NULL) {
+            gf_msg(this->name, GF_LOG_ERROR, 0, EC_MSG_BUF_REF_FAIL,
+                   "Failed to reference a "
+                   "buffer.");
+
+            goto out;
+        }
+    }
+    if (xdata != NULL) {
+        fop->xdata = dict_copy_with_ref(xdata, NULL);
+        if (fop->xdata == NULL) {
+            gf_msg(this->name, GF_LOG_ERROR, 0, EC_MSG_DICT_REF_FAIL,
+                   "Failed to reference a "
+                   "dictionary.");
+
+            goto out;
+        }
+    }
+
+    error = 0;
+
+out:
+    if (fop != NULL) {
+        ec_manager(fop, error);
+    } else if (func != NULL) {
+        /* ec_manager_writev() treats a NULL callback as valid, so the
+         * early-failure path must not call through it either. */
+        func(frame, NULL, this, -1, error, NULL, NULL, NULL);
+    }
+}
diff --git a/xlators/cluster/ec/src/ec-locks.c b/xlators/cluster/ec/src/ec-locks.c
new file mode 100644
index 00000000000..601960d6154
--- /dev/null
+++ b/xlators/cluster/ec/src/ec-locks.c
@@ -0,0 +1,1128 @@
+/*
+  Copyright (c) 2012-2014 DataLab, s.l. <http://www.datalab.es>
+  This file is part of GlusterFS.
+
+  This file is licensed to you under your choice of the GNU Lesser
+  General Public License, version 3 or any later version (LGPLv3 or
+  later), or the GNU General Public License, version 2 (GPLv2), in all
+  cases as published by the Free Software Foundation.
+*/
+
+#include "ec-helpers.h"
+#include "ec-common.h"
+#include "ec-combine.h"
+#include "ec-fops.h"
+#include "ec-messages.h"
+
+/* Lock acquisition modes, kept in fop->uint32 and used by ec_lock_check()
+ * when some bricks answer EAGAIN:
+ *
+ *   NONE: the request fails with EAGAIN.
+ *   INC:  EAGAIN answers are not counted as lock candidates.
+ *   ALL:  the mode is switched to INC and a retry is signalled.
+ */
+#define EC_LOCK_MODE_NONE 0
+#define EC_LOCK_MODE_INC 1
+#define EC_LOCK_MODE_ALL 2
+
+/*
+ * Evaluate the answers received for a lock request.
+ *
+ * '*mask' receives the union of the masks of all successful answers, so the
+ * caller can release them if the lock is not finally acquired.
+ *
+ * Returns 0 when the lock has been acquired, a positive errno when it has
+ * failed, or -1 when EAGAIN answers were found in EC_LOCK_MODE_ALL; in that
+ * case fop->uint32 has been switched to EC_LOCK_MODE_INC.
+ */
+int32_t
+ec_lock_check(ec_fop_data_t *fop, uintptr_t *mask)
+{
+    ec_t *ec = fop->xl->private;
+    ec_cbk_data_t *ans = NULL;
+    ec_cbk_data_t *cbk = NULL;
+    uintptr_t locked = 0;
+    int32_t good = 0;
+    int32_t eagain = 0;
+    int32_t estale = 0;
+    int32_t error = -1;
+
+    /* There are some errors that we'll handle in a special way while trying
+     * to acquire a lock.
+     *
+     *   EAGAIN:  If it's found during a parallel non-blocking lock request, we
+     *            consider that there's contention on the inode, so we consider
+     *            the acquisition a failure and try again with a sequential
+     *            blocking lock request. This will ensure that we get a lock on
+     *            as many bricks as possible (ignoring EAGAIN here would cause
+     *            unnecessary triggers of self-healing).
+     *
+     *            If it's found during a sequential blocking lock request, it's
+     *            considered an error. Lock will only succeed if there are
+     *            enough other bricks locked.
+     *
+     *   ESTALE:  This can appear during parallel or sequential lock request if
+     *            the inode has just been unlinked. We consider this error is
+     *            not recoverable, but we also don't consider it as fatal. So,
+     *            if it happens during parallel lock, we won't attempt a
+     *            sequential one unless there are EAGAIN errors on other
+     *            bricks (and are enough to form a quorum), but if we reach
+     *            quorum counting the ESTALE bricks, we consider the whole
+     *            result of the operation is ESTALE instead of EIO.
+     */
+
+    list_for_each_entry(ans, &fop->cbk_list, list)
+    {
+        if (ans->op_ret >= 0) {
+            /* A second successful entry in the list is reported as EIO.
+             * NOTE(review): assumes cbk_list holds one entry per group of
+             * matching answers - confirm against ec_combine(). */
+            if (locked != 0) {
+                error = EIO;
+            }
+            locked |= ans->mask;
+            good = ans->count;
+            cbk = ans;
+        } else if (ans->op_errno == ESTALE) {
+            estale += ans->count;
+        } else if ((ans->op_errno == EAGAIN) &&
+                   (fop->uint32 != EC_LOCK_MODE_INC)) {
+            eagain += ans->count;
+        }
+    }
+
+    /* error is still -1 unless the loop above has already decided EIO. */
+    if (error == -1) {
+        /* If we have enough quorum with succeeded and EAGAIN answers, we
+         * ignore for now any ESTALE answer. If there are EAGAIN answers,
+         * we retry with a sequential blocking lock request if needed.
+         * Otherwise we succeed. */
+        if ((good + eagain) >= ec->fragments) {
+            if (eagain == 0) {
+                if (fop->answer == NULL) {
+                    fop->answer = cbk;
+                }
+
+                ec_update_good(fop, locked);
+
+                error = 0;
+            } else {
+                switch (fop->uint32) {
+                    case EC_LOCK_MODE_NONE:
+                        error = EAGAIN;
+                        break;
+                    case EC_LOCK_MODE_ALL:
+                        /* error stays -1: the caller is expected to retry
+                         * in the new mode. */
+                        fop->uint32 = EC_LOCK_MODE_INC;
+                        break;
+                    default:
+                        /* This shouldn't happen because eagain cannot be > 0
+                         * when fop->uint32 is EC_LOCK_MODE_INC. */
+                        error = EIO;
+                        break;
+                }
+            }
+        } else {
+            /* We have been unable to find enough candidates that will be able
+             * to take the lock. If we have quorum on some answer, we return
+             * it. Otherwise we check if ESTALE answers allow us to reach
+             * quorum. If so, we return ESTALE. */
+            if (fop->answer && fop->answer->op_ret < 0) {
+                error = fop->answer->op_errno;
+            } else if ((good + eagain + estale) >= ec->fragments) {
+                error = ESTALE;
+            } else {
+                error = EIO;
+            }
+        }
+    }
+
+    *mask = locked;
+
+    return error;
+}
+
+/*
+ * Callback for an internal entrylk/fentrylk unlock request. A failure is
+ * only logged as a warning. Always returns 0.
+ */
+int32_t
+ec_lock_unlocked(call_frame_t *frame, void *cookie, xlator_t *this,
+                 int32_t op_ret, int32_t op_errno, dict_t *xdata)
+{
+    if (op_ret >= 0) {
+        return 0;
+    }
+
+    gf_msg(this->name, GF_LOG_WARNING, op_errno, EC_MSG_UNLOCK_FAILED,
+           "Failed to unlock an entry/inode");
+
+    return 0;
+}
+
+/*
+ * Callback with the signature of an lk answer, for unlock requests. A
+ * failure is only logged as a warning. Always returns 0.
+ */
+int32_t
+ec_lock_lk_unlocked(call_frame_t *frame, void *cookie, xlator_t *this,
+                    int32_t op_ret, int32_t op_errno, struct gf_flock *flock,
+                    dict_t *xdata)
+{
+    if (op_ret >= 0) {
+        return 0;
+    }
+
+    gf_msg(this->name, GF_LOG_WARNING, op_errno, EC_MSG_LK_UNLOCK_FAILED,
+           "Failed to unlock an lk");
+
+    return 0;
+}
+
+/* FOP: entrylk */
+
+/*
+ * Callback for the entrylk sent to a single subvolume; 'cookie' carries the
+ * subvolume index.
+ *
+ * The answer is recorded in a new cbk data (with a reference to 'xdata',
+ * if any) and combined with the other answers. The fop is notified with
+ * ec_complete() whenever it could be identified. Always returns 0.
+ */
+int32_t
+ec_entrylk_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+               int32_t op_ret, int32_t op_errno, dict_t *xdata)
+{
+    int32_t idx = (int32_t)(uintptr_t)cookie;
+    ec_cbk_data_t *cbk = NULL;
+    ec_fop_data_t *fop = NULL;
+
+    VALIDATE_OR_GOTO(this, out);
+    GF_VALIDATE_OR_GOTO(this->name, frame, out);
+    GF_VALIDATE_OR_GOTO(this->name, frame->local, out);
+    GF_VALIDATE_OR_GOTO(this->name, this->private, out);
+
+    fop = frame->local;
+
+    ec_trace("CBK", fop, "idx=%d, frame=%p, op_ret=%d, op_errno=%d", idx, frame,
+             op_ret, op_errno);
+
+    cbk = ec_cbk_data_allocate(frame, this, fop, GF_FOP_ENTRYLK, idx, op_ret,
+                               op_errno);
+    if (cbk == NULL) {
+        goto out;
+    }
+
+    if (xdata != NULL) {
+        cbk->xdata = dict_ref(xdata);
+        if (cbk->xdata == NULL) {
+            gf_msg(this->name, GF_LOG_ERROR, 0, EC_MSG_DICT_REF_FAIL,
+                   "Failed to reference a "
+                   "dictionary.");
+
+            goto out;
+        }
+    }
+
+    ec_combine(cbk, NULL);
+
+out:
+    if (fop != NULL) {
+        ec_complete(fop);
+    }
+
+    return 0;
+}
+
+/*
+ * Send the entrylk request stored in the fop to subvolume 'idx'.
+ *
+ * The request uses fop->str[0] and fop->str[1] (set by ec_entrylk() from
+ * its 'volume' and 'basename' arguments), fop->loc[0], the entrylk command
+ * and type, and fop->xdata. The subvolume index travels as the cookie of
+ * ec_entrylk_cbk().
+ */
+void
+ec_wind_entrylk(ec_t *ec, ec_fop_data_t *fop, int32_t idx)
+{
+    ec_trace("WIND", fop, "idx=%d", idx);
+
+    STACK_WIND_COOKIE(fop->frame, ec_entrylk_cbk, (void *)(uintptr_t)idx,
+                      ec->xl_list[idx], ec->xl_list[idx]->fops->entrylk,
+                      fop->str[0], &fop->loc[0], fop->str[1], fop->entrylk_cmd,
+                      fop->entrylk_type, fop->xdata);
+}
+
+/*
+ * State machine shared by the entrylk and fentrylk fops (told apart through
+ * fop->id).
+ *
+ * A blocking ENTRYLK_LOCK is first tried as ENTRYLK_LOCK_NB on all bricks.
+ * If ec_lock_check() signals a retry (negative result), the locks obtained
+ * so far are released and a blocking ENTRYLK_LOCK is sent with
+ * ec_dispatch_inc().
+ *
+ * 'state' is the state to process; negative values are the failure
+ * variants of the corresponding states. Returns the next state.
+ */
+int32_t
+ec_manager_entrylk(ec_fop_data_t *fop, int32_t state)
+{
+    ec_cbk_data_t *cbk;
+
+    switch (state) {
+        case EC_STATE_INIT:
+            /* Start with a non-blocking attempt and remember, through the
+             * lock mode, that a blocking retry is allowed. */
+            if (fop->entrylk_cmd == ENTRYLK_LOCK) {
+                fop->uint32 = EC_LOCK_MODE_ALL;
+                fop->entrylk_cmd = ENTRYLK_LOCK_NB;
+            }
+
+            /* Fall through */
+
+        case EC_STATE_DISPATCH:
+            ec_dispatch_all(fop);
+
+            return EC_STATE_PREPARE_ANSWER;
+
+        case EC_STATE_PREPARE_ANSWER:
+        case -EC_STATE_PREPARE_ANSWER:
+            if (fop->entrylk_cmd != ENTRYLK_UNLOCK) {
+                uintptr_t mask;
+
+                ec_fop_set_error(fop, ec_lock_check(fop, &mask));
+                if (fop->error != 0) {
+                    /* The lock has not been acquired: release it on the
+                     * bricks that did grant it. */
+                    if (mask != 0) {
+                        if (fop->id == GF_FOP_ENTRYLK) {
+                            ec_entrylk(
+                                fop->frame, fop->xl, mask, 1, ec_lock_unlocked,
+                                NULL, fop->str[0], &fop->loc[0], fop->str[1],
+                                ENTRYLK_UNLOCK, fop->entrylk_type, fop->xdata);
+                        } else {
+                            ec_fentrylk(fop->frame, fop->xl, mask, 1,
+                                        ec_lock_unlocked, NULL, fop->str[0],
+                                        fop->fd, fop->str[1], ENTRYLK_UNLOCK,
+                                        fop->entrylk_type, fop->xdata);
+                        }
+                    }
+                    /* A negative error is the retry request from
+                     * ec_lock_check(): try again with a blocking lock. */
+                    if (fop->error < 0) {
+                        fop->error = 0;
+
+                        fop->entrylk_cmd = ENTRYLK_LOCK;
+
+                        ec_dispatch_inc(fop);
+
+                        return EC_STATE_PREPARE_ANSWER;
+                    }
+                }
+            } else {
+                ec_fop_prepare_answer(fop, _gf_true);
+            }
+
+            return EC_STATE_REPORT;
+
+        case EC_STATE_REPORT:
+            cbk = fop->answer;
+
+            GF_ASSERT(cbk != NULL);
+
+            if (fop->id == GF_FOP_ENTRYLK) {
+                if (fop->cbks.entrylk != NULL) {
+                    fop->cbks.entrylk(fop->req_frame, fop, fop->xl, cbk->op_ret,
+                                      cbk->op_errno, cbk->xdata);
+                }
+            } else {
+                if (fop->cbks.fentrylk != NULL) {
+                    fop->cbks.fentrylk(fop->req_frame, fop, fop->xl,
+                                       cbk->op_ret, cbk->op_errno, cbk->xdata);
+                }
+            }
+
+            return EC_STATE_END;
+
+        case -EC_STATE_INIT:
+        case -EC_STATE_DISPATCH:
+        case -EC_STATE_REPORT:
+            /* Failure path: report fop->error to the caller. */
+            GF_ASSERT(fop->error != 0);
+
+            if (fop->id == GF_FOP_ENTRYLK) {
+                if (fop->cbks.entrylk != NULL) {
+                    fop->cbks.entrylk(fop->req_frame, fop, fop->xl, -1,
+                                      fop->error, NULL);
+                }
+            } else {
+                if (fop->cbks.fentrylk != NULL) {
+                    fop->cbks.fentrylk(fop->req_frame, fop, fop->xl, -1,
+                                       fop->error, NULL);
+                }
+            }
+
+            return EC_STATE_END;
+
+        default:
+            gf_msg(fop->xl->name, GF_LOG_ERROR, EINVAL, EC_MSG_UNHANDLED_STATE,
+                   "Unhandled state %d for %s", state, ec_fop_name(fop->id));
+
+            return EC_STATE_END;
+    }
+}
+
+/*
+ * Entry point of the entrylk fop.
+ *
+ * Allocates the fop, stores the command and type, takes its own copies of
+ * 'volume', 'loc' and 'basename' and a reference to 'xdata', and hands the
+ * fop to ec_manager(). If any of that fails after the fop exists,
+ * ec_manager() is started with the error. If the fop itself could not be
+ * created, 'func' (when provided) is called directly with op_ret -1.
+ */
+void
+ec_entrylk(call_frame_t *frame, xlator_t *this, uintptr_t target,
+           uint32_t fop_flags, fop_entrylk_cbk_t func, void *data,
+           const char *volume, loc_t *loc, const char *basename,
+           entrylk_cmd cmd, entrylk_type type, dict_t *xdata)
+{
+    ec_cbk_t callback = {.entrylk = func};
+    ec_fop_data_t *fop = NULL;
+    int32_t error = ENOMEM;
+
+    gf_msg_trace("ec", 0, "EC(ENTRYLK) %p", frame);
+
+    /* 'this' must be checked before this->name is used below. */
+    VALIDATE_OR_GOTO(this, out);
+    GF_VALIDATE_OR_GOTO(this->name, frame, out);
+    GF_VALIDATE_OR_GOTO(this->name, this->private, out);
+
+    fop = ec_fop_data_allocate(frame, this, GF_FOP_ENTRYLK, 0, target,
+                               fop_flags, ec_wind_entrylk, ec_manager_entrylk,
+                               callback, data);
+    if (fop == NULL) {
+        goto out;
+    }
+
+    fop->entrylk_cmd = cmd;
+    fop->entrylk_type = type;
+
+    if (volume != NULL) {
+        fop->str[0] = gf_strdup(volume);
+        if (fop->str[0] == NULL) {
+            gf_msg(this->name, GF_LOG_ERROR, ENOMEM, EC_MSG_NO_MEMORY,
+                   "Failed to duplicate a string.");
+
+            goto out;
+        }
+    }
+    if (loc != NULL) {
+        if (loc_copy(&fop->loc[0], loc) != 0) {
+            gf_msg(this->name, GF_LOG_ERROR, ENOMEM, EC_MSG_LOC_COPY_FAIL,
+                   "Failed to copy a location.");
+
+            goto out;
+        }
+    }
+    if (basename != NULL) {
+        fop->str[1] = gf_strdup(basename);
+        if (fop->str[1] == NULL) {
+            gf_msg(this->name, GF_LOG_ERROR, ENOMEM, EC_MSG_NO_MEMORY,
+                   "Failed to duplicate a string.");
+
+            goto out;
+        }
+    }
+    if (xdata != NULL) {
+        fop->xdata = dict_ref(xdata);
+        if (fop->xdata == NULL) {
+            gf_msg(this->name, GF_LOG_ERROR, 0, EC_MSG_DICT_REF_FAIL,
+                   "Failed to reference a "
+                   "dictionary.");
+
+            goto out;
+        }
+    }
+
+    error = 0;
+
+out:
+    if (fop != NULL) {
+        ec_manager(fop, error);
+    } else if (func != NULL) {
+        /* ec_manager_entrylk() treats a NULL callback as valid, so the
+         * early-failure path must not call through it either. */
+        func(frame, NULL, this, -1, error, NULL);
+    }
+}
+
+/* FOP: fentrylk */
+
+/*
+ * Callback for the fentrylk sent to a single subvolume; 'cookie' carries
+ * the subvolume index.
+ *
+ * The answer is recorded in a new cbk data (with a reference to 'xdata',
+ * if any) and combined with the other answers. The fop is notified with
+ * ec_complete() whenever it could be identified. Always returns 0.
+ */
+int32_t
+ec_fentrylk_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+                int32_t op_ret, int32_t op_errno, dict_t *xdata)
+{
+    int32_t idx = (int32_t)(uintptr_t)cookie;
+    ec_cbk_data_t *cbk = NULL;
+    ec_fop_data_t *fop = NULL;
+
+    VALIDATE_OR_GOTO(this, out);
+    GF_VALIDATE_OR_GOTO(this->name, frame, out);
+    GF_VALIDATE_OR_GOTO(this->name, frame->local, out);
+    GF_VALIDATE_OR_GOTO(this->name, this->private, out);
+
+    fop = frame->local;
+
+    ec_trace("CBK", fop, "idx=%d, frame=%p, op_ret=%d, op_errno=%d", idx, frame,
+             op_ret, op_errno);
+
+    cbk = ec_cbk_data_allocate(frame, this, fop, GF_FOP_FENTRYLK, idx, op_ret,
+                               op_errno);
+    if (cbk == NULL) {
+        goto out;
+    }
+
+    if (xdata != NULL) {
+        cbk->xdata = dict_ref(xdata);
+        if (cbk->xdata == NULL) {
+            gf_msg(this->name, GF_LOG_ERROR, 0, EC_MSG_DICT_REF_FAIL,
+                   "Failed to reference a "
+                   "dictionary.");
+
+            goto out;
+        }
+    }
+
+    ec_combine(cbk, NULL);
+
+out:
+    if (fop != NULL) {
+        ec_complete(fop);
+    }
+
+    return 0;
+}
+
+/*
+ * Send the fentrylk request stored in the fop to subvolume 'idx'.
+ *
+ * The request uses fop->str[0], fop->fd, fop->str[1], the entrylk command
+ * and type, and fop->xdata. The subvolume index travels as the cookie of
+ * ec_fentrylk_cbk().
+ */
+void
+ec_wind_fentrylk(ec_t *ec, ec_fop_data_t *fop, int32_t idx)
+{
+    ec_trace("WIND", fop, "idx=%d", idx);
+
+    STACK_WIND_COOKIE(fop->frame, ec_fentrylk_cbk, (void *)(uintptr_t)idx,
+                      ec->xl_list[idx], ec->xl_list[idx]->fops->fentrylk,
+                      fop->str[0], fop->fd, fop->str[1], fop->entrylk_cmd,
+                      fop->entrylk_type, fop->xdata);
+}
+
+/*
+ * Entry point of the FENTRYLK fop.
+ *
+ * Allocates the fop descriptor (with ec_wind_fentrylk() as wind function
+ * and ec_manager_entrylk() as state machine), stores private copies of or
+ * references to the arguments in it and starts it with ec_manager().
+ * 'target', 'fop_flags' and 'data' are forwarded unchanged to
+ * ec_fop_data_allocate().
+ *
+ * Failure handling:
+ *  - if an argument fails validation or the descriptor cannot be allocated,
+ *    'func' is called directly with op_ret = -1 and op_errno = ENOMEM;
+ *  - if the descriptor exists but a later copy/reference fails,
+ *    ec_manager() is started with error ENOMEM instead of 0.
+ */
+void
+ec_fentrylk(call_frame_t *frame, xlator_t *this, uintptr_t target,
+            uint32_t fop_flags, fop_fentrylk_cbk_t func, void *data,
+            const char *volume, fd_t *fd, const char *basename, entrylk_cmd cmd,
+            entrylk_type type, dict_t *xdata)
+{
+    ec_cbk_t callback = {.fentrylk = func};
+    ec_fop_data_t *fop = NULL;
+    int32_t error = ENOMEM;
+
+    gf_msg_trace("ec", 0, "EC(FENTRYLK) %p", frame);
+
+    /* 'this' is dereferenced by the checks below, so validate it first,
+     * as ec_inodelk() and ec_finodelk() do. */
+    VALIDATE_OR_GOTO(this, out);
+    GF_VALIDATE_OR_GOTO(this->name, frame, out);
+    GF_VALIDATE_OR_GOTO(this->name, this->private, out);
+
+    fop = ec_fop_data_allocate(frame, this, GF_FOP_FENTRYLK, 0, target,
+                               fop_flags, ec_wind_fentrylk, ec_manager_entrylk,
+                               callback, data);
+    if (fop == NULL) {
+        goto out;
+    }
+
+    fop->use_fd = 1;
+
+    fop->entrylk_cmd = cmd;
+    fop->entrylk_type = type;
+
+    /* From here on every failure jumps to 'out' with error == ENOMEM and a
+     * non-NULL fop, so it is reported through ec_manager(). */
+    if (volume != NULL) {
+        fop->str[0] = gf_strdup(volume);
+        if (fop->str[0] == NULL) {
+            gf_msg(this->name, GF_LOG_ERROR, ENOMEM, EC_MSG_NO_MEMORY,
+                   "Failed to duplicate a string.");
+
+            goto out;
+        }
+    }
+    if (fd != NULL) {
+        fop->fd = fd_ref(fd);
+        if (fop->fd == NULL) {
+            gf_msg(this->name, GF_LOG_ERROR, 0, EC_MSG_FILE_DESC_REF_FAIL,
+                   "Failed to reference a "
+                   "file descriptor.");
+
+            goto out;
+        }
+    }
+    if (basename != NULL) {
+        fop->str[1] = gf_strdup(basename);
+        if (fop->str[1] == NULL) {
+            gf_msg(this->name, GF_LOG_ERROR, ENOMEM, EC_MSG_NO_MEMORY,
+                   "Failed to duplicate a string.");
+
+            goto out;
+        }
+    }
+    if (xdata != NULL) {
+        fop->xdata = dict_ref(xdata);
+        if (fop->xdata == NULL) {
+            gf_msg(this->name, GF_LOG_ERROR, 0, EC_MSG_DICT_REF_FAIL,
+                   "Failed to reference a "
+                   "dictionary.");
+
+            goto out;
+        }
+    }
+
+    error = 0;
+
+out:
+    if (fop != NULL) {
+        ec_manager(fop, error);
+    } else {
+        func(frame, NULL, this, -1, error, NULL);
+    }
+}
+
+/* FOP: inodelk */
+
+/*
+ * Answer handler for the INODELK requests wound by ec_wind_inodelk().
+ *
+ * 'cookie' carries the index of the subvolume that answered. The answer is
+ * stored in a newly allocated callback object, a reference to 'xdata' is
+ * kept when one is supplied, and the object is handed to ec_combine()
+ * without a fop-specific combine function. Whenever the fop could be read
+ * from frame->local, ec_complete() is called on it before returning.
+ *
+ * Always returns 0.
+ */
+int32_t
+ec_inodelk_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+               int32_t op_ret, int32_t op_errno, dict_t *xdata)
+{
+    ec_fop_data_t *fop = NULL;
+    ec_cbk_data_t *answer = NULL;
+    int32_t child = (int32_t)(uintptr_t)cookie;
+
+    VALIDATE_OR_GOTO(this, out);
+    GF_VALIDATE_OR_GOTO(this->name, frame, out);
+    GF_VALIDATE_OR_GOTO(this->name, frame->local, out);
+    GF_VALIDATE_OR_GOTO(this->name, this->private, out);
+
+    fop = frame->local;
+
+    ec_trace("CBK", fop, "idx=%d, frame=%p, op_ret=%d, op_errno=%d", child,
+             frame, op_ret, op_errno);
+
+    answer = ec_cbk_data_allocate(frame, this, fop, GF_FOP_INODELK, child,
+                                  op_ret, op_errno);
+    if (answer == NULL) {
+        /* Nothing to combine, but the fop still has to be completed. */
+        goto out;
+    }
+
+    if (xdata != NULL) {
+        answer->xdata = dict_ref(xdata);
+        if (answer->xdata == NULL) {
+            gf_msg(this->name, GF_LOG_ERROR, 0, EC_MSG_DICT_REF_FAIL,
+                   "Failed to reference a "
+                   "dictionary.");
+
+            goto out;
+        }
+    }
+
+    ec_combine(answer, NULL);
+
+out:
+    if (fop != NULL) {
+        ec_complete(fop);
+    }
+
+    return 0;
+}
+
+/*
+ * Winds the INODELK request described by 'fop' to subvolume 'idx' of 'ec'.
+ *
+ * The subvolume index is passed as the cookie, which is how
+ * ec_inodelk_cbk() learns which child answered. The volume name, location,
+ * command and lock description are the ones stored in the fop by
+ * ec_inodelk() (fop->str[0], fop->loc[0], fop->int32 and fop->flock).
+ */
+void
+ec_wind_inodelk(ec_t *ec, ec_fop_data_t *fop, int32_t idx)
+{
+    ec_trace("WIND", fop, "idx=%d", idx);
+
+    STACK_WIND_COOKIE(fop->frame, ec_inodelk_cbk, (void *)(uintptr_t)idx,
+                      ec->xl_list[idx], ec->xl_list[idx]->fops->inodelk,
+                      fop->str[0], &fop->loc[0], fop->int32, &fop->flock,
+                      fop->xdata);
+}
+
+/*
+ * State machine shared by the INODELK and FINODELK fops; ec_inodelk() and
+ * ec_finodelk() both register it as their manager and it tells them apart
+ * through fop->id.
+ *
+ * 'state' is the state to process and the return value is the next state.
+ *
+ *  EC_STATE_INIT            Adjusts fop->flock.l_start/l_len with
+ *                           ec_adjust_offset_down()/ec_adjust_offset_up().
+ *                           A blocking (F_SETLKW) request that is not an
+ *                           unlock is turned into a non-blocking F_SETLK
+ *                           with fop->uint32 = EC_LOCK_MODE_ALL. Falls
+ *                           through to the dispatch state.
+ *  EC_STATE_DISPATCH        Sends the request with ec_dispatch_all().
+ *  +/-EC_STATE_PREPARE_ANSWER
+ *                           For lock requests the result of ec_lock_check()
+ *                           becomes the fop error. On error, an unlock is
+ *                           sent to the subvolumes in the returned mask, and
+ *                           if the error is negative the request is retried
+ *                           as F_SETLKW through ec_dispatch_inc(). Unlock
+ *                           requests just call ec_fop_prepare_answer().
+ *  EC_STATE_REPORT          Calls the user callback with fop->answer.
+ *  -EC_STATE_INIT, -EC_STATE_DISPATCH, -EC_STATE_REPORT
+ *                           Call the user callback with op_ret = -1 and
+ *                           op_errno = fop->error.
+ *
+ * Any other state is logged as unhandled and ends the fop.
+ */
+int32_t
+ec_manager_inodelk(ec_fop_data_t *fop, int32_t state)
+{
+    ec_cbk_data_t *cbk;
+
+    switch (state) {
+        case EC_STATE_INIT:
+            fop->flock.l_len += ec_adjust_offset_down(
+                fop->xl->private, &fop->flock.l_start, _gf_true);
+            ec_adjust_offset_up(fop->xl->private, &fop->flock.l_len, _gf_true);
+            if ((fop->int32 == F_SETLKW) && (fop->flock.l_type != F_UNLCK)) {
+                fop->uint32 = EC_LOCK_MODE_ALL;
+                fop->int32 = F_SETLK;
+            }
+
+            /* Fall through */
+
+        case EC_STATE_DISPATCH:
+            ec_dispatch_all(fop);
+
+            return EC_STATE_PREPARE_ANSWER;
+
+        case EC_STATE_PREPARE_ANSWER:
+        case -EC_STATE_PREPARE_ANSWER:
+            if (fop->flock.l_type != F_UNLCK) {
+                uintptr_t mask;
+
+                ec_fop_set_error(fop, ec_lock_check(fop, &mask));
+                if (fop->error != 0) {
+                    if (mask != 0) {
+                        ec_t *ec = fop->xl->private;
+                        /* Zero-initialised, as in ec_manager_lk(), so that
+                         * no indeterminate bytes (e.g. l_owner.data) are
+                         * ever handed to the nested request. */
+                        struct gf_flock flock = {0};
+
+                        /* NOTE(review): the range is multiplied by
+                         * ec->fragments, presumably because the nested
+                         * request goes through EC_STATE_INIT again and has
+                         * its offsets adjusted there -- confirm. */
+                        flock.l_type = F_UNLCK;
+                        flock.l_whence = fop->flock.l_whence;
+                        flock.l_start = fop->flock.l_start * ec->fragments;
+                        flock.l_len = fop->flock.l_len * ec->fragments;
+                        flock.l_pid = 0;
+                        flock.l_owner.len = 0;
+
+                        /* Release the lock on the subvolumes in 'mask'. */
+                        if (fop->id == GF_FOP_INODELK) {
+                            ec_inodelk(fop->frame, fop->xl,
+                                       &fop->frame->root->lk_owner, mask, 1,
+                                       ec_lock_unlocked, NULL, fop->str[0],
+                                       &fop->loc[0], F_SETLK, &flock,
+                                       fop->xdata);
+                        } else {
+                            ec_finodelk(fop->frame, fop->xl,
+                                        &fop->frame->root->lk_owner, mask, 1,
+                                        ec_lock_unlocked, NULL, fop->str[0],
+                                        fop->fd, F_SETLK, &flock, fop->xdata);
+                        }
+                    }
+                    if (fop->error < 0) {
+                        /* Negative error: retry as a blocking request. */
+                        fop->error = 0;
+
+                        fop->int32 = F_SETLKW;
+
+                        ec_dispatch_inc(fop);
+
+                        return EC_STATE_PREPARE_ANSWER;
+                    }
+                }
+            } else {
+                ec_fop_prepare_answer(fop, _gf_true);
+            }
+
+            return EC_STATE_REPORT;
+
+        case EC_STATE_REPORT:
+            cbk = fop->answer;
+
+            GF_ASSERT(cbk != NULL);
+
+            if (fop->id == GF_FOP_INODELK) {
+                if (fop->cbks.inodelk != NULL) {
+                    fop->cbks.inodelk(fop->req_frame, fop, fop->xl, cbk->op_ret,
+                                      cbk->op_errno, cbk->xdata);
+                }
+            } else {
+                if (fop->cbks.finodelk != NULL) {
+                    fop->cbks.finodelk(fop->req_frame, fop, fop->xl,
+                                       cbk->op_ret, cbk->op_errno, cbk->xdata);
+                }
+            }
+
+            return EC_STATE_END;
+
+        case -EC_STATE_INIT:
+        case -EC_STATE_DISPATCH:
+        case -EC_STATE_REPORT:
+            GF_ASSERT(fop->error != 0);
+
+            if (fop->id == GF_FOP_INODELK) {
+                if (fop->cbks.inodelk != NULL) {
+                    fop->cbks.inodelk(fop->req_frame, fop, fop->xl, -1,
+                                      fop->error, NULL);
+                }
+            } else {
+                if (fop->cbks.finodelk != NULL) {
+                    fop->cbks.finodelk(fop->req_frame, fop, fop->xl, -1,
+                                       fop->error, NULL);
+                }
+            }
+
+            return EC_STATE_END;
+
+        default:
+            gf_msg(fop->xl->name, GF_LOG_ERROR, EINVAL, EC_MSG_UNHANDLED_STATE,
+                   "Unhandled state %d for %s", state, ec_fop_name(fop->id));
+
+            return EC_STATE_END;
+    }
+}
+
+/*
+ * Entry point of the INODELK fop.
+ *
+ * Allocates the fop descriptor (with ec_wind_inodelk() as wind function and
+ * ec_manager_inodelk() as state machine), stores private copies of or
+ * references to the arguments in it and starts it with ec_manager().
+ * 'target', 'fop_flags' and 'data' are forwarded unchanged to
+ * ec_fop_data_allocate(); 'owner' is applied to the fop's own frame with
+ * ec_owner_copy().
+ *
+ * Failure handling:
+ *  - if an argument fails validation or the descriptor cannot be allocated,
+ *    'func' is called directly with op_ret = -1 and op_errno = ENOMEM;
+ *  - if the descriptor exists but a later copy/reference fails,
+ *    ec_manager() is started with error ENOMEM instead of 0.
+ */
+void
+ec_inodelk(call_frame_t *frame, xlator_t *this, gf_lkowner_t *owner,
+           uintptr_t target, uint32_t fop_flags, fop_inodelk_cbk_t func,
+           void *data, const char *volume, loc_t *loc, int32_t cmd,
+           struct gf_flock *flock, dict_t *xdata)
+{
+    ec_cbk_t callback = {.inodelk = func};
+    ec_fop_data_t *fop = NULL;
+    int32_t error = ENOMEM;
+
+    gf_msg_trace("ec", 0, "EC(INODELK) %p", frame);
+
+    VALIDATE_OR_GOTO(this, out);
+    GF_VALIDATE_OR_GOTO(this->name, frame, out);
+    GF_VALIDATE_OR_GOTO(this->name, this->private, out);
+
+    fop = ec_fop_data_allocate(frame, this, GF_FOP_INODELK, 0, target,
+                               fop_flags, ec_wind_inodelk, ec_manager_inodelk,
+                               callback, data);
+    if (fop == NULL) {
+        goto out;
+    }
+
+    fop->int32 = cmd;
+    ec_owner_copy(fop->frame, owner);
+
+    /* From here on every failure jumps to 'out' with error == ENOMEM and a
+     * non-NULL fop, so it is reported through ec_manager(). */
+    if (volume != NULL) {
+        fop->str[0] = gf_strdup(volume);
+        if (fop->str[0] == NULL) {
+            gf_msg(this->name, GF_LOG_ERROR, ENOMEM, EC_MSG_NO_MEMORY,
+                   "Failed to duplicate a string.");
+
+            goto out;
+        }
+    }
+    if (loc != NULL) {
+        if (loc_copy(&fop->loc[0], loc) != 0) {
+            gf_msg(this->name, GF_LOG_ERROR, ENOMEM, EC_MSG_LOC_COPY_FAIL,
+                   "Failed to copy a location.");
+
+            goto out;
+        }
+    }
+    /* Field-by-field copy; only l_owner.len bytes of owner data are taken. */
+    if (flock != NULL) {
+        fop->flock.l_type = flock->l_type;
+        fop->flock.l_whence = flock->l_whence;
+        fop->flock.l_start = flock->l_start;
+        fop->flock.l_len = flock->l_len;
+        fop->flock.l_pid = flock->l_pid;
+        fop->flock.l_owner.len = flock->l_owner.len;
+        if (flock->l_owner.len > 0) {
+            memcpy(fop->flock.l_owner.data, flock->l_owner.data,
+                   flock->l_owner.len);
+        }
+    }
+    if (xdata != NULL) {
+        fop->xdata = dict_ref(xdata);
+        if (fop->xdata == NULL) {
+            gf_msg(this->name, GF_LOG_ERROR, 0, EC_MSG_DICT_REF_FAIL,
+                   "Failed to reference a "
+                   "dictionary.");
+
+            goto out;
+        }
+    }
+
+    error = 0;
+
+out:
+    if (fop != NULL) {
+        ec_manager(fop, error);
+    } else {
+        func(frame, NULL, this, -1, error, NULL);
+    }
+}
+
+/* FOP: finodelk */
+
+/*
+ * Answer handler for the FINODELK requests wound by ec_wind_finodelk().
+ *
+ * 'cookie' carries the index of the subvolume that answered. The answer is
+ * stored in a newly allocated callback object, a reference to 'xdata' is
+ * kept when one is supplied, and the object is handed to ec_combine()
+ * without a fop-specific combine function. Whenever the fop could be read
+ * from frame->local, ec_complete() is called on it before returning.
+ *
+ * Always returns 0.
+ */
+int32_t
+ec_finodelk_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+                int32_t op_ret, int32_t op_errno, dict_t *xdata)
+{
+    ec_fop_data_t *fop = NULL;
+    ec_cbk_data_t *answer = NULL;
+    int32_t child = (int32_t)(uintptr_t)cookie;
+
+    VALIDATE_OR_GOTO(this, out);
+    GF_VALIDATE_OR_GOTO(this->name, frame, out);
+    GF_VALIDATE_OR_GOTO(this->name, frame->local, out);
+    GF_VALIDATE_OR_GOTO(this->name, this->private, out);
+
+    fop = frame->local;
+
+    ec_trace("CBK", fop, "idx=%d, frame=%p, op_ret=%d, op_errno=%d", child,
+             frame, op_ret, op_errno);
+
+    answer = ec_cbk_data_allocate(frame, this, fop, GF_FOP_FINODELK, child,
+                                  op_ret, op_errno);
+    if (answer == NULL) {
+        /* Nothing to combine, but the fop still has to be completed. */
+        goto out;
+    }
+
+    if (xdata != NULL) {
+        answer->xdata = dict_ref(xdata);
+        if (answer->xdata == NULL) {
+            gf_msg(this->name, GF_LOG_ERROR, 0, EC_MSG_DICT_REF_FAIL,
+                   "Failed to reference a "
+                   "dictionary.");
+
+            goto out;
+        }
+    }
+
+    ec_combine(answer, NULL);
+
+out:
+    if (fop != NULL) {
+        ec_complete(fop);
+    }
+
+    return 0;
+}
+
+/*
+ * Winds the FINODELK request described by 'fop' to subvolume 'idx' of 'ec'.
+ *
+ * The subvolume index is passed as the cookie, which is how
+ * ec_finodelk_cbk() learns which child answered. The volume name, file
+ * descriptor, command and lock description are the ones stored in the fop
+ * by ec_finodelk() (fop->str[0], fop->fd, fop->int32 and fop->flock).
+ */
+void
+ec_wind_finodelk(ec_t *ec, ec_fop_data_t *fop, int32_t idx)
+{
+    ec_trace("WIND", fop, "idx=%d", idx);
+
+    STACK_WIND_COOKIE(fop->frame, ec_finodelk_cbk, (void *)(uintptr_t)idx,
+                      ec->xl_list[idx], ec->xl_list[idx]->fops->finodelk,
+                      fop->str[0], fop->fd, fop->int32, &fop->flock,
+                      fop->xdata);
+}
+
+/*
+ * Entry point of the FINODELK fop.
+ *
+ * Allocates the fop descriptor (with ec_wind_finodelk() as wind function
+ * and ec_manager_inodelk() as state machine), stores private copies of or
+ * references to the arguments in it and starts it with ec_manager().
+ * 'target', 'fop_flags' and 'data' are forwarded unchanged to
+ * ec_fop_data_allocate(); 'owner' is applied to the fop's own frame with
+ * ec_owner_copy().
+ *
+ * Failure handling:
+ *  - if an argument fails validation or the descriptor cannot be allocated,
+ *    'func' is called directly with op_ret = -1 and op_errno = ENOMEM;
+ *  - if the descriptor exists but a later copy/reference fails,
+ *    ec_manager() is started with error ENOMEM instead of 0.
+ */
+void
+ec_finodelk(call_frame_t *frame, xlator_t *this, gf_lkowner_t *owner,
+            uintptr_t target, uint32_t fop_flags, fop_finodelk_cbk_t func,
+            void *data, const char *volume, fd_t *fd, int32_t cmd,
+            struct gf_flock *flock, dict_t *xdata)
+{
+    ec_cbk_t callback = {.finodelk = func};
+    ec_fop_data_t *fop = NULL;
+    int32_t error = ENOMEM;
+
+    gf_msg_trace("ec", 0, "EC(FINODELK) %p", frame);
+
+    VALIDATE_OR_GOTO(this, out);
+    GF_VALIDATE_OR_GOTO(this->name, frame, out);
+    GF_VALIDATE_OR_GOTO(this->name, this->private, out);
+
+    fop = ec_fop_data_allocate(frame, this, GF_FOP_FINODELK, 0, target,
+                               fop_flags, ec_wind_finodelk, ec_manager_inodelk,
+                               callback, data);
+    if (fop == NULL) {
+        goto out;
+    }
+
+    fop->use_fd = 1;
+
+    fop->int32 = cmd;
+    ec_owner_copy(fop->frame, owner);
+
+    /* From here on every failure jumps to 'out' with error == ENOMEM and a
+     * non-NULL fop, so it is reported through ec_manager(). */
+    if (volume != NULL) {
+        fop->str[0] = gf_strdup(volume);
+        if (fop->str[0] == NULL) {
+            gf_msg(this->name, GF_LOG_ERROR, ENOMEM, EC_MSG_NO_MEMORY,
+                   "Failed to duplicate a string.");
+
+            goto out;
+        }
+    }
+    if (fd != NULL) {
+        fop->fd = fd_ref(fd);
+        if (fop->fd == NULL) {
+            /* A failed fd reference is reported with the file descriptor
+             * message ID, as ec_fentrylk() and ec_lk() do. */
+            gf_msg(this->name, GF_LOG_ERROR, 0, EC_MSG_FILE_DESC_REF_FAIL,
+                   "Failed to reference a "
+                   "file descriptor.");
+
+            goto out;
+        }
+    }
+    /* Field-by-field copy; only l_owner.len bytes of owner data are taken. */
+    if (flock != NULL) {
+        fop->flock.l_type = flock->l_type;
+        fop->flock.l_whence = flock->l_whence;
+        fop->flock.l_start = flock->l_start;
+        fop->flock.l_len = flock->l_len;
+        fop->flock.l_pid = flock->l_pid;
+        fop->flock.l_owner.len = flock->l_owner.len;
+        if (flock->l_owner.len > 0) {
+            memcpy(fop->flock.l_owner.data, flock->l_owner.data,
+                   flock->l_owner.len);
+        }
+    }
+    if (xdata != NULL) {
+        fop->xdata = dict_ref(xdata);
+        if (fop->xdata == NULL) {
+            gf_msg(this->name, GF_LOG_ERROR, 0, EC_MSG_DICT_REF_FAIL,
+                   "Failed to reference a "
+                   "dictionary.");
+
+            goto out;
+        }
+    }
+
+    error = 0;
+
+out:
+    if (fop != NULL) {
+        ec_manager(fop, error);
+    } else {
+        func(frame, NULL, this, -1, error, NULL);
+    }
+}
+
+/* FOP: lk */
+
+/*
+ * Combine function used for GF_FOP_LK answers (see ec_lk_cbk()).
+ *
+ * Returns 1 when ec_flock_compare() accepts the locks carried by 'dst' and
+ * 'src'. Otherwise a notice is logged and 0 is returned.
+ */
+int32_t
+ec_combine_lk(ec_fop_data_t *fop, ec_cbk_data_t *dst, ec_cbk_data_t *src)
+{
+    if (ec_flock_compare(&dst->flock, &src->flock)) {
+        return 1;
+    }
+
+    gf_msg(fop->xl->name, GF_LOG_NOTICE, 0, EC_MSG_LOCK_MISMATCH,
+           "Mismatching lock in "
+           "answers of 'GF_FOP_LK'");
+
+    return 0;
+}
+
+/*
+ * Answer handler for the LK requests wound by ec_wind_lk().
+ *
+ * 'cookie' carries the index of the subvolume that answered. The answer is
+ * stored in a newly allocated callback object: the returned lock is copied
+ * when the operation succeeded (op_ret >= 0) and 'flock' is not NULL, and a
+ * reference to 'xdata' is kept when one is supplied. The object is then
+ * handed to ec_combine() with ec_combine_lk() as combine function.
+ * Whenever the fop could be read from frame->local, ec_complete() is called
+ * on it before returning.
+ *
+ * Always returns 0.
+ */
+int32_t
+ec_lk_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret,
+          int32_t op_errno, struct gf_flock *flock, dict_t *xdata)
+{
+    ec_fop_data_t *fop = NULL;
+    ec_cbk_data_t *answer = NULL;
+    int32_t child = (int32_t)(uintptr_t)cookie;
+
+    VALIDATE_OR_GOTO(this, out);
+    GF_VALIDATE_OR_GOTO(this->name, frame, out);
+    GF_VALIDATE_OR_GOTO(this->name, frame->local, out);
+    GF_VALIDATE_OR_GOTO(this->name, this->private, out);
+
+    fop = frame->local;
+
+    ec_trace("CBK", fop, "idx=%d, frame=%p, op_ret=%d, op_errno=%d", child,
+             frame, op_ret, op_errno);
+
+    answer = ec_cbk_data_allocate(frame, this, fop, GF_FOP_LK, child, op_ret,
+                                  op_errno);
+    if (answer == NULL) {
+        /* Nothing to combine, but the fop still has to be completed. */
+        goto out;
+    }
+
+    if ((op_ret >= 0) && (flock != NULL)) {
+        answer->flock.l_type = flock->l_type;
+        answer->flock.l_whence = flock->l_whence;
+        answer->flock.l_start = flock->l_start;
+        answer->flock.l_len = flock->l_len;
+        answer->flock.l_pid = flock->l_pid;
+        answer->flock.l_owner.len = flock->l_owner.len;
+        if (flock->l_owner.len > 0) {
+            memcpy(answer->flock.l_owner.data, flock->l_owner.data,
+                   flock->l_owner.len);
+        }
+    }
+
+    if (xdata != NULL) {
+        answer->xdata = dict_ref(xdata);
+        if (answer->xdata == NULL) {
+            gf_msg(this->name, GF_LOG_ERROR, 0, EC_MSG_DICT_REF_FAIL,
+                   "Failed to reference a "
+                   "dictionary.");
+
+            goto out;
+        }
+    }
+
+    ec_combine(answer, ec_combine_lk);
+
+out:
+    if (fop != NULL) {
+        ec_complete(fop);
+    }
+
+    return 0;
+}
+
+/*
+ * Winds the LK request described by 'fop' to subvolume 'idx' of 'ec'.
+ *
+ * The subvolume index is passed as the cookie, which is how ec_lk_cbk()
+ * learns which child answered. The file descriptor, command and lock
+ * description are the ones stored in the fop by ec_lk() (fop->fd,
+ * fop->int32 and fop->flock).
+ */
+void
+ec_wind_lk(ec_t *ec, ec_fop_data_t *fop, int32_t idx)
+{
+    ec_trace("WIND", fop, "idx=%d", idx);
+
+    STACK_WIND_COOKIE(fop->frame, ec_lk_cbk, (void *)(uintptr_t)idx,
+                      ec->xl_list[idx], ec->xl_list[idx]->fops->lk, fop->fd,
+                      fop->int32, &fop->flock, fop->xdata);
+}
+
+/*
+ * State machine of the LK fop (registered as manager by ec_lk()).
+ *
+ * 'state' is the state to process and the return value is the next state.
+ *
+ *  EC_STATE_INIT            A blocking (F_SETLKW) request that is not an
+ *                           unlock is turned into a non-blocking F_SETLK
+ *                           with fop->uint32 = EC_LOCK_MODE_ALL. Falls
+ *                           through to the dispatch state.
+ *  EC_STATE_DISPATCH        Sends the request with ec_dispatch_all().
+ *  +/-EC_STATE_PREPARE_ANSWER
+ *                           For lock requests the result of ec_lock_check()
+ *                           becomes the fop error. On error, an unlock is
+ *                           sent to the subvolumes in the returned mask, and
+ *                           if the error is negative the request is retried
+ *                           as F_SETLKW through ec_dispatch_inc(). Unlock
+ *                           requests just call ec_fop_prepare_answer().
+ *  EC_STATE_REPORT          Calls the user callback with fop->answer.
+ *  -EC_STATE_INIT, -EC_STATE_DISPATCH, -EC_STATE_REPORT
+ *                           Call the user callback with op_ret = -1 and
+ *                           op_errno = fop->error.
+ *
+ * Any other state is logged as unhandled and ends the fop.
+ */
+int32_t
+ec_manager_lk(ec_fop_data_t *fop, int32_t state)
+{
+    ec_cbk_data_t *cbk;
+
+    switch (state) {
+        case EC_STATE_INIT:
+            if ((fop->int32 == F_SETLKW) && (fop->flock.l_type != F_UNLCK)) {
+                fop->uint32 = EC_LOCK_MODE_ALL;
+                fop->int32 = F_SETLK;
+            }
+
+            /* Fall through */
+
+        case EC_STATE_DISPATCH:
+            ec_dispatch_all(fop);
+
+            return EC_STATE_PREPARE_ANSWER;
+
+        case EC_STATE_PREPARE_ANSWER:
+        case -EC_STATE_PREPARE_ANSWER:
+            if (fop->flock.l_type != F_UNLCK) {
+                uintptr_t mask;
+
+                ec_fop_set_error(fop, ec_lock_check(fop, &mask));
+                if (fop->error != 0) {
+                    if (mask != 0) {
+                        struct gf_flock flock = {0};
+
+                        /* Same range, pid and owner as the failed request,
+                         * but as an unlock sent only to 'mask'. */
+                        flock.l_type = F_UNLCK;
+                        flock.l_whence = fop->flock.l_whence;
+                        flock.l_start = fop->flock.l_start;
+                        flock.l_len = fop->flock.l_len;
+                        flock.l_pid = fop->flock.l_pid;
+                        lk_owner_copy(&flock.l_owner, &fop->flock.l_owner);
+
+                        ec_lk(fop->frame, fop->xl, mask, 1, ec_lock_lk_unlocked,
+                              NULL, fop->fd, F_SETLK, &flock, fop->xdata);
+                    }
+
+                    if (fop->error < 0) {
+                        /* Negative error: retry as a blocking request. */
+                        fop->error = 0;
+
+                        fop->int32 = F_SETLKW;
+
+                        ec_dispatch_inc(fop);
+
+                        return EC_STATE_PREPARE_ANSWER;
+                    }
+                }
+            } else {
+                ec_fop_prepare_answer(fop, _gf_true);
+            }
+
+            return EC_STATE_REPORT;
+
+        case EC_STATE_REPORT:
+            cbk = fop->answer;
+
+            GF_ASSERT(cbk != NULL);
+
+            if (fop->cbks.lk != NULL) {
+                fop->cbks.lk(fop->req_frame, fop, fop->xl, cbk->op_ret,
+                             cbk->op_errno, &cbk->flock, cbk->xdata);
+            }
+
+            return EC_STATE_END;
+
+        case -EC_STATE_INIT:
+        case -EC_STATE_DISPATCH:
+        case -EC_STATE_REPORT:
+            GF_ASSERT(fop->error != 0);
+
+            if (fop->cbks.lk != NULL) {
+                fop->cbks.lk(fop->req_frame, fop, fop->xl, -1, fop->error, NULL,
+                             NULL);
+            }
+
+            return EC_STATE_END;
+
+        default:
+            gf_msg(fop->xl->name, GF_LOG_ERROR, EINVAL, EC_MSG_UNHANDLED_STATE,
+                   "Unhandled state %d for %s", state, ec_fop_name(fop->id));
+
+            return EC_STATE_END;
+    }
+}
+
+/*
+ * Entry point of the LK fop.
+ *
+ * Allocates the fop descriptor (with ec_wind_lk() as wind function and
+ * ec_manager_lk() as state machine), stores a reference to 'fd' and
+ * 'xdata' plus a copy of 'flock' in it and starts it with ec_manager().
+ * 'target', 'fop_flags' and 'data' are forwarded unchanged to
+ * ec_fop_data_allocate().
+ *
+ * Failure handling:
+ *  - if an argument fails validation or the descriptor cannot be allocated,
+ *    'func' is called directly with op_ret = -1 and op_errno = ENOMEM;
+ *  - if the descriptor exists but a later reference fails, ec_manager() is
+ *    started with error ENOMEM instead of 0.
+ */
+void
+ec_lk(call_frame_t *frame, xlator_t *this, uintptr_t target, uint32_t fop_flags,
+      fop_lk_cbk_t func, void *data, fd_t *fd, int32_t cmd,
+      struct gf_flock *flock, dict_t *xdata)
+{
+    ec_cbk_t callback = {.lk = func};
+    ec_fop_data_t *fop = NULL;
+    int32_t error = ENOMEM;
+
+    gf_msg_trace("ec", 0, "EC(LK) %p", frame);
+
+    /* 'this' is dereferenced by the checks below, so validate it first,
+     * as ec_inodelk() and ec_finodelk() do. */
+    VALIDATE_OR_GOTO(this, out);
+    GF_VALIDATE_OR_GOTO(this->name, frame, out);
+    GF_VALIDATE_OR_GOTO(this->name, this->private, out);
+
+    fop = ec_fop_data_allocate(frame, this, GF_FOP_LK, 0, target, fop_flags,
+                               ec_wind_lk, ec_manager_lk, callback, data);
+    if (fop == NULL) {
+        goto out;
+    }
+
+    fop->use_fd = 1;
+
+    fop->int32 = cmd;
+
+    /* From here on every failure jumps to 'out' with error == ENOMEM and a
+     * non-NULL fop, so it is reported through ec_manager(). */
+    if (fd != NULL) {
+        fop->fd = fd_ref(fd);
+        if (fop->fd == NULL) {
+            gf_msg(this->name, GF_LOG_ERROR, 0, EC_MSG_FILE_DESC_REF_FAIL,
+                   "Failed to reference a "
+                   "file descriptor.");
+
+            goto out;
+        }
+    }
+    /* Field-by-field copy; only l_owner.len bytes of owner data are taken. */
+    if (flock != NULL) {
+        fop->flock.l_type = flock->l_type;
+        fop->flock.l_whence = flock->l_whence;
+        fop->flock.l_start = flock->l_start;
+        fop->flock.l_len = flock->l_len;
+        fop->flock.l_pid = flock->l_pid;
+        fop->flock.l_owner.len = flock->l_owner.len;
+        if (flock->l_owner.len > 0) {
+            memcpy(fop->flock.l_owner.data, flock->l_owner.data,
+                   flock->l_owner.len);
+        }
+    }
+    if (xdata != NULL) {
+        fop->xdata = dict_ref(xdata);
+        if (fop->xdata == NULL) {
+            gf_msg(this->name, GF_LOG_ERROR, 0, EC_MSG_DICT_REF_FAIL,
+                   "Failed to reference a "
+                   "dictionary.");
+
+            goto out;
+        }
+    }
+
+    error = 0;
+
+out:
+    if (fop != NULL) {
+        ec_manager(fop, error);
+    } else {
+        func(frame, NULL, this, -1, error, NULL, NULL);
+    }
+}
diff --git a/xlators/cluster/ec/src/ec-mem-types.h b/xlators/cluster/ec/src/ec-mem-types.h
new file mode 100644
index 00000000000..3252c4c1c58
--- /dev/null
+++ b/xlators/cluster/ec/src/ec-mem-types.h
@@ -0,0 +1,30 @@
+/*
+  Copyright (c) 2012-2014 DataLab, s.l. <http://www.datalab.es>
+  This file is part of GlusterFS.
+
+  This file is licensed to you under your choice of the GNU Lesser
+  General Public License, version 3 or any later version (LGPLv3 or
+  later), or the GNU General Public License, version 2 (GPLv2), in all
+  cases as published by the Free Software Foundation.
+*/
+
+#ifndef __EC_MEM_TYPES_H__
+#define __EC_MEM_TYPES_H__
+
+#include <glusterfs/mem-types.h>
+
+/*
+ * Memory type identifiers private to the EC xlator, passed as the type
+ * argument of allocation calls (e.g. ec_mt_ec_matrix_t in the GF_MALLOC()
+ * of ec_method_setup()).
+ *
+ * Numbering starts right after gf_common_mt_end; ec_mt_end is the last
+ * enumerator of the list.
+ */
+enum gf_ec_mem_types_ {
+    ec_mt_ec_t = gf_common_mt_end + 1,
+    ec_mt_xlator_t,
+    ec_mt_ec_inode_t,
+    ec_mt_ec_fd_t,
+    ec_mt_subvol_healer_t,
+    ec_mt_ec_gf_t,
+    ec_mt_ec_code_t,
+    ec_mt_ec_code_builder_t,
+    ec_mt_ec_matrix_t,
+    ec_mt_ec_stripe_t,
+    ec_mt_end
+};
+
+#endif /* __EC_MEM_TYPES_H__ */
diff --git a/xlators/cluster/ec/src/ec-messages.h b/xlators/cluster/ec/src/ec-messages.h
new file mode 100644
index 00000000000..72e98f11286
--- /dev/null
+++ b/xlators/cluster/ec/src/ec-messages.h
@@ -0,0 +1,61 @@
+/*
+  Copyright (c) 2015 Red Hat, Inc. <http://www.redhat.com>
+  This file is part of GlusterFS.
+
+  This file is licensed to you under your choice of the GNU Lesser
+  General Public License, version 3 or any later version (LGPLv3 or
+  later), or the GNU General Public License, version 2 (GPLv2), in all
+  cases as published by the Free Software Foundation.
+*/
+
+#ifndef _EC_MESSAGES_H_
+#define _EC_MESSAGES_H_
+
+#include <glusterfs/glfs-message-id.h>
+
+/* To add new message IDs, append new identifiers at the end of the list.
+ *
+ * Never remove a message ID. If it's not used anymore, you can rename it or
+ * leave it as it is, but not delete it. This is to prevent reutilization of
+ * IDs by other messages.
+ *
+ * The component name must match one of the entries defined in
+ * glfs-message-id.h.
+ */
+
+/* Message IDs of the EC component. New identifiers go after EC_MSG_FD_BAD,
+ * which is currently the last entry of the list. */
+GLFS_MSGID(EC, EC_MSG_INVALID_CONFIG, EC_MSG_HEAL_FAIL,
+           EC_MSG_DICT_COMBINE_FAIL, EC_MSG_STIME_COMBINE_FAIL,
+           EC_MSG_INVALID_DICT_NUMS, EC_MSG_IATT_COMBINE_FAIL,
+           EC_MSG_INVALID_FORMAT, EC_MSG_DICT_GET_FAILED,
+           EC_MSG_UNHANDLED_STATE, EC_MSG_FILE_DESC_REF_FAIL,
+           EC_MSG_LOC_COPY_FAIL, EC_MSG_BUF_REF_FAIL, EC_MSG_DICT_REF_FAIL,
+           EC_MSG_LK_UNLOCK_FAILED, EC_MSG_UNLOCK_FAILED,
+           EC_MSG_LOC_PARENT_INODE_MISSING, EC_MSG_INVALID_LOC_NAME,
+           EC_MSG_NO_MEMORY, EC_MSG_GFID_MISMATCH, EC_MSG_UNSUPPORTED_VERSION,
+           EC_MSG_FD_CREATE_FAIL, EC_MSG_READDIRP_REQ_PREP_FAIL,
+           EC_MSG_LOOKUP_REQ_PREP_FAIL, EC_MSG_INODE_REF_FAIL,
+           EC_MSG_LOOKUP_READAHEAD_FAIL, EC_MSG_FRAME_MISMATCH,
+           EC_MSG_XLATOR_MISMATCH, EC_MSG_VECTOR_MISMATCH, EC_MSG_IATT_MISMATCH,
+           EC_MSG_FD_MISMATCH, EC_MSG_DICT_MISMATCH, EC_MSG_INDEX_DIR_GET_FAIL,
+           EC_MSG_PREOP_LOCK_FAILED, EC_MSG_CHILDS_INSUFFICIENT,
+           EC_MSG_OP_EXEC_UNAVAIL, EC_MSG_UNLOCK_DELAY_FAILED,
+           EC_MSG_SIZE_VERS_UPDATE_FAIL, EC_MSG_INVALID_REQUEST,
+           EC_MSG_INVALID_LOCK_TYPE, EC_MSG_SIZE_VERS_GET_FAIL,
+           EC_MSG_FILE_SIZE_GET_FAIL, EC_MSG_FOP_MISMATCH,
+           EC_MSG_SUBVOL_ID_DICT_SET_FAIL, EC_MSG_SUBVOL_BUILD_FAIL,
+           EC_MSG_XLATOR_INIT_FAIL, EC_MSG_NO_PARENTS, EC_MSG_TIMER_CREATE_FAIL,
+           EC_MSG_TOO_MANY_SUBVOLS, EC_MSG_DATA_UNAVAILABLE,
+           EC_MSG_INODE_REMOVE_FAIL, EC_MSG_INVALID_REDUNDANCY,
+           EC_MSG_XLATOR_PARSE_OPT_FAIL, EC_MSG_OP_FAIL_ON_SUBVOLS,
+           EC_MSG_INVALID_INODE, EC_MSG_LOCK_MISMATCH, EC_MSG_XDATA_MISMATCH,
+           EC_MSG_HEALING_INFO, EC_MSG_HEAL_SUCCESS, EC_MSG_FULL_SWEEP_START,
+           EC_MSG_FULL_SWEEP_STOP, EC_MSG_INVALID_FOP, EC_MSG_EC_UP,
+           EC_MSG_EC_DOWN, EC_MSG_SIZE_XATTR_GET_FAIL,
+           EC_MSG_VER_XATTR_GET_FAIL, EC_MSG_CONFIG_XATTR_GET_FAIL,
+           EC_MSG_CONFIG_XATTR_INVALID, EC_MSG_EXTENSION, EC_MSG_EXTENSION_NONE,
+           EC_MSG_EXTENSION_UNKNOWN, EC_MSG_EXTENSION_UNSUPPORTED,
+           EC_MSG_EXTENSION_FAILED, EC_MSG_NO_GF, EC_MSG_MATRIX_FAILED,
+           EC_MSG_DYN_CREATE_FAILED, EC_MSG_DYN_CODEGEN_FAILED,
+           EC_MSG_THREAD_CLEANUP_FAILED, EC_MSG_FD_BAD);
+
+#endif /* !_EC_MESSAGES_H_ */
diff --git a/xlators/cluster/ec/src/ec-method.c b/xlators/cluster/ec/src/ec-method.c
new file mode 100644
index 00000000000..55faed0b193
--- /dev/null
+++ b/xlators/cluster/ec/src/ec-method.c
@@ -0,0 +1,433 @@
+/*
+  Copyright (c) 2012-2015 DataLab, s.l. <http://www.datalab.es>
+  This file is part of GlusterFS.
+
+  This file is licensed to you under your choice of the GNU Lesser
+  General Public License, version 3 or any later version (LGPLv3 or
+  later), or the GNU General Public License, version 2 (GPLv2), in all
+  cases as published by the Free Software Foundation.
+*/
+
+#include <string.h>
+#include <inttypes.h>
+
+#include "ec-types.h"
+#include "ec-mem-types.h"
+#include "ec-galois.h"
+#include "ec-code.h"
+#include "ec-method.h"
+#include "ec-helpers.h"
+
+/*
+ * Fills 'count' consecutive rows of 'columns' entries each at 'matrix'.
+ *
+ * Row i is generated from v = values[i]: its first entry is
+ * ec_gf_exp(gf, v, columns - 1) and every following entry is the previous
+ * one divided by v with ec_gf_div().
+ */
+static void
+ec_method_matrix_normal(ec_gf_t *gf, uint32_t *matrix, uint32_t columns,
+                        uint32_t *values, uint32_t count)
+{
+    uint32_t row, col, base, term;
+    uint32_t degree = columns - 1;
+
+    for (row = 0; row < count; row++) {
+        base = values[row];
+
+        /* Leading entry of the row. */
+        term = ec_gf_exp(gf, base, degree);
+        *matrix++ = term;
+
+        /* Remaining 'degree' entries, each derived from the previous one. */
+        for (col = 0; col < degree; col++) {
+            term = ec_gf_div(gf, term, base);
+            *matrix++ = term;
+        }
+    }
+}
+
+/*
+ * Fills the count x count matrix at 'matrix' from the 'count' entries of
+ * 'values', producing one column per value.
+ *
+ * NOTE(review): judging by the name and by its use in
+ * ec_method_matrix_init(), this is presumably the inverse of the matrix
+ * that ec_method_matrix_normal() builds for the same values -- confirm
+ * against the algorithm description before relying on it.
+ *
+ * NOTE(review): with count == 1 the first store of the column loop lands
+ * at matrix[1]; callers presumably guarantee count >= 2 -- verify.
+ */
+static void
+ec_method_matrix_inverse(ec_gf_t *gf, uint32_t *matrix, uint32_t *values,
+                         uint32_t count)
+{
+    uint32_t a[count];
+    uint32_t i, j, p, last, tmp;
+
+    /* Build the scratch vector a[] from all the values before any output
+     * is written. */
+    last = count - 1;
+    for (i = 0; i < last; i++) {
+        a[i] = 1;
+    }
+    a[i] = values[0];
+    for (i = last; i > 0; i--) {
+        for (j = i - 1; j < last; j++) {
+            a[j] = a[j + 1] ^ ec_gf_mul(gf, values[i], a[j]);
+        }
+        a[j] = ec_gf_mul(gf, values[i], a[j]);
+    }
+    /* One output column per value: walk down the column writing rows
+     * 1..count-1 while accumulating 'p', walk back up dividing every
+     * written entry by 'p', then store ec_gf_div(gf, 1, p) in row 0. */
+    for (i = 0; i < count; i++) {
+        p = a[0];
+        matrix += count;
+        *matrix = tmp = p ^ values[i];
+        for (j = 1; j < last; j++) {
+            matrix += count;
+            *matrix = tmp = a[j] ^ ec_gf_mul(gf, values[i], tmp);
+            p = tmp ^ ec_gf_mul(gf, values[i], p);
+        }
+        for (j = 0; j < last; j++) {
+            *matrix = ec_gf_div(gf, *matrix, p);
+            matrix -= count;
+        }
+        *matrix = ec_gf_div(gf, 1, p);
+        /* Back at row 0: advance to the next column. */
+        matrix++;
+    }
+}
+
+/*
+ * Initializes 'matrix' for the subvolume set 'mask' with a single
+ * reference and an empty LRU link, taking code and column count from
+ * 'list'.
+ *
+ * When 'inverse' is true the matrix has list->columns rows, its values are
+ * computed with ec_method_matrix_inverse() and every row gets an
+ * interleaved function from ec_code_build_interleaved(). Otherwise it has
+ * list->rows rows, its values come from ec_method_matrix_normal() and
+ * every row gets a linear function from ec_code_build_linear().
+ *
+ * 'rows' holds the values the matrix is generated from. matrix->values
+ * must already point to the storage for the coefficients.
+ */
+static void
+ec_method_matrix_init(ec_matrix_list_t *list, ec_matrix_t *matrix,
+                      uintptr_t mask, uint32_t *rows, gf_boolean_t inverse)
+{
+    uint32_t idx;
+
+    matrix->refs = 1;
+    matrix->mask = mask;
+    matrix->code = list->code;
+    matrix->columns = list->columns;
+    INIT_LIST_HEAD(&matrix->lru);
+
+    /* Compute all the coefficients first. */
+    if (inverse) {
+        matrix->rows = list->columns;
+        ec_method_matrix_inverse(matrix->code->gf, matrix->values, rows,
+                                 matrix->rows);
+    } else {
+        matrix->rows = list->rows;
+        ec_method_matrix_normal(matrix->code->gf, matrix->values,
+                                matrix->columns, rows, matrix->rows);
+    }
+
+    /* Then attach to every row its slice of coefficients and the function
+     * generated for it. */
+    for (idx = 0; idx < matrix->rows; idx++) {
+        matrix->row_data[idx].values = matrix->values + idx * matrix->columns;
+        if (inverse) {
+            matrix->row_data[idx].func.interleaved = ec_code_build_interleaved(
+                matrix->code, EC_METHOD_WORD_SIZE,
+                matrix->row_data[idx].values, matrix->columns);
+        } else {
+            matrix->row_data[idx].func.linear = ec_code_build_linear(
+                matrix->code, EC_METHOD_WORD_SIZE,
+                matrix->row_data[idx].values, matrix->columns);
+        }
+    }
+}
+
+/*
+ * Releases the function attached to every row of 'matrix' with
+ * ec_code_release() and clears the row's function pointer. Rows whose
+ * func.linear is already NULL are skipped.
+ */
+static void
+ec_method_matrix_release(ec_matrix_t *matrix)
+{
+    uint32_t row;
+
+    for (row = 0; row < matrix->rows; row++) {
+        if (matrix->row_data[row].func.linear == NULL) {
+            continue;
+        }
+
+        ec_code_release(matrix->code, &matrix->row_data[row].func);
+        matrix->row_data[row].func.linear = NULL;
+    }
+}
+
+/*
+ * Destroys 'matrix': unlinks it from the LRU list, releases the functions
+ * of its rows, returns its memory with mem_put() and decrements
+ * list->count.
+ *
+ * This function does not touch list->objects.
+ */
+static void
+ec_method_matrix_destroy(ec_matrix_list_t *list, ec_matrix_t *matrix)
+{
+    list_del_init(&matrix->lru);
+
+    ec_method_matrix_release(matrix);
+
+    mem_put(matrix);
+
+    list->count--;
+}
+
+/*
+ * Drops one reference of 'matrix'.
+ *
+ * When the last reference goes away the matrix is appended to the tail of
+ * the list's LRU. If, at that point, list->count is greater than
+ * list->max, the matrix at the head of the LRU is destroyed.
+ */
+static void
+ec_method_matrix_unref(ec_matrix_list_t *list, ec_matrix_t *matrix)
+{
+    ec_matrix_t *victim;
+
+    matrix->refs--;
+    if (matrix->refs != 0) {
+        /* Still in use by someone else. */
+        return;
+    }
+
+    list_add_tail(&matrix->lru, &list->lru);
+    if (list->count <= list->max) {
+        return;
+    }
+
+    victim = list_first_entry(&list->lru, ec_matrix_t, lru);
+    ec_method_matrix_destroy(list, victim);
+}
+
+/*
+ * Binary search of 'mask' in list->objects, which is kept ordered by
+ * increasing mask (see ec_method_matrix_insert()).
+ *
+ * Returns the matrix whose mask equals 'mask' and stores its index in
+ * '*pos'. If there is none, returns NULL and stores in '*pos' the index at
+ * which such a matrix would have to be inserted to keep the order.
+ */
+static ec_matrix_t *
+ec_method_matrix_lookup(ec_matrix_list_t *list, uintptr_t mask, uint32_t *pos)
+{
+    ec_matrix_t *candidate;
+    uint32_t low = 0;
+    uint32_t high = list->count;
+    uint32_t mid;
+
+    while (low < high) {
+        mid = (low + high) >> 1;
+        candidate = list->objects[mid];
+        if (candidate->mask < mask) {
+            low = mid + 1;
+        } else if (candidate->mask > mask) {
+            high = mid;
+        } else {
+            *pos = mid;
+
+            return candidate;
+        }
+    }
+
+    *pos = low;
+
+    return NULL;
+}
+
+/*
+ * Removes the entry for 'mask' from list->objects, if there is one,
+ * shifting the following entries down and decrementing list->count.
+ * The matrix itself is neither released nor freed.
+ */
+static void
+ec_method_matrix_remove(ec_matrix_list_t *list, uintptr_t mask)
+{
+    uint32_t idx;
+
+    if (ec_method_matrix_lookup(list, mask, &idx) == NULL) {
+        return;
+    }
+
+    list->count--;
+    if (idx < list->count) {
+        /* Close the gap left by the removed entry. */
+        memmove(&list->objects[idx], &list->objects[idx + 1],
+                (list->count - idx) * sizeof(ec_matrix_t *));
+    }
+}
+
+/*
+ * Inserts 'matrix' into list->objects at the position that keeps the
+ * array ordered by increasing mask, and increments list->count.
+ *
+ * A matrix with the same mask must not be present already (asserted).
+ * The caller must make sure there is room for one more entry; no bound is
+ * checked here.
+ */
+static void
+ec_method_matrix_insert(ec_matrix_list_t *list, ec_matrix_t *matrix)
+{
+    ec_matrix_t *found;
+    uint32_t pos;
+
+    /* The lookup is what computes 'pos', so it must run unconditionally
+     * and not only as a side effect of the assertion expression. */
+    found = ec_method_matrix_lookup(list, matrix->mask, &pos);
+    GF_ASSERT(found == NULL);
+
+    if (pos < list->count) {
+        /* Open a gap at 'pos' by shifting the tail of the array up. */
+        memmove(list->objects + pos + 1, list->objects + pos,
+                sizeof(ec_matrix_t *) * (list->count - pos));
+    }
+    list->objects[pos] = matrix;
+    list->count++;
+}
+
+/*
+ * Returns a referenced inverse matrix for the subvolume set 'mask',
+ * building it from 'rows' when it is not cached. The whole operation runs
+ * under list->lock.
+ *
+ *  - If a matrix for 'mask' is cached, it is taken off the LRU list and
+ *    gets one more reference.
+ *  - Otherwise, if the cache is full (count >= max) and some matrix is
+ *    sitting unused in the LRU, the one at the head of the LRU is recycled.
+ *  - Otherwise a new matrix is taken from list->pool.
+ *
+ * The resulting matrix is (re)initialized with ec_method_matrix_init() and
+ * is added to the cache only if there is room for it.
+ *
+ * On allocation failure the value EC_ERR(ENOMEM) is returned instead of a
+ * matrix. A successful result is given back with ec_method_matrix_put().
+ */
+static ec_matrix_t *
+ec_method_matrix_get(ec_matrix_list_t *list, uintptr_t mask, uint32_t *rows)
+{
+    ec_matrix_t *matrix;
+    uint32_t pos;
+
+    LOCK(&list->lock);
+
+    matrix = ec_method_matrix_lookup(list, mask, &pos);
+    if (matrix != NULL) {
+        /* Cache hit: it is in use again, so it must leave the LRU. */
+        list_del_init(&matrix->lru);
+        matrix->refs++;
+
+        goto out;
+    }
+
+    if ((list->count >= list->max) && !list_empty(&list->lru)) {
+        /* Recycle the unused matrix at the head of the LRU. */
+        matrix = list_first_entry(&list->lru, ec_matrix_t, lru);
+        list_del_init(&matrix->lru);
+
+        ec_method_matrix_remove(list, matrix->mask);
+
+        ec_method_matrix_release(matrix);
+    } else {
+        matrix = mem_get0(list->pool);
+        if (matrix == NULL) {
+            matrix = EC_ERR(ENOMEM);
+            goto out;
+        }
+        /* The coefficients live in the same allocation, right after the
+         * matrix header and list->columns row descriptors. */
+        matrix->values = (uint32_t *)((uintptr_t)matrix + sizeof(ec_matrix_t) +
+                                      sizeof(ec_matrix_row_t) * list->columns);
+    }
+
+    ec_method_matrix_init(list, matrix, mask, rows, _gf_true);
+
+    if (list->count < list->max) {
+        ec_method_matrix_insert(list, matrix);
+    } else {
+        /* No room in the cache: the matrix is not inserted.
+         * NOTE(review): the mask is presumably cleared to mark it as not
+         * cached -- confirm how mask == 0 is treated elsewhere. */
+        matrix->mask = 0;
+    }
+
+out:
+    UNLOCK(&list->lock);
+
+    return matrix;
+}
+/* Drop a reference obtained from ec_method_matrix_get(), under list->lock. */
+static void
+ec_method_matrix_put(ec_matrix_list_t *list, ec_matrix_t *matrix)
+{
+    LOCK(&list->lock);
+
+    ec_method_matrix_unref(list, matrix);
+
+    UNLOCK(&list->lock);
+}
+/* Create the code generator and the encoding matrix stored in list->encode. */
+static int32_t
+ec_method_setup(xlator_t *xl, ec_matrix_list_t *list, const char *gen)
+{
+    ec_matrix_t *matrix;
+    uint32_t values[list->rows];
+    uint32_t i;
+    int32_t err;
+    /* One block: header, 'rows' row descriptors, rows x columns values. */
+    matrix = GF_MALLOC(sizeof(ec_matrix_t) +
+                           sizeof(ec_matrix_row_t) * list->rows +
+                           sizeof(uint32_t) * list->columns * list->rows,
+                       ec_mt_ec_matrix_t);
+    if (matrix == NULL) {
+        err = -ENOMEM;
+        goto failed;
+    }
+    memset(matrix, 0, sizeof(ec_matrix_t));
+    matrix->values = (uint32_t *)((uintptr_t)matrix + sizeof(ec_matrix_t) +
+                                  sizeof(ec_matrix_row_t) * list->rows);
+
+    list->code = ec_code_create(list->gf, ec_code_detect(xl, gen));
+    if (EC_IS_ERR(list->code)) {
+        err = EC_GET_ERR(list->code);
+        list->code = NULL;
+        goto failed_matrix;
+    }
+    /* Row i of the encoding matrix gets value i + 1; no mask is used (0). */
+    for (i = 0; i < list->rows; i++) {
+        values[i] = i + 1;
+    }
+    ec_method_matrix_init(list, matrix, 0, values, _gf_false);
+
+    list->encode = matrix;
+
+    return 0;
+
+failed_matrix:
+    GF_FREE(matrix);
+failed:
+    return err;
+}
+/* Initialize 'list': pool, lookup array, Galois field and encoding matrix. */
+int32_t
+ec_method_init(xlator_t *xl, ec_matrix_list_t *list, uint32_t columns,
+               uint32_t rows, uint32_t max, const char *gen)
+{
+    list->columns = columns;
+    list->rows = rows;
+    list->max = max;
+    list->stripe = EC_METHOD_CHUNK_SIZE * list->columns;
+    INIT_LIST_HEAD(&list->lru);
+    int32_t err;
+    /* Each pooled matrix has room for 'columns' rows of 'columns' values. */
+    list->pool = mem_pool_new_fn(xl->ctx,
+                                 sizeof(ec_matrix_t) +
+                                     sizeof(ec_matrix_row_t) * columns +
+                                     sizeof(uint32_t) * columns * columns,
+                                 128, "ec_matrix_t");
+    if (list->pool == NULL) {
+        err = -ENOMEM;
+        goto failed;
+    }
+    /* Array of up to 'max' cached matrices, searched by mask. */
+    list->objects = GF_MALLOC(sizeof(ec_matrix_t *) * max, ec_mt_ec_matrix_t);
+    if (list->objects == NULL) {
+        err = -ENOMEM;
+        goto failed_pool;
+    }
+
+    list->gf = ec_gf_prepare(EC_GF_BITS, EC_GF_MOD);
+    if (EC_IS_ERR(list->gf)) {
+        err = EC_GET_ERR(list->gf);
+        goto failed_objects;
+    }
+
+    err = ec_method_setup(xl, list, gen);
+    if (err != 0) {
+        goto failed_gf;
+    }
+    /* The lock is initialized last, once nothing else can fail. */
+    LOCK_INIT(&list->lock);
+    /* NOTE(review): list->count is never set here; assumes a zeroed 'list'. */
+    return 0;
+    /* Error unwinding: release in reverse order of acquisition. */
+failed_gf:
+    ec_gf_destroy(list->gf);
+failed_objects:
+    GF_FREE(list->objects);
+failed_pool:
+    mem_pool_destroy(list->pool);
+failed:
+    list->pool = NULL;
+    list->objects = NULL;
+    list->gf = NULL;
+
+    return err;
+}
+/* Release everything created by ec_method_init(); no-op if never set up. */
+void
+ec_method_fini(ec_matrix_list_t *list)
+{
+    ec_matrix_t *matrix;
+    /* list->encode is set by ec_method_setup() when it succeeds. */
+    if (list->encode == NULL) {
+        return;
+    }
+    /* Destroy every matrix still parked in the LRU list. */
+    while (!list_empty(&list->lru)) {
+        matrix = list_first_entry(&list->lru, ec_matrix_t, lru);
+        ec_method_matrix_destroy(list, matrix);
+    }
+
+    GF_ASSERT(list->count == 0);
+
+    if (list->pool) /*Init was successful*/
+        LOCK_DESTROY(&list->lock);
+    /* The encoding matrix was GF_MALLOC'ed, not taken from the pool. */
+    ec_method_matrix_release(list->encode);
+    GF_FREE(list->encode);
+
+    ec_code_destroy(list->code);
+    ec_gf_destroy(list->gf);
+    GF_FREE(list->objects);
+
+    if (list->pool)
+        mem_pool_destroy(list->pool);
+}
+/* Currently a stub: ignores its arguments and reports success (see TODO). */
+int32_t
+ec_method_update(xlator_t *xl, ec_matrix_list_t *list, const char *gen)
+{
+    /* TODO: Allow changing code generator */
+
+    return 0;
+}
+/* Encode 'in' into the out[] buffers; each out[i] pointer is advanced. */
+void
+ec_method_encode(ec_matrix_list_t *list, uint64_t size, void *in, void **out)
+{
+    ec_matrix_t *encoder = list->encode;
+    ec_matrix_row_t *row;
+    uint64_t offset;
+    uint32_t frag;
+
+    for (offset = 0; offset < size; offset += list->stripe) {
+        for (frag = 0; frag < encoder->rows; frag++) {
+            row = &encoder->row_data[frag];
+            row->func.linear(out[frag], in, offset, row->values, list->columns);
+            out[frag] += EC_METHOD_CHUNK_SIZE;
+        }
+    }
+}
+
+int32_t
+ec_method_decode(ec_matrix_list_t *list, uint64_t size, uintptr_t mask,
+                 uint32_t *rows, void **in, void *out)
+{
+    ec_matrix_t *matrix;
+    uint64_t pos;
+    uint32_t i;
+
+    matrix = ec_method_matrix_get(list, mask, rows);
+    if (EC_IS_ERR(matrix)) {
+        return EC_GET_ERR(matrix);
+    }
+    for (pos = 0; pos < size; pos += EC_METHOD_CHUNK_SIZE) {
+        for (i = 0; i < matrix->rows; i++) {
+            matrix->row_data[i].func.interleaved(
+                out, in, pos, matrix->row_data[i].values, list->columns);
+            out += EC_METHOD_CHUNK_SIZE;
+        }
+    }
+
+    ec_method_matrix_put(list, matrix);
+
+    return 0;
+}
diff --git a/xlators/cluster/ec/src/ec-method.h b/xlators/cluster/ec/src/ec-method.h
new file mode 100644
index 00000000000..f91233b2f88
--- /dev/null
+++ b/xlators/cluster/ec/src/ec-method.h
@@ -0,0 +1,48 @@
+/*
+  Copyright (c) 2012-2015 DataLab, s.l. <http://www.datalab.es>
+  This file is part of GlusterFS.
+
+  This file is licensed to you under your choice of the GNU Lesser
+  General Public License, version 3 or any later version (LGPLv3 or
+  later), or the GNU General Public License, version 2 (GPLv2), in all
+  cases as published by the Free Software Foundation.
+*/
+
+#ifndef __EC_METHOD_H__
+#define __EC_METHOD_H__
+
+#include "ec-types.h"
+#include "ec-galois.h"
+/* Galois field parameters passed to ec_gf_prepare() by ec_method_init(). */
+#define EC_GF_BITS 8
+#define EC_GF_MOD 0x11D
+/* Number of elements of the field (2^EC_GF_BITS = 256). */
+#define EC_GF_SIZE (1 << EC_GF_BITS)
+
+/* Determines the maximum size of the matrix used to encode/decode data */
+#define EC_METHOD_MAX_FRAGMENTS 16
+/* Determines the maximum number of usable elements in the Galois Field */
+#define EC_METHOD_MAX_NODES (EC_GF_SIZE - 1)
+
+#define EC_METHOD_WORD_SIZE 64
+/* Bytes by which the coding loops advance their buffers (64 * 8 = 512). */
+#define EC_METHOD_CHUNK_SIZE (EC_METHOD_WORD_SIZE * EC_GF_BITS)
+/* Prepare 'list' (pool, matrix cache of 'max' entries, encoding matrix). */
+int32_t
+ec_method_init(xlator_t *xl, ec_matrix_list_t *list, uint32_t columns,
+               uint32_t rows, uint32_t max, const char *gen);
+/* Tear down a list prepared by ec_method_init(). */
+void
+ec_method_fini(ec_matrix_list_t *list);
+/* Placeholder for changing the code generator; currently returns 0. */
+int32_t
+ec_method_update(xlator_t *xl, ec_matrix_list_t *list, const char *gen);
+/* Encode 'in' into out[]; note that every out[i] pointer is advanced. */
+void
+ec_method_encode(ec_matrix_list_t *list, uint64_t size, void *in, void **out);
+/* Decode 'in' into 'out' with the cached matrix for 'mask'; 0 or error. */
+int32_t
+ec_method_decode(ec_matrix_list_t *list, uint64_t size, uintptr_t mask,
+                 uint32_t *rows, void **in, void *out);
+
+#endif /* __EC_METHOD_H__ */
diff --git a/xlators/cluster/ec/src/ec-types.h b/xlators/cluster/ec/src/ec-types.h
new file mode 100644
index 00000000000..de9b89bb2c9
--- /dev/null
+++ b/xlators/cluster/ec/src/ec-types.h
@@ -0,0 +1,690 @@
+/*
+  Copyright (c) 2015 DataLab, s.l. <http://www.datalab.es>
+  This file is part of GlusterFS.
+
+  This file is licensed to you under your choice of the GNU Lesser
+  General Public License, version 3 or any later version (LGPLv3 or
+  later), or the GNU General Public License, version 2 (GPLv2), in all
+  cases as published by the Free Software Foundation.
+*/
+
+#ifndef __EC_TYPES_H__
+#define __EC_TYPES_H__
+
+#include <glusterfs/xlator.h>
+#include <glusterfs/timer.h>
+#include "libxlator.h"
+#include <glusterfs/atomic.h>
+/* Size of the register maps in ec_gf_mul_t and ec_code_builder_t. */
+#define EC_GF_MAX_REGS 16
+
+enum _ec_heal_need;
+typedef enum _ec_heal_need ec_heal_need_t;
+
+enum _ec_stripe_part;
+typedef enum _ec_stripe_part ec_stripe_part_t;
+
+enum _ec_read_policy;
+typedef enum _ec_read_policy ec_read_policy_t;
+
+struct _ec_config;
+typedef struct _ec_config ec_config_t;
+
+struct _ec_fd;
+typedef struct _ec_fd ec_fd_t;
+
+struct _ec_fragment_range;
+typedef struct _ec_fragment_range ec_fragment_range_t;
+
+struct _ec_inode;
+typedef struct _ec_inode ec_inode_t;
+
+union _ec_cbk;
+typedef union _ec_cbk ec_cbk_t;
+
+struct _ec_lock;
+typedef struct _ec_lock ec_lock_t;
+
+struct _ec_lock_link;
+typedef struct _ec_lock_link ec_lock_link_t;
+
+struct _ec_fop_data;
+typedef struct _ec_fop_data ec_fop_data_t;
+
+struct _ec_cbk_data;
+typedef struct _ec_cbk_data ec_cbk_data_t;
+
+enum _ec_gf_opcode;
+typedef enum _ec_gf_opcode ec_gf_opcode_t;
+
+struct _ec_gf_op;
+typedef struct _ec_gf_op ec_gf_op_t;
+
+struct _ec_gf_mul;
+typedef struct _ec_gf_mul ec_gf_mul_t;
+
+struct _ec_gf;
+typedef struct _ec_gf ec_gf_t;
+
+struct _ec_code_gen;
+typedef struct _ec_code_gen ec_code_gen_t;
+
+struct _ec_code;
+typedef struct _ec_code ec_code_t;
+
+struct _ec_code_arg;
+typedef struct _ec_code_arg ec_code_arg_t;
+
+struct _ec_code_op;
+typedef struct _ec_code_op ec_code_op_t;
+
+struct _ec_code_builder;
+typedef struct _ec_code_builder ec_code_builder_t;
+
+struct _ec_code_chunk;
+typedef struct _ec_code_chunk ec_code_chunk_t;
+
+struct _ec_stripe;
+typedef struct _ec_stripe ec_stripe_t;
+
+struct _ec_stripe_list;
+typedef struct _ec_stripe_list ec_stripe_list_t;
+
+struct _ec_code_space;
+typedef struct _ec_code_space ec_code_space_t;
+/* Coding routines: 'linear' is used to encode, 'interleaved' to decode. */
+typedef void (*ec_code_func_linear_t)(void *dst, void *src, uint64_t offset,
+                                      uint32_t *values, uint32_t count);
+/* Same arguments, except that 'src' is an array of source buffers. */
+typedef void (*ec_code_func_interleaved_t)(void *dst, void **src,
+                                           uint64_t offset, uint32_t *values,
+                                           uint32_t count);
+
+union _ec_code_func;
+typedef union _ec_code_func ec_code_func_t;
+
+struct _ec_matrix_row;
+typedef struct _ec_matrix_row ec_matrix_row_t;
+
+struct _ec_matrix;
+typedef struct _ec_matrix ec_matrix_t;
+
+struct _ec_matrix_list;
+typedef struct _ec_matrix_list ec_matrix_list_t;
+
+struct _ec_heal;
+typedef struct _ec_heal ec_heal_t;
+
+struct _ec_self_heald;
+typedef struct _ec_self_heald ec_self_heald_t;
+
+struct _ec_statistics;
+typedef struct _ec_statistics ec_statistics_t;
+
+struct _ec;
+typedef struct _ec ec_t;
+/* Per-fop hooks stored in ec_fop_data_t (wind, handler and resume). */
+typedef void (*ec_wind_f)(ec_t *, ec_fop_data_t *, int32_t);
+typedef int32_t (*ec_handler_f)(ec_fop_data_t *, int32_t);
+typedef void (*ec_resume_f)(ec_fop_data_t *, int32_t);
+/* Read policies; EC_READ_POLICY_MAX is the NULL slot of ec_read_policies. */
+enum _ec_read_policy { EC_ROUND_ROBIN, EC_GFID_HASH, EC_READ_POLICY_MAX };
+/* Heal need levels - presumably graded from "none" to "must"; confirm. */
+enum _ec_heal_need {
+    EC_HEAL_NONEED,
+    EC_HEAL_MAYBE,
+    EC_HEAL_MUST,
+    EC_HEAL_PURGE_INDEX
+};
+/* Selects the head or the tail part of a stripe - TODO confirm usage. */
+enum _ec_stripe_part { EC_STRIPE_HEAD, EC_STRIPE_TAIL };
+
+/* Enumerations to indicate FD status (element type of ec_fd_t.fd_status). */
+typedef enum { EC_FD_NOT_OPENED, EC_FD_OPENED, EC_FD_OPENING } ec_fd_status_t;
+/* Coding configuration kept per inode in ec_inode_t.config. */
+struct _ec_config {
+    uint32_t version;
+    uint8_t algorithm;
+    uint8_t gf_word_size;
+    uint8_t bricks;
+    uint8_t redundancy;
+    uint32_t chunk_size;
+};
+/* Per-fd context. fd_status[] is a zero-length trailing array (GNU C). */
+struct _ec_fd {
+    loc_t loc;
+    uintptr_t open;
+    int32_t flags;
+    uint64_t bad_version;
+    ec_fd_status_t fd_status[0];
+};
+/* One cached stripe; 'data' is a flexible array holding its contents. */
+struct _ec_stripe {
+    struct list_head lru; /* LRU list member */
+    uint64_t frag_offset; /* Fragment offset of this stripe */
+    char data[];          /* Contents of the stripe */
+};
+/* Stripe cache of an inode (ec_inode_t.stripe_cache): LRU head and counts. */
+struct _ec_stripe_list {
+    struct list_head lru;
+    uint32_t count;
+    uint32_t max;
+};
+/* Per-inode context (lock, cached version/size/config, stripe cache). */
+struct _ec_inode {
+    ec_lock_t *inode_lock;
+    gf_boolean_t have_info;
+    gf_boolean_t have_config;
+    gf_boolean_t have_version;
+    gf_boolean_t have_size;
+    int32_t heal_count;
+    ec_config_t config;
+    uint64_t pre_version[2];
+    uint64_t post_version[2];
+    uint64_t pre_size;
+    uint64_t post_size;
+    uint64_t dirty[2];
+    struct list_head heal;
+    ec_stripe_list_t stripe_cache;
+    uint64_t bad_version;
+};
+/* Callback signatures for the 'heal' and 'fheal' members of ec_cbk_t. */
+typedef int32_t (*fop_heal_cbk_t)(call_frame_t *, void *, xlator_t *, int32_t,
+                                  int32_t, uintptr_t, uintptr_t, uintptr_t,
+                                  uint32_t, dict_t *);
+typedef int32_t (*fop_fheal_cbk_t)(call_frame_t *, void *, xlator_t *, int32_t,
+                                   int32_t, uintptr_t, uintptr_t, uintptr_t,
+                                   uint32_t, dict_t *);
+/* Callback of a fop, typed per operation (see ec_fop_data_t.cbks). */
+union _ec_cbk {
+    fop_access_cbk_t access;
+    fop_create_cbk_t create;
+    fop_discard_cbk_t discard;
+    fop_entrylk_cbk_t entrylk;
+    fop_fentrylk_cbk_t fentrylk;
+    fop_fallocate_cbk_t fallocate;
+    fop_flush_cbk_t flush;
+    fop_fsync_cbk_t fsync;
+    fop_fsyncdir_cbk_t fsyncdir;
+    fop_getxattr_cbk_t getxattr;
+    fop_fgetxattr_cbk_t fgetxattr;
+    fop_heal_cbk_t heal;
+    fop_fheal_cbk_t fheal;
+    fop_inodelk_cbk_t inodelk;
+    fop_finodelk_cbk_t finodelk;
+    fop_link_cbk_t link;
+    fop_lk_cbk_t lk;
+    fop_lookup_cbk_t lookup;
+    fop_mkdir_cbk_t mkdir;
+    fop_mknod_cbk_t mknod;
+    fop_open_cbk_t open;
+    fop_opendir_cbk_t opendir;
+    fop_readdir_cbk_t readdir;
+    fop_readdirp_cbk_t readdirp;
+    fop_readlink_cbk_t readlink;
+    fop_readv_cbk_t readv;
+    fop_removexattr_cbk_t removexattr;
+    fop_fremovexattr_cbk_t fremovexattr;
+    fop_rename_cbk_t rename;
+    fop_rmdir_cbk_t rmdir;
+    fop_setattr_cbk_t setattr;
+    fop_fsetattr_cbk_t fsetattr;
+    fop_setxattr_cbk_t setxattr;
+    fop_fsetxattr_cbk_t fsetxattr;
+    fop_stat_cbk_t stat;
+    fop_fstat_cbk_t fstat;
+    fop_statfs_cbk_t statfs;
+    fop_symlink_cbk_t symlink;
+    fop_truncate_cbk_t truncate;
+    fop_ftruncate_cbk_t ftruncate;
+    fop_unlink_cbk_t unlink;
+    fop_writev_cbk_t writev;
+    fop_xattrop_cbk_t xattrop;
+    fop_fxattrop_cbk_t fxattrop;
+    fop_zerofill_cbk_t zerofill;
+    fop_seek_cbk_t seek;
+    fop_ipc_cbk_t ipc;
+};
+/* Lock state referenced by ec_inode_t.inode_lock and by ec_lock_link_t. */
+struct _ec_lock {
+    ec_inode_t *ctx;
+    gf_timer_t *timer;
+
+    /* List of owners of this lock. All fops added to this list are running
+     * concurrently. */
+    struct list_head owners;
+
+    /* List of fops waiting to be an owner of the lock. Fops are added to this
+     * list when the current owner has an incompatible access (conflicting lock)
+     * or the lock is not acquired yet. */
+    struct list_head waiting;
+
+    /* List of fops that will wait until the next unlock/lock cycle. This
+     * happens when the currently acquired lock is decided to be released as
+     * soon as possible. In this case, all frozen fops will be continued only
+     * after the lock is reacquired. */
+    struct list_head frozen;
+
+    uintptr_t mask;
+    uintptr_t good_mask;
+    uintptr_t healing;
+    uint32_t refs_owners;   /* Refs for fops owning the lock */
+    uint32_t refs_pending;  /* Refs assigned to fops being prepared */
+    uint32_t waiting_flags; /*Track xattrop/dirty marking*/
+    gf_boolean_t acquired;
+    gf_boolean_t contention;
+    gf_boolean_t unlock_now;
+    gf_boolean_t release;
+    gf_boolean_t query;
+    fd_t *fd;
+    loc_t loc;
+    union {
+        entrylk_type type;
+        struct gf_flock flock;
+    };
+};
+/* Links a fop to a lock it uses (ec_fop_data_t.locks[] holds two of them). */
+struct _ec_lock_link {
+    ec_lock_t *lock;
+    ec_fop_data_t *fop;
+    struct list_head owner_list;
+    struct list_head wait_list;
+    gf_boolean_t update[2];
+    gf_boolean_t dirty[2];
+    gf_boolean_t optimistic_changelog;
+    loc_t *base;
+    uint64_t size;
+    uint32_t waiting_flags;
+    off_t fl_start;
+    off_t fl_end;
+};
+
+/* Half-open range [first, last) of fragment offsets affected by a fop. Since
+ * real file offsets can be difficult to handle correctly because of overflows,
+ * we use the 'scaled' offset, which corresponds to the offset of the fragment
+ * seen by the bricks, which is always smaller and cannot overflow. */
+struct _ec_fragment_range {
+    uint64_t first; /* Address of the first affected fragment as seen by the
+                       bricks (offset on brick) */
+    uint64_t last;  /* Address of the first non affected fragment as seen by
+                       the bricks (offset on brick) */
+};
+/* EC xlator data structure to collect all the data required to perform
+ * the file operation. A fop can reference a parent fop ('parent') and is
+ * linked into ec_t.pending_fops through 'pending_list'. */
+struct _ec_fop_data {
+    int32_t id; /* ID of the file operation */
+    int32_t refs;
+    int32_t state;
+    uint32_t minimum; /* Minimum number of successful
+                         operation required to conclude a
+                         fop as successful */
+    int32_t expected;
+    int32_t winds;
+    int32_t jobs;
+    int32_t error;
+    ec_fop_data_t *parent;
+    xlator_t *xl;                  /* points to EC xlator */
+    call_frame_t *req_frame;       /* frame of the calling xlator */
+    call_frame_t *frame;           /* frame used by this fop */
+    struct list_head cbk_list;     /* sorted list of groups of answers */
+    struct list_head answer_list;  /* list of answers */
+    struct list_head pending_list; /* member of ec_t.pending_fops */
+    ec_cbk_data_t *answer;         /* accepted answer */
+    int32_t lock_count;
+    int32_t locked;
+    gf_lock_t lock;
+    ec_lock_link_t locks[2];
+    int32_t first_lock;
+
+    uint32_t fop_flags; /* Flags passed by the caller. */
+    uint32_t flags;     /* Internal flags. */
+    uint32_t first;
+    uintptr_t mask;
+    uintptr_t healing; /*Dispatch is done but call is successful only
+                         if fop->minimum number of subvolumes succeed
+                         which are not healing*/
+    uintptr_t remaining;
+    uintptr_t received; /* Mask of responses */
+    uintptr_t good;
+
+    uid_t uid;
+    gid_t gid;
+
+    ec_wind_f wind;       /* Function to wind to */
+    ec_handler_f handler; /* FOP manager function */
+    ec_resume_f resume;
+    ec_cbk_t cbks; /* Callback function for this FOP */
+    void *data;
+    ec_heal_t *heal;
+    struct list_head healer;
+
+    uint64_t user_size;
+    uint32_t head;
+
+    int32_t use_fd; /* Indicates whether this FOP uses FD or
+                       not */
+
+    dict_t *xdata;
+    dict_t *dict;
+    int32_t int32;
+    uint32_t uint32;
+    uint64_t size;
+    off_t offset;
+    mode_t mode[2];
+    entrylk_cmd entrylk_cmd;
+    entrylk_type entrylk_type;
+    gf_xattrop_flags_t xattrop_flags;
+    dev_t dev;
+    inode_t *inode;
+    fd_t *fd; /* FD of the file on which FOP is
+                 being carried upon */
+    struct iatt iatt;
+    char *str[2];
+    loc_t loc[2]; /* Holds the location details for
+                     the file */
+    struct gf_flock flock;
+    struct iovec *vector;
+    struct iobref *buffers;
+    gf_seek_what_t seek;
+    ec_fragment_range_t frag_range; /* This will hold the range of stripes
+                                        affected by the fop. */
+    char *errstr;                   /*String of fop name, path and gfid
+                                     to be used in gf_msg. */
+};
+/* One answer to a fop; answers of the same group are chained via 'next'. */
+struct _ec_cbk_data {
+    struct list_head list;        /* item in the sorted list of groups */
+    struct list_head answer_list; /* item in the list of answers */
+    ec_fop_data_t *fop;
+    ec_cbk_data_t *next; /* next answer in the same group */
+    uint32_t idx;
+    int32_t op_ret;
+    int32_t op_errno;
+    int32_t count;
+    uintptr_t mask;
+
+    dict_t *xdata;
+    dict_t *dict;
+    int32_t int32;
+    uintptr_t uintptr[3];
+    uint64_t size;
+    uint64_t version[2];
+    inode_t *inode;
+    fd_t *fd;
+    struct statvfs statvfs;
+    struct iatt iatt[5];
+    struct gf_flock flock;
+    struct iovec *vector;
+    struct iobref *buffers;
+    char *str;
+    gf_dirent_t entries;
+    off_t offset;
+    gf_seek_what_t what;
+};
+/* Operation codes stored in the 'op' field of ec_gf_op_t and ec_code_op_t. */
+enum _ec_gf_opcode {
+    EC_GF_OP_LOAD,
+    EC_GF_OP_STORE,
+    EC_GF_OP_COPY,
+    EC_GF_OP_XOR2,
+    EC_GF_OP_XOR3,
+    EC_GF_OP_XORM,
+    EC_GF_OP_END
+};
+/* One operation: an opcode plus up to three numeric arguments. */
+struct _ec_gf_op {
+    ec_gf_opcode_t op;
+    uint32_t arg1;
+    uint32_t arg2;
+    uint32_t arg3;
+};
+/* Operation list ('ops') with a register map; entry type of ec_gf_t.table. */
+struct _ec_gf_mul {
+    uint32_t regs;
+    uint32_t map[EC_GF_MAX_REGS];
+    ec_gf_op_t *ops;
+};
+/* Galois field description, as returned by ec_gf_prepare(). */
+struct _ec_gf {
+    uint32_t bits;
+    uint32_t size;
+    uint32_t mod;
+    uint32_t min_ops;
+    uint32_t max_ops;
+    uint32_t avg_ops;
+    uint32_t *log;
+    uint32_t *pow;
+    ec_gf_mul_t **table;
+};
+/* Code generator: emitter callbacks for prolog/epilog and load..xorm ops. */
+struct _ec_code_gen {
+    char *name;
+    char **flags;
+    uint32_t width;
+
+    void (*prolog)(ec_code_builder_t *builder);
+    void (*epilog)(ec_code_builder_t *builder);
+    void (*load)(ec_code_builder_t *builder, uint32_t reg, uint32_t offset,
+                 uint32_t bit);
+    void (*store)(ec_code_builder_t *builder, uint32_t reg, uint32_t bit);
+    void (*copy)(ec_code_builder_t *builder, uint32_t dst, uint32_t src);
+    void (*xor2)(ec_code_builder_t *builder, uint32_t dst, uint32_t src);
+    void (*xor3)(ec_code_builder_t *builder, uint32_t dst, uint32_t src1,
+                 uint32_t src2);
+    void (*xorm)(ec_code_builder_t *builder, uint32_t dst, uint32_t offset,
+                 uint32_t bit);
+};
+/* Code generation context, as created by ec_code_create(gf, gen). */
+struct _ec_code {
+    gf_lock_t lock;
+    struct list_head spaces;
+    ec_gf_t *gf;
+    ec_code_gen_t *gen;
+};
+/* Single numeric argument of an ec_code_op_t. */
+struct _ec_code_arg {
+    uint32_t value;
+};
+/* Builder-side operation: opcode and three ec_code_arg_t arguments. */
+struct _ec_code_op {
+    ec_gf_opcode_t op;
+    ec_code_arg_t arg1;
+    ec_code_arg_t arg2;
+    ec_code_arg_t arg3;
+};
+/* Working state for building code; 'ops' is a zero-length trailing array. */
+struct _ec_code_builder {
+    ec_code_t *code;
+    uint64_t address;
+    uint8_t *data;
+    uint32_t size;
+    int32_t error;
+    uint32_t regs;
+    uint32_t bits;
+    uint32_t width;
+    uint32_t count;
+    uint32_t base;
+    uint32_t map[EC_GF_MAX_REGS];
+    gf_boolean_t linear;
+    uint64_t loop;
+    ec_code_op_t ops[0];
+};
+/* Sized chunk that points back to its ec_code_space_t ('space'). */
+struct _ec_code_chunk {
+    struct list_head list;
+    size_t size;
+    ec_code_space_t *space;
+};
+/* Space linked to an ec_code_t; 'chunks' presumably lists its chunks. */
+struct _ec_code_space {
+    struct list_head list;
+    struct list_head chunks;
+    ec_code_t *code;
+    void *exec;
+    size_t size;
+};
+/* Coding routine of a row: 'linear' when encoding, 'interleaved' to decode. */
+union _ec_code_func {
+    ec_code_func_linear_t linear;
+    ec_code_func_interleaved_t interleaved;
+};
+/* One matrix row: its coding routine and a pointer to its values. */
+struct _ec_matrix_row {
+    ec_code_func_t func;
+    uint32_t *values;
+};
+/* Coding matrix; row_data[] is followed in memory by the 'values' area. */
+struct _ec_matrix {
+    struct list_head lru;
+    uint32_t refs;
+    uint32_t columns;
+    uint32_t rows;
+    uintptr_t mask;
+    ec_code_t *code;
+    uint32_t *values;
+    ec_matrix_row_t row_data[0];
+};
+/* Matrix cache: 'objects' is sorted by mask, 'lru' holds unreferenced ones. */
+struct _ec_matrix_list {
+    struct list_head lru;
+    gf_lock_t lock;
+    uint32_t columns;
+    uint32_t rows;
+    uint32_t max;
+    uint32_t count;
+    uint32_t stripe;
+    struct mem_pool *pool;
+    ec_gf_t *gf;
+    ec_code_t *code;
+    ec_matrix_t *encode;
+    ec_matrix_t **objects;
+};
+/* Heal state attached to a fop through ec_fop_data_t.heal. */
+struct _ec_heal {
+    struct list_head list;
+    gf_lock_t lock;
+    xlator_t *xl;
+    ec_fop_data_t *fop;
+    void *data;
+    ec_fop_data_t *lookup;
+    loc_t loc;
+    struct iatt iatt;
+    char *symlink;
+    fd_t *fd;
+    int32_t partial;
+    int32_t done;
+    int32_t error;
+    gf_boolean_t nameheal;
+    uintptr_t available;
+    uintptr_t good;
+    uintptr_t bad;
+    uintptr_t open;
+    uintptr_t fixed;
+    uint64_t offset;
+    uint64_t size;
+    uint64_t total_size;
+    uint64_t version[2];
+    uint64_t raw_size;
+};
+/* One healer: its subvolume index, thread, mutex and condition variable. */
+struct subvol_healer {
+    xlator_t *this;
+    int subvol;
+    gf_boolean_t running;
+    gf_boolean_t rerun;
+    pthread_mutex_t mutex;
+    pthread_cond_t cond;
+    pthread_t thread;
+};
+/* Self-heal daemon settings and its index/full healer arrays (ec_t.shd). */
+struct _ec_self_heald {
+    gf_boolean_t iamshd;
+    gf_boolean_t enabled;
+    int timeout;
+    uint32_t max_threads;
+    uint32_t wait_qlength;
+    struct subvol_healer *index_healers;
+    struct subvol_healer *full_healers;
+};
+/* Counters kept in ec_t.stats: stripe cache activity and self-heal totals. */
+struct _ec_statistics {
+    struct {
+        gf_atomic_t hits;    /* Cache hits. */
+        gf_atomic_t misses;  /* Cache misses. */
+        gf_atomic_t updates; /* Number of times an existing stripe has
+                                been updated with new content. */
+        gf_atomic_t invals;  /* Number of times an existing stripe has
+                                been invalidated because of truncates
+                                or discards. */
+        gf_atomic_t evicts;  /* Number of times that an existing entry
+                                has been evicted to make room for newer
+                                entries. */
+        gf_atomic_t allocs;  /* Number of memory allocations made to
+                                store stripes. */
+        gf_atomic_t errors;  /* Number of errors that have caused extra
+                                requests. (Basically memory allocation
+                                errors). */
+    } stripe_cache;
+    struct {
+        gf_atomic_t attempted; /*Number of heals attempted on
+                                files/directories*/
+        gf_atomic_t completed; /* Heals completed on files/directories. */
+    } shd;
+};
+/* Private data of an EC xlator instance (see 'ec_t *ec = this->private'). */
+struct _ec {
+    xlator_t *xl;
+    int32_t healers;
+    int32_t heal_waiters;
+    int32_t nodes; /* Total number of bricks(n) */
+    int32_t bits_for_nodes;
+    int32_t fragments;      /* Data bricks(k) */
+    int32_t redundancy;     /* Redundant bricks(m) */
+    uint32_t fragment_size; /* Size of fragment/chunk on a
+                               brick. */
+    uint32_t stripe_size;   /* (fragment_size * fragments)
+                               maximum size of user data
+                               stored in one stripe. */
+    int32_t up;             /* Represents whether EC volume is
+                               up or not. */
+    uint32_t idx;
+    uint32_t xl_up_count;     /* Number of UP bricks. */
+    uintptr_t xl_up;          /* Bit flag representing UP
+                                 bricks */
+    uint32_t xl_notify_count; /* Number of notifications. */
+    uintptr_t xl_notify;      /* Bit flag representing
+                                 notification for bricks. */
+    uintptr_t node_mask;
+    uintptr_t read_mask;         /*Stores user defined read-mask*/
+    gf_atomic_t async_fop_count; /* Number of on going asynchronous fops. */
+    xlator_t **xl_list;
+    gf_lock_t lock;
+    gf_timer_t *timer;
+    gf_boolean_t shutdown;
+    gf_boolean_t eager_lock;
+    gf_boolean_t other_eager_lock;
+    gf_boolean_t optimistic_changelog;
+    gf_boolean_t parallel_writes;
+    uint32_t stripe_cache;
+    uint32_t quorum_count;
+    uint32_t background_heals;
+    uint32_t heal_wait_qlen;
+    uint32_t self_heal_window_size; /* max size of read/writes */
+    uint32_t eager_lock_timeout;
+    uint32_t other_eager_lock_timeout;
+    struct list_head pending_fops;
+    struct list_head heal_waiting;
+    struct list_head healing;
+    struct mem_pool *fop_pool;
+    struct mem_pool *cbk_pool;
+    struct mem_pool *lock_pool;
+    ec_self_heald_t shd;
+    char vol_uuid[UUID_SIZE + 1];
+    dict_t *leaf_to_subvolid;
+    ec_read_policy_t read_policy;
+    ec_matrix_list_t matrix;
+    ec_statistics_t stats;
+};
+
+#endif /* __EC_TYPES_H__ */
diff --git a/xlators/cluster/ec/src/ec.c b/xlators/cluster/ec/src/ec.c
new file mode 100644
index 00000000000..7344be4968d
--- /dev/null
+++ b/xlators/cluster/ec/src/ec.c
@@ -0,0 +1,1873 @@
+/*
+  Copyright (c) 2012-2015 DataLab, s.l. <http://www.datalab.es>
+  This file is part of GlusterFS.
+
+  This file is licensed to you under your choice of the GNU Lesser
+  General Public License, version 3 or any later version (LGPLv3 or
+  later), or the GNU General Public License, version 2 (GPLv2), in all
+  cases as published by the Free Software Foundation.
+*/
+
+#include <glusterfs/defaults.h>
+#include <glusterfs/statedump.h>
+#include <glusterfs/compat-errno.h>
+#include <glusterfs/upcall-utils.h>
+
+#include "ec.h"
+#include "ec-messages.h"
+#include "ec-mem-types.h"
+#include "ec-types.h"
+#include "ec-helpers.h"
+#include "ec-common.h"
+#include "ec-fops.h"
+#include "ec-method.h"
+#include "ec-code.h"
+#include "ec-heald.h"
+#include <glusterfs/events.h>
+
+static char *ec_read_policies[EC_READ_POLICY_MAX + 1] = {
+    [EC_ROUND_ROBIN] = "round-robin", /* option string, indexed by enum */
+    [EC_GFID_HASH] = "gfid-hash",
+    [EC_READ_POLICY_MAX] = NULL}; /* last slot is NULL and ends the table */
+
+#define EC_INTERNAL_XATTR_OR_GOTO(name, xattr, op_errno, label)                \
+    do { /* Bail out to 'label' when an EC-internal xattr is involved */       \
+        if (ec_is_internal_xattr(NULL, (char *)name, NULL, NULL)) {            \
+            op_errno = EPERM;                                                  \
+            goto label;                                                        \
+        }                                                                      \
+        if (name && (strlen(name) == 0) && xattr) {                            \
+            /* Empty name + dict: bulk [f]removexattr/[f]setxattr request */   \
+            GF_IF_INTERNAL_XATTR_GOTO(EC_XATTR_PREFIX "*", xattr, op_errno,    \
+                                      label);                                  \
+        }                                                                      \
+    } while (0)
+
+/* Read the "redundancy" option, validate it and derive the layout fields of
+ * ec_t from it. Returns 0 on success and EINVAL on any failure. */
+int32_t
+ec_parse_options(xlator_t *this)
+{
+    ec_t *ec = this->private;
+    int32_t error = EINVAL;
+    uintptr_t mask;
+
+    GF_OPTION_INIT("redundancy", ec->redundancy, int32, out);
+    ec->fragments = ec->nodes - ec->redundancy;
+    if ((ec->redundancy < 1) || (ec->redundancy >= ec->fragments) ||
+        (ec->fragments > EC_MAX_FRAGMENTS)) {
+        gf_msg(this->name, GF_LOG_ERROR, EINVAL, EC_MSG_INVALID_REDUNDANCY,
+               "Invalid redundancy (must be between 1 and %d)",
+               (ec->nodes - 1) / 2);
+        goto out;
+    }
+
+    /* bits_for_nodes: smallest b >= 1 such that 2^b >= nodes. */
+    ec->bits_for_nodes = 1;
+    mask = 2;
+    while (ec->nodes > mask) {
+        ec->bits_for_nodes++;
+        mask <<= 1;
+    }
+    ec->node_mask = (1ULL << ec->nodes) - 1ULL;
+    ec->fragment_size = EC_METHOD_CHUNK_SIZE;
+    ec->stripe_size = ec->fragment_size * ec->fragments;
+
+    /* node_mask is a uintptr_t, so it has to be printed with PRIxPTR. */
+    gf_msg_debug("ec", 0,
+                 "Initialized with: nodes=%u, fragments=%u, "
+                 "stripe_size=%u, node_mask=%" PRIxPTR,
+                 ec->nodes, ec->fragments, ec->stripe_size, ec->node_mask);
+    error = 0;
+out:
+    return error;
+}
+
+/* Count the children of 'this' and store them, in order, in ec->xl_list.
+ * Returns 0, EINVAL (more than EC_MAX_NODES children) or ENOMEM. */
+int32_t
+ec_prepare_childs(xlator_t *this)
+{
+    ec_t *priv = this->private;
+    xlator_list_t *entry = this->children;
+    int32_t total = 0;
+    int32_t slot = 0;
+
+    while (entry != NULL) {
+        total++;
+        entry = entry->next;
+    }
+    if (total > EC_MAX_NODES) {
+        gf_msg(this->name, GF_LOG_ERROR, EINVAL, EC_MSG_TOO_MANY_SUBVOLS,
+               "Too many subvolumes");
+        return EINVAL;
+    }
+    priv->nodes = total;
+
+    priv->xl_list = GF_CALLOC(total, sizeof(priv->xl_list[0]), ec_mt_xlator_t);
+    if (priv->xl_list == NULL) {
+        gf_msg(this->name, GF_LOG_ERROR, ENOMEM, EC_MSG_NO_MEMORY,
+               "Allocation of xlator list failed");
+        return ENOMEM;
+    }
+    priv->xl_up_count = 0;
+    priv->xl_up = 0;
+    for (entry = this->children; entry != NULL; entry = entry->next) {
+        priv->xl_list[slot] = entry->xlator;
+        slot++;
+    }
+    return 0;
+}
+
+/* dict_foreach callback: replace the subvolume pointer stored under 'key'
+ * with the index of that subvolume in ec->xl_list. Returns -1 when the
+ * subvolume is not in the list or the dict update fails. */
+static int
+_subvol_to_subvolid(dict_t *this, char *key, data_t *value, void *data)
+{
+    ec_t *priv = data;
+    xlator_t *target = data_to_ptr(value);
+    int status;
+    int idx;
+
+    for (idx = 0; idx < priv->nodes; idx++) {
+        if (priv->xl_list[idx] != target)
+            continue;
+
+        /* -1 stops dict_foreach and returns -1 */
+        status = dict_set_int32(this, key, idx);
+        return (status < 0) ? -1 : status;
+    }
+
+    return -1;
+}
+
+int
+ec_subvol_to_subvol_id_transform(ec_t *ec, dict_t *leaf_to_subvolid)
+{ /* Runs _subvol_to_subvolid on the dict via dict_foreach, passing ec. */
+    return dict_foreach(leaf_to_subvolid, _subvol_to_subvolid, ec);
+}
+
+/* Tear down the private ec_t of 'this': cancel the timer, detach the
+ * structure from the xlator and release the resources it holds. Does
+ * nothing when this->private is NULL. */
+void
+__ec_destroy_private(xlator_t *this)
+{
+    ec_t *priv = this->private;
+
+    if (priv == NULL)
+        return;
+
+    /* Cancel the pending timer, if any, while holding the lock. */
+    LOCK(&priv->lock);
+    if (priv->timer != NULL) {
+        gf_timer_call_cancel(this->ctx, priv->timer);
+        priv->timer = NULL;
+    }
+    UNLOCK(&priv->lock);
+
+    /* There is a race with timer because there is no way to know if
+     * timer callback has really been cancelled or it has been scheduled
+     * for execution. If it has been scheduled, it will crash if we
+     * destroy ec too fast.
+     *
+     * Not sure how this can be solved without using global variables or
+     * having support from gf_timer_call_cancel()
+     */
+    sleep(2);
+
+    this->private = NULL;
+
+    if (priv->xl_list != NULL) {
+        GF_FREE(priv->xl_list);
+        priv->xl_list = NULL;
+    }
+
+    /* Each memory pool is destroyed only if it was actually created. */
+    if (priv->fop_pool != NULL)
+        mem_pool_destroy(priv->fop_pool);
+    if (priv->cbk_pool != NULL)
+        mem_pool_destroy(priv->cbk_pool);
+    if (priv->lock_pool != NULL)
+        mem_pool_destroy(priv->lock_pool);
+
+    LOCK_DESTROY(&priv->lock);
+
+    if (priv->leaf_to_subvolid != NULL)
+        dict_unref(priv->leaf_to_subvolid);
+
+    ec_method_fini(&priv->matrix);
+
+    GF_FREE(priv);
+}
+
+/* Initialise memory accounting for ec_mt_end + 1 allocation types.
+ * Returns 0 on success, -1 (after logging an error) on failure. */
+int32_t
+mem_acct_init(xlator_t *this)
+{
+    if (xlator_mem_acct_init(this, ec_mt_end + 1) == 0)
+        return 0;
+
+    gf_msg(this->name, GF_LOG_ERROR, ENOMEM, EC_MSG_NO_MEMORY,
+           "Memory accounting initialization failed.");
+
+    return -1;
+}
+
+/* Store the background heal settings in ec. When background_heals is 0
+ * the stored wait queue length is forced to 0 as well. */
+void
+ec_configure_background_heal_opts(ec_t *ec, int background_heals,
+                                  int heal_wait_qlen)
+{
+    int qlen = (background_heals != 0) ? heal_wait_qlen : 0;
+
+    ec->background_heals = background_heals;
+    ec->heal_wait_qlen = qlen;
+}
+
+/* Look read_policy up in ec_read_policies and store the index found in
+ * ec->read_policy. Returns 0, or -1 when no valid index is obtained. */
+int
+ec_assign_read_policy(ec_t *ec, char *read_policy)
+{
+    int idx = gf_get_index_by_elem(ec_read_policies, read_policy);
+    if ((idx >= 0) && (idx < EC_READ_POLICY_MAX)) {
+        ec->read_policy = idx;
+        return 0;
+    }
+    return -1;
+}
+
+int32_t
+reconfigure(xlator_t *this, dict_t *options)
+{
+    ec_t *ec = this->private;
+    char *read_policy = NULL;
+    char *extensions = NULL;
+    uint32_t heal_wait_qlen = 0;
+    uint32_t background_heals = 0;
+    int32_t ret = -1;
+    int32_t err;
+    /* Every GF_OPTION_RECONF below is handed 'failed' as its error label. */
+    GF_OPTION_RECONF("cpu-extensions", extensions, options, str, failed);
+
+    GF_OPTION_RECONF("self-heal-daemon", ec->shd.enabled, options, bool,
+                     failed);
+    GF_OPTION_RECONF("iam-self-heal-daemon", ec->shd.iamshd, options, bool,
+                     failed);
+    GF_OPTION_RECONF("eager-lock", ec->eager_lock, options, bool, failed);
+    GF_OPTION_RECONF("other-eager-lock", ec->other_eager_lock, options, bool,
+                     failed);
+    GF_OPTION_RECONF("eager-lock-timeout", ec->eager_lock_timeout, options,
+                     uint32, failed);
+    GF_OPTION_RECONF("other-eager-lock-timeout", ec->other_eager_lock_timeout,
+                     options, uint32, failed);
+    GF_OPTION_RECONF("background-heals", background_heals, options, uint32,
+                     failed);
+    GF_OPTION_RECONF("heal-wait-qlength", heal_wait_qlen, options, uint32,
+                     failed);
+    GF_OPTION_RECONF("self-heal-window-size", ec->self_heal_window_size,
+                     options, uint32, failed);
+    GF_OPTION_RECONF("heal-timeout", ec->shd.timeout, options, int32, failed);
+    ec_configure_background_heal_opts(ec, background_heals, heal_wait_qlen);
+    GF_OPTION_RECONF("shd-max-threads", ec->shd.max_threads, options, uint32,
+                     failed);
+    GF_OPTION_RECONF("shd-wait-qlength", ec->shd.wait_qlength, options, uint32,
+                     failed);
+    /* read_policy is only stored here; it is applied after 'ret = 0'. */
+    GF_OPTION_RECONF("read-policy", read_policy, options, str, failed);
+
+    GF_OPTION_RECONF("optimistic-change-log", ec->optimistic_changelog, options,
+                     bool, failed);
+    GF_OPTION_RECONF("parallel-writes", ec->parallel_writes, options, bool,
+                     failed);
+    GF_OPTION_RECONF("stripe-cache", ec->stripe_cache, options, uint32, failed);
+    GF_OPTION_RECONF("quorum-count", ec->quorum_count, options, uint32, failed);
+    ret = 0;
+    if (ec_assign_read_policy(ec, read_policy)) {
+        ret = -1;
+    }
+
+    err = ec_method_update(this, &ec->matrix, extensions);
+    if (err != 0) {
+        ret = -1;
+    }
+
+failed:
+    return ret;
+}
+
+/* Derive an event from the current brick counters: CHILD_UP, CHILD_DOWN,
+ * or GF_EVENT_MAXVAL when neither of them applies. */
+glusterfs_event_t
+ec_get_event_from_state(ec_t *ec)
+{
+    int missing;
+
+    if (ec->xl_up_count < ec->fragments) {
+        missing = ec->xl_notify_count - ec->xl_up_count;
+        if (missing > ec->redundancy)
+            return GF_EVENT_CHILD_DOWN;
+        return GF_EVENT_MAXVAL;
+    }
+
+    /* If ec is up but some subvolumes are yet to notify, give
+     * grace time for other subvols to notify to prevent start of
+     * I/O which may result in self-heals */
+    if (ec->xl_notify_count < ec->nodes)
+        return GF_EVENT_MAXVAL;
+    return GF_EVENT_CHILD_UP;
+}
+
+/* Set ec->up: cancel the timer, log the brick bitmaps and emit the event. */
+void
+ec_up(xlator_t *this, ec_t *ec)
+{
+    char up_bits[32];
+    char notify_bits[32];
+
+    if (ec->timer != NULL) {
+        gf_timer_call_cancel(this->ctx, ec->timer);
+        ec->timer = NULL;
+    }
+    ec->up = 1;
+    gf_msg(this->name, GF_LOG_INFO, 0, EC_MSG_EC_UP,
+           "Going UP : Child UP = %s Child Notify = %s",
+           ec_bin(up_bits, sizeof(up_bits), ec->xl_up, ec->nodes),
+           ec_bin(notify_bits, sizeof(notify_bits), ec->xl_notify, ec->nodes));
+    gf_event(EVENT_EC_MIN_BRICKS_UP, "subvol=%s", this->name);
+}
+
+/* Clear ec->up: cancel the timer, log the brick bitmaps, emit the event. */
+void
+ec_down(xlator_t *this, ec_t *ec)
+{
+    char up_bits[32];
+    char notify_bits[32];
+
+    if (ec->timer != NULL) {
+        gf_timer_call_cancel(this->ctx, ec->timer);
+        ec->timer = NULL;
+    }
+    ec->up = 0;
+    gf_msg(this->name, GF_LOG_INFO, 0, EC_MSG_EC_DOWN,
+           "Going DOWN : Child UP = %s Child Notify = %s",
+           ec_bin(up_bits, sizeof(up_bits), ec->xl_up, ec->nodes),
+           ec_bin(notify_bits, sizeof(notify_bits), ec->xl_notify, ec->nodes));
+    gf_event(EVENT_EC_MIN_BRICKS_NOT_UP, "subvol=%s", this->name);
+}
+
+void
+ec_notify_cbk(void *data)
+{
+    ec_t *ec = data;
+    glusterfs_event_t event = GF_EVENT_MAXVAL;
+    gf_boolean_t propagate = _gf_false;
+    gf_boolean_t launch_heal = _gf_false;
+    /* Timer callback registered by ec_launch_notify_timer(); data is ec. */
+    LOCK(&ec->lock);
+    {
+        if (!ec->timer) {
+            /*
+             * Either child_up/child_down is already sent to parent
+             * This is a spurious wake up.
+             */
+            goto unlock;
+        }
+
+        gf_timer_call_cancel(ec->xl->ctx, ec->timer);
+        ec->timer = NULL;
+
+        /* The timeout has expired, so any subvolume that has not
+         * already reported its state, will be considered to be down.
+         * We mark as if all bricks had reported. */
+        ec->xl_notify = (1ULL << ec->nodes) - 1ULL;
+        ec->xl_notify_count = ec->nodes;
+
+        /* Since we have marked all subvolumes as notified, it's
+         * guaranteed that ec_get_event_from_state() will return
+         * CHILD_UP or CHILD_DOWN, but not MAXVAL. */
+        event = ec_get_event_from_state(ec);
+        if (event == GF_EVENT_CHILD_UP) {
+            /* We are ready to bring the volume up. If there are
+             * still bricks DOWN, they will be healed when they
+             * come up. */
+            ec_up(ec->xl, ec);
+
+            if (ec->shd.iamshd && !ec->shutdown) {
+                launch_heal = _gf_true;
+                GF_ATOMIC_INC(ec->async_fop_count);
+            }
+        }
+
+        propagate = _gf_true;
+    }
+unlock:
+    UNLOCK(&ec->lock);
+    /* Heal launch and notification are done after releasing the lock. */
+    if (launch_heal) {
+        /* We have just brought the volume UP, so we trigger
+         * a self-heal check on the root directory. */
+        ec_launch_replace_heal(ec);
+    }
+    if (propagate) {
+        default_notify(ec->xl, event, NULL);
+    }
+}
+
+/* Request a timer with a 10 second delay that runs ec_notify_cbk(ec) and
+ * keep its handle in ec->timer. If the timer cannot be created an error
+ * is logged and ec->timer stays NULL. */
+void
+ec_launch_notify_timer(xlator_t *this, ec_t *ec)
+{
+    struct timespec timeout = {.tv_sec = 10, .tv_nsec = 0};
+
+    gf_msg_debug(this->name, 0, "Initiating child-down timer");
+
+    ec->timer = gf_timer_call_after(this->ctx, timeout, ec_notify_cbk, ec);
+    if (ec->timer != NULL)
+        return;
+
+    gf_msg(this->name, GF_LOG_ERROR, ENOMEM, EC_MSG_TIMER_CREATE_FAIL,
+           "Cannot create timer for delayed initialization");
+}
+
+gf_boolean_t
+ec_disable_delays(ec_t *ec)
+{
+    ec->shutdown = _gf_true;
+    /* ec_notify() uses this result as its 'propagate' flag. */
+    return __ec_is_last_fop(ec);
+}
+
+/* Mark one healer for rerun. If its thread is running, signal its
+ * condition variable and, once the mutex has been released, join the
+ * thread.
+ */
+static void
+ec_healer_rerun_and_join(struct subvol_healer *healer)
+{
+    gf_boolean_t must_join = _gf_false;
+    void *retval = NULL;
+
+    /* rerun and running are only accessed with the healer mutex held. */
+    pthread_mutex_lock(&healer->mutex);
+    {
+        healer->rerun = 1;
+        if (healer->running) {
+            pthread_cond_signal(&healer->cond);
+            must_join = _gf_true;
+        }
+    }
+    pthread_mutex_unlock(&healer->mutex);
+
+    /* pthread_join() is only called after the mutex has been unlocked. */
+    if (must_join)
+        pthread_join(healer->thread, &retval);
+}
+
+/* For every subvolume, flag its index healer and its full healer for
+ * rerun and join the healer threads that are running (see
+ * ec_healer_rerun_and_join()). Nothing is done unless ec->shd.iamshd
+ * is set. */
+void
+ec_cleanup_healer_object(ec_t *ec)
+{
+    ec_self_heald_t *shd = &ec->shd;
+    int i = 0;
+
+    if (!shd->iamshd)
+        return;
+
+    for (i = 0; i < ec->nodes; i++) {
+        /* Order is kept: index healer first, then the full healer. */
+        ec_healer_rerun_and_join(&shd->index_healers[i]);
+        ec_healer_rerun_and_join(&shd->full_healers[i]);
+    }
+}
+void
+ec_pending_fops_completed(ec_t *ec)
+{
+    /* GF_EVENT_PARENT_DOWN is sent only when ec->shutdown is set. */
+    if (ec->shutdown)
+        default_notify(ec->xl, GF_EVENT_PARENT_DOWN, NULL);
+}
+
+/* Record a notification for the bit(s) in index_mask and bring their UP
+ * bit to new_state. Returns _gf_true only when the UP state was changed. */
+static gf_boolean_t
+ec_set_up_state(ec_t *ec, uintptr_t index_mask, uintptr_t new_state)
+{
+    uintptr_t was_up;
+
+    if (xlator_is_cleanup_starting(ec->xl))
+        return _gf_false;
+
+    if (!(ec->xl_notify & index_mask)) {
+        ec->xl_notify_count++;
+        ec->xl_notify |= index_mask;
+    }
+    was_up = ec->xl_up & index_mask;
+    if (was_up == new_state)
+        return _gf_false;
+
+    ec->xl_up ^= index_mask;
+    ec->xl_up_count += (was_up ? -1 : 1);
+    return _gf_true;
+}
+
+static gf_boolean_t
+ec_upcall(ec_t *ec, struct gf_upcall *upcall)
+{
+    struct gf_upcall_cache_invalidation *ci = NULL;
+    struct gf_upcall_inodelk_contention *lc = NULL;
+    inode_t *inode;
+    inode_table_t *table;
+    /* The return value is used by ec_notify() as its 'propagate' flag. */
+    switch (upcall->event_type) {
+        case GF_UPCALL_CACHE_INVALIDATION:
+            ci = upcall->data;
+            ci->flags |= UP_INVAL_ATTR;
+            return _gf_true;
+
+        case GF_UPCALL_INODELK_CONTENTION:
+            lc = upcall->data;
+            if (strcmp(lc->domain, ec->xl->name) != 0) {
+                /* The lock is not owned by EC, ignore it. */
+                return _gf_true;
+            }
+            table = ((xlator_t *)ec->xl->graph->top)->itable;
+            if (table == NULL) {
+                /* Self-heal daemon doesn't have an inode table on the top
+                 * xlator because it doesn't need it. In this case we should
+                 * use the inode table managed by EC itself where all inodes
+                 * being healed should be present. However self-heal doesn't
+                 * use eager-locking and inodelk's are already released as
+                 * soon as possible. In this case we can safely ignore these
+                 * notifications. */
+                return _gf_false;
+            }
+            inode = inode_find(table, upcall->gfid);
+            /* If inode is not found, it means that it's already released,
+             * so we can ignore it. Probably it has been released and
+             * destroyed while the contention notification was being sent.
+             */
+            if (inode != NULL) {
+                ec_lock_release(ec, inode);
+                inode_unref(inode);
+            }
+
+            return _gf_false;
+
+        default:
+            return _gf_true;
+    }
+}
+
+int32_t
+ec_notify(xlator_t *this, int32_t event, void *data, void *data2)
+{
+    ec_t *ec = this->private;
+    int32_t idx = 0;
+    int32_t error = 0;
+    glusterfs_event_t old_event = GF_EVENT_MAXVAL;
+    dict_t *input = NULL;
+    dict_t *output = NULL;
+    gf_boolean_t propagate = _gf_true;
+    gf_boolean_t needs_shd_check = _gf_false;
+    int32_t orig_event = event;
+    uintptr_t mask = 0;
+    /* 'event' may be rewritten below; orig_event keeps the received value. */
+    gf_msg_trace(this->name, 0, "NOTIFY(%d): %p, %p", event, data, data2);
+
+    if (event == GF_EVENT_UPCALL) {
+        propagate = ec_upcall(ec, data);
+        goto done;
+    }
+
+    if (event == GF_EVENT_TRANSLATOR_OP) {
+        if (!ec->up) {
+            error = -1;
+        } else {
+            input = data;
+            output = data2;
+            error = ec_xl_op(this, input, output);
+        }
+        goto out;
+    }
+    /* Look 'data' up in xl_list; idx ends as ec->nodes when not found. */
+    for (idx = 0; idx < ec->nodes; idx++) {
+        if (ec->xl_list[idx] == data) {
+            break;
+        }
+    }
+
+    LOCK(&ec->lock);
+
+    if (event == GF_EVENT_PARENT_UP) {
+        /*
+         * Start a timer which sends appropriate event to parent
+         * xlator to prevent the 'mount' syscall from hanging.
+         */
+        ec_launch_notify_timer(this, ec);
+        goto unlock;
+    } else if (event == GF_EVENT_PARENT_DOWN) {
+        /* If there aren't pending fops running after we have waken up
+         * them, we immediately propagate the notification. */
+        propagate = ec_disable_delays(ec);
+        ec_cleanup_healer_object(ec);
+        goto unlock;
+    }
+
+    if (idx < ec->nodes) { /* CHILD_* events */
+        old_event = ec_get_event_from_state(ec);
+
+        mask = 1ULL << idx;
+        if (event == GF_EVENT_CHILD_UP) {
+            /* We need to trigger a selfheal if a brick changes
+             * to UP state. */
+            if (ec_set_up_state(ec, mask, mask) && ec->shd.iamshd &&
+                !ec->shutdown) {
+                needs_shd_check = _gf_true;
+            }
+        } else if (event == GF_EVENT_CHILD_DOWN) {
+            ec_set_up_state(ec, mask, 0);
+        }
+
+        event = ec_get_event_from_state(ec);
+
+        if (event == GF_EVENT_CHILD_UP) {
+            if (!ec->up) {
+                ec_up(this, ec);
+            }
+        } else {
+            /* If the volume is not UP, it's irrelevant if one
+             * brick has come up. We cannot heal anything. */
+            needs_shd_check = _gf_false;
+
+            if ((event == GF_EVENT_CHILD_DOWN) && ec->up) {
+                ec_down(this, ec);
+            }
+        }
+
+        if (event != GF_EVENT_MAXVAL) {
+            if (event == old_event) {
+                if (orig_event == GF_EVENT_CHILD_UP)
+                    event = GF_EVENT_SOME_DESCENDENT_UP;
+                else /* orig_event has to be GF_EVENT_CHILD_DOWN */
+                    event = GF_EVENT_SOME_DESCENDENT_DOWN;
+            }
+        } else {
+            propagate = _gf_false;
+            needs_shd_check = _gf_false;
+        }
+
+        if (needs_shd_check) {
+            GF_ATOMIC_INC(ec->async_fop_count);
+        }
+    }
+unlock:
+    UNLOCK(&ec->lock);
+
+done:
+    if (needs_shd_check) {
+        ec_launch_replace_heal(ec);
+    }
+    if (propagate) {
+        error = default_notify(this, event, data);
+    }
+
+out:
+    return error;
+}
+
+/* Variadic front end of ec_notify(): the first variadic argument is read
+ * (as a dict_t *) and passed on as data2. Returns what ec_notify() returns. */
+int32_t
+notify(xlator_t *this, int32_t event, void *data, ...)
+{
+    va_list args;
+    void *extra = NULL;
+
+    va_start(args, data);
+    extra = va_arg(args, dict_t *);
+    va_end(args);
+
+    return ec_notify(this, event, data, extra);
+}
+
+static void
+ec_statistics_init(ec_t *ec)
+{ /* Sets every stats.stripe_cache and stats.shd atomic counter to 0. */
+    GF_ATOMIC_INIT(ec->stats.stripe_cache.hits, 0);
+    GF_ATOMIC_INIT(ec->stats.stripe_cache.misses, 0);
+    GF_ATOMIC_INIT(ec->stats.stripe_cache.updates, 0);
+    GF_ATOMIC_INIT(ec->stats.stripe_cache.invals, 0);
+    GF_ATOMIC_INIT(ec->stats.stripe_cache.evicts, 0);
+    GF_ATOMIC_INIT(ec->stats.stripe_cache.allocs, 0);
+    GF_ATOMIC_INIT(ec->stats.stripe_cache.errors, 0);
+    GF_ATOMIC_INIT(ec->stats.shd.attempted, 0);
+    GF_ATOMIC_INIT(ec->stats.shd.completed, 0);
+}
+
+/* Parse a ':' separated list of subvolume ids into ec->read_mask.
+ * A NULL string clears the mask. Returns 0 on success and -1 when the
+ * copy cannot be allocated, an id is not an integer or is out of range,
+ * or fewer than ec->fragments distinct ids are given. */
+static int
+ec_assign_read_mask(ec_t *ec, char *read_mask_str)
+{
+    char *copy = NULL;
+    char *state = NULL;
+    char *token = NULL;
+    uintptr_t bits = 0;
+    int result = -1;
+    int id = 0;
+
+    if (read_mask_str == NULL) {
+        ec->read_mask = 0;
+        result = 0;
+        goto out;
+    }
+
+    copy = gf_strdup(read_mask_str);
+    if (copy == NULL)
+        goto out;
+
+    /* strtok_r() modifies the string it scans, hence the private copy. */
+    for (token = strtok_r(copy, ":", &state); token != NULL;
+         token = strtok_r(NULL, ":", &state)) {
+        if (gf_string2int(token, &id)) {
+            gf_msg(ec->xl->name, GF_LOG_ERROR, 0, EC_MSG_XLATOR_INIT_FAIL,
+                   "In read-mask \"%s\" id %s is not a valid integer",
+                   read_mask_str, token);
+            goto out;
+        }
+
+        /* Only ids in [0, ec->nodes) are accepted. */
+        if ((id < 0) || (id >= ec->nodes)) {
+            gf_msg(ec->xl->name, GF_LOG_ERROR, 0, EC_MSG_XLATOR_INIT_FAIL,
+                   "In read-mask \"%s\" id %d is not in range [0 - %d]",
+                   read_mask_str, id, ec->nodes - 1);
+            goto out;
+        }
+
+        bits |= (1UL << id);
+    }
+
+    /* The mask must select at least ec->fragments subvolumes. */
+    if (gf_bits_count(bits) < ec->fragments) {
+        gf_msg(ec->xl->name, GF_LOG_ERROR, 0, EC_MSG_XLATOR_INIT_FAIL,
+               "read-mask \"%s\" should contain at least %d ids", read_mask_str,
+               ec->fragments);
+        goto out;
+    }
+
+    ec->read_mask = bits;
+    result = 0;
+
+out:
+    GF_FREE(copy);
+    return result;
+}
+
+/* Allocate the zero-filled private ec_t of 'this' and populate it from the
+ * children and the volume options. Returns 0 on success and -1 on failure. */
+int32_t
+init(xlator_t *this)
+{
+    ec_t *ec = NULL;
+    char *read_policy = NULL;
+    char *extensions = NULL;
+    int32_t err;
+    char *read_mask_str = NULL;
+
+    if (this->parents == NULL) {
+        gf_msg(this->name, GF_LOG_WARNING, 0, EC_MSG_NO_PARENTS,
+               "Volume does not have parents.");
+    }
+
+    ec = GF_CALLOC(1, sizeof(*ec), ec_mt_ec_t);
+    if (ec == NULL) {
+        gf_msg(this->name, GF_LOG_ERROR, ENOMEM, EC_MSG_NO_MEMORY,
+               "Failed to allocate private memory.");
+        return -1;
+    }
+
+    this->private = ec;
+
+    ec->xl = this;
+    LOCK_INIT(&ec->lock);
+
+    GF_ATOMIC_INIT(ec->async_fop_count, 0);
+    INIT_LIST_HEAD(&ec->pending_fops);
+    INIT_LIST_HEAD(&ec->heal_waiting);
+    INIT_LIST_HEAD(&ec->healing);
+
+    ec->fop_pool = mem_pool_new(ec_fop_data_t, 1024);
+    ec->cbk_pool = mem_pool_new(ec_cbk_data_t, 4096);
+    ec->lock_pool = mem_pool_new(ec_lock_t, 1024);
+    if ((ec->fop_pool == NULL) || (ec->cbk_pool == NULL) ||
+        (ec->lock_pool == NULL)) {
+        gf_msg(this->name, GF_LOG_ERROR, ENOMEM, EC_MSG_NO_MEMORY,
+               "Failed to create memory pools.");
+
+        goto failed;
+    }
+
+    if (ec_prepare_childs(this) != 0) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, EC_MSG_XLATOR_INIT_FAIL,
+               "Failed to initialize xlator");
+
+        goto failed;
+    }
+
+    if (ec_parse_options(this) != 0) {
+        gf_msg(this->name, GF_LOG_ERROR, EINVAL, EC_MSG_XLATOR_PARSE_OPT_FAIL,
+               "Failed to parse xlator options");
+
+        goto failed;
+    }
+
+    GF_OPTION_INIT("cpu-extensions", extensions, str, failed);
+
+    err = ec_method_init(this, &ec->matrix, ec->fragments, ec->nodes,
+                         ec->nodes * 2, extensions);
+    if (err != 0) {
+        gf_msg(this->name, GF_LOG_ERROR, -err, EC_MSG_MATRIX_FAILED,
+               "Failed to initialize matrix management");
+
+        goto failed;
+    }
+
+    GF_OPTION_INIT("self-heal-daemon", ec->shd.enabled, bool, failed);
+    GF_OPTION_INIT("iam-self-heal-daemon", ec->shd.iamshd, bool, failed);
+    GF_OPTION_INIT("eager-lock", ec->eager_lock, bool, failed);
+    GF_OPTION_INIT("other-eager-lock", ec->other_eager_lock, bool, failed);
+    GF_OPTION_INIT("eager-lock-timeout", ec->eager_lock_timeout, uint32,
+                   failed);
+    GF_OPTION_INIT("other-eager-lock-timeout", ec->other_eager_lock_timeout,
+                   uint32, failed);
+    GF_OPTION_INIT("background-heals", ec->background_heals, uint32, failed);
+    GF_OPTION_INIT("heal-wait-qlength", ec->heal_wait_qlen, uint32, failed);
+    GF_OPTION_INIT("self-heal-window-size", ec->self_heal_window_size, uint32,
+                   failed);
+    ec_configure_background_heal_opts(ec, ec->background_heals,
+                                      ec->heal_wait_qlen);
+    GF_OPTION_INIT("read-policy", read_policy, str, failed);
+    if (ec_assign_read_policy(ec, read_policy))
+        goto failed;
+
+    GF_OPTION_INIT("heal-timeout", ec->shd.timeout, int32, failed);
+    GF_OPTION_INIT("shd-max-threads", ec->shd.max_threads, uint32, failed);
+    GF_OPTION_INIT("shd-wait-qlength", ec->shd.wait_qlength, uint32, failed);
+    GF_OPTION_INIT("optimistic-change-log", ec->optimistic_changelog, bool,
+                   failed);
+    GF_OPTION_INIT("parallel-writes", ec->parallel_writes, bool, failed);
+    GF_OPTION_INIT("stripe-cache", ec->stripe_cache, uint32, failed);
+    GF_OPTION_INIT("quorum-count", ec->quorum_count, uint32, failed);
+    GF_OPTION_INIT("ec-read-mask", read_mask_str, str, failed);
+
+    if (ec_assign_read_mask(ec, read_mask_str))
+        goto failed;
+
+    this->itable = inode_table_new(EC_SHD_INODE_LRU_LIMIT, this);
+    if (!this->itable)
+        goto failed;
+
+    if (ec->shd.iamshd)
+        ec_selfheal_daemon_init(this);
+
+    ec->leaf_to_subvolid = dict_new();
+    if (!ec->leaf_to_subvolid)
+        goto failed;
+    if (glusterfs_reachable_leaves(this, ec->leaf_to_subvolid)) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, EC_MSG_SUBVOL_BUILD_FAIL,
+               "Failed to build subvol "
+               "dictionary");
+        goto failed;
+    }
+
+    if (ec_subvol_to_subvol_id_transform(ec, ec->leaf_to_subvolid) < 0) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, EC_MSG_SUBVOL_ID_DICT_SET_FAIL,
+               "Failed to build subvol-id "
+               "dictionary");
+        goto failed;
+    }
+
+    ec_statistics_init(ec);
+    gf_msg_debug(this->name, 0, "Disperse translator initialized.");
+
+    return 0;
+
+failed:
+    __ec_destroy_private(this);
+
+    return -1;
+}
+
+void
+fini(xlator_t *this)
+{ /* ec_selfheal_daemon_fini() runs before the private data is destroyed. */
+    ec_selfheal_daemon_fini(this);
+    __ec_destroy_private(this);
+}
+
+int32_t
+ec_gf_access(call_frame_t *frame, xlator_t *this, loc_t *loc, int32_t mask,
+             dict_t *xdata)
+{
+    ec_access(frame, this, -1, EC_MINIMUM_ONE, default_access_cbk, NULL, loc,
+              mask, xdata);
+    /* Always 0; default_access_cbk is handed to ec_access(). */
+    return 0;
+}
+
+int32_t
+ec_gf_create(call_frame_t *frame, xlator_t *this, loc_t *loc, int32_t flags,
+             mode_t mode, mode_t umask, fd_t *fd, dict_t *xdata)
+{
+    ec_create(frame, this, -1, EC_MINIMUM_MIN, default_create_cbk, NULL, loc,
+              flags, mode, umask, fd, xdata);
+    /* Always 0; default_create_cbk is handed to ec_create(). */
+    return 0;
+}
+
+int32_t
+ec_gf_discard(call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset,
+              size_t len, dict_t *xdata)
+{
+    ec_discard(frame, this, -1, EC_MINIMUM_MIN, default_discard_cbk, NULL, fd,
+               offset, len, xdata);
+    /* Always 0; default_discard_cbk is handed to ec_discard(). */
+    return 0;
+}
+
+int32_t
+ec_gf_entrylk(call_frame_t *frame, xlator_t *this, const char *volume,
+              loc_t *loc, const char *basename, entrylk_cmd cmd,
+              entrylk_type type, dict_t *xdata)
+{
+    /* ENTRYLK_UNLOCK is sent with EC_MINIMUM_ONE, every other command
+     * with EC_MINIMUM_ALL. */
+    uint32_t minimum = (cmd == ENTRYLK_UNLOCK) ? EC_MINIMUM_ONE
+                                               : EC_MINIMUM_ALL;
+
+    ec_entrylk(frame, this, -1, minimum, default_entrylk_cbk, NULL, volume,
+               loc, basename, cmd, type, xdata);
+    return 0;
+}
+
+int32_t
+ec_gf_fentrylk(call_frame_t *frame, xlator_t *this, const char *volume,
+               fd_t *fd, const char *basename, entrylk_cmd cmd,
+               entrylk_type type, dict_t *xdata)
+{
+    /* ENTRYLK_UNLOCK is sent with EC_MINIMUM_ONE, every other command
+     * with EC_MINIMUM_ALL. */
+    uint32_t minimum = (cmd == ENTRYLK_UNLOCK) ? EC_MINIMUM_ONE
+                                               : EC_MINIMUM_ALL;
+
+    ec_fentrylk(frame, this, -1, minimum, default_fentrylk_cbk, NULL, volume,
+                fd, basename, cmd, type, xdata);
+    return 0;
+}
+
+int32_t
+ec_gf_fallocate(call_frame_t *frame, xlator_t *this, fd_t *fd, int32_t mode,
+                off_t offset, size_t len, dict_t *xdata)
+{
+    ec_fallocate(frame, this, -1, EC_MINIMUM_MIN, default_fallocate_cbk, NULL,
+                 fd, mode, offset, len, xdata);
+    /* Always 0; default_fallocate_cbk is handed to ec_fallocate(). */
+    return 0;
+}
+
+int32_t
+ec_gf_flush(call_frame_t *frame, xlator_t *this, fd_t *fd, dict_t *xdata)
+{
+    ec_flush(frame, this, -1, EC_MINIMUM_MIN, default_flush_cbk, NULL, fd,
+             xdata);
+    /* Always 0; default_flush_cbk is handed to ec_flush(). */
+    return 0;
+}
+
+int32_t
+ec_gf_fsync(call_frame_t *frame, xlator_t *this, fd_t *fd, int32_t datasync,
+            dict_t *xdata)
+{
+    ec_fsync(frame, this, -1, EC_MINIMUM_MIN, default_fsync_cbk, NULL, fd,
+             datasync, xdata);
+    /* Always 0; default_fsync_cbk is handed to ec_fsync(). */
+    return 0;
+}
+
+int32_t
+ec_gf_fsyncdir(call_frame_t *frame, xlator_t *this, fd_t *fd, int32_t datasync,
+               dict_t *xdata)
+{
+    ec_fsyncdir(frame, this, -1, EC_MINIMUM_MIN, default_fsyncdir_cbk, NULL, fd,
+                datasync, xdata);
+    /* Always 0; default_fsyncdir_cbk is handed to ec_fsyncdir(). */
+    return 0;
+}
+
+/* Copy ec->xl_list into 'subvols' and return the number of subvolumes;
+ * for MARKER_XTIME_TYPE the NOTFOUND and ENOTCONN gauges are set to 0. */
+int
+ec_marker_populate_args(call_frame_t *frame, int type, int *gauge,
+                        xlator_t **subvols)
+{
+    ec_t *priv = frame->this->private;
+
+    if (type == MARKER_XTIME_TYPE) {
+        /* Don't error out on ENOENT/ENOTCONN */
+        gauge[MCNT_ENOTCONN] = 0;
+        gauge[MCNT_NOTFOUND] = 0;
+    }
+    memcpy(subvols, priv->xl_list, sizeof(*subvols) * priv->nodes);
+
+    return priv->nodes;
+}
+
+/* Answer a getxattr on GF_HEAL_INFO here. Returns -1, leaving the frame
+ * alone, for any other name; returns 0 after unwinding the frame. */
+int32_t
+ec_handle_heal_commands(call_frame_t *frame, xlator_t *this, loc_t *loc,
+                        const char *name, dict_t *xdata)
+{
+    dict_t *reply = NULL;
+    int op_errno = ENOMEM;
+    int op_ret = -1;
+
+    if ((name == NULL) || (strcmp(name, GF_HEAL_INFO) != 0))
+        return -1;
+
+    op_errno = -ec_get_heal_info(this, loc, &reply);
+    if (op_errno <= 0)
+        op_errno = op_ret = 0;
+    STACK_UNWIND_STRICT(getxattr, frame, op_ret, op_errno, reply, NULL);
+    if (reply != NULL)
+        dict_unref(reply);
+    return 0;
+}
+
+int32_t
+ec_gf_getxattr(call_frame_t *frame, xlator_t *this, loc_t *loc,
+               const char *name, dict_t *xdata)
+{
+    int error = 0;
+    ec_t *ec = this->private;
+    int32_t fop_flags = EC_MINIMUM_ONE;
+    /* EC_XATTR_HEAL is exempt from the internal-xattr check below. */
+    if (name && strcmp(name, EC_XATTR_HEAL) != 0) {
+        EC_INTERNAL_XATTR_OR_GOTO(name, NULL, error, out);
+    }
+
+    if (ec_handle_heal_commands(frame, this, loc, name, xdata) == 0)
+        return 0;
+
+    if (cluster_handle_marker_getxattr(frame, loc, name, ec->vol_uuid, NULL,
+                                       ec_marker_populate_args) == 0)
+        return 0;
+
+    if (name && ((fnmatch(GF_XATTR_STIME_PATTERN, name, 0) == 0) ||
+                 XATTR_IS_NODE_UUID(name) || XATTR_IS_NODE_UUID_LIST(name))) {
+        fop_flags = EC_MINIMUM_ALL;
+    }
+
+    ec_getxattr(frame, this, -1, fop_flags, default_getxattr_cbk, NULL, loc,
+                name, xdata);
+
+    return 0;
+out:
+    error = ENODATA; /* replaces whatever was stored before the jump */
+    STACK_UNWIND_STRICT(getxattr, frame, -1, error, NULL, NULL);
+    return 0;
+}
+
+int32_t
+ec_gf_fgetxattr(call_frame_t *frame, xlator_t *this, fd_t *fd, const char *name,
+                dict_t *xdata)
+{
+    int error = 0;
+
+    EC_INTERNAL_XATTR_OR_GOTO(name, NULL, error, out);
+
+    ec_fgetxattr(frame, this, -1, EC_MINIMUM_ONE, default_fgetxattr_cbk, NULL,
+                 fd, name, xdata);
+    return 0;
+out:
+    error = ENODATA;
+    STACK_UNWIND_STRICT(fgetxattr, frame, -1, error, NULL, NULL);
+    return 0;
+}
+
+int32_t
+ec_gf_inodelk(call_frame_t *frame, xlator_t *this, const char *volume,
+              loc_t *loc, int32_t cmd, struct gf_flock *flock, dict_t *xdata)
+{
+    int32_t fop_flags = EC_MINIMUM_ALL;
+    /* F_UNLCK requests EC_MINIMUM_ONE; all other types EC_MINIMUM_ALL. */
+    if (flock->l_type == F_UNLCK)
+        fop_flags = EC_MINIMUM_ONE;
+
+    ec_inodelk(frame, this, &frame->root->lk_owner, -1, fop_flags,
+               default_inodelk_cbk, NULL, volume, loc, cmd, flock, xdata);
+
+    return 0;
+}
+
+int32_t
+ec_gf_finodelk(call_frame_t *frame, xlator_t *this, const char *volume,
+               fd_t *fd, int32_t cmd, struct gf_flock *flock, dict_t *xdata)
+{
+    int32_t fop_flags = EC_MINIMUM_ALL;
+    /* F_UNLCK requests EC_MINIMUM_ONE; all other types EC_MINIMUM_ALL. */
+    if (flock->l_type == F_UNLCK)
+        fop_flags = EC_MINIMUM_ONE;
+    ec_finodelk(frame, this, &frame->root->lk_owner, -1, fop_flags,
+                default_finodelk_cbk, NULL, volume, fd, cmd, flock, xdata);
+
+    return 0;
+}
+
+int32_t
+ec_gf_link(call_frame_t *frame, xlator_t *this, loc_t *oldloc, loc_t *newloc,
+           dict_t *xdata)
+{
+    ec_link(frame, this, -1, EC_MINIMUM_MIN, default_link_cbk, NULL, oldloc,
+            newloc, xdata);
+
+    return 0;
+}
+
+int32_t
+ec_gf_lk(call_frame_t *frame, xlator_t *this, fd_t *fd, int32_t cmd,
+         struct gf_flock *flock, dict_t *xdata)
+{
+    int32_t fop_flags = EC_MINIMUM_ALL;
+    /* F_UNLCK requests EC_MINIMUM_ONE; all other types EC_MINIMUM_ALL. */
+    if (flock->l_type == F_UNLCK)
+        fop_flags = EC_MINIMUM_ONE;
+    ec_lk(frame, this, -1, fop_flags, default_lk_cbk, NULL, fd, cmd, flock,
+          xdata);
+
+    return 0;
+}
+
+int32_t
+ec_gf_lookup(call_frame_t *frame, xlator_t *this, loc_t *loc, dict_t *xdata)
+{
+    ec_lookup(frame, this, -1, EC_MINIMUM_MIN, default_lookup_cbk, NULL, loc,
+              xdata);
+
+    return 0;
+}
+
+int32_t
+ec_gf_mkdir(call_frame_t *frame, xlator_t *this, loc_t *loc, mode_t mode,
+            mode_t umask, dict_t *xdata)
+{
+    ec_mkdir(frame, this, -1, EC_MINIMUM_MIN, default_mkdir_cbk, NULL, loc,
+             mode, umask, xdata);
+
+    return 0;
+}
+
+int32_t
+ec_gf_mknod(call_frame_t *frame, xlator_t *this, loc_t *loc, mode_t mode,
+            dev_t rdev, mode_t umask, dict_t *xdata)
+{
+    ec_mknod(frame, this, -1, EC_MINIMUM_MIN, default_mknod_cbk, NULL, loc,
+             mode, rdev, umask, xdata);
+
+    return 0;
+}
+
+int32_t
+ec_gf_open(call_frame_t *frame, xlator_t *this, loc_t *loc, int32_t flags,
+           fd_t *fd, dict_t *xdata)
+{
+    ec_open(frame, this, -1, EC_MINIMUM_MIN, default_open_cbk, NULL, loc, flags,
+            fd, xdata);
+
+    return 0;
+}
+
+int32_t
+ec_gf_opendir(call_frame_t *frame, xlator_t *this, loc_t *loc, fd_t *fd,
+              dict_t *xdata)
+{
+    ec_opendir(frame, this, -1, EC_MINIMUM_MIN, default_opendir_cbk, NULL, loc,
+               fd, xdata);
+
+    return 0;
+}
+
+int32_t
+ec_gf_readdir(call_frame_t *frame, xlator_t *this, fd_t *fd, size_t size,
+              off_t offset, dict_t *xdata)
+{
+    ec_readdir(frame, this, -1, EC_MINIMUM_ONE, default_readdir_cbk, NULL, fd,
+               size, offset, xdata);
+
+    return 0;
+}
+
+int32_t
+ec_gf_readdirp(call_frame_t *frame, xlator_t *this, fd_t *fd, size_t size,
+               off_t offset, dict_t *xdata)
+{
+    ec_readdirp(frame, this, -1, EC_MINIMUM_ONE, default_readdirp_cbk, NULL, fd,
+                size, offset, xdata);
+
+    return 0;
+}
+
+int32_t
+ec_gf_readlink(call_frame_t *frame, xlator_t *this, loc_t *loc, size_t size,
+               dict_t *xdata)
+{
+    ec_readlink(frame, this, -1, EC_MINIMUM_ONE, default_readlink_cbk, NULL,
+                loc, size, xdata);
+
+    return 0;
+}
+
+int32_t
+ec_gf_readv(call_frame_t *frame, xlator_t *this, fd_t *fd, size_t size,
+            off_t offset, uint32_t flags, dict_t *xdata)
+{
+    ec_readv(frame, this, -1, EC_MINIMUM_MIN, default_readv_cbk, NULL, fd, size,
+             offset, flags, xdata);
+
+    return 0;
+}
+
+int32_t
+ec_gf_removexattr(call_frame_t *frame, xlator_t *this, loc_t *loc,
+                  const char *name, dict_t *xdata)
+{
+    int error = 0;
+
+    EC_INTERNAL_XATTR_OR_GOTO(name, xdata, error, out);
+
+    ec_removexattr(frame, this, -1, EC_MINIMUM_MIN, default_removexattr_cbk,
+                   NULL, loc, name, xdata);
+
+    return 0;
+out:
+    STACK_UNWIND_STRICT(removexattr, frame, -1, error, NULL);
+    return 0;
+}
+
+int32_t
+ec_gf_fremovexattr(call_frame_t *frame, xlator_t *this, fd_t *fd,
+                   const char *name, dict_t *xdata)
+{
+    int error = 0;
+
+    EC_INTERNAL_XATTR_OR_GOTO(name, xdata, error, out);
+
+    ec_fremovexattr(frame, this, -1, EC_MINIMUM_MIN, default_fremovexattr_cbk,
+                    NULL, fd, name, xdata);
+
+    return 0;
+out:
+    STACK_UNWIND_STRICT(fremovexattr, frame, -1, error, NULL);
+    return 0;
+}
+
+int32_t
+ec_gf_rename(call_frame_t *frame, xlator_t *this, loc_t *oldloc, loc_t *newloc,
+             dict_t *xdata)
+{
+    ec_rename(frame, this, -1, EC_MINIMUM_MIN, default_rename_cbk, NULL, oldloc,
+              newloc, xdata);
+
+    return 0;
+}
+
+int32_t
+ec_gf_rmdir(call_frame_t *frame, xlator_t *this, loc_t *loc, int xflags,
+            dict_t *xdata)
+{
+    ec_rmdir(frame, this, -1, EC_MINIMUM_MIN, default_rmdir_cbk, NULL, loc,
+             xflags, xdata);
+
+    return 0;
+}
+
+int32_t
+ec_gf_setattr(call_frame_t *frame, xlator_t *this, loc_t *loc,
+              struct iatt *stbuf, int32_t valid, dict_t *xdata)
+{
+    ec_setattr(frame, this, -1, EC_MINIMUM_MIN, default_setattr_cbk, NULL, loc,
+               stbuf, valid, xdata);
+
+    return 0;
+}
+
+int32_t
+ec_gf_fsetattr(call_frame_t *frame, xlator_t *this, fd_t *fd,
+               struct iatt *stbuf, int32_t valid, dict_t *xdata)
+{
+    ec_fsetattr(frame, this, -1, EC_MINIMUM_MIN, default_fsetattr_cbk, NULL, fd,
+                stbuf, valid, xdata);
+
+    return 0;
+}
+
+int32_t
+ec_gf_setxattr(call_frame_t *frame, xlator_t *this, loc_t *loc, dict_t *dict,
+               int32_t flags, dict_t *xdata)
+{
+    int error = 0;
+
+    EC_INTERNAL_XATTR_OR_GOTO("", dict, error, out);
+
+    ec_setxattr(frame, this, -1, EC_MINIMUM_MIN, default_setxattr_cbk, NULL,
+                loc, dict, flags, xdata);
+
+    return 0;
+out:
+    STACK_UNWIND_STRICT(setxattr, frame, -1, error, NULL);
+    return 0;
+}
+
+int32_t
+ec_gf_fsetxattr(call_frame_t *frame, xlator_t *this, fd_t *fd, dict_t *dict,
+                int32_t flags, dict_t *xdata)
+{
+    int error = 0;
+
+    EC_INTERNAL_XATTR_OR_GOTO("", dict, error, out);
+
+    ec_fsetxattr(frame, this, -1, EC_MINIMUM_MIN, default_fsetxattr_cbk, NULL,
+                 fd, dict, flags, xdata);
+
+    return 0;
+out:
+    STACK_UNWIND_STRICT(fsetxattr, frame, -1, error, NULL);
+    return 0;
+}
+
+int32_t
+ec_gf_stat(call_frame_t *frame, xlator_t *this, loc_t *loc, dict_t *xdata)
+{
+    ec_stat(frame, this, -1, EC_MINIMUM_MIN, default_stat_cbk, NULL, loc,
+            xdata);
+
+    return 0;
+}
+
+int32_t
+ec_gf_fstat(call_frame_t *frame, xlator_t *this, fd_t *fd, dict_t *xdata)
+{
+    ec_fstat(frame, this, -1, EC_MINIMUM_MIN, default_fstat_cbk, NULL, fd,
+             xdata);
+
+    return 0;
+}
+
+int32_t
+ec_gf_statfs(call_frame_t *frame, xlator_t *this, loc_t *loc, dict_t *xdata)
+{
+    ec_statfs(frame, this, -1, EC_MINIMUM_MIN, default_statfs_cbk, NULL, loc,
+              xdata);
+
+    return 0;
+}
+
+int32_t
+ec_gf_symlink(call_frame_t *frame, xlator_t *this, const char *linkname,
+              loc_t *loc, mode_t umask, dict_t *xdata)
+{
+    ec_symlink(frame, this, -1, EC_MINIMUM_MIN, default_symlink_cbk, NULL,
+               linkname, loc, umask, xdata);
+
+    return 0;
+}
+
+int32_t
+ec_gf_truncate(call_frame_t *frame, xlator_t *this, loc_t *loc, off_t offset,
+               dict_t *xdata)
+{
+    ec_truncate(frame, this, -1, EC_MINIMUM_MIN, default_truncate_cbk, NULL,
+                loc, offset, xdata);
+
+    return 0;
+}
+
+int32_t
+ec_gf_ftruncate(call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset,
+                dict_t *xdata)
+{
+    ec_ftruncate(frame, this, -1, EC_MINIMUM_MIN, default_ftruncate_cbk, NULL,
+                 fd, offset, xdata);
+
+    return 0;
+}
+
+int32_t
+ec_gf_unlink(call_frame_t *frame, xlator_t *this, loc_t *loc, int xflags,
+             dict_t *xdata)
+{
+    ec_unlink(frame, this, -1, EC_MINIMUM_MIN, default_unlink_cbk, NULL, loc,
+              xflags, xdata);
+
+    return 0;
+}
+
+int32_t
+ec_gf_writev(call_frame_t *frame, xlator_t *this, fd_t *fd,
+             struct iovec *vector, int32_t count, off_t offset, uint32_t flags,
+             struct iobref *iobref, dict_t *xdata)
+{
+    ec_writev(frame, this, -1, EC_MINIMUM_MIN, default_writev_cbk, NULL, fd,
+              vector, count, offset, flags, iobref, xdata);
+
+    return 0;
+}
+
+int32_t
+ec_gf_xattrop(call_frame_t *frame, xlator_t *this, loc_t *loc,
+              gf_xattrop_flags_t optype, dict_t *xattr, dict_t *xdata)
+{
+    ec_xattrop(frame, this, -1, EC_MINIMUM_MIN, default_xattrop_cbk, NULL, loc,
+               optype, xattr, xdata);
+
+    return 0;
+}
+
+int32_t
+ec_gf_fxattrop(call_frame_t *frame, xlator_t *this, fd_t *fd,
+               gf_xattrop_flags_t optype, dict_t *xattr, dict_t *xdata)
+{
+    ec_fxattrop(frame, this, -1, EC_MINIMUM_MIN, default_fxattrop_cbk, NULL, fd,
+                optype, xattr, xdata);
+
+    return 0;
+}
+
+int32_t
+ec_gf_zerofill(call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset,
+               off_t len, dict_t *xdata)
+{
+    /* No ec_* fop is invoked for zerofill: always fail with ENOTSUP. */
+    default_zerofill_failure_cbk(frame, ENOTSUP);
+    return 0;
+}
+
+int32_t
+ec_gf_seek(call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset,
+           gf_seek_what_t what, dict_t *xdata)
+{
+    ec_seek(frame, this, -1, EC_MINIMUM_ONE, default_seek_cbk, NULL, fd, offset,
+            what, xdata);
+
+    return 0;
+}
+
+int32_t
+ec_gf_ipc(call_frame_t *frame, xlator_t *this, int32_t op, dict_t *xdata)
+{
+    ec_ipc(frame, this, -1, EC_MINIMUM_MIN, default_ipc_cbk, NULL, op, xdata);
+    return 0;
+}
+
+int32_t
+ec_gf_forget(xlator_t *this, inode_t *inode)
+{
+    uint64_t value = 0;
+    ec_inode_t *ctx = NULL;
+    /* Remove this xlator's context from the inode and free it, if any. */
+    if ((inode_ctx_del(inode, this, &value) == 0) && (value != 0)) {
+        ctx = (ec_inode_t *)(uintptr_t)value;
+        /* We can only forget an inode if it has been unlocked, so the stripe
+         * cache should also be empty. */
+        GF_ASSERT(list_empty(&ctx->stripe_cache.lru));
+        GF_FREE(ctx);
+    }
+
+    return 0;
+}
+
+void
+ec_gf_release_fd(xlator_t *this, fd_t *fd)
+{
+    uint64_t value = 0;
+    ec_fd_t *ctx = NULL;
+    /* Remove this xlator's fd context, wipe its loc and free it, if any. */
+    if ((fd_ctx_del(fd, this, &value) == 0) && (value != 0)) {
+        ctx = (ec_fd_t *)(uintptr_t)value;
+        loc_wipe(&ctx->loc);
+        GF_FREE(ctx);
+    }
+}
+
+int32_t
+ec_gf_release(xlator_t *this, fd_t *fd)
+{
+    ec_gf_release_fd(this, fd);
+
+    return 0;
+}
+
+int32_t
+ec_gf_releasedir(xlator_t *this, fd_t *fd)
+{
+    ec_gf_release_fd(this, fd);
+
+    return 0;
+}
+
+int32_t
+ec_dump_private(xlator_t *this)
+{
+    ec_t *ec = NULL;
+    char key_prefix[GF_DUMP_MAX_BUF_LEN];
+    char tmp[65]; /* scratch buffer handed to ec_bin() for the mask strings */
+    /* Write the ec_t settings and counters to the process dump; returns 0. */
+    GF_ASSERT(this);
+
+    ec = this->private;
+    GF_ASSERT(ec);
+    /* First section, keyed "<type>.<name>": configuration and state. */
+    snprintf(key_prefix, GF_DUMP_MAX_BUF_LEN, "%s.%s", this->type, this->name);
+    gf_proc_dump_add_section("%s", key_prefix);
+    gf_proc_dump_write("up", "%u", ec->up);
+    gf_proc_dump_write("nodes", "%u", ec->nodes);
+    gf_proc_dump_write("redundancy", "%u", ec->redundancy);
+    gf_proc_dump_write("fragment_size", "%u", ec->fragment_size);
+    gf_proc_dump_write("stripe_size", "%u", ec->stripe_size);
+    gf_proc_dump_write("childs_up", "%u", ec->xl_up_count);
+    gf_proc_dump_write("childs_up_mask", "%s",
+                       ec_bin(tmp, sizeof(tmp), ec->xl_up, ec->nodes));
+    if (ec->read_mask) {
+        gf_proc_dump_write("read-mask", "%s",
+                           ec_bin(tmp, sizeof(tmp), ec->read_mask, ec->nodes));
+    }
+    gf_proc_dump_write("background-heals", "%d", ec->background_heals);
+    gf_proc_dump_write("heal-wait-qlength", "%d", ec->heal_wait_qlen);
+    gf_proc_dump_write("self-heal-window-size", "%" PRIu32,
+                       ec->self_heal_window_size);
+    gf_proc_dump_write("healers", "%d", ec->healers);
+    gf_proc_dump_write("heal-waiters", "%d", ec->heal_waiters);
+    gf_proc_dump_write("read-policy", "%s", ec_read_policies[ec->read_policy]);
+    gf_proc_dump_write("parallel-writes", "%d", ec->parallel_writes);
+    gf_proc_dump_write("quorum-count", "%u", ec->quorum_count);
+    /* Second section: stripe-cache and self-heal (shd) atomic counters. */
+    snprintf(key_prefix, GF_DUMP_MAX_BUF_LEN, "%s.%s.stats.stripe_cache",
+             this->type, this->name);
+    gf_proc_dump_add_section("%s", key_prefix);
+
+    gf_proc_dump_write("hits", "%" GF_PRI_ATOMIC,
+                       GF_ATOMIC_GET(ec->stats.stripe_cache.hits));
+    gf_proc_dump_write("misses", "%" GF_PRI_ATOMIC,
+                       GF_ATOMIC_GET(ec->stats.stripe_cache.misses));
+    gf_proc_dump_write("updates", "%" GF_PRI_ATOMIC,
+                       GF_ATOMIC_GET(ec->stats.stripe_cache.updates));
+    gf_proc_dump_write("invalidations", "%" GF_PRI_ATOMIC,
+                       GF_ATOMIC_GET(ec->stats.stripe_cache.invals));
+    gf_proc_dump_write("evicts", "%" GF_PRI_ATOMIC,
+                       GF_ATOMIC_GET(ec->stats.stripe_cache.evicts));
+    gf_proc_dump_write("allocations", "%" GF_PRI_ATOMIC,
+                       GF_ATOMIC_GET(ec->stats.stripe_cache.allocs));
+    gf_proc_dump_write("errors", "%" GF_PRI_ATOMIC,
+                       GF_ATOMIC_GET(ec->stats.stripe_cache.errors));
+    gf_proc_dump_write("heals-attempted", "%" GF_PRI_ATOMIC,
+                       GF_ATOMIC_GET(ec->stats.shd.attempted));
+    gf_proc_dump_write("heals-completed", "%" GF_PRI_ATOMIC,
+                       GF_ATOMIC_GET(ec->stats.shd.completed));
+
+    return 0;
+}
+
+struct xlator_fops fops = {.lookup = ec_gf_lookup,
+                           .stat = ec_gf_stat,
+                           .fstat = ec_gf_fstat,
+                           .truncate = ec_gf_truncate,
+                           .ftruncate = ec_gf_ftruncate,
+                           .access = ec_gf_access,
+                           .readlink = ec_gf_readlink,
+                           .mknod = ec_gf_mknod,
+                           .mkdir = ec_gf_mkdir,
+                           .unlink = ec_gf_unlink,
+                           .rmdir = ec_gf_rmdir,
+                           .symlink = ec_gf_symlink,
+                           .rename = ec_gf_rename,
+                           .link = ec_gf_link,
+                           .create = ec_gf_create,
+                           .open = ec_gf_open,
+                           .readv = ec_gf_readv,
+                           .writev = ec_gf_writev,
+                           .flush = ec_gf_flush,
+                           .fsync = ec_gf_fsync,
+                           .opendir = ec_gf_opendir,
+                           .readdir = ec_gf_readdir,
+                           .readdirp = ec_gf_readdirp,
+                           .fsyncdir = ec_gf_fsyncdir,
+                           .statfs = ec_gf_statfs,
+                           .setxattr = ec_gf_setxattr,
+                           .getxattr = ec_gf_getxattr,
+                           .fsetxattr = ec_gf_fsetxattr,
+                           .fgetxattr = ec_gf_fgetxattr,
+                           .removexattr = ec_gf_removexattr,
+                           .fremovexattr = ec_gf_fremovexattr,
+                           .lk = ec_gf_lk,
+                           .inodelk = ec_gf_inodelk,
+                           .finodelk = ec_gf_finodelk,
+                           .entrylk = ec_gf_entrylk,
+                           .fentrylk = ec_gf_fentrylk,
+                           .xattrop = ec_gf_xattrop,
+                           .fxattrop = ec_gf_fxattrop,
+                           .setattr = ec_gf_setattr,
+                           .fsetattr = ec_gf_fsetattr,
+                           .fallocate = ec_gf_fallocate,
+                           .discard = ec_gf_discard,
+                           .zerofill = ec_gf_zerofill,
+                           .seek = ec_gf_seek,
+                           .ipc = ec_gf_ipc};
+
+struct xlator_cbks cbks = {.forget = ec_gf_forget,
+                           .release = ec_gf_release,
+                           .releasedir = ec_gf_releasedir};
+
+struct xlator_dumpops dumpops = {.priv = ec_dump_private};
+
+struct volume_options options[] = {
+    {.key = {"redundancy"},
+     .type = GF_OPTION_TYPE_INT,
+     .default_value = "{{ volume.redundancy }}",
+     .description = "Maximum number of bricks that can fail "
+                    "simultaneously without losing data."},
+    {
+        .key = {"self-heal-daemon"},
+        .type = GF_OPTION_TYPE_BOOL,
+        .description = "self-heal daemon enable/disable",
+        .default_value = "enable",
+        .op_version = {GD_OP_VERSION_3_7_0},
+        .flags = OPT_FLAG_SETTABLE | OPT_FLAG_DOC,
+        .tags = {"disperse"},
+    },
+    {.key = {"iam-self-heal-daemon"},
+     .type = GF_OPTION_TYPE_BOOL,
+     .default_value = "off",
+     .description = "This option differentiates if the disperse "
+                    "translator is running as part of self-heal-daemon "
+                    "or not."},
+    {.key = {"eager-lock"},
+     .type = GF_OPTION_TYPE_BOOL,
+     .default_value = "on",
+     .op_version = {GD_OP_VERSION_3_7_10},
+     .flags = OPT_FLAG_SETTABLE | OPT_FLAG_CLIENT_OPT | OPT_FLAG_DOC,
+     .tags = {"disperse"},
+     .description = "Enable/Disable eager lock for regular files on a "
+                    "disperse volume. If a fop takes a lock and completes "
+                    "its operation, it waits for next 1 second before "
+                    "releasing the lock, to see if the lock can be reused "
+                    "for next fop from the same client. If ec finds any lock "
+                    "contention within 1 second it releases the lock "
+                    "immediately before time expires. This improves the "
+                    "performance of file operations. However, as it takes "
+                    "lock on first brick, for few operations like read, "
+                    "discovery of lock contention might take long time and "
+                    "can actually degrade the performance. If eager lock is "
+                    "disabled, lock will be released as soon as fop "
+                    "completes."},
+    {.key = {"other-eager-lock"},
+     .type = GF_OPTION_TYPE_BOOL,
+     .default_value = "on",
+     .op_version = {GD_OP_VERSION_3_13_0},
+     .flags = OPT_FLAG_SETTABLE | OPT_FLAG_CLIENT_OPT | OPT_FLAG_DOC,
+     .tags = {"disperse"},
+     .description = "It's equivalent to the eager-lock option but for non "
+                    "regular files."},
+    {.key = {"eager-lock-timeout"},
+     .type = GF_OPTION_TYPE_INT,
+     .min = 1,
+     .max = 60,
+     .default_value = "1",
+     .op_version = {GD_OP_VERSION_4_0_0},
+     .flags = OPT_FLAG_SETTABLE | OPT_FLAG_CLIENT_OPT | OPT_FLAG_DOC,
+     .tags = {"disperse", "locks", "timeout"},
+     .description = "Maximum time (in seconds) that a lock on an inode is "
+                    "kept held if no new operations on the inode are "
+                    "received."},
+    {.key = {"other-eager-lock-timeout"},
+     .type = GF_OPTION_TYPE_INT,
+     .min = 1,
+     .max = 60,
+     .default_value = "1",
+     .op_version = {GD_OP_VERSION_4_0_0},
+     .flags = OPT_FLAG_SETTABLE | OPT_FLAG_CLIENT_OPT | OPT_FLAG_DOC,
+     .tags = {"disperse", "locks", "timeout"},
+     .description = "It's equivalent to eager-lock-timeout option but for "
+                    "non regular files."},
+    {
+        .key = {"background-heals"},
+        .type = GF_OPTION_TYPE_INT,
+        .min = 0, /*Disabling background heals*/
+        .max = 256,
+        .default_value = "8",
+        .op_version = {GD_OP_VERSION_3_7_3},
+        .flags = OPT_FLAG_SETTABLE | OPT_FLAG_CLIENT_OPT | OPT_FLAG_DOC,
+        .tags = {"disperse"},
+        .description = "This option can be used to control number of parallel"
+                       " heals",
+    },
+    {
+        .key = {"heal-wait-qlength"},
+        .type = GF_OPTION_TYPE_INT,
+        .min = 0,
+        .max =
+            65536, /*Around 100MB as of now with sizeof(ec_fop_data_t) at 1800*/
+        .default_value = "128",
+        .op_version = {GD_OP_VERSION_3_7_3},
+        .flags = OPT_FLAG_SETTABLE | OPT_FLAG_CLIENT_OPT | OPT_FLAG_DOC,
+        .tags = {"disperse"},
+        .description = "This option can be used to control number of heals"
+                       " that can wait",
+    },
+    {.key = {"heal-timeout"},
+     .type = GF_OPTION_TYPE_INT,
+     .min = 60,
+     .max = INT_MAX,
+     .default_value = "600",
+     .op_version = {GD_OP_VERSION_3_7_3},
+     .flags = OPT_FLAG_SETTABLE,
+     .tags = {"disperse"},
+     .description = "time interval for checking the need to self-heal "
+                    "in self-heal-daemon"},
+    {
+        .key = {"read-policy"},
+        .type = GF_OPTION_TYPE_STR,
+        .value = {"round-robin", "gfid-hash"},
+        .default_value = "gfid-hash",
+        .op_version = {GD_OP_VERSION_3_7_6},
+        .flags = OPT_FLAG_SETTABLE | OPT_FLAG_CLIENT_OPT | OPT_FLAG_DOC,
+        .tags = {"disperse"},
+        .description =
+            "inode-read fops happen only on 'k' number of bricks in"
+            " n=k+m disperse subvolume. 'round-robin' selects the read"
+            " subvolume using round-robin algo. 'gfid-hash' selects read"
+            " subvolume based on hash of the gfid of that file/directory.",
+    },
+    {.key = {"shd-max-threads"},
+     .type = GF_OPTION_TYPE_INT,
+     .min = 1,
+     .max = 64,
+     .default_value = "1",
+     .op_version = {GD_OP_VERSION_3_9_0},
+     .flags = OPT_FLAG_SETTABLE | OPT_FLAG_DOC,
+     .tags = {"disperse"},
+     .description = "Maximum number of parallel heals SHD can do per local "
+                    "brick.  This can substantially lower heal times, "
+                    "but can also crush your bricks if you don't have "
+                    "the storage hardware to support this."},
+    {.key = {"shd-wait-qlength"},
+     .type = GF_OPTION_TYPE_INT,
+     .min = 1,
+     .max = 65536,
+     .default_value = "1024",
+     .op_version = {GD_OP_VERSION_3_9_0},
+     .flags = OPT_FLAG_SETTABLE | OPT_FLAG_DOC,
+     .tags = {"disperse"},
+     .description = "This option can be used to control number of heals"
+                    " that can wait in SHD per subvolume"},
+    {.key = {"cpu-extensions"},
+     .type = GF_OPTION_TYPE_STR,
+     .value = {"none", "auto", "x64", "sse", "avx"},
+     .default_value = "auto",
+     .op_version = {GD_OP_VERSION_3_9_0},
+     .flags = OPT_FLAG_SETTABLE | OPT_FLAG_CLIENT_OPT | OPT_FLAG_DOC,
+     .tags = {"disperse"},
+     .description = "force the cpu extensions to be used to accelerate the "
+                    "galois field computations."},
+    {.key = {"self-heal-window-size"},
+     .type = GF_OPTION_TYPE_INT,
+     .min = 1,
+     .max = 1024,
+     .default_value = "1",
+     .op_version = {GD_OP_VERSION_3_11_0},
+     .flags = OPT_FLAG_SETTABLE | OPT_FLAG_CLIENT_OPT | OPT_FLAG_DOC,
+     .tags = {"disperse"},
+     .description = "Maximum number blocks(128KB) per file for which "
+                    "self-heal process would be applied simultaneously."},
+    {.key = {"optimistic-change-log"},
+     .type = GF_OPTION_TYPE_BOOL,
+     .default_value = "on",
+     .op_version = {GD_OP_VERSION_3_10_1},
+     .flags = OPT_FLAG_SETTABLE | OPT_FLAG_CLIENT_OPT,
+     .tags = {"disperse"},
+     .description = "Set/Unset dirty flag for every update fop at the start "
+                    "of the fop. If OFF, this option impacts performance of "
+                    "entry  operations or metadata operations as it will "
+                    "set dirty flag at the start and unset it at the end of "
+                    "ALL update fop. If ON and all the bricks are good, "
+                    "dirty flag will be set at the start only for file fops. "
+                    "For metadata and entry fops dirty flag will not be set "
+                    "at the start, if all the bricks are good. This does "
+                    "not impact performance for metadata operations and "
+                    "entry operation but has a very small window to miss "
+                    "marking entry as dirty in case it is required to be "
+                    "healed"},
+    {.key = {"parallel-writes"},
+     .type = GF_OPTION_TYPE_BOOL,
+     .default_value = "on",
+     .description = "This controls if writes can be wound in parallel as long "
+                    "as it doesn't modify same stripes"},
+    {.key = {"stripe-cache"},
+     .type = GF_OPTION_TYPE_INT,
+     .min = 0, /*Disabling stripe_cache*/
+     .max = EC_STRIPE_CACHE_MAX_SIZE,
+     .default_value = "4",
+     .description = "This option will keep the last stripe of write fop "
+                    "in memory. If next write falls in this stripe, we need "
+                    "not to read it again from backend and we can save READ "
+                    "fop going over the network. This will improve "
+                    "performance, specially for sequential writes. However, "
+                    "this will also lead to extra memory consumption, maximum "
+                    "(cache size * stripe size) Bytes per open file."},
+    {
+        .key = {"quorum-count"},
+        .type = GF_OPTION_TYPE_INT,
+        .default_value = "0",
+        .description =
+            "This option can be used to define how many successes on "
+            "the bricks constitute a success to the application. This"
+            " count should be in the range "
+            "[disperse-data-count,  disperse-count] (inclusive)",
+    },
+    {
+        .key = {"ec-read-mask"},
+        .type = GF_OPTION_TYPE_STR,
+        .default_value = NULL,
+        .description = "This option can be used to choose which bricks can be"
+                       " used for reading data/metadata of a file/directory",
+    },
+    {
+        .key = {NULL}, /* last entry of the table: NULL key */
+    },
+};
+
+xlator_api_t xlator_api = {
+    .init = init,
+    .fini = fini,
+    .notify = notify,
+    .reconfigure = reconfigure,
+    .mem_acct_init = mem_acct_init,
+    .op_version = {1},
+    .dumpops = &dumpops,
+    .fops = &fops,
+    .cbks = &cbks,
+    .options = options,
+    .identifier = "disperse",
+    .category = GF_MAINTAINED,
+};
diff --git a/xlators/cluster/ec/src/ec.h b/xlators/cluster/ec/src/ec.h
new file mode 100644
index 00000000000..6f6de6d5981
--- /dev/null
+++ b/xlators/cluster/ec/src/ec.h
@@ -0,0 +1,34 @@
+/*
+  Copyright (c) 2012-2014 DataLab, s.l. <http://www.datalab.es>
+  This file is part of GlusterFS.
+
+  This file is licensed to you under your choice of the GNU Lesser
+  General Public License, version 3 or any later version (LGPLv3 or
+  later), or the GNU General Public License, version 2 (GPLv2), in all
+  cases as published by the Free Software Foundation.
+*/
+
+#ifndef __EC_H__
+#define __EC_H__
+
+#include "ec-method.h"
+
+#define EC_XATTR_PREFIX "trusted.ec."
+#define EC_XATTR_CONFIG EC_XATTR_PREFIX "config"
+#define EC_XATTR_SIZE EC_XATTR_PREFIX "size"
+#define EC_XATTR_VERSION EC_XATTR_PREFIX "version"
+#define EC_XATTR_HEAL EC_XATTR_PREFIX "heal"
+#define EC_XATTR_HEAL_NEW EC_XATTR_PREFIX "heal-new"
+#define EC_XATTR_DIRTY EC_XATTR_PREFIX "dirty"
+#define EC_STRIPE_CACHE_MAX_SIZE 10
+#define EC_VERSION_SIZE 2
+#define EC_SHD_INODE_LRU_LIMIT 10
+
+#define EC_MAX_FRAGMENTS EC_METHOD_MAX_FRAGMENTS
+/* The maximum number of nodes is derived from the maximum allowed fragments
+ * using the rule that redundancy cannot be equal or greater than the number
+ * of fragments.
+ */
+#define EC_MAX_NODES min(EC_MAX_FRAGMENTS * 2 - 1, EC_METHOD_MAX_NODES)
+
+#endif /* __EC_H__ */
diff --git a/xlators/cluster/ha/src/Makefile.am b/xlators/cluster/ha/src/Makefile.am
deleted file mode 100644
index 069a0dcded2..00000000000
--- a/xlators/cluster/ha/src/Makefile.am
+++ /dev/null
@@ -1,15 +0,0 @@
-xlator_LTLIBRARIES = ha.la
-xlatordir = $(libdir)/glusterfs/$(PACKAGE_VERSION)/xlator/cluster
-
-ha_la_LDFLAGS = -module -avoidversion 
-
-ha_la_SOURCES = ha-helpers.c ha.c
-ha_la_LIBADD = $(top_builddir)/libglusterfs/src/libglusterfs.la
-
-noinst_HEADERS = ha.h
-
-AM_CFLAGS = -fPIC -D_FILE_OFFSET_BITS=64 -D_GNU_SOURCE -Wall -D$(GF_HOST_OS) \
-	    -I$(top_srcdir)/libglusterfs/src -shared -nostartfiles $(GF_CFLAGS)
-
-CLEANFILES = 
-
diff --git a/xlators/cluster/ha/src/ha-helpers.c b/xlators/cluster/ha/src/ha-helpers.c
deleted file mode 100644
index 4bc7d5e20de..00000000000
--- a/xlators/cluster/ha/src/ha-helpers.c
+++ /dev/null
@@ -1,195 +0,0 @@
-/*
-  Copyright (c) 2008-2009 Z RESEARCH, Inc. <http://www.zresearch.com>
-  This file is part of GlusterFS.
-
-  GlusterFS is free software; you can redistribute it and/or modify
-  it under the terms of the GNU General Public License as published
-  by the Free Software Foundation; either version 3 of the License,
-  or (at your option) any later version.
-
-  GlusterFS is distributed in the hope that it will be useful, but
-  WITHOUT ANY WARRANTY; without even the implied warranty of
-  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
-  General Public License for more details.
-
-  You should have received a copy of the GNU General Public License
-  along with this program.  If not, see
-  <http://www.gnu.org/licenses/>.
-*/
-
-#include "xlator.h"
-#include "call-stub.h"
-#include "defaults.h"
-#include "dict.h"
-#include "compat-errno.h"
-#include "ha.h"
-
-#define HA_TRANSPORT_NOTCONN(_ret, _errno, _fd) \
-	((_ret == -1) && (_fd ? (_errno == EBADFD):(_errno == ENOTCONN)))
-
-int ha_alloc_init_fd (call_frame_t *frame, fd_t *fd)
-{
-	ha_local_t *local = NULL;
-	int i = -1;
-	ha_private_t *pvt = NULL;
-	int child_count = 0;
-	int ret = -1;
-	hafd_t *hafdp = NULL;
-	xlator_t *this = NULL;
-	uint64_t tmp_hafdp = 0;
-
-	this = frame->this;
-	local = frame->local;
-	pvt = this->private;
-	child_count = pvt->child_count;
-
-	if (local == NULL) {
-		ret = fd_ctx_get (fd, this, &tmp_hafdp);
-		if (ret < 0) {
-			goto out;
-		}
-		hafdp = (hafd_t *)(long)tmp_hafdp;
-		local = frame->local = CALLOC (1, sizeof (*local));
-		if (local == NULL) {
-			ret = -ENOMEM;
-			goto out;
-		}
-		local->state = CALLOC (1, child_count);
-		if (local->state == NULL) {
-			ret = -ENOMEM;
-			goto out;
-		}
-
-		/* take care of the preferred subvolume */
-		if (pvt->pref_subvol == -1)
-			local->active = hafdp->active;
-		else
-			local->active = pvt->pref_subvol;
-
-		LOCK (&hafdp->lock);
-		memcpy (local->state, hafdp->fdstate, child_count);
-		UNLOCK (&hafdp->lock);
-
-		/* in case the preferred subvolume is down */
-		if ((local->active != -1) && (local->state[local->active] == 0))
-			local->active = -1;
-
-		for (i = 0; i < child_count; i++) {
-			if (local->state[i]) {
-				if (local->active == -1)
-					local->active = i;
-				local->tries++;
-			}
-		}
-		if (local->active == -1) {
-			ret = -ENOTCONN;
-			goto out;
-		}
-		local->fd = fd_ref (fd);
-	}
-	ret = 0;
-out:
-	return ret;
-}
-
-int ha_handle_cbk (call_frame_t *frame, void *cookie, int op_ret, int op_errno) 
-{
-	xlator_t *xl = NULL;
-	ha_private_t *pvt = NULL;
-	xlator_t **children = NULL;
-	int prev_child = -1;
-	hafd_t *hafdp = NULL;
-	int ret = -1;
-	call_stub_t *stub = NULL;
-	ha_local_t *local = NULL;
-	uint64_t tmp_hafdp = 0;
-
-	xl = frame->this;
-	pvt = xl->private;
-	children = pvt->children;
-	prev_child = (long) cookie;
-	local = frame->local;
-
-	if (op_ret == -1) {
-		gf_log (xl->name, GF_LOG_ERROR ,"(child=%s) (op_ret=%d op_errno=%s)",
-			children[prev_child]->name, op_ret, strerror (op_errno));
-	}
-
-	if (HA_TRANSPORT_NOTCONN (op_ret, op_errno, (local->fd))) {
-		ret = 0;
-		if (local->fd) {
-			ret = fd_ctx_get (local->fd, xl, &tmp_hafdp);
-		}
-		hafdp = (hafd_t *)(long)tmp_hafdp;		
-		if (ret == 0) {
-			if (local->fd) {
-				LOCK(&hafdp->lock);
-				hafdp->fdstate[prev_child] = 0;
-				UNLOCK(&hafdp->lock);
-			}
-			local->tries--;
-			if (local->tries != 0) {
-				while (1) {
-					local->active = (local->active + 1) % pvt->child_count;
-					if (local->state[local->active])
-						break;
-				}
-				stub = local->stub;
-				local->stub = NULL;
-				call_resume (stub);
-				return -1;
-			}
-		}
-	}
-	if (local->stub)
-		call_stub_destroy (local->stub);
-	if (local->fd) {
-		FREE (local->state);
-		fd_unref (local->fd);
-	}
-	return 0;
-}
-
-int ha_alloc_init_inode (call_frame_t *frame, inode_t *inode)
-{
-	int i = -1;
-	ha_private_t *pvt = NULL;
-	xlator_t *xl = NULL;
-	int ret = -1;
-	ha_local_t *local = NULL;
-	uint64_t tmp_state = 0;
-
-	xl = frame->this;
-	pvt = xl->private;
-	local = frame->local;
-
-	if (local == NULL) {
-		local = frame->local = CALLOC (1, sizeof (*local));
-		if (local == NULL) {
-			ret = -ENOMEM;
-			goto out;
-		}
-		local->active = pvt->pref_subvol;
-		ret = inode_ctx_get (inode, xl, &tmp_state);
-		if (ret < 0) {
-			goto out;
-		}
-		local->state = (char *)(long)tmp_state;
-		if (local->active != -1 && local->state[local->active] == 0)
-			local->active = -1;
-		for (i = 0; i < pvt->child_count; i++) {
-			if (local->state[i]) {
-				if (local->active == -1)
-					local->active = i;
-				local->tries++;
-			}
-		}
-		if (local->active == -1) {
-			ret = -ENOTCONN;
-			goto out;
-		}
-	}
-	ret = 0;
-out:
-	return ret;
-}
diff --git a/xlators/cluster/ha/src/ha.c b/xlators/cluster/ha/src/ha.c
deleted file mode 100644
index b8670ecfb14..00000000000
--- a/xlators/cluster/ha/src/ha.c
+++ /dev/null
@@ -1,3479 +0,0 @@
-/*
-  Copyright (c) 2008-2009 Z RESEARCH, Inc. <http://www.zresearch.com>
-  This file is part of GlusterFS.
-
-  GlusterFS is free software; you can redistribute it and/or modify
-  it under the terms of the GNU General Public License as published
-  by the Free Software Foundation; either version 3 of the License,
-  or (at your option) any later version.
-
-  GlusterFS is distributed in the hope that it will be useful, but
-  WITHOUT ANY WARRANTY; without even the implied warranty of
-  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
-  General Public License for more details.
-
-  You should have received a copy of the GNU General Public License
-  along with this program.  If not, see
-  <http://www.gnu.org/licenses/>.
-*/
-
-/* generate errors randomly, code is simple now, better alogorithm
- * can be written to decide what error to be returned and when
- */
-
-#ifndef _CONFIG_H
-#define _CONFIG_H
-#include "config.h"
-#endif
-
-#include "xlator.h"
-#include "call-stub.h"
-#include "defaults.h"
-#include "dict.h"
-#include "compat-errno.h"
-#include "ha.h"
-
-/*
- * TODO:
- * - dbench fails if ha over server side afr
- * - lock calls - lock on all subvols.
- * - support preferred-subvolume option. code already there.
- * - do not alloc the call-stub in case only one subvol is up.
- */
-
-int
-ha_forget (xlator_t *this,
-	   inode_t *inode)
-{
-	uint64_t stateino = 0;
-	char *state = NULL;
-	if (!inode_ctx_del (inode, this, &stateino)) {
-		state =  ((char *)(long)stateino);
-		FREE (state);
-	}
-
-	return 0;
-
-}
-
-int32_t 
-ha_lookup_cbk (call_frame_t *frame,
-	       void *cookie,
-	       xlator_t *this,
-	       int32_t op_ret,
-	       int32_t op_errno,
-	       inode_t *inode,
-	       struct stat *buf,
-	       dict_t *dict)
-{
-	ha_local_t *local = NULL;
-	ha_private_t *pvt = NULL;
-	int child_count = 0, i = 0, callcnt = 0;
-	char *state = NULL;
-	call_frame_t *prev_frame = NULL;
-	xlator_t **children = NULL;
-	uint64_t tmp_state = 0;
-
-	local = frame->local;
-	pvt = this->private;
-	child_count = pvt->child_count;
-	prev_frame = cookie;
-	children = pvt->children;
-
-	for (i = 0; i < child_count; i++) {
-		if (pvt->children[i] == prev_frame->this)
-			break;
-	}
-	if ((op_ret == -1) && (op_errno != ENOENT)) {
-		gf_log (this->name, GF_LOG_ERROR, "(child=%s) (op_ret=%d op_errno=%s)", 
-			  children[i]->name, op_ret, strerror (op_errno));
-	}
-	inode_ctx_get (local->inode, this, &tmp_state);
-	state = (char *)(long)tmp_state;
-
-	LOCK (&frame->lock);
-	if (local->revalidate == 1) {
-		if ((!op_ret) != state[i]) {
-			local->revalidate_error = 1;
-			gf_log (this->name, GF_LOG_DEBUG, "revalidate error on %s", 
-				pvt->children[i]->name);
-		}
-	} else {
-		if (op_ret == 0) {
-			state[i] = 1;
-		}
-	}
-	if (local->op_ret == -1 && op_ret == 0) {
-		local->op_ret = 0;
-		local->buf = *buf;
-		if (dict)
-			local->dict = dict_ref (dict);
-	}
-	if (op_ret == -1 && op_ret != ENOTCONN)
-		local->op_errno = op_errno;
-	callcnt = --local->call_count;
-	UNLOCK (&frame->lock);
-
-	if (callcnt == 0) {
-		dict_t *ctx = local->dict;
-		inode_t *inode = local->inode;
-		if (local->revalidate_error == 1) {
-			local->op_ret = -1;
-			local->op_errno = EIO;
-			gf_log (this->name, GF_LOG_DEBUG, "revalidate error, returning EIO");
-		}
-		STACK_UNWIND (frame,
-			      local->op_ret,
-			      local->op_errno,
-			      inode,
-			      &local->buf,
-			      ctx);
-		if (inode)
-			inode_unref (inode);
-		if (ctx)
-			dict_unref (ctx);
-	}
-	return 0;
-}
-
-int32_t
-ha_lookup (call_frame_t *frame,
-	   xlator_t *this,
-	   loc_t *loc,
-	   dict_t *xattr_req)
-{
-	ha_local_t *local = NULL;
-	ha_private_t *pvt = NULL;
-	int child_count = 0, i = 0;
-	char *state = NULL;
-	xlator_t **children = NULL;
-	int ret = -1;
-
-	local = frame->local;
-	pvt = this->private;
-	child_count = pvt->child_count;
-	children = pvt->children;
-
-	frame->local = local = CALLOC (1, sizeof (*local));
-	child_count = pvt->child_count;
-	local->inode = inode_ref (loc->inode);
-
-	ret = inode_ctx_get (loc->inode, this, NULL);
-	if (ret) {
-		state = CALLOC (1, child_count);
-		inode_ctx_put (loc->inode, this, (uint64_t)(long)state);
-	} else
-		local->revalidate = 1;
-
-	local->op_ret = -1;
-	local->op_errno = ENOTCONN;
-	local->call_count = child_count;
-
-	for (i = 0; i < child_count; i++) {
-		STACK_WIND (frame,
-			    ha_lookup_cbk,
-			    children[i],
-			    children[i]->fops->lookup,
-			    loc,
-			    xattr_req);
-	}
-	return 0;
-}
-
- int32_t
-ha_stat_cbk (call_frame_t *frame,
-	     void *cookie,
-	     xlator_t *this,
-	     int32_t op_ret,
-	     int32_t op_errno,
-	     struct stat *buf)
-{
-	int ret = -1;
-
-	ret = ha_handle_cbk (frame, cookie, op_ret, op_errno);
-
-	if (ret == 0) {
-		STACK_UNWIND (frame,
-			      op_ret,
-			      op_errno,
-			      buf);
-	}
-	return 0;
-}
-
-int32_t
-ha_stat (call_frame_t *frame,
-	 xlator_t *this,
-	 loc_t *loc)
-{
-	ha_local_t *local = NULL;
-	int op_errno = ENOTCONN;
-
-	op_errno = ha_alloc_init_inode (frame, loc->inode);
-	if (op_errno < 0) {
-		op_errno = -op_errno;
-		goto err;
-	}
-	local = frame->local;
-	local->stub = fop_stat_stub (frame, ha_stat, loc);
-
-	STACK_WIND_COOKIE (frame,
-			   ha_stat_cbk,
-			   (void *)(long)local->active,
-			   HA_ACTIVE_CHILD(this, local),
-			   HA_ACTIVE_CHILD(this, local)->fops->stat,
-			   loc);
-	return 0;
-err:
-	STACK_UNWIND (frame, -1, op_errno, NULL);
-	return 0;	
-}
-
- int32_t
-ha_chmod_cbk (call_frame_t *frame,
-	      void *cookie,
-	      xlator_t *this,
-	      int32_t op_ret,
-	      int32_t op_errno,
-	      struct stat *buf)
-{
-	int ret = -1;
-
-	ret = ha_handle_cbk (frame, cookie, op_ret, op_errno);
-
-	if (ret == 0) {
-		STACK_UNWIND (frame,
-			      op_ret,
-			      op_errno,
-			      buf);
-	}
-	return 0;
-}
-
-int32_t
-ha_chmod (call_frame_t *frame,
-	  xlator_t *this,
-	  loc_t *loc,
-	  mode_t mode)
-{
-	ha_local_t *local = NULL;
-	int op_errno = 0;
-
-	op_errno = ha_alloc_init_inode (frame, loc->inode);
-	if (op_errno < 0) {
-		op_errno = -op_errno;
-		goto err;
-	}
-	local = frame->local;
-	local->stub = fop_chmod_stub (frame, ha_chmod, loc, mode);
-
-	STACK_WIND_COOKIE (frame,
-			   ha_chmod_cbk,
-			   (void *)(long)local->active,
-			   HA_ACTIVE_CHILD(this, local),
-			   HA_ACTIVE_CHILD(this, local)->fops->chmod,
-			   loc,
-			   mode);
-	return 0;
-err:
-	STACK_UNWIND (frame, -1, op_errno, NULL);
-	return 0;
-}
-
- int32_t
-ha_fchmod_cbk (call_frame_t *frame,
-	       void *cookie,
-	       xlator_t *this,
-	       int32_t op_ret,
-	       int32_t op_errno,
-	       struct stat *buf)
-{
-	int ret = -1;
-
-	ret = ha_handle_cbk (frame, cookie, op_ret, op_errno);
-
-	if (ret == 0) {
-		STACK_UNWIND (frame,
-			      op_ret,
-			      op_errno,
-			      buf);
-	}
-	return 0;
-}
-
-int32_t 
-ha_fchmod (call_frame_t *frame,
-	   xlator_t *this,
-	   fd_t *fd,
-	   mode_t mode)
-{
-	ha_local_t *local = NULL;
-	int op_errno = 0;
-
-	op_errno = ha_alloc_init_fd (frame, fd);
-	if (op_errno < 0) {
-		op_errno = -op_errno;
-		goto err;
-	}
-	local = frame->local;
-	local->stub = fop_fchmod_stub (frame, ha_fchmod, fd, mode);
-
-	STACK_WIND_COOKIE (frame,
-			   ha_fchmod_cbk,
-			   (void *)(long)local->active,
-			   HA_ACTIVE_CHILD(this, local),
-			   HA_ACTIVE_CHILD(this, local)->fops->fchmod,
-			   fd,
-			   mode);
-	return 0;
-err:
-	STACK_UNWIND (frame, -1, op_errno, NULL);
-	return 0;
-}
-
- int32_t
-ha_chown_cbk (call_frame_t *frame,
-	      void *cookie,
-	      xlator_t *this,
-	      int32_t op_ret,
-	      int32_t op_errno,
-	      struct stat *buf)
-{
-	int ret = -1;
-
-	ret = ha_handle_cbk (frame, cookie, op_ret, op_errno);
-
-	if (ret == 0) {
-		STACK_UNWIND (frame,
-			      op_ret,
-			      op_errno,
-			      buf);
-	}
-	return 0;
-}
-
-int32_t
-ha_chown (call_frame_t *frame,
-	  xlator_t *this,
-	  loc_t *loc,
-	  uid_t uid,
-	  gid_t gid)
-{
-	ha_local_t *local = NULL;
-	int op_errno = 0;
-
-	op_errno = ha_alloc_init_inode (frame, loc->inode);
-	if (op_errno < 0) {
-		op_errno = -op_errno;
-		goto err;
-	}
-	local = frame->local;
-	local->stub = fop_chown_stub (frame, ha_chown, loc, uid, gid);
-
-	STACK_WIND_COOKIE (frame,	      
-			   ha_chown_cbk,
-			   (void *)(long)local->active,
-			   HA_ACTIVE_CHILD(this, local),
-			   HA_ACTIVE_CHILD(this, local)->fops->chown,
-			   loc,
-			   uid,
-			   gid);
-	return 0;
-err:
-	STACK_UNWIND (frame, -1, ENOTCONN, NULL);
-	return 0;
-}
-
- int32_t
-ha_fchown_cbk (call_frame_t *frame,
-	       void *cookie,
-	       xlator_t *this,
-	       int32_t op_ret,
-	       int32_t op_errno,
-	       struct stat *buf)
-{
-	int ret = -1;
-
-	ret = ha_handle_cbk (frame, cookie, op_ret, op_errno);
-
-	if (ret == 0) {
-		STACK_UNWIND (frame,
-			      op_ret,
-			      op_errno,
-			      buf);
-	}
-	return 0;
-}
-
-int32_t 
-ha_fchown (call_frame_t *frame,
-	   xlator_t *this,
-	   fd_t *fd,
-	   uid_t uid,
-	   gid_t gid)
-{
-	ha_local_t *local = NULL;
-	int op_errno = 0;
-
-	op_errno = ha_alloc_init_fd (frame, fd);
-	if (op_errno < 0) {
-		op_errno = -op_errno;
-		goto err;
-	}
-	local = frame->local;
-	local->stub = fop_fchown_stub (frame, ha_fchown, fd, uid, gid);
-
-	STACK_WIND_COOKIE (frame,	      
-			   ha_fchown_cbk,
-			   (void *)(long)local->active,
-			   HA_ACTIVE_CHILD(this, local),
-			   HA_ACTIVE_CHILD(this, local)->fops->fchown,
-			   fd,
-			   uid,
-			   gid);
-	return 0;
-err:
-	STACK_UNWIND (frame, -1, op_errno, NULL);
-	return 0;
-}
-
- int32_t
-ha_truncate_cbk (call_frame_t *frame,
-		 void *cookie,
-		 xlator_t *this,
-		 int32_t op_ret,
-		 int32_t op_errno,
-		 struct stat *buf)
-{
-	int ret = -1;
-
-	ret = ha_handle_cbk (frame, cookie, op_ret, op_errno);
-	if (ret == 0) {
-		STACK_UNWIND (frame,
-			      op_ret,
-			      op_errno,
-			      buf);
-	}
-	return 0;
-}
-
-int32_t
-ha_truncate (call_frame_t *frame,
-	     xlator_t *this,
-	     loc_t *loc,
-	     off_t offset)
-{
-	ha_local_t *local = NULL;
-	int op_errno = 0;
-
-	op_errno = ha_alloc_init_inode (frame, loc->inode);
-	if (op_errno < 0) {
-		op_errno = -op_errno;
-		goto err;
-	}
-	local = frame->local;
-	local->stub = fop_truncate_stub (frame, ha_truncate, loc, offset);
-
-	STACK_WIND_COOKIE (frame,
-			   ha_truncate_cbk,
-			   (void *)(long)local->active,
-			   HA_ACTIVE_CHILD(this, local),
-			   HA_ACTIVE_CHILD(this, local)->fops->truncate,
-			   loc,
-			   offset);
-	return 0;
-err:
-	STACK_UNWIND (frame, -1, op_errno, NULL);
-	return 0;
-}
-
- int32_t
-ha_ftruncate_cbk (call_frame_t *frame,
-		  void *cookie,
-		  xlator_t *this,
-		  int32_t op_ret,
-		  int32_t op_errno,
-		  struct stat *buf)
-{
-	int ret = -1;
-
-	ret = ha_handle_cbk (frame, cookie, op_ret, op_errno);
-
-	if (ret == 0) {
-		STACK_UNWIND (frame,
-			      op_ret,
-			      op_errno,
-			      buf);
-	}
-	return 0;
-}
-
-int32_t
-ha_ftruncate (call_frame_t *frame,
-	      xlator_t *this,
-	      fd_t *fd,
-	      off_t offset)
-{
-	ha_local_t *local = NULL;
-	int op_errno = 0;
-
-	op_errno = ha_alloc_init_fd (frame, fd);
-	if (op_errno < 0) {
-		op_errno = -op_errno;
-		goto err;
-	}
-	local = frame->local;
-	local->stub = fop_ftruncate_stub (frame, ha_ftruncate, fd, offset);
-
-	STACK_WIND_COOKIE (frame,
-			   ha_ftruncate_cbk,
-			   (void *)(long)local->active,
-			   HA_ACTIVE_CHILD(this, local),
-			   HA_ACTIVE_CHILD(this, local)->fops->ftruncate,
-			   fd,
-			   offset);
-	return 0;
-err:
-	STACK_UNWIND (frame, -1, op_errno, NULL);
-	return 0;
-}
-
-int32_t 
-ha_utimens_cbk (call_frame_t *frame,
-		void *cookie,
-		xlator_t *this,
-		int32_t op_ret,
-		int32_t op_errno,
-		struct stat *buf)
-{
-	int ret = -1;
-
-	ret = ha_handle_cbk (frame, cookie, op_ret, op_errno);
-
-	if (ret == 0) {
-		STACK_UNWIND (frame,
-			      op_ret,
-			      op_errno,
-			      buf);
-	}
-	return 0;
-}
-
-int32_t 
-ha_utimens (call_frame_t *frame,
-	    xlator_t *this,
-	    loc_t *loc,
-	    struct timespec tv[2])
-{
-	ha_local_t *local = NULL;
-	int op_errno = 0;
-
-	op_errno = ha_alloc_init_inode (frame, loc->inode);
-	if (op_errno < 0) {
-		op_errno = -op_errno;
-		goto err;
-	}
-	local = frame->local;
-	local->stub = fop_utimens_stub (frame, ha_utimens, loc, tv);
-
-	STACK_WIND_COOKIE (frame,
-			   ha_utimens_cbk,
-			   (void *)(long)local->active,
-			   HA_ACTIVE_CHILD(this, local),
-			   HA_ACTIVE_CHILD(this, local)->fops->utimens,
-			   loc,
-			   tv);
-	return 0;
-err:
-	STACK_UNWIND (frame, -1, op_errno, NULL);
-	return 0;
-}
-
-int32_t
-ha_access_cbk (call_frame_t *frame,
-	       void *cookie,
-	       xlator_t *this,
-	       int32_t op_ret,
-	       int32_t op_errno)
-{
-	int ret = -1;
-
-	ret = ha_handle_cbk (frame, cookie, op_ret, op_errno);
-
-	if (ret == 0) {
-		STACK_UNWIND (frame,
-			      op_ret,
-			      op_errno);
-	}
-	return 0;
-}
-
-int32_t
-ha_access (call_frame_t *frame,
-	   xlator_t *this,
-	   loc_t *loc,
-	   int32_t mask)
-{
-	ha_local_t *local = NULL;
-	int op_errno = 0;
-
-	op_errno = ha_alloc_init_inode (frame, loc->inode);
-	if (op_errno < 0) {
-		op_errno = -op_errno;
-		goto err;
-	}
-	local = frame->local;
-	local->stub = fop_access_stub (frame, ha_access, loc, mask);
-
-	STACK_WIND_COOKIE (frame,
-			   ha_access_cbk,
-			   (void *)(long)local->active,
-			   HA_ACTIVE_CHILD(this, local),
-			   HA_ACTIVE_CHILD(this, local)->fops->access,
-			   loc,
-			   mask);
-	return 0;
-err:
-	STACK_UNWIND (frame, -1, op_errno);
-	return 0;
-}
-
-
- int32_t
-ha_readlink_cbk (call_frame_t *frame,
-		 void *cookie,
-		 xlator_t *this,
-		 int32_t op_ret,
-		 int32_t op_errno,
-		 const char *path)
-{
-	int ret = -1;
-
-	ret = ha_handle_cbk (frame, cookie, op_ret, op_errno);
-
-	if (ret == 0) {
-		STACK_UNWIND (frame,
-			      op_ret,
-			      op_errno,
-			      path);
-	}
-	return 0;
-}
-
-int32_t
-ha_readlink (call_frame_t *frame,
-	     xlator_t *this,
-	     loc_t *loc,
-	     size_t size)
-{
-	ha_local_t *local = frame->local;
-	int op_errno = 0;
-
-	op_errno = ha_alloc_init_inode (frame, loc->inode);
-	if (op_errno < 0) {
-		op_errno = -op_errno;
-		goto err;
-	}
-	local = frame->local;
-	local->stub = fop_readlink_stub (frame, ha_readlink, loc, size);
-
-	STACK_WIND_COOKIE (frame,
-			   ha_readlink_cbk,
-			   (void *)(long)local->active,
-			   HA_ACTIVE_CHILD(this, local),
-			   HA_ACTIVE_CHILD(this, local)->fops->readlink,
-			   loc,
-			   size);
-	return 0;
-err:
-	STACK_UNWIND (frame, -1, op_errno, NULL);
-	return 0;
-}
-
-int
-ha_mknod_lookup_cbk (call_frame_t *frame,
-		     void *cookie,
-		     xlator_t *this,
-		     int32_t op_ret,
-		     int32_t op_errno,
-		     inode_t *inode,
-		     struct stat *buf,
-		     dict_t *dict)
-{
-	ha_local_t *local = NULL;
-	ha_private_t *pvt = NULL;
-	char *stateino = NULL;
-	int child_count = 0, i = 0, cnt = 0, ret = 0;
-	call_frame_t *prev_frame = NULL;
-	xlator_t **children = NULL;
-	uint64_t tmp_stateino = 0;
-
-	local = frame->local;
-	pvt = this->private;
-	child_count = pvt->child_count;
-	prev_frame = cookie;
-	children = pvt->children;
-
-	for (i = 0; i < child_count; i++)
-		if (prev_frame->this == children[i])
-			break;
-
-	if (op_ret == -1) {
-		gf_log (this->name, GF_LOG_ERROR, 
-			"(path=%s) (op_ret=%d op_errno=%d)", 
-			local->stub->args.mknod.loc.path, op_ret, op_errno);
-	}
-	ret = inode_ctx_get (local->stub->args.mknod.loc.inode, 
-			     this, &tmp_stateino);
-	stateino = (char *)(long)tmp_stateino;
-	if (ret != 0) {
-		gf_log (this->name, GF_LOG_ERROR, 
-			"unwind(-1), inode_ctx_get() error");
-		/* It is difficult to handle this error at this stage
-		 * as we still expect more cbks, we can't return as
-		 * of now
-		 */
-	} else if (op_ret == 0) {
-		stateino[i] = 1;
-	}
-	LOCK (&frame->lock);
-	cnt = --local->call_count;
-	UNLOCK (&frame->lock);
-
-	if (cnt == 0) {
-		call_stub_t *stub = local->stub;
-		FREE (local->state);
-		STACK_UNWIND (frame,
-			      local->op_ret,
-			      local->op_errno,
-			      local->stub->args.mknod.loc.inode,
-			      &local->buf);
-		call_stub_destroy (stub);
-	}
-	return 0;
-}
-
-int32_t
-ha_mknod_cbk (call_frame_t *frame,
-	      void *cookie,
-	      xlator_t *this,
-	      int32_t op_ret,
-	      int32_t op_errno,
-	      inode_t *inode,
-	      struct stat *buf)
-{
-	ha_local_t *local = NULL;
-	ha_private_t *pvt = NULL;
-	char *stateino = NULL;
-	int child_count = 0, i = 0, cnt = 0, ret = 0;
-	call_frame_t *prev_frame = NULL;
-	xlator_t **children = NULL;
-	uint64_t tmp_stateino = 0;
-
-	local = frame->local;
-	pvt = this->private;
-	child_count = pvt->child_count;
-	prev_frame = cookie;
-	children = pvt->children;
-
-	for (i = 0; i < child_count; i++)
-		if (prev_frame->this == children[i])
-			break;
-
-	if (op_ret == -1) {
-		local->op_errno = op_errno;
-		gf_log (this->name, GF_LOG_ERROR, "(path=%s) (op_ret=%d op_errno=%d)", local->stub->args.mknod.loc.path, op_ret, op_errno);
-	}
-
-	ret = inode_ctx_get (local->stub->args.mknod.loc.inode, 
-			     this, &tmp_stateino);
-	stateino = (char *)(long)tmp_stateino;
-
-	if (ret != 0) {
-		gf_log (this->name, GF_LOG_ERROR, "inode_ctx_get() error");
-		/* FIXME: handle the case */
-	}
-	if (op_ret == 0) {
-		stateino[i] = 1;
-		local->op_ret = 0;
-		local->first_success = 1;
-		local->buf = *buf;
-	}
-	cnt = --local->call_count;
-	for (i = local->active + 1; i < child_count; i++) {
-		if (local->state[i])
-			break;
-	}
-
-	if (cnt == 0 || i == child_count) {
-		call_stub_t *stub = local->stub;
-		FREE (local->state);
-		stub = local->stub;
-		STACK_UNWIND (frame, local->op_ret, local->op_errno, local->stub->args.mknod.loc.inode, &local->buf);
-		call_stub_destroy (stub);
-		return 0;
-	}
-
-	local->active = i;
-
-	if (local->first_success == 0) {
-		STACK_WIND (frame,
-			    ha_mknod_cbk,
-			    children[i],
-			    children[i]->fops->mknod,
-			    &local->stub->args.mknod.loc,
-			    local->stub->args.mknod.mode,
-			    local->stub->args.mknod.rdev);
-		return 0;
-	}
-	cnt = local->call_count;
-
-	for (; i < child_count; i++) {
-		if (local->state[i]) {
-			STACK_WIND (frame,
-				    ha_mknod_lookup_cbk,
-				    children[i],
-				    children[i]->fops->lookup,
-				    &local->stub->args.mknod.loc,
-				    0);
-			if (--cnt == 0)
-				break;
-		}
-	}
-	return 0;
-}
-
-int32_t
-ha_mknod (call_frame_t *frame,
-	  xlator_t *this,
-	  loc_t *loc,
-	  mode_t mode,
-	  dev_t rdev)
-{
-	ha_local_t *local = NULL;
-	ha_private_t *pvt = NULL;
-	int child_count = 0, i = 0;
-	char *stateino = NULL;
-
-	local = frame->local;
-	pvt = this->private;
-	child_count = pvt->child_count;
-
-	frame->local = local = CALLOC (1, sizeof (*local));
-	local->stub = fop_mknod_stub (frame, ha_mknod, loc, mode, rdev);
-	local->op_ret = -1;
-	local->op_errno = ENOTCONN;
-	local->state = CALLOC (1, child_count);
-	memcpy (local->state, pvt->state, child_count);
-	local->active = -1;
-
-	stateino = CALLOC (1, child_count);
-	inode_ctx_put (loc->inode, this, (uint64_t)(long)stateino);
-
-	for (i = 0; i < child_count; i++) {
-		if (local->state[i]) {
-			local->call_count++;
-			if (local->active == -1) 
-				local->active = i;
-		}
-	}
-
-	STACK_WIND (frame,
-		    ha_mknod_cbk,
-		    HA_ACTIVE_CHILD(this, local),
-		    HA_ACTIVE_CHILD(this, local)->fops->mknod,
-		    loc, mode, rdev);
-	return 0;
-}
-
-
-int
-ha_mkdir_lookup_cbk (call_frame_t *frame,
-		     void *cookie,
-		     xlator_t *this,
-		     int32_t op_ret,
-		     int32_t op_errno,
-		     inode_t *inode,
-		     struct stat *buf,
-		     dict_t *dict)
-{
-	ha_local_t *local = NULL;
-	ha_private_t *pvt = NULL;
-	char *stateino = NULL;
-	int child_count = 0, i = 0, cnt = 0;
-	call_frame_t *prev_frame = NULL;
-	xlator_t **children = NULL;
-	uint64_t tmp_stateino = 0;
-
-	local = frame->local;
-	pvt = this->private;
-	child_count = pvt->child_count;
-	prev_frame = cookie;
-	children = pvt->children;
-
-	for (i = 0; i < child_count; i++)
-		if (prev_frame->this == children[i])
-			break;
-
-	if (op_ret == -1) {
-		gf_log (this->name, GF_LOG_ERROR, "(path=%s) (op_ret=%d op_errno=%d)", local->stub->args.mkdir.loc.path, op_ret, op_errno);
-	}
-	inode_ctx_get (local->stub->args.mkdir.loc.inode, 
-		       this, &tmp_stateino);  
-	stateino = (char *)(long)tmp_stateino;
-
-	if (op_ret == 0)
-		stateino[i] = 1;
-
-	LOCK (&frame->lock);
-	cnt = --local->call_count;
-	UNLOCK (&frame->lock);
-
-	if (cnt == 0) {
-		call_stub_t *stub = local->stub;
-		FREE (local->state);
-		STACK_UNWIND (frame,
-			      local->op_ret,
-			      local->op_errno,
-			      local->stub->args.mkdir.loc.inode,
-			      &local->buf);
-		call_stub_destroy (stub);
-	}
-	return 0;
-}
-
-int32_t
-ha_mkdir_cbk (call_frame_t *frame,
-	      void *cookie,
-	      xlator_t *this,
-	      int32_t op_ret,
-	      int32_t op_errno,
-	      inode_t *inode,
-	      struct stat *buf)
-{
-	ha_local_t *local = NULL;
-	ha_private_t *pvt = NULL;
-	char *stateino = NULL;
-	int child_count = 0, i = 0, cnt = 0;
-	call_frame_t *prev_frame = NULL;
-	xlator_t **children = NULL;
-	uint64_t tmp_stateino = 0;
-
-	local = frame->local;
-	pvt = this->private;
-	child_count = pvt->child_count;
-	prev_frame = cookie;
-	children = pvt->children;
-	
-	for (i = 0; i < child_count; i++)
-		if (prev_frame->this == children[i])
-			break;
-
-	if (op_ret == -1) {
-		local->op_errno = op_errno;
-		gf_log (this->name, GF_LOG_ERROR, "(path=%s) (op_ret=%d op_errno=%d)", local->stub->args.mkdir.loc.path, op_ret, op_errno);
-	}
-
-	inode_ctx_get (local->stub->args.mkdir.loc.inode, 
-		       this, &tmp_stateino);
-	stateino = (char *)(long)tmp_stateino;
-
-	if (op_ret == 0) {
-		stateino[i] = 1;
-		local->op_ret = 0;
-		local->first_success = 1;
-		local->buf = *buf;
-	}
-	cnt = --local->call_count;
-	for (i = local->active + 1; i < child_count; i++) {
-		if (local->state[i])
-			break;
-	}
-
-	if (cnt == 0 || i == child_count) {
-		call_stub_t *stub = local->stub;
-		FREE (local->state);
-		stub = local->stub;
-		STACK_UNWIND (frame, local->op_ret, local->op_errno, local->stub->args.mkdir.loc.inode, &local->buf);
-		call_stub_destroy (stub);
-		return 0;
-	}
-
-	local->active = i;
-
-	if (local->first_success == 0) {
-		STACK_WIND (frame,
-			    ha_mkdir_cbk,
-			    children[i],
-			    children[i]->fops->mkdir,
-			    &local->stub->args.mkdir.loc,
-			    local->stub->args.mkdir.mode);
-		return 0;
-	}
-	cnt = local->call_count;
-
-	for (; i < child_count; i++) {
-		if (local->state[i]) {
-			STACK_WIND (frame,
-				    ha_mkdir_lookup_cbk,
-				    children[i],
-				    children[i]->fops->lookup,
-				    &local->stub->args.mkdir.loc,
-				    0);
-			if (--cnt == 0)
-				break;
-		}
-	}
-	return 0;
-}
-
-int32_t
-ha_mkdir (call_frame_t *frame,
-	  xlator_t *this,
-	  loc_t *loc,
-	  mode_t mode)
-{
-	ha_local_t *local = NULL;
-	ha_private_t *pvt = NULL;
-	int child_count = 0, i = 0;
-	char *stateino = NULL;
-
-	local = frame->local;
-	pvt = this->private;
-	child_count = pvt->child_count;
-
-	frame->local = local = CALLOC (1, sizeof (*local));
-	local->stub = fop_mkdir_stub (frame, ha_mkdir, loc, mode);
-	local->op_ret = -1;
-	local->op_errno = ENOTCONN;
-	local->state = CALLOC (1, child_count);
-	memcpy (local->state, pvt->state, child_count);
-	local->active = -1;
-
-	stateino = CALLOC (1, child_count);
-	inode_ctx_put (loc->inode, this, (uint64_t)(long)stateino);
-	for (i = 0; i < child_count; i++) {
-		if (local->state[i]) {
-			local->call_count++;
-			if (local->active == -1)
-				local->active = i;
-		}
-	}
-
-	STACK_WIND (frame,
-		    ha_mkdir_cbk,
-		    HA_ACTIVE_CHILD(this, local),
-		    HA_ACTIVE_CHILD(this, local)->fops->mkdir,
-		    loc, mode);
-	return 0;
-}
-
- int32_t
-ha_unlink_cbk (call_frame_t *frame,
-	       void *cookie,
-	       xlator_t *this,
-	       int32_t op_ret,
-	       int32_t op_errno)
-{
-	int ret = -1;
-
-	ret = ha_handle_cbk (frame, cookie, op_ret, op_errno);
-	if (ret == 0) {
-		STACK_UNWIND (frame, op_ret, op_errno);
-	}
-	return 0;
-}
-
-int32_t
-ha_unlink (call_frame_t *frame,
-	   xlator_t *this,
-	   loc_t *loc)
-{
-	ha_local_t *local = NULL;
-	int op_errno = 0;
-
-	op_errno = ha_alloc_init_inode (frame, loc->inode);
-
-	if (op_errno < 0) {
-		op_errno = -op_errno;
-		goto err;
-	}
-	local = frame->local;
-	local->stub = fop_unlink_stub (frame, ha_unlink, loc);
-
-	STACK_WIND_COOKIE (frame,
-			   ha_unlink_cbk,
-			   (void *)(long)local->active,
-			   HA_ACTIVE_CHILD(this, local),
-			   HA_ACTIVE_CHILD(this, local)->fops->unlink,
-			   loc);
-	return 0;
-err:
-	STACK_UNWIND (frame, -1, op_errno);
-	return 0;
-}
-
- int32_t
-ha_rmdir_cbk (call_frame_t *frame,
-	      void *cookie,
-	      xlator_t *this,
-	      int32_t op_ret,
-	      int32_t op_errno)
-{
-	int ret = -1;
-
-	ret = ha_handle_cbk (frame, cookie, op_ret, op_errno);
-
-	if (ret == 0) {
-		STACK_UNWIND (frame,
-			      op_ret,
-			      op_errno);
-	}
-	return 0;
-}
-
-int32_t
-ha_rmdir (call_frame_t *frame,
-	  xlator_t *this,
-	  loc_t *loc)
-{
-	ha_local_t *local = frame->local;
-	int op_errno = 0;
-
-	op_errno = ha_alloc_init_inode (frame, loc->inode);
-	if (op_errno < 0) {
-		op_errno = -op_errno;
-		goto err;
-	}
-	local = frame->local;
-	local->stub = fop_rmdir_stub (frame, ha_rmdir, loc);
-
-	STACK_WIND_COOKIE (frame,
-			   ha_rmdir_cbk,
-			   (void *)(long)local->active,
-			   HA_ACTIVE_CHILD(this, local),
-			   HA_ACTIVE_CHILD(this, local)->fops->rmdir,
-			   loc);
-	return 0;
-err:
-	STACK_UNWIND (frame, -1, op_errno);
-	return 0;
-}
-
-
-int
-ha_symlink_lookup_cbk (call_frame_t *frame,
-		       void *cookie,
-		       xlator_t *this,
-		       int32_t op_ret,
-		       int32_t op_errno,
-		       inode_t *inode,
-		       struct stat *buf,
-		       dict_t *dict)
-{
-	ha_local_t *local = NULL;
-	ha_private_t *pvt = NULL;
-	char *stateino = NULL;
-	int child_count = 0, i = 0, cnt = 0;
-	call_frame_t *prev_frame = NULL;
-	xlator_t **children = NULL;
-	uint64_t tmp_stateino = 0;
-
-	local = frame->local;
-	pvt = this->private;
-	child_count = pvt->child_count;
-	prev_frame = cookie;
-	children = pvt->children;
-
-	for (i = 0; i < child_count; i++)
-		if (prev_frame->this == children[i])
-			break;
-
-	if (op_ret == -1) {
-		gf_log (this->name, GF_LOG_ERROR, "(path=%s) (op_ret=%d op_errno=%d)", local->stub->args.symlink.loc.path, op_ret, op_errno);
-	}
-	inode_ctx_get (local->stub->args.symlink.loc.inode,
-		       this, &tmp_stateino);  
-	stateino = (char *)(long)tmp_stateino;
-
-	if (op_ret == 0)
-		stateino[i] = 1;
-
-	LOCK (&frame->lock);
-	cnt = --local->call_count;
-	UNLOCK (&frame->lock);
-
-	if (cnt == 0) {
-		call_stub_t *stub = local->stub;
-		FREE (local->state);
-		STACK_UNWIND (frame,
-			      local->op_ret,
-			      local->op_errno,
-			      local->stub->args.symlink.loc.inode,
-			      &local->buf);
-		call_stub_destroy (stub);
-	}
-	return 0;
-}
-
-int32_t
-ha_symlink_cbk (call_frame_t *frame,
-		void *cookie,
-		xlator_t *this,
-		int32_t op_ret,
-		int32_t op_errno,
-		inode_t *inode,
-		struct stat *buf)
-{
-	ha_local_t *local = NULL;
-	ha_private_t *pvt = NULL;
-	char *stateino = NULL;
-	int child_count = 0, i = 0, cnt = 0;
-	call_frame_t *prev_frame = NULL;
-	xlator_t **children = NULL;
-	uint64_t tmp_stateino = 0;
-
-	local = frame->local;
-	pvt = this->private;
-	child_count = pvt->child_count;
-	prev_frame = cookie;
-	children = pvt->children;
-
-	for (i = 0; i < child_count; i++)
-		if (prev_frame->this == children[i])
-			break;
-
-	if (op_ret == -1) {
-		local->op_errno = op_errno;
-		gf_log (this->name, GF_LOG_ERROR, "(path=%s) (op_ret=%d op_errno=%d)", local->stub->args.symlink.loc.path, op_ret, op_errno);
-	}
-	inode_ctx_get (local->stub->args.symlink.loc.inode, 
-		       this, &tmp_stateino);
-	stateino = (char *)(long)tmp_stateino;
-
-	if (op_ret == 0) {
-		stateino[i] = 1;
-		local->op_ret = 0;
-		local->first_success = 1;
-		local->buf = *buf;
-	}
-	cnt = --local->call_count;
-	for (i = local->active + 1; i < child_count; i++) {
-		if (local->state[i])
-			break;
-	}
-
-	if (cnt == 0 || i == child_count) {
-		call_stub_t *stub = local->stub;
-		FREE (local->state);
-		stub = local->stub;
-		STACK_UNWIND (frame, local->op_ret, local->op_errno, 
-			      local->stub->args.symlink.loc.inode, &local->buf);
-		call_stub_destroy (stub);
-		return 0;
-	}
-
-	local->active = i;
-
-	if (local->first_success == 0) {
-		STACK_WIND (frame,
-			    ha_symlink_cbk,
-			    children[i],
-			    children[i]->fops->symlink,
-			    local->stub->args.symlink.linkname,
-			    &local->stub->args.symlink.loc);
-		return 0;
-	}
-	cnt = local->call_count;
-
-	for (; i < child_count; i++) {
-		if (local->state[i]) {
-			STACK_WIND (frame,
-				    ha_symlink_lookup_cbk,
-				    children[i],
-				    children[i]->fops->lookup,
-				    &local->stub->args.symlink.loc,
-				    0);
-			if (--cnt == 0)
-				break;
-		}
-	}
-	return 0;
-}
-
-int32_t
-ha_symlink (call_frame_t *frame,
-	    xlator_t *this,
-	    const char *linkname,
-	    loc_t *loc)
-{
-	ha_local_t *local = NULL;
-	ha_private_t *pvt = NULL;
-	int child_count = 0, i = 0;
-	char *stateino = NULL;
-
-	local = frame->local;
-	pvt = this->private;
-	child_count = pvt->child_count;
-
-	frame->local = local = CALLOC (1, sizeof (*local));
-	local->stub = fop_symlink_stub (frame, ha_symlink, linkname, loc);
-	local->op_ret = -1;
-	local->op_errno = ENOTCONN;
-	local->state = CALLOC (1, child_count);
-	memcpy (local->state, pvt->state, child_count);
-	local->active = -1;
-
-	stateino = CALLOC (1, child_count);
-	inode_ctx_put (loc->inode, this, (uint64_t)(long)stateino);
-
-	for (i = 0; i < child_count; i++) {
-		if (local->state[i]) {
-			local->call_count++;
-			if (local->active == -1) {
-				local->active = i;
-			}
-		}
-	}
-
-	STACK_WIND (frame,
-		    ha_symlink_cbk,
-		    HA_ACTIVE_CHILD(this, local),
-		    HA_ACTIVE_CHILD(this, local)->fops->symlink,
-		    linkname, loc);
-	return 0;
-}
-
- int32_t
-ha_rename_cbk (call_frame_t *frame,
-	       void *cookie,
-	       xlator_t *this,
-	       int32_t op_ret,
-	       int32_t op_errno,
-	       struct stat *buf)
-{
-	int ret = -1;
-
-	ret = ha_handle_cbk (frame, cookie, op_ret, op_errno);
-
-	if (ret == 0) {
-		STACK_UNWIND (frame, op_ret, op_errno, buf);
-	}
-	return 0;
-}
-
-int32_t
-ha_rename (call_frame_t *frame,
-	   xlator_t *this,
-	   loc_t *oldloc,
-	   loc_t *newloc)
-{
-	ha_local_t *local = NULL;
-	int op_errno = 0;
-
-	op_errno = ha_alloc_init_inode (frame, oldloc->inode);
-	if (op_errno < 0) {
-		op_errno = -op_errno;
-		goto err;
-	}
-	local = frame->local;
-	local->stub = fop_rename_stub (frame, ha_rename, oldloc, newloc);
-	STACK_WIND_COOKIE (frame,
-			   ha_rename_cbk,
-			   (void *)(long)local->active,
-			   HA_ACTIVE_CHILD(this, local),
-			   HA_ACTIVE_CHILD(this, local)->fops->rename,
-			   oldloc, newloc);
-	return 0;
-err:
-	STACK_UNWIND (frame, -1, op_errno, NULL);
-	return 0;
-}
-
-int
-ha_link_lookup_cbk (call_frame_t *frame,
-		    void *cookie,
-		    xlator_t *this,
-		    int32_t op_ret,
-		    int32_t op_errno,
-		    inode_t *inode,
-		    struct stat *buf,
-		    dict_t *dict)
-{
-	ha_local_t *local = NULL;
-	ha_private_t *pvt = NULL;
-	char *stateino = NULL;
-	int child_count = 0, i = 0, cnt = 0;
-	call_frame_t *prev_frame = NULL;
-	xlator_t **children = NULL;
-	uint64_t tmp_stateino = 0;
-
-	local = frame->local;
-	pvt = this->private;
-	child_count = pvt->child_count;
-	prev_frame = cookie;
-	children = pvt->children;
-
-	for (i = 0; i < child_count; i++)
-		if (prev_frame->this == children[i])
-			break;
-
-	if (op_ret == -1) {
-		gf_log (this->name, GF_LOG_ERROR, "(path=%s) (op_ret=%d op_errno=%d)", local->stub->args.link.newloc.path, op_ret, op_errno);
-	}
-	inode_ctx_get (local->stub->args.link.newloc.inode, 
-		       this, &tmp_stateino);  
-	stateino = (char *)(long)tmp_stateino;
-
-	if (op_ret == 0)
-		stateino[i] = 1;
-
-	LOCK (&frame->lock);
-	cnt = --local->call_count;
-	UNLOCK (&frame->lock);
-
-	if (cnt == 0) {
-		call_stub_t *stub = local->stub;
-		FREE (local->state);
-		STACK_UNWIND (frame,
-			      local->op_ret,
-			      local->op_errno,
-			      local->stub->args.link.oldloc.inode,
-			      &local->buf);
-		call_stub_destroy (stub);
-	}
-	return 0;
-}
-
-int32_t
-ha_link_cbk (call_frame_t *frame,
-	     void *cookie,
-	     xlator_t *this,
-	     int32_t op_ret,
-	     int32_t op_errno,
-	     inode_t *inode,
-	     struct stat *buf)
-{
-	ha_local_t *local = NULL;
-	ha_private_t *pvt = NULL;
-	char *stateino = NULL;
-	int child_count = 0, i = 0, cnt = 0;
-	call_frame_t *prev_frame = NULL;
-	xlator_t **children = NULL;
-	uint64_t tmp_stateino = 0;
-
-	local = frame->local;
-	pvt = this->private;
-	child_count = pvt->child_count;
-	prev_frame = cookie;
-	children = pvt->children;
-
-	for (i = 0; i < child_count; i++)
-		if (prev_frame->this == children[i])
-			break;
-
-	if (op_ret == -1) {
-		local->op_errno = op_errno;
-		gf_log (this->name, GF_LOG_ERROR, "(path=%s) (op_ret=%d op_errno=%d)", local->stub->args.link.newloc.path, op_ret, op_errno);
-	}
-	inode_ctx_get (local->stub->args.link.newloc.inode, 
-		       this, &tmp_stateino);
-	stateino = (char *)(long)tmp_stateino;
-
-	if (op_ret == 0) {
-		stateino[i] = 1;
-		local->op_ret = 0;
-		local->first_success = 1;
-		local->buf = *buf;
-	}
-	cnt = --local->call_count;
-	for (i = local->active + 1; i < child_count; i++) {
-		if (local->state[i])
-			break;
-	}
-
-	if (cnt == 0 || i == child_count) {
-		call_stub_t *stub = local->stub;
-		FREE (local->state);
-		stub = local->stub;
-		STACK_UNWIND (frame, local->op_ret, local->op_errno, local->stub->args.link.oldloc.inode, &local->buf);
-		call_stub_destroy (stub);
-		return 0;
-	}
-
-	local->active = i;
-
-	if (local->first_success == 0) {
-		STACK_WIND (frame,
-			    ha_link_cbk,
-			    children[i],
-			    children[i]->fops->link,
-			    &local->stub->args.link.oldloc,
-			    &local->stub->args.link.newloc);
-		return 0;
-	}
-	cnt = local->call_count;
-
-	for (; i < child_count; i++) {
-		if (local->state[i]) {
-			STACK_WIND (frame,
-				    ha_link_lookup_cbk,
-				    children[i],
-				    children[i]->fops->lookup,
-				    &local->stub->args.link.newloc,
-				    0);
-			if (--cnt == 0)
-				break;
-		}
-	}
-	return 0;
-}
-
-int32_t
-ha_link (call_frame_t *frame,
-	 xlator_t *this,
-	 loc_t *oldloc,
-	 loc_t *newloc)
-{
-	ha_local_t *local = NULL;
-	ha_private_t *pvt = NULL;
-	int child_count = 0, i = 0;
-	char *stateino = NULL;
-	int32_t ret = 0;
-	uint64_t tmp_stateino = 0;
-
-	ret = inode_ctx_get (newloc->inode, this, &tmp_stateino);
-	if (ret != 0) {
-		gf_log (this->name, GF_LOG_ERROR, "dict_ptr_error()");
-	}
-	stateino = (char *)(long)tmp_stateino;
-
-	if (stateino == NULL) {
-		gf_log (this->name, GF_LOG_ERROR, 
-			"newloc->inode's ctx is NULL, returning EINVAL");
-		STACK_UNWIND (frame, -1, EINVAL, oldloc->inode, NULL);
-		return 0;
-	}
-
-	local = frame->local;
-	pvt = this->private;
-	child_count = pvt->child_count;
-
-	frame->local = local = CALLOC (1, sizeof (*local));
-	local->stub = fop_link_stub (frame, ha_link, oldloc, newloc);
-	local->op_ret = -1;
-	local->op_errno = ENOTCONN;
-	local->state = CALLOC (1, child_count);
-	memcpy (local->state, pvt->state, child_count);
-	local->active = -1;
-
-	for (i = 0; i < child_count; i++) {
-		if (local->state[i]) {
-			local->call_count++;
-			if (local->active == -1)
-				local->active = i;
-		}
-	}
-
-	STACK_WIND (frame,
-		    ha_link_cbk,
-		    HA_ACTIVE_CHILD(this, local),
-		    HA_ACTIVE_CHILD(this, local)->fops->link,
-		    oldloc,
-		    newloc);
-	return 0;
-}
-
-int32_t
-ha_create_cbk (call_frame_t *frame,
-	       void *cookie,
-	       xlator_t *this,
-	       int32_t op_ret,
-	       int32_t op_errno,
-	       fd_t *fd,
-	       inode_t *inode,
-	       struct stat *buf)
-{
-	ha_local_t *local = NULL;
-	ha_private_t *pvt = NULL;
-	int i, child_count = 0, cnt = 0, ret = 0;
-	char *stateino = NULL;
-	hafd_t *hafdp = NULL;
-	call_frame_t *prev_frame = NULL;
-	xlator_t **children = NULL;
-	uint64_t tmp_stateino = 0;
-	uint64_t tmp_hafdp = 0;
-
-	local = frame->local;
-	pvt = this->private;
-	child_count = pvt->child_count;
-	prev_frame = cookie;
-	children = pvt->children;
-
-	ret = inode_ctx_get (local->stub->args.create.loc.inode, 
-			     this, &tmp_stateino);
-	stateino = (char *)(long)tmp_stateino;
-
-	if (ret != 0) {
-		gf_log (this->name, GF_LOG_ERROR, "dict_to_ptr() error");
-		/* FIXME: handle */
-	}
-	ret = fd_ctx_get (local->stub->args.create.fd, this, &tmp_hafdp);
-	if (ret != 0) {
-		gf_log (this->name, GF_LOG_ERROR, "dict_to_ptr() error");
-		/* FIXME: handle */
-	}
-	hafdp = (hafd_t *)(long)tmp_hafdp;
-
-	for (i = 0; i < child_count; i++) {
-		if (prev_frame->this == children[i])
-			break;
-	}
-
-	if (op_ret == -1) {
-		local->op_errno = op_errno;
-		gf_log (this->name, GF_LOG_ERROR, "(path=%s) (op_ret=%d op_errno=%d)", local->stub->args.create.loc.path, op_ret, op_errno);
-	}
-	if (op_ret != -1) {
-		stateino[i] = 1;
-		hafdp->fdstate[i] = 1;
-		if (local->op_ret == -1) {
-			local->op_ret = 0;
-			local->buf = *buf;
-			local->first_success = 1;
-		}
-		local->stub->args.create.flags &= (~O_EXCL);
-	}
-	LOCK (&frame->lock);
-	cnt = --local->call_count;
-	UNLOCK (&frame->lock);
-
-	for (i = local->active + 1; i < child_count; i++) {
-		if (local->state[i])
-			break;
-	}
-
-	if (cnt == 0 || i == child_count) {
-		char *state = local->state;
-		call_stub_t *stub = local->stub;
-		STACK_UNWIND (frame, local->op_ret, local->op_errno,
-			      stub->args.create.fd,
-			      stub->args.create.loc.inode, &local->buf);
-		FREE (state);
-		call_stub_destroy (stub);
-		return 0;
-	}
-	local->active = i;
-	cnt = local->call_count;
-	for (; i < child_count; i++) {
-		if (local->state[i]) {
-			STACK_WIND (frame,
-				    ha_create_cbk,
-				    children[i],
-				    children[i]->fops->create,
-				    &local->stub->args.create.loc,
-				    local->stub->args.create.flags,
-				    local->stub->args.create.mode,
-				    local->stub->args.create.fd);
-			if ((local->first_success == 0) || (cnt == 0))
-				break;
-		}
-	}
-	return 0;
-}
-
-int32_t
-ha_create (call_frame_t *frame,
-	   xlator_t *this,
-	   loc_t *loc,
-	   int32_t flags,
-	   mode_t mode, fd_t *fd)
-{
-	ha_local_t *local = NULL;
-	ha_private_t *pvt = NULL;
-	int i, child_count = 0;
-	char *stateino = NULL;
-	xlator_t **children = NULL;
-	hafd_t *hafdp = NULL;
-
-	local = frame->local;
-	pvt = this->private;
-	child_count = pvt->child_count;
-	children = pvt->children;
-
-	if (local == NULL) {
-		local = frame->local = CALLOC (1, sizeof (*local));
-		local->stub = fop_create_stub (frame, ha_create, loc, flags, mode, fd);
-		local->state = CALLOC (1, child_count);
-		local->active = -1;
-		local->op_ret = -1;
-		local->op_errno = ENOTCONN;
-		memcpy (local->state, pvt->state, child_count);
-
-		for (i = 0; i < pvt->child_count; i++) {
-			if (local->state[i]) {
-				local->call_count++;
-				if (local->active == -1)
-					local->active = i;
-			}
-		}
-		/* FIXME handle active -1 */
-		stateino = CALLOC (1, child_count);
-		hafdp = CALLOC (1, sizeof (*hafdp));
-		hafdp->fdstate = CALLOC (1, child_count);
-		hafdp->path = strdup(loc->path);
-		LOCK_INIT (&hafdp->lock);
-		fd_ctx_set (fd, this, (uint64_t)(long)hafdp);
-		inode_ctx_put (loc->inode, this, (uint64_t)(long)stateino);
-	}
-
-	STACK_WIND (frame,
-		    ha_create_cbk,
-		    children[local->active],
-		    children[local->active]->fops->create,
-		    loc, flags, mode, fd);
-	return 0;
-}
-
- int32_t
-ha_open_cbk (call_frame_t *frame,
-	     void *cookie,
-	     xlator_t *this,
-	     int32_t op_ret,
-	     int32_t op_errno,
-	     fd_t *fd)
-{
-	ha_local_t *local = NULL;
-	ha_private_t *pvt = NULL;
-	xlator_t **children = NULL;
-	int i = 0, child_count = 0, callcnt = 0, ret = 0;
-	call_frame_t *prev_frame = NULL;
-	hafd_t *hafdp = NULL;
-	uint64_t tmp_hafdp = 0;
-
-	local = frame->local;
-	pvt = this->private;
-	children = pvt->children;
-	child_count = pvt->child_count;
-	prev_frame = cookie;
-
-	ret = fd_ctx_get (local->fd, this, &tmp_hafdp);
-	if (ret != 0) {
-		gf_log (this->name, GF_LOG_ERROR, "dict_ptr_error()");
-	}
-	hafdp = (hafd_t *)(long)tmp_hafdp;
-
-	for (i = 0; i < child_count; i++)
-		if (children[i] == prev_frame->this)
-			break;
-	LOCK (&frame->lock);
-	if (op_ret != -1) {
-		hafdp->fdstate[i] = 1;
-		local->op_ret = 0;
-	}
-	if (op_ret == -1 && op_errno != ENOTCONN)
-		local->op_errno = op_errno;
-	callcnt = --local->call_count;
-	UNLOCK (&frame->lock);
-
-	if (callcnt == 0) {
-		STACK_UNWIND (frame,
-			      local->op_ret,
-			      local->op_errno,
-			      local->fd);
-	}
-	return 0;
-}
-
-int32_t
-ha_open (call_frame_t *frame,
-	 xlator_t *this,
-	 loc_t *loc,
-	 int32_t flags, fd_t *fd)
-{
-	ha_local_t *local = NULL;
-	ha_private_t *pvt = NULL;
-	char *stateino = NULL;
-	xlator_t **children = NULL;
-	int cnt = 0, i, child_count = 0, ret = 0;
-	hafd_t *hafdp = NULL;
-	uint64_t tmp_stateino = 0;
-
-	local = frame->local;
-	pvt = this->private;
-	children = pvt->children;
-	child_count = pvt->child_count;
-
-
-	local = frame->local = CALLOC (1, sizeof (*local));
-	local->op_ret = -1;
-	local->op_errno = ENOTCONN;
-	local->fd = fd;
-
-	hafdp = CALLOC (1, sizeof (*hafdp));
-	hafdp->fdstate = CALLOC (1, child_count);
-	hafdp->path = strdup (loc->path);
-	hafdp->active = -1;
-	if (pvt->pref_subvol == -1) {
-		hafdp->active = fd->inode->ino % child_count;
-	}
-
-	LOCK_INIT (&hafdp->lock);
-	fd_ctx_set (fd, this, (uint64_t)(long)hafdp);
-	ret = inode_ctx_get (loc->inode, this, &tmp_stateino);
-	stateino = (char *)(long)tmp_stateino;
-
-	for (i = 0; i < child_count; i++)
-		if (stateino[i])
-			cnt++;
-	local->call_count = cnt;
-	for (i = 0; i < child_count; i++) {
-		if (stateino[i]) {
-			STACK_WIND (frame,
-				    ha_open_cbk,
-				    children[i],
-				    children[i]->fops->open,
-				    loc, flags, fd);
-			if (--cnt == 0)
-				break;
-		}
-	}
-	return 0;
-}
-
- int32_t
-ha_readv_cbk (call_frame_t *frame,
-	      void *cookie,
-	      xlator_t *this,
-	      int32_t op_ret,
-	      int32_t op_errno,
-	      struct iovec *vector,
-	      int32_t count,
-	      struct stat *stbuf)
-{
-	int ret = 0;
-
-	ret = ha_handle_cbk (frame, cookie, op_ret, op_errno);
-
-	if (ret == 0) {
-		STACK_UNWIND (frame,
-			      op_ret,
-			      op_errno,
-			      vector,
-			      count,
-			      stbuf);
-	}
-	return 0;
-}
-
-int32_t
-ha_readv (call_frame_t *frame,
-	  xlator_t *this,
-	  fd_t *fd,
-	  size_t size,
-	  off_t offset)
-{
-	ha_local_t *local = NULL;
-	int op_errno = 0;
-
-	op_errno = ha_alloc_init_fd (frame, fd);
-	if (op_errno < 0) {
-		op_errno = -op_errno;
-		goto err;
-	}
-	local = frame->local;
-	local->stub = fop_readv_stub (frame, ha_readv, fd, size, offset);
-
-	STACK_WIND_COOKIE (frame,
-			   ha_readv_cbk,
-			   (void *)(long)local->active,
-			   HA_ACTIVE_CHILD(this, local),
-			   HA_ACTIVE_CHILD(this, local)->fops->readv,
-			   fd,
-			   size,
-			   offset);
-	return 0;
-err:
-	STACK_UNWIND (frame, -1, op_errno, NULL);
-	return 0;
-}
-
- int32_t
-ha_writev_cbk (call_frame_t *frame,
-	       void *cookie,
-	       xlator_t *this,
-	       int32_t op_ret,
-	       int32_t op_errno,
-	       struct stat *stbuf)
-{
-	int ret = 0;
-	ret = ha_handle_cbk (frame, cookie, op_ret, op_errno);
-
-	if (ret == 0) {
-		STACK_UNWIND (frame,
-			      op_ret,
-			      op_errno,
-			      stbuf);
-	}
-	return 0;
-}
-
-int32_t
-ha_writev (call_frame_t *frame,
-	   xlator_t *this,
-	   fd_t *fd,
-	   struct iovec *vector,
-	   int32_t count,
-	   off_t off)
-{
-	ha_local_t *local = NULL;
-	int op_errno = 0;
-
-	op_errno = ha_alloc_init_fd (frame, fd);
-	if (op_errno < 0) {
-		op_errno = -op_errno;
-		goto err;
-	}
-	local = frame->local;
-	local->stub = fop_writev_stub (frame, ha_writev, fd, vector, count, off);
-
-	STACK_WIND_COOKIE (frame,
-			   ha_writev_cbk,
-			   (void *)(long)local->active,
-			   HA_ACTIVE_CHILD(this, local),
-			   HA_ACTIVE_CHILD(this, local)->fops->writev,
-			   fd,
-			   vector,
-			   count,
-			   off);
-	return 0;
-err:
-	STACK_UNWIND (frame, -1, op_errno, NULL);
-	return 0;	
-}
-
- int32_t
-ha_flush_cbk (call_frame_t *frame,
-	      void *cookie,
-	      xlator_t *this,
-	      int32_t op_ret,
-	      int32_t op_errno)
-{
-	int ret = 0;
-	ret = ha_handle_cbk (frame, cookie, op_ret, op_errno);
-
-	if (ret == 0) {
-		STACK_UNWIND (frame,
-			      op_ret,
-			      op_errno);
-	}
-	return 0;
-}
-
-int32_t
-ha_flush (call_frame_t *frame,
-	  xlator_t *this,
-	  fd_t *fd)
-{
-	ha_local_t *local = NULL;
-	int op_errno = 0;
-
-	op_errno = ha_alloc_init_fd (frame, fd);
-	if (op_errno < 0) {
-		op_errno = -op_errno;
-		goto err;
-	}
-	local = frame->local;
-	local->stub = fop_flush_stub (frame, ha_flush, fd);
-	STACK_WIND_COOKIE (frame,
-			   ha_flush_cbk,
-			   (void *)(long)local->active,
-			   HA_ACTIVE_CHILD(this, local),
-			   HA_ACTIVE_CHILD(this, local)->fops->flush,
-			   fd);
-	return 0;
-err:
-	STACK_UNWIND (frame, -1, op_errno);
-	return 0;
-}
-
-
- int32_t
-ha_fsync_cbk (call_frame_t *frame,
-	      void *cookie,
-	      xlator_t *this,
-	      int32_t op_ret,
-	      int32_t op_errno)
-{
-	int ret = 0;
-	ret = ha_handle_cbk (frame, cookie, op_ret, op_errno);
-
-	if (ret == 0) {
-		STACK_UNWIND (frame,
-			      op_ret,
-			      op_errno);
-	}
-	return 0;
-}
-
-int32_t
-ha_fsync (call_frame_t *frame,
-	  xlator_t *this,
-	  fd_t *fd,
-	  int32_t flags)
-{
-	ha_local_t *local = NULL;
-	int op_errno = 0;
-
-	op_errno = ha_alloc_init_fd (frame, fd);
-	if (op_errno < 0) {
-		op_errno = -op_errno;
-		goto err;
-	}
-	local = frame->local;
-	local->stub = fop_fsync_stub (frame, ha_fsync, fd, flags);
-	STACK_WIND_COOKIE (frame,
-			   ha_fsync_cbk,
-			   (void *)(long)local->active,
-			   HA_ACTIVE_CHILD(this, local),
-			   HA_ACTIVE_CHILD(this, local)->fops->fsync,
-			   fd,
-			   flags);
-	return 0;
-err:
-	STACK_UNWIND (frame, -1, op_errno);
-	return 0;
-}
-
- int32_t
-ha_fstat_cbk (call_frame_t *frame,
-	      void *cookie,
-	      xlator_t *this,
-	      int32_t op_ret,
-	      int32_t op_errno,
-	      struct stat *buf)
-{
-	int ret = 0;
-
-	ret = ha_handle_cbk (frame, cookie, op_ret, op_errno);
-
-	if (ret == 0) {
-		STACK_UNWIND (frame,
-			      op_ret,
-			      op_errno,
-			      buf);
-	}
-	return 0;
-}
-
-int32_t
-ha_fstat (call_frame_t *frame,
-	  xlator_t *this,
-	  fd_t *fd)
-{
-	ha_local_t *local = NULL;
-	int op_errno = 0;
-
-	op_errno = ha_alloc_init_fd (frame, fd);
-
-	if (op_errno < 0) {
-		op_errno = -op_errno;
-		goto err;
-	}
-	local = frame->local;
-	local->stub = fop_fstat_stub (frame, ha_fstat, fd);
-	STACK_WIND_COOKIE (frame,
-			   ha_fstat_cbk,
-			   (void *)(long)local->active,
-			   HA_ACTIVE_CHILD(this, local),
-			   HA_ACTIVE_CHILD(this, local)->fops->fstat,
-			   fd);
-	return 0;
-err:
-	STACK_UNWIND (frame, -1, op_errno, NULL);
-	return 0;
-}
-
-int32_t
-ha_opendir_cbk (call_frame_t *frame,
-		void *cookie,
-		xlator_t *this,
-		int32_t op_ret,
-		int32_t op_errno,
-		fd_t *fd)
-{
-	ha_local_t *local = NULL;
-	ha_private_t *pvt = NULL;
-	xlator_t **children = NULL;
-	int i = 0, child_count = 0, callcnt = 0, ret = 0;
-	call_frame_t *prev_frame = NULL;
-	hafd_t *hafdp = NULL;
-	uint64_t tmp_hafdp = 0;
-
-	local = frame->local;
-	pvt = this->private;
-	children = pvt->children;
-	child_count = pvt->child_count;
-	prev_frame = cookie;
-
-	ret = fd_ctx_get (local->fd, this, &tmp_hafdp);
-	if (ret != 0) {
-		gf_log (this->name, GF_LOG_ERROR, "dict_ptr_error()");
-	}
-	hafdp = (hafd_t *)(long)tmp_hafdp;
-
-	for (i = 0; i < child_count; i++)
-		if (children[i] == prev_frame->this)
-			break;
-	LOCK (&frame->lock);
-	if (op_ret != -1) {
-		hafdp->fdstate[i] = 1;
-		local->op_ret = 0;
-	}
-	if (op_ret == -1 && op_errno != ENOTCONN)
-		local->op_errno = op_errno;
-	callcnt = --local->call_count;
-	UNLOCK (&frame->lock);
-
-	if (callcnt == 0) {
-		STACK_UNWIND (frame,
-			      local->op_ret,
-			      local->op_errno,
-			      local->fd);
-	}
-	return 0;
-}
-
-int32_t
-ha_opendir (call_frame_t *frame,
-	    xlator_t *this,
-	    loc_t *loc, fd_t *fd)
-{
-	ha_local_t *local = NULL;
-	ha_private_t *pvt = NULL;
-	char *stateino = NULL;
-	xlator_t **children = NULL;
-	int cnt = 0, i, child_count = 0, ret = 0;
-	hafd_t *hafdp = NULL;
-	uint64_t tmp_stateino = 0;
-
-	local = frame->local;
-	pvt = this->private;
-	children = pvt->children;
-	child_count = pvt->child_count;
-
-	local = frame->local = CALLOC (1, sizeof (*local));
-	local->op_ret = -1;
-	local->op_errno = ENOTCONN;
-	local->fd = fd;
-
-	hafdp = CALLOC (1, sizeof (*hafdp));
-	hafdp->fdstate = CALLOC (1, child_count);
-	hafdp->path = strdup (loc->path);
-	LOCK_INIT (&hafdp->lock);
-	fd_ctx_set (fd, this, (uint64_t)(long)hafdp);
-	ret = inode_ctx_get (loc->inode, this, &tmp_stateino);
-	stateino = (char *)(long)tmp_stateino;
-	
-	if (ret != 0) {
-		gf_log (this->name, GF_LOG_ERROR, "inode_ctx_get() error");
-	}
-	for (i = 0; i < child_count; i++)
-		if (stateino[i])
-			cnt++;
-	local->call_count = cnt;
-	for (i = 0; i < child_count; i++) {
-		if (stateino[i]) {
-			STACK_WIND (frame,
-				    ha_opendir_cbk,
-				    children[i],
-				    children[i]->fops->opendir,
-				    loc, fd);
-			if (--cnt == 0)
-				break;
-		}
-	}
-	return 0;
-}
-
- int32_t
-ha_getdents_cbk (call_frame_t *frame,
-		 void *cookie,
-		 xlator_t *this,
-		 int32_t op_ret,
-		 int32_t op_errno,
-		 dir_entry_t *entries,
-		 int32_t count)
-{
-	int ret = 0;
-
-	ret = ha_handle_cbk (frame, cookie, op_ret, op_errno);
-	if (ret == 0) {
-		STACK_UNWIND (frame,
-			      op_ret,
-			      op_errno,
-			      entries,
-			      count);
-	}
-	return 0;
-}
-
-int32_t
-ha_getdents (call_frame_t *frame,
-	     xlator_t *this,
-	     fd_t *fd,
-	     size_t size,
-	     off_t offset,
-	     int32_t flag)
-{
-	ha_local_t *local = NULL;
-	int op_errno = 0;
-
-	op_errno = ha_alloc_init_fd (frame, fd);
-	if (op_errno < 0) {
-		op_errno = -op_errno;
-		goto err;
-	}
-	local = frame->local;
-	local->stub = fop_getdents_stub (frame, ha_getdents, fd, size, offset, flag);
-	STACK_WIND_COOKIE (frame,
-			   ha_getdents_cbk,
-			   (void *)(long)local->active,
-			   HA_ACTIVE_CHILD(this, local),
-			   HA_ACTIVE_CHILD(this, local)->fops->getdents,
-			   fd,
-			   size,
-			   offset,
-			   flag);
-	return 0;
-err:
-	STACK_UNWIND (frame, -1, op_errno, NULL, 0);
-	return 0;
-}
-
- int32_t
-ha_setdents_cbk (call_frame_t *frame,
-		 void *cookie,
-		 xlator_t *this,
-		 int32_t op_ret,
-		 int32_t op_errno)
-{
-	int ret = 0;
-
-	ret = ha_handle_cbk (frame, cookie, op_ret, op_errno);
-
-	if (ret == 0) {
-		STACK_UNWIND (frame,
-			      op_ret,
-			      op_errno);
-	}
-	return 0;
-}
-
-int32_t
-ha_setdents (call_frame_t *frame,
-	     xlator_t *this,
-	     fd_t *fd,
-	     int32_t flags,
-	     dir_entry_t *entries,
-	     int32_t count)
-{
-	ha_local_t *local = NULL;
-	int op_errno = 0;
-
-	op_errno = ha_alloc_init_fd (frame, fd);
-	if (op_errno < 0) {
-		op_errno = -op_errno;
-		goto err;
-	}
-	local = frame->local;
-
-	local->stub = fop_setdents_stub (frame, ha_setdents, fd, flags, entries, count);
-
-	STACK_WIND_COOKIE (frame,
-			   ha_setdents_cbk,
-			   (void *)(long)local->active,
-			   HA_ACTIVE_CHILD(this, local),
-			   HA_ACTIVE_CHILD(this, local)->fops->setdents,
-			   fd,
-			   flags,
-			   entries,
-			   count);
-	return 0;
-err:
-	STACK_UNWIND (frame, -1, op_errno);
-	return 0;
-}
-
- int32_t
-ha_fsyncdir_cbk (call_frame_t *frame,
-		 void *cookie,
-		 xlator_t *this,
-		 int32_t op_ret,
-		 int32_t op_errno)
-{
-	int ret = 0;
-
-	ret = ha_handle_cbk (frame, cookie, op_ret, op_errno);
-	if (ret == 0) {
-		STACK_UNWIND (frame,
-			      op_ret,
-			      op_errno);
-	}
-	return 0;
-}
-
-int32_t
-ha_fsyncdir (call_frame_t *frame,
-	     xlator_t *this,
-	     fd_t *fd,
-	     int32_t flags)
-{
-	ha_local_t *local = NULL;
-	int op_errno = 0;
-
-	op_errno = ha_alloc_init_fd (frame, fd);
-	if (op_errno < 0) {
-		op_errno = -op_errno;
-		goto err;
-	}
-	local = frame->local;
-	local->stub = fop_fsyncdir_stub (frame, ha_fsyncdir, fd, flags);
-	STACK_WIND_COOKIE (frame,
-			   ha_fsyncdir_cbk,
-			   (void *)(long)local->active,
-			   HA_ACTIVE_CHILD(this, local),
-			   HA_ACTIVE_CHILD(this, local)->fops->fsyncdir,
-			   fd,
-			   flags);
-	return 0;
-err:
-	STACK_UNWIND (frame, -1, op_errno);
-	return 0;
-}
-
-
- int32_t
-ha_statfs_cbk (call_frame_t *frame,
-	       void *cookie,
-	       xlator_t *this,
-	       int32_t op_ret,
-	       int32_t op_errno,
-	       struct statvfs *buf)
-{
-	int ret = -1;
-
-	ret = ha_handle_cbk (frame, cookie, op_ret, op_errno);
-	if (ret == 0) {
-		STACK_UNWIND (frame,
-			      op_ret,
-			      op_errno,
-			      buf);
-	}
-	return 0;
-}
-
-int32_t
-ha_statfs (call_frame_t *frame,
-	   xlator_t *this,
-	   loc_t *loc)
-{
-	ha_local_t *local = NULL;
-	int op_errno = 0;
-
-	op_errno = ha_alloc_init_inode (frame, loc->inode);
-	if (op_errno < 0) {
-		op_errno = -op_errno;
-		goto err;
-	}
-	local = frame->local;
-
-	local->stub = fop_statfs_stub (frame, ha_statfs, loc);
-	STACK_WIND_COOKIE (frame,
-			   ha_statfs_cbk,
-			   (void *)(long)local->active,
-			   HA_ACTIVE_CHILD(this, local),
-			   HA_ACTIVE_CHILD(this, local)->fops->statfs,
-			   loc);
-	return 0;
-err:
-	STACK_UNWIND (frame, -1, op_errno, NULL);
-	return 0;
-}
-
- int32_t
-ha_setxattr_cbk (call_frame_t *frame,
-		 void *cookie,
-		 xlator_t *this,
-		 int32_t op_ret,
-		 int32_t op_errno)
-{
-	int ret = -1;
-
-	ret = ha_handle_cbk (frame, cookie, op_ret, op_errno);
-
-	if (ret == 0) {
-		STACK_UNWIND (frame,
-			      op_ret,
-			      op_errno);
-	}
-	return 0;
-}
-
-int32_t
-ha_setxattr (call_frame_t *frame,
-	     xlator_t *this,
-	     loc_t *loc,
-	     dict_t *dict,
-	     int32_t flags)
-{
-	ha_local_t *local = NULL;
-	int op_errno = 0;
-
-	op_errno = ha_alloc_init_inode (frame, loc->inode);
-	if (op_errno < 0) {
-		op_errno = -op_errno;
-		goto err;
-	}
-	local = frame->local;
-	local->stub = fop_setxattr_stub (frame, ha_setxattr, loc, dict, flags);
-	STACK_WIND_COOKIE (frame,
-			   ha_setxattr_cbk,
-			   (void *)(long)local->active,
-			   HA_ACTIVE_CHILD(this, local),
-			   HA_ACTIVE_CHILD(this, local)->fops->setxattr,
-			   loc,
-			   dict,
-			   flags);
-	return 0;
-err:
-	STACK_UNWIND (frame, -1, op_errno);
-	return 0;
-}
-
- int32_t
-ha_getxattr_cbk (call_frame_t *frame,
-		 void *cookie,
-		 xlator_t *this,
-		 int32_t op_ret,
-		 int32_t op_errno,
-		 dict_t *dict)
-{
-	int ret = -1;
-
-	ret = ha_handle_cbk (frame, cookie, op_ret, op_errno);
-
-	if (ret == 0) {
-		STACK_UNWIND (frame,
-			      op_ret,
-			      op_errno,
-			      dict);
-	}
-	return 0;
-}
-
-int32_t
-ha_getxattr (call_frame_t *frame,
-	     xlator_t *this,
-	     loc_t *loc,
-	     const char *name)
-{
-	ha_local_t *local = NULL;
-	int op_errno = 0;
-
-	op_errno = ha_alloc_init_inode (frame, loc->inode);
-	if (op_errno < 0) {
-		op_errno = -op_errno;
-		goto err;
-	}
-	local = frame->local;
-	local->stub = fop_getxattr_stub (frame, ha_getxattr, loc, name);
-	STACK_WIND_COOKIE (frame,
-			   ha_getxattr_cbk,
-			   (void *)(long)local->active,
-			   HA_ACTIVE_CHILD(this, local),
-			   HA_ACTIVE_CHILD(this, local)->fops->getxattr,
-			   loc,
-			   name);
-	return 0;
-err:
-	STACK_UNWIND (frame, -1, op_errno, NULL);
-	return 0;
-}
-
-int32_t
-ha_xattrop_cbk (call_frame_t *frame,
-		void *cookie,
-		xlator_t *this,
-		int32_t op_ret,
-		int32_t op_errno,
-		dict_t *dict)
-{
-	int ret = -1;
-	ret = ha_handle_cbk (frame, cookie, op_ret, op_errno);
-	if (ret == 0) {
-		STACK_UNWIND (frame, op_ret, op_errno, dict);
-	}
-	return 0;
-}
-
-
-int32_t
-ha_xattrop (call_frame_t *frame,
-	    xlator_t *this,
-	    loc_t *loc,
-	    gf_xattrop_flags_t flags,
-	    dict_t *dict)
-{
-	ha_local_t *local = NULL;
-	int op_errno = 0;
-
-	op_errno = ha_alloc_init_inode (frame, loc->inode);
-	if (op_errno < 0) {
-		op_errno = -op_errno;
-		goto err;
-	}
-	local = frame->local;
-
-	local->stub = fop_xattrop_stub (frame, ha_xattrop, loc, flags, dict);
-
-	STACK_WIND_COOKIE (frame,
-			   ha_xattrop_cbk,
-			   (void *)(long)local->active,
-			   HA_ACTIVE_CHILD(this, local),
-			   HA_ACTIVE_CHILD(this, local)->fops->xattrop,
-			   loc,
-			   flags,
-			   dict);
-	return 0;
-err:
-	STACK_UNWIND (frame, -1, op_errno, dict);
-	return 0;
-}
-
-int32_t
-ha_fxattrop_cbk (call_frame_t *frame,
-		 void *cookie,
-		 xlator_t *this,
-		 int32_t op_ret,
-		 int32_t op_errno,
-		 dict_t *dict)
-{
-	int ret = -1;
-	ret = ha_handle_cbk (frame, cookie, op_ret, op_errno);
-	if (ret == 0)
-		STACK_UNWIND (frame, op_ret, op_errno, dict);
-	return 0;
-}
-
-int32_t
-ha_fxattrop (call_frame_t *frame,
-	     xlator_t *this,
-	     fd_t *fd,
-	     gf_xattrop_flags_t flags,
-	     dict_t *dict)
-{
-	ha_local_t *local = NULL;
-	int op_errno = 0;
-
-	op_errno = ha_alloc_init_fd (frame, fd);
-	if (op_errno < 0) {
-		op_errno = -op_errno;
-		goto err;
-	}
-	local = frame->local;
-	local->stub = fop_fxattrop_stub (frame, ha_fxattrop, fd, flags, dict);
-
-	STACK_WIND_COOKIE (frame,
-			   ha_fxattrop_cbk,
-			   (void *)(long)local->active,
-			   HA_ACTIVE_CHILD(this, local),
-			   HA_ACTIVE_CHILD(this, local)->fops->fxattrop,
-			   fd,
-			   flags,
-			   dict);
-	return 0;
-err:
-	STACK_UNWIND (frame, -1, op_errno, dict);
-	return 0;
-}
-
- int32_t
-ha_removexattr_cbk (call_frame_t *frame,
-		    void *cookie,
-		    xlator_t *this,
-		    int32_t op_ret,
-		    int32_t op_errno)
-{
-	int ret = -1;
-	ret = ha_handle_cbk (frame, cookie, op_ret, op_errno);
-	if (ret == 0) {
-		STACK_UNWIND (frame,
-			      op_ret,
-			      op_errno);
-	}
-	return 0;
-}
-
-int32_t
-ha_removexattr (call_frame_t *frame,
-		xlator_t *this,
-		loc_t *loc,
-		const char *name)
-{
-	ha_local_t *local = NULL;
-	int op_errno = 0;
-
-	op_errno = ha_alloc_init_inode (frame, loc->inode);
-	if (op_errno < 0) {
-		op_errno = -op_errno;
-		goto err;
-	}
-	local = frame->local;
-	
-	local->stub = fop_removexattr_stub (frame, ha_removexattr, loc, name);
-
-	STACK_WIND_COOKIE (frame,
-			   ha_removexattr_cbk,
-			   (void *)(long)local->active,
-			   HA_ACTIVE_CHILD(this, local),
-			   HA_ACTIVE_CHILD(this, local)->fops->removexattr,
-			   loc,
-			   name);
-	return 0;
-err:
-	STACK_UNWIND (frame, -1, op_errno);
-	return 0;
-}
-
-int32_t
-ha_lk_setlk_unlck_cbk (call_frame_t *frame,
-		       void *cookie,
-		       xlator_t *this,
-		       int32_t op_ret,
-		       int32_t op_errno,
-		       struct flock *lock)
-{
-	ha_local_t *local = NULL;
-	int cnt = 0;
-	call_stub_t *stub = NULL;
-
-	local = frame->local;
-
-	LOCK (&frame->lock);
-	cnt = --local->call_count;
-	if (op_ret == 0)
-		local->op_ret = 0;
-	UNLOCK (&frame->lock);
-
-	if (cnt == 0) {
-		stub = local->stub;
-		FREE (local->state);
-		if (stub->args.lk.lock.l_type == F_UNLCK) {
-			STACK_UNWIND (frame, local->op_ret, local->op_errno, &stub->args.lk.lock);
-		} else {
-			STACK_UNWIND (frame, -1, EIO, NULL);
-		}
-		call_stub_destroy (stub);
-	}
-	return 0;
-}
-
-int32_t
-ha_lk_setlk_cbk (call_frame_t *frame,
-		 void *cookie,
-		 xlator_t *this,
-		 int32_t op_ret,
-		 int32_t op_errno,
-		 struct flock *lock)
-{
-	ha_local_t *local = NULL;
-	ha_private_t *pvt = NULL;
-	xlator_t **children = NULL;
-	int i = 0, cnt = 0, j = 0;
-	int child_count = 0;
-	call_frame_t *prev_frame = NULL;
-	char *state = NULL;
-
-	local = frame->local;
-	pvt = this->private;
-	children = pvt->children;
-	child_count = pvt->child_count;
-	prev_frame = cookie;
-	state = local->state;
-
-	if (op_ret == 0)
-		local->op_ret = 0;
-
-	if ((op_ret == 0) || (op_ret == -1 && op_errno == ENOTCONN)) {
-		for (i = 0; i < child_count; i++) {
-			if (prev_frame->this == cookie)
-				break;
-		}
-		i++;
-		for (; i < child_count; i++) {
-			if (local->state[i])
-				break;
-		}
-		if (i == child_count) {
-			call_stub_t *stub = local->stub;
-			FREE (local->state);
-			STACK_UNWIND (frame, 0, op_errno, &stub->args.lk.lock);
-			call_stub_destroy (stub);
-			return 0;
-		}
-		STACK_WIND (frame,
-			    ha_lk_setlk_cbk,
-			    children[i],
-			    children[i]->fops->lk,
-			    local->stub->args.lk.fd,
-			    local->stub->args.lk.cmd,
-			    &local->stub->args.lk.lock);
-		return 0;
-	} else {
-		for (i = 0; i < child_count; i++) {
-			if (prev_frame->this == cookie)
-				break;
-		}
-		cnt = 0;
-		for (j = 0; j < i; j++) {
-			if (state[i])
-				cnt++;
-		}
-		if (cnt) {
-			struct flock lock;
-			lock = local->stub->args.lk.lock;
-			for (i = 0; i < child_count; i++) {
-				if (state[i]) {
-					STACK_WIND (frame,
-						    ha_lk_setlk_unlck_cbk,
-						    children[i],
-						    children[i]->fops->lk,
-						    local->stub->args.lk.fd,
-						    local->stub->args.lk.cmd,
-						    &lock);
-					if (--cnt == 0)
-						break;
-				}
-			}
-			return 0;
-		} else {
-			FREE (local->state);
-			call_stub_destroy (local->stub);
-			STACK_UNWIND (frame,
-				      op_ret,
-				      op_errno,
-				      lock);
-			return 0;
-		}
-	}
-}
-
-int32_t
-ha_lk_getlk_cbk (call_frame_t *frame,
-		 void *cookie,
-		 xlator_t *this,
-		 int32_t op_ret,
-		 int32_t op_errno,
-		 struct flock *lock)
-{
-	ha_local_t *local = NULL;
-	ha_private_t *pvt = NULL;
-	fd_t *fd = NULL;
-	int child_count = 0, i = 0;
-	xlator_t **children = NULL;
-	call_frame_t *prev_frame = NULL;
-
-	local = frame->local;
-	pvt = this->private;
-	fd = local->stub->args.lk.fd;
-	child_count = pvt->child_count;
-	children = pvt->children;
-	prev_frame = cookie;
-
-	if (op_ret == 0) {
-		FREE (local->state);
-		call_stub_destroy (local->stub);
-		STACK_UNWIND (frame, 0, 0, lock);
-		return 0;
-	}
-
-	for (i = 0; i < child_count; i++) {
-		if (prev_frame->this == children[i])
-			break;
-	}
-
-	for (; i < child_count; i++) {
-		if (local->state[i])
-			break;
-	}
-
-	if (i == child_count) {
-		FREE (local->state);
-		call_stub_destroy (local->stub);
-		STACK_UNWIND (frame, op_ret, op_errno, lock);
-		return 0;
-	}
-
-	STACK_WIND (frame,
-		    ha_lk_getlk_cbk,
-		    children[i],
-		    children[i]->fops->lk,
-		    fd,
-		    local->stub->args.lk.cmd,
-		    &local->stub->args.lk.lock);
-	return 0;
-}
-
-int32_t
-ha_lk (call_frame_t *frame,
-       xlator_t *this,
-       fd_t *fd,
-       int32_t cmd,
-       struct flock *lock)
-{
-	ha_local_t *local = NULL;
-	ha_private_t *pvt = NULL;
-	hafd_t *hafdp = NULL;
-	char *state = NULL;
-	int child_count = 0, i = 0, cnt = 0, ret = 0;
-	xlator_t **children = NULL;
-	uint64_t tmp_hafdp = 0;
-
-	local = frame->local;
-	pvt = this->private;
-	child_count = pvt->child_count;
-	children = pvt->children;
-	ret = fd_ctx_get (fd, this, &tmp_hafdp);
-	if (ret < 0)
-		gf_log (this->name, GF_LOG_ERROR, "fd_ctx_get failed");
-
-	if (local == NULL) {
-		local = frame->local = CALLOC (1, sizeof (*local));
-		local->active = -1;
-		local->op_ret = -1;
-		local->op_errno = ENOTCONN;
-	}
-	hafdp = (hafd_t *)(long)tmp_hafdp;
-
-	if (local->active == -1) {
-		STACK_UNWIND (frame, -1, ENOTCONN, NULL);
-		return 0;
-	}
-
-	local->stub = fop_lk_stub (frame, ha_lk, fd, cmd, lock);
-	local->state = CALLOC (1, child_count);
-	state = hafdp->fdstate;
-	LOCK (&hafdp->lock);
-	memcpy (local->state, state, child_count);
-	UNLOCK (&hafdp->lock);
-	if (cmd == F_GETLK) {
-		for (i = 0; i < child_count; i++) {
-			if (local->state[i])
-				break;
-		}
-		STACK_WIND (frame,
-			    ha_lk_getlk_cbk,
-			    children[i],
-			    children[i]->fops->lk,
-			    fd,
-			    cmd,
-			    lock);
-	} else if (cmd == F_SETLK && lock->l_type == F_UNLCK) {
-		for (i = 0; i < child_count; i++) {
-			if (local->state[i])
-				local->call_count++;
-		}
-		cnt = local->call_count;
-		for (i = 0; i < child_count; i++) {
-			if (local->state[i]) {
-				STACK_WIND (frame,
-					    ha_lk_setlk_unlck_cbk,
-					    children[i],
-					    children[i]->fops->lk,
-					    fd, cmd, lock);
-				if (--cnt == 0)
-					break;
-			}
-		}
-	} else {
-		for (i = 0; i < child_count; i++) {
-			if (local->state[i])
-				break;
-		}
-		STACK_WIND (frame,
-			    ha_lk_setlk_cbk,
-			    children[i],
-			    children[i]->fops->lk,
-			    fd,
-			    cmd,
-			    lock);
-	}
-	return 0;
-}
-
- int32_t
-ha_inode_entry_lk_cbk (call_frame_t *frame,
-		       void *cookie,
-		       xlator_t *this,
-		       int32_t op_ret,
-		       int32_t op_errno)
-{
-	int ret = -1;
-
-	ret = ha_handle_cbk (frame, cookie, op_ret, op_errno);
-	if (ret == 0) {
-		STACK_UNWIND (frame,
-			      op_ret,
-			      op_errno);
-	}
-	return 0;
-}
-
-int32_t
-ha_inodelk (call_frame_t *frame,
-	    xlator_t *this,
-	    loc_t *loc,
-	    int32_t cmd,
-	    struct flock *lock)
-{
-	ha_local_t *local = NULL;
-	int op_errno = 0;
-
-	op_errno = ha_alloc_init_inode (frame, loc->inode);
-	if (op_errno < 0) {
-		op_errno = -op_errno;
-		goto err;
-	}
-	local = frame->local;
-	local->stub = fop_inodelk_stub (frame, ha_inodelk, loc, cmd, lock);
-	STACK_WIND_COOKIE (frame,
-			   ha_inode_entry_lk_cbk,
-			   (void *)(long)local->active,
-			   HA_ACTIVE_CHILD(this, local),
-			   HA_ACTIVE_CHILD(this, local)->fops->inodelk,
-			   loc,
-			   cmd,
-			   lock);
-	return 0;
-err:
-	STACK_UNWIND (frame, -1, op_errno);
-	return 0;
-}
-
-int32_t
-ha_entrylk (call_frame_t *frame,
-	    xlator_t *this,
-	    loc_t *loc,
-	    const char *basename,
-	    entrylk_cmd cmd,
-	    entrylk_type type)
-{
-	ha_local_t *local = NULL;
-	int op_errno = 0;
-
-	op_errno = ha_alloc_init_inode (frame, loc->inode);
-	if (op_errno < 0) {
-		op_errno = -op_errno;
-		goto err;
-	}
-	local = frame->local;
-	local->stub = fop_entrylk_stub (frame, ha_entrylk, loc, basename, cmd, type);
-	STACK_WIND_COOKIE (frame,
-			   ha_inode_entry_lk_cbk,
-			   (void *)(long)local->active,
-			   HA_ACTIVE_CHILD(this, local),
-			   HA_ACTIVE_CHILD(this, local)->fops->entrylk,
-			   loc, basename, cmd, type);
-	return 0;
-err:
-	STACK_UNWIND (frame, -1, op_errno);
-	return 0;
-}
-
- int32_t
-ha_checksum_cbk (call_frame_t *frame,
-		 void *cookie,
-		 xlator_t *this,
-		 int32_t op_ret,
-		 int32_t op_errno,
-		 uint8_t *file_checksum,
-		 uint8_t *dir_checksum)
-{
-	int ret = -1;
-
-	ret = ha_handle_cbk (frame, cookie, op_ret, op_errno);
-	if (ret == 0) {
-		STACK_UNWIND (frame,
-			      op_ret,
-			      op_errno,
-			      file_checksum,
-			      dir_checksum);
-	}
-	return 0;
-}
-
-int32_t
-ha_checksum (call_frame_t *frame,
-	     xlator_t *this,
-	     loc_t *loc,
-	     int32_t flag)
-{
-	int op_errno = 0;
-	ha_local_t *local = NULL;
-
-	op_errno = ha_alloc_init_inode (frame, loc->inode);
-	if (op_errno < 0) {
-		op_errno = -op_errno;
-		goto err;
-	}
-	local = frame->local;
-	local->stub = fop_checksum_stub (frame, ha_checksum, loc, flag);
-
-	STACK_WIND_COOKIE (frame,
-			   ha_checksum_cbk,
-			   (void *)(long)local->active,
-			   HA_ACTIVE_CHILD(this, local),
-			   HA_ACTIVE_CHILD(this, local)->fops->checksum,
-			   loc,
-			   flag);
-	return 0;
-err:
-	STACK_UNWIND (frame, -1, op_errno, NULL, NULL);
-	return 0;
-}
-
-int32_t
-ha_readdir_cbk (call_frame_t *frame,
-		void *cookie,
-		xlator_t *this,
-		int32_t op_ret,
-		int32_t op_errno,
-		gf_dirent_t *entries)
-{
-	int ret = 0;
-
-	ret = ha_handle_cbk (frame, cookie, op_ret, op_errno);
-	if (ret == 0)
-		STACK_UNWIND (frame, op_ret, op_errno, entries);
-	return 0;
-}
-
-int32_t
-ha_readdir (call_frame_t *frame,
-	    xlator_t *this,
-	    fd_t *fd,
-	    size_t size,
-	    off_t off)
-{
-	ha_local_t *local = NULL;
-	int op_errno = 0;
-
-	op_errno = ha_alloc_init_fd (frame, fd);
-	if (op_errno < 0) {
-		op_errno = -op_errno;
-		goto err;
-	}
-	local = frame->local;
-	local->stub = fop_readdir_stub (frame, ha_readdir, fd, size, off);
-	STACK_WIND_COOKIE (frame,
-			   ha_readdir_cbk,
-			   (void *)(long)local->active,
-			   HA_ACTIVE_CHILD(this, local),
-			   HA_ACTIVE_CHILD(this, local)->fops->readdir,
-			   fd, size, off);
-	return 0;
-err:
-	STACK_UNWIND (frame, -1, ENOTCONN, NULL);
-	return 0;
-}
-
-/* Management operations */
-
- int32_t
-ha_stats_cbk (call_frame_t *frame,
-	      void *cookie,
-	      xlator_t *this,
-	      int32_t op_ret,
-	      int32_t op_errno,
-	      struct xlator_stats *stats)
-{
-	ha_local_t *local = NULL;
-	ha_private_t *pvt = NULL;
-	call_frame_t *prev_frame = NULL;
-	xlator_t **children = NULL;
-	int i = 0;
-
-	local = frame->local;
-	pvt = this->private;
-	prev_frame = cookie;
-	children = pvt->children;
-
-	if (op_ret == -1 && op_errno == ENOTCONN) {
-		for (i = 0; i < pvt->child_count; i++) {
-			if (prev_frame->this == children[i])
-				break;
-		}
-		i++;
-		for (; i < pvt->child_count; i++) {
-			if (pvt->state[i])
-				break;
-		}
-
-		if (i == pvt->child_count) {
-			STACK_UNWIND (frame, -1, ENOTCONN, NULL);
-			return 0;
-		}
-		STACK_WIND (frame,
-			    ha_stats_cbk,
-			    children[i],
-			    children[i]->mops->stats,
-			    local->flags);
-		return 0;
-	}
-
-	STACK_UNWIND (frame,
-		      op_ret,
-		      op_errno,
-		      stats);
-	return 0;
-}
-
-int32_t
-ha_stats (call_frame_t *frame,
-	  xlator_t *this,
-	  int32_t flags)
-{
-	ha_local_t *local = NULL;
-	ha_private_t *pvt = NULL;
-	xlator_t **children = NULL;
-	int i = 0;
-
-	local = frame->local = CALLOC (1, sizeof (*local));
-	pvt = this->private;
-	children = pvt->children;
-	for (i = 0; i < pvt->child_count; i++) {
-		if (pvt->state[i])
-			break;
-	}
-
-	if (i == pvt->child_count) {
-		STACK_UNWIND (frame, -1, ENOTCONN, NULL);
-		return 0;
-	}
-	local->flags = flags;
-
-	STACK_WIND (frame,
-		    ha_stats_cbk,
-		    children[i],
-		    children[i]->mops->stats,
-		    flags);
-	return 0;
-}
-
-
-int32_t
-ha_getspec_cbk (call_frame_t *frame,
-		void *cookie,
-		xlator_t *this,
-		int32_t op_ret,
-		int32_t op_errno,
-		char *spec_data)
-{
-	ha_local_t *local = NULL;
-	ha_private_t *pvt = NULL;
-	call_frame_t *prev_frame = NULL;
-	xlator_t **children = NULL;
-	int i = 0;
-
-	local = frame->local;
-	pvt = this->private;
-	prev_frame = cookie;
-	children = pvt->children;
-
-	if (op_ret == -1 && op_errno == ENOTCONN) {
-		for (i = 0; i < pvt->child_count; i++) {
-			if (prev_frame->this == children[i])
-				break;
-		}
-		i++;
-		for (; i < pvt->child_count; i++) {
-			if (pvt->state[i])
-				break;
-		}
-
-		if (i == pvt->child_count) {
-			STACK_UNWIND (frame, -1, ENOTCONN, NULL);
-			return 0;
-		}
-		STACK_WIND (frame,
-			    ha_getspec_cbk,
-			    children[i],
-			    children[i]->mops->getspec,
-			    local->pattern,
-			    local->flags);
-		return 0;
-	}
-
-	STACK_UNWIND (frame,
-		      op_ret,
-		      op_errno,
-		      spec_data);
-	return 0;
-}
-
-int32_t
-ha_getspec (call_frame_t *frame,
-	    xlator_t *this,
-	    const char *key,
-	    int32_t flags)
-{
-	ha_local_t *local = NULL;
-	ha_private_t *pvt = NULL;
-	xlator_t **children = NULL;
-	int i = 0;
-
-	local = frame->local = CALLOC (1, sizeof (*local));
-	pvt = this->private;
-	children = pvt->children;
-
-	local = frame->local = CALLOC (1, sizeof (*local));
-	for (i = 0; i < pvt->child_count; i++) {
-		if (pvt->state[i])
-			break;
-	}
-
-	if (i == pvt->child_count) {
-		STACK_UNWIND (frame, -1, ENOTCONN, NULL);
-		return 0;
-	}
-	local->flags = flags;
-	local->pattern = (char *)key;
-
-	STACK_WIND (frame,
-		    ha_getspec_cbk,
-		    children[i],
-		    children[i]->mops->getspec,
-		    key, flags);
-	return 0;
-}
-
-int32_t
-ha_closedir (xlator_t *this,
-	     fd_t *fd)
-{
-	hafd_t *hafdp = NULL;
-	int op_errno = 0;
-	uint64_t tmp_hafdp = 0;
-
-	op_errno = fd_ctx_del (fd, this, &tmp_hafdp);
-	if (op_errno != 0) {
-		gf_log (this->name, GF_LOG_ERROR, "fd_ctx_del() error");
-		return 0;
-	}
-	hafdp = (hafd_t *)(long)tmp_hafdp;
-
-	FREE (hafdp->fdstate);
-	FREE (hafdp->path);
-	LOCK_DESTROY (&hafdp->lock);
-	return 0;
-}
-
-int32_t
-ha_close (xlator_t *this,
-	  fd_t *fd)
-{
-	hafd_t *hafdp = NULL;
-	int op_errno = 0;
-	uint64_t tmp_hafdp = 0;
-
-	op_errno = fd_ctx_del (fd, this, &tmp_hafdp);
-	if (op_errno != 0) {
-		gf_log (this->name, GF_LOG_ERROR, "fd_ctx_del() error");
-		return 0;
-	}
-	hafdp = (hafd_t *)(long)tmp_hafdp;
-
-	FREE (hafdp->fdstate);
-	FREE (hafdp->path);
-	LOCK_DESTROY (&hafdp->lock);
-	return 0;
-}
-
-/* notify */
-int32_t
-notify (xlator_t *this,
-	int32_t event,
-	void *data,
-	...)
-{
-	ha_private_t *pvt = NULL;
-	int32_t i = 0, upcnt = 0;
-
-	pvt = this->private;
-	if (pvt == NULL) {
-		gf_log (this->name, GF_LOG_DEBUG, "got notify before init()");
-		return 0;
-	}
-
-	switch (event)
-	{
-	case GF_EVENT_CHILD_DOWN:
-	{
-		for (i = 0; i < pvt->child_count; i++) {
-			if (data == pvt->children[i])
-				break;
-		}
-	        gf_log (this->name, GF_LOG_DEBUG, "GF_EVENT_CHILD_DOWN from %s", pvt->children[i]->name);
-		pvt->state[i] = 0;
-		for (i = 0; i < pvt->child_count; i++) {
-			if (pvt->state[i])
-				break;
-		}
-		if (i == pvt->child_count) {
-			default_notify (this, event, data);
-		}
-	}
-	break;
-	case GF_EVENT_CHILD_UP:
-	{
-		for (i = 0; i < pvt->child_count; i++) {
-			if (data == pvt->children[i])
-				break;
-		}
-
-		gf_log (this->name, GF_LOG_DEBUG, "GF_EVENT_CHILD_UP from %s", pvt->children[i]->name);
-
-		pvt->state[i] = 1;
-
-		for (i = 0; i < pvt->child_count; i++) {
-			if (pvt->state[i])
-				upcnt++;
-		}
-
-		if (upcnt == 1) {
-			default_notify (this, event, data);
-		}
-	}
-	break;
-
-	default:
-	{
-		default_notify (this, event, data);
-	}
-	}
-
-	return 0;
-}
-
-int
-init (xlator_t *this)
-{
-	ha_private_t *pvt = NULL;
-	xlator_list_t *trav = NULL;
-	int count = 0, ret = 0;
-
-	if (!this->children) {
-		gf_log (this->name,GF_LOG_ERROR, 
-			"FATAL: ha should have one or more child defined");
-		return -1;
-	}
-
-	if (!this->parents) {
-		gf_log (this->name, GF_LOG_WARNING,
-			"dangling volume. check volfile ");
-	}
-  
-	trav = this->children;
-	pvt = CALLOC (1, sizeof (ha_private_t));
-
-	ret = dict_get_int32 (this->options, "preferred-subvolume", 
-			      &pvt->pref_subvol);
-	if (ret < 0) {
-		pvt->pref_subvol = -1;
-	}
-
-	trav = this->children;
-	while (trav) {
-		count++;
-		trav = trav->next;
-	}
-
-	pvt->child_count = count;
-	pvt->children = CALLOC (count, sizeof (xlator_t*));
-
-	trav = this->children;
-	count = 0;
-	while (trav) {
-		pvt->children[count] = trav->xlator;
-		count++;
-		trav = trav->next;
-	}
-
-	pvt->state = CALLOC (1, count);
-	this->private = pvt;
-	return 0;
-}
-
-void
-fini (xlator_t *this)
-{
-	ha_private_t *priv = NULL;
-	priv = this->private;
-	FREE (priv);
-	return;
-}
-
-
-struct xlator_fops fops = {
-	.lookup      = ha_lookup,
-	.stat        = ha_stat,
-	.readlink    = ha_readlink,
-	.mknod       = ha_mknod,
-	.mkdir       = ha_mkdir,
-	.unlink      = ha_unlink,
-	.rmdir       = ha_rmdir,
-	.symlink     = ha_symlink,
-	.rename      = ha_rename,
-	.link        = ha_link,
-	.chmod       = ha_chmod,
-	.chown       = ha_chown,
-	.truncate    = ha_truncate,
-	.utimens     = ha_utimens,
-	.create      = ha_create,
-	.open        = ha_open,
-	.readv       = ha_readv,
-	.writev      = ha_writev,
-	.statfs      = ha_statfs,
-	.flush       = ha_flush,
-	.fsync       = ha_fsync,
-	.setxattr    = ha_setxattr,
-	.getxattr    = ha_getxattr,
-	.removexattr = ha_removexattr,
-	.opendir     = ha_opendir,
-	.readdir     = ha_readdir,
-	.getdents    = ha_getdents,
-	.fsyncdir    = ha_fsyncdir,
-	.access      = ha_access,
-	.ftruncate   = ha_ftruncate,
-	.fstat       = ha_fstat,
-	.lk          = ha_lk,
-	.fchmod      = ha_fchmod,
-	.fchown      = ha_fchown,
-	.setdents    = ha_setdents,
-	.lookup_cbk  = ha_lookup_cbk,
-	.checksum    = ha_checksum,
-	.xattrop     = ha_xattrop,
-	.fxattrop    = ha_fxattrop
-};
-
-struct xlator_mops mops = {
-	.stats   = ha_stats,
-	.getspec = ha_getspec,
-};
-
-struct xlator_cbks cbks = {
-	.release    = ha_close,
-	.releasedir = ha_closedir,
-	.forget     = ha_forget,
-};
diff --git a/xlators/cluster/ha/src/ha.h b/xlators/cluster/ha/src/ha.h
deleted file mode 100644
index 7e4898ceb90..00000000000
--- a/xlators/cluster/ha/src/ha.h
+++ /dev/null
@@ -1,59 +0,0 @@
-/*
-  Copyright (c) 2008-2009 Z RESEARCH, Inc. <http://www.zresearch.com>
-  This file is part of GlusterFS.
-
-  GlusterFS is free software; you can redistribute it and/or modify
-  it under the terms of the GNU General Public License as published
-  by the Free Software Foundation; either version 3 of the License,
-  or (at your option) any later version.
-
-  GlusterFS is distributed in the hope that it will be useful, but
-  WITHOUT ANY WARRANTY; without even the implied warranty of
-  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
-  General Public License for more details.
-
-  You should have received a copy of the GNU General Public License
-  along with this program.  If not, see
-  <http://www.gnu.org/licenses/>.
-*/
-
-#ifndef __HA_H_
-#define __HA_H_
-
-typedef struct {
-	call_stub_t *stub;
-	int32_t op_ret, op_errno;
-	int32_t active, tries, revalidate, revalidate_error;
-	int32_t call_count;
-	char *state, *pattern;
-	dict_t *dict;
-	loc_t *loc;
-	struct stat buf;
-	fd_t *fd;
-	inode_t *inode;
-	int32_t flags;
-	int32_t first_success;
-} ha_local_t;
-
-typedef struct {
-	char *state;
-	xlator_t **children;
-	int child_count, pref_subvol;
-} ha_private_t;
-
-typedef struct {
-	char *fdstate;
-	char *path;
-	gf_lock_t lock;
-	int active;
-} hafd_t;
-
-#define HA_ACTIVE_CHILD(this, local) (((ha_private_t *)this->private)->children[local->active])
-
-extern int ha_alloc_init_fd (call_frame_t *frame, fd_t *fd);
-
-extern int ha_handle_cbk (call_frame_t *frame, void *cookie, int op_ret, int op_errno) ;
-
-extern int ha_alloc_init_inode (call_frame_t *frame, inode_t *inode);
-
-#endif
diff --git a/xlators/cluster/map/src/Makefile.am b/xlators/cluster/map/src/Makefile.am
deleted file mode 100644
index 44ee4d9eed3..00000000000
--- a/xlators/cluster/map/src/Makefile.am
+++ /dev/null
@@ -1,15 +0,0 @@
-xlator_LTLIBRARIES = map.la
-xlatordir = $(libdir)/glusterfs/$(PACKAGE_VERSION)/xlator/cluster
-
-map_la_LDFLAGS = -module -avoidversion 
-
-map_la_SOURCES = map.c map-helper.c
-map_la_LIBADD = $(top_builddir)/libglusterfs/src/libglusterfs.la
-
-noinst_HEADERS = map.h
-
-AM_CFLAGS = -fPIC -D_FILE_OFFSET_BITS=64 -D_GNU_SOURCE -Wall -D$(GF_HOST_OS) \
-	    -I$(top_srcdir)/libglusterfs/src -shared -nostartfiles $(GF_CFLAGS)
-
-CLEANFILES = 
-
diff --git a/xlators/cluster/map/src/map-helper.c b/xlators/cluster/map/src/map-helper.c
deleted file mode 100644
index 365eeb4900b..00000000000
--- a/xlators/cluster/map/src/map-helper.c
+++ /dev/null
@@ -1,357 +0,0 @@
-/*
-  Copyright (c) 2009-2009 Z RESEARCH, Inc. <http://www.zresearch.com>
-  This file is part of GlusterFS.
-
-  GlusterFS is free software; you can redistribute it and/or modify
-  it under the terms of the GNU General Public License as published
-  by the Free Software Foundation; either version 3 of the License,
-  or (at your option) any later version.
-
-  GlusterFS is distributed in the hope that it will be useful, but
-  WITHOUT ANY WARRANTY; without even the implied warranty of
-  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
-  General Public License for more details.
-
-  You should have received a copy of the GNU General Public License
-  along with this program.  If not, see
-  <http://www.gnu.org/licenses/>.
-*/
-
-#ifndef _CONFIG_H
-#define _CONFIG_H
-#include "config.h"
-#endif
-
-#include "xlator.h"
-#include "map.h"
-
-
-xlator_t *
-map_subvol_next (xlator_t *this, xlator_t *prev)
-{
-	map_private_t *priv = NULL;
-	xlator_t      *next = NULL;
-	int            i = 0;
-
-	priv = this->private;
-
-	for (i = 0; i < priv->child_count; i++) {
-		if (priv->xlarray[i].xl == prev) {
-			if ((i + 1) < priv->child_count)
-				next = priv->xlarray[i + 1].xl;
-			break;
-		}
-	}
-
-	return next;
-}
-
-int
-map_subvol_cnt (xlator_t *this, xlator_t *subvol)
-{
-	int i = 0;
-	int ret = -1;
-	map_private_t *priv = NULL;
-
-	priv = this->private;
-
-	for (i = 0; i < priv->child_count; i++) {
-		if (subvol == priv->xlarray[i].xl) {
-			ret = i;
-			break;
-		}
-	}
-
-	return ret;
-}
-
-int
-map_itransform (xlator_t *this, xlator_t *subvol, uint64_t x, uint64_t *y_p)
-{
-	map_private_t *priv = NULL;
-	int         cnt = 0;
-	int         max = 0;
-	uint64_t    y = 0;
-
-	if (x == ((uint64_t) -1)) {
-		y = (uint64_t) -1;
-		goto out;
-	}
-
-	priv = this->private;
-
-	max = priv->child_count;
-	cnt = map_subvol_cnt (this, subvol);
-
-	y = ((x * max) + cnt);
-
-out:
-	if (y_p)
-		*y_p = y;
-
-	return 0;
-}
-
-
-int
-map_deitransform (xlator_t *this, uint64_t y, xlator_t **subvol_p,
-		  uint64_t *x_p)
-{
-	int         cnt = 0;
-	int         max = 0;
-	uint64_t    x = 0;
-	xlator_t   *subvol = 0;
-	map_private_t *priv = NULL;
-
-	priv = this->private;
-	max = priv->child_count;
-
-	cnt = y % max;
-	x   = y / max;
-
-	subvol = priv->xlarray[cnt].xl;
-
-	if (subvol_p)
-		*subvol_p = subvol;
-
-	if (x_p)
-		*x_p = x;
-
-	return 0;
-}
-
-
-xlator_t *
-get_mapping_subvol_from_path (xlator_t *this, const char *path) 
-{
-	map_private_t      *priv = NULL;
-	struct map_pattern *map = NULL;
-
-	/* To make sure we handle '/' properly */
-	if (!strcmp (path, "/"))
-		return NULL;
-
-	priv = this->private;
-
-	map = priv->map;
-	while (map) {
-		if (!strncmp (map->directory, path, map->dir_len)) {
-			if ((path[map->dir_len] == '/') ||
-			    (path[map->dir_len] == '\0')) {
-				return map->xl;
-			}
-		}
-		
-		map = map->next;
-	}
-
-	return priv->default_xl;
-}
-
-xlator_t *
-get_mapping_subvol_from_ctx (xlator_t *this, inode_t *inode)
-{
-	uint64_t subvol = 0;
-	int      ret    = -1;
-
-	ret = inode_ctx_get (inode, this, &subvol);
-	if (ret != 0) 
-		return NULL;
-
-	return (xlator_t *)(long)subvol;
-}
-
-int
-check_multiple_volume_entry (xlator_t *this, 
-			     xlator_t *subvol)
-{
-	int ret = -1;
-	int idx = 0;
-	map_private_t *priv = NULL;
-
-	priv = this->private;
-	
-	for (idx = 0; idx < priv->child_count; idx++) {
-		if (priv->xlarray[idx].xl == subvol) {
-			if (priv->xlarray[idx].mapped) {
-				gf_log (this->name, GF_LOG_ERROR,
-					"subvolume '%s' is already mapped",
-					subvol->name);
-				goto out;
-			}
-			priv->xlarray[idx].mapped = 1;
-			ret = 0;
-			goto out;
-		}
-	}
-
-	gf_log (this->name, GF_LOG_ERROR,
-		"subvolume '%s' is not found",
-		subvol->name);
-	
- out:
-	return ret;
-}
-
-int
-verify_dir_and_assign_subvol (xlator_t *this, 
-			      const char *directory, 
-			      const char *subvol)
-{
-	int            default_flag = 0;
-	int            ret  = -1;
-	int            idx  = 0;
-	map_private_t *priv = NULL;
-	xlator_list_t *trav = NULL;
-	struct map_pattern *tmp_map = NULL;
-
-	priv = this->private;
-
-	/* check if directory is valid, ie, its a top level dir, and 
-	 * not includes a '*' in it.
-	 */
-	if (!strcmp ("*", directory)) {
-		default_flag = 1;
-	} else {
-		if (directory[0] != '/') {
-			gf_log (this->name, GF_LOG_ERROR,
-				"map takes absolute path, starting with '/'. "
-				"not '%s'", directory);
-			goto out;
-		}
-		for (idx = 1; idx < (strlen (directory) - 1); idx++) {
-			if (directory[idx] == '/') {
-				gf_log (this->name, GF_LOG_ERROR,
-					"map takes only top level directory, "
-					"not '%s'", directory);
-				goto out;
-			}
-		}
-	}
-
-	/* Assign proper subvolume */
-	trav = this->children;
-	while (trav) {
-		if (!strcmp (trav->xlator->name, subvol)) {
-			
-			/* Check if there is another directory for 
-			 * same volume, if yes, return error.
-			 */
-			ret = check_multiple_volume_entry (this, 
-							   trav->xlator);
-			if (ret != 0) {
-				goto out;
-			}
-
-			ret = 0;
-			if (default_flag) {
-				if (priv->default_xl) {
-					ret = -1;
-					gf_log (this->name, GF_LOG_ERROR,
-						"'*' specified more than "
-						"once. don't confuse me!!!");
-				}
-
-				priv->default_xl = trav->xlator;
-				goto out;
-			}
-
-			tmp_map = CALLOC (1, sizeof (struct map_pattern));
-			tmp_map->xl = trav->xlator;
-			tmp_map->dir_len = strlen (directory);
-
-			/* make sure that the top level directory starts 
-			 * with '/' and ends without '/'
-			 */
-			tmp_map->directory = strdup (directory);
-			if (directory[tmp_map->dir_len - 1] == '/') {
-				tmp_map->dir_len--;
-			}
-
-			if (!priv->map) 
-				priv->map = tmp_map;
-			else {
-				struct map_pattern *trav_map = NULL;
-				trav_map = priv->map;
-				while (trav_map->next)
-					trav_map = trav_map->next;
-				trav_map->next = tmp_map;
-			}
-			
-			goto out;
-		}
-
-		trav = trav->next;
-	}
-
-	gf_log (this->name, GF_LOG_ERROR, 
-		"map volume '%s' is not proper subvolume", subvol);
-
- out:
-	return ret;
-}
-
-int 
-assign_default_subvol (xlator_t *this, const char *default_xl)
-{
-	int ret = -1;
-	map_private_t *priv = NULL;
-	xlator_list_t *trav = NULL;
-
-	priv = this->private;
-	trav = this->children;
-
-	while (trav) {
-		if (!strcmp (trav->xlator->name, default_xl)) {
-			ret = check_multiple_volume_entry (this, 
-							   trav->xlator);
-			if (ret != 0) {
-				goto out;
-			}
-			if (priv->default_xl)
-				gf_log (this->name, GF_LOG_WARNING,
-					"default-volume option provided, "
-					"overriding earlier '*' option");
-			priv->default_xl = trav->xlator;
-			return 0;
-		}
-		trav = trav->next;
-	}
-
-	gf_log (this->name, GF_LOG_ERROR,
-		"default-volume value is not an valid subvolume. check again");
- out:
-	return -1;
-}
-
-void
-verify_if_all_subvolumes_got_used (xlator_t *this)
-{
-	int idx = 0;
-	map_private_t *priv = NULL;
-
-	priv = this->private;
-	
-	for (idx = 0; idx < priv->child_count; idx++) {
-		if (!priv->xlarray[idx].mapped) {
-			if (!priv->default_xl) {
-				priv->default_xl = priv->xlarray[idx].xl;
-				priv->xlarray[idx].mapped = 1;
-			} else {
-				gf_log (this->name, GF_LOG_WARNING,
-					"subvolume '%s' is not mapped to "
-					"any directory",
-					priv->xlarray[idx].xl->name);
-			}
-		}
-	}
-
-	if (!priv->default_xl) {
-		gf_log (this->name, GF_LOG_WARNING,
-			"default subvolume not specified, filesystem "
-			"may not work properly. Check 'map' translator "
-			"documentation for more info");
-	}
-
-	return ;
-}
diff --git a/xlators/cluster/map/src/map.c b/xlators/cluster/map/src/map.c
deleted file mode 100644
index c44b24ce5b2..00000000000
--- a/xlators/cluster/map/src/map.c
+++ /dev/null
@@ -1,2193 +0,0 @@
-/*
-  Copyright (c) 2009-2009 Z RESEARCH, Inc. <http://www.zresearch.com>
-  This file is part of GlusterFS.
-
-  GlusterFS is free software; you can redistribute it and/or modify
-  it under the terms of the GNU General Public License as published
-  by the Free Software Foundation; either version 3 of the License,
-  or (at your option) any later version.
-
-  GlusterFS is distributed in the hope that it will be useful, but
-  WITHOUT ANY WARRANTY; without even the implied warranty of
-  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
-  General Public License for more details.
-
-  You should have received a copy of the GNU General Public License
-  along with this program.  If not, see
-  <http://www.gnu.org/licenses/>.
-*/
-
-#ifndef _CONFIG_H
-#define _CONFIG_H
-#include "config.h"
-#endif
-
-#include "xlator.h"
-#include "map.h"
-
-/* For <op>_cbk functions */
-#include "defaults.c"
-
-
-int32_t
-map_stat (call_frame_t *frame,
-	  xlator_t *this,
-	  loc_t *loc)
-{
-	int32_t op_errno = 1;
-	xlator_t *subvol   = NULL;
-
-        VALIDATE_OR_GOTO (frame, err);
-        VALIDATE_OR_GOTO (this, err);
-        VALIDATE_OR_GOTO (loc, err);
-        VALIDATE_OR_GOTO (loc->inode, err);
-        VALIDATE_OR_GOTO (loc->path, err);
-
-	subvol = get_mapping_subvol_from_ctx (this, loc->inode);
-	if (!subvol) {
-		op_errno = EINVAL;
-		goto err;
-	}
-
-	STACK_WIND (frame,
-		    default_stat_cbk,
-		    subvol,
-		    subvol->fops->stat,
-		    loc);
-
-	return 0;
- err:
-	STACK_UNWIND (frame, -1, op_errno, NULL, NULL);
-
-	return 0;
-}
-
-
-int32_t
-map_chmod (call_frame_t *frame,
-	   xlator_t *this,
-	   loc_t *loc,
-	   mode_t mode)
-{
-	int32_t op_errno = 1;
-	xlator_t *subvol   = NULL;
-
-        VALIDATE_OR_GOTO (frame, err);
-        VALIDATE_OR_GOTO (this, err);
-        VALIDATE_OR_GOTO (loc, err);
-        VALIDATE_OR_GOTO (loc->inode, err);
-        VALIDATE_OR_GOTO (loc->path, err);
-
-	subvol = get_mapping_subvol_from_ctx (this, loc->inode);
-	if (!subvol) {
-		op_errno = EINVAL;
-		goto err;
-	}
-
-	STACK_WIND (frame,
-		    default_chmod_cbk,
-		    subvol,
-		    subvol->fops->chmod,
-		    loc,
-		    mode);
-	return 0;
- err:
-	STACK_UNWIND (frame, -1, op_errno, NULL, NULL);
-
-	return 0;
-}
-
-int32_t
-map_fchmod (call_frame_t *frame,
-	    xlator_t *this,
-	    fd_t *fd,
-	    mode_t mode)
-{
-	int32_t op_errno = 1;
-	xlator_t *subvol   = NULL;
-
-        VALIDATE_OR_GOTO (frame, err);
-        VALIDATE_OR_GOTO (this, err);
-        VALIDATE_OR_GOTO (fd, err);
-        VALIDATE_OR_GOTO (fd->inode, err);
-
-	subvol = get_mapping_subvol_from_ctx (this, fd->inode);
-	if (!subvol) {
-		op_errno = EINVAL;
-		goto err;
-	}
-
-	STACK_WIND (frame,
-		    default_fchmod_cbk,
-		    subvol,
-		    subvol->fops->fchmod,
-		    fd,
-		    mode);
-	return 0;
- err:
-	STACK_UNWIND (frame, -1, op_errno, NULL, NULL);
-
-	return 0;
-}
-
-int32_t
-map_chown (call_frame_t *frame,
-	   xlator_t *this,
-	   loc_t *loc,
-	   uid_t uid,
-	   gid_t gid)
-{
-	int32_t op_errno = 1;
-	xlator_t *subvol   = NULL;
-
-        VALIDATE_OR_GOTO (frame, err);
-        VALIDATE_OR_GOTO (this, err);
-        VALIDATE_OR_GOTO (loc, err);
-        VALIDATE_OR_GOTO (loc->inode, err);
-        VALIDATE_OR_GOTO (loc->path, err);
-
-	subvol = get_mapping_subvol_from_ctx (this, loc->inode);
-	if (!subvol) {
-		op_errno = EINVAL;
-		goto err;
-	}
-
-	STACK_WIND (frame,
-		    default_chown_cbk,
-		    subvol,
-		    subvol->fops->chown,
-		    loc,
-		    uid,
-		    gid);
-	return 0;
- err:
-	STACK_UNWIND (frame, -1, op_errno, NULL, NULL);
-
-	return 0;
-}
-
-int32_t
-map_fchown (call_frame_t *frame,
-	    xlator_t *this,
-	    fd_t *fd,
-	    uid_t uid,
-	    gid_t gid)
-{
-	int32_t op_errno = 1;
-	xlator_t *subvol   = NULL;
-
-        VALIDATE_OR_GOTO (frame, err);
-        VALIDATE_OR_GOTO (this, err);
-        VALIDATE_OR_GOTO (fd, err);
-        VALIDATE_OR_GOTO (fd->inode, err);
-
-	subvol = get_mapping_subvol_from_ctx (this, fd->inode);
-	if (!subvol) {
-		op_errno = EINVAL;
-		goto err;
-	}
-
-	STACK_WIND (frame,
-		    default_fchown_cbk,
-		    subvol,
-		    subvol->fops->fchown,
-		    fd,
-		    uid,
-		    gid);
-	return 0;
- err:
-	STACK_UNWIND (frame, -1, op_errno, NULL, NULL);
-
-	return 0;
-}
-
-int32_t
-map_truncate (call_frame_t *frame,
-	      xlator_t *this,
-	      loc_t *loc,
-	      off_t offset)
-{
-	int32_t op_errno = 1;
-	xlator_t *subvol   = NULL;
-
-        VALIDATE_OR_GOTO (frame, err);
-        VALIDATE_OR_GOTO (this, err);
-        VALIDATE_OR_GOTO (loc, err);
-        VALIDATE_OR_GOTO (loc->inode, err);
-        VALIDATE_OR_GOTO (loc->path, err);
-
-	subvol = get_mapping_subvol_from_ctx (this, loc->inode);
-	if (!subvol) {
-		op_errno = EINVAL;
-		goto err;
-	}
-
-	STACK_WIND (frame,
-		    default_truncate_cbk,
-		    subvol,
-		    subvol->fops->truncate,
-		    loc,
-		    offset);
-	return 0;
- err:
-	STACK_UNWIND (frame, -1, op_errno, NULL, NULL);
-
-	return 0;
-}
-
-int32_t
-map_ftruncate (call_frame_t *frame,
-	       xlator_t *this,
-	       fd_t *fd,
-	       off_t offset)
-{
-	int32_t op_errno = 1;
-	xlator_t *subvol   = NULL;
-
-        VALIDATE_OR_GOTO (frame, err);
-        VALIDATE_OR_GOTO (this, err);
-        VALIDATE_OR_GOTO (fd, err);
-        VALIDATE_OR_GOTO (fd->inode, err);
-
-	subvol = get_mapping_subvol_from_ctx (this, fd->inode);
-	if (!subvol) {
-		op_errno = EINVAL;
-		goto err;
-	}
-
-	STACK_WIND (frame,
-		    default_ftruncate_cbk,
-		    subvol,
-		    subvol->fops->ftruncate,
-		    fd,
-		    offset);
-	return 0;
- err:
-	STACK_UNWIND (frame, -1, op_errno, NULL, NULL);
-
-	return 0;
-}
-
-int32_t
-map_utimens (call_frame_t *frame,
-	     xlator_t *this,
-	     loc_t *loc,
-	     struct timespec tv[2])
-{
-	int32_t op_errno = 1;
-	xlator_t *subvol   = NULL;
-
-        VALIDATE_OR_GOTO (frame, err);
-        VALIDATE_OR_GOTO (this, err);
-        VALIDATE_OR_GOTO (loc, err);
-        VALIDATE_OR_GOTO (loc->inode, err);
-        VALIDATE_OR_GOTO (loc->path, err);
-
-	subvol = get_mapping_subvol_from_ctx (this, loc->inode);
-	if (!subvol) {
-		op_errno = EINVAL;
-		goto err;
-	}
-
-	STACK_WIND (frame,
-		    default_utimens_cbk,
-		    subvol,
-		    subvol->fops->utimens,
-		    loc,
-		    tv);
-	return 0;
- err:
-	STACK_UNWIND (frame, -1, op_errno, NULL, NULL);
-
-	return 0;
-}
-
-int32_t
-map_access (call_frame_t *frame,
-	    xlator_t *this,
-	    loc_t *loc,
-	    int32_t mask)
-{
-	int32_t op_errno = 1;
-	xlator_t *subvol   = NULL;
-
-        VALIDATE_OR_GOTO (frame, err);
-        VALIDATE_OR_GOTO (this, err);
-        VALIDATE_OR_GOTO (loc, err);
-        VALIDATE_OR_GOTO (loc->inode, err);
-        VALIDATE_OR_GOTO (loc->path, err);
-
-	subvol = get_mapping_subvol_from_ctx (this, loc->inode);
-	if (!subvol) {
-		op_errno = EINVAL;
-		goto err;
-	}
-
-	STACK_WIND (frame,
-		    default_access_cbk,
-		    subvol,
-		    subvol->fops->access,
-		    loc,
-		    mask);
-	return 0;
- err:
-	STACK_UNWIND (frame, -1, op_errno, NULL, NULL);
-
-	return 0;
-}
-
-
-int32_t
-map_readlink (call_frame_t *frame,
-	      xlator_t *this,
-	      loc_t *loc,
-	      size_t size)
-{
-	int32_t op_errno = 1;
-	xlator_t *subvol   = NULL;
-
-        VALIDATE_OR_GOTO (frame, err);
-        VALIDATE_OR_GOTO (this, err);
-        VALIDATE_OR_GOTO (loc, err);
-        VALIDATE_OR_GOTO (loc->inode, err);
-        VALIDATE_OR_GOTO (loc->path, err);
-
-	subvol = get_mapping_subvol_from_ctx (this, loc->inode);
-	if (!subvol) {
-		op_errno = EINVAL;
-		goto err;
-	}
-
-	STACK_WIND (frame,
-		    default_readlink_cbk,
-		    subvol,
-		    subvol->fops->readlink,
-		    loc,
-		    size);
-	return 0;
- err:
-	STACK_UNWIND (frame, -1, op_errno, NULL, NULL);
-
-	return 0;
-}
-
-int32_t
-map_unlink (call_frame_t *frame,
-	    xlator_t *this,
-	    loc_t *loc)
-{
-	int32_t op_errno = 1;
-	xlator_t *subvol   = NULL;
-
-        VALIDATE_OR_GOTO (frame, err);
-        VALIDATE_OR_GOTO (this, err);
-        VALIDATE_OR_GOTO (loc, err);
-        VALIDATE_OR_GOTO (loc->inode, err);
-        VALIDATE_OR_GOTO (loc->path, err);
-
-	subvol = get_mapping_subvol_from_ctx (this, loc->inode);
-	if (!subvol) {
-		op_errno = EINVAL;
-		goto err;
-	}
-
-	STACK_WIND (frame,
-		    default_unlink_cbk,
-		    subvol,
-		    subvol->fops->unlink,
-		    loc);
-	return 0;
- err:
-	STACK_UNWIND (frame, -1, op_errno, NULL, NULL);
-
-	return 0;
-}
-
-int32_t
-map_rmdir (call_frame_t *frame,
-	   xlator_t *this,
-	   loc_t *loc)
-{
-	int32_t op_errno = 1;
-	xlator_t *subvol   = NULL;
-
-        VALIDATE_OR_GOTO (frame, err);
-        VALIDATE_OR_GOTO (this, err);
-        VALIDATE_OR_GOTO (loc, err);
-        VALIDATE_OR_GOTO (loc->inode, err);
-        VALIDATE_OR_GOTO (loc->path, err);
-
-	subvol = get_mapping_subvol_from_ctx (this, loc->inode);
-	if (!subvol) {
-		op_errno = EINVAL;
-		goto err;
-	}
-
-	STACK_WIND (frame,
-		    default_rmdir_cbk,
-		    subvol,
-		    subvol->fops->rmdir,
-		    loc);
-	return 0;
- err:
-	STACK_UNWIND (frame, -1, op_errno, NULL, NULL);
-
-	return 0;
-}
-
-
-int32_t
-map_rename (call_frame_t *frame,
-	    xlator_t *this,
-	    loc_t *oldloc,
-	    loc_t *newloc)
-{
-	int32_t op_errno = 1;
-	xlator_t *old_subvol = NULL;
-	xlator_t *new_subvol = NULL;
-
-        VALIDATE_OR_GOTO (frame, err);
-        VALIDATE_OR_GOTO (this, err);
-        VALIDATE_OR_GOTO (oldloc, err);
-        VALIDATE_OR_GOTO (oldloc->inode, err);
-        VALIDATE_OR_GOTO (oldloc->path, err);
-        VALIDATE_OR_GOTO (newloc, err);
-
-	old_subvol = get_mapping_subvol_from_ctx (this, oldloc->inode);
-	if (!old_subvol) {
-		op_errno = EINVAL;
-		goto err;
-	}
-
-	if (newloc->path) {
-		new_subvol = get_mapping_subvol_from_path (this, 
-							   newloc->path);
-		if (new_subvol && (new_subvol != old_subvol)) {
-			op_errno = EXDEV;
-			goto err;
-		}
-	}
-
-	STACK_WIND (frame,
-		    default_rename_cbk,
-		    old_subvol,
-		    old_subvol->fops->rename,
-		    oldloc, newloc);
-	return 0;
- err:
-	STACK_UNWIND (frame, -1, op_errno, NULL, NULL);
-
-	return 0;
-}
-
-
-int32_t
-map_link (call_frame_t *frame,
-	  xlator_t *this,
-	  loc_t *oldloc,
-	  loc_t *newloc)
-{
-	int32_t op_errno = 1;
-	xlator_t *old_subvol = NULL;
-	xlator_t *new_subvol = NULL;
-
-        VALIDATE_OR_GOTO (frame, err);
-        VALIDATE_OR_GOTO (this, err);
-        VALIDATE_OR_GOTO (oldloc, err);
-        VALIDATE_OR_GOTO (oldloc->inode, err);
-        VALIDATE_OR_GOTO (oldloc->path, err);
-        VALIDATE_OR_GOTO (newloc, err);
-
-	old_subvol = get_mapping_subvol_from_ctx (this, oldloc->inode);
-	if (!old_subvol) {
-		op_errno = EINVAL;
-		goto err;
-	}
-
-	if (newloc->path) {
-		new_subvol = get_mapping_subvol_from_path (this, 
-							   newloc->path);
-		if (new_subvol && (new_subvol != old_subvol)) {
-			op_errno = EXDEV;
-			goto err;
-		}
-	}
-
-	STACK_WIND (frame,
-		    default_link_cbk,
-		    old_subvol,
-		    old_subvol->fops->link,
-		    oldloc, newloc);
-	return 0;
- err:
-	STACK_UNWIND (frame, -1, op_errno, NULL, NULL);
-
-	return 0;
-}
-
-
-int32_t
-map_open (call_frame_t *frame,
-	  xlator_t *this,
-	  loc_t *loc,
-	  int32_t flags, fd_t *fd)
-{
-	int32_t op_errno = 1;
-	xlator_t *subvol   = NULL;
-
-        VALIDATE_OR_GOTO (frame, err);
-        VALIDATE_OR_GOTO (this, err);
-        VALIDATE_OR_GOTO (loc, err);
-        VALIDATE_OR_GOTO (loc->inode, err);
-        VALIDATE_OR_GOTO (loc->path, err);
-
-	subvol = get_mapping_subvol_from_ctx (this, loc->inode);
-	if (!subvol) {
-		op_errno = EINVAL;
-		goto err;
-	}
-
-	STACK_WIND (frame,
-		    default_open_cbk,
-		    subvol,
-		    subvol->fops->open,
-		    loc, flags, fd);
-	return 0;
- err:
-	STACK_UNWIND (frame, -1, op_errno, NULL, NULL);
-
-	return 0;
-}
-
-int32_t
-map_readv (call_frame_t *frame,
-	   xlator_t *this,
-	   fd_t *fd,
-	   size_t size,
-	   off_t offset)
-{
-	int32_t op_errno = 1;
-	xlator_t *subvol   = NULL;
-
-        VALIDATE_OR_GOTO (frame, err);
-        VALIDATE_OR_GOTO (this, err);
-        VALIDATE_OR_GOTO (fd, err);
-        VALIDATE_OR_GOTO (fd->inode, err);
-
-	subvol = get_mapping_subvol_from_ctx (this, fd->inode);
-	if (!subvol) {
-		op_errno = EINVAL;
-		goto err;
-	}
-
-	STACK_WIND (frame,
-		    default_readv_cbk,
-		    subvol,
-		    subvol->fops->readv,
-		    fd,
-		    size,
-		    offset);
-	return 0;
- err:
-	STACK_UNWIND (frame, -1, op_errno, NULL, NULL);
-
-	return 0;
-}
-
-int32_t
-map_writev (call_frame_t *frame,
-	    xlator_t *this,
-	    fd_t *fd,
-	    struct iovec *vector,
-	    int32_t count,
-	    off_t off)
-{
-	int32_t op_errno = 1;
-	xlator_t *subvol   = NULL;
-
-        VALIDATE_OR_GOTO (frame, err);
-        VALIDATE_OR_GOTO (this, err);
-        VALIDATE_OR_GOTO (fd, err);
-        VALIDATE_OR_GOTO (fd->inode, err);
-
-	subvol = get_mapping_subvol_from_ctx (this, fd->inode);
-	if (!subvol) {
-		op_errno = EINVAL;
-		goto err;
-	}
-
-	STACK_WIND (frame,
-		    default_writev_cbk,
-		    subvol,
-		    subvol->fops->writev,
-		    fd,
-		    vector,
-		    count,
-		    off);
-	return 0;
- err:
-	STACK_UNWIND (frame, -1, op_errno, NULL, NULL);
-
-	return 0;
-}
-
-int32_t
-map_flush (call_frame_t *frame,
-	   xlator_t *this,
-	   fd_t *fd)
-{
-	int32_t op_errno = 1;
-	xlator_t *subvol   = NULL;
-
-        VALIDATE_OR_GOTO (frame, err);
-        VALIDATE_OR_GOTO (this, err);
-        VALIDATE_OR_GOTO (fd, err);
-        VALIDATE_OR_GOTO (fd->inode, err);
-
-	subvol = get_mapping_subvol_from_ctx (this, fd->inode);
-	if (!subvol) {
-		op_errno = EINVAL;
-		goto err;
-	}
-
-	STACK_WIND (frame,
-		    default_flush_cbk,
-		    subvol,
-		    subvol->fops->flush,
-		    fd);
-	return 0;
- err:
-	STACK_UNWIND (frame, -1, op_errno, NULL, NULL);
-
-	return 0;
-}
-
-
-int32_t
-map_fsync (call_frame_t *frame,
-	   xlator_t *this,
-	   fd_t *fd,
-	   int32_t flags)
-{
-	int32_t op_errno = 1;
-	xlator_t *subvol   = NULL;
-
-        VALIDATE_OR_GOTO (frame, err);
-        VALIDATE_OR_GOTO (this, err);
-        VALIDATE_OR_GOTO (fd, err);
-        VALIDATE_OR_GOTO (fd->inode, err);
-
-	subvol = get_mapping_subvol_from_ctx (this, fd->inode);
-	if (!subvol) {
-		op_errno = EINVAL;
-		goto err;
-	}
-
-	STACK_WIND (frame,
-		    default_fsync_cbk,
-		    subvol,
-		    subvol->fops->fsync,
-		    fd,
-		    flags);
-	return 0;
- err:
-	STACK_UNWIND (frame, -1, op_errno, NULL, NULL);
-
-	return 0;
-}
-
-int32_t
-map_fstat (call_frame_t *frame,
-	   xlator_t *this,
-	   fd_t *fd)
-{
-	int32_t op_errno = 1;
-	xlator_t *subvol   = NULL;
-
-        VALIDATE_OR_GOTO (frame, err);
-        VALIDATE_OR_GOTO (this, err);
-        VALIDATE_OR_GOTO (fd, err);
-        VALIDATE_OR_GOTO (fd->inode, err);
-
-	subvol = get_mapping_subvol_from_ctx (this, fd->inode);
-	if (!subvol) {
-		op_errno = EINVAL;
-		goto err;
-	}
-
-	STACK_WIND (frame,
-		    default_fstat_cbk,
-		    subvol,
-		    subvol->fops->fstat,
-		    fd);
-	return 0;
- err:
-	STACK_UNWIND (frame, -1, op_errno, NULL, NULL);
-
-	return 0;
-}
-
-int32_t
-map_getdents (call_frame_t *frame,
-	      xlator_t *this,
-	      fd_t *fd,
-	      size_t size,
-	      off_t offset,
-	      int32_t flag)
-{
-	int32_t op_errno = 1;
-	xlator_t *subvol   = NULL;
-
-        VALIDATE_OR_GOTO (frame, err);
-        VALIDATE_OR_GOTO (this, err);
-        VALIDATE_OR_GOTO (fd, err);
-        VALIDATE_OR_GOTO (fd->inode, err);
-
-	subvol = get_mapping_subvol_from_ctx (this, fd->inode);
-	if (!subvol) {
-		op_errno = EINVAL;
-		goto err;
-	}
-
-	STACK_WIND (frame,
-		    default_getdents_cbk,
-		    subvol,
-		    subvol->fops->getdents,
-		    fd,
-		    size,
-		    offset,
-		    flag);
-	return 0;
- err:
-	STACK_UNWIND (frame, -1, op_errno, NULL, NULL);
-
-	return 0;
-}
-
-int32_t
-map_setdents (call_frame_t *frame,
-	      xlator_t *this,
-	      fd_t *fd,
-	      int32_t flags,
-	      dir_entry_t *entries,
-	      int32_t count)
-{
-	int32_t op_errno = 1;
-	xlator_t *subvol   = NULL;
-
-        VALIDATE_OR_GOTO (frame, err);
-        VALIDATE_OR_GOTO (this, err);
-        VALIDATE_OR_GOTO (fd, err);
-        VALIDATE_OR_GOTO (fd->inode, err);
-
-	subvol = get_mapping_subvol_from_ctx (this, fd->inode);
-	if (!subvol) {
-		op_errno = EINVAL;
-		goto err;
-	}
-
-	STACK_WIND (frame,
-		    default_setdents_cbk,
-		    subvol,
-		    subvol->fops->setdents,
-		    fd,
-		    flags,
-		    entries,
-		    count);
-	return 0;
- err:
-	STACK_UNWIND (frame, -1, op_errno, NULL, NULL);
-
-	return 0;
-}
-
-
-int32_t
-map_fsyncdir (call_frame_t *frame,
-	      xlator_t *this,
-	      fd_t *fd,
-	      int32_t flags)
-{
-	int32_t op_errno = 1;
-	xlator_t *subvol   = NULL;
-
-        VALIDATE_OR_GOTO (frame, err);
-        VALIDATE_OR_GOTO (this, err);
-        VALIDATE_OR_GOTO (fd, err);
-        VALIDATE_OR_GOTO (fd->inode, err);
-
-	subvol = get_mapping_subvol_from_ctx (this, fd->inode);
-	if (!subvol) {
-		op_errno = EINVAL;
-		goto err;
-	}
-
-	STACK_WIND (frame,
-		    default_fsyncdir_cbk,
-		    subvol,
-		    subvol->fops->fsyncdir,
-		    fd,
-		    flags);
-	return 0;
- err:
-	STACK_UNWIND (frame, -1, op_errno, NULL, NULL);
-
-	return 0;
-}
-
-
-
-
-int32_t
-map_setxattr (call_frame_t *frame,
-	      xlator_t *this,
-	      loc_t *loc,
-	      dict_t *dict,
-	      int32_t flags)
-{
-	/* TODO: support for 'get' 'put' API */
-	int32_t op_errno = 1;
-	xlator_t *subvol   = NULL;
-
-        VALIDATE_OR_GOTO (frame, err);
-        VALIDATE_OR_GOTO (this, err);
-        VALIDATE_OR_GOTO (loc, err);
-        VALIDATE_OR_GOTO (loc->inode, err);
-        VALIDATE_OR_GOTO (loc->path, err);
-
-	subvol = get_mapping_subvol_from_ctx (this, loc->inode);
-	if (!subvol) {
-		op_errno = EINVAL;
-		goto err;
-	}
-
-	STACK_WIND (frame,
-		    default_setxattr_cbk,
-		    subvol,
-		    subvol->fops->setxattr,
-		    loc,
-		    dict,
-		    flags);
-	return 0;
- err:
-	STACK_UNWIND (frame, -1, op_errno, NULL, NULL);
-
-	return 0;
-}
-
-int32_t
-map_getxattr (call_frame_t *frame,
-	      xlator_t *this,
-	      loc_t *loc,
-	      const char *name)
-{
-	/* TODO: support for 'get' 'put' API */
-	int32_t op_errno = 1;
-	xlator_t *subvol   = NULL;
-
-        VALIDATE_OR_GOTO (frame, err);
-        VALIDATE_OR_GOTO (this, err);
-        VALIDATE_OR_GOTO (loc, err);
-        VALIDATE_OR_GOTO (loc->inode, err);
-        VALIDATE_OR_GOTO (loc->path, err);
-
-	subvol = get_mapping_subvol_from_ctx (this, loc->inode);
-	if (!subvol) {
-		op_errno = EINVAL;
-		goto err;
-	}
-
-	STACK_WIND (frame,
-		    default_getxattr_cbk,
-		    subvol,
-		    subvol->fops->getxattr,
-		    loc,
-		    name);
-	return 0;
- err:
-	STACK_UNWIND (frame, -1, op_errno, NULL, NULL);
-
-	return 0;
-}
-
-int32_t
-map_xattrop (call_frame_t *frame,
-	     xlator_t *this,
-	     loc_t *loc,
-	     gf_xattrop_flags_t flags,
-	     dict_t *dict)
-{
-	int32_t op_errno = 1;
-	xlator_t *subvol   = NULL;
-
-        VALIDATE_OR_GOTO (frame, err);
-        VALIDATE_OR_GOTO (this, err);
-        VALIDATE_OR_GOTO (loc, err);
-        VALIDATE_OR_GOTO (loc->inode, err);
-        VALIDATE_OR_GOTO (loc->path, err);
-
-	subvol = get_mapping_subvol_from_ctx (this, loc->inode);
-	if (!subvol) {
-		op_errno = EINVAL;
-		goto err;
-	}
-
-	STACK_WIND (frame,
-		    default_xattrop_cbk,
-		    subvol,
-		    subvol->fops->xattrop,
-		    loc,
-		    flags,
-		    dict);
-	return 0;
- err:
-	STACK_UNWIND (frame, -1, op_errno, NULL, NULL);
-
-	return 0;
-}
-
-int32_t
-map_fxattrop (call_frame_t *frame,
-	      xlator_t *this,
-	      fd_t *fd,
-	      gf_xattrop_flags_t flags,
-	      dict_t *dict)
-{
-	int32_t op_errno = 1;
-	xlator_t *subvol   = NULL;
-
-        VALIDATE_OR_GOTO (frame, err);
-        VALIDATE_OR_GOTO (this, err);
-        VALIDATE_OR_GOTO (fd, err);
-        VALIDATE_OR_GOTO (fd->inode, err);
-
-	subvol = get_mapping_subvol_from_ctx (this, fd->inode);
-	if (!subvol) {
-		op_errno = EINVAL;
-		goto err;
-	}
-
-	STACK_WIND (frame,
-		    default_fxattrop_cbk,
-		    subvol,
-		    subvol->fops->fxattrop,
-		    fd,
-		    flags,
-		    dict);
-	return 0;
- err:
-	STACK_UNWIND (frame, -1, op_errno, NULL, NULL);
-
-	return 0;
-}
-
-int32_t
-map_removexattr (call_frame_t *frame,
-		 xlator_t *this,
-		 loc_t *loc,
-		 const char *name)
-{
-	int32_t op_errno = 1;
-	xlator_t *subvol   = NULL;
-
-        VALIDATE_OR_GOTO (frame, err);
-        VALIDATE_OR_GOTO (this, err);
-        VALIDATE_OR_GOTO (loc, err);
-        VALIDATE_OR_GOTO (loc->inode, err);
-        VALIDATE_OR_GOTO (loc->path, err);
-
-	subvol = get_mapping_subvol_from_ctx (this, loc->inode);
-	if (!subvol) {
-		op_errno = EINVAL;
-		goto err;
-	}
-
-	STACK_WIND (frame,
-		    default_removexattr_cbk,
-		    subvol,
-		    subvol->fops->removexattr,
-		    loc,
-		    name);
-	return 0;
- err:
-	STACK_UNWIND (frame, -1, op_errno, NULL, NULL);
-
-	return 0;
-}
-
-int32_t
-map_lk (call_frame_t *frame,
-	xlator_t *this,
-	fd_t *fd,
-	int32_t cmd,
-	struct flock *lock)
-{
-	int32_t op_errno = 1;
-	xlator_t *subvol   = NULL;
-
-        VALIDATE_OR_GOTO (frame, err);
-        VALIDATE_OR_GOTO (this, err);
-        VALIDATE_OR_GOTO (fd, err);
-        VALIDATE_OR_GOTO (fd->inode, err);
-
-	subvol = get_mapping_subvol_from_ctx (this, fd->inode);
-	if (!subvol) {
-		op_errno = EINVAL;
-		goto err;
-	}
-
-	STACK_WIND (frame,
-		    default_lk_cbk,
-		    subvol,
-		    subvol->fops->lk,
-		    fd,
-		    cmd,
-		    lock);
-	return 0;
- err:
-	STACK_UNWIND (frame, -1, op_errno, NULL, NULL);
-
-	return 0;
-}
-
-
-int32_t
-map_inodelk (call_frame_t *frame, xlator_t *this,
-	     loc_t *loc, int32_t cmd, struct flock *lock)
-{
-	int32_t op_errno = 1;
-	xlator_t *subvol   = NULL;
-
-        VALIDATE_OR_GOTO (frame, err);
-        VALIDATE_OR_GOTO (this, err);
-        VALIDATE_OR_GOTO (loc, err);
-        VALIDATE_OR_GOTO (loc->inode, err);
-        VALIDATE_OR_GOTO (loc->path, err);
-
-	subvol = get_mapping_subvol_from_ctx (this, loc->inode);
-	if (!subvol) {
-		op_errno = EINVAL;
-		goto err;
-	}
-
-	STACK_WIND (frame,
-		    default_inodelk_cbk,
-		    subvol,
-		    subvol->fops->inodelk,
-		    loc, cmd, lock);
-	return 0;
- err:
-	STACK_UNWIND (frame, -1, op_errno, NULL, NULL);
-
-	return 0;
-}
-
-
-int32_t
-map_finodelk (call_frame_t *frame, xlator_t *this,
-	      fd_t *fd, int32_t cmd, struct flock *lock)
-{
-	int32_t op_errno = 1;
-	xlator_t *subvol   = NULL;
-
-        VALIDATE_OR_GOTO (frame, err);
-        VALIDATE_OR_GOTO (this, err);
-        VALIDATE_OR_GOTO (fd, err);
-        VALIDATE_OR_GOTO (fd->inode, err);
-
-	subvol = get_mapping_subvol_from_ctx (this, fd->inode);
-	if (!subvol) {
-		op_errno = EINVAL;
-		goto err;
-	}
-
-	STACK_WIND (frame,
-		    default_finodelk_cbk,
-		    subvol,
-		    subvol->fops->finodelk,
-		    fd, cmd, lock);
-	return 0;
- err:
-	STACK_UNWIND (frame, -1, op_errno, NULL, NULL);
-
-	return 0;
-}
-
-int32_t
-map_entrylk (call_frame_t *frame, xlator_t *this,
-	     loc_t *loc, const char *basename,
-	     entrylk_cmd cmd, entrylk_type type)
-{
-	int32_t op_errno = 1;
-	xlator_t *subvol   = NULL;
-
-        VALIDATE_OR_GOTO (frame, err);
-        VALIDATE_OR_GOTO (this, err);
-        VALIDATE_OR_GOTO (loc, err);
-        VALIDATE_OR_GOTO (loc->inode, err);
-        VALIDATE_OR_GOTO (loc->path, err);
-
-	subvol = get_mapping_subvol_from_ctx (this, loc->inode);
-	if (!subvol) {
-		op_errno = EINVAL;
-		goto err;
-	}
-
-	STACK_WIND (frame, default_entrylk_cbk,
-		    subvol,
-		    subvol->fops->entrylk,
-		    loc, basename, cmd, type);
-	return 0;
- err:
-	STACK_UNWIND (frame, -1, op_errno, NULL, NULL);
-
-	return 0;
-}
-
-int32_t
-map_fentrylk (call_frame_t *frame, xlator_t *this,
-	      fd_t *fd, const char *basename,
-	      entrylk_cmd cmd, entrylk_type type)
-{
-	int32_t op_errno = 1;
-	xlator_t *subvol   = NULL;
-
-        VALIDATE_OR_GOTO (frame, err);
-        VALIDATE_OR_GOTO (this, err);
-        VALIDATE_OR_GOTO (fd, err);
-        VALIDATE_OR_GOTO (fd->inode, err);
-
-	subvol = get_mapping_subvol_from_ctx (this, fd->inode);
-	if (!subvol) {
-		op_errno = EINVAL;
-		goto err;
-	}
-
-	STACK_WIND (frame, default_fentrylk_cbk,
-		    subvol,
-		    subvol->fops->fentrylk,
-		    fd, basename, cmd, type);
-	return 0;
- err:
-	STACK_UNWIND (frame, -1, op_errno, NULL, NULL);
-
-	return 0;
-}
-
-int32_t
-map_checksum (call_frame_t *frame,
-	      xlator_t *this,
-	      loc_t *loc,
-	      int32_t flag)
-{
-	int32_t op_errno = 1;
-	xlator_t *subvol   = NULL;
-
-        VALIDATE_OR_GOTO (frame, err);
-        VALIDATE_OR_GOTO (this, err);
-        VALIDATE_OR_GOTO (loc, err);
-        VALIDATE_OR_GOTO (loc->inode, err);
-        VALIDATE_OR_GOTO (loc->path, err);
-
-	subvol = get_mapping_subvol_from_ctx (this, loc->inode);
-	if (!subvol) {
-		op_errno = EINVAL;
-		goto err;
-	}
-
-	STACK_WIND (frame,
-		    default_checksum_cbk,
-		    subvol,
-		    subvol->fops->checksum,
-		    loc,
-		    flag);
-	return 0;
- err:
-	STACK_UNWIND (frame, -1, op_errno, NULL, NULL);
-
-	return 0;
-}
-
-static int32_t
-map_newentry_cbk (call_frame_t *frame,
-		  void *cookie,
-		  xlator_t *this,
-		  int32_t op_ret,
-		  int32_t op_errno,
-		  inode_t *inode,
-		  struct stat *buf)
-{
-        call_frame_t *prev = NULL;
-        prev  = cookie;
-	
-	map_itransform (this, prev->this, buf->st_ino, &buf->st_ino);
-
-	STACK_UNWIND (frame, op_ret, op_errno, inode, buf);
-	return 0;
-
-}
-
-
-int32_t
-map_mknod (call_frame_t *frame,
-	   xlator_t *this,
-	   loc_t *loc,
-	   mode_t mode,
-	   dev_t rdev)
-{
-	int32_t op_errno = 1;
-	xlator_t *subvol   = NULL;
-
-        VALIDATE_OR_GOTO (frame, err);
-        VALIDATE_OR_GOTO (this, err);
-        VALIDATE_OR_GOTO (loc, err);
-        VALIDATE_OR_GOTO (loc->path, err);
-        VALIDATE_OR_GOTO (loc->inode, err);
-
-	subvol = get_mapping_subvol_from_path (this, loc->path);
-	if (!subvol) {
-		op_errno = EINVAL;
-		goto err;
-	}
-	
-	op_errno = inode_ctx_put (loc->inode, this, (uint64_t)(long)subvol);
-	if (op_errno != 0) {
-		gf_log (this->name, GF_LOG_ERROR,
-			"%s: failed to set subvolume ptr in inode ctx",
-			loc->path);
-	}
-
-	STACK_WIND (frame,
-		    map_newentry_cbk,
-		    subvol,
-		    subvol->fops->mknod,
-		    loc, mode, rdev);
-	return 0;
- err:
-	STACK_UNWIND (frame, -1, op_errno, NULL, NULL);
-
-	return 0;
-}
-
-int32_t
-map_mkdir (call_frame_t *frame,
-	   xlator_t *this,
-	   loc_t *loc,
-	   mode_t mode)
-{
-	int32_t op_errno = 1;
-	xlator_t *subvol   = NULL;
-
-        VALIDATE_OR_GOTO (frame, err);
-        VALIDATE_OR_GOTO (this, err);
-        VALIDATE_OR_GOTO (loc, err);
-        VALIDATE_OR_GOTO (loc->path, err);
-        VALIDATE_OR_GOTO (loc->inode, err);
-
-	subvol = get_mapping_subvol_from_path (this, loc->path);
-	if (!subvol) {
-		op_errno = EINVAL;
-		goto err;
-	}
-
-	op_errno = inode_ctx_put (loc->inode, this, (uint64_t)(long)subvol);
-	if (op_errno != 0) {
-		gf_log (this->name, GF_LOG_ERROR,
-			"%s: failed to set subvolume ptr in inode ctx",
-			loc->path);
-	}
-
-	STACK_WIND (frame,
-		    map_newentry_cbk,
-		    subvol,
-		    subvol->fops->mkdir,
-		    loc, mode);
-	return 0;
- err:
-	STACK_UNWIND (frame, -1, op_errno, NULL, NULL);
-
-	return 0;
-}
-
-
-int32_t
-map_symlink (call_frame_t *frame,
-	     xlator_t *this,
-	     const char *linkpath,
-	     loc_t *loc)
-{
-	int32_t op_errno = 1;
-	xlator_t *subvol   = NULL;
-
-        VALIDATE_OR_GOTO (frame, err);
-        VALIDATE_OR_GOTO (this, err);
-        VALIDATE_OR_GOTO (loc, err);
-        VALIDATE_OR_GOTO (loc->path, err);
-        VALIDATE_OR_GOTO (loc->inode, err);
-
-	subvol = get_mapping_subvol_from_path (this, loc->path);
-	if (!subvol) {
-		op_errno = EINVAL;
-		goto err;
-	}
-	
-	op_errno = inode_ctx_put (loc->inode, this, (uint64_t)(long)subvol);
-	if (op_errno != 0) {
-		gf_log (this->name, GF_LOG_ERROR,
-			"%s: failed to set subvolume ptr in inode ctx",
-			loc->path);
-	}
-
-	STACK_WIND (frame,
-		    map_newentry_cbk,
-		    subvol,
-		    subvol->fops->symlink,
-		    linkpath, loc);
-	return 0;
- err:
-	STACK_UNWIND (frame, -1, op_errno, NULL, NULL);
-
-	return 0;
-}
-
-
-static int32_t
-map_create_cbk (call_frame_t *frame,
-		void *cookie,
-		xlator_t *this,
-		int32_t op_ret,
-		int32_t op_errno,
-		fd_t *fd,
-		inode_t *inode,
-		struct stat *buf)
-{
-        call_frame_t *prev = NULL;
-        prev  = cookie;
-	
-	map_itransform (this, prev->this, buf->st_ino, &buf->st_ino);
-	STACK_UNWIND (frame, op_ret, op_errno, fd, inode, buf);
-	return 0;
-}
-
-int32_t
-map_create (call_frame_t *frame,
-	    xlator_t *this,
-	    loc_t *loc,
-	    int32_t flags,
-	    mode_t mode, fd_t *fd)
-{
-	int32_t op_errno = 1;
-	xlator_t *subvol   = NULL;
-
-        VALIDATE_OR_GOTO (frame, err);
-        VALIDATE_OR_GOTO (this, err);
-        VALIDATE_OR_GOTO (loc, err);
-        VALIDATE_OR_GOTO (loc->path, err);
-        VALIDATE_OR_GOTO (loc->inode, err);
-
-	subvol = get_mapping_subvol_from_path (this, loc->path);
-	if (!subvol) {
-		op_errno = EINVAL;
-		goto err;
-	}
-
-	op_errno = inode_ctx_put (loc->inode, this, (uint64_t)(long)subvol);
-	if (op_errno != 0) {
-		gf_log (this->name, GF_LOG_ERROR,
-			"%s: failed to set subvolume ptr in inode ctx",
-			loc->path);
-	}
-
-	STACK_WIND (frame, map_create_cbk,
-		    subvol,
-		    subvol->fops->create,
-		    loc, flags, mode, fd);
-	return 0;
- err:
-	STACK_UNWIND (frame, -1, op_errno, NULL, NULL);
-
-	return 0;
-}
-
-int32_t
-map_single_lookup_cbk (call_frame_t *frame,
-		       void *cookie,
-		       xlator_t *this,
-		       int32_t op_ret,
-		       int32_t op_errno,
-		       inode_t *inode,
-		       struct stat *buf,
-		       dict_t *dict)
-{
-        call_frame_t *prev = NULL;
-        prev  = cookie;
-	
-	map_itransform (this, prev->this, buf->st_ino, &buf->st_ino);
-
-	STACK_UNWIND (frame, op_ret, op_errno, inode, buf, dict);
-
-	return 0;
-}
-
-int32_t
-map_lookup_cbk (call_frame_t *frame,
-		void *cookie,
-		xlator_t *this,
-		int32_t op_ret,
-		int32_t op_errno,
-		inode_t *inode,
-		struct stat *buf,
-		dict_t *dict)
-{
-	int callcnt = 0;
-	map_local_t *local = NULL;
-	inode_t *tmp_inode = NULL;
-	dict_t *tmp_dict = NULL;
-
-	local = frame->local;
-	LOCK (&frame->lock);
-	{
-		callcnt = --local->call_count;
-		if ((op_ret == 0) && (local->op_ret == -1)) {
-			local->op_ret = 0;
-			local->stbuf = *buf;
-			if (dict)
-				local->dict = dict_ref (dict);
-			local->inode = inode_ref (inode);
-		}
-		if (op_ret == -1)
-			local->op_errno = op_errno;
-		
-	}
-	UNLOCK (&frame->lock);
-
-	if (!callcnt) {
-		tmp_dict = local->dict;
-		tmp_inode = local->inode;
-
-		STACK_UNWIND (frame, local->op_ret, 
-			      local->op_errno, local->inode, 
-			      &local->stbuf, local->dict);
-
-		inode_unref (local->inode);
-		if (tmp_dict)
-			dict_unref (tmp_dict);
-	}
-
-	return 0;
-}
-
-int32_t
-map_lookup (call_frame_t *frame,
-	    xlator_t *this,
-	    loc_t *loc,
-	    dict_t *xattr_req)
-{
-	int32_t op_errno = EINVAL;
-	xlator_t *subvol   = NULL;
-	map_local_t *local = NULL;
-	map_private_t *priv = NULL;
-	xlator_list_t *trav = NULL;
-
-        VALIDATE_OR_GOTO (frame, err);
-        VALIDATE_OR_GOTO (this, err);
-        VALIDATE_OR_GOTO (loc, err);
-        VALIDATE_OR_GOTO (loc->path, err);
-        VALIDATE_OR_GOTO (loc->inode, err);
-
-	priv = this->private;
-
-	if (loc->inode->ino == 1)
-		goto root_inode;
-
-	subvol = get_mapping_subvol_from_ctx (this, loc->inode);
-	if (!subvol) {
-		subvol = get_mapping_subvol_from_path (this, loc->path);
-		if (!subvol) {
-			goto err;
-		}
-
-		op_errno = inode_ctx_put (loc->inode, this, 
-					  (uint64_t)(long)subvol);
-		if (op_errno != 0) {
-			gf_log (this->name, GF_LOG_ERROR,
-				"%s: failed to set subvolume in inode ctx",
-				loc->path);
-		}
-	}
-
-	/* Just one callback */
-	STACK_WIND (frame,
-		    map_single_lookup_cbk,
-		    subvol,
-		    subvol->fops->lookup,
-		    loc,
-		    xattr_req);
-
-	return 0;
-
- root_inode:
-	local = CALLOC (1, sizeof (map_local_t));
-
-	frame->local = local;
-	local->call_count = priv->child_count;
-	local->op_ret = -1;
-
-	trav = this->children;
-	while (trav) {
-		STACK_WIND (frame,
-			    map_lookup_cbk,
-			    trav->xlator,
-			    trav->xlator->fops->lookup,
-			    loc,
-			    xattr_req);
-		trav = trav->next;
-	}
-
-	return 0;
-
- err:
-	STACK_UNWIND (frame, -1, op_errno, NULL, NULL);
-
-	return 0;
-}
-/*
- * unify_normalize_stats -
- */
-void
-map_normalize_stats (struct statvfs *buf,
-		     unsigned long bsize,
-		     unsigned long frsize)
-{
-	double factor;
-
-	if (buf->f_bsize != bsize) {
-		factor = ((double) buf->f_bsize) / bsize;
-		buf->f_bsize  = bsize;
-		buf->f_bfree  = (fsblkcnt_t) (factor * buf->f_bfree);
-		buf->f_bavail = (fsblkcnt_t) (factor * buf->f_bavail);
-	}
-  
-	if (buf->f_frsize != frsize) {
-		factor = ((double) buf->f_frsize) / frsize;
-		buf->f_frsize = frsize;
-		buf->f_blocks = (fsblkcnt_t) (factor * buf->f_blocks);
-	}
-}
-
-
-int32_t
-map_statfs_cbk (call_frame_t *frame,
-		void *cookie,
-		xlator_t *this,
-		int32_t op_ret,
-		int32_t op_errno,
-		struct statvfs *stbuf)
-{
-	struct statvfs *dict_buf = NULL;
-	map_local_t  *local = NULL;
-	int           this_call_cnt = 0;
-	unsigned long bsize;
-	unsigned long frsize;
-
-	local = frame->local;
-
-	LOCK (&frame->lock);
-	{
-		this_call_cnt = --local->call_count;
-
-		if (op_ret == -1) {
-			local->op_errno = op_errno;
-			goto unlock;
-		}
-		local->op_ret = 0;
-		
-		/* when a call is successfull, add it to local->dict */
-		dict_buf = &local->statvfs;
-		
-		if (dict_buf->f_bsize != 0) {
-			bsize  = max (dict_buf->f_bsize, 
-				      stbuf->f_bsize);
-			
-			frsize = max (dict_buf->f_frsize, 
-				      stbuf->f_frsize);
-			map_normalize_stats(dict_buf, bsize, frsize);
-			map_normalize_stats(stbuf, bsize, frsize);
-		} else {
-			dict_buf->f_bsize   = stbuf->f_bsize;
-			dict_buf->f_frsize  = stbuf->f_frsize;
-		}
-		
-		dict_buf->f_blocks += stbuf->f_blocks;
-		dict_buf->f_bfree  += stbuf->f_bfree;
-		dict_buf->f_bavail += stbuf->f_bavail;
-		dict_buf->f_files  += stbuf->f_files;
-		dict_buf->f_ffree  += stbuf->f_ffree;
-		dict_buf->f_favail += stbuf->f_favail;
-		dict_buf->f_fsid    = stbuf->f_fsid;
-		dict_buf->f_flag    = stbuf->f_flag;
-		dict_buf->f_namemax = stbuf->f_namemax;
-	}
-unlock:
-	UNLOCK (&frame->lock);
-
-	if (!this_call_cnt) {
-		STACK_UNWIND (frame, local->op_ret, local->op_errno,
-			      &local->statvfs);
-	}
-
-	return 0;
-}
-
-int32_t
-map_statfs (call_frame_t *frame,
-	    xlator_t *this,
-	    loc_t *loc)
-{
-	int32_t op_errno = EINVAL;
-	xlator_t *subvol   = NULL;
-	map_local_t *local = NULL;
-	map_private_t *priv = NULL;
-	xlator_list_t *trav = NULL;
-
-        VALIDATE_OR_GOTO (frame, err);
-        VALIDATE_OR_GOTO (this, err);
-        VALIDATE_OR_GOTO (loc, err);
-        VALIDATE_OR_GOTO (loc->path, err);
-        VALIDATE_OR_GOTO (loc->inode, err);
-
-	if (loc->inode->ino == 1)
-		goto root_inode;
-	subvol = get_mapping_subvol_from_ctx (this, loc->inode);
-	if (!subvol) {
-		goto err;
-	}
-	
-	/* Just one callback */
-	STACK_WIND (frame,
-		    default_statfs_cbk,
-		    subvol,
-		    subvol->fops->statfs,
-		    loc);
-
-	return 0;
-
- root_inode:
-	local = CALLOC (1, sizeof (map_local_t));
-
-	priv = this->private;
-	frame->local = local;
-	local->call_count = priv->child_count;
-	local->op_ret = -1;
-
-	trav = this->children;
-	while (trav) {
-		STACK_WIND (frame,
-			    map_statfs_cbk,
-			    trav->xlator,
-			    trav->xlator->fops->statfs,
-			    loc);
-		trav = trav->next;
-	}
-
-	return 0;
- err:
-	STACK_UNWIND (frame, -1, op_errno, NULL);
-
-	return 0;
-}
-
-int32_t
-map_opendir_cbk (call_frame_t *frame,
-		 void *cookie,
-		 xlator_t *this,
-		 int32_t op_ret,
-		 int32_t op_errno,
-		 fd_t *fd)
-{
-	int callcnt = 0;
-	map_local_t *local = NULL;
-	fd_t *local_fd = NULL;
-
-	local = frame->local;
-	LOCK (&frame->lock);
-	{
-		callcnt = --local->call_count;
-
-		if (op_ret == -1) {
-			local->op_errno = op_errno;
-			goto unlock;
-		}
-
-		local->op_ret = 0;
-	}
- unlock:
-	UNLOCK (&frame->lock);
-
-	if (!callcnt) {
-		local_fd = local->fd;
-		local->fd = NULL;
-
-		STACK_UNWIND (frame, local->op_ret, 
-			      local->op_errno, local_fd);
-
-		fd_unref (local_fd);
-	}
-	return 0;
-}
-
-
-int32_t
-map_opendir (call_frame_t *frame,
-	     xlator_t *this,
-	     loc_t *loc, fd_t *fd)
-{
-	int32_t op_errno = EINVAL;
-	xlator_t *subvol   = NULL;
-	map_local_t *local = NULL;
-	map_private_t *priv = NULL;
-	xlator_list_t *trav = NULL;
-
-        VALIDATE_OR_GOTO (frame, err);
-        VALIDATE_OR_GOTO (this, err);
-        VALIDATE_OR_GOTO (fd, err);
-        VALIDATE_OR_GOTO (fd->inode, err);
-
-	if (loc->inode->ino == 1) 
-		goto root_inode;
-	
-	subvol = get_mapping_subvol_from_ctx (this, fd->inode);
-	if (!subvol) {
-		goto err;
-	}
-	
-	/* Just one callback */
-	STACK_WIND (frame,
-		    default_opendir_cbk,
-		    subvol,
-		    subvol->fops->opendir,
-		    loc, fd);
-	return 0;
-
- root_inode:
-	local = CALLOC (1, sizeof (map_local_t));
-
-	priv = this->private;
-	frame->local = local;
-	local->call_count = priv->child_count;
-	local->op_ret = -1;
-	local->fd = fd_ref (fd);
-
-	trav = this->children;
-	while (trav) {
-		STACK_WIND (frame,
-			    map_opendir_cbk,
-			    trav->xlator,
-			    trav->xlator->fops->opendir,
-			    loc, fd);
-		trav = trav->next;
-	}
-	
-	return 0;
- err:
-	STACK_UNWIND (frame, -1, op_errno, NULL);
-
-	return 0;
-}
-
-
-int32_t
-map_single_readdir_cbk (call_frame_t *frame,
-			void *cookie,
-			xlator_t *this,
-			int32_t op_ret,
-			int32_t op_errno,
-			gf_dirent_t *entries)
-{
-        call_frame_t *prev = NULL;
-	gf_dirent_t  *orig_entry = NULL;
-
-        prev  = cookie;
-
-	list_for_each_entry (orig_entry, &entries->list, list) {
-		map_itransform (this, prev->this, orig_entry->d_ino, 
-				&orig_entry->d_ino);
-	}
-	STACK_UNWIND (frame, op_ret, op_errno, entries);
-	
-	return 0;
-}
-
-
-int
-map_readdir_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
-		 int op_ret, int op_errno, gf_dirent_t *orig_entries)
-{
-	map_local_t  *local = NULL;
-	gf_dirent_t   entries;
-	gf_dirent_t  *orig_entry = NULL;
-	gf_dirent_t  *entry = NULL;
-	call_frame_t *prev = NULL;
-	xlator_t     *subvol = NULL;
-	xlator_t     *next = NULL;
-	int           count = 0;
-	fd_t         *local_fd = NULL;
-
-	INIT_LIST_HEAD (&entries.list);
-	prev = cookie;
-	local = frame->local;
-
-	if (op_ret < 0)
-		goto done;
-
-	list_for_each_entry (orig_entry, &orig_entries->list, list) {
-		subvol = prev->this;
-
-		entry = gf_dirent_for_name (orig_entry->d_name);
-		if (!entry) {
-			gf_log (this->name, GF_LOG_ERROR,
-				"memory allocation failed :(");
-			goto unwind;
-		}
-		
-		map_itransform (this, subvol, orig_entry->d_ino,
-				&entry->d_ino);
-		map_itransform (this, subvol, orig_entry->d_off,
-				&entry->d_off);
-		
-		entry->d_type = orig_entry->d_type;
-		entry->d_len  = orig_entry->d_len;
-		
-		list_add_tail (&entry->list, &entries.list);
-		count++;
-	}
-
-	op_ret = count;
-
-done:
-	if (count == 0) {
-		next = map_subvol_next (this, prev->this);
-		if (!next) {
-			goto unwind;
-		}
-
-		STACK_WIND (frame, map_readdir_cbk,
-			    next, next->fops->readdir,
-			    local->fd, local->size, 0);
-		return 0;
-	}
-
-unwind:
-	if (op_ret < 0)
-		op_ret = 0;
-
-	local_fd = local->fd;
-	local->fd = NULL;
-
-	STACK_UNWIND (frame, op_ret, op_errno, &entries);
-
-	fd_unref (local_fd);
-
-	gf_dirent_free (&entries);
-
-        return 0;
-}
-
-
-int32_t
-map_readdir (call_frame_t *frame,
-	     xlator_t *this,
-	     fd_t *fd,
-	     size_t size,
-	     off_t yoff)
-{
-	int32_t        op_errno = EINVAL;
-	xlator_t      *subvol = NULL;
-	map_local_t   *local = NULL;
-	map_private_t *priv = NULL;
-	xlator_t      *xvol = NULL;
-	off_t          xoff = 0;
-
-        VALIDATE_OR_GOTO (frame, err);
-        VALIDATE_OR_GOTO (this, err);
-        VALIDATE_OR_GOTO (fd, err);
-        VALIDATE_OR_GOTO (fd->inode, err);
-
-	if (fd->inode->ino == 1) 
-		goto root_inode;
-
-	subvol = get_mapping_subvol_from_ctx (this, fd->inode);
-	if (!subvol) {
-		goto err;
-	}
-	
-	/* Just one callback */
-	
-	STACK_WIND (frame,
-		    map_single_readdir_cbk,
-		    subvol,
-		    subvol->fops->readdir,
-		    fd, size, yoff);
-	return 0;
-
- root_inode:
-	/* readdir on '/' */
-	local = CALLOC (1, sizeof (map_local_t));
-	if (!local) {
-		gf_log (this->name, GF_LOG_ERROR,
-			"memory allocation failed :(");
-		op_errno = ENOMEM;
-		goto err;
-	}
-	
-	priv = this->private;
-	frame->local = local;
-	local->op_errno = ENOENT;
-	local->op_ret = -1;
-
-	local->fd = fd_ref (fd);
-	local->size = size;
-
-	map_deitransform (this, yoff, &xvol, (uint64_t *)&xoff);
-
-	STACK_WIND (frame, map_readdir_cbk,
-		    xvol, xvol->fops->readdir,
-		    fd, size, xoff);
-
-	return 0;
- err:
-	STACK_UNWIND (frame, -1, op_errno, NULL);
-
-	return 0;
-}
-
-
-#if 0 
-/* TODO : do it later as currently only unify uses this mop and mostly 
-   unify will be used below map  */
-int32_t
-map_stats_cbk (call_frame_t *frame,
-	       void *cookie,
-	       xlator_t *this,
-	       int32_t op_ret,
-	       int32_t op_errno,
-	       struct xlator_stats *stats)
-{
-	STACK_UNWIND (frame, op_ret, op_errno, stats);
-	return 0;
-}
-
-
-int32_t
-map_stats (call_frame_t *frame,
-	   xlator_t *this,
-	   int32_t flags)
-{
-	STACK_WIND (frame,
-		    map_stats_cbk,
-		    subvol,
-		    subvol->mops->stats,
-		    flags);
-	return 0;
- err:
-	STACK_UNWIND (frame, -1, op_errno, NULL, NULL);
-
-	return 0;
-}
-#endif /* if 0 */
-
-
-/* TODO: define the behavior of notify */
-
-
-void
-fini (xlator_t *this)
-{
-	map_private_t *priv = NULL;
-	struct map_pattern *trav_map = NULL;
-	struct map_pattern *tmp_map  = NULL;
-
-	priv = this->private;
-
-	if (priv) {
-		if (priv->xlarray)
-			FREE (priv->xlarray);
-
-		trav_map = priv->map;
-		while (trav_map) {
-			tmp_map = trav_map;
-			trav_map = trav_map->next;
-			FREE (tmp_map);
-		}
-
-		FREE(priv);
-	}
-
-	return;
-}
-
-int
-init (xlator_t *this)
-{
-	map_private_t *priv = NULL;
-	xlator_list_t *trav = NULL;
-	int   count = 0;
-	int   ret = -1;
-	char *pattern_string = NULL;
-	char *map_pair_str = NULL;
-	char *tmp_str = NULL;
-	char *tmp_str1 = NULL;
-	char *dup_map_pair = NULL;
-	char *dir_str = NULL;
-	char *subvol_str = NULL;
-	char *default_xl = NULL;
-
-	if (!this->children) {
-		gf_log (this->name,GF_LOG_ERROR,
-			"FATAL: map should have one or more child defined");
-		return -1;
-	}
-
-	if (!this->parents) {
-		gf_log (this->name, GF_LOG_WARNING,
-			"dangling volume. check volfile ");
-	}
-
-	priv = CALLOC (1, sizeof (map_private_t));
-	this->private = priv;
-
-	/* allocate xlator array */
-	trav = this->children;
-	while (trav) {
-		count++;
-		trav = trav->next;
-	}
-	priv->xlarray = CALLOC (1, sizeof (struct map_xlator_array) * count);
-	priv->child_count = count;
-
-	/* build xlator array */
-	count = 0;
-	trav = this->children;
-	while (trav) {
-		priv->xlarray[count++].xl = trav->xlator;
-		trav = trav->next;
-	}
-
-	/* map dir1:brick1;dir2:brick2;dir3:brick3;*:brick4 */
-	ret = dict_get_str (this->options, "map-directory", &pattern_string);
-	if (ret != 0) {
-		gf_log (this->name, GF_LOG_ERROR, 
-			"map.pattern not given, can't continue");
-		goto err;
-	}
-	map_pair_str = strtok_r (pattern_string, ";", &tmp_str);
-	while (map_pair_str) {
-		dup_map_pair = strdup (map_pair_str);
-		dir_str = strtok_r (dup_map_pair, ":", &tmp_str1);
-		if (!dir_str) {
-			gf_log (this->name, GF_LOG_ERROR, 
-				"directory string invalid");
-			goto err;
-		}
-		subvol_str = strtok_r (NULL, ":", &tmp_str1);
-		if (!subvol_str) {
-			gf_log (this->name, GF_LOG_ERROR, 
-				"mapping subvolume string invalid");
-			goto err;
-		}
-		ret = verify_dir_and_assign_subvol (this, 
-						    dir_str, 
-						    subvol_str);
-		if (ret != 0) {
-			gf_log (this->name, GF_LOG_ERROR, 
-				"verification failed");
-			goto err;
-		}
-		
-		FREE (dup_map_pair);
-
-		map_pair_str = strtok_r (NULL, ";", &tmp_str);
-	}
-
-	/* default-volume brick4 */
-	ret = dict_get_str (this->options, "default-volume", &default_xl);
-	if (ret == 0) {
-		ret = assign_default_subvol (this, default_xl);
-		if (ret != 0) {
-			gf_log (this->name, GF_LOG_ERROR, 
-				"assigning default failed");
-			goto err;
-		}
-	}
-
-	verify_if_all_subvolumes_got_used (this);
-	
-	return 0;
- err:
-	fini (this);
-	return -1;
-}
-
-
-struct xlator_fops fops = {
-	.lookup      = map_lookup,
-	.mknod       = map_mknod,
-	.create      = map_create,
-
-	.stat        = map_stat,
-	.chmod       = map_chmod,
-	.chown       = map_chown,
-	.fchown      = map_fchown,
-	.fchmod      = map_fchmod,
-	.fstat       = map_fstat,
-	.utimens     = map_utimens,
-	.truncate    = map_truncate,
-	.ftruncate   = map_ftruncate,
-	.access      = map_access,
-	.readlink    = map_readlink,
-	.setxattr    = map_setxattr,
-	.getxattr    = map_getxattr,
-	.removexattr = map_removexattr,
-	.open        = map_open,
-	.readv       = map_readv,
-	.writev      = map_writev,
-	.flush       = map_flush,
-	.fsync       = map_fsync,
-	.statfs      = map_statfs,
-	.lk          = map_lk,
-	.opendir     = map_opendir,
-	.readdir     = map_readdir,
-	.fsyncdir    = map_fsyncdir,
-	.symlink     = map_symlink,
-	.unlink      = map_unlink,
-	.link        = map_link,
-	.mkdir       = map_mkdir,
-	.rmdir       = map_rmdir,
-	.rename      = map_rename,
-	.inodelk     = map_inodelk,
-	.finodelk    = map_finodelk,
-	.entrylk     = map_entrylk,
-	.fentrylk    = map_fentrylk,
-	.xattrop     = map_xattrop,
-	.fxattrop    = map_fxattrop,
-	.setdents    = map_setdents,
-	.getdents    = map_getdents,
-	.checksum    = map_checksum,
-};
-
-struct xlator_mops mops = {
-};
-
-struct xlator_cbks cbks = {
-};
-
-struct volume_options options[] = {
-	{ .key   = {"map-directory"},  
-	  .type  = GF_OPTION_TYPE_ANY 
-	},
-	{ .key   = {"default-volume"},  
-	  .type  = GF_OPTION_TYPE_XLATOR 
-	},
-	
-	{ .key = {NULL} }
-};
diff --git a/xlators/cluster/map/src/map.h b/xlators/cluster/map/src/map.h
deleted file mode 100644
index 72b4f56400f..00000000000
--- a/xlators/cluster/map/src/map.h
+++ /dev/null
@@ -1,76 +0,0 @@
-/*
-  Copyright (c) 2008-2009 Z RESEARCH, Inc. <http://www.zresearch.com>
-  This file is part of GlusterFS.
-
-  GlusterFS is free software; you can redistribute it and/or modify
-  it under the terms of the GNU General Public License as published
-  by the Free Software Foundation; either version 3 of the License,
-  or (at your option) any later version.
-
-  GlusterFS is distributed in the hope that it will be useful, but
-  WITHOUT ANY WARRANTY; without even the implied warranty of
-  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
-  General Public License for more details.
-
-  You should have received a copy of the GNU General Public License
-  along with this program.  If not, see
-  <http://www.gnu.org/licenses/>.
-*/
-
-#ifndef __MAP_H__
-#define __MAP_H__
-
-#include "xlator.h"
-
-struct map_pattern {
-	struct map_pattern *next;
-	xlator_t           *xl;
-	char               *directory;
-	int                 dir_len;
-};
-
-struct map_xlator_array {
-	xlator_t *xl;
-	int       mapped; /* yes/no */
-};
-
-typedef struct {
-	struct map_pattern      *map;
-	xlator_t                *default_xl;
-	struct map_xlator_array *xlarray;
-	int                      child_count;
-} map_private_t;
-
-typedef struct {
-	int32_t        op_ret;
-	int32_t        op_errno;
-	int            call_count;
-	struct statvfs statvfs;
-	struct stat    stbuf;
-	inode_t       *inode;
-	dict_t        *dict;
-	fd_t          *fd;
-
-	size_t        size;
-} map_local_t;
-
-xlator_t *map_subvol_next (xlator_t *this, xlator_t *prev);
-int map_subvol_cnt (xlator_t *this, xlator_t *subvol);
-
-int map_itransform (xlator_t *this, xlator_t *subvol, 
-		    uint64_t x, uint64_t *y_p);
-int map_deitransform (xlator_t *this, uint64_t y, 
-		      xlator_t **subvol_p, uint64_t *x_p);
-
-
-xlator_t *get_mapping_subvol_from_path (xlator_t *this, const char *path);
-xlator_t *get_mapping_subvol_from_ctx (xlator_t *this, inode_t *inode);
-
-int check_multiple_volume_entry (xlator_t *this, xlator_t *subvol);
-int verify_dir_and_assign_subvol (xlator_t *this, 
-				  const char *directory, const char *subvol);
-int assign_default_subvol (xlator_t *this, const char *default_xl);
-void verify_if_all_subvolumes_got_used (xlator_t *this);
-
-
-#endif /* __MAP_H__ */
diff --git a/xlators/cluster/stripe/src/Makefile.am b/xlators/cluster/stripe/src/Makefile.am
deleted file mode 100644
index 60e0a156876..00000000000
--- a/xlators/cluster/stripe/src/Makefile.am
+++ /dev/null
@@ -1,14 +0,0 @@
-
-xlator_LTLIBRARIES = stripe.la
-xlatordir = $(libdir)/glusterfs/$(PACKAGE_VERSION)/xlator/cluster
-
-stripe_la_LDFLAGS = -module -avoidversion
-
-stripe_la_SOURCES = stripe.c
-stripe_la_LIBADD = $(top_builddir)/libglusterfs/src/libglusterfs.la
-
-AM_CFLAGS = -fPIC -D_FILE_OFFSET_BITS=64 -D_GNU_SOURCE -Wall -D$(GF_HOST_OS)\
-	-I$(top_srcdir)/libglusterfs/src -shared -nostartfiles $(GF_CFLAGS)
-
-CLEANFILES = 
-
diff --git a/xlators/cluster/stripe/src/stripe.c b/xlators/cluster/stripe/src/stripe.c
deleted file mode 100644
index 0b598e69150..00000000000
--- a/xlators/cluster/stripe/src/stripe.c
+++ /dev/null
@@ -1,3286 +0,0 @@
-/*
-  Copyright (c) 2007-2009 Z RESEARCH, Inc. <http://www.zresearch.com>
-  This file is part of GlusterFS.
-
-  GlusterFS is free software; you can redistribute it and/or modify
-  it under the terms of the GNU General Public License as published
-  by the Free Software Foundation; either version 3 of the License,
-  or (at your option) any later version.
-
-  GlusterFS is distributed in the hope that it will be useful, but
-  WITHOUT ANY WARRANTY; without even the implied warranty of
-  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
-  General Public License for more details.
-
-  You should have received a copy of the GNU General Public License
-  along with this program.  If not, see
-  <http://www.gnu.org/licenses/>.
-*/
-
-/**
- * xlators/cluster/stripe:
- *    Stripe translator, stripes the data accross its child nodes, 
- *    as per the options given in the volfile. The striping works 
- *    fairly simple. It writes files at different offset as per 
- *    calculation. So, 'ls -l' output at the real posix level will 
- *    show file size bigger than the actual size. But when one does 
- *    'df' or 'du <file>', real size of the file on the server is shown.
- *
- * WARNING:
- *  Stripe translator can't regenerate data if a child node gets disconnected.
- *  So, no 'self-heal' for stripe. Hence the advice, use stripe only when its 
- *  very much necessary, or else, use it in combination with AFR, to have a 
- *  backup copy. 
- */
-
-#ifndef _CONFIG_H
-#define _CONFIG_H
-#include "config.h"
-#endif
-
-#include "xlator.h"
-#include "logging.h"
-#include "defaults.h"
-#include "compat.h"
-#include "compat-errno.h"
-#include <fnmatch.h>
-#include <signal.h>
-
-#define STRIPE_CHECK_INODE_CTX_AND_UNWIND_ON_ERR(_loc) do {        \
-        if (!(_loc && _loc->inode)) {                              \
-               STACK_UNWIND (frame, -1, EINVAL, NULL, NULL, NULL); \
-               return 0;                                           \
-        }                                                          \
-} while(0)
-
-/**
- * struct stripe_options : This keeps the pattern and the block-size 
- *     information, which is used for striping on a file.
- */
-struct stripe_options {
-	struct stripe_options *next;
-	char path_pattern[256];
-	uint64_t block_size;
-};
-
-/**
- * Private structure for stripe translator 
- */
-struct stripe_private {
-	struct stripe_options *pattern;
-	xlator_t **xl_array;
-	uint64_t   block_size;
-	gf_lock_t  lock;
-	uint8_t    nodes_down;
-	int8_t     first_child_down;
-	int8_t     child_count;
-	int8_t     state[256];       /* Current state of the child node, 
-					0 for down, 1 for up */
-	gf_boolean_t  xattr_supported;  /* 0 for no, 1 for yes, default yes */
-};
-
-/**
- * Used to keep info about the replies received from fops->readv calls 
- */
-struct readv_replies {
-	struct iovec *vector;
-	int32_t       count;    //count of vector
-	int32_t       op_ret;   //op_ret of readv
-	int32_t       op_errno;
-	struct stat   stbuf;    /* 'stbuf' is also a part of reply */
-};
-
-/**
- * Local structure to be passed with all the frames in case of STACK_WIND
- */
-struct stripe_local; /* this itself is used inside the structure; */
-
-struct stripe_local {
-	struct stripe_local *next;
-	call_frame_t        *orig_frame; 
-
-	/* Used by _cbk functions */
- 	struct stat          stbuf;
-	struct readv_replies *replies;
-	struct statvfs       statvfs_buf;
-	dir_entry_t         *entry;
-	struct xlator_stats  stats;
-
-	int8_t               revalidate;
-	int8_t               failed;
-	int8_t               unwind;
-
-	int32_t              node_index;
-	int32_t              call_count;
-	int32_t              wind_count; /* used instead of child_cound 
-					    in case of read and write */
-	int32_t              op_ret;
-	int32_t              op_errno; 
-	int32_t              count;
-	int32_t              flags;
-	char                *name;
-	inode_t             *inode;
-
-	loc_t                loc;
-	loc_t                loc2;
-
-	/* For File I/O fops */
-	dict_t              *dict;
-
-	/* General usage */
-	off_t                offset;
-	off_t                stripe_size;
-
-	int8_t              *list;
-	struct flock         lock;
-	fd_t                *fd;
-	void                *value;
-};
-
-typedef struct stripe_local   stripe_local_t;
-typedef struct stripe_private stripe_private_t;
-
-/**
- * stripe_get_matching_bs - Get the matching block size for the given path.
- */
-int32_t 
-stripe_get_matching_bs (const char *path, 
-			struct stripe_options *opts,
-			uint64_t default_bs) 
-{
-	struct stripe_options *trav       = NULL;
-	char                  *pathname   = NULL;
-	uint64_t               block_size = 0;
-
-	block_size = default_bs;
-	pathname   = strdup (path);
-	trav       = opts;
-
-	while (trav) {
-		if (fnmatch (trav->path_pattern, 
-			     pathname, FNM_NOESCAPE) == 0) {
-			block_size = trav->block_size;
-			break;
-		}
-		trav = trav->next;
-	}
-	free (pathname);
-	
-	return block_size;
-}
-
-
-/*
- * stripe_common_cbk -
- */
-int32_t
-stripe_common_cbk (call_frame_t *frame,
-		   void *cookie,
-		   xlator_t *this,
-		   int32_t op_ret,
-		   int32_t op_errno)
-{
-	STACK_UNWIND (frame, op_ret, op_errno);
-	return 0;
-}
-
-/**
- * stripe_stack_unwind_cbk -  This function is used for all the _cbk without 
- *     any extra arguments (other than the minimum given)
- * This is called from functions like fsync,unlink,rmdir etc.
- *
- */
-int32_t 
-stripe_stack_unwind_cbk (call_frame_t *frame,
-			 void *cookie,
-			 xlator_t *this,
-			 int32_t op_ret,
-			 int32_t op_errno)
-{
-	int32_t         callcnt = 0;
-	stripe_local_t *local   = frame->local;
-
-	LOCK (&frame->lock);
-	{
-		callcnt = --local->call_count;
-
-		if (op_ret == -1) {
-			gf_log (this->name, GF_LOG_WARNING, 
-				"%s returned %s",
-				((call_frame_t *)cookie)->this->name, 
-				strerror (op_errno));			
-			local->op_errno = op_errno;
-			if (op_errno == ENOTCONN) 
-				local->failed = 1;
-		}
-		if (op_ret >= 0) 
-			local->op_ret = op_ret;
-	}
-	UNLOCK (&frame->lock);
-
-	if (!callcnt) {
-		if (local->failed)
-			local->op_ret = -1;
-
-		if (local->loc.path)
-			loc_wipe (&local->loc);
-		if (local->loc2.path)
-			loc_wipe (&local->loc2);
-
-		STACK_UNWIND (frame, local->op_ret, local->op_errno);
-	}
-	return 0;
-}
-
-int32_t 
-stripe_common_buf_cbk (call_frame_t *frame,
-		       void *cookie,
-		       xlator_t *this,
-		       int32_t op_ret,
-		       int32_t op_errno,
-		       struct stat *buf)
-{
-	STACK_UNWIND (frame, op_ret, op_errno, buf);
-	return 0;
-}
-
-/**
- * stripe_stack_unwind_buf_cbk -  This function is used for all the _cbk with 
- *    'struct stat *buf' as extra argument (other than minimum)
- * This is called from functions like, chmod, fchmod, chown, fchown,
- * truncate, ftruncate, utimens etc.
- *
- * @cookie - this argument should be always 'xlator_t *' of child node 
- */
-int32_t 
-stripe_stack_unwind_buf_cbk (call_frame_t *frame,
-			     void *cookie,
-			     xlator_t *this,
-			     int32_t op_ret,
-			     int32_t op_errno,
-			     struct stat *buf)
-{
-	int32_t callcnt = 0;
-	stripe_local_t *local = frame->local;
-
-	LOCK (&frame->lock);
-	{
-		callcnt = --local->call_count;
-
-		if (op_ret == -1) {
-			gf_log (this->name, GF_LOG_WARNING, 
-				"%s returned error %s",
-				((call_frame_t *)cookie)->this->name, 
-				strerror (op_errno));
-			local->op_errno = op_errno;
-			if (op_errno == ENOTCONN)
-				local->failed = 1;
-		}
-    
-		if (op_ret == 0) {
-			local->op_ret = 0;
-			if (local->stbuf.st_blksize == 0) {
-				local->stbuf = *buf;
-				/* Because st_blocks gets added again */
-				local->stbuf.st_blocks = 0;
-			}
-
-			if (FIRST_CHILD(this) == 
-			    ((call_frame_t *)cookie)->this) {
-				/* Always, pass the inode number of 
-				   first child to the above layer */
-				local->stbuf.st_ino = buf->st_ino;
-				local->stbuf.st_mtime = buf->st_mtime;
-			}
-
-			local->stbuf.st_blocks += buf->st_blocks;
-			if (local->stbuf.st_size < buf->st_size)
-				local->stbuf.st_size = buf->st_size;
-			if (local->stbuf.st_blksize != buf->st_blksize) {
-				/* TODO: add to blocks in terms of 
-				   original block size */
-			}
-		}
-	}
-	UNLOCK (&frame->lock);
-
-	if (!callcnt) {
-		if (local->failed)
-			local->op_ret = -1;
-
-		if (local->loc.path)
-			loc_wipe (&local->loc);
-		if (local->loc2.path)
-			loc_wipe (&local->loc2);
-
-		STACK_UNWIND (frame, local->op_ret, local->op_errno, 
-			      &local->stbuf);
-	}
-
-	return 0;
-}
-
-/* In case of symlink, mknod, the file is created on just first node */
-int32_t 
-stripe_common_inode_cbk (call_frame_t *frame,
-			 void *cookie,
-			 xlator_t *this,
-			 int32_t op_ret,
-			 int32_t op_errno,
-			 inode_t *inode,
-			 struct stat *buf)
-{
-	STACK_UNWIND (frame, op_ret, op_errno, inode, buf);
-	return 0;
-}
-
-/**
- * stripe_stack_unwind_inode_cbk - This is called by the function like, 
- *                   link (), symlink (), mkdir (), mknod () 
- *           This creates a inode for new inode. It keeps a list of all 
- *           the inodes received from the child nodes. It is used while 
- *           forwarding any fops to child nodes.
- *
- */
-int32_t 
-stripe_stack_unwind_inode_cbk (call_frame_t *frame,
-			       void *cookie,
-			       xlator_t *this,
-			       int32_t op_ret,
-			       int32_t op_errno,
-			       inode_t *inode,
-			       struct stat *buf)
-{
-	int32_t callcnt = 0;
-	stripe_local_t *local = frame->local;
-
-	LOCK (&frame->lock);
-	{
-		callcnt = --local->call_count;
-    
-		if (op_ret == -1) {
-			gf_log (this->name, GF_LOG_WARNING, 
-				"%s returned error %s",
-				((call_frame_t *)cookie)->this->name, 
-				strerror (op_errno));
-			local->op_errno = op_errno;
-			if (op_errno == ENOTCONN)
-				local->failed = 1;
-		}
- 
-		if (op_ret >= 0) {
-			local->op_ret = 0;
-
-			if (local->stbuf.st_blksize == 0) {
-				local->inode = inode;
-				local->stbuf = *buf;
-				/* Because st_blocks gets added again */
-				local->stbuf.st_blocks = 0;
-			}
-			if (FIRST_CHILD(this) == 
-			    ((call_frame_t *)cookie)->this) {
-				local->stbuf.st_ino = buf->st_ino;
-				local->stbuf.st_mtime = buf->st_mtime;
-			}
-
-			local->stbuf.st_blocks += buf->st_blocks;
-			if (local->stbuf.st_size < buf->st_size)
-				local->stbuf.st_size = buf->st_size;
-			if (local->stbuf.st_blksize != buf->st_blksize) {
-				/* TODO: add to blocks in terms of 
-				   original block size */
-			}
-		}
-	}
-	UNLOCK (&frame->lock);
-
-	if (!callcnt) {
-		if (local->failed)
-			local->op_ret = -1;
-
-		STACK_UNWIND (frame, local->op_ret, local->op_errno, 
-			      local->inode, &local->stbuf);
-	}
-
-	return 0;
-}
-
-int32_t 
-stripe_stack_unwind_inode_lookup_cbk (call_frame_t *frame,
-				      void *cookie,
-				      xlator_t *this,
-				      int32_t op_ret,
-				      int32_t op_errno,
-				      inode_t *inode,
-				      struct stat *buf,
-				      dict_t *dict)
-{
-	int32_t callcnt = 0;
-	dict_t *tmp_dict = NULL;
-	stripe_local_t *local = frame->local;
-
-	LOCK (&frame->lock);
-	{
-		callcnt = --local->call_count;
-    
-		if (op_ret == -1) {
-			if (op_errno != ENOENT)
-				gf_log (this->name, GF_LOG_WARNING, 
-					"%s returned error %s",
-					((call_frame_t *)cookie)->this->name,
-					strerror (op_errno));
-			local->op_errno = op_errno;
-			if (op_errno == ENOTCONN)
-				local->failed = 1;
-		}
- 
-		if (op_ret >= 0) {
-			local->op_ret = 0;
-
-			if (local->stbuf.st_blksize == 0) {
-				local->inode = inode;
-				local->stbuf = *buf;
-				/* Because st_blocks gets added again */
-				local->stbuf.st_blocks = 0;
-			}
-			if (FIRST_CHILD(this) == 
-			    ((call_frame_t *)cookie)->this) {
-				local->stbuf.st_ino = buf->st_ino;
-				local->stbuf.st_mtime = buf->st_mtime;
-				if (local->dict)
-					dict_unref (local->dict);
-				local->dict = dict_ref (dict);
-			} else {
-				if (!local->dict)
-					local->dict = dict_ref (dict);
-			}
-			local->stbuf.st_blocks += buf->st_blocks;
-			if (local->stbuf.st_size < buf->st_size)
-				local->stbuf.st_size = buf->st_size;
-			if (local->stbuf.st_blksize != buf->st_blksize) {
-				/* TODO: add to blocks in terms of 
-				   original block size */
-			}
-		}
-	}
-	UNLOCK (&frame->lock);
-
-	if (!callcnt) {
-		if (local->failed)
-			local->op_ret = -1;
-
-		tmp_dict = local->dict;
-		STACK_UNWIND (frame, local->op_ret, local->op_errno, 
-			      local->inode, &local->stbuf, local->dict);
-		if (tmp_dict)
-			dict_unref (tmp_dict);
-	}
-
-	return 0;
-}
-
-
-/**
- * stripe_lookup -
- */
-int32_t 
-stripe_lookup (call_frame_t *frame,
-	       xlator_t *this,
-	       loc_t *loc,
-	       dict_t *xattr_req)
-{
-	stripe_local_t *local = NULL;
-	xlator_list_t *trav = NULL;
-	stripe_private_t *priv = this->private;
-	char send_lookup_to_all = 0;
-
-	if (!(loc && loc->inode)) {
-		gf_log (this->name, GF_LOG_ERROR, 
-			"wrong argument, returning EINVAL");
-		STACK_UNWIND (frame, -1, EINVAL, NULL, NULL, NULL);
-		return 0;
-	}
-
-	/* Initialization */
-	local = CALLOC (1, sizeof (stripe_local_t));
-	ERR_ABORT (local);
-	local->op_ret = -1;
-	frame->local = local;
-
-	if ((!loc->inode->st_mode) || 
-	    S_ISDIR (loc->inode->st_mode) || 
-	    S_ISREG (loc->inode->st_mode))
-		send_lookup_to_all = 1;
-
-	if (send_lookup_to_all) {
-		/* Everytime in stripe lookup, all child nodes 
-		   should be looked up */
-		local->call_count = priv->child_count;
-		trav = this->children;
-		while (trav) {
-			STACK_WIND (frame,
-				    stripe_stack_unwind_inode_lookup_cbk,
-				    trav->xlator,
-				    trav->xlator->fops->lookup,
-				    loc, xattr_req);
-			trav = trav->next;
-		}
-	} else {
-		local->call_count = 1;
-		
-		STACK_WIND (frame,
-			    stripe_stack_unwind_inode_lookup_cbk,
-			    FIRST_CHILD(this),
-			    FIRST_CHILD(this)->fops->lookup,
-			    loc, xattr_req);
-	}
-  
-	return 0;
-}
-
-/**
- * stripe_stat -
- */
-int32_t
-stripe_stat (call_frame_t *frame,
-	     xlator_t *this,
-	     loc_t *loc)
-{ 
-	int send_lookup_to_all = 0;
-	xlator_list_t *trav = NULL;
-	stripe_local_t *local = NULL;
-	stripe_private_t *priv = this->private;
-
-	STRIPE_CHECK_INODE_CTX_AND_UNWIND_ON_ERR (loc);
-
-	if (S_ISDIR (loc->inode->st_mode) || S_ISREG (loc->inode->st_mode))
-		send_lookup_to_all = 1;
-
-	if (!send_lookup_to_all) {
-		STACK_WIND (frame,
-			    stripe_common_buf_cbk,
-			    FIRST_CHILD(this),
-			    FIRST_CHILD(this)->fops->stat,
-			    loc);
-	} else {
-		/* Initialization */
-		local = CALLOC (1, sizeof (stripe_local_t));
-		ERR_ABORT (local);
-		local->op_ret = -1;
-		frame->local = local;
-		local->inode = loc->inode;
-		local->call_count = priv->child_count;
-    
-		trav = this->children;
-		while (trav) {
-			STACK_WIND (frame,
-				    stripe_stack_unwind_buf_cbk,
-				    trav->xlator,
-				    trav->xlator->fops->stat,
-				    loc);
-			trav = trav->next;
-		}
-	}
-	return 0;
-}
-
-
-/**
- * stripe_chmod -
- */
-int32_t
-stripe_chmod (call_frame_t *frame,
-	      xlator_t *this,
-	      loc_t *loc,
-	      mode_t mode)
-{
-	int send_fop_to_all = 0;
-	xlator_list_t *trav = NULL;
-	stripe_local_t *local = NULL;
-	stripe_private_t *priv = this->private;
-
-	STRIPE_CHECK_INODE_CTX_AND_UNWIND_ON_ERR (loc);
-
-	if (priv->first_child_down) {
-		gf_log (this->name, GF_LOG_WARNING, 
-			"First node down, returning ENOTCONN");
-		STACK_UNWIND (frame, -1, ENOTCONN, NULL);
-		return 0;
-	}
-
-	if (S_ISDIR (loc->inode->st_mode) || S_ISREG (loc->inode->st_mode))
-		send_fop_to_all = 1;
-
-	if (!send_fop_to_all) {
-		STACK_WIND (frame,
-			    stripe_common_buf_cbk,
-			    FIRST_CHILD(this),
-			    FIRST_CHILD(this)->fops->chmod,
-			    loc, mode);
-	} else {
-		/* Initialization */
-		local = CALLOC (1, sizeof (stripe_local_t));
-		ERR_ABORT (local);
-		local->op_ret = -1;
-		frame->local = local;
-		local->inode = loc->inode;
-		local->call_count = priv->child_count;
-
-		trav = this->children;
-		while (trav) {
-			STACK_WIND (frame,
-				    stripe_stack_unwind_buf_cbk,
-				    trav->xlator,
-				    trav->xlator->fops->chmod,
-				    loc, mode);
-			trav = trav->next;
-		}
-	}
-	return 0;
-}
-
-
-/**
- * stripe_chown - 
- */
-int32_t
-stripe_chown (call_frame_t *frame,
-	      xlator_t *this,
-	      loc_t *loc,
-	      uid_t uid,
-	      gid_t gid)
-{
-	int send_fop_to_all = 0;
-	xlator_list_t *trav = NULL;
-	stripe_local_t *local = NULL;
-	stripe_private_t *priv = this->private;
-
-	STRIPE_CHECK_INODE_CTX_AND_UNWIND_ON_ERR (loc);
-
-	if (priv->first_child_down) {
-		gf_log (this->name, GF_LOG_WARNING, 
-			"First node down, returning ENOTCONN");
-		STACK_UNWIND (frame, -1, ENOTCONN, NULL);
-		return 0;
-	}
-
-	if (S_ISDIR (loc->inode->st_mode) || S_ISREG (loc->inode->st_mode))
-		send_fop_to_all = 1;
-
-	trav = this->children;
-	if (!send_fop_to_all) {
-		STACK_WIND (frame,
-			    stripe_common_buf_cbk,
-			    trav->xlator,
-			    trav->xlator->fops->chown,
-			    loc, uid, gid);
-	} else {
-		/* Initialization */
-		local = CALLOC (1, sizeof (stripe_local_t));
-		ERR_ABORT (local);
-		local->op_ret = -1;
-		frame->local = local;
-		local->inode = loc->inode;
-		local->call_count = priv->child_count;
-
-		trav = this->children;
-		while (trav) {
-			STACK_WIND (frame,
-				    stripe_stack_unwind_buf_cbk,
-				    trav->xlator,
-				    trav->xlator->fops->chown,
-				    loc, uid, gid);
-			trav = trav->next;
-		}
-	}
-
-	return 0;
-}
-
-
-/**
- * stripe_statfs_cbk - 
- */
-int32_t
-stripe_statfs_cbk (call_frame_t *frame,
-		   void *cookie,
-		   xlator_t *this,
-		   int32_t op_ret,
-		   int32_t op_errno,
-		   struct statvfs *stbuf)
-{
-	stripe_local_t *local = (stripe_local_t *)frame->local;
-	int32_t callcnt;
-	LOCK(&frame->lock);
-	{
-		callcnt = --local->call_count;
-
-		if (op_ret != 0 && op_errno != ENOTCONN) {
-			local->op_errno = op_errno;
-		}
-		if (op_ret == 0) {
-      			struct statvfs *dict_buf = &local->statvfs_buf;
-			dict_buf->f_bsize   = stbuf->f_bsize;
-			dict_buf->f_frsize  = stbuf->f_frsize;
-			dict_buf->f_blocks += stbuf->f_blocks;
-			dict_buf->f_bfree  += stbuf->f_bfree;
-			dict_buf->f_bavail += stbuf->f_bavail;
-			dict_buf->f_files  += stbuf->f_files;
-			dict_buf->f_ffree  += stbuf->f_ffree;
-			dict_buf->f_favail += stbuf->f_favail;
-			dict_buf->f_fsid    = stbuf->f_fsid;
-			dict_buf->f_flag    = stbuf->f_flag;
-			dict_buf->f_namemax = stbuf->f_namemax;
-			local->op_ret = 0;
-		}
-	}
-	UNLOCK (&frame->lock);
-  
-	if (!callcnt) {
-		STACK_UNWIND (frame, local->op_ret, 
-			      local->op_errno, &local->statvfs_buf);
-	}
-  
-	return 0;
-}
-
-
-/**
- * stripe_statfs - 
- */
-int32_t
-stripe_statfs (call_frame_t *frame,
-	       xlator_t *this,
-	       loc_t *loc)
-{
-	stripe_local_t *local = NULL;
-	xlator_list_t *trav = this->children;
-
-	/* Initialization */
-	local = CALLOC (1, sizeof (stripe_local_t));
-	ERR_ABORT (local);
-	local->op_ret = -1;
-	local->op_errno = ENOTCONN;
-	frame->local = local;
-
-	local->call_count = ((stripe_private_t *)this->private)->child_count;
-	while (trav) {
-		STACK_WIND (frame,
-			    stripe_statfs_cbk,
-			    trav->xlator,
-			    trav->xlator->fops->statfs,
-			    loc);
-		trav = trav->next;
-	}
-
-	return 0;
-}
-
-
-/**
- * stripe_truncate - 
- */
-int32_t
-stripe_truncate (call_frame_t *frame,
-		 xlator_t *this,
-		 loc_t *loc,
-		 off_t offset)
-{
-	int send_fop_to_all = 0;
-	stripe_local_t *local = NULL;
-	stripe_private_t *priv = this->private;
-	xlator_list_t *trav = this->children;
-
-	STRIPE_CHECK_INODE_CTX_AND_UNWIND_ON_ERR (loc);
-
-	if (priv->first_child_down) {
-		gf_log (this->name, GF_LOG_WARNING, 
-			"First node down, returning ENOTCONN");
-		STACK_UNWIND (frame, -1, ENOTCONN, NULL);
-		return 0;
-	}
-
-	if (S_ISDIR (loc->inode->st_mode) || S_ISREG (loc->inode->st_mode))
-		send_fop_to_all = 1;
-
-	if (!send_fop_to_all) {
-		STACK_WIND (frame,
-			    stripe_common_buf_cbk,
-			    trav->xlator,
-			    trav->xlator->fops->truncate,
-			    loc,
-			    offset);
-	} else {
-		/* Initialization */
-		local = CALLOC (1, sizeof (stripe_local_t));
-		ERR_ABORT (local);
-		local->op_ret = -1;
-		frame->local = local;
-		local->inode = loc->inode;
-		local->call_count = priv->child_count;
-    
-		while (trav) {
-			STACK_WIND (frame,
-				    stripe_stack_unwind_buf_cbk,
-				    trav->xlator,
-				    trav->xlator->fops->truncate,
-				    loc,
-				    offset);
-			trav = trav->next;
-		}
-	}
-
-	return 0;
-}
-
-
-/**
- * stripe_utimens - 
- */
-int32_t 
-stripe_utimens (call_frame_t *frame,
-		xlator_t *this,
-		loc_t *loc,
-		struct timespec tv[2])
-{
-	int send_fop_to_all = 0;
-	stripe_local_t *local = NULL;
-	stripe_private_t *priv = this->private;
-	xlator_list_t *trav = this->children;
-
-	STRIPE_CHECK_INODE_CTX_AND_UNWIND_ON_ERR (loc);
-
-	if (priv->first_child_down) {
-		gf_log (this->name, GF_LOG_WARNING, 
-			"First node down, returning ENOTCONN");
-		STACK_UNWIND (frame, -1, ENOTCONN, NULL);
-		return 0;
-	}
-
-	if (S_ISDIR (loc->inode->st_mode) || S_ISREG (loc->inode->st_mode))
-		send_fop_to_all = 1;
-
-	if (!send_fop_to_all) {
-		STACK_WIND (frame,
-			    stripe_common_buf_cbk,
-			    trav->xlator,
-			    trav->xlator->fops->utimens,
-			    loc, tv);
-	} else {
-		/* Initialization */
-		local = CALLOC (1, sizeof (stripe_local_t));
-		ERR_ABORT (local);
-		local->op_ret = -1;
-		frame->local = local;
-		local->inode = loc->inode;
-		local->call_count = priv->child_count;
-    
-		while (trav) {
-			STACK_WIND (frame,
-				    stripe_stack_unwind_buf_cbk,
-				    trav->xlator,
-				    trav->xlator->fops->utimens,
-				    loc, tv);
-			trav = trav->next;
-		}
-	}
-	return 0;
-}
-
-
-int32_t 
-stripe_first_rename_cbk (call_frame_t *frame,
-			 void *cookie,
-			 xlator_t *this,
-			 int32_t op_ret,
-			 int32_t op_errno,
-			 struct stat *buf)
-{
-	stripe_local_t *local = frame->local;
-	xlator_list_t *trav = this->children;
-
-	if (op_ret == -1) 
-	{
-		STACK_UNWIND (frame, op_ret, op_errno, buf);
-		return 0;
-	}
-
-	local->op_ret = 0;
-	local->stbuf = *buf;
-	local->call_count--;
-	trav = trav->next; /* Skip first child */
-
-	while (trav) {
-		STACK_WIND (frame,
-			    stripe_stack_unwind_buf_cbk,
-			    trav->xlator,
-			    trav->xlator->fops->rename,
-			    &local->loc, &local->loc2);
-		trav = trav->next;
-	}
-
-	return 0;
-}
-/**
- * stripe_rename - 
- */
-int32_t
-stripe_rename (call_frame_t *frame,
-	       xlator_t *this,
-	       loc_t *oldloc,
-	       loc_t *newloc)
-{
-	stripe_private_t *priv = this->private;
-	stripe_local_t *local = NULL;
-	xlator_list_t *trav = this->children;
-
-	STRIPE_CHECK_INODE_CTX_AND_UNWIND_ON_ERR (oldloc);
-
-	if (priv->first_child_down) {
-		gf_log (this->name, GF_LOG_WARNING, 
-			"First node down, returning ENOTCONN");
-		STACK_UNWIND (frame, -1, EIO, NULL);
-		return 0;
-	}
-
-	/* Initialization */
-	local = CALLOC (1, sizeof (stripe_local_t));
-	ERR_ABORT (local);
-	local->op_ret = -1;
-	local->inode = oldloc->inode;
-	loc_copy (&local->loc, oldloc);
-	loc_copy (&local->loc2, newloc);
-
-	local->call_count = priv->child_count;
-  
-	frame->local = local;
-
-	STACK_WIND (frame,
-		    stripe_first_rename_cbk,
-		    trav->xlator,
-		    trav->xlator->fops->rename,
-		    oldloc, newloc);
-
-	return 0;
-}
-
-
-/**
- * stripe_access - 
- */
-int32_t
-stripe_access (call_frame_t *frame,
-	       xlator_t *this,
-	       loc_t *loc,
-	       int32_t mask)
-{
-	STRIPE_CHECK_INODE_CTX_AND_UNWIND_ON_ERR (loc);
-
-	STACK_WIND (frame,
-		    stripe_common_cbk,
-		    FIRST_CHILD(this),
-		    FIRST_CHILD(this)->fops->access,
-		    loc, mask);
-
-	return 0;
-}
-
-
-/**
- * stripe_readlink_cbk - 
- */
-int32_t 
-stripe_readlink_cbk (call_frame_t *frame,
-		     void *cookie,
-		     xlator_t *this,
-		     int32_t op_ret,
-		     int32_t op_errno,
-		     const char *path)
-{
-	STACK_UNWIND (frame, op_ret, op_errno, path);
-
-	return 0;
-}
-
-
-/**
- * stripe_readlink - 
- */
-int32_t
-stripe_readlink (call_frame_t *frame,
-		 xlator_t *this,
-		 loc_t *loc,
-		 size_t size)
-{
-	stripe_private_t *priv = this->private;
-
-	STRIPE_CHECK_INODE_CTX_AND_UNWIND_ON_ERR (loc);
-
-	if (priv->first_child_down) {
-		gf_log (this->name, GF_LOG_WARNING, 
-			"First node down, returning ENOTCONN");
-		STACK_UNWIND (frame, -1, ENOTCONN, NULL);
-		return 0;
-	}
-
-	STACK_WIND (frame,
-		    stripe_readlink_cbk,
-		    FIRST_CHILD(this),
-		    FIRST_CHILD(this)->fops->readlink,
-		    loc, size);
-
-	return 0;
-}
-
-
-/**
- * stripe_unlink - 
- */
-int32_t
-stripe_unlink (call_frame_t *frame,
-	       xlator_t *this,
-	       loc_t *loc)
-{
-	int send_fop_to_all = 0;
-	stripe_local_t *local = NULL;
-	stripe_private_t *priv = this->private;
-	xlator_list_t *trav = this->children;
-
-	STRIPE_CHECK_INODE_CTX_AND_UNWIND_ON_ERR (loc);
-
-	if (priv->first_child_down) {
-		gf_log (this->name, GF_LOG_WARNING, 
-			"First node down, returning EIO");
-		STACK_UNWIND (frame, -1, EIO);
-		return 0;
-	}
- 
-	if (S_ISDIR (loc->inode->st_mode) || S_ISREG (loc->inode->st_mode))
-		send_fop_to_all = 1;
-
-	if (!send_fop_to_all) {
-		STACK_WIND (frame,
-			    stripe_common_cbk,
-			    trav->xlator,
-			    trav->xlator->fops->unlink,
-			    loc);
-	} else {
-		/* Initialization */
-		local = CALLOC (1, sizeof (stripe_local_t));
-		ERR_ABORT (local);
-		local->op_ret = -1;
-		frame->local = local;
-		local->call_count = priv->child_count;
-    
-		while (trav) {
-			STACK_WIND (frame,
-				    stripe_stack_unwind_cbk,
-				    trav->xlator,
-				    trav->xlator->fops->unlink,
-				    loc);
-			trav = trav->next;
-		}
-	}
-
-	return 0;
-}
-
-
-int32_t 
-stripe_first_rmdir_cbk (call_frame_t *frame,
-			void *cookie,
-			xlator_t *this,
-			int32_t op_ret,
-			int32_t op_errno)
-{
-	xlator_list_t *trav = this->children;
-	stripe_local_t *local = frame->local;
-
-	if (op_ret == -1) 
-	{
-		STACK_UNWIND (frame, op_ret, op_errno);
-		return 0;
-	}
-
-	local->call_count--; /* First child successful */
-	trav = trav->next; /* Skip first child */
-
-	while (trav) {
-		STACK_WIND (frame,
-			    stripe_stack_unwind_cbk,
-			    trav->xlator,
-			    trav->xlator->fops->rmdir,
-			    &local->loc);
-		trav = trav->next;
-	}
-
-	return 0;
-}
-
-/**
- * stripe_rmdir - 
- */
-int32_t
-stripe_rmdir (call_frame_t *frame,
-	      xlator_t *this,
-	      loc_t *loc)
-{
-	stripe_local_t *local = NULL;
-	xlator_list_t *trav = this->children;
-	stripe_private_t *priv = this->private;
-
-	STRIPE_CHECK_INODE_CTX_AND_UNWIND_ON_ERR (loc);
-
-	if (priv->first_child_down) {
-		gf_log (this->name, GF_LOG_WARNING, 
-			"First node down, returning EIO");
-		STACK_UNWIND (frame, -1, EIO);
-		return 0;
-	}
-
-	/* Initialization */
-	local = CALLOC (1, sizeof (stripe_local_t));
-	ERR_ABORT (local);
-	local->op_ret = -1;
-	frame->local = local;
-	local->inode = loc->inode;
-	loc_copy (&local->loc, loc);
-	local->call_count = priv->child_count;
-  
-	STACK_WIND (frame,
-		    stripe_first_rmdir_cbk,
-		    trav->xlator,
-		    trav->xlator->fops->rmdir,
-		    loc);
-
-	return 0;
-}
-
-
-/**
- * stripe_setxattr - 
- */
-int32_t
-stripe_setxattr (call_frame_t *frame,
-		 xlator_t *this,
-		 loc_t *loc,
-		 dict_t *dict,
-		 int32_t flags)
-{
-	stripe_private_t *priv = this->private;
-
-	STRIPE_CHECK_INODE_CTX_AND_UNWIND_ON_ERR (loc);
-
-	if (priv->first_child_down) {
-		gf_log (this->name, GF_LOG_WARNING, 
-			"First node down, returning ENOTCONN");
-		STACK_UNWIND (frame, -1, ENOTCONN);
-		return 0;
-	}
-
-	STACK_WIND (frame,
-		    stripe_common_cbk,
-		    FIRST_CHILD(this),
-		    FIRST_CHILD(this)->fops->setxattr,
-		    loc, dict, flags);
-
-	return 0;
-}
-
-
-int32_t 
-stripe_mknod_ifreg_fail_unlink_cbk (call_frame_t *frame,
-				    void *cookie,
-				    xlator_t *this,
-				    int32_t op_ret,
-				    int32_t op_errno)
-{
-	int32_t callcnt = 0;
-	stripe_local_t *local = frame->local;
-
-	LOCK (&frame->lock);
-	{
-		callcnt = --local->call_count;
-	}
-	UNLOCK (&frame->lock);
-
-	if (!callcnt) {
-		loc_wipe (&local->loc);
-		STACK_UNWIND (frame, local->op_ret, local->op_errno, 
-			      local->inode, &local->stbuf);
-	}
-
-	return 0;
-}
-
-
-/**
- */
-int32_t
-stripe_mknod_ifreg_setxattr_cbk (call_frame_t *frame,
-				 void *cookie,
-				 xlator_t *this,
-				 int32_t op_ret,
-				 int32_t op_errno)
-{
-	int32_t callcnt = 0;
-	stripe_local_t *local = frame->local;
-	stripe_private_t *priv = this->private;
-	xlator_list_t *trav = this->children;
-
-	LOCK (&frame->lock);
-	{
-		callcnt = --local->call_count;
-    
-		if (op_ret == -1) {
-			gf_log (this->name, GF_LOG_WARNING, 
-				"%s returned error %s",
-				((call_frame_t *)cookie)->this->name, 
-				strerror (op_errno));
-			local->op_ret = -1;
-			local->op_errno = op_errno;
-		}
-	}
-	UNLOCK (&frame->lock);
-
-	if (!callcnt) {
-		if (local->op_ret == -1) {
-			local->call_count = priv->child_count;
-			while (trav) {
-				STACK_WIND (frame,
-					    stripe_mknod_ifreg_fail_unlink_cbk,
-					    trav->xlator,
-					    trav->xlator->fops->unlink,
-					    &local->loc);
-				trav = trav->next;
-			}
-			return 0;
-		}
-
-		loc_wipe (&local->loc);
-		STACK_UNWIND (frame, local->op_ret, local->op_errno, 
-			      local->inode, &local->stbuf);
-	}
-	return 0;
-}
-
-/**
- */
-int32_t
-stripe_mknod_ifreg_cbk (call_frame_t *frame,
-			void *cookie,
-			xlator_t *this,
-			int32_t op_ret,
-			int32_t op_errno,
-			inode_t *inode,
-			struct stat *buf)
-{
-	int ret = 0;
-	int32_t callcnt = 0;
-	stripe_local_t *local = frame->local;
-	stripe_private_t *priv = this->private;
-
-	LOCK (&frame->lock);
-	{
-		callcnt = --local->call_count;
-    
-		if (op_ret == -1) {
-			gf_log (this->name, GF_LOG_WARNING, 
-				"%s returned error %s",
-				((call_frame_t *)cookie)->this->name, 
-				strerror (op_errno));
-			local->failed = 1;
-			local->op_errno = op_errno;
-		}
-    
-		if (op_ret >= 0) {
-			local->op_ret = op_ret;
-			/* Get the mapping in inode private */
-			/* Get the stat buf right */
-			if (local->stbuf.st_blksize == 0) {
-				local->stbuf = *buf;
-				/* Because st_blocks gets added again */
-				local->stbuf.st_blocks = 0;
-			}
-
-			/* Always, pass the inode number of first child 
-			   to the above layer */
-			if (FIRST_CHILD(this) == 
-			    ((call_frame_t *)cookie)->this)
-				local->stbuf.st_ino = buf->st_ino;
-      
-			local->stbuf.st_blocks += buf->st_blocks;
-			if (local->stbuf.st_size < buf->st_size)
-				local->stbuf.st_size = buf->st_size;
-			if (local->stbuf.st_blksize != buf->st_blksize) {
-				/* TODO: add to blocks in terms of
-				   original block size */
-			}
-		}
-	}
-	UNLOCK (&frame->lock);
-
-	if (!callcnt) {
-		if (local->failed) 
-			local->op_ret = -1;
-
-		if ((local->op_ret != -1) && priv->xattr_supported) {
-			/* Send a setxattr request to nodes where the
-			   files are created */
-			int32_t index = 0;
-			char size_key[256] = {0,};
-			char index_key[256] = {0,};
-			char count_key[256] = {0,};
-			xlator_list_t *trav = this->children;
-			dict_t *dict = NULL;
-
-			sprintf (size_key, 
-				 "trusted.%s.stripe-size", this->name);
-			sprintf (count_key, 
-				 "trusted.%s.stripe-count", this->name);
-			sprintf (index_key, 
-				 "trusted.%s.stripe-index", this->name);
-
-			local->call_count = priv->child_count;
-
-			while (trav) {
-				dict = get_new_dict ();
-				dict_ref (dict);
-				/* TODO: check return value */
-				ret = dict_set_int64 (dict, size_key, 
-						local->stripe_size);
-				ret = dict_set_int32 (dict, count_key, 
-						local->call_count);
-				ret = dict_set_int32 (dict, index_key, index);
-
-				STACK_WIND (frame,
-					    stripe_mknod_ifreg_setxattr_cbk,
-					    trav->xlator,
-					    trav->xlator->fops->setxattr,
-					    &local->loc, dict, 0);
-	
-				dict_unref (dict);
-				index++;
-				trav = trav->next;
-			}
-		} else {
-			/* Create itself has failed.. so return 
-			   without setxattring */
-			loc_wipe (&local->loc);
-			STACK_UNWIND (frame, local->op_ret, local->op_errno, 
-				      local->inode, &local->stbuf);
-		}
-	}
-  
-	return 0;
-}
-
-
-/**
- * stripe_mknod - 
- */
-int32_t
-stripe_mknod (call_frame_t *frame,
-	      xlator_t *this,
-	      loc_t *loc,
-	      mode_t mode,
-	      dev_t rdev)
-{
-	stripe_private_t *priv = this->private;
-	stripe_local_t *local = NULL;
-	xlator_list_t *trav = NULL;	
-  
-	if (priv->first_child_down) {
-		gf_log (this->name, GF_LOG_WARNING, 
-			"First node down, returning EIO");
-		STACK_UNWIND (frame, -1, EIO, NULL, NULL);
-		return 0;
-	}
-
-	if (S_ISREG(mode)) {
-		/* NOTE: on older kernels (older than 2.6.9), 
-		   creat() fops is sent as mknod() + open(). Hence handling 
-		   S_IFREG files is necessary */
-		if (priv->nodes_down) {
-			gf_log (this->name, GF_LOG_WARNING, 
-				"Some node down, returning EIO");
-			STACK_UNWIND (frame, -1, EIO, loc->inode, NULL);
-			return 0;
-		}
-		
-		/* Initialization */
-		local = CALLOC (1, sizeof (stripe_local_t));
-		ERR_ABORT (local);
-		local->op_ret = -1;
-		local->op_errno = ENOTCONN;
-		local->stripe_size = stripe_get_matching_bs (loc->path,
-							     priv->pattern,
-							     priv->block_size);
-		frame->local = local;
-		local->inode = loc->inode;
-		loc_copy (&local->loc, loc);
-
-		/* Everytime in stripe lookup, all child nodes should
-		   be looked up */
-		local->call_count = 
-			((stripe_private_t *)this->private)->child_count;
-		
-		trav = this->children;
-		while (trav) {
-			STACK_WIND (frame,
-				    stripe_mknod_ifreg_cbk,
-				    trav->xlator,
-				    trav->xlator->fops->mknod,
-				    loc, mode, rdev);
-			trav = trav->next;
-		}
-
-		/* This case is handled, no need to continue further. */
-		return 0; 
-	}
-
-
-	STACK_WIND (frame,
-		    stripe_common_inode_cbk,
-		    FIRST_CHILD(this),
-		    FIRST_CHILD(this)->fops->mknod,
-		    loc, mode, rdev);
-
-	return 0;
-}
-
-
-/**
- * stripe_mkdir - 
- */
-int32_t
-stripe_mkdir (call_frame_t *frame,
-	      xlator_t *this,
-	      loc_t *loc,
-	      mode_t mode)
-{
-	stripe_private_t *priv = this->private;
-	stripe_local_t *local = NULL;
-	xlator_list_t *trav = NULL;
-  
-	if (priv->first_child_down) {
-		gf_log (this->name, GF_LOG_WARNING, 
-			"First node down, returning EIO");
-		STACK_UNWIND (frame, -1, EIO, NULL, NULL);
-		return 0;
-	}
-
-	/* Initialization */
-	local = CALLOC (1, sizeof (stripe_local_t));
-	ERR_ABORT (local);
-	local->op_ret = -1;
-	local->call_count = priv->child_count;
-	frame->local = local;
-
-	/* Everytime in stripe lookup, all child nodes should be looked up */
-	trav = this->children;
-	while (trav) {
-		STACK_WIND (frame,
-			    stripe_stack_unwind_inode_cbk,
-			    trav->xlator,
-			    trav->xlator->fops->mkdir,
-			    loc, mode);
-		trav = trav->next;
-	}
-
-	return 0;
-}
-
-
-/**
- * stripe_symlink - 
- */
-int32_t
-stripe_symlink (call_frame_t *frame,
-		xlator_t *this,
-		const char *linkpath,
-		loc_t *loc)
-{
-	stripe_private_t *priv = this->private;
-  
-	if (priv->first_child_down) {
-		gf_log (this->name, GF_LOG_WARNING, 
-			"First node down, returning EIO");
-		STACK_UNWIND (frame, -1, EIO, NULL, NULL);
-		return 0;
-	}
-
-	/* send symlink to only first node */
-	STACK_WIND (frame,
-		    stripe_common_inode_cbk,
-		    FIRST_CHILD(this),
-		    FIRST_CHILD(this)->fops->symlink,
-		    linkpath, loc);
-
-	return 0;
-}
-
-/**
- * stripe_link -
- */
-int32_t
-stripe_link (call_frame_t *frame,
-	     xlator_t *this,
-	     loc_t *oldloc,
-	     loc_t *newloc)
-{
-	int send_fop_to_all = 0;
-	stripe_private_t *priv = this->private;
-	stripe_local_t *local = NULL;
-	xlator_list_t *trav = this->children;
-  
-	STRIPE_CHECK_INODE_CTX_AND_UNWIND_ON_ERR (oldloc);
-
-	if (priv->first_child_down) {
-		gf_log (this->name, GF_LOG_WARNING, 
-			"First node down, returning EIO");
-		STACK_UNWIND (frame, -1, EIO, NULL, NULL);
-		return 0;
-	}
-
-
-	if (S_ISREG (oldloc->inode->st_mode))
-		send_fop_to_all = 1;
-
-	if (!send_fop_to_all) {
-		STACK_WIND (frame,
-			    stripe_common_inode_cbk,
-			    trav->xlator,
-			    trav->xlator->fops->link,
-			    oldloc, newloc);
-	} else {
-		/* Initialization */
-		local = CALLOC (1, sizeof (stripe_local_t));
-		ERR_ABORT (local);
-		local->op_ret = -1;
-		frame->local = local;
-		local->call_count = priv->child_count;
-
-		/* Everytime in stripe lookup, all child
-		   nodes should be looked up */
-		while (trav) {
-			STACK_WIND (frame,
-				    stripe_stack_unwind_inode_cbk,
-				    trav->xlator,
-				    trav->xlator->fops->link,
-				    oldloc, newloc);
-			trav = trav->next;
-		}
-	}
-
-	return 0;
-}
-
-int32_t 
-stripe_create_fail_unlink_cbk (call_frame_t *frame,
-			       void *cookie,
-			       xlator_t *this,
-			       int32_t op_ret,
-			       int32_t op_errno)
-{
-	int32_t callcnt = 0;
-	fd_t *lfd = NULL;
-	stripe_local_t *local = frame->local;
-
-	LOCK (&frame->lock);
-	{
-		callcnt = --local->call_count;
-	}
-	UNLOCK (&frame->lock);
-
-	if (!callcnt) {
-		lfd = local->fd;
-		loc_wipe (&local->loc);
-		STACK_UNWIND (frame, local->op_ret, local->op_errno, 
-			      local->fd, local->inode, &local->stbuf);
-		fd_unref (lfd);
-	}
-	return 0;
-}
-
-
-/**
- * stripe_create_setxattr_cbk - 
- */
-int32_t
-stripe_create_setxattr_cbk (call_frame_t *frame,
-			    void *cookie,
-			    xlator_t *this,
-			    int32_t op_ret,
-			    int32_t op_errno)
-{
-	fd_t *lfd = NULL;
-	int32_t callcnt = 0;
-	stripe_local_t *local = frame->local;
-	stripe_private_t *priv = this->private;
-	xlator_list_t *trav = this->children;
-
-	LOCK (&frame->lock);
-	{
-		callcnt = --local->call_count;
-    
-		if (op_ret == -1) {
-			gf_log (this->name, GF_LOG_WARNING, 
-				"%s returned error %s",
-				((call_frame_t *)cookie)->this->name, 
-				strerror (op_errno));
-			local->op_ret = -1;
-			local->op_errno = op_errno;
-		}
-	}
-	UNLOCK (&frame->lock);
-
-	if (!callcnt) {
-		if (local->op_ret == -1) {
-			local->call_count = priv->child_count;
-			while (trav) {
-				STACK_WIND (frame,
-					    stripe_create_fail_unlink_cbk,
-					    trav->xlator,
-					    trav->xlator->fops->unlink,
-					    &local->loc);
-				trav = trav->next;
-			}
-	
-			return 0;
-		}
-
-		lfd = local->fd;
-		loc_wipe (&local->loc);
-		STACK_UNWIND (frame, local->op_ret, local->op_errno,
-			      local->fd, local->inode, &local->stbuf);
-		fd_unref (lfd);
-	}
-
-	return 0;
-}
-
-/**
- * stripe_create_cbk - 
- */
-int32_t
-stripe_create_cbk (call_frame_t *frame,
-		   void *cookie,
-		   xlator_t *this,
-		   int32_t op_ret,
-		   int32_t op_errno,
-		   fd_t *fd,
-		   inode_t *inode,
-		   struct stat *buf)
-{
-	int32_t callcnt = 0;
-	stripe_local_t *local = frame->local;
-	stripe_private_t *priv = this->private;
-	fd_t *lfd = NULL;
-
-	LOCK (&frame->lock);
-	{
-		callcnt = --local->call_count;
-    
-		if (op_ret == -1) {
-			gf_log (this->name, GF_LOG_WARNING, 
-				"%s returned error %s",
-				((call_frame_t *)cookie)->this->name, 
-				strerror (op_errno));
-			local->failed = 1;
-			local->op_errno = op_errno;
-		}
-    
-		if (op_ret >= 0) {
-			local->op_ret = op_ret;
-			/* Get the mapping in inode private */
-			/* Get the stat buf right */
-			if (local->stbuf.st_blksize == 0) {
-				local->stbuf = *buf;
-				/* Because st_blocks gets added again */
-				local->stbuf.st_blocks = 0;
-			}
-      
-			/* Always, pass the inode number of first
-			   child to the above layer */
-			if (FIRST_CHILD(this) == 
-			    ((call_frame_t *)cookie)->this)
-				local->stbuf.st_ino = buf->st_ino;
-      
-			local->stbuf.st_blocks += buf->st_blocks;
-			if (local->stbuf.st_size < buf->st_size)
-				local->stbuf.st_size = buf->st_size;
-			if (local->stbuf.st_blksize != buf->st_blksize) {
-				/* TODO: add to blocks in terms of 
-				   original block size */
-			}
-		}
-	}
-	UNLOCK (&frame->lock);
-
-	if (!callcnt) {
-		if (local->failed)
-			local->op_ret = -1;
-
-		if (local->op_ret >= 0) {
-			fd_ctx_set (local->fd, this, local->stripe_size);
-		}
-
-		if ((local->op_ret != -1) && 
-		    local->stripe_size && priv->xattr_supported) {
-			/* Send a setxattr request to nodes where
-			   the files are created */
-			int ret = 0;
-			int32_t index = 0;
-			char size_key[256] = {0,};
-			char index_key[256] = {0,};
-			char count_key[256] = {0,};
-			xlator_list_t *trav = this->children;
-			dict_t *dict = NULL;
-
-			sprintf (size_key, 
-				 "trusted.%s.stripe-size", this->name);
-			sprintf (count_key, 
-				 "trusted.%s.stripe-count", this->name);
-			sprintf (index_key,
-				 "trusted.%s.stripe-index", this->name);
-
-			local->call_count = priv->child_count;
-	
-			while (trav) {
-				dict = get_new_dict ();
-				dict_ref (dict);
-
-				/* TODO: check return values */
-				ret = dict_set_int64 (dict, size_key, 
-						      local->stripe_size);
-				ret = dict_set_int32 (dict, count_key, 
-						      local->call_count);
-				ret = dict_set_int32 (dict, index_key, index);
-	
-				STACK_WIND (frame,
-					    stripe_create_setxattr_cbk,
-					    trav->xlator,
-					    trav->xlator->fops->setxattr,
-					    &local->loc,
-					    dict,
-					    0);
-	
-				dict_unref (dict);
-				index++;
-				trav = trav->next;
-			}
-		} else {
-			/* Create itself has failed.. so return
-			   without setxattring */
-			lfd = local->fd;
-			loc_wipe (&local->loc);
-			STACK_UNWIND (frame, local->op_ret, local->op_errno, 
-				      local->fd, local->inode, &local->stbuf);
-      
-			fd_unref (lfd);
-		}
-	}
-  
-	return 0;
-}
-
-
-/**
- * stripe_create - If a block-size is specified for the 'name', create the 
- *    file in all the child nodes. If not, create it in only first child.
- *
- * @name- complete path of the file to be created.
- */
-int32_t
-stripe_create (call_frame_t *frame,
-	       xlator_t *this,
-	       loc_t *loc,
-	       int32_t flags,
-	       mode_t mode,
-	       fd_t *fd)
-{
-	stripe_private_t *priv = this->private;
-	stripe_local_t *local = NULL;
-	xlator_list_t *trav = NULL;
-
-	/* files created in O_APPEND mode does not allow lseek() on fd */
-	flags &= ~O_APPEND;
-
-	if (priv->first_child_down || priv->nodes_down) {
-		gf_log (this->name, GF_LOG_WARNING, 
-			"First node down, returning EIO");
-		STACK_UNWIND (frame, -1, EIO, fd, loc->inode, NULL);
-		return 0;
-	}
-
-	/* Initialization */
-	local = CALLOC (1, sizeof (stripe_local_t));
-	ERR_ABORT (local);
-	local->op_ret = -1;
-	local->op_errno = ENOTCONN;
-	local->stripe_size = stripe_get_matching_bs (loc->path,
-						     priv->pattern,
-						     priv->block_size);
-	frame->local = local;
-	local->inode = loc->inode;
-	loc_copy (&local->loc, loc);
-	local->fd = fd_ref (fd);
-
-	local->call_count = ((stripe_private_t *)this->private)->child_count;
-	
-	trav = this->children;
-	while (trav) {
-		STACK_WIND (frame,
-			    stripe_create_cbk,
-			    trav->xlator,
-			    trav->xlator->fops->create,
-			    loc, flags, mode, fd);
-		trav = trav->next;
-	}
-       
-	return 0;
-}
-
-/**
- * stripe_open_cbk - 
- */
-int32_t
-stripe_open_cbk (call_frame_t *frame,
-		 void *cookie,
-		 xlator_t *this,
-		 int32_t op_ret,
-		 int32_t op_errno,
-		 fd_t *fd)
-{
-	int32_t callcnt = 0;
-	stripe_local_t *local = frame->local;
-
-	LOCK (&frame->lock);
-	{
-		callcnt = --local->call_count;
-
-		if (op_ret == -1) {
-			local->failed = 1;
-			gf_log (this->name, GF_LOG_WARNING, 
-				"%s returned error %s",
-				((call_frame_t *)cookie)->this->name, 
-				strerror (op_errno));
-			local->op_ret = -1;
-			local->op_errno = op_errno;
-		}
-    
-		if (op_ret >= 0)
-			local->op_ret = op_ret;
-	}
-	UNLOCK (&frame->lock);
-  
-	if (!callcnt) {
-		if (local->failed)
-			local->op_ret = -1;
-
-		if (local->op_ret >= 0) {
-			fd_ctx_set (local->fd, this, local->stripe_size);
-		}
-		loc_wipe (&local->loc);
-		STACK_UNWIND (frame, local->op_ret, local->op_errno, fd);
-	}
-
-	return 0;
-}
-
-
-/**
- * stripe_getxattr_cbk - 
- */
-int32_t
-stripe_open_getxattr_cbk (call_frame_t *frame,
-			  void *cookie,
-			  xlator_t *this,
-			  int32_t op_ret,
-			  int32_t op_errno,
-			  dict_t *dict)
-{
-	int32_t callcnt = 0;
-	stripe_local_t *local = frame->local;
-	xlator_list_t *trav = this->children;
-	stripe_private_t *priv = this->private;
-
-	LOCK (&frame->lock);
-	{
-		callcnt = --local->call_count;
-
-		if (op_ret == -1) {
-			gf_log (this->name, GF_LOG_WARNING, 
-				"%s returned error %s",
-				((call_frame_t *)cookie)->this->name, 
-				strerror (op_errno));
-			local->op_ret = -1;
-			local->op_errno = op_errno;
-			if (op_errno == ENOTCONN)
-				local->failed = 1;
-		}
-	}
-	UNLOCK (&frame->lock);
-  
-	if (!callcnt) {
-		if (!local->failed && (local->op_ret != -1)) {
-			/* If getxattr doesn't fails, call open */
-			char size_key[256] = {0,};
-			data_t *stripe_size_data = NULL;
-
-			sprintf (size_key, 
-				 "trusted.%s.stripe-size", this->name);
-			stripe_size_data = dict_get (dict, size_key);
-
-			if (stripe_size_data) {
-				local->stripe_size = 
-					data_to_int64 (stripe_size_data);
-				/*
-				if (local->stripe_size != priv->block_size) {
-					gf_log (this->name, GF_LOG_WARNING,
-						"file(%s) is having different "
-						"block-size", local->loc.path);
-				}
-				*/
-			} else {
-				/* if the file was created using earlier 
-				   versions of stripe */
-				gf_log (this->name, GF_LOG_CRITICAL,
-					"[CRITICAL] Seems like file(%s) "
-					"created using earlier version",
-					local->loc.path);
-			}
-		}
-    
-		local->call_count = priv->child_count;
-
-		while (trav) {
-			STACK_WIND (frame,
-				    stripe_open_cbk,
-				    trav->xlator,
-				    trav->xlator->fops->open,
-				    &local->loc, local->flags, local->fd);
-			trav = trav->next;
-		}
-	}
-
-	return 0;
-}
-
-/**
- * stripe_open - 
- */
-int32_t
-stripe_open (call_frame_t *frame,
-	     xlator_t *this,
-	     loc_t *loc,
-	     int32_t flags,
-	     fd_t *fd)
-{
-	stripe_local_t *local = NULL;
-	stripe_private_t *priv = this->private;
-	xlator_list_t *trav = this->children;
-
-	STRIPE_CHECK_INODE_CTX_AND_UNWIND_ON_ERR (loc);
-  
-	/* files opened in O_APPEND mode does not allow lseek() on fd */
-	flags &= ~O_APPEND;
-
-	if (priv->first_child_down) {
-		gf_log (this->name, GF_LOG_WARNING, 
-			"First node down, returning ENOTCONN");
-		STACK_UNWIND (frame, -1, ENOTCONN, NULL);
-		return 0;
-	}
-
-	/* Initialization */
-	local = CALLOC (1, sizeof (stripe_local_t));
-	ERR_ABORT (local);
-	local->fd = fd;
-	frame->local = local;
-	local->inode = loc->inode;
-	loc_copy (&local->loc, loc);
-
-	/* Striped files */
-	local->flags = flags;
-	local->call_count = priv->child_count;
-	local->stripe_size = stripe_get_matching_bs (loc->path,
-						     priv->pattern,
-						     priv->block_size);
-	
-	if (priv->xattr_supported) {
-		while (trav) {
-			STACK_WIND (frame,
-				    stripe_open_getxattr_cbk,
-				    trav->xlator,
-				    trav->xlator->fops->getxattr,
-				    loc, NULL);
-			trav = trav->next;
-		}
-	} else {
-		while (trav) {
-			STACK_WIND (frame,
-				    stripe_open_cbk,
-				    trav->xlator,
-				    trav->xlator->fops->open,
-				    &local->loc, local->flags, local->fd);
-			trav = trav->next;
-		}
-	}
-
-	return 0;
-}
-
-/**
- * stripe_opendir_cbk - 
- */
-int32_t
-stripe_opendir_cbk (call_frame_t *frame,
-		    void *cookie,
-		    xlator_t *this,
-		    int32_t op_ret,
-		    int32_t op_errno,
-		    fd_t *fd)
-{
-	int32_t callcnt = 0;
-	stripe_local_t *local = frame->local;
-
-	LOCK (&frame->lock);
-	{
-		callcnt = --local->call_count;
-
-		if (op_ret == -1) {
-			gf_log (this->name, GF_LOG_WARNING, 
-				"%s returned error %s",
-				((call_frame_t *)cookie)->this->name, 
-				strerror (op_errno));
-			local->op_ret = -1;
-			local->failed = 1;
-			local->op_errno = op_errno;
-		}
-    
-		if (op_ret >= 0) 
-			local->op_ret = op_ret;
-	}
-	UNLOCK (&frame->lock);
-
-	if (!callcnt) {
-		STACK_UNWIND (frame, local->op_ret, 
-			      local->op_errno, local->fd);
-	}
-
-	return 0;
-}
-
-
-/**
- * stripe_opendir - 
- */
-int32_t
-stripe_opendir (call_frame_t *frame,
-		xlator_t *this,
-		loc_t *loc,
-		fd_t *fd)
-{
-	stripe_local_t *local = NULL;
-	stripe_private_t *priv = this->private;
-	xlator_list_t *trav = this->children;
-
-	STRIPE_CHECK_INODE_CTX_AND_UNWIND_ON_ERR (loc);
-
-	if (priv->first_child_down) {
-		gf_log (this->name, GF_LOG_WARNING, 
-			"First node down, returning EIO");
-		STACK_UNWIND (frame, -1, EIO, NULL);
-		return 0;
-	}
-
-	/* Initialization */
-	local = CALLOC (1, sizeof (stripe_local_t));
-	ERR_ABORT (local);
-	frame->local = local;
-	local->inode = loc->inode;
-	local->fd = fd;
-	local->call_count = priv->child_count;
-
-	while (trav) {
-		STACK_WIND (frame,
-			    stripe_opendir_cbk,
-			    trav->xlator,
-			    trav->xlator->fops->opendir,
-			    loc, fd);
-		trav = trav->next;
-	}
-  
-	return 0;
-}
-
-
-/**
- * stripe_getxattr_cbk - 
- */
-int32_t
-stripe_getxattr_cbk (call_frame_t *frame,
-		     void *cookie,
-		     xlator_t *this,
-		     int32_t op_ret,
-		     int32_t op_errno,
-		     dict_t *value)
-{
-	STACK_UNWIND (frame, op_ret, op_errno, value);
-	return 0;
-}
-
-
-/**
- * stripe_getxattr - 
- */
-int32_t
-stripe_getxattr (call_frame_t *frame,
-		 xlator_t *this,
-		 loc_t *loc,
-		 const char *name)
-{
-	STRIPE_CHECK_INODE_CTX_AND_UNWIND_ON_ERR (loc);
-
-	STACK_WIND (frame,
-		    stripe_getxattr_cbk,
-		    FIRST_CHILD(this),
-		    FIRST_CHILD(this)->fops->getxattr,
-		    loc, name);
-
-	return 0;
-}
-
-/**
- * stripe_removexattr - 
- */
-int32_t
-stripe_removexattr (call_frame_t *frame,
-		    xlator_t *this,
-		    loc_t *loc,
-		    const char *name)
-{
-	stripe_private_t *priv = this->private;
-
-	STRIPE_CHECK_INODE_CTX_AND_UNWIND_ON_ERR (loc);
-
-	if (priv->first_child_down) {
-		gf_log (this->name, GF_LOG_WARNING, 
-			"First node down, returning ENOTCONN");
-		STACK_UNWIND (frame, -1, ENOTCONN, NULL);
-		return 0;
-	}
-
-	STACK_WIND (frame,
-		    stripe_common_cbk,
-		    FIRST_CHILD(this),
-		    FIRST_CHILD(this)->fops->removexattr,
-		    loc, name);
-
-	return 0;
-}
-
-
-/**
- * stripe_lk_cbk - 
- */
-int32_t
-stripe_lk_cbk (call_frame_t *frame,
-	       void *cookie,
-	       xlator_t *this,
-	       int32_t op_ret,
-	       int32_t op_errno,
-	       struct flock *lock)
-{
-	int32_t callcnt = 0;
-	stripe_local_t *local = frame->local;
-
-	LOCK (&frame->lock);
-	{
-		callcnt = --local->call_count;
-		if (op_ret == -1) {
-			gf_log (this->name, GF_LOG_WARNING, 
-				"%s returned error %s",
-				((call_frame_t *)cookie)->this->name, 
-				strerror (op_errno));
-			local->op_errno = op_errno;
-			if (op_errno == ENOTCONN)
-				local->failed = 1;
-		}
-		if (op_ret == 0 && local->op_ret == -1) {
-			/* First successful call, copy the *lock */
-			local->op_ret = 0;
-			local->lock = *lock;
-		}
-	}
-	UNLOCK (&frame->lock);
-
-	if (!callcnt) {
-		if (local->failed)
-			local->op_ret = -1;
-		STACK_UNWIND (frame, local->op_ret, 
-			      local->op_errno, &local->lock);
-	}
-	return 0;
-}
-
-
-/**
- * stripe_lk - 
- */
-int32_t
-stripe_lk (call_frame_t *frame,
-	   xlator_t *this,
-	   fd_t *fd,
-	   int32_t cmd,
-	   struct flock *lock)
-{
-	stripe_local_t *local = NULL;
-	xlator_list_t *trav = this->children;
-	stripe_private_t *priv = this->private;
-
-	STRIPE_CHECK_INODE_CTX_AND_UNWIND_ON_ERR (fd);
-	/* Initialization */
-	local = CALLOC (1, sizeof (stripe_local_t));
-	ERR_ABORT (local);
-	local->op_ret = -1;
-	frame->local = local;
-  
-	local->call_count = priv->child_count;
-	
-	while (trav) {
-		STACK_WIND (frame,	      
-			    stripe_lk_cbk,
-			    trav->xlator,
-			    trav->xlator->fops->lk,
-			    fd, cmd, lock);
-		trav = trav->next;
-	}
-
-	return 0;
-}
-
-/**
- * stripe_writedir - 
- */
-int32_t
-stripe_setdents (call_frame_t *frame,
-		 xlator_t *this,
-		 fd_t *fd,
-		 int32_t flags,
-		 dir_entry_t *entries,
-		 int32_t count)
-{
-	stripe_local_t *local = NULL;
-	stripe_private_t *priv = this->private;
-	xlator_list_t *trav = this->children;
-
-	STRIPE_CHECK_INODE_CTX_AND_UNWIND_ON_ERR (fd);
-
-	/* Initialization */
-	local = CALLOC (1, sizeof (stripe_local_t));
-	ERR_ABORT (local);
-	local->op_ret = -1;
-	frame->local = local;
-	local->call_count = priv->child_count;
-
-	while (trav) {
-		STACK_WIND (frame,	      
-			    stripe_stack_unwind_cbk,
-			    trav->xlator,
-			    trav->xlator->fops->setdents,
-			    fd, flags, entries, count);
-		trav = trav->next;
-	}
-
-	return 0;
-}
-
-
-/**
- * stripe_flush - 
- */
-int32_t
-stripe_flush (call_frame_t *frame,
-	      xlator_t *this,
-	      fd_t *fd)
-{
-	stripe_local_t *local = NULL;
-	stripe_private_t *priv = this->private;
-	xlator_list_t *trav = this->children;
-
-	STRIPE_CHECK_INODE_CTX_AND_UNWIND_ON_ERR (fd);
-
-	/* Initialization */
-	local = CALLOC (1, sizeof (stripe_local_t));
-	ERR_ABORT (local);
-	local->op_ret = -1;
-	frame->local = local;
-	local->call_count = priv->child_count;
-	
-	while (trav) {
-		STACK_WIND (frame,	      
-			    stripe_stack_unwind_cbk,
-			    trav->xlator,
-			    trav->xlator->fops->flush,
-			    fd);
-		trav = trav->next;
-	}
-
-	return 0;
-}
-
-
-/**
- * stripe_close - 
- */
-int32_t
-stripe_release (xlator_t *this,
-		fd_t *fd)
-{
-	return 0;
-}
-
-
-/**
- * stripe_fsync - 
- */
-int32_t
-stripe_fsync (call_frame_t *frame,
-	      xlator_t *this,
-	      fd_t *fd,
-	      int32_t flags)
-{
-	stripe_local_t *local = NULL;
-	stripe_private_t *priv = this->private;
-	xlator_list_t *trav = this->children;
-
-	STRIPE_CHECK_INODE_CTX_AND_UNWIND_ON_ERR (fd);
-
-	/* Initialization */
-	local = CALLOC (1, sizeof (stripe_local_t));
-	ERR_ABORT (local);
-	local->op_ret = -1;
-	frame->local = local;
-	local->call_count = priv->child_count;
-	
-	while (trav) {
-		STACK_WIND (frame,	      
-			    stripe_stack_unwind_cbk,
-			    trav->xlator,
-			    trav->xlator->fops->fsync,
-			    fd, flags);
-		trav = trav->next;
-	}
-
-	return 0;
-}
-
-
-/**
- * stripe_fstat - 
- */
-int32_t
-stripe_fstat (call_frame_t *frame,
-	      xlator_t *this,
-	      fd_t *fd)
-{
-	stripe_local_t *local = NULL;
-	stripe_private_t *priv = this->private;
-	xlator_list_t *trav = this->children;
-
-	STRIPE_CHECK_INODE_CTX_AND_UNWIND_ON_ERR (fd);
-
-	/* Initialization */
-	local = CALLOC (1, sizeof (stripe_local_t));
-	ERR_ABORT (local);
-	local->op_ret = -1;
-	frame->local = local;
-	local->inode = fd->inode;
-	local->call_count = priv->child_count;
-	
-	while (trav) {
-		STACK_WIND (frame,	      
-			    stripe_stack_unwind_buf_cbk,
-			    trav->xlator,
-			    trav->xlator->fops->fstat,
-			    fd);
-		trav = trav->next;
-	}
-
-	return 0;
-}
-
-
-/**
- * stripe_fchmod - 
- */
-int32_t 
-stripe_fchmod (call_frame_t *frame,
-	       xlator_t *this,
-	       fd_t *fd,
-	       mode_t mode)
-{
-	stripe_local_t *local = NULL;
-	stripe_private_t *priv = this->private;
-	xlator_list_t *trav = this->children;
-
-	STRIPE_CHECK_INODE_CTX_AND_UNWIND_ON_ERR (fd);
-
-	/* Initialization */
-	local = CALLOC (1, sizeof (stripe_local_t));
-	ERR_ABORT (local);
-	local->op_ret = -1;
-	frame->local = local;
-	local->inode = fd->inode;
-	local->call_count = priv->child_count;
-	
-	while (trav) {
-		STACK_WIND (frame,	      
-			    stripe_stack_unwind_buf_cbk,
-			    trav->xlator,
-			    trav->xlator->fops->fchmod,
-			    fd, mode);
-		trav = trav->next;
-	}
-
-	return 0;
-}
-
-
-/**
- * stripe_fchown - 
- */
-int32_t 
-stripe_fchown (call_frame_t *frame,
-	       xlator_t *this,
-	       fd_t *fd,
-	       uid_t uid,
-	       gid_t gid)
-{
-	stripe_local_t *local = NULL;
-	stripe_private_t *priv = this->private;
-	xlator_list_t *trav = this->children;
-
-	STRIPE_CHECK_INODE_CTX_AND_UNWIND_ON_ERR (fd);
-
-	/* Initialization */
-	local = CALLOC (1, sizeof (stripe_local_t));
-	ERR_ABORT (local);
-	local->op_ret = -1;
-	frame->local = local;
-	local->inode = fd->inode;
-	local->call_count = priv->child_count;
-	
-	while (trav) {
-		STACK_WIND (frame,	      
-			    stripe_stack_unwind_buf_cbk,
-			    trav->xlator,
-			    trav->xlator->fops->fchown,
-			    fd, uid, gid);
-		trav = trav->next;
-	}
-
-	return 0;
-}
-
-
-/**
- * stripe_ftruncate - 
- */
-int32_t
-stripe_ftruncate (call_frame_t *frame,
-		  xlator_t *this,
-		  fd_t *fd,
-		  off_t offset)
-{
-	stripe_local_t *local = NULL;
-	stripe_private_t *priv = this->private;
-	xlator_list_t *trav = this->children;
-
-	STRIPE_CHECK_INODE_CTX_AND_UNWIND_ON_ERR (fd);
-
-	/* Initialization */
-	local = CALLOC (1, sizeof (stripe_local_t));
-	ERR_ABORT (local);
-	local->op_ret = -1;
-	frame->local = local;
-	local->inode = fd->inode;
-	local->call_count = priv->child_count;
-	
-	while (trav) {
-		STACK_WIND (frame,	      
-			    stripe_stack_unwind_buf_cbk,
-			    trav->xlator,
-			    trav->xlator->fops->ftruncate,
-			    fd, offset);
-		trav = trav->next;
-	}
-
-	return 0;
-}
-
-
-/**
- * stripe_releasedir - 
- */
-int32_t
-stripe_releasedir (xlator_t *this,
-		   fd_t *fd)
-{
-	return 0;
-}
-
-
-/**
- * stripe_fsyncdir - 
- */
-int32_t
-stripe_fsyncdir (call_frame_t *frame,
-		 xlator_t *this,
-		 fd_t *fd,
-		 int32_t flags)
-{
-	stripe_local_t *local = NULL;
-	stripe_private_t *priv = this->private;
-	xlator_list_t *trav = this->children;
-
-	STRIPE_CHECK_INODE_CTX_AND_UNWIND_ON_ERR (fd);
-
-	/* Initialization */
-	local = CALLOC (1, sizeof (stripe_local_t));
-	ERR_ABORT (local);
-	local->op_ret = -1;
-	frame->local = local;
-	local->call_count = priv->child_count;
-
-	while (trav) {
-		STACK_WIND (frame,	      
-			    stripe_stack_unwind_cbk,
-			    trav->xlator,
-			    trav->xlator->fops->fsyncdir,
-			    fd,
-			    flags);
-		trav = trav->next;
-	}
-
-	return 0;
-}
-
-
-/**
- * stripe_single_readv_cbk - This function is used as return fn, when the 
- *     file name doesn't match the pattern specified for striping.
- */
-int32_t
-stripe_single_readv_cbk (call_frame_t *frame,
-			 void *cookie,
-			 xlator_t *this,
-			 int32_t op_ret,
-			 int32_t op_errno,
-			 struct iovec *vector,
-			 int32_t count,
-			 struct stat *stbuf)
-{
-	STACK_UNWIND (frame, op_ret, op_errno, vector, count, stbuf);
-	return 0;
-}
-
-/**
- * stripe_readv_cbk - get all the striped reads, and order it properly, send it
- *        to above layer after putting it in a single vector.
- */
-int32_t
-stripe_readv_cbk (call_frame_t *frame,
-		  void *cookie,
-		  xlator_t *this,
-		  int32_t op_ret,
-		  int32_t op_errno,
-		  struct iovec *vector,
-		  int32_t count,
-		  struct stat *stbuf)
-{
-	int32_t index = 0;
-	int32_t callcnt = 0;
-	call_frame_t *main_frame = NULL;
-	stripe_local_t *main_local = NULL;
-	stripe_local_t *local = frame->local;
-
-	index = local->node_index;
-	main_frame = local->orig_frame;
-	main_local = main_frame->local;
-
-	LOCK (&main_frame->lock);
-	{
-		main_local->replies[index].op_ret = op_ret;
-		main_local->replies[index].op_errno = op_errno;
-		if (op_ret >= 0) {
-			main_local->replies[index].stbuf  = *stbuf;
-			main_local->replies[index].count  = count;
-			main_local->replies[index].vector = 
-				iov_dup (vector, count);
-
-			if (frame->root->rsp_refs)
-				dict_copy (frame->root->rsp_refs, 
-					   main_frame->root->rsp_refs);
-		}
-		callcnt = ++main_local->call_count;
-	}
-	UNLOCK(&main_frame->lock);
-
-	if (callcnt == main_local->wind_count) {
-		int32_t final_count = 0;
-		struct iovec *final_vec = NULL;
-		struct stat tmp_stbuf = {0,};
-		dict_t *refs = main_frame->root->rsp_refs;
-
-		op_ret = 0;
-		memcpy (&tmp_stbuf, &main_local->replies[0].stbuf, 
-			sizeof (struct stat));
-		for (index=0; index < main_local->wind_count; index++) {
-			/* TODO: check whether each stripe returned 'expected'
-			 * number of bytes 
-			 */
-			if (main_local->replies[index].op_ret == -1) {
-				op_ret = -1;
-				op_errno = main_local->replies[index].op_errno;
-				break;
-			}
-			op_ret += main_local->replies[index].op_ret;
-			final_count += main_local->replies[index].count;
-			/* TODO: Do I need to send anything more in stbuf? */
-			if (tmp_stbuf.st_size < 
-			    main_local->replies[index].stbuf.st_size) {
-				tmp_stbuf.st_size = 
-					main_local->replies[index].stbuf.st_size;
-			}
-		}
-		if (op_ret != -1) {
-			final_vec = CALLOC (final_count, 
-					    sizeof (struct iovec));
-			ERR_ABORT (final_vec);
-			final_count = 0;
-
-			for (index=0; 
-			     index < main_local->wind_count; index++) {
-				memcpy (final_vec + final_count,
-					main_local->replies[index].vector,
-					(main_local->replies[index].count * 
-					 sizeof (struct iovec)));
-				final_count += 
-					main_local->replies[index].count;
-
-				free (main_local->replies[index].vector);
-			}
-		} else {
-			final_vec = NULL;
-			final_count = 0;
-		}
-		/* */
-		FREE (main_local->replies);
-		refs = main_frame->root->rsp_refs;
-		STACK_UNWIND (main_frame, op_ret, op_errno, 
-			      final_vec, final_count, &tmp_stbuf);
-
-		dict_unref (refs);
-		if (final_vec)
-			free (final_vec);
-	}
-
-	STACK_DESTROY (frame->root);
-	return 0;
-}
-
-/**
- * stripe_readv - 
- */
-int32_t
-stripe_readv (call_frame_t *frame,
-	      xlator_t *this,
-	      fd_t *fd,
-	      size_t size,
-	      off_t offset)
-{
-	int32_t index = 0;
-	int32_t num_stripe = 0;
-	size_t frame_size = 0;
-	off_t rounded_end = 0;
-	uint64_t stripe_size = 0;
-	off_t rounded_start = 0;
-	off_t frame_offset = offset;
-	stripe_local_t *local = NULL;
-	call_frame_t *rframe = NULL;
-	stripe_local_t *rlocal = NULL;
-	xlator_list_t *trav = this->children;
-	stripe_private_t *priv = this->private;
-
-	fd_ctx_get (fd, this, &stripe_size);
-	if (!stripe_size) {
-		STACK_UNWIND (frame, -1, EINVAL, NULL, 0, NULL);
-		return 0;
-	}
-
-	/* The file is stripe across the child nodes. Send the read request 
-	 * to the child nodes appropriately after checking which region of 
-	 * the file is in which child node. Always '0-<stripe_size>' part of
-	 * the file resides in the first child.
-	 */
-	rounded_start = floor (offset, stripe_size);
-	rounded_end = roof (offset+size, stripe_size);
-	num_stripe = (rounded_end - rounded_start) / stripe_size;
-	
-	local = CALLOC (1, sizeof (stripe_local_t));
-	ERR_ABORT (local);
-	local->wind_count = num_stripe;
-	frame->local = local;
-	frame->root->rsp_refs = dict_ref (get_new_dict ());
-	
-	/* This is where all the vectors should be copied. */
-	local->replies = CALLOC (1, num_stripe * 
-				 sizeof (struct readv_replies));
-	ERR_ABORT (local->replies);
-	
-	for (index = 0;
-	     index < ((offset / stripe_size) % priv->child_count);
-	     index++) {
-		trav = trav->next;
-	}
-    
-	for (index = 0; index < num_stripe; index++) {
-		rframe = copy_frame (frame);
-		rlocal = CALLOC (1, sizeof (stripe_local_t));
-		ERR_ABORT (rlocal);
-		
-		frame_size = min (roof (frame_offset+1, stripe_size),
-				  (offset + size)) - frame_offset;
-		
-		rlocal->node_index = index;
-		rlocal->orig_frame = frame;
-		rframe->local = rlocal;
-		STACK_WIND (rframe,
-			    stripe_readv_cbk,
-			    trav->xlator,
-			    trav->xlator->fops->readv,
-			    fd, frame_size, frame_offset);
-      
-		frame_offset += frame_size;
-
-		trav = trav->next ? trav->next : this->children;
-	}
-
-	return 0;
-}
-
-
-/**
- * stripe_writev_cbk - 
- */
-int32_t
-stripe_writev_cbk (call_frame_t *frame,
-		   void *cookie,
-		   xlator_t *this,
-		   int32_t op_ret,
-		   int32_t op_errno,
-		   struct stat *stbuf)
-{
-	int32_t callcnt = 0;
-	stripe_local_t *local = frame->local;
-	LOCK(&frame->lock);
-	{
-		callcnt = ++local->call_count;
-    
-		if (op_ret == -1) {
-			gf_log (this->name, GF_LOG_WARNING, 
-				"%s returned error %s",
-				((call_frame_t *)cookie)->this->name, 
-				strerror (op_errno));
-			local->op_errno = op_errno;
-			local->op_ret = -1;
-		}
-		if (op_ret >= 0) {
-			local->op_ret += op_ret;
-			local->stbuf = *stbuf;
-		}
-	}
-	UNLOCK (&frame->lock);
-
-	if ((callcnt == local->wind_count) && local->unwind) {
-		STACK_UNWIND (frame, local->op_ret, 
-			      local->op_errno, &local->stbuf);
-	}
-	return 0;
-}
-
-
-/**
- * stripe_single_writev_cbk - 
- */
-int32_t
-stripe_single_writev_cbk (call_frame_t *frame,
-			  void *cookie,
-			  xlator_t *this,
-			  int32_t op_ret,
-			  int32_t op_errno,
-			  struct stat *stbuf)
-{
-	STACK_UNWIND (frame, op_ret, op_errno, stbuf);
-	return 0;
-}
-/**
- * stripe_writev - 
- */
-int32_t
-stripe_writev (call_frame_t *frame,
-	       xlator_t *this,
-	       fd_t *fd,
-	       struct iovec *vector,
-	       int32_t count,
-	       off_t offset)
-{
-	int32_t idx = 0;
-	int32_t total_size = 0;
-	int32_t offset_offset = 0;
-	int32_t remaining_size = 0;
-	int32_t tmp_count = count;
-	off_t fill_size = 0;
-	uint64_t stripe_size = 0;
-	struct iovec *tmp_vec = vector;
-	stripe_private_t *priv = this->private;
-	stripe_local_t *local = NULL;
-	xlator_list_t *trav = NULL;
-
-	fd_ctx_get (fd, this, &stripe_size);
-	if (!stripe_size) {
-		STACK_UNWIND (frame, -1, EINVAL, NULL);
-		return 0;
-	}
-
-	/* File has to be stripped across the child nodes */
-	for (idx = 0; idx< count; idx ++) {
-		total_size += tmp_vec[idx].iov_len;
-	}
-	remaining_size = total_size;
-
-	local = CALLOC (1, sizeof (stripe_local_t));
-	ERR_ABORT (local);
-	frame->local = local;
-	local->stripe_size = stripe_size;
-
-	while (1) {
-		/* Send striped chunk of the vector to child 
-		   nodes appropriately. */
-		trav = this->children;
-		
-		idx = (((offset + offset_offset) / 
-			local->stripe_size) % priv->child_count);
-		while (idx) {
-			trav = trav->next;
-			idx--;
-		}
-		fill_size = (local->stripe_size - 
-			     ((offset + offset_offset) % local->stripe_size));
-		if (fill_size > remaining_size)
-			fill_size = remaining_size;
-
-		remaining_size -= fill_size;
-
-		tmp_count = iov_subset (vector, count, offset_offset,
-					offset_offset + fill_size, NULL);
-		tmp_vec = CALLOC (tmp_count, sizeof (struct iovec));
-		ERR_ABORT (tmp_vec);
-		tmp_count = iov_subset (vector, count, offset_offset,
-					offset_offset + fill_size, tmp_vec);
-		
-		local->wind_count++;
-		if (remaining_size == 0)
-			local->unwind = 1;
-
-		STACK_WIND(frame,
-			   stripe_writev_cbk,
-			   trav->xlator,
-			   trav->xlator->fops->writev,
-			   fd, tmp_vec, tmp_count, offset + offset_offset);
-		FREE (tmp_vec);
-		offset_offset += fill_size;
-		if (remaining_size == 0)
-			break;
-	}
-
-	return 0;
-}
-
-
-
-/* Management operations */
-
-/**
- * stripe_stats_cbk - Add all the fields received from different clients. 
- *    Once all the clients return, send stats to above layer.
- * 
- */
-int32_t
-stripe_stats_cbk (call_frame_t *frame,
-		  void *cookie,
-		  xlator_t *this,
-		  int32_t op_ret,
-		  int32_t op_errno,
-		  struct xlator_stats *stats)
-{
-	int32_t callcnt = 0;
-	stripe_local_t *local = frame->local;
-
-	LOCK(&frame->lock);
-	{
-		callcnt = --local->call_count;
-    
-		if (op_ret == -1) {
-			gf_log (this->name, GF_LOG_WARNING, 
-				"%s returned error %s",
-				((call_frame_t *)cookie)->this->name, 
-				strerror (op_errno));
-			local->op_ret = -1;
-			local->op_errno = op_errno;
-		}
-		if (op_ret == 0) {
-			if (local->op_ret == -2) {
-				/* This is to make sure this is the 
-				   first time */
-				local->stats = *stats;
-				local->op_ret = 0;
-			} else {
-				local->stats.nr_files += stats->nr_files;
-				local->stats.free_disk += stats->free_disk;
-				local->stats.disk_usage += stats->disk_usage;
-				local->stats.nr_clients += stats->nr_clients;
-			}
-		}
-	}
-	UNLOCK (&frame->lock);
-
-	if (!callcnt) {
-		STACK_UNWIND (frame, local->op_ret, local->op_errno,
-			      &local->stats);
-	}
-
-	return 0;
-}
-
-/**
- * stripe_stats - 
- */
-int32_t
-stripe_stats (call_frame_t *frame,
-	      xlator_t *this,
-	      int32_t flags)
-{
-	stripe_local_t *local = NULL;
-	xlator_list_t *trav = this->children;
-
-	local = CALLOC (1, sizeof (stripe_local_t));
-	ERR_ABORT (local);
-	frame->local = local;
-	local->op_ret = -2; /* to be used as a flag in _cbk */
-	local->call_count = ((stripe_private_t*)this->private)->child_count;
-	while (trav) {
-		STACK_WIND (frame,
-			    stripe_stats_cbk,
-			    trav->xlator,
-			    trav->xlator->mops->stats,
-			    flags);
-		trav = trav->next;
-	}
-	return 0;
-}
-
-/**
- * notify
- */
-int32_t
-notify (xlator_t *this,
-        int32_t event,
-        void *data,
-        ...)
-{
-	stripe_private_t *priv = this->private;
-	int down_client = 0;
-	int i = 0;
-
-	if (!priv)
-		return 0;
-
-	switch (event) 
-	{
-	case GF_EVENT_CHILD_UP:
-	{
-		/* get an index number to set */
-		for (i = 0; i < priv->child_count; i++) {
-			if (data == priv->xl_array[i])
-				break;
-		}
-		priv->state[i] = 1;
-		for (i = 0; i < priv->child_count; i++) {
-			if (!priv->state[i])
-				down_client++;
-		}
-
-		LOCK (&priv->lock);
-		{
-			priv->nodes_down = down_client;
-
-			if (data == FIRST_CHILD (this)) {
-				priv->first_child_down = 0;
-				default_notify (this, event, data);
-			}
-		}
-		UNLOCK (&priv->lock);
-	}
-	break;
-	case GF_EVENT_CHILD_DOWN:
-	{
-		/* get an index number to set */
-		for (i = 0; i < priv->child_count; i++) {
-			if (data == priv->xl_array[i])
-				break;
-		}
-		priv->state[i] = 0;
-		for (i = 0; i < priv->child_count; i++) {
-			if (!priv->state[i])
-				down_client++;
-		}
-
-		LOCK (&priv->lock);
-		{
-			priv->nodes_down = down_client;
-
-			if (data == FIRST_CHILD (this)) {
-				priv->first_child_down = 1;
-				default_notify (this, event, data);
-			}
-		}
-		UNLOCK (&priv->lock);
-	}
-	break;
-
-	default:
-	{
-		/* */
-		default_notify (this, event, data);
-	}
-	break;
-	}
-
-	return 0;
-}
-/**
- * init - This function is called when xlator-graph gets initialized. 
- *     The option given in volfiles are parsed here.
- * @this - 
- */
-int32_t
-init (xlator_t *this)
-{
-	stripe_private_t *priv = NULL;
-	xlator_list_t *trav = NULL;
-	data_t *data = NULL;
-	int32_t count = 0;
-
-	trav = this->children;
-	while (trav) {
-		count++;
-		trav = trav->next;
-	}
-
-	if (!count) {
-		gf_log (this->name, GF_LOG_ERROR,
-			"stripe configured without \"subvolumes\" option. "
-			"exiting");
-		return -1;
-	}
-
-	if (!this->parents) {
-		gf_log (this->name, GF_LOG_WARNING,
-			"dangling volume. check volfile ");
-	}
-  
-	priv = CALLOC (1, sizeof (stripe_private_t));
-	ERR_ABORT (priv);
-	priv->xl_array = CALLOC (1, count * sizeof (xlator_t *));
-	ERR_ABORT (priv->xl_array);
-	priv->child_count = count;
-	LOCK_INIT (&priv->lock);
-
-	trav = this->children;
-	count = 0;
-	while (trav) {
-		priv->xl_array[count++] = trav->xlator;
-		trav = trav->next;
-	}
-
-	if (count > 256) {
-		gf_log (this->name, GF_LOG_ERROR,
-			"maximum number of stripe subvolumes supported "
-			"is 256");
-		return -1;
-	}
-
-	priv->block_size = (128 * GF_UNIT_KB);
-	/* option stripe-pattern *avi:1GB,*pdf:4096 */
-	data = dict_get (this->options, "block-size");
-	if (!data) {
-		gf_log (this->name, GF_LOG_DEBUG,
-			"No \"option block-size <x>\" given, defaulting "
-			"to 128KB");
-	} else {
-		char *tmp_str = NULL;
-		char *tmp_str1 = NULL;
-		char *dup_str = NULL;
-		char *stripe_str = NULL;
-		char *pattern = NULL;
-		char *num = NULL;
-		struct stripe_options *temp_stripeopt = NULL;
-		struct stripe_options *stripe_opt = NULL;    
-
-		/* Get the pattern for striping. 
-		   "option block-size *avi:10MB" etc */
-		stripe_str = strtok_r (data->data, ",", &tmp_str);
-		while (stripe_str) {
-			dup_str = strdup (stripe_str);
-			stripe_opt = CALLOC (1, 
-					     sizeof (struct stripe_options));
-			ERR_ABORT (stripe_opt);
-			pattern = strtok_r (dup_str, ":", &tmp_str1);
-			num = strtok_r (NULL, ":", &tmp_str1);
-			if (num && 
-			    (gf_string2bytesize (num, 
-						 &stripe_opt->block_size) 
-			     != 0)) {
-				gf_log (this->name, GF_LOG_ERROR, 
-					"invalid number format \"%s\"", 
-					num);
-				return -1;
-			} else if (!num && (gf_string2bytesize (
-						    pattern, 
-						    &stripe_opt->block_size) 
-					    != 0)) {
-				/* Possible that there is no pattern given */
-				stripe_opt->block_size = (128 * GF_UNIT_KB);
-				pattern = "*";
-			}
-			memcpy (stripe_opt->path_pattern, 
-				pattern, strlen (pattern));
-			
-			gf_log (this->name, GF_LOG_DEBUG, 
-				"block-size : pattern %s : size %"PRId64, 
-				stripe_opt->path_pattern, 
-				stripe_opt->block_size);
-			
-			if (!priv->pattern) {
-				priv->pattern = stripe_opt;
-			} else {
-				temp_stripeopt = priv->pattern;
-				while (temp_stripeopt->next)
-					temp_stripeopt = temp_stripeopt->next;
-				temp_stripeopt->next = stripe_opt;
-			}
-			stripe_str = strtok_r (NULL, ",", &tmp_str);
-		}
-	}
-
-	priv->xattr_supported = 1;
-	data = dict_get (this->options, "use-xattr");
-	if (data) {
-		if (gf_string2boolean (data->data, 
-				       &priv->xattr_supported) == -1) {
-			gf_log (this->name, GF_LOG_ERROR,
-				"error setting hard check for extended "
-				"attribute");
-			//return -1;
-		}
-	}
-
-	/* notify related */
-	priv->nodes_down = priv->child_count;
-	this->private = priv;
-
-	return 0;
-} 
-
-/** 
- * fini -   Free all the private variables
- * @this - 
- */
-void 
-fini (xlator_t *this)
-{
-	stripe_private_t *priv = this->private;
-	struct stripe_options *prev = NULL;
-	struct stripe_options *trav = priv->pattern;
-	while (trav) {
-		prev = trav;
-		trav = trav->next;
-		FREE (prev);
-	}
-	FREE (priv->xl_array);
-	LOCK_DESTROY (&priv->lock);
-	FREE (priv);
-	return;
-}
-
-
-struct xlator_fops fops = {
-	.stat        = stripe_stat,
-	.unlink      = stripe_unlink,
-	.symlink     = stripe_symlink,
-	.rename      = stripe_rename,
-	.link        = stripe_link,
-	.chmod       = stripe_chmod,
-	.chown       = stripe_chown,
-	.truncate    = stripe_truncate,
-	.utimens     = stripe_utimens,
-	.create      = stripe_create,
-	.open        = stripe_open,
-	.readv       = stripe_readv,
-	.writev      = stripe_writev,
-	.statfs      = stripe_statfs,
-	.flush       = stripe_flush,
-	.fsync       = stripe_fsync,
-	.setxattr    = stripe_setxattr,
-	.getxattr    = stripe_getxattr,
-	.removexattr = stripe_removexattr,
-	.access      = stripe_access,
-	.ftruncate   = stripe_ftruncate,
-	.fstat       = stripe_fstat,
-	.readlink    = stripe_readlink,
-	.mkdir       = stripe_mkdir,
-	.rmdir       = stripe_rmdir,
-	.lk          = stripe_lk,
-	.opendir     = stripe_opendir,
-	.fsyncdir    = stripe_fsyncdir,
-	.fchmod      = stripe_fchmod,
-	.fchown      = stripe_fchown,
-	.lookup      = stripe_lookup,
-	.setdents    = stripe_setdents,
-	.mknod       = stripe_mknod,
-};
-
-struct xlator_mops mops = {
-	.stats  = stripe_stats,
-};
-
-struct xlator_cbks cbks = {
-	.release    = stripe_release,
-	.releasedir = stripe_releasedir
-};
-
-
-struct volume_options options[] = {
-	{ .key  = {"block-size"}, 
-	  .type = GF_OPTION_TYPE_ANY 
-	},
-	{ .key  = {"use-xattr"}, 
-	  .type = GF_OPTION_TYPE_BOOL
-	},
-	{ .key  = {NULL} },
-};
diff --git a/xlators/cluster/unify/src/Makefile.am b/xlators/cluster/unify/src/Makefile.am
deleted file mode 100644
index b9e6f63e9d7..00000000000
--- a/xlators/cluster/unify/src/Makefile.am
+++ /dev/null
@@ -1,16 +0,0 @@
-
-xlator_LTLIBRARIES = unify.la
-xlatordir = $(libdir)/glusterfs/$(PACKAGE_VERSION)/xlator/cluster
-
-unify_la_LDFLAGS = -module -avoidversion
-
-unify_la_SOURCES = unify.c unify-self-heal.c
-unify_la_LIBADD = $(top_builddir)/libglusterfs/src/libglusterfs.la
-
-noinst_HEADERS = unify.h
-
-AM_CFLAGS = -fPIC -D_FILE_OFFSET_BITS=64 -D_GNU_SOURCE -Wall -D$(GF_HOST_OS) \
-	-I$(top_srcdir)/libglusterfs/src -shared -nostartfiles $(GF_CFLAGS)
-
-CLEANFILES = 
-
diff --git a/xlators/cluster/unify/src/unify-self-heal.c b/xlators/cluster/unify/src/unify-self-heal.c
deleted file mode 100644
index 3099c646e2d..00000000000
--- a/xlators/cluster/unify/src/unify-self-heal.c
+++ /dev/null
@@ -1,1225 +0,0 @@
-/*
-  Copyright (c) 2007-2009 Z RESEARCH, Inc. <http://www.zresearch.com>
-  This file is part of GlusterFS.
-
-  GlusterFS is free software; you can redistribute it and/or modify
-  it under the terms of the GNU General Public License as published
-  by the Free Software Foundation; either version 3 of the License,
-  or (at your option) any later version.
-
-  GlusterFS is distributed in the hope that it will be useful, but
-  WITHOUT ANY WARRANTY; without even the implied warranty of
-  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
-  General Public License for more details.
-
-  You should have received a copy of the GNU General Public License
-  along with this program.  If not, see
-  <http://www.gnu.org/licenses/>.
-*/
-
-/**
- * unify-self-heal.c : 
- *   This file implements few functions which enables 'unify' translator 
- *  to be consistent in its behaviour when 
- *     > a node fails, 
- *     > a node gets added, 
- *     > a failed node comes back
- *     > a new namespace server is added (ie, an fresh namespace server).
- * 
- *  This functionality of 'unify' will enable glusterfs to support storage
- *  system failure, and maintain consistancy. This works both ways, ie, when
- *  an entry (either file or directory) is found on namespace server, and not
- *  on storage nodes, its created in storage nodes and vica-versa.
- * 
- *  The two fops, where it can be implemented are 'getdents ()' and 'lookup ()'
- *
- */
-
-#ifndef _CONFIG_H
-#define _CONFIG_H
-#include "config.h"
-#endif
-
-#include "glusterfs.h"
-#include "unify.h"
-#include "dict.h"
-#include "xlator.h"
-#include "hashfn.h"
-#include "logging.h"
-#include "stack.h"
-#include "common-utils.h"
-
-int32_t
-unify_sh_getdents_cbk (call_frame_t *frame,
-		       void *cookie,
-		       xlator_t *this,
-		       int32_t op_ret,
-		       int32_t op_errno,
-		       dir_entry_t *entry,
-		       int32_t count);
-
-int32_t
-unify_sh_ns_getdents_cbk (call_frame_t *frame,
-			  void *cookie,
-			  xlator_t *this,
-			  int32_t op_ret,
-			  int32_t op_errno,
-			  dir_entry_t *entry,
-			  int32_t count);
-
-int32_t
-unify_bgsh_getdents_cbk (call_frame_t *frame,
-			 void *cookie,
-			 xlator_t *this,
-			 int32_t op_ret,
-			 int32_t op_errno,
-			 dir_entry_t *entry,
-			 int32_t count);
-
-int32_t
-unify_bgsh_ns_getdents_cbk (call_frame_t *frame,
-			    void *cookie,
-			    xlator_t *this,
-			    int32_t op_ret,
-			    int32_t op_errno,
-			    dir_entry_t *entry,
-			    int32_t count);
-
-/**
- * unify_local_wipe - free all the extra allocation of local->* here.
- */
-static void 
-unify_local_wipe (unify_local_t *local)
-{
-	/* Free the strdup'd variables in the local structure */
-	if (local->name) {
-		FREE (local->name);
-	}
-
-	if (local->sh_struct) {
-		if (local->sh_struct->offset_list)
-			FREE (local->sh_struct->offset_list);
-
-		if (local->sh_struct->entry_list)
-			FREE (local->sh_struct->entry_list);
-
-		if (local->sh_struct->count_list)
-			FREE (local->sh_struct->count_list);
-
-		FREE (local->sh_struct);
-	}
-
-	loc_wipe (&local->loc1);
-	loc_wipe (&local->loc2);
-}
-
-int32_t 
-unify_sh_setdents_cbk (call_frame_t *frame,
-		       void *cookie,
-		       xlator_t *this,
-		       int32_t op_ret,
-		       int32_t op_errno)
-{
-	int32_t callcnt = -1;
-	unify_local_t *local = frame->local;
-	inode_t *inode = NULL;
-	dict_t *tmp_dict = NULL;
-	dir_entry_t *prev, *entry, *trav;
-
-	LOCK (&frame->lock);
-	{
-		/* if local->call_count == 0, that means, setdents on 
-		 * storagenodes is still pending.
-		 */
-		if (local->call_count)
-			callcnt = --local->call_count;
-	}
-	UNLOCK (&frame->lock);
-
-	if (callcnt == 0) {
-		if (local->sh_struct->entry_list[0]) {
-			prev = entry = local->sh_struct->entry_list[0];
-			if (!entry)
-				return 0;
-			trav = entry->next;
-			while (trav) {
-				prev->next = trav->next;
-				FREE (trav->name);
-				if (S_ISLNK (trav->buf.st_mode))
-					FREE (trav->link);
-				FREE (trav);
-				trav = prev->next;
-			}
-			FREE (entry);
-		}
-
-		if (!local->flags) {
-			if (local->sh_struct->count_list[0] >= 
-			    UNIFY_SELF_HEAL_GETDENTS_COUNT) {
-				/* count == size, that means, there are more entries
-				   to read from */
-				//local->call_count = 0;
-				local->sh_struct->offset_list[0] += 
-					UNIFY_SELF_HEAL_GETDENTS_COUNT;
-				STACK_WIND (frame,
-					    unify_sh_ns_getdents_cbk,
-					    NS(this),
-					    NS(this)->fops->getdents,
-					    local->fd,
-					    UNIFY_SELF_HEAL_GETDENTS_COUNT,
-					    local->sh_struct->offset_list[0],
-					    GF_GET_DIR_ONLY);
-			}		
-		} else {
-			inode = local->loc1.inode;
-			fd_unref (local->fd);
-			tmp_dict = local->dict;
-
-			unify_local_wipe (local);
-			
-			STACK_UNWIND (frame, local->op_ret, local->op_errno, 
-				      inode, &local->stbuf, local->dict);
-			if (tmp_dict)
-				dict_unref (local->dict);
-		}
-	}
-  
-	return 0;
-}
-
-
-int32_t
-unify_sh_ns_getdents_cbk (call_frame_t *frame,
-			  void *cookie,
-			  xlator_t *this,
-			  int32_t op_ret,
-			  int32_t op_errno,
-			  dir_entry_t *entry,
-			  int32_t count)
-{
-	unify_local_t *local = frame->local;
-	unify_private_t *priv = this->private;
-	long index = 0;
-	unsigned long final = 0;
-	dir_entry_t *tmp = CALLOC (1, sizeof (dir_entry_t));
-	
-	local->sh_struct->entry_list[0] = tmp;
-	local->sh_struct->count_list[0] = count;
-	if (entry) {
-		tmp->next = entry->next;
-		entry->next = NULL;
-	}
-
-	if ((count < UNIFY_SELF_HEAL_GETDENTS_COUNT) || !entry) {
-		final = 1;
-	}
-
-	LOCK (&frame->lock);
-	{
-		/* local->call_count will be '0' till now. make it 1 so, it 
-		   can be UNWIND'ed for the last call. */
-		local->call_count = priv->child_count;
-		if (final)
-			local->flags = 1;
-	}
-	UNLOCK (&frame->lock);
-
-	for (index = 0; index < priv->child_count; index++) 
-	{
-		STACK_WIND_COOKIE (frame,
-				   unify_sh_setdents_cbk, 
-				   (void *)index,
-				   priv->xl_array[index],
-				   priv->xl_array[index]->fops->setdents,
-				   local->fd, GF_SET_DIR_ONLY,
-				   local->sh_struct->entry_list[0], count);
-	}
-
-	return 0;
-}
-
-int32_t 
-unify_sh_ns_setdents_cbk (call_frame_t *frame,
-			  void *cookie,
-			  xlator_t *this,
-			  int32_t op_ret,
-			  int32_t op_errno)
-{
-	int32_t callcnt = -1;
-	unify_local_t *local = frame->local;
-	unify_private_t *priv = this->private;
-	long index = (long)cookie;
-	dir_entry_t *prev, *entry, *trav;
-
-	LOCK (&frame->lock);
-	{
-		if (local->sh_struct->entry_list[index]) {
-			prev = entry = local->sh_struct->entry_list[index];
-			trav = entry->next;
-			while (trav) {
-				prev->next = trav->next;
-				FREE (trav->name);
-				if (S_ISLNK (trav->buf.st_mode))
-					FREE (trav->link);
-				FREE (trav);
-				trav = prev->next;
-			}
-			FREE (entry);
-		}
-	}
-	UNLOCK (&frame->lock);
-
-	if (local->sh_struct->count_list[index] < 
-	    UNIFY_SELF_HEAL_GETDENTS_COUNT) {
-		LOCK (&frame->lock);
-		{
-			callcnt = --local->call_count;
-		}
-		UNLOCK (&frame->lock);
-	} else {
-		/* count == size, that means, there are more entries 
-		   to read from */
-		local->sh_struct->offset_list[index] += 
-			UNIFY_SELF_HEAL_GETDENTS_COUNT;
-		STACK_WIND_COOKIE (frame,
-				   unify_sh_getdents_cbk,
-				   cookie,
-				   priv->xl_array[index],
-				   priv->xl_array[index]->fops->getdents,
-				   local->fd,
-				   UNIFY_SELF_HEAL_GETDENTS_COUNT,
-				   local->sh_struct->offset_list[index],
-				   GF_GET_ALL);
-    
-		gf_log (this->name, GF_LOG_DEBUG, 
-			"readdir on (%s) with offset %"PRId64"", 
-			priv->xl_array[index]->name, 
-			local->sh_struct->offset_list[index]);
-	}
-
-	if (!callcnt) {
-		/* All storage nodes have done unified setdents on NS node.
-		 * Now, do getdents from NS and do setdents on storage nodes.
-		 */
-    
-		/* sh_struct->offset_list is no longer required for
-		   storage nodes now */
-		local->sh_struct->offset_list[0] = 0; /* reset */
-
-		STACK_WIND (frame,
-			    unify_sh_ns_getdents_cbk,
-			    NS(this),
-			    NS(this)->fops->getdents,
-			    local->fd,
-			    UNIFY_SELF_HEAL_GETDENTS_COUNT,
-			    0, /* In this call, do send '0' as offset */
-			    GF_GET_DIR_ONLY);
-	}
-
-	return 0;
-}
-
-
-/**
- * unify_sh_getdents_cbk -
- */
-int32_t
-unify_sh_getdents_cbk (call_frame_t *frame,
-		       void *cookie,
-		       xlator_t *this,
-		       int32_t op_ret,
-		       int32_t op_errno,
-		       dir_entry_t *entry,
-		       int32_t count)
-{
-	int32_t callcnt = -1;
-	unify_local_t *local = frame->local;
-	unify_private_t *priv = this->private;
-	long index = (long)cookie;
-	dir_entry_t *tmp = NULL; 
-
-	if (op_ret >= 0 && count > 0) {
-		/* There is some dentry found, just send the dentry to NS */
-		tmp = CALLOC (1, sizeof (dir_entry_t));
-		local->sh_struct->entry_list[index] = tmp;
-		local->sh_struct->count_list[index] = count;
-		if (entry) {
-			tmp->next = entry->next;
-			entry->next = NULL;
-		}
-		STACK_WIND_COOKIE (frame,
-				   unify_sh_ns_setdents_cbk,
-				   cookie,
-				   NS(this),
-				   NS(this)->fops->setdents,
-				   local->fd,
-				   GF_SET_IF_NOT_PRESENT,
-				   local->sh_struct->entry_list[index],
-				   count);
-		return 0;
-	}
-  
-	if (count < UNIFY_SELF_HEAL_GETDENTS_COUNT) {
-		LOCK (&frame->lock);
-		{
-			callcnt = --local->call_count;
-		}
-		UNLOCK (&frame->lock);
-	} else {
-		/* count == size, that means, there are more entries 
-		   to read from */
-		local->sh_struct->offset_list[index] += 
-			UNIFY_SELF_HEAL_GETDENTS_COUNT;
-		STACK_WIND_COOKIE (frame,
-				   unify_sh_getdents_cbk,
-				   cookie,
-				   priv->xl_array[index],
-				   priv->xl_array[index]->fops->getdents,
-				   local->fd,
-				   UNIFY_SELF_HEAL_GETDENTS_COUNT,
-				   local->sh_struct->offset_list[index],
-				   GF_GET_ALL);
-    
-		gf_log (this->name, GF_LOG_DEBUG, 
-			"readdir on (%s) with offset %"PRId64"", 
-			priv->xl_array[index]->name, 
-			local->sh_struct->offset_list[index]);
-	}
-
-	if (!callcnt) {
-		/* All storage nodes have done unified setdents on NS node.
-		 * Now, do getdents from NS and do setdents on storage nodes.
-		 */
-    
-		/* sh_struct->offset_list is no longer required for
-		   storage nodes now */
-		local->sh_struct->offset_list[0] = 0; /* reset */
-
-		STACK_WIND (frame,
-			    unify_sh_ns_getdents_cbk,
-			    NS(this),
-			    NS(this)->fops->getdents,
-			    local->fd,
-			    UNIFY_SELF_HEAL_GETDENTS_COUNT,
-			    0, /* In this call, do send '0' as offset */
-			    GF_GET_DIR_ONLY);
-	}
-
-	return 0;
-}
-
-/**
- * unify_sh_opendir_cbk -
- *
- * @cookie: 
- */
-int32_t 
-unify_sh_opendir_cbk (call_frame_t *frame,
-		      void *cookie,
-		      xlator_t *this,
-		      int32_t op_ret,
-		      int32_t op_errno,
-		      fd_t *fd)
-{
-	int32_t callcnt = 0;
-	unify_local_t *local = frame->local;
-	unify_private_t *priv = this->private;
-	int16_t index = 0;
-	inode_t *inode = NULL;
-	dict_t *tmp_dict = NULL;
-
-	LOCK (&frame->lock);
-	{
-		callcnt = --local->call_count;
-    
-		if (op_ret >= 0) {
-			local->op_ret = op_ret;
-		} else {
-			gf_log (this->name, GF_LOG_WARNING, "failed");
-			local->failed = 1;
-		}
-	}
-	UNLOCK (&frame->lock);
-  
-	if (!callcnt) {
-		local->call_count = priv->child_count + 1;
-    
-		if (!local->failed) {
-			/* send getdents() namespace after finishing
-			   storage nodes */
-			local->call_count--; 
-      
-			fd_bind (fd);
-
-			if (local->call_count) {
-				/* Used as the offset index. This list keeps
-				 * track of offset sent to each node during
-				 * STACK_WIND.
-				 */
-				local->sh_struct->offset_list = 
-					calloc (priv->child_count, 
-						sizeof (off_t));
-				ERR_ABORT (local->sh_struct->offset_list);
-	
-				local->sh_struct->entry_list = 
-					calloc (priv->child_count, 
-						sizeof (dir_entry_t *));
-				ERR_ABORT (local->sh_struct->entry_list);
-
-				local->sh_struct->count_list = 
-					calloc (priv->child_count, 
-						sizeof (int));
-				ERR_ABORT (local->sh_struct->count_list);
-
-				/* Send getdents on all the fds */
-				for (index = 0; 
-				     index < priv->child_count; index++) {
-					STACK_WIND_COOKIE (frame,
-							   unify_sh_getdents_cbk,
-							   (void *)(long)index,
-							   priv->xl_array[index],
-							   priv->xl_array[index]->fops->getdents,
-							   local->fd,
-							   UNIFY_SELF_HEAL_GETDENTS_COUNT,
-							   0, /* In this call, do send '0' as offset */
-							   GF_GET_ALL);
-				}
-
-				/* did stack wind, so no need to unwind here */
-				return 0;
-			} /* (local->call_count) */
-		} /* (!local->failed) */
-
-		/* Opendir failed on one node. */ 
-		inode = local->loc1.inode;
-		fd_unref (local->fd);
-		tmp_dict = local->dict;
-
-		unify_local_wipe (local);
-		/* Only 'self-heal' failed, lookup() was successful. */
-		local->op_ret = 0;
-
-		/* This is lookup_cbk ()'s UNWIND. */
-		STACK_UNWIND (frame, local->op_ret, local->op_errno, inode,
-			      &local->stbuf, local->dict);
-		if (tmp_dict)
-			dict_unref (tmp_dict);
-	}
-
-	return 0;
-}
-
-/**
- * gf_sh_checksum_cbk - 
- * 
- * @frame: frame used in lookup. get a copy of it, and use that copy.
- * @this: pointer to unify xlator.
- * @inode: pointer to inode, for which the consistency check is required.
- *
- */
-int32_t 
-unify_sh_checksum_cbk (call_frame_t *frame,
-		       void *cookie,
-		       xlator_t *this,
-		       int32_t op_ret,
-		       int32_t op_errno,
-		       uint8_t *file_checksum,
-		       uint8_t *dir_checksum)
-{
-	unify_local_t *local = frame->local;
-	unify_private_t *priv = this->private;
-	int16_t index = 0;
-	int32_t callcnt = 0;
-	inode_t *inode = NULL;
-	dict_t *tmp_dict = NULL;
-
-	LOCK (&frame->lock);
-	{
-		callcnt = --local->call_count;
-		if (op_ret >= 0) {
-			if (NS(this) == (xlator_t *)cookie) {
-				memcpy (local->sh_struct->ns_file_checksum, 
-					file_checksum, ZR_FILENAME_MAX);
-				memcpy (local->sh_struct->ns_dir_checksum, 
-					dir_checksum, ZR_FILENAME_MAX);
-			} else {
-				if (local->entry_count == 0) {
-					/* Initialize the dir_checksum to be 
-					 * used for comparision with other
-					 * storage nodes. Should be done for
-					 * the first successful call *only*. 
-					 */
-                                        /* Using 'entry_count' as a flag */
-					local->entry_count = 1;
-					memcpy (local->sh_struct->dir_checksum,
-						dir_checksum, ZR_FILENAME_MAX);
-				}
-
-				/* Reply from the storage nodes */
-				for (index = 0; 
-				     index < ZR_FILENAME_MAX; index++) {
-					/* Files should be present in
-					   only one node */
-					local->sh_struct->file_checksum[index] ^= file_checksum[index];
-	  
-					/* directory structure should be
-					   same accross */
-					if (local->sh_struct->dir_checksum[index] != dir_checksum[index])
-						local->failed = 1;
-				}
-			}
-		} 
-	}
-	UNLOCK (&frame->lock);
-
-	if (!callcnt) {
-		for (index = 0; index < ZR_FILENAME_MAX ; index++) {
-			if (local->sh_struct->file_checksum[index] != 
-			    local->sh_struct->ns_file_checksum[index]) {
-				local->failed = 1;
-				break;
-			}
-			if (local->sh_struct->dir_checksum[index] != 
-			    local->sh_struct->ns_dir_checksum[index]) {
-				local->failed = 1;
-				break;
-			}
-		}
-	
-		if (local->failed) {
-			/* Log it, it should be a rare event */
-			gf_log (this->name, GF_LOG_WARNING, 
-				"Self-heal triggered on directory %s", 
-				local->loc1.path);
-
-			/* Any self heal will be done at directory level */
-			local->call_count = 0;
-			local->op_ret = -1;
-			local->failed = 0;
-      
-			local->fd = fd_create (local->loc1.inode, 
-					       frame->root->pid);
-
-			local->call_count = priv->child_count + 1;
-	
-			for (index = 0; 
-			     index < (priv->child_count + 1); index++) {
-				STACK_WIND_COOKIE (frame,
-						   unify_sh_opendir_cbk,
-						   priv->xl_array[index]->name,
-						   priv->xl_array[index],
-						   priv->xl_array[index]->fops->opendir,
-						   &local->loc1,
-						   local->fd);
-			}
-			/* opendir can be done on the directory */
-			return 0;
-		}
-
-		/* no mismatch */
-		inode = local->loc1.inode;
-		tmp_dict = local->dict;
-
-		unify_local_wipe (local);
-
-		/* This is lookup_cbk ()'s UNWIND. */
-		STACK_UNWIND (frame,
-			      local->op_ret,
-			      local->op_errno,
-			      inode,
-			      &local->stbuf,
-			      local->dict);
-		if (tmp_dict)
-			dict_unref (tmp_dict);
-	}
-
-	return 0;
-}
-
-/* Foreground self-heal part over */
-
-/* Background self-heal part */
-
-int32_t 
-unify_bgsh_setdents_cbk (call_frame_t *frame,
-			 void *cookie,
-			 xlator_t *this,
-			 int32_t op_ret,
-			 int32_t op_errno)
-{
-	int32_t callcnt = -1;
-	unify_local_t *local = frame->local;
-	dir_entry_t *prev, *entry, *trav;
-
-	LOCK (&frame->lock);
-	{
-		/* if local->call_count == 0, that means, setdents 
-		   on storagenodes is still pending. */
-		if (local->call_count)
-			callcnt = --local->call_count;
-	}
-	UNLOCK (&frame->lock);
-
-
-	if (callcnt == 0) {
-		if (local->sh_struct->entry_list[0]) {
-			prev = entry = local->sh_struct->entry_list[0];
-			trav = entry->next;
-			while (trav) {
-				prev->next = trav->next;
-				FREE (trav->name);
-				if (S_ISLNK (trav->buf.st_mode))
-					FREE (trav->link);
-				FREE (trav);
-				trav = prev->next;
-			}
-			FREE (entry);
-		}
-
-		if (!local->flags) {
-			if (local->sh_struct->count_list[0] >= 
-			    UNIFY_SELF_HEAL_GETDENTS_COUNT) {
-				/* count == size, that means, there are more
-				   entries to read from */
-				//local->call_count = 0;
-				local->sh_struct->offset_list[0] += 
-					UNIFY_SELF_HEAL_GETDENTS_COUNT;
-				STACK_WIND (frame,
-					    unify_bgsh_ns_getdents_cbk,
-					    NS(this),
-					    NS(this)->fops->getdents,
-					    local->fd,
-					    UNIFY_SELF_HEAL_GETDENTS_COUNT,
-					    local->sh_struct->offset_list[0],
-					    GF_GET_DIR_ONLY);
-			}		
-		} else {
-			fd_unref (local->fd);
-			unify_local_wipe (local);
-			STACK_DESTROY (frame->root);
-		}
-	}
-
-	return 0;
-}
-
-
-int32_t
-unify_bgsh_ns_getdents_cbk (call_frame_t *frame,
-			    void *cookie,
-			    xlator_t *this,
-			    int32_t op_ret,
-			    int32_t op_errno,
-			    dir_entry_t *entry,
-			    int32_t count)
-{
-	unify_local_t *local = frame->local;
-	unify_private_t *priv = this->private;
-	long index = 0;
-	unsigned long final = 0;
-	dir_entry_t *tmp = CALLOC (1, sizeof (dir_entry_t));
-
-	local->sh_struct->entry_list[0] = tmp;
-	local->sh_struct->count_list[0] = count;
-	if (entry) {
-		tmp->next = entry->next;
-		entry->next = NULL;
-	}
-
-	if ((count < UNIFY_SELF_HEAL_GETDENTS_COUNT) || !entry) {
-		final = 1;
-	}
-
-	LOCK (&frame->lock);
-	{
-		/* local->call_count will be '0' till now. make it 1 so, 
-		   it can be UNWIND'ed for the last call. */
-		local->call_count = priv->child_count;
-		if (final)
-			local->flags = 1;
-	}
-	UNLOCK (&frame->lock);
-
-	for (index = 0; index < priv->child_count; index++) 
-	{
-		STACK_WIND_COOKIE (frame,
-				   unify_bgsh_setdents_cbk, 
-				   (void *)index,
-				   priv->xl_array[index],
-				   priv->xl_array[index]->fops->setdents,
-				   local->fd, GF_SET_DIR_ONLY,
-				   local->sh_struct->entry_list[0], count);
-	}
-
-	return 0;
-}
-
-int32_t 
-unify_bgsh_ns_setdents_cbk (call_frame_t *frame,
-			    void *cookie,
-			    xlator_t *this,
-			    int32_t op_ret,
-			    int32_t op_errno)
-{
-	int32_t callcnt = -1;
-	unify_local_t *local = frame->local;
-	unify_private_t *priv = this->private;
-	long index = (long)cookie;
-	dir_entry_t *prev, *entry, *trav;
-
-	if (local->sh_struct->entry_list[index]) {
-		prev = entry = local->sh_struct->entry_list[index];
-		if (!entry)
-			return 0;
-		trav = entry->next;
-		while (trav) {
-			prev->next = trav->next;
-			FREE (trav->name);
-			if (S_ISLNK (trav->buf.st_mode))
-				FREE (trav->link);
-			FREE (trav);
-			trav = prev->next;
-		}
-		FREE (entry);
-	}
-
-	if (local->sh_struct->count_list[index] < 
-	    UNIFY_SELF_HEAL_GETDENTS_COUNT) {
-		LOCK (&frame->lock);
-		{
-			callcnt = --local->call_count;
-		}
-		UNLOCK (&frame->lock);
-	} else {
-		/* count == size, that means, there are more entries 
-		   to read from */
-		local->sh_struct->offset_list[index] += 
-			UNIFY_SELF_HEAL_GETDENTS_COUNT;
-		STACK_WIND_COOKIE (frame,
-				   unify_bgsh_getdents_cbk,
-				   cookie,
-				   priv->xl_array[index],
-				   priv->xl_array[index]->fops->getdents,
-				   local->fd,
-				   UNIFY_SELF_HEAL_GETDENTS_COUNT,
-				   local->sh_struct->offset_list[index],
-				   GF_GET_ALL);
-    
-		gf_log (this->name, GF_LOG_DEBUG, 
-			"readdir on (%s) with offset %"PRId64"", 
-			priv->xl_array[index]->name, 
-			local->sh_struct->offset_list[index]);
-	}
-
-	if (!callcnt) {
-		/* All storage nodes have done unified setdents on NS node.
-		 * Now, do getdents from NS and do setdents on storage nodes.
-		 */
-    
-		/* sh_struct->offset_list is no longer required for
-		   storage nodes now */
-		local->sh_struct->offset_list[0] = 0; /* reset */
-
-		STACK_WIND (frame,
-			    unify_bgsh_ns_getdents_cbk,
-			    NS(this),
-			    NS(this)->fops->getdents,
-			    local->fd,
-			    UNIFY_SELF_HEAL_GETDENTS_COUNT,
-			    0, /* In this call, do send '0' as offset */
-			    GF_GET_DIR_ONLY);
-	}
-
-	return 0;
-}
-
-
-/**
- * unify_bgsh_getdents_cbk -
- */
-int32_t
-unify_bgsh_getdents_cbk (call_frame_t *frame,
-			 void *cookie,
-			 xlator_t *this,
-			 int32_t op_ret,
-			 int32_t op_errno,
-			 dir_entry_t *entry,
-			 int32_t count)
-{
-	int32_t callcnt = -1;
-	unify_local_t *local = frame->local;
-	unify_private_t *priv = this->private;
-	long index = (long)cookie;
-	dir_entry_t *tmp = NULL; 
-
-	if (op_ret >= 0 && count > 0) {
-		/* There is some dentry found, just send the dentry to NS */
-		tmp = CALLOC (1, sizeof (dir_entry_t));
-		local->sh_struct->entry_list[index] = tmp;
-		local->sh_struct->count_list[index] = count;
-		if (entry) {
-			tmp->next = entry->next;
-			entry->next = NULL;
-		}
-		STACK_WIND_COOKIE (frame,
-				   unify_bgsh_ns_setdents_cbk,
-				   cookie,
-				   NS(this),
-				   NS(this)->fops->setdents,
-				   local->fd,
-				   GF_SET_IF_NOT_PRESENT,
-				   local->sh_struct->entry_list[index],
-				   count);
-		return 0;
-	}
-  
-	if (count < UNIFY_SELF_HEAL_GETDENTS_COUNT) {
-		LOCK (&frame->lock);
-		{
-			callcnt = --local->call_count;
-		}
-		UNLOCK (&frame->lock);
-	} else {
-		/* count == size, that means, there are more entries to read from */
-		local->sh_struct->offset_list[index] += 
-			UNIFY_SELF_HEAL_GETDENTS_COUNT;
-
-		STACK_WIND_COOKIE (frame,
-				   unify_bgsh_getdents_cbk,
-				   cookie,
-				   priv->xl_array[index],
-				   priv->xl_array[index]->fops->getdents,
-				   local->fd,
-				   UNIFY_SELF_HEAL_GETDENTS_COUNT,
-				   local->sh_struct->offset_list[index],
-				   GF_GET_ALL);
-    
-		gf_log (this->name, GF_LOG_DEBUG, 
-			"readdir on (%s) with offset %"PRId64"", 
-			priv->xl_array[index]->name, 
-			local->sh_struct->offset_list[index]);
-	}
-
-	if (!callcnt) {
-		/* All storage nodes have done unified setdents on NS node.
-		 * Now, do getdents from NS and do setdents on storage nodes.
-		 */
-    
-		/* sh_struct->offset_list is no longer required for 
-		   storage nodes now */
-		local->sh_struct->offset_list[0] = 0; /* reset */
-
-		STACK_WIND (frame,
-			    unify_bgsh_ns_getdents_cbk,
-			    NS(this),
-			    NS(this)->fops->getdents,
-			    local->fd,
-			    UNIFY_SELF_HEAL_GETDENTS_COUNT,
-			    0, /* In this call, do send '0' as offset */
-			    GF_GET_DIR_ONLY);
-	}
-
-	return 0;
-}
-
-/**
- * unify_bgsh_opendir_cbk -
- *
- * @cookie: 
- */
-int32_t 
-unify_bgsh_opendir_cbk (call_frame_t *frame,
-			void *cookie,
-			xlator_t *this,
-			int32_t op_ret,
-			int32_t op_errno,
-			fd_t *fd)
-{
-	unify_local_t *local = frame->local;
-	unify_private_t *priv = this->private;
-	int32_t callcnt = 0;
-	int16_t index = 0;
-
-	LOCK (&frame->lock);
-	{
-		callcnt = --local->call_count;
-    
-		if (op_ret >= 0) {
-			local->op_ret = op_ret;
-		} else {
-			local->failed = 1;
-		}
-	}
-	UNLOCK (&frame->lock);
-  
-	if (!callcnt) {
-		local->call_count = priv->child_count + 1;
-    
-		if (!local->failed) {
-			/* send getdents() namespace after finishing 
-			   storage nodes */
-			local->call_count--; 
-			callcnt = local->call_count;
-      
-			fd_bind (fd);
-
-			if (local->call_count) {
-				/* Used as the offset index. This list keeps 
-				   track of offset sent to each node during 
-				   STACK_WIND. */
-				local->sh_struct->offset_list = 
-					calloc (priv->child_count, 
-						sizeof (off_t));
-				ERR_ABORT (local->sh_struct->offset_list);
-	
-				local->sh_struct->entry_list = 
-					calloc (priv->child_count, 
-						sizeof (dir_entry_t *));
-				ERR_ABORT (local->sh_struct->entry_list);
-
-				local->sh_struct->count_list = 
-					calloc (priv->child_count, 
-						sizeof (int));
-				ERR_ABORT (local->sh_struct->count_list);
-
-				/* Send getdents on all the fds */
-				for (index = 0; 
-				     index < priv->child_count; index++) {
-					STACK_WIND_COOKIE (frame,
-							   unify_bgsh_getdents_cbk,
-							   (void *)(long)index,
-							   priv->xl_array[index],
-							   priv->xl_array[index]->fops->getdents,
-							   local->fd,
-							   UNIFY_SELF_HEAL_GETDENTS_COUNT,
-							   0, /* In this call, do send '0' as offset */
-							   GF_GET_ALL);
-				}
-				/* did a stack wind, so no need to unwind here */
-				return 0;
-			} /* (local->call_count) */
-		} /* (!local->failed) */
-
-		/* Opendir failed on one node. 	 */
-		fd_unref (local->fd);
-		
-		unify_local_wipe (local);
-		STACK_DESTROY (frame->root);
-	}
-
-	return 0;
-}
-
-/**
- * gf_bgsh_checksum_cbk - 
- * 
- * @frame: frame used in lookup. get a copy of it, and use that copy.
- * @this: pointer to unify xlator.
- * @inode: pointer to inode, for which the consistency check is required.
- *
- */
-int32_t 
-unify_bgsh_checksum_cbk (call_frame_t *frame,
-			 void *cookie,
-			 xlator_t *this,
-			 int32_t op_ret,
-			 int32_t op_errno,
-			 uint8_t *file_checksum,
-			 uint8_t *dir_checksum)
-{
-	unify_local_t *local = frame->local;
-	unify_private_t *priv = this->private;
-	int16_t index = 0;
-	int32_t callcnt = 0;
-  
-	LOCK (&frame->lock);
-	{
-		callcnt = --local->call_count;
-		if (op_ret >= 0) {
-			if (NS(this) == (xlator_t *)cookie) {
-				memcpy (local->sh_struct->ns_file_checksum, 
-					file_checksum, ZR_FILENAME_MAX);
-				memcpy (local->sh_struct->ns_dir_checksum, 
-					dir_checksum, ZR_FILENAME_MAX);
-			} else {
-				if (local->entry_count == 0) {
-					/* Initialize the dir_checksum to be 
-					 * used for comparision with other 
-					 * storage nodes. Should be done for
-					 * the first successful call *only*. 
-					 */
-					/* Using 'entry_count' as a flag */
-					local->entry_count = 1; 
-					memcpy (local->sh_struct->dir_checksum,
-						dir_checksum, ZR_FILENAME_MAX);
-				}
-
-				/* Reply from the storage nodes */
-				for (index = 0; 
-				     index < ZR_FILENAME_MAX; index++) {
-					/* Files should be present in only 
-					   one node */
-					local->sh_struct->file_checksum[index] ^= file_checksum[index];
-	  
-					/* directory structure should be same 
-					   accross */
-					if (local->sh_struct->dir_checksum[index] != dir_checksum[index])
-						local->failed = 1;
-				}
-			}
-		} 
-	}
-	UNLOCK (&frame->lock);
-
-	if (!callcnt) {
-		for (index = 0; index < ZR_FILENAME_MAX ; index++) {
-			if (local->sh_struct->file_checksum[index] != 
-			    local->sh_struct->ns_file_checksum[index]) {
-				local->failed = 1;
-				break;
-			}
-			if (local->sh_struct->dir_checksum[index] != 
-			    local->sh_struct->ns_dir_checksum[index]) {
-				local->failed = 1;
-				break;
-			}
-		}
-	
-		if (local->failed) {
-			/* Log it, it should be a rare event */
-			gf_log (this->name, GF_LOG_WARNING, 
-				"Self-heal triggered on directory %s", 
-				local->loc1.path);
-
-			/* Any self heal will be done at the directory level */
-			local->op_ret = -1;
-			local->failed = 0;
-      
-			local->fd = fd_create (local->loc1.inode, 
-					       frame->root->pid);
-			local->call_count = priv->child_count + 1;
-	
-			for (index = 0; 
-			     index < (priv->child_count + 1); index++) {
-				STACK_WIND_COOKIE (frame,
-						   unify_bgsh_opendir_cbk,
-						   priv->xl_array[index]->name,
-						   priv->xl_array[index],
-						   priv->xl_array[index]->fops->opendir,
-						   &local->loc1,
-						   local->fd);
-			}
-      
-			/* opendir can be done on the directory */
-			return 0;
-		}
-
-		/* no mismatch */
-		unify_local_wipe (local);
-		STACK_DESTROY (frame->root);
-	}
-
-	return 0;
-}
-
-/* Background self-heal part over */
-
-
-
-
-/**
- * zr_unify_self_heal - 
- * 
- * @frame: frame used in lookup. get a copy of it, and use that copy.
- * @this: pointer to unify xlator.
- * @inode: pointer to inode, for which the consistency check is required.
- *
- */
-int32_t 
-zr_unify_self_heal (call_frame_t *frame,
-		    xlator_t *this,
-		    unify_local_t *local)
-{
-	unify_private_t *priv = this->private;
-	call_frame_t *bg_frame = NULL;
-	unify_local_t *bg_local = NULL;
-	inode_t *tmp_inode = NULL;
-	dict_t *tmp_dict = NULL;
-	int16_t index = 0;
-  
-	if (local->inode_generation < priv->inode_generation) {
-		/* Any self heal will be done at the directory level */
-		/* Update the inode's generation to the current generation
-		   value. */
-		local->inode_generation = priv->inode_generation;
-		inode_ctx_put (local->loc1.inode, this, 
-			  (uint64_t)(long)local->inode_generation);
-
-		if (priv->self_heal == ZR_UNIFY_FG_SELF_HEAL) {
-			local->op_ret = 0;
-			local->failed = 0;
-			local->call_count = priv->child_count + 1;
-			local->sh_struct = 
-				calloc (1, sizeof (struct unify_self_heal_struct));
-      
-			/* +1 is for NS */
-			for (index = 0; 
-			     index < (priv->child_count + 1); index++) {
-				STACK_WIND_COOKIE (frame,
-						   unify_sh_checksum_cbk,
-						   priv->xl_array[index],
-						   priv->xl_array[index],
-						   priv->xl_array[index]->fops->checksum,
-						   &local->loc1,
-						   0);
-			}
-
-			/* Self-heal in foreground, hence no need 
-			   to UNWIND here */
-			return 0;
-		}
-
-		/* Self Heal done in background */
-		bg_frame = copy_frame (frame);
-		INIT_LOCAL (bg_frame, bg_local);
-		loc_copy (&bg_local->loc1, &local->loc1);
-		bg_local->op_ret = 0;
-		bg_local->failed = 0;
-		bg_local->call_count = priv->child_count + 1;
-		bg_local->sh_struct = 
-			calloc (1, sizeof (struct unify_self_heal_struct));
-    
-		/* +1 is for NS */
-		for (index = 0; index < (priv->child_count + 1); index++) {
-			STACK_WIND_COOKIE (bg_frame,
-					   unify_bgsh_checksum_cbk,
-					   priv->xl_array[index],
-					   priv->xl_array[index],
-					   priv->xl_array[index]->fops->checksum,
-					   &bg_local->loc1,
-					   0);
-		}
-	}
-
-	/* generation number matches, self heal already done or
-	 * self heal done in background: just do STACK_UNWIND 
-	 */
-	tmp_inode = local->loc1.inode;
-	tmp_dict = local->dict;
-
-	unify_local_wipe (local);
-
-	/* This is lookup_cbk ()'s UNWIND. */
-	STACK_UNWIND (frame,
-		      local->op_ret,
-		      local->op_errno,
-		      tmp_inode,
-		      &local->stbuf,
-		      local->dict);
-
-	if (tmp_dict)
-		dict_unref (tmp_dict);
-
-	return 0;
-}
-
diff --git a/xlators/cluster/unify/src/unify.c b/xlators/cluster/unify/src/unify.c
deleted file mode 100644
index 6455b4f072d..00000000000
--- a/xlators/cluster/unify/src/unify.c
+++ /dev/null
@@ -1,4451 +0,0 @@
-/*
-  Copyright (c) 2006-2009 Z RESEARCH, Inc. <http://www.zresearch.com>
-  This file is part of GlusterFS.
-
-  GlusterFS is free software; you can redistribute it and/or modify
-  it under the terms of the GNU General Public License as published
-  by the Free Software Foundation; either version 3 of the License,
-  or (at your option) any later version.
-
-  GlusterFS is distributed in the hope that it will be useful, but
-  WITHOUT ANY WARRANTY; without even the implied warranty of
-  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
-  General Public License for more details.
-
-  You should have received a copy of the GNU General Public License
-  along with this program.  If not, see
-  <http://www.gnu.org/licenses/>.
-*/
-
-/**
- * xlators/cluster/unify:
- *     - This xlator is one of the main translator in GlusterFS, which
- *   actually does the clustering work of the file system. One need to 
- *   understand that, unify assumes file to be existing in only one of 
- *   the child node, and directories to be present on all the nodes. 
- *
- * NOTE:
- *   Now, unify has support for global namespace, which is used to keep a 
- * global view of fs's namespace tree. The stat for directories are taken
- * just from the namespace, where as for files, just 'st_ino' is taken from
- * Namespace node, and other stat info is taken from the actual storage node.
- * Also Namespace node helps to keep consistant inode for files across 
- * glusterfs (re-)mounts.
- */
-
-#ifndef _CONFIG_H
-#define _CONFIG_H
-#include "config.h"
-#endif
-
-#include "glusterfs.h"
-#include "unify.h"
-#include "dict.h"
-#include "xlator.h"
-#include "hashfn.h"
-#include "logging.h"
-#include "stack.h"
-#include "defaults.h"
-#include "common-utils.h"
-#include <signal.h>
-#include <libgen.h>
-#include "compat-errno.h"
-#include "compat.h"
-
-#define UNIFY_CHECK_INODE_CTX_AND_UNWIND_ON_ERR(_loc) do { \
-  if (!(_loc && _loc->inode)) {                            \
-    STACK_UNWIND (frame, -1, EINVAL, NULL, NULL, NULL);    \
-    return 0;                                              \
-  }                                                        \
-} while(0)
-
-
-#define UNIFY_CHECK_FD_CTX_AND_UNWIND_ON_ERR(_fd) do { \
-  if (!(_fd && !fd_ctx_get (_fd, this, NULL))) {       \
-    STACK_UNWIND (frame, -1, EBADFD, NULL, NULL);      \
-    return 0;                                          \
-  }                                                    \
-} while(0)
-
-#define UNIFY_CHECK_FD_AND_UNWIND_ON_ERR(_fd) do { \
-  if (!_fd) {                                      \
-    STACK_UNWIND (frame, -1, EBADFD, NULL, NULL);  \
-    return 0;                                      \
-  }                                                \
-} while(0)
-
-/**
- * unify_local_wipe - free all the extra allocation of local->* here.
- */
-static void 
-unify_local_wipe (unify_local_t *local)
-{
-	/* Free the strdup'd variables in the local structure */
-	if (local->name) {
-		FREE (local->name);
-	}
-	loc_wipe (&local->loc1);
-	loc_wipe (&local->loc2);
-}
-
-
-
-/*
- * unify_normalize_stats -
- */
-void
-unify_normalize_stats (struct statvfs *buf,
-		       unsigned long bsize,
-		       unsigned long frsize)
-{
-	double factor;
-
-	if (buf->f_bsize != bsize) {
-		factor = ((double) buf->f_bsize) / bsize;
-		buf->f_bsize  = bsize;
-		buf->f_bfree  = (fsblkcnt_t) (factor * buf->f_bfree);
-		buf->f_bavail = (fsblkcnt_t) (factor * buf->f_bavail);
-	}
-  
-	if (buf->f_frsize != frsize) {
-		factor = ((double) buf->f_frsize) / frsize;
-		buf->f_frsize = frsize;
-		buf->f_blocks = (fsblkcnt_t) (factor * buf->f_blocks);
-	}
-}
-
-
-xlator_t *
-unify_loc_subvol (loc_t *loc, xlator_t *this)
-{
-	unify_private_t *priv = NULL;
-	xlator_t        *subvol = NULL;
-	int16_t         *list = NULL;
-	long             index = 0;
-	xlator_t        *subvol_i = NULL;
-	int              ret = 0;
-	uint64_t         tmp_list = 0;
-
-	priv   = this->private;
-	subvol = NS (this);
-
-	if (!S_ISDIR (loc->inode->st_mode)) {
-		ret = inode_ctx_get (loc->inode, this, &tmp_list);
-		list = (int16_t *)(long)tmp_list;
-		if (!list)
-			goto out;
-
-		for (index = 0; list[index] != -1; index++) {
-			subvol_i = priv->xl_array[list[index]];
-			if (subvol_i != NS (this)) {
-				subvol = subvol_i;
-				break;
-			}
-		}
-	}
-out:
-	return subvol;
-}
-
-
-
-/**
- * unify_statfs_cbk -
- */
-int32_t
-unify_statfs_cbk (call_frame_t *frame,
-		  void *cookie,
-		  xlator_t *this,
-		  int32_t op_ret,
-		  int32_t op_errno,
-		  struct statvfs *stbuf)
-{
-	int32_t callcnt = 0;
-	struct statvfs *dict_buf = NULL;
-	unsigned long bsize;
-	unsigned long frsize;
-	unify_local_t *local = (unify_local_t *)frame->local;
-	call_frame_t *prev_frame = cookie;
-
-	LOCK (&frame->lock);
-	{
-		if (op_ret >= 0) {
-			/* when a call is successfull, add it to local->dict */
-			dict_buf = &local->statvfs_buf;
-
-			if (dict_buf->f_bsize != 0) {
-				bsize  = max (dict_buf->f_bsize, 
-					      stbuf->f_bsize);
-
-				frsize = max (dict_buf->f_frsize, 
-					      stbuf->f_frsize);
-				unify_normalize_stats(dict_buf, bsize, frsize);
-				unify_normalize_stats(stbuf, bsize, frsize);
-			} else {
-				dict_buf->f_bsize   = stbuf->f_bsize;
-				dict_buf->f_frsize  = stbuf->f_frsize;
-			}
-      
-			dict_buf->f_blocks += stbuf->f_blocks;
-			dict_buf->f_bfree  += stbuf->f_bfree;
-			dict_buf->f_bavail += stbuf->f_bavail;
-			dict_buf->f_files  += stbuf->f_files;
-			dict_buf->f_ffree  += stbuf->f_ffree;
-			dict_buf->f_favail += stbuf->f_favail;
-			dict_buf->f_fsid    = stbuf->f_fsid;
-			dict_buf->f_flag    = stbuf->f_flag;
-			dict_buf->f_namemax = stbuf->f_namemax;
-			local->op_ret = op_ret;
-		} else {
-			/* fop on storage node has failed due to some error */
-			if (op_errno != ENOTCONN) {
-				gf_log (this->name, GF_LOG_ERROR, 
-					"child(%s): %s", 
-					prev_frame->this->name, 
-					strerror (op_errno));
-			}
-			local->op_errno = op_errno;
-		}
-		callcnt = --local->call_count;
-	}
-	UNLOCK (&frame->lock);
-
-	if (!callcnt) {
-		STACK_UNWIND (frame, local->op_ret, local->op_errno, 
-			      &local->statvfs_buf);
-	}
-
-	return 0;
-}
-
-/**
- * unify_statfs -
- */
-int32_t
-unify_statfs (call_frame_t *frame,
-	      xlator_t *this,
-	      loc_t *loc)
-{
-	unify_local_t *local = NULL;
-	xlator_list_t *trav = this->children;
-
-	INIT_LOCAL (frame, local);
-	local->call_count = ((unify_private_t *)this->private)->child_count;
-
-	while(trav) {
-		STACK_WIND (frame,
-			    unify_statfs_cbk,
-			    trav->xlator,
-			    trav->xlator->fops->statfs,
-			    loc);
-		trav = trav->next;
-	}
-
-	return 0;
-}
-
-/**
- * unify_buf_cbk - 
- */
-int32_t
-unify_buf_cbk (call_frame_t *frame,
-	       void *cookie,
-	       xlator_t *this,
-	       int32_t op_ret,
-	       int32_t op_errno,
-	       struct stat *buf)
-{
-	int32_t callcnt = 0;
-	unify_private_t *priv = this->private;
-	unify_local_t *local = frame->local;
-	call_frame_t *prev_frame = cookie;
-
-	LOCK (&frame->lock);
-	{
-		callcnt = --local->call_count;
-    
-		if (op_ret == -1) {
-			gf_log (this->name, GF_LOG_ERROR,
-				"%s(): child(%s): path(%s): %s", 
-				gf_fop_list[frame->root->op],
-				prev_frame->this->name, 
-				(local->loc1.path)?local->loc1.path:"", 
-				strerror (op_errno));
-
-			local->op_errno = op_errno;
-			if ((op_errno == ENOENT) && priv->optimist) 
-				local->op_ret = 0;
-		}
-
-		if (op_ret >= 0) {
-			local->op_ret = 0;
-
-			if (NS (this) == prev_frame->this) {
-				local->st_ino = buf->st_ino;
-				/* If the entry is directory, get the stat
-				   from NS node */
-				if (S_ISDIR (buf->st_mode) || 
-				    !local->stbuf.st_blksize) {
-					local->stbuf = *buf;
-				}
-			}
-
-			if ((!S_ISDIR (buf->st_mode)) && 
-			    (NS (this) != prev_frame->this)) {
-				/* If file, take the stat info from Storage 
-				   node. */
-				local->stbuf = *buf;
-			}
-		}
-	}
-	UNLOCK (&frame->lock);
-    
-	if (!callcnt) {
-		/* If the inode number is not filled, operation should
-		   fail */
-		if (!local->st_ino)
-			local->op_ret = -1;
-
-		local->stbuf.st_ino = local->st_ino;
-		unify_local_wipe (local);
-		STACK_UNWIND (frame, local->op_ret, local->op_errno, 
-			      &local->stbuf);
-	}
-
-	return 0;
-}
-
-#define check_if_dht_linkfile(s) ((s->st_mode & ~S_IFMT) == S_ISVTX)
-
-/**
- * unify_lookup_cbk - 
- */
-int32_t 
-unify_lookup_cbk (call_frame_t *frame,
-		  void *cookie,
-		  xlator_t *this,
-		  int32_t op_ret,
-		  int32_t op_errno,
-		  inode_t *inode,
-		  struct stat *buf,
-		  dict_t *dict)
-{
-	int32_t callcnt = 0;
-	unify_private_t *priv = this->private;
-	unify_local_t *local = frame->local;
-	inode_t *tmp_inode = NULL;
-	dict_t *local_dict = NULL;
-
-	LOCK (&frame->lock);
-	{
-		callcnt = --local->call_count;
- 
-		if (op_ret == -1) {
-			if ((op_errno != ENOTCONN) && (op_errno != ENOENT)) {
-				gf_log (this->name, GF_LOG_ERROR,
-					"child(%s): path(%s): %s", 
-					priv->xl_array[(long)cookie]->name, 
-					local->loc1.path, strerror (op_errno));
-				local->op_errno = op_errno;
-				local->failed = 1;
-
-			} else if (local->revalidate && 
-				   !(priv->optimist && (op_errno == ENOENT))) {
-
-				gf_log (this->name, 
-					(op_errno == ENOTCONN) ? 
-					GF_LOG_DEBUG:GF_LOG_ERROR,
-					"child(%s): path(%s): %s", 
-					priv->xl_array[(long)cookie]->name, 
-					local->loc1.path, strerror (op_errno));
-				local->op_errno = op_errno;
-				local->failed = 1;
-			}
-		}
-
-		if (op_ret == 0) {
-			local->op_ret = 0; 
-			
-			if (check_if_dht_linkfile(buf)) {
-				gf_log (this->name, GF_LOG_CRITICAL,
-					"file %s may be DHT link file on %s, "
-					"make sure the backend is not shared "
-					"between unify and DHT", 
-					local->loc1.path, 
-					priv->xl_array[(long)cookie]->name);
-			}
-
-			if (local->stbuf.st_mode && local->stbuf.st_blksize) {
-				/* make sure we already have a stbuf
-				   stored in local->stbuf */
-				if (S_ISDIR (local->stbuf.st_mode) && 
-				    !S_ISDIR (buf->st_mode)) {
-					gf_log (this->name, GF_LOG_CRITICAL, 
-						"[CRITICAL] '%s' is directory "
-						"on namespace, non-directory "
-						"on node '%s', returning EIO",
-						local->loc1.path, 
-						priv->xl_array[(long)cookie]->name);
-					local->return_eio = 1;
-				}
-				if (!S_ISDIR (local->stbuf.st_mode) && 
-				    S_ISDIR (buf->st_mode)) {
-					gf_log (this->name, GF_LOG_CRITICAL, 
-						"[CRITICAL] '%s' is directory "
-						"on node '%s', non-directory "
-						"on namespace, returning EIO",
-						local->loc1.path, 
-						priv->xl_array[(long)cookie]->name);
-					local->return_eio = 1;
-				}
-			}
-	
-			if (!local->revalidate && !S_ISDIR (buf->st_mode)) {
-				/* This is the first time lookup on file*/
-				if (!local->list) {
-					/* list is not allocated, allocate 
-					   the max possible range */
-					local->list = CALLOC (1, 2 * (priv->child_count + 2));
-					if (!local->list) {
-						gf_log (this->name, 
-							GF_LOG_CRITICAL, 
-							"Not enough memory");
-						STACK_UNWIND (frame, -1, 
-							      ENOMEM, inode, 
-							      NULL, NULL);
-						return 0;
-					}
-				}
-				/* update the index of the list */
-				local->list [local->index++] = 
-					(int16_t)(long)cookie;
-			}
-      
-			if ((!local->dict) && dict &&
-			    (priv->xl_array[(long)cookie] != NS(this)))	{
-				local->dict = dict_ref (dict);
-			}
-
-			/* index of NS node is == total child count */
-			if (priv->child_count == (int16_t)(long)cookie) {
-				/* Take the inode number from namespace */
-				local->st_ino = buf->st_ino;
-				if (S_ISDIR (buf->st_mode) || 
-				    !(local->stbuf.st_blksize)) {
-					local->stbuf = *buf;
-				}
-			} else if (!S_ISDIR (buf->st_mode)) {
-				/* If file, then get the stat from 
-				   storage node */
-				local->stbuf = *buf;
-			}
-
-			if (local->st_nlink < buf->st_nlink) {
-				local->st_nlink = buf->st_nlink;
-			}
-		}
-	}
-	UNLOCK (&frame->lock);
-
-	if (!callcnt) {
-		local_dict = local->dict;
-		if (local->return_eio) {
-			gf_log (this->name, GF_LOG_CRITICAL, 
-				"[CRITICAL] Unable to fix the path (%s) with "
-				"self-heal, try manual verification. "
-				"returning EIO.", local->loc1.path);
-			unify_local_wipe (local);
-			STACK_UNWIND (frame, -1, EIO, inode, NULL, NULL);
-			if (local_dict)	{
-				dict_unref (local_dict);
-			}
-			return 0;
-		}
-
-		if (!local->stbuf.st_blksize) {
-			/* Inode not present */
-			local->op_ret = -1;
-		} else {
-			if (!local->revalidate && 
-			    !S_ISDIR (local->stbuf.st_mode)) { 
-				/* If its a file, big array is useless, 
-				   allocate the smaller one */
-				int16_t *list = NULL;
-				list = CALLOC (1, 2 * (local->index + 1));
-				ERR_ABORT (list);
-				memcpy (list, local->list, 2 * local->index);
-				/* Make the end of the list as -1 */
-				FREE (local->list);
-				local->list = list;
-				local->list [local->index] = -1;
-				/* Update the inode's ctx with proper array */
-				/* TODO: log on failure */
-				inode_ctx_put (local->loc1.inode, this, 
-					       (uint64_t)(long)local->list);
-			}
-
-			if (S_ISDIR(local->loc1.inode->st_mode)) {
-				/* lookup is done for directory */
-				if (local->failed && priv->self_heal) {
-					/* Triggering self-heal */
-                                        /* means, self-heal required for this 
-					   inode */
-					local->inode_generation = 0; 
-					priv->inode_generation++;
-				}
-			} else {
-				local->stbuf.st_ino = local->st_ino;
-			}
-	  
-			local->stbuf.st_nlink = local->st_nlink;
-		}
-		if (local->op_ret == -1) {
-			if (!local->revalidate && local->list)
-				FREE (local->list);
-		}
-
-		if ((local->op_ret >= 0) && local->failed && 
-		    local->revalidate) {
-			/* Done revalidate, but it failed */
-			if (op_errno != ENOTCONN) {
-				gf_log (this->name, GF_LOG_ERROR, 
-					"Revalidate failed for path(%s): %s", 
-					local->loc1.path, strerror (op_errno));
-			}
-			local->op_ret = -1;
-		}
-
-		if ((priv->self_heal && !priv->optimist) && 
-		    (!local->revalidate && (local->op_ret == 0) && 
-		     S_ISDIR(local->stbuf.st_mode))) {
-			/* Let the self heal be done here */
-			zr_unify_self_heal (frame, this, local);
-			local_dict = NULL;
-		} else {
-			/* either no self heal, or op_ret == -1 (failure) */
-			tmp_inode = local->loc1.inode;
-			unify_local_wipe (local);
-			STACK_UNWIND (frame, local->op_ret, local->op_errno, 
-				      tmp_inode, &local->stbuf, local->dict);
-		}
-		if (local_dict) {
-			dict_unref (local_dict);
-		}
-	}
-  
-	return 0;
-}
-
-/**
- * unify_lookup - 
- */
-int32_t 
-unify_lookup (call_frame_t *frame,
-	      xlator_t *this,
-	      loc_t *loc,
-	      dict_t *xattr_req)
-{
-	unify_local_t *local = NULL;
-	unify_private_t *priv = this->private;
-	int16_t *list = NULL;
-	long index = 0;
-
-	if (!(loc && loc->inode)) {
-		gf_log (this->name, GF_LOG_ERROR, 
-			"%s: Argument not right", loc?loc->path:"(null)");
-		STACK_UNWIND (frame, -1, EINVAL, NULL, NULL, NULL);
-		return 0;
-	}
-
-	/* Initialization */
-	INIT_LOCAL (frame, local);
-	loc_copy (&local->loc1, loc);
-	if (local->loc1.path == NULL) {
-		gf_log (this->name, GF_LOG_CRITICAL, "Not enough memory :O");
-		STACK_UNWIND (frame, -1, ENOMEM, loc->inode, NULL, NULL);
-		return 0;
-	}
-
-	if (!inode_ctx_get (loc->inode, this, NULL) && 
-	    loc->inode->st_mode && 
-	    !S_ISDIR (loc->inode->st_mode)) {
-		uint64_t tmp_list = 0;
-		/* check if revalidate or fresh lookup */
-		inode_ctx_get (loc->inode, this, &tmp_list);
-		local->list = (int16_t *)(long)tmp_list;
-	}
-
-	if (local->list) {
-		list = local->list;
-		for (index = 0; list[index] != -1; index++);
-		if (index != 2) {
-			if (index < 2) {
-				gf_log (this->name, GF_LOG_ERROR,
-					"returning ESTALE for %s: file "
-					"count is %ld", loc->path, index);
-				/* Print where all the file is present */
-				for (index = 0; 
-				     local->list[index] != -1; index++) {
-					gf_log (this->name, GF_LOG_ERROR, 
-						"%s: found on %s", loc->path, 
-						priv->xl_array[list[index]]->name);
-				}
-				unify_local_wipe (local);
-				STACK_UNWIND (frame, -1, ESTALE, 
-					      NULL, NULL, NULL);
-				return 0;  
-			} else {
-				/* There are more than 2 presences */
-				/* Just log and continue */
-				gf_log (this->name, GF_LOG_ERROR,
-					"%s: file count is %ld", 
-					loc->path, index);
-				/* Print where all the file is present */
-				for (index = 0; 
-				     local->list[index] != -1; index++) {
-					gf_log (this->name, GF_LOG_ERROR, 
-						"%s: found on %s", loc->path, 
-						priv->xl_array[list[index]]->name);
-				}
-			}
-		}
-      
-		/* is revalidate */
-		local->revalidate = 1;
-      
-		for (index = 0; list[index] != -1; index++)
-			local->call_count++;
-      
-		for (index = 0; list[index] != -1; index++) {
-			char need_break = (list[index+1] == -1);
-			STACK_WIND_COOKIE (frame,
-					   unify_lookup_cbk,
-					   (void *)(long)list[index], //cookie
-					   priv->xl_array [list[index]],
-					   priv->xl_array [list[index]]->fops->lookup,
-					   loc,
-					   xattr_req);
-			if (need_break)
-				break;
-		}
-	} else {
-		if (loc->inode->st_mode) {
-			if (inode_ctx_get (loc->inode, this, NULL)) {
-				inode_ctx_get (loc->inode, this, 
-					       &local->inode_generation);
-			}
-		}
-		/* This is first call, there is no list */
-		/* call count should be all child + 1 namespace */
-		local->call_count = priv->child_count + 1;
-      
-		for (index = 0; index <= priv->child_count; index++) {
-			STACK_WIND_COOKIE (frame,
-					   unify_lookup_cbk,
-					   (void *)index, //cookie
-					   priv->xl_array[index],
-					   priv->xl_array[index]->fops->lookup,
-					   loc,
-					   xattr_req);
-		}
-	}
-
-	return 0;
-}
-
-/**
- * unify_stat - if directory, get the stat directly from NameSpace child.
- *     if file, check for a hint and send it only there (also to NS).
- *     if its a fresh stat, then do it on all the nodes.
- *
- * NOTE: for all the call, sending cookie as xlator pointer, which will be 
- *       used in cbk.
- */
-int32_t
-unify_stat (call_frame_t *frame,
-	    xlator_t *this,
-	    loc_t *loc)
-{
-	unify_local_t *local = NULL;
-	unify_private_t *priv = this->private;
-	int16_t index = 0;
-	int16_t *list = NULL;
-	uint64_t tmp_list = 0;
-
-	UNIFY_CHECK_INODE_CTX_AND_UNWIND_ON_ERR (loc);
-
-	/* Initialization */
-	INIT_LOCAL (frame, local);
-	loc_copy (&local->loc1, loc);
-	if (local->loc1.path == NULL) {
-		gf_log (this->name, GF_LOG_CRITICAL, "Not enough memory :O");
-		STACK_UNWIND (frame, -1, ENOMEM, NULL);
-		return 0;
-	}
-	local->st_ino = loc->inode->ino;
-	if (S_ISDIR (loc->inode->st_mode)) {
-		/* Directory */
-		local->call_count = 1;
-		STACK_WIND (frame, unify_buf_cbk, NS(this),
-			    NS(this)->fops->stat, loc);
-	} else {
-		/* File */
-		inode_ctx_get (loc->inode, this, &tmp_list);
-    		list = (int16_t *)(long)tmp_list;
-
-		for (index = 0; list[index] != -1; index++)
-			local->call_count++;
-    
-		for (index = 0; list[index] != -1; index++) {
-			char need_break = (list[index+1] == -1);
-			STACK_WIND (frame,
-				    unify_buf_cbk,
-				    priv->xl_array[list[index]],
-				    priv->xl_array[list[index]]->fops->stat,
-				    loc);
-			if (need_break)
-				break;
-		}
-	}
-
-	return 0;
-}
-
-/**
- * unify_access_cbk -
- */
-int32_t
-unify_access_cbk (call_frame_t *frame,
-		  void *cookie,
-		  xlator_t *this,
-		  int32_t op_ret,
-		  int32_t op_errno)
-{
-	STACK_UNWIND (frame, op_ret, op_errno);
-	return 0;
-}
-
-
-/**
- * unify_access - Send request to only namespace, which has all the 
- *      attributes set for the file.
- */
-int32_t
-unify_access (call_frame_t *frame,
-	      xlator_t *this,
-	      loc_t *loc,
-	      int32_t mask)
-{
-	UNIFY_CHECK_INODE_CTX_AND_UNWIND_ON_ERR (loc);
-
-	STACK_WIND (frame,
-		    unify_access_cbk,
-		    NS(this),
-		    NS(this)->fops->access,
-		    loc,
-		    mask);
-
-	return 0;
-}
-
-int32_t
-unify_mkdir_cbk (call_frame_t *frame,
-		 void *cookie,
-		 xlator_t *this,
-		 int32_t op_ret,
-		 int32_t op_errno,
-		 inode_t *inode,
-		 struct stat *buf)
-{
-	int32_t callcnt = 0;
-	unify_private_t *priv = this->private;
-	unify_local_t *local = frame->local;
-	inode_t *tmp_inode = NULL;
-
-	LOCK (&frame->lock);
-	{
-		callcnt = --local->call_count;
-  
-		if ((op_ret == -1) && !(priv->optimist && 
-					(op_errno == ENOENT || 
-					 op_errno == EEXIST))) {
-			/* TODO: Decrement the inode_generation of 
-			 * this->inode's parent inode, hence the missing 
-			 * directory is created properly by self-heal. 
-			 * Currently, there is no way to get the parent 
-			 * inode directly.
-			 */
-			gf_log (this->name, GF_LOG_ERROR,
-				"child(%s): path(%s): %s", 
-				priv->xl_array[(long)cookie]->name, 
-				local->loc1.path, strerror (op_errno));
-			if (op_errno != EEXIST)
-				local->failed = 1;
-			local->op_errno = op_errno;
-		}
-  
-		if (op_ret >= 0)
-			local->op_ret = 0;
-
-	}
-	UNLOCK (&frame->lock);
-  
-	if (!callcnt) {
-		if (!local->failed) {
-			inode_ctx_put (local->loc1.inode, this, 
-				       priv->inode_generation);
-		}
-		
-		tmp_inode = local->loc1.inode;
-		unify_local_wipe (local);
-
-		STACK_UNWIND (frame, local->op_ret, local->op_errno, 
-			      tmp_inode, &local->stbuf);
-	}
-
-	return 0;
-}
-
-/**
- * unify_ns_mkdir_cbk -
- */
-int32_t
-unify_ns_mkdir_cbk (call_frame_t *frame,
-		    void *cookie,
-		    xlator_t *this,
-		    int32_t op_ret,
-		    int32_t op_errno,
-		    inode_t *inode,
-		    struct stat *buf)
-{
-	unify_private_t *priv = this->private;
-	unify_local_t *local = frame->local;
-	long index = 0;
-
-	if (op_ret == -1) {
-		/* No need to send mkdir request to other servers, 
-		 * as namespace action failed 
-		 */
-		gf_log (this->name, GF_LOG_ERROR,
-			"namespace: path(%s): %s", 
-			local->name, strerror (op_errno));
-		unify_local_wipe (local);
-		STACK_UNWIND (frame, op_ret, op_errno, inode, NULL);
-		return 0;
-	}
-  
-	/* Create one inode for this entry */
-	local->op_ret = 0;
-	local->stbuf = *buf;
-
-	local->call_count = priv->child_count;
-
-	/* Send mkdir request to all the nodes now */
-	for (index = 0; index < priv->child_count; index++) {
-		STACK_WIND_COOKIE (frame,
-				   unify_mkdir_cbk,
-				   (void *)index, //cookie
-				   priv->xl_array[index],
-				   priv->xl_array[index]->fops->mkdir,
-				   &local->loc1,
-				   local->mode);
-	}
-  
-	return 0;
-}
-
-
-/**
- * unify_mkdir -
- */
-int32_t
-unify_mkdir (call_frame_t *frame,
-	     xlator_t *this,
-	     loc_t *loc,
-	     mode_t mode)
-{
-	unify_local_t *local = NULL;
-
-	/* Initialization */
-	INIT_LOCAL (frame, local);
-	local->mode = mode;
-
-	loc_copy (&local->loc1, loc);
-
-	if (local->loc1.path == NULL) {
-		gf_log (this->name, GF_LOG_CRITICAL, "Not enough memory :O");
-		STACK_UNWIND (frame, -1, ENOMEM, NULL, NULL);
-		return 0;
-	}
-
-	STACK_WIND (frame,
-		    unify_ns_mkdir_cbk,
-		    NS(this),
-		    NS(this)->fops->mkdir,
-		    loc,
-		    mode);
-	return 0;
-}
-
-/**
- * unify_rmdir_cbk -
- */
-int32_t
-unify_rmdir_cbk (call_frame_t *frame,
-		 void *cookie,
-		 xlator_t *this,
-		 int32_t op_ret,
-		 int32_t op_errno)
-{
-	int32_t callcnt = 0;
-	unify_private_t *priv = this->private;
-	unify_local_t *local = frame->local;
-
-	LOCK (&frame->lock);
-	{
-		callcnt = --local->call_count;
-		if (op_ret == 0 || (priv->optimist && (op_errno == ENOENT)))
-			local->op_ret = 0;
-		if (op_ret == -1)
-			local->op_errno = op_errno;
-	}
-	UNLOCK (&frame->lock);
-
-	if (!callcnt) {
-		unify_local_wipe (local);
-		STACK_UNWIND (frame, local->op_ret, local->op_errno);
-	}
-
-	return 0;
-}
-
-/**
- * unify_ns_rmdir_cbk -
- */
-int32_t
-unify_ns_rmdir_cbk (call_frame_t *frame,
-		    void *cookie,
-		    xlator_t *this,
-		    int32_t op_ret,
-		    int32_t op_errno)
-{
-	int16_t index = 0;
-	unify_private_t *priv = this->private;
-	unify_local_t *local = frame->local;
-  
-	if (op_ret == -1) {
-		/* No need to send rmdir request to other servers, 
-		 * as namespace action failed 
-		 */
-		gf_log (this->name, 
-			((op_errno != ENOTEMPTY) ? 
-			 GF_LOG_ERROR : GF_LOG_DEBUG),
-			"namespace: path(%s): %s", 
-			local->loc1.path, strerror (op_errno));
-		unify_local_wipe (local);
-		STACK_UNWIND (frame, op_ret, op_errno);
-		return 0;
-	}
-
-	local->call_count = priv->child_count;
-
-	for (index = 0; index < priv->child_count; index++) {
-		STACK_WIND (frame,
-			    unify_rmdir_cbk,
-			    priv->xl_array[index],
-			    priv->xl_array[index]->fops->rmdir,
-			    &local->loc1);
-	}
-
-	return 0;
-}
-
-/**
- * unify_rmdir -
- */
-int32_t
-unify_rmdir (call_frame_t *frame,
-	     xlator_t *this,
-	     loc_t *loc)
-{
-	unify_local_t *local = NULL;
-
-	UNIFY_CHECK_INODE_CTX_AND_UNWIND_ON_ERR (loc);
-
-	/* Initialization */
-	INIT_LOCAL (frame, local);
-
-	loc_copy (&local->loc1, loc);
-	if (local->loc1.path == NULL) {
-		gf_log (this->name, GF_LOG_CRITICAL, "Not enough memory :O");
-		STACK_UNWIND (frame, -1, ENOMEM);
-		return 0;
-	}
-
-	STACK_WIND (frame,
-		    unify_ns_rmdir_cbk,
-		    NS(this),
-		    NS(this)->fops->rmdir,
-		    loc);
-
-	return 0;
-}
-
-/**
- * unify_open_cbk -
- */
-int32_t
-unify_open_cbk (call_frame_t *frame,
-		void *cookie,
-		xlator_t *this,
-		int32_t op_ret,
-		int32_t op_errno,
-		fd_t *fd)
-{
-	int32_t callcnt = 0;
-	unify_local_t *local = frame->local;
-
-	LOCK (&frame->lock);
-	{
-		if (op_ret >= 0) {
-			local->op_ret = op_ret;
-			if (NS(this) != (xlator_t *)cookie) {
-				/* Store child node's ptr, used in 
-				   all the f*** / FileIO calls */
-				fd_ctx_set (fd, this, (uint64_t)(long)cookie);
-			}
-		}
-		if (op_ret == -1) {
-			local->op_errno = op_errno;
-			local->failed = 1;
-		}
-		callcnt = --local->call_count;
-	}
-	UNLOCK (&frame->lock);
-  
-	if (!callcnt) {
-		if ((local->failed == 1) && (local->op_ret >= 0)) {
-			local->call_count = 1;
-			/* return -1 to user */
-			local->op_ret = -1;
-			//local->op_errno = EIO; 
-      
-			if (!fd_ctx_get (local->fd, this, NULL)) {
-				gf_log (this->name, GF_LOG_ERROR, 
-					"Open success on child node, "
-					"failed on namespace");
-			} else {
-				gf_log (this->name, GF_LOG_ERROR, 
-					"Open success on namespace, "
-					"failed on child node");
-			}
-		}
-
-		unify_local_wipe (local);
-		STACK_UNWIND (frame, local->op_ret, 
-			      local->op_errno, local->fd);
-	}
-
-	return 0;
-}
-
-#ifdef GF_DARWIN_HOST_OS
-/**
- * unify_create_lookup_cbk - 
- */
-int32_t 
-unify_open_lookup_cbk (call_frame_t *frame,
-		       void *cookie,
-		       xlator_t *this,
-		       int32_t op_ret,
-		       int32_t op_errno,
-		       inode_t *inode,
-		       struct stat *buf,
-		       dict_t *dict)
-{
-	int32_t callcnt = 0;
-	int16_t index = 0;
-	unify_private_t *priv = this->private;
-	unify_local_t *local = frame->local;
-
-	LOCK (&frame->lock);
-	{
-		callcnt = --local->call_count;
-		if ((op_ret == -1) && (op_errno != ENOENT)) {
-			gf_log (this->name, GF_LOG_ERROR,
-				"child(%s): path(%s): %s", 
-				priv->xl_array[(long)cookie]->name, 
-				local->loc1.path, strerror (op_errno));
-			local->op_errno = op_errno;
-		}
-    
-		if (op_ret >= 0) {
-			local->op_ret = op_ret; 
-			local->index++;
-			if (NS(this) == priv->xl_array[(long)cookie]) {
-				local->list[0] = (int16_t)(long)cookie;
-			} else {
-				local->list[1] = (int16_t)(long)cookie;
-			}
-			if (S_ISDIR (buf->st_mode))
-				local->failed = 1;
-		}
-	}
-	UNLOCK (&frame->lock);
-
-	if (!callcnt) {
-		int16_t file_list[3] = {0,};
-		local->op_ret = -1;
-
-		file_list[0] = local->list[0];
-		file_list[1] = local->list[1];
-		file_list[2] = -1;
-
-		if (local->index != 2) {
-			/* Lookup failed, can't do open */
-			gf_log (this->name, GF_LOG_ERROR,
-				"%s: present on %d nodes", 
-				local->name, local->index);
-
-			if (local->index < 2) {
-				unify_local_wipe (local);
-				gf_log (this->name, GF_LOG_ERROR,
-					"returning as file found on less "
-					"than 2 nodes");
-				STACK_UNWIND (frame, local->op_ret, 
-					      local->op_errno, local->fd);
-				return 0;
-			}
-		}
-
-		if (local->failed) {
-			/* Open on directory, return EISDIR */
-			unify_local_wipe (local);
-			STACK_UNWIND (frame, -1, EISDIR, local->fd);
-			return 0;
-		}
-
-		/* Everything is perfect :) */    
-		local->call_count = 2;
-    
-		for (index = 0; file_list[index] != -1; index++) {
-			char need_break = (file_list[index+1] == -1);
-			STACK_WIND_COOKIE (frame,
-					   unify_open_cbk,
-					   priv->xl_array[file_list[index]],
-					   priv->xl_array[file_list[index]],
-					   priv->xl_array[file_list[index]]->fops->open,
-					   &local->loc1,
-					   local->flags,
-					   local->fd);
-			if (need_break)
-				break;
-		}
-	}
-
-	return 0;
-}
-
-
-int32_t
-unify_open_readlink_cbk (call_frame_t *frame,
-			 void *cookie,
-			 xlator_t *this,
-			 int32_t op_ret,
-			 int32_t op_errno,
-			 const char *path)
-{
-	int16_t index = 0;
-	unify_private_t *priv = this->private;
-	unify_local_t *local = frame->local;
-
-	if (op_ret == -1) {
-		STACK_UNWIND (frame, -1, ENOENT);
-		return 0;
-	}
-
-	if (path[0] == '/') {
-		local->name = strdup (path);
-		ERR_ABORT (local->name);
-	} else {
-		char *tmp_str = strdup (local->loc1.path);
-		char *tmp_base = dirname (tmp_str);
-		local->name = CALLOC (1, ZR_PATH_MAX);
-		strcpy (local->name, tmp_base);
-		strncat (local->name, "/", 1);
-		strcat (local->name, path);
-		FREE (tmp_str);
-	}
-  
-	local->list = CALLOC (1, sizeof (int16_t) * 3);
-	ERR_ABORT (local->list);
-	local->call_count = priv->child_count + 1;
-	local->op_ret = -1;
-	for (index = 0; index <= priv->child_count; index++) {
-		/* Send the lookup to all the nodes including namespace */
-		STACK_WIND_COOKIE (frame,
-				   unify_open_lookup_cbk,
-				   (void *)(long)index,
-				   priv->xl_array[index],
-				   priv->xl_array[index]->fops->lookup,
-				   &local->loc1,
-				   NULL);
-	}
-
-	return 0;
-}
-#endif /* GF_DARWIN_HOST_OS */
-
-/**
- * unify_open - 
- */
-int32_t
-unify_open (call_frame_t *frame,
-	    xlator_t *this,
-	    loc_t *loc,
-	    int32_t flags,
-	    fd_t *fd)
-{
-	unify_private_t *priv = this->private;
-	unify_local_t *local = NULL;
-	int16_t *list = NULL;
-	int16_t index = 0;
-	int16_t file_list[3] = {0,};
-	uint64_t tmp_list = 0;
-
-	UNIFY_CHECK_INODE_CTX_AND_UNWIND_ON_ERR (loc);
-
-	/* Init */
-	INIT_LOCAL (frame, local);
-	loc_copy (&local->loc1, loc);
-	local->fd    = fd;
-	local->flags = flags;
-	inode_ctx_get (loc->inode, this, &tmp_list);
-	list = (int16_t *)(long)tmp_list;
-
-	local->list = list;
-	file_list[0] = priv->child_count; /* Thats namespace */
-	file_list[2] = -1;
-	for (index = 0; list[index] != -1; index++) {
-		local->call_count++;
-		if (list[index] != priv->child_count)
-			file_list[1] = list[index];
-	}
-
-	if (local->call_count != 2) {
-		/* If the lookup was done for file */
-		gf_log (this->name, GF_LOG_ERROR,
-			"%s: entry_count is %d",
-			loc->path, local->call_count);
-		for (index = 0; local->list[index] != -1; index++)
-			gf_log (this->name, GF_LOG_ERROR, "%s: found on %s",
-				loc->path, priv->xl_array[list[index]]->name);
-
-		if (local->call_count < 2) {
-			gf_log (this->name, GF_LOG_ERROR,
-				"returning EIO as file found on onlyone node");
-			STACK_UNWIND (frame, -1, EIO, fd);
-			return 0;
-		}
-	}
-
-#ifdef GF_DARWIN_HOST_OS
-	/* Handle symlink here */
-	if (S_ISLNK (loc->inode->st_mode)) {
-		/* Callcount doesn't matter here */
-		STACK_WIND (frame,
-			    unify_open_readlink_cbk,
-			    NS(this),
-			    NS(this)->fops->readlink,
-			    loc, ZR_PATH_MAX);
-		return 0;
-	}
-#endif /* GF_DARWIN_HOST_OS */
-
-	local->call_count = 2;
-	for (index = 0; file_list[index] != -1; index++) {
-		char need_break = (file_list[index+1] == -1);
-		STACK_WIND_COOKIE (frame,
-				   unify_open_cbk,
-				   priv->xl_array[file_list[index]], //cookie
-				   priv->xl_array[file_list[index]],
-				   priv->xl_array[file_list[index]]->fops->open,
-				   loc,
-				   flags,
-				   fd);
-		if (need_break)
-			break;
-	}
-
-	return 0;
-}
-
-
-int32_t 
-unify_create_unlink_cbk (call_frame_t *frame,
-			 void *cookie,
-			 xlator_t *this,
-			 int32_t op_ret,
-			 int32_t op_errno)
-{
-	unify_local_t *local = frame->local;
-	inode_t *inode = local->loc1.inode;
-
-	unify_local_wipe (local);
-
-	STACK_UNWIND (frame, local->op_ret, local->op_errno, local->fd, 
-		      inode, &local->stbuf);
-  
-	return 0;
-}
-
-/**
- * unify_create_open_cbk -
- */
-int32_t
-unify_create_open_cbk (call_frame_t *frame,
-		       void *cookie,
-		       xlator_t *this,
-		       int32_t op_ret,
-		       int32_t op_errno,
-		       fd_t *fd)
-{
-	int ret = 0;
-	int32_t callcnt = 0;
-	unify_local_t *local = frame->local;
-	inode_t *inode = NULL;
-	xlator_t *child = NULL;
-	uint64_t tmp_value = 0;
-
-	LOCK (&frame->lock);
-	{
-		if (op_ret >= 0) {
-			local->op_ret = op_ret;
-			if (NS(this) != (xlator_t *)cookie) {
-				/* Store child node's ptr, used in all 
-				   the f*** / FileIO calls */
-				/* TODO: log on failure */
-				ret = fd_ctx_get (fd, this, &tmp_value);
-				cookie = (void *)(long)tmp_value;
-			} else {
-				/* NOTE: open successful on namespace.
-				 *       fd's ctx can be used to identify open 
-				 *       failure on storage subvolume. cool 
-				 *       ide ;) */
-				local->failed = 0;
-			}
-		} else {
-			gf_log (this->name, GF_LOG_ERROR,
-				"child(%s): path(%s): %s", 
-				((xlator_t *)cookie)->name,
-				local->loc1.path, strerror (op_errno));
-			local->op_errno = op_errno;
-			local->failed = 1;
-		}
-		callcnt = --local->call_count;
-	}
-	UNLOCK (&frame->lock);
-  
-	if (!callcnt) {
-		if (local->failed == 1 && (local->op_ret >= 0)) {
-			local->call_count = 1;
-			/* return -1 to user */
-			local->op_ret = -1;
-			local->op_errno = EIO;
-			local->fd = fd;
-			local->call_count = 1;
-
-			if (!fd_ctx_get (local->fd, this, &tmp_value)) {
-				child = (xlator_t *)(long)tmp_value;
-
-				gf_log (this->name, GF_LOG_ERROR, 
-					"Create success on child node, "
-					"failed on namespace");
-
-				STACK_WIND (frame,
-					    unify_create_unlink_cbk,
-					    child,
-					    child->fops->unlink,
-					    &local->loc1);
-			} else {
-				gf_log (this->name, GF_LOG_ERROR, 
-					"Create success on namespace, "
-					"failed on child node");
-
-				STACK_WIND (frame,
-					    unify_create_unlink_cbk,
-					    NS(this),
-					    NS(this)->fops->unlink,
-					    &local->loc1);
-			}
-			return 0;
-		}
-		inode = local->loc1.inode;
-		unify_local_wipe (local);
-		STACK_UNWIND (frame, local->op_ret, local->op_errno, fd,
-			      inode, &local->stbuf);
-	}
-	return 0;
-}
-
-/**
- * unify_create_lookup_cbk - 
- */
-int32_t 
-unify_create_lookup_cbk (call_frame_t *frame,
-			 void *cookie,
-			 xlator_t *this,
-			 int32_t op_ret,
-			 int32_t op_errno,
-			 inode_t *inode,
-			 struct stat *buf,
-			 dict_t *dict)
-{
-	int32_t callcnt = 0;
-	int16_t index = 0;
-	unify_private_t *priv = this->private;
-	unify_local_t *local = frame->local;
-
-	LOCK (&frame->lock);
-	{
-		callcnt = --local->call_count;
-		if (op_ret == -1) {
-			gf_log (this->name, GF_LOG_ERROR,
-				"child(%s): path(%s): %s", 
-				priv->xl_array[(long)cookie]->name, 
-				local->loc1.path, strerror (op_errno));
-			local->op_errno = op_errno;
-			local->failed = 1;
-		}
-
-		if (op_ret >= 0) {
-			local->op_ret = op_ret; 
-			local->list[local->index++] = (int16_t)(long)cookie;
-			if (NS(this) == priv->xl_array[(long)cookie]) {
-				local->st_ino = buf->st_ino;
-			} else {
-				local->stbuf = *buf;
-			}
-		}
-	}
-	UNLOCK (&frame->lock);
-
-	if (!callcnt) {
-		int16_t *list = local->list;
-		int16_t file_list[3] = {0,};
-		local->op_ret = -1;
-
-		local->list [local->index] = -1;
-		file_list[0] = list[0];
-		file_list[1] = list[1];
-		file_list[2] = -1;
-
-		local->stbuf.st_ino = local->st_ino;
-		/* TODO: log on failure */
-		inode_ctx_put (local->loc1.inode, this, 
-			       (uint64_t)(long)local->list);
-
-		if (local->index != 2) {
-			/* Lookup failed, can't do open */
-			gf_log (this->name, GF_LOG_ERROR,
-				"%s: present on %d nodes", 
-				local->loc1.path, local->index);
-			file_list[0] = priv->child_count;
-			for (index = 0; list[index] != -1; index++) {
-				gf_log (this->name, GF_LOG_ERROR, 
-					"%s: found on %s", local->loc1.path, 
-					priv->xl_array[list[index]]->name);
-				if (list[index] != priv->child_count)
-					file_list[1] = list[index];
-			}
-
-			if (local->index < 2) {
-				unify_local_wipe (local);
-				gf_log (this->name, GF_LOG_ERROR,
-					"returning EIO as file found on "
-					"only one node");
-				STACK_UNWIND (frame, -1, EIO, 
-					      local->fd, inode, NULL);
-				return 0;
-			}
-		}
-		/* Everything is perfect :) */    
-		local->call_count = 2;
-    
-		for (index = 0; file_list[index] != -1; index++) {
-			char need_break = (file_list[index+1] == -1);
-			STACK_WIND_COOKIE (frame,
-					   unify_create_open_cbk,
-					   priv->xl_array[file_list[index]],
-					   priv->xl_array[file_list[index]],
-					   priv->xl_array[file_list[index]]->fops->open,
-					   &local->loc1,
-					   local->flags,
-					   local->fd);
-			if (need_break)
-				break;
-		}
-	}
-
-	return 0;
-}
-
-
-/**
- * unify_create_cbk -
- */
-int32_t
-unify_create_cbk (call_frame_t *frame,
-		  void *cookie,
-		  xlator_t *this,
-		  int32_t op_ret,
-		  int32_t op_errno,
-		  fd_t *fd,
-		  inode_t *inode,
-		  struct stat *buf)
-{
-	int ret = 0;
-	unify_local_t *local = frame->local;
-	call_frame_t *prev_frame = cookie;
-	inode_t *tmp_inode = NULL;
-
-	if (op_ret == -1) {
-		/* send unlink () on Namespace */
-		local->op_errno = op_errno;
-		local->op_ret = -1;
-		local->call_count = 1;
-		gf_log (this->name, GF_LOG_ERROR,
-			"create failed on %s (file %s, error %s), "
-			"sending unlink to namespace", 
-			prev_frame->this->name, 
-			local->loc1.path, strerror (op_errno));
-
-		STACK_WIND (frame,
-			    unify_create_unlink_cbk,
-			    NS(this),
-			    NS(this)->fops->unlink,
-			    &local->loc1);
-
-		return 0;
-	}
-
-	if (op_ret >= 0) {
-		local->op_ret = op_ret;
-		local->stbuf = *buf;
-		/* Just inode number should be from NS node */
-		local->stbuf.st_ino = local->st_ino;
-
-		/* TODO: log on failure */
-		ret = fd_ctx_set (fd, this, (uint64_t)(long)prev_frame->this);
-	}
-  
-	tmp_inode = local->loc1.inode;
-	unify_local_wipe (local);
-	STACK_UNWIND (frame, local->op_ret, local->op_errno, local->fd, 
-		      tmp_inode, &local->stbuf);
-
-	return 0;
-}
-
-/**
- * unify_ns_create_cbk -
- * 
- */
-int32_t
-unify_ns_create_cbk (call_frame_t *frame,
-		     void *cookie,
-		     xlator_t *this,
-		     int32_t op_ret,
-		     int32_t op_errno,
-		     fd_t *fd,
-		     inode_t *inode,
-		     struct stat *buf)
-{
-	struct sched_ops *sched_ops = NULL;
-	xlator_t *sched_xl = NULL;
-	unify_local_t *local = frame->local;
-	unify_private_t *priv = this->private;
-	int16_t *list = NULL;
-	int16_t index = 0;
-
-	if (op_ret == -1) {
-		/* No need to send create request to other servers, as 
-		   namespace action failed. Handle exclusive create here. */
-		if ((op_errno != EEXIST) || 
-		    ((op_errno == EEXIST) && 
-		     ((local->flags & O_EXCL) == O_EXCL))) {
-			/* If its just a create call without O_EXCL, 
-			   don't do this */
-			gf_log (this->name, GF_LOG_ERROR,
-				"namespace: path(%s): %s", 
-				local->loc1.path, strerror (op_errno));
-			unify_local_wipe (local);
-			STACK_UNWIND (frame, op_ret, op_errno, fd, inode, buf);
-			return 0;
-		}
-	}
-  
-	if (op_ret >= 0) {
-		/* Get the inode number from the NS node */
-		local->st_ino = buf->st_ino;
-  
-		local->op_ret = -1;
-
-		/* Start the mapping list */
-		list = CALLOC (1, sizeof (int16_t) * 3);
-		ERR_ABORT (list);
-		inode_ctx_put (inode, this, (uint64_t)(long)list);
-		list[0] = priv->child_count;
-		list[2] = -1;
-
-		/* This means, file doesn't exist anywhere in the Filesystem */
-		sched_ops = priv->sched_ops;
-
-		/* Send create request to the scheduled node now */
-		sched_xl = sched_ops->schedule (this, local->loc1.path);
-		if (sched_xl == NULL)
-		{
-			/* send unlink () on Namespace */
-			local->op_errno = ENOTCONN;
-			local->op_ret = -1;
-			local->call_count = 1;
-			gf_log (this->name, GF_LOG_ERROR,
-				"no node online to schedule create:(file %s) "
-				"sending unlink to namespace", 
-				(local->loc1.path)?local->loc1.path:"");
-
-			STACK_WIND (frame,
-				    unify_create_unlink_cbk,
-				    NS(this),
-				    NS(this)->fops->unlink,
-				    &local->loc1);
-	
-			return 0;
-		}
-
-		for (index = 0; index < priv->child_count; index++)
-			if (sched_xl == priv->xl_array[index])
-				break;
-		list[1] = index;
-
-		STACK_WIND (frame, unify_create_cbk,
-			    sched_xl, sched_xl->fops->create,
-			    &local->loc1, local->flags, local->mode, fd);
-	} else {
-		/* File already exists, and there is no O_EXCL flag */
-
-		gf_log (this->name, GF_LOG_DEBUG, 
-			"File(%s) already exists on namespace, sending "
-			"open instead", local->loc1.path);
-
-		local->list = CALLOC (1, sizeof (int16_t) * 3);
-		ERR_ABORT (local->list);
-		local->call_count = priv->child_count + 1;
-		local->op_ret = -1;
-		for (index = 0; index <= priv->child_count; index++) {
-			/* Send lookup() to all nodes including namespace */
-			STACK_WIND_COOKIE (frame,
-					   unify_create_lookup_cbk,
-					   (void *)(long)index,
-					   priv->xl_array[index],
-					   priv->xl_array[index]->fops->lookup,
-					   &local->loc1,
-					   NULL);
-		}
-	}
-	return 0;
-}
-
-/**
- * unify_create - create a file in global namespace first, so other 
- *    clients can see them. Create the file in storage nodes in background.
- */
-int32_t
-unify_create (call_frame_t *frame,
-	      xlator_t *this,
-	      loc_t *loc,
-	      int32_t flags,
-	      mode_t mode,
-	      fd_t *fd)
-{
-	unify_local_t *local = NULL;
-  
-	/* Initialization */
-	INIT_LOCAL (frame, local);
-	local->mode = mode;
-	local->flags = flags;
-	local->fd = fd;
-
-	loc_copy (&local->loc1, loc);
-	if (local->loc1.path == NULL) {
-		gf_log (this->name, GF_LOG_CRITICAL, "Not enough memory :O");
-		STACK_UNWIND (frame, -1, ENOMEM, fd, loc->inode, NULL);
-		return 0;
-	}
-
-	STACK_WIND (frame,
-		    unify_ns_create_cbk,
-		    NS(this),
-		    NS(this)->fops->create,
-		    loc,
-		    flags | O_EXCL,
-		    mode,
-		    fd);
-  
-	return 0;
-}
-
-
-/**
- * unify_opendir_cbk - 
- */
-int32_t
-unify_opendir_cbk (call_frame_t *frame,
-		   void *cookie,
-		   xlator_t *this,
-		   int32_t op_ret,
-		   int32_t op_errno,
-		   fd_t *fd)
-{
-	STACK_UNWIND (frame, op_ret, op_errno, fd);
-
-	return 0;
-}
-
-/** 
- * unify_opendir -
- */
-int32_t
-unify_opendir (call_frame_t *frame,
-	       xlator_t *this,
-	       loc_t *loc,
-	       fd_t *fd)
-{
-	UNIFY_CHECK_INODE_CTX_AND_UNWIND_ON_ERR (loc);
-
-	STACK_WIND (frame, unify_opendir_cbk,
-		    NS(this), NS(this)->fops->opendir, loc, fd);
-
-	return 0;
-}
-
-
-/**
- * unify_chmod - 
- */
-int32_t
-unify_chmod (call_frame_t *frame,
-	     xlator_t *this,
-	     loc_t *loc,
-	     mode_t mode)
-{
-	unify_local_t *local = NULL;
-	unify_private_t *priv = this->private;
-	int32_t index = 0;
-	int32_t callcnt = 0;
-	uint64_t tmp_list = 0;
-		
-	UNIFY_CHECK_INODE_CTX_AND_UNWIND_ON_ERR (loc);
-
-	/* Initialization */
-	INIT_LOCAL (frame, local);
-
-	loc_copy (&local->loc1, loc);
-	local->st_ino = loc->inode->ino;
-
-	if (S_ISDIR (loc->inode->st_mode)) {
-		local->call_count = priv->child_count + 1;
-      
-		for (index = 0; index < (priv->child_count + 1); index++) {
-			STACK_WIND (frame,
-				    unify_buf_cbk,
-				    priv->xl_array[index],
-				    priv->xl_array[index]->fops->chmod,
-				    loc, mode);
-		}    
-	} else {
-		inode_ctx_get (loc->inode, this, &tmp_list);
-		local->list = (int16_t *)(long)tmp_list;
-
-		for (index = 0; local->list[index] != -1; index++) {
-			local->call_count++;
-			callcnt++;
-		}
-      
-		for (index = 0; local->list[index] != -1; index++) {
-			STACK_WIND (frame,
-				    unify_buf_cbk,
-				    priv->xl_array[local->list[index]],
-				    priv->xl_array[local->list[index]]->fops->chmod,
-				    loc,
-				    mode);
-			if (!--callcnt)
-				break;
-		}
-	}
-
-	return 0;
-}
-
-/**
- * unify_chown - 
- */
-int32_t
-unify_chown (call_frame_t *frame,
-	     xlator_t *this,
-	     loc_t *loc,
-	     uid_t uid,
-	     gid_t gid)
-{
-	unify_local_t *local = NULL;
-	unify_private_t *priv = this->private;
-	int32_t index = 0;
-	int32_t callcnt = 0;
-  	uint64_t tmp_list = 0;
-
-	UNIFY_CHECK_INODE_CTX_AND_UNWIND_ON_ERR (loc);
-
-	/* Initialization */
-	INIT_LOCAL (frame, local);
-	loc_copy (&local->loc1, loc);
-	local->st_ino = loc->inode->ino;
-
-	if (S_ISDIR (loc->inode->st_mode)) {
-		local->call_count = priv->child_count + 1;
-      
-		for (index = 0; index < (priv->child_count + 1); index++) {
-			STACK_WIND (frame,
-				    unify_buf_cbk,
-				    priv->xl_array[index],
-				    priv->xl_array[index]->fops->chown,
-				    loc, uid, gid);
-		}    
-	} else {
-		inode_ctx_get (loc->inode, this, &tmp_list);
-		local->list = (int16_t *)(long)tmp_list;
-
-		for (index = 0; local->list[index] != -1; index++) {
-			local->call_count++;
-			callcnt++;
-		}
-      
-		for (index = 0; local->list[index] != -1; index++) {
-			STACK_WIND (frame,
-				    unify_buf_cbk,
-				    priv->xl_array[local->list[index]],
-				    priv->xl_array[local->list[index]]->fops->chown,
-				    loc, uid, gid);
-			if (!--callcnt)
-				break;
-		}
-	}
-
-	return 0;
-}
-
-
-/**
- * unify_truncate_cbk - 
- */
-int32_t
-unify_truncate_cbk (call_frame_t *frame,
-		    void *cookie,
-		    xlator_t *this,
-		    int32_t op_ret,
-		    int32_t op_errno,
-		    struct stat *buf)
-{
-	int32_t callcnt = 0;
-	unify_private_t *priv = this->private;
-	unify_local_t *local = frame->local;
-	call_frame_t *prev_frame = cookie;
-
-	LOCK (&frame->lock);
-	{
-		callcnt = --local->call_count;
-    
-		if (op_ret == -1) {
-			gf_log (this->name, GF_LOG_ERROR,
-				"child(%s): path(%s): %s", 
-				prev_frame->this->name, 
-				(local->loc1.path)?local->loc1.path:"", 
-				strerror (op_errno));
-			local->op_errno = op_errno;
-			if (!((op_errno == ENOENT) && priv->optimist))
-				local->op_ret = -1;
-		}
-
-		if (op_ret >= 0) {
-			if (NS (this) == prev_frame->this) {
-				local->st_ino = buf->st_ino;
-				/* If the entry is directory, get the 
-				   stat from NS node */
-				if (S_ISDIR (buf->st_mode) || 
-				    !local->stbuf.st_blksize) {
-					local->stbuf = *buf;
-				}
-			}
-
-			if ((!S_ISDIR (buf->st_mode)) && 
-			    (NS (this) != prev_frame->this)) {
-				/* If file, take the stat info from 
-				   Storage node. */
-				local->stbuf = *buf;
-			}
-		}
-	}
-	UNLOCK (&frame->lock);
-    
-	if (!callcnt) {
-		if (local->st_ino)
-			local->stbuf.st_ino = local->st_ino;
-		else
-			local->op_ret = -1;
-		unify_local_wipe (local);
-		STACK_UNWIND (frame, local->op_ret, local->op_errno, 
-			      &local->stbuf);
-	}
-
-	return 0;
-}
-
-/**
- * unify_truncate - 
- */
-int32_t
-unify_truncate (call_frame_t *frame,
-		xlator_t *this,
-		loc_t *loc,
-		off_t offset)
-{
-	unify_local_t *local = NULL;
-	unify_private_t *priv = this->private;
-	int32_t index = 0;
-	int32_t callcnt = 0;
-  	uint64_t tmp_list = 0;
-
-	UNIFY_CHECK_INODE_CTX_AND_UNWIND_ON_ERR (loc);
-
-	/* Initialization */
-	INIT_LOCAL (frame, local);
-	loc_copy (&local->loc1, loc);
-	local->st_ino = loc->inode->ino;
-
-	if (S_ISDIR (loc->inode->st_mode)) {
-		local->call_count = 1;
-      
-		STACK_WIND (frame,
-			    unify_buf_cbk,
-			    NS(this),
-			    NS(this)->fops->stat,
-			    loc);
-	} else {
-		local->op_ret = 0;
-		inode_ctx_get (loc->inode, this, &tmp_list);
-		local->list = (int16_t *)(long)tmp_list;
-
-		for (index = 0; local->list[index] != -1; index++) {
-			local->call_count++;
-			callcnt++;
-		}
-      
-		/* Don't send truncate to NS node */
-		STACK_WIND (frame, unify_truncate_cbk, NS(this),
-			    NS(this)->fops->stat, loc);
-		callcnt--;
-
-		for (index = 0; local->list[index] != -1; index++) {
-			if (NS(this) != priv->xl_array[local->list[index]]) {
-				STACK_WIND (frame,
-					    unify_truncate_cbk,
-					    priv->xl_array[local->list[index]],
-					    priv->xl_array[local->list[index]]->fops->truncate,
-					    loc,
-					    offset);
-				if (!--callcnt)
-					break;
-			}
-		}
-	}
-
-	return 0;
-}
-
-/**
- * unify_utimens - 
- */
-int32_t 
-unify_utimens (call_frame_t *frame,
-	       xlator_t *this,
-	       loc_t *loc,
-	       struct timespec tv[2])
-{
-	unify_local_t *local = NULL;
-	unify_private_t *priv = this->private;
-	int32_t index = 0;
-	int32_t callcnt = 0;
-  	uint64_t tmp_list = 0;
-  
-	UNIFY_CHECK_INODE_CTX_AND_UNWIND_ON_ERR (loc);
-
-	/* Initialization */
-	INIT_LOCAL (frame, local);
-	loc_copy (&local->loc1, loc);
-	local->st_ino = loc->inode->ino;
-
-	if (S_ISDIR (loc->inode->st_mode)) {
-		local->call_count = priv->child_count + 1;
-      
-		for (index = 0; index < (priv->child_count + 1); index++) {
-			STACK_WIND (frame,
-				    unify_buf_cbk,
-				    priv->xl_array[index],
-				    priv->xl_array[index]->fops->utimens,
-				    loc, tv);
-		}
-	} else {
-		inode_ctx_get (loc->inode, this, &tmp_list);
-		local->list = (int16_t *)(long)tmp_list;
-
-		for (index = 0; local->list[index] != -1; index++) {
-			local->call_count++;
-			callcnt++;
-		}
-      
-		for (index = 0; local->list[index] != -1; index++) {
-			STACK_WIND (frame,
-				    unify_buf_cbk,
-				    priv->xl_array[local->list[index]],
-				    priv->xl_array[local->list[index]]->fops->utimens,
-				    loc,
-				    tv);
-			if (!--callcnt)
-				break;
-		}
-	}
-  
-	return 0;
-}
-
-/**
- * unify_readlink_cbk - 
- */
-int32_t
-unify_readlink_cbk (call_frame_t *frame,
-		    void *cookie,
-		    xlator_t *this,
-		    int32_t op_ret,
-		    int32_t op_errno,
-		    const char *path)
-{
-	STACK_UNWIND (frame, op_ret, op_errno, path);
-	return 0;
-}
-
-/**
- * unify_readlink - Read the link only from the storage node.
- */
-int32_t
-unify_readlink (call_frame_t *frame,
-		xlator_t *this,
-		loc_t *loc,
-		size_t size)
-{
-	unify_private_t *priv = this->private;
-	int32_t entry_count = 0;
-	int16_t *list = NULL;
-	int16_t index = 0;
-  	uint64_t tmp_list = 0;
-  
-	UNIFY_CHECK_INODE_CTX_AND_UNWIND_ON_ERR (loc);
-	
-	inode_ctx_get (loc->inode, this, &tmp_list);
-	list = (int16_t *)(long)tmp_list;
-
-	for (index = 0; list[index] != -1; index++)
-		entry_count++;
-
-	if (entry_count >= 2) {
-		for (index = 0; list[index] != -1; index++) {
-			if (priv->xl_array[list[index]] != NS(this)) {
-				STACK_WIND (frame,
-					    unify_readlink_cbk,
-					    priv->xl_array[list[index]],
-					    priv->xl_array[list[index]]->fops->readlink,
-					    loc,
-					    size);
-				break;
-			}
-		}
-	} else {
-		gf_log (this->name, GF_LOG_ERROR, 
-			"returning ENOENT, no softlink files found "
-			"on storage node");
-		STACK_UNWIND (frame, -1, ENOENT, NULL);
-	}
-
-	return 0;
-}
-
-
-/**
- * unify_unlink_cbk - 
- */
-int32_t
-unify_unlink_cbk (call_frame_t *frame,
-		  void *cookie,
-		  xlator_t *this,
-		  int32_t op_ret,
-		  int32_t op_errno)
-{
-	int32_t callcnt = 0;
-	unify_private_t *priv = this->private;
-	unify_local_t *local = frame->local;
-
-	LOCK (&frame->lock);
-	{
-		callcnt = --local->call_count;
-		if (op_ret == 0  || ((op_errno == ENOENT) && priv->optimist))
-			local->op_ret = 0;
-		if (op_ret == -1)
-			local->op_errno = op_errno;
-	}
-	UNLOCK (&frame->lock);
-
-	if (!callcnt) {
-		unify_local_wipe (local);
-		STACK_UNWIND (frame, local->op_ret, local->op_errno);
-	}
-
-	return 0;
-}
-
-
-/**
- * unify_unlink - 
- */
-int32_t
-unify_unlink (call_frame_t *frame,
-	      xlator_t *this,
-	      loc_t *loc)
-{
-	unify_private_t *priv = this->private;
-	unify_local_t *local = NULL;
-	int16_t *list = NULL;
-	int16_t index = 0;
-  	uint64_t tmp_list = 0;
-
-	UNIFY_CHECK_INODE_CTX_AND_UNWIND_ON_ERR (loc);
-
-	/* Initialization */
-	INIT_LOCAL (frame, local);
-	loc_copy (&local->loc1, loc);
-
-	inode_ctx_get (loc->inode, this, &tmp_list);
-	list = (int16_t *)(long)tmp_list;
-
-	for (index = 0; list[index] != -1; index++)
-		local->call_count++;
-
-	if (local->call_count) {
-		for (index = 0; list[index] != -1; index++) {
-			char need_break = (list[index+1] == -1);
-			STACK_WIND (frame,
-				    unify_unlink_cbk,
-				    priv->xl_array[list[index]],
-				    priv->xl_array[list[index]]->fops->unlink,
-				    loc);
-			if (need_break)
-				break;
-		}
-	} else {
-		gf_log (this->name, GF_LOG_ERROR,
-			"%s: returning ENOENT", loc->path);
-		STACK_UNWIND (frame, -1, ENOENT);
-	}
-
-	return 0;
-}
-
-
-/**
- * unify_readv_cbk - 
- */
-int32_t
-unify_readv_cbk (call_frame_t *frame,
-		 void *cookie,
-		 xlator_t *this,
-		 int32_t op_ret,
-		 int32_t op_errno,
-		 struct iovec *vector,
-		 int32_t count,
-		 struct stat *stbuf)
-{
-	STACK_UNWIND (frame, op_ret, op_errno, vector, count, stbuf);
-	return 0;
-}
-
-/**
- * unify_readv - 
- */
-int32_t
-unify_readv (call_frame_t *frame,
-	     xlator_t *this,
-	     fd_t *fd,
-	     size_t size,
-	     off_t offset)
-{
-	UNIFY_CHECK_FD_CTX_AND_UNWIND_ON_ERR (fd);
-	xlator_t *child = NULL;
-	uint64_t tmp_child = 0;
-
-	fd_ctx_get (fd, this, &tmp_child);
-	child = (xlator_t *)(long)tmp_child;		     
-
-	STACK_WIND (frame,
-		    unify_readv_cbk,
-		    child,
-		    child->fops->readv,
-		    fd,
-		    size,
-		    offset);
-
-
-	return 0;
-}
-
-/**
- * unify_writev_cbk - 
- */
-int32_t
-unify_writev_cbk (call_frame_t *frame,
-		  void *cookie,
-		  xlator_t *this,
-		  int32_t op_ret,
-		  int32_t op_errno,
-		  struct stat *stbuf)
-{
-	STACK_UNWIND (frame, op_ret, op_errno, stbuf);
-	return 0;
-}
-
-/**
- * unify_writev - 
- */
-int32_t
-unify_writev (call_frame_t *frame,
-	      xlator_t *this,
-	      fd_t *fd,
-	      struct iovec *vector,
-	      int32_t count,
-	      off_t off)
-{
-	UNIFY_CHECK_FD_CTX_AND_UNWIND_ON_ERR (fd);
-	xlator_t *child = NULL;
-	uint64_t tmp_child = 0;
-
-	fd_ctx_get (fd, this, &tmp_child);
-	child = (xlator_t *)(long)tmp_child;		     
-
-	STACK_WIND (frame,
-		    unify_writev_cbk,
-		    child,
-		    child->fops->writev,
-		    fd,
-		    vector,
-		    count,
-		    off);
-
-	return 0;
-}
-
-/**
- * unify_ftruncate -
- */
-int32_t
-unify_ftruncate (call_frame_t *frame,
-		 xlator_t *this,
-		 fd_t *fd,
-		 off_t offset)
-{
-	xlator_t *child = NULL;
-	unify_local_t *local = NULL;
-	uint64_t tmp_child = 0;
-
-	UNIFY_CHECK_FD_CTX_AND_UNWIND_ON_ERR(fd);
-
-	/* Initialization */
-	INIT_LOCAL (frame, local);
-	local->op_ret = 0;
-
-	fd_ctx_get (fd, this, &tmp_child);
-	child = (xlator_t *)(long)tmp_child;		     
-
-	local->call_count = 2;
-  
-	STACK_WIND (frame, unify_truncate_cbk, 
-		    child, child->fops->ftruncate,
-		    fd, offset);
-  
-	STACK_WIND (frame, unify_truncate_cbk, 
-		    NS(this), NS(this)->fops->fstat,
-		    fd);
-  
-	return 0;
-}
-
-
-/**
- * unify_fchmod - 
- */
-int32_t 
-unify_fchmod (call_frame_t *frame,
-	      xlator_t *this,
-	      fd_t *fd,
-	      mode_t mode)
-{
-	unify_local_t *local = NULL;
-	xlator_t *child = NULL;	
-	uint64_t tmp_child = 0;
-
-	UNIFY_CHECK_FD_AND_UNWIND_ON_ERR(fd);
-
-	/* Initialization */
-	INIT_LOCAL (frame, local);
-	local->st_ino = fd->inode->ino;
-
-	if (!fd_ctx_get (fd, this, &tmp_child)) {
-		/* If its set, then its file */
-		child = (xlator_t *)(long)tmp_child;		     
-
-		local->call_count = 2;
-
-		STACK_WIND (frame, unify_buf_cbk, child, 
-			    child->fops->fchmod, fd, mode);
-
-		STACK_WIND (frame, unify_buf_cbk, NS(this),	
-			    NS(this)->fops->fchmod, fd, mode);
-
-	} else {
-		/* this is an directory */
-		local->call_count = 1;
-    
-		STACK_WIND (frame, unify_buf_cbk,
-			    NS(this), NS(this)->fops->fchmod, fd, mode);
-	}
-
-	return 0;
-}
-
-/**
- * unify_fchown - 
- */
-int32_t 
-unify_fchown (call_frame_t *frame,
-	      xlator_t *this,
-	      fd_t *fd,
-	      uid_t uid,
-	      gid_t gid)
-{
-	unify_local_t *local = NULL;
-	xlator_t *child = NULL;
-	uint64_t tmp_child = 0;
-
-	UNIFY_CHECK_FD_AND_UNWIND_ON_ERR(fd);
-
-	/* Initialization */
-	INIT_LOCAL (frame, local);
-	local->st_ino = fd->inode->ino;
-
-	if (!fd_ctx_get (fd, this, &tmp_child)) {
-		/* If its set, then its file */
-		child = (xlator_t *)(long)tmp_child;		     
-
-		local->call_count = 2;
-
-		STACK_WIND (frame, unify_buf_cbk, child,
-			    child->fops->fchown, fd, uid, gid);
-
-		STACK_WIND (frame, unify_buf_cbk, NS(this),
-			    NS(this)->fops->fchown,	fd, uid, gid);
-	} else {
-		local->call_count = 1;
-    
-		STACK_WIND (frame, unify_buf_cbk,
-			    NS(this), NS(this)->fops->fchown,
-			    fd, uid, gid);
-	}
-  
-	return 0;
-}
-
-/**
- * unify_flush_cbk - 
- */
-int32_t
-unify_flush_cbk (call_frame_t *frame,
-		 void *cookie,
-		 xlator_t *this,
-		 int32_t op_ret,
-		 int32_t op_errno)
-{
-	STACK_UNWIND (frame, op_ret, op_errno);
-	return 0;
-}
-
-/**
- * unify_flush -
- */
-int32_t
-unify_flush (call_frame_t *frame,
-	     xlator_t *this,
-	     fd_t *fd)
-{
-	UNIFY_CHECK_FD_CTX_AND_UNWIND_ON_ERR (fd);
-	xlator_t *child = NULL;
-	uint64_t tmp_child = 0;
-
-	fd_ctx_get (fd, this, &tmp_child);
-	child = (xlator_t *)(long)tmp_child;		     
-
-	STACK_WIND (frame, unify_flush_cbk, child, 
-		    child->fops->flush, fd);
-
-	return 0;
-}
-
-
-/**
- * unify_fsync_cbk - 
- */
-int32_t
-unify_fsync_cbk (call_frame_t *frame,
-		 void *cookie,
-		 xlator_t *this,
-		 int32_t op_ret,
-		 int32_t op_errno)
-{
-	STACK_UNWIND (frame, op_ret, op_errno);
-	return 0;
-}
-
-/**
- * unify_fsync - 
- */
-int32_t
-unify_fsync (call_frame_t *frame,
-	     xlator_t *this,
-	     fd_t *fd,
-	     int32_t flags)
-{
-	UNIFY_CHECK_FD_CTX_AND_UNWIND_ON_ERR (fd);
-	xlator_t *child = NULL;
-	uint64_t tmp_child = 0;
-
-	fd_ctx_get (fd, this, &tmp_child);
-	child = (xlator_t *)(long)tmp_child;		     
-
-	STACK_WIND (frame, unify_fsync_cbk, child,
-		    child->fops->fsync, fd, flags);
-
-	return 0;
-}
-
-/**
- * unify_fstat - Send fstat FOP to Namespace only if its directory, and to 
- *     both namespace and the storage node if its a file.
- */
-int32_t
-unify_fstat (call_frame_t *frame,
-	     xlator_t *this,
-	     fd_t *fd)
-{
-	unify_local_t *local = NULL;
-	xlator_t *child = NULL;
-	uint64_t tmp_child = 0;
-
-	UNIFY_CHECK_FD_AND_UNWIND_ON_ERR(fd);
-
-	INIT_LOCAL (frame, local);
-	local->st_ino = fd->inode->ino;
-
-	if (!fd_ctx_get (fd, this, &tmp_child)) {
-		/* If its set, then its file */
-		child = (xlator_t *)(long)tmp_child;		     
-		local->call_count = 2;
-
-		STACK_WIND (frame, unify_buf_cbk, child,
-			    child->fops->fstat, fd);
-
-		STACK_WIND (frame, unify_buf_cbk, NS(this),
-			    NS(this)->fops->fstat, fd);
-
-	} else {
-		/* this is an directory */
-		local->call_count = 1;
-		STACK_WIND (frame, unify_buf_cbk, NS(this),
-			    NS(this)->fops->fstat, fd);
-	}
-
-	return 0;
-}
-
-/**
- * unify_getdents_cbk - 
- */
-int32_t
-unify_getdents_cbk (call_frame_t *frame,
-		    void *cookie,
-		    xlator_t *this,
-		    int32_t op_ret,
-		    int32_t op_errno,
-		    dir_entry_t *entry,
-		    int32_t count)
-{
-	STACK_UNWIND (frame, op_ret, op_errno, entry, count);
-	return 0;
-}
-
-/**
- * unify_getdents - send the FOP request to all the nodes.
- */
-int32_t
-unify_getdents (call_frame_t *frame,
-		xlator_t *this,
-		fd_t *fd,
-		size_t size,
-		off_t offset,
-		int32_t flag)
-{
-	UNIFY_CHECK_FD_AND_UNWIND_ON_ERR (fd);
-
-	STACK_WIND (frame, unify_getdents_cbk, NS(this),
-		    NS(this)->fops->getdents, fd, size, offset, flag);
-
-	return 0;
-}
-
-
-/**
- * unify_readdir_cbk - 
- */
-int32_t
-unify_readdir_cbk (call_frame_t *frame,
-		   void *cookie,
-		   xlator_t *this,
-		   int32_t op_ret,
-		   int32_t op_errno,
-		   gf_dirent_t *buf)
-{
-	STACK_UNWIND (frame, op_ret, op_errno, buf);
-
-	return 0;
-}
-
-/**
- * unify_readdir - send the FOP request to all the nodes.
- */
-int32_t
-unify_readdir (call_frame_t *frame,
-	       xlator_t *this,
-	       fd_t *fd,
-	       size_t size,
-	       off_t offset)
-{
-	UNIFY_CHECK_FD_AND_UNWIND_ON_ERR (fd);
-
-	STACK_WIND (frame, unify_readdir_cbk, NS(this),
-		    NS(this)->fops->readdir, fd, size, offset);
-
-	return 0;
-}
-
-
-/**
- * unify_fsyncdir_cbk - 
- */
-int32_t
-unify_fsyncdir_cbk (call_frame_t *frame,
-		    void *cookie,
-		    xlator_t *this,
-		    int32_t op_ret,
-		    int32_t op_errno)
-{
-	STACK_UNWIND (frame, op_ret, op_errno);
-
-	return 0;
-}
-
-/**
- * unify_fsyncdir -
- */
-int32_t
-unify_fsyncdir (call_frame_t *frame,
-		xlator_t *this,
-		fd_t *fd,
-		int32_t flags)
-{
-	UNIFY_CHECK_FD_AND_UNWIND_ON_ERR (fd);
-
-	STACK_WIND (frame, unify_fsyncdir_cbk,
-		    NS(this), NS(this)->fops->fsyncdir, fd, flags);
-
-	return 0;
-}
-
-/**
- * unify_lk_cbk - UNWIND frame with the proper return arguments.
- */
-int32_t
-unify_lk_cbk (call_frame_t *frame,
-	      void *cookie,
-	      xlator_t *this,
-	      int32_t op_ret,
-	      int32_t op_errno,
-	      struct flock *lock)
-{
-	STACK_UNWIND (frame, op_ret, op_errno, lock);
-	return 0;
-}
-
-/**
- * unify_lk - Send it to all the storage nodes, (should be 1) which has file.
- */
-int32_t
-unify_lk (call_frame_t *frame,
-	  xlator_t *this,
-	  fd_t *fd,
-	  int32_t cmd,
-	  struct flock *lock)
-{
-	UNIFY_CHECK_FD_CTX_AND_UNWIND_ON_ERR (fd);
-	xlator_t *child = NULL;
-	uint64_t tmp_child = 0;
-
-	fd_ctx_get (fd, this, &tmp_child);
-	child = (xlator_t *)(long)tmp_child;		     
-
-	STACK_WIND (frame, unify_lk_cbk, child,
-		    child->fops->lk, fd, cmd, lock);
-
-	return 0;
-}
-
-
-int32_t
-unify_setxattr_cbk (call_frame_t *frame,
-		    void *cookie,
-		    xlator_t *this,
-		    int32_t op_ret,
-		    int32_t op_errno);
-
-static int32_t
-unify_setxattr_file_cbk (call_frame_t *frame,
-			 void *cookie,
-			 xlator_t *this,
-			 int32_t op_ret,
-			 int32_t op_errno)
-{
-	unify_private_t *private = this->private;
-	unify_local_t *local = frame->local;
-	xlator_t *sched_xl = NULL;
-	struct sched_ops *sched_ops = NULL;
-
-	if (op_ret == -1) {
-		if (!ENOTSUP)
-			gf_log (this->name, GF_LOG_ERROR,
-				"setxattr with XATTR_CREATE on ns: "
-				"path(%s) key(%s): %s",
-				local->loc1.path, local->name, 
-				strerror (op_errno));
-		unify_local_wipe (local);
-		STACK_UNWIND (frame, op_ret, op_errno);
-		return 0;
-	} 
-
-	LOCK (&frame->lock);
-	{
-		local->failed = 0;
-		local->op_ret = 0;
-		local->op_errno = 0;
-		local->call_count = 1;
-	}
-	UNLOCK (&frame->lock);
-
-	/* schedule XATTR_CREATE on one of the child node */
-	sched_ops = private->sched_ops;
-    
-	/* Send create request to the scheduled node now */
-	sched_xl = sched_ops->schedule (this, local->name); 
-	if (!sched_xl) {
-		STACK_UNWIND (frame, -1, ENOTCONN);
-		return 0;
-	}
-
-	STACK_WIND (frame,
-		    unify_setxattr_cbk,
-		    sched_xl,
-		    sched_xl->fops->setxattr,
-		    &local->loc1,
-		    local->dict,
-		    local->flags);
-	return 0;
-}
-
-/**
- * unify_setxattr_cbk - When all the child nodes return, UNWIND frame.
- */
-int32_t
-unify_setxattr_cbk (call_frame_t *frame,
-		    void *cookie,
-		    xlator_t *this,
-		    int32_t op_ret,
-		    int32_t op_errno)
-{
-	int32_t callcnt = 0;
-	unify_local_t *local = frame->local;
-	call_frame_t *prev_frame = cookie;
-	dict_t *dict = NULL;
-
-	LOCK (&frame->lock);
-	{
-		callcnt = --local->call_count;
-    
-		if (op_ret == -1) {
-			gf_log (this->name, (((op_errno == ENOENT) || 
-					      (op_errno == ENOTSUP))? 
-					     GF_LOG_DEBUG : GF_LOG_ERROR), 
-				"child(%s): path(%s): %s", 
-				prev_frame->this->name, 
-				(local->loc1.path)?local->loc1.path:"", 
-				strerror (op_errno));
-			if (local->failed == -1) {
-				local->failed = 1;
-			}
-			local->op_errno = op_errno;
-		} else {
-			local->failed = 0;
-			local->op_ret = op_ret;
-		}
-	}
-	UNLOCK (&frame->lock);
-  
-	if (!callcnt) {
-		if (local->failed && local->name && 
-		    ZR_FILE_CONTENT_REQUEST(local->name)) {      
-			dict = get_new_dict ();
-			dict_set (dict, local->dict->members_list->key, 
-				  data_from_dynptr(NULL, 0));
-			dict_ref (dict);
-
-			local->call_count = 1;
-
-			STACK_WIND (frame,
-				    unify_setxattr_file_cbk,
-				    NS(this),
-				    NS(this)->fops->setxattr,
-				    &local->loc1,
-				    dict,
-				    XATTR_CREATE);
-
-			dict_unref (dict);
-			return 0;
-		}
-    
-		unify_local_wipe (local);
-		STACK_UNWIND (frame, local->op_ret, local->op_errno);
-	}
-
-	return 0;
-}
-
-/**
- * unify_sexattr - This function should be sent to all the storage nodes, 
- *       which contains the file, (excluding namespace).
- */
-int32_t
-unify_setxattr (call_frame_t *frame,
-		xlator_t *this,
-		loc_t *loc,
-		dict_t *dict,
-		int32_t flags)
-{
-	unify_private_t *priv = this->private;
-	unify_local_t *local = NULL;
-	int16_t *list = NULL;
-	int16_t index = 0;
-	int32_t call_count = 0;
-  	uint64_t tmp_list = 0;
-	data_pair_t *trav = dict->members_list;
-
-	UNIFY_CHECK_INODE_CTX_AND_UNWIND_ON_ERR (loc);
-
-	/* Initialization */
-	INIT_LOCAL (frame, local);
-	local->failed = -1;
-	loc_copy (&local->loc1, loc);
-
-	if (S_ISDIR (loc->inode->st_mode)) {
-
-		if (trav && trav->key && ZR_FILE_CONTENT_REQUEST(trav->key)) {
-			/* direct the storage xlators to change file 
-			   content only if file exists */
-			local->flags = flags;
-			local->dict = dict;
-			local->name = strdup (trav->key);
-			flags |= XATTR_REPLACE;
-		}
-
-		local->call_count = priv->child_count;
-		for (index = 0; index < priv->child_count; index++) {
-			STACK_WIND (frame,
-				    unify_setxattr_cbk,
-				    priv->xl_array[index],
-				    priv->xl_array[index]->fops->setxattr,
-				    loc, dict, flags);
-		}
-		return 0;
-	}
-
-	inode_ctx_get (loc->inode, this, &tmp_list);
-	list = (int16_t *)(long)tmp_list;
-
-	for (index = 0; list[index] != -1; index++) {
-		if (NS(this) != priv->xl_array[list[index]]) {
-			local->call_count++;
-			call_count++;
-		}
-	}
-  
-	if (local->call_count) {
-		for (index = 0; list[index] != -1; index++) {
-			if (priv->xl_array[list[index]] != NS(this)) {
-				STACK_WIND (frame,
-					    unify_setxattr_cbk,
-					    priv->xl_array[list[index]],
-					    priv->xl_array[list[index]]->fops->setxattr,
-					    loc,
-					    dict,
-					    flags);
-				if (!--call_count)
-					break;
-			}
-		}
-		return 0;
-	}
-
-	/* No entry in storage nodes */
-	gf_log (this->name, GF_LOG_DEBUG, 
-		"returning ENOENT, file not found on storage node.");
-	STACK_UNWIND (frame, -1, ENOENT);
-
-	return 0;
-}
-
-
-/**
- * unify_getxattr_cbk - This function is called from only one child, so, no
- *     need of any lock or anything else, just send it to above layer 
- */
-int32_t
-unify_getxattr_cbk (call_frame_t *frame,
-		    void *cookie,
-		    xlator_t *this,
-		    int32_t op_ret,
-		    int32_t op_errno,
-		    dict_t *value)
-{
-	int32_t callcnt = 0;
-	dict_t *local_value = NULL;
-	unify_local_t *local = frame->local;
-	call_frame_t *prev_frame = cookie;
-  
-	LOCK (&frame->lock);
-	{
-		callcnt = --local->call_count;
-    
-		if (op_ret == -1) {
-			local->op_errno = op_errno;
-			gf_log (this->name, 
-				(((op_errno == ENOENT) || 
-				  (op_errno == ENODATA) || 
-				  (op_errno == ENOTSUP)) ? 
-				 GF_LOG_DEBUG : GF_LOG_ERROR), 
-				"child(%s): path(%s): %s", 
-				prev_frame->this->name, 
-				(local->loc1.path)?local->loc1.path:"", 
-				strerror (op_errno));
-		} else {
-			if (!local->dict)
-				local->dict = dict_ref (value);
-			local->op_ret = op_ret;
-		}
-	}
-	UNLOCK (&frame->lock);
-  
-	if (!callcnt) {
-		local_value = local->dict;
-		local->dict = NULL;
-      
-		STACK_UNWIND (frame, local->op_ret, local->op_errno, 
-			      local_value);
-      
-		if (local_value)
-			dict_unref (local_value);
-	} 
-
-	return 0;
-}
-
-
-/** 
- * unify_getxattr - This FOP is sent to only the storage node.
- */
-int32_t
-unify_getxattr (call_frame_t *frame,
-		xlator_t *this,
-		loc_t *loc,
-		const char *name)
-{
-	unify_private_t *priv = this->private;
-	int16_t *list = NULL;
-	int16_t index = 0;
-	int16_t count = 0;
-	unify_local_t *local = NULL;
-  	uint64_t tmp_list = 0;
-
-	UNIFY_CHECK_INODE_CTX_AND_UNWIND_ON_ERR (loc);
-	INIT_LOCAL (frame, local);
-
-	if (S_ISDIR (loc->inode->st_mode)) {
-		local->call_count = priv->child_count;
-		for (index = 0; index < priv->child_count; index++)
-			STACK_WIND (frame,
-				    unify_getxattr_cbk,
-				    priv->xl_array[index],
-				    priv->xl_array[index]->fops->getxattr,
-				    loc,
-				    name);
-		return 0;
-	}
-
-	inode_ctx_get (loc->inode, this, &tmp_list);
-	list = (int16_t *)(long)tmp_list;
-
-	for (index = 0; list[index] != -1; index++) {
-		if (NS(this) != priv->xl_array[list[index]]) {
-			local->call_count++;
-			count++;
-		}
-	}
-
-	if (count) {
-		for (index = 0; list[index] != -1; index++) {
-			if (priv->xl_array[list[index]] != NS(this)) {
-				STACK_WIND (frame,
-					    unify_getxattr_cbk,
-					    priv->xl_array[list[index]],
-					    priv->xl_array[list[index]]->fops->getxattr,
-					    loc,
-					    name);
-				if (!--count)
-					break;
-			}
-		}
-	} else {
-		dict_t *tmp_dict = get_new_dict ();
-		gf_log (this->name, GF_LOG_DEBUG, 
-			"%s: returning ENODATA, no file found on storage node",
-			loc->path);
-		STACK_UNWIND (frame, -1, ENODATA, tmp_dict);
-		dict_destroy (tmp_dict);
-	}
-
-	return 0;
-}
-
-/**
- * unify_removexattr_cbk - Wait till all the child node returns the call
- *      and then UNWIND to above layer.
- */
-int32_t
-unify_removexattr_cbk (call_frame_t *frame,
-		       void *cookie,
-		       xlator_t *this,
-		       int32_t op_ret,
-		       int32_t op_errno)
-{
-	int32_t callcnt = 0;
-	unify_local_t *local = frame->local;
-	call_frame_t *prev_frame = cookie;
-
-	LOCK (&frame->lock);
-	{ 
-		callcnt = --local->call_count;
-		if (op_ret == -1) {
-			local->op_errno = op_errno;
-			if (op_errno != ENOTSUP)
-				gf_log (this->name, GF_LOG_ERROR, 
-					"child(%s): path(%s): %s", 
-					prev_frame->this->name, 
-					local->loc1.path, strerror (op_errno));
-		} else {
-			local->op_ret = op_ret;
-		}
-	}
-	UNLOCK (&frame->lock);  
-
-	if (!callcnt) {
-		STACK_UNWIND (frame, local->op_ret, local->op_errno);
-	}
-
-	return 0;
-}
-
-/**
- * unify_removexattr - Send it to all the child nodes which has the files.
- */
-int32_t
-unify_removexattr (call_frame_t *frame,
-		   xlator_t *this,
-		   loc_t *loc,
-		   const char *name)
-{
-	unify_private_t *priv = this->private;
-	unify_local_t *local = NULL;
-	int16_t *list = NULL;
-	int16_t index = 0;
-	int32_t call_count = 0;
-  	uint64_t tmp_list = 0;
-
-	UNIFY_CHECK_INODE_CTX_AND_UNWIND_ON_ERR (loc);
-
-	/* Initialization */
-	INIT_LOCAL (frame, local);
-
-	if (S_ISDIR (loc->inode->st_mode)) {
-		local->call_count = priv->child_count;
-		for (index = 0; index < priv->child_count; index++)
-			STACK_WIND (frame, 		    
-				    unify_removexattr_cbk,
-				    priv->xl_array[index],
-				    priv->xl_array[index]->fops->removexattr,
-				    loc,
-				    name);
-
-		return 0;
-	}
-
-	inode_ctx_get (loc->inode, this, &tmp_list);
-	list = (int16_t *)(long)tmp_list;
-
-	for (index = 0; list[index] != -1; index++) {
-		if (NS(this) != priv->xl_array[list[index]]) {
-			local->call_count++;
-			call_count++;
-		}
-	}
-
-	if (local->call_count) {
-		for (index = 0; list[index] != -1; index++) {
-			if (priv->xl_array[list[index]] != NS(this)) {
-				STACK_WIND (frame,
-					    unify_removexattr_cbk,
-					    priv->xl_array[list[index]],
-					    priv->xl_array[list[index]]->fops->removexattr,
-					    loc,
-					    name);
-				if (!--call_count)
-					break;
-			}
-		}
-		return 0;
-	} 
-
-	gf_log (this->name, GF_LOG_DEBUG, 
-		"%s: returning ENOENT, not found on storage node.", loc->path);
-	STACK_UNWIND (frame, -1, ENOENT);
-
-	return 0;
-}
-
-
-int32_t 
-unify_mknod_unlink_cbk (call_frame_t *frame,
-			void *cookie,
-			xlator_t *this,
-			int32_t op_ret,
-			int32_t op_errno)
-{
-	unify_local_t *local = frame->local;
-
-	if (op_ret == -1)
-		gf_log (this->name, GF_LOG_ERROR, 
-			"%s: %s", local->loc1.path, strerror (op_errno));
-  
-	unify_local_wipe (local);
-	/* No log required here as this -1 is for mknod call */
-	STACK_UNWIND (frame, -1, local->op_errno, NULL, NULL);
-	return 0;
-}
-
-/**
- * unify_mknod_cbk - 
- */
-int32_t
-unify_mknod_cbk (call_frame_t *frame,
-		 void *cookie,
-		 xlator_t *this,
-		 int32_t op_ret,
-		 int32_t op_errno,
-		 inode_t *inode,
-		 struct stat *buf)
-{
-	unify_local_t *local = frame->local;
-
-	if (op_ret == -1) {
-		gf_log (this->name, GF_LOG_ERROR, 
-			"mknod failed on storage node, sending unlink to "
-			"namespace");
-		local->op_errno = op_errno;
-		STACK_WIND (frame,
-			    unify_mknod_unlink_cbk,
-			    NS(this),
-			    NS(this)->fops->unlink,
-			    &local->loc1);
-		return 0;
-	}
-  
-	local->stbuf = *buf;
-	local->stbuf.st_ino = local->st_ino;
-	unify_local_wipe (local);
-	STACK_UNWIND (frame, op_ret, op_errno, inode, &local->stbuf);
-	return 0;
-}
-
-/**
- * unify_ns_mknod_cbk - 
- */
-int32_t
-unify_ns_mknod_cbk (call_frame_t *frame,
-		    void *cookie,
-		    xlator_t *this,
-		    int32_t op_ret,
-		    int32_t op_errno,
-		    inode_t *inode,
-		    struct stat *buf)
-{
-	struct sched_ops *sched_ops = NULL;
-	xlator_t *sched_xl = NULL;
-	unify_local_t *local = frame->local;
-	unify_private_t *priv = this->private;
-	int16_t *list = NULL;
-	int16_t index = 0;
-	call_frame_t *prev_frame = cookie;
-
-	if (op_ret == -1) {
-		/* No need to send mknod request to other servers, 
-		 * as namespace action failed 
-		 */
-		gf_log (this->name, GF_LOG_ERROR, 
-			"child(%s): path(%s): %s", 
-			prev_frame->this->name, local->loc1.path, 
-			strerror (op_errno));
-		unify_local_wipe (local);
-		STACK_UNWIND (frame, op_ret, op_errno, inode, buf);
-		return 0;
-	}
-  
-	/* Create one inode for this entry */
-	local->op_ret = 0;
-	local->stbuf = *buf;
-	local->st_ino = buf->st_ino;
-
-	list = CALLOC (1, sizeof (int16_t) * 3);
-	ERR_ABORT (list);
-	list[0] = priv->child_count;
-	list[2] = -1;
-	inode_ctx_put (inode, this, (uint64_t)(long)list);
-
-	sched_ops = priv->sched_ops;
-
-	/* Send mknod request to scheduled node now */
-	sched_xl = sched_ops->schedule (this, local->loc1.path); 
-	if (!sched_xl) {
-		gf_log (this->name, GF_LOG_ERROR, 
-			"mknod failed on storage node, no node online "
-			"at the moment, sending unlink to NS");
-		local->op_errno = ENOTCONN;
-		STACK_WIND (frame,
-			    unify_mknod_unlink_cbk,
-			    NS(this),
-			    NS(this)->fops->unlink,
-			    &local->loc1);
-      
-		return 0;
-	}
-
-	for (index = 0; index < priv->child_count; index++)
-		if (sched_xl == priv->xl_array[index])
-			break;
-	list[1] = index;
-  
-	STACK_WIND (frame,  unify_mknod_cbk,
-		    sched_xl,  sched_xl->fops->mknod,
-		    &local->loc1, local->mode, local->dev);
-
-	return 0;
-}
-
-/**
- * unify_mknod - Create a device on namespace first, and later create on 
- *       the storage node.
- */
-int32_t
-unify_mknod (call_frame_t *frame,
-	     xlator_t *this,
-	     loc_t *loc,
-	     mode_t mode,
-	     dev_t rdev)
-{
-	unify_local_t *local = NULL;
-  
-	/* Initialization */
-	INIT_LOCAL (frame, local);
-	local->mode = mode;
-	local->dev = rdev;
-	loc_copy (&local->loc1, loc);
-	if (local->loc1.path == NULL) {
-		gf_log (this->name, GF_LOG_CRITICAL, "Not enough memory :O");
-		STACK_UNWIND (frame, -1, ENOMEM, loc->inode, NULL);
-		return 0;
-	}
-
-	STACK_WIND (frame,
-		    unify_ns_mknod_cbk,
-		    NS(this),
-		    NS(this)->fops->mknod,
-		    loc,
-		    mode,
-		    rdev);
-
-	return 0;
-}
-
-int32_t 
-unify_symlink_unlink_cbk (call_frame_t *frame,
-			  void *cookie,
-			  xlator_t *this,
-			  int32_t op_ret,
-			  int32_t op_errno)
-{
-	unify_local_t *local = frame->local;
-	if (op_ret == -1)
-		gf_log (this->name, GF_LOG_ERROR, 
-			"%s: %s", local->loc1.path, strerror (op_errno));
-
-	unify_local_wipe (local);
-	STACK_UNWIND (frame, -1, local->op_errno, NULL, NULL);
-	return 0;
-}
-
-/**
- * unify_symlink_cbk - 
- */
-int32_t
-unify_symlink_cbk (call_frame_t *frame,
-		   void *cookie,
-		   xlator_t *this,
-		   int32_t op_ret,
-		   int32_t op_errno,
-		   inode_t *inode,
-		   struct stat *buf)
-{
-	unify_local_t *local = frame->local;
-
-	if (op_ret == -1) {
-		/* Symlink on storage node failed, hence send unlink 
-		   to the NS node */
-		local->op_errno = op_errno;
-		gf_log (this->name, GF_LOG_ERROR, 
-			"symlink on storage node failed, sending unlink "
-			"to namespace");
-
-		STACK_WIND (frame,
-			    unify_symlink_unlink_cbk,
-			    NS(this),
-			    NS(this)->fops->unlink,
-			    &local->loc1);
-    
-		return 0;
-	}
-  
-	local->stbuf = *buf;
-	local->stbuf.st_ino = local->st_ino;
-	unify_local_wipe (local);
-	STACK_UNWIND (frame, op_ret, op_errno, inode, &local->stbuf);
-
-	return 0;
-}
-
-/**
- * unify_ns_symlink_cbk - 
- */
-int32_t
-unify_ns_symlink_cbk (call_frame_t *frame,
-		      void *cookie,
-		      xlator_t *this,
-		      int32_t op_ret,
-		      int32_t op_errno,
-		      inode_t *inode,
-		      struct stat *buf)
-{
-
-	struct sched_ops *sched_ops = NULL;
-	xlator_t *sched_xl = NULL;
-	int16_t *list = NULL;
-	unify_local_t *local = frame->local;
-	unify_private_t *priv = this->private;
-	int16_t index = 0;
-
-	if (op_ret == -1) {
-		/* No need to send symlink request to other servers, 
-		 * as namespace action failed 
-		 */
-		gf_log (this->name, GF_LOG_ERROR, 
-			"namespace: path(%s): %s", 
-			local->loc1.path, strerror (op_errno));
-		unify_local_wipe (local);
-		STACK_UNWIND (frame, op_ret, op_errno, NULL, buf);
-		return 0;
-	}
-  
-	/* Create one inode for this entry */
-	local->op_ret = 0;
-	local->st_ino = buf->st_ino;
-  
-	/* Start the mapping list */
-
-	list = CALLOC (1, sizeof (int16_t) * 3);
-	ERR_ABORT (list);
-	list[0] = priv->child_count; //namespace's index
-	list[2] = -1;
-	inode_ctx_put (inode, this, (uint64_t)(long)list);
-
-	sched_ops = priv->sched_ops;
-
-	/* Send symlink request to all the nodes now */
-	sched_xl = sched_ops->schedule (this, local->loc1.path); 
-	if (!sched_xl) {
-		/* Symlink on storage node failed, hence send unlink 
-		   to the NS node */
-		local->op_errno = ENOTCONN;
-		gf_log (this->name, GF_LOG_ERROR, 
-			"symlink on storage node failed, no node online, "
-			"sending unlink to namespace");
-      
-		STACK_WIND (frame,
-			    unify_symlink_unlink_cbk,
-			    NS(this),
-			    NS(this)->fops->unlink,
-			    &local->loc1);
-      
-		return 0;
-	}
-
-	for (index = 0; index < priv->child_count; index++)
-		if (sched_xl == priv->xl_array[index])
-			break;
-	list[1] = index;
-	
-	STACK_WIND (frame,
-		    unify_symlink_cbk,
-		    sched_xl,
-		    sched_xl->fops->symlink,
-		    local->name,
-		    &local->loc1);
-
-	return 0;
-}
-
-/**
- * unify_symlink - 
- */
-int32_t
-unify_symlink (call_frame_t *frame,
-	       xlator_t *this,
-	       const char *linkpath,
-	       loc_t *loc)
-{
-	unify_local_t *local = NULL;
-  
-	/* Initialization */
-	INIT_LOCAL (frame, local);
-	loc_copy (&local->loc1, loc);
-	local->name = strdup (linkpath);
-
-	if ((local->name == NULL) || 
-	    (local->loc1.path == NULL)) {
-		gf_log (this->name, GF_LOG_CRITICAL, "Not enough memory :O");
-		STACK_UNWIND (frame, -1, ENOMEM, loc->inode, NULL);
-		return 0;
-	}
-
-	STACK_WIND (frame,
-		    unify_ns_symlink_cbk,
-		    NS(this),
-		    NS(this)->fops->symlink,
-		    linkpath,
-		    loc);
-
-	return 0;
-}
-
-
-int32_t 
-unify_rename_unlink_cbk (call_frame_t *frame,
-			 void *cookie,
-			 xlator_t *this,
-			 int32_t op_ret,
-			 int32_t op_errno)
-{
-	int32_t callcnt = 0;
-	unify_local_t *local = frame->local;
-	call_frame_t *prev_frame = cookie;
-  
-	if (op_ret == -1) {
-		gf_log (this->name, GF_LOG_ERROR, 
-			"child(%s): path(%s -> %s): %s", 
-			prev_frame->this->name, 
-			local->loc1.path, local->loc2.path, 
-			strerror (op_errno));
-      
-	}
-	LOCK (&frame->lock);
-	{
-		callcnt = --local->call_count;
-	}
-	UNLOCK (&frame->lock);
-
-	if (!callcnt) {
-		local->stbuf.st_ino = local->st_ino;
-		unify_local_wipe (local);
-		STACK_UNWIND (frame, local->op_ret, local->op_errno, 
-			      &local->stbuf);
-	}
-	return 0;
-}
-
-int32_t 
-unify_ns_rename_undo_cbk (call_frame_t *frame,
-			  void *cookie,
-			  xlator_t *this,
-			  int32_t op_ret,
-			  int32_t op_errno,
-			  struct stat *buf)
-{
-	unify_local_t *local = frame->local;
-
-	if (op_ret == -1) {
-		gf_log (this->name, GF_LOG_ERROR, 
-			"namespace: path(%s -> %s): %s", 
-			local->loc1.path, local->loc2.path, 
-			strerror (op_errno));
-	}
-
-	local->stbuf.st_ino = local->st_ino;
-	unify_local_wipe (local);
-	STACK_UNWIND (frame, local->op_ret, local->op_errno, &local->stbuf);
-	return 0;
-}
-
-int32_t 
-unify_rename_cbk (call_frame_t *frame,
-		  void *cookie,
-		  xlator_t *this,
-		  int32_t op_ret,
-		  int32_t op_errno,
-		  struct stat *buf)
-{
-	int32_t index = 0;
-	int32_t callcnt = 0;
-	int16_t *list = NULL;
-	unify_private_t *priv = this->private;
-	unify_local_t *local = frame->local;
-	call_frame_t *prev_frame = cookie;
-  
-	LOCK (&frame->lock);
-	{
-		callcnt = --local->call_count;
-		if (op_ret >= 0) {
-			if (!S_ISDIR (buf->st_mode))
-				local->stbuf = *buf;
-			local->op_ret = op_ret;
-		} else {
-			gf_log (this->name, GF_LOG_ERROR, 
-				"child(%s): path(%s -> %s): %s", 
-				prev_frame->this->name, 
-				local->loc1.path, local->loc2.path, 
-				strerror (op_errno));
-			local->op_errno = op_errno;
-		}
-	}
-	UNLOCK (&frame->lock);
-
-	if (!callcnt) {
-		local->stbuf.st_ino = local->st_ino;
-		if (S_ISDIR (local->loc1.inode->st_mode)) {
-			unify_local_wipe (local);
-			STACK_UNWIND (frame, local->op_ret, local->op_errno, &local->stbuf);
-			return 0;
-		}
-
-		if (local->op_ret == -1) {
-			/* TODO: check this logic */
-
-			/* Rename failed in storage node, successful on NS, 
-			 * hence, rename back the entries in NS */
-			/* NOTE: this will be done only if the destination 
-			 * doesn't exists, if  the destination exists, the 
-			 * job of correcting NS is left to self-heal
-			 */
-			if (!local->index) {
-				loc_t tmp_oldloc = {
-                                        /* its actual 'newloc->path' */
-					.path = local->loc2.path, 
-					.inode = local->loc1.inode,
-					.parent = local->loc2.parent
-				};
-	
-				loc_t tmp_newloc = {
-					/* Actual 'oldloc->path' */
-					.path = local->loc1.path,
-					.parent = local->loc1.parent
-				};
-
-				gf_log (this->name, GF_LOG_ERROR, 
-					"rename succussful on namespace, on "
-					"stroage node failed, reverting back");
-
-				STACK_WIND (frame,
-					    unify_ns_rename_undo_cbk,
-					    NS(this),
-					    NS(this)->fops->rename,
-					    &tmp_oldloc,
-					    &tmp_newloc);
-				return 0;
-			}
-		} else {
-			/* Rename successful on storage nodes */
-
-			int32_t idx = 0;
-			int16_t *tmp_list = NULL;
-			uint64_t tmp_list_int64 = 0;
-			if (local->loc2.inode) {
-				inode_ctx_get (local->loc2.inode, 
-					       this, &tmp_list_int64);
-				list = (int16_t *)(long)tmp_list_int64;
-
-			}
-
-			if (list) {				
-				for (index = 0; list[index] != -1; index++);
-				tmp_list = CALLOC (1, index * 2);
-				memcpy (tmp_list, list, index * 2);
-
-				for (index = 0; list[index] != -1; index++) {
-					/* TODO: Check this logic. */
-					/* If the destination file exists in 
-					 * the same storage node where we sent
-					 * 'rename' call, no need to send 
-					 * unlink 
-					 */
-					for (idx = 0; 
-					     local->list[idx] != -1; idx++) {
-						if (tmp_list[index] == local->list[idx]) {
-							tmp_list[index] = priv->child_count;
-							continue;
-						}
-					}
-	  
-					if (NS(this) != priv->xl_array[tmp_list[index]]) {
-						local->call_count++;
-						callcnt++;
-					}
-				}
-
-				if (local->call_count) {
-					if (callcnt > 1)
-						gf_log (this->name, 
-							GF_LOG_ERROR, 
-							"%s->%s: more (%d) "
-							"subvolumes have the "
-							"newloc entry", 
-							local->loc1.path, 
-							local->loc2.path, 
-							callcnt);
-
-					for (index=0; 
-					     tmp_list[index] != -1; index++) {
-						if (NS(this) != priv->xl_array[tmp_list[index]]) {		    
-							STACK_WIND (frame,
-								    unify_rename_unlink_cbk,
-								    priv->xl_array[tmp_list[index]],
-								    priv->xl_array[tmp_list[index]]->fops->unlink,
-								    &local->loc2);
-							if (!--callcnt)
-								break;
-						}
-					}
-
-					FREE (tmp_list);
-					return 0;
-				}
-				if (tmp_list)
-					FREE (tmp_list);
-			}
-		}
-    
-		/* Need not send 'unlink' to storage node */
-		unify_local_wipe (local);
-		STACK_UNWIND (frame, local->op_ret, 
-			      local->op_errno, &local->stbuf);
-	}
-
-	return 0;
-}
-
-int32_t 
-unify_ns_rename_cbk (call_frame_t *frame,
-		     void *cookie,
-		     xlator_t *this,
-		     int32_t op_ret,
-		     int32_t op_errno,
-		     struct stat *buf)
-{
-	int32_t index = 0;
-	int32_t callcnt = 0;
-	int16_t *list = NULL;
-	unify_private_t *priv = this->private;
-	unify_local_t *local = frame->local;
-
-	if (op_ret == -1) {
-		/* Free local->new_inode */
-		gf_log (this->name, GF_LOG_ERROR, 
-			"namespace: path(%s -> %s): %s", 
-			local->loc1.path, local->loc2.path, 
-			strerror (op_errno));
-
-		unify_local_wipe (local);
-		STACK_UNWIND (frame, op_ret, op_errno, buf);
-		return 0;
-	}
-
-	local->stbuf = *buf;
-	local->st_ino = buf->st_ino;
-
-	/* Everything is fine. */
-	if (S_ISDIR (buf->st_mode)) {
-		local->call_count = priv->child_count;
-		for (index=0; index < priv->child_count; index++) {
-			STACK_WIND (frame,
-				    unify_rename_cbk,
-				    priv->xl_array[index],
-				    priv->xl_array[index]->fops->rename,
-				    &local->loc1,
-				    &local->loc2);
-		}
-
-		return 0;
-	}
-
-	local->call_count = 0;  
-	/* send rename */
-	list = local->list;
-	for (index=0; list[index] != -1; index++) {
-		if (NS(this) != priv->xl_array[list[index]]) {
-			local->call_count++;
-			callcnt++;
-		}
-	}
-
-	if (local->call_count) {
-		for (index=0; list[index] != -1; index++) {
-			if (NS(this) != priv->xl_array[list[index]]) {
-				STACK_WIND (frame,
-					    unify_rename_cbk,
-					    priv->xl_array[list[index]],
-					    priv->xl_array[list[index]]->fops->rename,
-					    &local->loc1,
-					    &local->loc2);
-				if (!--callcnt)
-					break;
-			}
-		}
-	} else {
-		/* file doesn't seem to be present in storage nodes */
-		gf_log (this->name, GF_LOG_CRITICAL,
-			"CRITICAL: source file not in storage node, "
-			"rename successful on namespace :O");
-		unify_local_wipe (local);
-		STACK_UNWIND (frame, -1, EIO, NULL);
-	}
-	return 0;
-}
-
-
-/**
- * unify_rename - One of the tricky function. The deadliest of all :O
- */
-int32_t
-unify_rename (call_frame_t *frame,
-	      xlator_t *this,
-	      loc_t *oldloc,
-	      loc_t *newloc)
-{
-	unify_local_t *local = NULL;
-  	uint64_t tmp_list = 0;
-
-	/* Initialization */
-	INIT_LOCAL (frame, local);
-	loc_copy (&local->loc1, oldloc);
-	loc_copy (&local->loc2, newloc);
-
-	if ((local->loc1.path == NULL) || 
-	    (local->loc2.path == NULL)) {
-		gf_log (this->name, GF_LOG_CRITICAL, "Not enough memory :O");
-		STACK_UNWIND (frame, -1, ENOMEM, NULL);
-		return 0;
-	}
-  
-	inode_ctx_get (oldloc->inode, this, &tmp_list);
-	local->list = (int16_t *)(long)tmp_list;
-
-	STACK_WIND (frame,
-		    unify_ns_rename_cbk,
-		    NS(this),
-		    NS(this)->fops->rename,
-		    oldloc,
-		    newloc);
-	return 0;
-}
-
-/**
- * unify_link_cbk -
- */
-int32_t
-unify_link_cbk (call_frame_t *frame,
-		void *cookie,
-		xlator_t *this,
-		int32_t op_ret,
-		int32_t op_errno,
-		inode_t *inode,
-		struct stat *buf)
-{
-	unify_local_t *local = frame->local;
-
-	if (op_ret >= 0) 
-		local->stbuf = *buf;
-	local->stbuf.st_ino = local->st_ino;
-
-	unify_local_wipe (local);
-	STACK_UNWIND (frame, op_ret, op_errno, inode, &local->stbuf);
-
-	return 0;
-}
-
-/**
- * unify_ns_link_cbk - 
- */
-int32_t
-unify_ns_link_cbk (call_frame_t *frame,
-		   void *cookie,
-		   xlator_t *this,
-		   int32_t op_ret,
-		   int32_t op_errno,
-		   inode_t *inode,
-		   struct stat *buf)
-{
-	unify_private_t *priv = this->private;
-	unify_local_t *local = frame->local;
-	int16_t *list = local->list;
-	int16_t index = 0;
-
-	if (op_ret == -1) {
-		/* No need to send link request to other servers, 
-		 * as namespace action failed 
-		 */
-		gf_log (this->name, GF_LOG_ERROR, 
-			"namespace: path(%s -> %s): %s", 
-			local->loc1.path, local->loc2.path, 
-			strerror (op_errno));
-		unify_local_wipe (local);
-		STACK_UNWIND (frame, op_ret, op_errno, inode, buf);
-		return 0;
-	}
-
-	/* Update inode for this entry */
-	local->op_ret = 0;
-	local->st_ino = buf->st_ino;
-
-	/* Send link request to the node now */
-	for (index = 0; list[index] != -1; index++) {
-		char need_break = (list[index+1] == -1);
-		if (priv->xl_array[list[index]] != NS (this)) {
-			STACK_WIND (frame,
-				    unify_link_cbk,
-				    priv->xl_array[list[index]],
-				    priv->xl_array[list[index]]->fops->link,
-				    &local->loc1,
-				    &local->loc2);
-		}
-		if (need_break)
-			break;
-	}
-
-	return 0;
-}
-
-/**
- * unify_link - 
- */
-int32_t
-unify_link (call_frame_t *frame,
-	    xlator_t *this,
-	    loc_t *oldloc,
-	    loc_t *newloc)
-{
-	unify_local_t *local = NULL;
-  	uint64_t tmp_list = 0;
-
-	UNIFY_CHECK_INODE_CTX_AND_UNWIND_ON_ERR (oldloc);
-	UNIFY_CHECK_INODE_CTX_AND_UNWIND_ON_ERR (newloc);
-
-	/* Initialization */
-	INIT_LOCAL (frame, local);
-
-	loc_copy (&local->loc1, oldloc);
-	loc_copy (&local->loc2, newloc);
-
-	inode_ctx_get (oldloc->inode, this, &tmp_list);
-	local->list = (int16_t *)(long)tmp_list;
-
-	STACK_WIND (frame,
-		    unify_ns_link_cbk,
-		    NS(this),
-		    NS(this)->fops->link,
-		    oldloc,
-		    newloc);
-
-	return 0;
-}
-
-
-/**
- * unify_checksum_cbk - 
- */
-int32_t
-unify_checksum_cbk (call_frame_t *frame,
-		    void *cookie,
-		    xlator_t *this,
-		    int32_t op_ret,
-		    int32_t op_errno,
-		    uint8_t *fchecksum,
-		    uint8_t *dchecksum)
-{
-	STACK_UNWIND (frame, op_ret, op_errno, fchecksum, dchecksum);
-
-	return 0;
-}
-
-/**
- * unify_checksum - 
- */
-int32_t
-unify_checksum (call_frame_t *frame,
-		xlator_t *this,
-		loc_t *loc,
-		int32_t flag)
-{
-	STACK_WIND (frame,
-		    unify_checksum_cbk,
-		    NS(this),
-		    NS(this)->fops->checksum,
-		    loc,
-		    flag);
-
-	return 0;
-}
-
-
-/**
- * unify_finodelk_cbk - 
- */
-int
-unify_finodelk_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
-		    int32_t op_ret, int32_t op_errno)
-{
-	STACK_UNWIND (frame, op_ret, op_errno);
-	return 0;
-}
-
-/**
- * unify_finodelk
- */
-int
-unify_finodelk (call_frame_t *frame, xlator_t *this,
-		fd_t *fd, int cmd, struct flock *flock)
-{
-	UNIFY_CHECK_FD_CTX_AND_UNWIND_ON_ERR (fd);
-	xlator_t *child = NULL;
-	uint64_t tmp_child = 0;
-
-	fd_ctx_get (fd, this, &tmp_child);
-	child = (xlator_t *)(long)tmp_child;		     
-
-	STACK_WIND (frame, unify_finodelk_cbk,
-		    child, child->fops->finodelk,
-		    fd, cmd, flock);
-
-	return 0;
-}
-
-
-
-/**
- * unify_fentrylk_cbk - 
- */
-int
-unify_fentrylk_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
-		    int32_t op_ret, int32_t op_errno)
-{
-	STACK_UNWIND (frame, op_ret, op_errno);
-	return 0;
-}
-
-/**
- * unify_fentrylk
- */
-int
-unify_fentrylk (call_frame_t *frame, xlator_t *this,
-		fd_t *fd, const char *basename,
-		entrylk_cmd cmd, entrylk_type type)
-		
-{
-	UNIFY_CHECK_FD_CTX_AND_UNWIND_ON_ERR (fd);
-	xlator_t *child = NULL;
-	uint64_t tmp_child = 0;
-
-	fd_ctx_get (fd, this, &tmp_child);
-	child = (xlator_t *)(long)tmp_child;		     
-
-	STACK_WIND (frame, unify_fentrylk_cbk,
-		    child, child->fops->fentrylk,
-		    fd, basename, cmd, type);
-
-	return 0;
-}
-
-
-
-/**
- * unify_fxattrop_cbk - 
- */
-int
-unify_fxattrop_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
-		    int32_t op_ret, int32_t op_errno, dict_t *xattr)
-{
-	STACK_UNWIND (frame, op_ret, op_errno, xattr);
-	return 0;
-}
-
-/**
- * unify_fxattrop
- */
-int
-unify_fxattrop (call_frame_t *frame, xlator_t *this,
-		fd_t *fd, gf_xattrop_flags_t optype, dict_t *xattr)
-{
-	UNIFY_CHECK_FD_CTX_AND_UNWIND_ON_ERR (fd);
-	xlator_t *child = NULL;
-	uint64_t tmp_child = 0;
-
-	fd_ctx_get (fd, this, &tmp_child);
-	child = (xlator_t *)(long)tmp_child;		     
-
-	STACK_WIND (frame, unify_fxattrop_cbk,
-		    child, child->fops->fxattrop,
-		    fd, optype, xattr);
-
-	return 0;
-}
-
-
-/**
- * unify_inodelk_cbk - 
- */
-int
-unify_inodelk_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
-		   int32_t op_ret, int32_t op_errno)
-{
-	STACK_UNWIND (frame, op_ret, op_errno);
-	return 0;
-}
-
-
-/**
- * unify_inodelk
- */
-int
-unify_inodelk (call_frame_t *frame, xlator_t *this,
-	       loc_t *loc, int cmd, struct flock *flock)
-{
-	xlator_t *child = NULL;
-
-	child = unify_loc_subvol (loc, this);
-
-	STACK_WIND (frame, unify_inodelk_cbk,
-		    child, child->fops->inodelk,
-		    loc, cmd, flock);
-
-	return 0;
-}
-
-
-
-/**
- * unify_entrylk_cbk - 
- */
-int
-unify_entrylk_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
-		   int32_t op_ret, int32_t op_errno)
-{
-	STACK_UNWIND (frame, op_ret, op_errno);
-	return 0;
-}
-
-/**
- * unify_entrylk
- */
-int
-unify_entrylk (call_frame_t *frame, xlator_t *this,
-	       loc_t *loc, const char *basename,
-	       entrylk_cmd cmd, entrylk_type type)
-		
-{
-	xlator_t *child = NULL;
-
-	child = unify_loc_subvol (loc, this);
-
-	STACK_WIND (frame, unify_entrylk_cbk,
-		    child, child->fops->entrylk,
-		    loc, basename, cmd, type);
-
-	return 0;
-}
-
-
-
-/**
- * unify_xattrop_cbk - 
- */
-int
-unify_xattrop_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
-		   int32_t op_ret, int32_t op_errno, dict_t *xattr)
-{
-	STACK_UNWIND (frame, op_ret, op_errno, xattr);
-	return 0;
-}
-
-/**
- * unify_xattrop
- */
-int
-unify_xattrop (call_frame_t *frame, xlator_t *this,
-		loc_t *loc, gf_xattrop_flags_t optype, dict_t *xattr)
-{
-	xlator_t *child = NULL;
-
-	child = unify_loc_subvol (loc, this);
-
-	STACK_WIND (frame, unify_xattrop_cbk,
-		    child, child->fops->xattrop,
-		    loc, optype, xattr);
-
-	return 0;
-}
-
-
-/**
- * notify
- */
-int32_t
-notify (xlator_t *this,
-        int32_t event,
-        void *data,
-        ...)
-{
-	unify_private_t *priv = this->private;
-	struct sched_ops *sched = NULL;
-
-	if (!priv) {
-		return 0;
-	}
-
-	sched = priv->sched_ops;    
-	if (!sched) {
-		gf_log (this->name, GF_LOG_CRITICAL, "No scheduler :O");
-		raise (SIGTERM);
-		return 0;
-	}
-	if (priv->namespace == data) {
-		if (event == GF_EVENT_CHILD_UP) {
-			sched->notify (this, event, data);
-		}
-		return 0;
-	}
-
-	switch (event)
-	{
-	case GF_EVENT_CHILD_UP:
-	{
-		/* Call scheduler's update () to enable it for scheduling */
-		sched->notify (this, event, data);
-	
-		LOCK (&priv->lock);
-		{
-			/* Increment the inode's generation, which is 
-			   used for self_heal */
-			++priv->inode_generation;
-			++priv->num_child_up;
-		}
-		UNLOCK (&priv->lock);
-
-		if (!priv->is_up) {
-			default_notify (this, event, data);
-			priv->is_up = 1;
-		}
-	}
-	break;
-	case GF_EVENT_CHILD_DOWN:
-	{
-		/* Call scheduler's update () to disable the child node 
-		 * for scheduling
-		 */
-		sched->notify (this, event, data);
-		LOCK (&priv->lock);
-		{
-			--priv->num_child_up;
-		}
-		UNLOCK (&priv->lock);
-
-		if (priv->num_child_up == 0) {
-			/* Send CHILD_DOWN to upper layer */
-			default_notify (this, event, data);
-			priv->is_up = 0;
-		}
-	}
-	break;
-
-	default:
-	{
-		default_notify (this, event, data);
-	}
-	break;
-	}
-
-	return 0;
-}
-
-/** 
- * init - This function is called first in the xlator, while initializing.
- *   All the config file options are checked and appropriate flags are set.
- *
- * @this - 
- */
-int32_t 
-init (xlator_t *this)
-{
-	int32_t          ret       = 0;
-	int32_t          count     = 0;
-	data_t          *scheduler = NULL;
-	data_t          *data      = NULL;
-	xlator_t        *ns_xl     = NULL;
-	xlator_list_t   *trav      = NULL;
-	xlator_list_t   *xlparent  = NULL;
-	xlator_list_t   *parent    = NULL;
-	unify_private_t *_private  = NULL; 
-
-	/* Check for number of child nodes, if there is no child nodes, exit */
-	if (!this->children) {
-		gf_log (this->name, GF_LOG_ERROR,
-			"No child nodes specified. check \"subvolumes \" "
-			"option in volfile");
-		return -1;
-	}
-
-  	if (!this->parents) {
-		gf_log (this->name, GF_LOG_WARNING,
-			"dangling volume. check volfile ");
-	}
-  
-	/* Check for 'scheduler' in volume */
-	scheduler = dict_get (this->options, "scheduler");
-	if (!scheduler) {
-		gf_log (this->name, GF_LOG_ERROR, 
-			"\"option scheduler <x>\" is missing in volfile");
-		return -1;
-	}
-
-	/* Setting "option namespace <node>" */
-	data = dict_get (this->options, "namespace");
-	if(!data) {
-		gf_log (this->name, GF_LOG_CRITICAL, 
-			"namespace option not specified, Exiting");
-		return -1;
-	}
-	/* Search namespace in the child node, if found, exit */
-	trav = this->children;
-	while (trav) {
-		if (strcmp (trav->xlator->name, data->data) == 0)
-			break;
-		trav = trav->next;
-	}
-	if (trav) {
-		gf_log (this->name, GF_LOG_CRITICAL, 
-			"namespace node used as a subvolume, Exiting");
-		return -1;
-	}
-	
-	/* Search for the namespace node, if found, continue */
-	ns_xl = this->next;
-	while (ns_xl) {
-		if (strcmp (ns_xl->name, data->data) == 0)
-			break;
-		ns_xl = ns_xl->next;
-	}
-	if (!ns_xl) {
-		gf_log (this->name, GF_LOG_CRITICAL, 
-			"namespace node not found in volfile, Exiting");
-		return -1;
-	}
-	
-	gf_log (this->name, GF_LOG_DEBUG, 
-		"namespace node specified as %s", data->data);
-	
-	_private = CALLOC (1, sizeof (*_private));
-	ERR_ABORT (_private);
-	_private->sched_ops = get_scheduler (this, scheduler->data);
-	if (!_private->sched_ops) {
-		gf_log (this->name, GF_LOG_CRITICAL, 
-			"Error while loading scheduler. Exiting");
-		FREE (_private);
-		return -1;
-	}
-	
-	if (ns_xl->parents) {
-		gf_log (this->name, GF_LOG_CRITICAL,
-			"Namespace node should not be a child of any other node. Exiting");
-		FREE (_private);
-		return -1;
-	}
-
-	_private->namespace = ns_xl;
-	
-	/* update _private structure */
-	{
-		count = 0;
-		trav = this->children;
-		/* Get the number of child count */
-		while (trav) {
-			count++;
-			trav = trav->next;
-		}
-		
-		gf_log (this->name, GF_LOG_DEBUG, 
-			"Child node count is %d", count);    
-
-		_private->child_count = count;
-		if (count == 1) {
-			/* TODO: Should I error out here? */
-			gf_log (this->name, GF_LOG_CRITICAL, 
-				"WARNING: You have defined only one "
-				"\"subvolumes\" for unify volume. It may not "
-				"be the desired config, review your volume "
-				"volfile. If this is how you are testing it,"
-				" you may hit some performance penalty");
-		}
-		
-		_private->xl_array = CALLOC (1, 
-					     sizeof (xlator_t) * (count + 1));
-		ERR_ABORT (_private->xl_array);
-		
-		count = 0;
-		trav = this->children;
-		while (trav) {
-			_private->xl_array[count++] = trav->xlator;
-			trav = trav->next;
-		}
-		_private->xl_array[count] = _private->namespace;
-		
-		/* self-heal part, start with generation '1' */
-		_private->inode_generation = 1; 
-                /* Because, Foreground part is tested well */
-		_private->self_heal = ZR_UNIFY_FG_SELF_HEAL; 
-		data = dict_get (this->options, "self-heal");
-		if (data) {
-			if (strcasecmp (data->data, "off") == 0) 
-				_private->self_heal = ZR_UNIFY_SELF_HEAL_OFF;
-
-			if (strcasecmp (data->data, "foreground") == 0)
-				_private->self_heal = ZR_UNIFY_FG_SELF_HEAL;
-
-			if (strcasecmp (data->data, "background") == 0)
-				_private->self_heal = ZR_UNIFY_BG_SELF_HEAL;
-		}
-    
-		/* optimist - ask bulde for more about it */
-		data = dict_get (this->options, "optimist");
-		if (data) {
-			if (gf_string2boolean (data->data, 
-					       &_private->optimist) == -1) {
-				gf_log (this->name, GF_LOG_ERROR, 
-					"optimist excepts only boolean "
-					"options");
-			}
-		}
-
-		LOCK_INIT (&_private->lock);
-	}
-
-	/* Now that everything is fine. */
-	this->private = (void *)_private;
-	{
-		/* Initialize scheduler, if everything else is successful */
-		ret = _private->sched_ops->init (this); 
-		if (ret == -1) {
-			gf_log (this->name, GF_LOG_CRITICAL,
-				"Initializing scheduler failed, Exiting");
-			FREE (_private);
-			return -1;
-		}
-
-		ret = 0;
-
-		/* This section is required because some fops may look 
-		 * for 'xl->parent' variable 
-		 */
-		xlparent = CALLOC (1, sizeof (*xlparent));
-		xlparent->xlator = this;
-		if (!ns_xl->parents) {
-			ns_xl->parents = xlparent;
-		} else {
-			parent = ns_xl->parents;
-			while (parent->next)
-				parent = parent->next;
-			parent->next = xlparent;
-		}
-		/* Initialize the namespace volume */
-		if (!ns_xl->ready) {
-			ret = xlator_tree_init (ns_xl);
-			if (ret) {
-				gf_log (this->name, GF_LOG_ERROR, 
-					"initializing namespace node failed, "
-					"Exiting");
-			FREE (_private);
-			return -1;
-			}
-		}
-	}
-
-	/* Tell namespace node that init is done */
-	ns_xl->notify (ns_xl, GF_EVENT_PARENT_UP, this);
-
-	return 0;
-}
-
-/** 
- * fini  - Free all the allocated memory 
- */
-void
-fini (xlator_t *this)
-{
-	unify_private_t *priv = this->private;
-	priv->sched_ops->fini (this);
-	this->private = NULL;
-	LOCK_DESTROY (&priv->lock);
-	FREE (priv->xl_array);
-	FREE (priv);
-	return;
-}
-
-
-struct xlator_fops fops = {
-	.stat        = unify_stat,
-	.chmod       = unify_chmod,
-	.readlink    = unify_readlink,
-	.mknod       = unify_mknod,
-	.mkdir       = unify_mkdir,
-	.unlink      = unify_unlink,
-	.rmdir       = unify_rmdir,
-	.symlink     = unify_symlink,
-	.rename      = unify_rename,
-	.link        = unify_link,
-	.chown       = unify_chown,
-	.truncate    = unify_truncate,
-	.create      = unify_create,
-	.open        = unify_open,
-	.readv       = unify_readv,
-	.writev      = unify_writev,
-	.statfs      = unify_statfs,
-	.flush       = unify_flush,
-	.fsync       = unify_fsync,
-	.setxattr    = unify_setxattr,
-	.getxattr    = unify_getxattr,
-	.removexattr = unify_removexattr,
-	.opendir     = unify_opendir,
-	.readdir     = unify_readdir,
-	.fsyncdir    = unify_fsyncdir,
-	.access      = unify_access,
-	.ftruncate   = unify_ftruncate,
-	.fstat       = unify_fstat,
-	.lk          = unify_lk,
-	.fchown      = unify_fchown,
-	.fchmod      = unify_fchmod,
-	.utimens     = unify_utimens,
-	.lookup      = unify_lookup,
-	.getdents    = unify_getdents,
-	.checksum    = unify_checksum,
-	.inodelk     = unify_inodelk,
-	.finodelk    = unify_finodelk,
-	.entrylk     = unify_entrylk,
-	.fentrylk    = unify_fentrylk,
-	.xattrop     = unify_xattrop,
-	.fxattrop    = unify_fxattrop
-};
-
-struct xlator_mops mops = {
-};
-
-struct xlator_cbks cbks = {
-};
-
-struct volume_options options[] = {
-	{ .key   = { "namespace" },  
-	  .type  = GF_OPTION_TYPE_XLATOR 
-	},
-	{ .key   = { "scheduler" },  
-	  .value = { "alu", "rr", "random", "nufa", "switch" },
-	  .type  = GF_OPTION_TYPE_STR
-	},
-	{ .key   = {"self-heal"},  
-	  .value = { "foreground", "background", "off" },
-	  .type  = GF_OPTION_TYPE_STR
-	},
-	/* TODO: remove it some time later */
-	{ .key   = {"optimist"},  
-	  .type  = GF_OPTION_TYPE_BOOL 
-	},
-
-	{ .key   = {NULL} },
-};
diff --git a/xlators/cluster/unify/src/unify.h b/xlators/cluster/unify/src/unify.h
deleted file mode 100644
index da2f1e93b29..00000000000
--- a/xlators/cluster/unify/src/unify.h
+++ /dev/null
@@ -1,132 +0,0 @@
-/*
-  Copyright (c) 2006-2009 Z RESEARCH, Inc. <http://www.zresearch.com>
-  This file is part of GlusterFS.
-
-  GlusterFS is free software; you can redistribute it and/or modify
-  it under the terms of the GNU General Public License as published
-  by the Free Software Foundation; either version 3 of the License,
-  or (at your option) any later version.
-
-  GlusterFS is distributed in the hope that it will be useful, but
-  WITHOUT ANY WARRANTY; without even the implied warranty of
-  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
-  General Public License for more details.
-
-  You should have received a copy of the GNU General Public License
-  along with this program.  If not, see
-  <http://www.gnu.org/licenses/>.
-*/
-
-#ifndef _CONFIG_H
-#define _CONFIG_H
-#include "config.h"
-#endif
-
-#ifndef _UNIFY_H
-#define _UNIFY_H
-
-#include "scheduler.h"
-#include "list.h"
-
-#define MAX_DIR_ENTRY_STRING     (32 * 1024)
-
-#define ZR_UNIFY_SELF_HEAL_OFF 0
-#define ZR_UNIFY_FG_SELF_HEAL  1
-#define ZR_UNIFY_BG_SELF_HEAL  2
-
-/* Sometimes one should use completely random numbers.. its good :p */
-#define UNIFY_SELF_HEAL_GETDENTS_COUNT 1024
-
-#define NS(xl)          (((unify_private_t *)xl->private)->namespace)
-
-/* This is used to allocate memory for local structure */
-#define INIT_LOCAL(fr, loc)                   \
-do {                                          \
-  loc = CALLOC (1, sizeof (unify_local_t));   \
-  ERR_ABORT (loc);			      \
-  if (!loc) {                                 \
-    STACK_UNWIND (fr, -1, ENOMEM);            \
-    return 0;                                 \
-  }                                           \
-  fr->local = loc;                            \
-  loc->op_ret = -1;                           \
-  loc->op_errno = ENOENT;                     \
-} while (0)
-
-
-
-struct unify_private {
-	/* Update this structure depending on requirement */
-	void *scheduler;               /* THIS SHOULD BE THE FIRST VARIABLE, 
-					  if xlator is using scheduler */
-	struct sched_ops *sched_ops;   /* Scheduler options  */
-	xlator_t *namespace;           /* ptr to namespace xlator */
-	xlator_t **xl_array;
-	gf_boolean_t optimist;
-	int16_t child_count;
-	int16_t num_child_up;
-	uint8_t self_heal;
-	uint8_t is_up;
-	uint64_t inode_generation;
-	gf_lock_t lock;
-};
-typedef struct unify_private unify_private_t;
-
-struct unify_self_heal_struct {
-	uint8_t dir_checksum[ZR_FILENAME_MAX];
-	uint8_t ns_dir_checksum[ZR_FILENAME_MAX];
-	uint8_t file_checksum[ZR_FILENAME_MAX];
-	uint8_t ns_file_checksum[ZR_FILENAME_MAX];
-	off_t *offset_list;
-	int   *count_list;
-	dir_entry_t **entry_list;
-};
-
-
-struct _unify_local_t {
-	int32_t call_count;
-	int32_t op_ret;
-	int32_t op_errno;
-	mode_t mode;
-	off_t offset;
-	dev_t dev;
-	uid_t uid;
-	gid_t gid;
-	int32_t flags;
-	int32_t entry_count;
-	int32_t count;    // dir_entry_t count;
-	fd_t *fd;
-	struct stat stbuf;
-	struct statvfs statvfs_buf;
-	struct timespec tv[2];
-	char *name;
-	int32_t revalidate;
-
-	ino_t st_ino;
-	nlink_t st_nlink;
-  
-	dict_t *dict;
-
-	int16_t *list;
-	int16_t *new_list; /* Used only in case of rename */
-	int16_t index;
-
-	int32_t failed;
-	int32_t return_eio;  /* Used in case of different st-mode 
-				present for a given path */
-
-	uint64_t inode_generation; /* used to store the per directory 
-				    * inode_generation. Got from inode's ctx 
-				    * of directory inodes
-				    */
-
-	struct unify_self_heal_struct *sh_struct;
-	loc_t loc1, loc2;
-};
-typedef struct _unify_local_t unify_local_t;
-
-int32_t zr_unify_self_heal (call_frame_t *frame,
-			    xlator_t *this,
-			    unify_local_t *local);
-
-#endif /* _UNIFY_H */
diff --git a/xlators/debug/Makefile.am b/xlators/debug/Makefile.am
index 16cf893a11c..88fac1c6d9e 100644
--- a/xlators/debug/Makefile.am
+++ b/xlators/debug/Makefile.am
@@ -1,3 +1,3 @@
-SUBDIRS = trace error-gen
+SUBDIRS = error-gen io-stats sink trace delay-gen
 
 CLEANFILES = 
diff --git a/xlators/storage/bdb/Makefile.am b/xlators/debug/delay-gen/Makefile.am
index d471a3f9243..a985f42a877 100644
--- a/xlators/storage/bdb/Makefile.am
+++ b/xlators/debug/delay-gen/Makefile.am
@@ -1,3 +1,3 @@
 SUBDIRS = src
 
-CLEANFILES = 
+CLEANFILES =
diff --git a/xlators/debug/delay-gen/src/Makefile.am b/xlators/debug/delay-gen/src/Makefile.am
new file mode 100644
index 00000000000..8f758dec199
--- /dev/null
+++ b/xlators/debug/delay-gen/src/Makefile.am
@@ -0,0 +1,11 @@
+
+xlator_LTLIBRARIES = delay-gen.la
+xlatordir = $(libdir)/glusterfs/$(PACKAGE_VERSION)/xlator/debug
+delay_gen_la_LDFLAGS = -module $(GF_XLATOR_DEFAULT_LDFLAGS)
+delay_gen_la_SOURCES = delay-gen.c
+delay_gen_la_LIBADD = $(top_builddir)/libglusterfs/src/libglusterfs.la
+noinst_HEADERS = delay-gen.h delay-gen-mem-types.h delay-gen-messages.h
+AM_CPPFLAGS = $(GF_CPPFLAGS) -I$(top_srcdir)/libglusterfs/src \
+	   -I$(top_srcdir)/rpc/xdr/src -I$(top_builddir)/rpc/xdr/src
+AM_CFLAGS = -Wall -fno-strict-aliasing $(GF_CFLAGS)
+CLEANFILES =
diff --git a/xlators/debug/delay-gen/src/delay-gen-mem-types.h b/xlators/debug/delay-gen/src/delay-gen-mem-types.h
new file mode 100644
index 00000000000..c89a9217193
--- /dev/null
+++ b/xlators/debug/delay-gen/src/delay-gen-mem-types.h
@@ -0,0 +1,21 @@
+/*
+ *   Copyright (c) 2017 Red Hat, Inc. <http://www.redhat.com>
+ *   This file is part of GlusterFS.
+ *
+ *   This file is licensed to you under your choice of the GNU Lesser
+ *   General Public License, version 3 or any later version (LGPLv3 or
+ *   later), or the GNU General Public License, version 2 (GPLv2), in all
+ *   cases as published by the Free Software Foundation.
+ */
+
+#ifndef __DELAY_GEN_MEM_TYPES_H__
+#define __DELAY_GEN_MEM_TYPES_H__
+
+#include <glusterfs/mem-types.h>
+
+enum gf_delay_gen_mem_types_ {
+    gf_delay_gen_mt_dg_t = gf_common_mt_end + 1,
+    gf_delay_gen_mt_end
+};
+
+#endif /* __DELAY_GEN_MEM_TYPES_H__ */
diff --git a/xlators/debug/delay-gen/src/delay-gen-messages.h b/xlators/debug/delay-gen/src/delay-gen-messages.h
new file mode 100644
index 00000000000..bc98cec2885
--- /dev/null
+++ b/xlators/debug/delay-gen/src/delay-gen-messages.h
@@ -0,0 +1,26 @@
+/*
+ *   Copyright (c) 2017 Red Hat, Inc. <http://www.redhat.com>
+ *   This file is part of GlusterFS.
+ *
+ *   This file is licensed to you under your choice of the GNU Lesser
+ *   General Public License, version 3 or any later version (LGPLv3 or
+ *   later), or the GNU General Public License, version 2 (GPLv2), in all
+ *   cases as published by the Free Software Foundation.
+ */
+
+#ifndef __DELAY_GEN_MESSAGES_H__
+#define __DELAY_GEN_MESSAGES_H__
+
+#include <glusterfs/glfs-message-id.h>
+
+/* To add new message IDs, append new identifiers at the end of the list.
+ *
+ * Never remove a message ID. If it's not used anymore, you can rename it or
+ * leave it as it is, but not delete it. This is to prevent reutilization of
+ * IDs by other messages.
+ *
+ * The component name must match one of the entries defined in
+ * glfs-message-id.h.
+ */
+
+#endif /* __DELAY_GEN_MESSAGES_H__ */
diff --git a/xlators/debug/delay-gen/src/delay-gen.c b/xlators/debug/delay-gen/src/delay-gen.c
new file mode 100644
index 00000000000..4698f1fd785
--- /dev/null
+++ b/xlators/debug/delay-gen/src/delay-gen.c
@@ -0,0 +1,697 @@
+/*
+ *  Copyright (c) 2017 Red Hat, Inc. <http://www.redhat.com>
+ *  This file is part of GlusterFS.
+ *
+ *  This file is licensed to you under your choice of the GNU Lesser
+ *  General Public License, version 3 or any later version (LGPLv3 or
+ *  later), or the GNU General Public License, version 2 (GPLv2), in all
+ *  cases as published by the Free Software Foundation.
+ */
+
+#include "delay-gen.h"
+
+#define DELAY_GRANULARITY (1 << 20) /* denominator for dg_t.delay_ppm */
+/* Call delay_gen() for 'fop', then forward the call to default_<name>(). */
+#define DG_FOP(fop, name, frame, this, args...)                                \
+    do {                                                                       \
+        delay_gen(this, fop);                                                  \
+        default_##name(frame, this, args);                                     \
+    } while (0)
+
+int
+delay_gen(xlator_t *this, int fop)
+{
+    dg_t *dg = this->private;
+    /* Skip unless this fop is enabled with a non-zero rate and delay. */
+    if (!dg->enable[fop] || !dg->delay_ppm || dg->delay_duration <= 0)
+        return 0;
+    /* Widen first: int * GF_US_IN_NS overflows for long delay-duration. */
+    if ((rand() % DELAY_GRANULARITY) < dg->delay_ppm)
+        gf_nanosleep((uint64_t)dg->delay_duration * GF_US_IN_NS);
+
+    return 0;
+}
+
+int32_t
+dg_rename(call_frame_t *frame, xlator_t *this, loc_t *oldloc, loc_t *newloc,
+          dict_t *xdata)
+{
+    DG_FOP(GF_FOP_RENAME, rename, frame, this, oldloc, newloc, xdata);
+    return 0;
+}
+
+int32_t
+dg_ipc(call_frame_t *frame, xlator_t *this, int32_t op, dict_t *xdata)
+{
+    DG_FOP(GF_FOP_IPC, ipc, frame, this, op, xdata);
+    return 0;
+}
+
+int32_t
+dg_setactivelk(call_frame_t *frame, xlator_t *this, loc_t *loc,
+               lock_migration_info_t *locklist, dict_t *xdata)
+{
+    DG_FOP(GF_FOP_SETACTIVELK, setactivelk, frame, this, loc, locklist, xdata);
+    return 0;
+}
+
+int32_t
+dg_flush(call_frame_t *frame, xlator_t *this, fd_t *fd, dict_t *xdata)
+{
+    DG_FOP(GF_FOP_FLUSH, flush, frame, this, fd, xdata);
+    return 0;
+}
+
+int32_t
+dg_readdir(call_frame_t *frame, xlator_t *this, fd_t *fd, size_t size,
+           off_t off, dict_t *xdata)
+{
+    DG_FOP(GF_FOP_READDIR, readdir, frame, this, fd, size, off, xdata);
+    return 0;
+}
+
+int32_t
+dg_setxattr(call_frame_t *frame, xlator_t *this, loc_t *loc, dict_t *dict,
+            int32_t flags, dict_t *xdata)
+{
+    DG_FOP(GF_FOP_SETXATTR, setxattr, frame, this, loc, dict, flags, xdata);
+    return 0;
+}
+
+int32_t
+dg_mknod(call_frame_t *frame, xlator_t *this, loc_t *loc, mode_t mode,
+         dev_t rdev, mode_t umask, dict_t *xdata)
+{
+    DG_FOP(GF_FOP_MKNOD, mknod, frame, this, loc, mode, rdev, umask, xdata);
+    return 0;
+}
+
+int32_t
+dg_fsetxattr(call_frame_t *frame, xlator_t *this, fd_t *fd, dict_t *dict,
+             int32_t flags, dict_t *xdata)
+{
+    DG_FOP(GF_FOP_FSETXATTR, fsetxattr, frame, this, fd, dict, flags, xdata);
+    return 0;
+}
+
+int32_t
+dg_readv(call_frame_t *frame, xlator_t *this, fd_t *fd, size_t size,
+         off_t offset, uint32_t flags, dict_t *xdata)
+{
+    DG_FOP(GF_FOP_READ, readv, frame, this, fd, size, offset, flags, xdata);
+    return 0;
+}
+
+int32_t
+dg_inodelk(call_frame_t *frame, xlator_t *this, const char *volume, loc_t *loc,
+           int32_t cmd, struct gf_flock *lock, dict_t *xdata)
+{
+    DG_FOP(GF_FOP_INODELK, inodelk, frame, this, volume, loc, cmd, lock, xdata);
+    return 0;
+}
+
+int32_t
+dg_fremovexattr(call_frame_t *frame, xlator_t *this, fd_t *fd, const char *name,
+                dict_t *xdata)
+{
+    DG_FOP(GF_FOP_FREMOVEXATTR, fremovexattr, frame, this, fd, name, xdata);
+    return 0;
+}
+
+int32_t
+dg_open(call_frame_t *frame, xlator_t *this, loc_t *loc, int32_t flags,
+        fd_t *fd, dict_t *xdata)
+{
+    DG_FOP(GF_FOP_OPEN, open, frame, this, loc, flags, fd, xdata);
+    return 0;
+}
+
+int32_t
+dg_xattrop(call_frame_t *frame, xlator_t *this, loc_t *loc,
+           gf_xattrop_flags_t flags, dict_t *dict, dict_t *xdata)
+{
+    DG_FOP(GF_FOP_XATTROP, xattrop, frame, this, loc, flags, dict, xdata);
+    return 0;
+}
+
+int32_t
+dg_entrylk(call_frame_t *frame, xlator_t *this, const char *volume, loc_t *loc,
+           const char *basename, entrylk_cmd cmd, entrylk_type type,
+           dict_t *xdata)
+{
+    DG_FOP(GF_FOP_ENTRYLK, entrylk, frame, this, volume, loc, basename, cmd,
+           type, xdata);
+    return 0;
+}
+
+int32_t
+dg_getactivelk(call_frame_t *frame, xlator_t *this, loc_t *loc, dict_t *xdata)
+{
+    DG_FOP(GF_FOP_GETACTIVELK, getactivelk, frame, this, loc, xdata);
+    return 0;
+}
+
+int32_t
+dg_finodelk(call_frame_t *frame, xlator_t *this, const char *volume, fd_t *fd,
+            int32_t cmd, struct gf_flock *lock, dict_t *xdata)
+{
+    DG_FOP(GF_FOP_FINODELK, finodelk, frame, this, volume, fd, cmd, lock,
+           xdata);
+    return 0;
+}
+
+int32_t
+dg_create(call_frame_t *frame, xlator_t *this, loc_t *loc, int32_t flags,
+          mode_t mode, mode_t umask, fd_t *fd, dict_t *xdata)
+{
+    DG_FOP(GF_FOP_CREATE, create, frame, this, loc, flags, mode, umask, fd,
+           xdata);
+    return 0;
+}
+
+int32_t
+dg_discard(call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset,
+           size_t len, dict_t *xdata)
+{
+    DG_FOP(GF_FOP_DISCARD, discard, frame, this, fd, offset, len, xdata);
+    return 0;
+}
+
+int32_t
+dg_mkdir(call_frame_t *frame, xlator_t *this, loc_t *loc, mode_t mode,
+         mode_t umask, dict_t *xdata)
+{
+    DG_FOP(GF_FOP_MKDIR, mkdir, frame, this, loc, mode, umask, xdata);
+    return 0;
+}
+
+int32_t
+dg_lk(call_frame_t *frame, xlator_t *this, fd_t *fd, int32_t cmd,
+      struct gf_flock *lock, dict_t *xdata)
+{
+    DG_FOP(GF_FOP_LK, lk, frame, this, fd, cmd, lock, xdata);
+    return 0;
+}
+
+int32_t
+dg_writev(call_frame_t *frame, xlator_t *this, fd_t *fd, struct iovec *vector,
+          int32_t count, off_t off, uint32_t flags, struct iobref *iobref,
+          dict_t *xdata)
+{
+    DG_FOP(GF_FOP_WRITE, writev, frame, this, fd, vector, count, off, flags,
+           iobref, xdata);
+    return 0;
+}
+
+int32_t
+dg_access(call_frame_t *frame, xlator_t *this, loc_t *loc, int32_t mask,
+          dict_t *xdata)
+{
+    DG_FOP(GF_FOP_ACCESS, access, frame, this, loc, mask, xdata);
+    return 0;
+}
+
+int32_t
+dg_lookup(call_frame_t *frame, xlator_t *this, loc_t *loc, dict_t *xdata)
+{
+    DG_FOP(GF_FOP_LOOKUP, lookup, frame, this, loc, xdata);
+    return 0;
+}
+
+int32_t
+dg_rmdir(call_frame_t *frame, xlator_t *this, loc_t *loc, int32_t flags,
+         dict_t *xdata)
+{
+    DG_FOP(GF_FOP_RMDIR, rmdir, frame, this, loc, flags, xdata);
+    return 0;
+}
+
+int32_t
+dg_fallocate(call_frame_t *frame, xlator_t *this, fd_t *fd, int32_t keep_size,
+             off_t offset, size_t len, dict_t *xdata)
+{
+    DG_FOP(GF_FOP_FALLOCATE, fallocate, frame, this, fd, keep_size, offset, len,
+           xdata);
+    return 0;
+}
+
+int32_t
+dg_fstat(call_frame_t *frame, xlator_t *this, fd_t *fd, dict_t *xdata)
+{
+    DG_FOP(GF_FOP_FSTAT, fstat, frame, this, fd, xdata);
+    return 0;
+}
+
+int32_t
+dg_lease(call_frame_t *frame, xlator_t *this, loc_t *loc,
+         struct gf_lease *lease, dict_t *xdata)
+{
+    DG_FOP(GF_FOP_LEASE, lease, frame, this, loc, lease, xdata);
+    return 0;
+}
+
+int32_t
+dg_stat(call_frame_t *frame, xlator_t *this, loc_t *loc, dict_t *xdata)
+{
+    DG_FOP(GF_FOP_STAT, stat, frame, this, loc, xdata);
+    return 0;
+}
+
+int32_t
+dg_truncate(call_frame_t *frame, xlator_t *this, loc_t *loc, off_t offset,
+            dict_t *xdata)
+{
+    DG_FOP(GF_FOP_TRUNCATE, truncate, frame, this, loc, offset, xdata);
+    return 0;
+}
+
+int32_t
+dg_getxattr(call_frame_t *frame, xlator_t *this, loc_t *loc, const char *name,
+            dict_t *xdata)
+{
+    DG_FOP(GF_FOP_GETXATTR, getxattr, frame, this, loc, name, xdata);
+    return 0;
+}
+
+int32_t
+dg_symlink(call_frame_t *frame, xlator_t *this, const char *linkpath,
+           loc_t *loc, mode_t umask, dict_t *xdata)
+{
+    DG_FOP(GF_FOP_SYMLINK, symlink, frame, this, linkpath, loc, umask, xdata);
+    return 0;
+}
+
+int32_t
+dg_zerofill(call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset,
+            off_t len, dict_t *xdata)
+{
+    DG_FOP(GF_FOP_ZEROFILL, zerofill, frame, this, fd, offset, len, xdata);
+    return 0;
+}
+
+int32_t
+dg_fsyncdir(call_frame_t *frame, xlator_t *this, fd_t *fd, int32_t flags,
+            dict_t *xdata)
+{
+    DG_FOP(GF_FOP_FSYNCDIR, fsyncdir, frame, this, fd, flags, xdata);
+    return 0;
+}
+
+int32_t
+dg_fgetxattr(call_frame_t *frame, xlator_t *this, fd_t *fd, const char *name,
+             dict_t *xdata)
+{
+    DG_FOP(GF_FOP_FGETXATTR, fgetxattr, frame, this, fd, name, xdata);
+    return 0;
+}
+
+int32_t
+dg_readdirp(call_frame_t *frame, xlator_t *this, fd_t *fd, size_t size,
+            off_t off, dict_t *xdata)
+{
+    DG_FOP(GF_FOP_READDIRP, readdirp, frame, this, fd, size, off, xdata);
+    return 0;
+}
+
+int32_t
+dg_link(call_frame_t *frame, xlator_t *this, loc_t *oldloc, loc_t *newloc,
+        dict_t *xdata)
+{
+    DG_FOP(GF_FOP_LINK, link, frame, this, oldloc, newloc, xdata);
+    return 0;
+}
+
+int32_t
+dg_fxattrop(call_frame_t *frame, xlator_t *this, fd_t *fd,
+            gf_xattrop_flags_t flags, dict_t *dict, dict_t *xdata)
+{
+    DG_FOP(GF_FOP_FXATTROP, fxattrop, frame, this, fd, flags, dict, xdata);
+    return 0;
+}
+
+int32_t
+dg_ftruncate(call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset,
+             dict_t *xdata)
+{
+    DG_FOP(GF_FOP_FTRUNCATE, ftruncate, frame, this, fd, offset, xdata);
+    return 0;
+}
+
+int32_t
+dg_rchecksum(call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset,
+             int32_t len, dict_t *xdata)
+{
+    DG_FOP(GF_FOP_RCHECKSUM, rchecksum, frame, this, fd, offset, len, xdata);
+    return 0;
+}
+
+int32_t
+dg_unlink(call_frame_t *frame, xlator_t *this, loc_t *loc, int32_t flags,
+          dict_t *xdata)
+{
+    DG_FOP(GF_FOP_UNLINK, unlink, frame, this, loc, flags, xdata);
+    return 0;
+}
+
+int32_t
+dg_fentrylk(call_frame_t *frame, xlator_t *this, const char *volume, fd_t *fd,
+            const char *basename, entrylk_cmd cmd, entrylk_type type,
+            dict_t *xdata)
+{
+    DG_FOP(GF_FOP_FENTRYLK, fentrylk, frame, this, volume, fd, basename, cmd,
+           type, xdata);
+    return 0;
+}
+
+int32_t
+dg_getspec(call_frame_t *frame, xlator_t *this, const char *key, int32_t flags)
+{
+    DG_FOP(GF_FOP_GETSPEC, getspec, frame, this, key, flags);
+    return 0;
+}
+
+int32_t
+dg_setattr(call_frame_t *frame, xlator_t *this, loc_t *loc, struct iatt *stbuf,
+           int32_t valid, dict_t *xdata)
+{
+    DG_FOP(GF_FOP_SETATTR, setattr, frame, this, loc, stbuf, valid, xdata);
+    return 0;
+}
+
+int32_t
+dg_fsync(call_frame_t *frame, xlator_t *this, fd_t *fd, int32_t flags,
+         dict_t *xdata)
+{
+    DG_FOP(GF_FOP_FSYNC, fsync, frame, this, fd, flags, xdata);
+    return 0;
+}
+
+int32_t
+dg_statfs(call_frame_t *frame, xlator_t *this, loc_t *loc, dict_t *xdata)
+{
+    DG_FOP(GF_FOP_STATFS, statfs, frame, this, loc, xdata);
+    return 0;
+}
+
+int32_t
+dg_seek(call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset,
+        gf_seek_what_t what, dict_t *xdata)
+{
+    DG_FOP(GF_FOP_SEEK, seek, frame, this, fd, offset, what, xdata);
+    return 0;
+}
+
+int32_t
+dg_fsetattr(call_frame_t *frame, xlator_t *this, fd_t *fd, struct iatt *stbuf,
+            int32_t valid, dict_t *xdata)
+{
+    DG_FOP(GF_FOP_FSETATTR, fsetattr, frame, this, fd, stbuf, valid, xdata);
+    return 0;
+}
+
+int32_t
+dg_opendir(call_frame_t *frame, xlator_t *this, loc_t *loc, fd_t *fd,
+           dict_t *xdata)
+{
+    DG_FOP(GF_FOP_OPENDIR, opendir, frame, this, loc, fd, xdata);
+    return 0;
+}
+
+int32_t
+dg_readlink(call_frame_t *frame, xlator_t *this, loc_t *loc, size_t size,
+            dict_t *xdata)
+{
+    DG_FOP(GF_FOP_READLINK, readlink, frame, this, loc, size, xdata);
+    return 0;
+}
+
+int32_t
+dg_removexattr(call_frame_t *frame, xlator_t *this, loc_t *loc,
+               const char *name, dict_t *xdata)
+{
+    DG_FOP(GF_FOP_REMOVEXATTR, removexattr, frame, this, loc, name, xdata);
+    return 0;
+}
+
+int32_t
+dg_forget(xlator_t *this, inode_t *inode)
+{
+    return 0;
+}
+
+int32_t
+dg_release(xlator_t *this, fd_t *fd)
+{
+    return 0;
+}
+
+int32_t
+dg_releasedir(xlator_t *this, fd_t *fd)
+{
+    return 0;
+}
+
+static int
+delay_gen_parse_fill_fops(dg_t *dg, char *enable_fops)
+{
+    char *op_no_str = NULL;
+    int op_no = -1;
+    int i = 0;
+    int ret = 0;
+    xlator_t *this = THIS;
+    char *saveptr = NULL;
+    char *dup_enable_fops = NULL;
+    /* NULL or empty list enables every fop; else only the fops named. */
+    if (!enable_fops || strlen(enable_fops) == 0) {
+        for (i = GF_FOP_NULL + 1; i < GF_FOP_MAXVALUE; i++)
+            dg->enable[i] = 1;
+    } else {
+        dup_enable_fops = gf_strdup(enable_fops);
+        if (!dup_enable_fops) {
+            ret = -1;
+            goto out;
+        }
+        op_no_str = strtok_r(dup_enable_fops, ", ", &saveptr);
+        while (op_no_str) {
+            op_no = gf_fop_int(op_no_str);
+            if (op_no == -1) {
+                gf_log(this->name, GF_LOG_WARNING, "Wrong option value %s",
+                       op_no_str);
+                ret = -1;
+                goto out;
+            } else {
+                dg->enable[op_no] = 1;
+            }
+            /* ' ' is a delimiter too, so "open, read" parses like "open,read" */
+            op_no_str = strtok_r(NULL, ", ", &saveptr);
+        }
+    }
+out:
+    GF_FREE(dup_enable_fops);
+    return ret;
+}
+
+void
+delay_gen_set_delay_ppm(dg_t *dg, double percent)
+{
+    /* Map a percentage onto the 0..DELAY_GRANULARITY scale that delay_gen()
+     * compares against; storing into the int field drops the fraction. */
+    double scaled = (double)DELAY_GRANULARITY * (percent / 100.0);
+    dg->delay_ppm = scaled;
+}
+
+int32_t
+init(xlator_t *this)
+{
+    dg_t *dg = NULL;
+    int32_t ret = 0;
+    double delay_percent = 0;
+    char *delay_enable_fops = NULL;
+    /* Set up private state; exactly one child subvolume is required. */
+    if (!this->children || this->children->next) {
+        gf_log(this->name, GF_LOG_ERROR,
+               "delay-gen not configured with one subvolume");
+        ret = -1;
+        goto out;
+    }
+    /* A missing parent is only reported; initialization continues. */
+    if (!this->parents) {
+        gf_log(this->name, GF_LOG_WARNING, "dangling volume. check volfile ");
+    }
+
+    dg = GF_CALLOC(1, sizeof(*dg), gf_delay_gen_mt_dg_t);
+
+    if (!dg) {
+        ret = -1;
+        goto out;
+    }
+    /* Preset failure: GF_OPTION_INIT presumably jumps to 'out' on error. */
+    ret = -1;
+
+    GF_OPTION_INIT("delay-percentage", delay_percent, percent, out);
+    GF_OPTION_INIT("enable", delay_enable_fops, str, out);
+    GF_OPTION_INIT("delay-duration", dg->delay_duration, int32, out);
+
+    delay_gen_set_delay_ppm(dg, delay_percent);
+
+    ret = delay_gen_parse_fill_fops(dg, delay_enable_fops);
+    if (ret)
+        goto out;
+    /* Publish the state only once every option has been applied. */
+    this->private = dg;
+
+    ret = 0;
+out:
+    if (ret)
+        GF_FREE(dg);
+    return ret;
+}
+
+void
+fini(xlator_t *this)
+{
+    GF_FREE(this->private);
+}
+
+int32_t
+mem_acct_init(xlator_t *this)
+{
+    int ret = -1;
+
+    if (!this)
+        return ret;
+
+    ret = xlator_mem_acct_init(this, gf_delay_gen_mt_end + 1);
+
+    if (ret != 0) {
+        gf_log(this->name, GF_LOG_ERROR,
+               "Memory accounting init"
+               " failed");
+        return ret;
+    }
+
+    return ret;
+}
+
+int32_t
+reconfigure(xlator_t *this, dict_t *dict)
+{
+    /* Intentionally a no-op: 'dict' is ignored, so option changes made
+     * while the translator is running are not applied by this function.
+     */
+    return 0;
+}
+
+int
+notify(xlator_t *this, int event, void *data, ...)
+{
+    return default_notify(this, event, data);
+}
+
+struct xlator_fops fops = {
+    .rename = dg_rename,
+    .ipc = dg_ipc,
+    .setactivelk = dg_setactivelk,
+    .flush = dg_flush,
+    .readdir = dg_readdir,
+    .setxattr = dg_setxattr,
+    .mknod = dg_mknod,
+    .fsetxattr = dg_fsetxattr,
+    .readv = dg_readv,
+    .inodelk = dg_inodelk,
+    .fremovexattr = dg_fremovexattr,
+    .open = dg_open,
+    .xattrop = dg_xattrop,
+    .entrylk = dg_entrylk,
+    .getactivelk = dg_getactivelk,
+    .finodelk = dg_finodelk,
+    .create = dg_create,
+    .discard = dg_discard,
+    .mkdir = dg_mkdir,
+    .lk = dg_lk,
+    .writev = dg_writev,
+    .access = dg_access,
+    .lookup = dg_lookup,
+    .rmdir = dg_rmdir,
+    .fallocate = dg_fallocate,
+    .fstat = dg_fstat,
+    .lease = dg_lease,
+    .stat = dg_stat,
+    .truncate = dg_truncate,
+    .getxattr = dg_getxattr,
+    .symlink = dg_symlink,
+    .zerofill = dg_zerofill,
+    .fsyncdir = dg_fsyncdir,
+    .fgetxattr = dg_fgetxattr,
+    .readdirp = dg_readdirp,
+    .link = dg_link,
+    .fxattrop = dg_fxattrop,
+    .ftruncate = dg_ftruncate,
+    .rchecksum = dg_rchecksum,
+    .unlink = dg_unlink,
+    .fentrylk = dg_fentrylk,
+    .getspec = dg_getspec,
+    .setattr = dg_setattr,
+    .fsync = dg_fsync,
+    .statfs = dg_statfs,
+    .seek = dg_seek,
+    .fsetattr = dg_fsetattr,
+    .opendir = dg_opendir,
+    .readlink = dg_readlink,
+    .removexattr = dg_removexattr,
+};
+
+struct xlator_cbks cbks = {
+    .forget = dg_forget,
+    .release = dg_release,
+    .releasedir = dg_releasedir,
+};
+
+struct volume_options options[] = {
+    {
+        .key = {"delay-percentage"},
+        .type = GF_OPTION_TYPE_PERCENT,
+        .default_value = "10%",
+        .description = "Percentage delay of operations when enabled.",
+        .op_version = {GD_OP_VERSION_3_13_0},
+        .flags = OPT_FLAG_SETTABLE,
+        .tags = {"delay-gen"},
+    },
+
+    {
+        .key = {"delay-duration"},
+        .type = GF_OPTION_TYPE_INT,
+        .description = "Delay duration in micro seconds",
+        .default_value = "100000",
+        .op_version = {GD_OP_VERSION_3_13_0},
+        .flags = OPT_FLAG_SETTABLE,
+        .tags = {"delay-gen"},
+    },
+
+    {
+        .key = {"enable"},
+        .type = GF_OPTION_TYPE_STR,
+        .description = "Accepts a string which takes ',' separated fop "
+                       "strings to denote which fops are enabled for delay",
+        .op_version = {GD_OP_VERSION_3_13_0},
+        .flags = OPT_FLAG_SETTABLE,
+        .tags = {"delay-gen"},
+        .default_value = "",
+    },
+
+    {.key = {NULL}},
+};
+
+xlator_api_t xlator_api = {
+    .init = init,
+    .fini = fini,
+    .notify = notify,
+    .reconfigure = reconfigure,
+    .mem_acct_init = mem_acct_init,
+    .op_version = {GD_OP_VERSION_3_12_0},
+    .fops = &fops,
+    .cbks = &cbks,
+    .options = options,
+    .identifier = "delay-gen",
+    .category = GF_TECH_PREVIEW,
+};
diff --git a/xlators/debug/delay-gen/src/delay-gen.h b/xlators/debug/delay-gen/src/delay-gen.h
new file mode 100644
index 00000000000..afa95e5eb2d
--- /dev/null
+++ b/xlators/debug/delay-gen/src/delay-gen.h
@@ -0,0 +1,27 @@
+/*
+ *   Copyright (c) 2017 Red Hat, Inc. <http://www.redhat.com>
+ *   This file is part of GlusterFS.
+ *
+ *   This file is licensed to you under your choice of the GNU Lesser
+ *   General Public License, version 3 or any later version (LGPLv3 or
+ *   later), or the GNU General Public License, version 2 (GPLv2), in all
+ *   cases as published by the Free Software Foundation.
+ */
+
+#ifndef __DELAY_GEN_H__
+#define __DELAY_GEN_H__
+
+#include "delay-gen-mem-types.h"
+#include "delay-gen-messages.h"
+#include <glusterfs/glusterfs.h>
+#include <glusterfs/xlator.h>
+#include <glusterfs/defaults.h>
+
+typedef struct {
+    int enable[GF_FOP_MAXVALUE]; /* per-fop flag: non-zero => may be delayed */
+    int op_count;                /* not referenced by delay-gen.c */
+    int delay_ppm;               /* chance of delay, out of 2^20 (not 10^6) */
+    int delay_duration;          /* sleep length, in microseconds */
+} dg_t;
+
+#endif /* __DELAY_GEN_H__ */
diff --git a/xlators/debug/error-gen/src/Makefile.am b/xlators/debug/error-gen/src/Makefile.am
index 1bd7f332ca0..038d2e8e66d 100644
--- a/xlators/debug/error-gen/src/Makefile.am
+++ b/xlators/debug/error-gen/src/Makefile.am
@@ -2,13 +2,17 @@
 xlator_LTLIBRARIES = error-gen.la
 xlatordir = $(libdir)/glusterfs/$(PACKAGE_VERSION)/xlator/debug
 
-error_gen_la_LDFLAGS = -module -avoidversion
+error_gen_la_LDFLAGS = -module $(GF_XLATOR_DEFAULT_LDFLAGS)
 
 error_gen_la_SOURCES = error-gen.c
 error_gen_la_LIBADD = $(top_builddir)/libglusterfs/src/libglusterfs.la
 
-AM_CFLAGS = -fPIC -D_FILE_OFFSET_BITS=64 -D_GNU_SOURCE -Wall -D$(GF_HOST_OS)\
-	-I$(top_srcdir)/libglusterfs/src -shared -nostartfiles $(GF_CFLAGS)
+noinst_HEADERS = error-gen.h error-gen-mem-types.h
+
+AM_CPPFLAGS = $(GF_CPPFLAGS) -I$(top_srcdir)/libglusterfs/src \
+	-I$(top_srcdir)/rpc/xdr/src -I$(top_builddir)/rpc/xdr/src
+
+AM_CFLAGS = -Wall $(GF_CFLAGS)
 
 CLEANFILES = 
 
diff --git a/xlators/debug/error-gen/src/error-gen-mem-types.h b/xlators/debug/error-gen/src/error-gen-mem-types.h
new file mode 100644
index 00000000000..b9b713af8fc
--- /dev/null
+++ b/xlators/debug/error-gen/src/error-gen-mem-types.h
@@ -0,0 +1,20 @@
+/*
+   Copyright (c) 2008-2012 Red Hat, Inc. <http://www.redhat.com>
+   This file is part of GlusterFS.
+
+   This file is licensed to you under your choice of the GNU Lesser
+   General Public License, version 3 or any later version (LGPLv3 or
+   later), or the GNU General Public License, version 2 (GPLv2), in all
+   cases as published by the Free Software Foundation.
+*/
+
+#ifndef __ERROR_GEN_MEM_TYPES_H__
+#define __ERROR_GEN_MEM_TYPES_H__
+
+#include <glusterfs/mem-types.h>
+
+enum gf_error_gen_mem_types_ {
+    gf_error_gen_mt_eg_t = gf_common_mt_end + 1,
+    gf_error_gen_mt_end
+};
+#endif
diff --git a/xlators/debug/error-gen/src/error-gen.c b/xlators/debug/error-gen/src/error-gen.c
index 2be30cb6c98..d45655ef4c3 100644
--- a/xlators/debug/error-gen/src/error-gen.c
+++ b/xlators/debug/error-gen/src/error-gen.c
@@ -1,1780 +1,1663 @@
 /*
-  Copyright (c) 2008-2009 Z RESEARCH, Inc. <http://www.zresearch.com>
-  This file is part of GlusterFS.
-
-  GlusterFS is free software; you can redistribute it and/or modify
-  it under the terms of the GNU General Public License as published
-  by the Free Software Foundation; either version 3 of the License,
-  or (at your option) any later version.
-
-  GlusterFS is distributed in the hope that it will be useful, but
-  WITHOUT ANY WARRANTY; without even the implied warranty of
-  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
-  General Public License for more details.
-
-  You should have received a copy of the GNU General Public License
-  along with this program.  If not, see
-  <http://www.gnu.org/licenses/>.
+   Copyright (c) 2008-2012 Red Hat, Inc. <http://www.redhat.com>
+   This file is part of GlusterFS.
+
+   This file is licensed to you under your choice of the GNU Lesser
+   General Public License, version 3 or any later version (LGPLv3 or
+   later), or the GNU General Public License, version 2 (GPLv2), in all
+   cases as published by the Free Software Foundation.
 */
+#include <glusterfs/xlator.h>
+#include "error-gen.h"
+#include <glusterfs/statedump.h>
+#include <glusterfs/defaults.h>
 
-#ifndef _CONFIG_H
-#define _CONFIG_H
-#include "config.h"
-#endif
+/*
+ * The user can specify an error probability as a float percentage, but we
+ * store it internally as a numerator with this as the denominator.  When it's
+ * used, it's like this:
+ *
+ *    (rand() % FAILURE_GRANULARITY) < error_rate
+ *
+ * To minimize rounding errors from the modulo operation, it's good for this to
+ * be a power of two.
+ *
+ * (BTW this is just the normal case.  If "random-failure" is set, that does
+ * something completely different and this number is irrelevant.  See error_gen
+ * for the legacy code.)
+ */
+#define FAILURE_GRANULARITY (1 << 20)
+
+sys_error_t error_no_list[] = { /* errnos error_gen() may inject, by GF_FOP_* */
+    [GF_FOP_LOOKUP] = {.error_no_count = 4, /* picks are drawn below this count */
+                       .error_no = {ENOENT, ENOTDIR, ENAMETOOLONG, EAGAIN}},
+    [GF_FOP_STAT] = {.error_no_count = 6,
+                     .error_no = {EACCES, EFAULT, ENAMETOOLONG, ENOENT, ENOMEM,
+                                  ENOTDIR}},
+    [GF_FOP_READLINK] = {.error_no_count = 8,
+                         .error_no = {EACCES, EFAULT, EINVAL, EIO, ENAMETOOLONG,
+                                      ENOENT, ENOMEM, ENOTDIR}},
+    [GF_FOP_MKNOD] = {.error_no_count = 11,
+                      .error_no = {EACCES, EEXIST, EFAULT, EINVAL, ENAMETOOLONG,
+                                   ENOENT, ENOMEM, ENOSPC, ENOTDIR, EPERM,
+                                   EROFS}},
+    [GF_FOP_MKDIR] = {.error_no_count = 10,
+                      .error_no = {EACCES, EEXIST, EFAULT, ENAMETOOLONG, ENOENT,
+                                   ENOMEM, ENOSPC, ENOTDIR, EPERM, EROFS}},
+    [GF_FOP_UNLINK] = {.error_no_count = 11, /* all 11 entries, EROFS too */
+                       .error_no = {EACCES, EBUSY, EFAULT, EIO, EISDIR,
+                                    ENAMETOOLONG, ENOENT, ENOMEM, ENOTDIR,
+                                    EPERM, EROFS}},
+    [GF_FOP_RMDIR] = {.error_no_count = 8,
+                      .error_no = {EACCES, EBUSY, EFAULT, ENOMEM, ENOTDIR,
+                                   ENOTEMPTY, EPERM, EROFS}},
+    [GF_FOP_SYMLINK] = {.error_no_count = 11,
+                        .error_no = {EACCES, EEXIST, EFAULT, EIO, ENAMETOOLONG,
+                                     ENOENT, ENOMEM, ENOSPC, ENOTDIR, EPERM,
+                                     EROFS}},
+    [GF_FOP_RENAME] = {.error_no_count = 13,
+                       .error_no = {EACCES, EBUSY, EFAULT, EINVAL, EISDIR,
+                                    EMLINK, ENAMETOOLONG, ENOENT, ENOMEM,
+                                    ENOSPC, ENOTDIR, EEXIST, EXDEV}},
+    [GF_FOP_LINK] = {.error_no_count = 13,
+                     .error_no = {EACCES, EFAULT, EEXIST, EIO, EMLINK,
+                                  ENAMETOOLONG, ENOENT, ENOMEM, ENOSPC, ENOTDIR,
+                                  EPERM, EROFS, EXDEV}},
+    [GF_FOP_TRUNCATE] = {.error_no_count = 10, /* NOTE(review): EISDIR twice */
+                         .error_no = {EACCES, EFAULT, EFBIG, EINTR, EINVAL, EIO,
+                                      EISDIR, ENAMETOOLONG, ENOENT, EISDIR}},
+    [GF_FOP_CREATE] = {.error_no_count = 10, /* NOTE(review): ENODEV twice */
+                       .error_no = {EACCES, EEXIST, EFAULT, EISDIR, EMFILE,
+                                    ENAMETOOLONG, ENFILE, ENODEV, ENOENT,
+                                    ENODEV}},
+    [GF_FOP_OPEN] = {.error_no_count = 10,
+                     .error_no = {EACCES, EEXIST, EFAULT, EISDIR, EMFILE,
+                                  ENAMETOOLONG, ENFILE, ENODEV, ENOENT,
+                                  ENOMEM}},
+    [GF_FOP_READ] = {.error_no_count = 5,
+                     .error_no = {EINVAL, EBADF, EFAULT, EISDIR, ENAMETOOLONG}},
+    [GF_FOP_WRITE] = {.error_no_count = 7,
+                      .error_no = {EINVAL, EBADF, EFAULT, EISDIR, ENAMETOOLONG,
+                                   ENOSPC, GF_ERROR_SHORT_WRITE}},
+    [GF_FOP_STATFS] = {.error_no_count = 9,
+                       .error_no = {EACCES, EFAULT, EINTR, EIO, ENAMETOOLONG,
+                                    ENOENT, ENOMEM, ENOSYS, ENOTDIR}},
+    [GF_FOP_FLUSH] = {.error_no_count = 5,
+                      .error_no = {EACCES, EFAULT, ENAMETOOLONG, ENOSYS,
+                                   ENOENT}},
+    [GF_FOP_FSYNC] = {.error_no_count = 4,
+                      .error_no = {EBADF, EIO, EROFS, EINVAL}},
+    [GF_FOP_SETXATTR] = {.error_no_count = 3,
+                         .error_no = {EACCES, EINTR, ENAMETOOLONG}},
+    [GF_FOP_GETXATTR] = {.error_no_count = 3,
+                         .error_no = {EACCES, ENAMETOOLONG, EINTR}},
+    [GF_FOP_REMOVEXATTR] = {.error_no_count = 3,
+                            .error_no = {EACCES, ENAMETOOLONG, EINTR}},
+    [GF_FOP_FSETXATTR] = {.error_no_count = 4,
+                          .error_no = {EACCES, EBADF, EINTR, ENAMETOOLONG}},
+    [GF_FOP_FGETXATTR] = {.error_no_count = 4,
+                          .error_no = {EACCES, EBADF, ENAMETOOLONG, EINTR}},
+    [GF_FOP_FREMOVEXATTR] = {.error_no_count = 4,
+                             .error_no = {EACCES, EBADF, ENAMETOOLONG, EINTR}},
+    [GF_FOP_OPENDIR] = {.error_no_count = 8,
+                        .error_no = {EACCES, EEXIST, EFAULT, EISDIR, EMFILE,
+                                     ENAMETOOLONG, ENFILE, ENODEV}},
+    [GF_FOP_READDIR] = {.error_no_count = 5,
+                        .error_no = {EINVAL, EACCES, EBADF, EMFILE, ENOENT}},
+    [GF_FOP_READDIRP] = {.error_no_count = 5,
+                         .error_no = {EINVAL, EACCES, EBADF, EMFILE, ENOENT}},
+    [GF_FOP_FSYNCDIR] = {.error_no_count = 4,
+                         .error_no = {EBADF, EIO, EROFS, EINVAL}},
+    [GF_FOP_ACCESS] = {.error_no_count = 8,
+                       .error_no = {EACCES, ENAMETOOLONG, ENOENT, ENOTDIR,
+                                    EROFS, EFAULT, EINVAL, EIO}},
+    [GF_FOP_FTRUNCATE] = {.error_no_count = 9,
+                          .error_no = {EACCES, EFAULT, EFBIG, EINTR, EINVAL,
+                                       EIO, EISDIR, ENAMETOOLONG, ENOENT}},
+    [GF_FOP_FSTAT] = {.error_no_count = 7,
+                      .error_no = {EACCES, EBADF, EFAULT, ENAMETOOLONG, ENOENT,
+                                   ENOMEM, ENOTDIR}},
+    [GF_FOP_LK] = {.error_no_count = 4,
+                   .error_no = {EACCES, EFAULT, ENOENT, EINTR}},
+    [GF_FOP_XATTROP] = {.error_no_count = 5,
+                        .error_no = {EACCES, EFAULT, ENAMETOOLONG, ENOSYS,
+                                     ENOENT}},
+    [GF_FOP_FXATTROP] = {.error_no_count = 4,
+                         .error_no = {EBADF, EIO, EROFS, EINVAL}},
+    [GF_FOP_INODELK] = {.error_no_count = 3,
+                        .error_no = {EACCES, EINTR, ENAMETOOLONG}},
+    [GF_FOP_FINODELK] = {.error_no_count = 4,
+                         .error_no = {EACCES, EBADF, EINTR, ENAMETOOLONG}},
+    [GF_FOP_ENTRYLK] = {.error_no_count = 3,
+                        .error_no = {EACCES, ENAMETOOLONG, EINTR}},
+    [GF_FOP_FENTRYLK] = {.error_no_count = 10,
+                         .error_no = {EACCES, EEXIST, EFAULT, EISDIR, EMFILE,
+                                      ENAMETOOLONG, ENFILE, ENODEV, ENOENT,
+                                      ENOMEM}},
+    [GF_FOP_SETATTR] = {.error_no_count = 10, /* NOTE(review): EIO twice */
+                        .error_no = {EACCES, EFAULT, EIO, ENAMETOOLONG, ENOENT,
+                                     ENOMEM, ENOTDIR, EPERM, EROFS, EIO}},
+    [GF_FOP_FSETATTR] = {.error_no_count = 11, /* NOTE(review): EIO twice */
+                         .error_no = {EACCES, EFAULT, EIO, ENAMETOOLONG, ENOENT,
+                                      ENOMEM, ENOTDIR, EPERM, EROFS, EBADF,
+                                      EIO}},
+    [GF_FOP_GETSPEC] = {.error_no_count = 3,
+                        .error_no = {EACCES, ENAMETOOLONG, EINTR}}};
 
-#include "xlator.h"
+/* Return a random index into error_no_list[op_no].error_no, or 0 when op_no
+ * is out of range or its slot lists no errors (which would mean "% 0"). */
+int
+generate_rand_no(int op_no)
+{
+    int size = sizeof(error_no_list) / sizeof(error_no_list[0]);
 
-typedef struct {
-	int op_count;
-} eg_t;
+    if (op_no < 0 || op_no >= size || error_no_list[op_no].error_no_count <= 0)
+        return 0;
 
-int error_gen (xlator_t *this)
-{
-	eg_t *egp = NULL;
-	int count = 0;
-	egp = this->private;
-	count = ++egp->op_count;
-	if((count % 10) == 0) {
-		count = count / 10;
-		if ((count % 2) == 0)
-			return ENOTCONN;
-		else
-			return EIO;
-	}
-	return 0;
+    /* coverity[DC.WEAK_CRYPTO] */
+    return rand() % error_no_list[op_no].error_no_count;
 }
 
-static int32_t
-error_gen_lookup_cbk (call_frame_t *frame,
-		      void *cookie,
-		      xlator_t *this,
-		      int32_t op_ret,
-		      int32_t op_errno,
-		      inode_t *inode,
-		      struct stat *buf,
-		      dict_t *dict)
-{
-	STACK_UNWIND (frame,
-		      op_ret,
-		      op_errno,
-		      inode,
-		      buf,
-		      dict);
-	return 0;
+/*
+ * Map a symbolic error name to its numeric value.
+ *
+ * error_no points at the NUL-terminated name.  Neither the pointer nor the
+ * string it refers to is checked for NULL before being dereferenced, and the
+ * comparison is exact (case-sensitive).
+ *
+ * Returns the errno (or GF_ERROR_SHORT_WRITE) named by the string, and EAGAIN
+ * for any name that is not in the table.
+ */
+int
+conv_errno_to_int(char **error_no)
+{
+    static const struct {
+        const char *name;
+        int value;
+    } known[] = {
+        {"ENOENT", ENOENT},
+        {"ENOTDIR", ENOTDIR},
+        {"ENAMETOOLONG", ENAMETOOLONG},
+        {"EACCES", EACCES},
+        {"EBADF", EBADF},
+        {"EFAULT", EFAULT},
+        {"ENOMEM", ENOMEM},
+        {"EINVAL", EINVAL},
+        {"EIO", EIO},
+        {"EEXIST", EEXIST},
+        {"ENOSPC", ENOSPC},
+        {"EPERM", EPERM},
+        {"EROFS", EROFS},
+        {"EBUSY", EBUSY},
+        {"EISDIR", EISDIR},
+        {"ENOTEMPTY", ENOTEMPTY},
+        {"EMLINK", EMLINK},
+        {"ENODEV", ENODEV},
+        {"EXDEV", EXDEV},
+        {"EMFILE", EMFILE},
+        {"ENFILE", ENFILE},
+        {"ENOSYS", ENOSYS},
+        {"EINTR", EINTR},
+        {"EFBIG", EFBIG},
+        {"GF_ERROR_SHORT_WRITE", GF_ERROR_SHORT_WRITE},
+    };
+    const int known_count = sizeof(known) / sizeof(known[0]);
+    const char *wanted = *error_no;
+    int idx = 0;
+
+    /* Names are unique, so the first hit is the only possible hit. */
+    for (idx = 0; idx < known_count; idx++) {
+        if (!strcmp(wanted, known[idx].name))
+            return known[idx].value;
+    }
+
+    /* Unknown name. */
+    return EAGAIN;
 }
 
-int32_t
-error_gen_lookup (call_frame_t *frame,
-		  xlator_t *this,
-		  loc_t *loc,
-		  dict_t *xattr_req)
-{
-	int op_errno = 0;
-	op_errno = error_gen(this);
-	if (op_errno) {
-		GF_ERROR(this, "unwind(-1, %s)", strerror (op_errno));
-		STACK_UNWIND (frame, -1, op_errno, NULL, NULL, NULL);
-		return 0;
-	}
-	STACK_WIND (frame,
-		    error_gen_lookup_cbk,
-		    FIRST_CHILD(this),
-		    FIRST_CHILD(this)->fops->lookup,
-		    loc,
-		    xattr_req);
-	return 0;
+/*
+ * Decide whether the fop identified by op_no fails on this call.
+ *
+ * Returns 0 to let the fop proceed, otherwise the error number to unwind
+ * with: egp->error_no_int when that is non-zero, else a random entry of
+ * error_no_list[op_no].
+ */
+int
+error_gen(xlator_t *this, int op_no)
+{
+    eg_t *egp = this->private;
+    int count = 0;
+    int error_no_int = 0;
+    gf_boolean_t should_err = _gf_false;
+    int error_no_list_size = 0;
+
+    if (egp->random_failure) {
+        /*
+         * Legacy mode: fail once op_count reaches failure_iter_no, then
+         * restart the count and draw a new interval.  Counter and interval
+         * are updated inside a single egp->lock critical section.
+         */
+        LOCK(&egp->lock);
+        {
+            count = ++(egp->op_count);
+            error_no_int = egp->error_no_int;
+            /* A zero interval would make the modulo trap; never fail then. */
+            if (egp->failure_iter_no > 0 &&
+                (count % egp->failure_iter_no) == 0) {
+                egp->op_count = 0;
+                /* coverity[DC.WEAK_CRYPTO] */
+                egp->failure_iter_no = 3 + (rand() % GF_UNIVERSAL_ANSWER);
+                should_err = _gf_true;
+            }
+        }
+        UNLOCK(&egp->lock);
+    } else {
+        /*
+         * Normal mode: failure_iter_no is a numerator over
+         * FAILURE_GRANULARITY, so each call is an independent draw against
+         * that threshold.  rand() is cheap and still yields streaks and dry
+         * spells.  The configured errno is read here too, so that it is
+         * honoured in both modes.
+         */
+        error_no_int = egp->error_no_int;
+        if ((rand() % FAILURE_GRANULARITY) < egp->failure_iter_no)
+            should_err = _gf_true;
+    }
+
+    if (!should_err)
+        return 0;
+    if (error_no_int)
+        return error_no_int;
+
+    error_no_list_size = sizeof(error_no_list) / sizeof(error_no_list[0]);
+    if (op_no < 0 || op_no >= error_no_list_size)
+        op_no = 0;
+    /* An empty slot has nothing to pick; generate_rand_no() would do "% 0". */
+    if (error_no_list[op_no].error_no_count <= 0)
+        return 0;
+
+    return error_no_list[op_no].error_no[generate_rand_no(op_no)];
 }
 
-
-int32_t
-error_gen_forget (xlator_t *this,
-		  inode_t *inode)
+int /* lookup fop: inject a failure or pass the call to the first child */
+error_gen_lookup(call_frame_t *frame, xlator_t *this, loc_t *loc, dict_t *xdata)
 {
-	return 0;
-}
+    int op_errno = 0; /* 0 means "no failure injected" */
+    eg_t *egp = NULL;
+    int enable = 1; /* replaced below by the per-fop switch */
 
-int32_t
-error_gen_stat_cbk (call_frame_t *frame,
-		    void *cookie,
-		    xlator_t *this,
-		    int32_t op_ret,
-		    int32_t op_errno,
-		    struct stat *buf)
-{
-	STACK_UNWIND (frame,
-		      op_ret,
-		      op_errno,
-		      buf);
-	return 0;
-}
+    egp = this->private;
+    enable = egp->enable[GF_FOP_LOOKUP];
 
-int32_t
-error_gen_stat (call_frame_t *frame,
-		xlator_t *this,
-		loc_t *loc)
-{
-	int op_errno = 0;
-	op_errno = error_gen(this);
-	if (op_errno) {
-		GF_ERROR(this, "unwind(-1, %s)", strerror (op_errno));
-		STACK_UNWIND (frame, -1, op_errno, NULL);
-		return 0;
-	}
-	STACK_WIND (frame,
-		    error_gen_stat_cbk,
-		    FIRST_CHILD(this),
-		    FIRST_CHILD(this)->fops->stat,
-		    loc);
-	return 0;
-}
+    if (enable)
+        op_errno = error_gen(this, GF_FOP_LOOKUP);
 
-int32_t
-error_gen_chmod_cbk (call_frame_t *frame,
-		     void *cookie,
-		     xlator_t *this,
-		     int32_t op_ret,
-		     int32_t op_errno,
-		     struct stat *buf)
-{
-	STACK_UNWIND (frame,
-		      op_ret,
-		      op_errno,
-		      buf);
-	return 0;
+    if (op_errno) { /* fail here; the child is not called */
+        GF_ERROR(this, "unwind(-1, %s)", strerror(op_errno));
+        STACK_UNWIND_STRICT(lookup, frame, -1, op_errno, NULL, NULL, NULL,
+                            NULL);
+        return 0;
+    }
+
+    STACK_WIND_TAIL(frame, FIRST_CHILD(this), FIRST_CHILD(this)->fops->lookup,
+                    loc, xdata);
+    return 0;
 }
 
-int32_t
-error_gen_chmod (call_frame_t *frame,
-		 xlator_t *this,
-		 loc_t *loc,
-		 mode_t mode)
+int /* stat fop: inject a failure or pass the call to the first child */
+error_gen_stat(call_frame_t *frame, xlator_t *this, loc_t *loc, dict_t *xdata)
 {
-	int op_errno = 0;
-	op_errno = error_gen(this);
-	if (op_errno) {
-		GF_ERROR(this, "unwind(-1, %s)", strerror (op_errno));
-		STACK_UNWIND (frame, -1, op_errno, NULL);
-		return 0;
-	}
-	STACK_WIND (frame,
-		    error_gen_chmod_cbk,
-		    FIRST_CHILD(this),
-		    FIRST_CHILD(this)->fops->chmod,
-		    loc,
-		    mode);
-	return 0;
-}
+    int op_errno = 0; /* 0 means "no failure injected" */
+    eg_t *egp = NULL;
+    int enable = 1; /* replaced below by the per-fop switch */
 
+    egp = this->private;
+    enable = egp->enable[GF_FOP_STAT];
 
-int32_t
-error_gen_fchmod_cbk (call_frame_t *frame,
-		      void *cookie,
-		      xlator_t *this,
-		      int32_t op_ret,
-		      int32_t op_errno,
-		      struct stat *buf)
-{
-	STACK_UNWIND (frame,
-		      op_ret,
-		      op_errno,
-		      buf);
-	return 0;
-}
+    if (enable)
+        op_errno = error_gen(this, GF_FOP_STAT);
 
-int32_t
-error_gen_fchmod (call_frame_t *frame,
-		  xlator_t *this,
-		  fd_t *fd,
-		  mode_t mode)
-{
-	int op_errno = 0;
-	op_errno = error_gen(this);
-	if (op_errno) {
-		GF_ERROR(this, "unwind(-1, %s)", strerror (op_errno));
-		STACK_UNWIND (frame, -1, op_errno, NULL);
-		return 0;
-	}
-	STACK_WIND (frame,
-		    error_gen_fchmod_cbk,
-		    FIRST_CHILD(this),
-		    FIRST_CHILD(this)->fops->fchmod,
-		    fd,
-		    mode);
-	return 0;
-}
+    if (op_errno) { /* fail here; the child is not called */
+        GF_ERROR(this, "unwind(-1, %s)", strerror(op_errno));
+        STACK_UNWIND_STRICT(stat, frame, -1, op_errno, NULL, xdata);
+        return 0; /* NOTE(review): the unwind reuses the request's xdata */
+    }
 
-int32_t
-error_gen_chown_cbk (call_frame_t *frame,
-		     void *cookie,
-		     xlator_t *this,
-		     int32_t op_ret,
-		     int32_t op_errno,
-		     struct stat *buf)
-{
-	STACK_UNWIND (frame,
-		      op_ret,
-		      op_errno,
-		      buf);
-	return 0;
+    STACK_WIND_TAIL(frame, FIRST_CHILD(this), FIRST_CHILD(this)->fops->stat,
+                    loc, xdata);
+    return 0;
 }
 
-int32_t
-error_gen_chown (call_frame_t *frame,
-		 xlator_t *this,
-		 loc_t *loc,
-		 uid_t uid,
-		 gid_t gid)
+int /* setattr fop: inject a failure or pass the call to the first child */
+error_gen_setattr(call_frame_t *frame, xlator_t *this, loc_t *loc,
+                  struct iatt *stbuf, int32_t valid, dict_t *xdata)
 {
-	int op_errno = 0;
-	op_errno = error_gen(this);
-	if (op_errno) {
-		GF_ERROR(this, "unwind(-1, %s)", strerror (op_errno));
-		STACK_UNWIND (frame, -1, op_errno, NULL);
-		return 0;
-	}
-	STACK_WIND (frame,
-		    error_gen_chown_cbk,
-		    FIRST_CHILD(this),
-		    FIRST_CHILD(this)->fops->chown,
-		    loc,
-		    uid,
-		    gid);
-	return 0;
-}
+    int op_errno = 0; /* 0 means "no failure injected" */
+    eg_t *egp = NULL;
+    int enable = 1; /* replaced below by the per-fop switch */
 
-int32_t
-error_gen_fchown_cbk (call_frame_t *frame,
-		      void *cookie,
-		      xlator_t *this,
-		      int32_t op_ret,
-		      int32_t op_errno,
-		      struct stat *buf)
-{
-	STACK_UNWIND (frame,
-		      op_ret,
-		      op_errno,
-		      buf);
-	return 0;
-}
+    egp = this->private;
+    enable = egp->enable[GF_FOP_SETATTR];
 
-int32_t
-error_gen_fchown (call_frame_t *frame,
-		  xlator_t *this,
-		  fd_t *fd,
-		  uid_t uid,
-		  gid_t gid)
-{
-	int op_errno = 0;
-	op_errno = error_gen(this);
-	if (op_errno) {
-		GF_ERROR(this, "unwind(-1, %s)", strerror (op_errno));
-		STACK_UNWIND (frame, -1, op_errno, NULL);
-		return 0;
-	}
-	STACK_WIND (frame,
-		    error_gen_fchown_cbk,
-		    FIRST_CHILD(this),
-		    FIRST_CHILD(this)->fops->fchown,
-		    fd,
-		    uid,
-		    gid);
-	return 0;
-}
+    if (enable)
+        op_errno = error_gen(this, GF_FOP_SETATTR);
 
-int32_t
-error_gen_truncate_cbk (call_frame_t *frame,
-			void *cookie,
-			xlator_t *this,
-			int32_t op_ret,
-			int32_t op_errno,
-			struct stat *buf)
-{
-	STACK_UNWIND (frame,
-		      op_ret,
-		      op_errno,
-		      buf);
-	return 0;
-}
+    if (op_errno) { /* fail here; the child is not called */
+        GF_ERROR(this, "unwind(-1, %s)", strerror(op_errno));
+        STACK_UNWIND_STRICT(setattr, frame, -1, op_errno, NULL, NULL, xdata);
+        return 0; /* NOTE(review): the unwind reuses the request's xdata */
+    }
 
-int32_t
-error_gen_truncate (call_frame_t *frame,
-		    xlator_t *this,
-		    loc_t *loc,
-		    off_t offset)
-{
-	int op_errno = 0;
-	op_errno = error_gen(this);
-	if (op_errno) {
-		GF_ERROR(this, "unwind(-1, %s)", strerror (op_errno));
-		STACK_UNWIND (frame, -1, op_errno, NULL);
-		return 0;
-	}
-	STACK_WIND (frame,
-		    error_gen_truncate_cbk,
-		    FIRST_CHILD(this),
-		    FIRST_CHILD(this)->fops->truncate,
-		    loc,
-		    offset);
-	return 0;
+    STACK_WIND_TAIL(frame, FIRST_CHILD(this), FIRST_CHILD(this)->fops->setattr,
+                    loc, stbuf, valid, xdata);
+    return 0;
 }
 
-int32_t
-error_gen_ftruncate_cbk (call_frame_t *frame,
-			 void *cookie,
-			 xlator_t *this,
-			 int32_t op_ret,
-			 int32_t op_errno,
-			 struct stat *buf)
+int /* fsetattr fop: inject a failure or pass the call to the first child */
+error_gen_fsetattr(call_frame_t *frame, xlator_t *this, fd_t *fd,
+                   struct iatt *stbuf, int32_t valid, dict_t *xdata)
 {
-	STACK_UNWIND (frame,
-		      op_ret,
-		      op_errno,
-		      buf);
-	return 0;
-}
+    int op_errno = 0; /* 0 means "no failure injected" */
+    eg_t *egp = NULL;
+    int enable = 1; /* replaced below by the per-fop switch */
 
-int32_t
-error_gen_ftruncate (call_frame_t *frame,
-		     xlator_t *this,
-		     fd_t *fd,
-		     off_t offset)
-{
-	int op_errno = 0;
-	op_errno = error_gen(this);
-	if (op_errno) {
-		GF_ERROR(this, "unwind(-1, %s)", strerror (op_errno));
-		STACK_UNWIND (frame, -1, op_errno, NULL);
-		return 0;
-	}
-	STACK_WIND (frame,
-		    error_gen_ftruncate_cbk,
-		    FIRST_CHILD(this),
-		    FIRST_CHILD(this)->fops->ftruncate,
-		    fd,
-		    offset);
-	return 0;
-}
+    egp = this->private;
+    enable = egp->enable[GF_FOP_FSETATTR];
 
-int32_t
-error_gen_utimens_cbk (call_frame_t *frame,
-		       void *cookie,
-		       xlator_t *this,
-		       int32_t op_ret,
-		       int32_t op_errno,
-		       struct stat *buf)
-{
-	STACK_UNWIND (frame,
-		      op_ret,
-		      op_errno,
-		      buf);
-	return 0;
-}
+    if (enable)
+        op_errno = error_gen(this, GF_FOP_FSETATTR);
 
+    if (op_errno) { /* fail here; the child is not called */
+        GF_ERROR(this, "unwind(-1, %s)", strerror(op_errno));
+        STACK_UNWIND_STRICT(fsetattr, frame, -1, op_errno, NULL, NULL, xdata);
+        return 0; /* NOTE(review): the unwind reuses the request's xdata */
+    }
 
-int32_t
-error_gen_utimens (call_frame_t *frame,
-		   xlator_t *this,
-		   loc_t *loc,
-		   struct timespec tv[2])
-{
-	int op_errno = 0;
-	op_errno = error_gen(this);
-	if (op_errno) {
-		GF_ERROR(this, "unwind(-1, %s)", strerror (op_errno));
-		STACK_UNWIND (frame, -1, op_errno, NULL);
-		return 0;
-	}
-	STACK_WIND (frame,
-		    error_gen_utimens_cbk,
-		    FIRST_CHILD(this),
-		    FIRST_CHILD(this)->fops->utimens,
-		    loc,
-		    tv);
-	return 0;
+    STACK_WIND_TAIL(frame, FIRST_CHILD(this), FIRST_CHILD(this)->fops->fsetattr,
+                    fd, stbuf, valid, xdata);
+    return 0;
 }
 
-int32_t
-error_gen_access_cbk (call_frame_t *frame,
-		      void *cookie,
-		      xlator_t *this,
-		      int32_t op_ret,
-		      int32_t op_errno)
+int /* truncate fop: inject a failure or pass the call to the first child */
+error_gen_truncate(call_frame_t *frame, xlator_t *this, loc_t *loc,
+                   off_t offset, dict_t *xdata)
 {
-	STACK_UNWIND (frame,
-		      op_ret,
-		      op_errno);
-	return 0;
-}
+    int op_errno = 0; /* 0 means "no failure injected" */
+    eg_t *egp = NULL;
+    int enable = 1; /* replaced below by the per-fop switch */
 
-int32_t
-error_gen_access (call_frame_t *frame,
-		  xlator_t *this,
-		  loc_t *loc,
-		  int32_t mask)
-{
-	int op_errno = 0;
-	op_errno = error_gen(this);
-	if (op_errno) {
-		GF_ERROR(this, "unwind(-1, %s)", strerror (op_errno));
-		STACK_UNWIND (frame, -1, op_errno);
-		return 0;
-	}
-	STACK_WIND (frame,
-		    error_gen_access_cbk,
-		    FIRST_CHILD(this),
-		    FIRST_CHILD(this)->fops->access,
-		    loc,
-		    mask);
-	return 0;
-}
+    egp = this->private;
+    enable = egp->enable[GF_FOP_TRUNCATE];
 
+    if (enable)
+        op_errno = error_gen(this, GF_FOP_TRUNCATE);
 
-int32_t
-error_gen_readlink_cbk (call_frame_t *frame,
-			void *cookie,
-			xlator_t *this,
-			int32_t op_ret,
-			int32_t op_errno,
-			const char *path)
-{
-	STACK_UNWIND (frame,
-		      op_ret,
-		      op_errno,
-		      path);
-	return 0;
+    if (op_errno) { /* fail here; the child is not called */
+        GF_ERROR(this, "unwind(-1, %s)", strerror(op_errno));
+        STACK_UNWIND_STRICT(truncate, frame, -1, op_errno, NULL, NULL, xdata);
+        return 0; /* NOTE(review): the unwind reuses the request's xdata */
+    }
+
+    STACK_WIND_TAIL(frame, FIRST_CHILD(this), FIRST_CHILD(this)->fops->truncate,
+                    loc, offset, xdata);
+    return 0;
 }
 
-int32_t
-error_gen_readlink (call_frame_t *frame,
-		    xlator_t *this,
-		    loc_t *loc,
-		    size_t size)
+int /* ftruncate fop: inject a failure or pass the call to the first child */
+error_gen_ftruncate(call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset,
+                    dict_t *xdata)
 {
-	int op_errno = 0;
-	op_errno = error_gen(this);
-	if (op_errno) {
-		GF_ERROR(this, "unwind(-1, %s)", strerror (op_errno));
-		STACK_UNWIND (frame, -1, op_errno, NULL);
-		return 0;
-	}
-	STACK_WIND (frame,
-		    error_gen_readlink_cbk,
-		    FIRST_CHILD(this),
-		    FIRST_CHILD(this)->fops->readlink,
-		    loc,
-		    size);
-	return 0;
-}
+    int op_errno = 0; /* 0 means "no failure injected" */
+    eg_t *egp = NULL;
+    int enable = 1; /* replaced below by the per-fop switch */
 
+    egp = this->private;
+    enable = egp->enable[GF_FOP_FTRUNCATE];
 
-int32_t
-error_gen_mknod_cbk (call_frame_t *frame,
-		     void *cookie,
-		     xlator_t *this,
-		     int32_t op_ret,
-		     int32_t op_errno,
-		     inode_t *inode,
-		     struct stat *buf)
-{
-	STACK_UNWIND (frame,
-		      op_ret,
-		      op_errno,
-		      inode,
-		      buf);
-	return 0;
-}
+    if (enable)
+        op_errno = error_gen(this, GF_FOP_FTRUNCATE);
 
-int32_t
-error_gen_mknod (call_frame_t *frame,
-		 xlator_t *this,
-		 loc_t *loc,
-		 mode_t mode,
-		 dev_t rdev)
-{
-	int op_errno = 0;
-	op_errno = error_gen(this);
-	if (op_errno) {
-		GF_ERROR(this, "unwind(-1, %s)", strerror (op_errno));
-		STACK_UNWIND (frame, -1, op_errno, NULL, NULL);
-		return 0;
-	}
-	STACK_WIND (frame,
-		    error_gen_mknod_cbk,
-		    FIRST_CHILD(this),
-		    FIRST_CHILD(this)->fops->mknod,
-		    loc, mode, rdev);
-	return 0;
-}
+    if (op_errno) { /* fail here; the child is not called */
+        GF_ERROR(this, "unwind(-1, %s)", strerror(op_errno));
+        STACK_UNWIND_STRICT(ftruncate, frame, -1, op_errno, NULL, NULL, xdata);
+        return 0; /* NOTE(review): the unwind reuses the request's xdata */
+    }
 
-int32_t
-error_gen_mkdir_cbk (call_frame_t *frame,
-		     void *cookie,
-		     xlator_t *this,
-		     int32_t op_ret,
-		     int32_t op_errno,
-		     inode_t *inode,
-		     struct stat *buf)
-{
-	STACK_UNWIND (frame,
-		      op_ret,
-		      op_errno,
-		      inode,
-		      buf);
-	return 0;
+    STACK_WIND_TAIL(frame, FIRST_CHILD(this),
+                    FIRST_CHILD(this)->fops->ftruncate, fd, offset, xdata);
+    return 0;
 }
 
-int32_t
-error_gen_mkdir (call_frame_t *frame,
-		 xlator_t *this,
-		 loc_t *loc,
-		 mode_t mode)
+int /* access fop: inject a failure or pass the call to the first child */
+error_gen_access(call_frame_t *frame, xlator_t *this, loc_t *loc, int32_t mask,
+                 dict_t *xdata)
 {
-	int op_errno = 0;
-	op_errno = error_gen(this);
-	if (op_errno) {
-		GF_ERROR(this, "unwind(-1, %s)", strerror (op_errno));
-		STACK_UNWIND (frame, -1, op_errno, NULL, NULL);
-		return 0;
-	}
-	STACK_WIND (frame,
-		    error_gen_mkdir_cbk,
-		    FIRST_CHILD(this),
-		    FIRST_CHILD(this)->fops->mkdir,
-		    loc, mode);
-	return 0;
-}
+    int op_errno = 0; /* 0 means "no failure injected" */
+    eg_t *egp = NULL;
+    int enable = 1; /* replaced below by the per-fop switch */
 
-int32_t
-error_gen_unlink_cbk (call_frame_t *frame,
-		      void *cookie,
-		      xlator_t *this,
-		      int32_t op_ret,
-		      int32_t op_errno)
-{
-	STACK_UNWIND (frame, op_ret, op_errno);
-	return 0;
-}
+    egp = this->private;
+    enable = egp->enable[GF_FOP_ACCESS];
 
-int32_t
-error_gen_unlink (call_frame_t *frame,
-		  xlator_t *this,
-		  loc_t *loc)
-{
-	int op_errno = 0;
-	op_errno = error_gen(this);
-	if (op_errno) {
-		GF_ERROR(this, "unwind(-1, %s)", strerror (op_errno));
-		STACK_UNWIND (frame, -1, op_errno);
-		return 0;
-	}
-	STACK_WIND (frame,
-		    error_gen_unlink_cbk,
-		    FIRST_CHILD(this),
-		    FIRST_CHILD(this)->fops->unlink,
-		    loc);
-	return 0;
-}
+    if (enable)
+        op_errno = error_gen(this, GF_FOP_ACCESS);
 
-int32_t
-error_gen_rmdir_cbk (call_frame_t *frame,
-		     void *cookie,
-		     xlator_t *this,
-		     int32_t op_ret,
-		     int32_t op_errno)
-{
-	STACK_UNWIND (frame,
-		      op_ret,
-		      op_errno);
-	return 0;
+    if (op_errno) { /* fail here; the child is not called */
+        GF_ERROR(this, "unwind(-1, %s)", strerror(op_errno));
+        STACK_UNWIND_STRICT(access, frame, -1, op_errno, xdata);
+        return 0; /* NOTE(review): the unwind reuses the request's xdata */
+    }
+
+    STACK_WIND_TAIL(frame, FIRST_CHILD(this), FIRST_CHILD(this)->fops->access,
+                    loc, mask, xdata);
+    return 0;
 }
 
-int32_t
-error_gen_rmdir (call_frame_t *frame,
-		 xlator_t *this,
-		 loc_t *loc)
+int /* readlink fop: inject a failure or pass the call to the first child */
+error_gen_readlink(call_frame_t *frame, xlator_t *this, loc_t *loc, size_t size,
+                   dict_t *xdata)
 {
-	int op_errno = 0;
-	op_errno = error_gen(this);
-	if (op_errno) {
-		GF_ERROR(this, "unwind(-1, %s)", strerror (op_errno));
-		STACK_UNWIND (frame, -1, op_errno);
-		return 0;
-	}
-	STACK_WIND (frame,
-		    error_gen_rmdir_cbk,
-		    FIRST_CHILD(this),
-		    FIRST_CHILD(this)->fops->rmdir,
-		    loc);
-	return 0;
-}
+    int op_errno = 0; /* 0 means "no failure injected" */
+    eg_t *egp = NULL;
+    int enable = 1; /* replaced below by the per-fop switch */
 
+    egp = this->private;
+    enable = egp->enable[GF_FOP_READLINK];
 
-int32_t
-error_gen_symlink_cbk (call_frame_t *frame,
-		       void *cookie,
-		       xlator_t *this,
-		       int32_t op_ret,
-		       int32_t op_errno,
-		       inode_t *inode,
-		       struct stat *buf)
-{
-	STACK_UNWIND (frame, op_ret, op_errno, inode,	buf);
-	return 0;
+    if (enable)
+        op_errno = error_gen(this, GF_FOP_READLINK);
+
+    if (op_errno) { /* fail here; the child is not called */
+        GF_ERROR(this, "unwind(-1, %s)", strerror(op_errno));
+        STACK_UNWIND_STRICT(readlink, frame, -1, op_errno, NULL, NULL, xdata);
+        return 0; /* NOTE(review): the unwind reuses the request's xdata */
+    }
+
+    STACK_WIND_TAIL(frame, FIRST_CHILD(this), FIRST_CHILD(this)->fops->readlink,
+                    loc, size, xdata);
+    return 0;
 }
 
-int32_t
-error_gen_symlink (call_frame_t *frame,
-		   xlator_t *this,
-		   const char *linkpath,
-		   loc_t *loc)
+int /* mknod fop: inject a failure or pass the call to the first child */
+error_gen_mknod(call_frame_t *frame, xlator_t *this, loc_t *loc, mode_t mode,
+                dev_t rdev, mode_t umask, dict_t *xdata)
 {
-	int op_errno = 0;
-	op_errno = error_gen(this);
-	if (op_errno) {
-		GF_ERROR(this, "unwind(-1, %s)", strerror (op_errno));
-		STACK_UNWIND (frame, -1, op_errno, NULL, NULL);
-		return 0;
-	}
-	STACK_WIND (frame,
-		    error_gen_symlink_cbk,
-		    FIRST_CHILD(this),
-		    FIRST_CHILD(this)->fops->symlink,
-		    linkpath, loc);
-	return 0;
-}
+    int op_errno = 0; /* 0 means "no failure injected" */
+    eg_t *egp = NULL;
+    int enable = 1; /* replaced below by the per-fop switch */
 
+    egp = this->private;
+    enable = egp->enable[GF_FOP_MKNOD];
 
-int32_t
-error_gen_rename_cbk (call_frame_t *frame,
-		      void *cookie,
-		      xlator_t *this,
-		      int32_t op_ret,
-		      int32_t op_errno,
-		      struct stat *buf)
-{
-	STACK_UNWIND (frame, op_ret, op_errno, buf);
-	return 0;
+    if (enable)
+        op_errno = error_gen(this, GF_FOP_MKNOD);
+
+    if (op_errno) { /* fail here; the child is not called */
+        GF_ERROR(this, "unwind(-1, %s)", strerror(op_errno));
+        STACK_UNWIND_STRICT(mknod, frame, -1, op_errno, NULL, NULL, NULL, NULL,
+                            xdata);
+        return 0; /* NOTE(review): the unwind reuses the request's xdata */
+    }
+
+    STACK_WIND_TAIL(frame, FIRST_CHILD(this), FIRST_CHILD(this)->fops->mknod,
+                    loc, mode, rdev, umask, xdata);
+    return 0;
 }
 
-int32_t
-error_gen_rename (call_frame_t *frame,
-		  xlator_t *this,
-		  loc_t *oldloc,
-		  loc_t *newloc)
+int /* mkdir fop: inject a failure or pass the call to the first child */
+error_gen_mkdir(call_frame_t *frame, xlator_t *this, loc_t *loc, mode_t mode,
+                mode_t umask, dict_t *xdata)
 {
-	int op_errno = 0;
-	op_errno = error_gen(this);
-	if (op_errno) {
-		GF_ERROR(this, "unwind(-1, %s)", strerror (op_errno));
-		STACK_UNWIND (frame, -1, op_errno, NULL);
-		return 0;
-	}
-	STACK_WIND (frame,
-		    error_gen_rename_cbk,
-		    FIRST_CHILD(this),
-		    FIRST_CHILD(this)->fops->rename,
-		    oldloc, newloc);
-	return 0;
-}
+    int op_errno = 0; /* 0 means "no failure injected" */
+    eg_t *egp = NULL;
+    int enable = 1; /* replaced below by the per-fop switch */
 
+    egp = this->private;
+    enable = egp->enable[GF_FOP_MKDIR];
 
-int32_t
-error_gen_link_cbk (call_frame_t *frame,
-		    void *cookie,
-		    xlator_t *this,
-		    int32_t op_ret,
-		    int32_t op_errno,
-		    inode_t *inode,
-		    struct stat *buf)
-{
-	STACK_UNWIND (frame, op_ret, op_errno, inode,	buf);
-	return 0;
+    if (enable)
+        op_errno = error_gen(this, GF_FOP_MKDIR);
+
+    if (op_errno) { /* fail here; the child is not called */
+        GF_ERROR(this, "unwind(-1, %s)", strerror(op_errno));
+        STACK_UNWIND_STRICT(mkdir, frame, -1, op_errno, NULL, NULL, NULL, NULL,
+                            xdata);
+        return 0; /* NOTE(review): the unwind reuses the request's xdata */
+    }
+
+    STACK_WIND_TAIL(frame, FIRST_CHILD(this), FIRST_CHILD(this)->fops->mkdir,
+                    loc, mode, umask, xdata);
+    return 0;
 }
 
-int32_t
-error_gen_link (call_frame_t *frame,
-		xlator_t *this,
-		loc_t *oldloc,
-		loc_t *newloc)
+int
+error_gen_unlink(call_frame_t *frame, xlator_t *this, loc_t *loc, int xflag,
+                 dict_t *xdata)
 {
-	int op_errno = 0;
-	op_errno = error_gen(this);
-	if (op_errno) {
-		GF_ERROR(this, "unwind(-1, %s)", strerror (op_errno));
-		STACK_UNWIND (frame, -1, op_errno, NULL, NULL);
-		return 0;
-	}
-	STACK_WIND (frame,
-		    error_gen_link_cbk,
-		    FIRST_CHILD(this),
-		    FIRST_CHILD(this)->fops->link,
-		    oldloc, newloc);
-	return 0;
-}
+    int op_errno = 0;
+    eg_t *egp = NULL;
+    int enable = 1;
 
+    egp = this->private;
+    enable = egp->enable[GF_FOP_UNLINK];
 
-int32_t
-error_gen_create_cbk (call_frame_t *frame,
-		      void *cookie,
-		      xlator_t *this,
-		      int32_t op_ret,
-		      int32_t op_errno,
-		      fd_t *fd,
-		      inode_t *inode,
-		      struct stat *buf)
-{
-	STACK_UNWIND (frame, op_ret, op_errno, fd, inode, buf);
-	return 0;
-}
+    if (enable)
+        op_errno = error_gen(this, GF_FOP_UNLINK);
 
-int32_t
-error_gen_create (call_frame_t *frame,
-		  xlator_t *this,
-		  loc_t *loc,
-		  int32_t flags,
-		  mode_t mode, fd_t *fd)
-{
-	int op_errno = 0;
-	op_errno = error_gen(this);
-	if (op_errno) {
-		GF_ERROR(this, "unwind(-1, %s)", strerror (op_errno));
-		STACK_UNWIND (frame, -1, op_errno, NULL, NULL, NULL);
-		return 0;
-	}
-
-	STACK_WIND (frame, error_gen_create_cbk,
-		    FIRST_CHILD(this),
-		    FIRST_CHILD(this)->fops->create,
-		    loc, flags, mode, fd);
-	return 0;
+    if (op_errno) {
+        GF_ERROR(this, "unwind(-1, %s)", strerror(op_errno));
+        STACK_UNWIND_STRICT(unlink, frame, -1, op_errno, NULL, NULL, xdata);
+        return 0;
+    }
+
+    STACK_WIND_TAIL(frame, FIRST_CHILD(this), FIRST_CHILD(this)->fops->unlink,
+                    loc, xflag, xdata);
+    return 0;
 }
 
-int32_t
-error_gen_open_cbk (call_frame_t *frame,
-		    void *cookie,
-		    xlator_t *this,
-		    int32_t op_ret,
-		    int32_t op_errno,
-		    fd_t *fd)
+int
+error_gen_rmdir(call_frame_t *frame, xlator_t *this, loc_t *loc, int flags,
+                dict_t *xdata)
 {
-	STACK_UNWIND (frame,
-		      op_ret,
-		      op_errno,
-		      fd);
-	return 0;
+    int op_errno = 0;
+    eg_t *egp = NULL;
+    int enable = 1;
+
+    egp = this->private;
+    enable = egp->enable[GF_FOP_RMDIR];
+
+    if (enable)
+        op_errno = error_gen(this, GF_FOP_RMDIR);
+
+    if (op_errno) {
+        GF_ERROR(this, "unwind(-1, %s)", strerror(op_errno));
+        STACK_UNWIND_STRICT(rmdir, frame, -1, op_errno, NULL, NULL, xdata);
+        return 0;
+    }
+
+    STACK_WIND_TAIL(frame, FIRST_CHILD(this), FIRST_CHILD(this)->fops->rmdir,
+                    loc, flags, xdata);
+    return 0;
 }
 
-int32_t
-error_gen_open (call_frame_t *frame,
-		xlator_t *this,
-		loc_t *loc,
-		int32_t flags, fd_t *fd)
+int
+error_gen_symlink(call_frame_t *frame, xlator_t *this, const char *linkpath,
+                  loc_t *loc, mode_t umask, dict_t *xdata)
 {
-	int op_errno = 0;
-	op_errno = error_gen(this);
-	if (op_errno) {
-		GF_ERROR(this, "unwind(-1, %s)", strerror (op_errno));
-		STACK_UNWIND (frame, -1, op_errno, NULL);
-		return 0;
-	}
-
-	STACK_WIND (frame,
-		    error_gen_open_cbk,
-		    FIRST_CHILD(this),
-		    FIRST_CHILD(this)->fops->open,
-		    loc, flags, fd);
-	return 0;
+    int op_errno = 0;
+    eg_t *egp = NULL;
+    int enable = 1;
+
+    egp = this->private;
+    enable = egp->enable[GF_FOP_SYMLINK];
+
+    if (enable)
+        op_errno = error_gen(this, GF_FOP_SYMLINK);
+
+    if (op_errno) {
+        GF_ERROR(this, "unwind(-1, %s)", strerror(op_errno));
+        STACK_UNWIND_STRICT(symlink, frame, -1, op_errno, NULL, NULL, NULL,
+                            NULL, NULL); /* pre & post parent attr */
+        return 0;
+    }
+
+    STACK_WIND_TAIL(frame, FIRST_CHILD(this), FIRST_CHILD(this)->fops->symlink,
+                    linkpath, loc, umask, xdata);
+    return 0;
 }
 
-int32_t
-error_gen_readv_cbk (call_frame_t *frame,
-		     void *cookie,
-		     xlator_t *this,
-		     int32_t op_ret,
-		     int32_t op_errno,
-		     struct iovec *vector,
-		     int32_t count,
-		     struct stat *stbuf)
+int
+error_gen_rename(call_frame_t *frame, xlator_t *this, loc_t *oldloc,
+                 loc_t *newloc, dict_t *xdata)
 {
-	STACK_UNWIND (frame,
-		      op_ret,
-		      op_errno,
-		      vector,
-		      count,
-		      stbuf);
-	return 0;
+    int op_errno = 0;
+    eg_t *egp = NULL;
+    int enable = 1;
+
+    egp = this->private;
+    enable = egp->enable[GF_FOP_RENAME];
+
+    if (enable)
+        op_errno = error_gen(this, GF_FOP_RENAME);
+
+    if (op_errno) {
+        GF_ERROR(this, "unwind(-1, %s)", strerror(op_errno));
+        STACK_UNWIND_STRICT(rename, frame, -1, op_errno, NULL, NULL, NULL, NULL,
+                            NULL, NULL);
+        return 0;
+    }
+
+    STACK_WIND_TAIL(frame, FIRST_CHILD(this), FIRST_CHILD(this)->fops->rename,
+                    oldloc, newloc, xdata);
+    return 0;
 }
 
-int32_t
-error_gen_readv (call_frame_t *frame,
-		 xlator_t *this,
-		 fd_t *fd,
-		 size_t size,
-		 off_t offset)
+int
+error_gen_link(call_frame_t *frame, xlator_t *this, loc_t *oldloc,
+               loc_t *newloc, dict_t *xdata)
 {
-	int op_errno = 0;
-	op_errno = error_gen(this);
-	if (op_errno) {
-		GF_ERROR(this, "unwind(-1, %s)", strerror (op_errno));
-		STACK_UNWIND (frame, -1, op_errno, NULL, 0, NULL);
-		return 0;
-	}
-
-
-	STACK_WIND (frame,
-		    error_gen_readv_cbk,
-		    FIRST_CHILD(this),
-		    FIRST_CHILD(this)->fops->readv,
-		    fd,
-		    size,
-		    offset);
-	return 0;
-}
+    int op_errno = 0;
+    eg_t *egp = NULL;
+    int enable = 1;
 
+    egp = this->private;
+    enable = egp->enable[GF_FOP_LINK];
 
-int32_t
-error_gen_writev_cbk (call_frame_t *frame,
-		      void *cookie,
-		      xlator_t *this,
-		      int32_t op_ret,
-		      int32_t op_errno,
-		      struct stat *stbuf)
-{
-	STACK_UNWIND (frame,
-		      op_ret,
-		      op_errno,
-		      stbuf);
-	return 0;
+    if (enable)
+        op_errno = error_gen(this, GF_FOP_LINK);
+
+    if (op_errno) {
+        GF_ERROR(this, "unwind(-1, %s)", strerror(op_errno));
+        STACK_UNWIND_STRICT(link, frame, -1, op_errno, NULL, NULL, NULL, NULL,
+                            NULL);
+        return 0;
+    }
+
+    STACK_WIND_TAIL(frame, FIRST_CHILD(this), FIRST_CHILD(this)->fops->link,
+                    oldloc, newloc, xdata);
+    return 0;
 }
 
-int32_t
-error_gen_writev (call_frame_t *frame,
-		  xlator_t *this,
-		  fd_t *fd,
-		  struct iovec *vector,
-		  int32_t count,
-		  off_t off)
+int
+error_gen_create(call_frame_t *frame, xlator_t *this, loc_t *loc, int32_t flags,
+                 mode_t mode, mode_t umask, fd_t *fd, dict_t *xdata)
 {
-	int op_errno = 0;
-	op_errno = error_gen(this);
-	if (op_errno) {
-		GF_ERROR(this, "unwind(-1, %s)", strerror (op_errno));
-		STACK_UNWIND (frame, -1, op_errno, NULL);
-		return 0;
-	}
-
-
-	STACK_WIND (frame,
-		    error_gen_writev_cbk,
-		    FIRST_CHILD(this),
-		    FIRST_CHILD(this)->fops->writev,
-		    fd,
-		    vector,
-		    count,
-		    off);
-	return 0;
+    int op_errno = 0;
+    eg_t *egp = NULL;
+    int enable = 1;
+
+    egp = this->private;
+    enable = egp->enable[GF_FOP_CREATE];
+
+    if (enable)
+        op_errno = error_gen(this, GF_FOP_CREATE);
+
+    if (op_errno) {
+        GF_ERROR(this, "unwind(-1, %s)", strerror(op_errno));
+        STACK_UNWIND_STRICT(create, frame, -1, op_errno, NULL, NULL, NULL, NULL,
+                            NULL, NULL);
+        return 0;
+    }
+
+    STACK_WIND_TAIL(frame, FIRST_CHILD(this), FIRST_CHILD(this)->fops->create,
+                    loc, flags, mode, umask, fd, xdata);
+    return 0;
 }
 
-int32_t
-error_gen_flush_cbk (call_frame_t *frame,
-		     void *cookie,
-		     xlator_t *this,
-		     int32_t op_ret,
-		     int32_t op_errno)
+int
+error_gen_open(call_frame_t *frame, xlator_t *this, loc_t *loc, int32_t flags,
+               fd_t *fd, dict_t *xdata)
 {
-	STACK_UNWIND (frame,
-		      op_ret,
-		      op_errno);
-	return 0;
+    int op_errno = 0;
+    eg_t *egp = NULL;
+    int enable = 1;
+
+    egp = this->private;
+    enable = egp->enable[GF_FOP_OPEN];
+
+    if (enable)
+        op_errno = error_gen(this, GF_FOP_OPEN);
+
+    if (op_errno) {
+        GF_ERROR(this, "unwind(-1, %s)", strerror(op_errno));
+        STACK_UNWIND_STRICT(open, frame, -1, op_errno, NULL, xdata);
+        return 0;
+    }
+
+    STACK_WIND_TAIL(frame, FIRST_CHILD(this), FIRST_CHILD(this)->fops->open,
+                    loc, flags, fd, xdata);
+    return 0;
 }
 
-int32_t
-error_gen_flush (call_frame_t *frame,
-		 xlator_t *this,
-		 fd_t *fd)
+int
+error_gen_readv(call_frame_t *frame, xlator_t *this, fd_t *fd, size_t size,
+                off_t offset, uint32_t flags, dict_t *xdata)
 {
-	int op_errno = 0;
-	op_errno = error_gen(this);
-	if (op_errno) {
-		GF_ERROR(this, "unwind(-1, %s)", strerror (op_errno));
-		STACK_UNWIND (frame, -1, op_errno);
-		return 0;
-	}
-
-	STACK_WIND (frame,
-		    error_gen_flush_cbk,
-		    FIRST_CHILD(this),
-		    FIRST_CHILD(this)->fops->flush,
-		    fd);
-	return 0;
-}
+    int op_errno = 0;
+    eg_t *egp = NULL;
+    int enable = 1;
 
+    egp = this->private;
+    enable = egp->enable[GF_FOP_READ];
 
-int32_t
-error_gen_fsync_cbk (call_frame_t *frame,
-		     void *cookie,
-		     xlator_t *this,
-		     int32_t op_ret,
-		     int32_t op_errno)
-{
-	STACK_UNWIND (frame,
-		      op_ret,
-		      op_errno);
-	return 0;
+    if (enable)
+        op_errno = error_gen(this, GF_FOP_READ);
+
+    if (op_errno) {
+        GF_ERROR(this, "unwind(-1, %s)", strerror(op_errno));
+        STACK_UNWIND_STRICT(readv, frame, -1, op_errno, NULL, 0, NULL, NULL,
+                            xdata);
+        return 0;
+    }
+
+    STACK_WIND_TAIL(frame, FIRST_CHILD(this), FIRST_CHILD(this)->fops->readv,
+                    fd, size, offset, flags, xdata);
+    return 0;
 }
 
-int32_t
-error_gen_fsync (call_frame_t *frame,
-		 xlator_t *this,
-		 fd_t *fd,
-		 int32_t flags)
-{
-	int op_errno = 0;
-	op_errno = error_gen(this);
-	if (op_errno) {
-		GF_ERROR(this, "unwind(-1, %s)", strerror (op_errno));
-		STACK_UNWIND (frame, -1, op_errno);
-		return 0;
-	}
-
-	STACK_WIND (frame,
-		    error_gen_fsync_cbk,
-		    FIRST_CHILD(this),
-		    FIRST_CHILD(this)->fops->fsync,
-		    fd,
-		    flags);
-	return 0;
+int
+error_gen_writev(call_frame_t *frame, xlator_t *this, fd_t *fd,
+                 struct iovec *vector, int32_t count, off_t off, uint32_t flags,
+                 struct iobref *iobref, dict_t *xdata)
+{
+    int op_errno = 0;
+    eg_t *egp = NULL;
+    int enable = 1;
+    struct iovec *shortvec = NULL;
+
+    egp = this->private;
+    enable = egp->enable[GF_FOP_WRITE];
+
+    if (enable)
+        op_errno = error_gen(this, GF_FOP_WRITE);
+
+    if (op_errno == GF_ERROR_SHORT_WRITE) {
+        /*
+         * A short write returns fewer bytes than the write requested.
+         * To simulate this, wind only a copy of the first iovec,
+         * truncated to half its length.
+         */
+        shortvec = iov_dup(vector, 1);
+        shortvec->iov_len /= 2;
+        count = 1;
+        goto wind;
+    } else if (op_errno) {
+        GF_ERROR(this, "unwind(-1, %s)", strerror(op_errno));
+        STACK_UNWIND_STRICT(writev, frame, -1, op_errno, NULL, NULL, xdata);
+        return 0;
+    }
+wind:
+    STACK_WIND_TAIL(frame, FIRST_CHILD(this), FIRST_CHILD(this)->fops->writev,
+                    fd, shortvec ? shortvec : vector, count, off, flags, iobref,
+                    xdata);
+
+    if (shortvec)
+        GF_FREE(shortvec);
+    return 0;
 }
 
-int32_t
-error_gen_fstat_cbk (call_frame_t *frame,
-		     void *cookie,
-		     xlator_t *this,
-		     int32_t op_ret,
-		     int32_t op_errno,
-		     struct stat *buf)
+int
+error_gen_flush(call_frame_t *frame, xlator_t *this, fd_t *fd, dict_t *xdata)
 {
-	STACK_UNWIND (frame,
-		      op_ret,
-		      op_errno,
-		      buf);
-	return 0;
+    int op_errno = 0;
+    eg_t *egp = NULL;
+    int enable = 1;
+
+    egp = this->private;
+    enable = egp->enable[GF_FOP_FLUSH];
+
+    if (enable)
+        op_errno = error_gen(this, GF_FOP_FLUSH);
+
+    if (op_errno) {
+        GF_ERROR(this, "unwind(-1, %s)", strerror(op_errno));
+        STACK_UNWIND_STRICT(flush, frame, -1, op_errno, xdata);
+        return 0;
+    }
+
+    STACK_WIND_TAIL(frame, FIRST_CHILD(this), FIRST_CHILD(this)->fops->flush,
+                    fd, xdata);
+    return 0;
 }
 
-int32_t
-error_gen_fstat (call_frame_t *frame,
-		 xlator_t *this,
-		 fd_t *fd)
+int
+error_gen_fsync(call_frame_t *frame, xlator_t *this, fd_t *fd, int32_t flags,
+                dict_t *xdata)
 {
-	int op_errno = 0;
-	op_errno = error_gen(this);
-	if (op_errno) {
-		GF_ERROR(this, "unwind(-1, %s)", strerror (op_errno));
-		STACK_UNWIND (frame, -1, op_errno, NULL);
-		return 0;
-	}
-
-	STACK_WIND (frame,
-		    error_gen_fstat_cbk,
-		    FIRST_CHILD(this),
-		    FIRST_CHILD(this)->fops->fstat,
-		    fd);
-	return 0;
+    int op_errno = 0;
+    eg_t *egp = NULL;
+    int enable = 1;
+
+    egp = this->private;
+    enable = egp->enable[GF_FOP_FSYNC];
+
+    if (enable)
+        op_errno = error_gen(this, GF_FOP_FSYNC);
+
+    if (op_errno) {
+        GF_ERROR(this, "unwind(-1, %s)", strerror(op_errno));
+        STACK_UNWIND_STRICT(fsync, frame, -1, op_errno, NULL, NULL, xdata);
+        return 0;
+    }
+
+    STACK_WIND_TAIL(frame, FIRST_CHILD(this), FIRST_CHILD(this)->fops->fsync,
+                    fd, flags, xdata);
+    return 0;
 }
 
-int32_t
-error_gen_opendir_cbk (call_frame_t *frame,
-		       void *cookie,
-		       xlator_t *this,
-		       int32_t op_ret,
-		       int32_t op_errno,
-		       fd_t *fd)
+int
+error_gen_fstat(call_frame_t *frame, xlator_t *this, fd_t *fd, dict_t *xdata)
 {
-	STACK_UNWIND (frame,
-		      op_ret,
-		      op_errno,
-		      fd);
-	return 0;
+    int op_errno = 0;
+    eg_t *egp = NULL;
+    int enable = 1;
+
+    egp = this->private;
+    enable = egp->enable[GF_FOP_FSTAT];
+
+    if (enable)
+        op_errno = error_gen(this, GF_FOP_FSTAT);
+
+    if (op_errno) {
+        GF_ERROR(this, "unwind(-1, %s)", strerror(op_errno));
+        STACK_UNWIND_STRICT(fstat, frame, -1, op_errno, NULL, xdata);
+        return 0;
+    }
+
+    STACK_WIND_TAIL(frame, FIRST_CHILD(this), FIRST_CHILD(this)->fops->fstat,
+                    fd, xdata);
+    return 0;
 }
 
-int32_t
-error_gen_opendir (call_frame_t *frame,
-		   xlator_t *this,
-		   loc_t *loc, fd_t *fd)
+int
+error_gen_opendir(call_frame_t *frame, xlator_t *this, loc_t *loc, fd_t *fd,
+                  dict_t *xdata)
 {
-	int op_errno = 0;
-	op_errno = error_gen(this);
-	if (op_errno) {
-		GF_ERROR(this, "unwind(-1, %s)", strerror (op_errno));
-		STACK_UNWIND (frame, -1, op_errno, NULL);
-		return 0;
-	}
-
-	STACK_WIND (frame,
-		    error_gen_opendir_cbk,
-		    FIRST_CHILD(this),
-		    FIRST_CHILD(this)->fops->opendir,
-		    loc, fd);
-	return 0;
-}
+    int op_errno = 0;
+    eg_t *egp = NULL;
+    int enable = 1;
 
+    egp = this->private;
+    enable = egp->enable[GF_FOP_OPENDIR];
 
-int32_t
-error_gen_getdents_cbk (call_frame_t *frame,
-			void *cookie,
-			xlator_t *this,
-			int32_t op_ret,
-			int32_t op_errno,
-			dir_entry_t *entries,
-			int32_t count)
-{
-	STACK_UNWIND (frame,
-		      op_ret,
-		      op_errno,
-		      entries,
-		      count);
-	return 0;
+    if (enable)
+        op_errno = error_gen(this, GF_FOP_OPENDIR);
+
+    if (op_errno) {
+        GF_ERROR(this, "unwind(-1, %s)", strerror(op_errno));
+        STACK_UNWIND_STRICT(opendir, frame, -1, op_errno, NULL, xdata);
+        return 0;
+    }
+
+    STACK_WIND_TAIL(frame, FIRST_CHILD(this), FIRST_CHILD(this)->fops->opendir,
+                    loc, fd, xdata);
+    return 0;
 }
 
-int32_t
-error_gen_getdents (call_frame_t *frame,
-		    xlator_t *this,
-		    fd_t *fd,
-		    size_t size,
-		    off_t offset,
-		    int32_t flag)
+int
+error_gen_fsyncdir(call_frame_t *frame, xlator_t *this, fd_t *fd, int32_t flags,
+                   dict_t *xdata)
 {
-	int op_errno = 0;
-	op_errno = error_gen(this);
-	if (op_errno) {
-		GF_ERROR(this, "unwind(-1, %s)", strerror (op_errno));
-		STACK_UNWIND (frame, -1, op_errno, NULL, 0);
-		return 0;
-	}
-
-	STACK_WIND (frame,
-		    error_gen_getdents_cbk,
-		    FIRST_CHILD(this),
-		    FIRST_CHILD(this)->fops->getdents,
-		    fd,
-		    size,
-		    offset,
-		    flag);
-	return 0;
-}
+    int op_errno = 0;
+    eg_t *egp = NULL;
+    int enable = 1;
 
+    egp = this->private;
+    enable = egp->enable[GF_FOP_FSYNCDIR];
 
-int32_t
-error_gen_setdents_cbk (call_frame_t *frame,
-			void *cookie,
-			xlator_t *this,
-			int32_t op_ret,
-			int32_t op_errno)
-{
-	STACK_UNWIND (frame,
-		      op_ret,
-		      op_errno);
-	return 0;
+    if (enable)
+        op_errno = error_gen(this, GF_FOP_FSYNCDIR);
+
+    if (op_errno) {
+        GF_ERROR(this, "unwind(-1, %s)", strerror(op_errno));
+        STACK_UNWIND_STRICT(fsyncdir, frame, -1, op_errno, xdata);
+        return 0;
+    }
+
+    STACK_WIND_TAIL(frame, FIRST_CHILD(this), FIRST_CHILD(this)->fops->fsyncdir,
+                    fd, flags, xdata);
+    return 0;
 }
 
-int32_t
-error_gen_setdents (call_frame_t *frame,
-		    xlator_t *this,
-		    fd_t *fd,
-		    int32_t flags,
-		    dir_entry_t *entries,
-		    int32_t count)
+int
+error_gen_statfs(call_frame_t *frame, xlator_t *this, loc_t *loc, dict_t *xdata)
 {
-	int op_errno = 0;
-	op_errno = error_gen(this);
-	if (op_errno) {
-		GF_ERROR(this, "unwind(-1, %s)", strerror (op_errno));
-		STACK_UNWIND (frame, -1, op_errno, NULL, 0);
-		return 0;
-	}
-
-	STACK_WIND (frame,
-		    error_gen_setdents_cbk,
-		    FIRST_CHILD(this),
-		    FIRST_CHILD(this)->fops->setdents,
-		    fd,
-		    flags,
-		    entries,
-		    count);
-	return 0;
-}
+    int op_errno = 0;
+    eg_t *egp = NULL;
+    int enable = 1;
 
+    egp = this->private;
+    enable = egp->enable[GF_FOP_STATFS];
 
-int32_t
-error_gen_fsyncdir_cbk (call_frame_t *frame,
-			void *cookie,
-			xlator_t *this,
-			int32_t op_ret,
-			int32_t op_errno)
-{
-	STACK_UNWIND (frame,
-		      op_ret,
-		      op_errno);
-	return 0;
+    if (enable)
+        op_errno = error_gen(this, GF_FOP_STATFS);
+
+    if (op_errno) {
+        GF_ERROR(this, "unwind(-1, %s)", strerror(op_errno));
+        STACK_UNWIND_STRICT(statfs, frame, -1, op_errno, NULL, xdata);
+        return 0;
+    }
+
+    STACK_WIND_TAIL(frame, FIRST_CHILD(this), FIRST_CHILD(this)->fops->statfs,
+                    loc, xdata);
+    return 0;
 }
 
-int32_t
-error_gen_fsyncdir (call_frame_t *frame,
-		    xlator_t *this,
-		    fd_t *fd,
-		    int32_t flags)
+int
+error_gen_setxattr(call_frame_t *frame, xlator_t *this, loc_t *loc,
+                   dict_t *dict, int32_t flags, dict_t *xdata)
 {
-	int op_errno = 0;
-	op_errno = error_gen(this);
-	if (op_errno) {
-		GF_ERROR(this, "unwind(-1, %s)", strerror (op_errno));
-		STACK_UNWIND (frame, -1, op_errno);
-		return 0;
-	}
-
-	STACK_WIND (frame,
-		    error_gen_fsyncdir_cbk,
-		    FIRST_CHILD(this),
-		    FIRST_CHILD(this)->fops->fsyncdir,
-		    fd,
-		    flags);
-	return 0;
-}
+    int op_errno = 0;
+    eg_t *egp = NULL;
+    int enable = 1;
 
+    egp = this->private;
+    enable = egp->enable[GF_FOP_SETXATTR];
 
-int32_t
-error_gen_statfs_cbk (call_frame_t *frame,
-		      void *cookie,
-		      xlator_t *this,
-		      int32_t op_ret,
-		      int32_t op_errno,
-		      struct statvfs *buf)
-{
-	STACK_UNWIND (frame,
-		      op_ret,
-		      op_errno,
-		      buf);
-	return 0;
+    if (enable)
+        op_errno = error_gen(this, GF_FOP_SETXATTR);
+
+    if (op_errno) {
+        GF_ERROR(this, "unwind(-1, %s)", strerror(op_errno));
+        STACK_UNWIND_STRICT(setxattr, frame, -1, op_errno, xdata);
+        return 0;
+    }
+
+    STACK_WIND_TAIL(frame, FIRST_CHILD(this), FIRST_CHILD(this)->fops->setxattr,
+                    loc, dict, flags, xdata);
+    return 0;
 }
 
-int32_t
-error_gen_statfs (call_frame_t *frame,
-		  xlator_t *this,
-		  loc_t *loc)
+int
+error_gen_getxattr(call_frame_t *frame, xlator_t *this, loc_t *loc,
+                   const char *name, dict_t *xdata)
 {
-	int op_errno = 0;
-	op_errno = error_gen(this);
-	if (op_errno) {
-		GF_ERROR(this, "unwind(-1, %s)", strerror (op_errno));
-		STACK_UNWIND (frame, -1, op_errno, NULL);
-		return 0;
-	}
-
-	STACK_WIND (frame,
-		    error_gen_statfs_cbk,
-		    FIRST_CHILD(this),
-		    FIRST_CHILD(this)->fops->statfs,
-		    loc);
-	return 0;
-}
+    int op_errno = 0;
+    eg_t *egp = NULL;
+    int enable = 1;
 
+    egp = this->private;
+    enable = egp->enable[GF_FOP_GETXATTR];
 
-int32_t
-error_gen_setxattr_cbk (call_frame_t *frame,
-			void *cookie,
-			xlator_t *this,
-			int32_t op_ret,
-			int32_t op_errno)
-{
-	STACK_UNWIND (frame,
-		      op_ret,
-		      op_errno);
-	return 0;
+    if (enable)
+        op_errno = error_gen(this, GF_FOP_GETXATTR);
+
+    if (op_errno) {
+        GF_ERROR(this, "unwind(-1, %s)", strerror(op_errno));
+        STACK_UNWIND_STRICT(getxattr, frame, -1, op_errno, NULL, xdata);
+        return 0;
+    }
+
+    STACK_WIND_TAIL(frame, FIRST_CHILD(this), FIRST_CHILD(this)->fops->getxattr,
+                    loc, name, xdata);
+    return 0;
 }
 
-int32_t
-error_gen_setxattr (call_frame_t *frame,
-		    xlator_t *this,
-		    loc_t *loc,
-		    dict_t *dict,
-		    int32_t flags)
+int
+error_gen_fsetxattr(call_frame_t *frame, xlator_t *this, fd_t *fd, dict_t *dict,
+                    int32_t flags, dict_t *xdata)
 {
-	int op_errno = 0;
-	op_errno = error_gen(this);
-	if (op_errno) {
-		GF_ERROR(this, "unwind(-1, %s)", strerror (op_errno));
-		STACK_UNWIND (frame, -1, op_errno);
-		return 0;
-	}
-
-	STACK_WIND (frame,
-		    error_gen_setxattr_cbk,
-		    FIRST_CHILD(this),
-		    FIRST_CHILD(this)->fops->setxattr,
-		    loc,
-		    dict,
-		    flags);
-	return 0;
+    int op_errno = 0;
+    eg_t *egp = NULL;
+    int enable = 1;
+
+    egp = this->private;
+    enable = egp->enable[GF_FOP_FSETXATTR];
+
+    if (enable)
+        op_errno = error_gen(this, GF_FOP_FSETXATTR);
+
+    if (op_errno) {
+        GF_ERROR(this, "unwind(-1, %s)", strerror(op_errno));
+        STACK_UNWIND_STRICT(fsetxattr, frame, -1, op_errno, xdata);
+        return 0;
+    }
+
+    STACK_WIND_TAIL(frame, FIRST_CHILD(this),
+                    FIRST_CHILD(this)->fops->fsetxattr, fd, dict, flags, xdata);
+    return 0;
 }
 
-int32_t
-error_gen_getxattr_cbk (call_frame_t *frame,
-			void *cookie,
-			xlator_t *this,
-			int32_t op_ret,
-			int32_t op_errno,
-			dict_t *dict)
+int
+error_gen_fgetxattr(call_frame_t *frame, xlator_t *this, fd_t *fd,
+                    const char *name, dict_t *xdata)
 {
-	STACK_UNWIND (frame,
-		      op_ret,
-		      op_errno,
-		      dict);
-	return 0;
+    int op_errno = 0;
+    eg_t *egp = NULL;
+    int enable = 1;
+
+    egp = this->private;
+    enable = egp->enable[GF_FOP_FGETXATTR];
+
+    if (enable)
+        op_errno = error_gen(this, GF_FOP_FGETXATTR);
+
+    if (op_errno) {
+        GF_ERROR(this, "unwind(-1, %s)", strerror(op_errno));
+        STACK_UNWIND_STRICT(fgetxattr, frame, -1, op_errno, NULL, xdata);
+        return 0;
+    }
+
+    STACK_WIND_TAIL(frame, FIRST_CHILD(this),
+                    FIRST_CHILD(this)->fops->fgetxattr, fd, name, xdata);
+    return 0;
 }
 
-int32_t
-error_gen_getxattr (call_frame_t *frame,
-		    xlator_t *this,
-		    loc_t *loc,
-		    const char *name)
+int
+error_gen_xattrop(call_frame_t *frame, xlator_t *this, loc_t *loc,
+                  gf_xattrop_flags_t flags, dict_t *dict, dict_t *xdata)
 {
-	int op_errno = 0;
-	op_errno = error_gen(this);
-	if (op_errno) {
-		GF_ERROR(this, "unwind(-1, %s)", strerror (op_errno));
-		STACK_UNWIND (frame, -1, op_errno, NULL);
-		return 0;
-	}
-
-	STACK_WIND (frame,
-		    error_gen_getxattr_cbk,
-		    FIRST_CHILD(this),
-		    FIRST_CHILD(this)->fops->getxattr,
-		    loc,
-		    name);
-	return 0;
+    int op_errno = 0;
+    eg_t *egp = NULL;
+    int enable = 1;
+
+    egp = this->private;
+    enable = egp->enable[GF_FOP_XATTROP];
+
+    if (enable)
+        op_errno = error_gen(this, GF_FOP_XATTROP);
+
+    if (op_errno) {
+        GF_ERROR(this, "unwind(-1, %s)", strerror(op_errno));
+        STACK_UNWIND_STRICT(xattrop, frame, -1, op_errno, NULL, xdata);
+        return 0;
+    }
+
+    STACK_WIND_TAIL(frame, FIRST_CHILD(this), FIRST_CHILD(this)->fops->xattrop,
+                    loc, flags, dict, xdata);
+    return 0;
 }
 
-int32_t
-error_gen_xattrop_cbk (call_frame_t *frame,
-		       void *cookie,
-		       xlator_t *this,
-		       int32_t op_ret,
-		       int32_t op_errno,
-		       dict_t *dict)
+int
+error_gen_fxattrop(call_frame_t *frame, xlator_t *this, fd_t *fd,
+                   gf_xattrop_flags_t flags, dict_t *dict, dict_t *xdata)
 {
-	STACK_UNWIND (frame, op_ret, op_errno, dict);
-	return 0;
+    int op_errno = 0;
+    eg_t *egp = NULL;
+    int enable = 1;
+
+    egp = this->private;
+    enable = egp->enable[GF_FOP_FXATTROP];
+
+    if (enable)
+        op_errno = error_gen(this, GF_FOP_FXATTROP);
+
+    if (op_errno) {
+        GF_ERROR(this, "unwind(-1, %s)", strerror(op_errno));
+        STACK_UNWIND_STRICT(fxattrop, frame, -1, op_errno, NULL, xdata);
+        return 0;
+    }
+
+    STACK_WIND_TAIL(frame, FIRST_CHILD(this), FIRST_CHILD(this)->fops->fxattrop,
+                    fd, flags, dict, xdata);
+    return 0;
 }
 
-int32_t
-error_gen_xattrop (call_frame_t *frame,
-		   xlator_t *this,
-		   loc_t *loc,
-		   gf_xattrop_flags_t flags,
-		   dict_t *dict)
+int
+error_gen_removexattr(call_frame_t *frame, xlator_t *this, loc_t *loc,
+                      const char *name, dict_t *xdata)
 {
-	int op_errno = 0;
-	op_errno = error_gen(this);
-	if (op_errno) {
-		GF_ERROR(this, "unwind(-1, %s)", strerror (op_errno));
-		STACK_UNWIND (frame, -1, op_errno, NULL);
-		return 0;
-	}
-
-	STACK_WIND (frame,
-		    error_gen_xattrop_cbk,
-		    FIRST_CHILD(this),
-		    FIRST_CHILD(this)->fops->xattrop,
-		    loc, flags, dict);
-	return 0;
+    int op_errno = 0;
+    eg_t *egp = NULL;
+    int enable = 1;
+
+    egp = this->private;
+    enable = egp->enable[GF_FOP_REMOVEXATTR];
+
+    if (enable)
+        op_errno = error_gen(this, GF_FOP_REMOVEXATTR);
+
+    if (op_errno) {
+        GF_ERROR(this, "unwind(-1, %s)", strerror(op_errno));
+        STACK_UNWIND_STRICT(removexattr, frame, -1, op_errno, xdata);
+        return 0;
+    }
+
+    STACK_WIND_TAIL(frame, FIRST_CHILD(this),
+                    FIRST_CHILD(this)->fops->removexattr, loc, name, xdata);
+    return 0;
 }
 
-int32_t
-error_gen_fxattrop_cbk (call_frame_t *frame,
-			void *cookie,
-			xlator_t *this,
-			int32_t op_ret,
-			int32_t op_errno,
-			dict_t *dict)
+int
+error_gen_fremovexattr(call_frame_t *frame, xlator_t *this, fd_t *fd,
+                       const char *name, dict_t *xdata)
 {
-	STACK_UNWIND (frame, op_ret, op_errno, dict);
-	return 0;
+    int op_errno = 0;
+    eg_t *egp = NULL;
+    int enable = 1;
+
+    egp = this->private;
+    enable = egp->enable[GF_FOP_FREMOVEXATTR];
+
+    if (enable)
+        op_errno = error_gen(this, GF_FOP_FREMOVEXATTR);
+
+    if (op_errno) {
+        GF_ERROR(this, "unwind(-1, %s)", strerror(op_errno));
+        STACK_UNWIND_STRICT(fremovexattr, frame, -1, op_errno, xdata);
+        return 0;
+    }
+
+    STACK_WIND_TAIL(frame, FIRST_CHILD(this),
+                    FIRST_CHILD(this)->fops->fremovexattr, fd, name, xdata);
+    return 0;
 }
 
-int32_t
-error_gen_fxattrop (call_frame_t *frame,
-  		    xlator_t *this,
-  		    fd_t *fd,
-  		    gf_xattrop_flags_t flags,
-  		    dict_t *dict)
+int
+error_gen_lk(call_frame_t *frame, xlator_t *this, fd_t *fd, int32_t cmd,
+             struct gf_flock *lock, dict_t *xdata)
 {
-  	int op_errno = 0;
-  	op_errno = error_gen(this);
-  	if (op_errno) {
-  		GF_ERROR(this, "unwind(-1, %s)", strerror (op_errno));
-  		STACK_UNWIND (frame, -1, op_errno, NULL);
-  		return 0;
-  	}
-
-  	STACK_WIND (frame,
-  		    error_gen_fxattrop_cbk,
-  		    FIRST_CHILD(this),
-  		    FIRST_CHILD(this)->fops->fxattrop,
-  		    fd, flags, dict);
-  	return 0;
+    int op_errno = 0;
+    eg_t *egp = NULL;
+    int enable = 1;
+
+    egp = this->private;
+    enable = egp->enable[GF_FOP_LK];
+
+    if (enable)
+        op_errno = error_gen(this, GF_FOP_LK);
+
+    if (op_errno) {
+        GF_ERROR(this, "unwind(-1, %s)", strerror(op_errno));
+        STACK_UNWIND_STRICT(lk, frame, -1, op_errno, NULL, xdata);
+        return 0;
+    }
+
+    STACK_WIND_TAIL(frame, FIRST_CHILD(this), FIRST_CHILD(this)->fops->lk, fd,
+                    cmd, lock, xdata);
+    return 0;
 }
 
-int32_t
-error_gen_removexattr_cbk (call_frame_t *frame,
-			   void *cookie,
-			   xlator_t *this,
-			   int32_t op_ret,
-			   int32_t op_errno)
+int
+error_gen_inodelk(call_frame_t *frame, xlator_t *this, const char *volume,
+                  loc_t *loc, int32_t cmd, struct gf_flock *lock, dict_t *xdata)
 {
-	STACK_UNWIND (frame,
-		      op_ret,
-		      op_errno);
-	return 0;
+    int op_errno = 0;
+    eg_t *egp = NULL;
+    int enable = 1;
+
+    egp = this->private;
+    enable = egp->enable[GF_FOP_INODELK];
+
+    if (enable)
+        op_errno = error_gen(this, GF_FOP_INODELK);
+
+    if (op_errno) {
+        GF_ERROR(this, "unwind(-1, %s)", strerror(op_errno));
+        STACK_UNWIND_STRICT(inodelk, frame, -1, op_errno, xdata);
+        return 0;
+    }
+
+    STACK_WIND_TAIL(frame, FIRST_CHILD(this), FIRST_CHILD(this)->fops->inodelk,
+                    volume, loc, cmd, lock, xdata);
+    return 0;
 }
 
-int32_t
-error_gen_removexattr (call_frame_t *frame,
-		       xlator_t *this,
-		       loc_t *loc,
-		       const char *name)
+int
+error_gen_finodelk(call_frame_t *frame, xlator_t *this, const char *volume,
+                   fd_t *fd, int32_t cmd, struct gf_flock *lock, dict_t *xdata)
 {
-	int op_errno = 0;
-	op_errno = error_gen(this);
-	if (op_errno) {
-		GF_ERROR(this, "unwind(-1, %s)", strerror (op_errno));
-		STACK_UNWIND (frame, -1, op_errno, NULL);
-		return 0;
-	}
-
-	STACK_WIND (frame,
-		    error_gen_removexattr_cbk,
-		    FIRST_CHILD(this),
-		    FIRST_CHILD(this)->fops->removexattr,
-		    loc,
-		    name);
-	return 0;
+    int op_errno = 0;
+    eg_t *egp = NULL;
+    int enable = 1;
+
+    egp = this->private;
+    enable = egp->enable[GF_FOP_FINODELK];
+
+    if (enable)
+        op_errno = error_gen(this, GF_FOP_FINODELK);
+
+    if (op_errno) {
+        GF_ERROR(this, "unwind(-1, %s)", strerror(op_errno));
+        STACK_UNWIND_STRICT(finodelk, frame, -1, op_errno, xdata);
+        return 0;
+    }
+
+    STACK_WIND_TAIL(frame, FIRST_CHILD(this), FIRST_CHILD(this)->fops->finodelk,
+                    volume, fd, cmd, lock, xdata);
+    return 0;
 }
 
-int32_t
-error_gen_lk_cbk (call_frame_t *frame,
-		  void *cookie,
-		  xlator_t *this,
-		  int32_t op_ret,
-		  int32_t op_errno,
-		  struct flock *lock)
+int
+error_gen_entrylk(call_frame_t *frame, xlator_t *this, const char *volume,
+                  loc_t *loc, const char *basename, entrylk_cmd cmd,
+                  entrylk_type type, dict_t *xdata)
 {
-	STACK_UNWIND (frame,
-		      op_ret,
-		      op_errno,
-		      lock);
-	return 0;
+    int op_errno = 0;
+    eg_t *egp = NULL;
+    int enable = 1;
+
+    egp = this->private;
+    enable = egp->enable[GF_FOP_ENTRYLK];
+
+    if (enable)
+        op_errno = error_gen(this, GF_FOP_ENTRYLK);
+
+    if (op_errno) {
+        GF_ERROR(this, "unwind(-1, %s)", strerror(op_errno));
+        STACK_UNWIND_STRICT(entrylk, frame, -1, op_errno, xdata);
+        return 0;
+    }
+
+    STACK_WIND_TAIL(frame, FIRST_CHILD(this), FIRST_CHILD(this)->fops->entrylk,
+                    volume, loc, basename, cmd, type, xdata);
+    return 0;
 }
 
-int32_t
-error_gen_lk (call_frame_t *frame,
-	      xlator_t *this,
-	      fd_t *fd,
-	      int32_t cmd,
-	      struct flock *lock)
+int
+error_gen_fentrylk(call_frame_t *frame, xlator_t *this, const char *volume,
+                   fd_t *fd, const char *basename, entrylk_cmd cmd,
+                   entrylk_type type, dict_t *xdata)
 {
-	int op_errno = 0;
-	op_errno = error_gen(this);
-	if (op_errno) {
-		GF_ERROR(this, "unwind(-1, %s)", strerror (op_errno));
-		STACK_UNWIND (frame, -1, op_errno, NULL);
-		return 0;
-	}
-
-	STACK_WIND (frame,
-		    error_gen_lk_cbk,
-		    FIRST_CHILD(this),
-		    FIRST_CHILD(this)->fops->lk,
-		    fd,
-		    cmd,
-		    lock);
-	return 0;
-}
+    int op_errno = 0;
+    eg_t *egp = NULL;
+    int enable = 1;
 
+    egp = this->private;
+    enable = egp->enable[GF_FOP_FENTRYLK];
 
-int32_t
-error_gen_inodelk_cbk (call_frame_t *frame, void *cookie,
-		       xlator_t *this, int32_t op_ret, int32_t op_errno)
+    if (enable)
+        op_errno = error_gen(this, GF_FOP_FENTRYLK);
 
-{
-	STACK_UNWIND (frame, op_ret, op_errno);
-	return 0;
-}
+    if (op_errno) {
+        GF_ERROR(this, "unwind(-1, %s)", strerror(op_errno));
+        STACK_UNWIND_STRICT(fentrylk, frame, -1, op_errno, xdata);
+        return 0;
+    }
 
+    STACK_WIND_TAIL(frame, FIRST_CHILD(this), FIRST_CHILD(this)->fops->fentrylk,
+                    volume, fd, basename, cmd, type, xdata);
+    return 0;
+}
 
-int32_t
-error_gen_inodelk (call_frame_t *frame, xlator_t *this,
-		   loc_t *loc, int32_t cmd, struct flock *lock)
+int
+error_gen_getspec(call_frame_t *frame, xlator_t *this, const char *key,
+                  int32_t flags)
 {
-	int op_errno = 0;
-	op_errno = error_gen(this);
-	if (op_errno) {
-		GF_ERROR(this, "unwind(-1, %s)", strerror (op_errno));
-		STACK_UNWIND (frame, -1, op_errno);
-		return 0;
-	}
-
-	STACK_WIND (frame,
-		    error_gen_inodelk_cbk,
-		    FIRST_CHILD(this),
-		    FIRST_CHILD(this)->fops->inodelk,
-		    loc, cmd, lock);
-	return 0;
-}
+    int op_errno = 0;
+    eg_t *egp = NULL;
+    int enable = 1;
 
+    egp = this->private;
+    enable = egp->enable[GF_FOP_GETSPEC];
 
-int32_t
-error_gen_finodelk_cbk (call_frame_t *frame, void *cookie,
-			xlator_t *this, int32_t op_ret, int32_t op_errno)
+    if (enable)
+        op_errno = error_gen(this, GF_FOP_GETSPEC);
 
-{
-	STACK_UNWIND (frame, op_ret, op_errno);
-	return 0;
-}
+    if (op_errno) {
+        GF_ERROR(this, "unwind(-1, %s)", strerror(op_errno));
+        STACK_UNWIND_STRICT(getspec, frame, -1, op_errno, NULL);
+        return 0;
+    }
 
+    STACK_WIND_TAIL(frame, FIRST_CHILD(this), FIRST_CHILD(this)->fops->getspec,
+                    key, flags);
+    return 0;
+}
 
-int32_t
-error_gen_finodelk (call_frame_t *frame, xlator_t *this,
-		    fd_t *fd, int32_t cmd, struct flock *lock)
+int
+error_gen_readdir(call_frame_t *frame, xlator_t *this, fd_t *fd, size_t size,
+                  off_t off, dict_t *xdata)
 {
-	int op_errno = 0;
-	op_errno = error_gen(this);
-	if (op_errno) {
-		GF_ERROR(this, "unwind(-1, %s)", strerror (op_errno));
-		STACK_UNWIND (frame, -1, op_errno);
-		return 0;
-	}
-
-	STACK_WIND (frame,
-		    error_gen_finodelk_cbk,
-		    FIRST_CHILD(this),
-		    FIRST_CHILD(this)->fops->finodelk,
-		    fd, cmd, lock);
-	return 0;
-}
+    int op_errno = 0;
+    eg_t *egp = NULL;
+    int enable = 1;
 
+    egp = this->private;
+    enable = egp->enable[GF_FOP_READDIR];
 
-int32_t
-error_gen_entrylk_cbk (call_frame_t *frame, void *cookie,
-		       xlator_t *this, int32_t op_ret, int32_t op_errno)
+    if (enable)
+        op_errno = error_gen(this, GF_FOP_READDIR);
 
-{
-	STACK_UNWIND (frame, op_ret, op_errno);
-	return 0;
+    if (op_errno) {
+        GF_ERROR(this, "unwind(-1, %s)", strerror(op_errno));
+        STACK_UNWIND_STRICT(readdir, frame, -1, op_errno, NULL, xdata);
+        return 0;
+    }
+
+    STACK_WIND_TAIL(frame, FIRST_CHILD(this), FIRST_CHILD(this)->fops->readdir,
+                    fd, size, off, xdata);
+    return 0;
 }
 
-int32_t
-error_gen_entrylk (call_frame_t *frame, xlator_t *this,
-		   loc_t *loc, const char *basename,
-		   entrylk_cmd cmd, entrylk_type type)
+int
+error_gen_readdirp(call_frame_t *frame, xlator_t *this, fd_t *fd, size_t size,
+                   off_t off, dict_t *dict)
 {
-	int op_errno = 0;
-	op_errno = error_gen(this);
-	if (op_errno) {
-		GF_ERROR(this, "unwind(-1, %s)", strerror (op_errno));
-		STACK_UNWIND (frame, -1, op_errno);
-		return 0;
-	}
-
-	STACK_WIND (frame, error_gen_entrylk_cbk,
-		    FIRST_CHILD(this),
-		    FIRST_CHILD(this)->fops->entrylk,
-		    loc, basename, cmd, type);
-	return 0;
-}
+    int op_errno = 0;
+    eg_t *egp = NULL;
+    int enable = 1;
 
-int32_t
-error_gen_fentrylk_cbk (call_frame_t *frame, void *cookie,
-			xlator_t *this, int32_t op_ret, int32_t op_errno)
+    egp = this->private;
+    enable = egp->enable[GF_FOP_READDIRP];
 
-{
-	STACK_UNWIND (frame, op_ret, op_errno);
-	return 0;
+    if (enable)
+        op_errno = error_gen(this, GF_FOP_READDIRP);
+
+    if (op_errno) {
+        GF_ERROR(this, "unwind(-1, %s)", strerror(op_errno));
+        STACK_UNWIND_STRICT(readdirp, frame, -1, op_errno, NULL, NULL);
+        return 0;
+    }
+
+    STACK_WIND_TAIL(frame, FIRST_CHILD(this), FIRST_CHILD(this)->fops->readdirp,
+                    fd, size, off, dict);
+    return 0;
 }
 
-int32_t
-error_gen_fentrylk (call_frame_t *frame, xlator_t *this,
-		    fd_t *fd, const char *basename,
-		    entrylk_cmd cmd, entrylk_type type)
+static void
+error_gen_set_failure(eg_t *pvt, double percent)
 {
-	int op_errno = 0;
-	op_errno = error_gen(this);
-	if (op_errno) {
-		GF_ERROR(this, "unwind(-1, %s)", strerror (op_errno));
-		STACK_UNWIND (frame, -1, op_errno);
-		return 0;
-	}
-
-	STACK_WIND (frame, error_gen_fentrylk_cbk,
-		    FIRST_CHILD(this),
-		    FIRST_CHILD(this)->fops->fentrylk,
-		    fd, basename, cmd, type);
-	return 0;
-}
+    double ppm;
 
+    GF_ASSERT(pvt);
 
-/* Management operations */
+    ppm = (percent / 100.0) * (double)FAILURE_GRANULARITY;
+    pvt->failure_iter_no = (int)ppm;
+}
 
-int32_t
-error_gen_stats_cbk (call_frame_t *frame,
-		     void *cookie,
-		     xlator_t *this,
-		     int32_t op_ret,
-		     int32_t op_errno,
-		     struct xlator_stats *stats)
+static void
+error_gen_parse_fill_fops(eg_t *pvt, char *enable_fops)
 {
-	STACK_UNWIND (frame,
-		      op_ret,
-		      op_errno,
-		      stats);
-	return 0;
-}
+    char *op_no_str = NULL;
+    int op_no = -1;
+    int i = 0;
+    xlator_t *this = THIS;
+    char *saveptr = NULL;
 
+    GF_ASSERT(pvt);
+    GF_ASSERT(this);
 
-int32_t
-error_gen_stats (call_frame_t *frame,
-		 xlator_t *this,
-		 int32_t flags)
-{
-	int op_errno = 0;
-	op_errno = error_gen(this);
-	if (op_errno) {
-		GF_ERROR(this, "unwind(-1, %s)", strerror (op_errno));
-		STACK_UNWIND (frame, -1, op_errno, NULL);
-		return 0;
-	}
-
-	STACK_WIND (frame,
-		    error_gen_stats_cbk,
-		    FIRST_CHILD(this),
-		    FIRST_CHILD(this)->mops->stats,
-		    flags);
-	return 0;
-}
+    for (i = 0; i < GF_FOP_MAXVALUE; i++)
+        pvt->enable[i] = 0;
 
+    if (!enable_fops) {
+        gf_log(this->name, GF_LOG_WARNING, "All fops are enabled.");
+        for (i = 0; i < GF_FOP_MAXVALUE; i++)
+            pvt->enable[i] = 1;
+    } else {
+        op_no_str = strtok_r(enable_fops, ",", &saveptr);
+        while (op_no_str) {
+            op_no = gf_fop_int(op_no_str);
+            if (op_no == -1) {
+                gf_log(this->name, GF_LOG_WARNING, "Wrong option value %s",
+                       op_no_str);
+            } else
+                pvt->enable[op_no] = 1;
 
+            op_no_str = strtok_r(NULL, ",", &saveptr);
+        }
+    }
+}
 
 int32_t
-error_gen_getspec_cbk (call_frame_t *frame,
-		       void *cookie,
-		       xlator_t *this,
-		       int32_t op_ret,
-		       int32_t op_errno,
-		       char *spec_data)
+error_gen_priv_dump(xlator_t *this)
 {
-	STACK_UNWIND (frame,
-		      op_ret,
-		      op_errno,
-		      spec_data);
-	return 0;
-}
+    char key_prefix[GF_DUMP_MAX_BUF_LEN];
+    int ret = -1;
+    eg_t *conf = NULL;
 
+    if (!this)
+        goto out;
 
-int32_t
-error_gen_getspec (call_frame_t *frame,
-		   xlator_t *this,
-		   const char *key,
-		   int32_t flags)
-{
-	int op_errno = 0;
-	op_errno = error_gen(this);
-	if (op_errno) {
-		GF_ERROR(this, "unwind(-1, %s)", strerror (op_errno));
-		STACK_UNWIND (frame, -1, op_errno, NULL);
-		return 0;
-	}
-
-	STACK_WIND (frame,
-		    error_gen_getspec_cbk,
-		    FIRST_CHILD(this),
-		    FIRST_CHILD(this)->mops->getspec,
-		    key, flags);
-	return 0;
-}
+    conf = this->private;
+    if (!conf)
+        goto out;
 
+    ret = TRY_LOCK(&conf->lock);
+    if (ret != 0) {
+        return ret;
+    }
 
-int32_t
-error_gen_checksum_cbk (call_frame_t *frame,
-			void *cookie,
-			xlator_t *this,
-			int32_t op_ret,
-			int32_t op_errno,
-			uint8_t *file_checksum,
-			uint8_t *dir_checksum)
-{
-	STACK_UNWIND (frame,
-		      op_ret,
-		      op_errno,
-		      file_checksum,
-		      dir_checksum);
-	return 0;
-}
+    gf_proc_dump_add_section("xlator.debug.error-gen.%s.priv", this->name);
+    gf_proc_dump_build_key(key_prefix, "xlator.debug.error-gen", "%s.priv",
+                           this->name);
 
+    gf_proc_dump_write("op_count", "%d", conf->op_count);
+    gf_proc_dump_write("failure_iter_no", "%d", conf->failure_iter_no);
+    gf_proc_dump_write("error_no", "%d", conf->error_no_int);
+    gf_proc_dump_write("random_failure", "%d", conf->random_failure);
 
-int32_t
-error_gen_checksum (call_frame_t *frame,
-		    xlator_t *this,
-		    loc_t *loc,
-		    int32_t flag)
-{
-	int op_errno = 0;
-	op_errno = error_gen(this);
-	if (op_errno) {
-		GF_ERROR(this, "unwind(-1, %s)", strerror (op_errno));
-		STACK_UNWIND (frame, -1, op_errno, NULL, NULL);
-		return 0;
-	}
-
-	STACK_WIND (frame,
-		    error_gen_checksum_cbk,
-		    FIRST_CHILD(this),
-		    FIRST_CHILD(this)->fops->checksum,
-		    loc,
-		    flag);
-	return 0;
+    UNLOCK(&conf->lock);
+out:
+    return ret;
 }
 
 int32_t
-error_gen_readdir_cbk (call_frame_t *frame,
-		       void *cookie,
-		       xlator_t *this,
-		       int32_t op_ret,
-		       int32_t op_errno,
-		       gf_dirent_t *entries)
+mem_acct_init(xlator_t *this)
 {
-	STACK_UNWIND (frame, op_ret, op_errno, entries);
-	return 0;
-}
+    int ret = -1;
 
+    if (!this)
+        return ret;
 
-int32_t
-error_gen_readdir (call_frame_t *frame,
-		   xlator_t *this,
-		   fd_t *fd,
-		   size_t size,
-		   off_t off)
-{
-	int op_errno = 0;
-	op_errno = error_gen(this);
-	if (op_errno) {
-		GF_ERROR(this, "unwind(-1, %s)", strerror (op_errno));
-		STACK_UNWIND (frame, -1, op_errno, NULL);
-		return 0;
-	}
-
-	STACK_WIND (frame,
-		    error_gen_readdir_cbk,
-		    FIRST_CHILD(this),
-		    FIRST_CHILD(this)->fops->readdir,
-		    fd, size, off);
-	return 0;
-}
+    ret = xlator_mem_acct_init(this, gf_error_gen_mt_end + 1);
 
-int32_t
-error_gen_closedir (xlator_t *this,
-		    fd_t *fd)
-{
-	return 0;
+    if (ret != 0) {
+        gf_log(this->name, GF_LOG_ERROR,
+               "Memory accounting init"
+               " failed");
+        return ret;
+    }
+
+    return ret;
 }
 
-int32_t
-error_gen_close (xlator_t *this,
-		 fd_t *fd)
+int
+reconfigure(xlator_t *this, dict_t *options)
 {
-	return 0;
+    eg_t *pvt = NULL;
+    int32_t ret = 0;
+    char *error_enable_fops = NULL;
+    char *error_no = NULL;
+    double failure_percent_dbl = 0.0;
+
+    if (!this || !this->private)
+        goto out;
+
+    pvt = this->private;
+
+    ret = -1;
+
+    GF_OPTION_RECONF("error-no", error_no, options, str, out);
+
+    if (error_no)
+        pvt->error_no_int = conv_errno_to_int(&error_no);
+
+    GF_OPTION_RECONF("failure", failure_percent_dbl, options, percent, out);
+
+    GF_OPTION_RECONF("enable", error_enable_fops, options, str, out);
+
+    GF_OPTION_RECONF("random-failure", pvt->random_failure, options, bool, out);
+
+    error_gen_parse_fill_fops(pvt, error_enable_fops);
+    error_gen_set_failure(pvt, failure_percent_dbl);
+
+    ret = 0;
+out:
+    gf_log(this ? this->name : "error-gen", GF_LOG_DEBUG,
+           "reconfigure returning %d", ret);
+    return ret;
 }
 
 int
-init (xlator_t *this)
+init(xlator_t *this)
 {
-	eg_t *pvt = NULL;
-
-	if (!this->children || this->children->next) {
-		gf_log (this->name, GF_LOG_ERROR, 
-			"error-gen not configured with one subvolume");
-		return -1;
-	}
-
-	if (!this->parents) {
-		gf_log (this->name, GF_LOG_WARNING,
-			"dangling volume. check volfile ");
-	}
-  
-	pvt = CALLOC (1, sizeof (eg_t));
-	this->private = pvt;
-	return 0;
+    eg_t *pvt = NULL;
+    int32_t ret = 0;
+    char *error_enable_fops = NULL;
+    char *error_no = NULL;
+    double failure_percent_dbl = 0.0;
+
+    if (!this->children || this->children->next) {
+        gf_log(this->name, GF_LOG_ERROR,
+               "error-gen not configured with one subvolume");
+        ret = -1;
+        goto out;
+    }
+
+    if (!this->parents) {
+        gf_log(this->name, GF_LOG_WARNING, "dangling volume. check volfile ");
+    }
+
+    pvt = GF_CALLOC(1, sizeof(eg_t), gf_error_gen_mt_eg_t);
+
+    if (!pvt) {
+        ret = -1;
+        goto out;
+    }
+
+    LOCK_INIT(&pvt->lock);
+
+    ret = -1;
+
+    GF_OPTION_INIT("error-no", error_no, str, out);
+
+    if (error_no)
+        pvt->error_no_int = conv_errno_to_int(&error_no);
+
+    GF_OPTION_INIT("failure", failure_percent_dbl, percent, out);
+
+    GF_OPTION_INIT("enable", error_enable_fops, str, out);
+
+    GF_OPTION_INIT("random-failure", pvt->random_failure, bool, out);
+
+    error_gen_parse_fill_fops(pvt, error_enable_fops);
+    error_gen_set_failure(pvt, failure_percent_dbl);
+
+    this->private = pvt;
+
+    /* Give some seed value here. */
+    srand(gf_time());
+
+    ret = 0;
+out:
+    if (ret)
+        GF_FREE(pvt);
+    return ret;
 }
 
 void
-fini (xlator_t *this)
+fini(xlator_t *this)
 {
-	gf_log (this->name, GF_LOG_DEBUG, "fini called");
-	return;
+    eg_t *pvt = NULL;
+
+    if (!this)
+        return;
+    pvt = this->private;
+
+    if (pvt) {
+        LOCK_DESTROY(&pvt->lock);
+        GF_FREE(pvt);
+        gf_log(this->name, GF_LOG_DEBUG, "fini called");
+    }
+    return;
 }
 
+struct xlator_dumpops dumpops = {
+    .priv = error_gen_priv_dump,
+};
+
+struct xlator_cbks cbks;
 
 struct xlator_fops fops = {
-	.lookup      = error_gen_lookup,
-	.stat        = error_gen_stat,
-	.readlink    = error_gen_readlink,
-	.mknod       = error_gen_mknod,
-	.mkdir       = error_gen_mkdir,
-	.unlink      = error_gen_unlink,
-	.rmdir       = error_gen_rmdir,
-	.symlink     = error_gen_symlink,
-	.rename      = error_gen_rename,
-	.link        = error_gen_link,
-	.chmod       = error_gen_chmod,
-	.chown       = error_gen_chown,
-	.truncate    = error_gen_truncate,
-	.utimens     = error_gen_utimens,
-	.create      = error_gen_create,
-	.open        = error_gen_open,
-	.readv       = error_gen_readv,
-	.writev      = error_gen_writev,
-	.statfs      = error_gen_statfs,
-	.flush       = error_gen_flush,
-	.fsync       = error_gen_fsync,
-	.setxattr    = error_gen_setxattr,
-	.getxattr    = error_gen_getxattr,
-	.removexattr = error_gen_removexattr,
-	.opendir     = error_gen_opendir,
-	.readdir     = error_gen_readdir,
-	.getdents    = error_gen_getdents,
-	.fsyncdir    = error_gen_fsyncdir,
-	.access      = error_gen_access,
-	.ftruncate   = error_gen_ftruncate,
-	.fstat       = error_gen_fstat,
-	.lk          = error_gen_lk,
-	.fchmod      = error_gen_fchmod,
-	.fchown      = error_gen_fchown,
-	.setdents    = error_gen_setdents,
-	.lookup_cbk  = error_gen_lookup_cbk,
-	.checksum    = error_gen_checksum,
-	.xattrop     = error_gen_xattrop,
-	.fxattrop    = error_gen_fxattrop,
-	.inodelk     = error_gen_inodelk,
-	.finodelk    = error_gen_finodelk,
-	.entrylk     = error_gen_entrylk,
-	.fentrylk    = error_gen_fentrylk
+    .lookup = error_gen_lookup,
+    .stat = error_gen_stat,
+    .readlink = error_gen_readlink,
+    .mknod = error_gen_mknod,
+    .mkdir = error_gen_mkdir,
+    .unlink = error_gen_unlink,
+    .rmdir = error_gen_rmdir,
+    .symlink = error_gen_symlink,
+    .rename = error_gen_rename,
+    .link = error_gen_link,
+    .truncate = error_gen_truncate,
+    .create = error_gen_create,
+    .open = error_gen_open,
+    .readv = error_gen_readv,
+    .writev = error_gen_writev,
+    .statfs = error_gen_statfs,
+    .flush = error_gen_flush,
+    .fsync = error_gen_fsync,
+    .setxattr = error_gen_setxattr,
+    .getxattr = error_gen_getxattr,
+    .removexattr = error_gen_removexattr,
+    .fsetxattr = error_gen_fsetxattr,
+    .fgetxattr = error_gen_fgetxattr,
+    .fremovexattr = error_gen_fremovexattr,
+    .opendir = error_gen_opendir,
+    .readdir = error_gen_readdir,
+    .readdirp = error_gen_readdirp,
+    .fsyncdir = error_gen_fsyncdir,
+    .access = error_gen_access,
+    .ftruncate = error_gen_ftruncate,
+    .fstat = error_gen_fstat,
+    .lk = error_gen_lk,
+    .xattrop = error_gen_xattrop,
+    .fxattrop = error_gen_fxattrop,
+    .inodelk = error_gen_inodelk,
+    .finodelk = error_gen_finodelk,
+    .entrylk = error_gen_entrylk,
+    .fentrylk = error_gen_fentrylk,
+    .setattr = error_gen_setattr,
+    .fsetattr = error_gen_fsetattr,
+    .getspec = error_gen_getspec,
 };
 
-struct xlator_mops mops = {
-	.stats = error_gen_stats,
-	.getspec = error_gen_getspec,
+/* Options accepted by error-gen; read by init() and reconfigure(). */
+struct volume_options options[] = {
+    {
+        .key = {"failure"},
+        .type = GF_OPTION_TYPE_PERCENT,
+        .description = "Percentage failure of operations when enabled.",
+    },
+
+    {
+        .key = {"error-no"},
+        .value = {"ENOENT",
+                  "ENOTDIR",
+                  "ENAMETOOLONG",
+                  "EACCES",
+                  "EBADF",
+                  "EFAULT",
+                  "ENOMEM",
+                  "EINVAL",
+                  "EIO",
+                  "EEXIST",
+                  "ENOSPC",
+                  "EPERM",
+                  "EROFS",
+                  "EBUSY",
+                  "EISDIR",
+                  "ENOTEMPTY",
+                  "EMLINK", /* comma required: else it fuses with "ENODEV" */
+                  "ENODEV",
+                  "EXDEV",
+                  "EMFILE",
+                  "ENFILE",
+                  "ENOSYS",
+                  "EINTR",
+                  "EFBIG",
+                  "EAGAIN",
+                  "GF_ERROR_SHORT_WRITE"},
+        .type = GF_OPTION_TYPE_STR,
+        .op_version = {3},
+        .tags = {"error-gen"},
+        .flags = OPT_FLAG_SETTABLE,
+    },
+
+    {
+        .key = {"random-failure"},
+        .type = GF_OPTION_TYPE_BOOL,
+        .default_value = "off",
+        .op_version = {3},
+        .tags = {"error-gen"},
+        .flags = OPT_FLAG_SETTABLE,
+    },
+
+    {
+        .key = {"enable", "error-fops"},
+        .type = GF_OPTION_TYPE_STR,
+        .description = "Accepts a string which takes ',' separated fop "
+                       "strings to denote which fops are enabled for error",
+        .op_version = {3},
+        .tags = {"error-gen"},
+        .flags = OPT_FLAG_SETTABLE,
+    },
+
+    {.key = {NULL}},
 };
 
-struct xlator_cbks cbks = {
-	.release = error_gen_close,
-	.releasedir = error_gen_closedir,
+xlator_api_t xlator_api = {
+    .init = init,
+    .fini = fini,
+    .reconfigure = reconfigure,
+    .mem_acct_init = mem_acct_init,
+    .op_version = {1},
+    .dumpops = &dumpops,
+    .fops = &fops,
+    .cbks = &cbks,
+    .options = options,
+    .identifier = "error-gen",
+    .category = GF_TECH_PREVIEW,
 };
diff --git a/xlators/debug/error-gen/src/error-gen.h b/xlators/debug/error-gen/src/error-gen.h
new file mode 100644
index 00000000000..2478cd5b21c
--- /dev/null
+++ b/xlators/debug/error-gen/src/error-gen.h
@@ -0,0 +1,49 @@
+/*
+   Copyright (c) 2008-2012 Red Hat, Inc. <http://www.redhat.com>
+   This file is part of GlusterFS.
+
+   This file is licensed to you under your choice of the GNU Lesser
+   General Public License, version 3 or any later version (LGPLv3 or
+   later), or the GNU General Public License, version 2 (GPLv2), in all
+   cases as published by the Free Software Foundation.
+*/
+#ifndef _ERROR_GEN_H
+#define _ERROR_GEN_H
+
+#include "error-gen-mem-types.h"
+
+#define GF_FAILURE_DEFAULT 10
+
+/*
+ * Pseudo-errors refer to errors beyond the scope of traditional <-1, op_errno>
+ * returns. This facilitates the ability to return unexpected, but not -1 values
+ * and/or to inject operations that lead to implicit error conditions. The range
+ * for pseudo errors resides at a high value to avoid conflicts with the errno
+ * range.
+ */
+enum GF_PSEUDO_ERRORS {
+    GF_ERROR_SHORT_WRITE = 1000, /* short writev return value */
+    GF_ERROR_MAX
+};
+
+typedef struct {
+    int enable[GF_FOP_MAXVALUE]; /* per-fop on/off flag, indexed by GF_FOP_* */
+    int op_count;
+    /*
+     * This is only an iteration number in the random-failure case.  For
+     * the normal controlled-probability case, it's actually a numerator
+     * for the failure probability (see FAILURE_GRANULARITY declaration).
+     * It's just not worth blowing up the diff by changing it.
+     */
+    int failure_iter_no;
+    int error_no_int;            /* from conv_errno_to_int() on "error-no" */
+    gf_boolean_t random_failure; /* value of the "random-failure" option */
+    gf_lock_t lock;              /* LOCK_INIT in init(), destroyed in fini() */
+} eg_t;
+
+typedef struct {
+    int error_no_count;
+    int error_no[20];
+} sys_error_t;
+
+#endif
diff --git a/xlators/features/filter/Makefile.am b/xlators/debug/io-stats/Makefile.am
index d471a3f9243..a985f42a877 100644
--- a/xlators/features/filter/Makefile.am
+++ b/xlators/debug/io-stats/Makefile.am
@@ -1,3 +1,3 @@
 SUBDIRS = src
 
-CLEANFILES = 
+CLEANFILES =
diff --git a/xlators/debug/io-stats/src/Makefile.am b/xlators/debug/io-stats/src/Makefile.am
new file mode 100644
index 00000000000..c69f3caf0fe
--- /dev/null
+++ b/xlators/debug/io-stats/src/Makefile.am
@@ -0,0 +1,19 @@
+
+xlator_LTLIBRARIES = io-stats.la
+xlatordir = $(libdir)/glusterfs/$(PACKAGE_VERSION)/xlator/debug
+
+io_stats_la_LDFLAGS = -module $(GF_XLATOR_DEFAULT_LDFLAGS)
+
+io_stats_la_SOURCES = io-stats.c
+io_stats_la_LIBADD = $(top_builddir)/libglusterfs/src/libglusterfs.la
+
+noinst_HEADERS = io-stats-mem-types.h
+
+AM_CPPFLAGS = $(GF_CPPFLAGS) -I$(top_srcdir)/libglusterfs/src \
+	-I$(top_srcdir)/rpc/xdr/src -I$(top_builddir)/rpc/xdr/src \
+	-I$(top_srcdir)/rpc/rpc-lib/src \
+	-DDATADIR=\"$(localstatedir)\"
+
+AM_CFLAGS = -Wall $(GF_CFLAGS)
+
+CLEANFILES =
diff --git a/xlators/debug/io-stats/src/io-stats-mem-types.h b/xlators/debug/io-stats/src/io-stats-mem-types.h
new file mode 100644
index 00000000000..51d38d8b97c
--- /dev/null
+++ b/xlators/debug/io-stats/src/io-stats-mem-types.h
@@ -0,0 +1,27 @@
+/*
+   Copyright (c) 2008-2012 Red Hat, Inc. <http://www.redhat.com>
+   This file is part of GlusterFS.
+
+   This file is licensed to you under your choice of the GNU Lesser
+   General Public License, version 3 or any later version (LGPLv3 or
+   later), or the GNU General Public License, version 2 (GPLv2), in all
+   cases as published by the Free Software Foundation.
+*/
+
+#ifndef __IO_STATS_MEM_TYPES_H__
+#define __IO_STATS_MEM_TYPES_H__
+
+#include <glusterfs/mem-types.h>
+
+extern const char *__progname;
+
+enum gf_io_stats_mem_types_ {
+    gf_io_stats_mt_ios_conf = gf_common_mt_end + 1,
+    gf_io_stats_mt_ios_fd,
+    gf_io_stats_mt_ios_stat,
+    gf_io_stats_mt_ios_stat_list,
+    gf_io_stats_mt_ios_sample_buf,
+    gf_io_stats_mt_ios_sample,
+    gf_io_stats_mt_end
+};
+#endif
diff --git a/xlators/debug/io-stats/src/io-stats.c b/xlators/debug/io-stats/src/io-stats.c
new file mode 100644
index 00000000000..aa00c446e5a
--- /dev/null
+++ b/xlators/debug/io-stats/src/io-stats.c
@@ -0,0 +1,4480 @@
+/*
+   Copyright (c) 2006-2012 Red Hat, Inc. <http://www.redhat.com>
+   This file is part of GlusterFS.
+
+   This file is licensed to you under your choice of the GNU Lesser
+   General Public License, version 3 or any later version (LGPLv3 or
+   later), or the GNU General Public License, version 2 (GPLv2), in all
+   cases as published by the Free Software Foundation.
+*/
+#include <glusterfs/xlator.h>
+#include <glusterfs/syscall.h>
+
+/**
+ * xlators/debug/io_stats :
+ *    This translator maintains statistics of all filesystem activity
+ *    happening through it. The kind of statistics include:
+ *
+ *  a) total read data - since process start, last interval and per fd
+ *  b) total write data - since process start, last interval and per fd
+ *  c) counts of read IO block size - since process start, last interval and per fd
+ *  d) counts of write IO block size - since process start, last interval and per fd
+ *  e) counts of all FOP types passing through it
+ *
+ *  Usage: setfattr -n trusted.io-stats-dump /tmp/filename /mnt/gluster
+ *      output is written to /tmp/filename.<iostats xlator instance name>
+ *
+ */
+
+#include <fnmatch.h>
+#include <errno.h>
+#include <glusterfs/glusterfs.h>
+#include <glusterfs/xlator.h>
+#include "io-stats-mem-types.h"
+#include <stdarg.h>
+#include <glusterfs/defaults.h>
+#include <glusterfs/logging.h>
+#include <glusterfs/statedump.h>
+#include <glusterfs/syncop.h>
+#include <pwd.h>
+#include <grp.h>
+#include <glusterfs/upcall-utils.h>
+#include <glusterfs/async.h>
+
+#define MAX_LIST_MEMBERS 100
+#define DEFAULT_PWD_BUF_SZ 16384
+#define DEFAULT_GRP_BUF_SZ 16384
+#define IOS_BLOCK_COUNT_SIZE 32
+
+#define IOS_STATS_DUMP_DIR DEFAULT_VAR_RUN_DIRECTORY
+
+typedef enum {
+    IOS_STATS_TYPE_NONE,
+    IOS_STATS_TYPE_OPEN,
+    IOS_STATS_TYPE_READ,
+    IOS_STATS_TYPE_WRITE,
+    IOS_STATS_TYPE_OPENDIR,
+    IOS_STATS_TYPE_READDIRP,
+    IOS_STATS_TYPE_READ_THROUGHPUT,
+    IOS_STATS_TYPE_WRITE_THROUGHPUT,
+    IOS_STATS_TYPE_MAX
+} ios_stats_type_t;
+
+typedef enum {
+    IOS_STATS_THRU_READ,
+    IOS_STATS_THRU_WRITE,
+    IOS_STATS_THRU_MAX,
+} ios_stats_thru_t;
+
+/* This is the same as gf1_cli_info_op. */
+/* It had to be defined here again so that we keep modularity between the
+ xdr, xlator, and library functions. */
+typedef enum ios_info_op {
+    GF_IOS_INFO_NONE = 0,
+    GF_IOS_INFO_ALL = 1,
+    GF_IOS_INFO_INCREMENTAL = 2,
+    GF_IOS_INFO_CUMULATIVE = 3,
+    GF_IOS_INFO_CLEAR = 4,
+} ios_info_op_t;
+
+struct ios_stat_lat {
+    struct timeval time;
+    double throughput;
+};
+
+struct ios_stat {
+    gf_lock_t lock;
+    uuid_t gfid;
+    char *filename;
+    gf_atomic_t counters[IOS_STATS_TYPE_MAX];
+    struct ios_stat_lat thru_counters[IOS_STATS_THRU_MAX];
+    gf_atomic_t refcnt;
+};
+
+struct ios_stat_list {
+    struct list_head list;
+    struct ios_stat *iosstat;
+    double value;
+};
+
+struct ios_stat_head {
+    gf_lock_t lock;
+    double min_cnt;
+    uint64_t members;
+    struct ios_stat_list *iosstats;
+};
+
+typedef struct _ios_sample_t {
+    uid_t uid;
+    gid_t gid;
+    char identifier[UNIX_PATH_MAX];
+    glusterfs_fop_t fop_type;
+    struct timeval timestamp;
+    double elapsed;
+} ios_sample_t;
+
+typedef struct _ios_sample_buf_t {
+    uint64_t pos;              /* Position in write buffer */
+    uint64_t size;             /* Size of ring buffer */
+    uint64_t collected;        /* Number of samples we've collected */
+    uint64_t observed;         /* Number of FOPs we've observed */
+    ios_sample_t *ios_samples; /* Our list of samples */
+} ios_sample_buf_t;
+
+struct ios_lat {
+    double min;
+    double max;
+    double avg;
+    uint64_t total;
+};
+
+struct ios_global_stats {
+    gf_atomic_t data_written;
+    gf_atomic_t data_read;
+    gf_atomic_t block_count_write[IOS_BLOCK_COUNT_SIZE];
+    gf_atomic_t block_count_read[IOS_BLOCK_COUNT_SIZE];
+    gf_atomic_t fop_hits[GF_FOP_MAXVALUE];
+    gf_atomic_t upcall_hits[GF_UPCALL_FLAGS_MAXVALUE];
+    time_t started_at;
+    struct ios_lat latency[GF_FOP_MAXVALUE];
+    uint64_t nr_opens;
+    uint64_t max_nr_opens;
+    struct timeval max_openfd_time;
+};
+
+typedef enum {
+    IOS_DUMP_TYPE_NONE = 0,
+    IOS_DUMP_TYPE_FILE = 1,
+    IOS_DUMP_TYPE_DICT = 2,
+    IOS_DUMP_TYPE_JSON_FILE = 3,
+    IOS_DUMP_TYPE_SAMPLES = 4,
+    IOS_DUMP_TYPE_MAX = 5
+} ios_dump_type_t;
+
+struct ios_conf {
+    gf_lock_t lock;
+    struct ios_global_stats cumulative;
+    uint64_t increment;
+    struct ios_global_stats incremental;
+    gf_boolean_t dump_fd_stats;
+    gf_boolean_t count_fop_hits;
+    gf_boolean_t measure_latency;
+    struct ios_stat_head list[IOS_STATS_TYPE_MAX];
+    struct ios_stat_head thru_list[IOS_STATS_THRU_MAX];
+    int32_t ios_dump_interval;
+    pthread_t dump_thread;
+    gf_boolean_t dump_thread_should_die;
+    gf_boolean_t dump_thread_running;
+    gf_lock_t ios_sampling_lock;
+    int32_t ios_sample_interval;
+    int32_t ios_sample_buf_size;
+    ios_sample_buf_t *ios_sample_buf;
+    struct dnscache *dnscache;
+    int32_t ios_dnscache_ttl_sec;
+    /*
+     * What we really need here is just a unique value to keep files
+     * created by this instance distinct from those created by any other.
+     * On the client side this isn't a problem, so we just use the
+     * translator name.  On the server side conflicts can occur, so the
+     * volfile-generation code automatically sets this (via an option)
+     * to be the brick path.
+     *
+     * NB While the *field* name has changed, it didn't seem worth changing
+     * all of the cases where "xlator_name" is used as a *variable* name.
+     */
+    char *unique_id;
+    ios_dump_type_t dump_format;
+};
+
+struct ios_fd {
+    char *filename;
+    gf_atomic_t data_written;
+    gf_atomic_t data_read;
+    gf_atomic_t block_count_write[IOS_BLOCK_COUNT_SIZE];
+    gf_atomic_t block_count_read[IOS_BLOCK_COUNT_SIZE];
+    struct timeval opened_at;
+};
+
+struct ios_dump_args {
+    ios_dump_type_t type;
+    union {
+        FILE *logfp;
+        dict_t *dict;
+    } u;
+};
+
+typedef int (*block_dump_func)(xlator_t *, struct ios_dump_args *, int, int,
+                               uint64_t);
+
+struct ios_local {
+    struct timeval wind_at;
+    struct timeval unwind_at;
+};
+
+struct volume_options options[];
+
+/* Return non-zero when frame->begin holds a timestamp, i.e. differs from the
+ * all-zero value START_FOP_LATENCY stores while latency measurement is off. */
+static int
+is_fop_latency_started(call_frame_t *frame)
+{
+    GF_ASSERT(frame);
+    struct timespec epoch = {0}; /* same type timespec_now() fills in */
+    return memcmp(&frame->begin, &epoch, sizeof(epoch));
+}
+
+#define _IOS_SAMP_DIR DEFAULT_LOG_FILE_DIRECTORY "/samples"
+#ifdef GF_LINUX_HOST_OS
+#define _IOS_DUMP_DIR DATADIR "/lib/glusterd/stats"
+#else
+#define _IOS_DUMP_DIR DATADIR "/db/glusterd/stats"
+#endif
+
+#define END_FOP_LATENCY(frame, op) /* stamp frame->end and record op latency */ \
+    do {                                                                       \
+        struct ios_conf *conf = NULL;                                          \
+                                                                               \
+        conf = this->private; /* expects 'this' in the expanding scope */      \
+        if (conf && conf->measure_latency) {                                   \
+            timespec_now(&frame->end);                                         \
+            update_ios_latency(conf, frame, GF_FOP_##op);                      \
+        }                                                                      \
+    } while (0)
+
+#define START_FOP_LATENCY(frame) /* stamp frame->begin, or zero it if off */    \
+    do {                                                                       \
+        struct ios_conf *conf = NULL;                                          \
+                                                                               \
+        conf = this->private; /* expects 'this' in the expanding scope */      \
+        if (conf && conf->measure_latency) {                                   \
+            timespec_now(&frame->begin);                                       \
+        } else { /* zeroed begin is what is_fop_latency_started() tests */     \
+            memset(&frame->begin, 0, sizeof(frame->begin));                    \
+        }                                                                      \
+    } while (0)
+
+#define BUMP_FOP(op) /* count one GF_FOP_<op> hit in both stat sets */          \
+    do {                                                                       \
+        struct ios_conf *conf = NULL;                                          \
+                                                                               \
+        conf = this->private; /* expects 'this' in the expanding scope */      \
+        if (!conf)                                                             \
+            break; /* leaves the do/while(0), i.e. the macro is a no-op */     \
+        GF_ATOMIC_INC(conf->cumulative.fop_hits[GF_FOP_##op]);                 \
+        GF_ATOMIC_INC(conf->incremental.fop_hits[GF_FOP_##op]);                \
+    } while (0)
+
+#define UPDATE_PROFILE_STATS(frame, op) /* hit count + latency for one fop */   \
+    do {                                                                       \
+        struct ios_conf *conf = NULL;                                          \
+                                                                               \
+        if (!is_fop_latency_started(frame)) /* begin was zeroed: skip */       \
+            break;                                                             \
+        conf = this->private; /* expects 'this' in the expanding scope */      \
+        if (conf && conf->measure_latency && conf->count_fop_hits) {           \
+            BUMP_FOP(op);                                                      \
+            timespec_now(&frame->end);                                         \
+            update_ios_latency(conf, frame, GF_FOP_##op);                      \
+        }                                                                      \
+    } while (0)
+
+#define BUMP_THROUGHPUT(iosstat, type) /* track the peak throughput of a file */ \
+    do {                                                                       \
+        struct ios_conf *conf = NULL;                                          \
+        double elapsed;                                                        \
+        struct timespec *begin, *end;                                          \
+        double throughput;                                                     \
+        int flag = 0;                                                          \
+        struct timeval tv = {                                                  \
+            0,                                                                 \
+        };                                                                     \
+                                                                               \
+        begin = &frame->begin; /* expects 'frame' in the expanding scope */    \
+        end = &frame->end;                                                     \
+                                                                               \
+        elapsed = gf_tsdiff(begin, end) / 1000.0;                              \
+        throughput = op_ret / elapsed; /* expects 'op_ret' in scope; NOTE(review): no guard for elapsed == 0 */ \
+                                                                               \
+        conf = this->private; /* expects 'this' in the expanding scope */      \
+        gettimeofday(&tv, NULL);                                               \
+        LOCK(&iosstat->lock);                                                  \
+        {                                                                      \
+            if (iosstat->thru_counters[type].throughput <= throughput) {       \
+                iosstat->thru_counters[type].throughput = throughput;          \
+                memcpy(&iosstat->thru_counters[type].time, &tv,                \
+                       sizeof(struct timeval));                                \
+                flag = 1; /* new peak: update the list after unlocking */      \
+            }                                                                  \
+        }                                                                      \
+        UNLOCK(&iosstat->lock);                                                \
+        if (flag)                                                              \
+            ios_stat_add_to_list(&conf->thru_list[type], throughput, iosstat); \
+    } while (0)
+
+/*
+ * Fetch the struct ios_fd pointer stored in fd's context for this xlator.
+ * *iosfd is written only when fd_ctx_get() does not return -1; the
+ * fd_ctx_get() result is handed back unchanged.
+ */
+static int
+ios_fd_ctx_get(fd_t *fd, xlator_t *this, struct ios_fd **iosfd)
+{
+    uint64_t raw = 0;
+    int status = 0;
+
+    status = fd_ctx_get(fd, this, &raw);
+    if (status != -1) {
+        /* The ctx slot is 64 bits wide; narrow through unsigned long. */
+        *iosfd = (void *)(unsigned long)raw;
+    }
+
+    return status;
+}
+
+/*
+ * Store the iosfd pointer in fd's context for this xlator and return
+ * whatever fd_ctx_set() reports.
+ */
+static int
+ios_fd_ctx_set(fd_t *fd, xlator_t *this, struct ios_fd *iosfd)
+{
+    uint64_t raw = (unsigned long)iosfd;
+
+    return fd_ctx_set(fd, this, raw);
+}
+
+/*
+ * Take one reference on iosstat; returns the value GF_ATOMIC_INC() yields
+ * for the reference counter, narrowed to int.
+ */
+static int
+ios_stat_ref(struct ios_stat *iosstat)
+{
+    uint64_t count = GF_ATOMIC_INC(iosstat->refcnt);
+
+    return count;
+}
+
+/*
+ * Drop one reference on iosstat.  When the counter reaches zero the
+ * filename, the lock and the structure itself are released.  Always
+ * returns 0.
+ */
+static int
+ios_stat_unref(struct ios_stat *iosstat)
+{
+    uint64_t remaining = GF_ATOMIC_DEC(iosstat->refcnt);
+
+    if (remaining != 0)
+        return 0;
+
+    /* Last reference gone: tear the object down. */
+    if (iosstat->filename) {
+        GF_FREE(iosstat->filename);
+        iosstat->filename = NULL;
+    }
+    LOCK_DESTROY(&iosstat->lock);
+    GF_FREE(iosstat);
+
+    return 0;
+}
+
+/*
+ * Record `value` for `iosstat` in the bounded list hanging off list_head.
+ *
+ * The list is walked under list_head->lock.  An entry whose gfid matches
+ * iosstat has its value refreshed and, if an earlier entry with a value
+ * <= `value` was seen, is moved in front of that entry.  Otherwise a new
+ * entry (holding its own reference on iosstat) is inserted in front of the
+ * first entry whose value is <= `value`; when there is no such entry it is
+ * appended, provided the list is not yet full.  Once the list holds
+ * MAX_LIST_MEMBERS entries, an insertion evicts the entry that was last,
+ * and values below list_head->min_cnt are rejected up front.
+ *
+ * An allocation failure skips the update and leaves the list untouched.
+ * Always returns 0.
+ */
+static int
+ios_stat_add_to_list(struct ios_stat_head *list_head, uint64_t value,
+                     struct ios_stat *iosstat)
+{
+    struct ios_stat_list *new = NULL;
+    struct ios_stat_list *entry = NULL;
+    struct ios_stat_list *t = NULL;
+    struct ios_stat_list *list_entry = NULL;
+    struct ios_stat_list *tmp = NULL;
+    struct ios_stat_list *last = NULL;
+    struct ios_stat *stat = NULL;
+    int cnt = 0;
+    int found = 0;
+    int reposition = 0;
+    double min_count = 0;
+
+    LOCK(&list_head->lock);
+    {
+        if (list_head->min_cnt == 0)
+            list_head->min_cnt = value;
+        /* Full list and the value is below the smallest tracked: ignore. */
+        if ((list_head->members == MAX_LIST_MEMBERS) &&
+            (list_head->min_cnt > value))
+            goto out;
+
+        list_for_each_entry_safe(entry, t, &list_head->iosstats->list, list)
+        {
+            cnt++;
+            if (cnt == list_head->members)
+                last = entry;
+
+            if (!gf_uuid_compare(iosstat->gfid, entry->iosstat->gfid)) {
+                /* Same file already listed: refresh its value. */
+                list_entry = entry;
+                found = cnt;
+                entry->value = value;
+                if (!reposition) {
+                    /* No earlier entry was <= value: it stays in place. */
+                    if (cnt == list_head->members)
+                        list_head->min_cnt = value;
+                    goto out;
+                }
+                break;
+            } else if (entry->value <= value && !reposition) {
+                /* First entry the new value should be placed in front of. */
+                reposition = cnt;
+                tmp = entry;
+                if (cnt == list_head->members - 1)
+                    min_count = entry->value;
+            }
+        }
+        if (found) {
+            /* Move the existing entry in front of the reposition point. */
+            list_del(&list_entry->list);
+            list_add_tail(&list_entry->list, &tmp->list);
+            if (min_count)
+                list_head->min_cnt = min_count;
+            goto out;
+        } else if (list_head->members == MAX_LIST_MEMBERS && reposition) {
+            new = GF_CALLOC(1, sizeof(*new), gf_io_stats_mt_ios_stat_list);
+            if (!new)
+                goto out; /* out of memory: skip, nothing modified yet */
+            new->iosstat = iosstat;
+            new->value = value;
+            ios_stat_ref(iosstat);
+            list_add_tail(&new->list, &tmp->list);
+            if (last) {
+                /* Evict the former tail to keep the list bounded. */
+                stat = last->iosstat;
+                last->iosstat = NULL;
+                ios_stat_unref(stat);
+                list_del(&last->list);
+                GF_FREE(last);
+            }
+            if (reposition == MAX_LIST_MEMBERS)
+                list_head->min_cnt = value;
+            else if (min_count) {
+                list_head->min_cnt = min_count;
+            }
+        } else if (list_head->members < MAX_LIST_MEMBERS) {
+            new = GF_CALLOC(1, sizeof(*new), gf_io_stats_mt_ios_stat_list);
+            if (!new)
+                goto out; /* out of memory: skip, nothing modified yet */
+            new->iosstat = iosstat;
+            new->value = value;
+            ios_stat_ref(iosstat);
+            if (reposition) {
+                list_add_tail(&new->list, &tmp->list);
+            } else {
+                /* The walk ran to completion, so `entry` is the list head
+                 * sentinel here and this appends at the tail. */
+                list_add_tail(&new->list, &entry->list);
+            }
+            list_head->members++;
+            if (list_head->min_cnt > value)
+                list_head->min_cnt = value;
+        }
+    }
+out:
+    UNLOCK(&list_head->lock);
+    return 0;
+}
+
+/*
+ * Account a read of `len` bytes: add to the cumulative and incremental
+ * byte counters and to the log_base2(len) block-size bucket on the xlator,
+ * and to the same per-fd counters when fd carries an ios_fd context.
+ * Does nothing when the xlator has no private configuration.
+ */
+static void
+ios_bump_read(xlator_t *this, fd_t *fd, size_t len)
+{
+    struct ios_conf *conf = NULL;
+    struct ios_fd *iosfd = NULL;
+    int lb2 = 0;
+
+    conf = this->private;
+    if (!conf)
+        return;
+
+    lb2 = log_base2(len);
+    /* The histograms hold IOS_BLOCK_COUNT_SIZE buckets; fold anything
+     * larger into the last bucket instead of indexing out of bounds. */
+    if (lb2 >= IOS_BLOCK_COUNT_SIZE)
+        lb2 = IOS_BLOCK_COUNT_SIZE - 1;
+
+    /* On lookup failure iosfd stays NULL and per-fd accounting is skipped. */
+    ios_fd_ctx_get(fd, this, &iosfd);
+
+    GF_ATOMIC_ADD(conf->cumulative.data_read, len);
+    GF_ATOMIC_ADD(conf->incremental.data_read, len);
+    GF_ATOMIC_INC(conf->cumulative.block_count_read[lb2]);
+    GF_ATOMIC_INC(conf->incremental.block_count_read[lb2]);
+
+    if (iosfd) {
+        GF_ATOMIC_ADD(iosfd->data_read, len);
+        GF_ATOMIC_INC(iosfd->block_count_read[lb2]);
+    }
+}
+
+/*
+ * Account a write of `len` bytes: add to the cumulative and incremental
+ * byte counters and to the log_base2(len) block-size bucket on the xlator,
+ * and to the same per-fd counters when fd carries an ios_fd context.
+ * Does nothing when the xlator has no private configuration.
+ */
+static void
+ios_bump_write(xlator_t *this, fd_t *fd, size_t len)
+{
+    struct ios_conf *conf = NULL;
+    struct ios_fd *iosfd = NULL;
+    int lb2 = 0;
+
+    conf = this->private;
+    if (!conf)
+        return;
+
+    lb2 = log_base2(len);
+    /* The histograms hold IOS_BLOCK_COUNT_SIZE buckets; fold anything
+     * larger into the last bucket instead of indexing out of bounds. */
+    if (lb2 >= IOS_BLOCK_COUNT_SIZE)
+        lb2 = IOS_BLOCK_COUNT_SIZE - 1;
+
+    /* On lookup failure iosfd stays NULL and per-fd accounting is skipped. */
+    ios_fd_ctx_get(fd, this, &iosfd);
+
+    GF_ATOMIC_ADD(conf->cumulative.data_written, len);
+    GF_ATOMIC_ADD(conf->incremental.data_written, len);
+    GF_ATOMIC_INC(conf->cumulative.block_count_write[lb2]);
+    GF_ATOMIC_INC(conf->incremental.block_count_write[lb2]);
+
+    if (iosfd) {
+        GF_ATOMIC_ADD(iosfd->data_written, len);
+        GF_ATOMIC_INC(iosfd->block_count_write[lb2]);
+    }
+}
+
+/*
+ * Count one upcall of kind `event` in both the cumulative and the
+ * incremental statistics, provided hit counting is enabled.
+ */
+static void
+ios_bump_upcall(xlator_t *this, gf_upcall_flags_t event)
+{
+    struct ios_conf *conf = this->private;
+
+    if (!conf || !conf->count_fop_hits)
+        return;
+
+    GF_ATOMIC_INC(conf->cumulative.upcall_hits[event]);
+    GF_ATOMIC_INC(conf->incremental.upcall_hits[event]);
+}
+
+/*
+ * Increment the `type` counter of iosstat and feed the new count to the
+ * matching per-type list of the xlator.
+ */
+static void
+ios_bump_stats(xlator_t *this, struct ios_stat *iosstat, ios_stats_type_t type)
+{
+    struct ios_conf *conf = this->private;
+    uint64_t hits = 0;
+
+    hits = GF_ATOMIC_INC(iosstat->counters[type]);
+    ios_stat_add_to_list(&conf->list[type], hits, iosstat);
+}
+
+/*
+ * Take a reference on iosstat and store its pointer in the inode context
+ * for this xlator.  Returns the inode_ctx_put() result.
+ */
+int
+ios_inode_ctx_set(inode_t *inode, xlator_t *this, struct ios_stat *iosstat)
+{
+    uint64_t raw = 0;
+
+    /* The reference is taken before the pointer is published. */
+    ios_stat_ref(iosstat);
+    raw = (unsigned long)iosstat;
+
+    return inode_ctx_put(inode, this, raw);
+}
+
+/*
+ * Fetch the struct ios_stat pointer stored in the inode context for this
+ * xlator.  *iosstat is written only when inode_ctx_get() does not return
+ * -1; the inode_ctx_get() result is handed back unchanged.
+ */
+int
+ios_inode_ctx_get(inode_t *inode, xlator_t *this, struct ios_stat **iosstat)
+{
+    uint64_t raw = 0;
+    int status = 0;
+
+    status = inode_ctx_get(inode, this, &raw);
+    if (status != -1) {
+        /* The ctx slot is 64 bits wide; narrow through unsigned long. */
+        *iosstat = (void *)(unsigned long)raw;
+    }
+
+    return status;
+}
+
+/*
+ * So why go to all this trouble?  Why not just queue up some samples in
+ * a big list and malloc away?  Well malloc is expensive relative
+ * to what we are measuring, so cannot have any malloc's (or worse
+ * callocs) in our measurement code paths.  Instead, we are going to
+ * pre-allocate a circular buffer and collect a maximum number of samples.
+ * Prior to dumping them all we'll create a new buffer and swap the
+ * old buffer with the new, and then proceed to dump the statistics
+ * in our dump thread.
+ *
+ */
+ios_sample_buf_t *
+ios_create_sample_buf(size_t buf_size)
+{
+    ios_sample_buf_t *buf = NULL;
+    ios_sample_t *slots = NULL;
+
+    /* Allocate the header first, then room for buf_size samples. */
+    buf = GF_CALLOC(1, sizeof(*buf), gf_io_stats_mt_ios_sample_buf);
+    if (buf) {
+        slots = GF_CALLOC(buf_size, sizeof(*slots),
+                          gf_io_stats_mt_ios_sample);
+        if (slots) {
+            buf->ios_samples = slots;
+            buf->size = buf_size;
+            buf->pos = 0;
+            buf->observed = 0;
+            buf->collected = 0;
+            return buf;
+        }
+    }
+
+    /* Either allocation failed; release the header (may be NULL). */
+    GF_FREE(buf);
+    return NULL;
+}
+
+/*
+ * Release a sample buffer created by ios_create_sample_buf(), including
+ * its sample array.  Passing NULL is a no-op.
+ */
+void
+ios_destroy_sample_buf(ios_sample_buf_t *ios_sample_buf)
+{
+    if (!ios_sample_buf)
+        return;
+
+    GF_FREE(ios_sample_buf->ios_samples);
+    GF_FREE(ios_sample_buf);
+}
+
+/*
+ * Allocate a fresh sample buffer of conf->ios_sample_buf_size entries and
+ * install it in conf->ios_sample_buf while holding conf->lock.
+ *
+ * The previously installed buffer is not freed here; a caller that wants
+ * to keep or destroy it must save the pointer beforehand.  On allocation
+ * failure conf->ios_sample_buf is left as it was, so a buffer already in
+ * place is not lost.
+ *
+ * Returns 0 on success, -1 when the allocation fails.
+ */
+static int
+ios_init_sample_buf(struct ios_conf *conf)
+{
+    ios_sample_buf_t *fresh = NULL;
+    int32_t ret = -1;
+
+    GF_ASSERT(conf);
+    LOCK(&conf->lock);
+    {
+        fresh = ios_create_sample_buf(conf->ios_sample_buf_size);
+        if (fresh) {
+            conf->ios_sample_buf = fresh;
+            ret = 0;
+        }
+    }
+    UNLOCK(&conf->lock);
+    return ret;
+}
+
+/*
+ * Remove this xlator's context from the inode and drop the reference the
+ * context held on its struct ios_stat.  Logs a warning when the inode had
+ * no context.  Always returns 0.
+ */
+static int
+ios_stats_cleanup(xlator_t *this, inode_t *inode)
+{
+    uint64_t raw = 0;
+    struct ios_stat *held = NULL;
+
+    inode_ctx_del(inode, this, &raw);
+    if (raw == 0) {
+        gf_log(this->name, GF_LOG_WARNING, "could not get inode ctx");
+        return 0;
+    }
+
+    held = (void *)(long)raw;
+    if (held)
+        ios_stat_unref(held);
+
+    return 0;
+}
+
+#define ios_log(this, logfp, fmt...) /* printf-style line to logfp + debug log */ \
+    do {                                                                       \
+        if (logfp) { /* the stream is optional */                              \
+            fprintf(logfp, fmt);                                               \
+            fprintf(logfp, "\n");                                              \
+        }                                                                      \
+        gf_log(this->name, GF_LOG_DEBUG, fmt); /* fmt args are expanded a second time here */ \
+    } while (0)
+
+/*
+ * Log one "<value> <filename>" line per entry of the given stat list,
+ * holding the list lock for the whole walk.  Always returns 0.
+ */
+int
+ios_dump_file_stats(struct ios_stat_head *list_head, xlator_t *this,
+                    FILE *logfp)
+{
+    struct ios_stat_list *node = NULL;
+
+    LOCK(&list_head->lock);
+    list_for_each_entry(node, &list_head->iosstats->list, list)
+    {
+        ios_log(this, logfp, "%-12.0f %s", node->value,
+                node->iosstat->filename);
+    }
+    UNLOCK(&list_head->lock);
+
+    return 0;
+}
+
+/*
+ * Log one "<time> <value> <filename>" line per entry of the given
+ * throughput list, where <time> is the entry's thru_counters[type].time
+ * formatted with gf_timefmt_FT.  The list lock is held for the whole
+ * walk.  Always returns 0.
+ */
+int
+ios_dump_throughput_stats(struct ios_stat_head *list_head, xlator_t *this,
+                          FILE *logfp, ios_stats_thru_t type)
+{
+    struct ios_stat_list *node = NULL;
+    char stamp[GF_TIMESTR_SIZE] = {0};
+
+    LOCK(&list_head->lock);
+    list_for_each_entry(node, &list_head->iosstats->list, list)
+    {
+        gf_time_fmt_tv(stamp, sizeof(stamp),
+                       &node->iosstat->thru_counters[type].time,
+                       gf_timefmt_FT);
+
+        ios_log(this, logfp, "%s \t %-10.2f  \t  %s", stamp, node->value,
+                node->iosstat->filename);
+    }
+    UNLOCK(&list_head->lock);
+
+    return 0;
+}
+
+/*
+ * Build the dotted key prefix used for the JSON stats keys:
+ *
+ *     <root>.<name>[.<instance>]
+ *
+ * <root> is "gluster.brick" when the program is glusterfsd and "gluster"
+ * otherwise.  <name> is conf->unique_id with every '/' replaced by '_',
+ * except for the self-heal daemon ("shd") and a stack sitting on top of
+ * "nfs-server" ("nfsd").  <instance> is appended when an instance name is
+ * known.
+ *
+ * On success *key_prefix points to a newly allocated string that the
+ * caller must GF_FREE() and 0 is returned.  On failure *key_prefix is
+ * NULL and -ENOMEM or -EINVAL is returned.
+ */
+int
+_io_stats_get_key_prefix(xlator_t *this, char **key_prefix)
+{
+    struct ios_conf *conf = this->private;
+    char *root = "gluster";
+    char *name = NULL;
+    char *instance = NULL;
+    char *cursor = NULL;
+    size_t needed = 0;
+    int written = 0;
+    int status = 0;
+
+    /* Work on a stack copy so the configured id itself is not modified. */
+    name = strdupa(conf->unique_id);
+    for (cursor = name; *cursor; cursor++) {
+        if (*cursor == '/')
+            *cursor = '_';
+    }
+
+    instance = this->instance_name;
+    if (this->name && !strcmp(this->name, "glustershd")) {
+        name = "shd";
+    } else if (this->prev && !strcmp(this->prev->name, "nfs-server")) {
+        name = "nfsd";
+        if (this->prev->instance_name)
+            instance = strdupa(this->prev->instance_name);
+    }
+
+    if (!strcmp(__progname, "glusterfsd"))
+        root = "gluster.brick";
+
+    /* root + "." + name + NUL, plus "." + instance when present */
+    needed = strlen(root) + strlen(name) + 2;
+    if (instance)
+        needed += strlen(instance) + 1;
+
+    *key_prefix = GF_CALLOC(needed, sizeof(char), gf_common_mt_char);
+    if (!*key_prefix) {
+        status = -ENOMEM;
+        goto fail;
+    }
+
+    if (instance)
+        written = snprintf(*key_prefix, needed, "%s.%s.%s", root, name,
+                           instance);
+    else
+        written = snprintf(*key_prefix, needed, "%s.%s", root, name);
+
+    /* Anything but an exact fit means the prefix was not formed as sized. */
+    if ((size_t)written != needed - 1) {
+        status = -EINVAL;
+        goto fail;
+    }
+
+    return 0;
+fail:
+    GF_FREE(*key_prefix);
+    *key_prefix = NULL;
+    return status;
+}
+
+int
+io_stats_dump_global_to_json_logfp(xlator_t *this,
+                                   struct ios_global_stats *stats, time_t now,
+                                   int interval, FILE *logfp)
+{ /* Write one JSON object describing `stats` through ios_log().
+   * interval == -1 selects the "aggr" key set (raw counters); any other
+   * value selects "inter" (rates over now - stats->started_at seconds).
+   * Returns 0 on success or the error from _io_stats_get_key_prefix(). */
+    int i = 0;
+    int j = 0;
+    struct ios_conf *conf = NULL;
+    char *key_prefix = NULL;
+    char *str_prefix = NULL;
+    char *lc_fop_name = NULL;
+    int ret = 1; /* Default to error */
+    int rw_size;
+    char *rw_unit = NULL;
+    uint64_t fop_hits;
+    float fop_lat_ave;
+    float fop_lat_min;
+    float fop_lat_max;
+    double interval_sec;
+    double fop_ave_usec = 0.0;
+    double fop_ave_usec_sum = 0.0;
+    double weighted_fop_ave_usec = 0.0;
+    double weighted_fop_ave_usec_sum = 0.0;
+    long total_fop_hits = 0;
+    loc_t unused_loc = {
+        0,
+    };
+    dict_t *xattr = NULL;
+
+    interval_sec = (double)(now - stats->started_at);
+
+    conf = this->private;
+
+    ret = _io_stats_get_key_prefix(this, &key_prefix);
+    if (ret) {
+        goto out;
+    }
+
+    if (interval == -1) {
+        str_prefix = "aggr";
+
+    } else {
+        str_prefix = "inter";
+    }
+    ios_log(this, logfp, "{");
+
+    for (i = 0; i < 31; i++) { /* one bucket per power of two: 1b .. 1024mb */
+        rw_size = (1 << i);
+        if (rw_size >= 1024 * 1024) {
+            rw_size = rw_size / (1024 * 1024);
+            rw_unit = "mb";
+        } else if (rw_size >= 1024) {
+            rw_size = rw_size / 1024;
+            rw_unit = "kb";
+        } else {
+            rw_unit = "b";
+        }
+
+        if (interval == -1) {
+            ios_log(this, logfp, "\"%s.%s.read_%d%s\": %" GF_PRI_ATOMIC ",",
+                    key_prefix, str_prefix, rw_size, rw_unit,
+                    GF_ATOMIC_GET(stats->block_count_read[i]));
+            ios_log(this, logfp, "\"%s.%s.write_%d%s\": %" GF_PRI_ATOMIC ",",
+                    key_prefix, str_prefix, rw_size, rw_unit,
+                    GF_ATOMIC_GET(stats->block_count_write[i]));
+        } else {
+            ios_log(this, logfp, "\"%s.%s.read_%d%s_per_sec\": %0.2lf,",
+                    key_prefix, str_prefix, rw_size, rw_unit,
+                    (double)(GF_ATOMIC_GET(stats->block_count_read[i]) /
+                             interval_sec));
+            ios_log(this, logfp, "\"%s.%s.write_%d%s_per_sec\": %0.2lf,",
+                    key_prefix, str_prefix, rw_size, rw_unit,
+                    (double)(GF_ATOMIC_GET(stats->block_count_write[i]) /
+                             interval_sec));
+        }
+    }
+
+    if (interval == -1) { /* fd counts come from conf->cumulative, not stats */
+        ios_log(this, logfp, "\"%s.%s.fds.open_count\": %" PRId64 ",",
+                key_prefix, str_prefix, conf->cumulative.nr_opens);
+        ios_log(this, logfp, "\"%s.%s.fds.max_open_count\": %" PRId64 ",",
+                key_prefix, str_prefix, conf->cumulative.max_nr_opens);
+    }
+
+    for (i = 0; i < GF_FOP_MAXVALUE; i++) {
+        lc_fop_name = strdupa(gf_fop_list[i]); /* stack copy, lower-cased below */
+        for (j = 0; lc_fop_name[j]; j++) {
+            lc_fop_name[j] = tolower(lc_fop_name[j]);
+        }
+
+        fop_hits = GF_ATOMIC_GET(stats->fop_hits[i]);
+        fop_lat_ave = 0.0;
+        fop_lat_min = 0.0;
+        fop_lat_max = 0.0;
+        if (fop_hits) { /* latencies stay 0.0 for fops with no hits */
+            if (stats->latency[i].avg) {
+                fop_lat_ave = stats->latency[i].avg;
+                fop_lat_min = stats->latency[i].min;
+                fop_lat_max = stats->latency[i].max;
+            }
+        }
+        if (interval == -1) {
+            ios_log(this, logfp, "\"%s.%s.fop.%s.count\": %" GF_PRI_ATOMIC ",",
+                    key_prefix, str_prefix, lc_fop_name, fop_hits);
+        } else {
+            ios_log(this, logfp, "\"%s.%s.fop.%s.per_sec\": %0.2lf,",
+                    key_prefix, str_prefix, lc_fop_name,
+                    (double)(fop_hits / interval_sec));
+        }
+
+        ios_log(this, logfp, "\"%s.%s.fop.%s.latency_ave_usec\": %0.2lf,",
+                key_prefix, str_prefix, lc_fop_name, fop_lat_ave);
+        ios_log(this, logfp, "\"%s.%s.fop.%s.latency_min_usec\": %0.2lf,",
+                key_prefix, str_prefix, lc_fop_name, fop_lat_min);
+        ios_log(this, logfp, "\"%s.%s.fop.%s.latency_max_usec\": %0.2lf,",
+                key_prefix, str_prefix, lc_fop_name, fop_lat_max);
+
+        fop_ave_usec_sum += fop_lat_ave;
+        weighted_fop_ave_usec_sum += fop_hits * fop_lat_ave;
+        total_fop_hits += fop_hits;
+    }
+
+    if (total_fop_hits) {
+        weighted_fop_ave_usec = weighted_fop_ave_usec_sum / total_fop_hits;
+        /* Extra key that does not print out an entry w/ 0.00 for
+         * intervals with no data
+         */
+        ios_log(this, logfp,
+                "\"%s.%s.fop.weighted_latency_ave_usec_nozerofill\": "
+                "%0.4lf,",
+                key_prefix, str_prefix, weighted_fop_ave_usec);
+    }
+    ios_log(this, logfp, "\"%s.%s.fop.weighted_latency_ave_usec\": %0.4lf,",
+            key_prefix, str_prefix, weighted_fop_ave_usec);
+    ios_log(this, logfp, "\"%s.%s.fop.weighted_fop_count\": %ld,", key_prefix,
+            str_prefix, total_fop_hits);
+
+    fop_ave_usec = fop_ave_usec_sum / GF_FOP_MAXVALUE; /* mean over all fop types, including those with no hits */
+    ios_log(this, logfp, "\"%s.%s.fop.unweighted_latency_ave_usec\":%0.4lf,",
+            key_prefix, str_prefix, fop_ave_usec);
+
+    for (i = 0; i < GF_UPCALL_FLAGS_MAXVALUE; i++) {
+        lc_fop_name = strdupa(gf_upcall_list[i]); /* stack copy, lower-cased below */
+        for (j = 0; lc_fop_name[j]; j++) {
+            lc_fop_name[j] = tolower(lc_fop_name[j]);
+        }
+        fop_hits = GF_ATOMIC_GET(stats->upcall_hits[i]);
+        if (interval == -1) {
+            ios_log(this, logfp, "\"%s.%s.fop.%s.count\": %" GF_PRI_ATOMIC ",",
+                    key_prefix, str_prefix, lc_fop_name, fop_hits);
+        } else {
+            ios_log(this, logfp, "\"%s.%s.fop.%s.per_sec\": %0.2lf,",
+                    key_prefix, str_prefix, lc_fop_name,
+                    (double)(fop_hits / interval_sec));
+        }
+    }
+
+    ret = syncop_getxattr(this, &unused_loc, &xattr, IO_THREADS_QUEUE_SIZE_KEY,
+                          NULL, NULL); /* ret is not examined; only xattr is */
+    if (xattr) {
+        /*
+         * Iterate over the dictionary returned to us by io-threads and
+         * dump the results to the stats file.
+         */
+        data_pair_t *curr = NULL;
+
+        dict_foreach_inline(xattr, curr)
+        {
+            ios_log(this, logfp, "\"%s.%s.%s.queue_size\": %d,", key_prefix,
+                    str_prefix, curr->key, data_to_int32(curr->value));
+        }
+
+        /* Free the dictionary */
+        dict_unref(xattr);
+    } else {
+        gf_log(this->name, GF_LOG_WARNING,
+               "Unable to get queue size counts from "
+               "the io-threads translator!");
+    }
+
+    if (interval == -1) {
+        ios_log(this, logfp, "\"%s.%s.uptime\": %" PRIu64 ",", key_prefix,
+                str_prefix, (uint64_t)(now - stats->started_at));
+        ios_log(this, logfp,
+                "\"%s.%s.bytes_read\": "
+                "%" GF_PRI_ATOMIC ",",
+                key_prefix, str_prefix, GF_ATOMIC_GET(stats->data_read));
+        ios_log(this, logfp,
+                "\"%s.%s.bytes_written\": "
+                "%" GF_PRI_ATOMIC "",
+                key_prefix, str_prefix, GF_ATOMIC_GET(stats->data_written)); /* last key: no trailing comma */
+    } else {
+        ios_log(this, logfp, "\"%s.%s.sample_interval_sec\": %0.2lf,",
+                key_prefix, str_prefix, interval_sec);
+        ios_log(this, logfp, "\"%s.%s.bytes_read_per_sec\": %0.2lf,",
+                key_prefix, str_prefix,
+                (double)(GF_ATOMIC_GET(stats->data_read) / interval_sec));
+        ios_log(this, logfp, "\"%s.%s.bytes_written_per_sec\": %0.2lf",
+                key_prefix, str_prefix,
+                (double)(GF_ATOMIC_GET(stats->data_written) / interval_sec)); /* last key: no trailing comma */
+    }
+
+    ios_log(this, logfp, "}");
+    ret = 0;
+out:
+    GF_FREE(key_prefix);
+    return ret;
+}
+
+/*
+ * Translate a numeric uid into a user name.
+ *
+ * Returns a newly allocated copy of the name, which the caller must
+ * GF_FREE(), or NULL when the uid has no passwd entry, the lookup fails,
+ * or memory cannot be allocated.
+ */
+char *
+_resolve_username(xlator_t *this, uid_t uid)
+{
+    struct passwd pwd;
+    struct passwd *pwd_result = NULL;
+    long pwd_buf_len = -1;
+    char *pwd_buf = NULL;
+    char *ret = NULL;
+
+    /* Size the scratch buffer for getpwuid_r(); the passwd limit, not the
+     * group one, is the relevant sysconf key here. */
+#ifdef _SC_GETPW_R_SIZE_MAX
+    pwd_buf_len = sysconf(_SC_GETPW_R_SIZE_MAX);
+#endif
+    if (pwd_buf_len <= 0) {
+        /* sysconf() reports -1 when there is no fixed limit. */
+        pwd_buf_len = DEFAULT_PWD_BUF_SZ; /* per the man page */
+    }
+
+    pwd_buf = alloca(pwd_buf_len);
+
+    if (getpwuid_r(uid, &pwd, pwd_buf, pwd_buf_len, &pwd_result) != 0)
+        return NULL;
+    if (!pwd_result)
+        return NULL; /* no matching entry */
+
+    /* pwd.pw_name points into the stack buffer; hand back a heap copy. */
+    ret = gf_strdup(pwd.pw_name);
+    if (!ret)
+        gf_log(this->name, GF_LOG_ERROR,
+               "gf_strdup failed, failing username "
+               "resolution.");
+    return ret;
+}
+
+/*
+ * Translate a numeric gid into a group name.
+ *
+ * Returns a newly allocated copy of the name, which the caller must
+ * GF_FREE(), or NULL when the gid has no group entry, the lookup fails,
+ * or memory cannot be allocated.
+ */
+char *
+_resolve_group_name(xlator_t *this, gid_t gid)
+{
+    struct group grp;
+    struct group *grp_result = NULL;
+    long grp_buf_len = -1;
+    char *grp_buf = NULL;
+    char *ret = NULL;
+
+    /* Size the scratch buffer for getgrgid_r(). */
+#ifdef _SC_GETGR_R_SIZE_MAX
+    grp_buf_len = sysconf(_SC_GETGR_R_SIZE_MAX);
+#endif
+    if (grp_buf_len <= 0) {
+        /* sysconf() reports -1 when there is no fixed limit. */
+        grp_buf_len = DEFAULT_GRP_BUF_SZ; /* per the man page */
+    }
+
+    grp_buf = alloca(grp_buf_len);
+
+    if (getgrgid_r(gid, &grp, grp_buf, grp_buf_len, &grp_result) != 0)
+        return NULL;
+    if (!grp_result)
+        return NULL; /* no matching entry */
+
+    /* grp.gr_name points into the stack buffer; hand back a heap copy. */
+    ret = gf_strdup(grp.gr_name);
+    if (!ret)
+        gf_log(this->name, GF_LOG_ERROR,
+               "gf_strdup failed, failing group name "
+               "resolution.");
+    return ret;
+}
+
+/*
+ * This function writes out a latency sample to a given file descriptor
+ * and beautifies the output in the process.
+ */
+void
+_io_stats_write_latency_sample(xlator_t *this, ios_sample_t *sample,
+                               FILE *logfp)
+{
+    struct ios_conf *conf = this->private;
+    double when = 0.00;
+    char *xl_name = NULL;
+    char *inst_name = NULL;
+    char *host = NULL;
+    char *ident = NULL;
+    char *port = NULL;
+    char *colon = NULL;
+    char *group = NULL;
+    char *user = NULL;
+
+    /* Seconds since the epoch, with the microseconds as the fraction. */
+    when = (sample->timestamp).tv_sec +
+           ((sample->timestamp).tv_usec / 1000000.0);
+
+    if (sample->identifier[0] == '\0') {
+        host = "Unknown";
+        port = "Unknown";
+    } else {
+        /* Split the identifier at its last ':' on a stack copy: the part
+         * after it is the port, the part before it is resolved to a host
+         * name. */
+        ident = strdupa(sample->identifier);
+        colon = strrchr(ident, ':');
+        if (!colon || colon[1] == '\0')
+            goto err;
+        port = strdupa(colon + 1);
+        if (!port)
+            goto err;
+        *colon = '\0';
+        host = gf_rev_dns_lookup_cached(ident, conf->dnscache);
+        if (!host)
+            host = "Unknown";
+    }
+
+    xl_name = conf->unique_id;
+    if (!xl_name || xl_name[0] == '\0')
+        xl_name = "Unknown";
+
+    inst_name = this->instance_name;
+    if (!inst_name || inst_name[0] == '\0')
+        inst_name = "N/A";
+
+    /* User name, falling back to the numeric uid when it cannot be
+     * resolved. */
+    user = _resolve_username(this, sample->uid);
+    if (!user) {
+        user = GF_MALLOC(30, gf_common_mt_char);
+        if (!user)
+            goto out;
+        sprintf(user, "%d", (int32_t)sample->uid);
+    }
+
+    /* Group name, falling back to the numeric gid likewise. */
+    group = _resolve_group_name(this, sample->gid);
+    if (!group) {
+        group = GF_MALLOC(30, gf_common_mt_char);
+        if (!group)
+            goto out;
+        sprintf(group, "%d", (int32_t)sample->gid);
+    }
+
+    ios_log(this, logfp, "%0.6lf,%s,%s,%0.4lf,%s,%s,%s,%s,%s,%s", when,
+            fop_enum_to_pri_string(sample->fop_type),
+            gf_fop_string(sample->fop_type), sample->elapsed, xl_name,
+            inst_name, user, group, host, port);
+    goto out;
+err:
+    gf_log(this->name, GF_LOG_ERROR, "Error parsing socket identifier");
+out:
+    /* Both are heap strings (or NULL) at this point. */
+    GF_FREE(group);
+    GF_FREE(user);
+}
+
+/*
+ * Takes our current sample buffer in conf->ios_sample_buf, and saves
+ * a reference to this, init's a new buffer, and then dumps out the
+ * contents of the saved reference.
+ */
+int
+io_stats_dump_latency_samples_logfp(xlator_t *this, FILE *logfp)
+{
+    uint64_t i = 0;
+    struct ios_conf *conf = NULL;
+    ios_sample_buf_t *sample_buf = NULL;
+    int ret = 1; /* Default to error */
+
+    conf = this->private;
+
+    /* Save pointer to old buffer; the CS equivalent of
+     * Indiana Jones: https://www.youtube.com/watch?v=Pr-8AP0To4k,
+     * though ours will end better I hope!
+     */
+    sample_buf = conf->ios_sample_buf;
+    if (!sample_buf) {
+        gf_log(this->name, GF_LOG_WARNING, "Sampling buffer is null, bailing!");
+        goto out;
+    }
+
+    /* Empty case, nothing to do, exit. */
+    if (sample_buf->collected == 0) {
+        gf_log(this->name, GF_LOG_DEBUG, "No samples, dump not required.");
+        ret = 0;
+        goto out;
+    }
+
+    /* Init a new buffer, so we are free to work on the one we saved a
+     * reference to above.
+     */
+    if (ios_init_sample_buf(conf) != 0) {
+        gf_log(this->name, GF_LOG_WARNING,
+               "Failed to init new sampling buffer, out of memory?");
+        goto out;
+    }
+
+    /* Wrap-around case, dump from pos to sample_buf->size -1
+     * and then from 0 to sample_buf->pos (covered off by
+     * "simple case")
+     */
+    if (sample_buf->collected > sample_buf->pos + 1) {
+        for (i = sample_buf->pos; i < sample_buf->size; i++) {
+            _io_stats_write_latency_sample(this, &(sample_buf->ios_samples[i]),
+                                           logfp);
+        }
+    }
+
+    /* Simple case: Dump from 0 to sample_buf->pos */
+    for (i = 0; i < sample_buf->pos; i++) {
+        _io_stats_write_latency_sample(this, &(sample_buf->ios_samples[i]),
+                                       logfp);
+    }
+    ios_destroy_sample_buf(sample_buf);
+
+    /* Every sample was written and the old buffer released: report
+     * success instead of falling through with the error default. */
+    ret = 0;
+
+out:
+    return ret;
+}
+
+/*
+ * Write a human-readable report of one statistics snapshot through
+ * ios_log().
+ *
+ * this     - xlator whose private data is the struct ios_conf.
+ * stats    - snapshot to report.
+ * now      - current time; the duration is now - stats->started_at.
+ * interval - -1 selects the "Cumulative stats" heading and appends the
+ *            open-fd summary plus the per-file lists; any other value is
+ *            printed as the interval number.
+ * logfp    - destination handed to ios_log().
+ *
+ * Always returns 0.
+ */
+int
+io_stats_dump_global_to_logfp(xlator_t *this, struct ios_global_stats *stats,
+                              time_t now, int interval, FILE *logfp)
+{
+    int i = 0;
+    int per_line = 0;
+    int index = 0;
+    struct ios_stat_head *list_head = NULL;
+    struct ios_conf *conf = NULL;
+    char timestr[GF_TIMESTR_SIZE] = {
+        0,
+    };
+    char str_header[128] = {0};
+    char str_read[128] = {0};
+    char str_write[128] = {0};
+    uint64_t fop_hits = 0;
+    uint64_t block_count_read = 0;
+    uint64_t block_count_write = 0;
+
+    conf = this->private;
+
+    if (interval == -1)
+        ios_log(this, logfp, "\n=== Cumulative stats ===");
+    else
+        ios_log(this, logfp, "\n=== Interval %d stats ===", interval);
+    ios_log(this, logfp, "      Duration : %" PRIu64 " secs",
+            (uint64_t)(now - stats->started_at));
+    ios_log(this, logfp, "     BytesRead : %" GF_PRI_ATOMIC,
+            GF_ATOMIC_GET(stats->data_read));
+    ios_log(this, logfp, "  BytesWritten : %" GF_PRI_ATOMIC "\n",
+            GF_ATOMIC_GET(stats->data_written));
+
+    /* Block-size histogram: three rows (header, reads, writes) are built
+     * side by side. Each row starts with a 14 character label and every
+     * non-empty bucket appends an 18 character column; 'index' is the
+     * write offset of the next column.
+     */
+    snprintf(str_header, sizeof(str_header), "%-12s %c", "Block Size", ':');
+    snprintf(str_read, sizeof(str_read), "%-12s %c", "Read Count", ':');
+    snprintf(str_write, sizeof(str_write), "%-12s %c", "Write Count", ':');
+    index = 14;
+    for (i = 0; i < IOS_BLOCK_COUNT_SIZE; i++) {
+        block_count_read = GF_ATOMIC_GET(stats->block_count_read[i]);
+        block_count_write = GF_ATOMIC_GET(stats->block_count_write[i]);
+        if ((block_count_read == 0) && (block_count_write == 0))
+            continue;
+        per_line++;
+
+        /* Shift an unsigned 1 so the highest bucket cannot overflow a
+         * signed int and print as a negative size.
+         */
+        snprintf(str_header + index, sizeof(str_header) - index, "%16uB+",
+                 (1U << i));
+        if (block_count_read)
+            snprintf(str_read + index, sizeof(str_read) - index, "%18" PRIu64,
+                     block_count_read);
+        else
+            snprintf(str_read + index, sizeof(str_read) - index, "%18s", "0");
+        if (block_count_write)
+            snprintf(str_write + index, sizeof(str_write) - index,
+                     "%18" GF_PRI_ATOMIC, block_count_write);
+        else
+            snprintf(str_write + index, sizeof(str_write) - index, "%18s", "0");
+
+        index += 18;
+        /* Flush after three columns and restart the rows. */
+        if (per_line == 3) {
+            ios_log(this, logfp, "%s", str_header);
+            ios_log(this, logfp, "%s", str_read);
+            ios_log(this, logfp, "%s\n", str_write);
+
+            snprintf(str_header, sizeof(str_header), "%-12s %c", "Block Size",
+                     ':');
+            snprintf(str_read, sizeof(str_read), "%-12s %c", "Read Count", ':');
+            snprintf(str_write, sizeof(str_write), "%-12s %c", "Write Count",
+                     ':');
+
+            index = 14;
+            per_line = 0;
+        }
+    }
+
+    /* Flush a partially filled last group of columns. */
+    if (per_line != 0) {
+        ios_log(this, logfp, "%s", str_header);
+        ios_log(this, logfp, "%s", str_read);
+        ios_log(this, logfp, "%s\n", str_write);
+    }
+
+    ios_log(this, logfp, "%-13s %10s %14s %14s %14s", "Fop", "Call Count",
+            "Avg-Latency", "Min-Latency", "Max-Latency");
+    ios_log(this, logfp, "%-13s %10s %14s %14s %14s", "---", "----------",
+            "-----------", "-----------", "-----------");
+
+    /* One line per fop that was hit; latencies print as "0" when no
+     * average has been recorded for that fop.
+     */
+    for (i = 0; i < GF_FOP_MAXVALUE; i++) {
+        fop_hits = GF_ATOMIC_GET(stats->fop_hits[i]);
+        if (fop_hits && !stats->latency[i].avg)
+            ios_log(this, logfp,
+                    "%-13s %10" GF_PRI_ATOMIC
+                    " %11s "
+                    "us %11s us %11s us",
+                    gf_fop_list[i], fop_hits, "0", "0", "0");
+        else if (fop_hits && stats->latency[i].avg)
+            ios_log(this, logfp,
+                    "%-13s %10" GF_PRI_ATOMIC
+                    " "
+                    "%11.2lf us %11.2lf us %11.2lf us",
+                    gf_fop_list[i], fop_hits, stats->latency[i].avg,
+                    stats->latency[i].min, stats->latency[i].max);
+    }
+
+    /* Upcall events only have a hit count; latency columns are "0". */
+    for (i = 0; i < GF_UPCALL_FLAGS_MAXVALUE; i++) {
+        fop_hits = GF_ATOMIC_GET(stats->upcall_hits[i]);
+        if (fop_hits)
+            ios_log(this, logfp,
+                    "%-13s %10" PRIu64
+                    " %11s "
+                    "us %11s us %11s us",
+                    gf_upcall_list[i], fop_hits, "0", "0", "0");
+    }
+
+    ios_log(this, logfp,
+            "------ ----- ----- ----- ----- ----- ----- ----- "
+            " ----- ----- ----- -----\n");
+
+    if (interval == -1) {
+        /* The open-fd counters live in conf->cumulative and are read
+         * under conf->lock.
+         */
+        LOCK(&conf->lock);
+        {
+            gf_time_fmt_tv(timestr, sizeof timestr,
+                           &conf->cumulative.max_openfd_time, gf_timefmt_FT);
+            ios_log(this, logfp,
+                    "Current open fd's: %" PRId64 " Max open fd's: %" PRId64
+                    " time %s",
+                    conf->cumulative.nr_opens, conf->cumulative.max_nr_opens,
+                    timestr);
+        }
+        UNLOCK(&conf->lock);
+        ios_log(this, logfp, "\n==========Open File Stats========");
+        ios_log(this, logfp, "\nCOUNT:  \t  FILE NAME");
+        list_head = &conf->list[IOS_STATS_TYPE_OPEN];
+        ios_dump_file_stats(list_head, this, logfp);
+
+        ios_log(this, logfp, "\n==========Read File Stats========");
+        ios_log(this, logfp, "\nCOUNT:  \t  FILE NAME");
+        list_head = &conf->list[IOS_STATS_TYPE_READ];
+        ios_dump_file_stats(list_head, this, logfp);
+
+        ios_log(this, logfp, "\n==========Write File Stats========");
+        ios_log(this, logfp, "\nCOUNT:  \t  FILE NAME");
+        list_head = &conf->list[IOS_STATS_TYPE_WRITE];
+        ios_dump_file_stats(list_head, this, logfp);
+
+        ios_log(this, logfp, "\n==========Directory open stats========");
+        ios_log(this, logfp, "\nCOUNT:  \t  DIRECTORY NAME");
+        list_head = &conf->list[IOS_STATS_TYPE_OPENDIR];
+        ios_dump_file_stats(list_head, this, logfp);
+
+        ios_log(this, logfp, "\n========Directory readdirp Stats=======");
+        ios_log(this, logfp, "\nCOUNT:  \t  DIRECTORY NAME");
+        list_head = &conf->list[IOS_STATS_TYPE_READDIRP];
+        ios_dump_file_stats(list_head, this, logfp);
+
+        ios_log(this, logfp, "\n========Read Throughput File Stats=====");
+        ios_log(this, logfp,
+                "\nTIMESTAMP \t\t\t THROUGHPUT(KBPS)"
+                "\tFILE NAME");
+        list_head = &conf->thru_list[IOS_STATS_THRU_READ];
+        ios_dump_throughput_stats(list_head, this, logfp, IOS_STATS_THRU_READ);
+
+        ios_log(this, logfp, "\n======Write Throughput File Stats======");
+        ios_log(this, logfp,
+                "\nTIMESTAMP \t\t\t THROUGHPUT(KBPS)"
+                "\tFILE NAME");
+        list_head = &conf->thru_list[IOS_STATS_THRU_WRITE];
+        ios_dump_throughput_stats(list_head, this, logfp, IOS_STATS_THRU_WRITE);
+    }
+    return 0;
+}
+
+/*
+ * Export one statistics snapshot into dict.
+ *
+ * this     - xlator, used for log messages.
+ * stats    - snapshot to export.
+ * now      - current time; the duration is now - stats->started_at.
+ * interval - interval number, or -1 for the cumulative snapshot. It is
+ *            stored under "cumulative" (when -1) or "interval" and is
+ *            the prefix of every other key.
+ * dict     - destination dictionary.
+ *
+ * Keys written (N = interval): "N-duration", "N-total-read",
+ * "N-total-write", "N-read-<size>" / "N-write-<size>" for non-empty block
+ * buckets, "N-<fop>-hits" and "N-<fop>-{avg,min,max}latency" for fops that
+ * were hit, and "N-<idx>-upcall-hits".
+ *
+ * Returns the result of the last dict operation: 0 on success, the
+ * failing call's value otherwise.
+ */
+int
+io_stats_dump_global_to_dict(xlator_t *this, struct ios_global_stats *stats,
+                             time_t now, int interval, dict_t *dict)
+{
+    int ret = 0;
+    char key[64] = {0};
+    uint64_t sec = 0;
+    int i = 0;
+    uint64_t count = 0;
+    uint64_t fop_hits = 0;
+
+    GF_ASSERT(stats);
+    GF_ASSERT(now);
+    GF_ASSERT(dict);
+    GF_ASSERT(this);
+
+    if (interval == -1)
+        snprintf(key, sizeof(key), "cumulative");
+    else
+        snprintf(key, sizeof(key), "interval");
+    ret = dict_set_int32(dict, key, interval);
+    /* NOTE(review): a failure here is only logged and the dump continues;
+     * this looks like deliberate best effort and is kept as is.
+     */
+    if (ret)
+        gf_log(this->name, GF_LOG_ERROR,
+               "failed to set "
+               "interval %d",
+               interval);
+
+    snprintf(key, sizeof(key), "%d-duration", interval);
+    sec = now - stats->started_at;
+    ret = dict_set_uint64(dict, key, sec);
+    if (ret) {
+        gf_log(this->name, GF_LOG_ERROR,
+               "failed to set "
+               "duration(%d) - %" PRIu64,
+               interval, sec);
+        goto out;
+    }
+
+    snprintf(key, sizeof(key), "%d-total-read", interval);
+    ret = dict_set_uint64(dict, key, GF_ATOMIC_GET(stats->data_read));
+    if (ret) {
+        gf_log(this->name, GF_LOG_ERROR,
+               "failed to set total "
+               "read(%d) - %" GF_PRI_ATOMIC,
+               interval, GF_ATOMIC_GET(stats->data_read));
+        goto out;
+    }
+
+    snprintf(key, sizeof(key), "%d-total-write", interval);
+    ret = dict_set_uint64(dict, key, GF_ATOMIC_GET(stats->data_written));
+    if (ret) {
+        gf_log(this->name, GF_LOG_ERROR,
+               "failed to set total "
+               "write(%d) - %" GF_PRI_ATOMIC,
+               interval, GF_ATOMIC_GET(stats->data_written));
+        goto out;
+    }
+
+    /* Both histograms are walked with IOS_BLOCK_COUNT_SIZE so the read
+     * and write loops always cover the same buckets. The key keeps the
+     * original "(1 << i)" spelling because whoever reads the dictionary
+     * presumably rebuilds the key the same way - TODO confirm.
+     */
+    for (i = 0; i < IOS_BLOCK_COUNT_SIZE; i++) {
+        count = GF_ATOMIC_GET(stats->block_count_read[i]);
+        if (count) {
+            snprintf(key, sizeof(key), "%d-read-%d", interval, (1 << i));
+            ret = dict_set_uint64(dict, key, count);
+            if (ret) {
+                gf_log(this->name, GF_LOG_ERROR,
+                       "failed to "
+                       "set read-%db+, with: %" PRIu64,
+                       (1 << i), count);
+                goto out;
+            }
+        }
+    }
+
+    for (i = 0; i < IOS_BLOCK_COUNT_SIZE; i++) {
+        count = GF_ATOMIC_GET(stats->block_count_write[i]);
+        if (count) {
+            snprintf(key, sizeof(key), "%d-write-%d", interval, (1 << i));
+            ret = dict_set_uint64(dict, key, count);
+            if (ret) {
+                gf_log(this->name, GF_LOG_ERROR,
+                       "failed to "
+                       "set write-%db+, with: %" PRIu64,
+                       (1 << i), count);
+                goto out;
+            }
+        }
+    }
+
+    /* Per-fop hit counts; latency keys are only added when an average
+     * has been recorded for the fop.
+     */
+    for (i = 0; i < GF_FOP_MAXVALUE; i++) {
+        fop_hits = GF_ATOMIC_GET(stats->fop_hits[i]);
+        if (fop_hits == 0)
+            continue;
+        snprintf(key, sizeof(key), "%d-%d-hits", interval, i);
+        ret = dict_set_uint64(dict, key, fop_hits);
+        if (ret) {
+            gf_log(this->name, GF_LOG_ERROR,
+                   "failed to set "
+                   "%s-fop-hits: %" GF_PRI_ATOMIC,
+                   gf_fop_list[i], fop_hits);
+            goto out;
+        }
+
+        if (stats->latency[i].avg == 0)
+            continue;
+        snprintf(key, sizeof(key), "%d-%d-avglatency", interval, i);
+        ret = dict_set_double(dict, key, stats->latency[i].avg);
+        if (ret) {
+            gf_log(this->name, GF_LOG_ERROR,
+                   "failed to set %s "
+                   "avglatency(%d) with %f",
+                   gf_fop_list[i], interval, stats->latency[i].avg);
+            goto out;
+        }
+        snprintf(key, sizeof(key), "%d-%d-minlatency", interval, i);
+        ret = dict_set_double(dict, key, stats->latency[i].min);
+        if (ret) {
+            gf_log(this->name, GF_LOG_ERROR,
+                   "failed to set %s "
+                   "minlatency(%d) with %f",
+                   gf_fop_list[i], interval, stats->latency[i].min);
+            goto out;
+        }
+        snprintf(key, sizeof(key), "%d-%d-maxlatency", interval, i);
+        ret = dict_set_double(dict, key, stats->latency[i].max);
+        if (ret) {
+            gf_log(this->name, GF_LOG_ERROR,
+                   "failed to set %s "
+                   "maxlatency(%d) with %f",
+                   gf_fop_list[i], interval, stats->latency[i].max);
+            goto out;
+        }
+    }
+    for (i = 0; i < GF_UPCALL_FLAGS_MAXVALUE; i++) {
+        fop_hits = GF_ATOMIC_GET(stats->upcall_hits[i]);
+        if (fop_hits == 0)
+            continue;
+        snprintf(key, sizeof(key), "%d-%d-upcall-hits", interval, i);
+        ret = dict_set_uint64(dict, key, fop_hits);
+        if (ret) {
+            gf_log(this->name, GF_LOG_ERROR,
+                   "failed to "
+                   "set %s-upcall-hits: %" PRIu64,
+                   gf_upcall_list[i], fop_hits);
+            goto out;
+        }
+    }
+out:
+    gf_log(this->name, GF_LOG_DEBUG, "returning %d", ret);
+    return ret;
+}
+
+/*
+ * Route one statistics snapshot to the dumper that matches args->type.
+ *
+ * JSON and plain file dumps receive args->u.logfp, dictionary dumps
+ * receive args->u.dict. Returns whatever the selected dumper returns;
+ * an unrecognised type trips GF_ASSERT(0) and yields -1.
+ */
+int
+io_stats_dump_global(xlator_t *this, struct ios_global_stats *stats, time_t now,
+                     int interval, struct ios_dump_args *args)
+{
+    GF_ASSERT(args);
+    GF_ASSERT(now);
+    GF_ASSERT(stats);
+    GF_ASSERT(this);
+
+    if (args->type == IOS_DUMP_TYPE_JSON_FILE)
+        return io_stats_dump_global_to_json_logfp(this, stats, now, interval,
+                                                  args->u.logfp);
+
+    if (args->type == IOS_DUMP_TYPE_FILE)
+        return io_stats_dump_global_to_logfp(this, stats, now, interval,
+                                             args->u.logfp);
+
+    if (args->type == IOS_DUMP_TYPE_DICT)
+        return io_stats_dump_global_to_dict(this, stats, now, interval,
+                                            args->u.dict);
+
+    /* No dumper exists for this type. */
+    GF_ASSERT(0);
+    return -1;
+}
+
+/*
+ * Fill in a dump descriptor.
+ *
+ * Stores type in args->type and output in the union member that belongs
+ * to that type: u.logfp for the two file flavours, u.dict for the
+ * dictionary flavour. Returns 0 on success and -1 (after GF_ASSERT(0))
+ * for any other type.
+ */
+int
+ios_dump_args_init(struct ios_dump_args *args, ios_dump_type_t type,
+                   void *output)
+{
+    GF_ASSERT(args);
+    GF_ASSERT(type > IOS_DUMP_TYPE_NONE && type < IOS_DUMP_TYPE_MAX);
+    GF_ASSERT(output);
+
+    args->type = type;
+
+    if (args->type == IOS_DUMP_TYPE_JSON_FILE ||
+        args->type == IOS_DUMP_TYPE_FILE) {
+        args->u.logfp = output;
+        return 0;
+    }
+
+    if (args->type == IOS_DUMP_TYPE_DICT) {
+        args->u.dict = output;
+        return 0;
+    }
+
+    GF_ASSERT(0);
+    return -1;
+}
+
+/*
+ * Reset a statistics block: every byte is zeroed and the start time of
+ * the new measurement window is set to now.
+ */
+static void
+ios_global_stats_clear(struct ios_global_stats *stats, time_t now)
+{
+    GF_ASSERT(stats);
+    GF_ASSERT(now);
+
+    memset(stats, 0, sizeof(struct ios_global_stats));
+
+    stats->started_at = now;
+}
+
+/*
+ * Snapshot the requested statistics and dump them through args.
+ *
+ * op selects the cumulative block, the incremental block, or both
+ * (GF_IOS_INFO_ALL). The blocks are copied while conf->lock is held and
+ * dumped after it is released. Unless is_peek is set, taking the
+ * incremental snapshot also advances conf->increment and clears
+ * conf->incremental so a new interval starts at 'now'.
+ *
+ * The cumulative block is dumped with interval -1, the incremental block
+ * with the interval number read before the increment. Always returns 0;
+ * results of the individual dumps are not propagated.
+ */
+int
+io_stats_dump(xlator_t *this, struct ios_dump_args *args, ios_info_op_t op,
+              gf_boolean_t is_peek)
+{
+    struct ios_global_stats cumulative = {};
+    struct ios_global_stats incremental = {};
+    struct ios_conf *conf = NULL;
+    time_t now = 0;
+    int increment = 0;
+    int want_cumulative = 0;
+    int want_incremental = 0;
+
+    GF_ASSERT(this);
+    GF_ASSERT(args);
+    GF_ASSERT(args->type > IOS_DUMP_TYPE_NONE);
+    GF_ASSERT(args->type < IOS_DUMP_TYPE_MAX);
+
+    conf = this->private;
+    now = gf_time();
+
+    want_cumulative = (op == GF_IOS_INFO_ALL || op == GF_IOS_INFO_CUMULATIVE);
+    want_incremental = (op == GF_IOS_INFO_ALL ||
+                        op == GF_IOS_INFO_INCREMENTAL);
+
+    LOCK(&conf->lock);
+    {
+        if (want_cumulative)
+            cumulative = conf->cumulative;
+
+        if (want_incremental) {
+            incremental = conf->incremental;
+            increment = conf->increment;
+
+            if (!is_peek) {
+                /* Consume the interval: the number reported is the one
+                 * captured above, the counter then moves on.
+                 */
+                conf->increment++;
+                ios_global_stats_clear(&conf->incremental, now);
+            }
+        }
+    }
+    UNLOCK(&conf->lock);
+
+    if (want_cumulative)
+        io_stats_dump_global(this, &cumulative, now, -1, args);
+
+    if (want_incremental)
+        io_stats_dump_global(this, &incremental, now, increment, args);
+
+    return 0;
+}
+
+/*
+ * Log the per-fd statistics held in iosfd at INFO level.
+ *
+ * Nothing is logged when conf->dump_fd_stats is off or iosfd is NULL.
+ * Filename, lifetime, byte totals and block-size buckets are each
+ * printed only when they are non-zero / non-NULL. Always returns 0.
+ */
+int
+io_stats_dump_fd(xlator_t *this, struct ios_fd *iosfd)
+{
+    struct ios_conf *conf = NULL;
+    struct timeval now;
+    int i = 0;
+    double usecs = 0;
+    uint64_t data_read = 0;
+    uint64_t data_written = 0;
+    uint64_t block_count_read = 0;
+    uint64_t block_count_write = 0;
+
+    conf = this->private;
+
+    if (!conf->dump_fd_stats)
+        return 0;
+
+    if (!iosfd)
+        return 0;
+
+    gettimeofday(&now, NULL);
+    usecs = gf_tvdiff(&iosfd->opened_at, &now);
+
+    gf_log(this->name, GF_LOG_INFO, "--- fd stats ---");
+
+    if (iosfd->filename)
+        gf_log(this->name, GF_LOG_INFO, "      Filename : %s", iosfd->filename);
+
+    /* NOTE(review): the variable is named usecs but the message labels the
+     * value "secs" - confirm the unit returned by gf_tvdiff().
+     */
+    if (usecs)
+        gf_log(this->name, GF_LOG_INFO, "      Lifetime : %lf secs", usecs);
+
+    data_read = GF_ATOMIC_GET(iosfd->data_read);
+    if (data_read)
+        gf_log(this->name, GF_LOG_INFO, "     BytesRead : %" PRIu64 " bytes",
+               data_read);
+
+    data_written = GF_ATOMIC_GET(iosfd->data_written);
+    if (data_written)
+        gf_log(this->name, GF_LOG_INFO, "  BytesWritten : %" PRIu64 " bytes",
+               data_written);
+
+    /* Both histograms are sized by IOS_BLOCK_COUNT_SIZE, so both loops
+     * use it rather than a literal bucket count.
+     */
+    for (i = 0; i < IOS_BLOCK_COUNT_SIZE; i++) {
+        block_count_read = GF_ATOMIC_GET(iosfd->block_count_read[i]);
+        if (block_count_read)
+            gf_log(this->name, GF_LOG_INFO,
+                   " Read %06db+ :"
+                   "%" PRIu64,
+                   (1 << i), block_count_read);
+    }
+    for (i = 0; i < IOS_BLOCK_COUNT_SIZE; i++) {
+        block_count_write = GF_ATOMIC_GET(iosfd->block_count_write[i]);
+        if (block_count_write)
+            gf_log(this->name, GF_LOG_INFO, "Write %06db+ : %" PRIu64, (1 << i),
+                   block_count_write);
+    }
+    return 0;
+}
+
+/*
+ * Possibly record one latency sample in the circular sample buffer.
+ *
+ * Every call counts as an observation; a sample is stored only when
+ * sampling is enabled (conf->ios_sample_interval != 0) and the number of
+ * observations so far is a multiple of that interval. The sample carries
+ * the elapsed time, fop type, the caller's uid/gid and identifier taken
+ * from frame->root, and the frame's begin timestamp.
+ *
+ * All buffer accesses happen with conf->ios_sampling_lock held. The call
+ * does nothing when no sample buffer is installed.
+ */
+void
+collect_ios_latency_sample(struct ios_conf *conf, glusterfs_fop_t fop_type,
+                           double elapsed, call_frame_t *frame)
+{
+    ios_sample_buf_t *ios_sample_buf = NULL;
+    ios_sample_t *ios_sample = NULL;
+    struct timespec *timestamp = NULL;
+    call_stack_t *root = NULL;
+
+    LOCK(&conf->ios_sampling_lock);
+
+    /* Fetch the buffer only once the lock is held so a buffer that is
+     * being swapped out by the dump path is not picked up here
+     * (assumes the swap is done under this same lock - TODO confirm).
+     */
+    ios_sample_buf = conf->ios_sample_buf;
+    if (!ios_sample_buf)
+        goto unlock;
+
+    if (conf->ios_sample_interval == 0 ||
+        ios_sample_buf->observed % conf->ios_sample_interval != 0)
+        goto out;
+
+    timestamp = &frame->begin;
+    root = frame->root;
+
+    ios_sample = &(ios_sample_buf->ios_samples[ios_sample_buf->pos]);
+    ios_sample->elapsed = elapsed;
+    ios_sample->fop_type = fop_type;
+    ios_sample->uid = root->uid;
+    ios_sample->gid = root->gid;
+    (ios_sample->timestamp).tv_sec = timestamp->tv_sec;
+    /* timespec carries nanoseconds, the sample stores microseconds. */
+    (ios_sample->timestamp).tv_usec = timestamp->tv_nsec / 1000;
+    memcpy(&ios_sample->identifier, &root->identifier,
+           sizeof(root->identifier));
+
+    /* We've reached the end of the circular buffer, start from the
+     * beginning. */
+    if (ios_sample_buf->pos == (ios_sample_buf->size - 1))
+        ios_sample_buf->pos = 0;
+    else
+        ios_sample_buf->pos++;
+    ios_sample_buf->collected++;
+out:
+    ios_sample_buf->observed++;
+unlock:
+    UNLOCK(&conf->ios_sampling_lock);
+    return;
+}
+
+/*
+ * Fold one latency measurement for fop 'op' into stats.
+ *
+ * Adds elapsed to the running total, widens min/max as needed (a min of
+ * 0 is treated as "not set yet") and updates the running average as
+ * avg += (elapsed - avg) / hits, where hits is the current
+ * stats->fop_hits[op].
+ */
+static void
+update_ios_latency_stats(struct ios_global_stats *stats, double elapsed,
+                         glusterfs_fop_t op)
+{
+    double avg;
+    uint64_t hits = 0;
+
+    GF_ASSERT(stats);
+
+    stats->latency[op].total += elapsed;
+
+    if (!stats->latency[op].min)
+        stats->latency[op].min = elapsed;
+    if (stats->latency[op].min > elapsed)
+        stats->latency[op].min = elapsed;
+    if (stats->latency[op].max < elapsed)
+        stats->latency[op].max = elapsed;
+
+    avg = stats->latency[op].avg;
+
+    /* The hit counter can read as 0 here, e.g. when the block has just
+     * been zeroed by ios_global_stats_clear(). Dividing by it would store
+     * inf/NaN in the average for good, so treat this as the first
+     * sample instead.
+     */
+    hits = GF_ATOMIC_GET(stats->fop_hits[op]);
+    if (hits == 0)
+        hits = 1;
+
+    stats->latency[op].avg = avg + (elapsed - avg) / hits;
+}
+
+/*
+ * Account the latency of a finished fop.
+ *
+ * The elapsed time is gf_tsdiff(frame->begin, frame->end) divided by
+ * 1000.0 (presumably nanoseconds turned into microseconds - verify
+ * against gf_tsdiff). It is folded into both the cumulative and the
+ * incremental statistics and offered to the latency sampler.
+ * Always returns 0.
+ */
+int
+update_ios_latency(struct ios_conf *conf, call_frame_t *frame,
+                   glusterfs_fop_t op)
+{
+    const double elapsed = gf_tsdiff(&frame->begin, &frame->end) / 1000.0;
+
+    update_ios_latency_stats(&conf->cumulative, elapsed, op);
+    update_ios_latency_stats(&conf->incremental, elapsed, op);
+
+    collect_ios_latency_sample(conf, op, elapsed, frame);
+
+    return 0;
+}
+
+/*
+ * Export one of the per-file lists into resp.
+ *
+ * this     - xlator whose private data is the struct ios_conf.
+ * resp     - dictionary that receives the result.
+ * flags    - which list to export. IOS_STATS_TYPE_OPEN additionally
+ *            stores "current-open", "max-open" and "max-openfd-time";
+ *            the two throughput types also store a timestamp per entry.
+ * list_cnt - maximum number of entries to export; 0 exports none.
+ *
+ * Keys written: "top-op", then per entry n (1-based) "filename-n",
+ * "value-n" and, for throughput lists, "time-sec-n" / "time-usec-n",
+ * and finally "members" with the number of exported entries.
+ *
+ * Returns 0 on success, -1 for an unknown list type or an allocation
+ * failure, or the value of the failing dict operation.
+ */
+int32_t
+io_stats_dump_stats_to_dict(xlator_t *this, dict_t *resp,
+                            ios_stats_type_t flags, int32_t list_cnt)
+{
+    struct ios_conf *conf = NULL;
+    int cnt = 0;
+    char key[32];
+    int keylen;
+    struct ios_stat_head *list_head = NULL;
+    struct ios_stat_list *entry = NULL;
+    int ret = -1;
+    ios_stats_thru_t index = IOS_STATS_THRU_MAX;
+    char timestr[GF_TIMESTR_SIZE] = {
+        0,
+    };
+    char *dict_timestr = NULL;
+
+    conf = this->private;
+
+    switch (flags) {
+        case IOS_STATS_TYPE_OPEN:
+            list_head = &conf->list[IOS_STATS_TYPE_OPEN];
+            LOCK(&conf->lock);
+            {
+                ret = dict_set_uint64(resp, "current-open",
+                                      conf->cumulative.nr_opens);
+                if (ret)
+                    goto unlock;
+                ret = dict_set_uint64(resp, "max-open",
+                                      conf->cumulative.max_nr_opens);
+                if (ret)
+                    goto unlock;
+
+                gf_time_fmt_tv(timestr, sizeof timestr,
+                               &conf->cumulative.max_openfd_time,
+                               gf_timefmt_FT);
+
+                dict_timestr = gf_strdup(timestr);
+                if (!dict_timestr) {
+                    /* Without this the previous (successful) ret would
+                     * make the allocation failure look like success.
+                     */
+                    ret = -1;
+                    goto unlock;
+                }
+                /* NOTE(review): dict_timestr is presumably owned by the
+                 * dictionary from here on, including on failure - confirm
+                 * before adding a free on the error path.
+                 */
+                ret = dict_set_dynstr(resp, "max-openfd-time", dict_timestr);
+                if (ret)
+                    goto unlock;
+            }
+        unlock:
+            UNLOCK(&conf->lock);
+            /* Do not proceed if we came here because of some error
+             * during the dict operation */
+            if (ret)
+                goto out;
+            break;
+        case IOS_STATS_TYPE_READ:
+            list_head = &conf->list[IOS_STATS_TYPE_READ];
+            break;
+        case IOS_STATS_TYPE_WRITE:
+            list_head = &conf->list[IOS_STATS_TYPE_WRITE];
+            break;
+        case IOS_STATS_TYPE_OPENDIR:
+            list_head = &conf->list[IOS_STATS_TYPE_OPENDIR];
+            break;
+        case IOS_STATS_TYPE_READDIRP:
+            list_head = &conf->list[IOS_STATS_TYPE_READDIRP];
+            break;
+        case IOS_STATS_TYPE_READ_THROUGHPUT:
+            list_head = &conf->thru_list[IOS_STATS_THRU_READ];
+            index = IOS_STATS_THRU_READ;
+            break;
+        case IOS_STATS_TYPE_WRITE_THROUGHPUT:
+            list_head = &conf->thru_list[IOS_STATS_THRU_WRITE];
+            index = IOS_STATS_THRU_WRITE;
+            break;
+
+        default:
+            goto out;
+    }
+    ret = dict_set_int32_sizen(resp, "top-op", flags);
+    /* Stop on a failed "top-op" too; otherwise the list loop below would
+     * overwrite ret and hide the error.
+     */
+    if (ret || !list_cnt)
+        goto out;
+    LOCK(&list_head->lock);
+    {
+        list_for_each_entry(entry, &list_head->iosstats->list, list)
+        {
+            cnt++;
+            keylen = snprintf(key, sizeof(key), "filename-%d", cnt);
+            ret = dict_set_strn(resp, key, keylen, entry->iosstat->filename);
+            if (ret)
+                goto unlock_list_head;
+            snprintf(key, sizeof(key), "value-%d", cnt);
+            ret = dict_set_uint64(resp, key, entry->value);
+            if (ret)
+                goto unlock_list_head;
+            /* index is only set for the throughput lists. */
+            if (index != IOS_STATS_THRU_MAX) {
+                keylen = snprintf(key, sizeof(key), "time-sec-%d", cnt);
+                ret = dict_set_int32n(
+                    resp, key, keylen,
+                    entry->iosstat->thru_counters[index].time.tv_sec);
+                if (ret)
+                    goto unlock_list_head;
+                keylen = snprintf(key, sizeof(key), "time-usec-%d", cnt);
+                ret = dict_set_int32n(
+                    resp, key, keylen,
+                    entry->iosstat->thru_counters[index].time.tv_usec);
+                if (ret)
+                    goto unlock_list_head;
+            }
+            if (cnt == list_cnt)
+                break;
+        }
+    }
+unlock_list_head:
+    UNLOCK(&list_head->lock);
+    /* ret is !=0 if some dict operation in the above critical region
+     * failed. */
+    if (ret)
+        goto out;
+    ret = dict_set_int32_sizen(resp, "members", cnt);
+out:
+    return ret;
+}
+
+/*
+ * Allocate a per-inode statistics record and attach it to inode.
+ *
+ * The record gets its own copy of path and of gfid, an initialised lock
+ * and zeroed counters, and is then stored in the inode context for this
+ * xlator. Returns the new record, or NULL when it could not be
+ * allocated.
+ *
+ * NOTE(review): the result of gf_strdup() is not checked, so filename
+ * may be NULL in the returned record.
+ */
+static struct ios_stat *
+ios_init_iosstat(xlator_t *this, char *path, uuid_t gfid, inode_t *inode)
+{
+    struct ios_stat *record = NULL;
+    int slot = 0;
+
+    record = GF_CALLOC(1, sizeof(*record), gf_io_stats_mt_ios_stat);
+    if (!record)
+        return NULL;
+
+    record->filename = gf_strdup(path);
+    gf_uuid_copy(record->gfid, gfid);
+    LOCK_INIT(&record->lock);
+
+    while (slot < IOS_STATS_TYPE_MAX) {
+        GF_ATOMIC_INIT(record->counters[slot], 0);
+        slot++;
+    }
+
+    ios_inode_ctx_set(inode, this, record);
+
+    return record;
+}
+
+/*
+ * Completion callback for the create fop.
+ *
+ * frame->local carries the path string of the file being created. On
+ * success a struct ios_fd is allocated, takes ownership of that string
+ * as its filename, is stored in the fd context, and the open-fd
+ * counters in conf->cumulative are updated under conf->lock. A per-inode
+ * statistics record is then created on a best-effort basis. On failure
+ * (op_ret < 0 or allocation failure) the path string is freed.
+ *
+ * In every case the call is accounted through UPDATE_PROFILE_STATS and
+ * the results are unwound unchanged. Always returns 0.
+ */
+int
+io_stats_create_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+                    int32_t op_ret, int32_t op_errno, fd_t *fd, inode_t *inode,
+                    struct iatt *buf, struct iatt *preparent,
+                    struct iatt *postparent, dict_t *xdata)
+{
+    struct ios_fd *iosfd = NULL;
+    char *path = NULL;
+    struct ios_conf *conf = NULL;
+    int i = 0;
+
+    conf = this->private;
+
+    path = frame->local;
+    frame->local = NULL;
+
+    if (!path)
+        goto unwind;
+
+    if (op_ret < 0) {
+        GF_FREE(path);
+        goto unwind;
+    }
+
+    iosfd = GF_CALLOC(1, sizeof(*iosfd), gf_io_stats_mt_ios_fd);
+    if (!iosfd) {
+        GF_FREE(path);
+        goto unwind;
+    }
+
+    /* From here on the path string belongs to iosfd. */
+    iosfd->filename = path;
+    /* Initialise the counters the same way io_stats_open_cbk does. */
+    GF_ATOMIC_INIT(iosfd->data_read, 0);
+    GF_ATOMIC_INIT(iosfd->data_written, 0);
+    for (i = 0; i < IOS_BLOCK_COUNT_SIZE; i++) {
+        GF_ATOMIC_INIT(iosfd->block_count_write[i], 0);
+        GF_ATOMIC_INIT(iosfd->block_count_read[i], 0);
+    }
+    gettimeofday(&iosfd->opened_at, NULL);
+
+    ios_fd_ctx_set(fd, this, iosfd);
+    LOCK(&conf->lock);
+    {
+        conf->cumulative.nr_opens++;
+        if (conf->cumulative.nr_opens > conf->cumulative.max_nr_opens) {
+            conf->cumulative.max_nr_opens = conf->cumulative.nr_opens;
+            conf->cumulative.max_openfd_time = iosfd->opened_at;
+        }
+    }
+    UNLOCK(&conf->lock);
+
+    /* ios_init_iosstat() duplicates the path for its own use. If it
+     * fails, path must NOT be freed: iosfd->filename still points at it
+     * and is read later (e.g. by io_stats_dump_fd).
+     */
+    (void)ios_init_iosstat(this, path, buf->ia_gfid, inode);
+
+unwind:
+    UPDATE_PROFILE_STATS(frame, CREATE);
+    STACK_UNWIND_STRICT(create, frame, op_ret, op_errno, fd, inode, buf,
+                        preparent, postparent, xdata);
+    return 0;
+}
+
+/*
+ * Completion callback for the open fop.
+ *
+ * frame->local carries the path string of the opened file. On success a
+ * struct ios_fd is allocated, takes the path as its filename, gets its
+ * counters initialised and is stored in the fd context. The inode's
+ * statistics record is looked up (and created if missing), the open-fd
+ * counters in conf->cumulative are updated under conf->lock, and the
+ * record's open statistic is bumped. On failure (op_ret < 0 or
+ * allocation failure) the path string is freed.
+ *
+ * In every case the call is accounted through UPDATE_PROFILE_STATS and
+ * the results are unwound unchanged. Always returns 0.
+ */
+int
+io_stats_open_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+                  int32_t op_ret, int32_t op_errno, fd_t *fd, dict_t *xdata)
+{
+    struct ios_conf *conf = this->private;
+    char *path = frame->local;
+    struct ios_fd *iosfd = NULL;
+    struct ios_stat *iosstat = NULL;
+    int bucket = 0;
+
+    frame->local = NULL;
+
+    if (!path)
+        goto unwind;
+
+    /* Only a successful open gets a tracking structure. */
+    if (op_ret >= 0)
+        iosfd = GF_CALLOC(1, sizeof(*iosfd), gf_io_stats_mt_ios_fd);
+
+    if (!iosfd) {
+        GF_FREE(path);
+        goto unwind;
+    }
+
+    iosfd->filename = path;
+    GF_ATOMIC_INIT(iosfd->data_read, 0);
+    GF_ATOMIC_INIT(iosfd->data_written, 0);
+    for (bucket = 0; bucket < IOS_BLOCK_COUNT_SIZE; bucket++) {
+        GF_ATOMIC_INIT(iosfd->block_count_write[bucket], 0);
+        GF_ATOMIC_INIT(iosfd->block_count_read[bucket], 0);
+    }
+    gettimeofday(&iosfd->opened_at, NULL);
+
+    ios_fd_ctx_set(fd, this, iosfd);
+
+    ios_inode_ctx_get(fd->inode, this, &iosstat);
+    if (!iosstat)
+        iosstat = ios_init_iosstat(this, path, fd->inode->gfid, fd->inode);
+
+    LOCK(&conf->lock);
+    {
+        /* Track the high-water mark of simultaneously open fds. */
+        if (++conf->cumulative.nr_opens > conf->cumulative.max_nr_opens) {
+            conf->cumulative.max_nr_opens = conf->cumulative.nr_opens;
+            conf->cumulative.max_openfd_time = iosfd->opened_at;
+        }
+    }
+    UNLOCK(&conf->lock);
+
+    if (iosstat)
+        ios_bump_stats(this, iosstat, IOS_STATS_TYPE_OPEN);
+
+unwind:
+    UPDATE_PROFILE_STATS(frame, OPEN);
+
+    STACK_UNWIND_STRICT(open, frame, op_ret, op_errno, fd, xdata);
+    return 0;
+}
+
+/*
+ * Completion callback for the stat fop.
+ *
+ * Invokes UPDATE_PROFILE_STATS(frame, STAT) (presumably the per-fop
+ * hit/latency accounting; the macro body is not visible here) and then
+ * unwinds op_ret, op_errno, buf and xdata unchanged. Always returns 0.
+ */
+int
+io_stats_stat_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+                  int32_t op_ret, int32_t op_errno, struct iatt *buf,
+                  dict_t *xdata)
+{
+    UPDATE_PROFILE_STATS(frame, STAT);
+    STACK_UNWIND_STRICT(stat, frame, op_ret, op_errno, buf, xdata);
+    return 0;
+}
+
+/*
+ * Completion callback for the readv fop.
+ *
+ * frame->local carries the fd the read was issued on. When data was
+ * returned (op_ret > 0) the total length of the returned vectors is
+ * passed to ios_bump_read(). The call is accounted through
+ * UPDATE_PROFILE_STATS, and if the fd's inode has a statistics record
+ * its read statistic and read throughput are bumped. The results are
+ * then unwound unchanged. Always returns 0.
+ */
+int
+io_stats_readv_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+                   int32_t op_ret, int32_t op_errno, struct iovec *vector,
+                   int32_t count, struct iatt *buf, struct iobref *iobref,
+                   dict_t *xdata)
+{
+    int len = 0;
+    fd_t *fd = NULL;
+    struct ios_stat *iosstat = NULL;
+
+    /* Take the fd out of the frame so it is not left in frame->local. */
+    fd = frame->local;
+    frame->local = NULL;
+
+    if (op_ret > 0) {
+        len = iov_length(vector, count);
+        ios_bump_read(this, fd, len);
+    }
+
+    UPDATE_PROFILE_STATS(frame, READ);
+    /* NOTE(review): fd is dereferenced without a NULL check; this relies
+     * on the fop path always storing an fd in frame->local - confirm.
+     */
+    ios_inode_ctx_get(fd->inode, this, &iosstat);
+
+    if (iosstat) {
+        ios_bump_stats(this, iosstat, IOS_STATS_TYPE_READ);
+        BUMP_THROUGHPUT(iosstat, IOS_STATS_THRU_READ);
+        iosstat = NULL;
+    }
+
+    STACK_UNWIND_STRICT(readv, frame, op_ret, op_errno, vector, count, buf,
+                        iobref, xdata);
+    return 0;
+}
+
+/*
+ * Completion callback for the writev fop.
+ *
+ * The call is accounted through UPDATE_PROFILE_STATS. If frame->local
+ * holds an inode and that inode has a statistics record, the record's
+ * write statistic and write throughput are bumped. The results are then
+ * unwound unchanged. Always returns 0.
+ */
+int
+io_stats_writev_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+                    int32_t op_ret, int32_t op_errno, struct iatt *prebuf,
+                    struct iatt *postbuf, dict_t *xdata)
+{
+    struct ios_stat *iosstat = NULL;
+    inode_t *inode = NULL;
+
+    UPDATE_PROFILE_STATS(frame, WRITE);
+    /* frame->local may be unset; only then is the inode lookup skipped. */
+    if (frame->local) {
+        inode = frame->local;
+        frame->local = NULL;
+        ios_inode_ctx_get(inode, this, &iosstat);
+        if (iosstat) {
+            ios_bump_stats(this, iosstat, IOS_STATS_TYPE_WRITE);
+            BUMP_THROUGHPUT(iosstat, IOS_STATS_THRU_WRITE);
+            inode = NULL;
+            iosstat = NULL;
+        }
+    }
+
+    STACK_UNWIND_STRICT(writev, frame, op_ret, op_errno, prebuf, postbuf,
+                        xdata);
+    return 0;
+}
+
+/*
+ * Completion callback for the copy_file_range fop.
+ *
+ * Invokes UPDATE_PROFILE_STATS(frame, COPY_FILE_RANGE) and then unwinds
+ * op_ret, op_errno, stbuf, prebuf_dst, postbuf_dst and xdata unchanged.
+ * Always returns 0.
+ */
+int
+io_stats_copy_file_range_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+                             int32_t op_ret, int32_t op_errno,
+                             struct iatt *stbuf, struct iatt *prebuf_dst,
+                             struct iatt *postbuf_dst, dict_t *xdata)
+{
+    UPDATE_PROFILE_STATS(frame, COPY_FILE_RANGE);
+
+    STACK_UNWIND_STRICT(copy_file_range, frame, op_ret, op_errno, stbuf,
+                        prebuf_dst, postbuf_dst, xdata);
+    return 0;
+}
+
+/*
+ * Completion callback for the readdirp fop.
+ *
+ * frame->local carries the inode of the directory. The call is accounted
+ * through UPDATE_PROFILE_STATS, and if that inode has a statistics
+ * record its readdirp statistic is bumped. The results are then unwound
+ * unchanged. Always returns 0.
+ */
+int
+io_stats_readdirp_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+                      int32_t op_ret, int32_t op_errno, gf_dirent_t *buf,
+                      dict_t *xdata)
+{
+    struct ios_stat *iosstat = NULL;
+    inode_t *inode = frame->local;
+
+    /* Take the inode out of the frame so it is not left in frame->local. */
+    frame->local = NULL;
+
+    UPDATE_PROFILE_STATS(frame, READDIRP);
+
+    ios_inode_ctx_get(inode, this, &iosstat);
+
+    if (iosstat) {
+        ios_bump_stats(this, iosstat, IOS_STATS_TYPE_READDIRP);
+        iosstat = NULL;
+    }
+
+    STACK_UNWIND_STRICT(readdirp, frame, op_ret, op_errno, buf, xdata);
+    return 0;
+}
+
+/*
+ * Completion callback for the readdir fop.
+ *
+ * Invokes UPDATE_PROFILE_STATS(frame, READDIR) and then unwinds op_ret,
+ * op_errno, buf and xdata unchanged. Always returns 0.
+ */
+int
+io_stats_readdir_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+                     int32_t op_ret, int32_t op_errno, gf_dirent_t *buf,
+                     dict_t *xdata)
+{
+    UPDATE_PROFILE_STATS(frame, READDIR);
+    STACK_UNWIND_STRICT(readdir, frame, op_ret, op_errno, buf, xdata);
+    return 0;
+}
+
+/*
+ * Completion callback for the fsync fop.
+ *
+ * Invokes UPDATE_PROFILE_STATS(frame, FSYNC) and then unwinds op_ret,
+ * op_errno, prebuf, postbuf and xdata unchanged. Always returns 0.
+ */
+int
+io_stats_fsync_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+                   int32_t op_ret, int32_t op_errno, struct iatt *prebuf,
+                   struct iatt *postbuf, dict_t *xdata)
+{
+    UPDATE_PROFILE_STATS(frame, FSYNC);
+    STACK_UNWIND_STRICT(fsync, frame, op_ret, op_errno, prebuf, postbuf, xdata);
+    return 0;
+}
+
+/*
+ * Completion callback for the setattr fop.
+ *
+ * Invokes UPDATE_PROFILE_STATS(frame, SETATTR) and then unwinds op_ret,
+ * op_errno, preop, postop and xdata unchanged. Always returns 0.
+ */
+int
+io_stats_setattr_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+                     int32_t op_ret, int32_t op_errno, struct iatt *preop,
+                     struct iatt *postop, dict_t *xdata)
+{
+    UPDATE_PROFILE_STATS(frame, SETATTR);
+    STACK_UNWIND_STRICT(setattr, frame, op_ret, op_errno, preop, postop, xdata);
+    return 0;
+}
+
+/* Reply callback for the UNLINK fop: runs UPDATE_PROFILE_STATS for UNLINK,
+ * then unwinds the frame with all reply arguments passed through unchanged.
+ * Always returns 0. */
+int
+io_stats_unlink_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+                    int32_t op_ret, int32_t op_errno, struct iatt *preparent,
+                    struct iatt *postparent, dict_t *xdata)
+{
+    UPDATE_PROFILE_STATS(frame, UNLINK);
+    STACK_UNWIND_STRICT(unlink, frame, op_ret, op_errno, preparent, postparent,
+                        xdata);
+    return 0;
+}
+
+/* Reply callback for the RENAME fop: runs UPDATE_PROFILE_STATS for RENAME,
+ * then unwinds the frame with all reply arguments passed through unchanged.
+ * Always returns 0. */
+int
+io_stats_rename_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+                    int32_t op_ret, int32_t op_errno, struct iatt *buf,
+                    struct iatt *preoldparent, struct iatt *postoldparent,
+                    struct iatt *prenewparent, struct iatt *postnewparent,
+                    dict_t *xdata)
+{
+    UPDATE_PROFILE_STATS(frame, RENAME);
+    STACK_UNWIND_STRICT(rename, frame, op_ret, op_errno, buf, preoldparent,
+                        postoldparent, prenewparent, postnewparent, xdata);
+    return 0;
+}
+
+/* Reply callback for the READLINK fop: runs UPDATE_PROFILE_STATS for
+ * READLINK, then unwinds the frame with all reply arguments passed through
+ * unchanged. Always returns 0. */
+int
+io_stats_readlink_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+                      int32_t op_ret, int32_t op_errno, const char *buf,
+                      struct iatt *sbuf, dict_t *xdata)
+{
+    UPDATE_PROFILE_STATS(frame, READLINK);
+    STACK_UNWIND_STRICT(readlink, frame, op_ret, op_errno, buf, sbuf, xdata);
+    return 0;
+}
+
+/* Reply callback for the LOOKUP fop: runs UPDATE_PROFILE_STATS for LOOKUP,
+ * then unwinds the frame with all reply arguments passed through unchanged.
+ * Always returns 0. */
+int
+io_stats_lookup_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+                    int32_t op_ret, int32_t op_errno, inode_t *inode,
+                    struct iatt *buf, dict_t *xdata, struct iatt *postparent)
+{
+    UPDATE_PROFILE_STATS(frame, LOOKUP);
+    STACK_UNWIND_STRICT(lookup, frame, op_ret, op_errno, inode, buf, xdata,
+                        postparent);
+    return 0;
+}
+
+/* Reply callback for the SYMLINK fop: runs UPDATE_PROFILE_STATS for SYMLINK,
+ * then unwinds the frame with all reply arguments passed through unchanged.
+ * Always returns 0. */
+int
+io_stats_symlink_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+                     int32_t op_ret, int32_t op_errno, inode_t *inode,
+                     struct iatt *buf, struct iatt *preparent,
+                     struct iatt *postparent, dict_t *xdata)
+{
+    UPDATE_PROFILE_STATS(frame, SYMLINK);
+    STACK_UNWIND_STRICT(symlink, frame, op_ret, op_errno, inode, buf, preparent,
+                        postparent, xdata);
+    return 0;
+}
+
+/* Reply callback for the MKNOD fop: runs UPDATE_PROFILE_STATS for MKNOD,
+ * then unwinds the frame with all reply arguments passed through unchanged.
+ * Always returns 0. */
+int
+io_stats_mknod_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+                   int32_t op_ret, int32_t op_errno, inode_t *inode,
+                   struct iatt *buf, struct iatt *preparent,
+                   struct iatt *postparent, dict_t *xdata)
+{
+    UPDATE_PROFILE_STATS(frame, MKNOD);
+    STACK_UNWIND_STRICT(mknod, frame, op_ret, op_errno, inode, buf, preparent,
+                        postparent, xdata);
+    return 0;
+}
+
+/*
+ * Reply callback for the MKDIR fop.
+ *
+ * frame->local carries the gf_strdup()'d path stored by io_stats_mkdir(),
+ * or NULL when no path was saved. MKDIR profile statistics are recorded
+ * for every reply, including replies without a saved path (the previous
+ * code skipped the accounting in that case). On a successful reply with a
+ * path, an io-stats context is initialised for the new directory's inode.
+ * The saved path is always freed here and detached from the frame before
+ * the reply is unwound unchanged.
+ *
+ * Always returns 0.
+ */
+int
+io_stats_mkdir_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+                   int32_t op_ret, int32_t op_errno, inode_t *inode,
+                   struct iatt *buf, struct iatt *preparent,
+                   struct iatt *postparent, dict_t *xdata)
+{
+    char *path = frame->local;
+
+    /* Account the fop first so a missing path never hides an MKDIR. */
+    UPDATE_PROFILE_STATS(frame, MKDIR);
+
+    if (path && op_ret >= 0) {
+        /* allocate a struct ios_stat and set the inode ctx */
+        ios_init_iosstat(this, path, buf->ia_gfid, inode);
+    }
+
+    /* local is assigned with path; release it and clear the frame slot so
+     * the frame does not carry a dangling pointer when it is unwound. */
+    GF_FREE(path);
+    frame->local = NULL;
+    STACK_UNWIND_STRICT(mkdir, frame, op_ret, op_errno, inode, buf, preparent,
+                        postparent, xdata);
+    return 0;
+}
+
+/* Reply callback for the LINK fop: runs UPDATE_PROFILE_STATS for LINK, then
+ * unwinds the frame with all reply arguments passed through unchanged.
+ * Always returns 0. */
+int
+io_stats_link_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+                  int32_t op_ret, int32_t op_errno, inode_t *inode,
+                  struct iatt *buf, struct iatt *preparent,
+                  struct iatt *postparent, dict_t *xdata)
+{
+    UPDATE_PROFILE_STATS(frame, LINK);
+    STACK_UNWIND_STRICT(link, frame, op_ret, op_errno, inode, buf, preparent,
+                        postparent, xdata);
+    return 0;
+}
+
+/* Reply callback for the FLUSH fop: runs UPDATE_PROFILE_STATS for FLUSH,
+ * then unwinds the frame with all reply arguments passed through unchanged.
+ * Always returns 0. */
+int
+io_stats_flush_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+                   int32_t op_ret, int32_t op_errno, dict_t *xdata)
+{
+    UPDATE_PROFILE_STATS(frame, FLUSH);
+    STACK_UNWIND_STRICT(flush, frame, op_ret, op_errno, xdata);
+    return 0;
+}
+
+/*
+ * Reply callback for the OPENDIR fop.
+ *
+ * OPENDIR profile statistics are updated for every reply. Only when the
+ * reply is not an error (op_ret >= 0) is the fd's io-stats context set
+ * (to 0) and, if the directory inode has an io-stats context, its OPENDIR
+ * counter bumped. The reply is unwound unchanged. Always returns 0.
+ */
+int
+io_stats_opendir_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+                     int32_t op_ret, int32_t op_errno, fd_t *fd, dict_t *xdata)
+{
+    struct ios_stat *dir_stat = NULL;
+
+    UPDATE_PROFILE_STATS(frame, OPENDIR);
+
+    if (op_ret >= 0) {
+        ios_fd_ctx_set(fd, this, 0);
+
+        /* A zero return means the inode context lookup succeeded. */
+        if (ios_inode_ctx_get(fd->inode, this, &dir_stat) == 0)
+            ios_bump_stats(this, dir_stat, IOS_STATS_TYPE_OPENDIR);
+    }
+
+    STACK_UNWIND_STRICT(opendir, frame, op_ret, op_errno, fd, xdata);
+    return 0;
+}
+
+/* Reply callback for the RMDIR fop: runs UPDATE_PROFILE_STATS for RMDIR,
+ * then unwinds the frame with all reply arguments passed through unchanged.
+ * Always returns 0. */
+int
+io_stats_rmdir_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+                   int32_t op_ret, int32_t op_errno, struct iatt *preparent,
+                   struct iatt *postparent, dict_t *xdata)
+{
+    UPDATE_PROFILE_STATS(frame, RMDIR);
+
+    STACK_UNWIND_STRICT(rmdir, frame, op_ret, op_errno, preparent, postparent,
+                        xdata);
+    return 0;
+}
+
+/* Reply callback for the TRUNCATE fop: runs UPDATE_PROFILE_STATS for
+ * TRUNCATE, then unwinds the frame with all reply arguments passed through
+ * unchanged. Always returns 0. */
+int
+io_stats_truncate_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+                      int32_t op_ret, int32_t op_errno, struct iatt *prebuf,
+                      struct iatt *postbuf, dict_t *xdata)
+{
+    UPDATE_PROFILE_STATS(frame, TRUNCATE);
+    STACK_UNWIND_STRICT(truncate, frame, op_ret, op_errno, prebuf, postbuf,
+                        xdata);
+    return 0;
+}
+
+/* Reply callback for the STATFS fop: runs UPDATE_PROFILE_STATS for STATFS,
+ * then unwinds the frame with all reply arguments passed through unchanged.
+ * Always returns 0. */
+int
+io_stats_statfs_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+                    int32_t op_ret, int32_t op_errno, struct statvfs *buf,
+                    dict_t *xdata)
+{
+    UPDATE_PROFILE_STATS(frame, STATFS);
+    STACK_UNWIND_STRICT(statfs, frame, op_ret, op_errno, buf, xdata);
+    return 0;
+}
+
+/* Reply callback for the SETXATTR fop: runs UPDATE_PROFILE_STATS for
+ * SETXATTR, then unwinds the frame with all reply arguments passed through
+ * unchanged. Always returns 0. */
+int
+io_stats_setxattr_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+                      int32_t op_ret, int32_t op_errno, dict_t *xdata)
+{
+    UPDATE_PROFILE_STATS(frame, SETXATTR);
+    STACK_UNWIND_STRICT(setxattr, frame, op_ret, op_errno, xdata);
+    return 0;
+}
+
+/* Reply callback for the GETXATTR fop: runs UPDATE_PROFILE_STATS for
+ * GETXATTR, then unwinds the frame with all reply arguments passed through
+ * unchanged. Always returns 0. */
+int
+io_stats_getxattr_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+                      int32_t op_ret, int32_t op_errno, dict_t *dict,
+                      dict_t *xdata)
+{
+    UPDATE_PROFILE_STATS(frame, GETXATTR);
+    STACK_UNWIND_STRICT(getxattr, frame, op_ret, op_errno, dict, xdata);
+    return 0;
+}
+
+/* Reply callback for the REMOVEXATTR fop: runs UPDATE_PROFILE_STATS for
+ * REMOVEXATTR, then unwinds the frame with all reply arguments passed
+ * through unchanged. Always returns 0. */
+int
+io_stats_removexattr_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+                         int32_t op_ret, int32_t op_errno, dict_t *xdata)
+{
+    UPDATE_PROFILE_STATS(frame, REMOVEXATTR);
+    STACK_UNWIND_STRICT(removexattr, frame, op_ret, op_errno, xdata);
+    return 0;
+}
+
+/* Reply callback for the FSETXATTR fop: runs UPDATE_PROFILE_STATS for
+ * FSETXATTR, then unwinds the frame with all reply arguments passed through
+ * unchanged. Always returns 0. */
+int
+io_stats_fsetxattr_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+                       int32_t op_ret, int32_t op_errno, dict_t *xdata)
+{
+    UPDATE_PROFILE_STATS(frame, FSETXATTR);
+    STACK_UNWIND_STRICT(fsetxattr, frame, op_ret, op_errno, xdata);
+    return 0;
+}
+
+/* Reply callback for the FGETXATTR fop: runs UPDATE_PROFILE_STATS for
+ * FGETXATTR, then unwinds the frame with all reply arguments passed through
+ * unchanged. Always returns 0. */
+int
+io_stats_fgetxattr_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+                       int32_t op_ret, int32_t op_errno, dict_t *dict,
+                       dict_t *xdata)
+{
+    UPDATE_PROFILE_STATS(frame, FGETXATTR);
+    STACK_UNWIND_STRICT(fgetxattr, frame, op_ret, op_errno, dict, xdata);
+    return 0;
+}
+
+/* Reply callback for the FREMOVEXATTR fop: runs UPDATE_PROFILE_STATS for
+ * FREMOVEXATTR, then unwinds the frame with all reply arguments passed
+ * through unchanged. Always returns 0. */
+int
+io_stats_fremovexattr_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+                          int32_t op_ret, int32_t op_errno, dict_t *xdata)
+{
+    UPDATE_PROFILE_STATS(frame, FREMOVEXATTR);
+    STACK_UNWIND_STRICT(fremovexattr, frame, op_ret, op_errno, xdata);
+    return 0;
+}
+
+/* Reply callback for the FSYNCDIR fop: runs UPDATE_PROFILE_STATS for
+ * FSYNCDIR, then unwinds the frame with all reply arguments passed through
+ * unchanged. Always returns 0. */
+int
+io_stats_fsyncdir_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+                      int32_t op_ret, int32_t op_errno, dict_t *xdata)
+{
+    UPDATE_PROFILE_STATS(frame, FSYNCDIR);
+    STACK_UNWIND_STRICT(fsyncdir, frame, op_ret, op_errno, xdata);
+    return 0;
+}
+
+/* Reply callback for the ACCESS fop: runs UPDATE_PROFILE_STATS for ACCESS,
+ * then unwinds the frame with all reply arguments passed through unchanged.
+ * Always returns 0. */
+int
+io_stats_access_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+                    int32_t op_ret, int32_t op_errno, dict_t *xdata)
+{
+    UPDATE_PROFILE_STATS(frame, ACCESS);
+    STACK_UNWIND_STRICT(access, frame, op_ret, op_errno, xdata);
+    return 0;
+}
+
+/* Reply callback for the FTRUNCATE fop: runs UPDATE_PROFILE_STATS for
+ * FTRUNCATE, then unwinds the frame with all reply arguments passed through
+ * unchanged. Always returns 0. */
+int
+io_stats_ftruncate_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+                       int32_t op_ret, int32_t op_errno, struct iatt *prebuf,
+                       struct iatt *postbuf, dict_t *xdata)
+{
+    UPDATE_PROFILE_STATS(frame, FTRUNCATE);
+    STACK_UNWIND_STRICT(ftruncate, frame, op_ret, op_errno, prebuf, postbuf,
+                        xdata);
+    return 0;
+}
+
+/* Reply callback for the FSTAT fop: runs UPDATE_PROFILE_STATS for FSTAT,
+ * then unwinds the frame with all reply arguments passed through unchanged.
+ * Always returns 0. */
+int
+io_stats_fstat_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+                   int32_t op_ret, int32_t op_errno, struct iatt *buf,
+                   dict_t *xdata)
+{
+    UPDATE_PROFILE_STATS(frame, FSTAT);
+    STACK_UNWIND_STRICT(fstat, frame, op_ret, op_errno, buf, xdata);
+    return 0;
+}
+
+/* Reply callback for the FALLOCATE fop: runs UPDATE_PROFILE_STATS for
+ * FALLOCATE, then unwinds the frame with all reply arguments passed through
+ * unchanged. Always returns 0. */
+int
+io_stats_fallocate_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+                       int32_t op_ret, int32_t op_errno, struct iatt *prebuf,
+                       struct iatt *postbuf, dict_t *xdata)
+{
+    UPDATE_PROFILE_STATS(frame, FALLOCATE);
+    STACK_UNWIND_STRICT(fallocate, frame, op_ret, op_errno, prebuf, postbuf,
+                        xdata);
+    return 0;
+}
+
+/* Reply callback for the DISCARD fop: runs UPDATE_PROFILE_STATS for DISCARD,
+ * then unwinds the frame with all reply arguments passed through unchanged.
+ * Always returns 0. */
+int
+io_stats_discard_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+                     int32_t op_ret, int32_t op_errno, struct iatt *prebuf,
+                     struct iatt *postbuf, dict_t *xdata)
+{
+    UPDATE_PROFILE_STATS(frame, DISCARD);
+    STACK_UNWIND_STRICT(discard, frame, op_ret, op_errno, prebuf, postbuf,
+                        xdata);
+    return 0;
+}
+
+/* Reply callback for the ZEROFILL fop: runs UPDATE_PROFILE_STATS for
+ * ZEROFILL, then unwinds the frame with all reply arguments passed through
+ * unchanged. Always returns 0. */
+int
+io_stats_zerofill_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+                      int32_t op_ret, int32_t op_errno, struct iatt *prebuf,
+                      struct iatt *postbuf, dict_t *xdata)
+{
+    UPDATE_PROFILE_STATS(frame, ZEROFILL);
+    STACK_UNWIND_STRICT(zerofill, frame, op_ret, op_errno, prebuf, postbuf,
+                        xdata);
+    return 0;
+}
+
+/* Reply callback for the IPC fop: runs UPDATE_PROFILE_STATS for IPC, then
+ * unwinds the frame with all reply arguments passed through unchanged.
+ * Always returns 0. */
+int32_t
+io_stats_ipc_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+                 int32_t op_ret, int32_t op_errno, dict_t *xdata)
+{
+    UPDATE_PROFILE_STATS(frame, IPC);
+    STACK_UNWIND_STRICT(ipc, frame, op_ret, op_errno, xdata);
+    return 0;
+}
+
+/* Reply callback for the LK fop: runs UPDATE_PROFILE_STATS for LK, then
+ * unwinds the frame with all reply arguments passed through unchanged.
+ * Always returns 0. */
+int
+io_stats_lk_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+                int32_t op_ret, int32_t op_errno, struct gf_flock *lock,
+                dict_t *xdata)
+{
+    UPDATE_PROFILE_STATS(frame, LK);
+    STACK_UNWIND_STRICT(lk, frame, op_ret, op_errno, lock, xdata);
+    return 0;
+}
+
+/* Reply callback for the ENTRYLK fop: runs UPDATE_PROFILE_STATS for ENTRYLK,
+ * then unwinds the frame with all reply arguments passed through unchanged.
+ * Always returns 0. */
+int
+io_stats_entrylk_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+                     int32_t op_ret, int32_t op_errno, dict_t *xdata)
+{
+    UPDATE_PROFILE_STATS(frame, ENTRYLK);
+    STACK_UNWIND_STRICT(entrylk, frame, op_ret, op_errno, xdata);
+    return 0;
+}
+
+/* Reply callback for the FENTRYLK fop: runs UPDATE_PROFILE_STATS for
+ * FENTRYLK, then unwinds the frame with all reply arguments passed through
+ * unchanged. Always returns 0. */
+int
+io_stats_fentrylk_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+                      int32_t op_ret, int32_t op_errno, dict_t *xdata)
+{
+    UPDATE_PROFILE_STATS(frame, FENTRYLK);
+    STACK_UNWIND_STRICT(fentrylk, frame, op_ret, op_errno, xdata);
+    return 0;
+}
+
+/* Reply callback for the RCHECKSUM fop: runs UPDATE_PROFILE_STATS for
+ * RCHECKSUM, then unwinds the frame with all reply arguments passed through
+ * unchanged. Always returns 0. */
+int
+io_stats_rchecksum_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+                       int32_t op_ret, int32_t op_errno, uint32_t weak_checksum,
+                       uint8_t *strong_checksum, dict_t *xdata)
+{
+    UPDATE_PROFILE_STATS(frame, RCHECKSUM);
+    STACK_UNWIND_STRICT(rchecksum, frame, op_ret, op_errno, weak_checksum,
+                        strong_checksum, xdata);
+    return 0;
+}
+
+/* Reply callback for the SEEK fop: runs UPDATE_PROFILE_STATS for SEEK, then
+ * unwinds the frame with all reply arguments passed through unchanged.
+ * Always returns 0. */
+int
+io_stats_seek_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+                  int32_t op_ret, int32_t op_errno, off_t offset, dict_t *xdata)
+{
+    UPDATE_PROFILE_STATS(frame, SEEK);
+    STACK_UNWIND_STRICT(seek, frame, op_ret, op_errno, offset, xdata);
+    return 0;
+}
+
+/* Reply callback for the LEASE fop: runs UPDATE_PROFILE_STATS for LEASE,
+ * then unwinds the frame with all reply arguments passed through unchanged.
+ * Always returns 0. */
+int
+io_stats_lease_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+                   int32_t op_ret, int32_t op_errno, struct gf_lease *lease,
+                   dict_t *xdata)
+{
+    UPDATE_PROFILE_STATS(frame, LEASE);
+    STACK_UNWIND_STRICT(lease, frame, op_ret, op_errno, lease, xdata);
+    return 0;
+}
+
+/* Reply callback for the GETACTIVELK fop: runs UPDATE_PROFILE_STATS for
+ * GETACTIVELK, then unwinds the frame with all reply arguments passed
+ * through unchanged. Always returns 0. */
+int
+io_stats_getactivelk_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+                         int32_t op_ret, int32_t op_errno,
+                         lock_migration_info_t *locklist, dict_t *xdata)
+{
+    UPDATE_PROFILE_STATS(frame, GETACTIVELK);
+    STACK_UNWIND_STRICT(getactivelk, frame, op_ret, op_errno, locklist, xdata);
+    return 0;
+}
+
+/* Reply callback for the SETACTIVELK fop: runs UPDATE_PROFILE_STATS for
+ * SETACTIVELK, then unwinds the frame with all reply arguments passed
+ * through unchanged. Always returns 0. */
+int
+io_stats_setactivelk_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+                         int32_t op_ret, int32_t op_errno, dict_t *xdata)
+{
+    UPDATE_PROFILE_STATS(frame, SETACTIVELK);
+    STACK_UNWIND_STRICT(setactivelk, frame, op_ret, op_errno, xdata);
+    return 0;
+}
+
+/* Reply callback for the COMPOUND fop: runs UPDATE_PROFILE_STATS for
+ * COMPOUND, then unwinds the frame with all reply arguments passed through
+ * unchanged. Always returns 0. */
+int
+io_stats_compound_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+                      int32_t op_ret, int32_t op_errno, void *data,
+                      dict_t *xdata)
+{
+    UPDATE_PROFILE_STATS(frame, COMPOUND);
+    STACK_UNWIND_STRICT(compound, frame, op_ret, op_errno, data, xdata);
+    return 0;
+}
+
+/* Reply callback for the XATTROP fop: runs UPDATE_PROFILE_STATS for XATTROP,
+ * then unwinds the frame with all reply arguments passed through unchanged.
+ * Always returns 0. */
+int
+io_stats_xattrop_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+                     int32_t op_ret, int32_t op_errno, dict_t *dict,
+                     dict_t *xdata)
+{
+    UPDATE_PROFILE_STATS(frame, XATTROP);
+    STACK_UNWIND_STRICT(xattrop, frame, op_ret, op_errno, dict, xdata);
+    return 0;
+}
+
+/* Reply callback for the FXATTROP fop: runs UPDATE_PROFILE_STATS for
+ * FXATTROP, then unwinds the frame with all reply arguments passed through
+ * unchanged. Always returns 0. */
+int
+io_stats_fxattrop_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+                      int32_t op_ret, int32_t op_errno, dict_t *dict,
+                      dict_t *xdata)
+{
+    UPDATE_PROFILE_STATS(frame, FXATTROP);
+    STACK_UNWIND_STRICT(fxattrop, frame, op_ret, op_errno, dict, xdata);
+    return 0;
+}
+
+/* Reply callback for the INODELK fop: runs UPDATE_PROFILE_STATS for INODELK,
+ * then unwinds the frame with all reply arguments passed through unchanged.
+ * Always returns 0. */
+int
+io_stats_inodelk_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+                     int32_t op_ret, int32_t op_errno, dict_t *xdata)
+{
+    UPDATE_PROFILE_STATS(frame, INODELK);
+    STACK_UNWIND_STRICT(inodelk, frame, op_ret, op_errno, xdata);
+    return 0;
+}
+
+/* ENTRYLK fop entry point: invokes START_FOP_LATENCY on the frame, then
+ * winds the call unchanged to the first child with io_stats_entrylk_cbk as
+ * the reply callback. Always returns 0. */
+int
+io_stats_entrylk(call_frame_t *frame, xlator_t *this, const char *volume,
+                 loc_t *loc, const char *basename, entrylk_cmd cmd,
+                 entrylk_type type, dict_t *xdata)
+{
+    START_FOP_LATENCY(frame);
+
+    STACK_WIND(frame, io_stats_entrylk_cbk, FIRST_CHILD(this),
+               FIRST_CHILD(this)->fops->entrylk, volume, loc, basename, cmd,
+               type, xdata);
+    return 0;
+}
+
+/* FENTRYLK fop entry point: invokes START_FOP_LATENCY on the frame, then
+ * winds the call unchanged to the first child with io_stats_fentrylk_cbk as
+ * the reply callback. Always returns 0. */
+int
+io_stats_fentrylk(call_frame_t *frame, xlator_t *this, const char *volume,
+                  fd_t *fd, const char *basename, entrylk_cmd cmd,
+                  entrylk_type type, dict_t *xdata)
+{
+    START_FOP_LATENCY(frame);
+
+    STACK_WIND(frame, io_stats_fentrylk_cbk, FIRST_CHILD(this),
+               FIRST_CHILD(this)->fops->fentrylk, volume, fd, basename, cmd,
+               type, xdata);
+    return 0;
+}
+
+/* INODELK fop entry point: invokes START_FOP_LATENCY on the frame, then
+ * winds the call unchanged to the first child with io_stats_inodelk_cbk as
+ * the reply callback. Always returns 0. */
+int
+io_stats_inodelk(call_frame_t *frame, xlator_t *this, const char *volume,
+                 loc_t *loc, int32_t cmd, struct gf_flock *flock, dict_t *xdata)
+{
+    START_FOP_LATENCY(frame);
+
+    STACK_WIND(frame, io_stats_inodelk_cbk, FIRST_CHILD(this),
+               FIRST_CHILD(this)->fops->inodelk, volume, loc, cmd, flock,
+               xdata);
+    return 0;
+}
+
+/* Reply callback for the FINODELK fop: runs UPDATE_PROFILE_STATS for
+ * FINODELK, then unwinds the frame with all reply arguments passed through
+ * unchanged. Always returns 0. */
+int
+io_stats_finodelk_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+                      int32_t op_ret, int32_t op_errno, dict_t *xdata)
+{
+    UPDATE_PROFILE_STATS(frame, FINODELK);
+    STACK_UNWIND_STRICT(finodelk, frame, op_ret, op_errno, xdata);
+    return 0;
+}
+
+/* FINODELK fop entry point: invokes START_FOP_LATENCY on the frame, then
+ * winds the call unchanged to the first child with io_stats_finodelk_cbk as
+ * the reply callback. Always returns 0. */
+int
+io_stats_finodelk(call_frame_t *frame, xlator_t *this, const char *volume,
+                  fd_t *fd, int32_t cmd, struct gf_flock *flock, dict_t *xdata)
+{
+    START_FOP_LATENCY(frame);
+
+    STACK_WIND(frame, io_stats_finodelk_cbk, FIRST_CHILD(this),
+               FIRST_CHILD(this)->fops->finodelk, volume, fd, cmd, flock,
+               xdata);
+    return 0;
+}
+
+/* XATTROP fop entry point: invokes START_FOP_LATENCY on the frame, then
+ * winds the call unchanged to the first child with io_stats_xattrop_cbk as
+ * the reply callback. Always returns 0. */
+int
+io_stats_xattrop(call_frame_t *frame, xlator_t *this, loc_t *loc,
+                 gf_xattrop_flags_t flags, dict_t *dict, dict_t *xdata)
+{
+    START_FOP_LATENCY(frame);
+
+    STACK_WIND(frame, io_stats_xattrop_cbk, FIRST_CHILD(this),
+               FIRST_CHILD(this)->fops->xattrop, loc, flags, dict, xdata);
+    return 0;
+}
+
+/* FXATTROP fop entry point: invokes START_FOP_LATENCY on the frame, then
+ * winds the call unchanged to the first child with io_stats_fxattrop_cbk as
+ * the reply callback. Always returns 0. */
+int
+io_stats_fxattrop(call_frame_t *frame, xlator_t *this, fd_t *fd,
+                  gf_xattrop_flags_t flags, dict_t *dict, dict_t *xdata)
+{
+    START_FOP_LATENCY(frame);
+
+    STACK_WIND(frame, io_stats_fxattrop_cbk, FIRST_CHILD(this),
+               FIRST_CHILD(this)->fops->fxattrop, fd, flags, dict, xdata);
+    return 0;
+}
+
+/* LOOKUP fop entry point: invokes START_FOP_LATENCY on the frame, then winds
+ * the call unchanged to the first child with io_stats_lookup_cbk as the
+ * reply callback. Always returns 0. */
+int
+io_stats_lookup(call_frame_t *frame, xlator_t *this, loc_t *loc, dict_t *xdata)
+{
+    START_FOP_LATENCY(frame);
+
+    STACK_WIND(frame, io_stats_lookup_cbk, FIRST_CHILD(this),
+               FIRST_CHILD(this)->fops->lookup, loc, xdata);
+    return 0;
+}
+
+/* STAT fop entry point: invokes START_FOP_LATENCY on the frame, then winds
+ * the call unchanged to the first child with io_stats_stat_cbk as the reply
+ * callback. Always returns 0. */
+int
+io_stats_stat(call_frame_t *frame, xlator_t *this, loc_t *loc, dict_t *xdata)
+{
+    START_FOP_LATENCY(frame);
+
+    STACK_WIND(frame, io_stats_stat_cbk, FIRST_CHILD(this),
+               FIRST_CHILD(this)->fops->stat, loc, xdata);
+    return 0;
+}
+
+/* READLINK fop entry point: invokes START_FOP_LATENCY on the frame, then
+ * winds the call unchanged to the first child with io_stats_readlink_cbk as
+ * the reply callback. Always returns 0. */
+int
+io_stats_readlink(call_frame_t *frame, xlator_t *this, loc_t *loc, size_t size,
+                  dict_t *xdata)
+{
+    START_FOP_LATENCY(frame);
+
+    STACK_WIND(frame, io_stats_readlink_cbk, FIRST_CHILD(this),
+               FIRST_CHILD(this)->fops->readlink, loc, size, xdata);
+    return 0;
+}
+
+/* MKNOD fop entry point: invokes START_FOP_LATENCY on the frame, then winds
+ * the call unchanged to the first child with io_stats_mknod_cbk as the
+ * reply callback. Always returns 0. */
+int
+io_stats_mknod(call_frame_t *frame, xlator_t *this, loc_t *loc, mode_t mode,
+               dev_t dev, mode_t umask, dict_t *xdata)
+{
+    START_FOP_LATENCY(frame);
+
+    STACK_WIND(frame, io_stats_mknod_cbk, FIRST_CHILD(this),
+               FIRST_CHILD(this)->fops->mknod, loc, mode, dev, umask, xdata);
+    return 0;
+}
+
+/* MKDIR fop entry point: when loc->path is set, stores a gf_strdup() copy of
+ * it in frame->local (io_stats_mkdir_cbk reads and frees it). Then invokes
+ * START_FOP_LATENCY and winds the call unchanged to the first child.
+ * Always returns 0. */
+int
+io_stats_mkdir(call_frame_t *frame, xlator_t *this, loc_t *loc, mode_t mode,
+               mode_t umask, dict_t *xdata)
+{
+    /* frame->local is left untouched when there is no path to copy. */
+    if (loc->path)
+        frame->local = gf_strdup(loc->path);
+
+    START_FOP_LATENCY(frame);
+
+    STACK_WIND(frame, io_stats_mkdir_cbk, FIRST_CHILD(this),
+               FIRST_CHILD(this)->fops->mkdir, loc, mode, umask, xdata);
+    return 0;
+}
+
+/* UNLINK fop entry point: invokes START_FOP_LATENCY on the frame, then winds
+ * the call unchanged to the first child with io_stats_unlink_cbk as the
+ * reply callback. Always returns 0. */
+int
+io_stats_unlink(call_frame_t *frame, xlator_t *this, loc_t *loc, int xflag,
+                dict_t *xdata)
+{
+    START_FOP_LATENCY(frame);
+
+    STACK_WIND(frame, io_stats_unlink_cbk, FIRST_CHILD(this),
+               FIRST_CHILD(this)->fops->unlink, loc, xflag, xdata);
+    return 0;
+}
+
+/* RMDIR fop entry point: invokes START_FOP_LATENCY on the frame, then winds
+ * the call unchanged to the first child with io_stats_rmdir_cbk as the
+ * reply callback. Always returns 0. */
+int
+io_stats_rmdir(call_frame_t *frame, xlator_t *this, loc_t *loc, int flags,
+               dict_t *xdata)
+{
+    START_FOP_LATENCY(frame);
+
+    STACK_WIND(frame, io_stats_rmdir_cbk, FIRST_CHILD(this),
+               FIRST_CHILD(this)->fops->rmdir, loc, flags, xdata);
+    return 0;
+}
+
+/* SYMLINK fop entry point: invokes START_FOP_LATENCY on the frame, then
+ * winds the call unchanged to the first child with io_stats_symlink_cbk as
+ * the reply callback. Always returns 0. */
+int
+io_stats_symlink(call_frame_t *frame, xlator_t *this, const char *linkpath,
+                 loc_t *loc, mode_t umask, dict_t *xdata)
+{
+    START_FOP_LATENCY(frame);
+
+    STACK_WIND(frame, io_stats_symlink_cbk, FIRST_CHILD(this),
+               FIRST_CHILD(this)->fops->symlink, linkpath, loc, umask, xdata);
+    return 0;
+}
+
+/* RENAME fop entry point: invokes START_FOP_LATENCY on the frame, then winds
+ * the call unchanged to the first child with io_stats_rename_cbk as the
+ * reply callback. Always returns 0. */
+int
+io_stats_rename(call_frame_t *frame, xlator_t *this, loc_t *oldloc,
+                loc_t *newloc, dict_t *xdata)
+{
+    START_FOP_LATENCY(frame);
+
+    STACK_WIND(frame, io_stats_rename_cbk, FIRST_CHILD(this),
+               FIRST_CHILD(this)->fops->rename, oldloc, newloc, xdata);
+    return 0;
+}
+
+/* LINK fop entry point: invokes START_FOP_LATENCY on the frame, then winds
+ * the call unchanged to the first child with io_stats_link_cbk as the reply
+ * callback. Always returns 0. */
+int
+io_stats_link(call_frame_t *frame, xlator_t *this, loc_t *oldloc, loc_t *newloc,
+              dict_t *xdata)
+{
+    START_FOP_LATENCY(frame);
+
+    STACK_WIND(frame, io_stats_link_cbk, FIRST_CHILD(this),
+               FIRST_CHILD(this)->fops->link, oldloc, newloc, xdata);
+    return 0;
+}
+
+/* SETATTR fop entry point: invokes START_FOP_LATENCY on the frame, then
+ * winds the call unchanged to the first child with io_stats_setattr_cbk as
+ * the reply callback. Always returns 0. */
+int
+io_stats_setattr(call_frame_t *frame, xlator_t *this, loc_t *loc,
+                 struct iatt *stbuf, int32_t valid, dict_t *xdata)
+{
+    START_FOP_LATENCY(frame);
+
+    STACK_WIND(frame, io_stats_setattr_cbk, FIRST_CHILD(this),
+               FIRST_CHILD(this)->fops->setattr, loc, stbuf, valid, xdata);
+    return 0;
+}
+
+/* TRUNCATE fop entry point: invokes START_FOP_LATENCY on the frame, then
+ * winds the call unchanged to the first child with io_stats_truncate_cbk as
+ * the reply callback. Always returns 0. */
+int
+io_stats_truncate(call_frame_t *frame, xlator_t *this, loc_t *loc, off_t offset,
+                  dict_t *xdata)
+{
+    START_FOP_LATENCY(frame);
+
+    STACK_WIND(frame, io_stats_truncate_cbk, FIRST_CHILD(this),
+               FIRST_CHILD(this)->fops->truncate, loc, offset, xdata);
+    return 0;
+}
+
+/* OPEN fop entry point: when loc->path is set, stores a gf_strdup() copy of
+ * it in frame->local (presumably consumed by io_stats_open_cbk, which is
+ * not visible here). Then invokes START_FOP_LATENCY and winds the call
+ * unchanged to the first child. Always returns 0. */
+int
+io_stats_open(call_frame_t *frame, xlator_t *this, loc_t *loc, int32_t flags,
+              fd_t *fd, dict_t *xdata)
+{
+    /* frame->local is left untouched when there is no path to copy. */
+    if (loc->path)
+        frame->local = gf_strdup(loc->path);
+
+    START_FOP_LATENCY(frame);
+
+    STACK_WIND(frame, io_stats_open_cbk, FIRST_CHILD(this),
+               FIRST_CHILD(this)->fops->open, loc, flags, fd, xdata);
+    return 0;
+}
+
+/* CREATE fop entry point: when loc->path is set, stores a gf_strdup() copy
+ * of it in frame->local (presumably consumed by io_stats_create_cbk, which
+ * is not visible here). Then invokes START_FOP_LATENCY and winds the call
+ * unchanged to the first child. Always returns 0. */
+int
+io_stats_create(call_frame_t *frame, xlator_t *this, loc_t *loc, int32_t flags,
+                mode_t mode, mode_t umask, fd_t *fd, dict_t *xdata)
+{
+    /* frame->local is left untouched when there is no path to copy. */
+    if (loc->path)
+        frame->local = gf_strdup(loc->path);
+
+    START_FOP_LATENCY(frame);
+
+    STACK_WIND(frame, io_stats_create_cbk, FIRST_CHILD(this),
+               FIRST_CHILD(this)->fops->create, loc, flags, mode, umask, fd,
+               xdata);
+    return 0;
+}
+
+/* READV fop entry point: stores the fd pointer in frame->local for the
+ * reply callback (no reference is taken here), invokes START_FOP_LATENCY,
+ * then winds the call unchanged to the first child with io_stats_readv_cbk
+ * as the reply callback. Always returns 0. */
+int
+io_stats_readv(call_frame_t *frame, xlator_t *this, fd_t *fd, size_t size,
+               off_t offset, uint32_t flags, dict_t *xdata)
+{
+    frame->local = fd;
+
+    START_FOP_LATENCY(frame);
+
+    STACK_WIND(frame, io_stats_readv_cbk, FIRST_CHILD(this),
+               FIRST_CHILD(this)->fops->readv, fd, size, offset, flags, xdata);
+    return 0;
+}
+
+/*
+ * WRITEV fop entry point.
+ *
+ * Stores fd->inode (when set) in frame->local for the reply callback,
+ * accounts the total byte count of the I/O vector through ios_bump_write(),
+ * invokes START_FOP_LATENCY and winds the call unchanged to the first
+ * child with io_stats_writev_cbk as the reply callback.
+ *
+ * The byte count is held in a size_t so the value returned by iov_length()
+ * is not narrowed (and possibly turned negative) by an int before it is
+ * accounted.
+ *
+ * Always returns 0.
+ */
+int
+io_stats_writev(call_frame_t *frame, xlator_t *this, fd_t *fd,
+                struct iovec *vector, int32_t count, off_t offset,
+                uint32_t flags, struct iobref *iobref, dict_t *xdata)
+{
+    size_t len = 0;
+
+    /* frame->local is left untouched when the fd has no inode. */
+    if (fd->inode)
+        frame->local = fd->inode;
+    len = iov_length(vector, count);
+
+    ios_bump_write(this, fd, len);
+    START_FOP_LATENCY(frame);
+
+    STACK_WIND(frame, io_stats_writev_cbk, FIRST_CHILD(this),
+               FIRST_CHILD(this)->fops->writev, fd, vector, count, offset,
+               flags, iobref, xdata);
+    return 0;
+}
+
+/* COPY_FILE_RANGE fop entry point: invokes START_FOP_LATENCY on the frame,
+ * then winds the call unchanged to the first child with
+ * io_stats_copy_file_range_cbk as the reply callback. Always returns 0. */
+int
+io_stats_copy_file_range(call_frame_t *frame, xlator_t *this, fd_t *fd_in,
+                         off_t off_in, fd_t *fd_out, off_t off_out, size_t len,
+                         uint32_t flags, dict_t *xdata)
+{
+    START_FOP_LATENCY(frame);
+
+    STACK_WIND(frame, io_stats_copy_file_range_cbk, FIRST_CHILD(this),
+               FIRST_CHILD(this)->fops->copy_file_range, fd_in, off_in, fd_out,
+               off_out, len, flags, xdata);
+    return 0;
+}
+
+/* STATFS fop entry point: invokes START_FOP_LATENCY on the frame, then winds
+ * the call unchanged to the first child with io_stats_statfs_cbk as the
+ * reply callback. Always returns 0. */
+int
+io_stats_statfs(call_frame_t *frame, xlator_t *this, loc_t *loc, dict_t *xdata)
+{
+    START_FOP_LATENCY(frame);
+
+    STACK_WIND(frame, io_stats_statfs_cbk, FIRST_CHILD(this),
+               FIRST_CHILD(this)->fops->statfs, loc, xdata);
+    return 0;
+}
+
+/* FLUSH fop entry point: invokes START_FOP_LATENCY on the frame, then winds
+ * the call unchanged to the first child with io_stats_flush_cbk as the
+ * reply callback. Always returns 0. */
+int
+io_stats_flush(call_frame_t *frame, xlator_t *this, fd_t *fd, dict_t *xdata)
+{
+    START_FOP_LATENCY(frame);
+
+    STACK_WIND(frame, io_stats_flush_cbk, FIRST_CHILD(this),
+               FIRST_CHILD(this)->fops->flush, fd, xdata);
+    return 0;
+}
+
+/* FSYNC fop entry point: invokes START_FOP_LATENCY on the frame, then winds
+ * the call unchanged to the first child with io_stats_fsync_cbk as the
+ * reply callback. Always returns 0. */
+int
+io_stats_fsync(call_frame_t *frame, xlator_t *this, fd_t *fd, int32_t flags,
+               dict_t *xdata)
+{
+    START_FOP_LATENCY(frame);
+
+    STACK_WIND(frame, io_stats_fsync_cbk, FIRST_CHILD(this),
+               FIRST_CHILD(this)->fops->fsync, fd, flags, xdata);
+    return 0;
+}
+
+/*
+ * dict_foreach_match() action used by io_stats_setxattr(): writes an
+ * io-stats dump to a file named after the xattr value.
+ *
+ * dict  - dictionary being iterated (unused here).
+ * key   - xattr key; if it matches "*io*stat*<pid>_json_dump" the dump is
+ *         written in JSON format, otherwise in the plain file format.
+ * value - xattr value holding the requested file name.
+ * data  - pointer to the caller's { this, inode, path } stub; only the
+ *         `this` member is used.
+ *
+ * The dump file is always created directly inside IOS_STATS_DUMP_DIR as
+ * "<name>.<identifier>", where <identifier> is conf->unique_id when set
+ * and this->name otherwise; every '/' in <name> is replaced by '-'.
+ *
+ * Returns 0 when the dump was written, -1 when the request is refused
+ * (non-client process, invalid/over-long/empty name, "../" in the name)
+ * or the file cannot be opened.
+ */
+int
+conditional_dump(dict_t *dict, char *key, data_t *value, void *data)
+{
+    struct {
+        xlator_t *this;
+        inode_t *inode;
+        const char *path;
+    } * stub;
+    xlator_t *this = NULL;
+    char *filename = NULL;
+    FILE *logfp = NULL;
+    struct ios_dump_args args = {0};
+    int pid;
+    size_t dirlen = 0;
+    size_t namelen = 0;
+    char dump_key[100];
+    char *slash_ptr = NULL;
+    char *value_str = NULL;
+    char *path_in_value = NULL;
+    char *identifier = NULL;
+    struct ios_conf *conf = NULL;
+
+    stub = data;
+    this = stub->this;
+    conf = this->private;
+
+    /* Don't do this on 'brick-side', only do this on client side */
+    /* Addresses CVE-2018-14659 */
+    if (this->ctx->process_mode != GF_CLIENT_PROCESS) {
+        gf_log(this->name, GF_LOG_DEBUG,
+               "taking io-stats dump using setxattr not permitted on brick."
+               " Use 'gluster profile' instead");
+        return -1;
+    }
+
+    /* Create a file name that is appended with the io-stats instance
+    name as well. This helps when there is more than a single io-stats
+    instance in the graph, or the client and server processes are running
+    on the same node */
+    /* For the sanity of where the file should be located, we should make
+       sure file is written only inside RUNDIR (ie, /var/run/gluster) */
+    /* TODO: provide an option to dump it to different directory of
+       choice, based on options */
+    /* name format: /var/run/gluster/<passed in path/filename>.<xlator name
+     * slashes to -> */
+
+    /* The value length is caller controlled and sizes two alloca()
+     * buffers below; refuse anything that cannot be a sane file name
+     * before touching the stack. */
+    value_str = data_to_str(value);
+    if (!value_str || value->len < 0 || value->len >= PATH_MAX) {
+        gf_log(this->name, GF_LOG_ERROR,
+               "invalid or too long io-stats dump file name");
+        return -1;
+    }
+
+    path_in_value = alloca0(value->len + 1);
+
+    /* We need a memcpy here because of the way dict_unserialize works */
+
+    memcpy(path_in_value, value_str, value->len);
+    path_in_value[value->len] = '\0';
+
+    if (strstr(path_in_value, "../")) {
+        gf_log(this->name, GF_LOG_ERROR, "%s: no \"../\" allowed in path",
+               path_in_value);
+        return -1;
+    }
+
+    if (path_in_value[0] == '/') {
+        path_in_value = path_in_value + 1;
+    }
+
+    /* Test the caller-supplied name: the composed path below always
+     * starts with the dump directory and therefore is never empty. */
+    if (path_in_value[0] == '\0') {
+        gf_log(this->name, GF_LOG_ERROR, "No filename given");
+        return -1;
+    }
+
+    dirlen = strlen(IOS_STATS_DUMP_DIR);
+    if (conf->unique_id) {
+        /* this->name will be the same for all bricks of the volume */
+        identifier = conf->unique_id;
+    } else {
+        identifier = this->name;
+    }
+
+    namelen = dirlen + (size_t)value->len + strlen(identifier) + 3;
+    /* +3 for '/', '.' and '\0' added in snprintf below*/
+
+    filename = alloca0(namelen);
+    snprintf(filename, namelen, "%s/%s.%s", IOS_STATS_DUMP_DIR, path_in_value,
+             identifier);
+
+    /* convert any slashes to '-' so that fopen works correctly */
+    slash_ptr = strchr(filename + dirlen + 1, '/');
+    while (slash_ptr) {
+        *slash_ptr = '-';
+        slash_ptr = strchr(slash_ptr, '/');
+    }
+
+    pid = getpid();
+
+    logfp = fopen(filename, "w+");
+    if (!logfp) {
+        gf_log(this->name, GF_LOG_ERROR,
+               "failed to open %s "
+               "for writing",
+               filename);
+        return -1;
+    }
+    /* Bounded formatting: dump_key is a fixed-size stack buffer. */
+    snprintf(dump_key, sizeof(dump_key), "*io*stat*%d_json_dump", pid);
+    if (fnmatch(dump_key, key, 0) == 0) {
+        (void)ios_dump_args_init(&args, IOS_DUMP_TYPE_JSON_FILE, logfp);
+    } else {
+        (void)ios_dump_args_init(&args, IOS_DUMP_TYPE_FILE, logfp);
+    }
+    io_stats_dump(this, &args, GF_IOS_INFO_ALL, _gf_false);
+    fclose(logfp);
+    return 0;
+}
+
+/*
+ * Ask the periodic dump thread to stop.
+ *
+ * The should-die flag is always raised; the thread is cancelled and joined
+ * only when conf->dump_thread_running says it is running.
+ * Always returns 0.
+ */
+int
+_ios_destroy_dump_thread(struct ios_conf *conf)
+{
+    conf->dump_thread_should_die = _gf_true;
+
+    /* Nothing to cancel or join when the thread is not running. */
+    if (!conf->dump_thread_running)
+        return 0;
+
+    (void)pthread_cancel(conf->dump_thread);
+    (void)pthread_join(conf->dump_thread, NULL);
+    return 0;
+}
+
+/*
+ * Body of the periodic io-stats dump thread.
+ *
+ * Builds two file names, "<_IOS_DUMP_DIR>/<prog>_<name>[_<instance>].dump"
+ * and "<_IOS_SAMP_DIR>/<prog>_<name>[_<instance>].samp", creating both
+ * directories if needed. <name> is conf->unique_id with every '/' replaced
+ * by '_', except that it is "shd" when this->name is "glustershd" and
+ * "nfsd" when the previous xlator is named "nfs-server". It then loops:
+ * sleep for conf->ios_dump_interval seconds, rewrite the stats dump file,
+ * rewrite the latency samples file. The loop ends when
+ * conf->dump_thread_should_die is set; the thread is cancellable
+ * asynchronously only while it sleeps.
+ *
+ * An fopen() failure is logged once per run of consecutive failures for
+ * each file, and logging is re-armed after the next success.
+ *
+ * Clears conf->dump_thread_running on exit and always returns NULL.
+ */
+void *
+_ios_dump_thread(xlator_t *this)
+{
+    struct ios_conf *conf = NULL;
+    FILE *stats_logfp = NULL;
+    FILE *samples_logfp = NULL;
+    struct ios_dump_args args = {0};
+    int stats_bytes_written = 0;
+    int samples_bytes_written = 0;
+    char stats_filename[PATH_MAX];
+    char samples_filename[PATH_MAX];
+    const char *id_source = NULL;
+    char *xlator_name;
+    char *instance_name;
+    char *cursor;
+    gf_boolean_t log_stats_fopen_failure = _gf_true;
+    gf_boolean_t log_samples_fopen_failure = _gf_true;
+    int old_cancel_type;
+
+    /* Keep the buffers printable even if snprintf() fails outright. */
+    stats_filename[0] = '\0';
+    samples_filename[0] = '\0';
+
+    conf = this->private;
+    gf_log(this->name, GF_LOG_INFO,
+           "IO stats dump thread started, "
+           "polling IO stats every %d seconds",
+           conf->ios_dump_interval);
+
+    /* unique_id is treated as optional elsewhere in this file
+     * (conditional_dump falls back to this->name), so do the same here
+     * instead of handing a NULL pointer to strdupa(). */
+    id_source = conf->unique_id ? conf->unique_id : this->name;
+    xlator_name = strdupa(id_source ? id_source : "");
+    /* Single pass; '/' cannot appear inside a file name component. */
+    for (cursor = xlator_name; *cursor != '\0'; cursor++) {
+        if (*cursor == '/')
+            *cursor = '_';
+    }
+    instance_name = this->instance_name;
+    if (this->name && strcmp(this->name, "glustershd") == 0) {
+        xlator_name = "shd";
+    } else if (this->prev && strcmp(this->prev->name, "nfs-server") == 0) {
+        xlator_name = "nfsd";
+        instance_name = this->prev->instance_name;
+    }
+    if (sys_mkdir(_IOS_DUMP_DIR, S_IRWXU | S_IRWXO | S_IRWXG) == (-1)) {
+        if (errno != EEXIST) {
+            gf_log(this->name, GF_LOG_ERROR,
+                   "could not create stats-dump directory %s", _IOS_DUMP_DIR);
+            goto out;
+        }
+    }
+    if (sys_mkdir(_IOS_SAMP_DIR, S_IRWXU | S_IRWXO | S_IRWXG) == (-1)) {
+        if (errno != EEXIST) {
+            gf_log(this->name, GF_LOG_ERROR,
+                   "could not create stats-sample directory %s", _IOS_SAMP_DIR);
+            goto out;
+        }
+    }
+    if (instance_name) {
+        stats_bytes_written = snprintf(stats_filename, PATH_MAX,
+                                       "%s/%s_%s_%s.dump", _IOS_DUMP_DIR,
+                                       __progname, xlator_name, instance_name);
+        samples_bytes_written = snprintf(
+            samples_filename, PATH_MAX, "%s/%s_%s_%s.samp", _IOS_SAMP_DIR,
+            __progname, xlator_name, instance_name);
+    } else {
+        stats_bytes_written = snprintf(stats_filename, PATH_MAX,
+                                       "%s/%s_%s.dump", _IOS_DUMP_DIR,
+                                       __progname, xlator_name);
+        samples_bytes_written = snprintf(samples_filename, PATH_MAX,
+                                         "%s/%s_%s.samp", _IOS_SAMP_DIR,
+                                         __progname, xlator_name);
+    }
+    /* snprintf() reports truncation with a value >= the buffer size and
+     * an output error with a negative value; neither yields a usable
+     * path. */
+    if ((stats_bytes_written < 0) || (stats_bytes_written >= PATH_MAX) ||
+        (samples_bytes_written < 0) || (samples_bytes_written >= PATH_MAX)) {
+        gf_log(this->name, GF_LOG_ERROR,
+               "Invalid path for stats dump (%s) and/or latency "
+               "samples (%s)",
+               stats_filename, samples_filename);
+        goto out;
+    }
+    while (1) {
+        if (conf->dump_thread_should_die)
+            break;
+        /* Allow immediate cancellation only while sleeping, so that a
+         * cancel request cannot interrupt a dump half-way through. */
+        (void)pthread_setcanceltype(PTHREAD_CANCEL_ASYNCHRONOUS,
+                                    &old_cancel_type);
+        sleep(conf->ios_dump_interval);
+        (void)pthread_setcanceltype(PTHREAD_CANCEL_DEFERRED, &old_cancel_type);
+        /*
+         * It's not clear whether we should reopen this each time, or
+         * just hold it open and rewind/truncate on each iteration.
+         * Leaving it alone for now.
+         */
+        stats_logfp = fopen(stats_filename, "w+");
+        if (stats_logfp) {
+            (void)ios_dump_args_init(&args, conf->dump_format, stats_logfp);
+            io_stats_dump(this, &args, GF_IOS_INFO_ALL, _gf_false);
+            fclose(stats_logfp);
+            log_stats_fopen_failure = _gf_true;
+        } else if (log_stats_fopen_failure) {
+            gf_log(this->name, GF_LOG_ERROR,
+                   "could not open stats-dump file %s (%s)", stats_filename,
+                   strerror(errno));
+            log_stats_fopen_failure = _gf_false;
+        }
+        samples_logfp = fopen(samples_filename, "w+");
+        if (samples_logfp) {
+            io_stats_dump_latency_samples_logfp(this, samples_logfp);
+            fclose(samples_logfp);
+            log_samples_fopen_failure = _gf_true;
+        } else if (log_samples_fopen_failure) {
+            gf_log(this->name, GF_LOG_ERROR,
+                   "could not open samples-dump file %s (%s)", samples_filename,
+                   strerror(errno));
+            log_samples_fopen_failure = _gf_false;
+        }
+    }
+out:
+    conf->dump_thread_running = _gf_false;
+    gf_log(this->name, GF_LOG_INFO, "IO stats dump thread terminated");
+    return NULL;
+}
+
+/*
+ * Match predicate handed to dict_foreach_match(): reports whether the key
+ * @k matches the glob "*io*stat*dump".  @d, @val and @mdata are not used.
+ */
+static gf_boolean_t
+match_special_xattr(dict_t *d, char *k, data_t *val, void *mdata)
+{
+    return (fnmatch("*io*stat*dump", k, 0) == 0) ? _gf_true : _gf_false;
+}
+
+/*
+ * setxattr fop.
+ *
+ * Before winding, every key of @dict is tested with match_special_xattr()
+ * and the matching entries are passed to conditional_dump() along with a
+ * small context built from @this and @loc.  The return value of
+ * dict_foreach_match() is deliberately ignored.  The request is then wound
+ * to the first child with io_stats_setxattr_cbk as the callback.
+ *
+ * Always returns 0.
+ */
+int
+io_stats_setxattr(call_frame_t *frame, xlator_t *this, loc_t *loc, dict_t *dict,
+                  int32_t flags, dict_t *xdata)
+{
+    /* Opaque argument given to conditional_dump() for each matching key. */
+    struct {
+        xlator_t *this;
+        inode_t *inode;
+        const char *path;
+    } stub;
+
+    stub.this = this;
+    stub.inode = loc->inode;
+    stub.path = loc->path;
+
+    (void)dict_foreach_match(dict, match_special_xattr, NULL, conditional_dump,
+                             &stub);
+
+    START_FOP_LATENCY(frame);
+
+    STACK_WIND(frame, io_stats_setxattr_cbk, FIRST_CHILD(this),
+               FIRST_CHILD(this)->fops->setxattr, loc, dict, flags, xdata);
+    return 0;
+}
+
+/*
+ * getxattr fop: runs START_FOP_LATENCY on the frame, then winds the call
+ * unchanged to the first child with io_stats_getxattr_cbk as the callback.
+ * Always returns 0.
+ */
+int
+io_stats_getxattr(call_frame_t *frame, xlator_t *this, loc_t *loc,
+                  const char *name, dict_t *xdata)
+{
+    START_FOP_LATENCY(frame);
+
+    STACK_WIND(frame, io_stats_getxattr_cbk, FIRST_CHILD(this),
+               FIRST_CHILD(this)->fops->getxattr, loc, name, xdata);
+    return 0;
+}
+
+/*
+ * removexattr fop: runs START_FOP_LATENCY on the frame, then winds the call
+ * unchanged to the first child with io_stats_removexattr_cbk as the
+ * callback.  Always returns 0.
+ */
+int
+io_stats_removexattr(call_frame_t *frame, xlator_t *this, loc_t *loc,
+                     const char *name, dict_t *xdata)
+{
+    START_FOP_LATENCY(frame);
+
+    STACK_WIND(frame, io_stats_removexattr_cbk, FIRST_CHILD(this),
+               FIRST_CHILD(this)->fops->removexattr, loc, name, xdata);
+    return 0;
+}
+
+/*
+ * fsetxattr fop: runs START_FOP_LATENCY on the frame, then winds the call
+ * unchanged to the first child with io_stats_fsetxattr_cbk as the callback.
+ * Unlike io_stats_setxattr(), the keys of @dict are not inspected here.
+ * Always returns 0.
+ */
+int
+io_stats_fsetxattr(call_frame_t *frame, xlator_t *this, fd_t *fd, dict_t *dict,
+                   int32_t flags, dict_t *xdata)
+{
+    START_FOP_LATENCY(frame);
+
+    STACK_WIND(frame, io_stats_fsetxattr_cbk, FIRST_CHILD(this),
+               FIRST_CHILD(this)->fops->fsetxattr, fd, dict, flags, xdata);
+    return 0;
+}
+
+/*
+ * fgetxattr fop: runs START_FOP_LATENCY on the frame, then winds the call
+ * unchanged to the first child with io_stats_fgetxattr_cbk as the callback.
+ * Always returns 0.
+ */
+int
+io_stats_fgetxattr(call_frame_t *frame, xlator_t *this, fd_t *fd,
+                   const char *name, dict_t *xdata)
+{
+    START_FOP_LATENCY(frame);
+
+    STACK_WIND(frame, io_stats_fgetxattr_cbk, FIRST_CHILD(this),
+               FIRST_CHILD(this)->fops->fgetxattr, fd, name, xdata);
+    return 0;
+}
+
+/*
+ * fremovexattr fop: runs START_FOP_LATENCY on the frame, then winds the
+ * call unchanged to the first child with io_stats_fremovexattr_cbk as the
+ * callback.  Always returns 0.
+ */
+int
+io_stats_fremovexattr(call_frame_t *frame, xlator_t *this, fd_t *fd,
+                      const char *name, dict_t *xdata)
+{
+    START_FOP_LATENCY(frame);
+
+    STACK_WIND(frame, io_stats_fremovexattr_cbk, FIRST_CHILD(this),
+               FIRST_CHILD(this)->fops->fremovexattr, fd, name, xdata);
+    return 0;
+}
+
+/*
+ * opendir fop: runs START_FOP_LATENCY on the frame, then winds the call
+ * unchanged to the first child with io_stats_opendir_cbk as the callback.
+ * Always returns 0.
+ */
+int
+io_stats_opendir(call_frame_t *frame, xlator_t *this, loc_t *loc, fd_t *fd,
+                 dict_t *xdata)
+{
+    START_FOP_LATENCY(frame);
+
+    STACK_WIND(frame, io_stats_opendir_cbk, FIRST_CHILD(this),
+               FIRST_CHILD(this)->fops->opendir, loc, fd, xdata);
+    return 0;
+}
+
+/*
+ * readdirp fop: stores fd->inode in frame->local, runs START_FOP_LATENCY on
+ * the frame, then winds the call to the first child with
+ * io_stats_readdirp_cbk as the callback.  Always returns 0.
+ */
+int
+io_stats_readdirp(call_frame_t *frame, xlator_t *this, fd_t *fd, size_t size,
+                  off_t offset, dict_t *dict)
+{
+    /* Plain pointer copy; no inode reference is taken in this function. */
+    frame->local = fd->inode;
+    START_FOP_LATENCY(frame);
+
+    STACK_WIND(frame, io_stats_readdirp_cbk, FIRST_CHILD(this),
+               FIRST_CHILD(this)->fops->readdirp, fd, size, offset, dict);
+    return 0;
+}
+
+/*
+ * readdir fop: runs START_FOP_LATENCY on the frame, then winds the call
+ * unchanged to the first child with io_stats_readdir_cbk as the callback.
+ * Always returns 0.
+ */
+int
+io_stats_readdir(call_frame_t *frame, xlator_t *this, fd_t *fd, size_t size,
+                 off_t offset, dict_t *xdata)
+{
+    START_FOP_LATENCY(frame);
+
+    STACK_WIND(frame, io_stats_readdir_cbk, FIRST_CHILD(this),
+               FIRST_CHILD(this)->fops->readdir, fd, size, offset, xdata);
+    return 0;
+}
+
+/*
+ * fsyncdir fop: runs START_FOP_LATENCY on the frame, then winds the call
+ * unchanged to the first child with io_stats_fsyncdir_cbk as the callback.
+ * Always returns 0.
+ */
+int
+io_stats_fsyncdir(call_frame_t *frame, xlator_t *this, fd_t *fd,
+                  int32_t datasync, dict_t *xdata)
+{
+    START_FOP_LATENCY(frame);
+
+    STACK_WIND(frame, io_stats_fsyncdir_cbk, FIRST_CHILD(this),
+               FIRST_CHILD(this)->fops->fsyncdir, fd, datasync, xdata);
+    return 0;
+}
+
+/*
+ * access fop: runs START_FOP_LATENCY on the frame, then winds the call
+ * unchanged to the first child with io_stats_access_cbk as the callback.
+ * Always returns 0.
+ */
+int
+io_stats_access(call_frame_t *frame, xlator_t *this, loc_t *loc, int32_t mask,
+                dict_t *xdata)
+{
+    START_FOP_LATENCY(frame);
+
+    STACK_WIND(frame, io_stats_access_cbk, FIRST_CHILD(this),
+               FIRST_CHILD(this)->fops->access, loc, mask, xdata);
+    return 0;
+}
+
+/*
+ * ftruncate fop: runs START_FOP_LATENCY on the frame, then winds the call
+ * unchanged to the first child with io_stats_ftruncate_cbk as the callback.
+ * Always returns 0.
+ */
+int
+io_stats_ftruncate(call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset,
+                   dict_t *xdata)
+{
+    START_FOP_LATENCY(frame);
+
+    STACK_WIND(frame, io_stats_ftruncate_cbk, FIRST_CHILD(this),
+               FIRST_CHILD(this)->fops->ftruncate, fd, offset, xdata);
+    return 0;
+}
+
+/*
+ * fsetattr fop: runs START_FOP_LATENCY on the frame, then winds the call
+ * unchanged to the first child's fsetattr.  Always returns 0.
+ *
+ * NOTE(review): the callback used is io_stats_setattr_cbk (the setattr
+ * one), not an fsetattr-specific callback -- confirm this sharing is
+ * intended, since it decides which fop the reply is accounted/unwound as.
+ */
+int
+io_stats_fsetattr(call_frame_t *frame, xlator_t *this, fd_t *fd,
+                  struct iatt *stbuf, int32_t valid, dict_t *xdata)
+{
+    START_FOP_LATENCY(frame);
+
+    STACK_WIND(frame, io_stats_setattr_cbk, FIRST_CHILD(this),
+               FIRST_CHILD(this)->fops->fsetattr, fd, stbuf, valid, xdata);
+    return 0;
+}
+
+/*
+ * fstat fop: runs START_FOP_LATENCY on the frame, then winds the call
+ * unchanged to the first child with io_stats_fstat_cbk as the callback.
+ * Always returns 0.
+ */
+int
+io_stats_fstat(call_frame_t *frame, xlator_t *this, fd_t *fd, dict_t *xdata)
+{
+    START_FOP_LATENCY(frame);
+
+    STACK_WIND(frame, io_stats_fstat_cbk, FIRST_CHILD(this),
+               FIRST_CHILD(this)->fops->fstat, fd, xdata);
+    return 0;
+}
+
+/*
+ * fallocate fop: runs START_FOP_LATENCY on the frame, then winds the call
+ * unchanged to the first child with io_stats_fallocate_cbk as the callback.
+ * Always returns 0.
+ */
+int
+io_stats_fallocate(call_frame_t *frame, xlator_t *this, fd_t *fd, int32_t mode,
+                   off_t offset, size_t len, dict_t *xdata)
+{
+    START_FOP_LATENCY(frame);
+
+    STACK_WIND(frame, io_stats_fallocate_cbk, FIRST_CHILD(this),
+               FIRST_CHILD(this)->fops->fallocate, fd, mode, offset, len,
+               xdata);
+
+    return 0;
+}
+
+/*
+ * discard fop: runs START_FOP_LATENCY on the frame, then winds the call
+ * unchanged to the first child with io_stats_discard_cbk as the callback.
+ * Always returns 0.
+ */
+int
+io_stats_discard(call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset,
+                 size_t len, dict_t *xdata)
+{
+    START_FOP_LATENCY(frame);
+
+    STACK_WIND(frame, io_stats_discard_cbk, FIRST_CHILD(this),
+               FIRST_CHILD(this)->fops->discard, fd, offset, len, xdata);
+
+    return 0;
+}
+
+/*
+ * zerofill fop: runs START_FOP_LATENCY on the frame, then winds the call
+ * unchanged to the first child with io_stats_zerofill_cbk as the callback.
+ * Always returns 0.
+ */
+int
+io_stats_zerofill(call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset,
+                  off_t len, dict_t *xdata)
+{
+    START_FOP_LATENCY(frame);
+
+    STACK_WIND(frame, io_stats_zerofill_cbk, FIRST_CHILD(this),
+               FIRST_CHILD(this)->fops->zerofill, fd, offset, len, xdata);
+
+    return 0;
+}
+
+/*
+ * ipc fop: runs START_FOP_LATENCY on the frame, then winds the call
+ * unchanged to the first child with io_stats_ipc_cbk as the callback.
+ * Always returns 0.
+ */
+int32_t
+io_stats_ipc(call_frame_t *frame, xlator_t *this, int32_t op, dict_t *xdata)
+{
+    START_FOP_LATENCY(frame);
+
+    STACK_WIND(frame, io_stats_ipc_cbk, FIRST_CHILD(this),
+               FIRST_CHILD(this)->fops->ipc, op, xdata);
+    return 0;
+}
+
+/*
+ * lk fop: runs START_FOP_LATENCY on the frame, then winds the call
+ * unchanged to the first child with io_stats_lk_cbk as the callback.
+ * Always returns 0.
+ */
+int
+io_stats_lk(call_frame_t *frame, xlator_t *this, fd_t *fd, int32_t cmd,
+            struct gf_flock *lock, dict_t *xdata)
+{
+    START_FOP_LATENCY(frame);
+
+    STACK_WIND(frame, io_stats_lk_cbk, FIRST_CHILD(this),
+               FIRST_CHILD(this)->fops->lk, fd, cmd, lock, xdata);
+    return 0;
+}
+
+/*
+ * rchecksum fop: runs START_FOP_LATENCY on the frame, then winds the call
+ * unchanged to the first child with io_stats_rchecksum_cbk as the callback.
+ * Always returns 0.
+ */
+int
+io_stats_rchecksum(call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset,
+                   int32_t len, dict_t *xdata)
+{
+    START_FOP_LATENCY(frame);
+
+    STACK_WIND(frame, io_stats_rchecksum_cbk, FIRST_CHILD(this),
+               FIRST_CHILD(this)->fops->rchecksum, fd, offset, len, xdata);
+    return 0;
+}
+
+/*
+ * seek fop: runs START_FOP_LATENCY on the frame, then winds the call
+ * unchanged to the first child with io_stats_seek_cbk as the callback.
+ * Always returns 0.
+ */
+int
+io_stats_seek(call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset,
+              gf_seek_what_t what, dict_t *xdata)
+{
+    START_FOP_LATENCY(frame);
+
+    STACK_WIND(frame, io_stats_seek_cbk, FIRST_CHILD(this),
+               FIRST_CHILD(this)->fops->seek, fd, offset, what, xdata);
+    return 0;
+}
+
+/*
+ * lease fop: runs START_FOP_LATENCY on the frame, then winds the call
+ * unchanged to the first child with io_stats_lease_cbk as the callback.
+ * Always returns 0.
+ */
+int
+io_stats_lease(call_frame_t *frame, xlator_t *this, loc_t *loc,
+               struct gf_lease *lease, dict_t *xdata)
+{
+    START_FOP_LATENCY(frame);
+
+    STACK_WIND(frame, io_stats_lease_cbk, FIRST_CHILD(this),
+               FIRST_CHILD(this)->fops->lease, loc, lease, xdata);
+    return 0;
+}
+
+/*
+ * getactivelk fop: runs START_FOP_LATENCY on the frame, then winds the call
+ * unchanged to the first child with io_stats_getactivelk_cbk as the
+ * callback.  Always returns 0.
+ */
+int
+io_stats_getactivelk(call_frame_t *frame, xlator_t *this, loc_t *loc,
+                     dict_t *xdata)
+{
+    START_FOP_LATENCY(frame);
+
+    STACK_WIND(frame, io_stats_getactivelk_cbk, FIRST_CHILD(this),
+               FIRST_CHILD(this)->fops->getactivelk, loc, xdata);
+    return 0;
+}
+
+/*
+ * setactivelk fop: runs START_FOP_LATENCY on the frame, then winds the call
+ * unchanged to the first child with io_stats_setactivelk_cbk as the
+ * callback.  Always returns 0.
+ */
+int
+io_stats_setactivelk(call_frame_t *frame, xlator_t *this, loc_t *loc,
+                     lock_migration_info_t *locklist, dict_t *xdata)
+{
+    START_FOP_LATENCY(frame);
+
+    STACK_WIND(frame, io_stats_setactivelk_cbk, FIRST_CHILD(this),
+               FIRST_CHILD(this)->fops->setactivelk, loc, locklist, xdata);
+    return 0;
+}
+
+/*
+ * compound fop: runs START_FOP_LATENCY on the frame, then winds the call
+ * unchanged to the first child with io_stats_compound_cbk as the callback.
+ * @args is passed through opaquely.  Always returns 0.
+ */
+int
+io_stats_compound(call_frame_t *frame, xlator_t *this, void *args,
+                  dict_t *xdata)
+{
+    START_FOP_LATENCY(frame);
+
+    STACK_WIND(frame, io_stats_compound_cbk, FIRST_CHILD(this),
+               FIRST_CHILD(this)->fops->compound, args, xdata);
+    return 0;
+}
+
+/*
+ * release callback for an fd.
+ *
+ * Bumps the RELEASE counter, decrements the cumulative open-fd count under
+ * conf->lock (when the translator still has a private conf), and, if an
+ * ios_fd context is found for @fd, dumps its statistics and frees it along
+ * with its filename.  Always returns 0.
+ */
+int
+io_stats_release(xlator_t *this, fd_t *fd)
+{
+    struct ios_conf *conf = NULL;
+    struct ios_fd *iosfd = NULL;
+
+    BUMP_FOP(RELEASE);
+
+    conf = this->private;
+    if (conf != NULL) {
+        LOCK(&conf->lock);
+        conf->cumulative.nr_opens--;
+        UNLOCK(&conf->lock);
+    }
+
+    ios_fd_ctx_get(fd, this, &iosfd);
+    if (iosfd == NULL)
+        return 0;
+
+    io_stats_dump_fd(this, iosfd);
+
+    GF_FREE(iosfd->filename);
+    GF_FREE(iosfd);
+
+    return 0;
+}
+
+/*
+ * releasedir callback: only bumps the RELEASEDIR counter via BUMP_FOP.
+ * @fd is not used.  Always returns 0.
+ */
+int
+io_stats_releasedir(xlator_t *this, fd_t *fd)
+{
+    BUMP_FOP(RELEASEDIR);
+
+    return 0;
+}
+
+/*
+ * forget callback: bumps the FORGET counter via BUMP_FOP and then calls
+ * ios_stats_cleanup() for @inode.  Always returns 0.
+ */
+int
+io_stats_forget(xlator_t *this, inode_t *inode)
+{
+    BUMP_FOP(FORGET);
+    ios_stats_cleanup(this, inode);
+    return 0;
+}
+
+/*
+ * Allocate the list head and initialise the lock for every slot of
+ * conf->list[] (IOS_STATS_TYPE_MAX slots) and conf->thru_list[]
+ * (IOS_STATS_THRU_MAX slots).
+ *
+ * Returns 0 on success and -1 as soon as one allocation fails; slots that
+ * were set up before the failure are left untouched.
+ */
+static int
+ios_init_top_stats(struct ios_conf *conf)
+{
+    struct ios_stat_head *head = NULL;
+    int idx = 0;
+
+    GF_ASSERT(conf);
+
+    for (idx = 0; idx < IOS_STATS_TYPE_MAX; idx++) {
+        head = &conf->list[idx];
+
+        head->iosstats = GF_CALLOC(1, sizeof(*head->iosstats),
+                                   gf_io_stats_mt_ios_stat);
+        if (head->iosstats == NULL)
+            return -1;
+
+        INIT_LIST_HEAD(&head->iosstats->list);
+        LOCK_INIT(&head->lock);
+    }
+
+    for (idx = 0; idx < IOS_STATS_THRU_MAX; idx++) {
+        head = &conf->thru_list[idx];
+
+        head->iosstats = GF_CALLOC(1, sizeof(*head->iosstats),
+                                   gf_io_stats_mt_ios_stat);
+        if (head->iosstats == NULL)
+            return -1;
+
+        INIT_LIST_HEAD(&head->iosstats->list);
+        LOCK_INIT(&head->lock);
+    }
+
+    return 0;
+}
+
+/*
+ * Release everything hanging off one top-stats list head: drop the
+ * reference on each tracked ios_stat, unlink and free each list node, then
+ * free the head allocation and reset the pointer to NULL.
+ *
+ * A head whose iosstats pointer is NULL is left alone.
+ */
+static void
+ios_destroy_top_stats_head(struct ios_stat_head *list_head)
+{
+    struct ios_stat_list *entry = NULL;
+    struct ios_stat_list *tmp = NULL;
+
+    if (!list_head->iosstats)
+        return;
+
+    list_for_each_entry_safe(entry, tmp, &list_head->iosstats->list, list)
+    {
+        ios_stat_unref(entry->iosstat);
+        list_del(&entry->list);
+        GF_FREE(entry);
+        list_head->members--;
+    }
+
+    GF_FREE(list_head->iosstats);
+    /* Never leave a dangling pointer behind for a later destroy/init. */
+    list_head->iosstats = NULL;
+}
+
+/*
+ * Tear down the "top" statistics built by ios_init_top_stats().
+ *
+ * Under conf->lock this zeroes the cumulative open-fd counters and then
+ * destroys every slot of conf->list[] and conf->thru_list[].
+ *
+ * The previous NULL test was applied to the address of an array element,
+ * which can never be NULL; the pointer that can actually be NULL is
+ * iosstats (when ios_init_top_stats() failed part way through), and it was
+ * dereferenced unconditionally.  That case is now skipped, and freed
+ * iosstats pointers are cleared so a repeated call cannot double free.
+ */
+static void
+ios_destroy_top_stats(struct ios_conf *conf)
+{
+    int i = 0;
+
+    GF_ASSERT(conf);
+
+    LOCK(&conf->lock);
+
+    conf->cumulative.nr_opens = 0;
+    conf->cumulative.max_nr_opens = 0;
+    conf->cumulative.max_openfd_time.tv_sec = 0;
+    conf->cumulative.max_openfd_time.tv_usec = 0;
+
+    for (i = 0; i < IOS_STATS_TYPE_MAX; i++)
+        ios_destroy_top_stats_head(&conf->list[i]);
+
+    for (i = 0; i < IOS_STATS_THRU_MAX; i++)
+        ios_destroy_top_stats_head(&conf->thru_list[i]);
+
+    UNLOCK(&conf->lock);
+
+    return;
+}
+
+/*
+ * Reset both the cumulative and the incremental global statistics, using a
+ * single timestamp for the two, and set conf->increment back to 0.  The
+ * reset itself is done while holding conf->lock.
+ */
+static void
+io_stats_clear(struct ios_conf *conf)
+{
+    time_t reset_time = 0;
+
+    GF_ASSERT(conf);
+
+    reset_time = gf_time();
+
+    LOCK(&conf->lock);
+    ios_global_stats_clear(&conf->cumulative, reset_time);
+    ios_global_stats_clear(&conf->incremental, reset_time);
+    conf->increment = 0;
+    UNLOCK(&conf->lock);
+}
+
+/*
+ * Statedump hook (installed as dumpops.priv).
+ *
+ * Writes the cumulative and incremental data_read/data_written totals and
+ * then, for every fop index below GF_FOP_MAXVALUE, one line per stats set
+ * of the form "<hits>,<total latency>,<min>,<max>,<avg>" keyed by
+ * "<xlator name>.cumulative.<fop>" / "<xlator name>.incremental.<fop>".
+ *
+ * Returns -1 without writing anything when the translator has no private
+ * conf or when either count_fop_hits or measure_latency is disabled;
+ * returns 0 otherwise.
+ *
+ * The latency fields are read without taking conf->lock.
+ */
+int32_t
+io_priv(xlator_t *this)
+{
+    int i;
+    char key[GF_DUMP_MAX_BUF_LEN];
+    char key_prefix_cumulative[GF_DUMP_MAX_BUF_LEN];
+    char key_prefix_incremental[GF_DUMP_MAX_BUF_LEN];
+    double min, max, avg;
+    uint64_t count, total;
+    struct ios_conf *conf = NULL;
+
+    conf = this->private;
+    if (!conf)
+        return -1;
+
+    if (!conf->count_fop_hits || !conf->measure_latency)
+        return -1;
+
+    gf_proc_dump_write("cumulative.data_read", "%" GF_PRI_ATOMIC,
+                       GF_ATOMIC_GET(conf->cumulative.data_read));
+    gf_proc_dump_write("cumulative.data_written", "%" GF_PRI_ATOMIC,
+                       GF_ATOMIC_GET(conf->cumulative.data_written));
+
+    gf_proc_dump_write("incremental.data_read", "%" GF_PRI_ATOMIC,
+                       GF_ATOMIC_GET(conf->incremental.data_read));
+    gf_proc_dump_write("incremental.data_written", "%" GF_PRI_ATOMIC,
+                       GF_ATOMIC_GET(conf->incremental.data_written));
+
+    snprintf(key_prefix_cumulative, GF_DUMP_MAX_BUF_LEN, "%s.cumulative",
+             this->name);
+    snprintf(key_prefix_incremental, GF_DUMP_MAX_BUF_LEN, "%s.incremental",
+             this->name);
+
+    for (i = 0; i < GF_FOP_MAXVALUE; i++) {
+        count = GF_ATOMIC_GET(conf->cumulative.fop_hits[i]);
+        total = conf->cumulative.latency[i].total;
+        min = conf->cumulative.latency[i].min;
+        max = conf->cumulative.latency[i].max;
+        avg = conf->cumulative.latency[i].avg;
+
+        gf_proc_dump_build_key(key, key_prefix_cumulative, "%s",
+                               (char *)gf_fop_list[i]);
+
+        /* count and total are uint64_t: use the unsigned conversion. */
+        gf_proc_dump_write(key, "%" PRIu64 ",%" PRIu64 ",%.03f,%.03f,%.03f",
+                           count, total, min, max, avg);
+
+        count = GF_ATOMIC_GET(conf->incremental.fop_hits[i]);
+        total = conf->incremental.latency[i].total;
+        min = conf->incremental.latency[i].min;
+        max = conf->incremental.latency[i].max;
+        avg = conf->incremental.latency[i].avg;
+
+        gf_proc_dump_build_key(key, key_prefix_incremental, "%s",
+                               (char *)gf_fop_list[i]);
+
+        gf_proc_dump_write(key, "%" PRIu64 ",%" PRIu64 ",%.03f,%.03f,%.03f",
+                           count, total, min, max, avg);
+    }
+
+    return 0;
+}
+
+/*
+ * Translate the textual dump format ("json", "text", "dict" or "samples")
+ * into the matching IOS_DUMP_TYPE_* code and store it in conf->dump_format.
+ * Any other string leaves conf->dump_format unchanged.
+ */
+static void
+ios_set_log_format_code(struct ios_conf *conf, char *dump_format_str)
+{
+    if (!strcmp(dump_format_str, "json")) {
+        conf->dump_format = IOS_DUMP_TYPE_JSON_FILE;
+        return;
+    }
+
+    if (!strcmp(dump_format_str, "text")) {
+        conf->dump_format = IOS_DUMP_TYPE_FILE;
+        return;
+    }
+
+    if (!strcmp(dump_format_str, "dict")) {
+        conf->dump_format = IOS_DUMP_TYPE_DICT;
+        return;
+    }
+
+    if (!strcmp(dump_format_str, "samples")) {
+        conf->dump_format = IOS_DUMP_TYPE_SAMPLES;
+        return;
+    }
+}
+
+/*
+ * Apply @log_level on behalf of xlator @this.
+ *
+ * A @log_level of -1 is treated as "nothing to apply" and returns at once.
+ *
+ * When brick multiplexing is enabled (ctx->cmd_args.brick_mux) the level is
+ * written directly into the loglevel field of: the first xlator of the
+ * active graph, the xlator of @this's first parent (if any), and every
+ * xlator reachable from @this through ->next.  Otherwise the level is set
+ * for the whole context with gf_log_set_loglevel().
+ *
+ * The active graph is only looked at in the brick-mux branch, and only when
+ * it is present, so a missing ctx or active graph no longer causes a NULL
+ * dereference on paths that never needed it.
+ */
+void
+xlator_set_loglevel(xlator_t *this, int log_level)
+{
+    glusterfs_ctx_t *ctx = NULL;
+    glusterfs_graph_t *active = NULL;
+    xlator_t *trav = this;
+
+    if (log_level == -1)
+        return;
+
+    ctx = this->ctx;
+    GF_ASSERT(ctx);
+    if (!ctx)
+        return;
+
+    if (!ctx->cmd_args.brick_mux) {
+        gf_log_set_loglevel(ctx, log_level);
+        return;
+    }
+
+    /* Set log-level for all brick xlators */
+    active = ctx->active;
+    if (active && active->first)
+        active->first->loglevel = log_level;
+
+    /* Set log-level for parent xlator */
+    if (this->parents)
+        this->parents->xlator->loglevel = log_level;
+
+    while (trav) {
+        trav->loglevel = log_level;
+        trav = trav->next;
+    }
+}
+
+/*
+ * Re-apply translator options at runtime.
+ *
+ * Options are processed in order; the first GF_OPTION_RECONF failure jumps
+ * to "out" and the function returns -1 (options handled before the failure
+ * stay applied).  Besides updating the conf fields this:
+ *   - starts the dump thread when ios-dump-interval goes from <= 0 to > 0,
+ *     and destroys it when the interval goes from > 0 to 0;
+ *   - pushes the logging related options (sys-log-level, log-level, logger,
+ *     log-format, log-buf-size, log-flush-timeout) to the logging layer;
+ *   - hands the "threads" value to gf_async_adjust_threads().
+ *
+ * Returns 0 on success, -1 if @this or its private conf is missing or an
+ * option could not be read, or the gf_thread_create() error code if the
+ * dump thread could not be started.
+ */
+int
+reconfigure(xlator_t *this, dict_t *options)
+{
+    struct ios_conf *conf = NULL;
+    int ret = -1;
+    char *sys_log_str = NULL;
+    char *log_format_str = NULL;
+    char *logger_str = NULL;
+    char *dump_format_str = NULL;
+    int sys_log_level = -1;
+    char *log_str = NULL;
+    int log_level = -1;
+    int log_format = -1;
+    int logger = -1;
+    uint32_t log_buf_size = 0;
+    uint32_t log_flush_timeout = 0;
+    int32_t old_dump_interval;
+    int32_t threads;
+
+    if (!this || !this->private)
+        goto out;
+
+    conf = this->private;
+
+    GF_OPTION_RECONF("dump-fd-stats", conf->dump_fd_stats, options, bool, out);
+
+    GF_OPTION_RECONF("count-fop-hits", conf->count_fop_hits, options, bool,
+                     out);
+
+    GF_OPTION_RECONF("latency-measurement", conf->measure_latency, options,
+                     bool, out);
+
+    /* Remember the old interval to detect an enable/disable transition. */
+    old_dump_interval = conf->ios_dump_interval;
+    GF_OPTION_RECONF("ios-dump-interval", conf->ios_dump_interval, options,
+                     int32, out);
+    if ((old_dump_interval <= 0) && (conf->ios_dump_interval > 0)) {
+        conf->dump_thread_running = _gf_true;
+        conf->dump_thread_should_die = _gf_false;
+        ret = gf_thread_create(&conf->dump_thread, NULL,
+                               (void *)&_ios_dump_thread, this, "iosdump");
+        if (ret) {
+            conf->dump_thread_running = _gf_false;
+            /* The two literals used to run together as "threadwhile". */
+            gf_log(this ? this->name : "io-stats", GF_LOG_ERROR,
+                   "Failed to start thread "
+                   "while reconfigure. Returning %d",
+                   ret);
+            goto out;
+        }
+    } else if ((old_dump_interval > 0) && (conf->ios_dump_interval == 0)) {
+        _ios_destroy_dump_thread(conf);
+    }
+
+    GF_OPTION_RECONF("ios-sample-interval", conf->ios_sample_interval, options,
+                     int32, out);
+    GF_OPTION_RECONF("ios-dump-format", dump_format_str, options, str, out);
+    ios_set_log_format_code(conf, dump_format_str);
+    GF_OPTION_RECONF("ios-sample-buf-size", conf->ios_sample_buf_size, options,
+                     int32, out);
+    GF_OPTION_RECONF("sys-log-level", sys_log_str, options, str, out);
+    if (sys_log_str) {
+        sys_log_level = glusterd_check_log_level(sys_log_str);
+        set_sys_log_level(sys_log_level);
+    }
+
+    GF_OPTION_RECONF("log-level", log_str, options, str, out);
+    if (log_str) {
+        log_level = glusterd_check_log_level(log_str);
+        /* Set loglevel for all children and server xlators */
+        xlator_set_loglevel(this, log_level);
+    }
+
+    GF_OPTION_RECONF("logger", logger_str, options, str, out);
+    if (logger_str) {
+        logger = gf_check_logger(logger_str);
+        gf_log_set_logger(logger);
+    }
+
+    GF_OPTION_RECONF("log-format", log_format_str, options, str, out);
+    if (log_format_str) {
+        log_format = gf_check_log_format(log_format_str);
+        gf_log_set_logformat(log_format);
+    }
+
+    GF_OPTION_RECONF("log-buf-size", log_buf_size, options, uint32, out);
+    gf_log_set_log_buf_size(log_buf_size);
+
+    GF_OPTION_RECONF("log-flush-timeout", log_flush_timeout, options, time,
+                     out);
+    gf_log_set_log_flush_timeout(log_flush_timeout);
+
+    GF_OPTION_RECONF("threads", threads, options, int32, out);
+    gf_async_adjust_threads(threads);
+
+    ret = 0;
+out:
+    /* "this" may be NULL when we get here from the very first check. */
+    gf_log(this ? this->name : "io-stats", GF_LOG_DEBUG,
+           "reconfigure returning %d", ret);
+    return ret;
+}
+
+/*
+ * Set up memory accounting for this translator with room for
+ * gf_io_stats_mt_end + 1 types.
+ *
+ * Returns -1 when @this is NULL, otherwise whatever
+ * xlator_mem_acct_init() returned (an error is logged when it is non-zero).
+ */
+int32_t
+mem_acct_init(xlator_t *this)
+{
+    int ret;
+
+    if (this == NULL)
+        return -1;
+
+    ret = xlator_mem_acct_init(this, gf_io_stats_mt_end + 1);
+    if (ret != 0)
+        gf_log(this->name, GF_LOG_ERROR, "Memory accounting init failed");
+
+    return ret;
+}
+
+/*
+ * Free a translator conf and everything it owns.  A NULL @conf is a no-op.
+ *
+ * This is reached both from fini() and from the error path of init(), where
+ * the conf may be only partially set up: the sample buffer and the DNS
+ * cache can still be NULL at that point, so they are only torn down when
+ * present (their destroy helpers are not visible here, so NULL tolerance is
+ * not assumed).
+ *
+ * Both locks initialised by init() -- conf->lock and
+ * conf->ios_sampling_lock -- are destroyed; previously the sampling lock
+ * was never destroyed.
+ */
+void
+ios_conf_destroy(struct ios_conf *conf)
+{
+    if (!conf)
+        return;
+
+    ios_destroy_top_stats(conf);
+    _ios_destroy_dump_thread(conf);
+
+    if (conf->ios_sample_buf)
+        ios_destroy_sample_buf(conf->ios_sample_buf);
+
+    LOCK_DESTROY(&conf->ios_sampling_lock);
+    LOCK_DESTROY(&conf->lock);
+
+    if (conf->dnscache)
+        gf_dnscache_deinit(conf->dnscache);
+
+    GF_FREE(conf);
+}
+
+/*
+ * Put one global statistics set into its initial state: all atomic
+ * counters (data read/written, per-block-size read/write counts, per-fop
+ * hits, per-upcall hits) are initialised to 0 and started_at is set to the
+ * current time.
+ */
+static void
+ios_init_stats(struct ios_global_stats *stats)
+{
+    int idx = 0;
+
+    GF_ATOMIC_INIT(stats->data_read, 0);
+    GF_ATOMIC_INIT(stats->data_written, 0);
+
+    for (idx = 0; idx < IOS_BLOCK_COUNT_SIZE; idx++) {
+        GF_ATOMIC_INIT(stats->block_count_write[idx], 0);
+        GF_ATOMIC_INIT(stats->block_count_read[idx], 0);
+    }
+
+    for (idx = 0; idx < GF_FOP_MAXVALUE; idx++) {
+        GF_ATOMIC_INIT(stats->fop_hits[idx], 0);
+    }
+
+    for (idx = 0; idx < GF_UPCALL_FLAGS_MAXVALUE; idx++) {
+        GF_ATOMIC_INIT(stats->upcall_hits[idx], 0);
+    }
+
+    stats->started_at = gf_time();
+}
+
+/*
+ * Translator initialisation.
+ *
+ * Allocates the private conf, initialises its locks and statistics, reads
+ * every option, pushes the logging related options to the logging layer
+ * and, when ios-dump-interval is > 0, starts the periodic dump thread.
+ *
+ * Returns 0 on success.  On failure the conf is destroyed, this->private is
+ * reset to NULL (it used to be left pointing at the freed conf when the
+ * dump thread could not be created) and a non-zero value is returned.
+ */
+int
+init(xlator_t *this)
+{
+    struct ios_conf *conf = NULL;
+    char *volume_id = NULL;
+    char *sys_log_str = NULL;
+    char *logger_str = NULL;
+    char *log_format_str = NULL;
+    char *dump_format_str = NULL;
+    int logger = -1;
+    int log_format = -1;
+    int sys_log_level = -1;
+    char *log_str = NULL;
+    int log_level = -1;
+    int ret = -1;
+    uint32_t log_buf_size = 0;
+    uint32_t log_flush_timeout = 0;
+    int32_t threads;
+
+    if (!this)
+        return -1;
+
+    if (!this->children) {
+        gf_log(this->name, GF_LOG_ERROR,
+               "io_stats translator requires at least one subvolume");
+        return -1;
+    }
+
+    if (!this->parents) {
+        /* This is very much valid as io-stats currently is loaded
+         * on top of volumes on both client and server, hence this is
+         * not an warning message */
+        gf_log(this->name, GF_LOG_DEBUG, "dangling volume. check volfile ");
+    }
+
+    conf = GF_CALLOC(1, sizeof(*conf), gf_io_stats_mt_ios_conf);
+
+    if (!conf)
+        goto out;
+
+    if (dict_get_str(this->options, "unique-id", &conf->unique_id) != 0) {
+        /* This is always set on servers, so we must be a client. */
+        conf->unique_id = this->name;
+    }
+
+    ret = dict_get_strn(this->options, "volume-id", SLEN("volume-id"),
+                        &volume_id);
+    if (!ret) {
+        strncpy(this->graph->volume_id, volume_id, GF_UUID_BUF_SIZE);
+    }
+    /*
+     * Init it just after calloc, so that we are sure the lock is inited
+     * in case of error paths.
+     */
+    LOCK_INIT(&conf->lock);
+    LOCK_INIT(&conf->ios_sampling_lock);
+
+    ios_init_stats(&conf->cumulative);
+    ios_init_stats(&conf->incremental);
+
+    ret = ios_init_top_stats(conf);
+    if (ret)
+        goto out;
+
+    GF_OPTION_INIT("dump-fd-stats", conf->dump_fd_stats, bool, out);
+
+    GF_OPTION_INIT("count-fop-hits", conf->count_fop_hits, bool, out);
+
+    GF_OPTION_INIT("latency-measurement", conf->measure_latency, bool, out);
+
+    GF_OPTION_INIT("ios-dump-interval", conf->ios_dump_interval, int32, out);
+
+    GF_OPTION_INIT("ios-sample-interval", conf->ios_sample_interval, int32,
+                   out);
+
+    GF_OPTION_INIT("ios-dump-format", dump_format_str, str, out);
+    ios_set_log_format_code(conf, dump_format_str);
+
+    GF_OPTION_INIT("ios-sample-buf-size", conf->ios_sample_buf_size, int32,
+                   out);
+
+    ret = ios_init_sample_buf(conf);
+    if (ret) {
+        gf_log(this->name, GF_LOG_ERROR, "Out of memory.");
+        goto out;
+    }
+
+    GF_OPTION_INIT("ios-dnscache-ttl-sec", conf->ios_dnscache_ttl_sec, int32,
+                   out);
+    conf->dnscache = gf_dnscache_init(conf->ios_dnscache_ttl_sec);
+    if (!conf->dnscache) {
+        ret = -1;
+        goto out;
+    }
+
+    GF_OPTION_INIT("sys-log-level", sys_log_str, str, out);
+    if (sys_log_str) {
+        sys_log_level = glusterd_check_log_level(sys_log_str);
+        set_sys_log_level(sys_log_level);
+    }
+
+    GF_OPTION_INIT("log-level", log_str, str, out);
+    if (log_str) {
+        log_level = glusterd_check_log_level(log_str);
+        if (DEFAULT_LOG_LEVEL != log_level)
+            gf_log_set_loglevel(this->ctx, log_level);
+    }
+
+    GF_OPTION_INIT("logger", logger_str, str, out);
+    if (logger_str) {
+        logger = gf_check_logger(logger_str);
+        gf_log_set_logger(logger);
+    }
+
+    GF_OPTION_INIT("log-format", log_format_str, str, out);
+    if (log_format_str) {
+        log_format = gf_check_log_format(log_format_str);
+        gf_log_set_logformat(log_format);
+    }
+
+    GF_OPTION_INIT("log-buf-size", log_buf_size, uint32, out);
+    gf_log_set_log_buf_size(log_buf_size);
+
+    GF_OPTION_INIT("log-flush-timeout", log_flush_timeout, time, out);
+    gf_log_set_log_flush_timeout(log_flush_timeout);
+
+    GF_OPTION_INIT("threads", threads, int32, out);
+    gf_async_adjust_threads(threads);
+
+    /* Published before the thread starts: the thread is handed "this". */
+    this->private = conf;
+    if (conf->ios_dump_interval > 0) {
+        conf->dump_thread_running = _gf_true;
+        conf->dump_thread_should_die = _gf_false;
+        ret = gf_thread_create(&conf->dump_thread, NULL,
+                               (void *)&_ios_dump_thread, this, "iosdump");
+        if (ret) {
+            conf->dump_thread_running = _gf_false;
+            /* The two literals used to run together as "threadin". */
+            gf_log(this ? this->name : "io-stats", GF_LOG_ERROR,
+                   "Failed to start thread "
+                   "in init. Returning %d",
+                   ret);
+            goto out;
+        }
+    }
+    return 0;
+out:
+    /* conf is about to be freed: make sure nothing keeps pointing at it. */
+    this->private = NULL;
+    ios_conf_destroy(conf);
+    return ret;
+}
+
+/*
+ * Translator teardown: destroys the private conf (if any), clears
+ * this->private and logs that the translator was unloaded.  A NULL @this
+ * is ignored.
+ */
+void
+fini(xlator_t *this)
+{
+    if (this == NULL)
+        return;
+
+    ios_conf_destroy(this->private);
+    this->private = NULL;
+
+    gf_log(this->name, GF_LOG_INFO, "io-stats translator unloaded");
+}
+
+/*
+ * Event handler for the translator.
+ *
+ * GF_EVENT_TRANSLATOR_INFO: @data is the request dict and the first
+ * variadic argument is the output dict.  Depending on the request this
+ *   - clears the "top" statistics ("clear-stats" is true),
+ *   - dumps one "top" list into the output dict ("top-op" is present), or
+ *   - handles an "info-op" request (clear the global stats, or dump them
+ *     into the output dict, optionally as a "peek").
+ *
+ * GF_EVENT_UPCALL: @data is a struct gf_upcall; the matching upcall
+ * counters are bumped and the event is passed on with default_notify().
+ *
+ * Every other event is passed on with default_notify().
+ *
+ * Returns the status of the TRANSLATOR_INFO handling, and 0 for the other
+ * events (the return value of default_notify() is not propagated).
+ */
+int
+notify(xlator_t *this, int32_t event, void *data, ...)
+{
+    int ret = 0;
+    struct ios_dump_args args = {0};
+    dict_t *output = NULL;
+    dict_t *dict = NULL;
+    int32_t op = 0;
+    int32_t list_cnt = 0;
+    double throughput = 0;
+    double time = 0;
+    gf_boolean_t is_peek = _gf_false;
+    va_list ap;
+    struct gf_upcall *up_data = NULL;
+    struct gf_upcall_cache_invalidation *up_ci = NULL;
+
+    dict = data;
+    va_start(ap, data);
+    output = va_arg(ap, dict_t *);
+    va_end(ap);
+    switch (event) {
+        case GF_EVENT_TRANSLATOR_INFO:
+            ret = dict_get_str_boolean(dict, "clear-stats", _gf_false);
+            /*
+             * dict_get_str_boolean() can return -1 on error (see the
+             * "peek" handling below); only a positive value means the
+             * caller really asked for the statistics to be cleared.
+             */
+            if (ret > 0) {
+                ret = dict_set_int32(output, "top-op", op);
+                if (ret) {
+                    gf_log(this->name, GF_LOG_ERROR,
+                           "Failed to set top-op in dict");
+                    goto out;
+                }
+                ios_destroy_top_stats(this->private);
+                ret = ios_init_top_stats(this->private);
+                if (ret)
+                    gf_log(this->name, GF_LOG_ERROR,
+                           "Failed to reset top stats");
+                ret = dict_set_int32(output, "stats-cleared", ret ? 0 : 1);
+                if (ret)
+                    gf_log(this->name, GF_LOG_ERROR,
+                           "Failed to set stats-cleared"
+                           " in dict");
+                goto out;
+            }
+
+            ret = dict_get_int32(dict, "top-op", &op);
+            if (!ret) {
+                /* list_cnt keeps its default of 0 if the key is absent. */
+                ret = dict_get_int32(dict, "list-cnt", &list_cnt);
+                if (op > IOS_STATS_TYPE_NONE && op < IOS_STATS_TYPE_MAX)
+                    ret = io_stats_dump_stats_to_dict(this, output, op,
+                                                      list_cnt);
+                if (op == IOS_STATS_TYPE_READ_THROUGHPUT ||
+                    op == IOS_STATS_TYPE_WRITE_THROUGHPUT) {
+                    /* Echo throughput/time back when both were supplied. */
+                    ret = dict_get_double(dict, "throughput", &throughput);
+                    if (!ret) {
+                        ret = dict_get_double(dict, "time", &time);
+                        if (ret)
+                            goto out;
+                        ret = dict_set_double(output, "throughput", throughput);
+                        if (ret)
+                            goto out;
+                        ret = dict_set_double(output, "time", time);
+                        if (ret)
+                            goto out;
+                    }
+                    ret = 0;
+                }
+            } else {
+                /* Missing or out-of-range info-op falls back to "all". */
+                ret = dict_get_int32(dict, "info-op", &op);
+                if (ret || op < GF_IOS_INFO_ALL || GF_IOS_INFO_CLEAR < op)
+                    op = GF_IOS_INFO_ALL;
+
+                ret = dict_set_int32(output, "info-op", op);
+                if (ret) {
+                    gf_log(this->name, GF_LOG_ERROR,
+                           "Failed to set info-op in dict");
+                    goto out;
+                }
+
+                if (GF_IOS_INFO_CLEAR == op) {
+                    io_stats_clear(this->private);
+
+                    ret = dict_set_int32(output, "stats-cleared", 1);
+                    if (ret)
+                        gf_log(this->name, GF_LOG_ERROR,
+                               "Failed to set stats-cleared"
+                               " in dict");
+                } else {
+                    ret = dict_get_str_boolean(dict, "peek", _gf_false);
+                    if (-1 != ret)
+                        is_peek = ret;
+
+                    (void)ios_dump_args_init(&args, IOS_DUMP_TYPE_DICT, output);
+                    ret = io_stats_dump(this, &args, op, is_peek);
+                }
+            }
+            break;
+        case GF_EVENT_UPCALL:
+            up_data = (struct gf_upcall *)data;
+            ios_bump_upcall(this, GF_UPCALL);
+
+            switch (up_data->event_type) {
+                case GF_UPCALL_RECALL_LEASE:
+                    ios_bump_upcall(this, GF_UPCALL_LEASE_RECALL);
+                    break;
+                case GF_UPCALL_CACHE_INVALIDATION:
+                    up_ci = (struct gf_upcall_cache_invalidation *)
+                                up_data->data;
+                    /* One invalidation may bump several counters. */
+                    if (up_ci->flags & (UP_XATTR | UP_XATTR_RM))
+                        ios_bump_upcall(this, GF_UPCALL_CI_XATTR);
+                    if (up_ci->flags & IATT_UPDATE_FLAGS)
+                        ios_bump_upcall(this, GF_UPCALL_CI_STAT);
+                    if (up_ci->flags & UP_RENAME_FLAGS)
+                        ios_bump_upcall(this, GF_UPCALL_CI_RENAME);
+                    if (up_ci->flags & UP_FORGET)
+                        ios_bump_upcall(this, GF_UPCALL_CI_FORGET);
+                    if (up_ci->flags & UP_NLINK)
+                        ios_bump_upcall(this, GF_UPCALL_CI_NLINK);
+                    break;
+                default:
+                    gf_msg_debug(this->name, 0,
+                                 "Unknown upcall event "
+                                 "type :%d",
+                                 up_data->event_type);
+                    break;
+            }
+
+            default_notify(this, event, data);
+            break;
+        default:
+            default_notify(this, event, data);
+            break;
+    }
+out:
+    return ret;
+}
+
+/* Statedump operations: only the private-state dump (io_priv) is provided. */
+struct xlator_dumpops dumpops = {.priv = io_priv};
+
+/*
+ * File-operation table of the translator: each listed fop is routed to the
+ * io_stats_* function of the same name.
+ */
+struct xlator_fops fops = {
+    .stat = io_stats_stat,
+    .readlink = io_stats_readlink,
+    .mknod = io_stats_mknod,
+    .mkdir = io_stats_mkdir,
+    .unlink = io_stats_unlink,
+    .rmdir = io_stats_rmdir,
+    .symlink = io_stats_symlink,
+    .rename = io_stats_rename,
+    .link = io_stats_link,
+    .truncate = io_stats_truncate,
+    .open = io_stats_open,
+    .readv = io_stats_readv,
+    .writev = io_stats_writev,
+    .statfs = io_stats_statfs,
+    .flush = io_stats_flush,
+    .fsync = io_stats_fsync,
+    .setxattr = io_stats_setxattr,
+    .getxattr = io_stats_getxattr,
+    .removexattr = io_stats_removexattr,
+    .fsetxattr = io_stats_fsetxattr,
+    .fgetxattr = io_stats_fgetxattr,
+    .fremovexattr = io_stats_fremovexattr,
+    .opendir = io_stats_opendir,
+    .readdir = io_stats_readdir,
+    .readdirp = io_stats_readdirp,
+    .fsyncdir = io_stats_fsyncdir,
+    .access = io_stats_access,
+    .ftruncate = io_stats_ftruncate,
+    .fstat = io_stats_fstat,
+    .create = io_stats_create,
+    .lk = io_stats_lk,
+    .inodelk = io_stats_inodelk,
+    .finodelk = io_stats_finodelk,
+    .entrylk = io_stats_entrylk,
+    .fentrylk = io_stats_fentrylk,
+    .lookup = io_stats_lookup,
+    .xattrop = io_stats_xattrop,
+    .fxattrop = io_stats_fxattrop,
+    .setattr = io_stats_setattr,
+    .fsetattr = io_stats_fsetattr,
+    .fallocate = io_stats_fallocate,
+    .discard = io_stats_discard,
+    .zerofill = io_stats_zerofill,
+    .ipc = io_stats_ipc,
+    .rchecksum = io_stats_rchecksum,
+    .seek = io_stats_seek,
+    .lease = io_stats_lease,
+    .getactivelk = io_stats_getactivelk,
+    .setactivelk = io_stats_setactivelk,
+    .compound = io_stats_compound,
+    .copy_file_range = io_stats_copy_file_range,
+};
+
+struct xlator_cbks cbks = { /* release, releasedir and forget are hooked */
+    .release = io_stats_release,
+    .releasedir = io_stats_releasedir,
+    .forget = io_stats_forget,
+};
+
+struct volume_options options[] = {
+    {.key = {"dump-fd-stats"},
+     .op_version = {1},
+     .flags = OPT_FLAG_SETTABLE | OPT_FLAG_DOC,
+     .tags = {"io-stats"},
+     .type = GF_OPTION_TYPE_BOOL,
+     .default_value = "off",
+     .description = "If on stats related to file-operations would be "
+                    "tracked inside GlusterFS data-structures."},
+    {.key = {"ios-dump-interval"},
+     .type = GF_OPTION_TYPE_INT,
+     .op_version = {1},
+     .flags = OPT_FLAG_SETTABLE | OPT_FLAG_DOC,
+     .tags = {"io-stats"},
+     .min = 0,
+     .max = 3600,
+     .default_value = "0",
+     .description = "Interval (in seconds) at which to auto-dump "
+                    "statistics. Zero disables automatic dumping."},
+    {.key = {"ios-sample-interval"},
+     .type = GF_OPTION_TYPE_INT,
+     .op_version = {1},
+     .flags = OPT_FLAG_SETTABLE | OPT_FLAG_DOC,
+     .tags = {"io-stats"},
+     .min = 0,
+     .max = 65535,
+     .default_value = "0",
+     .description = "Interval in which we want to collect FOP latency "
+                    "samples.  2 means collect a sample every 2nd FOP."},
+    {.key = {"ios-dump-format"},
+     .type = GF_OPTION_TYPE_STR,
+     .op_version = {GD_OP_VERSION_3_12_0},
+     .flags = OPT_FLAG_SETTABLE | OPT_FLAG_DOC,
+     .tags = {"io-stats"},
+     .default_value = "json",
+     .description = " The dump-format option specifies the format in which"
+                    " to dump the statistics. Select between \"text\", "
+                    "\"json\", \"dict\" and \"samples\". Default is "
+                    "\"json\".",
+     .value = {"text", "json", "dict", "samples"}},
+    {.key = {"ios-sample-buf-size"},
+     .type = GF_OPTION_TYPE_INT,
+     .op_version = {1},
+     .flags = OPT_FLAG_SETTABLE | OPT_FLAG_DOC,
+     .tags = {"io-stats"},
+     .min = 1024,
+     .max = 1024 * 1024,
+     .default_value = "65535",
+     .description = "The maximum size of our FOP sampling ring buffer."},
+    {.key = {"ios-dnscache-ttl-sec"},
+     .type = GF_OPTION_TYPE_INT,
+     .op_version = {1},
+     .flags = OPT_FLAG_SETTABLE | OPT_FLAG_DOC,
+     .tags = {"io-stats"},
+     .min = 1,
+     .max = 3600 * 72, /* 72 hours expressed in seconds */
+     .default_value = "86400", /* 24 * 3600, matches "24 hrs" below */
+     .description = "The interval after which a cached DNS entry will be "
+                    "re-validated.  Default: 24 hrs"},
+    {.key = {"latency-measurement"},
+     .type = GF_OPTION_TYPE_BOOL,
+     .op_version = {1},
+     .flags = OPT_FLAG_SETTABLE | OPT_FLAG_DOC,
+     .tags = {"io-stats"},
+     .default_value = "off",
+     .description = "If on stats related to the latency of each operation "
+                    "would be tracked inside GlusterFS data-structures. "},
+    {
+        .key = {"count-fop-hits"},
+        .type = GF_OPTION_TYPE_BOOL,
+        .op_version = {1},
+        .flags = OPT_FLAG_SETTABLE,
+        .tags = {"io-stats"},
+    },
+    {.key = {"log-level"},
+     .type = GF_OPTION_TYPE_STR,
+     .value = {"DEBUG", "WARNING", "ERROR", "INFO", "CRITICAL", "NONE",
+               "TRACE"}},
+
+    /* These are synthetic entries to assist validation of CLI's  *
+     *  volume set  command                                       */
+    {.key = {"client-log-level"},
+     .type = GF_OPTION_TYPE_STR,
+     .op_version = {1},
+     .flags = OPT_FLAG_SETTABLE | OPT_FLAG_CLIENT_OPT | OPT_FLAG_DOC,
+     .tags = {"io-stats"},
+     .default_value = "INFO",
+     .description = "Changes the log-level of the clients",
+     .value = {"DEBUG", "WARNING", "ERROR", "INFO", "CRITICAL", "NONE",
+               "TRACE"}},
+    {.key = {"sys-log-level"},
+     .type = GF_OPTION_TYPE_STR,
+     .default_value = "CRITICAL",
+     .description = "Gluster's syslog log-level",
+     .value = {"WARNING", "ERROR", "INFO", "CRITICAL"}},
+    {.key = {"brick-log-level"},
+     .type = GF_OPTION_TYPE_STR,
+     .op_version = {1},
+     .flags = OPT_FLAG_SETTABLE | OPT_FLAG_DOC,
+     .tags = {"io-stats"},
+     .default_value = "INFO",
+     .description = "Changes the log-level of the bricks",
+     .value = {"DEBUG", "WARNING", "ERROR", "INFO", "CRITICAL", "NONE",
+               "TRACE"}},
+    {.key = {"logger"},
+     .type = GF_OPTION_TYPE_STR,
+     .value = {GF_LOGGER_GLUSTER_LOG, GF_LOGGER_SYSLOG}},
+    {.key = {"client-logger"},
+     .type = GF_OPTION_TYPE_STR,
+     .op_version = {GD_OP_VERSION_3_6_0},
+     .flags = OPT_FLAG_SETTABLE | OPT_FLAG_CLIENT_OPT | OPT_FLAG_DOC,
+     .tags = {"io-stats"},
+     .default_value = GF_LOGGER_GLUSTER_LOG,
+     .description = "Changes the logging sub-system to log to, for the "
+                    "clients",
+     .value = {GF_LOGGER_GLUSTER_LOG, GF_LOGGER_SYSLOG}},
+    {.key = {"brick-logger"},
+     .type = GF_OPTION_TYPE_STR,
+     .op_version = {GD_OP_VERSION_3_6_0},
+     .flags = OPT_FLAG_SETTABLE | OPT_FLAG_DOC,
+     .tags = {"io-stats"},
+     .default_value = GF_LOGGER_GLUSTER_LOG,
+     .description = "Changes the logging sub-system to log to, for the "
+                    "bricks",
+     .value = {GF_LOGGER_GLUSTER_LOG, GF_LOGGER_SYSLOG}},
+    {.key = {"log-format"},
+     .type = GF_OPTION_TYPE_STR,
+     .value = {GF_LOG_FORMAT_NO_MSG_ID, GF_LOG_FORMAT_WITH_MSG_ID}},
+    {.key = {"client-log-format"},
+     .type = GF_OPTION_TYPE_STR,
+     .op_version = {GD_OP_VERSION_3_6_0},
+     .flags = OPT_FLAG_SETTABLE | OPT_FLAG_CLIENT_OPT | OPT_FLAG_DOC,
+     .tags = {"io-stats"},
+     .default_value = GF_LOG_FORMAT_WITH_MSG_ID,
+     .description = "Changes log format for the clients",
+     .value = {GF_LOG_FORMAT_NO_MSG_ID, GF_LOG_FORMAT_WITH_MSG_ID}},
+    {.key = {"brick-log-format"},
+     .type = GF_OPTION_TYPE_STR,
+     .op_version = {GD_OP_VERSION_3_6_0},
+     .flags = OPT_FLAG_SETTABLE | OPT_FLAG_DOC,
+     .tags = {"io-stats"},
+     .default_value = GF_LOG_FORMAT_WITH_MSG_ID,
+     .description = "Changes the log format for the bricks",
+     .value = {GF_LOG_FORMAT_NO_MSG_ID, GF_LOG_FORMAT_WITH_MSG_ID}},
+    {
+        .key = {"log-buf-size"},
+        .type = GF_OPTION_TYPE_INT,
+        .min = GF_LOG_LRU_BUFSIZE_MIN,
+        .max = GF_LOG_LRU_BUFSIZE_MAX,
+        .default_value = "5",
+    },
+    {.key = {"client-log-buf-size"},
+     .type = GF_OPTION_TYPE_INT,
+     .op_version = {GD_OP_VERSION_3_6_0},
+     .flags = OPT_FLAG_SETTABLE | OPT_FLAG_CLIENT_OPT | OPT_FLAG_DOC,
+     .tags = {"io-stats"},
+     .min = GF_LOG_LRU_BUFSIZE_MIN,
+     .max = GF_LOG_LRU_BUFSIZE_MAX,
+     .default_value = "5",
+     .description = "This option determines the maximum number of unique "
+                    "log messages that can be buffered for a time equal to"
+                    " the value of the option client-log-flush-timeout."},
+    {.key = {"brick-log-buf-size"},
+     .type = GF_OPTION_TYPE_INT,
+     .op_version = {GD_OP_VERSION_3_6_0},
+     .flags = OPT_FLAG_SETTABLE | OPT_FLAG_DOC,
+     .tags = {"io-stats"},
+     .min = GF_LOG_LRU_BUFSIZE_MIN,
+     .max = GF_LOG_LRU_BUFSIZE_MAX,
+     .default_value = "5",
+     .description = "This option determines the maximum number of unique "
+                    "log messages that can be buffered for a time equal to"
+                    " the value of the option brick-log-flush-timeout."},
+    {
+        .key = {"log-flush-timeout"},
+        .type = GF_OPTION_TYPE_TIME,
+        .min = GF_LOG_FLUSH_TIMEOUT_MIN,
+        .max = GF_LOG_FLUSH_TIMEOUT_MAX,
+        .default_value = "120",
+    },
+    {.key = {"client-log-flush-timeout"},
+     .type = GF_OPTION_TYPE_TIME,
+     .op_version = {GD_OP_VERSION_3_6_0},
+     .flags = OPT_FLAG_SETTABLE | OPT_FLAG_CLIENT_OPT | OPT_FLAG_DOC,
+     .tags = {"io-stats"},
+     .min = GF_LOG_FLUSH_TIMEOUT_MIN,
+     .max = GF_LOG_FLUSH_TIMEOUT_MAX,
+     .default_value = "120",
+     .description = "Length of time for which log messages are buffered, "
+                    "before being flushed to the logging infrastructure "
+                    "(gluster or syslog files) on the clients."},
+    {.key = {"brick-log-flush-timeout"},
+     .type = GF_OPTION_TYPE_TIME,
+     .op_version = {GD_OP_VERSION_3_6_0},
+     .flags = OPT_FLAG_SETTABLE | OPT_FLAG_DOC,
+     .tags = {"io-stats"},
+     .min = GF_LOG_FLUSH_TIMEOUT_MIN,
+     .max = GF_LOG_FLUSH_TIMEOUT_MAX,
+     .default_value = "120",
+     .description = "Length of time for which log messages are buffered, "
+                    "before being flushed to the logging infrastructure "
+                    "(gluster or syslog files) on the bricks."},
+    {.key = {"unique-id"},
+     .type = GF_OPTION_TYPE_STR,
+     .default_value = "/no/such/path",
+     .description = "Unique ID for our files."},
+    {.key = {"global-threading"},
+     .type = GF_OPTION_TYPE_BOOL,
+     .default_value = "off",
+     .op_version = {GD_OP_VERSION_6_0},
+     .flags = OPT_FLAG_SETTABLE,
+     .tags = {"io-stats", "threading"},
+     .description = "This option enables the global threading support for "
+                    "bricks. If enabled, it's recommended to also enable "
+                    "'performance.iot-pass-through'"},
+    {.key = {"threads"}, .type = GF_OPTION_TYPE_INT},
+    {.key = {"brick-threads"},
+     .type = GF_OPTION_TYPE_INT,
+     .default_value = "16",
+     .min = 0,
+     .max = GF_ASYNC_MAX_THREADS,
+     .op_version = {GD_OP_VERSION_6_0},
+     .flags = OPT_FLAG_SETTABLE | OPT_FLAG_DOC,
+     .tags = {"io-stats", "threading"},
+     .description = "When global threading is used, this value determines the "
+                    "maximum amount of threads that can be created on bricks"},
+    {.key = {"client-threads"},
+     .type = GF_OPTION_TYPE_INT,
+     .default_value = "16",
+     .min = 0,
+     .max = GF_ASYNC_MAX_THREADS,
+     .op_version = {GD_OP_VERSION_6_0},
+     .flags = OPT_FLAG_SETTABLE | OPT_FLAG_DOC | OPT_FLAG_CLIENT_OPT,
+     .tags = {"io-stats", "threading"},
+     .description = "When global threading is used, this value determines the "
+                    "maximum amount of threads that can be created on clients"},
+    {.key = {"volume-id"},
+     .type = GF_OPTION_TYPE_STR,
+     .op_version = {GD_OP_VERSION_7_1},
+     .tags = {"global", "volume-id"},
+     .description =
+         "This option points to the 'unique' UUID particular to this "
+         "volume, which would be set in 'graph->volume_id'"},
+    {.key = {NULL}},
+};
+
+xlator_api_t xlator_api = { /* entry points and tables exported as "io-stats" */
+    .init = init,
+    .fini = fini,
+    .notify = notify,
+    .reconfigure = reconfigure,
+    .mem_acct_init = mem_acct_init,
+    .op_version = {1}, /* Present from the initial version */
+    .dumpops = &dumpops,
+    .fops = &fops,
+    .cbks = &cbks,
+    .options = options,
+    .identifier = "io-stats",
+    .category = GF_MAINTAINED,
+};
diff --git a/xlators/features/path-convertor/Makefile.am b/xlators/debug/sink/Makefile.am
index d471a3f9243..f2689244371 100644
--- a/xlators/features/path-convertor/Makefile.am
+++ b/xlators/debug/sink/Makefile.am
@@ -1,3 +1,2 @@
 SUBDIRS = src
 
-CLEANFILES = 
diff --git a/xlators/debug/sink/src/Makefile.am b/xlators/debug/sink/src/Makefile.am
new file mode 100644
index 00000000000..f952c2ce6bc
--- /dev/null
+++ b/xlators/debug/sink/src/Makefile.am
@@ -0,0 +1,14 @@
+xlator_LTLIBRARIES = sink.la
+xlatordir = $(libdir)/glusterfs/$(PACKAGE_VERSION)/xlator/debug
+
+AM_CPPFLAGS = $(GF_CPPFLAGS) -I$(top_srcdir)/libglusterfs/src \
+        -I$(top_builddir)/rpc/xdr/src
+AM_CFLAGS = -Wall $(GF_CFLAGS)
+
+sink_la_LDFLAGS = -module $(GF_XLATOR_DEFAULT_LDFLAGS)
+
+sink_la_SOURCES = sink.c
+sink_la_LIBADD = $(top_builddir)/libglusterfs/src/libglusterfs.la
+
+CLEANFILES =
+
diff --git a/xlators/debug/sink/src/sink.c b/xlators/debug/sink/src/sink.c
new file mode 100644
index 00000000000..9822bbb732e
--- /dev/null
+++ b/xlators/debug/sink/src/sink.c
@@ -0,0 +1,94 @@
+/*
+   Copyright (c) 2017 Red Hat, Inc. <http://www.redhat.com>
+   This file is part of GlusterFS.
+
+   This file is licensed to you under your choice of the GNU Lesser
+   General Public License, version 3 or any later version (LGPLv3 or
+   later), or the GNU General Public License, version 2 (GPLv2), in all
+   cases as published by the Free Software Foundation.
+*/
+
+#include <glusterfs/xlator.h>
+#include <glusterfs/defaults.h>
+
+int32_t
+init(xlator_t *this)
+{
+    return 0; /* nothing to set up; always reports success */
+}
+
+void
+fini(xlator_t *this)
+{
+    return; /* nothing to tear down */
+}
+
+/*
+ * notify - answer PARENT_UP / PARENT_DOWN with CHILD_UP / CHILD_DOWN
+ */
+int32_t
+notify(xlator_t *this, int32_t event, void *data, ...)
+{
+    switch (event) {
+        case GF_EVENT_PARENT_UP:
+            /* Tell the parent that this xlator is up */
+            default_notify(this, GF_EVENT_CHILD_UP, data);
+            break;
+        case GF_EVENT_PARENT_DOWN:
+            /* Tell the parent that this xlator is down */
+            default_notify(this, GF_EVENT_CHILD_DOWN, data);
+            break;
+        default:
+            break; /* every other event is dropped without forwarding */
+    }
+
+    return 0;
+}
+
+/*
+ * A lookup on "/" is done while mounting or when glfs_init() is performed.
+ * This needs to return a valid directory for the root of the mountpoint.
+ *
+ * In case this xlator is used for more advanced debugging, it will need to be
+ * extended to support different LOOKUPs too.
+ */
+static int32_t
+sink_lookup(call_frame_t *frame, xlator_t *this, loc_t *loc, dict_t *xdata)
+{
+    struct iatt stbuf = {
+        0,
+    };
+    struct iatt postparent = {
+        0,
+    };
+
+    /* the root of the volume always needs to be a directory */
+    stbuf.ia_type = IA_IFDIR;
+    /* every lookup succeeds (op_ret 0, op_errno 0); loc may be NULL */
+    STACK_UNWIND_STRICT(lookup, frame, 0, 0, loc ? loc->inode : NULL, &stbuf,
+                        xdata, &postparent);
+
+    return 0;
+}
+
+struct xlator_fops fops = {
+    .lookup = sink_lookup, /* the only fop set in this table */
+};
+
+struct xlator_cbks cbks = {}; /* no callbacks are set */
+
+struct volume_options options[] = {
+    {.key = {NULL}}, /* only the NULL-key entry: no options are defined */
+};
+
+xlator_api_t xlator_api = { /* entry points and tables exported as "sink" */
+    .init = init,
+    .fini = fini,
+    .notify = notify,
+    .op_version = {GD_OP_VERSION_3_12_0},
+    .fops = &fops,
+    .cbks = &cbks,
+    .options = options,
+    .identifier = "sink",
+    .category = GF_TECH_PREVIEW,
+};
diff --git a/xlators/debug/trace/src/Makefile.am b/xlators/debug/trace/src/Makefile.am
index 0f1679a049d..a37ea63af04 100644
--- a/xlators/debug/trace/src/Makefile.am
+++ b/xlators/debug/trace/src/Makefile.am
@@ -2,13 +2,16 @@
 xlator_LTLIBRARIES = trace.la
 xlatordir = $(libdir)/glusterfs/$(PACKAGE_VERSION)/xlator/debug
 
-trace_la_LDFLAGS = -module -avoidversion
+trace_la_LDFLAGS = -module $(GF_XLATOR_DEFAULT_LDFLAGS)
 
 trace_la_SOURCES = trace.c
 trace_la_LIBADD = $(top_builddir)/libglusterfs/src/libglusterfs.la
 
-AM_CFLAGS = -fPIC -D_FILE_OFFSET_BITS=64 -D_GNU_SOURCE -Wall -D$(GF_HOST_OS)\
-	-I$(top_srcdir)/libglusterfs/src -shared -nostartfiles $(GF_CFLAGS)
+noinst_HEADERS = trace.h trace-mem-types.h
+AM_CPPFLAGS = $(GF_CPPFLAGS) -I$(top_srcdir)/libglusterfs/src \
+	-I$(top_srcdir)/rpc/xdr/src -I$(top_builddir)/rpc/xdr/src
+
+AM_CFLAGS = -Wall $(GF_CFLAGS)
 
 CLEANFILES = 
 
diff --git a/xlators/debug/trace/src/trace-mem-types.h b/xlators/debug/trace/src/trace-mem-types.h
new file mode 100644
index 00000000000..18a7e0414a6
--- /dev/null
+++ b/xlators/debug/trace/src/trace-mem-types.h
@@ -0,0 +1,20 @@
+/*
+   Copyright (c) 2006-2012 Red Hat, Inc. <http://www.redhat.com>
+   This file is part of GlusterFS.
+
+   This file is licensed to you under your choice of the GNU Lesser
+   General Public License, version 3 or any later version (LGPLv3 or
+   later), or the GNU General Public License, version 2 (GPLv2), in all
+   cases as published by the Free Software Foundation.
+*/
+
+#ifndef __TRACE_MEM_TYPES_H__
+#define __TRACE_MEM_TYPES_H__
+
+#include <glusterfs/mem-types.h>
+
+enum gf_trace_mem_types_ { /* ids start right after gf_common_mt_end */
+    gf_trace_mt_trace_conf_t = gf_common_mt_end + 1,
+    gf_trace_mt_end /* one past the last trace id */
+};
+#endif
diff --git a/xlators/debug/trace/src/trace.c b/xlators/debug/trace/src/trace.c
index d3039934201..6ed0ca00342 100644
--- a/xlators/debug/trace/src/trace.c
+++ b/xlators/debug/trace/src/trace.c
@@ -1,2321 +1,3534 @@
 /*
-  Copyright (c) 2006-2009 Z RESEARCH, Inc. <http://www.zresearch.com>
+  Copyright (c) 2006-2012 Red Hat, Inc. <http://www.redhat.com>
   This file is part of GlusterFS.
 
-  GlusterFS is free software; you can redistribute it and/or modify
-  it under the terms of the GNU General Public License as published
-  by the Free Software Foundation; either version 3 of the License,
-  or (at your option) any later version.
-
-  GlusterFS is distributed in the hope that it will be useful, but
-  WITHOUT ANY WARRANTY; without even the implied warranty of
-  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
-  General Public License for more details.
-
-  You should have received a copy of the GNU General Public License
-  along with this program.  If not, see
-  <http://www.gnu.org/licenses/>.
+  This file is licensed to you under your choice of the GNU Lesser
+  General Public License, version 3 or any later version (LGPLv3 or
+  later), or the GNU General Public License, version 2 (GPLv2), in all
+  cases as published by the Free Software Foundation.
 */
 
-#ifndef _CONFIG_H
-#define _CONFIG_H
-#include "config.h"
-#endif
+#include "trace.h"
+#include "trace-mem-types.h"
 
 /**
  * xlators/debug/trace :
- *    This translator logs all the arguments to the fops/mops and also 
- *    their _cbk functions, which later passes the call to next layer. 
+ *    This translator logs all the arguments to the fops/mops and also
+ *    their _cbk functions, which later passes the call to next layer.
  *    Very helpful translator for debugging.
  */
+#define TRACE_STAT_TO_STR(buf, str) trace_stat_to_str(buf, str, sizeof(str))
+/* str must be a char array, not a pointer: the macro passes sizeof(str) */
+static void
+trace_stat_to_str(struct iatt *buf, char *str, size_t len)
+{
+    char atime_buf[GF_TIMESTR_SIZE] = {
+        0,
+    };
+    char mtime_buf[GF_TIMESTR_SIZE] = {
+        0,
+    };
+    char ctime_buf[GF_TIMESTR_SIZE] = {
+        0,
+    };
+    /* str is left untouched when there is no iatt to describe */
+    if (!buf)
+        return;
+
+    gf_time_fmt(atime_buf, sizeof atime_buf, buf->ia_atime, gf_timefmt_dirent);
+
+    gf_time_fmt(mtime_buf, sizeof mtime_buf, buf->ia_mtime, gf_timefmt_dirent);
+
+    gf_time_fmt(ctime_buf, sizeof ctime_buf, buf->ia_ctime, gf_timefmt_dirent);
+    /* each timestamp is emitted both formatted and as raw sec/nsec values */
+    snprintf(str, len,
+             "gfid=%s ino=%" PRIu64
+             ", mode=%o, "
+             "nlink=%" GF_PRI_NLINK ", uid=%u, gid=%u, size=%" PRIu64
+             ", "
+             "blocks=%" PRIu64
+             ", atime=%s mtime=%s ctime=%s "
+             "atime_sec=%" PRId64 ", atime_nsec=%" PRIu32
+             ","
+             " mtime_sec=%" PRId64 ", mtime_nsec=%" PRIu32
+             ", "
+             "ctime_sec=%" PRId64 ", ctime_nsec=%" PRIu32 "",
+             uuid_utoa(buf->ia_gfid), buf->ia_ino,
+             st_mode_from_ia(buf->ia_prot, buf->ia_type), buf->ia_nlink,
+             buf->ia_uid, buf->ia_gid, buf->ia_size, buf->ia_blocks, atime_buf,
+             mtime_buf, ctime_buf, buf->ia_atime, buf->ia_atime_nsec,
+             buf->ia_mtime, buf->ia_mtime_nsec, buf->ia_ctime,
+             buf->ia_ctime_nsec);
+}
+
+int
+dump_history_trace(circular_buffer_t *cb, void *data)
+{
+    char timestr[GF_TIMESTR_SIZE] = {
+        0,
+    };
+
+    /* Entries are added to the buffer even when gettimeofday () fails, so
+       cb->tv is formatted unconditionally (no validity check is done here)
+       and dumped as the time at which the entry was added to the buffer */
+
+    gf_time_fmt_tv(timestr, sizeof timestr, &cb->tv, gf_timefmt_Ymd_T);
+    gf_proc_dump_write("TIME", "%s", timestr);
+    /* cb->data is written as a C string; the 'data' argument is unused */
+    gf_proc_dump_write("FOP", "%s\n", (char *)cb->data);
+
+    return 0;
+}
+
+int
+trace_create_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+                 int32_t op_ret, int32_t op_errno, fd_t *fd, inode_t *inode,
+                 struct iatt *buf, struct iatt *preparent,
+                 struct iatt *postparent, dict_t *xdata)
+{
+    char statstr[1024] = {
+        0,
+    };
+    char preparentstr[1024] = {
+        0,
+    };
+    char postparentstr[1024] = {
+        0,
+    };
+    trace_conf_t *conf = NULL;
+    /* Log the CREATE reply when enabled, then hand all args to the unwind */
+    conf = this->private;
+    /* skip all formatting when both log_file and log_history are off */
+    if (!conf->log_file && !conf->log_history)
+        goto out;
+    if (trace_fop_names[GF_FOP_CREATE].enabled) {
+        char string[4096] = {
+            0,
+        };
+        if (op_ret >= 0) {
+            TRACE_STAT_TO_STR(buf, statstr);
+            TRACE_STAT_TO_STR(preparent, preparentstr);
+            TRACE_STAT_TO_STR(postparent, postparentstr);
+            /* ", " after fd=%p keeps the pointer separable from *stbuf */
+            snprintf(string, sizeof(string),
+                     "%" PRId64
+                     ": gfid=%s (op_ret=%d, fd=%p, "
+                     "*stbuf {%s}, *preparent {%s}, "
+                     "*postparent = {%s})",
+                     frame->root->unique, uuid_utoa(inode->gfid), op_ret, fd,
+                     statstr, preparentstr, postparentstr);
+
+            /* for 'release' log */
+            fd_ctx_set(fd, this, 0);
+        } else {
+            snprintf(string, sizeof(string),
+                     "%" PRId64 ": (op_ret=%d, op_errno=%d)",
+                     frame->root->unique, op_ret, op_errno);
+        }
+        LOG_ELEMENT(conf, string);
+    }
+out:
+    TRACE_STACK_UNWIND(create, frame, op_ret, op_errno, fd, inode, buf,
+                       preparent, postparent, xdata);
+    return 0;
+}
+
+int
+trace_open_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+               int32_t op_ret, int32_t op_errno, fd_t *fd, dict_t *xdata)
+{
+    trace_conf_t *conf = NULL;
+    /* Log the OPEN reply when enabled; frame->local is printed as the gfid */
+    conf = this->private;
+
+    if (!conf->log_file && !conf->log_history)
+        goto out;
+    if (trace_fop_names[GF_FOP_OPEN].enabled) {
+        char string[4096] = {
+            0,
+        };
+        snprintf(string, sizeof(string),
+                 "%" PRId64
+                 ": gfid=%s op_ret=%d, op_errno=%d, "
+                 "*fd=%p",
+                 frame->root->unique, uuid_utoa(frame->local), op_ret, op_errno,
+                 fd);
+
+        LOG_ELEMENT(conf, string);
+    }
+    /* 'out' is reached on every path, so the fd ctx is set even if not logged */
+out:
+    /* for 'release' log */
+    if (op_ret >= 0)
+        fd_ctx_set(fd, this, 0);
+
+    TRACE_STACK_UNWIND(open, frame, op_ret, op_errno, fd, xdata);
+    return 0;
+}
+
+int
+trace_stat_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+               int32_t op_ret, int32_t op_errno, struct iatt *buf,
+               dict_t *xdata)
+{
+    char statstr[1024] = {
+        0,
+    };
+    trace_conf_t *conf = NULL;
+    /* Log the STAT reply: the iatt when op_ret == 0, op_errno otherwise */
+    conf = this->private;
+    /* NOTE(review): the error format below ends with an unmatched ")" */
+    if (!conf->log_file && !conf->log_history)
+        goto out;
+    if (trace_fop_names[GF_FOP_STAT].enabled) {
+        char string[4096] = {
+            0,
+        };
+        if (op_ret == 0) {
+            TRACE_STAT_TO_STR(buf, statstr);
+            (void)snprintf(
+                string, sizeof(string), "%" PRId64 ": gfid=%s op_ret=%d buf=%s",
+                frame->root->unique, uuid_utoa(frame->local), op_ret, statstr);
+        } else {
+            (void)snprintf(string, sizeof(string),
+                           "%" PRId64
+                           ": gfid=%s op_ret=%d, "
+                           "op_errno=%d)",
+                           frame->root->unique, uuid_utoa(frame->local), op_ret,
+                           op_errno);
+        }
+        LOG_ELEMENT(conf, string);
+    }
+out:
+    TRACE_STACK_UNWIND(stat, frame, op_ret, op_errno, buf, xdata);
+    return 0;
+}
+
+int
+trace_readv_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+                int32_t op_ret, int32_t op_errno, struct iovec *vector,
+                int32_t count, struct iatt *buf, struct iobref *iobref,
+                dict_t *xdata)
+{
+    char statstr[1024] = {
+        0,
+    };
+    trace_conf_t *conf = NULL;
+    /* Log the READ reply: the iatt when op_ret >= 0, op_errno otherwise */
+    conf = this->private;
+    /* only buf is logged; vector, count and iobref are just passed through */
+    if (!conf->log_file && !conf->log_history)
+        goto out;
+    if (trace_fop_names[GF_FOP_READ].enabled) {
+        char string[4096] = {
+            0,
+        };
+        if (op_ret >= 0) {
+            TRACE_STAT_TO_STR(buf, statstr);
+            snprintf(
+                string, sizeof(string), "%" PRId64 ": gfid=%s op_ret=%d buf=%s",
+                frame->root->unique, uuid_utoa(frame->local), op_ret, statstr);
+        } else {
+            snprintf(string, sizeof(string),
+                     "%" PRId64
+                     ": gfid=%s op_ret=%d, "
+                     "op_errno=%d)",
+                     frame->root->unique, uuid_utoa(frame->local), op_ret,
+                     op_errno);
+        }
+        LOG_ELEMENT(conf, string);
+    }
+out:
+    TRACE_STACK_UNWIND(readv, frame, op_ret, op_errno, vector, count, buf,
+                       iobref, xdata);
+    return 0;
+}
+
+int
+trace_writev_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+                 int32_t op_ret, int32_t op_errno, struct iatt *prebuf,
+                 struct iatt *postbuf, dict_t *xdata)
+{
+    char preopstr[1024] = {
+        0,
+    };
+    char postopstr[1024] = {
+        0,
+    };
+    trace_conf_t *conf = NULL;
+    /* Log the WRITE reply: pre/post iatt when op_ret >= 0, else op_errno */
+    conf = this->private;
+
+    if (!conf->log_file && !conf->log_history)
+        goto out;
+    if (trace_fop_names[GF_FOP_WRITE].enabled) {
+        char string[4096] = {
+            0,
+        };
+        if (op_ret >= 0) {
+            TRACE_STAT_TO_STR(prebuf, preopstr);
+            TRACE_STAT_TO_STR(postbuf, postopstr);
+            /* the success line carries no gfid; only the error line does */
+            snprintf(string, sizeof(string),
+                     "%" PRId64
+                     ": (op_ret=%d, "
+                     "*prebuf = {%s}, *postbuf = {%s})",
+                     frame->root->unique, op_ret, preopstr, postopstr);
+        } else {
+            snprintf(string, sizeof(string),
+                     "%" PRId64
+                     ": gfid=%s op_ret=%d, "
+                     "op_errno=%d",
+                     frame->root->unique, uuid_utoa(frame->local), op_ret,
+                     op_errno);
+        }
+        LOG_ELEMENT(conf, string);
+    }
+out:
+    TRACE_STACK_UNWIND(writev, frame, op_ret, op_errno, prebuf, postbuf, xdata);
+    return 0;
+}
+
+int
+trace_readdir_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+                  int32_t op_ret, int32_t op_errno, gf_dirent_t *buf,
+                  dict_t *xdata)
+{
+    trace_conf_t *conf = NULL;
+    /* Log op_ret/op_errno of the READDIR reply; entries are not dumped */
+    conf = this->private;
+
+    if (!conf->log_file && !conf->log_history)
+        goto out;
+    if (trace_fop_names[GF_FOP_READDIR].enabled) {
+        char string[4096] = {
+            0,
+        };
+        snprintf(string, sizeof(string),
+                 "%" PRId64 " : gfid=%s op_ret=%d, op_errno=%d",
+                 frame->root->unique, uuid_utoa(frame->local), op_ret,
+                 op_errno);
+
+        LOG_ELEMENT(conf, string);
+    }
+out:
+    TRACE_STACK_UNWIND(readdir, frame, op_ret, op_errno, buf, xdata);
+
+    return 0;
+}
+
+int
+trace_readdirp_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+                   int32_t op_ret, int32_t op_errno, gf_dirent_t *buf,
+                   dict_t *xdata)
+{
+    int count = 0;
+    char statstr[1024] = {
+        0,
+    };
+    char string[4096] = {
+        0,
+    };
+    trace_conf_t *conf = NULL;
+    gf_dirent_t *entry = NULL;
+    /* Log the READDIRP reply: one summary line, then one line per entry */
+    conf = this->private;
+
+    if (!conf->log_file && !conf->log_history)
+        goto out;
+    if (trace_fop_names[GF_FOP_READDIRP].enabled) {
+        snprintf(string, sizeof(string),
+                 "%" PRId64 " : gfid=%s op_ret=%d, op_errno=%d",
+                 frame->root->unique, uuid_utoa(frame->local), op_ret,
+                 op_errno);
+
+        LOG_ELEMENT(conf, string);
+    }
+    if (op_ret < 0 || !trace_fop_names[GF_FOP_READDIRP].enabled)
+        goto out;
+    /* per-entry dump: only for successful replies with READDIRP enabled */
+    list_for_each_entry(entry, &buf->list, list)
+    {
+        count++;
+        TRACE_STAT_TO_STR(&entry->d_stat, statstr);
+        snprintf(string, sizeof(string),
+                 "entry no. %d, pargfid=%s, "
+                 "bname=%s *buf {%s}",
+                 count, uuid_utoa(frame->local), entry->d_name, statstr);
+        LOG_ELEMENT(conf, string);
+    }
+
+out:
+    TRACE_STACK_UNWIND(readdirp, frame, op_ret, op_errno, buf, xdata);
+    return 0;
+}
+
+int
+trace_fsync_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+                int32_t op_ret, int32_t op_errno, struct iatt *prebuf,
+                struct iatt *postbuf, dict_t *xdata)
+{
+    char preopstr[1024] = {
+        0,
+    };
+    char postopstr[1024] = {
+        0,
+    };
+    trace_conf_t *conf = NULL;
+    /* Log the FSYNC reply: pre/post iatt when op_ret == 0, else op_errno */
+    conf = this->private;
+
+    if (!conf->log_file && !conf->log_history)
+        goto out;
+    if (trace_fop_names[GF_FOP_FSYNC].enabled) {
+        char string[4096] = {
+            0,
+        };
+        if (op_ret == 0) {
+            TRACE_STAT_TO_STR(prebuf, preopstr);
+            TRACE_STAT_TO_STR(postbuf, postopstr);
+            /* NOTE(review): this format opens "(" but never closes it */
+            snprintf(string, sizeof(string),
+                     "%" PRId64
+                     ": (op_ret=%d, "
+                     "*prebuf = {%s}, *postbuf = {%s}",
+                     frame->root->unique, op_ret, preopstr, postopstr);
+        } else {
+            snprintf(string, sizeof(string),
+                     "%" PRId64
+                     ": gfid=%s op_ret=%d, "
+                     "op_errno=%d",
+                     frame->root->unique, uuid_utoa(frame->local), op_ret,
+                     op_errno);
+        }
+        LOG_ELEMENT(conf, string);
+    }
+out:
+    TRACE_STACK_UNWIND(fsync, frame, op_ret, op_errno, prebuf, postbuf, xdata);
+
+    return 0;
+}
+
+int
+trace_setattr_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+                  int32_t op_ret, int32_t op_errno, struct iatt *statpre,
+                  struct iatt *statpost, dict_t *xdata)
+{
+    char preopstr[1024] = {
+        0,
+    };
+    char postopstr[1024] = {
+        0,
+    };
+    trace_conf_t *conf = NULL;
+    /* Log the SETATTR reply: pre/post iatt when op_ret == 0, else op_errno */
+    conf = this->private;
+
+    if (!conf->log_file && !conf->log_history)
+        goto out;
+    if (trace_fop_names[GF_FOP_SETATTR].enabled) {
+        char string[4096] = {
+            0,
+        };
+        if (op_ret == 0) {
+            TRACE_STAT_TO_STR(statpre, preopstr);
+            TRACE_STAT_TO_STR(statpost, postopstr);
+            /* the success line carries no gfid; only the error line does */
+            snprintf(string, sizeof(string),
+                     "%" PRId64
+                     ": (op_ret=%d, "
+                     "*prebuf = {%s}, *postbuf = {%s})",
+                     frame->root->unique, op_ret, preopstr, postopstr);
+        } else {
+            snprintf(string, sizeof(string),
+                     "%" PRId64
+                     ": gfid=%s op_ret=%d, "
+                     "op_errno=%d)",
+                     frame->root->unique, uuid_utoa(frame->local), op_ret,
+                     op_errno);
+        }
+        LOG_ELEMENT(conf, string);
+    }
+out:
+    TRACE_STACK_UNWIND(setattr, frame, op_ret, op_errno, statpre, statpost,
+                       xdata);
+    return 0;
+}
+
+/*
+ * fsetattr reply path: when tracing is active for this fop, format one
+ * record (pre/post attributes on success, gfid and errno otherwise), hand
+ * it to LOG_ELEMENT, then invoke TRACE_STACK_UNWIND.
+ */
+int
+trace_fsetattr_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+                   int32_t op_ret, int32_t op_errno, struct iatt *statpre,
+                   struct iatt *statpost, dict_t *xdata)
+{
+    trace_conf_t *conf = this->private;
+    char before[1024] = {0};
+    char after[1024] = {0};
+
+    /* Format a record only when logging is on and this fop is traced. */
+    if ((conf->log_file || conf->log_history) &&
+        trace_fop_names[GF_FOP_FSETATTR].enabled) {
+        char record[4096] = {0};
+
+        if (op_ret != 0) {
+            /* Failure: frame->local is printed as the gfid. */
+            snprintf(record, sizeof(record),
+                     "%" PRId64 ": gfid=%s op_ret=%d, op_errno=%d)",
+                     frame->root->unique, uuid_utoa(frame->local), op_ret,
+                     op_errno);
+        } else {
+            TRACE_STAT_TO_STR(statpre, before);
+            TRACE_STAT_TO_STR(statpost, after);
+
+            snprintf(record, sizeof(record),
+                     "%" PRId64
+                     ": (op_ret=%d, "
+                     "*prebuf = {%s}, *postbuf = {%s})",
+                     frame->root->unique, op_ret, before, after);
+        }
+        LOG_ELEMENT(conf, record);
+    }
+
+    /* The reply is passed on whether or not it was logged. */
+    TRACE_STACK_UNWIND(fsetattr, frame, op_ret, op_errno, statpre, statpost,
+                       xdata);
+    return 0;
+}
+
+/*
+ * unlink reply path: when tracing is active for this fop, log the
+ * preparent/postparent attributes on success or the errno on failure
+ * (both with the gfid from frame->local), then invoke TRACE_STACK_UNWIND.
+ */
+int
+trace_unlink_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+                 int32_t op_ret, int32_t op_errno, struct iatt *preparent,
+                 struct iatt *postparent, dict_t *xdata)
+{
+    trace_conf_t *conf = this->private;
+    char parent_before[1024] = {0};
+    char parent_after[1024] = {0};
+
+    /* Format a record only when logging is on and this fop is traced. */
+    if ((conf->log_file || conf->log_history) &&
+        trace_fop_names[GF_FOP_UNLINK].enabled) {
+        char record[4096] = {0};
+
+        if (op_ret != 0) {
+            snprintf(record, sizeof(record),
+                     "%" PRId64
+                     ": gfid=%s op_ret=%d, "
+                     "op_errno=%d)",
+                     frame->root->unique, uuid_utoa(frame->local), op_ret,
+                     op_errno);
+        } else {
+            /* Both parent snapshots are rendered to text first. */
+            TRACE_STAT_TO_STR(preparent, parent_before);
+            TRACE_STAT_TO_STR(postparent, parent_after);
+
+            snprintf(record, sizeof(record),
+                     "%" PRId64
+                     ": gfid=%s op_ret=%d, "
+                     " *preparent = {%s}, "
+                     "*postparent = {%s})",
+                     frame->root->unique, uuid_utoa(frame->local), op_ret,
+                     parent_before, parent_after);
+        }
+        LOG_ELEMENT(conf, record);
+    }
+
+    /* The reply is passed on whether or not it was logged. */
+    TRACE_STACK_UNWIND(unlink, frame, op_ret, op_errno, preparent, postparent,
+                       xdata);
+    return 0;
+}
+
+/*
+ * rename reply path. When tracing is active for this fop, format one record:
+ * on success it carries buf (printed as *stbuf) followed by the pre/post
+ * attributes of the old parent and of the new parent; on failure it carries
+ * the gfid taken from frame->local plus op_ret/op_errno. The reply is then
+ * handed to TRACE_STACK_UNWIND with its arguments unmodified.
+ */
+int
+trace_rename_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+                 int32_t op_ret, int32_t op_errno, struct iatt *buf,
+                 struct iatt *preoldparent, struct iatt *postoldparent,
+                 struct iatt *prenewparent, struct iatt *postnewparent,
+                 dict_t *xdata)
+{
+    trace_conf_t *conf = this->private;
+
+    char entry[1024] = {0};
+    char old_before[1024] = {0};
+    char old_after[1024] = {0};
+    char new_before[1024] = {0};
+    char new_after[1024] = {0};
+
+    /* Format a record only when logging is on and this fop is traced. */
+    if ((conf->log_file || conf->log_history) &&
+        trace_fop_names[GF_FOP_RENAME].enabled) {
+        /* Sized above the 4096 used elsewhere: five stat strings go in. */
+        char record[6044] = {0};
+
+        if (op_ret != 0) {
+            /* Failure: frame->local is printed as the gfid. */
+            snprintf(record, sizeof(record),
+                     "%" PRId64
+                     ": gfid=%s op_ret=%d, "
+                     "op_errno=%d",
+                     frame->root->unique, uuid_utoa(frame->local), op_ret,
+                     op_errno);
+        } else {
+            /* Render every iatt to text first; the record is assembled */
+            /* from those strings in the order the format string names. */
+            TRACE_STAT_TO_STR(buf, entry);
+            TRACE_STAT_TO_STR(preoldparent, old_before);
+            TRACE_STAT_TO_STR(postoldparent, old_after);
+            TRACE_STAT_TO_STR(prenewparent, new_before);
+            TRACE_STAT_TO_STR(postnewparent, new_after);
+
+            snprintf(record, sizeof(record),
+                     "%" PRId64
+                     ": (op_ret=%d, "
+                     "*stbuf = {%s}, *preoldparent = {%s},"
+                     " *postoldparent = {%s}"
+                     " *prenewparent = {%s}, "
+                     "*postnewparent = {%s})",
+                     frame->root->unique, op_ret, entry, old_before, old_after,
+                     new_before, new_after);
+        }
+        LOG_ELEMENT(conf, record);
+    }
+
+    /* The reply is passed on whether or not it was logged. */
+    TRACE_STACK_UNWIND(rename, frame, op_ret, op_errno, buf, preoldparent,
+                       postoldparent, prenewparent, postnewparent, xdata);
+    return 0;
+}
+
+/*
+ * readlink reply path. A successful readlink reports the length of the link
+ * target in op_ret (as readlink(2) does), so success is op_ret >= 0, not
+ * op_ret == 0; testing for zero would send nearly every successful reply
+ * down the error branch and drop the target and stat from the trace.
+ */
+int
+trace_readlink_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+                   int32_t op_ret, int32_t op_errno, const char *buf,
+                   struct iatt *stbuf, dict_t *xdata)
+{
+    char statstr[1024] = {0};
+    trace_conf_t *conf = this->private;
+
+    if (!conf->log_file && !conf->log_history)
+        goto out;
+    if (trace_fop_names[GF_FOP_READLINK].enabled) {
+        char string[4096] = {0};
+
+        if (op_ret >= 0) {
+            TRACE_STAT_TO_STR(stbuf, statstr);
+            snprintf(string, sizeof(string),
+                     "%" PRId64
+                     ": (op_ret=%d, op_errno=%d,"
+                     "buf=%s, stbuf = { %s })",
+                     frame->root->unique, op_ret, op_errno, buf, statstr);
+        } else {
+            snprintf(string, sizeof(string),
+                     "%" PRId64
+                     ": gfid=%s op_ret=%d, "
+                     "op_errno=%d",
+                     frame->root->unique, uuid_utoa(frame->local), op_ret,
+                     op_errno);
+        }
+        LOG_ELEMENT(conf, string);
+    }
+out:
+    TRACE_STACK_UNWIND(readlink, frame, op_ret, op_errno, buf, stbuf, xdata);
+    return 0;
+}
+
+/*
+ * lookup reply path: when tracing is active for this fop, log buf and
+ * postparent on success, or the gfid from frame->local and the errno on
+ * failure, then invoke TRACE_STACK_UNWIND with the reply arguments.
+ */
+int
+trace_lookup_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+                 int32_t op_ret, int32_t op_errno, inode_t *inode,
+                 struct iatt *buf, dict_t *xdata, struct iatt *postparent)
+{
+    trace_conf_t *conf = this->private;
+    char entry[1024] = {0};
+    char parent_after[1024] = {0};
+
+    /* Format a record only when logging is on and this fop is traced. */
+    if ((conf->log_file || conf->log_history) &&
+        trace_fop_names[GF_FOP_LOOKUP].enabled) {
+        char record[4096] = {0};
+
+        if (op_ret != 0) {
+            snprintf(record, sizeof(record),
+                     "%" PRId64
+                     ": gfid=%s op_ret=%d, "
+                     "op_errno=%d)",
+                     frame->root->unique, uuid_utoa(frame->local), op_ret,
+                     op_errno);
+        } else {
+            TRACE_STAT_TO_STR(buf, entry);
+            TRACE_STAT_TO_STR(postparent, parent_after);
+
+            /*
+             * The gfid comes from buf->ia_gfid rather than inode->gfid: on
+             * a fresh lookup the inode is not linked into the inode table
+             * yet, so inode->gfid would print as a null gfid.
+             */
+            snprintf(record, sizeof(record),
+                     "%" PRId64
+                     ": gfid=%s (op_ret=%d "
+                     "*buf {%s}, *postparent {%s}",
+                     frame->root->unique, uuid_utoa(buf->ia_gfid), op_ret,
+                     entry, parent_after);
+
+            /* For 'forget' */
+            inode_ctx_put(inode, this, 0);
+        }
+        LOG_ELEMENT(conf, record);
+    }
+
+    /* The reply is passed on whether or not it was logged. */
+    TRACE_STACK_UNWIND(lookup, frame, op_ret, op_errno, inode, buf, xdata,
+                       postparent);
+    return 0;
+}
+
+/*
+ * symlink reply path: logs the new entry and parent attributes on success,
+ * or op_ret/op_errno on failure, then invokes TRACE_STACK_UNWIND.
+ */
+int
+trace_symlink_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+                  int32_t op_ret, int32_t op_errno, inode_t *inode,
+                  struct iatt *buf, struct iatt *preparent,
+                  struct iatt *postparent, dict_t *xdata)
+{
+    char statstr[1024] = {0};
+    char preparentstr[1024] = {0};
+    char postparentstr[1024] = {0};
+    trace_conf_t *conf = this->private;
+
+    if (!conf->log_file && !conf->log_history)
+        goto out;
+    if (trace_fop_names[GF_FOP_SYMLINK].enabled) {
+        char string[4096] = {0};
+
+        if (op_ret == 0) {
+            TRACE_STAT_TO_STR(buf, statstr);
+            TRACE_STAT_TO_STR(preparent, preparentstr);
+            TRACE_STAT_TO_STR(postparent, postparentstr);
+
+            /*
+             * Print buf->ia_gfid, not inode->gfid: the inode of a newly
+             * created entry may not be linked into the inode table yet,
+             * in which case inode->gfid would be logged as a null gfid.
+             */
+            snprintf(string, sizeof(string),
+                     "%" PRId64
+                     ": gfid=%s (op_ret=%d "
+                     "*stbuf = {%s}, *preparent = {%s}, "
+                     "*postparent = {%s})",
+                     frame->root->unique, uuid_utoa(buf->ia_gfid), op_ret,
+                     statstr, preparentstr, postparentstr);
+        } else {
+            snprintf(string, sizeof(string),
+                     "%" PRId64 ": op_ret=%d, op_errno=%d", frame->root->unique,
+                     op_ret, op_errno);
+        }
+        LOG_ELEMENT(conf, string);
+    }
+out:
+    TRACE_STACK_UNWIND(symlink, frame, op_ret, op_errno, inode, buf, preparent,
+                       postparent, xdata);
+    return 0;
+}
+
+/*
+ * mknod reply path: logs the new entry and parent attributes on success, or
+ * op_ret/op_errno on failure, then invokes TRACE_STACK_UNWIND.
+ */
+int
+trace_mknod_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+                int32_t op_ret, int32_t op_errno, inode_t *inode,
+                struct iatt *buf, struct iatt *preparent,
+                struct iatt *postparent, dict_t *xdata)
+{
+    char statstr[1024] = {0};
+    char preparentstr[1024] = {0};
+    char postparentstr[1024] = {0};
+    trace_conf_t *conf = this->private;
+
+    if (!conf->log_file && !conf->log_history)
+        goto out;
+    if (trace_fop_names[GF_FOP_MKNOD].enabled) {
+        char string[4096] = {0};
+
+        if (op_ret == 0) {
+            TRACE_STAT_TO_STR(buf, statstr);
+            TRACE_STAT_TO_STR(preparent, preparentstr);
+            TRACE_STAT_TO_STR(postparent, postparentstr);
+
+            /*
+             * Print buf->ia_gfid, not inode->gfid: the inode of a newly
+             * created entry may not be linked into the inode table yet,
+             * in which case inode->gfid would be logged as a null gfid.
+             */
+            snprintf(string, sizeof(string),
+                     "%" PRId64
+                     ": gfid=%s (op_ret=%d "
+                     "*stbuf = {%s}, *preparent = {%s}, "
+                     "*postparent = {%s})",
+                     frame->root->unique, uuid_utoa(buf->ia_gfid), op_ret,
+                     statstr, preparentstr, postparentstr);
+        } else {
+            snprintf(string, sizeof(string),
+                     "%" PRId64 ": (op_ret=%d, op_errno=%d)",
+                     frame->root->unique, op_ret, op_errno);
+        }
+        LOG_ELEMENT(conf, string);
+    }
+out:
+    TRACE_STACK_UNWIND(mknod, frame, op_ret, op_errno, inode, buf, preparent,
+                       postparent, xdata);
+    return 0;
+}
+
+/*
+ * mkdir reply path: logs the new entry and parent attributes on success, or
+ * op_ret/op_errno on failure, then invokes TRACE_STACK_UNWIND.
+ */
+int
+trace_mkdir_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+                int32_t op_ret, int32_t op_errno, inode_t *inode,
+                struct iatt *buf, struct iatt *preparent,
+                struct iatt *postparent, dict_t *xdata)
+{
+    char statstr[1024] = {0};
+    char preparentstr[1024] = {0};
+    char postparentstr[1024] = {0};
+    trace_conf_t *conf = this->private;
+
+    if (!conf->log_file && !conf->log_history)
+        goto out;
+    if (trace_fop_names[GF_FOP_MKDIR].enabled) {
+        char string[4096] = {0};
+
+        if (op_ret == 0) {
+            TRACE_STAT_TO_STR(buf, statstr);
+            TRACE_STAT_TO_STR(preparent, preparentstr);
+            TRACE_STAT_TO_STR(postparent, postparentstr);
+
+            /*
+             * Print buf->ia_gfid, not inode->gfid: the inode of a newly
+             * created entry may not be linked into the inode table yet,
+             * in which case inode->gfid would be logged as a null gfid.
+             */
+            snprintf(string, sizeof(string),
+                     "%" PRId64
+                     ": gfid=%s (op_ret=%d "
+                     ", *stbuf = {%s}, *prebuf = {%s}, "
+                     "*postbuf = {%s} )",
+                     frame->root->unique, uuid_utoa(buf->ia_gfid), op_ret,
+                     statstr, preparentstr, postparentstr);
+        } else {
+            snprintf(string, sizeof(string),
+                     "%" PRId64 ": (op_ret=%d, op_errno=%d)",
+                     frame->root->unique, op_ret, op_errno);
+        }
+        LOG_ELEMENT(conf, string);
+    }
+out:
+    TRACE_STACK_UNWIND(mkdir, frame, op_ret, op_errno, inode, buf, preparent,
+                       postparent, xdata);
+    return 0;
+}
+
+/*
+ * link reply path: when tracing is active for this fop, log buf and the
+ * preparent/postparent attributes on success, or the gfid from frame->local
+ * and the errno on failure, then invoke TRACE_STACK_UNWIND.
+ */
+int
+trace_link_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+               int32_t op_ret, int32_t op_errno, inode_t *inode,
+               struct iatt *buf, struct iatt *preparent,
+               struct iatt *postparent, dict_t *xdata)
+{
+    char statstr[1024] = {0};
+    char preparentstr[1024] = {0};
+    char postparentstr[1024] = {0};
+    trace_conf_t *conf = this->private;
+
+    if (!conf->log_file && !conf->log_history)
+        goto out;
+    if (trace_fop_names[GF_FOP_LINK].enabled) {
+        /*
+         * Declared inside the traced branch, as the sibling callbacks do,
+         * so that the 'goto out' above never jumps over its initializer.
+         */
+        char string[4096] = {0};
+
+        if (op_ret == 0) {
+            TRACE_STAT_TO_STR(buf, statstr);
+            TRACE_STAT_TO_STR(preparent, preparentstr);
+            TRACE_STAT_TO_STR(postparent, postparentstr);
+
+            snprintf(string, sizeof(string),
+                     "%" PRId64
+                     ": (op_ret=%d, "
+                     "*stbuf = {%s},  *prebuf = {%s},"
+                     " *postbuf = {%s})",
+                     frame->root->unique, op_ret, statstr, preparentstr,
+                     postparentstr);
+        } else {
+            snprintf(string, sizeof(string),
+                     "%" PRId64
+                     ": gfid=%s op_ret=%d, "
+                     "op_errno=%d",
+                     frame->root->unique, uuid_utoa(frame->local), op_ret,
+                     op_errno);
+        }
+        LOG_ELEMENT(conf, string);
+    }
+out:
+    TRACE_STACK_UNWIND(link, frame, op_ret, op_errno, inode, buf, preparent,
+                       postparent, xdata);
+    return 0;
+}
+
+/*
+ * flush reply path: logs gfid/op_ret/op_errno when traced, then unwinds.
+ */
+int
+trace_flush_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+                int32_t op_ret, int32_t op_errno, dict_t *xdata)
+{
+    trace_conf_t *conf = this->private;
+
+    if (!conf->log_file && !conf->log_history)
+        goto out;
+    if (trace_fop_names[GF_FOP_FLUSH].enabled) {
+        /* Kept inside the branch so 'goto out' never skips its init. */
+        char string[4096] = {0};
+
+        snprintf(string, sizeof(string),
+                 "%" PRId64 ": gfid=%s op_ret=%d, op_errno=%d",
+                 frame->root->unique, uuid_utoa(frame->local), op_ret,
+                 op_errno);
+        LOG_ELEMENT(conf, string);
+    }
+out:
+    TRACE_STACK_UNWIND(flush, frame, op_ret, op_errno, xdata);
+    return 0;
+}
+
+/*
+ * opendir reply path: logs the result and the fd pointer when tracing is on,
+ * calls fd_ctx_set() on a successfully opened fd, then unwinds.
+ */
+int
+trace_opendir_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+                  int32_t op_ret, int32_t op_errno, fd_t *fd, dict_t *xdata)
+{
+    trace_conf_t *conf = this->private;
+
+    if (!conf->log_file && !conf->log_history)
+        goto out;
+    if (trace_fop_names[GF_FOP_OPENDIR].enabled) {
+        /* Kept inside the branch so 'goto out' never skips its init. */
+        char string[4096] = {0};
+        snprintf(string, sizeof(string),
+                 "%" PRId64
+                 ": gfid=%s op_ret=%d, op_errno=%d,"
+                 " fd=%p",
+                 frame->root->unique, uuid_utoa(frame->local), op_ret, op_errno,
+                 fd);
+        LOG_ELEMENT(conf, string);
+    }
+out:
+    /* for 'releasedir' log; runs even when nothing was traced above */
+    if (op_ret >= 0)
+        fd_ctx_set(fd, this, 0);
+
+    TRACE_STACK_UNWIND(opendir, frame, op_ret, op_errno, fd, xdata);
+    return 0;
+}
+
+/*
+ * rmdir reply path: when tracing is active for this fop, log the
+ * preparent/postparent attributes or the errno, then unwind.
+ */
+int
+trace_rmdir_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+                int32_t op_ret, int32_t op_errno, struct iatt *preparent,
+                struct iatt *postparent, dict_t *xdata)
+{
+    trace_conf_t *conf = this->private;
+    char parent_before[1024] = {0};
+    char parent_after[1024] = {0};
+
+    /* Format a record only when logging is on and this fop is traced. */
+    if ((conf->log_file || conf->log_history) &&
+        trace_fop_names[GF_FOP_RMDIR].enabled) {
+        char record[4096] = {0};
+
+        if (op_ret != 0) {
+            /* Failure: gfid from frame->local plus the errno. */
+            snprintf(record, sizeof(record),
+                     "%" PRId64
+                     ": gfid=%s op_ret=%d, "
+                     "op_errno=%d",
+                     frame->root->unique, uuid_utoa(frame->local), op_ret,
+                     op_errno);
+        } else {
+            /* Success: both parent snapshots are rendered to text first. */
+            TRACE_STAT_TO_STR(preparent, parent_before);
+            TRACE_STAT_TO_STR(postparent, parent_after);
+
+            snprintf(record, sizeof(record),
+                     "%" PRId64
+                     ": gfid=%s op_ret=%d, "
+                     "*prebuf={%s},  *postbuf={%s}",
+                     frame->root->unique, uuid_utoa(frame->local), op_ret,
+                     parent_before, parent_after);
+        }
+        LOG_ELEMENT(conf, record);
+    }
+
+    /* The reply is passed on whether or not it was logged. */
+    TRACE_STACK_UNWIND(rmdir, frame, op_ret, op_errno, preparent, postparent,
+                       xdata);
+    return 0;
+}
+
+/*
+ * truncate reply path: when tracing is active for this fop, format one
+ * record (pre/post attributes on success, gfid and errno otherwise), hand
+ * it to LOG_ELEMENT, then invoke TRACE_STACK_UNWIND.
+ */
+int
+trace_truncate_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+                   int32_t op_ret, int32_t op_errno, struct iatt *prebuf,
+                   struct iatt *postbuf, dict_t *xdata)
+{
+    trace_conf_t *conf = this->private;
+    char before[1024] = {0};
+    char after[1024] = {0};
+
+    /* Format a record only when logging is on and this fop is traced. */
+    if ((conf->log_file || conf->log_history) &&
+        trace_fop_names[GF_FOP_TRUNCATE].enabled) {
+        char record[4096] = {0};
+
+        if (op_ret != 0) {
+            /* Failure: frame->local is printed as the gfid. */
+            snprintf(record, sizeof(record),
+                     "%" PRId64
+                     ": gfid=%s op_ret=%d, "
+                     "op_errno=%d",
+                     frame->root->unique, uuid_utoa(frame->local), op_ret,
+                     op_errno);
+        } else {
+            TRACE_STAT_TO_STR(prebuf, before);
+            TRACE_STAT_TO_STR(postbuf, after);
+
+            snprintf(record, sizeof(record),
+                     "%" PRId64
+                     ": (op_ret=%d, "
+                     "*prebuf = {%s}, *postbuf = {%s} )",
+                     frame->root->unique, op_ret, before, after);
+        }
+        LOG_ELEMENT(conf, record);
+    }
+
+    /* The reply is passed on whether or not it was logged. */
+    TRACE_STACK_UNWIND(truncate, frame, op_ret, op_errno, prebuf, postbuf,
+                       xdata);
+    return 0;
+}
+
+/*
+ * statfs reply: log statvfs fields (or the errno) when traced, then unwind.
+ */
+int
+trace_statfs_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+                 int32_t op_ret, int32_t op_errno, struct statvfs *buf,
+                 dict_t *xdata)
+{
+    trace_conf_t *conf = this->private;
+
+    /* Format a record only when logging is on and this fop is traced. */
+    if ((conf->log_file || conf->log_history) &&
+        trace_fop_names[GF_FOP_STATFS].enabled) {
+        char record[4096] = {0};
+
+        if (op_ret != 0) {
+            snprintf(record, sizeof(record),
+                     "%" PRId64
+                     ": (op_ret=%d, "
+                     "op_errno=%d)",
+                     frame->root->unique, op_ret, op_errno);
+        } else {
+            snprintf(record, sizeof(record),
+                     "%" PRId64
+                     ": ({f_bsize=%lu, "
+                     "f_frsize=%lu, "
+                     "f_blocks=%" GF_PRI_FSBLK ", f_bfree=%" GF_PRI_FSBLK
+                     ", "
+                     "f_bavail=%" GF_PRI_FSBLK
+                     ", "
+                     "f_files=%" GF_PRI_FSBLK
+                     ", "
+                     "f_ffree=%" GF_PRI_FSBLK
+                     ", "
+                     "f_favail=%" GF_PRI_FSBLK
+                     ", "
+                     "f_fsid=%lu, f_flag=%lu, "
+                     "f_namemax=%lu}) => ret=%d",
+                     frame->root->unique, buf->f_bsize, buf->f_frsize,
+                     buf->f_blocks, buf->f_bfree, buf->f_bavail, buf->f_files,
+                     buf->f_ffree, buf->f_favail, buf->f_fsid, buf->f_flag,
+                     buf->f_namemax, op_ret);
+        }
+        LOG_ELEMENT(conf, record);
+    }
+
+    TRACE_STACK_UNWIND(statfs, frame, op_ret, op_errno, buf, xdata);
+    return 0;
+}
+
+/*
+ * setxattr reply: log gfid/op_ret/op_errno when traced, then unwind.
+ */
+int
+trace_setxattr_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+                   int32_t op_ret, int32_t op_errno, dict_t *xdata)
+{
+    trace_conf_t *conf = this->private;
+
+    /* Format a record only when logging is on and this fop is traced. */
+    if ((conf->log_file || conf->log_history) &&
+        trace_fop_names[GF_FOP_SETXATTR].enabled) {
+        char record[4096] = {0};
+
+        snprintf(record, sizeof(record),
+                 "%" PRId64 ": gfid=%s op_ret=%d, op_errno=%d",
+                 frame->root->unique, uuid_utoa(frame->local), op_ret,
+                 op_errno);
+        LOG_ELEMENT(conf, record);
+    }
+
+    /* The reply is passed on whether or not it was logged. */
+    TRACE_STACK_UNWIND(setxattr, frame, op_ret, op_errno, xdata);
+    return 0;
+}
+
+/*
+ * getxattr reply: log gfid/op_ret/op_errno and the dict pointer when
+ * traced, then unwind.
+ */
+int
+trace_getxattr_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+                   int32_t op_ret, int32_t op_errno, dict_t *dict,
+                   dict_t *xdata)
+{
+    trace_conf_t *conf = this->private;
+
+    /* Format a record only when logging is on and this fop is traced. */
+    if ((conf->log_file || conf->log_history) &&
+        trace_fop_names[GF_FOP_GETXATTR].enabled) {
+        char record[4096] = {0};
+
+        snprintf(record, sizeof(record),
+                 "%" PRId64
+                 ": gfid=%s op_ret=%d, op_errno=%d,"
+                 " dict=%p",
+                 frame->root->unique, uuid_utoa(frame->local), op_ret, op_errno,
+                 dict);
+        LOG_ELEMENT(conf, record);
+    }
+
+    /* The reply is passed on whether or not it was logged. */
+    TRACE_STACK_UNWIND(getxattr, frame, op_ret, op_errno, dict, xdata);
+    return 0;
+}
+
+/*
+ * fsetxattr reply: log gfid/op_ret/op_errno when traced, then unwind.
+ */
+int
+trace_fsetxattr_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+                    int32_t op_ret, int32_t op_errno, dict_t *xdata)
+{
+    trace_conf_t *conf = this->private;
+
+    /* Format a record only when logging is on and this fop is traced. */
+    if ((conf->log_file || conf->log_history) &&
+        trace_fop_names[GF_FOP_FSETXATTR].enabled) {
+        char record[4096] = {0};
+
+        snprintf(record, sizeof(record),
+                 "%" PRId64 ": gfid=%s op_ret=%d, op_errno=%d",
+                 frame->root->unique, uuid_utoa(frame->local), op_ret,
+                 op_errno);
+        LOG_ELEMENT(conf, record);
+    }
+
+    /* The reply is passed on whether or not it was logged. */
+    TRACE_STACK_UNWIND(fsetxattr, frame, op_ret, op_errno, xdata);
+    return 0;
+}
+
+/*
+ * fgetxattr reply: log gfid/op_ret/op_errno and the dict pointer when
+ * traced, then unwind.
+ */
+int
+trace_fgetxattr_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+                    int32_t op_ret, int32_t op_errno, dict_t *dict,
+                    dict_t *xdata)
+{
+    trace_conf_t *conf = this->private;
+
+    /* Format a record only when logging is on and this fop is traced. */
+    if ((conf->log_file || conf->log_history) &&
+        trace_fop_names[GF_FOP_FGETXATTR].enabled) {
+        char record[4096] = {0};
+
+        snprintf(record, sizeof(record),
+                 "%" PRId64
+                 ": gfid=%s op_ret=%d, op_errno=%d,"
+                 " dict=%p",
+                 frame->root->unique, uuid_utoa(frame->local), op_ret, op_errno,
+                 dict);
+        LOG_ELEMENT(conf, record);
+    }
+
+    /* The reply is passed on whether or not it was logged. */
+    TRACE_STACK_UNWIND(fgetxattr, frame, op_ret, op_errno, dict, xdata);
+    return 0;
+}
+
+/*
+ * removexattr reply: log gfid/op_ret/op_errno when traced, then hand the
+ * reply to TRACE_STACK_UNWIND.
+ */
+int
+trace_removexattr_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+                      int32_t op_ret, int32_t op_errno, dict_t *xdata)
+{
+    trace_conf_t *conf = this->private;
+
+    /* Format a record only when logging is on and this fop is traced. */
+    if ((conf->log_file || conf->log_history) &&
+        trace_fop_names[GF_FOP_REMOVEXATTR].enabled) {
+        char record[4096] = {0};
+
+        snprintf(record, sizeof(record),
+                 "%" PRId64 ": gfid=%s op_ret=%d, op_errno=%d",
+                 frame->root->unique, uuid_utoa(frame->local), op_ret,
+                 op_errno);
+        LOG_ELEMENT(conf, record);
+    }
+
+    /* The reply is passed on whether or not it was logged. */
+    TRACE_STACK_UNWIND(removexattr, frame, op_ret, op_errno, xdata);
+    return 0;
+}
+
+/*
+ * fsyncdir reply: log gfid/op_ret/op_errno when traced, then unwind.
+ */
+int
+trace_fsyncdir_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+                   int32_t op_ret, int32_t op_errno, dict_t *xdata)
+{
+    trace_conf_t *conf = this->private;
+
+    /* Format a record only when logging is on and this fop is traced. */
+    if ((conf->log_file || conf->log_history) &&
+        trace_fop_names[GF_FOP_FSYNCDIR].enabled) {
+        char record[4096] = {0};
+
+        snprintf(record, sizeof(record),
+                 "%" PRId64 ": gfid=%s op_ret=%d, op_errno=%d",
+                 frame->root->unique, uuid_utoa(frame->local), op_ret,
+                 op_errno);
+        LOG_ELEMENT(conf, record);
+    }
+
+    /* The reply is passed on whether or not it was logged. */
+    TRACE_STACK_UNWIND(fsyncdir, frame, op_ret, op_errno, xdata);
+    return 0;
+}
+
+/*
+ * access reply: log gfid/op_ret/op_errno when traced, then unwind.
+ */
+int
+trace_access_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+                 int32_t op_ret, int32_t op_errno, dict_t *xdata)
+{
+    trace_conf_t *conf = this->private;
+
+    /* Format a record only when logging is on and this fop is traced. */
+    if ((conf->log_file || conf->log_history) &&
+        trace_fop_names[GF_FOP_ACCESS].enabled) {
+        char record[4096] = {0};
+
+        snprintf(record, sizeof(record),
+                 "%" PRId64
+                 ": gfid=%s op_ret=%d, "
+                 "op_errno=%d)",
+                 frame->root->unique, uuid_utoa(frame->local), op_ret,
+                 op_errno);
+        LOG_ELEMENT(conf, record);
+    }
+
+    /* The reply is passed on whether or not it was logged. */
+    TRACE_STACK_UNWIND(access, frame, op_ret, op_errno, xdata);
+    return 0;
+}
+
+/*
+ * ftruncate reply path: when tracing is active for this fop, format one
+ * record (pre/post attributes on success, gfid and errno otherwise), hand
+ * it to LOG_ELEMENT, then invoke TRACE_STACK_UNWIND.
+ */
+int
+trace_ftruncate_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+                    int32_t op_ret, int32_t op_errno, struct iatt *prebuf,
+                    struct iatt *postbuf, dict_t *xdata)
+{
+    trace_conf_t *conf = this->private;
+    char before[1024] = {0};
+    char after[1024] = {0};
+
+    /* Format a record only when logging is on and this fop is traced. */
+    if ((conf->log_file || conf->log_history) &&
+        trace_fop_names[GF_FOP_FTRUNCATE].enabled) {
+        char record[4096] = {0};
+
+        if (op_ret != 0) {
+            /* Failure: frame->local is printed as the gfid. */
+            snprintf(record, sizeof(record),
+                     "%" PRId64
+                     ": gfid=%s op_ret=%d, "
+                     "op_errno=%d",
+                     frame->root->unique, uuid_utoa(frame->local), op_ret,
+                     op_errno);
+        } else {
+            TRACE_STAT_TO_STR(prebuf, before);
+            TRACE_STAT_TO_STR(postbuf, after);
+
+            snprintf(record, sizeof(record),
+                     "%" PRId64
+                     ": op_ret=%d, "
+                     "*prebuf = {%s}, *postbuf = {%s} )",
+                     frame->root->unique, op_ret, before, after);
+        }
+        LOG_ELEMENT(conf, record);
+    }
+
+    /* The reply is passed on whether or not it was logged. */
+    TRACE_STACK_UNWIND(ftruncate, frame, op_ret, op_errno, prebuf, postbuf,
+                       xdata);
+    return 0;
+}
+
+/*
+ * fstat reply: log the attributes (or the errno) when traced, then unwind.
+ */
+int
+trace_fstat_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+                int32_t op_ret, int32_t op_errno, struct iatt *buf,
+                dict_t *xdata)
+{
+    char statstr[1024] = {0};
+    trace_conf_t *conf = this->private;
+
+    if (!conf->log_file && !conf->log_history)
+        goto out;
+    if (trace_fop_names[GF_FOP_FSTAT].enabled) {
+        char string[4096] = {0};
+
+        if (op_ret == 0) {
+            TRACE_STAT_TO_STR(buf, statstr);
+            snprintf(string, sizeof(string),
+                     "%" PRId64
+                     ": gfid=%s op_ret=%d "
+                     "buf=%s",
+                     frame->root->unique, uuid_utoa(frame->local), op_ret,
+                     statstr);
+        } else {
+            snprintf(string, sizeof(string),
+                     "%" PRId64
+                     ": gfid=%s op_ret=%d, "
+                     "op_errno=%d",
+                     frame->root->unique, uuid_utoa(frame->local), op_ret,
+                     op_errno);
+        }
+        LOG_ELEMENT(conf, string);
+    }
+out:
+    TRACE_STACK_UNWIND(fstat, frame, op_ret, op_errno, buf, xdata);
+    return 0;
+}
+
+/*
+ * lk reply path: when tracing is active for this fop, log the returned
+ * gf_flock fields on success or the errno on failure, then unwind.
+ */
+int
+trace_lk_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret,
+             int32_t op_errno, struct gf_flock *lock, dict_t *xdata)
+{
+    trace_conf_t *conf = this->private;
+
+    /* Format a record only when logging is on and this fop is traced. */
+    if ((conf->log_file || conf->log_history) &&
+        trace_fop_names[GF_FOP_LK].enabled) {
+        char record[4096] = {0};
+
+        if (op_ret != 0) {
+            snprintf(record, sizeof(record),
+                     "%" PRId64
+                     ": gfid=%s op_ret=%d, "
+                     "op_errno=%d)",
+                     frame->root->unique, uuid_utoa(frame->local), op_ret,
+                     op_errno);
+        } else {
+            snprintf(record, sizeof(record),
+                     "%" PRId64
+                     ": gfid=%s op_ret=%d, "
+                     "{l_type=%d, l_whence=%d, "
+                     "l_start=%" PRId64
+                     ", "
+                     "l_len=%" PRId64 ", l_pid=%u})",
+                     frame->root->unique, uuid_utoa(frame->local), op_ret,
+                     lock->l_type, lock->l_whence, lock->l_start, lock->l_len,
+                     lock->l_pid);
+        }
+        LOG_ELEMENT(conf, record);
+    }
+
+    TRACE_STACK_UNWIND(lk, frame, op_ret, op_errno, lock, xdata);
+    return 0;
+}
+
+/*
+ * entrylk reply: log gfid/op_ret/op_errno when traced, then unwind.
+ */
+int
+trace_entrylk_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+                  int32_t op_ret, int32_t op_errno, dict_t *xdata)
+{
+    trace_conf_t *conf = this->private;
+
+    /* Format a record only when logging is on and this fop is traced. */
+    if ((conf->log_file || conf->log_history) &&
+        trace_fop_names[GF_FOP_ENTRYLK].enabled) {
+        char record[4096] = {0};
+
+        snprintf(record, sizeof(record),
+                 "%" PRId64 ": gfid=%s op_ret=%d, op_errno=%d",
+                 frame->root->unique, uuid_utoa(frame->local), op_ret,
+                 op_errno);
+        LOG_ELEMENT(conf, record);
+    }
+
+    /* The reply is passed on whether or not it was logged. */
+    TRACE_STACK_UNWIND(entrylk, frame, op_ret, op_errno, xdata);
+    return 0;
+}
+
+/*
+ * fentrylk reply: log gfid/op_ret/op_errno when traced, then unwind.
+ */
+int
+trace_fentrylk_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+                   int32_t op_ret, int32_t op_errno, dict_t *xdata)
+{
+    trace_conf_t *conf = this->private;
+
+    /* Format a record only when logging is on and this fop is traced. */
+    if ((conf->log_file || conf->log_history) &&
+        trace_fop_names[GF_FOP_FENTRYLK].enabled) {
+        char record[4096] = {0};
+
+        snprintf(record, sizeof(record),
+                 "%" PRId64 ": gfid=%s op_ret=%d, op_errno=%d",
+                 frame->root->unique, uuid_utoa(frame->local), op_ret,
+                 op_errno);
+        LOG_ELEMENT(conf, record);
+    }
+
+    /* The reply is passed on whether or not it was logged. */
+    TRACE_STACK_UNWIND(fentrylk, frame, op_ret, op_errno, xdata);
+    return 0;
+}
+
+/*
+ * xattrop reply: log gfid/op_ret/op_errno when traced, then unwind.
+ */
+int
+trace_xattrop_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+                  int32_t op_ret, int32_t op_errno, dict_t *dict, dict_t *xdata)
+{
+    trace_conf_t *conf = this->private;
+
+    /* Format a record only when logging is on and this fop is traced. */
+    if ((conf->log_file || conf->log_history) &&
+        trace_fop_names[GF_FOP_XATTROP].enabled) {
+        char record[4096] = {0};
+
+        snprintf(record, sizeof(record),
+                 "%" PRId64 ": gfid=%s op_ret=%d, op_errno=%d",
+                 frame->root->unique, uuid_utoa(frame->local), op_ret,
+                 op_errno);
+        LOG_ELEMENT(conf, record);
+    }
+
+    /* The reply (dict included) is passed on whether or not it was logged. */
+    TRACE_STACK_UNWIND(xattrop, frame, op_ret, op_errno, dict, xdata);
+    return 0;
+}
+
+/* Unwind callback for fxattrop: logs the result when tracing is enabled and
+ * then hands op_ret, op_errno, dict and xdata to TRACE_STACK_UNWIND. */
+int
+trace_fxattrop_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+                   int32_t op_ret, int32_t op_errno, dict_t *dict,
+                   dict_t *xdata)
+{
+    trace_conf_t *settings = this->private;
+
+    /* Format nothing unless a log sink is on and this fop is traced. */
+    if ((settings->log_file || settings->log_history) &&
+        trace_fop_names[GF_FOP_FXATTROP].enabled) {
+        char line[4096] = {0};
+
+        /* Assumes the wind path left a gfid pointer in frame->local. */
+        snprintf(line, sizeof(line),
+                 "%" PRId64 ": gfid=%s op_ret=%d, op_errno=%d",
+                 frame->root->unique, uuid_utoa(frame->local), op_ret,
+                 op_errno);
+
+        LOG_ELEMENT(settings, line);
+    }
+
+    TRACE_STACK_UNWIND(fxattrop, frame, op_ret, op_errno, dict, xdata);
+    return 0;
+}
+
+/* Unwind callback for inodelk: logs the result when tracing is enabled and
+ * then hands op_ret, op_errno and xdata to TRACE_STACK_UNWIND. Returns 0. */
+int
+trace_inodelk_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+                  int32_t op_ret, int32_t op_errno, dict_t *xdata)
+{
+    trace_conf_t *settings = this->private;
+
+    /* Format nothing unless a log sink is on and this fop is traced. */
+    if ((settings->log_file || settings->log_history) &&
+        trace_fop_names[GF_FOP_INODELK].enabled) {
+        char line[4096] = {0};
+
+        /* frame->local is the gfid pointer stored by trace_inodelk(). */
+        snprintf(line, sizeof(line),
+                 "%" PRId64 ": gfid=%s op_ret=%d, op_errno=%d",
+                 frame->root->unique, uuid_utoa(frame->local), op_ret,
+                 op_errno);
+
+        LOG_ELEMENT(settings, line);
+    }
+
+    TRACE_STACK_UNWIND(inodelk, frame, op_ret, op_errno, xdata);
+    return 0;
+}
+
+/* Unwind callback for finodelk: logs the result when tracing is enabled and
+ * then hands op_ret, op_errno and xdata to TRACE_STACK_UNWIND. Returns 0. */
+int
+trace_finodelk_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+                   int32_t op_ret, int32_t op_errno, dict_t *xdata)
+{
+    trace_conf_t *settings = this->private;
+
+    /* Format nothing unless a log sink is on and this fop is traced. */
+    if ((settings->log_file || settings->log_history) &&
+        trace_fop_names[GF_FOP_FINODELK].enabled) {
+        char line[4096] = {0};
+
+        /* frame->local is the gfid pointer stored by trace_finodelk(). */
+        snprintf(line, sizeof(line),
+                 "%" PRId64 ": gfid=%s op_ret=%d, op_errno=%d",
+                 frame->root->unique, uuid_utoa(frame->local), op_ret,
+                 op_errno);
+
+        LOG_ELEMENT(settings, line);
+    }
+
+    TRACE_STACK_UNWIND(finodelk, frame, op_ret, op_errno, xdata);
+    return 0;
+}
+
+/* Unwind callback for rchecksum: logs op_ret/op_errno when tracing is
+ * enabled (the checksums themselves are not logged) and then hands every
+ * argument to TRACE_STACK_UNWIND. Always returns 0. */
+int
+trace_rchecksum_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+                    int32_t op_ret, int32_t op_errno, uint32_t weak_checksum,
+                    uint8_t *strong_checksum, dict_t *xdata)
+{
+    trace_conf_t *settings = this->private;
+
+    /* Format nothing unless a log sink is on and this fop is traced. */
+    if ((settings->log_file || settings->log_history) &&
+        trace_fop_names[GF_FOP_RCHECKSUM].enabled) {
+        char line[4096] = {0};
+
+        /* Assumes the wind path left a gfid pointer in frame->local. */
+        snprintf(line, sizeof(line),
+                 "%" PRId64 ": gfid=%s op_ret=%d op_errno=%d",
+                 frame->root->unique, uuid_utoa(frame->local), op_ret,
+                 op_errno);
+
+        LOG_ELEMENT(settings, line);
+    }
+
+    TRACE_STACK_UNWIND(rchecksum, frame, op_ret, op_errno, weak_checksum,
+                       strong_checksum, xdata);
+
+    return 0;
+}
+
+/* *_cbk section over <----------> fop section start */
+
+/* Wind-side tracer for entrylk: logs the request when tracing is enabled,
+ * then forwards it unchanged to the first child. Always returns 0. */
+int
+trace_entrylk(call_frame_t *frame, xlator_t *this, const char *volume,
+              loc_t *loc, const char *basename, entrylk_cmd cmd,
+              entrylk_type type, dict_t *xdata)
+{
+    trace_conf_t *conf = this->private;
+
+    if ((conf->log_file || conf->log_history) &&
+        trace_fop_names[GF_FOP_ENTRYLK].enabled) {
+        char string[4096] = {0};
+        /* Name every command: ENTRYLK_LOCK_NB must not be logged as UNLOCK. */
+        const char *cmd_str = (cmd == ENTRYLK_LOCK)      ? "ENTRYLK_LOCK"
+                              : (cmd == ENTRYLK_LOCK_NB) ? "ENTRYLK_LOCK_NB"
+                              : (cmd == ENTRYLK_UNLOCK)  ? "ENTRYLK_UNLOCK"
+                                                         : "UNKNOWN";
+
+        snprintf(string, sizeof(string),
+                 "%" PRId64
+                 ": gfid=%s volume=%s, (path=%s "
+                 "basename=%s, cmd=%s, type=%s)",
+                 frame->root->unique, uuid_utoa(loc->inode->gfid), volume,
+                 loc->path, basename, cmd_str,
+                 ((type == ENTRYLK_RDLCK) ? "ENTRYLK_RDLCK" : "ENTRYLK_WRLCK"));
+
+        frame->local = loc->inode->gfid; /* borrowed; read by the cbk */
+        LOG_ELEMENT(conf, string);
+    }
+
+    STACK_WIND(frame, trace_entrylk_cbk, FIRST_CHILD(this),
+               FIRST_CHILD(this)->fops->entrylk, volume, loc, basename, cmd,
+               type, xdata);
+    return 0;
+}
+
+/* Wind-side tracer for inodelk. When tracing is enabled it formats the
+ * lock command, lock type and the start/len/pid fields of *flock into one
+ * log line, remembers the inode's gfid for the callback, and then forwards
+ * the call unchanged to the first child. Always returns 0. */
+int
+trace_inodelk(call_frame_t *frame, xlator_t *this, const char *volume,
+              loc_t *loc, int32_t cmd, struct gf_flock *flock, dict_t *xdata)
+{
+    trace_conf_t *settings = this->private;
+
+    /* Format nothing unless a log sink is on and this fop is traced. */
+    if ((settings->log_file || settings->log_history) &&
+        trace_fop_names[GF_FOP_INODELK].enabled) {
+        /* Both labels stay "UNKNOWN" for values not matched below. */
+        const char *op_name = "UNKNOWN";
+        const char *kind_name = "UNKNOWN";
+        char line[4096] = {0};
+
+        /* The *64 labels are separate cases only where the platform gives
+         * them values different from the plain commands. */
+        switch (cmd) {
+#if F_GETLK != F_GETLK64
+            case F_GETLK64:
+#endif
+            case F_GETLK:
+                op_name = "GETLK";
+                break;
+
+#if F_SETLK != F_SETLK64
+            case F_SETLK64:
+#endif
+            case F_SETLK:
+                op_name = "SETLK";
+                break;
+
+#if F_SETLKW != F_SETLKW64
+            case F_SETLKW64:
+#endif
+            case F_SETLKW:
+                op_name = "SETLKW";
+                break;
+
+            default:
+                break;
+        }
+
+        /* Translate flock->l_type into a short label. */
+        if (flock->l_type == F_RDLCK)
+            kind_name = "READ";
+        else if (flock->l_type == F_WRLCK)
+            kind_name = "WRITE";
+        else if (flock->l_type == F_UNLCK)
+            kind_name = "UNLOCK";
+
+        /* start, len and pid are widened to unsigned long long so they match
+         * the %llu conversions regardless of the platform's field widths. */
+        snprintf(line, sizeof(line),
+                 "%" PRId64
+                 ": gfid=%s volume=%s, (path=%s "
+                 "cmd=%s, type=%s, start=%llu, len=%llu, "
+                 "pid=%llu)",
+                 frame->root->unique, uuid_utoa(loc->inode->gfid), volume,
+                 loc->path, op_name, kind_name,
+                 (unsigned long long)flock->l_start,
+                 (unsigned long long)flock->l_len,
+                 (unsigned long long)flock->l_pid);
+
+        /* Borrowed gfid pointer, read back by trace_inodelk_cbk(). */
+        frame->local = loc->inode->gfid;
+
+        LOG_ELEMENT(settings, line);
+    }
+
+    /* Forward the original arguments untouched. */
+    STACK_WIND(frame, trace_inodelk_cbk, FIRST_CHILD(this),
+               FIRST_CHILD(this)->fops->inodelk, volume, loc, cmd, flock,
+               xdata);
+    return 0;
+}
+
+/* Wind-side tracer for finodelk. When tracing is enabled it formats the
+ * lock command, lock type and the start/len/pid fields of *flock into one
+ * log line, remembers the fd inode's gfid for the callback, and then
+ * forwards the call unchanged to the first child. Always returns 0. */
+int
+trace_finodelk(call_frame_t *frame, xlator_t *this, const char *volume,
+               fd_t *fd, int32_t cmd, struct gf_flock *flock, dict_t *xdata)
+{
+    trace_conf_t *settings = this->private;
+
+    /* Format nothing unless a log sink is on and this fop is traced. */
+    if ((settings->log_file || settings->log_history) &&
+        trace_fop_names[GF_FOP_FINODELK].enabled) {
+        /* Both labels stay "UNKNOWN" for values not matched below. */
+        const char *op_name = "UNKNOWN";
+        const char *kind_name = "UNKNOWN";
+        char line[4096] = {0};
+
+        /* The *64 labels are separate cases only where the platform gives
+         * them values different from the plain commands. */
+        switch (cmd) {
+#if F_GETLK != F_GETLK64
+            case F_GETLK64:
+#endif
+            case F_GETLK:
+                op_name = "GETLK";
+                break;
+
+#if F_SETLK != F_SETLK64
+            case F_SETLK64:
+#endif
+            case F_SETLK:
+                op_name = "SETLK";
+                break;
+
+#if F_SETLKW != F_SETLKW64
+            case F_SETLKW64:
+#endif
+            case F_SETLKW:
+                op_name = "SETLKW";
+                break;
+
+            default:
+                break;
+        }
+
+        /* Translate flock->l_type into a short label. */
+        if (flock->l_type == F_RDLCK)
+            kind_name = "READ";
+        else if (flock->l_type == F_WRLCK)
+            kind_name = "WRITE";
+        else if (flock->l_type == F_UNLCK)
+            kind_name = "UNLOCK";
+
+        /* start, len and pid are widened to unsigned long long so they match
+         * the %llu conversions regardless of the platform's field widths. */
+        snprintf(line, sizeof(line),
+                 "%" PRId64
+                 ": gfid=%s volume=%s, (fd =%p "
+                 "cmd=%s, type=%s, start=%llu, len=%llu, "
+                 "pid=%llu)",
+                 frame->root->unique, uuid_utoa(fd->inode->gfid), volume, fd,
+                 op_name, kind_name, (unsigned long long)flock->l_start,
+                 (unsigned long long)flock->l_len,
+                 (unsigned long long)flock->l_pid);
+
+        /* Borrowed gfid pointer, read back by trace_finodelk_cbk(). */
+        frame->local = fd->inode->gfid;
+
+        LOG_ELEMENT(settings, line);
+    }
+
+    /* Forward the original arguments untouched. */
+    STACK_WIND(frame, trace_finodelk_cbk, FIRST_CHILD(this),
+               FIRST_CHILD(this)->fops->finodelk, volume, fd, cmd, flock,
+               xdata);
+    return 0;
+}
 
-#include <time.h>
-#include <errno.h>
-#include "glusterfs.h"
-#include "xlator.h"
-#include "common-utils.h"
-
-#define ERR_EINVAL_NORETURN(cond)                \
-do                                               \
-  {						 \
-    if ((cond))					 \
-      {						 \
-	gf_log ("ERROR", GF_LOG_ERROR,   	 \
-		"%s: %s: (%s) is true", 	 \
-		__FILE__, __FUNCTION__, #cond);	 \
-      }                                          \
-  } while (0)
-
-typedef struct trace_private {
-	int32_t debug_flag;
-} trace_private_t;
-
-struct {
-	char *name;
-	int enabled;
-} trace_fop_names[GF_FOP_MAXVALUE];
-
-int32_t 
-trace_create_cbk (call_frame_t *frame,
-		  void *cookie,
-		  xlator_t *this,
-		  int32_t op_ret,
-		  int32_t op_errno,
-		  fd_t *fd,
-		  inode_t *inode,
-		  struct stat *buf)
-{
-	char atime_buf[256], mtime_buf[256], ctime_buf[256];
-	ERR_EINVAL_NORETURN (!this);
-
-	if (trace_fop_names[GF_FOP_CREATE].enabled) {
-		if (op_ret >= 0) {
-			strftime (atime_buf, 256, "[%b %d %H:%M:%S]", localtime (&buf->st_atime));
-			strftime (mtime_buf, 256, "[%b %d %H:%M:%S]", localtime (&buf->st_mtime));
-			strftime (ctime_buf, 256, "[%b %d %H:%M:%S]", localtime (&buf->st_ctime));
-
-			gf_log (this->name, GF_LOG_NORMAL, 
-				"%"PRId64": (op_ret=%d, fd=%p, ino=%"PRIu64"), "
-				"*buf {st_dev=%"GF_PRI_DEV", st_ino=%"PRIu64", "
-				"st_mode=%d, st_nlink=%"GF_PRI_NLINK", st_uid=%d, "
-				"st_gid=%d, st_rdev=%"GF_PRI_DEV", st_size=%"PRId64", "
-				"st_blksize=%"GF_PRI_BLKSIZE", st_blocks=%"PRId64", "
-				"st_atime=%s, st_mtime=%s, st_ctime=%s})",
-				frame->root->unique, op_ret, fd, inode->ino, buf->st_dev, 
-				buf->st_ino, buf->st_mode, buf->st_nlink, 
-				buf->st_uid, buf->st_gid, buf->st_rdev, buf->st_size, 
-				buf->st_blksize, 
-				buf->st_blocks, atime_buf, mtime_buf, ctime_buf);
-		} else {
-			gf_log (this->name, GF_LOG_NORMAL, 
-				"%"PRId64": (op_ret=%d, op_errno=%d)",
-				frame->root->unique, op_ret, op_errno);
-		}    
-	}
-
-	STACK_UNWIND (frame, op_ret, op_errno, fd, inode, buf);
-	return 0;
-}
-
-int32_t 
-trace_open_cbk (call_frame_t *frame,
-		void *cookie,
-		xlator_t *this,
-		int32_t op_ret,
-		int32_t op_errno,
-		fd_t *fd)
-{
-	ERR_EINVAL_NORETURN (!this);
-
-	if (trace_fop_names[GF_FOP_OPEN].enabled) {
-		gf_log (this->name, GF_LOG_NORMAL, 
-			"%"PRId64": (op_ret=%d, op_errno=%d, *fd=%p)",
-			frame->root->unique, op_ret, op_errno, fd);
-	}
-
-	STACK_UNWIND (frame, op_ret, op_errno, fd);
-	return 0;
-}
-
-int32_t 
-trace_stat_cbk (call_frame_t *frame,
-		void *cookie,
-		xlator_t *this,
-		int32_t op_ret,
-		int32_t op_errno,
-		struct stat *buf)
-{
-	char atime_buf[256], mtime_buf[256], ctime_buf[256];
-	ERR_EINVAL_NORETURN (!this);
-  
-	if (trace_fop_names[GF_FOP_STAT].enabled) {
-
-		if (op_ret >= 0) {
-			strftime (atime_buf, 256, "[%b %d %H:%M:%S]", localtime (&buf->st_atime));
-			strftime (mtime_buf, 256, "[%b %d %H:%M:%S]", localtime (&buf->st_mtime));
-			strftime (ctime_buf, 256, "[%b %d %H:%M:%S]", localtime (&buf->st_ctime));
-
-			gf_log (this->name, GF_LOG_NORMAL, 
-				"%"PRId64": (op_ret=%d, buf {st_dev=%"GF_PRI_DEV", "
-				"st_ino=%"PRIu64", st_mode=%d, st_nlink=%"GF_PRI_NLINK", "
-				"st_uid=%d, st_gid=%d, st_rdev=%"GF_PRI_DEV", st_size=%"PRId64
-				", st_blksize=%"GF_PRI_BLKSIZE", st_blocks=%"PRId64", "
-				"st_atime=%s, st_mtime=%s, st_ctime=%s})",
-				frame->root->unique, op_ret, buf->st_dev, buf->st_ino, 
-				buf->st_mode, buf->st_nlink, buf->st_uid, buf->st_gid, 
-				buf->st_rdev, buf->st_size, buf->st_blksize, 
-				buf->st_blocks, atime_buf, mtime_buf, ctime_buf);
-		} else {
-			gf_log (this->name, GF_LOG_NORMAL, 
-				"%"PRId64": (op_ret=%d, op_errno=%d)",
-				frame->root->unique, op_ret, op_errno);
-		}    
-	}    
-
-	STACK_UNWIND (frame, op_ret, op_errno, buf);
-	return 0;
-}
-
-int32_t 
-trace_readv_cbk (call_frame_t *frame,
-		 void *cookie,
-		 xlator_t *this,
-		 int32_t op_ret,
-		 int32_t op_errno,
-		 struct iovec *vector,
-		 int32_t count,
-		 struct stat *buf)
-{
-	char atime_buf[256], mtime_buf[256], ctime_buf[256];
-	ERR_EINVAL_NORETURN (!this);
-
-	if (trace_fop_names[GF_FOP_READ].enabled) {
-
-		if (op_ret >= 0) {
-			strftime (atime_buf, 256, "[%b %d %H:%M:%S]", localtime (&buf->st_atime));
-			strftime (mtime_buf, 256, "[%b %d %H:%M:%S]", localtime (&buf->st_mtime));
-			strftime (ctime_buf, 256, "[%b %d %H:%M:%S]", localtime (&buf->st_ctime));
-
-			gf_log (this->name, GF_LOG_NORMAL, 
-				"%"PRId64": (op_ret=%d, *buf {st_dev=%"GF_PRI_DEV", "
-				"st_ino=%"PRIu64", st_mode=%d, st_nlink=%"GF_PRI_NLINK", "
-				"st_uid=%d, st_gid=%d, st_rdev=%"GF_PRI_DEV", "
-				"st_size=%"PRId64", st_blksize=%"GF_PRI_BLKSIZE", "
-				"st_blocks=%"PRId64", st_atime=%s, st_mtime=%s, st_ctime=%s})",
-				frame->root->unique, op_ret, buf->st_dev, buf->st_ino, 
-				buf->st_mode, buf->st_nlink, buf->st_uid, buf->st_gid, 
-				buf->st_rdev, buf->st_size, buf->st_blksize, buf->st_blocks, 
-				atime_buf, mtime_buf, ctime_buf);
-		} else {
-			gf_log (this->name, GF_LOG_NORMAL, 
-				"%"PRId64": (op_ret=%d, op_errno=%d)",
-				frame->root->unique, op_ret, op_errno);
-		}    
-	}
-  
-	STACK_UNWIND (frame, op_ret, op_errno, vector, count, buf);
-	return 0;
-}
-
-int32_t 
-trace_writev_cbk (call_frame_t *frame,
-		  void *cookie,
-		  xlator_t *this,
-		  int32_t op_ret,
-		  int32_t op_errno,
-		  struct stat *buf)
-{
-	char atime_buf[256], mtime_buf[256], ctime_buf[256];
-	ERR_EINVAL_NORETURN (!this);
-
-	if (trace_fop_names[GF_FOP_WRITE].enabled) {
-		if (op_ret >= 0) {
-			strftime (atime_buf, 256, "[%b %d %H:%M:%S]", localtime (&buf->st_atime));
-			strftime (mtime_buf, 256, "[%b %d %H:%M:%S]", localtime (&buf->st_mtime));
-			strftime (ctime_buf, 256, "[%b %d %H:%M:%S]", localtime (&buf->st_ctime));
-
-			gf_log (this->name, GF_LOG_NORMAL, 
-				"%"PRId64": (op_ret=%d, *buf {st_ino=%"PRIu64", "
-				"st_size=%"PRId64", st_blocks=%"PRId64", st_atime=%s, "
-				"st_mtime=%s, st_ctime=%s})",
-				frame->root->unique, op_ret, buf->st_ino, buf->st_size, 
-				buf->st_blocks, atime_buf, mtime_buf, ctime_buf);
-		} else {
-			gf_log (this->name, GF_LOG_NORMAL, 
-				"%"PRId64": (op_ret=%d, op_errno=%d)",
-				frame->root->unique, op_ret, op_errno);
-		}    
-	}
-
-	STACK_UNWIND (frame, op_ret, op_errno, buf);
-	return 0;
-}
-
-int32_t 
-trace_getdents_cbk (call_frame_t *frame,
-		    void *cookie,
-		    xlator_t *this,
-		    int32_t op_ret,
-		    int32_t op_errno,
-		    dir_entry_t *entries,
-		    int32_t count)
-{
-	ERR_EINVAL_NORETURN (!this );
-
-	if (trace_fop_names[GF_FOP_GETDENTS].enabled) {
-		gf_log (this->name, GF_LOG_NORMAL, 
-			"%"PRId64": (op_ret=%d, op_errno=%d, count=%d)",
-			frame->root->unique, op_ret, op_errno, count);
-	}
-  
-	STACK_UNWIND (frame, op_ret, op_errno, entries, count);
-	return 0;
-}
-
-int32_t 
-trace_readdir_cbk (call_frame_t *frame,
-		   void *cookie,
-		   xlator_t *this,
-		   int32_t op_ret,
-		   int32_t op_errno,
-		   gf_dirent_t *buf)
-{
-	ERR_EINVAL_NORETURN (!this );
-
-	if (trace_fop_names[GF_FOP_READDIR].enabled) {
-		gf_log (this->name, GF_LOG_NORMAL, 
-			"%"PRId64" :(op_ret=%d, op_errno=%d)",
-			frame->root->unique, op_ret, op_errno);
-	}
-  
-	STACK_UNWIND (frame, op_ret, op_errno, buf);
-
-	return 0;
-}
-
-int32_t 
-trace_fsync_cbk (call_frame_t *frame,
-		 void *cookie,
-		 xlator_t *this,
-		 int32_t op_ret,
-		 int32_t op_errno)
-{
-	ERR_EINVAL_NORETURN (!this );
-
-	if (trace_fop_names[GF_FOP_FSYNC].enabled) {
-		gf_log (this->name, GF_LOG_NORMAL, 
-			"%"PRId64": (op_ret=%d, op_errno=%d)",
-			frame->root->unique, op_ret, op_errno);
-	}
-
-	STACK_UNWIND (frame, op_ret, op_errno);
-	return 0;
-}
-
-int32_t 
-trace_chown_cbk (call_frame_t *frame,
-		 void *cookie,
-		 xlator_t *this,
-		 int32_t op_ret,
-		 int32_t op_errno,
-		 struct stat *buf)
-{
-	char atime_buf[256], mtime_buf[256], ctime_buf[256];
-	ERR_EINVAL_NORETURN (!this );
-
-	if (trace_fop_names[GF_FOP_CHOWN].enabled) {
-		if (op_ret >= 0) {
-			strftime (atime_buf, 256, "[%b %d %H:%M:%S]", localtime (&buf->st_atime));
-			strftime (mtime_buf, 256, "[%b %d %H:%M:%S]", localtime (&buf->st_mtime));
-			strftime (ctime_buf, 256, "[%b %d %H:%M:%S]", localtime (&buf->st_ctime));
-    
-			gf_log (this->name, GF_LOG_NORMAL, 
-				"%"PRId64": (op_ret=%d, *buf {st_ino=%"PRIu64", st_mode=%d, "
-				"st_uid=%d, st_gid=%d, st_atime=%s, st_mtime=%s, st_ctime=%s})",
-				frame->root->unique, op_ret, buf->st_ino, buf->st_mode, 
-				buf->st_uid, buf->st_gid, atime_buf, mtime_buf, ctime_buf);
-		} else {
-			gf_log (this->name, GF_LOG_NORMAL, 
-				"%"PRId64": (op_ret=%d, op_errno=%d)",
-				frame->root->unique, op_ret, op_errno);
-		}    
-	}
-
-	STACK_UNWIND (frame, op_ret, op_errno, buf);
-	return 0;
-}
-
-int32_t 
-trace_chmod_cbk (call_frame_t *frame,
-		 void *cookie,
-		 xlator_t *this,
-		 int32_t op_ret,
-		 int32_t op_errno,
-		 struct stat *buf)
-{
-	char atime_buf[256], mtime_buf[256], ctime_buf[256];
-	ERR_EINVAL_NORETURN (!this );
-
-	if (trace_fop_names[GF_FOP_CHMOD].enabled) {
-		if (op_ret >= 0) {
-			strftime (atime_buf, 256, "[%b %d %H:%M:%S]", localtime (&buf->st_atime));
-			strftime (mtime_buf, 256, "[%b %d %H:%M:%S]", localtime (&buf->st_mtime));
-			strftime (ctime_buf, 256, "[%b %d %H:%M:%S]", localtime (&buf->st_ctime));
-    
-			gf_log (this->name, GF_LOG_NORMAL, 
-				"%"PRId64": (op_ret=%d, *buf {st_ino=%"PRIu64", st_mode=%d, "
-				"st_atime=%s, st_mtime=%s, st_ctime=%s})",
-				frame->root->unique, op_ret, buf->st_ino, buf->st_mode, 
-				atime_buf, mtime_buf, ctime_buf);
-		} else {
-			gf_log (this->name, GF_LOG_NORMAL, 
-				"%"PRId64": (op_ret=%d, op_errno=%d)",
-				frame->root->unique, op_ret, op_errno);
-		}    
-	}
-
-	STACK_UNWIND (frame, op_ret, op_errno, buf);
-	return 0;
-}
-
-int32_t 
-trace_fchmod_cbk (call_frame_t *frame,
-		  void *cookie,
-		  xlator_t *this,
-		  int32_t op_ret,
-		  int32_t op_errno,
-		  struct stat *buf)
-{
-	char atime_buf[256], mtime_buf[256], ctime_buf[256];
-	ERR_EINVAL_NORETURN (!this );
-
-	if (trace_fop_names[GF_FOP_FCHMOD].enabled) {
-		if (op_ret >= 0) {
-			strftime (atime_buf, 256, "[%b %d %H:%M:%S]", localtime (&buf->st_atime));
-			strftime (mtime_buf, 256, "[%b %d %H:%M:%S]", localtime (&buf->st_mtime));
-			strftime (ctime_buf, 256, "[%b %d %H:%M:%S]", localtime (&buf->st_ctime));
-    
-			gf_log (this->name, GF_LOG_NORMAL, 
-				"%"PRId64": (op_ret=%d, *buf {st_ino=%"PRIu64", st_mode=%d, "
-				"st_atime=%s, st_mtime=%s, st_ctime=%s})",
-				frame->root->unique, op_ret, buf->st_ino, buf->st_mode, 
-				atime_buf, mtime_buf, ctime_buf);
-		} else {
-			gf_log (this->name, GF_LOG_NORMAL, 
-				"%"PRId64": (op_ret=%d, op_errno=%d)",
-				frame->root->unique, op_ret, op_errno);
-		}    
-	}
-
-	STACK_UNWIND (frame, op_ret, op_errno, buf);
-	return 0;
-}
-
-int32_t 
-trace_fchown_cbk (call_frame_t *frame,
-		  void *cookie,
-		  xlator_t *this,
-		  int32_t op_ret,
-		  int32_t op_errno,
-		  struct stat *buf)
-{
-	char atime_buf[256], mtime_buf[256], ctime_buf[256];
-	ERR_EINVAL_NORETURN (!this );
-
-	if (trace_fop_names[GF_FOP_FCHOWN].enabled) {  
-		if (op_ret >= 0) {
-			strftime (atime_buf, 256, "[%b %d %H:%M:%S]", localtime (&buf->st_atime));
-			strftime (mtime_buf, 256, "[%b %d %H:%M:%S]", localtime (&buf->st_mtime));
-			strftime (ctime_buf, 256, "[%b %d %H:%M:%S]", localtime (&buf->st_ctime));
-    
-			gf_log (this->name, GF_LOG_NORMAL, 
-				"%"PRId64": (op_ret=%d, *buf {st_ino=%"PRIu64", st_mode=%d, "
-				"st_uid=%d, st_gid=%d, st_atime=%s, st_mtime=%s, st_ctime=%s})",
-				frame->root->unique, op_ret, buf->st_ino, buf->st_mode, 
-				buf->st_uid, buf->st_gid, atime_buf, mtime_buf, ctime_buf);
-		} else {
-			gf_log (this->name, GF_LOG_NORMAL, 
-				"%"PRId64": (op_ret=%d, op_errno=%d)",
-				frame->root->unique, op_ret, op_errno);
-		}    
-	}
-
-	STACK_UNWIND (frame, op_ret, op_errno, buf);
-	return 0;
-}
-
-int32_t 
-trace_unlink_cbk (call_frame_t *frame,
-		  void *cookie,
-		  xlator_t *this,
-		  int32_t op_ret,
-		  int32_t op_errno)
-{
-	ERR_EINVAL_NORETURN (!this );
-
-	if (trace_fop_names[GF_FOP_UNLINK].enabled) {
-		gf_log (this->name, GF_LOG_NORMAL, 
-			"%"PRId64": (op_ret=%d, op_errno=%d)",
-			frame->root->unique, op_ret, op_errno);
-	}
-
-	STACK_UNWIND (frame, op_ret, op_errno);
-	return 0;
-}
-
-int32_t 
-trace_rename_cbk (call_frame_t *frame,
-		  void *cookie,
-		  xlator_t *this,
-		  int32_t op_ret,
-		  int32_t op_errno,
-		  struct stat *buf)
-{
-	ERR_EINVAL_NORETURN (!this );
-
-	if (trace_fop_names[GF_FOP_RENAME].enabled) {
-		gf_log (this->name, GF_LOG_NORMAL, 
-			"%"PRId64": (op_ret=%d, op_errno=%d, buf {st_ino=%"PRIu64"})",
-			frame->root->unique, op_ret, op_errno, 
-			(buf? buf->st_ino : 0));
-	}
-  
-	STACK_UNWIND (frame, op_ret, op_errno, buf);
-	return 0;
-}
-
-int32_t 
-trace_readlink_cbk (call_frame_t *frame,
-		    void *cookie,
-		    xlator_t *this,
-		    int32_t op_ret,
-		    int32_t op_errno,
-		    const char *buf)
-{
-	ERR_EINVAL_NORETURN (!this );
-
-	if (trace_fop_names[GF_FOP_READLINK].enabled) {  
-		gf_log (this->name, GF_LOG_NORMAL, 
-			"%"PRId64": (op_ret=%d, op_errno=%d, buf=%s)",
-			frame->root->unique, op_ret, op_errno, buf);
-	}
-
-	STACK_UNWIND (frame, op_ret, op_errno, buf);
-	return 0;
-}
-
-int32_t 
-trace_lookup_cbk (call_frame_t *frame,
-		  void *cookie,
-		  xlator_t *this,
-		  int32_t op_ret,
-		  int32_t op_errno,
-		  inode_t *inode,
-		  struct stat *buf,
-		  dict_t *xattr)
-{
-	ERR_EINVAL_NORETURN (!this );
-
-	if (trace_fop_names[GF_FOP_LOOKUP].enabled) {
-		if (op_ret >= 0) {
-			gf_log (this->name, GF_LOG_NORMAL, 
-				"%"PRId64": (op_ret=%d, ino=%"PRIu64", "
-				"*buf {st_dev=%"GF_PRI_DEV", st_ino=%"PRIu64", st_mode=%d, "
-				"st_nlink=%"GF_PRI_NLINK", st_uid=%d, st_gid=%d, "
-				"st_rdev=%"GF_PRI_DEV", st_size=%"PRId64", "
-				"st_blksize=%"GF_PRI_BLKSIZE", st_blocks=%"PRId64"})",
-				frame->root->unique, op_ret, inode->ino, buf->st_dev, buf->st_ino, 
-				buf->st_mode, buf->st_nlink, buf->st_uid, buf->st_gid, 
-				buf->st_rdev, buf->st_size, buf->st_blksize, buf->st_blocks);
-		} else {
-			gf_log (this->name, GF_LOG_NORMAL, 
-				"%"PRId64": (op_ret=%d, op_errno=%d)",
-				frame->root->unique, op_ret, op_errno);
-		}    
-	}
-
-	STACK_UNWIND (frame, op_ret, op_errno, inode, buf, xattr);
-	return 0;
-}
-
-int32_t 
-trace_symlink_cbk (call_frame_t *frame,
-		   void *cookie,
-		   xlator_t *this,
-		   int32_t op_ret,
-		   int32_t op_errno,
-		   inode_t *inode,
-		   struct stat *buf)
-{
-	char atime_buf[256], mtime_buf[256], ctime_buf[256];
-	ERR_EINVAL_NORETURN (!this );
-
-	if (trace_fop_names[GF_FOP_SYMLINK].enabled) {
-		if (op_ret >= 0) {
-			strftime (atime_buf, 256, "[%b %d %H:%M:%S]", localtime (&buf->st_atime));
-			strftime (mtime_buf, 256, "[%b %d %H:%M:%S]", localtime (&buf->st_mtime));
-			strftime (ctime_buf, 256, "[%b %d %H:%M:%S]", localtime (&buf->st_ctime));
-    
-			gf_log (this->name, GF_LOG_NORMAL, 
-				"%"PRId64": (op_ret=%d, ino=%"PRIu64", *buf {st_ino=%"PRIu64", "
-				"st_mode=%d, st_nlink=%"GF_PRI_NLINK", st_uid=%d, st_gid=%d, "
-				"st_size=%"PRId64", st_blocks=%"PRId64", st_atime=%s, "
-				"st_mtime=%s, st_ctime=%s})",
-				frame->root->unique, op_ret, inode->ino, buf->st_ino, 
-				buf->st_mode, buf->st_nlink, buf->st_uid, buf->st_gid, 
-				buf->st_size, buf->st_blocks, atime_buf, mtime_buf, ctime_buf);
-		} else {
-			gf_log (this->name, GF_LOG_NORMAL, 
-				"%"PRId64": (op_ret=%d, op_errno=%d)",
-				frame->root->unique, op_ret, op_errno);
-		}    
-	}
-
-	STACK_UNWIND (frame, op_ret, op_errno, inode, buf);
-	return 0;
-}
-
-int32_t 
-trace_mknod_cbk (call_frame_t *frame,
-		 void *cookie,
-		 xlator_t *this,
-		 int32_t op_ret,
-		 int32_t op_errno,
-		 inode_t *inode,
-		 struct stat *buf)
-{
-	char atime_buf[256], mtime_buf[256], ctime_buf[256];
-	ERR_EINVAL_NORETURN (!this );
-
-	if (trace_fop_names[GF_FOP_MKNOD].enabled) {  
-		if (op_ret >= 0) {
-			strftime (atime_buf, 256, "[%b %d %H:%M:%S]", localtime (&buf->st_atime));
-			strftime (mtime_buf, 256, "[%b %d %H:%M:%S]", localtime (&buf->st_mtime));
-			strftime (ctime_buf, 256, "[%b %d %H:%M:%S]", localtime (&buf->st_ctime));
-    
-			gf_log (this->name, GF_LOG_NORMAL, 
-				"%"PRId64": (op_ret=%d, ino=%"PRIu64", *buf {st_dev=%"GF_PRI_DEV
-				", st_ino=%"PRIu64", st_mode=%d, st_nlink=%"GF_PRI_NLINK", "
-				"st_uid=%d, st_gid=%d, st_rdev=%"GF_PRI_DEV", st_size=%"PRId64
-				", st_blksize=%"GF_PRI_BLKSIZE", st_blocks=%"PRId64", st_atime=%s, "
-				"st_mtime=%s, st_ctime=%s})",
-				frame->root->unique, op_ret, inode->ino, buf->st_dev, buf->st_ino, 
-				buf->st_mode, buf->st_nlink, buf->st_uid, buf->st_gid, 
-				buf->st_rdev, buf->st_size, buf->st_blksize, buf->st_blocks, 
-				atime_buf, mtime_buf, ctime_buf);
-		} else {
-			gf_log (this->name, GF_LOG_NORMAL, 
-				"%"PRId64": (op_ret=%d, op_errno=%d)",
-				frame->root->unique, op_ret, op_errno);
-		}    
-	}
-
-	STACK_UNWIND (frame, op_ret, op_errno, inode, buf);
-	return 0;
-}
-  
-
-int32_t 
-trace_mkdir_cbk (call_frame_t *frame,
-		 void *cookie,
-		 xlator_t *this,
-		 int32_t op_ret,
-		 int32_t op_errno,
-		 inode_t *inode,
-		 struct stat *buf)
-{
-	ERR_EINVAL_NORETURN (!this );
-
-	if (trace_fop_names[GF_FOP_MKDIR].enabled) {  
-		gf_log (this->name, GF_LOG_NORMAL, 
-			"%"PRId64": (op_ret=%d, op_errno=%d, ino=%"PRIu64"",
-			frame->root->unique, op_ret, op_errno, 
-			(inode? inode->ino : 0));
-	}
-
-	STACK_UNWIND (frame, op_ret, op_errno, inode, buf);
-	return 0;
-}
-  
-int32_t 
-trace_link_cbk (call_frame_t *frame,
-		void *cookie,
-		xlator_t *this,
-		int32_t op_ret,
-		int32_t op_errno,
-		inode_t *inode,
-		struct stat *buf)
-{
-	ERR_EINVAL_NORETURN (!this );
-
-	if (trace_fop_names[GF_FOP_LINK].enabled) {
-		if (op_ret >= 0) {
-			gf_log (this->name, GF_LOG_NORMAL, 
-				"%"PRId64": (op_ret=%d, ino=%"PRIu64", "
-				"*buf {st_nlink=%"GF_PRI_NLINK"})",
-				frame->root->unique, op_ret, inode->ino, buf->st_nlink);
-		} else {
-			gf_log (this->name, GF_LOG_NORMAL, 
-				"%"PRId64": (op_ret=%d, op_errno=%d)",
-				frame->root->unique, op_ret, op_errno);
-		}    
-	}
-
-	STACK_UNWIND (frame, op_ret, op_errno, inode, buf);
-	return 0;
-}
-
-int32_t 
-trace_flush_cbk (call_frame_t *frame,
-		 void *cookie,
-		 xlator_t *this,
-		 int32_t op_ret,
-		 int32_t op_errno)
-{
-	ERR_EINVAL_NORETURN (!this );
-
-	if (trace_fop_names[GF_FOP_FLUSH].enabled) {
-		gf_log (this->name, GF_LOG_NORMAL, 
-			"%"PRId64": (op_ret=%d, op_errno=%d)",
-			frame->root->unique, op_ret, op_errno);
-	}
-
-	STACK_UNWIND (frame, op_ret, op_errno);
-	return 0;
-}
-
-
-int32_t 
-trace_opendir_cbk (call_frame_t *frame,
-		   void *cookie,
-		   xlator_t *this,
-		   int32_t op_ret,
-		   int32_t op_errno,
-		   fd_t *fd)
-{
-	ERR_EINVAL_NORETURN (!this );
-
-	if (trace_fop_names[GF_FOP_OPENDIR].enabled) {
-		gf_log (this->name, GF_LOG_NORMAL, 
-			"%"PRId64": (op_ret=%d, op_errno=%d, fd=%p)",
-			frame->root->unique, op_ret, op_errno, fd);
-	}
-
-	STACK_UNWIND (frame, op_ret, op_errno, fd);
-	return 0;
-}
-
-int32_t 
-trace_rmdir_cbk (call_frame_t *frame,
-		 void *cookie,
-		 xlator_t *this,
-		 int32_t op_ret,
-		 int32_t op_errno)
-{
-	ERR_EINVAL_NORETURN (!this );
-
-	if (trace_fop_names[GF_FOP_RMDIR].enabled) {
-		gf_log (this->name, GF_LOG_NORMAL, 
-			"%"PRId64": (op_ret=%d, op_errno=%d)",
-			frame->root->unique, op_ret, op_errno);
-	}
-
-	STACK_UNWIND (frame, op_ret, op_errno);
-	return 0;
-}
-
-int32_t 
-trace_truncate_cbk (call_frame_t *frame,
-		    void *cookie,
-		    xlator_t *this,
-		    int32_t op_ret,
-		    int32_t op_errno,
-		    struct stat *buf)
-{
-	ERR_EINVAL_NORETURN (!this );
-
-	if (trace_fop_names[GF_FOP_TRUNCATE].enabled) {  
-		if (op_ret >= 0) {
-			gf_log (this->name, GF_LOG_NORMAL, 
-				"%"PRId64": (op_ret=%d, *buf {st_size=%"PRId64", st_blksize=%"
-				GF_PRI_BLKSIZE", st_blocks=%"PRId64"})",
-				frame->root->unique, op_ret, buf->st_size, buf->st_blksize, 
-				buf->st_blocks);
-		} else {
-			gf_log (this->name, GF_LOG_NORMAL, 
-				"%"PRId64": (op_ret=%d, op_errno=%d)",
-				frame->root->unique, op_ret, op_errno);
-		}    
-	}
-
-	STACK_UNWIND (frame, op_ret, op_errno, buf);
-	return 0;
-}
-
-int32_t 
-trace_utimens_cbk (call_frame_t *frame,
-		   void *cookie,
-		   xlator_t *this,
-		   int32_t op_ret,
-		   int32_t op_errno,
-		   struct stat *buf)
-{
-	char atime_buf[256], mtime_buf[256], ctime_buf[256];
-	ERR_EINVAL_NORETURN (!this );
-
-	if (trace_fop_names[GF_FOP_UTIMENS].enabled) {
-		if (op_ret >= 0) {
-			strftime (atime_buf, 256, "[%b %d %H:%M:%S]", localtime (&buf->st_atime));
-			strftime (mtime_buf, 256, "[%b %d %H:%M:%S]", localtime (&buf->st_mtime));
-			strftime (ctime_buf, 256, "[%b %d %H:%M:%S]", localtime (&buf->st_ctime));
-    
-			gf_log (this->name, GF_LOG_NORMAL, 
-				"%"PRId64": (op_ret=%d, *buf {st_atime=%s, st_mtime=%s, "
-				"st_ctime=%s})",
-				frame->root->unique, op_ret, atime_buf, mtime_buf, ctime_buf);
-		} else {
-			gf_log (this->name, GF_LOG_NORMAL, 
-				"%"PRId64": (op_ret=%d, op_errno=%d)",
-				frame->root->unique, op_ret, op_errno);
-		}    
-	}
-
-	STACK_UNWIND (frame, op_ret, op_errno, buf);
-	return 0;
-}
-
-int32_t 
-trace_statfs_cbk (call_frame_t *frame,
-		  void *cookie,
-		  xlator_t *this,
-		  int32_t op_ret,
-		  int32_t op_errno,
-		  struct statvfs *buf)
-{
-	ERR_EINVAL_NORETURN (!this);
-
-	if (trace_fop_names[GF_FOP_STATFS].enabled) {
-		if (op_ret >= 0) {
-			gf_log (this->name, GF_LOG_NORMAL, 
-				"%"PRId64": ({f_bsize=%lu, f_frsize=%lu, f_blocks=%"GF_PRI_FSBLK
-				", f_bfree=%"GF_PRI_FSBLK", f_bavail=%"GF_PRI_FSBLK", "
-				"f_files=%"GF_PRI_FSBLK", f_ffree=%"GF_PRI_FSBLK", f_favail=%"
-				GF_PRI_FSBLK", f_fsid=%lu, f_flag=%lu, f_namemax=%lu}) => ret=%d",
-				frame->root->unique, buf->f_bsize, buf->f_frsize, buf->f_blocks, 
-				buf->f_bfree, buf->f_bavail, buf->f_files, buf->f_ffree, 
-				buf->f_favail, buf->f_fsid, buf->f_flag, buf->f_namemax, op_ret);
-		} else {
-			gf_log (this->name, GF_LOG_NORMAL, 
-				"%"PRId64": (op_ret=%d, op_errno=%d)",
-				frame->root->unique, op_ret, op_errno);
-		}    
-	}
-
-	STACK_UNWIND (frame, op_ret, op_errno, buf);
-	return 0;
-}
-
-int32_t 
-trace_setxattr_cbk (call_frame_t *frame,
-		    void *cookie,
-		    xlator_t *this,
-		    int32_t op_ret,
-		    int32_t op_errno)
-{
-	ERR_EINVAL_NORETURN (!this );
-
-	if (trace_fop_names[GF_FOP_SETXATTR].enabled) {
-		gf_log (this->name, GF_LOG_NORMAL, 
-			"%"PRId64": (op_ret=%d, op_errno=%d)",
-			frame->root->unique, op_ret, op_errno);
-	}
-
-	STACK_UNWIND (frame, op_ret, op_errno);
-	return 0;
-}
-
-int32_t 
-trace_getxattr_cbk (call_frame_t *frame,
-		    void *cookie,
-		    xlator_t *this,
-		    int32_t op_ret,
-		    int32_t op_errno,
-		    dict_t *dict)
-{
-	ERR_EINVAL_NORETURN (!this || !dict);
-
-	if (trace_fop_names[GF_FOP_GETXATTR].enabled) {
-		gf_log (this->name, GF_LOG_NORMAL, 
-			"%"PRId64": (op_ret=%d, op_errno=%d, dict=%p)",
-			frame->root->unique, op_ret, op_errno, dict);
-	}
-
-	STACK_UNWIND (frame, op_ret, op_errno, dict);
-	return 0;
-}
-
-int32_t 
-trace_removexattr_cbk (call_frame_t *frame,
-		       void *cookie,
-		       xlator_t *this,
-		       int32_t op_ret,
-		       int32_t op_errno)
-{
-	ERR_EINVAL_NORETURN (!this );
-
-	if (trace_fop_names[GF_FOP_REMOVEXATTR].enabled) {
-		gf_log (this->name, GF_LOG_NORMAL, 
-			"%"PRId64": (op_ret=%d, op_errno=%d)",
-			frame->root->unique, op_ret, op_errno);
-	}
-
-	STACK_UNWIND (frame, op_ret, op_errno);
-	return 0;
-}
-
-
-int32_t 
-trace_fsyncdir_cbk (call_frame_t *frame,
-		    void *cookie,
-		    xlator_t *this,
-		    int32_t op_ret,
-		    int32_t op_errno)
-{
-	ERR_EINVAL_NORETURN (!this );
-
-	if (trace_fop_names[GF_FOP_FSYNCDIR].enabled) {
-		gf_log (this->name, GF_LOG_NORMAL, 
-			"%"PRId64": (op_ret=%d, op_errno=%d)",
-			frame->root->unique, op_ret, op_errno);
-	}
-
-	STACK_UNWIND (frame, op_ret, op_errno);
-	return 0;
-}
-
-int32_t 
-trace_access_cbk (call_frame_t *frame,
-		  void *cookie,
-		  xlator_t *this,
-		  int32_t op_ret,
-		  int32_t op_errno)
-{
-	ERR_EINVAL_NORETURN (!this );
-
-	if (trace_fop_names[GF_FOP_ACCESS].enabled) {
-		gf_log (this->name, GF_LOG_NORMAL, 
-			"%"PRId64": (op_ret=%d, op_errno=%d)",
-			frame->root->unique, op_ret, op_errno);
-	}
-
-	STACK_UNWIND (frame, op_ret, op_errno);
-	return 0;
-}
-
-int32_t 
-trace_ftruncate_cbk (call_frame_t *frame,
-		     void *cookie,
-		     xlator_t *this,
-		     int32_t op_ret,
-		     int32_t op_errno,
-		     struct stat *buf)
-{
-	ERR_EINVAL_NORETURN (!this );
-
-	if (trace_fop_names[GF_FOP_FTRUNCATE].enabled) {
-		if (op_ret >= 0) {
-			gf_log (this->name, GF_LOG_NORMAL, 
-				"%"PRId64": (op_ret=%d, *buf {st_size=%"PRId64", "
-				"st_blksize=%"GF_PRI_BLKSIZE", st_blocks=%"PRId64"})",
-				frame->root->unique, op_ret, buf->st_size, buf->st_blksize, 
-				buf->st_blocks);
-		} else {
-			gf_log (this->name, GF_LOG_NORMAL, 
-				"%"PRId64": (op_ret=%d, op_errno=%d)",
-				frame->root->unique, op_ret, op_errno);
-		}    
-	}
-
-	STACK_UNWIND (frame, op_ret, op_errno, buf);
-	return 0;
-}
-
-int32_t 
-trace_fstat_cbk (call_frame_t *frame,
-		 void *cookie,
-		 xlator_t *this,
-		 int32_t op_ret,
-		 int32_t op_errno,
-		 struct stat *buf)
-{
-	char atime_buf[256], mtime_buf[256], ctime_buf[256];
-	ERR_EINVAL_NORETURN (!this );
-  
-	if (trace_fop_names[GF_FOP_FSTAT].enabled) {
-		if (op_ret >= 0) {
-			strftime (atime_buf, 256, "[%b %d %H:%M:%S]", localtime (&buf->st_atime));
-			strftime (mtime_buf, 256, "[%b %d %H:%M:%S]", localtime (&buf->st_mtime));
-			strftime (ctime_buf, 256, "[%b %d %H:%M:%S]", localtime (&buf->st_ctime));
-    
-			gf_log (this->name, GF_LOG_NORMAL, 
-				"%"PRId64": (op_ret=%d, *buf {st_dev=%"GF_PRI_DEV", "
-				"st_ino=%"PRIu64", st_mode=%d, st_nlink=%"GF_PRI_NLINK", "
-				"st_uid=%d, st_gid=%d, st_rdev=%"GF_PRI_DEV", st_size=%"PRId64", "
-				"st_blksize=%"GF_PRI_BLKSIZE", st_blocks=%"PRId64", st_atime=%s, "
-				"st_mtime=%s, st_ctime=%s})",
-				frame->root->unique, op_ret, buf->st_dev, buf->st_ino, 
-				buf->st_mode, buf->st_nlink, buf->st_uid, buf->st_gid, 
-				buf->st_rdev, buf->st_size, buf->st_blksize, 
-				buf->st_blocks, atime_buf, mtime_buf, ctime_buf);
-		} else {
-			gf_log (this->name, GF_LOG_NORMAL, 
-				"%"PRId64": (op_ret=%d, op_errno=%d)",
-				frame->root->unique, op_ret, op_errno);
-		}    
-	}
-
-	STACK_UNWIND (frame, op_ret, op_errno, buf);
-	return 0;
-}
-
-int32_t 
-trace_lk_cbk (call_frame_t *frame,
-	      void *cookie,
-	      xlator_t *this,
-	      int32_t op_ret,
-	      int32_t op_errno,
-	      struct flock *lock)
-{
-	ERR_EINVAL_NORETURN (!this );
-
-	if (trace_fop_names[GF_FOP_LK].enabled) {
-		if (op_ret >= 0) {
-			gf_log (this->name, GF_LOG_NORMAL,
-				"%"PRId64": (op_ret=%d, {l_type=%d, l_whence=%d, "
-				"l_start=%"PRId64", l_len=%"PRId64", l_pid=%u})",
-				frame->root->unique, op_ret, lock->l_type, lock->l_whence, 
-				lock->l_start, lock->l_len, lock->l_pid);
-		} else {
-			gf_log (this->name, GF_LOG_NORMAL, 
-				"%"PRId64": (op_ret=%d, op_errno=%d)",
-				frame->root->unique, op_ret, op_errno);
-		}    
-	}
-
-	STACK_UNWIND (frame, op_ret, op_errno, lock);
-	return 0;
-}
-
-
-int32_t 
-trace_setdents_cbk (call_frame_t *frame,
-		    void *cookie,
-		    xlator_t *this,
-		    int32_t op_ret,
-		    int32_t op_errno)
-{
-	ERR_EINVAL_NORETURN (!this );
-
-	if (trace_fop_names[GF_FOP_SETDENTS].enabled) {  
-		gf_log (this->name, GF_LOG_NORMAL,
-			"%"PRId64": op_ret=%d, op_errno=%d",
-			frame->root->unique, op_ret, op_errno);
-	}
-
-	STACK_UNWIND (frame, op_ret, op_errno);
-	return 0;
-}
-
-int32_t 
-trace_entrylk_cbk (call_frame_t *frame,
-		     void *cookie,
-		     xlator_t *this,
-		     int32_t op_ret,
-		     int32_t op_errno)
-{
-	ERR_EINVAL_NORETURN (!this );
-
-	if (trace_fop_names[GF_FOP_ENTRYLK].enabled) {  
-		gf_log (this->name, GF_LOG_NORMAL,
-			"%"PRId64": op_ret=%d, op_errno=%d",
-			frame->root->unique, op_ret, op_errno);
-	}
-
-	STACK_UNWIND (frame, op_ret, op_errno);
-	return 0;
-}
-
-
-int32_t 
-trace_xattrop_cbk (call_frame_t *frame,
-		   void *cookie,
-		   xlator_t *this,
-		   int32_t op_ret,
-		   int32_t op_errno,
-		   dict_t *dict)
-{
-	ERR_EINVAL_NORETURN (!this || !dict);
-
-	if (trace_fop_names[GF_FOP_XATTROP].enabled) {
-		gf_log (this->name, GF_LOG_NORMAL, 
-			"%"PRId64": (op_ret=%d, op_errno=%d)",
-			frame->root->unique, op_ret, op_errno);
-	}
+/*
+ * XATTROP entry point: when conf->log_file or conf->log_history is set and
+ * GF_FOP_XATTROP is enabled, log the request; always wind to the first child.
+ */
+int
+trace_xattrop(call_frame_t *frame, xlator_t *this, loc_t *loc,
+              gf_xattrop_flags_t flags, dict_t *dict, dict_t *xdata)
+{
+    trace_conf_t *conf = this->private;
+
+    /* Only build the record when a log flag is set and the fop is enabled. */
+    if ((conf->log_file || conf->log_history) &&
+        trace_fop_names[GF_FOP_XATTROP].enabled) {
+        char string[4096] = {0};
+        snprintf(string, sizeof(string),
+                 "%" PRId64 ": gfid=%s (path=%s flags=%d)", frame->root->unique,
+                 uuid_utoa(loc->inode->gfid), loc->path, flags);
+
+        /* Presumably read back by trace_xattrop_cbk -- confirm there. */
+        frame->local = loc->inode->gfid;
+
+        LOG_ELEMENT(conf, string);
+    }
 
-	STACK_UNWIND (frame, op_ret, op_errno, dict);
-	return 0;
+    STACK_WIND(frame, trace_xattrop_cbk, FIRST_CHILD(this),
+               FIRST_CHILD(this)->fops->xattrop, loc, flags, dict, xdata);
+
+    return 0;
 }
 
-int32_t 
-trace_fxattrop_cbk (call_frame_t *frame,
-		    void *cookie,
-		    xlator_t *this,
-		    int32_t op_ret,
-		    int32_t op_errno,
-		    dict_t *dict)
+/*
+ * FXATTROP entry point: when conf->log_file or conf->log_history is set and
+ * GF_FOP_FXATTROP is enabled, log the request; always wind to the first child.
+ */
+int
+trace_fxattrop(call_frame_t *frame, xlator_t *this, fd_t *fd,
+               gf_xattrop_flags_t flags, dict_t *dict, dict_t *xdata)
 {
-	ERR_EINVAL_NORETURN (!this || !dict);
+    trace_conf_t *conf = this->private;
+
+    /* Only build the record when a log flag is set and the fop is enabled. */
+    if ((conf->log_file || conf->log_history) &&
+        trace_fop_names[GF_FOP_FXATTROP].enabled) {
+        char string[4096] = {0};
+        snprintf(string, sizeof(string), "%" PRId64 ": gfid=%s fd=%p, flags=%d",
+                 frame->root->unique, uuid_utoa(fd->inode->gfid), fd, flags);
+
+        /* Presumably read back by trace_fxattrop_cbk -- confirm there. */
+        frame->local = fd->inode->gfid;
 
-	if (trace_fop_names[GF_FOP_FXATTROP].enabled) {
-		gf_log (this->name, GF_LOG_NORMAL, 
-			"%"PRId64": (op_ret=%d, op_errno=%d)",
-			frame->root->unique, op_ret, op_errno);
-	}
+        LOG_ELEMENT(conf, string);
+    }
 
-	STACK_UNWIND (frame, op_ret, op_errno, dict);
-	return 0;
+    STACK_WIND(frame, trace_fxattrop_cbk, FIRST_CHILD(this),
+               FIRST_CHILD(this)->fops->fxattrop, fd, flags, dict, xdata);
+
+    return 0;
 }
 
-int32_t 
-trace_inodelk_cbk (call_frame_t *frame,
-		   void *cookie,
-		   xlator_t *this,
-		   int32_t op_ret,
-		   int32_t op_errno)
+/*
+ * LOOKUP entry point: when conf->log_file or conf->log_history is set and
+ * GF_FOP_LOOKUP is enabled, log gfid and path; always wind to the first child.
+ */
+int
+trace_lookup(call_frame_t *frame, xlator_t *this, loc_t *loc, dict_t *xdata)
 {
-	ERR_EINVAL_NORETURN (!this );
+    trace_conf_t *conf = this->private;
+
+    /* Only build the record when a log flag is set and the fop is enabled. */
+    if ((conf->log_file || conf->log_history) &&
+        trace_fop_names[GF_FOP_LOOKUP].enabled) {
+        char string[4096] = {0};
+        /* TODO: print all the keys mentioned in xattr_req */
+        snprintf(string, sizeof(string), "%" PRId64 ": gfid=%s path=%s",
+                 frame->root->unique, uuid_utoa(loc->inode->gfid), loc->path);
+
+        /* Presumably read back by trace_lookup_cbk -- confirm there. */
+        frame->local = loc->inode->gfid;
+
+        LOG_ELEMENT(conf, string);
+    }
 
-	if (trace_fop_names[GF_FOP_INODELK].enabled) {  
-		gf_log (this->name, GF_LOG_NORMAL,
-			"%"PRId64": op_ret=%d, op_errno=%d",
-			frame->root->unique, op_ret, op_errno);
-	}
+    STACK_WIND(frame, trace_lookup_cbk, FIRST_CHILD(this),
+               FIRST_CHILD(this)->fops->lookup, loc, xdata);
 
-	STACK_UNWIND (frame, op_ret, op_errno);
-	return 0;
+    return 0;
 }
 
+/*
+ * STAT entry point: when conf->log_file or conf->log_history is set and
+ * GF_FOP_STAT is enabled, log gfid and path; always wind to the first child.
+ */
+int
+trace_stat(call_frame_t *frame, xlator_t *this, loc_t *loc, dict_t *xdata)
+{
+    trace_conf_t *conf = this->private;
 
-int32_t
-trace_entrylk (call_frame_t *frame, xlator_t *this,
-	       loc_t *loc, const char *basename,
-	       entrylk_cmd cmd, entrylk_type type)
+    /* Only build the record when a log flag is set and the fop is enabled. */
+    if ((conf->log_file || conf->log_history) &&
+        trace_fop_names[GF_FOP_STAT].enabled) {
+        char string[4096] = {0};
+        snprintf(string, sizeof(string), "%" PRId64 ": gfid=%s path=%s",
+                 frame->root->unique, uuid_utoa(loc->inode->gfid), loc->path);
+
+        /* Presumably read back by trace_stat_cbk -- confirm there. */
+        frame->local = loc->inode->gfid;
+
+        LOG_ELEMENT(conf, string);
+    }
+
+    STACK_WIND(frame, trace_stat_cbk, FIRST_CHILD(this),
+               FIRST_CHILD(this)->fops->stat, loc, xdata);
+
+    return 0;
+}
+
+/*
+ * READLINK entry point: log gfid, path and size when tracing applies, then
+ * wind to the first child.  NOTE(review): the record ends in an unmatched ")".
+ */
+int
+trace_readlink(call_frame_t *frame, xlator_t *this, loc_t *loc, size_t size,
+               dict_t *xdata)
+{
+    trace_conf_t *conf = this->private;
+
+    /* Only build the record when a log flag is set and the fop is enabled. */
+    if ((conf->log_file || conf->log_history) &&
+        trace_fop_names[GF_FOP_READLINK].enabled) {
+        char string[4096] = {0};
+        snprintf(string, sizeof(string),
+                 "%" PRId64
+                 ": gfid=%s path=%s, "
+                 "size=%" GF_PRI_SIZET ")",
+                 frame->root->unique, uuid_utoa(loc->inode->gfid), loc->path,
+                 size);
+
+        /* Presumably read back by trace_readlink_cbk -- confirm there. */
+        frame->local = loc->inode->gfid;
+
+        LOG_ELEMENT(conf, string);
+    }
+
+    STACK_WIND(frame, trace_readlink_cbk, FIRST_CHILD(this),
+               FIRST_CHILD(this)->fops->readlink, loc, size, xdata);
+
+    return 0;
+}
+
+/*
+ * MKNOD entry point: log gfid, path, mode, umask and dev when tracing applies,
+ * then wind to the first child.  NOTE(review): record ends in an unmatched ")".
+ */
+int
+trace_mknod(call_frame_t *frame, xlator_t *this, loc_t *loc, mode_t mode,
+            dev_t dev, mode_t umask, dict_t *xdata)
 {
-	ERR_EINVAL_NORETURN (!this || !loc || !basename);
+    trace_conf_t *conf = this->private;
 
-	if (trace_fop_names[GF_FOP_ENTRYLK].enabled) {  
-		gf_log (this->name, GF_LOG_NORMAL, 
-			"%"PRId64": (loc= {path=%s, ino=%"PRIu64"} basename=%s, cmd=%s, type=%s)",
-			frame->root->unique, loc->path, loc->inode->ino, basename, 
-			((cmd == ENTRYLK_LOCK) ? "ENTRYLK_LOCK" : "ENTRYLK_UNLOCK"), 
-			((type == ENTRYLK_RDLCK) ? "ENTRYLK_RDLCK" : "ENTRYLK_WRLCK"));
-	}
+    int tracing = (conf->log_file || conf->log_history) &&
+                  trace_fop_names[GF_FOP_MKNOD].enabled;
 
-	STACK_WIND (frame, 
-		    trace_entrylk_cbk,
-		    FIRST_CHILD (this),
-		    FIRST_CHILD (this)->fops->entrylk,
-		    loc, basename, cmd, type);
-	return 0;
+    if (tracing) {
+        char string[4096] = {0};
+        snprintf(string, sizeof(string),
+                 "%" PRId64
+                 ": gfid=%s path=%s mode=%d "
+                 "umask=0%o, dev=%" GF_PRI_DEV ")",
+                 frame->root->unique, uuid_utoa(loc->inode->gfid), loc->path,
+                 mode, umask, dev);
+
+        LOG_ELEMENT(conf, string);
+    }
+
+    STACK_WIND(frame, trace_mknod_cbk, FIRST_CHILD(this),
+               FIRST_CHILD(this)->fops->mknod, loc, mode, dev, umask, xdata);
+
+    return 0;
 }
 
-int32_t
-trace_inodelk (call_frame_t *frame,
-	       xlator_t *this,
-	       loc_t *loc, int32_t cmd, struct flock *flock)
+/*
+ * MKDIR entry point: when conf->log_file or conf->log_history is set and
+ * GF_FOP_MKDIR is enabled, log the request; always wind to the first child.
+ */
+int
+trace_mkdir(call_frame_t *frame, xlator_t *this, loc_t *loc, mode_t mode,
+            mode_t umask, dict_t *xdata)
+{
+    trace_conf_t *conf = this->private;
+
+    /* Only build the record when a log flag is set and the fop is enabled. */
+    if ((conf->log_file || conf->log_history) &&
+        trace_fop_names[GF_FOP_MKDIR].enabled) {
+        char string[4096] = {0};
+        snprintf(string, sizeof(string),
+                 "%" PRId64
+                 ": gfid=%s path=%s mode=%d"
+                 " umask=0%o",
+                 frame->root->unique, uuid_utoa(loc->inode->gfid), loc->path,
+                 mode, umask);
+
+        LOG_ELEMENT(conf, string);
+    }
+
+    STACK_WIND(frame, trace_mkdir_cbk, FIRST_CHILD(this),
+               FIRST_CHILD(this)->fops->mkdir, loc, mode, umask, xdata);
+
+    return 0;
+}
+
+/*
+ * UNLINK entry point: optionally log the request, then wind to first child.
+ */
+int
+trace_unlink(call_frame_t *frame, xlator_t *this, loc_t *loc, int xflag,
+             dict_t *xdata)
+{
+    trace_conf_t *conf = this->private;
+
+    /* Only build the record when a log flag is set and the fop is enabled. */
+    if ((conf->log_file || conf->log_history) &&
+        trace_fop_names[GF_FOP_UNLINK].enabled) {
+        char string[4096] = {0};
+        snprintf(string, sizeof(string), "%" PRId64 ": gfid=%s path=%s flag=%d",
+                 frame->root->unique, uuid_utoa(loc->inode->gfid), loc->path,
+                 xflag);
+
+        /* Presumably read back by trace_unlink_cbk -- confirm there. */
+        frame->local = loc->inode->gfid;
+
+        LOG_ELEMENT(conf, string);
+    }
+
+    STACK_WIND(frame, trace_unlink_cbk, FIRST_CHILD(this),
+               FIRST_CHILD(this)->fops->unlink, loc, xflag, xdata);
+    return 0;
+}
+
+/*
+ * RMDIR entry point: when conf->log_file or conf->log_history is set and
+ * GF_FOP_RMDIR is enabled, log the request; always wind to the first child.
+ */
+int
+trace_rmdir(call_frame_t *frame, xlator_t *this, loc_t *loc, int flags,
+            dict_t *xdata)
+{
+    trace_conf_t *conf = this->private;
+
+    /* Only build the record when a log flag is set and the fop is enabled. */
+    if ((conf->log_file || conf->log_history) &&
+        trace_fop_names[GF_FOP_RMDIR].enabled) {
+        char string[4096] = {0};
+        snprintf(string, sizeof(string),
+                 "%" PRId64 ": gfid=%s path=%s flags=%d", frame->root->unique,
+                 uuid_utoa(loc->inode->gfid), loc->path, flags);
+
+        /* Presumably read back by trace_rmdir_cbk -- confirm there. */
+        frame->local = loc->inode->gfid;
+
+        LOG_ELEMENT(conf, string);
+    }
+
+    STACK_WIND(frame, trace_rmdir_cbk, FIRST_CHILD(this),
+               FIRST_CHILD(this)->fops->rmdir, loc, flags, xdata);
+
+    return 0;
+}
+
+/*
+ * SYMLINK entry point: when conf->log_file or conf->log_history is set and
+ * GF_FOP_SYMLINK is enabled, log the request; always wind to the first child.
+ */
+int
+trace_symlink(call_frame_t *frame, xlator_t *this, const char *linkpath,
+              loc_t *loc, mode_t umask, dict_t *xdata)
+{
+    trace_conf_t *conf = this->private;
+
+    /* Only build the record when a log flag is set and the fop is enabled. */
+    if ((conf->log_file || conf->log_history) &&
+        trace_fop_names[GF_FOP_SYMLINK].enabled) {
+        char string[4096] = {0};
+
+        snprintf(string, sizeof(string),
+                 "%" PRId64
+                 ": gfid=%s linkpath=%s, path=%s"
+                 " umask=0%o",
+                 frame->root->unique, uuid_utoa(loc->inode->gfid), linkpath,
+                 loc->path, umask);
+
+        LOG_ELEMENT(conf, string);
+    }
+
+    STACK_WIND(frame, trace_symlink_cbk, FIRST_CHILD(this),
+               FIRST_CHILD(this)->fops->symlink, linkpath, loc, umask, xdata);
+
+    return 0;
+}
+
+/*
+ * RENAME entry point: when conf->log_file or conf->log_history is set and
+ * GF_FOP_RENAME is enabled, log the gfid and path of both the old and the
+ * new location; the call is always wound to the first child afterwards.
+ */
+int
+trace_rename(call_frame_t *frame, xlator_t *this, loc_t *oldloc, loc_t *newloc,
+             dict_t *xdata)
+{
+    char oldgfid[50] = {0};
+    char newgfid[50] = {0};
+    trace_conf_t *conf = this->private;
+
+    /* Only build the record when a log flag is set and the fop is enabled. */
+    if ((conf->log_file || conf->log_history) &&
+        trace_fop_names[GF_FOP_RENAME].enabled) {
+        char string[4096] = {0};
+
+        uuid_utoa_r(oldloc->inode->gfid, oldgfid);
+
+        /* newloc may carry no inode; in that case the new gfid is
+         * printed as the literal "0". */
+        if (!newloc->inode)
+            strcpy(newgfid, "0");
+        else
+            uuid_utoa_r(newloc->inode->gfid, newgfid);
+
+        snprintf(string, sizeof(string),
+                 "%" PRId64
+                 ": oldgfid=%s oldpath=%s --> "
+                 "newgfid=%s newpath=%s",
+                 frame->root->unique, oldgfid, oldloc->path, newgfid,
+                 newloc->path);
+
+        /* Presumably read back by trace_rename_cbk -- confirm there. */
+        frame->local = oldloc->inode->gfid;
+
+        LOG_ELEMENT(conf, string);
+    }
+
+    STACK_WIND(frame, trace_rename_cbk, FIRST_CHILD(this),
+               FIRST_CHILD(this)->fops->rename, oldloc, newloc, xdata);
+
+    return 0;
+}
+
+/*
+ * LINK entry point: when conf->log_file or conf->log_history is set and
+ * GF_FOP_LINK is enabled, log old/new gfid and path; always wind to the child.
+ */
+int
+trace_link(call_frame_t *frame, xlator_t *this, loc_t *oldloc, loc_t *newloc,
+           dict_t *xdata)
+{
+    char oldgfid[50] = {0};
+    char newgfid[50] = {0};
+    trace_conf_t *conf = this->private;
+
+    /* Only build the record when a log flag is set and the fop is enabled. */
+    if ((conf->log_file || conf->log_history) &&
+        trace_fop_names[GF_FOP_LINK].enabled) {
+        char string[4096] = {0};
+
+        uuid_utoa_r(oldloc->inode->gfid, oldgfid);
+
+        /* newloc may carry no inode; in that case the new gfid is
+         * printed as the literal "0". */
+        if (!newloc->inode)
+            strcpy(newgfid, "0");
+        else
+            uuid_utoa_r(newloc->inode->gfid, newgfid);
+
+        snprintf(string, sizeof(string),
+                 "%" PRId64
+                 ": oldgfid=%s oldpath=%s --> "
+                 "newgfid=%s newpath=%s",
+                 frame->root->unique, oldgfid, oldloc->path, newgfid,
+                 newloc->path);
+
+        /* Presumably read back by trace_link_cbk -- confirm there. */
+        frame->local = oldloc->inode->gfid;
+
+        LOG_ELEMENT(conf, string);
+    }
+
+    STACK_WIND(frame, trace_link_cbk, FIRST_CHILD(this),
+               FIRST_CHILD(this)->fops->link, oldloc, newloc, xdata);
+
+    return 0;
+}
+
+/*
+ * SETATTR entry point: emits one record per attribute group selected in
+ * 'valid' (mode, uid/gid, atime/mtime), then winds to the first child.
+ */
+int
+trace_setattr(call_frame_t *frame, xlator_t *this, loc_t *loc,
+              struct iatt *stbuf, int32_t valid, dict_t *xdata)
+{
+    char actime_str[GF_TIMESTR_SIZE] = {0};
+    char modtime_str[GF_TIMESTR_SIZE] = {0};
+    trace_conf_t *conf = this->private;
+
+    /* Only build records when a log flag is set and the fop is enabled. */
+    if ((conf->log_file || conf->log_history) &&
+        trace_fop_names[GF_FOP_SETATTR].enabled) {
+        char string[4096] = {0};
+
+        /* Mode change: printed in octal, without the old stray ")". */
+        if (valid & GF_SET_ATTR_MODE) {
+            snprintf(
+                string, sizeof(string), "%" PRId64 ": gfid=%s path=%s mode=%o",
+                frame->root->unique, uuid_utoa(loc->inode->gfid), loc->path,
+                st_mode_from_ia(stbuf->ia_prot, stbuf->ia_type));
+
+            LOG_ELEMENT(conf, string);
+            memset(string, 0, sizeof(string));
+        }
+
+        /* Ownership change: ids are decimal (%u); octal misreports them. */
+        if (valid & (GF_SET_ATTR_UID | GF_SET_ATTR_GID)) {
+            snprintf(string, sizeof(string),
+                     "%" PRId64
+                     ": gfid=%s path=%s uid=%u,"
+                     " gid=%u",
+                     frame->root->unique, uuid_utoa(loc->inode->gfid),
+                     loc->path, stbuf->ia_uid, stbuf->ia_gid);
+
+            LOG_ELEMENT(conf, string);
+            memset(string, 0, sizeof(string));
+        }
+
+        /* Timestamp change: render both times as text before logging. */
+        if (valid & (GF_SET_ATTR_ATIME | GF_SET_ATTR_MTIME)) {
+            gf_time_fmt(actime_str, sizeof actime_str, stbuf->ia_atime,
+                        gf_timefmt_bdT);
+            gf_time_fmt(modtime_str, sizeof modtime_str, stbuf->ia_mtime,
+                        gf_timefmt_bdT);
+
+            snprintf(string, sizeof(string),
+                     "%" PRId64
+                     ": gfid=%s path=%s "
+                     "ia_atime=%s, ia_mtime=%s",
+                     frame->root->unique, uuid_utoa(loc->inode->gfid),
+                     loc->path, actime_str, modtime_str);
+
+            LOG_ELEMENT(conf, string);
+            memset(string, 0, sizeof(string));
+        }
+
+        /* Presumably read back by trace_setattr_cbk -- confirm there. */
+        frame->local = loc->inode->gfid;
+    }
+
+    STACK_WIND(frame, trace_setattr_cbk, FIRST_CHILD(this),
+               FIRST_CHILD(this)->fops->setattr, loc, stbuf, valid, xdata);
+
+    return 0;
+}
+
+/*
+ * FSETATTR entry point: emits one record per attribute group selected in
+ * 'valid' (mode, uid/gid, atime/mtime), then winds to the first child.
+ */
+int
+trace_fsetattr(call_frame_t *frame, xlator_t *this, fd_t *fd,
+               struct iatt *stbuf, int32_t valid, dict_t *xdata)
+{
+    char actime_str[GF_TIMESTR_SIZE] = {0};
+    char modtime_str[GF_TIMESTR_SIZE] = {0};
+    trace_conf_t *conf = this->private;
+
+    /* Only build records when a log flag is set and the fop is enabled. */
+    if ((conf->log_file || conf->log_history) &&
+        trace_fop_names[GF_FOP_FSETATTR].enabled) {
+        char string[4096] = {0};
+
+        /* Mode change: printed in octal. */
+        if (valid & GF_SET_ATTR_MODE) {
+            snprintf(string, sizeof(string),
+                     "%" PRId64 ": gfid=%s fd=%p, mode=%o", frame->root->unique,
+                     uuid_utoa(fd->inode->gfid), fd,
+                     st_mode_from_ia(stbuf->ia_prot, stbuf->ia_type));
+
+            LOG_ELEMENT(conf, string);
+            memset(string, 0, sizeof(string));
+        }
+
+        /* Ownership change: ids are decimal (%u); octal misreports them. */
+        if (valid & (GF_SET_ATTR_UID | GF_SET_ATTR_GID)) {
+            snprintf(string, sizeof(string),
+                     "%" PRId64
+                     ": gfid=%s fd=%p, uid=%u, "
+                     "gid=%u",
+                     frame->root->unique, uuid_utoa(fd->inode->gfid), fd,
+                     stbuf->ia_uid, stbuf->ia_gid);
+
+            LOG_ELEMENT(conf, string);
+            memset(string, 0, sizeof(string));
+        }
+
+        /* Timestamp change: render both times as text before logging. */
+        if (valid & (GF_SET_ATTR_ATIME | GF_SET_ATTR_MTIME)) {
+            gf_time_fmt(actime_str, sizeof actime_str, stbuf->ia_atime,
+                        gf_timefmt_bdT);
+            gf_time_fmt(modtime_str, sizeof modtime_str, stbuf->ia_mtime,
+                        gf_timefmt_bdT);
+
+            snprintf(string, sizeof(string),
+                     "%" PRId64
+                     ": gfid=%s fd=%p "
+                     "ia_atime=%s, ia_mtime=%s",
+                     frame->root->unique, uuid_utoa(fd->inode->gfid), fd,
+                     actime_str, modtime_str);
+
+            LOG_ELEMENT(conf, string);
+            memset(string, 0, sizeof(string));
+        }
+
+        /* Presumably read back by trace_fsetattr_cbk -- confirm there. */
+        frame->local = fd->inode->gfid;
+    }
+
+    STACK_WIND(frame, trace_fsetattr_cbk, FIRST_CHILD(this),
+               FIRST_CHILD(this)->fops->fsetattr, fd, stbuf, valid, xdata);
+
+    return 0;
+}
+
+/*
+ * SEEK callback: log the result (frame->local is printed as the gfid), unwind.
+ */
+static int
+trace_seek_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+               int32_t op_ret, int32_t op_errno, off_t offset, dict_t *xdata)
+{
+    trace_conf_t *conf = this->private;
+
+    if ((conf->log_file || conf->log_history) &&
+        trace_fop_names[GF_FOP_SEEK].enabled) {
+        char string[4096] = {0};
+        snprintf(string, sizeof(string),
+                 "%" PRId64
+                 ": gfid=%s op_ret=%d op_errno=%d, "
+                 "offset=%" PRId64 "",
+                 frame->root->unique, uuid_utoa(frame->local), op_ret, op_errno,
+                 offset);
+        LOG_ELEMENT(conf, string);
+    }
+
+    TRACE_STACK_UNWIND(seek, frame, op_ret, op_errno, offset, xdata);
+    return 0;
+}
+
+/*
+ * SEEK entry point: optionally log the request, then wind to the first child.
+ */
+static int
+trace_seek(call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset,
+           gf_seek_what_t what, dict_t *xdata)
+{
+    trace_conf_t *conf = this->private;
+
+    if ((conf->log_file || conf->log_history) &&
+        trace_fop_names[GF_FOP_SEEK].enabled) {
+        char string[4096] = {0};
+        snprintf(string, sizeof(string),
+                 "%" PRId64
+                 ": gfid=%s fd=%p "
+                 "offset=%" PRId64 " what=%d",
+                 frame->root->unique, uuid_utoa(fd->inode->gfid), fd, offset,
+                 what);
+
+        /* trace_seek_cbk prints this gfid via uuid_utoa(frame->local). */
+        frame->local = fd->inode->gfid;
+        LOG_ELEMENT(conf, string);
+    }
+
+    STACK_WIND(frame, trace_seek_cbk, FIRST_CHILD(this),
+               FIRST_CHILD(this)->fops->seek, fd, offset, what, xdata);
+    return 0;
+}
+
+/*
+ * TRUNCATE entry point: when conf->log_file or conf->log_history is set and
+ * GF_FOP_TRUNCATE is enabled, log the request; always wind to the first child.
+ */
+int
+trace_truncate(call_frame_t *frame, xlator_t *this, loc_t *loc, off_t offset,
+               dict_t *xdata)
+{
+    trace_conf_t *conf = this->private;
+
+    /* Only build the record when a log flag is set and the fop is enabled. */
+    if ((conf->log_file || conf->log_history) &&
+        trace_fop_names[GF_FOP_TRUNCATE].enabled) {
+        char string[4096] = {0};
+        snprintf(string, sizeof(string),
+                 "%" PRId64
+                 ": gfid=%s path=%s, "
+                 "offset=%" PRId64 "",
+                 frame->root->unique, uuid_utoa(loc->inode->gfid), loc->path,
+                 offset);
+
+        /* Presumably read back by trace_truncate_cbk -- confirm there. */
+        frame->local = loc->inode->gfid;
+
+        LOG_ELEMENT(conf, string);
+    }
+
+    STACK_WIND(frame, trace_truncate_cbk, FIRST_CHILD(this),
+               FIRST_CHILD(this)->fops->truncate, loc, offset, xdata);
+
+    return 0;
+}
+
+/*
+ * OPEN entry point: when conf->log_file or conf->log_history is set and
+ * GF_FOP_OPEN is enabled, log the request; always wind to the first child.
+ */
+int
+trace_open(call_frame_t *frame, xlator_t *this, loc_t *loc, int32_t flags,
+           fd_t *fd, dict_t *xdata)
+{
+    trace_conf_t *conf = this->private;
+
+    /* Only build the record when a log flag is set and the fop is enabled. */
+    if ((conf->log_file || conf->log_history) &&
+        trace_fop_names[GF_FOP_OPEN].enabled) {
+        char string[4096] = {0};
+        snprintf(string, sizeof(string),
+                 "%" PRId64 ": gfid=%s path=%s flags=%d fd=%p",
+                 frame->root->unique, uuid_utoa(loc->inode->gfid), loc->path,
+                 flags, fd);
+
+        /* Presumably read back by trace_open_cbk -- confirm there. */
+        frame->local = loc->inode->gfid;
+
+        LOG_ELEMENT(conf, string);
+    }
+
+    STACK_WIND(frame, trace_open_cbk, FIRST_CHILD(this),
+               FIRST_CHILD(this)->fops->open, loc, flags, fd, xdata);
+    return 0;
+}
+
+/*
+ * CREATE entry point: when conf->log_file or conf->log_history is set and
+ * GF_FOP_CREATE is enabled, log the request; always wind to the first child.
+ */
+int
+trace_create(call_frame_t *frame, xlator_t *this, loc_t *loc, int32_t flags,
+             mode_t mode, mode_t umask, fd_t *fd, dict_t *xdata)
+{
+    trace_conf_t *conf = this->private;
+
+    /* Only build the record when a log flag is set and the fop is enabled. */
+    if ((conf->log_file || conf->log_history) &&
+        trace_fop_names[GF_FOP_CREATE].enabled) {
+        char string[4096] = {0};
+        snprintf(string, sizeof(string),
+                 "%" PRId64
+                 ": gfid=%s path=%s, fd=%p, "
+                 "flags=0%o mode=0%o umask=0%o",
+                 frame->root->unique, uuid_utoa(loc->inode->gfid), loc->path,
+                 fd, flags, mode, umask);
+
+        LOG_ELEMENT(conf, string);
+    }
+
+    STACK_WIND(frame, trace_create_cbk, FIRST_CHILD(this),
+               FIRST_CHILD(this)->fops->create, loc, flags, mode, umask, fd,
+               xdata);
+
+    return 0;
+}
+
+/*
+ * READ entry point: when conf->log_file or conf->log_history is set and
+ * GF_FOP_READ is enabled, log the request; always wind to the first child.
+ */
+int
+trace_readv(call_frame_t *frame, xlator_t *this, fd_t *fd, size_t size,
+            off_t offset, uint32_t flags, dict_t *xdata)
+{
+    trace_conf_t *conf = this->private;
+
+    /* Only build the record when a log flag is set and the fop is enabled. */
+    if ((conf->log_file || conf->log_history) &&
+        trace_fop_names[GF_FOP_READ].enabled) {
+        char string[4096] = {0};
+        snprintf(string, sizeof(string),
+                 "%" PRId64 ": gfid=%s fd=%p, size=%" GF_PRI_SIZET
+                 ", offset=%" PRId64 " flags=0%x",
+                 frame->root->unique, uuid_utoa(fd->inode->gfid), fd, size,
+                 offset, flags);
+
+        /* Presumably read back by trace_readv_cbk -- confirm there. */
+        frame->local = fd->inode->gfid;
+
+        LOG_ELEMENT(conf, string);
+    }
+
+    STACK_WIND(frame, trace_readv_cbk, FIRST_CHILD(this),
+               FIRST_CHILD(this)->fops->readv, fd, size, offset, flags, xdata);
+    return 0;
+}
+
+/*
+ * WRITE entry point: when conf->log_file or conf->log_history is set and
+ * GF_FOP_WRITE is enabled, log the request; always wind to the first child.
+ */
+int
+trace_writev(call_frame_t *frame, xlator_t *this, fd_t *fd,
+             struct iovec *vector, int32_t count, off_t offset, uint32_t flags,
+             struct iobref *iobref, dict_t *xdata)
+{
+    trace_conf_t *conf = this->private;
+    int i = 0;
+    size_t total_size = 0;
+
+    /* Only build the record when a log flag is set and the fop is enabled. */
+    if ((conf->log_file || conf->log_history) &&
+        trace_fop_names[GF_FOP_WRITE].enabled) {
+        char string[4096] = {0};
+        /* write_size in the record is the sum of all iovec lengths. */
+        for (i = 0; i < count; i++)
+            total_size += vector[i].iov_len;
+
+        snprintf(string, sizeof(string),
+                 "%" PRId64
+                 ": gfid=%s fd=%p, count=%d, "
+                 " offset=%" PRId64 " flags=0%x write_size=%zu",
+                 frame->root->unique, uuid_utoa(fd->inode->gfid), fd, count,
+                 offset, flags, total_size);
+
+        frame->local = fd->inode->gfid;
+
+        LOG_ELEMENT(conf, string);
+    }
+
+    STACK_WIND(frame, trace_writev_cbk, FIRST_CHILD(this),
+               FIRST_CHILD(this)->fops->writev, fd, vector, count, offset,
+               flags, iobref, xdata);
+    return 0;
+}
+
+/*
+ * STATFS entry point: when conf->log_file or conf->log_history is set and
+ * GF_FOP_STATFS is enabled, log gfid and path; always wind to the first child.
+ */
+int
+trace_statfs(call_frame_t *frame, xlator_t *this, loc_t *loc, dict_t *xdata)
+{
+    trace_conf_t *conf = this->private;
+
+    /* Only build the record when a log flag is set and the fop is enabled. */
+    if ((conf->log_file || conf->log_history) &&
+        trace_fop_names[GF_FOP_STATFS].enabled) {
+        /* loc may have no inode here; the gfid is then printed as "0". */
+        const char *gfid = loc->inode ? uuid_utoa(loc->inode->gfid) : "0";
+        char string[4096] = {0};
+        snprintf(string, sizeof(string), "%" PRId64 ": gfid=%s path=%s",
+                 frame->root->unique, gfid, loc->path);
+
+        LOG_ELEMENT(conf, string);
+    }
+
+    STACK_WIND(frame, trace_statfs_cbk, FIRST_CHILD(this),
+               FIRST_CHILD(this)->fops->statfs, loc, xdata);
+    return 0;
+}
+
+/*
+ * FLUSH entry point: when conf->log_file or conf->log_history is set and
+ * GF_FOP_FLUSH is enabled, log gfid and fd; always wind to the first child.
+ */
+int
+trace_flush(call_frame_t *frame, xlator_t *this, fd_t *fd, dict_t *xdata)
+{
+    trace_conf_t *conf = this->private;
+
+    /* Only build the record when a log flag is set and the fop is enabled. */
+    if ((conf->log_file || conf->log_history) &&
+        trace_fop_names[GF_FOP_FLUSH].enabled) {
+        char string[4096] = {0};
+        snprintf(string, sizeof(string), "%" PRId64 ": gfid=%s fd=%p",
+                 frame->root->unique, uuid_utoa(fd->inode->gfid), fd);
+
+        /* Presumably read back by trace_flush_cbk -- confirm there. */
+        frame->local = fd->inode->gfid;
+
+        LOG_ELEMENT(conf, string);
+    }
+
+    STACK_WIND(frame, trace_flush_cbk, FIRST_CHILD(this),
+               FIRST_CHILD(this)->fops->flush, fd, xdata);
+    return 0;
+}
+
+/*
+ * FSYNC entry point: when conf->log_file or conf->log_history is set and
+ * GF_FOP_FSYNC is enabled, log the request; always wind to the first child.
+ */
+int
+trace_fsync(call_frame_t *frame, xlator_t *this, fd_t *fd, int32_t flags,
+            dict_t *xdata)
+{
+    trace_conf_t *conf = this->private;
+
+    /* Only build the record when a log flag is set and the fop is enabled. */
+    if ((conf->log_file || conf->log_history) &&
+        trace_fop_names[GF_FOP_FSYNC].enabled) {
+        char string[4096] = {0};
+        snprintf(string, sizeof(string), "%" PRId64 ": gfid=%s flags=%d fd=%p",
+                 frame->root->unique, uuid_utoa(fd->inode->gfid), flags, fd);
+
+        /* Presumably read back by trace_fsync_cbk -- confirm there. */
+        frame->local = fd->inode->gfid;
+
+        LOG_ELEMENT(conf, string);
+    }
+
+    STACK_WIND(frame, trace_fsync_cbk, FIRST_CHILD(this),
+               FIRST_CHILD(this)->fops->fsync, fd, flags, xdata);
+    return 0;
+}
+
+/*
+ * SETXATTR entry point: when conf->log_file or conf->log_history is set and
+ * GF_FOP_SETXATTR is enabled, log the request; always wind to the first child.
+ */
+int
+trace_setxattr(call_frame_t *frame, xlator_t *this, loc_t *loc, dict_t *dict,
+               int32_t flags, dict_t *xdata)
 {
-	ERR_EINVAL_NORETURN (!this || !loc);
+    trace_conf_t *conf = this->private;
+
+    /* Only build the record when a log flag is set and the fop is enabled. */
+    if ((conf->log_file || conf->log_history) &&
+        trace_fop_names[GF_FOP_SETXATTR].enabled) {
+        char string[4096] = {0};
+        snprintf(string, sizeof(string),
+                 "%" PRId64 ": gfid=%s path=%s flags=%d", frame->root->unique,
+                 uuid_utoa(loc->inode->gfid), loc->path, flags);
 
-	if (trace_fop_names[GF_FOP_INODELK].enabled) {  
-		gf_log (this->name, GF_LOG_NORMAL, 
-			"%"PRId64": (loc {path=%s, ino=%"PRIu64"}, cmd=%s)",
-			frame->root->unique, loc->path, loc->inode->ino, 
-			((cmd == F_SETLK)? "F_SETLK" : "unknown"));
-	}
+        /* Presumably read back by trace_setxattr_cbk -- confirm there. */
+        frame->local = loc->inode->gfid;
 
-	STACK_WIND (frame, 
-		    trace_inodelk_cbk,
-		    FIRST_CHILD (this),
-		    FIRST_CHILD (this)->fops->inodelk,
-		    loc, cmd, flock);
-	return 0;
+        LOG_ELEMENT(conf, string);
+    }
+
+    STACK_WIND(frame, trace_setxattr_cbk, FIRST_CHILD(this),
+               FIRST_CHILD(this)->fops->setxattr, loc, dict, flags, xdata);
+    return 0;
 }
 
+int
+trace_getxattr(call_frame_t *frame, xlator_t *this, loc_t *loc,
+               const char *name, dict_t *xdata)
+{
+    trace_conf_t *conf = NULL;
+    /* Log the getxattr request when tracing is on, then wind it onward. */
+    conf = this->private;
+    /* NOTE(review): loc->inode and name are used unchecked - confirm. */
+    if (!conf->log_file && !conf->log_history)
+        goto out; /* neither log_file nor log_history set: nothing to record */
+    if (trace_fop_names[GF_FOP_GETXATTR].enabled) { /* per-fop switch */
+        char string[4096] = {
+            0,
+        };
+        snprintf(string, sizeof(string), "%" PRId64 ": gfid=%s path=%s name=%s",
+                 frame->root->unique, uuid_utoa(loc->inode->gfid), loc->path,
+                 name);
 
-int32_t 
-trace_finodelk_cbk (call_frame_t *frame,
-		    void *cookie,
-		    xlator_t *this,
-		    int32_t op_ret,
-		    int32_t op_errno)
+        frame->local = loc->inode->gfid; /* presumably for the cbk - confirm */
+
+        LOG_ELEMENT(conf, string);
+    }
+
+out: /* reached on every path: the fop is wound whether or not it was logged */
+    STACK_WIND(frame, trace_getxattr_cbk, FIRST_CHILD(this),
+               FIRST_CHILD(this)->fops->getxattr, loc, name, xdata);
+    return 0;
+}
+
+int
+trace_removexattr(call_frame_t *frame, xlator_t *this, loc_t *loc,
+                  const char *name, dict_t *xdata)
 {
-	ERR_EINVAL_NORETURN (!this );
+    trace_conf_t *conf = NULL;
+    /* Log the removexattr request when tracing is on, then wind it onward. */
+    conf = this->private;
 
-	if (trace_fop_names[GF_FOP_FINODELK].enabled) {  
-		gf_log (this->name, GF_LOG_NORMAL,
-			"%"PRId64": op_ret=%d, op_errno=%d",
-			frame->root->unique, op_ret, op_errno);
-	}
+    if (!conf->log_file && !conf->log_history)
+        goto out; /* neither log_file nor log_history set: nothing to record */
+    if (trace_fop_names[GF_FOP_REMOVEXATTR].enabled) { /* per-fop switch */
+        char string[4096] = {
+            0,
+        };
+        snprintf(string, sizeof(string), "%" PRId64 ": gfid=%s path=%s name=%s",
+                 frame->root->unique, uuid_utoa(loc->inode->gfid), loc->path,
+                 name);
 
-	STACK_UNWIND (frame, op_ret, op_errno);
-	return 0;
+        frame->local = loc->inode->gfid; /* presumably for the cbk - confirm */
+        /* NOTE(review): loc->inode is used without a NULL check - confirm. */
+        LOG_ELEMENT(conf, string);
+    }
+
+out: /* reached on every path: the fop is wound whether or not it was logged */
+    STACK_WIND(frame, trace_removexattr_cbk, FIRST_CHILD(this),
+               FIRST_CHILD(this)->fops->removexattr, loc, name, xdata);
+
+    return 0;
+}
+
+int
+trace_opendir(call_frame_t *frame, xlator_t *this, loc_t *loc, fd_t *fd,
+              dict_t *xdata)
+{
+    trace_conf_t *conf = NULL;
+    /* Log the opendir request when tracing is on, then wind it onward. */
+    conf = this->private;
+
+    if (!conf->log_file && !conf->log_history)
+        goto out; /* neither log_file nor log_history set: nothing to record */
+    if (trace_fop_names[GF_FOP_OPENDIR].enabled) { /* per-fop switch */
+        char string[4096] = {
+            0,
+        };
+        snprintf(string, sizeof(string), "%" PRId64 ": gfid=%s path=%s fd=%p",
+                 frame->root->unique, uuid_utoa(loc->inode->gfid), loc->path,
+                 fd);
+
+        frame->local = loc->inode->gfid; /* presumably for the cbk - confirm */
+
+        LOG_ELEMENT(conf, string);
+    }
+
+out: /* reached on every path: the fop is wound whether or not it was logged */
+    STACK_WIND(frame, trace_opendir_cbk, FIRST_CHILD(this),
+               FIRST_CHILD(this)->fops->opendir, loc, fd, xdata);
+    return 0;
+}
+
+int
+trace_readdirp(call_frame_t *frame, xlator_t *this, fd_t *fd, size_t size,
+               off_t offset, dict_t *dict)
+{
+    trace_conf_t *conf = NULL;
+    /* Log the readdirp request when tracing is on, then wind it onward. */
+    conf = this->private;
+
+    if (!conf->log_file && !conf->log_history)
+        goto out; /* neither log_file nor log_history set: nothing to record */
+    if (trace_fop_names[GF_FOP_READDIRP].enabled) { /* per-fop switch */
+        char string[4096] = {
+            0,
+        };
+        snprintf(string, sizeof(string),
+                 "%" PRId64 ": gfid=%s fd=%p, size=%" GF_PRI_SIZET
+                 ", offset=%" PRId64 " dict=%p",
+                 frame->root->unique, uuid_utoa(fd->inode->gfid), fd, size,
+                 offset, dict); /* dict is logged by address only */
+
+        frame->local = fd->inode->gfid; /* presumably for the cbk - confirm */
+
+        LOG_ELEMENT(conf, string);
+    }
+
+out: /* reached on every path: the fop is wound whether or not it was logged */
+    STACK_WIND(frame, trace_readdirp_cbk, FIRST_CHILD(this),
+               FIRST_CHILD(this)->fops->readdirp, fd, size, offset, dict);
+
+    return 0;
+}
+
+int
+trace_readdir(call_frame_t *frame, xlator_t *this, fd_t *fd, size_t size,
+              off_t offset, dict_t *xdata)
+{
+    trace_conf_t *conf = NULL;
+    /* Log the readdir request when tracing is on, then wind it onward. */
+    conf = this->private;
+
+    if (!conf->log_file && !conf->log_history)
+        goto out; /* neither log_file nor log_history set: nothing to record */
+    if (trace_fop_names[GF_FOP_READDIR].enabled) { /* per-fop switch */
+        char string[4096] = {
+            0,
+        };
+        snprintf(string, sizeof(string),
+                 "%" PRId64 ": gfid=%s fd=%p, size=%" GF_PRI_SIZET
+                 ", offset=%" PRId64,
+                 frame->root->unique, uuid_utoa(fd->inode->gfid), fd, size,
+                 offset);
+
+        frame->local = fd->inode->gfid; /* presumably for the cbk - confirm */
+
+        LOG_ELEMENT(conf, string);
+    }
+
+out: /* reached on every path: the fop is wound whether or not it was logged */
+    STACK_WIND(frame, trace_readdir_cbk, FIRST_CHILD(this),
+               FIRST_CHILD(this)->fops->readdir, fd, size, offset, xdata);
+
+    return 0;
+}
+
+int
+trace_fsyncdir(call_frame_t *frame, xlator_t *this, fd_t *fd, int32_t datasync,
+               dict_t *xdata)
+{
+    trace_conf_t *conf = NULL;
+    /* Log the fsyncdir request when tracing is on, then wind it onward. */
+    conf = this->private;
+
+    if (!conf->log_file && !conf->log_history)
+        goto out; /* neither log_file nor log_history set: nothing to record */
+    if (trace_fop_names[GF_FOP_FSYNCDIR].enabled) { /* per-fop switch */
+        char string[4096] = {
+            0,
+        };
+        snprintf(string, sizeof(string),
+                 "%" PRId64 ": gfid=%s datasync=%d fd=%p", frame->root->unique,
+                 uuid_utoa(fd->inode->gfid), datasync, fd);
+
+        frame->local = fd->inode->gfid; /* presumably for the cbk - confirm */
+
+        LOG_ELEMENT(conf, string);
+    }
+
+out: /* reached on every path: the fop is wound whether or not it was logged */
+    STACK_WIND(frame, trace_fsyncdir_cbk, FIRST_CHILD(this),
+               FIRST_CHILD(this)->fops->fsyncdir, fd, datasync, xdata);
+    return 0;
+}
+
+int
+trace_access(call_frame_t *frame, xlator_t *this, loc_t *loc, int32_t mask,
+             dict_t *xdata)
+{
+    trace_conf_t *conf = NULL;
+    /* Log the access request when tracing is on, then wind it onward. */
+    conf = this->private;
+    /* NOTE(review): loc->inode is used without a NULL check - confirm. */
+    if (!conf->log_file && !conf->log_history)
+        goto out; /* neither log_file nor log_history set: nothing to record */
+    if (trace_fop_names[GF_FOP_ACCESS].enabled) { /* per-fop switch */
+        char string[4096] = {
+            0,
+        };
+        snprintf(string, sizeof(string),
+                 "%" PRId64 ": gfid=%s path=%s mask=0%o", frame->root->unique,
+                 uuid_utoa(loc->inode->gfid), loc->path, mask); /* mask: octal */
+
+        frame->local = loc->inode->gfid; /* presumably for the cbk - confirm */
+
+        LOG_ELEMENT(conf, string);
+    }
+
+out: /* reached on every path: the fop is wound whether or not it was logged */
+    STACK_WIND(frame, trace_access_cbk, FIRST_CHILD(this),
+               FIRST_CHILD(this)->fops->access, loc, mask, xdata);
+    return 0;
 }
 
 int32_t
-trace_finodelk (call_frame_t *frame,
-		xlator_t *this,
-		fd_t *fd, int32_t cmd, struct flock *flock)
+trace_rchecksum(call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset,
+                int32_t len, dict_t *xdata)
 {
-	ERR_EINVAL_NORETURN (!this || !fd);
+    trace_conf_t *conf = NULL;
+    /* Log the rchecksum request when tracing is on, then wind it onward. */
+    conf = this->private;
+
+    if (!conf->log_file && !conf->log_history)
+        goto out; /* neither log_file nor log_history set: nothing to record */
+    if (trace_fop_names[GF_FOP_RCHECKSUM].enabled) { /* per-fop switch */
+        char string[4096] = {
+            0,
+        };
+        snprintf(string, sizeof(string),
+                 "%" PRId64 ": gfid=%s offset=%" PRId64 " len=%d fd=%p",
+                 frame->root->unique, uuid_utoa(fd->inode->gfid), offset, len,
+                 fd); /* len is a signed int32_t, so it is printed with %d */
+
+        frame->local = fd->inode->gfid; /* presumably for the cbk - confirm */
+
+        LOG_ELEMENT(conf, string);
+    }
 
-	if (trace_fop_names[GF_FOP_FINODELK].enabled) {  
-		gf_log (this->name, GF_LOG_NORMAL, 
-			"%"PRId64": (fd=%p, cmd=%s)",
-			frame->root->unique, fd, 
-			((cmd == F_SETLK) ? "F_SETLK" : "unknown"));
-	}
+out: /* reached on every path: the fop is wound whether or not it was logged */
+    STACK_WIND(frame, trace_rchecksum_cbk, FIRST_CHILD(this),
+               FIRST_CHILD(this)->fops->rchecksum, fd, offset, len, xdata);
 
-	STACK_WIND (frame, 
-		    trace_finodelk_cbk,
-		    FIRST_CHILD (this),
-		    FIRST_CHILD (this)->fops->finodelk,
-		    fd, cmd, flock);
-	return 0;
+    return 0;
 }
 
+int32_t
+trace_fentrylk(call_frame_t *frame, xlator_t *this, const char *volume,
+               fd_t *fd, const char *basename, entrylk_cmd cmd,
+               entrylk_type type, dict_t *xdata)
+{
+    trace_conf_t *conf = NULL;
+    /* Log the fentrylk request when tracing is on, then wind it onward. */
+    conf = this->private;
+    /* NOTE(review): volume and basename go to %s unchecked - confirm. */
+    if (!conf->log_file && !conf->log_history)
+        goto out; /* neither log_file nor log_history set: nothing to record */
+    if (trace_fop_names[GF_FOP_FENTRYLK].enabled) { /* per-fop switch */
+        char string[4096] = {
+            0,
+        };
+        snprintf(string, sizeof(string),
+                 "%" PRId64
+                 ": gfid=%s volume=%s, (fd=%p "
+                 "basename=%s, cmd=%s, type=%s)",
+                 frame->root->unique, uuid_utoa(fd->inode->gfid), volume, fd,
+                 basename,
+                 ((cmd == ENTRYLK_LOCK) ? "ENTRYLK_LOCK" : "ENTRYLK_UNLOCK"),
+                 ((type == ENTRYLK_RDLCK) ? "ENTRYLK_RDLCK" : "ENTRYLK_WRLCK"));
+        /* NOTE(review): any other cmd/type value gets the second label. */
+        frame->local = fd->inode->gfid; /* presumably for the cbk - confirm */
+
+        LOG_ELEMENT(conf, string);
+    }
+
+out: /* reached on every path: the fop is wound whether or not it was logged */
+    STACK_WIND(frame, trace_fentrylk_cbk, FIRST_CHILD(this),
+               FIRST_CHILD(this)->fops->fentrylk, volume, fd, basename, cmd,
+               type, xdata);
+    return 0;
+}
 
 int32_t
-trace_xattrop (call_frame_t *frame,
-	       xlator_t *this,
-	       loc_t *loc,
-	       gf_xattrop_flags_t flags,
-	       dict_t *dict)
+trace_fgetxattr(call_frame_t *frame, xlator_t *this, fd_t *fd, const char *name,
+                dict_t *xdata)
 {
-	ERR_EINVAL_NORETURN (!this || !loc);
+    trace_conf_t *conf = NULL;
+    /* Log the fgetxattr request when tracing is on, then wind it onward. */
+    conf = this->private;
 
-	if (trace_fop_names[GF_FOP_XATTROP].enabled) {  
-		gf_log (this->name, GF_LOG_NORMAL, 
-			"%"PRId64": (path=%s, ino=%"PRIu64" flags=%d)",
-			frame->root->unique, loc->path, loc->inode->ino, flags);
-			
-	}
-  
-	STACK_WIND (frame, trace_xattrop_cbk,
-		    FIRST_CHILD(this), 
-		    FIRST_CHILD(this)->fops->xattrop, 
-		    loc, flags, dict);
+    if (!conf->log_file && !conf->log_history)
+        goto out; /* neither log_file nor log_history set: nothing to record */
+    if (trace_fop_names[GF_FOP_FGETXATTR].enabled) { /* per-fop switch */
+        char string[4096] = {
+            0,
+        };
+        snprintf(string, sizeof(string), "%" PRId64 ": gfid=%s fd=%p name=%s",
+                 frame->root->unique, uuid_utoa(fd->inode->gfid), fd, name);
 
-	return 0;
+        frame->local = fd->inode->gfid; /* presumably for the cbk - confirm */
+        /* NOTE(review): name is passed to %s unchecked - confirm non-NULL. */
+        LOG_ELEMENT(conf, string);
+    }
+
+out: /* reached on every path: the fop is wound whether or not it was logged */
+    STACK_WIND(frame, trace_fgetxattr_cbk, FIRST_CHILD(this),
+               FIRST_CHILD(this)->fops->fgetxattr, fd, name, xdata);
+    return 0;
 }
 
 int32_t
-trace_fxattrop (call_frame_t *frame,
-		xlator_t *this,
-		fd_t *fd,
-		gf_xattrop_flags_t flags,
-		dict_t *dict)
-{
-	ERR_EINVAL_NORETURN (!this || !fd);
-
-	if (trace_fop_names[GF_FOP_FXATTROP].enabled) {  
-		gf_log (this->name, GF_LOG_NORMAL, 
-			"%"PRId64": (fd=%p, flags=%d)",
-			frame->root->unique, fd, flags);
-			
-	}
-  
-	STACK_WIND (frame, trace_fxattrop_cbk,
-		    FIRST_CHILD(this), 
-		    FIRST_CHILD(this)->fops->fxattrop, 
-		    fd, flags, dict);
-
-	return 0;
-}
-
-int32_t 
-trace_lookup (call_frame_t *frame,
-	      xlator_t *this,
-	      loc_t *loc,
-	      dict_t *xattr_req)
-{
-	ERR_EINVAL_NORETURN (!this || !loc);
-
-	if (trace_fop_names[GF_FOP_LOOKUP].enabled) {  
-		/* TODO: print all the keys mentioned in xattr_req */
-		gf_log (this->name, GF_LOG_NORMAL, 
-			"%"PRId64": (loc {path=%s, ino=%"PRIu64"})",
-			frame->root->unique, loc->path,
-			loc->inode->ino);
-	}
-  
-	STACK_WIND (frame, trace_lookup_cbk,
-		    FIRST_CHILD(this), 
-		    FIRST_CHILD(this)->fops->lookup, 
-		    loc, xattr_req);
-
-	return 0;
-}
-
-int32_t 
-trace_stat (call_frame_t *frame,
-	    xlator_t *this,
-	    loc_t *loc)
-{
-	ERR_EINVAL_NORETURN (!this || !loc );
-
-
-	if (trace_fop_names[GF_FOP_STAT].enabled) {
-		gf_log (this->name, GF_LOG_NORMAL, 
-			"%"PRId64": (loc {path=%s, ino=%"PRIu64"})",
-			frame->root->unique, loc->path, loc->inode->ino);
-	}
-
-	STACK_WIND (frame, 
-		    trace_stat_cbk, 
-		    FIRST_CHILD(this), 
-		    FIRST_CHILD(this)->fops->stat, 
-		    loc);
-  
-	return 0;
-}
-
-int32_t 
-trace_readlink (call_frame_t *frame,
-		xlator_t *this,
-		loc_t *loc,
-		size_t size)
-{
-	ERR_EINVAL_NORETURN (!this || !loc || (size < 1));
-
-	if (trace_fop_names[GF_FOP_READLINK].enabled) {  
-		gf_log (this->name, GF_LOG_NORMAL, 
-			"%"PRId64": (loc {path=%s, ino=%"PRIu64"}, size=%"GF_PRI_SIZET")",
-			frame->root->unique, loc->path, loc->inode->ino, size);
-	}
-
-	STACK_WIND (frame, 
-		    trace_readlink_cbk,
-		    FIRST_CHILD(this), 
-		    FIRST_CHILD(this)->fops->readlink, 
-		    loc, 
-		    size);
-  
-	return 0;
-}
-
-int32_t 
-trace_mknod (call_frame_t *frame,
-	     xlator_t *this,
-	     loc_t *loc,
-	     mode_t mode,
-	     dev_t dev)
-{
-	ERR_EINVAL_NORETURN (!this || !loc->path);
-
-	if (trace_fop_names[GF_FOP_MKNOD].enabled) {
-		gf_log (this->name, GF_LOG_NORMAL, 
-			"%"PRId64": (loc {path=%s, ino=%"PRIu64"}, mode=%d, dev=%"GF_PRI_DEV")",
-			frame->root->unique, loc->path, loc->inode->ino, mode, dev);
-	}
-
-	STACK_WIND (frame, 
-		    trace_mknod_cbk,
-		    FIRST_CHILD(this), 
-		    FIRST_CHILD(this)->fops->mknod, 
-		    loc,
-		    mode, 
-		    dev);
-  
-	return 0;
-}
-
-int32_t 
-trace_mkdir (call_frame_t *frame,
-	     xlator_t *this,
-	     loc_t *loc,
-	     mode_t mode)
-{
-	ERR_EINVAL_NORETURN (!this || !loc || !loc->path);
-
-	if (trace_fop_names[GF_FOP_MKDIR].enabled) {  
-		gf_log (this->name, GF_LOG_NORMAL, 
-			"%"PRId64": (path=%s, ino=%"PRIu64", mode=%d)",
-			frame->root->unique, loc->path, 
-			((loc->inode)? loc->inode->ino : 0), mode);
-	}
-  
-	STACK_WIND (frame, 
-		    trace_mkdir_cbk,
-		    FIRST_CHILD(this), 
-		    FIRST_CHILD(this)->fops->mkdir, 
-		    loc,
-		    mode);
-	return 0;
-}
-
-int32_t 
-trace_unlink (call_frame_t *frame,
-	      xlator_t *this,
-	      loc_t *loc)
-{
-	ERR_EINVAL_NORETURN (!this || !loc);
-
-	if (trace_fop_names[GF_FOP_UNLINK].enabled) {  
-		gf_log (this->name, GF_LOG_NORMAL, 
-			"%"PRId64": (loc {path=%s, ino=%"PRIu64"})",
-			frame->root->unique, loc->path, loc->inode->ino);
-	}
-
-	STACK_WIND (frame, 
-		    trace_unlink_cbk, 
-		    FIRST_CHILD(this), 
-		    FIRST_CHILD(this)->fops->unlink, 
-		    loc);
-	return 0;
-}
-
-int32_t 
-trace_rmdir (call_frame_t *frame,
-	     xlator_t *this,
-	     loc_t *loc)
-{
-	ERR_EINVAL_NORETURN (!this || !loc);
-
-	if (trace_fop_names[GF_FOP_RMDIR].enabled) {  
-		gf_log (this->name, GF_LOG_NORMAL, 
-			"%"PRId64": (loc {path=%s, ino=%"PRIu64"})",
-			frame->root->unique, loc->path, loc->inode->ino);
-	}
-
-	STACK_WIND (frame, 
-		    trace_rmdir_cbk,
-		    FIRST_CHILD(this), 
-		    FIRST_CHILD(this)->fops->rmdir, 
-		    loc);
-  
-	return 0;
-}
-
-int32_t 
-trace_symlink (call_frame_t *frame,
-	       xlator_t *this,
-	       const char *linkpath,
-	       loc_t *loc)
-{
-	ERR_EINVAL_NORETURN (!this || !linkpath || !loc || !loc->path);
-
-	if (trace_fop_names[GF_FOP_SYMLINK].enabled) {  
-		gf_log (this->name, GF_LOG_NORMAL, 
-			"%"PRId64": (linkpath=%s, loc {path=%s, ino=%"PRIu64"})",
-			frame->root->unique, linkpath, loc->path, 
-			((loc->inode)? loc->inode->ino : 0));
-	}
-
-	STACK_WIND (frame, 
-		    trace_symlink_cbk,
-		    FIRST_CHILD(this), 
-		    FIRST_CHILD(this)->fops->symlink, 
-		    linkpath,
-		    loc);
-  
-	return 0;
-}
-
-int32_t 
-trace_rename (call_frame_t *frame,
-	      xlator_t *this,
-	      loc_t *oldloc,
-	      loc_t *newloc)
-{  
-	ERR_EINVAL_NORETURN (!this || !oldloc || !newloc);
-
-	if (trace_fop_names[GF_FOP_RENAME].enabled) {  
-		gf_log (this->name, GF_LOG_NORMAL, 
-			"%"PRId64": (oldloc {path=%s, ino=%"PRIu64"}, "
-			"newloc{path=%s, ino=%"PRIu64"})",
-			frame->root->unique, oldloc->path, oldloc->ino, 
-			newloc->path, newloc->ino);
-	}
-
-	STACK_WIND (frame, 
-		    trace_rename_cbk,
-		    FIRST_CHILD(this), 
-		    FIRST_CHILD(this)->fops->rename, 
-		    oldloc,
-		    newloc);
-  
-	return 0;
-}
-
-int32_t 
-trace_link (call_frame_t *frame,
-	    xlator_t *this,
-	    loc_t *oldloc,
-	    loc_t *newloc)
-{
-	ERR_EINVAL_NORETURN (!this || !oldloc || !newloc);
-
-	if (trace_fop_names[GF_FOP_LINK].enabled) {  
-		gf_log (this->name, GF_LOG_NORMAL, 
-			"%"PRId64": (oldloc {path=%s, ino=%"PRIu64"}, "
-			"newloc {path=%s, ino=%"PRIu64"})",
-			frame->root->unique, oldloc->path, oldloc->inode->ino, 
-			newloc->path, newloc->inode->ino);
-	}
-
-	STACK_WIND (frame, 
-		    trace_link_cbk, 
-		    FIRST_CHILD(this), 
-		    FIRST_CHILD(this)->fops->link, 
-		    oldloc,
-		    newloc);
-	return 0;
-}
-
-int32_t 
-trace_chmod (call_frame_t *frame,
-	     xlator_t *this,
-	     loc_t *loc,
-	     mode_t mode)
-{
-	ERR_EINVAL_NORETURN (!this || !loc);
-
-	if (trace_fop_names[GF_FOP_CHMOD].enabled) {  
-		gf_log (this->name, GF_LOG_NORMAL, 
-			"%"PRId64": (loc {path=%s, ino=%"PRIu64"}, mode=%o)",
-			frame->root->unique, loc->path, loc->inode->ino, mode);
-	}
-
-	STACK_WIND (frame, 
-		    trace_chmod_cbk, 
-		    FIRST_CHILD(this), 
-		    FIRST_CHILD(this)->fops->chmod, 
-		    loc,
-		    mode);
-  
-	return 0;
-}
-
-int32_t 
-trace_chown (call_frame_t *frame,
-	     xlator_t *this,
-	     loc_t *loc,
-	     uid_t uid,
-	     gid_t gid)
-{
-	ERR_EINVAL_NORETURN (!this || !loc);
-
-	if (trace_fop_names[GF_FOP_CHOWN].enabled) {  
-		gf_log (this->name, GF_LOG_NORMAL, 
-			"%"PRId64": (loc {path=%s, ino=%"PRIu64"}, uid=%d, gid=%d)",
-			frame->root->unique, loc->path, loc->inode->ino, uid, gid);
-	}
-
-	STACK_WIND (frame, 
-		    trace_chown_cbk, 
-		    FIRST_CHILD(this), 
-		    FIRST_CHILD(this)->fops->chown, 
-		    loc,
-		    uid,
-		    gid);
-
-	return 0;
-}
-
-int32_t 
-trace_truncate (call_frame_t *frame,
-		xlator_t *this,
-		loc_t *loc,
-		off_t offset)
-{
-	ERR_EINVAL_NORETURN (!this || !loc);
-
-	if (trace_fop_names[GF_FOP_TRUNCATE].enabled) { 
-		gf_log (this->name, GF_LOG_NORMAL, 
-			"%"PRId64": (loc {path=%s, ino=%"PRIu64"}, offset=%"PRId64")",
-			frame->root->unique, loc->path, loc->inode->ino, offset);
-	}
-
-	STACK_WIND (frame, 
-		    trace_truncate_cbk, 
-		    FIRST_CHILD(this), 
-		    FIRST_CHILD(this)->fops->truncate, 
-		    loc,
-		    offset);
-  
-	return 0;
-}
-
-int32_t 
-trace_utimens (call_frame_t *frame,
-	       xlator_t *this,
-	       loc_t *loc,
-	       struct timespec tv[2])
-{
-	char actime_str[256];
-	char modtime_str[256];
-  
-	ERR_EINVAL_NORETURN (!this || !loc || !tv);
-
-	if (trace_fop_names[GF_FOP_UTIMENS].enabled) {  
-		strftime (actime_str, 256, "[%b %d %H:%M:%S]", localtime (&tv[0].tv_sec));
-		strftime (modtime_str, 256, "[%b %d %H:%M:%S]", localtime (&tv[1].tv_sec));
-
-		gf_log (this->name, GF_LOG_NORMAL, 
-			"%"PRId64": (loc {path=%s, ino=%"PRIu64"}, "
-			"*tv=%p {actime=%s, modtime=%s})",
-			frame->root->unique, loc->path, loc->inode->ino, 
-			tv, actime_str, modtime_str);
-	}
-
-	STACK_WIND (frame, 
-		    trace_utimens_cbk, 
-		    FIRST_CHILD(this), 
-		    FIRST_CHILD(this)->fops->utimens, 
-		    loc,
-		    tv);
-
-	return 0;
-}
-
-int32_t 
-trace_open (call_frame_t *frame,
-	    xlator_t *this,
-	    loc_t *loc,
-	    int32_t flags,
-	    fd_t *fd)
-{
-	ERR_EINVAL_NORETURN (!this || !loc);
-
-	if (trace_fop_names[GF_FOP_OPEN].enabled) {
-		gf_log (this->name, GF_LOG_NORMAL, 
-			"%"PRId64": (loc {path=%s, ino=%"PRIu64"}, flags=%d, fd=%p)",
-			frame->root->unique, loc->path, loc->inode->ino, flags, fd);
-	}
-
-	STACK_WIND (frame, 
-		    trace_open_cbk, 
-		    FIRST_CHILD(this), 
-		    FIRST_CHILD(this)->fops->open, 
-		    loc,
-		    flags,
-		    fd);
-	return 0;
-}
-
-int32_t 
-trace_create (call_frame_t *frame,
-	      xlator_t *this,
-	      loc_t *loc,
-	      int32_t flags,
-	      mode_t mode,
-	      fd_t *fd)
-{
-	ERR_EINVAL_NORETURN (!this || !loc->path);
-
-	if (trace_fop_names[GF_FOP_CREATE].enabled) {  
-		gf_log (this->name, GF_LOG_NORMAL, 
-			"%"PRId64": (loc {path=%s, ino=%"PRIu64"}, flags=0%o mode=0%o)",
-			frame->root->unique, loc->path, loc->inode->ino, flags, mode);
-	}
-
-	STACK_WIND (frame, 
-		    trace_create_cbk, 
-		    FIRST_CHILD(this), 
-		    FIRST_CHILD(this)->fops->create, 
-		    loc,
-		    flags,
-		    mode,
-		    fd);
-	return 0;
-}
-
-int32_t 
-trace_readv (call_frame_t *frame,
-	     xlator_t *this,
-	     fd_t *fd,
-	     size_t size,
-	     off_t offset)
-{
-	ERR_EINVAL_NORETURN (!this || !fd || (size < 1));
-
-	if (trace_fop_names[GF_FOP_READ].enabled) {  
-		gf_log (this->name, GF_LOG_NORMAL, 
-			"%"PRId64": (*fd=%p, size=%"GF_PRI_SIZET", offset=%"PRId64")",
-			frame->root->unique, fd, size, offset);
-	}
-
-	STACK_WIND (frame, 
-		    trace_readv_cbk, 
-		    FIRST_CHILD(this), 
-		    FIRST_CHILD(this)->fops->readv,
-		    fd,
-		    size,
-		    offset);
-	return 0;
-}
-
-int32_t 
-trace_writev (call_frame_t *frame,
-	      xlator_t *this,
-	      fd_t *fd,
-	      struct iovec *vector,
-	      int32_t count,
-	      off_t offset)
-{
-	ERR_EINVAL_NORETURN (!this || !fd || !vector || (count < 1));
-
-	if (trace_fop_names[GF_FOP_WRITE].enabled) {
-		gf_log (this->name, GF_LOG_NORMAL, 
-			"%"PRId64": (*fd=%p, *vector=%p, count=%d, offset=%"PRId64")",
-			frame->root->unique, fd, vector, count, offset);
-	}
-
-	STACK_WIND (frame, 
-		    trace_writev_cbk,
-		    FIRST_CHILD(this),
-		    FIRST_CHILD(this)->fops->writev, 
-		    fd,
-		    vector,
-		    count,
-		    offset);
-	return 0;
-}
-
-int32_t 
-trace_statfs (call_frame_t *frame,
-	      xlator_t *this,
-	      loc_t *loc)
-{
-	ERR_EINVAL_NORETURN (!this || !loc);
-
-	if (trace_fop_names[GF_FOP_STATFS].enabled) {  
-		gf_log (this->name, GF_LOG_NORMAL, 
-			"%"PRId64": (loc {path=%s, ino=%"PRIu64"})",
-			frame->root->unique, loc->path, 
-			((loc->inode)? loc->inode->ino : 0));
-	}
-
-	STACK_WIND (frame, 
-		    trace_statfs_cbk, 
-		    FIRST_CHILD(this), FIRST_CHILD(this)->fops->statfs, 
-		    loc);
-	return 0; 
-}
-
-int32_t 
-trace_flush (call_frame_t *frame,
-	     xlator_t *this,
-	     fd_t *fd)
-{
-	ERR_EINVAL_NORETURN (!this || !fd);
-
-	if (trace_fop_names[GF_FOP_FLUSH].enabled) {  
-		gf_log (this->name, GF_LOG_NORMAL, 
-			"%"PRId64": (*fd=%p)", 
-			frame->root->unique, fd);
-	}
-
-	STACK_WIND (frame, 
-		    trace_flush_cbk, 
-		    FIRST_CHILD(this), 
-		    FIRST_CHILD(this)->fops->flush, 
-		    fd);
-	return 0;
-}
-
-
-int32_t 
-trace_fsync (call_frame_t *frame,
-	     xlator_t *this,
-	     fd_t *fd,
-	     int32_t flags)
-{
-	ERR_EINVAL_NORETURN (!this || !fd);
-
-	if (trace_fop_names[GF_FOP_FSYNC].enabled) {  
-		gf_log (this->name, GF_LOG_NORMAL, 
-			"%"PRId64": (flags=%d, *fd=%p)", 
-			frame->root->unique, flags, fd);
-	}
-
-	STACK_WIND (frame, 
-		    trace_fsync_cbk, 
-		    FIRST_CHILD(this), 
-		    FIRST_CHILD(this)->fops->fsync, 
-		    fd,
-		    flags);
-	return 0;
-}
-
-int32_t 
-trace_setxattr (call_frame_t *frame,
-		xlator_t *this,
-		loc_t *loc,
-		dict_t *dict,
-		int32_t flags)
-{
-	ERR_EINVAL_NORETURN (!this || !loc || !dict);
-
-	if (trace_fop_names[GF_FOP_SETXATTR].enabled) {  
-		gf_log (this->name, GF_LOG_NORMAL, 
-			"%"PRId64": (loc {path=%s, ino=%"PRIu64"}, dict=%p, flags=%d)",
-			frame->root->unique, loc->path, 
-			((loc->inode)? loc->inode->ino : 0), dict, flags);
-	}
-
-	STACK_WIND (frame, 
-		    trace_setxattr_cbk, 
-		    FIRST_CHILD(this), 
-		    FIRST_CHILD(this)->fops->setxattr, 
-		    loc,
-		    dict,
-		    flags);
-	return 0;
-}
-
-int32_t 
-trace_getxattr (call_frame_t *frame,
-		xlator_t *this,
-		loc_t *loc,
-		const char *name)
-{
-	ERR_EINVAL_NORETURN (!this || !loc);
-
-	if (trace_fop_names[GF_FOP_GETXATTR].enabled) {  
-		gf_log (this->name, GF_LOG_NORMAL, 
-			"%"PRId64": (loc {path=%s, ino=%"PRIu64"}), name=%s",
-			frame->root->unique, loc->path, 
-			((loc->inode)? loc->inode->ino : 0), name);
-	}
-
-	STACK_WIND (frame, 
-		    trace_getxattr_cbk, 
-		    FIRST_CHILD(this), 
-		    FIRST_CHILD(this)->fops->getxattr,
-		    loc,
-		    name);
-	return 0;
-}
-
-int32_t 
-trace_removexattr (call_frame_t *frame,
-		   xlator_t *this,
-		   loc_t *loc,
-		   const char *name)
-{
-	ERR_EINVAL_NORETURN (!this || !loc || !name);
-
-	if (trace_fop_names[GF_FOP_REMOVEXATTR].enabled) {  
-		gf_log (this->name, GF_LOG_NORMAL, 
-			"%"PRId64": (loc {path=%s, ino=%"PRIu64"}, name=%s)",
-			frame->root->unique, loc->path, 
-			((loc->inode)? loc->inode->ino : 0), name);
-	}
-
-	STACK_WIND (frame, 
-		    trace_removexattr_cbk, 
-		    FIRST_CHILD(this), 
-		    FIRST_CHILD(this)->fops->removexattr, 
-		    loc,
-		    name);
-
-	return 0;
-}
-
-int32_t 
-trace_opendir (call_frame_t *frame,
-	       xlator_t *this,
-	       loc_t *loc,
-	       fd_t *fd)
-{
-	ERR_EINVAL_NORETURN (!this || !loc );
-
-	if (trace_fop_names[GF_FOP_OPENDIR].enabled) {  
-		gf_log (this->name, GF_LOG_NORMAL, 
-			"%"PRId64":( loc {path=%s, ino=%"PRIu64"}, fd=%p)",
-			frame->root->unique, loc->path, loc->inode->ino, fd);
-	}
-
-	STACK_WIND (frame, 
-		    trace_opendir_cbk, 
-		    FIRST_CHILD(this), 
-		    FIRST_CHILD(this)->fops->opendir, 
-		    loc,
-		    fd);
-	return 0;
-}
-
-int32_t 
-trace_getdents (call_frame_t *frame,
-		xlator_t *this,
-		fd_t *fd,
-		size_t size,
-		off_t offset,
-		int32_t flag)
-{
-	ERR_EINVAL_NORETURN (!this || !fd);  
-
-	if (trace_fop_names[GF_FOP_GETDENTS].enabled) {
-		gf_log (this->name, GF_LOG_NORMAL, 
-			"%"PRId64": (fd=%p, size=%"GF_PRI_SIZET", offset=%"PRId64", flag=0x%x)",
-			frame->root->unique, fd, size, offset, flag);
-	}
-
-	STACK_WIND (frame, 
-		    trace_getdents_cbk, 
-		    FIRST_CHILD(this), 
-		    FIRST_CHILD(this)->fops->getdents, 
-		    fd,
-		    size, 
-		    offset, 
-		    flag);
-	return 0;
-}
-
-
-int32_t 
-trace_readdir (call_frame_t *frame,
-	       xlator_t *this,
-	       fd_t *fd,
-	       size_t size,
-	       off_t offset)
-{
-	ERR_EINVAL_NORETURN (!this || !fd);  
-
-	if (trace_fop_names[GF_FOP_READDIR].enabled) {
-		gf_log (this->name, GF_LOG_NORMAL, 
-			"%"PRId64": (fd=%p, size=%"GF_PRI_SIZET", offset=%"PRId64")",
-			frame->root->unique, fd, size, offset);
-	}
-
-	STACK_WIND (frame, 
-		    trace_readdir_cbk, 
-		    FIRST_CHILD(this), 
-		    FIRST_CHILD(this)->fops->readdir,
-		    fd,
-		    size, 
-		    offset);
-
-	return 0;
-}
-
-
-int32_t 
-trace_fsyncdir (call_frame_t *frame,
-		xlator_t *this,
-		fd_t *fd,
-		int32_t datasync)
-{
-	ERR_EINVAL_NORETURN (!this || !fd);
-
-	if (trace_fop_names[GF_FOP_FSYNCDIR].enabled) {  
-		gf_log (this->name, GF_LOG_NORMAL, 
-			"%"PRId64": (datasync=%d, *fd=%p)", 
-			frame->root->unique, datasync, fd);
-	}
-
-	STACK_WIND (frame, 
-		    trace_fsyncdir_cbk, 
-		    FIRST_CHILD(this), 
-		    FIRST_CHILD(this)->fops->fsyncdir, 
-		    fd,
-		    datasync);
-	return 0;
-}
-
-int32_t 
-trace_access (call_frame_t *frame,
-	      xlator_t *this,
-	      loc_t *loc,
-	      int32_t mask)
-{
-	ERR_EINVAL_NORETURN (!this || !loc);
-
-	if (trace_fop_names[GF_FOP_ACCESS].enabled) {  
-		gf_log (this->name, GF_LOG_NORMAL, 
-			"%"PRId64": (*loc {path=%s, ino=%"PRIu64"}, mask=0%o)",
-			frame->root->unique, loc->path, 
-			((loc->inode)? loc->inode->ino : 0), mask);
-	}
-
-	STACK_WIND (frame, 
-		    trace_access_cbk, 
-		    FIRST_CHILD(this), 
-		    FIRST_CHILD(this)->fops->access, 
-		    loc,
-		    mask);
-	return 0;
-}
-
-int32_t 
-trace_ftruncate (call_frame_t *frame,
-		 xlator_t *this,
-		 fd_t *fd,
-		 off_t offset)
-{
-	ERR_EINVAL_NORETURN (!this || !fd);
-
-	if (trace_fop_names[GF_FOP_FTRUNCATE].enabled) {  
-		gf_log (this->name, GF_LOG_NORMAL, 
-			"%"PRId64": (offset=%"PRId64", *fd=%p)", 
-			frame->root->unique, offset, fd);
-	}
-
-	STACK_WIND (frame, 
-		    trace_ftruncate_cbk, 
-		    FIRST_CHILD(this), 
-		    FIRST_CHILD(this)->fops->ftruncate, 
-		    fd,
-		    offset);
-
-	return 0;
-}
-
-int32_t 
-trace_fchown (call_frame_t *frame,
-	      xlator_t *this,
-	      fd_t *fd,
-	      uid_t uid,
-	      gid_t gid)
-{
-	ERR_EINVAL_NORETURN (!this || !fd);
-
-	if (trace_fop_names[GF_FOP_FCHOWN].enabled) {  
-		gf_log (this->name, GF_LOG_NORMAL, 
-			"%"PRId64": (*fd=%p, uid=%d, gid=%d)", 
-			frame->root->unique, fd, uid, gid);
-	}
-
-	STACK_WIND (frame, 
-		    trace_fchown_cbk, 
-		    FIRST_CHILD(this), 
-		    FIRST_CHILD(this)->fops->fchown, 
-		    fd,
-		    uid,
-		    gid);
-	return 0;
-}
-
-int32_t 
-trace_fchmod (call_frame_t *frame,
-	      xlator_t *this,
-	      fd_t *fd,
-	      mode_t mode)
-{
-	ERR_EINVAL_NORETURN (!this || !fd);
-
-	if (trace_fop_names[GF_FOP_FCHMOD].enabled) {  
-		gf_log (this->name, GF_LOG_NORMAL, 
-			"%"PRId64": (mode=%o, *fd=%p)", 
-			frame->root->unique, mode, fd);
-	}
-
-	STACK_WIND (frame, 
-		    trace_fchmod_cbk, 
-		    FIRST_CHILD(this), 
-		    FIRST_CHILD(this)->fops->fchmod, 
-		    fd,
-		    mode);
-	return 0;
-}
-
-int32_t 
-trace_fstat (call_frame_t *frame,
-	     xlator_t *this,
-	     fd_t *fd)
-{
-	ERR_EINVAL_NORETURN (!this || !fd);
-
-	if (trace_fop_names[GF_FOP_FSTAT].enabled) {  
-		gf_log (this->name, GF_LOG_NORMAL, 
-			"%"PRId64": (*fd=%p)", 
-			frame->root->unique, fd);
-	}
-
-	STACK_WIND (frame, 
-		    trace_fstat_cbk, 
-		    FIRST_CHILD(this), 
-		    FIRST_CHILD(this)->fops->fstat, 
-		    fd);
-	return 0;
-}
-
-int32_t 
-trace_lk (call_frame_t *frame,
-	  xlator_t *this,
-	  fd_t *fd,
-	  int32_t cmd,
-	  struct flock *lock)
-{
-	ERR_EINVAL_NORETURN (!this || !fd);
-
-	if (trace_fop_names[GF_FOP_LK].enabled) {  
-		gf_log (this->name, GF_LOG_NORMAL, 
-			"%"PRId64": (*fd=%p, cmd=%d, lock {l_type=%d, l_whence=%d, "
-			"l_start=%"PRId64", l_len=%"PRId64", l_pid=%u})",
-			frame->root->unique, fd, cmd, lock->l_type, lock->l_whence, 
-			lock->l_start, lock->l_len, lock->l_pid);
-	}
-
-	STACK_WIND (frame, 
-		    trace_lk_cbk, 
-		    FIRST_CHILD(this), 
-		    FIRST_CHILD(this)->fops->lk, 
-		    fd,
-		    cmd,
-		    lock);
-	return 0;
-}
-
-int32_t 
-trace_setdents (call_frame_t *frame,
-		xlator_t *this,
-		fd_t *fd,
-		int32_t flags,
-		dir_entry_t *entries,
-		int32_t count)
-{
-	if (trace_fop_names[GF_FOP_SETDENTS].enabled) {
-		gf_log (this->name, GF_LOG_NORMAL, 
-			"%"PRId64": (*fd=%p, flags=%d, count=%d", 
-			frame->root->unique, fd, flags, count);
-	}
-
-	STACK_WIND (frame, 
-		    trace_setdents_cbk, 
-		    FIRST_CHILD(this), 
-		    FIRST_CHILD(this)->fops->setdents, 
-		    fd,
-		    flags,
-		    entries,
-		    count);
-	return 0;
+trace_fsetxattr(call_frame_t *frame, xlator_t *this, fd_t *fd, dict_t *dict,
+                int32_t flags, dict_t *xdata)
+{
+    trace_conf_t *conf = NULL;
+    /* Log the fsetxattr request when tracing is on, then wind it onward. */
+    conf = this->private;
+
+    if (!conf->log_file && !conf->log_history)
+        goto out; /* neither log_file nor log_history set: nothing to record */
+    if (trace_fop_names[GF_FOP_FSETXATTR].enabled) { /* per-fop switch */
+        char string[4096] = {
+            0,
+        };
+        snprintf(string, sizeof(string), "%" PRId64 ": gfid=%s fd=%p flags=%d",
+                 frame->root->unique, uuid_utoa(fd->inode->gfid), fd, flags);
+
+        frame->local = fd->inode->gfid; /* presumably for the cbk - confirm */
+
+        LOG_ELEMENT(conf, string);
+    }
+
+out: /* reached on every path: the fop is wound whether or not it was logged */
+    STACK_WIND(frame, trace_fsetxattr_cbk, FIRST_CHILD(this),
+               FIRST_CHILD(this)->fops->fsetxattr, fd, dict, flags, xdata);
+    return 0;
+}
+
+int
+trace_ftruncate(call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset,
+                dict_t *xdata)
+{
+    trace_conf_t *conf = NULL;
+    /* Log the ftruncate request when tracing is on, then wind it onward. */
+    conf = this->private;
+
+    if (!conf->log_file && !conf->log_history)
+        goto out; /* neither log_file nor log_history set: nothing to record */
+    if (trace_fop_names[GF_FOP_FTRUNCATE].enabled) { /* per-fop switch */
+        char string[4096] = {
+            0,
+        };
+        snprintf(string, sizeof(string),
+                 "%" PRId64 ": gfid=%s offset=%" PRId64 " fd=%p",
+                 frame->root->unique, uuid_utoa(fd->inode->gfid), offset, fd);
+
+        frame->local = fd->inode->gfid; /* presumably for the cbk - confirm */
+
+        LOG_ELEMENT(conf, string);
+    }
+
+out: /* reached on every path: the fop is wound whether or not it was logged */
+    STACK_WIND(frame, trace_ftruncate_cbk, FIRST_CHILD(this),
+               FIRST_CHILD(this)->fops->ftruncate, fd, offset, xdata);
+
+    return 0;
 }
 
+int
+trace_fstat(call_frame_t *frame, xlator_t *this, fd_t *fd, dict_t *xdata)
+{
+    trace_conf_t *conf = NULL;
+    /* Log the fstat request when tracing is on, then wind it onward. */
+    conf = this->private;
+
+    if (!conf->log_file && !conf->log_history)
+        goto out; /* neither log_file nor log_history set: nothing to record */
+    if (trace_fop_names[GF_FOP_FSTAT].enabled) { /* per-fop switch */
+        char string[4096] = {
+            0,
+        };
+        snprintf(string, sizeof(string), "%" PRId64 ": gfid=%s fd=%p",
+                 frame->root->unique, uuid_utoa(fd->inode->gfid), fd);
+
+        frame->local = fd->inode->gfid; /* presumably for the cbk - confirm */
+
+        LOG_ELEMENT(conf, string);
+    }
+
+out: /* reached on every path: the fop is wound whether or not it was logged */
+    STACK_WIND(frame, trace_fstat_cbk, FIRST_CHILD(this),
+               FIRST_CHILD(this)->fops->fstat, fd, xdata);
+    return 0;
+}
+
+/*
+ * lk fop: when tracing is active for LK, log the fd, the command and every
+ * field of the requested gf_flock, then wind the call to the first child.
+ */
+int
+trace_lk(call_frame_t *frame, xlator_t *this, fd_t *fd, int32_t cmd,
+         struct gf_flock *lock, dict_t *xdata)
+{
+    trace_conf_t *conf = this->private;
+    int tracing = 0;
+
+    tracing = (conf->log_file || conf->log_history) &&
+              trace_fop_names[GF_FOP_LK].enabled;
+
+    if (tracing) {
+        char msg[4096] = {
+            0,
+        };
+
+        snprintf(msg, sizeof(msg),
+                 "%" PRId64 ": gfid=%s fd=%p, cmd=%d, "
+                 "lock {l_type=%d, l_whence=%d, "
+                 "l_start=%" PRId64 ", l_len=%" PRId64 ", l_pid=%u})",
+                 frame->root->unique, uuid_utoa(fd->inode->gfid), fd, cmd,
+                 lock->l_type, lock->l_whence, lock->l_start, lock->l_len,
+                 lock->l_pid);
+
+        /* presumably read back by the callback - not visible here */
+        frame->local = fd->inode->gfid;
+
+        LOG_ELEMENT(conf, msg);
+    }
+
+    STACK_WIND(frame, trace_lk_cbk, FIRST_CHILD(this),
+               FIRST_CHILD(this)->fops->lk, fd, cmd, lock, xdata);
+    return 0;
+}
+
+/*
+ * forget callback: log the gfid of the inode being forgotten. Always
+ * returns 0.
+ */
+int32_t
+trace_forget(xlator_t *this, inode_t *inode)
+{
+    trace_conf_t *conf = this->private;
+
+    if (!conf->log_file && !conf->log_history)
+        return 0;
+
+    /* A user who wants to see when lookups happen should see the matching
+     * 'forget' too, so forget is traced whenever LOOKUP is enabled. */
+    if (!trace_fop_names[GF_FOP_LOOKUP].enabled)
+        return 0;
+
+    {
+        char msg[4096] = {
+            0,
+        };
+
+        snprintf(msg, sizeof(msg), "gfid=%s", uuid_utoa(inode->gfid));
+        LOG_ELEMENT(conf, msg);
+    }
+
+    return 0;
+}
 
 int32_t
-trace_checksum_cbk (call_frame_t *frame,
-		    void *cookie,
-		    xlator_t *this,
-		    int32_t op_ret,
-		    int32_t op_errno,
-		    uint8_t *fchecksum,
-		    uint8_t *dchecksum)
+trace_releasedir(xlator_t *this, fd_t *fd)
 {
-	gf_log (this->name, GF_LOG_NORMAL, 
-		"%"PRId64": op_ret (%d), op_errno(%d)", 
-		frame->root->unique, op_ret, op_errno);
+    /* releasedir callback: log the gfid and fd of the directory fd being
+     * released. Traced whenever OPENDIR is enabled. Always returns 0. */
+    trace_conf_t *conf = this->private;
+    int sink_on = conf->log_file || conf->log_history;
+
+    if (sink_on && trace_fop_names[GF_FOP_OPENDIR].enabled) {
+        char msg[4096] = {
+            0,
+        };
+        snprintf(msg, sizeof(msg), "gfid=%s fd=%p", uuid_utoa(fd->inode->gfid),
+                 fd);
 
-	STACK_UNWIND (frame, op_ret, op_errno, fchecksum, dchecksum);
+        LOG_ELEMENT(conf, msg);
+    }
 
-	return 0;
+    return 0;
 }
 
 int32_t
-trace_checksum (call_frame_t *frame,
-		xlator_t *this,
-		loc_t *loc,
-		int32_t flag)
-{
-	gf_log (this->name, GF_LOG_NORMAL, 
-		"%"PRId64": loc->path (%s) flag (%d)", 
-		frame->root->unique, loc->path, flag);
-  
-	STACK_WIND (frame,
-		    trace_checksum_cbk,
-		    FIRST_CHILD(this), 
-		    FIRST_CHILD(this)->fops->checksum, 
-		    loc,
-		    flag);
-
-	return 0;
-}
-
-
-int32_t 
-trace_stats_cbk (call_frame_t *frame,
-		 void *cookie,
-		 xlator_t *this,
-		 int32_t op_ret,
-		 int32_t op_errno,
-		 struct xlator_stats *stats)
-{
-	gf_log (this->name, GF_LOG_NORMAL, 
-		"%"PRId64": op_ret (%d), op_errno(%d)", 
-		frame->root->unique, op_ret, op_errno);
-
-	STACK_UNWIND (frame, op_ret, op_errno, stats);
-	return 0;
-}
-
-int32_t 
-trace_stats (call_frame_t *frame,
-	     xlator_t *this, 
-	     int32_t flags)
-{
-	ERR_EINVAL_NORETURN (!this);
-  
-	gf_log (this->name, GF_LOG_NORMAL, 
-		"%"PRId64": (flags=%d)", 
-		frame->root->unique, flags);
-	
-	STACK_WIND (frame, 
-		    trace_stats_cbk, 
-		    FIRST_CHILD(this), 
-		    FIRST_CHILD(this)->mops->stats, 
-		    flags);
-
-	return 0;
+trace_release(xlator_t *this, fd_t *fd)
+{
+    /* release callback: log the gfid and fd of the file fd being released.
+     * Traced when either OPEN or CREATE is enabled. Always returns 0. */
+    trace_conf_t *conf = this->private;
+    int fop_on = 0;
+
+    if (!conf->log_file && !conf->log_history)
+        return 0;
+
+    fop_on = trace_fop_names[GF_FOP_OPEN].enabled ||
+             trace_fop_names[GF_FOP_CREATE].enabled;
+
+    if (fop_on) {
+        char msg[4096] = {
+            0,
+        };
+
+        snprintf(msg, sizeof(msg), "gfid=%s fd=%p", uuid_utoa(fd->inode->gfid),
+                 fd);
+
+        LOG_ELEMENT(conf, msg);
+    }
+
+    return 0;
 }
 
 void
-enable_all_calls (int enabled)
+enable_all_calls(int enabled)
 {
-	int i;
-	for (i = 0; i < GF_FOP_MAXVALUE; i++)
-		trace_fop_names[i].enabled = enabled;
+    /* Set the 'enabled' flag of every entry in trace_fop_names. */
+    int fop = GF_FOP_MAXVALUE;
+
+    while (fop-- > 0)
+        trace_fop_names[fop].enabled = enabled;
 }
 
-void 
-enable_call (const char *name, int enabled)
+/*
+ * Set the 'enabled' flag of every fop whose name matches @name, ignoring
+ * case. A name that matches nothing is silently ignored.
+ */
+void
+enable_call(const char *name, int enabled)
 {
-	int i;
-	for (i = 0; i < GF_FOP_MAXVALUE; i++)
-		if (!strcasecmp(trace_fop_names[i].name, name))
-			trace_fop_names[i].enabled = enabled;
+    int idx = 0;
+
+    for (idx = 0; idx < GF_FOP_MAXVALUE; idx++) {
+        if (strcasecmp(trace_fop_names[idx].name, name) != 0)
+            continue;
+        trace_fop_names[idx].enabled = enabled;
+    }
 }
 
-
-/* 
-   include = 1 for "include-ops"
-           = 0 for "exclude-ops" 
+/*
+  include = 1 for "include-ops"
+  = 0 for "exclude-ops"
 */
 void
-process_call_list (const char *list, int include)
-{
-	enable_all_calls (include ? 0 : 1);
-	
-	char *call = strsep ((char **)&list, ",");
-	while (call) {
-		enable_call (call, include);
-		call = strsep ((char **)&list, ",");
-	}
-}
-
-
-int32_t 
-init (xlator_t *this)
-{
-  dict_t *options = this->options;
-  char *includes = NULL, *excludes = NULL;
-  
-  if (!this)
-	  return -1;
-  
-  if (!this->children || this->children->next) {
-	  gf_log (this->name, GF_LOG_ERROR, 
-		  "trace translator requires one subvolume");
-	  return -1;
-  }
-  if (!this->parents) {
-	  gf_log (this->name, GF_LOG_WARNING,
-		  "dangling volume. check volfile ");
-  }
-  
-  
-  includes = data_to_str (dict_get (options, "include-ops"));
-  excludes = data_to_str (dict_get (options, "exclude-ops"));
-  
-  {
-	  int i;
-	  for (i = 0; i < GF_FOP_MAXVALUE; i++) {
-		  trace_fop_names[i].name = (gf_fop_list[i] ?
-					     gf_fop_list[i] : ":O");
-		  trace_fop_names[i].enabled = 1;
-	  }
-  }
-  
-  if (includes && excludes) {
-	  gf_log (this->name, 
-		  GF_LOG_ERROR,
-		  "must specify only one of 'include-ops' and 'exclude-ops'");
-	  return -1;
-  }
-  if (includes)
-	  process_call_list (includes, 1);
-  if (excludes)
-	  process_call_list (excludes, 0);
-  
-  gf_log_set_loglevel (GF_LOG_NORMAL);
-  
-  /* Set this translator's inode table pointer to child node's pointer. */
-  this->itable = FIRST_CHILD (this)->itable;
-  
-  return 0;
+process_call_list(const char *list, int include)
+{
+    /*
+     * Apply a comma-separated list of fop names.
+     *
+     * include = 1: disable every fop, then enable the listed ones.
+     * include = 0: enable every fop, then disable the listed ones.
+     *
+     * The list is scanned without being modified. Splitting it in place
+     * with strsep() would write NULs into the caller's string, which init()
+     * and reconfigure() obtain from the options dict.
+     */
+    const char *token = list;
+    const char *comma = NULL;
+    size_t len = 0;
+    char name[sizeof(trace_fop_names[0].name)] = {
+        0,
+    };
+
+    enable_all_calls(include ? 0 : 1);
+
+    while (token) {
+        comma = strchr(token, ',');
+        len = comma ? (size_t)(comma - token) : strlen(token);
+
+        /* A token too long for the name buffer cannot equal any stored fop
+         * name, so it is skipped, which is what a failed match did before. */
+        if (len < sizeof(name)) {
+            memcpy(name, token, len);
+            name[len] = '\0';
+            enable_call(name, include);
+        }
+
+        token = comma ? comma + 1 : NULL;
+    }
+}
+
+/*
+ * Statedump hook: write the trace event history into its own dump section.
+ * Returns -1 if @this or this->history is NULL, 0 otherwise.
+ */
+int32_t
+trace_dump_history(xlator_t *this)
+{
+    trace_conf_t *conf = NULL;
+    char key_prefix[GF_DUMP_MAX_BUF_LEN] = {
+        0,
+    };
+    int ret = -1;
+
+    GF_VALIDATE_OR_GOTO("trace", this, out);
+    GF_VALIDATE_OR_GOTO(this->name, this->history, out);
+
+    conf = this->private;
+    ret = 0;
+
+    // Is it ok to return silently if the log-history option is off?
+    if (!conf || conf->log_history != _gf_true)
+        goto out;
+
+    gf_proc_dump_build_key(key_prefix, "xlator.debug.trace", "history");
+    gf_proc_dump_add_section("%s", key_prefix);
+    eh_dump(this->history, NULL, dump_history_trace);
+
+out:
+    return ret;
+}
+
+/*
+ * Set up memory accounting for this translator's allocation types.
+ * Returns -1 if @this is NULL, otherwise the result of
+ * xlator_mem_acct_init(); a non-zero result is logged as an error.
+ */
+int32_t
+mem_acct_init(xlator_t *this)
+{
+    int ret = -1;
+
+    if (this) {
+        ret = xlator_mem_acct_init(this, gf_trace_mt_end + 1);
+        if (ret != 0)
+            gf_log(this->name, GF_LOG_ERROR,
+                   "Memory accounting init"
+                   " failed");
+    }
+
+    return ret;
+}
+
+/*
+ * Apply changed options to a running trace translator.
+ *
+ * Rebuilds the fop table with every fop enabled, applies "include-ops" or
+ * "exclude-ops" (only one may be given), then re-reads "log-file" and
+ * "log-history".
+ *
+ * Returns 0 on success, -1 on invalid arguments, conflicting options, or an
+ * option that fails to parse.
+ */
+int
+reconfigure(xlator_t *this, dict_t *options)
+{
+    int32_t ret = -1;
+    int i = 0;
+    trace_conf_t *conf = NULL;
+    char *includes = NULL, *excludes = NULL;
+
+    GF_VALIDATE_OR_GOTO("trace", this, out);
+    GF_VALIDATE_OR_GOTO(this->name, this->private, out);
+    GF_VALIDATE_OR_GOTO(this->name, options, out);
+
+    conf = this->private;
+
+    includes = data_to_str(dict_get(options, "include-ops"));
+    excludes = data_to_str(dict_get(options, "exclude-ops"));
+
+    /* Reject the conflict before touching the fop table, so a failed
+     * reconfigure leaves the current tracing selection unchanged. */
+    if (includes && excludes) {
+        gf_log(this->name, GF_LOG_ERROR,
+               "must specify only one of 'include-ops' and "
+               "'exclude-ops'");
+        goto out;
+    }
+
+    for (i = 0; i < GF_FOP_MAXVALUE; i++) {
+        /* ":O" is the same placeholder init() uses for unnamed fops. */
+        strncpy(trace_fop_names[i].name,
+                gf_fop_list[i] ? gf_fop_list[i] : ":O",
+                sizeof(trace_fop_names[i].name));
+        /* strncpy does not terminate when the source fills the buffer. */
+        trace_fop_names[i].name[sizeof(trace_fop_names[i].name) - 1] = 0;
+        trace_fop_names[i].enabled = 1;
+    }
+
+    if (includes)
+        process_call_list(includes, 1);
+    if (excludes)
+        process_call_list(excludes, 0);
+
+    /* Should resizing of the event-history be allowed in reconfigure?
+     * for which a new event_history might have to be allocated and the
+     * older history has to be freed.
+     */
+    GF_OPTION_RECONF("log-file", conf->log_file, options, bool, out);
+
+    GF_OPTION_RECONF("log-history", conf->log_history, options, bool, out);
+
+    ret = 0;
+
+out:
+    return ret;
+}
+
+/*
+ * Translator initialisation.
+ *
+ * Requires exactly one subvolume. Allocates the private trace_conf_t, builds
+ * the fop table with every fop enabled, applies "include-ops" or
+ * "exclude-ops" (only one may be given), reads "history-size", "log-file"
+ * and "log-history", creates the event history, and sets the log level
+ * (INFO unless "force-log-level" names another level).
+ *
+ * Returns 0 on success, -1 on failure. Every failure after the allocation
+ * goes through 'out' so that conf is released.
+ */
+int32_t
+init(xlator_t *this)
+{
+    dict_t *options = NULL;
+    char *includes = NULL, *excludes = NULL;
+    char *forced_loglevel = NULL;
+    eh_t *history = NULL;
+    int ret = -1;
+    int i = 0;
+    uint64_t history_size = TRACE_DEFAULT_HISTORY_SIZE;
+    trace_conf_t *conf = NULL;
+
+    if (!this)
+        return -1;
+
+    if (!this->children || this->children->next) {
+        gf_log(this->name, GF_LOG_ERROR,
+               "trace translator requires one subvolume");
+        return -1;
+    }
+    if (!this->parents) {
+        gf_log(this->name, GF_LOG_WARNING, "dangling volume. check volfile ");
+    }
+
+    conf = GF_CALLOC(1, sizeof(trace_conf_t), gf_trace_mt_trace_conf_t);
+    if (!conf) {
+        gf_log(this->name, GF_LOG_ERROR,
+               "cannot allocate "
+               "xl->private");
+        return -1;
+    }
+
+    options = this->options;
+    includes = data_to_str(dict_get(options, "include-ops"));
+    excludes = data_to_str(dict_get(options, "exclude-ops"));
+
+    for (i = 0; i < GF_FOP_MAXVALUE; i++) {
+        strncpy(trace_fop_names[i].name,
+                gf_fop_list[i] ? gf_fop_list[i] : ":O",
+                sizeof(trace_fop_names[i].name));
+        /* strncpy does not terminate when the source fills the buffer. */
+        trace_fop_names[i].name[sizeof(trace_fop_names[i].name) - 1] = 0;
+        trace_fop_names[i].enabled = 1;
+    }
+
+    if (includes && excludes) {
+        gf_log(this->name, GF_LOG_ERROR,
+               "must specify only one of 'include-ops' and "
+               "'exclude-ops'");
+        /* conf is already allocated: leave through the cleanup path. */
+        goto out;
+    }
+
+    if (includes)
+        process_call_list(includes, 1);
+    if (excludes)
+        process_call_list(excludes, 0);
+
+    GF_OPTION_INIT("history-size", history_size, size, out);
+    conf->history_size = history_size;
+
+    gf_log(this->name, GF_LOG_INFO, "history size %" PRIu64, history_size);
+
+    GF_OPTION_INIT("log-file", conf->log_file, bool, out);
+
+    gf_log(this->name, GF_LOG_INFO, "logging to file %s",
+           (conf->log_file == _gf_true) ? "enabled" : "disabled");
+
+    GF_OPTION_INIT("log-history", conf->log_history, bool, out);
+
+    gf_log(this->name, GF_LOG_DEBUG, "logging to history %s",
+           (conf->log_history == _gf_true) ? "enabled" : "disabled");
+
+    history = eh_new(history_size, _gf_false, NULL);
+    if (!history) {
+        gf_log(this->name, GF_LOG_ERROR,
+               "event history cannot be "
+               "initialized");
+        /* conf is already allocated: leave through the cleanup path. */
+        goto out;
+    }
+
+    this->history = history;
+
+    conf->trace_log_level = GF_LOG_INFO;
+
+    /* Only convert the value when the key exists. An unrecognised level
+     * name leaves the INFO default in place. */
+    if (dict_get(options, "force-log-level"))
+        forced_loglevel = data_to_str(dict_get(options, "force-log-level"));
+
+    if (forced_loglevel) {
+        if (strcmp(forced_loglevel, "INFO") == 0)
+            conf->trace_log_level = GF_LOG_INFO;
+        else if (strcmp(forced_loglevel, "TRACE") == 0)
+            conf->trace_log_level = GF_LOG_TRACE;
+        else if (strcmp(forced_loglevel, "ERROR") == 0)
+            conf->trace_log_level = GF_LOG_ERROR;
+        else if (strcmp(forced_loglevel, "DEBUG") == 0)
+            conf->trace_log_level = GF_LOG_DEBUG;
+        else if (strcmp(forced_loglevel, "WARNING") == 0)
+            conf->trace_log_level = GF_LOG_WARNING;
+        else if (strcmp(forced_loglevel, "CRITICAL") == 0)
+            conf->trace_log_level = GF_LOG_CRITICAL;
+        else if (strcmp(forced_loglevel, "NONE") == 0)
+            conf->trace_log_level = GF_LOG_NONE;
+    }
+
+    gf_log_set_loglevel(this->ctx, conf->trace_log_level);
+    this->private = conf;
+    ret = 0;
+out:
+    if (ret == -1) {
+        /* Release the history the same way fini() does. */
+        if (history)
+            eh_destroy(history);
+        if (conf)
+            GF_FREE(conf);
+    }
+
+    return ret;
 }
 
 void
-fini (xlator_t *this)
+fini(xlator_t *this)
 {
-	if (!this)
-		return;
-	
-	gf_log (this->name, GF_LOG_NORMAL, 
-		"trace translator unloaded");
-	return;
+    /* Translator teardown: release what init() attached to the xlator,
+     * i.e. the event history and the private trace_conf_t. */
+    if (!this)
+        return;
+
+    if (this->history) {
+        eh_destroy(this->history);
+        this->history = NULL;
+    }
+
+    /* init() allocates conf with GF_CALLOC and stores it in this->private;
+     * nothing else frees it. */
+    if (this->private) {
+        GF_FREE(this->private);
+        this->private = NULL;
+    }
+
+    gf_log(this->name, GF_LOG_INFO, "trace translator unloaded");
+    return;
 }
 
 struct xlator_fops fops = {
-  .stat        = trace_stat,
-  .readlink    = trace_readlink,
-  .mknod       = trace_mknod,
-  .mkdir       = trace_mkdir,
-  .unlink      = trace_unlink,
-  .rmdir       = trace_rmdir,
-  .symlink     = trace_symlink,
-  .rename      = trace_rename,
-  .link        = trace_link,
-  .chmod       = trace_chmod,
-  .chown       = trace_chown,
-  .truncate    = trace_truncate,
-  .utimens     = trace_utimens,
-  .open        = trace_open,
-  .readv       = trace_readv,
-  .writev      = trace_writev,
-  .statfs      = trace_statfs,
-  .flush       = trace_flush,
-  .fsync       = trace_fsync,
-  .setxattr    = trace_setxattr,
-  .getxattr    = trace_getxattr,
-  .removexattr = trace_removexattr,
-  .opendir     = trace_opendir,
-  .readdir     = trace_readdir, 
-  .fsyncdir    = trace_fsyncdir,
-  .access      = trace_access,
-  .ftruncate   = trace_ftruncate,
-  .fstat       = trace_fstat,
-  .create      = trace_create,
-  .fchown      = trace_fchown,
-  .fchmod      = trace_fchmod,
-  .lk          = trace_lk,
-  .inodelk     = trace_inodelk,
-  .finodelk    = trace_finodelk,
-  .entrylk     = trace_entrylk,
-  .lookup      = trace_lookup,
-  .setdents    = trace_setdents,
-  .getdents    = trace_getdents,
-  .checksum    = trace_checksum,
-  .xattrop     = trace_xattrop,
-  .fxattrop    = trace_fxattrop,
-};
-
-struct xlator_mops mops = {
-	.stats    = trace_stats,
+    /* Fop handlers this translator installs, one trace_<fop> per fop. */
+    .stat = trace_stat,
+    .readlink = trace_readlink,
+    .mknod = trace_mknod,
+    .mkdir = trace_mkdir,
+    .unlink = trace_unlink,
+    .rmdir = trace_rmdir,
+    .symlink = trace_symlink,
+    .rename = trace_rename,
+    .link = trace_link,
+    .truncate = trace_truncate,
+    .open = trace_open,
+    .readv = trace_readv,
+    .writev = trace_writev,
+    .statfs = trace_statfs,
+    .flush = trace_flush,
+    .fsync = trace_fsync,
+    .setxattr = trace_setxattr,
+    .getxattr = trace_getxattr,
+    .fsetxattr = trace_fsetxattr,
+    .fgetxattr = trace_fgetxattr,
+    .removexattr = trace_removexattr,
+    .opendir = trace_opendir,
+    .readdir = trace_readdir,
+    .readdirp = trace_readdirp,
+    .fsyncdir = trace_fsyncdir,
+    .access = trace_access,
+    .ftruncate = trace_ftruncate,
+    .fstat = trace_fstat,
+    .create = trace_create,
+    .lk = trace_lk,
+    .inodelk = trace_inodelk,
+    .finodelk = trace_finodelk,
+    .entrylk = trace_entrylk,
+    .fentrylk = trace_fentrylk,
+    .lookup = trace_lookup,
+    .rchecksum = trace_rchecksum,
+    .xattrop = trace_xattrop,
+    .fxattrop = trace_fxattrop,
+    .setattr = trace_setattr,
+    .fsetattr = trace_fsetattr,
+    .seek = trace_seek,
 };
 
+/* Lifecycle callbacks. These have no fop of their own, so each handler is
+ * gated on a related fop: release on OPEN/CREATE, releasedir on OPENDIR,
+ * forget on LOOKUP. */
 struct xlator_cbks cbks = {
+    .release = trace_release,
+    .releasedir = trace_releasedir,
+    .forget = trace_forget,
 };
 
 struct volume_options options[] = {
-	{ .key  = {"include-ops", "include"}, 
-	  .type = GF_OPTION_TYPE_STR,
-	  /*.value = { ""} */
-	},
-	{ .key  = {"exclude-ops", "exclude"}, 
-	  .type = GF_OPTION_TYPE_STR 
-	  /*.value = { ""} */	  
-	},
-	{ .key  = {NULL} },
+    /* NOTE(review): init() also reads "force-log-level", which has no entry
+     * in this table - confirm whether that is intentional. */
+    {
+        /* Comma-separated fop names to trace; all other fops are disabled. */
+        .key = {"include-ops", "include"},
+        .type = GF_OPTION_TYPE_STR,
+        /*.value = { ""} */
+    },
+    {
+        /* Comma-separated fop names to skip; all other fops stay enabled.
+         * init() and reconfigure() reject it together with include-ops. */
+        .key = {"exclude-ops", "exclude"},
+        .type = GF_OPTION_TYPE_STR
+        /*.value = { ""} */
+    },
+    {
+        /* Passed to eh_new() in init() as the size of the event history. */
+        .key = {"history-size"},
+        .type = GF_OPTION_TYPE_SIZET,
+        .default_value = "1024",
+    },
+    {
+        /* When on, LOG_ELEMENT writes trace lines through gf_log(). */
+        .key = {"log-file"},
+        .type = GF_OPTION_TYPE_BOOL,
+        .default_value = "no",
+    },
+    {
+        /* When on, LOG_ELEMENT records trace lines through gf_log_eh(). */
+        .key = {"log-history"},
+        .type = GF_OPTION_TYPE_BOOL,
+        .default_value = "no",
+    },
+    {.key = {NULL}},
 };
 
+/* Statedump hooks: only the history dump is provided. */
+struct xlator_dumpops dumpops = {.history = trace_dump_history};
+
+/* Descriptor that ties together this translator's entry points, tables and
+ * metadata. */
+xlator_api_t xlator_api = {
+    .init = init,
+    .fini = fini,
+    .reconfigure = reconfigure,
+    .mem_acct_init = mem_acct_init,
+    .op_version = {1},
+    .dumpops = &dumpops,
+    .fops = &fops,
+    .cbks = &cbks,
+    .options = options,
+    .identifier = "trace",
+    .category = GF_TECH_PREVIEW,
+};
diff --git a/xlators/debug/trace/src/trace.h b/xlators/debug/trace/src/trace.h
new file mode 100644
index 00000000000..b16304799da
--- /dev/null
+++ b/xlators/debug/trace/src/trace.h
@@ -0,0 +1,73 @@
+/*
+   Copyright (c) 2006-2012 Red Hat, Inc. <http://www.redhat.com>
+   This file is part of GlusterFS.
+
+   This file is licensed to you under your choice of the GNU Lesser
+   General Public License, version 3 or any later version (LGPLv3 or
+   later), or the GNU General Public License, version 2 (GPLv2), in all
+   cases as published by the Free Software Foundation.
+*/
+
+#ifndef __TRACE_H__
+#define __TRACE_H__
+
+#include <time.h>
+#include <errno.h>
+#include <glusterfs/glusterfs.h>
+#include <glusterfs/xlator.h>
+#include <glusterfs/common-utils.h>
+#include <glusterfs/event-history.h>
+#include <glusterfs/logging.h>
+#include <glusterfs/circ-buff.h>
+#include <glusterfs/statedump.h>
+#include <glusterfs/options.h>
+
+/* Value history_size starts from before the "history-size" option is read. */
+#define TRACE_DEFAULT_HISTORY_SIZE 1024
+
+/* Per-fop tracing switch: the fop's name and whether it is traced. */
+typedef struct {
+    /* Since the longest fop name is fremovexattr i.e 12 characters, array size
+     * is kept 24, i.e double of the maximum.
+     */
+    char name[24];
+    int enabled;
+} trace_fop_name_t;
+
+/* One entry per fop number, filled in by init() and reconfigure().
+ * NOTE(review): this is a definition, not an extern declaration; it is only
+ * safe while a single .c file includes this header - confirm.
+ */
+trace_fop_name_t trace_fop_names[GF_FOP_MAXVALUE];
+
+/* Translator-private configuration, stored in xlator->private. */
+typedef struct {
+    gf_boolean_t log_file;    /* emit trace lines through gf_log() */
+    gf_boolean_t log_history; /* record trace lines through gf_log_eh() */
+    uint64_t history_size;    /* value of the "history-size" option */
+    int trace_log_level;      /* level passed to gf_log() for trace lines */
+} trace_conf_t;
+
+/* Clear frame->local, then unwind. The trailing ';' after while (0) is kept
+ * as-is so that existing call sites are unaffected.
+ */
+#define TRACE_STACK_UNWIND(op, frame, params...)                               \
+    do {                                                                       \
+        (frame)->local = NULL;                                                 \
+        STACK_UNWIND_STRICT(op, frame, params);                                \
+    } while (0);
+
+/* Send _string to whichever sinks are enabled in _conf; does nothing when
+ * _conf is NULL. _conf is parenthesised so any pointer expression works.
+ */
+#define LOG_ELEMENT(_conf, _string)                                            \
+    do {                                                                       \
+        if (_conf) {                                                           \
+            if (((_conf)->log_history) == _gf_true)                            \
+                gf_log_eh("%s", _string);                                      \
+            if (((_conf)->log_file) == _gf_true)                               \
+                gf_log(THIS->name, (_conf)->trace_log_level, "%s", _string);   \
+        }                                                                      \
+    } while (0);
+
+#endif /* __TRACE_H__ */
diff --git a/xlators/encryption/Makefile.am b/xlators/encryption/Makefile.am
deleted file mode 100644
index 2cbde680fac..00000000000
--- a/xlators/encryption/Makefile.am
+++ /dev/null
@@ -1,3 +0,0 @@
-SUBDIRS = rot-13
-
-CLEANFILES = 
diff --git a/xlators/encryption/rot-13/src/rot-13.c b/xlators/encryption/rot-13/src/rot-13.c
deleted file mode 100644
index 0b92e25baa2..00000000000
--- a/xlators/encryption/rot-13/src/rot-13.c
+++ /dev/null
@@ -1,200 +0,0 @@
-/*
-  Copyright (c) 2006-2009 Z RESEARCH, Inc. <http://www.zresearch.com>
-  This file is part of GlusterFS.
-
-  GlusterFS is free software; you can redistribute it and/or modify
-  it under the terms of the GNU General Public License as published
-  by the Free Software Foundation; either version 3 of the License,
-  or (at your option) any later version.
-
-  GlusterFS is distributed in the hope that it will be useful, but
-  WITHOUT ANY WARRANTY; without even the implied warranty of
-  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
-  General Public License for more details.
-
-  You should have received a copy of the GNU General Public License
-  along with this program.  If not, see
-  <http://www.gnu.org/licenses/>.
-*/
-
-#include <ctype.h>
-#include <sys/uio.h>
-
-#ifndef _CONFIG_H
-#define _CONFIG_H
-#include "config.h"
-#endif
-
-#include "glusterfs.h"
-#include "xlator.h"
-#include "logging.h"
-
-#include "rot-13.h"
-
-/*
- * This is a rot13 ``encryption'' xlator. It rot13's data when 
- * writing to disk and rot13's it back when reading it. 
- * This xlator is meant as an example, NOT FOR PRODUCTION
- * USE ;) (hence no error-checking)
- */
-
-void 
-rot13 (char *buf, int len)
-{
-	int i;
-	for (i = 0; i < len; i++) {
-		if (buf[i] >= 'a' && buf[i] <= 'z')
-			buf[i] = 'a' + ((buf[i] - 'a' + 13) % 26);
-		else if (buf[i] >= 'A' && buf[i] <= 'Z')
-			buf[i] = 'A' + ((buf[i] - 'A' + 13) % 26);
-	}
-}
-
-void
-rot13_iovec (struct iovec *vector, int count)
-{
-	int i;
-	for (i = 0; i < count; i++) {
-		rot13 (vector[i].iov_base, vector[i].iov_len);
-	}
-}
-
-int32_t
-rot13_readv_cbk (call_frame_t *frame,
-                 void *cookie,
-                 xlator_t *this,
-                 int32_t op_ret,
-                 int32_t op_errno,
-                 struct iovec *vector,
-                 int32_t count,
-		 struct stat *stbuf)
-{
-	rot_13_private_t *priv = (rot_13_private_t *)this->private;
-  
-	if (priv->decrypt_read)
-		rot13_iovec (vector, count);
-
-	STACK_UNWIND (frame, op_ret, op_errno, vector, count, stbuf);
-	return 0;
-}
-
-int32_t
-rot13_readv (call_frame_t *frame,
-             xlator_t *this,
-             fd_t *fd,
-             size_t size,
-             off_t offset)
-{
-	STACK_WIND (frame,
-		    rot13_readv_cbk,
-		    FIRST_CHILD (this),
-		    FIRST_CHILD (this)->fops->readv,
-		    fd, size, offset);
-	return 0;
-}
-
-int32_t
-rot13_writev_cbk (call_frame_t *frame,
-                  void *cookie,
-                  xlator_t *this,
-                  int32_t op_ret,
-                  int32_t op_errno,
-		  struct stat *stbuf)
-{
-	STACK_UNWIND (frame, op_ret, op_errno, stbuf);
-	return 0;
-}
-
-int32_t
-rot13_writev (call_frame_t *frame,
-              xlator_t *this,
-              fd_t *fd,
-              struct iovec *vector,
-              int32_t count, 
-              off_t offset)
-{
-	rot_13_private_t *priv = (rot_13_private_t *)this->private;
-	if (priv->encrypt_write)
-		rot13_iovec (vector, count);
-
-	STACK_WIND (frame, 
-		    rot13_writev_cbk,
-		    FIRST_CHILD (this),
-		    FIRST_CHILD (this)->fops->writev,
-		    fd, vector, count, offset);
-	return 0;
-}
-
-int32_t
-init (xlator_t *this)
-{
-	data_t *data = NULL;
-	rot_13_private_t *priv = NULL;
-
-	if (!this->children || this->children->next) {
-		gf_log ("rot13", GF_LOG_ERROR, 
-			"FATAL: rot13 should have exactly one child");
-		return -1;
-	}
-
-	if (!this->parents) {
-		gf_log (this->name, GF_LOG_WARNING,
-			"dangling volume. check volfile ");
-	}
-  
-	priv = CALLOC (sizeof (rot_13_private_t), 1);
-	ERR_ABORT (priv);
-	priv->decrypt_read = 1;
-	priv->encrypt_write = 1;
-
-	data = dict_get (this->options, "encrypt-write");
-	if (data) {
-		if (gf_string2boolean (data->data, &priv->encrypt_write) == -1) {
-			gf_log (this->name, GF_LOG_ERROR,
-				"encrypt-write takes only boolean options");
-			return -1;
-		}
-	}
-
-	data = dict_get (this->options, "decrypt-read");
-	if (data) {
-		if (gf_string2boolean (data->data, &priv->decrypt_read) == -1) {
-			gf_log (this->name, GF_LOG_ERROR,
-				"decrypt-read takes only boolean options");
-			return -1;
-		}
-	}
-
-	this->private = priv;
-	gf_log ("rot13", GF_LOG_DEBUG, "rot13 xlator loaded");
-	return 0;
-}
-
-void 
-fini (xlator_t *this)
-{
-	rot_13_private_t *priv = this->private;
-	
-	FREE (priv);
-	
-	return;
-}
-
-struct xlator_fops fops = {
-	.readv        = rot13_readv,
-	.writev       = rot13_writev
-};
-
-struct xlator_mops mops = {
-};
-
-
-struct volume_options options[] = {
-	{ .key  = {"encrypt-write"}, 
-	  .type = GF_OPTION_TYPE_BOOL
-	},
-	{ .key  = {"decrypt-read"}, 
-	  .type = GF_OPTION_TYPE_BOOL 
-	},
-	{ .key  = {NULL} },
-};
diff --git a/xlators/encryption/rot-13/src/rot-13.h b/xlators/encryption/rot-13/src/rot-13.h
deleted file mode 100644
index d45803517d7..00000000000
--- a/xlators/encryption/rot-13/src/rot-13.h
+++ /dev/null
@@ -1,33 +0,0 @@
-/*
-   Copyright (c) 2006-2009 Z RESEARCH, Inc. <http://www.zresearch.com>
-   This file is part of GlusterFS.
-
-   GlusterFS is free software; you can redistribute it and/or modify
-   it under the terms of the GNU General Public License as published
-   by the Free Software Foundation; either version 3 of the License,
-   or (at your option) any later version.
-
-   GlusterFS is distributed in the hope that it will be useful, but
-   WITHOUT ANY WARRANTY; without even the implied warranty of
-   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
-   General Public License for more details.
-
-   You should have received a copy of the GNU General Public License
-   along with this program.  If not, see
-   <http://www.gnu.org/licenses/>.
-*/
-
-#ifndef __ROT_13_H__
-#define __ROT_13_H__
-
-#ifndef _CONFIG_H
-#define _CONFIG_H
-#include "config.h"
-#endif
-
-typedef struct {
-	gf_boolean_t encrypt_write;
-	gf_boolean_t decrypt_read;
-} rot_13_private_t;
-
-#endif /* __ROT_13_H__ */
diff --git a/xlators/features/Makefile.am b/xlators/features/Makefile.am
index 9ac9b6f19de..c57897f11ea 100644
--- a/xlators/features/Makefile.am
+++ b/xlators/features/Makefile.am
@@ -1,3 +1,14 @@
-SUBDIRS = locks trash path-convertor filter quota
+if BUILD_CLOUDSYNC
+  CLOUDSYNC_DIR = cloudsync
+endif
 
-CLEANFILES = 
+if BUILD_METADISP
+  METADISP_DIR = metadisp
+endif
+
+SUBDIRS = locks quota read-only quiesce marker index barrier arbiter upcall \
+	compress changelog gfid-access snapview-client snapview-server trash \
+	shard bit-rot leases selinux sdfs namespace $(CLOUDSYNC_DIR) thin-arbiter \
+	utime $(METADISP_DIR)
+
+CLEANFILES =
diff --git a/xlators/performance/symlink-cache/Makefile.am b/xlators/features/arbiter/Makefile.am
index d471a3f9243..a985f42a877 100644
--- a/xlators/performance/symlink-cache/Makefile.am
+++ b/xlators/features/arbiter/Makefile.am
@@ -1,3 +1,3 @@
 SUBDIRS = src
 
-CLEANFILES = 
+CLEANFILES =
diff --git a/xlators/features/arbiter/src/Makefile.am b/xlators/features/arbiter/src/Makefile.am
new file mode 100644
index 00000000000..badc42f37be
--- /dev/null
+++ b/xlators/features/arbiter/src/Makefile.am
@@ -0,0 +1,19 @@
+if WITH_SERVER
+xlator_LTLIBRARIES = arbiter.la
+endif
+
+xlatordir = $(libdir)/glusterfs/$(PACKAGE_VERSION)/xlator/features
+
+arbiter_la_LDFLAGS = -module $(GF_XLATOR_DEFAULT_LDFLAGS)
+
+arbiter_la_SOURCES = arbiter.c
+arbiter_la_LIBADD = $(top_builddir)/libglusterfs/src/libglusterfs.la
+
+noinst_HEADERS = arbiter.h arbiter-mem-types.h
+
+AM_CPPFLAGS = $(GF_CPPFLAGS) -I$(top_srcdir)/libglusterfs/src \
+	-I$(top_srcdir)/rpc/xdr/src -I$(top_builddir)/rpc/xdr/src
+
+AM_CFLAGS = -Wall $(GF_CFLAGS)
+
+CLEANFILES =
diff --git a/xlators/features/arbiter/src/arbiter-mem-types.h b/xlators/features/arbiter/src/arbiter-mem-types.h
new file mode 100644
index 00000000000..05d18374c46
--- /dev/null
+++ b/xlators/features/arbiter/src/arbiter-mem-types.h
@@ -0,0 +1,18 @@
+/*
+  Copyright (c) 2015 Red Hat, Inc. <http://www.redhat.com>
+  This file is part of GlusterFS.
+  This file is licensed to you under your choice of the GNU Lesser
+  General Public License, version 3 or any later version (LGPLv3 or
+  later), or the GNU General Public License, version 2 (GPLv2), in all
+  cases as published by the Free Software Foundation.
+*/
+
+#ifndef __ARBITER_MEM_TYPES_H__
+#define __ARBITER_MEM_TYPES_H__
+#include <glusterfs/mem-types.h>
+
+/* Memory-accounting type ids for the arbiter translator, numbered from just
+ * past gf_common_mt_end. */
+typedef enum gf_arbiter_mem_types_ {
+    gf_arbiter_mt_inode_ctx_t = gf_common_mt_end + 1,
+    gf_arbiter_mt_end
+} gf_arbiter_mem_types_t;
+#endif
diff --git a/xlators/features/arbiter/src/arbiter.c b/xlators/features/arbiter/src/arbiter.c
new file mode 100644
index 00000000000..83a97e3354b
--- /dev/null
+++ b/xlators/features/arbiter/src/arbiter.c
@@ -0,0 +1,380 @@
+/*
+  Copyright (c) 2015 Red Hat, Inc. <http://www.redhat.com>
+  This file is part of GlusterFS.
+
+  This file is licensed to you under your choice of the GNU Lesser
+  General Public License, version 3 or any later version (LGPLv3 or
+  later), or the GNU General Public License, version 2 (GPLv2), in all
+  cases as published by the Free Software Foundation.
+*/
+
+#include "arbiter.h"
+#include "arbiter-mem-types.h"
+#include <glusterfs/glusterfs.h>
+#include <glusterfs/xlator.h>
+#include <glusterfs/logging.h>
+
+/* Return this xlator's ctx for @inode, allocating and attaching a new one if
+ * none is stored; NULL on failure. The visible caller holds inode->lock. */
+static arbiter_inode_ctx_t *
+__arbiter_inode_ctx_get(inode_t *inode, xlator_t *this)
+{
+    arbiter_inode_ctx_t *ctx = NULL;
+    uint64_t ctx_addr = 0;
+
+    if (__inode_ctx_get(inode, this, &ctx_addr) == 0) {
+        /* Convert back through uintptr_t, mirroring the put below; going
+         * via long would truncate where long is narrower than a pointer. */
+        ctx = (arbiter_inode_ctx_t *)(uintptr_t)ctx_addr;
+        goto out;
+    }
+
+    ctx = GF_CALLOC(1, sizeof(*ctx), gf_arbiter_mt_inode_ctx_t);
+    if (!ctx)
+        goto out;
+
+    if (__inode_ctx_put(inode, this, (uint64_t)(uintptr_t)ctx)) {
+        GF_FREE(ctx);
+        ctx = NULL;
+        gf_log_callingfn(this->name, GF_LOG_ERROR,
+                         "failed to set the inode ctx (%s)",
+                         uuid_utoa(inode->gfid));
+    }
+out:
+    return ctx;
+}
+
+/* Locked wrapper: fetch (or create) the inode's arbiter ctx under its lock. */
+static arbiter_inode_ctx_t *
+arbiter_inode_ctx_get(inode_t *inode, xlator_t *this)
+{
+    arbiter_inode_ctx_t *found;
+
+    LOCK(&inode->lock);
+    found = __arbiter_inode_ctx_get(inode, this);
+    UNLOCK(&inode->lock);
+
+    return found;
+}
+
+/* Lookup callback: on success, snapshot the returned iatt into the inode ctx;
+ * the short-circuited write fops in this file reply with that stored copy. */
+int32_t
+arbiter_lookup_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+                   int32_t op_ret, int32_t op_errno, inode_t *inode,
+                   struct iatt *buf, dict_t *xdata, struct iatt *postparent)
+{
+    arbiter_inode_ctx_t *cached = NULL;
+
+    if (op_ret == 0) {
+        cached = arbiter_inode_ctx_get(inode, this);
+        if (cached) {
+            memcpy(&cached->iattbuf, buf, sizeof(cached->iattbuf));
+        } else {
+            op_ret = -1;
+            op_errno = ENOMEM;
+        }
+    }
+    STACK_UNWIND_STRICT(lookup, frame, op_ret, op_errno, inode, buf, xdata,
+                        postparent);
+    return 0;
+}
+
+int32_t
+arbiter_lookup(call_frame_t *frame, xlator_t *this, loc_t *loc, dict_t *xdata)
+{
+    STACK_WIND(frame, arbiter_lookup_cbk, FIRST_CHILD(this),
+               FIRST_CHILD(this)->fops->lookup, loc, xdata);
+    return 0;
+}
+
+/* Answer truncate without winding to the child: reply success with the iatt
+ * held in the inode ctx as both prebuf and postbuf, or ENOMEM without a ctx. */
+int32_t
+arbiter_truncate(call_frame_t *frame, xlator_t *this, loc_t *loc, off_t offset,
+                 dict_t *xdata)
+{
+    arbiter_inode_ctx_t *ctx = arbiter_inode_ctx_get(loc->inode, this);
+    struct iatt *attr = NULL;
+    int32_t op_ret = -1;
+    int32_t op_errno = ENOMEM;
+
+    if (ctx) {
+        attr = &ctx->iattbuf;
+        op_ret = 0;
+        op_errno = 0;
+    }
+
+    STACK_UNWIND_STRICT(truncate, frame, op_ret, op_errno, attr, attr, NULL);
+    return 0;
+}
+
+/* Answer ftruncate without winding to the child: reply success with the iatt
+ * held in the fd's inode ctx as both prebuf and postbuf. When no ctx can be
+ * obtained the fop is failed with ENOMEM and NULL iatts. */
+int32_t
+arbiter_ftruncate(call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset,
+                  dict_t *xdata)
+{
+    arbiter_inode_ctx_t *ctx = arbiter_inode_ctx_get(fd->inode, this);
+    struct iatt *attr = NULL;
+    int32_t op_ret = -1;
+    int32_t op_errno = ENOMEM;
+
+    if (ctx) {
+        attr = &ctx->iattbuf;
+        op_ret = 0;
+        op_errno = 0;
+    }
+
+    STACK_UNWIND_STRICT(ftruncate, frame, op_ret, op_errno, attr, attr, NULL);
+    return 0;
+}
+
+/* Build the reply xdata for a short-circuited writev. For each key the caller
+ * asked for in @xdata (GLUSTERFS_OPEN_FD_COUNT, GLUSTERFS_WRITE_IS_APPEND) a
+ * value is set in a fresh dict; set failures are only logged at debug level.
+ * Returns NULL when @fd has no inode with a gfid, @xdata is NULL, or the dict
+ * cannot be allocated. arbiter_writev() unrefs the returned dict. */
+dict_t *
+arbiter_fill_writev_xdata(fd_t *fd, dict_t *xdata, xlator_t *this)
+{
+    dict_t *reply = NULL;
+    int is_append = 1;
+
+    if (!fd || !fd->inode || gf_uuid_is_null(fd->inode->gfid) || !xdata)
+        return NULL;
+
+    reply = dict_new();
+    if (!reply)
+        return NULL;
+
+    /* Report the inode's current open-fd count when it was requested. */
+    if (dict_get(xdata, GLUSTERFS_OPEN_FD_COUNT) &&
+        dict_set_uint32(reply, GLUSTERFS_OPEN_FD_COUNT,
+                        fd->inode->fd_count) < 0) {
+        gf_msg_debug(this->name, 0,
+                     "Failed to set dict value"
+                     " for GLUSTERFS_OPEN_FD_COUNT");
+    }
+
+    /* The write is unconditionally reported as an append (is_append is
+     * always 1) whenever the caller asked for that key. */
+    if (dict_get(xdata, GLUSTERFS_WRITE_IS_APPEND) &&
+        dict_set_uint32(reply, GLUSTERFS_WRITE_IS_APPEND, is_append) < 0) {
+        gf_msg_debug(this->name, 0,
+                     "Failed to set dict value"
+                     " for GLUSTERFS_WRITE_IS_APPEND");
+    }
+
+    return reply;
+}
+
+/* Answer writev without winding to the child: report the full iov length as
+ * written and reply with the iatt held in the inode ctx plus any xdata keys
+ * the caller requested. Without an inode ctx the fop fails with ENOMEM. */
+int32_t
+arbiter_writev(call_frame_t *frame, xlator_t *this, fd_t *fd,
+               struct iovec *vector, int32_t count, off_t off, uint32_t flags,
+               struct iobref *iobref, dict_t *xdata)
+{
+    arbiter_inode_ctx_t *ctx = arbiter_inode_ctx_get(fd->inode, this);
+    struct iatt *attr = NULL;
+    dict_t *reply = NULL;
+    int op_ret = -1;
+    int op_errno = ENOMEM;
+
+    if (ctx) {
+        attr = &ctx->iattbuf;
+        op_ret = iov_length(vector, count);
+        op_errno = 0;
+        reply = arbiter_fill_writev_xdata(fd, xdata, this);
+    }
+
+    STACK_UNWIND_STRICT(writev, frame, op_ret, op_errno, attr, attr, reply);
+    if (reply)
+        dict_unref(reply);
+    return 0;
+}
+
+/* Answer fallocate without winding to the child: reply success with the iatt
+ * held in the inode ctx for both iatt arguments, or ENOMEM without a ctx. */
+int32_t
+arbiter_fallocate(call_frame_t *frame, xlator_t *this, fd_t *fd,
+                  int32_t keep_size, off_t offset, size_t len, dict_t *xdata)
+{
+    arbiter_inode_ctx_t *ctx = arbiter_inode_ctx_get(fd->inode, this);
+    struct iatt *attr = NULL;
+    int op_ret = -1;
+    int op_errno = ENOMEM;
+
+    if (ctx) {
+        attr = &ctx->iattbuf;
+        op_ret = 0;
+        op_errno = 0;
+    }
+
+    STACK_UNWIND_STRICT(fallocate, frame, op_ret, op_errno, attr, attr, NULL);
+    return 0;
+}
+
+/* Answer discard without winding to the child: reply success with the iatt
+ * held in the inode ctx for both iatt arguments, or ENOMEM without a ctx. */
+int32_t
+arbiter_discard(call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset,
+                size_t len, dict_t *xdata)
+{
+    arbiter_inode_ctx_t *ctx = arbiter_inode_ctx_get(fd->inode, this);
+    struct iatt *attr = NULL;
+    int op_ret = -1;
+    int op_errno = ENOMEM;
+
+    if (ctx) {
+        attr = &ctx->iattbuf;
+        op_ret = 0;
+        op_errno = 0;
+    }
+
+    STACK_UNWIND_STRICT(discard, frame, op_ret, op_errno, attr, attr, NULL);
+    return 0;
+}
+
+/* Answer zerofill without winding to the child: reply success with the iatt
+ * held in the inode ctx for both iatt arguments, or ENOMEM without a ctx. */
+int32_t
+arbiter_zerofill(call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset,
+                 off_t len, dict_t *xdata)
+{
+    arbiter_inode_ctx_t *ctx = arbiter_inode_ctx_get(fd->inode, this);
+    struct iatt *attr = NULL;
+    int op_ret = -1;
+    int op_errno = ENOMEM;
+
+    if (ctx) {
+        attr = &ctx->iattbuf;
+        op_ret = 0;
+        op_errno = 0;
+    }
+
+    STACK_UNWIND_STRICT(zerofill, frame, op_ret, op_errno, attr, attr, NULL);
+    return 0;
+}
+
+static int32_t
+arbiter_readv(call_frame_t *frame, xlator_t *this, fd_t *fd, size_t size,
+              off_t offset, uint32_t flags, dict_t *xdata)
+{
+    STACK_UNWIND_STRICT(readv, frame, -1, ENOSYS, NULL, 0, NULL, NULL, NULL);
+    return 0;
+}
+
+static int32_t
+arbiter_seek(call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset,
+             gf_seek_what_t what, dict_t *xdata)
+{
+    STACK_UNWIND_STRICT(seek, frame, -1, ENOSYS, 0, xdata);
+    return 0;
+}
+
+/* Register the arbiter memory-accounting types; logs when that fails. */
+int32_t
+mem_acct_init(xlator_t *this)
+{
+    int ret = xlator_mem_acct_init(this, gf_arbiter_mt_end + 1);
+
+    if (ret) {
+        gf_log(this->name, GF_LOG_ERROR,
+               "Memory accounting initialization failed.");
+    }
+    return ret;
+}
+
+int
+reconfigure(xlator_t *this, dict_t *options)
+{
+    return 0;
+}
+
+/* forget callback: detach this xlator's ctx from @inode and free it. */
+int
+arbiter_forget(xlator_t *this, inode_t *inode)
+{
+    uint64_t ctx_addr = 0;
+
+    inode_ctx_del(inode, this, &ctx_addr);
+    if (ctx_addr) {
+        /* Cast via uintptr_t, as when stored; long may be too narrow. */
+        GF_FREE((arbiter_inode_ctx_t *)(uintptr_t)ctx_addr);
+    }
+    return 0;
+}
+
+int32_t
+init(xlator_t *this)
+{
+    if (!this->children || this->children->next) {
+        gf_log(this->name, GF_LOG_ERROR,
+               "'arbiter' not configured with exactly one child");
+        return -1;
+    }
+
+    if (!this->parents)
+        gf_log(this->name, GF_LOG_ERROR, "dangling volume. check volfile ");
+
+    return 0;
+}
+
+void
+fini(xlator_t *this)
+{
+    return;
+}
+
+struct xlator_fops fops = {
+    .lookup = arbiter_lookup,
+
+    /* Return success for these inode write FOPs without winding them down
+     * to posix; this is needed for AFR write transaction logic to work. */
+    .truncate = arbiter_truncate,
+    .writev = arbiter_writev,
+    .ftruncate = arbiter_ftruncate,
+    .fallocate = arbiter_fallocate,
+    .discard = arbiter_discard,
+    .zerofill = arbiter_zerofill,
+
+    /* AFR is not expected to wind these inode read FOPs initiated by the
+     * application to the arbiter brick. But in case a bug causes them
+     * to be called, we return ENOSYS. */
+    .readv = arbiter_readv,
+    .seek = arbiter_seek,
+
+    /* The following inode read FOPs initiated by the application are not
+     * wound by AFR either, but internal logic such as shd, glfsheal and
+     * client-side healing in AFR will send them for self-heal, inode
+     * refresh operations etc., so we need to wind them down to posix:
+     *
+     * (f)stat, readdir(p), readlink, (f)getxattr. */
+
+    /* All other FOPs not listed here are safe to be wound down to posix. */
+};
+
+struct xlator_cbks cbks = {
+    .forget = arbiter_forget,
+};
+
+struct volume_options options[] = {
+    {.key = {NULL}},
+};
+
+xlator_api_t xlator_api = {
+    .init = init,
+    .fini = fini,
+    .reconfigure = reconfigure,
+    .mem_acct_init = mem_acct_init,
+    .op_version = {1}, /* Present from the initial version */
+    .fops = &fops,
+    .cbks = &cbks,
+    .options = options,
+    .identifier = "arbiter",
+    .category = GF_MAINTAINED,
+};
diff --git a/xlators/features/arbiter/src/arbiter.h b/xlators/features/arbiter/src/arbiter.h
new file mode 100644
index 00000000000..546db7b751a
--- /dev/null
+++ b/xlators/features/arbiter/src/arbiter.h
@@ -0,0 +1,21 @@
+/*
+  Copyright (c) 2015 Red Hat, Inc. <http://www.redhat.com>
+  This file is part of GlusterFS.
+
+  This file is licensed to you under your choice of the GNU Lesser
+  General Public License, version 3 or any later version (LGPLv3 or
+  later), or the GNU General Public License, version 2 (GPLv2), in all
+  cases as published by the Free Software Foundation.
+*/
+
+#ifndef _ARBITER_H
+#define _ARBITER_H
+
+#include <glusterfs/locking.h>
+#include <glusterfs/common-utils.h>
+
+typedef struct arbiter_inode_ctx_ {
+    struct iatt iattbuf;
+} arbiter_inode_ctx_t;
+
+#endif /* _ARBITER_H */
diff --git a/xlators/features/barrier/Makefile.am b/xlators/features/barrier/Makefile.am
new file mode 100644
index 00000000000..a985f42a877
--- /dev/null
+++ b/xlators/features/barrier/Makefile.am
@@ -0,0 +1,3 @@
+SUBDIRS = src
+
+CLEANFILES =
diff --git a/xlators/features/barrier/src/Makefile.am b/xlators/features/barrier/src/Makefile.am
new file mode 100644
index 00000000000..25099bc56e5
--- /dev/null
+++ b/xlators/features/barrier/src/Makefile.am
@@ -0,0 +1,17 @@
+xlator_LTLIBRARIES = barrier.la
+xlatordir = $(libdir)/glusterfs/$(PACKAGE_VERSION)/xlator/features
+
+barrier_la_LDFLAGS = -module $(GF_XLATOR_DEFAULT_LDFLAGS)
+
+barrier_la_SOURCES = barrier.c
+
+barrier_la_LIBADD = $(top_builddir)/libglusterfs/src/libglusterfs.la
+
+noinst_HEADERS = barrier.h barrier-mem-types.h
+
+AM_CPPFLAGS = $(GF_CPPFLAGS) -I$(top_srcdir)/libglusterfs/src \
+	-I$(top_srcdir)/rpc/xdr/src -I$(top_builddir)/rpc/xdr/src
+
+AM_CFLAGS = -Wall $(GF_CFLAGS)
+
+CLEANFILES =
diff --git a/xlators/features/barrier/src/barrier-mem-types.h b/xlators/features/barrier/src/barrier-mem-types.h
new file mode 100644
index 00000000000..71ed7898d9c
--- /dev/null
+++ b/xlators/features/barrier/src/barrier-mem-types.h
@@ -0,0 +1,20 @@
+/*
+   Copyright (c) 2014 Red Hat, Inc. <http://www.redhat.com>
+   This file is part of GlusterFS.
+
+   This file is licensed to you under your choice of the GNU Lesser
+   General Public License, version 3 or any later version (LGPLv3 or
+   later), or the GNU General Public License, version 2 (GPLv2), in all
+   cases as published by the Free Software Foundation.
+*/
+
+#ifndef __BARRIER_MEM_TYPES_H__
+#define __BARRIER_MEM_TYPES_H__
+
+#include <glusterfs/mem-types.h>
+
+enum gf_barrier_mem_types_ {
+    gf_barrier_mt_priv_t = gf_common_mt_end + 1,
+    gf_barrier_mt_end
+};
+#endif
diff --git a/xlators/features/barrier/src/barrier.c b/xlators/features/barrier/src/barrier.c
new file mode 100644
index 00000000000..852bbacb99d
--- /dev/null
+++ b/xlators/features/barrier/src/barrier.c
@@ -0,0 +1,809 @@
+/*
+   Copyright (c) 2014 Red Hat, Inc. <http://www.redhat.com>
+   This file is part of GlusterFS.
+
+   This file is licensed to you under your choice of the GNU Lesser
+   General Public License, version 3 or any later version (LGPLv3 or
+   later), or the GNU General Public License, version 2 (GPLv2), in all
+   cases as published by the Free Software Foundation.
+*/
+
+#include "barrier.h"
+#include <glusterfs/defaults.h>
+#include <glusterfs/call-stub.h>
+
+#include <glusterfs/statedump.h>
+
+void
+barrier_local_set_gfid(call_frame_t *frame, uuid_t gfid, xlator_t *this)
+{ /* Save a heap copy of @gfid in frame->local for the statedump code. */
+    if (gfid) {
+        uuid_t *id = GF_MALLOC(sizeof(uuid_t), gf_common_mt_uuid_t);
+        if (!id) {
+            gf_log(this->name, GF_LOG_WARNING,
+                   "Could not set gfid"
+                   ". gfid will not be dumped in statedump file.");
+            return; /* best effort: nothing is reported to the caller */
+        }
+        gf_uuid_copy(*id, gfid);
+        frame->local = id; /* released by barrier_local_free_gfid() */
+    }
+}
+
+void
+barrier_local_free_gfid(call_frame_t *frame)
+{
+    if (frame->local) {
+        GF_FREE(frame->local);
+        frame->local = NULL;
+    }
+}
+
+int32_t
+barrier_truncate_cbk_resume(call_frame_t *frame, void *cookie, xlator_t *this,
+                            int32_t op_ret, int32_t op_errno,
+                            struct iatt *prebuf, struct iatt *postbuf,
+                            dict_t *xdata)
+{
+    barrier_local_free_gfid(frame);
+    STACK_UNWIND_STRICT(truncate, frame, op_ret, op_errno, prebuf, postbuf,
+                        xdata);
+    return 0;
+}
+
+int32_t
+barrier_ftruncate_cbk_resume(call_frame_t *frame, void *cookie, xlator_t *this,
+                             int32_t op_ret, int32_t op_errno,
+                             struct iatt *prebuf, struct iatt *postbuf,
+                             dict_t *xdata)
+{
+    barrier_local_free_gfid(frame);
+    STACK_UNWIND_STRICT(ftruncate, frame, op_ret, op_errno, prebuf, postbuf,
+                        xdata);
+    return 0;
+}
+
+int32_t
+barrier_unlink_cbk_resume(call_frame_t *frame, void *cookie, xlator_t *this,
+                          int32_t op_ret, int32_t op_errno,
+                          struct iatt *preparent, struct iatt *postparent,
+                          dict_t *xdata)
+{
+    barrier_local_free_gfid(frame);
+    STACK_UNWIND_STRICT(unlink, frame, op_ret, op_errno, preparent, postparent,
+                        xdata);
+    return 0;
+}
+
+int32_t
+barrier_rmdir_cbk_resume(call_frame_t *frame, void *cookie, xlator_t *this,
+                         int32_t op_ret, int32_t op_errno,
+                         struct iatt *preparent, struct iatt *postparent,
+                         dict_t *xdata)
+{
+    barrier_local_free_gfid(frame);
+    STACK_UNWIND_STRICT(rmdir, frame, op_ret, op_errno, preparent, postparent,
+                        xdata);
+    return 0;
+}
+
+int32_t
+barrier_rename_cbk_resume(call_frame_t *frame, void *cookie, xlator_t *this,
+                          int32_t op_ret, int32_t op_errno, struct iatt *buf,
+                          struct iatt *preoldparent, struct iatt *postoldparent,
+                          struct iatt *prenewparent, struct iatt *postnewparent,
+                          dict_t *xdata)
+{
+    barrier_local_free_gfid(frame);
+    STACK_UNWIND_STRICT(rename, frame, op_ret, op_errno, buf, preoldparent,
+                        postoldparent, prenewparent, postnewparent, xdata);
+    return 0;
+}
+
+int32_t
+barrier_writev_cbk_resume(call_frame_t *frame, void *cookie, xlator_t *this,
+                          int32_t op_ret, int32_t op_errno, struct iatt *prebuf,
+                          struct iatt *postbuf, dict_t *xdata)
+{
+    barrier_local_free_gfid(frame);
+    STACK_UNWIND_STRICT(writev, frame, op_ret, op_errno, prebuf, postbuf,
+                        xdata);
+    return 0;
+}
+
+int32_t
+barrier_fsync_cbk_resume(call_frame_t *frame, void *cookie, xlator_t *this,
+                         int32_t op_ret, int32_t op_errno, struct iatt *prebuf,
+                         struct iatt *postbuf, dict_t *xdata)
+{
+    barrier_local_free_gfid(frame);
+    STACK_UNWIND_STRICT(fsync, frame, op_ret, op_errno, prebuf, postbuf, xdata);
+    return 0;
+}
+
+int32_t
+barrier_removexattr_cbk_resume(call_frame_t *frame, void *cookie,
+                               xlator_t *this, int32_t op_ret, int32_t op_errno,
+                               dict_t *xdata)
+{
+    barrier_local_free_gfid(frame);
+    STACK_UNWIND_STRICT(removexattr, frame, op_ret, op_errno, xdata);
+    return 0;
+}
+
+int32_t
+barrier_fremovexattr_cbk_resume(call_frame_t *frame, void *cookie,
+                                xlator_t *this, int32_t op_ret,
+                                int32_t op_errno, dict_t *xdata)
+{
+    barrier_local_free_gfid(frame);
+    STACK_UNWIND_STRICT(fremovexattr, frame, op_ret, op_errno, xdata);
+    return 0;
+}
+
+int32_t
+barrier_writev_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+                   int32_t op_ret, int32_t op_errno, struct iatt *prebuf,
+                   struct iatt *postbuf, dict_t *xdata)
+{
+    BARRIER_FOP_CBK(writev, out, frame, this, op_ret, op_errno, prebuf, postbuf,
+                    xdata);
+out:
+    return 0;
+}
+
+int32_t
+barrier_fremovexattr_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+                         int32_t op_ret, int32_t op_errno, dict_t *xdata)
+{
+    BARRIER_FOP_CBK(fremovexattr, out, frame, this, op_ret, op_errno, xdata);
+out:
+    return 0;
+}
+
+int32_t
+barrier_removexattr_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+                        int32_t op_ret, int32_t op_errno, dict_t *xdata)
+{
+    BARRIER_FOP_CBK(removexattr, out, frame, this, op_ret, op_errno, xdata);
+out:
+    return 0;
+}
+
+int32_t
+barrier_truncate_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+                     int32_t op_ret, int32_t op_errno, struct iatt *prebuf,
+                     struct iatt *postbuf, dict_t *xdata)
+{
+    BARRIER_FOP_CBK(truncate, out, frame, this, op_ret, op_errno, prebuf,
+                    postbuf, xdata);
+out:
+    return 0;
+}
+
+int32_t
+barrier_ftruncate_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+                      int32_t op_ret, int32_t op_errno, struct iatt *prebuf,
+                      struct iatt *postbuf, dict_t *xdata)
+{
+    BARRIER_FOP_CBK(ftruncate, out, frame, this, op_ret, op_errno, prebuf,
+                    postbuf, xdata);
+out:
+    return 0;
+}
+
+int32_t
+barrier_rename_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+                   int32_t op_ret, int32_t op_errno, struct iatt *buf,
+                   struct iatt *preoldparent, struct iatt *postoldparent,
+                   struct iatt *prenewparent, struct iatt *postnewparent,
+                   dict_t *xdata)
+{
+    BARRIER_FOP_CBK(rename, out, frame, this, op_ret, op_errno, buf,
+                    preoldparent, postoldparent, prenewparent, postnewparent,
+                    xdata);
+out:
+    return 0;
+}
+
+int32_t
+barrier_rmdir_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+                  int32_t op_ret, int32_t op_errno, struct iatt *preparent,
+                  struct iatt *postparent, dict_t *xdata)
+{
+    BARRIER_FOP_CBK(rmdir, out, frame, this, op_ret, op_errno, preparent,
+                    postparent, xdata);
+out:
+    return 0;
+}
+
+int32_t
+barrier_unlink_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+                   int32_t op_ret, int32_t op_errno, struct iatt *preparent,
+                   struct iatt *postparent, dict_t *xdata)
+{
+    BARRIER_FOP_CBK(unlink, out, frame, this, op_ret, op_errno, preparent,
+                    postparent, xdata);
+out:
+    return 0;
+}
+
+int32_t
+barrier_fsync_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+                  int32_t op_ret, int32_t op_errno, struct iatt *prebuf,
+                  struct iatt *postbuf, dict_t *xdata)
+{
+    BARRIER_FOP_CBK(fsync, out, frame, this, op_ret, op_errno, prebuf, postbuf,
+                    xdata);
+out:
+    return 0;
+}
+
+int32_t
+barrier_writev(call_frame_t *frame, xlator_t *this, fd_t *fd,
+               struct iovec *vector, int32_t count, off_t off, uint32_t flags,
+               struct iobref *iobref, dict_t *xdata)
+{
+    /* Only writes that are synchronous (O_SYNC/O_DSYNC in the call flags or
+     * the fd flags) go through the barrier callback; the rest pass through. */
+    if ((flags | fd->flags) & (O_SYNC | O_DSYNC)) {
+        barrier_local_set_gfid(frame, fd->inode->gfid, this);
+        STACK_WIND(frame, barrier_writev_cbk, FIRST_CHILD(this),
+                   FIRST_CHILD(this)->fops->writev, fd, vector, count, off,
+                   flags, iobref, xdata);
+        return 0;
+    }
+
+    STACK_WIND_TAIL(frame, FIRST_CHILD(this), FIRST_CHILD(this)->fops->writev,
+                    fd, vector, count, off, flags, iobref, xdata);
+    return 0;
+}
+
+int32_t
+barrier_fremovexattr(call_frame_t *frame, xlator_t *this, fd_t *fd,
+                     const char *name, dict_t *xdata)
+{
+    barrier_local_set_gfid(frame, fd->inode->gfid, this);
+    STACK_WIND(frame, barrier_fremovexattr_cbk, FIRST_CHILD(this),
+               FIRST_CHILD(this)->fops->fremovexattr, fd, name, xdata);
+    return 0;
+}
+
+int32_t
+barrier_removexattr(call_frame_t *frame, xlator_t *this, loc_t *loc,
+                    const char *name, dict_t *xdata)
+{
+    barrier_local_set_gfid(frame, loc->inode->gfid, this);
+    STACK_WIND(frame, barrier_removexattr_cbk, FIRST_CHILD(this),
+               FIRST_CHILD(this)->fops->removexattr, loc, name, xdata);
+    return 0;
+}
+
+int32_t
+barrier_truncate(call_frame_t *frame, xlator_t *this, loc_t *loc, off_t offset,
+                 dict_t *xdata)
+{
+    barrier_local_set_gfid(frame, loc->inode->gfid, this);
+    STACK_WIND(frame, barrier_truncate_cbk, FIRST_CHILD(this),
+               FIRST_CHILD(this)->fops->truncate, loc, offset, xdata);
+    return 0;
+}
+
+int32_t
+barrier_rename(call_frame_t *frame, xlator_t *this, loc_t *oldloc,
+               loc_t *newloc, dict_t *xdata)
+{
+    barrier_local_set_gfid(frame, oldloc->inode->gfid, this);
+    STACK_WIND(frame, barrier_rename_cbk, FIRST_CHILD(this),
+               FIRST_CHILD(this)->fops->rename, oldloc, newloc, xdata);
+    return 0;
+}
+
+int
+barrier_rmdir(call_frame_t *frame, xlator_t *this, loc_t *loc, int flags,
+              dict_t *xdata)
+{
+    barrier_local_set_gfid(frame, loc->inode->gfid, this);
+    STACK_WIND(frame, barrier_rmdir_cbk, FIRST_CHILD(this),
+               FIRST_CHILD(this)->fops->rmdir, loc, flags, xdata);
+    return 0;
+}
+
+int32_t
+barrier_unlink(call_frame_t *frame, xlator_t *this, loc_t *loc, int xflag,
+               dict_t *xdata)
+{
+    barrier_local_set_gfid(frame, loc->inode->gfid, this);
+    STACK_WIND(frame, barrier_unlink_cbk, FIRST_CHILD(this),
+               FIRST_CHILD(this)->fops->unlink, loc, xflag, xdata);
+    return 0;
+}
+
+int32_t
+barrier_ftruncate(call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset,
+                  dict_t *xdata)
+{
+    barrier_local_set_gfid(frame, fd->inode->gfid, this);
+    STACK_WIND(frame, barrier_ftruncate_cbk, FIRST_CHILD(this),
+               FIRST_CHILD(this)->fops->ftruncate, fd, offset, xdata);
+    return 0;
+}
+
+int32_t
+barrier_fsync(call_frame_t *frame, xlator_t *this, fd_t *fd, int32_t flags,
+              dict_t *xdata)
+{
+    barrier_local_set_gfid(frame, fd->inode->gfid, this);
+    STACK_WIND(frame, barrier_fsync_cbk, FIRST_CHILD(this),
+               FIRST_CHILD(this)->fops->fsync, fd, flags, xdata);
+    return 0;
+}
+
+/* Detach and return the stub at the head of @queue, or NULL when the queue is
+ * empty. this->private is only asserted to be set; the list that is operated
+ * on is the one passed in by the caller. */
+call_stub_t *
+__barrier_dequeue(xlator_t *this, struct list_head *queue)
+{
+    call_stub_t *head = NULL;
+    barrier_priv_t *priv = this->private;
+
+    GF_ASSERT(priv);
+
+    if (!list_empty(queue)) {
+        head = list_entry(queue->next, call_stub_t, list);
+        list_del_init(&head->list);
+    }
+
+    return head;
+}
+
+/* Resume every stub on @queue, head first, until the queue is empty. */
+void
+barrier_dequeue_all(xlator_t *this, struct list_head *queue)
+{
+    call_stub_t *stub = NULL;
+
+    gf_log(this->name, GF_LOG_INFO, "Dequeuing all the barriered fops");
+
+    /* TODO: Start the below task in a new thread */
+    for (stub = __barrier_dequeue(this, queue); stub;
+         stub = __barrier_dequeue(this, queue)) {
+        call_resume(stub);
+    }
+
+    gf_log(this->name, GF_LOG_INFO, "Dequeuing the barriered fops is finished");
+}
+
+/* Timer callback registered by __barrier_enable(): the barrier timeout has
+ * fired, so switch the barrier off and resume every fop that was queued.
+ * @data is the xlator the timer was armed for. */
+void
+barrier_timeout(void *data)
+{
+    xlator_t *this = data;
+    barrier_priv_t *priv = NULL;
+    struct list_head pending = {0};
+
+    THIS = this;
+    priv = this->private;
+    INIT_LIST_HEAD(&pending);
+
+    gf_log(this->name, GF_LOG_CRITICAL,
+           "Disabling barrier because of "
+           "the barrier timeout.");
+
+    LOCK(&priv->lock);
+    {
+        /* Under the lock the queued stubs are only moved onto the local
+         * list ... */
+        __barrier_disable(this, &pending);
+    }
+    UNLOCK(&priv->lock);
+
+    /* ... and they are resumed after the lock has been dropped. */
+    barrier_dequeue_all(this, &pending);
+}
+
+/* Append @stub to the tail of the barrier queue and bump the queue size.
+ * Takes no lock itself; this->private is asserted to be set before it is
+ * dereferenced. */
+void
+__barrier_enqueue(xlator_t *this, call_stub_t *stub)
+{
+    barrier_priv_t *priv = this->private;
+
+    GF_ASSERT(priv);
+
+    priv->queue_size++;
+    list_add_tail(&stub->list, &priv->queue);
+}
+
+/* Turn the barrier off: cancel a pending timer, move all queued stubs onto
+ * @queue and reset the counters. Callers in this file hold priv->lock. */
+void
+__barrier_disable(xlator_t *this, struct list_head *queue)
+{
+    barrier_priv_t *priv = this->private;
+
+    GF_ASSERT(priv);
+
+    if (priv->timer) {
+        (void)gf_timer_call_cancel(this->ctx, priv->timer);
+        priv->timer = NULL;
+    }
+
+    list_splice_init(&priv->queue, queue);
+    priv->queue_size = 0;
+    priv->barrier_enabled = _gf_false;
+}
+
+/* Arm the timeout timer (barrier_timeout after priv->timeout) and mark the
+ * barrier enabled. Returns 0 on success; returns -1 and leaves
+ * barrier_enabled untouched when the timer cannot be registered. */
+int
+__barrier_enable(xlator_t *this, barrier_priv_t *priv)
+{
+    priv->timer = gf_timer_call_after(this->ctx, priv->timeout, barrier_timeout,
+                                      (void *)this);
+    if (!priv->timer) {
+        gf_log(this->name, GF_LOG_CRITICAL,
+               "Couldn't add barrier "
+               "timeout event.");
+        return -1;
+    }
+
+    /* The flag is raised only once the timer is in place. */
+    priv->barrier_enabled = _gf_true;
+    return 0;
+}
+
+int
+notify(xlator_t *this, int event, void *data, ...)
+{ /* TRANSLATOR_OP switches the barrier; other events use default_notify. */
+    barrier_priv_t *priv = this->private;
+    dict_t *dict = NULL;
+    int ret = -1; /* stays -1 on every failure path */
+    int barrier_enabled = _gf_false; /* requested state read from the dict */
+    struct list_head queue = { /* receives stubs released by a disable */
+        0,
+    };
+
+    GF_ASSERT(priv);
+    INIT_LIST_HEAD(&queue);
+
+    switch (event) {
+        case GF_EVENT_TRANSLATOR_OP: {
+            dict = data;
+            barrier_enabled = dict_get_str_boolean(dict, "barrier", -1);
+
+            if (barrier_enabled == -1) { /* -1 is the default passed above */
+                gf_log(this->name, GF_LOG_ERROR,
+                       "Could not fetch "
+                       " barrier key from the dictionary.");
+                goto out;
+            }
+
+            LOCK(&priv->lock);
+            {
+                if (!priv->barrier_enabled) {
+                    if (barrier_enabled) {
+                        ret = __barrier_enable(this, priv);
+                    } else {
+                        UNLOCK(&priv->lock); /* goto skips UNLOCK below */
+                        gf_log(this->name, GF_LOG_ERROR, "Already disabled.");
+                        goto post_unlock;
+                    }
+                } else {
+                    if (!barrier_enabled) {
+                        __barrier_disable(this, &queue);
+                        ret = 0;
+                    } else {
+                        UNLOCK(&priv->lock); /* goto skips UNLOCK below */
+                        gf_log(this->name, GF_LOG_ERROR, "Already enabled");
+                        goto post_unlock;
+                    }
+                }
+            }
+            UNLOCK(&priv->lock);
+        post_unlock: /* priv->lock is not held from here on */
+            if (!list_empty(&queue)) /* only non-empty after a disable */
+                barrier_dequeue_all(this, &queue);
+
+            break;
+        }
+        default: {
+            default_notify(this, event, data);
+            ret = 0;
+            goto out;
+        }
+    }
+out:
+    return ret;
+}
+
+/* Apply changed "barrier" / "barrier-timeout" options. Returns 0 on success,
+ * -1 when an option lookup bails out or the barrier timer cannot be armed. */
+int
+reconfigure(xlator_t *this, dict_t *options)
+{
+    barrier_priv_t *priv = this->private;
+    int ret = -1;
+    gf_boolean_t barrier_enabled = _gf_false;
+    uint32_t timeout = 0;
+    time_t old_timeout = 0;
+    struct list_head queue = {0};
+
+    GF_ASSERT(priv);
+
+    GF_OPTION_RECONF("barrier", barrier_enabled, options, bool, out);
+    GF_OPTION_RECONF("barrier-timeout", timeout, options, time, out);
+
+    INIT_LIST_HEAD(&queue);
+
+    LOCK(&priv->lock);
+    {
+        /* Install the new timeout before a possible enable, as init() does:
+         * __barrier_enable() arms its timer from priv->timeout. */
+        old_timeout = priv->timeout.tv_sec;
+        priv->timeout.tv_sec = timeout;
+        if (!priv->barrier_enabled) {
+            if (barrier_enabled) {
+                ret = __barrier_enable(this, priv);
+                if (ret) {
+                    priv->timeout.tv_sec = old_timeout;
+                    goto unlock;
+                }
+            }
+        } else if (!barrier_enabled) {
+            __barrier_disable(this, &queue);
+        }
+        ret = 0;
+    }
+unlock:
+    UNLOCK(&priv->lock);
+
+    if (!list_empty(&queue))
+        barrier_dequeue_all(this, &queue);
+
+out:
+    return ret;
+}
+
+/* Set up memory accounting for the barrier allocation types; the result of
+ * xlator_mem_acct_init() is returned as-is and logged when non-zero. */
+int32_t
+mem_acct_init(xlator_t *this)
+{
+    int ret = xlator_mem_acct_init(this, gf_barrier_mt_end + 1);
+
+    if (ret) {
+        gf_log(this->name, GF_LOG_ERROR,
+               "Memory accounting initialization failed.");
+    }
+    return ret;
+}
+
+/* Check for exactly one child, allocate the private state, read the options
+ * and arm the timer if the barrier starts enabled. Returns 0, or -1 on error. */
+int
+init(xlator_t *this)
+{
+    int ret = -1;
+    barrier_priv_t *priv = NULL;
+    uint32_t timeout = 0;
+
+    if (!this->children || this->children->next) {
+        gf_log(this->name, GF_LOG_ERROR,
+               "'barrier' not configured with exactly one child");
+        goto out;
+    }
+
+    if (!this->parents)
+        gf_log(this->name, GF_LOG_WARNING, "dangling volume. check volfile ");
+
+    priv = GF_CALLOC(1, sizeof(*priv), gf_barrier_mt_priv_t);
+    if (!priv)
+        goto out;
+
+    LOCK_INIT(&priv->lock);
+    GF_OPTION_INIT("barrier", priv->barrier_enabled, bool, out);
+    GF_OPTION_INIT("barrier-timeout", timeout, time, out);
+    priv->timeout.tv_sec = timeout;
+    INIT_LIST_HEAD(&priv->queue);
+
+    if (priv->barrier_enabled) {
+        ret = __barrier_enable(this, priv);
+        if (ret == -1)
+            goto out;
+    }
+
+    this->private = priv;
+    ret = 0;
+out:
+    if (ret && priv) {
+        /* priv is only non-NULL here once its lock has been initialised. */
+        LOCK_DESTROY(&priv->lock);
+        GF_FREE(priv);
+    }
+    return ret;
+}
+
+/* Tear down the xlator: disable the barrier, resume whatever was still
+ * queued, then destroy the lock and free the private state. Does nothing
+ * when this->private is not set. */
+void
+fini(xlator_t *this)
+{
+    barrier_priv_t *priv = this->private;
+    struct list_head pending = {0};
+
+    if (!priv)
+        return;
+
+    INIT_LIST_HEAD(&pending);
+
+    gf_log(this->name, GF_LOG_INFO,
+           "Disabling barriering and dequeuing "
+           "all the queued fops");
+    /* Move the queued stubs onto the local list while holding the lock. */
+    LOCK(&priv->lock);
+    {
+        __barrier_disable(this, &pending);
+    }
+    UNLOCK(&priv->lock);
+
+    if (!list_empty(&pending))
+        barrier_dequeue_all(this, &pending);
+
+    this->private = NULL;
+
+    /* priv is detached from the xlator above, before it is released. */
+    LOCK_DESTROY(&priv->lock);
+    GF_FREE(priv);
+}
+
+/* Write one queued stub to the statedump: its fop name, the gfid saved in
+ * frame->local (if any) and the loc path/name (if set), each under a key
+ * built from @prefix. */
+static void
+barrier_dump_stub(call_stub_t *stub, char *prefix)
+{
+    char key[GF_DUMP_MAX_BUF_LEN] = {0};
+    uuid_t *gfid = stub->frame->local;
+
+    gf_proc_dump_build_key(key, prefix, "fop");
+    gf_proc_dump_write(key, "%s", gf_fop_list[stub->fop]);
+
+    /* frame->local, when set, is the gfid copy made before winding. */
+    if (gfid) {
+        gf_proc_dump_build_key(key, prefix, "gfid");
+        gf_proc_dump_write(key, "%s", uuid_utoa(*gfid));
+    }
+    if (stub->args.loc.path) {
+        gf_proc_dump_build_key(key, prefix, "path");
+        gf_proc_dump_write(key, "%s", stub->args.loc.path);
+    }
+    if (stub->args.loc.name) {
+        gf_proc_dump_build_key(key, prefix, "name");
+        gf_proc_dump_write(key, "%s", stub->args.loc.name);
+    }
+}
+
+/*
+ * Dump every stub on priv->queue, each under its own "stub.<index>"
+ * section. The only caller in this file invokes it with priv->lock held.
+ */
+static void
+__barrier_dump_queue(barrier_priv_t *priv)
+{
+    char section[GF_DUMP_MAX_BUF_LEN] = {0};
+    call_stub_t *cur = NULL;
+    int idx = 0;
+
+    GF_VALIDATE_OR_GOTO("barrier", priv, out);
+
+    list_for_each_entry(cur, &priv->queue, list)
+    {
+        snprintf(section, sizeof(section), "stub.%d", idx);
+        idx++;
+
+        gf_proc_dump_add_section("%s", section);
+        barrier_dump_stub(cur, section);
+    }
+
+out:
+    return;
+}
+
+/*
+ * State-dump callback for the translator's private data.
+ *
+ * Emits the barrier state and timeout and, while barriering is enabled,
+ * the queue size followed by every queued stub. The private lock is held
+ * for the duration of the dump so the values are mutually consistent.
+ *
+ * Returns 0 on success (including when there is no private state to dump)
+ * and -1 when @this is NULL.
+ */
+int
+barrier_dump_priv(xlator_t *this)
+{
+    int ret = -1;
+    char key[GF_DUMP_MAX_BUF_LEN] = {
+        0,
+    };
+    barrier_priv_t *priv = NULL;
+
+    GF_VALIDATE_OR_GOTO("barrier", this, out);
+
+    priv = this->private;
+    if (!priv)
+        return 0;
+
+    gf_proc_dump_build_key(key, "xlator.features.barrier", "priv");
+    gf_proc_dump_add_section("%s", key);
+    gf_proc_dump_build_key(key, "barrier", "enabled");
+
+    LOCK(&priv->lock);
+    {
+        gf_proc_dump_write(key, "%d", priv->barrier_enabled);
+        gf_proc_dump_build_key(key, "barrier", "timeout");
+        /* time_t is not guaranteed to be 'long'; cast to match "%ld". */
+        gf_proc_dump_write(key, "%ld", (long)priv->timeout.tv_sec);
+        if (priv->barrier_enabled) {
+            gf_proc_dump_build_key(key, "barrier", "queue_size");
+            /* queue_size is a uint32_t, so print it as unsigned. */
+            gf_proc_dump_write(key, "%u", (unsigned int)priv->queue_size);
+            __barrier_dump_queue(priv);
+        }
+    }
+    UNLOCK(&priv->lock);
+
+    /* The dump completed; previously -1 was returned even on success. */
+    ret = 0;
+out:
+    return ret;
+}
+
+/*
+ * File operations handled by the barrier translator. Only the fops listed
+ * here are routed to barrier_* handlers.
+ */
+struct xlator_fops fops = {
+
+    /* Barrier Class fops */
+    .rmdir = barrier_rmdir,
+    .unlink = barrier_unlink,
+    .rename = barrier_rename,
+    .removexattr = barrier_removexattr,
+    .fremovexattr = barrier_fremovexattr,
+    .truncate = barrier_truncate,
+    .ftruncate = barrier_ftruncate,
+    .fsync = barrier_fsync,
+
+    /* Writes with only O_SYNC flag */
+    .writev = barrier_writev,
+};
+
+/* State-dump hooks: only the private-data dump is provided. */
+struct xlator_dumpops dumpops = {
+    .priv = barrier_dump_priv,
+};
+
+/* No callbacks are registered; the table is left zero-initialised. */
+struct xlator_cbks cbks;
+
+/*
+ * Volume options understood by the barrier translator:
+ *   "barrier"         - boolean switch, defaults to "disable".
+ *   "barrier-timeout" - time value, defaults to BARRIER_TIMEOUT.
+ * The table is terminated by an entry with a NULL key.
+ */
+struct volume_options options[] = {
+    {.key = {"barrier"},
+     .type = GF_OPTION_TYPE_BOOL,
+     .default_value = "disable",
+     .op_version = {GD_OP_VERSION_3_6_0},
+     .flags = OPT_FLAG_SETTABLE,
+     .description = "When \"enabled\", blocks acknowledgements to application "
+                    "for file operations such as rmdir, rename, unlink, "
+                    "removexattr, fremovexattr, truncate, ftruncate, "
+                    "write (with O_SYNC), fsync. It is turned \"off\" by "
+                    "default."},
+    {.key = {"barrier-timeout"},
+     .type = GF_OPTION_TYPE_TIME,
+     .default_value = BARRIER_TIMEOUT,
+     .op_version = {GD_OP_VERSION_3_6_0},
+     .flags = OPT_FLAG_SETTABLE,
+     .description = "After 'timeout' seconds since the time 'barrier' "
+                    "option was set to \"on\", acknowledgements to file "
+                    "operations are no longer blocked and previously "
+                    "blocked acknowledgements are sent to the application"},
+    {.key = {NULL}},
+};
+
+/*
+ * Translator descriptor: ties together the entry points, operation tables
+ * and option table defined in this file under the identifier "barrier".
+ */
+xlator_api_t xlator_api = {
+    .init = init,
+    .fini = fini,
+    .notify = notify,
+    .reconfigure = reconfigure,
+    .mem_acct_init = mem_acct_init,
+    .op_version = {1}, /* Present from the initial version */
+    .dumpops = &dumpops,
+    .fops = &fops,
+    .cbks = &cbks,
+    .options = options,
+    .identifier = "barrier",
+    .category = GF_MAINTAINED,
+};
diff --git a/xlators/features/barrier/src/barrier.h b/xlators/features/barrier/src/barrier.h
new file mode 100644
index 00000000000..1337f311f7d
--- /dev/null
+++ b/xlators/features/barrier/src/barrier.h
@@ -0,0 +1,89 @@
+/*
+   Copyright (c) 2014 Red Hat, Inc. <http://www.redhat.com>
+   This file is part of GlusterFS.
+
+   This file is licensed to you under your choice of the GNU Lesser
+   General Public License, version 3 or any later version (LGPLv3 or
+   later), or the GNU General Public License, version 2 (GPLv2), in all
+   cases as published by the Free Software Foundation.
+*/
+
+#ifndef __BARRIER_H__
+#define __BARRIER_H__
+
+#include "barrier-mem-types.h"
+#include <glusterfs/xlator.h>
+#include <glusterfs/timer.h>
+#include <glusterfs/call-stub.h>
+
+/*
+ * BARRIER_FOP_CBK(fop_name, label, frame, this, params...)
+ *
+ * Common body for a fop callback that may have to be held back.
+ *
+ * With the private lock held:
+ *   - if barriering is enabled, a callback stub is created with
+ *     fop_<fop_name>_cbk_stub() (resume function
+ *     barrier_<fop_name>_cbk_resume) and passed to __barrier_enqueue();
+ *   - if the stub cannot be created, barriering is switched off via
+ *     __barrier_disable(), which is handed a local list.
+ *
+ * After unlocking:
+ *   - a stub was queued: jump straight to @label without unwinding;
+ *   - barriering was enabled but the stub allocation failed: log a
+ *     critical message and call barrier_dequeue_all() on the local list;
+ *   - in every non-queued case: call barrier_local_free_gfid() on the
+ *     frame, unwind with STACK_UNWIND_STRICT() and jump to @label.
+ *
+ * The macro always ends in "goto label", so @label must exist in the
+ * enclosing function. It also declares a local label named "unlock" and a
+ * local variable named "queue"; neither name may clash with the caller's
+ * scope or appear in @params.
+ */
+#define BARRIER_FOP_CBK(fop_name, label, frame, this, params...)               \
+    do {                                                                       \
+        barrier_priv_t *_priv = NULL;                                          \
+        call_stub_t *_stub = NULL;                                             \
+        gf_boolean_t _barrier_enabled = _gf_false;                             \
+        struct list_head queue = {                                             \
+            0,                                                                 \
+        };                                                                     \
+                                                                               \
+        INIT_LIST_HEAD(&queue);                                                \
+                                                                               \
+        _priv = this->private;                                                 \
+        GF_ASSERT(_priv);                                                      \
+                                                                               \
+        LOCK(&_priv->lock);                                                    \
+        {                                                                      \
+            if (_priv->barrier_enabled) {                                      \
+                _barrier_enabled = _priv->barrier_enabled;                     \
+                                                                               \
+                _stub = fop_##fop_name##_cbk_stub(                             \
+                    frame, barrier_##fop_name##_cbk_resume, params);           \
+                if (!_stub) {                                                  \
+                    __barrier_disable(this, &queue);                           \
+                    goto unlock;                                               \
+                }                                                              \
+                                                                               \
+                __barrier_enqueue(this, _stub);                                \
+            }                                                                  \
+        }                                                                      \
+    unlock:                                                                    \
+        UNLOCK(&_priv->lock);                                                  \
+                                                                               \
+        if (_stub)                                                             \
+            goto label;                                                        \
+                                                                               \
+        if (_barrier_enabled && !_stub) {                                      \
+            gf_log(this->name, GF_LOG_CRITICAL,                                \
+                   "Failed to barrier FOPs, disabling "                        \
+                   "barrier. FOP: %s, ERROR: %s",                              \
+                   #fop_name, strerror(ENOMEM));                               \
+            barrier_dequeue_all(this, &queue);                                 \
+        }                                                                      \
+        barrier_local_free_gfid(frame);                                        \
+        STACK_UNWIND_STRICT(fop_name, frame, params);                          \
+        goto label;                                                            \
+    } while (0)
+
+/* Private state of one barrier translator instance. */
+typedef struct {
+    /* NOTE(review): presumably the timer that fires barrier_timeout() -
+     * confirm against __barrier_enable(). */
+    gf_timer_t *timer;
+    /* Guards the fields below. */
+    gf_lock_t lock;
+    /* List of held-back call stubs (linked through call_stub_t.list). */
+    struct list_head queue;
+    /* Barrier timeout; tv_sec is loaded from the "barrier-timeout" option. */
+    struct timespec timeout;
+    /* NOTE(review): presumably the number of stubs on 'queue' - confirm. */
+    uint32_t queue_size;
+    /* Whether fop acknowledgements are currently being held back. */
+    gf_boolean_t barrier_enabled;
+    char _pad[3]; /* manual padding */
+} barrier_priv_t;
+
+int
+__barrier_enable(xlator_t *this, barrier_priv_t *priv);
+void
+__barrier_enqueue(xlator_t *this, call_stub_t *stub);
+void
+__barrier_disable(xlator_t *this, struct list_head *queue);
+void
+barrier_timeout(void *data);
+void
+barrier_dequeue_all(xlator_t *this, struct list_head *queue);
+call_stub_t *
+__barrier_dequeue(xlator_t *this, struct list_head *queue);
+
+#endif
diff --git a/xlators/features/bit-rot/Makefile.am b/xlators/features/bit-rot/Makefile.am
new file mode 100644
index 00000000000..f963effea22
--- /dev/null
+++ b/xlators/features/bit-rot/Makefile.am
@@ -0,0 +1 @@
+SUBDIRS = src
+\ No newline at end of file
diff --git a/xlators/features/bit-rot/src/Makefile.am b/xlators/features/bit-rot/src/Makefile.am
new file mode 100644
index 00000000000..b5e4a7d62a0
--- /dev/null
+++ b/xlators/features/bit-rot/src/Makefile.am
@@ -0,0 +1 @@
+SUBDIRS = stub bitd
diff --git a/xlators/features/bit-rot/src/bitd/Makefile.am b/xlators/features/bit-rot/src/bitd/Makefile.am
new file mode 100644
index 00000000000..6db800e6565
--- /dev/null
+++ b/xlators/features/bit-rot/src/bitd/Makefile.am
@@ -0,0 +1,23 @@
+if WITH_SERVER
+xlator_LTLIBRARIES = bit-rot.la
+endif
+xlatordir = $(libdir)/glusterfs/$(PACKAGE_VERSION)/xlator/features
+
+bit_rot_la_LDFLAGS = -module $(GF_XLATOR_DEFAULT_LDFLAGS)
+
+AM_CPPFLAGS = $(GF_CPPFLAGS) -I$(top_srcdir)/libglusterfs/src \
+	-I$(top_srcdir)/rpc/xdr/src/ -I$(top_builddir)/rpc/xdr/src/ \
+	-I$(top_srcdir)/rpc/rpc-lib/src -I$(CONTRIBDIR)/timer-wheel \
+	-I$(top_srcdir)/xlators/features/bit-rot/src/stub
+
+bit_rot_la_SOURCES = bit-rot.c bit-rot-scrub.c bit-rot-ssm.c \
+		     bit-rot-scrub-status.c
+bit_rot_la_LIBADD = $(top_builddir)/libglusterfs/src/libglusterfs.la \
+	$(top_builddir)/xlators/features/changelog/lib/src/libgfchangelog.la
+
+noinst_HEADERS = bit-rot.h bit-rot-scrub.h bit-rot-bitd-messages.h bit-rot-ssm.h \
+		 bit-rot-scrub-status.h
+
+AM_CFLAGS = -Wall -DBR_RATE_LIMIT_SIGNER $(GF_CFLAGS)
+
+CLEANFILES =
diff --git a/xlators/features/bit-rot/src/bitd/bit-rot-bitd-messages.h b/xlators/features/bit-rot/src/bitd/bit-rot-bitd-messages.h
new file mode 100644
index 00000000000..5bc5103a27c
--- /dev/null
+++ b/xlators/features/bit-rot/src/bitd/bit-rot-bitd-messages.h
@@ -0,0 +1,101 @@
+/*
+ Copyright (c) 2015 Red Hat, Inc. <http://www.redhat.com>
+ This file is part of GlusterFS.
+
+ This file is licensed to you under your choice of the GNU Lesser
+ General Public License, version 3 or any later version (LGPLv3 or
+ later), or the GNU General Public License, version 2 (GPLv2), in all
+ cases as published by the Free Software Foundation.
+ */
+
+#ifndef _BITROT_BITD_MESSAGES_H_
+#define _BITROT_BITD_MESSAGES_H_
+
+#include <glusterfs/glfs-message-id.h>
+
+/* To add new message IDs, append new identifiers at the end of the list.
+ *
+ * Never remove a message ID. If it's not used anymore, you can rename it or
+ * leave it as it is, but not delete it. This is to prevent reutilization of
+ * IDs by other messages.
+ *
+ * The component name must match one of the entries defined in
+ * glfs-message-id.h.
+ */
+
+GLFS_MSGID(BITROT_BITD, BRB_MSG_FD_CREATE_FAILED, BRB_MSG_READV_FAILED,
+           BRB_MSG_BLOCK_READ_FAILED, BRB_MSG_CALC_CHECKSUM_FAILED,
+           BRB_MSG_NO_MEMORY, BRB_MSG_GET_SIGN_FAILED, BRB_MSG_SET_SIGN_FAILED,
+           BRB_MSG_OP_FAILED, BRB_MSG_READ_AND_SIGN_FAILED, BRB_MSG_SIGN_FAILED,
+           BRB_MSG_GET_SUBVOL_FAILED, BRB_MSG_SET_TIMER_FAILED,
+           BRB_MSG_GET_INFO_FAILED, BRB_MSG_PATH_FAILED, BRB_MSG_MARK_BAD_FILE,
+           BRB_MSG_TRIGGER_SIGN, BRB_MSG_REGISTER_FAILED,
+           BRB_MSG_CRAWLING_START, BRB_MSG_SPAWN_FAILED,
+           BRB_MSG_INVALID_SUBVOL_CHILD, BRB_MSG_SKIP_OBJECT, BRB_MSG_NO_CHILD,
+           BRB_MSG_CHECKSUM_MISMATCH, BRB_MSG_MARK_CORRUPTED,
+           BRB_MSG_CRAWLING_FINISH, BRB_MSG_CALC_ERROR, BRB_MSG_LOOKUP_FAILED,
+           BRB_MSG_PARTIAL_VERSION_PRESENCE, BRB_MSG_MEM_ACNT_FAILED,
+           BRB_MSG_TIMER_WHEEL_UNAVAILABLE, BRB_MSG_BITROT_LOADED,
+           BRB_MSG_SCALE_DOWN_FAILED, BRB_MSG_SCALE_UP_FAILED,
+           BRB_MSG_SCALE_DOWN_SCRUBBER, BRB_MSG_SCALING_UP_SCRUBBER,
+           BRB_MSG_UNKNOWN_THROTTLE, BRB_MSG_RATE_LIMIT_INFO,
+           BRB_MSG_SCRUB_INFO, BRB_MSG_CONNECTED_TO_BRICK, BRB_MSG_BRICK_INFO,
+           BRB_MSG_SUBVOL_CONNECT_FAILED, BRB_MSG_INVALID_SUBVOL,
+           BRB_MSG_RESCHEDULE_SCRUBBER_FAILED, BRB_MSG_SCRUB_START,
+           BRB_MSG_SCRUB_FINISH, BRB_MSG_SCRUB_RUNNING,
+           BRB_MSG_SCRUB_RESCHEDULED, BRB_MSG_SCRUB_TUNABLE,
+           BRB_MSG_SCRUB_THREAD_CLEANUP, BRB_MSG_SCRUBBER_CLEANED,
+           BRB_MSG_GENERIC_SSM_INFO, BRB_MSG_ZERO_TIMEOUT_BUG,
+           BRB_MSG_BAD_OBJ_READDIR_FAIL, BRB_MSG_SSM_FAILED,
+           BRB_MSG_SCRUB_WAIT_FAILED, BRB_MSG_TRIGGER_SIGN_FAILED,
+           BRB_MSG_EVENT_UNHANDLED, BRB_MSG_COULD_NOT_SCHEDULE_SCRUB,
+           BRB_MSG_THREAD_CREATION_FAILED, BRB_MSG_MEM_POOL_ALLOC,
+           BRB_MSG_SAVING_HASH_FAILED);
+
+#define BRB_MSG_FD_CREATE_FAILED_STR "failed to create fd for the inode"
+#define BRB_MSG_READV_FAILED_STR "readv failed"
+#define BRB_MSG_BLOCK_READ_FAILED_STR "reading block failed"
+#define BRB_MSG_NO_MEMORY_STR "failed to allocate memory"
+#define BRB_MSG_CALC_CHECKSUM_FAILED_STR "calculating checksum failed"
+#define BRB_MSG_GET_SIGN_FAILED_STR "failed to get the signature"
+#define BRB_MSG_SET_SIGN_FAILED_STR "signing failed"
+#define BRB_MSG_OP_FAILED_STR "failed on object"
+#define BRB_MSG_TRIGGER_SIGN_FAILED_STR "Could not trigger signing"
+#define BRB_MSG_READ_AND_SIGN_FAILED_STR "reading and signing of object failed"
+#define BRB_MSG_SET_TIMER_FAILED_STR "Failed to allocate object expiry timer"
+#define BRB_MSG_GET_SUBVOL_FAILED_STR                                          \
+    "failed to get the subvolume for the brick"
+#define BRB_MSG_PATH_FAILED_STR "path failed"
+#define BRB_MSG_SKIP_OBJECT_STR "Entry is marked corrupted. skipping"
+/* Log text for BRB_MSG_PARTIAL_VERSION_PRESENCE (capitalisation fixed). */
+#define BRB_MSG_PARTIAL_VERSION_PRESENCE_STR                                   \
+    "Partial version xattr presence detected, ignoring"
+#define BRB_MSG_TRIGGER_SIGN_STR "Triggering signing"
+#define BRB_MSG_CRAWLING_START_STR                                             \
+    "Crawling brick, scanning for unsigned objects"
+#define BRB_MSG_CRAWLING_FINISH_STR "Completed crawling brick"
+#define BRB_MSG_REGISTER_FAILED_STR "Register to changelog failed"
+#define BRB_MSG_SPAWN_FAILED_STR "failed to spawn"
+#define BRB_MSG_CONNECTED_TO_BRICK_STR "Connected to brick"
+#define BRB_MSG_LOOKUP_FAILED_STR "lookup on root failed"
+#define BRB_MSG_GET_INFO_FAILED_STR "failed to get stub info"
+#define BRB_MSG_SCRUB_THREAD_CLEANUP_STR "Error cleaning up scanner thread"
+/* Log text for BRB_MSG_SCRUBBER_CLEANED (spelling of "cleaned" fixed). */
+#define BRB_MSG_SCRUBBER_CLEANED_STR "cleaned up scrubber for brick"
+#define BRB_MSG_SUBVOL_CONNECT_FAILED_STR                                      \
+    "callback handler for subvolume failed"
+#define BRB_MSG_MEM_ACNT_FAILED_STR "Memory accounting init failed"
+#define BRB_MSG_EVENT_UNHANDLED_STR "Event unhandled for child"
+#define BRB_MSG_INVALID_SUBVOL_STR "Got event from invalid subvolume"
+#define BRB_MSG_RESCHEDULE_SCRUBBER_FAILED_STR                                 \
+    "on demand scrub schedule failed. Scrubber is not in pending state."
+#define BRB_MSG_COULD_NOT_SCHEDULE_SCRUB_STR                                   \
+    "Could not schedule ondemand scrubbing. Scrubbing will continue "          \
+    "according to old frequency."
+#define BRB_MSG_THREAD_CREATION_FAILED_STR "thread creation failed"
+#define BRB_MSG_RATE_LIMIT_INFO_STR "Rate Limit Info"
+#define BRB_MSG_MEM_POOL_ALLOC_STR "failed to allocate mem-pool for timer"
+#define BRB_MSG_NO_CHILD_STR "FATAL: no children"
+#define BRB_MSG_TIMER_WHEEL_UNAVAILABLE_STR "global timer wheel unavailable"
+#define BRB_MSG_BITROT_LOADED_STR "bit-rot xlator loaded"
+#define BRB_MSG_SAVING_HASH_FAILED_STR                                         \
+    "failed to allocate memory for saving hash of the object"
+#endif /* !_BITROT_BITD_MESSAGES_H_ */
diff --git a/xlators/features/bit-rot/src/bitd/bit-rot-scrub-status.c b/xlators/features/bit-rot/src/bitd/bit-rot-scrub-status.c
new file mode 100644
index 00000000000..5cef2ffa5e5
--- /dev/null
+++ b/xlators/features/bit-rot/src/bitd/bit-rot-scrub-status.c
@@ -0,0 +1,78 @@
+/*
+  Copyright (c) 2016 Red Hat, Inc. <http://www.redhat.com>
+  This file is part of GlusterFS.
+
+  This file is licensed to you under your choice of the GNU Lesser
+  General Public License, version 3 or any later version (LGPLv3 or
+  later), or the GNU General Public License, version 2 (GPLv2), in all
+  cases as published by the Free Software Foundation.
+*/
+
+#include <string.h>
+#include <stdio.h>
+
+#include "bit-rot-scrub-status.h"
+
+/*
+ * Bump the unsigned-files counter by one under the statistics lock.
+ * A NULL @scrub_stat is ignored.
+ */
+void
+br_inc_unsigned_file_count(br_scrub_stats_t *scrub_stat)
+{
+    if (scrub_stat != NULL) {
+        pthread_mutex_lock(&scrub_stat->lock);
+        scrub_stat->unsigned_files += 1;
+        pthread_mutex_unlock(&scrub_stat->lock);
+    }
+}
+
+/*
+ * Bump the scrubbed-files counter by one under the statistics lock.
+ * A NULL @scrub_stat is ignored.
+ */
+void
+br_inc_scrubbed_file(br_scrub_stats_t *scrub_stat)
+{
+    if (scrub_stat != NULL) {
+        pthread_mutex_lock(&scrub_stat->lock);
+        scrub_stat->scrubbed_files += 1;
+        pthread_mutex_unlock(&scrub_stat->lock);
+    }
+}
+
+/*
+ * Record @time as the scrub start time under the statistics lock.
+ * A NULL @scrub_stat is ignored.
+ */
+void
+br_update_scrub_start_time(br_scrub_stats_t *scrub_stat, time_t time)
+{
+    if (scrub_stat != NULL) {
+        pthread_mutex_lock(&scrub_stat->lock);
+        scrub_stat->scrub_start_time = time;
+        pthread_mutex_unlock(&scrub_stat->lock);
+    }
+}
+
+/*
+ * Record the completion of a scrub run.
+ *
+ * @scrub_stat: statistics to update; NULL is ignored.
+ * @timestr:    printable completion time; ignored when NULL or when it
+ *              does not fit (with its terminator) in last_scrub_time.
+ * @time:       completion time, stored as scrub_end_time.
+ *
+ * Under the statistics lock the end time is stored, the duration is
+ * recomputed as end - start and @timestr is copied into last_scrub_time.
+ */
+void
+br_update_scrub_finish_time(br_scrub_stats_t *scrub_stat, char *timestr,
+                            time_t time)
+{
+    size_t lst_size = 0;
+
+    if (!scrub_stat || !timestr)
+        return;
+
+    /* size_t keeps the comparison with strlen() free of sign conversion. */
+    lst_size = sizeof(scrub_stat->last_scrub_time);
+    if (strlen(timestr) >= lst_size)
+        return;
+
+    pthread_mutex_lock(&scrub_stat->lock);
+    {
+        scrub_stat->scrub_end_time = time;
+
+        /* scrub_duration is unsigned: clamp to zero rather than wrapping
+         * around should the end time precede the recorded start time. */
+        if (scrub_stat->scrub_end_time >= scrub_stat->scrub_start_time)
+            scrub_stat->scrub_duration = scrub_stat->scrub_end_time -
+                                         scrub_stat->scrub_start_time;
+        else
+            scrub_stat->scrub_duration = 0;
+
+        snprintf(scrub_stat->last_scrub_time, lst_size, "%s", timestr);
+    }
+    pthread_mutex_unlock(&scrub_stat->lock);
+}
diff --git a/xlators/features/bit-rot/src/bitd/bit-rot-scrub-status.h b/xlators/features/bit-rot/src/bitd/bit-rot-scrub-status.h
new file mode 100644
index 00000000000..f022aa831eb
--- /dev/null
+++ b/xlators/features/bit-rot/src/bitd/bit-rot-scrub-status.h
@@ -0,0 +1,50 @@
+/*
+   Copyright (c) 2016 Red Hat, Inc. <http://www.redhat.com>
+   This file is part of GlusterFS.
+
+   This file is licensed to you under your choice of the GNU Lesser
+   General Public License, version 3 or any later version (LGPLv3 or
+   later), or the GNU General Public License, version 2 (GPLv2), in all
+   cases as published by the Free Software Foundation.
+*/
+
+#ifndef __BIT_ROT_SCRUB_STATUS_H__
+#define __BIT_ROT_SCRUB_STATUS_H__
+
+#include <stdint.h>
+#include <sys/time.h>
+#include <pthread.h>
+
+#include <glusterfs/common-utils.h>
+
+/*
+ * Scrubber statistics. The helper functions declared in this header update
+ * the counters and timestamps while holding 'lock'.
+ */
+struct br_scrub_stats {
+    uint64_t scrubbed_files; /* Total number of scrubbed files. */
+
+    uint64_t unsigned_files; /* Total number of unsigned files. */
+
+    uint64_t scrub_duration; /* Duration of last scrub. */
+
+    char last_scrub_time[GF_TIMESTR_SIZE]; /* Last scrub completion time. */
+
+    time_t scrub_start_time; /* Scrubbing starting time. */
+
+    time_t scrub_end_time; /* Scrubbing finishing time. */
+
+    int8_t scrub_running; /* Whether scrub running or not. */
+
+    pthread_mutex_t lock; /* Serialises updates to the fields above. */
+};
+
+typedef struct br_scrub_stats br_scrub_stats_t;
+
+void
+br_inc_unsigned_file_count(br_scrub_stats_t *scrub_stat);
+void
+br_inc_scrubbed_file(br_scrub_stats_t *scrub_stat);
+void
+br_update_scrub_start_time(br_scrub_stats_t *scrub_stat, time_t time);
+void
+br_update_scrub_finish_time(br_scrub_stats_t *scrub_stat, char *timestr,
+                            time_t time);
+
+#endif /* __BIT_ROT_SCRUB_STATUS_H__ */
diff --git a/xlators/features/bit-rot/src/bitd/bit-rot-scrub.c b/xlators/features/bit-rot/src/bitd/bit-rot-scrub.c
new file mode 100644
index 00000000000..289dd53f610
--- /dev/null
+++ b/xlators/features/bit-rot/src/bitd/bit-rot-scrub.c
@@ -0,0 +1,2070 @@
+/*
+   Copyright (c) 2015 Red Hat, Inc. <http://www.redhat.com>
+   This file is part of GlusterFS.
+
+   This file is licensed to you under your choice of the GNU Lesser
+   General Public License, version 3 or any later version (LGPLv3 or
+   later), or the GNU General Public License, version 2 (GPLv2), in all
+   cases as published by the Free Software Foundation.
+*/
+
+#include <math.h>
+#include <ctype.h>
+#include <sys/uio.h>
+
+#include <glusterfs/glusterfs.h>
+#include <glusterfs/logging.h>
+#include <glusterfs/common-utils.h>
+
+#include "bit-rot-scrub.h"
+#include <pthread.h>
+#include "bit-rot-bitd-messages.h"
+#include "bit-rot-scrub-status.h"
+#include <glusterfs/events.h>
+
+/* One scrubber thread, linkable into a list of scrubbers. */
+struct br_scrubbers {
+    pthread_t scrubthread; /* thread handle */
+
+    struct list_head list; /* list linkage */
+};
+
+/* A single directory entry handed to the scrubber for processing. */
+struct br_fsscan_entry {
+    void *data; /* used as a br_child_t * by br_scrubber_scrub_begin() */
+
+    loc_t parent; /* location of the directory containing the entry */
+
+    gf_dirent_t *entry; /* the directory entry to scrub */
+
+    struct br_scanfs *fsscan; /* backpointer to subvolume scanner */
+
+    struct list_head list; /* list linkage */
+};
+
+/**
+ * Read the signature extended attribute of the object behind @fd.
+ *
+ * On success 0 is returned, @xattr holds the fetched dictionary and @sign
+ * points into it; the caller must therefore unref @xattr only after it is
+ * done with @sign. On failure -1 is returned and no reference is left for
+ * the caller to drop.
+ */
+static int32_t
+bitd_fetch_signature(xlator_t *this, br_child_t *child, fd_t *fd,
+                     dict_t **xattr, br_isignature_out_t **sign)
+{
+    int32_t rc = syncop_fgetxattr(child->xl, fd, xattr,
+                                  GLUSTERFS_GET_OBJECT_SIGNATURE, NULL, NULL);
+
+    if (rc < 0) {
+        br_log_object(this, "fgetxattr", fd->inode->gfid, -rc);
+        return -1;
+    }
+
+    rc = dict_get_ptr(*xattr, GLUSTERFS_GET_OBJECT_SIGNATURE, (void **)sign);
+    if (rc == 0)
+        return 0;
+
+    gf_msg(this->name, GF_LOG_ERROR, 0, BRB_MSG_GET_SIGN_FAILED,
+           "failed to extract signature info [GFID: %s]",
+           uuid_utoa(fd->inode->gfid));
+
+    /* the dictionary is of no use without the signature in it */
+    dict_unref(*xattr);
+    return -1;
+}
+
+/**
+ * POST COMPUTE CHECK
+ *
+ * Checks to be performed after the checksum has been calculated and before
+ * it is verified against the stored signature.
+ * Object is skipped if:
+ *  - has stale signature
+ *  - mismatches the version cached in the pre-compute check
+ *
+ * On success 0 is returned and *@signature points to a freshly allocated
+ * copy of the stored signature, which the caller owns. On any failure
+ * (signature cannot be fetched, object must be skipped, out of memory)
+ * -1 is returned and *@signature must not be used.
+ *
+ * Unless @skip_stat is set, the unsigned-file counter in @scrub_stat is
+ * incremented when the signature is missing, stale or re-versioned.
+ */
+
+int32_t
+bitd_scrub_post_compute_check(xlator_t *this, br_child_t *child, fd_t *fd,
+                              unsigned long version,
+                              br_isignature_out_t **signature,
+                              br_scrub_stats_t *scrub_stat,
+                              gf_boolean_t skip_stat)
+{
+    int32_t ret = 0;
+    size_t signlen = 0;
+    dict_t *xattr = NULL;
+    br_isignature_out_t *signptr = NULL;
+
+    ret = bitd_fetch_signature(this, child, fd, &xattr, &signptr);
+    if (ret < 0) {
+        if (!skip_stat)
+            br_inc_unsigned_file_count(scrub_stat);
+        goto out;
+    }
+
+    /**
+     * Either the object got dirtied during the time the signature was
+     * calculated OR the version we saved during pre-compute check does
+     * not match now, implying that the object got dirtied and signed in
+     * between scrubs pre & post compute checks (checksum window).
+     *
+     * The log entry looks pretty ugly, but helps in debugging..
+     */
+    if (signptr->stale || (signptr->version != version)) {
+        if (!skip_stat)
+            br_inc_unsigned_file_count(scrub_stat);
+        gf_msg_debug(this->name, 0,
+                     "<STAGE: POST> Object [GFID: %s] "
+                     "either has a stale signature OR underwent "
+                     "signing during checksumming {Stale: %d | "
+                     "Version: %lu,%lu}",
+                     uuid_utoa(fd->inode->gfid), (signptr->stale) ? 1 : 0,
+                     version, signptr->version);
+        ret = -1;
+        goto unref_dict;
+    }
+
+    signlen = signptr->signaturelen;
+    *signature = GF_MALLOC(sizeof(br_isignature_out_t) + signlen,
+                           gf_common_mt_char);
+    if (!*signature) {
+        /* Do not hand a NULL destination to memcpy(); report and bail. */
+        gf_msg(this->name, GF_LOG_ERROR, 0, BRB_MSG_NO_MEMORY,
+               "failed to allocate memory for signature [GFID: %s]",
+               uuid_utoa(fd->inode->gfid));
+        ret = -1;
+        goto unref_dict;
+    }
+
+    (void)memcpy(*signature, signptr, sizeof(br_isignature_out_t) + signlen);
+
+    (*signature)->signaturelen = signlen;
+
+unref_dict:
+    /* signptr points into xattr, so it is dead from here on. */
+    dict_unref(xattr);
+out:
+    return ret;
+}
+
+/*
+ * Fetch the object's signature and report its staleness flag (as 0/1) in
+ * @stale and its version in @version.
+ *
+ * Returns the result of bitd_fetch_signature(); on failure the outputs are
+ * left untouched and, unless @skip_stat is set, the unsigned-file counter
+ * in @scrub_stat is incremented.
+ */
+static int32_t
+bitd_signature_staleness(xlator_t *this, br_child_t *child, fd_t *fd,
+                         int *stale, unsigned long *version,
+                         br_scrub_stats_t *scrub_stat, gf_boolean_t skip_stat)
+{
+    br_isignature_out_t *sig = NULL;
+    dict_t *dict = NULL;
+    int32_t rc = bitd_fetch_signature(this, child, fd, &dict, &sig);
+
+    if (rc < 0) {
+        if (!skip_stat)
+            br_inc_unsigned_file_count(scrub_stat);
+        return rc;
+    }
+
+    /**
+     * the version is kept for validation in the post compute stage
+     * c.f. bitd_scrub_post_compute_check()
+     */
+    *version = sig->version;
+    *stale = (sig->stale != 0);
+
+    dict_unref(dict);
+
+    return rc;
+}
+
+/**
+ * PRE COMPUTE CHECK
+ *
+ * Gatekeeper run before an object's checksum is calculated.
+ * The object is skipped (-1 is returned) when:
+ *  - it is already marked corrupted
+ *  - its signature cannot be fetched
+ *  - its signature is stale
+ *
+ * On success 0 is returned and @version holds the signed version, to be
+ * re-validated by the post compute check.
+ */
+int32_t
+bitd_scrub_pre_compute_check(xlator_t *this, br_child_t *child, fd_t *fd,
+                             unsigned long *version,
+                             br_scrub_stats_t *scrub_stat,
+                             gf_boolean_t skip_stat)
+{
+    int is_stale = 0;
+    int32_t rc = -1;
+
+    if (bitd_is_bad_file(this, child, NULL, fd)) {
+        gf_msg(this->name, GF_LOG_WARNING, 0, BRB_MSG_SKIP_OBJECT,
+               "Object [GFID: %s] is marked corrupted, skipping..",
+               uuid_utoa(fd->inode->gfid));
+        return rc;
+    }
+
+    rc = bitd_signature_staleness(this, child, fd, &is_stale, version,
+                                  scrub_stat, skip_stat);
+    if (rc != 0 || !is_stale)
+        return rc;
+
+    /* signature fetched fine, but it is stale: count and skip */
+    if (!skip_stat)
+        br_inc_unsigned_file_count(scrub_stat);
+
+    gf_msg_debug(this->name, 0,
+                 "<STAGE: PRE> Object [GFID: %s] "
+                 "has stale signature",
+                 uuid_utoa(fd->inode->gfid));
+
+    return -1;
+}
+
+/**
+ * Compare the stored signature of an object with a freshly calculated
+ * checksum and, on mismatch, mark the object as bad on the brick.
+ *
+ * @sign:         stored signature of the object
+ * @md:           calculated digest, at least sign->signaturelen bytes long
+ * @linked_inode: inode of the object (used for its GFID in log messages)
+ * @fd:           open fd the bad-file marker is set on
+ * @child:        brick the object lives on
+ * @loc:          location of the object (used for its path in messages)
+ *
+ * Returns 0 when the checksums match. On mismatch the return value is the
+ * outcome of setting the bad-file marker: 0 when the object was marked,
+ * non-zero when marking failed. -1 is returned for invalid arguments.
+ */
+/* static int */
+int
+bitd_compare_ckum(xlator_t *this, br_isignature_out_t *sign, unsigned char *md,
+                  inode_t *linked_inode, gf_dirent_t *entry, fd_t *fd,
+                  br_child_t *child, loc_t *loc)
+{
+    int ret = -1;
+    dict_t *xattr = NULL;
+
+    GF_VALIDATE_OR_GOTO("bit-rot", this, out);
+    GF_VALIDATE_OR_GOTO(this->name, sign, out);
+    GF_VALIDATE_OR_GOTO(this->name, fd, out);
+    GF_VALIDATE_OR_GOTO(this->name, child, out);
+    GF_VALIDATE_OR_GOTO(this->name, linked_inode, out);
+    GF_VALIDATE_OR_GOTO(this->name, md, out);
+    GF_VALIDATE_OR_GOTO(this->name, entry, out);
+    /* loc->path is dereferenced in every message below */
+    GF_VALIDATE_OR_GOTO(this->name, loc, out);
+
+    /* Digests are raw bytes and may contain NULs: strncmp() would stop at
+     * the first one and could report a match for differing checksums, so
+     * the full length has to be compared with memcmp(). */
+    if (memcmp(sign->signature, md, sign->signaturelen) == 0) {
+        gf_msg_debug(this->name, 0,
+                     "%s [GFID: %s | Brick: %s] "
+                     "matches calculated checksum",
+                     loc->path, uuid_utoa(linked_inode->gfid),
+                     child->brick_path);
+        return 0;
+    }
+
+    gf_msg(this->name, GF_LOG_DEBUG, 0, BRB_MSG_CHECKSUM_MISMATCH,
+           "Object checksum mismatch: %s [GFID: %s | Brick: %s]", loc->path,
+           uuid_utoa(linked_inode->gfid), child->brick_path);
+    gf_msg(this->name, GF_LOG_ALERT, 0, BRB_MSG_CHECKSUM_MISMATCH,
+           "CORRUPTION DETECTED: Object %s {Brick: %s | GFID: %s}", loc->path,
+           child->brick_path, uuid_utoa(linked_inode->gfid));
+
+    /* Perform bad-file marking */
+    xattr = dict_new();
+    if (!xattr) {
+        ret = -1;
+        goto out;
+    }
+
+    ret = dict_set_int32(xattr, BITROT_OBJECT_BAD_KEY, _gf_true);
+    if (ret) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, BRB_MSG_MARK_BAD_FILE,
+               "Error setting bad-file marker for %s [GFID: %s | "
+               "Brick: %s]",
+               loc->path, uuid_utoa(linked_inode->gfid), child->brick_path);
+        goto dictfree;
+    }
+
+    gf_msg(this->name, GF_LOG_ALERT, 0, BRB_MSG_MARK_CORRUPTED,
+           "Marking"
+           " %s [GFID: %s | Brick: %s] as corrupted..",
+           loc->path, uuid_utoa(linked_inode->gfid), child->brick_path);
+    gf_event(EVENT_BITROT_BAD_FILE, "gfid=%s;path=%s;brick=%s",
+             uuid_utoa(linked_inode->gfid), loc->path, child->brick_path);
+    ret = syncop_fsetxattr(child->xl, fd, xattr, 0, NULL, NULL);
+    if (ret)
+        gf_msg(this->name, GF_LOG_ERROR, 0, BRB_MSG_MARK_BAD_FILE,
+               "Error marking object %s [GFID: %s] as corrupted", loc->path,
+               uuid_utoa(linked_inode->gfid));
+
+dictfree:
+    dict_unref(xattr);
+out:
+    return ret;
+}
+
+/**
+ * "The Scrubber"
+ *
+ * Perform signature validation for a given object with the assumption
+ * that the signature is SHA256 (because signer as of now _always_
+ * signs with SHA256).
+ */
+int
+br_scrubber_scrub_begin(xlator_t *this, struct br_fsscan_entry *fsentry)
+{
+    int32_t ret = -1;
+    fd_t *fd = NULL;
+    loc_t loc = {
+        0,
+    };
+    struct iatt iatt = {
+        0,
+    };
+    struct iatt parent_buf = {
+        0,
+    };
+    pid_t pid = 0;
+    br_child_t *child = NULL;
+    unsigned char *md = NULL;
+    inode_t *linked_inode = NULL;
+    br_isignature_out_t *sign = NULL;
+    unsigned long signedversion = 0;
+    gf_dirent_t *entry = NULL;
+    br_private_t *priv = NULL;
+    loc_t *parent = NULL;
+    gf_boolean_t skip_stat = _gf_false;
+    uuid_t shard_root_gfid = {
+        0,
+    };
+
+    GF_VALIDATE_OR_GOTO("bit-rot", fsentry, out);
+
+    entry = fsentry->entry;
+    parent = &fsentry->parent;
+    child = fsentry->data;
+
+    priv = this->private;
+
+    GF_VALIDATE_OR_GOTO("bit-rot", entry, out);
+    GF_VALIDATE_OR_GOTO("bit-rot", parent, out);
+    GF_VALIDATE_OR_GOTO("bit-rot", child, out);
+    GF_VALIDATE_OR_GOTO("bit-rot", priv, out);
+
+    pid = GF_CLIENT_PID_SCRUB;
+
+    ret = br_prepare_loc(this, child, parent, entry, &loc);
+    if (!ret)
+        goto out;
+
+    syncopctx_setfspid(&pid);
+
+    ret = syncop_lookup(child->xl, &loc, &iatt, &parent_buf, NULL, NULL);
+    if (ret) {
+        br_log_object_path(this, "lookup", loc.path, -ret);
+        goto out;
+    }
+
+    linked_inode = inode_link(loc.inode, parent->inode, loc.name, &iatt);
+    if (linked_inode)
+        inode_lookup(linked_inode);
+
+    gf_msg_debug(this->name, 0, "Scrubbing object %s [GFID: %s]", entry->d_name,
+                 uuid_utoa(linked_inode->gfid));
+
+    if (iatt.ia_type != IA_IFREG) {
+        gf_msg_debug(this->name, 0, "%s is not a regular file", entry->d_name);
+        ret = 0;
+        goto unref_inode;
+    }
+
+    if (IS_DHT_LINKFILE_MODE((&iatt))) {
+        gf_msg_debug(this->name, 0, "%s is a dht sticky bit file",
+                     entry->d_name);
+        ret = 0;
+        goto unref_inode;
+    }
+
+    /* skip updating scrub statistics for shard entries */
+    gf_uuid_parse(SHARD_ROOT_GFID, shard_root_gfid);
+    if (gf_uuid_compare(loc.pargfid, shard_root_gfid) == 0)
+        skip_stat = _gf_true;
+
+    /**
+     * open() an fd for subsequent operations
+     */
+    fd = fd_create(linked_inode, 0);
+    if (!fd) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, BRB_MSG_FD_CREATE_FAILED,
+               "failed to create fd for inode %s",
+               uuid_utoa(linked_inode->gfid));
+        goto unref_inode;
+    }
+
+    ret = syncop_open(child->xl, &loc, O_RDWR, fd, NULL, NULL);
+    if (ret) {
+        br_log_object(this, "open", linked_inode->gfid, -ret);
+        ret = -1;
+        goto unrefd;
+    }
+
+    fd_bind(fd);
+
+    /**
+     * perform pre compute checks before initiating checksum
+     * computation
+     *  - presence of bad object
+     *  - signature staleness
+     */
+    ret = bitd_scrub_pre_compute_check(this, child, fd, &signedversion,
+                                       &priv->scrub_stat, skip_stat);
+    if (ret)
+        goto unrefd; /* skip this object */
+
+    /* if all's good, proceed to calculate the hash */
+    md = GF_MALLOC(SHA256_DIGEST_LENGTH, gf_common_mt_char);
+    if (!md)
+        goto unrefd;
+
+    ret = br_calculate_obj_checksum(md, child, fd, &iatt);
+    if (ret) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, BRB_MSG_CALC_ERROR,
+               "error calculating hash for object [GFID: %s]",
+               uuid_utoa(fd->inode->gfid));
+        ret = -1;
+        goto free_md;
+    }
+
+    /**
+     * perform post compute checks as an object's signature may have
+     * become stale while scrubber calculated checksum.
+     */
+    ret = bitd_scrub_post_compute_check(this, child, fd, signedversion, &sign,
+                                        &priv->scrub_stat, skip_stat);
+    if (ret)
+        goto free_md;
+
+    ret = bitd_compare_ckum(this, sign, md, linked_inode, entry, fd, child,
+                            &loc);
+
+    if (!skip_stat)
+        br_inc_scrubbed_file(&priv->scrub_stat);
+
+    GF_FREE(sign); /* allocated on post-compute */
+
+    /** fd_unref() takes care of closing fd.. like syncop_close() */
+
+free_md:
+    GF_FREE(md);
+unrefd:
+    fd_unref(fd);
+unref_inode:
+    inode_unref(linked_inode);
+out:
+    loc_wipe(&loc);
+    return ret;
+}
+
+/* Cancellation cleanup handler: unlocks the pthread mutex passed as arg. */
+static void
+_br_lock_cleaner(void *arg)
+{
+    /* arg is the mutex that was registered with pthread_cleanup_push() */
+    pthread_mutex_unlock((pthread_mutex_t *)arg);
+}
+
+/**
+ * Hand the batch collected on fsscan->queued over to the scrubber threads
+ * (list_replace_init() onto fsscan->ready, then broadcast fsscrub->cond) and
+ * block on fsscan->waitcond until fsscan->entries has dropped to zero.
+ */
+static void
+wait_for_scrubbing(xlator_t *this, struct br_scanfs *fsscan)
+{
+    br_private_t *conf = this->private;
+    struct br_scrubber *scrubber = &conf->fsscrub;
+
+    pthread_cleanup_push(_br_lock_cleaner, &fsscan->waitlock);
+    pthread_mutex_lock(&fsscan->waitlock);
+    {
+        pthread_cleanup_push(_br_lock_cleaner, &scrubber->mutex);
+        pthread_mutex_lock(&scrubber->mutex);
+        {
+            list_replace_init(&fsscan->queued, &fsscan->ready);
+            pthread_cond_broadcast(&scrubber->cond); /* wake up scrubbers */
+        }
+        pthread_mutex_unlock(&scrubber->mutex);
+        pthread_cleanup_pop(0);
+
+        while (fsscan->entries != 0)
+            pthread_cond_wait(&fsscan->waitcond, &fsscan->waitlock);
+    }
+    pthread_mutex_unlock(&fsscan->waitlock);
+    pthread_cleanup_pop(0);
+}
+
+static void
+_br_fsscan_inc_entry_count(struct br_scanfs *fsscan)
+{
+    fsscan->entries++; /* no locking here; callers in view hold entrylock */
+}
+
+/* Drop one outstanding entry; on reaching zero, signal fsscan->waitcond. */
+static void
+_br_fsscan_dec_entry_count(struct br_scanfs *fsscan)
+{
+    fsscan->entries--;
+    if (fsscan->entries != 0)
+        return;
+    pthread_mutex_lock(&fsscan->waitlock);
+    pthread_cond_signal(&fsscan->waitcond);
+    pthread_mutex_unlock(&fsscan->waitlock);
+}
+
+static void
+_br_fsscan_collect_entry(struct br_scanfs *fsscan,
+                         struct br_fsscan_entry *fsentry)
+{
+    list_add_tail(&fsentry->list, &fsscan->queued); /* append to the batch */
+    _br_fsscan_inc_entry_count(fsscan);             /* and account for it */
+}
+
+#define NR_ENTRIES (1 << 7) /* ..bulk scrubbing */
+
+/**
+ * syncop_ftw() callback: copy the dirent and its parent loc into a new
+ * br_fsscan_entry and queue it; once NR_ENTRIES entries are outstanding,
+ * block in wait_for_scrubbing(). Returns 0 on success, -1 on failure.
+ */
+int
+br_fsscanner_handle_entry(xlator_t *subvol, gf_dirent_t *entry, loc_t *parent,
+                          void *data)
+{
+    int32_t ret = -1;
+    int scrub = 0;
+    br_child_t *child = NULL;
+    xlator_t *this = NULL;
+    struct br_scanfs *fsscan = NULL;
+    struct br_fsscan_entry *fsentry = NULL;
+
+    GF_VALIDATE_OR_GOTO("bit-rot", subvol, error_return);
+    GF_VALIDATE_OR_GOTO("bit-rot", data, error_return);
+
+    child = data;
+    this = child->this;
+    fsscan = &child->fsscan;
+
+    _mask_cancellation();
+
+    fsentry = GF_CALLOC(1, sizeof(*fsentry), gf_br_mt_br_fsscan_entry_t);
+    if (!fsentry)
+        goto unmask;
+
+    fsentry->data = data;
+    fsentry->fsscan = &child->fsscan;
+
+    /* copy parent loc */
+    ret = loc_copy(&fsentry->parent, parent);
+    if (ret)
+        goto dealloc;
+
+    /* copy child entry */
+    fsentry->entry = entry_copy(entry);
+    if (!fsentry->entry)
+        goto locwipe;
+
+    INIT_LIST_HEAD(&fsentry->list);
+
+    LOCK(&fsscan->entrylock);
+    {
+        _br_fsscan_collect_entry(fsscan, fsentry);
+        /* >= rather than ==: cancelled scrubbers may requeue entries */
+        if (fsscan->entries >= NR_ENTRIES)
+            scrub = 1;
+    }
+    UNLOCK(&fsscan->entrylock);
+
+    _unmask_cancellation();
+
+    if (scrub)
+        wait_for_scrubbing(this, fsscan);
+    return 0;
+
+locwipe:
+    loc_wipe(&fsentry->parent);
+dealloc:
+    GF_FREE(fsentry);
+unmask: /* every exit taken after _mask_cancellation() must undo it */
+    _unmask_cancellation();
+error_return:
+    return -1;
+}
+
+/**
+ * Pause scrubbing: delete the scrub timer and record the resulting state.
+ * gf_tw_del_timer() returning 0 is treated as "a scrub is in progress"
+ * (state STALLED); any other return means the scrubber is PAUSED.
+ * Always returns 0.
+ */
+int32_t
+br_fsscan_deactivate(xlator_t *this)
+{
+    br_private_t *conf = this->private;
+    struct br_monitor *monitor = &conf->scrub_monitor;
+    br_scrub_state_t next = 0;
+
+    if (gf_tw_del_timer(conf->timer_wheel, monitor->timer) != 0) {
+        next = BR_SCRUB_STATE_PAUSED;
+        gf_msg(this->name, GF_LOG_INFO, 0, BRB_MSG_SCRUB_INFO,
+               "Scrubber paused");
+    } else {
+        next = BR_SCRUB_STATE_STALLED;
+        gf_msg(this->name, GF_LOG_INFO, 0, BRB_MSG_SCRUB_INFO,
+               "Volume is under active scrubbing. Pausing scrub..");
+    }
+
+    _br_monitor_set_scrub_state(monitor, next);
+    return 0;
+}
+
+/**
+ * Log the current time as the scrub start (sfx "started") or finish (any
+ * other sfx) time and record it in priv->scrub_stat.
+ */
+static void
+br_scrubber_log_time(xlator_t *this, const char *sfx)
+{
+    char timestr[GF_TIMESTR_SIZE] = {0};
+    br_private_t *conf = this->private;
+    time_t now = gf_time();
+
+    gf_time_fmt(timestr, sizeof(timestr), now, gf_timefmt_FT);
+
+    if (strcasecmp(sfx, "started") != 0) {
+        br_update_scrub_finish_time(&conf->scrub_stat, timestr, now);
+        gf_msg(this->name, GF_LOG_INFO, 0, BRB_MSG_SCRUB_FINISH,
+               "Scrubbing %s at %s", sfx, timestr);
+        return;
+    }
+
+    br_update_scrub_start_time(&conf->scrub_stat, now);
+    gf_msg(this->name, GF_LOG_INFO, 0, BRB_MSG_SCRUB_START,
+           "Scrubbing %s at %s", sfx, timestr);
+}
+
+/* Debug-log that scrubbing of child's brick <sfx> (e.g. "started") now. */
+static void
+br_fsscanner_log_time(xlator_t *this, br_child_t *child, const char *sfx)
+{
+    char timestr[GF_TIMESTR_SIZE] = {
+        0,
+    };
+    time_t now = 0;
+
+    now = gf_time();
+    gf_time_fmt(timestr, sizeof(timestr), now, gf_timefmt_FT);
+
+    /*
+     * One message serves every phase: sfx itself carries the
+     * "started"/"finished" wording, so there is nothing to branch on.
+     */
+    gf_msg_debug(this->name, 0, "Scrubbing \"%s\" %s at %s", child->brick_path,
+                 sfx, timestr);
+}
+
+void
+br_child_set_scrub_state(br_child_t *child, gf_boolean_t state)
+{
+    child->active_scrubbing = state; /* callers in view hold child->lock */
+}
+
+/**
+ * Block until the monitor's kick flag is set, then register this child as
+ * actively scrubbing (bumps active_child_count while holding wakelock).
+ */
+static void
+br_fsscanner_wait_until_kicked(xlator_t *this, br_child_t *child)
+{
+    br_private_t *conf = this->private;
+    struct br_monitor *monitor = &conf->scrub_monitor;
+
+    pthread_cleanup_push(_br_lock_cleaner, &monitor->wakelock);
+    pthread_mutex_lock(&monitor->wakelock);
+    {
+        while (!monitor->kick)
+            pthread_cond_wait(&monitor->wakecond, &monitor->wakelock);
+
+        /* Child lock is to synchronize with disconnect events */
+        pthread_cleanup_push(_br_lock_cleaner, &child->lock);
+        pthread_mutex_lock(&child->lock);
+        {
+            monitor->active_child_count++;
+            br_child_set_scrub_state(child, _gf_true);
+        }
+        pthread_mutex_unlock(&child->lock);
+        pthread_cleanup_pop(0);
+    }
+    pthread_mutex_unlock(&monitor->wakelock);
+    pthread_cleanup_pop(0);
+}
+
+/**
+ * Scrub-start bookkeeping, done under scrub_monitor->lock: promote a PENDING
+ * monitor to ACTIVE, log/record the start time and flag the scrub as running.
+ */
+static void
+br_scrubber_entry_control(xlator_t *this)
+{
+    br_private_t *conf = this->private;
+    struct br_monitor *monitor = &conf->scrub_monitor;
+
+    LOCK(&monitor->lock);
+    {
+        if (monitor->state == BR_SCRUB_STATE_PENDING)
+            monitor->state = BR_SCRUB_STATE_ACTIVE;
+        br_scrubber_log_time(this, "started");
+        conf->scrub_stat.scrub_running = 1;
+    }
+    UNLOCK(&monitor->lock);
+}
+
+/**
+ * Scrub-finish bookkeeping: record the finish time, clear the running flag
+ * and re-arm the timer if the monitor is still ACTIVE (else just log).
+ */
+static void
+br_scrubber_exit_control(xlator_t *this)
+{
+    br_private_t *conf = this->private;
+    struct br_monitor *monitor = &conf->scrub_monitor;
+    gf_boolean_t rearmed = _gf_false;
+
+    LOCK(&monitor->lock);
+    {
+        br_scrubber_log_time(this, "finished");
+        conf->scrub_stat.scrub_running = 0;
+        rearmed = (monitor->state == BR_SCRUB_STATE_ACTIVE);
+        if (rearmed)
+            (void)br_fsscan_activate(this);
+    }
+    UNLOCK(&monitor->lock);
+
+    if (!rearmed)
+        gf_msg(this->name, GF_LOG_INFO, 0, BRB_MSG_SCRUB_INFO,
+               "Volume waiting to get rescheduled..");
+}
+
+static void
+br_fsscanner_entry_control(xlator_t *this, br_child_t *child)
+{
+    br_fsscanner_log_time(this, child, "started"); /* debug-level log only */
+}
+
+static void
+br_fsscanner_exit_control(xlator_t *this, br_child_t *child)
+{
+    br_private_t *priv = NULL;
+    struct br_monitor *scrub_monitor = NULL;
+    /* End-of-scrub barrier: the last child to finish releases the others. */
+    priv = this->private;
+    scrub_monitor = &priv->scrub_monitor;
+
+    if (!_br_is_child_connected(child)) {
+        gf_msg(this->name, GF_LOG_WARNING, 0, BRB_MSG_SCRUB_INFO,
+               "Brick [%s] disconnected while scrubbing. Scrubbing "
+               "might be incomplete",
+               child->brick_path);
+    }
+
+    br_fsscanner_log_time(this, child, "finished");
+
+    pthread_cleanup_push(_br_lock_cleaner, &scrub_monitor->wakelock);
+    pthread_mutex_lock(&scrub_monitor->wakelock);
+    {
+        scrub_monitor->active_child_count--; /* this child is done */
+        pthread_cleanup_push(_br_lock_cleaner, &child->lock);
+        pthread_mutex_lock(&child->lock);
+        {
+            br_child_set_scrub_state(child, _gf_false);
+        }
+        pthread_mutex_unlock(&child->lock);
+        pthread_cleanup_pop(0);
+
+        if (scrub_monitor->active_child_count == 0) {
+            /* The last child has finished scrubbing.
+             * Set the kick to false and  wake up other
+             * children who are waiting for the last
+             * child to complete scrubbing.
+             */
+            scrub_monitor->kick = _gf_false;
+            pthread_cond_broadcast(&scrub_monitor->wakecond);
+
+            /* Signal monitor thread waiting for the all
+             * the children to finish scrubbing.
+             */
+            pthread_cleanup_push(_br_lock_cleaner, &scrub_monitor->donelock);
+            pthread_mutex_lock(&scrub_monitor->donelock);
+            {
+                scrub_monitor->done = _gf_true;
+                pthread_cond_signal(&scrub_monitor->donecond);
+            }
+            pthread_mutex_unlock(&scrub_monitor->donelock);
+            pthread_cleanup_pop(0);
+        } else { /* not the last one: wait for the final broadcast */
+            while (scrub_monitor->active_child_count)
+                pthread_cond_wait(&scrub_monitor->wakecond,
+                                  &scrub_monitor->wakelock);
+        }
+    }
+    pthread_mutex_unlock(&scrub_monitor->wakelock);
+    pthread_cleanup_pop(0);
+}
+
+/**
+ * Scanner thread body; arg is the br_child_t to scan.
+ * Each round: wait for the monitor's kick, walk the subvolume with
+ * syncop_ftw() (entries are queued by br_fsscanner_handle_entry()), hand
+ * over any partial batch still queued and run the exit barrier.
+ * The loop never ends, so the trailing return is not reached.
+ */
+void *
+br_fsscanner(void *arg)
+{
+    br_child_t *child = arg;
+    xlator_t *this = child->this;
+    struct br_scanfs *fsscan = &child->fsscan;
+    loc_t root = {0};
+
+    THIS = this;
+    root.inode = child->table->root;
+
+    for (;;) {
+        br_fsscanner_wait_until_kicked(this, child);
+
+        /* precursor for scrub */
+        br_fsscanner_entry_control(this, child);
+
+        /* scrub: walk the tree, then flush a leftover partial batch */
+        (void)syncop_ftw(child->xl, &root, GF_CLIENT_PID_SCRUB, child,
+                         br_fsscanner_handle_entry);
+        if (!list_empty(&fsscan->queued))
+            wait_for_scrubbing(this, fsscan);
+
+        /* scrub exit criteria */
+        br_fsscanner_exit_control(this, child);
+    }
+
+    return NULL;
+}
+
+/**
+ * Keep this routine extremely simple and do not ever try to acquire
+ * child->lock here: it may lead to deadlock. Scrubber state is
+ * modified in br_fsscanner(). An intermediate state change to pause
+ * changes the scrub state to the _correct_ state by identifying a
+ * non-pending timer.
+ */
+void
+br_kickstart_scanner(struct gf_tw_timer_list *timer, void *data,
+                     unsigned long calltime)
+{
+    /* Timer callback (installed as timer->function by br_fsscan_schedule());
+     * data is the br_monitor. The timer and calltime arguments are unused.
+     * Resets the per-run statistics and wakes the waiting scanners. */
+    struct br_monitor *monitor = data;
+    xlator_t *this = monitor->this;
+    br_private_t *conf = this->private;
+
+    THIS = this;
+
+    /* Reset scrub statistics */
+    conf->scrub_stat.scrubbed_files = 0;
+    conf->scrub_stat.unsigned_files = 0;
+
+    /* Moves state from PENDING to ACTIVE */
+    br_scrubber_entry_control(this);
+
+    /* kickstart scanning.. */
+    pthread_mutex_lock(&monitor->wakelock);
+    {
+        monitor->kick = _gf_true;
+        GF_ASSERT(monitor->active_child_count == 0);
+        pthread_cond_broadcast(&monitor->wakecond);
+    }
+    pthread_mutex_unlock(&monitor->wakelock);
+}
+
+static uint32_t
+br_fsscan_calculate_delta(uint32_t times)
+{
+    return times; /* identity: hands its argument back unchanged */
+}
+
+#define BR_SCRUB_ONDEMAND (1) /* all BR_SCRUB_* intervals are in seconds */
+#define BR_SCRUB_MINUTE (60)
+#define BR_SCRUB_HOURLY (60 * 60)
+#define BR_SCRUB_DAILY (1 * 24 * 60 * 60)
+#define BR_SCRUB_WEEKLY (7 * 24 * 60 * 60)
+#define BR_SCRUB_BIWEEKLY (14 * 24 * 60 * 60)
+#define BR_SCRUB_MONTHLY (30 * 24 * 60 * 60) /* a "month" is 30 days */
+
+/**
+ * Translate a scrub frequency into the timer interval, in seconds.
+ *
+ * Returns 0 for any frequency that has no interval here (for instance
+ * BR_FSSCRUB_FREQ_STALLED); the callers in this file log a zero timeout as
+ * a bug and bail out.
+ */
+static unsigned int
+br_fsscan_calculate_timeout(scrub_freq_t freq)
+{
+    switch (freq) {
+        case BR_FSSCRUB_FREQ_MINUTE:
+            return br_fsscan_calculate_delta(BR_SCRUB_MINUTE);
+        case BR_FSSCRUB_FREQ_HOURLY:
+            return br_fsscan_calculate_delta(BR_SCRUB_HOURLY);
+        case BR_FSSCRUB_FREQ_DAILY:
+            return br_fsscan_calculate_delta(BR_SCRUB_DAILY);
+        case BR_FSSCRUB_FREQ_WEEKLY:
+            return br_fsscan_calculate_delta(BR_SCRUB_WEEKLY);
+        case BR_FSSCRUB_FREQ_BIWEEKLY:
+            return br_fsscan_calculate_delta(BR_SCRUB_BIWEEKLY);
+        case BR_FSSCRUB_FREQ_MONTHLY:
+            return br_fsscan_calculate_delta(BR_SCRUB_MONTHLY);
+        default:
+            /* no interval defined for this frequency */
+            break;
+    }
+
+    return 0;
+}
+
+/**
+ * Arm the scrub timer for the first time: allocate scrub_monitor->timer,
+ * point it at br_kickstart_scanner() with the interval for the configured
+ * frequency, add it to the timer wheel and mark the monitor PENDING.
+ * Returns 0 on success, -1 on a zero interval or allocation failure.
+ */
+int32_t
+br_fsscan_schedule(xlator_t *this)
+{
+    char timestr[GF_TIMESTR_SIZE] = {
+        0,
+    };
+    br_private_t *conf = this->private;
+    struct br_monitor *monitor = &conf->scrub_monitor;
+    struct gf_tw_timer_list *tw = NULL;
+    uint32_t timo = 0;
+
+    monitor->boot = gf_time();
+
+    timo = br_fsscan_calculate_timeout(conf->fsscrub.frequency);
+    if (timo == 0) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, BRB_MSG_ZERO_TIMEOUT_BUG,
+               "BUG: Zero schedule timeout");
+        goto error_return;
+    }
+
+    tw = GF_CALLOC(1, sizeof(*tw), gf_br_stub_mt_br_scanner_freq_t);
+    monitor->timer = tw;
+    if (!tw)
+        goto error_return;
+
+    INIT_LIST_HEAD(&tw->entry);
+
+    tw->data = monitor;
+    tw->expires = timo;
+    tw->function = br_kickstart_scanner;
+
+    gf_tw_add_timer(conf->timer_wheel, tw);
+    _br_monitor_set_scrub_state(monitor, BR_SCRUB_STATE_PENDING);
+
+    gf_time_fmt(timestr, sizeof(timestr), (monitor->boot + timo),
+                gf_timefmt_FT);
+    gf_msg(this->name, GF_LOG_INFO, 0, BRB_MSG_SCRUB_INFO,
+           "Scrubbing is "
+           "scheduled to run at %s",
+           timestr);
+
+    return 0;
+
+error_return:
+    return -1;
+}
+
+/**
+ * Re-arm the existing scrub timer for the next run: clear the monitor's
+ * "done" flag, push the timer out by the configured interval with
+ * gf_tw_mod_timer() and mark the monitor PENDING.
+ * Returns 0 on success, -1 if the frequency maps to a zero interval.
+ */
+int32_t
+br_fsscan_activate(xlator_t *this)
+{
+    char timestr[GF_TIMESTR_SIZE] = {
+        0,
+    };
+    br_private_t *conf = this->private;
+    struct br_monitor *monitor = &conf->scrub_monitor;
+    time_t now = gf_time();
+    uint32_t timo = 0;
+
+    timo = br_fsscan_calculate_timeout(conf->fsscrub.frequency);
+    if (timo == 0) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, BRB_MSG_ZERO_TIMEOUT_BUG,
+               "BUG: Zero schedule timeout");
+        return -1;
+    }
+
+    pthread_mutex_lock(&monitor->donelock);
+    {
+        monitor->done = _gf_false;
+    }
+    pthread_mutex_unlock(&monitor->donelock);
+
+    gf_time_fmt(timestr, sizeof(timestr), now + timo, gf_timefmt_FT);
+    (void)gf_tw_mod_timer(conf->timer_wheel, monitor->timer, timo);
+
+    _br_monitor_set_scrub_state(monitor, BR_SCRUB_STATE_PENDING);
+    gf_msg(this->name, GF_LOG_INFO, 0, BRB_MSG_SCRUB_INFO,
+           "Scrubbing is "
+           "rescheduled to run at %s",
+           timestr);
+
+    return 0;
+}
+
+/**
+ * Apply a changed scrub frequency to the timer. A no-op unless
+ * fsscrub->frequency_reconf is set. gf_tw_mod_timer_pending() returning 0
+ * is reported as "scrubber currently running" and the state is left alone;
+ * any other return marks the monitor PENDING.
+ * Returns 0, or -1 if the frequency maps to a zero interval.
+ */
+int32_t
+br_fsscan_reschedule(xlator_t *this)
+{
+    char timestr[GF_TIMESTR_SIZE] = {0};
+    br_private_t *conf = this->private;
+    struct br_scrubber *scrubber = &conf->fsscrub;
+    struct br_monitor *monitor = &conf->scrub_monitor;
+    time_t now = 0;
+    uint32_t timo = 0;
+    int32_t ret = 0;
+
+    if (!scrubber->frequency_reconf)
+        return 0;
+
+    now = gf_time();
+    timo = br_fsscan_calculate_timeout(scrubber->frequency);
+    if (timo == 0) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, BRB_MSG_ZERO_TIMEOUT_BUG,
+               "BUG: Zero schedule timeout");
+        return -1;
+    }
+
+    gf_time_fmt(timestr, sizeof(timestr), now + timo, gf_timefmt_FT);
+
+    pthread_mutex_lock(&monitor->donelock);
+    {
+        monitor->done = _gf_false;
+    }
+    pthread_mutex_unlock(&monitor->donelock);
+
+    ret = gf_tw_mod_timer_pending(conf->timer_wheel, monitor->timer, timo);
+    if (ret != 0) {
+        _br_monitor_set_scrub_state(monitor, BR_SCRUB_STATE_PENDING);
+        gf_msg(this->name, GF_LOG_INFO, 0, BRB_MSG_SCRUB_INFO,
+               "Scrubbing rescheduled to run at %s", timestr);
+    } else {
+        gf_msg(this->name, GF_LOG_INFO, 0, BRB_MSG_SCRUB_INFO,
+               "Scrubber is currently running and would be "
+               "rescheduled after completion");
+    }
+
+    return 0;
+}
+
+/**
+ * Request an on-demand scrub by moving a pending timer to
+ * BR_SCRUB_ONDEMAND second(s) from now. If gf_tw_mod_timer_pending()
+ * returns 0 the scrubber is reported as already running and the state is
+ * left alone. Always returns 0.
+ */
+int32_t
+br_fsscan_ondemand(xlator_t *this)
+{
+    char timestr[GF_TIMESTR_SIZE] = {
+        0,
+    };
+    br_private_t *conf = this->private;
+    struct br_monitor *monitor = &conf->scrub_monitor;
+    uint32_t timo = BR_SCRUB_ONDEMAND;
+    time_t now = gf_time();
+    int32_t ret = 0;
+
+    gf_time_fmt(timestr, sizeof(timestr), now + timo, gf_timefmt_FT);
+
+    pthread_mutex_lock(&monitor->donelock);
+    {
+        monitor->done = _gf_false;
+    }
+    pthread_mutex_unlock(&monitor->donelock);
+
+    ret = gf_tw_mod_timer_pending(conf->timer_wheel, monitor->timer, timo);
+    if (ret != 0) {
+        _br_monitor_set_scrub_state(monitor, BR_SCRUB_STATE_PENDING);
+        gf_msg(this->name, GF_LOG_INFO, 0, BRB_MSG_SCRUB_INFO,
+               "Ondemand Scrubbing scheduled to run at %s", timestr);
+    } else {
+        gf_msg(this->name, GF_LOG_INFO, 0, BRB_MSG_SCRUB_INFO,
+               "Scrubber is currently running and would be "
+               "rescheduled after completion");
+    }
+
+    return 0;
+}
+
+#define BR_SCRUB_THREAD_SCALE_LAZY 0 /* exponents x for pow(M_E, x) below */
+#define BR_SCRUB_THREAD_SCALE_NORMAL 0.4
+#define BR_SCRUB_THREAD_SCALE_AGGRESSIVE 1.0
+
+#ifndef M_E
+#define M_E 2.718 /* fallback, used only if M_E is not already defined */
+#endif
+
+/**
+ * This is just a simple exponential scale to a fixed value selected
+ * per throttle config. We probably need to be smarter and select
+ * the scale based on the number of processor cores too.
+ */
+static unsigned int
+br_scrubber_calc_scale(xlator_t *this, br_private_t *priv,
+                       scrub_throttle_t throttle)
+{
+    /*
+     * Thread count is child_count * e^x with x chosen per throttle level;
+     * the double product is truncated on conversion to unsigned int.
+     */
+    switch (throttle) {
+        case BR_SCRUB_THROTTLE_VOID:
+        case BR_SCRUB_THROTTLE_STALLED:
+            return 0;
+        case BR_SCRUB_THROTTLE_LAZY:
+            return priv->child_count * pow(M_E, BR_SCRUB_THREAD_SCALE_LAZY);
+        case BR_SCRUB_THROTTLE_NORMAL:
+            return priv->child_count * pow(M_E, BR_SCRUB_THREAD_SCALE_NORMAL);
+        case BR_SCRUB_THROTTLE_AGGRESSIVE:
+            return priv->child_count *
+                   pow(M_E, BR_SCRUB_THREAD_SCALE_AGGRESSIVE);
+        default:
+            gf_msg(this->name, GF_LOG_ERROR, 0, BRB_MSG_UNKNOWN_THROTTLE,
+                   "Unknown throttle %d", throttle);
+            break;
+    }
+
+    /* unknown throttle: scale stays at zero */
+    return 0;
+}
+
+/* Round-robin pick: return the head of scrublist, then rotate the list. */
+static br_child_t *
+_br_scrubber_get_next_child(struct br_scrubber *fsscrub)
+{
+    br_child_t *head = list_first_entry(&fsscrub->scrublist, br_child_t,
+                                        list);
+
+    list_rotate_left(&fsscrub->scrublist);
+    return head;
+}
+
+/* Pop the first ready entry of child into *fsentry; untouched if none. */
+static void
+_br_scrubber_get_entry(br_child_t *child, struct br_fsscan_entry **fsentry)
+{
+    struct list_head *ready = &child->fsscan.ready;
+    if (!list_empty(ready)) {
+        *fsentry = list_first_entry(ready, struct br_fsscan_entry, list);
+        list_del_init(&(*fsentry)->list);
+    }
+}
+
+static void
+_br_scrubber_find_scrubbable_entry(struct br_scrubber *fsscrub,
+                                   struct br_fsscan_entry **fsentry)
+{
+    br_child_t *child = NULL;
+    br_child_t *firstchild = NULL;
+    /* Runs with fsscrub->mutex held: the condition waits below rely on it. */
+    while (1) {
+        while (list_empty(&fsscrub->scrublist))
+            pthread_cond_wait(&fsscrub->cond, &fsscrub->mutex);
+        /* one round-robin pass; ends when the first child comes up again */
+        firstchild = NULL;
+        for (child = _br_scrubber_get_next_child(fsscrub); child != firstchild;
+             child = _br_scrubber_get_next_child(fsscrub)) {
+            if (!firstchild)
+                firstchild = child;
+
+            _br_scrubber_get_entry(child, fsentry);
+            if (*fsentry)
+                break;
+        }
+
+        if (*fsentry)
+            break;
+
+        /* nothing to work on.. wait till available */
+        pthread_cond_wait(&fsscrub->cond, &fsscrub->mutex);
+    }
+}
+
+static void
+br_scrubber_pick_entry(struct br_scrubber *fsscrub,
+                       struct br_fsscan_entry **fsentry)
+{
+    pthread_cleanup_push(_br_lock_cleaner, &fsscrub->mutex);
+    /* handler releases the mutex if we are cancelled while waiting */
+    pthread_mutex_lock(&fsscrub->mutex);
+    {
+        *fsentry = NULL;
+        _br_scrubber_find_scrubbable_entry(fsscrub, fsentry);
+    }
+    pthread_mutex_unlock(&fsscrub->mutex);
+    /* pop(0): drop the handler without running it; already unlocked */
+    pthread_cleanup_pop(0);
+}
+
+struct br_scrub_entry {
+    gf_boolean_t scrubbed; /* set once the scrub attempt has returned */
+    struct br_fsscan_entry *fsentry; /* the entry being scrubbed */
+};
+
+/**
+ * We need to be a bit careful here. These thread(s) are prone to cancellations
+ * when threads are scaled down (depending on the throttling value configured)
+ * and pausing scrub. A thread can get cancelled while it's waiting for entries
+ * in the ->pending queue or when an object is undergoing scrubbing.
+ */
+static void
+br_scrubber_entry_handle(void *arg)
+{
+    /*
+     * Cleanup handler pushed by br_scrubber_scrub_entry(); arg is its
+     * struct br_scrub_entry. A scrubbed entry is released, an unscrubbed
+     * one goes back on the queue. All of it happens under entrylock.
+     */
+    struct br_scrub_entry *sentry = arg;
+    struct br_fsscan_entry *fsentry = sentry->fsentry;
+    struct br_scanfs *fsscan = fsentry->fsscan;
+
+    LOCK(&fsscan->entrylock);
+    {
+        if (!sentry->scrubbed) {
+            /* (re)queue the entry again for scrub */
+            _br_fsscan_collect_entry(fsscan, fsentry);
+        } else {
+            _br_fsscan_dec_entry_count(fsscan);
+
+            /* cleanup ->entry */
+            fsentry->data = NULL;
+            fsentry->fsscan = NULL;
+            loc_wipe(&fsentry->parent);
+            gf_dirent_entry_free(fsentry->entry);
+
+            GF_FREE(fsentry);
+        }
+    }
+    UNLOCK(&fsscan->entrylock);
+}
+
+static void
+br_scrubber_scrub_entry(xlator_t *this, struct br_fsscan_entry *fsentry)
+{
+    struct br_scrub_entry sentry = {
+        0,
+    };
+    /* scrubbed stays 0 until the scrub returns, so a cancel requeues it */
+    sentry.scrubbed = 0;
+    sentry.fsentry = fsentry;
+
+    pthread_cleanup_push(br_scrubber_entry_handle, &sentry);
+    {
+        (void)br_scrubber_scrub_begin(this, fsentry);
+        sentry.scrubbed = 1;
+    }
+    pthread_cleanup_pop(1); /* 1: also run the handler when not cancelled */
+}
+
+/* Scrubber thread body: forever pick a ready entry, scrub it, sleep 1s. */
+void *
+br_scrubber_proc(void *arg)
+{
+    struct br_scrubber *fsscrub = arg;
+    xlator_t *this = fsscrub->this;
+    struct br_fsscan_entry *picked = NULL;
+
+    THIS = this;
+
+    for (;;) {
+        br_scrubber_pick_entry(fsscrub, &picked);
+        br_scrubber_scrub_entry(this, picked);
+        sleep(1);
+    }
+
+    return NULL;
+}
+
+/* Spawn (v2 - v1) scrubber threads, one tracking struct each. Returns -1
+ * only when allocating a tracking struct fails; a thread-creation failure
+ * is logged as degraded scaling and still returns 0. */
+static int32_t
+br_scrubber_scale_up(xlator_t *this, struct br_scrubber *fsscrub,
+                     unsigned int v1, unsigned int v2)
+{
+    int i = 0;
+    int diff = (int)(v2 - v1);
+    gf_boolean_t nomem = _gf_false;
+    struct br_scrubbers *scrub = NULL;
+
+    gf_msg(this->name, GF_LOG_INFO, 0, BRB_MSG_SCALING_UP_SCRUBBER,
+           "Scaling up scrubbers [%d => %d]", v1, v2);
+
+    for (i = 0; i < diff; i++) {
+        scrub = GF_CALLOC(1, sizeof(*scrub), gf_br_mt_br_scrubber_t);
+        if (!scrub) {
+            nomem = _gf_true;
+            break;
+        }
+        INIT_LIST_HEAD(&scrub->list);
+        if (gf_thread_create(&scrub->scrubthread, NULL, br_scrubber_proc,
+                             fsscrub, "brsproc")) {
+            GF_FREE(scrub); /* never linked: would leak otherwise */
+            break;
+        }
+
+        fsscrub->nr_scrubbers++;
+        list_add_tail(&scrub->list, &fsscrub->scrubbers);
+    }
+
+    if (nomem)
+        return -1;
+
+    if (i != diff) /* degraded scaling.. */
+        gf_msg(this->name, GF_LOG_WARNING, 0, BRB_MSG_SCALE_UP_FAILED,
+               "Could not fully scale up to %d scrubber(s). Spawned "
+               "%d/%d [total scrubber(s): %d]",
+               v2, i, diff, (v1 + i));
+
+    return 0;
+}
+
+/* Cancel (v1 - v2) scrubber threads, taken from the head of
+ * fsscrub->scrubbers. A failed cancel is logged; always returns 0. */
+static int32_t
+br_scrubber_scale_down(xlator_t *this, struct br_scrubber *fsscrub,
+                       unsigned int v1, unsigned int v2)
+{
+    int i = 0;
+    int diff = (int)(v1 - v2);
+    int32_t ret = -1;
+    struct br_scrubbers *scrub = NULL;
+
+    gf_msg(this->name, GF_LOG_INFO, 0, BRB_MSG_SCALE_DOWN_SCRUBBER,
+           "Scaling down scrubbers [%d => %d]", v1, v2);
+
+    for (i = 0; i < diff; i++) {
+        scrub = list_first_entry(&fsscrub->scrubbers, struct br_scrubbers,
+                                 list);
+
+        list_del_init(&scrub->list);
+        ret = gf_thread_cleanup_xint(scrub->scrubthread);
+        if (ret)
+            break;
+        GF_FREE(scrub);
+        fsscrub->nr_scrubbers--;
+    }
+
+    if (ret) {
+        /* the target is v2; v1 - i scrubbers are still accounted for */
+        gf_msg(this->name, GF_LOG_WARNING, 0, BRB_MSG_SCALE_DOWN_FAILED,
+               "Could not fully scale down "
+               "to %d scrubber(s). Terminated %d/%d [total "
+               "scrubber(s): %d]",
+               v2, i, diff, (v1 - i));
+        ret = 0;
+    }
+
+    return ret;
+}
+
+/**
+ * Resize the scrubber thread pool to the size br_scrubber_calc_scale()
+ * gives for nthrottle. Returns the scale helper's result, or 0 when the
+ * pool already has the requested size.
+ */
+static int32_t
+br_scrubber_configure(xlator_t *this, br_private_t *priv,
+                      struct br_scrubber *fsscrub, scrub_throttle_t nthrottle)
+{
+    unsigned int have = 0;
+    unsigned int want = 0;
+
+    have = fsscrub->nr_scrubbers;
+    want = br_scrubber_calc_scale(this, priv, nthrottle);
+    if (have > want)
+        return br_scrubber_scale_down(this, fsscrub, have, want);
+    if (have < want)
+        return br_scrubber_scale_up(this, fsscrub, have, want);
+
+    return 0;
+}
+
+static int32_t
+br_scrubber_fetch_option(xlator_t *this, char *opt, dict_t *options,
+                         char **value)
+{
+    if (options) /* a dict was supplied: use the reconfigure-time macro */
+        GF_OPTION_RECONF(opt, *value, options, str, error_return);
+    else /* no dict: use the init-time macro */
+        GF_OPTION_INIT(opt, *value, str, error_return);
+
+    return 0;
+
+error_return: /* label handed to both option macros above */
+    return -1;
+}
+
+/* internal "throttle" override */
+#define BR_SCRUB_STALLED "STALLED"
+
+/* TODO: token bucket spec */
+/**
+ * Read "scrub-throttle" (overridden by the internal STALLED value when
+ * scrubstall is set), resize the scrubber pool accordingly and record in
+ * throttle_reconf whether the throttle changed.
+ * Returns 0 on success, -1 on a fetch failure, unknown value or resize error.
+ */
+static int32_t
+br_scrubber_handle_throttle(xlator_t *this, br_private_t *priv, dict_t *options,
+                            gf_boolean_t scrubstall)
+{
+    char *value = NULL;
+    struct br_scrubber *fsscrub = &priv->fsscrub;
+    scrub_throttle_t nthrottle = BR_SCRUB_THROTTLE_VOID;
+
+    fsscrub->throttle_reconf = _gf_false;
+
+    if (br_scrubber_fetch_option(this, "scrub-throttle", options, &value))
+        return -1;
+
+    if (scrubstall)
+        value = BR_SCRUB_STALLED;
+
+    if (!strcasecmp(value, "lazy"))
+        nthrottle = BR_SCRUB_THROTTLE_LAZY;
+    else if (!strcasecmp(value, "normal"))
+        nthrottle = BR_SCRUB_THROTTLE_NORMAL;
+    else if (!strcasecmp(value, "aggressive"))
+        nthrottle = BR_SCRUB_THROTTLE_AGGRESSIVE;
+    else if (!strcasecmp(value, BR_SCRUB_STALLED))
+        nthrottle = BR_SCRUB_THROTTLE_STALLED;
+    else
+        return -1;
+
+    /* on failure old throttling value is preserved */
+    if (br_scrubber_configure(this, priv, fsscrub, nthrottle))
+        return -1;
+
+    if (fsscrub->throttle != nthrottle)
+        fsscrub->throttle_reconf = _gf_true;
+
+    fsscrub->throttle = nthrottle;
+
+    return 0;
+}
+
+/**
+ * Read "scrub-state" and set *scrubstall when it is "pause"; the flag is
+ * left untouched for any other value.
+ * Returns 0 on success, -1 if the option cannot be fetched.
+ */
+static int32_t
+br_scrubber_handle_stall(xlator_t *this, br_private_t *priv, dict_t *options,
+                         gf_boolean_t *scrubstall)
+{
+    char *state = NULL;
+
+    if (br_scrubber_fetch_option(this, "scrub-state", options, &state))
+        return -1;
+
+    if (!strcasecmp(state, "pause")) /* anything else is active */
+        *scrubstall = _gf_true;
+
+    return 0;
+}
+
+/**
+ * Read "scrub-freq" (overridden by the internal STALLED value when
+ * scrubstall is set) and store it in fsscrub->frequency.
+ * frequency_reconf ends up false only when the value is unchanged; note
+ * that it stays true on the error paths. Returns 0 on success, -1 otherwise.
+ */
+static int32_t
+br_scrubber_handle_freq(xlator_t *this, br_private_t *priv, dict_t *options,
+                        gf_boolean_t scrubstall)
+{
+    char *value = NULL;
+    scrub_freq_t frequency = BR_FSSCRUB_FREQ_HOURLY;
+    struct br_scrubber *fsscrub = &priv->fsscrub;
+
+    fsscrub->frequency_reconf = _gf_true;
+
+    if (br_scrubber_fetch_option(this, "scrub-freq", options, &value))
+        return -1;
+
+    if (scrubstall)
+        value = BR_SCRUB_STALLED;
+
+    if (!strcasecmp(value, "hourly"))
+        frequency = BR_FSSCRUB_FREQ_HOURLY;
+    else if (!strcasecmp(value, "daily"))
+        frequency = BR_FSSCRUB_FREQ_DAILY;
+    else if (!strcasecmp(value, "weekly"))
+        frequency = BR_FSSCRUB_FREQ_WEEKLY;
+    else if (!strcasecmp(value, "biweekly"))
+        frequency = BR_FSSCRUB_FREQ_BIWEEKLY;
+    else if (!strcasecmp(value, "monthly"))
+        frequency = BR_FSSCRUB_FREQ_MONTHLY;
+    else if (!strcasecmp(value, "minute"))
+        frequency = BR_FSSCRUB_FREQ_MINUTE;
+    else if (!strcasecmp(value, BR_SCRUB_STALLED))
+        frequency = BR_FSSCRUB_FREQ_STALLED;
+    else
+        return -1;
+
+    if (fsscrub->frequency != frequency)
+        fsscrub->frequency = frequency;
+    else
+        fsscrub->frequency_reconf = _gf_false;
+
+    return 0;
+}
+
+/**
+ * Log the scrub frequency and throttle currently stored in priv->fsscrub.
+ *
+ * Nothing is logged when the scrubber is paused (@scrubstall), when neither
+ * the frequency nor the throttle was flagged as reconfigured, or while the
+ * throttle still holds BR_SCRUB_THROTTLE_VOID.
+ */
+static void
+br_scrubber_log_option(xlator_t *this, br_private_t *priv,
+                       gf_boolean_t scrubstall)
+{
+    struct br_scrubber *fsscrub = &priv->fsscrub;
+    char *throttle_names[] = {
+        [BR_SCRUB_THROTTLE_LAZY] = "lazy",
+        [BR_SCRUB_THROTTLE_NORMAL] = "normal",
+        [BR_SCRUB_THROTTLE_AGGRESSIVE] = "aggressive",
+        [BR_SCRUB_THROTTLE_STALLED] = "stalled",
+    };
+    char *freq_names[] = {
+        [0] = "",
+        [BR_FSSCRUB_FREQ_HOURLY] = "hourly",
+        [BR_FSSCRUB_FREQ_DAILY] = "daily",
+        [BR_FSSCRUB_FREQ_WEEKLY] = "weekly",
+        [BR_FSSCRUB_FREQ_BIWEEKLY] = "biweekly",
+        [BR_FSSCRUB_FREQ_MONTHLY] = "monthly (30 days)",
+        [BR_FSSCRUB_FREQ_MINUTE] = "every minute",
+    };
+
+    /* logged as pause */
+    if (scrubstall)
+        return;
+
+    /* no tunable changed: stay quiet */
+    if (!fsscrub->frequency_reconf && !fsscrub->throttle_reconf)
+        return;
+
+    if (fsscrub->throttle == BR_SCRUB_THROTTLE_VOID)
+        return;
+
+    gf_msg(this->name, GF_LOG_INFO, 0, BRB_MSG_SCRUB_TUNABLE,
+           "SCRUB TUNABLES:: [Frequency: %s, Throttle: %s]",
+           freq_names[fsscrub->frequency], throttle_names[fsscrub->throttle]);
+}
+
+/**
+ * Apply the scrubber tunables found in @options.
+ *
+ * The handlers run in a fixed order -- state (pause), throttle, frequency --
+ * and processing stops at the first one that fails. The resulting tunables
+ * are logged only when all three succeed.
+ *
+ * Returns 0 on success, -1 if any handler fails.
+ */
+int32_t
+br_scrubber_handle_options(xlator_t *this, br_private_t *priv, dict_t *options)
+{
+    gf_boolean_t scrubstall = _gf_false; /* not as dangerous as it sounds */
+
+    /* short-circuit evaluation keeps the original call order */
+    if (br_scrubber_handle_stall(this, priv, options, &scrubstall) ||
+        br_scrubber_handle_throttle(this, priv, options, scrubstall) ||
+        br_scrubber_handle_freq(this, priv, options, scrubstall))
+        return -1;
+
+    br_scrubber_log_option(this, priv, scrubstall);
+    return 0;
+}
+
+/**
+ * Look up the bad-object directory identified by @gfid on @child and link
+ * the resulting inode into the child's inode table.
+ *
+ * Returns the inode handed back by inode_link() (after inode_lookup() has
+ * been called on it), or NULL when validation, inode allocation or the
+ * lookup itself fails.
+ */
+inode_t *
+br_lookup_bad_obj_dir(xlator_t *this, br_child_t *child, uuid_t gfid)
+{
+    struct iatt statbuf = {
+        0,
+    };
+    inode_table_t *table = NULL;
+    int32_t ret = -1;
+    loc_t loc = {
+        0,
+    };
+    inode_t *linked_inode = NULL;
+    int32_t op_errno = 0;
+
+    GF_VALIDATE_OR_GOTO("bit-rot-scrubber", this, out);
+    GF_VALIDATE_OR_GOTO(this->name, this->private, out);
+    GF_VALIDATE_OR_GOTO(this->name, child, out);
+
+    table = child->table;
+
+    loc.inode = inode_new(table);
+    if (!loc.inode) {
+        /* adjacent literals are concatenated: keep the separating space */
+        gf_msg(this->name, GF_LOG_ERROR, ENOMEM, BRB_MSG_NO_MEMORY,
+               "failed to allocate a new inode for "
+               "bad object directory");
+        goto out;
+    }
+
+    gf_uuid_copy(loc.gfid, gfid);
+
+    ret = syncop_lookup(child->xl, &loc, &statbuf, NULL, NULL, NULL);
+    if (ret < 0) {
+        /* a negative return carries the error number */
+        op_errno = -ret;
+        gf_msg(this->name, GF_LOG_ERROR, 0, BRB_MSG_LOOKUP_FAILED,
+               "failed to lookup the bad "
+               "objects directory (gfid: %s (%s))",
+               uuid_utoa(gfid), strerror(op_errno));
+        goto out;
+    }
+
+    linked_inode = inode_link(loc.inode, NULL, NULL, &statbuf);
+    if (linked_inode)
+        inode_lookup(linked_inode);
+
+out:
+    /* the scratch loc (and the inode it carries) is released on every path */
+    loc_wipe(&loc);
+    return linked_inode;
+}
+
+/**
+ * Read every entry of the bad-object directory open on @fd and record the
+ * entry names in @dict under the keys "quarantine-0", "quarantine-1", ...
+ * The number of names stored is saved under the key "count".
+ *
+ * Any dict returned by the readdir call is merged into @dict as well.
+ *
+ * Returns the (negative) readdir result when reading fails, otherwise the
+ * result of storing "count" in @dict.
+ */
+int32_t
+br_read_bad_object_dir(xlator_t *this, br_child_t *child, fd_t *fd,
+                       dict_t *dict)
+{
+    gf_dirent_t entries;
+    gf_dirent_t *entry = NULL;
+    int32_t ret = -1;
+    off_t offset = 0;
+    int32_t count = 0;
+    char key[32] = {
+        0,
+    };
+    dict_t *out_dict = NULL;
+
+    INIT_LIST_HEAD(&entries.list);
+
+    while ((ret = syncop_readdir(child->xl, fd, 131072, offset, &entries, NULL,
+                                 &out_dict))) {
+        if (ret < 0)
+            goto out;
+
+        list_for_each_entry(entry, &entries.list, list)
+        {
+            /* resume the next readdir after the last entry seen */
+            offset = entry->d_off;
+
+            snprintf(key, sizeof(key), "quarantine-%d", count);
+
+            /*
+             * ignore the dict_set errors for now. The intention is
+             * to get as many bad objects as possible instead of
+             * erroring out at the first failure.
+             */
+            ret = dict_set_dynstr_with_alloc(dict, key, entry->d_name);
+            if (!ret)
+                count++;
+
+            /* merge the reply dict once per readdir call, then drop it */
+            if (out_dict) {
+                dict_copy(out_dict, dict);
+                dict_unref(out_dict);
+                out_dict = NULL;
+            }
+        }
+
+        gf_dirent_free(&entries);
+    }
+
+    ret = dict_set_int32_sizen(dict, "count", count);
+
+out:
+    /*
+     * The reply dict is only consumed inside the entry loop; release it
+     * here when a call handed one back without any entry (EOF or error).
+     */
+    if (out_dict)
+        dict_unref(out_dict);
+    return ret;
+}
+
+/**
+ * Open the bad-object container directory of @child and collect the names
+ * of its entries into @dict (see br_read_bad_object_dir()).
+ *
+ * The directory inode is taken from the child's inode table when present,
+ * otherwise it is looked up on the brick first.
+ *
+ * Returns 0 on success and a negative value on failure.
+ */
+int32_t
+br_get_bad_objects_from_child(xlator_t *this, dict_t *dict, br_child_t *child)
+{
+    inode_t *inode = NULL;
+    inode_table_t *table = NULL;
+    fd_t *fd = NULL;
+    int32_t ret = -1;
+    loc_t loc = {
+        0,
+    };
+    int32_t op_errno = 0;
+
+    GF_VALIDATE_OR_GOTO("bit-rot-scrubber", this, out);
+    GF_VALIDATE_OR_GOTO(this->name, this->private, out);
+    GF_VALIDATE_OR_GOTO(this->name, child, out);
+    GF_VALIDATE_OR_GOTO(this->name, dict, out);
+
+    table = child->table;
+
+    inode = inode_find(table, BR_BAD_OBJ_CONTAINER);
+    if (!inode) {
+        inode = br_lookup_bad_obj_dir(this, child, BR_BAD_OBJ_CONTAINER);
+        if (!inode)
+            goto out;
+    }
+
+    /*
+     * Hand the inode reference to the loc right away so that loc_wipe()
+     * at "out" releases it on every path, including fd_create() failure
+     * (which previously leaked the reference).
+     */
+    loc.inode = inode;
+    gf_uuid_copy(loc.gfid, inode->gfid);
+
+    fd = fd_create(inode, 0);
+    if (!fd) {
+        gf_msg(this->name, GF_LOG_ERROR, ENOMEM, BRB_MSG_FD_CREATE_FAILED,
+               "fd creation for the bad "
+               "objects directory failed (gfid: %s)",
+               uuid_utoa(BR_BAD_OBJ_CONTAINER));
+        goto out;
+    }
+
+    ret = syncop_opendir(child->xl, &loc, fd, NULL, NULL);
+    if (ret < 0) {
+        /* a negative return carries the error number */
+        op_errno = -ret;
+        fd_unref(fd);
+        fd = NULL;
+        gf_msg(this->name, GF_LOG_ERROR, op_errno, BRB_MSG_FD_CREATE_FAILED,
+               "failed to open the bad "
+               "objects directory %s",
+               uuid_utoa(BR_BAD_OBJ_CONTAINER));
+        goto out;
+    }
+
+    fd_bind(fd);
+
+    ret = br_read_bad_object_dir(this, child, fd, dict);
+    if (ret < 0) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, BRB_MSG_BAD_OBJ_READDIR_FAIL,
+               "readdir of the bad "
+               "objects directory (%s) failed ",
+               uuid_utoa(BR_BAD_OBJ_CONTAINER));
+        goto out;
+    }
+
+    ret = 0;
+
+out:
+    loc_wipe(&loc);
+    if (fd)
+        fd_unref(fd);
+    return ret;
+}
+
+/**
+ * Append the bad objects recorded in @child_dict to the aggregate @dict.
+ *
+ * @child_dict holds "count" plus "quarantine-<n>" entries; each entry name
+ * may additionally be a key in @child_dict whose value is the object's path.
+ * Every entry is formatted together with the brick path of @child and stored
+ * in @dict under "quarantine-<m>", where <m> continues from @total_count.
+ * Entries that are missing or whose formatted text does not fit in PATH_MAX
+ * are skipped.
+ *
+ * Returns the new running total, or a non-zero error when "count" is
+ * missing from @child_dict.
+ */
+int32_t
+br_collect_bad_objects_of_child(xlator_t *this, br_child_t *child, dict_t *dict,
+                                dict_t *child_dict, int32_t total_count)
+{
+    int32_t ret = -1;
+    int32_t count = 0;
+    char key[32] = {
+        0,
+    };
+    char main_key[32] = {
+        0,
+    };
+    int32_t j = 0;
+    int32_t tmp_count = 0;
+    char *entry = NULL;
+    char tmp[PATH_MAX] = {
+        0,
+    };
+    char *path = NULL;
+    int32_t len = 0;
+
+    ret = dict_get_int32_sizen(child_dict, "count", &count);
+    if (ret)
+        goto out;
+
+    tmp_count = total_count;
+
+    for (j = 0; j < count; j++) {
+        len = snprintf(key, sizeof(key), "quarantine-%d", j);
+        ret = dict_get_strn(child_dict, key, len, &entry);
+        if (ret)
+            continue;
+
+        /*
+         * The path is optional. When it is not recorded, "path" stays
+         * NULL; handing NULL to a "%s" conversion is undefined behaviour,
+         * so substitute an explicit placeholder.
+         */
+        path = NULL;
+        (void)dict_get_str(child_dict, entry, &path);
+        len = snprintf(tmp, PATH_MAX, "%s ==> BRICK: %s\n path: %s", entry,
+                       child->brick_path, path ? path : "(null)");
+        if ((len < 0) || (len >= PATH_MAX)) {
+            continue;
+        }
+        snprintf(main_key, sizeof(main_key), "quarantine-%d", tmp_count);
+
+        ret = dict_set_dynstr_with_alloc(dict, main_key, tmp);
+        if (!ret)
+            tmp_count++;
+    }
+
+    ret = tmp_count;
+
+out:
+    return ret;
+}
+
+/**
+ * Gather the bad objects of every connected child into @dict.
+ *
+ * Each child is queried through a scratch dict; a failure on one child is
+ * not fatal, the remaining children are still asked. The number of entries
+ * collected is stored in @dict under "total-count".
+ *
+ * Returns the result of storing "total-count".
+ */
+int32_t
+br_collect_bad_objects_from_children(xlator_t *this, dict_t *dict)
+{
+    br_private_t *priv = this->private;
+    br_child_t *child = NULL;
+    dict_t *child_dict = NULL;
+    int32_t total_count = 0;
+    int32_t ret = -1;
+    int32_t idx = 0;
+
+    for (idx = 0; idx < priv->child_count; idx++) {
+        child = &priv->children[idx];
+        GF_ASSERT(child);
+        if (!_br_is_child_connected(child))
+            continue;
+
+        child_dict = dict_new();
+        if (!child_dict) {
+            gf_msg(this->name, GF_LOG_ERROR, ENOMEM, BRB_MSG_NO_MEMORY,
+                   "failed to allocate dict");
+            continue;
+        }
+
+        /*
+         * Keep asking the remaining children even when one of them fails
+         * to produce its list; only a successful merge moves the total.
+         */
+        ret = br_get_bad_objects_from_child(this, child_dict, child);
+        if (ret == 0) {
+            ret = br_collect_bad_objects_of_child(this, child, dict,
+                                                  child_dict, total_count);
+            if (ret >= 0)
+                total_count = ret;
+        }
+
+        dict_unref(child_dict);
+        child_dict = NULL;
+    }
+
+    return dict_set_int32(dict, "total-count", total_count);
+}
+
+/**
+ * Fill *@dict with the bad objects known to all connected children.
+ *
+ * When *@dict is NULL a new dict is allocated and handed back through
+ * @dict; otherwise the caller's dict is filled in place.
+ *
+ * Returns -1 on validation or allocation failure, otherwise the result of
+ * br_collect_bad_objects_from_children().
+ */
+int32_t
+br_get_bad_objects_list(xlator_t *this, dict_t **dict)
+{
+    int32_t ret = -1;
+    dict_t *tmp_dict = NULL;
+
+    /* log domain fixed: was misspelled "bir-rot-scrubber" */
+    GF_VALIDATE_OR_GOTO("bit-rot-scrubber", this, out);
+    GF_VALIDATE_OR_GOTO(this->name, dict, out);
+
+    tmp_dict = *dict;
+    if (!tmp_dict) {
+        tmp_dict = dict_new();
+        if (!tmp_dict) {
+            gf_msg(this->name, GF_LOG_ERROR, ENOMEM, BRB_MSG_NO_MEMORY,
+                   "failed to allocate dict");
+            goto out;
+        }
+        *dict = tmp_dict;
+    }
+
+    ret = br_collect_bad_objects_from_children(this, tmp_dict);
+
+out:
+    return ret;
+}
+
+/**
+ * Block the calling thread until the scrub monitor's "done" flag is set.
+ *
+ * The flag is checked under scrub_monitor->donelock and waited for on
+ * scrub_monitor->donecond.
+ *
+ * Returns 0 once the flag is observed set, -1 when @this or its private
+ * data is NULL.
+ */
+static int
+wait_for_scrub_to_finish(xlator_t *this)
+{
+    int ret = -1;
+    br_private_t *priv = NULL;
+    struct br_monitor *scrub_monitor = NULL;
+
+    /* validate before dereferencing: "this" used to be read first */
+    GF_VALIDATE_OR_GOTO("bit-rot", this, out);
+    GF_VALIDATE_OR_GOTO("bit-rot", this->private, out);
+
+    priv = this->private;
+    scrub_monitor = &priv->scrub_monitor;
+
+    gf_msg(this->name, GF_LOG_INFO, 0, BRB_MSG_SCRUB_INFO,
+           "Waiting for all children to start and finish scrub");
+
+    pthread_mutex_lock(&scrub_monitor->donelock);
+    {
+        /* loop guards against spurious wakeups */
+        while (!scrub_monitor->done)
+            pthread_cond_wait(&scrub_monitor->donecond,
+                              &scrub_monitor->donelock);
+    }
+    pthread_mutex_unlock(&scrub_monitor->donelock);
+    ret = 0;
+out:
+    return ret;
+}
+
+/**
+ * Scrubber monitor thread: this function is executed in a separate thread
+ * and takes care of the scrub state machine.
+ *
+ * @arg is the bit-rot xlator. The thread first waits until the monitor is
+ * flagged as inited, runs the state machine once under priv->lock, and then
+ * loops forever: wait for a scrub run to finish, then run the scrub exit
+ * control. It returns NULL (ending the thread) only when the state machine
+ * or the wait fails.
+ */
+void *
+br_monitor_thread(void *arg)
+{
+    int32_t ret = 0;
+    xlator_t *this = NULL;
+    br_private_t *priv = NULL;
+    struct br_monitor *scrub_monitor = NULL;
+
+    this = arg;
+    priv = this->private;
+
+    /*
+     * Since, this is the topmost xlator, THIS has to be set by bit-rot
+     * xlator itself (STACK_WIND won't help in this case). Also it has
+     * to be done for each thread that gets spawned. Otherwise, a new
+     * thread will get global_xlator's pointer when it does "THIS".
+     */
+    THIS = this;
+
+    scrub_monitor = &priv->scrub_monitor;
+
+    /* hold off until the monitor has been marked as inited */
+    pthread_mutex_lock(&scrub_monitor->mutex);
+    {
+        while (!scrub_monitor->inited)
+            pthread_cond_wait(&scrub_monitor->cond, &scrub_monitor->mutex);
+    }
+    pthread_mutex_unlock(&scrub_monitor->mutex);
+
+    /* this needs to be serialized with reconfigure() */
+    pthread_mutex_lock(&priv->lock);
+    {
+        ret = br_scrub_state_machine(this, _gf_false);
+    }
+    pthread_mutex_unlock(&priv->lock);
+    if (ret) {
+        gf_msg(this->name, GF_LOG_ERROR, -ret, BRB_MSG_SSM_FAILED,
+               "Scrub state machine failed");
+        goto out;
+    }
+
+    while (1) {
+        /* Wait for all children to finish scrubbing */
+        ret = wait_for_scrub_to_finish(this);
+        if (ret) {
+            gf_msg(this->name, GF_LOG_ERROR, -ret, BRB_MSG_SCRUB_WAIT_FAILED,
+                   "Scrub wait failed");
+            goto out;
+        }
+
+        /* scrub exit criteria: Move the state to PENDING */
+        br_scrubber_exit_control(this);
+    }
+
+out:
+    return NULL;
+}
+
+/**
+ * Set the scrub state of @scrub_monitor to @state while holding
+ * scrub_monitor->lock.
+ */
+static void
+br_set_scrub_state(struct br_monitor *scrub_monitor, br_scrub_state_t state)
+{
+    LOCK(&scrub_monitor->lock);
+    {
+        _br_monitor_set_scrub_state(scrub_monitor, state);
+    }
+    UNLOCK(&scrub_monitor->lock);
+}
+
+/**
+ * Initialise the scrub monitor embedded in @priv and start its thread.
+ *
+ * Sets up the monitor lock and the three mutex/condition pairs (init
+ * notification, wake-up, completion), resets the associated flags, puts
+ * the scrub state at INACTIVE and finally spawns br_monitor_thread().
+ *
+ * Returns 0 on success. When the thread cannot be created every
+ * synchronisation object initialised here is destroyed again and -1 is
+ * returned.
+ */
+int32_t
+br_scrubber_monitor_init(xlator_t *this, br_private_t *priv)
+{
+    struct br_monitor *scrub_monitor = NULL;
+    int ret = 0;
+
+    scrub_monitor = &priv->scrub_monitor;
+
+    LOCK_INIT(&scrub_monitor->lock);
+    scrub_monitor->this = this;
+
+    /* init notification: the monitor thread waits for "inited" */
+    scrub_monitor->inited = _gf_false;
+    pthread_mutex_init(&scrub_monitor->mutex, NULL);
+    pthread_cond_init(&scrub_monitor->cond, NULL);
+
+    /* wake-up signalling */
+    scrub_monitor->kick = _gf_false;
+    scrub_monitor->active_child_count = 0;
+    pthread_mutex_init(&scrub_monitor->wakelock, NULL);
+    pthread_cond_init(&scrub_monitor->wakecond, NULL);
+
+    /* completion: the monitor thread waits for "done" */
+    scrub_monitor->done = _gf_false;
+    pthread_mutex_init(&scrub_monitor->donelock, NULL);
+    pthread_cond_init(&scrub_monitor->donecond, NULL);
+
+    /* Set the state to INACTIVE */
+    br_set_scrub_state(&priv->scrub_monitor, BR_SCRUB_STATE_INACTIVE);
+
+    /* Start the monitor thread */
+    ret = gf_thread_create(&scrub_monitor->thread, NULL, br_monitor_thread,
+                           this, "brmon");
+    if (ret != 0) {
+        /*
+         * NOTE(review): -ret is passed as the error number; confirm the
+         * sign convention of gf_thread_create()'s return value.
+         */
+        gf_msg(this->name, GF_LOG_ERROR, -ret, BRB_MSG_SPAWN_FAILED,
+               "monitor thread creation failed");
+        ret = -1;
+        goto err;
+    }
+
+    return 0;
+err:
+    /* undo everything initialised above */
+    pthread_mutex_destroy(&scrub_monitor->mutex);
+    pthread_cond_destroy(&scrub_monitor->cond);
+
+    pthread_mutex_destroy(&scrub_monitor->wakelock);
+    pthread_cond_destroy(&scrub_monitor->wakecond);
+
+    pthread_mutex_destroy(&scrub_monitor->donelock);
+    pthread_cond_destroy(&scrub_monitor->donecond);
+
+    LOCK_DESTROY(&scrub_monitor->lock);
+
+    return ret;
+}
+
+/**
+ * One-time scrubber setup: create the token-bucket throttle, start the
+ * scrub monitor and reset the filesystem scrubber bookkeeping in @priv.
+ *
+ * Returns 0 on success, -1 when the throttle cannot be created or the
+ * monitor fails to initialise.
+ */
+int32_t
+br_scrubber_init(xlator_t *this, br_private_t *priv)
+{
+    struct br_scrubber *scrubber = &priv->fsscrub;
+
+    priv->tbf = tbf_init(NULL, 0);
+    if (priv->tbf == NULL)
+        return -1;
+
+    if (br_scrubber_monitor_init(this, priv) != 0)
+        return -1;
+
+    scrubber->this = this;
+    /* no throttle configured yet */
+    scrubber->throttle = BR_SCRUB_THROTTLE_VOID;
+
+    pthread_mutex_init(&scrubber->mutex, NULL);
+    pthread_cond_init(&scrubber->cond, NULL);
+
+    /* start with no scrubbers and empty lists */
+    scrubber->nr_scrubbers = 0;
+    INIT_LIST_HEAD(&scrubber->scrubbers);
+    INIT_LIST_HEAD(&scrubber->scrublist);
+
+    return 0;
+}
diff --git a/xlators/features/bit-rot/src/bitd/bit-rot-scrub.h b/xlators/features/bit-rot/src/bitd/bit-rot-scrub.h
new file mode 100644
index 00000000000..4e5f67bc021
--- /dev/null
+++ b/xlators/features/bit-rot/src/bitd/bit-rot-scrub.h
@@ -0,0 +1,46 @@
+/*
+   Copyright (c) 2015 Red Hat, Inc. <http://www.redhat.com>
+   This file is part of GlusterFS.
+
+   This file is licensed to you under your choice of the GNU Lesser
+   General Public License, version 3 or any later version (LGPLv3 or
+   later), or the GNU General Public License, version 2 (GPLv2), in all
+   cases as published by the Free Software Foundation.
+*/
+
+#ifndef __BIT_ROT_SCRUB_H__
+#define __BIT_ROT_SCRUB_H__
+
+#include <glusterfs/xlator.h>
+#include "bit-rot.h"
+
+/* Has the signature of a thread start routine; defined outside this header. */
+void *
+br_fsscanner(void *);
+
+/*
+ * Scan scheduling entry points. Each takes the bit-rot xlator; they are
+ * installed as transition handlers in the scrub state machine table
+ * (bit-rot-ssm.c).
+ */
+int32_t
+br_fsscan_schedule(xlator_t *);
+int32_t
+br_fsscan_reschedule(xlator_t *);
+int32_t
+br_fsscan_activate(xlator_t *);
+int32_t
+br_fsscan_deactivate(xlator_t *);
+int32_t
+br_fsscan_ondemand(xlator_t *);
+
+/* Apply the scrub state/throttle/frequency options; 0 on success, -1 on error. */
+int32_t
+br_scrubber_handle_options(xlator_t *, br_private_t *, dict_t *);
+
+/* Initialise the scrub monitor and spawn its thread; 0 on success, -1 on error. */
+int32_t
+br_scrubber_monitor_init(xlator_t *, br_private_t *);
+
+/* One-time scrubber setup (throttle, monitor, bookkeeping); 0 or -1. */
+int32_t
+br_scrubber_init(xlator_t *, br_private_t *);
+
+/* Collect the bad objects of all connected children into @dict. */
+int32_t
+br_collect_bad_objects_from_children(xlator_t *this, dict_t *dict);
+
+/* Defined outside this header. */
+void
+br_child_set_scrub_state(br_child_t *, gf_boolean_t);
+
+#endif /* __BIT_ROT_SCRUB_H__ */
diff --git a/xlators/features/bit-rot/src/bitd/bit-rot-ssm.c b/xlators/features/bit-rot/src/bitd/bit-rot-ssm.c
new file mode 100644
index 00000000000..753e31a3b23
--- /dev/null
+++ b/xlators/features/bit-rot/src/bitd/bit-rot-ssm.c
@@ -0,0 +1,124 @@
+/*
+   Copyright (c) 2015 Red Hat, Inc. <http://www.redhat.com>
+   This file is part of GlusterFS.
+
+   This file is licensed to you under your choice of the GNU Lesser
+   General Public License, version 3 or any later version (LGPLv3 or
+   later), or the GNU General Public License, version 2 (GPLv2), in all
+   cases as published by the Free Software Foundation.
+*/
+
+#include "bit-rot-ssm.h"
+#include "bit-rot-scrub.h"
+#include "bit-rot-bitd-messages.h"
+
+/**
+ * State machine handler for transitions that require no action.
+ * Always returns 0.
+ */
+int
+br_scrub_ssm_noop(xlator_t *this)
+{
+    return 0;
+}
+
+/**
+ * State machine handler: log that the scrubber is paused and move the
+ * monitor to BR_SCRUB_STATE_PAUSED. Always returns 0.
+ */
+int
+br_scrub_ssm_state_pause(xlator_t *this)
+{
+    br_private_t *priv = this->private;
+
+    gf_msg(this->name, GF_LOG_INFO, 0, BRB_MSG_GENERIC_SSM_INFO,
+           "Scrubber paused");
+    _br_monitor_set_scrub_state(&priv->scrub_monitor, BR_SCRUB_STATE_PAUSED);
+
+    return 0;
+}
+
+/**
+ * State machine handler: log that the scrubber is paused and move the
+ * monitor to BR_SCRUB_STATE_IPAUSED (pause requested while inactive).
+ * Always returns 0.
+ */
+int
+br_scrub_ssm_state_ipause(xlator_t *this)
+{
+    br_private_t *priv = this->private;
+
+    gf_msg(this->name, GF_LOG_INFO, 0, BRB_MSG_GENERIC_SSM_INFO,
+           "Scrubber paused");
+    _br_monitor_set_scrub_state(&priv->scrub_monitor, BR_SCRUB_STATE_IPAUSED);
+
+    return 0;
+}
+
+/**
+ * State machine handler used when leaving the stalled state.
+ *
+ * If the monitor's "done" flag is set, scanning is (re)activated through
+ * br_fsscan_activate() and its result is ignored. Otherwise the
+ * interrupted run is resumed by switching the monitor back to
+ * BR_SCRUB_STATE_ACTIVE. Always returns 0.
+ */
+int
+br_scrub_ssm_state_active(xlator_t *this)
+{
+    br_private_t *priv = this->private;
+    struct br_monitor *monitor = &priv->scrub_monitor;
+
+    if (!monitor->done) {
+        gf_msg(this->name, GF_LOG_INFO, 0, BRB_MSG_GENERIC_SSM_INFO,
+               "Scrubbing resumed");
+        _br_monitor_set_scrub_state(monitor, BR_SCRUB_STATE_ACTIVE);
+        return 0;
+    }
+
+    (void)br_fsscan_activate(this);
+    return 0;
+}
+
+/**
+ * State machine handler: a pause arrived while scrubbing is in progress.
+ * Log it and move the monitor to BR_SCRUB_STATE_STALLED. Always returns 0.
+ */
+int
+br_scrub_ssm_state_stall(xlator_t *this)
+{
+    br_private_t *priv = this->private;
+
+    gf_msg(this->name, GF_LOG_INFO, 0, BRB_MSG_GENERIC_SSM_INFO,
+           "Volume is under active scrubbing. Pausing scrub..");
+    _br_monitor_set_scrub_state(&priv->scrub_monitor, BR_SCRUB_STATE_STALLED);
+
+    return 0;
+}
+
+/*
+ * Scrub state machine transition table.
+ *
+ * Rows are indexed by the current br_scrub_state_t, columns by the incoming
+ * br_scrub_event_t in enum order: SCHEDULE, PAUSE, ONDEMAND. Each cell is
+ * the handler br_scrub_state_machine() invokes for that combination.
+ */
+static br_scrub_ssm_call *br_scrub_ssm[BR_SCRUB_MAXSTATES][BR_SCRUB_MAXEVENTS] =
+    {
+        /* INACTIVE */
+        {br_fsscan_schedule, br_scrub_ssm_state_ipause, br_scrub_ssm_noop},
+        /* PENDING  */
+        {br_fsscan_reschedule, br_fsscan_deactivate, br_fsscan_ondemand},
+        /* ACTIVE   */
+        {br_scrub_ssm_noop, br_scrub_ssm_state_stall, br_scrub_ssm_noop},
+        /* PAUSED   */
+        {br_fsscan_activate, br_scrub_ssm_noop, br_scrub_ssm_noop},
+        /* IPAUSED  */
+        {br_fsscan_schedule, br_scrub_ssm_noop, br_scrub_ssm_noop},
+        /* STALLED  */
+        {br_scrub_ssm_state_active, br_scrub_ssm_noop, br_scrub_ssm_noop},
+};
+
+/**
+ * Drive the scrub state machine by one step.
+ *
+ * The event is BR_SCRUB_EVENT_ONDEMAND when @scrub_ondemand is set,
+ * otherwise it is derived from the scrubber configuration. The handler
+ * registered for (current state, event) is invoked and its return value
+ * is passed back to the caller.
+ */
+int32_t
+br_scrub_state_machine(xlator_t *this, gf_boolean_t scrub_ondemand)
+{
+    br_private_t *priv = this->private;
+    struct br_monitor *monitor = &priv->scrub_monitor;
+    br_scrub_state_t state = monitor->state;
+    br_scrub_event_t event = BR_SCRUB_EVENT_ONDEMAND;
+
+    if (!scrub_ondemand)
+        event = _br_child_get_scrub_event(&priv->fsscrub);
+
+    return br_scrub_ssm[state][event](this);
+}
diff --git a/xlators/features/bit-rot/src/bitd/bit-rot-ssm.h b/xlators/features/bit-rot/src/bitd/bit-rot-ssm.h
new file mode 100644
index 00000000000..37b45a42eac
--- /dev/null
+++ b/xlators/features/bit-rot/src/bitd/bit-rot-ssm.h
@@ -0,0 +1,38 @@
+/*
+   Copyright (c) 2015 Red Hat, Inc. <http://www.redhat.com>
+   This file is part of GlusterFS.
+
+   This file is licensed to you under your choice of the GNU Lesser
+   General Public License, version 3 or any later version (LGPLv3 or
+   later), or the GNU General Public License, version 2 (GPLv2), in all
+   cases as published by the Free Software Foundation.
+*/
+
+#ifndef __BIT_ROT_SSM_H__
+#define __BIT_ROT_SSM_H__
+
+#include <glusterfs/xlator.h>
+
+/*
+ * States of the scrub state machine. The values index the rows of the
+ * transition table in bit-rot-ssm.c, so their order must match it.
+ */
+typedef enum br_scrub_state {
+    BR_SCRUB_STATE_INACTIVE = 0, /* state set when the monitor is initialised */
+    BR_SCRUB_STATE_PENDING,
+    BR_SCRUB_STATE_ACTIVE,  /* scrubbing in progress (or resumed) */
+    BR_SCRUB_STATE_PAUSED,  /* set by br_scrub_ssm_state_pause() */
+    BR_SCRUB_STATE_IPAUSED, /* pause requested while INACTIVE */
+    BR_SCRUB_STATE_STALLED, /* pause requested while ACTIVE */
+    BR_SCRUB_MAXSTATES,     /* number of states; sizes the table */
+} br_scrub_state_t;
+
+/*
+ * Events fed to the scrub state machine. The values index the columns of
+ * the transition table in bit-rot-ssm.c, so their order must match it.
+ */
+typedef enum br_scrub_event {
+    BR_SCRUB_EVENT_SCHEDULE = 0,
+    BR_SCRUB_EVENT_PAUSE,
+    BR_SCRUB_EVENT_ONDEMAND, /* chosen when an on-demand scrub is requested */
+    BR_SCRUB_MAXEVENTS,      /* number of events; sizes the table */
+} br_scrub_event_t;
+
+struct br_monitor;
+
+int32_t
+br_scrub_state_machine(xlator_t *, gf_boolean_t);
+
+#endif /* __BIT_ROT_SSM_H__ */
diff --git a/xlators/features/bit-rot/src/bitd/bit-rot.c b/xlators/features/bit-rot/src/bitd/bit-rot.c
new file mode 100644
index 00000000000..a2f1c343a1d
--- /dev/null
+++ b/xlators/features/bit-rot/src/bitd/bit-rot.c
@@ -0,0 +1,2232 @@
+/*
+   Copyright (c) 2015 Red Hat, Inc. <http://www.redhat.com>
+   This file is part of GlusterFS.
+
+   This file is licensed to you under your choice of the GNU Lesser
+   General Public License, version 3 or any later version (LGPLv3 or
+   later), or the GNU General Public License, version 2 (GPLv2), in all
+   cases as published by the Free Software Foundation.
+*/
+
+#include <ctype.h>
+
+#include <glusterfs/logging.h>
+#include <glusterfs/compat-errno.h>
+
+#include "bit-rot.h"
+#include "bit-rot-scrub.h"
+#include <pthread.h>
+#include "bit-rot-bitd-messages.h"
+
+#define BR_HASH_CALC_READ_SIZE (128 * 1024)
+
+/* Signature of a per-child handler: takes the xlator and the child. */
+typedef int32_t(br_child_handler)(xlator_t *, br_child_t *);
+
+/*
+ * A queued child event: bundles the handler to run with the xlator and
+ * child it applies to.
+ */
+struct br_child_event {
+    xlator_t *this; /* bit-rot xlator */
+
+    br_child_t *child; /* child the event is about */
+
+    br_child_handler *call; /* handler to invoke for this event */
+
+    struct list_head list; /* list linkage */
+};
+
+/**
+ * Find the slot of @child in the private children array of @this.
+ *
+ * Returns the index of the entry whose xlator is @child, or -1 when an
+ * argument fails validation or no entry matches.
+ */
+static int
+br_find_child_index(xlator_t *this, xlator_t *child)
+{
+    br_private_t *priv = NULL;
+    int idx = 0;
+
+    GF_VALIDATE_OR_GOTO("bit-rot", this, out);
+    GF_VALIDATE_OR_GOTO(this->name, this->private, out);
+    GF_VALIDATE_OR_GOTO(this->name, child, out);
+
+    priv = this->private;
+
+    for (idx = 0; idx < priv->child_count; idx++) {
+        if (priv->children[idx].xl == child)
+            return idx;
+    }
+
+out:
+    return -1;
+}
+
+/**
+ * Return the child whose brick path equals @brick_path.
+ *
+ * The children array is searched under priv->lock. Returns NULL when an
+ * argument fails validation or no child matches.
+ */
+br_child_t *
+br_get_child_from_brick_path(xlator_t *this, char *brick_path)
+{
+    br_private_t *priv = NULL;
+    br_child_t *found = NULL;
+    int idx = 0;
+
+    GF_VALIDATE_OR_GOTO("bit-rot", this, out);
+    GF_VALIDATE_OR_GOTO(this->name, this->private, out);
+    GF_VALIDATE_OR_GOTO(this->name, brick_path, out);
+
+    priv = this->private;
+
+    pthread_mutex_lock(&priv->lock);
+    {
+        /* stop at the first child with a matching brick path */
+        for (idx = 0; !found && idx < priv->child_count; idx++) {
+            if (strcmp(priv->children[idx].brick_path, brick_path) == 0)
+                found = &priv->children[idx];
+        }
+    }
+    pthread_mutex_unlock(&priv->lock);
+
+out:
+    return found;
+}
+
+/**
+ * Brick initialisation hook. Currently hands the brick spec back
+ * unchanged; the brick will probably be encapsulated inside our own
+ * structure when needed -- later.
+ */
+void *
+br_brick_init(void *xl, struct gf_brick_spec *brick)
+{
+    return brick;
+}
+
+/**
+ * Brick teardown hook. Currently a no-op; clean up here whatever
+ * br_brick_init() ends up allocating.
+ */
+void
+br_brick_fini(void *xl, char *brick, void *data)
+{
+    return;
+}
+
+/**
+ * Allocate and fill a signature object for @object from the raw digest
+ * @sign of @hashlen bytes and type @hashtype.
+ *
+ * The buffer is sized for the digest plus one terminating NUL byte.
+ * Returns the new signature (owned by the caller) or NULL on allocation
+ * failure.
+ *
+ * TODO: Signature can contain null terminators which causes bitrot
+ * stub to store truncated hash as it depends on string length of
+ * the hash.
+ *
+ * FIX: Send the string length as part of the signature struct and
+ *      change stub to handle this change.
+ */
+static br_isignature_t *
+br_prepare_signature(const unsigned char *sign, unsigned long hashlen,
+                     int8_t hashtype, br_object_t *object)
+{
+    br_isignature_t *signature = NULL;
+
+    /* TODO: use mem-pool */
+    signature = GF_CALLOC(1, signature_size(hashlen + 1),
+                          gf_br_stub_mt_signature_t);
+    if (!signature)
+        return NULL;
+
+    /* object version */
+    signature->signedversion = object->signedversion;
+
+    /* signature length & type */
+    signature->signaturelen = hashlen;
+    signature->signaturetype = hashtype;
+
+    /*
+     * signature itself; the terminator goes in the last allocated byte
+     * (index hashlen) -- index hashlen + 1 is past the buffer.
+     */
+    memcpy(signature->signature, (char *)sign, hashlen);
+    signature->signature[hashlen] = '\0';
+
+    return signature;
+}
+
+/**
+ * Check whether an object carries the "bad object" marker xattr.
+ *
+ * The object is addressed by @fd when given, otherwise by @loc; at least
+ * one of them must be non-NULL. The xattr is fetched from @child's xlator.
+ *
+ * Returns _gf_true when the xattr fetch succeeds, _gf_false otherwise
+ * (including when the arguments fail validation).
+ */
+gf_boolean_t
+bitd_is_bad_file(xlator_t *this, br_child_t *child, loc_t *loc, fd_t *fd)
+{
+    int32_t ret = -1;
+    dict_t *xattr = NULL;
+    inode_t *inode = NULL;
+    gf_boolean_t bad_file = _gf_false;
+
+    GF_VALIDATE_OR_GOTO("bit-rot", this, out);
+    GF_VALIDATE_OR_GOTO(this->name, child, out);
+
+    /* without a loc or an fd there is nothing to query (or dereference) */
+    if (!loc && !fd)
+        goto out;
+
+    inode = (loc) ? loc->inode : fd->inode;
+
+    if (fd)
+        ret = syncop_fgetxattr(child->xl, fd, &xattr, BITROT_OBJECT_BAD_KEY,
+                               NULL, NULL);
+    else if (loc)
+        ret = syncop_getxattr(child->xl, loc, &xattr, BITROT_OBJECT_BAD_KEY,
+                              NULL, NULL);
+
+    /* the marker xattr being present means the object is corrupted */
+    if (!ret) {
+        gf_msg_debug(this->name, 0, "[GFID: %s] is marked corrupted",
+                     uuid_utoa(inode->gfid));
+        bad_file = _gf_true;
+    }
+
+    if (xattr)
+        dict_unref(xattr);
+
+out:
+    return bad_file;
+}
+
+/**
+ * Do a lookup on the gfid present within the object.
+ *
+ * The inode is taken from the child's inode table when already known,
+ * otherwise a new one is allocated for the lookup. On success @iatt holds
+ * the lookup result and *@linked_inode the inode returned by inode_link()
+ * (which may be NULL).
+ *
+ * Returns the syncop_lookup() result on success, -EINVAL on invalid
+ * arguments, -ENOMEM when no inode is available, or the negative lookup
+ * error.
+ */
+static int32_t
+br_object_lookup(xlator_t *this, br_object_t *object, struct iatt *iatt,
+                 inode_t **linked_inode)
+{
+    int ret = -EINVAL;
+    loc_t loc = {
+        0,
+    };
+    inode_t *inode = NULL;
+
+    GF_VALIDATE_OR_GOTO("bit-rot", this, out);
+    GF_VALIDATE_OR_GOTO(this->name, object, out);
+
+    inode = inode_find(object->child->table, object->gfid);
+
+    /* reuse the cached inode if there is one, else start from a fresh one */
+    if (inode)
+        loc.inode = inode;
+    else
+        loc.inode = inode_new(object->child->table);
+
+    if (!loc.inode) {
+        ret = -ENOMEM;
+        goto out;
+    }
+
+    gf_uuid_copy(loc.gfid, object->gfid);
+
+    ret = syncop_lookup(object->child->xl, &loc, iatt, NULL, NULL, NULL);
+    if (ret < 0)
+        goto out;
+
+    /*
+     * The file might have been deleted by the application
+     * after getting the event, but before doing a lookup.
+     * So use linked_inode after inode_link is done.
+     */
+    *linked_inode = inode_link(loc.inode, NULL, NULL, iatt);
+    if (*linked_inode)
+        inode_lookup(*linked_inode);
+
+out:
+    /* the scratch loc is released on every path */
+    loc_wipe(&loc);
+    return ret;
+}
+
+/**
+ * open the object with O_RDONLY flags and return the fd. How to let brick
+ * know that open is being done by bitd because syncop framework does not allow
+ * passing xdata -- may be use frame->root->pid itself.
+ *
+ * On success the bound fd is stored in *@openfd and 0 is returned. Returns
+ * -1 on invalid arguments, -EINVAL when the fd cannot be created, or the
+ * error returned by syncop_open().
+ */
+static int32_t
+br_object_open(xlator_t *this, br_object_t *object, inode_t *inode,
+               fd_t **openfd)
+{
+    int32_t ret = -1;
+    fd_t *fd = NULL;
+    loc_t loc = {
+        0,
+    };
+
+    GF_VALIDATE_OR_GOTO("bit-rot", this, out);
+    GF_VALIDATE_OR_GOTO(this->name, object, out);
+    GF_VALIDATE_OR_GOTO(this->name, inode, out);
+
+    ret = -EINVAL;
+    fd = fd_create(inode, 0);
+    if (!fd) {
+        gf_smsg(this->name, GF_LOG_ERROR, 0, BRB_MSG_FD_CREATE_FAILED,
+                "gfid=%s", uuid_utoa(inode->gfid), NULL);
+        goto out;
+    }
+
+    /* the loc takes its own reference; the caller keeps ownership of inode */
+    loc.inode = inode_ref(inode);
+    gf_uuid_copy(loc.gfid, inode->gfid);
+
+    ret = syncop_open(object->child->xl, &loc, O_RDONLY, fd, NULL, NULL);
+    if (ret) {
+        /* open failed: log it and drop the fd created above */
+        br_log_object(this, "open", inode->gfid, -ret);
+        fd_unref(fd);
+        fd = NULL;
+    } else {
+        fd_bind(fd);
+        *openfd = fd;
+    }
+
+    loc_wipe(&loc);
+
+out:
+    return ret;
+}
+
+/**
+ * Read up to @size bytes at @offset from the object open on @fd and feed
+ * the data into the running SHA256 context @sha256.
+ *
+ * Hashing of every returned buffer is bracketed by the token-bucket
+ * throttle (priv->tbf) so that checksum computation is rate limited.
+ *
+ * Returns the number of bytes read (0 at end of file) or -1 on failure.
+ */
+static int32_t
+br_object_read_block_and_sign(xlator_t *this, fd_t *fd, br_child_t *child,
+                              off_t offset, size_t size, SHA256_CTX *sha256)
+{
+    int32_t ret = -1;
+    tbf_t *tbf = NULL;
+    struct iovec *iovec = NULL;
+    struct iobref *iobref = NULL;
+    br_private_t *priv = NULL;
+    int count = 0;
+    int i = 0;
+
+    GF_VALIDATE_OR_GOTO("bit-rot", this, out);
+    GF_VALIDATE_OR_GOTO(this->name, fd, out);
+    GF_VALIDATE_OR_GOTO(this->name, fd->inode, out);
+    GF_VALIDATE_OR_GOTO(this->name, child, out);
+    GF_VALIDATE_OR_GOTO(this->name, this->private, out);
+
+    priv = this->private;
+
+    GF_VALIDATE_OR_GOTO(this->name, priv->tbf, out);
+    tbf = priv->tbf;
+
+    ret = syncop_readv(child->xl, fd, size, offset, 0, &iovec, &count, &iobref,
+                       NULL, NULL, NULL);
+
+    if (ret < 0) {
+        /* the syncop result carries the error number; errno may be stale */
+        gf_smsg(this->name, GF_LOG_ERROR, -ret, BRB_MSG_READV_FAILED,
+                "gfid=%s", uuid_utoa(fd->inode->gfid), NULL);
+        ret = -1;
+        goto out;
+    }
+
+    /* end of file: nothing to hash */
+    if (ret == 0)
+        goto out;
+
+    for (i = 0; i < count; i++) {
+        TBF_THROTTLE_BEGIN(tbf, TBF_OP_HASH, iovec[i].iov_len);
+        {
+            SHA256_Update(sha256, (const unsigned char *)(iovec[i].iov_base),
+                          iovec[i].iov_len);
+        }
+        /* close the throttled section; a second BEGIN charged tokens twice */
+        TBF_THROTTLE_END(tbf, TBF_OP_HASH, iovec[i].iov_len);
+    }
+
+out:
+    if (iovec)
+        GF_FREE(iovec);
+
+    if (iobref)
+        iobref_unref(iobref);
+
+    return ret;
+}
+
+/**
+ * Compute the SHA256 digest of the object open on @fd and store it in @md.
+ *
+ * The object is read from @child in blocks of BR_HASH_CALC_READ_SIZE bytes
+ * until a read returns 0. @md is written only when the whole object was
+ * read successfully.
+ *
+ * Returns 0 on success and a negative value on failure.
+ */
+int32_t
+br_calculate_obj_checksum(unsigned char *md, br_child_t *child, fd_t *fd,
+                          struct iatt *iatt)
+{
+    int32_t ret = -1;
+    off_t offset = 0;
+    size_t chunk = BR_HASH_CALC_READ_SIZE;
+    xlator_t *this = NULL;
+    SHA256_CTX sha256;
+
+    GF_VALIDATE_OR_GOTO("bit-rot", child, out);
+    GF_VALIDATE_OR_GOTO("bit-rot", iatt, out);
+    GF_VALIDATE_OR_GOTO("bit-rot", fd, out);
+
+    this = child->this;
+
+    SHA256_Init(&sha256);
+
+    /* keep hashing while reads return data; stop on EOF (0) or error (<0) */
+    do {
+        ret = br_object_read_block_and_sign(this, fd, child, offset, chunk,
+                                            &sha256);
+        if (ret < 0) {
+            gf_smsg(this->name, GF_LOG_ERROR, 0, BRB_MSG_BLOCK_READ_FAILED,
+                    "offset=%" PRIu64, offset, "object-gfid=%s",
+                    uuid_utoa(fd->inode->gfid), NULL);
+        } else {
+            offset += ret;
+        }
+    } while (ret > 0);
+
+    if (ret == 0)
+        SHA256_Final(md, &sha256);
+
+out:
+    return ret;
+}
+
+/**
+ * Compute the checksum of @object into @md by delegating to
+ * br_calculate_obj_checksum() with the object's child.
+ */
+static int32_t
+br_object_checksum(unsigned char *md, br_object_t *object, fd_t *fd,
+                   struct iatt *iatt)
+{
+    return br_calculate_obj_checksum(md, object->child, fd, iatt);
+}
+
+/**
+ * Compute the SHA256 checksum of an object and store it as the object's
+ * signature.
+ *
+ * Steps: allocate a digest buffer, compute the checksum through
+ * br_object_checksum(), wrap it into a signature with
+ * br_prepare_signature(), place that signature in a dict under
+ * GLUSTERFS_SET_OBJECT_SIGNATURE and send it with syncop_fsetxattr() on
+ * the object's child.
+ *
+ * Returns 0 only when every step succeeded; non-zero otherwise. All
+ * intermediate allocations are released before returning.
+ */
+static int32_t
+br_object_read_sign(inode_t *linked_inode, fd_t *fd, br_object_t *object,
+                    struct iatt *iatt)
+{
+    int32_t ret = -1;
+    xlator_t *this = NULL;
+    dict_t *xattr = NULL;
+    unsigned char *md = NULL;
+    br_isignature_t *sign = NULL;
+
+    GF_VALIDATE_OR_GOTO("bit-rot", object, out);
+    GF_VALIDATE_OR_GOTO("bit-rot", linked_inode, out);
+    GF_VALIDATE_OR_GOTO("bit-rot", fd, out);
+
+    this = object->this;
+
+    md = GF_MALLOC(SHA256_DIGEST_LENGTH, gf_common_mt_char);
+    if (!md) {
+        gf_smsg(this->name, GF_LOG_ERROR, ENOMEM, BRB_MSG_SAVING_HASH_FAILED,
+                "object-gfid=%s", uuid_utoa(fd->inode->gfid), NULL);
+        goto out;
+    }
+
+    ret = br_object_checksum(md, object, fd, iatt);
+    if (ret) {
+        gf_smsg(this->name, GF_LOG_ERROR, 0, BRB_MSG_CALC_CHECKSUM_FAILED,
+                "object-gfid=%s", uuid_utoa(linked_inode->gfid), NULL);
+        goto free_signature;
+    }
+
+    /*
+     * ret is 0 here. Re-arm it so that the two allocation failures below
+     * are reported as failures instead of leaking out as success.
+     */
+    ret = -1;
+
+    sign = br_prepare_signature(md, SHA256_DIGEST_LENGTH,
+                                BR_SIGNATURE_TYPE_SHA256, object);
+    if (!sign) {
+        gf_smsg(this->name, GF_LOG_ERROR, 0, BRB_MSG_GET_SIGN_FAILED,
+                "object-gfid=%s", uuid_utoa(fd->inode->gfid), NULL);
+        goto free_signature;
+    }
+
+    xattr = dict_for_key_value(GLUSTERFS_SET_OBJECT_SIGNATURE, (void *)sign,
+                               signature_size(SHA256_DIGEST_LENGTH), _gf_true);
+
+    if (!xattr) {
+        gf_smsg(this->name, GF_LOG_ERROR, 0, BRB_MSG_SET_SIGN_FAILED,
+                "dict-allocation object-gfid=%s", uuid_utoa(fd->inode->gfid),
+                NULL);
+        goto free_isign;
+    }
+
+    ret = syncop_fsetxattr(object->child->xl, fd, xattr, 0, NULL, NULL);
+    if (ret) {
+        gf_smsg(this->name, GF_LOG_ERROR, 0, BRB_MSG_SET_SIGN_FAILED,
+                "fsetxattr object-gfid=%s", uuid_utoa(fd->inode->gfid), NULL);
+        goto unref_dict;
+    }
+
+    /* ret == 0: fall through the cleanup ladder */
+
+unref_dict:
+    dict_unref(xattr);
+free_isign:
+    /* the signature buffer is released here, after the dict is unref'd */
+    GF_FREE(sign);
+free_signature:
+    GF_FREE(md);
+out:
+    return ret;
+}
+
+/**
+ * Tell whether @op_errno is a "soft" signing error, i.e. one of ENOENT,
+ * ESTALE or ENODATA. Returns 1 for those, 0 for anything else.
+ */
+static int
+br_object_sign_softerror(int32_t op_errno)
+{
+    if (op_errno == ENOENT)
+        return 1;
+
+    if (op_errno == ESTALE)
+        return 1;
+
+    return (op_errno == ENODATA);
+}
+
+/**
+ * Log a failed operation @op on the object identified by @gfid.
+ *
+ * Soft errors (see br_object_sign_softerror()) are only emitted at debug
+ * level; everything else is logged as an error together with @op_errno.
+ */
+void
+br_log_object(xlator_t *this, char *op, uuid_t gfid, int32_t op_errno)
+{
+    if (!br_object_sign_softerror(op_errno)) {
+        gf_smsg(this->name, GF_LOG_ERROR, op_errno, BRB_MSG_OP_FAILED, "op=%s",
+                op, "gfid=%s", uuid_utoa(gfid), NULL);
+        return;
+    }
+
+    gf_msg_debug(this->name, 0,
+                 "%s() failed on object %s "
+                 "[reason: %s]",
+                 op, uuid_utoa(gfid), strerror(op_errno));
+}
+
+/**
+ * Path flavoured variant of br_log_object(): log a failed operation @op on
+ * the object at @path.
+ *
+ * Soft errors (see br_object_sign_softerror()) go to the debug log, the
+ * rest is reported as an error carrying @op_errno.
+ */
+void
+br_log_object_path(xlator_t *this, char *op, const char *path, int32_t op_errno)
+{
+    if (!br_object_sign_softerror(op_errno)) {
+        gf_smsg(this->name, GF_LOG_ERROR, op_errno, BRB_MSG_OP_FAILED, "op=%s",
+                op, "path=%s", path, NULL);
+        return;
+    }
+
+    gf_msg_debug(this->name, 0,
+                 "%s() failed on object %s "
+                 "[reason: %s]",
+                 op, path, strerror(op_errno));
+}
+
+/**
+ * Trigger signing of an object on @child.
+ *
+ * A dict carrying BR_REOPEN_SIGN_HINT_KEY (BR_OBJECT_REOPEN when
+ * @need_reopen is _gf_true, BR_OBJECT_RESIGN otherwise) is sent through
+ * syncop_fsetxattr() on a freshly opened (O_RDWR) fd for @linked_inode.
+ * The calls are issued with the fs pid set to GF_CLIENT_PID_BITD.
+ *
+ * Nothing is returned; any failure ends up as a warning in the log.
+ */
+static void
+br_trigger_sign(xlator_t *this, br_child_t *child, inode_t *linked_inode,
+                loc_t *loc, gf_boolean_t need_reopen)
+{
+    pid_t bitd_pid = GF_CLIENT_PID_BITD;
+    dict_t *hint_dict = NULL;
+    fd_t *fd = NULL;
+    uint32_t val = 0;
+    int32_t ret = -1;
+
+    syncopctx_setfspid(&bitd_pid);
+
+    if (need_reopen == _gf_true)
+        val = BR_OBJECT_REOPEN;
+    else
+        val = BR_OBJECT_RESIGN;
+
+    hint_dict = dict_new();
+    if (!hint_dict)
+        goto out;
+
+    ret = dict_set_uint32(hint_dict, BR_REOPEN_SIGN_HINT_KEY, val);
+    if (ret)
+        goto cleanup_dict;
+
+    fd = fd_create(linked_inode, 0);
+    if (!fd) {
+        /* make sure the failure is visible to the warning at "out" */
+        ret = -1;
+        gf_smsg(this->name, GF_LOG_ERROR, 0, BRB_MSG_FD_CREATE_FAILED,
+                "gfid=%s", uuid_utoa(linked_inode->gfid), NULL);
+        goto cleanup_dict;
+    }
+
+    ret = syncop_open(child->xl, loc, O_RDWR, fd, NULL, NULL);
+    if (ret) {
+        br_log_object(this, "open", linked_inode->gfid, -ret);
+        goto unref_fd;
+    }
+
+    fd_bind(fd);
+
+    ret = syncop_fsetxattr(child->xl, fd, hint_dict, 0, NULL, NULL);
+    if (ret)
+        br_log_object(this, "fsetxattr", linked_inode->gfid, -ret);
+
+    /* success and failure both continue into the cleanup below */
+
+unref_fd:
+    fd_unref(fd);
+cleanup_dict:
+    dict_unref(hint_dict);
+out:
+    if (ret) {
+        gf_smsg(this->name, GF_LOG_WARNING, 0, BRB_MSG_TRIGGER_SIGN_FAILED,
+                "gfid=%s", uuid_utoa(linked_inode->gfid), "reopen-hint-val=%d",
+                val, NULL);
+    }
+}
+
+/**
+ * Re-trigger signing for @linked_inode: build a gfid based loc for the
+ * inode and hand it to br_trigger_sign() with the reopen hint turned off.
+ */
+static void
+br_object_resign(xlator_t *this, br_object_t *object, inode_t *linked_inode)
+{
+    br_child_t *child = object->child;
+    loc_t loc = {
+        0,
+    };
+
+    gf_uuid_copy(loc.gfid, linked_inode->gfid);
+    loc.inode = inode_ref(linked_inode);
+
+    br_trigger_sign(this, child, linked_inode, &loc, _gf_false);
+
+    /* drops the inode reference taken above */
+    loc_wipe(&loc);
+}
+
+/**
+ * Sign one object taken from the signing queue.
+ *
+ * The object is looked up first. If its (network byte order) sign_info
+ * says BR_SIGN_REOPEN_WAIT the object is only passed to
+ * br_object_resign(); otherwise it is opened and signed in place through
+ * br_object_read_sign().
+ *
+ * No priority scheduling is applied at this level, so signing may compete
+ * with client I/O.
+ *
+ * Returns 0 on success, non-zero otherwise.
+ */
+static int32_t
+br_sign_object(br_object_t *object)
+{
+    struct iatt iatt = {
+        0,
+    };
+    pid_t bitd_pid = GF_CLIENT_PID_BITD;
+    br_sign_state_t state = BR_SIGN_NORMAL;
+    xlator_t *this = NULL;
+    inode_t *linked_inode = NULL;
+    fd_t *fd = NULL;
+    int32_t ret = -1;
+
+    GF_VALIDATE_OR_GOTO("bit-rot", object, out);
+
+    this = object->this;
+
+    /**
+     * FIXME: signing is restricted to clients carrying a special
+     * frame->root->pid, hence the pid override. The way the client pid
+     * is set should be changed.
+     */
+    syncopctx_setfspid(&bitd_pid);
+
+    ret = br_object_lookup(this, object, &iatt, &linked_inode);
+    if (ret) {
+        br_log_object(this, "lookup", object->gfid, -ret);
+        goto out;
+    }
+
+    /**
+     * Objects that were notified for reopening are not signed here; the
+     * actual signing is triggered via br_object_resign().
+     */
+    state = ntohl(object->sign_info);
+    if (state == BR_SIGN_REOPEN_WAIT) {
+        br_object_resign(this, object, linked_inode);
+        goto unref_inode;
+    }
+
+    ret = br_object_open(this, object, linked_inode, &fd);
+    if (!fd) {
+        br_log_object(this, "open", object->gfid, -ret);
+        goto unref_inode;
+    }
+
+    /**
+     * An fd is open on the object now: from this point every file
+     * operation error is treated as a hard failure.
+     */
+    gf_msg_debug(this->name, 0, "Signing object [%s]",
+                 uuid_utoa(linked_inode->gfid));
+
+    ret = br_object_read_sign(linked_inode, fd, object, &iatt);
+    if (ret)
+        gf_smsg(this->name, GF_LOG_ERROR, 0, BRB_MSG_READ_AND_SIGN_FAILED,
+                "gfid=%s", uuid_utoa(linked_inode->gfid), NULL);
+
+    fd_unref(fd);
+unref_inode:
+    inode_unref(linked_inode);
+out:
+    return ret;
+}
+
+/**
+ * Detach and return the first object of the signing queue, waiting on
+ * priv->object_cond for as long as the queue is empty.
+ *
+ * The condition wait uses priv->lock, so the caller is expected to hold
+ * that mutex when calling in.
+ */
+static br_object_t *
+__br_pick_object(br_private_t *priv)
+{
+    br_object_t *picked = NULL;
+
+    for (;;) {
+        if (!list_empty(&priv->obj_queue->objects))
+            break;
+
+        pthread_cond_wait(&priv->object_cond, &priv->lock);
+    }
+
+    picked = list_first_entry(&priv->obj_queue->objects, br_object_t, list);
+    list_del_init(&picked->list);
+
+    return picked;
+}
+
+/**
+ * Signer thread body: endlessly pick objects off the signing queue and
+ * sign them. @arg is the bit-rot xlator.
+ *
+ * Failures other than soft errors are logged; the object is freed in
+ * either case. The loop never terminates on its own.
+ */
+void *
+br_process_object(void *arg)
+{
+    xlator_t *this = arg;
+    br_private_t *priv = this->private;
+    br_object_t *object = NULL;
+    int32_t ret = -1;
+
+    THIS = this;
+
+    while (1) {
+        pthread_mutex_lock(&priv->lock);
+        object = __br_pick_object(priv);
+        pthread_mutex_unlock(&priv->lock);
+
+        ret = br_sign_object(object);
+        if (ret) {
+            if (!br_object_sign_softerror(-ret))
+                gf_smsg(this->name, GF_LOG_ERROR, 0, BRB_MSG_SET_SIGN_FAILED,
+                        "gfid=%s", uuid_utoa(object->gfid), NULL);
+        }
+
+        GF_FREE(object);
+    }
+
+    return NULL;
+}
+
+/**
+ * Timer-wheel expiry callback (also called directly for quick signing):
+ * append the object passed in @data to the signing queue and wake up the
+ * threads waiting on priv->object_cond.
+ *
+ * This could be made cheaper with a timer-wheel API that dispatches _all_
+ * expired objects in one shot instead of one at a time; the routine would
+ * then boil down to a list_splice_tail().
+ *
+ * When @timer is non-NULL it is returned to its memory pool.
+ *
+ * NOTE: @call_time is unused for now; it could be used to instrument
+ * signing time in br_sign_object().
+ */
+void
+br_add_object_to_queue(struct gf_tw_timer_list *timer, void *data,
+                       unsigned long call_time)
+{
+    br_object_t *object = data;
+    xlator_t *this = object->this;
+    br_private_t *priv = this->private;
+
+    THIS = this;
+
+    pthread_mutex_lock(&priv->lock);
+    list_add_tail(&object->list, &priv->obj_queue->objects);
+    pthread_cond_broadcast(&priv->object_cond);
+    pthread_mutex_unlock(&priv->lock);
+
+    if (timer)
+        mem_put(timer);
+}
+
+/**
+ * Allocate a br_object_t describing the object named in the release event
+ * @ev and bind it to @this and @child.
+ *
+ * Returns the new object, or NULL when the allocation fails.
+ */
+static br_object_t *
+br_initialize_object(xlator_t *this, br_child_t *child, changelog_event_t *ev)
+{
+    br_object_t *obj = GF_CALLOC(1, sizeof(*obj), gf_br_mt_br_object_t);
+
+    if (!obj)
+        return NULL;
+
+    INIT_LIST_HEAD(&obj->list);
+
+    obj->this = this;
+    obj->child = child;
+    gf_uuid_copy(obj->gfid, ev->u.releasebr.gfid);
+
+    /* NOTE: both fields are copied as-is, i.e. still big endian */
+    obj->signedversion = ev->u.releasebr.version;
+    obj->sign_info = ev->u.releasebr.sign_info;
+
+    return obj;
+}
+
+/**
+ * Allocate a timer from the child's timer pool and add it to the timer
+ * wheel so that @object is passed to br_add_object_to_queue() on expiry.
+ *
+ * The expiry is priv->expiry_time, bumped to 1 when that value is zero.
+ * Returns the timer, or NULL when the allocation fails.
+ */
+static struct gf_tw_timer_list *
+br_initialize_timer(xlator_t *this, br_object_t *object, br_child_t *child,
+                    changelog_event_t *ev)
+{
+    br_private_t *priv = this->private;
+    struct gf_tw_timer_list *tw_timer = mem_get0(child->timer_pool);
+
+    if (!tw_timer)
+        return NULL;
+
+    INIT_LIST_HEAD(&tw_timer->entry);
+
+    tw_timer->data = object;
+    tw_timer->function = br_add_object_to_queue;
+
+    tw_timer->expires = priv->expiry_time;
+    if (!tw_timer->expires)
+        tw_timer->expires = 1;
+
+    gf_tw_add_timer(priv->timer_wheel, tw_timer);
+
+    return tw_timer;
+}
+
+/**
+ * Schedule @object on the timer wheel (see br_initialize_timer()).
+ *
+ * Returns 0 when the timer was set up, -1 (after logging) otherwise.
+ */
+static int32_t
+br_schedule_object_reopen(xlator_t *this, br_object_t *object,
+                          br_child_t *child, changelog_event_t *ev)
+{
+    if (br_initialize_timer(this, object, child, ev))
+        return 0;
+
+    gf_smsg(this->name, GF_LOG_ERROR, 0, BRB_MSG_SET_TIMER_FAILED, "gfid=%s",
+            uuid_utoa(object->gfid), NULL);
+    return -1;
+}
+
+/**
+ * Queue @object for signing right away, bypassing the timer wheel.
+ * Always returns 0.
+ */
+static int32_t
+br_object_quicksign(xlator_t *this, br_object_t *object)
+{
+    /* no timer is involved, hence the NULL timer and a zero call time */
+    br_add_object_to_queue(NULL, (void *)object, 0ULL);
+
+    return 0;
+}
+
+/**
+ * This callback function registered with the changelog is executed
+ * whenever a notification from the changelog is received. It creates an
+ * object for the gfid carried by the release event and either schedules
+ * it on the timer-wheel (sign_info == BR_SIGN_REOPEN_WAIT) or queues it
+ * for signing immediately.
+ *
+ * @xl is the bit-rot xlator, @brick the brick path used to find the
+ * matching child, @ev the release event. @data is not used here.
+ *
+ * TODO: use mem-pool for allocations and maybe allocate timer and
+ * object as a single alloc and bifurcate their respective pointers.
+ */
+void
+br_brick_callback(void *xl, char *brick, void *data, changelog_event_t *ev)
+{
+    int32_t ret = 0;
+    uuid_t gfid = {
+        0,
+    };
+    xlator_t *this = NULL;
+    br_object_t *object = NULL;
+    br_child_t *child = NULL;
+    br_sign_state_t sign_info = BR_SIGN_INVALID;
+
+    this = xl;
+
+    /* validate "this" first: the checks that follow dereference it */
+    GF_VALIDATE_OR_GOTO("bit-rot", this, out);
+    GF_VALIDATE_OR_GOTO(this->name, ev, out);
+    GF_VALIDATE_OR_GOTO(this->name, this->private, out);
+
+    GF_ASSERT(ev->ev_type == CHANGELOG_OP_TYPE_BR_RELEASE);
+    GF_ASSERT(!gf_uuid_is_null(ev->u.releasebr.gfid));
+
+    gf_uuid_copy(gfid, ev->u.releasebr.gfid);
+
+    gf_msg_debug(this->name, 0, "RELEASE EVENT [GFID %s]", uuid_utoa(gfid));
+
+    child = br_get_child_from_brick_path(this, brick);
+    if (!child) {
+        gf_smsg(this->name, GF_LOG_ERROR, 0, BRB_MSG_GET_SUBVOL_FAILED,
+                "brick=%s", brick, NULL);
+        goto out;
+    }
+
+    object = br_initialize_object(this, child, ev);
+    if (!object) {
+        gf_smsg(this->name, GF_LOG_ERROR, ENOMEM, BRB_MSG_NO_MEMORY,
+                "object-gfid=%s", uuid_utoa(gfid), NULL);
+        goto out;
+    }
+
+    /* sanity check: sign_info is kept in network byte order */
+    sign_info = ntohl(object->sign_info);
+    GF_ASSERT(sign_info != BR_SIGN_NORMAL);
+
+    if (sign_info == BR_SIGN_REOPEN_WAIT)
+        ret = br_schedule_object_reopen(this, object, child, ev);
+    else
+        ret = br_object_quicksign(this, object);
+
+    if (ret)
+        goto free_object;
+
+    /* the object is now referenced by the timer or the signing queue */
+    gf_msg_debug(this->name, 0, "->callback: brick [%s], type [%d]\n", brick,
+                 ev->ev_type);
+    return;
+
+free_object:
+    GF_FREE(object);
+out:
+    return;
+}
+
+/**
+ * Populate a changelog brick specification for @path: release events
+ * only, bit-rot init/fini/callback handlers, and no connect/disconnect
+ * handlers.
+ *
+ * brick->brick_path receives a gf_strdup()'d copy of @path; the result of
+ * that allocation is not checked here.
+ */
+void
+br_fill_brick_spec(struct gf_brick_spec *brick, char *path)
+{
+    brick->filter = CHANGELOG_OP_TYPE_BR_RELEASE;
+    brick->brick_path = gf_strdup(path);
+
+    /* event handlers */
+    brick->callback = br_brick_callback;
+    brick->init = br_brick_init;
+    brick->fini = br_brick_fini;
+
+    /* connection state changes are not tracked through the spec */
+    brick->connected = NULL;
+    brick->disconnected = NULL;
+}
+
+/**
+ * Decide from the signature stored in @xattr (under
+ * GLUSTERFS_GET_OBJECT_SIGNATURE) whether an object needs signing.
+ *
+ * Returns _gf_true when the signature is marked stale, _gf_false when it
+ * is not, when an argument is invalid or when the signature cannot be
+ * fetched from the dict.
+ */
+static gf_boolean_t
+br_check_object_need_sign(xlator_t *this, dict_t *xattr, br_child_t *child)
+{
+    br_isignature_out_t *sign = NULL;
+    gf_boolean_t need_sign = _gf_false;
+
+    GF_VALIDATE_OR_GOTO("bit-rot", this, out);
+    GF_VALIDATE_OR_GOTO(this->name, xattr, out);
+    GF_VALIDATE_OR_GOTO(this->name, child, out);
+
+    if (dict_get_ptr(xattr, GLUSTERFS_GET_OBJECT_SIGNATURE, (void **)&sign)) {
+        gf_smsg(this->name, GF_LOG_ERROR, 0, BRB_MSG_GET_SIGN_FAILED,
+                "object-info", NULL);
+        goto out;
+    }
+
+    /* a stale signature is what makes the object a signing candidate */
+    if (sign->stale)
+        need_sign = _gf_true;
+
+out:
+    return need_sign;
+}
+
+/**
+ * Fill @loc for the directory entry @entry found under @parent.
+ *
+ * The inode is taken from the child's inode table when already known
+ * (inode_grep()), otherwise a new one is allocated. On the way the loc's
+ * parent, pargfid, path and name are set up.
+ *
+ * Returns:
+ *    1  @loc is ready to be used
+ *    0  the entry is already known not to be a regular file (skip it)
+ *   <0  failure (inode allocation or path construction)
+ *
+ * Whatever has been stored in @loc is left for the caller to release
+ * (e.g. with loc_wipe()), on every return path.
+ */
+int32_t
+br_prepare_loc(xlator_t *this, br_child_t *child, loc_t *parent,
+               gf_dirent_t *entry, loc_t *loc)
+{
+    int32_t ret = -1;
+    inode_t *inode = NULL;
+
+    inode = inode_grep(child->table, parent->inode, entry->d_name);
+    if (!inode) {
+        loc->inode = inode_new(child->table);
+        if (!loc->inode) {
+            /* report the allocation failure instead of a half-built loc */
+            gf_msg_debug(this->name, 0, "inode allocation failed for %s",
+                         entry->d_name);
+            goto out;
+        }
+    } else {
+        loc->inode = inode;
+        if (loc->inode->ia_type != IA_IFREG) {
+            gf_msg_debug(this->name, 0,
+                         "%s is not a regular "
+                         "file",
+                         entry->d_name);
+            ret = 0;
+            goto out;
+        }
+    }
+
+    loc->parent = inode_ref(parent->inode);
+    gf_uuid_copy(loc->pargfid, parent->inode->gfid);
+
+    ret = inode_path(parent->inode, entry->d_name, (char **)&loc->path);
+    if (ret < 0 || !loc->path) {
+        gf_smsg(this->name, GF_LOG_ERROR, 0, BRB_MSG_PATH_FAILED,
+                "inode_path=%s", entry->d_name, "parent-gfid=%s",
+                uuid_utoa(parent->inode->gfid), NULL);
+        /*
+         * A missing path with a non-negative return must not look like
+         * "skip" (0) or "ready" (> 0) to the caller.
+         */
+        if (ret >= 0)
+            ret = -1;
+        goto out;
+    }
+
+    loc->name = strrchr(loc->path, '/');
+    if (loc->name)
+        loc->name++;
+
+    ret = 1;
+
+out:
+    return ret;
+}
+
+/**
+ * Oneshot crawler
+ * ---------------
+ * This is a catchup mechanism. Objects that remained unsigned from the
+ * last run for whatever reason (node crashes, reboots, etc..) become
+ * candidates for signing. This allows the signature to "catch up" with
+ * the current state of the object. Signing is triggered through
+ * br_trigger_sign().
+ *
+ * Called once per directory entry: @entry is the entry, @parent the loc
+ * of its directory and @data the br_child_t being crawled.
+ *
+ * Returns 0 when the entry was handled or deliberately skipped, non-zero
+ * when it could not be processed.
+ */
+int
+bitd_oneshot_crawl(xlator_t *subvol, gf_dirent_t *entry, loc_t *parent,
+                   void *data)
+{
+    int op_errno = 0;
+    br_child_t *child = NULL;
+    xlator_t *this = NULL;
+    loc_t loc = {
+        0,
+    };
+    struct iatt iatt = {
+        0,
+    };
+    struct iatt parent_buf = {
+        0,
+    };
+    dict_t *xattr = NULL;
+    int32_t ret = -1;
+    inode_t *linked_inode = NULL;
+    gf_boolean_t need_signing = _gf_false;
+    gf_boolean_t need_reopen = _gf_true;
+
+    GF_VALIDATE_OR_GOTO("bit-rot", subvol, out);
+    GF_VALIDATE_OR_GOTO("bit-rot", data, out);
+
+    child = data;
+    this = child->this;
+
+    /*
+     * 0 means "not a regular file, skip"; a negative value means the loc
+     * could not be built. In neither case is there anything to look up.
+     */
+    ret = br_prepare_loc(this, child, parent, entry, &loc);
+    if (ret <= 0)
+        goto out;
+
+    ret = syncop_lookup(child->xl, &loc, &iatt, &parent_buf, NULL, NULL);
+    if (ret) {
+        br_log_object_path(this, "lookup", loc.path, -ret);
+        goto out;
+    }
+
+    linked_inode = inode_link(loc.inode, parent->inode, loc.name, &iatt);
+    if (linked_inode)
+        inode_lookup(linked_inode);
+
+    if (iatt.ia_type != IA_IFREG) {
+        gf_msg_debug(this->name, 0,
+                     "%s is not a regular file, "
+                     "skipping..",
+                     entry->d_name);
+        ret = 0;
+        goto unref_inode;
+    }
+
+    /* everything below dereferences linked_inode */
+    if (!linked_inode) {
+        gf_msg_debug(this->name, 0, "inode link failed for %s, skipping..",
+                     entry->d_name);
+        ret = -1;
+        goto out;
+    }
+
+    /**
+     * As of now, 2 cases are possible and handled.
+     * 1) GlusterFS is upgraded from a previous version which does not
+     *    have any idea about bit-rot and have data in the filesystem.
+     *    In this case syncop_getxattr fails with ENODATA and the object
+     *    is signed. (In real, when crawler sends lookup, bit-rot-stub
+     *    creates the xattrs before returning lookup reply)
+     * 2) Bit-rot was not enabled or BitD was down for some reason, during
+     *    which some files were created, but since BitD was down, were not
+     *    signed.
+     * If the file was just created and was being written some data when
+     * the down BitD came up, then bit-rot stub should be intelligent to
+     * identify this case (by comparing the ongoing version or by checking
+     * if there are any fds present for that inode) and handle properly.
+     */
+
+    if (bitd_is_bad_file(this, child, &loc, NULL)) {
+        gf_smsg(this->name, GF_LOG_WARNING, 0, BRB_MSG_SKIP_OBJECT, "path=%s",
+                loc.path, NULL);
+        goto unref_inode;
+    }
+
+    ret = syncop_getxattr(child->xl, &loc, &xattr,
+                          GLUSTERFS_GET_OBJECT_SIGNATURE, NULL, NULL);
+    if (ret < 0) {
+        op_errno = -ret;
+        br_log_object(this, "getxattr", linked_inode->gfid, op_errno);
+
+        /**
+         * No need to sign the zero byte objects as the signing
+         * happens upon first modification of the object.
+         */
+        if (op_errno == ENODATA && (iatt.ia_size != 0))
+            need_signing = _gf_true;
+        if (op_errno == EINVAL)
+            gf_smsg(this->name, GF_LOG_WARNING, 0,
+                    BRB_MSG_PARTIAL_VERSION_PRESENCE, "gfid=%s",
+                    uuid_utoa(linked_inode->gfid), NULL);
+    } else {
+        need_signing = br_check_object_need_sign(this, xattr, child);
+
+        /*
+         * If we are here means, bitrot daemon has started. Is it just
+         * a simple restart of the daemon or is it started because the
+         * feature is enabled is something hard to determine. Hence,
+         * if need_signing is false (because bit-rot version and signature
+         * are present), then still go ahead and sign it.
+         */
+        if (!need_signing) {
+            need_signing = _gf_true;
+            need_reopen = _gf_true;
+        }
+    }
+
+    if (!need_signing)
+        goto unref_dict;
+
+    gf_smsg(this->name, GF_LOG_INFO, 0, BRB_MSG_TRIGGER_SIGN, "path=%s",
+            loc.path, "gfid=%s", uuid_utoa(linked_inode->gfid), "Brick-path=%s",
+            child->brick_path, NULL);
+    br_trigger_sign(this, child, linked_inode, &loc, need_reopen);
+
+    ret = 0;
+
+unref_dict:
+    if (xattr)
+        dict_unref(xattr);
+unref_inode:
+    /* may still be NULL on the "not a regular file" path */
+    if (linked_inode)
+        inode_unref(linked_inode);
+out:
+    loc_wipe(&loc);
+
+    return ret;
+}
+
+/* throttle arguments handed to syncop_ftw_throttle() by the crawler */
+#define BR_CRAWL_THROTTLE_COUNT 50
+#define BR_CRAWL_THROTTLE_ZZZ 5
+
+/**
+ * Thread body of the one-shot crawler: walk the child's namespace from
+ * its root with syncop_ftw_throttle(), invoking bitd_oneshot_crawl() on
+ * the entries. @arg is the br_child_t to crawl.
+ *
+ * The result of the walk is deliberately ignored. Always returns NULL.
+ */
+void *
+br_oneshot_signer(void *arg)
+{
+    br_child_t *child = arg;
+    xlator_t *this = child->this;
+    loc_t root_loc = {
+        0,
+    };
+
+    THIS = this;
+
+    gf_smsg(this->name, GF_LOG_INFO, 0, BRB_MSG_CRAWLING_START, "brick-path=%s",
+            child->brick_path, NULL);
+
+    /* the root inode is borrowed as-is: no reference is taken here */
+    root_loc.inode = child->table->root;
+    (void)syncop_ftw_throttle(child->xl, &root_loc, GF_CLIENT_PID_BITD, child,
+                              bitd_oneshot_crawl, BR_CRAWL_THROTTLE_COUNT,
+                              BR_CRAWL_THROTTLE_ZZZ);
+
+    gf_smsg(this->name, GF_LOG_INFO, 0, BRB_MSG_CRAWLING_FINISH,
+            "brick-path=%s", child->brick_path, NULL);
+
+    return NULL;
+}
+
+/**
+ * Locked flavour of _br_set_child_state(): switch @child to @state while
+ * holding child->lock.
+ */
+static void
+br_set_child_state(br_child_t *child, br_child_state_t state)
+{
+    pthread_mutex_lock(&child->lock);
+    _br_set_child_state(child, state);
+    pthread_mutex_unlock(&child->lock);
+}
+
+/**
+ * At this point a thread is spawned to crawl the filesystem (in
+ * tortoise pace) to sign objects that were not signed in previous run(s).
+ * Such objects are identified by examining it's dirtyness and timestamp.
+ *
+ *    pick object:
+ *       signature_is_stale() && (object_timestamp() <= stub_init_time())
+ *
+ * Also, we register to the changelog library to subscribe for event
+ * notifications.
+ *
+ * Returns 0 once the registration succeeded (a crawler thread that could
+ * not be spawned is only warned about), -1 when the brick specification
+ * could not be set up or registered.
+ */
+static int32_t
+br_enact_signer(xlator_t *this, br_child_t *child, br_stub_init_t *stub)
+{
+    int32_t ret = 0;
+    br_private_t *priv = NULL;
+    struct gf_brick_spec *brick = NULL;
+
+    priv = this->private;
+
+    brick = GF_CALLOC(1, sizeof(struct gf_brick_spec),
+                      gf_common_mt_gf_brick_spec_t);
+    if (!brick)
+        goto error_return;
+
+    br_fill_brick_spec(brick, stub->export);
+    if (!brick->brick_path) {
+        /* br_fill_brick_spec() duplicates the path; that can fail */
+        gf_smsg(this->name, GF_LOG_ERROR, ENOMEM, BRB_MSG_NO_MEMORY,
+                "brick-path=%s", stub->export, NULL);
+        goto dealloc;
+    }
+
+    ret = gf_changelog_register_generic(brick, 1, 1,
+                                        this->ctx->cmd_args.log_file, -1, this);
+    if (ret) {
+        gf_smsg(this->name, GF_LOG_ERROR, errno, BRB_MSG_REGISTER_FAILED, NULL);
+        goto dealloc;
+    }
+
+    child->threadrunning = 0;
+    ret = gf_thread_create(&child->thread, NULL, br_oneshot_signer, child,
+                           "brosign");
+    if (ret)
+        gf_smsg(this->name, GF_LOG_WARNING, 0, BRB_MSG_SPAWN_FAILED,
+                "FS-crawler-thread", NULL);
+    else
+        child->threadrunning = 1;
+
+    /* it's OK to continue, "old" objects would be signed when modified */
+    list_add_tail(&child->list, &priv->signing);
+
+    /*
+     * NOTE(review): the brick spec is not released on success; whether
+     * the changelog library keeps using it cannot be told from here.
+     */
+    return 0;
+
+dealloc:
+    /* the duplicated path belongs to the spec: free it with the spec */
+    GF_FREE(brick->brick_path);
+    GF_FREE(brick);
+error_return:
+    return -1;
+}
+
+/**
+ * Start scrubbing for @child: spawn the filesystem scanner thread
+ * (br_fsscanner), signal the scrub monitor if it has not been initialised
+ * yet, and add the child to the scrubber's list.
+ *
+ * Returns 0 on success, -1 when the scanner thread could not be created.
+ */
+static int32_t
+br_launch_scrubber(xlator_t *this, br_child_t *child, struct br_scanfs *fsscan,
+                   struct br_scrubber *fsscrub)
+{
+    br_private_t *priv = this->private;
+    struct br_monitor *monitor = &priv->scrub_monitor;
+    int32_t ret = -1;
+
+    ret = gf_thread_create(&child->thread, NULL, br_fsscanner, child,
+                           "brfsscan");
+    if (ret != 0) {
+        gf_smsg(this->name, GF_LOG_ALERT, 0, BRB_MSG_SPAWN_FAILED,
+                "bitrot-scrubber-daemon Brick-path=%s", child->brick_path,
+                NULL);
+        return -1;
+    }
+
+    /* signal the monitor (first launch only) to kick off its state machine */
+    pthread_mutex_lock(&monitor->mutex);
+    if (!monitor->inited)
+        pthread_cond_signal(&monitor->cond);
+    monitor->inited = _gf_true;
+    pthread_mutex_unlock(&monitor->mutex);
+
+    /**
+     * Setup is complete: make this subvolume visible to the scrubbers
+     * and wake them up.
+     */
+    pthread_mutex_lock(&fsscrub->mutex);
+    list_add_tail(&child->list, &fsscrub->scrublist);
+    pthread_cond_broadcast(&fsscrub->cond);
+    pthread_mutex_unlock(&fsscrub->mutex);
+
+    return 0;
+}
+
+/**
+ * Prepare @child for scrubbing and launch the scrubber on it.
+ *
+ * The scanner's locks, condition variable and lists are initialised only
+ * for a child that has not witnessed a successful connection before; they
+ * are torn down again if launching the scrubber fails in that case.
+ *
+ * Returns 0 on success, -1 on failure.
+ */
+static int32_t
+br_enact_scrubber(xlator_t *this, br_child_t *child)
+{
+    br_private_t *priv = this->private;
+    struct br_scanfs *fsscan = &child->fsscan;
+    struct br_scrubber *fsscrub = &priv->fsscrub;
+
+    /**
+     * A child that connected successfully earlier already has its
+     * mutexes, condvars, etc.. set up: go straight to the launch.
+     */
+    if (_br_child_witnessed_connection(child))
+        return br_launch_scrubber(this, child, fsscan, fsscrub);
+
+    LOCK_INIT(&fsscan->entrylock);
+    pthread_mutex_init(&fsscan->waitlock, NULL);
+    pthread_cond_init(&fsscan->waitcond, NULL);
+
+    fsscan->entries = 0;
+    INIT_LIST_HEAD(&fsscan->queued);
+    INIT_LIST_HEAD(&fsscan->ready);
+
+    if (br_launch_scrubber(this, child, fsscan, fsscrub) == 0)
+        return 0;
+
+    /* launch failed: undo the initialisation done above */
+    LOCK_DESTROY(&fsscan->entrylock);
+    pthread_mutex_destroy(&fsscan->waitlock);
+    pthread_cond_destroy(&fsscan->waitcond);
+
+    return -1;
+}
+
+/**
+ * Bring @child into service, as a scrubber or as a signer depending on
+ * priv->iamscrubber, while holding child->lock.
+ *
+ * On success the child is flagged as having witnessed a connection and
+ * moved to BR_CHILD_STATE_CONNECTED. Returns the enact routine's result.
+ */
+static int32_t
+br_child_enaction(xlator_t *this, br_child_t *child, br_stub_init_t *stub)
+{
+    br_private_t *priv = this->private;
+    int32_t ret = -1;
+
+    pthread_mutex_lock(&child->lock);
+
+    ret = priv->iamscrubber ? br_enact_scrubber(this, child)
+                            : br_enact_signer(this, child, stub);
+    if (ret == 0) {
+        child->witnessed = 1;
+        _br_set_child_state(child, BR_CHILD_STATE_CONNECTED);
+        gf_smsg(this->name, GF_LOG_INFO, 0, BRB_MSG_CONNECTED_TO_BRICK,
+                "brick-path=%s", child->brick_path, NULL);
+    }
+
+    pthread_mutex_unlock(&child->lock);
+
+    return ret;
+}
+
+/**
+ * This routine fetches various attributes associated with a child which
+ * is basically a subvolume. Attributes include brick path and the stub
+ * birth time. This is done by performing a lookup on the root followed
+ * by getxattr() on a virtual key. Depending on the configuration, the
+ * process either acts as a signer or a scrubber.
+ *
+ * Returns 0 on success. On failure a non-zero value is returned and the
+ * child (when there is one) is moved to BR_CHILD_STATE_CONNFAILED.
+ */
+int32_t
+br_brick_connect(xlator_t *this, br_child_t *child)
+{
+    int32_t ret = -1;
+    loc_t loc = {
+        0,
+    };
+    struct iatt buf = {
+        0,
+    };
+    struct iatt parent = {
+        0,
+    };
+    br_stub_init_t *stub = NULL;
+    dict_t *xattr = NULL;
+    int op_errno = 0;
+
+    GF_VALIDATE_OR_GOTO("bit-rot", this, out);
+    GF_VALIDATE_OR_GOTO(this->name, child, out);
+    GF_VALIDATE_OR_GOTO(this->name, this->private, out);
+
+    br_child_set_scrub_state(child, _gf_false);
+    br_set_child_state(child, BR_CHILD_STATE_INITIALIZING);
+
+    loc.inode = inode_ref(child->table->root);
+    gf_uuid_copy(loc.gfid, loc.inode->gfid);
+    loc.path = gf_strdup("/");
+
+    ret = syncop_lookup(child->xl, &loc, &buf, &parent, NULL, NULL);
+    if (ret) {
+        op_errno = -ret;
+        ret = -1;
+        gf_smsg(this->name, GF_LOG_ERROR, op_errno, BRB_MSG_LOOKUP_FAILED,
+                NULL);
+        goto wipeloc;
+    }
+
+    ret = syncop_getxattr(child->xl, &loc, &xattr,
+                          GLUSTERFS_GET_BR_STUB_INIT_TIME, NULL, NULL);
+    if (ret) {
+        op_errno = -ret;
+        ret = -1;
+        gf_smsg(this->name, GF_LOG_ERROR, op_errno, BRB_MSG_GET_INFO_FAILED,
+                NULL);
+        goto wipeloc;
+    }
+
+    ret = dict_get_ptr(xattr, GLUSTERFS_GET_BR_STUB_INIT_TIME, (void **)&stub);
+    if (ret) {
+        gf_smsg(this->name, GF_LOG_ERROR, 0, BRB_MSG_GET_INFO_FAILED, NULL);
+        goto free_dict;
+    }
+
+    /*
+     * NOTE(review): assumes child->brick_path is large enough to hold
+     * stub->export including its terminator -- confirm against the
+     * structure definitions.
+     */
+    memcpy(child->brick_path, stub->export, strlen(stub->export) + 1);
+    /* the stub reports its birth time in network byte order */
+    child->tv.tv_sec = ntohl(stub->timebuf[0]);
+    child->tv.tv_usec = ntohl(stub->timebuf[1]);
+
+    ret = br_child_enaction(this, child, stub);
+
+free_dict:
+    dict_unref(xattr);
+wipeloc:
+    loc_wipe(&loc);
+out:
+    /* child is NULL when its own validation above failed */
+    if (ret && child)
+        br_set_child_state(child, BR_CHILD_STATE_CONNFAILED);
+    return ret;
+}
+
+/**
+ * Signer counterpart of br_cleanup_scrubber().
+ *
+ * TODO: cleanup signer. Nothing is cleaned up yet, 0 is returned
+ * unconditionally.
+ */
+static int32_t
+br_cleanup_signer(xlator_t *this, br_child_t *child)
+{
+    (void)this;
+    (void)child;
+
+    return 0;
+}
+
+/**
+ * Take @child out of scrubbing: clear its active-scrub state, remove it
+ * from the scrubber's list and clean up its scanner thread.
+ *
+ * Always returns 0; a failed thread cleanup is only logged.
+ */
+static int32_t
+br_cleanup_scrubber(xlator_t *this, br_child_t *child)
+{
+    br_private_t *priv = this->private;
+    struct br_scrubber *fsscrub = &priv->fsscrub;
+    struct br_monitor *monitor = &priv->scrub_monitor;
+
+    if (_br_is_child_scrub_active(child)) {
+        monitor->active_child_count--;
+        br_child_set_scrub_state(child, _gf_false);
+    }
+
+    /**
+     * 0x0: child (brick) goes out of rotation
+     *
+     * This is safe with respect to entries of this child that are being
+     * scrubbed right now: the scrubber thread(s) finish the entry at
+     * hand (probably failing because of the disconnection) and either
+     * requeue it or move on. Pending entries need not be drained; they
+     * stay in the queued/ready lists and get consumed after a
+     * re-connection.
+     */
+    pthread_mutex_lock(&fsscrub->mutex);
+    list_del_init(&child->list);
+    pthread_mutex_unlock(&fsscrub->mutex);
+
+    /**
+     * 0x1: cleanup scanner thread
+     *
+     * The pending timer has to be removed _after_ the filesystem
+     * scanner is cleaned up (scheduling the next scrub time is not a
+     * cancellation point).
+     */
+    if (gf_thread_cleanup_xint(child->thread))
+        gf_smsg(this->name, GF_LOG_INFO, 0, BRB_MSG_SCRUB_THREAD_CLEANUP, NULL);
+
+    gf_smsg(this->name, GF_LOG_INFO, 0, BRB_MSG_SCRUBBER_CLEANED,
+            "brick-path=%s", child->brick_path, NULL);
+
+    return 0;
+}
+
+/**
+ * Handle a child going away: if it is currently connected, mark it
+ * BR_CHILD_STATE_DISCONNECTED and run the scrubber or signer cleanup,
+ * depending on priv->iamscrubber. (NOTE: the inode table is not cleaned;
+ * it is simply reused, taking care of stale inodes.)
+ *
+ * Returns the cleanup routine's result, or 0 when the child was not
+ * connected in the first place.
+ */
+int32_t
+br_brick_disconnect(xlator_t *this, br_child_t *child)
+{
+    br_private_t *priv = this->private;
+    struct br_monitor *monitor = &priv->scrub_monitor;
+    int32_t ret = 0;
+
+    /*
+     * Lock order: wakelock first, then the child lock, to avoid
+     * deadlocks.
+     */
+    pthread_mutex_lock(&monitor->wakelock);
+    pthread_mutex_lock(&child->lock);
+
+    if (_br_is_child_connected(child)) {
+        /* child is on death row.. */
+        _br_set_child_state(child, BR_CHILD_STATE_DISCONNECTED);
+
+        ret = priv->iamscrubber ? br_cleanup_scrubber(this, child)
+                                : br_cleanup_signer(this, child);
+    }
+
+    pthread_mutex_unlock(&child->lock);
+    pthread_mutex_unlock(&monitor->wakelock);
+
+    return ret;
+}
+
+/**
+ * Event thread body. Child events queued on priv->bricks are taken off
+ * the list one at a time and their handler (childev->call) is run for the
+ * child they refer to. @arg is the bit-rot xlator.
+ *
+ * A failing handler is logged; the event is freed either way. The loop
+ * never terminates on its own.
+ */
+void *
+br_handle_events(void *arg)
+{
+    xlator_t *this = arg;
+    br_private_t *priv = this->private;
+    struct br_child_event *childev = NULL;
+    br_child_t *child = NULL;
+
+    /*
+     * bit-rot is the topmost xlator, so THIS must be set explicitly
+     * (STACK_WIND won't do it here), and that in every thread that gets
+     * spawned. A new thread would otherwise see global_xlator's pointer
+     * when it evaluates "THIS".
+     */
+    THIS = this;
+
+    for (;;) {
+        pthread_mutex_lock(&priv->lock);
+
+        while (list_empty(&priv->bricks))
+            pthread_cond_wait(&priv->cond, &priv->lock);
+
+        childev = list_first_entry(&priv->bricks, struct br_child_event, list);
+        list_del_init(&childev->list);
+
+        pthread_mutex_unlock(&priv->lock);
+
+        child = childev->child;
+        if (childev->call(this, child))
+            gf_smsg(this->name, GF_LOG_ERROR, 0, BRB_MSG_SUBVOL_CONNECT_FAILED,
+                    "name=%s", child->xl->name, NULL);
+
+        GF_FREE(childev);
+    }
+
+    return NULL;
+}
+
+/* Register this xlator's memory-accounting types (gf_br_stub_mt_end + 1). */
+int32_t
+mem_acct_init(xlator_t *this)
+{
+    int32_t status = -1;
+
+    /* Without an xlator there is nothing to account against: fail with -1. */
+    if (this != NULL) {
+        status = xlator_mem_acct_init(this, gf_br_stub_mt_end + 1);
+        /* A failure is logged as a warning and handed back unchanged. */
+        if (status != 0)
+            gf_smsg(this->name, GF_LOG_WARNING, 0, BRB_MSG_MEM_ACNT_FAILED,
+                    NULL);
+    }
+
+    return status;
+}
+
+/* Queue (child, call) on priv->bricks; on ENOMEM log and drop the event. */
+static void
+_br_qchild_event(xlator_t *this, br_child_t *child, br_child_handler *call)
+{
+    br_private_t *priv = this->private;
+    struct br_child_event *ev = GF_CALLOC(1, sizeof(*ev),
+                                          gf_br_mt_br_child_event_t);
+
+    if (ev == NULL) {
+        gf_smsg(this->name, GF_LOG_ERROR, ENOMEM, BRB_MSG_EVENT_UNHANDLED,
+                "Brick-name=%s", child->xl->name, NULL);
+        return;
+    }
+
+    ev->call = call;
+    ev->child = child;
+    ev->this = this;
+    INIT_LIST_HEAD(&ev->list);
+
+    /* Both callers in notify() hold priv->lock around this append. */
+    list_add_tail(&ev->list, &priv->bricks);
+}
+
+/* Fill *dict with the bad-object list and the counters in priv->scrub_stat.
+ * Failures are only debug-logged; the return value is that of the last
+ * dict_set call (or the initial -1 if the priv check jumps to out). */
+int
+br_scrubber_status_get(xlator_t *this, dict_t **dict)
+{
+    int ret = -1;
+    br_private_t *priv = NULL;
+    struct br_scrub_stats *scrub_stats = NULL;
+
+    priv = this->private;
+    GF_VALIDATE_OR_GOTO("bit-rot", priv, out);
+    scrub_stats = &priv->scrub_stat;
+
+    ret = br_get_bad_objects_list(this, dict);
+    if (ret) {
+        gf_msg_debug(this->name, 0,
+                     "Failed to collect corrupt "
+                     "files");
+    }
+
+    ret = dict_set_int8(*dict, "scrub-running", scrub_stats->scrub_running);
+    if (ret) {
+        gf_msg_debug(this->name, 0,
+                     "Failed setting scrub_running "
+                     "entry to the dictionary");
+    }
+
+    ret = dict_set_uint64(*dict, "scrubbed-files", scrub_stats->scrubbed_files);
+    if (ret) {
+        gf_msg_debug(this->name, 0,
+                     "Failed to setting scrubbed file "
+                     "entry to the dictionary");
+    }
+
+    ret = dict_set_uint64(*dict, "unsigned-files", scrub_stats->unsigned_files);
+    if (ret) {
+        gf_msg_debug(this->name, 0,
+                     "Failed to set unsigned file count"
+                     " entry to the dictionary");
+    }
+
+    ret = dict_set_uint64(*dict, "scrub-duration", scrub_stats->scrub_duration);
+    if (ret) {
+        gf_msg_debug(this->name, 0,
+                     "Failed to set scrub duration"
+                     " entry to the dictionary");
+    }
+
+    ret = dict_set_dynstr_with_alloc(*dict, "last-scrub-time",
+                                     scrub_stats->last_scrub_time);
+    if (ret) {
+        gf_msg_debug(this->name, 0,
+                     "Failed to set "
+                     "last scrub time value");
+    }
+out:
+    return ret;
+}
+
+/**
+ * Xlator notification entry point. CHILD_UP/CHILD_DOWN update the child's
+ * up-state under priv->lock and queue a connect/disconnect event for the
+ * br_handle_events() thread; SCRUB_STATUS fills the dict passed as the
+ * variadic argument; SCRUB_ONDEMAND runs the scrub state machine (-2 if the
+ * monitor is not PENDING). Apart from that -2 the return value is always 0.
+ */
+int
+notify(xlator_t *this, int32_t event, void *data, ...)
+{
+    int idx = -1;
+    int ret = -1;
+    xlator_t *subvol = NULL;
+    br_child_t *child = NULL;
+    br_private_t *priv = NULL;
+    dict_t *output = NULL;
+    va_list ap;
+    struct br_monitor *scrub_monitor = NULL;
+
+    subvol = (xlator_t *)data;
+    priv = this->private;
+    scrub_monitor = &priv->scrub_monitor;
+
+    gf_msg_trace(this->name, 0, "Notification received: %d", event);
+    idx = br_find_child_index(this, subvol);
+
+    switch (event) {
+        case GF_EVENT_CHILD_UP:
+            if (idx < 0) {
+                gf_smsg(this->name, GF_LOG_ERROR, 0, BRB_MSG_INVALID_SUBVOL,
+                        "event=%d", event, NULL);
+                goto out;
+            }
+
+            pthread_mutex_lock(&priv->lock);
+            {
+                child = &priv->children[idx];
+                if (child->child_up == 1)
+                    goto unblock_0;
+                priv->up_children++;
+                child->child_up = 1;
+                child->xl = subvol;
+                if (!child->table)
+                    child->table = inode_table_new(4096, subvol);
+
+                _br_qchild_event(this, child, br_brick_connect);
+                pthread_cond_signal(&priv->cond);
+            }
+        unblock_0:
+            pthread_mutex_unlock(&priv->lock);
+
+            if (priv->up_children == priv->child_count)
+                default_notify(this, event, data);
+            break;
+
+        case GF_EVENT_CHILD_DOWN:
+            if (idx < 0) {
+                gf_smsg(this->name, GF_LOG_ERROR, 0, BRB_MSG_INVALID_SUBVOL,
+                        "event=%d", event, NULL);
+                goto out;
+            }
+
+            pthread_mutex_lock(&priv->lock);
+            {
+                child = &priv->children[idx];
+                if (child->child_up == 0)
+                    goto unblock_1;
+                child->child_up = 0;
+                priv->up_children--;
+
+                _br_qchild_event(this, child, br_brick_disconnect);
+                pthread_cond_signal(&priv->cond);
+            }
+        unblock_1:
+            pthread_mutex_unlock(&priv->lock);
+
+            if (priv->up_children == 0)
+                default_notify(this, event, data);
+            break;
+
+        case GF_EVENT_SCRUB_STATUS:
+            gf_msg_debug(this->name, 0, "BitRot scrub status called");
+            va_start(ap, data);
+            output = va_arg(ap, dict_t *);
+            va_end(ap);
+
+            ret = br_scrubber_status_get(this, &output);
+            gf_msg_debug(this->name, 0, "returning %d", ret);
+            break;
+
+        case GF_EVENT_SCRUB_ONDEMAND:
+            gf_log(this->name, GF_LOG_INFO, "BitRot scrub ondemand called");
+
+            if (scrub_monitor->state != BR_SCRUB_STATE_PENDING) {
+                gf_smsg(this->name, GF_LOG_ERROR, 0,
+                        BRB_MSG_RESCHEDULE_SCRUBBER_FAILED, "Current-state=%d",
+                        scrub_monitor->state, NULL);
+                return -2;
+            }
+
+            /* Needs synchronization with reconfigure thread */
+            pthread_mutex_lock(&priv->lock);
+            {
+                ret = br_scrub_state_machine(this, _gf_true);
+            }
+            pthread_mutex_unlock(&priv->lock);
+
+            if (ret) {
+                gf_smsg(this->name, GF_LOG_ERROR, 0,
+                        BRB_MSG_COULD_NOT_SCHEDULE_SCRUB, NULL);
+            }
+            gf_msg_debug(this->name, 0, "returning %d", ret);
+            break;
+        default:
+            default_notify(this, event, data);
+    }
+
+out:
+    return 0;
+}
+
+/* Stop the signer workers and release what br_init_signer() allocated. */
+static void
+br_fini_signer(xlator_t *this, br_private_t *priv)
+{
+    int i = 0;
+
+    if (priv == NULL)
+        return;
+    for (; i < priv->signer_th_count; i++)
+        (void)gf_thread_cleanup_xint(priv->obj_queue->workers[i]);
+    GF_FREE(priv->obj_queue->workers);
+    GF_FREE(priv->obj_queue); /* the container itself used to be leaked */
+    priv->obj_queue = NULL;
+    pthread_cond_destroy(&priv->object_cond);
+}
+
+/**
+ * Initialize signer specific structures and spawn signer_th_count worker
+ * threads (br_process_object). Returns 0; on failure returns -1 after
+ * releasing the threads, queue and condvar created so far. */
+static int32_t
+br_init_signer(xlator_t *this, br_private_t *priv)
+{
+    int i = 0;
+    int32_t ret = -1;
+
+    /* initialize gfchangelog xlator context */
+    ret = gf_changelog_init(this);
+    if (ret)
+        goto out;
+
+    pthread_cond_init(&priv->object_cond, NULL);
+
+    priv->obj_queue = GF_CALLOC(1, sizeof(*priv->obj_queue),
+                                gf_br_mt_br_ob_n_wk_t);
+    if (!priv->obj_queue)
+        goto cleanup_cond;
+    INIT_LIST_HEAD(&priv->obj_queue->objects);
+
+    priv->obj_queue->workers = GF_CALLOC(
+        priv->signer_th_count, sizeof(pthread_t), gf_br_mt_br_worker_t);
+    if (!priv->obj_queue->workers)
+        goto cleanup_obj_queue;
+
+    for (i = 0; i < priv->signer_th_count; i++) {
+        ret = gf_thread_create(&priv->obj_queue->workers[i], NULL,
+                               br_process_object, this, "brpobj");
+        if (ret != 0) {
+            gf_smsg(this->name, GF_LOG_ERROR, -ret,
+                    BRB_MSG_THREAD_CREATION_FAILED, NULL);
+            ret = -1;
+            goto cleanup_threads;
+        }
+    }
+
+    return 0;
+
+cleanup_threads:
+    for (i--; i >= 0; i--) { /* only the threads already created */
+        (void)gf_thread_cleanup_xint(priv->obj_queue->workers[i]);
+    }
+    GF_FREE(priv->obj_queue->workers);
+
+cleanup_obj_queue:
+    GF_FREE(priv->obj_queue);
+
+cleanup_cond:
+    /* that's explicit */
+    pthread_cond_destroy(&priv->object_cond);
+out:
+    return -1;
+}
+
+/**
+ * For signer, only rate limit CPU usage (during hash calculation) when
+ * compiled with -DBR_RATE_LIMIT_SIGNER cflags, else let it run full
+ * throttle. Stores the tbf in priv->tbf; returns 0, or -1 if tbf_init fails.
+ */
+static int32_t
+br_rate_limit_signer(xlator_t *this, int child_count, int numbricks)
+{
+    br_private_t *priv = NULL;
+    tbf_opspec_t spec = {
+        0,
+    };
+
+    priv = this->private;
+
+    spec.op = TBF_OP_HASH;
+    spec.rate = 0;
+    spec.maxlimit = 0;
+
+    /**
+     * OK. Most implementations of TBF I've come across generate tokens
+     * every second (UML, etc..) and some chose sub-second granularity
+     * (blk-iothrottle cgroups). TBF algorithm itself does not enforce
+     * any logic for choosing generation interval and it seems pretty
+     * logical as one could jack up token count per interval w.r.t.
+     * generation rate.
+     *
+     * Value used here is chosen based on a series of test(s) performed
+     * to balance object signing time and not maxing out on all available
+     * CPU cores. It's obvious to have seconds granularity and jack up
+     * token count per interval, thereby achieving close to similar
+     * results. Let's stick to this as it seems to be working fine for
+     * the set of ops that are throttled.
+     **/
+    spec.token_gen_interval = 600000; /* In usec */
+
+#ifdef BR_RATE_LIMIT_SIGNER
+    /* NOTE(review): numbricks == 0 would divide by zero below - confirm */
+    double contribution = 0;
+    contribution = ((double)1 - ((double)child_count / (double)numbricks));
+    if (contribution == 0)
+        contribution = 1;
+    spec.rate = BR_HASH_CALC_READ_SIZE * contribution;
+    spec.maxlimit = priv->signer_th_count * BR_HASH_CALC_READ_SIZE;
+
+#endif
+
+    if (!spec.rate)
+        gf_smsg(this->name, GF_LOG_INFO, 0, BRB_MSG_RATE_LIMIT_INFO,
+                "FULL THROTTLE", NULL);
+    else
+        gf_smsg(this->name, GF_LOG_INFO, 0, BRB_MSG_RATE_LIMIT_INFO,
+                "tokens/sec-rate=%lu", spec.rate, "maxlimit=%lu", spec.maxlimit,
+                NULL);
+
+    priv->tbf = tbf_init(&spec, 1);
+    return priv->tbf ? 0 : -1;
+}
+
+/* Read signer tunables: from @options on reconfigure, else the init values.
+ * NOTE(review): a reconfigured signer_th_count is not applied to workers[]. */
+static int32_t
+br_signer_handle_options(xlator_t *this, br_private_t *priv, dict_t *options)
+{
+    if (options == NULL) {
+        GF_OPTION_INIT("expiry-time", priv->expiry_time, uint32, err);
+        GF_OPTION_INIT("signer-threads", priv->signer_th_count, uint32, err);
+        return 0;
+    }
+
+    GF_OPTION_RECONF("expiry-time", priv->expiry_time, options, uint32, err);
+    GF_OPTION_RECONF("signer-threads", priv->signer_th_count, options, uint32,
+                     err);
+    return 0;
+
+err:
+    return -1;
+}
+
+/* Signer start-up: read options, set up the token bucket, spawn workers. */
+static int32_t
+br_signer_init(xlator_t *this, br_private_t *priv)
+{
+    int32_t ret = 0;
+    int numbricks = 0;
+
+    GF_OPTION_INIT("expiry-time", priv->expiry_time, uint32, error_return);
+    GF_OPTION_INIT("brick-count", numbricks, int32, error_return);
+    GF_OPTION_INIT("signer-threads", priv->signer_th_count, uint32,
+                   error_return);
+
+    ret = br_rate_limit_signer(this, priv->child_count, numbricks);
+    if (ret)
+        goto error_return;
+
+    ret = br_init_signer(this, priv);
+    if (ret)
+        goto cleanup_tbf;
+    return 0;
+
+cleanup_tbf:
+    /* NOTE(review): priv->tbf is not actually released on this path */
+error_return:
+    return -1;
+}
+
+/* Scrubber teardown: remove and free the monitor timer, stop the monitor
+ * thread, then destroy the monitor's mutex/cond pairs and its lock. */
+static void
+br_free_scrubber_monitor(xlator_t *this, br_private_t *priv)
+{
+    struct br_monitor *scrub_monitor = &priv->scrub_monitor;
+
+    if (scrub_monitor->timer) {
+        (void)gf_tw_del_timer(priv->timer_wheel, scrub_monitor->timer);
+        GF_FREE(scrub_monitor->timer);
+        scrub_monitor->timer = NULL;
+    }
+
+    (void)gf_thread_cleanup_xint(scrub_monitor->thread);
+
+    /* Clean up cond and mutex variables */
+    pthread_mutex_destroy(&scrub_monitor->mutex);
+    pthread_cond_destroy(&scrub_monitor->cond);
+
+    pthread_mutex_destroy(&scrub_monitor->wakelock);
+    pthread_cond_destroy(&scrub_monitor->wakecond);
+
+    pthread_mutex_destroy(&scrub_monitor->donelock);
+    pthread_cond_destroy(&scrub_monitor->donecond);
+    LOCK_DESTROY(&scrub_monitor->lock);
+}
+
+/* Release per-child resources for children[0..count-1], then the array. */
+static void
+br_free_children(xlator_t *this, br_private_t *priv, int count)
+{
+    int idx = count;
+
+    while (--idx >= 0) {
+        br_child_t *victim = &priv->children[idx];
+        mem_pool_destroy(victim->timer_pool);
+        pthread_mutex_destroy(&victim->lock);
+    }
+    GF_FREE(priv->children);
+    priv->children = NULL;
+}
+
+/* Allocate priv->children (one slot per subvolume) and initialise each
+ * slot's lock, state and timer mem-pool. Returns 0, or -1 after undoing
+ * what was set up. */
+static int
+br_init_children(xlator_t *this, br_private_t *priv)
+{
+    int i = 0;
+    br_child_t *child = NULL;
+    xlator_list_t *trav = NULL;
+
+    priv->child_count = xlator_subvolume_count(this);
+    priv->children = GF_CALLOC(priv->child_count, sizeof(*priv->children),
+                               gf_br_mt_br_child_t);
+    if (!priv->children)
+        goto err;
+
+    trav = this->children;
+    while (trav) {
+        child = &priv->children[i];
+        pthread_mutex_init(&child->lock, NULL);
+        child->witnessed = 0;
+        br_set_child_state(child, BR_CHILD_STATE_DISCONNECTED);
+        child->this = this;
+        child->xl = trav->xlator;
+
+        child->timer_pool = mem_pool_new(struct gf_tw_timer_list, 4096);
+        if (!child->timer_pool) {
+            gf_smsg(this->name, GF_LOG_ERROR, ENOMEM, BRB_MSG_MEM_POOL_ALLOC,
+                    NULL);
+            errno = ENOMEM;
+            /* slot i is not covered by br_free_children(.., i) below */
+            pthread_mutex_destroy(&child->lock);
+            goto freechild;
+        }
+        INIT_LIST_HEAD(&child->list);
+        i++;
+        trav = trav->next;
+    }
+
+    return 0;
+
+freechild:
+    br_free_children(this, priv, i);
+err:
+    return -1;
+}
+
+/* Xlator init: allocate br_private_t, set up children, event queue and the
+ * timer wheel, start the signer or scrubber side (per the "scrubber" option)
+ * and finally the br_handle_events() thread. Returns 0 on success, else -1
+ * with this->private reset to NULL. */
+int32_t
+init(xlator_t *this)
+{
+    int32_t ret = -1;
+    br_private_t *priv = NULL;
+
+    if (!this->children) {
+        gf_smsg(this->name, GF_LOG_ERROR, 0, BRB_MSG_NO_CHILD, NULL);
+        goto out;
+    }
+
+    priv = GF_CALLOC(1, sizeof(*priv), gf_br_mt_br_private_t);
+    if (!priv) {
+        gf_smsg(this->name, GF_LOG_ERROR, ENOMEM, BRB_MSG_NO_MEMORY, NULL);
+        goto out;
+    }
+
+    GF_OPTION_INIT("scrubber", priv->iamscrubber, bool, free_priv);
+    ret = br_init_children(this, priv);
+    if (ret)
+        goto free_priv;
+
+    pthread_mutex_init(&priv->lock, NULL);
+    pthread_cond_init(&priv->cond, NULL);
+    INIT_LIST_HEAD(&priv->bricks);
+    INIT_LIST_HEAD(&priv->signing);
+
+    priv->timer_wheel = glusterfs_ctx_tw_get(this->ctx);
+    if (!priv->timer_wheel) {
+        gf_smsg(this->name, GF_LOG_ERROR, 0, BRB_MSG_TIMER_WHEEL_UNAVAILABLE,
+                NULL);
+        goto cleanup;
+    }
+
+    this->private = priv;
+    if (!priv->iamscrubber) {
+        ret = br_signer_init(this, priv);
+        if (!ret)
+            ret = br_signer_handle_options(this, priv, NULL);
+    } else {
+        ret = br_scrubber_init(this, priv);
+        if (!ret)
+            ret = br_scrubber_handle_options(this, priv, NULL);
+    }
+
+    if (ret)
+        goto cleanup;
+
+    ret = gf_thread_create(&priv->thread, NULL, br_handle_events, this,
+                           "brhevent");
+    if (ret != 0) {
+        gf_smsg(this->name, GF_LOG_ERROR, -ret, BRB_MSG_THREAD_CREATION_FAILED,
+                NULL);
+        ret = -1;
+    }
+    if (!ret) {
+        gf_smsg(this->name, GF_LOG_INFO, 0, BRB_MSG_BITROT_LOADED, "mode=%s",
+                (priv->iamscrubber) ? "SCRUBBER" : "SIGNER", NULL);
+        return 0;
+    }
+
+cleanup:
+    (void)pthread_cond_destroy(&priv->cond);
+    (void)pthread_mutex_destroy(&priv->lock);
+    br_free_children(this, priv, priv->child_count);
+    if (priv->timer_wheel)
+        glusterfs_ctx_tw_put(this->ctx); /* fini() skips it: private is NULL */
+free_priv:
+    GF_FREE(priv);
+out:
+    this->private = NULL;
+    return -1;
+}
+
+/* Undo init(): stop the signer or scrubber side, free children and priv. */
+void
+fini(xlator_t *this)
+{
+    br_private_t *priv = this->private;
+
+    if (priv == NULL)
+        return;
+
+    if (priv->iamscrubber)
+        (void)br_free_scrubber_monitor(this, priv);
+    else
+        br_fini_signer(this, priv);
+
+    br_free_children(this, priv, priv->child_count);
+    /* Detach priv from the xlator before the memory goes away. */
+    this->private = NULL;
+    GF_FREE(priv);
+
+    /* Pairs with the glusterfs_ctx_tw_get() done in init(). */
+    glusterfs_ctx_tw_put(this->ctx);
+}
+
+/* Re-evaluate the scrub state machine (second argument false); a non-zero
+ * result is only logged. The caller in this file holds priv->lock. */
+static void
+br_reconfigure_monitor(xlator_t *this)
+{
+    if (br_scrub_state_machine(this, _gf_false) == 0)
+        return;
+
+    gf_smsg(this->name, GF_LOG_ERROR, 0, BRB_MSG_COULD_NOT_SCHEDULE_SCRUB,
+            NULL);
+}
+
+/*
+ * Apply new scrubber options, then re-drive the scrub state machine. Each
+ * step runs in its own priv->lock critical section; the second is skipped
+ * when option handling fails. Returns the option-handling result.
+ */
+static int
+br_reconfigure_scrubber(xlator_t *this, dict_t *options)
+{
+    br_private_t *priv = this->private;
+    int32_t status = -1;
+
+    pthread_mutex_lock(&priv->lock);
+    {
+        status = br_scrubber_handle_options(this, priv, options);
+    }
+    pthread_mutex_unlock(&priv->lock);
+
+    if (status != 0)
+        return status;
+
+    /* change state for all _up_ subvolume(s) */
+    pthread_mutex_lock(&priv->lock);
+    br_reconfigure_monitor(this);
+    pthread_mutex_unlock(&priv->lock);
+
+    return status;
+}
+
+/* Reconfigure the signer: re-read its options from @options. */
+static int
+br_reconfigure_signer(xlator_t *this, dict_t *options)
+{
+    /* this->private is the br_private_t installed by init() */
+    return br_signer_handle_options(this, this->private, options);
+}
+
+/*
+ * Xlator reconfigure hook. The mode (priv->iamscrubber) decides whether the
+ * scrubber or the signer variant handles @options; its result is returned.
+ */
+int
+reconfigure(xlator_t *this, dict_t *options)
+{
+    br_private_t *priv = this->private;
+
+    if (!priv->iamscrubber)
+        return br_reconfigure_signer(this, options);
+
+    /* scrubber mode */
+    return br_reconfigure_scrubber(this, options);
+}
+
+struct xlator_fops fops; /* no initialiser here: left zero-filled */
+struct xlator_cbks cbks; /* no initialiser here: left zero-filled */
+
+/* Options accepted by this xlator; the {.key = {NULL}} entry ends the table. */
+struct volume_options options[] = {
+    {
+        .key = {"expiry-time"},
+        .type = GF_OPTION_TYPE_INT,
+        .default_value = SIGNING_TIMEOUT,
+        .op_version = {GD_OP_VERSION_3_7_0},
+        .flags = OPT_FLAG_SETTABLE,
+        .description = "Waiting time for an object on which it waits "
+                       "before it is signed",
+    },
+    {
+        .key = {"brick-count"},
+        .type = GF_OPTION_TYPE_STR, /* NOTE(review): parsed as int32 at init */
+        .description = "Total number of bricks for the current node for "
+                       "all volumes in the trusted storage pool.",
+    },
+    {
+        .key = {"scrubber", "scrub"},
+        .type = GF_OPTION_TYPE_BOOL,
+        .default_value = "false",
+        .op_version = {GD_OP_VERSION_3_7_0},
+        .flags = OPT_FLAG_SETTABLE | OPT_FLAG_FORCE,
+        .description = "option to run as a scrubber",
+    },
+    {
+        .key = {"scrub-throttle"},
+        .type = GF_OPTION_TYPE_STR,
+        .default_value = "lazy",
+        .op_version = {GD_OP_VERSION_3_7_0},
+        .flags = OPT_FLAG_SETTABLE,
+        .description = "Scrub-throttle value is a measure of how fast "
+                       "or slow the scrubber scrubs the filesystem for "
+                       "volume <VOLNAME>",
+    },
+    {
+        .key = {"scrub-freq"},
+        .type = GF_OPTION_TYPE_STR,
+        .default_value = "biweekly",
+        .op_version = {GD_OP_VERSION_3_7_0},
+        .flags = OPT_FLAG_SETTABLE,
+        .description = "Scrub frequency for volume <VOLNAME>",
+    },
+    {
+        .key = {"scrub-state"},
+        .type = GF_OPTION_TYPE_STR,
+        .default_value = "active",
+        .op_version = {GD_OP_VERSION_4_0_0},
+        .flags = OPT_FLAG_SETTABLE,
+        .description = "Pause/Resume scrub. Upon resume, scrubber "
+                       "continues from where it left off.",
+    },
+    {
+        .key = {"signer-threads"},
+        .type = GF_OPTION_TYPE_INT,
+        .default_value = BR_WORKERS,
+        .op_version = {GD_OP_VERSION_8_0},
+        .flags = OPT_FLAG_SETTABLE,
+        .description = "Number of signing process threads. As a best "
+                       "practice, set this to the number of processor cores",
+    },
+    {.key = {NULL}},
+};
+
+xlator_api_t xlator_api = { /* hooks and tables defined in this file */
+    .init = init,
+    .fini = fini,
+    .notify = notify,
+    .reconfigure = reconfigure,
+    .mem_acct_init = mem_acct_init,
+    .op_version = {1}, /* Present from the initial version */
+    .fops = &fops,
+    .cbks = &cbks,
+    .options = options,
+    .identifier = "bit-rot",
+    .category = GF_MAINTAINED,
+};
diff --git a/xlators/features/bit-rot/src/bitd/bit-rot.h b/xlators/features/bit-rot/src/bitd/bit-rot.h
new file mode 100644
index 00000000000..8ac7dcdac3d
--- /dev/null
+++ b/xlators/features/bit-rot/src/bitd/bit-rot.h
@@ -0,0 +1,302 @@
+/*
+   Copyright (c) 2015 Red Hat, Inc. <http://www.redhat.com>
+   This file is part of GlusterFS.
+
+   This file is licensed to you under your choice of the GNU Lesser
+   General Public License, version 3 or any later version (LGPLv3 or
+   later), or the GNU General Public License, version 2 (GPLv2), in all
+   cases as published by the Free Software Foundation.
+*/
+
+#ifndef __BIT_ROT_H__
+#define __BIT_ROT_H__
+
+#include <glusterfs/glusterfs.h>
+#include <glusterfs/logging.h>
+#include <glusterfs/dict.h>
+#include <glusterfs/xlator.h>
+#include <glusterfs/defaults.h>
+#include <glusterfs/syncop.h>
+#include <glusterfs/syncop-utils.h>
+#include "changelog.h"
+#include "timer-wheel.h"
+
+#include <glusterfs/throttle-tbf.h>
+#include "bit-rot-ssm.h"
+
+#include "bit-rot-common.h"
+#include "bit-rot-stub-mem-types.h"
+#include "bit-rot-scrub-status.h"
+
+#include <openssl/sha.h>
+
+typedef enum scrub_throttle { /* presumably "scrub-throttle" levels - confirm */
+    BR_SCRUB_THROTTLE_VOID = -1,
+    BR_SCRUB_THROTTLE_LAZY = 0,
+    BR_SCRUB_THROTTLE_NORMAL = 1,
+    BR_SCRUB_THROTTLE_AGGRESSIVE = 2,
+    BR_SCRUB_THROTTLE_STALLED = 3,
+} scrub_throttle_t;
+
+typedef enum scrub_freq { /* stored in br_scrubber.frequency */
+    BR_FSSCRUB_FREQ_HOURLY = 1,
+    BR_FSSCRUB_FREQ_DAILY,
+    BR_FSSCRUB_FREQ_WEEKLY,
+    BR_FSSCRUB_FREQ_BIWEEKLY,
+    BR_FSSCRUB_FREQ_MONTHLY,
+    BR_FSSCRUB_FREQ_MINUTE,
+    BR_FSSCRUB_FREQ_STALLED, /* maps to BR_SCRUB_EVENT_PAUSE, see below */
+} scrub_freq_t;
+/* Bytes for one br_isignature_t followed by (hl) more bytes, plus one. */
+#define signature_size(hl) (sizeof(br_isignature_t) + (hl) + 1)
+
+struct br_scanfs { /* per-child scanner state (br_child.fsscan) */
+    gf_lock_t entrylock;
+
+    pthread_mutex_t waitlock;
+    pthread_cond_t waitcond;
+
+    unsigned int entries;
+    struct list_head queued;
+    struct list_head ready;
+};
+
+/* connection states tracked per child (stored in br_child.c_state) */
+typedef enum br_child_state {
+    BR_CHILD_STATE_CONNECTED = 1,
+    BR_CHILD_STATE_INITIALIZING,
+    BR_CHILD_STATE_CONNFAILED,
+    BR_CHILD_STATE_DISCONNECTED,
+} br_child_state_t;
+
+struct br_child {
+    pthread_mutex_t lock;     /* protects child state (c_state) */
+    char witnessed;           /* witnessed at least one successful
+                                 connection */
+    br_child_state_t c_state; /* current state of this child */
+
+    char child_up;             /* 1 while this child is up, else 0;
+                                  set by notify() under priv->lock */
+    xlator_t *xl;              /* client xlator corresponding to
+                                  this child */
+    inode_table_t *table;      /* inode table for this child */
+    char brick_path[PATH_MAX]; /* brick export directory of this
+                                  child */
+    struct list_head list;     /* hook to attach to the list of
+                                  UP children */
+    xlator_t *this;            /* Bit rot xlator */
+
+    pthread_t thread;  /* initial crawler for unsigned
+                          object(s) or scrub crawler */
+    int threadrunning; /* active thread */
+
+    struct mem_pool *timer_pool; /* timer-wheel's timer mem-pool */
+
+    struct timeval tv;
+
+    struct br_scanfs fsscan; /* per subvolume FS scanner */
+
+    gf_boolean_t active_scrubbing; /* Actively scrubbing or not */
+};
+
+typedef struct br_child br_child_t;
+
+struct br_obj_n_workers {
+    struct list_head objects; /* queue of objects expired from the
+                                 timer wheel and ready to be picked
+                                 up for signing */
+    pthread_t *workers;       /* signer_th_count threads which pick up
+                                 objects from the above queue and
+                                 sign each object */
+};
+
+struct br_scrubber { /* embedded in br_private_t as fsscrub */
+    xlator_t *this;
+
+    scrub_throttle_t throttle;
+
+    /**
+     * frequency of scanning for this subvolume. this should
+     * normally be per-child, but since all children follow the
+     * same frequency for a volume, this option ends up here
+     * instead of br_child_t.
+     */
+    scrub_freq_t frequency;
+
+    gf_boolean_t frequency_reconf;
+    gf_boolean_t throttle_reconf;
+
+    pthread_mutex_t mutex;
+    pthread_cond_t cond;
+
+    unsigned int nr_scrubbers;
+    struct list_head scrubbers;
+
+    /**
+     * list of "rotatable" subvolume(s) undergoing scrubbing
+     */
+    struct list_head scrublist;
+};
+
+struct br_monitor { /* embedded in br_private_t as scrub_monitor */
+    gf_lock_t lock;
+    pthread_t thread; /* Monitor thread */
+
+    gf_boolean_t inited;
+    pthread_mutex_t mutex;
+    pthread_cond_t cond; /* Thread starts and will be waiting on cond.
+                            First child which is up wakes this up */
+
+    xlator_t *this;
+    /* scheduler */
+    uint32_t boot;
+
+    int32_t active_child_count; /* Number of children currently scrubbing */
+    gf_boolean_t kick;          /* Tracks whether the scrubber has been
+                                 * kicked or not. Both 'kick' and
+                                 * 'active_child_count' use the same
+                                 * mutex-cond pair, i.e. wakelock and
+                                 * wakecond. */
+
+    pthread_mutex_t wakelock; /* lock order: wakelock, then child->lock */
+    pthread_cond_t wakecond;
+
+    gf_boolean_t done;
+    pthread_mutex_t donelock;
+    pthread_cond_t donecond;
+
+    struct gf_tw_timer_list *timer; /* freed in br_free_scrubber_monitor() */
+    br_scrub_state_t state; /* current scrub state */
+};
+
+typedef struct br_obj_n_workers br_obj_n_workers_t;
+
+typedef struct br_private br_private_t;
+
+typedef void (*br_scrubbed_file_update)(br_private_t *priv);
+
+struct br_private {
+    pthread_mutex_t lock; /* held while the bricks list is changed */
+
+    struct list_head bricks; /* list of bricks from which events
+                                have been received */
+
+    struct list_head signing;
+
+    pthread_cond_t object_cond; /* handling signing of objects */
+    int child_count;
+    br_child_t *children; /* list of subvolumes */
+    int up_children;
+
+    pthread_cond_t cond; /* handling CHILD_UP notifications */
+    pthread_t thread;    /* thread for connecting each UP
+                            child with changelog */
+
+    struct tvec_base *timer_wheel; /* timer wheel where the objects which
+                                      changelog has sent sits and waits
+                                      for expiry */
+    br_obj_n_workers_t *obj_queue; /* place holder for all the objects
+                                      that are expired from timer wheel
+                                      and ready to be picked up for
+                                      signing and the workers which sign
+                                      the objects */
+
+    uint32_t expiry_time; /* objects "wait" time */
+
+    uint32_t signer_th_count; /* Number of signing process threads */
+
+    tbf_t *tbf; /* token bucket filter */
+
+    gf_boolean_t iamscrubber; /* function as a fs scrubber */
+
+    struct br_scrub_stats scrub_stat; /* statistics of scrub*/
+
+    struct br_scrubber fsscrub; /* scrubbers for this subvolume */
+
+    struct br_monitor scrub_monitor; /* scrubber monitor */
+};
+
+struct br_object { /* one object queued for signing */
+    xlator_t *this;
+
+    uuid_t gfid;
+
+    unsigned long signedversion; /* version against which this object will
+                                    be signed */
+    br_child_t *child;           /* object's subvolume */
+
+    int sign_info; /* presumably a br_sign_state_t value - TODO confirm */
+
+    struct list_head list; /* hook to add to the queue once the
+                              object is expired from timer wheel */
+    void *data;
+};
+
+typedef struct br_object br_object_t;
+typedef int32_t(br_scrub_ssm_call)(xlator_t *);
+
+void
+br_log_object(xlator_t *, char *, uuid_t, int32_t);
+
+void
+br_log_object_path(xlator_t *, char *, const char *, int32_t);
+
+int32_t
+br_calculate_obj_checksum(unsigned char *, br_child_t *, fd_t *, struct iatt *);
+
+int32_t
+br_prepare_loc(xlator_t *, br_child_t *, loc_t *, gf_dirent_t *, loc_t *);
+
+gf_boolean_t
+bitd_is_bad_file(xlator_t *, br_child_t *, loc_t *, fd_t *);
+
+static inline void
+_br_set_child_state(br_child_t *child, br_child_state_t state)
+{
+    child->c_state = state; /* plain store; takes no lock itself */
+}
+
+static inline int
+_br_is_child_connected(br_child_t *child)
+{
+    return BR_CHILD_STATE_CONNECTED == child->c_state; /* 1 or 0 */
+}
+
+static inline int
+_br_is_child_scrub_active(br_child_t *child)
+{
+    return child->active_scrubbing; /* gf_boolean_t returned as int */
+}
+
+static inline int
+_br_child_failed_conn(br_child_t *child)
+{
+    return BR_CHILD_STATE_CONNFAILED == child->c_state; /* 1 or 0 */
+}
+
+static inline int
+_br_child_witnessed_connection(br_child_t *child)
+{
+    return 1 == child->witnessed; /* true only for the exact value 1 */
+}
+
+/* scrub state: record @state as the monitor's current scrub state */
+static inline void
+_br_monitor_set_scrub_state(struct br_monitor *scrub_monitor,
+                            br_scrub_state_t state)
+{
+    scrub_monitor->state = state;
+}
+
+static inline br_scrub_event_t
+_br_child_get_scrub_event(struct br_scrubber *fsscrub)
+{
+    if (fsscrub->frequency != BR_FSSCRUB_FREQ_STALLED)
+        return BR_SCRUB_EVENT_SCHEDULE; /* any non-stalled frequency */
+    return BR_SCRUB_EVENT_PAUSE;        /* stalled frequency => pause */
+}
+
+int32_t
+br_get_bad_objects_list(xlator_t *this, dict_t **dict);
+
+#endif /* __BIT_ROT_H__ */
diff --git a/xlators/features/bit-rot/src/stub/Makefile.am b/xlators/features/bit-rot/src/stub/Makefile.am
new file mode 100644
index 00000000000..f13de7145fc
--- /dev/null
+++ b/xlators/features/bit-rot/src/stub/Makefile.am
@@ -0,0 +1,20 @@
+if WITH_SERVER
+xlator_LTLIBRARIES = bitrot-stub.la
+endif
+xlatordir = $(libdir)/glusterfs/$(PACKAGE_VERSION)/xlator/features
+# bitrot-stub.la is only listed for building when WITH_SERVER is set (above).
+bitrot_stub_la_LDFLAGS = -module $(GF_XLATOR_DEFAULT_LDFLAGS)
+
+bitrot_stub_la_SOURCES = bit-rot-stub-helpers.c bit-rot-stub.c
+bitrot_stub_la_LIBADD = $(top_builddir)/libglusterfs/src/libglusterfs.la
+
+noinst_HEADERS = bit-rot-stub.h bit-rot-common.h bit-rot-stub-mem-types.h \
+	bit-rot-object-version.h bit-rot-stub-messages.h
+
+AM_CPPFLAGS = $(GF_CPPFLAGS) -I$(top_srcdir)/libglusterfs/src \
+	-I$(top_srcdir)/rpc/xdr/src -I$(top_builddir)/rpc/xdr/src \
+	-I$(top_srcdir)/rpc/rpc-lib/src
+
+AM_CFLAGS = -Wall $(GF_CFLAGS)
+
+CLEANFILES =
diff --git a/xlators/features/bit-rot/src/stub/bit-rot-common.h b/xlators/features/bit-rot/src/stub/bit-rot-common.h
new file mode 100644
index 00000000000..20561aa7764
--- /dev/null
+++ b/xlators/features/bit-rot/src/stub/bit-rot-common.h
@@ -0,0 +1,178 @@
+/*
+   Copyright (c) 2015 Red Hat, Inc. <http://www.redhat.com>
+   This file is part of GlusterFS.
+
+   This file is licensed to you under your choice of the GNU Lesser
+   General Public License, version 3 or any later version (LGPLv3 or
+   later), or the GNU General Public License, version 2 (GPLv2), in all
+   cases as published by the Free Software Foundation.
+*/
+
+#ifndef __BIT_ROT_COMMON_H__
+#define __BIT_ROT_COMMON_H__
+
+#include <glusterfs/glusterfs.h>
+#include "bit-rot-object-version.h"
+
+#define BR_VXATTR_VERSION (1 << 0)
+#define BR_VXATTR_SIGNATURE (1 << 1)
+
+#define BR_VXATTR_SIGN_MISSING (BR_VXATTR_SIGNATURE)
+#define BR_VXATTR_ALL_MISSING (BR_VXATTR_VERSION | BR_VXATTR_SIGNATURE)
+
+#define BR_BAD_OBJ_CONTAINER                                                   \
+    (uuid_t) { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 8 }
+
+/*
+ * Outcome of probing an object's versioning xattrs, as computed by
+ * br_version_xattr_state() from which of the version / signature keys
+ * could be fetched.
+ */
+typedef enum br_vxattr_state {
+    BR_VXATTR_STATUS_FULL = 0,     /* version and signature both present */
+    BR_VXATTR_STATUS_MISSING = 1,  /* version and signature both missing */
+    BR_VXATTR_STATUS_UNSIGNED = 2, /* version present, signature missing */
+    BR_VXATTR_STATUS_INVALID = 3,  /* any other combination              */
+} br_vxattr_status_t;
+
+/*
+ * Signing state of an object.
+ * NOTE(review): the meaning of each value is not visible in this header;
+ * the per-value semantics should be confirmed where the states are
+ * consumed.
+ */
+typedef enum br_sign_state {
+    BR_SIGN_INVALID = -1,
+    BR_SIGN_NORMAL = 0,
+    BR_SIGN_REOPEN_WAIT = 1,
+    BR_SIGN_QUICK = 2,
+} br_sign_state_t;
+
+/**
+ * Inspect @xattr for the bit-rot versioning keys and classify the result.
+ *
+ * @xattr:  dictionary holding the xattrs fetched for the object
+ * @obuf:   out; receives the value stored under BITROT_CURRENT_VERSION_KEY
+ * @sbuf:   out; receives the value stored under BITROT_SIGNING_VERSION_KEY
+ * @objbad: out; _gf_true when BITROT_OBJECT_BAD_KEY is present
+ *
+ * Returns FULL when both keys were found, UNSIGNED when only the signature
+ * is missing, MISSING when both are missing and INVALID otherwise.
+ */
+static inline br_vxattr_status_t
+br_version_xattr_state(dict_t *xattr, br_version_t **obuf,
+                       br_signature_t **sbuf, gf_boolean_t *objbad)
+{
+    void *bad_marker = NULL;
+    int32_t missing = 0;
+
+    /**
+     * The key being present in the dict indicates the xattr was set on
+     * disk. The presence of the xattr itself is, as of now, sufficient
+     * to say that the object is bad.
+     */
+    *objbad = _gf_false;
+    if (dict_get_bin(xattr, BITROT_OBJECT_BAD_KEY, (void **)&bad_marker) == 0)
+        *objbad = _gf_true;
+
+    /* collect one bit per key that could not be fetched */
+    if (dict_get_bin(xattr, BITROT_CURRENT_VERSION_KEY, (void **)obuf) != 0)
+        missing |= BR_VXATTR_VERSION;
+
+    if (dict_get_bin(xattr, BITROT_SIGNING_VERSION_KEY, (void **)sbuf) != 0)
+        missing |= BR_VXATTR_SIGNATURE;
+
+    if (missing == 0)
+        return BR_VXATTR_STATUS_FULL;
+
+    if (missing == BR_VXATTR_SIGN_MISSING)
+        return BR_VXATTR_STATUS_UNSIGNED;
+
+    if (missing == BR_VXATTR_ALL_MISSING)
+        return BR_VXATTR_STATUS_MISSING;
+
+    /* only the version key is missing */
+    return BR_VXATTR_STATUS_INVALID;
+}
+
+/**
+ * in-memory representation of signature used by signer for object
+ * signing.
+ *
+ * The signature bytes are stored inline, directly after the fixed fields,
+ * through the zero-length trailing array. br_set_signature() copies the
+ * type, version and signature bytes from this structure into the on-disk
+ * br_signature_t.
+ */
+typedef struct br_isignature_in {
+    int8_t signaturetype; /* signature type            */
+
+    unsigned long signedversion; /* version against which the
+                                    object was signed         */
+
+    size_t signaturelen; /* signature length          */
+    char signature[0];   /* object signature          */
+} br_isignature_t;
+
+/**
+ * in-memory representation of signature used by scrubber for object
+ * verification.
+ *
+ * As with br_isignature_t, the signature bytes trail the fixed fields
+ * through the zero-length array at the end of the structure.
+ */
+typedef struct br_isignature_out {
+    char stale; /* stale signature?          */
+
+    unsigned long version; /* current signed version    */
+
+    uint32_t time[2]; /* time when the object
+                         got dirtied               */
+
+    int8_t signaturetype; /* hash type                 */
+    size_t signaturelen;  /* signature length          */
+    char signature[0];    /* signature (hash)          */
+} br_isignature_out_t;
+
+/*
+ * NOTE(review): presumably the payload served for the
+ * GLUSTERFS_GET_BR_STUB_INIT_TIME virtual xattr (a two-word timestamp plus
+ * the brick export path) -- confirm where the structure is filled in.
+ */
+typedef struct br_stub_init {
+    uint32_t timebuf[2];   /* two 32-bit time words */
+    char export[PATH_MAX]; /* path string, at most PATH_MAX bytes */
+} br_stub_init_t;
+
+/*
+ * Signature (hash) types. ZERO and MAX are exclusive boundaries:
+ * br_is_signature_type_valid() accepts only values strictly between them.
+ * VOID is what br_set_default_signature() stores for an unsigned object.
+ */
+typedef enum {
+    BR_SIGNATURE_TYPE_VOID = -1,  /* object is not signed       */
+    BR_SIGNATURE_TYPE_ZERO = 0,   /* min boundary               */
+    BR_SIGNATURE_TYPE_SHA256 = 1, /* signed with SHA256         */
+    BR_SIGNATURE_TYPE_MAX = 2,    /* max boundary               */
+} br_signature_type;
+
+/* BitRot stub start time (virtual xattr) */
+#define GLUSTERFS_GET_BR_STUB_INIT_TIME "trusted.glusterfs.bit-rot.stub-init"
+
+/* signing/reopen hint */
+#define BR_OBJECT_RESIGN 0
+#define BR_OBJECT_REOPEN 1
+#define BR_REOPEN_SIGN_HINT_KEY "trusted.glusterfs.bit-rot.reopen-hint"
+
+/*
+ * Return 1 when @signaturetype lies strictly between the ZERO and MAX
+ * boundary values, 0 otherwise.
+ */
+static inline int
+br_is_signature_type_valid(int8_t signaturetype)
+{
+    if (signaturetype <= BR_SIGNATURE_TYPE_ZERO)
+        return 0;
+
+    return (signaturetype < BR_SIGNATURE_TYPE_MAX);
+}
+
+/*
+ * Initialise @buf with the default ongoing version and the two time words
+ * taken from @tv (which must point to at least two uint32_t values).
+ */
+static inline void
+br_set_default_ongoingversion(br_version_t *buf, uint32_t *tv)
+{
+    int i = 0;
+
+    buf->ongoingversion = BITROT_DEFAULT_CURRENT_VERSION;
+
+    for (i = 0; i < 2; i++)
+        buf->timebuf[i] = tv[i];
+}
+
+/*
+ * Fill @buf with the "not signed" defaults and report through @size the
+ * number of bytes that make up the record: just the fixed header, since
+ * no signature bytes follow it.
+ */
+static inline void
+br_set_default_signature(br_signature_t *buf, size_t *size)
+{
+    /* header only -- no signature payload */
+    *size = sizeof(br_signature_t);
+
+    buf->signedversion = BITROT_DEFAULT_SIGNING_VERSION;
+    buf->signaturetype = (int8_t)BR_SIGNATURE_TYPE_VOID;
+}
+
+/*
+ * Store @version as the ongoing version in @buf together with the two
+ * time words from @tv (which must point to at least two uint32_t values).
+ */
+static inline void
+br_set_ongoingversion(br_version_t *buf, unsigned long version, uint32_t *tv)
+{
+    int i = 0;
+
+    buf->ongoingversion = version;
+
+    for (i = 0; i < 2; i++)
+        buf->timebuf[i] = tv[i];
+}
+
+/*
+ * Build the on-disk signature record @buf from the in-memory signature
+ * @sign.
+ *
+ * @signaturelen: number of signature bytes to copy after the header
+ * @size:         out; total record size (header + @signaturelen)
+ *
+ * The caller must have allocated @buf with room for @signaturelen bytes
+ * past the fixed header.
+ *
+ * NOTE(review): ntohl() operates on 32-bit quantities whereas
+ * signedversion is an unsigned long -- confirm that the truncation on
+ * LP64 platforms is intended.
+ */
+static inline void
+br_set_signature(br_signature_t *buf, br_isignature_t *sign,
+                 size_t signaturelen, size_t *size)
+{
+    size_t total = sizeof(br_signature_t) + signaturelen;
+
+    /* fixed header first ... */
+    buf->signaturetype = sign->signaturetype;
+    buf->signedversion = ntohl(sign->signedversion);
+
+    /* ... then the variable-length payload */
+    memcpy(buf->signature, sign->signature, signaturelen);
+
+    *size = total;
+}
+
+#endif /* __BIT_ROT_COMMON_H__ */
diff --git a/xlators/features/bit-rot/src/stub/bit-rot-object-version.h b/xlators/features/bit-rot/src/stub/bit-rot-object-version.h
new file mode 100644
index 00000000000..7ae6a5200df
--- /dev/null
+++ b/xlators/features/bit-rot/src/stub/bit-rot-object-version.h
@@ -0,0 +1,30 @@
+/*
+   Copyright (c) 2015 Red Hat, Inc. <http://www.redhat.com>
+   This file is part of GlusterFS.
+
+   This file is licensed to you under your choice of the GNU Lesser
+   General Public License, version 3 or any later version (LGPLv3 or
+   later), or the GNU General Public License, version 2 (GPLv2), in all
+   cases as published by the Free Software Foundation.
+*/
+
+#ifndef __BIT_ROT_OBJECT_VERSION_H
+#define __BIT_ROT_OBJECT_VERSION_H
+
+/**
+ * on-disk formats for ongoing version and object signature.
+ */
+/*
+ * Ongoing (current) version record: a version counter plus two 32-bit
+ * time words. The setters in bit-rot-common.h copy the time words from a
+ * caller-supplied two-element array.
+ */
+typedef struct br_version {
+    unsigned long ongoingversion;
+    uint32_t timebuf[2];
+} br_version_t;
+
+/*
+ * Signature record. The structure is packed, so no padding is inserted
+ * between the one-byte type and the version field. The signature bytes
+ * follow the header through the zero-length trailing array; the total
+ * record size is therefore sizeof(br_signature_t) plus the signature
+ * length.
+ */
+typedef struct __attribute__((__packed__)) br_signature {
+    int8_t signaturetype; /* one of br_signature_type */
+
+    unsigned long signedversion; /* version the signature applies to */
+
+    char signature[0]; /* variable-length signature bytes */
+} br_signature_t;
+
+#endif
diff --git a/xlators/features/bit-rot/src/stub/bit-rot-stub-helpers.c b/xlators/features/bit-rot/src/stub/bit-rot-stub-helpers.c
new file mode 100644
index 00000000000..8ac13a09941
--- /dev/null
+++ b/xlators/features/bit-rot/src/stub/bit-rot-stub-helpers.c
@@ -0,0 +1,796 @@
+/*
+   Copyright (c) 2015 Red Hat, Inc. <http://www.redhat.com>
+   This file is part of GlusterFS.
+
+   This file is licensed to you under your choice of the GNU Lesser
+   General Public License, version 3 or any later version (LGPLv3 or
+   later), or the GNU General Public License, version 2 (GPLv2), in all
+   cases as published by the Free Software Foundation.
+*/
+
+#include "bit-rot-stub.h"
+
+/*
+ * Allocate a zero-initialised fd context. Returns NULL when the
+ * allocation fails.
+ */
+br_stub_fd_t *
+br_stub_fd_new(void)
+{
+    return GF_CALLOC(1, sizeof(br_stub_fd_t), gf_br_stub_mt_br_stub_fd_t);
+}
+
+/*
+ * Attach @br_stub_fd to @fd as this xlator's fd context. Uses the
+ * double-underscore __fd_ctx_set() variant; the locked wrapper
+ * br_stub_fd_ctx_set() calls this with fd->lock held.
+ *
+ * Returns the result of __fd_ctx_set(), or -1 if any argument is NULL.
+ */
+int
+__br_stub_fd_ctx_set(xlator_t *this, fd_t *fd, br_stub_fd_t *br_stub_fd)
+{
+    int status = -1;
+
+    GF_VALIDATE_OR_GOTO("bit-rot-stub", this, out);
+    GF_VALIDATE_OR_GOTO(this->name, fd, out);
+    GF_VALIDATE_OR_GOTO(this->name, br_stub_fd, out);
+
+    /* the context slot is a 64-bit integer; stash the pointer in it */
+    status = __fd_ctx_set(fd, this, (uint64_t)(long)br_stub_fd);
+
+out:
+    return status;
+}
+
+/*
+ * Fetch this xlator's context from @fd. Uses the double-underscore
+ * __fd_ctx_get() variant; the locked wrapper br_stub_fd_ctx_get() calls
+ * this with fd->lock held.
+ *
+ * Returns the stored context, or NULL when an argument is NULL or no
+ * context has been set.
+ */
+br_stub_fd_t *
+__br_stub_fd_ctx_get(xlator_t *this, fd_t *fd)
+{
+    uint64_t slot = 0;
+    br_stub_fd_t *ctx = NULL;
+
+    GF_VALIDATE_OR_GOTO("bit-rot-stub", this, out);
+    GF_VALIDATE_OR_GOTO(this->name, fd, out);
+
+    /* convert the integer slot back to a pointer only on success */
+    if (__fd_ctx_get(fd, this, &slot) == 0)
+        ctx = (br_stub_fd_t *)((long)slot);
+
+out:
+    return ctx;
+}
+
+/*
+ * Locked variant of __br_stub_fd_ctx_get(): takes fd->lock around the
+ * lookup. Returns NULL when an argument is NULL or no context is set.
+ */
+br_stub_fd_t *
+br_stub_fd_ctx_get(xlator_t *this, fd_t *fd)
+{
+    br_stub_fd_t *ctx = NULL;
+
+    GF_VALIDATE_OR_GOTO("bit-rot-stub", this, out);
+    GF_VALIDATE_OR_GOTO(this->name, fd, out);
+
+    LOCK(&fd->lock);
+    ctx = __br_stub_fd_ctx_get(this, fd);
+    UNLOCK(&fd->lock);
+
+out:
+    return ctx;
+}
+
+/*
+ * Locked variant of __br_stub_fd_ctx_set(): takes fd->lock around the
+ * store. Returns -1 when any argument is NULL, otherwise whatever the
+ * unlocked helper returns.
+ */
+int32_t
+br_stub_fd_ctx_set(xlator_t *this, fd_t *fd, br_stub_fd_t *br_stub_fd)
+{
+    int32_t status = -1;
+
+    GF_VALIDATE_OR_GOTO("bit-rot-stub", this, out);
+    GF_VALIDATE_OR_GOTO(this->name, fd, out);
+    GF_VALIDATE_OR_GOTO(this->name, br_stub_fd, out);
+
+    LOCK(&fd->lock);
+    status = __br_stub_fd_ctx_set(this, fd, br_stub_fd);
+    UNLOCK(&fd->lock);
+
+out:
+    return status;
+}
+
+/**
+ * Adds an entry to the bad objects directory.
+ * @gfid: gfid of the bad object being added to the bad objects directory
+ *
+ * The entry is a hard link, named after the gfid, to the
+ * "stub-<container gfid>" file inside priv->stub_basepath.
+ *
+ * Returns 0 when the link was created, or when linking failed with
+ * ENOENT, EMLINK or EEXIST (logged as a warning and tolerated).
+ * Returns -1 for a null gfid, for any other link failure and when an
+ * entry for @gfid already exists.
+ *
+ * NOTE(review): an already-present entry is reported as -1 although the
+ * object is effectively already recorded -- confirm callers expect this.
+ */
+int
+br_stub_add(xlator_t *this, uuid_t gfid)
+{
+    br_stub_private_t *priv = this->private;
+    char entry_path[BR_PATH_MAX_PLUS] = {0};
+    char container_path[BR_PATH_MAX_PLUS] = {0};
+    struct stat sb = {0};
+
+    GF_ASSERT_AND_GOTO_WITH_ERROR(this->name, !gf_uuid_is_null(gfid), out,
+                                  errno, EINVAL);
+
+    snprintf(entry_path, sizeof(entry_path), "%s/%s", priv->stub_basepath,
+             uuid_utoa(gfid));
+
+    if (sys_stat(entry_path, &sb) == 0)
+        goto out;
+
+    snprintf(container_path, sizeof(container_path), "%s/stub-%s",
+             priv->stub_basepath, uuid_utoa(priv->bad_object_dir_gfid));
+
+    if (sys_link(container_path, entry_path) != 0) {
+        switch (errno) {
+            case ENOENT:
+            case EMLINK:
+            case EEXIST:
+                break;
+            default:
+                goto out;
+        }
+
+        /*
+         * Continue with success. At least we'll have half of the
+         * functionality, in the sense, object is marked bad and
+         * would be inaccessible. It's only scrub status that would
+         * show up less number of objects. That's fine as we'll have
+         * the log files that will have the missing information.
+         */
+        gf_smsg(this->name, GF_LOG_WARNING, errno, BRS_MSG_LINK_FAIL, "gfid=%s",
+                uuid_utoa(gfid), NULL);
+    }
+
+    return 0;
+out:
+    return -1;
+}
+
+/**
+ * Removes the entry for @gfid from the bad objects directory.
+ * @gfid: gfid of the object whose quarantine entry is to be removed
+ *
+ * A missing entry (ENOENT) is not an error.
+ *
+ * Returns 0 on success (or when the entry did not exist) and the negated
+ * errno of the failed unlink otherwise.
+ */
+int
+br_stub_del(xlator_t *this, uuid_t gfid)
+{
+    int32_t op_errno __attribute__((unused)) = 0;
+    br_stub_private_t *priv = NULL;
+    int ret = 0;
+    int saved_errno = 0;
+    char gfid_path[BR_PATH_MAX_PLUS] = {0};
+
+    priv = this->private;
+    GF_ASSERT_AND_GOTO_WITH_ERROR(this->name, !gf_uuid_is_null(gfid), out,
+                                  op_errno, EINVAL);
+    snprintf(gfid_path, sizeof(gfid_path), "%s/%s", priv->stub_basepath,
+             uuid_utoa(gfid));
+    ret = sys_unlink(gfid_path);
+    if (ret && (errno != ENOENT)) {
+        /*
+         * Capture errno before logging: the logging call is free to
+         * modify it, and the value is needed for the return code.
+         */
+        saved_errno = errno;
+        gf_smsg(this->name, GF_LOG_ERROR, saved_errno,
+                BRS_MSG_BAD_OBJ_UNLINK_FAIL, "path=%s", gfid_path, NULL);
+        ret = -saved_errno;
+        goto out;
+    }
+
+    ret = 0;
+
+out:
+    return ret;
+}
+
+/**
+ * Make sure the bad objects directory @fullpath exists.
+ *
+ * If @fullpath is missing, the directory at the old quarantine location
+ * (<export>/OLD_BR_STUB_QUARANTINE_DIR) is renamed into place when it
+ * exists; otherwise @fullpath is created with mkdir_p().
+ *
+ * Returns 0 on success, -1 when @fullpath exists but is not a directory
+ * or cannot be examined, and the failing call's status when creation or
+ * rename fails.
+ */
+static int
+br_stub_check_stub_directory(xlator_t *this, char *fullpath)
+{
+    br_stub_private_t *priv = this->private;
+    char legacy_path[BR_PATH_MAX_PLUS] = {0};
+    struct stat sb = {
+        0,
+    };
+    int status = 0;
+
+    snprintf(legacy_path, sizeof(legacy_path), "%s/%s", priv->export,
+             OLD_BR_STUB_QUARANTINE_DIR);
+
+    status = sys_stat(fullpath, &sb);
+    if (status == 0) {
+        /* something is there already; it has to be a directory */
+        if (!S_ISDIR(sb.st_mode))
+            goto error_return;
+    } else {
+        if (errno != ENOENT)
+            goto error_return;
+
+        /* prefer migrating the old location over creating afresh */
+        if (sys_stat(legacy_path, &sb) == 0)
+            status = sys_rename(legacy_path, fullpath);
+        else
+            status = mkdir_p(fullpath, 0600, _gf_true);
+    }
+
+    if (status)
+        gf_smsg(this->name, GF_LOG_ERROR, errno, BRS_MSG_BAD_OBJECT_DIR_FAIL,
+                "create-path=%s", fullpath, NULL);
+    return status;
+
+error_return:
+    gf_smsg(this->name, GF_LOG_ERROR, errno, BRS_MSG_BAD_OBJECT_DIR_FAIL,
+            "verify-path=%s", fullpath, NULL);
+    return -1;
+}
+
+/**
+ * Function to create the container for the bad objects within the bad objects
+ * directory.
+ *
+ * If @path already exists it must be a regular file. If it does not
+ * exist it is created (with mode 0) and the descriptor is closed again.
+ *
+ * Returns 0 when the file exists or was created, non-zero otherwise.
+ */
+static int
+br_stub_check_stub_file(xlator_t *this, char *path)
+{
+    struct stat sb = {
+        0,
+    };
+    int stat_status = 0;
+    int newfd = -1;
+
+    stat_status = sys_stat(path, &sb);
+    if (stat_status == 0) {
+        if (!S_ISREG(sb.st_mode))
+            goto error_return;
+        return 0;
+    }
+
+    if (errno != ENOENT)
+        goto error_return;
+
+    newfd = sys_creat(path, 0);
+    if (newfd < 0) {
+        gf_smsg(this->name, GF_LOG_ERROR, errno, BRS_MSG_BAD_OBJECT_DIR_FAIL,
+                "create-path=%s", path, NULL);
+        /* report the (failed) stat status, as nothing was created */
+        return stat_status;
+    }
+
+    sys_close(newfd);
+    return 0;
+
+error_return:
+    gf_smsg(this->name, GF_LOG_ERROR, errno, BRS_MSG_BAD_OBJECT_DIR_FAIL,
+            "verify-path=%s", path, NULL);
+    return -1;
+}
+
+/**
+ * Set up the bad objects directory and the container file inside it.
+ *
+ * Records BR_BAD_OBJ_CONTAINER as priv->bad_object_dir_gfid, then makes
+ * sure priv->stub_basepath exists as a directory and that the
+ * "stub-<container gfid>" file exists within it.
+ *
+ * Returns 0 on success and -1 on any failure, including either path
+ * being too long for its buffer.
+ */
+int
+br_stub_dir_create(xlator_t *this, br_stub_private_t *priv)
+{
+    char dir_path[BR_PATH_MAX_PLUS] = {
+        0,
+    };
+    char container_path[BR_PATH_MAX_PLUS] = {
+        0,
+    };
+    int written = 0;
+
+    gf_uuid_copy(priv->bad_object_dir_gfid, BR_BAD_OBJ_CONTAINER);
+
+    /* a return value >= the buffer size means the path was truncated */
+    written = snprintf(dir_path, sizeof(dir_path), "%s", priv->stub_basepath);
+    if (written >= sizeof(dir_path))
+        return -1;
+
+    written = snprintf(container_path, sizeof(container_path), "%s/stub-%s",
+                       priv->stub_basepath,
+                       uuid_utoa(priv->bad_object_dir_gfid));
+    if (written >= sizeof(container_path))
+        return -1;
+
+    if (br_stub_check_stub_directory(this, dir_path) != 0)
+        return -1;
+
+    if (br_stub_check_stub_file(this, container_path) != 0)
+        return -1;
+
+    return 0;
+}
+
+/*
+ * Detach and return the first call stub queued on @callstubs, or NULL
+ * when the list is empty. Performs no locking of its own.
+ */
+call_stub_t *
+__br_stub_dequeue(struct list_head *callstubs)
+{
+    call_stub_t *head = NULL;
+
+    if (list_empty(callstubs))
+        return NULL;
+
+    head = list_entry(callstubs->next, call_stub_t, list);
+    list_del_init(&head->list);
+
+    return head;
+}
+
+/*
+ * Append @stub to the tail of @callstubs. Performs no locking of its own;
+ * br_stub_worker_enqueue() calls it with the queue mutex held.
+ */
+void
+__br_stub_enqueue(struct list_head *callstubs, call_stub_t *stub)
+{
+    list_add_tail(&stub->list, callstubs);
+}
+
+/*
+ * Queue @stub on the bad-object work queue and signal the condition
+ * variable that br_stub_worker() waits on. The queue mutex is held for
+ * both steps.
+ */
+void
+br_stub_worker_enqueue(xlator_t *this, call_stub_t *stub)
+{
+    br_stub_private_t *priv = this->private;
+
+    pthread_mutex_lock(&priv->container.bad_lock);
+    __br_stub_enqueue(&priv->container.bad_queue, stub);
+    pthread_cond_signal(&priv->container.bad_cond);
+    pthread_mutex_unlock(&priv->container.bad_lock);
+}
+
+/*
+ * Thread routine draining the bad-object work queue.
+ *
+ * @data is the xlator; it is also installed as THIS for the thread. The
+ * routine loops forever: it sleeps on the condition variable while the
+ * queue is empty, removes one stub under the queue mutex and resumes it
+ * with the mutex released.
+ */
+void *
+br_stub_worker(void *data)
+{
+    xlator_t *this = data;
+    br_stub_private_t *priv = NULL;
+    call_stub_t *pending = NULL;
+
+    THIS = data;
+    priv = this->private;
+
+    while (1) {
+        pthread_mutex_lock(&priv->container.bad_lock);
+
+        /* re-check the predicate after every wakeup */
+        while (list_empty(&priv->container.bad_queue))
+            (void)pthread_cond_wait(&priv->container.bad_cond,
+                                    &priv->container.bad_lock);
+
+        pending = __br_stub_dequeue(&priv->container.bad_queue);
+
+        pthread_mutex_unlock(&priv->container.bad_lock);
+
+        if (pending) /* guard against spurious wakeups */
+            call_resume(pending);
+    }
+
+    return NULL;
+}
+
+/**
+ * Answer a lookup on the bad objects container directly from this xlator.
+ *
+ * The lookup is served only when @loc carries the gfid of the bad objects
+ * directory (priv->bad_object_dir_gfid). In that case the directory at
+ * priv->stub_basepath is lstat()ed, the result is converted to an iatt
+ * carrying the container gfid, and the call is unwound with a fresh,
+ * empty xattr dict.
+ *
+ * Failure cases, all unwound with op_ret = -1:
+ *  - versioning disabled / @loc NULL / gfid mismatch: EINVAL
+ *  - lstat failure:                                   errno of lstat
+ *  - base path is not a directory:                    ENOTDIR
+ *  - dict allocation failure:                         ENOMEM
+ *
+ * Always returns 0; the outcome is delivered through the unwind.
+ */
+int32_t
+br_stub_lookup_wrapper(call_frame_t *frame, xlator_t *this, loc_t *loc,
+                       dict_t *xattr_req)
+{
+    br_stub_private_t *priv = NULL;
+    struct stat lstatbuf = {0};
+    int ret = 0;
+    int32_t op_errno = EINVAL;
+    int32_t op_ret = -1;
+    struct iatt stbuf = {
+        0,
+    };
+    struct iatt postparent = {
+        0,
+    };
+    dict_t *xattr = NULL;
+    gf_boolean_t ver_enabled = _gf_false;
+
+    BR_STUB_VER_ENABLED_IN_CALLPATH(frame, ver_enabled);
+    priv = this->private;
+    BR_STUB_VER_COND_GOTO(priv, (!ver_enabled), done);
+
+    VALIDATE_OR_GOTO(loc, done);
+    if (gf_uuid_compare(loc->gfid, priv->bad_object_dir_gfid))
+        goto done;
+
+    ret = sys_lstat(priv->stub_basepath, &lstatbuf);
+    if (ret) {
+        /* capture errno before logging, which may overwrite it */
+        op_errno = errno;
+        gf_msg_debug(this->name, op_errno,
+                     "Stat failed on stub bad "
+                     "object dir");
+        goto done;
+    } else if (!S_ISDIR(lstatbuf.st_mode)) {
+        /* lstat succeeded, so errno is stale here; report ENOTDIR */
+        op_errno = ENOTDIR;
+        gf_msg_debug(this->name, op_errno,
+                     "bad object container is not "
+                     "a directory");
+        goto done;
+    }
+
+    iatt_from_stat(&stbuf, &lstatbuf);
+    gf_uuid_copy(stbuf.ia_gfid, priv->bad_object_dir_gfid);
+
+    op_ret = op_errno = 0;
+    xattr = dict_new();
+    if (!xattr) {
+        op_ret = -1;
+        op_errno = ENOMEM;
+    }
+
+done:
+    /*
+     * @loc may be NULL when we get here through the validation above, so
+     * it must not be dereferenced unconditionally.
+     */
+    STACK_UNWIND_STRICT(lookup, frame, op_ret, op_errno,
+                        loc ? loc->inode : NULL, &stbuf, xattr, &postparent);
+    if (xattr)
+        dict_unref(xattr);
+    return 0;
+}
+
+/*
+ * Return 1 when @filename is exactly "stub-<gfid>" for the given @gfid,
+ * 0 otherwise.
+ */
+static int
+is_bad_gfid_file_current(char *filename, uuid_t gfid)
+{
+    char expected[GF_UUID_BUF_SIZE + 16] = {
+        0,
+    };
+
+    snprintf(expected, sizeof(expected), "stub-%s", uuid_utoa(gfid));
+
+    return (strcmp(filename, expected) == 0);
+}
+
+/*
+ * Best-effort removal of a stale container file from the bad objects
+ * directory.
+ *
+ * @filename is left alone when it names the current container
+ * ("stub-<bad_object_dir_gfid>"). Otherwise it is unlinked, but only if
+ * its link count is 1, i.e. no other directory entry shares its inode.
+ * Failures of stat or unlink are ignored.
+ */
+static void
+check_delete_stale_bad_file(xlator_t *this, char *filename)
+{
+    br_stub_private_t *priv = this->private;
+    char stale_path[BR_PATH_MAX_PLUS] = {0};
+    struct stat sb = {0};
+
+    if (is_bad_gfid_file_current(filename, priv->bad_object_dir_gfid))
+        return;
+
+    snprintf(stale_path, sizeof(stale_path), "%s/%s", priv->stub_basepath,
+             filename);
+
+    if (sys_stat(stale_path, &sb) != 0)
+        return;
+
+    if (sb.st_nlink == 1)
+        sys_unlink(stale_path);
+}
+
+/**
+ * Read entries of the bad objects directory into @entries.
+ *
+ * @fd:      not referenced by this function
+ * @fctx:    fd context; its bad_object.dir_eof records the EOF offset
+ * @dir:     open directory stream to read from
+ * @off:     offset to resume from; 0 rewinds the stream
+ * @size:    byte budget for the returned entries
+ * @entries: list head the new gf_dirent_t entries are appended to
+ *
+ * ".", ".." and every name starting with "stub-" are skipped; for the
+ * latter check_delete_stale_bad_file() is invoked as a side effect.
+ *
+ * Returns the number of entries appended. On non-Linux hosts -1 is
+ * returned (with errno = EINVAL) when a seek cannot be verified. When
+ * the end of the directory is reached errno is set to ENOENT, which the
+ * caller uses as the EOF indication.
+ */
+static int
+br_stub_fill_readdir(fd_t *fd, br_stub_fd_t *fctx, DIR *dir, off_t off,
+                     size_t size, gf_dirent_t *entries)
+{
+    off_t in_case = -1;  /* stream position before the entry being read */
+    off_t last_off = 0;  /* stream position after the last entry added */
+    size_t filled = 0;   /* bytes of the budget consumed so far */
+    int count = 0;
+    int32_t this_size = -1;
+    gf_dirent_t *this_entry = NULL;
+    xlator_t *this = NULL;
+    struct dirent *entry = NULL;
+    struct dirent scratch[2] = {
+        {
+            0,
+        },
+    };
+
+    this = THIS;
+    if (!off) {
+        rewinddir(dir);
+    } else {
+        seekdir(dir, off);
+#ifndef GF_LINUX_HOST_OS
+        /*
+         * Verify the seek took effect; a mismatch is tolerated only when
+         * the requested offset is the remembered EOF offset.
+         */
+        if ((u_long)telldir(dir) != off && off != fctx->bad_object.dir_eof) {
+            gf_smsg(THIS->name, GF_LOG_ERROR, 0,
+                    BRS_MSG_BAD_OBJECT_DIR_SEEK_FAIL, "off=(0x%llx)", off,
+                    "dir=%p", dir, NULL);
+            errno = EINVAL;
+            count = -1;
+            goto out;
+        }
+#endif /* GF_LINUX_HOST_OS */
+    }
+
+    while (filled <= size) {
+        /* remember where this entry starts so we can seek back to it */
+        in_case = (u_long)telldir(dir);
+
+        if (in_case == -1) {
+            gf_smsg(THIS->name, GF_LOG_ERROR, 0,
+                    BRS_MSG_BAD_OBJECT_DIR_TELL_FAIL, "dir=%p", dir, "err=%s",
+                    strerror(errno), NULL);
+            goto out;
+        }
+
+        /* cleared so that a NULL return with errno == 0 means "no more" */
+        errno = 0;
+        entry = sys_readdir(dir, scratch);
+        if (!entry || errno != 0) {
+            if (errno == EBADF) {
+                gf_smsg(THIS->name, GF_LOG_WARNING, 0,
+                        BRS_MSG_BAD_OBJECT_DIR_READ_FAIL, "dir=%p", dir,
+                        "err=%s", strerror(errno), NULL);
+                goto out;
+            }
+            break;
+        }
+
+        if (!strcmp(entry->d_name, ".") || !strcmp(entry->d_name, ".."))
+            continue;
+
+        /* container files are never reported; stale ones get cleaned up */
+        if (!strncmp(entry->d_name, "stub-", strlen("stub-"))) {
+            check_delete_stale_bad_file(this, entry->d_name);
+            continue;
+        }
+
+        /* budget accounting: larger of the two record sizes + name + NUL */
+        this_size = max(sizeof(gf_dirent_t), sizeof(gfs3_dirplist)) +
+                    strlen(entry->d_name) + 1;
+
+        if (this_size + filled > size) {
+            /* does not fit: rewind to before this entry and stop */
+            seekdir(dir, in_case);
+#ifndef GF_LINUX_HOST_OS
+            if ((u_long)telldir(dir) != in_case &&
+                in_case != fctx->bad_object.dir_eof) {
+                gf_smsg(THIS->name, GF_LOG_ERROR, 0,
+                        BRS_MSG_BAD_OBJECT_DIR_SEEK_FAIL, "in_case=(0x%llx)",
+                        in_case, "dir=%p", dir, NULL);
+                errno = EINVAL;
+                count = -1;
+                goto out;
+            }
+#endif /* GF_LINUX_HOST_OS */
+            break;
+        }
+
+        this_entry = gf_dirent_for_name(entry->d_name);
+
+        if (!this_entry) {
+            gf_smsg(THIS->name, GF_LOG_ERROR, 0,
+                    BRS_MSG_CREATE_GF_DIRENT_FAILED, "entry-name=%s",
+                    entry->d_name, "err=%s", strerror(errno), NULL);
+            goto out;
+        }
+        /*
+         * we store the offset of next entry here, which is
+         * probably not intended, but code using syncop_readdir()
+         * (glfs-heal.c, afr-self-heald.c, pump.c) rely on it
+         * for directory read resumption.
+         */
+        last_off = (u_long)telldir(dir);
+        this_entry->d_off = last_off;
+        this_entry->d_ino = entry->d_ino;
+
+        list_add_tail(&this_entry->list, &entries->list);
+
+        filled += this_size;
+        count++;
+    }
+
+    /*
+     * Probe once more: a NULL result with errno still 0 means the stream
+     * is exhausted. Note that this read advances the stream when entries
+     * remain; the next call repositions it through @off.
+     */
+    if ((!sys_readdir(dir, scratch) && (errno == 0))) {
+        /* Indicate EOF */
+        errno = ENOENT;
+        /* Remember EOF offset for later detection */
+        fctx->bad_object.dir_eof = last_off;
+    }
+out:
+    return count;
+}
+
+/**
+ * Serve readdir on the bad objects directory.
+ *
+ * Entries are read from the DIR stream kept in the fd context and, where
+ * possible, the path of each corrupted object is added to the reply
+ * dictionary (keyed by gfid) through br_stub_bad_objects_path(). If the
+ * caller passed no @xdata, a dictionary created for that purpose is used
+ * for the reply and released afterwards.
+ *
+ * Unwinds with op_ret = number of entries and op_errno = errno left by
+ * br_stub_fill_readdir() (ENOENT marks EOF). A missing fd context or a
+ * NULL directory handle unwinds with op_ret = -1 and op_errno = EINVAL.
+ *
+ * Always returns 0; the outcome is delivered through the unwind.
+ */
+int32_t
+br_stub_readdir_wrapper(call_frame_t *frame, xlator_t *this, fd_t *fd,
+                        size_t size, off_t off, dict_t *xdata)
+{
+    br_stub_fd_t *fctx = NULL;
+    DIR *dir = NULL;
+    int32_t op_ret = -1;
+    int32_t op_errno = 0;
+    int count = 0;
+    gf_dirent_t entries;
+    gf_boolean_t xdata_unref = _gf_false;
+    dict_t *dict = NULL;
+
+    INIT_LIST_HEAD(&entries.list);
+
+    fctx = br_stub_fd_ctx_get(this, fd);
+    if (!fctx) {
+        gf_smsg(this->name, GF_LOG_WARNING, 0, BRS_MSG_GET_FD_CONTEXT_FAILED,
+                "fd=%p", fd, NULL);
+        /*
+         * br_stub_fd_ctx_get() yields no error code, so report an explicit
+         * errno instead of deriving one from an unrelated local.
+         */
+        op_errno = EINVAL;
+        goto done;
+    }
+
+    dir = fctx->bad_object.dir;
+
+    if (!dir) {
+        gf_smsg(this->name, GF_LOG_WARNING, 0, BRS_MSG_BAD_HANDLE_DIR_NULL,
+                "fd=%p", fd, NULL);
+        op_errno = EINVAL;
+        goto done;
+    }
+
+    count = br_stub_fill_readdir(fd, fctx, dir, off, size, &entries);
+
+    /* pick ENOENT to indicate EOF */
+    op_errno = errno;
+    op_ret = count;
+
+    dict = xdata;
+    (void)br_stub_bad_objects_path(this, fd, &entries, &dict);
+    if (!xdata && dict) {
+        /* the helper allocated a dict for us; we own that reference */
+        xdata = dict;
+        xdata_unref = _gf_true;
+    }
+
+done:
+    STACK_UNWIND_STRICT(readdir, frame, op_ret, op_errno, &entries, xdata);
+    gf_dirent_free(&entries);
+    if (xdata_unref)
+        dict_unref(xdata);
+    return 0;
+}
+
+/**
+ * This function is called to mainly obtain the paths of the corrupt
+ * objects (files as of now). Currently scrub status prints only the
+ * gfid of the corrupted files. Reason is, bitrot-stub maintains the
+ * list of the corrupted objects as entries inside the quarantine
+ * directory (<brick export>/.glusterfs/quarantine)
+ *
+ * And the name of each entry in the quarantine directory is the gfid
+ * of the corrupted object. So scrub status will just show that info.
+ * But it helps the users a lot if the actual path to the object is
+ * also reported. Hence the below function to get that information.
+ * The function allocates a new dict to be returned (if it does not
+ * get one from the caller of readdir i.e. scrubber as of now), and
+ * stores the paths of each corrupted gfid there. The gfid is used as
+ * the key and path is used as the value.
+ *
+ * NOTE: The path will be there in following situations
+ * 1) gfid2path option has been enabled (posix xlator option)
+ *    and the corrupted file contains the path as an extended
+ *    attribute.
+ * 2) If the gfid2path option is not enabled, OR if the xattr
+ *    is absent, then the inode table should have it.
+ *    The path will be there if a name based lookup has happened
+ *    on the file which has been corrupted. With lookup a inode and
+ *    dentry would be created in the inode table. And the path is
+ *    constructed using the in memory inode and dentry. If a lookup
+ *    has not happened OR the inode corresponding to the corrupted
+ *    file does not exist in the inode table (because it got purged
+ *    as lru limit of the inodes exceeded) OR a nameless lookup had
+ *    happened to populate the inode in the inode table, then the
+ *    path will not be printed in scrub and only the gfid will be there.
+ *
+ * @dict is in/out: when *dict is NULL a new dictionary is allocated and
+ * handed back through it (the caller then owns that reference).
+ *
+ * Returns 0 on success (including an empty @entries list) and -1 when a
+ * dictionary was needed but could not be allocated.
+ **/
+int
+br_stub_bad_objects_path(xlator_t *this, fd_t *fd, gf_dirent_t *entries,
+                         dict_t **dict)
+{
+    gf_dirent_t *entry = NULL;
+    inode_t *inode = NULL;
+    char *hpath = NULL;
+    uuid_t gfid = {0};
+    int ret = -1;
+    dict_t *tmp_dict = NULL;
+    char str_gfid[64] = {0};
+
+    if (list_empty(&entries->list))
+        return 0;
+
+    tmp_dict = *dict;
+
+    if (!tmp_dict) {
+        tmp_dict = dict_new();
+        /*
+         * If the allocation of the dict fails then there is no need
+         * to treat it as an error. This path (or function) is executed
+         * when "gluster volume bitrot <volume name> scrub status" is
+         * executed, to get the list of the corrupted objects.
+         * And the motive of this function is to get the paths of
+         * the corrupted objects. If the dict allocation fails, then
+         * the scrub status will only show the gfids of those corrupted
+         * objects (which is the behavior as of the time of this patch
+         * being worked upon). So just return and only the gfids will
+         * be shown.
+         */
+        if (!tmp_dict) {
+            gf_smsg(this->name, GF_LOG_ERROR, 0, BRS_MSG_ALLOC_FAILED, NULL);
+            goto out;
+        }
+    }
+
+    list_for_each_entry(entry, &entries->list, list)
+    {
+        gf_uuid_clear(gfid);
+        gf_uuid_parse(entry->d_name, gfid);
+
+        /* inode_find() hands back a referenced inode (or NULL) */
+        inode = inode_find(fd->inode->table, gfid);
+
+        /* No need to check the return value here.
+         * Because @hpath is examined.
+         */
+        (void)br_stub_get_path_of_gfid(this, fd->inode, inode, gfid, &hpath);
+
+        if (hpath) {
+            gf_msg_debug(this->name, 0,
+                         "path of the corrupted "
+                         "object (gfid: %s) is %s",
+                         uuid_utoa(gfid), hpath);
+            /* ownership of @hpath passes to the dict helper */
+            br_stub_entry_xattr_fill(this, hpath, entry, tmp_dict);
+        } else
+            gf_smsg(this->name, GF_LOG_WARNING, 0, BRS_MSG_PATH_GET_FAILED,
+                    "gfid=%s", uuid_utoa_r(gfid, str_gfid), NULL);
+
+        /*
+         * Drop the reference taken by inode_find(); without this every
+         * listed entry would leak one inode reference.
+         */
+        if (inode) {
+            inode_unref(inode);
+            inode = NULL;
+        }
+        hpath = NULL;
+    }
+
+    ret = 0;
+    *dict = tmp_dict;
+
+out:
+    return ret;
+}
+
+/**
+ * Resolve the path of the object identified by @gfid.
+ *
+ * @parent: inode whose table is used for the resolution
+ * @inode:  inode of the object if it is present in the inode table, or
+ *          NULL
+ * @path:   out; receives the resolved path
+ *
+ * A hard resolution is tried first. If that fails and @inode is
+ * available, a soft resolution is attempted.
+ *
+ * Returns the status of the last resolution attempted (negative on
+ * failure), or -1 when a mandatory argument is NULL.
+ */
+int
+br_stub_get_path_of_gfid(xlator_t *this, inode_t *parent, inode_t *inode,
+                         uuid_t gfid, char **path)
+{
+    char gfid_buf[64] = {0};
+    int32_t status = -1;
+
+    GF_VALIDATE_OR_GOTO("bitrot-stub", this, out);
+    GF_VALIDATE_OR_GOTO(this->name, parent, out);
+    GF_VALIDATE_OR_GOTO(this->name, path, out);
+
+    /*
+     * @inode is deliberately not validated above: for the hard
+     * resolution it may be NULL, in which case
+     * syncop_gfid_to_path_hard allocates a new inode and proceeds. The
+     * inode is needed only for the syncop_getxattr call issued from
+     * inside syncop_gfid_to_path_hard, and that getxattr fetches the
+     * path from the backend.
+     */
+    status = syncop_gfid_to_path_hard(parent->table, FIRST_CHILD(this), gfid,
+                                      inode, path, _gf_true);
+    if (status >= 0)
+        goto out;
+
+    gf_smsg(this->name, GF_LOG_WARNING, 0, BRS_MSG_PATH_GET_FAILED, "gfid=%s",
+            uuid_utoa_r(gfid, gfid_buf), NULL);
+
+    /*
+     * Fall back to the soft resolution when the hard one fails. Reading
+     * the path from the on-disk xattr depends on the corresponding
+     * option being enabled in the posix xlator; when it is not, the
+     * hard resolution cannot succeed.
+     *
+     * The soft resolution mainly depends on inode_path(), which needs a
+     * linked inode, i.e. a successful lookup must have happened on the
+     * gfid (or the path). A NULL @inode means it was not found in the
+     * inode table, so inode_path() is better not attempted at all.
+     */
+    if (!inode)
+        goto out;
+
+    status = syncop_gfid_to_path_hard(parent->table, FIRST_CHILD(this), gfid,
+                                      inode, path, _gf_false);
+    if (status < 0)
+        gf_smsg(this->name, GF_LOG_WARNING, 0, BRS_MSG_PATH_GET_FAILED,
+                "from-memory  gfid=%s", uuid_utoa_r(gfid, gfid_buf), NULL);
+
+out:
+    return status;
+}
+
+/**
+ * Record the path of a corrupted object in @dict.
+ *
+ * NOTE: If the file has multiple hardlinks (in gluster volume
+ * namespace), the path would be one of the hardlinks. Its up to
+ * the user to find the remaining hardlinks (using find -samefile)
+ * and remove them.
+ **/
+void
+br_stub_entry_xattr_fill(xlator_t *this, char *hpath, gf_dirent_t *entry,
+                         dict_t *dict)
+{
+    GF_VALIDATE_OR_GOTO("bit-rot-stub", this, out);
+    GF_VALIDATE_OR_GOTO(this->name, hpath, out);
+
+    /*
+     * The key is entry->d_name (which is nothing but the gfid of the
+     * corrupted object) and the value is the actual path of that
+     * object (or file).
+     *
+     * Also ignore the dict_set errors. The scrubber will get the gfid
+     * of the corrupted object for sure, so for now just log the
+     * dict_set_dynstr failure and move on.
+     */
+    if (dict_set_dynstr(dict, entry->d_name, hpath) != 0)
+        gf_smsg(this->name, GF_LOG_WARNING, 0, BRS_MSG_DICT_SET_FAILED,
+                "path=%s", hpath, "object-name=%s", entry->d_name, NULL);
+out:
+    return;
+}
diff --git a/xlators/features/bit-rot/src/stub/bit-rot-stub-mem-types.h b/xlators/features/bit-rot/src/stub/bit-rot-stub-mem-types.h
new file mode 100644
index 00000000000..9d93caf069f
--- /dev/null
+++ b/xlators/features/bit-rot/src/stub/bit-rot-stub-mem-types.h
@@ -0,0 +1,36 @@
+/*
+   Copyright (c) 2015 Red Hat, Inc. <http://www.redhat.com>
+   This file is part of GlusterFS.
+
+   This file is licensed to you under your choice of the GNU Lesser
+   General Public License, version 3 or any later version (LGPLv3 or
+   later), or the GNU General Public License, version 2 (GPLv2), in all
+   cases as published by the Free Software Foundation.
+*/
+
+#ifndef _BR_MEM_TYPES_H
+#define _BR_MEM_TYPES_H
+
+#include <glusterfs/mem-types.h>
+
+/*
+ * Memory type tags passed to the allocation macros (for example
+ * GF_CALLOC(..., gf_br_stub_mt_br_stub_fd_t)). Numbering continues right
+ * after the common types (gf_common_mt_end + 1); gf_br_stub_mt_end is the
+ * terminating value and must stay last.
+ */
+enum br_mem_types {
+    gf_br_stub_mt_private_t = gf_common_mt_end + 1,
+    gf_br_stub_mt_version_t,
+    gf_br_stub_mt_inode_ctx_t,
+    gf_br_stub_mt_signature_t,
+    gf_br_mt_br_private_t,
+    gf_br_mt_br_child_t,
+    gf_br_mt_br_object_t,
+    gf_br_mt_br_ob_n_wk_t,
+    gf_br_mt_br_scrubber_t,
+    gf_br_mt_br_fsscan_entry_t,
+    gf_br_stub_mt_br_stub_fd_t,
+    gf_br_stub_mt_br_scanner_freq_t,
+    gf_br_stub_mt_sigstub_t,
+    gf_br_mt_br_child_event_t,
+    gf_br_stub_mt_misc,
+    gf_br_mt_br_worker_t,
+    gf_br_stub_mt_end,
+};
+
+#endif
diff --git a/xlators/features/bit-rot/src/stub/bit-rot-stub-messages.h b/xlators/features/bit-rot/src/stub/bit-rot-stub-messages.h
new file mode 100644
index 00000000000..6c15a166f18
--- /dev/null
+++ b/xlators/features/bit-rot/src/stub/bit-rot-stub-messages.h
@@ -0,0 +1,117 @@
+/*
+ Copyright (c) 2015 Red Hat, Inc. <http://www.redhat.com>
+ This file is part of GlusterFS.
+
+ This file is licensed to you under your choice of the GNU Lesser
+ General Public License, version 3 or any later version (LGPLv3 or
+ later), or the GNU General Public License, version 2 (GPLv2), in all
+ cases as published by the Free Software Foundation.
+ */
+
+#ifndef _BITROT_STUB_MESSAGES_H_
+#define _BITROT_STUB_MESSAGES_H_
+
+#include <glusterfs/glfs-message-id.h>
+
+/* To add new message IDs, append new identifiers at the end of the list.
+ *
+ * Never remove a message ID. If it's not used anymore, you can rename it or
+ * leave it as it is, but not delete it. This is to prevent reutilization of
+ * IDs by other messages.
+ *
+ * The component name must match one of the entries defined in
+ * glfs-message-id.h.
+ */
+
+GLFS_MSGID(BITROT_STUB, BRS_MSG_NO_MEMORY, BRS_MSG_SET_EVENT_FAILED,
+           BRS_MSG_MEM_ACNT_FAILED, BRS_MSG_CREATE_FRAME_FAILED,
+           BRS_MSG_SET_CONTEXT_FAILED, BRS_MSG_CHANGE_VERSION_FAILED,
+           BRS_MSG_ADD_FD_TO_LIST_FAILED, BRS_MSG_SET_FD_CONTEXT_FAILED,
+           BRS_MSG_CREATE_ANONYMOUS_FD_FAILED, BRS_MSG_NO_CHILD,
+           BRS_MSG_STUB_ALLOC_FAILED, BRS_MSG_GET_INODE_CONTEXT_FAILED,
+           BRS_MSG_CANCEL_SIGN_THREAD_FAILED, BRS_MSG_ADD_FD_TO_INODE,
+           BRS_MSG_SIGN_VERSION_ERROR, BRS_MSG_BAD_OBJ_MARK_FAIL,
+           BRS_MSG_NON_SCRUB_BAD_OBJ_MARK, BRS_MSG_REMOVE_INTERNAL_XATTR,
+           BRS_MSG_SET_INTERNAL_XATTR, BRS_MSG_BAD_OBJECT_ACCESS,
+           BRS_MSG_BAD_CONTAINER_FAIL, BRS_MSG_BAD_OBJECT_DIR_FAIL,
+           BRS_MSG_BAD_OBJECT_DIR_SEEK_FAIL, BRS_MSG_BAD_OBJECT_DIR_TELL_FAIL,
+           BRS_MSG_BAD_OBJECT_DIR_READ_FAIL, BRS_MSG_GET_FD_CONTEXT_FAILED,
+           BRS_MSG_BAD_HANDLE_DIR_NULL, BRS_MSG_BAD_OBJ_THREAD_FAIL,
+           BRS_MSG_BAD_OBJ_DIR_CLOSE_FAIL, BRS_MSG_LINK_FAIL,
+           BRS_MSG_BAD_OBJ_UNLINK_FAIL, BRS_MSG_DICT_SET_FAILED,
+           BRS_MSG_PATH_GET_FAILED, BRS_MSG_NULL_LOCAL,
+           BRS_MSG_SPAWN_SIGN_THRD_FAILED, BRS_MSG_KILL_SIGN_THREAD,
+           BRS_MSG_NON_BITD_PID, BRS_MSG_SIGN_PREPARE_FAIL,
+           BRS_MSG_USING_DEFAULT_THREAD_SIZE, BRS_MSG_ALLOC_MEM_FAILED,
+           BRS_MSG_DICT_ALLOC_FAILED, BRS_MSG_CREATE_GF_DIRENT_FAILED,
+           BRS_MSG_ALLOC_FAILED, BRS_MSG_PATH_XATTR_GET_FAILED,
+           BRS_MSG_VERSION_PREPARE_FAIL);
+
+/*
+ * Human-readable text for the message IDs above, one <ID>_STR per ID.
+ * NOTE(review): presumably gf_smsg() picks the text up through the _STR
+ * suffix, since callers pass only the ID -- confirm in logging.h.
+ */
+#define BRS_MSG_MEM_ACNT_FAILED_STR "Memory accounting init failed"
+#define BRS_MSG_BAD_OBJ_THREAD_FAIL_STR "pthread_init failed"
+#define BRS_MSG_USING_DEFAULT_THREAD_SIZE_STR "Using default thread stack size"
+#define BRS_MSG_NO_CHILD_STR "FATAL: no children"
+#define BRS_MSG_SPAWN_SIGN_THRD_FAILED_STR                                     \
+    "failed to create the new thread for signer"
+#define BRS_MSG_BAD_CONTAINER_FAIL_STR                                         \
+    "failed to launch the thread for storing bad gfids"
+#define BRS_MSG_CANCEL_SIGN_THREAD_FAILED_STR                                  \
+    "Could not cancel sign serializer thread"
+#define BRS_MSG_KILL_SIGN_THREAD_STR "killed the signer thread"
+#define BRS_MSG_GET_INODE_CONTEXT_FAILED_STR                                   \
+    "failed to init the inode context for the inode"
+#define BRS_MSG_ADD_FD_TO_INODE_STR "failed to add fd to the inode"
+#define BRS_MSG_NO_MEMORY_STR "local allocation failed"
+#define BRS_MSG_BAD_OBJECT_ACCESS_STR "bad object accessed. Returning"
+#define BRS_MSG_SIGN_VERSION_ERROR_STR "Signing version exceeds current version"
+#define BRS_MSG_NON_BITD_PID_STR                                               \
+    "PID from where signature request came, does not belong to bit-rot "       \
+    "daemon. Unwinding the fop"
+#define BRS_MSG_SIGN_PREPARE_FAIL_STR                                          \
+    "failed to prepare the signature. Unwinding the fop"
+#define BRS_MSG_VERSION_PREPARE_FAIL_STR                                       \
+    "failed to prepare the version. Unwinding the fop"
+#define BRS_MSG_STUB_ALLOC_FAILED_STR "failed to allocate stub fop, Unwinding"
+#define BRS_MSG_BAD_OBJ_MARK_FAIL_STR "failed to mark object as bad"
+#define BRS_MSG_NON_SCRUB_BAD_OBJ_MARK_STR                                     \
+    "bad object marking is not from the scrubber"
+#define BRS_MSG_ALLOC_MEM_FAILED_STR "failed to allocate memory"
+#define BRS_MSG_SET_INTERNAL_XATTR_STR "called on the internal xattr"
+#define BRS_MSG_REMOVE_INTERNAL_XATTR_STR "removexattr called on internal xattr"
+#define BRS_MSG_CREATE_ANONYMOUS_FD_FAILED_STR                                 \
+    "failed to create anonymous fd for the inode"
+#define BRS_MSG_ADD_FD_TO_LIST_FAILED_STR "failed to add fd to the list"
+#define BRS_MSG_SET_FD_CONTEXT_FAILED_STR                                      \
+    "failed to set the fd context for the file"
+#define BRS_MSG_NULL_LOCAL_STR "local is NULL"
+#define BRS_MSG_DICT_ALLOC_FAILED_STR                                          \
+    "dict allocation failed: cannot send IPC FOP to changelog"
+#define BRS_MSG_SET_EVENT_FAILED_STR "cannot set release event in dict"
+#define BRS_MSG_CREATE_FRAME_FAILED_STR "create_frame() failure"
+#define BRS_MSG_BAD_OBJ_DIR_CLOSE_FAIL_STR "closedir error"
+#define BRS_MSG_LINK_FAIL_STR "failed to record gfid"
+#define BRS_MSG_BAD_OBJ_UNLINK_FAIL_STR                                        \
+    "failed to delete bad object link from quarantine directory"
+#define BRS_MSG_BAD_OBJECT_DIR_FAIL_STR "failed stub directory"
+#define BRS_MSG_BAD_OBJECT_DIR_SEEK_FAIL_STR                                   \
+    "seekdir failed. Invalid argument (offset reused from another DIR * "      \
+    "structure)"
+#define BRS_MSG_BAD_OBJECT_DIR_TELL_FAIL_STR "telldir failed on dir"
+#define BRS_MSG_BAD_OBJECT_DIR_READ_FAIL_STR "readdir failed on dir"
+#define BRS_MSG_CREATE_GF_DIRENT_FAILED_STR "could not create gf_dirent"
+#define BRS_MSG_GET_FD_CONTEXT_FAILED_STR "pfd is NULL"
+#define BRS_MSG_BAD_HANDLE_DIR_NULL_STR "dir is NULL"
+#define BRS_MSG_ALLOC_FAILED_STR                                               \
+    "failed to allocate new dict for saving the paths of the corrupted "       \
+    "objects. Scrub status will only display the gfid"
+#define BRS_MSG_PATH_GET_FAILED_STR "failed to get the path"
+#define BRS_MSG_PATH_XATTR_GET_FAILED_STR                                      \
+    "failed to get the path xattr from disk for the gfid. Trying to get path " \
+    "from the memory"
+#define BRS_MSG_DICT_SET_FAILED_STR                                            \
+    "failed to set the actual path as the value in the dict for the "          \
+    "corrupted object"
+#define BRS_MSG_SET_CONTEXT_FAILED_STR                                         \
+    "could not set fd context for release callback"
+#define BRS_MSG_CHANGE_VERSION_FAILED_STR "change version failed"
+#endif /* !_BITROT_STUB_MESSAGES_H_ */
diff --git a/xlators/features/bit-rot/src/stub/bit-rot-stub.c b/xlators/features/bit-rot/src/stub/bit-rot-stub.c
new file mode 100644
index 00000000000..447dd47ff41
--- /dev/null
+++ b/xlators/features/bit-rot/src/stub/bit-rot-stub.c
@@ -0,0 +1,3590 @@
+/*
+   Copyright (c) 2015 Red Hat, Inc. <http://www.redhat.com>
+   This file is part of GlusterFS.
+
+   This file is licensed to you under your choice of the GNU Lesser
+   General Public License, version 3 or any later version (LGPLv3 or
+   later), or the GNU General Public License, version 2 (GPLv2), in all
+   cases as published by the Free Software Foundation.
+*/
+
+#include <ctype.h>
+#include <sys/uio.h>
+#include <signal.h>
+
+#include <glusterfs/glusterfs.h>
+#include <glusterfs/logging.h>
+#include "changelog.h"
+#include <glusterfs/compat-errno.h>
+#include <glusterfs/call-stub.h>
+
+#include "bit-rot-stub.h"
+#include "bit-rot-stub-mem-types.h"
+#include "bit-rot-stub-messages.h"
+#include "bit-rot-common.h"
+
+#define BR_STUB_REQUEST_COOKIE 0x1
+
+/**
+ * Thread-cancellation cleanup handler: unlocks the mutex passed in @arg.
+ * The signer thread registers it with pthread_cleanup_push() so that a
+ * cancelled thread does not leave priv->lock held.
+ */
+void
+br_stub_lock_cleaner(void *arg)
+{
+    pthread_mutex_unlock((pthread_mutex_t *)arg);
+}
+
+void *
+br_stub_signth(void *);
+
+/* One queued signing request; entries are linked on priv->squeue. */
+struct br_stub_signentry {
+    unsigned long v; /* NOTE(review): presumably the version to sign */
+
+    call_stub_t *stub; /* resumed by br_stub_signth() via call_resume() */
+
+    struct list_head list; /* linkage into priv->squeue */
+};
+
+/**
+ * Register this xlator's memory-accounting types (everything up to and
+ * including gf_br_stub_mt_end).
+ *
+ * Returns -1 when @this is NULL, otherwise whatever
+ * xlator_mem_acct_init() returned (non-zero is logged as a warning).
+ */
+int32_t
+mem_acct_init(xlator_t *this)
+{
+    int32_t rc;
+
+    if (this == NULL)
+        return -1;
+
+    rc = xlator_mem_acct_init(this, gf_br_stub_mt_end + 1);
+    if (rc != 0)
+        gf_smsg(this->name, GF_LOG_WARNING, 0, BRS_MSG_MEM_ACNT_FAILED, NULL);
+
+    return rc;
+}
+
+/**
+ * Set up the bad-object container in @priv: its condition variable, mutex
+ * and queue head, whatever br_stub_dir_create() prepares (presumably the
+ * quarantine directory), and the br_stub_worker thread.
+ *
+ * Returns 0 on success and -1 on failure. On failure every pthread object
+ * initialized here is destroyed again. The thread attribute object is
+ * always destroyed before returning, since it is only needed while the
+ * thread is being created.
+ */
+int
+br_stub_bad_object_container_init(xlator_t *this, br_stub_private_t *priv)
+{
+    pthread_attr_t w_attr;
+    int ret = -1;
+
+    ret = pthread_cond_init(&priv->container.bad_cond, NULL);
+    if (ret != 0) {
+        gf_smsg(this->name, GF_LOG_ERROR, 0, BRS_MSG_BAD_OBJ_THREAD_FAIL,
+                "cond_init ret=%d", ret, NULL);
+        goto out;
+    }
+
+    ret = pthread_mutex_init(&priv->container.bad_lock, NULL);
+    if (ret != 0) {
+        gf_smsg(this->name, GF_LOG_ERROR, 0, BRS_MSG_BAD_OBJ_THREAD_FAIL,
+                "mutex_init ret=%d", ret, NULL);
+        goto cleanup_cond;
+    }
+
+    ret = pthread_attr_init(&w_attr);
+    if (ret != 0) {
+        gf_smsg(this->name, GF_LOG_ERROR, 0, BRS_MSG_BAD_OBJ_THREAD_FAIL,
+                "attr_init ret=%d", ret, NULL);
+        goto cleanup_lock;
+    }
+
+    /* A rejected stack size is not fatal: the default size is used. */
+    ret = pthread_attr_setstacksize(&w_attr, BAD_OBJECT_THREAD_STACK_SIZE);
+    if (ret == EINVAL) {
+        gf_smsg(this->name, GF_LOG_WARNING, 0,
+                BRS_MSG_USING_DEFAULT_THREAD_SIZE, NULL);
+    }
+
+    INIT_LIST_HEAD(&priv->container.bad_queue);
+    ret = br_stub_dir_create(this, priv);
+    if (ret < 0)
+        goto cleanup_attr; /* w_attr is initialized by now; release it too */
+
+    ret = gf_thread_create(&priv->container.thread, &w_attr, br_stub_worker,
+                           this, "brswrker");
+    if (ret)
+        goto cleanup_attr;
+
+    /* The attribute object is not needed once the thread exists. */
+    pthread_attr_destroy(&w_attr);
+
+    return 0;
+
+cleanup_attr:
+    pthread_attr_destroy(&w_attr);
+cleanup_lock:
+    pthread_mutex_destroy(&priv->container.bad_lock);
+cleanup_cond:
+    pthread_cond_destroy(&priv->container.bad_cond);
+out:
+    return -1;
+}
+
+/**
+ * Translator entry point: allocate and populate the private state and, when
+ * the "bitrot" option is enabled, start the signer thread and the
+ * bad-object container.
+ *
+ * Returns 0 on success and -1 on failure. On failure everything set up
+ * here is torn down again, including a signer thread that was already
+ * started.
+ */
+int32_t
+init(xlator_t *this)
+{
+    int ret = 0;
+    char *tmp = NULL;
+    struct timeval tv = {
+        0,
+    };
+    br_stub_private_t *priv = NULL;
+
+    if (!this->children) {
+        gf_smsg(this->name, GF_LOG_ERROR, 0, BRS_MSG_NO_CHILD, NULL);
+        goto error_return;
+    }
+
+    priv = GF_CALLOC(1, sizeof(*priv), gf_br_stub_mt_private_t);
+    if (!priv)
+        goto error_return;
+
+    priv->local_pool = mem_pool_new(br_stub_local_t, 512);
+    if (!priv->local_pool)
+        goto free_priv;
+
+    GF_OPTION_INIT("bitrot", priv->do_versioning, bool, free_mempool);
+
+    GF_OPTION_INIT("export", tmp, str, free_mempool);
+
+    /* Reject export paths that would be truncated. */
+    if (snprintf(priv->export, PATH_MAX, "%s", tmp) >= PATH_MAX)
+        goto free_mempool;
+
+    if (snprintf(priv->stub_basepath, sizeof(priv->stub_basepath), "%s/%s",
+                 priv->export,
+                 BR_STUB_QUARANTINE_DIR) >= sizeof(priv->stub_basepath))
+        goto free_mempool;
+
+    (void)gettimeofday(&tv, NULL);
+
+    /* boot time is in network endian format */
+    priv->boot[0] = htonl(tv.tv_sec);
+    priv->boot[1] = htonl(tv.tv_usec);
+
+    pthread_mutex_init(&priv->lock, NULL);
+    pthread_cond_init(&priv->cond, NULL);
+    INIT_LIST_HEAD(&priv->squeue);
+
+    /* Thread creations need 'this' to be passed so that THIS can be
+     * assigned inside the thread. So setting this->private here.
+     */
+    this->private = priv;
+    if (!priv->do_versioning)
+        return 0;
+
+    ret = gf_thread_create(&priv->signth, NULL, br_stub_signth, this,
+                           "brssign");
+    if (ret != 0) {
+        gf_smsg(this->name, GF_LOG_WARNING, 0, BRS_MSG_SPAWN_SIGN_THRD_FAILED,
+                NULL);
+        goto cleanup_lock;
+    }
+
+    ret = br_stub_bad_object_container_init(this, priv);
+    if (ret) {
+        gf_smsg(this->name, GF_LOG_ERROR, 0, BRS_MSG_BAD_CONTAINER_FAIL, NULL);
+        goto cleanup_signth;
+    }
+
+    gf_msg_debug(this->name, 0, "bit-rot stub loaded");
+
+    return 0;
+
+cleanup_signth:
+    /* The signer thread is already running and uses priv->lock and
+     * priv->cond; it has to be stopped before those are destroyed and
+     * priv is freed below.
+     */
+    if (gf_thread_cleanup_xint(priv->signth)) {
+        gf_smsg(this->name, GF_LOG_ERROR, 0, BRS_MSG_CANCEL_SIGN_THREAD_FAILED,
+                NULL);
+    }
+    priv->signth = 0;
+cleanup_lock:
+    pthread_cond_destroy(&priv->cond);
+    pthread_mutex_destroy(&priv->lock);
+free_mempool:
+    mem_pool_destroy(priv->local_pool);
+    priv->local_pool = NULL;
+free_priv:
+    GF_FREE(priv);
+    this->private = NULL;
+error_return:
+    return -1;
+}
+
+/* TODO:
+ * As of now enabling bitrot option does 2 things.
+ * 1) Start the Bitrot Daemon which signs the objects (currently files only)
+ *    upon getting notified by the stub.
+ * 2) Enable versioning of the objects. Object versions (again files only) are
+ *    incremented upon modification.
+ * So object versioning is tied to bitrot daemon's signing. In future, object
+ * versioning might be necessary for other things as well apart from bit-rot
+ * detection (well that's the objective of bringing in object-versioning :)).
+ * In that case, better to make versioning a new option and letting it to be
+ * enabled despite bit-rot detection is not needed.
+ * Ex: ICAP.
+ */
+/**
+ * Apply a changed "bitrot" option.
+ *
+ *  - enabled and the signer thread is not running: start the signer thread
+ *    and the bad-object container;
+ *  - disabled: stop whichever of the two threads is running;
+ *  - enabled and already running: nothing to do. Reconfigure can be invoked
+ *    without the "bitrot" option changing, and the running threads must
+ *    survive that.
+ *
+ * Returns 0 on success. On failure both threads are stopped and -1 is
+ * returned.
+ */
+int32_t
+reconfigure(xlator_t *this, dict_t *options)
+{
+    int32_t ret = -1;
+    br_stub_private_t *priv = NULL;
+
+    priv = this->private;
+
+    GF_OPTION_RECONF("bitrot", priv->do_versioning, options, bool, err);
+    if (priv->do_versioning && !priv->signth) {
+        ret = gf_thread_create(&priv->signth, NULL, br_stub_signth, this,
+                               "brssign");
+        if (ret != 0) {
+            gf_smsg(this->name, GF_LOG_WARNING, 0,
+                    BRS_MSG_SPAWN_SIGN_THRD_FAILED, NULL);
+            goto err;
+        }
+
+        /* NOTE(review): on a disable -> enable cycle this re-initializes
+         * container.bad_lock/bad_cond, which the disable branch below
+         * never destroyed -- confirm this is acceptable.
+         */
+        ret = br_stub_bad_object_container_init(this, priv);
+        if (ret) {
+            gf_smsg(this->name, GF_LOG_ERROR, 0, BRS_MSG_BAD_CONTAINER_FAIL,
+                    NULL);
+            goto err;
+        }
+    } else if (!priv->do_versioning) {
+        if (priv->signth) {
+            if (gf_thread_cleanup_xint(priv->signth)) {
+                gf_smsg(this->name, GF_LOG_ERROR, 0,
+                        BRS_MSG_CANCEL_SIGN_THREAD_FAILED, NULL);
+            } else {
+                gf_smsg(this->name, GF_LOG_INFO, 0, BRS_MSG_KILL_SIGN_THREAD,
+                        NULL);
+                priv->signth = 0;
+            }
+        }
+
+        if (priv->container.thread) {
+            if (gf_thread_cleanup_xint(priv->container.thread)) {
+                gf_smsg(this->name, GF_LOG_ERROR, 0,
+                        BRS_MSG_CANCEL_SIGN_THREAD_FAILED, NULL);
+            }
+            priv->container.thread = 0;
+        }
+    }
+
+    ret = 0;
+    return ret;
+err:
+    if (priv->signth) {
+        if (gf_thread_cleanup_xint(priv->signth)) {
+            gf_smsg(this->name, GF_LOG_ERROR, 0,
+                    BRS_MSG_CANCEL_SIGN_THREAD_FAILED, NULL);
+        }
+        priv->signth = 0;
+    }
+
+    if (priv->container.thread) {
+        if (gf_thread_cleanup_xint(priv->container.thread)) {
+            gf_smsg(this->name, GF_LOG_ERROR, 0,
+                    BRS_MSG_CANCEL_SIGN_THREAD_FAILED, NULL);
+        }
+        priv->container.thread = 0;
+    }
+    ret = -1;
+    return ret;
+}
+
+/**
+ * Forward @event to default_notify() once the translator has private
+ * state; otherwise do nothing. Always returns 0.
+ */
+int
+notify(xlator_t *this, int event, void *data, ...)
+{
+    if (this && this->private)
+        default_notify(this, event, data);
+
+    return 0;
+}
+
+/**
+ * Translator teardown: stop the signer and bad-object worker threads,
+ * destroy the stubs still queued for them, and release the private state.
+ *
+ * If a thread cannot be cancelled, the function returns early and leaves
+ * the private state allocated, because that thread may still be using it.
+ *
+ * A thread handle of 0 means "not running": the reconfigure() error path
+ * leaves the handles at 0 while do_versioning is still set. Such handles
+ * are skipped rather than handed to gf_thread_cleanup_xint(), and the
+ * container queue and locks are only touched when the container thread
+ * was actually running.
+ */
+void
+fini(xlator_t *this)
+{
+    int32_t ret = 0;
+    br_stub_private_t *priv = this->private;
+    struct br_stub_signentry *sigstub = NULL;
+    call_stub_t *stub = NULL;
+
+    if (!priv)
+        return;
+
+    if (!priv->do_versioning)
+        goto cleanup;
+
+    if (priv->signth) {
+        ret = gf_thread_cleanup_xint(priv->signth);
+        if (ret) {
+            gf_smsg(this->name, GF_LOG_ERROR, 0,
+                    BRS_MSG_CANCEL_SIGN_THREAD_FAILED, NULL);
+            goto out;
+        }
+        priv->signth = 0;
+    }
+
+    /* squeue is always initialized by init(), so draining it is safe. */
+    while (!list_empty(&priv->squeue)) {
+        sigstub = list_first_entry(&priv->squeue, struct br_stub_signentry,
+                                   list);
+        list_del_init(&sigstub->list);
+
+        call_stub_destroy(sigstub->stub);
+        GF_FREE(sigstub);
+    }
+
+    if (priv->container.thread) {
+        ret = gf_thread_cleanup_xint(priv->container.thread);
+        if (ret) {
+            gf_smsg(this->name, GF_LOG_ERROR, 0,
+                    BRS_MSG_CANCEL_SIGN_THREAD_FAILED, NULL);
+            goto out;
+        }
+
+        priv->container.thread = 0;
+
+        while (!list_empty(&priv->container.bad_queue)) {
+            stub = list_first_entry(&priv->container.bad_queue, call_stub_t,
+                                    list);
+            list_del_init(&stub->list);
+            call_stub_destroy(stub);
+        }
+
+        pthread_mutex_destroy(&priv->container.bad_lock);
+        pthread_cond_destroy(&priv->container.bad_cond);
+    }
+
+cleanup:
+    pthread_mutex_destroy(&priv->lock);
+    pthread_cond_destroy(&priv->cond);
+
+    if (priv->local_pool) {
+        mem_pool_destroy(priv->local_pool);
+        priv->local_pool = NULL;
+    }
+
+    this->private = NULL;
+    GF_FREE(priv);
+
+out:
+    return;
+}
+
+/**
+ * Allocate, in one zeroed chunk, room for a br_version_t (when @obuf is
+ * given) and/or a br_signature_t followed by @signaturelen bytes (when
+ * @sbuf is given). When both are requested the signature area follows the
+ * version area directly.
+ *
+ * The chunk starts at *obuf when @obuf is given and at *sbuf otherwise;
+ * that start address is the one to release.
+ *
+ * Returns 0 on success, -1 when the allocation fails.
+ */
+static int
+br_stub_alloc_versions(br_version_t **obuf, br_signature_t **sbuf,
+                       size_t signaturelen)
+{
+    char *cursor = NULL;
+    size_t total = 0;
+
+    total = (obuf ? sizeof(br_version_t) : 0) +
+            (sbuf ? sizeof(br_signature_t) + signaturelen : 0);
+
+    cursor = GF_CALLOC(1, total, gf_br_stub_mt_version_t);
+    if (cursor == NULL)
+        return -1;
+
+    if (obuf) {
+        *obuf = (br_version_t *)cursor;
+        cursor += sizeof(br_version_t);
+    }
+    if (sbuf)
+        *sbuf = (br_signature_t *)cursor;
+
+    return 0;
+}
+
+/**
+ * Release a buffer obtained from br_stub_alloc_versions(). @mem must be
+ * the start of that allocation.
+ */
+static void
+br_stub_dealloc_versions(void *mem)
+{
+    GF_FREE(mem);
+}
+
+/**
+ * Fetch a br_stub_local_t from the translator's local pool via mem_get0().
+ */
+static br_stub_local_t *
+br_stub_alloc_local(xlator_t *this)
+{
+    return mem_get0(((br_stub_private_t *)this->private)->local_pool);
+}
+
+/**
+ * Hand @ptr back with mem_put(); a NULL @ptr is ignored.
+ */
+static void
+br_stub_dealloc_local(br_stub_local_t *ptr)
+{
+    if (ptr)
+        mem_put(ptr);
+}
+
+/**
+ * Fill @obuf with @oversion and the translator's boot time, then store it
+ * in @dict under BITROT_CURRENT_VERSION_KEY.
+ *
+ * Returns the result of dict_set_bin().
+ */
+static int
+br_stub_prepare_version_request(xlator_t *this, dict_t *dict,
+                                br_version_t *obuf, unsigned long oversion)
+{
+    br_stub_private_t *priv = this->private;
+    int rc = 0;
+
+    br_set_ongoingversion(obuf, oversion, priv->boot);
+
+    rc = dict_set_bin(dict, BITROT_CURRENT_VERSION_KEY, (void *)obuf,
+                      sizeof(br_version_t));
+    return rc;
+}
+
+/**
+ * Build the signature in @sbuf from @sign with br_set_signature() and
+ * store it in @dict under BITROT_SIGNING_VERSION_KEY, using the size that
+ * br_set_signature() reported.
+ *
+ * Returns the result of dict_set_bin().
+ */
+static int
+br_stub_prepare_signing_request(dict_t *dict, br_signature_t *sbuf,
+                                br_isignature_t *sign, size_t signaturelen)
+{
+    size_t len = 0;
+    int rc = 0;
+
+    br_set_signature(sbuf, sign, signaturelen, &len);
+
+    rc = dict_set_bin(dict, BITROT_SIGNING_VERSION_KEY, (void *)sbuf, len);
+    return rc;
+}
+
+/**
+ * Create the inode context for @inode and attach it.
+ *
+ * The new context starts at ongoing version @version, is marked dirty or
+ * synced according to @markdirty, and is flagged bad when @bad_object is
+ * set. When @fd is given it is added to the context's fd list. When
+ * @ctx_addr is given the context address is returned through it.
+ *
+ * Returns 0 on success, -1 on failure (the context is freed).
+ */
+static int
+br_stub_init_inode_versions(xlator_t *this, fd_t *fd, inode_t *inode,
+                            unsigned long version, gf_boolean_t markdirty,
+                            gf_boolean_t bad_object, uint64_t *ctx_addr)
+{
+    br_stub_inode_ctx_t *ictx = NULL;
+
+    ictx = GF_CALLOC(1, sizeof(br_stub_inode_ctx_t), gf_br_stub_mt_inode_ctx_t);
+    if (ictx == NULL)
+        return -1;
+
+    INIT_LIST_HEAD(&ictx->fd_list);
+
+    if (markdirty)
+        __br_stub_mark_inode_dirty(ictx);
+    else
+        __br_stub_mark_inode_synced(ictx);
+
+    __br_stub_set_ongoing_version(ictx, version);
+
+    if (bad_object)
+        __br_stub_mark_object_bad(ictx);
+
+    if (fd && br_stub_add_fd_to_inode(this, fd, ictx))
+        goto discard;
+
+    if (br_stub_set_inode_ctx(this, inode, ictx))
+        goto discard;
+
+    if (ctx_addr)
+        *ctx_addr = (uint64_t)(uintptr_t)ictx;
+
+    return 0;
+
+discard:
+    GF_FREE(ictx);
+    return -1;
+}
+
+/**
+ * Under inode->lock: if the inode is dirty, record @version as its ongoing
+ * version and mark it synced.
+ *
+ * Returns 0 when the inode context was found, -1 otherwise.
+ */
+static int
+br_stub_mod_inode_versions(xlator_t *this, fd_t *fd, inode_t *inode,
+                           unsigned long version)
+{
+    int32_t ret = -1;
+    br_stub_inode_ctx_t *ictx = NULL;
+
+    LOCK(&inode->lock);
+    {
+        ictx = __br_stub_get_ongoing_version_ctx(this, inode, NULL);
+        if (ictx != NULL) {
+            if (__br_stub_is_inode_dirty(ictx)) {
+                __br_stub_set_ongoing_version(ictx, version);
+                __br_stub_mark_inode_synced(ictx);
+            }
+            ret = 0;
+        }
+    }
+    UNLOCK(&inode->lock);
+
+    return ret;
+}
+
+/**
+ * Populate @local for a versioning request. References are taken on @fd
+ * and @inode when they are non-NULL; br_stub_cleanup_local() drops them.
+ */
+static void
+br_stub_fill_local(br_stub_local_t *local, call_stub_t *stub, fd_t *fd,
+                   inode_t *inode, uuid_t gfid, int versioningtype,
+                   unsigned long memversion)
+{
+    local->fopstub = stub;
+    local->versioningtype = versioningtype;
+    local->u.context.version = memversion;
+
+    if (fd != NULL)
+        local->u.context.fd = fd_ref(fd);
+
+    if (inode != NULL)
+        local->u.context.inode = inode_ref(inode);
+
+    gf_uuid_copy(local->u.context.gfid, gfid);
+}
+
+/**
+ * Reset @local: drop the fd and inode references it holds and clear the
+ * remaining fields. A NULL @local is ignored. The structure itself is not
+ * released here.
+ */
+static void
+br_stub_cleanup_local(br_stub_local_t *local)
+{
+    if (local == NULL)
+        return;
+
+    local->fopstub = NULL;
+    local->versioningtype = 0;
+    local->u.context.version = 0;
+
+    if (local->u.context.fd != NULL) {
+        fd_unref(local->u.context.fd);
+        local->u.context.fd = NULL;
+    }
+
+    if (local->u.context.inode != NULL) {
+        inode_unref(local->u.context.inode);
+        local->u.context.inode = NULL;
+    }
+
+    memset(local->u.context.gfid, '\0', sizeof(uuid_t));
+}
+
+/**
+ * Report, for the inode behind @fd, whether it is dirty (*versioning) and
+ * whether it is modified (*modified). The inode context is created on the
+ * fly when it does not exist yet, and is returned through @ctx when @ctx
+ * is non-NULL.
+ *
+ * Returns 0 on success, -1 when the inode context could not be set up.
+ */
+static int
+br_stub_need_versioning(xlator_t *this, fd_t *fd, gf_boolean_t *versioning,
+                        gf_boolean_t *modified, br_stub_inode_ctx_t **ctx)
+{
+    uint64_t ctx_addr = 0;
+    br_stub_inode_ctx_t *ictx = NULL;
+
+    *versioning = _gf_false;
+    *modified = _gf_false;
+
+    /* The inode context used to be set up only in the lookup, create and
+     * mknod callbacks, back when object versioning was always on. Now that
+     * versioning is optional, it can be switched on (together with bitrot)
+     * while I/O is in flight, so a fop such as writev may arrive without a
+     * preceding lookup and find no context. Initialize the context here in
+     * that case; the same is done everywhere else it applies.
+     */
+    if (br_stub_get_inode_ctx(this, fd->inode, &ctx_addr) < 0) {
+        if (br_stub_init_inode_versions(this, fd, fd->inode,
+                                        BITROT_DEFAULT_CURRENT_VERSION,
+                                        _gf_true, _gf_false, &ctx_addr)) {
+            gf_smsg(this->name, GF_LOG_ERROR, 0,
+                    BRS_MSG_GET_INODE_CONTEXT_FAILED, "gfid=%s",
+                    uuid_utoa(fd->inode->gfid), NULL);
+            return -1;
+        }
+    }
+
+    ictx = (br_stub_inode_ctx_t *)(long)ctx_addr;
+
+    LOCK(&fd->inode->lock);
+    {
+        if (__br_stub_is_inode_dirty(ictx))
+            *versioning = _gf_true;
+        if (__br_stub_is_inode_modified(ictx))
+            *modified = _gf_true;
+    }
+    UNLOCK(&fd->inode->lock);
+
+    if (ctx != NULL)
+        *ctx = ictx;
+
+    return 0;
+}
+
+/**
+ * Make sure @fd has a stub fd context: when br_stub_fd_ctx_get() finds
+ * none, add the fd to the inode context @ctx.
+ *
+ * Returns 0 on success, otherwise the non-zero result of
+ * br_stub_add_fd_to_inode().
+ */
+static int32_t
+br_stub_anon_fd_ctx(xlator_t *this, fd_t *fd, br_stub_inode_ctx_t *ctx)
+{
+    int32_t rc = 0;
+
+    if (br_stub_fd_ctx_get(this, fd) != NULL)
+        return 0;
+
+    rc = br_stub_add_fd_to_inode(this, fd, ctx);
+    if (rc) {
+        gf_smsg(this->name, GF_LOG_ERROR, 0, BRS_MSG_ADD_FD_TO_INODE,
+                "gfid=%s", uuid_utoa(fd->inode->gfid), NULL);
+        return rc;
+    }
+
+    return 0;
+}
+
+/**
+ * Prepare @frame for a versioning operation: allocate the local structure
+ * and, for an anonymous @fd, make sure it has a stub fd context. On
+ * success the local is installed as frame->local.
+ *
+ * Returns 0 on success, -1 on failure (nothing is left allocated).
+ */
+static int
+br_stub_versioning_prep(call_frame_t *frame, xlator_t *this, fd_t *fd,
+                        br_stub_inode_ctx_t *ctx)
+{
+    br_stub_local_t *local = br_stub_alloc_local(this);
+
+    if (local == NULL) {
+        gf_smsg(this->name, GF_LOG_ERROR, ENOMEM, BRS_MSG_NO_MEMORY, "gfid=%s",
+                uuid_utoa(fd->inode->gfid), NULL);
+        return -1;
+    }
+
+    if (fd_is_anonymous(fd) && br_stub_anon_fd_ctx(this, fd, ctx)) {
+        br_stub_dealloc_local(local);
+        return -1;
+    }
+
+    frame->local = local;
+
+    return 0;
+}
+
+/**
+ * Flag the inode behind local->u.context.fd as modified, creating its
+ * inode context first when it does not exist yet.
+ *
+ * Returns 0 on success, -1 when the inode context could not be set up.
+ */
+static int
+br_stub_mark_inode_modified(xlator_t *this, br_stub_local_t *local)
+{
+    fd_t *fd = local->u.context.fd;
+    uint64_t ctx_addr = 0;
+    br_stub_inode_ctx_t *ictx = NULL;
+
+    if (br_stub_get_inode_ctx(this, fd->inode, &ctx_addr) < 0) {
+        if (br_stub_init_inode_versions(this, fd, fd->inode,
+                                        BITROT_DEFAULT_CURRENT_VERSION,
+                                        _gf_true, _gf_false, &ctx_addr))
+            return -1;
+    }
+
+    ictx = (br_stub_inode_ctx_t *)(long)ctx_addr;
+
+    LOCK(&fd->inode->lock);
+    {
+        __br_stub_set_inode_modified(ictx);
+    }
+    UNLOCK(&fd->inode->lock);
+
+    return 0;
+}
+
+/**
+ * Check @inode with br_stub_is_bad_object(), whose results are:
+ *    0 => per the inode context the object is not bad
+ *   -1 => the inode context could not be fetched
+ *   -2 => per the inode context the object is bad
+ *
+ * For -2, *op_ret/*op_errno are set to -1/EIO and -2 is returned. For -1,
+ * the inode context is initialized here; the result of that attempt is
+ * returned, with *op_ret/*op_errno set to -1/EINVAL when it fails. A
+ * negative return means the calling fop has to fail.
+ */
+static int
+br_stub_check_bad_object(xlator_t *this, inode_t *inode, int32_t *op_ret,
+                         int32_t *op_errno)
+{
+    int status = br_stub_is_bad_object(this, inode);
+
+    if (status == -2) {
+        gf_smsg(this->name, GF_LOG_ERROR, 0, BRS_MSG_BAD_OBJECT_ACCESS,
+                "gfid=%s", uuid_utoa(inode->gfid), NULL);
+        *op_ret = -1;
+        *op_errno = EIO;
+    } else if (status == -1) {
+        status = br_stub_init_inode_versions(this, NULL, inode,
+                                             BITROT_DEFAULT_CURRENT_VERSION,
+                                             _gf_true, _gf_false, NULL);
+        if (status) {
+            gf_smsg(this->name, GF_LOG_ERROR, 0,
+                    BRS_MSG_GET_INODE_CONTEXT_FAILED, "gfid=%s",
+                    uuid_utoa(inode->gfid), NULL);
+            *op_ret = -1;
+            *op_errno = EINVAL;
+        }
+    }
+
+    return status;
+}
+
+/**
+ * Callback of the versioning fsetxattr. On success the in-memory inode
+ * version is updated and the stalled fop stub is resumed. On any failure
+ * the stub is unwound with an error and the frame's local is released.
+ */
+int
+br_stub_fd_incversioning_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+                             int op_ret, int op_errno, dict_t *xdata)
+{
+    br_stub_local_t *local = (br_stub_local_t *)frame->local;
+
+    if (op_ret >= 0) {
+        op_ret = br_stub_mod_inode_versions(this, local->u.context.fd,
+                                            local->u.context.inode,
+                                            local->u.context.version);
+        if (op_ret < 0)
+            op_errno = EINVAL;
+    }
+
+    if (op_ret >= 0) {
+        call_resume(local->fopstub);
+        return 0;
+    }
+
+    frame->local = NULL;
+    call_unwind_error(local->fopstub, -1, op_errno);
+    br_stub_cleanup_local(local);
+    br_stub_dealloc_local(local);
+
+    return 0;
+}
+
+/**
+ * Initial object versioning
+ *
+ * Versioning persists two (2) extended attributes as explained below:
+ *   1. Current (ongoing) version: This is incremented on a writev ()
+ *      or truncate () and is the running version for an object.
+ *   2. Signing version: This is the version against which an object
+ *      was signed (checksummed).
+ *
+ * During initial versioning, the ongoing and signing versions are
+ * set to one and zero respectively. A write() call increments the
+ * ongoing version as an indication of modification to the object.
+ * Additionally this needs to be persisted on disk and needs to be
+ * durable: fsync().. :-/
+ * As an optimization only the first write() synchronizes the ongoing
+ * version to disk, subsequent write()s before the *last* release()
+ * are no-op's.
+ *
+ * create(), just like lookup() initializes the object versions to
+ * the default. As an optimization this is not a durable operation:
+ * in case of a crash, hard reboot etc.. absence of versioning xattrs
+ * is ignored in scrubber along with the one time crawler explicitly
+ * triggering signing for such objects.
+ *
+ * c.f. br_stub_writev() / br_stub_truncate()
+ */
+
+/**
+ * Perform full or incremental versioning on the inode pointed to by @fd:
+ * fill frame->local and wind an internal fsetxattr carrying @dict to the
+ * first child, with @callback as its callback. Incremental versioning is
+ * done when an inode is dirty and a writeback is triggered.
+ *
+ * The xdata sent along marks the call as an internal fop and, when
+ * @durable is set, also carries GLUSTERFS_DURABLE_OP.
+ *
+ * Returns 0 once the fop has been wound, non-zero when the xdata could
+ * not be prepared (nothing is wound in that case).
+ */
+
+int
+br_stub_fd_versioning(xlator_t *this, call_frame_t *frame, call_stub_t *stub,
+                      dict_t *dict, fd_t *fd, br_stub_version_cbk *callback,
+                      unsigned long memversion, int versioningtype, int durable)
+{
+    int32_t ret = -1;
+    dict_t *xdata = dict_new();
+
+    if (xdata == NULL)
+        return ret;
+
+    ret = dict_set_int32(xdata, GLUSTERFS_INTERNAL_FOP_KEY, 1);
+    if (!ret && durable)
+        ret = dict_set_int32(xdata, GLUSTERFS_DURABLE_OP, 0);
+
+    if (!ret) {
+        br_stub_fill_local(frame->local, stub, fd, fd->inode, fd->inode->gfid,
+                           versioningtype, memversion);
+
+        STACK_WIND(frame, callback, FIRST_CHILD(this),
+                   FIRST_CHILD(this)->fops->fsetxattr, fd, dict, 0, xdata);
+    }
+
+    dict_unref(xdata);
+
+    return ret;
+}
+
+/**
+ * Persist the writeback version of the inode behind @fd: build the
+ * version xattr request and send it through br_stub_fd_versioning() with
+ * br_stub_fd_incversioning_cbk() as the callback.
+ *
+ * On failure @stub is unwound with ENOMEM and the frame's local is
+ * released. Returns 0 on success, non-zero on failure.
+ */
+static int
+br_stub_perform_incversioning(xlator_t *this, call_frame_t *frame,
+                              call_stub_t *stub, fd_t *fd,
+                              br_stub_inode_ctx_t *ctx)
+{
+    int32_t ret = -1;
+    int op_errno = ENOMEM;
+    dict_t *dict = NULL;
+    br_version_t *obuf = NULL;
+    br_stub_local_t *local = frame->local;
+    unsigned long writeback_version = __br_stub_writeback_version(ctx);
+
+    dict = dict_new();
+    if (!dict)
+        goto unwind;
+
+    ret = br_stub_alloc_versions(&obuf, NULL, 0);
+    if (ret) {
+        gf_smsg(this->name, GF_LOG_ERROR, 0, BRS_MSG_ALLOC_MEM_FAILED,
+                "gfid=%s", uuid_utoa(fd->inode->gfid), NULL);
+        goto unwind;
+    }
+
+    ret = br_stub_prepare_version_request(this, dict, obuf, writeback_version);
+    if (ret) {
+        gf_smsg(this->name, GF_LOG_ERROR, 0, BRS_MSG_VERSION_PREPARE_FAIL,
+                "gfid=%s", uuid_utoa(fd->inode->gfid), NULL);
+        /* obuf never made it into the dict, so it is released here. */
+        br_stub_dealloc_versions(obuf);
+        goto unwind;
+    }
+
+    ret = br_stub_fd_versioning(
+        this, frame, stub, dict, fd, br_stub_fd_incversioning_cbk,
+        writeback_version, BR_STUB_INCREMENTAL_VERSIONING, !WRITEBACK_DURABLE);
+
+unwind:
+    if (dict)
+        dict_unref(dict);
+
+    if (ret) {
+        if (local)
+            frame->local = NULL;
+
+        call_unwind_error(stub, -1, op_errno);
+
+        if (local) {
+            br_stub_cleanup_local(local);
+            br_stub_dealloc_local(local);
+        }
+    }
+
+    return ret;
+}
+
+/** {{{ */
+
+/* fsetxattr() */
+
+/**
+ * Wind the signing fsetxattr to the first child with the default callback,
+ * then drop one reference on @xdata.
+ * NOTE(review): presumably that reference was taken by the caller that
+ * queued this request -- confirm against the caller.
+ */
+int32_t
+br_stub_perform_objsign(call_frame_t *frame, xlator_t *this, fd_t *fd,
+                        dict_t *dict, int flags, dict_t *xdata)
+{
+    STACK_WIND(frame, default_fsetxattr_cbk, FIRST_CHILD(this),
+               FIRST_CHILD(this)->fops->fsetxattr, fd, dict, flags, xdata);
+
+    dict_unref(xdata);
+
+    return 0;
+}
+
+/**
+ * Thread entry point that drains the signing queue.
+ *
+ * @arg is the xlator_t of this translator. Each iteration waits on
+ * priv->cond until priv->squeue is non-empty, detaches the first entry while
+ * holding priv->lock, then resumes the entry's call stub outside the lock and
+ * frees the entry. The loop has no exit condition of its own, so the trailing
+ * return is not reached in normal flow.
+ */
+void *
+br_stub_signth(void *arg)
+{
+    xlator_t *this = arg;
+    br_stub_private_t *priv = this->private;
+    struct br_stub_signentry *sigstub = NULL;
+
+    THIS = this;
+    while (1) {
+        /*
+         * Disabling bit-rot feature leads to this particular thread
+         * getting cleaned up by reconfigure via a call to the function
+         * gf_thread_cleanup_xint (which in turn calls pthread_cancel
+         * and pthread_join). But, if this thread had held the mutex
+         * &priv->lock at the time of cancellation, then it leads to
+         * deadlock in future when bit-rot feature is enabled (which
+         * again spawns this thread which cant hold the lock as the
+         * mutex is still held by the previous instance of the thread
+         * which got killed). Also, the br_stub_handle_object_signature
+         * function which is called whenever file has to be signed
+         * also gets blocked as it too attempts to acquire &priv->lock.
+         *
+         * So, arrange for the lock to be unlocked as part of the
+         * cleanup of this thread using pthread_cleanup_push and
+         * pthread_cleanup_pop.
+         */
+        pthread_cleanup_push(br_stub_lock_cleaner, &priv->lock);
+        pthread_mutex_lock(&priv->lock);
+        {
+            /* loop re-checks the queue after every wakeup */
+            while (list_empty(&priv->squeue))
+                pthread_cond_wait(&priv->cond, &priv->lock);
+
+            sigstub = list_first_entry(&priv->squeue, struct br_stub_signentry,
+                                       list);
+            list_del_init(&sigstub->list);
+        }
+        pthread_mutex_unlock(&priv->lock);
+        /* 0: the mutex was already unlocked above, do not run the handler */
+        pthread_cleanup_pop(0);
+
+        /* the stub is resumed without priv->lock held */
+        call_resume(sigstub->stub);
+
+        GF_FREE(sigstub);
+    }
+
+    return NULL;
+}
+
+/**
+ * Report whether @dict contains at least one of the keys bit-rot reserves
+ * for its own use (signature set/get, reopen hint, bad-object marker,
+ * signing version, current version).
+ *
+ * Returns _gf_true on the first reserved key found, _gf_false if none is
+ * present.
+ */
+static gf_boolean_t
+br_stub_internal_xattr(dict_t *dict)
+{
+    if (dict_get(dict, GLUSTERFS_SET_OBJECT_SIGNATURE))
+        return _gf_true;
+
+    if (dict_get(dict, GLUSTERFS_GET_OBJECT_SIGNATURE))
+        return _gf_true;
+
+    if (dict_get(dict, BR_REOPEN_SIGN_HINT_KEY))
+        return _gf_true;
+
+    if (dict_get(dict, BITROT_OBJECT_BAD_KEY))
+        return _gf_true;
+
+    if (dict_get(dict, BITROT_SIGNING_VERSION_KEY))
+        return _gf_true;
+
+    if (dict_get(dict, BITROT_CURRENT_VERSION_KEY))
+        return _gf_true;
+
+    return _gf_false;
+}
+
+/**
+ * Ordering callback for the signing queue: compares the `v` field of the
+ * two br_stub_signentry objects that embed @elem1 and @elem2.
+ *
+ * Returns 1 when the first entry's v is greater than the second's,
+ * 0 otherwise.
+ */
+int
+orderq(struct list_head *elem1, struct list_head *elem2)
+{
+    struct br_stub_signentry *first = list_entry(
+        elem1, struct br_stub_signentry, list);
+    struct br_stub_signentry *second = list_entry(
+        elem2, struct br_stub_signentry, list);
+
+    if (first->v > second->v)
+        return 1;
+
+    return 0;
+}
+
+/**
+ * Compare the version carried by a signing request against the inode's
+ * in-memory current version.
+ *
+ * @this:        bit-rot-stub xlator
+ * @inode:       inode being signed
+ * @sbuf:        signature buffer whose signedversion is checked
+ * @dict:        request dict; BITROT_SIGNING_VERSION_KEY is removed from it
+ *               when the inode context cannot be fetched
+ * @fakesuccess: set to 1 when the request signs a version older than the
+ *               current one (left untouched otherwise)
+ *
+ * Returns 0 when the signed version is not newer than the current version.
+ * Returns non-zero when argument validation fails, the inode context is
+ * unavailable, or the signed version is newer than the current version.
+ */
+static int
+br_stub_compare_sign_version(xlator_t *this, inode_t *inode,
+                             br_signature_t *sbuf, dict_t *dict,
+                             int *fakesuccess)
+{
+    int32_t ret = -1;
+    uint64_t tmp_ctx = 0;
+    gf_boolean_t invalid = _gf_false;
+    br_stub_inode_ctx_t *ctx = NULL;
+
+    GF_VALIDATE_OR_GOTO("bit-rot-stub", this, out);
+    GF_VALIDATE_OR_GOTO(this->name, inode, out);
+    GF_VALIDATE_OR_GOTO(this->name, sbuf, out);
+    GF_VALIDATE_OR_GOTO(this->name, dict, out);
+
+    ret = br_stub_get_inode_ctx(this, inode, &tmp_ctx);
+    if (ret) {
+        dict_del(dict, BITROT_SIGNING_VERSION_KEY);
+        goto out;
+    }
+
+    ctx = (br_stub_inode_ctx_t *)(long)tmp_ctx;
+
+    LOCK(&inode->lock);
+    {
+        if (ctx->currentversion < sbuf->signedversion) {
+            /* signing a version the stub has not reached yet */
+            invalid = _gf_true;
+        } else if (ctx->currentversion > sbuf->signedversion) {
+            /*
+             * Arguments follow the order of the format text: the signing
+             * version first, then the current version.
+             */
+            gf_msg_debug(this->name, 0,
+                         "\"Signing version\" "
+                         "(%lu) lower than \"Current version \" "
+                         "(%lu)",
+                         sbuf->signedversion, ctx->currentversion);
+            *fakesuccess = 1;
+        }
+    }
+    UNLOCK(&inode->lock);
+
+    if (invalid) {
+        ret = -1;
+        gf_smsg(this->name, GF_LOG_WARNING, 0, BRS_MSG_SIGN_VERSION_ERROR,
+                "Signing-ver=%lu", sbuf->signedversion, "current-ver=%lu",
+                ctx->currentversion, NULL);
+    }
+
+out:
+    return ret;
+}
+
+/**
+ * Validate the signature type in @sign, allocate a signature buffer of
+ * sign->signaturelen bytes, fill the signing request into @dict and finally
+ * compare the request's version with the inode's current version.
+ *
+ * Returns -1 when the signature type is invalid, allocation fails or the
+ * signing request cannot be prepared; otherwise returns the result of
+ * br_stub_compare_sign_version() (which may also set @fakesuccess).
+ */
+static int
+br_stub_prepare_signature(xlator_t *this, dict_t *dict, inode_t *inode,
+                          br_isignature_t *sign, int *fakesuccess)
+{
+    size_t siglen = 0;
+    br_signature_t *sigbuf = NULL;
+
+    if (!br_is_signature_type_valid(sign->signaturetype))
+        return -1;
+
+    siglen = sign->signaturelen;
+
+    if (br_stub_alloc_versions(NULL, &sigbuf, siglen)) {
+        gf_smsg(this->name, GF_LOG_ERROR, 0, BRS_MSG_ALLOC_MEM_FAILED,
+                "gfid=%s", uuid_utoa(inode->gfid), NULL);
+        return -1;
+    }
+
+    if (br_stub_prepare_signing_request(dict, sigbuf, sign, siglen)) {
+        gf_smsg(this->name, GF_LOG_ERROR, 0, BRS_MSG_SIGN_PREPARE_FAIL,
+                "gfid=%s", uuid_utoa(inode->gfid), NULL);
+        br_stub_dealloc_versions(sigbuf);
+        return -1;
+    }
+
+    /* From here on the buffer has been added to dict, so its memory is
+     * released when the dict's data is destroyed.
+     */
+    return br_stub_compare_sign_version(this, inode, sigbuf, dict,
+                                        fakesuccess);
+}
+
+/**
+ * Handle an object signing request (GLUSTERFS_SET_OBJECT_SIGNATURE).
+ *
+ * Requests from any pid other than GF_CLIENT_PID_BITD are rejected with
+ * EINVAL. For BitD, the signature is prepared into @dict; a request for a
+ * version older than the current one is answered with success without
+ * touching the disk. Otherwise an fsetxattr stub (resumed through
+ * br_stub_perform_objsign) is inserted into priv->squeue ordered by signed
+ * version and the signer thread is signalled.
+ *
+ * @sign points into the value that @dict stores under
+ * GLUSTERFS_SET_OBJECT_SIGNATURE (the caller obtains it with dict_get_bin),
+ * so it must not be dereferenced once that key has been deleted.
+ *
+ * The frame is either unwound here (error / fake success) or handed over to
+ * the queued stub.
+ */
+static void
+br_stub_handle_object_signature(call_frame_t *frame, xlator_t *this, fd_t *fd,
+                                dict_t *dict, br_isignature_t *sign,
+                                dict_t *xdata)
+{
+    int32_t ret = -1;
+    int32_t op_ret = -1;
+    int32_t op_errno = EINVAL;
+    int fakesuccess = 0;
+    uint32_t signedversion = 0;
+    br_stub_private_t *priv = NULL;
+    struct br_stub_signentry *sigstub = NULL;
+
+    priv = this->private;
+
+    if (frame->root->pid != GF_CLIENT_PID_BITD) {
+        gf_smsg(this->name, GF_LOG_WARNING, op_errno, BRS_MSG_NON_BITD_PID,
+                "PID=%d", frame->root->pid, NULL);
+        goto dofop;
+    }
+
+    ret = br_stub_prepare_signature(this, dict, fd->inode, sign, &fakesuccess);
+    if (ret) {
+        gf_smsg(this->name, GF_LOG_WARNING, 0, BRS_MSG_SIGN_PREPARE_FAIL,
+                "gfid=%s", uuid_utoa(fd->inode->gfid), NULL);
+        goto dofop;
+    }
+    if (fakesuccess) {
+        op_ret = op_errno = 0;
+        goto dofop;
+    }
+
+    /*
+     * Read the ordering key out of @sign before the key backing it is
+     * removed from the dict: dict_del() may release the value buffer that
+     * @sign points into.
+     */
+    signedversion = ntohl(sign->signedversion);
+
+    dict_del(dict, GLUSTERFS_SET_OBJECT_SIGNATURE);
+
+    ret = -1;
+    if (!xdata) {
+        xdata = dict_new();
+        if (!xdata)
+            goto dofop;
+    } else {
+        /* balanced by dict_unref() in br_stub_perform_objsign or below */
+        dict_ref(xdata);
+    }
+
+    ret = dict_set_int32(xdata, GLUSTERFS_DURABLE_OP, 0);
+    if (ret)
+        goto unref_dict;
+
+    /* prepare dispatch stub to order object signing */
+    sigstub = GF_CALLOC(1, sizeof(*sigstub), gf_br_stub_mt_sigstub_t);
+    if (!sigstub)
+        goto unref_dict;
+
+    INIT_LIST_HEAD(&sigstub->list);
+    sigstub->v = signedversion;
+    sigstub->stub = fop_fsetxattr_stub(frame, br_stub_perform_objsign, fd, dict,
+                                       0, xdata);
+    if (!sigstub->stub)
+        goto cleanup_stub;
+
+    pthread_mutex_lock(&priv->lock);
+    {
+        list_add_order(&sigstub->list, &priv->squeue, orderq);
+        pthread_cond_signal(&priv->cond);
+    }
+    pthread_mutex_unlock(&priv->lock);
+
+    /* the frame now belongs to the queued stub */
+    return;
+
+cleanup_stub:
+    GF_FREE(sigstub);
+unref_dict:
+    dict_unref(xdata);
+dofop:
+    STACK_UNWIND_STRICT(fsetxattr, frame, op_ret, op_errno, NULL);
+}
+
+/**
+ * Callback-stub resume function for the reopen fsetxattr path: marks the
+ * inode as modified, unwinds the fsetxattr and releases the frame's local.
+ *
+ * If marking the inode fails, the unwind reports (-1, EINVAL) instead of the
+ * incoming @op_ret / @op_errno. Always returns 0.
+ */
+int32_t
+br_stub_fsetxattr_resume(call_frame_t *frame, void *cookie, xlator_t *this,
+                         int32_t op_ret, int32_t op_errno, dict_t *xdata)
+{
+    br_stub_local_t *saved = frame->local;
+
+    /* detach the local so it is released here, after the unwind */
+    frame->local = NULL;
+
+    if (br_stub_mark_inode_modified(this, saved) != 0) {
+        op_ret = -1;
+        op_errno = EINVAL;
+    }
+
+    STACK_UNWIND_STRICT(fsetxattr, frame, op_ret, op_errno, xdata);
+
+    br_stub_cleanup_local(saved);
+    br_stub_dealloc_local(saved);
+
+    return 0;
+}
+
+/**
+ * Handles object reopens. Object reopens can be of 3 types. 2 are from
+ * oneshot crawler and 1 from the regular signer.
+ * ONESHOT CRAWLER:
+ * For those objects which were created before bitrot was enabled. oneshot
+ * crawler crawls the namespace and signs all the objects. It has to do
+ * the versioning before making bit-rot-stub send a sign notification.
+ * So it sends fsetxattr with BR_OBJECT_REOPEN as the value. And bit-rot-stub
+ * upon getting BR_OBJECT_REOPEN value checks if the version has to be
+ * increased or not. By default the version will be increased. But if the
+ * object is modified before BR_OBJECT_REOPEN from oneshot crawler, then
+ * versioning need not be done. In that case simply a success is returned.
+ * SIGNER:
+ * Signer waits for 2 minutes upon getting the notification from bit-rot-stub
+ * and then it sends a dummy write (in reality a fsetxattr) call, to change
+ * the state of the inode from REOPEN_WAIT to SIGN_QUICK. The funny part here
+ * is though the inode's state is REOPEN_WAIT, the call sent by signer is
+ * BR_OBJECT_RESIGN. Once the state is changed to SIGN_QUICK, then yet another
+ * notification is sent upon release (RESIGN would have happened via fsetxattr,
+ * so a fd is needed) and the object is signed truly this time.
+ * There is a challenge in the above RESIGN method by signer. After sending
+ * the 1st notification, the inode could be forgotten before RESIGN request
+ * is received. In that case, the inode's context (the newly looked up inode)
+ * would not indicate the inode as being modified (it would be in the default
+ * state) and because of this, a SIGN_QUICK notification to truly sign the
+ * object would not be sent. So, this is how its handled.
+ * if (request == RESIGN) {
+ *    if (inode->sign_info == NORMAL) {
+ *        mark_inode_non_dirty;
+ *        mark_inode_modified;
+ *    }
+ *    GOBACK (means unwind without doing versioning)
+ * }
+ */
+static void
+br_stub_handle_object_reopen(call_frame_t *frame, xlator_t *this, fd_t *fd,
+                             uint32_t val)
+{
+    int32_t ret = -1;
+    int32_t op_ret = -1;
+    int32_t op_errno = EINVAL;
+    call_stub_t *stub = NULL;
+    gf_boolean_t inc_version = _gf_false;
+    gf_boolean_t modified = _gf_false;
+    br_stub_inode_ctx_t *ctx = NULL;
+    br_stub_local_t *local = NULL;
+    /* default: unwind with success without doing any versioning */
+    gf_boolean_t goback = _gf_true;
+
+    ret = br_stub_need_versioning(this, fd, &inc_version, &modified, &ctx);
+    if (ret)
+        goto unwind;
+
+    LOCK(&fd->inode->lock);
+    {
+        /* only a REOPEN that still needs a version bump proceeds further */
+        if ((val == BR_OBJECT_REOPEN) && inc_version)
+            goback = _gf_false;
+        /* RESIGN on an inode in the NORMAL sign state: mark it synced and
+         * modified (the forgotten-inode case described above the function) */
+        if (val == BR_OBJECT_RESIGN && ctx->info_sign == BR_SIGN_NORMAL) {
+            __br_stub_mark_inode_synced(ctx);
+            __br_stub_set_inode_modified(ctx);
+        }
+        /* return value deliberately ignored */
+        (void)__br_stub_inode_sign_state(ctx, GF_FOP_FSETXATTR, fd);
+    }
+    UNLOCK(&fd->inode->lock);
+
+    if (goback) {
+        op_ret = op_errno = 0;
+        goto unwind;
+    }
+
+    ret = br_stub_versioning_prep(frame, this, fd, ctx);
+    if (ret)
+        goto unwind;
+    /* read back frame->local after the prep call (presumably set by
+     * br_stub_versioning_prep - verify) */
+    local = frame->local;
+
+    stub = fop_fsetxattr_cbk_stub(frame, br_stub_fsetxattr_resume, 0, 0, NULL);
+    if (!stub) {
+        gf_smsg(this->name, GF_LOG_ERROR, 0, BRS_MSG_STUB_ALLOC_FAILED,
+                "fsetxattr gfid=%s", uuid_utoa(fd->inode->gfid), NULL);
+        goto cleanup_local;
+    }
+
+    /* on failure br_stub_perform_incversioning unwinds the stub and releases
+     * the local itself, so its return value needs no handling here */
+    (void)br_stub_perform_incversioning(this, frame, stub, fd, ctx);
+    return;
+
+cleanup_local:
+    br_stub_cleanup_local(local);
+    br_stub_dealloc_local(local);
+
+unwind:
+    /* make sure the unwound frame no longer references a local */
+    frame->local = NULL;
+    STACK_UNWIND_STRICT(fsetxattr, frame, op_ret, op_errno, NULL);
+}
+
+/**
+ * This function only handles bad file identification. Instead of checking in
+ * fops like open, readv, writev whether the object is bad or not by doing
+ * getxattr calls, better to catch them when scrubber marks it as bad.
+ * So this callback is called only when the fsetxattr is sent by the scrubber
+ * to mark the object as bad.
+ */
+int
+br_stub_fsetxattr_bad_object_cbk(call_frame_t *frame, void *cookie,
+                                 xlator_t *this, int32_t op_ret,
+                                 int32_t op_errno, dict_t *xdata)
+{
+    br_stub_local_t *local = NULL;
+    int32_t ret = -1;
+
+    /* detach the local; it is released after the unwind below */
+    local = frame->local;
+    frame->local = NULL;
+
+    /* on-disk fsetxattr failed: pass the error through unchanged */
+    if (op_ret < 0)
+        goto unwind;
+
+    /*
+     * What to do if marking the object as bad fails? (i.e. in memory
+     * marking within the inode context. If we are here means fsetxattr
+     * fop has succeeded on disk and the bad object xattr has been set).
+     * We can return failure to scrubber, but there is nothing the scrubber
+     * can do with it (it might assume that the on disk setxattr itself has
+     * failed). The main purpose of this operation is to help identify the
+     * bad object by checking the inode context itself (thus avoiding the
+     * necessity of doing a getxattr fop on the disk).
+     *
+     * So as of now, success itself is being returned even though inode
+     * context set operation fails.
+     * In future if there is any change in the policy which can handle this,
+     * then appropriate response should be sent (i.e. success or error).
+     */
+    ret = br_stub_mark_object_bad(this, local->u.context.inode);
+    if (ret)
+        gf_smsg(this->name, GF_LOG_ERROR, 0, BRS_MSG_BAD_OBJ_MARK_FAIL,
+                "gfid=%s", uuid_utoa(local->u.context.inode->gfid), NULL);
+
+    /* NOTE(review): the result is stored in ret but never examined;
+     * presumably br_stub_add records the gfid in the stub's bad-object
+     * bookkeeping - confirm and decide whether a failure should be logged */
+    ret = br_stub_add(this, local->u.context.inode->gfid);
+
+unwind:
+    STACK_UNWIND_STRICT(fsetxattr, frame, op_ret, op_errno, xdata);
+    br_stub_cleanup_local(local);
+    br_stub_dealloc_local(local);
+    return 0;
+}
+
+/**
+ * Handle an fsetxattr carrying BITROT_OBJECT_BAD_KEY.
+ *
+ * Only GF_CLIENT_PID_SCRUB may mark an object bad; other pids are unwound
+ * with EINVAL. For the scrubber, a local describing the fd/inode is attached
+ * to the frame and the fsetxattr is wound to the first child with
+ * br_stub_fsetxattr_bad_object_cbk as the callback. Failure to allocate the
+ * local unwinds with ENOMEM. Always returns 0.
+ */
+static int32_t
+br_stub_handle_bad_object_key(call_frame_t *frame, xlator_t *this, fd_t *fd,
+                              dict_t *dict, int flags, dict_t *xdata)
+{
+    int32_t op_ret = -1;
+    int32_t op_errno = EINVAL;
+    br_stub_local_t *newlocal = NULL;
+
+    if (frame->root->pid != GF_CLIENT_PID_SCRUB) {
+        gf_smsg(this->name, GF_LOG_ERROR, 0, BRS_MSG_NON_SCRUB_BAD_OBJ_MARK,
+                "gfid=%s", uuid_utoa(fd->inode->gfid), NULL);
+    } else if ((newlocal = br_stub_alloc_local(this)) == NULL) {
+        gf_smsg(this->name, GF_LOG_ERROR, 0, BRS_MSG_ALLOC_MEM_FAILED,
+                "fsetxattr gfid=%s", uuid_utoa(fd->inode->gfid), NULL);
+        op_errno = ENOMEM;
+    } else {
+        br_stub_fill_local(newlocal, NULL, fd, fd->inode, fd->inode->gfid,
+                           BR_STUB_NO_VERSIONING, 0);
+        frame->local = newlocal;
+
+        STACK_WIND(frame, br_stub_fsetxattr_bad_object_cbk, FIRST_CHILD(this),
+                   FIRST_CHILD(this)->fops->fsetxattr, fd, dict, flags, xdata);
+        return 0;
+    }
+
+    /* both rejection paths end up here */
+    STACK_UNWIND_STRICT(fsetxattr, frame, op_ret, op_errno, NULL);
+    return 0;
+}
+
+/**
+ * As of now, versioning is done by the stub (though as a setxattr
+ * operation) as part of inode modification operations such as writev,
+ * truncate, ftruncate. And signing is done by BitD by a fsetxattr call.
+ * So any kind of setxattr coming on the versioning and the signing xattr is
+ * not allowed (i.e. BITROT_CURRENT_VERSION_KEY and BITROT_SIGNING_VERSION_KEY).
+ * In future if BitD/scrubber are allowed to change the versioning
+ * xattrs (though I cannot see a reason for it as of now), then the below
+ * function can be modified to block setxattr on version for only applications.
+ *
+ * NOTE: BitD sends sign request on GLUSTERFS_SET_OBJECT_SIGNATURE key.
+ *       BITROT_SIGNING_VERSION_KEY is the xattr used to save the signature.
+ *
+ */
+/**
+ * Reject an fsetxattr on the bit-rot internal xattr @key: the attempt is
+ * logged together with the inode's gfid and the fop is unwound with
+ * (-1, EINVAL). Always returns 0.
+ */
+static int32_t
+br_stub_handle_internal_xattr(call_frame_t *frame, xlator_t *this, fd_t *fd,
+                              char *key)
+{
+    gf_smsg(this->name, GF_LOG_ERROR, 0, BRS_MSG_SET_INTERNAL_XATTR,
+            "setxattr key=%s", key, "inode-gfid=%s", uuid_utoa(fd->inode->gfid),
+            NULL);
+
+    STACK_UNWIND_STRICT(fsetxattr, frame, -1, EINVAL, NULL);
+
+    return 0;
+}
+
+/**
+ * Log the contents of @dict as "(key:value)" pairs at error level.
+ *
+ * A scratch buffer of BR_STUB_DUMP_STR_SIZE bytes is allocated for the dump;
+ * if that allocation fails, *@op_errno is set to ENOMEM and nothing is
+ * logged. *@op_errno is left untouched otherwise.
+ */
+static void
+br_stub_dump_xattr(xlator_t *this, dict_t *dict, int *op_errno)
+{
+    char *pairfmt = "(%s:%s)";
+    char *text = GF_CALLOC(1, BR_STUB_DUMP_STR_SIZE, gf_br_stub_mt_misc);
+
+    if (!text) {
+        *op_errno = ENOMEM;
+        return;
+    }
+
+    dict_dump_to_str(dict, text, BR_STUB_DUMP_STR_SIZE, pairfmt);
+    gf_smsg(this->name, GF_LOG_ERROR, 0, BRS_MSG_SET_INTERNAL_XATTR,
+            "fsetxattr dump=%s", text, NULL);
+
+    GF_FREE(text);
+}
+
+/**
+ * fsetxattr entry point of the stub.
+ *
+ * Requests are dispatched in this order:
+ *  1. any bit-rot internal key coming from a pid other than BitD or the
+ *     scrubber is dumped to the log and unwound with an error;
+ *  2. if versioning is off, or the inode is not a regular file, the fop is
+ *     wound to the child untouched;
+ *  3. GLUSTERFS_SET_OBJECT_SIGNATURE -> br_stub_handle_object_signature;
+ *  4. direct writes to the signing / version xattrs, or a set on
+ *     GLUSTERFS_GET_OBJECT_SIGNATURE -> rejected by
+ *     br_stub_handle_internal_xattr;
+ *  5. BR_REOPEN_SIGN_HINT_KEY -> br_stub_handle_object_reopen;
+ *  6. BITROT_OBJECT_BAD_KEY -> br_stub_handle_bad_object_key;
+ *  7. anything else is wound to the child.
+ *
+ * Every handler unwinds or winds the frame itself. Always returns 0.
+ */
+int
+br_stub_fsetxattr(call_frame_t *frame, xlator_t *this, fd_t *fd, dict_t *dict,
+                  int flags, dict_t *xdata)
+{
+    int32_t ret = 0;
+    uint32_t val = 0;
+    br_isignature_t *sign = NULL;
+    br_stub_private_t *priv = NULL;
+    int32_t op_ret = -1;
+    int32_t op_errno = EINVAL;
+
+    priv = this->private;
+
+    /* internal keys are accepted only from BitD and the scrubber */
+    if ((frame->root->pid != GF_CLIENT_PID_BITD &&
+         frame->root->pid != GF_CLIENT_PID_SCRUB) &&
+        br_stub_internal_xattr(dict)) {
+        /* may change op_errno to ENOMEM if the dump buffer cannot be had */
+        br_stub_dump_xattr(this, dict, &op_errno);
+        goto unwind;
+    }
+
+    if (!priv->do_versioning)
+        goto wind;
+
+    if (!IA_ISREG(fd->inode->ia_type))
+        goto wind;
+
+    /* object signature request */
+    ret = dict_get_bin(dict, GLUSTERFS_SET_OBJECT_SIGNATURE, (void **)&sign);
+    if (!ret) {
+        gf_msg_debug(this->name, 0, "got SIGNATURE request on %s",
+                     uuid_utoa(fd->inode->gfid));
+        /* sign points into the dict's value for this key */
+        br_stub_handle_object_signature(frame, this, fd, dict, sign, xdata);
+        goto done;
+    }
+
+    /* signing xattr */
+    if (dict_get(dict, BITROT_SIGNING_VERSION_KEY)) {
+        br_stub_handle_internal_xattr(frame, this, fd,
+                                      BITROT_SIGNING_VERSION_KEY);
+        goto done;
+    }
+
+    /* version xattr */
+    if (dict_get(dict, BITROT_CURRENT_VERSION_KEY)) {
+        br_stub_handle_internal_xattr(frame, this, fd,
+                                      BITROT_CURRENT_VERSION_KEY);
+        goto done;
+    }
+
+    /* the "get signature" key is not something that can be set */
+    if (dict_get(dict, GLUSTERFS_GET_OBJECT_SIGNATURE)) {
+        br_stub_handle_internal_xattr(frame, this, fd,
+                                      GLUSTERFS_GET_OBJECT_SIGNATURE);
+        goto done;
+    }
+
+    /* object reopen request */
+    ret = dict_get_uint32(dict, BR_REOPEN_SIGN_HINT_KEY, &val);
+    if (!ret) {
+        br_stub_handle_object_reopen(frame, this, fd, val);
+        goto done;
+    }
+
+    /* handle bad object */
+    if (dict_get(dict, BITROT_OBJECT_BAD_KEY)) {
+        br_stub_handle_bad_object_key(frame, this, fd, dict, flags, xdata);
+        goto done;
+    }
+
+wind:
+    STACK_WIND(frame, default_fsetxattr_cbk, FIRST_CHILD(this),
+               FIRST_CHILD(this)->fops->fsetxattr, fd, dict, flags, xdata);
+    return 0;
+
+unwind:
+    STACK_UNWIND_STRICT(fsetxattr, frame, op_ret, op_errno, NULL);
+
+done:
+    /* reached after a handler has taken care of the frame */
+    return 0;
+}
+
+/**
+ * Currently BitD and scrubber are doing fsetxattr to either sign the object
+ * or to mark it as bad. Hence setxattr on any of those keys is denied directly
+ * without checking from where the fop is coming.
+ * Later, if BitD or Scrubber does setxattr of those keys, then appropriate
+ * check has to be added below.
+ */
+/**
+ * setxattr entry point: a request carrying any bit-rot internal key is
+ * dumped to the log and unwound with an error (EINVAL, or ENOMEM if the dump
+ * buffer could not be allocated); everything else is tail-wound to the first
+ * child. Always returns 0.
+ */
+int
+br_stub_setxattr(call_frame_t *frame, xlator_t *this, loc_t *loc, dict_t *dict,
+                 int flags, dict_t *xdata)
+{
+    int32_t op_errno = EINVAL;
+
+    if (!br_stub_internal_xattr(dict)) {
+        STACK_WIND_TAIL(frame, FIRST_CHILD(this),
+                        FIRST_CHILD(this)->fops->setxattr, loc, dict, flags,
+                        xdata);
+        return 0;
+    }
+
+    br_stub_dump_xattr(this, dict, &op_errno);
+    STACK_UNWIND_STRICT(setxattr, frame, -1, op_errno, NULL);
+
+    return 0;
+}
+
+/** }}} */
+
+/** {{{ */
+
+/* {f}removexattr() */
+
+/**
+ * removexattr entry point: removal of the bad-object, signing-version or
+ * current-version xattr is logged and refused with (-1, EINVAL); any other
+ * name is tail-wound to the first child. Always returns 0.
+ */
+int32_t
+br_stub_removexattr(call_frame_t *frame, xlator_t *this, loc_t *loc,
+                    const char *name, dict_t *xdata)
+{
+    int reserved = (strcmp(BITROT_OBJECT_BAD_KEY, name) == 0) ||
+                   (strcmp(BITROT_SIGNING_VERSION_KEY, name) == 0) ||
+                   (strcmp(BITROT_CURRENT_VERSION_KEY, name) == 0);
+
+    if (!reserved) {
+        STACK_WIND_TAIL(frame, FIRST_CHILD(this),
+                        FIRST_CHILD(this)->fops->removexattr, loc, name,
+                        xdata);
+        return 0;
+    }
+
+    gf_smsg(this->name, GF_LOG_WARNING, 0, BRS_MSG_REMOVE_INTERNAL_XATTR,
+            "name=%s", name, "file-path=%s", loc->path, NULL);
+    STACK_UNWIND_STRICT(removexattr, frame, -1, EINVAL, NULL);
+
+    return 0;
+}
+
+/**
+ * fremovexattr entry point: removal of the bad-object, signing-version or
+ * current-version xattr is logged (with the inode's gfid) and refused with
+ * (-1, EINVAL); any other name is tail-wound to the first child.
+ * Always returns 0.
+ */
+int32_t
+br_stub_fremovexattr(call_frame_t *frame, xlator_t *this, fd_t *fd,
+                     const char *name, dict_t *xdata)
+{
+    int reserved = (strcmp(BITROT_OBJECT_BAD_KEY, name) == 0) ||
+                   (strcmp(BITROT_SIGNING_VERSION_KEY, name) == 0) ||
+                   (strcmp(BITROT_CURRENT_VERSION_KEY, name) == 0);
+
+    if (!reserved) {
+        STACK_WIND_TAIL(frame, FIRST_CHILD(this),
+                        FIRST_CHILD(this)->fops->fremovexattr, fd, name,
+                        xdata);
+        return 0;
+    }
+
+    gf_smsg(this->name, GF_LOG_WARNING, 0, BRS_MSG_REMOVE_INTERNAL_XATTR,
+            "name=%s", name, "inode-gfid=%s", uuid_utoa(fd->inode->gfid),
+            NULL);
+    STACK_UNWIND_STRICT(fremovexattr, frame, -1, EINVAL, NULL);
+
+    return 0;
+}
+
+/** }}} */
+
+/** {{{ */
+
+/* {f}getxattr() */
+
+/**
+ * Callback used when getxattr is issued without a name (list request): on
+ * success the reply dict is passed through br_stub_remove_vxattrs() before
+ * being unwound; a failed reply is unwound as is. Always returns 0.
+ */
+int
+br_stub_listxattr_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+                      int op_ret, int op_errno, dict_t *xattr, dict_t *xdata)
+{
+    if (op_ret >= 0)
+        br_stub_remove_vxattrs(xattr, _gf_true);
+
+    STACK_UNWIND_STRICT(getxattr, frame, op_ret, op_errno, xattr, xdata);
+
+    return 0;
+}
+
+/**
+ * ONE SHOT CRAWLER from BitD signs the objects that it encounters while
+ * crawling, if the object is identified as stale by the stub. Stub follows
+ * the below logic to mark an object as stale or not.
+ * If the ongoing version and the signed_version match, then the object is not
+ * stale. Just return. Otherwise, if they do not match, then it means one
+ * of the below things.
+ * 1) If the inode does not need write back of the version and the sign state
+ *    is NORMAL, then some active i/o is going on the object. So skip it.
+ *    A notification will be sent to trigger the sign once the release is
+ *    received on the object.
+ * 2) If inode does not need writeback of the version and the sign state is
+ *    either reopen wait or quick sign, then it means:
+ *    A) BitD restarted and it is not sure whether the object it encountered
+ *       while crawling is in its timer wheel or not. Since there is no way to
+ *       scan the timer wheel as of now, ONE SHOT CRAWLER just goes ahead and
+ *       signs the object. Since the inode does not need writeback, version will
+ *       not be incremented and directly the object will be signed.
+ * 3) If the inode needs writeback, then it means the inode was forgotten after
+ *    the versioning and it has to be signed now.
+ *
+ * This is the algorithm followed:
+ * if (ongoing_version == signed_version); then
+ *     object_is_not_stale;
+ *     return;
+ * else; then
+ *      if (!inode_needs_writeback && inode_sign_state != NORMAL); then
+ *            object_is_stale;
+ *      if (inode_needs_writeback); then
+ *            object_is_stale;
+ *
+ * For SCRUBBER, no need to check for the sign state and inode writeback.
+ * If the ondisk ongoingversion and the ondisk signed version does not match,
+ * then treat the object as stale.
+ */
+/**
+ * Decide whether the object behind @inode is stale with respect to its
+ * on-disk version (@obuf) and signature (@sbuf).
+ *
+ * Returns 0 when the ongoing and signed versions match, or when the inode
+ * context cannot be fetched. For the scrubber any mismatch is stale. For
+ * other callers the object is stale when the inode is dirty or its sign
+ * state is anything but BR_SIGN_NORMAL.
+ */
+char
+br_stub_is_object_stale(xlator_t *this, call_frame_t *frame, inode_t *inode,
+                        br_version_t *obuf, br_signature_t *sbuf)
+{
+    char is_stale = 0;
+    uint64_t raw_ctx = 0;
+    br_stub_inode_ctx_t *ictx = NULL;
+
+    /* versions agree: nothing to sign */
+    if (obuf->ongoingversion == sbuf->signedversion)
+        return 0;
+
+    /* scrubber does not look at the in-memory state */
+    if (frame->root->pid == GF_CLIENT_PID_SCRUB)
+        return 1;
+
+    if (br_stub_get_inode_ctx(this, inode, &raw_ctx)) {
+        gf_smsg(this->name, GF_LOG_ERROR, 0, BRS_MSG_GET_INODE_CONTEXT_FAILED,
+                "gfid=%s", uuid_utoa(inode->gfid), NULL);
+        return 0;
+    }
+
+    ictx = (br_stub_inode_ctx_t *)(long)raw_ctx;
+
+    LOCK(&inode->lock);
+    {
+        /* dirty inode, or a clean one that is waiting to be signed */
+        if (__br_stub_is_inode_dirty(ictx) ||
+            ictx->info_sign != BR_SIGN_NORMAL)
+            is_stale = 1;
+    }
+    UNLOCK(&inode->lock);
+
+    return is_stale;
+}
+
+/**
+ * Callback for {f}getxattr requests wound by the stub.
+ *
+ * For an object-signature request (identified by @cookie being
+ * BR_STUB_REQUEST_COOKIE) the on-disk version and signing xattrs found in
+ * @xattr are converted into a br_isignature_out_t and stored in @xattr under
+ * GLUSTERFS_GET_OBJECT_SIGNATURE; op_ret is then the total length of that
+ * structure. Failures are reported as:
+ *   EIO     - the object is marked bad
+ *   EINVAL  - invalid xattr state, missing/short size key, dict_set failure
+ *   ENODATA - version xattrs missing or object unsigned
+ *   ENOMEM  - allocation failure
+ *
+ * The version xattrs are stripped from the reply on every path that reaches
+ * the delkeys label. Always returns 0.
+ */
+int
+br_stub_getxattr_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+                     int op_ret, int op_errno, dict_t *xattr, dict_t *xdata)
+{
+    int32_t ret = 0;
+    size_t totallen = 0;
+    size_t signaturelen = 0;
+    uint32_t xattrsize = 0;
+    br_stub_private_t *priv = NULL;
+    br_version_t *obuf = NULL;
+    br_signature_t *sbuf = NULL;
+    br_isignature_out_t *sign = NULL;
+    br_vxattr_status_t status;
+    br_stub_local_t *local = NULL;
+    inode_t *inode = NULL;
+    gf_boolean_t bad_object = _gf_false;
+    gf_boolean_t ver_enabled = _gf_false;
+
+    BR_STUB_VER_ENABLED_IN_CALLPATH(frame, ver_enabled);
+    priv = this->private;
+
+    /*
+     * NOTE(review): on the early exits below `local` is still NULL, so a
+     * local attached to the frame by a signature request is not passed to
+     * br_stub_cleanup_local() on these paths - confirm that nothing it holds
+     * is left unreleased.
+     */
+    if (op_ret < 0)
+        goto unwind;
+    BR_STUB_VER_COND_GOTO(priv, (!ver_enabled), delkeys);
+
+    if (cookie != (void *)BR_STUB_REQUEST_COOKIE)
+        goto unwind;
+
+    local = frame->local;
+    frame->local = NULL;
+    if (!local) {
+        op_ret = -1;
+        op_errno = EINVAL;
+        goto unwind;
+    }
+    inode = local->u.context.inode;
+
+    /* from here on op_errno is primed before each check that may bail out */
+    op_ret = -1;
+    status = br_version_xattr_state(xattr, &obuf, &sbuf, &bad_object);
+
+    op_errno = EIO;
+    if (bad_object)
+        goto delkeys;
+
+    op_errno = EINVAL;
+    if (status == BR_VXATTR_STATUS_INVALID)
+        goto delkeys;
+
+    op_errno = ENODATA;
+    if ((status == BR_VXATTR_STATUS_MISSING) ||
+        (status == BR_VXATTR_STATUS_UNSIGNED))
+        goto delkeys;
+
+    /**
+     * okay.. we have enough information to satisfy the request,
+     * namely: version and signing extended attribute. what's
+     * pending is the signature length -- that's figured out
+     * indirectly via the size of the _whole_ xattr and the
+     * on-disk signing xattr header size.
+     */
+    op_errno = EINVAL;
+    /*
+     * Fetch into a real uint32_t: writing 32 bits through a cast pointer to
+     * a size_t only fills the correct half on little-endian hosts.
+     */
+    ret = dict_get_uint32(xattr, BITROT_SIGNING_XATTR_SIZE_KEY, &xattrsize);
+    if (ret)
+        goto delkeys;
+
+    /*
+     * The stored xattr must at least contain the signing header, otherwise
+     * the subtraction below would wrap around and the allocation and copy
+     * would be sized by a huge bogus length.
+     */
+    if (xattrsize < sizeof(br_signature_t))
+        goto delkeys;
+
+    signaturelen = xattrsize - sizeof(br_signature_t);
+    totallen = sizeof(br_isignature_out_t) + signaturelen;
+
+    op_errno = ENOMEM;
+    sign = GF_CALLOC(1, totallen, gf_br_stub_mt_signature_t);
+    if (!sign)
+        goto delkeys;
+
+    sign->time[0] = obuf->timebuf[0];
+    sign->time[1] = obuf->timebuf[1];
+
+    /* Object's dirty state & current signed version */
+    sign->version = sbuf->signedversion;
+    sign->stale = br_stub_is_object_stale(this, frame, inode, obuf, sbuf);
+
+    /* Object's signature */
+    sign->signaturelen = signaturelen;
+    sign->signaturetype = sbuf->signaturetype;
+    (void)memcpy(sign->signature, sbuf->signature, signaturelen);
+
+    op_errno = EINVAL;
+    ret = dict_set_bin(xattr, GLUSTERFS_GET_OBJECT_SIGNATURE, (void *)sign,
+                       totallen);
+    if (ret < 0) {
+        /* the dict did not take the buffer, release it here */
+        GF_FREE(sign);
+        goto delkeys;
+    }
+    op_errno = 0;
+    op_ret = totallen;
+
+delkeys:
+    br_stub_remove_vxattrs(xattr, _gf_true);
+
+unwind:
+    STACK_UNWIND_STRICT(getxattr, frame, op_ret, op_errno, xattr, xdata);
+    br_stub_cleanup_local(local);
+    br_stub_dealloc_local(local);
+    return 0;
+}
+
+/**
+ * Answer a GLUSTERFS_GET_BR_STUB_INIT_TIME getxattr directly from the stub.
+ *
+ * A br_stub_init_t holding priv->boot[0..1] and priv->export is placed in a
+ * freshly created dict and the getxattr is unwound with it. op_ret is
+ * sizeof(br_stub_init_t) on success; on failure the unwind carries
+ * (-1, ENOMEM) when the dict cannot be created, or dict_set_static_bin's
+ * negative return with EINVAL.
+ */
+static void
+br_stub_send_stub_init_time(call_frame_t *frame, xlator_t *this)
+{
+    int op_ret = 0;
+    int op_errno = 0;
+    dict_t *xattr = NULL;
+    br_stub_init_t stub = {
+        {
+            0,
+        },
+    };
+    br_stub_private_t *priv = NULL;
+
+    priv = this->private;
+
+    xattr = dict_new();
+    if (!xattr) {
+        op_ret = -1;
+        op_errno = ENOMEM;
+        goto unwind;
+    }
+
+    stub.timebuf[0] = priv->boot[0];
+    stub.timebuf[1] = priv->boot[1];
+    /* copies the string including its terminator.
+     * NOTE(review): no bound check against the size of stub.export -
+     * presumably both buffers share the same capacity; verify */
+    memcpy(stub.export, priv->export, strlen(priv->export) + 1);
+
+    /* the dict is given the address of the on-stack `stub`; it is unwound
+     * and unreferenced below, before this function returns */
+    op_ret = dict_set_static_bin(xattr, GLUSTERFS_GET_BR_STUB_INIT_TIME,
+                                 (void *)&stub, sizeof(br_stub_init_t));
+    if (op_ret < 0) {
+        op_errno = EINVAL;
+        goto unwind;
+    }
+
+    op_ret = sizeof(br_stub_init_t);
+
+unwind:
+    /* xattr may be NULL here when dict_new() failed */
+    STACK_UNWIND_STRICT(getxattr, frame, op_ret, op_errno, xattr, NULL);
+
+    if (xattr)
+        dict_unref(xattr);
+}
+
+/**
+ * getxattr entry point of the stub.
+ *
+ *  - no @name (list request): wound with br_stub_listxattr_cbk;
+ *  - @name is a bit-rot internal xattr: unwound with (-1, EINVAL);
+ *  - node-uuid request on a regular file for which
+ *    br_stub_check_bad_object() returns non-zero: unwound with the
+ *    op_ret/op_errno that call filled in;
+ *  - GLUSTERFS_GET_BR_STUB_INIT_TIME on the root gfid: answered locally by
+ *    br_stub_send_stub_init_time();
+ *  - GLUSTERFS_GET_OBJECT_SIGNATURE on a regular file: a local is attached
+ *    to the frame and the request is wound with BR_STUB_REQUEST_COOKIE so
+ *    br_stub_getxattr_cbk can build the signature reply;
+ *  - everything else is wound to the child with br_stub_getxattr_cbk and a
+ *    NULL cookie.
+ *
+ * Always returns 0.
+ */
+int
+br_stub_getxattr(call_frame_t *frame, xlator_t *this, loc_t *loc,
+                 const char *name, dict_t *xdata)
+{
+    void *cookie = NULL;
+    static uuid_t rootgfid = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1};
+    fop_getxattr_cbk_t cbk = br_stub_getxattr_cbk;
+    int32_t op_ret = -1;
+    int32_t op_errno = EINVAL;
+    br_stub_local_t *local = NULL;
+    br_stub_private_t *priv = NULL;
+
+    GF_VALIDATE_OR_GOTO("bit-rot-stub", this, unwind);
+    GF_VALIDATE_OR_GOTO(this->name, loc, unwind);
+    GF_VALIDATE_OR_GOTO(this->name, this->private, unwind);
+    GF_VALIDATE_OR_GOTO(this->name, loc->inode, unwind);
+
+    /* listing: use the callback that only filters the reply */
+    if (!name) {
+        cbk = br_stub_listxattr_cbk;
+        goto wind;
+    }
+
+    if (br_stub_is_internal_xattr(name))
+        goto unwind;
+
+    priv = this->private;
+    /* presumably jumps to wind when versioning is not active, and otherwise
+     * tags the frame for the callback - verify against the macro */
+    BR_STUB_VER_NOT_ACTIVE_THEN_GOTO(frame, priv, wind);
+
+    /**
+     * If xattr is node-uuid and the inode is marked bad, return EIO.
+     * Returning EIO would result in AFR to choose correct node-uuid
+     * corresponding to the subvolume * where the good copy of the
+     * file resides.
+     */
+    if (IA_ISREG(loc->inode->ia_type) && XATTR_IS_NODE_UUID(name) &&
+        br_stub_check_bad_object(this, loc->inode, &op_ret, &op_errno)) {
+        goto unwind;
+    }
+
+    /**
+     * this special extended attribute is allowed only on root
+     */
+    /* name is known to be non-NULL here; prefix match on the key, and either
+     * loc->gfid or loc->inode->gfid may identify the root */
+    if (name &&
+        (strncmp(name, GLUSTERFS_GET_BR_STUB_INIT_TIME,
+                 sizeof(GLUSTERFS_GET_BR_STUB_INIT_TIME) - 1) == 0) &&
+        ((gf_uuid_compare(loc->gfid, rootgfid) == 0) ||
+         (gf_uuid_compare(loc->inode->gfid, rootgfid) == 0))) {
+        BR_STUB_RESET_LOCAL_NULL(frame);
+        br_stub_send_stub_init_time(frame, this);
+        return 0;
+    }
+
+    if (!IA_ISREG(loc->inode->ia_type))
+        goto wind;
+
+    /* signature request: prefix match on the key */
+    if (name && (strncmp(name, GLUSTERFS_GET_OBJECT_SIGNATURE,
+                         sizeof(GLUSTERFS_GET_OBJECT_SIGNATURE) - 1) == 0)) {
+        cookie = (void *)BR_STUB_REQUEST_COOKIE;
+
+        local = br_stub_alloc_local(this);
+        if (!local) {
+            op_ret = -1;
+            op_errno = ENOMEM;
+            goto unwind;
+        }
+
+        /* no fd on this path, hence the NULL fd argument */
+        br_stub_fill_local(local, NULL, NULL, loc->inode, loc->inode->gfid,
+                           BR_STUB_NO_VERSIONING, 0);
+        frame->local = local;
+    }
+
+wind:
+    STACK_WIND_COOKIE(frame, cbk, cookie, FIRST_CHILD(this),
+                      FIRST_CHILD(this)->fops->getxattr, loc, name, xdata);
+    return 0;
+unwind:
+    /* presumably clears a marker left in frame->local by
+     * BR_STUB_VER_NOT_ACTIVE_THEN_GOTO before the frame is unwound - verify */
+    BR_STUB_RESET_LOCAL_NULL(frame);
+    STACK_UNWIND_STRICT(getxattr, frame, op_ret, op_errno, NULL, NULL);
+    return 0;
+}
+
+int
+br_stub_fgetxattr(call_frame_t *frame, xlator_t *this, fd_t *fd,
+                  const char *name, dict_t *xdata)
+{
+    void *cookie = NULL;
+    static uuid_t rootgfid = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1};
+    fop_fgetxattr_cbk_t cbk = br_stub_getxattr_cbk;
+    int32_t op_ret = -1;
+    int32_t op_errno = EINVAL;
+    br_stub_local_t *local = NULL;
+    br_stub_private_t *priv = NULL;
+    /* fgetxattr fop: same checks as the path-based handler, keyed on fd. */
+    priv = this->private;
+    /* A NULL name is a list request: wind it with the listxattr callback. */
+    if (!name) {
+        cbk = br_stub_listxattr_cbk;
+        goto wind;
+    }
+    /* Internal xattrs are refused with the default -1/EINVAL. */
+    if (br_stub_is_internal_xattr(name))
+        goto unwind;
+
+    BR_STUB_VER_NOT_ACTIVE_THEN_GOTO(frame, priv, wind);
+
+    /**
+     * If xattr is node-uuid and the inode is marked bad, return EIO.
+     * Returning EIO would result in AFR to choose correct node-uuid
+     * corresponding to the subvolume where the good copy of the
+     * file resides.
+     */
+    if (IA_ISREG(fd->inode->ia_type) && XATTR_IS_NODE_UUID(name) &&
+        br_stub_check_bad_object(this, fd->inode, &op_ret, &op_errno)) {
+        goto unwind;
+    }
+
+    /**
+     * this special extended attribute is allowed only on root
+     */
+    if (name &&
+        (strncmp(name, GLUSTERFS_GET_BR_STUB_INIT_TIME,
+                 sizeof(GLUSTERFS_GET_BR_STUB_INIT_TIME) - 1) == 0) &&
+        (gf_uuid_compare(fd->inode->gfid, rootgfid) == 0)) {
+        BR_STUB_RESET_LOCAL_NULL(frame);
+        br_stub_send_stub_init_time(frame, this);
+        return 0;
+    }
+    /* Anything that is not a regular file is wound without further checks. */
+    if (!IA_ISREG(fd->inode->ia_type))
+        goto wind;
+    /* Signature requests get a cookie and a local for the callback to use. */
+    if (name && (strncmp(name, GLUSTERFS_GET_OBJECT_SIGNATURE,
+                         sizeof(GLUSTERFS_GET_OBJECT_SIGNATURE) - 1) == 0)) {
+        cookie = (void *)BR_STUB_REQUEST_COOKIE;
+
+        local = br_stub_alloc_local(this);
+        if (!local) {
+            op_ret = -1;
+            op_errno = ENOMEM;
+            goto unwind;
+        }
+
+        br_stub_fill_local(local, NULL, fd, fd->inode, fd->inode->gfid,
+                           BR_STUB_NO_VERSIONING, 0);
+        frame->local = local;
+    }
+
+wind:
+    STACK_WIND_COOKIE(frame, cbk, cookie, FIRST_CHILD(this),
+                      FIRST_CHILD(this)->fops->fgetxattr, fd, name, xdata);
+    return 0;
+unwind:
+    BR_STUB_RESET_LOCAL_NULL(frame);
+    STACK_UNWIND_STRICT(fgetxattr, frame, op_ret, op_errno, NULL, NULL);
+    return 0;
+}
+
+int32_t
+br_stub_readv(call_frame_t *frame, xlator_t *this, fd_t *fd, size_t size,
+              off_t offset, uint32_t flags, dict_t *xdata)
+{
+    int32_t op_ret = -1;
+    int32_t op_errno = EINVAL;
+    int32_t ret = -1;
+    br_stub_private_t *priv = NULL;
+    /* readv fop: unwind if the bad-object check fails, else pass through. */
+    GF_VALIDATE_OR_GOTO("bit-rot-stub", this, unwind);
+    GF_VALIDATE_OR_GOTO(this->name, frame, unwind);
+    GF_VALIDATE_OR_GOTO(this->name, this->private, unwind);
+    GF_VALIDATE_OR_GOTO(this->name, fd, unwind);
+    GF_VALIDATE_OR_GOTO(this->name, fd->inode, unwind);
+
+    priv = this->private;
+    if (!priv->do_versioning)
+        goto wind;
+    /* On a non-zero return op_ret/op_errno are unwound as the helper left them. */
+    ret = br_stub_check_bad_object(this, fd->inode, &op_ret, &op_errno);
+    if (ret)
+        goto unwind;
+
+wind:
+    STACK_WIND_TAIL(frame, FIRST_CHILD(this), FIRST_CHILD(this)->fops->readv,
+                    fd, size, offset, flags, xdata);
+    return 0;
+    /* NOTE(review): a failed frame validation also lands here and uses frame. */
+unwind:
+    STACK_UNWIND_STRICT(readv, frame, op_ret, op_errno, NULL, 0, NULL, NULL,
+                        NULL);
+    return 0;
+}
+
+/**
+ * The first write response on the first fd in the list of fds will set
+ * the flag to indicate that the inode is modified. The subsequent write
+ * responses coming on either the first fd or some other fd will not change
+ * the flag. The inode-modified flag is unset only upon release of all the
+ * fds.
+ */
+int32_t
+br_stub_writev_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+                   int32_t op_ret, int32_t op_errno, struct iatt *prebuf,
+                   struct iatt *postbuf, dict_t *xdata)
+{
+    int32_t ret = 0;
+    br_stub_local_t *local = NULL;
+    /* Detach the local first; it is released only after the unwind below. */
+    local = frame->local;
+    frame->local = NULL;
+
+    if (op_ret < 0)
+        goto unwind;
+    /* A failure to record the modification turns the write into -1/EINVAL. */
+    ret = br_stub_mark_inode_modified(this, local);
+    if (ret) {
+        op_ret = -1;
+        op_errno = EINVAL;
+    }
+
+unwind:
+    STACK_UNWIND_STRICT(writev, frame, op_ret, op_errno, prebuf, postbuf,
+                        xdata);
+
+    br_stub_cleanup_local(local);
+    br_stub_dealloc_local(local);
+
+    return 0;
+}
+
+int32_t
+br_stub_writev_resume(call_frame_t *frame, xlator_t *this, fd_t *fd,
+                      struct iovec *vector, int32_t count, off_t offset,
+                      uint32_t flags, struct iobref *iobref, dict_t *xdata)
+{ /* continuation stored in the call stub that br_stub_writev() builds */
+    STACK_WIND(frame, br_stub_writev_cbk, FIRST_CHILD(this),
+               FIRST_CHILD(this)->fops->writev, fd, vector, count, offset,
+               flags, iobref, xdata);
+    return 0;
+}
+
+/**
+ * This is probably the most crucial part about the whole versioning thing.
+ * There's absolutely no differentiation as such between an anonymous fd
+ * and a regular fd except the fd context initialization. Object versioning
+ * is performed when the inode is dirty. Parallel write operations are not
+ * special: each write performs object versioning followed by marking
+ * the inode as non-dirty (synced). This is followed by the actual operation
+ * (writev() in this case) which on a success marks the inode as modified.
+ * This prevents signing of objects that have not been modified.
+ */
+int32_t
+br_stub_writev(call_frame_t *frame, xlator_t *this, fd_t *fd,
+               struct iovec *vector, int32_t count, off_t offset,
+               uint32_t flags, struct iobref *iobref, dict_t *xdata)
+{
+    call_stub_t *stub = NULL;
+    int32_t op_ret = -1;
+    int32_t op_errno = EINVAL;
+    gf_boolean_t inc_version = _gf_false;
+    gf_boolean_t modified = _gf_false;
+    br_stub_inode_ctx_t *ctx = NULL;
+    int32_t ret = -1;
+    fop_writev_cbk_t cbk = default_writev_cbk;
+    br_stub_local_t *local = NULL;
+    br_stub_private_t *priv = NULL;
+    /* writev fop: version the object if needed, then wind the write. */
+    GF_VALIDATE_OR_GOTO("bit-rot-stub", this, unwind);
+    GF_VALIDATE_OR_GOTO(this->name, this->private, unwind);
+    GF_VALIDATE_OR_GOTO(this->name, frame, unwind);
+    GF_VALIDATE_OR_GOTO(this->name, fd, unwind);
+
+    priv = this->private;
+    if (!priv->do_versioning)
+        goto wind;
+
+    ret = br_stub_need_versioning(this, fd, &inc_version, &modified, &ctx);
+    if (ret)
+        goto unwind;
+
+    ret = br_stub_check_bad_object(this, fd->inode, &op_ret, &op_errno);
+    if (ret)
+        goto unwind;
+
+    /**
+     * The inode is not dirty and also witnessed at least one successful
+     * modification operation. Therefore, subsequent operations need not
+     * perform any special tracking.
+     */
+    if (!inc_version && modified)
+        goto wind;
+
+    /**
+     * okay.. so, either the inode needs versioning or the modification
+     * needs to be tracked. ->cbk is set to the appropriate callback
+     * routine for this.
+     * NOTE: ->local needs to be deallocated on failures from here on.
+     */
+    ret = br_stub_versioning_prep(frame, this, fd, ctx);
+    if (ret)
+        goto unwind;
+
+    local = frame->local;
+    if (!inc_version) {
+        br_stub_fill_local(local, NULL, fd, fd->inode, fd->inode->gfid,
+                           BR_STUB_NO_VERSIONING, 0);
+        cbk = br_stub_writev_cbk;
+        goto wind;
+    }
+
+    stub = fop_writev_stub(frame, br_stub_writev_resume, fd, vector, count,
+                           offset, flags, iobref, xdata);
+    if (!stub) {
+        gf_smsg(this->name, GF_LOG_ERROR, 0, BRS_MSG_STUB_ALLOC_FAILED,
+                "write  gfid=%s", uuid_utoa(fd->inode->gfid), NULL);
+        op_errno = ENOMEM; /* allocation failure, as in br_stub_readdir() */
+        goto cleanup_local;
+    }
+
+    /* Perform Versioning */
+    return br_stub_perform_incversioning(this, frame, stub, fd, ctx);
+
+wind:
+    STACK_WIND(frame, cbk, FIRST_CHILD(this), FIRST_CHILD(this)->fops->writev,
+               fd, vector, count, offset, flags, iobref, xdata);
+    return 0;
+
+cleanup_local:
+    br_stub_cleanup_local(local);
+    br_stub_dealloc_local(local);
+
+unwind:
+    frame->local = NULL;
+    STACK_UNWIND_STRICT(writev, frame, op_ret, op_errno, NULL, NULL, NULL);
+
+    return 0;
+}
+
+int32_t
+br_stub_ftruncate_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+                      int32_t op_ret, int32_t op_errno, struct iatt *prebuf,
+                      struct iatt *postbuf, dict_t *xdata)
+{
+    int32_t ret = -1;
+    br_stub_local_t *local = NULL;
+    /* Detach the local first; it is released only after the unwind below. */
+    local = frame->local;
+    frame->local = NULL;
+
+    if (op_ret < 0)
+        goto unwind;
+    /* A failure to record the modification turns the fop into -1/EINVAL. */
+    ret = br_stub_mark_inode_modified(this, local);
+    if (ret) {
+        op_ret = -1;
+        op_errno = EINVAL;
+    }
+
+unwind:
+    STACK_UNWIND_STRICT(ftruncate, frame, op_ret, op_errno, prebuf, postbuf,
+                        xdata);
+
+    br_stub_cleanup_local(local);
+    br_stub_dealloc_local(local);
+
+    return 0;
+}
+
+int32_t
+br_stub_ftruncate_resume(call_frame_t *frame, xlator_t *this, fd_t *fd,
+                         off_t offset, dict_t *xdata)
+{ /* continuation stored in the call stub that br_stub_ftruncate() builds */
+    STACK_WIND(frame, br_stub_ftruncate_cbk, FIRST_CHILD(this),
+               FIRST_CHILD(this)->fops->ftruncate, fd, offset, xdata);
+    return 0;
+}
+
+/* c.f. br_stub_writev() for explanation */
+int32_t
+br_stub_ftruncate(call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset,
+                  dict_t *xdata)
+{
+    br_stub_local_t *local = NULL;
+    call_stub_t *stub = NULL;
+    int32_t op_ret = -1;
+    int32_t op_errno = EINVAL;
+    gf_boolean_t inc_version = _gf_false;
+    gf_boolean_t modified = _gf_false;
+    br_stub_inode_ctx_t *ctx = NULL;
+    int32_t ret = -1;
+    fop_ftruncate_cbk_t cbk = default_ftruncate_cbk;
+    br_stub_private_t *priv = NULL;
+    /* ftruncate fop: version the object if needed, then wind the fop. */
+    GF_VALIDATE_OR_GOTO("bit-rot-stub", this, unwind);
+    GF_VALIDATE_OR_GOTO(this->name, this->private, unwind);
+    GF_VALIDATE_OR_GOTO(this->name, frame, unwind);
+    GF_VALIDATE_OR_GOTO(this->name, fd, unwind);
+
+    priv = this->private;
+    if (!priv->do_versioning)
+        goto wind;
+
+    ret = br_stub_need_versioning(this, fd, &inc_version, &modified, &ctx);
+    if (ret)
+        goto unwind;
+
+    ret = br_stub_check_bad_object(this, fd->inode, &op_ret, &op_errno);
+    if (ret)
+        goto unwind;
+    /* Already versioned and already marked modified: nothing to track. */
+    if (!inc_version && modified)
+        goto wind;
+    /* frame->local is set from here on and must be freed on failure. */
+    ret = br_stub_versioning_prep(frame, this, fd, ctx);
+    if (ret)
+        goto unwind;
+
+    local = frame->local;
+    if (!inc_version) {
+        br_stub_fill_local(local, NULL, fd, fd->inode, fd->inode->gfid,
+                           BR_STUB_NO_VERSIONING, 0);
+        cbk = br_stub_ftruncate_cbk;
+        goto wind;
+    }
+
+    stub = fop_ftruncate_stub(frame, br_stub_ftruncate_resume, fd, offset,
+                              xdata);
+    if (!stub) {
+        gf_smsg(this->name, GF_LOG_ERROR, 0, BRS_MSG_STUB_ALLOC_FAILED,
+                "ftruncate gfid=%s", uuid_utoa(fd->inode->gfid), NULL);
+        op_errno = ENOMEM; /* allocation failure, as in br_stub_readdir() */
+        goto cleanup_local;
+    }
+
+    return br_stub_perform_incversioning(this, frame, stub, fd, ctx);
+
+wind:
+    STACK_WIND(frame, cbk, FIRST_CHILD(this),
+               FIRST_CHILD(this)->fops->ftruncate, fd, offset, xdata);
+    return 0;
+
+cleanup_local:
+    br_stub_cleanup_local(local);
+    br_stub_dealloc_local(local);
+
+unwind:
+    frame->local = NULL;
+    STACK_UNWIND_STRICT(ftruncate, frame, op_ret, op_errno, NULL, NULL, NULL);
+    return 0;
+}
+
+int32_t
+br_stub_truncate_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+                     int32_t op_ret, int32_t op_errno, struct iatt *prebuf,
+                     struct iatt *postbuf, dict_t *xdata)
+{
+    int32_t ret = 0;
+    br_stub_local_t *local = NULL;
+    /* Detach the local first; it is released only after the unwind below. */
+    local = frame->local;
+    frame->local = NULL;
+
+    if (op_ret < 0)
+        goto unwind;
+    /* A failure to record the modification turns the fop into -1/EINVAL. */
+    ret = br_stub_mark_inode_modified(this, local);
+    if (ret) {
+        op_ret = -1;
+        op_errno = EINVAL;
+    }
+
+unwind:
+    STACK_UNWIND_STRICT(truncate, frame, op_ret, op_errno, prebuf, postbuf,
+                        xdata);
+    br_stub_cleanup_local(local);
+    br_stub_dealloc_local(local);
+    return 0;
+}
+
+int32_t
+br_stub_truncate_resume(call_frame_t *frame, xlator_t *this, loc_t *loc,
+                        off_t offset, dict_t *xdata)
+{
+    br_stub_local_t *local = frame->local;
+    /* Stub continuation: drop one ref on the fd kept in local, then wind. */
+    fd_unref(local->u.context.fd);
+    STACK_WIND(frame, br_stub_truncate_cbk, FIRST_CHILD(this),
+               FIRST_CHILD(this)->fops->truncate, loc, offset, xdata);
+    return 0; /* truncate callback, so the frame is unwound as truncate */
+}
+
+/**
+ * Bit-rot-stub depends heavily on the fd based operations for doing
+ * versioning and sending notification. It starts tracking the operation
+ * upon getting first fd based modify operation by doing versioning and
+ * sends notification when last fd using which the inode was modified is
+ * released.
+ * But for truncate there is no fd and hence it becomes difficult to do
+ * the versioning and send notification. It is handled by doing versioning
+ * on an anonymous fd. The fd will be valid till the completion of the
+ * truncate call. It guarantees that release on this anonymous fd will happen
+ * after the truncate call and notification is sent after the truncate call.
+ *
+ * c.f. br_stub_writev() for explanation
+ */
+int32_t
+br_stub_truncate(call_frame_t *frame, xlator_t *this, loc_t *loc, off_t offset,
+                 dict_t *xdata)
+{
+    br_stub_local_t *local = NULL;
+    call_stub_t *stub = NULL;
+    int32_t op_ret = -1;
+    int32_t op_errno = EINVAL;
+    gf_boolean_t inc_version = _gf_false;
+    gf_boolean_t modified = _gf_false;
+    br_stub_inode_ctx_t *ctx = NULL;
+    int32_t ret = -1;
+    fd_t *fd = NULL;
+    fop_truncate_cbk_t cbk = default_truncate_cbk;
+    br_stub_private_t *priv = NULL;
+    /* truncate fop: path based, so versioning runs on an anonymous fd. */
+    GF_VALIDATE_OR_GOTO("bit-rot-stub", this, unwind);
+    GF_VALIDATE_OR_GOTO(this->name, this->private, unwind);
+    GF_VALIDATE_OR_GOTO(this->name, frame, unwind);
+    GF_VALIDATE_OR_GOTO(this->name, loc, unwind);
+    GF_VALIDATE_OR_GOTO(this->name, loc->inode, unwind);
+
+    priv = this->private;
+    if (!priv->do_versioning)
+        goto wind;
+
+    fd = fd_anonymous(loc->inode);
+    if (!fd) {
+        gf_smsg(this->name, GF_LOG_ERROR, 0, BRS_MSG_CREATE_ANONYMOUS_FD_FAILED,
+                "inode-gfid=%s", uuid_utoa(loc->inode->gfid), NULL);
+        goto unwind;
+    }
+    /* From here every failure exit must drop the anonymous fd reference. */
+    ret = br_stub_need_versioning(this, fd, &inc_version, &modified, &ctx);
+    if (ret)
+        goto cleanup_fd;
+
+    ret = br_stub_check_bad_object(this, fd->inode, &op_ret, &op_errno);
+    if (ret)
+        goto cleanup_fd; /* was "unwind", which leaked the anonymous fd */
+
+    if (!inc_version && modified)
+        goto wind;
+
+    ret = br_stub_versioning_prep(frame, this, fd, ctx);
+    if (ret)
+        goto cleanup_fd;
+
+    local = frame->local;
+    if (!inc_version) {
+        br_stub_fill_local(local, NULL, fd, fd->inode, fd->inode->gfid,
+                           BR_STUB_NO_VERSIONING, 0);
+        cbk = br_stub_truncate_cbk;
+        goto wind;
+    }
+
+    stub = fop_truncate_stub(frame, br_stub_truncate_resume, loc, offset,
+                             xdata);
+    if (!stub) {
+        gf_smsg(this->name, GF_LOG_ERROR, 0, BRS_MSG_STUB_ALLOC_FAILED,
+                "truncate gfid=%s", uuid_utoa(fd->inode->gfid), NULL);
+        op_errno = ENOMEM; /* allocation failure, as in br_stub_readdir() */
+        goto cleanup_local;
+    }
+    /* On this path br_stub_truncate_resume() does the fd_unref(). */
+    return br_stub_perform_incversioning(this, frame, stub, fd, ctx);
+
+wind:
+    STACK_WIND(frame, cbk, FIRST_CHILD(this), FIRST_CHILD(this)->fops->truncate,
+               loc, offset, xdata);
+    if (fd)
+        fd_unref(fd);
+    return 0;
+
+cleanup_local:
+    br_stub_cleanup_local(local);
+    br_stub_dealloc_local(local);
+cleanup_fd:
+    fd_unref(fd);
+unwind:
+    frame->local = NULL;
+    STACK_UNWIND_STRICT(truncate, frame, op_ret, op_errno, NULL, NULL, NULL);
+    return 0;
+}
+
+/** }}} */
+
+/** {{{ */
+
+/* open() */
+
+/**
+ * It's probably worth mentioning a bit about why some of the housekeeping
+ * work is done in open() call path, rather than the callback path.
+ * Two (or more) open()'s in parallel can race and lead to a situation
+ * where a release() gets triggered (possibly after a series of write()
+ * calls) when *other* open()'s have still not reached callback path
+ * thereby having an active fd on an inode that is in process of getting
+ * signed with the current version.
+ *
+ * Maintaining fd list in the call path ensures that a release() would
+ * not be triggered if an open() call races ahead (followed by a close())
+ * thereby finding non-empty fd list.
+ */
+
+int
+br_stub_open(call_frame_t *frame, xlator_t *this, loc_t *loc, int32_t flags,
+             fd_t *fd, dict_t *xdata)
+{
+    int32_t ret = -1;
+    br_stub_inode_ctx_t *ctx = NULL;
+    uint64_t ctx_addr = 0;
+    int32_t op_ret = -1;
+    int32_t op_errno = EINVAL;
+    br_stub_private_t *priv = NULL;
+    unsigned long version = BITROT_DEFAULT_CURRENT_VERSION;
+    /* open fop: prepares the inode context and fd tracking before winding. */
+    GF_VALIDATE_OR_GOTO("bit-rot-stub", this, unwind);
+    GF_VALIDATE_OR_GOTO(this->name, this->private, unwind);
+    GF_VALIDATE_OR_GOTO(this->name, loc, unwind);
+    GF_VALIDATE_OR_GOTO(this->name, fd, unwind);
+    GF_VALIDATE_OR_GOTO(this->name, fd->inode, unwind);
+
+    priv = this->private;
+
+    if (!priv->do_versioning)
+        goto wind;
+    /* No inode context yet: create one starting at the default version. */
+    ret = br_stub_get_inode_ctx(this, fd->inode, &ctx_addr);
+    if (ret) {
+        ret = br_stub_init_inode_versions(this, fd, fd->inode, version,
+                                          _gf_true, _gf_false, &ctx_addr);
+        if (ret) {
+            gf_smsg(this->name, GF_LOG_ERROR, 0,
+                    BRS_MSG_GET_INODE_CONTEXT_FAILED, "path=%s", loc->path,
+                    "gfid=%s", uuid_utoa(fd->inode->gfid), NULL);
+            goto unwind;
+        }
+    }
+
+    ctx = (br_stub_inode_ctx_t *)(long)ctx_addr;
+
+    ret = br_stub_check_bad_object(this, fd->inode, &op_ret, &op_errno);
+    if (ret)
+        goto unwind;
+    /* Opens from the scrubber pid are wound without joining the fd list. */
+    if (frame->root->pid == GF_CLIENT_PID_SCRUB)
+        goto wind;
+    /* NOTE(review): exact compare; O_RDONLY with any extra flag is tracked. */
+    if (flags == O_RDONLY)
+        goto wind;
+
+    ret = br_stub_add_fd_to_inode(this, fd, ctx);
+    if (ret) {
+        gf_smsg(this->name, GF_LOG_ERROR, 0, BRS_MSG_ADD_FD_TO_LIST_FAILED,
+                "gfid=%s", uuid_utoa(fd->inode->gfid), NULL);
+        goto unwind;
+    }
+
+wind:
+    STACK_WIND(frame, default_open_cbk, FIRST_CHILD(this),
+               FIRST_CHILD(this)->fops->open, loc, flags, fd, xdata);
+    return 0;
+unwind:
+    STACK_UNWIND_STRICT(open, frame, op_ret, op_errno, NULL, NULL);
+    return 0;
+}
+
+/** }}} */
+
+/** {{{ */
+
+/* creat() */
+
+/**
+ * This routine registers a release callback for the given fd and adds the
+ * fd to the inode context fd tracking list.
+ */
+int32_t
+br_stub_add_fd_to_inode(xlator_t *this, fd_t *fd, br_stub_inode_ctx_t *ctx)
+{
+    int32_t ret = -1;
+    br_stub_fd_t *br_stub_fd = NULL;
+    /* Returns 0 on success, non-zero if the fd context could not be set. */
+    ret = br_stub_require_release_call(this, fd, &br_stub_fd);
+    if (ret) {
+        gf_smsg(this->name, GF_LOG_ERROR, 0, BRS_MSG_SET_FD_CONTEXT_FAILED,
+                "gfid=%s", uuid_utoa(fd->inode->gfid), NULL);
+        goto out;
+    }
+    /* list_add_tail(new, head): link the fd node into the inode's list. */
+    LOCK(&fd->inode->lock);
+    {
+        list_add_tail(&br_stub_fd->list, &ctx->fd_list);
+    }
+    UNLOCK(&fd->inode->lock);
+
+    ret = 0;
+
+out:
+    return ret;
+}
+
+int
+br_stub_create_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+                   int op_ret, int op_errno, fd_t *fd, inode_t *inode,
+                   struct iatt *stbuf, struct iatt *preparent,
+                   struct iatt *postparent, dict_t *xdata)
+{
+    int32_t ret = 0;
+    uint64_t ctx_addr = 0;
+    br_stub_inode_ctx_t *ctx = NULL;
+    unsigned long version = BITROT_DEFAULT_CURRENT_VERSION;
+    br_stub_private_t *priv = NULL;
+    /* create callback: set up versioning state for the new object. */
+    priv = this->private;
+
+    if (op_ret < 0)
+        goto unwind;
+
+    if (!priv->do_versioning)
+        goto unwind;
+    /* No context yet: initialise one; otherwise just track this fd. */
+    ret = br_stub_get_inode_ctx(this, fd->inode, &ctx_addr);
+    if (ret < 0) {
+        ret = br_stub_init_inode_versions(this, fd, inode, version, _gf_true,
+                                          _gf_false, &ctx_addr);
+        if (ret) {
+            op_ret = -1;
+            op_errno = EINVAL;
+        }
+    } else {
+        ctx = (br_stub_inode_ctx_t *)(long)ctx_addr;
+        ret = br_stub_add_fd_to_inode(this, fd, ctx);
+    }
+    /* NOTE(review): a failure of br_stub_add_fd_to_inode() is not reported. */
+unwind:
+    STACK_UNWIND_STRICT(create, frame, op_ret, op_errno, fd, inode, stbuf,
+                        preparent, postparent, xdata);
+    return 0;
+}
+
+int
+br_stub_create(call_frame_t *frame, xlator_t *this, loc_t *loc, int32_t flags,
+               mode_t mode, mode_t umask, fd_t *fd, dict_t *xdata)
+{ /* create fop: validate arguments; the real work is in the callback */
+    GF_VALIDATE_OR_GOTO("bit-rot-stub", this, unwind);
+    GF_VALIDATE_OR_GOTO(this->name, loc, unwind);
+    GF_VALIDATE_OR_GOTO(this->name, loc->inode, unwind);
+    GF_VALIDATE_OR_GOTO(this->name, fd, unwind);
+    GF_VALIDATE_OR_GOTO(this->name, fd->inode, unwind);
+
+    STACK_WIND(frame, br_stub_create_cbk, FIRST_CHILD(this),
+               FIRST_CHILD(this)->fops->create, loc, flags, mode, umask, fd,
+               xdata);
+    return 0;
+unwind:
+    STACK_UNWIND_STRICT(create, frame, -1, EINVAL, NULL, NULL, NULL, NULL, NULL,
+                        NULL);
+    return 0;
+}
+
+int
+br_stub_mknod_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int op_ret,
+                  int op_errno, inode_t *inode, struct iatt *stbuf,
+                  struct iatt *preparent, struct iatt *postparent,
+                  dict_t *xdata)
+{
+    int32_t ret = -1;
+    unsigned long version = BITROT_DEFAULT_CURRENT_VERSION;
+    br_stub_private_t *priv = NULL;
+    /* mknod callback: give the new inode a context at the default version. */
+    priv = this->private;
+
+    if (op_ret < 0)
+        goto unwind;
+
+    if (!priv->do_versioning)
+        goto unwind;
+    /* There is no fd for mknod, hence the NULL fd and NULL ctx out-pointer. */
+    ret = br_stub_init_inode_versions(this, NULL, inode, version, _gf_true,
+                                      _gf_false, NULL);
+    /**
+     * Like lookup, if init_inode_versions fail, return EINVAL
+     */
+    if (ret) {
+        op_ret = -1;
+        op_errno = EINVAL;
+    }
+
+unwind:
+    STACK_UNWIND_STRICT(mknod, frame, op_ret, op_errno, inode, stbuf, preparent,
+                        postparent, xdata);
+    return 0;
+}
+
+int
+br_stub_mknod(call_frame_t *frame, xlator_t *this, loc_t *loc, mode_t mode,
+              dev_t dev, mode_t umask, dict_t *xdata)
+{ /* mknod fop: validate arguments; the real work is in the callback */
+    GF_VALIDATE_OR_GOTO("bit-rot-stub", this, unwind);
+    GF_VALIDATE_OR_GOTO(this->name, loc, unwind);
+    GF_VALIDATE_OR_GOTO(this->name, loc->inode, unwind);
+
+    STACK_WIND(frame, br_stub_mknod_cbk, FIRST_CHILD(this),
+               FIRST_CHILD(this)->fops->mknod, loc, mode, dev, umask, xdata);
+    return 0;
+unwind:
+    STACK_UNWIND_STRICT(mknod, frame, -1, EINVAL, NULL, NULL, NULL, NULL, NULL);
+    return 0;
+}
+
+/** }}} */
+
+/**
+ * As of now, only lookup searches for bad object xattr and marks the
+ * object as bad in its inode context if the xattr is present. But there
+ * is a possibility that, at the time of the lookup the object was not
+ * marked bad (i.e. bad object xattr was not set), and later it is marked
+ * as bad. In this case, the inode context still says the object is not
+ * bad, so when a fop such as open, readv or writev comes on the object,
+ * the fop will be sent downward instead of being failed upwards.
+ * The solution for this is to do a getxattr for the below list of fops.
+ * lookup, readdirp, open, readv, writev.
+ * But doing getxattr for each of the above fops might be costly.
+ * So another method followed is to catch the bad file marking by the scrubber
+ * and set that info within the object's inode context. In this way getxattr
+ * calls can be avoided and bad objects can be caught instantly. Fetching the
+ * xattr is needed only in lookups when there is a brick restart or inode
+ * forget.
+ *
+ * If the dict (@xattr) is NULL, then how should that be handled? Fail the
+ * lookup operation? Or let it continue with version being initialized to
+ * BITROT_DEFAULT_CURRENT_VERSION. But what if the version was different
+ * on disk (and also a right signature was there), but posix failed to
+ * successfully allocate the dict? Posix does not treat call back xdata
+ * creation failure as the lookup failure.
+ */
+static int32_t
+br_stub_lookup_version(xlator_t *this, uuid_t gfid, inode_t *inode,
+                       dict_t *xattr)
+{
+    unsigned long version = 0;
+    br_version_t *obuf = NULL;
+    br_signature_t *sbuf = NULL;
+    br_vxattr_status_t status;
+    gf_boolean_t bad_object = _gf_false;
+    /* Returns -1 for an INVALID xattr state; @gfid is not used in here. */
+    /**
+     * versioning xattrs were requested from POSIX. if available, figure
+     * out the correct version to use in the inode context (start with
+     * the default version if unavailable). As of now versions are not
+     * persisted on-disk. The inode is marked dirty, so that the first
+     * operation (such as write(), etc..) triggers synchronization to
+     * disk.
+     */
+    status = br_version_xattr_state(xattr, &obuf, &sbuf, &bad_object);
+    version = ((status == BR_VXATTR_STATUS_FULL) ||
+               (status == BR_VXATTR_STATUS_UNSIGNED))
+                  ? obuf->ongoingversion
+                  : BITROT_DEFAULT_CURRENT_VERSION;
+
+    /**
+     * If signature is there, but version is not there then that status
+     * is treated as INVALID. So in that case, we should not initialize the
+     * inode context with wrong version names etc.
+     */
+    if (status == BR_VXATTR_STATUS_INVALID)
+        return -1;
+
+    return br_stub_init_inode_versions(this, NULL, inode, version, _gf_true,
+                                       bad_object, NULL);
+}
+
+/** {{{ */
+
+int32_t
+br_stub_opendir(call_frame_t *frame, xlator_t *this, loc_t *loc, fd_t *fd,
+                dict_t *xdata)
+{
+    br_stub_private_t *priv = NULL;
+    br_stub_fd_t *fd_ctx = NULL;
+    int32_t op_ret = -1;
+    int32_t op_errno = EINVAL;
+    /* opendir fop: the bad-object directory is opened from stub_basepath. */
+    priv = this->private;
+    if (gf_uuid_compare(fd->inode->gfid, priv->bad_object_dir_gfid))
+        goto normal;
+    /* NOTE(review): unlike br_stub_readdir(), do_versioning is not checked. */
+    fd_ctx = br_stub_fd_new();
+    if (!fd_ctx) {
+        op_errno = ENOMEM;
+        goto unwind;
+    }
+
+    fd_ctx->bad_object.dir_eof = -1;
+    fd_ctx->bad_object.dir = sys_opendir(priv->stub_basepath);
+    if (!fd_ctx->bad_object.dir) {
+        op_errno = errno;
+        goto err_freectx;
+    }
+    /* Success unwinds right away; a failure falls into the cleanup below. */
+    op_ret = br_stub_fd_ctx_set(this, fd, fd_ctx);
+    if (!op_ret)
+        goto unwind;
+
+    sys_closedir(fd_ctx->bad_object.dir);
+
+err_freectx:
+    GF_FREE(fd_ctx);
+unwind:
+    STACK_UNWIND_STRICT(opendir, frame, op_ret, op_errno, fd, NULL);
+    return 0;
+
+normal:
+    STACK_WIND(frame, default_opendir_cbk, FIRST_CHILD(this),
+               FIRST_CHILD(this)->fops->opendir, loc, fd, xdata);
+    return 0;
+}
+
+int32_t
+br_stub_readdir(call_frame_t *frame, xlator_t *this, fd_t *fd, size_t size,
+                off_t off, dict_t *xdata)
+{
+    call_stub_t *stub = NULL;
+    br_stub_private_t *priv = NULL;
+    /* readdir fop: only the bad-object directory is handled in this xlator. */
+    priv = this->private;
+    if (!priv->do_versioning)
+        goto out;
+
+    if (gf_uuid_compare(fd->inode->gfid, priv->bad_object_dir_gfid))
+        goto out;
+    stub = fop_readdir_stub(frame, br_stub_readdir_wrapper, fd, size, off,
+                            xdata);
+    if (!stub) {
+        STACK_UNWIND_STRICT(readdir, frame, -1, ENOMEM, NULL, NULL);
+        return 0;
+    }
+    br_stub_worker_enqueue(this, stub); /* the stub is handed to the worker */
+    return 0;
+out:
+    STACK_WIND(frame, default_readdir_cbk, FIRST_CHILD(this),
+               FIRST_CHILD(this)->fops->readdir, fd, size, off, xdata);
+    return 0;
+}
+
+int
+br_stub_readdirp_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+                     int op_ret, int op_errno, gf_dirent_t *entries,
+                     dict_t *dict)
+{
+    int32_t ret = 0;
+    uint64_t ctxaddr = 0;
+    gf_dirent_t *entry = NULL;
+    br_stub_private_t *priv = NULL;
+    gf_boolean_t ver_enabled = _gf_false;
+    /* readdirp callback: seed inode contexts and strip the stub's xattrs. */
+    BR_STUB_VER_ENABLED_IN_CALLPATH(frame, ver_enabled);
+    priv = this->private;
+    BR_STUB_VER_COND_GOTO(priv, (!ver_enabled), unwind);
+
+    if (op_ret < 0)
+        goto unwind;
+    /* Only regular files other than "." and ".." are processed. */
+    list_for_each_entry(entry, &entries->list, list)
+    {
+        if ((strcmp(entry->d_name, ".") == 0) ||
+            (strcmp(entry->d_name, "..") == 0))
+            continue;
+
+        if (!IA_ISREG(entry->d_stat.ia_type))
+            continue;
+
+        /*
+         * Readdirp for most part is a bulk lookup for all the entries
+         * present in the directory being read. Ideally, for each
+         * entry, the handling should be similar to that of a lookup
+         * callback. But for now, just keeping this as it has been
+         * until now (which means, this comment has been added much
+         * later as part of a change that wanted to send the flag
+         * of true/false to br_stub_remove_vxattrs to indicate whether
+         * the bad-object xattr should be removed from the entry->dict
+         * or not). Until this change, the function br_stub_remove_vxattrs
+         * was just removing all the xattrs associated with bit-rot-stub
+         * (like version, bad-object, signature etc). But, there are
+         * scenarios where we only want to send bad-object xattr and not
+         * others. So this comment is part of that change which also
+         * mentions about another possible change that might be needed
+         * in future.
+         * But for now, adding _gf_true means functionally its same as
+         * what this function was doing before. Just remove all the stub
+         * related xattrs.
+         */
+        ret = br_stub_get_inode_ctx(this, entry->inode, &ctxaddr);
+        if (ret < 0)
+            ctxaddr = 0;
+        if (ctxaddr) { /* already has the context */
+            br_stub_remove_vxattrs(entry->dict, _gf_true);
+            continue;
+        }
+        /* No context yet: build one from the xattrs returned with the entry. */
+        ret = br_stub_lookup_version(this, entry->inode->gfid, entry->inode,
+                                     entry->dict);
+        br_stub_remove_vxattrs(entry->dict, _gf_true);
+        if (ret) {
+            /**
+             * there's no per-file granularity support in case of
+             * failure. let's fail the entire request for now..
+             */
+            break;
+        }
+    }
+    /* ret is non-zero here only when the loop above stopped on a failure. */
+    if (ret) {
+        op_ret = -1;
+        op_errno = EINVAL;
+    }
+
+unwind:
+    STACK_UNWIND_STRICT(readdirp, frame, op_ret, op_errno, entries, dict);
+
+    return 0;
+}
+
+int
+br_stub_readdirp(call_frame_t *frame, xlator_t *this, fd_t *fd, size_t size,
+                 off_t offset, dict_t *dict)
+{
+    int32_t ret = -1;
+    int op_errno = 0;
+    gf_boolean_t xref = _gf_false;
+    br_stub_private_t *priv = NULL;
+    /* readdirp fop: request the versioning xattrs along with each entry. */
+    priv = this->private;
+    BR_STUB_VER_NOT_ACTIVE_THEN_GOTO(frame, priv, wind);
+
+    op_errno = ENOMEM;
+    if (!dict) {
+        dict = dict_new();
+        if (!dict)
+            goto unwind;
+    } else {
+        dict = dict_ref(dict);
+    }
+    /* xref records that this function now owns one reference on dict. */
+    xref = _gf_true;
+
+    op_errno = EINVAL;
+    ret = dict_set_uint32(dict, BITROT_CURRENT_VERSION_KEY, 0);
+    if (ret)
+        goto unwind;
+    ret = dict_set_uint32(dict, BITROT_SIGNING_VERSION_KEY, 0);
+    if (ret)
+        goto unwind;
+    ret = dict_set_uint32(dict, BITROT_OBJECT_BAD_KEY, 0);
+    if (ret)
+        goto unwind;
+
+wind:
+    STACK_WIND(frame, br_stub_readdirp_cbk, FIRST_CHILD(this),
+               FIRST_CHILD(this)->fops->readdirp, fd, size, offset, dict);
+    goto unref_dict;
+
+unwind: /* presumably 0x1 is the marker the VER_NOT_ACTIVE macro sets */
+    if (frame->local == (void *)0x1)
+        frame->local = NULL;
+    STACK_UNWIND_STRICT(readdirp, frame, -1, op_errno, NULL, NULL);
+    /* fall through: the failure path must drop the dict reference as well */
+
+unref_dict:
+    if (xref)
+        dict_unref(dict);
+    return 0;
+}
+
+/** }}} */
+
+/** {{{ */
+
+/* lookup() */
+
+/**
+ * This function mainly handles the ENOENT error for the bad objects. Though
+ * br_stub_forget () handles removal of the link for the bad object from the
+ * quarantine directory, it is better to handle it in lookup as well, where
+ * a failed lookup on a bad object with ENOENT, will trigger deletion of the
+ * link for the bad object from quarantine directory. So whoever comes first
+ * either forget () or lookup () will take care of removing the link.
+ */
+void
+br_stub_handle_lookup_error(xlator_t *this, inode_t *inode, int32_t op_errno)
+{
+    int32_t ret = -1;
+    uint64_t ctx_addr = 0;
+    br_stub_inode_ctx_t *ctx = NULL;
+
+    if (op_errno != ENOENT)
+        goto out;
+
+    if (!inode_is_linked(inode))
+        goto out;
+
+    ret = br_stub_get_inode_ctx(this, inode, &ctx_addr);
+    if (ret)
+        goto out;
+
+    ctx = (br_stub_inode_ctx_t *)(long)ctx_addr;
+
+    LOCK(&inode->lock);
+    {
+        if (__br_stub_is_bad_object(ctx))
+            (void)br_stub_del(this, inode->gfid);
+    }
+    UNLOCK(&inode->lock);
+
+    if (__br_stub_is_bad_object(ctx)) {
+        /* File is not present, might be deleted for recovery,
+         * del the bitrot inode context
+         */
+        ctx_addr = 0;
+        inode_ctx_del(inode, this, &ctx_addr);
+        if (ctx_addr) {
+            ctx = (br_stub_inode_ctx_t *)(long)ctx_addr;
+            GF_FREE(ctx);
+        }
+    }
+
+out:
+    return;
+}
+
+/**
+ * Callback for lookup().
+ *
+ * On failure, br_stub_handle_lookup_error() is given a chance to clean up
+ * and the virtual xattrs are stripped from @xattr (the bad file marker is
+ * kept unless the error is ENOENT). On success for a regular file, the
+ * response is tagged with the "bad inode" marker when the object is bad;
+ * when @cookie is BR_STUB_REQUEST_COOKIE (set by br_stub_lookup() after it
+ * requested the version keys) br_stub_lookup_version() is run first.
+ * Non-regular files are unwound without touching @xattr.
+ */
+int
+br_stub_lookup_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+                   int op_ret, int op_errno, inode_t *inode, struct iatt *stbuf,
+                   dict_t *xattr, struct iatt *postparent)
+{
+    int32_t ret = 0;
+    br_stub_private_t *priv = NULL;
+    gf_boolean_t ver_enabled = _gf_false;
+    gf_boolean_t remove_bad_file_marker = _gf_true;
+
+    BR_STUB_VER_ENABLED_IN_CALLPATH(frame, ver_enabled);
+    priv = this->private;
+
+    if (op_ret < 0) {
+        (void)br_stub_handle_lookup_error(this, inode, op_errno);
+
+        /*
+         * If the lookup error is not ENOENT, then it is better
+         * to send the bad file marker to the higher layer (if
+         * it has been set)
+         */
+        if (op_errno != ENOENT)
+            remove_bad_file_marker = _gf_false;
+        goto delkey;
+    }
+
+    BR_STUB_VER_COND_GOTO(priv, (!ver_enabled), delkey);
+
+    if (!IA_ISREG(stbuf->ia_type))
+        goto unwind;
+
+    /**
+     * If the object is bad, then "bad inode" marker has to be sent back
+     * in response, for revalidated lookups as well. Some xlators such as
+     * quick-read might cache the data in revalidated lookup as fresh
+     * lookup would anyway have sent "bad inode" marker.
+     * In general send bad inode marker for every lookup operation on the
+     * bad object.
+     */
+    if (cookie != (void *)BR_STUB_REQUEST_COOKIE) {
+        ret = br_stub_mark_xdata_bad_object(this, inode, xattr);
+        if (ret) {
+            op_ret = -1;
+            op_errno = EIO;
+            /*
+             * This flag ensures that in the label @delkey below,
+             * bad file marker is not removed from the dictionary,
+             * but other virtual xattrs (such as version, signature)
+             * are removed.
+             */
+            remove_bad_file_marker = _gf_false;
+        }
+        goto delkey;
+    }
+
+    ret = br_stub_lookup_version(this, stbuf->ia_gfid, inode, xattr);
+    if (ret < 0) {
+        op_ret = -1;
+        op_errno = EINVAL;
+        goto delkey;
+    }
+
+    /**
+     * If the object is bad, send "bad inode" marker back in response
+     * for xlator(s) to act accordingly (such as quick-read, etc..)
+     */
+    ret = br_stub_mark_xdata_bad_object(this, inode, xattr);
+    if (ret) {
+        /**
+         * aaha! bad object, but sorry we would not
+         * satisfy the request on allocation failures.
+         */
+        op_ret = -1;
+        op_errno = EIO;
+        goto delkey;
+    }
+
+delkey:
+    /* strip the internal (virtual) xattrs before replying to the caller */
+    br_stub_remove_vxattrs(xattr, remove_bad_file_marker);
+unwind:
+    STACK_UNWIND_STRICT(lookup, frame, op_ret, op_errno, inode, stbuf, xattr,
+                        postparent);
+
+    return 0;
+}
+
+/**
+ * lookup() entry point.
+ *
+ * - Versioning inactive: wind straight to the child.
+ * - Lookup on the bad-object directory (or an entry whose parent is that
+ *   directory): hand the request to the worker via a call stub.
+ * - Inode already has a bitrot context: plain wind (revalidate).
+ * - Otherwise (fresh lookup): request the version, signature and
+ *   bad-object keys from the child and tag the wind with
+ *   BR_STUB_REQUEST_COOKIE so the callback knows they were requested.
+ *
+ * Always returns 0; failures are unwound with -1 and a non-zero op_errno.
+ */
+int
+br_stub_lookup(call_frame_t *frame, xlator_t *this, loc_t *loc, dict_t *xdata)
+{
+    int32_t ret = 0;
+    /* EINVAL covers the argument validation failures below, which
+     * previously unwound with op_ret = -1 and op_errno = 0 */
+    int op_errno = EINVAL;
+    void *cookie = NULL;
+    uint64_t ctx_addr = 0;
+    gf_boolean_t xref = _gf_false;
+    br_stub_private_t *priv = NULL;
+    call_stub_t *stub = NULL;
+
+    GF_VALIDATE_OR_GOTO("bit-rot-stub", this, unwind);
+    GF_VALIDATE_OR_GOTO(this->name, loc, unwind);
+    GF_VALIDATE_OR_GOTO(this->name, loc->inode, unwind);
+
+    priv = this->private;
+
+    BR_STUB_VER_NOT_ACTIVE_THEN_GOTO(frame, priv, wind);
+
+    if (!gf_uuid_compare(loc->gfid, priv->bad_object_dir_gfid) ||
+        !gf_uuid_compare(loc->pargfid, priv->bad_object_dir_gfid)) {
+        stub = fop_lookup_stub(frame, br_stub_lookup_wrapper, loc, xdata);
+        if (!stub) {
+            op_errno = ENOMEM;
+            goto unwind;
+        }
+        br_stub_worker_enqueue(this, stub);
+        return 0;
+    }
+
+    ret = br_stub_get_inode_ctx(this, loc->inode, &ctx_addr);
+    if (ret < 0)
+        ctx_addr = 0;
+    if (ctx_addr != 0)
+        goto wind;
+
+    /**
+     * fresh lookup: request version keys from POSIX
+     */
+    op_errno = ENOMEM;
+    if (!xdata) {
+        xdata = dict_new();
+        if (!xdata)
+            goto unwind;
+    } else {
+        xdata = dict_ref(xdata);
+    }
+
+    /* from here on we own one reference on xdata */
+    xref = _gf_true;
+
+    /**
+     * Requesting both xattrs provides a way of sanity checking the
+     * object. Anomaly checking is done in cbk by examining absence
+     * of either or both xattrs.
+     */
+    op_errno = EINVAL;
+    ret = dict_set_uint32(xdata, BITROT_CURRENT_VERSION_KEY, 0);
+    if (ret)
+        goto unwind;
+    ret = dict_set_uint32(xdata, BITROT_SIGNING_VERSION_KEY, 0);
+    if (ret)
+        goto unwind;
+    ret = dict_set_uint32(xdata, BITROT_OBJECT_BAD_KEY, 0);
+    if (ret)
+        goto unwind;
+    cookie = (void *)BR_STUB_REQUEST_COOKIE;
+
+wind:
+    STACK_WIND_COOKIE(frame, br_stub_lookup_cbk, cookie, FIRST_CHILD(this),
+                      FIRST_CHILD(this)->fops->lookup, loc, xdata);
+    goto dealloc_dict;
+
+unwind:
+    /* the 0x1 marker is not a real local; clear it before unwinding */
+    if (frame->local == (void *)0x1)
+        frame->local = NULL;
+    STACK_UNWIND_STRICT(lookup, frame, -1, op_errno, NULL, NULL, NULL, NULL);
+dealloc_dict:
+    if (xref)
+        dict_unref(xdata);
+    return 0;
+}
+
+/** }}} */
+
+/** {{{ */
+
+/*
+ * stat: when versioning is on and the target is a regular file, consult
+ * br_stub_check_bad_object(); a non-zero answer fails the fop with the
+ * op_ret/op_errno it filled in. Everything else is passed to the child.
+ */
+int
+br_stub_stat(call_frame_t *frame, xlator_t *this, loc_t *loc, dict_t *xdata)
+{
+    br_stub_private_t *priv = this->private;
+    int32_t op_ret = -1;
+    int32_t op_errno = EINVAL;
+
+    if (priv->do_versioning && IA_ISREG(loc->inode->ia_type) &&
+        br_stub_check_bad_object(this, loc->inode, &op_ret, &op_errno)) {
+        STACK_UNWIND_STRICT(stat, frame, op_ret, op_errno, NULL, NULL);
+        return 0;
+    }
+
+    STACK_WIND_TAIL(frame, FIRST_CHILD(this), FIRST_CHILD(this)->fops->stat,
+                    loc, xdata);
+    return 0;
+}
+
+/*
+ * fstat: fd based twin of stat. When versioning is on and the fd refers
+ * to a regular file, br_stub_check_bad_object() decides whether the fop
+ * is failed (with the op_ret/op_errno it filled in) or wound to the child.
+ */
+int
+br_stub_fstat(call_frame_t *frame, xlator_t *this, fd_t *fd, dict_t *xdata)
+{
+    br_stub_private_t *priv = this->private;
+    int32_t op_ret = -1;
+    int32_t op_errno = EINVAL;
+
+    if (priv->do_versioning && IA_ISREG(fd->inode->ia_type) &&
+        br_stub_check_bad_object(this, fd->inode, &op_ret, &op_errno)) {
+        STACK_UNWIND_STRICT(fstat, frame, op_ret, op_errno, NULL, NULL);
+        return 0;
+    }
+
+    STACK_WIND_TAIL(frame, FIRST_CHILD(this), FIRST_CHILD(this)->fops->fstat,
+                    fd, xdata);
+    return 0;
+}
+
+/** }}} */
+
+/** {{{ */
+
+/* unlink() */
+
+/**
+ * Callback for unlink().
+ *
+ * After a successful unlink of a regular file that is marked bad in its
+ * inode context, the corresponding entry is removed via br_stub_del().
+ * The result of the unlink itself is always passed up unchanged.
+ *
+ * The local stored by br_stub_unlink() is detached from the frame before
+ * any early exit, so that br_stub_cleanup_local()/br_stub_dealloc_local()
+ * run for it even when versioning was switched off between the call and
+ * this callback.
+ */
+int
+br_stub_unlink_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+                   int32_t op_ret, int32_t op_errno, struct iatt *preparent,
+                   struct iatt *postparent, dict_t *xdata)
+{
+    br_stub_local_t *local = NULL;
+    inode_t *inode = NULL;
+    uint64_t ctx_addr = 0;
+    br_stub_inode_ctx_t *ctx = NULL;
+    int32_t ret = -1;
+    br_stub_private_t *priv = NULL;
+    gf_boolean_t ver_enabled = _gf_false;
+
+    /* resets the 0x1 marker; afterwards frame->local is NULL or a real
+     * br_stub_local_t */
+    BR_STUB_VER_ENABLED_IN_CALLPATH(frame, ver_enabled);
+
+    local = frame->local;
+    frame->local = NULL;
+
+    priv = this->private;
+    BR_STUB_VER_COND_GOTO(priv, (!ver_enabled), unwind);
+
+    if (op_ret < 0)
+        goto unwind;
+
+    if (!local) {
+        gf_smsg(this->name, GF_LOG_WARNING, 0, BRS_MSG_NULL_LOCAL, NULL);
+        goto unwind;
+    }
+    inode = local->u.context.inode;
+    if (!IA_ISREG(inode->ia_type))
+        goto unwind;
+
+    ret = br_stub_get_inode_ctx(this, inode, &ctx_addr);
+    if (ret) {
+        /**
+         * If the inode is bad AND context is not there, then there
+         * is a possibility of the gfid of the object being listed
+         * in the quarantine directory and will be shown in the
+         * bad objects list. So continuing with the fop with a
+         * warning log. The entry from the quarantine directory
+         * has to be removed manually. Its not a good idea to fail
+         * the fop, as the object has already been deleted.
+         */
+        gf_smsg(this->name, GF_LOG_WARNING, 0, BRS_MSG_GET_INODE_CONTEXT_FAILED,
+                "inode-gfid=%s", uuid_utoa(inode->gfid), NULL);
+        goto unwind;
+    }
+
+    ctx = (br_stub_inode_ctx_t *)(long)ctx_addr;
+
+    LOCK(&inode->lock);
+    {
+        /**
+         * Ignoring the return value of br_stub_del ().
+         * There is not much that can be done if unlinking
+         * of the entry in the quarantine directory fails.
+         * The failure is logged.
+         */
+        if (__br_stub_is_bad_object(ctx))
+            (void)br_stub_del(this, inode->gfid);
+    }
+    UNLOCK(&inode->lock);
+
+unwind:
+    STACK_UNWIND_STRICT(unlink, frame, op_ret, op_errno, preparent, postparent,
+                        xdata);
+    br_stub_cleanup_local(local);
+    br_stub_dealloc_local(local);
+    return 0;
+}
+
+/**
+ * unlink() entry point.
+ *
+ * With versioning active, a br_stub_local_t carrying the inode and its
+ * gfid is attached to the frame so that br_stub_unlink_cbk() can act on
+ * the inode after the child has removed the entry. With versioning
+ * inactive the call is wound without a local.
+ *
+ * Always returns 0; an allocation failure is unwound with -1/ENOMEM.
+ */
+int
+br_stub_unlink(call_frame_t *frame, xlator_t *this, loc_t *loc, int flag,
+               dict_t *xdata)
+{
+    br_stub_local_t *local = NULL;
+    int32_t op_ret = -1;
+    int32_t op_errno = 0;
+    br_stub_private_t *priv = NULL;
+
+    priv = this->private;
+    BR_STUB_VER_NOT_ACTIVE_THEN_GOTO(frame, priv, wind);
+
+    local = br_stub_alloc_local(this);
+    if (!local) {
+        op_ret = -1;
+        op_errno = ENOMEM;
+        gf_smsg(this->name, GF_LOG_ERROR, ENOMEM, BRS_MSG_ALLOC_MEM_FAILED,
+                "local path=%s", loc->path, "gfid=%s",
+                uuid_utoa(loc->inode->gfid), NULL);
+        goto unwind;
+    }
+
+    br_stub_fill_local(local, NULL, NULL, loc->inode, loc->inode->gfid,
+                       BR_STUB_NO_VERSIONING, 0);
+
+    /* replaces the 0x1 marker set by the macro above with the real local */
+    frame->local = local;
+
+wind:
+    STACK_WIND(frame, br_stub_unlink_cbk, FIRST_CHILD(this),
+               FIRST_CHILD(this)->fops->unlink, loc, flag, xdata);
+    return 0;
+
+unwind:
+    /* the 0x1 marker is not a real local; clear it before unwinding */
+    if (frame->local == (void *)0x1)
+        frame->local = NULL;
+    STACK_UNWIND_STRICT(unlink, frame, op_ret, op_errno, NULL, NULL, NULL);
+    return 0;
+}
+
+/** }}} */
+
+/** {{{ */
+
+/* forget() */
+
+/**
+ * forget() callback: detach this xlator's context from @inode and free
+ * it, if one was stored. Always returns 0.
+ */
+int
+br_stub_forget(xlator_t *this, inode_t *inode)
+{
+    uint64_t addr = 0;
+    br_stub_inode_ctx_t *ictx = NULL;
+
+    inode_ctx_del(inode, this, &addr);
+    if (addr != 0) {
+        ictx = (br_stub_inode_ctx_t *)(long)addr;
+        GF_FREE(ictx);
+    }
+
+    return 0;
+}
+
+/** }}} */
+
+/** {{{ */
+
+/**
+ * Callback used for the fire-and-forget ipc fop wound by
+ * br_stub_send_ipc_fop(): the result is ignored and the call stack that
+ * was created for the request is destroyed.
+ */
+int32_t
+br_stub_noop(call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret,
+             int32_t op_errno, dict_t *xdata)
+{
+    STACK_DESTROY(frame->root);
+    return 0;
+}
+
+/**
+ * Build a CHANGELOG_OP_TYPE_BR_RELEASE event for @fd's inode (gfid,
+ * @releaseversion, @sign_info) and send it down as an ipc fop targeted at
+ * GF_IPC_TARGET_CHANGELOG. The reply is discarded by br_stub_noop().
+ *
+ * Failures (dict allocation, dict set, frame creation) are only logged;
+ * nothing is reported to the caller.
+ */
+static void
+br_stub_send_ipc_fop(xlator_t *this, fd_t *fd, unsigned long releaseversion,
+                     int sign_info)
+{
+    int32_t op = 0;
+    int32_t ret = 0;
+    dict_t *xdata = NULL;
+    call_frame_t *frame = NULL;
+    changelog_event_t ev = {
+        0,
+    };
+
+    ev.ev_type = CHANGELOG_OP_TYPE_BR_RELEASE;
+    ev.u.releasebr.version = releaseversion;
+    ev.u.releasebr.sign_info = sign_info;
+    gf_uuid_copy(ev.u.releasebr.gfid, fd->inode->gfid);
+
+    xdata = dict_new();
+    if (!xdata) {
+        gf_smsg(this->name, GF_LOG_WARNING, ENOMEM, BRS_MSG_DICT_ALLOC_FAILED,
+                NULL);
+        goto out;
+    }
+
+    /* NOTE(review): ev lives on this function's stack and is stored in the
+     * dict by address; this presumably relies on the receiver consuming
+     * the event before this function returns -- confirm. */
+    ret = dict_set_static_bin(xdata, "RELEASE-EVENT", &ev, CHANGELOG_EV_SIZE);
+    if (ret) {
+        gf_smsg(this->name, GF_LOG_WARNING, 0, BRS_MSG_SET_EVENT_FAILED, NULL);
+        goto dealloc_dict;
+    }
+
+    frame = create_frame(this, this->ctx->pool);
+    if (!frame) {
+        gf_smsg(this->name, GF_LOG_WARNING, 0, BRS_MSG_CREATE_FRAME_FAILED,
+                NULL);
+        goto dealloc_dict;
+    }
+
+    op = GF_IPC_TARGET_CHANGELOG;
+    STACK_WIND(frame, br_stub_noop, FIRST_CHILD(this),
+               FIRST_CHILD(this)->fops->ipc, op, xdata);
+
+dealloc_dict:
+    /* drop our reference; reached on both the success and error paths */
+    dict_unref(xdata);
+out:
+    return;
+}
+
+/**
+ * This is how the state machine of sign info works:
+ * 3 states:
+ * 1) BR_SIGN_NORMAL => The default State of the inode
+ * 2) BR_SIGN_REOPEN_WAIT => A release has been sent and is waiting for reopen
+ * 3) BR_SIGN_QUICK => reopen has happened and this release should trigger sign
+ * 2 events:
+ * 1) GF_FOP_RELEASE
+ * 2) GF_FOP_WRITE (actually a dummy write for BitD)
+ *
+ * This is how states are changed based on events:
+ * EVENT: GF_FOP_RELEASE:
+ * if (state == BR_SIGN_NORMAL) ; then
+ *     set state = BR_SIGN_REOPEN_WAIT;
+ * if (state == BR_SIGN_QUICK); then
+ *     set state = BR_SIGN_NORMAL;
+ * EVENT: GF_FOP_WRITE:
+ *  if (state == BR_SIGN_REOPEN_WAIT); then
+ *     set state = BR_SIGN_QUICK;
+ *
+ * NOTE(review): the code below handles GF_FOP_FSETXATTR rather than
+ * GF_FOP_WRITE, and moves to BR_SIGN_QUICK regardless of the current
+ * state -- confirm which of the two (comment or code) is intended.
+ *
+ * Returns the sign state to report for this event: the new state for
+ * GF_FOP_FSETXATTR and for a release from BR_SIGN_NORMAL, the state held
+ * before the reset to BR_SIGN_NORMAL for any other release, and
+ * BR_SIGN_INVALID for every other fop. @fd is not used.
+ * The leading "__" follows the file's naming for helpers invoked with
+ * inode->lock held (br_stub_release() calls it that way).
+ */
+br_sign_state_t
+__br_stub_inode_sign_state(br_stub_inode_ctx_t *ctx, glusterfs_fop_t fop,
+                           fd_t *fd)
+{
+    br_sign_state_t sign_info = BR_SIGN_INVALID;
+
+    switch (fop) {
+        case GF_FOP_FSETXATTR:
+            sign_info = ctx->info_sign = BR_SIGN_QUICK;
+            break;
+
+        case GF_FOP_RELEASE:
+            GF_ASSERT(ctx->info_sign != BR_SIGN_REOPEN_WAIT);
+
+            if (ctx->info_sign == BR_SIGN_NORMAL) {
+                sign_info = ctx->info_sign = BR_SIGN_REOPEN_WAIT;
+            } else {
+                /* report the state we are leaving, then reset */
+                sign_info = ctx->info_sign;
+                ctx->info_sign = BR_SIGN_NORMAL;
+            }
+
+            break;
+        default:
+            break;
+    }
+
+    return sign_info;
+}
+
+/**
+ * release() callback for file fds.
+ *
+ * Under inode->lock: unlinks this fd's br_stub_fd_t from the inode's fd
+ * list, and -- if __br_stub_can_trigger_release() agrees -- advances the
+ * sign state machine and captures the version/sign info to publish.
+ * Outside the lock: sends the release event via br_stub_send_ipc_fop()
+ * when one was triggered, then removes and frees the fd context.
+ *
+ * Always returns 0.
+ */
+int32_t
+br_stub_release(xlator_t *this, fd_t *fd)
+{
+    int32_t ret = 0;
+    int32_t flags = 0; /* never modified; only appears (as 0) in the log */
+    inode_t *inode = NULL;
+    unsigned long releaseversion = 0;
+    br_stub_inode_ctx_t *ctx = NULL;
+    uint64_t tmp = 0;
+    br_stub_fd_t *br_stub_fd = NULL;
+    int32_t signinfo = 0;
+
+    inode = fd->inode;
+
+    LOCK(&inode->lock);
+    {
+        ctx = __br_stub_get_ongoing_version_ctx(this, inode, NULL);
+        if (ctx == NULL)
+            goto unblock;
+        br_stub_fd = br_stub_fd_ctx_get(this, fd);
+        if (br_stub_fd) {
+            list_del_init(&br_stub_fd->list);
+        }
+
+        /* ret != 0 means a release event must be sent; releaseversion is
+         * filled in network byte order (htonl) by the helper */
+        ret = __br_stub_can_trigger_release(inode, ctx, &releaseversion);
+        if (!ret)
+            goto unblock;
+
+        signinfo = __br_stub_inode_sign_state(ctx, GF_FOP_RELEASE, fd);
+        signinfo = htonl(signinfo);
+
+        /* inode back to initial state: mark dirty */
+        if (ctx->info_sign == BR_SIGN_NORMAL) {
+            __br_stub_mark_inode_dirty(ctx);
+            __br_stub_unset_inode_modified(ctx);
+        }
+    }
+unblock:
+    UNLOCK(&inode->lock);
+
+    if (ret) {
+        gf_msg_debug(this->name, 0,
+                     "releaseversion: %lu | flags: %d "
+                     "| signinfo: %d",
+                     (unsigned long)ntohl(releaseversion), flags,
+                     ntohl(signinfo));
+        br_stub_send_ipc_fop(this, fd, releaseversion, signinfo);
+    }
+
+    /* tmp stays 0 when no context was stored, making GF_FREE a no-op */
+    ret = fd_ctx_del(fd, this, &tmp);
+    br_stub_fd = (br_stub_fd_t *)(long)tmp;
+
+    GF_FREE(br_stub_fd);
+
+    return 0;
+}
+
+/**
+ * releasedir() handler: remove this xlator's context from @fd, close the
+ * DIR stream stored in it (if any; a close failure is only logged) and
+ * free the context. Does nothing when no context is attached to @fd.
+ * Always returns 0.
+ */
+int32_t
+br_stub_releasedir(xlator_t *this, fd_t *fd)
+{
+    uint64_t value = 0;
+    br_stub_fd_t *fctx = NULL;
+
+    if (fd_ctx_del(fd, this, &value) < 0)
+        return 0;
+
+    fctx = (br_stub_fd_t *)(long)value;
+
+    if (fctx->bad_object.dir && sys_closedir(fctx->bad_object.dir)) {
+        gf_smsg(this->name, GF_LOG_ERROR, 0, BRS_MSG_BAD_OBJ_DIR_CLOSE_FAIL,
+                "error=%s", strerror(errno), NULL);
+    }
+
+    GF_FREE(fctx);
+    return 0;
+}
+
+/** }}} */
+
+/** {{{ */
+
+/* ictxmerge */
+
+/**
+ * ictxmerge() callback: move the fd entry tracked in @inode's context to
+ * the fd list of @linked_inode's context.
+ *
+ * Nothing is done when either inode has no context. The source list is
+ * expected to hold exactly one entry (asserted); because
+ * list_first_entry() never yields NULL, emptiness is checked explicitly
+ * so that an empty list is a no-op even when assertions are compiled out.
+ */
+void
+br_stub_ictxmerge(xlator_t *this, fd_t *fd, inode_t *inode,
+                  inode_t *linked_inode)
+{
+    int32_t ret = 0;
+    uint64_t ctxaddr = 0;
+    uint64_t lctxaddr = 0;
+    br_stub_inode_ctx_t *ctx = NULL;
+    br_stub_inode_ctx_t *lctx = NULL;
+    br_stub_fd_t *br_stub_fd = NULL;
+
+    ret = br_stub_get_inode_ctx(this, inode, &ctxaddr);
+    if (ret < 0)
+        goto done;
+    ctx = (br_stub_inode_ctx_t *)(uintptr_t)ctxaddr;
+
+    LOCK(&linked_inode->lock);
+    {
+        ret = __br_stub_get_inode_ctx(this, linked_inode, &lctxaddr);
+        if (ret < 0)
+            goto unblock;
+        lctx = (br_stub_inode_ctx_t *)(uintptr_t)lctxaddr;
+
+        GF_ASSERT(list_is_singular(&ctx->fd_list));
+        if (!list_empty(&ctx->fd_list)) {
+            br_stub_fd = list_first_entry(&ctx->fd_list, br_stub_fd_t, list);
+            GF_ASSERT(br_stub_fd->fd == fd);
+            list_move_tail(&br_stub_fd->list, &lctx->fd_list);
+        }
+    }
+unblock:
+    UNLOCK(&linked_inode->lock);
+
+done:
+    return;
+}
+
+/** }}} */
+
+/* File operations intercepted by this xlator; each entry points at the
+ * br_stub_* handler for that fop. */
+struct xlator_fops fops = {
+    .lookup = br_stub_lookup,
+    .stat = br_stub_stat,
+    .fstat = br_stub_fstat,
+    .open = br_stub_open,
+    .create = br_stub_create,
+    .readdirp = br_stub_readdirp,
+    .getxattr = br_stub_getxattr,
+    .fgetxattr = br_stub_fgetxattr,
+    .fsetxattr = br_stub_fsetxattr,
+    .writev = br_stub_writev,
+    .truncate = br_stub_truncate,
+    .ftruncate = br_stub_ftruncate,
+    .mknod = br_stub_mknod,
+    .readv = br_stub_readv,
+    .removexattr = br_stub_removexattr,
+    .fremovexattr = br_stub_fremovexattr,
+    .setxattr = br_stub_setxattr,
+    .opendir = br_stub_opendir,
+    .readdir = br_stub_readdir,
+    .unlink = br_stub_unlink,
+};
+
+/* Lifecycle callbacks. releasedir is registered so that the per-fd
+ * context of directory fds (and the DIR stream it may hold) is closed and
+ * freed by br_stub_releasedir() when the fd goes away. */
+struct xlator_cbks cbks = {
+    .forget = br_stub_forget,
+    .release = br_stub_release,
+    .releasedir = br_stub_releasedir,
+    .ictxmerge = br_stub_ictxmerge,
+};
+
+/* Volume options understood by this xlator; the list is terminated by an
+ * entry whose key is NULL. */
+struct volume_options options[] = {
+    /* on/off switch for the stub; off by default */
+    {.key = {"bitrot"},
+     .type = GF_OPTION_TYPE_BOOL,
+     .default_value = "off",
+     .op_version = {GD_OP_VERSION_3_7_0},
+     .flags = OPT_FLAG_SETTABLE | OPT_FLAG_FORCE,
+     .tags = {"bitrot"},
+     .description = "enable/disable bitrot stub"},
+    /* brick path; the default is a template placeholder */
+    {.key = {"export"},
+     .type = GF_OPTION_TYPE_PATH,
+     .op_version = {GD_OP_VERSION_3_7_0},
+     .tags = {"bitrot"},
+     .description = "brick path for versioning",
+     .default_value = "{{ brick.path }}"},
+    {.key = {NULL}},
+};
+
+/* Descriptor exported by this xlator: ties together its lifecycle entry
+ * points, the fop and callback tables and the option list defined above. */
+xlator_api_t xlator_api = {
+    .init = init,
+    .fini = fini,
+    .notify = notify,
+    .reconfigure = reconfigure,
+    .mem_acct_init = mem_acct_init,
+    .op_version = {1}, /* Present from the initial version */
+    .fops = &fops,
+    .cbks = &cbks,
+    .options = options,
+    .identifier = "bitrot-stub",
+    .category = GF_MAINTAINED,
+};
diff --git a/xlators/features/bit-rot/src/stub/bit-rot-stub.h b/xlators/features/bit-rot/src/stub/bit-rot-stub.h
new file mode 100644
index 00000000000..edd79a77e4f
--- /dev/null
+++ b/xlators/features/bit-rot/src/stub/bit-rot-stub.h
@@ -0,0 +1,515 @@
+/*
+  Copyright (c) 2015 Red Hat, Inc. <http://www.redhat.com>
+  This file is part of GlusterFS.
+
+  This file is licensed to you under your choice of the GNU Lesser
+  General Public License, version 3 or any later version (LGPLv3 or
+  later), or the GNU General Public License, version 2 (GPLv2), in all
+  cases as published by the Free Software Foundation.
+*/
+#ifndef __BIT_ROT_STUB_H__
+#define __BIT_ROT_STUB_H__
+
+#include <glusterfs/glusterfs.h>
+#include <glusterfs/logging.h>
+#include <glusterfs/dict.h>
+#include <glusterfs/xlator.h>
+#include <glusterfs/defaults.h>
+#include <glusterfs/call-stub.h>
+#include "bit-rot-stub-mem-types.h"
+#include <glusterfs/syscall.h>
+#include <glusterfs/common-utils.h>
+#include "bit-rot-common.h"
+#include "bit-rot-stub-messages.h"
+#include "glusterfs3-xdr.h"
+#include <glusterfs/syncop.h>
+#include <glusterfs/syncop-utils.h>
+
+/* 1 MiB; by its name, the stack size for the bad-object thread */
+#define BAD_OBJECT_THREAD_STACK_SIZE ((size_t)(1024 * 1024))
+/* 64 KiB; by its name, the size of a dump string buffer */
+#define BR_STUB_DUMP_STR_SIZE 65536
+
+/* PATH_MAX plus headroom, for buffers holding a path with extra components
+ * appended (BR_PATH_MAX_EXTRA sizes stub_basepath in br_stub_private_t) */
+#define BR_PATH_MAX_EXTRA (PATH_MAX + 1024)
+#define BR_PATH_MAX_PLUS (PATH_MAX + 2048)
+
+/*
+ * The quarantine directory name was originally misspelled. The corrected
+ * name is BR_STUB_QUARANTINE_DIR; the old spelling is kept under the
+ * OLD_ prefix (presumably so existing directories can still be found --
+ * confirm against the users of this macro).
+ */
+#define OLD_BR_STUB_QUARANTINE_DIR GF_HIDDEN_PATH "/quanrantine"
+#define BR_STUB_QUARANTINE_DIR GF_HIDDEN_PATH "/quarantine"
+
+/* do not reference frame->local in cbk unless initialized.
+ * Assigned 0x1 marks versioning flag between call path and
+ * cbk path.
+ */
+
+/* Call path: if versioning is on, tag the frame with the 0x1 marker and
+ * continue; otherwise jump to @label. */
+#define BR_STUB_VER_NOT_ACTIVE_THEN_GOTO(frame, priv, label)                   \
+    do {                                                                       \
+        if (priv->do_versioning)                                               \
+            frame->local = (void *)0x1;                                        \
+        else                                                                   \
+            goto label;                                                        \
+    } while (0)
+
+/* Jump to @label when versioning is off or @cond holds. */
+#define BR_STUB_VER_COND_GOTO(priv, cond, label)                               \
+    do {                                                                       \
+        if (!priv->do_versioning || cond)                                      \
+            goto label;                                                        \
+    } while (0)
+
+/* Callback path: set @flag to _gf_true when the call path left anything in
+ * frame->local (marker or real local), then clear the marker so it is
+ * never mistaken for a pointer. @flag is left untouched otherwise. */
+#define BR_STUB_VER_ENABLED_IN_CALLPATH(frame, flag)                           \
+    do {                                                                       \
+        if (frame->local)                                                      \
+            flag = _gf_true;                                                   \
+        if (frame->local == (void *)0x1)                                       \
+            frame->local = NULL;                                               \
+    } while (0)
+
+/* Clear the 0x1 marker from frame->local, leaving a real local alone. */
+#define BR_STUB_RESET_LOCAL_NULL(frame)                                        \
+    do {                                                                       \
+        if (frame->local == (void *)0x1)                                       \
+            frame->local = NULL;                                               \
+    } while (0)
+
+/* Function type with the (frame, cookie, this, op_ret, op_errno, xdata)
+ * callback signature. */
+typedef int(br_stub_version_cbk)(call_frame_t *, void *, xlator_t *, int32_t,
+                                 int32_t, dict_t *);
+
+/* Per-inode state kept by this xlator (stored as the inode context). */
+typedef struct br_stub_inode_ctx {
+    int need_writeback;           /* does the inode need
+                                        a writeback to disk?
+                                        Bitmask of I_DIRTY / I_MODIFIED */
+    unsigned long currentversion; /* ongoing version */
+
+    int info_sign; /* sign state, holds BR_SIGN_* values; see
+                      __br_stub_inode_sign_state() */
+    struct list_head fd_list; /* list of open fds or fds participating in
+                                 write operations */
+    gf_boolean_t bad_object; /* _gf_true once the object is marked bad */
+} br_stub_inode_ctx_t;
+
+/* Per-fd state kept by this xlator (stored as the fd context). */
+typedef struct br_stub_fd {
+    fd_t *fd;              /* the fd this context belongs to */
+    struct list_head list; /* link in br_stub_inode_ctx_t.fd_list */
+    struct bad_object_dir {
+        DIR *dir; /* directory stream; closed in br_stub_releasedir() */
+        off_t dir_eof;
+    } bad_object;
+} br_stub_fd_t;
+
+/* bits of br_stub_inode_ctx_t.need_writeback */
+#define I_DIRTY (1 << 0) /* inode needs writeback */
+#define I_MODIFIED (1 << 1) /* set/cleared by the *_inode_modified helpers */
+#define WRITEBACK_DURABLE 1 /* writeback is durable */
+
+/**
+ * This could just have been a plain struct without unions and all,
+ * but we may need additional things in the future.
+ *
+ * Per-call state attached to frame->local between a fop and its callback.
+ */
+typedef struct br_stub_local {
+    call_stub_t *fopstub; /* stub for original fop */
+
+    int versioningtype; /* not much used atm; one of the BR_STUB_*_VERSIONING
+                           values defined below */
+
+    union {
+        struct br_stub_ctx {
+            fd_t *fd;
+            uuid_t gfid;
+            inode_t *inode; /* read back in br_stub_unlink_cbk() */
+            unsigned long version;
+        } context;
+    } u;
+} br_stub_local_t;
+
+/* values for br_stub_local_t.versioningtype */
+#define BR_STUB_NO_VERSIONING (1 << 0)
+#define BR_STUB_INCREMENTAL_VERSIONING (1 << 1)
+
+/* Xlator-wide private state (this->private). */
+typedef struct br_stub_private {
+    gf_boolean_t do_versioning; /* checked by the BR_STUB_VER_* macros to
+                                   decide whether fops are intercepted */
+
+    uint32_t boot[2];
+    char export[PATH_MAX]; /* presumably the brick path from the "export"
+                              option -- confirm in init() */
+
+    pthread_mutex_t lock;
+    pthread_cond_t cond;
+
+    struct list_head squeue; /* ordered signing queue */
+    pthread_t signth;
+    /* thread, lock, condition variable and queue grouped for bad-object
+     * work */
+    struct bad_objects_container {
+        pthread_t thread;
+        pthread_mutex_t bad_lock;
+        pthread_cond_t bad_cond;
+        struct list_head bad_queue;
+    } container;
+    struct mem_pool *local_pool;
+
+    char stub_basepath[BR_PATH_MAX_EXTRA];
+
+    uuid_t bad_object_dir_gfid; /* compared against loc->gfid/pargfid in
+                                   br_stub_lookup() */
+} br_stub_private_t;
+
+br_stub_fd_t *
+br_stub_fd_new(void);
+
+int
+__br_stub_fd_ctx_set(xlator_t *this, fd_t *fd, br_stub_fd_t *br_stub_fd);
+
+br_stub_fd_t *
+__br_stub_fd_ctx_get(xlator_t *this, fd_t *fd);
+
+br_stub_fd_t *
+br_stub_fd_ctx_get(xlator_t *this, fd_t *fd);
+
+int32_t
+br_stub_fd_ctx_set(xlator_t *this, fd_t *fd, br_stub_fd_t *br_stub_fd);
+
+/* bad-object flag helpers; callers in this file invoke them with
+ * inode->lock held */
+
+/* Return the bad-object flag stored in @ctx. */
+static inline gf_boolean_t
+__br_stub_is_bad_object(br_stub_inode_ctx_t *ctx)
+{
+    return ctx->bad_object;
+}
+
+/* Set the bad-object flag in @ctx. */
+static inline void
+__br_stub_mark_object_bad(br_stub_inode_ctx_t *ctx)
+{
+    ctx->bad_object = _gf_true;
+}
+
+/* inode writeback helpers: set, clear and test the I_DIRTY bit of
+ * need_writeback */
+static inline void
+__br_stub_mark_inode_dirty(br_stub_inode_ctx_t *ctx)
+{
+    ctx->need_writeback |= I_DIRTY;
+}
+
+static inline void
+__br_stub_mark_inode_synced(br_stub_inode_ctx_t *ctx)
+{
+    ctx->need_writeback &= ~I_DIRTY;
+}
+
+/* Returns non-zero (the I_DIRTY bit itself) when the inode is dirty. */
+static inline int
+__br_stub_is_inode_dirty(br_stub_inode_ctx_t *ctx)
+{
+    return (ctx->need_writeback & I_DIRTY);
+}
+
+/* inode modification markers: set, clear and test the I_MODIFIED bit of
+ * need_writeback */
+static inline void
+__br_stub_set_inode_modified(br_stub_inode_ctx_t *ctx)
+{
+    ctx->need_writeback |= I_MODIFIED;
+}
+
+static inline void
+__br_stub_unset_inode_modified(br_stub_inode_ctx_t *ctx)
+{
+    ctx->need_writeback &= ~I_MODIFIED;
+}
+
+/* Returns non-zero (the I_MODIFIED bit itself) when the inode is marked
+ * modified. */
+static inline int
+__br_stub_is_inode_modified(br_stub_inode_ctx_t *ctx)
+{
+    return (ctx->need_writeback & I_MODIFIED);
+}
+
+/**
+ * Allocate a br_stub_fd_t for @fd and store it as the fd context, so that
+ * the release callback finds (and frees) it later.
+ *
+ * On success returns 0 and hands the new context back through @fd_ctx.
+ * Returns -1 when the allocation fails, or the non-zero result of
+ * br_stub_fd_ctx_set() when storing the context fails; in both cases
+ * @fd_ctx is left untouched and nothing stays allocated.
+ */
+static inline int
+br_stub_require_release_call(xlator_t *this, fd_t *fd, br_stub_fd_t **fd_ctx)
+{
+    int32_t ret = 0;
+    br_stub_fd_t *br_stub_fd = NULL;
+
+    br_stub_fd = br_stub_fd_new();
+    if (!br_stub_fd)
+        return -1;
+
+    br_stub_fd->fd = fd;
+    INIT_LIST_HEAD(&br_stub_fd->list);
+
+    ret = br_stub_fd_ctx_set(this, fd, br_stub_fd);
+    if (ret) {
+        gf_smsg(this->name, GF_LOG_WARNING, 0, BRS_MSG_SET_CONTEXT_FAILED,
+                NULL);
+        /* not attached to the fd, so nobody else would ever free it */
+        GF_FREE(br_stub_fd);
+    } else {
+        *fd_ctx = br_stub_fd;
+    }
+
+    return ret;
+}
+
+/* get/set inode context helpers */
+
+/* Fetch this xlator's context value for @inode into @ctx using the
+ * non-locking __inode_ctx_get(); br_stub_get_inode_ctx() is the variant
+ * that takes inode->lock itself. Returns what __inode_ctx_get() returns. */
+static inline int
+__br_stub_get_inode_ctx(xlator_t *this, inode_t *inode, uint64_t *ctx)
+{
+    return __inode_ctx_get(inode, this, ctx);
+}
+
+/* Locked wrapper around __br_stub_get_inode_ctx(): takes inode->lock for
+ * the duration of the fetch. The lock is released before returning, so
+ * the caller gets the raw context value without any lifetime guarantee. */
+static inline int
+br_stub_get_inode_ctx(xlator_t *this, inode_t *inode, uint64_t *ctx)
+{
+    int ret = -1;
+
+    LOCK(&inode->lock);
+    {
+        ret = __br_stub_get_inode_ctx(this, inode, ctx);
+    }
+    UNLOCK(&inode->lock);
+
+    return ret;
+}
+
+/* Store @ctx (as an integer-encoded pointer) as this xlator's context on
+ * @inode. Returns what inode_ctx_set() returns. */
+static inline int
+br_stub_set_inode_ctx(xlator_t *this, inode_t *inode, br_stub_inode_ctx_t *ctx)
+{
+    uint64_t ctx_addr = (uint64_t)(uintptr_t)ctx;
+    return inode_ctx_set(inode, this, &ctx_addr);
+}
+
+/* version get/set helpers */
+
+/* Version one past the inode's current (ongoing) version. */
+static inline unsigned long
+__br_stub_writeback_version(br_stub_inode_ctx_t *ctx)
+{
+    return (ctx->currentversion + 1);
+}
+
+/* Advance the ongoing version of @ctx to @version. The version only ever
+ * moves forward: a value that is not strictly greater than the current one
+ * is rejected with a warning and the context is left unchanged. */
+static inline void
+__br_stub_set_ongoing_version(br_stub_inode_ctx_t *ctx, unsigned long version)
+{
+    if (version > ctx->currentversion) {
+        ctx->currentversion = version;
+        return;
+    }
+
+    gf_smsg("bit-rot-stub", GF_LOG_WARNING, 0, BRS_MSG_CHANGE_VERSION_FAILED,
+            "current version=%lu", ctx->currentversion, "new version=%lu",
+            version, NULL);
+}
+
+/**
+ * Decide whether a release event may be sent for the inode.
+ *
+ * Returns 1 when the inode is marked modified, no fd is left on its fd
+ * list and the sign state is not BR_SIGN_REOPEN_WAIT; in that case
+ * @version (if non-NULL) receives the current version converted with
+ * htonl(). Returns 0 otherwise, leaving @version untouched.
+ * @inode is not used.
+ */
+static inline int
+__br_stub_can_trigger_release(inode_t *inode, br_stub_inode_ctx_t *ctx,
+                              unsigned long *version)
+{
+    /**
+     * If the inode is modified, then it has to be dirty. An inode is
+     * marked dirty once version is increased. Its marked as modified
+     * when the modification call (write/truncate) which triggered
+     * the versioning is successful.
+     *
+     * NOTE(review): the assertion below checks that the inode is NOT
+     * dirty, which reads as the opposite of the sentence above --
+     * confirm which one is intended.
+     */
+    if (__br_stub_is_inode_modified(ctx) && list_empty(&ctx->fd_list) &&
+        (ctx->info_sign != BR_SIGN_REOPEN_WAIT)) {
+        GF_ASSERT(__br_stub_is_inode_dirty(ctx) == 0);
+
+        if (version)
+            *version = htonl(ctx->currentversion);
+        return 1;
+    }
+
+    return 0;
+}
+
+/* Read the ongoing version of @inode into @version under inode->lock.
+ * Returns the (negative) result of __inode_ctx_get() when the inode has
+ * no context, in which case @version is not written. */
+static inline int32_t
+br_stub_get_ongoing_version(xlator_t *this, inode_t *inode,
+                            unsigned long *version)
+{
+    int32_t rc = 0;
+    uint64_t addr = 0;
+    br_stub_inode_ctx_t *ictx = NULL;
+
+    LOCK(&inode->lock);
+    {
+        rc = __inode_ctx_get(inode, this, &addr);
+        if (rc >= 0) {
+            ictx = (br_stub_inode_ctx_t *)(long)addr;
+            *version = ictx->currentversion;
+        }
+    }
+    UNLOCK(&inode->lock);
+
+    return rc;
+}
+
+/**
+ * Return the inode context of @inode, or NULL when it has none. When
+ * @version is non-NULL it also receives the context's current version.
+ * inode->lock should be held before invoking this as context
+ * *needs* to be valid in the caller.
+ */
+static inline br_stub_inode_ctx_t *
+__br_stub_get_ongoing_version_ctx(xlator_t *this, inode_t *inode,
+                                  unsigned long *version)
+{
+    uint64_t addr = 0;
+    br_stub_inode_ctx_t *ictx = NULL;
+
+    if (__inode_ctx_get(inode, this, &addr) < 0)
+        return NULL;
+
+    ictx = (br_stub_inode_ctx_t *)(long)addr;
+    if (version != NULL)
+        *version = ictx->currentversion;
+
+    return ictx;
+}
+
+/* filter for xattr fetch: returns 1 when @name starts with the current
+ * version key or the signing version key (prefix match), 0 otherwise,
+ * including for a NULL @name */
+static inline int
+br_stub_is_internal_xattr(const char *name)
+{
+    if (!name)
+        return 0;
+
+    if (strncmp(name, BITROT_CURRENT_VERSION_KEY,
+                SLEN(BITROT_CURRENT_VERSION_KEY)) == 0)
+        return 1;
+
+    if (strncmp(name, BITROT_SIGNING_VERSION_KEY,
+                SLEN(BITROT_SIGNING_VERSION_KEY)) == 0)
+        return 1;
+
+    return 0;
+}
+
+/* Strip the bitrot virtual xattrs (current version, signing version,
+ * signing xattr size) from @xattr; a NULL dict is ignored. The bad-object
+ * key is removed as well only when @remove_bad_marker is set. */
+static inline void
+br_stub_remove_vxattrs(dict_t *xattr, gf_boolean_t remove_bad_marker)
+{
+    if (!xattr)
+        return;
+
+    /*
+     * A corrupted file must keep its bad-object entry in the dict while
+     * the remaining info (version, signature, ...) is dropped. The
+     * caller knows which case applies and says so through
+     * remove_bad_marker.
+     */
+    if (remove_bad_marker)
+        dict_del(xattr, BITROT_OBJECT_BAD_KEY);
+
+    dict_del(xattr, BITROT_CURRENT_VERSION_KEY);
+    dict_del(xattr, BITROT_SIGNING_VERSION_KEY);
+    dict_del(xattr, BITROT_SIGNING_XATTR_SIZE_KEY);
+}
+
+/**
+ * Report whether @inode is marked bad in its inode context:
+ *  0  => the context says the object is not bad
+ * -1  => the inode context could not be fetched (logged as an error)
+ * -2  => the context says the object is bad
+ * Both negative values mean the calling fop is to be failed and the
+ * error returned upwards. Should more conditions need reporting in the
+ * future, these values can be turned into an enum.
+ */
+static inline int
+br_stub_is_bad_object(xlator_t *this, inode_t *inode)
+{
+    int status = 0;
+    uint64_t addr = 0;
+    br_stub_inode_ctx_t *ictx = NULL;
+
+    if (br_stub_get_inode_ctx(this, inode, &addr)) {
+        gf_smsg(this->name, GF_LOG_ERROR, 0, BRS_MSG_GET_INODE_CONTEXT_FAILED,
+                "inode-gfid=%s", uuid_utoa(inode->gfid), NULL);
+        return -1;
+    }
+
+    ictx = (br_stub_inode_ctx_t *)(long)addr;
+
+    LOCK(&inode->lock);
+    {
+        if (__br_stub_is_bad_object(ictx))
+            status = -2;
+    }
+    UNLOCK(&inode->lock);
+
+    return status;
+}
+
+/* Flag @inode as a bad object in its inode context (under inode->lock).
+ * Returns 0 on success, or the non-zero result of the context lookup when
+ * the inode has no context (logged as an error). */
+static inline int32_t
+br_stub_mark_object_bad(xlator_t *this, inode_t *inode)
+{
+    uint64_t addr = 0;
+    br_stub_inode_ctx_t *ictx = NULL;
+    int32_t rc = br_stub_get_inode_ctx(this, inode, &addr);
+
+    if (rc) {
+        gf_smsg(this->name, GF_LOG_ERROR, 0, BRS_MSG_GET_INODE_CONTEXT_FAILED,
+                "inode-gfid=%s", uuid_utoa(inode->gfid), NULL);
+        return rc;
+    }
+
+    ictx = (br_stub_inode_ctx_t *)(long)addr;
+
+    LOCK(&inode->lock);
+    {
+        __br_stub_mark_object_bad(ictx);
+    }
+    UNLOCK(&inode->lock);
+
+    return rc;
+}
+
+/**
+ * Tag @xdata with GLUSTERFS_BAD_INODE when @inode is known to be bad.
+ * Returns 0 when the object is not bad (or its state is unknown);
+ * otherwise the result of dict_set_int32() is passed through, since the
+ * set may fail and it is the caller who decides what to do about that.
+ */
+static inline int32_t
+br_stub_mark_xdata_bad_object(xlator_t *this, inode_t *inode, dict_t *xdata)
+{
+    if (br_stub_is_bad_object(this, inode) != -2)
+        return 0;
+
+    return dict_set_int32(xdata, GLUSTERFS_BAD_INODE, 1);
+}
+
+int32_t
+br_stub_add_fd_to_inode(xlator_t *this, fd_t *fd, br_stub_inode_ctx_t *ctx);
+
+br_sign_state_t
+__br_stub_inode_sign_state(br_stub_inode_ctx_t *ctx, glusterfs_fop_t fop,
+                           fd_t *fd);
+
+int
+br_stub_dir_create(xlator_t *this, br_stub_private_t *priv);
+
+int
+br_stub_add(xlator_t *this, uuid_t gfid);
+
+int32_t
+br_stub_create_stub_gfid(xlator_t *this, char *stub_gfid_path, uuid_t gfid);
+
+int
+br_stub_dir_create(xlator_t *this, br_stub_private_t *priv);
+
+call_stub_t *
+__br_stub_dequeue(struct list_head *callstubs);
+
+void
+__br_stub_enqueue(struct list_head *callstubs, call_stub_t *stub);
+
+void
+br_stub_worker_enqueue(xlator_t *this, call_stub_t *stub);
+
+void *
+br_stub_worker(void *data);
+
+int32_t
+br_stub_lookup_wrapper(call_frame_t *frame, xlator_t *this, loc_t *loc,
+                       dict_t *xattr_req);
+
+int32_t
+br_stub_readdir_wrapper(call_frame_t *frame, xlator_t *this, fd_t *fd,
+                        size_t size, off_t off, dict_t *xdata);
+
+int
+br_stub_del(xlator_t *this, uuid_t gfid);
+
+int
+br_stub_bad_objects_path(xlator_t *this, fd_t *fd, gf_dirent_t *entries,
+                         dict_t **dict);
+
+void
+br_stub_entry_xattr_fill(xlator_t *this, char *hpath, gf_dirent_t *entry,
+                         dict_t *dict);
+
+int
+br_stub_get_path_of_gfid(xlator_t *this, inode_t *parent, inode_t *inode,
+                         uuid_t gfid, char **path);
+
+#endif /* __BIT_ROT_STUB_H__ */
diff --git a/xlators/features/changelog/Makefile.am b/xlators/features/changelog/Makefile.am
new file mode 100644
index 00000000000..153bb685076
--- /dev/null
+++ b/xlators/features/changelog/Makefile.am
@@ -0,0 +1,3 @@
+SUBDIRS = src lib
+
+CLEANFILES =
diff --git a/xlators/features/changelog/lib/Makefile.am b/xlators/features/changelog/lib/Makefile.am
new file mode 100644
index 00000000000..a985f42a877
--- /dev/null
+++ b/xlators/features/changelog/lib/Makefile.am
@@ -0,0 +1,3 @@
+SUBDIRS = src
+
+CLEANFILES =
diff --git a/xlators/features/changelog/lib/examples/c/get-changes-multi.c b/xlators/features/changelog/lib/examples/c/get-changes-multi.c
new file mode 100644
index 00000000000..5ea5bbb6630
--- /dev/null
+++ b/xlators/features/changelog/lib/examples/c/get-changes-multi.c
@@ -0,0 +1,90 @@
+/*
+   Copyright (c) 2013 Red Hat, Inc. <http://www.redhat.com>
+   This file is part of GlusterFS.
+
+   This file is licensed to you under your choice of the GNU Lesser
+   General Public License, version 3 or any later version (LGPLv3 or
+   later), or the GNU General Public License, version 2 (GPLv2), in all
+   cases as published by the Free Software Foundation.
+*/
+
+/**
+ * Compile it using:
+ *  gcc -o getchanges-multi `pkg-config --cflags libgfchangelog` \
+ *  get-changes-multi.c `pkg-config --libs libgfchangelog`
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <sys/un.h>
+#include <limits.h>
+#include <sys/socket.h>
+#include <sys/types.h>
+#include <errno.h>
+
+#include "changelog.h"
+
+void *
+brick_init(void *xl, struct gf_brick_spec *brick)
+{
+    return brick; /* returns the spec pointer itself; xl is unused */
+}
+
+void
+brick_fini(void *xl, char *brick, void *data)
+{
+    return; /* no-op: all three arguments are ignored */
+}
+
+void
+brick_callback(void *xl, char *brick, void *data, changelog_event_t *ev)
+{ /* prints brick and ev->ev_type; xl and data are unused */
+    printf("->callback: (brick,type) [%s:%d]\n", brick, ev->ev_type);
+}
+
+void
+fill_brick_spec(struct gf_brick_spec *brick, char *path)
+{
+    /* hooks first: connect/disconnect notifications are left unset */
+    brick->connected = NULL;
+    brick->disconnected = NULL;
+    brick->callback = brick_callback;
+    brick->fini = brick_fini;
+    brick->init = brick_init;
+    brick->filter = CHANGELOG_OP_TYPE_BR_RELEASE;
+    brick->brick_path = strdup(path); /* heap copy of the caller's path */
+}
+
+/*
+ * Register two bricks with libgfchangelog and then sleep forever while the
+ * library delivers events to brick_callback(). Every path exits with -1.
+ */
+int
+main(int argc, char **argv)
+{
+    /* zeroed two-element array of brick specs, handed over as one unit */
+    struct gf_brick_spec *specs = calloc(2, sizeof(struct gf_brick_spec));
+
+    if (specs == NULL)
+        return -1;
+
+    fill_brick_spec(&specs[0], "/export/z1/zwoop");
+    fill_brick_spec(&specs[1], "/export/z2/zwoop");
+
+    if (gf_changelog_init(NULL) != 0)
+        return -1;
+
+    /* register both specs in a single call; the library logs to
+     * /tmp/multi-changes.log */
+    if (gf_changelog_register_generic(specs, 2, 0, "/tmp/multi-changes.log", 9,
+                                      NULL) != 0)
+        return -1;
+
+    /* let callbacks do the job */
+    select(0, NULL, NULL, NULL, NULL);
+
+    /* select() with no descriptors and no timeout only comes back when it
+     * is interrupted; the example treats that like any other failure */
+    return -1;
+}
diff --git a/xlators/features/changelog/lib/examples/c/get-changes.c b/xlators/features/changelog/lib/examples/c/get-changes.c
new file mode 100644
index 00000000000..8bc651c24a4
--- /dev/null
+++ b/xlators/features/changelog/lib/examples/c/get-changes.c
@@ -0,0 +1,93 @@
+/*
+   Copyright (c) 2013 Red Hat, Inc. <http://www.redhat.com>
+   This file is part of GlusterFS.
+
+   This file is licensed to you under your choice of the GNU Lesser
+   General Public License, version 3 or any later version (LGPLv3 or
+   later), or the GNU General Public License, version 2 (GPLv2), in all
+   cases as published by the Free Software Foundation.
+*/
+
+/**
+ * get set of new changes every 10 seconds (just print the file names)
+ *
+ * Compile it using:
+ *  gcc -o getchanges `pkg-config --cflags libgfchangelog` get-changes.c \
+ *  `pkg-config --libs libgfchangelog`
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <sys/un.h>
+#include <limits.h>
+#include <sys/socket.h>
+#include <sys/types.h>
+#include <errno.h>
+
+#include "changelog.h"
+
+#define handle_error(fn) printf("%s (reason: %s)\n", fn, strerror(errno))
+
+int
+main(int argc, char **argv)
+{
+    int i = 0;
+    int ret = 0;
+    ssize_t nr_changes = 0;
+    ssize_t changes = 0;
+    char fbuf[PATH_MAX] = {0};
+
+    ret = gf_changelog_init(NULL);
+    if (ret) {
+        handle_error("Init failed");
+        goto out;
+    }
+
+    /* get changes for brick "/export/z1/zwoop", scratch dir "/tmp/scratch" */
+    ret = gf_changelog_register("/export/z1/zwoop", "/tmp/scratch",
+                                "/tmp/change.log", 9, 5);
+    if (ret) {
+        handle_error("register failed");
+        goto out;
+    }
+
+    /* poll forever: scan, print and acknowledge every pending changelog */
+    while (1) {
+        i = 0;
+        nr_changes = gf_changelog_scan();
+        if (nr_changes < 0) {
+            handle_error("scan(): ");
+            ret = -1; /* a failed scan must not exit with a stale status */
+            break;
+        }
+
+        if (nr_changes == 0)
+            goto next;
+
+        printf("Got %zd changelog files\n", nr_changes); /* ssize_t: %zd */
+
+        while ((changes = gf_changelog_next_change(fbuf, PATH_MAX)) > 0) {
+            printf("changelog file [%d]: %s\n", ++i, fbuf);
+
+            /* process changelog */
+            /* ... */
+            /* ... */
+            /* ... */
+            /* done processing */
+
+            ret = gf_changelog_done(fbuf);
+            if (ret)
+                handle_error("gf_changelog_done");
+        }
+
+        if (changes == -1)
+            handle_error("gf_changelog_next_change");
+
+    next:
+        sleep(10);
+    }
+
+out:
+    return ret;
+}
diff --git a/xlators/features/changelog/lib/examples/c/get-history.c b/xlators/features/changelog/lib/examples/c/get-history.c
new file mode 100644
index 00000000000..3e888d75ca6
--- /dev/null
+++ b/xlators/features/changelog/lib/examples/c/get-history.c
@@ -0,0 +1,116 @@
+/*
+   Copyright (c) 2013 Red Hat, Inc. <http://www.redhat.com>
+   This file is part of GlusterFS.
+
+   This file is licensed to you under your choice of the GNU Lesser
+   General Public License, version 3 or any later version (LGPLv3 or
+   later), or the GNU General Public License, version 2 (GPLv2), in all
+   cases as published by the Free Software Foundation.
+*/
+
+/**
+ * get the changelogs between a user-supplied start and end (just print names)
+ *
+ * Compile it using:
+ *  gcc -o gethistory `pkg-config --cflags libgfchangelog` get-history.c \
+ *  `pkg-config --libs libgfchangelog`
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <sys/un.h>
+#include <limits.h>
+#include <sys/socket.h>
+#include <sys/types.h>
+#include <errno.h>
+
+#include "changelog.h"
+
+#define handle_error(fn) printf("%s (reason: %s)\n", fn, strerror(errno))
+
+/*
+ * Ask for a <start> <end> pair on stdin, request the historical changelogs
+ * for it and print/acknowledge every changelog file the library hands back.
+ */
+int
+main(int argc, char **argv)
+{
+    int i = 0;
+    int ret = 0;
+    int a = 0;
+    int b = 0;
+    ssize_t nr_changes = 0;
+    ssize_t changes = 0;
+    char fbuf[PATH_MAX] = {0};
+    unsigned long end_ts = 0;
+
+    ret = gf_changelog_init(NULL);
+    if (ret) {
+        handle_error("init failed");
+        goto out;
+    }
+
+    ret = gf_changelog_register("/export/z1/zwoop", "/tmp/scratch_v1",
+                                "/tmp/changes.log", 9, 5);
+    if (ret) {
+        handle_error("register failed");
+        goto out;
+    }
+
+    printf("give the two numbers start and end\t");
+    /* both values must parse; otherwise the range below would be garbage */
+    if (scanf("%d%d", &a, &b) != 2) {
+        printf("expected two integers: <start> <end>\n");
+        ret = -1;
+        goto out;
+    }
+
+    ret = gf_history_changelog("/export/z1/zwoop/.glusterfs/changelogs", a, b,
+                               3, &end_ts);
+    if (ret == -1) {
+        printf("history failed");
+        goto out;
+    }
+
+    /* end_ts is unsigned long, hence %lu */
+    printf("end time till when changelog available : %lu , ret(%d) \t",
+           end_ts, ret);
+    fflush(stdout);
+
+    /* keep scanning until a scan reports zero changelogs or fails */
+    while (1) {
+        nr_changes = gf_history_changelog_scan();
+        printf("scanned, nr_changes : %zd\n", nr_changes);
+        if (nr_changes < 0) {
+            handle_error("scan(): ");
+            ret = -1; /* do not exit with a stale success status */
+            break;
+        }
+
+        if (nr_changes == 0) {
+            printf("done scanning \n");
+            goto out;
+        }
+
+        printf("Got %zd changelog files\n", nr_changes);
+
+        while ((changes = gf_history_changelog_next_change(fbuf, PATH_MAX)) >
+               0) {
+            printf("changelog file [%d]: %s\n", ++i, fbuf);
+
+            /* process changelog */
+            /* ... */
+            /* ... */
+            /* ... */
+            /* done processing */
+
+            ret = gf_history_changelog_done(fbuf);
+            if (ret)
+                handle_error("gf_changelog_done");
+        }
+    }
+
+out:
+    return ret;
+}
diff --git a/xlators/features/changelog/lib/examples/python/changes.py b/xlators/features/changelog/lib/examples/python/changes.py
new file mode 100755
index 00000000000..c410d3b000d
--- /dev/null
+++ b/xlators/features/changelog/lib/examples/python/changes.py
@@ -0,0 +1,34 @@
+#!/usr/bin/python3
+
+from __future__ import print_function
+import os
+import sys
+import time
+import libgfchangelog
+
+cl = libgfchangelog.Changes()
+
+def get_changes(brick, scratch_dir, log_file, log_level, interval):
+    """Poll *brick* for changelogs forever, printing and acknowledging each."""
+    try:
+        cl.cl_init()
+        cl.cl_register(brick, scratch_dir, log_file, log_level)
+        while True:
+            cl.cl_scan()
+            pending = cl.cl_getchanges()
+            if pending:
+                print(pending)
+            for entry in pending:
+                print(('done with %s' % (entry)))
+                cl.cl_done(entry)
+            time.sleep(interval)
+    except OSError as err:
+        # an OSError from any call above is printed and ends the polling loop
+        print(err)
+
+if __name__ == '__main__':
+    if len(sys.argv) != 5:  # program name + the four arguments in the usage
+        print(("usage: %s <brick> <scratch-dir> <log-file> <fetch-interval>"
+              % (sys.argv[0])))
+        sys.exit(1)
+    get_changes(sys.argv[1], sys.argv[2], sys.argv[3], 9, int(sys.argv[4]))
diff --git a/xlators/features/changelog/lib/examples/python/libgfchangelog.py b/xlators/features/changelog/lib/examples/python/libgfchangelog.py
new file mode 100644
index 00000000000..2da9f2d2a8c
--- /dev/null
+++ b/xlators/features/changelog/lib/examples/python/libgfchangelog.py
@@ -0,0 +1,71 @@
+import os
+from ctypes import *
+from ctypes.util import find_library
+
+class Changes(object):
+    """ctypes wrapper for libgfchangelog; a -1 return raises OSError(errno)."""
+    libgfc = CDLL(find_library("gfchangelog"), mode=RTLD_GLOBAL, use_errno=True)
+
+    @classmethod
+    def geterrno(cls):
+        return get_errno()
+
+    @classmethod
+    def raise_oserr(cls):
+        errn = cls.geterrno()
+        raise OSError(errn, os.strerror(errn))
+
+    @classmethod
+    def _get_api(cls, call):
+        return getattr(cls.libgfc, call)
+
+    @staticmethod
+    def _cstr(text):
+        # char * parameters need bytes; a Python 3 str would go as wchar_t *
+        return text if isinstance(text, bytes) else text.encode()
+
+    @classmethod
+    def cl_init(cls):
+        if cls._get_api('gf_changelog_init')(None) == -1:
+            cls.raise_oserr()  # raise_changelog_err() was never defined
+
+    @classmethod
+    def cl_register(cls, brick, path, log_file, log_level, retries = 0):
+        ret = cls._get_api('gf_changelog_register')(
+            cls._cstr(brick), cls._cstr(path), cls._cstr(log_file),
+            log_level, retries)
+        if ret == -1:
+            cls.raise_oserr()
+
+    @classmethod
+    def cl_scan(cls):
+        if cls._get_api('gf_changelog_scan')() == -1:
+            cls.raise_oserr()
+
+    @classmethod
+    def cl_startfresh(cls):
+        if cls._get_api('gf_changelog_start_fresh')() == -1:
+            cls.raise_oserr()
+
+    @classmethod
+    def cl_getchanges(cls):
+        """Drain the tracker; return paths sorted by their last '.' field."""
+        changes = []
+        buf = create_string_buffer(4096)  # TODO: remove hardcoded path length
+        call = cls._get_api('gf_changelog_next_change')
+        while True:
+            ret = call(buf, 4096)
+            if ret in (0, -1):
+                break
+            entry = buf.raw[:ret-1]  # ret also counts the trailing newline
+            changes.append(entry if isinstance(entry, str) else entry.decode())
+        if ret == -1:
+            cls.raise_oserr()
+        # cleanup tracker
+        cls.cl_startfresh()
+        return sorted(changes, key=lambda f: f.split('.')[-1])
+
+    @classmethod
+    def cl_done(cls, clfile):
+        if cls._get_api('gf_changelog_done')(cls._cstr(clfile)) == -1:
+            cls.raise_oserr()
diff --git a/xlators/features/changelog/lib/src/Makefile.am b/xlators/features/changelog/lib/src/Makefile.am
new file mode 100644
index 00000000000..c933ec53ed2
--- /dev/null
+++ b/xlators/features/changelog/lib/src/Makefile.am
@@ -0,0 +1,35 @@
+libgfchangelog_la_CFLAGS = -Wall $(GF_CFLAGS) $(GF_DARWIN_LIBGLUSTERFS_CFLAGS) \
+	-DDATADIR=\"$(localstatedir)\"
+
+libgfchangelog_la_CPPFLAGS = $(GF_CPPFLAGS) -D__USE_FILE_OFFSET64 -D__USE_LARGEFILE64 -fpic \
+	-I../../../src/ -I$(top_srcdir)/libglusterfs/src \
+	-I$(top_srcdir)/xlators/features/changelog/src \
+	-I$(top_srcdir)/rpc/xdr/src -I$(top_builddir)/rpc/xdr/src \
+	-I$(top_srcdir)/rpc/rpc-lib/src \
+	-I$(top_srcdir)/rpc/rpc-transport/socket/src \
+	-DDATADIR=\"$(localstatedir)\"
+
+libgfchangelog_la_LIBADD = $(top_builddir)/libglusterfs/src/libglusterfs.la \
+	$(top_builddir)/rpc/xdr/src/libgfxdr.la \
+	$(top_builddir)/rpc/rpc-lib/src/libgfrpc.la
+
+libgfchangelog_la_LDFLAGS = $(GF_LDFLAGS) \
+        -version-info $(LIBGFCHANGELOG_LT_VERSION) \
+        $(GF_NO_UNDEFINED)
+
+lib_LTLIBRARIES = libgfchangelog.la
+
+CONTRIB_BUILDDIR = $(top_builddir)/contrib
+
+libgfchangelog_la_SOURCES = gf-changelog.c gf-changelog-journal-handler.c \
+	gf-changelog-helpers.c gf-changelog-api.c gf-history-changelog.c \
+	gf-changelog-rpc.c gf-changelog-reborp.c \
+	$(top_srcdir)/xlators/features/changelog/src/changelog-rpc-common.c
+
+noinst_HEADERS = gf-changelog-helpers.h gf-changelog-rpc.h \
+	gf-changelog-journal.h changelog-lib-messages.h
+
+CLEANFILES =
+
+$(top_builddir)/libglusterfs/src/libglusterfs.la:
+	$(MAKE) -C $(top_builddir)/libglusterfs/src/ all
diff --git a/xlators/features/changelog/lib/src/changelog-lib-messages.h b/xlators/features/changelog/lib/src/changelog-lib-messages.h
new file mode 100644
index 00000000000..d7fe7274353
--- /dev/null
+++ b/xlators/features/changelog/lib/src/changelog-lib-messages.h
@@ -0,0 +1,74 @@
+/*
+ Copyright (c) 2015 Red Hat, Inc. <http://www.redhat.com>
+ This file is part of GlusterFS.
+
+ This file is licensed to you under your choice of the GNU Lesser
+ General Public License, version 3 or any later version (LGPLv3 or
+ later), or the GNU General Public License, version 2 (GPLv2), in all
+ cases as published by the Free Software Foundation.
+ */
+
+#ifndef _CHANGELOG_LIB_MESSAGES_H_
+#define _CHANGELOG_LIB_MESSAGES_H_
+
+#include <glusterfs/glfs-message-id.h>
+
+/* To add new message IDs, append new identifiers at the end of the list.
+ *
+ * Never remove a message ID. If it's not used anymore, you can rename it or
+ * leave it as it is, but not delete it. This is to prevent reutilization of
+ * IDs by other messages.
+ *
+ * The component name must match one of the entries defined in
+ * glfs-message-id.h.
+ */
+
+GLFS_MSGID(
+    CHANGELOG_LIB, CHANGELOG_LIB_MSG_OPEN_FAILED,
+    CHANGELOG_LIB_MSG_FAILED_TO_RMDIR,
+    CHANGELOG_LIB_MSG_SCRATCH_DIR_ENTRIES_CREATION_ERROR,
+    CHANGELOG_LIB_MSG_THREAD_CREATION_FAILED, CHANGELOG_LIB_MSG_OPENDIR_ERROR,
+    CHANGELOG_LIB_MSG_RENAME_FAILED, CHANGELOG_LIB_MSG_READ_ERROR,
+    CHANGELOG_LIB_MSG_HTIME_ERROR, CHANGELOG_LIB_MSG_GET_TIME_ERROR,
+    CHANGELOG_LIB_MSG_WRITE_FAILED, CHANGELOG_LIB_MSG_PTHREAD_ERROR,
+    CHANGELOG_LIB_MSG_MMAP_FAILED, CHANGELOG_LIB_MSG_MUNMAP_FAILED,
+    CHANGELOG_LIB_MSG_ASCII_ERROR, CHANGELOG_LIB_MSG_STAT_FAILED,
+    CHANGELOG_LIB_MSG_GET_XATTR_FAILED, CHANGELOG_LIB_MSG_PUBLISH_ERROR,
+    CHANGELOG_LIB_MSG_PARSE_ERROR, CHANGELOG_LIB_MSG_MIN_MAX_INFO,
+    CHANGELOG_LIB_MSG_CLEANUP_ERROR, CHANGELOG_LIB_MSG_UNLINK_FAILED,
+    CHANGELOG_LIB_MSG_NOTIFY_REGISTER_FAILED,
+    CHANGELOG_LIB_MSG_INVOKE_RPC_FAILED, CHANGELOG_LIB_MSG_DRAINING_EVENT_INFO,
+    CHANGELOG_LIB_MSG_CLEANING_BRICK_ENTRY_INFO,
+    CHANGELOG_LIB_MSG_FREEING_ENTRY_INFO, CHANGELOG_LIB_MSG_XDR_DECODING_FAILED,
+    CHANGELOG_LIB_MSG_NOTIFY_REGISTER_INFO,
+    CHANGELOG_LIB_MSG_THREAD_CLEANUP_WARNING,
+    CHANGELOG_LIB_MSG_COPY_FROM_BUFFER_FAILED,
+    CHANGELOG_LIB_MSG_PTHREAD_JOIN_FAILED, CHANGELOG_LIB_MSG_HIST_FAILED,
+    CHANGELOG_LIB_MSG_DRAINED_EVENT_INFO, CHANGELOG_LIB_MSG_PARSE_ERROR_CEASED,
+    CHANGELOG_LIB_MSG_REQUESTING_INFO, CHANGELOG_LIB_MSG_FINAL_INFO);
+
+#define CHANGELOG_LIB_MSG_NOTIFY_REGISTER_INFO_STR "Registering brick"
+#define CHANGELOG_LIB_MSG_RENAME_FAILED_STR "error moving changelog file"
+#define CHANGELOG_LIB_MSG_OPEN_FAILED_STR "cannot open changelog file"
+#define CHANGELOG_LIB_MSG_UNLINK_FAILED_STR "failed to unlink"
+#define CHANGELOG_LIB_MSG_FAILED_TO_RMDIR_STR "failed to rmdir"
+#define CHANGELOG_LIB_MSG_STAT_FAILED_STR "stat failed on changelog file"
+#define CHANGELOG_LIB_MSG_PARSE_ERROR_STR "could not parse changelog"
+#define CHANGELOG_LIB_MSG_PARSE_ERROR_CEASED_STR                               \
+    "parsing error, ceased publishing..."
+#define CHANGELOG_LIB_MSG_HTIME_ERROR_STR "fop failed on htime file"
+#define CHANGELOG_LIB_MSG_GET_XATTR_FAILED_STR                                 \
+    "error extracting max timstamp from htime file"
+#define CHANGELOG_LIB_MSG_MIN_MAX_INFO_STR "changelogs min max"
+#define CHANGELOG_LIB_MSG_REQUESTING_INFO_STR "Requesting historical changelogs"
+#define CHANGELOG_LIB_MSG_FINAL_INFO_STR "FINAL"
+#define CHANGELOG_LIB_MSG_HIST_FAILED_STR                                      \
+    "Requested changelog range is not available"
+#define CHANGELOG_LIB_MSG_GET_TIME_ERROR_STR "wrong result"
+#define CHANGELOG_LIB_MSG_CLEANING_BRICK_ENTRY_INFO_STR                        \
+    "Cleaning brick entry for brick"
+#define CHANGELOG_LIB_MSG_DRAINING_EVENT_INFO_STR "Draining event"
+#define CHANGELOG_LIB_MSG_DRAINED_EVENT_INFO_STR "Drained event"
+#define CHANGELOG_LIB_MSG_FREEING_ENTRY_INFO_STR "freeing entry"
+
+#endif /* !_CHANGELOG_LIB_MESSAGES_H_ */
diff --git a/xlators/features/changelog/lib/src/gf-changelog-api.c b/xlators/features/changelog/lib/src/gf-changelog-api.c
new file mode 100644
index 00000000000..81a5cbfec10
--- /dev/null
+++ b/xlators/features/changelog/lib/src/gf-changelog-api.c
@@ -0,0 +1,224 @@
+/*
+   Copyright (c) 2015 Red Hat, Inc. <http://www.redhat.com>
+   This file is part of GlusterFS.
+
+   This file is licensed to you under your choice of the GNU Lesser
+   General Public License, version 3 or any later version (LGPLv3 or
+   later), or the GNU General Public License, version 2 (GPLv2), in all
+   cases as published by the Free Software Foundation.
+*/
+
+#include <glusterfs/compat-uuid.h>
+#include <glusterfs/globals.h>
+#include <glusterfs/glusterfs.h>
+#include <glusterfs/syscall.h>
+
+#include "gf-changelog-helpers.h"
+#include "gf-changelog-journal.h"
+#include "changelog-mem-types.h"
+#include "changelog-lib-messages.h"
+
+/**
+ * @API
+ *  mark @file consumed: rename it from the working to the processed directory
+ */
+int
+gf_changelog_done(char *file)
+{
+    int ret = -1;
+    char *resolved = NULL;
+    xlator_t *this = NULL;
+    gf_changelog_journal_t *jnl = NULL;
+    char to_path[PATH_MAX] = {0};
+
+    /* default error for every validation failure below */
+    errno = EINVAL;
+
+    this = THIS;
+    if (this == NULL)
+        goto out;
+
+    jnl = (gf_changelog_journal_t *)GF_CHANGELOG_GET_API_PTR(this);
+    if (jnl == NULL)
+        goto out;
+
+    if (file == NULL || file[0] == '\0')
+        goto out;
+
+    /* canonicalise, then require the working directory as a path prefix */
+    resolved = realpath(file, NULL);
+    if (resolved == NULL)
+        goto out;
+
+    if (strncmp(jnl->jnl_working_dir, resolved,
+                strlen(jnl->jnl_working_dir)) != 0)
+        goto out;
+
+    /* destination: <processed dir><file name>, no separator inserted */
+    (void)snprintf(to_path, PATH_MAX, "%s%s", jnl->jnl_processed_dir,
+                   basename(resolved));
+    gf_msg_debug(this->name, 0, "moving %s to processed directory", file);
+    ret = sys_rename(resolved, to_path);
+    if (ret != 0)
+        gf_smsg(this->name, GF_LOG_ERROR, errno,
+                CHANGELOG_LIB_MSG_RENAME_FAILED, "from=%s", file, "to=%s",
+                to_path, NULL);
+
+out:
+    free(resolved); /* realpath() buffer; free(NULL) is a no-op */
+    return ret;
+}
+
+/**
+ * @API
+ *  for a set of changelogs, start from the beginning
+ */
+int
+gf_changelog_start_fresh()
+{
+    xlator_t *this = NULL;
+    gf_changelog_journal_t *jnl = NULL;
+
+    errno = EINVAL; /* set first so a NULL THIS also reports an errno */
+
+    this = THIS;
+    if (!this)
+        goto out;
+
+    jnl = (gf_changelog_journal_t *)GF_CHANGELOG_GET_API_PTR(this);
+    if (!jnl)
+        goto out;
+
+    if (gf_ftruncate(jnl->jnl_fd, 0))
+        goto out;
+
+    return 0;
+
+out:
+    return -1;
+}
+
+/**
+ * @API
+ * return the next changelog file entry. zero means all changelogs
+ * consumed.
+ */
+ssize_t
+gf_changelog_next_change(char *bufptr, size_t maxlen)
+{
+    ssize_t size = -1;
+    xlator_t *this = NULL;
+    gf_changelog_journal_t *jnl = NULL;
+    char buffer[PATH_MAX] = {0};
+
+    errno = EINVAL;
+
+    /* bufptr[size - 1] is written below, so at least one byte is required */
+    if (!bufptr || maxlen == 0)
+        goto out;
+
+    this = THIS;
+    if (!this)
+        goto out;
+
+    jnl = (gf_changelog_journal_t *)GF_CHANGELOG_GET_API_PTR(this);
+    if (!jnl)
+        goto out;
+
+    /* the line is staged in buffer[]; never let gf_readline() overrun it */
+    if (maxlen > sizeof(buffer))
+        maxlen = sizeof(buffer);
+
+    size = gf_readline(jnl->jnl_fd, buffer, maxlen);
+    if (size < 0)
+        size = -1;
+    if (size <= 0)
+        goto out;
+
+    memcpy(bufptr, buffer, size - 1);
+    bufptr[size - 1] = '\0';
+
+out:
+    return size;
+}
+
+/**
+ * @API
+ *  gf_changelog_scan() - scan and generate a list of change entries
+ *
+ * calling this api multiple times (without calling gf_changelog_done())
+ * would result in new changelog(s) being refreshed in the tracker file.
+ * This call also acts as a cancellation point for the consumer.
+ */
+ssize_t
+gf_changelog_scan()
+{
+    int tracker_fd = 0;
+    size_t off = 0;
+    size_t dlen = 0, nlen = 0; /* processing-dir and entry-name lengths */
+    xlator_t *this = NULL;
+    size_t nr_entries = 0;
+    gf_changelog_journal_t *jnl = NULL;
+    struct dirent *entry = NULL;
+    struct dirent scratch[2] = {{0}};
+    char buffer[PATH_MAX] = {0};
+
+    this = THIS;
+    if (!this)
+        goto out;
+
+    jnl = (gf_changelog_journal_t *)GF_CHANGELOG_GET_API_PTR(this);
+    if (!jnl)
+        goto out;
+    if (JNL_IS_API_DISCONNECTED(jnl)) {
+        errno = ENOTCONN;
+        goto out;
+    }
+
+    errno = EINVAL;
+
+    tracker_fd = jnl->jnl_fd;
+    if (gf_ftruncate(tracker_fd, 0))
+        goto out;
+
+    rewinddir(jnl->jnl_dir);
+    dlen = strlen(jnl->jnl_processing_dir);
+
+    for (;;) {
+        errno = 0;
+        entry = sys_readdir(jnl->jnl_dir, scratch);
+        if (!entry || errno != 0)
+            break;
+
+        if (!strcmp(basename(entry->d_name), ".") ||
+            !strcmp(basename(entry->d_name), ".."))
+            continue;
+
+        /* "<processing dir><name>\n" must fit in buffer[] */
+        nlen = strlen(entry->d_name);
+        if (dlen + nlen + 1 > sizeof(buffer)) {
+            errno = ENAMETOOLONG;
+            break;
+        }
+
+        nr_entries++;
+        GF_CHANGELOG_FILL_BUFFER(jnl->jnl_processing_dir, buffer, off, dlen);
+        GF_CHANGELOG_FILL_BUFFER(entry->d_name, buffer, off, nlen);
+        GF_CHANGELOG_FILL_BUFFER("\n", buffer, off, 1);
+
+        if (gf_changelog_write(tracker_fd, buffer, off) != off) {
+            gf_msg(this->name, GF_LOG_ERROR, 0, CHANGELOG_LIB_MSG_WRITE_FAILED,
+                   "error writing changelog filename"
+                   " to tracker file");
+            break;
+        }
+        off = 0;
+    }
+
+    if (!entry && errno == 0) { /* clean end of directory only */
+        if (gf_lseek(tracker_fd, 0, SEEK_SET) != -1)
+            return nr_entries;
+    }
+out:
+    return -1;
+}
diff --git a/xlators/features/changelog/lib/src/gf-changelog-helpers.c b/xlators/features/changelog/lib/src/gf-changelog-helpers.c
new file mode 100644
index 00000000000..75f8a6dfc08
--- /dev/null
+++ b/xlators/features/changelog/lib/src/gf-changelog-helpers.c
@@ -0,0 +1,170 @@
+/*
+   Copyright (c) 2013 Red Hat, Inc. <http://www.redhat.com>
+   This file is part of GlusterFS.
+
+   This file is licensed to you under your choice of the GNU Lesser
+   General Public License, version 3 or any later version (LGPLv3 or
+   later), or the GNU General Public License, version 2 (GPLv2), in all
+   cases as published by the Free Software Foundation.
+*/
+
+#include "changelog-mem-types.h"
+#include "gf-changelog-helpers.h"
+#include "changelog-lib-messages.h"
+#include <glusterfs/syscall.h>
+
+/* Write @len bytes of @buffer to @fd; returns how many were written. */
+size_t
+gf_changelog_write(int fd, char *buffer, size_t len)
+{
+    size_t done = 0;
+    ssize_t nw = 0;
+
+    /* resume after short writes; a zero or negative result stops the loop */
+    for (; done < len; done += nw) {
+        nw = sys_write(fd, buffer + done, len - done);
+        if (nw <= 0)
+            break;
+    }
+
+    return done;
+}
+
+void
+gf_rfc3986_encode_space_newline(unsigned char *s, char *enc, char *estr)
+{
+    /* a non-zero estr[byte] is emitted as-is; 0 means percent-encode it */
+    for (; *s; s++) {
+        if (estr[*s])
+            enc += sprintf(enc, "%c", estr[*s]);
+        else
+            enc += sprintf(enc, "%%%02X", *s);
+    }
+    *enc = '\0'; /* terminate even when @s is empty */
+}
+
+/**
+ * thread safe version of readline with buffering
+ * (taken from Unix Network Programming Volume I, W.R. Stevens)
+ *
+ * This is favoured over fgets() as we'd need to ftruncate()
+ * (see gf_changelog_scan() API) to record new changelog files.
+ * Stream open functions do not have a truncate-like API (although
+ * that can be done via @fflush(fp), @ftruncate(fd) and @fseek(fp),
+ * this involves mixing POSIX file descriptors and stream FILE *).
+ *
+ * NOTE: This implementation still does not work with more than one fd
+ *       used to perform gf_readline(). For this very reason it's not
+ *       made a part of libglusterfs.
+ */
+
+static __thread read_line_t thread_tsd = {};
+
+static ssize_t
+my_read(read_line_t *tsd, int fd, char *ptr)
+{
+    /* refill from fd only once every buffered byte has been handed out */
+    if (tsd->rl_cnt <= 0) {
+        tsd->rl_cnt = sys_read(fd, tsd->rl_buf, MAXLINE);
+        if (tsd->rl_cnt <= 0)
+            return (tsd->rl_cnt < 0) ? -1 : 0; /* error : end of file */
+
+        tsd->rl_bufptr = tsd->rl_buf;
+    }
+
+    *ptr = *tsd->rl_bufptr;
+    tsd->rl_bufptr++;
+    tsd->rl_cnt--;
+    return 1;
+}
+
+ssize_t
+gf_readline(int fd, void *vptr, size_t maxlen)
+{
+    read_line_t *tsd = &thread_tsd;
+    char *out = vptr;
+    char ch = ' ';
+    size_t count = 1;
+    ssize_t got = 0;
+
+    while (count < maxlen) { /* stores at most maxlen - 1 bytes */
+        got = my_read(tsd, fd, &ch);
+        if (got == 0) { /* EOF: report only the bytes stored so far */
+            *out = '\0';
+            return (count - 1);
+        }
+        if (got != 1)
+            return -1;
+        *out++ = ch;
+        if (ch == '\n')
+            break; /* the newline is stored and counted */
+        count++;
+    }
+    *out = '\0';
+    return count;
+}
+
+/* lseek() wrapper that also discards this thread's gf_readline() buffer */
+off_t
+gf_lseek(int fd, off_t offset, int whence)
+{
+    read_line_t *tsd = &thread_tsd;
+    off_t pos = sys_lseek(fd, offset, whence);
+
+    if (pos != -1) {
+        /* bytes buffered before the seek no longer match the file offset */
+        tsd->rl_bufptr = tsd->rl_buf;
+        tsd->rl_cnt = 0;
+    }
+
+    return pos;
+}
+
+int
+gf_ftruncate(int fd, off_t length)
+{
+    read_line_t *tsd = &thread_tsd;
+
+    if (sys_ftruncate(fd, length)) /* was hard-coded to 0, ignoring @length */
+        return -1;
+    /* drop bytes buffered for gf_readline(); they predate the truncation */
+    tsd->rl_cnt = 0;
+    tsd->rl_bufptr = tsd->rl_buf;
+
+    return 0;
+}
+
+/*
+ * Cancel @thread and join it. Returns 0 only when the joined thread's exit
+ * value is PTHREAD_CANCELED, -1 (after logging a warning) otherwise.
+ */
+int
+gf_thread_cleanup(xlator_t *this, pthread_t thread)
+{
+    void *exit_val = NULL;
+
+    /* step 1: request cancellation */
+    if (pthread_cancel(thread) != 0) {
+        gf_msg(this->name, GF_LOG_WARNING, 0,
+               CHANGELOG_LIB_MSG_THREAD_CLEANUP_WARNING,
+               "Failed to send cancellation to thread");
+        return -1;
+    }
+
+    /* step 2: reap the thread and collect its exit value */
+    if (pthread_join(thread, &exit_val) != 0) {
+        gf_msg(this->name, GF_LOG_WARNING, 0,
+               CHANGELOG_LIB_MSG_THREAD_CLEANUP_WARNING,
+               "failed to join thread");
+        return -1;
+    }
+
+    /* step 3: any other exit value means the cancel did not take effect */
+    if (exit_val != PTHREAD_CANCELED) {
+        gf_msg(this->name, GF_LOG_WARNING, 0,
+               CHANGELOG_LIB_MSG_THREAD_CLEANUP_WARNING,
+               "Thread could not be cleaned up");
+        return -1;
+    }
+
+    return 0;
+}
diff --git a/xlators/features/changelog/lib/src/gf-changelog-helpers.h b/xlators/features/changelog/lib/src/gf-changelog-helpers.h
new file mode 100644
index 00000000000..9c609d33172
--- /dev/null
+++ b/xlators/features/changelog/lib/src/gf-changelog-helpers.h
@@ -0,0 +1,255 @@
+/*
+   Copyright (c) 2013 Red Hat, Inc. <http://www.redhat.com>
+   This file is part of GlusterFS.
+
+   This file is licensed to you under your choice of the GNU Lesser
+   General Public License, version 3 or any later version (LGPLv3 or
+   later), or the GNU General Public License, version 2 (GPLv2), in all
+   cases as published by the Free Software Foundation.
+*/
+
+#ifndef _GF_CHANGELOG_HELPERS_H
+#define _GF_CHANGELOG_HELPERS_H
+
+#include <unistd.h>
+#include <dirent.h>
+#include <limits.h>
+#include <glusterfs/locking.h>
+
+#include <glusterfs/xlator.h>
+
+#include "changelog.h"
+
+#include "changelog-rpc-common.h"
+#include "gf-changelog-journal.h"
+
+#define GF_CHANGELOG_TRACKER "tracker"
+
+#define GF_CHANGELOG_CURRENT_DIR ".current"
+#define GF_CHANGELOG_PROCESSED_DIR ".processed"
+#define GF_CHANGELOG_PROCESSING_DIR ".processing"
+#define GF_CHANGELOG_HISTORY_DIR ".history"
+#define TIMESTAMP_LENGTH 10
+
+#ifndef MAXLINE
+#define MAXLINE 4096
+#endif
+
+#define GF_CHANGELOG_FILL_BUFFER(ptr, ascii, off, len)                         \
+    do {                                                                       \
+        memcpy(ascii + off, ptr, len);                                         \
+        off += len;                                                            \
+    } while (0)
+
+typedef struct read_line {
+    int rl_cnt;
+    char *rl_bufptr;
+    char rl_buf[MAXLINE];
+} read_line_t;
+
+struct gf_changelog;
+struct gf_event;
+
+/**
+ * Event list for ordered event notification
+ *
+ * ->next_seq holds the next _expected_ sequence number.
+ */
+struct gf_event_list {
+    pthread_mutex_t lock; /* protects this structure */
+    pthread_cond_t cond;
+
+    pthread_t invoker;
+
+    unsigned long next_seq; /* next sequence number expected:
+                               zero during bootstrap */
+
+    struct gf_changelog *entry; /* backpointer to its brick
+                                   encapsulator (entry) */
+    struct list_head events;    /* list of events */
+};
+
+/**
+ * include a refcount if it's of use by additional layers
+ */
+struct gf_event {
+    int count; /* number of iovec slots in ->iov */
+
+    unsigned long seq;
+
+    struct list_head list;
+
+    struct iovec iov[0]; /* trailing array; payload bytes follow the slots */
+}; /* allocation size: header + cnt iovec slots + len payload bytes */
+#define GF_EVENT_CALLOC_SIZE(cnt, len)                                         \
+    (sizeof(struct gf_event) + ((cnt) * sizeof(struct iovec)) + (len))
+
+/**
+ * assign the base address of the IO vector to the correct memory
+ * area and set its addressable length.
+ */
+#define GF_EVENT_ASSIGN_IOVEC(vec, event, len, pos)                            \
+    do {                                                                       \
+        vec->iov_base = ((char *)event) + sizeof(struct gf_event) +            \
+                        (event->count * sizeof(struct iovec)) + pos;           \
+        vec->iov_len = len;                                                    \
+        pos += len;                                                            \
+    } while (0)
+
+typedef enum gf_changelog_conn_state {
+    GF_CHANGELOG_CONN_STATE_PENDING = 0,
+    GF_CHANGELOG_CONN_STATE_ACCEPTED,
+    GF_CHANGELOG_CONN_STATE_DISCONNECTED,
+} gf_changelog_conn_state_t;
+
+/**
+ * An instance of this structure is allocated for each brick for which
+ * notifications are streamed.
+ */
+typedef struct gf_changelog {
+    gf_lock_t statelock;
+    gf_changelog_conn_state_t connstate;
+
+    xlator_t *this;
+
+    struct list_head list; /* list of instances */
+
+    char brick[PATH_MAX]; /* brick path for this end-point */
+
+    changelog_rpc_t grpc; /* rpc{-clnt,svc} for this brick */
+#define RPC_PROBER(ent) ent->grpc.rpc
+#define RPC_REBORP(ent) ent->grpc.svc
+#define RPC_SOCK(ent) ent->grpc.sock
+
+    unsigned int notify; /* notification flag(s) */
+
+    FINI *fini;               /* destructor callback */
+    CALLBACK *callback;       /* event callback dispatcher */
+    CONNECT *connected;       /* connect callback */
+    DISCONNECT *disconnected; /* disconnection callback */
+
+    void *ptr;           /* owner specific private data */
+    xlator_t *invokerxl; /* consumers _this_, if valid,
+                            assigned to THIS before cbk is
+                            invoked */
+
+    gf_boolean_t ordered;
+
+    void (*queueevent)(struct gf_event_list *, struct gf_event *);
+    void (*pickevent)(struct gf_event_list *, struct gf_event **);
+
+    struct gf_event_list event;
+} gf_changelog_t;
+
+static inline int
+gf_changelog_filter_check(gf_changelog_t *entry, changelog_event_t *event)
+{
+    /* 1 when the event type shares at least one bit with the notify
+     * flags of @entry, 0 otherwise */
+    return ((event->ev_type & entry->notify) != 0) ? 1 : 0;
+}
+
+#define GF_NEED_ORDERED_EVENTS(ent) (ent->ordered == _gf_true)
+
+/** private structure */
+typedef struct gf_private {
+    pthread_mutex_t lock; /* protects ->connections, cleanups */
+    pthread_cond_t cond;
+
+    void *api; /* pointer for API access */
+
+    pthread_t poller;            /* event poller thread */
+    pthread_t connectionjanitor; /* connection cleaner */
+
+    struct list_head connections; /* list of connections */
+    struct list_head cleanups;    /* list of connection to be
+                                     cleaned up */
+} gf_private_t;
+
+#define GF_CHANGELOG_GET_API_PTR(this) (((gf_private_t *)this->private)->api)
+
+/**
+ * upcall: invoke callback with _correct_ THIS
+ */
+#define GF_CHANGELOG_INVOKE_CBK(this, cbk, brick, args...)                     \
+    do {                                                                       \
+        xlator_t *old_this = NULL;                                             \
+        xlator_t *invokerxl = NULL;                                            \
+                                                                               \
+        invokerxl = entry->invokerxl;                                          \
+        old_this = this;                                                       \
+                                                                               \
+        if (invokerxl) {                                                       \
+            THIS = invokerxl;                                                  \
+        }                                                                      \
+                                                                               \
+        cbk(invokerxl, brick, args);                                           \
+        THIS = old_this;                                                       \
+                                                                               \
+    } while (0)
+
+#define SAVE_THIS(xl)                                                          \
+    do {                                                                       \
+        old_this = xl;                                                         \
+        THIS = master;                                                         \
+    } while (0)
+
+#define RESTORE_THIS()                                                         \
+    do {                                                                       \
+        if (old_this)                                                          \
+            THIS = old_this;                                                   \
+    } while (0)
+
+/** APIs and the rest */
+
+void *
+gf_changelog_process(void *data);
+
+void
+gf_rfc3986_encode_space_newline(unsigned char *s, char *enc, char *estr);
+
+size_t
+gf_changelog_write(int fd, char *buffer, size_t len);
+
+ssize_t
+gf_readline(int fd, void *vptr, size_t maxlen);
+
+int
+gf_ftruncate(int fd, off_t length);
+
+off_t
+gf_lseek(int fd, off_t offset, int whence);
+
+int
+gf_changelog_consume(xlator_t *this, gf_changelog_journal_t *jnl,
+                     char *from_path, gf_boolean_t no_publish);
+int
+gf_changelog_publish(xlator_t *this, gf_changelog_journal_t *jnl,
+                     char *from_path);
+int
+gf_thread_cleanup(xlator_t *this, pthread_t thread);
+void *
+gf_changelog_callback_invoker(void *arg);
+
+int
+gf_cleanup_event(xlator_t *, struct gf_event_list *);
+
+/* (un)ordered event queueing */
+void
+queue_ordered_event(struct gf_event_list *, struct gf_event *);
+
+void
+queue_unordered_event(struct gf_event_list *, struct gf_event *);
+
+/* (un)ordered event picking */
+void
+pick_event_ordered(struct gf_event_list *, struct gf_event **);
+
+void
+pick_event_unordered(struct gf_event_list *, struct gf_event **);
+
+/* connection janitor thread */
+void *
+gf_changelog_connection_janitor(void *);
+
+#endif
diff --git a/xlators/features/changelog/lib/src/gf-changelog-journal-handler.c b/xlators/features/changelog/lib/src/gf-changelog-journal-handler.c
new file mode 100644
index 00000000000..7f6e2329e71
--- /dev/null
+++ b/xlators/features/changelog/lib/src/gf-changelog-journal-handler.c
@@ -0,0 +1,1029 @@
+/*
+   Copyright (c) 2015 Red Hat, Inc. <http://www.redhat.com>
+   This file is part of GlusterFS.
+
+   This file is licensed to you under your choice of the GNU Lesser
+   General Public License, version 3 or any later version (LGPLv3 or
+   later), or the GNU General Public License, version 2 (GPLv2), in all
+   cases as published by the Free Software Foundation.
+*/
+
+#include <glusterfs/compat-uuid.h>
+#include <glusterfs/globals.h>
+#include <glusterfs/glusterfs.h>
+#include <glusterfs/syscall.h>
+#include <glusterfs/compat-errno.h>
+
+#include "gf-changelog-helpers.h"
+
+/* from the changelog translator */
+#include "changelog-misc.h"
+#include "changelog-mem-types.h"
+
+#include "gf-changelog-journal.h"
+#include "changelog-lib-messages.h"
+
+extern int byebye;
+
+enum changelog_versions { VERSION_1_1 = 0, VERSION_1_2 = 1 };
+
+/**
+ * number of gfid records after fop number
+ */
+int nr_gfids[2][GF_FOP_MAXVALUE] = {{
+                                        [GF_FOP_MKNOD] = 1,
+                                        [GF_FOP_MKDIR] = 1,
+                                        [GF_FOP_UNLINK] = 1,
+                                        [GF_FOP_RMDIR] = 1,
+                                        [GF_FOP_SYMLINK] = 1,
+                                        [GF_FOP_RENAME] = 2,
+                                        [GF_FOP_LINK] = 1,
+                                        [GF_FOP_CREATE] = 1,
+                                    },
+                                    {
+                                        [GF_FOP_MKNOD] = 1,
+                                        [GF_FOP_MKDIR] = 1,
+                                        [GF_FOP_UNLINK] = 2,
+                                        [GF_FOP_RMDIR] = 2,
+                                        [GF_FOP_SYMLINK] = 1,
+                                        [GF_FOP_RENAME] = 2,
+                                        [GF_FOP_LINK] = 1,
+                                        [GF_FOP_CREATE] = 1,
+                                    }};
+
+int nr_extra_recs[2][GF_FOP_MAXVALUE] = {{
+                                             [GF_FOP_MKNOD] = 3,
+                                             [GF_FOP_MKDIR] = 3,
+                                             [GF_FOP_UNLINK] = 0,
+                                             [GF_FOP_RMDIR] = 0,
+                                             [GF_FOP_SYMLINK] = 0,
+                                             [GF_FOP_RENAME] = 0,
+                                             [GF_FOP_LINK] = 0,
+                                             [GF_FOP_CREATE] = 3,
+                                         },
+                                         {
+                                             [GF_FOP_MKNOD] = 3,
+                                             [GF_FOP_MKDIR] = 3,
+                                             [GF_FOP_UNLINK] = 0,
+                                             [GF_FOP_RMDIR] = 0,
+                                             [GF_FOP_SYMLINK] = 0,
+                                             [GF_FOP_RENAME] = 0,
+                                             [GF_FOP_LINK] = 0,
+                                             [GF_FOP_CREATE] = 3,
+                                         }};
+
+static char *
+binary_to_ascii(uuid_t uuid)
+{
+    return uuid_utoa(uuid); /* thin wrapper, used by PARSE_GFID_MOVE */
+}
+
+static char *
+conv_noop(char *ptr)
+{
+    return ptr; /* identity converter handed to PARSE_GFID for ascii input */
+}
+
+#define VERIFY_SEPARATOR(ptr, plen, perr)                                      \
+    {                                                                          \
+        if (*(ptr + plen) != '\0') {                                           \
+            perr = 1;                                                          \
+            break;                                                             \
+        }                                                                      \
+    }
+
+#define MOVER_MOVE(mover, nleft, bytes)                                        \
+    {                                                                          \
+        mover += bytes;                                                        \
+        nleft -= bytes;                                                        \
+    }
+
+#define PARSE_GFID(mov, ptr, le, fn, perr)                                     \
+    {                                                                          \
+        VERIFY_SEPARATOR(mov, le, perr);                                       \
+        ptr = fn(mov);                                                         \
+        if (!ptr) {                                                            \
+            perr = 1;                                                          \
+            break;                                                             \
+        }                                                                      \
+    }
+
+#define FILL_AND_MOVE(pt, buf, of, mo, nl, le)                                 \
+    {                                                                          \
+        GF_CHANGELOG_FILL_BUFFER(pt, buf, of, strlen(pt));                     \
+        MOVER_MOVE(mo, nl, le);                                                \
+    }
+
+#define PARSE_GFID_MOVE(ptr, uuid, mover, nleft, perr)                         \
+    {                                                                          \
+        memcpy(uuid, mover, sizeof(uuid_t));                                   \
+        ptr = binary_to_ascii(uuid);                                           \
+        if (!ptr) {                                                            \
+            perr = 1;                                                          \
+            break;                                                             \
+        }                                                                      \
+        MOVER_MOVE(mover, nleft, sizeof(uuid_t));                              \
+    }
+
+#define LINE_BUFSIZE (3 * PATH_MAX) /* enough buffer for extra chars too */
+
+/**
+ * gf_changelog_parse_binary: decode the binary changelog mapped from
+ * @from_fd (records start at @start_offset) and write one ascii line per
+ * record to @to_fd. Returns 0 when all of @stbuf->st_size bytes were
+ * consumed without a parse error, -1 otherwise.
+ *
+ * mmap() makes parsing easy. fgets() cannot be used as the binary gfid
+ * could contain a line-feed (0x0A): fgets() would then read an incomplete
+ * line and parsing would fail. POSIX fds would need additional code to
+ * keep state across partial reads (entries not fitting entirely in the
+ * buffer); mmap() lets us point at any file offset for free (VM does it).
+ */
+static int
+gf_changelog_parse_binary(xlator_t *this, gf_changelog_journal_t *jnl,
+                          int from_fd, int to_fd, size_t start_offset,
+                          struct stat *stbuf, int version_idx)
+
+{
+    int ret = -1;
+    off_t off = 0;
+    off_t nleft = 0;
+    uuid_t uuid = {
+        0,
+    };
+    char *ptr = NULL;
+    char *bname_start = NULL;
+    char *bname_end = NULL;
+    char *mover = NULL;
+    void *start = NULL;
+    char current_mover = ' ';
+    size_t blen = 0;
+    int parse_err = 0;
+    char *ascii = NULL;
+
+    ascii = GF_CALLOC(LINE_BUFSIZE, sizeof(char), gf_common_mt_char);
+    if (!ascii) /* the record loop below memcpy()s into this buffer */
+        goto out;
+    nleft = stbuf->st_size;
+    start = mmap(NULL, nleft, PROT_READ, MAP_PRIVATE, from_fd, 0);
+    if (start == MAP_FAILED) {
+        gf_msg(this->name, GF_LOG_ERROR, errno, CHANGELOG_LIB_MSG_MMAP_FAILED,
+               "mmap() error");
+        goto out;
+    }
+
+    mover = start;
+
+    MOVER_MOVE(mover, nleft, start_offset);
+
+    while (nleft > 0) {
+        off = blen = 0;
+        ptr = bname_start = bname_end = NULL;
+
+        current_mover = *mover;
+
+        switch (current_mover) {
+            case 'D':
+            case 'M':
+                MOVER_MOVE(mover, nleft, 1);
+                PARSE_GFID_MOVE(ptr, uuid, mover, nleft, parse_err);
+
+                break;
+
+            case 'E':
+                MOVER_MOVE(mover, nleft, 1);
+                PARSE_GFID_MOVE(ptr, uuid, mover, nleft, parse_err);
+
+                bname_start = mover;
+                bname_end = strchr(mover, '\n');
+                if (bname_end == NULL) {
+                    parse_err = 1;
+                    break;
+                }
+
+                blen = bname_end - bname_start;
+                MOVER_MOVE(mover, nleft, blen);
+
+                break;
+
+            default:
+                parse_err = 1;
+        }
+
+        if (parse_err)
+            break;
+
+        GF_CHANGELOG_FILL_BUFFER(&current_mover, ascii, off, 1);
+        GF_CHANGELOG_FILL_BUFFER(" ", ascii, off, 1);
+        GF_CHANGELOG_FILL_BUFFER(ptr, ascii, off, strlen(ptr));
+        if (blen)
+            GF_CHANGELOG_FILL_BUFFER(bname_start, ascii, off, blen);
+        GF_CHANGELOG_FILL_BUFFER("\n", ascii, off, 1);
+
+        if (gf_changelog_write(to_fd, ascii, off) != off) {
+            gf_msg(this->name, GF_LOG_ERROR, errno,
+                   CHANGELOG_LIB_MSG_ASCII_ERROR,
+                   "processing binary changelog failed due to "
+                   " error in writing ascii change");
+            break;
+        }
+
+        MOVER_MOVE(mover, nleft, 1);
+    }
+
+    if ((nleft == 0) && (!parse_err))
+        ret = 0;
+
+    if (munmap(start, stbuf->st_size))
+        gf_msg(this->name, GF_LOG_ERROR, errno, CHANGELOG_LIB_MSG_MUNMAP_FAILED,
+               "munmap() error");
+out:
+    if (ascii)
+        GF_FREE(ascii);
+    return ret;
+}
+
+/**
+ * ascii decoder: rewrite the NUL-separated records mapped from @from_fd
+ * (starting at @start_offset) as one line per entry in @to_fd, using the
+ * fop name rather than the fop number. Returns 0 on success, -1 on error.
+ */
+static int
+gf_changelog_parse_ascii(xlator_t *this, gf_changelog_journal_t *jnl,
+                         int from_fd, int to_fd, size_t start_offset,
+                         struct stat *stbuf, int version_idx)
+{
+    int ng = 0;
+    int ret = -1;
+    int fop = 0;
+    int len = 0;
+    off_t off = 0;
+    off_t nleft = 0;
+    char *ptr = NULL;
+    char *eptr = NULL;
+    void *start = NULL;
+    char *mover = NULL;
+    int parse_err = 0;
+    char current_mover = ' ';
+    char *ascii = NULL;
+    const char *fopname = NULL;
+
+    ascii = GF_CALLOC(LINE_BUFSIZE, sizeof(char), gf_common_mt_char);
+    if (!ascii) /* the record loop below memcpy()s into this buffer */
+        goto out;
+    nleft = stbuf->st_size;
+    start = mmap(NULL, nleft, PROT_READ, MAP_PRIVATE, from_fd, 0);
+    if (start == MAP_FAILED) {
+        gf_msg(this->name, GF_LOG_ERROR, errno, CHANGELOG_LIB_MSG_MMAP_FAILED,
+               "mmap() error");
+        goto out;
+    }
+
+    mover = start;
+
+    MOVER_MOVE(mover, nleft, start_offset);
+
+    while (nleft > 0) {
+        off = 0;
+        current_mover = *mover;
+
+        GF_CHANGELOG_FILL_BUFFER(&current_mover, ascii, off, 1);
+        GF_CHANGELOG_FILL_BUFFER(" ", ascii, off, 1);
+
+        switch (current_mover) {
+            case 'D':
+                MOVER_MOVE(mover, nleft, 1);
+
+                /* target gfid */
+                PARSE_GFID(mover, ptr, UUID_CANONICAL_FORM_LEN, conv_noop,
+                           parse_err);
+                FILL_AND_MOVE(ptr, ascii, off, mover, nleft,
+                              UUID_CANONICAL_FORM_LEN);
+                break;
+            case 'M':
+                MOVER_MOVE(mover, nleft, 1);
+
+                /* target gfid */
+                PARSE_GFID(mover, ptr, UUID_CANONICAL_FORM_LEN, conv_noop,
+                           parse_err);
+                FILL_AND_MOVE(ptr, ascii, off, mover, nleft,
+                              UUID_CANONICAL_FORM_LEN);
+                FILL_AND_MOVE(" ", ascii, off, mover, nleft, 1);
+
+                /* fop */
+                len = strlen(mover);
+                VERIFY_SEPARATOR(mover, len, parse_err);
+
+                fop = atoi(mover);
+                if (fop < 0 || fop >= GF_FOP_MAXVALUE || !gf_fop_list[fop]) {
+                    parse_err = 1; /* unknown or out-of-table fop number */
+                    break;
+                }
+                fopname = gf_fop_list[fop];
+
+                MOVER_MOVE(mover, nleft, len);
+
+                len = strlen(fopname);
+                GF_CHANGELOG_FILL_BUFFER(fopname, ascii, off, len);
+
+                break;
+
+            case 'E':
+                MOVER_MOVE(mover, nleft, 1);
+
+                /* target gfid */
+                PARSE_GFID(mover, ptr, UUID_CANONICAL_FORM_LEN, conv_noop,
+                           parse_err);
+                FILL_AND_MOVE(ptr, ascii, off, mover, nleft,
+                              UUID_CANONICAL_FORM_LEN);
+                FILL_AND_MOVE(" ", ascii, off, mover, nleft, 1);
+
+                /* fop */
+                len = strlen(mover);
+                VERIFY_SEPARATOR(mover, len, parse_err);
+
+                fop = atoi(mover);
+                if (fop < 0 || fop >= GF_FOP_MAXVALUE || !gf_fop_list[fop]) {
+                    parse_err = 1; /* also guards the nr_* tables below */
+                    break;
+                }
+                fopname = gf_fop_list[fop];
+
+                MOVER_MOVE(mover, nleft, len);
+
+                len = strlen(fopname);
+                GF_CHANGELOG_FILL_BUFFER(fopname, ascii, off, len);
+
+                ng = nr_extra_recs[version_idx][fop];
+                for (; ng > 0; ng--) {
+                    MOVER_MOVE(mover, nleft, 1);
+                    len = strlen(mover);
+                    VERIFY_SEPARATOR(mover, len, parse_err);
+
+                    GF_CHANGELOG_FILL_BUFFER(" ", ascii, off, 1);
+                    FILL_AND_MOVE(mover, ascii, off, mover, nleft, len);
+                }
+
+                /* pargfid + bname */
+                ng = nr_gfids[version_idx][fop];
+                while (ng-- > 0) {
+                    MOVER_MOVE(mover, nleft, 1);
+                    len = strlen(mover);
+                    if (!len) {
+                        MOVER_MOVE(mover, nleft, 1);
+                        continue;
+                    }
+
+                    GF_CHANGELOG_FILL_BUFFER(" ", ascii, off, 1);
+
+                    PARSE_GFID(mover, ptr, len, conv_noop, parse_err);
+                    /* +1: FILL_AND_MOVE strlen()s eptr, keep room for NUL */
+                    eptr = calloc(3, strlen(ptr) + 1);
+                    if (!eptr) {
+                        parse_err = 1;
+                        break;
+                    }
+                    gf_rfc3986_encode_space_newline((unsigned char *)ptr, eptr,
+                                                    jnl->rfc3986_space_newline);
+                    FILL_AND_MOVE(eptr, ascii, off, mover, nleft, len);
+                    free(eptr);
+                }
+
+                break;
+            default:
+                parse_err = 1;
+        }
+
+        if (parse_err)
+            break;
+
+        GF_CHANGELOG_FILL_BUFFER("\n", ascii, off, 1);
+
+        if (gf_changelog_write(to_fd, ascii, off) != off) {
+            gf_msg(this->name, GF_LOG_ERROR, errno,
+                   CHANGELOG_LIB_MSG_ASCII_ERROR,
+                   "processing ascii changelog failed due to "
+                   " error in writing change");
+            break;
+        }
+
+        MOVER_MOVE(mover, nleft, 1);
+    }
+
+    if ((nleft == 0) && (!parse_err))
+        ret = 0;
+
+    if (munmap(start, stbuf->st_size))
+        gf_msg(this->name, GF_LOG_ERROR, errno, CHANGELOG_LIB_MSG_MUNMAP_FAILED,
+               "munmap() error");
+
+out:
+    if (ascii)
+        GF_FREE(ascii);
+
+    return ret;
+}
+
+static int
+gf_changelog_decode(xlator_t *this, gf_changelog_journal_t *jnl, int from_fd,
+                    int to_fd, struct stat *stbuf, int *zerob)
+{
+    int ret = -1;
+    int encoding = -1;
+    int major_version = -1;
+    int minor_version = -1;
+    int version_idx = -1;
+    size_t elen = 0;
+    char buffer[1024] = {
+        0,
+    };
+    /* decode the changelog on @from_fd into @to_fd; 0 on success, else -1 */
+    CHANGELOG_GET_HEADER_INFO(from_fd, buffer, sizeof(buffer), encoding,
+                              major_version, minor_version, elen);
+    if (encoding == -1) /* unknown encoding */
+        goto out;
+
+    if (major_version == -1) /* unknown major version */
+        goto out;
+
+    if (minor_version == -1) /* unknown minor version */
+        goto out;
+
+    if (!CHANGELOG_VALID_ENCODING(encoding))
+        goto out;
+
+    if (elen == stbuf->st_size) {
+        *zerob = 1; /* presumably header only; note ret is still -1 here */
+        goto out;
+    }
+
+    if (major_version == 1 && minor_version == 1) {
+        version_idx = VERSION_1_1;
+    } else if (major_version == 1 && minor_version == 2) {
+        version_idx = VERSION_1_2;
+    }
+
+    if (version_idx == -1) /* unknown version number */
+        goto out;
+
+    /**
+     * start processing after the header
+     */
+    if (sys_lseek(from_fd, elen, SEEK_SET) < 0) {
+        goto out;
+    }
+    switch (encoding) {
+        case CHANGELOG_ENCODE_BINARY:
+            /**
+             * this ideally should have been a part of changelog-encoders.c
+             * (ie. part of the changelog translator).
+             */
+            ret = gf_changelog_parse_binary(this, jnl, from_fd, to_fd, elen,
+                                            stbuf, version_idx);
+            break;
+
+        case CHANGELOG_ENCODE_ASCII:
+            ret = gf_changelog_parse_ascii(this, jnl, from_fd, to_fd, elen,
+                                           stbuf, version_idx);
+            break;
+    }
+
+out:
+    return ret;
+}
+
+int
+gf_changelog_publish(xlator_t *this, gf_changelog_journal_t *jnl,
+                     char *from_path)
+{
+    int ret = 0;
+    char dest[PATH_MAX] = {
+        0,
+    };
+    char to_path[PATH_MAX] = {
+        0,
+    };
+    struct stat stbuf = {
+        0,
+    };
+    /* rename basename(@from_path) from jnl_current_dir to jnl_processing_dir */
+    if (snprintf(to_path, PATH_MAX, "%s%s", jnl->jnl_current_dir,
+                 basename(from_path)) >= PATH_MAX)
+        return -1;
+
+    /* handle zerob file that won't exist in current */
+    ret = sys_stat(to_path, &stbuf);
+    if (ret) {
+        if (errno == ENOENT)
+            ret = 0; /* nothing to publish is not an error */
+        goto out;
+    }
+
+    if (snprintf(dest, PATH_MAX, "%s%s", jnl->jnl_processing_dir,
+                 basename(from_path)) >= PATH_MAX)
+        return -1;
+
+    ret = sys_rename(to_path, dest);
+    if (ret) {
+        gf_smsg(this->name, GF_LOG_ERROR, errno,
+                CHANGELOG_LIB_MSG_RENAME_FAILED, "from=%s", to_path, "to=%s",
+                dest, NULL);
+    }
+
+out:
+    return ret;
+}
+
+int
+gf_changelog_consume(xlator_t *this, gf_changelog_journal_t *jnl,
+                     char *from_path, gf_boolean_t no_publish)
+{
+    int ret = -1;
+    int fd1 = 0;
+    int fd2 = 0;
+    int zerob = 0;
+    struct stat stbuf = {
+        0,
+    };
+    char dest[PATH_MAX] = {
+        0,
+    };
+    char to_path[PATH_MAX] = {
+        0,
+    };
+    /* decode @from_path into jnl_current_dir, then rename the result into
+     * jnl_processing_dir unless @no_publish; returns 0 on success */
+    if (snprintf(to_path, PATH_MAX, "%s%s", jnl->jnl_current_dir,
+                 basename(from_path)) >= PATH_MAX)
+        goto out;
+    if (snprintf(dest, PATH_MAX, "%s%s", jnl->jnl_processing_dir,
+                 basename(from_path)) >= PATH_MAX)
+        goto out;
+    /* ret stays -1 until decode so a failed open() is not reported as 0 */
+    if (sys_stat(from_path, &stbuf) || !S_ISREG(stbuf.st_mode)) {
+        ret = -1;
+        gf_smsg(this->name, GF_LOG_ERROR, errno, CHANGELOG_LIB_MSG_STAT_FAILED,
+                "path=%s", from_path, NULL);
+        goto out;
+    }
+
+    fd1 = open(from_path, O_RDONLY);
+    if (fd1 < 0) {
+        gf_smsg(this->name, GF_LOG_ERROR, errno, CHANGELOG_LIB_MSG_OPEN_FAILED,
+                "path=%s", from_path, NULL);
+        goto out;
+    }
+
+    fd2 = open(to_path, O_CREAT | O_TRUNC | O_RDWR,
+               S_IRUSR | S_IWUSR | S_IRGRP | S_IROTH);
+    if (fd2 < 0) {
+        gf_smsg(this->name, GF_LOG_ERROR, errno, CHANGELOG_LIB_MSG_OPEN_FAILED,
+                "path=%s", to_path, NULL);
+        goto close_fd;
+    } else {
+        ret = gf_changelog_decode(this, jnl, fd1, fd2, &stbuf, &zerob);
+
+        sys_close(fd2);
+
+        if (!ret) {
+            /* move it to processing on a successful
+               decode */
+            if (no_publish == _gf_true)
+                goto close_fd;
+            ret = sys_rename(to_path, dest);
+            if (ret)
+                gf_smsg(this->name, GF_LOG_ERROR, errno,
+                        CHANGELOG_LIB_MSG_RENAME_FAILED, "from=%s", to_path,
+                        "to=%s", dest, NULL);
+        }
+
+        /* remove it from .current if it's an empty file */
+        if (zerob) {
+            /* zerob changelogs must be unlinked */
+            ret = sys_unlink(to_path);
+            if (ret)
+                gf_smsg(this->name, GF_LOG_ERROR, errno,
+                        CHANGELOG_LIB_MSG_UNLINK_FAILED, "name=empty changelog",
+                        "path=%s", to_path, NULL);
+        }
+    }
+
+close_fd:
+    sys_close(fd1);
+
+out:
+    return ret;
+}
+
+void *
+gf_changelog_process(void *data)
+{
+    xlator_t *this = NULL;
+    gf_changelog_journal_t *jnl = NULL;
+    gf_changelog_entry_t *entry = NULL;
+    gf_changelog_processor_t *jnl_proc = NULL;
+    /* thread body: @data is the journal; dequeue entries and consume each */
+    jnl = data;
+    jnl_proc = jnl->jnl_proc;
+    THIS = jnl->this;
+    this = jnl->this;
+
+    while (1) {
+        pthread_mutex_lock(&jnl_proc->lock);
+        {
+            while (list_empty(&jnl_proc->entries)) {
+                jnl_proc->waiting = _gf_true; /* producer signals only if set */
+                pthread_cond_wait(&jnl_proc->cond, &jnl_proc->lock);
+            }
+
+            entry = list_first_entry(&jnl_proc->entries, gf_changelog_entry_t,
+                                     list);
+            if (entry)
+                list_del(&entry->list);
+
+            jnl_proc->waiting = _gf_false;
+        }
+        pthread_mutex_unlock(&jnl_proc->lock);
+        /* consume outside the lock so producers are not blocked meanwhile */
+        if (entry) {
+            (void)gf_changelog_consume(this, jnl, entry->path, _gf_false);
+            GF_FREE(entry);
+        }
+    }
+
+    return NULL; /* not reached: the loop above has no exit */
+}
+
+void
+gf_changelog_queue_journal(gf_changelog_processor_t *jnl_proc,
+                           changelog_event_t *event)
+{
+    size_t len = 0;
+    gf_changelog_entry_t *entry = NULL;
+    /* copy the journal path of @event into a new entry and queue it */
+    entry = GF_CALLOC(1, sizeof(gf_changelog_entry_t),
+                      gf_changelog_mt_libgfchangelog_entry_t);
+    if (!entry)
+        return; /* allocation failure: the event is dropped without a log */
+    INIT_LIST_HEAD(&entry->list);
+    /* NOTE(review): len is not checked against sizeof(entry->path); confirm */
+    len = strlen(event->u.journal.path);
+    (void)memcpy(entry->path, event->u.journal.path, len + 1);
+    entry->path[len] = '\0';
+
+    pthread_mutex_lock(&jnl_proc->lock);
+    {
+        list_add_tail(&entry->list, &jnl_proc->entries);
+        if (jnl_proc->waiting) /* set by the consumer before it waits */
+            pthread_cond_signal(&jnl_proc->cond);
+    }
+    pthread_mutex_unlock(&jnl_proc->lock);
+
+    return;
+}
+
+void
+gf_changelog_handle_journal(void *xl, char *brick, void *cbkdata,
+                            changelog_event_t *event)
+{
+    gf_changelog_journal_t *jnl = NULL;
+    gf_changelog_processor_t *jnl_proc = NULL;
+    /* callback: @cbkdata is the journal whose processor gets @event */
+    jnl = cbkdata;
+    jnl_proc = jnl->jnl_proc;
+
+    gf_changelog_queue_journal(jnl_proc, event);
+}
+
+void
+gf_changelog_journal_disconnect(void *xl, char *brick, void *data)
+{
+    gf_changelog_journal_t *jnl = NULL;
+    /* @data is the journal; mark its API state disconnected under jnl->lock */
+    jnl = data;
+
+    pthread_spin_lock(&jnl->lock);
+    {
+        JNL_SET_API_STATE(jnl, JNL_API_DISCONNECTED);
+    };
+    pthread_spin_unlock(&jnl->lock);
+}
+
+void
+gf_changelog_journal_connect(void *xl, char *brick, void *data)
+{
+    gf_changelog_journal_t *jnl = NULL;
+    /* @data is the journal; mark its API state connected under jnl->lock */
+    jnl = data;
+
+    pthread_spin_lock(&jnl->lock);
+    {
+        JNL_SET_API_STATE(jnl, JNL_API_CONNECTED);
+    };
+    pthread_spin_unlock(&jnl->lock);
+
+    return;
+}
+
+/* Stop the processor thread of @jnl, then free the entries still queued and
+ * the processor itself; nothing is freed if thread cleanup fails. */
+void
+gf_changelog_cleanup_processor(gf_changelog_journal_t *jnl)
+{
+    xlator_t *this = THIS;
+    gf_changelog_entry_t *entry = NULL;
+    gf_changelog_entry_t *tmp = NULL;
+    gf_changelog_processor_t *jnl_proc = NULL;
+
+    if (!this || !jnl || !jnl->jnl_proc)
+        return;
+    jnl_proc = jnl->jnl_proc;
+    if (gf_thread_cleanup(this, jnl_proc->processor) != 0) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, CHANGELOG_LIB_MSG_CLEANUP_ERROR,
+               "failed to cleanup processor thread");
+        return;
+    }
+    /* entries queued but never consumed would otherwise be leaked */
+    list_for_each_entry_safe(entry, tmp, &jnl_proc->entries, list) {
+        list_del(&entry->list);
+        GF_FREE(entry);
+    }
+    (void)pthread_mutex_destroy(&jnl_proc->lock);
+    (void)pthread_cond_destroy(&jnl_proc->cond);
+    GF_FREE(jnl_proc);
+    jnl->jnl_proc = NULL; /* do not leave a dangling pointer behind */
+}
+
+/* Allocate the journal processor for @jnl, initialise its lock, condition
+ * variable and entry list, and start the "clogproc" thread running
+ * gf_changelog_process(). Returns 0 on success and -1 on failure, undoing
+ * whatever had been set up before the failing step. */
+int
+gf_changelog_init_processor(gf_changelog_journal_t *jnl)
+{
+    gf_changelog_processor_t *proc = NULL;
+
+    proc = GF_CALLOC(1, sizeof(gf_changelog_processor_t),
+                     gf_changelog_mt_libgfchangelog_t);
+    if (!proc)
+        goto error_return;
+
+    if (pthread_mutex_init(&proc->lock, NULL) != 0)
+        goto free_jnl_proc;
+    if (pthread_cond_init(&proc->cond, NULL) != 0)
+        goto cleanup_mutex;
+
+    INIT_LIST_HEAD(&proc->entries);
+    proc->waiting = _gf_false;
+    jnl->jnl_proc = proc; /* set before the thread (given @jnl) starts */
+
+    if (gf_thread_create(&proc->processor, NULL, gf_changelog_process, jnl,
+                         "clogproc") != 0) {
+        jnl->jnl_proc = NULL;
+        goto cleanup_cond;
+    }
+
+    return 0;
+
+cleanup_cond:
+    (void)pthread_cond_destroy(&proc->cond);
+cleanup_mutex:
+    (void)pthread_mutex_destroy(&proc->lock);
+free_jnl_proc:
+    GF_FREE(proc);
+error_return:
+    return -1;
+}
+
+/* Release @jnl's tracker fd, dir stream and realpath() buffer (idempotent). */
+static void
+gf_changelog_cleanup_fds(gf_changelog_journal_t *jnl)
+{
+    if (jnl->jnl_fd != -1) /* tracker fd */
+        sys_close(jnl->jnl_fd);
+    if (jnl->jnl_dir) /* processing dir */
+        sys_closedir(jnl->jnl_dir);
+    free(jnl->jnl_working_dir); /* allocated by realpath; free(NULL) is fine */
+    jnl->jnl_fd = -1;
+    jnl->jnl_dir = NULL;
+    jnl->jnl_working_dir = NULL;
+}
+
+/* (Re)create the current, processed and processing directories below
+ * jnl->jnl_working_dir, then open the processing directory stream and the
+ * tracker file into @jnl. Returns 0 on success, non-zero on failure. */
+static int
+gf_changelog_open_dirs(xlator_t *this, gf_changelog_journal_t *jnl)
+{
+    int ret = -1;
+    int tracker_fd = -1;
+    char tracker_path[PATH_MAX] = {0};
+
+    /* .current */
+    (void)snprintf(jnl->jnl_current_dir, PATH_MAX,
+                   "%s/" GF_CHANGELOG_CURRENT_DIR "/", jnl->jnl_working_dir);
+    ret = recursive_rmdir(jnl->jnl_current_dir);
+    if (ret) {
+        gf_smsg(this->name, GF_LOG_ERROR, errno,
+                CHANGELOG_LIB_MSG_FAILED_TO_RMDIR, "path=%s",
+                jnl->jnl_current_dir, NULL);
+        goto out;
+    }
+    ret = mkdir_p(jnl->jnl_current_dir, 0600, _gf_false);
+    if (ret)
+        goto out;
+
+    /* .processed */
+    (void)snprintf(jnl->jnl_processed_dir, PATH_MAX,
+                   "%s/" GF_CHANGELOG_PROCESSED_DIR "/", jnl->jnl_working_dir);
+    ret = mkdir_p(jnl->jnl_processed_dir, 0600, _gf_false);
+    if (ret)
+        goto out;
+
+    /* .processing */
+    (void)snprintf(jnl->jnl_processing_dir, PATH_MAX,
+                   "%s/" GF_CHANGELOG_PROCESSING_DIR "/", jnl->jnl_working_dir);
+    ret = recursive_rmdir(jnl->jnl_processing_dir);
+    if (ret) {
+        gf_smsg(this->name, GF_LOG_ERROR, errno,
+                CHANGELOG_LIB_MSG_FAILED_TO_RMDIR, "path=%s",
+                jnl->jnl_processing_dir, NULL);
+        goto out;
+    }
+
+    ret = mkdir_p(jnl->jnl_processing_dir, 0600, _gf_false);
+    if (ret)
+        goto out;
+
+    jnl->jnl_dir = sys_opendir(jnl->jnl_processing_dir);
+    if (!jnl->jnl_dir) {
+        gf_msg(this->name, GF_LOG_ERROR, errno, CHANGELOG_LIB_MSG_OPENDIR_ERROR,
+               "opendir() error");
+        ret = -1; /* mkdir_p() above left ret at 0 */
+        goto out;
+    }
+
+    (void)snprintf(tracker_path, PATH_MAX, "%s/" GF_CHANGELOG_TRACKER,
+                   jnl->jnl_working_dir);
+
+    tracker_fd = open(tracker_path, O_CREAT | O_APPEND | O_RDWR,
+                      S_IRUSR | S_IWUSR | S_IRGRP | S_IROTH);
+    if (tracker_fd < 0) {
+        sys_closedir(jnl->jnl_dir);
+        jnl->jnl_dir = NULL; /* do not leave a closed stream in @jnl */
+        ret = -1;
+        goto out;
+    }
+
+    jnl->jnl_fd = tracker_fd;
+    ret = 0;
+out:
+    return ret;
+}
+
+/* Create the history journal of @jnl: a second journal rooted at the history
+ * scratch directory below jnl->jnl_working_dir and bound to @brick_path.
+ * Returns 0 on success; on failure returns -1 with jnl->hist_jnl == NULL. */
+int
+gf_changelog_init_history(xlator_t *this, gf_changelog_journal_t *jnl,
+                          char *brick_path)
+{
+    int i = 0;
+    gf_changelog_journal_t *hist = NULL;
+    char hist_scratch_dir[PATH_MAX] = {0};
+
+    hist = GF_CALLOC(1, sizeof(*hist), gf_changelog_mt_libgfchangelog_t);
+    jnl->hist_jnl = hist;
+    if (!hist)
+        goto error_return;
+
+    hist->jnl_dir = NULL;
+    hist->jnl_fd = -1;
+
+    /* checked first, so a late failure cannot strand open handles */
+    if (snprintf(hist->jnl_brickpath, PATH_MAX, "%s", brick_path) >= PATH_MAX)
+        goto dealloc_hist;
+
+    (void)snprintf(hist_scratch_dir, PATH_MAX,
+                   "%s/" GF_CHANGELOG_HISTORY_DIR "/", jnl->jnl_working_dir);
+    if (mkdir_p(hist_scratch_dir, 0600, _gf_false))
+        goto dealloc_hist;
+
+    hist->jnl_working_dir = realpath(hist_scratch_dir, NULL);
+    if (!hist->jnl_working_dir)
+        goto dealloc_hist;
+
+    if (gf_changelog_open_dirs(this, hist)) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, CHANGELOG_LIB_MSG_OPENDIR_ERROR,
+               "could not create entries in history scratch dir");
+        goto free_working_dir;
+    }
+
+    /* RFC 3986 {de,en}coding */
+    for (i = 0; i < 256; i++) {
+        hist->rfc3986_space_newline[i] = (i == ' ' || i == '\n' || i == '%')
+                                             ? 0
+                                             : i;
+    }
+
+    return 0;
+
+free_working_dir:
+    free(hist->jnl_working_dir); /* realpath() buffer; was leaked before */
+dealloc_hist:
+    GF_FREE(hist);
+    jnl->hist_jnl = NULL;
+error_return:
+    return -1;
+}
+
+/* FINI handler: tear down the journal in @data and everything it owns. */
+void
+gf_changelog_journal_fini(void *xl, char *brick, void *data)
+{
+    gf_changelog_journal_t *jnl = data;
+
+    gf_changelog_cleanup_processor(jnl);
+    gf_changelog_cleanup_fds(jnl);
+    if (jnl->hist_jnl) {
+        gf_changelog_cleanup_fds(jnl->hist_jnl);
+        GF_FREE(jnl->hist_jnl); /* previously leaked */
+    }
+    (void)pthread_spin_destroy(&jnl->lock); /* paired with journal_init */
+    GF_FREE(jnl);
+}
+
+/* INIT handler: build the journal for @brick. brick->ptr is the scratch
+ * directory; it is created if missing, the working directories and tracker
+ * are opened, and the history journal and processor thread are set up.
+ * Returns the journal, or NULL on failure with all partial state released. */
+void *
+gf_changelog_journal_init(void *xl, struct gf_brick_spec *brick)
+{
+    int i = 0;
+    xlator_t *this = xl;
+    struct stat buf = {0};
+    char *scratch_dir = (char *)brick->ptr;
+    gf_changelog_journal_t *jnl = NULL;
+
+    jnl = GF_CALLOC(1, sizeof(gf_changelog_journal_t),
+                    gf_changelog_mt_libgfchangelog_t);
+    if (!jnl)
+        goto error_return;
+
+    if (snprintf(jnl->jnl_brickpath, PATH_MAX, "%s", brick->brick_path) >=
+        PATH_MAX)
+        goto dealloc_private;
+
+    /* create the scratch directory only when it does not exist yet */
+    if (sys_stat(scratch_dir, &buf) && errno == ENOENT) {
+        if (mkdir_p(scratch_dir, 0600, _gf_true))
+            goto dealloc_private;
+    }
+
+    jnl->jnl_working_dir = realpath(scratch_dir, NULL);
+    if (!jnl->jnl_working_dir)
+        goto dealloc_private;
+
+    if (gf_changelog_open_dirs(this, jnl)) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, CHANGELOG_LIB_MSG_OPENDIR_ERROR,
+               "could not create entries in scratch dir");
+        /* realpath() buffer: the dealloc_private path does not free it */
+        free(jnl->jnl_working_dir);
+        goto dealloc_private;
+    }
+
+    /* RFC 3986 {de,en}coding */
+    for (i = 0; i < 256; i++) {
+        jnl->rfc3986_space_newline[i] = (i == ' ' || i == '\n' || i == '%') ? 0
+                                                                            : i;
+    }
+
+    /* history journal; on failure jnl->hist_jnl is left NULL */
+    if (gf_changelog_init_history(this, jnl, brick->brick_path))
+        goto cleanup_fds;
+
+    /* initialize journal processor */
+    jnl->this = this;
+    if (gf_changelog_init_processor(jnl))
+        goto cleanup_fds;
+
+    JNL_SET_API_STATE(jnl, JNL_API_CONN_INPROGESS);
+    if (pthread_spin_init(&jnl->lock, 0) != 0)
+        goto cleanup_processor;
+
+    return jnl;
+
+cleanup_processor:
+    gf_changelog_cleanup_processor(jnl);
+cleanup_fds:
+    gf_changelog_cleanup_fds(jnl);
+    if (jnl->hist_jnl) {
+        gf_changelog_cleanup_fds(jnl->hist_jnl);
+        GF_FREE(jnl->hist_jnl); /* the history journal itself was leaked */
+    }
+dealloc_private:
+    GF_FREE(jnl);
+error_return:
+    return NULL;
+}
diff --git a/xlators/features/changelog/lib/src/gf-changelog-journal.h b/xlators/features/changelog/lib/src/gf-changelog-journal.h
new file mode 100644
index 00000000000..ba5b9bf827e
--- /dev/null
+++ b/xlators/features/changelog/lib/src/gf-changelog-journal.h
@@ -0,0 +1,116 @@
+/*
+   Copyright (c) 2015 Red Hat, Inc. <http://www.redhat.com>
+   This file is part of GlusterFS.
+
+   This file is licensed to you under your choice of the GNU Lesser
+   General Public License, version 3 or any later version (LGPLv3 or
+   later), or the GNU General Public License, version 2 (GPLv2), in all
+   cases as published by the Free Software Foundation.
+*/
+
+#ifndef __GF_CHANGELOG_JOURNAL_H
+#define __GF_CHANGELOG_JOURNAL_H
+
+#include <unistd.h>
+#include <pthread.h>
+
+#include "changelog.h"
+
+enum api_conn {
+    JNL_API_CONNECTED,      /* set by gf_changelog_journal_connect() */
+    JNL_API_CONN_INPROGESS, /* set by gf_changelog_journal_init() */
+    JNL_API_DISCONNECTED,   /* set by gf_changelog_journal_disconnect() */
+};
+
+typedef struct gf_changelog_entry {
+    char path[PATH_MAX]; /* journal path copied from the event */
+
+    struct list_head list; /* link in gf_changelog_processor_t->entries */
+} gf_changelog_entry_t;
+
+typedef struct gf_changelog_processor {
+    pthread_mutex_t lock; /* protects ->entries */
+    pthread_cond_t cond;  /* waiter during empty list */
+    gf_boolean_t waiting; /* the queueing side signals ->cond only when set */
+
+    pthread_t processor; /* thread-id of journal processing thread */
+
+    struct list_head entries; /* queued gf_changelog_entry_t, appended at tail */
+} gf_changelog_processor_t;
+
+typedef struct gf_changelog_journal {
+    DIR *jnl_dir; /* 'processing' directory stream */
+
+    int jnl_fd; /* fd to the tracker file */
+
+    char jnl_brickpath[PATH_MAX]; /* brick path for this end-point */
+
+    gf_changelog_processor_t *jnl_proc; /* queue + thread consuming journals */
+
+    char *jnl_working_dir; /* scratch directory (realpath() result) */
+
+    char jnl_current_dir[PATH_MAX];    /* built by gf_changelog_open_dirs() */
+    char jnl_processed_dir[PATH_MAX];  /* built by gf_changelog_open_dirs() */
+    char jnl_processing_dir[PATH_MAX]; /* built by gf_changelog_open_dirs() */
+
+    char rfc3986_space_newline[256]; /* RFC 3986 string encoding */
+
+    struct gf_changelog_journal *hist_jnl; /* journal used for history */
+    int hist_done; /* holds 0 done scanning,
+                      1 keep scanning and -1 error */
+
+    pthread_spinlock_t lock; /* held while ->connected is updated */
+    int connected;           /* current enum api_conn state */
+    xlator_t *this;          /* xlator handed to the INIT handler */
+} gf_changelog_journal_t;
+/* API connection-state helpers; arguments are parenthesised for safety. */
+#define JNL_SET_API_STATE(jnl, state) ((jnl)->connected = (state))
+#define JNL_IS_API_DISCONNECTED(jnl) ((jnl)->connected == JNL_API_DISCONNECTED)
+
+/* History API */
+typedef struct gf_changelog_history_data {
+    int len;
+
+    int htime_fd;
+
+    /* parallelism count */
+    int n_parallel;
+
+    /* history from, to indexes */
+    unsigned long from;
+    unsigned long to;
+    xlator_t *this;
+} gf_changelog_history_data_t;
+
+typedef struct gf_changelog_consume_data {
+    /** set of inputs */
+
+    /* fd to read from */
+    int fd;
+
+    /* from @offset */
+    off_t offset;
+
+    xlator_t *this;
+
+    gf_changelog_journal_t *jnl;
+
+    /** set of outputs */
+
+    /* return value */
+    int retval;
+
+    /* journal processed */
+    char changelog[PATH_MAX];
+} gf_changelog_consume_data_t;
+
+/* event handler */
+CALLBACK gf_changelog_handle_journal;
+
+/* init, connect & disconnect handler */
+INIT gf_changelog_journal_init;
+FINI gf_changelog_journal_fini;
+CONNECT gf_changelog_journal_connect;
+DISCONNECT gf_changelog_journal_disconnect;
+
+#endif
diff --git a/xlators/features/changelog/lib/src/gf-changelog-reborp.c b/xlators/features/changelog/lib/src/gf-changelog-reborp.c
new file mode 100644
index 00000000000..56b11cbb705
--- /dev/null
+++ b/xlators/features/changelog/lib/src/gf-changelog-reborp.c
@@ -0,0 +1,413 @@
+/*
+   Copyright (c) 2015 Red Hat, Inc. <http://www.redhat.com>
+   This file is part of GlusterFS.
+
+   This file is licensed to you under your choice of the GNU Lesser
+   General Public License, version 3 or any later version (LGPLv3 or
+   later), or the GNU General Public License, version 2 (GPLv2), in all
+   cases as published by the Free Software Foundation.
+*/
+
+#include "changelog-misc.h"
+#include "changelog-mem-types.h"
+
+#include "gf-changelog-helpers.h"
+#include "changelog-rpc-common.h"
+#include "changelog-lib-messages.h"
+
+#include <glusterfs/syscall.h>
+
+/**
+ * Reverse socket: actual data transfer handler. Connection
+ * initiator is PROBER, data transfer is REBORP.
+ */
+
+static struct rpcsvc_program *gf_changelog_reborp_programs[];
+
+/* Janitor thread (@arg is the xlator): waits for brick entries placed on
+ * priv->cleanups and, for each, disables its rpc client, stops its callback
+ * invoker, discards the events still queued and frees the entry. */
+void *
+gf_changelog_connection_janitor(void *arg)
+{
+    xlator_t *this = NULL;
+    gf_private_t *priv = NULL;
+    gf_changelog_t *entry = NULL;
+    struct gf_event *event = NULL;
+    struct gf_event_list *ev = NULL;
+    unsigned long drained = 0;
+
+    THIS = this = arg;
+    priv = this->private;
+
+    while (1) {
+        pthread_mutex_lock(&priv->lock);
+        {
+            while (list_empty(&priv->cleanups))
+                pthread_cond_wait(&priv->cond, &priv->lock);
+
+            entry = list_first_entry(&priv->cleanups, gf_changelog_t, list);
+            list_del_init(&entry->list);
+        }
+        pthread_mutex_unlock(&priv->lock);
+
+        drained = 0;
+        ev = &entry->event;
+
+        gf_smsg(this->name, GF_LOG_INFO, 0,
+                CHANGELOG_LIB_MSG_CLEANING_BRICK_ENTRY_INFO, "brick=%s",
+                entry->brick, NULL);
+
+        /* 0x0: disable rpc-clnt */
+        rpc_clnt_disable(RPC_PROBER(entry));
+
+        /* 0x1: cleanup callback invoker thread */
+        if (gf_cleanup_event(this, ev))
+            continue;
+
+        /* 0x2: drain pending events */
+        while (!list_empty(&ev->events)) {
+            event = list_first_entry(&ev->events, struct gf_event, list);
+            gf_smsg(this->name, GF_LOG_INFO, 0,
+                    CHANGELOG_LIB_MSG_DRAINING_EVENT_INFO, "seq=%lu",
+                    event->seq, "payload=%d", event->count, NULL);
+
+            list_del(&event->list); /* unlink first, else the loop never ends */
+            GF_FREE(event);
+            drained++;
+        }
+
+        gf_smsg(this->name, GF_LOG_INFO, 0,
+                CHANGELOG_LIB_MSG_DRAINED_EVENT_INFO, "num=%lu", drained, NULL);
+
+        /* 0x3: freeup brick entry */
+        gf_smsg(this->name, GF_LOG_INFO, 0,
+                CHANGELOG_LIB_MSG_FREEING_ENTRY_INFO, "entry=%p", entry, NULL);
+        LOCK_DESTROY(&entry->statelock);
+        GF_FREE(entry);
+    }
+
+    return NULL;
+}
+
+/* rpcsvc notifier for the reverse connection; @mydata is the brick entry.
+ * ACCEPT: unlink the reverse socket file (a failure is only logged) and
+ * invoke the entry's connected callback, if any. DISCONNECT: invoke the
+ * disconnected callback, if any. Other events are ignored. Returns 0. */
+int
+gf_changelog_reborp_rpcsvc_notify(rpcsvc_t *rpc, void *mydata,
+                                  rpcsvc_event_t event, void *data)
+{
+    xlator_t *this = NULL;
+    gf_changelog_t *entry = mydata;
+
+    if (event == RPCSVC_EVENT_ACCEPT) {
+        this = entry->this;
+
+        /* remove the reverse socket file; a failure is only a warning */
+        if (sys_unlink(RPC_SOCK(entry)) != 0)
+            gf_smsg(this->name, GF_LOG_WARNING, errno,
+                    CHANGELOG_LIB_MSG_UNLINK_FAILED, "name=reverse socket",
+                    "path=%s", RPC_SOCK(entry), NULL);
+
+        /* the connected callback is optional */
+        if (entry->connected)
+            GF_CHANGELOG_INVOKE_CBK(this, entry->connected, entry->brick,
+                                    entry->ptr);
+    } else if (event == RPCSVC_EVENT_DISCONNECT) {
+        this = entry->this;
+
+        /* the disconnected callback is optional */
+        if (entry->disconnected)
+            GF_CHANGELOG_INVOKE_CBK(this, entry->disconnected, entry->brick,
+                                    entry->ptr);
+    }
+
+    /* @entry is not dereferenced for any other event */
+    return 0;
+}
+
+rpcsvc_t *
+gf_changelog_reborp_init_rpc_listner(xlator_t *this, char *path, char *sock,
+                                     void *cbkdata)
+{
+    CHANGELOG_MAKE_TMP_SOCKET_PATH(path, sock, UNIX_PATH_MAX);
+    return changelog_rpc_server_init(this, sock, cbkdata,
+                                     gf_changelog_reborp_rpcsvc_notify,
+                                     gf_changelog_reborp_programs);
+}
+
+/**
+ * Walk the payload vectors of one event and invoke the consumer callback for
+ * every changelog event accepted by gf_changelog_filter_check().
+ *
+ * @vec holds the address of the caller's pointer to the iovec array, so
+ * vector i is (*vec)[i]; vec[i] is only valid for i == 0.
+ *
+ * @FIXME: the entire buffer is scanned because the server does not filter
+ * events yet; clean this up once only the probed events are delivered.
+ */
+void
+gf_changelog_invoke_callback(gf_changelog_t *entry, struct iovec **vec,
+                             int payloadcnt)
+{
+    int i = 0;
+    int evsize = 0;
+    xlator_t *this = entry->this;
+    struct iovec *iov = *vec;
+    changelog_event_t *event = NULL;
+
+    for (i = 0; i < payloadcnt; i++) {
+        event = (changelog_event_t *)iov[i].iov_base;
+        evsize = iov[i].iov_len / CHANGELOG_EV_SIZE;
+
+        for (; evsize > 0; evsize--, event++) {
+            if (gf_changelog_filter_check(entry, event)) {
+                GF_CHANGELOG_INVOKE_CBK(this, entry->callback, entry->brick,
+                                        entry->ptr, event);
+            }
+        }
+    }
+}
+
+/**
+ * Ordered event handler is self-adaptive: if the event sequence number
+ * is what's expected (->next_seq) there is no ordering list that's
+ * maintained. On out-of-order event notifications, event buffers are
+ * dynamically allocated and ordered.
+ */
+/* Non-zero when @event carries the sequence number @ev expects next. */
+int
+__is_expected_sequence(struct gf_event_list *ev, struct gf_event *event)
+{
+    return (ev->next_seq == event->seq);
+}
+
+/* Peek at the head of ev->events (stored in *@event). If it carries the
+ * expected sequence number, unlink it, advance ev->next_seq and return 1;
+ * otherwise return 0 and leave the list untouched. */
+int
+__can_process_event(struct gf_event_list *ev, struct gf_event **event)
+{
+    *event = list_first_entry(&ev->events, struct gf_event, list);
+    if (!__is_expected_sequence(ev, *event))
+        return 0;
+    list_del(&(*event)->list);
+    ev->next_seq++;
+    return 1;
+}
+
+/* Wait on ev->cond until the head of ev->events is the next expected event;
+ * __can_process_event() unlinks it and stores it in *@event. */
+void
+pick_event_ordered(struct gf_event_list *ev, struct gf_event **event)
+{
+    pthread_mutex_lock(&ev->lock);
+    while (list_empty(&ev->events) || !__can_process_event(ev, event))
+        pthread_cond_wait(&ev->cond, &ev->lock);
+    pthread_mutex_unlock(&ev->lock);
+}
+
+/* Wait on ev->cond until ev->events is non-empty, then unlink its first
+ * event and store it in *@event. */
+void
+pick_event_unordered(struct gf_event_list *ev, struct gf_event **event)
+{
+    pthread_mutex_lock(&ev->lock);
+    while (list_empty(&ev->events))
+        pthread_cond_wait(&ev->cond, &ev->lock);
+    *event = list_first_entry(&ev->events, struct gf_event, list);
+    list_del(&(*event)->list);
+    pthread_mutex_unlock(&ev->lock);
+}
+
+/* Callback invoker thread; @arg is the gf_event_list of one brick entry.
+ * Loops forever: picks an event through entry->pickevent(), runs the
+ * consumer callback over its payload and frees the event. */
+void *
+gf_changelog_callback_invoker(void *arg)
+{
+    struct gf_event_list *ev = arg;
+    gf_changelog_t *entry = ev->entry;
+    struct gf_event *event = NULL;
+    struct iovec *vec = NULL;
+
+    THIS = entry->this;
+
+    for (;;) {
+        entry->pickevent(ev, &event);
+
+        vec = (struct iovec *)&event->iov;
+        gf_changelog_invoke_callback(entry, &vec, event->count);
+
+        GF_FREE(event);
+    }
+
+    return NULL;
+}
+
+/* Comparator handed to list_add_order(): returns 1 when @pos1 carries the
+ * larger sequence number and -1 otherwise (equal numbers included). */
+static int
+orderfn(struct list_head *pos1, struct list_head *pos2)
+{
+    struct gf_event *lhs = list_entry(pos1, struct gf_event, list);
+    struct gf_event *rhs = list_entry(pos2, struct gf_event, list);
+
+    if (lhs->seq <= rhs->seq)
+        return -1;
+
+    return 1;
+}
+
+/* Insert @event into ev->events in sequence order. While ev->next_seq is
+ * still 0 it is seeded from @event; the waiter is signalled only when @event
+ * is the one expected next. */
+void
+queue_ordered_event(struct gf_event_list *ev, struct gf_event *event)
+{
+    pthread_mutex_lock(&ev->lock);
+    list_add_order(&event->list, &ev->events, orderfn);
+    if (ev->next_seq == 0)
+        ev->next_seq = event->seq;
+    if (event->seq == ev->next_seq)
+        pthread_cond_signal(&ev->cond);
+    pthread_mutex_unlock(&ev->lock);
+}
+
+/* Append @event to the tail of ev->events and signal ev->cond so that a
+ * waiting picker wakes up; both steps happen with ev->lock held, and the
+ * arrival order is kept as is. */
+void
+queue_unordered_event(struct gf_event_list *ev, struct gf_event *event)
+{
+    pthread_mutex_lock(&ev->lock);
+    list_add_tail(&event->list, &ev->events);
+    pthread_cond_signal(&ev->cond);
+    pthread_mutex_unlock(&ev->lock);
+}
+
+/* Decode one changelog event RPC for @entry, deep-copy its payload into a
+ * freshly allocated gf_event, queue it through entry->queueevent() and
+ * acknowledge the sequence number. A decode or allocation failure is
+ * answered with op_ret = -1 and seq = 0. Returns the reply-submit result. */
+int
+gf_changelog_event_handler(rpcsvc_request_t *req, xlator_t *this,
+                           gf_changelog_t *entry)
+{
+    int i = 0;
+    size_t payloadlen = 0;
+    ssize_t len = 0;
+    int payloadcnt = 0;
+    changelog_event_req rpc_req = {0};
+    changelog_event_rsp rpc_rsp = {0};
+    struct iovec *vec = NULL;
+    struct gf_event *event = NULL;
+    struct gf_event_list *ev = &entry->event;
+
+    len = xdr_to_generic(req->msg[0], &rpc_req,
+                         (xdrproc_t)xdr_changelog_event_req);
+    if (len < 0) {
+        gf_msg(this->name, GF_LOG_ERROR, 0,
+               CHANGELOG_LIB_MSG_XDR_DECODING_FAILED, "xdr decoding failed");
+        req->rpc_err = GARBAGE_ARGS;
+        goto handle_xdr_error;
+    }
+
+    /* The remainder of msg[0] always occupies iov[0] below, even when it is
+     * empty, so it is always counted: otherwise the copy code would fill one
+     * more iovec slot than the allocation accounts for. An empty vector
+     * carries no events and is harmless to the consumer side. */
+    payloadcnt = 1;
+    payloadlen = (req->msg[0].iov_len - len);
+    for (i = 1; i < req->count; i++) {
+        payloadcnt++;
+        payloadlen += req->msg[i].iov_len;
+    }
+
+    event = GF_CALLOC(1, GF_EVENT_CALLOC_SIZE(payloadcnt, payloadlen),
+                      gf_changelog_mt_libgfchangelog_event_t);
+    if (!event)
+        goto handle_xdr_error;
+    INIT_LIST_HEAD(&event->list);
+
+    payloadlen = 0;
+    event->seq = rpc_req.seq;
+    event->count = payloadcnt;
+
+    /* deep copy IO vectors */
+    vec = &event->iov[0];
+    GF_EVENT_ASSIGN_IOVEC(vec, event, (req->msg[0].iov_len - len), payloadlen);
+    (void)memcpy(vec->iov_base, req->msg[0].iov_base + len, vec->iov_len);
+
+    for (i = 1; i < req->count; i++) {
+        vec = &event->iov[i];
+        GF_EVENT_ASSIGN_IOVEC(vec, event, req->msg[i].iov_len, payloadlen);
+        (void)memcpy(event->iov[i].iov_base, req->msg[i].iov_base,
+                     req->msg[i].iov_len);
+    }
+
+    gf_msg_debug(this->name, 0,
+                 "seq: %" PRIu64 " [%s] (time: %" PRIu64 ".%" PRIu64
+                 "), "
+                 "(vec: %d, len: %zu)",
+                 rpc_req.seq, entry->brick, rpc_req.tv_sec, rpc_req.tv_usec,
+                 payloadcnt, payloadlen);
+
+    /* dispatch event */
+    entry->queueevent(ev, event);
+
+    /* ack sequence number */
+    rpc_rsp.op_ret = 0;
+    rpc_rsp.seq = rpc_req.seq;
+
+    goto submit_rpc;
+
+handle_xdr_error:
+    rpc_rsp.op_ret = -1;
+    rpc_rsp.seq = 0; /* invalid */
+submit_rpc:
+    return changelog_rpc_sumbit_reply(req, &rpc_rsp, NULL, 0, NULL,
+                                      (xdrproc_t)xdr_changelog_event_rsp);
+}
+
+/* Actor for CHANGELOG_REV_PROC_EVENT: recover the brick entry from the
+ * service's ->mydata, make its xlator the current THIS and let
+ * gf_changelog_event_handler() process the request. */
+int
+gf_changelog_reborp_handle_event(rpcsvc_request_t *req)
+{
+    rpcsvc_t *svc = rpcsvc_request_service(req);
+    gf_changelog_t *entry = svc->mydata;
+    xlator_t *this = entry->this;
+
+    THIS = this;
+
+    return gf_changelog_event_handler(req, this, entry);
+}
+
+static rpcsvc_actor_t gf_changelog_reborp_actors[CHANGELOG_REV_PROC_MAX] = {
+    [CHANGELOG_REV_PROC_EVENT] = {"CHANGELOG EVENT HANDLER",
+                                  gf_changelog_reborp_handle_event, NULL,
+                                  CHANGELOG_REV_PROC_EVENT, DRC_NA, 0},
+};
+
+/**
+ * Do not use synctask as the RPC layer dereferences ->mydata as THIS.
+ * In gf_changelog_setup_rpc(), @cbkdata is of type @gf_changelog_t,
+ * and that's required to invoke the callback with the appropriate
+ * brick path and its private data.
+ */
+static struct rpcsvc_program gf_changelog_reborp_prog = {
+    .progname = "LIBGFCHANGELOG REBORP",
+    .prognum = CHANGELOG_REV_RPC_PROCNUM,
+    .progver = CHANGELOG_REV_RPC_PROCVER,
+    .numactors = CHANGELOG_REV_PROC_MAX,
+    .actors = gf_changelog_reborp_actors,
+    .synctask = _gf_false,
+};
+
+static struct rpcsvc_program *gf_changelog_reborp_programs[] = {
+    &gf_changelog_reborp_prog,
+    NULL,
+};
diff --git a/xlators/features/changelog/lib/src/gf-changelog-rpc.c b/xlators/features/changelog/lib/src/gf-changelog-rpc.c
new file mode 100644
index 00000000000..8ec6ffbcebc
--- /dev/null
+++ b/xlators/features/changelog/lib/src/gf-changelog-rpc.c
@@ -0,0 +1,98 @@
+/*
+   Copyright (c) 2013 Red Hat, Inc. <http://www.redhat.com>
+   This file is part of GlusterFS.
+
+   This file is licensed to you under your choice of the GNU Lesser
+   General Public License, version 3 or any later version (LGPLv3 or
+   later), or the GNU General Public License, version 2 (GPLv2), in all
+   cases as published by the Free Software Foundation.
+*/
+
+#include "gf-changelog-rpc.h"
+#include "changelog-misc.h"
+#include "changelog-mem-types.h"
+
+struct rpc_clnt_program gf_changelog_clnt;
+
+/* No-op notifier (all events ignored). TODO: piggyback reconnect (upcall) */
+int
+gf_changelog_rpc_notify(struct rpc_clnt *rpc, void *mydata,
+                        rpc_clnt_event_t event, void *data)
+{
+    switch (event) {
+        case RPC_CLNT_CONNECT:
+            break;
+        case RPC_CLNT_DISCONNECT:
+        case RPC_CLNT_MSG:
+        case RPC_CLNT_DESTROY:
+        case RPC_CLNT_PING:
+            break;
+    }
+
+    return 0;
+}
+
+/* Create the rpc client for @entry; the socket file name is produced by
+ * CHANGELOG_MAKE_SOCKET_PATH from entry->brick. */
+struct rpc_clnt *
+gf_changelog_rpc_init(xlator_t *this, gf_changelog_t *entry)
+{
+    char sockfile[UNIX_PATH_MAX] = {0};
+
+    CHANGELOG_MAKE_SOCKET_PATH(entry->brick, sockfile, UNIX_PATH_MAX);
+    return changelog_rpc_client_init(this, entry, sockfile,
+                                     gf_changelog_rpc_notify);
+}
+
+/**
+ * remote procedure calls declarations.
+ */
+
+int
+gf_probe_changelog_cbk(struct rpc_req *req, struct iovec *iovec, int count,
+                       void *myframe)
+{
+    return 0;
+}
+
+/* Build and submit the PROBE FILTER request for @data (a gf_changelog_t):
+ * it carries the entry's reverse socket path and its notify filter.
+ * Returns the result of changelog_rpc_sumbit_req(). */
+int
+gf_probe_changelog_filter(call_frame_t *frame, xlator_t *this, void *data)
+{
+    gf_changelog_t *entry = data;
+    char *sock = RPC_SOCK(entry);
+    changelog_probe_req req = {0};
+
+    /* NOTE(review): the copy is bounded only by strlen(sock); confirm that
+     * req.sock is at least as large as the entry's socket buffer. */
+    (void)memcpy(&req.sock, sock, strlen(sock));
+    req.filter = entry->notify;
+
+    /* invoke RPC */
+    return changelog_rpc_sumbit_req(
+        RPC_PROBER(entry), (void *)&req, frame, &gf_changelog_clnt,
+        CHANGELOG_RPC_PROBE_FILTER, NULL, 0, NULL, this, gf_probe_changelog_cbk,
+        (xdrproc_t)xdr_changelog_probe_req);
+}
+
+int
+gf_changelog_invoke_rpc(xlator_t *this, gf_changelog_t *entry, int procidx)
+{
+    return changelog_invoke_rpc(this, RPC_PROBER(entry), &gf_changelog_clnt,
+                                procidx, entry);
+}
+
+struct rpc_clnt_procedure gf_changelog_procs[CHANGELOG_RPC_PROC_MAX] = {
+    [CHANGELOG_RPC_PROC_NULL] = {"NULL", NULL},
+    [CHANGELOG_RPC_PROBE_FILTER] = {"PROBE FILTER", gf_probe_changelog_filter},
+};
+
+struct rpc_clnt_program gf_changelog_clnt = {
+    .progname = "LIBGFCHANGELOG",
+    .prognum = CHANGELOG_RPC_PROGNUM,
+    .progver = CHANGELOG_RPC_PROGVER,
+    .numproc = CHANGELOG_RPC_PROC_MAX,
+    .proctable = gf_changelog_procs,
+};
diff --git a/xlators/features/changelog/lib/src/gf-changelog-rpc.h b/xlators/features/changelog/lib/src/gf-changelog-rpc.h
new file mode 100644
index 00000000000..5c82d6f1c08
--- /dev/null
+++ b/xlators/features/changelog/lib/src/gf-changelog-rpc.h
@@ -0,0 +1,28 @@
+/*
+   Copyright (c) 2013 Red Hat, Inc. <http://www.redhat.com>
+   This file is part of GlusterFS.
+
+   This file is licensed to you under your choice of the GNU Lesser
+   General Public License, version 3 or any later version (LGPLv3 or
+   later), or the GNU General Public License, version 2 (GPLv2), in all
+   cases as published by the Free Software Foundation.
+*/
+
+#ifndef __GF_CHANGELOG_RPC_H
+#define __GF_CHANGELOG_RPC_H
+
+#include <glusterfs/xlator.h>
+
+#include "gf-changelog-helpers.h"
+#include "changelog-rpc-common.h"
+
+struct rpc_clnt *
+gf_changelog_rpc_init(xlator_t *, gf_changelog_t *);
+
+int
+gf_changelog_invoke_rpc(xlator_t *, gf_changelog_t *, int);
+
+rpcsvc_t *
+gf_changelog_reborp_init_rpc_listner(xlator_t *, char *, char *, void *);
+
+#endif
diff --git a/xlators/features/changelog/lib/src/gf-changelog.c b/xlators/features/changelog/lib/src/gf-changelog.c
new file mode 100644
index 00000000000..57c3d39ef76
--- /dev/null
+++ b/xlators/features/changelog/lib/src/gf-changelog.c
@@ -0,0 +1,652 @@
+/*
+   Copyright (c) 2015 Red Hat, Inc. <http://www.redhat.com>
+   This file is part of GlusterFS.
+
+   This file is licensed to you under your choice of the GNU Lesser
+   General Public License, version 3 or any later version (LGPLv3 or
+   later), or the GNU General Public License, version 2 (GPLv2), in all
+   cases as published by the Free Software Foundation.
+*/
+
+#include <errno.h>
+#include <dirent.h>
+#include <stddef.h>
+#include <sys/types.h>
+#include <sys/socket.h>
+#include <sys/un.h>
+#include <sys/time.h>
+#include <sys/resource.h>
+
+#ifndef _GNU_SOURCE
+#define _GNU_SOURCE
+#endif
+#include <string.h>
+
+#include <glusterfs/globals.h>
+#include <glusterfs/glusterfs.h>
+#include <glusterfs/logging.h>
+#include <glusterfs/defaults.h>
+#include <glusterfs/syncop.h>
+
+#include "gf-changelog-rpc.h"
+#include "gf-changelog-helpers.h"
+
+/* from the changelog translator */
+#include "changelog-misc.h"
+#include "changelog-mem-types.h"
+#include "changelog-lib-messages.h"
+
+/**
+ * Global singleton xlator pointer for the library, initialized
+ * during library load. This should probably be hidden inside
+ * an initialized object which is a handle for the consumer.
+ *
+ * TODO: do away with the global..
+ */
+xlator_t *master = NULL;
+
+/**
+ * Allocate the library's private state: zeroed memory, empty connection and
+ * cleanup lists, and an initialized mutex/condition-variable pair.
+ *
+ * Returns the new object, or NULL when the allocation or either pthread
+ * initializer fails (whatever was set up so far is released first).
+ */
+static inline gf_private_t *
+gf_changelog_alloc_priv()
+{
+    gf_private_t *newpriv = GF_CALLOC(1, sizeof(*newpriv),
+                                      gf_changelog_mt_priv_t);
+
+    if (newpriv == NULL)
+        return NULL;
+
+    INIT_LIST_HEAD(&newpriv->connections);
+    INIT_LIST_HEAD(&newpriv->cleanups);
+
+    if (pthread_mutex_init(&newpriv->lock, NULL) != 0) {
+        GF_FREE(newpriv);
+        return NULL;
+    }
+
+    if (pthread_cond_init(&newpriv->cond, NULL) != 0) {
+        /* the mutex was already live, undo it before bailing out */
+        (void)pthread_mutex_destroy(&newpriv->lock);
+        GF_FREE(newpriv);
+        return NULL;
+    }
+
+    newpriv->api = NULL;
+    return newpriv;
+}
+
+#define GF_CHANGELOG_EVENT_POOL_SIZE 16384
+#define GF_CHANGELOG_EVENT_THREAD_COUNT 4
+
+/**
+ * Fill a freshly created glusterfs context with the defaults this library
+ * needs: memory accounting for THIS, a process uuid, the page size, the
+ * iobuf and event pools, the call pool with its frame/stack mem-pools, the
+ * stub, dict and log-buffer mem-pools, the context locks and an unlimited
+ * core-dump rlimit.
+ *
+ * @ctx: context to initialize.
+ *
+ * Returns 0 on success and -1 on failure. On failure everything allocated
+ * here is released and the corresponding @ctx members are reset to NULL so
+ * the caller is not left with dangling pointers.
+ */
+static int
+gf_changelog_ctx_defaults_init(glusterfs_ctx_t *ctx)
+{
+    cmd_args_t *cmd_args = NULL;
+    struct rlimit lim = {
+        0,
+    };
+    call_pool_t *pool = NULL;
+    int ret = -1;
+
+    ret = xlator_mem_acct_init(THIS, gf_changelog_mt_end);
+    if (ret != 0)
+        return -1;
+
+    ctx->process_uuid = generate_glusterfs_ctx_id();
+    if (!ctx->process_uuid)
+        return -1;
+
+    ctx->page_size = 128 * GF_UNIT_KB;
+
+    ctx->iobuf_pool = iobuf_pool_new();
+    if (!ctx->iobuf_pool)
+        goto free_pool;
+
+    ctx->event_pool = gf_event_pool_new(GF_CHANGELOG_EVENT_POOL_SIZE,
+                                        GF_CHANGELOG_EVENT_THREAD_COUNT);
+    if (!ctx->event_pool)
+        goto free_pool;
+
+    pool = GF_CALLOC(1, sizeof(call_pool_t),
+                     gf_changelog_mt_libgfchangelog_call_pool_t);
+    if (!pool)
+        goto free_pool;
+
+    pool->frame_mem_pool = mem_pool_new(call_frame_t, 32);
+    if (!pool->frame_mem_pool)
+        goto free_pool;
+
+    pool->stack_mem_pool = mem_pool_new(call_stack_t, 16);
+    if (!pool->stack_mem_pool)
+        goto free_pool;
+
+    ctx->stub_mem_pool = mem_pool_new(call_stub_t, 16);
+    if (!ctx->stub_mem_pool)
+        goto free_pool;
+
+    ctx->dict_pool = mem_pool_new(dict_t, 32);
+    if (!ctx->dict_pool)
+        goto free_pool;
+
+    ctx->dict_pair_pool = mem_pool_new(data_pair_t, 512);
+    if (!ctx->dict_pair_pool)
+        goto free_pool;
+
+    ctx->dict_data_pool = mem_pool_new(data_t, 512);
+    if (!ctx->dict_data_pool)
+        goto free_pool;
+
+    ctx->logbuf_pool = mem_pool_new(log_buf_t, 256);
+    if (!ctx->logbuf_pool)
+        goto free_pool;
+
+    INIT_LIST_HEAD(&pool->all_frames);
+    LOCK_INIT(&pool->lock);
+    ctx->pool = pool;
+
+    LOCK_INIT(&ctx->lock);
+
+    cmd_args = &ctx->cmd_args;
+
+    INIT_LIST_HEAD(&cmd_args->xlator_options);
+
+    /* allow full core dumps; failure to raise the limit is not fatal */
+    lim.rlim_cur = RLIM_INFINITY;
+    lim.rlim_max = RLIM_INFINITY;
+    setrlimit(RLIMIT_CORE, &lim);
+
+    return 0;
+
+free_pool:
+    /*
+     * Mem-pools are released through mem_pool_destroy() rather than a plain
+     * GF_FREE(): the pool object is not the only resource a pool owns.
+     * Passing a pool that was never created (NULL) is fine.
+     */
+    if (pool) {
+        mem_pool_destroy(pool->frame_mem_pool);
+        mem_pool_destroy(pool->stack_mem_pool);
+        GF_FREE(pool);
+    }
+
+    mem_pool_destroy(ctx->stub_mem_pool);
+    ctx->stub_mem_pool = NULL;
+
+    mem_pool_destroy(ctx->dict_pool);
+    ctx->dict_pool = NULL;
+
+    mem_pool_destroy(ctx->dict_pair_pool);
+    ctx->dict_pair_pool = NULL;
+
+    mem_pool_destroy(ctx->dict_data_pool);
+    ctx->dict_data_pool = NULL;
+
+    mem_pool_destroy(ctx->logbuf_pool);
+    ctx->logbuf_pool = NULL;
+
+    if (ctx->iobuf_pool) {
+        iobuf_pool_destroy(ctx->iobuf_pool);
+        ctx->iobuf_pool = NULL;
+    }
+
+    /*
+     * NOTE(review): the event pool is still released with GF_FREE() as
+     * before; whether a not-yet-dispatched pool can be torn down through
+     * the event API should be confirmed against libglusterfs.
+     */
+    GF_FREE(ctx->event_pool);
+    ctx->event_pool = NULL;
+
+    GF_FREE(ctx->process_uuid);
+    ctx->process_uuid = NULL;
+
+    return -1;
+}
+
+/**
+ * Tear down the context attached to @this: destroy its syncenv, release the
+ * context object, clear this->private and this->ctx, then finalize the
+ * memory pools. A NULL @this is a no-op.
+ *
+ * TODO: cleanup ctx defaults
+ */
+void
+gf_changelog_cleanup_this(xlator_t *this)
+{
+    glusterfs_ctx_t *ctx = NULL;
+
+    if (!this)
+        return;
+
+    ctx = this->ctx;
+    /* NOTE(review): ctx is dereferenced without a NULL check */
+    syncenv_destroy(ctx->env);
+    free(ctx);
+
+    this->private = NULL;
+    this->ctx = NULL;
+
+    mem_pools_fini();
+}
+
+/**
+ * Create a glusterfs context for the library, initialize globals and the
+ * context defaults, and attach a syncenv to it. The context is published
+ * through THIS->ctx once globals are initialized.
+ *
+ * Returns 0 on success. On failure the context is released, THIS->ctx is
+ * reset to NULL (when a context had been created) and -1 is returned.
+ */
+static int
+gf_changelog_init_context()
+{
+    glusterfs_ctx_t *newctx = glusterfs_ctx_new();
+
+    if (newctx == NULL)
+        return -1;
+
+    if (glusterfs_globals_init(newctx) == 0) {
+        THIS->ctx = newctx;
+
+        if (gf_changelog_ctx_defaults_init(newctx) == 0) {
+            newctx->env = syncenv_new(0, 0, 0);
+            if (newctx->env != NULL)
+                return 0;
+        }
+    }
+
+    free(newctx);
+    THIS->ctx = NULL;
+    return -1;
+}
+
+/**
+ * Set up the library-owned context and then initialize the memory pools.
+ *
+ * Returns the result of gf_changelog_init_context() (0 on success, -1 on
+ * failure). Note that mem_pools_init() is called even when the context
+ * setup failed.
+ */
+static int
+gf_changelog_init_master()
+{
+    int ret = 0;
+
+    ret = gf_changelog_init_context();
+    mem_pools_init();
+
+    return ret;
+}
+
+/**
+ * Bring up both RPC endpoints for a brick connection and send the probe.
+ *
+ * @this:  library xlator.
+ * @entry: connection entry; its listener and prober slots are filled in.
+ * @proc:  procedure index to invoke for the probe.
+ *
+ * Returns 0 on success, -1 on failure.
+ *
+ * TODO: cleanup clnt/svc on failure
+ */
+int
+gf_changelog_setup_rpc(xlator_t *this, gf_changelog_t *entry, int proc)
+{
+    rpcsvc_t *listener = NULL;
+    struct rpc_clnt *prober = NULL;
+
+    /*
+     * The connect-back socket comes first: a probe() RPC call to the
+     * server triggers a reverse connect to it.
+     */
+    listener = gf_changelog_reborp_init_rpc_listner(this, entry->brick,
+                                                    RPC_SOCK(entry), entry);
+    if (listener == NULL)
+        return -1;
+    RPC_REBORP(entry) = listener;
+
+    /* now the RPC client that carries the probe */
+    prober = gf_changelog_rpc_init(this, entry);
+    if (prober == NULL)
+        return -1;
+    RPC_PROBER(entry) = prober;
+
+    /*
+     * @FIXME
+     * no connection state machine yet, so simply wait a little before
+     * issuing the RPC call.
+     */
+    sleep(2);
+
+    /*
+     * Probe the changelog translator for the reverse connection. The
+     * client is of little use after a successful call, but it is left
+     * connected for any future RPC calls.
+     */
+    if (gf_changelog_invoke_rpc(this, entry, proc) != 0) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, CHANGELOG_LIB_MSG_INVOKE_RPC_FAILED,
+               "Could not initiate probe RPC, bailing out!!!");
+        return -1;
+    }
+
+    return 0;
+}
+
+/**
+ * Stop the callback invoker thread of an event list.
+ *
+ * Returns 0 once the thread is cleaned up (the back pointer to the owning
+ * entry is cleared), or -1 when the thread could not be cleaned up, in which
+ * case the event list is left untouched.
+ */
+int
+gf_cleanup_event(xlator_t *this, struct gf_event_list *ev)
+{
+    int err = gf_thread_cleanup(this, ev->invoker);
+
+    if (err == 0) {
+        ev->entry = NULL;
+        return 0;
+    }
+
+    gf_msg(this->name, GF_LOG_WARNING, -err, CHANGELOG_LIB_MSG_CLEANUP_ERROR,
+           "cannot cleanup callback invoker thread."
+           " Not freeing resources");
+    return -1;
+}
+
+/**
+ * Initialize the event list embedded in @entry and start its callback
+ * invoker thread.
+ *
+ * Sets up the list's mutex, condition variable and event queue, selects the
+ * ordered or unordered pick/queue routines depending on
+ * GF_NEED_ORDERED_EVENTS(entry), and finally spawns the invoker thread.
+ *
+ * Returns 0 on success, -1 on failure (the mutex/condvar initialized so far
+ * are destroyed again).
+ */
+static int
+gf_init_event(gf_changelog_t *entry)
+{
+    int ret = 0;
+    struct gf_event_list *ev = NULL;
+
+    ev = &entry->event;
+    ev->entry = entry;
+
+    ret = pthread_mutex_init(&ev->lock, NULL);
+    if (ret != 0)
+        goto error_return;
+    ret = pthread_cond_init(&ev->cond, NULL);
+    if (ret != 0)
+        goto cleanup_mutex;
+    INIT_LIST_HEAD(&ev->events);
+
+    ev->next_seq = 0; /* bootstrap sequencing */
+
+    if (GF_NEED_ORDERED_EVENTS(entry)) {
+        entry->pickevent = pick_event_ordered;
+        entry->queueevent = queue_ordered_event;
+    } else {
+        entry->pickevent = pick_event_unordered;
+        entry->queueevent = queue_unordered_event;
+    }
+
+    /* the thread is started last, once everything it is handed is set up */
+    ret = gf_thread_create(&ev->invoker, NULL, gf_changelog_callback_invoker,
+                           ev, "clogcbki");
+    if (ret != 0) {
+        entry->pickevent = NULL;
+        entry->queueevent = NULL;
+        goto cleanup_cond;
+    }
+
+    return 0;
+
+cleanup_cond:
+    (void)pthread_cond_destroy(&ev->cond);
+cleanup_mutex:
+    (void)pthread_mutex_destroy(&ev->lock);
+error_return:
+    return -1;
+}
+
+/**
+ * Placeholder for tearing down a single brick connection. It currently does
+ * nothing and always returns 0.
+ *
+ * TODO:
+ *  - cleanup invoker thread
+ *  - cleanup event list
+ *  - destroy rpc{-clnt, svc}
+ */
+int
+gf_cleanup_brick_connection(xlator_t *this, gf_changelog_t *entry)
+{
+    return 0;
+}
+
+/* Placeholder for tearing down all brick connections; currently a no-op
+ * that always returns 0. */
+int
+gf_cleanup_connections(xlator_t *this)
+{
+    return 0;
+}
+
+/**
+ * Create and register the connection entry for one brick.
+ *
+ * Validates the mandatory callbacks of @brick, allocates the entry, starts
+ * its event invoker thread, runs the consumer's init hook, links the entry
+ * into the private connection list and finally sets up the RPC endpoints
+ * and probes the brick.
+ *
+ * @this:    library xlator (its ->private is the gf_private_t).
+ * @brick:   brick specification; callback, init and fini must be set.
+ * @ordered: whether events must be delivered in order.
+ * @xl:      consumer xlator stored in the entry for callback invocation.
+ *
+ * Returns 0 on success, -1 on failure.
+ *
+ * NOTE(review): when the RPC setup fails half way, the listener/client
+ * created for the entry are not destroyed (see the TODO on
+ * gf_changelog_setup_rpc()), and the object returned by brick->init() is
+ * not released here.
+ */
+static int
+gf_setup_brick_connection(xlator_t *this, struct gf_brick_spec *brick,
+                          gf_boolean_t ordered, void *xl)
+{
+    int ret = 0;
+    gf_private_t *priv = NULL;
+    gf_changelog_t *entry = NULL;
+
+    priv = this->private;
+
+    if (!brick->callback || !brick->init || !brick->fini)
+        goto error_return;
+
+    entry = GF_CALLOC(1, sizeof(*entry), gf_changelog_mt_libgfchangelog_t);
+    if (!entry)
+        goto error_return;
+    INIT_LIST_HEAD(&entry->list);
+
+    LOCK_INIT(&entry->statelock);
+    entry->connstate = GF_CHANGELOG_CONN_STATE_PENDING;
+
+    entry->notify = brick->filter;
+    if (snprintf(entry->brick, PATH_MAX, "%s", brick->brick_path) >= PATH_MAX)
+        goto free_entry;
+
+    entry->this = this;
+    entry->invokerxl = xl;
+
+    entry->ordered = ordered;
+    ret = gf_init_event(entry);
+    if (ret)
+        goto free_entry;
+
+    entry->fini = brick->fini;
+    entry->callback = brick->callback;
+    entry->connected = brick->connected;
+    entry->disconnected = brick->disconnected;
+
+    entry->ptr = brick->init(this, brick);
+    if (!entry->ptr)
+        goto cleanup_event;
+    priv->api = entry->ptr; /* pointer to API, if required */
+
+    pthread_mutex_lock(&priv->lock);
+    {
+        list_add_tail(&entry->list, &priv->connections);
+    }
+    pthread_mutex_unlock(&priv->lock);
+
+    ret = gf_changelog_setup_rpc(this, entry, CHANGELOG_RPC_PROBE_FILTER);
+    if (ret)
+        goto cleanup_event;
+    return 0;
+
+cleanup_event:
+    if (gf_cleanup_event(this, &entry->event)) {
+        /*
+         * The invoker thread could not be stopped and still references
+         * the entry: unlink it so nobody else finds it, but do not free
+         * it. Leaking is the lesser evil compared to a use-after-free.
+         */
+        pthread_mutex_lock(&priv->lock);
+        {
+            list_del_init(&entry->list);
+        }
+        pthread_mutex_unlock(&priv->lock);
+        goto error_return;
+    }
+free_entry:
+    gf_msg_debug(this->name, 0, "freeing entry %p", entry);
+    /*
+     * FIXME: kludge for now. The entry may or may not be on the
+     * connection list at this point; unlinking a self-linked node is
+     * harmless. The list is shared, so take the lock.
+     */
+    pthread_mutex_lock(&priv->lock);
+    {
+        list_del(&entry->list);
+    }
+    pthread_mutex_unlock(&priv->lock);
+    LOCK_DESTROY(&entry->statelock);
+    GF_FREE(entry);
+error_return:
+    return -1;
+}
+
+/* Public wrapper that registers one brick; it forwards all arguments to
+ * gf_setup_brick_connection() and returns its result (0 or -1). */
+int
+gf_changelog_register_brick(xlator_t *this, struct gf_brick_spec *brick,
+                            gf_boolean_t ordered, void *xl)
+{
+    return gf_setup_brick_connection(this, brick, ordered, xl);
+}
+
+/**
+ * Initialize logging for the library context and set the log level.
+ * A @loglevel of -1 selects GF_LOG_INFO.
+ *
+ * Returns 0 on success, -1 when the log could not be initialized.
+ */
+static int
+gf_changelog_setup_logging(xlator_t *this, char *logfile, int loglevel)
+{
+    int level = loglevel;
+
+    /* a NULL ident makes syslog use its default ident */
+    if (gf_log_init(this->ctx, logfile, NULL) != 0)
+        return -1;
+
+    if (level == -1)
+        level = GF_LOG_INFO;
+
+    gf_log_set_loglevel(this->ctx, level);
+    return 0;
+}
+
+/**
+ * Attach a context and private state to the library's master xlator.
+ *
+ * When @xl is NULL or has no context, the library creates its own context
+ * (gf_changelog_init_master()) and uses THIS from then on. In the
+ * library-owned case (@xl == NULL) an RPC poller thread is spawned as well.
+ *
+ * @master: the library xlator being set up (shadows the global of the same
+ *          name).
+ * @xl:     consumer xlator, or NULL when the library owns "this".
+ *
+ * Returns 0 on success, non-zero on failure. On failure master->private is
+ * left unset and the private state allocated here is fully released.
+ */
+static int
+gf_changelog_set_master(xlator_t *master, void *xl)
+{
+    int32_t ret = 0;
+    xlator_t *this = NULL;
+    xlator_t *old_this = NULL; /* used by SAVE_THIS()/RESTORE_THIS() */
+    gf_private_t *priv = NULL;
+
+    this = xl;
+    if (!this || !this->ctx) {
+        ret = gf_changelog_init_master();
+        if (ret)
+            return -1;
+        this = THIS;
+    }
+
+    master->ctx = this->ctx;
+
+    INIT_LIST_HEAD(&master->volume_options);
+    SAVE_THIS(THIS);
+
+    ret = xlator_mem_acct_init(THIS, gf_changelog_mt_end);
+    if (ret != 0)
+        goto restore_this;
+
+    priv = gf_changelog_alloc_priv();
+    if (!priv) {
+        ret = -1;
+        goto restore_this;
+    }
+
+    if (!xl) {
+        /* poller thread */
+        ret = gf_thread_create(&priv->poller, NULL, changelog_rpc_poller, THIS,
+                               "clogpoll");
+        if (ret != 0) {
+            /* undo gf_changelog_alloc_priv() completely, not just the
+             * memory: the mutex and condvar were initialized there */
+            (void)pthread_cond_destroy(&priv->cond);
+            (void)pthread_mutex_destroy(&priv->lock);
+            GF_FREE(priv);
+            gf_msg(master->name, GF_LOG_ERROR, 0,
+                   CHANGELOG_LIB_MSG_THREAD_CREATION_FAILED,
+                   "failed to spawn poller thread");
+            goto restore_this;
+        }
+    }
+
+    master->private = priv;
+
+restore_this:
+    RESTORE_THIS();
+
+    return ret;
+}
+
+/**
+ * Initialize the library: allocate the global master xlator, give it a
+ * name, attach context/private state and start the connection janitor
+ * thread.
+ *
+ * @xl: consumer xlator, or NULL when the library owns "this".
+ *
+ * Returns 0 on success (also when the library is already initialized, i.e.
+ * master is non-NULL) and -1 on failure, in which case master is released
+ * and reset to NULL.
+ */
+int
+gf_changelog_init(void *xl)
+{
+    int ret = 0;
+    gf_private_t *priv = NULL;
+
+    /* already initialized */
+    if (master)
+        return 0;
+
+    master = calloc(1, sizeof(*master));
+    if (!master)
+        goto error_return;
+
+    master->name = strdup("gfchangelog");
+    if (!master->name)
+        goto dealloc_master;
+
+    ret = gf_changelog_set_master(master, xl);
+    if (ret)
+        goto dealloc_name;
+
+    priv = master->private;
+    ret = gf_thread_create(&priv->connectionjanitor, NULL,
+                           gf_changelog_connection_janitor, master, "clogjan");
+    if (ret != 0) {
+        /* TODO: cleanup priv, mutex (poller thread for !xl) */
+        goto dealloc_name;
+    }
+
+    return 0;
+
+dealloc_name:
+    free(master->name);
+dealloc_master:
+    free(master);
+    master = NULL;
+error_return:
+    return -1;
+}
+
+/**
+ * Generic, callback based registration of @count bricks.
+ *
+ * @bricks:  array of brick specifications.
+ * @count:   number of elements in @bricks.
+ * @ordered: non-zero when events must be delivered in order.
+ * @logfile: log file handed to the logging setup.
+ * @lvl:     log level (-1 selects the default).
+ * @xl:      consumer xlator, or NULL.
+ *
+ * Registration stops at the first brick that fails. Returns 0 when every
+ * brick was registered, -1 otherwise. THIS is restored before returning.
+ */
+int
+gf_changelog_register_generic(struct gf_brick_spec *bricks, int count,
+                              int ordered, char *logfile, int lvl, void *xl)
+{
+    int status = -1;
+    xlator_t *this = NULL;
+    xlator_t *old_this = NULL; /* used by SAVE_THIS()/RESTORE_THIS() */
+    struct gf_brick_spec *brick = NULL;
+    gf_boolean_t need_order = _gf_false;
+
+    SAVE_THIS(xl);
+
+    this = THIS;
+    if (!this)
+        goto restore;
+
+    if (gf_changelog_setup_logging(this, logfile, lvl))
+        goto restore;
+
+    if (ordered)
+        need_order = _gf_true;
+
+    for (brick = bricks; count--; brick++) {
+        gf_smsg(this->name, GF_LOG_INFO, 0,
+                CHANGELOG_LIB_MSG_NOTIFY_REGISTER_INFO, "brick=%s",
+                brick->brick_path, "notify_filter=%d", brick->filter, NULL);
+
+        if (gf_changelog_register_brick(this, brick, need_order, xl) != 0) {
+            gf_msg(this->name, GF_LOG_ERROR, 0,
+                   CHANGELOG_LIB_MSG_NOTIFY_REGISTER_FAILED,
+                   "Error registering with changelog xlator");
+            /* drop whatever was registered so far */
+            gf_cleanup_connections(this);
+            goto restore;
+        }
+    }
+
+    status = 0;
+
+restore:
+    RESTORE_THIS();
+    return status;
+}
+
+/**
+ * @API
+ *  gf_changelog_register()
+ *
+ * This is _NOT_ a generic register API. It's a special API to handle
+ * updates at a journal granularity. This is used by consumers wanting
+ * to process persistent journal such as geo-replication via a set of
+ * APIs. All of this is required to maintain backward compatibility.
+ * Owner specific private data is stored in ->api (in gf_private_t),
+ * which is used by APIs to access its private data. This limits
+ * the library access to a single brick, but that's how it used to
+ * be anyway. Furthermore, this API solely _owns_ "this", therefore
+ * callers already having a notion of "this" are expected to use the
+ * newer API.
+ *
+ * Newer applications wanting to use this library need not face this
+ * limitation and can rely on the much more feature rich generic register
+ * API, which is purely callback based.
+ *
+ * NOTE: @max_reconnects is not used but required for backward compat.
+ *
+ * For generic API, refer gf_changelog_register_generic().
+ */
+int
+gf_changelog_register(char *brick_path, char *scratch_dir, char *log_file,
+                      int log_level, int max_reconnects)
+{
+    struct gf_brick_spec spec = {
+        0,
+    };
+
+    /* the library must have been initialized first */
+    if (master == NULL)
+        return -1;
+    THIS = master;
+
+    /* what to watch, and the scratch directory for the journal hooks */
+    spec.brick_path = brick_path;
+    spec.filter = CHANGELOG_OP_TYPE_JOURNAL;
+    spec.ptr = scratch_dir;
+
+    /* journal consumer hooks */
+    spec.init = gf_changelog_journal_init;
+    spec.fini = gf_changelog_journal_fini;
+    spec.callback = gf_changelog_handle_journal;
+    spec.connected = gf_changelog_journal_connect;
+    spec.disconnected = gf_changelog_journal_disconnect;
+
+    /* a single brick, ordered delivery, library-owned "this" */
+    return gf_changelog_register_generic(&spec, 1, 1, log_file, log_level,
+                                         NULL);
+}
diff --git a/xlators/features/changelog/lib/src/gf-history-changelog.c b/xlators/features/changelog/lib/src/gf-history-changelog.c
new file mode 100644
index 00000000000..a16219f3664
--- /dev/null
+++ b/xlators/features/changelog/lib/src/gf-history-changelog.c
@@ -0,0 +1,1020 @@
+#include <errno.h>
+#include <dirent.h>
+#include <stddef.h>
+#include <sys/types.h>
+
+#ifndef _GNU_SOURCE
+#define _GNU_SOURCE
+#endif
+#include <string.h>
+
+#include <glusterfs/globals.h>
+#include <glusterfs/glusterfs.h>
+#include <glusterfs/logging.h>
+#include <glusterfs/syscall.h>
+
+#include "gf-changelog-helpers.h"
+#include "gf-changelog-journal.h"
+
+/* from the changelog translator */
+#include "changelog-misc.h"
+#include "changelog-lib-messages.h"
+#include "changelog-mem-types.h"
+
+/**
+ * @API
+ * gf_history_changelog_done:
+ *    Move processed history changelog file from .processing
+ *    to .processed
+ *
+ * ARGUMENTS:
+ *    file(IN): path to processed history changelog file in
+ *    .processing directory.
+ *
+ * RETURN VALUE:
+ *    0: On success.
+ *   -1: On error.
+ */
+int
+gf_history_changelog_done(char *file)
+{
+    int ret = -1;
+    int len = 0;
+    char *buffer = NULL;
+    xlator_t *this = NULL;
+    gf_changelog_journal_t *jnl = NULL;
+    gf_changelog_journal_t *hist_jnl = NULL;
+    char to_path[PATH_MAX] = {
+        0,
+    };
+
+    /* default error for all the argument/state checks below */
+    errno = EINVAL;
+
+    this = THIS;
+    if (!this)
+        goto out;
+
+    jnl = (gf_changelog_journal_t *)GF_CHANGELOG_GET_API_PTR(this);
+    if (!jnl)
+        goto out;
+
+    hist_jnl = jnl->hist_jnl;
+    if (!hist_jnl)
+        goto out;
+
+    if (!file || !strlen(file))
+        goto out;
+
+    /* make sure 'file' is inside ->jnl_working_dir */
+    buffer = realpath(file, NULL);
+    if (!buffer)
+        goto out;
+
+    if (strncmp(hist_jnl->jnl_working_dir, buffer,
+                strlen(hist_jnl->jnl_working_dir)))
+        goto out;
+
+    /* a truncated destination would rename the file to a wrong name */
+    len = snprintf(to_path, PATH_MAX, "%s%s", hist_jnl->jnl_processed_dir,
+                   basename(buffer));
+    if ((len < 0) || (len >= PATH_MAX)) {
+        errno = ENAMETOOLONG;
+        goto out;
+    }
+
+    gf_msg_debug(this->name, 0, "moving %s to processed directory", file);
+    ret = sys_rename(buffer, to_path);
+    if (ret) {
+        gf_smsg(this->name, GF_LOG_ERROR, errno,
+                CHANGELOG_LIB_MSG_RENAME_FAILED, "from=%s", file, "to=%s",
+                to_path, NULL);
+        goto out;
+    }
+
+    ret = 0;
+
+out:
+    if (buffer)
+        free(buffer); /* allocated by realpath() */
+    return ret;
+}
+
+/**
+ * @API
+ *  gf_history_changelog_start_fresh:
+ *     For a set of changelogs, start from the beginning.
+ *     It truncates the history tracker fd.
+ *
+ *  RETURN VALUES:
+ *     0: On success.
+ *    -1: On error.
+ */
+int
+gf_history_changelog_start_fresh()
+{
+    xlator_t *this = THIS;
+    gf_changelog_journal_t *jnl = NULL;
+    gf_changelog_journal_t *history = NULL;
+
+    if (this == NULL)
+        return -1;
+
+    /* default error for the state checks below */
+    errno = EINVAL;
+
+    jnl = (gf_changelog_journal_t *)GF_CHANGELOG_GET_API_PTR(this);
+    if (jnl == NULL)
+        return -1;
+
+    history = jnl->hist_jnl;
+    if (history == NULL)
+        return -1;
+
+    /* empty the tracker so consumption restarts from the beginning */
+    return gf_ftruncate(history->jnl_fd, 0) ? -1 : 0;
+}
+
+/**
+ * @API
+ *  gf_history_changelog_next_change:
+ *     Return the next history changelog file entry. Zero means all
+ *     history changelogs are consumed.
+ *
+ *  ARGUMENTS:
+ *     bufptr(OUT): Path to unprocessed history changelog file
+ *                  from tracker file.
+ *     maxlen(IN): Usually PATH_MAX.
+ *
+ *  RETURN VALUES:
+ *     size: On success.
+ *     -1  : On error.
+ */
+ssize_t
+gf_history_changelog_next_change(char *bufptr, size_t maxlen)
+{
+    ssize_t nread = -1;
+    xlator_t *this = NULL;
+    gf_changelog_journal_t *jnl = NULL;
+    gf_changelog_journal_t *history = NULL;
+    char line[PATH_MAX] = {
+        0,
+    };
+
+    /* the local line buffer bounds what can be read */
+    if (maxlen > PATH_MAX) {
+        errno = ENAMETOOLONG;
+        return -1;
+    }
+
+    errno = EINVAL;
+
+    this = THIS;
+    if (this == NULL)
+        return -1;
+
+    jnl = (gf_changelog_journal_t *)GF_CHANGELOG_GET_API_PTR(this);
+    if (jnl == NULL)
+        return -1;
+
+    history = jnl->hist_jnl;
+    if (history == NULL)
+        return -1;
+
+    nread = gf_readline(history->jnl_fd, line, maxlen);
+    if (nread < 0)
+        return -1;
+
+    if (nread > 0) {
+        /* hand back the line with its last byte replaced by a NUL */
+        memcpy(bufptr, line, nread - 1);
+        bufptr[nread - 1] = '\0';
+    }
+
+    return nread;
+}
+
+/**
+ * @API
+ *  gf_history_changelog_scan:
+ *     Scan and generate a list of change entries.
+ *     Calling this API multiple times (without calling gf_changelog_done())
+ *     results in new changelog(s) being refreshed in the tracker file.
+ *     This call also acts as a cancellation point for the consumer.
+ *
+ *  RETURN VALUES:
+ *      +ve integer : success and keep scanning.(count of changelogs)
+ *      0           : success and done scanning.
+ *     -1           : error.
+ *
+ *  NOTE: After first 0 return call_get_next change for once more time
+ *        to empty the tracker
+ *
+ */
+ssize_t
+gf_history_changelog_scan()
+{
+    int tracker_fd = 0;
+    size_t off = 0;
+    xlator_t *this = NULL;
+    size_t nr_entries = 0;
+    gf_changelog_journal_t *jnl = NULL;
+    gf_changelog_journal_t *hist_jnl = NULL;
+    struct dirent *entry = NULL;
+    struct dirent scratch[2] = {
+        {
+            0,
+        },
+    };
+    char buffer[PATH_MAX] = {
+        0,
+    };
+    /*
+     * Function-static, so shared by every caller. It is set once
+     * hist_done reaches 0 and never reset in this function: from then on
+     * every call returns 0.
+     */
+    static int is_last_scan;
+
+    this = THIS;
+    if (!this)
+        goto out;
+
+    jnl = (gf_changelog_journal_t *)GF_CHANGELOG_GET_API_PTR(this);
+    if (!jnl)
+        goto out;
+    if (JNL_IS_API_DISCONNECTED(jnl)) {
+        errno = ENOTCONN;
+        goto out;
+    }
+
+    hist_jnl = jnl->hist_jnl;
+    if (!hist_jnl)
+        goto out;
+
+retry:
+    /* the previous pass was flagged as the last one: nothing more to scan */
+    if (is_last_scan == 1)
+        return 0;
+    /* hist_done == 0: this pass is the final one */
+    if (hist_jnl->hist_done == 0)
+        is_last_scan = 1;
+
+    errno = EINVAL;
+    /* hist_done == -1 is treated as an error */
+    if (hist_jnl->hist_done == -1)
+        goto out;
+
+    tracker_fd = hist_jnl->jnl_fd;
+
+    /* rebuild the tracker file from scratch on every pass */
+    if (gf_ftruncate(tracker_fd, 0))
+        goto out;
+
+    rewinddir(hist_jnl->jnl_dir);
+
+    for (;;) {
+        errno = 0;
+        entry = sys_readdir(hist_jnl->jnl_dir, scratch);
+        if (!entry || errno != 0)
+            break;
+
+        if (strcmp(basename(entry->d_name), ".") == 0 ||
+            strcmp(basename(entry->d_name), "..") == 0)
+            continue;
+
+        nr_entries++;
+
+        /* one tracker line: <processing dir><file name>\n */
+        GF_CHANGELOG_FILL_BUFFER(hist_jnl->jnl_processing_dir, buffer, off,
+                                 strlen(hist_jnl->jnl_processing_dir));
+        GF_CHANGELOG_FILL_BUFFER(entry->d_name, buffer, off,
+                                 strlen(entry->d_name));
+        GF_CHANGELOG_FILL_BUFFER("\n", buffer, off, 1);
+
+        if (gf_changelog_write(tracker_fd, buffer, off) != off) {
+            gf_msg(this->name, GF_LOG_ERROR, 0, CHANGELOG_LIB_MSG_WRITE_FAILED,
+                   "error writing changelog filename"
+                   " to tracker file");
+            /* entry stays non-NULL, so the code below returns -1 */
+            break;
+        }
+        off = 0;
+    }
+
+    gf_msg_debug(this->name, 0, "hist_done %d, is_last_scan: %d",
+                 hist_jnl->hist_done, is_last_scan);
+
+    /*
+     * entry == NULL: the directory walk ended.
+     * NOTE(review): a readdir failure that returns NULL with errno set
+     * also lands here and is treated like end-of-directory.
+     */
+    if (!entry) {
+        if (gf_lseek(tracker_fd, 0, SEEK_SET) != -1) {
+            if (nr_entries > 0)
+                return nr_entries;
+            else {
+                /* nothing there yet: wait and scan again */
+                sleep(1);
+                goto retry;
+            }
+        }
+    }
+out:
+    return -1;
+}
+
+/*
+ * Gets timestamp value at the changelog path at index.
+ * Returns 0 on success(updates given time-stamp), -1 on failure.
+ */
+int
+gf_history_get_timestamp(int fd, int index, int len, unsigned long *ts)
+{
+    xlator_t *this = NULL;
+    ssize_t n_read = -1;
+    char path_buf[PATH_MAX] = {
+        0,
+    };
+    off_t offset = 0;
+    unsigned long value = 0;
+
+    this = THIS;
+    if (!this) {
+        return -1;
+    }
+
+    /*
+     * The record must fit into path_buf with room for a terminating NUL
+     * and be long enough to carry the trailing timestamp; otherwise the
+     * read below would overflow the buffer or the parse would start
+     * before it.
+     */
+    if ((index < 0) || (len < TIMESTAMP_LENGTH) || (len >= PATH_MAX)) {
+        gf_msg(this->name, GF_LOG_ERROR, EINVAL, CHANGELOG_LIB_MSG_READ_ERROR,
+               "invalid index or record length for htime file");
+        return -1;
+    }
+
+    /* records are 'len' bytes plus a NUL separator; widen before
+     * multiplying so large indexes do not overflow int */
+    offset = (off_t)index * ((off_t)len + 1);
+
+    n_read = sys_pread(fd, path_buf, len, offset);
+    if (n_read < 0) {
+        gf_msg(this->name, GF_LOG_ERROR, errno, CHANGELOG_LIB_MSG_READ_ERROR,
+               "could not read from htime file");
+        return -1;
+    }
+
+    /* the timestamp is the last TIMESTAMP_LENGTH bytes of the record; a
+     * short read leaves zeroes there and fails the parse */
+    if (sscanf(path_buf + len - TIMESTAMP_LENGTH, "%lu", &value) != 1) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, CHANGELOG_LIB_MSG_READ_ERROR,
+               "could not parse timestamp from htime file");
+        return -1;
+    }
+
+    *ts = value;
+    return 0;
+}
+
+/*
+ * Function to ensure correctness of search
+ * Checks whether @value is there next to @target_index or not
+ */
+int
+gf_history_check(int fd, int target_index, unsigned long value, int len)
+{
+    unsigned long upper = 0; /* timestamp at target_index */
+    unsigned long lower = 0; /* timestamp at target_index - 1 */
+
+    if (gf_history_get_timestamp(fd, target_index, len, &upper) == -1)
+        return -1;
+
+    /* the first entry has no predecessor: only the upper bound applies */
+    if (target_index == 0)
+        return (value <= upper) ? 0 : -1;
+
+    if (gf_history_get_timestamp(fd, target_index - 1, len, &lower) == -1)
+        return -1;
+
+    /* @value must fall in (lower, upper] */
+    return ((value <= upper) && (value > lower)) ? 0 : -1;
+}
+
+/*
+ * This is a "binary search" based search function which checks neighbours
+ * for in-range availability of the value to be searched and provides the
+ * index at which the changelog file nearest to the requested timestamp(value)
+ * can be read from.
+ *
+ * Actual offset can be calculated as (index* (len+1) ).
+ * "1" is because the changelog paths are null terminated.
+ *
+ * @path        : Htime file to search in
+ * @value       : time stamp to search
+ * @from        : start index to search
+ * @to          : end index to search
+ * @len         : length of the fixed-length strings separated by null
+ */
+
+int
+gf_history_b_search(int fd, unsigned long value, unsigned long from,
+                    unsigned long to, int len)
+{
+    int m_index = -1;
+    unsigned long cur_value = 0;
+    unsigned long ts1 = 0;
+    int ret = 0;
+
+    /* midpoint of the current range (stored in an int) */
+    m_index = (from + to) / 2;
+
+    if ((to - from) <= 1) {
+        /* either one or 2 changelogs left */
+        if (to != from) {
+            /* check if value is less or greater than to
+             * return accordingly
+             */
+            ret = gf_history_get_timestamp(fd, from, len, &ts1);
+            if (ret == -1)
+                goto out;
+            if (ts1 >= value) {
+                /* actually comparison should be
+                 * exactly == but considering
+                 *
+                 * case of only 2 changelogs in htime file
+                 */
+                return from;
+            } else
+                return to;
+        } else
+            return to;
+    }
+
+    ret = gf_history_get_timestamp(fd, m_index, len, &cur_value);
+    if (ret == -1)
+        goto out;
+    if (cur_value == value) {
+        /* exact hit */
+        return m_index;
+    } else if (value > cur_value) {
+        /* value lies to the right: if the right neighbour is already
+         * past it, that neighbour is the answer; otherwise recurse into
+         * the upper half */
+        ret = gf_history_get_timestamp(fd, m_index + 1, len, &cur_value);
+        if (ret == -1)
+            goto out;
+        if (value < cur_value)
+            return m_index + 1;
+        else
+            return gf_history_b_search(fd, value, m_index + 1, to, len);
+    } else {
+        if (m_index == 0) {
+            /*  we are sure that values exists
+             *  in this htime file
+             */
+            return 0;
+        } else {
+            /* value lies to the left: if the left neighbour is before
+             * it, the midpoint is the answer; otherwise recurse into the
+             * lower half */
+            ret = gf_history_get_timestamp(fd, m_index - 1, len, &cur_value);
+            if (ret == -1)
+                goto out;
+            if (value > cur_value) {
+                return m_index;
+            } else
+                return gf_history_b_search(fd, value, from, m_index - 1, len);
+        }
+    }
+out:
+    /* reached only when a timestamp could not be read */
+    return -1;
+}
+
+/*
+ * Description: Tells whether a changelog path is usable, judged by the
+ *              base name of the path: a base name containing the
+ *              lower-case string "changelog" marks the file as not usable.
+ *
+ * Returns:
+ * 1 : usable (base name does not contain "changelog")
+ * 0 : not usable (base name contains "changelog")
+ */
+int
+gf_is_changelog_usable(char *cl_path)
+{
+    const char marker[] = "changelog";
+    char *name = basename(cl_path);
+
+    return (strstr(name, marker) == NULL) ? 1 : 0;
+}
+
+/**
+ * Thread body used by gf_history_consume(): read one changelog path from
+ * the htime file at the offset stored in @data and, when the path is
+ * usable, parse that changelog.
+ *
+ * @data: a gf_changelog_consume_data_t; ->retval receives the outcome
+ *        (0 on success, -1 on read or parse failure).
+ *
+ * Always returns NULL; the result is reported through ->retval.
+ */
+void *
+gf_changelog_consume_wrap(void *data)
+{
+    int ret = -1;
+    ssize_t nread = 0;
+    xlator_t *this = NULL;
+    gf_changelog_consume_data_t *ccd = NULL;
+
+    ccd = (gf_changelog_consume_data_t *)data;
+    this = ccd->this;
+
+    /* assume failure until everything went through */
+    ccd->retval = -1;
+
+    nread = sys_pread(ccd->fd, ccd->changelog, PATH_MAX - 1, ccd->offset);
+    if (nread < 0) {
+        gf_msg(this->name, GF_LOG_ERROR, errno, CHANGELOG_LIB_MSG_READ_ERROR,
+               "cannot read from history metadata file");
+        goto out;
+    }
+
+    /* TODO: handle short reads and EOF. */
+    /* unusable changelogs are skipped and still count as success */
+    if (gf_is_changelog_usable(ccd->changelog) == 1) {
+        ret = gf_changelog_consume(ccd->this, ccd->jnl, ccd->changelog,
+                                   _gf_true);
+        if (ret) {
+            gf_smsg(this->name, GF_LOG_ERROR, 0, CHANGELOG_LIB_MSG_PARSE_ERROR,
+                    "name=%s", ccd->changelog, NULL);
+            goto out;
+        }
+    }
+    ccd->retval = 0;
+
+out:
+    return NULL;
+}
+
+/**
+ * "gf_history_consume" is a worker function for history.
+ * parses and moves changelogs files from index "from"
+ * to index "to" in open htime file whose fd is "fd".
+ */
+
+#define MAX_PARALLELS 10
+
+/**
+ * Thread body that parses and publishes the history changelogs with htime
+ * indexes [from, to], working in batches of up to n_parallel consumer
+ * threads. Each batch is joined before its results are published in index
+ * order; after the first failure nothing more is published.
+ *
+ * @data: a gf_changelog_history_data_t. It is freed, and its htime fd is
+ *        closed, before returning.
+ *
+ * On completion hist_jnl->hist_done is set to 0 when everything was
+ * published and to -1 otherwise. Always returns NULL.
+ */
+void *
+gf_history_consume(void *data)
+{
+    xlator_t *this = NULL;
+    gf_changelog_journal_t *jnl = NULL;
+    gf_changelog_journal_t *hist_jnl = NULL;
+    int ret = 0;
+    int iter = 0;
+    int fd = -1;
+    int from = -1;
+    int to = -1;
+    int len = -1;
+    int n_parallel = 0;
+    int n_envoked = 0;
+    gf_boolean_t publish = _gf_true;
+    pthread_t th_id[MAX_PARALLELS] = {
+        0,
+    };
+    gf_changelog_history_data_t *hist_data = NULL;
+    gf_changelog_consume_data_t ccd[MAX_PARALLELS] = {
+        {0},
+    };
+    gf_changelog_consume_data_t *curr = NULL;
+
+    hist_data = (gf_changelog_history_data_t *)data;
+    if (hist_data == NULL) {
+        ret = -1;
+        goto out;
+    }
+
+    fd = hist_data->htime_fd;
+    from = hist_data->from;
+    to = hist_data->to;
+    len = hist_data->len;
+    n_parallel = hist_data->n_parallel;
+
+    /*
+     * th_id[] and ccd[] hold MAX_PARALLELS slots, and a batch size below
+     * one would never advance 'from'. Keep the degree of parallelism
+     * inside the usable range.
+     */
+    if (n_parallel > MAX_PARALLELS)
+        n_parallel = MAX_PARALLELS;
+    else if (n_parallel < 1)
+        n_parallel = 1;
+
+    THIS = hist_data->this;
+    this = hist_data->this;
+    if (!this) {
+        ret = -1;
+        goto out;
+    }
+
+    jnl = (gf_changelog_journal_t *)GF_CHANGELOG_GET_API_PTR(this);
+    if (!jnl) {
+        ret = -1;
+        goto out;
+    }
+
+    hist_jnl = jnl->hist_jnl;
+    if (!hist_jnl) {
+        ret = -1;
+        goto out;
+    }
+
+    while (from <= to) {
+        n_envoked = 0;
+
+        /* spawn one consumer per changelog, up to n_parallel at a time */
+        for (iter = 0; (iter < n_parallel) && (from <= to); iter++) {
+            curr = &ccd[iter];
+
+            curr->this = this;
+            curr->jnl = hist_jnl;
+            curr->fd = fd;
+            curr->offset = from * (len + 1);
+
+            curr->retval = 0;
+            memset(curr->changelog, '\0', PATH_MAX);
+
+            ret = gf_thread_create(&th_id[iter], NULL,
+                                   gf_changelog_consume_wrap, curr,
+                                   "clogc%03hx", (iter + 1) & 0x3ff);
+            if (ret) {
+                gf_msg(this->name, GF_LOG_ERROR, ret,
+                       CHANGELOG_LIB_MSG_THREAD_CREATION_FAILED,
+                       "could not create consume-thread");
+                /*
+                 * NOTE(review): 'from' is not advanced, so the
+                 * outer loop retries this changelog; a persistent
+                 * failure keeps retrying.
+                 */
+                goto sync;
+            } else
+                n_envoked++;
+
+            from++;
+        }
+
+    sync:
+        /* join the batch and publish results in index order */
+        for (iter = 0; iter < n_envoked; iter++) {
+            ret = pthread_join(th_id[iter], NULL);
+            if (ret) {
+                publish = _gf_false;
+                gf_msg(this->name, GF_LOG_ERROR, ret,
+                       CHANGELOG_LIB_MSG_PTHREAD_JOIN_FAILED,
+                       "pthread_join() error");
+                /* try to join the rest */
+                continue;
+            }
+
+            if (publish == _gf_false)
+                continue;
+
+            /* check the result of THIS consumer, not always slot 0 */
+            curr = &ccd[iter];
+            if (curr->retval) {
+                publish = _gf_false;
+                gf_smsg(this->name, GF_LOG_ERROR, 0,
+                        CHANGELOG_LIB_MSG_PARSE_ERROR_CEASED, NULL);
+                continue;
+            }
+
+            ret = gf_changelog_publish(curr->this, curr->jnl, curr->changelog);
+            if (ret) {
+                publish = _gf_false;
+                gf_msg(this->name, GF_LOG_ERROR, 0,
+                       CHANGELOG_LIB_MSG_PUBLISH_ERROR,
+                       "publish error, ceased publishing...");
+            }
+        }
+    }
+
+    /* informing "parsing done". */
+    hist_jnl->hist_done = (publish == _gf_true) ? 0 : -1;
+
+out:
+    if (fd != -1)
+        (void)sys_close(fd);
+    GF_FREE(hist_data);
+    return NULL;
+}
+
+/**
+ * @API
+ * gf_history_changelog() : Get/parse historical changelogs and get them ready
+ * for consumption.
+ *
+ * Arguments:
+ * @changelog_dir : Directory location from where history changelogs are
+ * supposed to be consumed.
+ * @start: Unix timestamp FROM where changelogs should be consumed.
+ * @end: Unix timestamp TO where changelogs should be consumed.
+ * @n_parallel : degree of parallelism while changelog parsing.
+ * @actual_end : the end time till where changelogs are available.
+ *
+ * Return:
+ * Returns 0 on success (1 when only partial history is available); the last
+ *      time till where changelogs are available is stored in @actual_end.
+ * Returns a negative value on failure (error).
+ */
+
+/**
+ * Extract the timestamp range of one htime file: @min_ts from the file-name
+ * suffix, @max_ts and @total from the HTIME_KEY xattr; *@fd stays open.
+ * Returns:
+ *    0 : Success, -1 : Failure (*@fd may still be open; caller closes it)
+ *   -2 : Ignore this metadata file (not a regular file) and process next
+ */
+int
+gf_changelog_extract_min_max(const char *dname, const char *htime_dir, int *fd,
+                             unsigned long *total, unsigned long *min_ts,
+                             unsigned long *max_ts)
+{
+    int ret = -1;
+    xlator_t *this = NULL;
+    char htime_file[PATH_MAX] = {0};
+    struct stat stbuf = {0};
+    char *iter = NULL;
+    char x_value[30] = {0};
+
+    this = THIS;
+
+    snprintf(htime_file, PATH_MAX, "%s/%s", htime_dir, dname);
+
+    iter = (htime_file + strlen(htime_file) - TIMESTAMP_LENGTH);
+    sscanf(iter, "%lu", min_ts);
+
+    ret = sys_stat(htime_file, &stbuf);
+    if (ret) {
+        ret = -1;
+        gf_smsg(this->name, GF_LOG_ERROR, errno, CHANGELOG_LIB_MSG_HTIME_ERROR,
+                "op=stat", "path=%s", htime_file, NULL);
+        goto out;
+    }
+
+    /* ignore everything except regular files */
+    if (!S_ISREG(stbuf.st_mode)) {
+        ret = -2;
+        goto out;
+    }
+
+    *fd = open(htime_file, O_RDONLY);
+    if (*fd < 0) {
+        ret = -1;
+        gf_smsg(this->name, GF_LOG_ERROR, errno, CHANGELOG_LIB_MSG_HTIME_ERROR,
+                "op=open", "path=%s", htime_file, NULL);
+        goto out;
+    }
+
+    /* extract max timestamp; the last byte is kept as the NUL terminator */
+    ret = sys_fgetxattr(*fd, HTIME_KEY, x_value, sizeof(x_value) - 1);
+    if (ret < 0) {
+        ret = -1;
+        gf_smsg(this->name, GF_LOG_ERROR, errno,
+                CHANGELOG_LIB_MSG_GET_XATTR_FAILED, "path=%s", htime_file,
+                NULL);
+        goto out;
+    }
+
+    if (sscanf(x_value, "%lu:%lu", max_ts, total) != 2) {
+        ret = -1; /* malformed xattr: do not run on stale max/total values */
+        gf_smsg(this->name, GF_LOG_ERROR, 0,
+                CHANGELOG_LIB_MSG_GET_XATTR_FAILED, "path=%s", htime_file,
+                NULL);
+        goto out;
+    }
+    gf_smsg(this->name, GF_LOG_INFO, 0, CHANGELOG_LIB_MSG_MIN_MAX_INFO,
+            "min=%lu", *min_ts, "max=%lu", *max_ts, "total_changelogs=%lu",
+            *total, NULL);
+
+    ret = 0;
+
+out:
+    return ret;
+}
+
+/* gf_history_changelog stores the actual end time in @actual_end and
+ * spawns a detached thread that parses the historical changelogs. Returns:
+ *     0 : On success
+ *     1 : Successful, but partial historical changelogs available,
+ *         end time falls into different htime file or future time
+ *    -2 : Error, requested historical changelog not available, not
+ *         even partial
+ *    -1 : On any error
+ */
+int
+gf_history_changelog(char *changelog_dir, unsigned long start,
+                     unsigned long end, int n_parallel,
+                     unsigned long *actual_end)
+{
+    int ret = 0;
+    int len = -1;
+    int fd = -1;
+    int n_read = -1;
+    unsigned long min_ts = 0;
+    unsigned long max_ts = 0;
+    unsigned long end2 = 0;
+    unsigned long ts1 = 0;
+    unsigned long ts2 = 0;
+    unsigned long to = 0;
+    unsigned long from = 0;
+    unsigned long total_changelog = 0;
+    xlator_t *this = NULL;
+    gf_changelog_journal_t *jnl = NULL;
+    gf_changelog_journal_t *hist_jnl = NULL;
+    gf_changelog_history_data_t *hist_data = NULL;
+    DIR *dirp = NULL;
+    struct dirent *entry = NULL;
+    struct dirent scratch[2] = {{0}};
+    pthread_t consume_th = 0;
+    char htime_dir[PATH_MAX] = {0};
+    char buffer[PATH_MAX] = {0};
+    gf_boolean_t partial_history = _gf_false;
+
+    pthread_attr_t attr;
+
+    this = THIS;
+    if (!this)
+        return -1; /* nothing acquired yet: bypass the cleanup at out */
+
+    ret = pthread_attr_init(&attr);
+    if (ret != 0) {
+        gf_msg(this->name, GF_LOG_ERROR, errno, CHANGELOG_LIB_MSG_PTHREAD_ERROR,
+               "Pthread init failed");
+        return -1;
+    }
+
+    jnl = (gf_changelog_journal_t *)GF_CHANGELOG_GET_API_PTR(this);
+    if (!jnl) {
+        ret = -1;
+        goto out;
+    }
+
+    hist_jnl = (gf_changelog_journal_t *)jnl->hist_jnl;
+    if (!hist_jnl) {
+        ret = -1;
+        goto out;
+    }
+
+    gf_smsg(this->name, GF_LOG_INFO, 0, CHANGELOG_LIB_MSG_REQUESTING_INFO,
+            "start=%lu", start, "end=%lu", end, NULL);
+
+    /* basic sanity check */
+    if (start > end || n_parallel <= 0) {
+        gf_smsg(this->name, GF_LOG_ERROR, errno, CHANGELOG_LIB_MSG_HIST_FAILED,
+                "start=%lu", start, "end=%lu", end, "thread_count=%d",
+                n_parallel, NULL);
+        ret = -1;
+        goto out;
+    }
+
+    /* cap parallelism count */
+    if (n_parallel > MAX_PARALLELS)
+        n_parallel = MAX_PARALLELS;
+
+    CHANGELOG_FILL_HTIME_DIR(changelog_dir, htime_dir);
+
+    dirp = sys_opendir(htime_dir);
+    if (dirp == NULL) {
+        gf_smsg(this->name, GF_LOG_ERROR, errno, CHANGELOG_LIB_MSG_HTIME_ERROR,
+                "op=opendir", "path=%s", htime_dir, NULL);
+        ret = -1;
+        goto out;
+    }
+
+    for (;;) {
+        errno = 0;
+
+        entry = sys_readdir(dirp, scratch);
+
+        if (!entry || errno != 0) {
+            gf_smsg(this->name, GF_LOG_ERROR, errno,
+                    CHANGELOG_LIB_MSG_HIST_FAILED, "start=%lu", start,
+                    "end=%lu", end, NULL);
+            ret = -2;
+            break;
+        }
+
+        ret = gf_changelog_extract_min_max(entry->d_name, htime_dir, &fd,
+                                           &total_changelog, &min_ts, &max_ts);
+        if (ret) {
+            if (-2 == ret)
+                continue;
+            goto out;
+        }
+
+        if (start >= min_ts && start < max_ts) {
+            /**
+             * TODO: handle short reads later...
+             */
+            n_read = sys_read(fd, buffer, PATH_MAX - 1);
+            if (n_read < 0) {
+                ret = -1;
+                gf_msg(this->name, GF_LOG_ERROR, errno,
+                       CHANGELOG_LIB_MSG_READ_ERROR,
+                       "unable to read htime file");
+                goto out;
+            }
+
+            len = strlen(buffer);
+
+            /**
+             * search @start in the htime file returning it's index
+             * (@from)
+             */
+            from = gf_history_b_search(fd, start, 0, total_changelog - 1, len);
+
+            /* ensuring correctness of gf_b_search */
+            if (gf_history_check(fd, from, start, len) != 0) {
+                ret = -1;
+                gf_smsg(this->name, GF_LOG_ERROR, 0,
+                        CHANGELOG_LIB_MSG_GET_TIME_ERROR, "for=start",
+                        "start=%lu", start, "idx=%lu", from, NULL);
+                goto out;
+            }
+
+            end2 = (end <= max_ts) ? end : max_ts;
+
+            /* Check if end falls out of same HTIME file. The end
+             * falling to a different htime file or changelog
+             * disable-enable is detected only after 20 seconds.
+             * This is required because, applications generally
+             * asks historical changelogs till current time and
+             * it is possible changelog is not rolled over yet.
+             * So, buffer time of default rollover time plus 5
+             * seconds is subtracted.  If the application requests
+             * the end time with in half a minute of changelog
+             * disable, it's not detected as changelog disable and
+             * it's application's responsibility to retry after
+             * 20 seconds before confirming it as partial history.
+             */
+            if (end > max_ts + 20) {
+                partial_history = _gf_true;
+            }
+
+            /**
+             * search @end2 in htime file returning it's index (@to)
+             */
+            to = gf_history_b_search(fd, end2, 0, total_changelog - 1, len);
+
+            if (gf_history_check(fd, to, end2, len) != 0) {
+                ret = -1;
+                gf_smsg(this->name, GF_LOG_ERROR, 0,
+                        CHANGELOG_LIB_MSG_GET_TIME_ERROR, "for=end",
+                        "start=%lu", end2, "idx=%lu", to, NULL);
+                goto out;
+            }
+
+            ret = gf_history_get_timestamp(fd, from, len, &ts1);
+            if (ret == -1)
+                goto out;
+
+            ret = gf_history_get_timestamp(fd, to, len, &ts2);
+            if (ret == -1)
+                goto out;
+
+            gf_smsg(this->name, GF_LOG_INFO, 0, CHANGELOG_LIB_MSG_FINAL_INFO,
+                    "from=%lu", ts1, "to=%lu", ts2, "changes=%lu",
+                    (to - from + 1), NULL);
+
+            hist_data = GF_CALLOC(1, sizeof(gf_changelog_history_data_t),
+                                  gf_changelog_mt_history_data_t);
+            if (!hist_data) {
+                ret = -1;
+                goto out;
+            }
+
+            hist_data->htime_fd = fd;
+            hist_data->from = from;
+            hist_data->to = to;
+            hist_data->len = len;
+            hist_data->n_parallel = n_parallel;
+            hist_data->this = this;
+
+            ret = pthread_attr_setdetachstate(&attr, PTHREAD_CREATE_DETACHED);
+            if (ret != 0) {
+                gf_msg(this->name, GF_LOG_ERROR, ret,
+                       CHANGELOG_LIB_MSG_PTHREAD_ERROR,
+                       "unable to sets the detach"
+                       " state attribute");
+                ret = -1;
+                goto out;
+            }
+
+            /* spawn a thread for background parsing & publishing */
+            ret = gf_thread_create(&consume_th, &attr, gf_history_consume,
+                                   hist_data, "cloghcon");
+            if (ret) {
+                gf_msg(this->name, GF_LOG_ERROR, ret,
+                       CHANGELOG_LIB_MSG_THREAD_CREATION_FAILED,
+                       "creation of consume parent-thread"
+                       " failed.");
+                ret = -1;
+                goto out;
+            }
+
+            /* success: the consumer thread now owns hist_data and the fd */
+            goto out;
+
+        } else { /* end of range check */
+            gf_smsg(this->name, GF_LOG_ERROR, errno,
+                    CHANGELOG_LIB_MSG_HIST_FAILED, "start=%lu", start,
+                    "end=%lu", end, "chlog_min=%lu", min_ts, "chlog_max=%lu",
+                    max_ts, NULL);
+            /* not the file we need: release its fd before the next entry */
+            (void)sys_close(fd);
+            fd = -1;
+        }
+    } /* end of readdir() */
+
+out:
+    if (dirp != NULL)
+        (void)sys_closedir(dirp);
+    (void)pthread_attr_destroy(&attr);
+
+    if (ret < 0) {
+        if (fd != -1)
+            (void)sys_close(fd);
+        GF_FREE(hist_data);
+
+        return ret;
+    }
+
+    /* the consumer thread overwrites this with 0 or -1 when it finishes.
+     * NOTE(review): set after the thread was started - confirm no race. */
+    hist_jnl->hist_done = 1;
+    *actual_end = ts2;
+
+    if (partial_history) {
+        ret = 1;
+    }
+
+    return ret;
+}
diff --git a/xlators/features/changelog/src/Makefile.am b/xlators/features/changelog/src/Makefile.am
new file mode 100644
index 00000000000..eee7dfa238d
--- /dev/null
+++ b/xlators/features/changelog/src/Makefile.am
@@ -0,0 +1,29 @@
+xlator_LTLIBRARIES = changelog.la
+
+xlatordir = $(libdir)/glusterfs/$(PACKAGE_VERSION)/xlator/features
+
+noinst_HEADERS = changelog-helpers.h changelog-mem-types.h changelog-rt.h \
+	changelog-rpc-common.h changelog-misc.h changelog-encoders.h \
+	changelog-rpc.h changelog-ev-handle.h \
+	changelog-messages.h
+
+changelog_la_LDFLAGS = -module $(GF_XLATOR_DEFAULT_LDFLAGS)
+
+changelog_la_SOURCES = changelog.c changelog-rt.c changelog-helpers.c \
+	changelog-encoders.c changelog-rpc.c changelog-barrier.c \
+	changelog-rpc-common.c changelog-ev-handle.c
+changelog_la_LIBADD = $(top_builddir)/libglusterfs/src/libglusterfs.la \
+	$(top_builddir)/rpc/xdr/src/libgfxdr.la \
+	$(top_builddir)/rpc/rpc-lib/src/libgfrpc.la
+
+AM_CPPFLAGS = $(GF_CPPFLAGS) -I$(top_srcdir)/libglusterfs/src \
+	-I$(top_srcdir)/rpc/xdr/src -I$(top_builddir)/rpc/xdr/src \
+	-I$(top_srcdir)/rpc/rpc-lib/src \
+	-I$(top_srcdir)/rpc/rpc-transport/socket/src \
+	-I$(top_srcdir)/xlators/features/changelog/lib/src/ \
+	-fPIC -D_FILE_OFFSET_BITS=64 -D_GNU_SOURCE -D$(GF_HOST_OS) \
+	-DDATADIR=\"$(localstatedir)\"
+
+AM_CFLAGS = -Wall $(GF_CFLAGS)
+
+CLEANFILES =
diff --git a/xlators/features/changelog/src/changelog-barrier.c b/xlators/features/changelog/src/changelog-barrier.c
new file mode 100644
index 00000000000..0fb89ddb127
--- /dev/null
+++ b/xlators/features/changelog/src/changelog-barrier.c
@@ -0,0 +1,131 @@
+/*
+     Copyright (c) 2014 Red Hat, Inc. <http://www.redhat.com>
+     This file is part of GlusterFS.
+
+     This file is licensed to you under your choice of the GNU Lesser
+     General Public License, version 3 or any later version (LGPLv3 or
+     later), or the GNU General Public License, version 2 (GPLv2), in all
+     cases as published by the Free Software Foundation.
+*/
+
+#include "changelog-helpers.h"
+#include "changelog-messages.h"
+#include <glusterfs/call-stub.h>
+
+/* Append @stub to the tail of the barrier queue and bump the queue size.
+ * NOTE(review): the "__" prefix suggests the caller holds priv->lock;
+ * confirm against the call sites. */
+void
+__chlog_barrier_enqueue(xlator_t *this, call_stub_t *stub)
+{
+    changelog_priv_t *priv = this->private;
+
+    GF_ASSERT(priv);
+
+    /* new stubs always go to the tail of priv->queue */
+    list_add_tail(&stub->list, &priv->queue);
+    priv->queue_size += 1;
+}
+
+/* Detach and return the first stub of @queue, or NULL when it is empty.
+ * @this is only used to assert that the xlator's private data exists. */
+call_stub_t *
+__chlog_barrier_dequeue(xlator_t *this, struct list_head *queue)
+{
+    call_stub_t *stub = NULL;
+    changelog_priv_t *priv = this->private;
+
+    GF_ASSERT(priv);
+
+    if (list_empty(queue))
+        return NULL;
+
+    /* the first entry sits right after the list head */
+    stub = list_entry(queue->next, call_stub_t, list);
+    list_del_init(&stub->list);
+
+    return stub;
+}
+
+/* Drain @queue, resuming each stub in list order; logs start and finish */
+void
+chlog_barrier_dequeue_all(xlator_t *this, struct list_head *queue)
+{
+    call_stub_t *next = NULL;
+
+    gf_smsg(this->name, GF_LOG_INFO, 0, CHANGELOG_MSG_DEQUEUING_BARRIER_FOPS,
+            NULL);
+
+    for (next = __chlog_barrier_dequeue(this, queue); next != NULL;
+         next = __chlog_barrier_dequeue(this, queue))
+        call_resume(next);
+
+    gf_smsg(this->name, GF_LOG_INFO, 0,
+            CHANGELOG_MSG_DEQUEUING_BARRIER_FOPS_FINISHED, NULL);
+}
+
+/* Timer callback fired when the changelog barrier times out.
+ * Disables the barrier under priv->lock, then resumes the collected stubs
+ * after the lock has been released.
+ * @data is the xlator (see the gf_timer_call_after() call that arms it). */
+void
+chlog_barrier_timeout(void *data)
+{
+    xlator_t *this = data;
+    changelog_priv_t *priv = NULL;
+    struct list_head resumable = {0};
+
+    THIS = this;
+    priv = this->private;
+
+    INIT_LIST_HEAD(&resumable);
+
+    gf_smsg(this->name, GF_LOG_ERROR, 0, CHANGELOG_MSG_BARRIER_TIMEOUT, NULL);
+
+    /* collect the pending stubs on the local list while holding the lock */
+    LOCK(&priv->lock);
+    {
+        __chlog_barrier_disable(this, &resumable);
+    }
+    UNLOCK(&priv->lock);
+
+    /* resume them only after the lock is dropped */
+    chlog_barrier_dequeue_all(this, &resumable);
+}
+
+/* Turn the barrier off: cancel its timer and hand queued stubs to @queue */
+void
+__chlog_barrier_disable(xlator_t *this, struct list_head *queue)
+{
+    changelog_priv_t *priv = this->private;
+    GF_ASSERT(priv);
+
+    if (priv->timer) { /* a timeout timer is still registered */
+        gf_timer_call_cancel(this->ctx, priv->timer);
+        priv->timer = NULL;
+    }
+
+    list_splice_init(&priv->queue, queue); /* caller resumes these stubs */
+    priv->queue_size = 0;
+    priv->barrier_enabled = _gf_false;
+}
+
+/* Enable the changelog barrier and arm its timeout timer.
+ * Returns 0 on success, -1 when the timer could not be registered (the
+ * barrier flag is then left untouched). */
+int
+__chlog_barrier_enable(xlator_t *this, changelog_priv_t *priv)
+{
+    priv->timer = gf_timer_call_after(this->ctx, priv->timeout,
+                                      chlog_barrier_timeout, (void *)this);
+    if (priv->timer == NULL) {
+        gf_smsg(this->name, GF_LOG_CRITICAL, 0,
+                CHANGELOG_MSG_TIMEOUT_ADD_FAILED, NULL);
+        return -1;
+    }
+
+    /* the timer is armed, so the barrier can be switched on */
+    priv->barrier_enabled = _gf_true;
+
+    return 0;
+}
diff --git a/xlators/features/changelog/src/changelog-encoders.c b/xlators/features/changelog/src/changelog-encoders.c
new file mode 100644
index 00000000000..63754516c2e
--- /dev/null
+++ b/xlators/features/changelog/src/changelog-encoders.c
@@ -0,0 +1,232 @@
+/*
+   Copyright (c) 2013 Red Hat, Inc. <http://www.redhat.com>
+   This file is part of GlusterFS.
+
+   This file is licensed to you under your choice of the GNU Lesser
+   General Public License, version 3 or any later version (LGPLv3 or
+   later), or the GNU General Public License, version 2 (GPLv2), in all
+   cases as published by the Free Software Foundation.
+*/
+
+#include "changelog-encoders.h"
+
+/* Write "<gfid>/<basename>" for the entry record @data into @buffer (gfid
+ * as text if @encode, else raw uuid_t); returns the resulting bufsz offset. */
+size_t
+entry_fn(void *data, char *buffer, gf_boolean_t encode)
+{
+    char *tmpbuf = NULL;
+    size_t bufsz = 0;
+    struct changelog_entry_fields *ce = data;
+
+    if (encode) {
+        tmpbuf = uuid_utoa(ce->cef_uuid);
+        CHANGELOG_FILL_BUFFER(buffer, bufsz, tmpbuf, strlen(tmpbuf));
+    } else {
+        CHANGELOG_FILL_BUFFER(buffer, bufsz, ce->cef_uuid, sizeof(uuid_t));
+    }
+
+    CHANGELOG_FILL_BUFFER(buffer, bufsz, "/", 1);
+    CHANGELOG_FILL_BUFFER(buffer, bufsz, ce->cef_bname, strlen(ce->cef_bname));
+    return bufsz;
+}
+
+/* Write "<gfid>/<basename>", a "\0" and the entry's path (a lone "\0" if the
+ * path is empty) into @buffer; returns the resulting bufsz offset. */
+size_t
+del_entry_fn(void *data, char *buffer, gf_boolean_t encode)
+{
+    char *tmpbuf = NULL;
+    size_t bufsz = 0;
+    struct changelog_entry_fields *ce = data;
+
+    if (encode) {
+        tmpbuf = uuid_utoa(ce->cef_uuid);
+        CHANGELOG_FILL_BUFFER(buffer, bufsz, tmpbuf, strlen(tmpbuf));
+    } else {
+        CHANGELOG_FILL_BUFFER(buffer, bufsz, ce->cef_uuid, sizeof(uuid_t));
+    }
+
+    CHANGELOG_FILL_BUFFER(buffer, bufsz, "/", 1);
+    CHANGELOG_FILL_BUFFER(buffer, bufsz, ce->cef_bname, strlen(ce->cef_bname));
+    CHANGELOG_FILL_BUFFER(buffer, bufsz, "\0", 1);
+
+    if (ce->cef_path[0] != '\0') {
+        CHANGELOG_FILL_BUFFER(buffer, bufsz, ce->cef_path,
+                              strlen(ce->cef_path));
+    } else {
+        CHANGELOG_FILL_BUFFER(buffer, bufsz, "\0", 1);
+    }
+
+    return bufsz;
+}
+
+/* Emit the fop number behind @data into @buffer: as decimal text when
+ * @encode is set, otherwise as the raw glusterfs_fop_t value.
+ * Returns bufsz as advanced by CHANGELOG_FILL_BUFFER. */
+size_t
+fop_fn(void *data, char *buffer, gf_boolean_t encode)
+{
+    char buf[10] = {0};
+    size_t bufsz = 0;
+    glusterfs_fop_t fop = *(glusterfs_fop_t *)data;
+
+    /* textual form when encoding, raw value otherwise */
+    if (encode) {
+        (void)snprintf(buf, sizeof(buf), "%d", fop);
+        CHANGELOG_FILL_BUFFER(buffer, bufsz, buf, strlen(buf));
+    } else
+        CHANGELOG_FILL_BUFFER(buffer, bufsz, &fop, sizeof(fop));
+
+    return bufsz;
+}
+
+/* Emit the unsigned int behind @data into @buffer: as decimal text when
+ * @encode is set, otherwise as the raw unsigned int value.
+ * Returns bufsz as advanced by CHANGELOG_FILL_BUFFER. */
+size_t
+number_fn(void *data, char *buffer, gf_boolean_t encode)
+{
+    size_t bufsz = 0;
+    unsigned int nr = *(unsigned int *)data;
+    char buf[20] = {0};
+
+    /* textual form when encoding, raw value otherwise */
+    if (encode) {
+        (void)snprintf(buf, sizeof(buf), "%u", nr);
+        CHANGELOG_FILL_BUFFER(buffer, bufsz, buf, strlen(buf));
+    } else
+        CHANGELOG_FILL_BUFFER(buffer, bufsz, &nr, sizeof(unsigned int));
+
+    return bufsz;
+}
+
+/* Free the basename held by the entry record in @data; NULL is a no-op */
+void
+entry_free_fn(void *data)
+{
+    changelog_opt_t *opt = data;
+
+    if (opt != NULL) {
+        GF_FREE(opt->co_entry.cef_bname);
+    }
+}
+
+/* Free basename and path of the entry record in @data; NULL is a no-op */
+void
+del_entry_free_fn(void *data)
+{
+    changelog_opt_t *opt = data;
+
+    if (opt != NULL) {
+        GF_FREE(opt->co_entry.cef_bname);
+        GF_FREE(opt->co_entry.cef_path);
+    }
+}
+
+/**
+ * Append each of @cld's cld_xtra_records optional records to @buffer at
+ * *@off, every record preceded by a one-byte "\0" separator, then store the
+ * advanced offset back into *@off.
+ */
+static void
+changelog_encode_write_xtra(changelog_log_data_t *cld, char *buffer,
+                            size_t *off, gf_boolean_t encode)
+{
+    int i = 0;
+    size_t offset = *off;
+    void *data = NULL;
+    changelog_opt_t *co = (changelog_opt_t *)cld->cld_ptr;
+
+    for (; i < cld->cld_xtra_records; i++, co++) {
+        CHANGELOG_FILL_BUFFER(buffer, offset, "\0", 1);
+
+        /* pick the member that matches the record type */
+        /* NOTE(review): there is no default case, so an unexpected co_type
+         * leaves data at whatever the previous iteration set - confirm. */
+        switch (co->co_type) {
+            case CHANGELOG_OPT_REC_FOP:
+                data = &co->co_fop;
+                break;
+            case CHANGELOG_OPT_REC_ENTRY:
+                data = &co->co_entry;
+                break;
+            case CHANGELOG_OPT_REC_UINT32:
+                data = &co->co_uint32;
+                break;
+        }
+
+        if (co->co_convert)
+            offset += co->co_convert(data, buffer + offset, encode);
+        else /* no conversion: write it out as it is */
+            CHANGELOG_FILL_BUFFER(buffer, offset, data, co->co_len);
+    }
+
+    *off = offset;
+}
+
+/* Encode @cld as an ASCII record (type marker, textual gfid, optional extra
+ * records, trailing NUL) on the stack and pass it to changelog_write_change. */
+int
+changelog_encode_ascii(xlator_t *this, changelog_log_data_t *cld)
+{
+    size_t off = 0;
+    size_t gfid_len = 0;
+    char *gfid_str = NULL;
+    char *buffer = NULL;
+    changelog_priv_t *priv = this->private;
+
+    gfid_str = uuid_utoa(cld->cld_gfid);
+    gfid_len = strlen(gfid_str);
+
+    /* extra bytes for decorations */
+    buffer = alloca(gfid_len + cld->cld_ptr_len + 10);
+    CHANGELOG_STORE_ASCII(priv, buffer, off, gfid_str, gfid_len, cld);
+
+    if (cld->cld_xtra_records)
+        changelog_encode_write_xtra(cld, buffer, &off, _gf_true);
+
+    CHANGELOG_FILL_BUFFER(buffer, off, "\0", 1);
+
+    return changelog_write_change(priv, buffer, off);
+}
+
+/* Encode @cld as a binary record (type marker, raw uuid_t gfid, optional extra
+ * records, trailing NUL) on the stack and pass it to changelog_write_change. */
+int
+changelog_encode_binary(xlator_t *this, changelog_log_data_t *cld)
+{
+    size_t off = 0;
+    char *buffer = NULL;
+    changelog_priv_t *priv = this->private;
+
+    /* extra bytes for decorations */
+    buffer = alloca(sizeof(uuid_t) + cld->cld_ptr_len + 10);
+    CHANGELOG_STORE_BINARY(priv, buffer, off, cld->cld_gfid, cld);
+
+    if (cld->cld_xtra_records)
+        changelog_encode_write_xtra(cld, buffer, &off, _gf_false);
+
+    CHANGELOG_FILL_BUFFER(buffer, off, "\0", 1);
+
+    return changelog_write_change(priv, buffer, off);
+}
+
+static struct changelog_encoder cb_encoder[] = { /* indexed by encode mode */
+    [CHANGELOG_ENCODE_BINARY] =
+        {
+            .encoder = CHANGELOG_ENCODE_BINARY,
+            .encode = changelog_encode_binary,
+        },
+    [CHANGELOG_ENCODE_ASCII] =
+        {
+            .encoder = CHANGELOG_ENCODE_ASCII,
+            .encode = changelog_encode_ascii,
+        },
+};
+
+void
+changelog_encode_change(changelog_priv_t *priv)
+{ /* point priv->ce at the encoder for priv->encode_mode (not range-checked) */
+    priv->ce = &cb_encoder[priv->encode_mode];
+}
diff --git a/xlators/features/changelog/src/changelog-encoders.h b/xlators/features/changelog/src/changelog-encoders.h
new file mode 100644
index 00000000000..26252696d56
--- /dev/null
+++ b/xlators/features/changelog/src/changelog-encoders.h
@@ -0,0 +1,50 @@
+/*
+   Copyright (c) 2013 Red Hat, Inc. <http://www.redhat.com>
+   This file is part of GlusterFS.
+
+   This file is licensed to you under your choice of the GNU Lesser
+   General Public License, version 3 or any later version (LGPLv3 or
+   later), or the GNU General Public License, version 2 (GPLv2), in all
+   cases as published by the Free Software Foundation.
+*/
+
+#ifndef _CHANGELOG_ENCODERS_H
+#define _CHANGELOG_ENCODERS_H
+
+#include <glusterfs/xlator.h>
+#include <glusterfs/defaults.h>
+
+#include "changelog-helpers.h"
+
+#define CHANGELOG_STORE_ASCII(priv, buf, off, gfid, gfid_len, cld)             \
+    do { /* one-byte type marker, then the textual gfid */                     \
+        CHANGELOG_FILL_BUFFER(buf, off, priv->maps[cld->cld_type], 1);         \
+        CHANGELOG_FILL_BUFFER(buf, off, gfid, gfid_len);                       \
+    } while (0)
+
+#define CHANGELOG_STORE_BINARY(priv, buf, off, gfid, cld)                      \
+    do { /* one-byte type marker, then the raw uuid_t gfid */                  \
+        CHANGELOG_FILL_BUFFER(buf, off, priv->maps[cld->cld_type], 1);         \
+        CHANGELOG_FILL_BUFFER(buf, off, gfid, sizeof(uuid_t));                 \
+    } while (0)
+
+size_t
+entry_fn(void *data, char *buffer, gf_boolean_t encode);
+size_t
+del_entry_fn(void *data, char *buffer, gf_boolean_t encode);
+size_t
+fop_fn(void *data, char *buffer, gf_boolean_t encode);
+size_t
+number_fn(void *data, char *buffer, gf_boolean_t encode);
+void
+entry_free_fn(void *data);
+void
+del_entry_free_fn(void *data);
+int
+changelog_encode_binary(xlator_t *, changelog_log_data_t *);
+int
+changelog_encode_ascii(xlator_t *, changelog_log_data_t *);
+void
+changelog_encode_change(changelog_priv_t *);
+
+#endif /* _CHANGELOG_ENCODERS_H */
diff --git a/xlators/features/changelog/src/changelog-ev-handle.c b/xlators/features/changelog/src/changelog-ev-handle.c
new file mode 100644
index 00000000000..aa94459de5a
--- /dev/null
+++ b/xlators/features/changelog/src/changelog-ev-handle.c
@@ -0,0 +1,412 @@
+/*
+   Copyright (c) 2015 Red Hat, Inc. <http://www.redhat.com>
+   This file is part of GlusterFS.
+
+   This file is licensed to you under your choice of the GNU Lesser
+   General Public License, version 3 or any later version (LGPLv3 or
+   later), or the GNU General Public License, version 2 (GPLv2), in all
+   cases as published by the Free Software Foundation.
+*/
+
+#include "changelog-ev-handle.h"
+#include "changelog-rpc-common.h"
+#include "changelog-helpers.h"
+
+struct rpc_clnt_program changelog_ev_program;
+
+#define NR_IOVEC (MAX_IOVEC - 3)
+struct ev_rpc_vec {
+    int count; /* number of entries of vector[] to send */
+    struct iovec vector[NR_IOVEC];
+
+    /* sequence number */
+    unsigned long seq;
+};
+
+struct ev_rpc {
+    rbuf_list_t *rlist; /* buffer list whose iovecs get dispatched */
+    struct rpc_clnt *rpc; /* client the batches are submitted on */
+    struct ev_rpc_vec vec; /* batch buffer reused for every dispatch */
+};
+
+/**
+ * Reply callback for dispatched events. As of now it ignores the reply and
+ * just returns 0. Going further, unacknowledged sequence numbers could be
+ * retransmitted and other intelligence can be built into the server.
+ */
+int
+changelog_event_dispatch_cbk(struct rpc_req *req, struct iovec *iov, int count,
+                             void *myframe)
+{
+    return 0;
+}
+
+/* dispatcher RPC
+ * Stamps a request header with @vec's sequence number and the current
+ * wall-clock time, then submits it together with the vec->count iovecs of
+ * @vec as a CHANGELOG_REV_PROC_EVENT call on @rpc.
+ * Returns the result of changelog_rpc_sumbit_req(). */
+int
+changelog_dispatch_vec(call_frame_t *frame, xlator_t *this,
+                       struct rpc_clnt *rpc, struct ev_rpc_vec *vec)
+{
+    struct timeval now = {0};
+    changelog_event_req req = {0};
+
+    (void)gettimeofday(&now, NULL);
+
+    /**
+     * Event dispatch RPC header contains a sequence number for each
+     * dispatch. This allows the receiver to order the request before
+     * processing.
+     */
+    req.seq = vec->seq;
+    req.tv_sec = now.tv_sec;
+    req.tv_usec = now.tv_usec;
+
+    return changelog_rpc_sumbit_req(
+        rpc, (void *)&req, frame, &changelog_ev_program,
+        CHANGELOG_REV_PROC_EVENT, vec->vector, vec->count, NULL, this,
+        changelog_event_dispatch_cbk, (xdrproc_t)xdr_changelog_event_req);
+}
+
+/* Dispatch every iovec of an rbuf list to one RPC client.
+ * @data is a struct ev_rpc: its rlist supplies the iovecs, its rpc is the
+ * client and its vec is reused as the batch buffer. At most NR_IOVEC iovecs
+ * go into one RPC; sequence numbers start at rlist->seq[0] and advance by
+ * one per full batch.
+ *
+ * Returns 0, or the first non-zero changelog_dispatch_vec() result.
+ */
+int
+changelog_event_dispatch_rpc(call_frame_t *frame, xlator_t *this, void *data)
+{
+    int count = 0;
+    int ret = 0;
+    unsigned long sequence = 0;
+    rbuf_iovec_t *rvec = NULL;
+    struct ev_rpc *erpc = data;
+    struct rlist_iter riter = {{0}};
+
+    /* dispatch NR_IOVEC IO vectors at a time. */
+    sequence = erpc->rlist->seq[0];
+
+    rlist_iter_init(&riter, erpc->rlist);
+
+    rvec_for_each_entry(rvec, &riter)
+    {
+        /* count stays below NR_IOVEC, so it doubles as the next free slot */
+        erpc->vec.vector[count++] = rvec->iov;
+        if (count < NR_IOVEC)
+            continue;
+
+        /* the batch is full: send it and start a new one */
+        erpc->vec.seq = sequence++;
+        erpc->vec.count = NR_IOVEC;
+
+        ret = changelog_dispatch_vec(frame, this, erpc->rpc, &erpc->vec);
+        if (ret)
+            break;
+        count = 0;
+    }
+
+    if (ret)
+        goto error_return;
+
+    /* flush the partially filled last batch, if any */
+    if (count) {
+        erpc->vec.seq = sequence;
+        erpc->vec.count = count;
+
+        ret = changelog_dispatch_vec(frame, this, erpc->rpc, &erpc->vec);
+    }
+
+error_return:
+    return ret;
+}
+
+int
+changelog_rpc_notify(struct rpc_clnt *rpc, void *mydata, rpc_clnt_event_t event,
+                     void *data)
+{
+    xlator_t *this = NULL;
+    changelog_rpc_clnt_t *crpc = NULL;
+    changelog_clnt_t *c_clnt = NULL;
+    changelog_priv_t *priv = NULL;
+    changelog_ev_selector_t *selection = NULL;
+    uint64_t clntcnt = 0;
+    uint64_t xprtcnt = 0;
+    /* @mydata carries the per-client changelog_rpc_clnt_t; always returns 0 */
+    crpc = mydata;
+    this = crpc->this;
+    c_clnt = crpc->c_clnt;
+
+    priv = this->private;
+
+    switch (event) {
+        case RPC_CLNT_CONNECT: /* count the client, make it active */
+            selection = &priv->ev_selection;
+            GF_ATOMIC_INC(priv->clntcnt);
+
+            LOCK(&c_clnt->wait_lock); /* taken before active_lock */
+            {
+                LOCK(&c_clnt->active_lock);
+                {
+                    changelog_select_event(this, selection, crpc->filter);
+                    list_move_tail(&crpc->list, &c_clnt->active);
+                }
+                UNLOCK(&c_clnt->active_lock);
+            }
+            UNLOCK(&c_clnt->wait_lock);
+
+            break;
+        case RPC_CLNT_DISCONNECT:
+            rpc_clnt_disable(crpc->rpc);
+
+            /* rpc_clnt_disable doesn't unref the rpc. It just marks
+             * the rpc as disabled and cancels reconnection timer.
+             * Hence unref the rpc object to free it.
+             */
+            rpc_clnt_unref(crpc->rpc);
+
+            if (priv) /* NOTE(review): other cases use priv unchecked */
+                selection = &priv->ev_selection;
+
+            LOCK(&crpc->lock);
+            {
+                if (selection)
+                    changelog_deselect_event(this, selection, crpc->filter);
+                changelog_set_disconnect_flag(crpc, _gf_true);
+            }
+            UNLOCK(&crpc->lock);
+            LOCK(&c_clnt->active_lock);
+            {
+                list_del_init(&crpc->list); /* unlink from its current list */
+            }
+            UNLOCK(&c_clnt->active_lock);
+
+            break;
+        case RPC_CLNT_MSG: /* NOTE(review): shares the DESTROY path - confirm */
+        case RPC_CLNT_DESTROY:
+            /* Free up mydata */
+            changelog_rpc_clnt_unref(crpc);
+            clntcnt = GF_ATOMIC_DEC(priv->clntcnt);
+            xprtcnt = GF_ATOMIC_GET(priv->xprtcnt);
+            if (this->cleanup_starting) {
+                if (!clntcnt && !xprtcnt) /* last client and transport gone */
+                    changelog_process_cleanup_event(this);
+            }
+            break;
+        case RPC_CLNT_PING:
+            break;
+    }
+
+    return 0;
+}
+
+void *
+changelog_ev_connector(void *data)
+{
+    xlator_t *this = NULL;
+    changelog_clnt_t *c_clnt = NULL;
+    changelog_rpc_clnt_t *crpc = NULL;
+    /* @data is the changelog_clnt_t whose pending list is served forever */
+    c_clnt = data;
+    this = c_clnt->this;
+
+    while (1) {
+        pthread_mutex_lock(&c_clnt->pending_lock);
+        {
+            while (list_empty(&c_clnt->pending)) /* wait for work */
+                pthread_cond_wait(&c_clnt->pending_cond, &c_clnt->pending_lock);
+            crpc = list_first_entry(&c_clnt->pending, changelog_rpc_clnt_t,
+                                    list);
+            crpc->rpc = changelog_rpc_client_init(this, crpc, crpc->sock,
+                                                  changelog_rpc_notify);
+            if (!crpc->rpc) {
+                gf_smsg(this->name, GF_LOG_ERROR, 0,
+                        CHANGELOG_MSG_RPC_CONNECT_ERROR, "path=%s", crpc->sock,
+                        NULL);
+                crpc->cleanup(crpc); /* NOTE(review): must unlink crpc - confirm */
+                goto mutex_unlock;
+            }
+
+            LOCK(&c_clnt->wait_lock);
+            {
+                list_move_tail(&crpc->list, &c_clnt->waitq); /* pending -> waitq */
+            }
+            UNLOCK(&c_clnt->wait_lock);
+        }
+    mutex_unlock:
+        pthread_mutex_unlock(&c_clnt->pending_lock);
+    }
+
+    return NULL;
+}
+
+void
+changelog_ev_cleanup_connections(xlator_t *this, changelog_clnt_t *c_clnt)
+{
+    changelog_rpc_clnt_t *crpc = NULL;
+
+    /* rpc_clnt_disable() each ->active client; entries stay on the list */
+    LOCK(&c_clnt->active_lock);
+    {
+        list_for_each_entry(crpc, &c_clnt->active, list)
+        {
+            rpc_clnt_disable(crpc->rpc);
+        }
+    }
+    UNLOCK(&c_clnt->active_lock);
+}
+
+/**
+ * TODO: granularize lock
+ *
+ * If we have multiple threads dispatching events, doing it this way is
+ * a performance bottleneck.
+ */
+
+static changelog_rpc_clnt_t *
+get_client(changelog_clnt_t *c_clnt, struct list_head **next)
+{
+    changelog_rpc_clnt_t *crpc = NULL;
+    /* cursor step over ->active: NULL at the list head, else a ref'd client */
+    LOCK(&c_clnt->active_lock);
+    {
+        if (*next == &c_clnt->active)
+            goto unblock;
+        crpc = list_entry(*next, changelog_rpc_clnt_t, list);
+        /* ref rpc as DISCONNECT might unref the rpc asynchronously */
+        changelog_rpc_clnt_ref(crpc);
+        rpc_clnt_ref(crpc->rpc);
+        *next = (*next)->next; /* advance the caller's cursor under the lock */
+    }
+unblock:
+    UNLOCK(&c_clnt->active_lock);
+
+    return crpc;
+}
+
+static void
+put_client(changelog_clnt_t *c_clnt, changelog_rpc_clnt_t *crpc)
+{
+    LOCK(&c_clnt->active_lock);
+    {
+        rpc_clnt_unref(crpc->rpc); /* undoes rpc_clnt_ref() of get_client() */
+        changelog_rpc_clnt_unref(crpc); /* may list_del + cleanup the client */
+    }
+    UNLOCK(&c_clnt->active_lock);
+}
+
+void
+_dispatcher(rbuf_list_t *rlist, void *arg)
+{
+    xlator_t *this = NULL;
+    changelog_clnt_t *c_clnt = NULL;
+    changelog_rpc_clnt_t *crpc = NULL;
+    struct ev_rpc erpc = {
+        0,
+    };
+    struct list_head *next = NULL;
+    /* send @rlist as a CHANGELOG_REV_PROC_EVENT call to every ->active client */
+    c_clnt = arg;
+    this = c_clnt->this;
+
+    erpc.rlist = rlist;
+    next = c_clnt->active.next; /* NOTE(review): read without active_lock */
+
+    while (1) {
+        crpc = get_client(c_clnt, &next);
+        if (!crpc)
+            break;
+        erpc.rpc = crpc->rpc;
+        (void)changelog_invoke_rpc(this, crpc->rpc, &changelog_ev_program,
+                                   CHANGELOG_REV_PROC_EVENT, &erpc);
+        put_client(c_clnt, crpc);
+    }
+}
+
+/** this is called under rotbuff's lock */
+void
+sequencer(rbuf_list_t *rlist, void *mydata)
+{
+    changelog_clnt_t *clnt = mydata;
+    unsigned long nseq = 0;
+
+    /* one sequence number per NR_IOVEC entries, a partial group rounds up */
+    nseq = (RLIST_ENTRY_COUNT(rlist)) / NR_IOVEC;
+    if ((RLIST_ENTRY_COUNT(rlist)) % NR_IOVEC)
+        nseq += 1;
+
+    /* hand the range to RLIST_STORE_SEQ, then move the counter past it */
+    RLIST_STORE_SEQ(rlist, clnt->sequence, nseq);
+    clnt->sequence += nseq;
+}
+
+void *
+changelog_ev_dispatch(void *data)
+{
+    int ret = 0;
+    void *opaque = NULL;
+    xlator_t *this = NULL;
+    changelog_clnt_t *c_clnt = NULL;
+    struct timeval tv = {
+        0,
+    };
+    /* thread loop: once a second, pull a buffer from ->rbuf and dispatch it */
+    c_clnt = data;
+    this = c_clnt->this;
+
+    while (1) {
+        /* TODO: change this to be pthread cond based.. later */
+        /* select() with no fds is used purely as a one second sleep */
+        tv.tv_sec = 1;
+        tv.tv_usec = 0;
+        select(0, NULL, NULL, NULL, &tv);
+
+        ret = rbuf_get_buffer(c_clnt->rbuf, &opaque, sequencer, c_clnt);
+        if (ret != RBUF_CONSUMABLE) {
+            if (ret != RBUF_EMPTY)
+                gf_smsg(this->name, GF_LOG_WARNING, 0,
+                        CHANGELOG_MSG_BUFFER_STARVATION_ERROR,
+                        "Failed to get buffer for RPC dispatch",
+                        "rbuf_retval=%d", ret, NULL);
+            continue;
+        }
+        /* _dispatcher is passed as the callback that consumes the buffer */
+        ret = rbuf_wait_for_completion(c_clnt->rbuf, opaque, _dispatcher,
+                                       c_clnt);
+        if (ret)
+            gf_smsg(this->name, GF_LOG_WARNING, 0,
+                    CHANGELOG_MSG_PUT_BUFFER_FAILED, NULL);
+    }
+
+    return NULL;
+}
+
+void
+changelog_ev_queue_connection(changelog_clnt_t *c_clnt,
+                              changelog_rpc_clnt_t *crpc)
+{
+    pthread_mutex_lock(&c_clnt->pending_lock);
+    {
+        list_add_tail(&crpc->list, &c_clnt->pending); /* FIFO: tail insert */
+        pthread_cond_signal(&c_clnt->pending_cond); /* wakes the connector */
+    }
+    pthread_mutex_unlock(&c_clnt->pending_lock);
+}
+
+struct rpc_clnt_procedure changelog_ev_procs[CHANGELOG_REV_PROC_MAX] = {
+    [CHANGELOG_REV_PROC_NULL] = {"NULL", NULL}, /* no handler function */
+    [CHANGELOG_REV_PROC_EVENT] = {"EVENT DISPATCH", /* used by _dispatcher() */
+                                  changelog_event_dispatch_rpc},
+};
+
+struct rpc_clnt_program changelog_ev_program = {
+    .progname = "CHANGELOG EVENT DISPATCHER",
+    .prognum = CHANGELOG_REV_RPC_PROCNUM,
+    .progver = CHANGELOG_REV_RPC_PROCVER,
+    .numproc = CHANGELOG_REV_PROC_MAX, /* size of changelog_ev_procs[] */
+    .proctable = changelog_ev_procs,
+};
diff --git a/xlators/features/changelog/src/changelog-ev-handle.h b/xlators/features/changelog/src/changelog-ev-handle.h
new file mode 100644
index 00000000000..cc1af58a276
--- /dev/null
+++ b/xlators/features/changelog/src/changelog-ev-handle.h
@@ -0,0 +1,136 @@
+/*
+   Copyright (c) 2015 Red Hat, Inc. <http://www.redhat.com>
+   This file is part of GlusterFS.
+
+   This file is licensed to you under your choice of the GNU Lesser
+   General Public License, version 3 or any later version (LGPLv3 or
+   later), or the GNU General Public License, version 2 (GPLv2), in all
+   cases as published by the Free Software Foundation.
+*/
+
+#ifndef __CHANGELOG_EV_HANDLE_H
+#define __CHANGELOG_EV_HANDLE_H
+
+#include <glusterfs/list.h>
+#include <glusterfs/xlator.h>
+#include "rpc-clnt.h"
+
+#include <glusterfs/rot-buffs.h>
+
+struct changelog_clnt;
+
+typedef struct changelog_rpc_clnt {
+    xlator_t *this;
+
+    gf_lock_t lock; /* taken around the DISCONNECT bookkeeping */
+
+    gf_atomic_t ref; /* see changelog_rpc_clnt_ref()/_unref() */
+    gf_boolean_t disconnected; /* set via changelog_set_disconnect_flag() */
+
+    unsigned int filter; /* event selector passed to (de)select_event */
+    char sock[UNIX_PATH_MAX]; /* path given to changelog_rpc_client_init() */
+
+    struct changelog_clnt *c_clnt; /* back pointer to list holder */
+
+    struct rpc_clnt *rpc; /* RPC client endpoint */
+
+    struct list_head list; /* ->pending, ->waitq, ->active */
+
+    void (*cleanup)(struct changelog_rpc_clnt *); /* cleanup handler */
+} changelog_rpc_clnt_t;
+
+static inline void
+changelog_rpc_clnt_ref(changelog_rpc_clnt_t *crpc)
+{
+    GF_ATOMIC_INC(crpc->ref); /* dropped by changelog_rpc_clnt_unref() */
+}
+
+static inline void
+changelog_set_disconnect_flag(changelog_rpc_clnt_t *crpc, gf_boolean_t flag)
+{
+    crpc->disconnected = flag; /* plain store: no locking is done in here */
+}
+
+static inline int
+changelog_rpc_clnt_is_disconnected(changelog_rpc_clnt_t *crpc)
+{
+    return (crpc->disconnected == _gf_true); /* 1 if flagged, else 0 */
+}
+
+/* drop one reference obtained through changelog_rpc_clnt_ref() */
+static inline void
+changelog_rpc_clnt_unref(changelog_rpc_clnt_t *crpc)
+{
+    uint64_t remaining = 0;
+
+    remaining = GF_ATOMIC_DEC(crpc->ref);
+
+    /* only the last ref of an already disconnected client tears it down */
+    if (remaining || !changelog_rpc_clnt_is_disconnected(crpc))
+        return;
+
+    /* unlink from whichever list holds it before the handler runs */
+    list_del(&crpc->list);
+    crpc->cleanup(crpc);
+}
+
+/**
+ * This structure holds pending and active clients. On a probe RPC,
+ * an instance of the above structure (@changelog_rpc_clnt) is placed
+ * in ->pending and gets moved to ->active on a successful connect.
+ *
+ * locking rules:
+ *
+ * Manipulating ->pending
+ * ->pending_lock
+ *    ->pending
+ *
+ * Manipulating ->active
+ * ->active_lock
+ *    ->active
+ *
+ * Moving object from ->pending to ->active
+ * ->pending_lock
+ *   ->active_lock
+ *
+ * Objects are _never_ moved from ->active to ->pending, i.e., during
+ * disconnection, the object is destroyed. Well, we could have tried
+ * to reconnect, but that's pure waste.. let the other end reconnect.
+ */
+
+typedef struct changelog_clnt {
+    xlator_t *this;
+
+    /* pending connections, queued by changelog_ev_queue_connection() */
+    pthread_mutex_t pending_lock;
+    pthread_cond_t pending_cond;
+    struct list_head pending;
+
+    /* current active connections */
+    gf_lock_t active_lock;
+    struct list_head active;
+    /* clients with an RPC endpoint, parked between ->pending and ->active */
+    gf_lock_t wait_lock;
+    struct list_head waitq;
+
+    /* consumer part of rot-buffs */
+    rbuf_t *rbuf;
+    unsigned long sequence; /* advanced by sequencer() */
+} changelog_clnt_t;
+
+void *
+changelog_ev_connector(void *);
+
+void *
+changelog_ev_dispatch(void *);
+
+/* APIs */
+void
+changelog_ev_queue_connection(changelog_clnt_t *, changelog_rpc_clnt_t *);
+
+void
+changelog_ev_cleanup_connections(xlator_t *, changelog_clnt_t *);
+
+void
+changelog_process_cleanup_event(xlator_t *);
+#endif
diff --git a/xlators/features/changelog/src/changelog-helpers.c b/xlators/features/changelog/src/changelog-helpers.c
new file mode 100644
index 00000000000..e561997d858
--- /dev/null
+++ b/xlators/features/changelog/src/changelog-helpers.c
@@ -0,0 +1,1977 @@
+/*
+   Copyright (c) 2013 Red Hat, Inc. <http://www.redhat.com>
+   This file is part of GlusterFS.
+
+   This file is licensed to you under your choice of the GNU Lesser
+   General Public License, version 3 or any later version (LGPLv3 or
+   later), or the GNU General Public License, version 2 (GPLv2), in all
+   cases as published by the Free Software Foundation.
+*/
+
+#include <glusterfs/xlator.h>
+#include <glusterfs/defaults.h>
+#include <glusterfs/logging.h>
+#include <glusterfs/iobuf.h>
+#include <glusterfs/syscall.h>
+
+#include "changelog-helpers.h"
+#include "changelog-encoders.h"
+#include "changelog-mem-types.h"
+#include "changelog-messages.h"
+
+#include "changelog-encoders.h"
+#include "changelog-rpc-common.h"
+#include <pthread.h>
+#include <time.h>
+
+static void
+changelog_cleanup_free_mutex(void *arg_mutex)
+{
+    /* unlock the mutex behind @arg_mutex; a NULL argument is a no-op */
+    if (!arg_mutex)
+        return;
+    pthread_mutex_unlock((pthread_mutex_t *)arg_mutex);
+}
+
+int
+changelog_thread_cleanup(xlator_t *this, pthread_t thr_id)
+{
+    int ret = 0;
+    void *retval = NULL;
+    /* cancel @thr_id and join it; returns the pthread_cancel/join result */
+    /* send a cancel request to the thread */
+    ret = pthread_cancel(thr_id);
+    if (ret != 0) {
+        gf_smsg(this->name, GF_LOG_ERROR, ret,
+                CHANGELOG_MSG_PTHREAD_CANCEL_FAILED, NULL);
+        goto out;
+    }
+    /* pthread_* report failure through the return value, not errno */
+    ret = pthread_join(thr_id, &retval);
+    if ((ret != 0) || (retval != PTHREAD_CANCELED)) {
+        gf_smsg(this->name, GF_LOG_ERROR, ret,
+                CHANGELOG_MSG_PTHREAD_CANCEL_FAILED, NULL);
+    }
+
+out:
+    return ret;
+}
+
+void *
+changelog_get_usable_buffer(changelog_local_t *local)
+{
+    /*
+     * The usable buffer is the ->ptr of the iobuf attached to @local's
+     * log data (local->cld.cld_iobuf).
+     *
+     * Returns NULL when @local is NULL or no iobuf is attached.
+     */
+    if (!local || !local->cld.cld_iobuf)
+        return NULL;
+
+    return local->cld.cld_iobuf->ptr;
+}
+
+static int
+changelog_selector_index(unsigned int selector)
+{
+    return (ffs(selector) - 1); /* 0-based lowest set bit; -1 if none set */
+}
+
+int
+changelog_ev_selected(xlator_t *this, changelog_ev_selector_t *selection,
+                      unsigned int selector)
+{
+    int idx = changelog_selector_index(selector);
+    /* idx is -1 for selector 0: range-check before touching ->ref[] */
+    if (idx < 0 || idx >= CHANGELOG_EV_SELECTION_RANGE)
+        return 0;
+    gf_msg_debug(this->name, 0, "selector ref count for %d (idx: %d): %d",
+                 selector, idx, selection->ref[idx]);
+    return (selection->ref[idx] > 0); /* this can be lockless */
+}
+
+void
+changelog_select_event(xlator_t *this, changelog_ev_selector_t *selection,
+                       unsigned int selector)
+{
+    int idx = 0;
+    /* take one reference on every in-range event bit set in @selector */
+    LOCK(&selection->reflock);
+    {
+        while (selector) {
+            idx = changelog_selector_index(selector);
+            if (idx < CHANGELOG_EV_SELECTION_RANGE) {
+                selection->ref[idx]++;
+                gf_msg_debug(this->name, 0, "selecting event %d", idx);
+            }
+            selector &= ~(1U << idx); /* 1 << 31 on int is undefined */
+        }
+    }
+    UNLOCK(&selection->reflock);
+}
+
+void
+changelog_deselect_event(xlator_t *this, changelog_ev_selector_t *selection,
+                         unsigned int selector)
+{
+    int idx = 0;
+    /* drop one reference on every in-range event bit set in @selector */
+    LOCK(&selection->reflock);
+    {
+        while (selector) {
+            idx = changelog_selector_index(selector);
+            if (idx < CHANGELOG_EV_SELECTION_RANGE) {
+                selection->ref[idx]--;
+                gf_msg_debug(this->name, 0, "de-selecting event %d", idx);
+            }
+            selector &= ~(1U << idx); /* 1 << 31 on int is undefined */
+        }
+    }
+    UNLOCK(&selection->reflock);
+}
+
+int
+changelog_init_event_selection(xlator_t *this,
+                               changelog_ev_selector_t *selection)
+{
+    int slot = 0;
+
+    /* the ref-count lock has to exist before any slot is touched */
+    if (LOCK_INIT(&selection->reflock) != 0)
+        return -1;
+
+    /* start with no event selected: zero every per-event ref count */
+    LOCK(&selection->reflock);
+    {
+        for (slot = 0; slot < CHANGELOG_EV_SELECTION_RANGE; slot++) {
+            selection->ref[slot] = 0;
+        }
+    }
+    UNLOCK(&selection->reflock);
+
+    return 0;
+}
+
+static void
+changelog_perform_dispatch(xlator_t *this, changelog_priv_t *priv, void *mem,
+                           size_t size)
+{
+    char *buf = NULL;
+    void *opaque = NULL;
+    /* copy @size bytes of @mem into a write area reserved from priv->rbuf */
+    buf = rbuf_reserve_write_area(priv->rbuf, size, &opaque);
+    if (!buf) {
+        gf_msg_callingfn(this->name, GF_LOG_WARNING, 0,
+                         CHANGELOG_MSG_DISPATCH_EVENT_FAILED,
+                         "failed to dispatch event");
+        return;
+    }
+    /* no area reserved means the event was dropped above with a warning */
+    memcpy(buf, mem, size);
+    rbuf_write_complete(opaque);
+}
+
+void
+changelog_dispatch_event(xlator_t *this, changelog_priv_t *priv,
+                         changelog_event_t *ev)
+{
+    /* events whose type is not currently selected are dropped here */
+    if (!changelog_ev_selected(this, &priv->ev_selection, ev->ev_type))
+        return;
+
+    /* always CHANGELOG_EV_SIZE bytes, whatever the event type */
+    changelog_perform_dispatch(this, priv, ev, CHANGELOG_EV_SIZE);
+}
+
+void
+changelog_set_usable_record_and_length(changelog_local_t *local, size_t len,
+                                       int xr)
+{
+    /*
+     * Store @len as cld_ptr_len and @xr as cld_xtra_records in the
+     * log data embedded in @local (which must not be NULL).
+     */
+    local->cld.cld_ptr_len = len;
+    local->cld.cld_xtra_records = xr;
+}
+
+void
+changelog_local_cleanup(xlator_t *xl, changelog_local_t *local)
+{
+    int i = 0;
+    changelog_opt_t *co = NULL;
+    changelog_log_data_t *cld = NULL;
+    /* release what @local holds, then mem_put() it; NULL @local is a no-op */
+    if (!local)
+        return;
+
+    cld = &local->cld;
+
+    /* cld_ptr is walked as cld_xtra_records changelog_opt_t entries */
+    if (cld->cld_xtra_records) {
+        co = (changelog_opt_t *)cld->cld_ptr;
+        for (; i < cld->cld_xtra_records; i++, co++)
+            if (co->co_free)
+                co->co_free(co);
+    }
+
+    CHANGELOG_IOBUF_UNREF(cld->cld_iobuf);
+
+    if (local->inode)
+        inode_unref(local->inode);
+
+    mem_put(local);
+}
+
+int
+changelog_write(int fd, char *buffer, size_t len)
+{
+    size_t done = 0;
+    ssize_t chunk = 0;
+
+    /* keep writing the unwritten tail until done or sys_write gives <= 0 */
+    for (done = 0; done < len; done += chunk) {
+        chunk = sys_write(fd, buffer + done, len - done);
+        if (chunk <= 0)
+            break;
+    }
+
+    /* 0: all @len bytes written, 1: short write or error (never negative) */
+    return (done != len);
+}
+
+int
+htime_update(xlator_t *this, changelog_priv_t *priv, time_t ts, char *buffer)
+{
+    char changelog_path[PATH_MAX + 1] = {
+        0,
+    };
+    int len = -1;
+    char x_value[25] = {
+        0,
+    };
+    /* time stamp(10) + : (1) + rolltime (12 ) + buffer (2) */
+    int ret = 0;
+    /* append @buffer (NUL included) to the htime file, then set HTIME_KEY */
+    if (priv->htime_fd == -1) {
+        gf_smsg(this->name, GF_LOG_ERROR, 0, CHANGELOG_MSG_HTIME_ERROR,
+                "reason=fd not available", NULL);
+        ret = -1;
+        goto out;
+    }
+    len = snprintf(changelog_path, PATH_MAX, "%s", buffer);
+    if ((len < 0) || (len >= PATH_MAX)) {
+        ret = -1;
+        goto out;
+    }
+    if (changelog_write(priv->htime_fd, (void *)changelog_path, len + 1)) {
+        gf_smsg(this->name, GF_LOG_ERROR, 0, CHANGELOG_MSG_HTIME_ERROR,
+                "reason=write failed", NULL);
+        ret = -1;
+        goto out;
+    }
+    /* changelog_write() returns 0 or 1, so any non-zero result is a failure */
+    len = snprintf(x_value, sizeof(x_value), "%ld:%d", ts,
+                   priv->rollover_count);
+    if (len >= sizeof(x_value)) {
+        ret = -1;
+        goto out;
+    }
+    /* try XATTR_REPLACE first, fall back to a plain set if that fails */
+    if (sys_fsetxattr(priv->htime_fd, HTIME_KEY, x_value, len, XATTR_REPLACE)) {
+        gf_smsg(this->name, GF_LOG_ERROR, errno, CHANGELOG_MSG_HTIME_ERROR,
+                "reason=xattr updation failed", "XATTR_REPLACE=true",
+                "changelog=%s", changelog_path, NULL);
+
+        if (sys_fsetxattr(priv->htime_fd, HTIME_KEY, x_value, len, 0)) {
+            gf_smsg(this->name, GF_LOG_ERROR, errno, CHANGELOG_MSG_HTIME_ERROR,
+                    "reason=xattr updation failed", "changelog=%s",
+                    changelog_path, NULL);
+            ret = -1;
+            goto out;
+        }
+    }
+
+    priv->rollover_count += 1;
+
+out:
+    return ret;
+}
+
+/*
+ * Description: Check if the changelog to rollover is empty or not, i.e. if
+ * the length reported by CHANGELOG_GET_HEADER_INFO (elen) equals st_size.
+ * It is assumed that fd passed is already verified.
+ * Returns:
+ * 1 : If found empty (caller then records "changelog.<TS>" in htime)
+ * 0 : If NOT empty, proceed usual.  Else: the failing fstat/lseek result.
+ */
+int
+cl_is_empty(xlator_t *this, int fd)
+{
+    int ret = -1;
+    size_t elen = 0;
+    int encoding = -1;
+    char buffer[1024] = {
+        0,
+    };
+    struct stat stbuf = {
+        0,
+    };
+    int major_version = -1;
+    int minor_version = -1;
+
+    ret = sys_fstat(fd, &stbuf);
+    if (ret) {
+        gf_smsg(this->name, GF_LOG_ERROR, errno, CHANGELOG_MSG_FSTAT_OP_FAILED,
+                NULL);
+        goto out;
+    }
+    /* rewind: the header is read from the start of the file */
+    ret = sys_lseek(fd, 0, SEEK_SET);
+    if (ret == -1) {
+        gf_smsg(this->name, GF_LOG_ERROR, errno, CHANGELOG_MSG_LSEEK_OP_FAILED,
+                NULL);
+        goto out;
+    }
+    /* NOTE(review): macro body not visible here; presumably fills elen */
+    CHANGELOG_GET_HEADER_INFO(fd, buffer, sizeof(buffer), encoding,
+                              major_version, minor_version, elen);
+
+    if (elen == stbuf.st_size) {
+        ret = 1;
+    } else {
+        ret = 0;
+    }
+
+out:
+    return ret;
+}
+
+/*
+ * Description: Updates "CHANGELOG" to "changelog" for writing changelog path
+ * to htime file.
+ *
+ * Returns:
+ * 0  : Success
+ * -1 : Error
+ */
+int
+update_path(xlator_t *this, char *cl_path)
+{
+    const char low_cl[] = "changelog";
+    const char up_cl[] = "CHANGELOG";
+    char *found = NULL;
+    int ret = -1;
+    /* rewrite the first "CHANGELOG" in @cl_path in place (same length) */
+    found = strstr(cl_path, up_cl);
+    /* strstr() leaves errno untouched: log 0 rather than a stale errno */
+    if (found == NULL) {
+        gf_smsg(this->name, GF_LOG_ERROR, 0, CHANGELOG_MSG_PATH_NOT_FOUND,
+                NULL);
+        goto out;
+    } else {
+        memcpy(found, low_cl, sizeof(low_cl) - 1);
+    }
+
+    ret = 0;
+out:
+    return ret;
+}
+
+static int
+changelog_rollover_changelog(xlator_t *this, changelog_priv_t *priv, time_t ts)
+{
+    int ret = -1;
+    int notify = 0;
+    int cl_empty_flag = 0;
+    struct tm *gmt;
+    char yyyymmdd[40];
+    char ofile[PATH_MAX] = {
+        0,
+    };
+    char nfile[PATH_MAX] = {
+        0,
+    };
+    char nfile_dir[PATH_MAX] = {
+        0,
+    };
+    changelog_event_t ev = {
+        0,
+    };
+    /* close the live changelog, archive or drop it, then record it in htime */
+    if (priv->changelog_fd != -1) {
+        ret = sys_fsync(priv->changelog_fd);
+        if (ret < 0) {
+            gf_smsg(this->name, GF_LOG_ERROR, errno,
+                    CHANGELOG_MSG_FSYNC_OP_FAILED, NULL);
+        }
+        ret = cl_is_empty(this, priv->changelog_fd);
+        if (ret == 1) {
+            cl_empty_flag = 1;
+        } else if (ret == -1) {
+            /* Log error but proceed as usual */
+            gf_smsg(this->name, GF_LOG_WARNING, 0,
+                    CHANGELOG_MSG_DETECT_EMPTY_CHANGELOG_FAILED, NULL);
+        }
+        sys_close(priv->changelog_fd);
+        priv->changelog_fd = -1;
+    }
+
+    /* Get GMT time. */
+    gmt = gmtime(&ts);
+
+    strftime(yyyymmdd, sizeof(yyyymmdd), "%Y/%m/%d", gmt);
+
+    (void)snprintf(ofile, PATH_MAX, "%s/" CHANGELOG_FILE_NAME,
+                   priv->changelog_dir);
+    (void)snprintf(nfile, PATH_MAX, "%s/%s/" CHANGELOG_FILE_NAME ".%ld",
+                   priv->changelog_dir, yyyymmdd, ts);
+    (void)snprintf(nfile_dir, PATH_MAX, "%s/%s", priv->changelog_dir, yyyymmdd);
+    /* an empty changelog is unlinked instead of being renamed to nfile */
+    if (cl_empty_flag == 1) {
+        ret = sys_unlink(ofile);
+        if (ret) {
+            gf_smsg(this->name, GF_LOG_ERROR, errno,
+                    CHANGELOG_MSG_UNLINK_OP_FAILED, "path=%s", ofile, NULL);
+            ret = 0; /* Error in unlinking empty changelog should
+                        not break further changelog operation, so
+                        reset return value to 0*/
+        }
+    } else {
+        ret = sys_rename(ofile, nfile);
+        /* test ret first: after a successful rename errno may be stale */
+        /* Changelog file rename gets ENOENT when parent dir doesn't exist */
+        if (ret && (errno == ENOENT)) {
+            ret = mkdir_p(nfile_dir, 0600, _gf_true);
+
+            if ((ret == -1) && (EEXIST != errno)) {
+                gf_smsg(this->name, GF_LOG_ERROR, errno,
+                        CHANGELOG_MSG_MKDIR_ERROR, "%s", nfile_dir, NULL);
+                goto out;
+            }
+
+            ret = sys_rename(ofile, nfile);
+        }
+
+        if (ret && (errno == ENOENT)) {
+            ret = 0;
+            goto out;
+        }
+        if (ret) {
+            gf_smsg(this->name, GF_LOG_ERROR, errno, CHANGELOG_MSG_RENAME_ERROR,
+                    "from=%s", ofile, "to=%s", nfile, NULL);
+        }
+    }
+    /* a journal event is only sent for non-empty, renamed changelogs */
+    if (!ret && (cl_empty_flag == 0)) {
+        notify = 1;
+    }
+
+    if (!ret) {
+        if (cl_empty_flag) {
+            update_path(this, nfile);
+        }
+        ret = htime_update(this, priv, ts, nfile);
+        if (ret == -1) {
+            gf_smsg(this->name, GF_LOG_ERROR, 0, CHANGELOG_MSG_HTIME_ERROR,
+                    NULL);
+            goto out;
+        }
+    }
+
+    if (notify) {
+        ev.ev_type = CHANGELOG_OP_TYPE_JOURNAL;
+        memcpy(ev.u.journal.path, nfile, strlen(nfile) + 1);
+        changelog_dispatch_event(this, priv, &ev);
+    }
+out:
+    /* If this is explicit rollover initiated by snapshot,
+     * wakeup reconfigure thread waiting for changelog to
+     * rollover. This should happen even in failure cases as
+     * well otherwise snapshot will timeout and fail. Hence
+     * moved under out.
+     */
+    if (priv->explicit_rollover) {
+        priv->explicit_rollover = _gf_false;
+
+        pthread_mutex_lock(&priv->bn.bnotify_mutex);
+        {
+            if (ret) {
+                priv->bn.bnotify_error = _gf_true;
+                gf_smsg(this->name, GF_LOG_ERROR, 0,
+                        CHANGELOG_MSG_EXPLICIT_ROLLOVER_FAILED, NULL);
+            } else {
+                gf_smsg(this->name, GF_LOG_INFO, 0, CHANGELOG_MSG_BNOTIFY_INFO,
+                        "changelog=%s", nfile, NULL);
+            }
+            priv->bn.bnotify = _gf_false;
+            pthread_cond_signal(&priv->bn.bnotify_cond);
+        }
+        pthread_mutex_unlock(&priv->bn.bnotify_mutex);
+    }
+    return ret;
+}
+
+int
+filter_cur_par_dirs(const struct dirent *entry)
+{
+    const char *name = NULL;
+
+    if (!entry)
+        return 0;
+    name = entry->d_name;
+    /* scandir() filter: 0 rejects "." and "..", 1 keeps any other name */
+    return !((strcmp(name, ".") == 0) || (strcmp(name, "..") == 0));
+}
+
+/*
+ * find_current_htime:
+ *       Copies the last entry of @ht_dir_path in alphasort() order into
+ *       @ht_file_bname and stores it as the HTIME_CURRENT xattr on @ht_dir_fd.
+ *       RETURN VALUE:
+ *           -1 : Error (scandir, name too long, fsetxattr or fsync failed)
+ *           cnt: Number of directory entries ("." and ".." excluded); may be 0
+ */
+
+int
+find_current_htime(int ht_dir_fd, const char *ht_dir_path, char *ht_file_bname)
+{
+    struct dirent **namelist = NULL;
+    int ret = 0;
+    int cnt = 0;
+    int i = 0;
+    xlator_t *this = NULL;
+
+    this = THIS;
+    GF_ASSERT(this);
+    GF_ASSERT(ht_dir_path);
+    /* filter_cur_par_dirs drops "." and ".." from the scan result */
+    cnt = scandir(ht_dir_path, &namelist, filter_cur_par_dirs, alphasort);
+    if (cnt < 0) {
+        gf_smsg(this->name, GF_LOG_ERROR, errno, CHANGELOG_MSG_SCAN_DIR_FAILED,
+                NULL);
+    } else if (cnt > 0) {
+        if (snprintf(ht_file_bname, NAME_MAX, "%s",
+                     namelist[cnt - 1]->d_name) >= NAME_MAX) {
+            ret = -1;
+            goto out;
+        }
+        if (sys_fsetxattr(ht_dir_fd, HTIME_CURRENT, ht_file_bname,
+                          strlen(ht_file_bname), 0)) {
+            gf_smsg(this->name, GF_LOG_ERROR, errno,
+                    CHANGELOG_MSG_FSETXATTR_FAILED, "HTIME_CURRENT", NULL);
+            ret = -1;
+            goto out;
+        }
+
+        if (sys_fsync(ht_dir_fd) < 0) {
+            gf_smsg(this->name, GF_LOG_ERROR, errno,
+                    CHANGELOG_MSG_FSYNC_OP_FAILED, NULL);
+            ret = -1;
+            goto out;
+        }
+    }
+
+out:
+    for (i = 0; i < cnt; i++)
+        free(namelist[i]);
+    free(namelist);
+    /* a failure after scandir() replaces the entry count with -1 */
+    if (ret)
+        cnt = ret;
+
+    return cnt;
+}
+
+/* Returns 0 on successful open of htime file
+ * returns -1 on failure or error
+ */
+int
+htime_open(xlator_t *this, changelog_priv_t *priv, time_t ts)
+{
+    int ht_file_fd = -1;
+    int ht_dir_fd = -1;
+    int ret = 0;
+    int cnt = 0;
+    char ht_dir_path[PATH_MAX] = {
+        0,
+    };
+    char ht_file_path[PATH_MAX] = {
+        0,
+    };
+    char ht_file_bname[NAME_MAX] = {
+        0,
+    };
+    char x_value[NAME_MAX] = {
+        0,
+    };
+    int flags = 0;
+    unsigned long min_ts = 0;
+    unsigned long max_ts = 0;
+    unsigned long total = 0;
+    unsigned long total1 = 0;
+    ssize_t size = 0;
+    struct stat stat_buf = {
+        0,
+    };
+    unsigned long record_len = 0;
+    int32_t len = 0;
+    /* open the current htime file; sets priv->htime_fd and ->rollover_count */
+    CHANGELOG_FILL_HTIME_DIR(priv->changelog_dir, ht_dir_path);
+
+    /* Open htime directory to get HTIME_CURRENT */
+    ht_dir_fd = open(ht_dir_path, O_RDONLY);
+    if (ht_dir_fd == -1) {
+        gf_smsg(this->name, GF_LOG_ERROR, errno, CHANGELOG_MSG_OPEN_FAILED,
+                "path=%s", ht_dir_path, NULL);
+        ret = -1;
+        goto out;
+    }
+    /* size - 1 keeps the zero-initialised last byte as the terminator */
+    size = sys_fgetxattr(ht_dir_fd, HTIME_CURRENT, ht_file_bname,
+                         sizeof(ht_file_bname) - 1);
+    if (size < 0) {
+        gf_smsg(this->name, GF_LOG_ERROR, errno, CHANGELOG_MSG_FGETXATTR_FAILED,
+                "name=HTIME_CURRENT", NULL);
+
+        /* If upgrade scenario, find the latest HTIME.TSTAMP file
+         * and use the same. If error, create a new HTIME.TSTAMP
+         * file.
+         */
+        cnt = find_current_htime(ht_dir_fd, ht_dir_path, ht_file_bname);
+        if (cnt <= 0) {
+            gf_smsg(this->name, GF_LOG_INFO, errno,
+                    CHANGELOG_MSG_NO_HTIME_CURRENT, NULL);
+            sys_close(ht_dir_fd);
+            return htime_create(this, priv, ts);
+        }
+
+        gf_smsg(this->name, GF_LOG_ERROR, errno,
+                CHANGELOG_MSG_HTIME_CURRENT_ERROR, NULL);
+    }
+
+    gf_smsg(this->name, GF_LOG_INFO, 0, CHANGELOG_MSG_HTIME_CURRENT, "path=%s",
+            ht_file_bname, NULL);
+    len = snprintf(ht_file_path, PATH_MAX, "%s/%s", ht_dir_path, ht_file_bname);
+    if ((len < 0) || (len >= PATH_MAX)) {
+        ret = -1;
+        goto out;
+    }
+
+    /* Open in append mode as existing htime file is used */
+    flags |= (O_RDWR | O_SYNC | O_APPEND);
+    ht_file_fd = open(ht_file_path, flags,
+                      S_IRUSR | S_IWUSR | S_IRGRP | S_IROTH);
+    if (ht_file_fd < 0) {
+        gf_smsg(this->name, GF_LOG_ERROR, errno, CHANGELOG_MSG_OPEN_FAILED,
+                "path=%s", ht_file_path, NULL);
+        ret = -1;
+        goto out;
+    }
+
+    /* save this htime_fd in priv->htime_fd */
+    priv->htime_fd = ht_file_fd;
+
+    ret = sys_fstat(ht_file_fd, &stat_buf);
+    if (ret < 0) {
+        gf_smsg(this->name, GF_LOG_ERROR, errno, CHANGELOG_MSG_HTIME_STAT_ERROR,
+                "path=%s", ht_file_path, NULL);
+        ret = -1;
+        goto out;
+    }
+    /* as above: leave room for the NUL, the xattr value carries none */
+    /* Initialize rollover-number in priv to current number */
+    size = sys_fgetxattr(ht_file_fd, HTIME_KEY, x_value, sizeof(x_value) - 1);
+    if (size < 0) {
+        gf_smsg(this->name, GF_LOG_ERROR, errno, CHANGELOG_MSG_FGETXATTR_FAILED,
+                "name=%s", HTIME_KEY, "path=%s", ht_file_path, NULL);
+        ret = -1;
+        goto out;
+    }
+    /* value format is "<max_ts>:<total>", as written by htime_update() */
+    sscanf(x_value, "%lu:%lu", &max_ts, &total);
+
+    /* 22 = 1(/) + 20(CHANGELOG.TIMESTAMP) + 1(\x00) */
+    record_len = strlen(priv->changelog_dir) + 22;
+    total1 = stat_buf.st_size / record_len;
+    if (total != total1) {
+        gf_smsg(this->name, GF_LOG_INFO, 0, CHANGELOG_MSG_TOTAL_LOG_INFO,
+                "xattr_total=%lu", total, "size_total=%lu", total1, NULL);
+    }
+
+    gf_smsg(this->name, GF_LOG_INFO, 0, CHANGELOG_MSG_TOTAL_LOG_INFO, "min=%lu",
+            min_ts, "max=%lu", max_ts, "total_changelogs=%lu", total, NULL);
+    /* continue counting from the larger of the two totals */
+    if (total < total1)
+        priv->rollover_count = total1 + 1;
+    else
+        priv->rollover_count = total + 1;
+
+out:
+    if (ht_dir_fd != -1)
+        sys_close(ht_dir_fd);
+    return ret;
+}
+
+/* Returns 0 on successful creation of htime file
+ * returns -1 on failure or error
+ */
+int
+htime_create(xlator_t *this, changelog_priv_t *priv, time_t ts)
+{
+    int ht_file_fd = -1;
+    int ht_dir_fd = -1;
+    int ret = 0;
+    char ht_dir_path[PATH_MAX] = {
+        0,
+    };
+    char ht_file_path[PATH_MAX] = {
+        0,
+    };
+    char ht_file_bname[NAME_MAX + 1] = {
+        0,
+    };
+    int flags = 0;
+    int32_t len = 0;
+    /* create "<HTIME_FILE_NAME>.<ts>" and make it the directory's current */
+    gf_smsg(this->name, GF_LOG_INFO, 0, CHANGELOG_MSG_NEW_HTIME_FILE,
+            "name=%ld", ts, NULL);
+
+    CHANGELOG_FILL_HTIME_DIR(priv->changelog_dir, ht_dir_path);
+
+    /* get the htime file name in ht_file_path */
+    len = snprintf(ht_file_path, PATH_MAX, "%s/%s.%ld", ht_dir_path,
+                   HTIME_FILE_NAME, ts);
+    if ((len < 0) || (len >= PATH_MAX)) {
+        ret = -1;
+        goto out;
+    }
+
+    flags |= (O_CREAT | O_RDWR | O_SYNC);
+    ht_file_fd = open(ht_file_path, flags,
+                      S_IRUSR | S_IWUSR | S_IRGRP | S_IROTH);
+    if (ht_file_fd < 0) {
+        gf_smsg(this->name, GF_LOG_ERROR, errno, CHANGELOG_MSG_OPEN_FAILED,
+                "path=%s", ht_file_path, NULL);
+        ret = -1;
+        goto out;
+    }
+    /* seed HTIME_KEY with HTIME_INITIAL_VALUE (terminating NUL not stored) */
+    if (sys_fsetxattr(ht_file_fd, HTIME_KEY, HTIME_INITIAL_VALUE,
+                      sizeof(HTIME_INITIAL_VALUE) - 1, 0)) {
+        gf_smsg(this->name, GF_LOG_ERROR, errno,
+                CHANGELOG_MSG_XATTR_INIT_FAILED, NULL);
+        ret = -1;
+        goto out;
+    }
+
+    ret = sys_fsync(ht_file_fd);
+    if (ret < 0) {
+        gf_smsg(this->name, GF_LOG_ERROR, errno, CHANGELOG_MSG_FSYNC_OP_FAILED,
+                NULL);
+        goto out;
+    }
+
+    /* save this htime_fd in priv->htime_fd */
+    priv->htime_fd = ht_file_fd;
+    /* priv holds the fd from here: reset the local so 'out' won't close it */
+    ht_file_fd = -1;
+
+    /* Set xattr HTIME_CURRENT on htime directory to htime filename */
+    ht_dir_fd = open(ht_dir_path, O_RDONLY);
+    if (ht_dir_fd == -1) {
+        gf_smsg(this->name, GF_LOG_ERROR, errno, CHANGELOG_MSG_OPEN_FAILED,
+                "path=%s", ht_dir_path, NULL);
+        ret = -1;
+        goto out;
+    }
+
+    (void)snprintf(ht_file_bname, sizeof(ht_file_bname), "%s.%ld",
+                   HTIME_FILE_NAME, ts);
+    if (sys_fsetxattr(ht_dir_fd, HTIME_CURRENT, ht_file_bname,
+                      strlen(ht_file_bname), 0)) {
+        gf_smsg(this->name, GF_LOG_ERROR, errno, CHANGELOG_MSG_FSETXATTR_FAILED,
+                " HTIME_CURRENT", NULL);
+        ret = -1;
+        goto out;
+    }
+
+    ret = sys_fsync(ht_dir_fd);
+    if (ret < 0) {
+        gf_smsg(this->name, GF_LOG_ERROR, errno, CHANGELOG_MSG_FSYNC_OP_FAILED,
+                NULL);
+        goto out;
+    }
+
+    /* initialize rollover-number in priv to 1 */
+    priv->rollover_count = 1;
+
+out:
+    if (ht_dir_fd != -1)
+        sys_close(ht_dir_fd);
+    if (ht_file_fd != -1)
+        sys_close(ht_file_fd);
+    return ret;
+}
+
+/* Description:
+ *      Opens the snap changelog to log call path fops in it.
+ *      This changelog's name is "CHANGELOG.SNAP", stored in
+ *      path ".glusterfs/changelogs/csnap".
+ * Returns:
+ *       0  : On success.
+ *      -1  : On failure.
+ */
+int
+changelog_snap_open(xlator_t *this, changelog_priv_t *priv)
+{
+    int fd = -1;
+    int ret = 0;
+    int flags = 0;
+    char buffer[1024] = {
+        0,
+    };
+    char c_snap_path[PATH_MAX] = {
+        0,
+    };
+    char csnap_dir_path[PATH_MAX] = {
+        0,
+    };
+    int32_t len = 0;
+
+    CHANGELOG_FILL_CSNAP_DIR(priv->changelog_dir, csnap_dir_path);
+
+    len = snprintf(c_snap_path, PATH_MAX, "%s/" CSNAP_FILE_NAME,
+                   csnap_dir_path);
+    if ((len < 0) || (len >= PATH_MAX)) {
+        ret = -1;
+        goto out;
+    }
+    /* O_TRUNC: any previous content of the snap journal is discarded */
+    flags |= (O_CREAT | O_RDWR | O_TRUNC);
+
+    fd = open(c_snap_path, flags, S_IRUSR | S_IWUSR | S_IRGRP | S_IROTH);
+    if (fd < 0) {
+        gf_smsg(this->name, GF_LOG_ERROR, errno, CHANGELOG_MSG_OPEN_FAILED,
+                "path=%s", c_snap_path, NULL);
+        ret = -1;
+        goto out;
+    }
+    priv->c_snap_fd = fd;
+    /* the fresh journal starts with the formatted CHANGELOG_HEADER */
+    (void)snprintf(buffer, 1024, CHANGELOG_HEADER, CHANGELOG_VERSION_MAJOR,
+                   CHANGELOG_VERSION_MINOR, priv->ce->encoder);
+    ret = changelog_snap_write_change(priv, buffer, strlen(buffer));
+    if (ret < 0) {
+        sys_close(priv->c_snap_fd);
+        priv->c_snap_fd = -1;
+        goto out;
+    }
+
+out:
+    return ret;
+}
+
+/*
+ * Description:
+ *      Starts logging fop details in the CSNAP journal by opening it via
+ *      changelog_snap_open(). The "starting" info message is emitted
+ *      regardless of whether the open succeeded.
+ * Returns:
+ *      Whatever changelog_snap_open() returned.
+ */
+int
+changelog_snap_logging_start(xlator_t *this, changelog_priv_t *priv)
+{
+    const int open_status = changelog_snap_open(this, priv);
+
+    gf_smsg(this->name, GF_LOG_INFO, 0, CHANGELOG_MSG_SNAP_INFO, "starting",
+            NULL);
+
+    return open_status;
+}
+
+/*
+ * Description:
+ *      Stops logging fop details in the CSNAP journal: closes
+ *      priv->c_snap_fd and marks it as not open (-1).
+ * Returns:
+ *       0 : Always; the result of the close is not inspected.
+ */
+int
+changelog_snap_logging_stop(xlator_t *this, changelog_priv_t *priv)
+{
+    sys_close(priv->c_snap_fd);
+    priv->c_snap_fd = -1;
+
+    gf_smsg(this->name, GF_LOG_INFO, 0, CHANGELOG_MSG_SNAP_INFO, "Stopped",
+            NULL);
+
+    return 0;
+}
+
+/*
+ * Description:
+ *      Opens (creating it if needed) the live changelog journal
+ *      CHANGELOG_FILE_NAME inside priv->changelog_dir, stores the descriptor
+ *      in priv->changelog_fd and writes the changelog header into it.
+ *      O_SYNC is requested when priv->fsync_interval is 0.
+ * Returns:
+ *       0 : On success.
+ *      -1 : If the journal path does not fit in PATH_MAX or the file cannot
+ *           be opened.
+ *      the non-zero result of the header write if that fails; in that case
+ *      the descriptor is closed and priv->changelog_fd reset to -1.
+ */
+int
+changelog_open_journal(xlator_t *this, changelog_priv_t *priv)
+{
+    int fd = -1;
+    int ret = -1;
+    int flags = 0;
+    int len = 0;
+    char buffer[1024] = {
+        0,
+    };
+    char changelog_path[PATH_MAX] = {
+        0,
+    };
+
+    len = snprintf(changelog_path, PATH_MAX, "%s/" CHANGELOG_FILE_NAME,
+                   priv->changelog_dir);
+    if ((len < 0) || (len >= PATH_MAX)) {
+        /* a truncated path would name some other file; refuse to open it
+         * (same policy as changelog_snap_open) */
+        goto out;
+    }
+
+    flags |= (O_CREAT | O_RDWR);
+    if (priv->fsync_interval == 0)
+        flags |= O_SYNC;
+
+    fd = open(changelog_path, flags, S_IRUSR | S_IWUSR | S_IRGRP | S_IROTH);
+    if (fd < 0) {
+        gf_smsg(this->name, GF_LOG_ERROR, errno, CHANGELOG_MSG_OPEN_FAILED,
+                "path=%s", changelog_path, NULL);
+        goto out;
+    }
+
+    priv->changelog_fd = fd;
+
+    (void)snprintf(buffer, 1024, CHANGELOG_HEADER, CHANGELOG_VERSION_MAJOR,
+                   CHANGELOG_VERSION_MINOR, priv->ce->encoder);
+    ret = changelog_write_change(priv, buffer, strlen(buffer));
+    if (ret) {
+        /* header write failed: do not keep a journal without a header */
+        sys_close(priv->changelog_fd);
+        priv->changelog_fd = -1;
+        goto out;
+    }
+
+    ret = 0;
+
+out:
+    return ret;
+}
+
+/*
+ * Rolls over the current changelog at time @ts and, unless this is the
+ * final rollover (@finale), opens a fresh journal.
+ *
+ * Returns the rollover result when it failed or when @finale is set,
+ * otherwise the result of changelog_open_journal().
+ */
+int
+changelog_start_next_change(xlator_t *this, changelog_priv_t *priv, time_t ts,
+                            gf_boolean_t finale)
+{
+    int rollover_status = changelog_rollover_changelog(this, priv, ts);
+
+    if (rollover_status || finale)
+        return rollover_status;
+
+    return changelog_open_journal(this, priv);
+}
+
+/**
+ * return the length of entry, i.e. the size in bytes of one
+ * changelog_log_data_t record.
+ *
+ * Declared with an explicit (void) parameter list so that the definition is
+ * a proper prototype rather than an old-style unspecified-argument one.
+ */
+size_t
+changelog_entry_length(void)
+{
+    return sizeof(changelog_log_data_t);
+}
+
+/*
+ * Prepares @cld as a rollover event stamped with the current time.
+ * @is_last is recorded as the "finale" flag of the event.
+ */
+void
+changelog_fill_rollover_data(changelog_log_data_t *cld, gf_boolean_t is_last)
+{
+    cld->cld_finale = is_last;
+    cld->cld_roll_time = gf_time();
+    cld->cld_type = CHANGELOG_TYPE_ROLLOVER;
+}
+
+/* Writes @len bytes of @buffer to the snap journal (priv->c_snap_fd). */
+int
+changelog_snap_write_change(changelog_priv_t *priv, char *buffer, size_t len)
+{
+    const int snap_fd = priv->c_snap_fd;
+
+    return changelog_write(snap_fd, buffer, len);
+}
+
+/* Writes @len bytes of @buffer to the live journal (priv->changelog_fd). */
+int
+changelog_write_change(changelog_priv_t *priv, char *buffer, size_t len)
+{
+    const int journal_fd = priv->changelog_fd;
+
+    return changelog_write(journal_fd, buffer, len);
+}
+
+/*
+ * Descriptions:
+ *      Writes fop details in ascii format to CSNAP.
+ * Issues:
+ *      Not Encoding agnostic.
+ * Returns:
+ *      0 : On Success.
+ *     -1 : On Failure (NULL xlator, NULL private data, or the write to the
+ *          snap journal failed).
+ */
+int
+changelog_snap_handle_ascii_change(xlator_t *this, changelog_log_data_t *cld)
+{
+    size_t off = 0;
+    size_t gfid_len = 0;
+    char *gfid_str = NULL;
+    char *buffer = NULL;
+    changelog_priv_t *priv = NULL;
+    int ret = -1;
+
+    if (this == NULL)
+        goto out;
+
+    priv = this->private;
+    if (priv == NULL)
+        goto out;
+
+    gfid_str = uuid_utoa(cld->cld_gfid);
+    gfid_len = strlen(gfid_str);
+
+    /*  extra bytes for decorations */
+    buffer = alloca(gfid_len + cld->cld_ptr_len + 10);
+    CHANGELOG_STORE_ASCII(priv, buffer, off, gfid_str, gfid_len, cld);
+
+    CHANGELOG_FILL_BUFFER(buffer, off, "\0", 1);
+
+    if (changelog_snap_write_change(priv, buffer, off) < 0) {
+        /* report the failure to the caller instead of logging success
+         * and returning 0 */
+        gf_smsg(this->name, GF_LOG_ERROR, 0, CHANGELOG_MSG_WRITE_FAILED,
+                "csnap", NULL);
+        goto out;
+    }
+
+    gf_smsg(this->name, GF_LOG_INFO, 0, CHANGELOG_MSG_WROTE_TO_CSNAP, NULL);
+    ret = 0;
+out:
+    return ret;
+}
+
+/*
+ * Applies one changelog event to the journal.
+ *
+ *  - rollover events: flush the encoder state and start the next changelog;
+ *  - fsync events: fsync the journal descriptor;
+ *  - anything else: hand the record to the configured encoder.
+ *
+ * Non-rollover events are silently accepted (0) when the journal is not
+ * open. Returns 0 on success, otherwise the failing step's return value.
+ */
+int
+changelog_handle_change(xlator_t *this, changelog_priv_t *priv,
+                        changelog_log_data_t *cld)
+{
+    int status = 0;
+
+    if (CHANGELOG_TYPE_IS_ROLLOVER(cld->cld_type)) {
+        changelog_encode_change(priv);
+        status = changelog_start_next_change(this, priv, cld->cld_roll_time,
+                                             cld->cld_finale);
+        if (status != 0)
+            gf_smsg(this->name, GF_LOG_ERROR, 0,
+                    CHANGELOG_MSG_GET_TIME_OP_FAILED, NULL);
+        return status;
+    }
+
+    /**
+     * changelog may have been disabled by a reconfigure while fops that
+     * still carry updates are in progress; there is nothing to write to.
+     */
+    if (priv->changelog_fd == -1)
+        return 0;
+
+    if (CHANGELOG_TYPE_IS_FSYNC(cld->cld_type)) {
+        status = sys_fsync(priv->changelog_fd);
+        if (status < 0)
+            gf_smsg(this->name, GF_LOG_ERROR, errno,
+                    CHANGELOG_MSG_FSYNC_OP_FAILED, NULL);
+        return status;
+    }
+
+    status = priv->ce->encode(this, cld);
+    if (status != 0)
+        gf_smsg(this->name, GF_LOG_ERROR, 0, CHANGELOG_MSG_WRITE_FAILED,
+                "changelog", NULL);
+
+    return status;
+}
+
+/*
+ * Allocates and initialises the per-fop changelog local.
+ *
+ * @inode may be NULL only when @update_flag is true; callers of such fops
+ * must not blindly dereference local->inode. When @xtra_records is
+ * non-zero an iobuf large enough for that many optional records is
+ * attached to the local.
+ *
+ * Returns the new local, or NULL on a missing inode or allocation failure.
+ */
+changelog_local_t *
+changelog_local_init(xlator_t *this, inode_t *inode, uuid_t gfid,
+                     int xtra_records, gf_boolean_t update_flag)
+{
+    struct iobuf *iobuf = NULL;
+    changelog_local_t *local = NULL;
+
+    if (!inode && !update_flag) {
+        gf_msg_callingfn(this->name, GF_LOG_WARNING, 0,
+                         CHANGELOG_MSG_INODE_NOT_FOUND,
+                         "inode needed for version checking !!!");
+        return NULL;
+    }
+
+    if (xtra_records) {
+        iobuf = iobuf_get2(this->ctx->iobuf_pool,
+                           xtra_records * CHANGELOG_OPT_RECORD_LEN);
+        if (!iobuf)
+            return NULL;
+    }
+
+    local = mem_get0(this->local_pool);
+    if (!local) {
+        CHANGELOG_IOBUF_UNREF(iobuf);
+        return NULL;
+    }
+
+    gf_uuid_copy(local->cld.cld_gfid, gfid);
+    local->update_no_check = update_flag;
+
+    local->cld.cld_xtra_records = 0; /* set by the caller */
+    local->cld.cld_iobuf = iobuf;
+
+    if (inode)
+        local->inode = inode_ref(inode);
+
+    return local;
+}
+
+/*
+ * forget() handler: detaches this xlator's context from @inode and frees
+ * it. Always returns 0.
+ */
+int
+changelog_forget(xlator_t *this, inode_t *inode)
+{
+    uint64_t ctx_addr = 0;
+    changelog_inode_ctx_t *ctx = NULL;
+
+    inode_ctx_del(inode, this, &ctx_addr);
+    if (!ctx_addr)
+        return 0;
+
+    /* the context was stored through uintptr_t (see
+     * __changelog_inode_ctx_set); recover it the same way */
+    ctx = (changelog_inode_ctx_t *)(uintptr_t)ctx_addr;
+    GF_FREE(ctx);
+
+    return 0;
+}
+
+/*
+ * Pushes a single event (@cld, with no companion record) through the
+ * configured dispatcher and returns the dispatcher's result.
+ */
+int
+changelog_inject_single_event(xlator_t *this, changelog_priv_t *priv,
+                              changelog_log_data_t *cld)
+{
+    int dispatch_status = 0;
+
+    dispatch_status = priv->cd.dispatchfn(this, priv, priv->cd.cd_data, cld,
+                                          NULL);
+    return dispatch_status;
+}
+
+/* Wait till all the black fops are drained.
+ *
+ * Blocks on priv->dm.drain_black_cond until priv->dm.black_fop_cnt drops
+ * to zero. priv->dm.drain_wait_black is raised while waiting so that
+ * changelog_dec_fop_cnt() knows a waiter needs to be signalled.
+ *
+ * pthread failures are only logged; the pthread_* calls return the error
+ * number directly (they do not set errno), so that value is what is
+ * handed to the logger.
+ */
+void
+changelog_drain_black_fops(xlator_t *this, changelog_priv_t *priv)
+{
+    int ret = 0;
+
+    /* clean up framework of pthread_mutex is required here as
+     * 'reconfigure' terminates the changelog_rollover thread
+     * on graph change.
+     */
+    pthread_cleanup_push(changelog_cleanup_free_mutex,
+                         &priv->dm.drain_black_mutex);
+    ret = pthread_mutex_lock(&priv->dm.drain_black_mutex);
+    if (ret)
+        gf_smsg(this->name, GF_LOG_ERROR, ret, CHANGELOG_MSG_PTHREAD_ERROR,
+                "error=%d", ret, NULL);
+    while (priv->dm.black_fop_cnt > 0) {
+        gf_msg_debug(this->name, 0, "Conditional wait on black fops: %ld",
+                     priv->dm.black_fop_cnt);
+        priv->dm.drain_wait_black = _gf_true;
+        ret = pthread_cond_wait(&priv->dm.drain_black_cond,
+                                &priv->dm.drain_black_mutex);
+        if (ret)
+            gf_smsg(this->name, GF_LOG_ERROR, ret,
+                    CHANGELOG_MSG_PTHREAD_COND_WAIT_FAILED, "error=%d", ret,
+                    NULL);
+    }
+    priv->dm.drain_wait_black = _gf_false;
+    ret = pthread_mutex_unlock(&priv->dm.drain_black_mutex);
+    if (ret)
+        gf_smsg(this->name, GF_LOG_ERROR, ret, CHANGELOG_MSG_PTHREAD_ERROR,
+                "error=%d", ret, NULL);
+    pthread_cleanup_pop(0);
+    gf_msg_debug(this->name, 0, "Woke up: Conditional wait on black fops");
+}
+
+/* Wait till all the white fops are drained.
+ *
+ * Blocks on priv->dm.drain_white_cond until priv->dm.white_fop_cnt drops
+ * to zero. priv->dm.drain_wait_white is raised while waiting so that
+ * changelog_dec_fop_cnt() knows a waiter needs to be signalled.
+ *
+ * pthread failures are only logged; the pthread_* calls return the error
+ * number directly (they do not set errno), so that value is what is
+ * handed to the logger.
+ */
+void
+changelog_drain_white_fops(xlator_t *this, changelog_priv_t *priv)
+{
+    int ret = 0;
+
+    /* clean up framework of pthread_mutex is required here as
+     * 'reconfigure' terminates the changelog_rollover thread
+     * on graph change.
+     */
+    pthread_cleanup_push(changelog_cleanup_free_mutex,
+                         &priv->dm.drain_white_mutex);
+    ret = pthread_mutex_lock(&priv->dm.drain_white_mutex);
+    if (ret)
+        gf_smsg(this->name, GF_LOG_ERROR, ret, CHANGELOG_MSG_PTHREAD_ERROR,
+                "error=%d", ret, NULL);
+    while (priv->dm.white_fop_cnt > 0) {
+        gf_msg_debug(this->name, 0, "Conditional wait on white fops : %ld",
+                     priv->dm.white_fop_cnt);
+        priv->dm.drain_wait_white = _gf_true;
+        ret = pthread_cond_wait(&priv->dm.drain_white_cond,
+                                &priv->dm.drain_white_mutex);
+        if (ret)
+            gf_smsg(this->name, GF_LOG_ERROR, ret,
+                    CHANGELOG_MSG_PTHREAD_COND_WAIT_FAILED, "error=%d", ret,
+                    NULL);
+    }
+    priv->dm.drain_wait_white = _gf_false;
+    ret = pthread_mutex_unlock(&priv->dm.drain_white_mutex);
+    if (ret)
+        gf_smsg(this->name, GF_LOG_ERROR, ret, CHANGELOG_MSG_PTHREAD_ERROR,
+                "error=%d", ret, NULL);
+    pthread_cleanup_pop(0);
+    gf_msg_debug(this->name, 0, "Woke up: Conditional wait on white fops");
+}
+
+/**
+ * TODO: these threads have many thing in common (wake up after
+ * a certain time etc..). move them into separate routine.
+ */
+/*
+ * Rollover thread body. @data is the changelog_priv_t of the xlator.
+ *
+ * Each iteration:
+ *   1. waits on priv->cr.cond until either priv->rollover_time seconds
+ *      elapse (periodic rollover) or priv->cr.notify is set (explicit
+ *      rollover, see changelog_barrier_notify());
+ *   2. flips priv->current_color and waits for fops of the previous color
+ *      to drain;
+ *   3. injects a rollover event through the dispatcher and, on success,
+ *      bumps the slice version under priv->lock.
+ *
+ * The loop never exits on its own; the thread is expected to be cancelled
+ * (hence the pthread_testcancel / cleanup handlers / cancellation masking).
+ */
+void *
+changelog_rollover(void *data)
+{
+    int ret = 0;
+    xlator_t *this = NULL;
+    struct timespec tv = {
+        0,
+    };
+    changelog_log_data_t cld = {
+        0,
+    };
+    changelog_time_slice_t *slice = NULL;
+    changelog_priv_t *priv = data;
+
+    this = priv->cr.this;
+    slice = &priv->slice;
+
+    while (1) {
+        (void)pthread_testcancel();
+
+        tv.tv_sec = gf_time() + priv->rollover_time;
+        tv.tv_nsec = 0;
+        ret = 0; /* Reset ret to zero */
+
+        /* The race between actual rollover and explicit rollover is
+         * handled. If actual rollover is being done and the
+         * explicit rollover event comes, the event is not missed.
+         * Since explicit rollover sets 'cr.notify' to true, this
+         * thread doesn't wait on 'pthread_cond_timedwait'.
+         */
+        pthread_cleanup_push(changelog_cleanup_free_mutex, &priv->cr.lock);
+        pthread_mutex_lock(&priv->cr.lock);
+        {
+            while (ret == 0 && !priv->cr.notify)
+                ret = pthread_cond_timedwait(&priv->cr.cond, &priv->cr.lock,
+                                             &tv);
+            if (ret == 0)
+                priv->cr.notify = _gf_false;
+        }
+        pthread_mutex_unlock(&priv->cr.lock);
+        pthread_cleanup_pop(0);
+
+        if (ret == 0) {
+            /* woken by an explicit notification */
+            gf_smsg(this->name, GF_LOG_INFO, 0, CHANGELOG_MSG_BARRIER_INFO,
+                    NULL);
+            priv->explicit_rollover = _gf_true;
+        } else if (ret != ETIMEDOUT) {
+            /* pthread_cond_timedwait reports failures through its return
+             * value, not errno, so log that value.
+             */
+            gf_smsg(this->name, GF_LOG_ERROR, ret,
+                    CHANGELOG_MSG_SELECT_FAILED, NULL);
+            continue;
+        } else {
+            gf_msg_debug(this->name, 0, "Wokeup on timeout");
+        }
+
+        /* Reading current_color without lock is fine here
+         * as it is only modified here and is next to reading.
+         */
+        if (priv->current_color == FOP_COLOR_BLACK) {
+            LOCK(&priv->lock);
+            priv->current_color = FOP_COLOR_WHITE;
+            UNLOCK(&priv->lock);
+            gf_msg_debug(this->name, 0,
+                         "Black fops"
+                         " to be drained:%ld",
+                         priv->dm.black_fop_cnt);
+            changelog_drain_black_fops(this, priv);
+        } else {
+            LOCK(&priv->lock);
+            priv->current_color = FOP_COLOR_BLACK;
+            UNLOCK(&priv->lock);
+            gf_msg_debug(this->name, 0,
+                         "White fops"
+                         " to be drained:%ld",
+                         priv->dm.white_fop_cnt);
+            changelog_drain_white_fops(this, priv);
+        }
+
+        /* Adding delay of 1 second only during explicit rollover:
+         *
+         * Changelog rollover can happen either due to actual
+         * or the explicit rollover during snapshot. Actual
+         * rollover is controlled by tuneable called 'rollover-time'.
+         * The minimum granularity for rollover-time is 1 second.
+         * Explicit rollover is asynchronous in nature and happens
+         * during snapshot.
+         *
+         * Basically, rollover renames the current CHANGELOG file
+         * to CHANGELOG.TIMESTAMP. Let's assume, at time 't1',
+         * actual and explicit rollover raced against  each
+         * other and actual rollover won the race renaming the
+         * CHANGELOG file to CHANGELOG.t1 and opens a new
+         * CHANGELOG file. There is high chance that, an immediate
+         * explicit rollover at time 't1' can happen with in the same
+         * second to rename CHANGELOG file to CHANGELOG.t1 resulting in
+         * purging the earlier CHANGELOG.t1 file created by actual
+         * rollover. So adding a delay of 1 second guarantees unique
+         * CHANGELOG.TIMESTAMP during  explicit rollover.
+         */
+        if (priv->explicit_rollover == _gf_true)
+            sleep(1);
+
+        changelog_fill_rollover_data(&cld, _gf_false);
+
+        /* the dispatch + version bump must not be torn by cancellation */
+        _mask_cancellation();
+
+        LOCK(&priv->lock);
+        {
+            ret = changelog_inject_single_event(this, priv, &cld);
+            if (!ret)
+                SLICE_VERSION_UPDATE(slice);
+        }
+        UNLOCK(&priv->lock);
+
+        _unmask_cancellation();
+    }
+
+    return NULL;
+}
+
+/*
+ * Periodic fsync thread body. @data is the changelog_priv_t of the xlator.
+ *
+ * Sleeps priv->fsync_interval seconds (using select() as a timer) and then
+ * injects a CHANGELOG_TYPE_FSYNC event through the dispatcher. The
+ * injection is performed with cancellation masked. The loop never exits on
+ * its own; the thread is expected to be cancelled.
+ */
+void *
+changelog_fsync_thread(void *data)
+{
+    changelog_priv_t *priv = data;
+    xlator_t *this = NULL;
+    changelog_log_data_t cld = {
+        0,
+    };
+    struct timeval tv = {
+        0,
+    };
+    int status = 0;
+
+    this = priv->cf.this;
+    cld.cld_type = CHANGELOG_TYPE_FSYNC;
+
+    for (;;) {
+        (void)pthread_testcancel();
+
+        tv.tv_sec = priv->fsync_interval;
+        tv.tv_usec = 0;
+
+        /* a non-zero return means the sleep did not simply time out;
+         * start over */
+        status = select(0, NULL, NULL, NULL, &tv);
+        if (status != 0)
+            continue;
+
+        _mask_cancellation();
+
+        status = changelog_inject_single_event(this, priv, &cld);
+        if (status != 0)
+            gf_smsg(this->name, GF_LOG_ERROR, 0,
+                    CHANGELOG_MSG_INJECT_FSYNC_FAILED, NULL);
+
+        _unmask_cancellation();
+    }
+
+    return NULL;
+}
+
+/* macros for inode/changelog version checks */
+
+/* Stores the slice's current version for @type into *@iver. The copy is
+ * made with inode->lock held and priv->lock nested inside it. Arguments
+ * are parenthesised so that arbitrary expressions can be passed safely.
+ */
+#define INODE_VERSION_UPDATE(priv, inode, iver, slice, type)                   \
+    do {                                                                       \
+        LOCK(&(inode)->lock);                                                  \
+        {                                                                      \
+            LOCK(&(priv)->lock);                                               \
+            {                                                                  \
+                *(iver) = (slice)->changelog_version[(type)];                  \
+            }                                                                  \
+            UNLOCK(&(priv)->lock);                                             \
+        }                                                                      \
+        UNLOCK(&(inode)->lock);                                                \
+    } while (0)
+
+/* Sets @upd to _gf_false when @ver equals the slice's current version for
+ * @type (nothing new to record) and to _gf_true otherwise. The comparison
+ * is made under priv->lock. Arguments are parenthesised so that arbitrary
+ * expressions can be passed safely; @upd must be an lvalue.
+ */
+#define INODE_VERSION_EQUALS_SLICE(priv, ver, slice, type, upd)                \
+    do {                                                                       \
+        LOCK(&(priv)->lock);                                                   \
+        {                                                                      \
+            (upd) = ((ver) == (slice)->changelog_version[(type)]) ? _gf_false  \
+                                                                  : _gf_true;  \
+        }                                                                      \
+        UNLOCK(&(priv)->lock);                                                 \
+    } while (0)
+
+/*
+ * Stores @ctx as this xlator's context on @inode using the lock-free
+ * __inode_ctx_set variant; the pointer is carried as a 64-bit integer.
+ * Returns the result of __inode_ctx_set().
+ */
+static int
+__changelog_inode_ctx_set(xlator_t *this, inode_t *inode,
+                          changelog_inode_ctx_t *ctx)
+{
+    uint64_t encoded_ctx = 0;
+
+    encoded_ctx = (uint64_t)(uintptr_t)ctx;
+
+    return __inode_ctx_set(inode, this, &encoded_ctx);
+}
+
+/**
+ * one shot routine to get the address and the value of a inode version
+ * for a particular type.
+ *
+ * Looks up this xlator's context on @inode, allocating and attaching a
+ * zeroed one when none exists yet. When both @iver and @version are
+ * supplied, *@iver receives the address of the per-type version slot and
+ * *@version its current value.
+ *
+ * Uses the lock-free __inode_ctx_* variants (see changelog_inode_ctx_get()
+ * for the wrapper that takes inode->lock).
+ *
+ * Returns the context, or NULL when it could not be allocated or attached.
+ */
+changelog_inode_ctx_t *
+__changelog_inode_ctx_get(xlator_t *this, inode_t *inode, unsigned long **iver,
+                          unsigned long *version, changelog_log_type type)
+{
+    int ret = 0;
+    uint64_t ctx_addr = 0;
+    changelog_inode_ctx_t *ctx = NULL;
+
+    ret = __inode_ctx_get(inode, this, &ctx_addr);
+    if (ret < 0)
+        ctx_addr = 0;
+    if (ctx_addr != 0) {
+        /* stored through uintptr_t by __changelog_inode_ctx_set();
+         * recover it the same way */
+        ctx = (changelog_inode_ctx_t *)(uintptr_t)ctx_addr;
+        goto out;
+    }
+
+    ctx = GF_CALLOC(1, sizeof(*ctx), gf_changelog_mt_inode_ctx_t);
+    if (!ctx)
+        goto out;
+
+    ret = __changelog_inode_ctx_set(this, inode, ctx);
+    if (ret) {
+        GF_FREE(ctx);
+        ctx = NULL;
+    }
+
+out:
+    if (ctx && iver && version) {
+        *iver = CHANGELOG_INODE_VERSION_TYPE(ctx, type);
+        *version = **iver;
+    }
+
+    return ctx;
+}
+
+/*
+ * Locked wrapper around __changelog_inode_ctx_get(): performs the lookup
+ * (and possible creation) of the inode context with inode->lock held.
+ */
+static changelog_inode_ctx_t *
+changelog_inode_ctx_get(xlator_t *this, inode_t *inode, unsigned long **iver,
+                        unsigned long *version, changelog_log_type type)
+{
+    changelog_inode_ctx_t *found = NULL;
+
+    LOCK(&inode->lock);
+    found = __changelog_inode_ctx_get(this, inode, iver, version, type);
+    UNLOCK(&inode->lock);
+
+    return found;
+}
+
+/**
+ * This is the main update routine. Locking has been made granular so as to
+ * maximize parallelism of fops - I'll try to explain it below using execution
+ * timelines.
+ *
+ * Basically, the contention is between multiple execution threads of this
+ * routine and the roll-over thread. So, instead of having a big lock, we hold
+ * granular locks: inode->lock and priv->lock. Now I'll explain what happens
+ * when there is an update and a roll-over at just about the same time.
+ * NOTE:
+ *  - the dispatcher itself synchronizes updates via its own lock
+ *  - the slice version is incremented by the roll-over thread
+ *
+ * Case 1: When the rollover thread wins before the inode version can be
+ * compared with the slice version.
+ *
+ *          [updater]                 |             [rollover]
+ *                                    |
+ *                                    |           <SLICE: 1, 1, 1>
+ * <changelog_update>                 |
+ *   <changelog_inode_ctx_get>        |
+ *      <CTX: 1, 1, 1>                |
+ *                                    |         <dispatch-rollover-event>
+ *                                    |         LOCK (&priv->lock)
+ *                                    |            <SLICE_VERSION_UPDATE>
+ *                                    |              <SLICE: 2, 2, 2>
+ *                                    |         UNLOCK (&priv->lock)
+ *                                    |
+ * LOCK (&priv->lock)                 |
+ *   <INODE_VERSION_EQUALS_SLICE>     |
+ *    I: 1 <-> S: 2                   |
+ *    update: true                    |
+ * UNLOCK (&priv->lock)               |
+ *                                    |
+ * <if update == true>                |
+ *  <dispatch-update-event>           |
+ *  <INODE_VERSION_UPDATE>            |
+ *   LOCK (&inode->lock)              |
+ *    LOCK (&priv->lock)              |
+ *     <CTX: 2, 1, 1>                 |
+ *    UNLOCK (&priv->lock)            |
+ *   UNLOCK (&inode->lock)            |
+ *
+ * Therefore, the change gets recorded in the next change (no lost change). If
+ * the slice version was ahead of the inode version (say I:1, S: 2), then
+ * anyway the comparison would result in a update (I: 1, S: 3).
+ *
+ * If the rollover time is too short, then there is another contention when the
+ * updater tries to bring up inode version to the slice version (this is also
+ * the case when the roll-over thread wakes up during INODE_VERSION_UPDATE).
+ *
+ *   <CTX: 1, 1, 1>                   |       <SLICE: 2, 2, 2>
+ *                                    |
+ *                                    |
+ * <dispatch-update-event>            |
+ * <INODE_VERSION_UPDATE>             |
+ *  LOCK (&inode->lock)               |
+ *   LOCK (&priv->lock)               |
+ *    <CTX: 2, 1, 1>                  |
+ *   UNLOCK (&priv->lock)             |
+ *  UNLOCK (&inode->lock)             |
+ *                                    |         <dispatch-rollover-event>
+ *                                    |         LOCK (&priv->lock)
+ *                                    |            <SLICE_VERSION_UPDATE>
+ *                                    |              <SLICE: 3, 3, 3>
+ *                                    |         UNLOCK (&priv->lock)
+ *
+ *
+ * Case 2: When the fop thread wins
+ *
+ *          [updater]                 |             [rollover]
+ *                                    |
+ *                                    |           <SLICE: 1, 1, 1>
+ * <changelog_update>                 |
+ *   <changelog_inode_ctx_get>        |
+ *      <CTX: 0, 0, 0>                |
+ *                                    |
+ * LOCK (&priv->lock)                 |
+ *   <INODE_VERSION_EQUALS_SLICE>     |
+ *    I: 0 <-> S: 1                   |
+ *    update: true                    |
+ * UNLOCK (&priv->lock)               |
+ *                                    |         <dispatch-rollover-event>
+ *                                    |         LOCK (&priv->lock)
+ *                                    |            <SLICE_VERSION_UPDATE>
+ *                                    |              <SLICE: 2, 2, 2>
+ *                                    |         UNLOCK (&priv->lock)
+ * <if update == true>                |
+ *  <dispatch-update-event>           |
+ *  <INODE_VERSION_UPDATE>            |
+ *   LOCK (&inode->lock)              |
+ *    LOCK (&priv->lock)              |
+ *     <CTX: 2, 0, 0>                 |
+ *    UNLOCK (&priv->lock)            |
+ *   UNLOCK (&inode->lock)            |
+ *
+ * Here again, if the inode version was equal to the slice version (I: 1, S: 1)
+ * then there is no need to record an update (as the equality of the two version
+ * signifies an update was recorded in the current time slice).
+ */
+/*
+ * Records a change of kind @type for the fop described by @local.
+ *
+ * Unless the fop opted out of version checking (local->update_no_check),
+ * the inode's recorded version for @type is compared with the current
+ * slice version and the event is dispatched only when they differ. If the
+ * inode context cannot be obtained the event is dispatched unconditionally.
+ * After a successful dispatch the inode version is brought up to the slice
+ * version.
+ */
+void
+changelog_update(xlator_t *this, changelog_priv_t *priv,
+                 changelog_local_t *local, changelog_log_type type)
+{
+    changelog_time_slice_t *slice = &priv->slice;
+    changelog_log_data_t *primary = NULL;
+    changelog_log_data_t *secondary = NULL;
+    changelog_local_t *linked = NULL;
+    changelog_inode_ctx_t *ctx = NULL;
+    inode_t *inode = NULL;
+    unsigned long *iver = NULL;
+    unsigned long version = 0;
+    gf_boolean_t need_upd = _gf_true;
+    int ret = 0;
+
+    /* version checking applies only to fops that asked for it */
+    if (!local->update_no_check) {
+        inode = local->inode;
+
+        ctx = changelog_inode_ctx_get(this, inode, &iver, &version, type);
+        if (ctx) {
+            INODE_VERSION_EQUALS_SLICE(priv, version, slice, type, need_upd);
+        }
+    }
+
+    if (!need_upd)
+        return;
+
+    primary = &local->cld;
+    primary->cld_type = type;
+
+    linked = local->prev_entry;
+    if (linked != NULL) {
+        secondary = &linked->cld;
+        secondary->cld_type = type;
+    }
+
+    ret = priv->cd.dispatchfn(this, priv, priv->cd.cd_data, primary,
+                              secondary);
+
+    /**
+     * bump the inode version only once the dispatcher has
+     * successfully done its job.
+     */
+    if (!local->update_no_check && iver && !ret) {
+        INODE_VERSION_UPDATE(priv, inode, iver, slice, type);
+    }
+}
+
+/* Begin: Geo-rep snapshot dependency changes */
+
+/* changelog_color_fop_and_inc_cnt: Assign color and inc fop cnt.
+ *
+ * The color assignment and the increment of the matching fop counter are
+ * done together under priv->lock so that no window exists between them.
+ * Otherwise a fop could be colored (say black) without being counted yet;
+ * a concurrent drain of black fops could then observe a zero count and
+ * declare draining complete while that fop is still in flight.
+ *
+ * Does nothing when @priv or @local is NULL.
+ */
+
+void
+changelog_color_fop_and_inc_cnt(xlator_t *this, changelog_priv_t *priv,
+                                changelog_local_t *local)
+{
+    if (priv == NULL || local == NULL)
+        return;
+
+    LOCK(&priv->lock);
+    local->color = priv->current_color;
+    changelog_inc_fop_cnt(this, priv, local);
+    UNLOCK(&priv->lock);
+}
+
+/* Increments the respective fop counter based on the fop color.
+ *
+ * The black or white counter is bumped under its own drain mutex. A NULL
+ * @local is ignored. On a pthread lock/unlock failure the error-handling
+ * macro bails out to the end of the function.
+ */
+void
+changelog_inc_fop_cnt(xlator_t *this, changelog_priv_t *priv,
+                      changelog_local_t *local)
+{
+    int ret = 0;
+    gf_boolean_t is_black = _gf_false;
+    pthread_mutex_t *drain_mutex = NULL;
+
+    if (!local)
+        goto out;
+
+    is_black = (local->color == FOP_COLOR_BLACK) ? _gf_true : _gf_false;
+    drain_mutex = is_black ? &priv->dm.drain_black_mutex
+                           : &priv->dm.drain_white_mutex;
+
+    ret = pthread_mutex_lock(drain_mutex);
+    CHANGELOG_PTHREAD_ERROR_HANDLE_0(ret, out);
+    {
+        if (is_black)
+            priv->dm.black_fop_cnt++;
+        else
+            priv->dm.white_fop_cnt++;
+    }
+    ret = pthread_mutex_unlock(drain_mutex);
+    CHANGELOG_PTHREAD_ERROR_HANDLE_0(ret, out);
+
+out:
+    return;
+}
+
+/* Decrements the respective fop counter based on the fop color.
+ *
+ * The black or white counter is decremented under its own drain mutex.
+ * When the counter reaches zero while a drainer has flagged that it is
+ * waiting (drain_wait_black / drain_wait_white), the matching condition
+ * variable is signalled so that changelog_drain_*_fops() can proceed.
+ * A NULL @local is ignored.
+ */
+void
+changelog_dec_fop_cnt(xlator_t *this, changelog_priv_t *priv,
+                      changelog_local_t *local)
+{
+    int ret = 0;
+
+    if (local) {
+        if (local->color == FOP_COLOR_BLACK) {
+            ret = pthread_mutex_lock(&priv->dm.drain_black_mutex);
+            CHANGELOG_PTHREAD_ERROR_HANDLE_0(ret, out);
+            {
+                priv->dm.black_fop_cnt--;
+                /* last black fop gone and somebody is waiting for that */
+                if (priv->dm.black_fop_cnt == 0 &&
+                    priv->dm.drain_wait_black == _gf_true) {
+                    ret = pthread_cond_signal(&priv->dm.drain_black_cond);
+                    /* NOTE(review): the _2 variant is handed the mutex,
+                     * presumably to release it before jumping to 'out' -
+                     * confirm against the macro definition */
+                    CHANGELOG_PTHREAD_ERROR_HANDLE_2(
+                        ret, out, priv->dm.drain_black_mutex);
+                    gf_msg_debug(this->name, 0,
+                                 "Signalled "
+                                 "draining of black");
+                }
+            }
+            ret = pthread_mutex_unlock(&priv->dm.drain_black_mutex);
+            CHANGELOG_PTHREAD_ERROR_HANDLE_0(ret, out);
+        } else {
+            ret = pthread_mutex_lock(&priv->dm.drain_white_mutex);
+            CHANGELOG_PTHREAD_ERROR_HANDLE_0(ret, out);
+            {
+                priv->dm.white_fop_cnt--;
+                /* last white fop gone and somebody is waiting for that */
+                if (priv->dm.white_fop_cnt == 0 &&
+                    priv->dm.drain_wait_white == _gf_true) {
+                    ret = pthread_cond_signal(&priv->dm.drain_white_cond);
+                    CHANGELOG_PTHREAD_ERROR_HANDLE_2(
+                        ret, out, priv->dm.drain_white_mutex);
+                    gf_msg_debug(this->name, 0,
+                                 "Signalled "
+                                 "draining of white");
+                }
+            }
+            ret = pthread_mutex_unlock(&priv->dm.drain_white_mutex);
+            CHANGELOG_PTHREAD_ERROR_HANDLE_0(ret, out);
+        }
+    }
+out:
+    return;
+}
+
+/* Ask the changelog rollover thread to perform an explicit rollover of
+ * the changelog journal: under priv->cr.lock, raise priv->cr.notify and
+ * signal priv->cr.cond, on which the rollover thread waits.
+ *
+ * @buf is not used. Returns the result of pthread_cond_signal().
+ */
+int
+changelog_barrier_notify(changelog_priv_t *priv, char *buf)
+{
+    int signal_status = 0;
+
+    pthread_mutex_lock(&priv->cr.lock);
+    priv->cr.notify = _gf_true;
+    signal_status = pthread_cond_signal(&priv->cr.cond);
+    pthread_mutex_unlock(&priv->cr.lock);
+
+    return signal_status;
+}
+
+/* Clean up flags set on barrier notification.
+ *
+ * Clears bflags.barrier_ext and bn.bnotify (each under its own lock) and
+ * then, if the changelog barrier is currently enabled, disables it and
+ * dequeues every fop parked on @queue. If the bnotify mutex cannot be
+ * taken or released the function returns without touching the barrier.
+ */
+void
+changelog_barrier_cleanup(xlator_t *this, changelog_priv_t *priv,
+                          struct list_head *queue)
+{
+    int ret = 0;
+    gf_boolean_t barrier_disabled = _gf_false;
+
+    LOCK(&priv->bflags.lock);
+    {
+        priv->bflags.barrier_ext = _gf_false;
+    }
+    UNLOCK(&priv->bflags.lock);
+
+    ret = pthread_mutex_lock(&priv->bn.bnotify_mutex);
+    CHANGELOG_PTHREAD_ERROR_HANDLE_0(ret, out);
+    priv->bn.bnotify = _gf_false;
+    ret = pthread_mutex_unlock(&priv->bn.bnotify_mutex);
+    CHANGELOG_PTHREAD_ERROR_HANDLE_0(ret, out);
+
+    /* Disable changelog barrier and dequeue fops */
+    LOCK(&priv->lock);
+    {
+        if (priv->barrier_enabled == _gf_true) {
+            __chlog_barrier_disable(this, queue);
+            barrier_disabled = _gf_true;
+        }
+    }
+    UNLOCK(&priv->lock);
+
+    /* only drain the queue when this call actually lifted the barrier */
+    if (barrier_disabled)
+        chlog_barrier_dequeue_all(this, queue);
+
+out:
+    return;
+}
+/* End: Geo-Rep snapshot dependency changes */
+
+/*
+ * Builds the optional-record buffer describing the creation of the entry
+ * referred to by @loc: a MKDIR record (mode S_IFDIR | 0755) for
+ * directories, a CREATE record (mode S_IFREG | 0644) otherwise, followed
+ * by the caller's uid and gid and the <parent gfid, basename> entry.
+ *
+ * *@local is initialised by this function (via CHANGELOG_INIT_NOCHECK).
+ *
+ * Returns 0 on success and -1 on failure (parent inode not found, local or
+ * buffer allocation failure, or the path could not be duplicated). The
+ * temporary path copy and the parent inode reference are released on every
+ * path.
+ */
+int32_t
+changelog_fill_entry_buf(call_frame_t *frame, xlator_t *this, loc_t *loc,
+                         changelog_local_t **local)
+{
+    changelog_opt_t *co = NULL;
+    size_t xtra_len = 0;
+    char *dup_path = NULL;
+    char *bname = NULL;
+    inode_t *parent = NULL;
+    int32_t ret = -1;
+
+    GF_ASSERT(this);
+
+    parent = inode_parent(loc->inode, 0, 0);
+    if (!parent) {
+        gf_smsg(this->name, GF_LOG_ERROR, errno, CHANGELOG_MSG_INODE_NOT_FOUND,
+                "type=parent", "gfid=%s", uuid_utoa(loc->inode->gfid), NULL);
+        goto err;
+    }
+
+    CHANGELOG_INIT_NOCHECK(this, *local, loc->inode, loc->inode->gfid, 5);
+    if (!(*local)) {
+        gf_smsg(this->name, GF_LOG_ERROR, 0, CHANGELOG_MSG_LOCAL_INIT_FAILED,
+                NULL);
+        goto err;
+    }
+
+    co = changelog_get_usable_buffer(*local);
+    if (!co) {
+        gf_smsg(this->name, GF_LOG_ERROR, 0, CHANGELOG_MSG_GET_BUFFER_FAILED,
+                NULL);
+        goto err;
+    }
+
+    if (loc->inode->ia_type == IA_IFDIR) {
+        CHANGLOG_FILL_FOP_NUMBER(co, GF_FOP_MKDIR, fop_fn, xtra_len);
+        co++;
+        CHANGELOG_FILL_UINT32(co, S_IFDIR | 0755, number_fn, xtra_len);
+        co++;
+    } else {
+        CHANGLOG_FILL_FOP_NUMBER(co, GF_FOP_CREATE, fop_fn, xtra_len);
+        co++;
+        CHANGELOG_FILL_UINT32(co, S_IFREG | 0644, number_fn, xtra_len);
+        co++;
+    }
+
+    CHANGELOG_FILL_UINT32(co, frame->root->uid, number_fn, xtra_len);
+    co++;
+
+    CHANGELOG_FILL_UINT32(co, frame->root->gid, number_fn, xtra_len);
+    co++;
+
+    /* basename() may modify its argument, so work on a private copy; a
+     * failed copy must not be handed to basename() */
+    dup_path = gf_strdup(loc->path);
+    if (!dup_path)
+        goto err;
+    bname = basename(dup_path);
+
+    CHANGELOG_FILL_ENTRY(co, parent->gfid, bname, entry_fn, entry_free_fn,
+                         xtra_len, err);
+    changelog_set_usable_record_and_length(*local, xtra_len, 5);
+
+    ret = 0;
+
+    /* success falls through: the cleanup below is common to both outcomes */
+err:
+    if (dup_path)
+        GF_FREE(dup_path);
+    if (parent)
+        inode_unref(parent);
+    return ret;
+}
+
+/*
+ * resolve_pargfid_to_path:
+ *      Converts the given pargfid to a path by doing recursive readlinks of
+ * the gfid handles at the backend. If bname is given, it is appended to the
+ * resolved directory path. If bname is NULL and pargfid is ROOT, the path
+ * is ".".
+ *      Returns 0 on success, with *path holding an allocated string that the
+ * caller must free. Returns -1 on failure: bad arguments, an unreadable or
+ * malformed handle link, a path that does not fit PATH_MAX, or no memory.
+ */
+
+int
+resolve_pargfid_to_path(xlator_t *this, const uuid_t pgfid, char **path,
+                        char *bname)
+{
+    char *linkname = NULL;
+    char *dir_handle = NULL;
+    char *pgfidstr = NULL;
+    char *saveptr = NULL;
+    char *dir_name = NULL;
+    ssize_t len = 0;
+    int ret = -1;
+    uuid_t tmp_gfid = {0};
+    uuid_t pargfid = {0};
+    changelog_priv_t *priv = NULL;
+    char gpath[PATH_MAX] = {0};
+    char result[PATH_MAX] = {0};
+    char pre_dir_name[PATH_MAX] = {0};
+    /* handle links are expected to look like "../../xx/xx/<pgfid>/<name>" */
+    const size_t prefix_len = strlen("../../00/00/");
+
+    GF_ASSERT(this);
+    priv = this->private;
+    GF_ASSERT(priv);
+
+    gf_uuid_copy(pargfid, pgfid);
+    if (!path || gf_uuid_is_null(pargfid))
+        goto out;
+
+    if (__is_root_gfid(pargfid)) {
+        /* nothing to resolve: the path is bname itself, or "." without it */
+        *path = gf_strdup(bname ? bname : ".");
+        if (*path)
+            ret = 0;
+        goto out;
+    }
+
+    dir_handle = alloca(PATH_MAX);
+    linkname = alloca(PATH_MAX);
+    (void)snprintf(gpath, PATH_MAX, "%s/.glusterfs/", priv->changelog_brick);
+
+    /* walk towards the root, prepending one directory name per iteration */
+    while (!(__is_root_gfid(pargfid))) {
+        len = snprintf(dir_handle, PATH_MAX, "%s/%02x/%02x/%s", gpath,
+                       pargfid[0], pargfid[1], uuid_utoa(pargfid));
+        if ((len < 0) || (len >= PATH_MAX))
+            goto out;
+
+        /* readlink() does not NUL-terminate: keep one byte for that */
+        len = sys_readlink(dir_handle, linkname, PATH_MAX - 1);
+        if (len < 0) {
+            gf_smsg(this->name, GF_LOG_ERROR, errno,
+                    CHANGELOG_MSG_READLINK_OP_FAILED,
+                    "could not read the "
+                    "link from the gfid handle",
+                    "handle=%s", dir_handle, NULL);
+            goto out;
+        }
+        linkname[len] = '\0';
+
+        /* a target shorter than the prefix cannot be tokenized safely */
+        if ((size_t)len <= prefix_len)
+            goto out;
+
+        pgfidstr = strtok_r(linkname + prefix_len, "/", &saveptr);
+        dir_name = strtok_r(NULL, "/", &saveptr);
+        if (!pgfidstr || !dir_name)
+            goto out;
+
+        len = snprintf(result, PATH_MAX, "%s/%s", dir_name, pre_dir_name);
+        if ((len < 0) || (len >= PATH_MAX))
+            goto out;
+        if (snprintf(pre_dir_name, len + 1, "%s", result) >= len + 1)
+            goto out;
+
+        /* stop instead of looping on a stale gfid if the parent is bogus */
+        if (gf_uuid_parse(pgfidstr, tmp_gfid))
+            goto out;
+        gf_uuid_copy(pargfid, tmp_gfid);
+    }
+
+    if (bname) {
+        /* refuse to overflow result[] rather than truncate the path */
+        if (strlen(result) + strlen(bname) >= sizeof(result))
+            goto out;
+        strcat(result, bname);
+    }
+
+    *path = gf_strdup(result);
+    if (*path)
+        ret = 0;
+
+out:
+    return ret;
+}
diff --git a/xlators/features/changelog/src/changelog-helpers.h b/xlators/features/changelog/src/changelog-helpers.h
new file mode 100644
index 00000000000..38fa7590c32
--- /dev/null
+++ b/xlators/features/changelog/src/changelog-helpers.h
@@ -0,0 +1,716 @@
+/*
+   Copyright (c) 2013 Red Hat, Inc. <http://www.redhat.com>
+   This file is part of GlusterFS.
+
+   This file is licensed to you under your choice of the GNU Lesser
+   General Public License, version 3 or any later version (LGPLv3 or
+   later), or the GNU General Public License, version 2 (GPLv2), in all
+   cases as published by the Free Software Foundation.
+*/
+
+#ifndef _CHANGELOG_HELPERS_H
+#define _CHANGELOG_HELPERS_H
+
+#include <glusterfs/locking.h>
+#include <glusterfs/timer.h>
+#include "pthread.h"
+#include <glusterfs/iobuf.h>
+#include <glusterfs/rot-buffs.h>
+
+#include "changelog-misc.h"
+#include <glusterfs/call-stub.h>
+
+#include "rpcsvc.h"
+#include "changelog-ev-handle.h"
+
+#include "changelog.h"
+#include "changelog-messages.h"
+
+/**
+ * the changelog entry
+ */
+typedef struct changelog_log_data {
+    /* rollover related */
+    time_t cld_roll_time;
+
+    /* reopen changelog? */
+    gf_boolean_t cld_finale;
+
+    changelog_log_type cld_type;
+
+    /**
+     * since gfid is _always_ a necessity, it's not a part
+     * of the iobuf. by doing this we do not add any overhead
+     * for data and metadata related fops.
+     */
+    uuid_t cld_gfid;
+
+    /**
+     * iobufs are used for optional records: pargfid, path,
+     * write offsets etc.. It's the fop implementer's job
+     * to allocate (iobuf_get() in the fop) and get unref'ed
+     * in the callback (CHANGELOG_STACK_UNWIND).
+     */
+    struct iobuf *cld_iobuf;
+
+#define cld_ptr cld_iobuf->ptr
+
+    /**
+     * after allocation you can point this to the length of
+     * usable data, but make sure it does not exceed the
+     * size of the requested iobuf.
+     */
+    size_t cld_iobuf_len;
+
+#define cld_ptr_len cld_iobuf_len
+
+    /**
+     * number of optional records
+     */
+    int cld_xtra_records;
+} changelog_log_data_t;
+
+/**
+ * holder for dispatch function and private data
+ */
+
+typedef struct changelog_priv changelog_priv_t;
+
+typedef struct changelog_dispatcher {
+    void *cd_data;
+    int (*dispatchfn)(xlator_t *, changelog_priv_t *, void *,
+                      changelog_log_data_t *, changelog_log_data_t *);
+} changelog_dispatcher_t;
+
+struct changelog_bootstrap {
+    changelog_mode_t mode;
+    int (*ctor)(xlator_t *, changelog_dispatcher_t *);
+    int (*dtor)(xlator_t *, changelog_dispatcher_t *);
+};
+
+struct changelog_encoder {
+    changelog_encoder_t encoder;
+    int (*encode)(xlator_t *, changelog_log_data_t *);
+};
+
+/* xlator private */
+
+typedef struct changelog_time_slice {
+    /**
+     * version of changelog file, incremented each time changes
+     * rollover.
+     */
+    unsigned long changelog_version[CHANGELOG_MAX_TYPE];
+} changelog_time_slice_t;
+
+typedef struct changelog_rollover {
+    /* rollover thread */
+    pthread_t rollover_th;
+
+    xlator_t *this;
+
+    pthread_mutex_t lock;
+    pthread_cond_t cond;
+    gf_boolean_t notify;
+} changelog_rollover_t;
+
+typedef struct changelog_fsync {
+    /* fsync() thread */
+    pthread_t fsync_th;
+
+    xlator_t *this;
+} changelog_fsync_t;
+
+/* Draining during changelog rollover (for geo-rep snapshot dependency):
+ * --------------------------------------------------------------------
+ * The introduction of draining of in-transit fops during changelog rollover
+ * (both explicit/timeout triggered) requires coloring of fops. Basically the
+ * implementation requires two counters, one counter which keeps the count of
+ * current intransit fops which should end up in current changelog and the other
+ * counter to keep track of incoming fops which should be drained as part of
+ * next changelog rollover event. The fops are colored w.r.t these counters.
+ * The fops that are to be drained as part of the current changelog rollover
+ * are given one color; the fops that keep arriving meanwhile, which need not
+ * end up in the current changelog and are to be drained as part of the next
+ * changelog rollover, are given the other color. The colors are switched
+ * with each changelog rollover. Two colors (black and white) are used here
+ * and black is the initial default.
+ */
+
+typedef enum chlog_fop_color {
+    FOP_COLOR_BLACK,
+    FOP_COLOR_WHITE
+} chlog_fop_color_t;
+
+/* Barrier notify variable */
+typedef struct barrier_notify {
+    pthread_mutex_t bnotify_mutex;
+    pthread_cond_t bnotify_cond;
+    gf_boolean_t bnotify;
+    gf_boolean_t bnotify_error;
+} barrier_notify_t;
+
+/* Two separate mutex and conditional variable set is used
+ * to drain white and black fops. */
+
+typedef struct drain_mgmt {
+    pthread_mutex_t drain_black_mutex;
+    pthread_cond_t drain_black_cond;
+    pthread_mutex_t drain_white_mutex;
+    pthread_cond_t drain_white_cond;
+    /* Represents black fops count in-transit */
+    unsigned long black_fop_cnt;
+    /* Represents white fops count in-transit */
+    unsigned long white_fop_cnt;
+    gf_boolean_t drain_wait_black;
+    gf_boolean_t drain_wait_white;
+} drain_mgmt_t;
+
+/* External barrier as a result of snap on/off indicating flag*/
+typedef struct barrier_flags {
+    gf_lock_t lock;
+    gf_boolean_t barrier_ext;
+} barrier_flags_t;
+
+/* Event selection */
+typedef struct changelog_ev_selector {
+    gf_lock_t reflock;
+
+    /**
+     * Array of references for each selection bit.
+     */
+    unsigned int ref[CHANGELOG_EV_SELECTION_RANGE];
+} changelog_ev_selector_t;
+
+/* changelog's private structure */
+struct changelog_priv {
+    /* changelog journalling */
+    gf_boolean_t active;
+
+    /* changelog live notifications */
+    gf_boolean_t rpc_active;
+
+    /* to generate unique socket file per brick */
+    char *changelog_brick;
+
+    /* logging directory */
+    char *changelog_dir;
+
+    /* htime directory */
+    char *htime_dir;
+
+    /* one file for all changelog types */
+    int changelog_fd;
+
+    /* htime fd for current changelog session */
+    int htime_fd;
+
+    /*  c_snap_fd is fd for call-path changelog */
+    int c_snap_fd;
+
+    /* rollover_count used by htime */
+    int rollover_count;
+
+    gf_lock_t lock;
+
+    /*  lock to synchronize CSNAP updates */
+    gf_lock_t c_snap_lock;
+
+    /* write end of the pipe */
+    int wfd;
+
+    /* rollover time */
+    int32_t rollover_time;
+
+    /* fsync() interval */
+    int32_t fsync_interval;
+
+    /* changelog type maps */
+    const char *maps[CHANGELOG_MAX_TYPE];
+
+    /* time slicer */
+    changelog_time_slice_t slice;
+
+    /* context of the updater */
+    changelog_dispatcher_t cd;
+
+    /* context of the rollover thread */
+    changelog_rollover_t cr;
+
+    /* context of fsync thread */
+    changelog_fsync_t cf;
+
+    /* operation mode */
+    changelog_mode_t op_mode;
+
+    /* bootstrap routine for 'current' logger */
+    struct changelog_bootstrap *cb;
+
+    /* encoder mode */
+    changelog_encoder_t encode_mode;
+
+    /* encoder */
+    struct changelog_encoder *ce;
+
+    /**
+     * snapshot dependency changes
+     */
+
+    /* Draining of fops */
+    drain_mgmt_t dm;
+
+    /* Represents the active color. Initially by default black */
+    chlog_fop_color_t current_color;
+
+    /* flag to determine explicit rollover is triggered */
+    gf_boolean_t explicit_rollover;
+
+    /* barrier notification variable protected by mutex */
+    barrier_notify_t bn;
+
+    /* barrier on/off indicating flags */
+    barrier_flags_t bflags;
+
+    /* changelog barrier on/off indicating flag */
+    gf_boolean_t barrier_enabled;
+    struct list_head queue;
+    uint32_t queue_size;
+    gf_timer_t *timer;
+    struct timespec timeout;
+
+    /**
+     * buffers, RPC, event selection, notifications and other
+     * beasts.
+     */
+
+    /* epoll pthread */
+    pthread_t poller;
+
+    /* rotational buffer */
+    rbuf_t *rbuf;
+
+    /* changelog RPC server */
+    rpcsvc_t *rpc;
+
+    /* event selection */
+    changelog_ev_selector_t ev_selection;
+
+    /* client handling (reverse connection) */
+    pthread_t connector;
+
+    int nr_dispatchers;
+    pthread_t *ev_dispatcher;
+
+    changelog_clnt_t connections;
+
+    /* glusterfind dependency to capture paths on deleted entries */
+    gf_boolean_t capture_del_path;
+
+    /* Save total no. of listeners */
+    gf_atomic_t listnercnt;
+
+    /* Save total no. of xprts associated with the listener */
+    gf_atomic_t xprtcnt;
+
+    /* Save xprt list */
+    struct list_head xprt_list;
+
+    /* Save total no. of client connection */
+    gf_atomic_t clntcnt;
+
+    /* Save cleanup brick in victim */
+    xlator_t *victim;
+
+    /* Status to save cleanup notify status */
+    gf_boolean_t notify_down;
+};
+
+struct changelog_local {
+    inode_t *inode;
+    gf_boolean_t update_no_check;
+
+    changelog_log_data_t cld;
+
+    /**
+     * ->prev_entry is used in cases when there needs to be
+     * additional changelog entry for the parent (eg. rename)
+     * It's analogous to ->next in single linked list world,
+     * but we call it as ->prev_entry... ha ha ha
+     */
+    struct changelog_local *prev_entry;
+
+    /* snap dependency changes */
+    chlog_fop_color_t color;
+};
+
+typedef struct changelog_local changelog_local_t;
+
+/* inode version is stored in inode ctx */
+typedef struct changelog_inode_ctx {
+    unsigned long iversion[CHANGELOG_MAX_TYPE];
+} changelog_inode_ctx_t;
+/* address of the version slot for @type; arguments are parenthesized */
+#define CHANGELOG_INODE_VERSION_TYPE(ctx, type) (&((ctx)->iversion[(type)]))
+
+/**
+ * Optional Records:
+ *  fops that need to save additional information request a array of
+ *  @changelog_opt_t struct. The array is allocated via @iobufs.
+ */
+typedef enum {
+    CHANGELOG_OPT_REC_FOP,
+    CHANGELOG_OPT_REC_ENTRY,
+    CHANGELOG_OPT_REC_UINT32,
+} changelog_optional_rec_type_t;
+
+struct changelog_entry_fields {
+    uuid_t cef_uuid;
+    char *cef_bname;
+    char *cef_path;
+};
+
+typedef struct {
+    /**
+     * @co_convert can be used to do post-processing of the record before
+     * it's persisted to the CHANGELOG. If this is NULL, then the record
+     * is persisted in its in-memory format.
+     */
+    size_t (*co_convert)(void *data, char *buffer, gf_boolean_t encode);
+
+    /* release routines */
+    void (*co_free)(void *data);
+
+    /* type of the field */
+    changelog_optional_rec_type_t co_type;
+
+    /**
+     * size of the 'valid' field in the union. This field is not used if
+     * @co_convert is specified.
+     */
+    size_t co_len;
+
+    union {
+        unsigned int co_uint32;
+        glusterfs_fop_t co_fop;
+        struct changelog_entry_fields co_entry;
+    };
+} changelog_opt_t;
+
+#define CHANGELOG_OPT_RECORD_LEN sizeof(changelog_opt_t)
+
+/**
+ * helpers routines
+ */
+
+int
+changelog_thread_cleanup(xlator_t *this, pthread_t thr_id);
+
+void *
+changelog_get_usable_buffer(changelog_local_t *local);
+
+void
+changelog_set_usable_record_and_length(changelog_local_t *local, size_t len,
+                                       int xr);
+void
+changelog_local_cleanup(xlator_t *xl, changelog_local_t *local);
+changelog_local_t *
+changelog_local_init(xlator_t *this, inode_t *inode, uuid_t gfid,
+                     int xtra_records, gf_boolean_t update_flag);
+int
+changelog_start_next_change(xlator_t *this, changelog_priv_t *priv, time_t ts,
+                            gf_boolean_t finale);
+int
+changelog_open_journal(xlator_t *this, changelog_priv_t *priv);
+void
+changelog_fill_rollover_data(changelog_log_data_t *cld, gf_boolean_t is_last);
+int
+changelog_inject_single_event(xlator_t *this, changelog_priv_t *priv,
+                              changelog_log_data_t *cld);
+size_t
+changelog_entry_length(void); /* (void): takes no arguments */
+int
+changelog_write(int fd, char *buffer, size_t len);
+int
+changelog_write_change(changelog_priv_t *priv, char *buffer, size_t len);
+int
+changelog_handle_change(xlator_t *this, changelog_priv_t *priv,
+                        changelog_log_data_t *cld);
+void
+changelog_update(xlator_t *this, changelog_priv_t *priv,
+                 changelog_local_t *local, changelog_log_type type);
+void *
+changelog_rollover(void *data);
+void *
+changelog_fsync_thread(void *data);
+int
+changelog_forget(xlator_t *this, inode_t *inode);
+int
+htime_update(xlator_t *this, changelog_priv_t *priv, time_t ts, char *buffer);
+int
+htime_open(xlator_t *this, changelog_priv_t *priv, time_t ts);
+int
+htime_create(xlator_t *this, changelog_priv_t *priv, time_t ts);
+
+/* Geo-Rep snapshot dependency changes */
+void
+changelog_color_fop_and_inc_cnt(xlator_t *this, changelog_priv_t *priv,
+                                changelog_local_t *local);
+void
+changelog_inc_fop_cnt(xlator_t *this, changelog_priv_t *priv,
+                      changelog_local_t *local);
+void
+changelog_dec_fop_cnt(xlator_t *this, changelog_priv_t *priv,
+                      changelog_local_t *local);
+int
+changelog_barrier_notify(changelog_priv_t *priv, char *buf);
+void
+changelog_barrier_cleanup(xlator_t *this, changelog_priv_t *priv,
+                          struct list_head *queue);
+void
+changelog_drain_white_fops(xlator_t *this, changelog_priv_t *priv);
+void
+changelog_drain_black_fops(xlator_t *this, changelog_priv_t *priv);
+
+/* Crash consistency of changelog wrt snapshot */
+int
+changelog_snap_logging_stop(xlator_t *this, changelog_priv_t *priv);
+int
+changelog_snap_logging_start(xlator_t *this, changelog_priv_t *priv);
+int
+changelog_snap_open(xlator_t *this, changelog_priv_t *priv);
+int
+changelog_snap_handle_ascii_change(xlator_t *this, changelog_log_data_t *cld);
+int
+changelog_snap_write_change(changelog_priv_t *priv, char *buffer, size_t len);
+
+/* Changelog barrier routines */
+void
+__chlog_barrier_enqueue(xlator_t *this, call_stub_t *stub);
+void
+__chlog_barrier_disable(xlator_t *this, struct list_head *queue);
+void
+chlog_barrier_dequeue_all(xlator_t *this, struct list_head *queue);
+call_stub_t *
+__chlog_barrier_dequeue(xlator_t *this, struct list_head *queue);
+int
+__chlog_barrier_enable(xlator_t *this, changelog_priv_t *priv);
+
+int32_t
+changelog_fill_entry_buf(call_frame_t *frame, xlator_t *this, loc_t *loc,
+                         changelog_local_t **local);
+
+/* event selection routines */
+void
+changelog_select_event(xlator_t *, changelog_ev_selector_t *, unsigned int);
+void
+changelog_deselect_event(xlator_t *, changelog_ev_selector_t *, unsigned int);
+int
+changelog_init_event_selection(xlator_t *, changelog_ev_selector_t *);
+int
+changelog_ev_selected(xlator_t *, changelog_ev_selector_t *, unsigned int);
+void
+changelog_dispatch_event(xlator_t *, changelog_priv_t *, changelog_event_t *);
+
+changelog_inode_ctx_t *
+__changelog_inode_ctx_get(xlator_t *, inode_t *, unsigned long **,
+                          unsigned long *, changelog_log_type);
+int
+resolve_pargfid_to_path(xlator_t *this, const uuid_t gfid, char **path,
+                        char *bname);
+
+/* macros */
+
+#define CHANGELOG_STACK_UNWIND(fop, frame, params...)                          \
+    do {                                                                       \
+        changelog_local_t *__local = NULL;                                     \
+        xlator_t *__xl = NULL;                                                 \
+        if (frame) {                                                           \
+            __local = frame->local;                                            \
+            __xl = frame->this;                                                \
+            frame->local = NULL;                                               \
+        }                                                                      \
+        STACK_UNWIND_STRICT(fop, frame, params);                               \
+        if (__local && __local->prev_entry)                                    \
+            changelog_local_cleanup(__xl, __local->prev_entry);                \
+        changelog_local_cleanup(__xl, __local);                                \
+    } while (0)
+
+#define CHANGELOG_IOBUF_REF(iobuf)                                             \
+    do {                                                                       \
+        if (iobuf)                                                             \
+            iobuf_ref(iobuf);                                                  \
+    } while (0)
+
+#define CHANGELOG_IOBUF_UNREF(iobuf)                                           \
+    do {                                                                       \
+        if (iobuf)                                                             \
+            iobuf_unref(iobuf);                                                \
+    } while (0)
+
+#define CHANGELOG_FILL_BUFFER(buffer, off, val, len)                           \
+    do {                                                                       \
+        memcpy(buffer + off, val, len);                                        \
+        off += len;                                                            \
+    } while (0)
+
+#define SLICE_VERSION_UPDATE(slice)                                            \
+    do {                                                                       \
+        int i = 0;                                                             \
+        for (; i < CHANGELOG_MAX_TYPE; i++) {                                  \
+            slice->changelog_version[i]++;                                     \
+        }                                                                      \
+    } while (0)
+
+#define CHANGELOG_FILL_UINT32(co, number, converter, xlen)                     \
+    do {                                                                       \
+        co->co_convert = converter;                                            \
+        co->co_free = NULL;                                                    \
+        co->co_type = CHANGELOG_OPT_REC_UINT32;                                \
+        co->co_uint32 = number;                                                \
+        xlen += sizeof(unsigned int);                                          \
+    } while (0)
+
+#define CHANGLOG_FILL_FOP_NUMBER(co, fop, converter, xlen)                     \
+    do {                                                                       \
+        co->co_convert = converter;                                            \
+        co->co_free = NULL;                                                    \
+        co->co_type = CHANGELOG_OPT_REC_FOP;                                   \
+        co->co_fop = fop;                                                      \
+        xlen += sizeof(fop);                                                   \
+    } while (0)
+
+#define CHANGELOG_FILL_ENTRY(co, pargfid, bname, converter, freefn, xlen,      \
+                             label)                                            \
+    do {                                                                       \
+        co->co_convert = converter;                                            \
+        co->co_free = freefn;                                                  \
+        co->co_type = CHANGELOG_OPT_REC_ENTRY;                                 \
+        gf_uuid_copy(co->co_entry.cef_uuid, pargfid);                          \
+        co->co_entry.cef_bname = gf_strdup(bname);                             \
+        if (!co->co_entry.cef_bname)                                           \
+            goto label;                                                        \
+        xlen += (UUID_CANONICAL_FORM_LEN + strlen(bname));                     \
+    } while (0)
+
+#define CHANGELOG_FILL_ENTRY_DIR_PATH(co, pargfid, bname, converter,           \
+                                      del_freefn, xlen, label, capture_del)    \
+    do {                                                                       \
+        co->co_convert = converter;                                            \
+        co->co_free = del_freefn;                                              \
+        co->co_type = CHANGELOG_OPT_REC_ENTRY;                                 \
+        gf_uuid_copy(co->co_entry.cef_uuid, pargfid);                          \
+        co->co_entry.cef_bname = gf_strdup(bname);                             \
+        if (!co->co_entry.cef_bname)                                           \
+            goto label;                                                        \
+        xlen += (UUID_CANONICAL_FORM_LEN + strlen(bname));                     \
+        if (!capture_del ||                                                    \
+            resolve_pargfid_to_path(this, pargfid, &(co->co_entry.cef_path),   \
+                                    co->co_entry.cef_bname)) {                 \
+            co->co_entry.cef_path = gf_strdup("\0");                           \
+            xlen += 1;                                                         \
+        } else {                                                               \
+            xlen += (strlen(co->co_entry.cef_path));                           \
+        }                                                                      \
+    } while (0)
+
+#define CHANGELOG_INIT(this, local, inode, gfid, xrec)                         \
+    local = changelog_local_init(this, inode, gfid, xrec, _gf_false)
+
+#define CHANGELOG_INIT_NOCHECK(this, local, inode, gfid, xrec)                 \
+    local = changelog_local_init(this, inode, gfid, xrec, _gf_true)
+
+#define CHANGELOG_NOT_ACTIVE_THEN_GOTO(frame, priv, label)                     \
+    do {                                                                       \
+        if (!priv->active)                                                     \
+            goto label;                                                        \
+        /* ignore rebalance process's activity. */                             \
+        if ((frame->root->pid == GF_CLIENT_PID_DEFRAG) ||                      \
+            (frame->root->pid == GF_CLIENT_PID_TIER_DEFRAG))                   \
+            goto label;                                                        \
+    } while (0)
+
+/* If it is a METADATA entry and fop num being GF_FOP_NULL, don't
+ * log in the changelog as it is of no use. And also if it is
+ * logged, since slicing version checking is done for metadata
+ * entries, the subsequent entries with valid fop num which falls
+ * to same changelog will be missed. Hence check for boundary
+ * condition.
+ */
+#define CHANGELOG_OP_BOUNDARY_CHECK(frame, label)                              \
+    do {                                                                       \
+        if (frame->root->op <= GF_FOP_NULL ||                                  \
+            frame->root->op >= GF_FOP_MAXVALUE)                                \
+            goto label;                                                        \
+    } while (0)
+
+/**
+ * ignore internal fops for all clients except AFR self-heal daemon
+ */
+#define CHANGELOG_IF_INTERNAL_FOP_THEN_GOTO(frame, dict, label)                \
+    do {                                                                       \
+        if ((frame->root->pid != GF_CLIENT_PID_SELF_HEALD) && dict &&          \
+            dict_get(dict, GLUSTERFS_INTERNAL_FOP_KEY))                        \
+            goto label;                                                        \
+    } while (0)
+
+#define CHANGELOG_COND_GOTO(priv, cond, label)                                 \
+    do {                                                                       \
+        if (!priv->active || cond)                                             \
+            goto label;                                                        \
+    } while (0)
+
+/* Begin: Geo-Rep snapshot dependency changes */
+
+#define DICT_ERROR -1
+#define BARRIER_OFF 0
+#define BARRIER_ON 1
+#define DICT_DEFAULT 2
+
+#define CHANGELOG_NOT_ON_THEN_GOTO(priv, ret, label)                           \
+    do {                                                                       \
+        if (!priv->active) {                                                   \
+            gf_smsg(this->name, GF_LOG_WARNING, 0,                             \
+                    CHANGELOG_MSG_CHANGELOG_NOT_ACTIVE, NULL);                 \
+            ret = 0;                                                           \
+            goto label;                                                        \
+        }                                                                      \
+    } while (0)
+
+/* Log pthread error and goto label */
+#define CHANGELOG_PTHREAD_ERROR_HANDLE_0(ret, label)                           \
+    do {                                                                       \
+        if (ret) {                                                             \
+            gf_smsg(this->name, GF_LOG_ERROR, 0, CHANGELOG_MSG_PTHREAD_ERROR,  \
+                    "error=%d", ret, NULL);                                    \
+            ret = -1;                                                          \
+            goto label;                                                        \
+        }                                                                      \
+    } while (0);
+
+/* Log pthread error, set ret to -1, set flag and goto label (uses 'this') */
+#define CHANGELOG_PTHREAD_ERROR_HANDLE_1(ret, label, flag)                     \
+    do {                                                                       \
+        if (ret) {                                                             \
+            gf_smsg(this->name, GF_LOG_ERROR, 0, CHANGELOG_MSG_PTHREAD_ERROR,  \
+                    "error=%d", ret, NULL);                                    \
+            ret = -1;                                                          \
+            flag = _gf_true;                                                   \
+            goto label;                                                        \
+        }                                                                      \
+    } while (0)
+
+/* Log pthread error, set ret to -1, unlock mutex, goto label (uses 'this') */
+#define CHANGELOG_PTHREAD_ERROR_HANDLE_2(ret, label, mutex)                    \
+    do {                                                                       \
+        if (ret) {                                                             \
+            gf_smsg(this->name, GF_LOG_ERROR, 0, CHANGELOG_MSG_PTHREAD_ERROR,  \
+                    "error=%d", ret, NULL);                                    \
+            ret = -1;                                                          \
+            pthread_mutex_unlock(&mutex);                                      \
+            goto label;                                                        \
+        }                                                                      \
+    } while (0)
+
+/* End: Geo-Rep snapshot dependency changes */
+
+#endif /* _CHANGELOG_HELPERS_H */
diff --git a/xlators/features/changelog/src/changelog-mem-types.h b/xlators/features/changelog/src/changelog-mem-types.h
new file mode 100644
index 00000000000..a2d8a9cbe93
--- /dev/null
+++ b/xlators/features/changelog/src/changelog-mem-types.h
@@ -0,0 +1,34 @@
+/*
+   Copyright (c) 2013 Red Hat, Inc. <http://www.redhat.com>
+   This file is part of GlusterFS.
+
+   This file is licensed to you under your choice of the GNU Lesser
+   General Public License, version 3 or any later version (LGPLv3 or
+   later), or the GNU General Public License, version 2 (GPLv2), in all
+   cases as published by the Free Software Foundation.
+*/
+
+#ifndef _CHANGELOG_MEM_TYPES_H
+#define _CHANGELOG_MEM_TYPES_H
+
+#include <glusterfs/mem-types.h>
+
+enum gf_changelog_mem_types {
+    gf_changelog_mt_priv_t = gf_common_mt_end + 1,
+    gf_changelog_mt_str_t = gf_common_mt_end + 2,
+    gf_changelog_mt_batch_t = gf_common_mt_end + 3,
+    gf_changelog_mt_rt_t = gf_common_mt_end + 4,
+    gf_changelog_mt_inode_ctx_t = gf_common_mt_end + 5,
+    gf_changelog_mt_rpc_clnt_t = gf_common_mt_end + 6,
+    gf_changelog_mt_libgfchangelog_t = gf_common_mt_end + 7,
+    gf_changelog_mt_libgfchangelog_entry_t = gf_common_mt_end + 8,
+    gf_changelog_mt_libgfchangelog_rl_t = gf_common_mt_end + 9,
+    gf_changelog_mt_changelog_buffer_t = gf_common_mt_end + 10,
+    gf_changelog_mt_history_data_t = gf_common_mt_end + 11,
+    gf_changelog_mt_libgfchangelog_call_pool_t = gf_common_mt_end + 12,
+    gf_changelog_mt_libgfchangelog_event_t = gf_common_mt_end + 13,
+    gf_changelog_mt_ev_dispatcher_t = gf_common_mt_end + 14,
+    gf_changelog_mt_end /* sentinel: one past the last type listed above */
+};
+
+#endif
diff --git a/xlators/features/changelog/src/changelog-messages.h b/xlators/features/changelog/src/changelog-messages.h
new file mode 100644
index 00000000000..cb0e16c85d8
--- /dev/null
+++ b/xlators/features/changelog/src/changelog-messages.h
@@ -0,0 +1,172 @@
+/*
+ Copyright (c) 2015 Red Hat, Inc. <http://www.redhat.com>
+ This file is part of GlusterFS.
+
+ This file is licensed to you under your choice of the GNU Lesser
+ General Public License, version 3 or any later version (LGPLv3 or
+ later), or the GNU General Public License, version 2 (GPLv2), in all
+ cases as published by the Free Software Foundation.
+ */
+
+#ifndef _CHANGELOG_MESSAGES_H_
+#define _CHANGELOG_MESSAGES_H_
+
+#include <glusterfs/glfs-message-id.h>
+
+/* To add new message IDs, append new identifiers at the end of the list.
+ *
+ * Never remove a message ID. If it's not used anymore, you can rename it or
+ * leave it as it is, but do not delete it. This prevents the reuse of
+ * IDs by other messages.
+ *
+ * The component name must match one of the entries defined in
+ * glfs-message-id.h.
+ */
+
+GLFS_MSGID(
+    CHANGELOG, CHANGELOG_MSG_OPEN_FAILED, CHANGELOG_MSG_BARRIER_FOP_FAILED,
+    CHANGELOG_MSG_VOL_MISCONFIGURED, CHANGELOG_MSG_RENAME_ERROR,
+    CHANGELOG_MSG_READ_ERROR, CHANGELOG_MSG_HTIME_ERROR,
+    CHANGELOG_MSG_PTHREAD_MUTEX_INIT_FAILED,
+    CHANGELOG_MSG_PTHREAD_COND_INIT_FAILED, CHANGELOG_MSG_CHILD_MISCONFIGURED,
+    CHANGELOG_MSG_DIR_OPTIONS_NOT_SET, CHANGELOG_MSG_CLOSE_ERROR,
+    CHANGELOG_MSG_PIPE_CREATION_ERROR, CHANGELOG_MSG_DICT_GET_FAILED,
+    CHANGELOG_MSG_BARRIER_INFO, CHANGELOG_MSG_BARRIER_ERROR,
+    CHANGELOG_MSG_GET_TIME_OP_FAILED, CHANGELOG_MSG_WRITE_FAILED,
+    CHANGELOG_MSG_PTHREAD_ERROR, CHANGELOG_MSG_INODE_NOT_FOUND,
+    CHANGELOG_MSG_FSYNC_OP_FAILED, CHANGELOG_MSG_TOTAL_LOG_INFO,
+    CHANGELOG_MSG_SNAP_INFO, CHANGELOG_MSG_SELECT_FAILED,
+    CHANGELOG_MSG_FCNTL_FAILED, CHANGELOG_MSG_BNOTIFY_INFO,
+    CHANGELOG_MSG_ENTRY_BUF_INFO, CHANGELOG_MSG_CHANGELOG_NOT_ACTIVE,
+    CHANGELOG_MSG_LOCAL_INIT_FAILED, CHANGELOG_MSG_NOTIFY_REGISTER_FAILED,
+    CHANGELOG_MSG_PROGRAM_NAME_REG_FAILED, CHANGELOG_MSG_HANDLE_PROBE_ERROR,
+    CHANGELOG_MSG_SET_FD_CONTEXT, CHANGELOG_MSG_FREEUP_FAILED,
+    CHANGELOG_MSG_RECONFIGURE, CHANGELOG_MSG_RPC_SUBMIT_REPLY_FAILED,
+    CHANGELOG_MSG_RPC_BUILD_ERROR, CHANGELOG_MSG_RPC_CONNECT_ERROR,
+    CHANGELOG_MSG_RPC_START_ERROR, CHANGELOG_MSG_BUFFER_STARVATION_ERROR,
+    CHANGELOG_MSG_SCAN_DIR_FAILED, CHANGELOG_MSG_FSETXATTR_FAILED,
+    CHANGELOG_MSG_FGETXATTR_FAILED, CHANGELOG_MSG_CLEANUP_ON_ACTIVE_REF,
+    CHANGELOG_MSG_DISPATCH_EVENT_FAILED, CHANGELOG_MSG_PUT_BUFFER_FAILED,
+    CHANGELOG_MSG_PTHREAD_COND_WAIT_FAILED, CHANGELOG_MSG_PTHREAD_CANCEL_FAILED,
+    CHANGELOG_MSG_INJECT_FSYNC_FAILED, CHANGELOG_MSG_CREATE_FRAME_FAILED,
+    CHANGELOG_MSG_FSTAT_OP_FAILED, CHANGELOG_MSG_LSEEK_OP_FAILED,
+    CHANGELOG_MSG_STRSTR_OP_FAILED, CHANGELOG_MSG_UNLINK_OP_FAILED,
+    CHANGELOG_MSG_DETECT_EMPTY_CHANGELOG_FAILED,
+    CHANGELOG_MSG_READLINK_OP_FAILED, CHANGELOG_MSG_EXPLICIT_ROLLOVER_FAILED,
+    CHANGELOG_MSG_RPCSVC_NOTIFY_FAILED, CHANGELOG_MSG_MEMORY_INIT_FAILED,
+    CHANGELOG_MSG_NO_MEMORY, CHANGELOG_MSG_HTIME_STAT_ERROR,
+    CHANGELOG_MSG_HTIME_CURRENT_ERROR, CHANGELOG_MSG_BNOTIFY_COND_INFO,
+    CHANGELOG_MSG_NO_HTIME_CURRENT, CHANGELOG_MSG_HTIME_CURRENT,
+    CHANGELOG_MSG_NEW_HTIME_FILE, CHANGELOG_MSG_MKDIR_ERROR,
+    CHANGELOG_MSG_PATH_NOT_FOUND, CHANGELOG_MSG_XATTR_INIT_FAILED,
+    CHANGELOG_MSG_WROTE_TO_CSNAP, CHANGELOG_MSG_UNUSED_0,
+    CHANGELOG_MSG_GET_BUFFER_FAILED, CHANGELOG_MSG_BARRIER_STATE_NOTIFY,
+    CHANGELOG_MSG_BARRIER_DISABLED, CHANGELOG_MSG_BARRIER_ALREADY_DISABLED,
+    CHANGELOG_MSG_BARRIER_ON_ERROR, CHANGELOG_MSG_BARRIER_ENABLE,
+    CHANGELOG_MSG_BARRIER_KEY_NOT_FOUND, CHANGELOG_MSG_ERROR_IN_DICT_GET,
+    CHANGELOG_MSG_UNUSED_1, CHANGELOG_MSG_UNUSED_2,
+    CHANGELOG_MSG_DEQUEUING_BARRIER_FOPS,
+    CHANGELOG_MSG_DEQUEUING_BARRIER_FOPS_FINISHED,
+    CHANGELOG_MSG_BARRIER_TIMEOUT, CHANGELOG_MSG_TIMEOUT_ADD_FAILED,
+    CHANGELOG_MSG_CLEANUP_ALREADY_SET);
+
+#define CHANGELOG_MSG_BARRIER_FOP_FAILED_STR                                   \
+    "failed to barrier FOPs, disabling changelog barrier"
+#define CHANGELOG_MSG_MEMORY_INIT_FAILED_STR "memory accounting init failed"
+#define CHANGELOG_MSG_NO_MEMORY_STR "failed to create local memory pool"
+#define CHANGELOG_MSG_ENTRY_BUF_INFO_STR                                       \
+    "Entry cannot be captured for gfid, Capturing DATA entry."
+#define CHANGELOG_MSG_PTHREAD_ERROR_STR "pthread error"
+#define CHANGELOG_MSG_PTHREAD_MUTEX_INIT_FAILED_STR "pthread_mutex_init failed"
+#define CHANGELOG_MSG_PTHREAD_COND_INIT_FAILED_STR "pthread_cond_init failed"
+#define CHANGELOG_MSG_HTIME_ERROR_STR "failed to update HTIME file"
+#define CHANGELOG_MSG_HTIME_STAT_ERROR_STR "unable to stat htime file"
+#define CHANGELOG_MSG_HTIME_CURRENT_ERROR_STR "Error extracting HTIME_CURRENT."
+#define CHANGELOG_MSG_UNLINK_OP_FAILED_STR "error unlinking empty changelog"
+#define CHANGELOG_MSG_RENAME_ERROR_STR "error renaming"
+#define CHANGELOG_MSG_MKDIR_ERROR_STR "unable to create directory"
+#define CHANGELOG_MSG_BNOTIFY_INFO_STR                                         \
+    "Explicit rollover changelog signaling bnotify"
+#define CHANGELOG_MSG_BNOTIFY_COND_INFO_STR "Woke up: bnotify conditional wait"
+#define CHANGELOG_MSG_RECONFIGURE_STR "Reconfigure: Changelog Enable"
+#define CHANGELOG_MSG_NO_HTIME_CURRENT_STR                                     \
+    "HTIME_CURRENT not found. Changelog enabled before init"
+#define CHANGELOG_MSG_HTIME_CURRENT_STR "HTIME_CURRENT"
+#define CHANGELOG_MSG_NEW_HTIME_FILE_STR                                       \
+    "Changelog enable: Creating new HTIME file"
+#define CHANGELOG_MSG_FGETXATTR_FAILED_STR "fgetxattr failed"
+#define CHANGELOG_MSG_TOTAL_LOG_INFO_STR "changelog info"
+#define CHANGELOG_MSG_PTHREAD_COND_WAIT_FAILED_STR "pthread cond wait failed"
+#define CHANGELOG_MSG_INODE_NOT_FOUND_STR "inode not found"
+#define CHANGELOG_MSG_READLINK_OP_FAILED_STR                                   \
+    "could not read the link from the gfid handle"
+#define CHANGELOG_MSG_OPEN_FAILED_STR "unable to open file"
+#define CHANGELOG_MSG_RPC_CONNECT_ERROR_STR "failed to connect back"
+#define CHANGELOG_MSG_BUFFER_STARVATION_ERROR_STR                              \
+    "Failed to get buffer for RPC dispatch"
+#define CHANGELOG_MSG_PTHREAD_CANCEL_FAILED_STR "could not cancel thread"
+#define CHANGELOG_MSG_FSTAT_OP_FAILED_STR "Could not stat (CHANGELOG)"
+#define CHANGELOG_MSG_LSEEK_OP_FAILED_STR "Could not lseek (changelog)"
+#define CHANGELOG_MSG_PATH_NOT_FOUND_STR                                       \
+    "Could not find CHANGELOG in changelog path"
+#define CHANGELOG_MSG_FSYNC_OP_FAILED_STR "fsync failed"
+#define CHANGELOG_MSG_DETECT_EMPTY_CHANGELOG_FAILED_STR                        \
+    "Error detecting empty changelog"
+#define CHANGELOG_MSG_EXPLICIT_ROLLOVER_FAILED_STR                             \
+    "Fail snapshot because of previous errors"
+#define CHANGELOG_MSG_SCAN_DIR_FAILED_STR "scandir failed"
+#define CHANGELOG_MSG_FSETXATTR_FAILED_STR "fsetxattr failed"
+#define CHANGELOG_MSG_XATTR_INIT_FAILED_STR "Htime xattr initialization failed"
+#define CHANGELOG_MSG_SNAP_INFO_STR "log in call path"
+#define CHANGELOG_MSG_WRITE_FAILED_STR "error writing to disk"
+#define CHANGELOG_MSG_WROTE_TO_CSNAP_STR "Successfully wrote to csnap"
+#define CHANGELOG_MSG_GET_TIME_OP_FAILED_STR "Problem rolling over changelog(s)"
+#define CHANGELOG_MSG_BARRIER_INFO_STR "Explicit wakeup on barrier notify"
+#define CHANGELOG_MSG_SELECT_FAILED_STR "pthread_cond_timedwait failed"
+#define CHANGELOG_MSG_INJECT_FSYNC_FAILED_STR "failed to inject fsync event"
+#define CHANGELOG_MSG_LOCAL_INIT_FAILED_STR                                    \
+    "changelog local initialization failed"
+#define CHANGELOG_MSG_GET_BUFFER_FAILED_STR "Failed to get buffer"
+#define CHANGELOG_MSG_SET_FD_CONTEXT_STR                                       \
+    "could not set fd context(for release cbk)"
+#define CHANGELOG_MSG_DICT_GET_FAILED_STR "Barrier failed"
+#define CHANGELOG_MSG_BARRIER_STATE_NOTIFY_STR "Barrier notification"
+#define CHANGELOG_MSG_BARRIER_ERROR_STR                                        \
+    "Received another barrier off notification while already off"
+#define CHANGELOG_MSG_BARRIER_DISABLED_STR "disabled changelog barrier"
+#define CHANGELOG_MSG_BARRIER_ALREADY_DISABLED_STR                             \
+    "Changelog barrier already disabled"
+#define CHANGELOG_MSG_BARRIER_ON_ERROR_STR                                     \
+    "Received another barrier on notification when last one is not served yet"
+#define CHANGELOG_MSG_BARRIER_ENABLE_STR "Enabled changelog barrier"
+#define CHANGELOG_MSG_BARRIER_KEY_NOT_FOUND_STR "barrier key not found"
+#define CHANGELOG_MSG_ERROR_IN_DICT_GET_STR                                    \
+    "Something went wrong in dict_get_str_boolean"
+#define CHANGELOG_MSG_DIR_OPTIONS_NOT_SET_STR "changelog-dir option is not set"
+#define CHANGELOG_MSG_FREEUP_FAILED_STR "could not cleanup bootstrapper"
+#define CHANGELOG_MSG_CHILD_MISCONFIGURED_STR                                  \
+    "translator needs a single subvolume"
+#define CHANGELOG_MSG_VOL_MISCONFIGURED_STR                                    \
+    "dangling volume. please check volfile"
+#define CHANGELOG_MSG_DEQUEUING_BARRIER_FOPS_STR                               \
+    "Dequeuing all the changelog barriered fops"
+#define CHANGELOG_MSG_DEQUEUING_BARRIER_FOPS_FINISHED_STR                      \
+    "Dequeuing changelog barriered fops is finished"
+#define CHANGELOG_MSG_BARRIER_TIMEOUT_STR                                      \
+    "Disabling changelog barrier because of the timeout"
+#define CHANGELOG_MSG_TIMEOUT_ADD_FAILED_STR                                   \
+    "Couldn't add changelog barrier timeout event"
+#define CHANGELOG_MSG_RPC_BUILD_ERROR_STR "failed to build rpc options"
+#define CHANGELOG_MSG_NOTIFY_REGISTER_FAILED_STR "failed to register notify"
+#define CHANGELOG_MSG_RPC_START_ERROR_STR "failed to start rpc"
+#define CHANGELOG_MSG_CREATE_FRAME_FAILED_STR "failed to create frame"
+#define CHANGELOG_MSG_RPC_SUBMIT_REPLY_FAILED_STR "failed to serialize reply"
+#define CHANGELOG_MSG_PROGRAM_NAME_REG_FAILED_STR "cannot register program"
+#define CHANGELOG_MSG_CHANGELOG_NOT_ACTIVE_STR                                 \
+    "Changelog is not active, return success"
+#define CHANGELOG_MSG_PUT_BUFFER_FAILED_STR                                    \
+    "failed to put buffer after consumption"
+#define CHANGELOG_MSG_CLEANUP_ALREADY_SET_STR                                  \
+    "cleanup_starting flag is already set for xl"
+#define CHANGELOG_MSG_HANDLE_PROBE_ERROR_STR "xdr decoding error"
+#endif /* !_CHANGELOG_MESSAGES_H_ */
diff --git a/xlators/features/changelog/src/changelog-misc.h b/xlators/features/changelog/src/changelog-misc.h
new file mode 100644
index 00000000000..e2addc09414
--- /dev/null
+++ b/xlators/features/changelog/src/changelog-misc.h
@@ -0,0 +1,131 @@
+/*
+   Copyright (c) 2013 Red Hat, Inc. <http://www.redhat.com>
+   This file is part of GlusterFS.
+
+   This file is licensed to you under your choice of the GNU Lesser
+   General Public License, version 3 or any later version (LGPLv3 or
+   later), or the GNU General Public License, version 2 (GPLv2), in all
+   cases as published by the Free Software Foundation.
+*/
+
+#ifndef _CHANGELOG_MISC_H
+#define _CHANGELOG_MISC_H
+
+#include <glusterfs/glusterfs.h>
+#include <glusterfs/common-utils.h>
+
+#define CHANGELOG_MAX_TYPE 4
+#define CHANGELOG_FILE_NAME "CHANGELOG"
+#define HTIME_FILE_NAME "HTIME"
+#define CSNAP_FILE_NAME "CHANGELOG.SNAP"
+#define HTIME_KEY "trusted.glusterfs.htime"
+#define HTIME_CURRENT "trusted.glusterfs.current_htime"
+#define HTIME_INITIAL_VALUE "0:0"
+
+#define CHANGELOG_VERSION_MAJOR 1
+#define CHANGELOG_VERSION_MINOR 2
+
+#define CHANGELOG_UNIX_SOCK DEFAULT_VAR_RUN_DIRECTORY "/changelog-%s.sock"
+#define CHANGELOG_TMP_UNIX_SOCK DEFAULT_VAR_RUN_DIRECTORY "/.%s%lu.sock"
+
+/**
+ * The header starts with the version and the encoding of the changelog.
+ * The 'version' is not of much use for now.
+ */
+#define CHANGELOG_HEADER                                                       \
+    "GlusterFS Changelog | version: v%d.%d | encoding : %d\n"
+
+#define CHANGELOG_MAKE_SOCKET_PATH(brick_path, sockpath, len)                  \
+    do {                                                                       \
+        char xxh64[GF_XXH64_DIGEST_LENGTH * 2 + 1] = {                         \
+            0,                                                                 \
+        };                                                                     \
+        gf_xxh64_wrapper((unsigned char *)brick_path, strlen(brick_path),      \
+                         GF_XXHSUM64_DEFAULT_SEED, xxh64);                     \
+        (void)snprintf(sockpath, len, CHANGELOG_UNIX_SOCK, xxh64);             \
+    } while (0)
+
+#define CHANGELOG_MAKE_TMP_SOCKET_PATH(brick_path, sockpath, len)              \
+    do {                                                                       \
+        unsigned long pid = 0;                                                 \
+        char xxh64[GF_XXH64_DIGEST_LENGTH * 2 + 1] = {                         \
+            0,                                                                 \
+        };                                                                     \
+        pid = (unsigned long)getpid();                                         \
+        gf_xxh64_wrapper((unsigned char *)brick_path, strlen(brick_path),      \
+                         GF_XXHSUM64_DEFAULT_SEED, xxh64);                     \
+        (void)snprintf(sockpath, len, CHANGELOG_TMP_UNIX_SOCK, xxh64, pid);    \
+    } while (0)
+
+/**
+ * Used by libgfchangelog: read a line from fd, parse maj/min/enc (else -1).
+ */
+#define CHANGELOG_GET_HEADER_INFO(fd, buffer, len, enc, maj, min, elen)        \
+    do {                                                                       \
+        FILE *fp;                                                              \
+        int fd_dup;                                                            \
+                                                                               \
+        enc = -1;                                                              \
+        maj = -1;                                                              \
+        min = -1;                                                              \
+        fd_dup = dup(fd);                                                      \
+                                                                               \
+        if (fd_dup != -1) {                                                    \
+            fp = fdopen(fd_dup, "r");                                          \
+            if (fp) {                                                          \
+                if (fgets(buffer, len, fp)) {                                  \
+                    elen = strlen(buffer);                                     \
+                    sscanf(buffer, CHANGELOG_HEADER, &maj, &min, &enc);        \
+                }                                                              \
+                fclose(fp);                                                    \
+            } else {                                                           \
+                sys_close(fd_dup);                                             \
+            }                                                                  \
+        }                                                                      \
+    } while (0)
+
+#define CHANGELOG_FILL_HTIME_DIR(changelog_dir, path)                          \
+    do {                                                                       \
+        snprintf(path, sizeof(path), "%s/htime", changelog_dir);               \
+    } while (0)
+
+#define CHANGELOG_FILL_CSNAP_DIR(changelog_dir, path)                          \
+    do {                                                                       \
+        snprintf(path, sizeof(path), "%s/csnap", changelog_dir);               \
+    } while (0)
+/**
+ * Everything after 'CHANGELOG_TYPE_METADATA_XATTR' is an internal type
+ * (i.e. none of the fops trigger this type of event), hence
+ * CHANGELOG_MAX_TYPE = 4
+ */
+typedef enum {
+    CHANGELOG_TYPE_DATA = 0,
+    CHANGELOG_TYPE_METADATA,
+    CHANGELOG_TYPE_ENTRY,
+    CHANGELOG_TYPE_METADATA_XATTR,
+    CHANGELOG_TYPE_ROLLOVER,
+    CHANGELOG_TYPE_FSYNC,
+} changelog_log_type;
+
+/* operation modes - RT for now */
+typedef enum {
+    CHANGELOG_MODE_RT = 0,
+} changelog_mode_t;
+
+/* encoder types */
+
+typedef enum {
+    CHANGELOG_ENCODE_MIN = 0,
+    CHANGELOG_ENCODE_BINARY,
+    CHANGELOG_ENCODE_ASCII,
+    CHANGELOG_ENCODE_MAX,
+} changelog_encoder_t;
+
+#define CHANGELOG_VALID_ENCODING(enc)                                          \
+    ((enc) > CHANGELOG_ENCODE_MIN && (enc) < CHANGELOG_ENCODE_MAX)
+
+#define CHANGELOG_TYPE_IS_ENTRY(type) ((type) == CHANGELOG_TYPE_ENTRY)
+#define CHANGELOG_TYPE_IS_ROLLOVER(type) ((type) == CHANGELOG_TYPE_ROLLOVER)
+#define CHANGELOG_TYPE_IS_FSYNC(type) ((type) == CHANGELOG_TYPE_FSYNC)
+
+#endif /* _CHANGELOG_MISC_H */
diff --git a/xlators/features/changelog/src/changelog-rpc-common.c b/xlators/features/changelog/src/changelog-rpc-common.c
new file mode 100644
index 00000000000..125246a17e1
--- /dev/null
+++ b/xlators/features/changelog/src/changelog-rpc-common.c
@@ -0,0 +1,359 @@
+/*
+   Copyright (c) 2015 Red Hat, Inc. <http://www.redhat.com>
+   This file is part of GlusterFS.
+
+   This file is licensed to you under your choice of the GNU Lesser
+   General Public License, version 3 or any later version (LGPLv3 or
+   later), or the GNU General Public License, version 2 (GPLv2), in all
+   cases as published by the Free Software Foundation.
+*/
+
+#include "changelog-rpc-common.h"
+#include "changelog-messages.h"
+
+#include <glusterfs/syscall.h>
+/**
+*****************************************************
+                  Client Interface
+*****************************************************
+*/
+
+/**
+ * Event poller: run gf_event_dispatch() on the event pool of the xlator
+ * passed as 'arg'. The dispatch result is discarded; NULL is returned.
+ */
+
+void *
+changelog_rpc_poller(void *arg)
+{
+    xlator_t *this = arg;
+
+    (void)gf_event_dispatch(this->ctx->event_pool);
+    return NULL;
+}
+
+/* Create an RPC client for 'sockfile', register 'fn' on it and start it. */
+struct rpc_clnt *
+changelog_rpc_client_init(xlator_t *this, void *cbkdata, char *sockfile,
+                          rpc_clnt_notify_t fn)
+{
+    int err = 0;
+    struct rpc_clnt *clnt = NULL;
+    dict_t *opts = NULL;
+
+    if (!cbkdata)
+        cbkdata = this;
+
+    opts = dict_new();
+    if (!opts)
+        return NULL;
+
+    err = rpc_transport_unix_options_build(opts, sockfile, 0);
+    if (err) {
+        gf_smsg(this->name, GF_LOG_ERROR, 0, CHANGELOG_MSG_RPC_BUILD_ERROR,
+                NULL);
+        goto drop_opts;
+    }
+
+    clnt = rpc_clnt_new(opts, this, this->name, 16);
+    if (!clnt)
+        goto drop_opts;
+
+    err = rpc_clnt_register_notify(clnt, fn, cbkdata);
+    if (err) {
+        gf_smsg(this->name, GF_LOG_ERROR, 0,
+                CHANGELOG_MSG_NOTIFY_REGISTER_FAILED, NULL);
+        goto drop_clnt;
+    }
+
+    err = rpc_clnt_start(clnt);
+    if (err) {
+        gf_smsg(this->name, GF_LOG_ERROR, 0, CHANGELOG_MSG_RPC_START_ERROR,
+                NULL);
+        goto drop_clnt;
+    }
+
+    dict_unref(opts);
+    return clnt;
+
+drop_clnt:
+    rpc_clnt_unref(clnt);
+drop_opts:
+    dict_unref(opts);
+    return NULL;
+}
+
+/**
+ * Generic RPC client routine to dispatch a request to an RPC server.
+ * Returns rpc_clnt_submit()'s result, or -1 if the request can't be built.
+ */
+int
+changelog_rpc_sumbit_req(struct rpc_clnt *rpc, void *req, call_frame_t *frame,
+                         rpc_clnt_prog_t *prog, int procnum,
+                         struct iovec *payload, int payloadcnt,
+                         struct iobref *iobref, xlator_t *this,
+                         fop_cbk_fn_t cbkfn, xdrproc_t xdrproc)
+{
+    int ret = -1; /* setup failures below must not be reported as success */
+    int count = 0;
+    struct iovec iov = {
+        0,
+    };
+    struct iobuf *iobuf = NULL;
+    char new_iobref = 0;
+    ssize_t xdr_size = 0;
+
+    GF_ASSERT(this);
+
+    if (req) {
+        xdr_size = xdr_sizeof(xdrproc, req);
+
+        iobuf = iobuf_get2(this->ctx->iobuf_pool, xdr_size);
+        if (!iobuf) {
+            goto out;
+        }
+
+        if (!iobref) {
+            iobref = iobref_new();
+            if (!iobref) {
+                goto out;
+            }
+
+            new_iobref = 1;
+        }
+
+        iobref_add(iobref, iobuf);
+
+        iov.iov_base = iobuf->ptr;
+        iov.iov_len = iobuf_size(iobuf);
+
+        /* Create the xdr payload */
+        ret = xdr_serialize_generic(iov, req, xdrproc);
+        if (ret == -1) {
+            goto out;
+        }
+
+        iov.iov_len = ret;
+        count = 1;
+    }
+
+    ret = rpc_clnt_submit(rpc, prog, procnum, cbkfn, &iov, count, payload,
+                          payloadcnt, iobref, frame, NULL, 0, NULL, 0, NULL);
+
+out:
+    if (new_iobref)
+        iobref_unref(iobref);
+    if (iobuf)
+        iobuf_unref(iobuf);
+    return ret;
+}
+
+/**
+ * Invoke prog->proctable[procidx].fn on a fresh frame (-1 on bad args/frame).
+ */
+int
+changelog_invoke_rpc(xlator_t *this, struct rpc_clnt *rpc,
+                     rpc_clnt_prog_t *prog, int procidx, void *arg)
+{
+    int ret = 0;
+    call_frame_t *frame = NULL;
+    rpc_clnt_procedure_t *proc = NULL;
+
+    if (!this || !prog)
+        goto error_return;
+
+    frame = create_frame(this, this->ctx->pool);
+    if (!frame) {
+        gf_smsg(this->name, GF_LOG_ERROR, 0, CHANGELOG_MSG_CREATE_FRAME_FAILED,
+                NULL);
+        goto error_return;
+    }
+
+    proc = &prog->proctable[procidx];
+    if (proc->fn)
+        ret = proc->fn(frame, this, arg);
+
+    STACK_DESTROY(frame->root);
+    return ret;
+
+error_return:
+    return -1;
+}
+
+/**
+*****************************************************
+                  Server Interface
+*****************************************************
+*/
+
+/**
+ * XDR-encode 'arg' into a new iobuf and describe the encoded bytes in
+ * 'outmsg'. Returns the iobuf, or NULL if allocation or encoding fails.
+ */
+struct iobuf *
+__changelog_rpc_serialize_reply(rpcsvc_request_t *req, void *arg,
+                                struct iovec *outmsg, xdrproc_t xdrproc)
+{
+    ssize_t need = xdr_sizeof(xdrproc, arg);
+    ssize_t encoded = 0;
+    struct iobuf *buf = NULL;
+
+    buf = iobuf_get2(req->svc->ctx->iobuf_pool, need);
+    if (!buf)
+        return NULL;
+
+    iobuf_to_iovec(buf, outmsg);
+
+    encoded = xdr_serialize_generic(*outmsg, arg, xdrproc);
+    if (encoded == -1) {
+        iobuf_unref(buf);
+        return NULL;
+    }
+
+    outmsg->iov_len = encoded;
+    return buf;
+}
+
+/* Reply to 'req' with XDR-encoded 'arg'; submits even if encoding failed. */
+int
+changelog_rpc_sumbit_reply(rpcsvc_request_t *req, void *arg,
+                           struct iovec *payload, int payloadcount,
+                           struct iobref *iobref, xdrproc_t xdrproc)
+{
+    int ret = -1;
+    char own_iobref = 0;
+    struct iobuf *reply = NULL;
+    struct iovec hdr = {
+        0,
+    };
+
+    if (!req)
+        return -1;
+
+    if (!iobref) {
+        iobref = iobref_new();
+        if (!iobref)
+            return -1;
+        own_iobref = 1;
+    }
+
+    reply = __changelog_rpc_serialize_reply(req, arg, &hdr, xdrproc);
+    if (reply)
+        iobref_add(iobref, reply);
+    else
+        gf_smsg("", GF_LOG_ERROR, 0, CHANGELOG_MSG_RPC_SUBMIT_REPLY_FAILED,
+                NULL);
+
+    ret = rpcsvc_submit_generic(req, &hdr, 1, payload, payloadcount, iobref);
+
+    if (own_iobref)
+        iobref_unref(iobref);
+    if (reply)
+        iobuf_unref(reply);
+    return ret;
+}
+
+void
+changelog_rpc_server_destroy(xlator_t *this, rpcsvc_t *rpc, char *sockfile,
+                             rpcsvc_notify_t fn, struct rpcsvc_program **progs)
+{
+    rpcsvc_listener_t *listener = NULL;
+    rpcsvc_listener_t *next = NULL;
+    struct rpcsvc_program *prog = NULL;
+    rpc_transport_t *trans = NULL;
+
+    if (!rpc)
+        return;
+
+    while (*progs) {
+        prog = *progs;
+        (void)rpcsvc_program_unregister(rpc, prog);
+        progs++;
+    }
+
+    list_for_each_entry_safe(listener, next, &rpc->listeners, list)
+    {
+        if (listener->trans) {
+            trans = listener->trans;
+            rpc_transport_disconnect(trans, _gf_false);
+        }
+    }
+
+    (void)rpcsvc_unregister_notify(rpc, fn, this);
+
+    /* TODO: avoid freeing the rpc object in case of brick multiplex.
+       After the rpc object is freed, svc->rpclock gets corrupted and it
+       takes more time to detach a brick.
+    */
+    if (!this->cleanup_starting) {
+        if (rpc->rxpool) {
+            mem_pool_destroy(rpc->rxpool);
+            rpc->rxpool = NULL;
+        }
+        GF_FREE(rpc);
+    }
+}
+
+/* Create an rpcsvc over the unix socket 'sockfile', register 'fn' and the
+ * NULL-terminated 'progs' on it. Returns the service, or NULL on failure. */
+rpcsvc_t *
+changelog_rpc_server_init(xlator_t *this, char *sockfile, void *cbkdata,
+                          rpcsvc_notify_t fn, struct rpcsvc_program **progs)
+{
+    int ret = 0;
+    rpcsvc_t *rpc = NULL;
+    dict_t *options = NULL;
+    struct rpcsvc_program *prog = NULL;
+
+    if (!cbkdata)
+        cbkdata = this;
+
+    options = dict_new();
+    if (!options)
+        return NULL;
+
+    ret = rpcsvc_transport_unix_options_build(options, sockfile);
+    if (ret)
+        goto dealloc_dict;
+
+    rpc = rpcsvc_init(this, this->ctx, options, 8);
+    if (rpc == NULL) {
+        gf_smsg(this->name, GF_LOG_ERROR, 0, CHANGELOG_MSG_RPC_START_ERROR,
+                NULL);
+        goto dealloc_dict;
+    }
+
+    ret = rpcsvc_register_notify(rpc, fn, cbkdata);
+    if (ret) {
+        gf_smsg(this->name, GF_LOG_ERROR, 0,
+                CHANGELOG_MSG_NOTIFY_REGISTER_FAILED, NULL);
+        goto dealloc_rpc;
+    }
+
+    ret = rpcsvc_create_listeners(rpc, options, this->name);
+    if (ret != 1) {
+        gf_msg_debug(this->name, 0, "failed to create listeners");
+        goto dealloc_rpc;
+    }
+
+    for (; *progs; progs++) {
+        prog = *progs;
+        ret = rpcsvc_program_register(rpc, prog, _gf_false);
+        if (ret) {
+            gf_smsg(this->name, GF_LOG_ERROR, 0,
+                    CHANGELOG_MSG_PROGRAM_NAME_REG_FAILED, "name=%s",
+                    prog->progname, "prognum=%d", prog->prognum, "progver=%d",
+                    prog->progver, NULL);
+            goto dealloc_rpc;
+        }
+    }
+
+    dict_unref(options);
+    return rpc;
+
+dealloc_rpc:
+    GF_FREE(rpc);
+dealloc_dict:
+    dict_unref(options);
+    return NULL;
+}
diff --git a/xlators/features/changelog/src/changelog-rpc-common.h b/xlators/features/changelog/src/changelog-rpc-common.h
new file mode 100644
index 00000000000..4d9aa2c694b
--- /dev/null
+++ b/xlators/features/changelog/src/changelog-rpc-common.h
@@ -0,0 +1,85 @@
+/*
+   Copyright (c) 2015 Red Hat, Inc. <http://www.redhat.com>
+   This file is part of GlusterFS.
+
+   This file is licensed to you under your choice of the GNU Lesser
+   General Public License, version 3 or any later version (LGPLv3 or
+   later), or the GNU General Public License, version 2 (GPLv2), in all
+   cases as published by the Free Software Foundation.
+*/
+
+#ifndef __CHANGELOG_RPC_COMMON_H
+#define __CHANGELOG_RPC_COMMON_H
+
+#include "rpcsvc.h"
+#include "rpc-clnt.h"
+#include <glusterfs/gf-event.h>
+#include <glusterfs/call-stub.h>
+
+#include "changelog-xdr.h"
+#include "xdr-generic.h"
+
+#include "changelog.h"
+
+/**
+ * Let's keep this non-configurable for now.
+ */
+#define NR_ROTT_BUFFS 4
+#define NR_DISPATCHERS (NR_ROTT_BUFFS - 1)
+
+enum changelog_rpc_procnum {
+    CHANGELOG_RPC_PROC_NULL = 0,
+    CHANGELOG_RPC_PROBE_FILTER = 1,
+    CHANGELOG_RPC_PROC_MAX = 2,
+};
+
+#define CHANGELOG_RPC_PROGNUM 1885957735
+#define CHANGELOG_RPC_PROGVER 1
+
+/**
+ * reverse connection: data xfer path
+ */
+enum changelog_reverse_rpc_procnum {
+    CHANGELOG_REV_PROC_NULL = 0,
+    CHANGELOG_REV_PROC_EVENT = 1,
+    CHANGELOG_REV_PROC_MAX = 2,
+};
+
+#define CHANGELOG_REV_RPC_PROCNUM 1886350951
+#define CHANGELOG_REV_RPC_PROCVER 1
+
+typedef struct changelog_rpc {
+    rpcsvc_t *svc;
+    struct rpc_clnt *rpc;
+    char sock[UNIX_PATH_MAX]; /* tied to server */
+} changelog_rpc_t;
+
+/* event poller */
+void *
+changelog_rpc_poller(void *);
+
+/* CLIENT API */
+struct rpc_clnt *
+changelog_rpc_client_init(xlator_t *, void *, char *, rpc_clnt_notify_t);
+
+int
+changelog_rpc_sumbit_req(struct rpc_clnt *, void *, call_frame_t *,
+                         rpc_clnt_prog_t *, int, struct iovec *, int,
+                         struct iobref *, xlator_t *, fop_cbk_fn_t, xdrproc_t);
+
+int
+changelog_invoke_rpc(xlator_t *, struct rpc_clnt *, rpc_clnt_prog_t *, int,
+                     void *);
+
+/* SERVER API */
+int
+changelog_rpc_sumbit_reply(rpcsvc_request_t *, void *, struct iovec *, int,
+                           struct iobref *, xdrproc_t);
+rpcsvc_t *
+changelog_rpc_server_init(xlator_t *, char *, void *, rpcsvc_notify_t,
+                          struct rpcsvc_program **);
+void
+changelog_rpc_server_destroy(xlator_t *, rpcsvc_t *, char *, rpcsvc_notify_t,
+                             struct rpcsvc_program **);
+
+#endif
diff --git a/xlators/features/changelog/src/changelog-rpc.c b/xlators/features/changelog/src/changelog-rpc.c
new file mode 100644
index 00000000000..440b88091a6
--- /dev/null
+++ b/xlators/features/changelog/src/changelog-rpc.c
@@ -0,0 +1,440 @@
+/*
+   Copyright (c) 2015 Red Hat, Inc. <http://www.redhat.com>
+   This file is part of GlusterFS.
+
+   This file is licensed to you under your choice of the GNU Lesser
+   General Public License, version 3 or any later version (LGPLv3 or
+   later), or the GNU General Public License, version 2 (GPLv2), in all
+   cases as published by the Free Software Foundation.
+*/
+
+#include <glusterfs/syscall.h>
+#include "changelog-rpc.h"
+#include "changelog-mem-types.h"
+#include "changelog-ev-handle.h"
+
+static struct rpcsvc_program *changelog_programs[]; /* defined at end of file */
+/* Stop dispatcher threads [0, count), newest first, and clear their ids. */
+static void
+changelog_cleanup_dispatchers(xlator_t *this, changelog_priv_t *priv, int count)
+{
+    while (count-- > 0) {
+        (void)changelog_thread_cleanup(this, priv->ev_dispatcher[count]);
+        priv->ev_dispatcher[count] = 0;
+    }
+}
+
+/**
+ * Undo changelog_init_rpc_threads(): stop the connector thread, stop the
+ * dispatcher threads, then destroy the connection locks.
+ *
+ * Returns 0 on success (or if @priv is NULL) and -1 at the first failing
+ * step; later steps are then skipped.
+ */
+int
+changelog_cleanup_rpc_threads(xlator_t *this, changelog_priv_t *priv)
+{
+    int ret = 0;
+    changelog_clnt_t *conn = NULL;
+
+    /* &priv->connections is never NULL, so test @priv itself */
+    if (!priv)
+        return 0;
+    conn = &priv->connections;
+
+    /** terminate RPC thread(s) */
+    ret = changelog_thread_cleanup(this, priv->connector);
+    if (ret != 0)
+        return -1;
+    priv->connector = 0;
+    /** terminate dispatcher thread(s); their failures are ignored */
+    changelog_cleanup_dispatchers(this, priv, priv->nr_dispatchers);
+
+    /* destroy locks */
+    if (pthread_mutex_destroy(&conn->pending_lock) != 0)
+        return -1;
+    if (pthread_cond_destroy(&conn->pending_cond) != 0)
+        return -1;
+    ret = LOCK_DESTROY(&conn->active_lock);
+    if (ret != 0)
+        return -1;
+    ret = LOCK_DESTROY(&conn->wait_lock);
+    return (ret != 0) ? -1 : 0;
+}
+
+/**
+ * Set up @priv->connections (lists, locks, sequence number), then spawn the
+ * reverse-connection thread and @nr_dispatchers event dispatcher threads.
+ * Returns 0 on success, -1 on failure after undoing the completed steps,
+ * including freeing the dispatcher id array (it used to be leaked).
+ */
+static int
+changelog_init_rpc_threads(xlator_t *this, changelog_priv_t *priv, rbuf_t *rbuf,
+                           int nr_dispatchers)
+{
+    int j = 0;
+    int ret = 0;
+    changelog_clnt_t *conn = &priv->connections;
+
+    conn->this = this;
+    conn->rbuf = rbuf;
+    conn->sequence = 1; /* start with sequence number one */
+    INIT_LIST_HEAD(&conn->pending);
+    INIT_LIST_HEAD(&conn->active);
+    INIT_LIST_HEAD(&conn->waitq);
+
+    ret = pthread_mutex_init(&conn->pending_lock, NULL);
+    if (ret)
+        goto error_return;
+    ret = pthread_cond_init(&conn->pending_cond, NULL);
+    if (ret)
+        goto cleanup_pending_lock;
+    ret = LOCK_INIT(&conn->active_lock);
+    if (ret)
+        goto cleanup_pending_cond;
+    ret = LOCK_INIT(&conn->wait_lock);
+    if (ret)
+        goto cleanup_active_lock;
+
+    /* spawn reverse connection thread */
+    ret = gf_thread_create(&priv->connector, NULL, changelog_ev_connector, conn,
+                           "clogecon");
+    if (ret != 0)
+        goto cleanup_wait_lock;
+
+    priv->ev_dispatcher = GF_CALLOC(nr_dispatchers, sizeof(pthread_t),
+                                    gf_changelog_mt_ev_dispatcher_t);
+    if (!priv->ev_dispatcher)
+        goto cleanup_connector;
+
+    /* spawn dispatcher threads */
+    for (; j < nr_dispatchers; j++) {
+        ret = gf_thread_create(&priv->ev_dispatcher[j], NULL,
+                               changelog_ev_dispatch, conn, "clogd%03hx",
+                               j & 0x3ff);
+        if (ret != 0) {
+            changelog_cleanup_dispatchers(this, priv, j);
+            goto cleanup_dispatcher_ids;
+        }
+    }
+    priv->nr_dispatchers = nr_dispatchers;
+    return 0;
+
+cleanup_dispatcher_ids:
+    GF_FREE(priv->ev_dispatcher);
+    priv->ev_dispatcher = NULL;
+cleanup_connector:
+    (void)pthread_cancel(priv->connector);
+cleanup_wait_lock:
+    LOCK_DESTROY(&conn->wait_lock);
+cleanup_active_lock:
+    LOCK_DESTROY(&conn->active_lock);
+cleanup_pending_cond:
+    (void)pthread_cond_destroy(&conn->pending_cond);
+cleanup_pending_lock:
+    (void)pthread_mutex_destroy(&conn->pending_lock);
+error_return:
+    return -1;
+}
+
+int
+changelog_rpcsvc_notify(rpcsvc_t *rpc, void *xl, rpcsvc_event_t event,
+                        void *data)
+{
+    xlator_t *this = NULL;
+    rpc_transport_t *trans = NULL;
+    rpc_transport_t *xprt = NULL;
+    rpc_transport_t *xp_next = NULL;
+    changelog_priv_t *priv = NULL;
+    uint64_t listnercnt = 0;
+    uint64_t xprtcnt = 0;
+    uint64_t clntcnt = 0;
+    rpcsvc_listener_t *listener = NULL;
+    rpcsvc_listener_t *next = NULL;
+    gf_boolean_t listner_found = _gf_false;
+    socket_private_t *sockpriv = NULL;
+    /* rpcsvc event hook (xl = xlator, data = transport); always returns 0 */
+    if (!xl || !data || !rpc) {
+        gf_msg_callingfn("changelog", GF_LOG_WARNING, 0,
+                         CHANGELOG_MSG_RPCSVC_NOTIFY_FAILED,
+                         "Calling rpc_notify without initializing");
+        goto out;
+    }
+
+    this = xl;
+    trans = data;
+    priv = this->private;
+
+    if (!priv) {
+        gf_msg_callingfn("changelog", GF_LOG_WARNING, 0,
+                         CHANGELOG_MSG_RPCSVC_NOTIFY_FAILED,
+                         "Calling rpc_notify without priv initializing");
+        goto out;
+    }
+    /* accepted transport: bump xprtcnt and remember it on priv->xprt_list */
+    if (event == RPCSVC_EVENT_ACCEPT) {
+        GF_ATOMIC_INC(priv->xprtcnt);
+        LOCK(&priv->lock);
+        {
+            list_add_tail(&trans->list, &priv->xprt_list);
+        }
+        UNLOCK(&priv->lock);
+        goto out;
+    }
+    /* disconnect: first check whether @trans belongs to one of our listeners */
+    if (event == RPCSVC_EVENT_DISCONNECT) {
+        list_for_each_entry_safe(listener, next, &rpc->listeners, list)
+        {
+            if (listener && listener->trans) {
+                if (listener->trans == trans) {
+                    listnercnt = GF_ATOMIC_DEC(priv->listnercnt);
+                    listner_found = _gf_true;
+                    rpcsvc_listener_destroy(listener);
+                }
+            }
+        }
+        /* a listener went away but others remain: nothing more to do */
+        if (listnercnt > 0) {
+            goto out;
+        }
+        if (listner_found) { /* that was the last listener counted */
+            LOCK(&priv->lock);
+            list_for_each_entry_safe(xprt, xp_next, &priv->xprt_list, list)
+            {
+                sockpriv = (socket_private_t *)(xprt->private);
+                gf_log("changelog", GF_LOG_INFO,
+                       "Send disconnect"
+                       " on socket %d",
+                       sockpriv->sock);
+                rpc_transport_disconnect(xprt, _gf_false);
+            }
+            UNLOCK(&priv->lock);
+            goto out;
+        }
+        LOCK(&priv->lock); /* ordinary transport: drop it from xprt_list */
+        {
+            list_del_init(&trans->list);
+        }
+        UNLOCK(&priv->lock);
+        /* run the cleanup event once xprtcnt and clntcnt are both zero */
+        xprtcnt = GF_ATOMIC_DEC(priv->xprtcnt);
+        clntcnt = GF_ATOMIC_GET(priv->clntcnt);
+        if (!xprtcnt && !clntcnt) {
+            changelog_process_cleanup_event(this);
+        }
+    }
+
+out:
+    return 0;
+}
+
+/**
+ * One-shot teardown: sets priv->notify_down under the lock. The first caller
+ * to do so (provided priv->victim is set) sends GF_EVENT_PARENT_DOWN through
+ * default_notify() and, if priv->rpc exists, unlinks the socket file,
+ * unregisters the notify callback and frees priv->rpc.
+ */
+void
+changelog_process_cleanup_event(xlator_t *this)
+{
+    changelog_priv_t *priv = this ? this->private : NULL;
+    gf_boolean_t already_notified = _gf_false;
+    char sockfile[UNIX_PATH_MAX] = {0};
+
+    if (!priv)
+        return;
+
+    /* read the old flag and set it in one locked step */
+    LOCK(&priv->lock);
+    {
+        already_notified = priv->notify_down;
+        priv->notify_down = _gf_true;
+    }
+    UNLOCK(&priv->lock);
+
+    if (already_notified || !priv->victim)
+        return;
+    default_notify(this, GF_EVENT_PARENT_DOWN, priv->victim);
+    if (!priv->rpc)
+        return;
+
+    /* sockfile path could have been saved to avoid this */
+    CHANGELOG_MAKE_SOCKET_PATH(priv->changelog_brick, sockfile, UNIX_PATH_MAX);
+    sys_unlink(sockfile);
+    (void)rpcsvc_unregister_notify(priv->rpc, changelog_rpcsvc_notify, this);
+    if (priv->rpc->rxpool) {
+        mem_pool_destroy(priv->rpc->rxpool);
+        priv->rpc->rxpool = NULL;
+    }
+    GF_FREE(priv->rpc);
+    priv->rpc = NULL;
+}
+
+/* Recompute the brick's socket path and pass it, with the notify callback
+ * and program list, to changelog_rpc_server_destroy(). */
+void
+changelog_destroy_rpc_listner(xlator_t *this, changelog_priv_t *priv)
+{
+    char sock_path[UNIX_PATH_MAX] = {0};
+
+    /* sockfile path could have been saved to avoid this */
+    CHANGELOG_MAKE_SOCKET_PATH(priv->changelog_brick, sock_path, UNIX_PATH_MAX);
+    changelog_rpc_server_destroy(this, priv->rpc, sock_path,
+                                 changelog_rpcsvc_notify, changelog_programs);
+}
+
+/**
+ * Start the RPC worker threads, remove any existing socket file for the
+ * brick and bring up the changelog RPC server on that socket path.
+ * Returns NULL if the threads could not be started; otherwise returns
+ * whatever changelog_rpc_server_init() returns.
+ */
+rpcsvc_t *
+changelog_init_rpc_listener(xlator_t *this, changelog_priv_t *priv,
+                            rbuf_t *rbuf, int nr_dispatchers)
+{
+    char sock_path[UNIX_PATH_MAX] = {0};
+
+    if (changelog_init_rpc_threads(this, priv, rbuf, nr_dispatchers))
+        return NULL;
+
+    CHANGELOG_MAKE_SOCKET_PATH(priv->changelog_brick, sock_path, UNIX_PATH_MAX);
+    (void)sys_unlink(sock_path);
+    return changelog_rpc_server_init(
+        this, sock_path, NULL, changelog_rpcsvc_notify, changelog_programs);
+}
+/* Destroy the lock of @crpc and free it; a NULL @crpc is a no-op. */
+void
+changelog_rpc_clnt_cleanup(changelog_rpc_clnt_t *crpc)
+{
+    if (crpc) {
+        crpc->c_clnt = NULL;
+        LOCK_DESTROY(&crpc->lock);
+        GF_FREE(crpc);
+    }
+}
+
+/**
+ * Build the per-client state for a probe request (one initial reference,
+ * filter and socket path from @rpc_req). Returns NULL on allocation or lock
+ * failure, or if the wire-supplied path does not fit in crpc->sock.
+ */
+static changelog_rpc_clnt_t *
+changelog_rpc_clnt_init(xlator_t *this, changelog_probe_req *rpc_req,
+                        changelog_clnt_t *c_clnt)
+{
+    size_t len = 0;
+    changelog_rpc_clnt_t *crpc = NULL;
+
+    crpc = GF_CALLOC(1, sizeof(*crpc), gf_changelog_mt_rpc_clnt_t);
+    if (!crpc)
+        return NULL;
+    /* bound the scan and the copy: strlen()+memcpy() could overrun both */
+    len = strnlen(rpc_req->sock, sizeof(crpc->sock));
+    if (len >= sizeof(crpc->sock))
+        goto dealloc_crpc;
+    (void)memcpy(crpc->sock, rpc_req->sock, len); /* calloc left the NUL */
+    crpc->filter = rpc_req->filter;
+    INIT_LIST_HEAD(&crpc->list);
+    /* initial ref; dropped on RPC_CLNT_DESTROY after the last rpc_clnt_unref */
+    GF_ATOMIC_INIT(crpc->ref, 1);
+    changelog_set_disconnect_flag(crpc, _gf_false);
+    crpc->this = this;
+    crpc->c_clnt = c_clnt;
+    crpc->cleanup = changelog_rpc_clnt_cleanup;
+    if (LOCK_INIT(&crpc->lock))
+        goto dealloc_crpc;
+    return crpc;
+dealloc_crpc:
+    GF_FREE(crpc);
+    return NULL;
+}
+
+/**
+ * Actor declarations
+ */
+
+/**
+ * @probe_handler
+ * A probe RPC call spawns a connect back to the caller. The caller also
+ * passes a hint which acts as a filter for selecting updates.
+ *
+ * @req: the probe request; req->msg[0] holds the XDR-encoded
+ *       changelog_probe_req.
+ *
+ * Always returns 0. The reply carries op_ret 0 once the connection has been
+ * queued, or -1 if the request cannot be decoded or the client state cannot
+ * be created. No reply is sent when the xlator is already cleaning up.
+ */
+int
+changelog_handle_probe(rpcsvc_request_t *req)
+{
+    int ret = 0;
+    xlator_t *this = req->trans->xl;
+    rpcsvc_t *svc = NULL;
+    changelog_priv_t *priv = NULL;
+    changelog_clnt_t *c_clnt = NULL;
+    changelog_rpc_clnt_t *crpc = NULL;
+    changelog_probe_req rpc_req = {0};
+    changelog_probe_rsp rpc_rsp = {0};
+
+    /* cleanup already under way: return without replying */
+    if (this->cleanup_starting) {
+        gf_smsg(this->name, GF_LOG_DEBUG, 0, CHANGELOG_MSG_CLEANUP_ALREADY_SET,
+                NULL);
+        return 0;
+    }
+
+    /* assume failure; only a queued connection turns this into success */
+    rpc_rsp.op_ret = -1;
+
+    ret = xdr_to_generic(req->msg[0], &rpc_req,
+                         (xdrproc_t)xdr_changelog_probe_req);
+    if (ret < 0) {
+        /* undecodable request */
+        gf_smsg("", GF_LOG_ERROR, 0, CHANGELOG_MSG_HANDLE_PROBE_ERROR, NULL);
+        req->rpc_err = GARBAGE_ARGS;
+    } else {
+        /* ->xl hidden in rpcsvc */
+        svc = rpcsvc_request_service(req);
+        this = svc->xl;
+        priv = this->private;
+        c_clnt = &priv->connections;
+
+        /* build the client state and hand it to the connection queue */
+        crpc = changelog_rpc_clnt_init(this, &rpc_req, c_clnt);
+        if (crpc) {
+            changelog_ev_queue_connection(c_clnt, crpc);
+            rpc_rsp.op_ret = 0;
+        }
+    }
+
+    /* a single reply covers both outcomes */
+    (void)changelog_rpc_sumbit_reply(req, &rpc_rsp, NULL, 0, NULL,
+                                     (xdrproc_t)xdr_changelog_probe_rsp);
+    return 0;
+}
+
+/**
+ * RPC declarations
+ */
+/* Actor table indexed by procedure number; only the probe slot is filled. */
+static rpcsvc_actor_t changelog_svc_actors[CHANGELOG_RPC_PROC_MAX] = {
+    [CHANGELOG_RPC_PROBE_FILTER] = {"CHANGELOG PROBE FILTER",
+                                    changelog_handle_probe, NULL,
+                                    CHANGELOG_RPC_PROBE_FILTER, DRC_NA, 0},
+};
+/* The changelog RPC program: name, number, version and its actor table. */
+static struct rpcsvc_program changelog_svc_prog = {
+    .progname = CHANGELOG_RPC_PROGNAME,
+    .prognum = CHANGELOG_RPC_PROGNUM,
+    .progver = CHANGELOG_RPC_PROGVER,
+    .numactors = CHANGELOG_RPC_PROC_MAX,
+    .actors = changelog_svc_actors,
+    .synctask = _gf_true,
+};
+/* Program list handed to the RPC server init/destroy helpers. */
+static struct rpcsvc_program *changelog_programs[] = {
+    &changelog_svc_prog,
+    NULL,
+};
diff --git a/xlators/features/changelog/src/changelog-rpc.h b/xlators/features/changelog/src/changelog-rpc.h
new file mode 100644
index 00000000000..b1707565249
--- /dev/null
+++ b/xlators/features/changelog/src/changelog-rpc.h
@@ -0,0 +1,31 @@
+/*
+   Copyright (c) 2015 Red Hat, Inc. <http://www.redhat.com>
+   This file is part of GlusterFS.
+
+   This file is licensed to you under your choice of the GNU Lesser
+   General Public License, version 3 or any later version (LGPLv3 or
+   later), or the GNU General Public License, version 2 (GPLv2), in all
+   cases as published by the Free Software Foundation.
+*/
+
+#ifndef __CHANGELOG_RPC_H
+#define __CHANGELOG_RPC_H
+
+#include <glusterfs/xlator.h>
+#include "changelog-helpers.h"
+
+/* one time */
+#include "socket.h"
+#include "changelog-rpc-common.h"
+
+#define CHANGELOG_RPC_PROGNAME "GlusterFS Changelog"
+/* Start the RPC threads and the listener; returns the rpcsvc or NULL. */
+rpcsvc_t *
+changelog_init_rpc_listener(xlator_t *, changelog_priv_t *, rbuf_t *, int);
+/* Destroy the RPC server ("listner" [sic] is the exported spelling). */
+void
+changelog_destroy_rpc_listner(xlator_t *, changelog_priv_t *);
+/* Stop connector/dispatcher threads and destroy the locks; 0 or -1. */
+int
+changelog_cleanup_rpc_threads(xlator_t *this, changelog_priv_t *priv);
+#endif
diff --git a/xlators/features/changelog/src/changelog-rt.c b/xlators/features/changelog/src/changelog-rt.c
new file mode 100644
index 00000000000..841545ae359
--- /dev/null
+++ b/xlators/features/changelog/src/changelog-rt.c
@@ -0,0 +1,66 @@
+/*
+   Copyright (c) 2013 Red Hat, Inc. <http://www.redhat.com>
+   This file is part of GlusterFS.
+
+   This file is licensed to you under your choice of the GNU Lesser
+   General Public License, version 3 or any later version (LGPLv3 or
+   later), or the GNU General Public License, version 2 (GPLv2), in all
+   cases as published by the Free Software Foundation.
+*/
+
+#include <glusterfs/xlator.h>
+#include <glusterfs/defaults.h>
+#include <glusterfs/logging.h>
+
+#include "changelog-rt.h"
+#include "changelog-mem-types.h"
+
+/* CHANGELOG_MODE_RT constructor: allocates the lock context and attaches it to
+ * @cd; returns 0, or -1 if the allocation or lock initialisation fails. */
+int
+changelog_rt_init(xlator_t *this, changelog_dispatcher_t *cd)
+{
+    changelog_rt_t *crt = GF_CALLOC(1, sizeof(*crt), gf_changelog_mt_rt_t);
+    if (!crt)
+        return -1;
+    if (LOCK_INIT(&crt->lock)) { /* result used to be ignored */
+        GF_FREE(crt);
+        return -1;
+    }
+    cd->cd_data = crt;
+    cd->dispatchfn = &changelog_rt_enqueue;
+    return 0;
+}
+
+/* RT-mode destructor: destroy the lock in cd->cd_data and free the context.
+ * cd->cd_data itself is left untouched. Always returns 0. */
+int
+changelog_rt_fini(xlator_t *this, changelog_dispatcher_t *cd)
+{
+    changelog_rt_t *rt_ctx = cd->cd_data;
+
+    LOCK_DESTROY(&rt_ctx->lock);
+    GF_FREE(rt_ctx);
+
+    return 0;
+}
+
+/* RT-mode dispatch function: under the context lock, handle @cld_0 and, if
+ * that returned 0 and @cld_1 is non-NULL, @cld_1 too. Returns the result of
+ * the last changelog_handle_change() call made. */
+int
+changelog_rt_enqueue(xlator_t *this, changelog_priv_t *priv, void *cbatch,
+                     changelog_log_data_t *cld_0, changelog_log_data_t *cld_1)
+{
+    int status = 0;
+    changelog_rt_t *rt_ctx = cbatch;
+
+    LOCK(&rt_ctx->lock);
+    {
+        status = changelog_handle_change(this, priv, cld_0);
+        if (status == 0 && cld_1 != NULL)
+            status = changelog_handle_change(this, priv, cld_1);
+    }
+    UNLOCK(&rt_ctx->lock);
+    return status;
+}
diff --git a/xlators/features/changelog/src/changelog-rt.h b/xlators/features/changelog/src/changelog-rt.h
new file mode 100644
index 00000000000..28b9827d85b
--- /dev/null
+++ b/xlators/features/changelog/src/changelog-rt.h
@@ -0,0 +1,33 @@
+/*
+   Copyright (c) 2013 Red Hat, Inc. <http://www.redhat.com>
+   This file is part of GlusterFS.
+
+   This file is licensed to you under your choice of the GNU Lesser
+   General Public License, version 3 or any later version (LGPLv3 or
+   later), or the GNU General Public License, version 2 (GPLv2), in all
+   cases as published by the Free Software Foundation.
+*/
+
+#ifndef _CHANGELOG_RT_H
+#define _CHANGELOG_RT_H
+
+#include <glusterfs/locking.h>
+#include <glusterfs/timer.h>
+#include "pthread.h"
+
+#include "changelog-helpers.h"
+
+/* RT-mode dispatcher context; created by changelog_rt_init() */
+typedef struct changelog_rt {
+    gf_lock_t lock; /* held by changelog_rt_enqueue() while handling changes */
+} changelog_rt_t;
+/* constructor, destructor and dispatch function of the RT mode */
+int
+changelog_rt_init(xlator_t *this, changelog_dispatcher_t *cd);
+int
+changelog_rt_fini(xlator_t *this, changelog_dispatcher_t *cd);
+int
+changelog_rt_enqueue(xlator_t *this, changelog_priv_t *priv, void *cbatch,
+                     changelog_log_data_t *cld_0, changelog_log_data_t *cld_1);
+
+#endif /* _CHANGELOG_RT_H */
diff --git a/xlators/features/changelog/src/changelog.c b/xlators/features/changelog/src/changelog.c
new file mode 100644
index 00000000000..6a6e5af859e
--- /dev/null
+++ b/xlators/features/changelog/src/changelog.c
@@ -0,0 +1,2989 @@
+/*
+   Copyright (c) 2013 Red Hat, Inc. <http://www.redhat.com>
+   This file is part of GlusterFS.
+
+   This file is licensed to you under your choice of the GNU Lesser
+   General Public License, version 3 or any later version (LGPLv3 or
+   later), or the GNU General Public License, version 2 (GPLv2), in all
+   cases as published by the Free Software Foundation.
+*/
+
+#include <glusterfs/xlator.h>
+#include <glusterfs/defaults.h>
+#include <glusterfs/syscall.h>
+#include <glusterfs/logging.h>
+#include <glusterfs/iobuf.h>
+
+#include "changelog-rt.h"
+
+#include "changelog-encoders.h"
+#include "changelog-mem-types.h"
+#include "changelog-messages.h"
+
+#include <pthread.h>
+#include <signal.h>
+
+#include "changelog-rpc.h"
+#include "errno.h"
+
+static struct changelog_bootstrap cb_bootstrap[] = {
+    {
+        .mode = CHANGELOG_MODE_RT,
+        .ctor = changelog_rt_init, /* allocates the RT context and its lock */
+        .dtor = changelog_rt_fini, /* destroys the lock, frees the context */
+    },
+};
+
+static int
+changelog_init_rpc(xlator_t *this, changelog_priv_t *priv);
+
+static int
+changelog_init(xlator_t *this, changelog_priv_t *priv);
+
+/* Entry operations - TYPE III */
+
+/**
+ * entry operations do not undergo inode version checking.
+ */
+
+/* {{{ */
+
+/* rmdir */
+
+/* rmdir callback: unless the op failed or there is no local (the
+ * CHANGELOG_COND_GOTO check also takes @priv), record the entry change;
+ * then always drop the fop count and unwind. Returns 0. */
+int32_t
+changelog_rmdir_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+                    int32_t op_ret, int32_t op_errno, struct iatt *preparent,
+                    struct iatt *postparent, dict_t *xdata)
+{
+    changelog_priv_t *priv = this->private;
+    changelog_local_t *local = frame->local;
+
+    CHANGELOG_COND_GOTO(priv, ((op_ret < 0) || (local == NULL)), unwind);
+
+    changelog_update(this, priv, local, CHANGELOG_TYPE_ENTRY);
+
+unwind:
+    changelog_dec_fop_cnt(this, priv, local);
+    CHANGELOG_STACK_UNWIND(rmdir, frame, op_ret, op_errno, preparent,
+                           postparent, xdata);
+    return 0;
+}
+
+/* Resume function of the queued rmdir stub: calls
+ * changelog_color_fop_and_inc_cnt() and winds to the first child. Returns 0. */
+int32_t
+changelog_rmdir_resume(call_frame_t *frame, xlator_t *this, loc_t *loc,
+                       int xflags, dict_t *xdata)
+{
+    changelog_priv_t *priv = this->private;
+
+    gf_msg_debug(this->name, 0, "Dequeue rmdir");
+    changelog_color_fop_and_inc_cnt(this, priv, frame->local);
+    STACK_WIND(frame, changelog_rmdir_cbk, FIRST_CHILD(this),
+               FIRST_CHILD(this)->fops->rmdir, loc, xflags, xdata);
+    return 0;
+}
+
+int32_t
+changelog_rmdir(call_frame_t *frame, xlator_t *this, loc_t *loc, int xflags,
+                dict_t *xdata)
+{
+    size_t xtra_len = 0;
+    changelog_priv_t *priv = NULL;
+    changelog_opt_t *co = NULL;
+    call_stub_t *stub = NULL;
+    struct list_head queue = {
+        0,
+    };
+    gf_boolean_t barrier_enabled = _gf_false;
+    /* Build the rmdir changelog record, then wind now or queue as a stub. */
+    INIT_LIST_HEAD(&queue);
+
+    priv = this->private;
+    CHANGELOG_NOT_ACTIVE_THEN_GOTO(frame, priv, wind);
+    /* two record slots: the fop number, then the removed entry */
+    CHANGELOG_INIT_NOCHECK(this, frame->local, NULL, loc->inode->gfid, 2);
+
+    co = changelog_get_usable_buffer(frame->local);
+    if (!co)
+        goto wind;
+
+    CHANGLOG_FILL_FOP_NUMBER(co, frame->root->op, fop_fn, xtra_len);
+    /* second slot; the last macro argument mirrors priv->capture_del_path */
+    co++;
+    if (priv->capture_del_path) {
+        CHANGELOG_FILL_ENTRY_DIR_PATH(co, loc->pargfid, loc->name, del_entry_fn,
+                                      del_entry_free_fn, xtra_len, wind,
+                                      _gf_true);
+    } else {
+        CHANGELOG_FILL_ENTRY_DIR_PATH(co, loc->pargfid, loc->name, del_entry_fn,
+                                      del_entry_free_fn, xtra_len, wind,
+                                      _gf_false);
+    }
+
+    changelog_set_usable_record_and_length(frame->local, xtra_len, 2);
+
+    /* changelog barrier */
+    /* Color assignment and increment of fop_cnt for rmdir/unlink/rename
+     * must happen under priv->lock when the changelog barrier is not enabled.
+     * Otherwise, if the counter has not been incremented yet, draining can
+     * wake up and publish the changelog while these fops still reach the disk
+     * later and show up in the snapped volume, although the intention is that
+     * they must not be present there.
+     */
+    LOCK(&priv->lock);
+    {
+        if ((barrier_enabled = priv->barrier_enabled)) {
+            stub = fop_rmdir_stub(frame, changelog_rmdir_resume, loc, xflags,
+                                  xdata);
+            if (!stub)
+                __chlog_barrier_disable(this, &queue);
+            else
+                __chlog_barrier_enqueue(this, stub);
+        } else {
+            ((changelog_local_t *)frame->local)->color = priv->current_color;
+            changelog_inc_fop_cnt(this, priv, frame->local);
+        }
+    }
+    UNLOCK(&priv->lock);
+    /* stub queued: it is resumed through changelog_rmdir_resume() */
+    if (barrier_enabled && stub) {
+        gf_msg_debug(this->name, 0, "Enqueue rmdir");
+        goto out;
+    }
+    if (barrier_enabled && !stub) { /* no stub: barrier was disabled above */
+        gf_smsg(this->name, GF_LOG_ERROR, ENOMEM,
+                CHANGELOG_MSG_BARRIER_FOP_FAILED, "fop=rmdir", NULL);
+        chlog_barrier_dequeue_all(this, &queue);
+    }
+
+    /* changelog barrier */
+
+wind:
+    STACK_WIND(frame, changelog_rmdir_cbk, FIRST_CHILD(this),
+               FIRST_CHILD(this)->fops->rmdir, loc, xflags, xdata);
+out:
+    return 0;
+}
+
+/* unlink */
+
+/* unlink callback: unless the op failed or there is no local (the
+ * CHANGELOG_COND_GOTO check also takes @priv), record the entry change;
+ * then always drop the fop count and unwind. Returns 0. */
+int32_t
+changelog_unlink_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+                     int32_t op_ret, int32_t op_errno, struct iatt *preparent,
+                     struct iatt *postparent, dict_t *xdata)
+{
+    changelog_priv_t *priv = this->private;
+    changelog_local_t *local = frame->local;
+
+    CHANGELOG_COND_GOTO(priv, ((op_ret < 0) || (local == NULL)), unwind);
+
+    changelog_update(this, priv, local, CHANGELOG_TYPE_ENTRY);
+
+unwind:
+    changelog_dec_fop_cnt(this, priv, local);
+    CHANGELOG_STACK_UNWIND(unlink, frame, op_ret, op_errno, preparent,
+                           postparent, xdata);
+    return 0;
+}
+
+/* Resume function of the queued unlink stub: calls
+ * changelog_color_fop_and_inc_cnt() and winds to the first child. Returns 0. */
+int32_t
+changelog_unlink_resume(call_frame_t *frame, xlator_t *this, loc_t *loc,
+                        int xflags, dict_t *xdata)
+{
+    changelog_priv_t *priv = this->private;
+
+    gf_msg_debug(this->name, 0, "Dequeue unlink");
+    changelog_color_fop_and_inc_cnt(this, priv, frame->local);
+    STACK_WIND(frame, changelog_unlink_cbk, FIRST_CHILD(this),
+               FIRST_CHILD(this)->fops->unlink, loc, xflags, xdata);
+    return 0;
+}
+
+int32_t
+changelog_unlink(call_frame_t *frame, xlator_t *this, loc_t *loc, int xflags,
+                 dict_t *xdata)
+{
+    size_t xtra_len = 0;
+    changelog_priv_t *priv = NULL;
+    changelog_opt_t *co = NULL;
+    call_stub_t *stub = NULL;
+    struct list_head queue = {
+        0,
+    };
+    gf_boolean_t barrier_enabled = _gf_false;
+    dht_changelog_rename_info_t *info = NULL;
+    int ret = 0;
+    char *old_name = NULL;
+    char *new_name = NULL;
+    char *nname = NULL;
+    /* Log an unlink, or a rename when the DHT rename key is present in xdata. */
+    INIT_LIST_HEAD(&queue);
+    priv = this->private;
+
+    CHANGELOG_NOT_ACTIVE_THEN_GOTO(frame, priv, wind);
+
+    ret = dict_get_bin(xdata, DHT_CHANGELOG_RENAME_OP_KEY, (void **)&info);
+    if (!ret) { /* special case: unlink considered as rename */
+        /* 3 == fop + oldloc + newloc */
+        old_name = alloca(info->oldname_len);
+        new_name = alloca(info->newname_len);
+        CHANGELOG_INIT_NOCHECK(this, frame->local, NULL, loc->inode->gfid, 3);
+
+        co = changelog_get_usable_buffer(frame->local);
+        if (!co)
+            goto wind;
+        /* first slot: the fop number, recorded as GF_FOP_RENAME */
+        CHANGLOG_FILL_FOP_NUMBER(co, GF_FOP_RENAME, fop_fn, xtra_len);
+        /* NOTE(review): assumes the *_len fields count the NUL - confirm */
+        co++;
+        strncpy(old_name, info->buffer, info->oldname_len);
+        CHANGELOG_FILL_ENTRY(co, info->old_pargfid, old_name, entry_fn,
+                             entry_free_fn, xtra_len, wind);
+
+        co++;
+        /* new name resides just after old name */
+        nname = info->buffer + info->oldname_len;
+        strncpy(new_name, nname, info->newname_len);
+        CHANGELOG_FILL_ENTRY(co, info->new_pargfid, new_name, entry_fn,
+                             entry_free_fn, xtra_len, wind);
+
+        changelog_set_usable_record_and_length(frame->local, xtra_len, 3);
+    } else { /* default unlink */
+        CHANGELOG_IF_INTERNAL_FOP_THEN_GOTO(frame, xdata, wind);
+        CHANGELOG_INIT_NOCHECK(this, frame->local, NULL, loc->inode->gfid, 2);
+
+        co = changelog_get_usable_buffer(frame->local);
+        if (!co)
+            goto wind;
+
+        CHANGLOG_FILL_FOP_NUMBER(co, frame->root->op, fop_fn, xtra_len);
+
+        co++;
+        if (priv->capture_del_path) {
+            CHANGELOG_FILL_ENTRY_DIR_PATH(co, loc->pargfid, loc->name,
+                                          del_entry_fn, del_entry_free_fn,
+                                          xtra_len, wind, _gf_true);
+        } else {
+            CHANGELOG_FILL_ENTRY_DIR_PATH(co, loc->pargfid, loc->name,
+                                          del_entry_fn, del_entry_free_fn,
+                                          xtra_len, wind, _gf_false);
+        }
+
+        changelog_set_usable_record_and_length(frame->local, xtra_len, 2);
+    }
+
+    /* changelog barrier */
+    LOCK(&priv->lock);
+    {
+        if ((barrier_enabled = priv->barrier_enabled)) {
+            stub = fop_unlink_stub(frame, changelog_unlink_resume, loc, xflags,
+                                   xdata);
+            if (!stub)
+                __chlog_barrier_disable(this, &queue);
+            else
+                __chlog_barrier_enqueue(this, stub);
+        } else {
+            ((changelog_local_t *)frame->local)->color = priv->current_color;
+            changelog_inc_fop_cnt(this, priv, frame->local);
+        }
+    }
+    UNLOCK(&priv->lock);
+    /* stub queued: it is resumed through changelog_unlink_resume() */
+    if (barrier_enabled && stub) {
+        gf_msg_debug(this->name, 0, "Enqueue unlink");
+        goto out;
+    }
+    if (barrier_enabled && !stub) { /* no stub: barrier was disabled above */
+        gf_smsg(this->name, GF_LOG_ERROR, ENOMEM,
+                CHANGELOG_MSG_BARRIER_FOP_FAILED, "fop=unlink", NULL);
+        chlog_barrier_dequeue_all(this, &queue);
+    }
+
+    /* changelog barrier */
+
+wind:
+    STACK_WIND(frame, changelog_unlink_cbk, FIRST_CHILD(this),
+               FIRST_CHILD(this)->fops->unlink, loc, xflags, xdata);
+out:
+    return 0;
+}
+
+/* rename */
+
+/* rename callback: record the entry change unless the op failed or there is
+ * no local; then always drop the fop count and unwind. Returns 0. */
+int32_t
+changelog_rename_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+                     int32_t op_ret, int32_t op_errno, struct iatt *buf,
+                     struct iatt *preoldparent, struct iatt *postoldparent,
+                     struct iatt *prenewparent, struct iatt *postnewparent,
+                     dict_t *xdata)
+{
+    changelog_priv_t *priv = this->private;
+    changelog_local_t *local = frame->local;
+
+    CHANGELOG_COND_GOTO(priv, ((op_ret < 0) || (local == NULL)), unwind);
+    changelog_update(this, priv, local, CHANGELOG_TYPE_ENTRY);
+unwind:
+    changelog_dec_fop_cnt(this, priv, local);
+    CHANGELOG_STACK_UNWIND(rename, frame, op_ret, op_errno, buf, preoldparent,
+                           postoldparent, prenewparent, postnewparent, xdata);
+    return 0;
+}
+
+/* Resume function of the queued rename stub: calls
+ * changelog_color_fop_and_inc_cnt() and winds to the first child. Returns 0. */
+int32_t
+changelog_rename_resume(call_frame_t *frame, xlator_t *this, loc_t *oldloc,
+                        loc_t *newloc, dict_t *xdata)
+{
+    changelog_priv_t *priv = this->private;
+
+    gf_msg_debug(this->name, 0, "Dequeue rename");
+    changelog_color_fop_and_inc_cnt(this, priv, frame->local);
+    STACK_WIND(frame, changelog_rename_cbk, FIRST_CHILD(this),
+               FIRST_CHILD(this)->fops->rename, oldloc, newloc, xdata);
+    return 0;
+}
+
+int32_t
+changelog_rename(call_frame_t *frame, xlator_t *this, loc_t *oldloc,
+                 loc_t *newloc, dict_t *xdata)
+{
+    size_t xtra_len = 0;
+    changelog_priv_t *priv = NULL;
+    changelog_opt_t *co = NULL;
+    call_stub_t *stub = NULL;
+    struct list_head queue = {
+        0,
+    };
+    gf_boolean_t barrier_enabled = _gf_false;
+    dht_changelog_rename_info_t *info = NULL;
+    int ret = 0;
+    /* Log a rename (fop number, old entry, new entry), then wind or queue it. */
+    INIT_LIST_HEAD(&queue);
+
+    priv = this->private;
+    CHANGELOG_NOT_ACTIVE_THEN_GOTO(frame, priv, wind);
+    /* only the presence of the DHT rename key matters; info is not read here */
+    ret = dict_get_bin(xdata, DHT_CHANGELOG_RENAME_OP_KEY, (void **)&info);
+    if (ret && oldloc->inode->ia_type != IA_IFDIR) {
+        /* xdata "NOT" set for a non-directory,
+         * Special rename => avoid logging */
+        goto wind;
+    }
+
+    /* 3 == fop + oldloc + newloc */
+    CHANGELOG_INIT_NOCHECK(this, frame->local, NULL, oldloc->inode->gfid, 3);
+
+    co = changelog_get_usable_buffer(frame->local);
+    if (!co)
+        goto wind;
+
+    CHANGLOG_FILL_FOP_NUMBER(co, frame->root->op, fop_fn, xtra_len);
+
+    co++;
+    CHANGELOG_FILL_ENTRY(co, oldloc->pargfid, oldloc->name, entry_fn,
+                         entry_free_fn, xtra_len, wind);
+
+    co++;
+    CHANGELOG_FILL_ENTRY(co, newloc->pargfid, newloc->name, entry_fn,
+                         entry_free_fn, xtra_len, wind);
+
+    changelog_set_usable_record_and_length(frame->local, xtra_len, 3);
+    /* changelog barrier */
+    LOCK(&priv->lock);
+    {
+        if ((barrier_enabled = priv->barrier_enabled)) {
+            stub = fop_rename_stub(frame, changelog_rename_resume, oldloc,
+                                   newloc, xdata);
+            if (!stub)
+                __chlog_barrier_disable(this, &queue);
+            else
+                __chlog_barrier_enqueue(this, stub);
+        } else {
+            ((changelog_local_t *)frame->local)->color = priv->current_color;
+            changelog_inc_fop_cnt(this, priv, frame->local);
+        }
+    }
+    UNLOCK(&priv->lock);
+    /* stub queued: it is resumed through changelog_rename_resume() */
+    if (barrier_enabled && stub) {
+        gf_msg_debug(this->name, 0, "Enqueue rename");
+        goto out;
+    }
+    if (barrier_enabled && !stub) { /* no stub: barrier was disabled above */
+        gf_smsg(this->name, GF_LOG_ERROR, ENOMEM,
+                CHANGELOG_MSG_BARRIER_FOP_FAILED, "fop=rename", NULL);
+        chlog_barrier_dequeue_all(this, &queue);
+    }
+    /* changelog barrier */
+
+wind:
+    STACK_WIND(frame, changelog_rename_cbk, FIRST_CHILD(this),
+               FIRST_CHILD(this)->fops->rename, oldloc, newloc, xdata);
+out:
+    return 0;
+}
+
+/* link */
+
+/* link callback: unless the op failed or there is no local (the
+ * CHANGELOG_COND_GOTO check also takes @priv), record the entry change;
+ * then always drop the fop count and unwind. Returns 0. */
+int32_t
+changelog_link_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+                   int32_t op_ret, int32_t op_errno, inode_t *inode,
+                   struct iatt *buf, struct iatt *preparent,
+                   struct iatt *postparent, dict_t *xdata)
+{
+    changelog_priv_t *priv = this->private;
+    changelog_local_t *local = frame->local;
+
+    CHANGELOG_COND_GOTO(priv, ((op_ret < 0) || (local == NULL)), unwind);
+
+    changelog_update(this, priv, local, CHANGELOG_TYPE_ENTRY);
+
+unwind:
+    changelog_dec_fop_cnt(this, priv, local);
+    CHANGELOG_STACK_UNWIND(link, frame, op_ret, op_errno, inode, buf, preparent,
+                           postparent, xdata);
+    return 0;
+}
+
+/* Resume function of the queued link stub. Returns -1 without winding when
+ * @this, this->fops or @frame fails validation; else winds and returns 0. */
+int32_t
+changelog_link_resume(call_frame_t *frame, xlator_t *this, loc_t *oldloc,
+                      loc_t *newloc, dict_t *xdata)
+{
+    changelog_priv_t *priv = NULL;
+
+    GF_VALIDATE_OR_GOTO("changelog", this, out);
+    GF_VALIDATE_OR_GOTO("changelog", this->fops, out);
+    GF_VALIDATE_OR_GOTO("changelog", frame, out);
+    priv = this->private;
+    gf_msg_debug(this->name, 0, "Dequeuing link");
+    changelog_color_fop_and_inc_cnt(this, priv, frame->local);
+    STACK_WIND(frame, changelog_link_cbk, FIRST_CHILD(this),
+               FIRST_CHILD(this)->fops->link, oldloc, newloc, xdata);
+    return 0;
+out:
+    return -1;
+}
+/*
+ * link fop entry point.
+ *
+ * Prepares a two-record changelog entry (fop number, then the new name's
+ * parent gfid + basename) keyed on oldloc->gfid.  The barrier state is
+ * sampled under priv->lock:
+ *   - barrier off: the fop takes the current colour, the fop count is
+ *     incremented and the fop is wound to the first child;
+ *   - barrier on:  the fop is wrapped in a call stub (resumed later by
+ *     changelog_link_resume) and enqueued; nothing is wound now;
+ *   - barrier on but stub creation failed: __chlog_barrier_disable is
+ *     called, the failure is logged, chlog_barrier_dequeue_all is run on
+ *     the collected queue and this fop is wound directly.
+ *
+ * Any early bail-out (changelog inactive, internal fop, no usable buffer,
+ * or CHANGELOG_FILL_ENTRY jumping to its label) winds the fop as-is.
+ *
+ * Always returns 0; the reply is delivered through changelog_link_cbk.
+ */
+int32_t
+changelog_link(call_frame_t *frame, xlator_t *this, loc_t *oldloc,
+               loc_t *newloc, dict_t *xdata)
+{
+    size_t xtra_len = 0;
+    changelog_priv_t *priv = NULL;
+    changelog_opt_t *co = NULL;
+    call_stub_t *stub = NULL;
+    struct list_head queue = {
+        0,
+    };
+    gf_boolean_t barrier_enabled = _gf_false;
+
+    priv = this->private;
+
+    CHANGELOG_NOT_ACTIVE_THEN_GOTO(frame, priv, wind);
+    CHANGELOG_IF_INTERNAL_FOP_THEN_GOTO(frame, xdata, wind);
+
+    CHANGELOG_INIT_NOCHECK(this, frame->local, NULL, oldloc->gfid, 2);
+
+    co = changelog_get_usable_buffer(frame->local);
+    if (!co)
+        goto wind;
+
+    /* record 1: fop number */
+    CHANGLOG_FILL_FOP_NUMBER(co, frame->root->op, fop_fn, xtra_len);
+
+    /* record 2: parent gfid + name of the new link */
+    co++;
+    CHANGELOG_FILL_ENTRY(co, newloc->pargfid, newloc->name, entry_fn,
+                         entry_free_fn, xtra_len, wind);
+
+    changelog_set_usable_record_and_length(frame->local, xtra_len, 2);
+
+    LOCK(&priv->lock);
+    {
+        if ((barrier_enabled = priv->barrier_enabled)) {
+            stub = fop_link_stub(frame, changelog_link_resume, oldloc, newloc,
+                                 xdata);
+            if (!stub)
+                __chlog_barrier_disable(this, &queue);
+            else
+                __chlog_barrier_enqueue(this, stub);
+        } else {
+            ((changelog_local_t *)frame->local)->color = priv->current_color;
+            changelog_inc_fop_cnt(this, priv, frame->local);
+        }
+    }
+    UNLOCK(&priv->lock);
+
+    if (barrier_enabled && stub) {
+        gf_msg_debug(this->name, 0, "Enqueued link");
+        goto out;
+    }
+
+    if (barrier_enabled && !stub) {
+        /* stub creation failed: report ENOMEM like the sibling entry fops
+         * (mkdir, symlink, mknod, create) do on this path */
+        gf_smsg(this->name, GF_LOG_ERROR, ENOMEM,
+                CHANGELOG_MSG_BARRIER_FOP_FAILED, "fop=link", NULL);
+        chlog_barrier_dequeue_all(this, &queue);
+    }
+wind:
+    STACK_WIND(frame, changelog_link_cbk, FIRST_CHILD(this),
+               FIRST_CHILD(this)->fops->link, oldloc, newloc, xdata);
+out:
+    return 0;
+}
+
+/* mkdir */
+
+/*
+ * Callback for the mkdir fop.
+ *
+ * Records an ENTRY changelog update unless the fop failed (op_ret < 0) or
+ * the frame carries no changelog local (CHANGELOG_COND_GOTO also receives
+ * priv, so it presumably checks translator state too).  Always calls
+ * changelog_dec_fop_cnt and unwinds the reply unchanged.  Returns 0.
+ */
+int32_t
+changelog_mkdir_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+                    int32_t op_ret, int32_t op_errno, inode_t *inode,
+                    struct iatt *buf, struct iatt *preparent,
+                    struct iatt *postparent, dict_t *xdata)
+{
+    changelog_local_t *local = frame->local;
+    changelog_priv_t *priv = this->private;
+
+    /* nothing to record without a local or for a failed fop */
+    CHANGELOG_COND_GOTO(priv, (!local || (op_ret < 0)), unwind);
+
+    changelog_update(this, priv, local, CHANGELOG_TYPE_ENTRY);
+
+unwind:
+    changelog_dec_fop_cnt(this, priv, local);
+    CHANGELOG_STACK_UNWIND(mkdir, frame, op_ret, op_errno, inode, buf,
+                           preparent, postparent, xdata);
+    return 0;
+}
+
+/*
+ * Resume handler for a mkdir fop that changelog_mkdir parked in a call
+ * stub while the barrier was enabled.
+ *
+ * Validates this, this->fops and frame, then colours/accounts the fop via
+ * changelog_color_fop_and_inc_cnt and winds it to the first child.
+ *
+ * Returns 0 after winding, -1 if any validation fails.
+ */
+int32_t
+changelog_mkdir_resume(call_frame_t *frame, xlator_t *this, loc_t *loc,
+                       mode_t mode, mode_t umask, dict_t *xdata)
+{
+    changelog_priv_t *priv = NULL;
+    int32_t ret = -1;
+
+    GF_VALIDATE_OR_GOTO("changelog", this, out);
+    GF_VALIDATE_OR_GOTO("changelog", this->fops, out);
+    GF_VALIDATE_OR_GOTO("changelog", frame, out);
+
+    /* safe to dereference only after the checks above */
+    priv = this->private;
+
+    gf_msg_debug(this->name, 0, "Dequeuing mkdir");
+    changelog_color_fop_and_inc_cnt(this, priv, frame->local);
+    STACK_WIND(frame, changelog_mkdir_cbk, FIRST_CHILD(this),
+               FIRST_CHILD(this)->fops->mkdir, loc, mode, umask, xdata);
+    ret = 0;
+out:
+    return ret;
+}
+
+/*
+ * mkdir fop entry point.
+ *
+ * Builds a five-record changelog entry (fop number, S_IFDIR|mode, uid,
+ * gid, parent gfid + name) keyed on the gfid taken from the "gfid-req"
+ * key in xdata.  The barrier state is then sampled under priv->lock:
+ * with the barrier off the fop is coloured, counted and wound; with it on
+ * the fop is parked in a call stub for changelog_mkdir_resume.  If the
+ * stub cannot be created, __chlog_barrier_disable is called, the failure
+ * is logged, the collected queue is dequeued and this fop is wound.
+ *
+ * Any early bail-out (changelog inactive, no "gfid-req", no usable
+ * buffer, or CHANGELOG_FILL_ENTRY jumping to its label) winds the fop
+ * as-is.
+ *
+ * Always returns 0; the reply is delivered through changelog_mkdir_cbk.
+ */
+int32_t
+changelog_mkdir(call_frame_t *frame, xlator_t *this, loc_t *loc, mode_t mode,
+                mode_t umask, dict_t *xdata)
+{
+    int ret = -1;
+    uuid_t gfid = {
+        0,
+    };
+    size_t xtra_len = 0;
+    changelog_priv_t *priv = NULL;
+    changelog_opt_t *co = NULL;
+    call_stub_t *stub = NULL;
+    struct list_head queue = {
+        0,
+    };
+    gf_boolean_t barrier_enabled = _gf_false;
+
+    priv = this->private;
+    CHANGELOG_NOT_ACTIVE_THEN_GOTO(frame, priv, wind);
+
+    /* the gfid of the directory being created is supplied by the caller */
+    ret = dict_get_gfuuid(xdata, "gfid-req", &gfid);
+    if (ret) {
+        gf_msg_debug(this->name, 0, "failed to get gfid from dict");
+        goto wind;
+    }
+
+    /* 5 == number of records filled in below */
+    CHANGELOG_INIT_NOCHECK(this, frame->local, NULL, gfid, 5);
+
+    co = changelog_get_usable_buffer(frame->local);
+    if (!co)
+        goto wind;
+
+    /* record 1: fop number */
+    CHANGLOG_FILL_FOP_NUMBER(co, frame->root->op, fop_fn, xtra_len);
+    co++;
+
+    /* record 2: mode, with the directory file-type bit forced on */
+    CHANGELOG_FILL_UINT32(co, S_IFDIR | mode, number_fn, xtra_len);
+    co++;
+
+    /* record 3: uid of the caller */
+    CHANGELOG_FILL_UINT32(co, frame->root->uid, number_fn, xtra_len);
+    co++;
+
+    /* record 4: gid of the caller */
+    CHANGELOG_FILL_UINT32(co, frame->root->gid, number_fn, xtra_len);
+    co++;
+
+    /* record 5: parent gfid + basename */
+    CHANGELOG_FILL_ENTRY(co, loc->pargfid, loc->name, entry_fn, entry_free_fn,
+                         xtra_len, wind);
+
+    changelog_set_usable_record_and_length(frame->local, xtra_len, 5);
+
+    /* barrier state is read, and acted upon, under priv->lock */
+    LOCK(&priv->lock);
+    {
+        if ((barrier_enabled = priv->barrier_enabled)) {
+            stub = fop_mkdir_stub(frame, changelog_mkdir_resume, loc, mode,
+                                  umask, xdata);
+            if (!stub)
+                __chlog_barrier_disable(this, &queue);
+            else
+                __chlog_barrier_enqueue(this, stub);
+        } else {
+            ((changelog_local_t *)frame->local)->color = priv->current_color;
+            changelog_inc_fop_cnt(this, priv, frame->local);
+        }
+    }
+    UNLOCK(&priv->lock);
+
+    /* parked behind the barrier: changelog_mkdir_resume winds it later */
+    if (barrier_enabled && stub) {
+        gf_msg_debug(this->name, 0, "Enqueued mkdir");
+        goto out;
+    }
+
+    /* stub creation failed: release what was queued, then wind this fop */
+    if (barrier_enabled && !stub) {
+        gf_smsg(this->name, GF_LOG_ERROR, ENOMEM,
+                CHANGELOG_MSG_BARRIER_FOP_FAILED, "fop=mkdir", NULL);
+        chlog_barrier_dequeue_all(this, &queue);
+    }
+
+wind:
+    STACK_WIND(frame, changelog_mkdir_cbk, FIRST_CHILD(this),
+               FIRST_CHILD(this)->fops->mkdir, loc, mode, umask, xdata);
+out:
+    return 0;
+}
+
+/* symlink */
+
+/*
+ * Callback for the symlink fop.
+ *
+ * Records an ENTRY changelog update unless the fop failed (op_ret < 0) or
+ * the frame carries no changelog local (CHANGELOG_COND_GOTO also receives
+ * priv, so it presumably checks translator state too).  Always calls
+ * changelog_dec_fop_cnt and unwinds the reply unchanged.  Returns 0.
+ */
+int32_t
+changelog_symlink_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+                      int32_t op_ret, int32_t op_errno, inode_t *inode,
+                      struct iatt *buf, struct iatt *preparent,
+                      struct iatt *postparent, dict_t *xdata)
+{
+    changelog_local_t *local = frame->local;
+    changelog_priv_t *priv = this->private;
+
+    /* nothing to record without a local or for a failed fop */
+    CHANGELOG_COND_GOTO(priv, (!local || (op_ret < 0)), unwind);
+
+    changelog_update(this, priv, local, CHANGELOG_TYPE_ENTRY);
+
+unwind:
+    changelog_dec_fop_cnt(this, priv, local);
+    CHANGELOG_STACK_UNWIND(symlink, frame, op_ret, op_errno, inode, buf,
+                           preparent, postparent, xdata);
+    return 0;
+}
+
+/*
+ * Resume handler for a symlink fop that changelog_symlink parked in a
+ * call stub while the barrier was enabled.
+ *
+ * Validates this, this->fops and frame, then colours/accounts the fop via
+ * changelog_color_fop_and_inc_cnt and winds it to the first child.
+ *
+ * Returns 0 after winding, -1 if any validation fails.
+ */
+int32_t
+changelog_symlink_resume(call_frame_t *frame, xlator_t *this,
+                         const char *linkname, loc_t *loc, mode_t umask,
+                         dict_t *xdata)
+{
+    changelog_priv_t *priv = NULL;
+    int32_t ret = -1;
+
+    GF_VALIDATE_OR_GOTO("changelog", this, out);
+    GF_VALIDATE_OR_GOTO("changelog", this->fops, out);
+    GF_VALIDATE_OR_GOTO("changelog", frame, out);
+
+    /* safe to dereference only after the checks above */
+    priv = this->private;
+
+    gf_msg_debug(this->name, 0, "Dequeuing symlink");
+    changelog_color_fop_and_inc_cnt(this, priv, frame->local);
+    STACK_WIND(frame, changelog_symlink_cbk, FIRST_CHILD(this),
+               FIRST_CHILD(this)->fops->symlink, linkname, loc, umask, xdata);
+    ret = 0;
+out:
+    return ret;
+}
+
+/*
+ * symlink fop entry point.
+ *
+ * Builds a two-record changelog entry (fop number, parent gfid + name)
+ * keyed on the gfid taken from the "gfid-req" key in xdata.  The barrier
+ * state is then sampled under priv->lock: with the barrier off the fop is
+ * coloured, counted and wound; with it on the fop is parked in a call
+ * stub for changelog_symlink_resume.  If the stub cannot be created,
+ * __chlog_barrier_disable is called, the failure is logged, the collected
+ * queue is dequeued and this fop is wound.
+ *
+ * Any early bail-out (changelog inactive, no "gfid-req", no usable
+ * buffer, or CHANGELOG_FILL_ENTRY jumping to its label) winds the fop
+ * as-is.
+ *
+ * Always returns 0; the reply is delivered through changelog_symlink_cbk.
+ */
+int32_t
+changelog_symlink(call_frame_t *frame, xlator_t *this, const char *linkname,
+                  loc_t *loc, mode_t umask, dict_t *xdata)
+{
+    int ret = -1;
+    size_t xtra_len = 0;
+    uuid_t gfid = {
+        0,
+    };
+    changelog_priv_t *priv = NULL;
+    changelog_opt_t *co = NULL;
+    call_stub_t *stub = NULL;
+    struct list_head queue = {
+        0,
+    };
+    gf_boolean_t barrier_enabled = _gf_false;
+
+    priv = this->private;
+    CHANGELOG_NOT_ACTIVE_THEN_GOTO(frame, priv, wind);
+
+    /* the gfid of the symlink being created is supplied by the caller */
+    ret = dict_get_gfuuid(xdata, "gfid-req", &gfid);
+    if (ret) {
+        gf_msg_debug(this->name, 0, "failed to get gfid from dict");
+        goto wind;
+    }
+
+    /* 2 == number of records filled in below */
+    CHANGELOG_INIT_NOCHECK(this, frame->local, NULL, gfid, 2);
+
+    co = changelog_get_usable_buffer(frame->local);
+    if (!co)
+        goto wind;
+
+    /* record 1: fop number */
+    CHANGLOG_FILL_FOP_NUMBER(co, frame->root->op, fop_fn, xtra_len);
+    co++;
+
+    /* record 2: parent gfid + basename */
+    CHANGELOG_FILL_ENTRY(co, loc->pargfid, loc->name, entry_fn, entry_free_fn,
+                         xtra_len, wind);
+
+    changelog_set_usable_record_and_length(frame->local, xtra_len, 2);
+
+    /* barrier state is read, and acted upon, under priv->lock */
+    LOCK(&priv->lock);
+    {
+        if ((barrier_enabled = priv->barrier_enabled)) {
+            stub = fop_symlink_stub(frame, changelog_symlink_resume, linkname,
+                                    loc, umask, xdata);
+            if (!stub)
+                __chlog_barrier_disable(this, &queue);
+            else
+                __chlog_barrier_enqueue(this, stub);
+        } else {
+            ((changelog_local_t *)frame->local)->color = priv->current_color;
+            changelog_inc_fop_cnt(this, priv, frame->local);
+        }
+    }
+    UNLOCK(&priv->lock);
+
+    /* parked behind the barrier: changelog_symlink_resume winds it later */
+    if (barrier_enabled && stub) {
+        gf_msg_debug(this->name, 0, "Enqueued symlink");
+        goto out;
+    }
+
+    /* stub creation failed: release what was queued, then wind this fop */
+    if (barrier_enabled && !stub) {
+        gf_smsg(this->name, GF_LOG_ERROR, ENOMEM,
+                CHANGELOG_MSG_BARRIER_FOP_FAILED, "fop=symlink", NULL);
+        chlog_barrier_dequeue_all(this, &queue);
+    }
+
+wind:
+    STACK_WIND(frame, changelog_symlink_cbk, FIRST_CHILD(this),
+               FIRST_CHILD(this)->fops->symlink, linkname, loc, umask, xdata);
+out:
+    return 0;
+}
+
+/* mknod */
+
+/*
+ * Callback for the mknod fop.
+ *
+ * Records an ENTRY changelog update unless the fop failed (op_ret < 0) or
+ * the frame carries no changelog local (CHANGELOG_COND_GOTO also receives
+ * priv, so it presumably checks translator state too).  Always calls
+ * changelog_dec_fop_cnt and unwinds the reply unchanged.  Returns 0.
+ */
+int32_t
+changelog_mknod_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+                    int32_t op_ret, int32_t op_errno, inode_t *inode,
+                    struct iatt *buf, struct iatt *preparent,
+                    struct iatt *postparent, dict_t *xdata)
+{
+    changelog_local_t *local = frame->local;
+    changelog_priv_t *priv = this->private;
+
+    /* nothing to record without a local or for a failed fop */
+    CHANGELOG_COND_GOTO(priv, (!local || (op_ret < 0)), unwind);
+
+    changelog_update(this, priv, local, CHANGELOG_TYPE_ENTRY);
+
+unwind:
+    changelog_dec_fop_cnt(this, priv, local);
+    CHANGELOG_STACK_UNWIND(mknod, frame, op_ret, op_errno, inode, buf,
+                           preparent, postparent, xdata);
+    return 0;
+}
+
+/*
+ * Resume handler for a mknod fop that changelog_mknod parked in a call
+ * stub while the barrier was enabled.
+ *
+ * Validates this, this->fops and frame, then colours/accounts the fop via
+ * changelog_color_fop_and_inc_cnt and winds it to the first child.
+ *
+ * Returns 0 after winding, -1 if any validation fails.
+ */
+int32_t
+changelog_mknod_resume(call_frame_t *frame, xlator_t *this, loc_t *loc,
+                       mode_t mode, dev_t rdev, mode_t umask, dict_t *xdata)
+{
+    changelog_priv_t *priv = NULL;
+    int32_t ret = -1;
+
+    GF_VALIDATE_OR_GOTO("changelog", this, out);
+    GF_VALIDATE_OR_GOTO("changelog", this->fops, out);
+    GF_VALIDATE_OR_GOTO("changelog", frame, out);
+
+    /* safe to dereference only after the checks above */
+    priv = this->private;
+
+    gf_msg_debug(this->name, 0, "Dequeuing mknod");
+    changelog_color_fop_and_inc_cnt(this, priv, frame->local);
+    STACK_WIND(frame, changelog_mknod_cbk, FIRST_CHILD(this),
+               FIRST_CHILD(this)->fops->mknod, loc, mode, rdev, umask, xdata);
+    ret = 0;
+out:
+    return ret;
+}
+
+/*
+ * mknod fop entry point.
+ *
+ * After filtering out fops that must not be recorded (changelog inactive,
+ * rebalance, and - unless the tier linkto key is present in xdata -
+ * internal fops and tier rebalance), builds a five-record changelog entry
+ * (fop number, mode, uid, gid, parent gfid + name) keyed on the gfid
+ * taken from the "gfid-req" key in xdata.  The barrier state is then
+ * sampled under priv->lock: with the barrier off the fop is coloured,
+ * counted and wound; with it on the fop is parked in a call stub for
+ * changelog_mknod_resume.  If the stub cannot be created,
+ * __chlog_barrier_disable is called, the failure is logged, the collected
+ * queue is dequeued and this fop is wound.
+ *
+ * Every bail-out winds the fop as-is.
+ *
+ * Always returns 0; the reply is delivered through changelog_mknod_cbk.
+ */
+int32_t
+changelog_mknod(call_frame_t *frame, xlator_t *this, loc_t *loc, mode_t mode,
+                dev_t dev, mode_t umask, dict_t *xdata)
+{
+    int ret = -1;
+    uuid_t gfid = {
+        0,
+    };
+    size_t xtra_len = 0;
+    changelog_priv_t *priv = NULL;
+    changelog_opt_t *co = NULL;
+    call_stub_t *stub = NULL;
+    struct list_head queue = {
+        0,
+    };
+    gf_boolean_t barrier_enabled = _gf_false;
+
+    priv = this->private;
+
+    /* Check whether changelog is active */
+    if (!(priv->active))
+        goto wind;
+
+    /* Skip fops issued by the rebalance (defrag) process */
+    if (frame->root->pid == GF_CLIENT_PID_DEFRAG)
+        goto wind;
+
+    /* If the tier-dht linkto key is SET in xdata, skip verifying:
+     * 1. whether this is an internal fop, AND
+     * 2. whether it comes from the tier rebalance process
+     * (this allows a mknod issued by the tier rebalance process to be
+     * recorded). */
+    if (!(dict_get(xdata, "trusted.tier.tier-dht.linkto"))) {
+        CHANGELOG_IF_INTERNAL_FOP_THEN_GOTO(frame, xdata, wind);
+        if (frame->root->pid == GF_CLIENT_PID_TIER_DEFRAG)
+            goto wind;
+    }
+
+    /* the gfid of the node being created is supplied by the caller */
+    ret = dict_get_gfuuid(xdata, "gfid-req", &gfid);
+    if (ret) {
+        gf_msg_debug(this->name, 0, "failed to get gfid from dict");
+        goto wind;
+    }
+
+    /* 5 == number of records filled in below */
+    CHANGELOG_INIT_NOCHECK(this, frame->local, NULL, gfid, 5);
+
+    co = changelog_get_usable_buffer(frame->local);
+    if (!co)
+        goto wind;
+
+    /* record 1: fop number */
+    CHANGLOG_FILL_FOP_NUMBER(co, frame->root->op, fop_fn, xtra_len);
+    co++;
+
+    /* record 2: mode as passed by the caller */
+    CHANGELOG_FILL_UINT32(co, mode, number_fn, xtra_len);
+    co++;
+
+    /* record 3: uid of the caller */
+    CHANGELOG_FILL_UINT32(co, frame->root->uid, number_fn, xtra_len);
+    co++;
+
+    /* record 4: gid of the caller */
+    CHANGELOG_FILL_UINT32(co, frame->root->gid, number_fn, xtra_len);
+    co++;
+
+    /* record 5: parent gfid + basename */
+    CHANGELOG_FILL_ENTRY(co, loc->pargfid, loc->name, entry_fn, entry_free_fn,
+                         xtra_len, wind);
+
+    changelog_set_usable_record_and_length(frame->local, xtra_len, 5);
+
+    /* barrier state is read, and acted upon, under priv->lock */
+    LOCK(&priv->lock);
+    {
+        if ((barrier_enabled = priv->barrier_enabled)) {
+            stub = fop_mknod_stub(frame, changelog_mknod_resume, loc, mode, dev,
+                                  umask, xdata);
+            if (!stub)
+                __chlog_barrier_disable(this, &queue);
+            else
+                __chlog_barrier_enqueue(this, stub);
+        } else {
+            ((changelog_local_t *)frame->local)->color = priv->current_color;
+            changelog_inc_fop_cnt(this, priv, frame->local);
+        }
+    }
+    UNLOCK(&priv->lock);
+
+    /* parked behind the barrier: changelog_mknod_resume winds it later */
+    if (barrier_enabled && stub) {
+        gf_msg_debug(this->name, 0, "Enqueued mknod");
+        goto out;
+    }
+
+    /* stub creation failed: release what was queued, then wind this fop */
+    if (barrier_enabled && !stub) {
+        gf_smsg(this->name, GF_LOG_ERROR, ENOMEM,
+                CHANGELOG_MSG_BARRIER_FOP_FAILED, "fop=mknod", NULL);
+        chlog_barrier_dequeue_all(this, &queue);
+    }
+
+wind:
+    STACK_WIND(frame, changelog_mknod_cbk, FIRST_CHILD(this),
+               FIRST_CHILD(this)->fops->mknod, loc, mode, dev, umask, xdata);
+out:
+    return 0;
+}
+
+/* create */
+
+/*
+ * Callback for the create fop.
+ *
+ * Unless the fop failed (op_ret < 0) or the frame carries no changelog
+ * local, this dispatches a CHANGELOG_OP_TYPE_CREATE event carrying the
+ * new gfid and the fd flags, marks the fd context when RELEASE events
+ * are selected, and records an ENTRY changelog update.  Always calls
+ * changelog_dec_fop_cnt and unwinds the reply unchanged.  Returns 0.
+ */
+int32_t
+changelog_create_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+                     int32_t op_ret, int32_t op_errno, fd_t *fd, inode_t *inode,
+                     struct iatt *buf, struct iatt *preparent,
+                     struct iatt *postparent, dict_t *xdata)
+{
+    changelog_local_t *local = frame->local;
+    changelog_priv_t *priv = this->private;
+    changelog_event_t ev = {
+        0,
+    };
+
+    /* nothing to record without a local or for a failed fop */
+    CHANGELOG_COND_GOTO(priv, (!local || (op_ret < 0)), unwind);
+
+    /* fill the event structure.. similar to open() */
+    ev.ev_type = CHANGELOG_OP_TYPE_CREATE;
+    ev.u.create.flags = fd->flags;
+    gf_uuid_copy(ev.u.create.gfid, buf->ia_gfid);
+    changelog_dispatch_event(this, priv, &ev);
+
+    /* tag the fd only when RELEASE events are selected; a failure to set
+     * the context is logged but does not fail the fop */
+    if (changelog_ev_selected(this, &priv->ev_selection,
+                              CHANGELOG_OP_TYPE_RELEASE) &&
+        fd_ctx_set(fd, this, (uint64_t)(long)0x1)) {
+        gf_smsg(this->name, GF_LOG_WARNING, 0, CHANGELOG_MSG_SET_FD_CONTEXT,
+                NULL);
+    }
+
+    changelog_update(this, priv, local, CHANGELOG_TYPE_ENTRY);
+
+unwind:
+    changelog_dec_fop_cnt(this, priv, local);
+    CHANGELOG_STACK_UNWIND(create, frame, op_ret, op_errno, fd, inode, buf,
+                           preparent, postparent, xdata);
+    return 0;
+}
+
+/*
+ * Resume handler for a create fop that changelog_create parked in a call
+ * stub while the barrier was enabled.
+ *
+ * Validates this, this->fops and frame, then colours/accounts the fop via
+ * changelog_color_fop_and_inc_cnt and winds it to the first child.
+ *
+ * Returns 0 after winding, -1 if any validation fails.
+ */
+int32_t
+changelog_create_resume(call_frame_t *frame, xlator_t *this, loc_t *loc,
+                        int32_t flags, mode_t mode, mode_t umask, fd_t *fd,
+                        dict_t *xdata)
+{
+    changelog_priv_t *priv = NULL;
+    int32_t ret = -1;
+
+    GF_VALIDATE_OR_GOTO("changelog", this, out);
+    GF_VALIDATE_OR_GOTO("changelog", this->fops, out);
+    GF_VALIDATE_OR_GOTO("changelog", frame, out);
+
+    /* safe to dereference only after the checks above */
+    priv = this->private;
+
+    gf_msg_debug(this->name, 0, "Dequeuing create");
+    changelog_color_fop_and_inc_cnt(this, priv, frame->local);
+    STACK_WIND(frame, changelog_create_cbk, FIRST_CHILD(this),
+               FIRST_CHILD(this)->fops->create, loc, flags, mode, umask, fd,
+               xdata);
+    ret = 0;
+
+out:
+    return ret;
+}
+
+/*
+ * create fop entry point.
+ *
+ * Builds a five-record changelog entry (fop number, mode, uid, gid,
+ * parent gfid + name) keyed on the gfid taken from the "gfid-req" key in
+ * xdata.  The barrier state is then sampled under priv->lock: with the
+ * barrier off the fop is coloured, counted and wound; with it on the fop
+ * is parked in a call stub for changelog_create_resume.  If the stub
+ * cannot be created, __chlog_barrier_disable is called, the failure is
+ * logged, the collected queue is dequeued and this fop is wound.
+ *
+ * Any early bail-out (changelog inactive, no "gfid-req", no local, no
+ * usable buffer, or CHANGELOG_FILL_ENTRY jumping to its label) winds the
+ * fop as-is.
+ *
+ * Always returns 0; the reply is delivered through changelog_create_cbk.
+ */
+int32_t
+changelog_create(call_frame_t *frame, xlator_t *this, loc_t *loc, int32_t flags,
+                 mode_t mode, mode_t umask, fd_t *fd, dict_t *xdata)
+{
+    int ret = -1;
+    uuid_t gfid = {
+        0,
+    };
+    changelog_opt_t *co = NULL;
+    changelog_priv_t *priv = NULL;
+    size_t xtra_len = 0;
+    call_stub_t *stub = NULL;
+    struct list_head queue = {
+        0,
+    };
+    gf_boolean_t barrier_enabled = _gf_false;
+
+    priv = this->private;
+    CHANGELOG_NOT_ACTIVE_THEN_GOTO(frame, priv, wind);
+
+    /* the gfid of the file being created is supplied by the caller */
+    ret = dict_get_gfuuid(xdata, "gfid-req", &gfid);
+    if (ret) {
+        gf_msg_debug(this->name, 0, "failed to get gfid from dict");
+        goto wind;
+    }
+
+    /* init with five extra records (fop, mode, uid, gid, entry) */
+    CHANGELOG_INIT_NOCHECK(this, frame->local, NULL, gfid, 5);
+    if (!frame->local)
+        goto wind;
+
+    co = changelog_get_usable_buffer(frame->local);
+    if (!co)
+        goto wind;
+
+    /* record 1: fop number */
+    CHANGLOG_FILL_FOP_NUMBER(co, frame->root->op, fop_fn, xtra_len);
+    co++;
+
+    /* record 2: mode as passed by the caller */
+    CHANGELOG_FILL_UINT32(co, mode, number_fn, xtra_len);
+    co++;
+
+    /* record 3: uid of the caller */
+    CHANGELOG_FILL_UINT32(co, frame->root->uid, number_fn, xtra_len);
+    co++;
+
+    /* record 4: gid of the caller */
+    CHANGELOG_FILL_UINT32(co, frame->root->gid, number_fn, xtra_len);
+    co++;
+
+    /* record 5: parent gfid + basename */
+    CHANGELOG_FILL_ENTRY(co, loc->pargfid, loc->name, entry_fn, entry_free_fn,
+                         xtra_len, wind);
+
+    changelog_set_usable_record_and_length(frame->local, xtra_len, 5);
+
+    /* barrier state is read, and acted upon, under priv->lock */
+    LOCK(&priv->lock);
+    {
+        if ((barrier_enabled = priv->barrier_enabled)) {
+            stub = fop_create_stub(frame, changelog_create_resume, loc, flags,
+                                   mode, umask, fd, xdata);
+            if (!stub)
+                __chlog_barrier_disable(this, &queue);
+            else
+                __chlog_barrier_enqueue(this, stub);
+        } else {
+            ((changelog_local_t *)frame->local)->color = priv->current_color;
+            changelog_inc_fop_cnt(this, priv, frame->local);
+        }
+    }
+    UNLOCK(&priv->lock);
+
+    /* parked behind the barrier: changelog_create_resume winds it later */
+    if (barrier_enabled && stub) {
+        gf_msg_debug(this->name, 0, "Enqueued create");
+        goto out;
+    }
+
+    /* stub creation failed: release what was queued, then wind this fop */
+    if (barrier_enabled && !stub) {
+        gf_smsg(this->name, GF_LOG_ERROR, ENOMEM,
+                CHANGELOG_MSG_BARRIER_FOP_FAILED, "fop=create", NULL);
+        chlog_barrier_dequeue_all(this, &queue);
+    }
+
+wind:
+    STACK_WIND(frame, changelog_create_cbk, FIRST_CHILD(this),
+               FIRST_CHILD(this)->fops->create, loc, flags, mode, umask, fd,
+               xdata);
+out:
+    return 0;
+}
+
+/* }}} */
+
+/* Metadata modification fops - TYPE II */
+
+/* {{{ */
+
+/* {f}setattr */
+
+/*
+ * Callback for the fsetattr fop.
+ *
+ * Records a METADATA changelog update unless the fop failed (op_ret < 0)
+ * or the frame carries no changelog local.  Always calls
+ * changelog_dec_fop_cnt and unwinds the reply unchanged.  Returns 0.
+ */
+int32_t
+changelog_fsetattr_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+                       int32_t op_ret, int32_t op_errno,
+                       struct iatt *preop_stbuf, struct iatt *postop_stbuf,
+                       dict_t *xdata)
+{
+    changelog_local_t *local = frame->local;
+    changelog_priv_t *priv = this->private;
+
+    /* nothing to record without a local or for a failed fop */
+    CHANGELOG_COND_GOTO(priv, (!local || (op_ret < 0)), unwind);
+
+    changelog_update(this, priv, local, CHANGELOG_TYPE_METADATA);
+
+unwind:
+    changelog_dec_fop_cnt(this, priv, local);
+    CHANGELOG_STACK_UNWIND(fsetattr, frame, op_ret, op_errno, preop_stbuf,
+                           postop_stbuf, xdata);
+
+    return 0;
+}
+
+/*
+ * fsetattr fop entry point.
+ *
+ * When the changelog is active and the fop number passes the boundary
+ * check, attaches a changelog local keyed on the fd's inode gfid and
+ * fills a single record holding the fop number.  The fop is then
+ * coloured/counted and wound to the first child regardless of whether a
+ * record could be prepared.  Always returns 0.
+ */
+int32_t
+changelog_fsetattr(call_frame_t *frame, xlator_t *this, fd_t *fd,
+                   struct iatt *stbuf, int32_t valid, dict_t *xdata)
+{
+    changelog_priv_t *priv = this->private;
+    changelog_opt_t *co = NULL;
+    size_t xtra_len = 0;
+
+    CHANGELOG_NOT_ACTIVE_THEN_GOTO(frame, priv, wind);
+    CHANGELOG_OP_BOUNDARY_CHECK(frame, wind);
+
+    CHANGELOG_INIT(this, frame->local, fd->inode, fd->inode->gfid, 1);
+
+    /* only look for a buffer when the local was actually set up */
+    if (frame->local != NULL)
+        co = changelog_get_usable_buffer(frame->local);
+
+    if (co != NULL) {
+        CHANGLOG_FILL_FOP_NUMBER(co, frame->root->op, fop_fn, xtra_len);
+        changelog_set_usable_record_and_length(frame->local, xtra_len, 1);
+    }
+
+wind:
+    changelog_color_fop_and_inc_cnt(this, priv, frame->local);
+    STACK_WIND(frame, changelog_fsetattr_cbk, FIRST_CHILD(this),
+               FIRST_CHILD(this)->fops->fsetattr, fd, stbuf, valid, xdata);
+    return 0;
+}
+
+/*
+ * Callback for the setattr fop.
+ *
+ * Records a METADATA changelog update unless the fop failed (op_ret < 0)
+ * or the frame carries no changelog local.  Always calls
+ * changelog_dec_fop_cnt and unwinds the reply unchanged.  Returns 0.
+ */
+int32_t
+changelog_setattr_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+                      int32_t op_ret, int32_t op_errno,
+                      struct iatt *preop_stbuf, struct iatt *postop_stbuf,
+                      dict_t *xdata)
+{
+    changelog_local_t *local = frame->local;
+    changelog_priv_t *priv = this->private;
+
+    /* nothing to record without a local or for a failed fop */
+    CHANGELOG_COND_GOTO(priv, (!local || (op_ret < 0)), unwind);
+
+    changelog_update(this, priv, local, CHANGELOG_TYPE_METADATA);
+
+unwind:
+    changelog_dec_fop_cnt(this, priv, local);
+    CHANGELOG_STACK_UNWIND(setattr, frame, op_ret, op_errno, preop_stbuf,
+                           postop_stbuf, xdata);
+
+    return 0;
+}
+
+/*
+ * setattr fop entry point.
+ *
+ * Skips recording when the changelog is inactive, for internal fops, for
+ * the shard root (loc->gfid equal to SHARD_ROOT_GFID) and when the fop
+ * number fails the boundary check.  Otherwise attaches a changelog local
+ * keyed on the inode gfid and fills a single record holding the fop
+ * number.  The fop is coloured/counted and wound to the first child in
+ * every case.  Always returns 0.
+ */
+int32_t
+changelog_setattr(call_frame_t *frame, xlator_t *this, loc_t *loc,
+                  struct iatt *stbuf, int32_t valid, dict_t *xdata)
+{
+    changelog_priv_t *priv = this->private;
+    changelog_opt_t *co = NULL;
+    size_t xtra_len = 0;
+    uuid_t shard_root_gfid = {
+        0,
+    };
+
+    CHANGELOG_NOT_ACTIVE_THEN_GOTO(frame, priv, wind);
+    CHANGELOG_IF_INTERNAL_FOP_THEN_GOTO(frame, xdata, wind);
+
+    /* Do not record META on .shard */
+    gf_uuid_parse(SHARD_ROOT_GFID, shard_root_gfid);
+    if (!gf_uuid_compare(loc->gfid, shard_root_gfid))
+        goto wind;
+
+    CHANGELOG_OP_BOUNDARY_CHECK(frame, wind);
+
+    CHANGELOG_INIT(this, frame->local, loc->inode, loc->inode->gfid, 1);
+
+    /* only look for a buffer when the local was actually set up */
+    if (frame->local != NULL)
+        co = changelog_get_usable_buffer(frame->local);
+
+    if (co != NULL) {
+        CHANGLOG_FILL_FOP_NUMBER(co, frame->root->op, fop_fn, xtra_len);
+        changelog_set_usable_record_and_length(frame->local, xtra_len, 1);
+    }
+
+wind:
+    changelog_color_fop_and_inc_cnt(this, priv, frame->local);
+    STACK_WIND(frame, changelog_setattr_cbk, FIRST_CHILD(this),
+               FIRST_CHILD(this)->fops->setattr, loc, stbuf, valid, xdata);
+    return 0;
+}
+
+/* {f}removexattr */
+
+/*
+ * Callback for the fremovexattr fop.
+ *
+ * Records a METADATA_XATTR changelog update unless the fop failed
+ * (op_ret < 0) or the frame carries no changelog local.  Always calls
+ * changelog_dec_fop_cnt and unwinds the reply unchanged.  Returns 0.
+ */
+int32_t
+changelog_fremovexattr_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+                           int32_t op_ret, int32_t op_errno, dict_t *xdata)
+{
+    changelog_local_t *local = frame->local;
+    changelog_priv_t *priv = this->private;
+
+    /* nothing to record without a local or for a failed fop */
+    CHANGELOG_COND_GOTO(priv, (!local || (op_ret < 0)), unwind);
+
+    changelog_update(this, priv, local, CHANGELOG_TYPE_METADATA_XATTR);
+
+unwind:
+    changelog_dec_fop_cnt(this, priv, local);
+    CHANGELOG_STACK_UNWIND(fremovexattr, frame, op_ret, op_errno, xdata);
+
+    return 0;
+}
+
+/*
+ * fremovexattr fop entry point.
+ *
+ * When the changelog is active and the fop number passes the boundary
+ * check, attaches a changelog local keyed on the fd's inode gfid and, if
+ * a usable buffer is available, fills a single record holding the fop
+ * number.  The fop is coloured/counted and wound to the first child in
+ * every case.  Always returns 0.
+ */
+int32_t
+changelog_fremovexattr(call_frame_t *frame, xlator_t *this, fd_t *fd,
+                       const char *name, dict_t *xdata)
+{
+    changelog_priv_t *priv = this->private;
+    changelog_opt_t *co = NULL;
+    size_t xtra_len = 0;
+
+    CHANGELOG_NOT_ACTIVE_THEN_GOTO(frame, priv, wind);
+    CHANGELOG_OP_BOUNDARY_CHECK(frame, wind);
+
+    CHANGELOG_INIT(this, frame->local, fd->inode, fd->inode->gfid, 1);
+
+    co = changelog_get_usable_buffer(frame->local);
+    if (co != NULL) {
+        CHANGLOG_FILL_FOP_NUMBER(co, frame->root->op, fop_fn, xtra_len);
+        changelog_set_usable_record_and_length(frame->local, xtra_len, 1);
+    }
+
+wind:
+    changelog_color_fop_and_inc_cnt(this, priv, frame->local);
+    STACK_WIND(frame, changelog_fremovexattr_cbk, FIRST_CHILD(this),
+               FIRST_CHILD(this)->fops->fremovexattr, fd, name, xdata);
+    return 0;
+}
+
+/*
+ * Callback for the removexattr fop.
+ *
+ * Records a METADATA_XATTR changelog update unless the fop failed
+ * (op_ret < 0) or the frame carries no changelog local.  Always calls
+ * changelog_dec_fop_cnt and unwinds the reply unchanged.  Returns 0.
+ */
+int32_t
+changelog_removexattr_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+                          int32_t op_ret, int32_t op_errno, dict_t *xdata)
+{
+    changelog_local_t *local = frame->local;
+    changelog_priv_t *priv = this->private;
+
+    /* nothing to record without a local or for a failed fop */
+    CHANGELOG_COND_GOTO(priv, (!local || (op_ret < 0)), unwind);
+
+    changelog_update(this, priv, local, CHANGELOG_TYPE_METADATA_XATTR);
+
+unwind:
+    changelog_dec_fop_cnt(this, priv, local);
+    CHANGELOG_STACK_UNWIND(removexattr, frame, op_ret, op_errno, xdata);
+
+    return 0;
+}
+
+/*
+ * removexattr fop entry point.
+ *
+ * When the changelog is active and the fop number passes the boundary
+ * check, attaches a changelog local keyed on the inode gfid and, if a
+ * usable buffer is available, fills a single record holding the fop
+ * number.  The fop is coloured/counted and wound to the first child in
+ * every case.  Always returns 0.
+ */
+int32_t
+changelog_removexattr(call_frame_t *frame, xlator_t *this, loc_t *loc,
+                      const char *name, dict_t *xdata)
+{
+    changelog_priv_t *priv = this->private;
+    changelog_opt_t *co = NULL;
+    size_t xtra_len = 0;
+
+    CHANGELOG_NOT_ACTIVE_THEN_GOTO(frame, priv, wind);
+    CHANGELOG_OP_BOUNDARY_CHECK(frame, wind);
+
+    CHANGELOG_INIT(this, frame->local, loc->inode, loc->inode->gfid, 1);
+
+    co = changelog_get_usable_buffer(frame->local);
+    if (co != NULL) {
+        CHANGLOG_FILL_FOP_NUMBER(co, frame->root->op, fop_fn, xtra_len);
+        changelog_set_usable_record_and_length(frame->local, xtra_len, 1);
+    }
+
+wind:
+    changelog_color_fop_and_inc_cnt(this, priv, frame->local);
+    STACK_WIND(frame, changelog_removexattr_cbk, FIRST_CHILD(this),
+               FIRST_CHILD(this)->fops->removexattr, loc, name, xdata);
+    return 0;
+}
+
+/* {f}setxattr */
+
+/*
+ * Callback for the setxattr fop.
+ *
+ * Records a METADATA_XATTR changelog update unless the fop failed
+ * (op_ret < 0) or the frame carries no changelog local.  Always calls
+ * changelog_dec_fop_cnt and unwinds the reply unchanged.  Returns 0.
+ */
+int32_t
+changelog_setxattr_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+                       int32_t op_ret, int32_t op_errno, dict_t *xdata)
+{
+    changelog_local_t *local = frame->local;
+    changelog_priv_t *priv = this->private;
+
+    /* nothing to record without a local or for a failed fop */
+    CHANGELOG_COND_GOTO(priv, (!local || (op_ret < 0)), unwind);
+
+    changelog_update(this, priv, local, CHANGELOG_TYPE_METADATA_XATTR);
+
+unwind:
+    changelog_dec_fop_cnt(this, priv, local);
+    CHANGELOG_STACK_UNWIND(setxattr, frame, op_ret, op_errno, xdata);
+
+    return 0;
+}
+
+/* changelog_handle_virtual_xattr:
+ *         Handles virtual setxattr 'glusterfs.geo-rep.trigger-sync'.
+ *         The fop is answered here and never wound to the child.
+ *
+ *         Behaviour based on the value of the xattr:
+ *                         1: on a regular file or directory, tries to
+ *                            capture an ENTRY record (skipped, with an
+ *                            INFO log, if the entry buffer cannot be
+ *                            filled) and, for non-directories, a DATA
+ *                            record.  The fop is unwound with success.
+ *           any other value
+ *           or inode type:   ENOTSUP is returned.
+ *
+ *         If the caller could not set up frame->local, nothing can be
+ *         recorded and ENOMEM is returned.
+ */
+static void
+changelog_handle_virtual_xattr(call_frame_t *frame, xlator_t *this, loc_t *loc,
+                               dict_t *dict)
+{
+    changelog_priv_t *priv = NULL;
+    changelog_local_t *local = NULL;
+    int32_t value = 0;
+    int ret = 0;
+    int dict_ret = 0;
+    gf_boolean_t valid = _gf_false;
+
+    priv = this->private;
+    GF_ASSERT(priv);
+
+    dict_ret = dict_get_int32(dict, GF_XATTR_TRIGGER_SYNC, &value);
+
+    if ((dict_ret == 0 && value == 1) && ((loc->inode->ia_type == IA_IFDIR) ||
+                                          (loc->inode->ia_type == IA_IFREG)))
+        valid = _gf_true;
+
+    if (!valid) {
+        CHANGELOG_STACK_UNWIND(setxattr, frame, -1, ENOTSUP, NULL);
+        return;
+    }
+
+    /* CHANGELOG_INIT in the caller can leave frame->local NULL (sibling
+     * fops check for exactly that); without it there is nothing to record
+     * into and the prev_entry assignment below would dereference NULL. */
+    if (!frame->local) {
+        CHANGELOG_STACK_UNWIND(setxattr, frame, -1, ENOMEM, NULL);
+        return;
+    }
+
+    ret = changelog_fill_entry_buf(frame, this, loc, &local);
+    if (ret) {
+        /* no ENTRY record; fall through and still capture DATA */
+        gf_smsg(this->name, GF_LOG_INFO, 0, CHANGELOG_MSG_ENTRY_BUF_INFO,
+                "gfid=%s", uuid_utoa(loc->inode->gfid), NULL);
+    } else {
+        changelog_update(this, priv, local, CHANGELOG_TYPE_ENTRY);
+    }
+
+    /* Capture DATA only if it's a file. */
+    if (loc->inode->ia_type != IA_IFDIR)
+        changelog_update(this, priv, frame->local, CHANGELOG_TYPE_DATA);
+
+    /* Assign local to prev_entry, so unwind will take
+     * care of cleanup. */
+    ((changelog_local_t *)(frame->local))->prev_entry = local;
+    CHANGELOG_STACK_UNWIND(setxattr, frame, 0, 0, NULL);
+}
+
+/*
+ * setxattr fop entry point.
+ *
+ * When the changelog is active and the fop number passes the boundary
+ * check, attaches a changelog local keyed on the inode gfid.  A request
+ * carrying GF_XATTR_TRIGGER_SYNC is handed to
+ * changelog_handle_virtual_xattr, which answers it itself (nothing is
+ * wound).  Otherwise a single record holding the fop number is filled
+ * when a usable buffer is available, and the fop is coloured/counted and
+ * wound to the first child.  Always returns 0.
+ */
+int32_t
+changelog_setxattr(call_frame_t *frame, xlator_t *this, loc_t *loc,
+                   dict_t *dict, int32_t flags, dict_t *xdata)
+{
+    changelog_priv_t *priv = this->private;
+    changelog_opt_t *co = NULL;
+    size_t xtra_len = 0;
+
+    CHANGELOG_NOT_ACTIVE_THEN_GOTO(frame, priv, wind);
+    CHANGELOG_OP_BOUNDARY_CHECK(frame, wind);
+
+    CHANGELOG_INIT(this, frame->local, loc->inode, loc->inode->gfid, 1);
+
+    /* On setting this virtual xattr on a file, an explicit data
+     * sync is triggered from geo-rep as CREATE|DATA entry is
+     * recorded in changelog based on xattr value.
+     */
+    if (dict_get(dict, GF_XATTR_TRIGGER_SYNC) != NULL) {
+        changelog_handle_virtual_xattr(frame, this, loc, dict);
+        return 0;
+    }
+
+    co = changelog_get_usable_buffer(frame->local);
+    if (co != NULL) {
+        CHANGLOG_FILL_FOP_NUMBER(co, frame->root->op, fop_fn, xtra_len);
+        changelog_set_usable_record_and_length(frame->local, xtra_len, 1);
+    }
+
+wind:
+    changelog_color_fop_and_inc_cnt(this, priv, frame->local);
+    STACK_WIND(frame, changelog_setxattr_cbk, FIRST_CHILD(this),
+               FIRST_CHILD(this)->fops->setxattr, loc, dict, flags, xdata);
+    return 0;
+}
+
+/*
+ * Callback for the fsetxattr fop.
+ *
+ * Records a METADATA_XATTR changelog update unless the fop failed
+ * (op_ret < 0) or the frame carries no changelog local.  Always calls
+ * changelog_dec_fop_cnt and unwinds the reply unchanged.  Returns 0.
+ */
+int32_t
+changelog_fsetxattr_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+                        int32_t op_ret, int32_t op_errno, dict_t *xdata)
+{
+    changelog_local_t *local = frame->local;
+    changelog_priv_t *priv = this->private;
+
+    /* nothing to record without a local or for a failed fop */
+    CHANGELOG_COND_GOTO(priv, (!local || (op_ret < 0)), unwind);
+
+    changelog_update(this, priv, local, CHANGELOG_TYPE_METADATA_XATTR);
+
+unwind:
+    changelog_dec_fop_cnt(this, priv, local);
+    CHANGELOG_STACK_UNWIND(fsetxattr, frame, op_ret, op_errno, xdata);
+
+    return 0;
+}
+
+/*
+ * fsetxattr fop entry point.
+ *
+ * Skips recording when the changelog is inactive, for internal fops and
+ * when the fop number fails the boundary check.  Otherwise attaches a
+ * changelog local keyed on the fd's inode gfid and, if a usable buffer is
+ * available, fills a single record holding the fop number.  The fop is
+ * coloured/counted and wound to the first child in every case.
+ * Always returns 0.
+ */
+int32_t
+changelog_fsetxattr(call_frame_t *frame, xlator_t *this, fd_t *fd, dict_t *dict,
+                    int32_t flags, dict_t *xdata)
+{
+    changelog_priv_t *priv = this->private;
+    changelog_opt_t *co = NULL;
+    size_t xtra_len = 0;
+
+    CHANGELOG_NOT_ACTIVE_THEN_GOTO(frame, priv, wind);
+    CHANGELOG_IF_INTERNAL_FOP_THEN_GOTO(frame, xdata, wind);
+    CHANGELOG_OP_BOUNDARY_CHECK(frame, wind);
+
+    CHANGELOG_INIT(this, frame->local, fd->inode, fd->inode->gfid, 1);
+
+    co = changelog_get_usable_buffer(frame->local);
+    if (co != NULL) {
+        CHANGLOG_FILL_FOP_NUMBER(co, frame->root->op, fop_fn, xtra_len);
+        changelog_set_usable_record_and_length(frame->local, xtra_len, 1);
+    }
+
+wind:
+    changelog_color_fop_and_inc_cnt(this, priv, frame->local);
+    STACK_WIND(frame, changelog_fsetxattr_cbk, FIRST_CHILD(this),
+               FIRST_CHILD(this)->fops->fsetxattr, fd, dict, flags, xdata);
+    return 0;
+}
+
+/*
+ * Callback for the xattrop fop.
+ *
+ * Records a METADATA changelog update unless the fop failed (op_ret < 0)
+ * or the frame carries no changelog local.  Always calls
+ * changelog_dec_fop_cnt and unwinds the reply unchanged.  Returns 0.
+ *
+ * NOTE(review): changelog_fxattrop_cbk records
+ * CHANGELOG_TYPE_METADATA_XATTR while this callback records
+ * CHANGELOG_TYPE_METADATA - confirm the difference is intentional.
+ */
+int32_t
+changelog_xattrop_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+                      int32_t op_ret, int32_t op_errno, dict_t *xattr,
+                      dict_t *xdata)
+{
+    changelog_local_t *local = frame->local;
+    changelog_priv_t *priv = this->private;
+
+    /* nothing to record without a local or for a failed fop */
+    CHANGELOG_COND_GOTO(priv, (!local || (op_ret < 0)), unwind);
+
+    changelog_update(this, priv, local, CHANGELOG_TYPE_METADATA);
+
+unwind:
+    changelog_dec_fop_cnt(this, priv, local);
+    CHANGELOG_STACK_UNWIND(xattrop, frame, op_ret, op_errno, xattr, xdata);
+
+    return 0;
+}
+
+/**
+ * xattrop fop: a changelog record is prepared only when @xattr carries the
+ * GF_XATTR_SHARD_FILE_SIZE key; every other xattrop is wound to the first
+ * child without one.
+ */
+int32_t
+changelog_xattrop(call_frame_t *frame, xlator_t *this, loc_t *loc,
+                  gf_xattrop_flags_t optype, dict_t *xattr, dict_t *xdata)
+{
+    changelog_priv_t *priv = NULL;
+    changelog_opt_t *co = NULL;
+    size_t xtra_len = 0;
+    int ret = 0;
+    void *size_attr = NULL;
+
+    priv = this->private;
+    CHANGELOG_NOT_ACTIVE_THEN_GOTO(frame, priv, wind);
+    /* size_attr is only an out-parameter; just the lookup result matters */
+    ret = dict_get_ptr(xattr, GF_XATTR_SHARD_FILE_SIZE, &size_attr);
+    if (ret)
+        goto wind;
+
+    CHANGELOG_OP_BOUNDARY_CHECK(frame, wind);
+
+    /* sets up frame->local for this inode/gfid (macro body not visible here) */
+    CHANGELOG_INIT(this, frame->local, loc->inode, loc->inode->gfid, 1);
+
+    co = changelog_get_usable_buffer(frame->local);
+    if (!co)
+        goto wind;
+
+    CHANGLOG_FILL_FOP_NUMBER(co, frame->root->op, fop_fn, xtra_len);
+
+    changelog_set_usable_record_and_length(frame->local, xtra_len, 1);
+
+wind:
+    /* reached on every path, including the early exits above */
+    changelog_color_fop_and_inc_cnt(this, priv, frame->local);
+    STACK_WIND(frame, changelog_xattrop_cbk, FIRST_CHILD(this),
+               FIRST_CHILD(this)->fops->xattrop, loc, optype, xattr, xdata);
+    return 0;
+}
+
+/**
+ * fxattrop callback: records an xattr metadata change unless the
+ * CHANGELOG_COND_GOTO check skips it, then drops the fop count and unwinds.
+ */
+int32_t
+changelog_fxattrop_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+                       int32_t op_ret, int32_t op_errno, dict_t *xattr,
+                       dict_t *xdata)
+{
+    changelog_priv_t *priv = this->private;
+    changelog_local_t *local = frame->local;
+
+    /* nothing to record when the fop failed or no local was set up */
+    CHANGELOG_COND_GOTO(priv, ((op_ret < 0) || !local), unwind);
+
+    changelog_update(this, priv, local, CHANGELOG_TYPE_METADATA_XATTR);
+
+unwind:
+    changelog_dec_fop_cnt(this, priv, local);
+    CHANGELOG_STACK_UNWIND(fxattrop, frame, op_ret, op_errno, xattr, xdata);
+
+    return 0;
+}
+
+/**
+ * fxattrop fop: fd-based counterpart of changelog_xattrop(). A changelog
+ * record is prepared only when @xattr carries the GF_XATTR_SHARD_FILE_SIZE
+ * key; every other fxattrop is wound to the first child without one.
+ */
+int32_t
+changelog_fxattrop(call_frame_t *frame, xlator_t *this, fd_t *fd,
+                   gf_xattrop_flags_t optype, dict_t *xattr, dict_t *xdata)
+{
+    changelog_priv_t *priv = NULL;
+    changelog_opt_t *co = NULL;
+    size_t xtra_len = 0;
+    void *size_attr = NULL;
+    int ret = 0;
+
+    priv = this->private;
+    CHANGELOG_NOT_ACTIVE_THEN_GOTO(frame, priv, wind);
+    /* size_attr is only an out-parameter; just the lookup result matters */
+    ret = dict_get_ptr(xattr, GF_XATTR_SHARD_FILE_SIZE, &size_attr);
+    if (ret)
+        goto wind;
+
+    CHANGELOG_OP_BOUNDARY_CHECK(frame, wind);
+
+    /* sets up frame->local for this inode/gfid (macro body not visible here) */
+    CHANGELOG_INIT(this, frame->local, fd->inode, fd->inode->gfid, 1);
+
+    co = changelog_get_usable_buffer(frame->local);
+    if (!co)
+        goto wind;
+
+    CHANGLOG_FILL_FOP_NUMBER(co, frame->root->op, fop_fn, xtra_len);
+
+    changelog_set_usable_record_and_length(frame->local, xtra_len, 1);
+
+wind:
+    /* reached on every path, including the early exits above */
+    changelog_color_fop_and_inc_cnt(this, priv, frame->local);
+    STACK_WIND(frame, changelog_fxattrop_cbk, FIRST_CHILD(this),
+               FIRST_CHILD(this)->fops->fxattrop, fd, optype, xattr, xdata);
+    return 0;
+}
+/* }}} */
+
+/* Data modification fops - TYPE I */
+
+/* {{{ */
+
+/* {f}truncate() */
+
+/**
+ * truncate callback: records a data change unless the CHANGELOG_COND_GOTO
+ * check skips it, then drops the fop count and unwinds.
+ */
+int32_t
+changelog_truncate_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+                       int32_t op_ret, int32_t op_errno, struct iatt *prebuf,
+                       struct iatt *postbuf, dict_t *xdata)
+{
+    changelog_priv_t *priv = this->private;
+    changelog_local_t *local = frame->local;
+
+    /* nothing to record when the fop failed or no local was set up */
+    CHANGELOG_COND_GOTO(priv, ((op_ret < 0) || !local), unwind);
+
+    changelog_update(this, priv, local, CHANGELOG_TYPE_DATA);
+
+unwind:
+    changelog_dec_fop_cnt(this, priv, local);
+    CHANGELOG_STACK_UNWIND(truncate, frame, op_ret, op_errno, prebuf, postbuf,
+                           xdata);
+    return 0;
+}
+
+/**
+ * truncate fop: sets up the changelog local for the inode, logs the change
+ * to the snap changelog when one is open and the barrier is enabled, then
+ * winds to the first child.
+ */
+int32_t
+changelog_truncate(call_frame_t *frame, xlator_t *this, loc_t *loc,
+                   off_t offset, dict_t *xdata)
+{
+    changelog_priv_t *priv = this->private;
+
+    CHANGELOG_NOT_ACTIVE_THEN_GOTO(frame, priv, wind);
+
+    CHANGELOG_INIT(this, frame->local, loc->inode, loc->inode->gfid, 0);
+
+    LOCK(&priv->c_snap_lock);
+    {
+        if (priv->c_snap_fd != -1 && priv->barrier_enabled == _gf_true) {
+            changelog_local_t *local = frame->local;
+
+            changelog_snap_handle_ascii_change(this, &local->cld);
+        }
+    }
+    UNLOCK(&priv->c_snap_lock);
+
+wind:
+    changelog_color_fop_and_inc_cnt(this, priv, frame->local);
+    STACK_WIND(frame, changelog_truncate_cbk, FIRST_CHILD(this),
+               FIRST_CHILD(this)->fops->truncate, loc, offset, xdata);
+    return 0;
+}
+
+/**
+ * ftruncate callback: records a data change unless the CHANGELOG_COND_GOTO
+ * check skips it, then drops the fop count and unwinds.
+ */
+int32_t
+changelog_ftruncate_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+                        int32_t op_ret, int32_t op_errno, struct iatt *prebuf,
+                        struct iatt *postbuf, dict_t *xdata)
+{
+    changelog_priv_t *priv = this->private;
+    changelog_local_t *local = frame->local;
+
+    /* nothing to record when the fop failed or no local was set up */
+    CHANGELOG_COND_GOTO(priv, ((op_ret < 0) || !local), unwind);
+
+    changelog_update(this, priv, local, CHANGELOG_TYPE_DATA);
+
+unwind:
+    changelog_dec_fop_cnt(this, priv, local);
+    CHANGELOG_STACK_UNWIND(ftruncate, frame, op_ret, op_errno, prebuf, postbuf,
+                           xdata);
+    return 0;
+}
+
+/**
+ * ftruncate fop: sets up the changelog local for the fd's inode, logs the
+ * change to the snap changelog when one is open and the barrier is enabled,
+ * then winds to the first child.
+ */
+int32_t
+changelog_ftruncate(call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset,
+                    dict_t *xdata)
+{
+    changelog_priv_t *priv = this->private;
+
+    CHANGELOG_NOT_ACTIVE_THEN_GOTO(frame, priv, wind);
+
+    CHANGELOG_INIT(this, frame->local, fd->inode, fd->inode->gfid, 0);
+
+    LOCK(&priv->c_snap_lock);
+    {
+        if (priv->c_snap_fd != -1 && priv->barrier_enabled == _gf_true) {
+            changelog_local_t *local = frame->local;
+
+            changelog_snap_handle_ascii_change(this, &local->cld);
+        }
+    }
+    UNLOCK(&priv->c_snap_lock);
+
+wind:
+    changelog_color_fop_and_inc_cnt(this, priv, frame->local);
+    STACK_WIND(frame, changelog_ftruncate_cbk, FIRST_CHILD(this),
+               FIRST_CHILD(this)->fops->ftruncate, fd, offset, xdata);
+    return 0;
+}
+
+/* writev() */
+
+/**
+ * writev callback: records a data change unless the CHANGELOG_COND_GOTO
+ * check skips it, then drops the fop count and unwinds.
+ */
+int32_t
+changelog_writev_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+                     int32_t op_ret, int32_t op_errno, struct iatt *prebuf,
+                     struct iatt *postbuf, dict_t *xdata)
+{
+    changelog_priv_t *priv = this->private;
+    changelog_local_t *local = frame->local;
+
+    /* unlike the other callbacks, a zero op_ret is skipped as well */
+    CHANGELOG_COND_GOTO(priv, ((op_ret <= 0) || !local), unwind);
+
+    changelog_update(this, priv, local, CHANGELOG_TYPE_DATA);
+
+unwind:
+    changelog_dec_fop_cnt(this, priv, local);
+    CHANGELOG_STACK_UNWIND(writev, frame, op_ret, op_errno, prebuf, postbuf,
+                           xdata);
+    return 0;
+}
+
+/**
+ * writev fop: sets up the changelog local for the fd's inode, logs the
+ * change to the snap changelog when one is open and the barrier is enabled,
+ * then winds to the first child.
+ */
+int32_t
+changelog_writev(call_frame_t *frame, xlator_t *this, fd_t *fd,
+                 struct iovec *vector, int32_t count, off_t offset,
+                 uint32_t flags, struct iobref *iobref, dict_t *xdata)
+{
+    changelog_priv_t *priv = this->private;
+
+    CHANGELOG_NOT_ACTIVE_THEN_GOTO(frame, priv, wind);
+
+    CHANGELOG_INIT(this, frame->local, fd->inode, fd->inode->gfid, 0);
+
+    LOCK(&priv->c_snap_lock);
+    {
+        if (priv->c_snap_fd != -1 && priv->barrier_enabled == _gf_true) {
+            changelog_local_t *local = frame->local;
+
+            changelog_snap_handle_ascii_change(this, &local->cld);
+        }
+    }
+    UNLOCK(&priv->c_snap_lock);
+
+wind:
+    changelog_color_fop_and_inc_cnt(this, priv, frame->local);
+    STACK_WIND(frame, changelog_writev_cbk, FIRST_CHILD(this),
+               FIRST_CHILD(this)->fops->writev, fd, vector, count, offset,
+               flags, iobref, xdata);
+    return 0;
+}
+
+/* }}} */
+
+/* open, release and other beasts */
+
+/* {{{ */
+
+/**
+ * open callback: when changelog_open() tagged the frame and the
+ * CHANGELOG_COND_GOTO check does not skip it, dispatches an OPEN event for
+ * the fd and, if RELEASE events are selected, marks the fd context so that a
+ * release can be reported later. Always unwinds with the child's values.
+ */
+int
+changelog_open_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+                   int op_ret, int op_errno, fd_t *fd, dict_t *xdata)
+{
+    changelog_priv_t *priv = this->private;
+    gf_boolean_t logopen = _gf_false;
+    changelog_event_t ev = {
+        0,
+    };
+
+    /* frame->local is only a marker set by changelog_open(); clear it
+     * before unwinding and never dereference it */
+    if (frame->local) {
+        logopen = _gf_true;
+        frame->local = NULL;
+    }
+
+    CHANGELOG_COND_GOTO(priv, ((op_ret < 0) || !logopen), unwind);
+
+    /* describe the open and hand it to the event dispatcher */
+    ev.ev_type = CHANGELOG_OP_TYPE_OPEN;
+    ev.u.open.flags = fd->flags;
+    gf_uuid_copy(ev.u.open.gfid, fd->inode->gfid);
+    changelog_dispatch_event(this, priv, &ev);
+
+    if (changelog_ev_selected(this, &priv->ev_selection,
+                              CHANGELOG_OP_TYPE_RELEASE)) {
+        if (fd_ctx_set(fd, this, (uint64_t)(long)0x1))
+            gf_smsg(this->name, GF_LOG_WARNING, 0, CHANGELOG_MSG_SET_FD_CONTEXT,
+                    NULL);
+    }
+
+unwind:
+    CHANGELOG_STACK_UNWIND(open, frame, op_ret, op_errno, fd, xdata);
+    return 0;
+}
+
+/**
+ * open fop: tags the frame with a non-NULL marker (unless the
+ * CHANGELOG_NOT_ACTIVE_THEN_GOTO guard skips it) so that
+ * changelog_open_cbk() knows to report the open, then winds to the first
+ * child.
+ */
+int
+changelog_open(call_frame_t *frame, xlator_t *this, loc_t *loc, int flags,
+               fd_t *fd, dict_t *xdata)
+{
+    changelog_priv_t *priv = this->private;
+
+    CHANGELOG_NOT_ACTIVE_THEN_GOTO(frame, priv, wind);
+
+    /* marker value only: do not dereference in ->cbk */
+    frame->local = (void *)0x1;
+
+wind:
+    STACK_WIND(frame, changelog_open_cbk, FIRST_CHILD(this),
+               FIRST_CHILD(this)->fops->open, loc, flags, fd, xdata);
+    return 0;
+}
+
+/* }}} */
+
+/* {{{ */
+
+/* }}} */
+
+/**
+ * dict_foreach() visitor used by changelog_ipc(): treats the payload of each
+ * dictionary value as a changelog_event_t and dispatches it. @data is the
+ * xlator. Always returns 0.
+ */
+int32_t
+_changelog_generic_dispatcher(dict_t *dict, char *key, data_t *value,
+                              void *data)
+{
+    xlator_t *this = data;
+    changelog_priv_t *priv = this->private;
+    changelog_event_t *ev = (changelog_event_t *)value->data;
+
+    changelog_dispatch_event(this, priv, ev);
+    return 0;
+}
+
+/**
+ * changelog ipc: when @op targets the changelog translator, the events
+ * whose pointers are carried in @xdata are dispatched and the call is
+ * unwound with success; any other @op is wound to the first child.
+ * Dispatching follows no particular order (whatever order dict_foreach()
+ * traverses the dictionary in).
+ */
+int32_t
+changelog_ipc(call_frame_t *frame, xlator_t *this, int32_t op, dict_t *xdata)
+{
+    if (op != GF_IPC_TARGET_CHANGELOG) {
+        /* not addressed to us: pass it down untouched */
+        STACK_WIND(frame, default_ipc_cbk, FIRST_CHILD(this),
+                   FIRST_CHILD(this)->fops->ipc, op, xdata);
+        return 0;
+    }
+
+    /* it's for us, do the job */
+    if (xdata)
+        (void)dict_foreach(xdata, _changelog_generic_dispatcher, this);
+
+    STACK_UNWIND_STRICT(ipc, frame, 0, 0, NULL);
+    return 0;
+}
+
+/* {{{ */
+
+/**
+ * release: dispatches a RELEASE event carrying the gfid of the fd's inode
+ * and removes this translator's context from the fd. Always returns 0.
+ */
+int32_t
+changelog_release(xlator_t *this, fd_t *fd)
+{
+    changelog_priv_t *priv = this->private;
+    changelog_event_t ev = {
+        0,
+    };
+
+    ev.ev_type = CHANGELOG_OP_TYPE_RELEASE;
+    gf_uuid_copy(ev.u.release.gfid, fd->inode->gfid);
+    changelog_dispatch_event(this, priv, &ev);
+
+    /* the stored context value is not needed, hence the NULL out-param */
+    (void)fd_ctx_del(fd, this, NULL);
+
+    return 0;
+}
+
+/* }}} */
+
+/**
+ * The
+ *   - @init ()
+ *   - @fini ()
+ *   - @reconfigure ()
+ *   ... and helper routines
+ */
+
+/**
+ * Set the operation mode from its option string. Only "realtime" is
+ * recognised today; kept as a helper in case more operation modes are added
+ * in the future. Only the first 8 characters are compared, and an
+ * unrecognised string leaves priv->op_mode untouched.
+ */
+static void
+changelog_assign_opmode(changelog_priv_t *priv, char *mode)
+{
+    if (strncmp(mode, "realtime", 8) != 0)
+        return;
+
+    priv->op_mode = CHANGELOG_MODE_RT;
+}
+
+/**
+ * Set the encoding mode from its option string: a string starting with
+ * "binary" selects binary encoding, one starting with "ascii" selects ASCII.
+ * Anything else leaves priv->encode_mode untouched.
+ */
+static void
+changelog_assign_encoding(changelog_priv_t *priv, char *enc)
+{
+    if (!strncmp(enc, "binary", 6)) {
+        priv->encode_mode = CHANGELOG_ENCODE_BINARY;
+        return;
+    }
+
+    if (!strncmp(enc, "ascii", 5))
+        priv->encode_mode = CHANGELOG_ENCODE_ASCII;
+}
+
+/**
+ * Store the barrier timeout (the seconds field of priv->timeout) while
+ * holding priv->lock.
+ */
+static void
+changelog_assign_barrier_timeout(changelog_priv_t *priv, uint32_t timeout)
+{
+    LOCK(&priv->lock);
+    priv->timeout.tv_sec = timeout;
+    UNLOCK(&priv->lock);
+}
+
+/**
+ * Clean up whichever helper threads (rollover, fsync) are currently running
+ * and clear their handles so that a second call does nothing.
+ */
+static void
+changelog_cleanup_helper_threads(xlator_t *this, changelog_priv_t *priv)
+{
+    /* rollover thread */
+    if (priv->cr.rollover_th != 0) {
+        (void)changelog_thread_cleanup(this, priv->cr.rollover_th);
+        priv->cr.rollover_th = 0;
+    }
+
+    /* fsync thread */
+    if (priv->cf.fsync_th != 0) {
+        (void)changelog_thread_cleanup(this, priv->cf.fsync_th);
+        priv->cf.fsync_th = 0;
+    }
+}
+
+/**
+ * Spawn the helper threads: the rollover thread always, the fsync thread
+ * only when an fsync interval is configured. If the fsync thread cannot be
+ * created, any helper thread already running is cleaned up again.
+ *
+ * Returns 0 on success, otherwise the gf_thread_create() error.
+ */
+static int
+changelog_spawn_helper_threads(xlator_t *this, changelog_priv_t *priv)
+{
+    int ret = 0;
+
+    /* Geo-Rep snapshot dependency:
+     *
+     * To implement explicit rollover of the changelog journal on barrier
+     * notification, a pipe is created to communicate between the
+     * 'changelog_rollover' thread and the changelog main thread. The
+     * select call used to wait till roll-over time in the
+     * changelog_rollover thread is modified to wait on the read end of
+     * the pipe. When a barrier notification comes (i.e, in
+     * 'reconfigure'), select in the changelog_rollover thread is woken up
+     * explicitly by writing into the write end of the pipe in
+     * 'reconfigure'.
+     */
+
+    priv->cr.notify = _gf_false;
+    priv->cr.this = this;
+    ret = gf_thread_create(&priv->cr.rollover_th, NULL, changelog_rollover,
+                           priv, "clogro");
+    if (ret)
+        return ret;
+
+    /* no fsync interval configured: the rollover thread is all we need */
+    if (!priv->fsync_interval)
+        return 0;
+
+    priv->cf.this = this;
+    ret = gf_thread_create(&priv->cf.fsync_th, NULL, changelog_fsync_thread,
+                           priv, "clogfsyn");
+    if (ret)
+        changelog_cleanup_helper_threads(this, priv);
+
+    return ret;
+}
+
+/**
+ * Translator notification entry point.
+ *
+ *  - GF_EVENT_PARENT_DOWN: when rpc is active, tears down the rpc listener
+ *    and client connections; once no transports or clients remain it also
+ *    destroys the rpc service, unlinks the notification socket and forwards
+ *    the event (at most once, guarded by priv->notify_down). When rpc is not
+ *    active the event is forwarded straight away.
+ *  - GF_EVENT_TRANSLATOR_OP: reads the "barrier" boolean from the dict in
+ *    @data and turns the changelog barrier on or off.
+ *  - anything else is handed to default_notify().
+ *
+ * Returns 0 on success or when there is no private data; -1 (or a callee's
+ * error) otherwise.
+ */
+int
+notify(xlator_t *this, int event, void *data, ...)
+{
+    changelog_priv_t *priv = NULL;
+    dict_t *dict = NULL;
+    char buf[1] = {1};
+    int barrier = DICT_DEFAULT;
+    gf_boolean_t bclean_req = _gf_false;
+    int ret = 0;
+    int ret1 = 0;
+    struct list_head queue = {
+        0,
+    };
+    uint64_t xprtcnt = 0;
+    uint64_t clntcnt = 0;
+    changelog_clnt_t *conn = NULL;
+    gf_boolean_t cleanup_notify = _gf_false;
+    char sockfile[UNIX_PATH_MAX] = {
+        0,
+    };
+    rpcsvc_listener_t *listener = NULL;
+    rpcsvc_listener_t *next = NULL;
+
+    INIT_LIST_HEAD(&queue);
+
+    priv = this->private;
+    if (!priv)
+        goto out;
+
+    if (event == GF_EVENT_PARENT_DOWN) {
+        /* @data is used as an xlator here (its ->name is logged below) */
+        priv->victim = data;
+        gf_log(this->name, GF_LOG_INFO,
+               "cleanup changelog rpc connection of brick %s",
+               priv->victim->name);
+
+        if (priv->rpc_active) {
+            this->cleanup_starting = 1;
+            changelog_destroy_rpc_listner(this, priv);
+            /* address of a member: this check can never fail */
+            conn = &priv->connections;
+            if (conn)
+                changelog_ev_cleanup_connections(this, conn);
+            xprtcnt = GF_ATOMIC_GET(priv->xprtcnt);
+            clntcnt = GF_ATOMIC_GET(priv->clntcnt);
+            /* NOTE(review): when transports or clients are still counted
+             * nothing is forwarded from here -- presumably the last
+             * disconnect path does it; confirm */
+            if (!xprtcnt && !clntcnt) {
+                LOCK(&priv->lock);
+                {
+                    /* remember whether PARENT_DOWN was already forwarded */
+                    cleanup_notify = priv->notify_down;
+                    priv->notify_down = _gf_true;
+                }
+                UNLOCK(&priv->lock);
+                if (priv->rpc) {
+                    list_for_each_entry_safe(listener, next,
+                                             &priv->rpc->listeners, list)
+                    {
+                        if (listener->trans) {
+                            rpc_transport_unref(listener->trans);
+                        }
+                    }
+                    rpcsvc_destroy(priv->rpc);
+                    priv->rpc = NULL;
+                }
+                CHANGELOG_MAKE_SOCKET_PATH(priv->changelog_brick, sockfile,
+                                           UNIX_PATH_MAX);
+                sys_unlink(sockfile);
+                if (!cleanup_notify)
+                    default_notify(this, GF_EVENT_PARENT_DOWN, data);
+            }
+        } else {
+            default_notify(this, GF_EVENT_PARENT_DOWN, data);
+        }
+        goto out;
+    }
+
+    if (event == GF_EVENT_TRANSLATOR_OP) {
+        dict = data;
+
+        barrier = dict_get_str_boolean(dict, "barrier", DICT_DEFAULT);
+
+        switch (barrier) {
+            case DICT_ERROR:
+                gf_smsg(this->name, GF_LOG_ERROR, 0,
+                        CHANGELOG_MSG_DICT_GET_FAILED, "dict_get_str_boolean",
+                        NULL);
+                ret = -1;
+                goto out;
+
+            case BARRIER_OFF:
+                gf_smsg(this->name, GF_LOG_INFO, 0,
+                        CHANGELOG_MSG_BARRIER_STATE_NOTIFY, "off", NULL);
+
+                /* may jump to 'out' (macro body not visible here) */
+                CHANGELOG_NOT_ON_THEN_GOTO(priv, ret, out);
+                LOCK(&priv->c_snap_lock);
+                {
+                    changelog_snap_logging_stop(this, priv);
+                }
+                UNLOCK(&priv->c_snap_lock);
+
+                /* an "off" without a preceding external "on" is an error */
+                LOCK(&priv->bflags.lock);
+                {
+                    if (priv->bflags.barrier_ext == _gf_false)
+                        ret = -1;
+                }
+                UNLOCK(&priv->bflags.lock);
+
+                if (ret == -1) {
+                    gf_smsg(this->name, GF_LOG_ERROR, 0,
+                            CHANGELOG_MSG_BARRIER_ERROR, NULL);
+                    goto out;
+                }
+
+                /* Stop changelog barrier and dequeue all fops */
+                LOCK(&priv->lock);
+                {
+                    if (priv->barrier_enabled == _gf_true)
+                        __chlog_barrier_disable(this, &queue);
+                    else
+                        ret = -1;
+                }
+                UNLOCK(&priv->lock);
+                /* If ret = -1, then changelog barrier is already
+                 * disabled because of error or timeout.
+                 */
+                if (ret == 0) {
+                    chlog_barrier_dequeue_all(this, &queue);
+                    gf_smsg(this->name, GF_LOG_INFO, 0,
+                            CHANGELOG_MSG_BARRIER_DISABLED, NULL);
+                } else {
+                    gf_smsg(this->name, GF_LOG_ERROR, 0,
+                            CHANGELOG_MSG_BARRIER_ALREADY_DISABLED, NULL);
+                }
+
+                /* the external barrier request is over in either case */
+                LOCK(&priv->bflags.lock);
+                {
+                    priv->bflags.barrier_ext = _gf_false;
+                }
+                UNLOCK(&priv->bflags.lock);
+
+                goto out;
+
+            case BARRIER_ON:
+                gf_smsg(this->name, GF_LOG_INFO, 0,
+                        CHANGELOG_MSG_BARRIER_STATE_NOTIFY, "on", NULL);
+
+                /* may jump to 'out' (macro body not visible here) */
+                CHANGELOG_NOT_ON_THEN_GOTO(priv, ret, out);
+                LOCK(&priv->c_snap_lock);
+                {
+                    changelog_snap_logging_start(this, priv);
+                }
+                UNLOCK(&priv->c_snap_lock);
+
+                /* a second "on" while one is outstanding is an error */
+                LOCK(&priv->bflags.lock);
+                {
+                    if (priv->bflags.barrier_ext == _gf_true)
+                        ret = -1;
+                    else
+                        priv->bflags.barrier_ext = _gf_true;
+                }
+                UNLOCK(&priv->bflags.lock);
+
+                if (ret == -1) {
+                    gf_smsg(this->name, GF_LOG_ERROR, 0,
+                            CHANGELOG_MSG_BARRIER_ON_ERROR, NULL);
+                    goto out;
+                }
+
+                /* arm the flag that the wait loop further down blocks on */
+                ret = pthread_mutex_lock(&priv->bn.bnotify_mutex);
+                CHANGELOG_PTHREAD_ERROR_HANDLE_1(ret, out, bclean_req);
+                {
+                    priv->bn.bnotify = _gf_true;
+                }
+                ret = pthread_mutex_unlock(&priv->bn.bnotify_mutex);
+                CHANGELOG_PTHREAD_ERROR_HANDLE_1(ret, out, bclean_req);
+
+                /* Start changelog barrier */
+                LOCK(&priv->lock);
+                {
+                    ret = __chlog_barrier_enable(this, priv);
+                }
+                UNLOCK(&priv->lock);
+                if (ret == -1) {
+                    changelog_barrier_cleanup(this, priv, &queue);
+                    goto out;
+                }
+
+                gf_smsg(this->name, GF_LOG_INFO, 0,
+                        CHANGELOG_MSG_BARRIER_ENABLE, NULL);
+
+                /* request the explicit roll over */
+                ret = changelog_barrier_notify(priv, buf);
+                if (ret) {
+                    gf_smsg(this->name, GF_LOG_ERROR, 0,
+                            CHANGELOG_MSG_WRITE_FAILED, "Explicit roll over",
+                            NULL);
+                    changelog_barrier_cleanup(this, priv, &queue);
+                    ret = -1;
+                    goto out;
+                }
+
+                /* block until bnotify is cleared again */
+                ret = pthread_mutex_lock(&priv->bn.bnotify_mutex);
+                CHANGELOG_PTHREAD_ERROR_HANDLE_1(ret, out, bclean_req);
+                {
+                    /* The while condition check is required here to
+                     * handle spurious wakeup of cond wait that can
+                     * happen with pthreads. See man page */
+                    while (priv->bn.bnotify == _gf_true) {
+                        ret = pthread_cond_wait(&priv->bn.bnotify_cond,
+                                                &priv->bn.bnotify_mutex);
+                        CHANGELOG_PTHREAD_ERROR_HANDLE_1(ret, out, bclean_req);
+                    }
+                    if (priv->bn.bnotify_error == _gf_true) {
+                        ret = -1;
+                        priv->bn.bnotify_error = _gf_false;
+                    }
+                }
+                /* separate variable so the result above stays in 'ret' */
+                ret1 = pthread_mutex_unlock(&priv->bn.bnotify_mutex);
+                CHANGELOG_PTHREAD_ERROR_HANDLE_1(ret1, out, bclean_req);
+                gf_smsg(this->name, GF_LOG_INFO, 0,
+                        CHANGELOG_MSG_BNOTIFY_COND_INFO, NULL);
+
+                goto out;
+
+            case DICT_DEFAULT:
+                /* the "barrier" key was not present in the dict */
+                gf_smsg(this->name, GF_LOG_ERROR, 0,
+                        CHANGELOG_MSG_BARRIER_KEY_NOT_FOUND, NULL);
+                ret = -1;
+                goto out;
+
+            default:
+                gf_smsg(this->name, GF_LOG_ERROR, EINVAL,
+                        CHANGELOG_MSG_ERROR_IN_DICT_GET, NULL);
+                ret = -1;
+                goto out;
+        }
+    } else {
+        ret = default_notify(this, event, data);
+    }
+
+out:
+    /* bclean_req is passed to the pthread error macro above, which
+     * presumably sets it before jumping here -- confirm in the macro */
+    if (bclean_req)
+        changelog_barrier_cleanup(this, priv, &queue);
+
+    return ret;
+}
+
+/**
+ * Set up memory accounting for this translator.
+ *
+ * Returns -1 when @this is NULL, otherwise the result of
+ * xlator_mem_acct_init() (a warning is logged when that is non-zero).
+ */
+int32_t
+mem_acct_init(xlator_t *this)
+{
+    int ret;
+
+    if (!this)
+        return -1;
+
+    ret = xlator_mem_acct_init(this, gf_changelog_mt_end + 1);
+    if (ret != 0)
+        gf_smsg(this->name, GF_LOG_WARNING, ENOMEM,
+                CHANGELOG_MSG_MEMORY_INIT_FAILED, NULL);
+
+    return ret;
+}
+
+/**
+ * One-time changelog setup: fills the record-type prefix map, resets every
+ * slice version to 1 and, when changelog is active, opens the htime file,
+ * injects an initial rollover event and spawns the helper threads.
+ *
+ * Returns 0 on success (or when changelog is inactive), non-zero as soon as
+ * one of the steps fails.
+ */
+static int
+changelog_init(xlator_t *this, changelog_priv_t *priv)
+{
+    int i = 0;
+    int ret = 0;
+    changelog_log_data_t cld = {
+        0,
+    };
+
+    priv->maps[CHANGELOG_TYPE_DATA] = "D ";
+    priv->maps[CHANGELOG_TYPE_METADATA] = "M ";
+    priv->maps[CHANGELOG_TYPE_METADATA_XATTR] = "M ";
+    priv->maps[CHANGELOG_TYPE_ENTRY] = "E ";
+
+    for (i = 0; i < CHANGELOG_MAX_TYPE; i++) {
+        /* start with version 1 */
+        priv->slice.changelog_version[i] = 1;
+    }
+
+    if (!priv->active)
+        return ret;
+
+    /**
+     * start with a fresh changelog file every time. this is done
+     * in case there was an encoding change. so... things are kept
+     * simple here.
+     */
+    changelog_fill_rollover_data(&cld, _gf_false);
+
+    /* call htime open with cld's rollover_time */
+    ret = htime_open(this, priv, cld.cld_roll_time);
+    if (ret)
+        goto out;
+
+    LOCK(&priv->lock);
+    {
+        ret = changelog_inject_single_event(this, priv, &cld);
+    }
+    UNLOCK(&priv->lock);
+
+    /* do not let the thread spawn below overwrite an injection failure;
+     * reconfigure() bails out on the same error */
+    if (ret)
+        goto out;
+
+    /* ... and finally spawn the helpers threads */
+    ret = changelog_spawn_helper_threads(this, priv);
+
+out:
+    return ret;
+}
+
+/**
+ * Init barrier related condition variables and locks.
+ *
+ * Initialises, in order, the bnotify, drain_black, drain_white and
+ * changelog_rollover mutex/condition pairs. On the first failure the error
+ * is logged, everything initialised so far is destroyed again and -1 is
+ * returned; 0 is returned when all eight objects are ready.
+ *
+ * The pthread init functions report failure through their return value and
+ * do not set errno, so that return value is what gets logged.
+ */
+static int
+changelog_barrier_pthread_init(xlator_t *this, changelog_priv_t *priv)
+{
+    gf_boolean_t bn_mutex_init = _gf_false;
+    gf_boolean_t bn_cond_init = _gf_false;
+    gf_boolean_t dm_mutex_black_init = _gf_false;
+    gf_boolean_t dm_cond_black_init = _gf_false;
+    gf_boolean_t dm_mutex_white_init = _gf_false;
+    gf_boolean_t dm_cond_white_init = _gf_false;
+    gf_boolean_t cr_mutex_init = _gf_false;
+    gf_boolean_t cr_cond_init = _gf_false;
+    int ret = 0;
+
+    if ((ret = pthread_mutex_init(&priv->bn.bnotify_mutex, NULL)) != 0) {
+        gf_smsg(this->name, GF_LOG_ERROR, ret,
+                CHANGELOG_MSG_PTHREAD_MUTEX_INIT_FAILED, "name=bnotify",
+                "ret=%d", ret, NULL);
+        ret = -1;
+        goto out;
+    }
+    bn_mutex_init = _gf_true;
+
+    if ((ret = pthread_cond_init(&priv->bn.bnotify_cond, NULL)) != 0) {
+        gf_smsg(this->name, GF_LOG_ERROR, ret,
+                CHANGELOG_MSG_PTHREAD_COND_INIT_FAILED, "name=bnotify",
+                "ret=%d", ret, NULL);
+        ret = -1;
+        goto out;
+    }
+    bn_cond_init = _gf_true;
+
+    if ((ret = pthread_mutex_init(&priv->dm.drain_black_mutex, NULL)) != 0) {
+        gf_smsg(this->name, GF_LOG_ERROR, ret,
+                CHANGELOG_MSG_PTHREAD_MUTEX_INIT_FAILED, "name=drain_black",
+                "ret=%d", ret, NULL);
+        ret = -1;
+        goto out;
+    }
+    dm_mutex_black_init = _gf_true;
+
+    if ((ret = pthread_cond_init(&priv->dm.drain_black_cond, NULL)) != 0) {
+        gf_smsg(this->name, GF_LOG_ERROR, ret,
+                CHANGELOG_MSG_PTHREAD_COND_INIT_FAILED, "name=drain_black",
+                "ret=%d", ret, NULL);
+        ret = -1;
+        goto out;
+    }
+    dm_cond_black_init = _gf_true;
+
+    if ((ret = pthread_mutex_init(&priv->dm.drain_white_mutex, NULL)) != 0) {
+        gf_smsg(this->name, GF_LOG_ERROR, ret,
+                CHANGELOG_MSG_PTHREAD_MUTEX_INIT_FAILED, "name=drain_white",
+                "ret=%d", ret, NULL);
+        ret = -1;
+        goto out;
+    }
+    dm_mutex_white_init = _gf_true;
+
+    if ((ret = pthread_cond_init(&priv->dm.drain_white_cond, NULL)) != 0) {
+        gf_smsg(this->name, GF_LOG_ERROR, ret,
+                CHANGELOG_MSG_PTHREAD_COND_INIT_FAILED, "name=drain_white",
+                "ret=%d", ret, NULL);
+        ret = -1;
+        goto out;
+    }
+    dm_cond_white_init = _gf_true;
+
+    /* capture the return code here too, otherwise a stale 0 is logged */
+    if ((ret = pthread_mutex_init(&priv->cr.lock, NULL)) != 0) {
+        gf_smsg(this->name, GF_LOG_ERROR, ret,
+                CHANGELOG_MSG_PTHREAD_MUTEX_INIT_FAILED,
+                "name=changelog_rollover", "ret=%d", ret, NULL);
+        ret = -1;
+        goto out;
+    }
+    cr_mutex_init = _gf_true;
+
+    if ((ret = pthread_cond_init(&priv->cr.cond, NULL)) != 0) {
+        gf_smsg(this->name, GF_LOG_ERROR, ret,
+                CHANGELOG_MSG_PTHREAD_COND_INIT_FAILED,
+                "name=changelog_rollover", "ret=%d", ret, NULL);
+        ret = -1;
+        goto out;
+    }
+    cr_cond_init = _gf_true;
+out:
+    if (ret) {
+        /* undo only what was successfully initialised */
+        if (bn_mutex_init)
+            pthread_mutex_destroy(&priv->bn.bnotify_mutex);
+        if (bn_cond_init)
+            pthread_cond_destroy(&priv->bn.bnotify_cond);
+        if (dm_mutex_black_init)
+            pthread_mutex_destroy(&priv->dm.drain_black_mutex);
+        if (dm_cond_black_init)
+            pthread_cond_destroy(&priv->dm.drain_black_cond);
+        if (dm_mutex_white_init)
+            pthread_mutex_destroy(&priv->dm.drain_white_mutex);
+        if (dm_cond_white_init)
+            pthread_cond_destroy(&priv->dm.drain_white_cond);
+        if (cr_mutex_init)
+            pthread_mutex_destroy(&priv->cr.lock);
+        if (cr_cond_init)
+            pthread_cond_destroy(&priv->cr.cond);
+    }
+    return ret;
+}
+
+/**
+ * Destroy the barrier related condition variables and locks (bnotify,
+ * drain_black, drain_white, changelog_rollover) together with the
+ * barrier-flags lock.
+ */
+static void
+changelog_barrier_pthread_destroy(changelog_priv_t *priv)
+{
+    /* barrier notify */
+    pthread_cond_destroy(&priv->bn.bnotify_cond);
+    pthread_mutex_destroy(&priv->bn.bnotify_mutex);
+
+    /* drain: black fops */
+    pthread_cond_destroy(&priv->dm.drain_black_cond);
+    pthread_mutex_destroy(&priv->dm.drain_black_mutex);
+
+    /* drain: white fops */
+    pthread_cond_destroy(&priv->dm.drain_white_cond);
+    pthread_mutex_destroy(&priv->dm.drain_white_mutex);
+
+    /* rollover */
+    pthread_cond_destroy(&priv->cr.cond);
+    pthread_mutex_destroy(&priv->cr.lock);
+
+    LOCK_DESTROY(&priv->bflags.lock);
+}
+
+/**
+ * Tear down the rpc side of changelog: the listener (unless cleanup has
+ * already started -- notify() destroys it itself on PARENT_DOWN), the rpc
+ * threads, the rotational buffers and finally the poller thread.
+ */
+static void
+changelog_cleanup_rpc(xlator_t *this, changelog_priv_t *priv)
+{
+    /* terminate rpc server */
+    if (this->cleanup_starting == 0)
+        changelog_destroy_rpc_listner(this, priv);
+
+    (void)changelog_cleanup_rpc_threads(this, priv);
+
+    /* cleanup rot buffs */
+    rbuf_dtor(priv->rbuf);
+
+    /* cleanup poller thread */
+    if (priv->poller != 0)
+        (void)changelog_thread_cleanup(this, priv->poller);
+}
+
+/**
+ * Apply a new set of options to a running changelog translator.
+ *
+ * The helper threads are stopped first. Then the changelog directory (and
+ * its htime / csnap sub-directories) is re-read and created, rpc
+ * notifications are initialised if they are newly required, and the
+ * remaining options (op-mode, encoding, rollover-time, fsync-interval,
+ * changelog-barrier-timeout, capture-del-path) are refreshed. If changelog
+ * is or was active, a rollover event is injected, and the helper threads are
+ * respawned when it is active now.
+ *
+ * Returns 0 on success or when there is no private data; non-zero on
+ * failure.
+ *
+ * NOTE(review): on a failure after the helper threads were stopped they are
+ * not restarted here (see the TODO at 'out').
+ */
+int
+reconfigure(xlator_t *this, dict_t *options)
+{
+    int ret = 0;
+    char *tmp = NULL;
+    changelog_priv_t *priv = NULL;
+    gf_boolean_t active_earlier = _gf_true;
+    gf_boolean_t active_now = _gf_true;
+    gf_boolean_t rpc_active_earlier = _gf_true;
+    gf_boolean_t rpc_active_now = _gf_true;
+    gf_boolean_t iniate_rpc = _gf_false;
+    changelog_time_slice_t *slice = NULL;
+    changelog_log_data_t cld = {
+        0,
+    };
+    char htime_dir[PATH_MAX] = {
+        0,
+    };
+    char csnap_dir[PATH_MAX] = {
+        0,
+    };
+    uint32_t timeout = 0;
+
+    priv = this->private;
+    if (!priv)
+        goto out;
+
+    /* assume failure from here on; success paths overwrite ret */
+    ret = -1;
+    active_earlier = priv->active;
+    rpc_active_earlier = priv->rpc_active;
+
+    /* first stop the rollover and the fsync thread */
+    changelog_cleanup_helper_threads(this, priv);
+
+    /* GF_OPTION_RECONF is given 'out' as its label, presumably to jump
+     * there when the option cannot be read -- confirm in the macro */
+    GF_OPTION_RECONF("changelog-dir", tmp, options, str, out);
+    if (!tmp) {
+        gf_smsg(this->name, GF_LOG_ERROR, 0, CHANGELOG_MSG_DIR_OPTIONS_NOT_SET,
+                NULL);
+        goto out;
+    }
+
+    /* replace the stored directory with a private copy of the new value */
+    GF_FREE(priv->changelog_dir);
+    priv->changelog_dir = gf_strdup(tmp);
+    if (!priv->changelog_dir)
+        goto out;
+
+    ret = mkdir_p(priv->changelog_dir, 0600, _gf_true);
+
+    if (ret)
+        goto out;
+    CHANGELOG_FILL_HTIME_DIR(priv->changelog_dir, htime_dir);
+    ret = mkdir_p(htime_dir, 0600, _gf_true);
+
+    if (ret)
+        goto out;
+
+    CHANGELOG_FILL_CSNAP_DIR(priv->changelog_dir, csnap_dir);
+    ret = mkdir_p(csnap_dir, 0600, _gf_true);
+
+    if (ret)
+        goto out;
+
+    GF_OPTION_RECONF("changelog", active_now, options, bool, out);
+    GF_OPTION_RECONF("changelog-notification", rpc_active_now, options, bool,
+                     out);
+
+    /* If journalling is enabled, enable rpc notifications */
+    if (active_now && !active_earlier) {
+        if (!rpc_active_earlier)
+            iniate_rpc = _gf_true;
+    }
+
+    if (rpc_active_now && !rpc_active_earlier) {
+        iniate_rpc = _gf_true;
+    }
+
+    /* TODO: Disable of changelog-notifications is not supported for now
+     * as there is no clean way of cleaning up of rpc resources
+     */
+
+    if (iniate_rpc) {
+        ret = changelog_init_rpc(this, priv);
+        if (ret)
+            goto out;
+        priv->rpc_active = _gf_true;
+    }
+
+    /**
+     * changelog_handle_change() handles changes that could possibly
+     * have been submitted before changelog deactivation.
+     */
+    if (!active_now)
+        priv->active = _gf_false;
+
+    GF_OPTION_RECONF("op-mode", tmp, options, str, out);
+    changelog_assign_opmode(priv, tmp);
+
+    tmp = NULL;
+
+    GF_OPTION_RECONF("encoding", tmp, options, str, out);
+    changelog_assign_encoding(priv, tmp);
+
+    GF_OPTION_RECONF("rollover-time", priv->rollover_time, options, int32, out);
+    GF_OPTION_RECONF("fsync-interval", priv->fsync_interval, options, int32,
+                     out);
+    GF_OPTION_RECONF("changelog-barrier-timeout", timeout, options, time, out);
+    changelog_assign_barrier_timeout(priv, timeout);
+
+    GF_OPTION_RECONF("capture-del-path", priv->capture_del_path, options, bool,
+                     out);
+
+    if (active_now || active_earlier) {
+        /* second argument is true only when changelog is being turned off */
+        changelog_fill_rollover_data(&cld, !active_now);
+
+        slice = &priv->slice;
+
+        LOCK(&priv->lock);
+        {
+            ret = changelog_inject_single_event(this, priv, &cld);
+            if (!ret && active_now)
+                SLICE_VERSION_UPDATE(slice);
+        }
+        UNLOCK(&priv->lock);
+
+        if (ret)
+            goto out;
+
+        if (active_now) {
+            if (!active_earlier) {
+                gf_smsg(this->name, GF_LOG_INFO, 0, CHANGELOG_MSG_RECONFIGURE,
+                        NULL);
+                /* NOTE(review): the result of htime_create() is ignored */
+                htime_create(this, priv, gf_time());
+            }
+            ret = changelog_spawn_helper_threads(this, priv);
+        }
+    }
+
+out:
+    if (ret) {
+        /* TODO */
+    } else {
+        gf_msg_debug(this->name, 0, "changelog reconfigured");
+        /* priv->active is switched on only after everything succeeded */
+        if (active_now && priv)
+            priv->active = _gf_true;
+    }
+
+    return ret;
+}
+
+/*
+ * Release what changelog_init_options() set up: run the destructor of the
+ * active op-mode callbacks and free the duplicated option strings.
+ */
+static void
+changelog_freeup_options(xlator_t *this, changelog_priv_t *priv)
+{
+    /* a destructor failure is only logged; the strings are freed anyway */
+    if (priv->cb->dtor(this, &priv->cd) != 0) {
+        gf_smsg(this->name, GF_LOG_ERROR, 0, CHANGELOG_MSG_FREEUP_FAILED, NULL);
+    }
+
+    GF_FREE(priv->changelog_dir);
+    GF_FREE(priv->changelog_brick);
+}
+
+/*
+ * Populate @priv from the translator options, create the changelog
+ * directory plus the htime and csnap directories derived from it, and run
+ * the constructor of the selected op-mode callbacks.
+ *
+ * Returns 0 on success.  On failure returns -1 after freeing whichever of
+ * priv->changelog_brick / priv->changelog_dir had already been duplicated.
+ */
+static int
+changelog_init_options(xlator_t *this, changelog_priv_t *priv)
+{
+    int ret = 0;
+    char *tmp = NULL;
+    uint32_t timeout = 0;
+    char htime_dir[PATH_MAX] = {
+        0,
+    };
+    char csnap_dir[PATH_MAX] = {
+        0,
+    };
+
+    /*
+     * The last GF_OPTION_INIT argument is a label of this function; it
+     * appears to be the macro's failure exit, so each call names the
+     * cleanup level that matches what has been allocated so far.
+     */
+    GF_OPTION_INIT("changelog-brick", tmp, str, error_return);
+    priv->changelog_brick = gf_strdup(tmp);
+    if (!priv->changelog_brick)
+        goto error_return;
+
+    tmp = NULL;
+
+    GF_OPTION_INIT("changelog-dir", tmp, str, dealloc_1);
+    priv->changelog_dir = gf_strdup(tmp);
+    if (!priv->changelog_dir)
+        goto dealloc_1;
+
+    tmp = NULL;
+
+    /**
+     * create the directory even if change-logging would be inactive
+     * so that consumers can _look_ into it (finding nothing...)
+     */
+    /* NOTE(review): mode 0600 carries no search (x) bit for a directory;
+     * confirm this is what mkdir_p() is meant to receive. */
+    ret = mkdir_p(priv->changelog_dir, 0600, _gf_true);
+
+    if (ret)
+        goto dealloc_2;
+
+    CHANGELOG_FILL_HTIME_DIR(priv->changelog_dir, htime_dir);
+    ret = mkdir_p(htime_dir, 0600, _gf_true);
+    if (ret)
+        goto dealloc_2;
+
+    CHANGELOG_FILL_CSNAP_DIR(priv->changelog_dir, csnap_dir);
+    ret = mkdir_p(csnap_dir, 0600, _gf_true);
+    if (ret)
+        goto dealloc_2;
+
+    GF_OPTION_INIT("changelog", priv->active, bool, dealloc_2);
+    GF_OPTION_INIT("changelog-notification", priv->rpc_active, bool, dealloc_2);
+    GF_OPTION_INIT("capture-del-path", priv->capture_del_path, bool, dealloc_2);
+
+    GF_OPTION_INIT("op-mode", tmp, str, dealloc_2);
+    changelog_assign_opmode(priv, tmp);
+
+    tmp = NULL;
+
+    GF_OPTION_INIT("encoding", tmp, str, dealloc_2);
+    changelog_assign_encoding(priv, tmp);
+    changelog_encode_change(priv);
+
+    GF_OPTION_INIT("rollover-time", priv->rollover_time, int32, dealloc_2);
+
+    GF_OPTION_INIT("fsync-interval", priv->fsync_interval, int32, dealloc_2);
+
+    GF_OPTION_INIT("changelog-barrier-timeout", timeout, time, dealloc_2);
+    changelog_assign_barrier_timeout(priv, timeout);
+
+    /* cb_bootstrap[] is indexed by op-mode; pick the matching callbacks */
+    GF_ASSERT(cb_bootstrap[priv->op_mode].mode == priv->op_mode);
+    priv->cb = &cb_bootstrap[priv->op_mode];
+
+    /* ... now bootstrap the logger */
+    ret = priv->cb->ctor(this, &priv->cd);
+    if (ret)
+        goto dealloc_2;
+
+    /* no changelog file is open yet */
+    priv->changelog_fd = -1;
+
+    return 0;
+
+    /* cleanup labels fall through: dealloc_2 also performs dealloc_1 */
+dealloc_2:
+    GF_FREE(priv->changelog_dir);
+dealloc_1:
+    GF_FREE(priv->changelog_brick);
+error_return:
+    return -1;
+}
+
+/*
+ * Set up the notification side of the translator: initialise the event
+ * selection, allocate priv->rbuf through rbuf_init() and create the RPC
+ * listener, which is stored in priv->rpc.
+ *
+ * Returns 0 on success and -1 on failure.
+ */
+static int
+changelog_init_rpc(xlator_t *this, changelog_priv_t *priv)
+{
+    rpcsvc_t *rpc = NULL;
+    changelog_ev_selector_t *selection = NULL;
+
+    selection = &priv->ev_selection;
+
+    /* initialize event selection */
+    changelog_init_event_selection(this, selection);
+
+    priv->rbuf = rbuf_init(NR_ROTT_BUFFS);
+    if (!priv->rbuf)
+        goto cleanup_thread;
+
+    rpc = changelog_init_rpc_listener(this, priv, priv->rbuf, NR_DISPATCHERS);
+    if (!rpc)
+        goto cleanup_rbuf;
+    priv->rpc = rpc;
+
+    return 0;
+
+cleanup_rbuf:
+    /* NOTE(review): priv->rbuf is not reset after rbuf_dtor(), so it keeps
+     * pointing at the destroyed buffer on this path. */
+    rbuf_dtor(priv->rbuf);
+cleanup_thread:
+    /* priv->poller is not set in this function; it is only cleaned up here
+     * if some other code has already populated it */
+    if (priv->poller)
+        (void)changelog_thread_cleanup(this, priv->poller);
+
+    return -1;
+}
+
+/*
+ * init - translator entry point.
+ *
+ * Checks that the translator has exactly one child and at least one
+ * parent, allocates the private context, reads the options, prepares the
+ * barrier/drain state and, when changelog or notifications are enabled,
+ * starts the RPC side before calling changelog_init().
+ *
+ * Returns 0 with this->private set on success.  On failure returns -1
+ * after releasing, in reverse order, whatever had been set up so far.
+ */
+int32_t
+init(xlator_t *this)
+{
+    int ret = -1;
+    changelog_priv_t *priv = NULL;
+
+    GF_VALIDATE_OR_GOTO("changelog", this, error_return);
+
+    /* exactly one child is required */
+    if (!this->children || this->children->next) {
+        gf_smsg(this->name, GF_LOG_ERROR, 0, CHANGELOG_MSG_CHILD_MISCONFIGURED,
+                NULL);
+        goto error_return;
+    }
+
+    if (!this->parents) {
+        gf_smsg(this->name, GF_LOG_ERROR, 0, CHANGELOG_MSG_VOL_MISCONFIGURED,
+                NULL);
+        goto error_return;
+    }
+
+    priv = GF_CALLOC(1, sizeof(*priv), gf_changelog_mt_priv_t);
+    if (!priv)
+        goto error_return;
+
+    this->local_pool = mem_pool_new(changelog_local_t, 64);
+    if (!this->local_pool) {
+        gf_smsg(this->name, GF_LOG_ERROR, ENOMEM, CHANGELOG_MSG_NO_MEMORY,
+                NULL);
+        goto cleanup_priv;
+    }
+
+    LOCK_INIT(&priv->lock);
+    LOCK_INIT(&priv->c_snap_lock);
+    GF_ATOMIC_INIT(priv->listnercnt, 0);
+    GF_ATOMIC_INIT(priv->clntcnt, 0);
+    GF_ATOMIC_INIT(priv->xprtcnt, 0);
+    INIT_LIST_HEAD(&priv->xprt_list);
+    priv->htime_fd = -1;
+
+    ret = changelog_init_options(this, priv);
+    if (ret)
+        goto cleanup_mempool;
+
+    /* snap dependency changes */
+    priv->dm.black_fop_cnt = 0;
+    priv->dm.white_fop_cnt = 0;
+    priv->dm.drain_wait_black = _gf_false;
+    priv->dm.drain_wait_white = _gf_false;
+    priv->current_color = FOP_COLOR_BLACK;
+    priv->explicit_rollover = _gf_false;
+
+    priv->cr.notify = _gf_false;
+    /* Mutex is not needed as threads are not spawned yet */
+    priv->bn.bnotify = _gf_false;
+    priv->bn.bnotify_error = _gf_false;
+    ret = changelog_barrier_pthread_init(this, priv);
+    if (ret)
+        goto cleanup_options;
+    LOCK_INIT(&priv->bflags.lock);
+    priv->bflags.barrier_ext = _gf_false;
+
+    /* Changelog barrier init */
+    INIT_LIST_HEAD(&priv->queue);
+    priv->barrier_enabled = _gf_false;
+
+    /* enabling the changelog also brings up the RPC side */
+    if (priv->rpc_active || priv->active) {
+        /* RPC ball rolling.. */
+        ret = changelog_init_rpc(this, priv);
+        if (ret)
+            goto cleanup_barrier;
+        priv->rpc_active = _gf_true;
+    }
+
+    ret = changelog_init(this, priv);
+    if (ret)
+        goto cleanup_rpc;
+
+    gf_msg_debug(this->name, 0, "changelog translator loaded");
+
+    this->private = priv;
+    return 0;
+
+    /* cleanup labels fall through, undoing the setup in reverse order */
+cleanup_rpc:
+    if (priv->rpc_active) {
+        changelog_cleanup_rpc(this, priv);
+    }
+cleanup_barrier:
+    changelog_barrier_pthread_destroy(priv);
+cleanup_options:
+    changelog_freeup_options(this, priv);
+cleanup_mempool:
+    mem_pool_destroy(this->local_pool);
+    this->local_pool = NULL;
+cleanup_priv:
+    GF_FREE(priv);
+error_return:
+    /* the validation at the top jumps here when 'this' itself is NULL,
+     * so it must not be dereferenced unconditionally */
+    if (this)
+        this->private = NULL;
+    return -1;
+}
+
+/*
+ * fini - translator exit point.
+ *
+ * Tears down everything hanging off this->private (RPC side, barrier,
+ * helper threads, options, memory pool, htime fd) and frees the private
+ * context.  Does nothing beyond resetting the pointers when
+ * this->private is NULL.
+ */
+void
+fini(xlator_t *this)
+{
+    changelog_priv_t *priv = NULL;
+    struct list_head queue = {
+        0,
+    };
+
+    /*
+     * 'queue' is handed to __chlog_barrier_disable() below; make it a
+     * valid empty list first instead of relying on the zero-filled struct,
+     * as is done for the other list heads in this file.
+     */
+    INIT_LIST_HEAD(&queue);
+
+    priv = this->private;
+
+    if (priv) {
+        if (priv->active || priv->rpc_active) {
+            /* terminate RPC server/threads */
+            changelog_cleanup_rpc(this, priv);
+            GF_FREE(priv->ev_dispatcher);
+        }
+        /* call barrier_disable to cancel timer */
+        /* NOTE(review): nothing in this function consumes 'queue'
+         * afterwards; confirm nothing queued needs to be released. */
+        if (priv->barrier_enabled)
+            __chlog_barrier_disable(this, &queue);
+
+        /* cleanup barrier related objects */
+        changelog_barrier_pthread_destroy(priv);
+
+        /* cleanup helper threads */
+        changelog_cleanup_helper_threads(this, priv);
+
+        /* cleanup allocated options */
+        changelog_freeup_options(this, priv);
+
+        /* deallocate mempool */
+        mem_pool_destroy(this->local_pool);
+
+        if (priv->htime_fd != -1) {
+            sys_close(priv->htime_fd);
+        }
+
+        /* finally, deallocate the private context */
+        GF_FREE(priv);
+    }
+
+    this->private = NULL;
+    this->local_pool = NULL;
+
+    return;
+}
+
+/* File operations for which this translator provides its own handler. */
+struct xlator_fops fops = {
+    .open = changelog_open,
+    .mknod = changelog_mknod,
+    .mkdir = changelog_mkdir,
+    .create = changelog_create,
+    .symlink = changelog_symlink,
+    .writev = changelog_writev,
+    .truncate = changelog_truncate,
+    .ftruncate = changelog_ftruncate,
+    .link = changelog_link,
+    .rename = changelog_rename,
+    .unlink = changelog_unlink,
+    .rmdir = changelog_rmdir,
+    .setattr = changelog_setattr,
+    .fsetattr = changelog_fsetattr,
+    .setxattr = changelog_setxattr,
+    .fsetxattr = changelog_fsetxattr,
+    .removexattr = changelog_removexattr,
+    .fremovexattr = changelog_fremovexattr,
+    .ipc = changelog_ipc,
+    .xattrop = changelog_xattrop,
+    .fxattrop = changelog_fxattrop,
+};
+
+/* Callbacks registered for the forget and release notifications. */
+struct xlator_cbks cbks = {
+    .forget = changelog_forget,
+    .release = changelog_release,
+};
+
+/*
+ * Option table of the changelog translator.  Each entry gives the option
+ * key, its type, default value and description; the table ends with an
+ * entry whose key is NULL.  The keys are the ones read in
+ * changelog_init_options() and reconfigure().
+ */
+struct volume_options options[] = {
+    {.key = {"changelog"},
+     .type = GF_OPTION_TYPE_BOOL,
+     .default_value = "off",
+     .description = "enable/disable change-logging",
+     .op_version = {3},
+     .flags = OPT_FLAG_SETTABLE,
+     .level = OPT_STATUS_BASIC,
+     .tags = {"journal", "georep", "glusterfind"}},
+    {.key = {"changelog-notification"},
+     .type = GF_OPTION_TYPE_BOOL,
+     .default_value = "off",
+     .description = "enable/disable changelog live notification",
+     .op_version = {3},
+     .level = OPT_STATUS_BASIC,
+     .tags = {"bitrot", "georep"}},
+    {.key = {"changelog-brick"},
+     .type = GF_OPTION_TYPE_PATH,
+     .description = "brick path to generate unique socket file name."
+                    " should be the export directory of the volume strictly.",
+     .default_value = "{{ brick.path }}",
+     .op_version = {3},
+     .tags = {"journal"}},
+    {.key = {"changelog-dir"},
+     .type = GF_OPTION_TYPE_PATH,
+     .description = "directory for the changelog files",
+     .default_value = "{{ brick.path }}/.glusterfs/changelogs",
+     .op_version = {3},
+     .flags = OPT_FLAG_SETTABLE,
+     .level = OPT_STATUS_ADVANCED,
+     .tags = {"journal", "georep", "glusterfind"}},
+    {.key = {"op-mode"},
+     .type = GF_OPTION_TYPE_STR,
+     .default_value = "realtime",
+     .value = {"realtime"},
+     .description = "operation mode - futuristic operation modes",
+     .op_version = {3},
+     .tags = {"journal"}},
+    {.key = {"encoding"},
+     .type = GF_OPTION_TYPE_STR,
+     .default_value = "ascii",
+     .value = {"binary", "ascii"},
+     .description = "encoding type for changelogs",
+     .op_version = {3},
+     .flags = OPT_FLAG_SETTABLE,
+     .level = OPT_STATUS_ADVANCED,
+     .tags = {"journal"}},
+    /* NOTE(review): "rollover-time" and "fsync-interval" are declared as
+     * TIME options here but are read with the int32 accessor in
+     * changelog_init_options() and reconfigure() - confirm intended. */
+    {.key = {"rollover-time"},
+     .default_value = "15",
+     .type = GF_OPTION_TYPE_TIME,
+     .description = "time to switch to a new changelog file (in seconds)",
+     .op_version = {3},
+     .flags = OPT_FLAG_SETTABLE,
+     .level = OPT_STATUS_ADVANCED,
+     .tags = {"journal", "georep", "glusterfind"}},
+    {.key = {"fsync-interval"},
+     .type = GF_OPTION_TYPE_TIME,
+     .default_value = "5",
+     .description = "do not open CHANGELOG file with O_SYNC mode."
+                    " instead perform fsync() at specified intervals",
+     .op_version = {3},
+     .flags = OPT_FLAG_SETTABLE,
+     .level = OPT_STATUS_ADVANCED,
+     .tags = {"journal"}},
+    {.key = {"changelog-barrier-timeout"},
+     .type = GF_OPTION_TYPE_TIME,
+     .default_value = BARRIER_TIMEOUT,
+     .description = "After 'timeout' seconds since the time 'barrier' "
+                    "option was set to \"on\", unlink/rmdir/rename  "
+                    "operations are no longer blocked and previously "
+                    "blocked fops are allowed to go through",
+     .op_version = {3},
+     .flags = OPT_FLAG_SETTABLE,
+     .level = OPT_STATUS_ADVANCED,
+     .tags = {"journal"}},
+    {.key = {"capture-del-path"},
+     .type = GF_OPTION_TYPE_BOOL,
+     .default_value = "off",
+     .description = "enable/disable capturing paths of deleted entries",
+     .op_version = {3},
+     .flags = OPT_FLAG_SETTABLE,
+     .level = OPT_STATUS_BASIC,
+     .tags = {"journal", "glusterfind"}},
+    /* terminating entry */
+    {.key = {NULL}},
+};
+
+/*
+ * Descriptor that ties together the entry points, fop/callback tables and
+ * option table defined in this file under the identifier "changelog".
+ */
+xlator_api_t xlator_api = {
+    .init = init,
+    .fini = fini,
+    .notify = notify,
+    .reconfigure = reconfigure,
+    .mem_acct_init = mem_acct_init,
+    .op_version = {1}, /* Present from the initial version */
+    .fops = &fops,
+    .cbks = &cbks,
+    .options = options,
+    .identifier = "changelog",
+    .category = GF_MAINTAINED,
+};
diff --git a/xlators/features/cloudsync/Makefile.am b/xlators/features/cloudsync/Makefile.am
new file mode 100644
index 00000000000..a985f42a877
--- /dev/null
+++ b/xlators/features/cloudsync/Makefile.am
@@ -0,0 +1,3 @@
+SUBDIRS = src
+
+CLEANFILES =
diff --git a/xlators/features/cloudsync/src/Makefile.am b/xlators/features/cloudsync/src/Makefile.am
new file mode 100644
index 00000000000..e2a277e372b
--- /dev/null
+++ b/xlators/features/cloudsync/src/Makefile.am
@@ -0,0 +1,46 @@
+SUBDIRS = cloudsync-plugins
+
+xlator_LTLIBRARIES = cloudsync.la
+
+xlatordir = $(libdir)/glusterfs/$(PACKAGE_VERSION)/xlator/features
+
+cloudsync_sources = cloudsync.c
+
+CLOUDSYNC_SRC = $(top_srcdir)/xlators/features/cloudsync/src
+CLOUDSYNC_BLD = $(top_builddir)/xlators/features/cloudsync/src
+
+cloudsynccommon_sources = $(CLOUDSYNC_SRC)/cloudsync-common.c
+
+noinst_HEADERS = $(CLOUDSYNC_BLD)/cloudsync.h \
+		 $(CLOUDSYNC_BLD)/cloudsync-mem-types.h \
+		 $(CLOUDSYNC_BLD)/cloudsync-messages.h \
+		 $(CLOUDSYNC_BLD)/cloudsync-common.h
+
+cloudsync_la_SOURCES = $(cloudsync_sources) $(cloudsynccommon_sources)
+
+nodist_cloudsync_la_SOURCES = cloudsync-autogen-fops.c cloudsync-autogen-fops.h
+BUILT_SOURCES = cloudsync-autogen-fops.h
+
+cloudsync_la_LDFLAGS = -module $(GF_XLATOR_DEFAULT_LDFLAGS)
+
+cloudsync_la_LIBADD = $(top_builddir)/libglusterfs/src/libglusterfs.la $(LIB_DL)
+
+AM_CPPFLAGS = $(GF_CPPFLAGS) -I$(top_srcdir)/libglusterfs/src -I$(top_srcdir)/rpc/xdr/src -I$(top_builddir)/rpc/xdr/src \
+	      -DCS_PLUGINDIR=\"$(libdir)/glusterfs/$(PACKAGE_VERSION)/cloudsync-plugins\"
+AM_CFLAGS = -Wall -fno-strict-aliasing $(GF_CFLAGS)
+
+noinst_PYTHON = cloudsync-fops-c.py cloudsync-fops-h.py
+EXTRA_DIST = cloudsync-autogen-fops-tmpl.c cloudsync-autogen-fops-tmpl.h
+
+cloudsync-autogen-fops.c: cloudsync-fops-c.py cloudsync-autogen-fops-tmpl.c
+	$(PYTHON) $(CLOUDSYNC_SRC)/cloudsync-fops-c.py \
+	$(CLOUDSYNC_SRC)/cloudsync-autogen-fops-tmpl.c > $@
+
+cloudsync-autogen-fops.h: cloudsync-fops-h.py cloudsync-autogen-fops-tmpl.h
+	$(PYTHON) $(CLOUDSYNC_SRC)/cloudsync-fops-h.py \
+	$(CLOUDSYNC_SRC)/cloudsync-autogen-fops-tmpl.h > $@
+
+CLEANFILES = $(nodist_cloudsync_la_SOURCES)
+
+uninstall-local:
+	rm -f $(DESTDIR)$(xlatordir)/cloudsync.so
diff --git a/xlators/features/cloudsync/src/cloudsync-autogen-fops-tmpl.c b/xlators/features/cloudsync/src/cloudsync-autogen-fops-tmpl.c
new file mode 100644
index 00000000000..ee63f983980
--- /dev/null
+++ b/xlators/features/cloudsync/src/cloudsync-autogen-fops-tmpl.c
@@ -0,0 +1,30 @@
+/*
+  Copyright (c) 2008-2015 Red Hat, Inc. <http://www.redhat.com>
+  This file is part of GlusterFS.
+
+  This file is licensed to you under your choice of the GNU Lesser
+  General Public License, version 3 or any later version (LGPLv3 or
+  later), or the GNU General Public License, version 2 (GPLv2), in all
+  cases as published by the Free Software Foundation.
+*/
+
+/* File: cloudsync-autogen-fops-tmpl.c
+ * This file contains the CLOUDSYNC autogenerated FOPs. This is run through
+ * the code generator, generator.py to generate the required FOPs.
+ */
+
+#ifndef _CONFIG_H
+#define _CONFIG_H
+#include "config.h"
+#endif
+
+#include <dlfcn.h>
+
+#include <glusterfs/glusterfs.h>
+#include <glusterfs/xlator.h>
+#include <glusterfs/defaults.h>
+#include "cloudsync.h"
+#include "cloudsync-common.h"
+#include <glusterfs/call-stub.h>
+
+#pragma generate
diff --git a/xlators/features/cloudsync/src/cloudsync-autogen-fops-tmpl.h b/xlators/features/cloudsync/src/cloudsync-autogen-fops-tmpl.h
new file mode 100644
index 00000000000..d922c77d8aa
--- /dev/null
+++ b/xlators/features/cloudsync/src/cloudsync-autogen-fops-tmpl.h
@@ -0,0 +1,24 @@
+/*
+  Copyright (c) 2008-2015 Red Hat, Inc. <http://www.redhat.com>
+  This file is part of GlusterFS.
+
+  This file is licensed to you under your choice of the GNU Lesser
+  General Public License, version 3 or any later version (LGPLv3 or
+  later), or the GNU General Public License, version 2 (GPLv2), in all
+  cases as published by the Free Software Foundation.
+*/
+
+/* File: cloudsync-autogen-fops-tmpl.h
+ * This file contains the cloudsync autogenerated FOPs declarations.
+ */
+
+#ifndef _CLOUDSYNC_AUTOGEN_FOPS_H
+#define _CLOUDSYNC_AUTOGEN_FOPS_H
+
+#include <glusterfs/xlator.h>
+#include "cloudsync.h"
+#include "cloudsync-common.h"
+
+#pragma generate
+
+#endif /* _CLOUDSYNC_AUTOGEN_FOPS_H */
diff --git a/xlators/features/cloudsync/src/cloudsync-common.c b/xlators/features/cloudsync/src/cloudsync-common.c
new file mode 100644
index 00000000000..445a31b90e7
--- /dev/null
+++ b/xlators/features/cloudsync/src/cloudsync-common.c
@@ -0,0 +1,60 @@
+/*
+  Copyright (c) 2018 Red Hat, Inc. <http://www.redhat.com>
+  This file is part of GlusterFS.
+
+  This file is licensed to you under your choice of the GNU Lesser
+  General Public License, version 3 or any later version (LGPLv3 or
+  later), or the GNU General Public License, version 2 (GPLv2), in all
+  cases as published by the Free Software Foundation.
+*/
+
+#include "cloudsync-common.h"
+
+/*
+ * Free the location xattr info attached to @local: the file path, the
+ * volume name and the container itself.  Does nothing when @local is NULL
+ * or has no info attached, and leaves local->xattrinfo.lxattr set to NULL
+ * so that calling it again is harmless.
+ */
+void
+cs_xattrinfo_wipe(cs_local_t *local)
+{
+    cs_loc_xattr_t *lxattr = NULL;
+
+    if (!local || !local->xattrinfo.lxattr)
+        return;
+
+    lxattr = local->xattrinfo.lxattr;
+    /* detach first so a repeated wipe cannot free the same memory twice */
+    local->xattrinfo.lxattr = NULL;
+
+    if (lxattr->file_path)
+        GF_FREE(lxattr->file_path);
+
+    if (lxattr->volname)
+        GF_FREE(lxattr->volname);
+
+    GF_FREE(lxattr);
+}
+
+/*
+ * Drop every resource held by @local (loc, fd references, call stub,
+ * request/response dictionaries, remote path, xattr info) and then give
+ * the object back with mem_put().  A NULL @local is ignored; @this is not
+ * used.
+ */
+void
+cs_local_wipe(xlator_t *this, cs_local_t *local)
+{
+    if (local == NULL) {
+        return;
+    }
+
+    loc_wipe(&local->loc);
+
+    if (local->fd != NULL) {
+        fd_unref(local->fd);
+        local->fd = NULL;
+    }
+
+    if (local->stub != NULL) {
+        call_stub_destroy(local->stub);
+        local->stub = NULL;
+    }
+
+    /* dictionaries exchanged with the child translator */
+    if (local->xattr_req != NULL) {
+        dict_unref(local->xattr_req);
+    }
+    if (local->xattr_rsp != NULL) {
+        dict_unref(local->xattr_rsp);
+    }
+
+    if (local->dlfd != NULL) {
+        fd_unref(local->dlfd);
+    }
+
+    if (local->remotepath != NULL) {
+        GF_FREE(local->remotepath);
+    }
+
+    cs_xattrinfo_wipe(local);
+
+    mem_put(local);
+}
diff --git a/xlators/features/cloudsync/src/cloudsync-common.h b/xlators/features/cloudsync/src/cloudsync-common.h
new file mode 100644
index 00000000000..11d233460a4
--- /dev/null
+++ b/xlators/features/cloudsync/src/cloudsync-common.h
@@ -0,0 +1,134 @@
+/*
+  Copyright (c) 2018 Red Hat, Inc. <http://www.redhat.com>
+  This file is part of GlusterFS.
+
+  This file is licensed to you under your choice of the GNU Lesser
+  General Public License, version 3 or any later version (LGPLv3 or
+  later), or the GNU General Public License, version 2 (GPLv2), in all
+  cases as published by the Free Software Foundation.
+*/
+#ifndef _CLOUDSYNC_COMMON_H
+#define _CLOUDSYNC_COMMON_H
+
+#include <glusterfs/glusterfs.h>
+#include <glusterfs/call-stub.h>
+#include <glusterfs/xlator.h>
+#include <glusterfs/syncop.h>
+#include <glusterfs/compat-errno.h>
+#include "cloudsync-mem-types.h"
+#include "cloudsync-messages.h"
+
+/*
+ * Location information attached to a cs_local_t through xattrinfo.lxattr.
+ * file_path and volname are heap strings released by cs_xattrinfo_wipe().
+ */
+typedef struct cs_loc_xattr {
+    char *file_path;
+    uuid_t uuid;
+    uuid_t gfid;
+    char *volname;
+} cs_loc_xattr_t;
+
+/* Size triple: size, block size and block count (units not shown here). */
+typedef struct cs_size_xattr {
+    uint64_t size;
+    uint64_t blksize;
+    uint64_t blocks;
+} cs_size_xattr_t;
+
+/*
+ * Per-call state kept in frame->local by the cloudsync translator.
+ * cs_local_wipe() releases loc, fd, stub, xattr_req, xattr_rsp, dlfd,
+ * remotepath and xattrinfo.lxattr; it does not touch main_frame or inode.
+ */
+typedef struct cs_local {
+    loc_t loc;
+    fd_t *fd;
+    call_stub_t *stub;
+    call_frame_t *main_frame;
+    int op_errno;
+    int op_ret;
+    fd_t *dlfd;
+    off_t dloffset;
+    struct iatt stbuf;
+    dict_t *xattr_rsp;
+    dict_t *xattr_req;
+    glusterfs_fop_t fop;
+    gf_boolean_t locked;
+    int call_cnt;
+    inode_t *inode;
+    char *remotepath;
+
+    struct {
+        /* offset, flags and size are the information needed
+         * by read fop for remote read operation. These will be
+         * populated in cloudsync read fop, before being passed
+         * on to the plugin performing remote read.
+         */
+        off_t offset;
+        uint32_t flags;
+        size_t size;
+        cs_loc_xattr_t *lxattr;
+    } xattrinfo;
+
+} cs_local_t;
+
+/*
+ * Function-pointer types for the per-store hooks held in
+ * struct cs_remote_stores (download, remote read, init, reconfigure, fini).
+ */
+typedef int (*fop_download_t)(call_frame_t *frame, void *config);
+
+typedef int (*fop_remote_read_t)(call_frame_t *, void *);
+
+/* returns the store configuration as an opaque pointer */
+typedef void *(*store_init)(xlator_t *this);
+
+typedef int (*store_reconfigure)(xlator_t *this, dict_t *options);
+
+typedef void (*store_fini)(void *config);
+
+/* One remote store: its name, configuration, hooks and library handle. */
+struct cs_remote_stores {
+    char *name;                    /* store name */
+    void *config;                  /* store related information */
+    fop_download_t dlfop;          /* store specific download function */
+    fop_remote_read_t rdfop;       /* store specific read function */
+    store_init init;               /* store init to initialize store config */
+    store_reconfigure reconfigure; /* reconfigure store config */
+    store_fini fini;               /* store fini, called with the config */
+    void *handle; /* shared library handle*/
+};
+
+/* Private context of the cloudsync translator. */
+typedef struct cs_private {
+    xlator_t *this;
+    struct cs_remote_stores *stores;
+    /* presumably set to abort an in-progress download - TODO confirm */
+    gf_boolean_t abortdl;
+    pthread_spinlock_t lock;
+    /* presumably enables serving reads from the remote store - TODO confirm */
+    gf_boolean_t remote_read;
+} cs_private_t;
+
+void
+cs_local_wipe(xlator_t *this, cs_local_t *local);
+
+void
+cs_xattrinfo_wipe(cs_local_t *local);
+
+/*
+ * Unwind @frame for @fop with @params.  The frame's cs_local_t is detached
+ * (frame->local set to NULL) before STACK_UNWIND_STRICT() and wiped with
+ * cs_local_wipe() afterwards.  A NULL @frame is tolerated by the detach
+ * step, leaving __local NULL so that cs_local_wipe() does nothing.
+ */
+#define CS_STACK_UNWIND(fop, frame, params...)                                 \
+    do {                                                                       \
+        cs_local_t *__local = NULL;                                            \
+        xlator_t *__xl = NULL;                                                 \
+        if (frame) {                                                           \
+            __xl = frame->this;                                                \
+            __local = frame->local;                                            \
+            frame->local = NULL;                                               \
+        }                                                                      \
+        STACK_UNWIND_STRICT(fop, frame, params);                               \
+        cs_local_wipe(__xl, __local);                                          \
+    } while (0)
+
+/*
+ * Destroy the call stack that @frame belongs to.  The frame's cs_local_t
+ * is detached first and wiped with cs_local_wipe() once the stack is gone.
+ * NOTE(review): unlike CS_STACK_UNWIND, @frame is dereferenced without a
+ * NULL check, so callers must pass a valid frame.
+ */
+#define CS_STACK_DESTROY(frame)                                                \
+    do {                                                                       \
+        cs_local_t *__local = NULL;                                            \
+        xlator_t *__xl = NULL;                                                 \
+        __xl = frame->this;                                                    \
+        __local = frame->local;                                                \
+        frame->local = NULL;                                                   \
+        STACK_DESTROY(frame->root);                                            \
+        cs_local_wipe(__xl, __local);                                          \
+    } while (0)
+
+/*
+ * Table of store hooks; the member signatures match the fop_download_t,
+ * fop_remote_read_t, store_init, store_reconfigure and store_fini types.
+ */
+typedef struct store_methods {
+    int (*fop_download)(call_frame_t *frame, void *config);
+    int (*fop_remote_read)(call_frame_t *, void *);
+    /* return type should be the store config */
+    void *(*fop_init)(xlator_t *this);
+    int (*fop_reconfigure)(xlator_t *this, dict_t *options);
+    void (*fop_fini)(void *config);
+} store_methods_t;
+
+#endif /* _CLOUDSYNC_COMMON_H */
diff --git a/xlators/features/cloudsync/src/cloudsync-fops-c.py b/xlators/features/cloudsync/src/cloudsync-fops-c.py
new file mode 100755
index 00000000000..c27df97ae58
--- /dev/null
+++ b/xlators/features/cloudsync/src/cloudsync-fops-c.py
@@ -0,0 +1,324 @@
+#!/usr/bin/python3
+
+from __future__ import print_function
+import os
+import sys
+
+curdir = os.path.dirname(sys.argv[0])
+gendir = os.path.join(curdir, '../../../../libglusterfs/src')
+sys.path.append(gendir)
+from generator import ops, fop_subs, cbk_subs, generate
+
+# C template for the fd-based fop entry point cs_<name>().  The generated
+# function sets up a cs_local_t, marks the request dictionary with
+# GF_CS_OBJECT_STATUS, prepares a resume stub and then either winds to the
+# child (file state GF_CS_LOCAL) or calls locate_and_execute().
+# The @NAME@, @UPNAME@, @LONG_ARGS@, @SHORT_ARGS@ and @CBK_ERROR_ARGS@
+# placeholders are filled in through generate() in gen_defaults().
+FD_DATA_MODIFYING_OP_FOP_TEMPLATE = """
+int32_t
+cs_@NAME@ (call_frame_t *frame, xlator_t *this,
+           @LONG_ARGS@)
+{
+        int                         op_errno        = EINVAL ;
+        cs_local_t                 *local           = NULL;
+        int                         ret             = 0;
+        cs_inode_ctx_t             *ctx             = NULL;
+        gf_cs_obj_state             state           = -1;
+
+        VALIDATE_OR_GOTO (frame, err);
+        VALIDATE_OR_GOTO (this, err);
+        VALIDATE_OR_GOTO (fd, err);
+
+        local = cs_local_init (this, frame, NULL, fd, GF_FOP_@UPNAME@);
+        if (!local) {
+
+                gf_msg (this->name, GF_LOG_ERROR, 0, 0, "local init failed");
+                op_errno = ENOMEM;
+                goto err;
+        }
+
+        __cs_inode_ctx_get (this, fd->inode, &ctx);
+
+        if (ctx)
+                state = __cs_get_file_state (fd->inode, ctx);
+        else
+                state = GF_CS_LOCAL;
+
+        xdata = xdata ? dict_ref (xdata) : dict_new ();
+
+        if (!xdata) {
+                gf_msg (this->name, GF_LOG_ERROR, 0, 0, "insufficient memory");
+                op_errno = ENOMEM;
+                goto err;
+        }
+
+        local->xattr_req = xdata;
+
+        ret = dict_set_uint32 (local->xattr_req, GF_CS_OBJECT_STATUS, 1);
+        if (ret) {
+                gf_msg (this->name, GF_LOG_ERROR, 0, 0, "dict_set failed key:"
+                        " %s", GF_CS_OBJECT_STATUS);
+                goto err;
+        }
+
+        local->stub = fop_@NAME@_stub (frame, cs_resume_@NAME@,
+                                       @SHORT_ARGS@);
+        if (!local->stub) {
+                gf_msg (this->name, GF_LOG_ERROR, 0, 0, "insufficient memory");
+                op_errno = ENOMEM;
+                goto err;
+        }
+
+
+        if (state == GF_CS_LOCAL) {
+                STACK_WIND (frame, cs_@NAME@_cbk,
+                            FIRST_CHILD(this), FIRST_CHILD(this)->fops->@NAME@,
+                            @SHORT_ARGS@);
+        } else {
+                local->call_cnt++;
+                ret = locate_and_execute (frame);
+                if (ret) {
+                        op_errno = ENOMEM;
+                        goto err;
+                }
+        }
+
+        return 0;
+
+err:
+        CS_STACK_UNWIND (@NAME@, frame, -1, op_errno, @CBK_ERROR_ARGS@);
+
+        return 0;
+}
+"""
+
+# C template for cs_resume_<name>(), the function the fop template installs
+# in the call stub.  It runs cs_resume_postprocess(), releases the inodelk
+# and winds the fop to the child; if post-processing fails it unlocks and
+# finishes through cs_common_cbk().
+FD_DATA_MODIFYING_RESUME_OP_FOP_TEMPLATE = """
+int32_t
+cs_resume_@NAME@ (call_frame_t *frame, xlator_t *this,
+                  @LONG_ARGS@)
+{
+        int              ret    = 0;
+
+        ret = cs_resume_postprocess (this, frame, fd->inode);
+        if (ret) {
+                goto unwind;
+        }
+
+        cs_inodelk_unlock (frame);
+
+        STACK_WIND (frame, cs_@NAME@_cbk,
+                    FIRST_CHILD(this), FIRST_CHILD(this)->fops->@NAME@,
+                    @SHORT_ARGS@);
+
+        return 0;
+
+unwind:
+
+        cs_inodelk_unlock (frame);
+
+        cs_common_cbk (frame);
+
+        return 0;
+}
+"""
+# C template for cs_<name>_cbk(), the callback of the fd-based fops.
+# On failure it reads GF_CS_OBJECT_STATUS from the reply: on the first
+# callback for a remote or downloading file it goes on to
+# locate_and_execute(), otherwise it unwinds.  On success it records the
+# file as GF_CS_LOCAL and unwinds.
+FD_DATA_MODIFYING_OP_FOP_CBK_TEMPLATE = """
+int32_t
+cs_@NAME@_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+               int32_t op_ret, int32_t op_errno,
+               @LONG_ARGS@)
+{
+        cs_local_t      *local = NULL;
+        int              ret = 0;
+        uint64_t         val = 0;
+        fd_t            *fd = NULL;
+
+        local = frame->local;
+        fd = local->fd;
+
+        /* Do we need lock here? */
+        local->call_cnt++;
+
+        if (op_ret == -1) {
+                ret = dict_get_uint64 (xdata, GF_CS_OBJECT_STATUS, &val);
+                if (ret == 0) {
+                        if (val == GF_CS_ERROR) {
+                                gf_msg (this->name, GF_LOG_ERROR, 0, 0,
+                                        "could not get file state, unwinding");
+                                op_ret = -1;
+                                op_errno = EIO;
+                                goto unwind;
+                        } else {
+                                __cs_inode_ctx_update (this, fd->inode, val);
+                                gf_msg (this->name, GF_LOG_INFO, 0, 0,
+                                        " state = %" PRIu64, val);
+
+                                if (local->call_cnt == 1 &&
+                                    (val == GF_CS_REMOTE ||
+                                     val == GF_CS_DOWNLOADING))  {
+                                        gf_msg (this->name, GF_LOG_INFO, 0,
+                                                0, " will repair and download "
+                                                "the file, current state : %"
+                                                PRIu64, val);
+                                        goto repair;
+                                } else {
+                                        gf_msg (this->name, GF_LOG_ERROR, 0, 0,
+                                                "second @NAME@, Unwinding");
+                                        goto unwind;
+                                }
+                        }
+                } else {
+                        gf_msg (this->name, GF_LOG_ERROR, 0, 0, "file state "
+                                "could not be figured, unwinding");
+                        goto unwind;
+                }
+        } else {
+                /* successful @NAME@ => file is local */
+                __cs_inode_ctx_update (this, fd->inode, GF_CS_LOCAL);
+                gf_msg (this->name, GF_LOG_INFO, 0, 0, "state : GF_CS_LOCAL"
+                        ", @NAME@ successful");
+
+                goto unwind;
+        }
+
+repair:
+        ret = locate_and_execute (frame);
+        if (ret) {
+                goto unwind;
+        }
+
+        return 0;
+
+unwind:
+        CS_STACK_UNWIND (@NAME@, frame, op_ret, op_errno, @SHORT_ARGS@);
+
+        return 0;
+}
+"""
+
+# C template for the loc-based fop entry point cs_<name>().  Directories
+# are wound to the child unchanged; for everything else the request
+# dictionary is marked with GF_CS_OBJECT_STATUS before winding.
+LOC_STAT_OP_FOP_TEMPLATE = """
+int32_t
+cs_@NAME@ (call_frame_t *frame, xlator_t *this,
+           @LONG_ARGS@)
+{
+        int              op_errno = EINVAL;
+        cs_local_t      *local = NULL;
+        int              ret   = 0;
+
+        local = cs_local_init (this, frame, loc, NULL, GF_FOP_@UPNAME@);
+        if (!local) {
+                gf_msg (this->name, GF_LOG_ERROR, 0, 0, "local is NULL");
+                op_errno = ENOMEM;
+                goto err;
+        }
+
+        if (loc->inode->ia_type == IA_IFDIR)
+                goto wind;
+
+        xdata = xdata ? dict_ref (xdata) : dict_new ();
+
+        if (!xdata) {
+                gf_msg (this->name, GF_LOG_ERROR, 0, 0, "insufficient memory");
+                op_errno = ENOMEM;
+                goto err;
+        }
+
+        local->xattr_req = xdata;
+
+        ret = dict_set_uint32 (local->xattr_req, GF_CS_OBJECT_STATUS, 1);
+        if (ret) {
+                gf_msg (this->name, GF_LOG_ERROR, 0, 0, "dict_set failed key:"
+                        " %s", GF_CS_OBJECT_STATUS);
+                goto err;
+        }
+
+wind:
+        STACK_WIND (frame, cs_@NAME@_cbk, FIRST_CHILD(this),
+                    FIRST_CHILD(this)->fops->@NAME@,
+                    @SHORT_ARGS@);
+
+        return 0;
+err:
+        CS_STACK_UNWIND (@NAME@, frame, -1, op_errno, @CBK_ERROR_ARGS@);
+
+        return 0;
+}
+"""
+
+# C template for cs_<name>_cbk(), the callback of the loc-based fops.
+# On success it stores the GF_CS_OBJECT_STATUS value from the reply in the
+# inode context (when present); on failure it resets the inode context.
+# It always unwinds with the values it received.
+LOC_STAT_OP_FOP_CBK_TEMPLATE = """
+int32_t
+cs_@NAME@_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+               int32_t op_ret, int32_t op_errno,
+               @LONG_ARGS@)
+{
+        int              ret = 0;
+        uint64_t         val = 0;
+        loc_t           *loc = NULL;
+        cs_local_t      *local = NULL;
+
+        local = frame->local;
+
+        loc = &local->loc;
+
+        if (op_ret == 0) {
+                ret = dict_get_uint64 (xdata, GF_CS_OBJECT_STATUS, &val);
+                if (!ret) {
+                        ret = __cs_inode_ctx_update (this, loc->inode, val);
+                        if (ret) {
+                                gf_msg (this->name, GF_LOG_ERROR, 0, 0,
+                                        "ctx update failed");
+                        }
+                }
+        } else {
+                cs_inode_ctx_reset (this, loc->inode);
+        }
+
+        CS_STACK_UNWIND (@NAME@, frame, op_ret, op_errno, @SHORT_ARGS@);
+
+        return 0;
+}
+"""
+
+# For clarity, the lists in this section classify all xlator FOPs.
+# They are reference material only: the lists themselves are not used.
+entry_ops = ['mknod', 'mkdir', 'unlink', 'rmdir', 'symlink', 'rename', 'link',
+             'create']
+special_ops = ['statfs', 'lookup', 'ipc', 'compound', 'icreate', 'namelink']
+ignored_ops = ['getspec']
+inode_ops = ['stat', 'readlink', 'truncate', 'open', 'setxattr', 'getxattr',
+             'removexattr', 'opendir', 'access', 'inodelk', 'entrylk',
+             'xattrop', 'setattr', 'lease', 'getactivelk', 'setactivelk',
+             'discover']
+fd_ops = ['readv', 'writev', 'flush', 'fsync', 'fsyncdir', 'ftruncate',
+          'fstat', 'lk', 'readdir', 'finodelk', 'fentrylk', 'fxattrop',
+          'fsetxattr', 'fgetxattr', 'rchecksum', 'fsetattr', 'readdirp',
+          'fremovexattr', 'fallocate', 'discard', 'zerofill', 'seek']
+
+
+# These are the lists actually used to generate the code.
+
+# Fd-based fops that modify data; gen_defaults() emits the callback,
+# resume and fop functions for each of them.
+fd_data_modify_op_fop_template = ['writev', 'flush', 'fsync',
+                                  'ftruncate', 'rchecksum', 'fallocate',
+                                  'discard', 'zerofill', 'seek']
+
+# Entry (loc) based fops that do not change data; gen_defaults() emits the
+# callback and fop functions for each of them.
+loc_stat_op_fop_template = ['lookup', 'stat', 'discover', 'access', 'setattr',
+                            'getattr']
+
+# These fops need a separate implementation; gen_defaults() does not
+# reference this list.
+special_fops = ['statfs', 'setxattr', 'unlink', 'getxattr',
+                'truncate', 'fstat', 'readv', 'readdirp']
+
+def gen_defaults():
+    for name in ops:
+        if name in fd_data_modify_op_fop_template:
+            print(generate(FD_DATA_MODIFYING_OP_FOP_CBK_TEMPLATE, name, cbk_subs))
+            print(generate(FD_DATA_MODIFYING_RESUME_OP_FOP_TEMPLATE, name, fop_subs))
+            print(generate(FD_DATA_MODIFYING_OP_FOP_TEMPLATE, name, fop_subs))
+        elif name in loc_stat_op_fop_template:
+            print(generate(LOC_STAT_OP_FOP_CBK_TEMPLATE, name, cbk_subs))
+            print(generate(LOC_STAT_OP_FOP_TEMPLATE, name, fop_subs))
+
+for l in open(sys.argv[1], 'r').readlines():
+    if l.find('#pragma generate') != -1:
+        print("/* BEGIN GENERATED CODE - DO NOT MODIFY */")
+        gen_defaults()
+        print("/* END GENERATED CODE */")
+    else:
+        print(l[:-1])
diff --git a/xlators/features/cloudsync/src/cloudsync-fops-h.py b/xlators/features/cloudsync/src/cloudsync-fops-h.py
new file mode 100755
index 00000000000..faa2de651a7
--- /dev/null
+++ b/xlators/features/cloudsync/src/cloudsync-fops-h.py
@@ -0,0 +1,31 @@
+#!/usr/bin/python3
+
+from __future__ import print_function
+import os
+import sys
+
+# Locate libglusterfs/src relative to this script (argv[0]) and put it on the
+# import path so that the shared 'generator' module can be imported.
+curdir = os.path.dirname(sys.argv[0])
+gendir = os.path.join(curdir, '../../../../libglusterfs/src')
+sys.path.append(gendir)
+from generator import ops, fop_subs, cbk_subs, generate
+
+# Prototype printed for each fop. @NAME@ and @LONG_ARGS@ are placeholders,
+# presumably substituted by generate() -- see the generator module.
+OP_FOP_TEMPLATE = """
+int32_t
+cs_@NAME@ (call_frame_t *frame, xlator_t *this,
+                   @LONG_ARGS@);
+"""
+
+def gen_defaults():
+    for name, value in ops.items():
+        if name == 'getspec':
+            continue
+        print(generate(OP_FOP_TEMPLATE, name, fop_subs))
+
+
+for l in open(sys.argv[1], 'r').readlines():
+    if l.find('#pragma generate') != -1:
+        print("/* BEGIN GENERATED CODE - DO NOT MODIFY */")
+        gen_defaults()
+        print("/* END GENERATED CODE */")
+    else:
+        print(l[:-1])
diff --git a/xlators/features/cloudsync/src/cloudsync-mem-types.h b/xlators/features/cloudsync/src/cloudsync-mem-types.h
new file mode 100644
index 00000000000..220346405d0
--- /dev/null
+++ b/xlators/features/cloudsync/src/cloudsync-mem-types.h
@@ -0,0 +1,22 @@
+/*
+ *   Copyright (c) 2018 Red Hat, Inc. <http://www.redhat.com>
+ *   This file is part of GlusterFS.
+ *
+ *   This file is licensed to you under your choice of the GNU Lesser
+ *   General Public License, version 3 or any later version (LGPLv3 or
+ *   later), or the GNU General Public License, version 2 (GPLv2), in all
+ *   cases as published by the Free Software Foundation.
+ */
+
+#ifndef __CLOUDSYNC_MEM_TYPES_H__
+#define __CLOUDSYNC_MEM_TYPES_H__
+
+#include <glusterfs/mem-types.h>
+/* Allocation type ids used by the cloudsync translator. Numbering starts
+ * right after the common ids (gf_common_mt_end); gf_cs_mt_end marks the end
+ * of the list. */
+enum cs_mem_types_ {
+    gf_cs_mt_cs_private_t = gf_common_mt_end + 1,
+    gf_cs_mt_cs_remote_stores_t,
+    gf_cs_mt_cs_inode_ctx_t,
+    gf_cs_mt_cs_lxattr_t,
+    gf_cs_mt_end
+};
+#endif /* __CLOUDSYNC_MEM_TYPES_H__ */
diff --git a/xlators/features/cloudsync/src/cloudsync-messages.h b/xlators/features/cloudsync/src/cloudsync-messages.h
new file mode 100644
index 00000000000..fb08f72de7f
--- /dev/null
+++ b/xlators/features/cloudsync/src/cloudsync-messages.h
@@ -0,0 +1,16 @@
+/*
+ *   Copyright (c) 2018 Red Hat, Inc. <http://www.redhat.com>
+ *   This file is part of GlusterFS.
+ *
+ *   This file is licensed to you under your choice of the GNU Lesser
+ *   General Public License, version 3 or any later version (LGPLv3 or
+ *   later), or the GNU General Public License, version 2 (GPLv2), in all
+ *   cases as published by the Free Software Foundation.
+ */
+
+#ifndef __CLOUDSYNC_MESSAGES_H__
+#define __CLOUDSYNC_MESSAGES_H__
+
+/*TODO: define relevant message ids */
+
+#endif /* __CLOUDSYNC_MESSAGES_H__ */
diff --git a/xlators/features/cloudsync/src/cloudsync-plugins/Makefile.am b/xlators/features/cloudsync/src/cloudsync-plugins/Makefile.am
new file mode 100644
index 00000000000..a985f42a877
--- /dev/null
+++ b/xlators/features/cloudsync/src/cloudsync-plugins/Makefile.am
@@ -0,0 +1,3 @@
+SUBDIRS = src
+
+CLEANFILES =
diff --git a/xlators/features/cloudsync/src/cloudsync-plugins/src/Makefile.am b/xlators/features/cloudsync/src/cloudsync-plugins/src/Makefile.am
new file mode 100644
index 00000000000..fb6b0580c6d
--- /dev/null
+++ b/xlators/features/cloudsync/src/cloudsync-plugins/src/Makefile.am
@@ -0,0 +1,11 @@
+if BUILD_AMAZONS3_PLUGIN
+  AMAZONS3_DIR = cloudsyncs3
+endif
+
+if BUILD_CVLT_PLUGIN
+  CVLT_DIR = cvlt
+endif
+
+SUBDIRS = ${AMAZONS3_DIR} ${CVLT_DIR}
+
+CLEANFILES =
diff --git a/xlators/features/cloudsync/src/cloudsync-plugins/src/cloudsyncs3/Makefile.am b/xlators/features/cloudsync/src/cloudsync-plugins/src/cloudsyncs3/Makefile.am
new file mode 100644
index 00000000000..a985f42a877
--- /dev/null
+++ b/xlators/features/cloudsync/src/cloudsync-plugins/src/cloudsyncs3/Makefile.am
@@ -0,0 +1,3 @@
+SUBDIRS = src
+
+CLEANFILES =
diff --git a/xlators/features/cloudsync/src/cloudsync-plugins/src/cloudsyncs3/src/Makefile.am b/xlators/features/cloudsync/src/cloudsync-plugins/src/cloudsyncs3/src/Makefile.am
new file mode 100644
index 00000000000..6509426ef87
--- /dev/null
+++ b/xlators/features/cloudsync/src/cloudsync-plugins/src/cloudsyncs3/src/Makefile.am
@@ -0,0 +1,12 @@
+csp_LTLIBRARIES = cloudsyncs3.la
+cspdir = $(libdir)/glusterfs/$(PACKAGE_VERSION)/cloudsync-plugins
+
+cloudsyncs3_la_SOURCES = libcloudsyncs3.c  $(top_srcdir)/xlators/features/cloudsync/src/cloudsync-common.c
+cloudsyncs3_la_LIBADD = $(top_builddir)/libglusterfs/src/libglusterfs.la
+cloudsyncs3_la_LDFLAGS = -module -export-symbols $(top_srcdir)/xlators/features/cloudsync/src/cloudsync-plugins/src/cloudsyncs3/src/libcloudsyncs3.sym $(GF_XLATOR_LDFLAGS)
+AM_CPPFLAGS = $(GF_CPPFLAGS) -I$(top_srcdir)/libglusterfs/src   -I$(top_srcdir)/rpc/xdr/src -I$(top_builddir)/rpc/xdr/src -lcurlpp -lcryptopp
+noinst_HEADERS = libcloudsyncs3.h libcloudsyncs3-mem-types.h
+AM_CFLAGS = -Wall -fno-strict-aliasing $(GF_CFLAGS) -lcurl -lcrypto -I$(top_srcdir)/xlators/features/cloudsync/src
+CLEANFILES =
+
+EXTRA_DIST = libcloudsyncs3.sym
diff --git a/xlators/features/cloudsync/src/cloudsync-plugins/src/cloudsyncs3/src/libcloudsyncs3-mem-types.h b/xlators/features/cloudsync/src/cloudsync-plugins/src/cloudsyncs3/src/libcloudsyncs3-mem-types.h
new file mode 100644
index 00000000000..7ccfcc9f4b6
--- /dev/null
+++ b/xlators/features/cloudsync/src/cloudsync-plugins/src/cloudsyncs3/src/libcloudsyncs3-mem-types.h
@@ -0,0 +1,19 @@
+/*
+ *   Copyright (c) 2018 Red Hat, Inc. <http://www.redhat.com>
+ *   This file is part of GlusterFS.
+ *
+ *   This file is licensed to you under your choice of the GNU Lesser
+ *   General Public License, version 3 or any later version (LGPLv3 or
+ *   later), or the GNU General Public License, version 2 (GPLv2), in all
+ *   cases as published by the Free Software Foundation.
+ */
+
+#ifndef __LIBAWS_MEM_TYPES_H__
+#define __LIBAWS_MEM_TYPES_H__
+
+#include <glusterfs/mem-types.h>
+/* Allocation type ids used by the S3 plugin. Numbering starts right after
+ * the common ids (gf_common_mt_end); gf_libaws_mt_end marks the end of the
+ * list. */
+enum libaws_mem_types_ {
+    gf_libaws_mt_aws_private_t = gf_common_mt_end + 1,
+    gf_libaws_mt_end
+};
+#endif /* __LIBAWS_MEM_TYPES_H__ */
diff --git a/xlators/features/cloudsync/src/cloudsync-plugins/src/cloudsyncs3/src/libcloudsyncs3.c b/xlators/features/cloudsync/src/cloudsync-plugins/src/cloudsyncs3/src/libcloudsyncs3.c
new file mode 100644
index 00000000000..23c3599825a
--- /dev/null
+++ b/xlators/features/cloudsync/src/cloudsync-plugins/src/cloudsyncs3/src/libcloudsyncs3.c
@@ -0,0 +1,584 @@
+/*
+  Copyright (c) 2018 Red Hat, Inc. <http://www.redhat.com>
+  This file is part of GlusterFS.
+
+  This file is licensed to you under your choice of the GNU Lesser
+  General Public License, version 3 or any later version (LGPLv3 or
+  later), or the GNU General Public License, version 2 (GPLv2), in all
+  cases as published by the Free Software Foundation.
+*/
+
+#include <stdlib.h>
+#include <openssl/hmac.h>
+#include <openssl/evp.h>
+#include <openssl/bio.h>
+#include <openssl/buffer.h>
+#include <openssl/crypto.h>
+#include <curl/curl.h>
+#include <glusterfs/xlator.h>
+#include <glusterfs/glusterfs.h>
+#include "libcloudsyncs3.h"
+#include "cloudsync-common.h"
+
+#define RESOURCE_SIZE 4096
+
+/* Method table of the S3 store plugin. "store_ops" is the only symbol listed
+ * in libcloudsyncs3.sym, i.e. the only one exported from the module. */
+store_methods_t store_ops = {
+    .fop_download = aws_download_s3,
+    .fop_init = aws_init,
+    .fop_reconfigure = aws_reconfigure,
+    .fop_fini = aws_fini,
+};
+
+/* Configuration and download state of the S3 plugin. The string members are
+ * private copies (gf_strdup) of the corresponding volume options. */
+typedef struct aws_private {
+    /* copy of option "s3plugin-hostname" */
+    char *hostname;
+    /* copy of option "s3plugin-bucketid" */
+    char *bucketid;
+    /* copy of option "s3plugin-seckey"; used as the HMAC key when signing */
+    char *awssekey;
+    /* copy of option "s3plugin-keyid" */
+    char *awskeyid;
+    /* when true, aws_write_callback() returns 0 so that curl stops */
+    gf_boolean_t abortdl;
+    /* taken around abortdl accesses and option updates */
+    pthread_spinlock_t lock;
+} aws_private_t;
+
+/*
+ * aws_init - build the plugin's private configuration from this->options.
+ *
+ * Copies the "s3plugin-seckey", "s3plugin-keyid", "s3plugin-bucketid" and
+ * "s3plugin-hostname" options when they are present. Missing options are not
+ * an error here; aws_download_s3() rejects an incomplete configuration.
+ *
+ * Returns the new aws_private_t as an opaque pointer, or NULL on failure
+ * (allocation or spinlock initialisation).
+ */
+void *
+aws_init(xlator_t *this)
+{
+    aws_private_t *priv = NULL;
+    char *temp_str = NULL;
+    int ret = 0;
+
+    priv = GF_CALLOC(1, sizeof(aws_private_t), gf_libaws_mt_aws_private_t);
+    if (!priv) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, 0, "insufficient memory");
+        return NULL;
+    }
+
+    priv->abortdl = _gf_false;
+
+    ret = pthread_spin_init(&priv->lock, PTHREAD_PROCESS_PRIVATE);
+    if (ret != 0) {
+        /* Nothing else may touch the lock if it could not be set up. */
+        gf_msg(this->name, GF_LOG_ERROR, ret, 0,
+               "initializing aws spinlock failed");
+        GF_FREE(priv);
+        return NULL;
+    }
+
+    pthread_spin_lock(&(priv->lock));
+    {
+        if (dict_get_str(this->options, "s3plugin-seckey", &temp_str) == 0) {
+            priv->awssekey = gf_strdup(temp_str);
+            if (!priv->awssekey) {
+                gf_msg(this->name, GF_LOG_ERROR, ENOMEM, 0,
+                       "initializing aws secret key failed");
+                ret = -1;
+                goto unlock;
+            }
+        }
+
+        if (dict_get_str(this->options, "s3plugin-keyid", &temp_str) == 0) {
+            priv->awskeyid = gf_strdup(temp_str);
+            if (!priv->awskeyid) {
+                gf_msg(this->name, GF_LOG_ERROR, ENOMEM, 0,
+                       "initializing aws key ID failed");
+                ret = -1;
+                goto unlock;
+            }
+        }
+
+        if (dict_get_str(this->options, "s3plugin-bucketid", &temp_str) == 0) {
+            priv->bucketid = gf_strdup(temp_str);
+            if (!priv->bucketid) {
+                gf_msg(this->name, GF_LOG_ERROR, ENOMEM, 0,
+                       "initializing aws bucketid failed");
+
+                ret = -1;
+                goto unlock;
+            }
+        }
+
+        if (dict_get_str(this->options, "s3plugin-hostname", &temp_str) == 0) {
+            priv->hostname = gf_strdup(temp_str);
+            if (!priv->hostname) {
+                gf_msg(this->name, GF_LOG_ERROR, ENOMEM, 0,
+                       "initializing aws hostname failed");
+
+                ret = -1;
+                goto unlock;
+            }
+        }
+
+        /* Never write the secret key to the log; only report whether it is
+         * set. Unset options are printed as "<unset>" instead of passing a
+         * NULL pointer to %s. */
+        gf_msg_debug(this->name, 0,
+                     "stored key: %s id: %s "
+                     "bucketid %s hostname: %s",
+                     priv->awssekey ? "<redacted>" : "<unset>",
+                     priv->awskeyid ? priv->awskeyid : "<unset>",
+                     priv->bucketid ? priv->bucketid : "<unset>",
+                     priv->hostname ? priv->hostname : "<unset>");
+    }
+unlock:
+    pthread_spin_unlock(&(priv->lock));
+
+    if (ret == -1) {
+        GF_FREE(priv->awskeyid);
+        GF_FREE(priv->awssekey);
+        GF_FREE(priv->bucketid);
+        GF_FREE(priv->hostname);
+        /* Pair the successful pthread_spin_init() above. */
+        pthread_spin_destroy(&priv->lock);
+        GF_FREE(priv);
+        priv = NULL;
+    }
+
+    return (void *)priv;
+}
+
+/*
+ * Replace *field with a copy of option 'key' if the option is present in
+ * 'options'. On allocation failure *field keeps its previous value.
+ *
+ * Returns 0 on success or when the option is absent, -1 on ENOMEM.
+ */
+static int
+aws_update_option(xlator_t *this, dict_t *options, char *key, char **field,
+                  const char *errmsg)
+{
+    char *value = NULL;
+    char *copy = NULL;
+
+    if (dict_get_str(options, key, &value) != 0)
+        return 0;
+
+    copy = gf_strdup(value);
+    if (!copy) {
+        gf_msg(this->name, GF_LOG_ERROR, ENOMEM, 0, "%s", errmsg);
+        return -1;
+    }
+
+    /* NOTE(review): the previous string is intentionally not freed here.
+     * aws_download_s3() reads these fields without taking priv->lock, so
+     * freeing it could race with a download in flight. The old value is
+     * therefore still leaked, as before; fix both sides together. */
+    *field = copy;
+
+    return 0;
+}
+
+/*
+ * aws_reconfigure - refresh the S3 settings from a new option dictionary.
+ *
+ * Each of "s3plugin-seckey", "s3plugin-keyid", "s3plugin-bucketid" and
+ * "s3plugin-hostname" that is present in 'options' replaces the stored
+ * value. Processing stops at the first allocation failure; values updated
+ * before the failure stay updated.
+ *
+ * Returns 0 on success, -1 if the plugin configuration is missing or an
+ * allocation failed.
+ */
+int
+aws_reconfigure(xlator_t *this, dict_t *options)
+{
+    aws_private_t *priv = NULL;
+    int ret = 0;
+    cs_private_t *cspriv = NULL;
+
+    cspriv = this->private;
+
+    /* Guard the whole chain, not just the last link. */
+    if (cspriv && cspriv->stores)
+        priv = cspriv->stores->config;
+
+    if (!priv) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, 0, "null priv");
+        return -1;
+    }
+
+    pthread_spin_lock(&(priv->lock));
+    {
+        ret = aws_update_option(this, options, "s3plugin-seckey",
+                                &priv->awssekey,
+                                "initializing aws secret key failed");
+        if (ret)
+            goto out;
+
+        ret = aws_update_option(this, options, "s3plugin-keyid",
+                                &priv->awskeyid,
+                                "initializing aws key ID failed");
+        if (ret)
+            goto out;
+
+        ret = aws_update_option(this, options, "s3plugin-bucketid",
+                                &priv->bucketid,
+                                "initializing aws bucketid failed");
+        if (ret)
+            goto out;
+
+        ret = aws_update_option(this, options, "s3plugin-hostname",
+                                &priv->hostname,
+                                "initializing aws hostname failed");
+    }
+out:
+    pthread_spin_unlock(&(priv->lock));
+
+    /* Never write the secret key to the log; only report whether it is set.
+     * Unset options are printed as "<unset>" instead of passing NULL to %s. */
+    gf_msg_debug(this->name, 0,
+                 "stored key: %s id: %s "
+                 "bucketid %s hostname: %s",
+                 priv->awssekey ? "<redacted>" : "<unset>",
+                 priv->awskeyid ? priv->awskeyid : "<unset>",
+                 priv->bucketid ? priv->bucketid : "<unset>",
+                 priv->hostname ? priv->hostname : "<unset>");
+
+    return ret;
+}
+
+/*
+ * aws_fini - release everything aws_init() allocated.
+ *
+ * config: the aws_private_t returned by aws_init(); NULL is accepted and
+ *         ignored.
+ */
+void
+aws_fini(void *config)
+{
+    aws_private_t *priv = NULL;
+
+    /* Must come from 'config': assigning priv to itself left it NULL, so
+     * nothing was ever freed. */
+    priv = (aws_private_t *)config;
+
+    if (priv) {
+        GF_FREE(priv->hostname);
+        GF_FREE(priv->bucketid);
+        GF_FREE(priv->awssekey);
+        GF_FREE(priv->awskeyid);
+
+        pthread_spin_destroy(&priv->lock);
+        GF_FREE(priv);
+    }
+}
+
+/*
+ * mem_acct_init - set up memory accounting for the plugin's allocation types.
+ *
+ * Returns the result of xlator_mem_acct_init(), or -1 if 'this' is NULL.
+ */
+int32_t
+mem_acct_init(xlator_t *this)
+{
+    int ret = -1;
+
+    /* Log under this plugin's domain ("CS", as elsewhere in this file)
+     * rather than the copy-pasted "dht". */
+    GF_VALIDATE_OR_GOTO("CS", this, out);
+
+    ret = xlator_mem_acct_init(this, gf_libaws_mt_end + 1);
+
+    if (ret != 0) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, 0, "Memory accounting init failed");
+        return ret;
+    }
+out:
+    return ret;
+}
+/*
+ * aws_form_request - build the string that gets signed for an S3 request.
+ *
+ * resource: caller-provided buffer of RESOURCE_SIZE bytes; receives
+ *           "<bucketid>/<filepath>".
+ * date:     out; receives a newly allocated HTTP date string (GMT). It may
+ *           be set even when NULL is returned, so the caller must GF_FREE()
+ *           it in every case.
+ * reqtype:  HTTP verb, e.g. "GET".
+ *
+ * Returns a newly allocated string of the form
+ * "<reqtype>\n\n\n<date>\n/<resource>" (caller frees with GF_FREE), or NULL
+ * on failure.
+ */
+char *
+aws_form_request(char *resource, char **date, char *reqtype, char *bucketid,
+                 char *filepath)
+{
+    char httpdate[256];
+    time_t now;
+    struct tm gtime = {
+        0,
+    };
+    char *sign_req = NULL;
+    int signreq_len = -1;
+    int date_len = -1;
+    int res_len = -1;
+
+    now = gf_time();
+    /* gmtime() hands back a shared static buffer; use the reentrant variant
+     * because downloads can run on several threads. */
+    if (gmtime_r(&now, &gtime) == NULL) {
+        gf_msg("CS", GF_LOG_ERROR, 0, 0, "failed to convert current time");
+        goto out;
+    }
+
+    date_len = strftime(httpdate, sizeof(httpdate),
+                        "%a, %d %b %Y %H:%M:%S +0000", &gtime);
+    if (date_len == 0) {
+        gf_msg("CS", GF_LOG_ERROR, 0, 0, "failed to format request date");
+        goto out;
+    }
+
+    *date = gf_strndup(httpdate, date_len);
+    if (*date == NULL) {
+        gf_msg("CS", GF_LOG_ERROR, ENOMEM, 0,
+               "memory allocation "
+               "failure for date");
+        goto out;
+    }
+
+    res_len = snprintf(resource, RESOURCE_SIZE, "%s/%s", bucketid, filepath);
+    if (res_len < 0 || res_len >= RESOURCE_SIZE) {
+        /* A truncated resource would be signed and requested as a different
+         * object, so refuse instead of continuing. */
+        gf_msg("CS", GF_LOG_ERROR, ENAMETOOLONG, 0,
+               "resource path too long for bucket %s", bucketid);
+        goto out;
+    }
+
+    gf_msg_debug("CS", 0, "resource %s", resource);
+
+    /* 6 accounts for the 4 new line chars, one forward slash and
+     * one null char */
+    signreq_len = res_len + date_len + strlen(reqtype) + 6;
+
+    sign_req = GF_MALLOC(signreq_len, gf_common_mt_char);
+    if (sign_req == NULL) {
+        gf_msg("CS", GF_LOG_ERROR, ENOMEM, 0,
+               "memory allocation "
+               "failure for sign_req");
+        goto out;
+    }
+
+    snprintf(sign_req, signreq_len, "%s\n\n%s\n%s\n/%s", reqtype, "", *date,
+             resource);
+
+out:
+    return sign_req;
+}
+
+/*
+ * aws_b64_encode - base64-encode 'length' bytes of 'input'.
+ *
+ * The last byte produced by the base64 BIO is replaced by the terminating
+ * NUL. NOTE(review): this presumably drops the newline the BIO appends --
+ * confirm against the OpenSSL documentation.
+ *
+ * Returns a newly allocated NUL-terminated string (caller frees with
+ * GF_FREE), or NULL on failure.
+ */
+char *
+aws_b64_encode(const unsigned char *input, int length)
+{
+    BIO *bio = NULL;
+    BIO *b64 = NULL;
+    BUF_MEM *bptr = NULL;
+    char *buff = NULL;
+
+    b64 = BIO_new(BIO_f_base64());
+    bio = BIO_new(BIO_s_mem());
+    if (!b64 || !bio) {
+        /* The chain is not linked yet; free whichever half exists. */
+        if (b64)
+            BIO_free(b64);
+        if (bio)
+            BIO_free(bio);
+        return NULL;
+    }
+
+    b64 = BIO_push(b64, bio);
+    if (BIO_write(b64, input, length) <= 0)
+        goto out;
+    if (BIO_flush(b64) <= 0)
+        goto out;
+    BIO_get_mem_ptr(b64, &bptr);
+
+    /* An empty buffer would make "length - 1" wrap around below. */
+    if (!bptr || bptr->length == 0)
+        goto out;
+
+    buff = GF_MALLOC(bptr->length, gf_common_mt_char);
+    if (!buff)
+        goto out;
+
+    memcpy(buff, bptr->data, bptr->length - 1);
+    buff[bptr->length - 1] = 0;
+
+out:
+    BIO_free_all(b64);
+
+    return buff;
+}
+
+/*
+ * aws_sign_request - compute the base64-encoded HMAC-SHA1 of 'str' using
+ * 'awssekey' as the key.
+ *
+ * Returns a newly allocated string (caller frees with GF_FREE), or NULL if
+ * the HMAC context could not be created, no digest was produced, or the
+ * encoding failed.
+ */
+char *
+aws_sign_request(char *const str, char *awssekey)
+{
+#if (OPENSSL_VERSION_NUMBER < 0x1010002f)
+    HMAC_CTX ctx;
+#endif
+    HMAC_CTX *pctx = NULL;
+
+    unsigned char md[256];
+    /* Stays 0 if HMAC_Final() does not produce a digest. */
+    unsigned len = 0;
+    char *base64 = NULL;
+
+#if (OPENSSL_VERSION_NUMBER < 0x1010002f)
+    HMAC_CTX_init(&ctx);
+    pctx = &ctx;
+#else
+    pctx = HMAC_CTX_new();
+    if (!pctx) {
+        gf_msg("CS", GF_LOG_ERROR, ENOMEM, 0,
+               "failed to allocate HMAC context");
+        return NULL;
+    }
+#endif
+    HMAC_Init_ex(pctx, awssekey, strlen(awssekey), EVP_sha1(), NULL);
+    HMAC_Update(pctx, (unsigned char *)str, strlen(str));
+    HMAC_Final(pctx, (unsigned char *)md, &len);
+
+#if (OPENSSL_VERSION_NUMBER < 0x1010002f)
+    HMAC_CTX_cleanup(pctx);
+#else
+    HMAC_CTX_free(pctx);
+#endif
+    /* Do not hand an empty digest to the encoder. */
+    if (len == 0)
+        return NULL;
+
+    base64 = aws_b64_encode(md, len);
+
+    return base64;
+}
+
+/*
+ * aws_dlwritev_cbk - completion callback for the writev wound by
+ * aws_write_callback().
+ *
+ * On a failed write (op_ret == -1) the plugin's abortdl flag is raised so
+ * that the next aws_write_callback() invocation stops the transfer. The
+ * frame is always destroyed.
+ *
+ * Returns op_ret unchanged.
+ */
+int
+aws_dlwritev_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int op_ret,
+                 int op_errno, struct iatt *prebuf, struct iatt *postbuf,
+                 dict_t *xdata)
+{
+    cs_private_t *xl_priv = NULL;
+    aws_private_t *priv = NULL;
+
+    if (op_ret == -1) {
+        /* errnum is the third argument, as in the other gf_msg() calls in
+         * this file; it used to be passed in the msgid slot. */
+        gf_msg(this->name, GF_LOG_ERROR, op_errno, 0,
+               "write failed "
+               ". Aborting Download");
+
+        /* this->private is the cloudsync cs_private_t; the plugin state
+         * hangs off stores->config, exactly as aws_write_callback() and
+         * aws_reconfigure() look it up. */
+        xl_priv = this->private;
+        priv = xl_priv->stores->config;
+        pthread_spin_lock(&(priv->lock));
+        {
+            priv->abortdl = _gf_true;
+        }
+        pthread_spin_unlock(&(priv->lock));
+    }
+
+    CS_STACK_DESTROY(frame);
+
+    return op_ret;
+}
+
+/*
+ * aws_write_callback - curl write callback (CURLOPT_WRITEFUNCTION).
+ *
+ * Copies the received chunk into an iobuf and winds a writev to the first
+ * child at local->dloffset, then advances the offset.
+ *
+ * dlbuf:     data received by curl (size * nitems bytes).
+ * mainframe: the call frame registered through CURLOPT_WRITEDATA.
+ *
+ * Returns size * nitems when the chunk was queued. Returns 0 when the
+ * download was flagged for abort or the chunk could not be queued; a return
+ * value different from the size passed in makes curl stop the transfer.
+ */
+size_t
+aws_write_callback(void *dlbuf, size_t size, size_t nitems, void *mainframe)
+{
+    call_frame_t *frame = NULL;
+    fd_t *dlfd = NULL;
+    int ret = 0;
+    cs_local_t *local = NULL;
+    struct iovec iov = {
+        0,
+    };
+    struct iobref *iobref = NULL;
+    struct iobuf *iobuf = NULL;
+    struct iovec dliov = {
+        0,
+    };
+    size_t tsize = 0;
+    xlator_t *this = NULL;
+    cs_private_t *xl_priv = NULL;
+    aws_private_t *priv = NULL;
+    call_frame_t *dlframe = NULL;
+
+    frame = (call_frame_t *)mainframe;
+    this = frame->this;
+    xl_priv = this->private;
+    priv = xl_priv->stores->config;
+
+    pthread_spin_lock(&(priv->lock));
+    {
+        /* returning size other than the size passed from curl will
+         * abort further download*/
+        if (priv->abortdl) {
+            gf_msg(this->name, GF_LOG_ERROR, 0, 0, "aborting download");
+            pthread_spin_unlock(&(priv->lock));
+            return 0;
+        }
+    }
+    pthread_spin_unlock(&(priv->lock));
+
+    local = frame->local;
+    dlfd = local->dlfd;
+    tsize = size * nitems;
+
+    dliov.iov_base = (void *)dlbuf;
+    dliov.iov_len = tsize;
+
+    ret = iobuf_copy(this->ctx->iobuf_pool, &dliov, 1, &iobref, &iobuf, &iov);
+    if (ret) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, 0, "iobuf_copy failed");
+        /* Report the chunk as not consumed so that curl aborts, as the
+         * copy_frame() failure path below does; otherwise the data would
+         * be dropped silently and the download reported as successful. */
+        tsize = 0;
+        goto out;
+    }
+
+    /* copy frame */
+    dlframe = copy_frame(frame);
+    if (!dlframe) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, 0, "copy_frame failed");
+        tsize = 0;
+        goto out;
+    }
+
+    STACK_WIND(dlframe, aws_dlwritev_cbk, FIRST_CHILD(this),
+               FIRST_CHILD(this)->fops->writev, dlfd, &iov, 1, local->dloffset,
+               0, iobref, NULL);
+
+    local->dloffset += tsize;
+
+out:
+    if (iobuf)
+        iobuf_unref(iobuf);
+    if (iobref)
+        iobref_unref(iobref);
+
+    return tsize;
+}
+
+/*
+ * aws_download_s3 - fetch local->remotepath from the configured bucket.
+ *
+ * Builds and signs a GET request, then lets curl stream the body through
+ * aws_write_callback(), which winds the writes.
+ *
+ * frame:  call frame; frame->local supplies remotepath and is passed on to
+ *         the write callback.
+ * config: the aws_private_t created by aws_init().
+ *
+ * Returns 0 when the transfer completed with HTTP status 200, -1 otherwise.
+ */
+int
+aws_download_s3(call_frame_t *frame, void *config)
+{
+    char *buf = NULL;
+    int bufsize = -1;
+    CURL *handle = NULL;
+    struct curl_slist *slist = NULL;
+    struct curl_slist *tmp = NULL;
+    xlator_t *this = NULL;
+    int ret = 0;
+    /* CURLOPT_VERBOSE takes a long */
+    long debug = 1;
+    CURLcode res;
+    /* Zeroed so that strlen() below is defined even if curl wrote nothing. */
+    char errbuf[CURL_ERROR_SIZE] = {
+        0,
+    };
+    size_t len = 0;
+    long responsecode = 0;
+    char *sign_req = NULL;
+    char *date = NULL;
+    char *const reqtype = "GET";
+    char *signature = NULL;
+    cs_local_t *local = NULL;
+    char resource[RESOURCE_SIZE] = {
+        0,
+    };
+    aws_private_t *priv = NULL;
+
+    this = frame->this;
+
+    local = frame->local;
+
+    priv = (aws_private_t *)config;
+
+    if (!priv || !priv->bucketid || !priv->hostname || !priv->awssekey ||
+        !priv->awskeyid) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, 0,
+               "incomplete s3 configuration, "
+               "aborting download");
+        ret = -1;
+        goto out;
+    }
+
+    sign_req = aws_form_request(resource, &date, reqtype, priv->bucketid,
+                                local->remotepath);
+    if (!sign_req) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, 0,
+               "null sign_req, "
+               "aborting download");
+        ret = -1;
+        goto out;
+    }
+
+    gf_msg_debug("CS", 0, "sign_req %s date %s", sign_req, date);
+
+    signature = aws_sign_request(sign_req, priv->awssekey);
+    if (!signature) {
+        gf_msg("CS", GF_LOG_ERROR, 0, 0,
+               "null signature, "
+               "aborting download");
+        ret = -1;
+        goto out;
+    }
+
+    handle = curl_easy_init();
+    if (!handle) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, 0, "curl_easy_init failed");
+        ret = -1;
+        goto out;
+    }
+
+    /* special numbers 6, 20, 10 accounts for static characters in the
+     * below snprintf string format arguments. The one buffer is reused for
+     * both headers and for the URL, so it must also have room for the
+     * resource path, which only the URL contains. */
+    bufsize = strlen(date) + 6 + strlen(priv->awskeyid) + strlen(signature) +
+              20 + strlen(priv->hostname) + 10 + strlen(resource);
+
+    buf = GF_MALLOC(bufsize, gf_common_mt_char);
+    if (!buf) {
+        gf_msg("CS", GF_LOG_ERROR, ENOMEM, 0,
+               "mem allocation "
+               "failed for buf");
+        ret = -1;
+        goto out;
+    }
+
+    /* curl_slist_append() returns NULL on failure and leaves the existing
+     * list untouched, so only replace slist once the append succeeded. */
+    snprintf(buf, bufsize, "Date: %s", date);
+    tmp = curl_slist_append(slist, buf);
+    if (!tmp) {
+        gf_msg(this->name, GF_LOG_ERROR, ENOMEM, 0,
+               "failed to add Date header");
+        ret = -1;
+        goto out;
+    }
+    slist = tmp;
+
+    snprintf(buf, bufsize, "Authorization: AWS %s:%s", priv->awskeyid,
+             signature);
+    tmp = curl_slist_append(slist, buf);
+    if (!tmp) {
+        gf_msg(this->name, GF_LOG_ERROR, ENOMEM, 0,
+               "failed to add Authorization header");
+        ret = -1;
+        goto out;
+    }
+    slist = tmp;
+
+    snprintf(buf, bufsize, "https://%s/%s", priv->hostname, resource);
+
+    if (gf_log_get_loglevel() >= GF_LOG_DEBUG) {
+        tmp = slist;
+        while (tmp) {
+            gf_msg_debug(this->name, 0, "slist for curl - %s", tmp->data);
+            tmp = tmp->next;
+        }
+    }
+
+    curl_easy_setopt(handle, CURLOPT_HTTPHEADER, slist);
+    curl_easy_setopt(handle, CURLOPT_URL, buf);
+    curl_easy_setopt(handle, CURLOPT_WRITEFUNCTION, aws_write_callback);
+    curl_easy_setopt(handle, CURLOPT_WRITEDATA, frame);
+    curl_easy_setopt(handle, CURLOPT_VERBOSE, debug);
+    curl_easy_setopt(handle, CURLOPT_ERRORBUFFER, errbuf);
+
+    res = curl_easy_perform(handle);
+    if (res != CURLE_OK) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, 0, "download failed. err: %s\n",
+               curl_easy_strerror(res));
+        ret = -1;
+        len = strlen(errbuf);
+        if (len) {
+            gf_msg(this->name, GF_LOG_ERROR, 0, 0, "curl failure %s", errbuf);
+        } else {
+            gf_msg(this->name, GF_LOG_ERROR, 0, 0,
+                   "curl error "
+                   "%s\n",
+                   curl_easy_strerror(res));
+        }
+    }
+
+    if (res == CURLE_OK) {
+        curl_easy_getinfo(handle, CURLINFO_RESPONSE_CODE, &responsecode);
+        gf_msg_debug(this->name, 0, "response code %ld", responsecode);
+        if (responsecode != 200) {
+            ret = -1;
+            gf_msg(this->name, GF_LOG_ERROR, 0, 0, "curl download failed");
+        }
+    }
+
+out:
+    /* Single exit point: every resource is released on every path. */
+    if (slist)
+        curl_slist_free_all(slist);
+    if (handle)
+        curl_easy_cleanup(handle);
+    if (buf)
+        GF_FREE(buf);
+    if (sign_req)
+        GF_FREE(sign_req);
+    if (date)
+        GF_FREE(date);
+    if (signature)
+        GF_FREE(signature);
+
+    return ret;
+}
+
+/* Volume options understood by the S3 plugin. aws_init() and
+ * aws_reconfigure() read these four keys; the {NULL} key terminates the
+ * table. */
+struct volume_options cs_options[] = {
+    {.key = {"s3plugin-seckey"},
+     .type = GF_OPTION_TYPE_STR,
+     .description = "aws secret key"},
+    {.key = {"s3plugin-keyid"},
+     .type = GF_OPTION_TYPE_STR,
+     .description = "aws key ID"
+
+    },
+    {.key = {"s3plugin-bucketid"},
+     .type = GF_OPTION_TYPE_STR,
+     .description = "aws bucketid"},
+    {.key = {"s3plugin-hostname"},
+     .type = GF_OPTION_TYPE_STR,
+     .description = "aws hostname e.g. s3.amazonaws.com"},
+    {.key = {NULL}},
+};
diff --git a/xlators/features/cloudsync/src/cloudsync-plugins/src/cloudsyncs3/src/libcloudsyncs3.h b/xlators/features/cloudsync/src/cloudsync-plugins/src/cloudsyncs3/src/libcloudsyncs3.h
new file mode 100644
index 00000000000..85ae669486b
--- /dev/null
+++ b/xlators/features/cloudsync/src/cloudsync-plugins/src/cloudsyncs3/src/libcloudsyncs3.h
@@ -0,0 +1,50 @@
+/*
+  Copyright (c) 2018 Red Hat, Inc. <http://www.redhat.com>
+  This file is part of GlusterFS.
+
+  This file is licensed to you under your choice of the GNU Lesser
+  General Public License, version 3 or any later version (LGPLv3 or
+  later), or the GNU General Public License, version 2 (GPLv2), in all
+  cases as published by the Free Software Foundation.
+*/
+#ifndef _LIBAWS_H
+#define _LIBAWS_H
+
+#include <glusterfs/glusterfs.h>
+#include <glusterfs/call-stub.h>
+#include <glusterfs/xlator.h>
+#include <glusterfs/syncop.h>
+#include <curl/curl.h>
+#include "cloudsync-common.h"
+#include "libcloudsyncs3-mem-types.h"
+
+char *
+aws_b64_encode(const unsigned char *input, int length);
+
+size_t
+aws_write_callback(void *dlbuf, size_t size, size_t nitems, void *mainframe);
+
+int
+aws_download_s3(call_frame_t *frame, void *config);
+
+int
+aws_dlwritev_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int op_ret,
+                 int op_errno, struct iatt *prebuf, struct iatt *postbuf,
+                 dict_t *xdata);
+
+void *
+aws_init(xlator_t *this);
+
+int
+aws_reconfigure(xlator_t *this, dict_t *options);
+
+char *
+aws_form_request(char *resource, char **date, char *reqtype, char *bucketid,
+                 char *filepath);
+char *
+aws_sign_request(char *const str, char *awssekey);
+
+void
+aws_fini(void *config);
+
+#endif
diff --git a/xlators/features/cloudsync/src/cloudsync-plugins/src/cloudsyncs3/src/libcloudsyncs3.sym b/xlators/features/cloudsync/src/cloudsync-plugins/src/cloudsyncs3/src/libcloudsyncs3.sym
new file mode 100644
index 00000000000..0bc273670d5
--- /dev/null
+++ b/xlators/features/cloudsync/src/cloudsync-plugins/src/cloudsyncs3/src/libcloudsyncs3.sym
@@ -0,0 +1 @@
+store_ops
diff --git a/xlators/features/cloudsync/src/cloudsync-plugins/src/cvlt/Makefile.am b/xlators/features/cloudsync/src/cloudsync-plugins/src/cvlt/Makefile.am
new file mode 100644
index 00000000000..a985f42a877
--- /dev/null
+++ b/xlators/features/cloudsync/src/cloudsync-plugins/src/cvlt/Makefile.am
@@ -0,0 +1,3 @@
+SUBDIRS = src
+
+CLEANFILES =
diff --git a/xlators/features/cloudsync/src/cloudsync-plugins/src/cvlt/src/Makefile.am b/xlators/features/cloudsync/src/cloudsync-plugins/src/cvlt/src/Makefile.am
new file mode 100644
index 00000000000..b512464f157
--- /dev/null
+++ b/xlators/features/cloudsync/src/cloudsync-plugins/src/cvlt/src/Makefile.am
@@ -0,0 +1,12 @@
+csp_LTLIBRARIES = cloudsynccvlt.la
+cspdir = $(libdir)/glusterfs/$(PACKAGE_VERSION)/cloudsync-plugins
+
+cloudsynccvlt_la_SOURCES = libcvlt.c  $(top_srcdir)/xlators/features/cloudsync/src/cloudsync-common.c
+cloudsynccvlt_la_LIBADD = $(top_builddir)/libglusterfs/src/libglusterfs.la
+cloudsynccvlt_la_LDFLAGS = -module -avoid-version -export-symbols $(top_srcdir)/xlators/features/cloudsync/src/cloudsync-plugins/src/cvlt/src/libcloudsynccvlt.sym
+AM_CPPFLAGS = $(GF_CPPFLAGS) -I$(top_srcdir)/libglusterfs/src   -I$(top_srcdir)/rpc/xdr/src -I$(top_builddir)/rpc/xdr/src
+noinst_HEADERS = archivestore.h libcvlt.h libcvlt-mem-types.h cvlt-messages.h
+AM_CFLAGS = -Wall -fno-strict-aliasing $(GF_CFLAGS) -I$(top_srcdir)/xlators/features/cloudsync/src
+CLEANFILES =
+
+EXTRA_DIST = libcloudsynccvlt.sym
diff --git a/xlators/features/cloudsync/src/cloudsync-plugins/src/cvlt/src/archivestore.h b/xlators/features/cloudsync/src/cloudsync-plugins/src/cvlt/src/archivestore.h
new file mode 100644
index 00000000000..7230ef77337
--- /dev/null
+++ b/xlators/features/cloudsync/src/cloudsync-plugins/src/cvlt/src/archivestore.h
@@ -0,0 +1,203 @@
+/*
+  Copyright (c) 2018 Commvault Systems, Inc. <http://www.commvault.com>
+  This file is part of GlusterFS.
+
+  This file is licensed to you under your choice of the GNU Lesser
+  General Public License, version 3 or any later version (LGPLv3 or
+  later), or the GNU General Public License, version 2 (GPLv2), in all
+  cases as published by the Free Software Foundation.
+*/
+
+#ifndef __ARCHIVESTORE_H__
+#define __ARCHIVESTORE_H__
+
+#include <stdlib.h>
+#include <stddef.h>
+#include <stdint.h>
+#include <dlfcn.h>
+#include <uuid/uuid.h>
+
+#define CS_XATTR_ARCHIVE_UUID "trusted.cloudsync.uuid"
+#define CS_XATTR_PRODUCT_ID "trusted.cloudsync.product-id"
+#define CS_XATTR_STORE_ID "trusted.cloudsync.store-id"
+
+struct _archstore_methods;
+typedef struct _archstore_methods archstore_methods_t;
+
+struct _archstore_desc {
+    void *priv; /* Private field for store mgmt.   */
+                /* To be used only by archive store*/
+};
+typedef struct _archstore_desc archstore_desc_t;
+
+struct _archstore_info {
+    char *id;         /* Identifier for the archivestore */
+    uint32_t idlen;   /* Length of identifier string     */
+    char *prod;       /* Name of the data mgmt. product  */
+    uint32_t prodlen; /* Length of the product string    */
+};
+typedef struct _archstore_info archstore_info_t;
+
+struct _archstore_fileinfo {
+    uuid_t uuid;         /* uuid of the file                */
+    char *path;          /* file path                       */
+    uint32_t pathlength; /* length of file path             */
+};
+typedef struct _archstore_fileinfo archstore_fileinfo_t;
+
+/* Source and destination store/file descriptions handed to an
+ * app_callback_t callback. */
+struct _app_callback_info {
+    archstore_info_t *src_archstore;
+    archstore_fileinfo_t *src_archfile;
+    archstore_info_t *dest_archstore;
+    archstore_fileinfo_t *dest_archfile;
+};
+typedef struct _app_callback_info app_callback_info_t;
+
+typedef void (*app_callback_t)(archstore_desc_t *, app_callback_info_t *,
+                               void *, int64_t, int32_t);
+
+enum _archstore_scan_type { FULL = 1, INCREMENTAL = 2 };
+typedef enum _archstore_scan_type archstore_scan_type_t;
+
+typedef int32_t archstore_errno_t;
+
+/*
+ * Initialize archive store.
+ * arg1  pointer to structure containing archive store information
+ * arg2  error number if any generated during the initialization
+ * arg3  name of the log file
+ */
+typedef int32_t (*init_archstore_t)(archstore_desc_t *, archstore_errno_t *,
+                                    const char *);
+
+/*
+ * Clean up archive store.
+ * arg1  pointer to structure containing archive store information
+ * arg2  error number if any generated during the cleanup
+ */
+typedef int32_t (*term_archstore_t)(archstore_desc_t *, archstore_errno_t *);
+
+/*
+ * Read the contents of the file from archive store
+ * arg1  pointer to structure containing archive store description
+ * arg2  pointer to structure containing archive store information
+ * arg3  pointer to structure containing information about file to be read
+ * arg4  offset in the file from which data should be read
+ * arg5  buffer where the data should be read
+ * arg6  number of bytes of data to be read
+ * arg7  error number if any generated during the read from file
+ * arg8  callback handler to be invoked after the data is read
+ * arg9  cookie to be passed when callback is invoked
+ */
+typedef int32_t (*read_archstore_t)(archstore_desc_t *, archstore_info_t *,
+                                    archstore_fileinfo_t *, off_t, char *,
+                                    size_t, archstore_errno_t *, app_callback_t,
+                                    void *);
+
+/*
+ * Restore the contents of the file from archive store
+ * This is basically in-place restore
+ * arg1  pointer to structure containing archive store description
+ * arg2  pointer to structure containing archive store information
+ * arg3  pointer to structure containing information about file to be restored
+ * arg4  error number if any generated during the file restore
+ * arg5  callback to be invoked after the file is restored
+ * arg6  cookie to be passed when callback is invoked
+ */
+typedef int32_t (*recall_archstore_t)(archstore_desc_t *, archstore_info_t *,
+                                      archstore_fileinfo_t *,
+                                      archstore_errno_t *, app_callback_t,
+                                      void *);
+
+/*
+ * Restore the contents of the file from archive store to a different store
+ * This is basically out-of-place restore
+ * arg1  pointer to structure containing archive store description
+ * arg2  pointer to structure containing source archive store information
+ * arg3  pointer to structure containing information about file to be restored
+ * arg4  pointer to structure containing destination archive store information
+ * arg5  pointer to structure containing information about the location to
+ *       which the file will be restored
+ * arg6  error number if any generated during the file restore
+ * arg7  callback to be invoked after the file is restored
+ * arg8  cookie to be passed when callback is invoked
+ */
+typedef int32_t (*restore_archstore_t)(archstore_desc_t *, archstore_info_t *,
+                                       archstore_fileinfo_t *,
+                                       archstore_info_t *,
+                                       archstore_fileinfo_t *,
+                                       archstore_errno_t *, app_callback_t,
+                                       void *);
+
+/*
+ * Archive the contents of the file to archive store
+ * arg1  pointer to structure containing archive store description
+ * arg2  pointer to structure containing source archive store information
+ * arg3  pointer to structure containing information about files to be archived
+ * arg4  pointer to structure containing destination archive store information
+ * arg5  pointer to structure containing information about files that failed
+ *       to be archived
+ * arg6  error number if any generated during the file archival
+ * arg7  callback to be invoked after the file is archived
+ * arg8  cookie to be passed when callback is invoked
+ */
+typedef int32_t (*archive_archstore_t)(archstore_desc_t *, archstore_info_t *,
+                                       archstore_fileinfo_t *,
+                                       archstore_info_t *,
+                                       archstore_fileinfo_t *,
+                                       archstore_errno_t *, app_callback_t,
+                                       void *);
+
+/*
+ * Backup list of files provided in the input file
+ * arg1  pointer to structure containing archive store description
+ * arg2  pointer to structure containing source archive store information
+ * arg3  pointer to structure containing information about files to be backed up
+ * arg4  pointer to structure containing destination archive store information
+ * arg5  pointer to structure containing information about files that failed
+ *       to be backed up
+ * arg6  error number if any generated during the file backup
+ * arg7  callback to be invoked after the file is backed up
+ * arg8  cookie to be passed when callback is invoked
+ */
+typedef int32_t (*backup_archstore_t)(archstore_desc_t *, archstore_info_t *,
+                                      archstore_fileinfo_t *,
+                                      archstore_info_t *,
+                                      archstore_fileinfo_t *,
+                                      archstore_errno_t *, app_callback_t,
+                                      void *);
+
+/*
+ * Scan the contents of a store and determine the files which need to be
+ * backed up.
+ * arg1  pointer to structure containing archive store description
+ * arg2  pointer to structure containing archive store information
+ * arg3  type of scan whether full or incremental
+ * arg4  path to file that contains list of files to be backed up
+ * arg5  error number if any generated during scan operation
+ */
+typedef int32_t (*scan_archstore_t)(archstore_desc_t *, archstore_info_t *,
+                                    archstore_scan_type_t, char *,
+                                    archstore_errno_t *);
+
+/* Table of archive store entry points, one function pointer per operation
+ * type declared above. Filled in through get_archstore_methods(). */
+struct _archstore_methods {
+    init_archstore_t init;
+    term_archstore_t fini;
+    backup_archstore_t backup;
+    archive_archstore_t archive;
+    scan_archstore_t scan;
+    restore_archstore_t restore;
+    recall_archstore_t recall;
+    read_archstore_t read;
+};
+
+typedef int (*get_archstore_methods_t)(archstore_methods_t *);
+
+/*
+ * Single function that will be invoked by applications for extracting
+ * the function pointers to all data management functions.
+ */
+int32_t
+get_archstore_methods(archstore_methods_t *);
+
+#endif /* End of __ARCHIVESTORE_H__ */
diff --git a/xlators/features/cloudsync/src/cloudsync-plugins/src/cvlt/src/cvlt-messages.h b/xlators/features/cloudsync/src/cloudsync-plugins/src/cvlt/src/cvlt-messages.h
new file mode 100644
index 00000000000..57c9aa77da0
--- /dev/null
+++ b/xlators/features/cloudsync/src/cloudsync-plugins/src/cvlt/src/cvlt-messages.h
@@ -0,0 +1,30 @@
+/*
+ Copyright (c) 2015 Red Hat, Inc. <http://www.redhat.com>
+ This file is part of GlusterFS.
+
+ This file is licensed to you under your choice of the GNU Lesser
+ General Public License, version 3 or any later version (LGPLv3 or
+ later), or the GNU General Public License, version 2 (GPLv2), in all
+ cases as published by the Free Software Foundation.
+ */
+
+#ifndef _CVLT_MESSAGES_H_
+#define _CVLT_MESSAGES_H_
+
+#include <glusterfs/glfs-message-id.h>
+
+/* To add new message IDs, append new identifiers at the end of the list.
+ *
+ * Never remove a message ID. If it's not used anymore, you can rename it or
+ * leave it as it is, but not delete it. This is to prevent reutilization of
+ * IDs by other messages.
+ *
+ * The component name must match one of the entries defined in
+ * glfs-message-id.h.
+ */
+/* Message IDs logged by the Commvault (cvlt) cloudsync plugin. */
+GLFS_MSGID(CVLT, CVLT_EXTRACTION_FAILED, CVLT_FREE,
+           CVLT_RESOURCE_ALLOCATION_FAILED, CVLT_RESTORE_FAILED,
+           CVLT_READ_FAILED, CVLT_NO_MEMORY, CVLT_DLOPEN_FAILED);
+
+#endif /* !_CVLT_MESSAGES_H_ */
diff --git a/xlators/features/cloudsync/src/cloudsync-plugins/src/cvlt/src/libcloudsynccvlt.sym b/xlators/features/cloudsync/src/cloudsync-plugins/src/cvlt/src/libcloudsynccvlt.sym
new file mode 100644
index 00000000000..0bc273670d5
--- /dev/null
+++ b/xlators/features/cloudsync/src/cloudsync-plugins/src/cvlt/src/libcloudsynccvlt.sym
@@ -0,0 +1 @@
+store_ops
diff --git a/xlators/features/cloudsync/src/cloudsync-plugins/src/cvlt/src/libcvlt-mem-types.h b/xlators/features/cloudsync/src/cloudsync-plugins/src/cvlt/src/libcvlt-mem-types.h
new file mode 100644
index 00000000000..c24fab8bfe7
--- /dev/null
+++ b/xlators/features/cloudsync/src/cloudsync-plugins/src/cvlt/src/libcvlt-mem-types.h
@@ -0,0 +1,19 @@
+/*
+ *   Copyright (c) 2018 Commvault Systems, Inc. <http://www.commvault.com>
+ *   This file is part of GlusterFS.
+ *
+ *   This file is licensed to you under your choice of the GNU Lesser
+ *   General Public License, version 3 or any later version (LGPLv3 or
+ *   later), or the GNU General Public License, version 2 (GPLv2), in all
+ *   cases as published by the Free Software Foundation.
+ */
+
+#ifndef __LIBCVLT_MEM_TYPES_H__
+#define __LIBCVLT_MEM_TYPES_H__
+
+#include <glusterfs/mem-types.h>
+enum libcvlt_mem_types_ {
+    gf_libcvlt_mt_cvlt_private_t = gf_common_mt_end + 1, /* archive_t */
+    gf_libcvlt_mt_end /* end marker; mem_acct_init() registers up to here */
+};
+#endif /* __LIBCVLT_MEM_TYPES_H__ */
diff --git a/xlators/features/cloudsync/src/cloudsync-plugins/src/cvlt/src/libcvlt.c b/xlators/features/cloudsync/src/cloudsync-plugins/src/cvlt/src/libcvlt.c
new file mode 100644
index 00000000000..5b7272bb448
--- /dev/null
+++ b/xlators/features/cloudsync/src/cloudsync-plugins/src/cvlt/src/libcvlt.c
@@ -0,0 +1,842 @@
+#include <stdlib.h>
+#include <glusterfs/xlator.h>
+#include <glusterfs/glusterfs.h>
+#include "libcvlt.h"
+#include "cloudsync-common.h"
+#include "cvlt-messages.h"
+
+#define LIBARCHIVE_SO "libopenarchive.so"
+#define ALIGN_SIZE 4096
+#define CVLT_TRAILER "cvltv1"
+/* Plugin method table; "store_ops" is listed in libcloudsynccvlt.sym. */
+store_methods_t store_ops = {
+    .fop_download = cvlt_download,       /* restore a file into the volume */
+    .fop_init = cvlt_init,               /* build the archive_t config */
+    .fop_reconfigure = cvlt_reconfigure, /* re-read store/product ids */
+    .fop_fini = cvlt_fini,               /* tear the config down */
+    .fop_remote_read = cvlt_read,        /* read directly from the store */
+};
+
+static const int32_t num_req = 32;  /* count given to mem_pool_new() */
+static const int32_t num_iatt = 32; /* passed along at init; unused there */
+static char *plugin = "cvlt_cloudSync"; /* name for logs and fops.init() */
+
+int32_t
+mem_acct_init(xlator_t *this)
+{
+    int status = -1;
+
+    /* Nothing to register against when no xlator was supplied. */
+    if (this != NULL) {
+        /*
+         * Register every libcvlt memory type; the accounting result,
+         * zero or not, is handed back to the caller unchanged.
+         */
+        status = xlator_mem_acct_init(this, gf_libcvlt_mt_end + 1);
+    }
+
+    return status;
+}
+
+/*
+ * Release the dlopen() handle, the iobuf pool and the request mem-pool.
+ * Every pointer is reset to NULL once released so that calling this again
+ * on the same archive (e.g. from a later teardown after a failed init)
+ * cannot dlclose() or destroy anything twice.
+ */
+static void
+cvlt_free_resources(archive_t *arch)
+{
+    if (arch->handle) {
+        dlclose(arch->handle);
+        arch->handle = NULL;
+    }
+
+    if (arch->iobuf_pool) {
+        iobuf_pool_destroy(arch->iobuf_pool);
+        arch->iobuf_pool = NULL;
+    }
+
+    if (arch->req_pool) {
+        mem_pool_destroy(arch->req_pool);
+        arch->req_pool = NULL;
+    }
+}
+
+/*
+ * Load libopenarchive.so and fetch its data management methods into
+ * arch->fops. Returns 0, or non-zero with the library unloaded again.
+ */
+static int32_t
+cvlt_extract_store_fops(xlator_t *this, archive_t *arch)
+{
+    int32_t op_ret = -1;
+    const char *reason = NULL; /* dlerror() text explaining a failure */
+    get_archstore_methods_t get_methods = NULL;
+
+    VALIDATE_OR_GOTO(arch, err);
+
+    arch->handle = dlopen(LIBARCHIVE_SO, RTLD_NOW);
+    if (!arch->handle) {
+        reason = dlerror();
+        gf_msg(plugin, GF_LOG_ERROR, 0, CVLT_DLOPEN_FAILED,
+               " failed to open %s : %s", LIBARCHIVE_SO,
+               reason ? reason : "unknown error");
+        goto err;
+    }
+
+    dlerror(); /* Clear any existing error */
+
+    get_methods = dlsym(arch->handle, "get_archstore_methods");
+    if (!get_methods) {
+        reason = dlerror();
+        gf_msg(plugin, GF_LOG_ERROR, 0, CVLT_EXTRACTION_FAILED,
+               " Error extracting get_archstore_methods() : %s",
+               reason ? reason : "symbol is NULL");
+        goto unload;
+    }
+
+    op_ret = get_methods(&(arch->fops));
+    if (!op_ret) {
+        return 0;
+    }
+    gf_msg(plugin, GF_LOG_ERROR, 0, CVLT_EXTRACTION_FAILED,
+           " Failed to extract methods in get_archstore_methods");
+unload:
+    dlclose(arch->handle);
+    arch->handle = NULL;
+err:
+    return op_ret;
+}
+
+/*
+ * Set up the per-archive resources: the request mem-pool, the iobuf pool
+ * and the store methods loaded from the archive library. Nothing is
+ * released here on failure; that is left to the caller.
+ *
+ * Returns 0 when every resource is ready and -1 otherwise. num_iatt is
+ * accepted but not used.
+ */
+static int32_t
+cvlt_alloc_resources(xlator_t *this, archive_t *arch, int num_req, int num_iatt)
+{
+    /* Start from a known state before anything is allocated. */
+    arch->xl = this;
+    arch->nreqs = 0;
+    arch->handle = NULL;
+    arch->req_pool = NULL;
+
+    arch->req_pool = mem_pool_new(cvlt_request_t, num_req);
+    if (arch->req_pool == NULL) {
+        return -1;
+    }
+
+    arch->iobuf_pool = iobuf_pool_new();
+    if (arch->iobuf_pool == NULL) {
+        return -1;
+    }
+
+    /* A non-zero result means the store methods could not be loaded. */
+    if (cvlt_extract_store_fops(this, arch) != 0) {
+        return -1;
+    }
+
+    return 0;
+}
+
+/* Prepare a request: its semaphore starts at 0 and is not process-shared. */
+static void
+cvlt_req_init(cvlt_request_t *req)
+{
+    sem_t *done = &req->sem;
+    sem_init(done, 0, 0);
+}
+
+/* Drop the request's iobuf/iobref references and destroy its semaphore. */
+static void
+cvlt_req_destroy(cvlt_request_t *req)
+{
+    struct iobuf *buf = req->iobuf;
+    struct iobref *ref = req->iobref;
+
+    if (buf != NULL) {
+        iobuf_unref(buf);
+    }
+    if (ref != NULL) {
+        iobref_unref(ref);
+    }
+    sem_destroy(&req->sem);
+}
+
+/*
+ * Take a zeroed request from the archive's pool, initialise it and count
+ * it in arch->nreqs. Returns NULL if arch, its pool or a request is missing.
+ */
+static cvlt_request_t *
+cvlt_alloc_req(archive_t *arch)
+{
+    cvlt_request_t *fresh = NULL;
+
+    if (arch == NULL || arch->req_pool == NULL) {
+        return NULL;
+    }
+
+    fresh = mem_get0(arch->req_pool);
+    if (fresh == NULL) {
+        return NULL;
+    }
+    cvlt_req_init(fresh);
+
+    LOCK(&(arch->lock));
+    arch->nreqs++;
+    UNLOCK(&(arch->lock));
+
+    return fresh;
+}
+
+/*
+ * Give a request back to the archive's pool.
+ *
+ * The request's buffers and semaphore are released, the request is
+ * returned with mem_put() and arch->nreqs is decremented under the
+ * archive lock. When the archive has no request pool nothing is
+ * released, yet 0 is still returned.
+ *
+ * Returns -1 if either argument is NULL, 0 otherwise.
+ */
+static int32_t
+cvlt_free_req(archive_t *arch, cvlt_request_t *reqptr)
+{
+    if (reqptr == NULL || arch == NULL) {
+        return -1;
+    }
+
+    if (arch->req_pool == NULL) {
+        return 0;
+    }
+
+    cvlt_req_destroy(reqptr);
+    mem_put(reqptr);
+
+    LOCK(&(arch->lock));
+    arch->nreqs--;
+    UNLOCK(&(arch->lock));
+    return 0;
+}
+
+static int32_t
+cvlt_init_xlator(xlator_t *this, archive_t *arch, int num_req, int num_iatt)
+{
+    int32_t ret = -1;
+    int32_t errnum = -1; /* receives the error code from the store init */
+    int32_t locked = 0;  /* 1 while arch->lock is held by this function */
+
+    /*
+     * Bring up the xlator: lock init, resource allocation, store init.
+     */
+    if (!arch) {
+        goto err;
+    }
+
+    LOCK_INIT(&(arch->lock));
+    LOCK(&(arch->lock));
+    /* the lock stays held until initialisation finishes or fails */
+    locked = 1;
+
+    ret = cvlt_alloc_resources(this, arch, num_req, num_iatt);
+
+    if (ret) {
+        goto err;
+    }
+
+    /*
+     * Now that the fops have been extracted initialize the store
+     */
+    ret = arch->fops.init(&(arch->descinfo), &errnum, plugin);
+    if (ret) {
+        goto err;
+    }
+
+    UNLOCK(&(arch->lock));
+    locked = 0;
+    ret = 0;
+
+    return ret; /* 0: the xlator is up */
+
+err:
+    if (arch) {
+        cvlt_free_resources(arch);
+        /* resources are released before the lock is dropped */
+        if (locked) {
+            UNLOCK(&(arch->lock));
+        }
+    }
+
+    return ret; /* -1, or the non-zero result of the failing step */
+}
+
+/*
+ * Shut the store down, release the archive's resources and free the
+ * archive itself. Returns -1 when arch is NULL and 0 otherwise.
+ */
+static int32_t
+cvlt_term_xlator(archive_t *arch)
+{
+    int32_t errnum = -1;
+
+    if (!arch) {
+        return -1;
+    }
+
+    LOCK(&(arch->lock));
+
+    /* fini is code inside the library: skip it if that is not loaded. */
+    if (arch->handle && arch->fops.fini) {
+        arch->fops.fini(&(arch->descinfo), &errnum);
+    }
+
+    cvlt_free_resources(arch);
+
+    UNLOCK(&(arch->lock));
+    GF_FREE(arch);
+
+    return 0;
+}
+
+/* Fill store_info with the product/store ids; -1 if any input is missing. */
+static int32_t
+cvlt_init_store_info(archive_t *priv, archstore_info_t *store_info)
+{
+    if (!priv || !store_info || !priv->product_id || !priv->store_id) {
+        return -1;
+    }
+
+    store_info->prod = priv->product_id;
+    store_info->prodlen = strlen(priv->product_id);
+    store_info->id = priv->store_id;
+    store_info->idlen = strlen(priv->store_id);
+
+    return 0;
+}
+
+/* Fill file_info with the uuid and path taken from the location xattr. */
+static int32_t
+cvlt_init_file_info(cs_loc_xattr_t *xattr, archstore_fileinfo_t *file_info)
+{
+    if (xattr && file_info) {
+        file_info->path = xattr->file_path;
+        file_info->pathlength = strlen(file_info->path);
+        gf_uuid_copy(file_info->uuid, xattr->uuid);
+        return 0;
+    }
+
+    return -1;
+}
+
+/* Describe the glusterfs volume: product "glusterfs", id = volume name. */
+static int32_t
+cvlt_init_gluster_store_info(cs_loc_xattr_t *xattr,
+                             archstore_info_t *store_info)
+{
+    static char *gluster_product = "glusterfs";
+
+    if (xattr == NULL || store_info == NULL) {
+        return -1;
+    }
+
+    store_info->id = xattr->volname;
+    store_info->idlen = strlen(store_info->id);
+    store_info->prod = gluster_product;
+    store_info->prodlen = strlen(gluster_product);
+
+    return 0;
+}
+
+/* Fill file_info with the gfid and path taken from the location xattr. */
+static int32_t
+cvlt_init_gluster_file_info(cs_loc_xattr_t *xattr,
+                            archstore_fileinfo_t *file_info)
+{
+    if (xattr && file_info) {
+        file_info->path = xattr->file_path;
+        file_info->pathlength = strlen(file_info->path);
+        gf_uuid_copy(file_info->uuid, xattr->gfid);
+        return 0;
+    }
+
+    return -1;
+}
+
+/*
+ * An archived file reports a wrong size, so take size, block size and
+ * block count from xattrs instead. No-op if either pointer is NULL.
+ */
+static void
+cvlt_copy_stat_info(struct iatt *buf, cs_size_xattr_t *xattrs)
+{
+    if (buf == NULL || xattrs == NULL) {
+        return;
+    }
+
+    buf->ia_blocks = xattrs->blocks;
+    buf->ia_blksize = xattrs->blksize;
+    buf->ia_size = xattrs->size;
+}
+
+/*
+ * Completion callback for a read submitted by cvlt_read(): unwinds the
+ * readv frame with the data (or the error) and frees the request (cookie).
+ */
+static void
+cvlt_readv_complete(archstore_desc_t *desc, app_callback_info_t *cbkinfo,
+                    void *cookie, int64_t op_ret, int32_t op_errno)
+{
+    /* Zeroed so the error paths never unwind an indeterminate iovec. */
+    struct iovec iov = {0};
+    struct iatt postbuf = {0};
+    xlator_t *this = NULL;
+    call_frame_t *frame = NULL;
+    cvlt_request_t *req = (cvlt_request_t *)cookie;
+    cs_local_t *local = NULL;
+    cs_private_t *cspriv = NULL;
+    archive_t *priv = NULL;
+
+    frame = req->frame;
+    this = frame->this;
+    local = frame->local;
+
+    cspriv = this->private;
+    priv = (archive_t *)cspriv->stores->config;
+
+    if (strcmp(priv->trailer, CVLT_TRAILER)) {
+        op_ret = -1;
+        op_errno = EINVAL;
+        goto out;
+    }
+
+    gf_msg_debug(plugin, 0,
+                 " Read callback invoked offset:%" PRIu64 "bytes: %" PRIu64
+                 " op : %d ret : %" PRId64 " errno : %d",
+                 req->offset, req->bytes, req->op_type, op_ret, op_errno);
+
+    if (op_ret < 0) {
+        goto out;
+    }
+
+    req->iobref = iobref_new();
+    if (!req->iobref) {
+        op_ret = -1;
+        op_errno = ENOMEM;
+        goto out;
+    }
+
+    iobref_add(req->iobref, req->iobuf);
+    iov.iov_base = iobuf_ptr(req->iobuf);
+    iov.iov_len = op_ret;
+
+    cvlt_copy_stat_info(&postbuf, &(req->szxattr));
+
+    /*
+     * Hack to notify higher layers of EOF.
+     */
+    if (!postbuf.ia_size || (req->offset + iov.iov_len >= postbuf.ia_size)) {
+        gf_msg_debug(plugin, 0, " signalling end-of-file for uuid=%s",
+                     uuid_utoa(req->file_info.uuid));
+        op_errno = ENOENT;
+    }
+
+out:
+    STACK_UNWIND_STRICT(readv, frame, op_ret, op_errno, &iov, 1, &postbuf,
+                        req->iobref, local->xattr_rsp);
+
+    cvlt_free_req(priv, req);
+}
+
+/* Restore completion: record the outcome in the request, post its sem. */
+static void
+cvlt_download_complete(archstore_desc_t *store, app_callback_info_t *cbk_info,
+                       void *cookie, int64_t ret, int errcode)
+{
+    cvlt_request_t *pending = cookie;
+
+    gf_msg_debug(plugin, 0,
+                 " Download callback invoked  ret : %" PRId64 " errno : %d",
+                 ret, errcode);
+
+    pending->op_errno = errcode;
+    pending->op_ret = ret;
+
+    sem_post(&pending->sem);
+}
+
+/*
+ * Create and initialise the plugin configuration (archive_t). Returns it,
+ * or NULL on failure: a failed cvlt_init_xlator() has cleaned up after
+ * itself, a failed option lookup needs the full cvlt_term_xlator().
+ */
+void *
+cvlt_init(xlator_t *this)
+{
+    archive_t *priv = NULL;
+
+    if (!this->children || this->children->next) {
+        gf_msg(plugin, GF_LOG_ERROR, EINVAL, 0,
+               "should have exactly one child");
+        return NULL;
+    }
+
+    if (!this->parents) {
+        gf_msg(plugin, GF_LOG_ERROR, EINVAL, 0,
+               "dangling volume. check volfile");
+        return NULL;
+    }
+
+    priv = GF_CALLOC(1, sizeof(archive_t), gf_libcvlt_mt_cvlt_private_t);
+    if (!priv) {
+        return NULL;
+    }
+
+    priv->trailer = CVLT_TRAILER;
+    if (cvlt_init_xlator(this, priv, num_req, num_iatt)) {
+        gf_msg(plugin, GF_LOG_ERROR, ENOMEM, 0, "xlator init failed");
+        /* init already released its resources; only the struct is left. */
+        GF_FREE(priv);
+        return NULL;
+    }
+
+    GF_OPTION_INIT("cloudsync-store-id", priv->store_id, str, teardown);
+    GF_OPTION_INIT("cloudsync-product-id", priv->product_id, str, teardown);
+
+    gf_msg(plugin, GF_LOG_INFO, 0, 0, "store id is : %s product id is : %s.",
+           priv->store_id, priv->product_id);
+    return priv;
+
+teardown:
+    /* The store is up at this point, so it must be shut down as well. */
+    cvlt_term_xlator(priv);
+    return NULL;
+}
+
+/*
+ * Re-read the store id and product id from the new options. Returns 0 on
+ * success, -1 if the config is not a cvlt one or an option lookup fails.
+ */
+int
+cvlt_reconfigure(xlator_t *this, dict_t *options)
+{
+    cs_private_t *cspriv = this->private;
+    archive_t *cfg = (archive_t *)cspriv->stores->config;
+
+    if (strcmp(cfg->trailer, CVLT_TRAILER) != 0) {
+        return -1;
+    }
+
+    GF_OPTION_RECONF("cloudsync-store-id", cfg->store_id, options, str, fail);
+    GF_OPTION_RECONF("cloudsync-product-id", cfg->product_id, options, str,
+                     fail);
+
+    gf_msg_debug(plugin, 0, "store id is : %s product id is : %s.",
+                 cfg->store_id, cfg->product_id);
+    return 0;
+fail:
+    return -1;
+}
+
+/*
+ * Tear down a cvlt_init() configuration; others (wrong trailer) are skipped.
+ */
+void
+cvlt_fini(void *config)
+{
+    archive_t *cfg = config;
+
+    if (strcmp(cfg->trailer, CVLT_TRAILER) == 0) {
+        cvlt_term_xlator(cfg);
+        gf_msg(plugin, GF_LOG_INFO, 0, CVLT_FREE,
+               " released xlator resources");
+    }
+}
+
+/*
+ * Restore a file from the data management store into the glusterfs
+ * volume and wait until the restore has finished. config is the archive_t
+ * made by cvlt_init(); the file's location xattr comes from frame->local.
+ * Returns 0 on success and -1 on failure.
+ */
+int
+cvlt_download(call_frame_t *frame, void *config)
+{
+    archive_t *parch = NULL;
+    cs_local_t *local = frame->local;
+    cs_loc_xattr_t *locxattr = local->xattrinfo.lxattr;
+    cvlt_request_t *req = NULL;
+    archstore_info_t dest_storeinfo;
+    archstore_fileinfo_t dest_fileinfo;
+    int32_t op_ret = -1;
+    int32_t op_errno = 0;
+
+    parch = (archive_t *)config;
+
+    if (strcmp(parch->trailer, CVLT_TRAILER)) {
+        op_ret = -1;
+        op_errno = EINVAL;
+        goto err;
+    }
+
+    /* Both ids are binary uuids, not strings. One uuid_utoa() per log call:
+     * presumably it formats into a shared buffer - TODO confirm. */
+    gf_msg_debug(plugin, 0, " download invoked for uuid = %s ",
+                 uuid_utoa(locxattr->uuid));
+    gf_msg_debug(plugin, 0, " download invoked for gfid=%s ",
+                 uuid_utoa(locxattr->gfid));
+
+    if (!(parch->fops.restore)) {
+        op_errno = ELIBBAD;
+        goto err;
+    }
+
+    /* Download needs to be processed. Allocate a request. */
+    req = cvlt_alloc_req(parch);
+
+    if (!req) {
+        gf_msg(plugin, GF_LOG_ERROR, ENOMEM, CVLT_RESOURCE_ALLOCATION_FAILED,
+               " failed to allocated request for gfid=%s",
+               uuid_utoa(locxattr->gfid));
+        op_errno = ENOMEM;
+        goto err;
+    }
+
+    /* Initialize the request object. */
+    req->op_type = CVLT_RESTORE_OP;
+    req->frame = frame;
+
+    /*
+     * The file is currently residing inside a data management store.
+     * To restore the file contents we need to provide the information
+     * about data management store.
+     */
+    op_ret = cvlt_init_store_info(parch, &(req->store_info));
+    if (op_ret < 0) {
+        gf_msg(plugin, GF_LOG_ERROR, 0, CVLT_EXTRACTION_FAILED,
+               " failed to extract store info for gfid=%s",
+               uuid_utoa(locxattr->gfid));
+        goto err;
+    }
+
+    op_ret = cvlt_init_file_info(locxattr, &(req->file_info));
+    if (op_ret < 0) {
+        gf_msg(plugin, GF_LOG_ERROR, 0, CVLT_EXTRACTION_FAILED,
+               " failed to extract file info for gfid=%s",
+               uuid_utoa(locxattr->gfid));
+        goto err;
+    }
+
+    /*
+     * We need to perform in-place restore of the file from data management
+     * store to glusterfs volume.
+     */
+    op_ret = cvlt_init_gluster_store_info(locxattr, &dest_storeinfo);
+    if (op_ret < 0) {
+        gf_msg(plugin, GF_LOG_ERROR, 0, CVLT_EXTRACTION_FAILED,
+               " failed to extract destination store info for gfid=%s",
+               uuid_utoa(locxattr->gfid));
+        goto err;
+    }
+
+    op_ret = cvlt_init_gluster_file_info(locxattr, &dest_fileinfo);
+    if (op_ret < 0) {
+        gf_msg(plugin, GF_LOG_ERROR, 0, CVLT_EXTRACTION_FAILED,
+               " failed to extract file info for gfid=%s",
+               uuid_utoa(locxattr->gfid));
+        goto err;
+    }
+
+    /* Submit the restore request. */
+    op_ret = parch->fops.restore(&(parch->descinfo), &(req->store_info),
+                                 &(req->file_info), &dest_storeinfo,
+                                 &dest_fileinfo, &op_errno,
+                                 cvlt_download_complete, req);
+    if (op_ret < 0) {
+        gf_msg(plugin, GF_LOG_ERROR, 0, CVLT_RESTORE_FAILED,
+               " failed to restore file gfid=%s from data management store",
+               uuid_utoa(locxattr->gfid));
+        goto err;
+    }
+
+    /* Wait for the restore to complete. */
+    sem_wait(&(req->sem));
+
+    if (req->op_ret < 0) {
+        gf_msg(plugin, GF_LOG_ERROR, 0, CVLT_RESTORE_FAILED,
+               " restored failed for gfid=%s", uuid_utoa(locxattr->gfid));
+        goto err;
+    }
+
+    cvlt_free_req(parch, req);
+
+    return 0;
+
+err:
+    if (req) {
+        cvlt_free_req(parch, req);
+    }
+
+    return -1;
+}
+
+/*
+ * Read part of a file straight from the data management store.
+ * When the read is submitted, cvlt_readv_complete() unwinds the frame
+ * later; every other outcome (EOF included) is unwound here. Returns 0.
+ */
+int
+cvlt_read(call_frame_t *frame, void *config)
+{
+    int32_t op_ret = -1;
+    int32_t op_errno = 0;
+    archive_t *parch = NULL;
+    cvlt_request_t *req = NULL;
+    struct iovec iov = {0};
+    struct iobref *iobref;
+    size_t size = 0;
+    off_t off = 0;
+
+    cs_local_t *local = frame->local;
+    cs_loc_xattr_t *locxattr = local->xattrinfo.lxattr;
+
+    size = local->xattrinfo.size;
+    off = local->xattrinfo.offset;
+
+    parch = (archive_t *)config;
+
+    if (strcmp(parch->trailer, CVLT_TRAILER)) {
+        op_ret = -1;
+        op_errno = EINVAL;
+        goto err;
+    }
+
+    gf_msg_debug(plugin, 0,
+                 " read invoked for gfid = %s offset = %" PRIu64
+                 " file_size = %" PRIu64,
+                 uuid_utoa(locxattr->gfid), off, local->stbuf.ia_size);
+
+    if (off >= local->stbuf.ia_size) {
+        /* Hack to notify higher layers of EOF. */
+        op_errno = ENOENT;
+        op_ret = 0;
+
+        gf_msg(plugin, GF_LOG_ERROR, 0, CVLT_READ_FAILED,
+               " reporting end-of-file for gfid=%s", uuid_utoa(locxattr->gfid));
+
+        goto err;
+    }
+
+    if (!size) {
+        op_errno = EINVAL;
+
+        gf_msg(plugin, GF_LOG_ERROR, 0, CVLT_READ_FAILED,
+               " zero size read attempted on gfid=%s",
+               uuid_utoa(locxattr->gfid));
+        goto err;
+    }
+
+    if (!(parch->fops.read)) {
+        op_errno = ELIBBAD;
+        goto err;
+    }
+
+    /*
+     * The read request need to be processed. Allocate a request.
+     */
+    req = cvlt_alloc_req(parch);
+
+    if (!req) {
+        gf_msg(plugin, GF_LOG_ERROR, ENOMEM, CVLT_NO_MEMORY,
+               " failed to allocated request for gfid=%s",
+               uuid_utoa(locxattr->gfid));
+        op_errno = ENOMEM;
+        goto err;
+    }
+
+    req->iobuf = iobuf_get_page_aligned(parch->iobuf_pool, size, ALIGN_SIZE);
+    if (!req->iobuf) {
+        op_errno = ENOMEM;
+        goto err;
+    }
+
+    /*
+     * Initialize the request object.
+     */
+    req->op_type = CVLT_READ_OP;
+    req->offset = off;
+    req->bytes = size;
+    req->frame = frame;
+    req->szxattr.size = local->stbuf.ia_size;
+    req->szxattr.blocks = local->stbuf.ia_blocks;
+    req->szxattr.blksize = local->stbuf.ia_blksize;
+
+    /*
+     * The file is currently residing inside a data management store.
+     * To read the file contents we need to provide the information
+     * about data management store.
+     */
+    op_ret = cvlt_init_store_info(parch, &(req->store_info));
+    if (op_ret < 0) {
+        gf_msg(plugin, GF_LOG_ERROR, 0, CVLT_EXTRACTION_FAILED,
+               " failed to extract store info for gfid=%s"
+               " offset=%" PRIu64 " size=%" GF_PRI_SIZET ",  buf=%p",
+               uuid_utoa(locxattr->gfid), off, size, req->iobuf->ptr);
+        /* never unwind a failure with errno 0 */
+        op_errno = EINVAL;
+        goto err;
+    }
+
+    op_ret = cvlt_init_file_info(locxattr, &(req->file_info));
+    if (op_ret < 0) {
+        gf_msg(plugin, GF_LOG_ERROR, 0, CVLT_EXTRACTION_FAILED,
+               " failed to extract file info for gfid=%s"
+               " offset=%" PRIu64 " size=%" GF_PRI_SIZET ",  buf=%p",
+               uuid_utoa(locxattr->gfid), off, size, req->iobuf->ptr);
+        /* never unwind a failure with errno 0 */
+        op_errno = EINVAL;
+        goto err;
+    }
+
+    /*
+     * Submit the read request.
+     */
+    op_ret = parch->fops.read(&(parch->descinfo), &(req->store_info),
+                              &(req->file_info), off, req->iobuf->ptr, size,
+                              &op_errno, cvlt_readv_complete, req);
+
+    if (op_ret < 0) {
+        gf_msg(plugin, GF_LOG_ERROR, 0, CVLT_EXTRACTION_FAILED,
+               " read failed on gfid=%s"
+               " offset=%" PRIu64 " size=%" GF_PRI_SIZET ",  buf=%p",
+               uuid_utoa(locxattr->gfid), off, size, req->iobuf->ptr);
+        goto err;
+    }
+
+    return 0;
+
+err:
+    iobref = iobref_new();
+    gf_msg_debug(plugin, 0, " read unwinding stack op_ret = %d, op_errno = %d",
+                 op_ret, op_errno);
+
+    STACK_UNWIND_STRICT(readv, frame, op_ret, op_errno, &iov, 1,
+                        &(local->stbuf), iobref, local->xattr_rsp);
+
+    if (iobref) {
+        iobref_unref(iobref);
+    }
+
+    if (req) {
+        cvlt_free_req(parch, req);
+    }
+
+    return 0;
+}
diff --git a/xlators/features/cloudsync/src/cloudsync-plugins/src/cvlt/src/libcvlt.h b/xlators/features/cloudsync/src/cloudsync-plugins/src/cvlt/src/libcvlt.h
new file mode 100644
index 00000000000..c45ac948f6c
--- /dev/null
+++ b/xlators/features/cloudsync/src/cloudsync-plugins/src/cvlt/src/libcvlt.h
@@ -0,0 +1,84 @@
+/*
+  Copyright (c) 2018 Commvault Systems, Inc. <http://www.commvault.com>
+  This file is part of GlusterFS.
+
+  This file is licensed to you under your choice of the GNU Lesser
+  General Public License, version 3 or any later version (LGPLv3 or
+  later), or the GNU General Public License, version 2 (GPLv2), in all
+  cases as published by the Free Software Foundation.
+*/
+#ifndef _LIBCVLT_H
+#define _LIBCVLT_H
+
+#include <semaphore.h>
+#include <glusterfs/xlator.h>
+#include <glusterfs/glusterfs.h>
+#include <glusterfs/call-stub.h>
+#include <glusterfs/syncop.h>
+#include <glusterfs/compat-errno.h>
+#include "cloudsync-common.h"
+#include "libcvlt-mem-types.h"
+#include "archivestore.h"
+/* Kind of operation a cvlt_request_t carries (see its op_type field). */
+enum _cvlt_op {
+    CVLT_READ_OP = 1, /* set by cvlt_read() */
+    CVLT_WRITE_OP = 2,
+    CVLT_RESTORE_OP = 3, /* set by cvlt_download() */
+    CVLT_ARCHIVE_OP = 4,
+    CVLT_LOOKUP_OP = 5,
+    CVLT_XATTR_OP = 6,
+    CVLT_STAT_OP = 7,
+    CVLT_FSTAT_op = 8, /* NOTE: lower-case "op" suffix, unlike its siblings */
+    CVLT_UNDEF_OP = 127
+};
+typedef enum _cvlt_op cvlt_op_t;
+
+struct _archive;
+struct _cvlt_request { /* one operation submitted to the store */
+    uint64_t offset;       /* read offset                           */
+    uint64_t bytes;        /* number of bytes requested             */
+    struct iobuf *iobuf;   /* buffer the store reads into           */
+    struct iobref *iobref; /* created when the read completes       */
+    call_frame_t *frame;   /* frame the operation belongs to        */
+    cvlt_op_t op_type;     /* CVLT_READ_OP or CVLT_RESTORE_OP so far */
+    int32_t op_ret;        /* result stored by the download callback */
+    int32_t op_errno;      /* errno stored by the download callback  */
+    xlator_t *this;
+    sem_t sem;             /* posted when a restore completes       */
+    archstore_info_t store_info;    /* store holding the file       */
+    archstore_fileinfo_t file_info; /* file identity inside the store */
+    cs_size_xattr_t szxattr; /* size, blocks, blksize from local->stbuf */
+};
+typedef struct _cvlt_request cvlt_request_t;
+
+struct _archive {
+    gf_lock_t lock;                /* lock for controlling access   */
+    xlator_t *xl;                  /* xlator                        */
+    void *handle;                  /* handle returned from dlopen   */
+    int32_t nreqs;                 /* num requests active           */
+    struct mem_pool *req_pool;     /* pool for requests             */
+    struct iobuf_pool *iobuf_pool; /* iobuff pool                   */
+    archstore_desc_t descinfo;     /* Archive store descriptor info */
+    archstore_methods_t fops;      /* function pointers             */
+    char *product_id;              /* "cloudsync-product-id" option */
+    char *store_id;                /* "cloudsync-store-id" option   */
+    char *trailer;                 /* CVLT_TRAILER marks a cvlt config */
+};
+typedef struct _archive archive_t;
+/* Create the plugin config (archive_t) used by the entry points below. */
+void *
+cvlt_init(xlator_t *);
+/* Re-read the store id and product id options; 0 on success, else -1. */
+int
+cvlt_reconfigure(xlator_t *, dict_t *);
+/* Tear down a config created by cvlt_init(). */
+void
+cvlt_fini(void *);
+/* Restore a file from the store and wait for completion; 0 or -1. */
+int
+cvlt_download(call_frame_t *, void *);
+/* Read file data from the store; the readv frame gets unwound. */
+int
+cvlt_read(call_frame_t *, void *);
+
+#endif
diff --git a/xlators/features/cloudsync/src/cloudsync.c b/xlators/features/cloudsync/src/cloudsync.c
new file mode 100644
index 00000000000..7f0b9e563b8
--- /dev/null
+++ b/xlators/features/cloudsync/src/cloudsync.c
@@ -0,0 +1,2076 @@
+/*
+ *   Copyright (c) 2018 Red Hat, Inc. <http://www.redhat.com>
+ *   This file is part of GlusterFS.
+ *
+ *   This file is licensed to you under your choice of the GNU Lesser
+ *   General Public License, version 3 or any later version (LGPLv3 or
+ *   later), or the GNU General Public License, version 2 (GPLv2), in all
+ *   cases as published by the Free Software Foundation.
+ */
+
+#include <glusterfs/glusterfs.h>
+#include <glusterfs/xlator.h>
+#include <glusterfs/defaults.h>
+#include "cloudsync.h"
+#include "cloudsync-common.h"
+#include <glusterfs/call-stub.h>
+#include "cloudsync-autogen-fops.h"
+
+#include <string.h>
+#include <dlfcn.h>
+
+/*
+ * Release everything owned by the xlator's private structure.
+ *
+ * When a remote store is attached, its fini hook (if one was registered) is
+ * run on the store config before the store descriptor is freed.  The private
+ * lock is then destroyed and the structure itself released.
+ *
+ * @priv: private structure to tear down; NULL is tolerated.
+ */
+static void
+cs_cleanup_private(cs_private_t *priv)
+{
+    if (!priv)
+        return;
+
+    if (priv->stores) {
+        /* The descriptor is zero-allocated and cs_init() never fills in the
+         * fini slot, so an unconditional call would jump through a NULL
+         * function pointer. */
+        if (priv->stores->fini)
+            priv->stores->fini(priv->stores->config);
+
+        GF_FREE(priv->stores);
+        priv->stores = NULL;
+    }
+
+    pthread_spin_destroy(&priv->lock);
+    GF_FREE(priv);
+}
+
+/*
+ * Table of known remote-store plugins.  cs_init() matches the
+ * "cloudsync-storetype" option against .name and loads .library from
+ * CS_PLUGINDIR.  The entry with a NULL name terminates the table.
+ */
+static struct cs_plugin plugins[] = {
+    {.name = "cloudsyncs3",
+     .library = "cloudsyncs3.so",
+     .description = "cloudsync s3 store."},
+#if defined(__linux__)
+    /* only offered on Linux builds */
+    {.name = "cvlt",
+     .library = "cloudsynccvlt.so",
+     .description = "Commvault content store."},
+#endif
+    {.name = NULL},
+};
+
+/*
+ * Xlator init: allocate the private state, create the local pool and, when
+ * the "cloudsync-storetype" option names a known plugin, load that plugin
+ * and bind its store methods.
+ *
+ * Returns 0 on success, and also when no plugin is configured or the plugin
+ * library cannot be dlopen()ed (the xlator then runs without a remote
+ * store).  Returns -1 on allocation failure, on a missing plugin entry point
+ * or when the plugin's init yields no config; everything set up here is torn
+ * down again in that case.
+ */
+int
+cs_init(xlator_t *this)
+{
+    cs_private_t *priv = NULL;
+    gf_boolean_t per_vol = _gf_false;
+    int ret = 0;
+    char *libpath = NULL;
+    store_methods_t *store_methods = NULL;
+    void *handle = NULL;
+    char *temp_str = NULL;
+    int index = 0;
+    char *libname = NULL;
+
+    priv = GF_CALLOC(1, sizeof(*priv), gf_cs_mt_cs_private_t);
+    if (!priv) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, 0, "insufficient memory");
+        /* without this the allocation failure was reported as success */
+        ret = -1;
+        goto out;
+    }
+
+    priv->this = this;
+
+    this->local_pool = mem_pool_new(cs_local_t, 512);
+    if (!this->local_pool) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, ENOMEM, "initialisation failed.");
+        ret = -1;
+        goto out;
+    }
+
+    this->private = priv;
+
+    GF_OPTION_INIT("cloudsync-remote-read", priv->remote_read, bool, out);
+
+    /* temp workaround. Should be configurable through glusterd*/
+    per_vol = _gf_true;
+
+    if (per_vol) {
+        if (dict_get_str_sizen(this->options, "cloudsync-storetype",
+                               &temp_str) == 0) {
+            for (index = 0; plugins[index].name; index++) {
+                if (!strcmp(temp_str, plugins[index].name)) {
+                    libname = plugins[index].library;
+                    break;
+                }
+            }
+        } else {
+            ret = 0;
+        }
+
+        /* no (known) store type configured: not an error */
+        if (!libname) {
+            gf_msg(this->name, GF_LOG_WARNING, 0, 0, "no plugin enabled");
+            ret = 0;
+            goto out;
+        }
+
+        ret = gf_asprintf(&libpath, "%s/%s", CS_PLUGINDIR, libname);
+        if (ret == -1) {
+            goto out;
+        }
+
+        /* a plugin that cannot be loaded is tolerated as well */
+        handle = dlopen(libpath, RTLD_NOW);
+        if (!handle) {
+            gf_msg(this->name, GF_LOG_WARNING, 0, 0,
+                   "could not "
+                   "load the required library. %s",
+                   dlerror());
+            ret = 0;
+            goto out;
+        } else {
+            gf_msg(this->name, GF_LOG_INFO, 0, 0,
+                   "loading library:%s successful", libname);
+        }
+
+        priv->stores = GF_CALLOC(1, sizeof(struct cs_remote_stores),
+                                 gf_cs_mt_cs_remote_stores_t);
+        if (!priv->stores) {
+            gf_msg(this->name, GF_LOG_ERROR, 0, 0,
+                   "Could not "
+                   "allocate memory for priv->stores");
+            ret = -1;
+            goto out;
+        }
+
+        (void)dlerror(); /* clear out previous error string */
+
+        /* load library methods */
+        store_methods = (store_methods_t *)dlsym(handle, "store_ops");
+        if (!store_methods) {
+            gf_msg(this->name, GF_LOG_ERROR, 0, 0, "null store_methods %s",
+                   dlerror());
+            ret = -1;
+            goto out;
+        }
+
+        (void)dlerror();
+
+        /* Bind the read fop unconditionally: "cloudsync-remote-read" can be
+         * switched on later through reconfigure, and the slot must not be
+         * NULL then.  It is only mandatory when remote reads are on now. */
+        priv->stores->rdfop = store_methods->fop_remote_read;
+        if (priv->remote_read && !priv->stores->rdfop) {
+            gf_msg(this->name, GF_LOG_ERROR, 0, 0,
+                   "failed to get"
+                   " read fop %s",
+                   dlerror());
+            ret = -1;
+            goto out;
+        }
+
+        priv->stores->dlfop = store_methods->fop_download;
+        if (!priv->stores->dlfop) {
+            gf_msg(this->name, GF_LOG_ERROR, 0, 0,
+                   "failed to get"
+                   " download fop %s",
+                   dlerror());
+            ret = -1;
+            goto out;
+        }
+
+        (void)dlerror();
+        priv->stores->init = store_methods->fop_init;
+        if (!priv->stores->init) {
+            gf_msg(this->name, GF_LOG_ERROR, 0, 0,
+                   "failed to get"
+                   " init fop %s",
+                   dlerror());
+            ret = -1;
+            goto out;
+        }
+
+        (void)dlerror();
+        priv->stores->reconfigure = store_methods->fop_reconfigure;
+        if (!priv->stores->reconfigure) {
+            gf_msg(this->name, GF_LOG_ERROR, 0, 0,
+                   "failed to get"
+                   " reconfigure fop %s",
+                   dlerror());
+            ret = -1;
+            goto out;
+        }
+
+        priv->stores->handle = handle;
+
+        /* the store's own init is the last step; its result is the opaque
+         * config handed back to every store method */
+        priv->stores->config = (void *)((priv->stores->init)(this));
+        if (!priv->stores->config) {
+            gf_msg(this->name, GF_LOG_ERROR, 0, 0, "null config");
+            ret = -1;
+            goto out;
+        }
+    }
+
+    ret = 0;
+
+out:
+    if (ret == -1) {
+        if (this->local_pool) {
+            mem_pool_destroy(this->local_pool);
+            this->local_pool = NULL;
+        }
+
+        /* Every failure above happens before the store's init produced a
+         * config, so there is nothing for a fini hook to undo.  Drop the
+         * half-built descriptor here so cs_cleanup_private() does not run
+         * the store teardown on a store that never came up. */
+        if (priv && priv->stores) {
+            GF_FREE(priv->stores);
+            priv->stores = NULL;
+        }
+
+        cs_cleanup_private(priv);
+        /* priv is gone; do not leave a dangling pointer behind */
+        this->private = NULL;
+
+        if (handle) {
+            dlclose(handle);
+        }
+    }
+
+    GF_FREE(libpath);
+
+    return ret;
+}
+
+/*
+ * Inode forget hook: detach this xlator's context from @inode and free it
+ * if one was stored.  Always returns 0.
+ */
+int
+cs_forget(xlator_t *this, inode_t *inode)
+{
+    uint64_t stored = 0;
+
+    inode_ctx_del(inode, this, &stored);
+
+    if (stored != 0) {
+        /* the context pointer is kept in the inode as an integer */
+        cs_inode_ctx_t *inode_ctx = (cs_inode_ctx_t *)(uintptr_t)stored;
+
+        GF_FREE(inode_ctx);
+    }
+
+    return 0;
+}
+
+/*
+ * Xlator fini: release the private state and the local pool created by
+ * cs_init().  Both pointers are cleared so nothing can reach the freed
+ * memory afterwards.
+ */
+void
+cs_fini(xlator_t *this)
+{
+    cs_private_t *priv = NULL;
+
+    priv = this->private;
+    /* detach before freeing so this->private never dangles */
+    this->private = NULL;
+
+    cs_cleanup_private(priv);
+
+    /* mirrors the teardown done on cs_init()'s error path */
+    if (this->local_pool) {
+        mem_pool_destroy(this->local_pool);
+        this->local_pool = NULL;
+    }
+}
+
+/*
+ * Xlator reconfigure: re-read "cloudsync-remote-read" and, when a remote
+ * store plugin is loaded, let it reconfigure itself from @options.
+ *
+ * Returns -1 when the xlator has no private state, otherwise the plugin's
+ * reconfigure result (0 when no plugin is loaded).
+ */
+int
+cs_reconfigure(xlator_t *this, dict_t *options)
+{
+    cs_private_t *priv = NULL;
+    int ret = 0;
+
+    priv = this->private;
+    if (!priv) {
+        ret = -1;
+        goto out;
+    }
+
+    GF_OPTION_RECONF("cloudsync-remote-read", priv->remote_read, options, bool,
+                     out);
+
+    /* needed only for per volume configuration.  cs_init() succeeds without
+     * a store when no plugin is configured or loadable, so priv->stores may
+     * legitimately be NULL here. */
+    if (priv->stores && priv->stores->reconfigure)
+        ret = priv->stores->reconfigure(this, options);
+
+out:
+    return ret;
+}
+
+/*
+ * Set up memory accounting for this xlator with gf_cs_mt_end + 1 types.
+ * Returns the result of xlator_mem_acct_init(), or -1 when @this is NULL.
+ */
+int32_t
+cs_mem_acct_init(xlator_t *this)
+{
+    int status = -1;
+
+    GF_VALIDATE_OR_GOTO("cloudsync", this, out);
+
+    status = xlator_mem_acct_init(this, gf_cs_mt_end + 1);
+    if (status != 0) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, 0, "Memory accounting init failed");
+    }
+
+out:
+    return status;
+}
+
+/*
+ * readdirp fop: tag the request with GF_CS_OBJECT_STATUS and pass it to the
+ * first child; the reply goes through default_readdirp_cbk.
+ *
+ * On any setup failure the fop is unwound with -1/ENOMEM.  Always returns 0.
+ */
+int32_t
+cs_readdirp(call_frame_t *frame, xlator_t *this, fd_t *fd, size_t size,
+            off_t off, dict_t *xdata)
+{
+    int ret = 0;
+    int op_errno = ENOMEM;
+
+    /* Hold our own reference in both cases (caller's dict or a fresh one) so
+     * that a single unref below balances it; previously a dict created here
+     * was never released. */
+    xdata = xdata ? dict_ref(xdata) : dict_new();
+    if (!xdata) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, ENOMEM,
+               "failed to create "
+               "dict");
+        goto err;
+    }
+
+    ret = dict_set_uint32(xdata, GF_CS_OBJECT_STATUS, 1);
+    if (ret) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, 0,
+               "dict_set failed key:"
+               " %s",
+               GF_CS_OBJECT_STATUS);
+        goto err;
+    }
+
+    STACK_WIND(frame, default_readdirp_cbk, FIRST_CHILD(this),
+               FIRST_CHILD(this)->fops->readdirp, fd, size, off, xdata);
+
+    dict_unref(xdata);
+    return 0;
+err:
+    STACK_UNWIND_STRICT(readdirp, frame, -1, op_errno, NULL, NULL);
+
+    if (xdata)
+        dict_unref(xdata);
+    return 0;
+}
+
+/*
+ * Callback for the truncate wound by cs_truncate().
+ *
+ * Success marks the inode GF_CS_LOCAL and unwinds.  On failure the object
+ * state reported in @xdata decides what happens: on the first attempt a
+ * remote or downloading file is handed to locate_and_execute(); every other
+ * outcome unwinds to the caller.  Always returns 0.
+ */
+int32_t
+cs_truncate_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+                int32_t op_ret, int32_t op_errno, struct iatt *prebuf,
+                struct iatt *postbuf, dict_t *xdata)
+{
+    cs_local_t *local = frame->local;
+    uint64_t val = 0;
+
+    local->call_cnt++;
+
+    if (op_ret != -1) {
+        /* successful write => file is local */
+        __cs_inode_ctx_update(this, local->loc.inode, GF_CS_LOCAL);
+        gf_msg(this->name, GF_LOG_INFO, 0, 0,
+               "state : GF_CS_LOCAL"
+               ", truncate successful");
+        goto unwind;
+    }
+
+    gf_msg(this->name, GF_LOG_ERROR, 0, 0, "truncate failed");
+
+    if (dict_get_uint64(xdata, GF_CS_OBJECT_STATUS, &val) != 0) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, 0,
+               "file state "
+               "could not be figured, unwinding");
+        goto unwind;
+    }
+
+    if (val == GF_CS_ERROR) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, 0,
+               "could not get file state, unwinding");
+        op_ret = -1;
+        op_errno = EIO;
+        goto unwind;
+    }
+
+    /* remember the state the brick reported */
+    __cs_inode_ctx_update(this, local->loc.inode, val);
+    gf_msg(this->name, GF_LOG_INFO, 0, 0, " state = %" PRIu64, val);
+
+    /* only the first failure of a remote/downloading file is repaired */
+    if (local->call_cnt != 1 ||
+        (val != GF_CS_REMOTE && val != GF_CS_DOWNLOADING)) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, 0, "second truncate, Unwinding");
+        goto unwind;
+    }
+
+    gf_msg(this->name, GF_LOG_WARNING, 0, 0,
+           "will repair and download "
+           "the file, current state : %" PRIu64,
+           val);
+
+    if (locate_and_execute(frame) == 0) {
+        return 0;
+    }
+
+unwind:
+    CS_STACK_UNWIND(truncate, frame, op_ret, op_errno, prebuf, postbuf, xdata);
+    return 0;
+}
+
+/*
+ * truncate fop.
+ *
+ * Builds the frame-local state, tags the request with GF_CS_OBJECT_STATUS
+ * and records a resume stub.  A file whose cached state is GF_CS_LOCAL (or
+ * that has no cached state) is truncated on the first child right away; any
+ * other state goes through locate_and_execute() first.  Setup failures
+ * unwind with -1/ENOMEM.  Always returns 0.
+ */
+int32_t
+cs_truncate(call_frame_t *frame, xlator_t *this, loc_t *loc, off_t offset,
+            dict_t *xdata)
+{
+    cs_local_t *local = NULL;
+    cs_inode_ctx_t *ctx = NULL;
+    gf_cs_obj_state state = -1;
+
+    VALIDATE_OR_GOTO(frame, err);
+    VALIDATE_OR_GOTO(this, err);
+    VALIDATE_OR_GOTO(loc, err);
+
+    local = cs_local_init(this, frame, loc, NULL, GF_FOP_TRUNCATE);
+    if (local == NULL) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, 0, "local init failed");
+        goto err;
+    }
+
+    /* no cached context means the file is treated as local */
+    __cs_inode_ctx_get(this, loc->inode, &ctx);
+    state = ctx ? __cs_get_file_state(loc->inode, ctx) : GF_CS_LOCAL;
+
+    if (xdata) {
+        local->xattr_req = dict_ref(xdata);
+    } else {
+        /* the fresh dict doubles as the xdata passed further down */
+        xdata = dict_new();
+        local->xattr_req = xdata;
+    }
+
+    if (dict_set_uint32(local->xattr_req, GF_CS_OBJECT_STATUS, 1)) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, 0,
+               "dict_set failed key:"
+               " %s",
+               GF_CS_OBJECT_STATUS);
+        goto err;
+    }
+
+    local->stub = fop_truncate_stub(frame, cs_resume_truncate, loc, offset,
+                                    xdata);
+    if (local->stub == NULL) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, 0, "insufficient memory");
+        goto err;
+    }
+
+    if (state != GF_CS_LOCAL) {
+        local->call_cnt++;
+        if (locate_and_execute(frame)) {
+            goto err;
+        }
+        return 0;
+    }
+
+    STACK_WIND(frame, cs_truncate_cbk, FIRST_CHILD(this),
+               FIRST_CHILD(this)->fops->truncate, loc, offset, xdata);
+    return 0;
+
+err:
+    CS_STACK_UNWIND(truncate, frame, -1, ENOMEM, NULL, NULL, NULL);
+    return 0;
+}
+
+/* statfs callback: hands the child's result back to the caller unchanged. */
+int32_t
+cs_statfs_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret,
+              int32_t op_errno, struct statvfs *buf, dict_t *xdata)
+{
+    STACK_UNWIND_STRICT(statfs, frame, op_ret, op_errno, buf, xdata);
+    return 0;
+}
+
+/* statfs fop: forwarded to the first child with its arguments untouched. */
+int32_t
+cs_statfs(call_frame_t *frame, xlator_t *this, loc_t *loc, dict_t *xdata)
+{
+    STACK_WIND(frame, cs_statfs_cbk, FIRST_CHILD(this),
+               FIRST_CHILD(this)->fops->statfs, loc, xdata);
+    return 0;
+}
+
+/* getxattr callback: hands the child's result back to the caller unchanged. */
+int32_t
+cs_getxattr_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+                int32_t op_ret, int32_t op_errno, dict_t *dict, dict_t *xdata)
+{
+    STACK_UNWIND_STRICT(getxattr, frame, op_ret, op_errno, dict, xdata);
+    return 0;
+}
+
+/* getxattr fop: forwarded to the first child with its arguments untouched. */
+int32_t
+cs_getxattr(call_frame_t *frame, xlator_t *this, loc_t *loc, const char *name,
+            dict_t *xattr_req)
+{
+    STACK_WIND(frame, cs_getxattr_cbk, FIRST_CHILD(this),
+               FIRST_CHILD(this)->fops->getxattr, loc, name, xattr_req);
+    return 0;
+}
+
+/*
+ * setxattr callback: drop the inodelk if this frame still holds it, then
+ * unwind the child's result to the caller.  Always returns 0.
+ */
+int32_t
+cs_setxattr_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+                int32_t op_ret, int32_t op_errno, dict_t *xdata)
+{
+    cs_local_t *local = frame->local;
+
+    if (local->locked) {
+        cs_inodelk_unlock(frame);
+    }
+
+    CS_STACK_UNWIND(setxattr, frame, op_ret, op_errno, xdata);
+    return 0;
+}
+
+/*
+ * setxattr fop.
+ *
+ * A request carrying GF_CS_OBJECT_UPLOAD_COMPLETE is parked in a resume stub
+ * and routed through locate_and_execute(); every other request is wound to
+ * the first child directly.
+ *
+ * Failures unwind with -1 and an explicit errno: EINVAL for a missing
+ * frame/xlator, ENOMEM (as cs_truncate() does) for setup failures.  The
+ * thread's ambient errno is not used, as it may be 0 or stale at that point.
+ * Always returns 0.
+ */
+int32_t
+cs_setxattr(call_frame_t *frame, xlator_t *this, loc_t *loc, dict_t *dict,
+            int32_t flags, dict_t *xdata)
+{
+    data_t *tmp = NULL;
+    cs_local_t *local = NULL;
+    int ret = 0;
+    int op_errno = EINVAL;
+
+    VALIDATE_OR_GOTO(frame, err);
+    VALIDATE_OR_GOTO(this, err);
+
+    /* from here on only allocation/setup steps can fail */
+    op_errno = ENOMEM;
+
+    local = cs_local_init(this, frame, loc, NULL, GF_FOP_SETXATTR);
+    if (!local)
+        goto err;
+
+    /* a dict created here is owned by local and reused as xdata below */
+    local->xattr_req = xdata ? dict_ref(xdata) : (xdata = dict_new());
+
+    tmp = dict_get_sizen(dict, GF_CS_OBJECT_UPLOAD_COMPLETE);
+    if (tmp) {
+        /* Value of key should be the atime */
+        local->stub = fop_setxattr_stub(frame, cs_resume_setxattr, loc, dict,
+                                        flags, xdata);
+
+        if (!local->stub)
+            goto err;
+
+        ret = locate_and_execute(frame);
+        if (ret) {
+            goto err;
+        }
+
+        return 0;
+    }
+
+    STACK_WIND(frame, cs_setxattr_cbk, FIRST_CHILD(this),
+               FIRST_CHILD(this)->fops->setxattr, loc, dict, flags, xdata);
+    return 0;
+err:
+    CS_STACK_UNWIND(setxattr, frame, -1, op_errno, NULL);
+    return 0;
+}
+
+/* fgetxattr callback: hands the child's result back to the caller unchanged. */
+int32_t
+cs_fgetxattr_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+                 int32_t op_ret, int32_t op_errno, dict_t *dict, dict_t *xdata)
+{
+    STACK_UNWIND_STRICT(fgetxattr, frame, op_ret, op_errno, dict, xdata);
+    return 0;
+}
+
+/* fgetxattr fop: forwarded to the first child with its arguments untouched. */
+int32_t
+cs_fgetxattr(call_frame_t *frame, xlator_t *this, fd_t *fd, const char *name,
+             dict_t *xdata)
+{
+    STACK_WIND(frame, cs_fgetxattr_cbk, FIRST_CHILD(this),
+               FIRST_CHILD(this)->fops->fgetxattr, fd, name, xdata);
+    return 0;
+}
+
+/* fsetxattr callback: hands the child's result back to the caller unchanged. */
+int32_t
+cs_fsetxattr_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+                 int32_t op_ret, int32_t op_errno, dict_t *xdata)
+{
+    STACK_UNWIND_STRICT(fsetxattr, frame, op_ret, op_errno, xdata);
+    return 0;
+}
+
+/* fsetxattr fop: forwarded to the first child with its arguments untouched. */
+int32_t
+cs_fsetxattr(call_frame_t *frame, xlator_t *this, fd_t *fd, dict_t *dict,
+             int32_t flags, dict_t *xdata)
+{
+    STACK_WIND(frame, cs_fsetxattr_cbk, FIRST_CHILD(this),
+               FIRST_CHILD(this)->fops->fsetxattr, fd, dict, flags, xdata);
+    return 0;
+}
+
+/* unlink callback: hands the child's result back to the caller unchanged. */
+int32_t
+cs_unlink_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret,
+              int32_t op_errno, struct iatt *preparent, struct iatt *postparent,
+              dict_t *xdata)
+{
+    STACK_UNWIND_STRICT(unlink, frame, op_ret, op_errno, preparent, postparent,
+                        xdata);
+    return 0;
+}
+
+/*
+ * unlink fop: tag the request with GF_CS_OBJECT_STATUS and wind it to the
+ * first child.
+ *
+ * Every failure here is an allocation/setup failure, so the fop is unwound
+ * with -1/ENOMEM rather than with the thread's ambient errno, which may be 0
+ * or stale at that point.  Always returns 0.
+ */
+int32_t
+cs_unlink(call_frame_t *frame, xlator_t *this, loc_t *loc, int32_t flags,
+          dict_t *xattr_req)
+{
+    cs_local_t *local = NULL;
+    int ret = 0;
+
+    local = cs_local_init(this, frame, loc, NULL, GF_FOP_UNLINK);
+    if (!local)
+        goto err;
+
+    /* local owns this reference (or the freshly created dict) */
+    local->xattr_req = xattr_req ? dict_ref(xattr_req) : dict_new();
+
+    ret = dict_set_uint32(local->xattr_req, GF_CS_OBJECT_STATUS, 1);
+    if (ret) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, 0,
+               "dict_set failed key:"
+               " %s",
+               GF_CS_OBJECT_STATUS);
+        goto err;
+    }
+    STACK_WIND(frame, cs_unlink_cbk, FIRST_CHILD(this),
+               FIRST_CHILD(this)->fops->unlink, loc, flags, local->xattr_req);
+    return 0;
+err:
+    CS_STACK_UNWIND(unlink, frame, -1, ENOMEM, NULL, NULL, NULL);
+    return 0;
+}
+
+/*
+ * open callback.
+ *
+ * On success the object state reported in @xdata (if any) is cached in the
+ * inode context; on failure the cached context is reset.  The result is then
+ * unwound to the caller unchanged.  Always returns 0.
+ */
+int32_t
+cs_open_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret,
+            int32_t op_errno, fd_t *fd, dict_t *xdata)
+{
+    int ret = 0;
+    uint64_t val = 0;
+    cs_local_t *local = NULL;
+    inode_t *inode = NULL;
+
+    local = frame->local;
+
+    /* A failed open may be unwound without an fd (cs_open() itself does so),
+     * so do not rely on the callback argument alone: fall back to the fd
+     * that cs_open() stored in the frame-local state. */
+    if (fd)
+        inode = fd->inode;
+    else if (local && local->fd)
+        inode = local->fd->inode;
+
+    if (op_ret == 0) {
+        ret = dict_get_uint64(xdata, GF_CS_OBJECT_STATUS, &val);
+        if (!ret && inode) {
+            ret = __cs_inode_ctx_update(this, inode, val);
+            if (ret) {
+                gf_msg(this->name, GF_LOG_ERROR, 0, 0, "ctx update failed");
+            }
+        }
+    } else if (inode) {
+        cs_inode_ctx_reset(this, inode);
+    }
+
+    CS_STACK_UNWIND(open, frame, op_ret, op_errno, fd, xdata);
+    return 0;
+}
+
+/*
+ * open fop: tag the request with GF_CS_OBJECT_STATUS and wind it to the
+ * first child; cs_open_cbk() caches the state that comes back.
+ *
+ * Every failure here is an allocation/setup failure, so the fop is unwound
+ * with -1/ENOMEM rather than with the thread's ambient errno, which may be 0
+ * or stale at that point.  Always returns 0.
+ */
+int32_t
+cs_open(call_frame_t *frame, xlator_t *this, loc_t *loc, int32_t flags,
+        fd_t *fd, dict_t *xattr_req)
+{
+    cs_local_t *local = NULL;
+    int ret = 0;
+
+    /* only the fd is recorded in local; the loc is passed through as is */
+    local = cs_local_init(this, frame, NULL, fd, GF_FOP_OPEN);
+    if (!local)
+        goto err;
+
+    local->xattr_req = xattr_req ? dict_ref(xattr_req) : dict_new();
+
+    ret = dict_set_uint32(local->xattr_req, GF_CS_OBJECT_STATUS, 1);
+    if (ret) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, 0,
+               "dict_set failed key:"
+               " %s",
+               GF_CS_OBJECT_STATUS);
+        goto err;
+    }
+
+    STACK_WIND(frame, cs_open_cbk, FIRST_CHILD(this),
+               FIRST_CHILD(this)->fops->open, loc, flags, fd, local->xattr_req);
+    return 0;
+err:
+    CS_STACK_UNWIND(open, frame, -1, ENOMEM, NULL, NULL);
+    return 0;
+}
+
+/*
+ * fstat callback.
+ *
+ * Uses the fd recorded in the frame-local state: on success the object
+ * state reported in @xdata (if any) is cached in the inode context, on
+ * failure the cached context is reset.  The result is then unwound to the
+ * caller unchanged.  Always returns 0.
+ */
+int32_t
+cs_fstat_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret,
+             int32_t op_errno, struct iatt *buf, dict_t *xdata)
+{
+    cs_local_t *local = frame->local;
+    fd_t *fd = local->fd;
+    uint64_t val = 0;
+
+    if (op_ret != 0) {
+        cs_inode_ctx_reset(this, fd->inode);
+    } else if (dict_get_uint64(xdata, GF_CS_OBJECT_STATUS, &val) == 0) {
+        gf_msg_debug(this->name, 0, "state %" PRIu64, val);
+
+        if (__cs_inode_ctx_update(this, fd->inode, val)) {
+            gf_msg(this->name, GF_LOG_ERROR, 0, 0, "ctx update failed");
+        }
+    }
+
+    CS_STACK_UNWIND(fstat, frame, op_ret, op_errno, buf, xdata);
+    return 0;
+}
+
+/*
+ * fstat fop.
+ *
+ * For non-directories the request is tagged with GF_CS_OBJECT_STATUS before
+ * it is wound to the first child.  Directories are wound untagged, but still
+ * with the caller's request dict; it used to be dropped for them.
+ *
+ * Every failure here is an allocation/setup failure, so the fop is unwound
+ * with -1/ENOMEM rather than with the thread's ambient errno, which may be 0
+ * or stale at that point.  Always returns 0.
+ */
+int32_t
+cs_fstat(call_frame_t *frame, xlator_t *this, fd_t *fd, dict_t *xattr_req)
+{
+    cs_local_t *local = NULL;
+    int ret = 0;
+
+    local = cs_local_init(this, frame, NULL, fd, GF_FOP_FSTAT);
+    if (!local)
+        goto err;
+
+    if (fd->inode->ia_type == IA_IFDIR) {
+        /* no status tag for directories; just keep what the caller sent */
+        if (xattr_req)
+            local->xattr_req = dict_ref(xattr_req);
+        goto wind;
+    }
+
+    local->xattr_req = xattr_req ? dict_ref(xattr_req) : dict_new();
+
+    ret = dict_set_uint32(local->xattr_req, GF_CS_OBJECT_STATUS, 1);
+    if (ret) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, 0,
+               "dict_set failed key:"
+               " %s",
+               GF_CS_OBJECT_STATUS);
+        goto err;
+    }
+
+wind:
+    STACK_WIND(frame, cs_fstat_cbk, FIRST_CHILD(this),
+               FIRST_CHILD(this)->fops->fstat, fd, local->xattr_req);
+    return 0;
+err:
+    CS_STACK_UNWIND(fstat, frame, -1, ENOMEM, NULL, NULL);
+    return 0;
+}
+
+/*
+ * Allocate the per-call state from the xlator's local pool and attach it to
+ * @frame.
+ *
+ * @loc is copied and @fd is referenced when given.  The result fields start
+ * out as op_ret = -1 / op_errno = EUCLEAN.
+ *
+ * Returns the new local, or NULL when the allocation or the loc copy fails
+ * (frame->local is left untouched in that case).
+ */
+cs_local_t *
+cs_local_init(xlator_t *this, call_frame_t *frame, loc_t *loc, fd_t *fd,
+              glusterfs_fop_t fop)
+{
+    cs_local_t *local = mem_get0(this->local_pool);
+
+    if (local == NULL) {
+        return NULL;
+    }
+
+    if (loc != NULL && loc_copy(&local->loc, loc) != 0) {
+        mem_put(local);
+        return NULL;
+    }
+
+    if (fd != NULL) {
+        local->fd = fd_ref(fd);
+    }
+
+    local->fop = fop;
+    local->op_ret = -1;
+    local->op_errno = EUCLEAN;
+    local->dloffset = 0;
+    local->call_cnt = 0;
+    local->locked = _gf_false;
+
+    frame->local = local;
+
+    return local;
+}
+
+/*
+ * Create a copy of @parent_frame for a lock request.  The copy's lk_owner is
+ * derived from the parent's root.  Returns NULL when the frame cannot be
+ * copied.
+ */
+call_frame_t *
+cs_lock_frame(call_frame_t *parent_frame)
+{
+    call_frame_t *copy = copy_frame(parent_frame);
+
+    if (copy != NULL) {
+        set_lk_owner_from_ptr(&copy->root->lk_owner, parent_frame->root);
+    }
+
+    return copy;
+}
+
+/* Dispose of a lock frame by destroying its call stack. */
+void
+cs_lock_wipe(call_frame_t *lock_frame)
+{
+    CS_STACK_DESTROY(lock_frame);
+}
+
+/*
+ * Callback of the unlock request sent by cs_inodelk_unlock(): the result is
+ * ignored, the lock frame is simply disposed of.  Always returns 0.
+ */
+int32_t
+cs_inodelk_unlock_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+                      int32_t op_ret, int32_t op_errno, dict_t *xdata)
+{
+    cs_lock_wipe(frame);
+
+    return 0;
+}
+
+/*
+ * Release the inodelk taken on behalf of @main_frame.
+ *
+ * The unlock is sent on a separate lock frame (see cs_lock_frame()) in the
+ * CS_LOCK_DOMAIN domain, and main_frame's local is marked unlocked just
+ * before it goes out.  If the lock frame cannot be set up, the lock is left
+ * behind on the server and only an error is logged.  Always returns 0.
+ */
+int
+cs_inodelk_unlock(call_frame_t *main_frame)
+{
+    xlator_t *this = main_frame->this;
+    cs_local_t *main_local = main_frame->local;
+    struct gf_flock flock = {
+        0,
+    };
+    call_frame_t *lock_frame = NULL;
+    cs_local_t *lock_local = NULL;
+
+    lock_frame = cs_lock_frame(main_frame);
+    if (lock_frame == NULL) {
+        goto stale;
+    }
+
+    lock_local = cs_local_init(this, lock_frame, NULL, NULL, 0);
+    if (lock_local == NULL) {
+        goto stale;
+    }
+
+    if (cs_build_loc(&lock_local->loc, main_frame) != 0) {
+        goto stale;
+    }
+
+    flock.l_type = F_UNLCK;
+    main_local->locked = _gf_false;
+
+    STACK_WIND(lock_frame, cs_inodelk_unlock_cbk, FIRST_CHILD(this),
+               FIRST_CHILD(this)->fops->inodelk, CS_LOCK_DOMAIN,
+               &lock_local->loc, F_SETLKW, &flock, NULL);
+
+    return 0;
+
+stale:
+    gf_msg(this->name, GF_LOG_ERROR, 0, 0,
+           "Stale lock would be found on"
+           " server");
+
+    if (lock_frame != NULL) {
+        cs_lock_wipe(lock_frame);
+    }
+
+    return 0;
+}
+
+/*
+ * Download the file behind a frame from the remote store.
+ *
+ * @arg is the call frame.  The steps are:
+ *   1. open an anonymous fd on the inode and publish it as local->dlfd,
+ *   2. mark the file with the GF_CS_OBJECT_DOWNLOADING xattr,
+ *   3. run the store's download method,
+ *   4. on success remove the GF_CS_OBJECT_REMOTE and
+ *      GF_CS_OBJECT_DOWNLOADING xattrs; on failure truncate the file to 0.
+ *
+ * Returns 0 on success and -1 on any failure.  The anonymous fd is released
+ * and local->dlfd cleared before returning.
+ */
+int
+cs_download_task(void *arg)
+{
+    call_frame_t *frame = NULL;
+    xlator_t *this = NULL;
+    cs_private_t *priv = NULL;
+    int ret = -1;
+    fd_t *fd = NULL;
+    fd_t *xattr_fd = NULL;
+    cs_local_t *local = NULL;
+    dict_t *dict = NULL;
+
+    frame = (call_frame_t *)arg;
+
+    this = frame->this;
+
+    priv = this->private;
+
+    if (!priv->stores) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, 0,
+               "No remote store "
+               "plugins found");
+        ret = -1;
+        goto out;
+    }
+
+    local = frame->local;
+
+    if (local->fd)
+        fd = fd_anonymous(local->fd->inode);
+    else
+        fd = fd_anonymous(local->loc.inode);
+
+    if (!fd) {
+        gf_msg("CS", GF_LOG_ERROR, 0, 0, "fd creation failed");
+        ret = -1;
+        goto out;
+    }
+
+    local->dlfd = fd;
+    local->dloffset = 0;
+
+    /* Path-based fops (e.g. truncate) set up their local without an fd, so
+     * local->fd can be NULL here.  The xattr operations below then go
+     * through the anonymous fd instead of being handed a NULL fd. */
+    xattr_fd = local->fd ? local->fd : fd;
+
+    dict = dict_new();
+    if (!dict) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, ENOMEM,
+               "failed to create "
+               "dict");
+        ret = -1;
+        goto out;
+    }
+
+    ret = dict_set_uint32(dict, GF_CS_OBJECT_DOWNLOADING, 1);
+    if (ret) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, 0, "dict_set failed");
+        ret = -1;
+        goto out;
+    }
+
+    ret = syncop_fsetxattr(this, xattr_fd, dict, 0, NULL, NULL);
+    if (ret) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, 0,
+               "fsetxattr failed "
+               "key %s",
+               GF_CS_OBJECT_DOWNLOADING);
+        ret = -1;
+        goto out;
+    }
+    /*this calling method is for per volume setting */
+    ret = priv->stores->dlfop(frame, priv->stores->config);
+    if (ret) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, 0,
+               "download failed"
+               ", remotepath: %s",
+               local->remotepath);
+
+        /*using dlfd as it is anonymous and have RDWR flag*/
+        ret = syncop_ftruncate(FIRST_CHILD(this), local->dlfd, 0, NULL, NULL,
+                               NULL, NULL);
+        if (ret) {
+            gf_msg(this->name, GF_LOG_ERROR, 0, -ret, "ftruncate failed");
+        } else {
+            gf_msg_debug(this->name, 0, "ftruncate succeed");
+        }
+
+        /* the download itself failed, whatever the truncate did */
+        ret = -1;
+        goto out;
+    } else {
+        gf_msg(this->name, GF_LOG_INFO, 0, 0,
+               "download success, path"
+               " : %s",
+               local->remotepath);
+
+        ret = syncop_fremovexattr(this, xattr_fd, GF_CS_OBJECT_REMOTE, NULL,
+                                  NULL);
+        if (ret) {
+            gf_msg(this->name, GF_LOG_ERROR, 0, -ret,
+                   "removexattr failed, remotexattr");
+            ret = -1;
+            goto out;
+        } else {
+            gf_msg_debug(this->name, 0,
+                         "fremovexattr success, "
+                         "path : %s",
+                         local->remotepath);
+        }
+
+        ret = syncop_fremovexattr(this, xattr_fd, GF_CS_OBJECT_DOWNLOADING,
+                                  NULL, NULL);
+        if (ret) {
+            gf_msg(this->name, GF_LOG_ERROR, 0, -ret,
+                   "removexattr failed, downloading xattr, path %s",
+                   local->remotepath);
+            ret = -1;
+            goto out;
+        } else {
+            gf_msg_debug(this->name, 0,
+                         "fremovexattr success"
+                         " path  %s",
+                         local->remotepath);
+        }
+    }
+
+out:
+    if (dict)
+        dict_unref(dict);
+
+    /* fd is only non-NULL once local has been fetched */
+    if (fd) {
+        fd_unref(fd);
+        local->dlfd = NULL;
+    }
+
+    return ret;
+}
+
+/*
+ * Start the download for @frame.  Requires the remote path to be known in
+ * the frame-local state; returns -1 if it is not, otherwise the result of
+ * cs_download_task().
+ */
+int
+cs_download(call_frame_t *frame)
+{
+    cs_local_t *local = frame->local;
+    xlator_t *this = frame->this;
+
+    if (local->remotepath) {
+        return cs_download_task((void *)frame);
+    }
+
+    gf_msg(this->name, GF_LOG_ERROR, 0, 0,
+           "remote path not"
+           " available. Check posix logs to resolve");
+    return -1;
+}
+
+/*
+ * Ask for the archive UUID xattr by adding GF_CS_XATTR_ARCHIVE_UUID to the
+ * frame-local request dict.
+ *
+ * Best effort: a failure to set the key is logged but not propagated, so the
+ * function always returns 0 as before.
+ */
+int
+cs_set_xattr_req(call_frame_t *frame)
+{
+    cs_local_t *local = NULL;
+    xlator_t *this = NULL;
+    int ret = 0;
+
+    local = frame->local;
+    this = frame->this;
+
+    /* When remote reads are performed (i.e. reads on remote store),
+     * there needs to be a way to associate a file on gluster volume
+     * with its corresponding file on the remote store. In order to do
+     * that, a unique key can be maintained as an xattr
+     * (GF_CS_XATTR_ARCHIVE_UUID) on the stub file on gluster bricks.
+     * This xattr should be provided to the plugin to
+     * perform the read fop on the correct file. This assumes that the file
+     * hierarchy and name need not be the same on remote store as that of
+     * the gluster volume.
+     */
+    ret = dict_set_sizen_str_sizen(local->xattr_req, GF_CS_XATTR_ARCHIVE_UUID,
+                                   "1");
+    if (ret) {
+        /* previously discarded silently; leave a trace for diagnosis */
+        gf_msg(this->name, GF_LOG_WARNING, 0, 0,
+               "dict_set failed key:"
+               " %s",
+               GF_CS_XATTR_ARCHIVE_UUID);
+    }
+
+    return 0;
+}
+
+/*
+ * Fill local->xattrinfo.lxattr for a remote read: gfid, remote file path,
+ * archive UUID (cleared when @xdata carries none) and volume name.
+ *
+ * The volume name is the xlator name without its "-cloudsync" suffix.  A
+ * name that does not end in that suffix - including one shorter than the
+ * suffix, which used to underflow the length computation - is used in full.
+ *
+ * Returns 0 on success.  On allocation failure local->op_ret/op_errno are
+ * set to -1/ENOMEM, the xattr info is wiped and -1 is returned.
+ */
+int
+cs_update_xattrs(call_frame_t *frame, dict_t *xdata)
+{
+    static const char suffix[] = "-cloudsync";
+    const size_t suffix_len = sizeof(suffix) - 1;
+    cs_local_t *local = NULL;
+    xlator_t *this = NULL;
+    size_t name_len = 0;
+    size_t vol_len = 0;
+    int ret = 0;
+
+    local = frame->local;
+    this = frame->this;
+
+    local->xattrinfo.lxattr = GF_CALLOC(1, sizeof(cs_loc_xattr_t),
+                                        gf_cs_mt_cs_lxattr_t);
+    if (!local->xattrinfo.lxattr) {
+        local->op_ret = -1;
+        local->op_errno = ENOMEM;
+        goto err;
+    }
+
+    gf_uuid_copy(local->xattrinfo.lxattr->gfid, local->loc.gfid);
+
+    if (local->remotepath) {
+        local->xattrinfo.lxattr->file_path = gf_strdup(local->remotepath);
+        if (!local->xattrinfo.lxattr->file_path) {
+            local->op_ret = -1;
+            local->op_errno = ENOMEM;
+            goto err;
+        }
+    }
+
+    ret = dict_get_gfuuid(xdata, GF_CS_XATTR_ARCHIVE_UUID,
+                          &(local->xattrinfo.lxattr->uuid));
+
+    if (ret) {
+        gf_uuid_clear(local->xattrinfo.lxattr->uuid);
+    }
+
+    /* strip the suffix only when it is really there; all arithmetic stays
+     * unsigned and guarded so it cannot wrap */
+    name_len = strlen(this->name);
+    vol_len = name_len;
+    if (name_len >= suffix_len &&
+        strcmp(this->name + (name_len - suffix_len), suffix) == 0) {
+        vol_len = name_len - suffix_len;
+    }
+
+    /* zero-filled allocation of vol_len + 1 bytes provides the terminator */
+    local->xattrinfo.lxattr->volname = GF_CALLOC(1, vol_len + 1,
+                                                 gf_common_mt_char);
+    if (!local->xattrinfo.lxattr->volname) {
+        local->op_ret = -1;
+        local->op_errno = ENOMEM;
+        goto err;
+    }
+    memcpy(local->xattrinfo.lxattr->volname, this->name, vol_len);
+
+    return 0;
+err:
+    cs_xattrinfo_wipe(local);
+    return -1;
+}
+
+/*
+ * Serve a read from the remote store.
+ *
+ * Records the read parameters in local->xattrinfo, opens an anonymous fd on
+ * the inode (published as local->dlfd, with local->dloffset = @offset) and
+ * calls the store's read method.
+ *
+ * Returns 0 on success; -1 when the remote path is unknown, no store or no
+ * read method is available, the fd cannot be created or the read fails.
+ * The anonymous fd is released and local->dlfd cleared before returning.
+ */
+int
+cs_serve_readv(call_frame_t *frame, off_t offset, size_t size, uint32_t flags)
+{
+    xlator_t *this = NULL;
+    cs_private_t *priv = NULL;
+    int ret = -1;
+    fd_t *fd = NULL;
+    cs_local_t *local = NULL;
+
+    local = frame->local;
+    this = frame->this;
+    priv = this->private;
+
+    if (!local->remotepath) {
+        ret = -1;
+        gf_msg(this->name, GF_LOG_ERROR, 0, 0,
+               "remote path not"
+               " available. Check posix logs to resolve");
+        goto out;
+    }
+
+    if (!priv->stores) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, 0,
+               "No remote store "
+               "plugins found");
+        ret = -1;
+        goto out;
+    }
+
+    /* cs_init() binds the read method only under certain conditions, so the
+     * slot can be NULL; refuse the read instead of calling through it. */
+    if (!priv->stores->rdfop) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, 0,
+               "remote store has no read method");
+        ret = -1;
+        goto out;
+    }
+
+    if (local->fd) {
+        fd = fd_anonymous(local->fd->inode);
+    } else {
+        fd = fd_anonymous(local->loc.inode);
+    }
+
+    local->xattrinfo.size = size;
+    local->xattrinfo.offset = offset;
+    local->xattrinfo.flags = flags;
+
+    if (!fd) {
+        gf_msg("CS", GF_LOG_ERROR, 0, 0, "fd creation failed");
+        ret = -1;
+        goto out;
+    }
+
+    local->dlfd = fd;
+    local->dloffset = offset;
+
+    /*this calling method is for per volume setting */
+    ret = priv->stores->rdfop(frame, priv->stores->config);
+    if (ret) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, 0,
+               "read failed"
+               ", remotepath: %s",
+               local->remotepath);
+        ret = -1;
+        goto out;
+    } else {
+        gf_msg(this->name, GF_LOG_INFO, 0, 0,
+               "read success, path"
+               " : %s",
+               local->remotepath);
+    }
+
+out:
+    if (fd) {
+        fd_unref(fd);
+        local->dlfd = NULL;
+    }
+    return ret;
+}
+
+/*
+ * readv callback.
+ *
+ * Success marks the inode GF_CS_LOCAL and unwinds.  On failure the object
+ * state reported in @xdata decides what happens: on the first attempt a
+ * remote or downloading file is handed to locate_and_execute(); every other
+ * outcome unwinds to the caller.  Always returns 0.
+ */
+int32_t
+cs_readv_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret,
+             int32_t op_errno, struct iovec *vector, int32_t count,
+             struct iatt *stbuf, struct iobref *iobref, dict_t *xdata)
+{
+    cs_local_t *local = frame->local;
+    fd_t *fd = local->fd;
+    uint64_t val = 0;
+
+    local->call_cnt++;
+
+    if (op_ret != -1) {
+        /* successful readv => file is local */
+        __cs_inode_ctx_update(this, fd->inode, GF_CS_LOCAL);
+        gf_msg(this->name, GF_LOG_INFO, 0, 0,
+               "state : GF_CS_LOCAL"
+               ", readv successful");
+        goto unwind;
+    }
+
+    if (dict_get_uint64(xdata, GF_CS_OBJECT_STATUS, &val) != 0) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, 0,
+               "file state "
+               "could not be figured, unwinding");
+        goto unwind;
+    }
+
+    if (val == GF_CS_ERROR) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, 0,
+               "could not get file state, unwinding");
+        op_ret = -1;
+        op_errno = EIO;
+        goto unwind;
+    }
+
+    /* remember the state the brick reported */
+    __cs_inode_ctx_update(this, fd->inode, val);
+    gf_msg(this->name, GF_LOG_INFO, 0, 0, " state = %" PRIu64, val);
+
+    /* only the first failure of a remote/downloading file is repaired */
+    if (local->call_cnt != 1 ||
+        (val != GF_CS_REMOTE && val != GF_CS_DOWNLOADING)) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, 0, "second readv, Unwinding");
+        goto unwind;
+    }
+
+    gf_msg(this->name, GF_LOG_INFO, 0, 0, " will read from remote : %" PRIu64,
+           val);
+
+    if (locate_and_execute(frame) == 0) {
+        return 0;
+    }
+
+unwind:
+    CS_STACK_UNWIND(readv, frame, op_ret, op_errno, vector, count, stbuf,
+                    iobref, xdata);
+
+    return 0;
+}
+/* Resumed readv stub: download the file if needed, then wind readv. */
+int32_t
+cs_resume_readv(call_frame_t *frame, xlator_t *this, fd_t *fd, size_t size,
+                off_t offset, uint32_t flags, dict_t *xdata)
+{
+    int ret = 0;
+
+    ret = cs_resume_postprocess(this, frame, fd->inode);
+    if (ret) {
+        goto unwind;
+    }
+
+    cs_inodelk_unlock(frame);
+
+    STACK_WIND(frame, cs_readv_cbk, FIRST_CHILD(this),
+               FIRST_CHILD(this)->fops->readv, fd, size, offset, flags, xdata);
+
+    return 0;
+
+unwind:
+    cs_inodelk_unlock(frame);
+
+    cs_common_cbk(frame); /* unwinds with local->op_ret/op_errno */
+
+    return 0;
+}
+/* Resumed readv stub (remote-read): wind locally or serve from the store. */
+int32_t
+cs_resume_remote_readv(call_frame_t *frame, xlator_t *this, fd_t *fd,
+                       size_t size, off_t offset, uint32_t flags, dict_t *xdata)
+{
+    int ret = 0;
+    cs_local_t *local = NULL;
+    gf_cs_obj_state state = -1;
+    cs_inode_ctx_t *ctx = NULL;
+
+    cs_inodelk_unlock(frame);
+
+    local = frame->local;
+    if (!local) { /* NOTE(review): the unwind path dereferences local */
+        ret = -1;
+        goto unwind;
+    }
+
+    __cs_inode_ctx_get(this, fd->inode, &ctx);
+
+    state = __cs_get_file_state(fd->inode, ctx);
+    if (state == GF_CS_ERROR) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, 0,
+               "status is GF_CS_ERROR."
+               " Aborting readv");
+        local->op_ret = -1;
+        local->op_errno = EREMOTE;
+        ret = -1;
+        goto unwind;
+    }
+
+    /* Serve readv from remote store only if it is remote. */
+    gf_msg_debug(this->name, 0, "status of file %s is %d",
+                 local->remotepath ? local->remotepath : "", state);
+
+    /* We will reach this condition if local inode ctx had REMOTE
+     * state when the control was in cs_readv but after stat
+     * we got an updated state saying that the file is LOCAL.
+     */
+    if (state == GF_CS_LOCAL) {
+        STACK_WIND(frame, cs_readv_cbk, FIRST_CHILD(this),
+                   FIRST_CHILD(this)->fops->readv, fd, size, offset, flags,
+                   xdata);
+    } else if (state == GF_CS_REMOTE) {
+        ret = cs_resume_remote_readv_postprocess(this, frame, fd->inode, offset,
+                                                 size, flags);
+        /* Failed to submit the remote readv fop to plugin */
+        if (ret) {
+            local->op_ret = -1;
+            local->op_errno = EREMOTE;
+            goto unwind;
+        }
+        /* The branch below covers every other (intermediate) state:
+         * remote reads must not be performed then.
+         */
+    } else {
+        local->op_ret = -1;
+        local->op_errno = EINVAL;
+        goto unwind;
+    }
+
+    return 0;
+
+unwind:
+    cs_common_cbk(frame);
+
+    return 0;
+}
+/* readv entry: wind to the child when state is local, else repair first. */
+int32_t
+cs_readv(call_frame_t *frame, xlator_t *this, fd_t *fd, size_t size,
+         off_t offset, uint32_t flags, dict_t *xdata)
+{
+    int op_errno = ENOMEM;
+    cs_local_t *local = NULL;
+    int ret = 0;
+    cs_inode_ctx_t *ctx = NULL;
+    gf_cs_obj_state state = -1;
+    cs_private_t *priv = NULL;
+
+    VALIDATE_OR_GOTO(frame, err);
+    VALIDATE_OR_GOTO(this, err);
+    VALIDATE_OR_GOTO(fd, err);
+
+    priv = this->private;
+
+    local = cs_local_init(this, frame, NULL, fd, GF_FOP_READ);
+    if (!local) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, 0, "local init failed");
+        goto err;
+    }
+
+    __cs_inode_ctx_get(this, fd->inode, &ctx);
+
+    if (ctx)
+        state = __cs_get_file_state(fd->inode, ctx);
+    else
+        state = GF_CS_LOCAL; /* no inode ctx yet: assume local */
+
+    local->xattr_req = xdata ? dict_ref(xdata) : (xdata = dict_new());
+
+    ret = dict_set_uint32(local->xattr_req, GF_CS_OBJECT_STATUS, 1);
+    if (ret) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, 0,
+               "dict_set failed key:"
+               " %s",
+               GF_CS_OBJECT_STATUS);
+        goto err;
+    }
+
+    if (priv->remote_read) {
+        local->stub = fop_readv_stub(frame, cs_resume_remote_readv, fd, size,
+                                     offset, flags, xdata);
+    } else {
+        local->stub = fop_readv_stub(frame, cs_resume_readv, fd, size, offset,
+                                     flags, xdata);
+    }
+    if (!local->stub) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, 0, "insufficient memory");
+        goto err;
+    }
+
+    if (state == GF_CS_LOCAL) {
+        STACK_WIND(frame, cs_readv_cbk, FIRST_CHILD(this),
+                   FIRST_CHILD(this)->fops->readv, fd, size, offset, flags,
+                   xdata);
+    } else {
+        local->call_cnt++; /* keeps cs_readv_cbk from repairing again */
+        ret = locate_and_execute(frame);
+        if (ret) {
+            goto err;
+        }
+    }
+
+    return 0;
+
+err:
+    CS_STACK_UNWIND(readv, frame, -1, op_errno, NULL, -1, NULL, NULL, NULL);
+
+    return 0;
+}
+
+/*
+ * Hand the read described by @offset/@size/@flags to cs_serve_readv().
+ * @this and @inode are unused here. Returns whatever cs_serve_readv returns.
+ */
+int
+cs_resume_remote_readv_postprocess(xlator_t *this, call_frame_t *frame,
+                                   inode_t *inode, off_t offset, size_t size,
+                                   uint32_t flags)
+{
+    return cs_serve_readv(frame, offset, size, flags);
+}
+/* stat callback: update inode state from xdata, then resume the saved stub. */
+int
+cs_stat_check_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int op_ret,
+                  int op_errno, struct iatt *stbuf, dict_t *xdata)
+{
+    cs_local_t *local = NULL;
+    call_stub_t *stub = NULL;
+    char *filepath = NULL;
+    int ret = 0;
+    inode_t *inode = NULL;
+    uint64_t val = 0;
+
+    local = frame->local;
+
+    if (op_ret == -1) {
+        local->op_ret = op_ret;
+        local->op_errno = op_errno;
+        gf_msg(this->name, GF_LOG_ERROR, 0, op_errno, "stat check failed");
+        goto err;
+    } else {
+        if (local->fd)
+            inode = local->fd->inode;
+        else
+            inode = local->loc.inode;
+
+        if (!inode) {
+            local->op_ret = -1;
+            local->op_errno = EINVAL;
+            gf_msg(this->name, GF_LOG_ERROR, 0, 0,
+                   "null inode "
+                   "returned");
+            goto err;
+        }
+
+        ret = dict_get_uint64(xdata, GF_CS_OBJECT_STATUS, &val);
+        if (ret == 0) {
+            if (val == GF_CS_ERROR) {
+                cs_inode_ctx_reset(this, inode);
+                local->op_ret = -1;
+                local->op_errno = EIO;
+                gf_msg(this->name, GF_LOG_ERROR, 0, 0,
+                       "status = GF_CS_ERROR. failed to get "
+                       " file state");
+                goto err;
+            } else {
+                ret = __cs_inode_ctx_update(this, inode, val);
+                gf_msg_debug(this->name, 0, "status : %" PRIu64, val);
+                if (ret) {
+                    gf_msg(this->name, GF_LOG_ERROR, 0, 0, "ctx update failed");
+                    local->op_ret = -1;
+                    local->op_errno = ENOMEM;
+                    goto err;
+                }
+            }
+        } else {
+            gf_msg_debug(this->name, 0, "status not found in dict");
+            local->op_ret = -1;
+            local->op_errno = ENOMEM; /* NOTE(review): odd errno for this */
+            goto err;
+        }
+
+        ret = dict_get_str_sizen(xdata, GF_CS_OBJECT_REMOTE, &filepath);
+        if (filepath) {
+            gf_msg_debug(this->name, 0, "filepath returned %s", filepath);
+            local->remotepath = gf_strdup(filepath);
+            if (!local->remotepath) {
+                local->op_ret = -1;
+                local->op_errno = ENOMEM;
+                goto err;
+            }
+        } else {
+            gf_msg_debug(this->name, 0, "NULL filepath");
+        }
+
+        ret = cs_update_xattrs(frame, xdata);
+        if (ret)
+            goto err; /* NOTE(review): op_ret/op_errno not set here */
+
+        local->op_ret = 0;
+        local->xattr_rsp = dict_ref(xdata);
+        memcpy(&local->stbuf, stbuf, sizeof(struct iatt));
+    }
+
+    stub = local->stub;
+    local->stub = NULL;
+    call_resume(stub);
+
+    return 0;
+err:
+    cs_inodelk_unlock(frame);
+
+    cs_common_cbk(frame);
+
+    return 0;
+}
+/* Wind fstat/stat carrying the repair key; cs_stat_check_cbk continues. */
+int
+cs_do_stat_check(call_frame_t *main_frame)
+{
+    cs_local_t *local = NULL;
+    xlator_t *this = NULL;
+    int ret = 0;
+
+    local = main_frame->local;
+    this = main_frame->this;
+
+    ret = dict_set_uint32(local->xattr_req, GF_CS_OBJECT_REPAIR, 256);
+    if (ret) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, 0, "dict_set failed");
+        goto err;
+    }
+
+    cs_set_xattr_req(main_frame);
+
+    if (local->fd) {
+        STACK_WIND(main_frame, cs_stat_check_cbk, FIRST_CHILD(this),
+                   FIRST_CHILD(this)->fops->fstat, local->fd, local->xattr_req);
+    } else {
+        STACK_WIND(main_frame, cs_stat_check_cbk, FIRST_CHILD(this),
+                   FIRST_CHILD(this)->fops->stat, &local->loc,
+                   local->xattr_req);
+    }
+
+    return 0;
+
+err:
+    cs_inodelk_unlock(main_frame);
+
+    cs_common_cbk(main_frame);
+
+    return 0;
+}
+/* Unwind a failed fop, chosen by local->fop, with the error saved in local. */
+void
+cs_common_cbk(call_frame_t *frame)
+{
+    glusterfs_fop_t fop = -1;
+    cs_local_t *local = NULL;
+
+    local = frame->local;
+
+    fop = local->fop;
+
+    /* Note: only the failure case is handled here; after a successful
+     * stat check the fop resumes and unwinds from its own callback, and
+     * each callback can unlock the inodelk if a lock was taken before.
+     * The lock status can be stored in the frame. */
+
+    /* Failure case: unwind with the op_ret/op_errno saved in local. */
+
+    /* TODO: add other fops; a fop not listed below is not unwound here. */
+    switch (fop) {
+        case GF_FOP_WRITE:
+            CS_STACK_UNWIND(writev, frame, local->op_ret, local->op_errno, NULL,
+                            NULL, NULL);
+            break;
+
+        case GF_FOP_SETXATTR:
+            CS_STACK_UNWIND(setxattr, frame, local->op_ret, local->op_errno,
+                            NULL);
+            break;
+        case GF_FOP_READ:
+            CS_STACK_UNWIND(readv, frame, local->op_ret, local->op_errno, NULL,
+                            0, NULL, NULL, NULL);
+            break;
+        case GF_FOP_FTRUNCATE:
+            CS_STACK_UNWIND(ftruncate, frame, local->op_ret, local->op_errno,
+                            NULL, NULL, NULL);
+            break;
+
+        case GF_FOP_TRUNCATE:
+            CS_STACK_UNWIND(truncate, frame, local->op_ret, local->op_errno,
+                            NULL, NULL, NULL);
+            break;
+        default:
+            break;
+    }
+
+    return;
+}
+/* inodelk callback: on success start the stat check, else unwind the fop. */
+int
+cs_blocking_inodelk_cbk(call_frame_t *lock_frame, void *cookie, xlator_t *this,
+                        int32_t op_ret, int32_t op_errno, dict_t *xdata)
+{
+    cs_local_t *main_local = NULL;
+    call_frame_t *main_frame = NULL;
+    cs_local_t *lock_local = NULL;
+
+    lock_local = lock_frame->local;
+
+    main_frame = lock_local->main_frame;
+    main_local = main_frame->local;
+
+    if (op_ret) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, 0, "inodelk failed");
+        main_local->op_errno = op_errno;
+        main_local->op_ret = op_ret;
+        goto err;
+    }
+
+    main_local->locked = _gf_true;
+
+    cs_lock_wipe(lock_frame);
+
+    cs_do_stat_check(main_frame);
+
+    return 0;
+err:
+    cs_common_cbk(main_frame); /* unwinds the original fop with the error */
+
+    cs_lock_wipe(lock_frame);
+
+    return 0;
+}
+
+/*
+ * Fill @loc so that it refers to the inode the fop in @frame works on.
+ *
+ * The inode is taken from the fd stored in the frame's local when there is
+ * one, otherwise from the loc stored there. A reference is taken on the
+ * inode and its gfid is copied into @loc.
+ *
+ * Returns 0 on success, -1 when no inode reference could be obtained.
+ */
+int
+cs_build_loc(loc_t *loc, call_frame_t *frame)
+{
+    cs_local_t *local = frame->local;
+    inode_t *inode = NULL;
+
+    /* fd based fops carry the inode on the fd, path based ones on the loc */
+    if (local->fd)
+        inode = local->fd->inode;
+    else
+        inode = local->loc.inode;
+
+    loc->inode = inode_ref(inode);
+    if (!loc->inode) {
+        /* inode_ref() gave nothing back, @loc is left without an inode */
+        return -1;
+    }
+
+    /* keep the gfid in @loc in step with the inode just referenced */
+    gf_uuid_copy(loc->gfid, loc->inode->gfid);
+
+    return 0;
+}
+
+/*
+ * Wind a blocking (F_SETLKW) write inodelk in CS_LOCK_DOMAIN on a separate
+ * lock frame; cs_blocking_inodelk_cbk continues. Returns -1 on setup failure.
+ */
+int
+cs_blocking_inodelk(call_frame_t *parent_frame)
+{
+    call_frame_t *lock_frame = NULL;
+    cs_local_t *lock_local = NULL;
+    xlator_t *this = parent_frame->this;
+    struct gf_flock flock = {0};
+    int ret = 0;
+
+    lock_frame = cs_lock_frame(parent_frame);
+    if (!lock_frame) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, 0, "insufficient memory");
+        goto err;
+    }
+
+    lock_local = cs_local_init(this, lock_frame, NULL, NULL, 0);
+    if (!lock_local) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, 0, "local init failed");
+        goto err;
+    }
+
+    lock_local->main_frame = parent_frame;
+
+    flock.l_type = F_WRLCK;
+
+    ret = cs_build_loc(&lock_local->loc, parent_frame);
+    if (ret) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, 0, "build_loc failed");
+        goto err;
+    }
+
+    STACK_WIND(lock_frame, cs_blocking_inodelk_cbk, FIRST_CHILD(this),
+               FIRST_CHILD(this)->fops->inodelk, CS_LOCK_DOMAIN,
+               &lock_local->loc, F_SETLKW, &flock, NULL);
+
+    return 0;
+err:
+    if (lock_frame)
+        cs_lock_wipe(lock_frame);
+
+    return -1;
+}
+
+/*
+ * Start the repair sequence for @frame by requesting the blocking inodelk.
+ * Any failure from cs_blocking_inodelk() is reported as -1, success as 0.
+ */
+int
+locate_and_execute(call_frame_t *frame)
+{
+    if (cs_blocking_inodelk(frame) != 0)
+        return -1;
+
+    return 0;
+}
+/* Resumed truncate stub: download the file if needed, then wind truncate. */
+int32_t
+cs_resume_truncate(call_frame_t *frame, xlator_t *this, loc_t *loc,
+                   off_t offset, dict_t *xattr_req)
+{
+    cs_local_t *local = NULL;
+    int ret = 0;
+
+    local = frame->local;
+
+    ret = cs_resume_postprocess(this, frame, loc->inode);
+    if (ret) {
+        goto unwind;
+    }
+
+    cs_inodelk_unlock(frame);
+
+    STACK_WIND(frame, cs_truncate_cbk, FIRST_CHILD(this),
+               FIRST_CHILD(this)->fops->truncate, loc, offset,
+               local->xattr_req);
+
+    return 0;
+
+unwind:
+    cs_inodelk_unlock(frame);
+
+    cs_common_cbk(frame); /* unwinds with local->op_ret/op_errno */
+
+    return 0;
+}
+/* Resumed setxattr stub: refuses remote/downloading/unknown-state files. */
+int32_t
+cs_resume_setxattr(call_frame_t *frame, xlator_t *this, loc_t *loc,
+                   dict_t *dict, int32_t flags, dict_t *xdata)
+{
+    cs_local_t *local = NULL;
+    cs_inode_ctx_t *ctx = NULL;
+    gf_cs_obj_state state = GF_CS_ERROR;
+
+    local = frame->local;
+
+    __cs_inode_ctx_get(this, loc->inode, &ctx);
+
+    state = __cs_get_file_state(loc->inode, ctx);
+
+    if (state == GF_CS_ERROR) {
+        /* file state could not be determined */
+        local->op_ret = -1;
+        local->op_errno = EINVAL;
+        gf_msg(this->name, GF_LOG_WARNING, 0, 0,
+               "file %s , could not figure file state", loc->path);
+        goto unwind;
+    }
+
+    if (state == GF_CS_REMOTE) {
+        /* file is already remote */
+        local->op_ret = -1;
+        local->op_errno = EINVAL;
+        gf_msg(this->name, GF_LOG_WARNING, 0, EINVAL,
+               "file %s is already remote", loc->path);
+        goto unwind;
+    }
+
+    if (state == GF_CS_DOWNLOADING) {
+        gf_msg(this->name, GF_LOG_WARNING, 0, 0,
+               " file is in downloading state.");
+        local->op_ret = -1;
+        local->op_errno = EINVAL;
+        goto unwind;
+    }
+
+    STACK_WIND(frame, cs_setxattr_cbk, FIRST_CHILD(this),
+               FIRST_CHILD(this)->fops->setxattr, loc, dict, flags,
+               local->xattr_req);
+
+    return 0;
+unwind:
+    cs_inodelk_unlock(frame);
+
+    cs_common_cbk(frame);
+
+    return 0;
+}
+
+/*
+ * Return the state recorded in @ctx, read under @inode's lock.
+ * A NULL @ctx yields GF_CS_ERROR.
+ */
+gf_cs_obj_state
+__cs_get_file_state(inode_t *inode, cs_inode_ctx_t *ctx)
+{
+    gf_cs_obj_state state = GF_CS_ERROR;
+
+    if (ctx) {
+        LOCK(&inode->lock);
+        state = ctx->state;
+        UNLOCK(&inode->lock);
+    }
+    return state;
+}
+
+/*
+ * Look up this xlator's context on @inode (under the inode lock) and store
+ * it in *@ctx; *@ctx is set to NULL when __inode_ctx_get() reports failure.
+ */
+void
+__cs_inode_ctx_get(xlator_t *this, inode_t *inode, cs_inode_ctx_t **ctx)
+{
+    uint64_t value = 0;
+    int lookup_failed = 0;
+
+    LOCK(&inode->lock);
+    lookup_failed = __inode_ctx_get(inode, this, &value);
+    UNLOCK(&inode->lock);
+
+    if (lookup_failed)
+        *ctx = NULL;
+    else
+        *ctx = (cs_inode_ctx_t *)(uintptr_t)value;
+}
+/* Set the state in @inode's ctx to @val, allocating the ctx on first use. */
+int
+__cs_inode_ctx_update(xlator_t *this, inode_t *inode, uint64_t val)
+{
+    cs_inode_ctx_t *ctx = NULL;
+    uint64_t ctxint = 0;
+    int ret = 0;
+
+    LOCK(&inode->lock);
+    {
+        ret = __inode_ctx_get(inode, this, &ctxint);
+        if (ret) {
+            ctx = GF_CALLOC(1, sizeof(*ctx), gf_cs_mt_cs_inode_ctx_t);
+            if (!ctx) {
+                gf_msg(this->name, GF_LOG_ERROR, 0, 0, "ctx allocation failed");
+                ret = -1;
+                goto out;
+            }
+
+            ctx->state = val;
+
+            ctxint = (uint64_t)(uintptr_t)ctx;
+
+            ret = __inode_ctx_set(inode, this, &ctxint);
+            if (ret) {
+                GF_FREE(ctx);
+                goto out;
+            }
+        } else {
+            ctx = (cs_inode_ctx_t *)(uintptr_t)ctxint;
+
+            ctx->state = val;
+        }
+    }
+
+out: /* reached with the inode lock still held, on success and failure */
+    UNLOCK(&inode->lock);
+
+    return ret;
+}
+
+/*
+ * Detach this xlator's context from @inode and free it, if one was set.
+ * Always returns 0.
+ */
+int
+cs_inode_ctx_reset(xlator_t *this, inode_t *inode)
+{
+    uint64_t value = 0;
+
+    inode_ctx_del(inode, this, &value);
+
+    if (value)
+        GF_FREE((cs_inode_ctx_t *)(uintptr_t)value);
+
+    return 0;
+}
+
+/*
+ * Make the file behind @inode local before a resumed fop is wound, calling
+ * cs_download() when the cached state is remote or downloading. Returns 0 if
+ * the fop may proceed, else -1 (with -1/EREMOTE saved in local, if present).
+ */
+int
+cs_resume_postprocess(xlator_t *this, call_frame_t *frame, inode_t *inode)
+{
+    cs_local_t *local = frame->local;
+    gf_cs_obj_state state = -1;
+    cs_inode_ctx_t *ctx = NULL;
+    int ret = 0;
+
+    if (!local)
+        return -1;
+
+    __cs_inode_ctx_get(this, inode, &ctx);
+
+    state = __cs_get_file_state(inode, ctx);
+    if (state == GF_CS_ERROR) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, 0,
+               "status is GF_CS_ERROR. Aborting fop");
+        local->op_ret = -1;
+        local->op_errno = EREMOTE;
+        return -1;
+    }
+
+    if (state == GF_CS_REMOTE || state == GF_CS_DOWNLOADING) {
+        gf_msg_debug(this->name, 0, "status is %d", state);
+        ret = cs_download(frame);
+        if (ret == 0) {
+            gf_msg_debug(this->name, 0, "download done, resuming fop");
+        } else {
+            gf_msg(this->name, GF_LOG_ERROR, 0, 0,
+                   " download failed, unwinding fop");
+            local->op_ret = -1;
+            local->op_errno = EREMOTE;
+            ret = -1;
+        }
+    }
+
+    return ret;
+}
+/* Dump hooks for cs_dumpops: the stubs from here to cs_priv only return 0. */
+int32_t
+cs_fdctx_to_dict(xlator_t *this, fd_t *fd, dict_t *dict)
+{
+    return 0;
+}
+
+int32_t
+cs_inode(xlator_t *this)
+{
+    return 0;
+}
+
+int32_t
+cs_inode_to_dict(xlator_t *this, dict_t *dict)
+{
+    return 0;
+}
+
+int32_t
+cs_history(xlator_t *this)
+{
+    return 0;
+}
+
+int32_t
+cs_fd(xlator_t *this)
+{
+    return 0;
+}
+
+int32_t
+cs_fd_to_dict(xlator_t *this, dict_t *dict)
+{
+    return 0;
+}
+
+int32_t
+cs_fdctx(xlator_t *this, fd_t *fd)
+{
+    return 0;
+}
+
+int32_t
+cs_inodectx(xlator_t *this, inode_t *ino)
+{
+    return 0;
+}
+
+int32_t
+cs_inodectx_to_dict(xlator_t *this, inode_t *ino, dict_t *dict)
+{
+    return 0;
+}
+
+int32_t
+cs_priv_to_dict(xlator_t *this, dict_t *dict, char *brickname)
+{
+    return 0;
+}
+
+int32_t
+cs_priv(xlator_t *this)
+{
+    return 0;
+}
+
+int
+cs_notify(xlator_t *this, int event, void *data, ...)
+{
+    return default_notify(this, event, data);
+}
+
+struct xlator_fops cs_fops = {
+    .stat = cs_stat,
+    .readdirp = cs_readdirp,
+    .truncate = cs_truncate,
+    .seek = cs_seek,
+    .statfs = cs_statfs,
+    .fallocate = cs_fallocate,
+    .discard = cs_discard,
+    .getxattr = cs_getxattr,
+    .writev = cs_writev,
+    .setxattr = cs_setxattr,
+    .fgetxattr = cs_fgetxattr,
+    .lookup = cs_lookup,
+    .fsetxattr = cs_fsetxattr,
+    .readv = cs_readv,
+    .ftruncate = cs_ftruncate,
+    .rchecksum = cs_rchecksum,
+    .unlink = cs_unlink,
+    .open = cs_open,
+    .fstat = cs_fstat,
+    .zerofill = cs_zerofill,
+};
+
+struct xlator_cbks cs_cbks = {
+    .forget = cs_forget,
+};
+
+struct xlator_dumpops cs_dumpops = {
+    .fdctx_to_dict = cs_fdctx_to_dict,
+    .inode = cs_inode,
+    .inode_to_dict = cs_inode_to_dict,
+    .history = cs_history,
+    .fd = cs_fd,
+    .fd_to_dict = cs_fd_to_dict,
+    .fdctx = cs_fdctx,
+    .inodectx = cs_inodectx,
+    .inodectx_to_dict = cs_inodectx_to_dict,
+    .priv_to_dict = cs_priv_to_dict,
+    .priv = cs_priv,
+};
+
+struct volume_options cs_options[] = {
+    {.key = {"cloudsync-storetype"},
+     .type = GF_OPTION_TYPE_STR,
+     .description = "Defines which remote store is enabled"},
+    {.key = {"cloudsync-remote-read"},
+     .type = GF_OPTION_TYPE_BOOL,
+     .description = "Defines a remote read fop when on"},
+    {.key = {"cloudsync-store-id"},
+     .type = GF_OPTION_TYPE_STR,
+     .description = "Defines a volume wide store id"},
+    {.key = {"cloudsync-product-id"},
+     .type = GF_OPTION_TYPE_STR,
+     .description = "Defines a volume wide product id"},
+    {.key = {NULL}},
+};
+
+xlator_api_t xlator_api = {
+    .init = cs_init,
+    .fini = cs_fini,
+    .notify = cs_notify,
+    .reconfigure = cs_reconfigure,
+    .mem_acct_init = cs_mem_acct_init,
+    .dumpops = &cs_dumpops,
+    .fops = &cs_fops,
+    .cbks = &cs_cbks,
+    .options = cs_options,
+    .identifier = "cloudsync",
+    .category = GF_TECH_PREVIEW,
+};
diff --git a/xlators/features/cloudsync/src/cloudsync.h b/xlators/features/cloudsync/src/cloudsync.h
new file mode 100644
index 00000000000..d24141978d6
--- /dev/null
+++ b/xlators/features/cloudsync/src/cloudsync.h
@@ -0,0 +1,123 @@
+/*
+ *   Copyright (c) 2018 Red Hat, Inc. <http://www.redhat.com>
+ *   This file is part of GlusterFS.
+ *
+ *   This file is licensed to you under your choice of the GNU Lesser
+ *   General Public License, version 3 or any later version (LGPLv3 or
+ *   later), or the GNU General Public License, version 2 (GPLv2), in all
+ *   cases as published by the Free Software Foundation.
+ */
+
+#ifndef __CLOUDSYNC_H__
+#define __CLOUDSYNC_H__
+
+#include <glusterfs/glusterfs.h>
+#include <glusterfs/xlator.h>
+#include <glusterfs/defaults.h>
+#include <glusterfs/syncop.h>
+#include <glusterfs/call-stub.h>
+#include "cloudsync-common.h"
+#include "cloudsync-autogen-fops.h"
+
+#define ALIGN_SIZE 4096
+#define CS_LOCK_DOMAIN "cs.protect.file.stat"
+typedef struct cs_dlstore {
+    off_t off;
+    struct iovec *vector;
+    int32_t count;
+    struct iobref *iobref;
+    uint32_t flags;
+} cs_dlstore;
+
+typedef struct cs_inode_ctx {
+    cs_loc_xattr_t locxattr;
+    gf_cs_obj_state state;
+} cs_inode_ctx_t;
+
+struct cs_plugin {
+    char *name;        /* store name */
+    char *library;     /* library to load for the given store */
+    char *description; /* description about the store */
+};
+
+cs_local_t *
+cs_local_init(xlator_t *this, call_frame_t *frame, loc_t *loc, fd_t *fd,
+              glusterfs_fop_t fop);
+
+int
+locate_and_execute(call_frame_t *frame);
+
+int32_t
+cs_resume_setxattr(call_frame_t *frame, xlator_t *this, loc_t *loc,
+                   dict_t *dict, int32_t flags, dict_t *xdata);
+
+int32_t
+cs_inodelk_unlock_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+                      int32_t op_ret, int32_t op_errno, dict_t *xdata);
+
+size_t
+cs_write_callback(void *lcurlbuf, size_t size, size_t nitems, void *frame);
+
+void
+cs_common_cbk(call_frame_t *frame);
+
+gf_boolean_t
+cs_is_file_remote(struct iatt *stbuf, dict_t *xattr);
+
+int32_t
+cs_setxattr_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+                int32_t op_ret, int32_t op_errno, dict_t *xdata);
+int
+cs_build_loc(loc_t *loc, call_frame_t *frame);
+
+int
+cs_blocking_inodelk_cbk(call_frame_t *lock_frame, void *cookie, xlator_t *this,
+                        int32_t op_ret, int32_t op_errno, dict_t *xdata);
+
+int
+cs_read_authinfo(xlator_t *this);
+
+int
+__cs_inode_ctx_update(xlator_t *this, inode_t *inode, uint64_t val);
+
+int
+cs_inode_ctx_reset(xlator_t *this, inode_t *inode);
+
+void
+__cs_inode_ctx_get(xlator_t *this, inode_t *inode, cs_inode_ctx_t **ctx);
+
+gf_cs_obj_state
+__cs_get_file_state(inode_t *inode, cs_inode_ctx_t *ctx);
+
+int
+cs_inodelk_unlock(call_frame_t *main_frame);
+
+int
+cs_resume_postprocess(xlator_t *this, call_frame_t *frame, inode_t *inode);
+
+int32_t
+cs_truncate_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+                int32_t op_ret, int32_t op_errno, struct iatt *prebuf,
+                struct iatt *postbuf, dict_t *xdata);
+int32_t
+cs_resume_truncate(call_frame_t *frame, xlator_t *this, loc_t *loc,
+                   off_t offset, dict_t *xattr_req);
+
+int32_t
+cs_readv_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret,
+             int32_t op_errno, struct iovec *vector, int32_t count,
+             struct iatt *stbuf, struct iobref *iobref, dict_t *xdata);
+int32_t
+cs_resume_readv(call_frame_t *frame, xlator_t *this, fd_t *fd, size_t size,
+                off_t offset, uint32_t flags, dict_t *xdata);
+int32_t
+cs_readv(call_frame_t *frame, xlator_t *this, fd_t *fd, size_t size,
+         off_t offset, uint32_t flags, dict_t *xdata);
+
+int
+cs_resume_remote_readv_postprocess(xlator_t *this, call_frame_t *frame,
+                                   inode_t *inode, off_t offset, size_t size,
+                                   uint32_t flags);
+int
+cs_serve_readv(call_frame_t *frame, off_t offset, size_t size, uint32_t flags);
+#endif /* __CLOUDSYNC_H__ */
diff --git a/xlators/features/compress/Makefile.am b/xlators/features/compress/Makefile.am
new file mode 100644
index 00000000000..a985f42a877
--- /dev/null
+++ b/xlators/features/compress/Makefile.am
@@ -0,0 +1,3 @@
+SUBDIRS = src
+
+CLEANFILES =
diff --git a/xlators/features/compress/src/Makefile.am b/xlators/features/compress/src/Makefile.am
new file mode 100644
index 00000000000..98271a9f3fc
--- /dev/null
+++ b/xlators/features/compress/src/Makefile.am
@@ -0,0 +1,19 @@
+xlator_LTLIBRARIES = cdc.la
+
+xlatordir = $(libdir)/glusterfs/$(PACKAGE_VERSION)/xlator/features
+
+noinst_HEADERS = cdc.h cdc-mem-types.h
+
+cdc_la_LDFLAGS = -module $(GF_XLATOR_DEFAULT_LDFLAGS)
+
+cdc_la_SOURCES = cdc.c cdc-helper.c
+cdc_la_LIBADD = $(top_builddir)/libglusterfs/src/libglusterfs.la $(ZLIB_LIBS)
+
+AM_CPPFLAGS = $(GF_CPPFLAGS) -I$(top_srcdir)/libglusterfs/src \
+	-I$(top_srcdir)/rpc/xdr/src -I$(top_builddir)/rpc/xdr/src \
+	-fPIC -D_FILE_OFFSET_BITS=64 -D_GNU_SOURCE -D$(GF_HOST_OS) \
+	$(LIBZ_CFLAGS)
+
+AM_CFLAGS = -Wall $(GF_CFLAGS)
+
+CLEANFILES =
diff --git a/xlators/features/compress/src/cdc-helper.c b/xlators/features/compress/src/cdc-helper.c
new file mode 100644
index 00000000000..f973ff56cf5
--- /dev/null
+++ b/xlators/features/compress/src/cdc-helper.c
@@ -0,0 +1,527 @@
+/*
+   Copyright (c) 2013 Red Hat, Inc. <http://www.redhat.com>
+   This file is part of GlusterFS.
+
+   This file is licensed to you under your choice of the GNU Lesser
+   General Public License, version 3 or any later version (LGPLv3 or
+   later), or the GNU General Public License, version 2 (GPLv2), in all
+   cases as published by the Free Software Foundation.
+*/
+
+#include <glusterfs/glusterfs.h>
+#include <glusterfs/logging.h>
+#include <glusterfs/syscall.h>
+
+#include "cdc.h"
+#include "cdc-mem-types.h"
+
+#ifdef HAVE_LIB_Z
+#include "zlib.h"
+#endif
+
+#ifdef HAVE_LIB_Z
+/* Layout of the fixed 10-byte gzip member header (RFC 1952):
+ *
+ * +---+---+---+---+---+---+---+---+---+---+
+ * |ID1|ID2|CM |FLG|     MTIME     |XFL|OS |
+ * +---+---+---+---+---+---+---+---+---+---+
+ *
+ * Data is usually sent without this header, i.e.
+ * data sent = <compressed-data> + trailer(8).
+ * The trailer carries the CRC32 and the length of the uncompressed
+ * data (see cdc_init_gzip_trailer).
+ *
+ * gzip_header is written only by the debugging helper
+ * cdc_dump_iovec_to_disk.
+ */
+static const char gzip_header[10] = {'\037', '\213', Z_DEFLATED,  0, 0, 0, 0,
+                                     0,      0,      GF_CDC_OS_ID};
+
+/* Claim the next slot of ci->vec by bumping ci->ncount.
+ * Returns 0, or -1 (after logging) when that would reach MAX_IOVEC; the
+ * counter is then left alone so CURR_VEC() can never index past ci->vec.
+ */
+static int32_t
+cdc_next_iovec(xlator_t *this, cdc_info_t *ci)
+{
+    /* Check before incrementing: with "increment, then test for equality"
+     * a second call after a failure slips past the limit unnoticed. */
+    if (ci->ncount + 1 >= MAX_IOVEC) {
+        gf_log(this->name, GF_LOG_ERROR,
+               "Zlib output buffer overflow"
+               " ->ncount (%d) | ->MAX_IOVEC (%d)",
+               ci->ncount + 1, MAX_IOVEC);
+        return -1;
+    }
+
+    ci->ncount++;
+    return 0;
+}
+
+/* Store the low 32 bits of x in string[0..3], least significant byte first */
+static void
+cdc_put_long(unsigned char *string, unsigned long x)
+{
+    int idx = 0;
+    for (idx = 0; idx < 4; idx++)
+        string[idx] = (unsigned char)((x >> (8 * idx)) & 0xff);
+}
+
+static unsigned long
+cdc_get_long(unsigned char *buf) /* little-endian 32-bit read of buf[0..3] */
+{
+    unsigned long lo = buf[0] | ((unsigned long)buf[1] << 8);
+    return lo | ((unsigned long)buf[2] << 16) | ((unsigned long)buf[3] << 24);
+}
+
+static int32_t
+cdc_alloc_iobuf_and_init_vec(xlator_t *this, cdc_priv_t *priv, cdc_info_t *ci,
+                             int size);
+
+/* Append the 8-byte trailer (CRC32, then input length, both little-endian)
+ * as one more iovec. It lives in an iobuf so that iobref_clear() releases
+ * it; no caller frees a GF_CALLOC'd trailer. Returns 0 on success, else -1.
+ */
+static int32_t
+cdc_init_gzip_trailer(xlator_t *this, cdc_priv_t *priv, cdc_info_t *ci)
+{
+    unsigned char *buf = NULL;
+
+    if (cdc_alloc_iobuf_and_init_vec(this, priv, ci, GF_CDC_VALIDATION_SIZE))
+        return -1;
+
+    buf = (unsigned char *)CURR_VEC(ci).iov_base;
+    if (!buf) /* never report success without a trailer buffer */
+        return -1;
+
+    /* both fields are fully overwritten, so no zeroing is needed */
+    cdc_put_long(&buf[0], ci->crc);
+    cdc_put_long(&buf[4], ci->stream.total_in);
+
+    return 0;
+}
+
+/* Claim the next output iovec and back it with a new iobuf of `size` bytes
+ * (ci->buffer_size when size is 0), registered in ci->iobref; 0 on success.
+ */
+static int32_t
+cdc_alloc_iobuf_and_init_vec(xlator_t *this, cdc_priv_t *priv, cdc_info_t *ci,
+                             int size)
+{
+    int ret = -1;
+    int alloc_len = size ? size : ci->buffer_size;
+    struct iobuf *iobuf = NULL;
+
+    ret = cdc_next_iovec(this, ci);
+    if (ret)
+        return ret;
+
+    iobuf = iobuf_get2(this->ctx->iobuf_pool, alloc_len);
+    if (!iobuf)
+        return -1; /* ret is 0 here; must not report success */
+
+    ret = iobref_add(ci->iobref, iobuf);
+    if (ret) {
+        iobuf_unref(iobuf); /* the iobref did not take it: drop our ref */
+        return ret;
+    }
+
+    /* Initialize this iovec */
+    CURR_VEC(ci).iov_base = iobuf->ptr;
+    CURR_VEC(ci).iov_len = alloc_len;
+
+    return 0;
+}
+
+static void /* aims zlib's output cursor at the current iovec */
+cdc_init_zlib_output_stream(cdc_priv_t *priv, cdc_info_t *ci, int size)
+{
+    ci->stream.next_out = (unsigned char *)CURR_VEC(ci).iov_base;
+    ci->stream.avail_out = size ? size : ci->buffer_size;
+}
+
+/* This routine is for testing and debugging only.
+ * Data written = header(10) + <compressed-data> + trailer(8) (>= 18 bytes).
+ */
+void
+cdc_dump_iovec_to_disk(xlator_t *this, cdc_info_t *ci, const char *file)
+{
+    int i = 0;
+    int fd = 0;
+    ssize_t written = 0;
+    size_t total_written = 0;
+
+    fd = open(file, O_WRONLY | O_CREAT | O_TRUNC, 0777);
+    if (fd < 0) {
+        gf_log(this->name, GF_LOG_ERROR, "Cannot open file: %s", file);
+        return;
+    }
+
+    written = sys_write(fd, (char *)gzip_header, 10);
+    for (i = 0; written >= 0 && i < ci->ncount; i++) {
+        total_written += written;
+        written = sys_write(fd, (char *)ci->vec[i].iov_base,
+                            ci->vec[i].iov_len);
+    }
+    if (written < 0) /* -1 is an error, not a byte count */
+        gf_log(this->name, GF_LOG_ERROR, "Cannot write to file: %s", file);
+
+    gf_log(this->name, GF_LOG_DEBUG, "dump'd %zu bytes to %s",
+           total_written + (written > 0 ? (size_t)written : 0), file);
+
+    sys_close(fd);
+}
+
+static int32_t
+cdc_flush_libz_buffer(cdc_priv_t *priv, xlator_t *this, cdc_info_t *ci,
+                      int (*libz_func)(z_streamp, int), int flush)
+{
+    int32_t ret = Z_OK;
+    int done = 0; /* set once libz_func left output room or hit stream end */
+    unsigned int deflate_len = 0;
+
+    for (;;) {
+        deflate_len = ci->buffer_size - ci->stream.avail_out;
+
+        if (deflate_len != 0) { /* current buffer holds output: close it */
+            CURR_VEC(ci).iov_len = deflate_len;
+
+            ret = cdc_alloc_iobuf_and_init_vec(this, priv, ci, 0);
+            if (ret) {
+                ret = Z_MEM_ERROR;
+                break;
+            }
+
+            /* Re-position Zlib output buffer */
+            cdc_init_zlib_output_stream(priv, ci, 0);
+        }
+
+        if (done) {
+            ci->ncount--; /* the trailing iovec is empty: drop it */
+            break;
+        }
+
+        ret = libz_func(&ci->stream, flush);
+
+        if (ret == Z_BUF_ERROR) { /* treated as "nothing left to flush" */
+            ret = Z_OK;
+            ci->ncount--;
+            break;
+        }
+
+        done = (ci->stream.avail_out != 0 || ret == Z_STREAM_END);
+
+        if (ret != Z_OK && ret != Z_STREAM_END)
+            break;
+    }
+
+    return ret;
+}
+
+/* Feed one input iovec to the stream set up by cdc_compress; Z_OK if ok. */
+static int32_t
+do_cdc_compress(struct iovec *vec, xlator_t *this, cdc_priv_t *priv,
+                cdc_info_t *ci)
+{
+    int ret = Z_OK;
+
+    /* setup input buffer */
+    ci->stream.next_in = (unsigned char *)vec->iov_base;
+    ci->stream.avail_in = vec->iov_len;
+
+    ci->crc = crc32(ci->crc, (const Bytef *)vec->iov_base, vec->iov_len);
+
+    gf_log(this->name, GF_LOG_DEBUG, "crc=%lu len=%d buffer_size=%d", ci->crc,
+           ci->stream.avail_in, ci->buffer_size);
+
+    /* compress !! */
+    while (ci->stream.avail_in != 0) {
+        if (ci->stream.avail_out == 0) {
+            CURR_VEC(ci).iov_len = ci->buffer_size;
+
+            ret = cdc_alloc_iobuf_and_init_vec(this, priv, ci, 0);
+            if (ret)
+                break;
+
+            /* Re-position Zlib output buffer */
+            cdc_init_zlib_output_stream(priv, ci, 0);
+        }
+
+        ret = deflate(&ci->stream, Z_NO_FLUSH);
+        if (ret != Z_OK)
+            break;
+    }
+
+    return ret;
+}
+
+/* Deflate ci->vector[0..count) into ci->vec as one raw deflate stream plus
+ * the 8-byte trailer and tag *xdata with the deflate canary. Returns 0 on
+ * success; on failure ci->iobref is released and the caller passes through.
+ */
+int32_t
+cdc_compress(xlator_t *this, cdc_priv_t *priv, cdc_info_t *ci, dict_t **xdata)
+{
+    int ret = -1;
+    int i = 0;
+
+    ci->iobref = iobref_new();
+    if (!ci->iobref)
+        goto out;
+
+    if (!*xdata) {
+        *xdata = dict_new();
+        if (!*xdata) {
+            gf_log(this->name, GF_LOG_ERROR, "Cannot allocate xdata dict");
+            goto out;
+        }
+    }
+
+    /* One stream and one output cursor per request: initializing them per
+     * input iovec discards pending output and leaks the old zlib state. */
+    ret = deflateInit2(&ci->stream, priv->cdc_level, Z_DEFLATED,
+                       priv->window_size, priv->mem_level, Z_DEFAULT_STRATEGY);
+    if (ret) {
+        gf_log(this->name, GF_LOG_ERROR, "unable to init Zlib (retval: %d)",
+               ret);
+        goto out;
+    }
+
+    ret = cdc_alloc_iobuf_and_init_vec(this, priv, ci, 0);
+    if (ret)
+        goto deflate_cleanup_out;
+
+    /* setup output buffer */
+    cdc_init_zlib_output_stream(priv, ci, 0);
+
+    for (i = 0; i < ci->count; i++) {
+        ret = do_cdc_compress(&ci->vector[i], this, priv, ci);
+        if (ret != Z_OK)
+            goto deflate_cleanup_out;
+    }
+
+    /* flush zlib buffer */
+    ret = cdc_flush_libz_buffer(priv, this, ci, deflate, Z_FINISH);
+    if (!(ret == Z_OK || ret == Z_STREAM_END)) {
+        gf_log(this->name, GF_LOG_ERROR, "Compression Error: ret (%d)", ret);
+        ret = -1;
+        goto deflate_cleanup_out;
+    }
+
+    /* trailer */
+    ret = cdc_init_gzip_trailer(this, priv, ci);
+    if (ret)
+        goto deflate_cleanup_out;
+
+    gf_log(this->name, GF_LOG_DEBUG, "Compressed %ld to %ld bytes",
+           ci->stream.total_in, ci->stream.total_out);
+    ci->nbytes = ci->stream.total_out + GF_CDC_VALIDATION_SIZE;
+
+    /* set deflated canary value for identification */
+    ret = dict_set_int32(*xdata, GF_CDC_DEFLATE_CANARY_VAL, 1);
+    if (ret) {
+        /* the caller sends the uncompressed data when this fails */
+        gf_log(this->name, GF_LOG_ERROR,
+               "Data deflated, but could not set canary"
+               " value in dict for identification");
+    }
+
+    /* This is to be used in testing */
+    if (priv->debug)
+        cdc_dump_iovec_to_disk(this, ci, GF_CDC_DEBUG_DUMP_FILE);
+
+deflate_cleanup_out:
+    (void)deflateEnd(&ci->stream);
+
+out:
+    if (ret && ci->iobref) { /* failed: nobody else releases the buffers */
+        iobref_clear(ci->iobref);
+        ci->iobref = NULL;
+    }
+    return ret;
+}
+
+/* The presence of the canary key in xdata marks the payload as deflated:
+ * returns -1 when the key is found and 0 when it is not.
+ */
+static int32_t
+cdc_check_content_for_deflate(dict_t *xdata)
+{
+    return -(dict_get(xdata, GF_CDC_DEFLATE_CANARY_VAL) != NULL);
+}
+
+static unsigned long
+cdc_extract_crc(char *trailer) /* the CRC is stored in trailer bytes 0..3 */
+{
+    return cdc_get_long((unsigned char *)&trailer[0]);
+}
+
+static unsigned long
+cdc_extract_size(char *trailer) /* the length is stored in bytes 4..7 */
+{
+    return cdc_get_long((unsigned char *)&trailer[4]);
+}
+
+/* 0 when crc and len match ci->crc and zlib's total_out, 1 otherwise */
+static int32_t
+cdc_validate_inflate(cdc_info_t *ci, unsigned long crc, unsigned long len)
+{
+    if (crc != ci->crc || len != ci->stream.total_out)
+        return 1;
+    return 0;
+}
+
+/* Inflate ci->vector[0] (raw deflate data followed by the 8-byte trailer)
+ * into ci->vec and verify CRC and length against the trailer; 0 on success.
+ */
+static int32_t
+do_cdc_decompress(xlator_t *this, cdc_priv_t *priv, cdc_info_t *ci)
+{
+    int ret = -1;
+    int i = 0;
+    char *trailer = NULL;
+    struct iovec vec = THIS_VEC(ci, 0);
+    unsigned long computed_crc = 0;
+    unsigned long computed_len = 0;
+
+    /* the payload is not trusted: a short iovec would make the trailer
+     * pointer and the input length below wrap around */
+    if (vec.iov_len < GF_CDC_VALIDATION_SIZE) {
+        gf_log(this->name, GF_LOG_ERROR, "Deflated data shorter than trailer");
+        goto out;
+    }
+
+    ret = inflateInit2(&ci->stream, priv->window_size);
+    if (ret) {
+        gf_log(this->name, GF_LOG_ERROR, "Zlib: Unable to initialize inflate");
+        goto out;
+    }
+
+    trailer = (char *)(((char *)vec.iov_base) + vec.iov_len -
+                       GF_CDC_VALIDATION_SIZE);
+
+    /* CRC and size of the uncompressed data */
+    computed_crc = cdc_extract_crc(trailer);
+    computed_len = cdc_extract_size(trailer);
+
+    gf_log(this->name, GF_LOG_DEBUG, "crc=%lu len=%lu buffer_size=%d",
+           computed_crc, computed_len, ci->buffer_size);
+
+    /* allocate the first output buffer (ci->buffer_size bytes) */
+    ret = cdc_alloc_iobuf_and_init_vec(this, priv, ci, 0);
+    if (ret)
+        goto out;
+
+    /* setup output buffer */
+    cdc_init_zlib_output_stream(priv, ci, 0);
+
+    /* setup input buffer: everything in front of the trailer */
+    ci->stream.next_in = (unsigned char *)vec.iov_base;
+    ci->stream.avail_in = vec.iov_len - GF_CDC_VALIDATION_SIZE;
+
+    while (ci->stream.avail_in != 0) {
+        if (ci->stream.avail_out == 0) {
+            CURR_VEC(ci).iov_len = ci->buffer_size;
+
+            ret = cdc_alloc_iobuf_and_init_vec(this, priv, ci, 0);
+            if (ret)
+                goto out; /* out of output slots: do not try to flush */
+
+            /* Re-position Zlib output buffer */
+            cdc_init_zlib_output_stream(priv, ci, 0);
+        }
+
+        /* Stop on anything but Z_OK: after an error or the end of the
+         * stream inflate() consumes no more input, so waiting for
+         * avail_in to reach 0 would spin forever on corrupt data. */
+        ret = inflate(&ci->stream, Z_NO_FLUSH);
+        if (ret != Z_OK)
+            break;
+    }
+
+    /* flush zlib buffer; this also reports an error the loop stopped on */
+    ret = cdc_flush_libz_buffer(priv, this, ci, inflate, Z_SYNC_FLUSH);
+    if (!(ret == Z_OK || ret == Z_STREAM_END)) {
+        gf_log(this->name, GF_LOG_ERROR, "Decompression Error: ret (%d)", ret);
+        ret = -1;
+        goto out;
+    }
+
+    /* compute CRC of the uncompressed data to check for correctness */
+    for (i = 0; i < ci->ncount; i++) {
+        ci->crc = crc32(ci->crc, (const Bytef *)ci->vec[i].iov_base,
+                        ci->vec[i].iov_len);
+    }
+
+    /* validate inflated data */
+    ret = cdc_validate_inflate(ci, computed_crc, computed_len);
+    if (ret) {
+        gf_log(this->name, GF_LOG_ERROR,
+               "Checksum or length mismatched in inflated data");
+    }
+
+out:
+    return ret;
+}
+
+/* Inflate the payload described by ci when xdata carries the deflate canary.
+ * Returns 0 only when ci->vec/ci->nbytes hold inflated data; a non-zero
+ * return covers both "content not deflated" and a failed inflate.
+ * NOTE(review): callers pass the data through on any non-zero return, so
+ * a failed inflate forwards the still-compressed bytes -- confirm intent.
+ */
+int32_t
+cdc_decompress(xlator_t *this, cdc_priv_t *priv, cdc_info_t *ci, dict_t *xdata)
+{
+    int32_t ret = -1;
+
+    /* check for deflate content */
+    if (!cdc_check_content_for_deflate(xdata)) {
+        gf_log(this->name, GF_LOG_DEBUG,
+               "Content not deflated, passing through ...");
+        goto passthrough_out;
+    }
+
+    ci->iobref = iobref_new();
+    if (!ci->iobref)
+        goto passthrough_out;
+
+    /* Only a single input iovec is supported. The server/client protocol
+     * seems to deal with a single iovec even if op_ret > 1M, which spares
+     * us from locating a trailer that could be split across two adjacent
+     * iovecs. If this translator is loaded above quick-read, multiple
+     * iovecs are entirely possible; that case is rejected below and is
+     * not tested.
+     */
+
+    /* @@ I_HOPE_THIS_IS_NEVER_HIT */
+    if (ci->count > 1) {
+        gf_log(this->name, GF_LOG_WARNING,
+               "unable to handle multiple iovecs (%d in number)", ci->count);
+        /* TODO: collate all iovecs in one */
+        goto inflate_cleanup_out;
+    }
+
+    ret = do_cdc_decompress(this, priv, ci);
+    if (ret)
+        goto inflate_cleanup_out;
+
+    ci->nbytes = ci->stream.total_out;
+
+    gf_log(this->name, GF_LOG_DEBUG, "Inflated %ld to %ld bytes",
+           ci->stream.total_in, ci->stream.total_out);
+
+inflate_cleanup_out:
+    (void)inflateEnd(&ci->stream);
+
+    if (ret) {
+        /* no caller releases ci->iobref after a failure: do it here */
+        iobref_clear(ci->iobref);
+        ci->iobref = NULL;
+    }
+
+passthrough_out:
+    return ret;
+}
+
+#endif
diff --git a/xlators/features/compress/src/cdc-mem-types.h b/xlators/features/compress/src/cdc-mem-types.h
new file mode 100644
index 00000000000..928afdd2efe
--- /dev/null
+++ b/xlators/features/compress/src/cdc-mem-types.h
@@ -0,0 +1,23 @@
+/*
+   Copyright (c) 2013 Red Hat, Inc. <http://www.redhat.com>
+   This file is part of GlusterFS.
+
+   This file is licensed to you under your choice of the GNU Lesser
+   General Public License, version 3 or any later version (LGPLv3 or
+   later), or the GNU General Public License, version 2 (GPLv2), in all
+   cases as published by the Free Software Foundation.
+*/
+
+#ifndef __CDC_MEM_TYPES_H
+#define __CDC_MEM_TYPES_H
+
+#include <glusterfs/mem-types.h>
+
+enum gf_cdc_mem_types {
+    gf_cdc_mt_priv_t = gf_common_mt_end + 1, /* cdc_priv_t, see init() */
+    gf_cdc_mt_vec_t = gf_common_mt_end + 2,
+    gf_cdc_mt_gzip_trailer_t = gf_common_mt_end + 3,
+    gf_cdc_mt_end = gf_common_mt_end + 4, /* passed to xlator_mem_acct_init */
+};
+
+#endif
diff --git a/xlators/features/compress/src/cdc.c b/xlators/features/compress/src/cdc.c
new file mode 100644
index 00000000000..b0b51e914ed
--- /dev/null
+++ b/xlators/features/compress/src/cdc.c
@@ -0,0 +1,348 @@
+/*
+   Copyright (c) 2013 Red Hat, Inc. <http://www.redhat.com>
+   This file is part of GlusterFS.
+
+   This file is licensed to you under your choice of the GNU Lesser
+   General Public License, version 3 or any later version (LGPLv3 or
+   later), or the GNU General Public License, version 2 (GPLv2), in all
+   cases as published by the Free Software Foundation.
+*/
+
+#include <sys/uio.h>
+
+#include <glusterfs/xlator.h>
+#include <glusterfs/defaults.h>
+#include <glusterfs/logging.h>
+
+#include "cdc.h"
+#include "cdc-mem-types.h"
+
+static void /* hands ci->iobref to iobref_clear(); it must not be NULL */
+cdc_cleanup_iobref(cdc_info_t *ci)
+{
+    assert(ci->iobref != NULL);
+    iobref_clear(ci->iobref);
+}
+
+int32_t
+cdc_readv_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret,
+              int32_t op_errno, struct iovec *vector, int32_t count,
+              struct iatt *stbuf, struct iobref *iobref, dict_t *xdata)
+{
+    int ret = -1; /* stays non-zero unless (de)compression succeeds */
+    cdc_priv_t *priv = NULL;
+    cdc_info_t ci = {
+        0,
+    };
+
+    GF_VALIDATE_OR_GOTO("cdc", this, default_out);
+    GF_VALIDATE_OR_GOTO(this->name, frame, default_out);
+
+    priv = this->private;
+
+    if (op_ret <= 0) /* failed or empty read: nothing to transform */
+        goto default_out;
+
+    if ((priv->min_file_size != 0) && (op_ret < priv->min_file_size))
+        goto default_out; /* below "min-size": unwind unchanged */
+
+    ci.count = count;
+    ci.ibytes = op_ret;
+    ci.vector = vector;
+    ci.buf = NULL;
+    ci.iobref = NULL;
+    ci.ncount = 0;
+    ci.crc = 0;
+    ci.buffer_size = GF_CDC_DEF_BUFFERSIZE;
+
+    /* readv: compress on the server, decompress on the client; a non-zero
+     * ret falls back to unwinding the original vector unchanged */
+    if (priv->op_mode == GF_CDC_MODE_SERVER) {
+        ret = cdc_compress(this, priv, &ci, &xdata);
+    } else if (priv->op_mode == GF_CDC_MODE_CLIENT) {
+        ret = cdc_decompress(this, priv, &ci, xdata);
+    } else {
+        gf_log(this->name, GF_LOG_ERROR, "Invalid operation mode (%d)",
+               priv->op_mode);
+    }
+
+    if (ret)
+        goto default_out;
+
+    STACK_UNWIND_STRICT(readv, frame, ci.nbytes, op_errno, ci.vec, ci.ncount,
+                        stbuf, iobref, xdata);
+    cdc_cleanup_iobref(&ci); /* NOTE(review): ci.vec went up w/o ci.iobref */
+    return 0;
+
+default_out:
+    STACK_UNWIND_STRICT(readv, frame, op_ret, op_errno, vector, count, stbuf,
+                        iobref, xdata);
+    return 0;
+}
+
+int32_t
+cdc_readv(call_frame_t *frame, xlator_t *this, fd_t *fd, size_t size,
+          off_t offset, uint32_t flags, dict_t *xdata)
+{
+    fop_readv_cbk_t cbk = NULL;
+
+#ifdef HAVE_LIB_Z
+    cbk = cdc_readv_cbk; /* (de)compress the reply on its way up */
+#else
+    cbk = default_readv_cbk; /* built without zlib: plain pass-through */
+#endif
+    STACK_WIND(frame, cbk, FIRST_CHILD(this), FIRST_CHILD(this)->fops->readv,
+               fd, size, offset, flags, xdata);
+    return 0;
+}
+
+int32_t /* unwinds the writev reply to the parent without modifying it */
+cdc_writev_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+               int32_t op_ret, int32_t op_errno, struct iatt *prebuf,
+               struct iatt *postbuf, dict_t *xdata)
+{
+    STACK_UNWIND_STRICT(writev, frame, op_ret, op_errno, prebuf, postbuf,
+                        xdata);
+    return 0;
+}
+
+int32_t
+cdc_writev(call_frame_t *frame, xlator_t *this, fd_t *fd, struct iovec *vector,
+           int32_t count, off_t offset, uint32_t flags, struct iobref *iobref,
+           dict_t *xdata)
+{
+    int ret = -1; /* stays non-zero unless (de)compression succeeds */
+    cdc_priv_t *priv = NULL;
+    cdc_info_t ci = {
+        0,
+    };
+    size_t isize = 0;
+
+    GF_VALIDATE_OR_GOTO("cdc", this, err);
+    GF_VALIDATE_OR_GOTO(this->name, frame, err);
+
+    priv = this->private;
+
+    isize = iov_length(vector, count);
+
+    if (isize <= 0) /* isize is a size_t: only an empty write matches */
+        goto default_out;
+
+    if ((priv->min_file_size != 0) && (isize < priv->min_file_size))
+        goto default_out; /* below "min-size": wind unchanged */
+
+    ci.count = count;
+    ci.ibytes = isize;
+    ci.vector = vector;
+    ci.buf = NULL;
+    ci.iobref = NULL;
+    ci.ncount = 0;
+    ci.crc = 0;
+    ci.buffer_size = GF_CDC_DEF_BUFFERSIZE;
+
+    /* writev: compress on the client, decompress on the server; a
+     * non-zero ret falls back to winding the original vector unchanged
+     */
+    if (priv->op_mode == GF_CDC_MODE_CLIENT) {
+        ret = cdc_compress(this, priv, &ci, &xdata);
+    } else if (priv->op_mode == GF_CDC_MODE_SERVER) {
+        ret = cdc_decompress(this, priv, &ci, xdata);
+    } else {
+        gf_log(this->name, GF_LOG_ERROR, "Invalid operation mode (%d) ",
+               priv->op_mode);
+    }
+
+    if (ret)
+        goto default_out;
+
+    STACK_WIND(frame, cdc_writev_cbk, FIRST_CHILD(this),
+               FIRST_CHILD(this)->fops->writev, fd, ci.vec, ci.ncount, offset,
+               flags, iobref, xdata);
+
+    cdc_cleanup_iobref(&ci); /* NOTE(review): ci.vec was wound w/o ci.iobref */
+    return 0;
+
+default_out:
+    STACK_WIND(frame, cdc_writev_cbk, FIRST_CHILD(this),
+               FIRST_CHILD(this)->fops->writev, fd, vector, count, offset,
+               flags, iobref, xdata);
+    return 0;
+err:
+    STACK_UNWIND_STRICT(writev, frame, -1, EINVAL, NULL, NULL, NULL);
+    return 0;
+}
+
+/* Register this xlator's memory-accounting types (gf_cdc_mt_end of them).
+ * Returns the result of xlator_mem_acct_init(), or -1 when `this` is NULL.
+ */
+int32_t
+mem_acct_init(xlator_t *this)
+{
+    int ret = -1;
+
+    if (!this)
+        return ret;
+
+    ret = xlator_mem_acct_init(this, gf_cdc_mt_end);
+    if (ret != 0) {
+        /* keep the space: split literals would read "initfailed" */
+        gf_log(this->name, GF_LOG_ERROR, "Memory accounting init failed");
+    }
+
+    return ret;
+}
+
+/* Allocate the private state, read the volume options (out-of-range values
+ * fall back to defaults) and record the mode; 0 on success, -1 on failure.
+ */
+int32_t
+init(xlator_t *this)
+{
+    int ret = -1;
+    char *temp_str = NULL;
+    cdc_priv_t *priv = NULL;
+
+    GF_VALIDATE_OR_GOTO("cdc", this, err);
+
+    if (!this->children || this->children->next) {
+        gf_log(this->name, GF_LOG_ERROR, "Need subvolume == 1");
+        goto err;
+    }
+
+    if (!this->parents)
+        gf_log(this->name, GF_LOG_WARNING, "Dangling volume. Check volfile");
+
+    priv = GF_CALLOC(1, sizeof(*priv), gf_cdc_mt_priv_t);
+    if (!priv)
+        goto err;
+
+    /* Check if debug mode is turned on */
+    GF_OPTION_INIT("debug", priv->debug, bool, err);
+    if (priv->debug)
+        gf_log(this->name, GF_LOG_DEBUG, "CDC debug option turned on");
+
+    /* Set Gzip Window Size */
+    GF_OPTION_INIT("window-size", priv->window_size, int32, err);
+    if ((priv->window_size > GF_CDC_MAX_WINDOWSIZE) ||
+        (priv->window_size < GF_CDC_DEF_WINDOWSIZE)) {
+        gf_log(this->name, GF_LOG_WARNING,
+               "Invalid gzip window size (%d), using default",
+               priv->window_size);
+        priv->window_size = GF_CDC_DEF_WINDOWSIZE;
+    }
+
+    /* Set Gzip (De)Compression Level; 0 ("no compression") is valid too */
+    GF_OPTION_INIT("compression-level", priv->cdc_level, int32, err);
+    if (((priv->cdc_level < 0) || (priv->cdc_level > 9)) &&
+        (priv->cdc_level != GF_CDC_DEF_COMPRESSION)) {
+        gf_log(this->name, GF_LOG_WARNING,
+               "Invalid gzip (de)compression level (%d),"
+               " using default",
+               priv->cdc_level);
+        priv->cdc_level = GF_CDC_DEF_COMPRESSION;
+    }
+
+    /* Set Gzip Memory Level */
+    GF_OPTION_INIT("mem-level", priv->mem_level, int32, err);
+    if ((priv->mem_level < 1) || (priv->mem_level > 9)) {
+        gf_log(this->name, GF_LOG_WARNING,
+               "Invalid gzip memory level, using the default");
+        priv->mem_level = GF_CDC_DEF_MEMLEVEL;
+    }
+
+    /* Set min file size to enable compression */
+    GF_OPTION_INIT("min-size", priv->min_file_size, int32, err);
+
+    /* Mode of operation - Server/Client */
+    ret = dict_get_str(this->options, "mode", &temp_str);
+    if (ret) {
+        gf_log(this->name, GF_LOG_CRITICAL, "Operation mode not specified !!");
+        goto err;
+    }
+
+    if (GF_CDC_MODE_IS_CLIENT(temp_str)) {
+        priv->op_mode = GF_CDC_MODE_CLIENT;
+    } else if (GF_CDC_MODE_IS_SERVER(temp_str)) {
+        priv->op_mode = GF_CDC_MODE_SERVER;
+    } else {
+        gf_log(this->name, GF_LOG_CRITICAL,
+               "Bogus operation mode (%s) specified", temp_str);
+        goto err;
+    }
+
+    this->private = priv;
+    gf_log(this->name, GF_LOG_DEBUG, "CDC xlator loaded in (%s) mode",
+           temp_str);
+    return 0;
+
+err:
+    if (priv)
+        GF_FREE(priv);
+
+    return -1;
+}
+
+/* Free the private state attached by init() and clear the pointer to it. */
+void
+fini(xlator_t *this)
+{
+    if (!this->private)
+        return;
+
+    GF_FREE(this->private);
+    this->private = NULL;
+}
+
+struct xlator_fops fops = { /* only readv and writev are set here */
+    .readv = cdc_readv,
+    .writev = cdc_writev,
+};
+
+struct xlator_cbks cbks = {}; /* no callbacks are set */
+
+struct volume_options options[] = {
+    {.key = {"window-size"}, /* init() accepts -15 .. -8, else the default */
+     .default_value = "-15",
+     .type = GF_OPTION_TYPE_INT,
+     .description = "Size of the zlib history buffer."},
+    {.key = {"mem-level"}, /* init() accepts 1 .. 9, else the default */
+     .default_value = "8",
+     .type = GF_OPTION_TYPE_INT,
+     .description = "Memory allocated for internal compression state. "
+                    "1 uses minimum memory but is slow and reduces "
+                    "compression ratio; memLevel=9 uses maximum memory "
+                    "for optimal speed. The default value is 8."},
+    {.key = {"compression-level"},
+     .default_value = "-1",
+     .type = GF_OPTION_TYPE_INT,
+     .description = "Compression levels \n"
+                    "0 : no compression, 1 : best speed, \n"
+                    "9 : best compression, -1 : default compression "},
+    {.key = {"min-size"}, /* payloads smaller than this are passed through */
+     .default_value = "0",
+     .type = GF_OPTION_TYPE_INT,
+     .description = "Data is compressed only when its size exceeds this."},
+    {.key = {"mode"}, /* no default: init() fails when it is missing */
+     .value = {"server", "client"},
+     .type = GF_OPTION_TYPE_STR,
+     .description = "Set on the basis of where the xlator is loaded. "
+                    "This option should NOT be configured by user."},
+    {.key = {"debug"}, /* dump target is GF_CDC_DEBUG_DUMP_FILE */
+     .default_value = "false",
+     .type = GF_OPTION_TYPE_BOOL,
+     .description = "This is used in testing. Will dump compressed data "
+                    "to disk as a gzip file."},
+    {.key = {NULL}},
+};
+
+xlator_api_t xlator_api = { /* ties init/fini, fops, cbks and options together */
+    .init = init,
+    .fini = fini,
+    .mem_acct_init = mem_acct_init,
+    .op_version = {GD_OP_VERSION_3_9_0},
+    .fops = &fops,
+    .cbks = &cbks,
+    .options = options,
+    .identifier = "cdc",
+    .category = GF_TECH_PREVIEW,
+};
diff --git a/xlators/features/compress/src/cdc.h b/xlators/features/compress/src/cdc.h
new file mode 100644
index 00000000000..cb87b06a989
--- /dev/null
+++ b/xlators/features/compress/src/cdc.h
@@ -0,0 +1,99 @@
+/*
+   Copyright (c) 2013 Red Hat, Inc. <http://www.redhat.com>
+   This file is part of GlusterFS.
+
+   This file is licensed to you under your choice of the GNU Lesser
+   General Public License, version 3 or any later version (LGPLv3 or
+   later), or the GNU General Public License, version 2 (GPLv2), in all
+   cases as published by the Free Software Foundation.
+*/
+
+#ifndef __CDC_H
+#define __CDC_H
+
+#ifdef HAVE_LIB_Z
+#include "zlib.h"
+#endif
+
+#include <glusterfs/xlator.h>
+
+#ifndef MAX_IOVEC
+#define MAX_IOVEC 16
+#endif
+
+typedef struct cdc_priv {
+    int window_size;   /* "window-size": windowBits for deflate/inflateInit2 */
+    int mem_level;     /* "mem-level": memLevel for deflateInit2 */
+    int cdc_level;     /* "compression-level": level for deflateInit2 */
+    int min_file_size; /* "min-size": smaller payloads are passed through */
+    int op_mode;       /* GF_CDC_MODE_CLIENT or GF_CDC_MODE_SERVER */
+    gf_boolean_t debug; /* "debug": dump compressed data to disk */
+    gf_lock_t lock;    /* not referenced in cdc.c or cdc-helper.c */
+} cdc_priv_t;
+
+typedef struct cdc_info {
+    /* input bits */
+    int count;            /* number of entries in vector */
+    int32_t ibytes;       /* input size in bytes */
+    struct iovec *vector; /* input iovecs as handed to the fop */
+    struct iatt *buf;     /* set to NULL by both fops in cdc.c */
+
+    /* output bits */
+    int ncount;      /* entries of vec claimed so far */
+    int nbytes;      /* output size in bytes */
+    int buffer_size; /* size of each output buffer */
+    struct iovec vec[MAX_IOVEC];
+    struct iobref *iobref; /* tracks the iobufs backing vec */
+
+    /* zlib bits */
+#ifdef HAVE_LIB_Z
+    z_stream stream;
+#endif
+    unsigned long crc; /* running crc32 of the uncompressed data */
+} cdc_info_t;
+
+#define NVEC(ci) ((ci)->ncount - 1) /* index of the current output iovec */
+#define CURR_VEC(ci) ((ci)->vec[(ci)->ncount - 1]) /* current output iovec */
+#define THIS_VEC(ci, i) ((ci)->vector[(i)]) /* i-th input iovec */
+
+/* Gzip defaults */
+#define GF_CDC_DEF_WINDOWSIZE -15 /* default value */
+#define GF_CDC_MAX_WINDOWSIZE -8  /* max value     */
+
+#ifdef HAVE_LIB_Z
+#define GF_CDC_DEF_COMPRESSION Z_DEFAULT_COMPRESSION
+#else
+#define GF_CDC_DEF_COMPRESSION -1
+#endif
+
+#define GF_CDC_DEF_MEMLEVEL 8
+#define GF_CDC_DEF_BUFFERSIZE 262144  // 256K - default compression buffer size
+
+/* Operation mode
+ * If xlator is loaded on client, readv decompresses and writev compresses
+ * If xlator is loaded on server, readv compresses and writev decompresses
+ */
+#define GF_CDC_MODE_CLIENT 0
+#define GF_CDC_MODE_SERVER 1
+
+/* minimum size of data for compression to kick in;
+ * 0 == compress even a single byte
+ */
+#define GF_CDC_MIN_CHUNK_SIZE 0
+
+#define GF_CDC_VALIDATION_SIZE 8
+
+#define GF_CDC_OS_ID 0xFF
+#define GF_CDC_DEFLATE_CANARY_VAL "deflate"
+#define GF_CDC_DEBUG_DUMP_FILE "/tmp/cdcdump.gz"
+
+#define GF_CDC_MODE_IS_CLIENT(m) (strcmp(m, "client") == 0)
+
+#define GF_CDC_MODE_IS_SERVER(m) (strcmp(m, "server") == 0)
+
+int32_t
+cdc_compress(xlator_t *this, cdc_priv_t *priv, cdc_info_t *ci, dict_t **xdata);
+int32_t
+cdc_decompress(xlator_t *this, cdc_priv_t *priv, cdc_info_t *ci, dict_t *xdata);
+
+#endif
diff --git a/xlators/features/filter/src/Makefile.am b/xlators/features/filter/src/Makefile.am
deleted file mode 100644
index fa0b92214a9..00000000000
--- a/xlators/features/filter/src/Makefile.am
+++ /dev/null
@@ -1,13 +0,0 @@
-xlator_LTLIBRARIES = filter.la
-xlatordir = $(libdir)/glusterfs/$(PACKAGE_VERSION)/xlator/features
-
-filter_la_LDFLAGS = -module -avoidversion
-
-filter_la_SOURCES = filter.c
-filter_la_LIBADD = $(top_builddir)/libglusterfs/src/libglusterfs.la 
-
-AM_CFLAGS = -fPIC -D_FILE_OFFSET_BITS=64 -D_GNU_SOURCE -Wall -D$(GF_HOST_OS) \
-	-I$(top_srcdir)/libglusterfs/src -shared -nostartfiles $(GF_CFLAGS)
-
-CLEANFILES = 
-
diff --git a/xlators/features/filter/src/filter.c b/xlators/features/filter/src/filter.c
deleted file mode 100644
index ccffa8a779b..00000000000
--- a/xlators/features/filter/src/filter.c
+++ /dev/null
@@ -1,1768 +0,0 @@
-/*
-  Copyright (c) 2008-2009 Z RESEARCH, Inc. <http://www.zresearch.com>
-  This file is part of GlusterFS.
-
-  GlusterFS is free software; you can redistribute it and/or modify
-  it under the terms of the GNU General Public License as published
-  by the Free Software Foundation; either version 3 of the License,
-  or (at your option) any later version.
-
-  GlusterFS is distributed in the hope that it will be useful, but
-  WITHOUT ANY WARRANTY; without even the implied warranty of
-  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
-  General Public License for more details.
-
-  You should have received a copy of the GNU General Public License
-  along with this program.  If not, see
-  <http://www.gnu.org/licenses/>.
-*/
-
-#ifndef _CONFIG_H
-#define _CONFIG_H
-#include "config.h"
-#endif
-
-#include <sys/types.h>
-#include <sys/stat.h>
-#include <unistd.h>
-
-#include "glusterfs.h"
-#include "logging.h"
-#include "dict.h"
-#include "xlator.h"
-
-#define GF_FILTER_NOBODY_UID         65534
-#define GF_FILTER_NOBODY_GID         65534
-#define GF_FILTER_ROOT_UID           0
-#define GF_FILTER_ROOT_GID           0
-
-#define GF_MAXIMUM_FILTERING_ALLOWED 32
-
-/*
-  option root-filtering on (off by default)
-  option translate-uid <uid-range=newuid,uid=newuid>
-  option translate-gid <gid-range=newgid,gid=newgid>
-  option read-only <yes|true>
-  option fixed-uid <uid>
-  option fixed-gid <gid>
-  option filter-uid <uid-range,uid>
-  option filter-gid <gid-range,gid> // not supported yet
-
-*/
-
-struct gf_filter {
-	/* Flags */
-	gf_boolean_t complete_read_only;
-	char fixed_uid_set;
-	char fixed_gid_set;
-	char partial_filter;
-
-	/* Options */
-	/* Mapping/Filtering/Translate whatever you want to call */
-	int translate_num_uid_entries;
-	int translate_num_gid_entries;
-	int translate_input_uid[GF_MAXIMUM_FILTERING_ALLOWED][2];
-	int translate_output_uid[GF_MAXIMUM_FILTERING_ALLOWED];
-	int translate_input_gid[GF_MAXIMUM_FILTERING_ALLOWED][2];
-	int translate_output_gid[GF_MAXIMUM_FILTERING_ALLOWED];
-
-	/* Fixed uid/gid */
-	int fixed_uid;
-	int fixed_gid;
-
-	/* Filter */
-	int filter_num_uid_entries;
-	int filter_num_gid_entries;
-	int filter_input_uid[GF_MAXIMUM_FILTERING_ALLOWED][2];
-	int filter_input_gid[GF_MAXIMUM_FILTERING_ALLOWED][2];
-	
-};
-
-/* update_frame: The main logic of the whole translator.
-   Return values:
-   0: no change
-   // TRANSLATE
-   1: only uid changed 
-   2: only gid changed
-   3: both uid/gid changed
-   // FILTER
-   4: uid in filter range
-   5: gid in filter range  // not supported yet
-   6: complete fs is readonly
-*/
-
-#define GF_FILTER_NO_CHANGE    0
-#define	GF_FILTER_MAP_UID      1
-#define GF_FILTER_MAP_GID      2
-#define GF_FILTER_MAP_BOTH     3
-#define GF_FILTER_FILTER_UID   4
-#define GF_FILTER_FILTER_GID   5
-#define GF_FILTER_RO_FS        6
-
-static int32_t
-update_frame (call_frame_t *frame,
-	      inode_t *inode,
-	      struct gf_filter *filter)
-{
-	uid_t    uid = 0;
-	int32_t  idx = 0;
-	int32_t  ret = 0;
-	int32_t  dictret = 0;
-	uint64_t tmp_uid = 0;
-	
-	for (idx = 0; idx < filter->translate_num_uid_entries; idx++) {
-		if ((frame->root->uid >=filter->translate_input_uid[idx][0]) &&
-		    (frame->root->uid <=filter->translate_input_uid[idx][1])) {
-			dictret = inode_ctx_get (inode, frame->this, &tmp_uid);
-			uid = (uid_t)tmp_uid;
-			if (dictret == 0) {
-				if (frame->root->uid != uid)
-					ret = GF_FILTER_MAP_UID;
-			} else {
-				ret = GF_FILTER_MAP_UID;
-			}
-			break;
-		}
-	}
-	
-	for (idx = 0; idx < filter->translate_num_gid_entries; idx++) {
-		if ((frame->root->gid >=filter->translate_input_gid[idx][0]) &&
-		    (frame->root->gid <=filter->translate_input_gid[idx][1])) {
-			if (ret == GF_FILTER_NO_CHANGE) 
-				ret = GF_FILTER_MAP_GID;
-			else 
-				ret = GF_FILTER_MAP_BOTH;
-			break;
-		}
-	}
-
-
-	if (filter->complete_read_only)
-		return GF_FILTER_RO_FS;
-	
-	if (filter->partial_filter) {
-		dictret = inode_ctx_get (inode, frame->this, &tmp_uid);
-		uid = (uid_t)tmp_uid;
-		if (dictret != -1) {
-			for (idx = 0; idx < filter->filter_num_uid_entries; 
-			     idx++) {
-				if ((uid >=filter->filter_input_uid[idx][0]) &&
-				    (uid <=filter->filter_input_uid[idx][1])) {
-					return GF_FILTER_FILTER_UID;
-				}
-			}
-		}
-	}
-
-	return ret;
-}
-
-/* if 'root' don't change the uid/gid */
-static int32_t
-update_stat (struct stat *stbuf,
-	     struct gf_filter *filter)
-{
-	int32_t idx = 0;
-	for (idx = 0; idx < filter->translate_num_uid_entries; idx++) {
-		if (stbuf->st_uid == GF_FILTER_ROOT_UID)
-			continue;
-		if ((stbuf->st_uid >= filter->translate_input_uid[idx][0]) &&
-		    (stbuf->st_uid <= filter->translate_input_uid[idx][1])) {
-			stbuf->st_uid = filter->translate_output_uid[idx];
-			break;
-		}
-	}
-	
-	for (idx = 0; idx < filter->translate_num_gid_entries; idx++) {
-		if (stbuf->st_gid == GF_FILTER_ROOT_GID)
-			continue;
-		if ((stbuf->st_gid >= filter->translate_input_gid[idx][0]) &&
-		    (stbuf->st_gid <= filter->translate_input_gid[idx][1])) {
-			stbuf->st_gid = filter->translate_output_gid[idx];
-			break;
-		}
-	}
-
-	if (filter->fixed_uid_set) {
-		stbuf->st_uid = filter->fixed_uid;
-	}
-
-	if (filter->fixed_gid_set) {
-		stbuf->st_gid = filter->fixed_gid;
-	}
-	
-	return 0;
-}
-
-static int32_t 
-filter_lookup_cbk (call_frame_t *frame,
-		   void *cookie,
-		   xlator_t *this,
-		   int32_t op_ret,
-		   int32_t op_errno,
-		   inode_t *inode,
-		   struct stat *buf,
-		   dict_t *dict)
-{
-	int ret = 0;
-	if (op_ret >= 0) {
-		update_stat (buf, this->private);
-		ret = inode_ctx_put (inode, this, (uint64_t)(long)buf->st_uid);
-		if (ret == -1) {
-			gf_log (this->name, GF_LOG_ERROR,
-				"couldn't set context");
-		}
-	}
-	STACK_UNWIND (frame, op_ret, op_errno, inode, buf, dict);
-	return 0;
-}
-
-int32_t 
-filter_lookup (call_frame_t *frame,
-	       xlator_t *this,
-	       loc_t *loc,
-	       dict_t *xattr_req)
-{
-	STACK_WIND (frame,
-		    filter_lookup_cbk,
-		    FIRST_CHILD(this),
-		    FIRST_CHILD(this)->fops->lookup,
-		    loc,
-		    xattr_req);
-	return 0;
-}
-
-
-static int32_t
-filter_stat_cbk (call_frame_t *frame,
-		 void *cookie,
-		 xlator_t *this,
-		 int32_t op_ret,
-		 int32_t op_errno,
-		 struct stat *buf)
-{
-	if (op_ret >= 0) {
-		update_stat (buf, this->private);
-	}
-	STACK_UNWIND (frame, op_ret, op_errno, buf);
-	return 0;
-}
-
-int32_t
-filter_stat (call_frame_t *frame,
-	     xlator_t *this,
-	     loc_t *loc)
-{
-	STACK_WIND (frame,
-		    filter_stat_cbk,
-		    FIRST_CHILD(this),
-		    FIRST_CHILD(this)->fops->stat,
-		    loc);
-	return 0;
-}
-
-static int32_t
-filter_chmod_cbk (call_frame_t *frame,
-		  void *cookie,
-		  xlator_t *this,
-		  int32_t op_ret,
-		  int32_t op_errno,
-		  struct stat *buf)
-{
-	if (op_ret >= 0) {
-		update_stat (buf, this->private);
-	}
-	STACK_UNWIND (frame, op_ret, op_errno, buf);
-	return 0;
-}
-
-int32_t
-filter_chmod (call_frame_t *frame,
-	      xlator_t *this,
-	      loc_t *loc,
-	      mode_t mode)
-{
-	int32_t ret = 0;
-	ret = update_frame (frame, loc->inode, this->private);
-	switch (ret) {
-	case GF_FILTER_MAP_UID:
-		if (loc->inode->st_mode & S_IWGRP)
-			break;
-	case GF_FILTER_MAP_BOTH:
-		if (loc->inode->st_mode & S_IWOTH)
-			break;
-		gf_log (this->name, GF_LOG_DEBUG, "%s: returning permission denied", loc->path);
-		STACK_UNWIND (frame, -1, EPERM, NULL);
-		return 0;
-		
-	case GF_FILTER_FILTER_UID:
-	case GF_FILTER_FILTER_GID:
-	case GF_FILTER_RO_FS:
-		STACK_UNWIND (frame, -1, EROFS, NULL);
-		return 0;
-	default:
-		break;
-	}
-
-	STACK_WIND (frame,
-		    filter_chmod_cbk,
-		    FIRST_CHILD(this),
-		    FIRST_CHILD(this)->fops->chmod,
-		    loc,
-		    mode);
-	return 0;
-}
-
-
-static int32_t
-filter_fchmod_cbk (call_frame_t *frame,
-		   void *cookie,
-		   xlator_t *this,
-		   int32_t op_ret,
-		   int32_t op_errno,
-		   struct stat *buf)
-{
-	if (op_ret >= 0) {
-		update_stat (buf, this->private);
-	}
-	STACK_UNWIND (frame,
-		      op_ret,
-		      op_errno,
-		      buf);
-	return 0;
-}
-
-int32_t 
-filter_fchmod (call_frame_t *frame,
-	       xlator_t *this,
-	       fd_t *fd,
-	       mode_t mode)
-{
-	STACK_WIND (frame,
-		    filter_fchmod_cbk,
-		    FIRST_CHILD(this),
-		    FIRST_CHILD(this)->fops->fchmod,
-		    fd,
-		    mode);
-	return 0;
-}
-
-static int32_t
-filter_chown_cbk (call_frame_t *frame,
-		  void *cookie,
-		  xlator_t *this,
-		  int32_t op_ret,
-		  int32_t op_errno,
-		  struct stat *buf)
-{
-	if (op_ret >= 0) {
-		update_stat (buf, this->private);
-	}
-	STACK_UNWIND (frame, op_ret, op_errno, buf);
-	return 0;
-}
-
-int32_t
-filter_chown (call_frame_t *frame,
-	      xlator_t *this,
-	      loc_t *loc,
-	      uid_t uid,
-	      gid_t gid)
-{
-	int32_t ret = 0;
-	ret = update_frame (frame, loc->inode, this->private);
-	switch (ret) {
-	case GF_FILTER_MAP_UID:
-		if (loc->inode->st_mode & S_IWGRP)
-			break;
-	case GF_FILTER_MAP_BOTH:
-		if (loc->inode->st_mode & S_IWOTH)
-			break;
-		gf_log (this->name, GF_LOG_DEBUG, "%s: returning permission denied", loc->path);
-		STACK_UNWIND (frame, -1, EPERM, NULL);
-		return 0;
-		
-	case GF_FILTER_FILTER_UID:
-	case GF_FILTER_FILTER_GID:
-	case GF_FILTER_RO_FS:
-		STACK_UNWIND (frame, -1, EROFS, NULL);
-		return 0;
-	default:
-		break;
-	}			
-
-	STACK_WIND (frame,	      
-		    filter_chown_cbk,
-		    FIRST_CHILD(this),
-		    FIRST_CHILD(this)->fops->chown,
-		    loc,
-		    uid,
-		    gid);
-	return 0;
-}
-
-static int32_t
-filter_fchown_cbk (call_frame_t *frame,
-		   void *cookie,
-		   xlator_t *this,
-		   int32_t op_ret,
-		   int32_t op_errno,
-		   struct stat *buf)
-{
-	if (op_ret >= 0) {
-		update_stat (buf, this->private);
-	}
-	STACK_UNWIND (frame, op_ret, op_errno, buf);
-	return 0;
-}
-
-int32_t 
-filter_fchown (call_frame_t *frame,
-	       xlator_t *this,
-	       fd_t *fd,
-	       uid_t uid,
-	       gid_t gid)
-{
-	STACK_WIND (frame,	      
-		    filter_fchown_cbk,
-		    FIRST_CHILD(this),
-		    FIRST_CHILD(this)->fops->fchown,
-		    fd,
-		    uid,
-		    gid);
-	return 0;
-}
-
-static int32_t
-filter_truncate_cbk (call_frame_t *frame,
-		     void *cookie,
-		     xlator_t *this,
-		     int32_t op_ret,
-		     int32_t op_errno,
-		     struct stat *buf)
-{
-	if (op_ret >= 0) {
-		update_stat (buf, this->private);
-	}
-	STACK_UNWIND (frame, op_ret, op_errno, buf);
-	return 0;
-}
-
-int32_t
-filter_truncate (call_frame_t *frame,
-		 xlator_t *this,
-		 loc_t *loc,
-		 off_t offset)
-{
-	int32_t ret = 0;
-	ret = update_frame (frame, loc->inode, this->private);
-	switch (ret) {
-	case GF_FILTER_MAP_UID:
-		if (loc->inode->st_mode & S_IWGRP)
-			break;
-	case GF_FILTER_MAP_BOTH:
-		if (loc->inode->st_mode & S_IWOTH)
-			break;
-		gf_log (this->name, GF_LOG_DEBUG, "%s: returning permission denied", loc->path);
-		STACK_UNWIND (frame, -1, EPERM, NULL);
-		return 0;
-		
-	case GF_FILTER_FILTER_UID:
-	case GF_FILTER_FILTER_GID:
-	case GF_FILTER_RO_FS:
-		STACK_UNWIND (frame, -1, EROFS, NULL);
-		return 0;
-	}			
-
-	STACK_WIND (frame,
-		    filter_truncate_cbk,
-		    FIRST_CHILD(this),
-		    FIRST_CHILD(this)->fops->truncate,
-		    loc,
-		    offset);
-	return 0;
-}
-
-static int32_t
-filter_ftruncate_cbk (call_frame_t *frame,
-		      void *cookie,
-		      xlator_t *this,
-		      int32_t op_ret,
-		      int32_t op_errno,
-		      struct stat *buf)
-{
-	if (op_ret >= 0) {
-		update_stat (buf, this->private);
-	}
-	STACK_UNWIND (frame, op_ret, op_errno, buf);
-	return 0;
-}
-
-int32_t
-filter_ftruncate (call_frame_t *frame,
-		  xlator_t *this,
-		  fd_t *fd,
-		  off_t offset)
-{
-	STACK_WIND (frame,
-		    filter_ftruncate_cbk,
-		    FIRST_CHILD(this),
-		    FIRST_CHILD(this)->fops->ftruncate,
-		    fd,
-		    offset);
-	return 0;
-}
-
-int32_t 
-filter_utimens_cbk (call_frame_t *frame,
-		    void *cookie,
-		    xlator_t *this,
-		    int32_t op_ret,
-		    int32_t op_errno,
-		    struct stat *buf)
-{
-	if (op_ret >= 0) {
-		update_stat (buf, this->private);
-	}
-	STACK_UNWIND (frame, op_ret, op_errno, buf);
-	return 0;
-}
-
-
-int32_t 
-filter_utimens (call_frame_t *frame,
-		xlator_t *this,
-		loc_t *loc,
-		struct timespec tv[2])
-{
-	int32_t ret = 0;
-	ret = update_frame (frame, loc->inode, this->private);
-	switch (ret) {
-	case GF_FILTER_MAP_UID:
-		if (loc->inode->st_mode & S_IWGRP)
-			break;
-	case GF_FILTER_MAP_BOTH:
-		if (loc->inode->st_mode & S_IWOTH)
-			break;
-		gf_log (this->name, GF_LOG_DEBUG, "%s: returning permission denied", loc->path);
-		STACK_UNWIND (frame, -1, EPERM, NULL);
-		return 0;
-		
-	case GF_FILTER_FILTER_UID:
-	case GF_FILTER_FILTER_GID:
-	case GF_FILTER_RO_FS:
-		STACK_UNWIND (frame, -1, EROFS, NULL);
-		return 0;
-	}
-
-	STACK_WIND (frame,
-		    filter_utimens_cbk,
-		    FIRST_CHILD(this),
-		    FIRST_CHILD(this)->fops->utimens,
-		    loc,
-		    tv);
-	return 0;
-}
-
-static int32_t
-filter_readlink_cbk (call_frame_t *frame,
-		     void *cookie,
-		     xlator_t *this,
-		     int32_t op_ret,
-		     int32_t op_errno,
-		     const char *path)
-{
-	STACK_UNWIND (frame, op_ret, op_errno, path);
-	return 0;
-}
-
-int32_t
-filter_readlink (call_frame_t *frame,
-		 xlator_t *this,
-		 loc_t *loc,
-		 size_t size)
-{
-	int32_t ret = 0;
-	ret = update_frame (frame, loc->inode, this->private);
-	switch (ret) {
-	case GF_FILTER_MAP_UID:
-		if (loc->inode->st_mode & S_IRGRP)
-			break;
-	case GF_FILTER_MAP_BOTH:
-		if (loc->inode->st_mode & S_IROTH)
-			break;
-		gf_log (this->name, GF_LOG_DEBUG, "%s: returning permission denied", loc->path);
-		STACK_UNWIND (frame, -1, EPERM, NULL);
-		return 0;
-	}			
-	STACK_WIND (frame,
-		    filter_readlink_cbk,
-		    FIRST_CHILD(this),
-		    FIRST_CHILD(this)->fops->readlink,
-		    loc,
-		    size);
-	return 0;
-}
-
-
-static int32_t
-filter_mknod_cbk (call_frame_t *frame,
-		  void *cookie,
-		  xlator_t *this,
-		  int32_t op_ret,
-		  int32_t op_errno,
-		  inode_t *inode,
-		  struct stat *buf)
-{
-	int ret = 0;
-
-	if (op_ret >= 0) {
-		update_stat (buf, this->private);
-		ret = inode_ctx_put (inode, this, (uint64_t)(long)buf->st_uid);
-		if (ret == -1) {
-			gf_log (this->name, GF_LOG_ERROR,
-				"couldn't set context");
-		}
-	}
-	STACK_UNWIND (frame, op_ret, op_errno, inode, buf);
-	return 0;
-}
-
-int32_t
-filter_mknod (call_frame_t *frame,
-	      xlator_t *this,
-	      loc_t *loc,
-	      mode_t mode,
-	      dev_t rdev)
-{
-	int ret = 0;
-	inode_t *parent = loc->parent;
-	ret = update_frame (frame, loc->inode, this->private);
-	switch (ret) {		
-	case GF_FILTER_MAP_UID:
-		if (parent->st_mode & S_IWGRP)
-			break;
-	case GF_FILTER_MAP_BOTH:
-		if (parent->st_mode & S_IWOTH)
-			break;
-		gf_log (this->name, GF_LOG_DEBUG, "%s: returning permission denied", loc->path);
-		STACK_UNWIND (frame, -1, EPERM);
-		return 0;
-
-	case GF_FILTER_FILTER_UID:
-	case GF_FILTER_FILTER_GID:
-	case GF_FILTER_RO_FS:
-		STACK_UNWIND (frame, -1, EROFS, NULL, NULL);
-		return 0;
-	}
-	STACK_WIND (frame,
-		    filter_mknod_cbk,
-		    FIRST_CHILD(this),
-		    FIRST_CHILD(this)->fops->mknod,
-		    loc, mode, rdev);
-	return 0;
-}
-
-static int32_t
-filter_mkdir_cbk (call_frame_t *frame,
-		  void *cookie,
-		  xlator_t *this,
-		  int32_t op_ret,
-		  int32_t op_errno,
-		  inode_t *inode,
-		  struct stat *buf)
-{
-	int ret = 0;
-	if (op_ret >= 0) {
-		update_stat (buf, this->private);
-		ret = inode_ctx_put (inode, this, (uint64_t)(long)buf->st_uid);
-		if (ret == -1) {
-			gf_log (this->name, GF_LOG_ERROR,
-				"couldn't set context");
-		}
-	}
-	STACK_UNWIND (frame, op_ret, op_errno, inode, buf);
-	return 0;
-}
-
-int32_t
-filter_mkdir (call_frame_t *frame,
-	      xlator_t *this,
-	      loc_t *loc,
-	      mode_t mode)
-{
-	int ret = 0;
-	inode_t *parent = loc->parent;
-	ret = update_frame (frame, loc->inode, this->private);
-	switch (ret) {		
-	case GF_FILTER_MAP_UID:
-		if (parent->st_mode & S_IWGRP)
-			break;
-	case GF_FILTER_MAP_BOTH:
-		if (parent->st_mode & S_IWOTH)
-			break;
-		gf_log (this->name, GF_LOG_DEBUG, "%s: returning permission denied", loc->path);
-		STACK_UNWIND (frame, -1, EPERM);
-		return 0;
-
-	case GF_FILTER_FILTER_UID:
-	case GF_FILTER_FILTER_GID:
-	case GF_FILTER_RO_FS:
-		STACK_UNWIND (frame, -1, EROFS, NULL, NULL);
-		return 0;
-	}
-	STACK_WIND (frame,
-		    filter_mkdir_cbk,
-		    FIRST_CHILD(this),
-		    FIRST_CHILD(this)->fops->mkdir,
-		    loc, mode);
-	return 0;
-}
-
-static int32_t
-filter_unlink_cbk (call_frame_t *frame,
-		   void *cookie,
-		   xlator_t *this,
-		   int32_t op_ret,
-		   int32_t op_errno)
-{
-	STACK_UNWIND (frame, op_ret, op_errno);
-	return 0;
-}
-
-int32_t
-filter_unlink (call_frame_t *frame,
-	       xlator_t *this,
-	       loc_t *loc)
-{
-	int32_t ret = 0;
-	inode_t *parent = loc->parent;
-	if (!parent)
-		parent = inode_parent (loc->inode, 0, NULL);
-	ret = update_frame (frame, loc->inode, this->private);
-	switch (ret) {
-	case GF_FILTER_MAP_UID:
-		if (parent->st_mode & S_IWGRP)
-			break;
-		if (loc->inode->st_mode & S_IWGRP)
-			break;
-	case GF_FILTER_MAP_BOTH:
-		if (parent->st_mode & S_IWOTH)
-			break;
-		if (loc->inode->st_mode & S_IWOTH)
-			break;
-		gf_log (this->name, GF_LOG_DEBUG, "%s: returning permission denied", loc->path);
-		STACK_UNWIND (frame, -1, EPERM);
-		return 0;
-	case GF_FILTER_FILTER_UID:
-	case GF_FILTER_FILTER_GID:
-	case GF_FILTER_RO_FS:
-		STACK_UNWIND (frame, -1, EROFS);
-		return 0;
-	}
-	STACK_WIND (frame,
-		    filter_unlink_cbk,
-		    FIRST_CHILD(this),
-		    FIRST_CHILD(this)->fops->unlink,
-		    loc);
-	return 0;
-}
-
-static int32_t
-filter_rmdir_cbk (call_frame_t *frame,
-		  void *cookie,
-		  xlator_t *this,
-		  int32_t op_ret,
-		  int32_t op_errno)
-{
-	STACK_UNWIND (frame, op_ret, op_errno);
-	return 0;
-}
-
-int32_t
-filter_rmdir (call_frame_t *frame,
-	      xlator_t *this,
-	      loc_t *loc)
-{
-	int32_t ret = 0;
-	inode_t *parent = loc->parent;
-	if (!parent)
-		parent = inode_parent (loc->inode, 0, NULL);
-	ret = update_frame (frame, loc->inode, this->private);
-	switch (ret) {
-	case GF_FILTER_MAP_UID:
-		if (parent->st_mode & S_IWGRP)
-			break;
-		if (loc->inode->st_mode & S_IWGRP)
-			break;
-	case GF_FILTER_MAP_BOTH:
-		if (parent->st_mode & S_IWOTH)
-			break;
-		if (loc->inode->st_mode & S_IWOTH)
-			break;
-		gf_log (this->name, GF_LOG_DEBUG, "%s: returning permission denied", loc->path);
-		STACK_UNWIND (frame, -1, EPERM);
-		return 0;
-	case GF_FILTER_FILTER_UID:
-	case GF_FILTER_FILTER_GID:
-	case GF_FILTER_RO_FS:
-			STACK_UNWIND (frame, -1, EROFS);
-			return 0;
-	}			
-	STACK_WIND (frame,
-		    filter_rmdir_cbk,
-		    FIRST_CHILD(this),
-		    FIRST_CHILD(this)->fops->rmdir,
-		    loc);
-	return 0;
-}
-
-static int32_t
-filter_symlink_cbk (call_frame_t *frame,
-		    void *cookie,
-		    xlator_t *this,
-		    int32_t op_ret,
-		    int32_t op_errno,
-		    inode_t *inode,
-		    struct stat *buf)
-{
-	int ret = 0;
-	if (op_ret >= 0) {
-		update_stat (buf, this->private);
-		ret = inode_ctx_put (inode, this, (uint64_t)(long)buf->st_uid);
-		if (ret == -1) {
-			gf_log (this->name, GF_LOG_ERROR,
-				"couldn't set context");
-		}
-	}
-	STACK_UNWIND (frame, op_ret, op_errno, inode, buf);
-	return 0;
-}
-
-int32_t
-filter_symlink (call_frame_t *frame,
-		xlator_t *this,
-		const char *linkpath,
-		loc_t *loc)
-{
-	int ret = 0;
-	inode_t *parent = loc->parent;
-	ret = update_frame (frame, loc->inode, this->private);
-	switch (ret) {		
-	case GF_FILTER_MAP_UID:
-		if (parent->st_mode & S_IWGRP)
-			break;
-	case GF_FILTER_MAP_BOTH:
-		if (parent->st_mode & S_IWOTH)
-			break;
-		gf_log (this->name, GF_LOG_DEBUG, "%s: returning permission denied", loc->path);
-		STACK_UNWIND (frame, -1, EPERM);
-		return 0;
-
-	case GF_FILTER_FILTER_UID:
-	case GF_FILTER_FILTER_GID:
-	case GF_FILTER_RO_FS:
-		STACK_UNWIND (frame, -1, EROFS, NULL, NULL);
-		return 0;
-	}
-	STACK_WIND (frame,
-		    filter_symlink_cbk,
-		    FIRST_CHILD(this),
-		    FIRST_CHILD(this)->fops->symlink,
-		    linkpath, loc);
-	return 0;
-}
-
-
-static int32_t
-filter_rename_cbk (call_frame_t *frame,
-		   void *cookie,
-		   xlator_t *this,
-		   int32_t op_ret,
-		   int32_t op_errno,
-		   struct stat *buf)
-{
-	if (op_ret >= 0) {
-		update_stat (buf, this->private);
-	}
-	STACK_UNWIND (frame, op_ret, op_errno, buf);
-	return 0;
-}
-
-int32_t
-filter_rename (call_frame_t *frame,
-	       xlator_t *this,
-	       loc_t *oldloc,
-	       loc_t *newloc)
-{
-	int32_t ret = 0;
-	inode_t *parent = oldloc->parent;
-	if (!parent)
-		parent = inode_parent (oldloc->inode, 0, NULL);
-	ret = update_frame (frame, oldloc->inode, this->private);
-	switch (ret) {		
-	case GF_FILTER_MAP_UID:
-		if (parent->st_mode & S_IWGRP)
-			break;
-		if (oldloc->inode->st_mode & S_IWGRP)
-			break;
-	case GF_FILTER_MAP_BOTH:
-		if (parent->st_mode & S_IWOTH)
-			break;
-		if (oldloc->inode->st_mode & S_IWOTH)
-			break;
-		gf_log (this->name, GF_LOG_DEBUG, 
-			"%s -> %s: returning permission denied", oldloc->path, newloc->path);
-		STACK_UNWIND (frame, -1, EPERM);
-		return 0;
-
-	case GF_FILTER_FILTER_UID:
-	case GF_FILTER_FILTER_GID:
-	case GF_FILTER_RO_FS:
-		STACK_UNWIND (frame, -1, EROFS, NULL);
-		return 0;
-	}
-	STACK_WIND (frame,
-		    filter_rename_cbk,
-		    FIRST_CHILD(this),
-		    FIRST_CHILD(this)->fops->rename,
-		    oldloc, newloc);
-	return 0;
-}
-
-
-static int32_t
-filter_link_cbk (call_frame_t *frame,
-		 void *cookie,
-		 xlator_t *this,
-		 int32_t op_ret,
-		 int32_t op_errno,
-		 inode_t *inode,
-		 struct stat *buf)
-{
-	int ret = 0;
-	if (op_ret >= 0) {
-		update_stat (buf, this->private);
-		ret = inode_ctx_put (inode, this, (uint64_t)(long)buf->st_uid);
-		if (ret == -1) {
-			gf_log (this->name, GF_LOG_ERROR,
-				"couldn't set context");
-		}
-	}
-	STACK_UNWIND (frame, op_ret, op_errno, inode,	buf);
-	return 0;
-}
-
-int32_t
-filter_link (call_frame_t *frame,
-	     xlator_t *this,
-	     loc_t *oldloc,
-	     loc_t *newloc)
-{
-	int ret = 0;
-	ret = update_frame (frame, oldloc->inode, this->private);
-	switch (ret) {
-	case GF_FILTER_FILTER_UID:
-	case GF_FILTER_FILTER_GID:
-	case GF_FILTER_RO_FS:
-		STACK_UNWIND (frame, -1, EROFS, NULL, NULL);
-		return 0;
-	}
-	STACK_WIND (frame,
-		    filter_link_cbk,
-		    FIRST_CHILD(this),
-		    FIRST_CHILD(this)->fops->link,
-		    oldloc, newloc);
-	return 0;
-}
-
-
-static int32_t
-filter_create_cbk (call_frame_t *frame,
-		   void *cookie,
-		   xlator_t *this,
-		   int32_t op_ret,
-		   int32_t op_errno,
-		   fd_t *fd,
-		   inode_t *inode,
-		   struct stat *buf)
-{
-	int ret = 0;
-	if (op_ret >= 0) {
-		update_stat (buf, this->private);
-		ret = inode_ctx_put (inode, this, (uint64_t)(long)buf->st_uid);
-		if (ret == -1) {
-			gf_log (this->name, GF_LOG_ERROR,
-				"couldn't set context");
-		}
-	}
-	STACK_UNWIND (frame, op_ret, op_errno, fd, inode, buf);
-	return 0;
-}
-
-int32_t
-filter_create (call_frame_t *frame,
-	       xlator_t *this,
-	       loc_t *loc,
-	       int32_t flags,
-	       mode_t mode, fd_t *fd)
-{
-	int ret = 0;
-	inode_t *parent = loc->parent;
-	ret = update_frame (frame, loc->inode, this->private);
-	switch (ret) {		
-	case GF_FILTER_MAP_UID:
-		if (parent->st_mode & S_IWGRP)
-			break;
-	case GF_FILTER_MAP_BOTH:
-		if (parent->st_mode & S_IWOTH)
-			break;
-		gf_log (this->name, GF_LOG_DEBUG, "%s: returning permission denied", loc->path);
-		STACK_UNWIND (frame, -1, EPERM);
-		return 0;
-
-	case GF_FILTER_FILTER_UID:
-	case GF_FILTER_FILTER_GID:
-	case GF_FILTER_RO_FS:
-		STACK_UNWIND (frame, -1, EROFS, NULL, NULL, NULL);
-		return 0;
-	}
-	STACK_WIND (frame, filter_create_cbk,
-		    FIRST_CHILD(this),
-		    FIRST_CHILD(this)->fops->create,
-		    loc, flags, mode, fd);
-	return 0;
-}
-
-static int32_t
-filter_open_cbk (call_frame_t *frame,
-		 void *cookie,
-		 xlator_t *this,
-		 int32_t op_ret,
-		 int32_t op_errno,
-		 fd_t *fd)
-{
-	STACK_UNWIND (frame, op_ret, op_errno, fd);
-	return 0;
-}
-
-int32_t
-filter_open (call_frame_t *frame,
-	     xlator_t *this,
-	     loc_t *loc,
-	     int32_t flags, 
-	     fd_t *fd)
-{
-	int32_t ret = 0;
-	ret = update_frame (frame, loc->inode, this->private);
-	switch (ret) {
-	case GF_FILTER_MAP_UID:
-		if (loc->inode->st_mode & S_IWGRP)
-			break;
-		if (!((flags & O_WRONLY) || (flags & O_RDWR)) 
-		    && (loc->inode->st_mode & S_IRGRP))
-			break;
-	case GF_FILTER_MAP_BOTH:
-		if (loc->inode->st_mode & S_IWOTH)
-				break;
-		if (!((flags & O_WRONLY) || (flags & O_RDWR))
-		    && (loc->inode->st_mode & S_IROTH))
-			break;
-		gf_log (this->name, GF_LOG_DEBUG, 
-			"%s: returning permission denied (mode: 0%o, flag=0%o)", 
-			loc->path, loc->inode->st_mode, flags);
-		STACK_UNWIND (frame, -1, EPERM, fd);
-		return 0;
-	case GF_FILTER_FILTER_UID:
-	case GF_FILTER_FILTER_GID:
-	case GF_FILTER_RO_FS:
-		if (!((flags & O_WRONLY) || (flags & O_RDWR)))
-			break;
-		STACK_UNWIND (frame, -1, EROFS, NULL);
-		return 0;
-		
-	}
-	STACK_WIND (frame,
-		    filter_open_cbk,
-		    FIRST_CHILD(this),
-		    FIRST_CHILD(this)->fops->open,
-		    loc, flags, fd);
-	return 0;
-}
-
-static int32_t
-filter_readv_cbk (call_frame_t *frame,
-		  void *cookie,
-		  xlator_t *this,
-		  int32_t op_ret,
-		  int32_t op_errno,
-		  struct iovec *vector,
-		  int32_t count,
-		  struct stat *stbuf)
-{
-	if (op_ret >= 0) {
-		update_stat (stbuf, this->private);
-	}
-	STACK_UNWIND (frame,
-		      op_ret,
-		      op_errno,
-		      vector,
-		      count,
-		      stbuf);
-	return 0;
-}
-
-int32_t
-filter_readv (call_frame_t *frame,
-	      xlator_t *this,
-	      fd_t *fd,
-	      size_t size,
-	      off_t offset)
-{
-	STACK_WIND (frame,
-		    filter_readv_cbk,
-		    FIRST_CHILD(this),
-		    FIRST_CHILD(this)->fops->readv,
-		    fd,
-		    size,
-		    offset);
-	return 0;
-}
-
-
-static int32_t
-filter_writev_cbk (call_frame_t *frame,
-		   void *cookie,
-		   xlator_t *this,
-		   int32_t op_ret,
-		   int32_t op_errno,
-		   struct stat *stbuf)
-{
-	if (op_ret >= 0) {
-		update_stat (stbuf, this->private);
-	}
-	STACK_UNWIND (frame,
-		      op_ret,
-		      op_errno,
-		      stbuf);
-	return 0;
-}
-
-int32_t
-filter_writev (call_frame_t *frame,
-	       xlator_t *this,
-	       fd_t *fd,
-	       struct iovec *vector,
-	       int32_t count,
-	       off_t off)
-{
-	int32_t ret = 0;
-	ret = update_frame (frame, fd->inode, this->private);
-	switch (ret) {
-	case GF_FILTER_FILTER_UID:
-	case GF_FILTER_FILTER_GID:
-	case GF_FILTER_RO_FS:
-		STACK_UNWIND (frame, -1, EROFS, NULL);
-		return 0;
-	}
-
-	STACK_WIND (frame,
-		    filter_writev_cbk,
-		    FIRST_CHILD(this),
-		    FIRST_CHILD(this)->fops->writev,
-		    fd,
-		    vector,
-		    count,
-		    off);
-	return 0;
-}
-
-static int32_t
-filter_fstat_cbk (call_frame_t *frame,
-		  void *cookie,
-		  xlator_t *this,
-		  int32_t op_ret,
-		  int32_t op_errno,
-		  struct stat *buf)
-{
-	if (op_ret >= 0) {
-		update_stat (buf, this->private);
-	}
-	STACK_UNWIND (frame,
-		      op_ret,
-		      op_errno,
-		      buf);
-	return 0;
-}
-
-int32_t
-filter_fstat (call_frame_t *frame,
-	      xlator_t *this,
-	      fd_t *fd)
-{
-	STACK_WIND (frame,
-		    filter_fstat_cbk,
-		    FIRST_CHILD(this),
-		    FIRST_CHILD(this)->fops->fstat,
-		    fd);
-	return 0;
-}
-
-static int32_t
-filter_opendir_cbk (call_frame_t *frame,
-		    void *cookie,
-		    xlator_t *this,
-		    int32_t op_ret,
-		    int32_t op_errno,
-		    fd_t *fd)
-{
-	STACK_UNWIND (frame,
-		      op_ret,
-		      op_errno,
-		      fd);
-	return 0;
-}
-
-int32_t
-filter_opendir (call_frame_t *frame,
-		xlator_t *this,
-		loc_t *loc, fd_t *fd)
-{
-	int32_t ret = 0;
-	ret = update_frame (frame, loc->inode, this->private);
-	switch (ret) {
-	case GF_FILTER_MAP_UID:
-		if (loc->inode->st_mode & S_IWGRP)
-			break;
-		if (loc->inode->st_mode & S_IRGRP)
-			break;
-	case GF_FILTER_MAP_BOTH:
-		if (loc->inode->st_mode & S_IWOTH)
-			break;
-		if (loc->inode->st_mode & S_IROTH)
-			break;
-		gf_log (this->name, GF_LOG_DEBUG, "%s: returning permission denied", loc->path);
-		STACK_UNWIND (frame, -1, EPERM, fd);
-		return 0;
-	}			
-	STACK_WIND (frame,
-		    filter_opendir_cbk,
-		    FIRST_CHILD(this),
-		    FIRST_CHILD(this)->fops->opendir,
-		    loc, fd);
-	return 0;
-}
-
-
-static int32_t
-filter_setxattr_cbk (call_frame_t *frame,
-		     void *cookie,
-		     xlator_t *this,
-		     int32_t op_ret,
-		     int32_t op_errno)
-{
-	STACK_UNWIND (frame,
-		      op_ret,
-		      op_errno);
-	return 0;
-}
-
-int32_t
-filter_setxattr (call_frame_t *frame,
-		 xlator_t *this,
-		 loc_t *loc,
-		 dict_t *dict,
-		 int32_t flags)
-{
-
-	int32_t ret = 0;
-	ret = update_frame (frame, loc->inode, this->private);
-	switch (ret) {
-	case GF_FILTER_MAP_UID:
-		if (loc->inode->st_mode & S_IWGRP)
-			break;
-	case GF_FILTER_MAP_BOTH:
-		if (loc->inode->st_mode & S_IWOTH)
-			break;
-		gf_log (this->name, GF_LOG_DEBUG, "%s: returning permission denied", loc->path);
-		STACK_UNWIND (frame, -1, EPERM);
-		return 0;
-	case GF_FILTER_FILTER_UID:
-	case GF_FILTER_FILTER_GID:
-	case GF_FILTER_RO_FS:
-		STACK_UNWIND (frame, -1, EROFS);
-		return 0;
-	}			
-
-	STACK_WIND (frame,
-		    filter_setxattr_cbk,
-		    FIRST_CHILD(this),
-		    FIRST_CHILD(this)->fops->setxattr,
-		    loc,
-		    dict,
-		    flags);
-	return 0;
-}
-
-static int32_t
-filter_getxattr_cbk (call_frame_t *frame,
-		     void *cookie,
-		     xlator_t *this,
-		     int32_t op_ret,
-		     int32_t op_errno,
-		     dict_t *dict)
-{
-	STACK_UNWIND (frame,
-		      op_ret,
-		      op_errno,
-		      dict);
-	return 0;
-}
-
-int32_t
-filter_getxattr (call_frame_t *frame,
-		 xlator_t *this,
-		 loc_t *loc,
-		 const char *name)
-{
-	int32_t ret = 0;
-	ret = update_frame (frame, loc->inode, this->private);
-	switch (ret) {
-	case GF_FILTER_MAP_UID:
-		if (loc->inode->st_mode & S_IRGRP)
-			break;
-	case GF_FILTER_MAP_BOTH:
-		if (loc->inode->st_mode & S_IROTH)
-			break;
-		gf_log (this->name, GF_LOG_DEBUG, "%s: returning permission denied", loc->path);
-		STACK_UNWIND (frame, -1, EPERM, NULL);
-		return 0;
-	}			
-
-	STACK_WIND (frame,
-		    filter_getxattr_cbk,
-		    FIRST_CHILD(this),
-		    FIRST_CHILD(this)->fops->getxattr,
-		    loc,
-		    name);
-	return 0;
-}
-
-static int32_t
-filter_removexattr_cbk (call_frame_t *frame,
-			void *cookie,
-			xlator_t *this,
-			int32_t op_ret,
-			int32_t op_errno)
-{
-	STACK_UNWIND (frame, op_ret, op_errno);
-	return 0;
-}
-
-int32_t
-filter_removexattr (call_frame_t *frame,
-		    xlator_t *this,
-		    loc_t *loc,
-		    const char *name)
-{
-	int32_t ret = 0;
-	ret = update_frame (frame, loc->inode, this->private);
-	switch (ret) {
-	case GF_FILTER_MAP_UID:
-		if (loc->inode->st_mode & S_IWGRP)
-			break;
-	case GF_FILTER_MAP_BOTH:
-		if (loc->inode->st_mode & S_IWOTH)
-			break;
-		gf_log (this->name, GF_LOG_DEBUG, "%s: returning permission denied", loc->path);
-		STACK_UNWIND (frame, -1, EPERM);
-		return 0;
-	case GF_FILTER_FILTER_UID:
-	case GF_FILTER_FILTER_GID:
-	case GF_FILTER_RO_FS:
-		STACK_UNWIND (frame, -1, EROFS);
-		return 0;
-	}			
-
-	STACK_WIND (frame,
-		    filter_removexattr_cbk,
-		    FIRST_CHILD(this),
-		    FIRST_CHILD(this)->fops->removexattr,
-		    loc,
-		    name);
-	return 0;
-}
-
-int32_t 
-init (xlator_t *this)
-{
-	char *value = NULL;
-	char *tmp_str = NULL;
-	char *tmp_str1 = NULL;
-	char *tmp_str2 = NULL;
-	char *dup_str = NULL;
-	char *input_value_str1 = NULL;
-	char *input_value_str2 = NULL;
-	char *output_value_str = NULL;
-	int32_t input_value = 0;
-	int32_t output_value = 0;
-	data_t *option_data = NULL;
-	struct gf_filter *filter = NULL;
-	gf_boolean_t tmp_bool = 0;
-
-	if (!this->children || this->children->next) {
-		gf_log (this->name,
-			GF_LOG_ERROR,
-			"translator not configured with exactly one child");
-		return -1;
-	}
-	
-	if (!this->parents) {
-		gf_log (this->name, GF_LOG_WARNING,
-			"dangling volume. check volfile ");
-	}
-
-	filter = CALLOC (sizeof (*filter), 1);
-	ERR_ABORT (filter);
-	
-	if (dict_get (this->options, "read-only")) {
-		value = data_to_str (dict_get (this->options, "read-only"));
-		if (gf_string2boolean (value, &filter->complete_read_only) == -1) {
-			gf_log (this->name, GF_LOG_ERROR,
-				"wrong value provided for 'read-only'");
-			return -1;
-		}
-	}
-
-	if (dict_get (this->options, "root-squashing")) {
-		value = data_to_str (dict_get (this->options, "root-squashing"));
-		if (gf_string2boolean (value, &tmp_bool) == -1) {
-			gf_log (this->name, GF_LOG_ERROR,
-				"wrong value provided for 'root-squashing'");
-			return -1;
-		}
-		if (tmp_bool) {
-			filter->translate_num_uid_entries = 1;
-			filter->translate_num_gid_entries = 1;
-			filter->translate_input_uid[0][0] = GF_FILTER_ROOT_UID; /* root */
-			filter->translate_input_uid[0][1] = GF_FILTER_ROOT_UID; /* root */
-			filter->translate_input_gid[0][0] = GF_FILTER_ROOT_GID; /* root */
-			filter->translate_input_gid[0][1] = GF_FILTER_ROOT_GID; /* root */
-			filter->translate_output_uid[0] = GF_FILTER_NOBODY_UID;
-			filter->translate_output_gid[0] = GF_FILTER_NOBODY_GID;
-		}
-	}
-
-	if (dict_get (this->options, "translate-uid")) {
-		option_data = dict_get (this->options, "translate-uid");
-		value = strtok_r (option_data->data, ",", &tmp_str);
-		while (value) {
-			dup_str = strdup (value);
-			input_value_str1 = strtok_r (dup_str, "=", &tmp_str1);
-			if (input_value_str1) {
-				/* Check for n-m */
-				char *temp_string = strdup (input_value_str1);
-				input_value_str2 = strtok_r (temp_string, "-", &tmp_str2);
-				if (gf_string2int (input_value_str2, &input_value) != 0) {
-					gf_log (this->name, GF_LOG_ERROR, 
-						"invalid number format \"%s\"", 
-						input_value_str2);
-					return -1;
-				}
-				filter->translate_input_uid[filter->translate_num_uid_entries][0] = input_value;
-				input_value_str2 = strtok_r (NULL, "-", &tmp_str2);
-				if (input_value_str2) {
-					if (gf_string2int (input_value_str2, &input_value) != 0) {
-						gf_log (this->name, GF_LOG_ERROR, 
-							"invalid number format \"%s\"", 
-							input_value_str2);
-						return -1;
-					}
-				}
-				filter->translate_input_uid[filter->translate_num_uid_entries][1] = input_value;
-				FREE (temp_string);
-				output_value_str = strtok_r (NULL, "=", &tmp_str1);
-				if (output_value_str) {
-					if (gf_string2int (output_value_str, &output_value) != 0) {
-						gf_log (this->name, GF_LOG_ERROR, 
-							"invalid number format \"%s\"", 
-							output_value_str);
-						return -1;
-					}
-				} else {
-					gf_log (this->name, GF_LOG_ERROR, 
-						"mapping string not valid");
-					return -1;
-				}
-			} else {
-				gf_log (this->name, GF_LOG_ERROR, 
-					"mapping string not valid");
-				return -1;
-			}
-			filter->translate_output_uid[filter->translate_num_uid_entries]   = output_value;
-			gf_log (this->name, 
-				GF_LOG_DEBUG, 
-				"pair %d: input uid '%d' will be changed to uid '%d'", 
-				filter->translate_num_uid_entries, input_value, output_value);
-
-			filter->translate_num_uid_entries++;
-			if (filter->translate_num_uid_entries == GF_MAXIMUM_FILTERING_ALLOWED)
-				break;
-			value = strtok_r (NULL, ",", &tmp_str);
-			FREE (dup_str);
-		}
-	}
-
-	tmp_str1 = NULL;
-	tmp_str2 = NULL;
-	tmp_str  = NULL;
-
-	if (dict_get (this->options, "translate-gid")) {
-		option_data = dict_get (this->options, "translate-gid");
-		value = strtok_r (option_data->data, ",", &tmp_str);
-		while (value) {
-			dup_str = strdup (value);
-			input_value_str1 = strtok_r (dup_str, "=", &tmp_str1);
-			if (input_value_str1) {
-				/* Check for n-m */
-				char *temp_string = strdup (input_value_str1);
-				input_value_str2 = strtok_r (temp_string, "-", &tmp_str2);
-				if (gf_string2int (input_value_str2, &input_value) != 0) {
-					gf_log (this->name, GF_LOG_ERROR, 
-						"invalid number format \"%s\"", 
-						input_value_str2);
-					return -1;
-				}
-				filter->translate_input_gid[filter->translate_num_gid_entries][0] = input_value;
-				input_value_str2 = strtok_r (NULL, "-", &tmp_str2);
-				if (input_value_str2) {
-					if (gf_string2int (input_value_str2, &input_value) != 0) {
-						gf_log (this->name, GF_LOG_ERROR, 
-							"invalid number format \"%s\"", 
-							input_value_str2);
-						return -1;
-					}
-				}
-				filter->translate_input_gid[filter->translate_num_gid_entries][1] = input_value;
-				FREE (temp_string);
-				output_value_str = strtok_r (NULL, "=", &tmp_str1);
-				if (output_value_str) {
-					if (gf_string2int (output_value_str, &output_value) != 0) {
-						gf_log (this->name, GF_LOG_ERROR, 
-							"invalid number format \"%s\"", 
-							output_value_str);
-						return -1;
-					}
-				} else {
-					gf_log (this->name, GF_LOG_ERROR, 
-						"translate-gid value not valid");
-					return -1;
-				}
-			} else {
-				gf_log (this->name, GF_LOG_ERROR, 
-					"translate-gid value not valid");
-				return -1;
-			}
-			
-			filter->translate_output_gid[filter->translate_num_gid_entries] = output_value;
-			
-			gf_log (this->name, GF_LOG_DEBUG, 
-				"pair %d: input gid '%d' will be changed to gid '%d'", 
-				filter->translate_num_gid_entries, input_value, output_value);
-			
-			filter->translate_num_gid_entries++;
-			if (filter->translate_num_gid_entries == GF_MAXIMUM_FILTERING_ALLOWED)
-				break;
-			value = strtok_r (NULL, ",", &tmp_str);
-			FREE (dup_str);
-		}
-	}
-
-	tmp_str  = NULL;
-	tmp_str1 = NULL;
-
-	if (dict_get (this->options, "filter-uid")) {
-		option_data = dict_get (this->options, "filter-uid");
-		value = strtok_r (option_data->data, ",", &tmp_str);
-		while (value) {
-			dup_str = strdup (value);
-			/* Check for n-m */
-			input_value_str1 = strtok_r (dup_str, "-", &tmp_str1);
-			if (gf_string2int (input_value_str1, &input_value) != 0) {
-				gf_log (this->name, GF_LOG_ERROR, 
-					"invalid number format \"%s\"", 
-					input_value_str1);
-				return -1;
-			}
-			filter->filter_input_uid[filter->filter_num_uid_entries][0] = input_value;
-			input_value_str1 = strtok_r (NULL, "-", &tmp_str1);
-			if (input_value_str1) {
-				if (gf_string2int (input_value_str1, &input_value) != 0) {
-					gf_log (this->name, GF_LOG_ERROR, 
-						"invalid number format \"%s\"", 
-						input_value_str1);
-					return -1;
-				}
-			}
-			filter->filter_input_uid[filter->filter_num_uid_entries][1] = input_value;
-
-			gf_log (this->name, 
-				GF_LOG_DEBUG, 
-				"filter [%d]: input uid(s) '%s' will be filtered", 
-				filter->filter_num_uid_entries, dup_str);
-			
-			filter->filter_num_uid_entries++;
-			if (filter->filter_num_uid_entries == GF_MAXIMUM_FILTERING_ALLOWED)
-				break;
-			value = strtok_r (NULL, ",", &tmp_str);
-			FREE (dup_str);
-		}
-		filter->partial_filter = 1;
-	}
-
-	tmp_str  = NULL;
-	tmp_str1 = NULL;
-
-	if (dict_get (this->options, "filter-gid")) {
-		option_data = dict_get (this->options, "filter-gid");
-		value = strtok_r (option_data->data, ",", &tmp_str);
-		while (value) {
-			dup_str = strdup (value);
-			/* Check for n-m */
-			input_value_str1 = strtok_r (dup_str, "-", &tmp_str1);
-			if (gf_string2int (input_value_str1, &input_value) != 0) {
-				gf_log (this->name, GF_LOG_ERROR, 
-					"invalid number format \"%s\"", 
-					input_value_str1);
-				return -1;
-			}
-			filter->filter_input_gid[filter->filter_num_gid_entries][0] = input_value;
-			input_value_str1 = strtok_r (NULL, "-", &tmp_str1);
-			if (input_value_str1) {
-				if (gf_string2int (input_value_str1, &input_value) != 0) {
-					gf_log (this->name, GF_LOG_ERROR, 
-						"invalid number format \"%s\"", 
-						input_value_str1);
-					return -1;
-				}
-			}
-			filter->filter_input_gid[filter->filter_num_gid_entries][1] = input_value;
-
-			gf_log (this->name, 
-				GF_LOG_DEBUG, 
-				"filter [%d]: input gid(s) '%s' will be filtered", 
-				filter->filter_num_gid_entries, dup_str);
-			
-			filter->filter_num_gid_entries++;
-			if (filter->filter_num_gid_entries == GF_MAXIMUM_FILTERING_ALLOWED)
-				break;
-			value = strtok_r (NULL, ",", &tmp_str);
-			FREE (dup_str);
-		}
-		gf_log (this->name, GF_LOG_ERROR, "this option is not supported currently.. exiting");
-		return -1;
-		filter->partial_filter = 1;
-	}
-
-	if (dict_get (this->options, "fixed-uid")) {
-		option_data = dict_get (this->options, "fixed-uid");
-		if (gf_string2int (option_data->data, &input_value) != 0) {
-			gf_log (this->name, GF_LOG_ERROR, 
-				"invalid number format \"%s\"", 
-				option_data->data);
-			return -1;
-		}
-		filter->fixed_uid = input_value;
-		filter->fixed_uid_set = 1;
-	}
-
-	if (dict_get (this->options, "fixed-gid")) {
-		option_data = dict_get (this->options, "fixed-gid");
-		if (gf_string2int (option_data->data, &input_value) != 0) {
-			gf_log (this->name, GF_LOG_ERROR, 
-				"invalid number format \"%s\"", 
-				option_data->data);
-			return -1;
-		}
-		filter->fixed_gid = input_value;
-		filter->fixed_gid_set = 1;
-	}
-
-	this->private = filter;
-	return 0;
-}
-
-
-void
-fini (xlator_t *this)
-{
-	struct gf_filter *filter = this->private;
-
-	FREE (filter);
-
-	return;
-}
-
-
-struct xlator_fops fops = {
-	.lookup      = filter_lookup,
-	.stat        = filter_stat,
-	.fstat       = filter_fstat,
-	.chmod       = filter_chmod,
-	.fchmod      = filter_fchmod,
-	.readlink    = filter_readlink,
-	.mknod       = filter_mknod,
-	.mkdir       = filter_mkdir,
-	.unlink      = filter_unlink,
-	.rmdir       = filter_rmdir,
-	.symlink     = filter_symlink,
-	.rename      = filter_rename,
-	.link        = filter_link,
-	.chown       = filter_chown,
-	.fchown      = filter_fchown,
-	.truncate    = filter_truncate,
-	.ftruncate   = filter_ftruncate,
-	.create      = filter_create,
-	.open        = filter_open,
-	.readv       = filter_readv,
-	.writev      = filter_writev,
-	.setxattr    = filter_setxattr,
-	.getxattr    = filter_getxattr,
-	.removexattr = filter_removexattr,
-	.opendir     = filter_opendir,
-	.utimens     = filter_utimens,
-};
-
-struct xlator_mops mops = {
-};
-
-struct xlator_cbks cbks = {
-};
-
-struct volume_options options[] = {
-	{ .key  = { "root-squashing" }, 
-	  .type = GF_OPTION_TYPE_BOOL 
-	},
-	{ .key  = { "read-only" }, 
-	  .type = GF_OPTION_TYPE_BOOL
-	},
-	{ .key  = { "fixed-uid" },  
-	  .type = GF_OPTION_TYPE_INT
-	},
-	{ .key  = { "fixed-gid" },  
-	  .type = GF_OPTION_TYPE_INT
-	},
-	{ .key  = { "translate-uid" },  
-	  .type = GF_OPTION_TYPE_ANY 
-	},
-	{ .key  = { "translate-gid" },  
-	  .type = GF_OPTION_TYPE_ANY
-	},
-	{ .key  = { "filter-uid" },  
-	  .type = GF_OPTION_TYPE_ANY 
-	},
-	{ .key  = { "filter-gid" },  
-	  .type = GF_OPTION_TYPE_ANY 
-	},
-	{ .key = {NULL} },
-};
diff --git a/xlators/bindings/python/Makefile.am b/xlators/features/gfid-access/Makefile.am
index af437a64d6d..af437a64d6d 100644
--- a/xlators/bindings/python/Makefile.am
+++ b/xlators/features/gfid-access/Makefile.am
diff --git a/xlators/features/gfid-access/src/Makefile.am b/xlators/features/gfid-access/src/Makefile.am
new file mode 100644
index 00000000000..ff95604c4de
--- /dev/null
+++ b/xlators/features/gfid-access/src/Makefile.am
@@ -0,0 +1,16 @@
+# gfid-access translator: built as a libtool module and installed into the
+# versioned xlator/features directory named by xlatordir.
+xlator_LTLIBRARIES = gfid-access.la
+xlatordir = $(libdir)/glusterfs/$(PACKAGE_VERSION)/xlator/features
+
+gfid_access_la_LDFLAGS = -module $(GF_XLATOR_DEFAULT_LDFLAGS)
+
+gfid_access_la_SOURCES = gfid-access.c
+gfid_access_la_LIBADD = $(top_builddir)/libglusterfs/src/libglusterfs.la
+
+# Headers that are part of the sources but are not installed.
+noinst_HEADERS = gfid-access.h gfid-access-mem-types.h
+
+# Include paths: libglusterfs plus the rpc/xdr headers from both the source
+# and the build tree.
+AM_CPPFLAGS = $(GF_CPPFLAGS) -I$(top_srcdir)/libglusterfs/src \
+	-I$(top_srcdir)/rpc/xdr/src -I$(top_builddir)/rpc/xdr/src
+
+AM_CFLAGS = -Wall $(GF_CFLAGS)
+
+CLEANFILES =
diff --git a/xlators/features/gfid-access/src/gfid-access-mem-types.h b/xlators/features/gfid-access/src/gfid-access-mem-types.h
new file mode 100644
index 00000000000..1c4d0b93de2
--- /dev/null
+++ b/xlators/features/gfid-access/src/gfid-access-mem-types.h
@@ -0,0 +1,22 @@
+/*
+   Copyright (c) 2013 Red Hat, Inc. <http://www.redhat.com>
+   This file is part of GlusterFS.
+
+   This file is licensed to you under your choice of the GNU Lesser
+   General Public License, version 3 or any later version (LGPLv3 or
+   later), or the GNU General Public License, version 2 (GPLv2), in all
+   cases as published by the Free Software Foundation.
+*/
+
+#ifndef _GFID_ACCESS_MEM_TYPES_H
+#define _GFID_ACCESS_MEM_TYPES_H
+
+#include <glusterfs/mem-types.h>
+
+/*
+ * Memory-accounting type ids for the gfid-access translator; numbering
+ * continues after the common types (gf_common_mt_end).
+ *
+ * NOTE(review): the enum tag says "changelog" although every enumerator is
+ * gfid-access specific -- this looks like a copy/paste leftover. Confirm no
+ * other header uses the same tag before renaming it.
+ */
+enum gf_changelog_mem_types {
+    gf_gfid_access_mt_priv_t = gf_common_mt_end + 1,
+    gf_gfid_access_mt_gfid_t,
+    gf_gfid_access_mt_end /* end marker, one past the last real type */
+};
+
+#endif
diff --git a/xlators/features/gfid-access/src/gfid-access.c b/xlators/features/gfid-access/src/gfid-access.c
new file mode 100644
index 00000000000..3fea5672a21
--- /dev/null
+++ b/xlators/features/gfid-access/src/gfid-access.c
@@ -0,0 +1,1420 @@
+/*
+   Copyright (c) 2013 Red Hat, Inc. <http://www.redhat.com>
+   This file is part of GlusterFS.
+
+   This file is licensed to you under your choice of the GNU Lesser
+   General Public License, version 3 or any later version (LGPLv3 or
+   later), or the GNU General Public License, version 2 (GPLv2), in all
+   cases as published by the Free Software Foundation.
+*/
+#include "gfid-access.h"
+#include <glusterfs/inode.h>
+#include <glusterfs/byte-order.h>
+#include <glusterfs/statedump.h>
+
+/*
+ * Copy @src into @dst and then replace any inode in @dst that carries a
+ * context value for this xlator by the inode stored in that context
+ * (refreshing the matching gfid field as well).
+ *
+ * The parent is examined first; if it has no context the function returns
+ * 0 right away and dst->inode is left as copied.
+ *
+ * Returns a negative value if loc_copy() fails, otherwise the (non-negative)
+ * status of the last context lookup performed.
+ */
+int
+ga_valid_inode_loc_copy(loc_t *dst, loc_t *src, xlator_t *this)
+{
+    uint64_t ctx = 0;
+    int status = 0;
+
+    status = loc_copy(dst, src);
+    if (status < 0)
+        return status;
+
+    if (dst->parent) {
+        status = inode_ctx_get(dst->parent, this, &ctx);
+        if (status < 0)
+            return 0; /* no context: already a real inode */
+
+        /* swap the copied reference for one on the stored inode */
+        inode_unref(dst->parent);
+        dst->parent = inode_ref((inode_t *)(uintptr_t)ctx);
+        gf_uuid_copy(dst->pargfid, dst->parent->gfid);
+    }
+
+    if (dst->inode) {
+        status = inode_ctx_get(dst->inode, this, &ctx);
+        if (status < 0)
+            return 0; /* no context: already a real inode */
+
+        inode_unref(dst->inode);
+        dst->inode = inode_ref((inode_t *)(uintptr_t)ctx);
+        gf_uuid_copy(dst->gfid, dst->inode->gfid);
+    }
+
+    return status;
+}
+
+/*
+ * Release a ga_newfile_args_t: frees the basename, frees the symlink target
+ * when the entry type is a symlink, and returns the struct to its mem-pool.
+ * A NULL @args is accepted and ignored.
+ */
+void
+ga_newfile_args_free(ga_newfile_args_t *args)
+{
+    if (args == NULL)
+        return;
+
+    GF_FREE(args->bname);
+
+    if (S_ISLNK(args->st_mode)) {
+        if (args->args.symlink.linkpath) {
+            GF_FREE(args->args.symlink.linkpath);
+            args->args.symlink.linkpath = NULL;
+        }
+    }
+
+    mem_put(args);
+}
+
+/*
+ * Release a ga_heal_args_t: frees the basename and returns the struct to
+ * its mem-pool. A NULL @args is accepted and ignored.
+ */
+void
+ga_heal_args_free(ga_heal_args_t *args)
+{
+    if (args == NULL)
+        return;
+
+    GF_FREE(args->bname);
+    mem_put(args);
+}
+
+/*
+ * Decode the packed "new file" request carried in @data into a freshly
+ * allocated ga_newfile_args_t (from priv->newfile_args_pool).
+ *
+ * Layout consumed, in order (32-bit fields are converted with ntoh32()):
+ *   uid (uint32), gid (uint32), gfid (sizeof(args->gfid) bytes),
+ *   st_mode (uint32), NUL-terminated basename, then depending on st_mode:
+ *     directory : mode (uint32), umask (uint32)
+ *     symlink   : NUL-terminated link target
+ *     otherwise : mode (uint32), rdev (uint32), umask (uint32)
+ * Any bytes left over after that make the blob invalid.
+ *
+ * Returns the parsed args (free with ga_newfile_args_free()), or NULL on a
+ * malformed blob or allocation failure.
+ *
+ * NOTE(review): args->gfid is logged with "%s" right after a raw memcpy();
+ * confirm the sender guarantees NUL termination within sizeof(args->gfid).
+ */
+ga_newfile_args_t *
+ga_newfile_parse_args(xlator_t *this, data_t *data)
+{
+    ga_newfile_args_t *args = NULL;
+    ga_private_t *priv = NULL;
+    int len = 0;
+    int blob_len = 0;
+    int min_len = 0;
+    void *blob = NULL;
+
+    priv = this->private;
+
+    /* blob is the read cursor, blob_len the number of bytes still unread */
+    blob = data->data;
+    blob_len = data->len;
+
+    /* fixed header plus "2 + 2" -- presumably the shortest basename (one
+     * character and its NUL) and a minimal type-specific tail; TODO confirm */
+    min_len = sizeof(args->uid) + sizeof(args->gid) + sizeof(args->gfid) +
+              sizeof(args->st_mode) + 2 + 2;
+    if (blob_len < min_len) {
+        gf_log(this->name, GF_LOG_ERROR,
+               "Invalid length: Total length is less "
+               "than minimum length.");
+        goto err;
+    }
+
+    args = mem_get0(priv->newfile_args_pool);
+    if (args == NULL)
+        goto err;
+
+    args->uid = ntoh32(*(uint32_t *)blob);
+    blob += sizeof(uint32_t);
+    blob_len -= sizeof(uint32_t);
+
+    args->gid = ntoh32(*(uint32_t *)blob);
+    blob += sizeof(uint32_t);
+    blob_len -= sizeof(uint32_t);
+
+    memcpy(args->gfid, blob, sizeof(args->gfid));
+    blob += sizeof(args->gfid);
+    blob_len -= sizeof(args->gfid);
+
+    args->st_mode = ntoh32(*(uint32_t *)blob);
+    blob += sizeof(uint32_t);
+    blob_len -= sizeof(uint32_t);
+
+    /* basename: strnlen() == blob_len means no NUL inside the remaining
+     * bytes, i.e. the string is unterminated */
+    len = strnlen(blob, blob_len);
+    if (len == blob_len) {
+        gf_log(this->name, GF_LOG_ERROR, "gfid: %s. No null byte present.",
+               args->gfid);
+        goto err;
+    }
+
+    args->bname = GF_MALLOC(len + 1, gf_common_mt_char);
+    if (args->bname == NULL)
+        goto err;
+
+    /* len + 1 copies the terminating NUL as well */
+    memcpy(args->bname, blob, (len + 1));
+    blob += (len + 1);
+    blob_len -= (len + 1);
+
+    if (S_ISDIR(args->st_mode)) {
+        if (blob_len < sizeof(uint32_t)) {
+            gf_log(this->name, GF_LOG_ERROR, "gfid: %s. Invalid length",
+                   args->gfid);
+            goto err;
+        }
+        args->args.mkdir.mode = ntoh32(*(uint32_t *)blob);
+        blob += sizeof(uint32_t);
+        blob_len -= sizeof(uint32_t);
+
+        if (blob_len < sizeof(uint32_t)) {
+            gf_log(this->name, GF_LOG_ERROR, "gfid: %s. Invalid length",
+                   args->gfid);
+            goto err;
+        }
+        args->args.mkdir.umask = ntoh32(*(uint32_t *)blob);
+        blob_len -= sizeof(uint32_t);
+        /* NOTE(review): appears unreachable -- the check just above already
+         * guaranteed at least sizeof(uint32_t) bytes before subtracting */
+        if (blob_len < 0) {
+            gf_log(this->name, GF_LOG_ERROR, "gfid: %s. Invalid length",
+                   args->gfid);
+            goto err;
+        }
+    } else if (S_ISLNK(args->st_mode)) {
+        /* symlink target must be NUL-terminated within the blob */
+        len = strnlen(blob, blob_len);
+        if (len == blob_len) {
+            gf_log(this->name, GF_LOG_ERROR, "gfid: %s. Invalid length",
+                   args->gfid);
+            goto err;
+        }
+        args->args.symlink.linkpath = GF_MALLOC(len + 1, gf_common_mt_char);
+        if (args->args.symlink.linkpath == NULL)
+            goto err;
+
+        memcpy(args->args.symlink.linkpath, blob, (len + 1));
+        blob_len -= (len + 1);
+    } else {
+        /* every other file type is described by mknod-style arguments */
+        if (blob_len < sizeof(uint32_t)) {
+            gf_log(this->name, GF_LOG_ERROR, "gfid: %s. Invalid length",
+                   args->gfid);
+            goto err;
+        }
+        args->args.mknod.mode = ntoh32(*(uint32_t *)blob);
+        blob += sizeof(uint32_t);
+        blob_len -= sizeof(uint32_t);
+
+        if (blob_len < sizeof(uint32_t)) {
+            gf_log(this->name, GF_LOG_ERROR, "gfid: %s. Invalid length",
+                   args->gfid);
+            goto err;
+        }
+        args->args.mknod.rdev = ntoh32(*(uint32_t *)blob);
+        blob += sizeof(uint32_t);
+        blob_len -= sizeof(uint32_t);
+
+        if (blob_len < sizeof(uint32_t)) {
+            gf_log(this->name, GF_LOG_ERROR, "gfid: %s. Invalid length",
+                   args->gfid);
+            goto err;
+        }
+        args->args.mknod.umask = ntoh32(*(uint32_t *)blob);
+        blob_len -= sizeof(uint32_t);
+    }
+
+    /* the blob must have been consumed exactly; trailing bytes are an error */
+    if (blob_len) {
+        gf_log(this->name, GF_LOG_ERROR, "gfid: %s. Invalid length",
+               args->gfid);
+        goto err;
+    }
+
+    return args;
+
+err:
+    if (args)
+        ga_newfile_args_free(args);
+
+    return NULL;
+}
+
+/*
+ * Decode the packed "heal" request in @data: sizeof(args->gfid) bytes of
+ * gfid followed by a NUL-terminated basename and nothing else.
+ *
+ * Returns a ga_heal_args_t taken from priv->heal_args_pool (release it with
+ * ga_heal_args_free()), or NULL if the blob is too short, the basename is
+ * unterminated, bytes are left over, or an allocation fails.
+ */
+ga_heal_args_t *
+ga_heal_parse_args(xlator_t *this, data_t *data)
+{
+    ga_private_t *priv = this->private;
+    ga_heal_args_t *heal = NULL;
+    void *cursor = data->data;
+    int remaining = data->len;
+    int name_len = 0;
+
+    /* bname should at least contain a character */
+    if (remaining < (sizeof(heal->gfid) + 2))
+        return NULL;
+
+    heal = mem_get0(priv->heal_args_pool);
+    if (!heal)
+        return NULL;
+
+    memcpy(heal->gfid, cursor, sizeof(heal->gfid));
+    cursor += sizeof(heal->gfid);
+    remaining -= sizeof(heal->gfid);
+
+    /* a length equal to the remaining bytes means no NUL was found */
+    name_len = strnlen(cursor, remaining);
+    if (name_len == remaining)
+        goto fail;
+
+    heal->bname = GF_MALLOC(name_len + 1, gf_common_mt_char);
+    if (!heal->bname)
+        goto fail;
+
+    memcpy(heal->bname, cursor, name_len);
+    heal->bname[name_len] = '\0';
+    remaining -= (name_len + 1);
+
+    /* anything after the basename's NUL makes the blob invalid */
+    if (remaining)
+        goto fail;
+
+    return heal;
+
+fail:
+    ga_heal_args_free(heal);
+    return NULL;
+}
+
+/*
+ * Build in @new_loc a parent/basename loc for the entry @bname inside the
+ * directory referred to by @loc, and request @gfid for it by storing a copy
+ * under the "gfid-req" key of @xdata.
+ *
+ * The parent is the inode stored in @loc->inode's context for this xlator
+ * when such a context exists and that inode has a non-null gfid; otherwise
+ * @loc->inode itself is used.
+ *
+ * Returns 0 on success and a non-zero value on failure (allocation failure
+ * or the dict set failing). On failure @new_loc may already hold inode
+ * references and a path, so the caller must still loc_wipe() it.
+ */
+static int32_t
+ga_fill_tmp_loc(loc_t *loc, xlator_t *this, uuid_t gfid, char *bname,
+                dict_t *xdata, loc_t *new_loc)
+{
+    int ret = -1;
+    uint64_t value = 0;
+    inode_t *parent = NULL;
+    unsigned char *gfid_ptr = NULL;
+
+    parent = loc->inode;
+    ret = inode_ctx_get(loc->inode, this, &value);
+    if (!ret) {
+        parent = (void *)(uintptr_t)value;
+        if (gf_uuid_is_null(parent->gfid))
+            parent = loc->inode;
+    }
+
+    /* parent itself should be looked up */
+    gf_uuid_copy(new_loc->pargfid, parent->gfid);
+    new_loc->parent = inode_ref(parent);
+
+    /* reuse an inode already known under this name, else create one */
+    new_loc->inode = inode_grep(parent->table, parent, bname);
+    if (!new_loc->inode) {
+        new_loc->inode = inode_new(parent->table);
+        if (!new_loc->inode) {
+            /* inode_new() failed; do not dereference the NULL inode */
+            ret = -1;
+            goto out;
+        }
+        gf_uuid_copy(new_loc->inode->gfid, gfid);
+    }
+
+    loc_path(new_loc, bname);
+    if (new_loc->path) {
+        /* name points at the last path component inside new_loc->path */
+        new_loc->name = strrchr(new_loc->path, '/');
+        if (new_loc->name)
+            new_loc->name++;
+    }
+
+    gfid_ptr = GF_MALLOC(sizeof(uuid_t), gf_common_mt_uuid_t);
+    if (!gfid_ptr) {
+        ret = -1;
+        goto out;
+    }
+    gf_uuid_copy(gfid_ptr, gfid);
+    ret = dict_set_gfuuid(xdata, "gfid-req", gfid_ptr, false);
+    if (ret < 0)
+        goto out;
+
+    ret = 0;
+
+out:
+    /* the heap copy is released here only when an error occurred after it
+     * was allocated */
+    if (ret && gfid_ptr)
+        GF_FREE(gfid_ptr);
+    return ret;
+}
+
+/*
+ * Tell whether @gfid is the gfid of the auxiliary gfid-access directory:
+ * fifteen zero bytes followed by GF_AUX_GFID.
+ */
+static gf_boolean_t
+__is_gfid_access_dir(uuid_t gfid)
+{
+    static uuid_t aux_gfid = {0, 0, 0, 0, 0, 0, 0, 0,
+                              0, 0, 0, 0, 0, 0, 0, GF_AUX_GFID};
+
+    return (gf_uuid_compare(gfid, aux_gfid) == 0) ? _gf_true : _gf_false;
+}
+
+/*
+ * forget handler: remove this xlator's context from @inode and, if one was
+ * present, drop the reference held on the inode stored in it.
+ * Always returns 0.
+ */
+int32_t
+ga_forget(xlator_t *this, inode_t *inode)
+{
+    uint64_t ctx = 0;
+
+    if (inode_ctx_del(inode, this, &ctx) == 0) {
+        inode_t *stored = (void *)(uintptr_t)ctx;
+
+        inode_unref(stored);
+    }
+
+    return 0;
+}
+
+/*
+ * Lookup callback for the heal request: destroys the copied stack and
+ * answers the original setxattr frame (kept in frame->local) with the
+ * lookup's op_ret/op_errno and @dict.
+ */
+static int
+ga_heal_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret,
+            int32_t op_errno, inode_t *inode, struct iatt *stat, dict_t *dict,
+            struct iatt *postparent)
+{
+    call_frame_t *setxattr_frame = frame->local;
+
+    /* detach the original frame before the copied stack goes away */
+    frame->local = NULL;
+
+    /* Inode linking and the like are left to the next lookup. */
+    STACK_DESTROY(frame->root);
+
+    STACK_UNWIND_STRICT(setxattr, setxattr_frame, op_ret, op_errno, dict);
+
+    return 0;
+}
+
+/*
+ * Completion callback of the entry-creating fop: destroys the copied stack,
+ * answers the original setxattr frame (local->orig_frame) with the fop's
+ * result, then releases everything owned by the local.
+ */
+static int
+ga_newentry_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+                int32_t op_ret, int32_t op_errno, inode_t *inode,
+                struct iatt *buf, struct iatt *preparent,
+                struct iatt *postparent, dict_t *xdata)
+{
+    ga_local_t *ctx = frame->local;
+
+    /* Inode linking and the like are left to the next lookup. */
+    frame->local = NULL;
+    STACK_DESTROY(frame->root);
+
+    STACK_UNWIND_STRICT(setxattr, ctx->orig_frame, op_ret, op_errno, xdata);
+
+    /* release the resources held by the local */
+    if (ctx->xdata)
+        dict_unref(ctx->xdata);
+    loc_wipe(&ctx->loc);
+    mem_put(ctx);
+
+    return 0;
+}
+
+/*
+ * Callback of the named lookup sent before creating a non-directory,
+ * non-symlink entry. When the lookup succeeded, or failed with ENOENT or
+ * ESTALE, the mknod is wound with the arguments saved in the local. Any
+ * other failure is reported to the original setxattr frame and the local
+ * is released.
+ */
+static int
+ga_newentry_lookup_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+                       int32_t op_ret, int32_t op_errno, inode_t *inode,
+                       struct iatt *stat, dict_t *xdata,
+                       struct iatt *postparent)
+
+{
+    ga_local_t *ctx = frame->local;
+
+    if ((op_ret >= 0) || (op_errno == ENOENT) || (op_errno == ESTALE)) {
+        STACK_WIND(frame, ga_newentry_cbk, FIRST_CHILD(this),
+                   FIRST_CHILD(this)->fops->mknod, &ctx->loc, ctx->mode,
+                   ctx->rdev, ctx->umask, ctx->xdata);
+        return 0;
+    }
+
+    /* unexpected lookup failure: give up and answer the setxattr */
+    frame->local = NULL;
+    STACK_DESTROY(frame->root);
+    STACK_UNWIND_STRICT(setxattr, ctx->orig_frame, op_ret, op_errno, xdata);
+
+    if (ctx->xdata)
+        dict_unref(ctx->xdata);
+    loc_wipe(&ctx->loc);
+    mem_put(ctx);
+
+    return 0;
+}
+
+/*
+ * Create the entry described by the packed blob in @data (see
+ * ga_newfile_parse_args()) under the directory @loc, with the gfid carried
+ * in the blob.
+ *
+ * The fop is wound on a copy of @frame whose uid/gid are taken from the
+ * blob: mkdir for directories, symlink for symlinks, and for every other
+ * type a named lookup followed (from its callback) by mknod.
+ *
+ * Returns 0 when a fop was wound; the setxattr on @frame is then answered
+ * from the callback. Returns non-zero when nothing was wound, in which case
+ * the caller still owns @frame and must unwind it.
+ */
+int32_t
+ga_new_entry(call_frame_t *frame, xlator_t *this, loc_t *loc, data_t *data,
+             dict_t *xdata)
+{
+    int ret = -1;
+    ga_newfile_args_t *args = NULL;
+    loc_t tmp_loc = {
+        0,
+    };
+    call_frame_t *new_frame = NULL;
+    ga_local_t *local = NULL;
+    uuid_t gfid = {
+        0,
+    };
+
+    /* take our own reference (or a fresh dict): "gfid-req" is added to it
+     * and it is dropped again at 'out' */
+    if (!xdata) {
+        xdata = dict_new();
+    } else {
+        xdata = dict_ref(xdata);
+    }
+
+    if (!xdata) {
+        ret = -1;
+        goto out;
+    }
+
+    args = ga_newfile_parse_args(this, data);
+    if (!args)
+        goto out;
+
+    ret = gf_uuid_parse(args->gfid, gfid);
+    if (ret)
+        goto out;
+
+    ret = ga_fill_tmp_loc(loc, this, gfid, args->bname, xdata, &tmp_loc);
+    if (ret)
+        goto out;
+
+    new_frame = copy_frame(frame);
+    if (!new_frame) {
+        /* ret is 0 here; report the failure so the caller unwinds */
+        ret = -1;
+        goto out;
+    }
+
+    local = mem_get0(this->local_pool);
+    if (!local) {
+        STACK_DESTROY(new_frame->root);
+        ret = -1;
+        goto out;
+    }
+    local->orig_frame = frame;
+
+    if (loc_copy(&local->loc, &tmp_loc) < 0) {
+        loc_wipe(&local->loc);
+        mem_put(local);
+        STACK_DESTROY(new_frame->root);
+        ret = -1;
+        goto out;
+    }
+
+    /* the entry is created with the credentials carried in the request */
+    new_frame->local = local;
+    new_frame->root->uid = args->uid;
+    new_frame->root->gid = args->gid;
+
+    if (S_ISDIR(args->st_mode)) {
+        STACK_WIND(new_frame, ga_newentry_cbk, FIRST_CHILD(this),
+                   FIRST_CHILD(this)->fops->mkdir, &tmp_loc,
+                   args->args.mkdir.mode, args->args.mkdir.umask, xdata);
+    } else if (S_ISLNK(args->st_mode)) {
+        STACK_WIND(new_frame, ga_newentry_cbk, FIRST_CHILD(this),
+                   FIRST_CHILD(this)->fops->symlink,
+                   args->args.symlink.linkpath, &tmp_loc, 0, xdata);
+    } else {
+        /* use 07777 (4 7s) for considering the Sticky bits etc) */
+        local->mode = (S_IFMT & args->st_mode) |
+                      (07777 & args->args.mknod.mode);
+
+        local->umask = args->args.mknod.umask;
+        local->rdev = args->args.mknod.rdev;
+        /* kept for the mknod wound from ga_newentry_lookup_cbk */
+        local->xdata = dict_ref(xdata);
+
+        /* send a named lookup, so that dht can cleanup up stale linkto
+         * files etc.
+         */
+        STACK_WIND(new_frame, ga_newentry_lookup_cbk, FIRST_CHILD(this),
+                   FIRST_CHILD(this)->fops->lookup, &tmp_loc, NULL);
+    }
+
+    ret = 0;
+out:
+    ga_newfile_args_free(args);
+
+    if (xdata)
+        dict_unref(xdata);
+
+    loc_wipe(&tmp_loc);
+
+    return ret;
+}
+
+/*
+ * Handle a heal request: parse the gfid and basename from @data (see
+ * ga_heal_parse_args()) and wind, on a copy of @frame, a named lookup for
+ * that basename under @loc with the gfid passed as "gfid-req" in xdata.
+ *
+ * Returns 0 when the lookup was wound; the setxattr on @frame is then
+ * answered from ga_heal_cbk. Returns non-zero when nothing was wound, in
+ * which case the caller still owns @frame and must unwind it.
+ */
+int32_t
+ga_heal_entry(call_frame_t *frame, xlator_t *this, loc_t *loc, data_t *data,
+              dict_t *xdata)
+{
+    int ret = -1;
+    ga_heal_args_t *args = NULL;
+    loc_t tmp_loc = {
+        0,
+    };
+    call_frame_t *new_frame = NULL;
+    uuid_t gfid = {
+        0,
+    };
+
+    /* Take our reference before anything can jump to 'out': the cleanup
+     * there unrefs xdata unconditionally, so it must never run on a
+     * reference this function does not own. */
+    if (!xdata)
+        xdata = dict_new();
+    else
+        xdata = dict_ref(xdata);
+
+    if (!xdata) {
+        ret = -1;
+        goto out;
+    }
+
+    args = ga_heal_parse_args(this, data);
+    if (!args)
+        goto out;
+
+    ret = gf_uuid_parse(args->gfid, gfid);
+    if (ret)
+        goto out;
+
+    ret = ga_fill_tmp_loc(loc, this, gfid, args->bname, xdata, &tmp_loc);
+    if (ret)
+        goto out;
+
+    new_frame = copy_frame(frame);
+    if (!new_frame) {
+        /* ret is 0 here; report the failure so the caller unwinds */
+        ret = -1;
+        goto out;
+    }
+
+    /* ga_heal_cbk answers the original frame through frame->local */
+    new_frame->local = (void *)frame;
+
+    STACK_WIND(new_frame, ga_heal_cbk, FIRST_CHILD(this),
+               FIRST_CHILD(this)->fops->lookup, &tmp_loc, xdata);
+
+    ret = 0;
+out:
+    if (args)
+        ga_heal_args_free(args);
+
+    loc_wipe(&tmp_loc);
+
+    if (xdata)
+        dict_unref(xdata);
+
+    return ret;
+}
+
+/*
+ * setxattr callback: passes the child's result straight back to the caller.
+ */
+int32_t
+ga_setxattr_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+                int32_t op_ret, int32_t op_errno, dict_t *xdata)
+{
+    STACK_UNWIND_STRICT(setxattr, frame, op_ret, op_errno, xdata);
+    return 0;
+}
+
+/*
+ * setxattr entry point.
+ *
+ * Two keys in @dict are intercepted and never reach the child as a
+ * setxattr: GF_FUSE_AUX_GFID_NEWFILE is handed to ga_new_entry() and
+ * GF_FUSE_AUX_GFID_HEAL to ga_heal_entry(). When either helper returns 0 it
+ * has wound its own fop and the reply is sent from that fop's callback.
+ *
+ * Any other request is forwarded to the first child on a copy of @loc
+ * produced by ga_valid_inode_loc_copy().
+ *
+ * Failures are unwound with op_ret -1 and op_errno, which is ENOMEM unless
+ * changed by GFID_ACCESS_INODE_OP_CHECK.
+ */
+int32_t
+ga_setxattr(call_frame_t *frame, xlator_t *this, loc_t *loc, dict_t *dict,
+            int32_t flags, dict_t *xdata)
+{
+    data_t *data = NULL;
+    int op_errno = ENOMEM;
+    int ret = 0;
+    loc_t ga_loc = {
+        0,
+    };
+
+    /* macro defined outside this file -- presumably sets op_errno and jumps
+     * to 'err' for operations that are not allowed on this loc; TODO
+     * confirm against gfid-access.h */
+    GFID_ACCESS_INODE_OP_CHECK(loc, op_errno, err);
+
+    data = dict_get(dict, GF_FUSE_AUX_GFID_NEWFILE);
+    if (data) {
+        ret = ga_new_entry(frame, this, loc, data, xdata);
+        if (ret)
+            goto err;
+        return 0;
+    }
+
+    data = dict_get(dict, GF_FUSE_AUX_GFID_HEAL);
+    if (data) {
+        ret = ga_heal_entry(frame, this, loc, data, xdata);
+        if (ret)
+            goto err;
+        return 0;
+    }
+
+    // If the inode is a virtual inode change the inode otherwise perform
+    // the operation on same inode
+    ret = ga_valid_inode_loc_copy(&ga_loc, loc, this);
+    if (ret < 0)
+        goto err;
+
+    STACK_WIND(frame, ga_setxattr_cbk, FIRST_CHILD(this),
+               FIRST_CHILD(this)->fops->setxattr, &ga_loc, dict, flags, xdata);
+
+    /* the copy is only needed for the duration of the wind */
+    loc_wipe(&ga_loc);
+    return 0;
+err:
+    STACK_UNWIND_STRICT(setxattr, frame, -1, op_errno, xdata);
+    return 0;
+}
+
+/*
+ * Lookup callback that answers with a separate inode (cbk_inode) instead of
+ * the one the child returned, for successful lookups of directories.
+ *
+ * cbk_inode is frame->local when that is set, otherwise a new reference on
+ * @inode. For a successful directory lookup where cbk_inode is @inode
+ * itself, the inode to remember is either the one already in the table for
+ * buf->ia_gfid or @inode linked into the table; cbk_inode is then replaced
+ * by a brand new inode when no table entry existed, and the remembered
+ * inode is stored in cbk_inode's context for this xlator. The gfid reported
+ * in @buf is replaced by cbk_inode's gfid when it has one, or by a freshly
+ * generated gfid otherwise.
+ *
+ * Non-directories and failed lookups are passed through unchanged, except
+ * that ESTALE is reported as ENOENT.
+ */
+int32_t
+ga_virtual_lookup_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+                      int32_t op_ret, int32_t op_errno, inode_t *inode,
+                      struct iatt *buf, dict_t *xdata, struct iatt *postparent)
+{
+    int ret = 0;
+    inode_t *cbk_inode = NULL;
+    inode_t *true_inode = NULL;
+    uuid_t random_gfid = {
+        0,
+    };
+    inode_t *linked_inode = NULL;
+
+    /* either way cbk_inode carries a reference that is dropped at the end */
+    if (frame->local)
+        cbk_inode = frame->local;
+    else
+        cbk_inode = inode_ref(inode);
+
+    frame->local = NULL;
+    if (op_ret)
+        goto unwind;
+
+    /* only directories get a substitute inode and gfid */
+    if (!IA_ISDIR(buf->ia_type))
+        goto unwind;
+
+    /* need to send back a different inode for linking in itable */
+    if (cbk_inode == inode) {
+        /* check if the inode is in the 'itable' or
+           if its just previously discover()'d inode */
+        true_inode = inode_find(inode->table, buf->ia_gfid);
+        if (!true_inode) {
+            /* This unref is for 'inode_ref()' done in beginning.
+               This is needed as cbk_inode is allocated new inode
+               whose unref is taken at the end*/
+            inode_unref(cbk_inode);
+            cbk_inode = inode_new(inode->table);
+
+            if (!cbk_inode) {
+                op_ret = -1;
+                op_errno = ENOMEM;
+                goto unwind;
+            }
+            /* the inode is not present in itable, ie, the actual
+               path is not yet looked up. Use the current inode
+               itself for now */
+
+            linked_inode = inode_link(inode, NULL, NULL, buf);
+            inode = linked_inode;
+        } else {
+            /* 'inode_ref()' has been done in inode_find() */
+            inode = true_inode;
+        }
+
+        /* remember the real inode in the substitute's context; on success
+         * the reference held on 'inode' stays with the context, on failure
+         * it is dropped right here */
+        ret = inode_ctx_put(cbk_inode, this, (uint64_t)(uintptr_t)inode);
+        if (ret) {
+            gf_log(this->name, GF_LOG_WARNING,
+                   "failed to set the inode ctx with"
+                   "the actual inode");
+            if (inode)
+                inode_unref(inode);
+        }
+        inode = NULL;
+    }
+
+    if (!gf_uuid_is_null(cbk_inode->gfid)) {
+        /* if the previous linked inode is used, use the
+           same gfid */
+        gf_uuid_copy(random_gfid, cbk_inode->gfid);
+    } else {
+        /* replace the buf->ia_gfid to a random gfid
+           for directory, for files, what we received is fine */
+        gf_uuid_generate(random_gfid);
+    }
+
+    gf_uuid_copy(buf->ia_gfid, random_gfid);
+
+    /* keep the reported inode number in step with the substituted gfid */
+    buf->ia_ino = gfid_to_ino(buf->ia_gfid);
+
+unwind:
+    /* Lookup on non-existing gfid returns ESTALE.
+       Convert into ENOENT for virtual lookup*/
+    if (op_errno == ESTALE)
+        op_errno = ENOENT;
+
+    STACK_UNWIND_STRICT(lookup, frame, op_ret, op_errno, cbk_inode, buf, xdata,
+                        postparent);
+
+    /* Also handles inode_unref of frame->local if done in ga_lookup */
+    if (cbk_inode)
+        inode_unref(cbk_inode);
+
+    return 0;
+}
+
+/* Lookup callback for the regular (non-virtual) path.
+
+   When the child reports a successful lookup of the root gfid, the
+   cached root stat and the fake ".gfid" directory stat held in the
+   private structure are refreshed.  In every case the child's answer
+   is then unwound to the caller unchanged. */
+int32_t
+ga_lookup_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret,
+              int32_t op_errno, inode_t *inode, struct iatt *buf, dict_t *xdata,
+              struct iatt *postparent)
+{
+    ga_private_t *conf = this->private;
+
+    if (!op_ret && __is_root_gfid(buf->ia_gfid)) {
+        /* Re-copied on every root lookup so that the cached 'stat'
+           info always shows the latest values. */
+        conf->root_stbuf = *buf;
+
+        /* The ".gfid" stat is the root stat with the last gfid byte
+           and the inode number replaced by GF_AUX_GFID. */
+        conf->gfiddir_stbuf = conf->root_stbuf;
+        conf->gfiddir_stbuf.ia_gfid[15] = GF_AUX_GFID;
+        conf->gfiddir_stbuf.ia_ino = GF_AUX_GFID;
+    }
+
+    STACK_UNWIND_STRICT(lookup, frame, op_ret, op_errno, inode, buf, xdata,
+                        postparent);
+    return 0;
+}
+
+int32_t
+ga_lookup(call_frame_t *frame, xlator_t *this, loc_t *loc, dict_t *xdata)
+{
+    ga_private_t *priv = NULL;
+    int ret = -1;
+    uuid_t tmp_gfid = {
+        0,
+    };
+    loc_t tmp_loc = {
+        0,
+    };
+    uint64_t value = 0;
+    inode_t *inode = NULL;
+    inode_t *true_inode = NULL;
+    int32_t op_errno = ENOENT; /* errno reported by the 'err' unwind */
+    /* Lookup fop: serves ".gfid", ".gfid/<gfid>" and ordinary paths. */
+    priv = this->private;
+
+    /* Handle nameless lookup on ".gfid": answered with the fake stat */
+    if (!loc->parent && __is_gfid_access_dir(loc->gfid)) {
+        STACK_UNWIND_STRICT(lookup, frame, 0, 0, loc->inode,
+                            &priv->gfiddir_stbuf, xdata, &priv->root_stbuf);
+        return 0;
+    }
+
+    /* if it's a discover() (no basename), no action is needed here */
+    if (!loc->name)
+        goto wind;
+
+    /* if it's a revalidate, and the inode is not of type directory,
+       proceed with 'wind' */
+    if (loc->inode && loc->inode->ia_type && !IA_ISDIR(loc->inode->ia_type)) {
+        /* a revalidate on ".gfid/<dentry>" is possible, check for it */
+        if (((loc->parent && __is_gfid_access_dir(loc->parent->gfid)) ||
+             __is_gfid_access_dir(loc->pargfid))) {
+            /* here, just send the inode and the gfid it carries */
+            tmp_loc.inode = inode_ref(loc->inode);
+            gf_uuid_copy(tmp_loc.gfid, loc->inode->gfid);
+
+            STACK_WIND(frame, default_lookup_cbk, FIRST_CHILD(this),
+                       FIRST_CHILD(this)->fops->lookup, &tmp_loc, xdata);
+
+            inode_unref(tmp_loc.inode);
+
+            return 0;
+        }
+
+        /* parent is not ".gfid": nothing special, continue the flow */
+        goto wind;
+    }
+
+    /* need to check if the lookup is on virtual dir */
+    if ((loc->name && !strcmp(GF_GFID_DIR, loc->name)) &&
+        ((loc->parent && __is_root_gfid(loc->parent->gfid)) ||
+         __is_root_gfid(loc->pargfid))) {
+        /* this means, the query is on '/.gfid', return the fake stat,
+           and say success */
+
+        STACK_UNWIND_STRICT(lookup, frame, 0, 0, loc->inode,
+                            &priv->gfiddir_stbuf, xdata, &priv->root_stbuf);
+        return 0;
+    }
+
+    /* parent is not ".gfid": if the parent carries our inode context,
+       redo the lookup under the inode stored in that context */
+    if (!((loc->parent && __is_gfid_access_dir(loc->parent->gfid)) ||
+          __is_gfid_access_dir(loc->pargfid))) {
+        if (!loc->parent)
+            goto wind;
+
+        ret = inode_ctx_get(loc->parent, this, &value);
+        if (ret)
+            goto wind;
+
+        inode = (inode_t *)(uintptr_t)value;
+
+        ret = loc_copy_overload_parent(&tmp_loc, loc, inode);
+        if (ret)
+            goto err;
+
+        STACK_WIND(frame, ga_lookup_cbk, FIRST_CHILD(this),
+                   FIRST_CHILD(this)->fops->lookup, &tmp_loc, xdata);
+
+        loc_wipe(&tmp_loc);
+        return 0;
+    }
+
+    /* make sure the 'basename' is actually a 'canonical-gfid',
+       otherwise, return error */
+    ret = gf_uuid_parse(loc->name, tmp_gfid);
+    if (ret)
+        goto err;
+
+    /* if it's a fresh lookup, go ahead and send it down, if not,
+       for directory, we need indirection to actual dir inode */
+    if (!(loc->inode && loc->inode->ia_type))
+        goto discover;
+
+    /* revalidate on directory */
+    ret = inode_ctx_get(loc->inode, this, &value);
+    if (ret)
+        goto err;
+
+    inode = (void *)(uintptr_t)value;
+
+    /* valid inode, already looked up, work on that */
+    if (inode->ia_type)
+        goto discover;
+
+    /* check if the inode is in the 'itable' or
+       if it's just a previously discover()'d inode */
+    true_inode = inode_find(loc->inode->table, tmp_gfid);
+    if (true_inode) {
+        /* time to do another lookup and update the context
+           with the proper inode */
+        op_errno = ESTALE;
+        /* 'inode_ref()' done in inode_find */
+        inode_unref(true_inode);
+        goto err;
+    }
+
+discover:
+    /* for the virtual entries, we don't need to send 'gfid-req' key, as
+       for these entries, we don't want to 'set' a new gfid */
+    if (xdata)
+        dict_del(xdata, "gfid-req");
+
+    gf_uuid_copy(tmp_loc.gfid, tmp_gfid);
+
+    /* if revalidate, then we need to have the proper reference */
+    if (inode) {
+        tmp_loc.inode = inode_ref(inode);
+        frame->local = inode_ref(loc->inode); /* unref'd by the callback */
+    } else {
+        tmp_loc.inode = inode_ref(loc->inode);
+    }
+
+    STACK_WIND(frame, ga_virtual_lookup_cbk, FIRST_CHILD(this),
+               FIRST_CHILD(this)->fops->lookup, &tmp_loc, xdata);
+
+    inode_unref(tmp_loc.inode);
+
+    return 0;
+
+wind:
+    /* used for all the normal lookup path */
+    STACK_WIND(frame, ga_lookup_cbk, FIRST_CHILD(this),
+               FIRST_CHILD(this)->fops->lookup, loc, xdata);
+
+    return 0;
+
+err:
+    STACK_UNWIND_STRICT(lookup, frame, -1, op_errno, loc->inode,
+                        &priv->gfiddir_stbuf, xdata, &priv->root_stbuf);
+    return 0;
+}
+
+/* mkdir fop: refused on "/.gfid" itself (ENOTSUP) and directly below the
+   virtual directory (EPERM); any other mkdir is passed to the child. */
+int
+ga_mkdir(call_frame_t *frame, xlator_t *this, loc_t *loc, mode_t mode,
+         mode_t umask, dict_t *xdata)
+{
+    int errcode = ENOMEM;
+
+    GFID_ACCESS_ENTRY_OP_CHECK(loc, errcode, reject);
+    STACK_WIND(frame, default_mkdir_cbk, FIRST_CHILD(this),
+               FIRST_CHILD(this)->fops->mkdir, loc, mode, umask, xdata);
+    return 0;
+
+reject:
+    STACK_UNWIND_STRICT(mkdir, frame, -1, errcode, loc->inode, NULL, NULL,
+                        NULL, xdata);
+    return 0;
+}
+
+/* create fop: refused on "/.gfid" itself (ENOTSUP) and directly below the
+   virtual directory (EPERM); any other create is passed to the child. */
+int
+ga_create(call_frame_t *frame, xlator_t *this, loc_t *loc, int flags,
+          mode_t mode, mode_t umask, fd_t *fd, dict_t *xdata)
+{
+    int errcode = ENOMEM;
+
+    GFID_ACCESS_ENTRY_OP_CHECK(loc, errcode, reject);
+    STACK_WIND(frame, default_create_cbk, FIRST_CHILD(this),
+               FIRST_CHILD(this)->fops->create, loc, flags, mode, umask, fd,
+               xdata);
+    return 0;
+reject:
+    STACK_UNWIND_STRICT(create, frame, -1, errcode, NULL, NULL, NULL, NULL,
+                        NULL, xdata);
+    return 0;
+}
+
+/* symlink fop: refused on "/.gfid" itself (ENOTSUP) and directly below the
+   virtual directory (EPERM); any other symlink is passed to the child. */
+int
+ga_symlink(call_frame_t *frame, xlator_t *this, const char *linkname,
+           loc_t *loc, mode_t umask, dict_t *xdata)
+{
+    int errcode = ENOMEM;
+
+    GFID_ACCESS_ENTRY_OP_CHECK(loc, errcode, reject);
+    STACK_WIND(frame, default_symlink_cbk, FIRST_CHILD(this),
+               FIRST_CHILD(this)->fops->symlink, linkname, loc, umask, xdata);
+    return 0;
+reject:
+    STACK_UNWIND_STRICT(symlink, frame, -1, errcode, NULL, NULL, NULL, NULL,
+                        xdata);
+    return 0;
+}
+
+/* mknod fop: refused on "/.gfid" itself (ENOTSUP) and directly below the
+   virtual directory (EPERM); any other mknod is passed to the child. */
+int
+ga_mknod(call_frame_t *frame, xlator_t *this, loc_t *loc, mode_t mode,
+         dev_t rdev, mode_t umask, dict_t *xdata)
+{
+    int errcode = ENOMEM;
+
+    GFID_ACCESS_ENTRY_OP_CHECK(loc, errcode, reject);
+    STACK_WIND(frame, default_mknod_cbk, FIRST_CHILD(this),
+               FIRST_CHILD(this)->fops->mknod, loc, mode, rdev, umask, xdata);
+    return 0;
+
+reject:
+    STACK_UNWIND_STRICT(mknod, frame, -1, errcode, NULL, NULL, NULL, NULL,
+                        xdata);
+    return 0;
+}
+
+/* rmdir fop: entry operations on "/.gfid" (ENOTSUP) or directly below it
+   (EPERM) are refused; otherwise the call is wound to the child on a
+   copy of 'loc' built by ga_valid_inode_loc_copy(). */
+int
+ga_rmdir(call_frame_t *frame, xlator_t *this, loc_t *loc, int flag,
+         dict_t *xdata)
+{
+    loc_t target = {
+        0,
+    };
+    int errcode = ENOMEM; /* reported when the loc copy fails */
+
+    GFID_ACCESS_ENTRY_OP_CHECK(loc, errcode, reject);
+
+    if (ga_valid_inode_loc_copy(&target, loc, this) < 0)
+        goto reject;
+
+    STACK_WIND(frame, default_rmdir_cbk, FIRST_CHILD(this),
+               FIRST_CHILD(this)->fops->rmdir, &target, flag, xdata);
+    loc_wipe(&target);
+    return 0;
+
+reject:
+    STACK_UNWIND_STRICT(rmdir, frame, -1, errcode, NULL, NULL, xdata);
+    return 0;
+}
+
+/* unlink fop: entry operations on "/.gfid" (ENOTSUP) or directly below it
+   (EPERM) are refused; otherwise the call is wound to the child on a
+   copy of 'loc' built by ga_valid_inode_loc_copy(). */
+int
+ga_unlink(call_frame_t *frame, xlator_t *this, loc_t *loc, int32_t xflag,
+          dict_t *xdata)
+{
+    loc_t target = {
+        0,
+    };
+    int errcode = ENOMEM; /* reported when the loc copy fails */
+
+    GFID_ACCESS_ENTRY_OP_CHECK(loc, errcode, reject);
+
+    if (ga_valid_inode_loc_copy(&target, loc, this) < 0)
+        goto reject;
+
+    STACK_WIND(frame, default_unlink_cbk, FIRST_CHILD(this),
+               FIRST_CHILD(this)->fops->unlink, &target, xflag, xdata);
+    loc_wipe(&target);
+    return 0;
+
+reject:
+    STACK_UNWIND_STRICT(unlink, frame, -1, errcode, NULL, NULL, xdata);
+    return 0;
+}
+
+/* rename fop: refused when either location is "/.gfid" (ENOTSUP) or lies
+   directly below it (EPERM); otherwise wound to the child on copies of
+   both locations built by ga_valid_inode_loc_copy(). */
+int
+ga_rename(call_frame_t *frame, xlator_t *this, loc_t *oldloc, loc_t *newloc,
+          dict_t *xdata)
+{
+    loc_t src = {
+        0,
+    };
+    loc_t dst = {
+        0,
+    };
+    int errcode = ENOMEM; /* reported when a loc copy fails */
+
+    GFID_ACCESS_ENTRY_OP_CHECK(oldloc, errcode, reject);
+    GFID_ACCESS_ENTRY_OP_CHECK(newloc, errcode, reject);
+
+    if (ga_valid_inode_loc_copy(&src, oldloc, this) < 0)
+        goto reject;
+
+    if (ga_valid_inode_loc_copy(&dst, newloc, this) < 0) {
+        /* the first copy succeeded, so it has to be wiped here */
+        loc_wipe(&src);
+        goto reject;
+    }
+
+    STACK_WIND(frame, default_rename_cbk, FIRST_CHILD(this),
+               FIRST_CHILD(this)->fops->rename, &src, &dst, xdata);
+    loc_wipe(&dst);
+    loc_wipe(&src);
+    return 0;
+
+reject:
+    STACK_UNWIND_STRICT(rename, frame, -1, errcode, NULL, NULL, NULL, NULL,
+                        NULL, xdata);
+    return 0;
+}
+
+/* link fop: refused when either location is "/.gfid" (ENOTSUP) or lies
+   directly below it (EPERM); otherwise wound to the child on copies of
+   both locations built by ga_valid_inode_loc_copy(). */
+int
+ga_link(call_frame_t *frame, xlator_t *this, loc_t *oldloc, loc_t *newloc,
+        dict_t *xdata)
+{
+    loc_t src = {
+        0,
+    };
+    loc_t dst = {
+        0,
+    };
+    int errcode = ENOMEM; /* reported when a loc copy fails */
+
+    GFID_ACCESS_ENTRY_OP_CHECK(oldloc, errcode, reject);
+    GFID_ACCESS_ENTRY_OP_CHECK(newloc, errcode, reject);
+
+    if (ga_valid_inode_loc_copy(&src, oldloc, this) < 0)
+        goto reject;
+
+    if (ga_valid_inode_loc_copy(&dst, newloc, this) < 0) {
+        /* the first copy succeeded, so it has to be wiped here */
+        loc_wipe(&src);
+        goto reject;
+    }
+
+    STACK_WIND(frame, default_link_cbk, FIRST_CHILD(this),
+               FIRST_CHILD(this)->fops->link, &src, &dst, xdata);
+
+    loc_wipe(&dst);
+    loc_wipe(&src);
+    return 0;
+
+reject:
+    STACK_UNWIND_STRICT(link, frame, -1, errcode, NULL, NULL, NULL, NULL,
+                        xdata);
+    return 0;
+}
+
+/* opendir fop: refused with ENOTSUP both on ".gfid" itself and on any
+   inode that carries this translator's context; else passed down. */
+int32_t
+ga_opendir(call_frame_t *frame, xlator_t *this, loc_t *loc, fd_t *fd,
+           dict_t *xdata)
+{
+    int errcode = ENOMEM;
+
+    GFID_ACCESS_INODE_OP_CHECK(loc, errcode, reject);
+
+    /* An inode with a context set by this translator is a virtual one;
+       readdirp and friends cannot be handled on it, so fail early. */
+    if (!inode_ctx_get(loc->inode, this, NULL)) {
+        errcode = ENOTSUP;
+        goto reject;
+    }
+
+    STACK_WIND(frame, default_opendir_cbk, FIRST_CHILD(this),
+               FIRST_CHILD(this)->fops->opendir, loc, fd, xdata);
+    return 0;
+reject:
+    STACK_UNWIND_STRICT(opendir, frame, -1, errcode, NULL, xdata);
+    return 0;
+}
+
+/* getxattr fop: refused with ENOTSUP on the ".gfid" directory itself;
+   otherwise wound to the child on a copy of 'loc' built by
+   ga_valid_inode_loc_copy(). */
+int32_t
+ga_getxattr(call_frame_t *frame, xlator_t *this, loc_t *loc, const char *name,
+            dict_t *xdata)
+{
+    loc_t target = {
+        0,
+    };
+    int errcode = ENOMEM; /* reported when the loc copy fails */
+
+    GFID_ACCESS_INODE_OP_CHECK(loc, errcode, reject);
+
+    if (ga_valid_inode_loc_copy(&target, loc, this) < 0)
+        goto reject;
+
+    STACK_WIND(frame, default_getxattr_cbk, FIRST_CHILD(this),
+               FIRST_CHILD(this)->fops->getxattr, &target, name, xdata);
+    loc_wipe(&target);
+    return 0;
+
+reject:
+    STACK_UNWIND_STRICT(getxattr, frame, -1, errcode, NULL, xdata);
+    return 0;
+}
+
+/* stat fop.
+
+   A stat on the virtual ".gfid" directory is not wound any further: it
+   is answered with success and the cached fake stat of that directory.
+
+   Any other stat is wound to the child using a copy of 'loc' produced
+   by ga_valid_inode_loc_copy(); if that copy fails, the call is unwound
+   with ENOMEM instead.  The copy is wiped once the wind has been
+   issued. */
+int32_t
+ga_stat(call_frame_t *frame, xlator_t *this, loc_t *loc, dict_t *xdata)
+{
+    ga_private_t *conf = this->private;
+    loc_t target = {
+        0,
+    };
+    int errcode = ENOMEM;
+
+    if (__is_gfid_access_dir(loc->gfid)) {
+        /* ".gfid" itself: report success without winding further */
+        STACK_UNWIND_STRICT(stat, frame, 0, 0, &conf->gfiddir_stbuf, xdata);
+        return 0;
+    }
+
+    if (ga_valid_inode_loc_copy(&target, loc, this) < 0) {
+        STACK_UNWIND_STRICT(stat, frame, -1, errcode, NULL, xdata);
+        return 0;
+    }
+
+    STACK_WIND(frame, default_stat_cbk, FIRST_CHILD(this),
+               FIRST_CHILD(this)->fops->stat, &target, xdata);
+
+    /* the copy was only needed for the wind */
+    loc_wipe(&target);
+    return 0;
+}
+
+/* setattr fop: refused with ENOTSUP on the ".gfid" directory itself;
+   otherwise wound to the child on a copy of 'loc' built by
+   ga_valid_inode_loc_copy(). */
+int32_t
+ga_setattr(call_frame_t *frame, xlator_t *this, loc_t *loc, struct iatt *stbuf,
+           int32_t valid, dict_t *xdata)
+{
+    loc_t target = {
+        0,
+    };
+    int errcode = ENOMEM; /* reported when the loc copy fails */
+
+    GFID_ACCESS_INODE_OP_CHECK(loc, errcode, reject);
+
+    if (ga_valid_inode_loc_copy(&target, loc, this) < 0)
+        goto reject;
+
+    STACK_WIND(frame, default_setattr_cbk, FIRST_CHILD(this),
+               FIRST_CHILD(this)->fops->setattr, &target, stbuf, valid, xdata);
+    loc_wipe(&target);
+    return 0;
+reject:
+    STACK_UNWIND_STRICT(setattr, frame, -1, errcode, NULL, NULL, xdata);
+    return 0;
+}
+
+/* removexattr fop: refused with ENOTSUP on the ".gfid" directory itself;
+   otherwise wound to the child on a copy of 'loc' built by
+   ga_valid_inode_loc_copy(). */
+int32_t
+ga_removexattr(call_frame_t *frame, xlator_t *this, loc_t *loc,
+               const char *name, dict_t *xdata)
+{
+    loc_t target = {
+        0,
+    };
+    int errcode = ENOMEM; /* reported when the loc copy fails */
+
+    GFID_ACCESS_INODE_OP_CHECK(loc, errcode, reject);
+
+    if (ga_valid_inode_loc_copy(&target, loc, this) < 0)
+        goto reject;
+
+    STACK_WIND(frame, default_removexattr_cbk, FIRST_CHILD(this),
+               FIRST_CHILD(this)->fops->removexattr, &target, name, xdata);
+    loc_wipe(&target);
+    return 0;
+
+reject:
+    STACK_UNWIND_STRICT(removexattr, frame, -1, errcode, xdata);
+    return 0;
+}
+
+/* Register this translator's memory-accounting types.  Returns -1 for a
+   NULL 'this', otherwise the result of xlator_mem_acct_init(). */
+int32_t
+mem_acct_init(xlator_t *this)
+{
+    int status;
+
+    if (this == NULL)
+        return -1;
+
+    status = xlator_mem_acct_init(this, gf_gfid_access_mt_end + 1);
+    if (status) {
+        gf_log(this->name, GF_LOG_WARNING,
+               "Memory accounting"
+               " init failed");
+    }
+
+    return status;
+}
+
+/* Translator init: 0 on success; -1 on error after freeing its allocations. */
+int32_t
+init(xlator_t *this)
+{
+    ga_private_t *priv = NULL;
+    int ret = -1;
+
+    if (!this->children || this->children->next) {
+        gf_log(this->name, GF_LOG_ERROR,
+               "not configured with exactly one child. exiting");
+        goto out;
+    }
+
+    /* This can be the top of graph in certain cases */
+    if (!this->parents) {
+        gf_log(this->name, GF_LOG_DEBUG, "dangling volume. check volfile ");
+    }
+
+    priv = GF_CALLOC(1, sizeof(*priv), gf_gfid_access_mt_priv_t);
+    if (!priv)
+        goto out;
+
+    priv->newfile_args_pool = mem_pool_new(ga_newfile_args_t, 512);
+    if (!priv->newfile_args_pool)
+        goto out;
+
+    priv->heal_args_pool = mem_pool_new(ga_heal_args_t, 512);
+    if (!priv->heal_args_pool)
+        goto out;
+
+    this->local_pool = mem_pool_new(ga_local_t, 16);
+    if (!this->local_pool) {
+        gf_log(this->name, GF_LOG_ERROR,
+               "failed to create local_t's memory pool");
+        goto out;
+    }
+
+    this->private = priv;
+    ret = 0;
+out:
+    if (ret && priv) {
+        if (priv->newfile_args_pool)
+            mem_pool_destroy(priv->newfile_args_pool);
+        if (priv->heal_args_pool)
+            mem_pool_destroy(priv->heal_args_pool);
+        GF_FREE(priv);
+    }
+    return ret;
+}
+
+/* Translator teardown: detach the private structure from 'this', then
+   destroy the two argument pools it owns and free it. */
+void
+fini(xlator_t *this)
+{
+    ga_private_t *conf = this->private;
+
+    this->private = NULL;
+    if (!conf)
+        return;
+
+    if (conf->newfile_args_pool)
+        mem_pool_destroy(conf->newfile_args_pool);
+    if (conf->heal_args_pool)
+        mem_pool_destroy(conf->heal_args_pool);
+    GF_FREE(conf);
+}
+
+/* Statedump hook: when 'inode' carries this translator's context, emit a
+   section holding the gfid of the inode stored there.  Always returns 0. */
+int32_t
+ga_dump_inodectx(xlator_t *this, inode_t *inode)
+{
+    char section[GF_DUMP_MAX_BUF_LEN] = {
+        0,
+    };
+    uint64_t ctx = 0;
+    inode_t *real = NULL;
+
+    if (inode_ctx_get(inode, this, &ctx) != 0)
+        return 0;
+
+    real = (void *)(uintptr_t)ctx;
+    gf_proc_dump_build_key(section, this->name, "inode");
+    gf_proc_dump_add_section("%s", section);
+    gf_proc_dump_write("real-gfid", "%s", uuid_utoa(real->gfid));
+    return 0;
+}
+
+struct xlator_fops fops = {
+    .lookup = ga_lookup, /* also answers lookups on the virtual ".gfid" */
+
+    /* entry fops: each one runs GFID_ACCESS_ENTRY_OP_CHECK first */
+    .mkdir = ga_mkdir,
+    .mknod = ga_mknod,
+    .create = ga_create,
+    .symlink = ga_symlink,
+    .link = ga_link,
+    .unlink = ga_unlink,
+    .rmdir = ga_rmdir,
+    .rename = ga_rename,
+
+    /* inode fops: refused or answered locally on ".gfid" itself */
+    .opendir = ga_opendir,
+    .stat = ga_stat,
+    .setattr = ga_setattr,
+    .getxattr = ga_getxattr,
+    .removexattr = ga_removexattr,
+
+    /* special fop to handle more entry creations */
+    .setxattr = ga_setxattr,
+};
+
+struct xlator_cbks cbks = {
+    .forget = ga_forget, /* the only callback this translator hooks */
+};
+
+struct xlator_dumpops dumpops = {
+    .inodectx = ga_dump_inodectx, /* dumps the gfid kept in the inode ctx */
+};
+
+struct volume_options options[] = {
+    /* This translator doesn't take any options, or provide any options */
+    {.key = {NULL}}, /* sole entry: a NULL key */
+};
+
+xlator_api_t xlator_api = {
+    .init = init, /* allocates the private state and the memory pools */
+    .fini = fini, /* destroys the argument pools and frees the state */
+    .mem_acct_init = mem_acct_init,
+    .op_version = {1},
+    .fops = &fops,
+    .cbks = &cbks,
+    .options = options, /* table without any real option */
+    .identifier = "gfid-access",
+    .category = GF_MAINTAINED,
+};
diff --git a/xlators/features/gfid-access/src/gfid-access.h b/xlators/features/gfid-access/src/gfid-access.h
new file mode 100644
index 00000000000..b1e255e56c0
--- /dev/null
+++ b/xlators/features/gfid-access/src/gfid-access.h
@@ -0,0 +1,107 @@
+/*
+   Copyright (c) 2013 Red Hat, Inc. <http://www.redhat.com>
+   This file is part of GlusterFS.
+
+   This file is licensed to you under your choice of the GNU Lesser
+   General Public License, version 3 or any later version (LGPLv3 or
+   later), or the GNU General Public License, version 2 (GPLv2), in all
+   cases as published by the Free Software Foundation.
+*/
+#ifndef __GFID_ACCESS_H__
+#define __GFID_ACCESS_H__
+
+#include <glusterfs/glusterfs.h>
+#include <glusterfs/logging.h>
+#include <glusterfs/dict.h>
+#include <glusterfs/xlator.h>
+#include <glusterfs/defaults.h>
+#include "gfid-access-mem-types.h"
+
+#define UUID_CANONICAL_FORM_LEN 36 /* textual uuid length, excluding NUL */
+
+#define GF_FUSE_AUX_GFID_NEWFILE "glusterfs.gfid.newfile"
+#define GF_FUSE_AUX_GFID_HEAL "glusterfs.gfid.heal"
+
+#define GF_GFID_KEY "GLUSTERFS_GFID"
+#define GF_GFID_DIR ".gfid" /* basename of the virtual directory under root */
+#define GF_AUX_GFID 0xd /* last gfid byte and ia_ino of the fake ".gfid" stat */
+
+/* Reject entry fops on "/.gfid" (ENOTSUP) and below it (EPERM) via goto. */
+#define GFID_ACCESS_ENTRY_OP_CHECK(loc, err, lbl)                              \
+    do {                                                                       \
+        /* "/.gfid" itself is virtual: no entry operation on it */             \
+        if (((loc)->name && !strcmp(GF_GFID_DIR, (loc)->name)) &&              \
+            (((loc)->parent && __is_root_gfid((loc)->parent->gfid)) ||         \
+             __is_root_gfid((loc)->pargfid))) {                                \
+            (err) = ENOTSUP;                                                   \
+            goto lbl;                                                          \
+        }                                                                      \
+        /* likewise for entries whose parent is the virtual */                 \
+        /* ".gfid" directory */                                                \
+        if (((loc)->parent && __is_gfid_access_dir((loc)->parent->gfid)) ||    \
+            __is_gfid_access_dir((loc)->pargfid)) {                            \
+            (err) = EPERM;                                                     \
+            goto lbl;                                                          \
+        }                                                                      \
+    } while (0)
+
+#define GFID_ACCESS_INODE_OP_CHECK(loc, err, lbl)                              \
+    do {                                                                       \
+        /* inode fops are not supported on ".gfid" itself */                   \
+        if (__is_gfid_access_dir((loc)->gfid)) {                               \
+            (err) = ENOTSUP;                                                   \
+            goto lbl;                                                          \
+        }                                                                      \
+    } while (0)
+typedef struct { /* NOTE(review): looks like one "newfile" request - confirm */
+    unsigned int uid;
+    unsigned int gid;
+    char gfid[UUID_CANONICAL_FORM_LEN + 1]; /* room for uuid text + NUL */
+    unsigned int st_mode;
+    char *bname;
+    /* per-type arguments; NOTE(review): presumably selected by st_mode */
+    union {
+        struct _symlink_in {
+            char *linkpath;
+        } __attribute__((__packed__)) symlink;
+
+        struct _mknod_in {
+            unsigned int mode;
+            unsigned int rdev;
+            unsigned int umask;
+        } __attribute__((__packed__)) mknod;
+
+        struct _mkdir_in {
+            unsigned int mode;
+            unsigned int umask;
+        } __attribute__((__packed__)) mkdir;
+    } __attribute__((__packed__)) args;
+} __attribute__((__packed__)) ga_newfile_args_t;
+
+typedef struct { /* NOTE(review): looks like one "heal" request - confirm */
+    char gfid[UUID_CANONICAL_FORM_LEN + 1]; /* room for uuid text + NUL */
+    char *bname; /* a null terminated basename */
+} __attribute__((__packed__)) ga_heal_args_t;
+
+struct ga_private {
+    /* root inode's stbuf, refreshed by ga_lookup_cbk() on root lookups */
+    struct iatt root_stbuf;
+    struct iatt gfiddir_stbuf; /* fake stat returned for ".gfid" */
+    struct mem_pool *newfile_args_pool; /* pool of ga_newfile_args_t */
+    struct mem_pool *heal_args_pool;    /* pool of ga_heal_args_t */
+};
+typedef struct ga_private ga_private_t;
+
+struct __ga_local { /* element type of this->local_pool (see init()) */
+    call_frame_t *orig_frame; /* presumably the caller's frame - confirm */
+    unsigned int uid;
+    unsigned int gid;
+    loc_t loc;
+    mode_t mode;
+    dev_t rdev;
+    mode_t umask;
+    dict_t *xdata;
+};
+typedef struct __ga_local ga_local_t;
+
+#endif /* __GFID_ACCESS_H__ */
diff --git a/xlators/features/index/Makefile.am b/xlators/features/index/Makefile.am
new file mode 100644
index 00000000000..a985f42a877
--- /dev/null
+++ b/xlators/features/index/Makefile.am
@@ -0,0 +1,3 @@
+SUBDIRS = src
+
+CLEANFILES =
diff --git a/xlators/features/index/src/Makefile.am b/xlators/features/index/src/Makefile.am
new file mode 100644
index 00000000000..c71c238c163
--- /dev/null
+++ b/xlators/features/index/src/Makefile.am
@@ -0,0 +1,19 @@
+if WITH_SERVER
+xlator_LTLIBRARIES = index.la
+endif
+xlatordir = $(libdir)/glusterfs/$(PACKAGE_VERSION)/xlator/features
+
+index_la_LDFLAGS = -module $(GF_XLATOR_DEFAULT_LDFLAGS)
+
+index_la_SOURCES = index.c
+index_la_LIBADD = $(top_builddir)/libglusterfs/src/libglusterfs.la
+
+noinst_HEADERS = index.h index-mem-types.h index-messages.h
+
+AM_CPPFLAGS = $(GF_CPPFLAGS) -I$(top_srcdir)/libglusterfs/src \
+	-I$(top_srcdir)/rpc/xdr/src -I$(top_builddir)/rpc/xdr/src \
+	-I$(top_srcdir)/rpc/rpc-lib/src
+
+AM_CFLAGS = -Wall $(GF_CFLAGS)
+
+CLEANFILES =
diff --git a/xlators/features/index/src/index-mem-types.h b/xlators/features/index/src/index-mem-types.h
new file mode 100644
index 00000000000..58833d0ec9b
--- /dev/null
+++ b/xlators/features/index/src/index-mem-types.h
@@ -0,0 +1,23 @@
+/*
+   Copyright (c) 2008-2012 Red Hat, Inc. <http://www.redhat.com>
+   This file is part of GlusterFS.
+
+   This file is licensed to you under your choice of the GNU Lesser
+   General Public License, version 3 or any later version (LGPLv3 or
+   later), or the GNU General Public License, version 2 (GPLv2), in all
+   cases as published by the Free Software Foundation.
+*/
+
+#ifndef __INDEX_MEM_TYPES_H__
+#define __INDEX_MEM_TYPES_H__
+
+#include <glusterfs/mem-types.h>
+
+enum gf_index_mem_types_ {
+    gf_index_mt_priv_t = gf_common_mt_end + 1, /* follows the common ids */
+    gf_index_inode_ctx_t, /* used for index_inode_ctx_t allocations */
+    gf_index_fd_ctx_t,
+    gf_index_mt_local_t,
+    gf_index_mt_end /* last value of this enum */
+};
+#endif
diff --git a/xlators/features/index/src/index-messages.h b/xlators/features/index/src/index-messages.h
new file mode 100644
index 00000000000..364f17cd34e
--- /dev/null
+++ b/xlators/features/index/src/index-messages.h
@@ -0,0 +1,33 @@
+/*
+ Copyright (c) 2016 Red Hat, Inc. <http://www.redhat.com>
+ This file is part of GlusterFS.
+
+ This file is licensed to you under your choice of the GNU Lesser
+ General Public License, version 3 or any later version (LGPLv3 or
+ later), or the GNU General Public License, version 2 (GPLv2), in all
+ cases as published by the Free Software Foundation.
+ */
+
+#ifndef _INDEX_MESSAGES_H_
+#define _INDEX_MESSAGES_H_
+
+#include <glusterfs/glfs-message-id.h>
+
+/* To add new message IDs, append new identifiers at the end of the list.
+ *
+ * Never remove a message ID. If it's not used anymore, you can rename it or
+ * leave it as it is, but not delete it. This is to prevent reutilization of
+ * IDs by other messages.
+ *
+ * The component name must match one of the entries defined in
+ * glfs-message-id.h.
+ */
+
+GLFS_MSGID(INDEX, INDEX_MSG_INDEX_DIR_CREATE_FAILED,
+           INDEX_MSG_INDEX_READDIR_FAILED, INDEX_MSG_INDEX_ADD_FAILED,
+           INDEX_MSG_INDEX_DEL_FAILED, INDEX_MSG_DICT_SET_FAILED,
+           INDEX_MSG_INODE_CTX_GET_SET_FAILED, INDEX_MSG_INVALID_ARGS,
+           INDEX_MSG_FD_OP_FAILED, INDEX_MSG_WORKER_THREAD_CREATE_FAILED,
+           INDEX_MSG_INVALID_GRAPH);
+
+#endif /* !_INDEX_MESSAGES_H_ */
diff --git a/xlators/features/index/src/index.c b/xlators/features/index/src/index.c
new file mode 100644
index 00000000000..4abb2c73ce5
--- /dev/null
+++ b/xlators/features/index/src/index.c
@@ -0,0 +1,2682 @@
+/*
+   Copyright (c) 2008-2012 Red Hat, Inc. <http://www.redhat.com>
+   This file is part of GlusterFS.
+
+   This file is licensed to you under your choice of the GNU Lesser
+   General Public License, version 3 or any later version (LGPLv3 or
+   later), or the GNU General Public License, version 2 (GPLv2), in all
+   cases as published by the Free Software Foundation.
+*/
+#include "index.h"
+#include <glusterfs/options.h>
+#include "glusterfs3-xdr.h"
+#include <glusterfs/syscall.h>
+#include <glusterfs/syncop.h>
+#include <glusterfs/common-utils.h>
+#include "index-messages.h"
+#include <ftw.h>
+#include <libgen.h> /* for dirname() */
+#include <signal.h>
+
+#define XATTROP_SUBDIR "xattrop"
+#define DIRTY_SUBDIR "dirty"
+#define ENTRY_CHANGES_SUBDIR "entry-changes"
+
+struct index_syncop_args { /* presumably syncop task arguments - confirm */
+    inode_t *parent;
+    gf_dirent_t *entries;
+    char *path;
+};
+
+static char *index_vgfid_xattrs[XATTROP_TYPE_END] = {
+    [XATTROP] = GF_XATTROP_INDEX_GFID,
+    [DIRTY] = GF_XATTROP_DIRTY_GFID,
+    [ENTRY_CHANGES] = GF_XATTROP_ENTRY_CHANGES_GFID}; /* xattr name per type */
+
+static char *index_subdirs[XATTROP_TYPE_END] = {
+    [XATTROP] = XATTROP_SUBDIR,
+    [DIRTY] = DIRTY_SUBDIR,
+    [ENTRY_CHANGES] = ENTRY_CHANGES_SUBDIR}; /* subdirectory name per type */
+
+/* Map a virtual gfid to its index type, or -1 when it matches none. */
+int
+index_get_type_from_vgfid(index_priv_t *priv, uuid_t vgfid)
+{
+    int type = 0;
+
+    while (type < XATTROP_TYPE_END &&
+           gf_uuid_compare(priv->internal_vgfid[type], vgfid))
+        type++;
+    return (type < XATTROP_TYPE_END) ? type : -1;
+}
+
+/* True when 'vgfid' is one of the translator's internal virtual gfids. */
+gf_boolean_t
+index_is_virtual_gfid(index_priv_t *priv, uuid_t vgfid)
+{
+    int type = index_get_type_from_vgfid(priv, vgfid);
+    return (type >= 0) ? _gf_true : _gf_false;
+}
+
+/* Return the index context of 'inode' in '*ctx', allocating and attaching
+   a zeroed one if none is set.  Returns 0 on success, non-zero otherwise. */
+static int
+__index_inode_ctx_get(inode_t *inode, xlator_t *this, index_inode_ctx_t **ctx)
+{
+    int ret = 0;
+    index_inode_ctx_t *ictx = NULL;
+    uint64_t tmpctx = 0;
+
+    ret = __inode_ctx_get(inode, this, &tmpctx);
+    if (!ret) {
+        /* uintptr_t matches the put below; 'long' may be narrower */
+        ictx = (index_inode_ctx_t *)(uintptr_t)tmpctx;
+        goto out;
+    }
+
+    ictx = GF_CALLOC(1, sizeof(*ictx), gf_index_inode_ctx_t);
+    if (!ictx)
+        return -1;
+    INIT_LIST_HEAD(&ictx->callstubs);
+    ret = __inode_ctx_put(inode, this, (uint64_t)(uintptr_t)ictx);
+    if (ret) {
+        GF_FREE(ictx);
+        ictx = NULL;
+    }
+out:
+    if (ictx)
+        *ctx = ictx;
+    return ret;
+}
+
+/* Locked wrapper: runs __index_inode_ctx_get() with inode->lock held and
+   hands its return value back unchanged. */
+static int
+index_inode_ctx_get(inode_t *inode, xlator_t *this, index_inode_ctx_t **ctx)
+{
+    int status;
+
+    LOCK(&inode->lock);
+    status = __index_inode_ctx_get(inode, this, ctx);
+    UNLOCK(&inode->lock);
+
+    return status;
+}
+
+/* True when the inode's index context records a non-null virtual parent
+   gfid; false for a NULL inode or when the context cannot be obtained. */
+static gf_boolean_t
+index_is_subdir_of_entry_changes(xlator_t *this, inode_t *inode)
+{
+    index_inode_ctx_t *ictx = NULL;
+
+    if (inode == NULL || index_inode_ctx_get(inode, this, &ictx) != 0)
+        return _gf_false;
+
+    if (gf_uuid_is_null(ictx->virtual_pargfid))
+        return _gf_false;
+    return _gf_true;
+}
+
+/* Map a virtual-gfid xattr name to its index type, or -1 if unknown. */
+static int
+index_get_type_from_vgfid_xattr(const char *name)
+{
+    int type = 0;
+
+    while (type < XATTROP_TYPE_END && strcmp(name, index_vgfid_xattrs[type]))
+        type++;
+
+    return (type < XATTROP_TYPE_END) ? type : -1;
+}
+
+/* True when a fop targets one of the index translator's internal inodes:
+   either the effective gfid ('gfid' if given and non-null, otherwise the
+   inode's own) is virtual, or index_is_subdir_of_entry_changes() holds. */
+gf_boolean_t
+index_is_fop_on_internal_inode(xlator_t *this, inode_t *inode, uuid_t gfid)
+{
+    index_priv_t *conf = this->private;
+    uuid_t effective = {0};
+
+    if (inode == NULL)
+        return _gf_false;
+
+    gf_uuid_copy(effective,
+                 (gfid && !gf_uuid_is_null(gfid)) ? gfid : inode->gfid);
+
+    if (index_is_virtual_gfid(conf, effective) ||
+        index_is_subdir_of_entry_changes(this, inode))
+        return _gf_true;
+    return _gf_false;
+}
+
+/* True when 'name' is one of the virtual-gfid xattr names. */
+static gf_boolean_t
+index_is_vgfid_xattr(const char *name)
+{
+    int type = index_get_type_from_vgfid_xattr(name);
+    return (type >= 0) ? _gf_true : _gf_false;
+}
+
+/* Pop the first stub off 'callstubs'; returns NULL when the list is empty. */
+call_stub_t *
+__index_dequeue(struct list_head *callstubs)
+{
+    call_stub_t *head = NULL;
+
+    if (list_empty(callstubs))
+        return NULL;
+    head = list_entry(callstubs->next, call_stub_t, list);
+    list_del_init(&head->list);
+    return head;
+}
+
+static void
+__index_enqueue(struct list_head *callstubs, call_stub_t *stub)
+{
+    list_add_tail(&stub->list, callstubs); /* tail add; dequeue pops head */
+}
+
+/* Queue 'stub' for the worker threads: under priv->mutex, append it to the
+   pending list, bump the stub counter and signal the condition variable. */
+static void
+worker_enqueue(xlator_t *this, call_stub_t *stub)
+{
+    index_priv_t *conf = this->private;
+
+    pthread_mutex_lock(&conf->mutex);
+    __index_enqueue(&conf->callstubs, stub);
+    GF_ATOMIC_INC(conf->stub_cnt);
+    /* index_worker() waits on this condition variable */
+    pthread_cond_signal(&conf->cond);
+    pthread_mutex_unlock(&conf->mutex);
+}
+
+void *
+index_worker(void *data)
+{
+    index_priv_t *priv = NULL;
+    xlator_t *this = NULL;
+    call_stub_t *stub = NULL;
+    gf_boolean_t bye = _gf_false;
+    /* Worker thread loop: resume queued stubs, leave when priv->down is seen. */
+    THIS = data;
+    this = data;
+    priv = this->private;
+
+    for (;;) {
+        pthread_mutex_lock(&priv->mutex);
+        {
+            while (list_empty(&priv->callstubs)) {
+                if (priv->down) {
+                    bye = _gf_true; /* avoid waiting once shut down */
+                    break;
+                }
+                (void)pthread_cond_wait(&priv->cond, &priv->mutex);
+                if (priv->down) {
+                    bye = _gf_true; /* shut down while we were waiting */
+                    break;
+                }
+            }
+            if (!bye)
+                stub = __index_dequeue(&priv->callstubs);
+            if (bye) {
+                priv->curr_count--; /* this worker is leaving */
+                if (priv->curr_count == 0)
+                    pthread_cond_broadcast(&priv->cond); /* last one out */
+            }
+        }
+        pthread_mutex_unlock(&priv->mutex);
+
+        if (stub) { /* guard against spurious wakeups */
+            call_resume(stub); /* run outside the mutex */
+            GF_ATOMIC_DEC(priv->stub_cnt);
+        }
+        stub = NULL;
+        if (bye)
+            break;
+    }
+
+    return NULL;
+}
+
+/*
+ * Write "<base>/<subdir>" into index_dir, which is len bytes long.
+ * snprintf() truncates silently if the result does not fit.
+ */
+static void
+make_index_dir_path(char *base, const char *subdir, char *index_dir, size_t len)
+{
+    snprintf(index_dir, len, "%s/%s", base, subdir);
+}
+
+/*
+ * Make sure the directory <index_basepath>/<subdir> exists, creating every
+ * missing path component on the way.
+ *
+ * Returns 0 on success (including when the directory already exists), -1
+ * when a mkdir fails and -2 when the path exists but is not a directory.
+ * On failure errno describes the error (ENOTDIR for the -2 case), so
+ * callers may inspect it after the call.
+ */
+int
+index_dir_create(xlator_t *this, const char *subdir)
+{
+    int ret = 0;
+    int op_errno = 0;
+    struct stat st = {0};
+    char fullpath[PATH_MAX] = {0};
+    char path[PATH_MAX] = {0};
+    char *dir = NULL;
+    index_priv_t *priv = NULL;
+    size_t len = 0;
+    size_t pathlen = 0;
+
+    priv = this->private;
+    make_index_dir_path(priv->index_basepath, subdir, fullpath,
+                        sizeof(fullpath));
+    ret = sys_stat(fullpath, &st);
+    if (!ret) {
+        if (!S_ISDIR(st.st_mode))
+            ret = -2;
+        goto out;
+    }
+
+    pathlen = strlen(fullpath);
+    /* Drop a trailing '/' and keep pathlen in sync with the shortened
+     * string, otherwise every prefix length below is off by one. */
+    if ((pathlen > 1) && fullpath[pathlen - 1] == '/')
+        fullpath[--pathlen] = '\0';
+    dir = strchr(fullpath, '/');
+    while (dir) {
+        /* Each pass creates the prefix that ends right before the next
+         * '/'; the final pass creates the full path. */
+        dir = strchr(dir + 1, '/');
+        if (dir)
+            len = (size_t)(dir - fullpath);
+        else
+            len = pathlen;
+        strncpy(path, fullpath, len);
+        path[len] = '\0';
+        ret = sys_mkdir(path, 0600);
+        if (ret && (errno != EEXIST))
+            goto out;
+    }
+    ret = 0;
+out:
+    if (ret == -1) {
+        /* Logging may clobber errno; keep it intact for the caller. */
+        op_errno = errno;
+        gf_msg(this->name, GF_LOG_ERROR, op_errno,
+               INDEX_MSG_INDEX_DIR_CREATE_FAILED,
+               "%s/%s: Failed to "
+               "create",
+               priv->index_basepath, subdir);
+        errno = op_errno;
+    } else if (ret == -2) {
+        gf_msg(this->name, GF_LOG_ERROR, ENOTDIR,
+               INDEX_MSG_INDEX_DIR_CREATE_FAILED,
+               "%s/%s: Failed to "
+               "create, path exists, not a directory ",
+               priv->index_basepath, subdir);
+        /* stat() succeeded on this path, so errno holds nothing useful. */
+        errno = ENOTDIR;
+    }
+    return ret;
+}
+
+/* Copy priv->index into index while holding priv->lock. */
+void
+index_get_index(index_priv_t *priv, uuid_t index)
+{
+    LOCK(&priv->lock);
+    gf_uuid_copy(index, priv->index);
+    UNLOCK(&priv->lock);
+}
+
+/*
+ * Replace priv->index with a freshly generated uuid, but only if it still
+ * equals the value the caller passed in; index is then updated to the
+ * current priv->index either way. All of it happens under priv->lock.
+ */
+void
+index_generate_index(index_priv_t *priv, uuid_t index)
+{
+    LOCK(&priv->lock);
+
+    /* Comparing against the caller's copy prevents duplicate generates
+     * when several threads race to replace the same index.
+     * This method fails if number of contending threads is greater
+     * than MAX_LINK count of the fs. */
+    if (gf_uuid_compare(priv->index, index) == 0)
+        gf_uuid_generate(priv->index);
+    gf_uuid_copy(index, priv->index);
+
+    UNLOCK(&priv->lock);
+}
+
+/*
+ * Write "<base>/<subdir>/<subdir>-<index uuid>" into index_path, which is
+ * len bytes long.
+ */
+static void
+make_index_path(char *base, const char *subdir, uuid_t index, char *index_path,
+                size_t len)
+{
+    size_t used = 0;
+
+    make_index_dir_path(base, subdir, index_path, len);
+    used = strlen(index_path);
+    snprintf(index_path + used, len - used, "/%s-%s", subdir,
+             uuid_utoa(index));
+}
+
+/*
+ * Write "<base>/<subdir>/<gfid uuid>" into gfid_path, which is len bytes
+ * long.
+ */
+static void
+make_gfid_path(char *base, const char *subdir, uuid_t gfid, char *gfid_path,
+               size_t len)
+{
+    size_t used = 0;
+
+    make_index_dir_path(base, subdir, gfid_path, len);
+    used = strlen(gfid_path);
+    snprintf(gfid_path + used, len - used, "/%s", uuid_utoa(gfid));
+}
+
+/*
+ * Write "<base>/<subdir>/<filename>" into file_path, which is len bytes
+ * long.
+ */
+static void
+make_file_path(char *base, const char *subdir, const char *filename,
+               char *file_path, size_t len)
+{
+    size_t used = 0;
+
+    make_index_dir_path(base, subdir, file_path, len);
+    used = strlen(file_path);
+    snprintf(file_path + used, len - used, "/%s", filename);
+}
+
+/*
+ * Return non-zero when filename is exactly "<subdir>-<priv_index uuid>",
+ * and 0 otherwise.
+ */
+static int
+is_index_file_current(char *filename, uuid_t priv_index, char *subdir)
+{
+    char expected[GF_UUID_BUF_SIZE + 16] = {0};
+
+    snprintf(expected, sizeof(expected), "%s-%s", subdir,
+             uuid_utoa(priv_index));
+
+    return strcmp(filename, expected) == 0;
+}
+
+/*
+ * Unlink the base index file filename under subdir when it is not the
+ * current base index and nothing else links to it (st_nlink == 1).
+ * Errors from stat/unlink are ignored.
+ */
+static void
+check_delete_stale_index_file(xlator_t *this, char *filename, char *subdir)
+{
+    index_priv_t *priv = this->private;
+    struct stat sbuf = {0};
+    char stale_path[PATH_MAX] = {0};
+
+    if (is_index_file_current(filename, priv->index, subdir))
+        return;
+
+    make_file_path(priv->index_basepath, subdir, filename, stale_path,
+                   sizeof(stale_path));
+
+    if (sys_stat(stale_path, &sbuf) != 0)
+        return;
+
+    if (sbuf.st_nlink == 1)
+        sys_unlink(stale_path);
+}
+
+/*
+ * Store count in priv->pending_count under priv->lock. Only the XATTROP
+ * type is tracked; any other type is a no-op.
+ */
+static void
+index_set_link_count(index_priv_t *priv, int64_t count,
+                     index_xattrop_type_t type)
+{
+    if (type != XATTROP)
+        return;
+
+    LOCK(&priv->lock);
+    priv->pending_count = count;
+    UNLOCK(&priv->lock);
+}
+
+/*
+ * Read priv->pending_count into *count under priv->lock. Only the XATTROP
+ * type is tracked; for any other type *count is left untouched.
+ */
+static void
+index_get_link_count(index_priv_t *priv, int64_t *count,
+                     index_xattrop_type_t type)
+{
+    if (type != XATTROP)
+        return;
+
+    LOCK(&priv->lock);
+    *count = priv->pending_count;
+    UNLOCK(&priv->lock);
+}
+
+/*
+ * Decrement priv->pending_count under priv->lock. Only the XATTROP type is
+ * tracked; any other type is a no-op.
+ */
+static void
+index_dec_link_count(index_priv_t *priv, index_xattrop_type_t type)
+{
+    if (type != XATTROP)
+        return;
+
+    LOCK(&priv->lock);
+    priv->pending_count--;
+    /* A decrement never leaves the counter at 0: it is pushed on to -1.
+     * Presumably -1 marks the count as unknown so that readers recompute
+     * it -- confirm against the users of pending_count. */
+    if (priv->pending_count == 0)
+        priv->pending_count--;
+    UNLOCK(&priv->lock);
+}
+
+/*
+ * Map an index type to its entry in index_subdirs[]. Returns NULL when
+ * type lies outside [XATTROP, XATTROP_TYPE_END).
+ */
+char *
+index_get_subdir_from_type(index_xattrop_type_t type)
+{
+    if (type >= XATTROP && type < XATTROP_TYPE_END)
+        return index_subdirs[type];
+
+    return NULL;
+}
+
+/*
+ * Resolve vgfid to an index type via index_get_type_from_vgfid() and
+ * return the matching subdirectory name, or NULL when
+ * index_get_subdir_from_type() rejects the resulting type.
+ */
+char *
+index_get_subdir_from_vgfid(index_priv_t *priv, uuid_t vgfid)
+{
+    return index_get_subdir_from_type(index_get_type_from_vgfid(priv, vgfid));
+}
+
+/*
+ * Read entries from dir, starting at offset off (0 rewinds), and append
+ * them to entries until adding one more would exceed size bytes.
+ *
+ * Names starting with XATTROP_SUBDIR "-" or DIRTY_SUBDIR "-" are never
+ * returned; they are passed to check_delete_stale_index_file() instead.
+ *
+ * Returns the number of entries appended. -1 is returned only from the
+ * seekdir() verification paths compiled on non-Linux hosts; the other
+ * failure paths return the number of entries gathered so far. When the
+ * final probe finds the end of the directory, errno is set to ENOENT and
+ * the last recorded offset is saved in fctx->dir_eof.
+ *
+ * The fd argument is not used by this function body.
+ */
+static int
+index_fill_readdir(fd_t *fd, index_fd_ctx_t *fctx, DIR *dir, off_t off,
+                   size_t size, gf_dirent_t *entries)
+{
+    off_t in_case = -1;
+    off_t last_off = 0;
+    size_t filled = 0;
+    int count = 0;
+    struct dirent *entry = NULL;
+    struct dirent scratch[2] = {
+        {
+            0,
+        },
+    };
+    int32_t this_size = -1;
+    gf_dirent_t *this_entry = NULL;
+    xlator_t *this = NULL;
+
+    this = THIS;
+    if (!off) {
+        rewinddir(dir);
+    } else {
+        seekdir(dir, off);
+#ifndef GF_LINUX_HOST_OS
+        /* Verify the seek landed where asked, unless off is the EOF
+         * offset remembered from an earlier call. */
+        if ((u_long)telldir(dir) != off && off != fctx->dir_eof) {
+            gf_msg(THIS->name, GF_LOG_ERROR, EINVAL,
+                   INDEX_MSG_INDEX_READDIR_FAILED,
+                   "seekdir(0x%llx) failed on dir=%p: "
+                   "Invalid argument (offset reused from "
+                   "another DIR * structure?)",
+                   off, dir);
+            errno = EINVAL;
+            count = -1;
+            goto out;
+        }
+#endif /* GF_LINUX_HOST_OS */
+    }
+
+    while (filled <= size) {
+        /* Remember where this entry starts so that we can seek back to
+         * it if it turns out not to fit. */
+        in_case = (u_long)telldir(dir);
+
+        if (in_case == -1) {
+            gf_msg(THIS->name, GF_LOG_ERROR, errno,
+                   INDEX_MSG_INDEX_READDIR_FAILED, "telldir failed on dir=%p",
+                   dir);
+            goto out;
+        }
+
+        errno = 0;
+        entry = sys_readdir(dir, scratch);
+        if (!entry || errno != 0) {
+            /* Only EBADF is logged and bails out directly; anything else
+             * (including a plain end of directory) just ends the loop. */
+            if (errno == EBADF) {
+                gf_msg(THIS->name, GF_LOG_WARNING, errno,
+                       INDEX_MSG_INDEX_READDIR_FAILED,
+                       "readdir failed on dir=%p", dir);
+                goto out;
+            }
+            break;
+        }
+
+        /* Base index files are internal: hide them from the listing and
+         * take the opportunity to remove stale ones. */
+        if (!strncmp(entry->d_name, XATTROP_SUBDIR "-",
+                     strlen(XATTROP_SUBDIR "-"))) {
+            check_delete_stale_index_file(this, entry->d_name, XATTROP_SUBDIR);
+            continue;
+        } else if (!strncmp(entry->d_name, DIRTY_SUBDIR "-",
+                            strlen(DIRTY_SUBDIR "-"))) {
+            check_delete_stale_index_file(this, entry->d_name, DIRTY_SUBDIR);
+            continue;
+        }
+
+        /* Size charged for this entry: the larger of the two structures
+         * plus the NUL-terminated name. */
+        this_size = max(sizeof(gf_dirent_t), sizeof(gfs3_dirplist)) +
+                    strlen(entry->d_name) + 1;
+
+        if (this_size + filled > size) {
+            /* Entry does not fit: rewind to its start and stop. */
+            seekdir(dir, in_case);
+#ifndef GF_LINUX_HOST_OS
+            if ((u_long)telldir(dir) != in_case && in_case != fctx->dir_eof) {
+                gf_msg(THIS->name, GF_LOG_ERROR, EINVAL,
+                       INDEX_MSG_INDEX_READDIR_FAILED,
+                       "seekdir(0x%llx) failed on dir=%p: "
+                       "Invalid argument (offset reused from "
+                       "another DIR * structure?)",
+                       in_case, dir);
+                errno = EINVAL;
+                count = -1;
+                goto out;
+            }
+#endif /* GF_LINUX_HOST_OS */
+            break;
+        }
+
+        this_entry = gf_dirent_for_name(entry->d_name);
+
+        if (!this_entry) {
+            gf_msg(THIS->name, GF_LOG_ERROR, errno,
+                   INDEX_MSG_INDEX_READDIR_FAILED,
+                   "could not create gf_dirent for entry %s", entry->d_name);
+            goto out;
+        }
+        /*
+         * we store the offset of next entry here, which is
+         * probably not intended, but code using syncop_readdir()
+         * (glfs-heal.c, afr-self-heald.c, pump.c) rely on it
+         * for directory read resumption.
+         */
+        last_off = (u_long)telldir(dir);
+        this_entry->d_off = last_off;
+        this_entry->d_ino = entry->d_ino;
+
+        list_add_tail(&this_entry->list, &entries->list);
+
+        filled += this_size;
+        count++;
+    }
+
+    errno = 0;
+
+    /* Probe once more: a NULL result with errno still 0 means EOF. */
+    if ((!sys_readdir(dir, scratch) && (errno == 0))) {
+        /* Indicate EOF */
+        errno = ENOENT;
+        /* Remember EOF offset for later detection */
+        fctx->dir_eof = last_off;
+    }
+out:
+    return count;
+}
+
+/*
+ * Hard-link fpath to the current base index file of subdir.
+ *
+ * If the first link attempt fails with ENOENT the index directory is
+ * created; if it fails with EMLINK a new base index is generated. In both
+ * cases the base file is then created (if needed) and the link retried.
+ *
+ * Returns 0 on success (including when fpath already exists) and a
+ * negative errno value on failure.
+ */
+int
+index_link_to_base(xlator_t *this, char *fpath, const char *subdir)
+{
+    int ret = 0;
+    int fd = 0;
+    int op_errno = 0;
+    uuid_t index = {0};
+    index_priv_t *priv = this->private;
+    char base[PATH_MAX] = {0};
+
+    index_get_index(priv, index);
+    make_index_path(priv->index_basepath, subdir, index, base, sizeof(base));
+
+    ret = sys_link(base, fpath);
+    if (!ret || (errno == EEXIST)) {
+        ret = 0;
+        goto out;
+    }
+
+    op_errno = errno;
+    if (op_errno == ENOENT) {
+        ret = index_dir_create(this, subdir);
+        if (ret) {
+            /* index_dir_create() returns -2 when the path exists but is
+             * not a directory. That is detected after a successful
+             * stat(), so errno carries no useful value and could turn
+             * this failure into a 0 (success) return. */
+            if (ret == -2)
+                op_errno = ENOTDIR;
+            else
+                op_errno = errno;
+            goto out;
+        }
+    } else if (op_errno == EMLINK) {
+        /* The base file ran out of links: switch to a new base index. */
+        index_generate_index(priv, index);
+        make_index_path(priv->index_basepath, subdir, index, base,
+                        sizeof(base));
+    } else {
+        goto out;
+    }
+
+    op_errno = 0;
+    /* Another thread may create the base file first; EEXIST is fine. */
+    fd = sys_creat(base, 0);
+    if ((fd < 0) && (errno != EEXIST)) {
+        op_errno = errno;
+        gf_msg(this->name, GF_LOG_ERROR, op_errno, INDEX_MSG_INDEX_ADD_FAILED,
+               "%s: Not able to "
+               "create index",
+               fpath);
+        goto out;
+    }
+
+    if (fd >= 0)
+        sys_close(fd);
+
+    ret = sys_link(base, fpath);
+    if (ret && (errno != EEXIST)) {
+        op_errno = errno;
+        gf_msg(this->name, GF_LOG_ERROR, op_errno, INDEX_MSG_INDEX_ADD_FAILED,
+               "%s: Not able to "
+               "add to index",
+               fpath);
+        goto out;
+    }
+out:
+    return -op_errno;
+}
+
+/*
+ * Add gfid to the index directory subdir by linking
+ * "<index_basepath>/<subdir>/<gfid>" to the base index file.
+ *
+ * Returns 0 if the index file already exists or was linked, -1 for a null
+ * gfid, otherwise the result of index_link_to_base(). The type argument is
+ * not used by this function body.
+ */
+int
+index_add(xlator_t *this, uuid_t gfid, const char *subdir,
+          index_xattrop_type_t type)
+{
+    index_priv_t *priv = this->private;
+    char index_file[PATH_MAX] = {0};
+    struct stat sbuf = {0};
+
+    if (gf_uuid_is_null(gfid)) {
+        GF_ASSERT(0);
+        return -1;
+    }
+
+    make_gfid_path(priv->index_basepath, subdir, gfid, index_file,
+                   sizeof(index_file));
+
+    /* Nothing to do when the index entry is already present. */
+    if (sys_stat(index_file, &sbuf) == 0)
+        return 0;
+
+    return index_link_to_base(this, index_file, subdir);
+}
+
+/*
+ * Remove the index entry of gfid from subdir.
+ *
+ * For ENTRY_CHANGES_SUBDIR the entry is a directory and is removed with
+ * rmdir (or renamed away when not empty, see below); for every other
+ * subdir it is a file and is unlinked. On success, which includes the
+ * entry not existing, the pending link count for type is decremented.
+ *
+ * Returns 0 on success and a negative errno value on failure.
+ *
+ * NOTE(review): ret is still 0 when the null-gfid check jumps to out
+ * (op_errno is marked unused), so that path appears to return 0 --
+ * confirm against the definition of GF_ASSERT_AND_GOTO_WITH_ERROR.
+ */
+int
+index_del(xlator_t *this, uuid_t gfid, const char *subdir, int type)
+{
+    int32_t op_errno __attribute__((unused)) = 0;
+    index_priv_t *priv = NULL;
+    int ret = 0;
+    char gfid_path[PATH_MAX] = {0};
+    char rename_dst[PATH_MAX] = {
+        0,
+    };
+    uuid_t uuid;
+
+    priv = this->private;
+    GF_ASSERT_AND_GOTO_WITH_ERROR(this->name, !gf_uuid_is_null(gfid), out,
+                                  op_errno, EINVAL);
+    make_gfid_path(priv->index_basepath, subdir, gfid, gfid_path,
+                   sizeof(gfid_path));
+
+    if ((strcmp(subdir, ENTRY_CHANGES_SUBDIR)) == 0) {
+        ret = sys_rmdir(gfid_path);
+        /* rmdir above could fail with ENOTEMPTY if the indices under
+         * it were created when granular-entry-heal was enabled, whereas
+         * the actual heal that happened was non-granular (or full) in
+         * nature, resulting in name indices getting left out. To
+         * clean up this directory without it affecting the IO path perf,
+         * the directory is renamed to a unique name under
+         * indices/entry-changes. Self-heal will pick up this entry
+         * during crawl and on lookup into the file system figure that
+         * the index is stale and subsequently wipe it out using rmdir().
+         */
+        if ((ret) && (errno == ENOTEMPTY)) {
+            gf_uuid_generate(uuid);
+            make_gfid_path(priv->index_basepath, subdir, uuid, rename_dst,
+                           sizeof(rename_dst));
+            ret = sys_rename(gfid_path, rename_dst);
+        }
+    } else {
+        ret = sys_unlink(gfid_path);
+    }
+
+    /* A missing entry (ENOENT) counts as already deleted. */
+    if (ret && (errno != ENOENT)) {
+        gf_msg(this->name, GF_LOG_ERROR, errno, INDEX_MSG_INDEX_DEL_FAILED,
+               "%s: failed to delete"
+               " from index",
+               gfid_path);
+        ret = -errno;
+        goto out;
+    }
+
+    index_dec_link_count(priv, type);
+    ret = 0;
+out:
+    return ret;
+}
+
+/*
+ * dict match callback: true when the string passed in tmp starts with the
+ * dictionary key k (the comparison covers strlen(k) bytes). d and v are
+ * not used.
+ */
+static gf_boolean_t
+_is_xattr_in_watchlist(dict_t *d, char *k, data_t *v, void *tmp)
+{
+    const char *candidate = tmp;
+
+    return (strncmp(k, candidate, strlen(k)) == 0) ? _gf_true : _gf_false;
+}
+
+/*
+ * dict match callback: true when key pattern-matches at least one xattr
+ * name stored in the watchlist dictionary passed as matchdata. this and
+ * value are not used.
+ */
+static gf_boolean_t
+is_xattr_in_watchlist(dict_t *this, char *key, data_t *value, void *matchdata)
+{
+    /* dict_foreach_match() yields the number of watchlist entries that
+     * _is_xattr_in_watchlist() accepted for this key; 0 means none. */
+    int matches = dict_foreach_match(matchdata, _is_xattr_in_watchlist, key,
+                                     dict_null_foreach_fn, NULL);
+
+    return (matches > 0) ? _gf_true : _gf_false;
+}
+
+/*
+ * Classify xattr key k: DIRTY if it matches the dirty watchlist, XATTROP
+ * if it matches the pending watchlist, -1 otherwise. The dirty watchlist
+ * is consulted first.
+ */
+static int
+index_find_xattr_type(dict_t *d, char *k, data_t *v)
+{
+    index_priv_t *priv = THIS->private;
+
+    if (priv->dirty_watchlist &&
+        is_xattr_in_watchlist(d, k, v, priv->dirty_watchlist))
+        return DIRTY;
+
+    if (priv->pending_watchlist &&
+        is_xattr_in_watchlist(d, k, v, priv->pending_watchlist))
+        return XATTROP;
+
+    return -1;
+}
+
+/*
+ * dict_foreach callback: adata is an int array indexed by xattr type.
+ * For a key that belongs to a watched type the matching slot is set to 0;
+ * slots of types that never show up keep their previous value.
+ * Always returns 0.
+ */
+int
+index_fill_zero_array(dict_t *d, char *k, data_t *v, void *adata)
+{
+    int *zfilled = adata;
+    int type = index_find_xattr_type(d, k, v);
+
+    if (type != -1)
+        zfilled[type] = 0;
+
+    return 0;
+}
+
+/*
+ * dict match action: fold the value of one watched xattr into the
+ * per-type zfilled array passed in tmp. Keys that belong to no watched
+ * type are ignored. Always returns 0.
+ *
+ * NOTE(review): this relies on mem_0filled() returning non-zero when the
+ * buffer is NOT all zeroes -- confirm against its definition.
+ */
+static int
+_check_key_is_zero_filled(dict_t *d, char *k, data_t *v, void *tmp)
+{
+    int *zfilled = tmp;
+    int idx = -1;
+
+    idx = index_find_xattr_type(d, k, v);
+    if (idx == -1)
+        return 0;
+
+    /* Along with checking that the value of a key is zero filled
+     * the key's corresponding index should be assigned
+     * appropriate value.
+     * zfilled[idx] will be 0(false) if value not zero.
+     *              will be 1(true) if value is zero.
+     */
+    if (mem_0filled((const char *)v->data, v->len)) {
+        zfilled[idx] = 0;
+        return 0;
+    }
+
+    /* If zfilled[idx] was previously 0, it means at least
+     * one xattr of its "kind" is non-zero. Keep its value
+     * the same.
+     */
+    /* Any other previous value (the caller initialises slots to -1)
+     * becomes 1 here. */
+    if (zfilled[idx])
+        zfilled[idx] = 1;
+    return 0;
+}
+
+/*
+ * Create the name index "<entry-changes>/<inode gfid>/<filename>".
+ *
+ * The per-parent directory is created first unless the inode context
+ * already records it as present, then the entry is hard-linked to the
+ * base index file via index_link_to_base().
+ *
+ * Returns 0 on success and a negative errno value on failure. EINVAL is
+ * used for a missing inode context, a filename containing '/', or a path
+ * that does not fit in PATH_MAX.
+ */
+int
+index_entry_create(xlator_t *this, inode_t *inode, char *filename)
+{
+    int ret = -1;
+    int op_errno = 0;
+    char pgfid_path[PATH_MAX] = {0};
+    char entry_path[PATH_MAX] = {0};
+    index_priv_t *priv = NULL;
+    index_inode_ctx_t *ctx = NULL;
+    int32_t len = 0;
+
+    priv = this->private;
+
+    GF_ASSERT_AND_GOTO_WITH_ERROR(this->name, !gf_uuid_is_null(inode->gfid),
+                                  out, op_errno, EINVAL);
+    GF_ASSERT_AND_GOTO_WITH_ERROR(this->name, filename, out, op_errno, EINVAL);
+
+    ret = index_inode_ctx_get(inode, this, &ctx);
+    if (ret) {
+        op_errno = EINVAL;
+        gf_msg(this->name, GF_LOG_ERROR, op_errno,
+               INDEX_MSG_INODE_CTX_GET_SET_FAILED,
+               "Not able to get inode ctx for %s", uuid_utoa(inode->gfid));
+        goto out;
+    }
+
+    make_gfid_path(priv->index_basepath, ENTRY_CHANGES_SUBDIR, inode->gfid,
+                   pgfid_path, sizeof(pgfid_path));
+
+    /* Skip the mkdir when the cached state says the directory exists. */
+    if (ctx->state[ENTRY_CHANGES] != IN) {
+        ret = sys_mkdir(pgfid_path, 0600);
+        if (ret != 0 && errno != EEXIST) {
+            op_errno = errno;
+            goto out;
+        }
+        ctx->state[ENTRY_CHANGES] = IN;
+    }
+
+    /* The entry must be a plain name; a '/' would escape pgfid_path. */
+    if (strchr(filename, '/')) {
+        gf_msg(this->name, GF_LOG_ERROR, EINVAL, INDEX_MSG_INDEX_ADD_FAILED,
+               "Got invalid entry (%s) for pargfid path (%s)", filename,
+               pgfid_path);
+        op_errno = EINVAL;
+        goto out;
+    }
+
+    len = snprintf(entry_path, sizeof(entry_path), "%s/%s", pgfid_path,
+                   filename);
+    /* Reject an snprintf error or a truncated path. */
+    if ((len < 0) || (len >= sizeof(entry_path))) {
+        op_errno = EINVAL;
+        goto out;
+    }
+
+    op_errno = 0;
+
+    ret = index_link_to_base(this, entry_path, ENTRY_CHANGES_SUBDIR);
+out:
+    /* An error recorded in op_errno takes precedence over ret. */
+    if (op_errno)
+        ret = -op_errno;
+    return ret;
+}
+
+/*
+ * Remove the name index "<entry-changes>/<pgfid>/<filename>".
+ *
+ * Returns 0 on success (a missing entry counts as success) and a negative
+ * errno value on failure. EINVAL is used for a filename containing '/' or
+ * a path that does not fit in PATH_MAX.
+ */
+int
+index_entry_delete(xlator_t *this, uuid_t pgfid, char *filename)
+{
+    int ret = 0;
+    int op_errno = 0;
+    char pgfid_path[PATH_MAX] = {0};
+    char entry_path[PATH_MAX] = {0};
+    index_priv_t *priv = NULL;
+    int32_t len = 0;
+
+    priv = this->private;
+
+    GF_ASSERT_AND_GOTO_WITH_ERROR(this->name, !gf_uuid_is_null(pgfid), out,
+                                  op_errno, EINVAL);
+    GF_ASSERT_AND_GOTO_WITH_ERROR(this->name, filename, out, op_errno, EINVAL);
+
+    make_gfid_path(priv->index_basepath, ENTRY_CHANGES_SUBDIR, pgfid,
+                   pgfid_path, sizeof(pgfid_path));
+
+    /* The entry must be a plain name; a '/' would escape pgfid_path. */
+    if (strchr(filename, '/')) {
+        gf_msg(this->name, GF_LOG_ERROR, EINVAL, INDEX_MSG_INDEX_DEL_FAILED,
+               "Got invalid entry (%s) for pargfid path (%s)", filename,
+               pgfid_path);
+        op_errno = EINVAL;
+        goto out;
+    }
+
+    len = snprintf(entry_path, sizeof(entry_path), "%s/%s", pgfid_path,
+                   filename);
+    /* Reject an snprintf error or a truncated path. */
+    if ((len < 0) || (len >= sizeof(entry_path))) {
+        op_errno = EINVAL;
+        goto out;
+    }
+
+    ret = sys_unlink(entry_path);
+    /* ENOENT is not an error: the entry is already gone. */
+    if (ret && (errno != ENOENT)) {
+        op_errno = errno;
+        gf_msg(this->name, GF_LOG_ERROR, op_errno, INDEX_MSG_INDEX_DEL_FAILED,
+               "%s: failed to delete from index/entry-changes", entry_path);
+    }
+
+out:
+    return -op_errno;
+}
+
+/*
+ * Look up key in xdata and, when it carries a file name, create
+ * (GF_XATTROP_ENTRY_IN_KEY) or delete (GF_XATTROP_ENTRY_OUT_KEY) the
+ * matching name index under inode.
+ *
+ * Returns 0 when the key is absent or is neither of the two keys,
+ * otherwise the result of the create/delete helper.
+ */
+int
+index_entry_action(xlator_t *this, inode_t *inode, dict_t *xdata, char *key)
+{
+    char *entry_name = NULL;
+
+    if (dict_get_str(xdata, key, &entry_name) != 0)
+        return 0;
+
+    if (strcmp(key, GF_XATTROP_ENTRY_IN_KEY) == 0)
+        return index_entry_create(this, inode, entry_name);
+
+    if (strcmp(key, GF_XATTROP_ENTRY_OUT_KEY) == 0)
+        return index_entry_delete(this, inode->gfid, entry_name);
+
+    return 0;
+}
+
+/*
+ * Bring the on-disk index of inode in line with zfilled[], one slot per
+ * index type:
+ *   1 - remove the gfid from that index (unless already known NOTIN),
+ *   0 - add the gfid to that index (unless already known IN),
+ *   anything else - leave that index alone.
+ * The cached state in the inode context is updated only when the add or
+ * delete succeeded.
+ */
+void
+_index_action(xlator_t *this, inode_t *inode, int *zfilled)
+{
+    index_inode_ctx_t *ctx = NULL;
+    char *subdir = NULL;
+    int type = 0;
+
+    if (index_inode_ctx_get(inode, this, &ctx)) {
+        gf_msg(this->name, GF_LOG_ERROR, EINVAL,
+               INDEX_MSG_INODE_CTX_GET_SET_FAILED,
+               "Not able to get"
+               " inode context for %s.",
+               uuid_utoa(inode->gfid));
+        return;
+    }
+
+    for (type = 0; type < XATTROP_TYPE_END; type++) {
+        subdir = index_get_subdir_from_type(type);
+
+        switch (zfilled[type]) {
+            case 1:
+                if (ctx->state[type] != NOTIN &&
+                    index_del(this, inode->gfid, subdir, type) == 0)
+                    ctx->state[type] = NOTIN;
+                break;
+            case 0:
+                if (ctx->state[type] != IN &&
+                    index_add(this, inode->gfid, subdir, type) == 0)
+                    ctx->state[type] = IN;
+                break;
+            default:
+                break;
+        }
+    }
+}
+
+/*
+ * Initialise ctx->state[ENTRY_CHANGES] from disk: IN when
+ * "<index_basepath>/<subdir>/<inode gfid>" exists, NOTIN when stat fails
+ * with ENOENT. Any other stat failure leaves the state untouched.
+ */
+static void
+index_init_state(xlator_t *this, inode_t *inode, index_inode_ctx_t *ctx,
+                 char *subdir)
+{
+    index_priv_t *priv = this->private;
+    char index_path[PATH_MAX] = {0};
+    struct stat sbuf = {0};
+
+    make_gfid_path(priv->index_basepath, subdir, inode->gfid, index_path,
+                   sizeof(index_path));
+
+    if (sys_stat(index_path, &sbuf) == 0)
+        ctx->state[ENTRY_CHANGES] = IN;
+    else if (errno == ENOENT)
+        ctx->state[ENTRY_CHANGES] = NOTIN;
+}
+
+/*
+ * Callback-side index maintenance after a successful xattrop.
+ *
+ * 1. Classify the returned xattrs (those selected by match/match_data)
+ *    per index type and add/remove the gfid index accordingly.
+ * 2. If the request xdata is present, process its
+ *    GF_XATTROP_ENTRY_OUT_KEY entry; the rest is skipped unless
+ *    GF_XATTROP_PURGE_INDEX is set to a non-zero value.
+ * 3. For a directory whose XATTROP xattrs are all zero, remove its
+ *    entry-changes index directory if it is present.
+ *
+ * Failures are not reported to the caller.
+ */
+void
+xattrop_index_action(xlator_t *this, index_local_t *local, dict_t *xattr,
+                     dict_match_t match, void *match_data)
+{
+    int ret = 0;
+    int zfilled[XATTROP_TYPE_END] = {
+        0,
+    };
+    int8_t value = 0;
+    char *subdir = NULL;
+    dict_t *req_xdata = NULL;
+    inode_t *inode = NULL;
+    index_inode_ctx_t *ctx = NULL;
+
+    inode = local->inode;
+    req_xdata = local->xdata;
+
+    /* -1 in a slot means "no xattr of this type was seen". */
+    memset(zfilled, -1, sizeof(zfilled));
+    ret = dict_foreach_match(xattr, match, match_data,
+                             _check_key_is_zero_filled, zfilled);
+    _index_action(this, inode, zfilled);
+
+    if (req_xdata) {
+        ret = index_entry_action(this, inode, req_xdata,
+                                 GF_XATTROP_ENTRY_OUT_KEY);
+
+        ret = dict_get_int8(req_xdata, GF_XATTROP_PURGE_INDEX, &value);
+        if ((ret) || (value == 0))
+            goto out;
+    }
+
+    if (zfilled[XATTROP] != 1)
+        goto out;
+
+    if (inode->ia_type != IA_IFDIR)
+        goto out;
+
+    subdir = index_get_subdir_from_type(ENTRY_CHANGES);
+    ret = index_inode_ctx_get(inode, this, &ctx);
+    if (ret || !ctx) {
+        /* Without a context there is no state to consult; bail out
+         * instead of dereferencing a NULL ctx below. */
+        gf_msg(this->name, GF_LOG_ERROR, EINVAL,
+               INDEX_MSG_INODE_CTX_GET_SET_FAILED,
+               "Not able to get"
+               " inode context for %s.",
+               uuid_utoa(inode->gfid));
+        goto out;
+    }
+    if (ctx->state[ENTRY_CHANGES] == UNKNOWN)
+        index_init_state(this, inode, ctx, subdir);
+    if (ctx->state[ENTRY_CHANGES] == IN) {
+        ret = index_del(this, inode->gfid, subdir, ENTRY_CHANGES);
+        ctx->state[ENTRY_CHANGES] = NOTIN;
+    }
+
+out:
+    return;
+}
+
+/*
+ * Decide whether an xattrop has to go through the index machinery:
+ *   - GF_XATTROP_ADD_ARRAY: always;
+ *   - GF_XATTROP_ADD_ARRAY64: only if a pending watchlist is configured
+ *     and at least one key in dict matches it;
+ *   - any other flag: never.
+ */
+static gf_boolean_t
+index_xattrop_track(xlator_t *this, gf_xattrop_flags_t flags, dict_t *dict)
+{
+    index_priv_t *priv = this->private;
+    int matched = 0;
+
+    switch (flags) {
+        case GF_XATTROP_ADD_ARRAY:
+            return _gf_true;
+        case GF_XATTROP_ADD_ARRAY64:
+            break;
+        default:
+            return _gf_false;
+    }
+
+    if (!priv->pending_watchlist)
+        return _gf_false;
+
+    matched = dict_foreach_match(dict, is_xattr_in_watchlist,
+                                 priv->pending_watchlist, dict_null_foreach_fn,
+                                 NULL);
+
+    return (matched > 0) ? _gf_true : _gf_false;
+}
+
+/*
+ * Build in dirpath (len bytes) the on-disk directory that backs the
+ * internal (virtual) inode.
+ *
+ * If the inode's gfid is one of the virtual gfids the result is
+ * "<index_basepath>/<subdir>"; otherwise it is
+ * "<index_basepath>/<ENTRY_CHANGES_SUBDIR>/<virtual_pargfid>" taken from
+ * the inode context.
+ *
+ * Returns 0 on success, -EINVAL when the inode is not an internal one,
+ * has no virtual parent gfid, or the path would not fit in len bytes, and
+ * the result of index_inode_ctx_get() when that call fails.
+ */
+int
+index_inode_path(xlator_t *this, inode_t *inode, char *dirpath, size_t len)
+{
+    char *subdir = NULL;
+    int ret = 0;
+    index_priv_t *priv = NULL;
+    index_inode_ctx_t *ictx = NULL;
+
+    priv = this->private;
+    if (!index_is_fop_on_internal_inode(this, inode, NULL)) {
+        ret = -EINVAL;
+        goto out;
+    }
+
+    subdir = index_get_subdir_from_vgfid(priv, inode->gfid);
+    if (subdir) {
+        /* Make sure base + '/' + subdir + NUL fits before formatting. */
+        if (len <= strlen(priv->index_basepath) + 1 /*'/'*/ + strlen(subdir)) {
+            ret = -EINVAL;
+            goto out;
+        }
+        make_index_dir_path(priv->index_basepath, subdir, dirpath, len);
+    } else {
+        ret = index_inode_ctx_get(inode, this, &ictx);
+        if (ret)
+            goto out;
+        if (gf_uuid_is_null(ictx->virtual_pargfid)) {
+            ret = -EINVAL;
+            goto out;
+        }
+        make_index_dir_path(priv->index_basepath, ENTRY_CHANGES_SUBDIR, dirpath,
+                            len);
+        /* The strcat calls below are unchecked, so verify up front that
+         * '/' + a uuid string + NUL still fit. */
+        if (len <= strlen(dirpath) + 1 /*'/'*/ + SLEN(UUID0_STR)) {
+            ret = -EINVAL;
+            goto out;
+        }
+        strcat(dirpath, "/");
+        strcat(dirpath, uuid_utoa(ictx->virtual_pargfid));
+    }
+out:
+    return ret;
+}
+
+/*
+ * Return in *ctx the index fd context of fd, creating it on first use.
+ *
+ * Creation resolves the inode to its index directory, opens it with
+ * sys_opendir() and stores the new context in the fd. The __ variants of
+ * the fd ctx accessors are used; the index_fd_ctx_get() wrapper in this
+ * file calls this function with fd->lock held.
+ *
+ * Returns 0 on success, otherwise a non-zero error: the result of
+ * index_inode_path(), -ENOMEM, -errno from opendir, or -EINVAL when the
+ * context cannot be stored. *ctx is only written on success.
+ */
+int
+__index_fd_ctx_get(fd_t *fd, xlator_t *this, index_fd_ctx_t **ctx)
+{
+    int ret = 0;
+    index_fd_ctx_t *fctx = NULL;
+    uint64_t tmpctx = 0;
+    char dirpath[PATH_MAX] = {0};
+
+    /* Fast path: a context already exists. */
+    ret = __fd_ctx_get(fd, this, &tmpctx);
+    if (!ret) {
+        fctx = (index_fd_ctx_t *)(long)tmpctx;
+        *ctx = fctx;
+        goto out;
+    }
+
+    ret = index_inode_path(this, fd->inode, dirpath, sizeof(dirpath));
+    if (ret)
+        goto out;
+
+    fctx = GF_CALLOC(1, sizeof(*fctx), gf_index_fd_ctx_t);
+    if (!fctx) {
+        ret = -ENOMEM;
+        goto out;
+    }
+
+    fctx->dir = sys_opendir(dirpath);
+    if (!fctx->dir) {
+        ret = -errno;
+        GF_FREE(fctx);
+        fctx = NULL;
+        goto out;
+    }
+    /* No EOF offset has been observed yet. */
+    fctx->dir_eof = -1;
+
+    ret = __fd_ctx_set(fd, this, (uint64_t)(long)fctx);
+    if (ret) {
+        /* Undo the opendir and the allocation. */
+        (void)sys_closedir(fctx->dir);
+        GF_FREE(fctx);
+        fctx = NULL;
+        ret = -EINVAL;
+        goto out;
+    }
+    *ctx = fctx;
+out:
+    return ret;
+}
+
+/*
+ * Locked wrapper around __index_fd_ctx_get(): fetches (or creates) the
+ * index fd context of fd while holding fd->lock and returns its result.
+ */
+int
+index_fd_ctx_get(fd_t *fd, xlator_t *this, index_fd_ctx_t **ctx)
+{
+    int result = 0;
+
+    LOCK(&fd->lock);
+    result = __index_fd_ctx_get(fd, this, ctx);
+    UNLOCK(&fd->lock);
+
+    return result;
+}
+
+// Serialise xattrop stubs per inode: at most one stub of an inode is
+// resumed at a time, the others wait on the inode context's queue.
+//
+// new - Not NULL means start a fop
+// new - NULL means done processing the fop
+//
+// If the inode context cannot be obtained while starting a fop, the fop is
+// unwound with ENOMEM and the stub destroyed.
+void
+index_queue_process(xlator_t *this, inode_t *inode, call_stub_t *new)
+{
+    call_stub_t *stub = NULL;
+    index_inode_ctx_t *ctx = NULL;
+    int ret = 0;
+    call_frame_t *frame = NULL;
+
+    LOCK(&inode->lock);
+    {
+        ret = __index_inode_ctx_get(inode, this, &ctx);
+        if (ret)
+            goto unlock;
+
+        if (new) {
+            /* Queue the new stub; clearing 'new' marks it as handed over
+             * so the error branch below does not touch it. */
+            __index_enqueue(&ctx->callstubs, new);
+            new = NULL;
+        } else {
+            /* The fop that was in flight has finished. */
+            ctx->processing = _gf_false;
+        }
+
+        /* If nothing is in flight, pick the next queued stub (if any). */
+        if (!ctx->processing) {
+            stub = __index_dequeue(&ctx->callstubs);
+            if (stub)
+                ctx->processing = _gf_true;
+            else
+                ctx->processing = _gf_false;
+        }
+    }
+unlock:
+    UNLOCK(&inode->lock);
+
+    /* Unwinding and resuming both happen outside inode->lock. */
+    if (ret && new) {
+        frame = new->frame;
+        if (new->fop == GF_FOP_XATTROP) {
+            INDEX_STACK_UNWIND(xattrop, frame, -1, ENOMEM, NULL, NULL);
+        } else if (new->fop == GF_FOP_FXATTROP) {
+            INDEX_STACK_UNWIND(fxattrop, frame, -1, ENOMEM, NULL, NULL);
+        }
+        call_stub_destroy(new);
+    } else if (stub) {
+        call_resume(stub);
+    }
+    return;
+}
+
+/*
+ * Common callback for xattrop/fxattrop winds issued by this xlator.
+ *
+ * On success the index is updated from the returned xattrs (filtered by
+ * match/matchdata). The fop is then unwound and the next stub queued on
+ * the inode, if any, is started. Always returns 0.
+ */
+static int
+xattrop_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret,
+            int32_t op_errno, dict_t *xattr, dict_t *xdata, dict_match_t match,
+            dict_t *matchdata)
+{
+    index_local_t *local = frame->local;
+    /* Hold our own inode reference: it is still needed after the unwind
+     * below, for index_queue_process(). */
+    inode_t *inode = inode_ref(local->inode);
+
+    if (op_ret >= 0)
+        xattrop_index_action(this, local, xattr, match, matchdata);
+
+    INDEX_STACK_UNWIND(xattrop, frame, op_ret, op_errno, xattr, xdata);
+    index_queue_process(this, inode, NULL);
+    inode_unref(inode);
+
+    return 0;
+}
+
+/*
+ * xattrop callback that evaluates the returned xattrs against
+ * priv->complete_watchlist; index_xattrop_do() selects it for
+ * GF_XATTROP_ADD_ARRAY. Always returns 0.
+ */
+int32_t
+index_xattrop_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+                  int32_t op_ret, int32_t op_errno, dict_t *xattr,
+                  dict_t *xdata)
+{
+    index_priv_t *priv = this->private;
+
+    xattrop_cbk(frame, cookie, this, op_ret, op_errno, xattr, xdata,
+                is_xattr_in_watchlist, priv->complete_watchlist);
+    return 0;
+}
+
+/*
+ * xattrop callback that evaluates the returned xattrs against
+ * priv->pending_watchlist only; index_xattrop_do() selects it for every
+ * optype other than GF_XATTROP_ADD_ARRAY. Returns the result of
+ * xattrop_cbk().
+ */
+int32_t
+index_xattrop64_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+                    int32_t op_ret, int32_t op_errno, dict_t *xattr,
+                    dict_t *xdata)
+{
+    index_priv_t *priv = this->private;
+
+    return xattrop_cbk(frame, cookie, this, op_ret, op_errno, xattr, xdata,
+                       is_xattr_in_watchlist, priv->pending_watchlist);
+}
+
+/*
+ * Wind-side half of a tracked xattrop/fxattrop.
+ *
+ * Adds the inode's gfid to the relevant indices before the operation is
+ * sent down, processes the GF_XATTROP_ENTRY_IN_KEY request (if xdata is
+ * given) and then winds to the child: xattrop when loc is set, fxattrop
+ * on fd otherwise. If the pre-wind step fails, the callback is invoked
+ * directly with the error and nothing is wound.
+ */
+void
+index_xattrop_do(call_frame_t *frame, xlator_t *this, loc_t *loc, fd_t *fd,
+                 gf_xattrop_flags_t optype, dict_t *xattr, dict_t *xdata)
+{
+    int ret = -1;
+    int zfilled[XATTROP_TYPE_END] = {
+        0,
+    };
+    index_local_t *local = NULL;
+    fop_xattrop_cbk_t x_cbk = NULL;
+
+    local = frame->local;
+
+    if (optype == GF_XATTROP_ADD_ARRAY)
+        x_cbk = index_xattrop_cbk;
+    else
+        x_cbk = index_xattrop64_cbk;
+
+    // In wind phase bring the gfid into index. This way if the brick crashes
+    // just after posix performs xattrop before _cbk reaches index xlator
+    // we will still have the gfid in index.
+    memset(zfilled, -1, sizeof(zfilled));
+
+    /* Foreach xattr in the request, index_fill_zero_array() sets the
+     * slot of its type in zfilled to 0; _index_action() treats 0 as
+     * "make sure the gfid is present in that type's subdir".
+     *
+     * Slots of types that do not occur in xattr stay at -1 and are left
+     * alone. This is to distinguish between different types of volumes.
+     * For e.g., if the check is not made and zfilled[DIRTY] were set
+     * for EC volumes, index file will be tried to create in
+     * indices/dirty dir which doesn't exist for an EC volume.
+     */
+    ret = dict_foreach(xattr, index_fill_zero_array, zfilled);
+
+    _index_action(this, local->inode, zfilled);
+    if (xdata)
+        ret = index_entry_action(this, local->inode, xdata,
+                                 GF_XATTROP_ENTRY_IN_KEY);
+    if (ret < 0) {
+        /* ret is a negative errno; hand it to the callback as op_errno. */
+        x_cbk(frame, NULL, this, -1, -ret, NULL, NULL);
+        return;
+    }
+
+    if (loc)
+        STACK_WIND(frame, x_cbk, FIRST_CHILD(this),
+                   FIRST_CHILD(this)->fops->xattrop, loc, optype, xattr, xdata);
+    else
+        STACK_WIND(frame, x_cbk, FIRST_CHILD(this),
+                   FIRST_CHILD(this)->fops->fxattrop, fd, optype, xattr, xdata);
+}
+
+/*
+ * Resume function of the stub created in index_xattrop(): runs the
+ * path-based operation through index_xattrop_do(). Always returns 0.
+ */
+int
+index_xattrop_wrapper(call_frame_t *frame, xlator_t *this, loc_t *loc,
+                      gf_xattrop_flags_t optype, dict_t *xattr, dict_t *xdata)
+{
+    index_xattrop_do(frame, this, loc, NULL, optype, xattr, xdata);
+    return 0;
+}
+
+/*
+ * Resume function of the stub created in index_fxattrop(): runs the
+ * fd-based operation through index_xattrop_do(). Always returns 0.
+ */
+int
+index_fxattrop_wrapper(call_frame_t *frame, xlator_t *this, fd_t *fd,
+                       gf_xattrop_flags_t optype, dict_t *xattr, dict_t *xdata)
+{
+    index_xattrop_do(frame, this, NULL, fd, optype, xattr, xdata);
+    return 0;
+}
+
+/*
+ * xattrop fop entry point.
+ *
+ * Operations that index_xattrop_track() does not select are wound
+ * straight to the child with the default callback. Tracked operations get
+ * a local (holding refs on the inode and the request xdata) and a call
+ * stub, which is queued per inode through index_queue_process().
+ * If the local or the stub cannot be allocated the fop is unwound with
+ * ENOMEM. Always returns 0.
+ */
+int32_t
+index_xattrop(call_frame_t *frame, xlator_t *this, loc_t *loc,
+              gf_xattrop_flags_t flags, dict_t *dict, dict_t *xdata)
+{
+    call_stub_t *stub = NULL;
+    index_local_t *local = NULL;
+
+    if (!index_xattrop_track(this, flags, dict))
+        goto out;
+
+    local = mem_get0(this->local_pool);
+    if (!local)
+        goto err;
+
+    frame->local = local;
+    local->inode = inode_ref(loc->inode);
+    if (xdata)
+        local->xdata = dict_ref(xdata);
+    stub = fop_xattrop_stub(frame, index_xattrop_wrapper, loc, flags, dict,
+                            xdata);
+
+    /* Reached both by fall-through (success or stub failure) and by the
+     * goto above; the check below tells the cases apart. */
+err:
+    if ((!local) || (!stub)) {
+        INDEX_STACK_UNWIND(xattrop, frame, -1, ENOMEM, NULL, NULL);
+        return 0;
+    }
+
+    index_queue_process(this, loc->inode, stub);
+    return 0;
+out:
+    /* Untracked operation: pass through untouched. */
+    STACK_WIND(frame, default_xattrop_cbk, FIRST_CHILD(this),
+               FIRST_CHILD(this)->fops->xattrop, loc, flags, dict, xdata);
+    return 0;
+}
+
+/* fxattrop fop entry point (fd-based twin of index_xattrop()).
+ *
+ * Requests that index_xattrop_track() does not select are wound straight to
+ * the child. Tracked requests get an index_local_t (holding a ref on the
+ * fd's inode and, if present, on xdata) plus a call stub that resumes in
+ * index_fxattrop_wrapper(); the stub is handed to index_queue_process().
+ * If either allocation fails the fop is unwound with ENOMEM.
+ * Always returns 0. */
+int32_t
+index_fxattrop(call_frame_t *frame, xlator_t *this, fd_t *fd,
+               gf_xattrop_flags_t flags, dict_t *dict, dict_t *xdata)
+{
+    call_stub_t *stub = NULL;
+    index_local_t *local = NULL;
+
+    if (!index_xattrop_track(this, flags, dict))
+        goto out;
+
+    local = mem_get0(this->local_pool);
+    if (!local)
+        goto err;
+
+    frame->local = local;
+    local->inode = inode_ref(fd->inode);
+    if (xdata)
+        local->xdata = dict_ref(xdata);
+    stub = fop_fxattrop_stub(frame, index_fxattrop_wrapper, fd, flags, dict,
+                             xdata);
+
+err:
+    if ((!local) || (!stub)) {
+        /* The request xdata is not a response dictionary; return NULL for
+         * both dicts, exactly as index_xattrop() does on this path. */
+        INDEX_STACK_UNWIND(fxattrop, frame, -1, ENOMEM, NULL, NULL);
+        return 0;
+    }
+
+    index_queue_process(this, fd->inode, stub);
+    return 0;
+out:
+    /* Not tracked by this translator: plain pass-through. */
+    STACK_WIND(frame, default_fxattrop_cbk, FIRST_CHILD(this),
+               FIRST_CHILD(this)->fops->fxattrop, fd, flags, dict, xdata);
+    return 0;
+}
+
+/* Count the entries in the index directory for @subdir.
+ *
+ * "." and "..", and any entry whose name begins with the @subdir string, are
+ * not counted. Returns 0 when the directory cannot be opened; a readdir
+ * error simply ends the scan and the count so far is returned. */
+uint64_t
+index_entry_count(xlator_t *this, char *subdir)
+{
+    index_priv_t *priv = this->private;
+    char dir_path[PATH_MAX] = {
+        0,
+    };
+    struct dirent scratch[2] = {
+        {
+            0,
+        },
+    };
+    struct dirent *de = NULL;
+    DIR *dp = NULL;
+    size_t prefix_len = 0;
+    uint64_t nr_entries = 0;
+
+    make_index_dir_path(priv->index_basepath, subdir, dir_path,
+                        sizeof(dir_path));
+
+    dp = sys_opendir(dir_path);
+    if (dp == NULL)
+        return 0;
+
+    prefix_len = strlen(subdir);
+
+    while (1) {
+        /* errno is cleared so end-of-directory and error can both be
+         * detected after the call. */
+        errno = 0;
+        de = sys_readdir(dp, scratch);
+        if ((de == NULL) || (errno != 0))
+            break;
+
+        if (!strcmp(de->d_name, ".") || !strcmp(de->d_name, ".."))
+            continue;
+
+        /* Names starting with the subdir name are excluded from the count. */
+        if (strncmp(de->d_name, subdir, prefix_len) == 0)
+            continue;
+
+        nr_entries++;
+    }
+
+    (void)sys_closedir(dp);
+
+    return nr_entries;
+}
+
+/* Resume function for the getxattr call stub created in index_getxattr().
+ *
+ * Builds the reply dictionary locally:
+ *  - if @name maps to a virtual-gfid type, the matching internal gfid is
+ *    returned under @name;
+ *  - if @name is GF_XATTROP_INDEX_COUNT or GF_XATTROP_DIRTY_COUNT, the entry
+ *    count of the corresponding index subdirectory is returned under @name.
+ * Unwinds with (0, 0) on success or (-1, errno) on failure; always returns 0. */
+int32_t
+index_getxattr_wrapper(call_frame_t *frame, xlator_t *this, loc_t *loc,
+                       const char *name, dict_t *xdata)
+{
+    index_priv_t *priv = this->private;
+    dict_t *rsp = NULL;
+    int ret = 0;
+    int vtype = 0;
+    uint64_t nr_entries = 0;
+    gf_boolean_t want_index = _gf_false;
+    gf_boolean_t want_dirty = _gf_false;
+
+    rsp = dict_new();
+    if (rsp == NULL) {
+        ret = -ENOMEM;
+        goto done;
+    }
+
+    vtype = index_get_type_from_vgfid_xattr(name);
+    if (vtype >= 0) {
+        ret = dict_set_static_bin(rsp, (char *)name,
+                                  priv->internal_vgfid[vtype],
+                                  sizeof(priv->internal_vgfid[vtype]));
+        if (ret != 0) {
+            ret = -EINVAL;
+            gf_msg(this->name, GF_LOG_ERROR, -ret, INDEX_MSG_DICT_SET_FAILED,
+                   "xattrop index "
+                   "gfid set failed");
+            goto done;
+        }
+    }
+
+    /* TODO: Need to check what kind of link-counts are needed for
+     * ENTRY-CHANGES before refactor of this block with array*/
+    want_index = (strcmp(name, GF_XATTROP_INDEX_COUNT) == 0);
+    if (!want_index)
+        want_dirty = (strcmp(name, GF_XATTROP_DIRTY_COUNT) == 0);
+
+    if (!want_index && !want_dirty)
+        goto done;
+
+    nr_entries = index_entry_count(this,
+                                   want_index ? XATTROP_SUBDIR : DIRTY_SUBDIR);
+
+    ret = dict_set_uint64(rsp, (char *)name, nr_entries);
+    if (ret != 0) {
+        ret = -EINVAL;
+        if (want_index) {
+            gf_msg(this->name, GF_LOG_ERROR, -ret, INDEX_MSG_DICT_SET_FAILED,
+                   "xattrop index "
+                   "count set failed");
+        } else {
+            gf_msg(this->name, GF_LOG_ERROR, -ret, INDEX_MSG_DICT_SET_FAILED,
+                   "dirty index "
+                   "count set failed");
+        }
+    }
+
+done:
+    /* ret is either 0 or a negative errno at this point. */
+    STACK_UNWIND_STRICT(getxattr, frame, ret ? -1 : 0, ret ? -ret : 0, rsp,
+                        NULL);
+
+    if (rsp != NULL)
+        dict_unref(rsp);
+
+    return 0;
+}
+
+/* When @loc's parent gfid equals the ENTRY_CHANGES virtual gfid, parse
+ * loc->name as a uuid and store it as virtual_pargfid in the inode context
+ * of loc->inode.
+ *
+ * Returns 0 on success or when the parent is not the ENTRY_CHANGES virtual
+ * directory, -1 when @loc is NULL, and -EINVAL when the inode context cannot
+ * be obtained or the name does not parse. @path is used for logging only. */
+static int
+index_save_pargfid_for_entry_changes(xlator_t *this, loc_t *loc, char *path)
+{
+    index_priv_t *priv = this->private;
+    index_inode_ctx_t *ictx = NULL;
+
+    if (loc == NULL)
+        return -1;
+
+    /* A non-zero compare result means the parent is some other directory. */
+    if (gf_uuid_compare(loc->pargfid, priv->internal_vgfid[ENTRY_CHANGES]) != 0)
+        return 0;
+
+    if (index_inode_ctx_get(loc->inode, this, &ictx) != 0) {
+        gf_msg(this->name, GF_LOG_ERROR, EINVAL,
+               INDEX_MSG_INODE_CTX_GET_SET_FAILED,
+               "Unable to get inode context for %s", path);
+        return -EINVAL;
+    }
+
+    if (gf_uuid_parse(loc->name, ictx->virtual_pargfid) != 0) {
+        gf_msg(this->name, GF_LOG_ERROR, EINVAL,
+               INDEX_MSG_INODE_CTX_GET_SET_FAILED,
+               "Unable to store "
+               "virtual gfid in inode context for %s",
+               path);
+        return -EINVAL;
+    }
+
+    return 0;
+}
+
+/* Resume function for the lookup call stub created in index_lookup().
+ *
+ * Resolves @loc to a path inside the on-disk index tree and answers the
+ * lookup from an lstat() of that path. Three cases are handled:
+ *  - the parent is an internal index inode: path = <parent path>/<name>;
+ *  - loc->gfid is one of the virtual gfids: path = the index subdirectory;
+ *  - otherwise: path derived from the (possibly re-found) inode itself.
+ * If the request carries GF_INDEX_IA_TYPE_GET_REQ for a virtual gfid, the
+ * reply carries GF_INDEX_IA_TYPE_GET_RSP (IA_IFDIR for the entry-changes
+ * subdir, IA_IFREG otherwise).
+ * Unwinds the lookup with op_ret 0 on success or -1 plus an errno; always
+ * returns 0. */
+int32_t
+index_lookup_wrapper(call_frame_t *frame, xlator_t *this, loc_t *loc,
+                     dict_t *xattr_req)
+{
+    index_priv_t *priv = NULL;
+    struct stat lstatbuf = {0};
+    int ret = 0;
+    int32_t op_errno = EINVAL;
+    int32_t op_ret = -1;
+    uint64_t val = IA_INVAL;
+    char path[PATH_MAX] = {0};
+    size_t len = 0;
+    struct iatt stbuf = {
+        0,
+    };
+    struct iatt postparent = {
+        0,
+    };
+    dict_t *xattr = NULL;
+    gf_boolean_t is_dir = _gf_false;
+    char *subdir = NULL;
+    loc_t iloc = {0};
+
+    priv = this->private;
+
+    /* Validate loc before it is handed to loc_copy(); iloc stays zeroed on
+     * the failure path, which loc_wipe() at 'done' is given as-is. */
+    VALIDATE_OR_GOTO(loc, done);
+    loc_copy(&iloc, loc);
+
+    if (index_is_fop_on_internal_inode(this, loc->parent, loc->pargfid)) {
+        subdir = index_get_subdir_from_vgfid(priv, loc->pargfid);
+        ret = index_inode_path(this, loc->parent, path, sizeof(path));
+        if (ret < 0) {
+            op_errno = -ret;
+            goto done;
+        }
+
+        /* Measure the prefix once, before appending, so the truncation
+         * check compares against the space that was actually available. */
+        len = strlen(path);
+        ret = snprintf(path + len, sizeof(path) - len, "/%s", loc->name);
+
+        /* snprintf() returns the length it wanted to write; anything that
+         * does not leave room for the terminator was truncated. */
+        if ((ret < 0) || ((size_t)ret >= (sizeof(path) - len))) {
+            op_errno = EINVAL;
+            op_ret = -1;
+            goto done;
+        }
+
+    } else if (index_is_virtual_gfid(priv, loc->gfid)) {
+        subdir = index_get_subdir_from_vgfid(priv, loc->gfid);
+        make_index_dir_path(priv->index_basepath, subdir, path, sizeof(path));
+        is_dir = _gf_true;
+
+        if ((xattr_req) && (dict_get(xattr_req, GF_INDEX_IA_TYPE_GET_REQ))) {
+            if (0 == strcmp(subdir, index_get_subdir_from_type(ENTRY_CHANGES)))
+                val = IA_IFDIR;
+            else
+                val = IA_IFREG;
+        }
+    } else {
+        /* Prefer the inode already linked in the table for this gfid over
+         * the unlinked one that came with the request. */
+        if (!inode_is_linked(loc->inode)) {
+            inode_unref(iloc.inode);
+            iloc.inode = inode_find(loc->inode->table, loc->gfid);
+        }
+        ret = index_inode_path(this, iloc.inode, path, sizeof(path));
+        if (ret < 0) {
+            op_errno = -ret;
+            goto done;
+        }
+    }
+    ret = sys_lstat(path, &lstatbuf);
+    if (ret) {
+        gf_msg_debug(this->name, errno, "Stat failed on %s dir ", path);
+        op_errno = errno;
+        goto done;
+    } else if (!S_ISDIR(lstatbuf.st_mode) && is_dir) {
+        op_errno = ENOTDIR;
+        gf_msg_debug(this->name, op_errno,
+                     "Stat failed on %s dir, "
+                     "not a directory",
+                     path);
+        goto done;
+    }
+    xattr = dict_new();
+    if (!xattr) {
+        op_errno = ENOMEM;
+        goto done;
+    }
+
+    if (val != IA_INVAL) {
+        ret = dict_set_uint64(xattr, GF_INDEX_IA_TYPE_GET_RSP, val);
+        if (ret) {
+            op_ret = -1;
+            op_errno = -ret;
+            goto done;
+        }
+    }
+
+    iatt_from_stat(&stbuf, &lstatbuf);
+    /* Virtual directories and linked inodes report their known gfid;
+     * anything else gets a freshly generated one. */
+    if (is_dir || inode_is_linked(iloc.inode))
+        loc_gfid(&iloc, stbuf.ia_gfid);
+    else
+        gf_uuid_generate(stbuf.ia_gfid);
+
+    ret = index_save_pargfid_for_entry_changes(this, &iloc, path);
+    if (ret) {
+        op_ret = -1;
+        op_errno = -ret;
+        goto done;
+    }
+
+    stbuf.ia_ino = -1;
+    op_ret = 0;
+done:
+    STACK_UNWIND_STRICT(lookup, frame, op_ret, op_errno,
+                        loc ? loc->inode : NULL, &stbuf, xattr, &postparent);
+    if (xattr)
+        dict_unref(xattr);
+    loc_wipe(&iloc);
+    return 0;
+}
+
+/* Synctask body: fill in d_type / d_stat.ia_type for each directory entry in
+ * args->entries whose name parses as a gfid.
+ *
+ * @opaque is a struct index_syncop_args; its 'entries' list and 'parent'
+ * inode are read. Entries that are ".", "..", or not a parseable uuid keep
+ * (or are reset to) the IA_INVAL type. Always returns 0. */
+int
+index_get_gfid_type(void *opaque)
+{
+    gf_dirent_t *entry = NULL;
+    xlator_t *this = THIS;
+    struct index_syncop_args *args = opaque;
+    loc_t loc = {0};
+    struct iatt iatt = {0};
+    int ret = 0;
+
+    list_for_each_entry(entry, &args->entries->list, list)
+    {
+        if (strcmp(entry->d_name, ".") == 0 || strcmp(entry->d_name, "..") == 0)
+            continue;
+
+        /* Drop whatever the previous iteration left in loc (including an
+         * inode obtained below) before reusing it. */
+        loc_wipe(&loc);
+
+        /* Default to "unknown type" until one of the lookups succeeds. */
+        entry->d_type = gf_d_type_from_ia_type(IA_INVAL);
+        entry->d_stat.ia_type = IA_INVAL;
+        if (gf_uuid_parse(entry->d_name, loc.gfid))
+            continue;
+
+        /* Fast path: the inode is already in the parent's inode table. */
+        loc.inode = inode_find(args->parent->table, loc.gfid);
+        if (loc.inode) {
+            entry->d_stat.ia_type = loc.inode->ia_type;
+            entry->d_type = gf_d_type_from_ia_type(loc.inode->ia_type);
+            continue;
+        }
+        /* Slow path: look the gfid up on the child translator. */
+        loc.inode = inode_new(args->parent->table);
+        if (!loc.inode)
+            continue;
+        ret = syncop_lookup(FIRST_CHILD(this), &loc, &iatt, 0, 0, 0);
+        if (ret == 0) {
+            entry->d_type = gf_d_type_from_ia_type(iatt.ia_type);
+            entry->d_stat = iatt;
+        }
+    }
+    /* Release the loc left over from the final iteration. */
+    loc_wipe(&loc);
+
+    return 0;
+}
+
+/* Resume function for the readdir call stub created in index_readdir().
+ *
+ * Reads entries from the DIR stream stored in the fd context via
+ * index_fill_readdir() and unwinds the readdir with the entry count as
+ * op_ret and the errno observed right after the fill as op_errno. When the
+ * fd refers to a virtual-gfid directory and @xdata contains "get-gfid-type",
+ * index_get_gfid_type() is run as a synctask over the entries first.
+ * Always returns 0. */
+int32_t
+index_readdir_wrapper(call_frame_t *frame, xlator_t *this, fd_t *fd,
+                      size_t size, off_t off, dict_t *xdata)
+{
+    index_fd_ctx_t *fctx = NULL;
+    index_priv_t *priv = NULL;
+    DIR *dir = NULL;
+    int ret = -1;
+    int32_t op_ret = -1;
+    int32_t op_errno = 0;
+    int count = 0;
+    gf_dirent_t entries;
+    struct index_syncop_args args = {0};
+
+    priv = this->private;
+    INIT_LIST_HEAD(&entries.list);
+
+    ret = index_fd_ctx_get(fd, this, &fctx);
+    if (ret < 0) {
+        op_errno = -ret;
+        gf_msg(this->name, GF_LOG_WARNING, op_errno, INDEX_MSG_FD_OP_FAILED,
+               "pfd is NULL, fd=%p", fd);
+        goto done;
+    }
+
+    dir = fctx->dir;
+    if (!dir) {
+        op_errno = EINVAL;
+        gf_msg(this->name, GF_LOG_WARNING, op_errno,
+               INDEX_MSG_INDEX_READDIR_FAILED, "dir is NULL for fd=%p", fd);
+        goto done;
+    }
+
+    count = index_fill_readdir(fd, fctx, dir, off, size, &entries);
+
+    /* pick ENOENT to indicate EOF */
+    /* errno must be captured immediately, before any later call can
+     * overwrite it. */
+    op_errno = errno;
+    op_ret = count;
+    if (index_is_virtual_gfid(priv, fd->inode->gfid) && xdata &&
+        dict_get(xdata, "get-gfid-type")) {
+        args.parent = fd->inode;
+        args.entries = &entries;
+        /* NOTE(review): the synctask result stored in ret is not examined;
+         * a failure here leaves the entry types unset but is not reported. */
+        ret = synctask_new(this->ctx->env, index_get_gfid_type, NULL, NULL,
+                           &args);
+    }
+done:
+    STACK_UNWIND_STRICT(readdir, frame, op_ret, op_errno, &entries, NULL);
+    gf_dirent_free(&entries);
+    return 0;
+}
+
+/* nftw() callback used by index_wipe_index_subdir().
+ *
+ * Unlinks regular files and removes directories; any other file type is
+ * left in place and logged as a warning. Removal results are not checked.
+ * Always returns 0 so the tree walk continues. */
+int
+deletion_handler(const char *fpath, const struct stat *sb, int typeflag,
+                 struct FTW *ftwbuf)
+{
+    mode_t file_fmt = sb->st_mode & S_IFMT;
+
+    if (file_fmt == S_IFREG) {
+        sys_unlink(fpath);
+        return 0;
+    }
+
+    if (file_fmt == S_IFDIR) {
+        sys_rmdir(fpath);
+        return 0;
+    }
+
+    gf_msg(THIS->name, GF_LOG_WARNING, EINVAL, INDEX_MSG_INVALID_ARGS,
+           "%s neither a regular file nor a directory - type:%s", fpath,
+           gf_inode_type_to_str(ia_type_from_st_mode(sb->st_mode)));
+    return 0;
+}
+
+/* Synctask body: walk the tree rooted at args->path and hand every entry to
+ * deletion_handler(). @opaque is a struct index_syncop_args.
+ * FTW_DEPTH visits a directory's contents before the directory itself and
+ * FTW_PHYS keeps the walk from following symbolic links.
+ * The nftw() result is ignored; always returns 0. */
+static int
+index_wipe_index_subdir(void *opaque)
+{
+    struct index_syncop_args *args = opaque;
+
+    nftw(args->path, deletion_handler, 1, FTW_DEPTH | FTW_PHYS);
+    return 0;
+}
+
+/* Fill @parent from an lstat() of @path, stamping it with loc->pargfid as
+ * gfid and -1 as inode number.
+ *
+ * On lstat failure *op_ret is set to -1 and *op_errno to errno, and @parent
+ * is left untouched; on success neither output is modified. */
+static void
+index_get_parent_iatt(struct iatt *parent, char *path, loc_t *loc,
+                      int32_t *op_ret, int32_t *op_errno)
+{
+    struct stat st = {
+        0,
+    };
+
+    if (sys_lstat(path, &st) < 0) {
+        *op_ret = -1;
+        *op_errno = errno;
+        return;
+    }
+
+    iatt_from_stat(parent, &st);
+    parent->ia_ino = -1;
+    gf_uuid_copy(parent->ia_gfid, loc->pargfid);
+}
+
+/* Resume function for the rmdir call stub created in index_rmdir().
+ *
+ * Removes the index entry named loc->name (expected to be a gfid string)
+ * from the index subdirectory identified by loc->pargfid. With @flag == 0
+ * the entry is removed through index_del(); otherwise the whole
+ * per-gfid subtree is wiped by index_wipe_index_subdir() run as a synctask.
+ * Parent iatts are sampled before and after the removal and returned in the
+ * unwind. Always returns 0. */
+int
+index_rmdir_wrapper(call_frame_t *frame, xlator_t *this, loc_t *loc, int flag,
+                    dict_t *xdata)
+{
+    int ret = 0;
+    int32_t op_ret = 0;
+    int32_t op_errno = 0;
+    char *subdir = NULL;
+    char index_dir[PATH_MAX] = {0};
+    char index_subdir[PATH_MAX] = {0};
+    uuid_t gfid = {0};
+    struct iatt preparent = {0};
+    struct iatt postparent = {0};
+    index_priv_t *priv = NULL;
+    index_xattrop_type_t type = XATTROP_TYPE_UNSET;
+    struct index_syncop_args args = {
+        0,
+    };
+
+    priv = this->private;
+
+    type = index_get_type_from_vgfid(priv, loc->pargfid);
+    subdir = index_get_subdir_from_vgfid(priv, loc->pargfid);
+    make_index_dir_path(priv->index_basepath, subdir, index_dir,
+                        sizeof(index_dir));
+
+    index_get_parent_iatt(&preparent, index_dir, loc, &op_ret, &op_errno);
+    if (op_ret < 0)
+        goto done;
+
+    /* NOTE(review): the parse result is not checked; if loc->name is not a
+     * valid uuid, gfid presumably stays all-zero — confirm the callees
+     * reject that. */
+    gf_uuid_parse(loc->name, gfid);
+    make_gfid_path(priv->index_basepath, subdir, gfid, index_subdir,
+                   sizeof(index_subdir));
+
+    if (flag == 0) {
+        ret = index_del(this, gfid, subdir, type);
+        if (ret < 0) {
+            op_ret = -1;
+            op_errno = -ret;
+            goto done;
+        }
+    } else {
+        args.path = index_subdir;
+        /* NOTE(review): ret from synctask_new() is never examined, so a
+         * failed wipe is still reported to the caller as success. */
+        ret = synctask_new(this->ctx->env, index_wipe_index_subdir, NULL, NULL,
+                           &args);
+    }
+
+    index_get_parent_iatt(&postparent, index_dir, loc, &op_ret, &op_errno);
+    if (op_ret < 0)
+        goto done;
+
+done:
+    INDEX_STACK_UNWIND(rmdir, frame, op_ret, op_errno, &preparent, &postparent,
+                       xdata);
+    return 0;
+}
+
+/* Resume function for the unlink call stub created in index_unlink().
+ *
+ * Removes the index entry loc->name from the internal directory loc->parent.
+ * The action depends on the index type derived from loc->pargfid:
+ *  - no virtual-gfid type (<= XATTROP_TYPE_UNSET): the parent is a per-gfid
+ *    directory; the entry is deleted via index_entry_delete() using the
+ *    virtual_pargfid stored in the parent's inode context;
+ *  - ENTRY_CHANGES: the file under the entry-changes subdir is unlinked;
+ *  - any other type: loc->name is parsed as a gfid and passed to index_del().
+ * Parent iatts are sampled before and after the removal and returned in the
+ * unwind. Always returns 0. */
+int
+index_unlink_wrapper(call_frame_t *frame, xlator_t *this, loc_t *loc, int flag,
+                     dict_t *xdata)
+{
+    index_priv_t *priv = NULL;
+    index_inode_ctx_t *ictx = NULL;
+    int32_t op_ret = 0;
+    int32_t op_errno = 0;
+    int ret = 0;
+    index_xattrop_type_t type = XATTROP_TYPE_UNSET;
+    struct iatt preparent = {0};
+    struct iatt postparent = {0};
+    char index_dir[PATH_MAX] = {0};
+    char filepath[PATH_MAX] = {0};
+    uuid_t gfid = {0};
+    char *subdir = NULL;
+
+    priv = this->private;
+    type = index_get_type_from_vgfid(priv, loc->pargfid);
+    ret = index_inode_path(this, loc->parent, index_dir, sizeof(index_dir));
+    if (ret < 0) {
+        op_ret = -1;
+        op_errno = -ret;
+        goto done;
+    }
+
+    index_get_parent_iatt(&preparent, index_dir, loc, &op_ret, &op_errno);
+    if (op_ret < 0)
+        goto done;
+
+    if (type <= XATTROP_TYPE_UNSET) {
+        ret = index_inode_ctx_get(loc->parent, this, &ictx);
+        if ((ret == 0) && gf_uuid_is_null(ictx->virtual_pargfid)) {
+            ret = -EINVAL;
+        }
+        if (ret == 0) {
+            ret = index_entry_delete(this, ictx->virtual_pargfid,
+                                     (char *)loc->name);
+        }
+    } else if (type == ENTRY_CHANGES) {
+        make_file_path(priv->index_basepath, ENTRY_CHANGES_SUBDIR,
+                       (char *)loc->name, filepath, sizeof(filepath));
+        ret = sys_unlink(filepath);
+        /* The sys_* wrappers report failure as -1 with errno set (see the
+         * sys_lstat handling in index_get_parent_iatt()); convert to the
+         * negative-errno form the common error check below expects, so
+         * the caller does not receive op_errno == 1. */
+        if (ret < 0)
+            ret = -errno;
+    } else {
+        subdir = index_get_subdir_from_type(type);
+        gf_uuid_parse(loc->name, gfid);
+        ret = index_del(this, gfid, subdir, type);
+    }
+    if (ret < 0) {
+        op_ret = -1;
+        op_errno = -ret;
+        goto done;
+    }
+
+    index_get_parent_iatt(&postparent, index_dir, loc, &op_ret, &op_errno);
+    if (op_ret < 0)
+        goto done;
+done:
+    INDEX_STACK_UNWIND(unlink, frame, op_ret, op_errno, &preparent, &postparent,
+                       xdata);
+    return 0;
+}
+
+/* getxattr fop entry point.
+ *
+ * Only three kinds of names are served by this translator: virtual-gfid
+ * xattrs, GF_XATTROP_INDEX_COUNT and GF_XATTROP_DIRTY_COUNT. Those are
+ * queued to the worker as a stub resuming in index_getxattr_wrapper();
+ * everything else (including a NULL name) is wound to the child.
+ * Always returns 0. */
+int32_t
+index_getxattr(call_frame_t *frame, xlator_t *this, loc_t *loc,
+               const char *name, dict_t *xdata)
+{
+    call_stub_t *stub = NULL;
+    gf_boolean_t handled_here = _gf_false;
+
+    if (name != NULL) {
+        handled_here = index_is_vgfid_xattr(name) ||
+                       (strcmp(GF_XATTROP_INDEX_COUNT, name) == 0) ||
+                       (strcmp(GF_XATTROP_DIRTY_COUNT, name) == 0);
+    }
+
+    if (!handled_here) {
+        STACK_WIND(frame, default_getxattr_cbk, FIRST_CHILD(this),
+                   FIRST_CHILD(this)->fops->getxattr, loc, name, xdata);
+        return 0;
+    }
+
+    stub = fop_getxattr_stub(frame, index_getxattr_wrapper, loc, name, xdata);
+    if (stub == NULL) {
+        STACK_UNWIND_STRICT(getxattr, frame, -1, ENOMEM, NULL, NULL);
+        return 0;
+    }
+
+    worker_enqueue(this, stub);
+    return 0;
+}
+
+/* Scan the index subdirectory for @type and report a link count derived from
+ * the hard-link count of its entries.
+ *
+ * Return value, as produced by the loop below:
+ *   -1  the directory could not be opened;
+ *    0  the scan finished and either no entry other than "."/".." was seen
+ *       or the last entry examined had st_nlink == 1;
+ *   >0  (st_nlink - 1) of the first entry found with more than one link
+ *       (the scan stops there);
+ *   -2  the scan finished and the last entry examined could not be
+ *       lstat()ed. */
+int64_t
+index_fetch_link_count(xlator_t *this, index_xattrop_type_t type)
+{
+    index_priv_t *priv = this->private;
+    char *subdir = NULL;
+    struct stat lstatbuf = {
+        0,
+    };
+    int ret = -1;
+    int64_t count = -1;
+    DIR *dirp = NULL;
+    struct dirent *entry = NULL;
+    struct dirent scratch[2] = {
+        {
+            0,
+        },
+    };
+    char index_dir[PATH_MAX] = {
+        0,
+    };
+    char index_path[PATH_MAX] = {
+        0,
+    };
+
+    subdir = index_get_subdir_from_type(type);
+    make_index_dir_path(priv->index_basepath, subdir, index_dir,
+                        sizeof(index_dir));
+
+    dirp = sys_opendir(index_dir);
+    if (!dirp)
+        goto out;
+
+    for (;;) {
+        /* errno is cleared so end-of-directory and error can both be
+         * detected after the call. */
+        errno = 0;
+        entry = sys_readdir(dirp, scratch);
+        if (!entry || errno != 0) {
+            /* count is still -1 only if no entry was examined at all;
+             * report that as zero links. */
+            if (count == -1)
+                count = 0;
+            goto out;
+        }
+
+        if (strcmp(entry->d_name, ".") == 0 || strcmp(entry->d_name, "..") == 0)
+            continue;
+
+        make_file_path(priv->index_basepath, subdir, entry->d_name, index_path,
+                       sizeof(index_path));
+
+        ret = sys_lstat(index_path, &lstatbuf);
+        if (ret < 0) {
+            /* -2 marks "an entry exists but could not be examined"; a
+             * later entry may still overwrite it. */
+            count = -2;
+            continue;
+        } else {
+            /* Links beyond the entry's own name. Keep scanning while this
+             * is zero; stop at the first entry that has extra links. */
+            count = lstatbuf.st_nlink - 1;
+            if (count == 0)
+                continue;
+            else
+                break;
+        }
+    }
+out:
+    if (dirp)
+        (void)sys_closedir(dirp);
+    return count;
+}
+
+/* Return a referenced dictionary carrying the "link-count" key.
+ *
+ * If @xdata is given, a new reference on it is taken; otherwise a new
+ * dictionary is created (NULL is returned if that fails). The cached XATTROP
+ * link count is used, and refreshed from disk through
+ * index_fetch_link_count() when the cached value is negative. "link-count"
+ * is set to 0 when the count is zero and to 1 otherwise; a failure to set it
+ * is logged but the dictionary is still returned. The caller owns the
+ * returned reference. */
+dict_t *
+index_fill_link_count(xlator_t *this, dict_t *xdata)
+{
+    index_priv_t *priv = this->private;
+    int64_t links = -1;
+    int8_t flag = 0;
+
+    if (xdata != NULL)
+        xdata = dict_ref(xdata);
+    else
+        xdata = dict_new();
+
+    if (xdata == NULL)
+        return xdata;
+
+    index_get_link_count(priv, &links, XATTROP);
+    if (links < 0) {
+        /* Cache not populated (or invalidated): recompute and store. */
+        links = index_fetch_link_count(this, XATTROP);
+        index_set_link_count(priv, links, XATTROP);
+    }
+
+    /* Only "zero" versus "non-zero" is reported to the caller. */
+    flag = (links == 0) ? 0 : 1;
+    if (dict_set_int8(xdata, "link-count", flag) < 0) {
+        gf_msg(this->name, GF_LOG_ERROR, EINVAL, INDEX_MSG_DICT_SET_FAILED,
+               "Unable to set link-count");
+    }
+
+    return xdata;
+}
+
+/* Lookup callback used when the request asked for the index link count:
+ * adds "link-count" to the reply dictionary through index_fill_link_count()
+ * and passes every other argument through unchanged. The reference returned
+ * by index_fill_link_count() is dropped after the unwind.
+ * Always returns 0. */
+int32_t
+index_lookup_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+                 int32_t op_ret, int32_t op_errno, inode_t *inode,
+                 struct iatt *buf, dict_t *xdata, struct iatt *postparent)
+{
+    xdata = index_fill_link_count(this, xdata);
+    STACK_UNWIND_STRICT(lookup, frame, op_ret, op_errno, inode, buf, xdata,
+                        postparent);
+    if (xdata)
+        dict_unref(xdata);
+    return 0;
+}
+
+/* lookup fop entry point.
+ *
+ * A lookup is served by this translator (queued to the worker as a stub
+ * resuming in index_lookup_wrapper()) when the parent or the inode itself is
+ * an internal index inode, or when loc->inode is not yet linked and the
+ * inode found in the table for loc->gfid is internal. All other lookups are
+ * wound to the child; if the request's "link-count" key equals
+ * GF_XATTROP_INDEX_COUNT the reply goes through index_lookup_cbk() so the
+ * link count gets attached. Always returns 0. */
+int32_t
+index_lookup(call_frame_t *frame, xlator_t *this, loc_t *loc, dict_t *xattr_req)
+{
+    call_stub_t *stub = NULL;
+    inode_t *linked = NULL;
+    char *flag = NULL;
+    gf_boolean_t internal = _gf_false;
+    int ret = -1;
+
+    if (!index_is_fop_on_internal_inode(this, loc->parent, loc->pargfid) &&
+        !index_is_fop_on_internal_inode(this, loc->inode, loc->gfid)) {
+        if (inode_is_linked(loc->inode))
+            goto normal;
+
+        /* The request inode is unlinked: judge by the inode the table
+         * already holds for this gfid, then drop the temporary ref. */
+        linked = inode_find(loc->inode->table, loc->gfid);
+        internal = index_is_fop_on_internal_inode(this, linked, loc->gfid);
+        inode_unref(linked);
+        if (!internal)
+            goto normal;
+    }
+
+    stub = fop_lookup_stub(frame, index_lookup_wrapper, loc, xattr_req);
+    if (stub == NULL) {
+        STACK_UNWIND_STRICT(lookup, frame, -1, ENOMEM, loc->inode, NULL, NULL,
+                            NULL);
+        return 0;
+    }
+    worker_enqueue(this, stub);
+    return 0;
+
+normal:
+    ret = dict_get_str_sizen(xattr_req, "link-count", &flag);
+    if ((ret != 0) || (strcmp(flag, GF_XATTROP_INDEX_COUNT) != 0)) {
+        STACK_WIND(frame, default_lookup_cbk, FIRST_CHILD(this),
+                   FIRST_CHILD(this)->fops->lookup, loc, xattr_req);
+    } else {
+        STACK_WIND(frame, index_lookup_cbk, FIRST_CHILD(this),
+                   FIRST_CHILD(this)->fops->lookup, loc, xattr_req);
+    }
+
+    return 0;
+}
+
+/* fstat callback used when the request asked for the index link count:
+ * adds "link-count" to the reply dictionary through index_fill_link_count()
+ * and passes every other argument through unchanged. The reference returned
+ * by index_fill_link_count() is dropped after the unwind.
+ * Always returns 0. */
+int32_t
+index_fstat_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+                int32_t op_ret, int32_t op_errno, struct iatt *buf,
+                dict_t *xdata)
+{
+    xdata = index_fill_link_count(this, xdata);
+    STACK_UNWIND_STRICT(fstat, frame, op_ret, op_errno, buf, xdata);
+    if (xdata)
+        dict_unref(xdata);
+    return 0;
+}
+
+/* fstat fop entry point.
+ *
+ * Always winds to the child. When the request's "link-count" key equals
+ * GF_XATTROP_INDEX_COUNT the reply is routed through index_fstat_cbk() so
+ * the link count gets attached; otherwise the default callback is used.
+ * Always returns 0. */
+int32_t
+index_fstat(call_frame_t *frame, xlator_t *this, fd_t *fd, dict_t *xdata)
+{
+    char *flag = NULL;
+    int ret = dict_get_str(xdata, "link-count", &flag);
+
+    if ((ret != 0) || (strcmp(flag, GF_XATTROP_INDEX_COUNT) != 0)) {
+        STACK_WIND(frame, default_fstat_cbk, FIRST_CHILD(this),
+                   FIRST_CHILD(this)->fops->fstat, fd, xdata);
+        return 0;
+    }
+
+    STACK_WIND(frame, index_fstat_cbk, FIRST_CHILD(this),
+               FIRST_CHILD(this)->fops->fstat, fd, xdata);
+    return 0;
+}
+
+/* opendir fop entry point.
+ *
+ * An opendir on an internal index inode is answered immediately with
+ * success, without reaching the child; any other opendir is wound to the
+ * child. Always returns 0. */
+int32_t
+index_opendir(call_frame_t *frame, xlator_t *this, loc_t *loc, fd_t *fd,
+              dict_t *xdata)
+{
+    if (index_is_fop_on_internal_inode(this, fd->inode, NULL)) {
+        frame->local = NULL;
+        STACK_UNWIND_STRICT(opendir, frame, 0, 0, fd, NULL);
+        return 0;
+    }
+
+    STACK_WIND(frame, default_opendir_cbk, FIRST_CHILD(this),
+               FIRST_CHILD(this)->fops->opendir, loc, fd, xdata);
+    return 0;
+}
+
+/* readdir fop entry point.
+ *
+ * A readdir on an internal index inode is queued to the worker as a stub
+ * resuming in index_readdir_wrapper() (ENOMEM is unwound if the stub cannot
+ * be created); any other readdir is wound to the child.
+ * Always returns 0. */
+int32_t
+index_readdir(call_frame_t *frame, xlator_t *this, fd_t *fd, size_t size,
+              off_t off, dict_t *xdata)
+{
+    call_stub_t *stub = NULL;
+
+    if (!index_is_fop_on_internal_inode(this, fd->inode, NULL)) {
+        STACK_WIND(frame, default_readdir_cbk, FIRST_CHILD(this),
+                   FIRST_CHILD(this)->fops->readdir, fd, size, off, xdata);
+        return 0;
+    }
+
+    stub = fop_readdir_stub(frame, index_readdir_wrapper, fd, size, off, xdata);
+    if (stub == NULL) {
+        STACK_UNWIND_STRICT(readdir, frame, -1, ENOMEM, NULL, NULL);
+        return 0;
+    }
+
+    worker_enqueue(this, stub);
+    return 0;
+}
+
+/* unlink fop entry point.
+ *
+ * An unlink whose parent is an internal index inode is queued to the worker
+ * as a stub resuming in index_unlink_wrapper() (ENOMEM is unwound if the
+ * stub cannot be created); any other unlink is wound to the child.
+ * Always returns 0. */
+int
+index_unlink(call_frame_t *frame, xlator_t *this, loc_t *loc, int xflag,
+             dict_t *xdata)
+{
+    call_stub_t *stub = NULL;
+
+    if (!index_is_fop_on_internal_inode(this, loc->parent, NULL)) {
+        STACK_WIND(frame, default_unlink_cbk, FIRST_CHILD(this),
+                   FIRST_CHILD(this)->fops->unlink, loc, xflag, xdata);
+        return 0;
+    }
+
+    stub = fop_unlink_stub(frame, index_unlink_wrapper, loc, xflag, xdata);
+    if (stub == NULL) {
+        STACK_UNWIND_STRICT(unlink, frame, -1, ENOMEM, NULL, NULL, NULL);
+        return 0;
+    }
+
+    worker_enqueue(this, stub);
+    return 0;
+}
+
+/* rmdir fop entry point.
+ *
+ * An rmdir whose parent is an internal index inode is queued to the worker
+ * as a stub resuming in index_rmdir_wrapper() (ENOMEM is unwound if the stub
+ * cannot be created); any other rmdir is tail-wound to the child.
+ * Always returns 0. */
+int
+index_rmdir(call_frame_t *frame, xlator_t *this, loc_t *loc, int32_t flags,
+            dict_t *xdata)
+{
+    call_stub_t *stub = NULL;
+
+    if (!index_is_fop_on_internal_inode(this, loc->parent, NULL)) {
+        STACK_WIND_TAIL(frame, FIRST_CHILD(this),
+                        FIRST_CHILD(this)->fops->rmdir, loc, flags, xdata);
+        return 0;
+    }
+
+    stub = fop_rmdir_stub(frame, index_rmdir_wrapper, loc, flags, xdata);
+    if (stub == NULL) {
+        STACK_UNWIND_STRICT(rmdir, frame, -1, ENOMEM, NULL, NULL, NULL);
+        return 0;
+    }
+
+    worker_enqueue(this, stub);
+    return 0;
+}
+
+/* Parse the comma-separated @watchlist and merge its keys into the watchlist
+ * dictionary selected by @type (DIRTY -> priv->dirty_watchlist,
+ * XATTROP -> priv->pending_watchlist; other types are parsed but stored
+ * nowhere).
+ *
+ * Every key is inserted with the same shared dummy value. Returns 0 on
+ * success or when @watchlist is NULL, -1 on allocation failure or if the
+ * merge fails, and the dict_set() result if inserting a key fails. */
+int
+index_make_xattrop_watchlist(xlator_t *this, index_priv_t *priv,
+                             char *watchlist, index_xattrop_type_t type)
+{
+    char *list_copy = NULL;
+    char *token = NULL;
+    char *state = NULL;
+    dict_t *keys = NULL;
+    dict_t **target = NULL;
+    data_t *marker = NULL;
+    int ret = 0;
+
+    if (watchlist == NULL)
+        return 0;
+
+    /* strtok_r() writes into its input, so tokenize a private copy. */
+    list_copy = gf_strdup(watchlist);
+    if (list_copy == NULL)
+        return -1;
+
+    keys = dict_new();
+    if (keys == NULL) {
+        ret = -1;
+        goto out;
+    }
+
+    marker = int_to_data(1);
+    if (marker == NULL) {
+        ret = -1;
+        goto out;
+    }
+
+    /* Hold our own reference for the lifetime of this function. */
+    data_ref(marker);
+
+    for (token = strtok_r(list_copy, ",", &state); token != NULL;
+         token = strtok_r(NULL, ",", &state)) {
+        if (token[0] == '\0') {
+            ret = -1;
+            goto out;
+        }
+
+        ret = dict_set(keys, token, marker);
+        if (ret)
+            goto out;
+    }
+
+    if (type == DIRTY)
+        target = &priv->dirty_watchlist;
+    else if (type == XATTROP)
+        target = &priv->pending_watchlist;
+
+    if (target != NULL) {
+        *target = dict_copy_with_ref(keys, *target);
+        if (*target == NULL) {
+            ret = -1;
+            goto out;
+        }
+    }
+
+    ret = 0;
+out:
+    if (keys != NULL)
+        dict_unref(keys);
+
+    GF_FREE(list_copy);
+
+    if (marker != NULL)
+        data_unref(marker);
+
+    return ret;
+}
+
+/* Set up memory accounting for this translator with room for
+ * gf_index_mt_end + 1 allocation types; returns whatever
+ * xlator_mem_acct_init() returns. */
+int32_t
+mem_acct_init(xlator_t *this)
+{
+    return xlator_mem_acct_init(this, gf_index_mt_end + 1);
+}
+
+/* Translator init.
+ *
+ * Validates the graph position (exactly one child), allocates the private
+ * state, initialises its lock/cond/mutex, reads the "index-base" and
+ * watchlist options, generates the virtual gfids, creates the local pool
+ * and the index subdirectories, seeds the cached XATTROP link count and
+ * finally starts the worker thread.
+ *
+ * Returns 0 on success. On any failure a non-zero value is returned, all
+ * state created so far is released and this->private / this->local_pool
+ * are left NULL. */
+int
+init(xlator_t *this)
+{
+    int i = 0;
+    int ret = -1;
+    int64_t count = -1;
+    index_priv_t *priv = NULL;
+    pthread_attr_t w_attr;
+    gf_boolean_t mutex_inited = _gf_false;
+    gf_boolean_t cond_inited = _gf_false;
+    gf_boolean_t attr_inited = _gf_false;
+    char *watchlist = NULL;
+    char *dirtylist = NULL;
+    char *pendinglist = NULL;
+    char *index_base_parent = NULL;
+    char *tmp = NULL;
+
+    if (!this->children || this->children->next) {
+        gf_msg(this->name, GF_LOG_ERROR, EINVAL, INDEX_MSG_INVALID_GRAPH,
+               "'index' not configured with exactly one child");
+        goto out;
+    }
+
+    if (!this->parents) {
+        gf_msg(this->name, GF_LOG_WARNING, EINVAL, INDEX_MSG_INVALID_GRAPH,
+               "dangling volume. check volfile ");
+    }
+
+    priv = GF_CALLOC(1, sizeof(*priv), gf_index_mt_priv_t);
+    if (!priv)
+        goto out;
+
+    LOCK_INIT(&priv->lock);
+    if ((ret = pthread_cond_init(&priv->cond, NULL)) != 0) {
+        gf_msg(this->name, GF_LOG_ERROR, ret, INDEX_MSG_INVALID_ARGS,
+               "pthread_cond_init failed");
+        goto out;
+    }
+    cond_inited = _gf_true;
+
+    if ((ret = pthread_mutex_init(&priv->mutex, NULL)) != 0) {
+        gf_msg(this->name, GF_LOG_ERROR, ret, INDEX_MSG_INVALID_ARGS,
+               "pthread_mutex_init failed");
+        goto out;
+    }
+    mutex_inited = _gf_true;
+
+    if ((ret = pthread_attr_init(&w_attr)) != 0) {
+        gf_msg(this->name, GF_LOG_ERROR, ret, INDEX_MSG_INVALID_ARGS,
+               "pthread_attr_init failed");
+        goto out;
+    }
+    attr_inited = _gf_true;
+
+    ret = pthread_attr_setstacksize(&w_attr, INDEX_THREAD_STACK_SIZE);
+    if (ret == EINVAL) {
+        gf_msg(this->name, GF_LOG_WARNING, ret, INDEX_MSG_INVALID_ARGS,
+               "Using default thread stack size");
+    }
+
+    /* GF_OPTION_INIT is only handed the label, not ret, so a jump to 'out'
+     * from inside it would carry whatever ret held before. Force a failure
+     * value ahead of each use so a bad option can never be reported as a
+     * successful init. */
+    ret = -1;
+    GF_OPTION_INIT("index-base", priv->index_basepath, path, out);
+    tmp = gf_strdup(priv->index_basepath);
+    if (!tmp)
+        goto out;
+    /* dirname() may modify its argument, hence the private copy. */
+    index_base_parent = dirname(tmp);
+    if (gf_lstat_dir(index_base_parent, NULL) != 0) {
+        ret = -1;
+        gf_msg(this->name, GF_LOG_ERROR, errno,
+               INDEX_MSG_INDEX_DIR_CREATE_FAILED,
+               "Failed to find parent dir (%s) of index basepath %s.",
+               index_base_parent, priv->index_basepath);
+        goto out;
+    }
+
+    GF_OPTION_INIT("xattrop64-watchlist", watchlist, str, out);
+    ret = index_make_xattrop_watchlist(this, priv, watchlist, XATTROP);
+    if (ret)
+        goto out;
+
+    ret = -1;
+    GF_OPTION_INIT("xattrop-dirty-watchlist", dirtylist, str, out);
+    ret = index_make_xattrop_watchlist(this, priv, dirtylist, DIRTY);
+    if (ret)
+        goto out;
+
+    ret = -1;
+    GF_OPTION_INIT("xattrop-pending-watchlist", pendinglist, str, out);
+    ret = index_make_xattrop_watchlist(this, priv, pendinglist, XATTROP);
+    if (ret)
+        goto out;
+
+    /* The complete watchlist is the union of the dirty and pending ones. */
+    if (priv->dirty_watchlist)
+        priv->complete_watchlist = dict_copy_with_ref(priv->dirty_watchlist,
+                                                      priv->complete_watchlist);
+    if (priv->pending_watchlist)
+        priv->complete_watchlist = dict_copy_with_ref(priv->pending_watchlist,
+                                                      priv->complete_watchlist);
+
+    gf_uuid_generate(priv->index);
+    for (i = 0; i < XATTROP_TYPE_END; i++)
+        gf_uuid_generate(priv->internal_vgfid[i]);
+
+    INIT_LIST_HEAD(&priv->callstubs);
+    GF_ATOMIC_INIT(priv->stub_cnt, 0);
+
+    this->local_pool = mem_pool_new(index_local_t, 64);
+    if (!this->local_pool) {
+        ret = -1;
+        goto out;
+    }
+
+    this->private = priv;
+
+    ret = index_dir_create(this, XATTROP_SUBDIR);
+    if (ret < 0)
+        goto out;
+
+    /* The dirty subdir is only created when a dirty watchlist exists. */
+    if (priv->dirty_watchlist) {
+        ret = index_dir_create(this, DIRTY_SUBDIR);
+        if (ret < 0)
+            goto out;
+    }
+
+    ret = index_dir_create(this, ENTRY_CHANGES_SUBDIR);
+    if (ret < 0)
+        goto out;
+
+    /*init indices files counts*/
+    count = index_fetch_link_count(this, XATTROP);
+    index_set_link_count(priv, count, XATTROP);
+    priv->down = _gf_false;
+
+    priv->curr_count = 0;
+    ret = gf_thread_create(&priv->thread, &w_attr, index_worker, this,
+                           "idxwrker");
+    if (ret) {
+        gf_msg(this->name, GF_LOG_WARNING, ret,
+               INDEX_MSG_WORKER_THREAD_CREATE_FAILED,
+               "Failed to create worker thread, aborting");
+        goto out;
+    }
+    priv->curr_count++;
+    ret = 0;
+out:
+    GF_FREE(tmp);
+
+    if (ret) {
+        if (priv) {
+            if (cond_inited)
+                pthread_cond_destroy(&priv->cond);
+            if (mutex_inited)
+                pthread_mutex_destroy(&priv->mutex);
+            /* priv->lock is initialised right after the allocation, so
+             * it exists whenever priv does; release it as fini() would. */
+            LOCK_DESTROY(&priv->lock);
+            if (priv->dirty_watchlist)
+                dict_unref(priv->dirty_watchlist);
+            if (priv->pending_watchlist)
+                dict_unref(priv->pending_watchlist);
+            if (priv->complete_watchlist)
+                dict_unref(priv->complete_watchlist);
+            GF_FREE(priv);
+        }
+        this->private = NULL;
+        mem_pool_destroy(this->local_pool);
+        this->local_pool = NULL;
+    }
+
+    if (attr_inited)
+        pthread_attr_destroy(&w_attr);
+    return ret;
+}
+
+/*
+ * fini - tear down an index translator instance.
+ *
+ * Does nothing when this->private is unset. Otherwise marks the instance
+ * as going down, wakes every waiter on priv->cond, cleans up the worker
+ * thread, destroys the synchronisation primitives, drops the watchlist
+ * dictionaries, frees the private structure and finally releases the
+ * local memory pool.
+ */
+void
+fini(xlator_t *this)
+{
+    index_priv_t *priv = this->private;
+
+    if (priv == NULL)
+        return;
+
+    /* Announce shutdown before the worker thread is cleaned up. */
+    priv->down = _gf_true;
+    pthread_cond_broadcast(&priv->cond);
+    if (priv->thread) {
+        gf_thread_cleanup_xint(priv->thread);
+        priv->thread = 0;
+    }
+
+    this->private = NULL;
+
+    LOCK_DESTROY(&priv->lock);
+    pthread_cond_destroy(&priv->cond);
+    pthread_mutex_destroy(&priv->mutex);
+
+    if (priv->dirty_watchlist)
+        dict_unref(priv->dirty_watchlist);
+    if (priv->pending_watchlist)
+        dict_unref(priv->pending_watchlist);
+    if (priv->complete_watchlist)
+        dict_unref(priv->complete_watchlist);
+    GF_FREE(priv);
+
+    if (this->local_pool) {
+        mem_pool_destroy(this->local_pool);
+        this->local_pool = NULL;
+    }
+}
+
+/*
+ * index_forget - forget callback: detach and free the per-inode context
+ * this translator stored on @inode, if there is one. Always returns 0.
+ */
+int
+index_forget(xlator_t *this, inode_t *inode)
+{
+    uint64_t ctx_value = 0;
+
+    /* inode_ctx_del() returning 0 means a context was removed. */
+    if (inode_ctx_del(inode, this, &ctx_value) == 0)
+        GF_FREE((index_inode_ctx_t *)(long)ctx_value);
+
+    return 0;
+}
+
+/*
+ * index_releasedir - releasedir callback: remove the fd context stored by
+ * this translator, close the directory stream it holds (logging a
+ * closedir failure) and free the context. Always returns 0.
+ */
+int32_t
+index_releasedir(xlator_t *this, fd_t *fd)
+{
+    uint64_t ctx = 0;
+    index_fd_ctx_t *fctx = NULL;
+
+    if (fd_ctx_del(fd, this, &ctx) < 0)
+        return 0;
+
+    fctx = (index_fd_ctx_t *)(long)ctx;
+    if (fctx->dir && sys_closedir(fctx->dir))
+        gf_msg(this->name, GF_LOG_ERROR, errno, INDEX_MSG_FD_OP_FAILED,
+               "closedir error");
+
+    GF_FREE(fctx);
+    return 0;
+}
+
+/*
+ * index_release - release callback: remove and free the fd context stored
+ * by this translator on @fd, if any. Always returns 0.
+ */
+int32_t
+index_release(xlator_t *this, fd_t *fd)
+{
+    uint64_t ctx = 0;
+
+    if (fd_ctx_del(fd, this, &ctx) >= 0)
+        GF_FREE((index_fd_ctx_t *)(long)ctx);
+
+    return 0;
+}
+
+/*
+ * notify - event hook of the index translator.
+ *
+ * GF_EVENT_PARENT_DOWN (with victim->cleanup_starting set): wait until the
+ * pending stub counter priv->stub_cnt reaches zero before the event is
+ * passed on.
+ *
+ * GF_EVENT_CHILD_DOWN (with victim->cleanup_starting set): mark the
+ * instance down, wake the waiters on priv->cond and wait until
+ * priv->curr_count reaches zero.
+ *
+ * Every event is finally handed to default_notify(), whose result is
+ * returned. Returns 0 without forwarding when @this or its private data
+ * is missing.
+ */
+int
+notify(xlator_t *this, int event, void *data, ...)
+{
+    int ret = 0;
+    index_priv_t *priv = NULL;
+    uint64_t stub_cnt = 0;
+    xlator_t *victim = data;
+    struct timespec sleep_till = {
+        0,
+    };
+
+    if (!this)
+        return 0;
+
+    priv = this->private;
+    if (!priv)
+        return 0;
+
+    if ((event == GF_EVENT_PARENT_DOWN) && victim->cleanup_starting) {
+        stub_cnt = GF_ATOMIC_GET(priv->stub_cnt);
+
+        /* Wait for draining stub from queue before notify PARENT_DOWN */
+        pthread_mutex_lock(&priv->mutex);
+        {
+            while (stub_cnt) {
+                /* pthread_cond_timedwait() takes an absolute deadline, so
+                 * it has to be re-armed on every pass; a deadline computed
+                 * once would expire after the first second and turn this
+                 * loop into a busy spin. */
+                timespec_now_realtime(&sleep_till);
+                sleep_till.tv_sec += 1;
+                (void)pthread_cond_timedwait(&priv->cond, &priv->mutex,
+                                             &sleep_till);
+                stub_cnt = GF_ATOMIC_GET(priv->stub_cnt);
+            }
+        }
+        pthread_mutex_unlock(&priv->mutex);
+        gf_log(this->name, GF_LOG_INFO,
+               "Notify GF_EVENT_PARENT_DOWN for brick %s", victim->name);
+    }
+
+    if ((event == GF_EVENT_CHILD_DOWN) && victim->cleanup_starting) {
+        pthread_mutex_lock(&priv->mutex);
+        {
+            priv->down = _gf_true;
+            pthread_cond_broadcast(&priv->cond);
+            /* Block until the worker count drops to zero. */
+            while (priv->curr_count)
+                pthread_cond_wait(&priv->cond, &priv->mutex);
+        }
+        pthread_mutex_unlock(&priv->mutex);
+
+        gf_log(this->name, GF_LOG_INFO,
+               "Notify GF_EVENT_CHILD_DOWN for brick %s", victim->name);
+    }
+
+    ret = default_notify(this, event, data);
+    return ret;
+}
+
+/* File operations implemented by the index translator; operations not
+ * listed here are not overridden by this table. */
+struct xlator_fops fops = {
+    .xattrop = index_xattrop,
+    .fxattrop = index_fxattrop,
+
+    // interface functions follow
+    .getxattr = index_getxattr,
+    .lookup = index_lookup,
+    .opendir = index_opendir,
+    .readdir = index_readdir,
+    .unlink = index_unlink,
+    .rmdir = index_rmdir,
+    .fstat = index_fstat,
+};
+
+/* Dump operations table; left zero-initialised (no dump hooks are set). */
+struct xlator_dumpops dumpops;
+
+/* Callbacks that free the per-inode and per-fd contexts of this
+ * translator. */
+struct xlator_cbks cbks = {.forget = index_forget,
+                           .release = index_release,
+                           .releasedir = index_releasedir};
+
+/*
+ * Options accepted by the index translator. The table is terminated by an
+ * entry whose key is NULL.
+ * NOTE(review): the "{{ ... }}" default values look like templates that
+ * are substituted elsewhere - confirm against the option framework.
+ * NOTE(review): all three watchlist options share the same description
+ * text, which does not say how they differ.
+ */
+struct volume_options options[] = {
+    {.key = {"index-base"},
+     .type = GF_OPTION_TYPE_PATH,
+     .description = "path where the index files need to be stored",
+     .default_value = "{{ brick.path }}/.glusterfs/indices"},
+    {.key = {"xattrop64-watchlist"},
+     .type = GF_OPTION_TYPE_STR,
+     .description = "Comma separated list of xattrs that are watched",
+     .default_value = "trusted.ec.dirty"},
+    {.key = {"xattrop-dirty-watchlist"},
+     .type = GF_OPTION_TYPE_STR,
+     .description = "Comma separated list of xattrs that are watched",
+     .default_value = "trusted.afr.dirty"},
+    {.key = {"xattrop-pending-watchlist"},
+     .type = GF_OPTION_TYPE_STR,
+     .description = "Comma separated list of xattrs that are watched",
+     .default_value = "trusted.afr.{{ volume.name }}"},
+    {.key = {NULL}},
+};
+
+/* Translator descriptor: ties together the entry points, operation tables
+ * and option table defined in this file under the identifier "index". */
+xlator_api_t xlator_api = {
+    .init = init,
+    .fini = fini,
+    .notify = notify,
+    .mem_acct_init = mem_acct_init,
+    .op_version = {1}, /* Present from the initial version */
+    .dumpops = &dumpops,
+    .fops = &fops,
+    .cbks = &cbks,
+    .options = options,
+    .identifier = "index",
+    .category = GF_MAINTAINED,
+};
diff --git a/xlators/features/index/src/index.h b/xlators/features/index/src/index.h
new file mode 100644
index 00000000000..a2b6e6e2570
--- /dev/null
+++ b/xlators/features/index/src/index.h
@@ -0,0 +1,86 @@
+/*
+   Copyright (c) 2008-2012 Red Hat, Inc. <http://www.redhat.com>
+   This file is part of GlusterFS.
+
+   This file is licensed to you under your choice of the GNU Lesser
+   General Public License, version 3 or any later version (LGPLv3 or
+   later), or the GNU General Public License, version 2 (GPLv2), in all
+   cases as published by the Free Software Foundation.
+*/
+
+#ifndef __INDEX_H__
+#define __INDEX_H__
+
+#include <glusterfs/xlator.h>
+#include <glusterfs/call-stub.h>
+#include <glusterfs/defaults.h>
+#include <glusterfs/byte-order.h>
+#include <glusterfs/common-utils.h>
+#include "index-mem-types.h"
+
+#define INDEX_THREAD_STACK_SIZE ((size_t)(1024 * 1024))
+
+/* Tri-state membership value.
+ * NOTE(review): presumably the values stored in index_inode_ctx.state[] -
+ * confirm against index.c. */
+typedef enum { UNKNOWN, IN, NOTIN } index_state_t;
+
+/* Index categories. The values from XATTROP up to (but excluding)
+ * XATTROP_TYPE_END are used as array indices, and XATTROP_TYPE_END is the
+ * array size (see index_inode_ctx.state and index_priv.internal_vgfid). */
+typedef enum {
+    XATTROP_TYPE_UNSET = -1,
+    XATTROP,
+    DIRTY,
+    ENTRY_CHANGES,
+    XATTROP_TYPE_END
+} index_xattrop_type_t;
+
+/* Per-inode context of the index translator; freed by index_forget(). */
+typedef struct index_inode_ctx {
+    gf_boolean_t processing;
+    struct list_head callstubs;
+    int state[XATTROP_TYPE_END]; /* one slot per index_xattrop_type_t */
+    uuid_t virtual_pargfid; /* virtual gfid of dir under
+                              .glusterfs/indices/entry-changes. */
+} index_inode_ctx_t;
+
+/* Per-fd context of the index translator; freed by index_release() and
+ * index_releasedir(). */
+typedef struct index_fd_ctx {
+    DIR *dir; /* directory stream, closed in index_releasedir() when set */
+    off_t dir_eof;
+} index_fd_ctx_t;
+
+/* Private state of one index translator instance (this->private). */
+typedef struct index_priv {
+    char *index_basepath;
+    char *dirty_basepath;
+    uuid_t index; /* generated in init() */
+    gf_lock_t lock;
+    uuid_t internal_vgfid[XATTROP_TYPE_END]; /* one generated uuid per type */
+    struct list_head callstubs;
+    pthread_mutex_t mutex; /* used together with cond */
+    pthread_cond_t cond;   /* waited on in notify(), broadcast on shutdown */
+    dict_t *dirty_watchlist;
+    dict_t *pending_watchlist;
+    dict_t *complete_watchlist; /* built from the dirty and pending lists */
+    int64_t pending_count;
+    pthread_t thread;     /* worker thread created in init() */
+    gf_boolean_t down;    /* set once shutdown has started */
+    gf_atomic_t stub_cnt; /* notify() waits for this to reach zero */
+    int32_t curr_count;   /* incremented once the worker is created */
+} index_priv_t;
+
+/* Per-call state kept in frame->local; released by INDEX_STACK_UNWIND. */
+typedef struct index_local {
+    inode_t *inode; /* unref'd when the frame is unwound */
+    dict_t *xdata;  /* optional; unref'd when the frame is unwound */
+} index_local_t;
+
+/*
+ * Unwind @frame for @fop and release its index_local_t afterwards.
+ * frame->local is detached before STACK_UNWIND_STRICT so that the local
+ * can be released here after the unwind: the inode reference and the
+ * optional xdata reference are dropped and the local is returned with
+ * mem_put().
+ */
+#define INDEX_STACK_UNWIND(fop, frame, params...)                              \
+    do {                                                                       \
+        index_local_t *__local = NULL;                                         \
+        if (frame) {                                                           \
+            __local = frame->local;                                            \
+            frame->local = NULL;                                               \
+        }                                                                      \
+        STACK_UNWIND_STRICT(fop, frame, params);                               \
+        if (__local) {                                                         \
+            inode_unref(__local->inode);                                       \
+            if (__local->xdata)                                                \
+                dict_unref(__local->xdata);                                    \
+            mem_put(__local);                                                  \
+        }                                                                      \
+    } while (0)
+
+#endif
diff --git a/xlators/features/leases/Makefile.am b/xlators/features/leases/Makefile.am
new file mode 100644
index 00000000000..a985f42a877
--- /dev/null
+++ b/xlators/features/leases/Makefile.am
@@ -0,0 +1,3 @@
+SUBDIRS = src
+
+CLEANFILES =
diff --git a/xlators/features/leases/src/Makefile.am b/xlators/features/leases/src/Makefile.am
new file mode 100644
index 00000000000..a1aef10e299
--- /dev/null
+++ b/xlators/features/leases/src/Makefile.am
@@ -0,0 +1,20 @@
+if WITH_SERVER
+xlator_LTLIBRARIES = leases.la
+endif
+xlatordir = $(libdir)/glusterfs/$(PACKAGE_VERSION)/xlator/features
+
+leases_la_LDFLAGS = -module $(GF_XLATOR_DEFAULT_LDFLAGS)
+
+leases_la_SOURCES = leases.c leases-internal.c
+
+leases_la_LIBADD = $(top_builddir)/libglusterfs/src/libglusterfs.la
+
+noinst_HEADERS = leases.h leases-mem-types.h leases-messages.h
+
+AM_CPPFLAGS = $(GF_CPPFLAGS) -I$(top_srcdir)/libglusterfs/src \
+	-I$(top_srcdir)/rpc/xdr/src -I$(top_builddir)/rpc/xdr/src \
+	-I$(CONTRIBDIR)/timer-wheel
+
+AM_CFLAGS = -Wall $(GF_CFLAGS)
+
+CLEANFILES =
diff --git a/xlators/features/leases/src/leases-internal.c b/xlators/features/leases/src/leases-internal.c
new file mode 100644
index 00000000000..56dee244281
--- /dev/null
+++ b/xlators/features/leases/src/leases-internal.c
@@ -0,0 +1,1412 @@
+/*
+   Copyright (c) 2015-2016 Red Hat, Inc. <http://www.redhat.com>
+   This file is part of GlusterFS.
+
+   This file is licensed to you under your choice of the GNU Lesser
+   General Public License, version 3 or any later version (LGPLv3 or
+   later), or the GNU General Public License, version 2 (GPLv2), in all
+   cases as published by the Free Software Foundation.
+*/
+
+#ifndef _CONFIG_H
+#define _CONFIG_H
+#include "config.h"
+#endif
+
+#include "leases.h"
+
+/* Mutex locks used in this xlator and their order of acquisition:
+ * Check lease conflict:
+ *         lease_ctx lock
+ *                 add_timer => internal timer locks
+ *         lease_ctx unlock
+ *
+ * Add/remove lease:
+ *         lease_ctx lock
+ *                 add_timer => internal timer locks
+ *                 OR
+ *                 priv lock => Adding/removing to/from the cleanup client list
+ *                 priv unlock
+ *         lease_ctx unlock
+ *
+ * Timer thread:
+ *         Timer internal lock
+ *                 priv lock => By timer handler
+ *                 priv unlock
+ *         Timer internal unlock
+ *
+ * Expired recall cleanup thread:
+ *         priv lock
+ *                 priv condwait
+ *         priv unlock
+ *         lease_ctx lock
+ *                 priv lock
+ *                 priv unlock
+ *         lease_ctx unlock
+ */
+
+/*
+ * Report whether the leases option is switched on for this translator.
+ * Return Value:
+ * _gf_true  - priv->leases_enabled is set
+ * _gf_false - option is off, or @this / its private data is missing
+ */
+gf_boolean_t
+is_leases_enabled(xlator_t *this)
+{
+    leases_private_t *priv = NULL;
+
+    GF_VALIDATE_OR_GOTO("leases", this, out);
+
+    priv = (leases_private_t *)this->private;
+    if (priv)
+        return priv->leases_enabled;
+out:
+    return _gf_false;
+}
+
+/*
+ * Fetch the configured recall-lease timeout.
+ * Return Value:
+ * priv->recall_lease_timeout when the private data is available,
+ * -1 when @this or its private data is missing.
+ */
+static int32_t
+get_recall_lease_timeout(xlator_t *this)
+{
+    leases_private_t *priv = NULL;
+
+    GF_VALIDATE_OR_GOTO("leases", this, out);
+
+    priv = (leases_private_t *)this->private;
+    if (priv)
+        return priv->recall_lease_timeout;
+out:
+    return -1;
+}
+
+/*
+ * Debug-log the lease state of an inode: first the aggregate counters
+ * kept in @lease_ctx, then one line for every lease ID entry on its
+ * lease_id_list. Does nothing when @this or @lease_ctx is NULL.
+ */
+static void
+__dump_leases_info(xlator_t *this, lease_inode_ctx_t *lease_ctx)
+{
+    lease_id_entry_t *lease_entry = NULL;
+    lease_id_entry_t *tmp = NULL;
+
+    GF_VALIDATE_OR_GOTO("leases", this, out);
+    GF_VALIDATE_OR_GOTO("leases", lease_ctx, out);
+
+    gf_msg_debug(this->name, 0,
+                 "Lease held on this inode, lease_type: %d,"
+                 " lease_cnt:%" PRIu64
+                 ", RD lease:%d, RW lease:%d, "
+                 "openfd cnt:%" PRIu64,
+                 lease_ctx->lease_type, lease_ctx->lease_cnt,
+                 lease_ctx->lease_type_cnt[GF_RD_LEASE],
+                 lease_ctx->lease_type_cnt[GF_RW_LEASE], lease_ctx->openfd_cnt);
+
+    list_for_each_entry_safe(lease_entry, tmp, &lease_ctx->lease_id_list,
+                             lease_id_list)
+    {
+        /* The lease ID is copied with memcpy(LEASE_ID_SIZE) and is not
+         * guaranteed to be a NUL-terminated string, so format it with
+         * leaseid_utoa() like the other log messages in this file. */
+        gf_msg_debug(this->name, 0,
+                     "Leases held by client: %s, lease "
+                     "ID:%s, RD lease:%d, RW lease:%d, lease_type: %d, "
+                     "lease_cnt:%" PRIu64,
+                     lease_entry->client_uid,
+                     leaseid_utoa(lease_entry->lease_id),
+                     lease_entry->lease_type_cnt[GF_RD_LEASE],
+                     lease_entry->lease_type_cnt[GF_RW_LEASE],
+                     lease_entry->lease_type, lease_entry->lease_cnt);
+    }
+out:
+    return;
+}
+
+/*
+ * Allocate a lease inode context and attach it to @inode for @this.
+ *
+ * Returns 0 when a context is already present (nothing is allocated) or
+ * when the new context was stored; otherwise the failing call's non-zero
+ * result (or -1 for a NULL argument) is returned.
+ */
+static int
+__lease_ctx_set(inode_t *inode, xlator_t *this)
+{
+    lease_inode_ctx_t *inode_ctx = NULL;
+    int ret = -1;
+    uint64_t ctx = 0;
+
+    GF_VALIDATE_OR_GOTO("leases", inode, out);
+    GF_VALIDATE_OR_GOTO("leases", this, out);
+
+    /* A zero return means the inode already carries a context.
+     * NOTE(review): the message below says the get "failed" although this
+     * branch is taken when it succeeded - confirm the intended wording. */
+    ret = __inode_ctx_get(inode, this, &ctx);
+    if (!ret) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, LEASE_MSG_INVAL_INODE_CTX,
+               "inode_ctx_get failed");
+        goto out;
+    }
+
+    inode_ctx = GF_CALLOC(1, sizeof(*inode_ctx),
+                          gf_leases_mt_lease_inode_ctx_t);
+    GF_CHECK_ALLOC(inode_ctx, ret, out);
+
+    pthread_mutex_init(&inode_ctx->lock, NULL);
+    INIT_LIST_HEAD(&inode_ctx->lease_id_list);
+    INIT_LIST_HEAD(&inode_ctx->blocked_list);
+
+    inode_ctx->lease_cnt = 0;
+
+    /* NOTE(review): the context pointer itself is cast to `uint64_t *`
+     * here instead of passing the address of a uint64_t that holds it -
+     * confirm this matches what __inode_ctx_set() expects. */
+    ret = __inode_ctx_set(inode, this, (uint64_t *)inode_ctx);
+    if (ret) {
+        /* Release the mutex before the memory that holds it is freed. */
+        pthread_mutex_destroy(&inode_ctx->lock);
+        GF_FREE(inode_ctx);
+        gf_msg(this->name, GF_LOG_INFO, 0, LEASE_MSG_INVAL_INODE_CTX,
+               "failed to set inode ctx (%p)", inode);
+    }
+out:
+    return ret;
+}
+
+/*
+ * Return the lease context attached to @inode, creating it through
+ * __lease_ctx_set() when the inode has none yet. Returns NULL on invalid
+ * arguments or when the context can neither be created nor read back.
+ */
+static lease_inode_ctx_t *
+__lease_ctx_get(inode_t *inode, xlator_t *this)
+{
+    lease_inode_ctx_t *inode_ctx = NULL;
+    uint64_t ctx = 0;
+
+    GF_VALIDATE_OR_GOTO("leases", inode, out);
+    GF_VALIDATE_OR_GOTO("leases", this, out);
+
+    if (__inode_ctx_get(inode, this, &ctx) < 0) {
+        /* No context yet: create one, then read it back. */
+        if (__lease_ctx_set(inode, this) < 0)
+            goto out;
+
+        if (__inode_ctx_get(inode, this, &ctx) < 0) {
+            gf_msg(this->name, GF_LOG_WARNING, 0, LEASE_MSG_INVAL_INODE_CTX,
+                   "failed to get inode ctx (%p)", inode);
+            goto out;
+        }
+    }
+
+    inode_ctx = (lease_inode_ctx_t *)(long)ctx;
+out:
+    return inode_ctx;
+}
+
+/*
+ * Locked wrapper around __lease_ctx_get(): takes inode->lock for the
+ * duration of the lookup/creation. Returns NULL on invalid arguments or
+ * when no context could be obtained.
+ */
+lease_inode_ctx_t *
+lease_ctx_get(inode_t *inode, xlator_t *this)
+{
+    lease_inode_ctx_t *lease_ctx = NULL;
+
+    GF_VALIDATE_OR_GOTO("leases", inode, out);
+    GF_VALIDATE_OR_GOTO("leases", this, out);
+
+    LOCK(&inode->lock);
+    lease_ctx = __lease_ctx_get(inode, this);
+    UNLOCK(&inode->lock);
+out:
+    return lease_ctx;
+}
+
+/*
+ * Allocate a lease ID entry for @lease_id on behalf of the client that
+ * owns @frame. The entry starts with no lease (type NONE, count 0), takes
+ * the translator's recall timeout and owns a private copy of the client
+ * uid. Returns NULL on invalid arguments or allocation failure.
+ */
+static lease_id_entry_t *
+new_lease_id_entry(call_frame_t *frame, const char *lease_id)
+{
+    lease_id_entry_t *entry = NULL;
+
+    GF_VALIDATE_OR_GOTO("leases", frame, out);
+    GF_VALIDATE_OR_GOTO("leases", lease_id, out);
+
+    entry = GF_CALLOC(1, sizeof(*entry), gf_leases_mt_lease_id_entry_t);
+    if (entry == NULL) {
+        gf_msg(frame->this->name, GF_LOG_ERROR, ENOMEM, LEASE_MSG_NO_MEM,
+               "Memory allocation for lease_entry failed");
+        goto out;
+    }
+
+    INIT_LIST_HEAD(&entry->lease_id_list);
+    entry->lease_type = NONE;
+    entry->lease_cnt = 0;
+    entry->recall_time = get_recall_lease_timeout(frame->this);
+
+    entry->client_uid = gf_strdup(frame->root->client->client_uid);
+    if (entry->client_uid == NULL) {
+        gf_msg(frame->this->name, GF_LOG_ERROR, ENOMEM, LEASE_MSG_NO_MEM,
+               "Memory allocation for client_uid failed");
+        GF_FREE(entry);
+        entry = NULL;
+        goto out;
+    }
+
+    memcpy(entry->lease_id, lease_id, LEASE_ID_SIZE);
+out:
+    return entry;
+}
+
+/*
+ * Unlink @lease_entry from the list it is on and free it together with
+ * the client uid string it owns. A NULL entry is ignored.
+ */
+static void
+__destroy_lease_id_entry(lease_id_entry_t *lease_entry)
+{
+    GF_VALIDATE_OR_GOTO("leases", lease_entry, out);
+
+    /* Unlink first so no list node points at freed memory. */
+    list_del_init(&lease_entry->lease_id_list);
+    GF_FREE(lease_entry->client_uid);
+    GF_FREE(lease_entry);
+out:
+    return;
+}
+
+/*
+ * Compare two lease IDs; only the first strlen(@k1) bytes are compared.
+ * NOTE(review): lease IDs are copied elsewhere in this file with
+ * memcpy(..., LEASE_ID_SIZE), i.e. as fixed-size buffers. Bounding the
+ * compare by strlen(k1) makes an ID whose first byte is 0 match every
+ * other ID - confirm whether comparing LEASE_ID_SIZE bytes is intended.
+ */
+static inline gf_boolean_t
+__is_same_lease_id(const char *k1, const char *k2)
+{
+    return (memcmp(k1, k2, strlen(k1)) == 0) ? _gf_true : _gf_false;
+}
+
+/* Tells whether any lease ID other than @lease_id currently holds at
+ * least one lease on the inode described by @lease_ctx.
+ * Returns _gf_false on NULL arguments.
+ */
+static gf_boolean_t
+__another_lease_found(lease_inode_ctx_t *lease_ctx, const char *lease_id)
+{
+    lease_id_entry_t *entry = NULL;
+    lease_id_entry_t *next = NULL;
+    gf_boolean_t found_lease = _gf_false;
+
+    GF_VALIDATE_OR_GOTO("leases", lease_id, out);
+    GF_VALIDATE_OR_GOTO("leases", lease_ctx, out);
+
+    list_for_each_entry_safe(entry, next, &lease_ctx->lease_id_list,
+                             lease_id_list)
+    {
+        if (!__is_same_lease_id(lease_id, entry->lease_id) &&
+            (entry->lease_cnt > 0)) {
+            found_lease = _gf_true;
+            break;
+        }
+    }
+out:
+    return found_lease;
+}
+
+/* Look up the entry registered for @lease_id on the inode described by
+ * @lease_ctx.
+ * Return values:
+ * NULL - no matching entry (or a NULL argument)
+ * lease_id_entry_t* - the first entry whose ID matches
+ */
+static lease_id_entry_t *
+__get_lease_id_entry(lease_inode_ctx_t *lease_ctx, const char *lease_id)
+{
+    lease_id_entry_t *entry = NULL;
+    lease_id_entry_t *next = NULL;
+    lease_id_entry_t *match = NULL;
+
+    GF_VALIDATE_OR_GOTO("leases", lease_id, out);
+    GF_VALIDATE_OR_GOTO("leases", lease_ctx, out);
+
+    list_for_each_entry_safe(entry, next, &lease_ctx->lease_id_list,
+                             lease_id_list)
+    {
+        if (!__is_same_lease_id(lease_id, entry->lease_id))
+            continue;
+
+        match = entry;
+        gf_msg_debug("leases", 0,
+                     "lease ID entry found "
+                     "Client UID:%s, lease id:%s",
+                     entry->client_uid, leaseid_utoa(entry->lease_id));
+        break;
+    }
+out:
+    return match;
+}
+
+/* Find the entry for @lease_id on @lease_ctx, creating and appending a
+ * fresh one when none exists yet.
+ * Return values:
+ * lease_id_entry_t* - the existing or newly created entry
+ * NULL - invalid argument or allocation failure
+ */
+static lease_id_entry_t *
+__get_or_new_lease_entry(call_frame_t *frame, const char *lease_id,
+                         lease_inode_ctx_t *lease_ctx)
+{
+    lease_id_entry_t *entry = NULL;
+
+    GF_VALIDATE_OR_GOTO("leases", frame, out);
+    GF_VALIDATE_OR_GOTO("leases", lease_id, out);
+    GF_VALIDATE_OR_GOTO("leases", lease_ctx, out);
+
+    entry = __get_lease_id_entry(lease_ctx, lease_id);
+    if (entry)
+        goto out;
+
+    /* Nothing registered for this ID yet: create one. */
+    entry = new_lease_id_entry(frame, lease_id);
+    if (!entry)
+        goto out;
+
+    list_add_tail(&entry->lease_id_list, &lease_ctx->lease_id_list);
+
+    gf_msg_debug(frame->this->name, 0,
+                 "lease ID entry added,"
+                 " Client UID:%s, lease id:%s",
+                 entry->client_uid, leaseid_utoa(entry->lease_id));
+out:
+    return entry;
+}
+
+/*
+ * Allocate a list node that holds a new reference on @inode.
+ * Returns NULL when the allocation fails.
+ */
+static lease_inode_t *
+new_lease_inode(inode_t *inode)
+{
+    lease_inode_t *l_inode = NULL;
+
+    l_inode = GF_MALLOC(sizeof(*l_inode), gf_leases_mt_lease_inode_t);
+    if (l_inode) {
+        INIT_LIST_HEAD(&l_inode->list);
+        l_inode->inode = inode_ref(inode);
+    }
+
+    return l_inode;
+}
+
+/*
+ * Unlink @l_inode from its list, drop the inode reference taken in
+ * new_lease_inode() and free the node.
+ */
+static void
+__destroy_lease_inode(lease_inode_t *l_inode)
+{
+    list_del_init(&l_inode->list);
+    inode_unref(l_inode->inode);
+    GF_FREE(l_inode);
+}
+
+/*
+ * Allocate a client entry for the cleanup list, owning a private copy of
+ * @client_uid and with both of its list heads initialised.
+ * Returns NULL when either allocation fails.
+ */
+static lease_client_t *
+new_lease_client(const char *client_uid)
+{
+    lease_client_t *clnt = GF_MALLOC(sizeof(*clnt),
+                                     gf_leases_mt_lease_client_t);
+    if (!clnt)
+        goto out;
+
+    INIT_LIST_HEAD(&clnt->client_list);
+    INIT_LIST_HEAD(&clnt->inode_list);
+    clnt->client_uid = gf_strdup(client_uid);
+    if (!clnt->client_uid) {
+        /* Never hand out an entry without a uid: lookups strcmp() it. */
+        GF_FREE(clnt);
+        clnt = NULL;
+    }
+out:
+    return clnt;
+}
+
+/*
+ * Unlink @clnt from the lists it is on and free it together with the
+ * client uid string duplicated in new_lease_client().
+ */
+static void
+__destroy_lease_client(lease_client_t *clnt)
+{
+    list_del_init(&clnt->inode_list);
+    list_del_init(&clnt->client_list);
+    /* The uid was gf_strdup()'ed at creation; release it with the entry. */
+    GF_FREE(clnt->client_uid);
+    GF_FREE(clnt);
+
+    return;
+}
+
+/*
+ * Search priv->client_list for the entry whose uid equals @client_uid.
+ * Returns the entry, or NULL when the client is not on the list.
+ */
+static lease_client_t *
+__get_lease_client(xlator_t *this, leases_private_t *priv,
+                   const char *client_uid)
+{
+    lease_client_t *clnt = NULL;
+    lease_client_t *next = NULL;
+
+    list_for_each_entry_safe(clnt, next, &priv->client_list, client_list)
+    {
+        if (strcmp(clnt->client_uid, client_uid) != 0)
+            continue;
+
+        gf_msg_debug(this->name, 0,
+                     "Client:%s already found "
+                     "in the cleanup list",
+                     client_uid);
+        return clnt;
+    }
+
+    return NULL;
+}
+
+/*
+ * Return the cleanup-list entry for @client_uid, creating and appending
+ * one to priv->client_list when it does not exist yet.
+ * Returns NULL when a new entry cannot be allocated.
+ */
+static lease_client_t *
+__get_or_new_lease_client(xlator_t *this, leases_private_t *priv,
+                          const char *client_uid)
+{
+    lease_client_t *clnt = __get_lease_client(this, priv, client_uid);
+
+    if (clnt)
+        return clnt;
+
+    clnt = new_lease_client(client_uid);
+    if (!clnt)
+        return NULL;
+
+    list_add_tail(&clnt->client_list, &priv->client_list);
+    gf_msg_debug(this->name, 0,
+                 "Adding a new client:%s entry "
+                 "to the cleanup list",
+                 client_uid);
+    return clnt;
+}
+
+/*
+ * Record @inode on the cleanup list of the client identified by
+ * @client_uid, creating the client entry if needed. The list node holds
+ * its own reference on the inode. priv->mutex is held while the lists
+ * are modified.
+ * Return values:
+ * 0 Success
+ * -ENOMEM when the list node or the client entry cannot be allocated
+ */
+static int
+add_inode_to_client_list(xlator_t *this, inode_t *inode, const char *client_uid)
+{
+    leases_private_t *priv = this->private;
+    lease_client_t *clnt = NULL;
+
+    lease_inode_t *lease_inode = new_lease_inode(inode);
+    if (!lease_inode)
+        return -ENOMEM;
+
+    pthread_mutex_lock(&priv->mutex);
+    {
+        clnt = __get_or_new_lease_client(this, priv, client_uid);
+        if (!clnt) {
+            pthread_mutex_unlock(&priv->mutex);
+            __destroy_lease_inode(lease_inode);
+            return -ENOMEM;
+        }
+        /* list_add_tail(new, head): the new node goes onto the client's
+         * list. With the arguments swapped the list head is re-linked
+         * into the new node's singleton list and every previously added
+         * inode becomes unreachable from clnt->inode_list. */
+        list_add_tail(&lease_inode->list, &clnt->inode_list);
+    }
+    pthread_mutex_unlock(&priv->mutex);
+    gf_msg_debug(this->name, 0,
+                 "Added a new inode:%p to the client(%s) "
+                 "cleanup list, gfid(%s)",
+                 inode, client_uid, uuid_utoa(inode->gfid));
+    return 0;
+}
+
+/* Add lease entry to the corresponding client entry.
+ * Finds or creates the entry for lease->lease_id on @lease_ctx, bumps the
+ * per-entry and per-inode lease counters and type masks, and registers
+ * the inode on the client's cleanup list when this is the entry's first
+ * lease.
+ * Return values:
+ * 0 Success
+ * -1 Failure (NULL argument, or no lease entry could be obtained, in
+ *    which case errno is set to ENOMEM)
+ */
+static int
+__add_lease(call_frame_t *frame, inode_t *inode, lease_inode_ctx_t *lease_ctx,
+            const char *client_uid, struct gf_lease *lease)
+{
+    lease_id_entry_t *lease_entry = NULL;
+    int ret = -1;
+
+    GF_VALIDATE_OR_GOTO("leases", frame, out);
+    GF_VALIDATE_OR_GOTO("leases", client_uid, out);
+    GF_VALIDATE_OR_GOTO("leases", lease_ctx, out);
+    GF_VALIDATE_OR_GOTO("leases", inode, out);
+    GF_VALIDATE_OR_GOTO("leases", lease, out);
+
+    gf_msg_trace(frame->this->name, 0,
+                 "Granting lease lock to client %s with lease id %s"
+                 " on gfid(%s)",
+                 client_uid, leaseid_utoa(lease->lease_id),
+                 uuid_utoa(inode->gfid));
+
+    lease_entry = __get_or_new_lease_entry(frame, lease->lease_id, lease_ctx);
+    if (!lease_entry) {
+        errno = ENOMEM;
+        goto out;
+    }
+
+    /* Per-lease-ID accounting. */
+    lease_entry->lease_type_cnt[lease->lease_type]++;
+    lease_entry->lease_cnt++;
+    lease_entry->lease_type |= lease->lease_type;
+    /* If this is the first lease taken by the client on the file, then
+     * add this inode/file to the client disconnect cleanup list
+     */
+    if (lease_entry->lease_cnt == 1) {
+        /* NOTE(review): the return value (-ENOMEM on failure) is ignored
+         * here, so the lease is still granted when the inode could not be
+         * put on the cleanup list - confirm this is intended. */
+        add_inode_to_client_list(frame->this, inode, client_uid);
+    }
+
+    /* Per-inode accounting. */
+    lease_ctx->lease_cnt++;
+    lease_ctx->lease_type_cnt[lease->lease_type]++;
+    lease_ctx->lease_type |= lease->lease_type;
+
+    /* Take a ref for the first lock taken on this inode. Corresponding
+     * unref when all the leases are unlocked or during DISCONNECT
+     * Ref is required because the inode on which lease is acquired should
+     * not be deleted when lru cleanup kicks in*/
+    if (lease_ctx->lease_cnt == 1) {
+        lease_ctx->inode = inode_ref(inode);
+    }
+
+    ret = 0;
+out:
+    return ret;
+}
+
+/*
+ * Tell whether the client @client_uid holds no lease at all on the inode
+ * described by @lease_ctx, i.e. none of its lease ID entries has a
+ * non-zero lease count.
+ */
+static gf_boolean_t
+__is_clnt_lease_none(const char *client_uid, lease_inode_ctx_t *lease_ctx)
+{
+    lease_id_entry_t *entry = NULL;
+    lease_id_entry_t *next = NULL;
+
+    list_for_each_entry_safe(entry, next, &lease_ctx->lease_id_list,
+                             lease_id_list)
+    {
+        if (strcmp(client_uid, entry->client_uid) != 0)
+            continue;
+
+        if (entry->lease_cnt != 0)
+            return _gf_false;
+    }
+
+    return _gf_true;
+}
+
+/*
+ * Remove every node referring to @inode from the cleanup list of @clnt.
+ * Returns 0 when at least one node was removed, -1 otherwise.
+ */
+static int
+__remove_inode_from_clnt_list(xlator_t *this, lease_client_t *clnt,
+                              inode_t *inode)
+{
+    lease_inode_t *node = NULL;
+    lease_inode_t *next = NULL;
+    int ret = -1;
+
+    list_for_each_entry_safe(node, next, &clnt->inode_list, list)
+    {
+        if (node->inode != inode)
+            continue;
+
+        __destroy_lease_inode(node);
+        gf_msg_debug(this->name, 0,
+                     "Removed the inode from the client cleanup list");
+        ret = 0;
+    }
+    /* TODO: Remove the client entry from the cleanup list */
+
+    return ret;
+}
+
+/*
+ * Take @inode off the cleanup list of the client @client_uid while
+ * holding priv->mutex. Failures are logged after the mutex is released.
+ * Returns 0 on success, -1 when the private data, the client entry or
+ * the inode entry is missing.
+ */
+static int
+remove_from_clnt_list(xlator_t *this, const char *client_uid, inode_t *inode)
+{
+    leases_private_t *priv = this->private;
+    lease_client_t *clnt = NULL;
+    int ret = -1;
+
+    if (!priv)
+        return ret;
+
+    pthread_mutex_lock(&priv->mutex);
+    clnt = __get_lease_client(this, priv, client_uid);
+    if (clnt)
+        ret = __remove_inode_from_clnt_list(this, clnt, inode);
+    pthread_mutex_unlock(&priv->mutex);
+
+    if (!clnt)
+        gf_msg(this->name, GF_LOG_ERROR, 0, LEASE_MSG_CLNT_NOTFOUND,
+               "There is no client entry found in the cleanup list");
+    else if (ret)
+        gf_msg(this->name, GF_LOG_ERROR, 0, LEASE_MSG_INODE_NOTFOUND,
+               "There is no inode entry found in the cleanup list");
+
+    return ret;
+}
+
+/* Remove lease entry in the corresponding client entry.
+ * Decrements the per-entry and per-inode counters for lease->lease_type,
+ * clears the type bit once its counter reaches zero, destroys the lease
+ * ID entry when its last lease is gone and deletes the recall timer when
+ * no lease is left on the inode.
+ * Return values:
+ * 0 on success, and also when no entry exists for the lease ID (treated
+ *   as a no-op) or when a required argument is NULL
+ * -EINVAL when the entry does not hold a lease of the requested type
+ *   (errno is set to EINVAL as well)
+ * the result of gf_tw_del_timer() when the recall timer is deleted
+ */
+static int
+__remove_lease(xlator_t *this, inode_t *inode, lease_inode_ctx_t *lease_ctx,
+               const char *client_uid, struct gf_lease *lease)
+{
+    lease_id_entry_t *lease_entry = NULL;
+    int ret = 0;
+    int32_t lease_type = 0;
+    leases_private_t *priv = NULL;
+
+    GF_VALIDATE_OR_GOTO("leases", lease_ctx, out);
+    GF_VALIDATE_OR_GOTO("leases", lease, out);
+
+    priv = this->private;
+
+    gf_msg_trace(this->name, 0,
+                 "Removing lease entry for client: %s, "
+                 "lease type:%d, lease id:%s",
+                 client_uid, lease->lease_type, leaseid_utoa(lease->lease_id));
+
+    /* There could be a race where in server recalled the lease and by the time
+     * client sends lease_unlock request, server may have revoked it. To handle
+     * such cases, if lease doesnt exist treat it as noop and return success.
+     */
+    lease_entry = __get_lease_id_entry(lease_ctx, lease->lease_id);
+    if (!lease_entry) {
+        gf_msg(this->name, GF_LOG_INFO, 0, LEASE_MSG_INVAL_UNLK_LEASE,
+               "Got unlock lease request from client:%s, but has no "
+               "corresponding lock",
+               client_uid);
+        ret = 0;
+        goto out;
+    }
+
+    /* The entry must actually hold a lease of the type being released. */
+    if (!(lease_entry->lease_type & lease->lease_type)) {
+        gf_msg(this->name, GF_LOG_INFO, 0, LEASE_MSG_INVAL_UNLK_LEASE,
+               "Got unlock lease request from client:%s for an invalid "
+               "lease_type",
+               client_uid);
+        ret = -EINVAL;
+        errno = EINVAL;
+        goto out;
+    }
+    lease_type = lease->lease_type;
+    lease_entry->lease_type_cnt[lease_type]--;
+    lease_entry->lease_cnt--;
+
+    lease_ctx->lease_type_cnt[lease_type]--;
+    lease_ctx->lease_cnt--;
+
+    /* Clear the type bit once the last lease of that type is gone. */
+    if (lease_entry->lease_type_cnt[lease_type] == 0)
+        lease_entry->lease_type = lease_entry->lease_type & (~lease_type);
+
+    if (lease_ctx->lease_type_cnt[lease_type] == 0)
+        lease_ctx->lease_type = lease_ctx->lease_type & (~lease_type);
+
+    if (lease_entry->lease_cnt == 0) {
+        if (__is_clnt_lease_none(client_uid, lease_ctx)) {
+            gf_msg_trace(this->name, 0,
+                         "Client(%s) has no leases"
+                         " on gfid (%s), hence removing the inode"
+                         " from the client cleanup list",
+                         client_uid, uuid_utoa(inode->gfid));
+            /* NOTE(review): the inode saved in lease_ctx is used here
+             * while the trace above uses the @inode argument - presumably
+             * the same inode; confirm. */
+            remove_from_clnt_list(this, client_uid, lease_ctx->inode);
+        }
+        __destroy_lease_id_entry(lease_entry);
+        lease_ctx->blocked_fops_resuming = _gf_true;
+    }
+
+    /* No lease left on the inode: a pending recall timer is obsolete. */
+    if (lease_ctx->lease_cnt == 0 && lease_ctx->timer) {
+        ret = gf_tw_del_timer(priv->timer_wheel, lease_ctx->timer);
+        lease_ctx->recall_in_progress = _gf_false;
+        lease_ctx->timer = NULL;
+    }
+out:
+    return ret;
+}
+
+/* Decide whether the lease described by @lease can be granted on @inode.
+ *
+ * The request is refused outright while a recall is in progress or while
+ * previously blocked fops are being resumed. Otherwise the fds open on the
+ * inode (those that carry a client_uid and a different lease id) and the
+ * lease type already recorded in @lease_ctx are checked for conflicts.
+ *
+ * Returns _gf_true when the lease may be granted, _gf_false otherwise
+ * (including invalid arguments and unknown lease types).
+ */
+static gf_boolean_t
+__is_lease_grantable(xlator_t *this, lease_inode_ctx_t *lease_ctx,
+                     struct gf_lease *lease, inode_t *inode)
+{
+    gf_boolean_t grant = _gf_false;
+    gf_boolean_t ctx_missing = _gf_false;
+    lease_fd_ctx_t *fd_ctx = NULL;
+    fd_t *iter_fd = NULL;
+    uint64_t ctx = 0;
+    uint32_t fd_count = 0;
+    int32_t flags = 0;
+
+    GF_VALIDATE_OR_GOTO("leases", lease_ctx, out);
+    GF_VALIDATE_OR_GOTO("leases", lease, out);
+    GF_VALIDATE_OR_GOTO("leases", inode, out);
+
+    if (lease_ctx->recall_in_progress) {
+        gf_msg_debug(this->name, 0,
+                     "Recall in progress, hence "
+                     "failing the lease request");
+        goto out;
+    }
+
+    if (lease_ctx->blocked_fops_resuming) {
+        gf_msg_debug(this->name, 0,
+                     "Previously blocked fops resuming, hence "
+                     "failing the lease request");
+        goto out;
+    }
+
+    /* Collect the number and the combined open flags of the fds that do
+     * not belong to the requesting lease id. */
+    LOCK(&inode->lock);
+    {
+        list_for_each_entry(iter_fd, &inode->fd_list, inode_list)
+        {
+            if (fd_ctx_get(iter_fd, this, &ctx) < 0) {
+                ctx_missing = _gf_true;
+                break;
+            }
+            fd_ctx = (lease_fd_ctx_t *)(long)ctx;
+
+            /* fds without a client_uid, and fds opened with the same
+             * lease id, never conflict: leases are lease-id based. */
+            if (fd_ctx->client_uid == NULL ||
+                __is_same_lease_id(fd_ctx->lease_id, lease->lease_id))
+                continue;
+
+            fd_count++;
+            flags |= iter_fd->flags;
+        }
+    }
+    UNLOCK(&inode->lock);
+
+    if (ctx_missing) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, LEASE_MSG_INVAL_FD_CTX,
+               "Unable to get fd ctx");
+        goto out;
+    }
+
+    gf_msg_debug(this->name, 0, "open fd count:%d flags:%d", fd_count, flags);
+
+    __dump_leases_info(this, lease_ctx);
+
+    switch (lease->lease_type) {
+        case GF_RD_LEASE:
+            /* a foreign fd opened for writing rules out a read lease */
+            if ((fd_count > 0) && (flags & (O_WRONLY | O_RDWR)))
+                break;
+
+            /* no lease, only read leases, or only our own lease: grant */
+            if ((lease_ctx->lease_type == NONE) ||
+                (lease_ctx->lease_type == GF_RD_LEASE) ||
+                !__another_lease_found(lease_ctx, lease->lease_id))
+                grant = _gf_true;
+            break;
+
+        case GF_RW_LEASE:
+            /* any fd opened outside the requesting lease id conflicts */
+            if (fd_count > 0)
+                break;
+
+            /* no lease at all, or only our own lease: grant */
+            if ((lease_ctx->lease_type == NONE) ||
+                !__another_lease_found(lease_ctx, lease->lease_id))
+                grant = _gf_true;
+            break;
+
+        default:
+            gf_msg(this->name, GF_LOG_ERROR, EINVAL, LEASE_MSG_INVAL_LEASE_TYPE,
+                   "Invalid lease type specified");
+            break;
+    }
+out:
+    return grant;
+}
+
+/* Resume every fop parked on @lease_ctx->blocked_list.
+ *
+ * Does nothing unless lease_ctx->blocked_fops_resuming is set. The stubs are
+ * moved to a private list under lease_ctx->lock and resumed without the lock
+ * held; afterwards the lease type is reset to NONE, the resuming flag is
+ * cleared and the inode reference held by the lease ctx is dropped.
+ */
+static void
+do_blocked_fops(xlator_t *this, lease_inode_ctx_t *lease_ctx)
+{
+    struct list_head wind_list;
+    fop_stub_t *stub_entry = NULL;
+    fop_stub_t *next = NULL;
+    gf_boolean_t resuming = _gf_false;
+
+    INIT_LIST_HEAD(&wind_list);
+
+    pthread_mutex_lock(&lease_ctx->lock);
+    {
+        /* blocked_fops_resuming is set only when the last lease is
+         * released; that is the moment to resume the blocked fops and
+         * unref the inode taken in __add_lease (when lease_cnt == 1). */
+        resuming = lease_ctx->blocked_fops_resuming;
+        if (resuming) {
+            list_for_each_entry_safe(stub_entry, next,
+                                     &lease_ctx->blocked_list, list)
+            {
+                list_del_init(&stub_entry->list);
+                list_add_tail(&stub_entry->list, &wind_list);
+            }
+        }
+    }
+    pthread_mutex_unlock(&lease_ctx->lock);
+
+    if (!resuming)
+        return;
+
+    gf_msg_trace(this->name, 0, "Executing the blocked stubs on gfid(%s)",
+                 uuid_utoa(lease_ctx->inode->gfid));
+    list_for_each_entry_safe(stub_entry, next, &wind_list, list)
+    {
+        list_del_init(&stub_entry->list);
+        gf_msg_trace(this->name, 0, "Executing fop:%d", stub_entry->stub->fop);
+        call_resume(stub_entry->stub);
+        GF_FREE(stub_entry);
+    }
+
+    pthread_mutex_lock(&lease_ctx->lock);
+    {
+        lease_ctx->lease_type = NONE;
+        lease_ctx->blocked_fops_resuming = _gf_false;
+        /* drop the inode ref taken in __add_lease (when lease_cnt == 1) */
+        inode_unref(lease_ctx->inode);
+        lease_ctx->inode = NULL;
+    }
+    pthread_mutex_unlock(&lease_ctx->lock);
+}
+
+/* Timer-wheel callback fired when a lease recall has timed out.
+ *
+ * @timer:    the timer allocated in __recall_lease; freed here.
+ * @data:     the lease_timer_data_t allocated in __recall_lease; it carries
+ *            the xlator and a referenced inode, and is freed here.
+ * @calltime: unused.
+ *
+ * Queues the inode on priv->recall_list and wakes the thread waiting on
+ * priv->cond, then releases everything the timer owned.
+ */
+void
+recall_lease_timer_handler(struct gf_tw_timer_list *timer, void *data,
+                           unsigned long calltime)
+{
+    inode_t *inode = NULL;
+    lease_inode_t *lease_inode = NULL;
+    leases_private_t *priv = NULL;
+    lease_timer_data_t *timer_data = NULL;
+
+    timer_data = data;
+
+    priv = timer_data->this->private;
+    inode = timer_data->inode;
+    lease_inode = new_lease_inode(inode);
+    if (!lease_inode) {
+        errno = ENOMEM;
+        goto out;
+    }
+    pthread_mutex_lock(&priv->mutex);
+    {
+        list_add_tail(&lease_inode->list, &priv->recall_list);
+        pthread_cond_broadcast(&priv->cond);
+    }
+    pthread_mutex_unlock(&priv->mutex);
+out:
+    /* unref the inode_ref taken by timer_data in __recall_lease */
+    inode_unref(inode);
+
+    /* timer_data was allocated together with the timer in __recall_lease
+     * and nothing else references it once the timer has fired; free it
+     * here so it does not leak on every expired recall. */
+    GF_FREE(timer_data);
+    GF_FREE(timer);
+}
+
+/* Send a lease recall upcall to every client holding a lease recorded in
+ * @lease_ctx and arm a timer that fires recall_lease_timer_handler if the
+ * leases are not released in time.
+ *
+ * No-op when a recall is already in progress or when no lease is held.
+ * If the timer cannot be allocated, the recall-in-progress state is rolled
+ * back so that a later conflict retries the recall instead of leaving the
+ * inode stuck in "recall in progress" with nothing to end it.
+ *
+ * The caller is expected to hold lease_ctx->lock (all callers visible in
+ * this file do).
+ */
+static void
+__recall_lease(xlator_t *this, lease_inode_ctx_t *lease_ctx)
+{
+    lease_id_entry_t *lease_entry = NULL;
+    lease_id_entry_t *tmp = NULL;
+    struct gf_upcall up_req = {
+        0,
+    };
+    struct gf_upcall_recall_lease recall_req = {
+        0,
+    };
+    int notify_ret = -1;
+    struct gf_tw_timer_list *timer = NULL;
+    leases_private_t *priv = NULL;
+    lease_timer_data_t *timer_data = NULL;
+    time_t recall_time;
+
+    if (lease_ctx->recall_in_progress) {
+        gf_msg_debug(this->name, 0,
+                     "Lease recall is already in "
+                     "progress, hence not sending another recall");
+        goto out;
+    }
+
+    /* Nothing to recall: without this check a timer would be armed on
+     * lease_ctx->inode, which do_blocked_fops resets to NULL once the last
+     * lease is gone. */
+    if (list_empty(&lease_ctx->lease_id_list)) {
+        gf_msg_debug(this->name, 0,
+                     "No leases held, hence not sending a recall");
+        goto out;
+    }
+
+    priv = this->private;
+    recall_time = gf_time();
+    list_for_each_entry_safe(lease_entry, tmp, &lease_ctx->lease_id_list,
+                             lease_id_list)
+    {
+        gf_uuid_copy(up_req.gfid, lease_ctx->inode->gfid);
+        up_req.client_uid = lease_entry->client_uid;
+        up_req.event_type = GF_UPCALL_RECALL_LEASE;
+        up_req.data = &recall_req;
+
+        notify_ret = this->notify(this, GF_EVENT_UPCALL, &up_req);
+        if (notify_ret < 0) {
+            gf_msg(this->name, GF_LOG_ERROR, 0, LEASE_MSG_RECALL_FAIL,
+                   "Recall notification to client: %s failed",
+                   lease_entry->client_uid);
+            /* Do not return from here, continue registering the timer,
+               this is required mostly to keep replicas in sync */
+        } else {
+            gf_msg_debug(this->name, 0,
+                         "Recall lease (all)"
+                         "notification sent to client %s",
+                         lease_entry->client_uid);
+        }
+
+        lease_ctx->recall_in_progress = _gf_true;
+        lease_entry->recall_time = recall_time;
+    }
+
+    timer = GF_MALLOC(sizeof(*timer), gf_common_mt_tw_timer_list);
+    if (timer)
+        timer_data = GF_MALLOC(sizeof(lease_timer_data_t),
+                               gf_leases_mt_timer_data_t);
+    if (!timer || !timer_data) {
+        gf_msg(this->name, GF_LOG_WARNING, ENOMEM, LEASE_MSG_NO_MEM,
+               "Unable to allocate the recall timer");
+        GF_FREE(timer);
+        /* With no timer armed nothing would ever clear the flag again
+         * (__remove_lease only resets it when a timer exists), so undo it
+         * and let the next conflict retry the recall. */
+        lease_ctx->recall_in_progress = _gf_false;
+        goto out;
+    }
+
+    /* this reference is dropped by recall_lease_timer_handler */
+    timer_data->inode = inode_ref(lease_ctx->inode);
+    timer_data->this = this;
+    timer->data = timer_data;
+
+    INIT_LIST_HEAD(&timer->entry);
+    timer->expires = get_recall_lease_timeout(this);
+    timer->function = recall_lease_timer_handler;
+    lease_ctx->timer = timer;
+    gf_tw_add_timer(priv->timer_wheel, timer);
+    gf_msg_trace(this->name, 0,
+                 "Registering timer "
+                 "%p, after "
+                 "sending recall",
+                 timer);
+out:
+    return;
+}
+
+/* Handle a lease request (get / set / unlock) from a client on @inode.
+ *
+ * ret = 0; STACK_UNWIND Success
+ * ret < 0; STACK_UNWIND failure: a negative errno (-EINVAL for invalid
+ *          arguments, lease id or command, -ENOMEM when the inode ctx is
+ *          unavailable), -1 when a GF_SET_LEASE conflicts and a recall was
+ *          triggered instead, or whatever negative value __remove_lease
+ *          returns for GF_UNLK_LEASE.
+ *
+ * For GF_GET_LEASE the current lease type is written into lease->lease_type.
+ * When an unlock releases the last lease, the blocked fops are resumed
+ * before returning.
+ */
+int
+process_lease_req(call_frame_t *frame, xlator_t *this, inode_t *inode,
+                  struct gf_lease *lease)
+{
+    /* Start from failure so that a rejected argument below is reported as
+     * an error instead of being returned as success. */
+    int ret = -EINVAL;
+    char *client_uid = NULL;
+    lease_inode_ctx_t *lease_ctx = NULL;
+
+    GF_VALIDATE_OR_GOTO("leases", frame, out);
+    GF_VALIDATE_OR_GOTO("leases", this, out);
+    GF_VALIDATE_OR_GOTO("leases", inode, out);
+    GF_VALIDATE_OR_GOTO("leases", lease, out);
+
+    client_uid = frame->root->client->client_uid;
+
+    if (!is_valid_lease_id(lease->lease_id)) {
+        gf_msg(this->name, GF_LOG_ERROR, EINVAL, LEASE_MSG_INVAL_LEASE_ID,
+               "Invalid lease id, from"
+               "client:%s",
+               client_uid);
+        ret = -EINVAL;
+        errno = EINVAL;
+        goto out;
+    }
+
+    lease_ctx = lease_ctx_get(inode, this);
+    if (!lease_ctx) {
+        gf_msg(this->name, GF_LOG_WARNING, ENOMEM, LEASE_MSG_NO_MEM,
+               "Unable to create/get inode ctx, "
+               "inode:%p",
+               inode);
+        ret = -ENOMEM;
+        errno = ENOMEM;
+        goto out;
+    }
+
+    gf_msg_debug(this->name, 0,
+                 "Lease request from client: %s, "
+                 "lease type:%d, lease cmd:%d, lease ID:%s, gfid:%s",
+                 client_uid, lease->lease_type, lease->cmd,
+                 leaseid_utoa(lease->lease_id), uuid_utoa(inode->gfid));
+
+    pthread_mutex_lock(&lease_ctx->lock);
+    {
+        switch (lease->cmd) {
+            case GF_GET_LEASE:
+                lease->lease_type = lease_ctx->lease_type;
+                gf_msg_debug(this->name, 0,
+                             "Get lease, existing lease"
+                             "type: %d",
+                             lease_ctx->lease_type);
+                /*TODO:Should it consider lease id or client_uid?*/
+                ret = 0;
+                break;
+
+            case GF_SET_LEASE:
+                if (__is_lease_grantable(this, lease_ctx, lease, inode)) {
+                    __add_lease(frame, inode, lease_ctx, client_uid, lease);
+                    ret = 0;
+                } else {
+                    /* second argument is an errnum, not a log level */
+                    gf_msg_debug(this->name, 0,
+                                 "Not granting the conflicting lease"
+                                 " request from %s on gfid(%s)",
+                                 client_uid, uuid_utoa(inode->gfid));
+                    __recall_lease(this, lease_ctx);
+                    ret = -1;
+                }
+                break;
+            case GF_UNLK_LEASE:
+                ret = __remove_lease(this, inode, lease_ctx, client_uid, lease);
+                if ((ret >= 0) && (lease_ctx->lease_cnt == 0)) {
+                    /* last lease gone: resume blocked fops unlocked */
+                    pthread_mutex_unlock(&lease_ctx->lock);
+                    goto unblock;
+                }
+                break;
+            default:
+                ret = -EINVAL;
+                break;
+        }
+    }
+    pthread_mutex_unlock(&lease_ctx->lock);
+
+    return ret;
+
+unblock:
+    do_blocked_fops(this, lease_ctx);
+out:
+    return ret;
+}
+
+/* Check whether the fop carried by @frame conflicts with the leases recorded
+ * in @lease_ctx, and trigger a lease recall when it does.
+ *
+ * Returns _gf_true on conflict, _gf_false otherwise (also when @frame or
+ * @lease_ctx is invalid).
+ */
+gf_boolean_t
+__check_lease_conflict(call_frame_t *frame, lease_inode_ctx_t *lease_ctx,
+                       const char *lease_id, gf_boolean_t is_write)
+{
+    gf_lease_types_t held_type = 0;
+    lease_id_entry_t *own_entry = NULL;
+    gf_boolean_t conflicts = _gf_false;
+
+    GF_VALIDATE_OR_GOTO("leases", frame, out);
+    GF_VALIDATE_OR_GOTO("leases", lease_ctx, out);
+
+    held_type = lease_ctx->lease_type;
+
+    if ((frame->root->op == GF_FOP_RENAME) ||
+        (frame->root->op == GF_FOP_UNLINK)) {
+        /* If the fop is rename or unlink conflict the lease even if its
+         * from the same client?? */
+        conflicts = _gf_true;
+    } else if (frame->root->pid < 0) {
+        /* Internal fops are used to maintain data integrity but do not
+         * make modifications to the client data, so they never conflict.
+         *
+         * @todo: like for locks, even lease state has to be handled by
+         * rebalance or self-heal daemon process. */
+        conflicts = _gf_false;
+    } else if (!lease_id && (lease_ctx->lease_cnt > 0)) {
+        /* No lease id sent while a lease exists: always a conflict. */
+        conflicts = _gf_true;
+    } else {
+        switch (held_type) {
+            case (GF_RW_LEASE | GF_RD_LEASE):
+            case GF_RW_LEASE:
+                /* only the holder of an RW lease may proceed */
+                own_entry = __get_lease_id_entry(lease_ctx, lease_id);
+                if (own_entry && (own_entry->lease_type & GF_RW_LEASE))
+                    conflicts = _gf_false;
+                else
+                    conflicts = _gf_true;
+                break;
+            case GF_RD_LEASE:
+                /* reads are fine; writes conflict with foreign leases */
+                if (is_write && __another_lease_found(lease_ctx, lease_id))
+                    conflicts = _gf_true;
+                else
+                    conflicts = _gf_false;
+                break;
+            default:
+                break;
+        }
+    }
+
+    /* If there is a conflict found and recall is not already sent to all
+     * the clients, then send recall to each of the client holding lease.
+     */
+    if (conflicts)
+        __recall_lease(frame->this, lease_ctx);
+out:
+    return conflicts;
+}
+
+/* Check the fop carried by @frame against the leases held on @inode.
+ *
+ * Return values:
+ * -1 : error, unwind the fop (errno is ENOMEM when the inode ctx is
+ *      unavailable, EAGAIN when a non-blocking fop conflicts)
+ * WIND_FOP: No conflict, wind the fop
+ * BLOCK_FOP: Found a conflicting lease, block the fop
+ */
+int
+check_lease_conflict(call_frame_t *frame, inode_t *inode, const char *lease_id,
+                     uint32_t fop_flags)
+{
+    lease_inode_ctx_t *lease_ctx = NULL;
+    gf_boolean_t may_block = _gf_false;
+    gf_boolean_t modifies_data = _gf_false;
+    gf_boolean_t no_lease = _gf_false;
+    int ret = WIND_FOP;
+
+    lease_ctx = lease_ctx_get(inode, frame->this);
+    if (!lease_ctx) {
+        gf_msg(frame->this->name, GF_LOG_WARNING, ENOMEM, LEASE_MSG_NO_MEM,
+               "Unable to create/get inode ctx");
+        ret = -1;
+        errno = ENOMEM;
+        goto out;
+    }
+
+    may_block = ((fop_flags & BLOCKING_FOP) != 0);
+    modifies_data = ((fop_flags & DATA_MODIFY_FOP) != 0);
+
+    pthread_mutex_lock(&lease_ctx->lock);
+    {
+        if (lease_ctx->lease_type == NONE) {
+            /* reported below, once the lock has been dropped */
+            no_lease = _gf_true;
+        } else if (__check_lease_conflict(frame, lease_ctx, lease_id,
+                                          modifies_data)) {
+            if (may_block) {
+                gf_msg_debug(frame->this->name, 0,
+                             "Fop: %s "
+                             "conflicting existing "
+                             "lease: %d, blocking the"
+                             "fop",
+                             gf_fop_list[frame->root->op],
+                             lease_ctx->lease_type);
+                ret = BLOCK_FOP;
+            } else {
+                gf_msg_debug(frame->this->name, 0,
+                             "Fop: %s "
+                             "conflicting existing "
+                             "lease: %d, sending "
+                             "EAGAIN",
+                             gf_fop_list[frame->root->op],
+                             lease_ctx->lease_type);
+                errno = EAGAIN;
+                ret = -1;
+            }
+        }
+    }
+    pthread_mutex_unlock(&lease_ctx->lock);
+
+    if (no_lease) {
+        gf_msg_debug(frame->this->name, 0,
+                     "No leases found continuing with the"
+                     " fop:%s",
+                     gf_fop_list[frame->root->op]);
+        ret = WIND_FOP;
+    }
+out:
+    return ret;
+}
+
+/* Drop every lease that client @client_uid holds on @inode.
+ *
+ * The per-type and total lease counters of the inode ctx are reduced by the
+ * client's share and its lease-id entries are destroyed. When the total
+ * drops to zero, the blocked fops are resumed.
+ *
+ * Returns 0, or -1 (errno = ENOMEM) when the inode ctx is unavailable.
+ */
+static int
+remove_clnt_leases(const char *client_uid, inode_t *inode, xlator_t *this)
+{
+    lease_inode_ctx_t *lease_ctx = NULL;
+    lease_id_entry_t *entry = NULL;
+    lease_id_entry_t *next = NULL;
+    gf_boolean_t last_lease_gone = _gf_false;
+    int ret = 0;
+    int type = 0;
+
+    lease_ctx = lease_ctx_get(inode, this);
+    if (!lease_ctx) {
+        gf_msg(this->name, GF_LOG_WARNING, ENOMEM, LEASE_MSG_INVAL_INODE_CTX,
+               "Unable to create/get inode ctx");
+        ret = -1;
+        errno = ENOMEM;
+        goto out;
+    }
+
+    pthread_mutex_lock(&lease_ctx->lock);
+    {
+        list_for_each_entry_safe(entry, next, &lease_ctx->lease_id_list,
+                                 lease_id_list)
+        {
+            if (strcmp(client_uid, entry->client_uid) != 0)
+                continue;
+
+            for (type = 0; type < GF_LEASE_MAX_TYPE; type++) {
+                lease_ctx->lease_type_cnt[type] -= entry->lease_type_cnt[type];
+            }
+            lease_ctx->lease_cnt -= entry->lease_cnt;
+            __destroy_lease_id_entry(entry);
+
+            if (lease_ctx->lease_cnt == 0) {
+                /* nothing left on the inode: stop scanning and let the
+                 * blocked fops go once the lock is released */
+                lease_ctx->blocked_fops_resuming = _gf_true;
+                last_lease_gone = _gf_true;
+                break;
+            }
+        }
+    }
+    pthread_mutex_unlock(&lease_ctx->lock);
+
+    if (last_lease_gone)
+        do_blocked_fops(this, lease_ctx);
+out:
+    return ret;
+}
+
+/* Remove all leases held by client @client_uid.
+ *
+ * The client's inode list is detached from priv->client_list under
+ * priv->mutex and the client entry is destroyed; the leases on each of
+ * those inodes are then removed without the mutex held.
+ *
+ * Returns 0, or -1 (errno = EINVAL) when the xlator has no private data.
+ */
+int
+cleanup_client_leases(xlator_t *this, const char *client_uid)
+{
+    struct list_head cleanup_list;
+    leases_private_t *priv = this->private;
+    lease_client_t *client = NULL;
+    lease_client_t *next_client = NULL;
+    lease_inode_t *l_inode = NULL;
+    lease_inode_t *next_inode = NULL;
+
+    if (!priv) {
+        errno = EINVAL;
+        return -1;
+    }
+
+    INIT_LIST_HEAD(&cleanup_list);
+
+    pthread_mutex_lock(&priv->mutex);
+    {
+        list_for_each_entry_safe(client, next_client, &priv->client_list,
+                                 client_list)
+        {
+            if (strcmp(client->client_uid, client_uid) != 0)
+                continue;
+
+            /* take over the inodes this client holds leases on */
+            list_for_each_entry_safe(l_inode, next_inode, &client->inode_list,
+                                     list)
+            {
+                list_del_init(&l_inode->list);
+                list_add_tail(&l_inode->list, &cleanup_list);
+            }
+            __destroy_lease_client(client);
+            break;
+        }
+    }
+    pthread_mutex_unlock(&priv->mutex);
+
+    list_for_each_entry_safe(l_inode, next_inode, &cleanup_list, list)
+    {
+        remove_clnt_leases(client_uid, l_inode->inode, this);
+        __destroy_lease_inode(l_inode);
+    }
+
+    return 0;
+}
+
+/* Forcefully drop every lease recorded in @lease_ctx after a recall has
+ * expired, and flag the blocked fops for resumption.
+ *
+ * The recall state (recall_in_progress, timer) is always cleared, even when
+ * no lease is left. The only caller visible in this file runs after the
+ * recall timer has fired, and recall_lease_timer_handler has freed that
+ * timer by then. Leaving the state untouched when lease_cnt is already 0
+ * (e.g. the leases were dropped by remove_clnt_leases) would keep
+ * recall_in_progress set forever and leave lease_ctx->timer dangling.
+ *
+ * The caller is expected to hold lease_ctx->lock.
+ */
+static void
+__remove_all_leases(xlator_t *this, lease_inode_ctx_t *lease_ctx)
+{
+    int i = 0;
+    lease_id_entry_t *lease_entry = NULL;
+    lease_id_entry_t *tmp = NULL;
+
+    /* the recall that led here is over, whatever is left to clean up */
+    lease_ctx->recall_in_progress = _gf_false;
+    lease_ctx->timer = NULL;
+
+    if (lease_ctx->lease_cnt == 0) {
+        /* No leases to remove. Return */
+        return;
+    }
+    __dump_leases_info(this, lease_ctx);
+
+    list_for_each_entry_safe(lease_entry, tmp, &lease_ctx->lease_id_list,
+                             lease_id_list)
+    {
+        lease_entry->lease_cnt = 0;
+        remove_from_clnt_list(this, lease_entry->client_uid, lease_ctx->inode);
+        __destroy_lease_id_entry(lease_entry);
+    }
+    INIT_LIST_HEAD(&lease_ctx->lease_id_list);
+    for (i = 0; i <= GF_LEASE_MAX_TYPE; i++)
+        lease_ctx->lease_type_cnt[i] = 0;
+    lease_ctx->lease_type = 0;
+    lease_ctx->lease_cnt = 0;
+    lease_ctx->blocked_fops_resuming = _gf_true;
+
+    /* TODO:
+     * - Mark the corresponding fd bad. Could be done on client side
+     * as a result of recall
+     * - Free the lease_ctx
+     */
+    return;
+}
+
+/* Remove every lease held on @inode and resume the fops blocked on it.
+ *
+ * Returns 0 (also when @inode fails validation), or -1 with
+ * errno = ENOMEM when the inode ctx is unavailable.
+ */
+static int
+remove_all_leases(xlator_t *this, inode_t *inode)
+{
+    lease_inode_ctx_t *lease_ctx = NULL;
+    int ret = 0;
+
+    GF_VALIDATE_OR_GOTO("leases", inode, out);
+
+    lease_ctx = lease_ctx_get(inode, this);
+    if (lease_ctx == NULL) {
+        gf_msg(this->name, GF_LOG_WARNING, ENOMEM, LEASE_MSG_INVAL_INODE_CTX,
+               "Unable to create/get inode ctx");
+        errno = ENOMEM;
+        ret = -1;
+        goto out;
+    }
+
+    /* drop the leases under the ctx lock ... */
+    pthread_mutex_lock(&lease_ctx->lock);
+    __remove_all_leases(this, lease_ctx);
+    pthread_mutex_unlock(&lease_ctx->lock);
+
+    /* ... and resume the blocked fops without it */
+    do_blocked_fops(this, lease_ctx);
+out:
+    return ret;
+}
+
+/* Thread body: waits for inodes whose lease recall has expired (queued on
+ * priv->recall_list by recall_lease_timer_handler) and removes all leases
+ * on them.
+ *
+ * @data is the leases xlator. The loop ends when priv->fini is set.
+ * Always returns NULL.
+ */
+void *
+expired_recall_cleanup(void *data)
+{
+    struct timespec sleep_till = {
+        0,
+    };
+    struct list_head recall_cleanup_list;
+    lease_inode_t *expired = NULL;
+    lease_inode_t *next = NULL;
+    leases_private_t *priv = NULL;
+    xlator_t *this = NULL;
+    time_t time_now;
+
+    GF_VALIDATE_OR_GOTO("leases", data, out);
+
+    this = data;
+    priv = this->private;
+
+    gf_msg_debug(this->name, 0, "Started the expired_recall_cleanup thread");
+
+    for (;;) {
+        time_now = gf_time();
+        pthread_mutex_lock(&priv->mutex);
+        {
+            if (priv->fini) {
+                pthread_mutex_unlock(&priv->mutex);
+                break;
+            }
+            INIT_LIST_HEAD(&recall_cleanup_list);
+
+            /* nothing queued: wait to be signalled, 600 seconds at most */
+            if (list_empty(&priv->recall_list)) {
+                sleep_till.tv_sec = time_now + 600;
+                pthread_cond_timedwait(&priv->cond, &priv->mutex, &sleep_till);
+            }
+
+            /* move whatever is queued to the private list so that it can
+             * be processed without priv->mutex */
+            if (!list_empty(&priv->recall_list)) {
+                gf_msg_debug(this->name, 0, "Found expired recalls");
+                list_for_each_entry_safe(expired, next, &priv->recall_list,
+                                         list)
+                {
+                    list_del_init(&expired->list);
+                    list_add_tail(&expired->list, &recall_cleanup_list);
+                }
+            }
+        }
+        pthread_mutex_unlock(&priv->mutex);
+
+        list_for_each_entry_safe(expired, next, &recall_cleanup_list, list)
+        {
+            gf_msg_debug(this->name, 0,
+                         "Recall lease was sent on"
+                         " inode:%p, recall timer has expired"
+                         " and clients haven't unlocked the lease"
+                         " hence cleaning up leases on the inode",
+                         expired->inode);
+            remove_all_leases(this, expired->inode);
+            /* no need to take priv->mutex lock as this entry
+             * reference is removed from global recall list. */
+            __destroy_lease_inode(expired);
+        }
+    }
+
+out:
+    return NULL;
+}
diff --git a/xlators/features/leases/src/leases-mem-types.h b/xlators/features/leases/src/leases-mem-types.h
new file mode 100644
index 00000000000..25664b44156
--- /dev/null
+++ b/xlators/features/leases/src/leases-mem-types.h
@@ -0,0 +1,27 @@
+/*
+   Copyright (c) 2015-2016 Red Hat, Inc. <http://www.redhat.com>
+   This file is part of GlusterFS.
+
+   This file is licensed to you under your choice of the GNU Lesser
+   General Public License, version 3 or any later version (LGPLv3 or
+   later), or the GNU General Public License, version 2 (GPLv2), in all
+   cases as published by the Free Software Foundation.
+*/
+
+#ifndef __LEASES_MEM_TYPES_H__
+#define __LEASES_MEM_TYPES_H__
+
+#include <glusterfs/mem-types.h>
+
+/* Memory-accounting type tags of the leases translator; the numbering
+ * starts right after the common types (gf_common_mt_end). */
+enum gf_leases_mem_types_ {
+    gf_leases_mt_private_t = gf_common_mt_end + 1,
+    gf_leases_mt_lease_client_t,
+    gf_leases_mt_lease_inode_t,
+    gf_leases_mt_fd_ctx_t, /* lease_fd_ctx_t allocated in leases_open */
+    gf_leases_mt_lease_inode_ctx_t,
+    gf_leases_mt_lease_id_entry_t,
+    gf_leases_mt_fop_stub_t,
+    gf_leases_mt_timer_data_t, /* lease_timer_data_t of the recall timer */
+    gf_leases_mt_end
+};
+#endif
diff --git a/xlators/features/leases/src/leases-messages.h b/xlators/features/leases/src/leases-messages.h
new file mode 100644
index 00000000000..da696b832de
--- /dev/null
+++ b/xlators/features/leases/src/leases-messages.h
@@ -0,0 +1,33 @@
+/*
+ Copyright (c) 2015-2016 Red Hat, Inc. <http://www.redhat.com>
+ This file is part of GlusterFS.
+
+ This file is licensed to you under your choice of the GNU Lesser
+ General Public License, version 3 or any later version (LGPLv3 or
+ later), or the GNU General Public License, version 2 (GPLv2), in all
+ cases as published by the Free Software Foundation.
+ */
+
+#ifndef _LEASES_MESSAGES_H_
+#define _LEASES_MESSAGES_H_
+
+#include <glusterfs/glfs-message-id.h>
+
+/* To add new message IDs, append new identifiers at the end of the list.
+ *
+ * Never remove a message ID. If it's not used anymore, you can rename it or
+ * leave it as it is, but not delete it. This is to prevent reutilization of
+ * IDs by other messages.
+ *
+ * The component name must match one of the entries defined in
+ * glfs-message-id.h.
+ */
+
+/* Message IDs of the LEASES component; they are passed as the message id
+ * argument of the gf_msg() calls in the leases translator. */
+GLFS_MSGID(LEASES, LEASE_MSG_NO_MEM, LEASE_MSG_RECALL_FAIL,
+           LEASE_MSG_INVAL_LEASE_ID, LEASE_MSG_INVAL_UNLK_LEASE,
+           LEASE_MSG_INVAL_INODE_CTX, LEASE_MSG_NOT_ENABLED,
+           LEASE_MSG_NO_TIMER_WHEEL, LEASE_MSG_CLNT_NOTFOUND,
+           LEASE_MSG_INODE_NOTFOUND, LEASE_MSG_INVAL_FD_CTX,
+           LEASE_MSG_INVAL_LEASE_TYPE);
+
+#endif /* !_LEASES_MESSAGES_H_ */
diff --git a/xlators/features/leases/src/leases.c b/xlators/features/leases/src/leases.c
new file mode 100644
index 00000000000..04bee50ba3f
--- /dev/null
+++ b/xlators/features/leases/src/leases.c
@@ -0,0 +1,1168 @@
+/*
+   Copyright (c) 2015-2016 Red Hat, Inc. <http://www.redhat.com>
+   This file is part of GlusterFS.
+
+   This file is licensed to you under your choice of the GNU Lesser
+   General Public License, version 3 or any later version (LGPLv3 or
+   later), or the GNU General Public License, version 2 (GPLv2), in all
+   cases as published by the Free Software Foundation.
+*/
+
+#ifndef _CONFIG_H
+#define _CONFIG_H
+#include "config.h"
+#endif
+
+#include "leases.h"
+
+/* open callback: hands the child's result (op_ret, op_errno, fd, xdata)
+ * back up the stack unchanged. Always returns 0. */
+int32_t
+leases_open_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+                int32_t op_ret, int32_t op_errno, fd_t *fd, dict_t *xdata)
+{
+    STACK_UNWIND_STRICT(open, frame, op_ret, op_errno, fd, xdata);
+
+    return 0;
+}
+
+/* open fop of the leases translator.
+ *
+ * Records the opening client's uid and lease id in a lease_fd_ctx_t attached
+ * to @fd, then checks the open against the leases held on the inode:
+ *  - no conflict: the open is wound to the child;
+ *  - conflict on a blocking fop: the open is handed to LEASE_BLOCK_FOP;
+ *  - error or non-blocking conflict: the open is unwound with -1 and the
+ *    matching errno.
+ * When leases are off, or for internal fops, the open is wound untouched.
+ *
+ * Always returns 0; the result travels through STACK_WIND/STACK_UNWIND.
+ */
+int32_t
+leases_open(call_frame_t *frame, xlator_t *this, loc_t *loc, int32_t flags,
+            fd_t *fd, dict_t *xdata)
+{
+    uint32_t fop_flags = 0;
+    int32_t op_errno = EINVAL;
+    int ret = 0;
+    lease_fd_ctx_t *fd_ctx = NULL;
+    char *lease_id = NULL;
+    gf_boolean_t ctx_attached = _gf_false;
+    uint64_t stale_ctx = 0;
+
+    EXIT_IF_LEASES_OFF(this, out);
+    EXIT_IF_INTERNAL_FOP(frame, xdata, out);
+
+    fd_ctx = GF_CALLOC(1, sizeof(*fd_ctx), gf_leases_mt_fd_ctx_t);
+    if (!fd_ctx) {
+        op_errno = ENOMEM;
+        goto err;
+    }
+
+    fd_ctx->client_uid = gf_strdup(frame->root->client->client_uid);
+    if (!fd_ctx->client_uid) {
+        op_errno = ENOMEM;
+        goto err;
+    }
+
+    GET_FLAGS(frame->root->op, flags);
+    GET_LEASE_ID(xdata, lease_id, frame->root->client->client_uid);
+    if (lease_id != NULL)
+        memcpy(fd_ctx->lease_id, lease_id, LEASE_ID_SIZE);
+    else
+        memset(fd_ctx->lease_id, 0, LEASE_ID_SIZE);
+
+    ret = fd_ctx_set(fd, this, (uint64_t)(uintptr_t)fd_ctx);
+    if (ret) {
+        op_errno = ENOMEM;
+        goto err;
+    }
+    ctx_attached = _gf_true;
+
+    ret = check_lease_conflict(frame, fd->inode, lease_id, fop_flags);
+    if (ret < 0) {
+        /* report what check_lease_conflict set (ENOMEM/EAGAIN), like the
+         * other fops in this file, instead of a blanket EINVAL */
+        op_errno = errno;
+        goto err;
+    } else if (ret == BLOCK_FOP)
+        goto block;
+    else if (ret == WIND_FOP)
+        goto out;
+
+block:
+    LEASE_BLOCK_FOP(fd->inode, open, frame, this, loc, flags, fd, xdata);
+    return 0;
+
+out:
+    STACK_WIND(frame, leases_open_cbk, FIRST_CHILD(this),
+               FIRST_CHILD(this)->fops->open, loc, flags, fd, xdata);
+    return 0;
+
+err:
+    if (fd_ctx) {
+        /* The ctx may already be attached to the fd; detach it before
+         * freeing so the fd is not left with a dangling pointer. */
+        if (ctx_attached)
+            fd_ctx_del(fd, this, &stale_ctx);
+        GF_FREE(fd_ctx->client_uid);
+        GF_FREE(fd_ctx);
+    }
+
+    STACK_UNWIND_STRICT(open, frame, -1, op_errno, NULL, NULL);
+    return 0;
+}
+
+/* writev callback: hands the child's result (op_ret, op_errno, prebuf,
+ * postbuf, xdata) back up the stack unchanged. Always returns 0. */
+int32_t
+leases_writev_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int op_ret,
+                  int op_errno, struct iatt *prebuf, struct iatt *postbuf,
+                  dict_t *xdata)
+{
+    STACK_UNWIND_STRICT(writev, frame, op_ret, op_errno, prebuf, postbuf,
+                        xdata);
+
+    return 0;
+}
+
+/* writev fop of the leases translator.
+ *
+ * Checks the write against the leases held on fd->inode and then winds it
+ * to the child (no conflict), hands it to LEASE_BLOCK_FOP (conflict on a
+ * blocking fop) or unwinds it with -1/errno (error or non-blocking
+ * conflict). When leases are off, or for internal fops, the write is wound
+ * untouched. Always returns 0.
+ */
+int32_t
+leases_writev(call_frame_t *frame, xlator_t *this, fd_t *fd,
+              struct iovec *vector, int count, off_t off, uint32_t flags,
+              struct iobref *iobref, dict_t *xdata)
+{
+    uint32_t fop_flags = 0;
+    char *lease_id = NULL;
+    int ret = 0;
+
+    EXIT_IF_LEASES_OFF(this, out);
+    EXIT_IF_INTERNAL_FOP(frame, xdata, out);
+
+    GET_LEASE_ID(xdata, lease_id, frame->root->client->client_uid);
+    GET_FLAGS(frame->root->op, fd->flags);
+
+    ret = check_lease_conflict(frame, fd->inode, lease_id, fop_flags);
+    if (ret < 0)
+        goto err;
+    if (ret == WIND_FOP)
+        goto out;
+
+    /* anything else (BLOCK_FOP): park the fop */
+    LEASE_BLOCK_FOP(fd->inode, writev, frame, this, fd, vector, count, off,
+                    flags, iobref, xdata);
+    return 0;
+
+out:
+    STACK_WIND(frame, leases_writev_cbk, FIRST_CHILD(this),
+               FIRST_CHILD(this)->fops->writev, fd, vector, count, off, flags,
+               iobref, xdata);
+    return 0;
+
+err:
+    STACK_UNWIND_STRICT(writev, frame, -1, errno, NULL, NULL, NULL);
+    return 0;
+}
+
+/* readv callback: hands the child's result (op_ret, op_errno, vector, count,
+ * stbuf, iobref, xdata) back up the stack unchanged. Always returns 0. */
+int32_t
+leases_readv_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int op_ret,
+                 int op_errno, struct iovec *vector, int count,
+                 struct iatt *stbuf, struct iobref *iobref, dict_t *xdata)
+{
+    STACK_UNWIND_STRICT(readv, frame, op_ret, op_errno, vector, count, stbuf,
+                        iobref, xdata);
+
+    return 0;
+}
+
+/* readv fop of the leases translator.
+ *
+ * Checks the read against the leases held on fd->inode and then winds it to
+ * the child (no conflict), hands it to LEASE_BLOCK_FOP (conflict on a
+ * blocking fop) or unwinds it with -1/errno (error or non-blocking
+ * conflict). When leases are off, or for internal fops, the read is wound
+ * untouched. Always returns 0.
+ */
+int32_t
+leases_readv(call_frame_t *frame, xlator_t *this, fd_t *fd, size_t size,
+             off_t offset, uint32_t flags, dict_t *xdata)
+{
+    uint32_t fop_flags = 0;
+    char *lease_id = NULL;
+    int ret = 0;
+
+    EXIT_IF_LEASES_OFF(this, out);
+    EXIT_IF_INTERNAL_FOP(frame, xdata, out);
+
+    GET_LEASE_ID(xdata, lease_id, frame->root->client->client_uid);
+    GET_FLAGS(frame->root->op, fd->flags);
+
+    ret = check_lease_conflict(frame, fd->inode, lease_id, fop_flags);
+    if (ret < 0)
+        goto err;
+    if (ret == WIND_FOP)
+        goto out;
+
+    /* anything else (BLOCK_FOP): park the fop */
+    LEASE_BLOCK_FOP(fd->inode, readv, frame, this, fd, size, offset, flags,
+                    xdata);
+    return 0;
+
+out:
+    STACK_WIND(frame, leases_readv_cbk, FIRST_CHILD(this),
+               FIRST_CHILD(this)->fops->readv, fd, size, offset, flags, xdata);
+    return 0;
+
+err:
+    STACK_UNWIND_STRICT(readv, frame, -1, errno, NULL, 0, NULL, NULL, NULL);
+    return 0;
+}
+
+/* lk callback: hands the child's result (op_ret, op_errno, lock, xdata)
+ * back up the stack unchanged. Always returns 0. */
+int32_t
+leases_lk_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret,
+              int32_t op_errno, struct gf_flock *lock, dict_t *xdata)
+{
+    STACK_UNWIND_STRICT(lk, frame, op_ret, op_errno, lock, xdata);
+
+    return 0;
+}
+
+/* lk fop of the leases translator.
+ *
+ * Checks the lock request against the leases held on fd->inode and then
+ * winds it to the child (no conflict), hands it to LEASE_BLOCK_FOP (conflict
+ * on a blocking fop) or unwinds it with -1/errno (error or non-blocking
+ * conflict). When leases are off, or for internal fops, the request is
+ * wound untouched. Always returns 0.
+ */
+int32_t
+leases_lk(call_frame_t *frame, xlator_t *this, fd_t *fd, int32_t cmd,
+          struct gf_flock *flock, dict_t *xdata)
+{
+    uint32_t fop_flags = 0;
+    char *lease_id = NULL;
+    int ret = 0;
+
+    EXIT_IF_LEASES_OFF(this, out);
+    EXIT_IF_INTERNAL_FOP(frame, xdata, out);
+
+    GET_LEASE_ID(xdata, lease_id, frame->root->client->client_uid);
+    GET_FLAGS_LK(cmd, flock->l_type, fd->flags);
+
+    ret = check_lease_conflict(frame, fd->inode, lease_id, fop_flags);
+    if (ret < 0)
+        goto err;
+    if (ret == WIND_FOP)
+        goto out;
+
+    /* anything else (BLOCK_FOP): park the fop */
+    LEASE_BLOCK_FOP(fd->inode, lk, frame, this, fd, cmd, flock, xdata);
+    return 0;
+
+out:
+    STACK_WIND(frame, leases_lk_cbk, FIRST_CHILD(this),
+               FIRST_CHILD(this)->fops->lk, fd, cmd, flock, xdata);
+    return 0;
+
+err:
+    STACK_UNWIND_STRICT(lk, frame, -1, errno, NULL, NULL);
+    return 0;
+}
+
+/* lease fop: processes a lease request on loc->inode and unwinds at once.
+ *
+ * On failure of process_lease_req the fop is unwound with op_ret -1 and
+ * op_errno set to the negated return value. When leases are off, or for
+ * internal fops, it is unwound with ENOSYS and an all-zero lease.
+ * Always returns 0.
+ */
+int32_t
+leases_lease(call_frame_t *frame, xlator_t *this, loc_t *loc,
+             struct gf_lease *lease, dict_t *xdata)
+{
+    struct gf_lease nullease = {
+        0,
+    };
+    struct gf_lease *reply = NULL;
+    int32_t op_ret = 0;
+    int32_t op_errno = 0;
+    int ret = 0;
+
+    EXIT_IF_LEASES_OFF(this, out);
+    EXIT_IF_INTERNAL_FOP(frame, xdata, out);
+
+    ret = process_lease_req(frame, this, loc->inode, lease);
+    if (ret < 0) {
+        op_ret = -1;
+        op_errno = -ret;
+    }
+    goto unwind;
+
+out:
+    gf_msg(this->name, GF_LOG_ERROR, EINVAL, LEASE_MSG_NOT_ENABLED,
+           "\"features/leases\" translator is not enabled. "
+           "You need to enable it for proper functioning of your "
+           "application");
+    op_ret = -1;
+    op_errno = ENOSYS;
+
+unwind:
+    /* an ENOSYS reply carries an empty lease instead of the request */
+    reply = (op_errno == ENOSYS) ? &nullease : lease;
+    STACK_UNWIND_STRICT(lease, frame, op_ret, op_errno, reply, xdata);
+    return 0;
+}
+
+int32_t
+leases_truncate_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+                    int op_ret, int op_errno, struct iatt *prebuf,
+                    struct iatt *postbuf, dict_t *xdata)
+{
+    STACK_UNWIND_STRICT(truncate, frame, op_ret, op_errno, prebuf, postbuf,
+                        xdata);
+    /* Pure pass-through: the child's truncate reply is unwound unchanged. */
+    return 0;
+}
+
+int32_t
+leases_truncate(call_frame_t *frame, xlator_t *this, loc_t *loc, off_t offset,
+                dict_t *xdata)
+{
+    uint32_t fop_flags = 0;
+    char *lease_id = NULL;
+    int ret = 0;
+    /* Gate truncate on leases; bypass when leases are off or fop is internal. */
+    EXIT_IF_LEASES_OFF(this, out);
+    EXIT_IF_INTERNAL_FOP(frame, xdata, out);
+    /* NOTE(review): fop_flags is presumably set by GET_FLAGS (not shown). */
+    GET_LEASE_ID(xdata, lease_id, frame->root->client->client_uid);
+    GET_FLAGS(frame->root->op, 0);
+
+    ret = check_lease_conflict(frame, loc->inode, lease_id, fop_flags);
+    if (ret < 0)
+        goto err;
+    else if (ret == BLOCK_FOP)
+        goto block;
+    else if (ret == WIND_FOP)
+        goto out;
+    /* Any other ret >= 0 falls through and is handled like BLOCK_FOP. */
+block:
+    LEASE_BLOCK_FOP(loc->inode, truncate, frame, this, loc, offset, xdata);
+    return 0;
+
+out:
+    STACK_WIND(frame, leases_truncate_cbk, FIRST_CHILD(this),
+               FIRST_CHILD(this)->fops->truncate, loc, offset, xdata);
+    return 0;
+    /* ret < 0 above. NOTE(review): confirm errno is set on that path. */
+err:
+    STACK_UNWIND_STRICT(truncate, frame, -1, errno, NULL, NULL, NULL);
+    return 0;
+}
+
+int32_t
+leases_setattr_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+                   int op_ret, int op_errno, struct iatt *statpre,
+                   struct iatt *statpost, dict_t *xdata)
+{
+    STACK_UNWIND_STRICT(setattr, frame, op_ret, op_errno, statpre, statpost,
+                        xdata);
+    /* Pure pass-through: the child's setattr reply is unwound unchanged. */
+    return 0;
+}
+
+int32_t
+leases_setattr(call_frame_t *frame, xlator_t *this, loc_t *loc,
+               struct iatt *stbuf, int32_t valid, dict_t *xdata)
+{
+    uint32_t fop_flags = 0;
+    char *lease_id = NULL;
+    int ret = 0;
+    /* Gate setattr on leases; bypass when leases are off or fop is internal. */
+    EXIT_IF_LEASES_OFF(this, out);
+    EXIT_IF_INTERNAL_FOP(frame, xdata, out);
+    /* NOTE(review): fop_flags is presumably set by GET_FLAGS (not shown). */
+    GET_LEASE_ID(xdata, lease_id, frame->root->client->client_uid);
+    GET_FLAGS(frame->root->op, 0);
+
+    ret = check_lease_conflict(frame, loc->inode, lease_id, fop_flags);
+    if (ret < 0)
+        goto err;
+    else if (ret == BLOCK_FOP)
+        goto block;
+    else if (ret == WIND_FOP)
+        goto out;
+    /* Any other ret >= 0 falls through and is handled like BLOCK_FOP. */
+block:
+    LEASE_BLOCK_FOP(loc->inode, setattr, frame, this, loc, stbuf, valid, xdata);
+    return 0;
+
+out:
+    STACK_WIND(frame, leases_setattr_cbk, FIRST_CHILD(this),
+               FIRST_CHILD(this)->fops->setattr, loc, stbuf, valid, xdata);
+    return 0;
+    /* ret < 0 above. NOTE(review): confirm errno is set on that path. */
+err:
+    STACK_UNWIND_STRICT(setattr, frame, -1, errno, NULL, NULL, NULL);
+    return 0;
+}
+
+int32_t
+leases_rename_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+                  int32_t op_ret, int32_t op_errno, struct iatt *stbuf,
+                  struct iatt *preoldparent, struct iatt *postoldparent,
+                  struct iatt *prenewparent, struct iatt *postnewparent,
+                  dict_t *xdata)
+{
+    STACK_UNWIND_STRICT(rename, frame, op_ret, op_errno, stbuf, preoldparent,
+                        postoldparent, prenewparent, postnewparent, xdata);
+    /* Pure pass-through: the child's rename reply is unwound unchanged. */
+    return 0;
+}
+
+int32_t
+leases_rename(call_frame_t *frame, xlator_t *this, loc_t *oldloc, loc_t *newloc,
+              dict_t *xdata)
+{
+    uint32_t fop_flags = 0;
+    char *lease_id = NULL;
+    int ret = 0;
+    /* Gate rename on leases; bypass when leases are off or fop is internal. */
+    EXIT_IF_LEASES_OFF(this, out);
+    EXIT_IF_INTERNAL_FOP(frame, xdata, out);
+
+    /* TODO: should the lease also be checked for newloc? Only oldloc is. */
+    GET_LEASE_ID(xdata, lease_id, frame->root->client->client_uid);
+    GET_FLAGS(frame->root->op, 0);
+    /* NOTE(review): fop_flags is presumably set by GET_FLAGS (not shown). */
+    ret = check_lease_conflict(frame, oldloc->inode, lease_id, fop_flags);
+    if (ret < 0)
+        goto err;
+    else if (ret == BLOCK_FOP)
+        goto block;
+    else if (ret == WIND_FOP)
+        goto out;
+    /* Any other ret >= 0 falls through and is handled like BLOCK_FOP. */
+block:
+    LEASE_BLOCK_FOP(oldloc->inode, rename, frame, this, oldloc, newloc, xdata);
+    return 0;
+
+out:
+    STACK_WIND(frame, leases_rename_cbk, FIRST_CHILD(this),
+               FIRST_CHILD(this)->fops->rename, oldloc, newloc, xdata);
+    return 0;
+    /* ret < 0 above. NOTE(review): confirm errno is set on that path. */
+err:
+    STACK_UNWIND_STRICT(rename, frame, -1, errno, NULL, NULL, NULL, NULL, NULL,
+                        NULL);
+    return 0;
+}
+
+int32_t
+leases_unlink_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int op_ret,
+                  int op_errno, struct iatt *preparent, struct iatt *postparent,
+                  dict_t *xdata)
+{
+    STACK_UNWIND_STRICT(unlink, frame, op_ret, op_errno, preparent, postparent,
+                        xdata);
+    /* Pure pass-through: the child's unlink reply is unwound unchanged. */
+    return 0;
+}
+
+int32_t
+leases_unlink(call_frame_t *frame, xlator_t *this, loc_t *loc, int xflag,
+              dict_t *xdata)
+{
+    uint32_t fop_flags = 0;
+    char *lease_id = NULL;
+    int ret = 0;
+    /* Gate unlink on leases; bypass when leases are off or fop is internal. */
+    EXIT_IF_LEASES_OFF(this, out);
+    EXIT_IF_INTERNAL_FOP(frame, xdata, out);
+    /* NOTE(review): fop_flags is presumably set by GET_FLAGS (not shown). */
+    GET_LEASE_ID(xdata, lease_id, frame->root->client->client_uid);
+    GET_FLAGS(frame->root->op, 0);
+
+    ret = check_lease_conflict(frame, loc->inode, lease_id, fop_flags);
+    if (ret < 0)
+        goto err;
+    else if (ret == BLOCK_FOP)
+        goto block;
+    else if (ret == WIND_FOP)
+        goto out;
+    /* Any other ret >= 0 falls through and is handled like BLOCK_FOP. */
+block:
+    LEASE_BLOCK_FOP(loc->inode, unlink, frame, this, loc, xflag, xdata);
+    return 0;
+
+out:
+    STACK_WIND(frame, leases_unlink_cbk, FIRST_CHILD(this),
+               FIRST_CHILD(this)->fops->unlink, loc, xflag, xdata);
+    return 0;
+    /* ret < 0 above. NOTE(review): confirm errno is set on that path. */
+err:
+    STACK_UNWIND_STRICT(unlink, frame, -1, errno, NULL, NULL, NULL);
+    return 0;
+}
+
+int32_t
+leases_link_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int op_ret,
+                int op_errno, inode_t *inode, struct iatt *stbuf,
+                struct iatt *preparent, struct iatt *postparent, dict_t *xdata)
+{
+    STACK_UNWIND_STRICT(link, frame, op_ret, op_errno, inode, stbuf, preparent,
+                        postparent, xdata);
+    /* Pure pass-through: the child's link reply is unwound unchanged. */
+    return 0;
+}
+
+int32_t
+leases_link(call_frame_t *frame, xlator_t *this, loc_t *oldloc, loc_t *newloc,
+            dict_t *xdata)
+{
+    uint32_t fop_flags = 0;
+    char *lease_id = NULL;
+    int ret = 0;
+    /* Gate link on leases; bypass when leases are off or fop is internal. */
+    EXIT_IF_LEASES_OFF(this, out);
+    EXIT_IF_INTERNAL_FOP(frame, xdata, out);
+    /* NOTE(review): fop_flags is presumably set by GET_FLAGS (not shown). */
+    GET_LEASE_ID(xdata, lease_id, frame->root->client->client_uid);
+    GET_FLAGS(frame->root->op, 0);
+    /* Only oldloc's inode is checked for a conflicting lease. */
+    ret = check_lease_conflict(frame, oldloc->inode, lease_id, fop_flags);
+    if (ret < 0)
+        goto err;
+    else if (ret == BLOCK_FOP)
+        goto block;
+    else if (ret == WIND_FOP)
+        goto out;
+    /* Any other ret >= 0 falls through and is handled like BLOCK_FOP. */
+block:
+    LEASE_BLOCK_FOP(oldloc->inode, link, frame, this, oldloc, newloc, xdata);
+    return 0;
+out:
+    STACK_WIND(frame, leases_link_cbk, FIRST_CHILD(this),
+               FIRST_CHILD(this)->fops->link, oldloc, newloc, xdata);
+    return 0;
+    /* ret < 0 above. NOTE(review): confirm errno is set on that path. */
+err:
+    STACK_UNWIND_STRICT(link, frame, -1, errno, NULL, NULL, NULL, NULL, NULL);
+    return 0;
+}
+
+int32_t
+leases_create_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int op_ret,
+                  int op_errno, fd_t *fd, inode_t *inode, struct iatt *stbuf,
+                  struct iatt *preparent, struct iatt *postparent,
+                  dict_t *xdata)
+{
+    STACK_UNWIND_STRICT(create, frame, op_ret, op_errno, fd, inode, stbuf,
+                        preparent, postparent, xdata);
+    /* Pure pass-through: the child's create reply is unwound unchanged. */
+    return 0;
+}
+
+int32_t
+leases_create(call_frame_t *frame, xlator_t *this, loc_t *loc, int32_t flags,
+              mode_t mode, mode_t umask, fd_t *fd, dict_t *xdata)
+{
+    uint32_t fop_flags = 0;
+    char *lease_id = NULL;
+    int ret = 0;
+    /* Gate create on leases; bypass when leases are off or fop is internal. */
+    EXIT_IF_LEASES_OFF(this, out);
+    EXIT_IF_INTERNAL_FOP(frame, xdata, out);
+    /* NOTE(review): fop_flags is presumably set by GET_FLAGS (not shown). */
+    GET_LEASE_ID(xdata, lease_id, frame->root->client->client_uid);
+    GET_FLAGS(frame->root->op, flags);
+    /* The conflict check is made against fd->inode, not loc->inode. */
+    ret = check_lease_conflict(frame, fd->inode, lease_id, fop_flags);
+    if (ret < 0)
+        goto err;
+    else if (ret == BLOCK_FOP)
+        goto block;
+    else if (ret == WIND_FOP)
+        goto out;
+    /* Any other ret >= 0 falls through and is handled like BLOCK_FOP. */
+block:
+    LEASE_BLOCK_FOP(fd->inode, create, frame, this, loc, flags, mode, umask, fd,
+                    xdata);
+    return 0;
+
+out:
+    STACK_WIND(frame, leases_create_cbk, FIRST_CHILD(this),
+               FIRST_CHILD(this)->fops->create, loc, flags, mode, umask, fd,
+               xdata);
+    return 0;
+    /* ret < 0 above. NOTE(review): confirm errno is set on that path. */
+err:
+    STACK_UNWIND_STRICT(create, frame, -1, errno, NULL, NULL, NULL, NULL, NULL,
+                        NULL);
+    return 0;
+}
+
+int32_t
+leases_fsync_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+                 int32_t op_ret, int32_t op_errno, struct iatt *prebuf,
+                 struct iatt *postbuf, dict_t *xdata)
+{
+    STACK_UNWIND_STRICT(fsync, frame, op_ret, op_errno, prebuf, postbuf, xdata);
+    return 0; /* pass-through only: the child's fsync reply is unchanged */
+}
+
+int32_t
+leases_fsync(call_frame_t *frame, xlator_t *this, fd_t *fd, int32_t flags,
+             dict_t *xdata)
+{
+    uint32_t fop_flags = 0;
+    char *lease_id = NULL;
+    int ret = 0;
+    /* Gate fsync on leases; bypass when leases are off or fop is internal. */
+    EXIT_IF_LEASES_OFF(this, out);
+    EXIT_IF_INTERNAL_FOP(frame, xdata, out);
+    /* NOTE(review): fop_flags is presumably set by GET_FLAGS (not shown). */
+    GET_LEASE_ID(xdata, lease_id, frame->root->client->client_uid);
+    GET_FLAGS(frame->root->op, fd->flags);
+
+    ret = check_lease_conflict(frame, fd->inode, lease_id, fop_flags);
+    if (ret < 0)
+        goto err;
+    else if (ret == BLOCK_FOP)
+        goto block;
+    else if (ret == WIND_FOP)
+        goto out;
+    /* Any other ret >= 0 falls through and is handled like BLOCK_FOP. */
+block:
+    LEASE_BLOCK_FOP(fd->inode, fsync, frame, this, fd, flags, xdata);
+    return 0;
+
+out:
+    STACK_WIND(frame, leases_fsync_cbk, FIRST_CHILD(this),
+               FIRST_CHILD(this)->fops->fsync, fd, flags, xdata);
+    return 0;
+err:
+    STACK_UNWIND_STRICT(fsync, frame, -1, errno, NULL, NULL, NULL);
+    return 0;
+}
+
+int32_t
+leases_ftruncate_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+                     int32_t op_ret, int32_t op_errno, struct iatt *prebuf,
+                     struct iatt *postbuf, dict_t *xdata)
+{
+    STACK_UNWIND_STRICT(ftruncate, frame, op_ret, op_errno, prebuf, postbuf,
+                        xdata);
+    return 0; /* pass-through only: the child's ftruncate reply is unchanged */
+}
+
+int32_t
+leases_ftruncate(call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset,
+                 dict_t *xdata)
+{
+    uint32_t fop_flags = 0;
+    char *lease_id = NULL;
+    int ret = 0;
+    /* Gate ftruncate on leases; bypass when leases are off or fop is internal. */
+    EXIT_IF_LEASES_OFF(this, out);
+    EXIT_IF_INTERNAL_FOP(frame, xdata, out);
+    /* NOTE(review): fop_flags is presumably set by GET_FLAGS (not shown). */
+    GET_LEASE_ID(xdata, lease_id, frame->root->client->client_uid);
+    GET_FLAGS(frame->root->op, 0); /* TODO:fd->flags?*/
+
+    ret = check_lease_conflict(frame, fd->inode, lease_id, fop_flags);
+    if (ret < 0)
+        goto err;
+    else if (ret == BLOCK_FOP)
+        goto block;
+    else if (ret == WIND_FOP)
+        goto out;
+    /* Any other ret >= 0 falls through and is handled like BLOCK_FOP. */
+block:
+    LEASE_BLOCK_FOP(fd->inode, ftruncate, frame, this, fd, offset, xdata);
+    return 0;
+
+out:
+    STACK_WIND(frame, leases_ftruncate_cbk, FIRST_CHILD(this),
+               FIRST_CHILD(this)->fops->ftruncate, fd, offset, xdata);
+    return 0;
+    /* ret < 0 above. NOTE(review): confirm errno is set on that path. */
+err:
+    STACK_UNWIND_STRICT(ftruncate, frame, -1, errno, NULL, NULL, NULL);
+    return 0;
+}
+
+int32_t
+leases_fsetattr_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+                    int32_t op_ret, int32_t op_errno, struct iatt *statpre,
+                    struct iatt *statpost, dict_t *xdata)
+{
+    STACK_UNWIND_STRICT(fsetattr, frame, op_ret, op_errno, statpre, statpost,
+                        xdata);
+    return 0; /* pass-through only: the child's fsetattr reply is unchanged */
+}
+
+int32_t
+leases_fsetattr(call_frame_t *frame, xlator_t *this, fd_t *fd,
+                struct iatt *stbuf, int32_t valid, dict_t *xdata)
+{
+    uint32_t fop_flags = 0;
+    char *lease_id = NULL;
+    int ret = 0;
+    /* Gate fsetattr on leases; bypass when leases are off or fop is internal. */
+    EXIT_IF_LEASES_OFF(this, out);
+    EXIT_IF_INTERNAL_FOP(frame, xdata, out);
+    /* NOTE(review): fop_flags is presumably set by GET_FLAGS (not shown). */
+    GET_LEASE_ID(xdata, lease_id, frame->root->client->client_uid);
+    GET_FLAGS(frame->root->op, fd->flags);
+
+    ret = check_lease_conflict(frame, fd->inode, lease_id, fop_flags);
+    if (ret < 0)
+        goto err;
+    else if (ret == BLOCK_FOP)
+        goto block;
+    else if (ret == WIND_FOP)
+        goto out;
+    /* Any other ret >= 0 falls through and is handled like BLOCK_FOP. */
+block:
+    LEASE_BLOCK_FOP(fd->inode, fsetattr, frame, this, fd, stbuf, valid, xdata);
+    return 0;
+
+out:
+    STACK_WIND(frame, leases_fsetattr_cbk, FIRST_CHILD(this),
+               FIRST_CHILD(this)->fops->fsetattr, fd, stbuf, valid, xdata);
+    return 0;
+    /* ret < 0 above. NOTE(review): confirm errno is set on that path. */
+err:
+    STACK_UNWIND_STRICT(fsetattr, frame, -1, errno, NULL, NULL, NULL);
+    return 0;
+}
+
+int32_t
+leases_fallocate_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+                     int32_t op_ret, int32_t op_errno, struct iatt *pre,
+                     struct iatt *post, dict_t *xdata)
+{
+    STACK_UNWIND_STRICT(fallocate, frame, op_ret, op_errno, pre, post, xdata);
+    /* Pure pass-through: the child's fallocate reply is unwound unchanged. */
+    return 0;
+}
+
+int32_t
+leases_fallocate(call_frame_t *frame, xlator_t *this, fd_t *fd, int32_t mode,
+                 off_t offset, size_t len, dict_t *xdata)
+{
+    uint32_t fop_flags = 0;
+    char *lease_id = NULL;
+    int ret = 0;
+    /* Gate fallocate on leases; bypass when leases are off or fop is internal. */
+    EXIT_IF_LEASES_OFF(this, out);
+    EXIT_IF_INTERNAL_FOP(frame, xdata, out);
+    /* NOTE(review): fop_flags is presumably set by GET_FLAGS (not shown). */
+    GET_LEASE_ID(xdata, lease_id, frame->root->client->client_uid);
+    GET_FLAGS(frame->root->op, fd->flags);
+
+    ret = check_lease_conflict(frame, fd->inode, lease_id, fop_flags);
+    if (ret < 0)
+        goto err;
+    else if (ret == BLOCK_FOP)
+        goto block;
+    else if (ret == WIND_FOP)
+        goto out;
+    /* Any other ret >= 0 falls through and is handled like BLOCK_FOP. */
+block:
+    LEASE_BLOCK_FOP(fd->inode, fallocate, frame, this, fd, mode, offset, len,
+                    xdata);
+    return 0;
+
+out:
+    STACK_WIND(frame, leases_fallocate_cbk, FIRST_CHILD(this),
+               FIRST_CHILD(this)->fops->fallocate, fd, mode, offset, len,
+               xdata);
+    return 0;
+    /* ret < 0 above. NOTE(review): confirm errno is set on that path. */
+err:
+    STACK_UNWIND_STRICT(fallocate, frame, -1, errno, NULL, NULL, NULL);
+    return 0;
+}
+
+int32_t
+leases_discard_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+                   int32_t op_ret, int32_t op_errno, struct iatt *pre,
+                   struct iatt *post, dict_t *xdata)
+{
+    STACK_UNWIND_STRICT(discard, frame, op_ret, op_errno, pre, post, xdata);
+    /* Pure pass-through: the child's discard reply is unwound unchanged. */
+    return 0;
+}
+
+int32_t
+leases_discard(call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset,
+               size_t len, dict_t *xdata)
+{
+    uint32_t fop_flags = 0;
+    char *lease_id = NULL;
+    int ret = 0;
+    /* Gate discard on leases; bypass when leases are off or fop is internal. */
+    EXIT_IF_LEASES_OFF(this, out);
+    EXIT_IF_INTERNAL_FOP(frame, xdata, out);
+    /* NOTE(review): fop_flags is presumably set by GET_FLAGS (not shown). */
+    GET_LEASE_ID(xdata, lease_id, frame->root->client->client_uid);
+    GET_FLAGS(frame->root->op, fd->flags);
+
+    ret = check_lease_conflict(frame, fd->inode, lease_id, fop_flags);
+    if (ret < 0)
+        goto err;
+    else if (ret == BLOCK_FOP)
+        goto block;
+    else if (ret == WIND_FOP)
+        goto out;
+    /* Any other ret >= 0 falls through and is handled like BLOCK_FOP. */
+block:
+    LEASE_BLOCK_FOP(fd->inode, discard, frame, this, fd, offset, len, xdata);
+    return 0;
+
+out:
+    STACK_WIND(frame, leases_discard_cbk, FIRST_CHILD(this),
+               FIRST_CHILD(this)->fops->discard, fd, offset, len, xdata);
+    return 0;
+    /* ret < 0 above. NOTE(review): confirm errno is set on that path. */
+err:
+    STACK_UNWIND_STRICT(discard, frame, -1, errno, NULL, NULL, NULL);
+    return 0;
+}
+
+int32_t
+leases_zerofill_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+                    int32_t op_ret, int32_t op_errno, struct iatt *pre,
+                    struct iatt *post, dict_t *xdata)
+{
+    STACK_UNWIND_STRICT(zerofill, frame, op_ret, op_errno, pre, post, xdata);
+    /* Pure pass-through: the child's zerofill reply is unwound unchanged. */
+    return 0;
+}
+
+int
+leases_zerofill(call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset,
+                off_t len, dict_t *xdata)
+{
+    uint32_t fop_flags = 0;
+    char *lease_id = NULL;
+    int ret = 0;
+    /* Gate zerofill on leases; bypass when leases are off or fop is internal. */
+    EXIT_IF_LEASES_OFF(this, out);
+    EXIT_IF_INTERNAL_FOP(frame, xdata, out);
+    /* NOTE(review): fop_flags is presumably set by GET_FLAGS (not shown). */
+    GET_LEASE_ID(xdata, lease_id, frame->root->client->client_uid);
+    GET_FLAGS(frame->root->op, fd->flags);
+
+    ret = check_lease_conflict(frame, fd->inode, lease_id, fop_flags);
+    if (ret < 0)
+        goto err;
+    else if (ret == BLOCK_FOP)
+        goto block;
+    else if (ret == WIND_FOP)
+        goto out;
+    /* Any other ret >= 0 falls through and is handled like BLOCK_FOP. */
+block:
+    LEASE_BLOCK_FOP(fd->inode, zerofill, frame, this, fd, offset, len, xdata);
+    return 0;
+
+out:
+    STACK_WIND(frame, leases_zerofill_cbk, FIRST_CHILD(this),
+               FIRST_CHILD(this)->fops->zerofill, fd, offset, len, xdata);
+    return 0;
+    /* ret < 0 above. NOTE(review): confirm errno is set on that path. */
+err:
+    STACK_UNWIND_STRICT(zerofill, frame, -1, errno, NULL, NULL, NULL);
+    return 0;
+}
+
+int
+leases_flush_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+                 int32_t op_ret, int32_t op_errno, dict_t *xdata)
+{
+    STACK_UNWIND_STRICT(flush, frame, op_ret, op_errno, xdata);
+    /* Pure pass-through: the child's flush reply is unwound unchanged. */
+    return 0;
+}
+
+int
+leases_flush(call_frame_t *frame, xlator_t *this, fd_t *fd, dict_t *xdata)
+{
+    uint32_t fop_flags = 0;
+    char *lease_id = NULL;
+    int ret = 0;
+    lease_fd_ctx_t *fd_ctx = NULL;
+    uint64_t ctx = 0;
+    /* Gate flush on leases; bypass when leases are off or fop is internal. */
+    EXIT_IF_LEASES_OFF(this, out);
+    EXIT_IF_INTERNAL_FOP(frame, xdata, out);
+
+    GET_LEASE_ID(xdata, lease_id, frame->root->client->client_uid);
+    GET_FLAGS(frame->root->op, fd->flags);
+
+    ret = check_lease_conflict(frame, fd->inode, lease_id, fop_flags);
+    if (ret < 0)
+        goto err;
+    else if (ret == BLOCK_FOP)
+        goto block;
+    else if (ret == WIND_FOP)
+        goto out;
+    /* Any other ret >= 0 falls through and is handled like BLOCK_FOP. */
+block:
+    LEASE_BLOCK_FOP(fd->inode, flush, frame, this, fd, xdata);
+    return 0;
+
+out:
+    /*
+     * Currently release is not called after the close fop from the
+     * application, hence the lease fd ctx is reset here.
+     * This is not really the right place, since flush can be called
+     * for reasons other than a close.
+     * TODO :
+     *     - Either identify that the flush comes from a close call on
+     *     the fd by the application.
+     *                      OR
+     *     - Find why release is not called after the last close call.
+     */
+    ret = fd_ctx_get(fd, this, &ctx);
+    if (ret == 0 && ctx) { /* same NULL guard as leases_release() */
+        fd_ctx = (lease_fd_ctx_t *)(long)ctx;
+        if (fd_ctx->client_uid) {
+            GF_FREE(fd_ctx->client_uid);
+            fd_ctx->client_uid = NULL;
+        }
+        memset(fd_ctx->lease_id, 0, LEASE_ID_SIZE);
+    }
+    STACK_WIND(frame, leases_flush_cbk, FIRST_CHILD(this),
+               FIRST_CHILD(this)->fops->flush, fd, xdata);
+    return 0;
+
+err:
+    /* Unwind as flush (not create): its callback only takes xdata. */
+    STACK_UNWIND_STRICT(flush, frame, -1, errno, NULL);
+    return 0;
+}
+
+int32_t
+mem_acct_init(xlator_t *this)
+{
+    int ret = -1;
+
+    /* Without an xlator there is nothing to set up accounting for. */
+    if (this == NULL)
+        return ret;
+
+    ret = xlator_mem_acct_init(this, gf_leases_mt_end + 1);
+    if (ret == 0)
+        return 0;
+
+    /* Accounting setup failed: warn and hand the error code back. */
+    gf_msg(this->name, GF_LOG_WARNING, ENOMEM, LEASE_MSG_NO_MEM,
+           "mem account init failed");
+    return ret;
+}
+
+static int
+leases_init_priv(xlator_t *this)
+{
+    leases_private_t *priv = this->private;
+    int ret = 0;
+
+    GF_ASSERT(priv);
+
+    /* Fetch the timer wheel only if this instance does not hold one yet. */
+    if (priv->timer_wheel == NULL) {
+        priv->timer_wheel = glusterfs_ctx_tw_get(this->ctx);
+        if (priv->timer_wheel == NULL)
+            return -1;
+    }
+
+    /* The recall thread is created at most once per private structure. */
+    if (priv->inited_recall_thr)
+        return ret;
+
+    ret = gf_thread_create(&priv->recall_thr, NULL, expired_recall_cleanup,
+                           this, "leasercl");
+    if (ret == 0)
+        priv->inited_recall_thr = _gf_true;
+
+    /* A non-zero value here is gf_thread_create()'s failure code. */
+    return ret;
+}
+
+int
+reconfigure(xlator_t *this, dict_t *options)
+{
+    leases_private_t *priv = NULL;
+    int ret = -1;
+    /* ret is -1 until the end; GF_OPTION_RECONF is given out as its bail-out. */
+    priv = this->private;
+    GF_ASSERT(priv);
+
+    /* TODO: Toggling "leases" at reconfigure time is not supported yet.
+     * Enabling it would not be an issue, but disabling it needs more
+     * work, such as recalling all the existing leases and waiting for
+     * all of them to be unlocked. Until that exists the code below
+     * stays disabled and only the recall timeout is re-read.
+
+    GF_OPTION_RECONF ("leases", priv->leases_enabled,
+                      options, bool, out);
+
+    if (priv->leases_enabled) {
+            ret = leases_init_priv (this);
+            if (ret)
+                    goto out;
+    }
+    */
+
+    GF_OPTION_RECONF("lease-lock-recall-timeout", priv->recall_lease_timeout,
+                     options, int32, out);
+
+    ret = 0;
+out:
+    return ret;
+}
+
+int
+init(xlator_t *this)
+{
+    int ret = -1;
+    leases_private_t *priv = NULL;
+    /* Build the private state; every failure below unwinds through out. */
+    priv = GF_CALLOC(1, sizeof(*priv), gf_leases_mt_private_t);
+    if (!priv) {
+        gf_msg(this->name, GF_LOG_WARNING, ENOMEM, LEASE_MSG_NO_MEM,
+               "Leases init failed");
+        goto out;
+    }
+
+    GF_OPTION_INIT("leases", priv->leases_enabled, bool, out);
+    GF_OPTION_INIT("lease-lock-recall-timeout", priv->recall_lease_timeout,
+                   int32, out);
+    pthread_mutex_init(&priv->mutex, NULL);
+    pthread_cond_init(&priv->cond, NULL); /* fini() broadcasts on this */
+    INIT_LIST_HEAD(&priv->client_list);
+    INIT_LIST_HEAD(&priv->recall_list);
+    this->private = priv;
+
+    if (priv->leases_enabled) {
+        ret = leases_init_priv(this);
+        if (ret)
+            goto out;
+    }
+    ret = 0;
+out:
+    if (ret) {
+        /* leases_init_priv() can fail after taking the timer-wheel ref. */
+        if (priv && priv->timer_wheel)
+            glusterfs_ctx_tw_put(this->ctx);
+        GF_FREE(priv);
+        this->private = NULL;
+    }
+    return ret;
+}
+
+void
+fini(xlator_t *this)
+{
+    leases_private_t *priv = NULL;
+    /* Undo init()/leases_init_priv(); a NULL this->private means no-op. */
+    priv = this->private;
+    if (!priv) {
+        return;
+    }
+    this->private = NULL;
+    /* NOTE(review): fini flag and broadcast happen without priv->mutex held. */
+    priv->fini = _gf_true;
+    pthread_cond_broadcast(&priv->cond);
+    if (priv->recall_thr) {
+        gf_thread_cleanup_xint(priv->recall_thr);
+        priv->recall_thr = 0;
+        priv->inited_recall_thr = _gf_false;
+    }
+    /* Drop the timer-wheel reference taken in leases_init_priv(). */
+    if (priv->timer_wheel) {
+        glusterfs_ctx_tw_put(this->ctx);
+    }
+    /* NOTE(review): priv->mutex is not destroyed before priv is freed. */
+    GF_FREE(priv);
+    return;
+}
+
+static int
+leases_forget(xlator_t *this, inode_t *inode)
+{
+    /* TODO: leases_cleanup_inode_ctx(this, inode); nothing is cleaned up yet. */
+    return 0;
+}
+
+static int
+leases_release(xlator_t *this, fd_t *fd)
+{
+    int ret = -1;
+    uint64_t tmp = 0;
+    lease_fd_ctx_t *fd_ctx = NULL;
+    /* fd release hook: detach and free this xlator's per-fd lease context. */
+    if (fd == NULL)
+        goto out;
+    /* Returns -1 for a NULL fd, otherwise the result of fd_ctx_del(). */
+    gf_log(this->name, GF_LOG_TRACE, "Releasing all leases with fd %p", fd);
+    ret = fd_ctx_del(fd, this, &tmp);
+    if (ret) {
+        gf_log(this->name, GF_LOG_DEBUG, "Could not get fdctx");
+        goto out;
+    }
+    fd_ctx = (lease_fd_ctx_t *)(long)tmp;
+    if (fd_ctx) {
+        /* client_uid is its own allocation (leases_flush() frees it too). */
+        GF_FREE(fd_ctx->client_uid);
+        GF_FREE(fd_ctx);
+    }
+out:
+    return ret;
+}
+
+static int
+leases_clnt_disconnect_cbk(xlator_t *this, client_t *client)
+{
+    int ret = 0;
+    /* With leases off this is a no-op that returns 0. */
+    EXIT_IF_LEASES_OFF(this, out);
+    /* Otherwise pass the client's uid to cleanup_client_leases(). */
+    ret = cleanup_client_leases(this, client->client_uid);
+out:
+    return ret;
+}
+
+struct xlator_fops fops = { /* fops this xlator provides handlers for */
+    /* Metadata modifying fops */
+    .fsetattr = leases_fsetattr,
+    .setattr = leases_setattr,
+
+    /* File Data reading fops */
+    .open = leases_open,
+    .readv = leases_readv,
+
+    /* File Data modifying fops */
+    .truncate = leases_truncate,
+    .ftruncate = leases_ftruncate,
+    .writev = leases_writev,
+    .zerofill = leases_zerofill,
+    .fallocate = leases_fallocate,
+    .discard = leases_discard,
+    .lk = leases_lk,
+    .fsync = leases_fsync,
+    .flush = leases_flush,
+    .lease = leases_lease,
+
+    /* Directory Data modifying fops */
+    .create = leases_create,
+    .rename = leases_rename,
+    .unlink = leases_unlink,
+    .link = leases_link,
+
+#ifdef NOT_SUPPORTED
+    /* Internal lock fops; only compiled when NOT_SUPPORTED is defined */
+    .inodelk = leases_inodelk,
+    .finodelk = leases_finodelk,
+    .entrylk = leases_entrylk,
+    .fentrylk = leases_fentrylk,
+
+    /* Internal special fops; likewise gated by NOT_SUPPORTED */
+    .xattrop = leases_xattrop,
+    .fxattrop = leases_fxattrop,
+#endif
+};
+
+struct xlator_cbks cbks = {
+    .forget = leases_forget,   /* currently does nothing and returns 0 */
+    .release = leases_release, /* frees the per-fd lease context */
+    .client_disconnect = leases_clnt_disconnect_cbk,
+};
+
+struct volume_options options[] = {
+    {.key = {"leases"},
+     .type = GF_OPTION_TYPE_BOOL,
+     .default_value = "off", /* leases stay disabled unless turned on */
+     .op_version = {GD_OP_VERSION_3_8_0},
+     .flags = OPT_FLAG_SETTABLE | OPT_FLAG_DOC,
+     .description = "When \"on\", enables leases support"},
+    {.key = {"lease-lock-recall-timeout"},
+     .type = GF_OPTION_TYPE_INT,
+     .default_value = RECALL_LEASE_LK_TIMEOUT, /* "60", defined in leases.h */
+     .op_version = {GD_OP_VERSION_3_8_0},
+     .flags = OPT_FLAG_SETTABLE | OPT_FLAG_DOC,
+     .description = "After 'timeout' seconds since the recall_lease"
+                    " request has been sent to the client, the lease lock"
+                    " will be forcefully purged by the server."},
+    {.key = {NULL}}, /* terminating entry */
+};
+
+xlator_api_t xlator_api = { /* ties together the hooks and tables above */
+    .init = init,
+    .fini = fini,
+    .reconfigure = reconfigure,
+    .mem_acct_init = mem_acct_init,
+    .op_version = {1}, /* Present from the initial version */
+    .fops = &fops,
+    .cbks = &cbks,
+    .options = options,
+    .identifier = "leases",
+    .category = GF_MAINTAINED,
+};
diff --git a/xlators/features/leases/src/leases.h b/xlators/features/leases/src/leases.h
new file mode 100644
index 00000000000..a6e8a6824cc
--- /dev/null
+++ b/xlators/features/leases/src/leases.h
@@ -0,0 +1,259 @@
+/*
+  Copyright (c) 2015-2016 Red Hat, Inc. <http://www.redhat.com>
+  This file is part of GlusterFS.
+
+  This file is licensed to you under your choice of the GNU Lesser
+  General Public License, version 3 or any later version (LGPLv3 or
+  later), or the GNU General Public License, version 2 (GPLv2), in all
+  cases as published by the Free Software Foundation.
+*/
+
+#ifndef _LEASES_H
+#define _LEASES_H
+
+#ifndef _CONFIG_H
+#define _CONFIG_H
+#include "config.h"
+#endif
+
+#include <glusterfs/common-utils.h>
+#include <glusterfs/glusterfs.h>
+#include <glusterfs/xlator.h>
+#include <glusterfs/call-stub.h>
+#include <glusterfs/logging.h>
+#include <glusterfs/client_t.h>
+#include <glusterfs/lkowner.h>
+#include <glusterfs/locking.h>
+#include <glusterfs/upcall-utils.h>
+#include "timer-wheel.h"
+#include "leases-mem-types.h"
+#include "leases-messages.h"
+
+/* The time period, in seconds, for which a client lease lock will be kept
+ * after it has been recalled for the first time (kept as a string). */
+#define RECALL_LEASE_LK_TIMEOUT "60"
+
+#define DATA_MODIFY_FOP 0x0001 /* fop_flags bit, see GET_FLAGS/GET_FLAGS_LK */
+#define BLOCKING_FOP 0x0002    /* fop_flags bit, see GET_FLAGS/GET_FLAGS_LK */
+/* NOTE(review): presumably the verdicts of check_lease_conflict() -- confirm */
+#define BLOCK_FOP 0x0001
+#define WIND_FOP 0x0002
+
+#define EXIT_IF_LEASES_OFF(this, label)                                        \
+    do { /* jump to label when is_leases_enabled(this) is false */            \
+        if (!is_leases_enabled(this))                                          \
+            goto label;                                                        \
+    } while (0)
+
+#define EXIT_IF_INTERNAL_FOP(frame, xdata, label)                              \
+    do { /* jump to label for negative-pid frames and flagged xdata */         \
+        if (frame->root->pid < 0)                                              \
+            goto label;                                                        \
+        if (xdata && dict_get(xdata, GLUSTERFS_INTERNAL_FOP_KEY))              \
+            goto label;                                                        \
+    } while (0)
+
+#define GET_LEASE_ID(xdata, lease_id, client_uid)                              \
+    do { /* a missing "lease-id" key is only logged, never an error */         \
+        int ret_val = -1; /* local to the macro; never propagated */           \
+        ret_val = dict_get_bin(xdata, "lease-id", (void **)&lease_id);         \
+        if (ret_val) {                                                         \
+            ret_val = 0;                                                       \
+            gf_msg_debug("leases", 0, "Lease id is not set for client:%s",     \
+                         client_uid);                                          \
+        }                                                                      \
+    } while (0)
+
+#define GET_FLAGS(fop, fd_flags)                                               \
+    do { /* writes the caller's local fop_flags; it is not a parameter */      \
+        if ((fd_flags & (O_WRONLY | O_RDWR)) && fop == GF_FOP_OPEN)            \
+            fop_flags = DATA_MODIFY_FOP;                                       \
+        /* the fops listed below always set DATA_MODIFY_FOP */                 \
+        if (fop == GF_FOP_UNLINK || fop == GF_FOP_RENAME ||                    \
+            fop == GF_FOP_TRUNCATE || fop == GF_FOP_FTRUNCATE ||               \
+            fop == GF_FOP_FLUSH || fop == GF_FOP_FSYNC ||                      \
+            fop == GF_FOP_WRITE || fop == GF_FOP_FALLOCATE ||                  \
+            fop == GF_FOP_DISCARD || fop == GF_FOP_ZEROFILL ||                 \
+            fop == GF_FOP_SETATTR || fop == GF_FOP_FSETATTR ||                 \
+            fop == GF_FOP_LINK)                                                \
+            fop_flags = DATA_MODIFY_FOP;                                       \
+        /* neither O_NONBLOCK nor O_NDELAY set: add BLOCKING_FOP */            \
+        if (!(fd_flags & (O_NONBLOCK | O_NDELAY)))                             \
+            fop_flags |= BLOCKING_FOP;                                         \
+                                                                               \
+    } while (0)
+
+#define GET_FLAGS_LK(cmd, l_type, fd_flags)                                    \
+    do { /* writes the caller's local fop_flags; it is not a parameter */      \
+        /* TODO: handle F_RESLK_LCK and other glusterfs_lk_recovery_cmds_t */  \
+        if ((cmd == F_SETLKW || cmd == F_SETLKW64 || cmd == F_SETLK ||         \
+             cmd == F_SETLK64) &&                                              \
+            l_type == F_WRLCK)                                                 \
+            fop_flags = DATA_MODIFY_FOP;                                       \
+        /* NOTE(review): GET_FLAGS tests !(fd_flags & ...) here -- confirm */  \
+        if (fd_flags & (O_NONBLOCK | O_NDELAY) &&                              \
+            (cmd == F_SETLKW || cmd == F_SETLKW64))                            \
+            fop_flags |= BLOCKING_FOP;                                         \
+                                                                               \
+    } while (0)
+
+#define LEASE_BLOCK_FOP(inode, fop_name, frame, this, params...)               \
+    do { /* needs 'int ret' and an 'err' label in the calling function */      \
+        call_stub_t *__stub = NULL;                                            \
+        fop_stub_t *blk_fop = NULL;                                            \
+        lease_inode_ctx_t *lease_ctx = NULL;                                   \
+        /* the stub captures frame, default_<fop>_resume and params */         \
+        __stub = fop_##fop_name##_stub(frame, default_##fop_name##_resume,     \
+                                       params);                                \
+        if (!__stub) {                                                         \
+            gf_msg(this->name, GF_LOG_WARNING, ENOMEM, LEASE_MSG_NO_MEM,       \
+                   "Unable to create stub");                                   \
+            ret = -ENOMEM;                                                     \
+            goto __out;                                                        \
+        }                                                                      \
+                                                                               \
+        blk_fop = GF_CALLOC(1, sizeof(*blk_fop), gf_leases_mt_fop_stub_t);     \
+        if (!blk_fop) {                                                        \
+            gf_msg(this->name, GF_LOG_WARNING, ENOMEM, LEASE_MSG_NO_MEM,       \
+                   "Unable to create lease fop stub");                         \
+            ret = -ENOMEM;                                                     \
+            goto __out;                                                        \
+        }                                                                      \
+                                                                               \
+        lease_ctx = lease_ctx_get(inode, this);                                \
+        if (!lease_ctx) {                                                      \
+            gf_msg(this->name, GF_LOG_WARNING, ENOMEM, LEASE_MSG_NO_MEM,       \
+                   "Unable to create/get inode ctx");                          \
+            ret = -ENOMEM;                                                     \
+            goto __out;                                                        \
+        }                                                                      \
+        /* queue the stub on the inode's blocked_list under its lock */        \
+        blk_fop->stub = __stub;                                                \
+        pthread_mutex_lock(&lease_ctx->lock);                                  \
+        {                                                                      \
+            /*TODO: If the lease is unlocked btw check lease conflict and      \
+             * by now, then this fop shouldn't be added to the blocked fop     \
+             * list, can use generation number for the same?*/                 \
+            list_add_tail(&blk_fop->list, &lease_ctx->blocked_list);           \
+        }                                                                      \
+        pthread_mutex_unlock(&lease_ctx->lock);                                \
+        /* success also lands here: ret must not be negative on entry */       \
+    __out: /* function-scope label: expand at most once per function */        \
+        if (ret < 0) {                                                         \
+            gf_msg(this->name, GF_LOG_WARNING, ENOMEM, LEASE_MSG_NO_MEM,       \
+                   "Unable to create stub for blocking the fop:%s (%s)",       \
+                   gf_fop_list[frame->root->op], strerror(ENOMEM));            \
+            if (__stub != NULL) {                                              \
+                call_stub_destroy(__stub);                                     \
+            }                                                                  \
+            GF_FREE(blk_fop);                                                  \
+            goto err;                                                          \
+        }                                                                      \
+    } while (0)
+
+struct _leases_private {
+    struct list_head client_list;
+    struct list_head recall_list;
+    struct tvec_base *timer_wheel; /* timer wheel where the recall request
+                                      is queued and waits for unlock/expiry */
+    pthread_t recall_thr;
+    pthread_mutex_t mutex;
+    pthread_cond_t cond;
+    int32_t recall_lease_timeout; /* presumably seconds -- confirm */
+    gf_boolean_t inited_recall_thr;
+    gf_boolean_t fini;
+    gf_boolean_t leases_enabled; /* presumably the "leases" option -- confirm */
+
+    char _pad[1]; /* manual padding */
+};
+typedef struct _leases_private leases_private_t;
+
+struct _lease_client {
+    char *client_uid;
+    struct list_head client_list; /* node in priv->client_list? -- confirm */
+    struct list_head inode_list;  /* presumably lease_inode_t entries */
+};
+typedef struct _lease_client lease_client_t;
+
+struct _lease_inode {
+    inode_t *inode; /* NOTE(review): confirm whether a ref is held on it */
+    struct list_head
+        list; /* This can be part of both inode_list and recall_list */
+};
+typedef struct _lease_inode lease_inode_t;
+
+struct _lease_fd_ctx {
+    char *client_uid;
+    char lease_id[LEASE_ID_SIZE]; /* fixed-size id stored inline */
+};
+typedef struct _lease_fd_ctx lease_fd_ctx_t;
+
+struct _lease_inode_ctx {
+    struct list_head lease_id_list; /* clients that have taken leases */
+    int lease_type_cnt[GF_LEASE_MAX_TYPE + 1];
+    uint64_t lease_cnt;            /* Total number of leases on this inode */
+    uint64_t openfd_cnt;           /* number of fds open */
+    struct list_head blocked_list; /* List of fops blocked until the
+                                      lease recall is complete */
+    inode_t *inode;                /* this represents the inode on which the
+                                      lock was taken, required mainly during
+                                      disconnect cleanup */
+    struct gf_tw_timer_list *timer;
+    pthread_mutex_t lock; /* held by LEASE_BLOCK_FOP while it appends to
+                             blocked_list */
+    int lease_type;                  /* Types of leases acquired */
+    gf_boolean_t recall_in_progress; /* if lease recall is sent on this inode */
+    gf_boolean_t blocked_fops_resuming; /* if blocked fops are being resumed */
+    char _pad[2]; /* manual padding */
+};
+typedef struct _lease_inode_ctx lease_inode_ctx_t;
+
+struct _lease_id_entry {
+    struct list_head lease_id_list; /* node in an inode's list? -- confirm */
+    char lease_id[LEASE_ID_SIZE];
+    char *client_uid;                          /* uid of the client that has
+                                                  taken the lease */
+    int lease_type_cnt[GF_LEASE_MAX_TYPE + 1]; /* count of each lease type */
+    uint64_t lease_cnt; /* Number of leases taken under the
+                           given lease id */
+    time_t recall_time; /* time @ which recall was sent */
+    int lease_type;     /* Union of all the leases taken
+                           under the given lease id */
+    char _pad[4];       /* manual padding */
+};
+typedef struct _lease_id_entry lease_id_entry_t;
+
+/* Required? as stub itself will have list */
+struct __fop_stub {
+    struct list_head list; /* linked into lease_inode_ctx_t.blocked_list */
+    call_stub_t *stub;     /* the stub built by LEASE_BLOCK_FOP */
+};
+typedef struct __fop_stub fop_stub_t;
+
+struct __lease_timer_data {
+    inode_t *inode; /* presumably the inode being recalled -- confirm */
+    xlator_t *this;
+};
+typedef struct __lease_timer_data lease_timer_data_t;
+
+gf_boolean_t
+is_leases_enabled(xlator_t *this);
+
+lease_inode_ctx_t *
+lease_ctx_get(inode_t *inode, xlator_t *this);
+
+int
+process_lease_req(call_frame_t *frame, xlator_t *this, inode_t *inode,
+                  struct gf_lease *lease);
+
+int
+check_lease_conflict(call_frame_t *frame, inode_t *inode, const char *lease_id,
+                     uint32_t fop_flags);
+
+int
+cleanup_client_leases(xlator_t *this, const char *client_uid);
+
+void *
+expired_recall_cleanup(void *data);
+
+#endif /* _LEASES_H */
diff --git a/xlators/features/locks/src/Makefile.am b/xlators/features/locks/src/Makefile.am
index ec4a953eb91..0b174c19d2d 100644
--- a/xlators/features/locks/src/Makefile.am
+++ b/xlators/features/locks/src/Makefile.am
@@ -1,20 +1,29 @@
+if WITH_SERVER
 xlator_LTLIBRARIES = locks.la
+endif
 xlatordir = $(libdir)/glusterfs/$(PACKAGE_VERSION)/xlator/features
 
-locks_la_LDFLAGS = -module -avoidversion
+locks_la_LDFLAGS = -module $(GF_XLATOR_DEFAULT_LDFLAGS)
 
-locks_la_SOURCES = common.c posix.c internal.c
-locks_la_LIBADD = $(top_builddir)/libglusterfs/src/libglusterfs.la 
+locks_la_SOURCES = common.c posix.c entrylk.c inodelk.c reservelk.c \
+	clear.c
 
-noinst_HEADERS = locks.h common.h
+locks_la_LIBADD = $(top_builddir)/libglusterfs/src/libglusterfs.la
 
-AM_CFLAGS = -fPIC -D_FILE_OFFSET_BITS=64 -D_GNU_SOURCE -Wall -fno-strict-aliasing -D$(GF_HOST_OS) \
-	-I$(top_srcdir)/libglusterfs/src $(GF_CFLAGS) -shared -nostartfiles
+noinst_HEADERS = locks.h common.h locks-mem-types.h clear.h pl-messages.h
+
+AM_CPPFLAGS = $(GF_CPPFLAGS) -I$(top_srcdir)/libglusterfs/src \
+	-I$(top_srcdir)/rpc/xdr/src -I$(top_builddir)/rpc/xdr/src
+
+
+AM_CFLAGS = -Wall -fno-strict-aliasing $(GF_CFLAGS)
 
 CLEANFILES = 
 
+if WITH_SERVER
 uninstall-local:
 	rm -f $(DESTDIR)$(xlatordir)/posix-locks.so
 
 install-data-hook:
-	ln -sf locks.so $(DESTDIR)$(xlatordir)/posix-locks.so
-\ No newline at end of file
+	ln -sf locks.so $(DESTDIR)$(xlatordir)/posix-locks.so
+endif
diff --git a/xlators/features/locks/src/clear.c b/xlators/features/locks/src/clear.c
new file mode 100644
index 00000000000..ab1eac68a53
--- /dev/null
+++ b/xlators/features/locks/src/clear.c
@@ -0,0 +1,460 @@
+/*
+   Copyright (c) 2006-2012 Red Hat, Inc. <http://www.redhat.com>
+   This file is part of GlusterFS.
+
+   This file is licensed to you under your choice of the GNU Lesser
+   General Public License, version 3 or any later version (LGPLv3 or
+   later), or the GNU General Public License, version 2 (GPLv2), in all
+   cases as published by the Free Software Foundation.
+*/
+#include <unistd.h>
+#include <fcntl.h>
+#include <limits.h>
+#include <pthread.h>
+
+#include <glusterfs/glusterfs.h>
+#include <glusterfs/compat.h>
+#include <glusterfs/xlator.h>
+#include <glusterfs/logging.h>
+#include <glusterfs/common-utils.h>
+
+#include "locks.h"
+#include "common.h"
+#include <glusterfs/statedump.h>
+#include "clear.h"
+
+const char *clrlk_type_names[CLRLK_TYPE_MAX] = { /* indexed by clrlk_type */
+    [CLRLK_INODE] = "inode",
+    [CLRLK_ENTRY] = "entry",
+    [CLRLK_POSIX] = "posix",
+};
+
+int
+clrlk_get_kind(char *kind)
+{
+    /* Slot 0 is a filler: the search below starts at CLRLK_BLOCKED. */
+    char *names[CLRLK_KIND_MAX] = {"dummy", "blocked", "granted", "all"};
+    int idx = CLRLK_BLOCKED;
+
+    /* First exact match wins; CLRLK_KIND_MAX reports "no such kind". */
+    while (idx < CLRLK_KIND_MAX) {
+        if (strcmp(names[idx], kind) == 0)
+            return idx;
+        idx++;
+    }
+
+    return CLRLK_KIND_MAX;
+}
+
+int
+clrlk_get_type(char *type)
+{
+    /* Keywords in slot order; the matching slot index is returned. */
+    char *names[CLRLK_TYPE_MAX] = {"inode", "entry", "posix"};
+    int idx = CLRLK_INODE;
+
+    /* First exact match wins; CLRLK_TYPE_MAX reports "no such type". */
+    while (idx < CLRLK_TYPE_MAX) {
+        if (strcmp(names[idx], type) == 0)
+            return idx;
+        idx++;
+    }
+
+    return CLRLK_TYPE_MAX;
+}
+
+int
+clrlk_get_lock_range(char *range_str, struct gf_flock *ulock,
+                     gf_boolean_t *chk_range)
+{
+    int fields = 0;
+
+    /* The out-parameter is mandatory. */
+    if (chk_range == NULL)
+        return -1;
+
+    /* No range given: succeed and tell the caller not to filter by range. */
+    if (range_str == NULL) {
+        *chk_range = _gf_false;
+        return 0;
+    }
+
+    /* Three values are required: whence, then two offsets joined by '-'. */
+    fields = sscanf(range_str,
+                    "%hd,%" PRId64 "-"
+                    "%" PRId64,
+                    &ulock->l_whence, &ulock->l_start, &ulock->l_len);
+    if (fields != 3)
+        return -1;
+
+    *chk_range = _gf_true;
+    return 0;
+}
+
+int
+clrlk_parse_args(const char *cmd, clrlk_args *args)
+{
+    char *opts = NULL;
+    char *cur = NULL;
+    char *tok = NULL;
+    char *sptr = NULL;
+    char *free_ptr = NULL;
+    char kw[KW_MAX] = {
+        [KW_TYPE] = 't',
+        [KW_KIND] = 'k',
+    };
+    int ret = -1;
+    int i = 0;
+
+    GF_ASSERT(cmd);
+    /* opts receives only what follows "<prefix>." in cmd, so strlen(cmd)
+     * bytes always leave room for the terminating NUL */
+    free_ptr = opts = GF_CALLOC(1, strlen(cmd), gf_common_mt_char);
+    if (!opts)
+        goto out;
+
+    if (sscanf(cmd, GF_XATTR_CLRLK_CMD ".%s", opts) < 1)
+        goto out;
+
+    /*clr_lk_prefix.ttype.kkind.args, args - type specific*/
+    cur = opts;
+    for (i = 0; i < KW_MAX && (tok = strtok_r(cur, ".", &sptr));
+         cur = NULL, i++) {
+        if (tok[0] != kw[i])
+            goto out;
+        if (i == KW_TYPE)
+            args->type = clrlk_get_type(tok + 1);
+        if (i == KW_KIND)
+            args->kind = clrlk_get_kind(tok + 1);
+    }
+
+    /* every keyword is mandatory; a short command leaves args half-filled */
+    if ((i < KW_MAX) || (args->type == CLRLK_TYPE_MAX) ||
+        (args->kind == CLRLK_KIND_MAX))
+        goto out;
+
+    /*optional args, neither range nor basename can 'legally' contain
+     * "/" in them*/
+    tok = strtok_r(NULL, "/", &sptr);
+    if (tok && !(args->opts = gf_strdup(tok)))
+        goto out; /* never drop the filter silently: that widens the clear */
+
+    ret = 0;
+out:
+    GF_FREE(free_ptr);
+    return ret;
+}
+
+int
+clrlk_clear_posixlk(xlator_t *this, pl_inode_t *pl_inode, clrlk_args *args,
+                    int *blkd, int *granted, int *op_errno)
+{
+    posix_lock_t *plock = NULL;
+    posix_lock_t *tmp = NULL;
+    struct gf_flock ulock = {
+        0,
+    };
+    int ret = -1;
+    int bcount = 0; /* blocked locks removed, reported through *blkd */
+    int gcount = 0; /* granted locks removed, reported through *granted */
+    gf_boolean_t chk_range = _gf_false;
+    /* Remove the posix locks that match args->kind and the optional range. */
+    if (clrlk_get_lock_range(args->opts, &ulock, &chk_range)) {
+        *op_errno = EINVAL;
+        goto out;
+    }
+    /* ext_list is walked, and every match destroyed, under the inode mutex. */
+    pthread_mutex_lock(&pl_inode->mutex);
+    {
+        list_for_each_entry_safe(plock, tmp, &pl_inode->ext_list, list)
+        {
+            if ((plock->blocked && !(args->kind & CLRLK_BLOCKED)) ||
+                (!plock->blocked && !(args->kind & CLRLK_GRANTED)))
+                continue;
+            /* A given range must equal the lock's whence, start and len. */
+            if (chk_range && (plock->user_flock.l_whence != ulock.l_whence ||
+                              plock->user_flock.l_start != ulock.l_start ||
+                              plock->user_flock.l_len != ulock.l_len))
+                continue;
+
+            list_del_init(&plock->list);
+            if (plock->blocked) {
+                bcount++;
+                pl_trace_out(this, plock->frame, NULL, NULL, F_SETLKW,
+                             &plock->user_flock, -1, EINTR, NULL);
+                /* The pending lk request is failed with EINTR. */
+                STACK_UNWIND_STRICT(lk, plock->frame, -1, EINTR,
+                                    &plock->user_flock, NULL);
+
+            } else {
+                gcount++;
+            }
+            __destroy_lock(plock);
+        }
+    }
+    pthread_mutex_unlock(&pl_inode->mutex);
+    grant_blocked_locks(this, pl_inode);
+    ret = 0;
+out:
+    *blkd = bcount; /* both stay 0 when the range could not be parsed */
+    *granted = gcount;
+    return ret;
+}
+
+/* Clear the inodelks of one domain; returns 0 on success, -1 on failure */
+int
+clrlk_clear_inodelk(xlator_t *this, pl_inode_t *pl_inode, pl_dom_list_t *dom,
+                    clrlk_args *args, int *blkd, int *granted, int *op_errno)
+{
+    posix_locks_private_t *priv;
+    pl_inode_lock_t *ilock = NULL;
+    pl_inode_lock_t *tmp = NULL;
+    struct gf_flock ulock = {
+        0,
+    };
+    int ret = -1;
+    int bcount = 0;
+    int gcount = 0;
+    gf_boolean_t chk_range = _gf_false;
+    struct list_head *pcontend = NULL;
+    struct list_head released;
+    struct list_head contend;
+    struct timespec now = {};
+
+    INIT_LIST_HEAD(&released);
+    /* The contention list and timestamp are set up only when enabled. */
+    priv = this->private;
+    if (priv->notify_contention) {
+        pcontend = &contend;
+        INIT_LIST_HEAD(pcontend);
+        timespec_now(&now);
+    }
+
+    if (clrlk_get_lock_range(args->opts, &ulock, &chk_range)) {
+        *op_errno = EINVAL;
+        goto out;
+    }
+    /* A kind with neither bit set falls through to the blkd pass. */
+    if (args->kind & CLRLK_BLOCKED)
+        goto blkd;
+
+    if (args->kind & CLRLK_GRANTED)
+        goto granted;
+
+blkd:
+    pthread_mutex_lock(&pl_inode->mutex);
+    {
+        list_for_each_entry_safe(ilock, tmp, &dom->blocked_inodelks,
+                                 blocked_locks)
+        {
+            if (chk_range && (ilock->user_flock.l_whence != ulock.l_whence ||
+                              ilock->user_flock.l_start != ulock.l_start ||
+                              ilock->user_flock.l_len != ulock.l_len))
+                continue;
+
+            bcount++;
+            list_del_init(&ilock->client_list);
+            list_del_init(&ilock->blocked_locks);
+            list_add(&ilock->blocked_locks, &released);
+        }
+    }
+    pthread_mutex_unlock(&pl_inode->mutex);
+    /* Unwind the collected requests with EAGAIN once the mutex is dropped. */
+    if (!list_empty(&released)) {
+        list_for_each_entry_safe(ilock, tmp, &released, blocked_locks)
+        {
+            list_del_init(&ilock->blocked_locks);
+            pl_trace_out(this, ilock->frame, NULL, NULL, F_SETLKW,
+                         &ilock->user_flock, -1, EAGAIN, ilock->volume);
+            STACK_UNWIND_STRICT(inodelk, ilock->frame, -1, EAGAIN, NULL);
+            // No need to take lock as the locks are only in one list
+            __pl_inodelk_unref(ilock);
+        }
+    }
+
+    if (!(args->kind & CLRLK_GRANTED)) {
+        ret = 0;
+        goto out;
+    }
+
+granted:
+    pthread_mutex_lock(&pl_inode->mutex);
+    {
+        list_for_each_entry_safe(ilock, tmp, &dom->inodelk_list, list)
+        {
+            if (chk_range && (ilock->user_flock.l_whence != ulock.l_whence ||
+                              ilock->user_flock.l_start != ulock.l_start ||
+                              ilock->user_flock.l_len != ulock.l_len))
+                continue;
+
+            gcount++;
+            list_del_init(&ilock->client_list);
+            list_del_init(&ilock->list);
+            list_add(&ilock->list, &released);
+        }
+    }
+    pthread_mutex_unlock(&pl_inode->mutex);
+    /* released is reused here, this time linked through ilock->list. */
+    list_for_each_entry_safe(ilock, tmp, &released, list)
+    {
+        list_del_init(&ilock->list);
+        // No need to take lock as the locks are only in one list
+        __pl_inodelk_unref(ilock);
+    }
+    /* The out path runs the grant pass even when the range was rejected. */
+    ret = 0;
+out:
+    grant_blocked_inode_locks(this, pl_inode, dom, &now, pcontend);
+    if (pcontend != NULL) {
+        inodelk_contention_notify(this, pcontend);
+    }
+    *blkd = bcount;
+    *granted = gcount;
+    return ret;
+}
+
+/* Clear the entry locks of one domain that match args->kind and, if set,
+ * the basename in args->opts. Returns 0 on success and -1 on failure */
+int
+clrlk_clear_entrylk(xlator_t *this, pl_inode_t *pl_inode, pl_dom_list_t *dom,
+                    clrlk_args *args, int *blkd, int *granted, int *op_errno)
+{
+    posix_locks_private_t *priv;
+    pl_entry_lock_t *elock = NULL;
+    pl_entry_lock_t *tmp = NULL;
+    int bcount = 0;
+    int gcount = 0;
+    int ret = -1;
+    struct list_head *pcontend = NULL;
+    struct list_head released;
+    struct list_head contend;
+    struct timespec now = {0}; /* its address is passed on unconditionally */
+
+    INIT_LIST_HEAD(&released);
+
+    priv = this->private;
+    if (priv->notify_contention) {
+        pcontend = &contend;
+        INIT_LIST_HEAD(pcontend);
+        timespec_now(&now);
+    }
+
+    if (args->kind & CLRLK_BLOCKED)
+        goto blkd;
+
+    if (args->kind & CLRLK_GRANTED)
+        goto granted;
+
+blkd:
+    pthread_mutex_lock(&pl_inode->mutex);
+    {
+        list_for_each_entry_safe(elock, tmp, &dom->blocked_entrylks,
+                                 blocked_locks)
+        {
+            if (args->opts) {
+                if (!elock->basename || strcmp(elock->basename, args->opts))
+                    continue;
+            }
+
+            bcount++;
+
+            list_del_init(&elock->client_list);
+            list_del_init(&elock->blocked_locks);
+            list_add_tail(&elock->blocked_locks, &released);
+        }
+    }
+    pthread_mutex_unlock(&pl_inode->mutex);
+
+    if (!list_empty(&released)) {
+        list_for_each_entry_safe(elock, tmp, &released, blocked_locks)
+        {
+            list_del_init(&elock->blocked_locks);
+            entrylk_trace_out(this, elock->frame, elock->volume, NULL, NULL,
+                              elock->basename, ENTRYLK_LOCK, elock->type, -1,
+                              EAGAIN);
+            STACK_UNWIND_STRICT(entrylk, elock->frame, -1, EAGAIN, NULL);
+
+            __pl_entrylk_unref(elock);
+        }
+    }
+
+    if (!(args->kind & CLRLK_GRANTED)) {
+        ret = 0;
+        goto out;
+    }
+
+granted:
+    /* Granted locks are unlinked and unreferenced under the inode mutex. */
+    pthread_mutex_lock(&pl_inode->mutex);
+    {
+        list_for_each_entry_safe(elock, tmp, &dom->entrylk_list, domain_list)
+        {
+            if (args->opts) {
+                if (!elock->basename || strcmp(elock->basename, args->opts))
+                    continue;
+            }
+
+            gcount++;
+            list_del_init(&elock->client_list);
+            list_del_init(&elock->domain_list);
+            /* Not parked on any list: the unref below may free elock, and
+             * a freed node left on a list is written to by the next add. */
+            __pl_entrylk_unref(elock);
+        }
+    }
+    pthread_mutex_unlock(&pl_inode->mutex);
+
+    grant_blocked_entry_locks(this, pl_inode, dom, &now, pcontend);
+    if (pcontend != NULL) {
+        entrylk_contention_notify(this, pcontend);
+    }
+
+    ret = 0;
+out:
+    *blkd = bcount;
+    *granted = gcount;
+    return ret;
+}
+
+int
+clrlk_clear_lks_in_all_domains(xlator_t *this, pl_inode_t *pl_inode,
+                               clrlk_args *args, int *blkd, int *granted,
+                               int *op_errno)
+{
+    pl_dom_list_t *domain = NULL;
+    int status = 0;
+    int dom_blocked = 0;
+    int dom_granted = 0;
+
+    /*
+     * Run the type-specific clear on every lock domain of the inode and
+     * add the per-domain counts to *blkd and *granted. The first failing
+     * domain ends the walk; its own counts are not added. An inode with
+     * no domains, or a type other than inode/entry, yields 0.
+     */
+    list_for_each_entry(domain, &pl_inode->dom_list, inode_list)
+    {
+        dom_blocked = 0;
+        dom_granted = 0;
+
+        if (args->type == CLRLK_INODE) {
+            status = clrlk_clear_inodelk(this, pl_inode, domain, args,
+                                         &dom_blocked, &dom_granted,
+                                         op_errno);
+        } else if (args->type == CLRLK_ENTRY) {
+            status = clrlk_clear_entrylk(this, pl_inode, domain, args,
+                                         &dom_blocked, &dom_granted,
+                                         op_errno);
+        } else {
+            status = 0;
+        }
+
+        if (status)
+            return status;
+
+        *blkd += dom_blocked;
+        *granted += dom_granted;
+    }
+
+    return 0;
+}
diff --git a/xlators/features/locks/src/clear.h b/xlators/features/locks/src/clear.h
new file mode 100644
index 00000000000..bc118cb1b81
--- /dev/null
+++ b/xlators/features/locks/src/clear.h
@@ -0,0 +1,73 @@
+/*
+   Copyright (c) 2006-2012 Red Hat, Inc. <http://www.redhat.com>
+   This file is part of GlusterFS.
+
+   This file is licensed to you under your choice of the GNU Lesser
+   General Public License, version 3 or any later version (LGPLv3 or
+   later), or the GNU General Public License, version 2 (GPLv2), in all
+   cases as published by the Free Software Foundation.
+*/
+#ifndef __CLEAR_H__
+#define __CLEAR_H__
+
+#include <glusterfs/compat-errno.h>
+#include <glusterfs/stack.h>
+#include <glusterfs/call-stub.h>
+#include "locks.h"
+
+typedef enum {
+    CLRLK_INODE, /* "inode" */
+    CLRLK_ENTRY, /* "entry" */
+    CLRLK_POSIX, /* "posix" */
+    CLRLK_TYPE_MAX /* table size; clrlk_get_type() result for unknown names */
+} clrlk_type;
+
+extern const char *clrlk_type_names[];
+
+typedef enum {
+    CLRLK_BLOCKED = 1, /* tested with '&' by the clrlk_clear_*() functions */
+    CLRLK_GRANTED,     /* 2 */
+    CLRLK_ALL,         /* 3 == CLRLK_BLOCKED | CLRLK_GRANTED */
+    CLRLK_KIND_MAX     /* clrlk_get_kind() result for unknown names */
+} clrlk_kind;
+
+typedef enum {
+    KW_TYPE, /* first keyword of the command, prefixed with 't' */
+    KW_KIND, /* second keyword of the command, prefixed with 'k' */
+    /*add new keywords here*/
+    KW_MAX
+} clrlk_opts;
+
+struct _clrlk_args;
+typedef struct _clrlk_args clrlk_args;
+
+struct _clrlk_args {
+    int type;   /* clrlk_type value, stored by clrlk_parse_args() */
+    int kind;   /* clrlk_kind value, stored by clrlk_parse_args() */
+    char *opts; /* gf_strdup() of the optional trailing argument, if any */
+};
+
+int
+clrlk_get_kind(char *kind); /* name -> clrlk_kind or CLRLK_KIND_MAX */
+int
+clrlk_get_type(char *type);
+int
+clrlk_get_lock_range(char *range_str, struct gf_flock *ulock,
+                     gf_boolean_t *chk_range);
+int
+clrlk_parse_args(const char *cmd, clrlk_args *args);
+
+int
+clrlk_clear_posixlk(xlator_t *this, pl_inode_t *pl_inode, clrlk_args *args,
+                    int *blkd, int *granted, int *op_errno);
+int
+clrlk_clear_inodelk(xlator_t *this, pl_inode_t *pl_inode, pl_dom_list_t *dom,
+                    clrlk_args *args, int *blkd, int *granted, int *op_errno);
+int
+clrlk_clear_entrylk(xlator_t *this, pl_inode_t *pl_inode, pl_dom_list_t *dom,
+                    clrlk_args *args, int *blkd, int *granted, int *op_errno);
+int
+clrlk_clear_lks_in_all_domains(xlator_t *this, pl_inode_t *pl_inode,
+                               clrlk_args *args, int *blkd, int *granted,
+                               int *op_errno);
+#endif /* __CLEAR_H__ */
diff --git a/xlators/features/locks/src/common.c b/xlators/features/locks/src/common.c
index d87aec229ed..a2c6be93e03 100644
--- a/xlators/features/locks/src/common.c
+++ b/xlators/features/locks/src/common.c
@@ -1,559 +1,1591 @@
 /*
-  Copyright (c) 2006, 2007, 2008 Z RESEARCH, Inc. <http://www.zresearch.com>
-  This file is part of GlusterFS.
-
-  GlusterFS is free software; you can redistribute it and/or modify
-  it under the terms of the GNU General Public License as published
-  by the Free Software Foundation; either version 3 of the License,
-  or (at your option) any later version.
-
-  GlusterFS is distributed in the hope that it will be useful, but
-  WITHOUT ANY WARRANTY; without even the implied warranty of
-  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
-  General Public License for more details.
-
-  You should have received a copy of the GNU General Public License
-  along with this program.  If not, see
-  <http://www.gnu.org/licenses/>.
-*/
+   Copyright (c) 2006-2012, 2015-2016 Red Hat, Inc. <http://www.redhat.com>
+   This file is part of GlusterFS.
 
+   This file is licensed to you under your choice of the GNU Lesser
+   General Public License, version 3 or any later version (LGPLv3 or
+   later), or the GNU General Public License, version 2 (GPLv2), in all
+   cases as published by the Free Software Foundation.
+*/
 #include <unistd.h>
 #include <fcntl.h>
 #include <limits.h>
 #include <pthread.h>
 
-#ifndef _CONFIG_H
-#define _CONFIG_H
-#include "config.h"
-#endif
-
-#include "glusterfs.h"
-#include "compat.h"
-#include "xlator.h"
-#include "inode.h"
-#include "logging.h"
-#include "common-utils.h"
+#include <glusterfs/glusterfs.h>
+#include <glusterfs/compat.h>
+#include <glusterfs/logging.h>
+#include <glusterfs/syncop.h>
 
 #include "locks.h"
-
+#include "common.h"
 
 static int
-__is_lock_grantable (pl_inode_t *pl_inode, posix_lock_t *lock,
-		     gf_lk_domain_t dom);
+__is_lock_grantable(pl_inode_t *pl_inode, posix_lock_t *lock);
 static void
-__insert_and_merge (pl_inode_t *pl_inode, posix_lock_t *lock,
-		    gf_lk_domain_t dom);
+__insert_and_merge(pl_inode_t *pl_inode, posix_lock_t *lock);
+static int
+pl_send_prelock_unlock(xlator_t *this, pl_inode_t *pl_inode,
+                       posix_lock_t *old_lock);
 
-#define DOMAIN_HEAD(pl_inode, dom) (dom == GF_LOCK_POSIX	\
-				    ? &pl_inode->ext_list	\
-				    : &pl_inode->int_list)
+static pl_dom_list_t * /* new domain owning a copy of volume, or NULL */
+__allocate_domain(const char *volume)
+{
+    pl_dom_list_t *dom = NULL;
 
-pl_inode_t *
-pl_inode_get (xlator_t *this, inode_t *inode)
+    dom = GF_CALLOC(1, sizeof(*dom), gf_locks_mt_pl_dom_list_t);
+    if (!dom)
+        goto out;
+
+    dom->domain = gf_strdup(volume);
+    if (!dom->domain)
+        goto out; /* the out: path frees the half-built dom */
+
+    gf_log("posix-locks", GF_LOG_TRACE, "New domain allocated: %s",
+           dom->domain);
+
+    INIT_LIST_HEAD(&dom->inode_list);
+    INIT_LIST_HEAD(&dom->entrylk_list);
+    INIT_LIST_HEAD(&dom->blocked_entrylks);
+    INIT_LIST_HEAD(&dom->inodelk_list);
+    INIT_LIST_HEAD(&dom->blocked_inodelks);
+
+out:
+    if (dom && (NULL == dom->domain)) { /* only true when gf_strdup failed */
+        GF_FREE(dom);
+        dom = NULL;
+    }
+
+    return dom;
+}
+
+/* Returns the domain named volume on pl_inode; if none exists yet, one is
+ * allocated and linked into dom_list. Lookup and insertion both happen under
+ * pl_inode->mutex. Returns NULL when validation or allocation fails. */
+pl_dom_list_t *
+get_domain(pl_inode_t *pl_inode, const char *volume)
+{
+    pl_dom_list_t *dom = NULL;
+
+    GF_VALIDATE_OR_GOTO("posix-locks", pl_inode, out);
+    GF_VALIDATE_OR_GOTO("posix-locks", volume, out);
+
+    pthread_mutex_lock(&pl_inode->mutex);
+    {
+        list_for_each_entry(dom, &pl_inode->dom_list, inode_list)
+        {
+            if (strcmp(dom->domain, volume) == 0)
+                goto unlock; /* existing domain: dom already points at it */
+        }
+
+        dom = __allocate_domain(volume); /* no match: dom is overwritten here */
+        if (dom)
+            list_add(&dom->inode_list, &pl_inode->dom_list);
+    }
+unlock:
+    pthread_mutex_unlock(&pl_inode->mutex);
+    if (dom) { /* also true for a domain that was just allocated */
+        gf_log("posix-locks", GF_LOG_TRACE, "Domain %s found", volume);
+    } else {
+        gf_log("posix-locks", GF_LOG_TRACE, "Domain %s not found", volume);
+    }
+out:
+    return dom;
+}
+
+unsigned long
+fd_to_fdnum(fd_t *fd)
 {
-	pl_inode_t *pl_inode = NULL;
-	mode_t      st_mode = 0;
-	int         ret = 0;
+    return ((unsigned long)fd); /* the fd pointer value itself is the number */
+}
 
-	ret = inode_ctx_get (inode, this,
-                             (uint64_t *)(&pl_inode));
-	if (ret == 0)
-		goto out;
+fd_t *
+fd_from_fdnum(posix_lock_t *lock)
+{
+    return ((fd_t *)lock->fd_num); /* casts the stored number back to a pointer */
+}
 
-	pl_inode = CALLOC (1, sizeof (*pl_inode));
-	if (!pl_inode) {
-		gf_log (this->name, GF_LOG_ERROR,
-			"out of memory :(");
-		goto out;
-	}
+int
+__pl_inode_is_empty(pl_inode_t *pl_inode)
+{
+    return (list_empty(&pl_inode->ext_list)); /* only ext_list is consulted */
+}
 
-	st_mode  = inode->st_mode;
-	if ((st_mode & S_ISGID) && !(st_mode & S_IXGRP))
-		pl_inode->mandatory = 1;
+void
+pl_print_locker(char *str, int size, xlator_t *this, call_frame_t *frame)
+{ /* formats the caller identity from frame->root into str; 'this' is unused */
+    snprintf(str, size, "Pid=%llu, lk-owner=%s, Client=%p, Frame=%llu",
+             (unsigned long long)frame->root->pid,
+             lkowner_utoa(&frame->root->lk_owner), frame->root->client,
+             (unsigned long long)frame->root->unique);
+}
 
+void
+pl_print_lockee(char *str, int size, fd_t *fd, loc_t *loc)
+{ /* formats gfid, fd pointer and path of the target inode into str */
+    inode_t *inode = NULL;
+    char *ipath = NULL;
+    int ret = 0;
+
+    if (fd)
+        inode = fd->inode;
+    if (loc)
+        inode = loc->inode; /* loc, when given, overrides the fd's inode */
+
+    if (!inode) {
+        snprintf(str, size, "<nul>"); /* neither fd nor loc supplied an inode */
+        return;
+    }
+
+    if (loc && loc->path) {
+        ipath = gf_strdup(loc->path); /* copied so both branches free ipath */
+    } else {
+        ret = inode_path(inode, NULL, &ipath);
+        if (ret <= 0)
+            ipath = NULL; /* NOTE(review): confirm inode_path leaves nothing to free here */
+    }
+
+    snprintf(str, size, "gfid=%s, fd=%p, path=%s", uuid_utoa(inode->gfid), fd,
+             ipath ? ipath : "<nul>");
+
+    GF_FREE(ipath);
+}
 
-	pthread_mutex_init (&pl_inode->mutex, NULL);
+void
+pl_print_lock(char *str, int size, int cmd, struct gf_flock *flock,
+              gf_lkowner_t *owner)
+{ /* renders an fcntl-style lock request (cmd, type, range, owner) into str */
+    char *cmd_str = NULL;
+    char *type_str = NULL;
 
-	INIT_LIST_HEAD (&pl_inode->dir_list);
-	INIT_LIST_HEAD (&pl_inode->ext_list);
-	INIT_LIST_HEAD (&pl_inode->int_list);
-	INIT_LIST_HEAD (&pl_inode->rw_list);
+    switch (cmd) {
+#if F_GETLK != F_GETLK64 /* the 64-bit label is only added when it is a distinct value */
+        case F_GETLK64:
+#endif
+        case F_GETLK:
+            cmd_str = "GETLK";
+            break;
 
-	ret = inode_ctx_put (inode, this, (uint64_t)(long)(pl_inode));
+#if F_SETLK != F_SETLK64
+        case F_SETLK64:
+#endif
+        case F_SETLK:
+            cmd_str = "SETLK";
+            break;
 
-out:
-	return pl_inode;
+#if F_SETLKW != F_SETLKW64
+        case F_SETLKW64:
+#endif
+        case F_SETLKW:
+            cmd_str = "SETLKW";
+            break;
+
+        default:
+            cmd_str = "UNKNOWN";
+            break;
+    }
+
+    switch (flock->l_type) {
+        case F_RDLCK:
+            type_str = "READ";
+            break;
+        case F_WRLCK:
+            type_str = "WRITE";
+            break;
+        case F_UNLCK:
+            type_str = "UNLOCK";
+            break;
+        default:
+            type_str = "UNKNOWN";
+            break;
+    }
+
+    snprintf(str, size,
+             "lock=FCNTL, cmd=%s, type=%s, "
+             "start=%llu, len=%llu, pid=%llu, lk-owner=%s",
+             cmd_str, type_str, (unsigned long long)flock->l_start,
+             (unsigned long long)flock->l_len, (unsigned long long)flock->l_pid,
+             lkowner_utoa(owner));
 }
 
+void
+pl_trace_in(xlator_t *this, call_frame_t *frame, fd_t *fd, loc_t *loc, int cmd,
+            struct gf_flock *flock, const char *domain)
+{ /* logs an incoming lock request at INFO level when priv->trace is set */
+    posix_locks_private_t *priv = this->private;
+    char pl_locker[256];
+    char pl_lockee[256];
+    char pl_lock[256];
+
+    if (!priv->trace)
+        return; /* tracing disabled: do no formatting work */
+
+    pl_print_locker(pl_locker, 256, this, frame);
+    pl_print_lockee(pl_lockee, 256, fd, loc);
+    if (domain) /* a non-NULL domain selects the inodelk formatter */
+        pl_print_inodelk(pl_lock, 256, cmd, flock, domain);
+    else
+        pl_print_lock(pl_lock, 256, cmd, flock, &frame->root->lk_owner);
+
+    gf_log(this->name, GF_LOG_INFO,
+           "[REQUEST] Locker = {%s} Lockee = {%s} Lock = {%s}", pl_locker,
+           pl_lockee, pl_lock);
+}
 
-/* Create a new posix_lock_t */
-posix_lock_t *
-new_posix_lock (struct flock *flock, transport_t *transport, pid_t client_pid)
+void
+pl_print_verdict(char *str, int size, int op_ret, int op_errno)
 {
-	posix_lock_t *lock = NULL;
+    char *verdict = NULL; /* points at a literal or at strerror()'s buffer */
+
+    if (op_ret == 0) {
+        verdict = "GRANTED";
+    } else {
+        switch (op_errno) {
+            case EAGAIN:
+                verdict = "TRYAGAIN";
+                break;
+            default:
+                verdict = strerror(op_errno); /* NOTE(review): strerror() need not be thread-safe */
+        }
+    }
+
+    snprintf(str, size, "%s", verdict); /* copied (and truncated to size) into str */
+}
+
+void
+pl_trace_out(xlator_t *this, call_frame_t *frame, fd_t *fd, loc_t *loc, int cmd,
+             struct gf_flock *flock, int op_ret, int op_errno,
+             const char *domain)
 
-	lock = CALLOC (1, sizeof (posix_lock_t));
-	if (!lock) {
-		return NULL;
-	}
+{ /* logs the outcome of a lock request at INFO level when priv->trace is set */
+    posix_locks_private_t *priv = NULL;
+    char pl_locker[256];
+    char pl_lockee[256];
+    char pl_lock[256];
+    char verdict[32]; /* longer strerror() texts are truncated to fit */
 
-	lock->fl_start = flock->l_start;
-	lock->fl_type  = flock->l_type;
+    priv = this->private;
 
-	if (flock->l_len == 0)
-		lock->fl_end = LLONG_MAX;
-	else
-		lock->fl_end = flock->l_start + flock->l_len - 1;
+    if (!priv->trace)
+        return;
 
-	lock->transport  = transport;
-	lock->client_pid = client_pid;
+    pl_print_locker(pl_locker, 256, this, frame);
+    pl_print_lockee(pl_lockee, 256, fd, loc);
+    if (domain) /* a non-NULL domain selects the inodelk formatter */
+        pl_print_inodelk(pl_lock, 256, cmd, flock, domain);
+    else
+        pl_print_lock(pl_lock, 256, cmd, flock, &frame->root->lk_owner);
 
-	INIT_LIST_HEAD (&lock->list);
+    pl_print_verdict(verdict, 32, op_ret, op_errno);
 
-	return lock;
+    gf_log(this->name, GF_LOG_INFO,
+           "[%s] Locker = {%s} Lockee = {%s} Lock = {%s}", verdict, pl_locker,
+           pl_lockee, pl_lock);
 }
 
+void
+pl_trace_block(xlator_t *this, call_frame_t *frame, fd_t *fd, loc_t *loc,
+               int cmd, struct gf_flock *flock, const char *domain)
+
+{ /* same as pl_trace_in() except that the log line is tagged [BLOCKED] */
+    posix_locks_private_t *priv = this->private;
+    char pl_locker[256];
+    char pl_lockee[256];
+    char pl_lock[256];
+
+    if (!priv->trace)
+        return;
+
+    pl_print_locker(pl_locker, 256, this, frame);
+    pl_print_lockee(pl_lockee, 256, fd, loc);
+    if (domain) /* a non-NULL domain selects the inodelk formatter */
+        pl_print_inodelk(pl_lock, 256, cmd, flock, domain);
+    else
+        pl_print_lock(pl_lock, 256, cmd, flock, &frame->root->lk_owner);
+
+    gf_log(this->name, GF_LOG_INFO,
+           "[BLOCKED] Locker = {%s} Lockee = {%s} Lock = {%s}", pl_locker,
+           pl_lockee, pl_lock);
+}
 
-/* Delete a lock from the inode's lock list */
 void
-__delete_lock (pl_inode_t *pl_inode, posix_lock_t *lock)
+pl_trace_flush(xlator_t *this, call_frame_t *frame, fd_t *fd)
 {
-	list_del_init (&lock->list);
+    posix_locks_private_t *priv = NULL;
+    char pl_locker[256];
+    char pl_lockee[256];
+    pl_inode_t *pl_inode = NULL;
+
+    priv = this->private;
+
+    if (!priv->trace)
+        return;
+
+    pl_inode = pl_inode_get(this, fd->inode, NULL); /* may create the pl_inode */
+
+    if (pl_inode && __pl_inode_is_empty(pl_inode)) /* NOTE(review): checked without pl_inode->mutex */
+        return; /* nothing on ext_list: skip the [FLUSH] line */
+
+    pl_print_locker(pl_locker, 256, this, frame);
+    pl_print_lockee(pl_lockee, 256, fd, NULL);
+
+    gf_log(this->name, GF_LOG_INFO, "[FLUSH] Locker = {%s} Lockee = {%s}",
+           pl_locker, pl_lockee);
 }
 
+void
+pl_trace_release(xlator_t *this, fd_t *fd)
+{ /* logs a [RELEASE] line for fd at INFO level when priv->trace is set */
+    posix_locks_private_t *priv = NULL;
+    char pl_lockee[256];
+
+    priv = this->private;
+
+    if (!priv->trace)
+        return;
+
+    pl_print_lockee(pl_lockee, 256, fd, NULL);
+
+    gf_log(this->name, GF_LOG_INFO, "[RELEASE] Lockee = {%s}", pl_lockee);
+}
 
-/* Destroy a posix_lock */
 void
-__destroy_lock (posix_lock_t *lock)
+pl_update_refkeeper(xlator_t *this, inode_t *inode)
+{ /* holds one inode ref while ext_list is non-empty and drops it once empty */
+    pl_inode_t *pl_inode = NULL;
+    int is_empty = 0;
+    int need_unref = 0;
+    int need_ref = 0;
+
+    pl_inode = pl_inode_get(this, inode, NULL);
+    if (!pl_inode)
+        return;
+
+    pthread_mutex_lock(&pl_inode->mutex);
+    {
+        is_empty = __pl_inode_is_empty(pl_inode);
+
+        if (is_empty && pl_inode->refkeeper) { /* last lock gone: release the ref */
+            need_unref = 1;
+            pl_inode->refkeeper = NULL;
+        }
+
+        if (!is_empty && !pl_inode->refkeeper) { /* first lock: start holding a ref */
+            need_ref = 1;
+            pl_inode->refkeeper = inode;
+        }
+    }
+    pthread_mutex_unlock(&pl_inode->mutex);
+
+    if (need_unref) /* the ref/unref calls are made after the mutex is released */
+        inode_unref(inode);
+
+    if (need_ref)
+        inode_ref(inode);
+}
+
+/* Get lock enforcement info from disk: query GF_ENFORCE_MANDATORY_LOCK via
+ * local->fd (or local->loc[0]) and cache the verdict in pl_inode under its
+ * mutex. Returns -1 if local is NULL, else 0 even if the getxattr failed. */
+int
+pl_fetch_mlock_info_from_disk(xlator_t *this, pl_inode_t *pl_inode,
+                              pl_local_t *local)
 {
-	free (lock);
+    dict_t *xdata_rsp = NULL;
+    int op_ret = 0;
+
+    if (!local)
+        return -1;
+
+    if (local->fd) {
+        op_ret = syncop_fgetxattr(this, local->fd, &xdata_rsp,
+                                  GF_ENFORCE_MANDATORY_LOCK, NULL, NULL);
+    } else {
+        op_ret = syncop_getxattr(this, &local->loc[0], &xdata_rsp,
+                                 GF_ENFORCE_MANDATORY_LOCK, NULL, NULL);
+    }
+
+    pthread_mutex_lock(&pl_inode->mutex);
+    {
+        if (op_ret >= 0) {
+            pl_inode->mlock_enforced = _gf_true;
+            pl_inode->check_mlock_info = _gf_false;
+        } else {
+            gf_msg(this->name, GF_LOG_WARNING, -op_ret, 0,
+                   "getxattr failed with %d", op_ret);
+            pl_inode->mlock_enforced = _gf_false;
+            /* ENODATA is conclusive; other errors leave the re-check flag set */
+            pl_inode->check_mlock_info = (-op_ret == ENODATA) ? _gf_false
+                                                              : _gf_true;
+        }
+    }
+    pthread_mutex_unlock(&pl_inode->mutex);
+
+    /* only the xattr's presence matters; release the reply dict (was leaked) */
+    if (xdata_rsp)
+        dict_unref(xdata_rsp);
+    return 0;
 }
 
+/* Return this xlator's pl_inode_t for inode, creating and storing it in the
+ * inode ctx on first use. NULL if allocation or __inode_ctx_put() fails. */
+pl_inode_t *
+pl_inode_get(xlator_t *this, inode_t *inode, pl_local_t *local)
+{
+    uint64_t tmp_pl_inode = 0;
+    pl_inode_t *pl_inode = NULL;
+    int ret = 0;
+
+    LOCK(&inode->lock);
+    {
+        ret = __inode_ctx_get(inode, this, &tmp_pl_inode);
+        if (ret == 0) {
+            pl_inode = (pl_inode_t *)(long)tmp_pl_inode;
+            goto unlock;
+        }
+
+        pl_inode = GF_CALLOC(1, sizeof(*pl_inode), gf_locks_mt_pl_inode_t);
+        if (!pl_inode)
+            goto unlock;
+        gf_log(this->name, GF_LOG_TRACE, "Allocating new pl inode");
+
+        pthread_mutex_init(&pl_inode->mutex, NULL);
+        pthread_cond_init(&pl_inode->check_fop_wind_count, 0);
+
+        INIT_LIST_HEAD(&pl_inode->dom_list);
+        INIT_LIST_HEAD(&pl_inode->ext_list);
+        INIT_LIST_HEAD(&pl_inode->rw_list);
+        INIT_LIST_HEAD(&pl_inode->reservelk_list);
+        INIT_LIST_HEAD(&pl_inode->blocked_reservelks);
+        INIT_LIST_HEAD(&pl_inode->blocked_calls);
+        INIT_LIST_HEAD(&pl_inode->metalk_list);
+        INIT_LIST_HEAD(&pl_inode->queued_locks);
+        INIT_LIST_HEAD(&pl_inode->waiting);
+        gf_uuid_copy(pl_inode->gfid, inode->gfid);
+
+        pl_inode->check_mlock_info = _gf_true;
+        pl_inode->mlock_enforced = _gf_false;
+
+        /* -2 means never looked up. -1 means something went wrong and link
+         * tracking is disabled. */
+        pl_inode->links = -2;
+
+        ret = __inode_ctx_put(inode, this, (uint64_t)(long)(pl_inode));
+        if (ret) {
+            /* undo both pthread_*_init() calls above before freeing */
+            pthread_cond_destroy(&pl_inode->check_fop_wind_count);
+            pthread_mutex_destroy(&pl_inode->mutex);
+            GF_FREE(pl_inode);
+            pl_inode = NULL;
+        }
+    }
+unlock:
+    UNLOCK(&inode->lock);
+
+    if ((pl_inode != NULL) && pl_is_mandatory_locking_enabled(pl_inode) &&
+        pl_inode->check_mlock_info && local) {
+        /* Note: The lock enforcement information per file can be stored in the
+           attribute flag of stat(x) in posix. With that there won't be a need
+           for doing getxattr post a reboot */
+        pl_fetch_mlock_info_from_disk(this, pl_inode, local);
+    }
+
+    return pl_inode;
+}
 
-/* Convert a posix_lock to a struct flock */
+/* Create a new posix_lock_t from flock; NULL on failure (see *op_errno notes) */
+posix_lock_t *
+new_posix_lock(struct gf_flock *flock, client_t *client, pid_t client_pid,
+               gf_lkowner_t *owner, fd_t *fd, uint32_t lk_flags, int blocking,
+               int32_t *op_errno)
+{
+    posix_lock_t *lock = NULL;
+
+    GF_VALIDATE_OR_GOTO("posix-locks", flock, out); /* this function does not set *op_errno on these three paths */
+    GF_VALIDATE_OR_GOTO("posix-locks", client, out);
+    GF_VALIDATE_OR_GOTO("posix-locks", fd, out);
+
+    if (!pl_is_lk_owner_valid(owner, client)) { /* NOTE(review): owner is not NULL-checked here; confirm the helper does */
+        *op_errno = EINVAL;
+        goto out;
+    }
+
+    lock = GF_CALLOC(1, sizeof(posix_lock_t), gf_locks_mt_posix_lock_t);
+    if (!lock) {
+        *op_errno = ENOMEM;
+        goto out;
+    }
+
+    lock->fl_start = flock->l_start;
+    lock->fl_type = flock->l_type;
+
+    if (flock->l_len == 0) /* zero length is stored as "up to LLONG_MAX" */
+        lock->fl_end = LLONG_MAX;
+    else
+        lock->fl_end = flock->l_start + flock->l_len - 1; /* NOTE(review): no overflow or negative-length check */
+
+    lock->client = client;
+
+    lock->client_uid = gf_strdup(client->client_uid); /* private copy, freed in __destroy_lock() */
+    if (lock->client_uid == NULL) {
+        GF_FREE(lock);
+        lock = NULL;
+        *op_errno = ENOMEM;
+        goto out;
+    }
+
+    lock->fd_num = fd_to_fdnum(fd);
+    lock->fd = fd;
+    lock->client_pid = client_pid;
+    lock->owner = *owner;
+    lock->lk_flags = lk_flags;
+
+    lock->blocking = blocking;
+    memcpy(&lock->user_flock, flock, sizeof(lock->user_flock)); /* keeps the request exactly as received */
+
+    INIT_LIST_HEAD(&lock->list);
+
+out:
+    return lock;
+}
+
+/* Delete a lock from the inode's lock list (unlink only; nothing is freed) */
 void
-posix_lock_to_flock (posix_lock_t *lock, struct flock *flock)
+__delete_lock(posix_lock_t *lock)
 {
-	flock->l_pid   = lock->client_pid;
-	flock->l_type  = lock->fl_type;
-	flock->l_start = lock->fl_start;
+    list_del_init(&lock->list);
+}
 
-	if (lock->fl_end == 0)
-		flock->l_len = LLONG_MAX;
-	else
-		flock->l_len = lock->fl_end - lock->fl_start + 1;
+/* Destroy a posix_lock: frees client_uid and the lock; does not unlink it */
+void
+__destroy_lock(posix_lock_t *lock)
+{
+    GF_FREE(lock->client_uid);
+    GF_FREE(lock);
+}
 
+static posix_lock_t * /* copy of src with its own client_uid string, or NULL */
+__copy_lock(posix_lock_t *src)
+{
+    posix_lock_t *dst;
+
+    dst = GF_MALLOC(sizeof(posix_lock_t), gf_locks_mt_posix_lock_t);
+    if (dst != NULL) {
+        memcpy(dst, src, sizeof(posix_lock_t)); /* bitwise copy, pointers included */
+        dst->client_uid = gf_strdup(src->client_uid); /* replace the shared pointer */
+        if (dst->client_uid == NULL) {
+            GF_FREE(dst);
+            dst = NULL;
+        }
+
+        if (dst != NULL)
+            INIT_LIST_HEAD(&dst->list); /* the copy must not alias src's list links */
+    }
+
+    return dst;
+}
+
+/* Convert a posix_lock to a struct gf_flock (inverse of the range mapping in new_posix_lock) */
+void
+posix_lock_to_flock(posix_lock_t *lock, struct gf_flock *flock)
+{
+    flock->l_pid = lock->user_flock.l_pid; /* pid comes from the saved request, not client_pid */
+    flock->l_type = lock->fl_type;
+    flock->l_start = lock->fl_start;
+    flock->l_owner = lock->owner;
+
+    if (lock->fl_end == LLONG_MAX) /* an end of LLONG_MAX is reported as length 0 */
+        flock->l_len = 0;
+    else
+        flock->l_len = lock->fl_end - lock->fl_start + 1; /* the range is inclusive */
+}
 
 /* Insert the lock into the inode's lock list */
 static void
-__insert_lock (pl_inode_t *pl_inode, posix_lock_t *lock, gf_lk_domain_t dom)
+__insert_lock(pl_inode_t *pl_inode, posix_lock_t *lock)
 {
-	list_add_tail (&lock->list, DOMAIN_HEAD (pl_inode, dom));
+    if (lock->blocked) /* stamp whichever timestamp matches the lock's state */
+        lock->blkd_time = gf_time();
+    else
+        lock->granted_time = gf_time();
 
-	return;
+    list_add_tail(&lock->list, &pl_inode->ext_list); /* appended at the tail */
 }
 
-
 /* Return true if the locks overlap, false otherwise */
 int
-locks_overlap (posix_lock_t *l1, posix_lock_t *l2)
+locks_overlap(posix_lock_t *l1, posix_lock_t *l2)
 {
-	/* 
-	   Note:
-	   FUSE always gives us absolute offsets, so no need to worry 
-	   about SEEK_CUR or SEEK_END
-	*/
+    /*
+       Note:
+       FUSE always gives us absolute offsets, so no need to worry about
+       SEEK_CUR or SEEK_END. Both ends are inclusive, hence the >= tests.
+    */
 
-	return ((l1->fl_end >= l2->fl_start) &&
-		(l2->fl_end >= l1->fl_start));
+    return ((l1->fl_end >= l2->fl_start) && (l2->fl_end >= l1->fl_start));
 }
 
-
 /* Return true if the locks have the same owner */
 int
-same_owner (posix_lock_t *l1, posix_lock_t *l2)
+same_owner(posix_lock_t *l1, posix_lock_t *l2)
 {
-	return ((l1->client_pid == l2->client_pid) &&
-		(l1->transport  == l2->transport));
+    return (is_same_lkowner(&l1->owner, &l2->owner) && /* both the lk-owner ... */
+            (l1->client == l2->client)); /* ... and the client pointer must match */
 }
 
-
 /* Delete all F_UNLCK locks */
 void
-__delete_unlck_locks (pl_inode_t *pl_inode, gf_lk_domain_t dom)
+__delete_unlck_locks(pl_inode_t *pl_inode)
 {
-	posix_lock_t *l = NULL;
-	posix_lock_t *tmp = NULL;
-
-	list_for_each_entry_safe (l, tmp, DOMAIN_HEAD (pl_inode, dom), list) {
-		if (l->fl_type == F_UNLCK) {
-			__delete_lock (pl_inode, l);
-			__destroy_lock (l);
-		}
-	}
+    posix_lock_t *l = NULL;
+    posix_lock_t *tmp = NULL;
+
+    list_for_each_entry_safe(l, tmp, &pl_inode->ext_list, list) /* _safe: entries are removed while iterating */
+    {
+        if (l->fl_type == F_UNLCK) {
+            __delete_lock(l); /* unlink from ext_list ... */
+            __destroy_lock(l); /* ... then free it */
+        }
+    }
 }
 
-
 /* Add two locks */
 static posix_lock_t *
-add_locks (posix_lock_t *l1, posix_lock_t *l2)
+add_locks(posix_lock_t *l1, posix_lock_t *l2, posix_lock_t *dst)
 {
-	posix_lock_t *sum = NULL;
+    posix_lock_t *sum = NULL;
+
+    sum = __copy_lock(dst); /* dst supplies every field except the range */
+    if (!sum)
+        return NULL; /* allocation failed; callers must check for this */
 
-	sum = CALLOC (1, sizeof (posix_lock_t));
-	if (!sum)
-		return NULL;
+    sum->fl_start = min(l1->fl_start, l2->fl_start); /* range becomes the span ... */
+    sum->fl_end = max(l1->fl_end, l2->fl_end); /* ... covering both l1 and l2 */
 
-	sum->fl_start = min (l1->fl_start, l2->fl_start);
-	sum->fl_end   = max (l1->fl_end, l2->fl_end);
+    posix_lock_to_flock(sum, &sum->user_flock); /* keep user_flock in step with the new range */
 
-	return sum;
+    return sum;
 }
 
 /* Subtract two locks */
 struct _values {
-	posix_lock_t *locks[3];
+    posix_lock_t *locks[3]; /* result pieces of subtract_locks(); unused slots are NULL */
 };
 
 /* {big} must always be contained inside {small} */
 static struct _values
-subtract_locks (posix_lock_t *big, posix_lock_t *small)
-{
-	struct _values v = { .locks = {0, 0, 0} };
-  
-	if ((big->fl_start == small->fl_start) && 
-	    (big->fl_end   == small->fl_end)) {  
-		/* both edges coincide with big */
-		v.locks[0] = CALLOC (1, sizeof (posix_lock_t));
-		ERR_ABORT (v.locks[0]);
-		memcpy (v.locks[0], big, sizeof (posix_lock_t));
-		v.locks[0]->fl_type = small->fl_type;
-	}
-	else if ((small->fl_start > big->fl_start) &&
-		 (small->fl_end   < big->fl_end)) {
-		/* both edges lie inside big */
-		v.locks[0] = CALLOC (1, sizeof (posix_lock_t));
-		ERR_ABORT (v.locks[0]);
-		v.locks[1] = CALLOC (1, sizeof (posix_lock_t));
-		ERR_ABORT (v.locks[1]);
-		v.locks[2] = CALLOC (1, sizeof (posix_lock_t));
-		ERR_ABORT (v.locks[2]);
-
-		memcpy (v.locks[0], big, sizeof (posix_lock_t));
-		v.locks[0]->fl_end = small->fl_start - 1;
-
-		memcpy (v.locks[1], small, sizeof (posix_lock_t));
-		memcpy (v.locks[2], big, sizeof (posix_lock_t));
-		v.locks[2]->fl_start = small->fl_end + 1;
-	}
-	/* one edge coincides with big */
-	else if (small->fl_start == big->fl_start) {
-		v.locks[0] = CALLOC (1, sizeof (posix_lock_t));
-		ERR_ABORT (v.locks[0]);
-		v.locks[1] = CALLOC (1, sizeof (posix_lock_t));
-		ERR_ABORT (v.locks[1]);
-    
-		memcpy (v.locks[0], big, sizeof (posix_lock_t));
-		v.locks[0]->fl_start = small->fl_end + 1;
-    
-		memcpy (v.locks[1], small, sizeof (posix_lock_t));
-	}
-	else if (small->fl_end   == big->fl_end) {
-		v.locks[0] = CALLOC (1, sizeof (posix_lock_t));
-		ERR_ABORT (v.locks[0]);
-		v.locks[1] = CALLOC (1, sizeof (posix_lock_t));
-		ERR_ABORT (v.locks[1]);
-
-		memcpy (v.locks[0], big, sizeof (posix_lock_t));
-		v.locks[0]->fl_end = small->fl_start - 1;
-    
-		memcpy (v.locks[1], small, sizeof (posix_lock_t));
-	}
-	else {
-		gf_log ("posix-locks", GF_LOG_DEBUG, 
-			"unexpected case in subtract_locks");
-	}
-
-	return v;
-}
-
-/* 
-   Start searching from {begin}, and return the first lock that
-   conflicts, NULL if no conflict
-   If {begin} is NULL, then start from the beginning of the list
+subtract_locks(posix_lock_t *big, posix_lock_t *small) /* every handled case has small lying within big */
+{
+    struct _values v = {.locks = {0, 0, 0}};
+
+    if ((big->fl_start == small->fl_start) && (big->fl_end == small->fl_end)) {
+        /* both edges coincide with big */
+        v.locks[0] = __copy_lock(big);
+        if (!v.locks[0]) {
+            goto out;
+        }
+
+        v.locks[0]->fl_type = small->fl_type; /* same range, small's type wins */
+        v.locks[0]->user_flock.l_type = small->fl_type;
+        goto done;
+    }
+
+    if ((small->fl_start > big->fl_start) && (small->fl_end < big->fl_end)) {
+        /* both edges lie inside big */
+        v.locks[0] = __copy_lock(big); /* becomes the part of big before small */
+        v.locks[1] = __copy_lock(small);
+        v.locks[2] = __copy_lock(big); /* becomes the part of big after small */
+        if ((v.locks[0] == NULL) || (v.locks[1] == NULL) ||
+            (v.locks[2] == NULL)) {
+            goto out;
+        }
+
+        v.locks[0]->fl_end = small->fl_start - 1;
+        v.locks[2]->fl_start = small->fl_end + 1;
+        posix_lock_to_flock(v.locks[0], &v.locks[0]->user_flock);
+        posix_lock_to_flock(v.locks[2], &v.locks[2]->user_flock);
+        goto done;
+    }
+
+    /* one edge coincides with big */
+    if (small->fl_start == big->fl_start) {
+        v.locks[0] = __copy_lock(big); /* becomes the remainder after small */
+        v.locks[1] = __copy_lock(small);
+        if ((v.locks[0] == NULL) || (v.locks[1] == NULL)) {
+            goto out;
+        }
+
+        v.locks[0]->fl_start = small->fl_end + 1;
+        posix_lock_to_flock(v.locks[0], &v.locks[0]->user_flock);
+        goto done;
+    }
+
+    if (small->fl_end == big->fl_end) {
+        v.locks[0] = __copy_lock(big); /* becomes the remainder before small */
+        v.locks[1] = __copy_lock(small);
+        if ((v.locks[0] == NULL) || (v.locks[1] == NULL)) {
+            goto out;
+        }
+
+        v.locks[0]->fl_end = small->fl_start - 1;
+        posix_lock_to_flock(v.locks[0], &v.locks[0]->user_flock);
+        goto done;
+    }
+
+    GF_ASSERT(0); /* reached only when small is not contained in big */
+    gf_log("posix-locks", GF_LOG_ERROR, "Unexpected case in subtract_locks");
+
+out: /* failure: free any copies made so far so that all three slots are NULL */
+    if (v.locks[0]) {
+        __destroy_lock(v.locks[0]);
+        v.locks[0] = NULL;
+    }
+    if (v.locks[1]) {
+        __destroy_lock(v.locks[1]);
+        v.locks[1] = NULL;
+    }
+    if (v.locks[2]) {
+        __destroy_lock(v.locks[2]);
+        v.locks[2] = NULL;
+    }
+
+done:
+    return v;
+}
+
+static posix_lock_t * /* first granted lock of another owner that conflicts with lock, or NULL */
+first_conflicting_overlap(pl_inode_t *pl_inode, posix_lock_t *lock)
+{
+    posix_lock_t *l = NULL;
+    posix_lock_t *conf = NULL;
+
+    pthread_mutex_lock(&pl_inode->mutex); /* takes the mutex itself, unlike first_overlap() */
+    {
+        list_for_each_entry(l, &pl_inode->ext_list, list)
+        {
+            if (l->blocked)
+                continue; /* blocked entries are ignored */
+
+            if (locks_overlap(l, lock)) {
+                if (same_owner(l, lock))
+                    continue; /* an owner never conflicts with itself */
+
+                if ((l->fl_type == F_WRLCK) || (lock->fl_type == F_WRLCK)) { /* a write lock on either side conflicts */
+                    conf = l;
+                    goto unlock;
+                }
+            }
+        }
+    }
+unlock:
+    pthread_mutex_unlock(&pl_inode->mutex);
+
+    return conf; /* NOTE(review): the pointer is returned after the mutex is released */
+}
+
+/*
+  Walk the inode's ext_list from its head and return the first lock that
+  is not blocked and whose range overlaps {lock}, or NULL if there is
+  none. Lock type and owner are not considered and no mutex is taken here.
 */
 static posix_lock_t *
-first_overlap (pl_inode_t *pl_inode, posix_lock_t *lock,
-	       gf_lk_domain_t dom)
+first_overlap(pl_inode_t *pl_inode, posix_lock_t *lock)
 {
-	posix_lock_t *l = NULL;
+    posix_lock_t *l = NULL;
 
-	list_for_each_entry (l, DOMAIN_HEAD (pl_inode, dom), list) {
-		if (l->blocked)
-			continue;
+    list_for_each_entry(l, &pl_inode->ext_list, list)
+    {
+        if (l->blocked)
+            continue;
 
-		if (locks_overlap (l, lock))
-			return l;
-	}
+        if (locks_overlap(l, lock))
+            return l;
+    }
 
-	return NULL;
+    return NULL;
 }
 
-
-
 /* Return true if lock is grantable */
 static int
-__is_lock_grantable (pl_inode_t *pl_inode, posix_lock_t *lock,
-		     gf_lk_domain_t dom)
+__is_lock_grantable(pl_inode_t *pl_inode, posix_lock_t *lock)
 {
-	posix_lock_t *l = NULL;
-	int           ret = 1;
-
-	list_for_each_entry (l, DOMAIN_HEAD (pl_inode, dom), list) {
-		if (!l->blocked && locks_overlap (lock, l)) {
-			if (((l->fl_type == F_WRLCK)
-			     || (lock->fl_type == F_WRLCK))
-			    && (lock->fl_type != F_UNLCK)
-			    && !same_owner (l, lock)) {
-				ret = 0;
-				break;
-			}
-		}
-	}
-	return ret;
+    posix_lock_t *l = NULL;
+    int ret = 1; /* grantable until a conflicting lock is found */
+
+    list_for_each_entry(l, &pl_inode->ext_list, list)
+    {
+        if (!l->blocked && locks_overlap(lock, l)) { /* only granted, overlapping locks matter */
+            if (((l->fl_type == F_WRLCK) || (lock->fl_type == F_WRLCK)) &&
+                (lock->fl_type != F_UNLCK) && !same_owner(l, lock)) { /* unlocks and same-owner requests never conflict */
+                ret = 0;
+                break;
+            }
+        }
+    }
+    return ret;
 }
 
+extern void
+do_blocked_rw(pl_inode_t *);
 
-extern void do_blocked_rw (pl_inode_t *);
+static void /* inserts lock into ext_list, merging or splitting same-owner overlaps; consumes lock */
+__insert_and_merge(pl_inode_t *pl_inode, posix_lock_t *lock)
+{
+    posix_lock_t *conf = NULL;
+    posix_lock_t *t = NULL;
+    posix_lock_t *sum = NULL;
+    int i = 0;
+    struct _values v = {.locks = {0, 0, 0}};
+
+    list_for_each_entry_safe(conf, t, &pl_inode->ext_list, list)
+    {
+        if (conf->blocked)
+            continue;
+        if (!locks_overlap(conf, lock))
+            continue;
+
+        if (same_owner(conf, lock)) {
+            if (conf->fl_type == lock->fl_type &&
+                conf->lk_flags == lock->lk_flags) { /* same type and flags: merge into one lock */
+                sum = add_locks(lock, conf, lock); /* NOTE(review): may return NULL; sum is dereferenced below */
+
+                __delete_lock(conf);
+                __destroy_lock(conf);
+
+                __destroy_lock(lock);
+                INIT_LIST_HEAD(&sum->list); /* repeats what __copy_lock() already did */
+                posix_lock_to_flock(sum, &sum->user_flock); /* repeats what add_locks() already did */
+                __insert_and_merge(pl_inode, sum); /* recurse: the merged lock may overlap further entries */
+
+                return;
+            } else { /* different type or flags: lock replaces the part of conf it covers */
+                sum = add_locks(lock, conf, conf); /* NOTE(review): NULL result is passed on unchecked */
+
+                v = subtract_locks(sum, lock); /* all slots are NULL if subtract_locks failed */
+
+                __delete_lock(conf);
+                __destroy_lock(conf);
+
+                __delete_lock(lock);
+                __destroy_lock(lock);
+
+                __destroy_lock(sum);
+
+                for (i = 0; i < 3; i++) {
+                    if (!v.locks[i])
+                        continue;
+
+                    __insert_and_merge(pl_inode, v.locks[i]); /* each piece is re-merged on its own */
+                }
+
+                __delete_unlck_locks(pl_inode); /* drop any F_UNLCK entries left on ext_list */
+                return;
+            }
+        }
+
+        if (lock->fl_type == F_UNLCK) {
+            continue; /* an unlock only acts on the same owner's locks */
+        }
+
+        if ((conf->fl_type == F_RDLCK) && (lock->fl_type == F_RDLCK)) { /* read locks of different owners coexist */
+            __insert_lock(pl_inode, lock);
+            return;
+        }
+    }
+
+    /* no conflicts, so just insert */
+    if (lock->fl_type != F_UNLCK) {
+        __insert_lock(pl_inode, lock);
+    } else {
+        __destroy_lock(lock); /* an unlock that matched nothing is simply discarded */
+    }
+}
 
+void /* grants blocked locks that no longer conflict; appends one carrier per grant to 'granted' */
+__grant_blocked_locks(xlator_t *this, pl_inode_t *pl_inode,
+                      struct list_head *granted)
+{
+    struct list_head tmp_list;
+    posix_lock_t *l = NULL;
+    posix_lock_t *tmp = NULL;
+    posix_lock_t *conf = NULL;
+
+    INIT_LIST_HEAD(&tmp_list);
+
+    list_for_each_entry_safe(l, tmp, &pl_inode->ext_list, list) /* pass 1: collect candidates */
+    {
+        if (l->blocked) {
+            conf = first_overlap(pl_inode, l);
+            if (conf)
+                continue; /* still overlaps a granted lock: stays blocked */
+
+            l->blocked = 0;
+            list_move_tail(&l->list, &tmp_list);
+        }
+    }
+
+    list_for_each_entry_safe(l, tmp, &tmp_list, list) /* pass 2: re-check each candidate, then grant */
+    {
+        list_del_init(&l->list);
+
+        if (__is_lock_grantable(pl_inode, l)) {
+            conf = GF_CALLOC(1, sizeof(*conf), gf_locks_mt_posix_lock_t); /* conf is reused as a carrier for frame + flock */
+
+            if (!conf) {
+                l->blocked = 1; /* out of memory: put the lock back as blocked */
+                __insert_lock(pl_inode, l);
+                continue;
+            }
+
+            conf->frame = l->frame; /* the frame moves to the carrier so the caller can unwind it */
+            l->frame = NULL;
+
+            posix_lock_to_flock(l, &conf->user_flock);
+
+            gf_log(this->name, GF_LOG_TRACE,
+                   "%s (pid=%d) lk-owner:%s %" PRId64 " - %" PRId64
+                   " => Granted",
+                   l->fl_type == F_UNLCK ? "Unlock" : "Lock", l->client_pid,
+                   lkowner_utoa(&l->owner), l->user_flock.l_start,
+                   l->user_flock.l_len);
+
+            __insert_and_merge(pl_inode, l); /* l is consumed by this call */
+
+            list_add(&conf->list, granted);
+        } else {
+            l->blocked = 1; /* a lock granted earlier in this pass now conflicts */
+            __insert_lock(pl_inode, l);
+        }
+    }
+}
 
-static void
-__insert_and_merge (pl_inode_t *pl_inode, posix_lock_t *lock,
-		    gf_lk_domain_t dom)
+void
+grant_blocked_locks(xlator_t *this, pl_inode_t *pl_inode)
 {
-	posix_lock_t  *conf = NULL;
-	posix_lock_t  *t = NULL;
-	posix_lock_t  *sum = NULL;
-	int            i = 0;
-	struct _values v = { .locks = {0, 0, 0} };
+    struct list_head granted_list;
+    posix_lock_t *tmp = NULL;
+    posix_lock_t *lock = NULL;
+    pl_local_t *local = NULL;
+    INIT_LIST_HEAD(&granted_list);
+
+    pthread_mutex_lock(&pl_inode->mutex);
+    {
+        __grant_blocked_locks(this, pl_inode, &granted_list);
+    }
+    pthread_mutex_unlock(&pl_inode->mutex);
+
+    list_for_each_entry_safe(lock, tmp, &granted_list, list)
+    {
+        list_del_init(&lock->list);
+
+        pl_trace_out(this, lock->frame, NULL, NULL, F_SETLKW, &lock->user_flock,
+                     0, 0, NULL);
+        local = lock->frame->local;
+        PL_STACK_UNWIND_AND_FREE(local, lk, lock->frame, 0, 0,
+                                 &lock->user_flock, NULL);
+        __destroy_lock(lock);
+    }
+
+    return;
+}
 
-	list_for_each_entry_safe (conf, t, DOMAIN_HEAD (pl_inode, dom), list) {
-		if (!locks_overlap (conf, lock))
-			continue;
+static int
+pl_send_prelock_unlock(xlator_t *this, pl_inode_t *pl_inode,
+                       posix_lock_t *old_lock)
+{
+    struct gf_flock flock = {
+        0,
+    };
+    posix_lock_t *unlock_lock = NULL;
+    int32_t op_errno = 0;
 
-		if (same_owner (conf, lock)) {
-			if (conf->fl_type == lock->fl_type) {
-				sum = add_locks (lock, conf);
+    struct list_head granted_list;
+    posix_lock_t *tmp = NULL;
+    posix_lock_t *lock = NULL;
+    pl_local_t *local = NULL;
 
-				sum->fl_type    = lock->fl_type;
-				sum->transport  = lock->transport;
-				sum->client_pid = lock->client_pid;
+    int ret = -1;
 
-				__delete_lock (pl_inode, conf); 
-				__destroy_lock (conf);
+    INIT_LIST_HEAD(&granted_list);
 
-				__destroy_lock (lock);
-				__insert_and_merge (pl_inode, sum, dom);
+    flock.l_type = F_UNLCK;
+    flock.l_whence = old_lock->user_flock.l_whence;
+    flock.l_start = old_lock->user_flock.l_start;
+    flock.l_len = old_lock->user_flock.l_len;
+    flock.l_pid = old_lock->user_flock.l_pid;
 
-				return;
-			} else {
-				sum = add_locks (lock, conf);
+    unlock_lock = new_posix_lock(&flock, old_lock->client, old_lock->client_pid,
+                                 &old_lock->owner, old_lock->fd,
+                                 old_lock->lk_flags, 0, &op_errno);
+    GF_VALIDATE_OR_GOTO(this->name, unlock_lock, out);
+    ret = 0;
 
-				sum->fl_type    = conf->fl_type;
-				sum->transport  = conf->transport;
-				sum->client_pid = conf->client_pid;
+    __insert_and_merge(pl_inode, unlock_lock);
 
-				v = subtract_locks (sum, lock);
-	
-				__delete_lock (pl_inode, conf);
-				__destroy_lock (conf);
+    __grant_blocked_locks(this, pl_inode, &granted_list);
 
-				__delete_lock (pl_inode, lock);
-				__destroy_lock (lock);
+    list_for_each_entry_safe(lock, tmp, &granted_list, list)
+    {
+        list_del_init(&lock->list);
 
-				__destroy_lock (sum);
+        pl_trace_out(this, lock->frame, NULL, NULL, F_SETLKW, &lock->user_flock,
+                     0, 0, NULL);
+        local = lock->frame->local;
+        PL_STACK_UNWIND_AND_FREE(local, lk, lock->frame, 0, 0,
+                                 &lock->user_flock, NULL);
+        __destroy_lock(lock);
+    }
 
-				for (i = 0; i < 3; i++) {
-					if (!v.locks[i])
-						continue;
+out:
+    return ret;
+}
 
-					if (v.locks[i]->fl_type == F_UNLCK) {
-						__destroy_lock (v.locks[i]);
-						continue;
-					}
-					__insert_and_merge (pl_inode,
-							    v.locks[i], dom);
-				}
+int
+pl_setlk(xlator_t *this, pl_inode_t *pl_inode, posix_lock_t *lock,
+         int can_block)
+{
+    int ret = 0;
+
+    errno = 0;
+
+    pthread_mutex_lock(&pl_inode->mutex);
+    {
+        /* Send unlock before the actual lock to
+           prevent lock upgrade / downgrade
+           problems only if:
+           - it is a blocking call
+           - it has other conflicting locks
+        */
+
+        if (can_block && !(__is_lock_grantable(pl_inode, lock))) {
+            ret = pl_send_prelock_unlock(this, pl_inode, lock);
+            if (ret)
+                gf_log(this->name, GF_LOG_DEBUG,
+                       "Could not send pre-lock "
+                       "unlock");
+        }
+
+        if (__is_lock_grantable(pl_inode, lock)) {
+            if (pl_metalock_is_active(pl_inode)) {
+                __pl_queue_lock(pl_inode, lock);
+                pthread_mutex_unlock(&pl_inode->mutex);
+                ret = -2;
+                goto out;
+            }
+            gf_log(this->name, GF_LOG_TRACE,
+                   "%s (pid=%d) lk-owner:%s %" PRId64 " - %" PRId64 " => OK",
+                   lock->fl_type == F_UNLCK ? "Unlock" : "Lock",
+                   lock->client_pid, lkowner_utoa(&lock->owner),
+                   lock->user_flock.l_start, lock->user_flock.l_len);
+            __insert_and_merge(pl_inode, lock);
+        } else if (can_block) {
+            if (pl_metalock_is_active(pl_inode)) {
+                __pl_queue_lock(pl_inode, lock);
+                pthread_mutex_unlock(&pl_inode->mutex);
+                ret = -2;
+                goto out;
+            }
+            gf_log(this->name, GF_LOG_TRACE,
+                   "%s (pid=%d) lk-owner:%s %" PRId64 " - %" PRId64
+                   " => Blocked",
+                   lock->fl_type == F_UNLCK ? "Unlock" : "Lock",
+                   lock->client_pid, lkowner_utoa(&lock->owner),
+                   lock->user_flock.l_start, lock->user_flock.l_len);
+
+            pl_trace_block(this, lock->frame, NULL, NULL, F_SETLKW,
+                           &lock->user_flock, NULL);
+
+            lock->blocked = 1;
+            __insert_lock(pl_inode, lock);
+            ret = -1;
+        } else {
+            gf_log(this->name, GF_LOG_TRACE,
+                   "%s (pid=%d) lk-owner:%s %" PRId64 " - %" PRId64 " => NOK",
+                   lock->fl_type == F_UNLCK ? "Unlock" : "Lock",
+                   lock->client_pid, lkowner_utoa(&lock->owner),
+                   lock->user_flock.l_start, lock->user_flock.l_len);
+            errno = EAGAIN;
+            ret = -1;
+        }
+    }
+    pthread_mutex_unlock(&pl_inode->mutex);
+
+    grant_blocked_locks(this, pl_inode);
+
+    do_blocked_rw(pl_inode);
 
-				__delete_unlck_locks (pl_inode, dom);
-				return; 
-			}
-		}
+out:
+    return ret;
+}
 
-		if (lock->fl_type == F_UNLCK) {
-			continue;
-		}
+posix_lock_t *
+pl_getlk(pl_inode_t *pl_inode, posix_lock_t *lock)
+{
+    posix_lock_t *conf = first_conflicting_overlap(pl_inode, lock);
+    if (conf == NULL) {
+        lock->fl_type = F_UNLCK;
+        return lock;
+    }
 
-		if ((conf->fl_type == F_RDLCK) && (lock->fl_type == F_RDLCK)) {
-			__insert_lock (pl_inode, lock, dom);
-			return;
-		}
-	}
+    return conf;
+}
 
-	/* no conflicts, so just insert */
-	if (lock->fl_type != F_UNLCK) {
-		__insert_lock (pl_inode, lock, dom);
-	} else {
-		__destroy_lock (lock);
-	}
+/*
+ * Randomly decide whether a lock should be left stuck.
+ *
+ * Returns _gf_true when random() yields a multiple of 100 (roughly one call
+ * out of a hundred) and _gf_false otherwise.
+ *
+ * The parameter list is spelled (void) so that the definition is a real
+ * prototype instead of an old-style declarator with unspecified arguments.
+ */
+gf_boolean_t
+pl_does_monkey_want_stuck_lock(void)
+{
+    /* This is not a security decision, so a weak PRNG is good enough. */
+    /* coverity[DC.WEAK_CRYPTO] */
+    if ((random() % 100) == 0)
+        return _gf_true;
+
+    return _gf_false;
 }
 
+/*
+ * Install 'reqlock' on the inode by force, evicting whatever is in its way.
+ *
+ * Under pl_inode->mutex:
+ *   - every blocked lock found in ext_list is taken out so it can be failed;
+ *   - granted locks that overlap 'reqlock' and belong to a different owner
+ *     are deleted and destroyed;
+ *   - 'reqlock' is handed to __insert_and_merge();
+ *   - every request queued on rw_list is taken out so it can be failed.
+ *
+ * After the mutex is released, the blocked locks and the queued requests are
+ * unwound with op_ret = -1 and op_errno = EBUSY.
+ *
+ * Always returns 0.
+ */
+int
+pl_lock_preempt(pl_inode_t *pl_inode, posix_lock_t *reqlock)
+{
+    posix_lock_t *lock = NULL;
+    posix_lock_t *i = NULL;
+    pl_rw_req_t *rw = NULL;
+    pl_rw_req_t *itr = NULL;
+    pl_local_t *local = NULL;
+    struct list_head unwind_blist = {
+        0,
+    };
+    struct list_head unwind_rw_list = {
+        0,
+    };
+    int ret = 0;
+
+    INIT_LIST_HEAD(&unwind_blist);
+    INIT_LIST_HEAD(&unwind_rw_list);
+
+    pthread_mutex_lock(&pl_inode->mutex);
+    {
+        /*
+            - go through the lock list
+            - remove all locks from different owners
+            - same owner locks will be added or subtracted based on
+              the new request
+            - add the new lock
+        */
+        list_for_each_entry_safe(lock, i, &pl_inode->ext_list, list)
+        {
+            if (lock->blocked) {
+                list_del_init(&lock->list);
+                list_add(&lock->list, &unwind_blist);
+                continue;
+            }
+
+            if (locks_overlap(lock, reqlock)) {
+                if (same_owner(lock, reqlock))
+                    continue;
+
+                /* remove conflicting locks */
+                list_del_init(&lock->list);
+                __delete_lock(lock);
+                __destroy_lock(lock);
+            }
+        }
+
+        __insert_and_merge(pl_inode, reqlock);
+
+        list_for_each_entry_safe(rw, itr, &pl_inode->rw_list, list)
+        {
+            list_del_init(&rw->list);
+            list_add(&rw->list, &unwind_rw_list);
+        }
+    }
+    pthread_mutex_unlock(&pl_inode->mutex);
+
+    /* unwind blocked locks */
+    list_for_each_entry_safe(lock, i, &unwind_blist, list)
+    {
+        /* Unlink before destroying so no list node is left pointing at
+         * freed memory. */
+        list_del_init(&lock->list);
+
+        /* The local must be read BEFORE invoking the macro.
+         * PL_STACK_UNWIND_AND_FREE() clears frame->local and unwinds the
+         * frame before it looks at its first argument, so passing the
+         * expression 'lock->frame->local' directly makes it see NULL: the
+         * local is never released, and the frame is read after the unwind. */
+        local = lock->frame->local;
+        PL_STACK_UNWIND_AND_FREE(local, lk, lock->frame, -1, EBUSY,
+                                 &lock->user_flock, NULL);
+        __destroy_lock(lock);
+    }
+
+    /* unwind blocked IOs */
+    /* NOTE(review): the pl_rw_req_t containers are not released here and
+     * frame->local is not cleared after pl_clean_local(); confirm who owns
+     * them once call_unwind_error() has run. */
+    list_for_each_entry_safe(rw, itr, &unwind_rw_list, list)
+    {
+        pl_clean_local(rw->stub->frame->local);
+        call_unwind_error(rw->stub, -1, EBUSY);
+    }
+
+    return ret;
+}
 
-void
-__grant_blocked_locks (xlator_t *this, pl_inode_t *pl_inode,
-		       gf_lk_domain_t dom, struct list_head *granted)
+/* Return true in case we need to ensure mandatory-locking
+ * semantics under different modes.
+ */
+gf_boolean_t
+pl_is_mandatory_locking_enabled(pl_inode_t *pl_inode)
 {
-	struct list_head  tmp_list;
-	posix_lock_t     *l = NULL;
-	posix_lock_t     *tmp = NULL;
-	posix_lock_t     *conf = NULL;
+    posix_locks_private_t *priv = THIS->private;
 
-	INIT_LIST_HEAD (&tmp_list);
+    if (priv->mandatory_mode == MLK_FILE_BASED && pl_inode->mandatory)
+        return _gf_true;
+    else if (priv->mandatory_mode == MLK_FORCED ||
+             priv->mandatory_mode == MLK_OPTIMAL)
+        return _gf_true;
 
-	list_for_each_entry_safe (l, tmp, DOMAIN_HEAD (pl_inode, dom), list) {
-		if (l->blocked) {
-			conf = first_overlap (pl_inode, l, dom);
-			if (conf)
-				continue;
+    return _gf_false;
+}
 
-			l->blocked = 0;
-			list_move_tail (&l->list, &tmp_list);
-		}
-	}
+/*
+ * Release everything a pl_local_t holds and return it to its memory pool.
+ * A NULL 'local' is accepted and ignored.
+ *
+ * NOTE(review): unlike PL_STACK_UNWIND_AND_FREE(), this helper does not
+ * release local->xdata. Confirm whether callers can reach here with it set.
+ */
+void
+pl_clean_local(pl_local_t *local)
+{
+    if (local != NULL) {
+        if (local->inodelk_dom_count_req != NULL) {
+            data_unref(local->inodelk_dom_count_req);
+        }
+
+        /* Both loc slots are wiped unconditionally. */
+        loc_wipe(&local->loc[0]);
+        loc_wipe(&local->loc[1]);
+
+        if (local->fd != NULL) {
+            fd_unref(local->fd);
+        }
+
+        if (local->inode != NULL) {
+            inode_unref(local->inode);
+        }
+
+        mem_put(local);
+    }
+}
 
-	list_for_each_entry_safe (l, tmp, &tmp_list, list) {
-		list_del_init (&l->list);
+/*
+TODO: detach local initialization from PL_LOCAL_GET_REQUESTS and add it here
+*/
+int
+pl_local_init(call_frame_t *frame, xlator_t *this, loc_t *loc, fd_t *fd)
+{
+    pl_local_t *local = NULL;
 
-		if (__is_lock_grantable (pl_inode, l, dom)) {
-			conf = CALLOC (1, sizeof (*conf));
+    if (!loc && !fd) {
+        return -1;
+    }
 
-			if (!conf) {
-				l->blocked = 1;
-				__insert_lock (pl_inode, l, dom);
-				continue;
-			}
+    if (!frame->local) {
+        local = mem_get0(this->local_pool);
+        if (!local) {
+            gf_msg(this->name, GF_LOG_ERROR, ENOMEM, 0,
+                   "mem allocation failed");
+            return -1;
+        }
 
-			conf->frame = l->frame;
-			l->frame = NULL;
+        local->inode = (loc ? inode_ref(loc->inode) : inode_ref(fd->inode));
 
-			posix_lock_to_flock (l, &conf->user_flock);
+        frame->local = local;
+    }
 
-			gf_log (this->name, GF_LOG_DEBUG,
-				"%s (pid=%d) %"PRId64" - %"PRId64" => Granted",
-				l->fl_type == F_UNLCK ? "Unlock" : "Lock",
-				l->client_pid,
-				l->user_flock.l_start,
-				l->user_flock.l_len);
+    return 0;
+}
 
-			__insert_and_merge (pl_inode, l, dom);
+/*
+ * Tell whether 'owner' is acceptable as the lk-owner of a lock request.
+ *
+ * Requests from clients whose op-version is older than GD_OP_VERSION_7_0
+ * are always accepted. In any other case, including a NULL 'client', the
+ * owner is valid only if is_lk_owner_null() does not report it as null.
+ */
+gf_boolean_t
+pl_is_lk_owner_valid(gf_lkowner_t *owner, client_t *client)
+{
+    gf_boolean_t old_client = _gf_false;
+
+    if (client != NULL) {
+        old_client = (client->opversion < GD_OP_VERSION_7_0) ? _gf_true
+                                                             : _gf_false;
+    }
+
+    /* Old clients are not checked at all. */
+    if (old_client) {
+        return _gf_true;
+    }
+
+    return is_lk_owner_null(owner) ? _gf_false : _gf_true;
+}
 
-			list_add (&conf->list, granted);
-		} else {
-			l->blocked = 1;
-			__insert_lock (pl_inode, l, dom);
-		}
-	}
+/*
+ * Resolve the inode a loc_t refers to.
+ *
+ * Resolution order:
+ *   1. loc->inode, if present (a new reference is taken with inode_ref());
+ *   2. lookup by loc->gfid in the parent's inode table, if the gfid is set;
+ *   3. lookup by loc->name under loc->parent.
+ *
+ * *pinode is always written: it receives the inode found, or NULL.
+ *
+ * Returns 0 on success. A NULL inode with a 0 return means nothing is known
+ * about the file. Returns EINVAL when there is no parent, or when the name
+ * is needed but missing.
+ */
+static int32_t
+pl_inode_from_loc(loc_t *loc, inode_t **pinode)
+{
+    inode_t *found = NULL;
+    int32_t err = 0;
+
+    if (loc->inode != NULL) {
+        found = inode_ref(loc->inode);
+    } else if (loc->parent == NULL) {
+        err = EINVAL;
+    } else {
+        if (!gf_uuid_is_null(loc->gfid)) {
+            found = inode_find(loc->parent->table, loc->gfid);
+        }
+
+        if (found == NULL) {
+            if (loc->name == NULL) {
+                err = EINVAL;
+            } else {
+                /* A NULL result here is not an error: either the file
+                 * doesn't exist or we have no knowledge about it, so we
+                 * hold no locks on it either, which is fine for our
+                 * purposes. */
+                found = inode_grep(loc->parent->table, loc->parent,
+                                   loc->name);
+            }
+        }
+    }
+
+    *pinode = found;
+
+    return err;
 }
 
+/*
+ * Check whether any inodelk in any domain of 'pl_inode' is held by someone
+ * other than 'client'.
+ *
+ * Locks of the same client are skipped, because they are assumed to belong
+ * to the same operation. Locks of internal processes (negative client_pid)
+ * are skipped as well.
+ *
+ * When 'contend' is NULL the scan stops at the first foreign lock. When it
+ * is not NULL, every foreign lock is passed to
+ * inodelk_contention_notify_check() together with 'now' and 'contend'.
+ *
+ * Returns _gf_true if at least one foreign lock exists.
+ */
+static gf_boolean_t
+pl_inode_has_owners(xlator_t *xl, client_t *client, pl_inode_t *pl_inode,
+                    struct timespec *now, struct list_head *contend)
+{
+    pl_dom_list_t *domain;
+    pl_inode_lock_t *held;
+    gf_boolean_t found = _gf_false;
+
+    list_for_each_entry(domain, &pl_inode->dom_list, inode_list)
+    {
+        list_for_each_entry(held, &domain->inodelk_list, list)
+        {
+            /* Same client, or an internal process: not an obstacle. */
+            if ((held->client == client) || (held->client_pid < 0)) {
+                continue;
+            }
+
+            /* Nobody wants the contention details: one hit is enough. */
+            if (contend == NULL) {
+                return _gf_true;
+            }
+
+            found = _gf_true;
+            inodelk_contention_notify_check(xl, held, now, contend);
+        }
+    }
+
+    return found;
+}
 
-void
-grant_blocked_locks (xlator_t *this, pl_inode_t *pl_inode, gf_lk_domain_t dom)
+int32_t
+pl_inode_remove_prepare(xlator_t *xl, call_frame_t *frame, loc_t *loc,
+                        pl_inode_t **ppl_inode, struct list_head *contend)
 {
-	struct list_head granted_list;
-	posix_lock_t     *tmp = NULL;
-	posix_lock_t     *lock = NULL;
+    struct timespec now;
+    inode_t *inode;
+    pl_inode_t *pl_inode;
+    int32_t error;
+
+    pl_inode = NULL;
+
+    error = pl_inode_from_loc(loc, &inode);
+    if ((error != 0) || (inode == NULL)) {
+        goto done;
+    }
+
+    pl_inode = pl_inode_get(xl, inode, NULL);
+    if (pl_inode == NULL) {
+        inode_unref(inode);
+        error = ENOMEM;
+        goto done;
+    }
+
+    /* pl_inode_from_loc() already increments ref count for inode, so
+     * we only assign here our reference. */
+    pl_inode->inode = inode;
+
+    timespec_now(&now);
 
-	INIT_LIST_HEAD (&granted_list);
+    pthread_mutex_lock(&pl_inode->mutex);
 
-	pthread_mutex_lock (&pl_inode->mutex);
-	{
-		__grant_blocked_locks (this, pl_inode, dom, &granted_list);
-	}
-	pthread_mutex_unlock (&pl_inode->mutex);
+    if (pl_inode->removed) {
+        error = ESTALE;
+        goto unlock;
+    }
 
-	list_for_each_entry_safe (lock, tmp, &granted_list, list) {
-		list_del_init (&lock->list);
+    if (pl_inode_has_owners(xl, frame->root->client, pl_inode, &now, contend)) {
+        error = -1;
+        /* We skip the unlock here because the caller must create a stub when
+         * we return -1 and do a call to pl_inode_remove_complete(), which
+         * assumes the lock is still acquired and will release it once
+         * everything else is prepared. */
+        goto done;
+    }
 
-		STACK_UNWIND (lock->frame, 0, 0, &lock->user_flock);
+    pl_inode->is_locked = _gf_true;
+    pl_inode->remove_running++;
 
-		FREE (lock);
-	}
+unlock:
+    pthread_mutex_unlock(&pl_inode->mutex);
 
-	return;
+done:
+    *ppl_inode = pl_inode;
+
+    return error;
 }
 
+int32_t
+pl_inode_remove_complete(xlator_t *xl, pl_inode_t *pl_inode, call_stub_t *stub,
+                         struct list_head *contend)
+{
+    pl_inode_lock_t *lock;
+    int32_t error = -1;
 
-int
-pl_setlk (xlator_t *this, pl_inode_t *pl_inode, posix_lock_t *lock,
-	  int can_block,  gf_lk_domain_t dom)
-{
-	int              ret = 0;
-
-	errno = 0;
-
-	pthread_mutex_lock (&pl_inode->mutex);
-	{
-		if (__is_lock_grantable (pl_inode, lock, dom)) {
-			gf_log (this->name, GF_LOG_DEBUG,
-				"%s (pid=%d) %"PRId64" - %"PRId64" => OK",
-				lock->fl_type == F_UNLCK ? "Unlock" : "Lock",
-				lock->client_pid,
-				lock->user_flock.l_start,
-				lock->user_flock.l_len);
-			__insert_and_merge (pl_inode, lock, dom);
-		} else if (can_block) {
-			gf_log (this->name, GF_LOG_DEBUG,
-				"%s (pid=%d) %"PRId64" - %"PRId64" => Blocked",
-				lock->fl_type == F_UNLCK ? "Unlock" : "Lock",
-				lock->client_pid,
-				lock->user_flock.l_start,
-				lock->user_flock.l_len);
-			lock->blocked = 1;
-			__insert_lock (pl_inode, lock, dom);
-			ret = -1;
-		} else {
-			gf_log (this->name, GF_LOG_DEBUG,
-				"%s (pid=%d) %"PRId64" - %"PRId64" => NOK",
-				lock->fl_type == F_UNLCK ? "Unlock" : "Lock",
-				lock->client_pid,
-				lock->user_flock.l_start,
-				lock->user_flock.l_len);
-			errno = EAGAIN;
-			ret = -1;
-		}
-	}
-	pthread_mutex_unlock (&pl_inode->mutex);
-
-	grant_blocked_locks (this, pl_inode, dom);
-
-	do_blocked_rw (pl_inode);
-
-	return ret;
+    if (stub != NULL) {
+        list_add_tail(&stub->list, &pl_inode->waiting);
+        pl_inode->is_locked = _gf_true;
+    } else {
+        error = ENOMEM;
+
+        while (!list_empty(contend)) {
+            lock = list_first_entry(contend, pl_inode_lock_t, list);
+            list_del_init(&lock->list);
+            __pl_inodelk_unref(lock);
+        }
+    }
+
+    pthread_mutex_unlock(&pl_inode->mutex);
+
+    if (error < 0) {
+        inodelk_contention_notify(xl, contend);
+    }
+
+    inode_unref(pl_inode->inode);
+
+    return error;
 }
 
+void
+pl_inode_remove_wake(struct list_head *list)
+{
+    call_stub_t *stub;
+
+    while (!list_empty(list)) {
+        stub = list_first_entry(list, call_stub_t, list);
+        list_del_init(&stub->list);
 
-posix_lock_t *
-pl_getlk (pl_inode_t *pl_inode, posix_lock_t *lock, gf_lk_domain_t dom)
+        call_resume(stub);
+    }
+}
+
+void
+pl_inode_remove_cbk(xlator_t *xl, pl_inode_t *pl_inode, int32_t error)
 {
-	posix_lock_t *conf = NULL;
+    struct list_head contend, granted;
+    struct timespec now;
+    pl_dom_list_t *dom;
+
+    if (pl_inode == NULL) {
+        return;
+    }
+
+    INIT_LIST_HEAD(&contend);
+    INIT_LIST_HEAD(&granted);
+    timespec_now(&now);
+
+    pthread_mutex_lock(&pl_inode->mutex);
 
-	conf = first_overlap (pl_inode, lock, dom);
+    if (error == 0) {
+        if (pl_inode->links >= 0) {
+            pl_inode->links--;
+        }
+        if (pl_inode->links == 0) {
+            pl_inode->removed = _gf_true;
+        }
+    }
 
-	if (conf == NULL) {
-		lock->fl_type = F_UNLCK;
-		return lock;
-	}
+    pl_inode->remove_running--;
 
-	return conf;
+    if ((pl_inode->remove_running == 0) && list_empty(&pl_inode->waiting)) {
+        pl_inode->is_locked = _gf_false;
+
+        list_for_each_entry(dom, &pl_inode->dom_list, inode_list)
+        {
+            __grant_blocked_inode_locks(xl, pl_inode, &granted, dom, &now,
+                                        &contend);
+        }
+    }
+
+    pthread_mutex_unlock(&pl_inode->mutex);
+
+    unwind_granted_inodes(xl, pl_inode, &granted);
+
+    inodelk_contention_notify(xl, &contend);
+
+    inode_unref(pl_inode->inode);
+}
+
+/*
+ * Collect the waiting remove operations that are no longer blocked.
+ *
+ * Does nothing unless pl_inode->is_locked is set. Otherwise every stub
+ * queued on pl_inode->waiting whose client no longer finds foreign owners
+ * (per pl_inode_has_owners()) is moved to the tail of 'list'.
+ */
+void
+pl_inode_remove_unlocked(xlator_t *xl, pl_inode_t *pl_inode,
+                         struct list_head *list)
+{
+    call_stub_t *pending, *next;
+
+    if (pl_inode->is_locked) {
+        list_for_each_entry_safe(pending, next, &pl_inode->waiting, list)
+        {
+            if (pl_inode_has_owners(xl, pending->frame->root->client, pl_inode,
+                                    NULL, NULL)) {
+                continue;
+            }
+
+            list_move_tail(&pending->list, list);
+        }
+    }
+}
+
+/* Decide whether an inodelk request can be attempted right away or has to
+ * wait for a remove operation.
+ *
+ * Return value:
+ *   < 0: An error occurred. Currently only -ESTALE, returned when the inode
+ *        has already been deleted by unlink/rmdir/rename.
+ *   = 0: The lock can be attempted.
+ *   > 0: The lock needs to wait because a conflicting remove operation is
+ *        ongoing.
+ */
+int32_t
+pl_inode_remove_inodelk(pl_inode_t *pl_inode, pl_inode_lock_t *lock)
+{
+    pl_dom_list_t *domain;
+    pl_inode_lock_t *held;
+
+    /* No lock is allowed on an inode that has been deleted. */
+    if (pl_inode->removed) {
+        return -ESTALE;
+    }
+
+    /* Only locks made for regular user operations are synchronized with
+     * removes. Locks taken for internal purposes are hard to control and
+     * could easily cause long delays or deadlocks, so they always pass. The
+     * same applies when no remove has locked the inode. */
+    if ((lock->client_pid < 0) || !pl_inode->is_locked) {
+        return 0;
+    }
+
+    if (pl_inode->remove_running > 0) {
+        return 1;
+    }
+
+    list_for_each_entry(domain, &pl_inode->dom_list, inode_list)
+    {
+        list_for_each_entry(held, &domain->inodelk_list, list)
+        {
+            /* A lock of the same client is already present, so this one is
+             * let through. This prevents deadlocks when several locks are
+             * taken for the same operation; it is unlikely that one client
+             * sends completely unrelated locks for the same inode. */
+            if (held->client == lock->client) {
+                return 0;
+            }
+        }
+    }
+
+    return 1;
 }
diff --git a/xlators/features/locks/src/common.h b/xlators/features/locks/src/common.h
index ee17b008737..281223bf3b8 100644
--- a/xlators/features/locks/src/common.h
+++ b/xlators/features/locks/src/common.h
@@ -1,52 +1,262 @@
 /*
-  Copyright (c) 2006, 2007, 2008 Z RESEARCH, Inc. <http://www.zresearch.com>
-  This file is part of GlusterFS.
-
-  GlusterFS is free software; you can redistribute it and/or modify
-  it under the terms of the GNU General Public License as published
-  by the Free Software Foundation; either version 3 of the License,
-  or (at your option) any later version.
-
-  GlusterFS is distributed in the hope that it will be useful, but
-  WITHOUT ANY WARRANTY; without even the implied warranty of
-  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
-  General Public License for more details.
-
-  You should have received a copy of the GNU General Public License
-  along with this program.  If not, see
-  <http://www.gnu.org/licenses/>.
-*/
+   Copyright (c) 2006-2012, 2015-2016 Red Hat, Inc. <http://www.redhat.com>
+   This file is part of GlusterFS.
 
+   This file is licensed to you under your choice of the GNU Lesser
+   General Public License, version 3 or any later version (LGPLv3 or
+   later), or the GNU General Public License, version 2 (GPLv2), in all
+   cases as published by the Free Software Foundation.
+*/
 #ifndef __COMMON_H__
 #define __COMMON_H__
 
+/*dump locks format strings */
+#define RANGE_FMT "type=%s, whence=%hd, start=%llu, len=%llu"
+#define ENTRY_FMT "type=%s on basename=%s"
+#define DUMP_GEN_FMT "pid = %llu, owner=%s, client=%p"
+#define GRNTD_AT "granted at %s"
+#define BLKD_AT "blocked at %s"
+#define CONN_ID "connection-id=%s"
+#define DUMP_BLKD_FMT DUMP_GEN_FMT ", " CONN_ID ", " BLKD_AT
+#define DUMP_GRNTD_FMT DUMP_GEN_FMT ", " CONN_ID ", " GRNTD_AT
+#define DUMP_BLKD_GRNTD_FMT DUMP_GEN_FMT ", " CONN_ID ", " BLKD_AT ", " GRNTD_AT
+
+#define ENTRY_BLKD_FMT ENTRY_FMT ", " DUMP_BLKD_FMT
+#define ENTRY_GRNTD_FMT ENTRY_FMT ", " DUMP_GRNTD_FMT
+#define ENTRY_BLKD_GRNTD_FMT ENTRY_FMT ", " DUMP_BLKD_GRNTD_FMT
+
+#define RANGE_BLKD_FMT RANGE_FMT ", " DUMP_BLKD_FMT
+#define RANGE_GRNTD_FMT RANGE_FMT ", " DUMP_GRNTD_FMT
+#define RANGE_BLKD_GRNTD_FMT RANGE_FMT ", " DUMP_BLKD_GRNTD_FMT
+
+/* Copy the client pid of 'lock' into the l_pid field of 'flock'. Both
+ * arguments are parenthesised so that any pointer expression can be passed. */
+#define SET_FLOCK_PID(flock, lock) ((flock)->l_pid = (lock)->client_pid)
+
+/*
+ * Unwind 'frame' for 'fop' and then release the pl_local_t given as
+ * '__local'.
+ *
+ * frame->local is cleared before the unwind, so the local is released here
+ * (references dropped, then mem_put()) rather than through the frame.
+ *
+ * '__local' and 'frame' are evaluated exactly once, before frame->local is
+ * cleared and before the unwind. This makes it safe to pass an expression
+ * such as 'lock->frame->local': evaluating it afterwards would yield the
+ * NULL just stored (leaking the local) and would read the frame after it
+ * has been unwound.
+ */
+#define PL_STACK_UNWIND_AND_FREE(__local, fop, frame, op_ret, params...)       \
+    do {                                                                       \
+        pl_local_t *__pl_local = (__local);                                    \
+        call_frame_t *__pl_frame = (frame);                                    \
+        __pl_frame->local = NULL;                                              \
+        STACK_UNWIND_STRICT(fop, __pl_frame, op_ret, params);                  \
+        if (__pl_local) {                                                      \
+            if (__pl_local->inodelk_dom_count_req)                             \
+                data_unref(__pl_local->inodelk_dom_count_req);                 \
+            loc_wipe(&__pl_local->loc[0]);                                     \
+            loc_wipe(&__pl_local->loc[1]);                                     \
+            if (__pl_local->fd)                                                \
+                fd_unref(__pl_local->fd);                                      \
+            if (__pl_local->inode)                                             \
+                inode_unref(__pl_local->inode);                                \
+            if (__pl_local->xdata) {                                           \
+                dict_unref(__pl_local->xdata);                                 \
+                __pl_local->xdata = NULL;                                      \
+            }                                                                  \
+            mem_put(__pl_local);                                               \
+        }                                                                      \
+    } while (0)
+
 posix_lock_t *
-new_posix_lock (struct flock *flock, transport_t *transport, pid_t client_pid);
+new_posix_lock(struct gf_flock *flock, client_t *client, pid_t client_pid,
+               gf_lkowner_t *owner, fd_t *fd, uint32_t lk_flags, int blocking,
+               int32_t *op_errno);
 
 pl_inode_t *
-pl_inode_get (xlator_t *this, inode_t *inode);
+pl_inode_get(xlator_t *this, inode_t *inode, pl_local_t *local);
 
 posix_lock_t *
-pl_getlk (pl_inode_t *inode, posix_lock_t *lock, gf_lk_domain_t domain);
+pl_getlk(pl_inode_t *inode, posix_lock_t *lock);
+
+int
+pl_setlk(xlator_t *this, pl_inode_t *inode, posix_lock_t *lock, int can_block);
 
 int
-pl_setlk (xlator_t *this, pl_inode_t *inode, posix_lock_t *lock,
-	  int can_block, gf_lk_domain_t domain);
+pl_lock_preempt(pl_inode_t *pl_inode, posix_lock_t *reqlock);
+
+void
+grant_blocked_locks(xlator_t *this, pl_inode_t *inode);
+
+void
+posix_lock_to_flock(posix_lock_t *lock, struct gf_flock *flock);
+
+int
+locks_overlap(posix_lock_t *l1, posix_lock_t *l2);
+
+int
+same_owner(posix_lock_t *l1, posix_lock_t *l2);
+
+void
+__delete_lock(posix_lock_t *);
+
+void
+__destroy_lock(posix_lock_t *);
+
+pl_dom_list_t *
+get_domain(pl_inode_t *pl_inode, const char *volume);
+
+void
+grant_blocked_inode_locks(xlator_t *this, pl_inode_t *pl_inode,
+                          pl_dom_list_t *dom, struct timespec *now,
+                          struct list_head *contend);
+
+void
+inodelk_contention_notify(xlator_t *this, struct list_head *contend);
+
+void
+__delete_inode_lock(pl_inode_lock_t *lock);
+
+void
+__pl_inodelk_unref(pl_inode_lock_t *lock);
+
+void
+__grant_blocked_inode_locks(xlator_t *this, pl_inode_t *pl_inode,
+                            struct list_head *granted, pl_dom_list_t *dom,
+                            struct timespec *now, struct list_head *contend);
+
+void
+unwind_granted_inodes(xlator_t *this, pl_inode_t *pl_inode,
+                      struct list_head *granted);
+
+void
+grant_blocked_entry_locks(xlator_t *this, pl_inode_t *pl_inode,
+                          pl_dom_list_t *dom, struct timespec *now,
+                          struct list_head *contend);
+
+void
+entrylk_contention_notify(xlator_t *this, struct list_head *contend);
+
+void
+pl_update_refkeeper(xlator_t *this, inode_t *inode);
+
+int32_t
+__get_inodelk_count(xlator_t *this, pl_inode_t *pl_inode, char *domname);
+int32_t
+get_inodelk_count(xlator_t *this, inode_t *inode, char *domname);
+
+int32_t
+__get_entrylk_count(xlator_t *this, pl_inode_t *pl_inode);
+int32_t
+get_entrylk_count(xlator_t *this, inode_t *inode);
+
+void
+pl_trace_in(xlator_t *this, call_frame_t *frame, fd_t *fd, loc_t *loc, int cmd,
+            struct gf_flock *flock, const char *domain);
+
+void
+pl_trace_out(xlator_t *this, call_frame_t *frame, fd_t *fd, loc_t *loc, int cmd,
+             struct gf_flock *flock, int op_ret, int op_errno,
+             const char *domain);
 
 void
-grant_blocked_locks (xlator_t *this, pl_inode_t *inode, gf_lk_domain_t domain);
+pl_trace_block(xlator_t *this, call_frame_t *frame, fd_t *fd, loc_t *loc,
+               int cmd, struct gf_flock *flock, const char *domain);
 
 void
-posix_lock_to_flock (posix_lock_t *lock, struct flock *flock);
+pl_trace_flush(xlator_t *this, call_frame_t *frame, fd_t *fd);
+
+void
+entrylk_trace_in(xlator_t *this, call_frame_t *frame, const char *volume,
+                 fd_t *fd, loc_t *loc, const char *basename, entrylk_cmd cmd,
+                 entrylk_type type);
+
+void
+entrylk_trace_out(xlator_t *this, call_frame_t *frame, const char *volume,
+                  fd_t *fd, loc_t *loc, const char *basename, entrylk_cmd cmd,
+                  entrylk_type type, int op_ret, int op_errno);
+
+void
+entrylk_trace_block(xlator_t *this, call_frame_t *frame, const char *volume,
+                    fd_t *fd, loc_t *loc, const char *basename, entrylk_cmd cmd,
+                    entrylk_type type);
+
+void
+pl_print_verdict(char *str, int size, int op_ret, int op_errno);
+
+void
+pl_print_lockee(char *str, int size, fd_t *fd, loc_t *loc);
+
+void
+pl_print_locker(char *str, int size, xlator_t *this, call_frame_t *frame);
+
+void
+pl_print_inodelk(char *str, int size, int cmd, struct gf_flock *flock,
+                 const char *domain);
+
+void
+pl_trace_release(xlator_t *this, fd_t *fd);
+
+unsigned long
+fd_to_fdnum(fd_t *fd);
+
+fd_t *
+fd_from_fdnum(posix_lock_t *lock);
+
+int
+pl_reserve_setlk(xlator_t *this, pl_inode_t *pl_inode, posix_lock_t *lock,
+                 int can_block);
+int
+reservelks_equal(posix_lock_t *l1, posix_lock_t *l2);
+
+int
+pl_verify_reservelk(xlator_t *this, pl_inode_t *pl_inode, posix_lock_t *lock,
+                    int can_block);
+int
+pl_reserve_unlock(xlator_t *this, pl_inode_t *pl_inode, posix_lock_t *reqlock);
+
+int32_t
+check_entrylk_on_basename(xlator_t *this, inode_t *parent, char *basename);
+
+void
+__pl_inodelk_unref(pl_inode_lock_t *lock);
+void
+__pl_entrylk_unref(pl_entry_lock_t *lock);
 
 int
-locks_overlap (posix_lock_t *l1, posix_lock_t *l2);
+pl_metalock_is_active(pl_inode_t *pl_inode);
+
+void
+__pl_queue_lock(pl_inode_t *pl_inode, posix_lock_t *reqlock);
+
+void
+inodelk_contention_notify_check(xlator_t *xl, pl_inode_lock_t *lock,
+                                struct timespec *now,
+                                struct list_head *contend);
+
+void
+entrylk_contention_notify_check(xlator_t *xl, pl_entry_lock_t *lock,
+                                struct timespec *now,
+                                struct list_head *contend);
+
+/* Takes no arguments; (void) makes this a real prototype so that calls
+ * passing arguments are diagnosed by the compiler. */
+gf_boolean_t
+pl_does_monkey_want_stuck_lock(void);
+
+gf_boolean_t
+pl_is_mandatory_locking_enabled(pl_inode_t *pl_inode);
+
+void
+pl_clean_local(pl_local_t *local);
 
 int
-same_owner (posix_lock_t *l1, posix_lock_t *l2);
+pl_local_init(call_frame_t *frame, xlator_t *this, loc_t *loc, fd_t *fd);
+
+gf_boolean_t
+pl_is_lk_owner_valid(gf_lkowner_t *owner, client_t *client);
+
+int32_t
+pl_inode_remove_prepare(xlator_t *xl, call_frame_t *frame, loc_t *loc,
+                        pl_inode_t **ppl_inode, struct list_head *contend);
+
+int32_t
+pl_inode_remove_complete(xlator_t *xl, pl_inode_t *pl_inode, call_stub_t *stub,
+                         struct list_head *contend);
 
-void __delete_lock (pl_inode_t *, posix_lock_t *);
+void
+pl_inode_remove_wake(struct list_head *list);
+
+void
+pl_inode_remove_cbk(xlator_t *xl, pl_inode_t *pl_inode, int32_t error);
+
+void
+pl_inode_remove_unlocked(xlator_t *xl, pl_inode_t *pl_inode,
+                         struct list_head *list);
 
-void __destroy_lock (posix_lock_t *);
+int32_t
+pl_inode_remove_inodelk(pl_inode_t *pl_inode, pl_inode_lock_t *lock);
 
 #endif /* __COMMON_H__ */
diff --git a/xlators/features/locks/src/entrylk.c b/xlators/features/locks/src/entrylk.c
new file mode 100644
index 00000000000..fd772c850dd
--- /dev/null
+++ b/xlators/features/locks/src/entrylk.c
@@ -0,0 +1,1153 @@
+/*
+   Copyright (c) 2006-2012 Red Hat, Inc. <http://www.redhat.com>
+   This file is part of GlusterFS.
+
+   This file is licensed to you under your choice of the GNU Lesser
+   General Public License, version 3 or any later version (LGPLv3 or
+   later), or the GNU General Public License, version 2 (GPLv2), in all
+   cases as published by the Free Software Foundation.
+*/
+#include <glusterfs/glusterfs.h>
+#include <glusterfs/compat.h>
+#include <glusterfs/xlator.h>
+#include <glusterfs/logging.h>
+#include <glusterfs/common-utils.h>
+#include <glusterfs/list.h>
+#include <glusterfs/upcall-utils.h>
+
+#include "locks.h"
+#include "clear.h"
+#include "common.h"
+#include "pl-messages.h"
+
+void
+__pl_entrylk_unref(pl_entry_lock_t *lock)
+{ /* Drop one reference on @lock; the last reference frees the lock. */
+    lock->ref--;
+    if (!lock->ref) {
+        GF_FREE((char *)lock->basename); /* copy made by new_entrylk_lock() */
+        GF_FREE(lock->connection_id);
+        GF_FREE(lock);
+    }
+}
+
+static void
+__pl_entrylk_ref(pl_entry_lock_t *lock)
+{ /* Take one more reference on @lock; undone by __pl_entrylk_unref(). */
+    lock->ref++;
+}
+
+static pl_entry_lock_t *
+new_entrylk_lock(pl_inode_t *pinode, const char *basename, entrylk_type type,
+                 const char *domain, call_frame_t *frame, char *conn_id,
+                 int32_t *op_errno)
+{ /* Build a lock request holding one ref; NULL with *op_errno set on failure. */
+    pl_entry_lock_t *newlock = NULL;
+
+    if (!pl_is_lk_owner_valid(&frame->root->lk_owner, frame->root->client)) {
+        *op_errno = EINVAL;
+        goto out;
+    }
+
+    newlock = GF_CALLOC(1, sizeof(pl_entry_lock_t),
+                        gf_locks_mt_pl_entry_lock_t);
+    /* NULL basename == "all names": a failed copy must fail, not widen the lock */
+    if (newlock && basename && !(newlock->basename = gf_strdup(basename))) {
+        GF_FREE(newlock);
+        newlock = NULL;
+    }
+    if (!newlock) {
+        *op_errno = ENOMEM;
+        goto out;
+    }
+    newlock->type = type;
+    newlock->client = frame->root->client;
+    newlock->client_pid = frame->root->pid;
+    newlock->volume = domain; /* pointer is stored as-is, not copied */
+    newlock->owner = frame->root->lk_owner;
+    newlock->frame = frame;
+    newlock->this = frame->this;
+    newlock->connection_id = conn_id ? gf_strdup(conn_id) : NULL; /* best effort */
+
+    INIT_LIST_HEAD(&newlock->domain_list);
+    INIT_LIST_HEAD(&newlock->blocked_locks);
+    INIT_LIST_HEAD(&newlock->client_list);
+    INIT_LIST_HEAD(&newlock->contend); /* else list_empty(&lock->contend) is never true */
+    __pl_entrylk_ref(newlock);
+out:
+    return newlock;
+}
+
+/**
+ * all_names - does a basename represent all names? (1 when it is NULL, else 0)
+ * @basename: name to check; not parenthesised in the expansion, keep it simple
+ */
+
+#define all_names(basename) ((basename == NULL) ? 1 : 0)
+
+/**
+ * names_conflict - do two names conflict? (non-zero when they do)
+ * @n1: name, or NULL meaning "all names"
+ * @n2: name, or NULL meaning "all names"; a NULL on either side always conflicts
+ */
+
+static int
+names_conflict(const char *n1, const char *n2)
+{
+    return all_names(n1) || all_names(n2) || !strcmp(n1, n2);
+}
+
+static int
+__same_entrylk_owner(pl_entry_lock_t *l1, pl_entry_lock_t *l2)
+{ /* Non-zero when both locks have the same lk-owner and the same client. */
+    return (is_same_lkowner(&l1->owner, &l2->owner) &&
+            (l1->client == l2->client));
+}
+
+/* Just as in inodelk, allow conflicting name locks from same (lk_owner, conn)*/
+static int
+__conflicting_entrylks(pl_entry_lock_t *l1, pl_entry_lock_t *l2)
+{ /* Returns 1 on a name conflict between different owners, 0 otherwise. */
+    if (names_conflict(l1->basename, l2->basename) &&
+        !__same_entrylk_owner(l1, l2)) /* lock type is not consulted here */
+        return 1;
+
+    return 0;
+}
+
+/* See comments in inodelk.c for details */
+static inline gf_boolean_t
+__stale_entrylk(xlator_t *this, pl_entry_lock_t *candidate_lock,
+                pl_entry_lock_t *requested_lock, time_t *lock_age_sec)
+{ /* True if @candidate_lock conflicts by name and is older than the limit. */
+    posix_locks_private_t *priv = NULL;
+
+    priv = this->private;
+
+    /* Question: Should we just prune them all given the
+     * chance?  Or just the locks we are attempting to acquire?
+     */
+    if (names_conflict(candidate_lock->basename, requested_lock->basename)) {
+        /* Out-parameter: only written when the names conflict. */
+        *lock_age_sec = gf_time() - candidate_lock->granted_time;
+        if (*lock_age_sec > priv->revocation_secs)
+            return _gf_true;
+    }
+    return _gf_false;
+}
+
+/* See comments in inodelk.c for details */
+static gf_boolean_t
+__entrylk_prune_stale(xlator_t *this, pl_inode_t *pinode, pl_dom_list_t *dom,
+                      pl_entry_lock_t *lock)
+{ /* Clear entry locks in @dom when one is stale or too many are blocked. */
+    posix_locks_private_t *priv = NULL;
+    pl_entry_lock_t *tmp = NULL;
+    pl_entry_lock_t *lk = NULL;
+    gf_boolean_t revoke_lock = _gf_false;
+    int bcount = 0; /* blocked locks cleared, reported by clrlk_clear_entrylk */
+    int gcount = 0; /* granted locks cleared, reported by clrlk_clear_entrylk */
+    int op_errno = 0;
+    clrlk_args args;
+    args.opts = NULL;
+    time_t lk_age_sec = 0;
+    uint32_t max_blocked = 0;
+    char *reason_str = NULL;
+
+    priv = this->private;
+    args.type = CLRLK_ENTRY;
+    if (priv->revocation_clear_all == _gf_true)
+        args.kind = CLRLK_ALL;
+    else
+        args.kind = CLRLK_GRANTED;
+
+    if (list_empty(&dom->entrylk_list)) /* checked before taking the mutex */
+        goto out;
+
+    pthread_mutex_lock(&pinode->mutex);
+    lock->pinode = pinode;
+    list_for_each_entry_safe(lk, tmp, &dom->entrylk_list, domain_list)
+    {
+        if (__stale_entrylk(this, lk, lock, &lk_age_sec) == _gf_true) {
+            revoke_lock = _gf_true;
+            reason_str = "age";
+            break;
+        }
+    }
+    max_blocked = priv->revocation_max_blocked; /* 0 disables this check */
+    if (max_blocked != 0 && revoke_lock == _gf_false) {
+        list_for_each_entry_safe(lk, tmp, &dom->blocked_entrylks, blocked_locks)
+        {
+            max_blocked--; /* one step per blocked lock found */
+            if (max_blocked == 0) {
+                revoke_lock = _gf_true;
+                reason_str = "max blocked";
+                break;
+            }
+        }
+    }
+    pthread_mutex_unlock(&pinode->mutex);
+
+out:
+    if (revoke_lock == _gf_true) {
+        clrlk_clear_entrylk(this, pinode, dom, &args, &bcount, &gcount,
+                            &op_errno);
+        gf_log(this->name, GF_LOG_WARNING,
+               "Lock revocation [reason: %s; gfid: %s; domain: %s; "
+               "age: %ld sec] - Entry lock revoked:  %d granted & %d "
+               "blocked locks cleared",
+               reason_str, uuid_utoa(pinode->gfid), dom->domain, lk_age_sec,
+               gcount, bcount); /* NOTE(review): "%ld" assumes time_t is long */
+    }
+
+    return revoke_lock; /* true when a revocation was triggered */
+}
+
+void
+entrylk_contention_notify_check(xlator_t *this, pl_entry_lock_t *lock,
+                                struct timespec *now, struct list_head *contend)
+{ /* Queue @lock on @contend for a contention upcall unless rate-limited. */
+    posix_locks_private_t *priv;
+    int64_t elapsed;
+
+    priv = this->private;
+
+    /* If this lock is in a list, it means that we are about to send a
+     * notification for it, so no need to do anything else.
+     * NOTE(review): relies on lock->contend being an initialised list head. */
+    if (!list_empty(&lock->contend)) {
+        return;
+    }
+
+    elapsed = now->tv_sec; /* whole seconds since lock->contention_time */
+    elapsed -= lock->contention_time.tv_sec;
+    if (now->tv_nsec < lock->contention_time.tv_nsec) {
+        elapsed--; /* the last second has not fully elapsed yet */
+    }
+    if (elapsed < priv->notify_contention_delay) {
+        return;
+    }
+
+    /* All contention notifications will be sent outside of the locked
+     * region. This means that currently granted locks might have already
+     * been unlocked by that time. To avoid the lock or the inode to be
+     * destroyed before we process them, we take an additional reference
+     * on both. */
+    inode_ref(lock->pinode->inode);
+    __pl_entrylk_ref(lock);
+
+    lock->contention_time = *now;
+
+    list_add_tail(&lock->contend, contend);
+}
+
+void
+entrylk_contention_notify(xlator_t *this, struct list_head *contend)
+{ /* Drain @contend: send one upcall per still-granted lock, then unref it. */
+    struct gf_upcall up;
+    struct gf_upcall_entrylk_contention lc;
+    pl_entry_lock_t *lock;
+    pl_inode_t *pl_inode;
+    client_t *client;
+    gf_boolean_t notify;
+
+    while (!list_empty(contend)) {
+        lock = list_first_entry(contend, pl_entry_lock_t, contend);
+
+        pl_inode = lock->pinode;
+
+        pthread_mutex_lock(&pl_inode->mutex);
+
+        /* If the lock has already been released, no notification is
+         * sent. We clear the notification time in this case. */
+        notify = !list_empty(&lock->domain_list);
+        if (!notify) {
+            lock->contention_time.tv_sec = 0;
+            lock->contention_time.tv_nsec = 0;
+        } else {
+            lc.type = lock->type;
+            lc.name = lock->basename;
+            lc.pid = lock->client_pid;
+            lc.domain = lock->volume;
+            lc.xdata = NULL;
+
+            gf_uuid_copy(up.gfid, lock->pinode->gfid);
+            client = (client_t *)lock->client;
+            if (client == NULL) {
+                /* A NULL client can be found if the entrylk
+                 * was issued by a server side xlator. */
+                up.client_uid = NULL;
+            } else {
+                up.client_uid = client->client_uid;
+            }
+        }
+
+        pthread_mutex_unlock(&pl_inode->mutex);
+
+        if (notify) { /* the upcall itself is sent without the mutex held */
+            up.event_type = GF_UPCALL_ENTRYLK_CONTENTION;
+            up.data = &lc;
+
+            if (this->notify(this, GF_EVENT_UPCALL, &up) < 0) {
+                gf_msg_debug(this->name, 0,
+                             "Entrylk contention notification "
+                             "failed");
+            } else {
+                gf_msg_debug(this->name, 0,
+                             "Entrylk contention notification "
+                             "sent");
+            }
+        }
+
+        pthread_mutex_lock(&pl_inode->mutex);
+
+        list_del_init(&lock->contend);
+        __pl_entrylk_unref(lock); /* ref from entrylk_contention_notify_check */
+
+        pthread_mutex_unlock(&pl_inode->mutex);
+
+        inode_unref(pl_inode->inode); /* inode ref taken in the same place */
+    }
+}
+
+/**
+ * __entrylk_grantable - first granted lock conflicting with @lock, or NULL
+ * @dom: domain whose granted list (entrylk_list) is scanned
+ * @contend: when non-NULL, each conflicting lock is handed (with @now) to
+ *           entrylk_contention_notify_check(); when NULL stop at first conflict
+ */
+static pl_entry_lock_t *
+__entrylk_grantable(xlator_t *this, pl_dom_list_t *dom, pl_entry_lock_t *lock,
+                    struct timespec *now, struct list_head *contend)
+{
+    pl_entry_lock_t *tmp = NULL;
+    pl_entry_lock_t *ret = NULL;
+
+    list_for_each_entry(tmp, &dom->entrylk_list, domain_list)
+    {
+        if (__conflicting_entrylks(tmp, lock)) {
+            if (ret == NULL) {
+                ret = tmp; /* remember only the first conflict */
+                if (contend == NULL) {
+                    break;
+                }
+            }
+            entrylk_contention_notify_check(this, tmp, now, contend);
+        }
+    }
+
+    return ret;
+}
+
+static pl_entry_lock_t *
+__blocked_entrylk_conflict(pl_dom_list_t *dom, pl_entry_lock_t *lock)
+{ /* First blocked lock in @dom whose name conflicts with @lock, or NULL. */
+    pl_entry_lock_t *tmp = NULL;
+
+    list_for_each_entry(tmp, &dom->blocked_entrylks, blocked_locks)
+    {
+        if (names_conflict(tmp->basename, lock->basename))
+            return tmp; /* the conflicting waiter, not the request itself */
+    }
+
+    return NULL;
+}
+
+static int
+__owner_has_lock(pl_dom_list_t *dom, pl_entry_lock_t *newlock)
+{ /* 1 if @newlock's owner already has a granted or blocked lock in @dom. */
+    pl_entry_lock_t *lock = NULL;
+
+    list_for_each_entry(lock, &dom->entrylk_list, domain_list)
+    {
+        if (__same_entrylk_owner(lock, newlock))
+            return 1;
+    }
+
+    list_for_each_entry(lock, &dom->blocked_entrylks, blocked_locks)
+    {
+        if (__same_entrylk_owner(lock, newlock))
+            return 1;
+    }
+
+    return 0;
+}
+
+static int
+names_equal(const char *n1, const char *n2)
+{ /* Non-zero when both are NULL or both are non-NULL and strcmp-equal. */
+    return (n1 == NULL && n2 == NULL) || (n1 && n2 && !strcmp(n1, n2));
+}
+
+void
+pl_print_entrylk(char *str, int size, entrylk_cmd cmd, entrylk_type type,
+                 const char *basename, const char *domain)
+{ /* Format a one-line description of an entrylk request into @str. */
+    char *cmd_str = NULL;
+    char *type_str = NULL;
+
+    switch (cmd) {
+        case ENTRYLK_LOCK:
+            cmd_str = "LOCK";
+            break;
+
+        case ENTRYLK_LOCK_NB:
+            cmd_str = "LOCK_NB";
+            break;
+
+        case ENTRYLK_UNLOCK:
+            cmd_str = "UNLOCK";
+            break;
+
+        default:
+            cmd_str = "UNKNOWN";
+            break;
+    }
+
+    switch (type) {
+        case ENTRYLK_RDLCK:
+            type_str = "READ";
+            break;
+        case ENTRYLK_WRLCK:
+            type_str = "WRITE";
+            break;
+        default:
+            type_str = "UNKNOWN";
+            break;
+    }
+    /* @basename and @domain may legitimately be NULL: never feed NULL to %s. */
+    snprintf(str, size,
+             "lock=ENTRYLK, cmd=%s, type=%s, basename=%s, domain: %s", cmd_str,
+             type_str, basename ? basename : "(null)", domain ? domain : "(null)");
+}
+
+void
+entrylk_trace_in(xlator_t *this, call_frame_t *frame, const char *domain,
+                 fd_t *fd, loc_t *loc, const char *basename, entrylk_cmd cmd,
+                 entrylk_type type)
+{ /* Log an incoming entrylk request at INFO; a no-op unless priv->trace. */
+    posix_locks_private_t *priv = NULL;
+    char pl_locker[256];
+    char pl_lockee[256];
+    char pl_entrylk[256];
+
+    priv = this->private;
+
+    if (!priv->trace)
+        return;
+
+    pl_print_locker(pl_locker, 256, this, frame);
+    pl_print_lockee(pl_lockee, 256, fd, loc);
+    pl_print_entrylk(pl_entrylk, 256, cmd, type, basename, domain);
+
+    gf_log(this->name, GF_LOG_INFO,
+           "[REQUEST] Locker = {%s} Lockee = {%s} Lock = {%s}", pl_locker,
+           pl_lockee, pl_entrylk);
+}
+
+void
+entrylk_trace_out(xlator_t *this, call_frame_t *frame, const char *domain,
+                  fd_t *fd, loc_t *loc, const char *basename, entrylk_cmd cmd,
+                  entrylk_type type, int op_ret, int op_errno)
+{ /* Log the verdict of an entrylk request at INFO; no-op unless priv->trace. */
+    posix_locks_private_t *priv = NULL;
+    char pl_locker[256];
+    char pl_lockee[256];
+    char pl_entrylk[256];
+    char verdict[32];
+
+    priv = this->private;
+
+    if (!priv->trace)
+        return;
+
+    pl_print_locker(pl_locker, 256, this, frame);
+    pl_print_lockee(pl_lockee, 256, fd, loc);
+    pl_print_entrylk(pl_entrylk, 256, cmd, type, basename, domain);
+    pl_print_verdict(verdict, 32, op_ret, op_errno);
+
+    gf_log(this->name, GF_LOG_INFO,
+           "[%s] Locker = {%s} Lockee = {%s} Lock = {%s}", verdict, pl_locker,
+           pl_lockee, pl_entrylk);
+}
+
+void
+entrylk_trace_block(xlator_t *this, call_frame_t *frame, const char *volume,
+                    fd_t *fd, loc_t *loc, const char *basename, entrylk_cmd cmd,
+                    entrylk_type type)
+
+{ /* Log that an entrylk request got blocked; a no-op unless priv->trace. */
+    posix_locks_private_t *priv = NULL;
+    char pl_locker[256];
+    char pl_lockee[256];
+    char pl_entrylk[256];
+
+    priv = this->private;
+
+    if (!priv->trace)
+        return;
+
+    pl_print_locker(pl_locker, 256, this, frame);
+    pl_print_lockee(pl_lockee, 256, fd, loc);
+    pl_print_entrylk(pl_entrylk, 256, cmd, type, basename, volume);
+
+    gf_log(this->name, GF_LOG_INFO,
+           "[BLOCKED] Locker = {%s} Lockee = {%s} Lock = {%s}", pl_locker,
+           pl_lockee, pl_entrylk);
+}
+
+/**
+ * __find_most_matching_lock - find the granted lock which most matches, in
+ * order of preference: lock on the exact basename || an all_names lock
+ *
+ * When several locks qualify, the last one in list order is returned.
+ * @dom: domain whose granted list (entrylk_list) is searched
+ * @basename: name to search for
+ */
+
+static pl_entry_lock_t *
+__find_most_matching_lock(pl_dom_list_t *dom, const char *basename)
+{
+    pl_entry_lock_t *lock;
+    pl_entry_lock_t *all = NULL;
+    pl_entry_lock_t *exact = NULL;
+
+    if (list_empty(&dom->entrylk_list))
+        return NULL;
+
+    list_for_each_entry(lock, &dom->entrylk_list, domain_list)
+    {
+        if (all_names(lock->basename))
+            all = lock;
+        else if (names_equal(lock->basename, basename))
+            exact = lock;
+    }
+
+    return (exact ? exact : all); /* NULL when neither kind was found */
+}
+
+static pl_entry_lock_t *
+__find_matching_lock(pl_dom_list_t *dom, pl_entry_lock_t *lock)
+{ /* First granted lock in @dom with @lock's name, owner and type, or NULL. */
+    pl_entry_lock_t *tmp = NULL;
+
+    list_for_each_entry(tmp, &dom->entrylk_list, domain_list)
+    {
+        if (names_equal(lock->basename, tmp->basename) &&
+            __same_entrylk_owner(lock, tmp) && (lock->type == tmp->type))
+            return tmp;
+    }
+    return NULL;
+}
+
+static int
+__lock_blocked_add(xlator_t *this, pl_inode_t *pinode, pl_dom_list_t *dom,
+                   pl_entry_lock_t *lock, int nonblock)
+{ /* Queue @lock as blocked unless @nonblock; always returns -EAGAIN. */
+    if (nonblock)
+        goto out;
+
+    lock->blkd_time = gf_time();
+    list_add_tail(&lock->blocked_locks, &dom->blocked_entrylks);
+
+    /* basename is NULL for a whole-directory lock: never feed NULL to %s. */
+    gf_msg_trace(this->name, 0, "Blocking lock: {pinode=%p, basename=%s}",
+                 pinode, lock->basename ? lock->basename : "(null)");
+    entrylk_trace_block(this, lock->frame, NULL, NULL, NULL, lock->basename,
+                        ENTRYLK_LOCK, lock->type);
+out:
+    return -EAGAIN;
+}
+
+/**
+ * __lock_entrylk - grant @lock in @dom, or queue it as blocked
+ * @lock: request; a NULL basename locks the entire directory
+ * @nonblock: when set, a lock that cannot be granted is not queued
+ *            returns 0 if granted, -EAGAIN otherwise
+ *
+ * the entire directory being locked is represented as: a single
+ * pl_entry_lock_t present in the granted list (entrylk_list) with its
+ * basename = NULL
+ */
+
+int
+__lock_entrylk(xlator_t *this, pl_inode_t *pinode, pl_entry_lock_t *lock,
+               int nonblock, pl_dom_list_t *dom, struct timespec *now,
+               struct list_head *contend)
+{
+    pl_entry_lock_t *conf = NULL;
+    int ret = -EAGAIN;
+
+    conf = __entrylk_grantable(this, dom, lock, now, contend);
+    if (conf) { /* a granted lock of another owner is in the way */
+        ret = __lock_blocked_add(this, pinode, dom, lock, nonblock);
+        goto out;
+    }
+
+    /* To prevent blocked locks starvation, check if there are any blocked
+     * locks that may conflict with this lock. If there is then don't grant
+     * the lock. BUT grant the lock if the owner already has lock to allow
+     * nested locks.
+     * Example: SHD from Machine1 takes (gfid, basename=257-length-name)
+     * and is granted.
+     * SHD from machine2 takes (gfid, basename=NULL) and is blocked.
+     * When SHD from Machine1 takes (gfid, basename=NULL) it needs to be
+     * granted, without which self-heal can't progress.
+     * TODO: Find why 'owner_has_lock' is checked even for blocked locks.
+     */
+    if (__blocked_entrylk_conflict(dom, lock) &&
+        !(__owner_has_lock(dom, lock))) {
+        if (nonblock == 0) {
+            gf_log(this->name, GF_LOG_DEBUG,
+                   "Lock is grantable, but blocking to prevent "
+                   "starvation");
+        }
+
+        ret = __lock_blocked_add(this, pinode, dom, lock, nonblock);
+        goto out;
+    }
+
+    __pl_entrylk_ref(lock); /* reference held on behalf of the granted list */
+    lock->granted_time = gf_time();
+    list_add(&lock->domain_list, &dom->entrylk_list);
+
+    ret = 0;
+out:
+    return ret;
+}
+
+/**
+ * __unlock_entrylk - remove the granted lock matching @lock from @dom
+ * @dom: domain whose granted list is searched
+ * @lock: request giving the basename (NULL = entire directory), owner and
+ *        type to match; returns the unlinked lock (ref not dropped) or NULL
+ */
+
+pl_entry_lock_t *
+__unlock_entrylk(pl_dom_list_t *dom, pl_entry_lock_t *lock)
+{
+    pl_entry_lock_t *ret_lock = NULL;
+
+    ret_lock = __find_matching_lock(dom, lock);
+
+    if (ret_lock) {
+        list_del_init(&ret_lock->domain_list);
+    } else {
+        gf_log("locks", GF_LOG_ERROR, /* report the real type; guard NULL name */
+               "unlock on %s (type=%s) attempted but no matching lock "
+               "found",
+               lock->basename ? lock->basename : "(null)",
+               (lock->type == ENTRYLK_RDLCK) ? "ENTRYLK_RDLCK" : "ENTRYLK_WRLCK");
+    }
+
+    return ret_lock;
+}
+
+int32_t
+check_entrylk_on_basename(xlator_t *this, inode_t *parent, char *basename)
+{ /* 1 if some domain of @parent has a best-match lock with a non-NULL name. */
+    int32_t entrylk = 0;
+    pl_dom_list_t *dom = NULL;
+    pl_entry_lock_t *conf = NULL;
+
+    pl_inode_t *pinode = pl_inode_get(this, parent, NULL);
+    if (!pinode)
+        goto out;
+    pthread_mutex_lock(&pinode->mutex);
+    {
+        list_for_each_entry(dom, &pinode->dom_list, inode_list)
+        {
+            conf = __find_most_matching_lock(dom, basename);
+            if (conf && conf->basename) { /* all-names matches do not count */
+                entrylk = 1;
+                break;
+            }
+        }
+    }
+    pthread_mutex_unlock(&pinode->mutex);
+
+out:
+    return entrylk;
+}
+
+void
+__grant_blocked_entry_locks(xlator_t *this, pl_inode_t *pl_inode,
+                            pl_dom_list_t *dom, struct list_head *granted,
+                            struct timespec *now, struct list_head *contend)
+{ /* Retry every blocked lock of @dom; newly granted ones go onto @granted. */
+    int bl_ret = 0;
+    pl_entry_lock_t *bl = NULL;
+    pl_entry_lock_t *tmp = NULL;
+
+    struct list_head blocked_list;
+
+    INIT_LIST_HEAD(&blocked_list);
+    list_splice_init(&dom->blocked_entrylks, &blocked_list); /* empties it */
+
+    list_for_each_entry_safe(bl, tmp, &blocked_list, blocked_locks)
+    {
+        list_del_init(&bl->blocked_locks);
+
+        /* Blocking mode: a lock that still conflicts is re-queued on @dom. */
+        bl_ret = __lock_entrylk(bl->this, pl_inode, bl, 0, dom, now, contend);
+
+        if (bl_ret == 0) {
+            list_add_tail(&bl->blocked_locks, granted);
+        }
+    }
+}
+
+/* Grants blocked locks that have become grantable and unwinds their frames */
+void
+grant_blocked_entry_locks(xlator_t *this, pl_inode_t *pl_inode,
+                          pl_dom_list_t *dom, struct timespec *now,
+                          struct list_head *contend)
+{
+    struct list_head granted_list;
+    pl_entry_lock_t *tmp = NULL;
+    pl_entry_lock_t *lock = NULL;
+
+    INIT_LIST_HEAD(&granted_list);
+
+    pthread_mutex_lock(&pl_inode->mutex);
+    {
+        __grant_blocked_entry_locks(this, pl_inode, dom, &granted_list, now,
+                                    contend);
+    }
+    pthread_mutex_unlock(&pl_inode->mutex);
+
+    list_for_each_entry_safe(lock, tmp, &granted_list, blocked_locks)
+    { /* frames are unwound with success (0, 0) outside the mutex */
+        entrylk_trace_out(this, lock->frame, NULL, NULL, NULL, lock->basename,
+                          ENTRYLK_LOCK, lock->type, 0, 0);
+
+        STACK_UNWIND_STRICT(entrylk, lock->frame, 0, 0, NULL);
+        lock->frame = NULL;
+    }
+
+    pthread_mutex_lock(&pl_inode->mutex);
+    {
+        list_for_each_entry_safe(lock, tmp, &granted_list, blocked_locks)
+        {
+            list_del_init(&lock->blocked_locks);
+            __pl_entrylk_unref(lock); /* one ref per lock dropped here */
+        }
+    }
+    pthread_mutex_unlock(&pl_inode->mutex);
+}
+
+/* Common entrylk code called by pl_entrylk and pl_fentrylk */
+int
+pl_common_entrylk(call_frame_t *frame, xlator_t *this, const char *volume,
+                  inode_t *inode, const char *basename, entrylk_cmd cmd,
+                  entrylk_type type, loc_t *loc, fd_t *fd, dict_t *xdata)
+
+{ /* Always returns 0; the outcome reaches the caller via STACK_UNWIND. */
+    int32_t op_ret = -1;
+    int32_t op_errno = 0;
+    int ret = -1;
+    char unwind = 1; /* cleared when the request is queued as blocked */
+    GF_UNUSED int dict_ret = -1;
+    pl_inode_t *pinode = NULL;
+    pl_entry_lock_t *reqlock = NULL;
+    pl_entry_lock_t *unlocked = NULL;
+    pl_dom_list_t *dom = NULL;
+    char *conn_id = NULL;
+    pl_ctx_t *ctx = NULL;
+    int nonblock = 0;
+    gf_boolean_t need_inode_unref = _gf_false;
+    posix_locks_private_t *priv = NULL;
+    struct list_head *pcontend = NULL; /* stays NULL when notifications are off */
+    struct list_head contend;
+    struct timespec now = {};
+
+    priv = this->private;
+
+    if (priv->notify_contention) {
+        pcontend = &contend;
+        INIT_LIST_HEAD(pcontend);
+        timespec_now(&now);
+    }
+
+    if (xdata)
+        dict_ret = dict_get_str(xdata, "connection-id", &conn_id);
+
+    pinode = pl_inode_get(this, inode, NULL);
+    if (!pinode) {
+        op_errno = ENOMEM;
+        goto out;
+    }
+
+    if (frame->root->client) {
+        ctx = pl_ctx_get(frame->root->client, this);
+        if (!ctx) {
+            op_errno = ENOMEM;
+            gf_log(this->name, GF_LOG_INFO, "pl_ctx_get() failed");
+            goto unwind;
+        }
+    }
+
+    dom = get_domain(pinode, volume);
+    if (!dom) {
+        op_errno = ENOMEM;
+        goto out;
+    }
+
+    entrylk_trace_in(this, frame, volume, fd, loc, basename, cmd, type);
+
+    reqlock = new_entrylk_lock(pinode, basename, type, dom->domain, frame,
+                               conn_id, &op_errno);
+    if (!reqlock) {
+        op_ret = -1;
+        goto unwind;
+    }
+
+    /* Ideally, AFTER a successful lock (both blocking and non-blocking) or
+     * an unsuccessful blocking lock operation, the inode needs to be ref'd.
+     *
+     * But doing so might give room to a race where the lock-requesting
+     * client could send a DISCONNECT just before this thread refs the inode
+     * after the locking is done, and the epoll thread could unref the inode
+     * in cleanup which means the inode's refcount would come down to 0, and
+     * the call to pl_forget() at this point destroys @pinode. Now when
+     * the io-thread executing this function tries to access pinode,
+     * it could crash on account of illegal memory access.
+     *
+     * To get around this problem, the inode is ref'd once even before
+     * adding the lock into client_list as a precautionary measure.
+     * This way even if there are DISCONNECTs, there will always be 1 extra
+     * ref on the inode, so @pinode is still alive until after the
+     * current stack unwinds.
+     */
+    pinode->inode = inode_ref(inode);
+    if (priv->revocation_secs != 0) {
+        if (cmd != ENTRYLK_UNLOCK) {
+            __entrylk_prune_stale(this, pinode, dom, reqlock);
+        } else if (priv->monkey_unlocking == _gf_true) {
+            if (pl_does_monkey_want_stuck_lock()) {
+                gf_log(this->name, GF_LOG_WARNING,
+                       "MONKEY LOCKING (forcing stuck lock)!");
+                op_ret = 0; /* report success although nothing is unlocked */
+                need_inode_unref = _gf_true;
+                pthread_mutex_lock(&pinode->mutex);
+                {
+                    __pl_entrylk_unref(reqlock);
+                }
+                pthread_mutex_unlock(&pinode->mutex);
+                goto out;
+            }
+        }
+    }
+
+    switch (cmd) {
+        case ENTRYLK_LOCK_NB:
+            nonblock = 1;
+            /* fall through */
+        case ENTRYLK_LOCK:
+            if (ctx)
+                pthread_mutex_lock(&ctx->lock); /* ctx->lock before pinode->mutex */
+            pthread_mutex_lock(&pinode->mutex);
+            {
+                reqlock->pinode = pinode;
+
+                ret = __lock_entrylk(this, pinode, reqlock, nonblock, dom, &now,
+                                     pcontend);
+                if (ret == 0) {
+                    reqlock->frame = NULL;
+                    op_ret = 0;
+                } else {
+                    op_errno = -ret;
+                }
+
+                /* Track the lock per client when it is granted or queued. */
+                if (ctx && (!ret || !nonblock))
+                    list_add(&reqlock->client_list, &ctx->entrylk_lockers);
+
+                if (ret == -EAGAIN && !nonblock) {
+                    /* blocked */
+                    unwind = 0;
+                } else {
+                    __pl_entrylk_unref(reqlock);
+                }
+
+                /* Only when a non-blocking lock attempt fails is the
+                 * extra ref taken before the switch block negated;
+                 * granted and blocked locks keep holding it.
+                 */
+                if ((ret == -EAGAIN) && (nonblock))
+                    need_inode_unref = _gf_true;
+            }
+            pthread_mutex_unlock(&pinode->mutex);
+            if (ctx)
+                pthread_mutex_unlock(&ctx->lock);
+            break;
+
+        case ENTRYLK_UNLOCK:
+            if (ctx)
+                pthread_mutex_lock(&ctx->lock);
+            pthread_mutex_lock(&pinode->mutex);
+            {
+                /* Irrespective of whether unlock succeeds or not,
+                 * the extra inode ref that was done before the switch
+                 * block must be negated. Towards this,
+                 * @need_inode_unref flag is set unconditionally here.
+                 */
+                need_inode_unref = _gf_true;
+                unlocked = __unlock_entrylk(dom, reqlock);
+                if (unlocked) {
+                    list_del_init(&unlocked->client_list);
+                    __pl_entrylk_unref(unlocked);
+                    op_ret = 0;
+                } else {
+                    op_errno = EINVAL;
+                }
+                __pl_entrylk_unref(reqlock); /* the request was only a search key */
+            }
+            pthread_mutex_unlock(&pinode->mutex);
+            if (ctx)
+                pthread_mutex_unlock(&ctx->lock);
+
+            grant_blocked_entry_locks(this, pinode, dom, &now, pcontend);
+
+            break;
+
+        default:
+            need_inode_unref = _gf_true;
+            gf_log(this->name, GF_LOG_ERROR,
+                   "Unexpected case in entrylk (cmd=%d). Please file "
+                   "a bug report at http://bugs.gluster.com",
+                   cmd);
+            goto out; /* NOTE(review): reqlock is not unref'd on this path */
+    }
+    /* The following (extra) unref corresponds to the ref that
+     * was done at the time the lock was granted.
+     */
+    if ((cmd == ENTRYLK_UNLOCK) && (op_ret == 0))
+        inode_unref(pinode->inode);
+
+out:
+
+    if (need_inode_unref)
+        inode_unref(pinode->inode);
+
+    if (unwind) {
+        entrylk_trace_out(this, frame, volume, fd, loc, basename, cmd, type,
+                          op_ret, op_errno);
+    unwind: /* early failures jump here and skip the trace line above */
+        STACK_UNWIND_STRICT(entrylk, frame, op_ret, op_errno, NULL);
+    }
+
+    if (pcontend != NULL) {
+        entrylk_contention_notify(this, pcontend);
+    }
+
+    return 0;
+}
+
+/**
+ * pl_entrylk:
+ *
+ * Locking on names (directory entries), addressed by @loc (loc->inode)
+ */
+
+int
+pl_entrylk(call_frame_t *frame, xlator_t *this, const char *volume, loc_t *loc,
+           const char *basename, entrylk_cmd cmd, entrylk_type type,
+           dict_t *xdata)
+{
+    pl_common_entrylk(frame, this, volume, loc->inode, basename, cmd, type, loc,
+                      NULL, xdata);
+
+    return 0; /* the common helper's return value is not propagated */
+}
+
+/**
+ * pl_fentrylk:
+ *
+ * Locking on names (directory entries), addressed by @fd (fd->inode)
+ */
+
+int
+pl_fentrylk(call_frame_t *frame, xlator_t *this, const char *volume, fd_t *fd,
+            const char *basename, entrylk_cmd cmd, entrylk_type type,
+            dict_t *xdata)
+{
+    pl_common_entrylk(frame, this, volume, fd->inode, basename, cmd, type, NULL,
+                      fd, xdata);
+
+    return 0; /* the common helper's return value is not propagated */
+}
+
+static void
+pl_entrylk_log_cleanup(pl_entry_lock_t *lock)
+{ /* Log that @lock is being released as part of a client cleanup. */
+    pl_inode_t *pinode = NULL;
+
+    pinode = lock->pinode;
+    /* The pid is cast to a signed type so that it matches PRId64 below. */
+    gf_log(THIS->name, GF_LOG_WARNING,
+           "releasing lock on %s held by "
+           "{client=%p, pid=%" PRId64 " lk-owner=%s}",
+           uuid_utoa(pinode->gfid), lock->client, (int64_t)lock->client_pid,
+           lkowner_utoa(&lock->owner));
+}
+
+/* Release all entrylks from this client; always returns 0 */
+int
+pl_entrylk_client_cleanup(xlator_t *this, pl_ctx_t *ctx)
+{
+    posix_locks_private_t *priv;
+    pl_entry_lock_t *tmp = NULL;
+    pl_entry_lock_t *l = NULL;
+    pl_dom_list_t *dom = NULL;
+    pl_inode_t *pinode = NULL;
+    struct list_head *pcontend = NULL;
+    struct list_head released; /* locks that only need regrant + unref */
+    struct list_head unwind;   /* blocked-only locks: frame unwound first */
+    struct list_head contend;
+    struct timespec now = {};
+
+    INIT_LIST_HEAD(&released);
+    INIT_LIST_HEAD(&unwind);
+
+    priv = this->private;
+    if (priv->notify_contention) {
+        pcontend = &contend;
+        INIT_LIST_HEAD(pcontend);
+        timespec_now(&now);
+    }
+
+    pthread_mutex_lock(&ctx->lock);
+    {
+        list_for_each_entry_safe(l, tmp, &ctx->entrylk_lockers, client_list)
+        {
+            pl_entrylk_log_cleanup(l);
+
+            pinode = l->pinode;
+
+            pthread_mutex_lock(&pinode->mutex);
+            {
+                /* If the entrylk object is part of granted list but not
+                 * blocked list, then perform the following actions:
+                 * i.   delete the object from granted list;
+                 * ii.  grant other locks (from other clients) that may
+                 *      have been blocked on this entrylk; and
+                 * iii. unref the object.
+                 *
+                 * If the entrylk object (L1) is part of both granted
+                 * and blocked lists, then this means that a parallel
+                 * unlock on another entrylk (L2 say) may have 'granted'
+                 * L1 and added it to 'granted' list in
+                 * __grant_blocked_entry_locks() (although using the
+                 * 'blocked_locks' member). In that case, the cleanup
+                 * codepath must try and grant other overlapping
+                 * blocked entrylks from other clients, now that L1 is
+                 * out of their way and then unref L1 in the end, and
+                 * leave it to the other thread (the one executing
+                 * unlock codepath) to unwind L1's frame, delete it from
+                 * blocked_locks list, and perform the last unref on L1.
+                 *
+                 * If the entrylk object (L1) is part of blocked list
+                 * only, the cleanup code path must:
+                 * i.   delete it from the blocked_locks list inside
+                 *      this critical section,
+                 * ii.  unwind its frame with EAGAIN,
+                 * iii. try and grant blocked entry locks from other
+                 *      clients that were otherwise grantable, but were
+                 *      blocked to avoid leaving L1 to starve forever.
+                 * iv.  unref the object.
+                 */
+                list_del_init(&l->client_list);
+
+                /* client_list is reused to chain @l on a local list. */
+                if (!list_empty(&l->domain_list)) {
+                    list_del_init(&l->domain_list);
+                    list_add_tail(&l->client_list, &released);
+                } else {
+                    list_del_init(&l->blocked_locks);
+                    list_add_tail(&l->client_list, &unwind);
+                }
+            }
+            pthread_mutex_unlock(&pinode->mutex);
+        }
+    }
+    pthread_mutex_unlock(&ctx->lock);
+
+    if (!list_empty(&unwind)) {
+        list_for_each_entry_safe(l, tmp, &unwind, client_list)
+        {
+            list_del_init(&l->client_list);
+
+            if (l->frame)
+                STACK_UNWIND_STRICT(entrylk, l->frame, -1, EAGAIN, NULL);
+            list_add_tail(&l->client_list, &released);
+        }
+    }
+
+    if (!list_empty(&released)) {
+        list_for_each_entry_safe(l, tmp, &released, client_list)
+        {
+            list_del_init(&l->client_list);
+
+            pinode = l->pinode;
+
+            dom = get_domain(pinode, l->volume); /* NOTE(review): not NULL-checked */
+
+            grant_blocked_entry_locks(this, pinode, dom, &now, pcontend);
+
+            pthread_mutex_lock(&pinode->mutex);
+            {
+                __pl_entrylk_unref(l);
+            }
+            pthread_mutex_unlock(&pinode->mutex);
+
+            inode_unref(pinode->inode);
+        }
+    }
+
+    if (pcontend != NULL) {
+        entrylk_contention_notify(this, pcontend);
+    }
+
+    return 0;
+}
+
+int32_t
+__get_entrylk_count(xlator_t *this, pl_inode_t *pl_inode)
+{ /* Count granted plus blocked entry locks over all domains of @pl_inode. */
+    int32_t count = 0;
+    pl_entry_lock_t *lock = NULL;
+    pl_dom_list_t *dom = NULL;
+
+    list_for_each_entry(dom, &pl_inode->dom_list, inode_list)
+    {
+        list_for_each_entry(lock, &dom->entrylk_list, domain_list) { count++; }
+
+        list_for_each_entry(lock, &dom->blocked_entrylks, blocked_locks)
+        {
+            count++;
+        }
+    }
+
+    return count;
+}
+
+int32_t
+get_entrylk_count(xlator_t *this, inode_t *inode)
+{ /* Locked wrapper of __get_entrylk_count(); 0 if @inode has no context. */
+    pl_inode_t *pl_inode = NULL;
+    uint64_t tmp_pl_inode = 0;
+    int ret = 0;
+    int32_t count = 0;
+
+    ret = inode_ctx_get(inode, this, &tmp_pl_inode);
+    if (ret != 0) {
+        goto out;
+    }
+
+    pl_inode = (pl_inode_t *)(long)tmp_pl_inode; /* ctx value holds the pointer */
+
+    pthread_mutex_lock(&pl_inode->mutex);
+    {
+        count = __get_entrylk_count(this, pl_inode);
+    }
+    pthread_mutex_unlock(&pl_inode->mutex);
+
+out:
+    return count;
+}
diff --git a/xlators/features/locks/src/inodelk.c b/xlators/features/locks/src/inodelk.c
new file mode 100644
index 00000000000..d4e51d6e0a1
--- /dev/null
+++ b/xlators/features/locks/src/inodelk.c
@@ -0,0 +1,1174 @@
+/*
+   Copyright (c) 2006-2012 Red Hat, Inc. <http://www.redhat.com>
+   This file is part of GlusterFS.
+
+   This file is licensed to you under your choice of the GNU Lesser
+   General Public License, version 3 or any later version (LGPLv3 or
+   later), or the GNU General Public License, version 2 (GPLv2), in all
+   cases as published by the Free Software Foundation.
+*/
+#include <glusterfs/glusterfs.h>
+#include <glusterfs/compat.h>
+#include <glusterfs/dict.h>
+#include <glusterfs/logging.h>
+#include <glusterfs/list.h>
+#include <glusterfs/upcall-utils.h>
+
+#include "locks.h"
+#include "clear.h"
+#include "common.h"
+
+void
+__delete_inode_lock(pl_inode_lock_t *lock)
+{
+    list_del_init(&lock->list); /* unlink @lock through its 'list' member */
+}
+
+static void
+__pl_inodelk_ref(pl_inode_lock_t *lock)
+{
+    lock->ref++; /* one more holder; paired with __pl_inodelk_unref() */
+}
+
+void
+__pl_inodelk_unref(pl_inode_lock_t *lock)
+{
+    /* Drop one reference; the lock is freed once the count reaches zero. */
+    if (--lock->ref)
+        return;
+    GF_FREE(lock->connection_id);
+    GF_FREE(lock);
+}
+
+/* Type check for two inodelks: a write lock on either side conflicts,
+ * any other combination does not. Returns 1 on conflict, 0 otherwise. */
+static int
+inodelk_type_conflict(pl_inode_lock_t *l1, pl_inode_lock_t *l2)
+{
+    int l1_writes = (l1->fl_type == F_WRLCK);
+    int l2_writes = (l2->fl_type == F_WRLCK);
+
+    return (l1_writes || l2_writes) ? 1 : 0;
+}
+
+void
+pl_print_inodelk(char *str, int size, int cmd, struct gf_flock *flock,
+                 const char *domain)
+{
+    /* Render a one-line description of an inodelk request into @str (at
+     * most @size bytes, via snprintf). Commands and lock types that are
+     * not recognised are shown as "UNKNOWN". */
+    const char *op_name = "UNKNOWN";
+    const char *kind_name = "UNKNOWN";
+
+    /* The *64 command variants get a case only where their value differs. */
+    switch (cmd) {
+#if F_GETLK != F_GETLK64
+        case F_GETLK64:
+#endif
+        case F_GETLK:
+            op_name = "GETLK";
+            break;
+
+#if F_SETLK != F_SETLK64
+        case F_SETLK64:
+#endif
+        case F_SETLK:
+            op_name = "SETLK";
+            break;
+
+#if F_SETLKW != F_SETLKW64
+        case F_SETLKW64:
+#endif
+        case F_SETLKW:
+            op_name = "SETLKW";
+            break;
+
+        default:
+            /* keep the "UNKNOWN" preset */
+            break;
+    }
+
+    /* Same idea for the lock type requested in @flock. */
+    if (flock->l_type == F_RDLCK) {
+        kind_name = "READ";
+    } else if (flock->l_type == F_WRLCK) {
+        kind_name = "WRITE";
+    } else if (flock->l_type == F_UNLCK) {
+        kind_name = "UNLOCK";
+    }
+
+    snprintf(str, size,
+             "lock=INODELK, cmd=%s, type=%s, "
+             "domain: %s, start=%llu, len=%llu, pid=%llu",
+             op_name, kind_name, domain, (unsigned long long)flock->l_start,
+             (unsigned long long)flock->l_len,
+             (unsigned long long)flock->l_pid);
+}
+
+/* Determine if the two inodelks overlap each other's lock regions */
+static int
+inodelk_overlap(pl_inode_lock_t *l1, pl_inode_lock_t *l2)
+{
+    return !((l1->fl_end < l2->fl_start) || (l2->fl_end < l1->fl_start));
+}
+
+/* True when both inodelks have the same lk-owner and the same client */
+static int
+same_inodelk_owner(pl_inode_lock_t *l1, pl_inode_lock_t *l2)
+{
+    if (!is_same_lkowner(&l1->owner, &l2->owner))
+        return 0;
+    return (l1->client == l2->client);
+}
+
+/* Two inodelks conflict when their ranges overlap and their types clash */
+static int
+inodelk_conflict(pl_inode_lock_t *l1, pl_inode_lock_t *l2)
+{
+    return inodelk_overlap(l1, l2) ? (inodelk_type_conflict(l1, l2) != 0) : 0;
+}
+
+/*
+ * Decide whether @candidate_lock is stale with respect to @requested_lock.
+ * Only a conflicting candidate is aged: its age in seconds is stored in
+ * @lock_age_sec, and true is returned when it exceeds revocation_secs.
+ */
+static inline gf_boolean_t
+__stale_inodelk(xlator_t *this, pl_inode_lock_t *candidate_lock,
+                pl_inode_lock_t *requested_lock, time_t *lock_age_sec)
+{
+    posix_locks_private_t *conf = this->private;
+
+    /* Question: Should we just prune them all given the
+     * chance?  Or just the locks we are attempting to acquire?
+     */
+    if (!inodelk_conflict(candidate_lock, requested_lock))
+        return _gf_false;
+
+    *lock_age_sec = gf_time() - candidate_lock->granted_time;
+    if (*lock_age_sec > conf->revocation_secs)
+        return _gf_true;
+    return _gf_false;
+}
+
+/* Revoke the inodelks of @dom when a granted lock that conflicts with @lock
+ * is older than revocation_secs or, failing that, when the number of blocked
+ * locks reaches revocation_max_blocked (0 disables this second check).
+ *
+ * The clear request is CLRLK_GRANTED, or CLRLK_ALL when revocation_clear_all
+ * is set. Returns true if a revocation was performed. The age logged for a
+ * "max blocked" revocation is that of the last conflicting lock seen, if any.
+ */
+static gf_boolean_t
+__inodelk_prune_stale(xlator_t *this, pl_inode_t *pinode, pl_dom_list_t *dom,
+                      pl_inode_lock_t *lock)
+{
+    posix_locks_private_t *priv = NULL;
+    pl_inode_lock_t *tmp = NULL;
+    pl_inode_lock_t *lk = NULL;
+    gf_boolean_t revoke_lock = _gf_false;
+    int bcount = 0;
+    int gcount = 0;
+    int op_errno = 0;
+    clrlk_args args;
+    time_t lk_age_sec = 0;
+    uint32_t max_blocked = 0;
+    char *reason_str = NULL;
+
+    priv = this->private;
+
+    args.opts = NULL;
+    args.type = CLRLK_INODE;
+    if (priv->revocation_clear_all == _gf_true)
+        args.kind = CLRLK_ALL;
+    else
+        args.kind = CLRLK_GRANTED;
+
+    if (list_empty(&dom->inodelk_list))
+        goto out;
+
+    pthread_mutex_lock(&pinode->mutex);
+    list_for_each_entry_safe(lk, tmp, &dom->inodelk_list, list)
+    {
+        if (__stale_inodelk(this, lk, lock, &lk_age_sec) == _gf_true) {
+            revoke_lock = _gf_true;
+            reason_str = "age";
+            break;
+        }
+    }
+
+    max_blocked = priv->revocation_max_blocked;
+    if (max_blocked != 0 && revoke_lock == _gf_false) {
+        list_for_each_entry_safe(lk, tmp, &dom->blocked_inodelks, blocked_locks)
+        {
+            max_blocked--;
+            if (max_blocked == 0) {
+                revoke_lock = _gf_true;
+                reason_str = "max blocked";
+                break;
+            }
+        }
+    }
+    pthread_mutex_unlock(&pinode->mutex);
+
+out:
+    if (revoke_lock == _gf_true) {
+        clrlk_clear_inodelk(this, pinode, dom, &args, &bcount, &gcount,
+                            &op_errno);
+        gf_log(this->name, GF_LOG_WARNING,
+               "Lock revocation [reason: %s; gfid: %s; domain: %s; "
+               "age: %ld sec] - Inode lock revoked:  %d granted & %d "
+               "blocked locks cleared",
+               reason_str, uuid_utoa(pinode->gfid), dom->domain,
+               (long)lk_age_sec, gcount, bcount); /* %ld takes a long */
+    }
+    return revoke_lock;
+}
+
+void
+inodelk_contention_notify_check(xlator_t *this, pl_inode_lock_t *lock,
+                                struct timespec *now, struct list_head *contend)
+{
+    posix_locks_private_t *conf = this->private;
+    int64_t secs;
+
+    /* A lock that is already queued on a contention list is about to be
+     * notified, so there is nothing more to do for it. */
+    if (!list_empty(&lock->contend)) {
+        return;
+    }
+
+    /* Whole seconds since the lock's contention_time; a partially
+     * elapsed second is not counted. */
+    secs = now->tv_sec;
+    secs -= lock->contention_time.tv_sec;
+    if (now->tv_nsec < lock->contention_time.tv_nsec)
+        secs--;
+
+    /* Too soon: the configured notification delay has not elapsed. */
+    if (secs < conf->notify_contention_delay) {
+        return;
+    }
+
+    /* The notifications are sent once the locked region has been left,
+     * by which time the granted lock may already have been released. An
+     * extra reference on both the lock and its inode keeps them alive
+     * until they have been processed. */
+    __pl_inodelk_ref(lock);
+    inode_ref(lock->pl_inode->inode);
+
+    lock->contention_time = *now;
+
+    list_add_tail(&lock->contend, contend);
+}
+
+void
+inodelk_contention_notify(xlator_t *this, struct list_head *contend)
+{
+    struct gf_upcall up;
+    struct gf_upcall_inodelk_contention lc;
+    pl_inode_lock_t *lock;
+    pl_inode_t *pl_inode;
+    client_t *client;
+    gf_boolean_t notify;
+    /* Send a contention upcall for each lock queued on @contend, emptying it. */
+    while (!list_empty(contend)) {
+        lock = list_first_entry(contend, pl_inode_lock_t, contend);
+
+        pl_inode = lock->pl_inode;
+
+        pthread_mutex_lock(&pl_inode->mutex);
+
+        /* If the lock has already been released, no notification is
+         * sent. We clear the notification time in this case. */
+        notify = !list_empty(&lock->list);
+        if (!notify) {
+            lock->contention_time.tv_sec = 0;
+            lock->contention_time.tv_nsec = 0;
+        } else {
+            memcpy(&lc.flock, &lock->user_flock, sizeof(lc.flock));
+            lc.pid = lock->client_pid;
+            lc.domain = lock->volume;
+            lc.xdata = NULL;
+
+            gf_uuid_copy(up.gfid, lock->pl_inode->gfid);
+            client = (client_t *)lock->client;
+            if (client == NULL) {
+                /* A NULL client can be found if the inodelk
+                 * was issued by a server side xlator. */
+                up.client_uid = NULL;
+            } else {
+                up.client_uid = client->client_uid;
+            }
+        }
+
+        pthread_mutex_unlock(&pl_inode->mutex);
+        /* The upcall is built under the mutex but sent after releasing it. */
+        if (notify) {
+            up.event_type = GF_UPCALL_INODELK_CONTENTION;
+            up.data = &lc;
+
+            if (this->notify(this, GF_EVENT_UPCALL, &up) < 0) {
+                gf_msg_debug(this->name, 0,
+                             "Inodelk contention notification "
+                             "failed");
+            } else {
+                gf_msg_debug(this->name, 0,
+                             "Inodelk contention notification "
+                             "sent");
+            }
+        }
+        /* Dequeue, then drop the lock and inode refs held while queued. */
+        pthread_mutex_lock(&pl_inode->mutex);
+
+        list_del_init(&lock->contend);
+        __pl_inodelk_unref(lock);
+
+        pthread_mutex_unlock(&pl_inode->mutex);
+
+        inode_unref(pl_inode->inode);
+    }
+}
+
+/* Return the first granted lock of @dom that blocks @lock (NULL: grantable).
+ * With @contend set, every blocker is also checked for notification. */
+static pl_inode_lock_t *
+__inodelk_grantable(xlator_t *this, pl_dom_list_t *dom, pl_inode_lock_t *lock,
+                    struct timespec *now, struct list_head *contend)
+{
+    pl_inode_lock_t *held = NULL;
+    pl_inode_lock_t *blocker = NULL;
+
+    list_for_each_entry(held, &dom->inodelk_list, list)
+    {
+        if (!inodelk_conflict(lock, held) || same_inodelk_owner(lock, held))
+            continue;
+
+        if (blocker == NULL)
+            blocker = held;
+        if (contend == NULL)
+            break;
+        inodelk_contention_notify_check(this, held, now, contend);
+    }
+
+    return blocker;
+}
+
+static pl_inode_lock_t *
+__blocked_lock_conflict(pl_dom_list_t *dom, pl_inode_lock_t *lock)
+{
+    /* First blocked inodelk of @dom that conflicts with @lock, else NULL. */
+    pl_inode_lock_t *waiter = NULL;
+
+    list_for_each_entry(waiter, &dom->blocked_inodelks, blocked_locks)
+    {
+        if (inodelk_conflict(lock, waiter))
+            return waiter;
+    }
+
+    return NULL;
+}
+
+static int
+__owner_has_lock(pl_dom_list_t *dom, pl_inode_lock_t *newlock)
+{
+    pl_inode_lock_t *cur = NULL;
+
+    /* 1 if @newlock's owner is on the granted or the blocked list of @dom. */
+    list_for_each_entry(cur, &dom->inodelk_list, list)
+    {
+        if (same_inodelk_owner(cur, newlock))
+            return 1;
+    }
+    list_for_each_entry(cur, &dom->blocked_inodelks, blocked_locks)
+    {
+        if (same_inodelk_owner(cur, newlock))
+            return 1;
+    }
+
+    return 0;
+}
+
+static int
+__lock_blocked_add(xlator_t *this, pl_dom_list_t *dom, pl_inode_lock_t *lock,
+                   int can_block)
+{
+    /* Queue @lock on the domain's blocked list when the caller may block.
+     * The result is -EAGAIN whether or not it was queued. */
+    if (can_block != 0) {
+        lock->blkd_time = gf_time();
+        list_add_tail(&lock->blocked_locks, &dom->blocked_inodelks);
+
+        gf_msg_trace(this->name, 0,
+                     "%s (pid=%d) (lk-owner=%s) %" PRId64
+                     " - "
+                     "%" PRId64 " => Blocked",
+                     lock->fl_type == F_UNLCK ? "Unlock" : "Lock",
+                     lock->client_pid, lkowner_utoa(&lock->owner),
+                     lock->user_flock.l_start, lock->user_flock.l_len);
+
+        pl_trace_block(this, lock->frame, NULL, NULL, F_SETLKW,
+                       &lock->user_flock, lock->volume);
+    }
+
+    return -EAGAIN;
+}
+
+/* Try to grant @lock in @dom. Returns 0 when it is granted, -EAGAIN when it
+ * is not (it is then queued on the blocked list if @can_block is set), or a
+ * negative value reported by pl_inode_remove_inodelk(). */
+static int
+__lock_inodelk(xlator_t *this, pl_inode_t *pl_inode, pl_inode_lock_t *lock,
+               int can_block, pl_dom_list_t *dom, struct timespec *now,
+               struct list_head *contend)
+{
+    pl_inode_lock_t *conf = NULL;
+    int ret;
+
+    ret = pl_inode_remove_inodelk(pl_inode, lock);
+    if (ret < 0) {
+        return ret;
+    }
+    if (ret == 0) {
+        conf = __inodelk_grantable(this, dom, lock, now, contend);
+    }
+    if ((ret > 0) || (conf != NULL)) {
+        return __lock_blocked_add(this, dom, lock, can_block);
+    }
+
+    /* To prevent blocked locks starvation, check if there are any blocked
+     * locks that may conflict with this lock. If there is then don't grant
+     * the lock. BUT grant the lock if the owner already has lock to allow
+     * nested locks.
+     * Example:
+     * SHD from Machine1 takes (gfid, 0-infinity) and is granted.
+     * SHD from machine2 takes (gfid, 0-infinity) and is blocked.
+     * When SHD from Machine1 takes (gfid, 0-128KB) it
+     * needs to be granted, without which the earlier lock on 0-infinity
+     * will not be unlocked by SHD from Machine1.
+     * TODO: Find why 'owner_has_lock' is checked even for blocked locks.
+     */
+    if (__blocked_lock_conflict(dom, lock) && !(__owner_has_lock(dom, lock))) {
+        if (can_block != 0) {
+            gf_log(this->name, GF_LOG_DEBUG,
+                   "Lock is grantable, but blocking to prevent "
+                   "starvation");
+        }
+
+        return __lock_blocked_add(this, dom, lock, can_block);
+    }
+    __pl_inodelk_ref(lock); /* reference kept while on the granted list */
+    lock->granted_time = gf_time();
+    list_add(&lock->list, &dom->inodelk_list);
+
+    return 0;
+}
+
+/* True only when both inodelks have the same start and the same end */
+static int
+inodelks_equal(pl_inode_lock_t *l1, pl_inode_lock_t *l2)
+{
+    if (l1->fl_start != l2->fl_start)
+        return 0;
+
+    return (l1->fl_end == l2->fl_end);
+}
+
+static pl_inode_lock_t *
+find_matching_inodelk(pl_inode_lock_t *lock, pl_dom_list_t *dom)
+{
+    /* Granted lock with the same owner and boundaries as @lock, else NULL */
+    pl_inode_lock_t *cur = NULL;
+
+    list_for_each_entry(cur, &dom->inodelk_list, list)
+        if (same_inodelk_owner(cur, lock) && inodelks_equal(cur, lock))
+            return cur;
+    return NULL;
+}
+
+/* An F_UNLCK request removes the granted lock that has exactly the same
+ * boundaries and owner as @lock. Returns that lock, or NULL if none matches.
+ */
+static pl_inode_lock_t *
+__inode_unlock_lock(xlator_t *this, pl_inode_lock_t *lock, pl_dom_list_t *dom)
+{
+    pl_inode_lock_t *match = NULL;
+    inode_t *inode = lock->pl_inode->inode;
+
+    match = find_matching_inodelk(lock, dom);
+    if (match == NULL) {
+        gf_log(this->name, GF_LOG_ERROR,
+               " Matching lock not found for unlock %llu-%llu, by %s "
+               "on %p for gfid:%s",
+               (unsigned long long)lock->fl_start,
+               (unsigned long long)lock->fl_end, lkowner_utoa(&lock->owner),
+               lock->client, inode ? uuid_utoa(inode->gfid) : "UNKNOWN");
+        return NULL;
+    }
+
+    /* The lock is only unlinked here; releasing its reference is left
+     * to the caller. */
+    __delete_inode_lock(match);
+    gf_log(this->name, GF_LOG_DEBUG,
+           " Matching lock found for unlock %llu-%llu, by %s on %p for gfid:%s",
+           (unsigned long long)lock->fl_start, (unsigned long long)lock->fl_end,
+           lkowner_utoa(&lock->owner), lock->client,
+           inode ? uuid_utoa(inode->gfid) : "UNKNOWN");
+
+    return match;
+}
+
+void
+__grant_blocked_inode_locks(xlator_t *this, pl_inode_t *pl_inode,
+                            struct list_head *granted, pl_dom_list_t *dom,
+                            struct timespec *now, struct list_head *contend)
+{
+    struct list_head pending;
+    pl_inode_lock_t *next = NULL;
+    pl_inode_lock_t *lk = NULL;
+
+    /* Detach the whole blocked queue and retry each lock in order; those
+     * still not grantable are queued again by __lock_inodelk(). */
+    INIT_LIST_HEAD(&pending);
+    list_splice_init(&dom->blocked_inodelks, &pending);
+
+    list_for_each_entry_safe(lk, next, &pending, blocked_locks)
+    {
+        list_del_init(&lk->blocked_locks);
+        lk->status = __lock_inodelk(this, pl_inode, lk, 1, dom, now, contend);
+        if (lk->status == -EAGAIN)
+            continue;
+        /* Granted or failed: hand it to the caller through @granted. */
+        list_add_tail(&lk->blocked_locks, granted);
+    }
+}
+
+void
+unwind_granted_inodes(xlator_t *this, pl_inode_t *pl_inode,
+                      struct list_head *granted)
+{
+    pl_inode_lock_t *cur;
+    pl_inode_lock_t *next;
+    int32_t op_errno;
+    int32_t op_ret;
+
+    /* Reply to every request before the inode mutex is taken: status 0 is
+     * a grant, anything else is unwound with the negated status as errno. */
+    list_for_each_entry_safe(cur, next, granted, blocked_locks)
+    {
+        op_ret = (cur->status == 0) ? 0 : -1;
+        op_errno = -cur->status;
+        if (op_ret == 0) {
+            gf_log(this->name, GF_LOG_TRACE,
+                   "%s (pid=%d) (lk-owner=%s) %" PRId64 " - %" PRId64
+                   " => Granted",
+                   cur->fl_type == F_UNLCK ? "Unlock" : "Lock",
+                   cur->client_pid, lkowner_utoa(&cur->owner),
+                   cur->user_flock.l_start, cur->user_flock.l_len);
+        }
+        pl_trace_out(this, cur->frame, NULL, NULL, F_SETLKW, &cur->user_flock,
+                     op_ret, op_errno, cur->volume);
+
+        STACK_UNWIND_STRICT(inodelk, cur->frame, op_ret, op_errno, NULL);
+        cur->frame = NULL;
+    }
+
+    /* Then, under the mutex, unlink each lock and drop one reference. */
+    pthread_mutex_lock(&pl_inode->mutex);
+    {
+        list_for_each_entry_safe(cur, next, granted, blocked_locks)
+        {
+            list_del_init(&cur->blocked_locks);
+            __pl_inodelk_unref(cur);
+        }
+    }
+    pthread_mutex_unlock(&pl_inode->mutex);
+}
+
+/* Retry the blocked inodelks of @dom and unwind those that got resolved */
+void
+grant_blocked_inode_locks(xlator_t *this, pl_inode_t *pl_inode,
+                          pl_dom_list_t *dom, struct timespec *now,
+                          struct list_head *contend)
+{
+    struct list_head resolved;
+
+    INIT_LIST_HEAD(&resolved);
+
+    /* The retry runs with the inode mutex held and collects the locks
+     * that no longer have to wait on @resolved. */
+    pthread_mutex_lock(&pl_inode->mutex);
+    __grant_blocked_inode_locks(this, pl_inode, &resolved, dom, now, contend);
+    pthread_mutex_unlock(&pl_inode->mutex);
+
+    /* Their frames are unwound once the mutex has been released. */
+    unwind_granted_inodes(this, pl_inode, &resolved);
+}
+
+static void
+pl_inodelk_log_cleanup(pl_inode_lock_t *lock)
+{
+    /* Warn that @lock is being released by the client cleanup path. */
+    pl_inode_t *pl_inode = lock->pl_inode;
+
+    /* PRId64 takes a signed 64-bit argument, hence int64_t, not uint64_t. */
+    gf_log(THIS->name, GF_LOG_WARNING,
+           "releasing lock on %s held by "
+           "{client=%p, pid=%" PRId64 " lk-owner=%s}",
+           uuid_utoa(pl_inode->gfid), lock->client, (int64_t)lock->client_pid,
+           lkowner_utoa(&lock->owner));
+}
+
+/* Release every inodelk, granted or blocked, that @ctx's client still has */
+int
+pl_inodelk_client_cleanup(xlator_t *this, pl_ctx_t *ctx)
+{
+    posix_locks_private_t *priv;
+    pl_inode_lock_t *tmp = NULL;
+    pl_inode_lock_t *l = NULL;
+    pl_dom_list_t *dom = NULL;
+    pl_inode_t *pl_inode = NULL;
+    struct list_head *pcontend = NULL;
+    struct list_head released;
+    struct list_head unwind;
+    struct list_head contend;
+    struct timespec now = {};
+
+    priv = this->private;
+
+    INIT_LIST_HEAD(&released);
+    INIT_LIST_HEAD(&unwind);
+
+    if (priv->notify_contention) {
+        pcontend = &contend;
+        INIT_LIST_HEAD(pcontend);
+        timespec_now(&now);
+    }
+
+    pthread_mutex_lock(&ctx->lock);
+    {
+        list_for_each_entry_safe(l, tmp, &ctx->inodelk_lockers, client_list)
+        {
+            pl_inodelk_log_cleanup(l);
+
+            pl_inode = l->pl_inode;
+
+            pthread_mutex_lock(&pl_inode->mutex);
+            {
+                /* If the inodelk object is part of granted list but not
+                 * blocked list, then perform the following actions:
+                 * i.   delete the object from granted list;
+                 * ii.  grant other locks (from other clients) that may
+                 *      have been blocked on this inodelk; and
+                 * iii. unref the object.
+                 *
+                 * If the inodelk object (L1) is part of both granted
+                 * and blocked lists, then this means that a parallel
+                 * unlock on another inodelk (L2 say) may have 'granted'
+                 * L1 and added it to 'granted' list in
+                 * __grant_blocked_inode_locks() (although using the
+                 * 'blocked_locks' member). In that case, the cleanup
+                 * codepath must try and grant other overlapping
+                 * blocked inodelks from other clients, now that L1 is
+                 * out of their way and then unref L1 in the end, and
+                 * leave it to the other thread (the one executing
+                 * unlock codepath) to unwind L1's frame, delete it from
+                 * blocked_locks list, and perform the last unref on L1.
+                 *
+                 * If the inodelk object (L1) is part of blocked list
+                 * only, the cleanup code path must:
+                 * i.   delete it from the blocked_locks list inside
+                 *      this critical section,
+                 * ii.  unwind its frame with EAGAIN,
+                 * iii. try and grant blocked inode locks from other
+                 *      clients that were otherwise grantable, but just
+                 *      got blocked to avoid leaving L1 to starve
+                 *      forever.
+                 * iv.  unref the object.
+                 */
+                list_del_init(&l->client_list);
+
+                if (!list_empty(&l->list)) {
+                    __delete_inode_lock(l);
+                    list_add_tail(&l->client_list, &released);
+                } else {
+                    list_del_init(&l->blocked_locks);
+                    list_add_tail(&l->client_list, &unwind);
+                }
+            }
+            pthread_mutex_unlock(&pl_inode->mutex);
+        }
+    }
+    pthread_mutex_unlock(&ctx->lock);
+    /* Blocked requests are answered with EAGAIN, then moved to @released. */
+    if (!list_empty(&unwind)) {
+        list_for_each_entry_safe(l, tmp, &unwind, client_list)
+        {
+            list_del_init(&l->client_list);
+
+            if (l->frame)
+                STACK_UNWIND_STRICT(inodelk, l->frame, -1, EAGAIN, NULL);
+            list_add_tail(&l->client_list, &released);
+        }
+    }
+    /* For each removed lock: retry waiters, then drop lock and inode refs. */
+    if (!list_empty(&released)) {
+        list_for_each_entry_safe(l, tmp, &released, client_list)
+        {
+            list_del_init(&l->client_list);
+
+            pl_inode = l->pl_inode;
+
+            dom = get_domain(pl_inode, l->volume);
+
+            grant_blocked_inode_locks(this, pl_inode, dom, &now, pcontend);
+
+            pthread_mutex_lock(&pl_inode->mutex);
+            {
+                __pl_inodelk_unref(l);
+            }
+            pthread_mutex_unlock(&pl_inode->mutex);
+            inode_unref(pl_inode->inode);
+        }
+    }
+
+    if (pcontend != NULL) {
+        inodelk_contention_notify(this, pcontend);
+    }
+
+    return 0;
+}
+
+static int
+pl_inode_setlk(xlator_t *this, pl_ctx_t *ctx, pl_inode_t *pl_inode,
+               pl_inode_lock_t *lock, int can_block, pl_dom_list_t *dom,
+               inode_t *inode)
+{
+    posix_locks_private_t *priv = NULL;
+    int ret = -EINVAL;
+    pl_inode_lock_t *retlock = NULL;
+    gf_boolean_t unref = _gf_true;
+    gf_boolean_t need_inode_unref = _gf_false;
+    struct list_head *pcontend = NULL;
+    struct list_head contend;
+    struct list_head wake;
+    struct timespec now = {};
+    short fl_type;
+    /* Apply the lock or unlock request @lock to @dom; 0 or a negative errno. */
+    lock->pl_inode = pl_inode;
+    fl_type = lock->fl_type;
+
+    priv = this->private;
+
+    /* Ideally, AFTER a successful lock (both blocking and non-blocking) or
+     * an unsuccessful blocking lock operation, the inode needs to be ref'd.
+     *
+     * But doing so might give room to a race where the lock-requesting
+     * client could send a DISCONNECT just before this thread refs the inode
+     * after the locking is done, and the epoll thread could unref the inode
+     * in cleanup which means the inode's refcount would come down to 0, and
+     * the call to pl_forget() at this point destroys @pl_inode. Now when
+     * the io-thread executing this function tries to access pl_inode,
+     * it could crash on account of illegal memory access.
+     *
+     * To get around this problem, the inode is ref'd once even before
+     * adding the lock into client_list as a precautionary measure.
+     * This way even if there are DISCONNECTs, there will always be 1 extra
+     * ref on the inode, so @pl_inode is still alive until after the
+     * current stack unwinds.
+     */
+    pl_inode->inode = inode_ref(inode);
+
+    if (priv->revocation_secs != 0) {
+        if (lock->fl_type != F_UNLCK) {
+            __inodelk_prune_stale(this, pl_inode, dom, lock);
+        } else if (priv->monkey_unlocking == _gf_true) {
+            if (pl_does_monkey_want_stuck_lock()) {
+                pthread_mutex_lock(&pl_inode->mutex);
+                {
+                    __pl_inodelk_unref(lock);
+                }
+                pthread_mutex_unlock(&pl_inode->mutex);
+                inode_unref(pl_inode->inode);
+                gf_log(this->name, GF_LOG_WARNING,
+                       "MONKEY LOCKING (forcing stuck lock)!");
+                return 0;
+            }
+        }
+    }
+
+    if (priv->notify_contention) {
+        pcontend = &contend;
+        INIT_LIST_HEAD(pcontend);
+        timespec_now(&now);
+    }
+
+    INIT_LIST_HEAD(&wake);
+
+    if (ctx)
+        pthread_mutex_lock(&ctx->lock);
+    pthread_mutex_lock(&pl_inode->mutex);
+    {
+        if (lock->fl_type != F_UNLCK) {
+            ret = __lock_inodelk(this, pl_inode, lock, can_block, dom, &now,
+                                 pcontend);
+            if (ret == 0) {
+                lock->frame = NULL;
+                gf_log(this->name, GF_LOG_TRACE,
+                       "%s (pid=%d) (lk-owner=%s) %" PRId64 " - %" PRId64
+                       " => OK",
+                       lock->fl_type == F_UNLCK ? "Unlock" : "Lock",
+                       lock->client_pid, lkowner_utoa(&lock->owner),
+                       lock->fl_start, lock->fl_end);
+            } else if (ret == -EAGAIN) {
+                gf_log(this->name, GF_LOG_TRACE,
+                       "%s (pid=%d) (lk-owner=%s) %" PRId64 " - %" PRId64
+                       " => NOK",
+                       lock->fl_type == F_UNLCK ? "Unlock" : "Lock",
+                       lock->client_pid, lkowner_utoa(&lock->owner),
+                       lock->user_flock.l_start, lock->user_flock.l_len);
+                if (can_block) {
+                    unref = _gf_false;
+                }
+            }
+            /* The extra inode ref taken at the start of this function is
+             * kept only when the lock was granted (ret == 0) or queued as a
+             * blocking request; every other outcome must negate it. */
+            need_inode_unref = (ret != 0) && ((ret != -EAGAIN) || !can_block);
+            if (ctx && !need_inode_unref) {
+                list_add_tail(&lock->client_list, &ctx->inodelk_lockers);
+            }
+        } else {
+            /* Irrespective of whether unlock succeeds or not,
+             * the extra inode ref that was done at the start of
+             * this function must be negated. Towards this,
+             * @need_inode_unref flag is set unconditionally here.
+             */
+            need_inode_unref = _gf_true;
+            retlock = __inode_unlock_lock(this, lock, dom);
+            if (!retlock) {
+                gf_log(this->name, GF_LOG_DEBUG,
+                       "Bad Unlock issued on Inode lock");
+                ret = -EINVAL;
+                goto out;
+            }
+            list_del_init(&retlock->client_list);
+            __pl_inodelk_unref(retlock);
+
+            pl_inode_remove_unlocked(this, pl_inode, &wake);
+
+            ret = 0;
+        }
+    out:
+        if (unref)
+            __pl_inodelk_unref(lock);
+    }
+    pthread_mutex_unlock(&pl_inode->mutex);
+    if (ctx)
+        pthread_mutex_unlock(&ctx->lock);
+
+    pl_inode_remove_wake(&wake);
+
+    /* The following (extra) unref corresponds to the ref that
+     * was done at the time the lock was granted.
+     */
+    if ((fl_type == F_UNLCK) && (ret == 0)) {
+        inode_unref(pl_inode->inode);
+        grant_blocked_inode_locks(this, pl_inode, dom, &now, pcontend);
+    }
+
+    if (need_inode_unref) {
+        inode_unref(pl_inode->inode);
+    }
+
+    if (pcontend != NULL) {
+        inodelk_contention_notify(this, pcontend);
+    }
+
+    return ret;
+}
+
+/* New inodelk holding one ref; on failure returns NULL and sets *op_errno */
+static pl_inode_lock_t *
+new_inode_lock(struct gf_flock *flock, client_t *client, pid_t client_pid,
+               call_frame_t *frame, xlator_t *this, const char *volume,
+               char *conn_id, int32_t *op_errno)
+{
+    pl_inode_lock_t *lock = NULL;
+
+    if (!pl_is_lk_owner_valid(&frame->root->lk_owner, frame->root->client)) {
+        *op_errno = EINVAL;
+        goto out;
+    }
+
+    lock = GF_CALLOC(1, sizeof(*lock), gf_locks_mt_pl_inode_lock_t);
+    if (!lock) {
+        *op_errno = ENOMEM;
+        goto out;
+    }
+
+    lock->fl_start = flock->l_start;
+    lock->fl_type = flock->l_type;
+
+    /* l_len == 0 extends the range to LLONG_MAX. A range whose end would
+     * overflow is clamped there too (l_start/l_len assumed non-negative). */
+    if ((flock->l_len == 0) || (flock->l_len - 1 > LLONG_MAX - flock->l_start))
+        lock->fl_end = LLONG_MAX;
+    else
+        lock->fl_end = flock->l_start + flock->l_len - 1;
+
+    lock->client = client;
+    lock->client_pid = client_pid;
+    lock->volume = volume;
+    lock->owner = frame->root->lk_owner;
+    lock->frame = frame;
+    lock->this = this;
+
+    if (conn_id)
+        lock->connection_id = gf_strdup(conn_id);
+
+    INIT_LIST_HEAD(&lock->list);
+    INIT_LIST_HEAD(&lock->blocked_locks);
+    INIT_LIST_HEAD(&lock->client_list);
+    INIT_LIST_HEAD(&lock->contend);
+    __pl_inodelk_ref(lock);
+
+out:
+    return lock;
+}
+
+int32_t
+_pl_convert_volume(const char *volume, char **res)
+{
+    /* Store "<volume>:metadata" in @res, unless @volume already ends in
+     * ":metadata", in which case @res is left untouched. Returns 0 on
+     * success or ENOMEM if the string could not be built. */
+    char *suffix = strrchr(volume, ':');
+
+    if ((suffix != NULL) && (strcmp(suffix, ":metadata") == 0))
+        return 0;
+
+    if (gf_asprintf(res, "%s:metadata", volume) <= 0)
+        return ENOMEM;
+
+    return 0;
+}
+
+int32_t
+_pl_convert_volume_for_special_range(struct gf_flock *flock, const char *volume,
+                                     char **res)
+{
+    /* Only the range that starts at LLONG_MAX - 1 with a zero length is
+     * handed to _pl_convert_volume(); for any other range 0 is returned
+     * and @res is not touched. */
+    if ((flock->l_len != 0) || (flock->l_start != LLONG_MAX - 1))
+        return 0;
+
+    return _pl_convert_volume(volume, res);
+}
+
+/* Common inodelk code called from pl_inodelk and pl_finodelk; always returns 0 */
+int
+pl_common_inodelk(call_frame_t *frame, xlator_t *this, const char *volume,
+                  inode_t *inode, int32_t cmd, struct gf_flock *flock,
+                  loc_t *loc, fd_t *fd, dict_t *xdata)
+{
+    int32_t op_ret = -1;
+    int32_t op_errno = 0;
+    int ret = -1;
+    GF_UNUSED int dict_ret = -1; /* assigned below but never read */
+    int can_block = 0; /* set to 1 only for F_SETLKW */
+    short lock_type = 0;
+    pl_inode_t *pinode = NULL;
+    pl_inode_lock_t *reqlock = NULL;
+    pl_dom_list_t *dom = NULL;
+    char *res = NULL; /* converted volume name, if any; freed at "out" */
+    char *res1 = NULL; /* never assigned in this function; only GF_FREE'd */
+    char *conn_id = NULL;
+    pl_ctx_t *ctx = NULL;
+
+    if (xdata)
+        dict_ret = dict_get_str(xdata, "connection-id", &conn_id);
+    /* A NULL frame goes to "out" (no unwind); the other checks go to "unwind". */
+    VALIDATE_OR_GOTO(frame, out);
+    VALIDATE_OR_GOTO(inode, unwind);
+    VALIDATE_OR_GOTO(flock, unwind);
+    /* NOTE(review): op_errno looks still 0 when inode/flock is NULL - confirm. */
+    if ((flock->l_start < 0) || (flock->l_len < 0)) {
+        op_errno = EINVAL;
+        goto unwind;
+    }
+    /* res is set only when volume had to become "<volume>:metadata". */
+    op_errno = _pl_convert_volume_for_special_range(flock, volume, &res);
+    if (op_errno)
+        goto unwind;
+    if (res)
+        volume = res;
+
+    pl_trace_in(this, frame, fd, loc, cmd, flock, volume);
+
+    if (frame->root->client) { /* ctx stays NULL when there is no client */
+        ctx = pl_ctx_get(frame->root->client, this);
+        if (!ctx) {
+            op_errno = ENOMEM;
+            gf_log(this->name, GF_LOG_INFO, "pl_ctx_get() failed");
+            goto unwind;
+        }
+    }
+
+    pinode = pl_inode_get(this, inode, NULL);
+    if (!pinode) {
+        op_errno = ENOMEM;
+        goto unwind;
+    }
+
+    dom = get_domain(pinode, volume);
+    if (!dom) {
+        op_errno = ENOMEM;
+        goto unwind;
+    }
+    /* new_inode_lock() is handed &op_errno, so no errno is assigned here. */
+    reqlock = new_inode_lock(flock, frame->root->client, frame->root->pid,
+                             frame, this, dom->domain, conn_id, &op_errno);
+
+    if (!reqlock) {
+        op_ret = -1;
+        goto unwind;
+    }
+
+    switch (cmd) {
+        case F_SETLKW:
+            can_block = 1;
+
+            /* fall through */
+
+        case F_SETLK:
+            lock_type = flock->l_type;
+            memcpy(&reqlock->user_flock, flock, sizeof(struct gf_flock));
+            ret = pl_inode_setlk(this, ctx, pinode, reqlock, can_block, dom,
+                                 inode);
+            /* A negative ret is a negated errno; it becomes op_errno below. */
+            if (ret < 0) {
+                if (ret == -EAGAIN) {
+                    if (can_block && (F_UNLCK != lock_type)) {
+                        goto out; /* blocking lock + EAGAIN: no unwind here */
+                    }
+                    gf_log(this->name, GF_LOG_TRACE, "returning EAGAIN");
+                } else {
+                    gf_log(this->name, GF_LOG_TRACE, "returning %d", ret);
+                }
+                op_errno = -ret;
+                goto unwind;
+            }
+            break;
+
+        default: /* any cmd other than F_SETLK/F_SETLKW, not only F_GETLK */
+            op_errno = ENOTSUP;
+            gf_log(this->name, GF_LOG_DEBUG,
+                   "Lock command F_GETLK not supported for [f]inodelk "
+                   "(cmd=%d)",
+                   cmd);
+            goto unwind;
+    }
+
+    op_ret = 0;
+
+unwind:
+    if (flock != NULL) /* flock can be NULL when its validation failed */
+        pl_trace_out(this, frame, fd, loc, cmd, flock, op_ret, op_errno,
+                     volume);
+
+    STACK_UNWIND_STRICT(inodelk, frame, op_ret, op_errno, NULL);
+out:
+    GF_FREE(res);
+    GF_FREE(res1);
+    return 0;
+}
+
+int /* path-based inodelk: uses loc->inode, passes fd = NULL; always returns 0 */
+pl_inodelk(call_frame_t *frame, xlator_t *this, const char *volume, loc_t *loc,
+           int32_t cmd, struct gf_flock *flock, dict_t *xdata)
+{
+    pl_common_inodelk(frame, this, volume, loc->inode, cmd, flock, loc, NULL,
+                      xdata);
+
+    return 0;
+}
+
+int /* fd-based inodelk: uses fd->inode, passes loc = NULL; always returns 0 */
+pl_finodelk(call_frame_t *frame, xlator_t *this, const char *volume, fd_t *fd,
+            int32_t cmd, struct gf_flock *flock, dict_t *xdata)
+{
+    pl_common_inodelk(frame, this, volume, fd->inode, cmd, flock, NULL, fd,
+                      xdata);
+
+    return 0;
+}
+
+static int32_t /* entries on inodelk_list plus entries on blocked_inodelks */
+__get_inodelk_dom_count(pl_dom_list_t *dom)
+{
+    pl_inode_lock_t *lock = NULL;
+    int32_t count = 0;
+    /* Both lists are only walked to count their entries. */
+    list_for_each_entry(lock, &dom->inodelk_list, list) { count++; }
+    list_for_each_entry(lock, &dom->blocked_inodelks, blocked_locks)
+    {
+        count++;
+    }
+    return count;
+}
+
+/* Returns the no. of inodelks (blocked + granted) held in domain @domname.
+ * If @domname is NULL, returns the no. of locks in all the domains present.
+ * If @domname matches no domain, returns 0. @this is not used here. */
+int32_t
+__get_inodelk_count(xlator_t *this, pl_inode_t *pl_inode, char *domname)
+{
+    int32_t count = 0;
+    pl_dom_list_t *dom = NULL;
+    /* get_inodelk_count() calls this with pl_inode->mutex held. */
+    list_for_each_entry(dom, &pl_inode->dom_list, inode_list)
+    {
+        if (domname) {
+            if (strcmp(domname, dom->domain) == 0) {
+                count = __get_inodelk_dom_count(dom);
+                goto out; /* stop at the first domain with a matching name */
+            }
+
+        } else {
+            /* Counting locks from all domains */
+            count += __get_inodelk_dom_count(dom);
+        }
+    }
+
+out:
+    return count;
+}
+
+int32_t /* locked wrapper around __get_inodelk_count(); 0 if the ctx lookup fails */
+get_inodelk_count(xlator_t *this, inode_t *inode, char *domname)
+{
+    pl_inode_t *pl_inode = NULL;
+    uint64_t tmp_pl_inode = 0;
+    int ret = 0;
+    int32_t count = 0;
+
+    ret = inode_ctx_get(inode, this, &tmp_pl_inode);
+    if (ret != 0) { /* lookup failed: count is still 0 */
+        goto out;
+    }
+    /* The value read from the inode ctx is the pl_inode_t pointer. */
+    pl_inode = (pl_inode_t *)(long)tmp_pl_inode;
+    /* NOTE(review): pl_inode is used without a NULL check - confirm ctx != 0. */
+    pthread_mutex_lock(&pl_inode->mutex);
+    {
+        count = __get_inodelk_count(this, pl_inode, domname);
+    }
+    pthread_mutex_unlock(&pl_inode->mutex);
+
+out:
+    return count;
+}
diff --git a/xlators/features/locks/src/internal.c b/xlators/features/locks/src/internal.c
deleted file mode 100644
index 985762fb9eb..00000000000
--- a/xlators/features/locks/src/internal.c
+++ /dev/null
@@ -1,855 +0,0 @@
-/*
-  Copyright (c) 2006, 2007, 2008 Z RESEARCH, Inc. <http://www.zresearch.com>
-  This file is part of GlusterFS.
-
-  GlusterFS is free software; you can redistribute it and/or modify
-  it under the terms of the GNU General Public License as published
-  by the Free Software Foundation; either version 3 of the License,
-  or (at your option) any later version.
-
-  GlusterFS is distributed in the hope that it will be useful, but
-  WITHOUT ANY WARRANTY; without even the implied warranty of
-  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
-  General Public License for more details.
-
-  You should have received a copy of the GNU General Public License
-  along with this program.  If not, see
-  <http://www.gnu.org/licenses/>.
-*/
-
-#ifndef _CONFIG_H
-#define _CONFIG_H
-#include "config.h"
-#endif
-
-#include "glusterfs.h"
-#include "compat.h"
-#include "xlator.h"
-#include "inode.h"
-#include "logging.h"
-#include "common-utils.h"
-#include "list.h"
-
-#include "locks.h"
-#include "common.h"
-
-
-
-static int
-delete_locks_of_transport (pl_inode_t *pinode, transport_t *trans)
-{
-	posix_lock_t *tmp = NULL;
-	posix_lock_t *l = NULL;
-
-	list_for_each_entry_safe (l, tmp, &pinode->dir_list, list) {
-		if (l->transport == trans) {
-			__delete_lock (pinode, tmp);
-			__destroy_lock (tmp);
-		}
-	}
-
-	return 0;
-}
-
-
-/**
- * pl_inodelk: 
- *
- * This fop provides fcntl-style locking on files for internal
- * purposes. Locks held through this fop reside in a domain different
- * from those held by applications. This fop is for the use of AFR.
- */
-
-int
-pl_inodelk (call_frame_t *frame, xlator_t *this,
-	    loc_t *loc, int32_t cmd, struct flock *flock)
-{
-	int32_t op_ret   = -1;
-	int32_t op_errno = 0;
-	int     ret      = -1;
-	int     can_block = 0;
-	posix_locks_private_t * priv       = NULL;
-	transport_t *           transport  = NULL;
-	pid_t                   client_pid = -1;
-	pl_inode_t *            pinode     = NULL;
-	posix_lock_t *          reqlock    = NULL;
-
-	VALIDATE_OR_GOTO (frame, out);
-	VALIDATE_OR_GOTO (loc, out);
-	VALIDATE_OR_GOTO (flock, out);
-	
-	if ((flock->l_start < 0) || (flock->l_len < 0)) {
-		op_errno = EINVAL;
-		goto unwind;
-	}
-
-	transport  = frame->root->trans;
-	client_pid = frame->root->pid;
-
-	priv = (posix_locks_private_t *) this->private;
-
-	VALIDATE_OR_GOTO (priv, out);
-
-	pinode = pl_inode_get (this, loc->inode);
-	if (!pinode) {
-		gf_log (this->name, GF_LOG_ERROR,
-			"out of memory :(");
-		op_errno = ENOMEM;
-		goto unwind;
-	}
-
-	if (client_pid == 0) {
-		/* 
-		   special case: this means release all locks 
-		   from this transport
-		*/
-		gf_log (this->name, GF_LOG_DEBUG,
-			"releasing all locks from transport %p", transport);
-
-		delete_locks_of_transport (pinode, transport);
-		goto unwind;
-	}
-
-	reqlock = new_posix_lock (flock, transport, client_pid);
-	if (!reqlock) {
-		gf_log (this->name, GF_LOG_ERROR,
-			"out of memory :(");
-		op_ret = -1;
-		op_errno = ENOMEM;
-		goto unwind;
-	}
-
-	switch (cmd) {
-	case F_SETLKW:
-		can_block = 1;
-		reqlock->frame = frame;
-		reqlock->this  = this;
-
-		/* fall through */
-
-	case F_SETLK:
-		memcpy (&reqlock->user_flock, flock, sizeof (struct flock));
-		ret = pl_setlk (this, pinode, reqlock,
-				can_block, GF_LOCK_INTERNAL);
-
-		if (ret == -1) {
-			if (can_block)
-				goto out;
-
-			gf_log (this->name, GF_LOG_DEBUG, "returning EAGAIN");
-			op_errno = EAGAIN;
-			__destroy_lock (reqlock);
-			goto unwind;
-		}
-		break;
-
-	default:
-		op_errno = ENOTSUP;
-		gf_log (this->name, GF_LOG_ERROR,
-			"lock command F_GETLK not supported for GF_FILE_LK (cmd=%d)", 
-			cmd);
-			goto unwind;
-	}
-
-	op_ret = 0;
-
-unwind:	
-	STACK_UNWIND (frame, op_ret, op_errno);
-out:
-	return 0;
-}
-
-
-int
-pl_finodelk (call_frame_t *frame, xlator_t *this,
-	     fd_t *fd, int32_t cmd, struct flock *flock)
-{
-	int32_t op_ret   = -1;
-	int32_t op_errno = 0;
-	int     ret      = -1;
-	int     can_block = 0;
-	posix_locks_private_t * priv       = NULL;
-	transport_t *           transport  = NULL;
-	pid_t                   client_pid = -1;
-	pl_inode_t *            pinode     = NULL;
-	posix_lock_t *          reqlock    = NULL;
-
-	VALIDATE_OR_GOTO (frame, out);
-	VALIDATE_OR_GOTO (fd, out);
-	VALIDATE_OR_GOTO (flock, out);
-	
-	if ((flock->l_start < 0) || (flock->l_len < 0)) {
-		op_errno = EINVAL;
-		goto unwind;
-	}
-
-	transport  = frame->root->trans;
-	client_pid = frame->root->pid;
-
-	priv = (posix_locks_private_t *) this->private;
-
-	VALIDATE_OR_GOTO (priv, out);
-
-	pinode = pl_inode_get (this, fd->inode);
-	if (!pinode) {
-		gf_log (this->name, GF_LOG_ERROR,
-			"out of memory :(");
-		op_errno = ENOMEM;
-		goto unwind;
-	}
-
-	if (client_pid == 0) {
-		/* 
-		   special case: this means release all locks 
-		   from this transport
-		*/
-		gf_log (this->name, GF_LOG_DEBUG,
-			"releasing all locks from transport %p", transport);
-
-		delete_locks_of_transport (pinode, transport);
-		goto unwind;
-	}
-
-	reqlock = new_posix_lock (flock, transport, client_pid);
-	if (!reqlock) {
-		gf_log (this->name, GF_LOG_ERROR,
-			"out of memory :(");
-		op_ret = -1;
-		op_errno = ENOMEM;
-		goto unwind;
-	}
-
-	switch (cmd) {
-	case F_SETLKW:
-		can_block = 1;
-		reqlock->frame = frame;
-		reqlock->this  = this;
-		reqlock->fd    = fd;
-
-		/* fall through */
-
-	case F_SETLK:
-		memcpy (&reqlock->user_flock, flock, sizeof (struct flock));
-		ret = pl_setlk (this, pinode, reqlock,
-				can_block, GF_LOCK_INTERNAL);
-
-		if (ret == -1) {
-			if (can_block)
-				goto out;
-
-			gf_log (this->name, GF_LOG_DEBUG, "returning EAGAIN");
-			op_errno = EAGAIN;
-			__destroy_lock (reqlock);
-			goto unwind;
-		}
-		break;
-
-	default:
-		op_errno = ENOTSUP;
-		gf_log (this->name, GF_LOG_ERROR,
-			"lock command F_GETLK not supported for GF_FILE_LK (cmd=%d)", 
-			cmd);
-			goto unwind;
-	}
-
-	op_ret = 0;
-
-unwind:	
-	STACK_UNWIND (frame, op_ret, op_errno);
-out:
-	return 0;
-}
-
-
-/**
- * types_conflict - do two types of lock conflict?
- * @t1: type
- * @t2: type
- *
- * two read locks do not conflict
- * any other case conflicts
- */
-
-static int
-types_conflict (entrylk_type t1, entrylk_type t2)
-{
-	return !((t1 == ENTRYLK_RDLCK) && (t2 == ENTRYLK_RDLCK));
-}
-
-/**
- * all_names - does a basename represent all names?
- * @basename: name to check
- */
-
-#define all_names(basename) ((basename == NULL) ? 1 : 0)
-
-/**
- * names_conflict - do two names conflict?
- * @n1: name
- * @n2: name
- */
-
-static int 
-names_conflict (const char *n1, const char *n2)
-{
-	return all_names (n1) || all_names (n2) || !strcmp (n1, n2);
-}
-
-
-static int 
-names_equal (const char *n1, const char *n2)
-{
-	return (n1 == NULL && n2 == NULL) || (n1 && n2 && !strcmp (n1, n2));
-}
-
-/**
- * lock_grantable - is this lock grantable?
- * @inode: inode in which to look
- * @basename: name we're trying to lock
- * @type: type of lock
- */
-
-static pl_entry_lock_t *
-__lock_grantable (pl_inode_t *pinode, const char *basename, entrylk_type type)
-{
-	pl_entry_lock_t *lock = NULL;
-
-	if (list_empty (&pinode->dir_list))
-		return NULL;
-
-	list_for_each_entry (lock, &pinode->dir_list, inode_list) {
-		if (names_conflict (lock->basename, basename) &&
-		    types_conflict (lock->type, type))
-			return lock;
-	}
-
-	return NULL;
-}
-
-/**
- * find_most_matching_lock - find the lock struct which most matches in order of:
- *                           lock on the exact basename ||
- *                           an all_names lock
- *                      
- *
- * @inode: inode in which to look
- * @basename: name to search for
- */
-
-static pl_entry_lock_t * 
-__find_most_matching_lock (pl_inode_t *pinode, const char *basename)
-{
-	pl_entry_lock_t *lock;
-	pl_entry_lock_t *all = NULL;
-	pl_entry_lock_t *exact = NULL;
-
-	if (list_empty (&pinode->dir_list)) 
-		return NULL;
-
-	list_for_each_entry (lock, &pinode->dir_list, inode_list) {
-		if (all_names (lock->basename))
-			all = lock;
-		else if (names_equal (lock->basename, basename))
-			exact = lock;
-	}
-
-	return (exact ? exact : all);
-}
-
-
-/**
- * insert_new_lock - insert a new dir lock into the inode with the given parameters
- * @pinode: inode to insert into
- * @basename: basename for the lock
- * @type: type of the lock
- */
-
-static pl_entry_lock_t *
-new_entrylk_lock (pl_inode_t *pinode, const char *basename, entrylk_type type,
-		  transport_t *trans)
-{
-	pl_entry_lock_t *newlock = NULL;
-
-	newlock = CALLOC (sizeof (pl_entry_lock_t), 1);
-	if (!newlock) {
-		goto out;
-	}
-
-	newlock->basename = basename ? strdup (basename) : NULL;
-	newlock->type     = type;
-	newlock->trans    = trans;
-
-	if (type == ENTRYLK_RDLCK)
-		newlock->read_count = 1;
-
-	INIT_LIST_HEAD (&newlock->inode_list);
-	INIT_LIST_HEAD (&newlock->blocked_locks);
-
-out:
-	return newlock;
-}
-
-/**
- * lock_name - lock a name in a directory
- * @inode: inode for the directory in which to lock
- * @basename: name of the entry to lock
- *            if null, lock the entire directory
- *            
- * the entire directory being locked is represented as: a single
- * pl_entry_lock_t present in the entrylk_locks list with its
- * basename = NULL
- */
-
-int
-__lock_name (pl_inode_t *pinode, const char *basename, entrylk_type type,
-	     call_frame_t *frame, xlator_t *this, int nonblock)
-{
-	pl_entry_lock_t *lock    = NULL;
-	pl_entry_lock_t *conf    = NULL;
-
-	transport_t *trans = NULL;
-
-	int ret = -EINVAL;
-
-	trans = frame->root->trans;
-
-	conf = __lock_grantable (pinode, basename, type);
-	if (conf) {
-		ret = -EAGAIN;
-		if (nonblock)
-			goto out;
-
-		lock = new_entrylk_lock (pinode, basename, type, trans);
-
-		if (!lock) {
-			ret = -ENOMEM;
-			goto out;
-		}
-
-		gf_log (this->name, GF_LOG_DEBUG,
-			"blocking lock: {pinode=%p, basename=%s}",
-			pinode, basename);
-
-		lock->frame   = frame;
-		lock->this    = this;
-		lock->blocked = 1;
-
-		list_add (&lock->blocked_locks, &conf->blocked_locks);
-
-
-		goto out;
-	}
-		
-	switch (type) {
-	case ENTRYLK_RDLCK:
-		lock = __find_most_matching_lock (pinode, basename);
-
-		if (lock && names_equal (lock->basename, basename)) {
-			lock->read_count++;
-
-			FREE (lock->basename);
-			FREE (lock);
-
-			lock = NULL;
-		} else {
-			lock = new_entrylk_lock (pinode, basename, type, trans);
-
-			if (!lock) {
-				ret = -ENOMEM;
-				goto out;
-			}
-
-			list_add (&lock->inode_list, &pinode->dir_list);
-		}
-		break;
-
-	case ENTRYLK_WRLCK:
-		lock = new_entrylk_lock (pinode, basename, type, trans);
-			
-		if (!lock) {
-			ret = -ENOMEM;
-			goto out;
-		}
-
-		list_add (&lock->inode_list, &pinode->dir_list);
-		break;
-	}
-
-	ret = 0;
-out:
-	return ret;
-}
-
-
-/**
- * unlock_name - unlock a name in a directory
- * @inode: inode for the directory to unlock in
- * @basename: name of the entry to unlock
- *            if null, unlock the entire directory
- */
-
-pl_entry_lock_t *
-__unlock_name (pl_inode_t *pinode, const char *basename, entrylk_type type)
-{
-	pl_entry_lock_t *lock = NULL;
-	pl_entry_lock_t *ret_lock = NULL;
-
-	lock = __find_most_matching_lock (pinode, basename);
-	
-	if (!lock) {
-		gf_log ("locks", GF_LOG_DEBUG,
-			"unlock on %s (type=%s) attempted but no matching lock found",
-			basename, type == ENTRYLK_RDLCK ? "ENTRYLK_RDLCK" : 
-			"ENTRYLK_WRLCK");
-		goto out;
-	}
-	
-	if (names_equal (lock->basename, basename)
-	    && lock->type == type) {
-		if (type == ENTRYLK_RDLCK) {
-			lock->read_count--;
-		}
-		if (type == ENTRYLK_WRLCK || lock->read_count == 0) {
-			list_del (&lock->inode_list);
-			ret_lock = lock;
-		}
-	} else {
-		gf_log ("locks", GF_LOG_ERROR,
-			"unlock for a non-existing lock!");
-		goto out;
-	}
-
-out:
-	return ret_lock;
-}
-
-
-void
-__grant_blocked_entry_locks (xlator_t *this, pl_inode_t *pl_inode,
-			     pl_entry_lock_t *lock,
-			     struct list_head *granted)
-{
-	int              bl_ret = 0;
-	pl_entry_lock_t *bl   = NULL;
-	pl_entry_lock_t *tmp  = NULL;
-
-	list_for_each_entry_safe (bl, tmp, &lock->blocked_locks, 
-				  blocked_locks) {
-		list_del_init (&bl->blocked_locks);
-
-		/* TODO: error checking */
-
-		gf_log ("locks", GF_LOG_DEBUG,
-			"trying to unblock: {pinode=%p, basename=%s}",
-			pl_inode, bl->basename);
-					
-		bl_ret = __lock_name (pl_inode, bl->basename, bl->type,
-				      bl->frame, bl->this, 0);
-
-		if (bl_ret == 0) {
-			list_add (&bl->blocked_locks, granted);
-		} else {
-			if (bl->basename)
-				FREE (bl->basename);
-			FREE (bl);
-		}
-	}	
-	return;
-}
-
-
-void
-grant_blocked_entry_locks (xlator_t *this, pl_inode_t *pl_inode,
-			   pl_entry_lock_t *unlocked)
-{
-	struct list_head  granted_list;
-	pl_entry_lock_t  *tmp = NULL;
-	pl_entry_lock_t  *lock = NULL;
-
-	INIT_LIST_HEAD (&granted_list);
-
-	pthread_mutex_lock (&pl_inode->mutex);
-	{
-		__grant_blocked_entry_locks (this, pl_inode, unlocked,
-					     &granted_list);
-	}
-	pthread_mutex_unlock (&pl_inode->mutex);
-
-	list_for_each_entry_safe (lock, tmp, &granted_list, blocked_locks) {
-		list_del_init (&lock->blocked_locks);
-
-		STACK_UNWIND (lock->frame, 0, 0);
-
-		FREE (lock->basename);
-		FREE (lock);
-	}
-
-	FREE (unlocked->basename);
-	FREE (unlocked);
-
-	return;
-}
-
-
-/**
- * release_entry_locks_for_transport: release all entry locks from this
- * transport for this loc_t
- */
-
-static int
-release_entry_locks_for_transport (xlator_t *this, pl_inode_t *pinode,
-				   transport_t *trans)
-{
-	pl_entry_lock_t  *lock;
-	pl_entry_lock_t  *tmp;
-	struct list_head  granted;
-
-	INIT_LIST_HEAD (&granted);
-
-	pthread_mutex_lock (&pinode->mutex);
-	{
-		if (list_empty (&pinode->dir_list)) {
-			goto unlock;
-		}
-
-		list_for_each_entry_safe (lock, tmp, &pinode->dir_list,
-					  inode_list) {
-			if (lock->trans != trans)
-				continue;
-
-			list_del_init (&lock->inode_list);
-			__grant_blocked_entry_locks (this, pinode, lock,
-						     &granted);
-
-			FREE (lock->basename);
-			FREE (lock);
-		}
-	}
-unlock:
-	pthread_mutex_unlock (&pinode->mutex);
-
-	list_for_each_entry_safe (lock, tmp, &granted, blocked_locks) {
-		list_del_init (&lock->blocked_locks);
-
-		STACK_UNWIND (lock->frame, 0, 0);
-
-		FREE (lock->basename);
-		FREE (lock);
-	}
-
-	return 0;
-}
-
-
-/**
- * pl_entrylk:
- * 
- * Locking on names (directory entries)
- */
-
-int
-pl_entrylk (call_frame_t *frame, xlator_t *this,
-	    loc_t *loc, const char *basename, 
-	    entrylk_cmd cmd, entrylk_type type)
-{
-	int32_t op_ret   = -1;
-	int32_t op_errno = 0;
-
-	transport_t * transport = NULL;
-	pid_t pid = -1;
-
-	pl_inode_t *       pinode = NULL; 
-	int                ret    = -1;
-	pl_entry_lock_t   *unlocked = NULL;
-	char               unwind = 1;
-
-
-	pinode = pl_inode_get (this, loc->inode);
-	if (!pinode) {
-		gf_log (this->name, GF_LOG_ERROR,
-			"out of memory :(");
-		op_errno = ENOMEM;
-		goto out;
-	}
-
-	pid       = frame->root->pid;
-	transport = frame->root->trans;
-
-	if (pid == 0) {
-		/* 
-		   this is a special case that means release
-		   all locks from this transport 
-		*/
-
-		gf_log (this->name, GF_LOG_DEBUG,
-			"releasing locks for transport %p", transport);
-
-		release_entry_locks_for_transport (this, pinode, transport);
-		op_ret = 0;
-
-		goto out;
-	}
-
-	switch (cmd) {
-	case ENTRYLK_LOCK:
-		pthread_mutex_lock (&pinode->mutex);
-		{
-			ret = __lock_name (pinode, basename, type,
-					   frame, this, 0);
-		}
-		pthread_mutex_unlock (&pinode->mutex);
-
-		if (ret < 0) {
-			if (ret == -EAGAIN)
-				unwind = 0;
-			op_errno = -ret;
-			goto out;
-		}
-
-		break;
-
-	case ENTRYLK_LOCK_NB:
-		pthread_mutex_lock (&pinode->mutex);
-		{
-			ret = __lock_name (pinode, basename, type,
-					   frame, this, 1);
-		}
-		pthread_mutex_unlock (&pinode->mutex);
-
-		if (ret < 0) {
-			op_errno = -ret;
-			goto out;
-		}
-
-		break;
-
-	case ENTRYLK_UNLOCK:
-		pthread_mutex_lock (&pinode->mutex);
-		{
-			unlocked = __unlock_name (pinode, basename, type);
-		}
-		pthread_mutex_unlock (&pinode->mutex);
-
-		if (unlocked)
-			grant_blocked_entry_locks (this, pinode, unlocked);
-
-		break;
-
-	default:
-		gf_log (this->name, GF_LOG_ERROR,
-			"unexpected case!");
-		goto out;
-	}
-
-	op_ret = 0;
-out:
-	if (unwind) {
-		STACK_UNWIND (frame, op_ret, op_errno);
-	}
-	
-	return 0;
-}
-
-
-/**
- * pl_entrylk:
- * 
- * Locking on names (directory entries)
- */
-
-int
-pl_fentrylk (call_frame_t *frame, xlator_t *this,
-	     fd_t *fd, const char *basename, 
-	     entrylk_cmd cmd, entrylk_type type)
-{
-	int32_t op_ret   = -1;
-	int32_t op_errno = 0;
-
-	transport_t * transport = NULL;
-	pid_t pid = -1;
-
-	pl_inode_t *       pinode = NULL; 
-	int                ret    = -1;
-	pl_entry_lock_t   *unlocked = NULL;
-	char               unwind = 1;
-
-	pinode = pl_inode_get (this, fd->inode);
-	if (!pinode) {
-		gf_log (this->name, GF_LOG_ERROR,
-			"out of memory :(");
-		goto out;
-	}
-
-	pid       = frame->root->pid;
-	transport = frame->root->trans;
-
-	if (pid == 0) {
-		/* 
-		   this is a special case that means release
-		   all locks from this transport 
-		*/
-
-		gf_log (this->name, GF_LOG_DEBUG,
-			"releasing locks for transport %p", transport);
-
-		release_entry_locks_for_transport (this, pinode, transport);
-		op_ret = 0;
-		goto out;
-	}
-
-	switch (cmd) {
-	case ENTRYLK_LOCK:
-		pthread_mutex_lock (&pinode->mutex);
-		{
-			ret = __lock_name (pinode, basename, type,
-					   frame, this, 0);
-		}
-		pthread_mutex_unlock (&pinode->mutex);
-
-		if (ret < 0) {
-			if (ret == -EAGAIN)
-				unwind = 0;
-			op_errno = -ret;
-			goto out;
-		}
-		break;
-
-	case ENTRYLK_LOCK_NB:
-		pthread_mutex_lock (&pinode->mutex);
-		{
-			ret = __lock_name (pinode, basename, type,
-					   frame, this, 1);
-		}
-		pthread_mutex_unlock (&pinode->mutex);
-
-		if (ret < 0) {
-			op_errno = -ret;
-			goto out;
-		}
-		break;
-
-	case ENTRYLK_UNLOCK:
-		pthread_mutex_lock (&pinode->mutex);
-		{
-			unlocked = __unlock_name (pinode, basename, type);
-		}
-		pthread_mutex_unlock (&pinode->mutex);
-
-		if (unlocked)
-			grant_blocked_entry_locks (this, pinode, unlocked);
-		break;
-
-	default:
-		gf_log (this->name, GF_LOG_ERROR,
-			"unexpected case!");
-		goto out;
-	}
-
-	op_ret = 0;
-out:
-	if (unwind) {
-		STACK_UNWIND (frame, op_ret, op_errno);
-	}
-	
-	return 0;
-}
diff --git a/xlators/features/locks/src/locks-mem-types.h b/xlators/features/locks/src/locks-mem-types.h
new file mode 100644
index 00000000000..a76605027b3
--- /dev/null
+++ b/xlators/features/locks/src/locks-mem-types.h
@@ -0,0 +1,28 @@
+/*
+   Copyright (c) 2008-2012 Red Hat, Inc. <http://www.redhat.com>
+   This file is part of GlusterFS.
+
+   This file is licensed to you under your choice of the GNU Lesser
+   General Public License, version 3 or any later version (LGPLv3 or
+   later), or the GNU General Public License, version 2 (GPLv2), in all
+   cases as published by the Free Software Foundation.
+*/
+
+#ifndef __LOCKS_MEM_TYPES_H__
+#define __LOCKS_MEM_TYPES_H__
+
+#include <glusterfs/mem-types.h>
+
+enum gf_locks_mem_types_ { /* mem-type ids declared by the locks translator */
+    gf_locks_mt_pl_dom_list_t = gf_common_mt_end + 1, /* numbering continues after the common ids */
+    gf_locks_mt_pl_inode_t,
+    gf_locks_mt_posix_lock_t,
+    gf_locks_mt_pl_entry_lock_t,
+    gf_locks_mt_pl_inode_lock_t,
+    gf_locks_mt_pl_rw_req_t,
+    gf_locks_mt_posix_locks_private_t,
+    gf_locks_mt_pl_fdctx_t,
+    gf_locks_mt_pl_meta_lock_t,
+    gf_locks_mt_end /* last enumerator; presumably an end marker - keep it last */
+};
+#endif
diff --git a/xlators/features/locks/src/locks.h b/xlators/features/locks/src/locks.h
index 5a834657d0b..c868eb494a2 100644
--- a/xlators/features/locks/src/locks.h
+++ b/xlators/features/locks/src/locks.h
@@ -1,111 +1,292 @@
 /*
-   Copyright (c) 2006, 2007, 2008 Z RESEARCH, Inc. <http://www.zresearch.com>
+   Copyright (c) 2006-2012, 2015-2016 Red Hat, Inc. <http://www.redhat.com>
    This file is part of GlusterFS.
 
-   GlusterFS is free software; you can redistribute it and/or modify
-   it under the terms of the GNU General Public License as published
-   by the Free Software Foundation; either version 3 of the License,
-   or (at your option) any later version.
-
-   GlusterFS is distributed in the hope that it will be useful, but
-   WITHOUT ANY WARRANTY; without even the implied warranty of
-   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
-   General Public License for more details.
-
-   You should have received a copy of the GNU General Public License
-   along with this program.  If not, see
-   <http://www.gnu.org/licenses/>.
+   This file is licensed to you under your choice of the GNU Lesser
+   General Public License, version 3 or any later version (LGPLv3 or
+   later), or the GNU General Public License, version 2 (GPLv2), in all
+   cases as published by the Free Software Foundation.
 */
-
 #ifndef __POSIX_LOCKS_H__
 #define __POSIX_LOCKS_H__
 
-#ifndef _CONFIG_H
-#define _CONFIG_H
-#include "config.h"
-#endif
+#include <glusterfs/compat-errno.h>
+#include <glusterfs/stack.h>
+#include <glusterfs/call-stub.h>
+#include "locks-mem-types.h"
+#include <glusterfs/client_t.h>
+
+#include <glusterfs/lkowner.h>
 
-#include "compat-errno.h"
-#include "transport.h"
-#include "stack.h"
-#include "call-stub.h"
+typedef enum {
+    MLK_NONE,
+    MLK_FILE_BASED,
+    MLK_FORCED,
+    MLK_OPTIMAL
+} mlk_mode_t; /* the different mandatory locking modes */
 
 struct __pl_fd;
 
 struct __posix_lock {
-	struct list_head   list;
+    struct list_head list;
+
+    off_t fl_start;
+    off_t fl_end;
+    uint32_t lk_flags;
+
+    short fl_type;
+    short blocked;              /* waiting to acquire */
+    struct gf_flock user_flock; /* the flock supplied by the user */
+    xlator_t *this;             /* required for blocked locks */
+    unsigned long fd_num;
+
+    fd_t *fd;
+    call_frame_t *frame;
 
-	short              fl_type;
-	off_t              fl_start;
-	off_t              fl_end;
+    time_t blkd_time;    /* time at which lock was queued into blkd list */
+    time_t granted_time; /* time at which lock was queued into active list */
 
-	short              blocked;    /* waiting to acquire */
-	struct flock       user_flock; /* the flock supplied by the user */
-	xlator_t          *this;       /* required for blocked locks */
-	fd_t              *fd;
+    /* These two together serve to uniquely identify each process
+       across nodes */
 
-	call_frame_t      *frame;
+    void *client; /* to identify client node */
 
-	/* These two together serve to uniquely identify each process
-	   across nodes */
+    /* This field uniquely identifies the client the lock belongs to.  As
+     * lock migration is handled by rebalance, the client_t object will be
+     * overwritten by rebalance and can't be deemed as the owner of the
+     * lock on destination. Hence, the below field is migrated from
+     * source to destination by lock_migration_info_t and updated on the
+     * destination. So that on client-server disconnection, server can
+     * cleanup the locks properly.  */
 
-	transport_t       *transport;     /* to identify client node */
-	pid_t              client_pid;    /* pid of client process */
+    char *client_uid;
+    gf_lkowner_t owner;
+    pid_t client_pid; /* pid of client process */
+
+    int blocking;
 };
 typedef struct __posix_lock posix_lock_t;
 
-struct __pl_rw_req_t {
-	struct list_head      list;
-	call_stub_t          *stub;
-	posix_lock_t          region;
+struct __pl_inode_lock {
+    struct list_head list;          /* link on a domain's inodelk_list */
+    struct list_head blocked_locks; /* link on a domain's blocked_inodelks */
+    struct list_head contend;       /* list of contending locks */
+    int ref; /* presumably the count taken by __pl_inodelk_ref() - confirm */
+
+    off_t fl_start;
+    off_t fl_end;
+
+    const char *volume;
+
+    struct gf_flock user_flock; /* the flock supplied by the user */
+    xlator_t *this;             /* required for blocked locks */
+    struct __pl_inode *pl_inode;
+
+    call_frame_t *frame;
+
+    time_t blkd_time;    /* time at which lock was queued into blkd list */
+    time_t granted_time; /* time at which lock was queued into active list */
+
+    /* last time at which lock contention was detected and notified */
+    struct timespec contention_time;
+
+    /* These two together serve to uniquely identify each process
+       across nodes */
+
+    void *client; /* to identify client node */
+    gf_lkowner_t owner;
+    pid_t client_pid; /* pid of client process */
+
+    char *connection_id; /* stores the client connection id */
+
+    struct list_head client_list; /* list of all locks from a client */
+    short fl_type;
+
+    int32_t status; /* Error code when we try to grant a lock in blocked
+                       state */
 };
-typedef struct __pl_rw_req_t pl_rw_req_t;
+typedef struct __pl_inode_lock pl_inode_lock_t;
 
+struct _pl_rw_req {
+    struct list_head list; /* presumably links into pl_inode's rw_list - confirm */
+    call_stub_t *stub;     /* call stub kept with the request */
+    posix_lock_t region;   /* request range, stored as a posix_lock_t */
+};
+typedef struct _pl_rw_req pl_rw_req_t;
+
+struct _pl_dom_list {
+    struct list_head inode_list; /* link on pl_inode_t's dom_list */
+    const char *domain;          /* domain name; matched with strcmp() */
+    struct list_head entrylk_list;     /* List of entry locks */
+    struct list_head blocked_entrylks; /* List of all blocked entrylks */
+    struct list_head inodelk_list;     /* List of inode locks */
+    struct list_head blocked_inodelks; /* List of all blocked inodelks */
+};
+typedef struct _pl_dom_list pl_dom_list_t;
 
 struct __entry_lock {
-	struct list_head  inode_list;    /* list_head back to pl_inode_t */
-	struct list_head  blocked_locks; /* locks blocked due to this lock */
-
-	call_frame_t     *frame;
-	xlator_t         *this;
-	int               blocked;
-	
-	const char       *basename;
-	entrylk_type      type;
-	unsigned int      read_count;    /* number of read locks */
-	transport_t      *trans;
+    struct list_head domain_list;   /* list_head back to pl_dom_list_t */
+    struct list_head blocked_locks; /* list_head back to blocked_entrylks */
+    struct list_head contend;       /* list of contending locks */
+    int ref;
+
+    call_frame_t *frame;
+    xlator_t *this;
+    struct __pl_inode *pinode;
+
+    const char *volume;
+
+    const char *basename;
+
+    time_t blkd_time;    /* time at which lock was queued into blkd list */
+    time_t granted_time; /* time at which lock was queued into active list */
+
+    /*last time at which lock contention was detected and notified*/
+    struct timespec contention_time;
+
+    void *client;
+    gf_lkowner_t owner;
+    pid_t client_pid; /* pid of client process */
+
+    char *connection_id; /* stores the client connection id */
+
+    struct list_head client_list; /* list of all locks from a client */
+    entrylk_type type;
 };
 typedef struct __entry_lock pl_entry_lock_t;
 
-
-/* The "simulated" inode. This contains a list of all the locks associated 
+/* The "simulated" inode. This contains a list of all the locks associated
    with this file */
 
 struct __pl_inode {
-	pthread_mutex_t  mutex;
+    pthread_mutex_t mutex;
 
-	struct list_head dir_list;       /* list of entry locks */
-	struct list_head ext_list;       /* list of fcntl locks */
-	struct list_head int_list;       /* list of internal locks */
-	struct list_head rw_list;        /* list of waiting r/w requests */
-	int              mandatory;      /* if mandatory locking is enabled */
-};
-typedef struct __pl_inode pl_inode_t;
+    struct list_head dom_list;           /* list of domains */
+    struct list_head ext_list;           /* list of fcntl locks */
+    struct list_head rw_list;            /* list of waiting r/w requests */
+    struct list_head reservelk_list;     /* list of reservelks */
+    struct list_head blocked_reservelks; /* list of blocked reservelks */
+    struct list_head blocked_calls;      /* List of blocked lock calls while a
+                                            reserve is held*/
+    struct list_head metalk_list;        /* Meta lock list */
+    struct list_head queued_locks;       /* This is to store the incoming lock
+                                            requests while meta lock is enabled */
+    struct list_head waiting; /* List of pending fops waiting to unlink/rmdir
+                                 the inode. */
+    int mandatory;            /* if mandatory locking is enabled */
 
+    inode_t *refkeeper; /* hold refs on an inode while locks are
+                           held to prevent pruning */
+    uuid_t gfid;        /* placeholder for gfid of the inode */
+    inode_t *inode;     /* pointer to be used for ref and unref
+                           of inode_t as long as there are
+                           locks on it */
+    gf_boolean_t migrated;
 
-#define LOCKS_FOR_DOMAIN(inode,domain) (domain == GF_LOCK_POSIX \
-					? inode->fcntl_locks	\
-					: inode->inodelk_locks)
+    /* Flag to indicate whether to read mlock-enforce xattr from disk */
+    gf_boolean_t check_mlock_info;
 
-struct __pl_fd {
-	gf_boolean_t nonblocking;       /* whether O_NONBLOCK has been set */
+    /* Mandatory_lock enforce: IO will be allowed if and only if the lkowner has
+       held the lock.
+
+       Note: An xattr is set on the file to recover this information post
+       reboot. If client does not want mandatory lock to be enforced, then it
+       should remove this xattr explicitly
+    */
+    gf_boolean_t mlock_enforced;
+    /* There are scenarios where mandatory lock is granted but there are IOs
+       pending at posix level. To avoid this before preempting the previous lock
+       owner, we wait for all the fops to be unwound.
+    */
+    int fop_wind_count;
+    pthread_cond_t check_fop_wind_count;
+
+    gf_boolean_t track_fop_wind_count;
+
+    int32_t links;           /* Number of hard links the inode has. */
+    uint32_t remove_running; /* Number of remove operations running. */
+    gf_boolean_t is_locked;  /* Regular locks will be blocked. */
+    gf_boolean_t removed;    /* The inode has been deleted. */
 };
-typedef struct __pl_fd pl_fd_t;
+typedef struct __pl_inode pl_inode_t;
+
+struct __pl_metalk {
+    pthread_mutex_t mutex;
+    /* For pl_inode meta lock list */
+    struct list_head list;
+    /* For pl_ctx_t list */
+    struct list_head client_list;
+    char *client_uid;
 
+    pl_inode_t *pl_inode;
+    int ref;
+};
+typedef struct __pl_metalk pl_meta_lock_t;
 
 typedef struct {
-	gf_boolean_t    mandatory;      /* if mandatory locking is enabled */
+    char *brickname;
+    uint32_t revocation_secs;
+    uint32_t revocation_max_blocked;
+    uint32_t notify_contention_delay;
+    mlk_mode_t mandatory_mode; /* holds current mandatory locking mode */
+    gf_boolean_t trace;        /* trace lock requests in and out */
+    gf_boolean_t monkey_unlocking;
+    gf_boolean_t revocation_clear_all;
+    gf_boolean_t notify_contention;
+    gf_boolean_t mlock_enforced;
 } posix_locks_private_t;
 
+typedef struct {
+    data_t *inodelk_dom_count_req;
+
+    dict_t *xdata;
+    loc_t loc[2];
+    fd_t *fd;
+    inode_t *inode;
+    off_t offset;
+    glusterfs_fop_t op;
+    gf_boolean_t entrylk_count_req;
+    gf_boolean_t inodelk_count_req;
+    gf_boolean_t posixlk_count_req;
+    gf_boolean_t parent_entrylk_req;
+    gf_boolean_t multiple_dom_lk_requests;
+    int update_mlock_enforced_flag;
+} pl_local_t;
+
+typedef struct {
+    struct list_head locks_list;
+} pl_fdctx_t;
+
+struct _locker {
+    struct list_head lockers;
+    char *volume;
+    inode_t *inode;
+    gf_lkowner_t owner;
+};
+
+typedef struct _locks_ctx {
+    pthread_mutex_t lock;
+    struct list_head inodelk_lockers;
+    struct list_head entrylk_lockers;
+    struct list_head metalk_list;
+} pl_ctx_t;
+
+typedef struct _multi_dom_lk_data {
+    xlator_t *this;
+    inode_t *inode;
+    dict_t *xdata_rsp;
+    gf_boolean_t keep_max;
+} multi_dom_lk_data;
+
+typedef enum { DECREMENT, INCREMENT } pl_count_op_t;
+
+pl_ctx_t *
+pl_ctx_get(client_t *client, xlator_t *xlator);
+
+int
+pl_inodelk_client_cleanup(xlator_t *this, pl_ctx_t *ctx);
+
+int
+pl_entrylk_client_cleanup(xlator_t *this, pl_ctx_t *ctx);
 
 #endif /* __POSIX_LOCKS_H__ */
diff --git a/xlators/features/locks/src/pl-messages.h b/xlators/features/locks/src/pl-messages.h
new file mode 100644
index 00000000000..e2d3d7ca974
--- /dev/null
+++ b/xlators/features/locks/src/pl-messages.h
@@ -0,0 +1,29 @@
+/*
+  Copyright (c) 2016 Red Hat, Inc. <http://www.redhat.com>
+  This file is part of GlusterFS.
+
+  This file is licensed to you under your choice of the GNU Lesser
+  General Public License, version 3 or any later version (LGPLv3 or
+  later), or the GNU General Public License, version 2 (GPLv2), in all
+  cases as published by the Free Software Foundation.
+*/
+
+#ifndef _PL_MESSAGES_H_
+#define _PL_MESSAGES_H_
+
+#include <glusterfs/glfs-message-id.h>
+
+/* To add new message IDs, append new identifiers at the end of the list.
+ *
+ * Never remove a message ID. If it's not used anymore, you can rename it or
+ * leave it as it is, but not delete it. This is to prevent reutilization of
+ * IDs by other messages.
+ *
+ * The component name must match one of the entries defined in
+ * glfs-message-id.h.
+ */
+
+GLFS_MSGID(PL, PL_MSG_LOCK_NUMBER, PL_MSG_INODELK_CONTENTION_FAILED,
+           PL_MSG_ENTRYLK_CONTENTION_FAILED);
+
+#endif /* !_PL_MESSAGES_H_ */
diff --git a/xlators/features/locks/src/posix.c b/xlators/features/locks/src/posix.c
index 46d2cb9a003..cf0ae4c57dd 100644
--- a/xlators/features/locks/src/posix.c
+++ b/xlators/features/locks/src/posix.c
@@ -1,834 +1,5095 @@
 /*
-  Copyright (c) 2006, 2007, 2008 Z RESEARCH, Inc. <http://www.zresearch.com>
-  This file is part of GlusterFS.
-
-  GlusterFS is free software; you can redistribute it and/or modify
-  it under the terms of the GNU General Public License as published
-  by the Free Software Foundation; either version 3 of the License,
-  or (at your option) any later version.
-
-  GlusterFS is distributed in the hope that it will be useful, but
-  WITHOUT ANY WARRANTY; without even the implied warranty of
-  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
-  General Public License for more details.
-
-  You should have received a copy of the GNU General Public License
-  along with this program.  If not, see
-  <http://www.gnu.org/licenses/>.
-*/
+   Copyright (c) 2006-2012, 2016 Red Hat, Inc. <http://www.redhat.com>
+   This file is part of GlusterFS.
 
+   This file is licensed to you under your choice of the GNU Lesser
+   General Public License, version 3 or any later version (LGPLv3 or
+   later), or the GNU General Public License, version 2 (GPLv2), in all
+   cases as published by the Free Software Foundation.
+*/
 #include <unistd.h>
 #include <fcntl.h>
 #include <limits.h>
 #include <pthread.h>
 
-#ifndef _CONFIG_H
-#define _CONFIG_H
-#include "config.h"
-#endif
-
-#include "glusterfs.h"
-#include "compat.h"
-#include "xlator.h"
-#include "inode.h"
-#include "logging.h"
-#include "common-utils.h"
+#include <glusterfs/compat.h>
+#include <glusterfs/logging.h>
 
 #include "locks.h"
 #include "common.h"
+#include <glusterfs/statedump.h>
+#include "clear.h"
+#include <glusterfs/defaults.h>
+#include <glusterfs/syncop.h>
 
 #ifndef LLONG_MAX
 #define LLONG_MAX LONG_LONG_MAX /* compat with old gcc */
-#endif /* LLONG_MAX */
+#endif                          /* LLONG_MAX */
 
 /* Forward declarations */
 
+void
+do_blocked_rw(pl_inode_t *);
+static int
+__rw_allowable(pl_inode_t *, posix_lock_t *, glusterfs_fop_t);
+static int
+format_brickname(char *);
+int
+pl_lockinfo_get_brickname(xlator_t *, inode_t *, int32_t *);
+static int
+fetch_pathinfo(xlator_t *, inode_t *, int32_t *, char **);
 
-void do_blocked_rw (pl_inode_t *);
-static int __rw_allowable (pl_inode_t *, posix_lock_t *, glusterfs_fop_t);
+/*
+ * The client is always requesting data, but older
+ * servers were not returning it. Newer ones are, so
+ * the client is receiving a mix of NULL and non-NULL
+ * xdata in the answers when bricks are of different
+ * versions. This triggers a bug in older clients.
+ * To prevent that, we avoid returning extra xdata to
+ * older clients (making the newer brick behave like
+ * an old brick).
+ */
+#define PL_STACK_UNWIND_FOR_CLIENT(fop, xdata, frame, op_ret, params...)       \
+    do {                                                                       \
+        pl_local_t *__local = NULL;                                            \
+        if (frame->root->client &&                                             \
+            (frame->root->client->opversion < GD_OP_VERSION_3_10_0)) {         \
+            __local = frame->local;                                            \
+            PL_STACK_UNWIND_AND_FREE(__local, fop, frame, op_ret, params);     \
+        } else {                                                               \
+            PL_STACK_UNWIND(fop, xdata, frame, op_ret, params);                \
+        }                                                                      \
+    } while (0)
+
+#define PL_STACK_UNWIND(fop, xdata, frame, op_ret, params...)                  \
+    do {                                                                       \
+        pl_local_t *__local = NULL;                                            \
+        inode_t *__parent = NULL;                                              \
+        inode_t *__inode = NULL;                                               \
+        char *__name = NULL;                                                   \
+        dict_t *__unref = NULL;                                                \
+        int __i = 0;                                                           \
+        __local = frame->local;                                                \
+        if (op_ret >= 0 && pl_needs_xdata_response(frame->local)) {            \
+            if (xdata)                                                         \
+                dict_ref(xdata);                                               \
+            else                                                               \
+                xdata = dict_new();                                            \
+            if (xdata) {                                                       \
+                __unref = xdata;                                               \
+                while (__local->fd || __local->loc[__i].inode) {               \
+                    pl_get_xdata_rsp_args(__local, #fop, &__parent, &__inode,  \
+                                          &__name, __i);                       \
+                    pl_set_xdata_response(frame->this, __local, __parent,      \
+                                          __inode, __name, xdata, __i > 0);    \
+                    if (__local->fd || __i == 1)                               \
+                        break;                                                 \
+                    __i++;                                                     \
+                }                                                              \
+            }                                                                  \
+        }                                                                      \
+        PL_STACK_UNWIND_AND_FREE(__local, fop, frame, op_ret, params);         \
+        if (__unref)                                                           \
+            dict_unref(__unref);                                               \
+    } while (0)
+
+#define PL_LOCAL_GET_REQUESTS(frame, this, xdata, __fd, __loc, __newloc)       \
+    do {                                                                       \
+        if (pl_has_xdata_requests(xdata)) {                                    \
+            if (!frame->local)                                                 \
+                frame->local = mem_get0(this->local_pool);                     \
+            pl_local_t *__local = frame->local;                                \
+            if (__local) {                                                     \
+                if (__fd) {                                                    \
+                    __local->fd = fd_ref(__fd);                                \
+                    __local->inode = inode_ref(__fd->inode);                   \
+                } else {                                                       \
+                    if (__loc)                                                 \
+                        loc_copy(&__local->loc[0], __loc);                     \
+                    if (__newloc)                                              \
+                        loc_copy(&__local->loc[1], __newloc);                  \
+                    __local->inode = inode_ref(__local->loc[0].inode);         \
+                }                                                              \
+                pl_get_xdata_requests(__local, xdata);                         \
+            }                                                                  \
+        }                                                                      \
+    } while (0)
+
+#define PL_CHECK_LOCK_ENFORCE_KEY(frame, dict, name, this, loc, fd, priv)      \
+    do {                                                                       \
+        if ((dict && (dict_get(dict, GF_ENFORCE_MANDATORY_LOCK))) ||           \
+            (name && (strcmp(name, GF_ENFORCE_MANDATORY_LOCK) == 0))) {        \
+            inode_t *__inode = (loc ? loc->inode : fd->inode);                 \
+            pl_inode_t *__pl_inode = pl_inode_get(this, __inode, NULL);        \
+            if (__pl_inode == NULL) {                                          \
+                op_ret = -1;                                                   \
+                op_errno = ENOMEM;                                             \
+                goto unwind;                                                   \
+            }                                                                  \
+            if (!pl_is_mandatory_locking_enabled(__pl_inode) ||                \
+                !priv->mlock_enforced) {                                       \
+                op_ret = -1;                                                   \
+                gf_msg(this->name, GF_LOG_DEBUG, EINVAL, 0,                    \
+                       "option %s would need mandatory lock to be enabled "    \
+                       "and feature.enforce-mandatory-lock option to be set "  \
+                       "to on",                                                \
+                       GF_ENFORCE_MANDATORY_LOCK);                             \
+                op_errno = EINVAL;                                             \
+                goto unwind;                                                   \
+            }                                                                  \
+                                                                               \
+            op_ret = pl_local_init(frame, this, loc, fd);                      \
+            if (op_ret) {                                                      \
+                op_errno = ENOMEM;                                             \
+                goto unwind;                                                   \
+            }                                                                  \
+                                                                               \
+            ((pl_local_t *)(frame->local))->update_mlock_enforced_flag = 1;    \
+        }                                                                      \
+    } while (0)
+
+#define PL_INODE_REMOVE(_fop, _frame, _xl, _loc1, _loc2, _cont, _cbk,          \
+                        _args...)                                              \
+    ({                                                                         \
+        struct list_head contend;                                              \
+        pl_inode_t *__pl_inode;                                                \
+        call_stub_t *__stub;                                                   \
+        int32_t __error;                                                       \
+        INIT_LIST_HEAD(&contend);                                              \
+        __error = pl_inode_remove_prepare(_xl, _frame, _loc2 ? _loc2 : _loc1,  \
+                                          &__pl_inode, &contend);              \
+        if (__error < 0) {                                                     \
+            __stub = fop_##_fop##_stub(_frame, _cont, ##_args);                \
+            __error = pl_inode_remove_complete(_xl, __pl_inode, __stub,        \
+                                               &contend);                      \
+        } else if (__error == 0) {                                             \
+            PL_LOCAL_GET_REQUESTS(_frame, _xl, xdata, ((fd_t *)NULL), _loc1,   \
+                                  _loc2);                                      \
+            STACK_WIND_COOKIE(_frame, _cbk, __pl_inode, FIRST_CHILD(_xl),      \
+                              FIRST_CHILD(_xl)->fops->_fop, ##_args);          \
+        }                                                                      \
+        __error;                                                               \
+    })
+
+gf_boolean_t
+pl_has_xdata_requests(dict_t *xdata)
+{
+    static char *reqs[] = {GLUSTERFS_ENTRYLK_COUNT,
+                           GLUSTERFS_INODELK_COUNT,
+                           GLUSTERFS_INODELK_DOM_COUNT,
+                           GLUSTERFS_POSIXLK_COUNT,
+                           GLUSTERFS_PARENT_ENTRYLK,
+                           GLUSTERFS_MULTIPLE_DOM_LK_CNT_REQUESTS,
+                           NULL};
+    static int reqs_size[] = {SLEN(GLUSTERFS_ENTRYLK_COUNT),
+                              SLEN(GLUSTERFS_INODELK_COUNT),
+                              SLEN(GLUSTERFS_INODELK_DOM_COUNT),
+                              SLEN(GLUSTERFS_POSIXLK_COUNT),
+                              SLEN(GLUSTERFS_PARENT_ENTRYLK),
+                              SLEN(GLUSTERFS_MULTIPLE_DOM_LK_CNT_REQUESTS),
+                              0};
+    int i = 0;
+
+    if (!xdata)
+        return _gf_false;
+
+    for (i = 0; reqs[i]; i++)
+        if (dict_getn(xdata, reqs[i], reqs_size[i]))
+            return _gf_true;
+
+    return _gf_false;
+}
 
-struct _truncate_ops {
-	loc_t  loc;
-	fd_t  *fd;
-	off_t  offset;
-	enum {TRUNCATE, FTRUNCATE} op;
-};
+static int
+dict_delete_domain_key(dict_t *dict, char *key, data_t *value, void *data)
+{
+    dict_del(dict, key);
+    return 0;
+}
 
+void
+pl_get_xdata_requests(pl_local_t *local, dict_t *xdata)
+{
+    if (!local || !xdata)
+        return;
+
+    GF_ASSERT(local->xdata == NULL);
+    local->xdata = dict_copy_with_ref(xdata, NULL);
+
+    if (dict_get_sizen(xdata, GLUSTERFS_ENTRYLK_COUNT)) {
+        local->entrylk_count_req = 1;
+        dict_del_sizen(xdata, GLUSTERFS_ENTRYLK_COUNT);
+    }
+    if (dict_get_sizen(xdata, GLUSTERFS_INODELK_COUNT)) {
+        local->inodelk_count_req = 1;
+        dict_del_sizen(xdata, GLUSTERFS_INODELK_COUNT);
+    }
+    if (dict_get_sizen(xdata, GLUSTERFS_MULTIPLE_DOM_LK_CNT_REQUESTS)) {
+        local->multiple_dom_lk_requests = 1;
+        dict_del_sizen(xdata, GLUSTERFS_MULTIPLE_DOM_LK_CNT_REQUESTS);
+        dict_foreach_fnmatch(xdata, GLUSTERFS_INODELK_DOM_PREFIX "*",
+                             dict_delete_domain_key, NULL);
+    }
+
+    local->inodelk_dom_count_req = dict_get_sizen(xdata,
+                                                  GLUSTERFS_INODELK_DOM_COUNT);
+    if (local->inodelk_dom_count_req) {
+        data_ref(local->inodelk_dom_count_req);
+        dict_del_sizen(xdata, GLUSTERFS_INODELK_DOM_COUNT);
+    }
+
+    if (dict_get_sizen(xdata, GLUSTERFS_POSIXLK_COUNT)) {
+        local->posixlk_count_req = 1;
+        dict_del_sizen(xdata, GLUSTERFS_POSIXLK_COUNT);
+    }
+
+    if (dict_get_sizen(xdata, GLUSTERFS_PARENT_ENTRYLK)) {
+        local->parent_entrylk_req = 1;
+        dict_del_sizen(xdata, GLUSTERFS_PARENT_ENTRYLK);
+    }
+}
 
-int
-pl_truncate_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
-		 int32_t op_ret, int32_t op_errno, struct stat *buf)
+gf_boolean_t
+pl_needs_xdata_response(pl_local_t *local)
 {
-	struct _truncate_ops *local = NULL;
+    if (!local)
+        return _gf_false;
 
-	local = frame->local;
+    if (local->parent_entrylk_req || local->entrylk_count_req ||
+        local->inodelk_dom_count_req || local->inodelk_count_req ||
+        local->posixlk_count_req || local->multiple_dom_lk_requests)
+        return _gf_true;
 
-	if (local->op == TRUNCATE)
-		loc_wipe (&local->loc);
+    return _gf_false;
+}
 
-	STACK_UNWIND (frame, op_ret, op_errno, buf);
-	return 0;
+void
+pl_get_xdata_rsp_args(pl_local_t *local, char *fop, inode_t **parent,
+                      inode_t **inode, char **name, int i)
+{
+    if (strcmp(fop, "lookup") == 0) {
+        *parent = local->loc[0].parent;
+        *inode = local->loc[0].inode;
+        *name = (char *)local->loc[0].name;
+    } else {
+        if (local->fd) {
+            *inode = local->fd->inode;
+        } else {
+            *inode = local->loc[i].parent;
+        }
+    }
 }
 
+static inline int
+pl_track_io_fop_count(pl_local_t *local, xlator_t *this, pl_count_op_t op)
+{
+    pl_inode_t *pl_inode = NULL;
+
+    if (!local)
+        return -1;
+
+    pl_inode = pl_inode_get(this, local->inode, NULL);
+    if (!pl_inode)
+        return -1;
+
+    if (pl_inode->mlock_enforced && pl_inode->track_fop_wind_count) {
+        pthread_mutex_lock(&pl_inode->mutex);
+        {
+            if (op == DECREMENT) {
+                pl_inode->fop_wind_count--;
+                /* fop_wind_count can go negative when lock enforcement is
+                 * enabled on unwind path of an IO. Hence the "<=" comparison.
+                 */
+                if (pl_inode->fop_wind_count <= 0) {
+                    pthread_cond_broadcast(&pl_inode->check_fop_wind_count);
+                    pl_inode->track_fop_wind_count = _gf_false;
+                    pl_inode->fop_wind_count = 0;
+                }
+            } else {
+                pl_inode->fop_wind_count++;
+            }
+        }
+        pthread_mutex_unlock(&pl_inode->mutex);
+    }
+
+    return 0;
+}
 
-static int
-truncate_allowed (pl_inode_t *pl_inode, 
-		  transport_t *transport, pid_t client_pid, 
-		  off_t offset)
+static int32_t
+__get_posixlk_count(pl_inode_t *pl_inode)
 {
-	posix_lock_t *l = NULL;
-	posix_lock_t  region = {.list = {0, }, };
-	int           ret = 1;
+    posix_lock_t *lock = NULL;
+    int32_t count = 0;
+
+    list_for_each_entry(lock, &pl_inode->ext_list, list) { count++; }
+
+    return count;
+}
+
+int32_t
+get_posixlk_count(xlator_t *this, inode_t *inode)
+{
+    pl_inode_t *pl_inode = NULL;
+    uint64_t tmp_pl_inode = 0;
+    int32_t count = 0;
+
+    int ret = inode_ctx_get(inode, this, &tmp_pl_inode);
+    if (ret != 0) {
+        goto out;
+    }
+
+    pl_inode = (pl_inode_t *)(long)tmp_pl_inode;
+
+    pthread_mutex_lock(&pl_inode->mutex);
+    {
+        count = __get_posixlk_count(pl_inode);
+    }
+    pthread_mutex_unlock(&pl_inode->mutex);
 
-	region.fl_start   = offset;
-	region.fl_end     = LLONG_MAX;
-	region.transport  = transport;
-	region.client_pid = client_pid;
+out:
+    return count;
+}
 
-	pthread_mutex_lock (&pl_inode->mutex);
-	{
-		list_for_each_entry (l, &pl_inode->ext_list, list) {
-			if (!l->blocked
-			    && locks_overlap (&region, l)
-			    && !same_owner (&region, l)) {
-				ret = 0;
-				break;
-			}
-		}
-	}
-	pthread_mutex_unlock (&pl_inode->mutex);
+void
+pl_parent_entrylk_xattr_fill(xlator_t *this, inode_t *parent, char *basename,
+                             dict_t *dict, gf_boolean_t keep_max)
+{
+    int32_t entrylk = 0;
+    int32_t maxcount = -1;
+    int ret = -1;
+
+    if (!parent || !basename)
+        goto out;
+    if (keep_max) {
+        ret = dict_get_int32_sizen(dict, GLUSTERFS_PARENT_ENTRYLK, &maxcount);
+        if (ret < 0)
+            gf_msg_debug(this->name, 0, " Failed to fetch the value for key %s",
+                         GLUSTERFS_PARENT_ENTRYLK);
+    }
+    entrylk = check_entrylk_on_basename(this, parent, basename);
+    if (maxcount >= entrylk)
+        return;
+out:
+    ret = dict_set_int32_sizen(dict, GLUSTERFS_PARENT_ENTRYLK, entrylk);
+    if (ret < 0) {
+        gf_msg_debug(this->name, 0, " dict_set failed on key %s",
+                     GLUSTERFS_PARENT_ENTRYLK);
+    }
+}
 
-	return ret;
+void
+pl_entrylk_xattr_fill(xlator_t *this, inode_t *inode, dict_t *dict,
+                      gf_boolean_t keep_max)
+{
+    int32_t count = 0;
+    int32_t maxcount = -1;
+    int ret = -1;
+
+    if (keep_max) {
+        ret = dict_get_int32_sizen(dict, GLUSTERFS_ENTRYLK_COUNT, &maxcount);
+        if (ret < 0)
+            gf_msg_debug(this->name, 0, " Failed to fetch the value for key %s",
+                         GLUSTERFS_ENTRYLK_COUNT);
+    }
+    count = get_entrylk_count(this, inode);
+    if (maxcount >= count)
+        return;
+
+    ret = dict_set_int32_sizen(dict, GLUSTERFS_ENTRYLK_COUNT, count);
+    if (ret < 0) {
+        gf_msg_debug(this->name, 0, " dict_set failed on key %s",
+                     GLUSTERFS_ENTRYLK_COUNT);
+    }
 }
 
+void
+pl_inodelk_xattr_fill(xlator_t *this, inode_t *inode, dict_t *dict,
+                      char *domname, gf_boolean_t keep_max)
+{
+    int32_t count = 0;
+    int32_t maxcount = -1;
+    int ret = -1;
+
+    if (keep_max) {
+        ret = dict_get_int32_sizen(dict, GLUSTERFS_INODELK_COUNT, &maxcount);
+        if (ret < 0)
+            gf_msg_debug(this->name, 0, " Failed to fetch the value for key %s",
+                         GLUSTERFS_INODELK_COUNT);
+    }
+    count = get_inodelk_count(this, inode, domname);
+    if (maxcount >= count)
+        return;
+
+    ret = dict_set_int32_sizen(dict, GLUSTERFS_INODELK_COUNT, count);
+    if (ret < 0) {
+        gf_msg_debug(this->name, 0,
+                     "Failed to set count for "
+                     "key %s",
+                     GLUSTERFS_INODELK_COUNT);
+    }
+
+    return;
+}
+
+void
+pl_posixlk_xattr_fill(xlator_t *this, inode_t *inode, dict_t *dict,
+                      gf_boolean_t keep_max)
+{
+    int32_t count = 0;
+    int32_t maxcount = -1;
+    int ret = -1;
+
+    if (keep_max) {
+        ret = dict_get_int32_sizen(dict, GLUSTERFS_POSIXLK_COUNT, &maxcount);
+        if (ret < 0)
+            gf_msg_debug(this->name, 0, " Failed to fetch the value for key %s",
+                         GLUSTERFS_POSIXLK_COUNT);
+    }
+    count = get_posixlk_count(this, inode);
+    if (maxcount >= count)
+        return;
+
+    ret = dict_set_int32_sizen(dict, GLUSTERFS_POSIXLK_COUNT, count);
+    if (ret < 0) {
+        gf_msg_debug(this->name, 0, " dict_set failed on key %s",
+                     GLUSTERFS_POSIXLK_COUNT);
+    }
+}
+
+/* Store get_inodelk_count(this, inode, domname) in dict under 'key'.
+ * With keep_max set, a value already in dict that is >= the count is kept. */
+void
+pl_inodelk_xattr_fill_each(xlator_t *this, inode_t *inode, dict_t *dict,
+                           char *domname, gf_boolean_t keep_max, char *key)
+{
+    int32_t count = 0;
+    int32_t maxcount = -1;
+    int ret = -1;
+
+    if (keep_max) {
+        ret = dict_get_int32(dict, key, &maxcount);
+        if (ret < 0)
+            gf_msg_debug(this->name, 0, " Failed to fetch the value for key %s",
+                         key);
+    }
+    count = get_inodelk_count(this, inode, domname);
+    if (maxcount >= count)
+        return;
+
+    ret = dict_set_int32(dict, key, count);
+    if (ret < 0) {
+        gf_msg_debug(this->name, 0,
+                     "Failed to set count for "
+                     "key %s",
+                     key);
+    }
+}
 
 static int
-truncate_stat_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
-		   int32_t op_ret, int32_t op_errno, struct stat *buf)
-{
-	posix_locks_private_t *priv = NULL;
-	struct _truncate_ops  *local = NULL;
-	inode_t               *inode = NULL;
-	pl_inode_t            *pl_inode = NULL;
-
-
-	priv = this->private;
-	local = frame->local;
-
-	if (op_ret != 0) {
-		gf_log (this->name, GF_LOG_ERROR, 
-			"got error (errno=%d, stderror=%s) from child", 
-			op_errno, strerror (op_errno));
-		goto unwind;
-	}
-
-	if (local->op == TRUNCATE)
-		inode = local->loc.inode;
-	else
-		inode = local->fd->inode;
-
-	pl_inode = pl_inode_get (this, inode);
-	if (!pl_inode) {
-		gf_log (this->name, GF_LOG_ERROR,
-			"unable to get pl_inode from %p", inode);
-		op_errno = ENOMEM;
-		goto unwind;
-	}
-
-	if (priv->mandatory
-	    && pl_inode->mandatory
-	    && !truncate_allowed (pl_inode, frame->root->trans,
-				  frame->root->pid, local->offset)) {
-		op_errno = EAGAIN;
-		goto unwind;
-	}
-
-	switch (local->op) {
-	case TRUNCATE:
-		STACK_WIND (frame, pl_truncate_cbk, FIRST_CHILD (this),
-			    FIRST_CHILD (this)->fops->truncate,
-			    &local->loc, local->offset);
-		break;
-	case FTRUNCATE:
-		STACK_WIND (frame, pl_truncate_cbk, FIRST_CHILD (this),
-			    FIRST_CHILD (this)->fops->ftruncate,
-			    local->fd, local->offset);
-		break;
-	}
-
-	return 0;
+pl_inodelk_xattr_fill_multiple(dict_t *this, char *key, data_t *value,
+                               void *data)
+{
+    multi_dom_lk_data *d = data;
+    char *tmp_key = NULL;
+    char *save_ptr = NULL;
+
+    /* Key is expected to be "<prefix>:<domain>". strtok_r() modifies its
+     * input, so split a copy; save_ptr then addresses the domain part. */
+    tmp_key = gf_strdup(key);
+    if (!tmp_key)
+        return -1;
+
+    strtok_r(tmp_key, ":", &save_ptr);
+    if (!*save_ptr) {
+        GF_FREE(tmp_key);
+        gf_msg(THIS->name, GF_LOG_ERROR, EINVAL, 0,
+               "Could not tokenize domain string from key %s", key);
+        return -1;
+    }
+
+    pl_inodelk_xattr_fill_each(d->this, d->inode, d->xdata_rsp, save_ptr,
+                               d->keep_max, key);
+    GF_FREE(tmp_key);
+
+    return 0;
+}
 
-unwind:
-	if (local->op == TRUNCATE)
-		loc_wipe (&local->loc);
+void
+pl_fill_multiple_dom_lk_requests(xlator_t *this, pl_local_t *local,
+                                 inode_t *inode, dict_t *dict,
+                                 gf_boolean_t keep_max)
+{
+    multi_dom_lk_data data;
+
+    data.this = this;
+    data.inode = inode;
+    data.xdata_rsp = dict;
+    data.keep_max = keep_max;
+
+    dict_foreach_fnmatch(local->xdata, GLUSTERFS_INODELK_DOM_PREFIX "*",
+                         pl_inodelk_xattr_fill_multiple, &data);
+}
+
+/* Fill the response dict @xdata with the lock information that was requested
+ * for this fop, as recorded in the *_req fields of @local.
+ *
+ * Does nothing when either @xdata or @local is NULL.  The parent entrylk
+ * information needs @parent and a non-empty @name; every other kind of
+ * information needs @inode.  @max_lock is forwarded unchanged to each of the
+ * fill helpers.
+ */
+void
+pl_set_xdata_response(xlator_t *this, pl_local_t *local, inode_t *parent,
+                      inode_t *inode, char *name, dict_t *xdata,
+                      gf_boolean_t max_lock)
+{
+    if (!xdata || !local)
+        return;
+
+    /* parent entrylk info is the only part that does not need @inode */
+    if (local->parent_entrylk_req && parent && name && name[0] != '\0')
+        pl_parent_entrylk_xattr_fill(this, parent, name, xdata, max_lock);
+
+    if (!inode)
+        return;
+
+    if (local->entrylk_count_req)
+        pl_entrylk_xattr_fill(this, inode, xdata, max_lock);
 
-	STACK_UNWIND (frame, -1, ENOMEM, buf);
-	return 0;
+    /* inodelk count restricted to the domain named in the request */
+    if (local->inodelk_dom_count_req)
+        pl_inodelk_xattr_fill(this, inode, xdata,
+                              data_to_str(local->inodelk_dom_count_req),
+                              max_lock);
+
+    /* inodelk count with no domain given (NULL domain) */
+    if (local->inodelk_count_req)
+        pl_inodelk_xattr_fill(this, inode, xdata, NULL, max_lock);
+
+    if (local->posixlk_count_req)
+        pl_posixlk_xattr_fill(this, inode, xdata, max_lock);
+
+    if (local->multiple_dom_lk_requests)
+        pl_fill_multiple_dom_lk_requests(this, local, inode, xdata, max_lock);
+}
+
+/* Decide whether a fop acting on @region may proceed right now.
+ *
+ * Returns 1 when __rw_allowable() reports no conflict for @op; in that case
+ * *can_block is left untouched.  Returns 0 on conflict and sets *can_block:
+ *   - _gf_false when mandatory locking is enforced on the inode, or when
+ *     there is no fd, or the fd was opened with O_NONBLOCK (fail the fop);
+ *   - _gf_true otherwise (the fop may be queued until the lock goes away).
+ */
+int
+pl_is_fop_allowed(pl_inode_t *pl_inode, posix_lock_t *region, fd_t *fd,
+                  glusterfs_fop_t op, gf_boolean_t *can_block)
+{
+    if (__rw_allowable(pl_inode, region, op))
+        return 1;
+
+    if (pl_inode->mlock_enforced) {
+        *can_block = _gf_false;
+        return 0;
+    }
+
+    if (fd && !(fd->flags & O_NONBLOCK)) {
+        /* blocking fd: the caller is allowed to wait for the region */
+        *can_block = _gf_true;
+        return 0;
+    }
+
+    gf_log("locks", GF_LOG_TRACE,
+           "returning EAGAIN"
+           " because fd is O_NONBLOCK");
+    *can_block = _gf_false;
+    return 0;
+}
+
+/* Allocate a pl_fdctx_t and initialise its locks_list head.
+ *
+ * Returns the new context, or NULL when the allocation fails.  Only
+ * locks_list is initialised here; GF_MALLOC is not followed by any zeroing.
+ */
+static pl_fdctx_t *
+pl_new_fdctx()
+{
+    pl_fdctx_t *fdctx = NULL;
+
+    fdctx = GF_MALLOC(sizeof(*fdctx), gf_locks_mt_pl_fdctx_t);
+    /* bail out before touching the list head if the allocation failed */
+    GF_VALIDATE_OR_GOTO("posix-locks", fdctx, done);
+
+    INIT_LIST_HEAD(&fdctx->locks_list);
+
+done:
+    return fdctx;
 }
 
+/* Return the locks-xlator context attached to @fd, creating and attaching a
+ * fresh one when the fd does not carry one yet.
+ *
+ * The lookup and the attach happen under fd->lock, so two callers cannot
+ * both attach a context to the same fd.
+ *
+ * Returns the (existing or new) context, or NULL when @this/@fd is NULL,
+ * when the allocation fails, or when the context cannot be stored on the fd.
+ */
+static pl_fdctx_t *
+pl_check_n_create_fdctx(xlator_t *this, fd_t *fd)
+{
+    int ret = 0;
+    uint64_t tmp = 0;
+    pl_fdctx_t *fdctx = NULL;
+
+    GF_VALIDATE_OR_GOTO("posix-locks", this, out);
+    GF_VALIDATE_OR_GOTO(this->name, fd, out);
+
+    LOCK(&fd->lock);
+    {
+        ret = __fd_ctx_get(fd, this, &tmp);
+        if ((ret == 0) && (tmp != 0)) {
+            /* A context is already attached: hand it back untouched.
+             * Falling through to __fd_ctx_set() with fdctx == NULL would
+             * overwrite the stored pointer with 0 and leak the context. */
+            fdctx = (pl_fdctx_t *)(long)tmp;
+            goto unlock;
+        }
+
+        fdctx = pl_new_fdctx();
+        if (fdctx == NULL) {
+            goto unlock;
+        }
+
+        ret = __fd_ctx_set(fd, this, (uint64_t)(long)fdctx);
+        if (ret != 0) {
+            GF_FREE(fdctx);
+            fdctx = NULL;
+            /* drop the lock before logging; skip the common unlock below */
+            UNLOCK(&fd->lock);
+            gf_log(this->name, GF_LOG_DEBUG, "failed to set fd ctx");
+            goto out;
+        }
+    }
+unlock:
+    UNLOCK(&fd->lock);
+
+out:
+    return fdctx;
+}
+
+/* Completion callback for a discard wound by pl_discard()/pl_discard_cont().
+ *
+ * Records the end of the I/O fop via pl_track_io_fop_count(DECREMENT) and
+ * unwinds the child's result to the caller unchanged.
+ */
+int32_t
+pl_discard_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+               int32_t op_ret, int32_t op_errno, struct iatt *prebuf,
+               struct iatt *postbuf, dict_t *xdata)
+{
+    pl_local_t *local = frame->local;
+
+    pl_track_io_fop_count(local, this, DECREMENT);
+
+    PL_STACK_UNWIND(discard, xdata, frame, op_ret, op_errno, prebuf, postbuf,
+                    xdata);
+    return 0;
+}
 
+/* Continuation used for a discard that pl_discard() had to queue: this is
+ * the function stored in the call stub.  It records the start of the I/O fop
+ * via pl_track_io_fop_count(INCREMENT) and winds the discard to the first
+ * child.
+ */
 int
-pl_truncate (call_frame_t *frame, xlator_t *this,
-	     loc_t *loc, off_t offset)
+pl_discard_cont(call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset,
+                size_t len, dict_t *xdata)
+{
+    pl_track_io_fop_count(frame->local, this, INCREMENT);
+
+    STACK_WIND(frame, pl_discard_cbk, FIRST_CHILD(this),
+               FIRST_CHILD(this)->fops->discard, fd, offset, len, xdata);
+    return 0;
+}
+
+/* discard fop entry point.
+ *
+ * When mandatory locking applies to the inode (and the request does not come
+ * from an internal client, i.e. pid >= 0), the byte range
+ * [offset, offset + len - 1] is checked against existing locks:
+ *   - no conflict: the fop is wound to the first child;
+ *   - conflict, blocking not allowed: unwound with -1/EAGAIN;
+ *   - conflict, blocking allowed: a stub resuming in pl_discard_cont() is
+ *     appended to pl_inode->rw_list and nothing is wound or unwound here.
+ * Allocation failures are unwound with -1/ENOMEM.
+ *
+ * Always returns 0; the result travels through the frame.
+ */
+int32_t
+pl_discard(call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset,
+           size_t len, dict_t *xdata)
 {
-	struct _truncate_ops *local = NULL;
+    pl_local_t *local = NULL;
+    pl_inode_t *pl_inode = NULL;
+    pl_rw_req_t *rw = NULL;
+    posix_lock_t region = {
+        .list =
+            {
+                0,
+            },
+    };
+    gf_boolean_t enabled = _gf_false;
+    gf_boolean_t can_block = _gf_true;
+    int op_ret = 0;
+    int op_errno = 0;
+    int allowed = 1;
+
+    /* NOTE(review): on this path op_ret is still 0, so nothing is unwound */
+    GF_VALIDATE_OR_GOTO("locks", this, unwind);
+
+    local = mem_get0(this->local_pool);
+    if (!local) {
+        op_ret = -1;
+        op_errno = ENOMEM;
+        goto unwind;
+    }
+
+    /* local keeps its own references on the inode and the fd */
+    frame->local = local;
+    local->inode = inode_ref(fd->inode);
+    local->fd = fd_ref(fd);
+
+    pl_inode = pl_inode_get(this, fd->inode, local);
+    if (!pl_inode) {
+        op_ret = -1;
+        op_errno = ENOMEM;
+        goto unwind;
+    }
+
+    /* requests with a negative pid bypass the mandatory-lock check */
+    if (frame->root->pid < 0)
+        enabled = _gf_false;
+    else
+        enabled = pl_is_mandatory_locking_enabled(pl_inode);
+
+    if (enabled) {
+        /* describe the affected range; fl_end is inclusive */
+        region.fl_start = offset;
+        region.fl_end = offset + len - 1;
+        region.client = frame->root->client;
+        region.fd_num = fd_to_fdnum(fd);
+        region.client_pid = frame->root->pid;
+        region.owner = frame->root->lk_owner;
+
+        pthread_mutex_lock(&pl_inode->mutex);
+        {
+            allowed = pl_is_fop_allowed(pl_inode, &region, fd, GF_FOP_DISCARD,
+                                        &can_block);
+            if (allowed == 1) {
+                /* count the wind here, while the inode mutex is held */
+                if (pl_inode->mlock_enforced &&
+                    pl_inode->track_fop_wind_count) {
+                    pl_inode->fop_wind_count++;
+                }
+                goto unlock;
+            } else if (!can_block) {
+                op_errno = EAGAIN;
+                op_ret = -1;
+                goto unlock;
+            }
+
+            /* conflict, but the caller may wait: queue the fop as a stub */
+            rw = GF_MALLOC(sizeof(*rw), gf_locks_mt_pl_rw_req_t);
+            if (!rw) {
+                op_errno = ENOMEM;
+                op_ret = -1;
+                goto unlock;
+            }
+
+            rw->stub = fop_discard_stub(frame, pl_discard_cont, fd, offset, len,
+                                        xdata);
+            if (!rw->stub) {
+                op_errno = ENOMEM;
+                op_ret = -1;
+                GF_FREE(rw);
+                goto unlock;
+            }
+
+            rw->region = region;
+
+            list_add_tail(&rw->list, &pl_inode->rw_list);
+        }
+    unlock:
+        pthread_mutex_unlock(&pl_inode->mutex);
+    }
+
+    /* allowed stays 1 when mandatory locking is not in effect */
+    if (allowed == 1)
+        STACK_WIND(frame, pl_discard_cbk, FIRST_CHILD(this),
+                   FIRST_CHILD(this)->fops->discard, fd, offset, len, xdata);
+unwind:
+    if (op_ret == -1)
+        PL_STACK_UNWIND(discard, xdata, frame, op_ret, op_errno, NULL, NULL,
+                        NULL);
 
-	local = CALLOC (1, sizeof (struct _truncate_ops));
-	if (!local) {
-		gf_log (this->name, GF_LOG_ERROR,
-			"out of memory :(");
-		goto unwind;
-	}
+    return 0;
+}
 
-	local->op         = TRUNCATE;
-	local->offset     = offset;
-	loc_copy (&local->loc, loc);
+/* Completion callback for a zerofill wound by pl_zerofill() or
+ * pl_zerofill_cont(): records the end of the I/O fop via
+ * pl_track_io_fop_count(DECREMENT) and unwinds the child's result unchanged.
+ */
+int32_t
+pl_zerofill_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+                int32_t op_ret, int32_t op_errno, struct iatt *prebuf,
+                struct iatt *postbuf, dict_t *xdata)
+{
+    pl_track_io_fop_count(frame->local, this, DECREMENT);
 
-	frame->local = local;
+    PL_STACK_UNWIND(zerofill, xdata, frame, op_ret, op_errno, prebuf, postbuf,
+                    xdata);
+    return 0;
+}
 
-	STACK_WIND (frame, truncate_stat_cbk, FIRST_CHILD (this),
-		    FIRST_CHILD (this)->fops->stat, loc);
+/* Continuation used for a zerofill that pl_zerofill() had to queue: this is
+ * the function stored in the call stub.  It records the start of the I/O fop
+ * via pl_track_io_fop_count(INCREMENT) and winds the zerofill to the first
+ * child.
+ */
+int
+pl_zerofill_cont(call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset,
+                 off_t len, dict_t *xdata)
+{
+    pl_track_io_fop_count(frame->local, this, INCREMENT);
 
-	return 0;
+    STACK_WIND(frame, pl_zerofill_cbk, FIRST_CHILD(this),
+               FIRST_CHILD(this)->fops->zerofill, fd, offset, len, xdata);
+    return 0;
+}
 
+/* zerofill fop entry point.
+ *
+ * When mandatory locking applies to the inode (and the request does not come
+ * from an internal client, i.e. pid >= 0), the byte range
+ * [offset, offset + len - 1] is checked against existing locks:
+ *   - no conflict: the fop is wound to the first child;
+ *   - conflict, blocking not allowed: unwound with -1/EAGAIN;
+ *   - conflict, blocking allowed: a stub resuming in pl_zerofill_cont() is
+ *     appended to pl_inode->rw_list and nothing is wound or unwound here.
+ * Allocation failures are unwound with -1/ENOMEM.
+ *
+ * Always returns 0; the result travels through the frame.
+ */
+int32_t
+pl_zerofill(call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset,
+            off_t len, dict_t *xdata)
+{
+    pl_local_t *local = NULL;
+    pl_inode_t *pl_inode = NULL;
+    pl_rw_req_t *rw = NULL;
+    posix_lock_t region = {
+        .list =
+            {
+                0,
+            },
+    };
+    gf_boolean_t enabled = _gf_false;
+    gf_boolean_t can_block = _gf_true;
+    int op_ret = 0;
+    int op_errno = 0;
+    int allowed = 1;
+
+    /* NOTE(review): on this path op_ret is still 0, so nothing is unwound */
+    GF_VALIDATE_OR_GOTO("locks", this, unwind);
+
+    local = mem_get0(this->local_pool);
+    if (!local) {
+        op_ret = -1;
+        op_errno = ENOMEM;
+        goto unwind;
+    }
+
+    /* local keeps its own references on the inode and the fd */
+    frame->local = local;
+    local->inode = inode_ref(fd->inode);
+    local->fd = fd_ref(fd);
+
+    pl_inode = pl_inode_get(this, fd->inode, local);
+    if (!pl_inode) {
+        op_ret = -1;
+        op_errno = ENOMEM;
+        goto unwind;
+    }
+
+    /* requests with a negative pid bypass the mandatory-lock check */
+    if (frame->root->pid < 0)
+        enabled = _gf_false;
+    else
+        enabled = pl_is_mandatory_locking_enabled(pl_inode);
+
+    if (enabled) {
+        /* describe the affected range; fl_end is inclusive */
+        region.fl_start = offset;
+        region.fl_end = offset + len - 1;
+        region.client = frame->root->client;
+        region.fd_num = fd_to_fdnum(fd);
+        region.client_pid = frame->root->pid;
+        region.owner = frame->root->lk_owner;
+
+        pthread_mutex_lock(&pl_inode->mutex);
+        {
+            allowed = pl_is_fop_allowed(pl_inode, &region, fd, GF_FOP_ZEROFILL,
+                                        &can_block);
+            if (allowed == 1) {
+                /* count the wind here, while the inode mutex is held */
+                if (pl_inode->mlock_enforced &&
+                    pl_inode->track_fop_wind_count) {
+                    pl_inode->fop_wind_count++;
+                }
+                goto unlock;
+            } else if (!can_block) {
+                op_errno = EAGAIN;
+                op_ret = -1;
+                goto unlock;
+            }
+
+            /* conflict, but the caller may wait: queue the fop as a stub */
+            rw = GF_MALLOC(sizeof(*rw), gf_locks_mt_pl_rw_req_t);
+            if (!rw) {
+                op_errno = ENOMEM;
+                op_ret = -1;
+                goto unlock;
+            }
+
+            rw->stub = fop_zerofill_stub(frame, pl_zerofill_cont, fd, offset,
+                                         len, xdata);
+            if (!rw->stub) {
+                op_errno = ENOMEM;
+                op_ret = -1;
+                GF_FREE(rw);
+                goto unlock;
+            }
+
+            rw->region = region;
+
+            list_add_tail(&rw->list, &pl_inode->rw_list);
+        }
+    unlock:
+        pthread_mutex_unlock(&pl_inode->mutex);
+    }
+
+    /* allowed stays 1 when mandatory locking is not in effect */
+    if (allowed == 1)
+        STACK_WIND(frame, pl_zerofill_cbk, FIRST_CHILD(this),
+                   FIRST_CHILD(this)->fops->zerofill, fd, offset, len, xdata);
 unwind:
-	STACK_UNWIND (frame, -1, ENOMEM, NULL);
+    if (op_ret == -1)
+        PL_STACK_UNWIND(zerofill, xdata, frame, op_ret, op_errno, NULL, NULL,
+                        NULL);
 
-	return 0;
+    return 0;
 }
 
+/* Shared completion callback for truncate and ftruncate.
+ *
+ * Records the end of the I/O fop via pl_track_io_fop_count(DECREMENT), then
+ * unwinds with the fop type stored in local->op: truncate when it is
+ * GF_FOP_TRUNCATE, ftruncate for anything else.
+ */
+int
+pl_truncate_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+                int32_t op_ret, int32_t op_errno, struct iatt *prebuf,
+                struct iatt *postbuf, dict_t *xdata)
+{
+    pl_local_t *local = NULL;
+
+    local = frame->local;
+    pl_track_io_fop_count(local, this, DECREMENT);
+
+    if (local->op != GF_FOP_TRUNCATE) {
+        PL_STACK_UNWIND(ftruncate, xdata, frame, op_ret, op_errno, prebuf,
+                        postbuf, xdata);
+        return 0;
+    }
+
+    PL_STACK_UNWIND(truncate, xdata, frame, op_ret, op_errno, prebuf, postbuf,
+                    xdata);
+    return 0;
+}
 
+/* Continuation used for an ftruncate that truncate_stat_cbk() had to queue:
+ * this is the function stored in the call stub.  It records the start of the
+ * I/O fop via pl_track_io_fop_count(INCREMENT) and winds the ftruncate to
+ * the first child.
+ */
 int
-pl_ftruncate (call_frame_t *frame, xlator_t *this,
-	      fd_t *fd, off_t offset)
+pl_ftruncate_cont(call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset,
+                  dict_t *xdata)
 {
-	struct _truncate_ops *local = NULL;
+    pl_track_io_fop_count(frame->local, this, INCREMENT);
 
-	local = CALLOC (1, sizeof (struct _truncate_ops));
-	if (!local) {
-		gf_log (this->name, GF_LOG_ERROR,
-			"out of memory :(");
-		goto unwind;
-	}
+    STACK_WIND(frame, pl_truncate_cbk, FIRST_CHILD(this),
+               FIRST_CHILD(this)->fops->ftruncate, fd, offset, xdata);
+    return 0;
+}
 
-	local->op         = FTRUNCATE;
-	local->offset     = offset;
-	local->fd         = fd;
+/* Continuation used for a truncate that truncate_stat_cbk() had to queue:
+ * this is the function stored in the call stub.  It records the start of the
+ * I/O fop via pl_track_io_fop_count(INCREMENT) and winds the truncate to the
+ * first child.
+ */
+int
+pl_truncate_cont(call_frame_t *frame, xlator_t *this, loc_t *loc, off_t offset,
+                 dict_t *xdata)
+{
+    pl_track_io_fop_count(frame->local, this, INCREMENT);
 
-	frame->local = local;
+    STACK_WIND(frame, pl_truncate_cbk, FIRST_CHILD(this),
+               FIRST_CHILD(this)->fops->truncate, loc, offset, xdata);
+    return 0;
+}
 
-	STACK_WIND (frame, truncate_stat_cbk, FIRST_CHILD(this),
-		    FIRST_CHILD(this)->fops->fstat, fd);
-	return 0;
+/* Callback of the stat/fstat that pl_truncate()/pl_ftruncate() issue before
+ * the real truncate.
+ *
+ * On a successful stat it performs the mandatory-lock check for the range
+ * [local->offset, LLONG_MAX] and then either
+ *   - winds the actual truncate/ftruncate (no conflict, or mandatory locking
+ *     not in effect),
+ *   - unwinds with -1/EAGAIN (conflict, blocking not allowed), or
+ *   - queues a stub on pl_inode->rw_list (conflict, blocking allowed).
+ * A failed stat, or an allocation failure, is unwound as an error with the
+ * fop type taken from local->op.
+ */
+static int
+truncate_stat_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+                  int32_t op_ret, int32_t op_errno, struct iatt *buf,
+                  dict_t *xdata)
+{
+    pl_local_t *local = frame->local;
+    inode_t *inode = NULL;
+    pl_inode_t *pl_inode = NULL;
+    pl_rw_req_t *rw = NULL;
+    posix_lock_t region = {
+        .list =
+            {
+                0,
+            },
+    };
+    gf_boolean_t enabled = _gf_false;
+    gf_boolean_t can_block = _gf_true;
+    int allowed = 1;
+
+    GF_VALIDATE_OR_GOTO("locks", this, unwind);
+
+    /* the stat itself failed: report it and unwind with the child's error */
+    if (op_ret != 0) {
+        gf_log(this->name, GF_LOG_ERROR,
+               "got error (errno=%d, stderror=%s) from child", op_errno,
+               strerror(op_errno));
+        goto unwind;
+    }
+
+    /* truncate carries a loc, ftruncate carries an fd */
+    if (local->op == GF_FOP_TRUNCATE)
+        inode = local->loc[0].inode;
+    else
+        inode = local->fd->inode;
+
+    local->inode = inode_ref(inode);
+
+    pl_inode = pl_inode_get(this, inode, local);
+    if (!pl_inode) {
+        op_ret = -1;
+        op_errno = ENOMEM;
+        goto unwind;
+    }
+
+    /* requests with a negative pid bypass the mandatory-lock check */
+    if (frame->root->pid < 0)
+        enabled = _gf_false;
+    else
+        enabled = pl_is_mandatory_locking_enabled(pl_inode);
+
+    if (enabled) {
+        /* the checked range runs from the new size to LLONG_MAX */
+        region.fl_start = local->offset;
+        region.fl_end = LLONG_MAX;
+        region.client = frame->root->client;
+        /* NOTE(review): local->fd is not set by pl_truncate() - confirm
+         * fd_to_fdnum() and pl_is_fop_allowed() are fine with a NULL fd */
+        region.fd_num = fd_to_fdnum(local->fd);
+        region.client_pid = frame->root->pid;
+        region.owner = frame->root->lk_owner;
+        pthread_mutex_lock(&pl_inode->mutex);
+        {
+            allowed = pl_is_fop_allowed(pl_inode, &region, local->fd, local->op,
+                                        &can_block);
+
+            if (allowed == 1) {
+                /* count the wind here, while the inode mutex is held */
+                if (pl_inode->mlock_enforced &&
+                    pl_inode->track_fop_wind_count) {
+                    pl_inode->fop_wind_count++;
+                }
+                goto unlock;
+            } else if (!can_block) {
+                op_errno = EAGAIN;
+                op_ret = -1;
+                goto unlock;
+            }
+
+            /* conflict, but the caller may wait: queue the fop as a stub */
+            rw = GF_MALLOC(sizeof(*rw), gf_locks_mt_pl_rw_req_t);
+            if (!rw) {
+                op_errno = ENOMEM;
+                op_ret = -1;
+                goto unlock;
+            }
+
+            if (local->op == GF_FOP_TRUNCATE)
+                rw->stub = fop_truncate_stub(frame, pl_truncate_cont,
+                                             &local->loc[0], local->offset,
+                                             local->xdata);
+            else
+                rw->stub = fop_ftruncate_stub(frame, pl_ftruncate_cont,
+                                              local->fd, local->offset,
+                                              local->xdata);
+            if (!rw->stub) {
+                op_errno = ENOMEM;
+                op_ret = -1;
+                GF_FREE(rw);
+                goto unlock;
+            }
+
+            rw->region = region;
+
+            list_add_tail(&rw->list, &pl_inode->rw_list);
+        }
+    unlock:
+        pthread_mutex_unlock(&pl_inode->mutex);
+    }
+
+    /* allowed stays 1 when mandatory locking is not in effect */
+    if (allowed == 1) {
+        switch (local->op) {
+            case GF_FOP_TRUNCATE:
+                STACK_WIND(frame, pl_truncate_cbk, FIRST_CHILD(this),
+                           FIRST_CHILD(this)->fops->truncate, &local->loc[0],
+                           local->offset, local->xdata);
+                break;
+            case GF_FOP_FTRUNCATE:
+                STACK_WIND(frame, pl_truncate_cbk, FIRST_CHILD(this),
+                           FIRST_CHILD(this)->fops->ftruncate, local->fd,
+                           local->offset, local->xdata);
+                break;
+            default:
+                break;
+        }
+    }
+unwind:
+    /* only an op_ret of exactly -1 is unwound here */
+    if (op_ret == -1) {
+        gf_log(this ? this->name : "locks", GF_LOG_ERROR,
+               "truncate failed with "
+               "ret: %d, error: %s",
+               op_ret, strerror(op_errno));
+
+        switch (local->op) {
+            case GF_FOP_TRUNCATE:
+                PL_STACK_UNWIND(truncate, xdata, frame, op_ret, op_errno, buf,
+                                NULL, xdata);
+                break;
+            case GF_FOP_FTRUNCATE:
+                PL_STACK_UNWIND(ftruncate, xdata, frame, op_ret, op_errno, buf,
+                                NULL, xdata);
+                break;
+            default:
+                break;
+        }
+    }
+    return 0;
+}
+
+/* truncate fop entry point.
+ *
+ * Saves the request (fop type, offset, a copy of @loc and a reference on
+ * @xdata) in a freshly allocated local and winds a stat to the first child;
+ * the work continues in truncate_stat_cbk().  When @this is NULL or the
+ * local cannot be allocated the fop is unwound with -1/ENOMEM.
+ *
+ * Always returns 0; the result travels through the frame.
+ */
+int
+pl_truncate(call_frame_t *frame, xlator_t *this, loc_t *loc, off_t offset,
+            dict_t *xdata)
+{
+    pl_local_t *local = NULL;
+    int ret = -1;
+
+    GF_VALIDATE_OR_GOTO("locks", this, unwind);
+
+    local = mem_get0(this->local_pool);
+    GF_VALIDATE_OR_GOTO(this->name, local, unwind);
+
+    local->op = GF_FOP_TRUNCATE;
+    local->offset = offset;
+    loc_copy(&local->loc[0], loc);
+    if (xdata)
+        local->xdata = dict_ref(xdata);
+
+    frame->local = local;
+
+    /* the stat is wound with NULL xdata; the request's xdata is kept in local */
+    STACK_WIND(frame, truncate_stat_cbk, FIRST_CHILD(this),
+               FIRST_CHILD(this)->fops->stat, loc, NULL);
+    ret = 0;
 
 unwind:
-	STACK_UNWIND (frame, -1, ENOMEM, NULL);
+    /* ret is still -1 only when one of the validations above failed */
+    if (ret == -1) {
+        gf_log(this ? this->name : "locks", GF_LOG_ERROR,
+               "truncate on %s failed with"
+               " ret: %d, error: %s",
+               loc->path, -1, strerror(ENOMEM));
+        STACK_UNWIND_STRICT(truncate, frame, -1, ENOMEM, NULL, NULL, NULL);
+    }
+    return 0;
+}
+
+/* ftruncate fop entry point.
+ *
+ * Saves the request (fop type, offset, a reference on @fd and on @xdata) in
+ * a freshly allocated local and winds an fstat to the first child; the work
+ * continues in truncate_stat_cbk().  When @this is NULL or the local cannot
+ * be allocated the fop is unwound with -1/ENOMEM.
+ *
+ * Always returns 0; the result travels through the frame.
+ */
+int
+pl_ftruncate(call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset,
+             dict_t *xdata)
+{
+    pl_local_t *local = NULL;
+    int ret = -1;
+
+    GF_VALIDATE_OR_GOTO("locks", this, unwind);
+    local = mem_get0(this->local_pool);
+    GF_VALIDATE_OR_GOTO(this->name, local, unwind);
 
-	return 0;
+    local->op = GF_FOP_FTRUNCATE;
+    local->offset = offset;
+    local->fd = fd_ref(fd);
+    if (xdata)
+        local->xdata = dict_ref(xdata);
+
+    frame->local = local;
+
+    /* unlike pl_truncate(), the request's xdata is passed on to the fstat */
+    STACK_WIND(frame, truncate_stat_cbk, FIRST_CHILD(this),
+               FIRST_CHILD(this)->fops->fstat, fd, xdata);
+    ret = 0;
+unwind:
+    /* ret is still -1 only when one of the validations above failed */
+    if (ret == -1) {
+        gf_log(this ? this->name : "locks", GF_LOG_ERROR,
+               "ftruncate failed with"
+               " ret: %d, error: %s",
+               -1, strerror(ENOMEM));
+        STACK_UNWIND_STRICT(ftruncate, frame, -1, ENOMEM, NULL, NULL, NULL);
+    }
+    return 0;
 }
 
+/* Tell whether any lock on pl_inode->ext_list belongs to @fd.
+ *
+ * The list is walked with pl_inode->mutex held.  Returns 1 as soon as a lock
+ * whose fd_num equals fd_to_fdnum(fd) is found, 0 when there is none.
+ */
+int
+pl_locks_by_fd(pl_inode_t *pl_inode, fd_t *fd)
+{
+    posix_lock_t *lock = NULL;
+    int has_lock = 0;
+
+    pthread_mutex_lock(&pl_inode->mutex);
+    {
+        list_for_each_entry(lock, &pl_inode->ext_list, list)
+        {
+            if (lock->fd_num != fd_to_fdnum(fd))
+                continue;
+
+            has_lock = 1;
+            break;
+        }
+    }
+    pthread_mutex_unlock(&pl_inode->mutex);
+
+    return has_lock;
+}
 
+/* Drop every lock on pl_inode->ext_list that belongs to @fd.
+ *
+ * Granted locks are deleted and destroyed with the inode mutex held.
+ * Blocked locks are only moved to a private list under the mutex; their
+ * pending lk frames are unwound with -1/EAGAIN after the mutex has been
+ * released.  Finally blocked locks and blocked read/write requests on the
+ * inode are re-evaluated.
+ */
 static void
-__delete_locks_of_owner (pl_inode_t *pl_inode,
-			 transport_t *transport, pid_t pid)
+delete_locks_of_fd(xlator_t *this, pl_inode_t *pl_inode, fd_t *fd)
 {
-	posix_lock_t *tmp = NULL;
-	posix_lock_t *l = NULL;
+    posix_lock_t *tmp = NULL;
+    posix_lock_t *l = NULL;
+
+    /* blocked locks of @fd, collected here so they can be unwound unlocked */
+    struct list_head blocked_list;
+
+    INIT_LIST_HEAD(&blocked_list);
+
+    pthread_mutex_lock(&pl_inode->mutex);
+    {
+        list_for_each_entry_safe(l, tmp, &pl_inode->ext_list, list)
+        {
+            if (l->fd_num == fd_to_fdnum(fd)) {
+                if (l->blocked) {
+                    list_move_tail(&l->list, &blocked_list);
+                    continue;
+                }
+                __delete_lock(l);
+                __destroy_lock(l);
+            }
+        }
+    }
+    pthread_mutex_unlock(&pl_inode->mutex);
+
+    /* fail the waiters: each blocked lock still owns a pending lk frame */
+    list_for_each_entry_safe(l, tmp, &blocked_list, list)
+    {
+        list_del_init(&l->list);
+        STACK_UNWIND_STRICT(lk, l->frame, -1, EAGAIN, &l->user_flock, NULL);
+        __destroy_lock(l);
+    }
+
+    grant_blocked_locks(this, pl_inode);
+
+    do_blocked_rw(pl_inode);
+}
 
-	/* TODO: what if it is a blocked lock with pending l->frame */
+/* Remove every granted lock on pl_inode->ext_list held by the given
+ * (client, lk-owner) pair.  Blocked locks are skipped and left on the list.
+ *
+ * The leading double underscore follows the naming of __delete_lock() and
+ * __destroy_lock(); presumably the caller must hold pl_inode->mutex, since
+ * no lock is taken here - TODO confirm against callers.
+ */
+static void
+__delete_locks_of_owner(pl_inode_t *pl_inode, client_t *client,
+                        gf_lkowner_t *owner)
+{
+    posix_lock_t *lock = NULL;
+    posix_lock_t *next = NULL;
+
+    /* TODO: what if it is a blocked lock with pending l->frame */
+
+    list_for_each_entry_safe(lock, next, &pl_inode->ext_list, list)
+    {
+        if (lock->blocked)
+            continue;
+
+        if (lock->client != client)
+            continue;
+
+        if (!is_same_lkowner(&lock->owner, owner))
+            continue;
+
+        gf_log("posix-locks", GF_LOG_TRACE,
+               " Flushing lock"
+               "%s (pid=%d) (lk-owner=%s) %" PRId64 " - %" PRId64
+               " state: %s",
+               lock->fl_type == F_UNLCK ? "Unlock" : "Lock", lock->client_pid,
+               lkowner_utoa(&lock->owner), lock->user_flock.l_start,
+               lock->user_flock.l_len,
+               lock->blocked == 1 ? "Blocked" : "Active");
+
+        __delete_lock(lock);
+        __destroy_lock(lock);
+    }
+}
 
-	list_for_each_entry_safe (l, tmp, &pl_inode->ext_list, list) {
-		if ((l->transport == transport)
-		    && (l->client_pid == pid)) {
-			__delete_lock (pl_inode, l);
-			__destroy_lock (l);
-		}
-	}
+/* Completion callback for a getxattr wound by pl_getxattr(): passes the
+ * child's result (op_ret, op_errno, dict, xdata) to the caller unchanged.
+ */
+int32_t
+pl_getxattr_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+                int32_t op_ret, int32_t op_errno, dict_t *dict, dict_t *xdata)
+{
+    STACK_UNWIND_STRICT(getxattr, frame, op_ret, op_errno, dict, xdata);
+    return 0;
+}
 
-	list_for_each_entry_safe (l, tmp, &pl_inode->int_list, list) {
-		if ((l->transport == transport)
-		    && (l->client_pid == pid)) {
-			__delete_lock (pl_inode, l);
-			__destroy_lock (l);
-		}
-	}
+/* Handle a "clear locks" request that arrives as a virtual xattr name.
+ *
+ * @name is parsed by clrlk_parse_args(); depending on the parsed lock type
+ * the inode/entry locks (all domains) or the posix locks of @inode are
+ * cleared.  A summary string (brick name, lock type, number of blocked and
+ * granted locks cleared, or "No locks cleared.") is stored in a new dict
+ * under a copy of @name.
+ *
+ * @dict:     out; set to the newly created dict.  It is not unref'ed here,
+ *            even on failure.
+ * @op_errno: out; error code on failure.
+ *
+ * Returns 0 on success and non-zero on failure.
+ */
+static int32_t
+pl_getxattr_clrlk(xlator_t *this, const char *name, inode_t *inode,
+                  dict_t **dict, int32_t *op_errno)
+{
+    int32_t bcount = 0;
+    int32_t gcount = 0;
+    char *key = NULL;
+    char *lk_summary = NULL;
+    pl_inode_t *pl_inode = NULL;
+    clrlk_args args = {
+        0,
+    };
+    char *brickname = NULL;
+    int32_t op_ret = -1;
+
+    *op_errno = EINVAL;
+
+    if (clrlk_parse_args(name, &args)) {
+        *op_errno = EINVAL;
+        goto out;
+    }
+
+    *dict = dict_new();
+    if (!*dict) {
+        *op_errno = ENOMEM;
+        goto out;
+    }
+
+    pl_inode = pl_inode_get(this, inode, NULL);
+    if (!pl_inode) {
+        *op_errno = ENOMEM;
+        goto out;
+    }
+
+    /* bcount/gcount receive the number of blocked/granted locks cleared */
+    switch (args.type) {
+        case CLRLK_INODE:
+        case CLRLK_ENTRY:
+            op_ret = clrlk_clear_lks_in_all_domains(this, pl_inode, &args,
+                                                    &bcount, &gcount, op_errno);
+            break;
+        case CLRLK_POSIX:
+            op_ret = clrlk_clear_posixlk(this, pl_inode, &args, &bcount,
+                                         &gcount, op_errno);
+            break;
+        default:
+            op_ret = -1;
+            *op_errno = EINVAL;
+    }
+    if (op_ret) {
+        /* guard the clrlk_type_names[] lookup against an out-of-range type */
+        if (args.type >= CLRLK_TYPE_MAX) {
+            gf_log(this->name, GF_LOG_ERROR,
+                   "clear locks: invalid lock type %d", args.type);
+        } else {
+            gf_log(this->name, GF_LOG_ERROR,
+                   "clear locks of type %s failed: %s",
+                   clrlk_type_names[args.type], strerror(*op_errno));
+        }
+
+        goto out;
+    }
+
+    /* The brick name only decorates the summary: failing to obtain or to
+     * format it is logged and the xlator name is used instead. */
+    op_ret = fetch_pathinfo(this, inode, op_errno, &brickname);
+    if (op_ret) {
+        gf_log(this->name, GF_LOG_WARNING, "Couldn't get brickname");
+    } else {
+        op_ret = format_brickname(brickname);
+        if (op_ret) {
+            gf_log(this->name, GF_LOG_WARNING, "Couldn't format brickname");
+            GF_FREE(brickname);
+            brickname = NULL;
+        }
+    }
+
+    if (!gcount && !bcount) {
+        if (gf_asprintf(&lk_summary, "No locks cleared.") == -1) {
+            op_ret = -1;
+            *op_errno = ENOMEM;
+            goto out;
+        }
+    } else if (gf_asprintf(&lk_summary,
+                           "%s: %s blocked locks=%d "
+                           "granted locks=%d",
+                           (brickname == NULL) ? this->name : brickname,
+                           clrlk_type_names[args.type], bcount, gcount) == -1) {
+        op_ret = -1;
+        *op_errno = ENOMEM;
+        goto out;
+    }
+    gf_log(this->name, GF_LOG_DEBUG, "%s", lk_summary);
+
+    key = gf_strdup(name);
+    if (!key) {
+        /* NOTE(review): *op_errno is not updated on this path */
+        op_ret = -1;
+        goto out;
+    }
+    if (dict_set_dynstr(*dict, key, lk_summary)) {
+        op_ret = -1;
+        *op_errno = ENOMEM;
+        goto out;
+    }
+
+    op_ret = 0;
 
-	return;
+out:
+    GF_FREE(brickname);
+    GF_FREE(args.opts);
+    GF_FREE(key);
+    /* lk_summary is freed here only on failure; presumably the dict owns it
+     * after a successful dict_set_dynstr() - TODO confirm */
+    if (op_ret) {
+        GF_FREE(lk_summary);
+    }
+
+    return op_ret;
 }
 
+/* getxattr fop entry point.
+ *
+ * A @name starting with GF_XATTR_CLRLK_CMD is a clear-locks command: it is
+ * handled locally by pl_getxattr_clrlk() and the result is unwound without
+ * contacting the child.  Every other request (including a NULL @name) is
+ * wound to the first child unchanged.
+ */
+int32_t
+pl_getxattr(call_frame_t *frame, xlator_t *this, loc_t *loc, const char *name,
+            dict_t *xdata)
+{
+    int32_t op_errno = EINVAL;
+    int32_t op_ret = -1;
+    dict_t *dict = NULL;
 
-int
-pl_flush_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
-	      int32_t op_ret, int32_t op_errno)
+    if (!name)
+        goto usual;
+
+    /* strncmp() != 0: the name does not carry the clear-locks prefix */
+    if (strncmp(name, GF_XATTR_CLRLK_CMD, SLEN(GF_XATTR_CLRLK_CMD)))
+        goto usual;
+
+    op_ret = pl_getxattr_clrlk(this, name, loc->inode, &dict, &op_errno);
+
+    /* NOTE(review): the request's xdata is passed back as the reply xdata */
+    STACK_UNWIND_STRICT(getxattr, frame, op_ret, op_errno, dict, xdata);
+
+    if (dict)
+        dict_unref(dict);
+    return 0;
+
+usual:
+    STACK_WIND(frame, pl_getxattr_cbk, FIRST_CHILD(this),
+               FIRST_CHILD(this)->fops->getxattr, loc, name, xdata);
+    return 0;
+}
+
+/* Rewrite @brickname in place as "<hostname>:<volume>".
+ *
+ * The string is split with strtok_r(): the first ':'-separated token is
+ * discarded, the second one becomes the hostname, and the text that follows
+ * up to the next '.' becomes the volume.  The result is written back over
+ * the start of the same buffer.
+ *
+ * Returns 0 on success, -1 when @brickname is NULL or when either part
+ * could not be duplicated.
+ */
+static int
+format_brickname(char *brickname)
 {
-	STACK_UNWIND (frame, op_ret, op_errno);
+    int ret = -1;
+    char *hostname = NULL;
+    char *volume = NULL;
+    char *saveptr = NULL;
+
+    if (!brickname)
+        goto out;
 
-	return 0;
+    /* skip the first token */
+    strtok_r(brickname, ":", &saveptr);
+    /* NOTE(review): strtok_r() returns NULL when a token is missing; this
+     * relies on gf_strdup() returning NULL for a NULL argument - confirm */
+    hostname = gf_strdup(strtok_r(NULL, ":", &saveptr));
+    if (hostname == NULL)
+        goto out;
+    volume = gf_strdup(strtok_r(NULL, ".", &saveptr));
+    if (volume == NULL)
+        goto out;
+
+    /* hostname and volume are private copies, so overwriting the buffer
+     * they were parsed from is safe */
+    sprintf(brickname, "%s:%s", hostname, volume);
+
+    ret = 0;
+out:
+    GF_FREE(hostname);
+    GF_FREE(volume);
+    return ret;
 }
 
+/* Fetch the GF_XATTR_PATHINFO_KEY xattr of @inode from the first child and
+ * return a private copy of its string value.
+ *
+ * @op_errno:  out; set to a positive errno on every failure after the
+ *             argument checks.
+ * @brickname: out; on success points to a string allocated with gf_strdup()
+ *             that the caller must release with GF_FREE().
+ *
+ * Returns 0 on success and a non-zero value on failure (including a NULL
+ * @brickname or @op_errno).
+ */
+static int
+fetch_pathinfo(xlator_t *this, inode_t *inode, int32_t *op_errno,
+               char **brickname)
+{
+    int ret = -1;
+    loc_t loc = {
+        0,
+    };
+    dict_t *dict = NULL;
+
+    if (!brickname)
+        goto out;
+
+    if (!op_errno)
+        goto out;
+
+    /* address the inode by gfid; the reference is dropped by loc_wipe() */
+    gf_uuid_copy(loc.gfid, inode->gfid);
+    loc.inode = inode_ref(inode);
+
+    ret = syncop_getxattr(FIRST_CHILD(this), &loc, &dict, GF_XATTR_PATHINFO_KEY,
+                          NULL, NULL);
+    if (ret < 0) {
+        *op_errno = -ret;
+        ret = -1;
+        goto out;
+    }
+
+    ret = dict_get_str_sizen(dict, GF_XATTR_PATHINFO_KEY, brickname);
+    if (ret) {
+        /* Report why the lookup failed: without this a caller could unwind
+         * an error with op_errno still 0.  The dict calls in this file treat
+         * a negative return as a negated errno. */
+        *op_errno = (ret < 0) ? -ret : EINVAL;
+        goto out;
+    }
+
+    /* the value still belongs to the dict, which is released below */
+    *brickname = gf_strdup(*brickname);
+    if (*brickname == NULL) {
+        *op_errno = ENOMEM;
+        ret = -1;
+        goto out;
+    }
+
+    ret = 0;
+out:
+    if (dict != NULL) {
+        dict_unref(dict);
+    }
+    loc_wipe(&loc);
+
+    return ret;
+}
 
+/* Compute the brick name used as lockinfo key and cache it in
+ * priv->brickname.
+ *
+ * The pathinfo string of @inode is fetched with fetch_pathinfo() and cut at
+ * its last ':'; the part before it is stored.  Returns 0 on success and a
+ * non-zero value on failure, in which case priv->brickname is left
+ * untouched.
+ */
 int
-pl_flush (call_frame_t *frame, xlator_t *this,
-	  fd_t *fd)
+pl_lockinfo_get_brickname(xlator_t *this, inode_t *inode, int32_t *op_errno)
+{
+    posix_locks_private_t *priv = this->private;
+    char *brickname = NULL;
+    char *end = NULL;
+    char *tmp = NULL;
+
+    int ret = fetch_pathinfo(this, inode, op_errno, &brickname);
+    if (ret)
+        goto out;
+
+    end = strrchr(brickname, ':');
+    if (!end) {
+        /* tmp is still NULL here, so the free at out: is a no-op */
+        GF_FREE(brickname);
+        ret = -1;
+        goto out;
+    }
+
+    /* keep the full string in tmp so that it is freed at out: */
+    tmp = brickname;
+    brickname = gf_strndup(brickname, (end - brickname));
+    if (brickname == NULL) {
+        ret = -1;
+        goto out;
+    }
+
+    /* NOTE(review): priv->brickname is written without any lock here */
+    priv->brickname = brickname;
+    ret = 0;
+out:
+    GF_FREE(tmp);
+    return ret;
+}
+
+/* Return the dict key under which this brick stores its lockinfo.
+ *
+ * The key is priv->brickname; it is computed on first use through
+ * pl_lockinfo_get_brickname().  The returned pointer is the cached string
+ * itself, not a copy.  Returns NULL when the brick name cannot be obtained.
+ */
+char *
+pl_lockinfo_key(xlator_t *this, inode_t *inode, int32_t *op_errno)
+{
+    posix_locks_private_t *priv = this->private;
+
+    if (priv->brickname != NULL)
+        return priv->brickname;
+
+    if (pl_lockinfo_get_brickname(this, inode, op_errno) < 0) {
+        gf_log(this->name, GF_LOG_WARNING, "cannot get brickname");
+        return NULL;
+    }
+
+    return priv->brickname;
+}
+
+/* Build the reply to a GF_XATTR_LOCKINFO_KEY fgetxattr on @fd.
+ *
+ * When at least one lock on the inode belongs to @fd, a temporary dict
+ * mapping this brick's lockinfo key to the fd number is serialized and
+ * stored in @dict under GF_XATTR_LOCKINFO_KEY.  When the fd holds no lock,
+ * @dict is left untouched and 0 is returned.
+ *
+ * Returns 0 on success, -1 on failure with *op_errno set by the failing
+ * step.
+ */
+int32_t
+pl_fgetxattr_handle_lockinfo(xlator_t *this, fd_t *fd, dict_t *dict,
+                             int32_t *op_errno)
 {
-	posix_locks_private_t *priv = NULL;
-	pl_inode_t            *pl_inode = NULL;
+    char *key = NULL, *buf = NULL;
+    int32_t op_ret = 0;
+    unsigned long fdnum = 0;
+    int32_t len = 0;
+    dict_t *tmp = NULL;
+
+    pl_inode_t *pl_inode = pl_inode_get(this, fd->inode, NULL);
+
+    if (!pl_inode) {
+        gf_log(this->name, GF_LOG_DEBUG, "Could not get inode.");
+        *op_errno = EBADFD;
+        op_ret = -1;
+        goto out;
+    }
+
+    /* nothing to report when this fd holds no lock */
+    if (!pl_locks_by_fd(pl_inode, fd)) {
+        op_ret = 0;
+        goto out;
+    }
+
+    fdnum = fd_to_fdnum(fd);
+
+    /* key points at the cached brick name and is not freed here */
+    key = pl_lockinfo_key(this, fd->inode, op_errno);
+    if (key == NULL) {
+        op_ret = -1;
+        goto out;
+    }
+
+    tmp = dict_new();
+    if (tmp == NULL) {
+        op_ret = -1;
+        *op_errno = ENOMEM;
+        goto out;
+    }
+
+    op_ret = dict_set_uint64(tmp, key, fdnum);
+    if (op_ret < 0) {
+        *op_errno = -op_ret;
+        op_ret = -1;
+        gf_log(this->name, GF_LOG_WARNING,
+               "setting lockinfo value "
+               "(%lu) for fd (ptr:%p inode-gfid:%s) failed (%s)",
+               fdnum, fd, uuid_utoa(fd->inode->gfid), strerror(*op_errno));
+        goto out;
+    }
+
+    op_ret = dict_allocate_and_serialize(tmp, (char **)&buf,
+                                         (unsigned int *)&len);
+    if (op_ret != 0) {
+        *op_errno = -op_ret;
+        op_ret = -1;
+        gf_log(this->name, GF_LOG_WARNING,
+               "dict_serialized_length failed (%s) while handling "
+               "lockinfo for fd (ptr:%p inode-gfid:%s)",
+               strerror(*op_errno), fd, uuid_utoa(fd->inode->gfid));
+        goto out;
+    }
+
+    op_ret = dict_set_dynptr(dict, GF_XATTR_LOCKINFO_KEY, buf, len);
+    if (op_ret < 0) {
+        *op_errno = -op_ret;
+        op_ret = -1;
+        gf_log(this->name, GF_LOG_WARNING,
+               "setting lockinfo value "
+               "(%lu) for fd (ptr:%p inode-gfid:%s) failed (%s)",
+               fdnum, fd, uuid_utoa(fd->inode->gfid), strerror(*op_errno));
+        goto out;
+    }
+
+    /* clear buf so the cleanup below does not free it; presumably @dict
+     * owns the buffer after dict_set_dynptr() - TODO confirm */
+    buf = NULL;
+out:
+    if (tmp != NULL) {
+        dict_unref(tmp);
+    }
 
-	priv = this->private;
+    if (buf != NULL) {
+        GF_FREE(buf);
+    }
 
-	pl_inode = pl_inode_get (this, fd->inode);
+    return op_ret;
+}
 
-	if (!pl_inode) {
-		gf_log (this->name, GF_LOG_ERROR, "returning EBADFD");
-		STACK_UNWIND (frame, -1, EBADFD);
-		return 0;
-	}
+/* fgetxattr fop entry point.
+ *
+ * Two virtual xattrs are answered locally, without contacting the child:
+ *   - GF_XATTR_LOCKINFO_KEY (exact match): lock information of @fd, built
+ *     by pl_fgetxattr_handle_lockinfo();
+ *   - names starting with GF_XATTR_CLRLK_CMD: clear-locks command, handled
+ *     by pl_getxattr_clrlk().
+ * Every other request (including a NULL @name) is wound to the first child.
+ */
+int32_t
+pl_fgetxattr(call_frame_t *frame, xlator_t *this, fd_t *fd, const char *name,
+             dict_t *xdata)
+{
+    int32_t op_ret = 0, op_errno = 0;
+    dict_t *dict = NULL;
+
+    if (!name) {
+        goto usual;
+    }
+
+    if (strcmp(name, GF_XATTR_LOCKINFO_KEY) == 0) {
+        dict = dict_new();
+        if (dict == NULL) {
+            op_ret = -1;
+            op_errno = ENOMEM;
+            goto unwind;
+        }
+
+        op_ret = pl_fgetxattr_handle_lockinfo(this, fd, dict, &op_errno);
+        if (op_ret < 0) {
+            gf_log(this->name, GF_LOG_WARNING,
+                   "getting lockinfo on fd (ptr:%p inode-gfid:%s) "
+                   "failed (%s)",
+                   fd, uuid_utoa(fd->inode->gfid), strerror(op_errno));
+        }
+
+        goto unwind;
+    } else if (strncmp(name, GF_XATTR_CLRLK_CMD, SLEN(GF_XATTR_CLRLK_CMD)) ==
+               0) {
+        op_ret = pl_getxattr_clrlk(this, name, fd->inode, &dict, &op_errno);
+
+        goto unwind;
+    } else {
+        goto usual;
+    }
 
-	pthread_mutex_lock (&pl_inode->mutex);
-	{
-		__delete_locks_of_owner (pl_inode, frame->root->trans,
-					 frame->root->pid);
-	}
-	pthread_mutex_unlock (&pl_inode->mutex);
+unwind:
+    /* locally answered requests carry no reply xdata */
+    STACK_UNWIND_STRICT(fgetxattr, frame, op_ret, op_errno, dict, NULL);
+    if (dict != NULL) {
+        dict_unref(dict);
+    }
 
-	grant_blocked_locks (this, pl_inode, GF_LOCK_POSIX);
-	grant_blocked_locks (this, pl_inode, GF_LOCK_INTERNAL);
+    return 0;
 
-	do_blocked_rw (pl_inode);
+usual:
+    STACK_WIND(frame, default_fgetxattr_cbk, FIRST_CHILD(this),
+               FIRST_CHILD(this)->fops->fgetxattr, fd, name, xdata);
+    return 0;
+}
 
-	STACK_WIND (frame, pl_flush_cbk, FIRST_CHILD(this),
-		    FIRST_CHILD(this)->fops->flush, fd);
-	return 0;
+/* Re-home every lock on the inode's ext_list that is recorded against
+ * @oldfd_num: stamp it with the fd number of @newfd and with the client of
+ * the calling frame. Returns 0 on success, or -1 with *op_errno = EBADFD
+ * when no pl_inode can be obtained for @newfd's inode. */
+int32_t
+pl_migrate_locks(call_frame_t *frame, fd_t *newfd, uint64_t oldfd_num,
+                 int32_t *op_errno)
+{
+    posix_lock_t *lock = NULL;
+    pl_inode_t *pl_inode = NULL;
+    uint64_t newfd_num = fd_to_fdnum(newfd);
+
+    pl_inode = pl_inode_get(frame->this, newfd->inode, NULL);
+    if (pl_inode == NULL) {
+        *op_errno = EBADFD;
+        return -1;
+    }
+
+    pthread_mutex_lock(&pl_inode->mutex);
+    {
+        list_for_each_entry(lock, &pl_inode->ext_list, list)
+        {
+            if (lock->fd_num != oldfd_num)
+                continue;
+            lock->fd_num = newfd_num;
+            lock->client = frame->root->client;
+        }
+    }
+    pthread_mutex_unlock(&pl_inode->mutex);
+    return 0;
 }
 
+/* Handle a lockinfo blob received through fsetxattr: unserialize it, read the
+ * old fd number stored under the key from pl_lockinfo_key() and, if non-zero,
+ * migrate that fd's locks to @fd. Returns 0 on success (also when there is
+ * nothing to migrate), or -1 with the error reported through *op_errno. */
+int32_t
+pl_fsetxattr_handle_lockinfo(call_frame_t *frame, fd_t *fd, char *lockinfo_buf,
+                             int len, int32_t *op_errno)
+{
+    int32_t op_ret = -1;
+    uint64_t oldfd_num = 0;
+    char *key = NULL;
+
+    dict_t *lockinfo = dict_new();
+    if (lockinfo == NULL) {
+        /* Nothing was allocated, so skip the dict_unref() at 'out'. */
+        *op_errno = ENOMEM;
+        return -1;
+    }
+
+    op_ret = dict_unserialize(lockinfo_buf, len, &lockinfo);
+    if (op_ret < 0) {
+        *op_errno = -op_ret;
+        op_ret = -1;
+        goto out;
+    }
+
+    key = pl_lockinfo_key(frame->this, fd->inode, op_errno);
+    if (key == NULL) {
+        op_ret = -1;
+        goto out;
+    }
+
+    /* A missing key or a zero fd number means there is nothing to migrate. */
+    if (dict_get_uint64(lockinfo, key, &oldfd_num) != 0 || oldfd_num == 0) {
+        op_ret = 0;
+        goto out;
+    }
+
+    op_ret = pl_migrate_locks(frame, fd, oldfd_num, op_errno);
+    if (op_ret < 0)
+        gf_log(frame->this->name, GF_LOG_WARNING,
+               "migration of locks from oldfd (ptr:%p) to newfd "
+               "(ptr:%p) (inode-gfid:%s) failed (%s)",
+               (void *)(uintptr_t)oldfd_num, fd, uuid_utoa(fd->inode->gfid),
+               strerror(*op_errno));
+out:
+    dict_unref(lockinfo);
+    return op_ret;
+}
+
+int32_t
+pl_fsetxattr_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+                 int32_t op_ret, int32_t op_errno, dict_t *xdata)
+{
+    pl_local_t *local = NULL;
+    pl_inode_t *pl_inode = NULL;
+
+    local = frame->local;
+    if (local && local->update_mlock_enforced_flag && op_ret != -1) {
+        pl_inode = pl_inode_get(this, local->inode, NULL);
+        if (!pl_inode) {
+            op_ret = -1;
+            op_errno = ENOMEM;
+            goto unwind;
+        }
+
+        pthread_mutex_lock(&pl_inode->mutex);
+        {
+            pl_inode->mlock_enforced = _gf_true;
+            pl_inode->check_mlock_info = _gf_false;
+        }
+        pthread_mutex_unlock(&pl_inode->mutex);
+    }
+
+unwind:
+    PL_STACK_UNWIND_FOR_CLIENT(fsetxattr, xdata, frame, op_ret, op_errno,
+                               xdata);
+    return 0;
+}
+
+int32_t
+pl_fsetxattr(call_frame_t *frame, xlator_t *this, fd_t *fd, dict_t *dict,
+             int32_t flags, dict_t *xdata)
+{
+    int32_t op_errno = 0;
+    void *lockinfo_buf = NULL;
+    int len = 0;
+    char *name = NULL;
+    posix_locks_private_t *priv = this->private;
+
+    int32_t op_ret = dict_get_ptr_and_len(dict, GF_XATTR_LOCKINFO_KEY,
+                                          &lockinfo_buf, &len);
+    if (lockinfo_buf == NULL) {
+        goto usual;
+    }
+
+    op_ret = pl_fsetxattr_handle_lockinfo(frame, fd, lockinfo_buf, len,
+                                          &op_errno);
+    if (op_ret < 0) {
+        goto unwind;
+    }
+
+usual:
+    PL_LOCAL_GET_REQUESTS(frame, this, xdata, fd, NULL, NULL);
+
+    PL_CHECK_LOCK_ENFORCE_KEY(frame, dict, name, this, ((loc_t *)NULL), fd,
+                              priv);
+
+    STACK_WIND(frame, pl_fsetxattr_cbk, FIRST_CHILD(this),
+               FIRST_CHILD(this)->fops->fsetxattr, fd, dict, flags, xdata);
+    return 0;
+
+unwind:
+    PL_STACK_UNWIND_FOR_CLIENT(fsetxattr, xdata, frame, op_ret, op_errno, NULL);
+
+    return 0;
+}
+
+int32_t
+pl_opendir_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+               int32_t op_ret, int32_t op_errno, fd_t *fd, dict_t *xdata)
+{
+    pl_fdctx_t *fdctx = NULL;
+
+    if (op_ret < 0)
+        goto unwind;
+
+    fdctx = pl_check_n_create_fdctx(this, fd);
+    if (!fdctx) {
+        op_errno = ENOMEM;
+        op_ret = -1;
+        goto unwind;
+    }
+
+unwind:
+    PL_STACK_UNWIND(opendir, xdata, frame, op_ret, op_errno, fd, xdata);
+
+    return 0;
+}
+
+int32_t
+pl_opendir(call_frame_t *frame, xlator_t *this, loc_t *loc, fd_t *fd,
+           dict_t *xdata)
+{
+    PL_LOCAL_GET_REQUESTS(frame, this, xdata, fd, NULL, NULL);
+    STACK_WIND(frame, pl_opendir_cbk, FIRST_CHILD(this),
+               FIRST_CHILD(this)->fops->opendir, loc, fd, xdata);
+    return 0;
+}
 
 int
-pl_open_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
-	     int32_t op_ret, int32_t op_errno, fd_t *fd)
+pl_flush_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret,
+             int32_t op_errno, dict_t *xdata)
 {
-	STACK_UNWIND (frame, op_ret, op_errno, fd);
+    PL_STACK_UNWIND_FOR_CLIENT(flush, xdata, frame, op_ret, op_errno, xdata);
 
-	return 0;
+    return 0;
 }
 
+int
+pl_flush(call_frame_t *frame, xlator_t *this, fd_t *fd, dict_t *xdata)
+{
+    pl_inode_t *pl_inode = pl_inode_get(this, fd->inode, NULL);
+    if (!pl_inode) {
+        gf_log(this->name, GF_LOG_DEBUG, "Could not get inode.");
+        STACK_UNWIND_STRICT(flush, frame, -1, EBADFD, NULL);
+        return 0;
+    }
+
+    pthread_mutex_lock(&pl_inode->mutex);
+    {
+        if (pl_inode->migrated) {
+            pthread_mutex_unlock(&pl_inode->mutex);
+            STACK_UNWIND_STRICT(flush, frame, -1, EREMOTE, NULL);
+            return 0;
+        }
+    }
+    pthread_mutex_unlock(&pl_inode->mutex);
+
+    pl_trace_flush(this, frame, fd);
+
+    if (frame->root->lk_owner.len == 0) {
+        /* Handle special case when protocol/server sets lk-owner to zero.
+         * This usually happens due to a client disconnection. Hence, free
+         * all locks opened with this fd.
+         */
+        gf_log(this->name, GF_LOG_TRACE, "Releasing all locks with fd %p", fd);
+        delete_locks_of_fd(this, pl_inode, fd);
+        goto wind;
+    }
+    pthread_mutex_lock(&pl_inode->mutex);
+    {
+        __delete_locks_of_owner(pl_inode, frame->root->client,
+                                &frame->root->lk_owner);
+    }
+    pthread_mutex_unlock(&pl_inode->mutex);
+
+    grant_blocked_locks(this, pl_inode);
+
+    do_blocked_rw(pl_inode);
+
+wind:
+    PL_LOCAL_GET_REQUESTS(frame, this, xdata, fd, NULL, NULL);
+    STACK_WIND(frame, pl_flush_cbk, FIRST_CHILD(this),
+               FIRST_CHILD(this)->fops->flush, fd, xdata);
+    return 0;
+}
 
 int
-pl_open (call_frame_t *frame, xlator_t *this,
-	 loc_t *loc, int32_t flags, fd_t *fd)
+pl_open_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret,
+            int32_t op_errno, fd_t *fd, dict_t *xdata)
 {
-	/* why isn't O_TRUNC being handled ? */
-	STACK_WIND (frame, pl_open_cbk, 
-		    FIRST_CHILD(this), FIRST_CHILD(this)->fops->open, 
-		    loc, flags & ~O_TRUNC, fd);
+    pl_fdctx_t *fdctx = NULL;
 
-	return 0;
+    if (op_ret < 0)
+        goto unwind;
+
+    fdctx = pl_check_n_create_fdctx(this, fd);
+    if (!fdctx) {
+        op_errno = ENOMEM;
+        op_ret = -1;
+        goto unwind;
+    }
+
+unwind:
+    STACK_UNWIND_STRICT(open, frame, op_ret, op_errno, fd, xdata);
+
+    return 0;
 }
 
+/* open fop: fail a truncating open with EAGAIN while conflicting locks are
+ * held on the inode (which locks conflict depends on the mandatory-locking
+ * mode); otherwise wind the open to the first child. Always returns 0. */
+int
+pl_open(call_frame_t *frame, xlator_t *this, loc_t *loc, int32_t flags,
+        fd_t *fd, dict_t *xdata)
+{
+    int op_ret = -1;
+    int op_errno = EINVAL;
+    pl_inode_t *pl_inode = NULL;
+    posix_lock_t *l = NULL;
+    posix_locks_private_t *priv = NULL;
+
+    GF_VALIDATE_OR_GOTO("locks", this, unwind);
+    priv = this->private; /* read only after 'this' has been validated */
+    op_ret = 0, op_errno = 0;
+    pl_inode = pl_inode_get(this, fd->inode, NULL);
+    if (!pl_inode) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, ENOMEM, "Could not get inode");
+        op_ret = -1;
+        op_errno = ENOMEM;
+        goto unwind;
+    }
+
+    /* Only a truncating open can conflict with held locks. Under the forced
+     * and file-based mandatory locking modes any lock on the inode, advisory
+     * or mandatory, fails the open, so checking for a non-empty lock list is
+     * enough. Under the optimal mode the open fails only if the lock list
+     * contains a mandatory lock. */
+    if (!(fd->flags & O_TRUNC))
+        goto unwind;
+
+    if (((priv->mandatory_mode == MLK_FILE_BASED) && pl_inode->mandatory) ||
+        priv->mandatory_mode == MLK_FORCED) {
+        pthread_mutex_lock(&pl_inode->mutex);
+        {
+            if (!list_empty(&pl_inode->ext_list)) {
+                op_ret = -1;
+                op_errno = EAGAIN;
+            }
+        }
+        pthread_mutex_unlock(&pl_inode->mutex);
+    } else if (priv->mandatory_mode == MLK_OPTIMAL) {
+        pthread_mutex_lock(&pl_inode->mutex);
+        {
+            list_for_each_entry(l, &pl_inode->ext_list, list)
+            {
+                if ((l->lk_flags & GF_LK_MANDATORY)) {
+                    op_ret = -1;
+                    op_errno = EAGAIN;
+                    break;
+                }
+            }
+        }
+        pthread_mutex_unlock(&pl_inode->mutex);
+    }
+
+unwind:
+    if (op_ret == -1)
+        STACK_UNWIND_STRICT(open, frame, op_ret, op_errno, NULL, NULL);
+    else
+        STACK_WIND(frame, pl_open_cbk, FIRST_CHILD(this),
+                   FIRST_CHILD(this)->fops->open, loc, flags, fd, xdata);
+    return 0;
+}
 
 int
-pl_create_cbk (call_frame_t *frame, void *cookie,
-	       xlator_t *this, int32_t op_ret, int32_t op_errno,
-	       fd_t *fd, inode_t *inode, struct stat *buf)
+pl_create_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret,
+              int32_t op_errno, fd_t *fd, inode_t *inode, struct iatt *buf,
+              struct iatt *preparent, struct iatt *postparent, dict_t *xdata)
 {
-	STACK_UNWIND (frame, op_ret, op_errno, fd, inode, buf);
+    pl_fdctx_t *fdctx = NULL;
+
+    if (op_ret < 0)
+        goto unwind;
+
+    fdctx = pl_check_n_create_fdctx(this, fd);
+    if (!fdctx) {
+        op_errno = ENOMEM;
+        op_ret = -1;
+        goto unwind;
+    }
 
-	return 0;
+unwind:
+    PL_STACK_UNWIND(create, xdata, frame, op_ret, op_errno, fd, inode, buf,
+                    preparent, postparent, xdata);
+
+    return 0;
 }
 
+int
+pl_create(call_frame_t *frame, xlator_t *this, loc_t *loc, int32_t flags,
+          mode_t mode, mode_t umask, fd_t *fd, dict_t *xdata)
+{
+    PL_LOCAL_GET_REQUESTS(frame, this, xdata, fd, NULL, NULL);
+
+    STACK_WIND(frame, pl_create_cbk, FIRST_CHILD(this),
+               FIRST_CHILD(this)->fops->create, loc, flags, mode, umask, fd,
+               xdata);
+    return 0;
+}
 
 int
-pl_create (call_frame_t *frame, xlator_t *this,
-	   loc_t *loc, int32_t flags, mode_t mode, fd_t *fd)
+pl_readv_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret,
+             int32_t op_errno, struct iovec *vector, int32_t count,
+             struct iatt *stbuf, struct iobref *iobref, dict_t *xdata)
 {
-	STACK_WIND (frame, pl_create_cbk,
-		    FIRST_CHILD (this), FIRST_CHILD (this)->fops->create, 
-		    loc, flags, mode, fd);
-	return 0;
+    pl_track_io_fop_count(frame->local, this, DECREMENT);
+
+    PL_STACK_UNWIND(readv, xdata, frame, op_ret, op_errno, vector, count, stbuf,
+                    iobref, xdata);
+
+    return 0;
 }
 
+int
+pl_writev_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret,
+              int32_t op_errno, struct iatt *prebuf, struct iatt *postbuf,
+              dict_t *xdata)
+{
+    pl_track_io_fop_count(frame->local, this, DECREMENT);
+
+    PL_STACK_UNWIND(writev, xdata, frame, op_ret, op_errno, prebuf, postbuf,
+                    xdata);
+
+    return 0;
+}
+
+void
+do_blocked_rw(pl_inode_t *pl_inode)
+{
+    struct list_head wind_list;
+    pl_rw_req_t *rw = NULL;
+    pl_rw_req_t *tmp = NULL;
+
+    INIT_LIST_HEAD(&wind_list);
+
+    pthread_mutex_lock(&pl_inode->mutex);
+    {
+        list_for_each_entry_safe(rw, tmp, &pl_inode->rw_list, list)
+        {
+            if (__rw_allowable(pl_inode, &rw->region, rw->stub->fop)) {
+                list_del_init(&rw->list);
+                list_add_tail(&rw->list, &wind_list);
+                if (pl_inode->mlock_enforced &&
+                    pl_inode->track_fop_wind_count) {
+                    pl_inode->fop_wind_count++;
+                }
+            }
+        }
+    }
+    pthread_mutex_unlock(&pl_inode->mutex);
+
+    list_for_each_entry_safe(rw, tmp, &wind_list, list)
+    {
+        list_del_init(&rw->list);
+        call_resume(rw->stub);
+        GF_FREE(rw);
+    }
+
+    return;
+}
+
+/* Return _gf_true when the region @new lies entirely within @existing.
+ *
+ * When a mandatory lock is enforced, an I/O request on a region outside the
+ * boundary of the granted mandatory lock is rejected. Such I/O is never
+ * blocked, as it may be stale data from an old client.
+ */
+static gf_boolean_t
+within_range(posix_lock_t *existing, posix_lock_t *new)
+{
+    if (existing->fl_start <= new->fl_start && existing->fl_end >= new->fl_end)
+        return _gf_true;
+
+    return _gf_false;
+}
+
+static int
+__rw_allowable(pl_inode_t *pl_inode, posix_lock_t *region, glusterfs_fop_t op)
+{
+    posix_lock_t *l = NULL;
+    posix_locks_private_t *priv = THIS->private;
+    int ret = 1;
+
+    if (pl_inode->mlock_enforced) {
+        list_for_each_entry(l, &pl_inode->ext_list, list)
+        {
+            /*
+                with lock enforced (fencing) there should not be any blocking
+                lock coexisting.
+            */
+            if (same_owner(l, region)) {
+                /* Should range check be strict for same owner with fencing? */
+                if (locks_overlap(l, region)) {
+                    if (within_range(l, region)) {
+                        return 1;
+                    } else {
+                        /*
+                        Should we allow a read fop if it does not fit within
+                        the range?
+                        if (op == GF_FOP_READ && l->fl_type != F_WRLCK) {
+                            return 1;
+                        }
+                        */
+                        return 0;
+                    }
+                }
+            } else {
+                if (locks_overlap(l, region)) {
+                    /*
+                    with fencing should a read from a different owner be
+                    allowed if the mandatory lock taken is F_RDLCK?
+                    if (op == GF_FOP_READ && l->fl_type != F_WRLCK) {
+                        return 1;
+                    }
+                    */
+                    return 0;
+                }
+            }
+        }
+
+        /* No lock has been taken by this owner */
+        return 0;
+    }
+
+    list_for_each_entry(l, &pl_inode->ext_list, list)
+    {
+        if (!l->blocked && locks_overlap(l, region) && !same_owner(l, region)) {
+            if ((op == GF_FOP_READ) && (l->fl_type != F_WRLCK))
+                continue;
+            /* Check for mandatory lock under optimal
+             * mandatory-locking mode */
+            if (priv->mandatory_mode == MLK_OPTIMAL &&
+                !(l->lk_flags & GF_LK_MANDATORY))
+                continue;
+            ret = 0;
+            break;
+        }
+    }
+
+    return ret;
+}
 
 int
-pl_readv_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
-	      int32_t op_ret, int32_t op_errno,
-	      struct iovec *vector, int32_t count, struct stat *stbuf)
+pl_readv_cont(call_frame_t *frame, xlator_t *this, fd_t *fd, size_t size,
+              off_t offset, uint32_t flags, dict_t *xdata)
 {
-	STACK_UNWIND (frame, op_ret, op_errno, vector, count, stbuf);
+    pl_track_io_fop_count(frame->local, this, INCREMENT);
+
+    STACK_WIND(frame, pl_readv_cbk, FIRST_CHILD(this),
+               FIRST_CHILD(this)->fops->readv, fd, size, offset, flags, xdata);
 
-	return 0;
+    return 0;
 }
 
 int
-pl_writev_cbk (call_frame_t *frame, void *cookie, xlator_t *this, 
-	       int32_t op_ret, int32_t op_errno, struct stat *stbuf)
+pl_readv(call_frame_t *frame, xlator_t *this, fd_t *fd, size_t size,
+         off_t offset, uint32_t flags, dict_t *xdata)
 {
-	STACK_UNWIND (frame, op_ret, op_errno, stbuf);
+    pl_local_t *local = NULL;
+    pl_inode_t *pl_inode = NULL;
+    pl_rw_req_t *rw = NULL;
+    posix_lock_t region = {
+        .list =
+            {
+                0,
+            },
+    };
+    gf_boolean_t enabled = _gf_false;
+    gf_boolean_t can_block = _gf_true;
+    int op_ret = 0;
+    int op_errno = 0;
+    int allowed = 1;
+
+    GF_VALIDATE_OR_GOTO("locks", this, unwind);
+
+    PL_LOCAL_GET_REQUESTS(frame, this, xdata, fd, NULL, NULL);
+
+    if (!frame->local) {
+        frame->local = mem_get0(this->local_pool);
+        local = frame->local;
+        local->inode = inode_ref(fd->inode);
+        local->fd = fd_ref(fd);
+    }
+
+    pl_inode = pl_inode_get(this, fd->inode, local);
+    if (!pl_inode) {
+        op_ret = -1;
+        op_errno = ENOMEM;
+        goto unwind;
+    }
+
+    if (frame->root->pid < 0)
+        enabled = _gf_false;
+    else
+        enabled = pl_is_mandatory_locking_enabled(pl_inode);
+
+    if (enabled) {
+        region.fl_start = offset;
+        region.fl_end = offset + size - 1;
+        region.client = frame->root->client;
+        region.fd_num = fd_to_fdnum(fd);
+        region.client_pid = frame->root->pid;
+        region.owner = frame->root->lk_owner;
+
+        pthread_mutex_lock(&pl_inode->mutex);
+        {
+            allowed = pl_is_fop_allowed(pl_inode, &region, fd, GF_FOP_READ,
+                                        &can_block);
+            if (allowed == 1) {
+                if (pl_inode->mlock_enforced &&
+                    pl_inode->track_fop_wind_count) {
+                    pl_inode->fop_wind_count++;
+                }
+                goto unlock;
+            } else if (!can_block) {
+                op_errno = EAGAIN;
+                op_ret = -1;
+                goto unlock;
+            }
+
+            rw = GF_MALLOC(sizeof(*rw), gf_locks_mt_pl_rw_req_t);
+            if (!rw) {
+                op_errno = ENOMEM;
+                op_ret = -1;
+                goto unlock;
+            }
+
+            rw->stub = fop_readv_stub(frame, pl_readv_cont, fd, size, offset,
+                                      flags, xdata);
+            if (!rw->stub) {
+                op_errno = ENOMEM;
+                op_ret = -1;
+                GF_FREE(rw);
+                goto unlock;
+            }
+
+            rw->region = region;
+
+            list_add_tail(&rw->list, &pl_inode->rw_list);
+        }
+    unlock:
+        pthread_mutex_unlock(&pl_inode->mutex);
+    }
+
+    if (allowed == 1) {
+        STACK_WIND(frame, pl_readv_cbk, FIRST_CHILD(this),
+                   FIRST_CHILD(this)->fops->readv, fd, size, offset, flags,
+                   xdata);
+    }
+unwind:
+    if (op_ret == -1)
+        PL_STACK_UNWIND(readv, xdata, frame, op_ret, op_errno, NULL, 0, NULL,
+                        NULL, NULL);
 
-	return 0;
+    return 0;
 }
 
+int
+pl_writev_cont(call_frame_t *frame, xlator_t *this, fd_t *fd,
+               struct iovec *vector, int count, off_t offset, uint32_t flags,
+               struct iobref *iobref, dict_t *xdata)
+{
+    pl_track_io_fop_count(frame->local, this, INCREMENT);
+
+    STACK_WIND(frame, pl_writev_cbk, FIRST_CHILD(this),
+               FIRST_CHILD(this)->fops->writev, fd, vector, count, offset,
+               flags, iobref, xdata);
 
-void
-do_blocked_rw (pl_inode_t *pl_inode)
+    return 0;
+}
+
+int
+pl_writev(call_frame_t *frame, xlator_t *this, fd_t *fd, struct iovec *vector,
+          int32_t count, off_t offset, uint32_t flags, struct iobref *iobref,
+          dict_t *xdata)
 {
-	struct list_head  wind_list;
-	pl_rw_req_t      *rw = NULL;
-	pl_rw_req_t      *tmp = NULL;
+    pl_local_t *local = NULL;
+    pl_inode_t *pl_inode = NULL;
+    pl_rw_req_t *rw = NULL;
+    posix_lock_t region = {
+        .list =
+            {
+                0,
+            },
+    };
+    gf_boolean_t enabled = _gf_false;
+    gf_boolean_t can_block = _gf_true;
+    int op_ret = 0;
+    int op_errno = 0;
+    int allowed = 1;
+
+    GF_VALIDATE_OR_GOTO("locks", this, unwind);
+
+    PL_LOCAL_GET_REQUESTS(frame, this, xdata, fd, NULL, NULL);
+
+    if (!frame->local) {
+        frame->local = mem_get0(this->local_pool);
+        local = frame->local;
+        local->inode = inode_ref(fd->inode);
+        local->fd = fd_ref(fd);
+    }
+
+    pl_inode = pl_inode_get(this, fd->inode, local);
+    if (!pl_inode) {
+        op_ret = -1;
+        op_errno = ENOMEM;
+        goto unwind;
+    }
+
+    if (frame->root->pid < 0)
+        enabled = _gf_false;
+    else
+        enabled = pl_is_mandatory_locking_enabled(pl_inode);
+
+    if (enabled) {
+        region.fl_start = offset;
+        region.fl_end = offset + iov_length(vector, count) - 1;
+        region.client = frame->root->client;
+        region.fd_num = fd_to_fdnum(fd);
+        region.client_pid = frame->root->pid;
+        region.owner = frame->root->lk_owner;
+
+        pthread_mutex_lock(&pl_inode->mutex);
+        {
+            allowed = pl_is_fop_allowed(pl_inode, &region, fd, GF_FOP_WRITE,
+                                        &can_block);
+            if (allowed == 1) {
+                if (pl_inode->mlock_enforced &&
+                    pl_inode->track_fop_wind_count) {
+                    pl_inode->fop_wind_count++;
+                }
+                goto unlock;
+            } else if (!can_block) {
+                if (pl_inode->mlock_enforced) {
+                    op_errno = EBUSY;
+                } else {
+                    op_errno = EAGAIN;
+                }
+
+                op_ret = -1;
+                goto unlock;
+            }
+
+            rw = GF_MALLOC(sizeof(*rw), gf_locks_mt_pl_rw_req_t);
+            if (!rw) {
+                op_errno = ENOMEM;
+                op_ret = -1;
+                goto unlock;
+            }
+
+            rw->stub = fop_writev_stub(frame, pl_writev_cont, fd, vector, count,
+                                       offset, flags, iobref, xdata);
+            if (!rw->stub) {
+                op_errno = ENOMEM;
+                op_ret = -1;
+                GF_FREE(rw);
+                goto unlock;
+            }
+
+            rw->region = region;
+
+            list_add_tail(&rw->list, &pl_inode->rw_list);
+        }
+    unlock:
+        pthread_mutex_unlock(&pl_inode->mutex);
+    }
+
+    if (allowed == 1) {
+        STACK_WIND(frame, pl_writev_cbk, FIRST_CHILD(this),
+                   FIRST_CHILD(this)->fops->writev, fd, vector, count, offset,
+                   flags, iobref, xdata);
+    }
+unwind:
+    if (op_ret == -1)
+        PL_STACK_UNWIND(writev, xdata, frame, op_ret, op_errno, NULL, NULL,
+                        NULL);
 
-	INIT_LIST_HEAD (&wind_list);
+    return 0;
+}
 
-	pthread_mutex_lock (&pl_inode->mutex);
-	{
-		list_for_each_entry_safe (rw, tmp, &pl_inode->rw_list, list) {
-			if (__rw_allowable (pl_inode, &rw->region,
-					    rw->stub->fop)) {
-				list_del_init (&rw->list);
-				list_add_tail (&rw->list, &wind_list);
-			}
-		}
-	}
-	pthread_mutex_unlock (&pl_inode->mutex);
+static int
+__fd_has_locks(pl_inode_t *pl_inode, fd_t *fd)
+{
+    posix_lock_t *l = NULL;
 
-	list_for_each_entry_safe (rw, tmp, &wind_list, list) {
-		list_del_init (&rw->list);
-		call_resume (rw->stub);
-		free (rw);
-	}
+    list_for_each_entry(l, &pl_inode->ext_list, list)
+    {
+        if (l->fd_num == fd_to_fdnum(fd)) {
+            return 1;
+        }
+    }
 
-	return;
+    return 0;
 }
 
+static posix_lock_t *
+lock_dup(posix_lock_t *lock)
+{
+    int32_t op_errno = 0;
+    return new_posix_lock(&lock->user_flock, lock->client, lock->client_pid,
+                          &lock->owner, (fd_t *)lock->fd_num, lock->lk_flags,
+                          lock->blocking, &op_errno);
+}
 
 static int
-__rw_allowable (pl_inode_t *pl_inode, posix_lock_t *region,
-		glusterfs_fop_t op)
+__dup_locks_to_fdctx(pl_inode_t *pl_inode, fd_t *fd, pl_fdctx_t *fdctx)
 {
-	posix_lock_t *l = NULL;
-	int           ret = 1;
+    posix_lock_t *l = NULL;
+    posix_lock_t *duplock = NULL;
+    int ret = 0;
+
+    list_for_each_entry(l, &pl_inode->ext_list, list)
+    {
+        if (l->fd_num == fd_to_fdnum(fd)) {
+            duplock = lock_dup(l);
+            if (!duplock) {
+                ret = -1;
+                break;
+            }
+
+            list_add_tail(&duplock->list, &fdctx->locks_list);
+        }
+    }
+
+    return ret;
+}
 
-	list_for_each_entry (l, &pl_inode->ext_list, list) {
-		if (locks_overlap (l, region) && !same_owner (l, region)) {
-			if ((op == GF_FOP_READ) && (l->fl_type != F_WRLCK))
-				continue;
-			ret = 0;
-			break;
-		}
-	}
+static int
+__copy_locks_to_fdctx(pl_inode_t *pl_inode, fd_t *fd, pl_fdctx_t *fdctx)
+{
+    return __dup_locks_to_fdctx(pl_inode, fd, fdctx);
+}
 
-	return ret;
+static void
+pl_mark_eol_lock(posix_lock_t *lock)
+{
+    lock->user_flock.l_type = GF_LK_EOL;
+    return;
 }
 
+static posix_lock_t *
+__get_next_fdctx_lock(pl_fdctx_t *fdctx)
+{
+    posix_lock_t *lock = NULL;
+
+    GF_ASSERT(fdctx);
+
+    if (list_empty(&fdctx->locks_list)) {
+        gf_log(THIS->name, GF_LOG_DEBUG, "fdctx lock list empty");
+        goto out;
+    }
+
+    lock = list_entry(fdctx->locks_list.next, typeof(*lock), list);
+
+    GF_ASSERT(lock);
+
+    list_del_init(&lock->list);
+
+out:
+    return lock;
+}
+
+/* Pop the next lock off fdctx->locks_list and copy its flock, range, type
+ * and owner into @reqlock, then destroy the popped entry. When the list is
+ * exhausted, @reqlock is marked end-of-list instead. Always returns 0. */
+static int
+__set_next_lock_fd(pl_fdctx_t *fdctx, posix_lock_t *reqlock)
+{
+    posix_lock_t *next = NULL;
+
+    GF_ASSERT(fdctx);
+
+    next = __get_next_fdctx_lock(fdctx);
+    if (next == NULL) {
+        gf_log(THIS->name, GF_LOG_DEBUG, "marking EOL in reqlock");
+        pl_mark_eol_lock(reqlock);
+        return 0;
+    }
+
+    reqlock->user_flock = next->user_flock;
+    reqlock->fl_start = next->fl_start;
+    reqlock->fl_type = next->fl_type;
+    reqlock->fl_end = next->fl_end;
+    reqlock->owner = next->owner;
+
+    __destroy_lock(next);
+
+    return 0;
+}
+
+/* Hand back, through @reqlock, the next lock recorded for @fd. The fd
+ * context's locks_list is refilled from the inode's ext_list whenever it is
+ * empty, and __set_next_lock_fd() consumes one entry from it per call.
+ * Returns 0 on success (also when @fd holds no locks), non-zero on failure. */
+static int
+pl_getlk_fd(xlator_t *this, pl_inode_t *pl_inode, fd_t *fd,
+            posix_lock_t *reqlock)
+{
+    uint64_t tmp = 0;
+    pl_fdctx_t *fdctx = NULL;
+    int ret = 0;
+
+    pthread_mutex_lock(&pl_inode->mutex);
+    {
+        if (!__fd_has_locks(pl_inode, fd)) {
+            gf_log(this->name, GF_LOG_DEBUG, "fd=%p has no active locks", fd);
+            goto unlock;
+        }
+
+        gf_log(this->name, GF_LOG_DEBUG, "There are active locks on fd");
+
+        /* Bail out if the fd carries no context: dereferencing a NULL
+         * fdctx in list_empty() below would crash. */
+        ret = fd_ctx_get(fd, this, &tmp);
+        fdctx = (pl_fdctx_t *)(long)tmp;
+        if (ret < 0 || fdctx == NULL) {
+            gf_log(this->name, GF_LOG_DEBUG, "no fd context set on fd=%p", fd);
+            ret = -1;
+            goto unlock;
+        }
+
+        if (list_empty(&fdctx->locks_list)) {
+            gf_log(this->name, GF_LOG_TRACE,
+                   "no fdctx -> copying all locks on fd");
+
+            ret = __copy_locks_to_fdctx(pl_inode, fd, fdctx);
+            if (ret)
+                goto unlock;
+        } else {
+            gf_log(this->name, GF_LOG_TRACE,
+                   "fdctx present -> returning the next lock");
+        }
+
+        ret = __set_next_lock_fd(fdctx, reqlock);
+        if (ret)
+            gf_log(this->name, GF_LOG_DEBUG, "could not get next lock of fd");
+    }
+unlock:
+    pthread_mutex_unlock(&pl_inode->mutex);
+    return ret;
+}
+
+/* Report whether the inode's metalk_list holds any entry: returns 1 when
+ * pl_inode->metalk_list is non-empty and 0 when it is empty. */
+int
+pl_metalock_is_active(pl_inode_t *pl_inode)
+{
+    int idle = list_empty(&pl_inode->metalk_list);
+    return idle ? 0 : 1;
+}
+
+void
+__pl_queue_lock(pl_inode_t *pl_inode, posix_lock_t *reqlock)
+{
+    list_add_tail(&reqlock->list, &pl_inode->queued_locks);
+}
 
 int
-pl_readv_cont (call_frame_t *frame, xlator_t *this,
-	       fd_t *fd, size_t size, off_t offset)
+pl_lk(call_frame_t *frame, xlator_t *this, fd_t *fd, int32_t cmd,
+      struct gf_flock *flock, dict_t *xdata)
 {
-	STACK_WIND (frame, pl_readv_cbk,
-		    FIRST_CHILD (this), FIRST_CHILD (this)->fops->readv,
-		    fd, size, offset);
+    pl_inode_t *pl_inode = NULL;
+    int op_ret = 0;
+    int op_errno = 0;
+    int can_block = 0;
+    posix_lock_t *reqlock = NULL;
+    posix_lock_t *conf = NULL;
+    uint32_t lk_flags = 0;
+    posix_locks_private_t *priv = this->private;
+    pl_local_t *local = NULL;
+    short lock_type = 0;
+
+    int ret = dict_get_uint32(xdata, GF_LOCK_MODE, &lk_flags);
+    if (ret == 0) {
+        if (priv->mandatory_mode == MLK_NONE)
+            gf_log(this->name, GF_LOG_DEBUG,
+                   "Lock flags received "
+                   "in a non-mandatory locking environment, "
+                   "continuing");
+        else
+            gf_log(this->name, GF_LOG_DEBUG,
+                   "Lock flags received, "
+                   "continuing");
+    }
+
+    if ((flock->l_start < 0) || ((flock->l_start + flock->l_len) < 0)) {
+        op_ret = -1;
+        op_errno = EINVAL;
+        goto unwind;
+    }
+
+    /* As per 'man 3 fcntl', the value of l_len may be
+     * negative. In such cases, lock request should be
+     * considered for the range starting at 'l_start+l_len'
+     * and ending at 'l_start-1'. Update the fields accordingly.
+     */
+    if (flock->l_len < 0) {
+        flock->l_start += flock->l_len;
+        flock->l_len = labs(flock->l_len);
+    }
+
+    local = mem_get0(this->local_pool);
+    if (!local) {
+        op_ret = -1;
+        op_errno = ENOMEM;
+        goto unwind;
+    } else {
+        frame->local = local;
+        local->fd = fd_ref(fd);
+    }
+
+    pl_inode = pl_inode_get(this, fd->inode, local);
+    if (!pl_inode) {
+        op_ret = -1;
+        op_errno = ENOMEM;
+        goto unwind;
+    }
+
+    reqlock = new_posix_lock(flock, frame->root->client, frame->root->pid,
+                             &frame->root->lk_owner, fd, lk_flags, can_block,
+                             &op_errno);
+
+    if (!reqlock) {
+        op_ret = -1;
+        goto unwind;
+    }
+
+    pl_trace_in(this, frame, fd, NULL, cmd, flock, NULL);
+
+    switch (cmd) {
+        case F_RESLK_LCKW:
+            can_block = 1;
+
+            /* fall through */
+        case F_RESLK_LCK:
+            reqlock->frame = frame;
+            reqlock->this = this;
+
+            ret = pl_reserve_setlk(this, pl_inode, reqlock, can_block);
+            if (ret < 0) {
+                if (can_block)
+                    goto out;
+
+                op_ret = -1;
+                op_errno = -ret;
+                __destroy_lock(reqlock);
+                goto unwind;
+            }
+            /* Finally a getlk and return the call */
+            conf = pl_getlk(pl_inode, reqlock);
+            if (conf)
+                posix_lock_to_flock(conf, flock);
+            break;
+
+        case F_RESLK_UNLCK:
+            reqlock->frame = frame;
+            reqlock->this = this;
+            ret = pl_reserve_unlock(this, pl_inode, reqlock);
+            if (ret < 0) {
+                op_ret = -1;
+                op_errno = -ret;
+            }
+            __destroy_lock(reqlock);
+            goto unwind;
+
+            break;
+
+        case F_GETLK_FD:
+            reqlock->frame = frame;
+            reqlock->this = this;
+            ret = pl_verify_reservelk(this, pl_inode, reqlock, can_block);
+            GF_ASSERT(ret >= 0);
+
+            ret = pl_getlk_fd(this, pl_inode, fd, reqlock);
+            if (ret < 0) {
+                gf_log(this->name, GF_LOG_DEBUG, "getting locks on fd failed");
+                op_ret = -1;
+                op_errno = ENOLCK;
+                goto unwind;
+            }
+
+            gf_log(this->name, GF_LOG_TRACE,
+                   "Replying with a lock on fd for healing");
+
+            posix_lock_to_flock(reqlock, flock);
+            __destroy_lock(reqlock);
+
+            break;
+
+#if F_GETLK != F_GETLK64
+        case F_GETLK64:
+#endif
+        case F_GETLK:
+            conf = pl_getlk(pl_inode, reqlock);
+            posix_lock_to_flock(conf, flock);
+            __destroy_lock(reqlock);
+
+            break;
+
+#if F_SETLKW != F_SETLKW64
+        case F_SETLKW64:
+#endif
+        case F_SETLKW:
+            can_block = 1;
+            reqlock->frame = frame;
+            reqlock->this = this;
+            reqlock->blocking = can_block;
+            /* fall through */
+
+#if F_SETLK != F_SETLK64
+        case F_SETLK64:
+#endif
+        case F_SETLK:
+            reqlock->frame = frame;
+            reqlock->this = this;
+            lock_type = flock->l_type;
+
+            pthread_mutex_lock(&pl_inode->mutex);
+            {
+                if (pl_inode->migrated) {
+                    op_errno = EREMOTE;
+                    pthread_mutex_unlock(&pl_inode->mutex);
+                    STACK_UNWIND_STRICT(lk, frame, -1, op_errno, flock, xdata);
+
+                    __destroy_lock(reqlock);
+                    goto out;
+                }
+            }
+            pthread_mutex_unlock(&pl_inode->mutex);
+
+            ret = pl_verify_reservelk(this, pl_inode, reqlock, can_block);
+            if (ret < 0) {
+                gf_log(this->name, GF_LOG_TRACE,
+                       "Lock blocked due to conflicting reserve lock");
+                goto out;
+            }
+
+            if (reqlock->fl_type != F_UNLCK && pl_inode->mlock_enforced) {
+                ret = pl_lock_preempt(pl_inode, reqlock);
+                if (ret == -1) {
+                    gf_log(this->name, GF_LOG_ERROR, "lock preempt failed");
+                    op_ret = -1;
+                    op_errno = EAGAIN;
+                    __destroy_lock(reqlock);
+                    goto out;
+                }
+
+                pl_trace_block(this, frame, fd, NULL, cmd, flock, NULL);
+                goto unwind;
+            }
+
+            ret = pl_setlk(this, pl_inode, reqlock, can_block);
+            if (ret == -1) {
+                if ((can_block) && (F_UNLCK != lock_type)) {
+                    goto out;
+                }
+                gf_log(this->name, GF_LOG_DEBUG, "returning EAGAIN");
+                op_ret = -1;
+                op_errno = EAGAIN;
+                __destroy_lock(reqlock);
+            } else if (ret == -2) {
+                goto out;
+            } else if ((0 == ret) && (F_UNLCK == flock->l_type)) {
+                /* For NLM's last "unlock on fd" detection */
+                if (pl_locks_by_fd(pl_inode, fd))
+                    flock->l_type = F_RDLCK;
+                else
+                    flock->l_type = F_UNLCK;
+            }
+    }
+
+unwind:
+    pl_trace_out(this, frame, fd, NULL, cmd, flock, op_ret, op_errno, NULL);
+    pl_update_refkeeper(this, fd->inode);
 
-	return 0;
+    PL_STACK_UNWIND(lk, xdata, frame, op_ret, op_errno, flock, xdata);
+out:
+    return 0;
+}
+
+/* Forget callback: releases all locks/requests on the inode, frees pl_inode */
+int
+pl_forget(xlator_t *this, inode_t *inode)
+{
+    pl_inode_t *pl_inode = NULL;
+
+    posix_lock_t *ext_tmp = NULL;
+    posix_lock_t *ext_l = NULL;
+    struct list_head posixlks_released;
+
+    pl_inode_lock_t *ino_tmp = NULL;
+    pl_inode_lock_t *ino_l = NULL;
+    struct list_head inodelks_released;
+
+    pl_rw_req_t *rw_tmp = NULL;
+    pl_rw_req_t *rw_req = NULL;
+
+    pl_entry_lock_t *entry_tmp = NULL;
+    pl_entry_lock_t *entry_l = NULL;
+    struct list_head entrylks_released;
+
+    pl_dom_list_t *dom = NULL;
+    pl_dom_list_t *dom_tmp = NULL;
+    /* Blocked locks are parked on these lists and unwound after the unlock. */
+    INIT_LIST_HEAD(&posixlks_released);
+    INIT_LIST_HEAD(&inodelks_released);
+    INIT_LIST_HEAD(&entrylks_released);
+
+    pl_inode = pl_inode_get(this, inode, NULL);
+    if (!pl_inode)
+        return 0;
+
+    pthread_mutex_lock(&pl_inode->mutex);
+    {
+        if (!list_empty(&pl_inode->rw_list)) {
+            gf_log(this->name, GF_LOG_WARNING,
+                   "Pending R/W requests found, releasing.");
+
+            list_for_each_entry_safe(rw_req, rw_tmp, &pl_inode->rw_list, list)
+            {
+                list_del(&rw_req->list);
+                call_stub_destroy(rw_req->stub);
+                GF_FREE(rw_req);
+            }
+        }
+
+        if (!list_empty(&pl_inode->ext_list)) {
+            gf_log(this->name, GF_LOG_WARNING,
+                   "Pending fcntl locks found, releasing.");
+
+            list_for_each_entry_safe(ext_l, ext_tmp, &pl_inode->ext_list, list)
+            {
+                __delete_lock(ext_l);
+                if (ext_l->blocked) {
+                    list_add_tail(&ext_l->list, &posixlks_released);
+                    continue;
+                }
+                __destroy_lock(ext_l);
+            }
+        }
+
+        list_for_each_entry_safe(dom, dom_tmp, &pl_inode->dom_list, inode_list)
+        {
+            if (!list_empty(&dom->inodelk_list)) {
+                gf_log(this->name, GF_LOG_WARNING,
+                       "Pending inode locks found, releasing.");
+
+                list_for_each_entry_safe(ino_l, ino_tmp, &dom->inodelk_list,
+                                         list)
+                {
+                    __delete_inode_lock(ino_l);
+                    __pl_inodelk_unref(ino_l);
+                }
+
+                list_splice_init(&dom->blocked_inodelks, &inodelks_released);
+            }
+            if (!list_empty(&dom->entrylk_list)) {
+                gf_log(this->name, GF_LOG_WARNING,
+                       "Pending entry locks found, releasing.");
+
+                list_for_each_entry_safe(entry_l, entry_tmp, &dom->entrylk_list,
+                                         domain_list)
+                {
+                    list_del_init(&entry_l->domain_list);
+
+                    GF_FREE((char *)entry_l->basename);
+                    GF_FREE(entry_l->connection_id);
+                    GF_FREE(entry_l);
+                }
+
+                list_splice_init(&dom->blocked_entrylks, &entrylks_released);
+            }
+
+            list_del(&dom->inode_list);
+            gf_log("posix-locks", GF_LOG_TRACE, " Cleaning up domain: %s",
+                   dom->domain);
+            GF_FREE((char *)(dom->domain));
+            GF_FREE(dom);
+        }
+    }
+    pthread_mutex_unlock(&pl_inode->mutex);
+    /* Fail the parked requests (op_ret -1, op_errno 0) outside the mutex. */
+    if (!list_empty(&posixlks_released)) {
+        list_for_each_entry_safe(ext_l, ext_tmp, &posixlks_released, list)
+        {
+            STACK_UNWIND_STRICT(lk, ext_l->frame, -1, 0, &ext_l->user_flock,
+                                NULL);
+            __destroy_lock(ext_l);
+        }
+    }
+
+    if (!list_empty(&inodelks_released)) {
+        list_for_each_entry_safe(ino_l, ino_tmp, &inodelks_released,
+                                 blocked_locks)
+        {
+            STACK_UNWIND_STRICT(inodelk, ino_l->frame, -1, 0, NULL);
+            __pl_inodelk_unref(ino_l);
+        }
+    }
+
+    if (!list_empty(&entrylks_released)) {
+        list_for_each_entry_safe(entry_l, entry_tmp, &entrylks_released,
+                                 blocked_locks)
+        {
+            STACK_UNWIND_STRICT(entrylk, entry_l->frame, -1, 0, NULL);
+            GF_FREE((char *)entry_l->basename);
+            GF_FREE(entry_l->connection_id);
+            GF_FREE(entry_l);
+        }
+    }
+
+    pthread_mutex_destroy(&pl_inode->mutex);
+
+    GF_FREE(pl_inode);
+
+    return 0;
 }
 
+int
+pl_release(xlator_t *this, fd_t *fd)
+{
+    pl_inode_t *pl_inode = NULL;
+    uint64_t tmp_pl_inode = 0;
+    int ret = -1;
+    uint64_t tmp = 0;
+    pl_fdctx_t *fdctx = NULL;
+    /* Release every lock taken through fd, then free the fd's pl_fdctx_t. */
+    if (fd == NULL) {
+        goto out;
+    }
+
+    ret = inode_ctx_get(fd->inode, this, &tmp_pl_inode);
+    if (ret != 0)
+        goto clean; /* no inode ctx: skip lock cleanup, still drop fdctx */
+
+    pl_inode = (pl_inode_t *)(long)tmp_pl_inode;
+
+    pl_trace_release(this, fd);
+
+    gf_log(this->name, GF_LOG_TRACE, "Releasing all locks with fd %p", fd);
+
+    delete_locks_of_fd(this, pl_inode, fd);
+    pl_update_refkeeper(this, fd->inode);
+
+clean:
+    ret = fd_ctx_del(fd, this, &tmp);
+    if (ret) {
+        gf_log(this->name, GF_LOG_DEBUG, "Could not get fdctx");
+        goto out;
+    }
+
+    fdctx = (pl_fdctx_t *)(long)tmp;
+
+    GF_FREE(fdctx);
+out:
+    return ret; /* -1 for a NULL fd, otherwise the fd_ctx_del() result */
+}
 
 int
-pl_readv (call_frame_t *frame, xlator_t *this,
-	  fd_t *fd, size_t size, off_t offset)
+pl_releasedir(xlator_t *this, fd_t *fd)
+{
+    int ret = -1;
+    uint64_t tmp = 0;
+    pl_fdctx_t *fdctx = NULL;
+    /* Remove this xlator's fdctx from the directory fd and free it. */
+    if (fd == NULL) {
+        goto out;
+    }
+
+    ret = fd_ctx_del(fd, this, &tmp);
+    if (ret) {
+        gf_log(this->name, GF_LOG_DEBUG, "Could not get fdctx");
+        goto out;
+    }
+
+    fdctx = (pl_fdctx_t *)(long)tmp;
+
+    GF_FREE(fdctx);
+out:
+    return ret; /* -1 for a NULL fd, otherwise the fd_ctx_del() result */
+}
+
+static int32_t
+pl_request_link_count(dict_t **pxdata)
 {
-	posix_locks_private_t *priv = NULL;
-	pl_inode_t            *pl_inode = NULL;
-	pl_rw_req_t           *rw = NULL;
-	posix_lock_t           region = {.list = {0, }, };
-	int                    op_ret = 0;
-	int                    op_errno = 0;
-	char                   allowable = 0;
+    dict_t *xdata;
+
+    xdata = *pxdata;
+    if (xdata == NULL) {
+        xdata = dict_new();
+        if (xdata == NULL) {
+            return ENOMEM;
+        }
+    } else {
+        dict_ref(xdata);
+    }
+
+    if (dict_set_uint32(xdata, GET_LINK_COUNT, 0) != 0) {
+        dict_unref(xdata);
+        return ENOMEM;
+    }
+
+    *pxdata = xdata;
+
+    return 0;
+}
+
+static int32_t
+pl_check_link_count(dict_t *xdata)
+{
+    int32_t count;
 
+    /* In case we are unable to read the link count from xdata, we take a
+     * conservative approach and return -2, which will prevent the inode from
+     * being considered deleted. In fact it will cause link tracking for this
+     * inode to be disabled completely to avoid races. */
 
-	priv = this->private;
-	pl_inode = pl_inode_get (this, fd->inode);
+    if (xdata == NULL) {
+        return -2;
+    }
 
-	if (priv->mandatory && pl_inode->mandatory) {
-		region.fl_start   = offset;
-		region.fl_end     = offset + size - 1;
-		region.transport  = frame->root->trans;
-		region.client_pid = frame->root->pid;
-    
-		pthread_mutex_lock (&pl_inode->mutex);
-		{
-			allowable = __rw_allowable (pl_inode, &region,
-						    GF_FOP_READ);
-			if (allowable)
-				goto unlock;
+    if (dict_get_int32(xdata, GET_LINK_COUNT, &count) != 0) {
+        return -2;
+    }
 
-			if (fd->flags & O_NONBLOCK) {
-				gf_log (this->name, GF_LOG_DEBUG,
-					"returning EWOULDBLOCK");
-				op_errno = EWOULDBLOCK;
-				op_ret = -1;
-				goto unlock;
-			}
+    return count;
+}
 
-			rw = CALLOC (1, sizeof (*rw));
-			if (!rw) {
-				gf_log (this->name, GF_LOG_ERROR,
-					"out of memory :(");
-				op_errno = ENOMEM;
-				op_ret = -1;
-				goto unlock;
-			}
+int32_t
+pl_lookup_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret,
+              int32_t op_errno, inode_t *inode, struct iatt *buf, dict_t *xdata,
+              struct iatt *postparent)
+{
+    pl_inode_t *pl_inode;
+    /* On success, cache the link count reported in xdata if still unknown. */
+    if (op_ret >= 0) {
+        pl_inode = pl_inode_get(this, inode, NULL);
+        if (pl_inode == NULL) {
+            PL_STACK_UNWIND(lookup, xdata, frame, -1, ENOMEM, NULL, NULL, NULL,
+                            NULL);
+            return 0;
+        }
+
+        pthread_mutex_lock(&pl_inode->mutex);
+
+        /* We only update the link count if we previously didn't know it.
+         * Doing it always can lead to races since lookup is not executed
+         * atomically most of the times. */
+        if (pl_inode->links == -2) {
+            pl_inode->links = pl_check_link_count(xdata);
+            if (buf->ia_type == IA_IFDIR) {
+                /* Directories have at least 2 links; decrement so they
+                 * behave like regular files. NOTE(review): this also turns
+                 * an unknown count (-2) into -3 - confirm that is intended. */
+                pl_inode->links--;
+            }
+        }
+
+        pthread_mutex_unlock(&pl_inode->mutex);
+    }
+
+    PL_STACK_UNWIND(lookup, xdata, frame, op_ret, op_errno, inode, buf, xdata,
+                    postparent);
+    return 0;
+}
 
-			rw->stub = fop_readv_stub (frame, pl_readv_cont,
-						   fd, size, offset);
-			if (!rw->stub) {
-				gf_log (this->name, GF_LOG_ERROR,
-					"out of memory :(");
-				op_errno = ENOMEM;
-				op_ret = -1;
-				free (rw);
-				goto unlock;
-			}
+int32_t
+pl_lookup(call_frame_t *frame, xlator_t *this, loc_t *loc, dict_t *xdata)
+{
+    int32_t error;
+    /* Adds GET_LINK_COUNT to the request; on success xdata holds our ref. */
+    error = pl_request_link_count(&xdata);
+    if (error == 0) {
+        PL_LOCAL_GET_REQUESTS(frame, this, xdata, ((fd_t *)NULL), loc, NULL);
+        STACK_WIND(frame, pl_lookup_cbk, FIRST_CHILD(this),
+                   FIRST_CHILD(this)->fops->lookup, loc, xdata);
+        dict_unref(xdata); /* drop the ref taken by pl_request_link_count() */
+    } else {
+        STACK_UNWIND_STRICT(lookup, frame, -1, error, NULL, NULL, NULL, NULL);
+    }
+    return 0;
+}
+
+int32_t
+pl_fstat_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret,
+             int32_t op_errno, struct iatt *buf, dict_t *xdata)
+{ /* fstat callback: relays the child's reply through PL_STACK_UNWIND */
+    PL_STACK_UNWIND(fstat, xdata, frame, op_ret, op_errno, buf, xdata);
+    return 0;
+}
 
-			rw->region = region;
+int32_t
+pl_fstat(call_frame_t *frame, xlator_t *this, fd_t *fd, dict_t *xdata)
+{ /* fstat fop: PL_LOCAL_GET_REQUESTS on xdata/fd, then wind to first child */
+    PL_LOCAL_GET_REQUESTS(frame, this, xdata, fd, NULL, NULL);
+    STACK_WIND(frame, pl_fstat_cbk, FIRST_CHILD(this),
+               FIRST_CHILD(this)->fops->fstat, fd, xdata);
+    return 0;
+}
 
-			list_add_tail (&rw->list, &pl_inode->rw_list);
-		}
-	unlock:
-		pthread_mutex_unlock (&pl_inode->mutex);
+int
+pl_readdirp_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int op_ret,
+                int op_errno, gf_dirent_t *entries, dict_t *xdata)
+{
+    pl_local_t *local = NULL;
+    gf_dirent_t *entry = NULL;
 
-		goto unwind;
-	}
+    if (op_ret <= 0)
+        goto unwind;
 
+    local = frame->local;
+    if (!local)
+        goto unwind;
 
-	STACK_WIND (frame, pl_readv_cbk, 
-		    FIRST_CHILD (this), FIRST_CHILD (this)->fops->readv,
-		    fd, size, offset);
-	return 0;
+    list_for_each_entry(entry, &entries->list, list)
+    {
+        pl_set_xdata_response(this, local, local->fd->inode, entry->inode,
+                              entry->d_name, entry->dict, 0);
+    }
 
 unwind:
-	if (op_ret == -1)
-		STACK_UNWIND (frame, -1, op_errno, NULL, 0, NULL);
+    PL_STACK_UNWIND(readdirp, xdata, frame, op_ret, op_errno, entries, xdata);
+
+    return 0;
+}
+
+int
+pl_readdirp(call_frame_t *frame, xlator_t *this, fd_t *fd, size_t size,
+            off_t offset, dict_t *xdata)
+{ /* readdirp fop: PL_LOCAL_GET_REQUESTS on xdata/fd, then wind to child */
+    PL_LOCAL_GET_REQUESTS(frame, this, xdata, fd, NULL, NULL);
+    STACK_WIND(frame, pl_readdirp_cbk, FIRST_CHILD(this),
+               FIRST_CHILD(this)->fops->readdirp, fd, size, offset, xdata);
+
+    return 0;
+}
 
-	return 0;
+lock_migration_info_t *
+gf_mig_info_for_lock(posix_lock_t *lock)
+{
+    lock_migration_info_t *new = GF_MALLOC(sizeof(lock_migration_info_t),
+                                           gf_common_mt_lock_mig);
+    if (new == NULL) {
+        goto out;
+    }
+    /* Build a standalone migration record from lock; NULL if alloc fails. */
+    INIT_LIST_HEAD(&new->list);
+
+    posix_lock_to_flock(lock, &new->flock);
+
+    new->lk_flags = lock->lk_flags;
+    /* NOTE(review): the gf_strdup() result is not checked for NULL here. */
+    new->client_uid = gf_strdup(lock->client_uid);
+
+out:
+    return new;
 }
 
+int
+pl_fill_active_locks(pl_inode_t *pl_inode, lock_migration_info_t *lmi)
+{
+    posix_lock_t *temp = NULL;
+    lock_migration_info_t *newlock = NULL;
+    int count = 0;
+    /* Returns the number of records appended to lmi, or -1 on alloc failure. */
+    pthread_mutex_lock(&pl_inode->mutex);
+    {
+        if (list_empty(&pl_inode->ext_list)) {
+            count = 0;
+            goto unlock;
+        }
+
+        list_for_each_entry(temp, &pl_inode->ext_list, list)
+        {
+            if (temp->blocked)
+                continue; /* only non-blocked (active) locks are copied */
+
+            newlock = gf_mig_info_for_lock(temp);
+            if (!newlock) {
+                pthread_mutex_unlock(&pl_inode->mutex); /* 'out' skips unlock */
+                gf_msg(THIS->name, GF_LOG_ERROR, 0, 0, "lock_dup failed");
+                count = -1; /* records already added stay on lmi->list */
+                goto out;
+            }
+
+            list_add_tail(&newlock->list, &lmi->list);
+            count++;
+        }
+    }
+
+unlock:
+    pthread_mutex_unlock(&pl_inode->mutex);
+out:
+    return count;
+}
+
+/* This function reads only active locks */
+static int
+pl_getactivelk(call_frame_t *frame, xlator_t *this, loc_t *loc, dict_t *xdata)
+{
+    pl_inode_t *pl_inode = NULL;
+    lock_migration_info_t locks;
+    int op_ret = 0;
+    int op_errno = 0;
+    int count = 0;
+    /* locks is a stack-allocated list head that collects the copied locks. */
+    INIT_LIST_HEAD(&locks.list);
+
+    pl_inode = pl_inode_get(this, loc->inode, NULL);
+    if (!pl_inode) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, 0, "pl_inode_get failed");
+
+        op_ret = -1;
+        op_errno = ENOMEM;
+        goto out;
+    }
+
+    count = pl_fill_active_locks(pl_inode, &locks);
+    /* NOTE(review): count is -1 on failure but op_errno stays 0 - confirm. */
+    op_ret = count;
+
+out:
+    STACK_UNWIND_STRICT(getactivelk, frame, op_ret, op_errno, &locks, NULL);
+    /* The reply has been sent; release whatever was gathered on the list. */
+    gf_free_mig_locks(&locks);
+
+    return 0;
+}
+
+void
+pl_metalk_unref(pl_meta_lock_t *lock)
+{ /* drop one reference; at zero free client_uid and the lock itself */
+    lock->ref--; /* plain decrement, no locking is done in this function */
+    if (!lock->ref) {
+        GF_FREE(lock->client_uid);
+        GF_FREE(lock);
+    }
+}
+
+void
+__pl_metalk_ref(pl_meta_lock_t *lock)
+{ /* take one reference on the meta lock (plain increment, no locking) */
+    lock->ref++;
+}
+
+pl_meta_lock_t *
+new_meta_lock(call_frame_t *frame, xlator_t *this)
+{ /* allocate a meta lock owned by the frame's client; NULL on ENOMEM */
+    pl_meta_lock_t *lock = GF_CALLOC(1, sizeof(*lock),
+                                     gf_locks_mt_pl_meta_lock_t);
+
+    if (!lock) {
+        gf_msg(this->name, GF_LOG_ERROR, ENOMEM, 0,
+               "mem allocation"
+               " failed for meta lock");
+        goto out;
+    }
+
+    INIT_LIST_HEAD(&lock->list);
+    INIT_LIST_HEAD(&lock->client_list);
+    /* Keep a private copy of the client uid; freed in pl_metalk_unref(). */
+    lock->client_uid = gf_strdup(frame->root->client->client_uid);
+    if (!lock->client_uid) {
+        gf_msg(this->name, GF_LOG_ERROR, ENOMEM, 0,
+               "mem allocation"
+               " failed for client_uid");
+        GF_FREE(lock);
+        lock = NULL;
+        goto out;
+    }
+    /* The zeroed ref count becomes 1: the caller owns this reference. */
+    __pl_metalk_ref(lock);
+out:
+    return lock;
+}
 
 int
-pl_writev_cont (call_frame_t *frame, xlator_t *this, fd_t *fd,
-		struct iovec *vector, int count, off_t offset)
+pl_insert_metalk(pl_inode_t *pl_inode, pl_ctx_t *ctx, pl_meta_lock_t *lock)
 {
-	STACK_WIND (frame, pl_writev_cbk,
-		    FIRST_CHILD (this), FIRST_CHILD (this)->fops->writev,
-		    fd, vector, count, offset);
+    int ret = 0;
+
+    if (!pl_inode || !ctx || !lock) {
+        gf_msg(THIS->name, GF_LOG_INFO, 0, 0, "NULL parameter");
+        ret = -1;
+        goto out;
+    }
+
+    lock->pl_inode = pl_inode;
+
+    /* refer function pl_inode_setlk for more info for this ref.
+     * This should be unrefed on meta-unlock triggered by rebalance or
+     * in cleanup with client disconnect*/
+    /*TODO: unref this in  cleanup code for disconnect and meta-unlock*/
+    pl_inode->inode = inode_ref(pl_inode->inode);
+
+    /* NOTE:In case of a client-server disconnect we need to cleanup metalk.
+     * Hence, adding the metalk to pl_ctx_t as well. The mutex lock order
+     * should always be on ctx and then on pl_inode*/
+
+    pthread_mutex_lock(&ctx->lock);
+    {
+        pthread_mutex_lock(&pl_inode->mutex);
+        {
+            list_add_tail(&lock->list, &pl_inode->metalk_list);
+        }
+        pthread_mutex_unlock(&pl_inode->mutex);
+
+        list_add_tail(&lock->client_list, &ctx->metalk_list);
+    }
+    pthread_mutex_unlock(&ctx->lock);
 
-	return 0;
+out:
+    return ret;
 }
 
+int32_t
+pl_metalk(call_frame_t *frame, xlator_t *this, inode_t *inode)
+{
+    pl_inode_t *pl_inode = NULL;
+    int ret = 0;
+    pl_meta_lock_t *reqlk = NULL;
+    pl_ctx_t *ctx = NULL;
+    /* Grant the one per-inode meta lock to a rebalance client: 0 or -1. */
+    pl_inode = pl_inode_get(this, inode, NULL);
+    if (!pl_inode) {
+        gf_msg(this->name, GF_LOG_ERROR, ENOMEM, 0,
+               "pl_inode mem allocation failedd");
+
+        ret = -1;
+        goto out;
+    }
+
+    /* Non rebalance process trying to do metalock */
+    if (frame->root->pid != GF_CLIENT_PID_DEFRAG) {
+        ret = -1;
+        goto out;
+    }
+
+    /* Note: In the current scheme of glusterfs where lock migration is
+     * experimental, (ideally) the rebalance process which is migrating
+     * the file should request for a metalock. Hence, the metalock count
+     * should not be more than one for an inode. In future, if there is a
+     * need for meta-lock from other clients, the following block can be
+     * removed.
+     *
+     * Since pl_metalk is called as part of setxattr operation, any client
+     * process(non-rebalance) residing outside trusted network can exhaust
+     * memory of the server node by issuing setxattr repetitively on the
+     * metalock key. The following code makes sure that more than
+     * one metalock cannot be granted on an inode*/
+    pthread_mutex_lock(&pl_inode->mutex);
+    {
+        if (pl_metalock_is_active(pl_inode)) {
+            ret = -1;
+        }
+    }
+    pthread_mutex_unlock(&pl_inode->mutex);
+
+    if (ret == -1) {
+        gf_msg(this->name, GF_LOG_WARNING, EINVAL, 0,
+               "More than one meta-lock cannot be granted on"
+               " the inode");
+        goto out;
+    }
+
+    if (frame->root->client) {
+        ctx = pl_ctx_get(frame->root->client, this);
+        if (!ctx) {
+            gf_msg(this->name, GF_LOG_ERROR, 0, 0, "pl_ctx_get failed");
+
+            ret = -1;
+            goto out;
+        }
+    } else {
+        gf_msg(this->name, GF_LOG_INFO, 0, 0,
+               "frame-root-client "
+               "is NULL");
+
+        ret = -1;
+        goto out;
+    }
+
+    reqlk = new_meta_lock(frame, this);
+    if (!reqlk) {
+        ret = -1;
+        goto out;
+    }
+
+    ret = pl_insert_metalk(pl_inode, ctx, reqlk);
+    if (ret < 0) {
+        pl_metalk_unref(reqlk); /* not inserted: drop the creation ref */
+    }
+
+out:
+    return ret;
+}
+
+static void
+__unwind_queued_locks(pl_inode_t *pl_inode, struct list_head *tmp_list)
+{ /* only moves pl_inode->queued_locks onto tmp_list; no unwind happens here */
+    if (list_empty(&pl_inode->queued_locks))
+        return;
+
+    list_splice_init(&pl_inode->queued_locks, tmp_list);
+}
+
+static void
+__unwind_blocked_locks(pl_inode_t *pl_inode, struct list_head *tmp_list)
+{
+    posix_lock_t *lock = NULL;
+    posix_lock_t *tmp = NULL;
+    /* Moves selected ext_list locks onto tmp_list; no unwind happens here. */
+    if (list_empty(&pl_inode->ext_list))
+        return;
+
+    list_for_each_entry_safe(lock, tmp, &pl_inode->ext_list, list)
+    {
+        if (!lock->blocking)
+            continue;
+        /* NOTE(review): filters on ->blocking, not ->blocked - confirm. */
+        list_del_init(&lock->list);
+        list_add_tail(&lock->list, tmp_list);
+    }
+}
 
 int
-pl_writev (call_frame_t *frame, xlator_t *this, fd_t *fd,
-	   struct iovec *vector, int32_t count, off_t offset)
+pl_metaunlock(call_frame_t *frame, xlator_t *this, inode_t *inode, dict_t *dict)
 {
-	posix_locks_private_t *priv = NULL;
-	pl_inode_t            *pl_inode = NULL;
-	pl_rw_req_t           *rw = NULL;
-	posix_lock_t           region = {.list = {0, }, };
-	int                    op_ret = 0;
-	int                    op_errno = 0;
-	char                   allowable = 0;
+    pl_inode_t *pl_inode = NULL;
+    int ret = 0;
+    pl_meta_lock_t *meta_lock = NULL;
+    pl_meta_lock_t *tmp_metalk = NULL;
+    pl_ctx_t *ctx = NULL;
+    posix_lock_t *posix_lock = NULL;
+    posix_lock_t *tmp_posixlk = NULL;
+    struct list_head tmp_posixlk_list;
+
+    INIT_LIST_HEAD(&tmp_posixlk_list);
+
+    if (frame->root->client) {
+        ctx = pl_ctx_get(frame->root->client, this);
+        if (!ctx) {
+            gf_msg(this->name, GF_LOG_ERROR, 0, 0, "pl_ctx_get failed");
+
+            ret = -1;
+            goto out;
+        }
+    } else {
+        gf_msg(this->name, GF_LOG_ERROR, 0, 0,
+               "frame-root-client is "
+               "NULL");
+        ret = -1;
+        goto out;
+    }
+
+    pl_inode = pl_inode_get(this, inode, NULL);
+    if (!pl_inode) {
+        ret = -1;
+        goto out;
+    }
+
+    pthread_mutex_lock(&ctx->lock);
+    {
+        pthread_mutex_lock(&pl_inode->mutex);
+        {
+            /* Unwind queued locks regardless of migration status */
+            __unwind_queued_locks(pl_inode, &tmp_posixlk_list);
+
+            /* Unwind blocked locks only for successful migration */
+            if (dict_get_sizen(dict, "status")) {
+                /* unwind all blocked locks */
+                __unwind_blocked_locks(pl_inode, &tmp_posixlk_list);
+            }
+
+            /* unlock metalk */
+            /* if this list is empty then pl_inode->metalk_list
+             * should be empty too. meta lock should in all cases
+             * be added/removed from both pl_ctx_t and pl_inode */
+
+            if (list_empty(&ctx->metalk_list))
+                goto unlock;
+
+            list_for_each_entry_safe(meta_lock, tmp_metalk, &ctx->metalk_list,
+                                     client_list)
+            {
+                list_del_init(&meta_lock->client_list);
+
+                pl_inode = meta_lock->pl_inode;
+
+                list_del_init(&meta_lock->list);
+
+                pl_metalk_unref(meta_lock);
+
+                /* The corresponding ref is taken in
+                 * pl_insert_metalk*/
+                inode_unref(pl_inode->inode);
+            }
+
+            if (dict_get_sizen(dict, "status"))
+                pl_inode->migrated = _gf_true;
+            else
+                pl_inode->migrated = _gf_false;
+        }
+    unlock:
+
+        pthread_mutex_unlock(&pl_inode->mutex);
+    }
+    pthread_mutex_unlock(&ctx->lock);
+
+out:
+    list_for_each_entry_safe(posix_lock, tmp_posixlk, &tmp_posixlk_list, list)
+    {
+        list_del_init(&posix_lock->list);
+
+        STACK_UNWIND_STRICT(lk, posix_lock->frame, -1, EREMOTE,
+                            &posix_lock->user_flock, NULL);
 
+        __destroy_lock(posix_lock);
+    }
 
-	priv = this->private;
-	pl_inode = pl_inode_get (this, fd->inode);
+    return ret;
+}
 
-	if (priv->mandatory && pl_inode->mandatory) {
-		region.fl_start   = offset;
-		region.fl_end     = offset + iov_length (vector, count) - 1;
-		region.transport  = frame->root->trans;
-		region.client_pid = frame->root->pid;
-    
-		pthread_mutex_lock (&pl_inode->mutex);
-		{
-			allowable = __rw_allowable (pl_inode, &region,
-						    GF_FOP_WRITE);
-			if (allowable)
-				goto unlock;
+int32_t
+pl_setxattr_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+                int32_t op_ret, int32_t op_errno, dict_t *xdata)
+{
+    pl_local_t *local = NULL;
+    pl_inode_t *pl_inode = NULL;
+    local = frame->local;
+    if (local && local->update_mlock_enforced_flag && op_ret != -1) {
+        pl_inode = pl_inode_get(this, local->inode, NULL);
+        if (!pl_inode) {
+            op_ret = -1;
+            op_errno = ENOMEM;
+            goto unwind;
+        }
+
+        pthread_mutex_lock(&pl_inode->mutex);
+        {
+            while (pl_inode->fop_wind_count > 0) {
+                gf_msg(this->name, GF_LOG_INFO, 0, 0,
+                       "waiting for existing fops (count %d) to drain for "
+                       "gfid %s",
+                       pl_inode->fop_wind_count, uuid_utoa(pl_inode->gfid));
+                pthread_cond_wait(&pl_inode->check_fop_wind_count,
+                                  &pl_inode->mutex);
+            }
+            pl_inode->mlock_enforced = _gf_true;
+            pl_inode->check_mlock_info = _gf_false;
+        }
+        pthread_mutex_unlock(&pl_inode->mutex);
+    }
 
-			if (fd->flags & O_NONBLOCK) {
-				gf_log (this->name, GF_LOG_DEBUG,
-					"returning EWOULDBLOCK");
-				op_errno = EWOULDBLOCK;
-				op_ret = -1;
-				goto unlock;
-			}
+unwind:
+    PL_STACK_UNWIND_FOR_CLIENT(setxattr, xdata, frame, op_ret, op_errno, xdata);
+    return 0;
+}
 
-			rw = CALLOC (1, sizeof (*rw));
-			if (!rw) {
-				gf_log (this->name, GF_LOG_ERROR,
-					"out of memory :(");
-				op_errno = ENOMEM;
-				op_ret = -1;
-				goto unlock;
-			}
+int32_t
+pl_setxattr(call_frame_t *frame, xlator_t *this, loc_t *loc, dict_t *dict,
+            int flags, dict_t *xdata)
+{
+    int op_ret = 0;
+    int op_errno = EINVAL;
+    dict_t *xdata_rsp = NULL;
+    char *name = NULL;
+    posix_locks_private_t *priv = this->private;
 
-			rw->stub = fop_writev_stub (frame, pl_writev_cont,
-						    fd, vector, count, offset);
-			if (!rw->stub) {
-				gf_log (this->name, GF_LOG_ERROR,
-					"out of memory :(");
-				op_errno = ENOMEM;
-				op_ret = -1;
-				free (rw);
-				goto unlock;
-			}
+    PL_LOCAL_GET_REQUESTS(frame, this, xdata, ((fd_t *)NULL), loc, NULL);
 
-			rw->region = region;
+    if (dict_get_sizen(dict, GF_META_LOCK_KEY)) {
+        op_ret = pl_metalk(frame, this, loc->inode);
 
-			list_add_tail (&rw->list, &pl_inode->rw_list);
-		}
-	unlock:
-		pthread_mutex_unlock (&pl_inode->mutex);
+    } else if (dict_get_sizen(dict, GF_META_UNLOCK_KEY)) {
+        op_ret = pl_metaunlock(frame, this, loc->inode, dict);
+    } else {
+        goto usual;
+    }
 
-		goto unwind;
-	}
+    PL_STACK_UNWIND_FOR_CLIENT(setxattr, xdata_rsp, frame, op_ret, op_errno,
+                               xdata_rsp);
+    return 0;
 
+usual:
+    PL_CHECK_LOCK_ENFORCE_KEY(frame, dict, name, this, loc, ((fd_t *)NULL),
+                              priv);
 
-	STACK_WIND (frame, pl_writev_cbk, 
-		    FIRST_CHILD (this), FIRST_CHILD (this)->fops->writev,
-		    fd, vector, count, offset);
-	return 0;
+    STACK_WIND(frame, pl_setxattr_cbk, FIRST_CHILD(this),
+               FIRST_CHILD(this)->fops->setxattr, loc, dict, flags, xdata);
+    return 0;
 
 unwind:
-	if (op_ret == -1)
-		STACK_UNWIND (frame, -1, op_errno, NULL, 0, NULL);
+    PL_STACK_UNWIND_FOR_CLIENT(setxattr, xdata, frame, op_ret, op_errno, xdata);
+
+    return 0;
+}
+
+void
+pl_dump_lock(char *str, int size, struct gf_flock *flock, gf_lkowner_t *owner,
+             void *trans, char *conn_id, time_t *granted_time,
+             time_t *blkd_time, gf_boolean_t active)
+{ /* formats a one-lock description into str (at most size bytes) */
+    char *type_str = NULL;
+    char granted[GF_TIMESTR_SIZE] = {
+        0,
+    };
+    char blocked[GF_TIMESTR_SIZE] = {
+        0,
+    };
+    /* Timestamps that were not supplied stay as empty strings. */
+    if (granted_time)
+        gf_time_fmt(granted, sizeof(granted), *granted_time, gf_timefmt_FT);
+    if (blkd_time)
+        gf_time_fmt(blocked, sizeof(blocked), *blkd_time, gf_timefmt_FT);
+    switch (flock->l_type) {
+        case F_RDLCK:
+            type_str = "READ";
+            break;
+        case F_WRLCK:
+            type_str = "WRITE";
+            break;
+        case F_UNLCK:
+            type_str = "UNLOCK";
+            break;
+        default:
+            type_str = "UNKNOWN";
+            break;
+    }
+    /* active: granted-only format when *blkd_time is 0, else blocked+granted */
+    if (active) {
+        if (blkd_time && *blkd_time == 0) {
+            snprintf(str, size, RANGE_GRNTD_FMT, type_str, flock->l_whence,
+                     (unsigned long long)flock->l_start,
+                     (unsigned long long)flock->l_len,
+                     (unsigned long long)flock->l_pid, lkowner_utoa(owner),
+                     trans, conn_id, granted);
+        } else {
+            snprintf(str, size, RANGE_BLKD_GRNTD_FMT, type_str, flock->l_whence,
+                     (unsigned long long)flock->l_start,
+                     (unsigned long long)flock->l_len,
+                     (unsigned long long)flock->l_pid, lkowner_utoa(owner),
+                     trans, conn_id, blocked, granted);
+        }
+    } else { /* not active: blocked-only format */
+        snprintf(str, size, RANGE_BLKD_FMT, type_str, flock->l_whence,
+                 (unsigned long long)flock->l_start,
+                 (unsigned long long)flock->l_len,
+                 (unsigned long long)flock->l_pid, lkowner_utoa(owner), trans,
+                 conn_id, blocked);
+    }
+}
+
+void
+__dump_entrylks(pl_inode_t *pl_inode)
+{
+    pl_dom_list_t *dom = NULL;
+    pl_entry_lock_t *lock = NULL;
+    char blocked[GF_TIMESTR_SIZE] = {
+        0,
+    };
+    char granted[GF_TIMESTR_SIZE] = {
+        0,
+    };
+    int count = 0;
+    char key[GF_DUMP_MAX_BUF_LEN] = {
+        0,
+    };
+    char *k = "xlator.feature.locks.lock-dump.domain.entrylk";
+
+    char tmp[4098];
+
+    list_for_each_entry(dom, &pl_inode->dom_list, inode_list)
+    {
+        count = 0;
+
+        gf_proc_dump_build_key(key, "lock-dump.domain", "domain");
+        gf_proc_dump_write(key, "%s", dom->domain);
+
+        list_for_each_entry(lock, &dom->entrylk_list, domain_list)
+        {
+            gf_time_fmt(granted, sizeof(granted), lock->granted_time,
+                        gf_timefmt_FT);
+            gf_proc_dump_build_key(key, k, "entrylk[%d](ACTIVE)", count);
+            if (lock->blkd_time == 0) {
+                snprintf(tmp, sizeof(tmp), ENTRY_GRNTD_FMT,
+                         lock->type == ENTRYLK_RDLCK ? "ENTRYLK_RDLCK"
+                                                     : "ENTRYLK_WRLCK",
+                         lock->basename, (unsigned long long)lock->client_pid,
+                         lkowner_utoa(&lock->owner), lock->client,
+                         lock->connection_id, granted);
+            } else {
+                gf_time_fmt(blocked, sizeof(blocked), lock->blkd_time,
+                            gf_timefmt_FT);
+                snprintf(tmp, sizeof(tmp), ENTRY_BLKD_GRNTD_FMT,
+                         lock->type == ENTRYLK_RDLCK ? "ENTRYLK_RDLCK"
+                                                     : "ENTRYLK_WRLCK",
+                         lock->basename, (unsigned long long)lock->client_pid,
+                         lkowner_utoa(&lock->owner), lock->client,
+                         lock->connection_id, blocked, granted);
+            }
+
+            gf_proc_dump_write(key, "%s", tmp);
+
+            count++;
+        }
+
+        list_for_each_entry(lock, &dom->blocked_entrylks, blocked_locks)
+        {
+            gf_time_fmt(blocked, sizeof(blocked), lock->blkd_time,
+                        gf_timefmt_FT);
+
+            gf_proc_dump_build_key(key, k, "entrylk[%d](BLOCKED)", count);
+            snprintf(
+                tmp, sizeof(tmp), ENTRY_BLKD_FMT,
+                lock->type == ENTRYLK_RDLCK ? "ENTRYLK_RDLCK" : "ENTRYLK_WRLCK",
+                lock->basename, (unsigned long long)lock->client_pid,
+                lkowner_utoa(&lock->owner), lock->client, lock->connection_id,
+                blocked);
+
+            gf_proc_dump_write(key, "%s", tmp);
+
+            count++;
+        }
+    }
+}
+
+void
+dump_entrylks(pl_inode_t *pl_inode)
+{
+    pthread_mutex_lock(&pl_inode->mutex);
+    {
+        __dump_entrylks(pl_inode);
+    }
+    pthread_mutex_unlock(&pl_inode->mutex);
+}
+
+void
+__dump_inodelks(pl_inode_t *pl_inode)
+{
+    pl_dom_list_t *dom = NULL;
+    pl_inode_lock_t *lock = NULL;
+    int count = 0;
+    char key[GF_DUMP_MAX_BUF_LEN];
+
+    char tmp[4098];
+
+    list_for_each_entry(dom, &pl_inode->dom_list, inode_list)
+    {
+        count = 0;
+
+        gf_proc_dump_build_key(key, "lock-dump.domain", "domain");
+        gf_proc_dump_write(key, "%s", dom->domain);
+
+        list_for_each_entry(lock, &dom->inodelk_list, list)
+        {
+            gf_proc_dump_build_key(key, "inodelk", "inodelk[%d](ACTIVE)",
+                                   count);
+
+            SET_FLOCK_PID(&lock->user_flock, lock);
+            pl_dump_lock(tmp, sizeof(tmp), &lock->user_flock, &lock->owner,
+                         lock->client, lock->connection_id, &lock->granted_time,
+                         &lock->blkd_time, _gf_true);
+            gf_proc_dump_write(key, "%s", tmp);
+
+            count++;
+        }
+
+        list_for_each_entry(lock, &dom->blocked_inodelks, blocked_locks)
+        {
+            gf_proc_dump_build_key(key, "inodelk", "inodelk[%d](BLOCKED)",
+                                   count);
+            SET_FLOCK_PID(&lock->user_flock, lock);
+            pl_dump_lock(tmp, sizeof(tmp), &lock->user_flock, &lock->owner,
+                         lock->client, lock->connection_id, 0, &lock->blkd_time,
+                         _gf_false);
+            gf_proc_dump_write(key, "%s", tmp);
+
+            count++;
+        }
+    }
+}
+
+void
+dump_inodelks(pl_inode_t *pl_inode)
+{
+    pthread_mutex_lock(&pl_inode->mutex);
+    {
+        __dump_inodelks(pl_inode);
+    }
+    pthread_mutex_unlock(&pl_inode->mutex);
+}
+
+void
+__dump_posixlks(pl_inode_t *pl_inode)
+{
+    posix_lock_t *lock = NULL;
+    int count = 0;
+    char key[GF_DUMP_MAX_BUF_LEN];
+
+    char tmp[4098];
+
+    list_for_each_entry(lock, &pl_inode->ext_list, list)
+    {
+        SET_FLOCK_PID(&lock->user_flock, lock);
+        gf_proc_dump_build_key(key, "posixlk", "posixlk[%d](%s)", count,
+                               lock->blocked ? "BLOCKED" : "ACTIVE");
+        pl_dump_lock(tmp, sizeof(tmp), &lock->user_flock, &lock->owner,
+                     lock->client, lock->client_uid, &lock->granted_time,
+                     &lock->blkd_time, (lock->blocked) ? _gf_false : _gf_true);
+        gf_proc_dump_write(key, "%s", tmp);
+
+        count++;
+    }
+}
+
+void
+dump_posixlks(pl_inode_t *pl_inode)
+{
+    pthread_mutex_lock(&pl_inode->mutex);
+    {
+        __dump_posixlks(pl_inode);
+    }
+    pthread_mutex_unlock(&pl_inode->mutex);
+}
+
+int32_t
+pl_dump_inode_priv(xlator_t *this, inode_t *inode)
+{
+    int ret = -1;
+    uint64_t tmp_pl_inode = 0;
+    pl_inode_t *pl_inode = NULL;
+    char *pathname = NULL;
+    gf_boolean_t section_added = _gf_false;
+
+    int count = 0;
+
+    if (!inode) {
+        errno = EINVAL;
+        goto out;
+    }
+
+    ret = TRY_LOCK(&inode->lock);
+    if (ret)
+        goto out;
+    {
+        ret = __inode_ctx_get(inode, this, &tmp_pl_inode);
+        if (ret)
+            goto unlock;
+    }
+unlock:
+    UNLOCK(&inode->lock);
+    if (ret)
+        goto out;
+
+    pl_inode = (pl_inode_t *)(long)tmp_pl_inode;
+    if (!pl_inode) {
+        ret = -1;
+        goto out;
+    }
+
+    gf_proc_dump_add_section("xlator.features.locks.%s.inode", this->name);
+    section_added = _gf_true;
+
+    /*We are safe to call __inode_path since we have the
+     * inode->table->lock */
+    __inode_path(inode, NULL, &pathname);
+    if (pathname)
+        gf_proc_dump_write("path", "%s", pathname);
+
+    gf_proc_dump_write("mandatory", "%d", pl_inode->mandatory);
+
+    ret = pthread_mutex_trylock(&pl_inode->mutex);
+    if (ret)
+        goto out;
+    {
+        count = __get_entrylk_count(this, pl_inode);
+        if (count) {
+            gf_proc_dump_write("entrylk-count", "%d", count);
+            __dump_entrylks(pl_inode);
+        }
+
+        count = __get_inodelk_count(this, pl_inode, NULL);
+        if (count) {
+            gf_proc_dump_write("inodelk-count", "%d", count);
+            __dump_inodelks(pl_inode);
+        }
+
+        count = __get_posixlk_count(pl_inode);
+        if (count) {
+            gf_proc_dump_write("posixlk-count", "%d", count);
+            __dump_posixlks(pl_inode);
+        }
+
+        gf_proc_dump_write("links", "%d", pl_inode->links);
+        gf_proc_dump_write("removes_pending", "%u", pl_inode->remove_running);
+        gf_proc_dump_write("removed", "%u", pl_inode->removed);
+    }
+    pthread_mutex_unlock(&pl_inode->mutex);
 
-	return 0;
+out:
+    GF_FREE(pathname);
+
+    if (ret && inode) {
+        if (!section_added)
+            gf_proc_dump_add_section(
+                "xlator.features.locks.%s."
+                "inode",
+                this->name);
+        gf_proc_dump_write("Unable to print lock state",
+                           "(Lock "
+                           "acquisition failure) %s",
+                           uuid_utoa(inode->gfid));
+    }
+    return ret;
 }
 
+/* Initialise memory accounting for this xlator. Returns -1 when this is
+ * NULL, otherwise whatever xlator_mem_acct_init() returned (0 on success). */
+int32_t
+mem_acct_init(xlator_t *this)
+{
+    int ret = -1;
+
+    if (!this)
+        return ret;
+
+    ret = xlator_mem_acct_init(this, gf_locks_mt_end + 1);
+    if (ret != 0) {
+        gf_log(this->name, GF_LOG_ERROR,
+               "Memory accounting init "
+               "failed");
+    }
+
+    return ret;
+}
+
+pl_ctx_t *
+pl_ctx_get(client_t *client, xlator_t *xlator)
+{
+    void *tmp = NULL;
+    pl_ctx_t *ctx = NULL;
+    pl_ctx_t *setted_ctx = NULL;
+
+    client_ctx_get(client, xlator, &tmp);
+
+    ctx = tmp;
+
+    if (ctx != NULL)
+        goto out;
+
+    ctx = GF_CALLOC(1, sizeof(pl_ctx_t), gf_locks_mt_posix_lock_t);
+
+    if (ctx == NULL)
+        goto out;
+
+    pthread_mutex_init(&ctx->lock, NULL);
+    INIT_LIST_HEAD(&ctx->inodelk_lockers);
+    INIT_LIST_HEAD(&ctx->entrylk_lockers);
+    INIT_LIST_HEAD(&ctx->metalk_list);
+
+    setted_ctx = client_ctx_set(client, xlator, ctx);
+    if (ctx != setted_ctx) {
+        pthread_mutex_destroy(&ctx->lock);
+        GF_FREE(ctx);
+        ctx = setted_ctx;
+    }
+out:
+    return ctx;
+}
 
 int
-pl_lk (call_frame_t *frame, xlator_t *this,
-       fd_t *fd, int32_t cmd, struct flock *flock)
+pl_metalk_client_cleanup(xlator_t *this, pl_ctx_t *ctx)
 {
-	transport_t           *transport = NULL;
-	pid_t                  client_pid = 0;
-	posix_locks_private_t *priv = NULL;
-	pl_inode_t            *pl_inode = NULL;
-	int                    op_ret = 0;
-	int                    op_errno = 0;
-	int                    can_block = 0;
-	posix_lock_t          *reqlock = NULL;
-	posix_lock_t          *conf = NULL;
-	int                    ret = 0;
+    pl_meta_lock_t *meta_lock = NULL;
+    pl_meta_lock_t *tmp_metalk = NULL;
+    pl_inode_t *pl_inode = NULL;
+    posix_lock_t *posix_lock = NULL;
+    posix_lock_t *tmp_posixlk = NULL;
+    struct list_head tmp_posixlk_list;
+
+    INIT_LIST_HEAD(&tmp_posixlk_list);
+
+    pthread_mutex_lock(&ctx->lock);
+    {
+        /* If this list is empty then pl_inode->metalk_list should be
+         * empty too. A meta lock should in all cases be added to and
+         * removed from both pl_ctx_t and pl_inode. */
+        if (list_empty(&ctx->metalk_list))
+            goto unlock;
+
+        list_for_each_entry_safe(meta_lock, tmp_metalk, &ctx->metalk_list,
+                                 client_list)
+        {
+            list_del_init(&meta_lock->client_list);
+
+            pl_inode = meta_lock->pl_inode;
+
+            pthread_mutex_lock(&pl_inode->mutex);
+
+            {
+                /* Since the migration status is unknown here
+                 * unwind all queued and blocked locks to check
+                 * migration status and find the correct
+                 * destination */
+                __unwind_queued_locks(pl_inode, &tmp_posixlk_list);
+
+                __unwind_blocked_locks(pl_inode, &tmp_posixlk_list);
+
+                list_del_init(&meta_lock->list);
+
+                pl_metalk_unref(meta_lock);
+            }
+            pthread_mutex_unlock(&pl_inode->mutex);
+
+            /* The corresponding ref is taken in
+             * pl_insert_metalk*/
+            inode_unref(pl_inode->inode);
+        }
+    }
+
+unlock:
+    pthread_mutex_unlock(&ctx->lock);
+
+    list_for_each_entry_safe(posix_lock, tmp_posixlk, &tmp_posixlk_list, list)
+    {
+        list_del_init(&posix_lock->list);
+
+        STACK_UNWIND_STRICT(lk, posix_lock->frame, -1, EREMOTE,
+                            &posix_lock->user_flock, NULL);
+
+        __destroy_lock(posix_lock);
+    }
+    return 0;
+}
 
-	transport  = frame->root->trans;
-	client_pid = frame->root->pid;
-	priv       = this->private;
+static int
+pl_client_disconnect_cbk(xlator_t *this, client_t *client)
+{
+    pl_ctx_t *pl_ctx = pl_ctx_get(client, this);
+    if (pl_ctx) {
+        pl_inodelk_client_cleanup(this, pl_ctx);
+        pl_entrylk_client_cleanup(this, pl_ctx);
+        pl_metalk_client_cleanup(this, pl_ctx);
+    }
+
+    return 0;
+}
 
-	pl_inode = pl_inode_get (this, fd->inode);
-	if (!pl_inode) {
-		gf_log (this->name, GF_LOG_ERROR,
-			"out of memory :(");
-		op_ret = -1;
-		op_errno = ENOMEM;
-		goto unwind;
-	}
+static int
+pl_client_destroy_cbk(xlator_t *this, client_t *client)
+{
+    void *tmp = NULL;
+    pl_ctx_t *pl_ctx = NULL;
 
-	reqlock = new_posix_lock (flock, transport, client_pid);
-	if (!reqlock) {
-		gf_log (this->name, GF_LOG_ERROR,
-			"out of memory :(");
-		op_ret = -1;
-		op_errno = ENOMEM;
-		goto unwind;
-	}
+    pl_client_disconnect_cbk(this, client);
 
-	switch (cmd) {
+    client_ctx_del(client, this, &tmp);
 
-#if F_GETLK != F_GETLK64
-	case F_GETLK64:
-#endif
-	case F_GETLK:
-		conf = pl_getlk (pl_inode, reqlock, GF_LOCK_POSIX);
-		posix_lock_to_flock (conf, flock);
-		__destroy_lock (reqlock);
+    if (tmp == NULL)
+        return 0;
 
-		break;
+    pl_ctx = tmp;
 
-#if F_SETLKW != F_SETLKW64
-	case F_SETLKW64:
-#endif
-	case F_SETLKW:
-		can_block = 1;
-		reqlock->frame = frame;
-		reqlock->this  = this;
-		reqlock->fd    = fd;
+    GF_ASSERT(list_empty(&pl_ctx->inodelk_lockers));
+    GF_ASSERT(list_empty(&pl_ctx->entrylk_lockers));
 
-		/* fall through */
+    pthread_mutex_destroy(&pl_ctx->lock);
+    GF_FREE(pl_ctx);
 
-#if F_SETLK != F_SETLK64
-	case F_SETLK64:
-#endif
-	case F_SETLK:
-		memcpy (&reqlock->user_flock, flock, sizeof (struct flock));
-		ret = pl_setlk (this, pl_inode, reqlock,
-				can_block, GF_LOCK_POSIX);
-
-		if (ret == -1) {
-			if (can_block)
-				goto out;
-
-			gf_log (this->name, GF_LOG_DEBUG, "returning EAGAIN");
-			op_ret = -1;
-			op_errno = EAGAIN;
-			__destroy_lock (reqlock);
-		}
-	}
+    return 0;
+}
+
+int
+reconfigure(xlator_t *this, dict_t *options)
+{
+    posix_locks_private_t *priv = this->private;
+    int ret = -1;
+    char *tmp_str = NULL;
+
+    GF_OPTION_RECONF("trace", priv->trace, options, bool, out);
+
+    GF_OPTION_RECONF("monkey-unlocking", priv->monkey_unlocking, options, bool,
+                     out);
+
+    GF_OPTION_RECONF("revocation-secs", priv->revocation_secs, options, uint32,
+                     out);
+
+    GF_OPTION_RECONF("revocation-clear-all", priv->revocation_clear_all,
+                     options, bool, out);
+
+    GF_OPTION_RECONF("revocation-max-blocked", priv->revocation_max_blocked,
+                     options, uint32, out);
+
+    GF_OPTION_RECONF("notify-contention", priv->notify_contention, options,
+                     bool, out);
+
+    GF_OPTION_RECONF("notify-contention-delay", priv->notify_contention_delay,
+                     options, uint32, out);
+
+    GF_OPTION_RECONF("mandatory-locking", tmp_str, options, str, out);
+
+    GF_OPTION_RECONF("enforce-mandatory-lock", priv->mlock_enforced, options,
+                     bool, out);
+
+    if (!strcmp(tmp_str, "forced"))
+        priv->mandatory_mode = MLK_FORCED;
+    else if (!strcmp(tmp_str, "file"))
+        priv->mandatory_mode = MLK_FILE_BASED;
+    else if (!strcmp(tmp_str, "optimal"))
+        priv->mandatory_mode = MLK_OPTIMAL;
+    else
+        priv->mandatory_mode = MLK_NONE;
+
+    ret = 0;
 
-unwind:
-	STACK_UNWIND (frame, op_ret, op_errno, flock);
 out:
-	return 0;
+    return ret;
 }
 
+/* xlator init: validate graph placement, then allocate and configure priv. */
+int
+init(xlator_t *this)
+{
+    posix_locks_private_t *priv = NULL;
+    xlator_list_t *trav = NULL;
+    char *tmp_str = NULL;
+    int ret = -1; /* only set to 0 once every step below has succeeded */
+
+    if (!this->children || this->children->next) {
+        gf_log(this->name, GF_LOG_CRITICAL,
+               "FATAL: posix-locks should have exactly one child");
+        goto out;
+    }
+
+    if (!this->parents) {
+        gf_log(this->name, GF_LOG_WARNING,
+               "Volume is dangling. Please check the volume file.");
+    }
+
+    trav = this->children; /* descend the child chain to its last xlator */
+    while (trav->xlator->children)
+        trav = trav->xlator->children;
+
+    if (strncmp("storage/", trav->xlator->type, 8)) {
+        gf_log(this->name, GF_LOG_CRITICAL,
+               "'locks' translator is not loaded over a storage "
+               "translator");
+        goto out;
+    }
+
+    priv = GF_CALLOC(1, sizeof(*priv), gf_locks_mt_posix_locks_private_t);
+    if (!priv)
+        goto out; /* allocation failed: priv must not be dereferenced */
+
+    GF_OPTION_INIT("mandatory-locking", tmp_str, str, out);
+    if (!strcmp(tmp_str, "forced"))
+        priv->mandatory_mode = MLK_FORCED;
+    else if (!strcmp(tmp_str, "file"))
+        priv->mandatory_mode = MLK_FILE_BASED;
+    else if (!strcmp(tmp_str, "optimal"))
+        priv->mandatory_mode = MLK_OPTIMAL;
+    else
+        priv->mandatory_mode = MLK_NONE;
+
+    GF_OPTION_INIT("trace", priv->trace, bool, out);
+
+    GF_OPTION_INIT("monkey-unlocking", priv->monkey_unlocking, bool, out);
+
+    GF_OPTION_INIT("revocation-secs", priv->revocation_secs, uint32, out);
+
+    GF_OPTION_INIT("revocation-clear-all", priv->revocation_clear_all, bool,
+                   out);
+
+    GF_OPTION_INIT("revocation-max-blocked", priv->revocation_max_blocked,
+                   uint32, out);
+
+    GF_OPTION_INIT("notify-contention", priv->notify_contention, bool, out);
+
+    GF_OPTION_INIT("notify-contention-delay", priv->notify_contention_delay,
+                   uint32, out);
+
+    GF_OPTION_INIT("enforce-mandatory-lock", priv->mlock_enforced, bool, out);
+
+    this->local_pool = mem_pool_new(pl_local_t, 32);
+    if (!this->local_pool) {
+        ret = -1;
+        gf_log(this->name, GF_LOG_ERROR,
+               "failed to create local_t's memory pool");
+        goto out;
+    }
+
+    this->private = priv;
+    ret = 0;
+
+out:
+    if (ret)
+        GF_FREE(priv); /* priv is still NULL on the early-exit paths */
+    return ret;
+}
+
+void
+fini(xlator_t *this)
+{
+    posix_locks_private_t *priv = this->private;
+    if (!priv)
+        return;
+    this->private = NULL;
+    if (this->local_pool) {
+        mem_pool_destroy(this->local_pool);
+        this->local_pool = NULL;
+    }
+    GF_FREE(priv->brickname);
+    GF_FREE(priv);
+
+    return;
+}
+
+int
+pl_inodelk(call_frame_t *frame, xlator_t *this, const char *volume, loc_t *loc,
+           int32_t cmd, struct gf_flock *flock, dict_t *xdata);
+
+int
+pl_finodelk(call_frame_t *frame, xlator_t *this, const char *volume, fd_t *fd,
+            int32_t cmd, struct gf_flock *flock, dict_t *xdata);
 
-/* TODO: this function just logs, no action required?? */
 int
-pl_forget (xlator_t *this,
-	   inode_t *inode)
+pl_entrylk(call_frame_t *frame, xlator_t *this, const char *volume, loc_t *loc,
+           const char *basename, entrylk_cmd cmd, entrylk_type type,
+           dict_t *xdata);
+
+int
+pl_fentrylk(call_frame_t *frame, xlator_t *this, const char *volume, fd_t *fd,
+            const char *basename, entrylk_cmd cmd, entrylk_type type,
+            dict_t *xdata);
+
+int32_t
+pl_rename_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret,
+              int32_t op_errno, struct iatt *buf, struct iatt *preoldparent,
+              struct iatt *postoldparent, struct iatt *prenewparent,
+              struct iatt *postnewparent, dict_t *xdata)
 {
-	pl_inode_t   *pl_inode = NULL;
+    pl_inode_remove_cbk(this, cookie, op_ret < 0 ? op_errno : 0);
 
-	pl_inode = pl_inode_get (this, inode);
+    PL_STACK_UNWIND(rename, xdata, frame, op_ret, op_errno, buf, preoldparent,
+                    postoldparent, prenewparent, postnewparent, xdata);
 
-	if (!list_empty (&pl_inode->rw_list)) {
-		gf_log (this->name, GF_LOG_CRITICAL,
-			"pending R/W requests found!");
-	}
+    return 0;
+}
 
-	if (!list_empty (&pl_inode->ext_list)) {
-		gf_log (this->name, GF_LOG_CRITICAL,
-			"Pending fcntl locks found!");
-	}
+int32_t
+pl_rename(call_frame_t *frame, xlator_t *this, loc_t *oldloc, loc_t *newloc,
+          dict_t *xdata)
+{
+    int32_t error;
+
+    error = PL_INODE_REMOVE(rename, frame, this, oldloc, newloc, pl_rename,
+                            pl_rename_cbk, oldloc, newloc, xdata);
+    if (error > 0) {
+        STACK_UNWIND_STRICT(rename, frame, -1, error, NULL, NULL, NULL, NULL,
+                            NULL, NULL);
+    }
 
-	if (!list_empty (&pl_inode->int_list)) {
-		gf_log (this->name, GF_LOG_CRITICAL,
-			"Pending internal locks found!");
-	}
+    return 0;
+}
+
+posix_lock_t *
+gf_lkmig_info_to_posix_lock(call_frame_t *frame, lock_migration_info_t *lmi)
+{
+    posix_lock_t *lock = GF_CALLOC(1, sizeof(posix_lock_t),
+                                   gf_locks_mt_posix_lock_t);
+    if (!lock)
+        goto out;
 
-	if (!list_empty (&pl_inode->dir_list)) {
-		gf_log (this->name, GF_LOG_CRITICAL,
-			"Pending entry locks found!");
-	}
+    lock->fl_start = lmi->flock.l_start;
+    lock->fl_type = lmi->flock.l_type;
 
-        FREE (pl_inode);
+    if (lmi->flock.l_len == 0)
+        lock->fl_end = LLONG_MAX;
+    else
+        lock->fl_end = lmi->flock.l_start + lmi->flock.l_len - 1;
 
-	return 0;
+    lock->client = frame->root->client;
+
+    lock->lk_flags = lmi->lk_flags;
+
+    lock->client_uid = gf_strdup(lmi->client_uid);
+    if (lock->client_uid == NULL) {
+        GF_FREE(lock);
+        lock = NULL;
+        goto out;
+    }
+
+    lock->client_pid = lmi->flock.l_pid;
+    lock->owner = lmi->flock.l_owner;
+
+    INIT_LIST_HEAD(&lock->list);
+
+out:
+    return lock;
 }
 
+/* This function takes the active locks read from the source brick (in
+ * rebalance context) and writes them here. Hence, it adds the locks directly
+ * to pl_inode->ext_list. Returns 0 on success and -1 on any failure. */
+int
+pl_write_active_locks(call_frame_t *frame, pl_inode_t *pl_inode,
+                      lock_migration_info_t *locklist)
+{
+    posix_lock_t *newlock = NULL;
+    lock_migration_info_t *temp = NULL;
+    int ret = 0;
+
+    pthread_mutex_lock(&pl_inode->mutex);
+    {
+        /* Just making sure the activelk list is empty. This should not
+         * happen, though. */
+        if (!list_empty(&pl_inode->ext_list)) {
+            pthread_mutex_unlock(&pl_inode->mutex);
+            gf_msg(THIS->name, GF_LOG_ERROR, 0, 0, "invalid locks found");
+
+            ret = -1;
+            goto out; /* mutex already released above */
+        }
+
+        /* This list also should not be empty */
+        if (list_empty(&locklist->list)) {
+            pthread_mutex_unlock(&pl_inode->mutex);
+            gf_msg(THIS->name, GF_LOG_ERROR, 0, 0, "empty lock list");
+
+            ret = -1;
+            goto out; /* mutex already released above */
+        }
+
+        list_for_each_entry(temp, &locklist->list, list)
+        {
+            newlock = gf_lkmig_info_to_posix_lock(frame, temp);
+            if (!newlock) {
+                pthread_mutex_unlock(&pl_inode->mutex);
+                gf_msg(THIS->name, GF_LOG_ERROR, 0, 0,
+                       "mem allocation failed for newlock");
+
+                ret = -1;
+                goto out; /* locks appended so far stay on ext_list */
+            }
+            list_add_tail(&newlock->list, &pl_inode->ext_list);
+        }
+    }
+    /*TODO: What if a few lock additions fail with ENOMEM? Should the already
+     *      added locks be cleared? */
+    pthread_mutex_unlock(&pl_inode->mutex);
+out:
+    return ret;
+}
+
+static int
+pl_setactivelk(call_frame_t *frame, xlator_t *this, loc_t *loc,
+               lock_migration_info_t *locklist, dict_t *xdata)
+{
+    int op_ret = 0;
+    int op_errno = 0;
+    int ret = 0;
+
+    pl_inode_t *pl_inode = pl_inode_get(this, loc->inode, NULL);
+    if (!pl_inode) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, 0, "pl_inode_get failed");
+
+        op_ret = -1;
+        op_errno = ENOMEM;
+        goto out;
+    }
+    ret = pl_write_active_locks(frame, pl_inode, locklist);
+
+    op_ret = ret;
+
+out:
+    STACK_UNWIND_STRICT(setactivelk, frame, op_ret, op_errno, NULL);
+
+    return 0;
+}
+
+int32_t
+pl_unlink_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret,
+              int32_t op_errno, struct iatt *preparent, struct iatt *postparent,
+              dict_t *xdata)
+{
+    pl_inode_remove_cbk(this, cookie, op_ret < 0 ? op_errno : 0);
+
+    PL_STACK_UNWIND(unlink, xdata, frame, op_ret, op_errno, preparent,
+                    postparent, xdata);
+
+    return 0;
+}
+
+int32_t
+pl_unlink(call_frame_t *frame, xlator_t *this, loc_t *loc, int xflag,
+          dict_t *xdata)
+{
+    int32_t error;
+
+    error = PL_INODE_REMOVE(unlink, frame, this, loc, NULL, pl_unlink,
+                            pl_unlink_cbk, loc, xflag, xdata);
+    if (error > 0) {
+        STACK_UNWIND_STRICT(unlink, frame, -1, error, NULL, NULL, NULL);
+    }
+
+    return 0;
+}
+
+int32_t
+pl_mkdir_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret,
+             int32_t op_errno, inode_t *inode, struct iatt *buf,
+             struct iatt *preparent, struct iatt *postparent, dict_t *xdata)
+{
+    PL_STACK_UNWIND_FOR_CLIENT(mkdir, xdata, frame, op_ret, op_errno, inode,
+                               buf, preparent, postparent, xdata);
+    return 0;
+}
 
 int
-init (xlator_t *this)
+pl_mkdir(call_frame_t *frame, xlator_t *this, loc_t *loc, mode_t mode,
+         mode_t umask, dict_t *xdata)
 {
-	posix_locks_private_t *priv = NULL;
-	xlator_list_t         *trav = NULL;
-	data_t                *mandatory = NULL;
+    PL_LOCAL_GET_REQUESTS(frame, this, xdata, ((fd_t *)NULL), loc, NULL);
+    STACK_WIND(frame, pl_mkdir_cbk, FIRST_CHILD(this),
+               FIRST_CHILD(this)->fops->mkdir, loc, mode, umask, xdata);
+    return 0;
+}
 
-	if (!this->children || this->children->next) {
-		gf_log (this->name, GF_LOG_ERROR, 
-			"FATAL: posix-locks should have exactly one child");
-		return -1;
-	}
+int32_t
+pl_stat_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret,
+            int32_t op_errno, struct iatt *buf, dict_t *xdata)
+{
+    PL_STACK_UNWIND_FOR_CLIENT(stat, xdata, frame, op_ret, op_errno, buf,
+                               xdata);
+    return 0;
+}
 
-	if (!this->parents) {
-		gf_log (this->name, GF_LOG_WARNING,
-			"dangling volume. check volfile ");
-	}
+int
+pl_stat(call_frame_t *frame, xlator_t *this, loc_t *loc, dict_t *xdata)
+{
+    PL_LOCAL_GET_REQUESTS(frame, this, xdata, ((fd_t *)NULL), loc, NULL);
+    STACK_WIND(frame, pl_stat_cbk, FIRST_CHILD(this),
+               FIRST_CHILD(this)->fops->stat, loc, xdata);
+    return 0;
+}
 
-	trav = this->children;
-	while (trav->xlator->children)
-		trav = trav->xlator->children;
+int32_t
+pl_mknod_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret,
+             int32_t op_errno, inode_t *inode, struct iatt *buf,
+             struct iatt *preparent, struct iatt *postparent, dict_t *xdata)
+{
+    PL_STACK_UNWIND_FOR_CLIENT(mknod, xdata, frame, op_ret, op_errno, inode,
+                               buf, preparent, postparent, xdata);
+    return 0;
+}
 
-	if (strncmp ("storage/", trav->xlator->type, 8)) {
-		gf_log (this->name, GF_LOG_ERROR,
-			"'posix-locks' not loaded over storage translator");
-		return -1;
-	}
+int
+pl_mknod(call_frame_t *frame, xlator_t *this, loc_t *loc, mode_t mode,
+         dev_t rdev, mode_t umask, dict_t *xdata)
+{
+    PL_LOCAL_GET_REQUESTS(frame, this, xdata, ((fd_t *)NULL), loc, NULL);
+    STACK_WIND(frame, pl_mknod_cbk, FIRST_CHILD(this),
+               FIRST_CHILD(this)->fops->mknod, loc, mode, rdev, umask, xdata);
+    return 0;
+}
 
-	priv = CALLOC (1, sizeof (*priv));
+int32_t
+pl_rmdir_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret,
+             int32_t op_errno, struct iatt *preparent, struct iatt *postparent,
+             dict_t *xdata)
+{
+    pl_inode_remove_cbk(this, cookie, op_ret < 0 ? op_errno : 0);
 
-	mandatory = dict_get (this->options, "mandatory-locks");
-	if (mandatory) {
-		if (gf_string2boolean (mandatory->data,
-				       &priv->mandatory) == -1) {
-			gf_log (this->name, GF_LOG_ERROR,
-				"'mandatory-locks' takes only boolean "
-				"options");
-			return -1;
-		}
-	}
+    PL_STACK_UNWIND_FOR_CLIENT(rmdir, xdata, frame, op_ret, op_errno, preparent,
+                               postparent, xdata);
 
-	this->private = priv;
-	return 0;
+    return 0;
 }
 
+int
+pl_rmdir(call_frame_t *frame, xlator_t *this, loc_t *loc, int xflags,
+         dict_t *xdata)
+{
+    int32_t error;
+
+    error = PL_INODE_REMOVE(rmdir, frame, this, loc, NULL, pl_rmdir,
+                            pl_rmdir_cbk, loc, xflags, xdata);
+    if (error > 0) {
+        STACK_UNWIND_STRICT(rmdir, frame, -1, error, NULL, NULL, NULL);
+    }
+
+    return 0;
+}
+
+int32_t
+pl_symlink_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+               int32_t op_ret, int32_t op_errno, inode_t *inode,
+               struct iatt *buf, struct iatt *preparent,
+               struct iatt *postparent, dict_t *xdata)
+{
+    PL_STACK_UNWIND_FOR_CLIENT(symlink, xdata, frame, op_ret, op_errno, inode,
+                               buf, preparent, postparent, xdata);
+    return 0;
+}
 
 int
-fini (xlator_t *this)
+pl_symlink(call_frame_t *frame, xlator_t *this, const char *linkname,
+           loc_t *loc, mode_t umask, dict_t *xdata)
+{
+    PL_LOCAL_GET_REQUESTS(frame, this, xdata, ((fd_t *)NULL), loc, NULL);
+    STACK_WIND(frame, pl_symlink_cbk, FIRST_CHILD(this),
+               FIRST_CHILD(this)->fops->symlink, linkname, loc, umask, xdata);
+    return 0;
+}
+
+int32_t
+pl_link_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret,
+            int32_t op_errno, inode_t *inode, struct iatt *buf,
+            struct iatt *preparent, struct iatt *postparent, dict_t *xdata)
 {
-	posix_locks_private_t *priv = NULL;
+    pl_inode_t *pl_inode = (pl_inode_t *)cookie;
 
-	priv = this->private;
-	free (priv);
+    if (op_ret >= 0) {
+        pthread_mutex_lock(&pl_inode->mutex);
 
-	return 0;
+        /* TODO: can pl_inode->links == 0 happen here? */
+        if (pl_inode->links >= 0) {
+            pl_inode->links++;
+        }
+
+        pthread_mutex_unlock(&pl_inode->mutex);
+    }
+
+    PL_STACK_UNWIND_FOR_CLIENT(link, xdata, frame, op_ret, op_errno, inode, buf,
+                               preparent, postparent, xdata);
+    return 0;
 }
 
+int
+pl_link(call_frame_t *frame, xlator_t *this, loc_t *oldloc, loc_t *newloc,
+        dict_t *xdata)
+{
+    pl_inode_t *pl_inode;
+
+    pl_inode = pl_inode_get(this, oldloc->inode, NULL);
+    if (pl_inode == NULL) {
+        STACK_UNWIND_STRICT(link, frame, -1, ENOMEM, NULL, NULL, NULL, NULL,
+                            NULL);
+        return 0;
+    }
+
+    PL_LOCAL_GET_REQUESTS(frame, this, xdata, ((fd_t *)NULL), oldloc, newloc);
+    STACK_WIND_COOKIE(frame, pl_link_cbk, pl_inode, FIRST_CHILD(this),
+                      FIRST_CHILD(this)->fops->link, oldloc, newloc, xdata);
+    return 0;
+}
+
+int32_t
+pl_fsync_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret,
+             int32_t op_errno, struct iatt *prebuf, struct iatt *postbuf,
+             dict_t *xdata)
+{
+    PL_STACK_UNWIND_FOR_CLIENT(fsync, xdata, frame, op_ret, op_errno, prebuf,
+                               postbuf, xdata);
+    return 0;
+}
 
 int
-pl_inodelk (call_frame_t *frame, xlator_t *this, 
-	    loc_t *loc, int32_t cmd, struct flock *flock);
+pl_fsync(call_frame_t *frame, xlator_t *this, fd_t *fd, int32_t datasync,
+         dict_t *xdata)
+{ /* Run PL_LOCAL_GET_REQUESTS, then wind fsync to the first child. */
+    PL_LOCAL_GET_REQUESTS(frame, this, xdata, fd, NULL, NULL);
+    STACK_WIND(frame, pl_fsync_cbk, FIRST_CHILD(this),
+               FIRST_CHILD(this)->fops->fsync, fd, datasync, xdata);
+    return 0;
+}
+
+int32_t
+pl_readdir_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+               int32_t op_ret, int32_t op_errno, gf_dirent_t *entries,
+               dict_t *xdata)
+{ /* Pass the child's readdir reply to the unwind macro as received. */
+    PL_STACK_UNWIND_FOR_CLIENT(readdir, xdata, frame, op_ret, op_errno, entries,
+                               xdata);
+    return 0;
+}
 
 int
-pl_finodelk (call_frame_t *frame, xlator_t *this, 
-	     fd_t *fd, int32_t cmd, struct flock *flock);
+pl_readdir(call_frame_t *frame, xlator_t *this, fd_t *fd, size_t size,
+           off_t offset, dict_t *xdata)
+{ /* Run PL_LOCAL_GET_REQUESTS, then wind readdir to the first child. */
+    PL_LOCAL_GET_REQUESTS(frame, this, xdata, fd, NULL, NULL);
+    STACK_WIND(frame, pl_readdir_cbk, FIRST_CHILD(this),
+               FIRST_CHILD(this)->fops->readdir, fd, size, offset, xdata);
+    return 0;
+}
+
+int32_t
+pl_fsyncdir_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+                int32_t op_ret, int32_t op_errno, dict_t *xdata)
+{ /* Pass the child's fsyncdir reply to the unwind macro as received. */
+    PL_STACK_UNWIND_FOR_CLIENT(fsyncdir, xdata, frame, op_ret, op_errno, xdata);
+    return 0;
+}
 
 int
-pl_entrylk (call_frame_t *frame, xlator_t *this, 
-	    loc_t *loc, const char *basename, 
-	    entrylk_cmd cmd, entrylk_type type);
+pl_fsyncdir(call_frame_t *frame, xlator_t *this, fd_t *fd, int32_t datasync,
+            dict_t *xdata)
+{ /* Run PL_LOCAL_GET_REQUESTS, then wind fsyncdir to the first child. */
+    PL_LOCAL_GET_REQUESTS(frame, this, xdata, fd, NULL, NULL);
+    STACK_WIND(frame, pl_fsyncdir_cbk, FIRST_CHILD(this),
+               FIRST_CHILD(this)->fops->fsyncdir, fd, datasync, xdata);
+    return 0;
+}
+
+int32_t
+pl_statfs_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret,
+              int32_t op_errno, struct statvfs *buf, dict_t *xdata)
+{ /* Pass the child's statfs reply to the unwind macro as received. */
+    PL_STACK_UNWIND_FOR_CLIENT(statfs, xdata, frame, op_ret, op_errno, buf,
+                               xdata);
+    return 0;
+}
 
 int
-pl_fentrylk (call_frame_t *frame, xlator_t *this, 
-	     fd_t *fd, const char *basename, 
-	     entrylk_cmd cmd, entrylk_type type);
+pl_statfs(call_frame_t *frame, xlator_t *this, loc_t *loc, dict_t *xdata)
+{ /* Run PL_LOCAL_GET_REQUESTS, then wind statfs to the first child. */
+    PL_LOCAL_GET_REQUESTS(frame, this, xdata, ((fd_t *)NULL), loc, NULL);
+    STACK_WIND(frame, pl_statfs_cbk, FIRST_CHILD(this),
+               FIRST_CHILD(this)->fops->statfs, loc, xdata);
+    return 0;
+}
 
-struct xlator_fops fops = {
-	.create      = pl_create,
-	.truncate    = pl_truncate,
-	.ftruncate   = pl_ftruncate,
-	.open        = pl_open,
-	.readv       = pl_readv,
-	.writev      = pl_writev,
-	.lk          = pl_lk,
-	.inodelk     = pl_inodelk,
-	.finodelk    = pl_finodelk,
-	.entrylk     = pl_entrylk,
-	.fentrylk    = pl_fentrylk,
-	.flush       = pl_flush,
-};
+int32_t
+pl_removexattr_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+                   int32_t op_ret, int32_t op_errno, dict_t *xdata)
+{
+    pl_local_t *local = NULL;
+    pl_inode_t *pl_inode = NULL;
+    /* Only a successful reply with update_mlock_enforced_flag set acts here. */
+    local = frame->local;
+    if (local && local->update_mlock_enforced_flag && op_ret != -1) {
+        pl_inode = pl_inode_get(this, local->inode, NULL);
+        if (!pl_inode) {
+            op_ret = -1;
+            op_errno = ENOMEM;
+            goto unwind;
+        }
+        /* Update the enforcement-related inode flags under the inode mutex. */
+        pthread_mutex_lock(&pl_inode->mutex);
+        {
+            pl_inode->mlock_enforced = _gf_false;
+            pl_inode->check_mlock_info = _gf_false;
+            pl_inode->track_fop_wind_count = _gf_true;
+        }
+        pthread_mutex_unlock(&pl_inode->mutex);
+    }
 
+unwind:
+    PL_STACK_UNWIND_FOR_CLIENT(removexattr, xdata, frame, op_ret, op_errno,
+                               xdata);
+    return 0;
+}
+
+int
+pl_removexattr(call_frame_t *frame, xlator_t *this, loc_t *loc,
+               const char *name, dict_t *xdata)
+{ /* removexattr fop: key check first, then wind to the first child. */
+    int op_ret = -1; /* 'unwind' is an error-only exit; never report success */
+    int op_errno = EINVAL;
+    posix_locks_private_t *priv = this->private;
 
-struct xlator_mops mops = {
+    PL_LOCAL_GET_REQUESTS(frame, this, xdata, ((fd_t *)NULL), loc, NULL);
+    /* Presumably jumps to 'unwind' on refusal (macro body not shown here). */
+    PL_CHECK_LOCK_ENFORCE_KEY(frame, ((dict_t *)NULL), name, this, loc,
+                              ((fd_t *)NULL), priv);
+    /* Not diverted to 'unwind': forward the removal to the first child. */
+    STACK_WIND(frame, pl_removexattr_cbk, FIRST_CHILD(this),
+               FIRST_CHILD(this)->fops->removexattr, loc, name, xdata);
+    return 0;
+
+unwind:
+    PL_STACK_UNWIND_FOR_CLIENT(removexattr, xdata, frame, op_ret, op_errno,
+                               NULL);
+
+    return 0;
+}
+
+int32_t
+pl_fremovexattr_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+                    int32_t op_ret, int32_t op_errno, dict_t *xdata)
+{
+    pl_local_t *local = NULL;
+    pl_inode_t *pl_inode = NULL;
+    /* Only a successful reply with update_mlock_enforced_flag set acts here. */
+    local = frame->local;
+    if (local && local->update_mlock_enforced_flag && op_ret != -1) {
+        pl_inode = pl_inode_get(this, local->inode, NULL);
+        if (!pl_inode) {
+            op_ret = -1;
+            op_errno = ENOMEM;
+            goto unwind;
+        }
+        /* NOTE(review): pl_removexattr_cbk also sets track_fop_wind_count. */
+        pthread_mutex_lock(&pl_inode->mutex);
+        {
+            pl_inode->mlock_enforced = _gf_false;
+            pl_inode->check_mlock_info = _gf_false;
+        }
+        pthread_mutex_unlock(&pl_inode->mutex);
+    }
+
+unwind:
+    PL_STACK_UNWIND_FOR_CLIENT(fremovexattr, xdata, frame, op_ret, op_errno,
+                               xdata);
+    return 0;
+}
+
+int
+pl_fremovexattr(call_frame_t *frame, xlator_t *this, fd_t *fd, const char *name,
+                dict_t *xdata)
+{
+    int op_ret = -1;
+    int op_errno = EINVAL;
+    posix_locks_private_t *priv = this->private;
+    /* op_ret/op_errno above are what the 'unwind' exit below reports. */
+    PL_LOCAL_GET_REQUESTS(frame, this, xdata, fd, NULL, NULL);
+    /* Presumably jumps to 'unwind' on refusal (macro body not shown here). */
+    PL_CHECK_LOCK_ENFORCE_KEY(frame, ((dict_t *)NULL), name, this,
+                              ((loc_t *)NULL), fd, priv);
+    /* Not diverted to 'unwind': forward the removal to the first child. */
+    STACK_WIND(frame, pl_fremovexattr_cbk, FIRST_CHILD(this),
+               FIRST_CHILD(this)->fops->fremovexattr, fd, name, xdata);
+    return 0;
+
+unwind:
+    PL_STACK_UNWIND_FOR_CLIENT(fremovexattr, xdata, frame, op_ret, op_errno,
+                               NULL);
+    return 0;
+}
+
+int32_t
+pl_rchecksum_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+                 int32_t op_ret, int32_t op_errno, uint32_t weak_cksum,
+                 uint8_t *strong_cksum, dict_t *xdata)
+{ /* Pass the child's rchecksum reply to the unwind macro as received. */
+    PL_STACK_UNWIND_FOR_CLIENT(rchecksum, xdata, frame, op_ret, op_errno,
+                               weak_cksum, strong_cksum, xdata);
+    return 0;
+}
+
+int
+pl_rchecksum(call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset,
+             int32_t len, dict_t *xdata)
+{ /* Run PL_LOCAL_GET_REQUESTS, then wind rchecksum to the first child. */
+    PL_LOCAL_GET_REQUESTS(frame, this, xdata, fd, NULL, NULL);
+    STACK_WIND(frame, pl_rchecksum_cbk, FIRST_CHILD(this),
+               FIRST_CHILD(this)->fops->rchecksum, fd, offset, len, xdata);
+    return 0;
+}
+
+int32_t
+pl_xattrop_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+               int32_t op_ret, int32_t op_errno, dict_t *dict, dict_t *xdata)
+{ /* Pass the child's xattrop reply to the unwind macro as received. */
+    PL_STACK_UNWIND_FOR_CLIENT(xattrop, xdata, frame, op_ret, op_errno, dict,
+                               xdata);
+    return 0;
+}
+
+int
+pl_xattrop(call_frame_t *frame, xlator_t *this, loc_t *loc,
+           gf_xattrop_flags_t optype, dict_t *xattr, dict_t *xdata)
+{ /* Run PL_LOCAL_GET_REQUESTS, then wind xattrop to the first child. */
+    PL_LOCAL_GET_REQUESTS(frame, this, xdata, ((fd_t *)NULL), loc, NULL);
+    STACK_WIND(frame, pl_xattrop_cbk, FIRST_CHILD(this),
+               FIRST_CHILD(this)->fops->xattrop, loc, optype, xattr, xdata);
+    return 0;
+}
+
+int32_t
+pl_fxattrop_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+                int32_t op_ret, int32_t op_errno, dict_t *dict, dict_t *xdata)
+{ /* Pass the child's fxattrop reply to the unwind macro as received. */
+    PL_STACK_UNWIND_FOR_CLIENT(fxattrop, xdata, frame, op_ret, op_errno, dict,
+                               xdata);
+    return 0;
+}
+
+int
+pl_fxattrop(call_frame_t *frame, xlator_t *this, fd_t *fd,
+            gf_xattrop_flags_t optype, dict_t *xattr, dict_t *xdata)
+{ /* Run PL_LOCAL_GET_REQUESTS, then wind fxattrop to the first child. */
+    PL_LOCAL_GET_REQUESTS(frame, this, xdata, fd, NULL, NULL);
+    STACK_WIND(frame, pl_fxattrop_cbk, FIRST_CHILD(this),
+               FIRST_CHILD(this)->fops->fxattrop, fd, optype, xattr, xdata);
+    return 0;
+}
+
+int32_t
+pl_setattr_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+               int32_t op_ret, int32_t op_errno, struct iatt *statpre,
+               struct iatt *statpost, dict_t *xdata)
+{ /* Pass the child's setattr reply to the unwind macro as received. */
+    PL_STACK_UNWIND_FOR_CLIENT(setattr, xdata, frame, op_ret, op_errno, statpre,
+                               statpost, xdata);
+    return 0;
+}
+
+int
+pl_setattr(call_frame_t *frame, xlator_t *this, loc_t *loc, struct iatt *stbuf,
+           int32_t valid, dict_t *xdata)
+{ /* Run PL_LOCAL_GET_REQUESTS, then wind setattr to the first child. */
+    PL_LOCAL_GET_REQUESTS(frame, this, xdata, ((fd_t *)NULL), loc, NULL);
+    STACK_WIND(frame, pl_setattr_cbk, FIRST_CHILD(this),
+               FIRST_CHILD(this)->fops->setattr, loc, stbuf, valid, xdata);
+    return 0;
+}
+
+int32_t
+pl_fsetattr_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+                int32_t op_ret, int32_t op_errno, struct iatt *statpre,
+                struct iatt *statpost, dict_t *xdata)
+{ /* Pass the child's fsetattr reply to the unwind macro as received. */
+    PL_STACK_UNWIND_FOR_CLIENT(fsetattr, xdata, frame, op_ret, op_errno,
+                               statpre, statpost, xdata);
+    return 0;
+}
+
+int
+pl_fsetattr(call_frame_t *frame, xlator_t *this, fd_t *fd, struct iatt *stbuf,
+            int32_t valid, dict_t *xdata)
+{ /* Run PL_LOCAL_GET_REQUESTS, then wind fsetattr to the first child. */
+    PL_LOCAL_GET_REQUESTS(frame, this, xdata, fd, NULL, NULL);
+    STACK_WIND(frame, pl_fsetattr_cbk, FIRST_CHILD(this),
+               FIRST_CHILD(this)->fops->fsetattr, fd, stbuf, valid, xdata);
+    return 0;
+}
+
+int32_t
+pl_fallocate_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+                 int32_t op_ret, int32_t op_errno, struct iatt *pre,
+                 struct iatt *post, dict_t *xdata)
+{ /* Pass the child's fallocate reply to the unwind macro as received. */
+    PL_STACK_UNWIND_FOR_CLIENT(fallocate, xdata, frame, op_ret, op_errno, pre,
+                               post, xdata);
+    return 0;
+}
+
+int
+pl_fallocate(call_frame_t *frame, xlator_t *this, fd_t *fd, int32_t keep_size,
+             off_t offset, size_t len, dict_t *xdata)
+{ /* Run PL_LOCAL_GET_REQUESTS, then wind fallocate to the first child. */
+    PL_LOCAL_GET_REQUESTS(frame, this, xdata, fd, NULL, NULL);
+    STACK_WIND(frame, pl_fallocate_cbk, FIRST_CHILD(this),
+               FIRST_CHILD(this)->fops->fallocate, fd, keep_size, offset, len,
+               xdata);
+    return 0;
+}
+
+int32_t
+pl_readlink_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+                int32_t op_ret, int32_t op_errno, const char *path,
+                struct iatt *buf, dict_t *xdata)
+{ /* Pass the child's readlink reply to the unwind macro as received. */
+    PL_STACK_UNWIND_FOR_CLIENT(readlink, xdata, frame, op_ret, op_errno, path,
+                               buf, xdata);
+    return 0;
+}
+
+int
+pl_readlink(call_frame_t *frame, xlator_t *this, loc_t *loc, size_t size,
+            dict_t *xdata)
+{ /* Run PL_LOCAL_GET_REQUESTS, then wind readlink to the first child. */
+    PL_LOCAL_GET_REQUESTS(frame, this, xdata, ((fd_t *)NULL), loc, NULL);
+    STACK_WIND(frame, pl_readlink_cbk, FIRST_CHILD(this),
+               FIRST_CHILD(this)->fops->readlink, loc, size, xdata);
+    return 0;
+}
+
+int32_t
+pl_access_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret,
+              int32_t op_errno, dict_t *xdata)
+{ /* Pass the child's access reply to the unwind macro as received. */
+    PL_STACK_UNWIND_FOR_CLIENT(access, xdata, frame, op_ret, op_errno, xdata);
+    return 0;
+}
+
+int
+pl_access(call_frame_t *frame, xlator_t *this, loc_t *loc, int32_t mask,
+          dict_t *xdata)
+{ /* Run PL_LOCAL_GET_REQUESTS, then wind access to the first child. */
+    PL_LOCAL_GET_REQUESTS(frame, this, xdata, ((fd_t *)NULL), loc, NULL);
+    STACK_WIND(frame, pl_access_cbk, FIRST_CHILD(this),
+               FIRST_CHILD(this)->fops->access, loc, mask, xdata);
+    return 0;
+}
+
+int32_t
+pl_seek_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret,
+            int32_t op_errno, off_t offset, dict_t *xdata)
+{ /* Pass the child's seek reply to the unwind macro as received. */
+    PL_STACK_UNWIND_FOR_CLIENT(seek, xdata, frame, op_ret, op_errno, offset,
+                               xdata);
+    return 0;
+}
+
+int32_t
+pl_seek(call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset,
+        gf_seek_what_t what, dict_t *xdata)
+{ /* Run PL_LOCAL_GET_REQUESTS, then wind seek to the first child. */
+    PL_LOCAL_GET_REQUESTS(frame, this, xdata, fd, NULL, NULL);
+    STACK_WIND(frame, pl_seek_cbk, FIRST_CHILD(this),
+               FIRST_CHILD(this)->fops->seek, fd, offset, what, xdata);
+    return 0;
+}
+
+struct xlator_fops fops = { /* fop name -> pl_* handler of this xlator */
+    .lookup = pl_lookup,
+    .create = pl_create,
+    .fstat = pl_fstat,
+    .truncate = pl_truncate,
+    .ftruncate = pl_ftruncate,
+    .discard = pl_discard,
+    .zerofill = pl_zerofill,
+    .open = pl_open,
+    .readv = pl_readv,
+    .writev = pl_writev,
+    .lk = pl_lk,
+    .inodelk = pl_inodelk,
+    .finodelk = pl_finodelk,
+    .entrylk = pl_entrylk,
+    .fentrylk = pl_fentrylk,
+    .flush = pl_flush,
+    .opendir = pl_opendir,
+    .readdirp = pl_readdirp,
+    .setxattr = pl_setxattr,
+    .fsetxattr = pl_fsetxattr,
+    .getxattr = pl_getxattr,
+    .fgetxattr = pl_fgetxattr,
+    .removexattr = pl_removexattr,
+    .fremovexattr = pl_fremovexattr,
+    .rename = pl_rename,
+    .getactivelk = pl_getactivelk,
+    .setactivelk = pl_setactivelk,
+    .unlink = pl_unlink,
+    .access = pl_access,
+    .readlink = pl_readlink,
+    .fallocate = pl_fallocate,
+    .fsetattr = pl_fsetattr,
+    .setattr = pl_setattr,
+    .fxattrop = pl_fxattrop,
+    .xattrop = pl_xattrop,
+    .rchecksum = pl_rchecksum,
+    .statfs = pl_statfs,
+    .fsyncdir = pl_fsyncdir,
+    .readdir = pl_readdir,
+    .symlink = pl_symlink,
+    .link = pl_link,
+    .rmdir = pl_rmdir,
+    .mknod = pl_mknod,
+    .stat = pl_stat,
+    .seek = pl_seek,
 };
 
+struct xlator_dumpops dumpops = { /* only the inodectx dump hook is set */
+    .inodectx = pl_dump_inode_priv,
+};
 
 struct xlator_cbks cbks = {
-	.forget      = pl_forget,
+    .forget = pl_forget,
+    .release = pl_release,
+    .releasedir = pl_releasedir,
+    .client_destroy = pl_client_destroy_cbk,
+    .client_disconnect = pl_client_disconnect_cbk,
 };
 
-
 struct volume_options options[] = {
-	{ .key  = { "mandatory-locks", "mandatory" }, 
-	  .type = GF_OPTION_TYPE_BOOL 
-	},
-	{ .key = {NULL} },
+    {.key = {"mandatory-locking"},
+     .type = GF_OPTION_TYPE_STR,
+     .default_value = "off",
+     .op_version = {GD_OP_VERSION_3_8_0},
+     .flags = OPT_FLAG_SETTABLE | OPT_FLAG_DOC,
+     .tags = {"locks"},
+     .description = "Specifies the mandatory-locking mode. Valid options "
+                    "are 'file' to use linux style mandatory locks, "
+                    "'forced' to use volume strictly under mandatory lock "
+                    "semantics only and 'optimal' to treat advisory and "
+                    "mandatory locks separately on their own."},
+    {.key = {"trace"},
+     .type = GF_OPTION_TYPE_BOOL,
+     .default_value = "off",
+     .op_version = {GD_OP_VERSION_3_7_0},
+     .flags = OPT_FLAG_SETTABLE | OPT_FLAG_DOC,
+     .tags = {"locks"},
+     .description = "Trace the different lock requests "
+                    "to logs."},
+    {.key = {"monkey-unlocking"},
+     .type = GF_OPTION_TYPE_BOOL,
+     .default_value = "false",
+     .op_version = {GD_OP_VERSION_3_9_0},
+     .flags = OPT_FLAG_SETTABLE,
+     .tags = {"locks"},
+     .description = "Ignore a random number of unlock requests.  Useful "
+                    "for testing/creating robust lock recovery mechanisms."},
+    {
+        .key = {"revocation-secs"},
+        .type = GF_OPTION_TYPE_INT,
+        .min = 0,
+        .max = INT_MAX,
+        .default_value = "0",
+        .op_version = {GD_OP_VERSION_3_9_0},
+        .flags = OPT_FLAG_SETTABLE | OPT_FLAG_DOC,
+        .tags = {"locks"},
+        .description = "Maximum time a lock can be taken out, before "
+                       "being revoked.", /* adjacent literals are joined */
+    },
+    {
+        .key = {"revocation-clear-all"},
+        .type = GF_OPTION_TYPE_BOOL,
+        .default_value = "false",
+        .op_version = {GD_OP_VERSION_3_9_0},
+        .flags = OPT_FLAG_SETTABLE | OPT_FLAG_DOC,
+        .tags = {"locks"},
+        .description = "If set to true, will revoke BOTH granted and blocked "
+                       "(pending) lock requests if a revocation threshold is "
+                       "hit.",
+    },
+    {.key = {"revocation-max-blocked"},
+     .type = GF_OPTION_TYPE_INT,
+     .min = 0,
+     .max = INT_MAX,
+     .default_value = "0",
+     .op_version = {GD_OP_VERSION_3_9_0},
+     .flags = OPT_FLAG_SETTABLE | OPT_FLAG_DOC,
+     .tags = {"locks"},
+     .description = "A number of blocked lock requests after which a lock "
+                    "will be revoked to allow the others to proceed.  Can "
+                    "be used in conjunction w/ revocation-clear-all."},
+    {.key = {"notify-contention"},
+     .type = GF_OPTION_TYPE_BOOL,
+     .default_value = "yes",
+     .flags = OPT_FLAG_SETTABLE | OPT_FLAG_DOC,
+     .op_version = {GD_OP_VERSION_4_0_0},
+     .tags = {"locks", "contention"},
+     .description = "When this option is enabled and a lock request "
+                    "conflicts with a currently granted lock, an upcall "
+                    "notification will be sent to the current owner of "
+                    "the lock to request it to be released as soon as "
+                    "possible."},
+    {.key = {"notify-contention-delay"},
+     .type = GF_OPTION_TYPE_INT,
+     .min = 0, /* An upcall notification is sent every time a conflict is
+                * detected. */
+     .max = 60,
+     .default_value = "5",
+     .flags = OPT_FLAG_SETTABLE | OPT_FLAG_DOC,
+     .op_version = {GD_OP_VERSION_4_0_0},
+     .tags = {"locks", "contention", "timeout"},
+     .description = "This value determines the minimum amount of time "
+                    "(in seconds) between upcall contention notifications "
+                    "on the same inode. If multiple lock requests are "
+                    "received during this period, only one upcall will "
+                    "be sent."},
+    {.key = {"enforce-mandatory-lock"},
+     .type = GF_OPTION_TYPE_BOOL,
+     .default_value = "off",
+     .flags = OPT_FLAG_SETTABLE,
+     .op_version = {GD_OP_VERSION_6_0},
+     .description = "option to enable lock enforcement"},
+    {.key = {NULL}}, /* NULL-key entry closes the table */
+};
+
+xlator_api_t xlator_api = { /* xlator descriptor: entry hooks plus tables */
+    .init = init,
+    .fini = fini,
+    .reconfigure = reconfigure,
+    .mem_acct_init = mem_acct_init,
+    .op_version = {1}, /* Present from the initial version */
+    .dumpops = &dumpops,
+    .fops = &fops,
+    .cbks = &cbks,
+    .options = options,
+    .identifier = "locks",
+    .category = GF_MAINTAINED,
 };
diff --git a/xlators/features/locks/src/reservelk.c b/xlators/features/locks/src/reservelk.c
new file mode 100644
index 00000000000..604691fd887
--- /dev/null
+++ b/xlators/features/locks/src/reservelk.c
@@ -0,0 +1,382 @@
+/*
+   Copyright (c) 2006-2012 Red Hat, Inc. <http://www.redhat.com>
+   This file is part of GlusterFS.
+
+   This file is licensed to you under your choice of the GNU Lesser
+   General Public License, version 3 or any later version (LGPLv3 or
+   later), or the GNU General Public License, version 2 (GPLv2), in all
+   cases as published by the Free Software Foundation.
+*/
+#include <glusterfs/glusterfs.h>
+#include <glusterfs/compat.h>
+#include <glusterfs/xlator.h>
+#include <glusterfs/logging.h>
+#include <glusterfs/common-utils.h>
+#include <glusterfs/list.h>
+
+#include "locks.h"
+#include "common.h"
+
+/* Two reservelks are "equal" when both fl_start and fl_end coincide.
+ * Returns 1 on an exact boundary match and 0 otherwise. */
+int
+reservelks_equal(posix_lock_t *l1, posix_lock_t *l2)
+{
+    int same_start = (l1->fl_start == l2->fl_start);
+    int same_end = (l1->fl_end == l2->fl_end);
+    return same_start && same_end;
+}
+
+/* Look for an already-held reservelk with the same boundaries as 'lock'.
+ * Returns that lock (the request is then not grantable), or NULL if none. */
+static posix_lock_t *
+__reservelk_grantable(pl_inode_t *pl_inode, posix_lock_t *lock)
+{
+    xlator_t *this = THIS;
+    posix_lock_t *held = NULL;
+
+    if (list_empty(&pl_inode->reservelk_list)) {
+        gf_log(this->name, GF_LOG_TRACE, "No reservelks in list");
+        return NULL;
+    }
+
+    list_for_each_entry(held, &pl_inode->reservelk_list, list)
+    {
+        if (reservelks_equal(lock, held))
+            return held;
+    }
+
+    /* Walked the whole list without a boundary match. */
+    return NULL;
+}
+
+static int
+__same_owner_reservelk(posix_lock_t *l1, posix_lock_t *l2)
+{ /* Result of is_same_lkowner() applied to the owners of l1 and l2. */
+    return (is_same_lkowner(&l1->owner, &l2->owner));
+}
+
+static posix_lock_t *
+__matching_reservelk(pl_inode_t *pl_inode, posix_lock_t *lock)
+{
+    posix_lock_t *l = NULL;
+
+    if (list_empty(&pl_inode->reservelk_list)) {
+        gf_log("posix-locks", GF_LOG_TRACE, "reservelk list empty");
+        return NULL;
+    }
+    /* Find the held reservelk with the same boundaries as 'lock'. */
+    list_for_each_entry(l, &pl_inode->reservelk_list, list)
+    {
+        if (reservelks_equal(l, lock)) {
+            gf_log("posix-locks", GF_LOG_TRACE, "equal reservelk found");
+            return l;
+        }
+    }
+    /* No match: after a full walk 'l' is not a real entry, so return NULL. */
+    return NULL;
+}
+
+/* Returns 1 when a reservelk held by a different owner covers exactly the
+ * range of 'lock', else 0. A same-owner match is unlinked and destroyed. */
+static int
+__reservelk_conflict(xlator_t *this, pl_inode_t *pl_inode, posix_lock_t *lock)
+{
+    posix_lock_t *held = __matching_reservelk(pl_inode, lock);
+
+    if (!held)
+        return 0;
+
+    gf_log(this->name, GF_LOG_TRACE, "Matching reservelk found");
+    if (!__same_owner_reservelk(lock, held)) {
+        gf_log(this->name, GF_LOG_TRACE, "Conflicting reservelk found");
+        return 1;
+    }
+    list_del_init(&held->list);
+    gf_log(this->name, GF_LOG_TRACE,
+           "Removing the matching reservelk for setlk to progress");
+    __destroy_lock(held);
+    return 0;
+}
+
+/* Check 'lock' against the held reservelks under pl_inode->mutex. On conflict
+ * it is queued on blocked_calls (blocked = can_block) and -1 is returned. */
+int
+pl_verify_reservelk(xlator_t *this, pl_inode_t *pl_inode, posix_lock_t *lock,
+                    const int can_block)
+{
+    int conflict = 0;
+    pthread_mutex_lock(&pl_inode->mutex);
+    {
+        conflict = __reservelk_conflict(this, pl_inode, lock);
+        if (conflict) {
+            lock->blocked = can_block;
+            list_add_tail(&lock->list, &pl_inode->blocked_calls);
+        }
+    }
+    pthread_mutex_unlock(&pl_inode->mutex);
+    if (conflict) {
+        gf_log(this->name, GF_LOG_TRACE,
+               "Found conflicting reservelk. Blocking until reservelk is "
+               "unlocked.");
+        return -1;
+    }
+    gf_log(this->name, GF_LOG_TRACE,
+           "no conflicting reservelk found. Call continuing");
+    return 0;
+}
+
+/* Try to take 'lock' as a reservelk; both callers hold pl_inode->mutex.
+ * Whether the range is free is decided by __reservelk_grantable().
+ * Returns 0 once the lock has been put on reservelk_list. When a reservelk
+ * with the same boundaries already exists the result is -EAGAIN, and with
+ * can_block set the lock is additionally queued on blocked_reservelks.
+ */
+static int
+__lock_reservelk(xlator_t *this, pl_inode_t *pl_inode, posix_lock_t *lock,
+                 const int can_block)
+{
+    posix_lock_t *holder = __reservelk_grantable(pl_inode, lock);
+
+    /* Nobody holds this exact range: take the lock right away. */
+    if (holder == NULL) {
+        list_add(&lock->list, &pl_inode->reservelk_list);
+        return 0;
+    }
+
+    /* Range is taken and the caller will not wait: report and leave. */
+    if (can_block == 0)
+        return -EAGAIN;
+
+    /* Park the request on the blocked queue and trace it. */
+    list_add_tail(&lock->list, &pl_inode->blocked_reservelks);
+
+    gf_log(this->name, GF_LOG_TRACE,
+           "%s (pid=%d) lk-owner:%s %" PRId64 " - %" PRId64 " => Blocked",
+           lock->fl_type == F_UNLCK ? "Unlock" : "Lock", lock->client_pid,
+           lkowner_utoa(&lock->owner), lock->user_flock.l_start,
+           lock->user_flock.l_len);
+
+    return -EAGAIN;
+}
+
+static posix_lock_t *
+find_matching_reservelk(posix_lock_t *lock, pl_inode_t *pl_inode)
+{ /* First entry of reservelk_list with the boundaries of 'lock', or NULL. */
+    posix_lock_t *l = NULL;
+    list_for_each_entry(l, &pl_inode->reservelk_list, list)
+    {
+        if (reservelks_equal(l, lock))
+            return l;
+    }
+    return NULL;
+}
+
+/* Handle F_UNLCK: find the reservelk whose boundaries exactly match 'lock'
+ * and pass it to __delete_lock(). Returns that lock, or NULL when no such
+ * lock is held. */
+static posix_lock_t *
+__reserve_unlock_lock(xlator_t *this, posix_lock_t *lock, pl_inode_t *pl_inode)
+{
+    posix_lock_t *held = find_matching_reservelk(lock, pl_inode);
+
+    if (held == NULL) {
+        gf_log(this->name, GF_LOG_DEBUG, " Matching lock not found for unlock");
+        return NULL;
+    }
+
+    __delete_lock(held);
+    gf_log(this->name, GF_LOG_DEBUG, " Matching lock found for unlock");
+    return held;
+}
+
+static void
+__grant_blocked_reserve_locks(xlator_t *this, pl_inode_t *pl_inode,
+                              struct list_head *granted)
+{
+    int bl_ret = 0;
+    posix_lock_t *bl = NULL;
+    posix_lock_t *tmp = NULL;
+    /* Local list that takes over the currently blocked requests. */
+    struct list_head blocked_list;
+    /* Detach the whole blocked queue so that each entry can be retried. */
+    INIT_LIST_HEAD(&blocked_list);
+    list_splice_init(&pl_inode->blocked_reservelks, &blocked_list);
+    /* _safe walk: every entry is unlinked inside the loop body. */
+    list_for_each_entry_safe(bl, tmp, &blocked_list, list)
+    {
+        list_del_init(&bl->list);
+        /* Retry as a blocking request; on failure bl is queued again. */
+        bl_ret = __lock_reservelk(this, pl_inode, bl, 1);
+        /* NOTE(review): on 0 bl is already linked on reservelk_list. */
+        if (bl_ret == 0) {
+            list_add(&bl->list, granted); /* re-links bl->list - confirm */
+        }
+    }
+    return;
+}
+
+/* Retry the blocked reservelks and unwind each newly granted one with 0. */
+void
+grant_blocked_reserve_locks(xlator_t *this, pl_inode_t *pl_inode)
+{
+    struct list_head granted;
+    posix_lock_t *lock = NULL;
+    posix_lock_t *tmp = NULL;
+
+    INIT_LIST_HEAD(&granted);
+    /* Early exit; note this emptiness check runs without the mutex. */
+    if (list_empty(&pl_inode->blocked_reservelks)) {
+        gf_log(this->name, GF_LOG_TRACE, "No blocked locks to be granted");
+        return;
+    }
+    /* Collect the requests that can be granted now, under the mutex. */
+    pthread_mutex_lock(&pl_inode->mutex);
+    {
+        __grant_blocked_reserve_locks(this, pl_inode, &granted);
+    }
+    pthread_mutex_unlock(&pl_inode->mutex);
+    /* Reply to the granted requests after the mutex is released. */
+    list_for_each_entry_safe(lock, tmp, &granted, list)
+    {
+        gf_log(this->name, GF_LOG_TRACE,
+               "%s (pid=%d) (lk-owner=%s) %" PRId64 " - %" PRId64 " => Granted",
+               lock->fl_type == F_UNLCK ? "Unlock" : "Lock", lock->client_pid,
+               lkowner_utoa(&lock->owner), lock->user_flock.l_start,
+               lock->user_flock.l_len);
+        /* Success reply (op_ret 0, op_errno 0) on the lock's own frame. */
+        STACK_UNWIND_STRICT(lk, lock->frame, 0, 0, &lock->user_flock, NULL);
+    }
+}
+
+/* Re-check every queued lock call against the reservelks; the caller holds
+ * pl_inode->mutex. Calls that no longer conflict are moved to 'granted'. */
+static void
+__grant_blocked_lock_calls(xlator_t *this, pl_inode_t *pl_inode,
+                           struct list_head *granted)
+{
+    posix_lock_t *bl = NULL;
+    posix_lock_t *tmp = NULL;
+    struct list_head blocked_list;
+
+    INIT_LIST_HEAD(&blocked_list);
+    /* The waiting calls live on blocked_calls, not on blocked_reservelks. */
+    list_splice_init(&pl_inode->blocked_calls, &blocked_list);
+
+    list_for_each_entry_safe(bl, tmp, &blocked_list, list)
+    {
+        list_del_init(&bl->list);
+        /* Use the unlocked helper: pl_verify_reservelk() would take
+         * pl_inode->mutex again while the caller already holds it. */
+        if (__reservelk_conflict(this, pl_inode, bl))
+            list_add_tail(&bl->list, &pl_inode->blocked_calls);
+        else
+            list_add_tail(&bl->list, granted);
+    }
+}
+
+/* Retry the lock calls parked on blocked_calls behind a reservelk. Calls
+ * that no longer conflict are replayed through pl_setlk(); a replayed
+ * non-blocking call that still cannot be granted is unwound with EAGAIN. */
+void
+grant_blocked_lock_calls(xlator_t *this, pl_inode_t *pl_inode)
+{
+    struct list_head granted;
+    posix_lock_t *lock = NULL;
+    posix_lock_t *tmp = NULL;
+    fd_t *fd = NULL;
+    int can_block = 0;
+    int32_t cmd = 0;
+    int ret = 0;
+
+    /* 'granted' must be a valid empty list before it is filled or walked. */
+    INIT_LIST_HEAD(&granted);
+
+    if (list_empty(&pl_inode->blocked_calls)) {
+        gf_log(this->name, GF_LOG_TRACE, "No blocked lock calls to be granted");
+        return;
+    }
+
+    pthread_mutex_lock(&pl_inode->mutex);
+    {
+        __grant_blocked_lock_calls(this, pl_inode, &granted);
+    }
+    pthread_mutex_unlock(&pl_inode->mutex);
+
+    list_for_each_entry_safe(lock, tmp, &granted, list)
+    {
+        fd = fd_from_fdnum(lock);
+        /* Decide the mode per lock so it cannot leak from an earlier one. */
+        can_block = lock->blocked ? 1 : 0;
+        cmd = can_block ? F_SETLKW : F_SETLK;
+        list_del_init(&lock->list); /* leave the local list before reuse */
+        lock->blocked = 0;
+        ret = pl_setlk(this, pl_inode, lock, can_block);
+        /* NOTE(review): a successful replay (ret == 0) is not unwound here. */
+        if (ret != -1 || can_block)
+            continue;
+
+        gf_log(this->name, GF_LOG_DEBUG, "returning EAGAIN");
+        pl_trace_out(this, lock->frame, fd, NULL, cmd, &lock->user_flock, -1,
+                     EAGAIN, NULL);
+        pl_update_refkeeper(this, fd->inode);
+        STACK_UNWIND_STRICT(lk, lock->frame, -1, EAGAIN, &lock->user_flock,
+                            NULL);
+        __destroy_lock(lock);
+    }
+}
+
+/* Release the reservelk matching 'lock', then retry everything that was
+ * blocked on the inode. Returns 0, or -EINVAL when no matching lock exists. */
+int
+pl_reserve_unlock(xlator_t *this, pl_inode_t *pl_inode, posix_lock_t *lock)
+{
+    posix_lock_t *released = NULL;
+    int ret = -EINVAL;
+
+    pthread_mutex_lock(&pl_inode->mutex);
+    {
+        released = __reserve_unlock_lock(this, lock, pl_inode);
+        if (released != NULL) {
+            gf_log(this->name, GF_LOG_TRACE, "Reservelk Unlock successful");
+            __destroy_lock(released);
+            ret = 0;
+        }
+    }
+    pthread_mutex_unlock(&pl_inode->mutex);
+
+    if (ret != 0)
+        gf_log(this->name, GF_LOG_DEBUG, "Bad Unlock issued on Inode lock");
+
+    /* The retry passes run on the failure path as well. */
+    grant_blocked_reserve_locks(this, pl_inode);
+    grant_blocked_lock_calls(this, pl_inode);
+    return ret;
+}
+
+/* Take 'lock' as a reservelk on pl_inode under the inode mutex and trace the
+ * outcome. Returns what __lock_reservelk() returned. */
+int
+pl_reserve_setlk(xlator_t *this, pl_inode_t *pl_inode, posix_lock_t *lock,
+                 int can_block)
+{
+    int ret = -EINVAL;
+
+    pthread_mutex_lock(&pl_inode->mutex);
+    ret = __lock_reservelk(this, pl_inode, lock, can_block);
+    pthread_mutex_unlock(&pl_inode->mutex);
+
+    if (ret >= 0) {
+        gf_log(this->name, GF_LOG_TRACE,
+               "%s (pid=%d) (lk-owner=%s) %" PRId64 " - %" PRId64 " => OK",
+               lock->fl_type == F_UNLCK ? "Unlock" : "Lock", lock->client_pid,
+               lkowner_utoa(&lock->owner), lock->fl_start, lock->fl_end);
+        return ret;
+    }
+    gf_log(this->name, GF_LOG_TRACE,
+           "%s (pid=%d) (lk-owner=%s) %" PRId64 " - %" PRId64 " => NOK",
+           lock->fl_type == F_UNLCK ? "Unlock" : "Lock", lock->client_pid,
+           lkowner_utoa(&lock->owner), lock->user_flock.l_start,
+           lock->user_flock.l_len);
+    return ret;
+}
diff --git a/xlators/features/locks/tests/unit-test.c b/xlators/features/locks/tests/unit-test.c
index 06e77d56bfb..d285b12b5aa 100644
--- a/xlators/features/locks/tests/unit-test.c
+++ b/xlators/features/locks/tests/unit-test.c
@@ -1,75 +1,77 @@
 /*
-  Copyright (c) 2006-2009 Z RESEARCH, Inc. <http://www.zresearch.com>
-  This file is part of GlusterFS.
+   Copyright (c) 2006-2012 Red Hat, Inc. <http://www.redhat.com>
+   This file is part of GlusterFS.
 
-  GlusterFS is free software; you can redistribute it and/or modify
-  it under the terms of the GNU General Public License as published
-  by the Free Software Foundation; either version 3 of the License,
-  or (at your option) any later version.
-
-  GlusterFS is distributed in the hope that it will be useful, but
-  WITHOUT ANY WARRANTY; without even the implied warranty of
-  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
-  General Public License for more details.
-
-  You should have received a copy of the GNU General Public License
-  along with this program.  If not, see
-  <http://www.gnu.org/licenses/>.
+   This file is licensed to you under your choice of the GNU Lesser
+   General Public License, version 3 or any later version (LGPLv3 or
+   later), or the GNU General Public License, version 2 (GPLv2), in all
+   cases as published by the Free Software Foundation.
 */
-
-#ifndef _CONFIG_H
-#define _CONFIG_H
-#include "config.h"
-#endif
-
-#include "glusterfs.h"
-#include "compat.h"
-#include "xlator.h"
-#include "inode.h"
-#include "logging.h"
-#include "common-utils.h"
-#include "list.h"
+#include <glusterfs/glusterfs.h>
+#include <glusterfs/compat.h>
+#include <glusterfs/xlator.h>
+#include <glusterfs/logging.h>
+#include <glusterfs/common-utils.h>
+#include <glusterfs/list.h>
 
 #include "locks.h"
 #include "common.h"
 
-#define expect(cond) if (!(cond)) { goto out; }
+#define expect(cond)                                                           \
+    if (!(cond)) {                                                             \
+        goto out;                                                              \
+    }
 
-extern int lock_name (pl_inode_t *, const char *, entrylk_type);
-extern int unlock_name (pl_inode_t *, const char *, entrylk_type);
+extern int
+lock_name(pl_inode_t *, const char *, entrylk_type);
+extern int
+unlock_name(pl_inode_t *, const char *, entrylk_type);
 
-int main (int argc, char **argv)
+int
+main(int argc, char **argv)
 {
-	int ret = 1;
-	int r = -1;
+    int ret = 1;
+    int r = -1;
+
+    pl_inode_t *pinode = CALLOC(sizeof(pl_inode_t), 1);
+    pthread_mutex_init(&pinode->dir_lock_mutex, NULL);
+    INIT_LIST_HEAD(&pinode->gf_dir_locks);
 
-	pl_inode_t *pinode = CALLOC (sizeof (pl_inode_t), 1);
-	pthread_mutex_init (&pinode->dir_lock_mutex, NULL);
-	INIT_LIST_HEAD (&pinode->gf_dir_locks);
+    r = lock_name(pinode, NULL, ENTRYLK_WRLCK);
+    expect(r == 0);
+    {
+        r = lock_name(pinode, "foo", ENTRYLK_WRLCK);
+        expect(r == -EAGAIN);
+    }
+    r = unlock_name(pinode, NULL, ENTRYLK_WRLCK);
+    expect(r == 0);
 
-	r = lock_name (pinode, NULL, ENTRYLK_WRLCK); expect (r == 0);
-	{
-		r = lock_name (pinode, "foo", ENTRYLK_WRLCK); expect (r == -EAGAIN);
-	}
-	r = unlock_name (pinode, NULL, ENTRYLK_WRLCK); expect (r == 0);
+    r = lock_name(pinode, "foo", ENTRYLK_RDLCK);
+    expect(r == 0);
+    {
+        r = lock_name(pinode, "foo", ENTRYLK_RDLCK);
+        expect(r == 0);
+        {
+            r = lock_name(pinode, "foo", ENTRYLK_WRLCK);
+            expect(r == -EAGAIN);
+        }
+        r = unlock_name(pinode, "foo", ENTRYLK_RDLCK);
+        expect(r == 0);
+    }
+    r = unlock_name(pinode, "foo", ENTRYLK_RDLCK);
+    expect(r == 0);
 
-	r = lock_name (pinode, "foo", ENTRYLK_RDLCK); expect (r == 0);
-	{
-		r = lock_name (pinode, "foo", ENTRYLK_RDLCK); expect (r == 0);
-		{
-			r = lock_name (pinode, "foo", ENTRYLK_WRLCK); expect (r == -EAGAIN);
-		}
-		r = unlock_name (pinode, "foo", ENTRYLK_RDLCK); expect (r == 0);
-	}
-	r = unlock_name (pinode, "foo", ENTRYLK_RDLCK); expect (r == 0);
-	
-	r = lock_name (pinode, "foo", ENTRYLK_WRLCK); expect (r == 0);
-	r = unlock_name (pinode, "foo", ENTRYLK_WRLCK); expect (r == 0);
+    r = lock_name(pinode, "foo", ENTRYLK_WRLCK);
+    expect(r == 0);
+    r = unlock_name(pinode, "foo", ENTRYLK_WRLCK);
+    expect(r == 0);
 
-	r = lock_name (pinode, "baz", ENTRYLK_WRLCK); expect (r == 0);
-	r = lock_name (pinode, "baz", ENTRYLK_RDLCK); expect (r == -EAGAIN);
+    r = lock_name(pinode, "baz", ENTRYLK_WRLCK);
+    expect(r == 0);
+    r = lock_name(pinode, "baz", ENTRYLK_RDLCK);
+    expect(r == -EAGAIN);
 
-	ret = 0;
+    ret = 0;
 out:
-	return ret;
+    return ret;
 }
diff --git a/xlators/features/marker/Makefile.am b/xlators/features/marker/Makefile.am
new file mode 100644
index 00000000000..a985f42a877
--- /dev/null
+++ b/xlators/features/marker/Makefile.am
@@ -0,0 +1,3 @@
+SUBDIRS = src
+
+CLEANFILES =
diff --git a/xlators/features/marker/src/Makefile.am b/xlators/features/marker/src/Makefile.am
new file mode 100644
index 00000000000..58056b36511
--- /dev/null
+++ b/xlators/features/marker/src/Makefile.am
@@ -0,0 +1,24 @@
+if WITH_SERVER
+xlator_LTLIBRARIES = marker.la
+endif
+xlatordir = $(libdir)/glusterfs/$(PACKAGE_VERSION)/xlator/features
+
+marker_la_LDFLAGS = -module $(GF_XLATOR_DEFAULT_LDFLAGS)
+
+marker_la_SOURCES = marker.c marker-quota.c marker-quota-helper.c \
+	marker-common.c
+
+marker_la_LIBADD = $(top_builddir)/libglusterfs/src/libglusterfs.la
+
+noinst_HEADERS = marker-mem-types.h marker.h marker-quota.h \
+	marker-quota-helper.h marker-common.h \
+	$(top_builddir)/xlators/lib/src/libxlator.h
+
+AM_CPPFLAGS = $(GF_CPPFLAGS) -I$(top_srcdir)/libglusterfs/src \
+	-I$(top_srcdir)/rpc/xdr/src -I$(top_builddir)/rpc/xdr/src \
+	-I$(top_srcdir)/xlators/lib/src
+
+AM_CFLAGS = -Wall -fno-strict-aliasing $(GF_CFLAGS)
+
+CLEANFILES =
+
diff --git a/xlators/features/marker/src/marker-common.c b/xlators/features/marker/src/marker-common.c
new file mode 100644
index 00000000000..9c9047005d6
--- /dev/null
+++ b/xlators/features/marker/src/marker-common.c
@@ -0,0 +1,57 @@
+/*
+   Copyright (c) 2008-2012 Red Hat, Inc. <http://www.redhat.com>
+   This file is part of GlusterFS.
+
+   This file is licensed to you under your choice of the GNU Lesser
+   General Public License, version 3 or any later version (LGPLv3 or
+   later), or the GNU General Public License, version 2 (GPLv2), in all
+   cases as published by the Free Software Foundation.
+*/
+#include <fnmatch.h>
+#include "marker-common.h"
+
+/*
+ * Allocate a zeroed marker_inode_ctx_t with no quota context attached.
+ * Returns NULL when the allocation fails.
+ */
+marker_inode_ctx_t *
+marker_inode_ctx_new()
+{
+    marker_inode_ctx_t *mctx = GF_CALLOC(1, sizeof(marker_inode_ctx_t),
+                                         gf_marker_mt_marker_inode_ctx_t);
+
+    if (mctx != NULL)
+        mctx->quota_ctx = NULL;
+
+    return mctx;
+}
+
+/*
+ * Fetch the marker context stored on inode for this xlator, creating and
+ * attaching a new one when __inode_ctx_get() reports none.
+ *
+ * The lookup and the insertion both happen under inode->lock.
+ *
+ * Returns 0 with the context stored in *ctx on success. Returns -1 when
+ * a new context cannot be allocated or cannot be stored on the inode; in
+ * both cases *ctx is NULL on return, so the caller never sees a pointer
+ * to memory that has already been freed.
+ */
+int32_t
+marker_force_inode_ctx_get(inode_t *inode, xlator_t *this,
+                           marker_inode_ctx_t **ctx)
+{
+    int32_t ret = -1;
+    uint64_t ctx_int = 0;
+
+    LOCK(&inode->lock);
+    {
+        ret = __inode_ctx_get(inode, this, &ctx_int);
+        if (ret == 0) {
+            *ctx = (marker_inode_ctx_t *)(unsigned long)ctx_int;
+        } else {
+            *ctx = marker_inode_ctx_new();
+            if (*ctx == NULL) {
+                /* make the failure code explicit instead of relying on
+                 * whatever non-zero value __inode_ctx_get() returned
+                 */
+                ret = -1;
+                goto unlock;
+            }
+
+            ret = __inode_ctx_put(inode, this, (uint64_t)(unsigned long)*ctx);
+            if (ret == -1) {
+                GF_FREE(*ctx);
+                /* do not hand a dangling pointer back to the caller */
+                *ctx = NULL;
+                goto unlock;
+            }
+            ret = 0;
+        }
+    }
+unlock:
+    UNLOCK(&inode->lock);
+
+    return ret;
+}
diff --git a/xlators/features/marker/src/marker-common.h b/xlators/features/marker/src/marker-common.h
new file mode 100644
index 00000000000..7f8cffe7d35
--- /dev/null
+++ b/xlators/features/marker/src/marker-common.h
@@ -0,0 +1,19 @@
+/*
+   Copyright (c) 2008-2012 Red Hat, Inc. <http://www.redhat.com>
+   This file is part of GlusterFS.
+
+   This file is licensed to you under your choice of the GNU Lesser
+   General Public License, version 3 or any later version (LGPLv3 or
+   later), or the GNU General Public License, version 2 (GPLv2), in all
+   cases as published by the Free Software Foundation.
+*/
+#ifndef _MARKER_COMMON_H
+#define _MARKER_COMMON_H
+
+#include <glusterfs/xlator.h>
+#include "marker.h"
+
+int32_t
+marker_force_inode_ctx_get(inode_t *, xlator_t *, marker_inode_ctx_t **);
+
+#endif
diff --git a/xlators/features/marker/src/marker-mem-types.h b/xlators/features/marker/src/marker-mem-types.h
new file mode 100644
index 00000000000..aedfdb4a1b7
--- /dev/null
+++ b/xlators/features/marker/src/marker-mem-types.h
@@ -0,0 +1,28 @@
+/*
+   Copyright (c) 2008-2012 Red Hat, Inc. <http://www.redhat.com>
+   This file is part of GlusterFS.
+
+   This file is licensed to you under your choice of the GNU Lesser
+   General Public License, version 3 or any later version (LGPLv3 or
+   later), or the GNU General Public License, version 2 (GPLv2), in all
+   cases as published by the Free Software Foundation.
+*/
+#ifndef __MARKER_MEM_TYPES_H__
+#define __MARKER_MEM_TYPES_H__
+
+#include <glusterfs/mem-types.h>
+
+/*
+ * Allocation type tags for the marker translator. They are passed as the
+ * type argument of the GF_* allocation macros (for example
+ * gf_marker_mt_marker_inode_ctx_t in marker_inode_ctx_new()). Numbering
+ * starts right after gf_common_mt_end; gf_marker_mt_end is the last value.
+ */
+enum gf_marker_mem_types_ {
+    /* These are used by the ALLOCATE_OR_GOTO macro */
+    gf_marker_mt_marker_conf_t = gf_common_mt_end + 1,
+    gf_marker_mt_loc_t,
+    gf_marker_mt_volume_mark,
+    gf_marker_mt_int64_t,
+    gf_marker_mt_quota_inode_ctx_t,
+    gf_marker_mt_marker_inode_ctx_t,
+    gf_marker_mt_inode_contribution_t,
+    gf_marker_mt_quota_meta_t,
+    gf_marker_mt_quota_synctask_t,
+    gf_marker_mt_end
+};
+#endif
diff --git a/xlators/features/marker/src/marker-quota-helper.c b/xlators/features/marker/src/marker-quota-helper.c
new file mode 100644
index 00000000000..ecd85d67b2b
--- /dev/null
+++ b/xlators/features/marker/src/marker-quota-helper.c
@@ -0,0 +1,380 @@
+/*
+   Copyright (c) 2008-2012 Red Hat, Inc. <http://www.redhat.com>
+   This file is part of GlusterFS.
+
+   This file is licensed to you under your choice of the GNU Lesser
+   General Public License, version 3 or any later version (LGPLv3 or
+   later), or the GNU General Public License, version 2 (GPLv2), in all
+   cases as published by the Free Software Foundation.
+*/
+#include <glusterfs/locking.h>
+#include "marker-quota.h"
+#include "marker-common.h"
+#include "marker-quota-helper.h"
+#include "marker-mem-types.h"
+
+/*
+ * Populate loc from an inode, its (optional) parent and a path.
+ *
+ * Takes a reference on inode and, when given, on parent; copies the
+ * inode's gfid when it is not null; duplicates path into loc->path and
+ * points loc->name just past the last '/' of that copy.
+ *
+ * Returns 0 on success. Returns -1 when loc, inode or path is NULL, when
+ * the path cannot be duplicated, or when it contains no '/'. On failure
+ * with a non-NULL loc, whatever was already stored in loc is released
+ * with loc_wipe().
+ */
+int
+mq_loc_fill(loc_t *loc, inode_t *inode, inode_t *parent, char *path)
+{
+    int ret = -1;
+
+    GF_VALIDATE_OR_GOTO("marker", loc, out);
+    GF_VALIDATE_OR_GOTO("marker", inode, out);
+    GF_VALIDATE_OR_GOTO("marker", path, out);
+    /* Not checking for parent because while filling
+     * loc of root, parent will be NULL
+     */
+
+    /* inode was validated above, so no further NULL test is needed */
+    loc->inode = inode_ref(inode);
+
+    if (parent)
+        loc->parent = inode_ref(parent);
+
+    if (!gf_uuid_is_null(inode->gfid))
+        gf_uuid_copy(loc->gfid, inode->gfid);
+
+    loc->path = gf_strdup(path);
+    if (!loc->path) {
+        gf_log("loc fill", GF_LOG_ERROR, "strdup failed");
+        goto out;
+    }
+
+    loc->name = strrchr(loc->path, '/');
+    if (loc->name)
+        loc->name++;
+    else
+        goto out;
+
+    ret = 0;
+
+out:
+    /* loc is NULL here when the first validation failed; it must not be
+     * passed to loc_wipe() in that case.
+     */
+    if (ret < 0 && loc)
+        loc_wipe(loc);
+
+    return ret;
+}
+
+/*
+ * Build a loc for inode and make sure the inode carries a quota context.
+ *
+ * For a non-root inode the parent is resolved first: via inode_find() on
+ * parent_gfid when one is given, otherwise via inode_parent(). The path
+ * is resolved with inode_path() and loc is filled by mq_loc_fill(). The
+ * quota context is then fetched with mq_inode_ctx_get() or, failing
+ * that, created with mq_inode_ctx_new().
+ *
+ * Returns 0 on success and a negative value on any failure. The local
+ * parent reference and the resolved path are always released before
+ * returning.
+ */
+int32_t
+mq_inode_loc_fill(const char *parent_gfid, inode_t *inode, loc_t *loc)
+{
+    xlator_t *this = THIS;
+    quota_inode_ctx_t *qctx = NULL;
+    inode_t *parent = NULL;
+    char *path = NULL;
+    int ret = -1;
+
+    if (!inode) {
+        gf_log_callingfn("marker", GF_LOG_ERROR,
+                         "loc fill failed, "
+                         "inode is NULL");
+        return ret;
+    }
+
+    if (!loc)
+        return ret;
+
+    if (__is_root_gfid(inode->gfid)) {
+        /* root has no parent to resolve */
+        loc->parent = NULL;
+    } else {
+        if (parent_gfid)
+            parent = inode_find(inode->table, (unsigned char *)parent_gfid);
+        else
+            parent = inode_parent(inode, 0, NULL);
+
+        if (!parent) {
+            gf_log("marker", GF_LOG_ERROR, "parent is NULL for %s",
+                   uuid_utoa(inode->gfid));
+            goto cleanup;
+        }
+    }
+
+    ret = inode_path(inode, NULL, &path);
+    if (ret < 0) {
+        gf_log("marker", GF_LOG_ERROR, "failed to resolve path for %s",
+               uuid_utoa(inode->gfid));
+        goto cleanup;
+    }
+
+    ret = mq_loc_fill(loc, inode, parent, path);
+    if (ret < 0)
+        goto cleanup;
+
+    if (mq_inode_ctx_get(inode, this, &qctx) < 0 || qctx == NULL)
+        qctx = mq_inode_ctx_new(inode, this);
+
+    if (qctx != NULL) {
+        ret = 0;
+    } else {
+        gf_log(this->name, GF_LOG_WARNING,
+               "mq_inode_ctx_new "
+               "failed for %s",
+               uuid_utoa(inode->gfid));
+        ret = -1;
+    }
+
+cleanup:
+    if (parent)
+        inode_unref(parent);
+
+    GF_FREE(path);
+
+    return ret;
+}
+
+/*
+ * Allocate a quota_inode_ctx_t via QUOTA_ALLOC and initialise it: size
+ * and dirty are zeroed, updation_status is false, the lock and the
+ * contribution list head are initialised.
+ *
+ * When QUOTA_ALLOC reports -1 the pointer it left behind is returned
+ * without any initialisation.
+ */
+quota_inode_ctx_t *
+mq_alloc_inode_ctx()
+{
+    quota_inode_ctx_t *new_ctx = NULL;
+    int32_t alloc_ret = -1;
+
+    QUOTA_ALLOC(new_ctx, quota_inode_ctx_t, alloc_ret);
+    if (alloc_ret == -1)
+        return new_ctx;
+
+    LOCK_INIT(&new_ctx->lock);
+    INIT_LIST_HEAD(&new_ctx->contribution_head);
+
+    new_ctx->updation_status = _gf_false;
+    new_ctx->dirty = 0;
+    new_ctx->size = 0;
+
+    return new_ctx;
+}
+
+/*
+ * Tear down an inode_contribution_t: destroy its lock and free it.
+ * Registered through GF_REF_INIT() in mq_contri_init(); presumably run
+ * when the last reference is dropped (GF_REF semantics not visible here).
+ */
+static void
+mq_contri_fini(inode_contribution_t *contri)
+{
+    LOCK_DESTROY(&contri->lock);
+    GF_FREE(contri);
+}
+
+/*
+ * Allocate a contribution node for the given inode.
+ *
+ * The node is reference-initialised with mq_contri_fini() as its release
+ * function, its counters are zeroed, the inode's gfid is copied in, and
+ * its lock and list linkage are initialised. The node is not added to
+ * any list here.
+ *
+ * When QUOTA_ALLOC reports -1 the pointer it left behind is returned
+ * without any initialisation.
+ */
+inode_contribution_t *
+mq_contri_init(inode_t *inode)
+{
+    inode_contribution_t *node = NULL;
+    int32_t alloc_ret = 0;
+
+    QUOTA_ALLOC(node, inode_contribution_t, alloc_ret);
+    if (alloc_ret == -1)
+        return node;
+
+    GF_REF_INIT(node, mq_contri_fini);
+    LOCK_INIT(&node->lock);
+    INIT_LIST_HEAD(&node->contri_list);
+
+    gf_uuid_copy(node->gfid, inode->gfid);
+    node->contribution = 0;
+    node->file_count = 0;
+    node->dir_count = 0;
+
+    return node;
+}
+
+/*
+ * Find the contribution node in ctx whose gfid equals inode->gfid.
+ *
+ * The list is walked under ctx->lock. A matching node is returned with
+ * an extra reference taken via GF_REF_GET(). Returns NULL when either
+ * argument is NULL or no node matches.
+ */
+inode_contribution_t *
+mq_get_contribution_node(inode_t *inode, quota_inode_ctx_t *ctx)
+{
+    inode_contribution_t *found = NULL;
+    inode_contribution_t *node = NULL;
+
+    if (inode == NULL || ctx == NULL)
+        return NULL;
+
+    LOCK(&ctx->lock);
+    {
+        /* an empty list simply yields no iterations */
+        list_for_each_entry(node, &ctx->contribution_head, contri_list)
+        {
+            if (gf_uuid_compare(node->gfid, inode->gfid) != 0)
+                continue;
+
+            found = node;
+            GF_REF_GET(found);
+            break;
+        }
+    }
+    UNLOCK(&ctx->lock);
+
+    return found;
+}
+
+/*
+ * Return the contribution node in ctx for loc's parent, creating and
+ * appending one when none exists yet.
+ *
+ * If loc->parent is not set it is resolved first: by inode_find() on
+ * loc->pargfid when that is not null, then by inode_parent(). Returns
+ * NULL when the parent cannot be resolved or a new node cannot be
+ * allocated. No reference is taken on the returned node here.
+ *
+ * This function takes no lock itself; mq_add_new_contribution_node()
+ * calls it with ctx->lock held.
+ */
+inode_contribution_t *
+__mq_add_new_contribution_node(xlator_t *this, quota_inode_ctx_t *ctx,
+                               loc_t *loc)
+{
+    inode_contribution_t *node = NULL;
+
+    if (loc->parent == NULL) {
+        if (!gf_uuid_is_null(loc->pargfid))
+            loc->parent = inode_find(loc->inode->table, loc->pargfid);
+
+        if (loc->parent == NULL)
+            loc->parent = inode_parent(loc->inode, loc->pargfid, loc->name);
+
+        if (loc->parent == NULL)
+            return NULL;
+    }
+
+    /* loc->parent is known to be non-NULL from here on */
+    list_for_each_entry(node, &ctx->contribution_head, contri_list)
+    {
+        if (gf_uuid_compare(node->gfid, loc->parent->gfid) == 0)
+            return node;
+    }
+
+    node = mq_contri_init(loc->parent);
+    if (node != NULL)
+        list_add_tail(&node->contri_list, &ctx->contribution_head);
+
+    return node;
+}
+
+/*
+ * Locked wrapper around __mq_add_new_contribution_node().
+ *
+ * Returns NULL without touching ctx when ctx or loc is NULL, when
+ * loc->path is "/", or when loc has neither a path nor a parent gfid.
+ * Otherwise the node for loc's parent is looked up or created under
+ * ctx->lock and returned with an extra reference taken via GF_REF_GET().
+ */
+inode_contribution_t *
+mq_add_new_contribution_node(xlator_t *this, quota_inode_ctx_t *ctx, loc_t *loc)
+{
+    inode_contribution_t *node = NULL;
+
+    if (ctx == NULL || loc == NULL)
+        return NULL;
+
+    if (loc->path) {
+        if (strcmp(loc->path, "/") == 0)
+            return NULL;
+    } else if (gf_uuid_is_null(loc->pargfid)) {
+        return NULL;
+    }
+
+    LOCK(&ctx->lock);
+    {
+        node = __mq_add_new_contribution_node(this, ctx, loc);
+        if (node != NULL)
+            GF_REF_GET(node);
+    }
+    UNLOCK(&ctx->lock);
+
+    return node;
+}
+
+/*
+ * Add a contribution key (with value 0) to dict.
+ *
+ * The key is built by GET_CONTRI_KEY from, in order of preference: the
+ * given gfid when it is non-NULL and not null, loc->parent's gfid, or
+ * NULL (nameless lookup). When contri_key is non-NULL the generated key
+ * is also copied into it; contri_key is treated as a QUOTA_KEY_MAX-byte
+ * buffer.
+ *
+ * Returns a negative value (and logs "dict set failed") when an argument
+ * is NULL, key generation fails, the dict insertion fails, or the key
+ * does not fit into contri_key.
+ */
+int32_t
+mq_dict_set_contribution(xlator_t *this, dict_t *dict, loc_t *loc, uuid_t gfid,
+                         char *contri_key)
+{
+    int32_t ret = -1;
+    char key[QUOTA_KEY_MAX] = {
+        0,
+    };
+
+    GF_VALIDATE_OR_GOTO("marker", this, out);
+    GF_VALIDATE_OR_GOTO("marker", dict, out);
+    GF_VALIDATE_OR_GOTO("marker", loc, out);
+
+    if (gfid && !gf_uuid_is_null(gfid)) {
+        GET_CONTRI_KEY(this, key, gfid, ret);
+    } else if (loc->parent) {
+        GET_CONTRI_KEY(this, key, loc->parent->gfid, ret);
+    } else {
+        /* nameless lookup, fetch contributions to all parents */
+        GET_CONTRI_KEY(this, key, NULL, ret);
+    }
+
+    if (ret < 0)
+        goto out;
+
+    ret = dict_set_int64(dict, key, 0);
+    if (ret < 0)
+        goto out;
+
+    /* snprintf returning >= QUOTA_KEY_MAX means the copy was truncated */
+    if (contri_key)
+        if (snprintf(contri_key, QUOTA_KEY_MAX, "%s", key) >= QUOTA_KEY_MAX) {
+            ret = -1;
+            goto out;
+        }
+
+out:
+    /* this may be NULL here if its validation failed */
+    if (ret < 0)
+        gf_log_callingfn(this ? this->name : "Marker", GF_LOG_ERROR,
+                         "dict set failed");
+
+    return ret;
+}
+
+/*
+ * Look up the quota context attached to inode for this xlator.
+ *
+ * The marker context is read with inode_ctx_get() and its quota_ctx
+ * member is returned through *ctx.
+ *
+ * Returns 0 with *ctx set on success. Returns -1 when an argument is
+ * NULL, when no marker context is stored on the inode, or when the
+ * marker context has no quota context yet. Whenever ctx itself is
+ * non-NULL, *ctx is NULL on every failure return, so callers get the
+ * same result on all failure paths.
+ */
+int32_t
+mq_inode_ctx_get(inode_t *inode, xlator_t *this, quota_inode_ctx_t **ctx)
+{
+    int32_t ret = -1;
+    uint64_t ctx_int = 0;
+    marker_inode_ctx_t *mark_ctx = NULL;
+
+    GF_VALIDATE_OR_GOTO("marker", inode, out);
+    GF_VALIDATE_OR_GOTO("marker", this, out);
+    GF_VALIDATE_OR_GOTO("marker", ctx, out);
+
+    /* start from a defined value so no failure path leaves it stale */
+    *ctx = NULL;
+
+    ret = inode_ctx_get(inode, this, &ctx_int);
+    if (ret < 0) {
+        ret = -1;
+        goto out;
+    }
+
+    mark_ctx = (marker_inode_ctx_t *)(unsigned long)ctx_int;
+    /* a stored value of 0 would otherwise be dereferenced below */
+    if (mark_ctx == NULL || mark_ctx->quota_ctx == NULL) {
+        ret = -1;
+        goto out;
+    }
+
+    *ctx = mark_ctx->quota_ctx;
+
+    ret = 0;
+
+out:
+    return ret;
+}
+
+/*
+ * Return the quota context of inode, allocating and attaching one when
+ * the inode's marker context does not have one yet.
+ *
+ * The marker context is obtained (or created) with
+ * marker_force_inode_ctx_get(); the quota context is then checked and
+ * installed under inode->lock. Returns NULL when the marker context
+ * cannot be obtained or the quota context cannot be allocated.
+ */
+quota_inode_ctx_t *
+__mq_inode_ctx_new(inode_t *inode, xlator_t *this)
+{
+    marker_inode_ctx_t *mark_ctx = NULL;
+    quota_inode_ctx_t *quota_ctx = NULL;
+
+    if (marker_force_inode_ctx_get(inode, this, &mark_ctx) < 0) {
+        gf_log(this->name, GF_LOG_ERROR, "marker_force_inode_ctx_get() failed");
+        return NULL;
+    }
+
+    LOCK(&inode->lock);
+    {
+        /* a failed allocation leaves quota_ctx NULL, as it was before */
+        if (mark_ctx->quota_ctx == NULL)
+            mark_ctx->quota_ctx = mq_alloc_inode_ctx();
+
+        quota_ctx = mark_ctx->quota_ctx;
+    }
+    UNLOCK(&inode->lock);
+
+    return quota_ctx;
+}
+
+/*
+ * Public entry point: forwards unchanged to __mq_inode_ctx_new() and
+ * returns its result (the inode's quota context, or NULL on failure).
+ */
+quota_inode_ctx_t *
+mq_inode_ctx_new(inode_t *inode, xlator_t *this)
+{
+    return __mq_inode_ctx_new(inode, this);
+}
diff --git a/xlators/features/marker/src/marker-quota-helper.h b/xlators/features/marker/src/marker-quota-helper.h
new file mode 100644
index 00000000000..d4091dd2180
--- /dev/null
+++ b/xlators/features/marker/src/marker-quota-helper.h
@@ -0,0 +1,66 @@
+/*
+   Copyright (c) 2008-2012 Red Hat, Inc. <http://www.redhat.com>
+   This file is part of GlusterFS.
+
+   This file is licensed to you under your choice of the GNU Lesser
+   General Public License, version 3 or any later version (LGPLv3 or
+   later), or the GNU General Public License, version 2 (GPLv2), in all
+   cases as published by the Free Software Foundation.
+*/
+
+#ifndef _MARKER_QUOTA_HELPER_H
+#define _MARKER_QUOTA_HELPER_H
+
+#include "marker.h"
+
+/*
+ * Unlink _contribution from its list and drop one reference on it, with
+ * ctx->lock held. Both arguments are pointers. They are parenthesized in
+ * the expansion so that any pointer-valued expression may be passed; note
+ * that each argument is evaluated more than once.
+ */
+#define QUOTA_FREE_CONTRIBUTION_NODE(ctx, _contribution)                       \
+    do {                                                                       \
+        LOCK(&(ctx)->lock);                                                    \
+        {                                                                      \
+            list_del_init(&(_contribution)->contri_list);                      \
+            GF_REF_PUT(_contribution);                                         \
+        }                                                                      \
+        UNLOCK(&(ctx)->lock);                                                  \
+    } while (0)
+
+/*
+ * Increment var by one while holding lock. var must be a modifiable
+ * lvalue; it is parenthesized so that expressions such as *ptr or
+ * obj->field expand with the intended precedence.
+ */
+#define QUOTA_SAFE_INCREMENT(lock, var)                                        \
+    do {                                                                       \
+        LOCK(lock);                                                            \
+        (var)++;                                                               \
+        UNLOCK(lock);                                                          \
+    } while (0)
+
+/*
+ * Decrement var by one while holding lock and store the new value of var
+ * in value. Both var and value must be modifiable lvalues; they are
+ * parenthesized so that expressions such as *ptr or obj->field expand
+ * with the intended precedence.
+ */
+#define QUOTA_SAFE_DECREMENT(lock, var, value)                                 \
+    do {                                                                       \
+        LOCK(lock);                                                            \
+        {                                                                      \
+            (value) = --(var);                                                 \
+        }                                                                      \
+        UNLOCK(lock);                                                          \
+    } while (0)
+
+inode_contribution_t *
+mq_add_new_contribution_node(xlator_t *, quota_inode_ctx_t *, loc_t *);
+
+int32_t
+mq_dict_set_contribution(xlator_t *, dict_t *, loc_t *, uuid_t, char *);
+
+quota_inode_ctx_t *
+mq_inode_ctx_new(inode_t *, xlator_t *);
+
+int32_t
+mq_inode_ctx_get(inode_t *, xlator_t *, quota_inode_ctx_t **);
+
+int32_t
+mq_delete_contribution_node(dict_t *, char *, inode_contribution_t *);
+
+int32_t
+mq_inode_loc_fill(const char *, inode_t *, loc_t *);
+
+inode_contribution_t *
+mq_contri_init(inode_t *inode);
+
+inode_contribution_t *
+mq_get_contribution_node(inode_t *, quota_inode_ctx_t *);
+
+#endif
diff --git a/xlators/features/marker/src/marker-quota.c b/xlators/features/marker/src/marker-quota.c
new file mode 100644
index 00000000000..3de2ea1c92c
--- /dev/null
+++ b/xlators/features/marker/src/marker-quota.c
@@ -0,0 +1,2297 @@
+/*
+   Copyright (c) 2008-2012 Red Hat, Inc. <http://www.redhat.com>
+   This file is part of GlusterFS.
+
+   This file is licensed to you under your choice of the GNU Lesser
+   General Public License, version 3 or any later version (LGPLv3 or
+   later), or the GNU General Public License, version 2 (GPLv2), in all
+   cases as published by the Free Software Foundation.
+*/
+#include <glusterfs/dict.h>
+#include <glusterfs/xlator.h>
+#include <glusterfs/defaults.h>
+#include "libxlator.h"
+#include <glusterfs/common-utils.h>
+#include <glusterfs/byte-order.h>
+#include "marker-quota.h"
+#include "marker-quota-helper.h"
+#include <glusterfs/syncop.h>
+#include <glusterfs/quota-common-utils.h>
+
+/*
+ * Copy src into dst with loc_copy() after checking that src is usable.
+ *
+ * src is rejected (with a warning) when it has no inode, or when it has
+ * neither a parent inode nor a parent gfid and its inode is not the
+ * root. Returns -1 on rejection or NULL arguments, otherwise the result
+ * of loc_copy().
+ */
+int
+mq_loc_copy(loc_t *dst, loc_t *src)
+{
+    GF_VALIDATE_OR_GOTO("marker", dst, out);
+    GF_VALIDATE_OR_GOTO("marker", src, out);
+
+    if (src->inode == NULL)
+        goto invalid;
+
+    if (src->parent == NULL && gf_uuid_is_null(src->pargfid) &&
+        !__is_root_gfid(src->inode->gfid))
+        goto invalid;
+
+    return loc_copy(dst, src);
+
+invalid:
+    gf_log("marker", GF_LOG_WARNING, "src loc is not valid");
+out:
+    return -1;
+}
+
+/* Store status into *flag while holding ctx->lock. */
+static void
+mq_set_ctx_status(quota_inode_ctx_t *ctx, gf_boolean_t *flag,
+                  gf_boolean_t status)
+{
+    LOCK(&ctx->lock);
+    {
+        *flag = status;
+    }
+    UNLOCK(&ctx->lock);
+}
+
+/*
+ * Exchange *flag and *status while holding ctx->lock: on return *flag
+ * holds the caller's requested value and *status holds the value *flag
+ * had before the call.
+ */
+static void
+mq_test_and_set_ctx_status(quota_inode_ctx_t *ctx, gf_boolean_t *flag,
+                           gf_boolean_t *status)
+{
+    gf_boolean_t temp = _gf_false;
+
+    LOCK(&ctx->lock);
+    {
+        temp = *status;
+        *status = *flag;
+        *flag = temp;
+    }
+    UNLOCK(&ctx->lock);
+}
+
+/* Copy the current value of *flag into *status while holding ctx->lock. */
+static void
+mq_get_ctx_status(quota_inode_ctx_t *ctx, gf_boolean_t *flag,
+                  gf_boolean_t *status)
+{
+    LOCK(&ctx->lock);
+    {
+        *status = *flag;
+    }
+    UNLOCK(&ctx->lock);
+}
+
+/*
+ * Read ctx->updation_status into *status under ctx->lock.
+ * Returns 0 on success, -1 when ctx or status is NULL.
+ */
+int32_t
+mq_get_ctx_updation_status(quota_inode_ctx_t *ctx, gf_boolean_t *status)
+{
+    GF_VALIDATE_OR_GOTO("marker", ctx, out);
+    GF_VALIDATE_OR_GOTO("marker", status, out);
+
+    mq_get_ctx_status(ctx, &ctx->updation_status, status);
+    return 0;
+out:
+    return -1;
+}
+
+/*
+ * Set ctx->updation_status to status under ctx->lock.
+ * Returns 0 on success, -1 when ctx is NULL.
+ */
+int32_t
+mq_set_ctx_updation_status(quota_inode_ctx_t *ctx, gf_boolean_t status)
+{
+    GF_VALIDATE_OR_GOTO("marker", ctx, out);
+
+    mq_set_ctx_status(ctx, &ctx->updation_status, status);
+    return 0;
+out:
+    return -1;
+}
+
+/*
+ * Exchange ctx->updation_status with *status under ctx->lock: the flag
+ * takes the value passed in and *status receives the flag's old value.
+ * Returns 0 on success, -1 when ctx or status is NULL.
+ */
+int32_t
+mq_test_and_set_ctx_updation_status(quota_inode_ctx_t *ctx,
+                                    gf_boolean_t *status)
+{
+    GF_VALIDATE_OR_GOTO("marker", ctx, out);
+    GF_VALIDATE_OR_GOTO("marker", status, out);
+
+    mq_test_and_set_ctx_status(ctx, &ctx->updation_status, status);
+    return 0;
+out:
+    return -1;
+}
+
+/*
+ * Set ctx->create_status to status under ctx->lock.
+ * Returns 0 on success, -1 when ctx is NULL.
+ */
+int32_t
+mq_set_ctx_create_status(quota_inode_ctx_t *ctx, gf_boolean_t status)
+{
+    GF_VALIDATE_OR_GOTO("marker", ctx, out);
+
+    mq_set_ctx_status(ctx, &ctx->create_status, status);
+    return 0;
+out:
+    return -1;
+}
+
+/*
+ * Exchange ctx->create_status with *status under ctx->lock: the flag
+ * takes the value passed in and *status receives the flag's old value.
+ * Returns 0 on success, -1 when ctx or status is NULL.
+ */
+int32_t
+mq_test_and_set_ctx_create_status(quota_inode_ctx_t *ctx, gf_boolean_t *status)
+{
+    GF_VALIDATE_OR_GOTO("marker", ctx, out);
+    GF_VALIDATE_OR_GOTO("marker", status, out);
+
+    mq_test_and_set_ctx_status(ctx, &ctx->create_status, status);
+    return 0;
+out:
+    return -1;
+}
+
+/*
+ * Set ctx->dirty_status to status under ctx->lock. Does nothing (beyond
+ * what GF_VALIDATE_OR_GOTO itself does) when ctx is NULL.
+ */
+static void
+mq_set_ctx_dirty_status(quota_inode_ctx_t *ctx, gf_boolean_t status)
+{
+    GF_VALIDATE_OR_GOTO("marker", ctx, out);
+
+    mq_set_ctx_status(ctx, &ctx->dirty_status, status);
+out:
+    return;
+}
+
+/*
+ * Rebuild the ancestry of loc->inode in the inode table.
+ *
+ * Issues a readdirp on an anonymous fd of loc->inode with
+ * GET_ANCESTRY_DENTRY_KEY set in xdata, links every returned non-root
+ * entry under its parent with inode_link(), makes sure each entry has a
+ * quota context, and finally replaces loc->parent with the parent
+ * reported by inode_parent().
+ *
+ * Returns 0 on success and a negative value otherwise: -ENOMEM when the
+ * dict, the fd or a quota context cannot be created, -EINVAL when
+ * inode_link() fails, the readdirp error when that call fails, and -1
+ * when readdirp returns no entries or the parent cannot be resolved.
+ * The entry list, the fd and xdata are released on every path.
+ */
+int
+mq_build_ancestry(xlator_t *this, loc_t *loc)
+{
+    int32_t ret = -1;
+    fd_t *fd = NULL;
+    gf_dirent_t entries;
+    gf_dirent_t *entry = NULL;
+    dict_t *xdata = NULL;
+    inode_t *tmp_parent = NULL;
+    inode_t *tmp_inode = NULL;
+    inode_t *linked_inode = NULL;
+    quota_inode_ctx_t *ctx = NULL;
+
+    /* initialised first so gf_dirent_free() at out: is always safe */
+    INIT_LIST_HEAD(&entries.list);
+
+    xdata = dict_new();
+    if (xdata == NULL) {
+        gf_log(this->name, GF_LOG_ERROR, "dict_new failed");
+        ret = -ENOMEM;
+        goto out;
+    }
+
+    ret = dict_set_int8(xdata, GET_ANCESTRY_DENTRY_KEY, 1);
+    if (ret < 0)
+        goto out;
+
+    fd = fd_anonymous(loc->inode);
+    if (fd == NULL) {
+        gf_log(this->name, GF_LOG_ERROR, "fd creation failed");
+        ret = -ENOMEM;
+        goto out;
+    }
+
+    fd_bind(fd);
+
+    ret = syncop_readdirp(this, fd, 131072, 0, &entries, xdata, NULL);
+    if (ret < 0) {
+        /* ENOENT and ESTALE are logged at debug level only */
+        gf_log(this->name,
+               (-ret == ENOENT || -ret == ESTALE) ? GF_LOG_DEBUG : GF_LOG_ERROR,
+               "readdirp failed "
+               "for %s: %s",
+               loc->path, strerror(-ret));
+        goto out;
+    }
+
+    if (list_empty(&entries.list)) {
+        ret = -1;
+        goto out;
+    }
+
+    list_for_each_entry(entry, &entries.list, list)
+    {
+        if (__is_root_gfid(entry->inode->gfid)) {
+            /* The list contains a sub-list for each possible path
+             * to the target inode. Each sub-list starts with the
+             * root entry of the tree and is followed by the child
+             * entries for a particular path to the target entry.
+             * The root entry is an implied sub-list delimiter,
+             * as it denotes we have started processing a new path.
+             * Reset the parent pointer and continue
+             */
+
+            tmp_parent = NULL;
+        } else {
+            linked_inode = inode_link(entry->inode, tmp_parent, entry->d_name,
+                                      &entry->d_stat);
+            if (linked_inode) {
+                /* swap the entry over to the linked inode and drop the
+                 * reference on the inode it held before
+                 */
+                tmp_inode = entry->inode;
+                entry->inode = linked_inode;
+                inode_unref(tmp_inode);
+            } else {
+                gf_log(this->name, GF_LOG_ERROR, "inode link failed");
+                ret = -EINVAL;
+                goto out;
+            }
+        }
+
+        ctx = mq_inode_ctx_new(entry->inode, this);
+        if (ctx == NULL) {
+            gf_log(this->name, GF_LOG_WARNING,
+                   "mq_inode_ctx_new "
+                   "failed for %s",
+                   uuid_utoa(entry->inode->gfid));
+            ret = -ENOMEM;
+            goto out;
+        }
+
+        /* For non-directory, posix_get_ancestry_non_directory returns
+         * all hard-links that are represented by nodes adjacent to
+         * each other in the dentry-list.
+         * (Unlike the directory case where adjacent nodes either have
+         *  a parent/child relationship or belong to different paths).
+         */
+        if (entry->inode->ia_type == IA_IFDIR)
+            tmp_parent = entry->inode;
+    }
+
+    /* replace any parent reference loc already held with a fresh one */
+    if (loc->parent)
+        inode_unref(loc->parent);
+
+    loc->parent = inode_parent(loc->inode, 0, NULL);
+    if (loc->parent == NULL) {
+        ret = -1;
+        goto out;
+    }
+
+    ret = 0;
+
+out:
+    gf_dirent_free(&entries);
+
+    if (fd)
+        fd_unref(fd);
+
+    if (xdata)
+        dict_unref(xdata);
+
+    return ret;
+}
+
+/* This function should be used only in inspect_directory and inspect_file
+ * function to heal quota xattrs.
+ * Inode quota feature is introduced in 3.7.
+ * If a gluster setup is upgraded from 3.6 to 3.7, quota heal can cause
+ * getxattr and setxattr spikes because the inode quota xattrs are missing.
+ * This wrapper function avoids those xattr spikes during the upgrade:
+ * it returns success even if the inode-quota xattrs are missing, in which
+ * case no healing is performed.
+ */
+static int32_t
+_quota_dict_get_meta(xlator_t *this, dict_t *dict, char *key, const int keylen,
+                     quota_meta_t *meta, ia_type_t ia_type,
+                     gf_boolean_t add_delta)
+{
+    marker_conf_t *conf = this->private;
+    int32_t ret = quota_dict_get_inode_meta(dict, key, keylen, meta);
+
+    /* Only the "-2 while inode quota is disabled" case is rewritten to
+     * success; every other result is passed through untouched.
+     */
+    if (ret != -2 || (conf->feature_enabled & GF_INODE_QUOTA) != 0)
+        return ret;
+
+    /* quota_dict_get_inode_meta returns -2 if
+     * inode quota xattrs are not present.
+     * if inode quota self heal is turned off,
+     * then we should skip healing inode quotas
+     */
+    gf_log(this->name, GF_LOG_DEBUG,
+           "inode quota disabled. "
+           "inode quota self heal will not be performed");
+
+    if (add_delta) {
+        /* count the object itself: one directory or one file */
+        if (ia_type == IA_IFDIR)
+            meta->dir_count = 1;
+        else
+            meta->file_count = 1;
+    }
+
+    return 0;
+}
+
+/*
+ * Store the quota size record for this xlator's size key in dict.
+ *
+ * The value is a two-element quota_meta_t array: element 0 carries
+ * meta's size, file_count and dir_count converted with hton64();
+ * element 1 has size 0, file_count 0 and dir_count hton64(1).
+ *
+ * Returns the result of dict_set_bin() on success, -ENOMEM when the
+ * array cannot be allocated, or the negative value produced by
+ * GET_SIZE_KEY. The array is freed on every failure path; on success it
+ * is left with the dict (presumably dict_set_bin() takes ownership,
+ * since the original code frees it only on failure).
+ */
+int32_t
+quota_dict_set_size_meta(xlator_t *this, dict_t *dict, const quota_meta_t *meta)
+{
+    int32_t ret = -ENOMEM;
+    quota_meta_t *value = NULL;
+    char size_key[QUOTA_KEY_MAX] = {
+        0,
+    };
+
+    value = GF_MALLOC(2 * sizeof(quota_meta_t), gf_common_quota_meta_t);
+    if (value == NULL) {
+        goto out;
+    }
+    value[0].size = hton64(meta->size);
+    value[0].file_count = hton64(meta->file_count);
+    value[0].dir_count = hton64(meta->dir_count);
+
+    value[1].size = 0;
+    value[1].file_count = 0;
+    value[1].dir_count = hton64(1);
+
+    GET_SIZE_KEY(this, size_key, ret);
+    if (ret < 0) {
+        /* the key could not be built: release the array instead of
+         * leaking it
+         */
+        GF_FREE(value);
+        goto out;
+    }
+    ret = dict_set_bin(dict, size_key, value, (sizeof(quota_meta_t) * 2));
+    if (ret < 0) {
+        gf_log_callingfn("quota", GF_LOG_ERROR, "dict set failed");
+        GF_FREE(value);
+    }
+out:
+    return ret;
+}
+
+/* Field-wise difference: delta = op1 - op2 for size, file_count, dir_count. */
+void
+mq_compute_delta(quota_meta_t *delta, const quota_meta_t *op1,
+                 const quota_meta_t *op2)
+{
+    delta->size = op1->size - op2->size;
+    delta->file_count = op1->file_count - op2->file_count;
+    delta->dir_count = op1->dir_count - op2->dir_count;
+}
+
+/* Field-wise accumulate: dst += src for size, file_count, dir_count. */
+void
+mq_add_meta(quota_meta_t *dst, const quota_meta_t *src)
+{
+    dst->size += src->size;
+    dst->file_count += src->file_count;
+    dst->dir_count += src->dir_count;
+}
+
+/*
+ * Field-wise reverse subtraction: dst = src - dst for size, file_count
+ * and dir_count. A NULL src is treated as all zeroes, which negates dst.
+ */
+void
+mq_sub_meta(quota_meta_t *dst, const quota_meta_t *src)
+{
+    const quota_meta_t zero = {
+        0,
+    };
+    const quota_meta_t *minuend = src ? src : &zero;
+
+    dst->size = minuend->size - dst->size;
+    dst->file_count = minuend->file_count - dst->file_count;
+    dst->dir_count = minuend->dir_count - dst->dir_count;
+}
+
+/*
+ * Look up @loc on the first child and report which quota xattrs it carries.
+ *
+ * @contri_set and @size_set are written only when the lookup succeeds and
+ * returns a response dict:
+ *   - *size_set is cleared for a directory whose size record cannot be read
+ *     or has a zero dir_count; it stays true for anything that is not a
+ *     directory.
+ *   - *contri_set is cleared for a non-root inode whose contribution record
+ *     cannot be read; it stays true for the root.
+ *
+ * Returns 0 once the flags have been evaluated, -1 when the request dict
+ * cannot be allocated, or the negative mq_req_xattr()/syncop_lookup()
+ * result. If the lookup succeeds without a response dict, its return value
+ * is passed through and the flags are left untouched.
+ */
+int32_t
+mq_are_xattrs_set(xlator_t *this, loc_t *loc, gf_boolean_t *contri_set,
+                  gf_boolean_t *size_set)
+{
+    int32_t ret = -1;
+    char contri_key[QUOTA_KEY_MAX] = {
+        0,
+    };
+    char size_key[QUOTA_KEY_MAX] = {
+        0,
+    };
+    quota_meta_t meta = {
+        0,
+    };
+    struct iatt stbuf = {
+        0,
+    };
+    dict_t *dict = NULL;
+    dict_t *rsp_dict = NULL;
+
+    dict = dict_new();
+    if (dict == NULL) {
+        gf_log(this->name, GF_LOG_ERROR, "dict_new failed");
+        goto out;
+    }
+
+    /* fills the request dict and hands back the key names used in it */
+    ret = mq_req_xattr(this, loc, dict, contri_key, size_key);
+    if (ret < 0)
+        goto out;
+
+    ret = syncop_lookup(FIRST_CHILD(this), loc, &stbuf, NULL, dict, &rsp_dict);
+    if (ret < 0) {
+        /* ENOENT/ESTALE are logged at debug level only */
+        gf_log_callingfn(
+            this->name,
+            (-ret == ENOENT || -ret == ESTALE) ? GF_LOG_DEBUG : GF_LOG_ERROR,
+            "lookup failed "
+            "for %s: %s",
+            loc->path, strerror(-ret));
+        goto out;
+    }
+
+    if (rsp_dict == NULL)
+        goto out;
+
+    /* start from "both present" and clear whatever turns out to be missing */
+    *contri_set = _gf_true;
+    *size_set = _gf_true;
+    if (loc->inode->ia_type == IA_IFDIR) {
+        ret = quota_dict_get_inode_meta(rsp_dict, size_key, strlen(size_key),
+                                        &meta);
+        if (ret < 0 || meta.dir_count == 0)
+            *size_set = _gf_false;
+    }
+
+    if (!loc_is_root(loc)) {
+        ret = quota_dict_get_inode_meta(rsp_dict, contri_key,
+                                        strlen(contri_key), &meta);
+        if (ret < 0)
+            *contri_set = _gf_false;
+    }
+
+    /* a missing xattr is reported through the flags, not as an error */
+    ret = 0;
+out:
+    if (dict)
+        dict_unref(dict);
+
+    if (rsp_dict)
+        dict_unref(rsp_dict);
+
+    return ret;
+}
+
+/*
+ * Create the quota size xattr on the directory at @loc by sending an
+ * all-zero size record through GF_XATTROP_ADD_ARRAY64_WITH_DEFAULT.
+ * Anything that is not a directory is accepted and left untouched.
+ * @ctx is not used here.
+ *
+ * Returns 0 for non-directories, the xattrop result otherwise, or a
+ * negative value when validation, allocation or dict setup fails.
+ */
+int32_t
+mq_create_size_xattrs(xlator_t *this, quota_inode_ctx_t *ctx, loc_t *loc)
+{
+    dict_t *xattr_req = NULL;
+    quota_meta_t zero_size = {
+        0,
+    };
+    int32_t ret = -1;
+
+    GF_VALIDATE_OR_GOTO("marker", loc, out);
+    GF_VALIDATE_OR_GOTO("marker", loc->inode, out);
+
+    /* only directories carry a size xattr */
+    if (loc->inode->ia_type != IA_IFDIR) {
+        ret = 0;
+        goto out;
+    }
+
+    xattr_req = dict_new();
+    if (xattr_req == NULL) {
+        gf_log(this->name, GF_LOG_ERROR, "dict_new failed");
+        ret = -1;
+        goto out;
+    }
+
+    ret = quota_dict_set_size_meta(this, xattr_req, &zero_size);
+    if (ret >= 0) {
+        ret = syncop_xattrop(FIRST_CHILD(this), loc,
+                             GF_XATTROP_ADD_ARRAY64_WITH_DEFAULT, xattr_req,
+                             NULL, NULL, NULL);
+        if (ret < 0) {
+            gf_log_callingfn(
+                this->name,
+                (-ret == ENOENT || -ret == ESTALE) ? GF_LOG_DEBUG
+                                                   : GF_LOG_ERROR,
+                "xattrop failed "
+                "for %s: %s",
+                loc->path, strerror(-ret));
+        }
+    }
+
+out:
+    if (xattr_req != NULL)
+        dict_unref(xattr_req);
+
+    return ret;
+}
+
+/*
+ * Issue a blocking (F_SETLKW) inodelk of type @l_type on @loc in the
+ * domain named after this translator. The lock request uses start 0 and
+ * length 0 relative to SEEK_SET.
+ *
+ * Returns the syncop_inodelk() result, or -1 when @loc is not usable.
+ */
+int32_t
+mq_lock(xlator_t *this, loc_t *loc, short l_type)
+{
+    int32_t ret = -1;
+    struct gf_flock lk = {
+        0,
+    };
+
+    GF_VALIDATE_OR_GOTO("marker", loc, out);
+    GF_VALIDATE_OR_GOTO("marker", loc->inode, out);
+
+    gf_log(this->name, GF_LOG_DEBUG, "set lock type %d on %s", l_type,
+           loc->path);
+
+    lk.l_whence = SEEK_SET;
+    lk.l_type = l_type;
+    lk.l_start = 0;
+    lk.l_len = 0;
+
+    ret = syncop_inodelk(FIRST_CHILD(this), this->name, loc, F_SETLKW, &lk,
+                         NULL, NULL);
+    if (ret >= 0)
+        goto out;
+
+    /* ENOENT/ESTALE are logged at debug level only */
+    gf_log_callingfn(
+        this->name,
+        (-ret == ENOENT || -ret == ESTALE) ? GF_LOG_DEBUG : GF_LOG_ERROR,
+        "inodelk failed "
+        "for %s: %s",
+        loc->path, strerror(-ret));
+
+out:
+    return ret;
+}
+
+/*
+ * Read the quota dirty flag of @loc through a lookup on the first child.
+ *
+ * On success *dirty receives the flag value and the non-negative
+ * dict_get_int8() result is returned. On failure *dirty is left untouched
+ * and the return value is negative (-1 for an allocation failure, otherwise
+ * the failing call's result).
+ */
+int32_t
+mq_get_dirty(xlator_t *this, loc_t *loc, int32_t *dirty)
+{
+    int32_t ret = -1;
+    int8_t value = 0;
+    dict_t *dict = NULL;
+    dict_t *rsp_dict = NULL;
+    struct iatt stbuf = {
+        0,
+    };
+
+    dict = dict_new();
+    if (dict == NULL) {
+        gf_log(this->name, GF_LOG_ERROR, "dict_new failed");
+        goto out;
+    }
+
+    /* request the dirty xattr; the value stored here is a placeholder.
+     * NOTE(review): the request uses an int64 while the reply below is
+     * read as an int8 - presumably only the key matters in the request;
+     * confirm.
+     */
+    ret = dict_set_int64(dict, QUOTA_DIRTY_KEY, 0);
+    if (ret < 0) {
+        gf_log(this->name, GF_LOG_WARNING, "dict set failed");
+        goto out;
+    }
+
+    ret = syncop_lookup(FIRST_CHILD(this), loc, &stbuf, NULL, dict, &rsp_dict);
+    if (ret < 0) {
+        /* ENOENT/ESTALE are logged at debug level only */
+        gf_log_callingfn(
+            this->name,
+            (-ret == ENOENT || -ret == ESTALE) ? GF_LOG_DEBUG : GF_LOG_ERROR,
+            "lookup failed "
+            "for %s: %s",
+            loc->path, strerror(-ret));
+        goto out;
+    }
+
+    ret = dict_get_int8(rsp_dict, QUOTA_DIRTY_KEY, &value);
+    if (ret < 0)
+        goto out;
+
+    *dirty = value;
+
+out:
+    if (dict)
+        dict_unref(dict);
+
+    if (rsp_dict)
+        dict_unref(rsp_dict);
+
+    return ret;
+}
+
+/*
+ * Set the quota dirty xattr of @loc to @dirty with a single
+ * GF_XATTROP_GET_AND_SET xattrop and report the value it had before.
+ *
+ * *prev_dirty is set to the previous on-disk value, or to 0 when the reply
+ * does not carry one. After a successful xattrop the in-memory ctx->dirty
+ * is updated to @dirty under the ctx lock.
+ *
+ * Returns 0 on success, a negative value otherwise; *prev_dirty is only
+ * written once the xattrop has succeeded.
+ */
+int32_t
+mq_get_set_dirty(xlator_t *this, loc_t *loc, int32_t dirty, int32_t *prev_dirty)
+{
+    int32_t ret = -1;
+    int8_t value = 0;
+    quota_inode_ctx_t *ctx = NULL;
+    dict_t *dict = NULL;
+    dict_t *rsp_dict = NULL;
+
+    GF_VALIDATE_OR_GOTO("marker", loc, out);
+    GF_VALIDATE_OR_GOTO("marker", loc->inode, out);
+    GF_VALIDATE_OR_GOTO("marker", prev_dirty, out);
+
+    ret = mq_inode_ctx_get(loc->inode, this, &ctx);
+    if (ret < 0) {
+        gf_log(this->name, GF_LOG_ERROR,
+               "failed to get inode ctx for "
+               "%s",
+               loc->path);
+        goto out;
+    }
+
+    dict = dict_new();
+    if (!dict) {
+        gf_log(this->name, GF_LOG_ERROR, "dict_new failed");
+        ret = -1;
+        goto out;
+    }
+
+    ret = dict_set_int8(dict, QUOTA_DIRTY_KEY, dirty);
+    if (ret < 0) {
+        gf_log(this->name, GF_LOG_ERROR, "dict_set failed");
+        goto out;
+    }
+
+    ret = syncop_xattrop(FIRST_CHILD(this), loc, GF_XATTROP_GET_AND_SET, dict,
+                         NULL, NULL, &rsp_dict);
+    if (ret < 0) {
+        /* ENOENT/ESTALE are logged at debug level only */
+        gf_log_callingfn(
+            this->name,
+            (-ret == ENOENT || -ret == ESTALE) ? GF_LOG_DEBUG : GF_LOG_ERROR,
+            "xattrop failed "
+            "for %s: %s",
+            loc->path, strerror(-ret));
+        goto out;
+    }
+
+    /* a reply without the key is treated as "was not dirty" */
+    *prev_dirty = 0;
+    if (rsp_dict) {
+        ret = dict_get_int8(rsp_dict, QUOTA_DIRTY_KEY, &value);
+        if (ret == 0)
+            *prev_dirty = value;
+    }
+
+    /* mirror the new on-disk value in the in-memory inode ctx */
+    LOCK(&ctx->lock);
+    {
+        ctx->dirty = dirty;
+    }
+    UNLOCK(&ctx->lock);
+    /* a failed dict_get_int8() above is not an error for the caller */
+    ret = 0;
+out:
+    if (dict)
+        dict_unref(dict);
+
+    if (rsp_dict)
+        dict_unref(rsp_dict);
+
+    return ret;
+}
+
+/*
+ * Write @dirty to the quota dirty xattr of @loc with a setxattr and, once
+ * that succeeded, mirror the value in the in-memory inode ctx.
+ *
+ * Returns the setxattr result on the normal path and a negative value when
+ * validation, allocation or dict setup fails. A missing inode ctx is logged
+ * but reported as success (0) without touching the xattr.
+ */
+int32_t
+mq_mark_dirty(xlator_t *this, loc_t *loc, int32_t dirty)
+{
+    int32_t ret = -1;
+    dict_t *dict = NULL;
+    quota_inode_ctx_t *ctx = NULL;
+
+    GF_VALIDATE_OR_GOTO("marker", loc, out);
+    GF_VALIDATE_OR_GOTO("marker", loc->inode, out);
+
+    ret = mq_inode_ctx_get(loc->inode, this, &ctx);
+    if (ret < 0) {
+        gf_log(this->name, GF_LOG_ERROR,
+               "failed to get inode ctx for "
+               "%s",
+               loc->path);
+        /* NOTE(review): the failure is swallowed here - looks deliberate,
+         * confirm with callers
+         */
+        ret = 0;
+        goto out;
+    }
+
+    dict = dict_new();
+    if (!dict) {
+        ret = -1;
+        gf_log(this->name, GF_LOG_ERROR, "dict_new failed");
+        goto out;
+    }
+
+    ret = dict_set_int8(dict, QUOTA_DIRTY_KEY, dirty);
+    if (ret < 0) {
+        gf_log(this->name, GF_LOG_ERROR, "dict_set failed");
+        goto out;
+    }
+
+    ret = syncop_setxattr(FIRST_CHILD(this), loc, dict, 0, NULL, NULL);
+    if (ret < 0) {
+        /* ENOENT/ESTALE are logged at debug level only */
+        gf_log_callingfn(
+            this->name,
+            (-ret == ENOENT || -ret == ESTALE) ? GF_LOG_DEBUG : GF_LOG_ERROR,
+            "setxattr dirty = %d "
+            "failed for %s: %s",
+            dirty, loc->path, strerror(-ret));
+        goto out;
+    }
+
+    /* keep the cached flag in step with what is now on disk */
+    LOCK(&ctx->lock);
+    {
+        ctx->dirty = dirty;
+    }
+    UNLOCK(&ctx->lock);
+
+out:
+    if (dict)
+        dict_unref(dict);
+
+    return ret;
+}
+
+/*
+ * Fetch the quota size and/or contribution records of @loc with one lookup
+ * on the first child.
+ *
+ * @size   (optional) receives the size record. For a directory it is read
+ *         from the size xattr; for anything else it is derived from the
+ *         lookup stat as ia_blocks * 512 with file_count 1, dir_count 0.
+ * @contri (optional) receives the contribution towards the parent
+ *         identified by @contri_gfid; it is zeroed when the xattr cannot be
+ *         read. Left untouched for the root.
+ *
+ * At least one of @size / @contri must be non-NULL, otherwise -1 is
+ * returned. Returns 0 on success and a negative value on failure.
+ */
+int32_t
+_mq_get_metadata(xlator_t *this, loc_t *loc, quota_meta_t *contri,
+                 quota_meta_t *size, uuid_t contri_gfid)
+{
+    int32_t ret = -1;
+    quota_meta_t meta = {
+        0,
+    };
+    char contri_key[QUOTA_KEY_MAX] = {
+        0,
+    };
+    char size_key[QUOTA_KEY_MAX] = {
+        0,
+    };
+    int keylen = 0;
+    dict_t *dict = NULL;
+    dict_t *rsp_dict = NULL;
+    struct iatt stbuf = {
+        0,
+    };
+
+    GF_VALIDATE_OR_GOTO("marker", loc, out);
+    GF_VALIDATE_OR_GOTO("marker", loc->inode, out);
+
+    if (size == NULL && contri == NULL)
+        goto out;
+
+    dict = dict_new();
+    if (dict == NULL) {
+        gf_log(this->name, GF_LOG_ERROR, "dict_new failed");
+        goto out;
+    }
+
+    /* the size xattr is requested for directories only */
+    if (size && loc->inode->ia_type == IA_IFDIR) {
+        GET_SIZE_KEY(this, size_key, keylen);
+        if (keylen < 0)
+            goto out;
+        ret = dict_set_int64(dict, size_key, 0);
+        if (ret < 0) {
+            gf_log(this->name, GF_LOG_ERROR, "dict_set failed.");
+            goto out;
+        }
+    }
+
+    /* the contribution xattr is requested for every inode except the root */
+    if (contri && !loc_is_root(loc)) {
+        ret = mq_dict_set_contribution(this, dict, loc, contri_gfid,
+                                       contri_key);
+        if (ret < 0)
+            goto out;
+    }
+
+    ret = syncop_lookup(FIRST_CHILD(this), loc, &stbuf, NULL, dict, &rsp_dict);
+    if (ret < 0) {
+        /* ENOENT/ESTALE are logged at debug level only */
+        gf_log_callingfn(
+            this->name,
+            (-ret == ENOENT || -ret == ESTALE) ? GF_LOG_DEBUG : GF_LOG_ERROR,
+            "lookup failed "
+            "for %s: %s",
+            loc->path, strerror(-ret));
+        goto out;
+    }
+
+    if (size) {
+        if (loc->inode->ia_type == IA_IFDIR) {
+            /* a directory without a readable size record is an error */
+            ret = quota_dict_get_meta(rsp_dict, size_key, keylen, &meta);
+            if (ret < 0) {
+                gf_log(this->name, GF_LOG_ERROR, "dict_get failed.");
+                goto out;
+            }
+
+            size->size = meta.size;
+            size->file_count = meta.file_count;
+            size->dir_count = meta.dir_count;
+        } else {
+            /* non-directories: size comes from the block count in stbuf */
+            size->size = stbuf.ia_blocks * 512;
+            size->file_count = 1;
+            size->dir_count = 0;
+        }
+    }
+
+    if (contri && !loc_is_root(loc)) {
+        ret = quota_dict_get_meta(rsp_dict, contri_key, strlen(contri_key),
+                                  &meta);
+        if (ret < 0) {
+            /* a missing contribution is reported as all zeroes, not as
+             * a failure
+             */
+            contri->size = 0;
+            contri->file_count = 0;
+            contri->dir_count = 0;
+        } else {
+            contri->size = meta.size;
+            contri->file_count = meta.file_count;
+            contri->dir_count = meta.dir_count;
+        }
+    }
+
+    ret = 0;
+
+out:
+    if (dict)
+        dict_unref(dict);
+
+    if (rsp_dict)
+        dict_unref(rsp_dict);
+
+    return ret;
+}
+
+/*
+ * Fetch the size and/or contribution records of @loc via _mq_get_metadata()
+ * and cache what was fetched: @size goes into @ctx, @contri into
+ * @contribution, each under its own lock.
+ *
+ * Returns 0 when nothing was requested (both @size and @contri NULL) or on
+ * success, a negative value otherwise.
+ */
+int32_t
+mq_get_metadata(xlator_t *this, loc_t *loc, quota_meta_t *contri,
+                quota_meta_t *size, quota_inode_ctx_t *ctx,
+                inode_contribution_t *contribution)
+{
+    int32_t ret = -1;
+
+    GF_VALIDATE_OR_GOTO("marker", loc, out);
+    GF_VALIDATE_OR_GOTO("marker", loc->inode, out);
+    GF_VALIDATE_OR_GOTO("marker", ctx, out);
+    GF_VALIDATE_OR_GOTO("marker", contribution, out);
+
+    /* nothing requested, nothing to do */
+    if (!size && !contri) {
+        ret = 0;
+        goto out;
+    }
+
+    ret = _mq_get_metadata(this, loc, contri, size, contribution->gfid);
+    if (ret < 0)
+        goto out;
+
+    if (size != NULL) {
+        LOCK(&ctx->lock);
+        {
+            ctx->dir_count = size->dir_count;
+            ctx->file_count = size->file_count;
+            ctx->size = size->size;
+        }
+        UNLOCK(&ctx->lock);
+    }
+
+    if (contri != NULL) {
+        LOCK(&contribution->lock);
+        {
+            contribution->dir_count = contri->dir_count;
+            contribution->file_count = contri->file_count;
+            contribution->contribution = contri->size;
+        }
+        UNLOCK(&contribution->lock);
+    }
+
+out:
+    return ret;
+}
+
+/*
+ * Compute how far the contribution of @loc lags behind its size:
+ * delta = size - contribution, using freshly fetched metadata.
+ * @delta is written only when the metadata fetch succeeds.
+ *
+ * Returns the mq_get_metadata() result, or -1 on invalid arguments.
+ */
+int32_t
+mq_get_delta(xlator_t *this, loc_t *loc, quota_meta_t *delta,
+             quota_inode_ctx_t *ctx, inode_contribution_t *contribution)
+{
+    quota_meta_t cur_contri = {
+        0,
+    };
+    quota_meta_t cur_size = {
+        0,
+    };
+    int32_t ret = -1;
+
+    GF_VALIDATE_OR_GOTO("marker", loc, out);
+    GF_VALIDATE_OR_GOTO("marker", loc->inode, out);
+    GF_VALIDATE_OR_GOTO("marker", ctx, out);
+    GF_VALIDATE_OR_GOTO("marker", contribution, out);
+
+    ret = mq_get_metadata(this, loc, &cur_contri, &cur_size, ctx,
+                          contribution);
+    if (ret >= 0)
+        mq_compute_delta(delta, &cur_size, &cur_contri);
+
+out:
+    return ret;
+}
+
+/*
+ * Remove the contribution xattr that @loc holds for the parent identified
+ * by contri->gfid, then apply @delta to the in-memory contribution node.
+ *
+ * When @nlink is 1 the removexattr is skipped and only the in-memory node
+ * is updated. ENOENT, ESTALE, ENODATA and ENOATTR from removexattr are
+ * treated as success.
+ *
+ * QUOTA_FREE_CONTRIBUTION_NODE(ctx, contri) is invoked on every path,
+ * success or failure.
+ * NOTE(review): presumably that detaches the node from @ctx and drops a
+ * reference - confirm against the macro definition.
+ *
+ * Returns 0 on success, a negative value otherwise.
+ */
+int32_t
+mq_remove_contri(xlator_t *this, loc_t *loc, quota_inode_ctx_t *ctx,
+                 inode_contribution_t *contri, quota_meta_t *delta,
+                 uint32_t nlink)
+{
+    int32_t ret = -1;
+    char contri_key[QUOTA_KEY_MAX] = {
+        0,
+    };
+
+    if (nlink == 1) {
+        /* File was the last link and has been deleted */
+        ret = 0;
+        goto done;
+    }
+
+    GET_CONTRI_KEY(this, contri_key, contri->gfid, ret);
+    if (ret < 0) {
+        gf_log(this->name, GF_LOG_ERROR,
+               "get contri_key "
+               "failed for %s",
+               uuid_utoa(contri->gfid));
+        goto out;
+    }
+
+    ret = syncop_removexattr(FIRST_CHILD(this), loc, contri_key, 0, NULL);
+    if (ret < 0) {
+        if (-ret == ENOENT || -ret == ESTALE || -ret == ENODATA ||
+            -ret == ENOATTR) {
+            /* The contribution is still removed at "done" when an unlink
+             * operation was performed, so return success on
+             * ENOENT/ESTALE. A rename operation removes the xattr
+             * earlier, so return success on ENODATA as well.
+             */
+            ret = 0;
+        } else {
+            gf_log_callingfn(this->name, GF_LOG_ERROR,
+                             "removexattr %s failed for %s: %s", contri_key,
+                             loc->path, strerror(-ret));
+            goto out;
+        }
+    }
+
+done:
+    /* apply the delta to the cached contribution counters */
+    LOCK(&contri->lock);
+    {
+        contri->contribution += delta->size;
+        contri->file_count += delta->file_count;
+        contri->dir_count += delta->dir_count;
+    }
+    UNLOCK(&contri->lock);
+
+    ret = 0;
+
+out:
+    QUOTA_FREE_CONTRIBUTION_NODE(ctx, contri);
+
+    return ret;
+}
+
+/*
+ * Add @delta to the contribution xattr that @loc holds for the parent
+ * identified by contri->gfid (GF_XATTROP_ADD_ARRAY64) and, once the xattrop
+ * succeeded, to the in-memory contribution node as well.
+ *
+ * A null delta (per quota_meta_is_null()) is a no-op that returns 0.
+ * Returns the xattrop result on the normal path, a negative value on
+ * failure.
+ */
+int32_t
+mq_update_contri(xlator_t *this, loc_t *loc, inode_contribution_t *contri,
+                 quota_meta_t *delta)
+{
+    dict_t *xattr = NULL;
+    char key[QUOTA_KEY_MAX] = {
+        0,
+    };
+    int32_t ret = -1;
+
+    GF_VALIDATE_OR_GOTO("marker", loc, out);
+    GF_VALIDATE_OR_GOTO("marker", loc->inode, out);
+    GF_VALIDATE_OR_GOTO("marker", delta, out);
+    GF_VALIDATE_OR_GOTO("marker", contri, out);
+
+    /* nothing to add */
+    if (quota_meta_is_null(delta)) {
+        ret = 0;
+        goto out;
+    }
+
+    xattr = dict_new();
+    if (xattr == NULL) {
+        gf_log(this->name, GF_LOG_ERROR, "dict_new failed");
+        ret = -1;
+        goto out;
+    }
+
+    GET_CONTRI_KEY(this, key, contri->gfid, ret);
+    if (ret < 0) {
+        gf_log(this->name, GF_LOG_ERROR,
+               "get contri_key "
+               "failed for %s",
+               uuid_utoa(contri->gfid));
+        goto out;
+    }
+
+    ret = quota_dict_set_meta(xattr, key, delta, loc->inode->ia_type);
+    if (ret < 0)
+        goto out;
+
+    ret = syncop_xattrop(FIRST_CHILD(this), loc, GF_XATTROP_ADD_ARRAY64, xattr,
+                         NULL, NULL, NULL);
+    if (ret < 0) {
+        /* ENOENT/ESTALE are logged at debug level only */
+        gf_log_callingfn(
+            this->name,
+            (-ret == ENOENT || -ret == ESTALE) ? GF_LOG_DEBUG : GF_LOG_ERROR,
+            "xattrop failed "
+            "for %s: %s",
+            loc->path, strerror(-ret));
+        goto out;
+    }
+
+    /* keep the cached counters in step with the xattr */
+    LOCK(&contri->lock);
+    {
+        contri->dir_count += delta->dir_count;
+        contri->file_count += delta->file_count;
+        contri->contribution += delta->size;
+    }
+    UNLOCK(&contri->lock);
+
+out:
+    if (xattr != NULL)
+        dict_unref(xattr);
+
+    return ret;
+}
+
+/*
+ * Add @delta to the size xattr of @loc
+ * (GF_XATTROP_ADD_ARRAY64_WITH_DEFAULT) and, once the xattrop succeeded,
+ * to the counters cached in the inode ctx.
+ *
+ * When the cached dir_count is still 0, one extra is added on top of
+ * delta->dir_count.
+ * NOTE(review): presumably this mirrors the dir_count default of 1 that
+ * the xattrop applies when the xattr is first created - confirm.
+ *
+ * A null delta (per quota_meta_is_null()) is a no-op that returns 0.
+ * Returns the xattrop result on the normal path, a negative value on
+ * failure.
+ */
+int32_t
+mq_update_size(xlator_t *this, loc_t *loc, quota_meta_t *delta)
+{
+    dict_t *xattr = NULL;
+    quota_inode_ctx_t *ctx = NULL;
+    int32_t ret = -1;
+    int first_dir_bump = 0;
+
+    GF_VALIDATE_OR_GOTO("marker", loc, out);
+    GF_VALIDATE_OR_GOTO("marker", loc->inode, out);
+    GF_VALIDATE_OR_GOTO("marker", delta, out);
+
+    /* nothing to add */
+    if (quota_meta_is_null(delta)) {
+        ret = 0;
+        goto out;
+    }
+
+    ret = mq_inode_ctx_get(loc->inode, this, &ctx);
+    if (ret < 0) {
+        gf_log(this->name, GF_LOG_ERROR,
+               "failed to get inode ctx for "
+               "%s",
+               loc->path);
+        goto out;
+    }
+
+    xattr = dict_new();
+    if (xattr == NULL) {
+        gf_log(this->name, GF_LOG_ERROR, "dict_new failed");
+        ret = -1;
+        goto out;
+    }
+
+    ret = quota_dict_set_size_meta(this, xattr, delta);
+    if (ret < 0)
+        goto out;
+
+    ret = syncop_xattrop(FIRST_CHILD(this), loc,
+                         GF_XATTROP_ADD_ARRAY64_WITH_DEFAULT, xattr, NULL, NULL,
+                         NULL);
+    if (ret < 0) {
+        /* ENOENT/ESTALE are logged at debug level only */
+        gf_log_callingfn(
+            this->name,
+            (-ret == ENOENT || -ret == ESTALE) ? GF_LOG_DEBUG : GF_LOG_ERROR,
+            "xattrop failed "
+            "for %s: %s",
+            loc->path, strerror(-ret));
+        goto out;
+    }
+
+    LOCK(&ctx->lock);
+    {
+        first_dir_bump = (ctx->dir_count == 0) ? 1 : 0;
+
+        ctx->size += delta->size;
+        ctx->file_count += delta->file_count;
+        ctx->dir_count += delta->dir_count + first_dir_bump;
+    }
+    UNLOCK(&ctx->lock);
+
+out:
+    if (xattr != NULL)
+        dict_unref(xattr);
+
+    return ret;
+}
+
+/*
+ * Completion handler for quota synctasks: wipes the loc held in the task
+ * arguments, resumes the attached stub if there is one, and frees the
+ * arguments unless they are flagged as static. @ret and @frame are not
+ * used. Always returns 0.
+ */
+int
+mq_synctask_cleanup(int ret, call_frame_t *frame, void *opaque)
+{
+    quota_synctask_t *task_args = (quota_synctask_t *)opaque;
+
+    GF_ASSERT(opaque);
+
+    loc_wipe(&task_args->loc);
+
+    if (task_args->stub != NULL)
+        call_resume(task_args->stub);
+
+    /* static arguments are owned by the caller, not by this handler */
+    if (task_args->is_static == _gf_false)
+        GF_FREE(task_args);
+
+    return 0;
+}
+
+/*
+ * Run @task on a copy of @loc, either as a newly spawned synctask
+ * (@spawn true) or inline in the calling context (@spawn false).
+ *
+ * @contri  optional contribution handed to the task; when NULL every field
+ *          of the task's contri is set to -1.
+ * @nlink   link count handed to the task as ia_nlink.
+ * @stub    optional call stub. Once passed in, this function resumes it on
+ *          every path: through mq_synctask_cleanup() after the task ran or
+ *          failed to spawn, and directly when the task arguments cannot be
+ *          allocated, so callers must not resume it again.
+ *
+ * In spawn mode the arguments are heap allocated and freed by
+ * mq_synctask_cleanup(); in inline mode they live on this stack frame.
+ *
+ * Returns the synctask_new1() result in spawn mode, the task's return value
+ * in inline mode, or the value set by QUOTA_ALLOC_OR_GOTO on allocation
+ * failure.
+ */
+int
+mq_synctask1(xlator_t *this, synctask_fn_t task, gf_boolean_t spawn, loc_t *loc,
+             quota_meta_t *contri, uint32_t nlink, call_stub_t *stub)
+{
+    int32_t ret = -1;
+    quota_synctask_t *args = NULL;
+    quota_synctask_t static_args = {
+        0,
+    };
+
+    if (spawn) {
+        QUOTA_ALLOC_OR_GOTO(args, quota_synctask_t, ret, out);
+        args->is_static = _gf_false;
+    } else {
+        args = &static_args;
+        args->is_static = _gf_true;
+    }
+
+    args->this = this;
+    args->stub = stub;
+    loc_copy(&args->loc, loc);
+    args->ia_nlink = nlink;
+
+    if (contri) {
+        args->contri = *contri;
+    } else {
+        /* -1 in every field marks "no contribution supplied" */
+        args->contri.size = -1;
+        args->contri.file_count = -1;
+        args->contri.dir_count = -1;
+    }
+
+    if (spawn) {
+        ret = synctask_new1(this->ctx->env, 1024 * 16, task,
+                            mq_synctask_cleanup, NULL, args);
+        if (ret) {
+            gf_log(this->name, GF_LOG_ERROR,
+                   "Failed to spawn "
+                   "new synctask");
+            /* the task never ran, so clean up (and resume the stub)
+             * on its behalf
+             */
+            mq_synctask_cleanup(ret, NULL, args);
+        }
+    } else {
+        ret = task(args);
+        mq_synctask_cleanup(ret, NULL, args);
+    }
+
+out:
+    /* args is NULL only when the allocation above failed. In that case
+     * mq_synctask_cleanup() never runs, so resume the stub here; otherwise
+     * the operation waiting on it would never continue.
+     */
+    if (args == NULL && stub != NULL)
+        call_resume(stub);
+
+    return ret;
+}
+
+/*
+ * Convenience wrapper around mq_synctask1() for tasks that need neither a
+ * contribution nor a stub; nlink is passed as -1.
+ */
+int
+mq_synctask(xlator_t *this, synctask_fn_t task, gf_boolean_t spawn, loc_t *loc)
+{
+    int status = mq_synctask1(this, task, spawn, loc, NULL, -1, NULL);
+
+    return status;
+}
+
+/*
+ * Common pre-flight check for quota transactions.
+ *
+ * Rejects (returns -1) when @buf describes a DHT link file or anything
+ * other than a regular file, symlink or directory, and when @origin_loc has
+ * no inode or a null inode gfid. Otherwise copies @origin_loc into @loc,
+ * fills in a missing gfid and parent, and looks up the inode ctx, which is
+ * handed back through @ctx when @ctx is non-NULL.
+ *
+ * Returns 0 on success, a negative value otherwise. @loc may already hold
+ * a copy when a later step fails.
+ */
+int32_t
+mq_prevalidate_txn(xlator_t *this, loc_t *origin_loc, loc_t *loc,
+                   quota_inode_ctx_t **ctx, struct iatt *buf)
+{
+    quota_inode_ctx_t *found_ctx = NULL;
+    int32_t ret = -1;
+
+    if (buf != NULL) {
+        switch (buf->ia_type) {
+            case IA_IFREG:
+                /* DHT link files take no part in quota accounting */
+                if (IS_DHT_LINKFILE_MODE(buf))
+                    goto out;
+                break;
+            case IA_IFLNK:
+            case IA_IFDIR:
+                break;
+            default:
+                goto out;
+        }
+    }
+
+    if (origin_loc == NULL || origin_loc->inode == NULL)
+        goto out;
+
+    if (gf_uuid_is_null(origin_loc->inode->gfid))
+        goto out;
+
+    loc_copy(loc, origin_loc);
+
+    if (gf_uuid_is_null(loc->gfid))
+        gf_uuid_copy(loc->gfid, loc->inode->gfid);
+
+    if (!loc_is_root(loc) && loc->parent == NULL)
+        loc->parent = inode_parent(loc->inode, 0, NULL);
+
+    ret = mq_inode_ctx_get(loc->inode, this, &found_ctx);
+    if (ret < 0) {
+        gf_log_callingfn(this->name, GF_LOG_WARNING,
+                         "inode ctx for "
+                         "is NULL for %s",
+                         loc->path);
+        goto out;
+    }
+
+    if (ctx != NULL)
+        *ctx = found_ctx;
+
+    ret = 0;
+out:
+    return ret;
+}
+
+/*
+ * Synctask body that creates the missing quota xattrs for args->loc.
+ *
+ * Directories are write-locked for the duration of the check (files need no
+ * lock). If the contribution and size xattrs are both present there is
+ * nothing to do. Otherwise the create-status flag of the inode ctx is
+ * cleared, the size xattr is created for a directory that lacks it, and a
+ * blocking quota update transaction is started after the lock is released.
+ *
+ * The create-status flag is also cleared on every early-exit path, provided
+ * an inode ctx was obtained.
+ *
+ * Returns the result of the last step executed: the blocking txn, the
+ * unlock, or the step that failed when neither ran.
+ */
+int
+mq_create_xattrs_task(void *opaque)
+{
+    int32_t ret = -1;
+    gf_boolean_t locked = _gf_false;
+    gf_boolean_t contri_set = _gf_false;
+    gf_boolean_t size_set = _gf_false;
+    gf_boolean_t need_txn = _gf_false;
+    quota_synctask_t *args = NULL;
+    quota_inode_ctx_t *ctx = NULL;
+    xlator_t *this = NULL;
+    loc_t *loc = NULL;
+    gf_boolean_t status = _gf_false;
+
+    GF_ASSERT(opaque);
+
+    args = (quota_synctask_t *)opaque;
+    loc = &args->loc;
+    this = args->this;
+    THIS = this;
+
+    ret = mq_inode_ctx_get(loc->inode, this, &ctx);
+    if (ret < 0) {
+        gf_log(this->name, GF_LOG_WARNING,
+               "Failed to "
+               "get inode ctx, aborting quota create txn");
+        goto out;
+    }
+
+    if (loc->inode->ia_type == IA_IFDIR) {
+        /* lock not required for files */
+        ret = mq_lock(this, loc, F_WRLCK);
+        if (ret < 0)
+            goto out;
+        locked = _gf_true;
+    }
+
+    ret = mq_are_xattrs_set(this, loc, &contri_set, &size_set);
+    if (ret < 0 || (contri_set && size_set))
+        goto out;
+
+    /* status records that the create-status flag has been cleared, so
+     * the cleanup path does not clear it a second time
+     */
+    mq_set_ctx_create_status(ctx, _gf_false);
+    status = _gf_true;
+
+    if (loc->inode->ia_type == IA_IFDIR && size_set == _gf_false) {
+        ret = mq_create_size_xattrs(this, ctx, loc);
+        if (ret < 0)
+            goto out;
+    }
+
+    need_txn = _gf_true;
+out:
+    if (locked)
+        ret = mq_lock(this, loc, F_UNLCK);
+
+    /* ctx is still NULL when the inode ctx lookup above failed; there is
+     * no flag to clear in that case
+     */
+    if (status == _gf_false && ctx != NULL)
+        mq_set_ctx_create_status(ctx, _gf_false);
+
+    if (need_txn)
+        ret = mq_initiate_quota_blocking_txn(this, loc, NULL);
+
+    return ret;
+}
+
+/*
+ * Start the "create quota xattrs" transaction for @origin_loc.
+ *
+ * After the common pre-validation the create-status flag of the inode ctx
+ * is tested and set; when it was already set nothing further is done. For
+ * a non-root inode with a known parent a contribution node is added to the
+ * ctx first. mq_create_xattrs_task() is then run through mq_synctask(),
+ * spawned or inline according to @spawn.
+ *
+ * If this function set the create-status flag and then failed, the flag is
+ * cleared again before returning.
+ *
+ * Returns 0 or the synctask result on success, a negative value on failure.
+ */
+static int
+_mq_create_xattrs_txn(xlator_t *this, loc_t *origin_loc, struct iatt *buf,
+                      gf_boolean_t spawn)
+{
+    loc_t loc = {
+        0,
+    };
+    inode_contribution_t *contribution = NULL;
+    quota_inode_ctx_t *ctx = NULL;
+    gf_boolean_t status = _gf_true;
+    int32_t ret = -1;
+
+    ret = mq_prevalidate_txn(this, origin_loc, &loc, &ctx, buf);
+    if (ret < 0)
+        goto out;
+
+    /* status receives the previous flag value */
+    ret = mq_test_and_set_ctx_create_status(ctx, &status);
+    if (ret < 0 || status == _gf_true)
+        goto out;
+
+    if (!loc_is_root(&loc) && loc.parent) {
+        contribution = mq_add_new_contribution_node(this, ctx, &loc);
+        if (contribution == NULL) {
+            gf_log(this->name, GF_LOG_WARNING,
+                   "cannot add a new contribution node "
+                   "(%s)",
+                   uuid_utoa(loc.gfid));
+            ret = -1;
+            goto out;
+        }
+
+        /* only the node's presence matters here; drop the reference */
+        GF_REF_PUT(contribution);
+    }
+
+    ret = mq_synctask(this, mq_create_xattrs_task, spawn, &loc);
+out:
+    if (status == _gf_false && ret < 0)
+        mq_set_ctx_create_status(ctx, _gf_false);
+
+    loc_wipe(&loc);
+    return ret;
+}
+
+/*
+ * Public entry point for the "create quota xattrs" transaction: validates
+ * @loc and runs _mq_create_xattrs_txn() as a spawned synctask.
+ *
+ * Returns -1 when @loc or its inode is NULL, otherwise the result of
+ * _mq_create_xattrs_txn().
+ */
+int
+mq_create_xattrs_txn(xlator_t *this, loc_t *loc, struct iatt *buf)
+{
+    int32_t ret = -1;
+
+    GF_VALIDATE_OR_GOTO("marker", loc, out);
+    GF_VALIDATE_OR_GOTO("marker", loc->inode, out);
+
+    return _mq_create_xattrs_txn(this, loc, buf, _gf_true);
+
+out:
+    return ret;
+}
+
+/*
+ * Synctask body that subtracts a child's contribution from its parent's
+ * size.
+ *
+ * With the parent write-locked:
+ *   1. pick the amount to subtract - args->contri when its size is
+ *      non-negative (rename), otherwise the values cached in the child's
+ *      contribution node for this parent;
+ *   2. mark the parent dirty, remembering the previous dirty value;
+ *   3. negate the amount, remove the child's contribution xattr (skipped
+ *      for rename) and add the negated amount to the parent's size.
+ *
+ * Cleanup: the dirty marker is cleared on disk only when everything
+ * succeeded and the parent was not dirty before; otherwise just the
+ * in-memory dirty status flag is reset. The parent is then unlocked and,
+ * if the last cleanup step returned a non-negative value, a blocking quota
+ * update is started on the parent.
+ *
+ * NOTE(review): ret is overwritten by the dirty handling and by the unlock,
+ * so the value returned reflects the last step executed rather than the
+ * first failure.
+ */
+int32_t
+mq_reduce_parent_size_task(void *opaque)
+{
+    int32_t ret = -1;
+    int32_t prev_dirty = 0;
+    quota_inode_ctx_t *ctx = NULL;
+    quota_inode_ctx_t *parent_ctx = NULL;
+    inode_contribution_t *contribution = NULL;
+    quota_meta_t delta = {
+        0,
+    };
+    quota_meta_t contri = {
+        0,
+    };
+    loc_t parent_loc = {
+        0,
+    };
+    gf_boolean_t locked = _gf_false;
+    gf_boolean_t dirty = _gf_false;
+    quota_synctask_t *args = NULL;
+    xlator_t *this = NULL;
+    loc_t *loc = NULL;
+    gf_boolean_t remove_xattr = _gf_true;
+    uint32_t nlink = 0;
+
+    GF_ASSERT(opaque);
+
+    args = (quota_synctask_t *)opaque;
+    loc = &args->loc;
+    contri = args->contri;
+    nlink = args->ia_nlink;
+    this = args->this;
+    THIS = this;
+
+    ret = mq_inode_loc_fill(NULL, loc->parent, &parent_loc);
+    if (ret < 0) {
+        gf_log(this->name, GF_LOG_ERROR,
+               "parent_loc fill failed for "
+               "child inode %s: ",
+               uuid_utoa(loc->inode->gfid));
+        goto out;
+    }
+
+    ret = mq_lock(this, &parent_loc, F_WRLCK);
+    if (ret < 0)
+        goto out;
+    locked = _gf_true;
+
+    if (contri.size >= 0) {
+        /* The contri parameter is supplied only for the rename operation.
+         * The remove xattr has already been performed, so removexattr
+         * must be skipped for the rename operation.
+         */
+        remove_xattr = _gf_false;
+        delta.size = contri.size;
+        delta.file_count = contri.file_count;
+        delta.dir_count = contri.dir_count;
+    } else {
+        remove_xattr = _gf_true;
+
+        ret = mq_inode_ctx_get(loc->inode, this, &ctx);
+        if (ret < 0) {
+            gf_log_callingfn(this->name, GF_LOG_WARNING,
+                             "ctx for"
+                             " the node %s is NULL",
+                             loc->path);
+            goto out;
+        }
+
+        contribution = mq_get_contribution_node(loc->parent, ctx);
+        if (contribution == NULL) {
+            ret = -1;
+            gf_log(this->name, GF_LOG_DEBUG,
+                   "contribution for the node %s is NULL", loc->path);
+            goto out;
+        }
+
+        /* snapshot the cached contribution under its lock */
+        LOCK(&contribution->lock);
+        {
+            delta.size = contribution->contribution;
+            delta.file_count = contribution->file_count;
+            delta.dir_count = contribution->dir_count;
+        }
+        UNLOCK(&contribution->lock);
+    }
+
+    ret = mq_get_set_dirty(this, &parent_loc, 1, &prev_dirty);
+    if (ret < 0)
+        goto out;
+    dirty = _gf_true;
+
+    /* with a NULL source this negates delta: the amount is subtracted */
+    mq_sub_meta(&delta, NULL);
+
+    if (remove_xattr) {
+        ret = mq_remove_contri(this, loc, ctx, contribution, &delta, nlink);
+        if (ret < 0)
+            goto out;
+    }
+
+    if (quota_meta_is_null(&delta))
+        goto out;
+
+    ret = mq_update_size(this, &parent_loc, &delta);
+    if (ret < 0)
+        goto out;
+
+out:
+    if (dirty) {
+        if (ret < 0 || prev_dirty) {
+            /* On failure clear dirty status flag.
+             * In the next lookup inspect_directory_xattr
+             * can set the status flag and fix the
+             * dirty directory.
+             * Do the same if dir was dirty before
+             * the txn
+             */
+            ret = mq_inode_ctx_get(parent_loc.inode, this, &parent_ctx);
+            if (ret == 0)
+                mq_set_ctx_dirty_status(parent_ctx, _gf_false);
+        } else {
+            ret = mq_mark_dirty(this, &parent_loc, 0);
+        }
+    }
+
+    if (locked)
+        ret = mq_lock(this, &parent_loc, F_UNLCK);
+
+    /* propagate the change further up from the parent */
+    if (ret >= 0)
+        ret = mq_initiate_quota_blocking_txn(this, &parent_loc, NULL);
+
+    loc_wipe(&parent_loc);
+
+    /* drop the reference on the contribution node obtained above */
+    if (contribution)
+        GF_REF_PUT(contribution);
+
+    return ret;
+}
+
+/*
+ * Start the transaction that subtracts the contribution of @origin_loc
+ * from its parent, as a spawned synctask running
+ * mq_reduce_parent_size_task().
+ *
+ * @contri and @nlink are forwarded to the task. @stub, when given, is
+ * resumed exactly once: here if the task is never handed over (invalid
+ * arguments, failed pre-validation, or the root, which has no parent),
+ * otherwise by mq_synctask1().
+ *
+ * Returns 0 for the root and on success; any non-zero result is logged.
+ */
+int32_t
+mq_reduce_parent_size_txn(xlator_t *this, loc_t *origin_loc,
+                          quota_meta_t *contri, uint32_t nlink,
+                          call_stub_t *stub)
+{
+    loc_t loc = {
+        0,
+    };
+    gf_boolean_t stub_is_ours = _gf_true;
+    int32_t ret = -1;
+
+    GF_VALIDATE_OR_GOTO("marker", this, out);
+    GF_VALIDATE_OR_GOTO("marker", origin_loc, out);
+
+    ret = mq_prevalidate_txn(this, origin_loc, &loc, NULL, NULL);
+    if (ret < 0)
+        goto out;
+
+    /* the root has no parent to reduce */
+    if (loc_is_root(&loc)) {
+        ret = 0;
+        goto out;
+    }
+
+    /* from here on mq_synctask1() is responsible for the stub */
+    stub_is_ours = _gf_false;
+    ret = mq_synctask1(this, mq_reduce_parent_size_task, _gf_true, &loc, contri,
+                       nlink, stub);
+out:
+    loc_wipe(&loc);
+
+    if (stub_is_ours && stub != NULL)
+        call_resume(stub);
+
+    if (ret != 0)
+        gf_log_callingfn(this ? this->name : "Marker", GF_LOG_ERROR,
+                         "mq_reduce_parent_size_txn failed");
+
+    return ret;
+}
+
+int
+mq_initiate_quota_task(void *opaque)
+{
+    int32_t ret = -1;
+    int32_t prev_dirty = 0;
+    loc_t child_loc = {
+        0,
+    };
+    loc_t parent_loc = {
+        0,
+    };
+    gf_boolean_t locked = _gf_false;
+    gf_boolean_t dirty = _gf_false;
+    gf_boolean_t status = _gf_false;
+    quota_meta_t delta = {
+        0,
+    };
+    quota_synctask_t *args = NULL;
+    xlator_t *this = NULL;
+    loc_t *loc = NULL;
+    inode_contribution_t *contri = NULL;
+    quota_inode_ctx_t *ctx = NULL;
+    quota_inode_ctx_t *parent_ctx = NULL;
+    inode_t *tmp_parent = NULL;
+
+    GF_VALIDATE_OR_GOTO("marker", opaque, out);
+
+    args = (quota_synctask_t *)opaque;
+    loc = &args->loc;
+    this = args->this;
+
+    GF_VALIDATE_OR_GOTO("marker", this, out);
+    THIS = this;
+
+    GF_VALIDATE_OR_GOTO(this->name, loc, out);
+    GF_VALIDATE_OR_GOTO(this->name, loc->inode, out);
+
+    ret = mq_loc_copy(&child_loc, loc);
+    if (ret < 0) {
+        gf_log(this->name, GF_LOG_ERROR, "loc copy failed");
+        goto out;
+    }
+
+    while (!__is_root_gfid(child_loc.gfid)) {
+        ret = mq_inode_ctx_get(child_loc.inode, this, &ctx);
+        if (ret < 0) {
+            gf_log(this->name, GF_LOG_WARNING,
+                   "inode ctx get failed for %s, "
+                   "aborting update txn",
+                   child_loc.path);
+            goto out;
+        }
+
+        /* To improve performance, abort current transaction
+         * if one is already in progress for same inode
+         */
+        if (status == _gf_true) {
+            /* status will already set before txn start,
+             * so it should not be set in first
+             * loop iteration
+             */
+            ret = mq_test_and_set_ctx_updation_status(ctx, &status);
+            if (ret < 0 || status == _gf_true)
+                goto out;
+        }
+
+        if (child_loc.parent == NULL) {
+            ret = mq_build_ancestry(this, &child_loc);
+            if (ret < 0 || child_loc.parent == NULL) {
+                /* If application performs parallel remove
+                 * operations on same set of files/directories
+                 * then we may get ENOENT/ESTALE
+                 */
+                gf_log(this->name,
+                       (-ret == ENOENT || -ret == ESTALE) ? GF_LOG_DEBUG
+                                                          : GF_LOG_ERROR,
+                       "build ancestry failed for inode %s",
+                       uuid_utoa(child_loc.inode->gfid));
+                ret = -1;
+                goto out;
+            }
+        }
+
+        ret = mq_inode_loc_fill(NULL, child_loc.parent, &parent_loc);
+        if (ret < 0) {
+            gf_log(this->name, GF_LOG_ERROR,
+                   "parent_loc fill "
+                   "failed for child inode %s: ",
+                   uuid_utoa(child_loc.inode->gfid));
+            goto out;
+        }
+
+        ret = mq_lock(this, &parent_loc, F_WRLCK);
+        if (ret < 0)
+            goto out;
+        locked = _gf_true;
+
+        mq_set_ctx_updation_status(ctx, _gf_false);
+        status = _gf_true;
+
+        /* Contribution node can be NULL in below scenarios and
+           create if needed:
+
+           Scenario 1)
+           In this case create a new contribution node
+           Suppose hard link for a file f1 present in a directory d1 is
+           created in the directory d2 (as f2). Now, since d2's
+           contribution is not there in f1's inode ctx, d2's
+           contribution xattr won't be created and will create problems
+           for quota operations.
+
+           Don't create contribution if parent has been changed after
+           taking a lock, this can happen when rename is performed
+           and writes is still in-progress for the same file
+
+           Scenario 2)
+           When a rename operation is performed, contribution node
+           for old path will be removed.
+
+           Create contribution node only if oldparent is same as
+           newparent.
+           Consider below example
+           1) rename FOP invoked on file 'x'
+           2) write is still in progress for file 'x'
+           3) rename takes a lock on old-parent
+           4) write-update txn blocked on old-parent to acquire lock
+           5) in rename_cbk, contri xattrs are removed and contribution
+              is deleted and lock is released
+           6) now write-update txn gets the lock and updates the
+              wrong parent as it was holding lock on old parent
+              so validate parent once the lock is acquired
+
+             For more information on this problem, please see
+             doc for marker_rename in file marker.c
+        */
+        contri = mq_get_contribution_node(child_loc.parent, ctx);
+        if (contri == NULL) {
+            tmp_parent = inode_parent(child_loc.inode, 0, NULL);
+            if (tmp_parent == NULL) {
+                /* This can happen if application performs
+                 * parallel remove operations on same set
+                 * of files/directories
+                 */
+                gf_log(this->name, GF_LOG_WARNING,
+                       "parent is "
+                       "NULL for inode %s",
+                       uuid_utoa(child_loc.inode->gfid));
+                ret = -1;
+                goto out;
+            }
+            if (gf_uuid_compare(tmp_parent->gfid, parent_loc.gfid)) {
+                /* abort txn if parent has changed */
+                ret = 0;
+                goto out;
+            }
+
+            inode_unref(tmp_parent);
+            tmp_parent = NULL;
+
+            contri = mq_add_new_contribution_node(this, ctx, &child_loc);
+            if (contri == NULL) {
+                gf_log(this->name, GF_LOG_ERROR,
+                       "Failed to "
+                       "create contribution node for %s, "
+                       "abort update txn",
+                       child_loc.path);
+                ret = -1;
+                goto out;
+            }
+        }
+
+        ret = mq_get_delta(this, &child_loc, &delta, ctx, contri);
+        if (ret < 0)
+            goto out;
+
+        if (quota_meta_is_null(&delta))
+            goto out;
+
+        ret = mq_get_set_dirty(this, &parent_loc, 1, &prev_dirty);
+        if (ret < 0)
+            goto out;
+        dirty = _gf_true;
+
+        ret = mq_update_contri(this, &child_loc, contri, &delta);
+        if (ret < 0)
+            goto out;
+
+        ret = mq_update_size(this, &parent_loc, &delta);
+        if (ret < 0) {
+            gf_log(this->name, GF_LOG_DEBUG,
+                   "rollback "
+                   "contri updation");
+            mq_sub_meta(&delta, NULL);
+            mq_update_contri(this, &child_loc, contri, &delta);
+            goto out;
+        }
+
+        if (prev_dirty == 0) {
+            ret = mq_mark_dirty(this, &parent_loc, 0);
+        } else {
+            ret = mq_inode_ctx_get(parent_loc.inode, this, &parent_ctx);
+            if (ret == 0)
+                mq_set_ctx_dirty_status(parent_ctx, _gf_false);
+        }
+        dirty = _gf_false;
+        prev_dirty = 0;
+
+        ret = mq_lock(this, &parent_loc, F_UNLCK);
+        locked = _gf_false;
+
+        if (__is_root_gfid(parent_loc.gfid))
+            break;
+
+        /* Repeat the above steps upwards till the root */
+        loc_wipe(&child_loc);
+        ret = mq_loc_copy(&child_loc, &parent_loc);
+        if (ret < 0)
+            goto out;
+
+        loc_wipe(&parent_loc);
+        GF_REF_PUT(contri);
+        contri = NULL;
+    }
+
+out:
+    if ((dirty) && (ret < 0)) {
+        /* On failure clear dirty status flag.
+         * In the next lookup inspect_directory_xattr
+         * can set the status flag and fix the
+         * dirty directory.
+         * Do the same if the dir was dirty before
+         * txn
+         */
+        ret = mq_inode_ctx_get(parent_loc.inode, this, &parent_ctx);
+        if (ret == 0)
+            mq_set_ctx_dirty_status(parent_ctx, _gf_false);
+    }
+
+    if (locked)
+        ret = mq_lock(this, &parent_loc, F_UNLCK);
+
+    if (ctx && status == _gf_false)
+        mq_set_ctx_updation_status(ctx, _gf_false);
+
+    loc_wipe(&child_loc);
+    loc_wipe(&parent_loc);
+
+    if (tmp_parent)
+        inode_unref(tmp_parent);
+
+    if (contri)
+        GF_REF_PUT(contri);
+
+    return 0;
+}
+
+/* Start a quota update txn for origin_loc.  Nothing is started for the
+ * root, or when the test-and-set reports _gf_true (an update is already
+ * flagged on the ctx).  `spawn` is passed through to mq_synctask.
+ * Returns the result of the last step that ran; negative on failure.
+ */
+int
+_mq_initiate_quota_txn(xlator_t *this, loc_t *origin_loc, struct iatt *buf,
+                       gf_boolean_t spawn)
+{
+    loc_t txn_loc = {0};
+    quota_inode_ctx_t *ctx = NULL;
+    gf_boolean_t in_progress = _gf_true;
+    int32_t ret = mq_prevalidate_txn(this, origin_loc, &txn_loc, &ctx, buf);
+
+    if (ret >= 0) {
+        if (loc_is_root(&txn_loc)) {
+            ret = 0;
+        } else {
+            ret = mq_test_and_set_ctx_updation_status(ctx, &in_progress);
+            /* hand the txn to the task only if no update was flagged */
+            if (ret >= 0 && in_progress != _gf_true)
+                ret = mq_synctask(this, mq_initiate_quota_task, spawn,
+                                  &txn_loc);
+        }
+    }
+
+    /* in_progress can be _gf_false only after the test-and-set above ran */
+    if (ret < 0 && in_progress == _gf_false)
+        mq_set_ctx_updation_status(ctx, _gf_false);
+
+    loc_wipe(&txn_loc);
+    return ret;
+}
+
+/* Validated entry point: starts the txn with spawn = _gf_true. */
+int
+mq_initiate_quota_txn(xlator_t *this, loc_t *loc, struct iatt *buf)
+{
+    int32_t rc = -1;
+
+    GF_VALIDATE_OR_GOTO("marker", this, out);
+    GF_VALIDATE_OR_GOTO("marker", loc, out);
+    GF_VALIDATE_OR_GOTO("marker", loc->inode, out);
+    rc = _mq_initiate_quota_txn(this, loc, buf, _gf_true);
+out:
+    return rc; /* still -1 if a validation above failed */
+}
+
+/* Validated entry point: starts the txn with spawn = _gf_false. */
+int
+mq_initiate_quota_blocking_txn(xlator_t *this, loc_t *loc, struct iatt *buf)
+{
+    int32_t rc = -1;
+
+    GF_VALIDATE_OR_GOTO("marker", this, out);
+    GF_VALIDATE_OR_GOTO("marker", loc, out);
+    GF_VALIDATE_OR_GOTO("marker", loc->inode, out);
+    rc = _mq_initiate_quota_txn(this, loc, buf, _gf_false);
+out:
+    return rc; /* still -1 if a validation above failed */
+}
+
+/* Synctask body: recompute the size xattr of the directory args->loc when
+ * it is marked dirty.  Under a write lock, the contri values that readdirp
+ * returns for the entries are summed (plus one dir_count for the directory
+ * itself) and the difference to the stored metadata is applied with
+ * mq_update_size().  Without an error the dirty mark is reset through
+ * mq_mark_dirty(); on error only the ctx dirty-status flag is cleared.
+ * If the size was updated, a blocking quota txn is started for the same
+ * loc once the lock is released.  Returns a negative value on failure.
+ */
+int
+mq_update_dirty_inode_task(void *opaque)
+{
+    int32_t ret = -1;
+    fd_t *fd = NULL;
+    off_t offset = 0;
+    gf_dirent_t entries;
+    gf_dirent_t *entry = NULL;
+    gf_boolean_t locked = _gf_false;
+    gf_boolean_t updated = _gf_false;
+    int32_t dirty = 0;
+    quota_meta_t contri = {0};
+    quota_meta_t size = {0};
+    quota_meta_t contri_sum = {0};
+    quota_meta_t delta = {0};
+    quota_synctask_t *args = NULL;
+    xlator_t *this = NULL;
+    loc_t *loc = NULL;
+    quota_inode_ctx_t *ctx = NULL;
+    dict_t *xdata = NULL;
+    char contri_key[QUOTA_KEY_MAX] = {0};
+    int keylen = 0;
+
+    GF_ASSERT(opaque);
+
+    args = (quota_synctask_t *)opaque;
+    loc = &args->loc;
+    this = args->this;
+    THIS = this;
+    INIT_LIST_HEAD(&entries.list);
+
+    ret = mq_inode_ctx_get(loc->inode, this, &ctx);
+    if (ret < 0)
+        goto out;
+
+    GET_CONTRI_KEY(this, contri_key, loc->gfid, keylen);
+    if (keylen < 0) {
+        ret = keylen;
+        goto out;
+    }
+
+    xdata = dict_new();
+    if (xdata == NULL) {
+        gf_log(this->name, GF_LOG_ERROR, "dict_new failed");
+        ret = -1;
+        goto out;
+    }
+
+    ret = dict_set_int64(xdata, contri_key, 0);
+    if (ret < 0) {
+        gf_log(this->name, GF_LOG_ERROR, "dict_set failed");
+        goto out;
+    }
+
+    ret = mq_lock(this, loc, F_WRLCK);
+    if (ret < 0)
+        goto out;
+    locked = _gf_true;
+
+    ret = mq_get_dirty(this, loc, &dirty);
+    if (ret < 0 || dirty == 0) {
+        ret = 0;
+        goto out;
+    }
+
+    fd = fd_create(loc->inode, 0);
+    if (!fd) {
+        gf_log(this->name, GF_LOG_ERROR, "Failed to create fd");
+        ret = -1;
+        goto out;
+    }
+
+    ret = syncop_opendir(this, loc, fd, NULL, NULL);
+    if (ret < 0) {
+        gf_log(this->name,
+               (-ret == ENOENT || -ret == ESTALE) ? GF_LOG_DEBUG : GF_LOG_ERROR,
+               "opendir failed "
+               "for %s: %s",
+               loc->path, strerror(-ret));
+        goto out;
+    }
+
+    fd_bind(fd);
+    while ((ret = syncop_readdirp(this, fd, 131072, offset, &entries, xdata,
+                                  NULL)) != 0) {
+        if (ret < 0) {
+            gf_log(this->name,
+                   (-ret == ENOENT || -ret == ESTALE) ? GF_LOG_DEBUG
+                                                      : GF_LOG_ERROR,
+                   "readdirp failed "
+                   "for %s: %s",
+                   loc->path, strerror(-ret));
+            goto out;
+        }
+
+        if (list_empty(&entries.list))
+            break;
+
+        list_for_each_entry(entry, &entries.list, list)
+        {
+            offset = entry->d_off;
+
+            if (!strcmp(entry->d_name, ".") || !strcmp(entry->d_name, ".."))
+                continue;
+
+            memset(&contri, 0, sizeof(contri));
+            quota_dict_get_meta(entry->dict, contri_key, keylen, &contri);
+            if (quota_meta_is_null(&contri))
+                continue;
+
+            mq_add_meta(&contri_sum, &contri);
+        }
+
+        gf_dirent_free(&entries);
+    }
+    /* Include for self */
+    contri_sum.dir_count++;
+
+    ret = _mq_get_metadata(this, loc, NULL, &size, 0);
+    if (ret < 0)
+        goto out;
+
+    mq_compute_delta(&delta, &contri_sum, &size);
+
+    if (quota_meta_is_null(&delta))
+        goto out;
+
+    /* size and delta are quota_meta_t like contri_sum: print them signed */
+    gf_log(this->name, GF_LOG_INFO,
+           "calculated size = %" PRId64 ", original size = %" PRId64
+           ", diff = %" PRId64 ", path = %s ",
+           contri_sum.size, size.size, delta.size, loc->path);
+
+    gf_log(this->name, GF_LOG_INFO,
+           "calculated f_count = %" PRId64 ", original f_count = %" PRId64
+           ", diff = %" PRId64 ", path = %s ",
+           contri_sum.file_count, size.file_count, delta.file_count, loc->path);
+
+    gf_log(this->name, GF_LOG_INFO,
+           "calculated d_count = %" PRId64 ", original d_count = %" PRId64
+           ", diff = %" PRId64 ", path = %s ",
+           contri_sum.dir_count, size.dir_count, delta.dir_count, loc->path);
+
+    ret = mq_update_size(this, loc, &delta);
+    if (ret < 0)
+        goto out;
+
+    updated = _gf_true;
+
+out:
+    gf_dirent_free(&entries);
+
+    if (fd)
+        fd_unref(fd);
+
+    if (xdata)
+        dict_unref(xdata);
+
+    if (ret < 0) {
+        /* On failure clear dirty status flag.
+         * In the next lookup inspect_directory_xattr
+         * can set the status flag and fix the
+         * dirty directory
+         */
+        if (ctx)
+            mq_set_ctx_dirty_status(ctx, _gf_false);
+    } else if (dirty) {
+        mq_mark_dirty(this, loc, 0);
+    }
+
+    if (locked)
+        mq_lock(this, loc, F_UNLCK);
+
+    if (updated)
+        mq_initiate_quota_blocking_txn(this, loc, NULL);
+
+    return ret;
+}
+
+/* Spawn mq_update_dirty_inode_task for loc, unless the test-and-set reports
+ * ctx->dirty_status as already raised (-1 is returned in that case too). */
+int32_t
+mq_update_dirty_inode_txn(xlator_t *this, loc_t *loc, quota_inode_ctx_t *ctx)
+{
+    int32_t ret = -1;
+    gf_boolean_t status = _gf_true;
+
+    GF_VALIDATE_OR_GOTO("marker", loc, out);
+    GF_VALIDATE_OR_GOTO("marker", loc->inode, out);
+    GF_VALIDATE_OR_GOTO("marker", ctx, out); /* dereferenced just below */
+
+    mq_test_and_set_ctx_status(ctx, &ctx->dirty_status, &status);
+    if (status != _gf_true)
+        ret = mq_synctask(this, mq_update_dirty_inode_task, _gf_true, loc);
+out:
+    if (ret < 0 && status == _gf_false)
+        mq_set_ctx_dirty_status(ctx, _gf_false);
+    return ret;
+}
+
+/* Refresh the cached quota state of a directory from the xattrs in `dict`
+ * and start whichever follow-up txn is needed.
+ *
+ * this         - marker xlator
+ * ctx          - quota inode ctx of the directory, updated under ctx->lock
+ * contribution - contribution node to refresh, may be NULL
+ * loc          - location of the directory
+ * dict         - holds the dirty, size and contri xattrs to inspect
+ *
+ * Behaviour:
+ *  - if the size or contri xattr cannot be read, mq_create_xattrs_txn() is
+ *    started and its result returned;
+ *  - without a `contribution` node only the size xattr is read;
+ *  - otherwise the contribution node (non-root only) and `ctx` are updated
+ *    under their locks and, unless an update txn is already in progress,
+ *    the dirty-inode txn (dirty xattr set) or a quota txn (non-root with a
+ *    non-null delta between size and contri) is started.
+ *
+ * Returns -1 if a key cannot be built, the txn result where one is
+ * returned as described above, and a non-negative value otherwise.
+ */
+int32_t
+mq_inspect_directory_xattr(xlator_t *this, quota_inode_ctx_t *ctx,
+                           inode_contribution_t *contribution, loc_t *loc,
+                           dict_t *dict)
+{
+    quota_meta_t dir_size = {0};
+    quota_meta_t dir_contri = {0};
+    quota_meta_t diff = {0};
+    char size_key[QUOTA_KEY_MAX] = {0};
+    char contri_key[QUOTA_KEY_MAX] = {0};
+    gf_boolean_t updating = _gf_false;
+    int8_t dirty = -1;
+    int keylen = 0;
+    int32_t ret = -1;
+
+    /* dirty is set only on the first file write operation, so a failed
+     * lookup of the key is treated as "not dirty"
+     */
+    if (dict_get_int8(dict, QUOTA_DIRTY_KEY, &dirty) < 0)
+        dirty = 0;
+
+    GET_SIZE_KEY(this, size_key, keylen);
+    if (keylen < 0)
+        return -1;
+
+    ret = _quota_dict_get_meta(this, dict, size_key, keylen, &dir_size,
+                               IA_IFDIR, _gf_false);
+    if (ret < 0)
+        return mq_create_xattrs_txn(this, loc, NULL);
+
+    /* no contribution node: stop here with the (non-negative) result */
+    if (!contribution)
+        return ret;
+
+    if (!loc_is_root(loc)) {
+        GET_CONTRI_KEY(this, contri_key, contribution->gfid, keylen);
+        if (keylen < 0)
+            return -1;
+
+        ret = _quota_dict_get_meta(this, dict, contri_key, keylen,
+                                   &dir_contri, IA_IFDIR, _gf_false);
+        if (ret < 0)
+            return mq_create_xattrs_txn(this, loc, NULL);
+
+        /* copy the contri values read from dict into the contribution node */
+        LOCK(&contribution->lock);
+        {
+            contribution->contribution = dir_contri.size;
+            contribution->file_count = dir_contri.file_count;
+            contribution->dir_count = dir_contri.dir_count;
+        }
+        UNLOCK(&contribution->lock);
+    }
+
+    /* copy the size values and the dirty value into the inode ctx */
+    LOCK(&ctx->lock);
+    {
+        ctx->size = dir_size.size;
+        ctx->file_count = dir_size.file_count;
+        ctx->dir_count = dir_size.dir_count;
+        ctx->dirty = dirty;
+    }
+    UNLOCK(&ctx->lock);
+
+    /* If the update txn is in progress (or its status cannot be read),
+     * abort inspection
+     */
+    ret = mq_get_ctx_updation_status(ctx, &updating);
+    if (ret < 0 || updating == _gf_true)
+        return 0;
+
+    mq_compute_delta(&diff, &dir_size, &dir_contri);
+
+    /* a dirty directory is handed to the dirty-inode txn instead */
+    if (dirty)
+        return mq_update_dirty_inode_txn(this, loc, ctx);
+
+    if (!loc_is_root(loc) && !quota_meta_is_null(&diff))
+        mq_initiate_quota_txn(this, loc, NULL);
+
+    return 0;
+}
+
+/* File counterpart of the directory inspection.  Records the file's size
+ * (512 * ia_blocks, one file, no directories) in `ctx`, refreshes
+ * `contribution` from the contri xattr in `dict`, and starts a quota txn
+ * when mq_compute_delta() yields a non-null delta and no update txn is in
+ * progress.  If the contri xattr cannot be read, mq_create_xattrs_txn() is
+ * started and its result returned.
+ *
+ * Returns -1 when buf, contribution or ctx is NULL or the key cannot be
+ * built.
+ */
+int32_t
+mq_inspect_file_xattr(xlator_t *this, quota_inode_ctx_t *ctx,
+                      inode_contribution_t *contribution, loc_t *loc,
+                      dict_t *dict, struct iatt *buf)
+{
+    quota_meta_t file_size = {0};
+    quota_meta_t file_contri = {0};
+    quota_meta_t diff = {0};
+    char contri_key[QUOTA_KEY_MAX] = {0};
+    gf_boolean_t updating = _gf_false;
+    int keylen = 0;
+    int32_t ret = -1;
+
+    if (!buf || !contribution || !ctx)
+        return -1;
+
+    /* the size is derived from the block count in buf; the inode is
+     * counted as one file and no directory
+     */
+    LOCK(&ctx->lock);
+    {
+        ctx->size = 512 * buf->ia_blocks;
+        ctx->file_count = 1;
+        ctx->dir_count = 0;
+
+        file_size.size = ctx->size;
+        file_size.file_count = ctx->file_count;
+        file_size.dir_count = ctx->dir_count;
+    }
+    UNLOCK(&ctx->lock);
+
+    GET_CONTRI_KEY(this, contri_key, contribution->gfid, keylen);
+    if (keylen < 0)
+        return -1;
+
+    ret = _quota_dict_get_meta(this, dict, contri_key, keylen, &file_contri,
+                               IA_IFREG, _gf_true);
+    if (ret < 0)
+        return mq_create_xattrs_txn(this, loc, NULL);
+
+    /* copy the contri values read from dict into the contribution node */
+    LOCK(&contribution->lock);
+    {
+        contribution->contribution = file_contri.size;
+        contribution->file_count = file_contri.file_count;
+        contribution->dir_count = file_contri.dir_count;
+    }
+    UNLOCK(&contribution->lock);
+
+    /* If the update txn is in progress abort inspection */
+    ret = mq_get_ctx_updation_status(ctx, &updating);
+    if (ret < 0 || updating == _gf_true)
+        return 0;
+
+    mq_compute_delta(&diff, &file_size, &file_contri);
+    if (!quota_meta_is_null(&diff))
+        mq_initiate_quota_txn(this, loc, NULL);
+
+    /* TODO: revisit this code when fixing hardlinks */
+    return ret;
+}
+
+/* Inspect the quota xattrs in `dict` for origin_loc via the directory or
+ * file inspector.  The inspectors' results are not propagated: the value
+ * of mq_prevalidate_txn is returned, or -1 if no contribution node could
+ * be added. */
+int32_t
+mq_xattr_state(xlator_t *this, loc_t *origin_loc, dict_t *dict,
+               struct iatt *buf)
+{
+    loc_t loc = {0};
+    quota_inode_ctx_t *ctx = NULL;
+    inode_contribution_t *node = NULL;
+    int32_t ret = mq_prevalidate_txn(this, origin_loc, &loc, &ctx, buf);
+
+    if (ret < 0 || loc.parent == NULL)
+        goto out;
+
+    if (loc_is_root(&loc)) {
+        mq_inspect_directory_xattr(this, ctx, 0, &loc, dict);
+        goto out;
+    }
+
+    node = mq_add_new_contribution_node(this, ctx, &loc);
+    if (node == NULL) {
+        if (!gf_uuid_is_null(loc.inode->gfid))
+            gf_log(this->name, GF_LOG_WARNING,
+                   "cannot add a new contribution node "
+                   "(%s)",
+                   uuid_utoa(loc.gfid));
+        ret = -1;
+        goto out;
+    }
+    if (buf->ia_type == IA_IFDIR)
+        mq_inspect_directory_xattr(this, ctx, node, &loc, dict);
+    else
+        mq_inspect_file_xattr(this, ctx, node, &loc, dict, buf);
+
+out:
+    loc_wipe(&loc);
+    if (node)
+        GF_REF_PUT(node);
+    return ret;
+}
+
+/* Add the quota request keys to dict: the contribution key (non-root only),
+ * the versioned size key = 0 (its name is also copied to size_key when
+ * non-NULL) and QUOTA_DIRTY_KEY = 0.  Negative return + log on failure. */
+int32_t
+mq_req_xattr(xlator_t *this, loc_t *loc, dict_t *dict, char *contri_key,
+             char *size_key)
+{
+    char key[QUOTA_KEY_MAX] = {0};
+    int32_t ret = -1;
+
+    GF_VALIDATE_OR_GOTO("marker", this, out);
+    GF_VALIDATE_OR_GOTO("marker", loc, out);
+    GF_VALIDATE_OR_GOTO("marker", dict, out);
+
+    if (!loc_is_root(loc)) {
+        ret = mq_dict_set_contribution(this, dict, loc, NULL, contri_key);
+        if (ret < 0)
+            goto out;
+    }
+
+    GET_SIZE_KEY(this, key, ret);
+    if (ret < 0)
+        goto out;
+
+    if (size_key &&
+        snprintf(size_key, QUOTA_KEY_MAX, "%s", key) >= QUOTA_KEY_MAX) {
+        ret = -1;
+        goto out;
+    }
+
+    ret = dict_set_uint64(dict, key, 0);
+    if (ret >= 0)
+        ret = dict_set_int8(dict, QUOTA_DIRTY_KEY, 0);
+
+out:
+    if (ret < 0)
+        gf_log_callingfn(this ? this->name : "Marker", GF_LOG_ERROR,
+                         "dict set failed");
+    return ret;
+}
+
+/* Tear down a quota inode ctx: unlink every contribution node and drop a
+ * reference on it, then destroy the ctx lock and free the ctx.  Always 0. */
+int32_t
+mq_forget(xlator_t *this, quota_inode_ctx_t *ctx)
+{
+    inode_contribution_t *node = NULL, *tmp = NULL;
+
+    GF_VALIDATE_OR_GOTO("marker", this, out);
+    GF_VALIDATE_OR_GOTO("marker", ctx, out);
+
+    list_for_each_entry_safe(node, tmp, &ctx->contribution_head, contri_list)
+    {
+        list_del_init(&node->contri_list);
+        GF_REF_PUT(node);
+    }
+    LOCK_DESTROY(&ctx->lock);
+    GF_FREE(ctx);
+out:
+    return 0;
+}
diff --git a/xlators/features/marker/src/marker-quota.h b/xlators/features/marker/src/marker-quota.h
new file mode 100644
index 00000000000..4bbf6878b22
--- /dev/null
+++ b/xlators/features/marker/src/marker-quota.h
@@ -0,0 +1,140 @@
+/*
+   Copyright (c) 2008-2012 Red Hat, Inc. <http://www.redhat.com>
+   This file is part of GlusterFS.
+
+   This file is licensed to you under your choice of the GNU Lesser
+   General Public License, version 3 or any later version (LGPLv3 or
+   later), or the GNU General Public License, version 2 (GPLv2), in all
+   cases as published by the Free Software Foundation.
+*/
+#ifndef _MARKER_QUOTA_H
+#define _MARKER_QUOTA_H
+
+#include <glusterfs/xlator.h>
+#include "marker-mem-types.h"
+#include <glusterfs/refcount.h>
+#include <glusterfs/quota-common-utils.h>
+#include <glusterfs/call-stub.h>
+
+#define QUOTA_XATTR_PREFIX "trusted.glusterfs"
+#define QUOTA_DIRTY_KEY "trusted.glusterfs.quota.dirty"
+
+#define CONTRIBUTION "contri"
+#define QUOTA_KEY_MAX 512
+#define READDIR_BUF 4096
+
+#define QUOTA_ALLOC(var, type, ret)                                            \
+    do { /* GF_CALLOC one `type` into var; ret = 0, or -1 on failure */        \
+        ret = 0;                                                               \
+        var = GF_CALLOC(sizeof(type), 1, gf_marker_mt_##type);                 \
+        if (!var) {                                                            \
+            ret = -1;                                                          \
+        }                                                                      \
+    } while (0) /* no trailing ';': the call site supplies it */
+
+#define QUOTA_ALLOC_OR_GOTO(var, type, ret, label)                             \
+    do { /* GF_CALLOC one `type` into var; on failure log and goto label */    \
+        var = GF_CALLOC(sizeof(type), 1, gf_marker_mt_##type);                 \
+        if (!var) {                                                            \
+            gf_log("", GF_LOG_ERROR, "out of memory");                         \
+            ret = -1;                                                          \
+            goto label;                                                        \
+        }                                                                      \
+        ret = 0;                                                               \
+    } while (0) /* no trailing ';': the call site supplies it */
+
+#define GET_QUOTA_KEY(_this, var, key, _ret)                                   \
+    do { /* var = key, suffixed ".<version>" if priv->version > 0 */           \
+        marker_conf_t *_priv = _this->private;                                 \
+        if (_priv->version > 0)                                                \
+            _ret = snprintf(var, QUOTA_KEY_MAX, "%s.%d", key, _priv->version); \
+        else                                                                   \
+            _ret = snprintf(var, QUOTA_KEY_MAX, "%s", key);                    \
+    } while (0) /* _ret = snprintf() result */
+
+#define GET_CONTRI_KEY(_this, var, _gfid, _ret)                                \
+    do { /* var = (versioned) contri xattr key for _gfid */                    \
+        char _tmp_var[QUOTA_KEY_MAX] = {                                       \
+            0,                                                                 \
+        };                                                                     \
+        if (_gfid != NULL) {                                                   \
+            char _gfid_unparsed[40];                                           \
+            gf_uuid_unparse(_gfid, _gfid_unparsed);                            \
+            _ret = snprintf(_tmp_var, QUOTA_KEY_MAX,                           \
+                            QUOTA_XATTR_PREFIX ".%s.%s." CONTRIBUTION,         \
+                            "quota", _gfid_unparsed);                          \
+        } else { /* no gfid: the gfid part of the key is left empty */         \
+            _ret = snprintf(_tmp_var, QUOTA_KEY_MAX,                           \
+                            QUOTA_XATTR_PREFIX ".%s.." CONTRIBUTION, "quota"); \
+        }                                                                      \
+        GET_QUOTA_KEY(_this, var, _tmp_var, _ret); /* overwrites _ret */       \
+    } while (0)
+
+#define GET_SIZE_KEY(_this, var, _ret)                                         \
+    { /* var = (versioned) QUOTA_SIZE_KEY; _ret = snprintf() result */         \
+        GET_QUOTA_KEY(_this, var, QUOTA_SIZE_KEY, _ret);                       \
+    }
+
+#define QUOTA_SAFE_INCREMENT(lock, var)                                        \
+    do { /* var++ while holding lock */                                        \
+        LOCK(lock);                                                            \
+        var++;                                                                 \
+        UNLOCK(lock);                                                          \
+    } while (0)
+
+struct quota_inode_ctx {
+    int64_t size;       /* size xattr (dirs) or 512 * ia_blocks (files) */
+    int64_t file_count; /* file count recorded together with size */
+    int64_t dir_count;  /* directory count recorded together with size */
+    int8_t dirty;       /* QUOTA_DIRTY_KEY value seen at dir inspection */
+    gf_boolean_t create_status;   /* presumably the create-txn flag; confirm */
+    gf_boolean_t updation_status; /* set while an update txn is in progress */
+    gf_boolean_t dirty_status;    /* test-and-set by mq_update_dirty_inode_txn */
+    gf_lock_t lock;               /* taken around size/count/dirty updates */
+    struct list_head contribution_head; /* inode_contribution.contri_list */
+};
+typedef struct quota_inode_ctx quota_inode_ctx_t;
+
+struct quota_synctask {
+    xlator_t *this;      /* xlator the task runs for; the task sets THIS to it */
+    loc_t loc;           /* location the task operates on */
+    quota_meta_t contri; /* NOTE(review): not read by the task visible here */
+    gf_boolean_t is_static;
+    uint32_t ia_nlink;   /* presumably mq_reduce_parent_size_txn's nlink */
+    call_stub_t *stub;   /* presumably mq_reduce_parent_size_txn's stub */
+};
+typedef struct quota_synctask quota_synctask_t;
+
+struct inode_contribution {
+    struct list_head contri_list; /* link in quota_inode_ctx.contribution_head */
+    int64_t contribution;         /* size field of the contri xattr */
+    int64_t file_count;           /* file_count field of the contri xattr */
+    int64_t dir_count;            /* dir_count field of the contri xattr */
+    uuid_t gfid; /* used to build the contri key; presumably the parent's gfid */
+    gf_lock_t lock; /* taken around updates of the three counters */
+    GF_REF_DECL;    /* reference count, dropped with GF_REF_PUT() */
+};
+typedef struct inode_contribution inode_contribution_t;
+
+int32_t
+mq_req_xattr(xlator_t *, loc_t *, dict_t *, char *, char *);
+
+int32_t
+mq_xattr_state(xlator_t *, loc_t *, dict_t *, struct iatt *);
+
+int
+mq_initiate_quota_txn(xlator_t *, loc_t *, struct iatt *);
+
+int
+mq_initiate_quota_blocking_txn(xlator_t *, loc_t *, struct iatt *);
+
+int
+mq_create_xattrs_txn(xlator_t *this, loc_t *loc, struct iatt *buf);
+
+int32_t
+mq_reduce_parent_size_txn(xlator_t *, loc_t *, quota_meta_t *, uint32_t nlink,
+                          call_stub_t *stub);
+
+int32_t
+mq_forget(xlator_t *, quota_inode_ctx_t *);
+#endif
diff --git a/xlators/features/marker/src/marker.c b/xlators/features/marker/src/marker.c
new file mode 100644
index 00000000000..1375ccc498c
--- /dev/null
+++ b/xlators/features/marker/src/marker.c
@@ -0,0 +1,3568 @@
+/*
+   Copyright (c) 2008-2012 Red Hat, Inc. <http://www.redhat.com>
+   This file is part of GlusterFS.
+
+   This file is licensed to you under your choice of the GNU Lesser
+   General Public License, version 3 or any later version (LGPLv3 or
+   later), or the GNU General Public License, version 2 (GPLv2), in all
+   cases as published by the Free Software Foundation.
+*/
+#include <glusterfs/xlator.h>
+#include <glusterfs/defaults.h>
+#include "libxlator.h"
+#include "marker.h"
+#include "marker-mem-types.h"
+#include "marker-quota.h"
+#include "marker-quota-helper.h"
+#include "marker-common.h"
+#include <glusterfs/byte-order.h>
+#include <glusterfs/syncop.h>
+#include <glusterfs/syscall.h>
+
+#include <fnmatch.h>
+
+#define _GF_UID_GID_CHANGED 1
+
+static char *mq_ext_xattrs[] = { /* quota keys that get a version suffix */
+    QUOTA_SIZE_KEY,
+    QUOTA_LIMIT_KEY,
+    QUOTA_LIMIT_OBJECTS_KEY,
+    NULL, /* terminator: the key loops stop at the first NULL entry */
+};
+
+void
+fini(xlator_t *this);
+
+int32_t
+marker_start_setxattr(call_frame_t *, xlator_t *);
+
+/* When a client/quotad requests quota xattrs, the key-name
+ * is replaced by one with the version number appended to
+ * its end.
+ * In the cbk, the resulting xattr values are also set under
+ * the original key-name.
+ * The functions marker_key_replace_with_ver and
+ * marker_key_set_ver below are used for setting/removing
+ * the version on the key-name.
+ */
+/* Rename every quota key of mq_ext_xattrs that is present in dict to its
+ * versioned form ("<key>.<version>"): the value is stored under the new
+ * name and the original key is deleted.
+ *
+ * Nothing is done for a NULL dict or when priv->version <= 0.
+ * Returns 0 on success, or the negative result of the failing step.
+ */
+int
+marker_key_replace_with_ver(xlator_t *this, dict_t *dict)
+{
+    marker_conf_t *conf = this->private;
+    char versioned[QUOTA_KEY_MAX] = {0};
+    char **name = NULL;
+    int ret = 0;
+
+    if (dict == NULL || conf->version <= 0)
+        return 0;
+
+    for (name = mq_ext_xattrs; *name != NULL; name++) {
+        if (!dict_get(dict, *name))
+            continue;
+
+        GET_QUOTA_KEY(this, versioned, *name, ret);
+        if (ret < 0)
+            return ret;
+
+        /* store the value under the versioned name, then drop the old key */
+        ret = dict_set(dict, versioned, dict_get(dict, *name));
+        if (ret < 0)
+            return ret;
+
+        dict_del(dict, *name);
+    }
+
+    return 0;
+}
+
+/* Reverse direction, for replies: for every key of mq_ext_xattrs whose
+ * versioned name ("<key>.<version>") is present in dict, the value is
+ * additionally stored under the original name.
+ *
+ * Nothing is done for a NULL dict or when priv->version <= 0; the result
+ * of dict_set() is not checked.  Returns 0, or a negative key-build result.
+ */
+int
+marker_key_set_ver(xlator_t *this, dict_t *dict)
+{
+    marker_conf_t *conf = this->private;
+    char versioned[QUOTA_KEY_MAX] = {0};
+    char **name = NULL;
+    int ret = 0;
+
+    if (dict == NULL || conf->version <= 0)
+        return 0;
+
+    for (name = mq_ext_xattrs; *name != NULL; name++) {
+        GET_QUOTA_KEY(this, versioned, *name, ret);
+        if (ret < 0)
+            return ret;
+
+        /* versioned key present: mirror its value under the plain name */
+        if (dict_get(dict, versioned))
+            dict_set(dict, *name, dict_get(dict, versioned));
+    }
+
+    return 0;
+}
+
+/* Take one more reference on local (counter updated under local->lock).
+ * Returns local, or NULL when local is NULL. */
+marker_local_t *
+marker_local_ref(marker_local_t *local)
+{
+    GF_VALIDATE_OR_GOTO("marker", local, err);
+
+    LOCK(&local->lock);
+    local->ref += 1;
+    UNLOCK(&local->lock);
+
+    return local;
+err:
+    return NULL;
+}
+
+/* Populate loc; inode, parent and path are each optional.  inode/parent are
+ * stored with a new reference, loc->gfid is copied from the inode if still
+ * null, and path is duplicated with loc->name set past its last '/'.
+ * Returns 0, or -1 if loc is NULL or the duplication fails (loc is wiped).
+ */
+int
+marker_loc_fill(loc_t *loc, inode_t *inode, inode_t *parent, char *path)
+{
+    if (!loc)
+        return -1;
+
+    if (inode) {
+        loc->inode = inode_ref(inode);
+        if (gf_uuid_is_null(loc->gfid)) {
+            gf_uuid_copy(loc->gfid, loc->inode->gfid);
+        }
+    }
+
+    if (parent)
+        loc->parent = inode_ref(parent);
+
+    if (!path)
+        return 0;
+
+    loc->path = gf_strdup(path);
+    if (!loc->path) {
+        gf_log("loc fill", GF_LOG_ERROR, "strdup failed");
+        loc_wipe(loc);
+        return -1;
+    }
+
+    loc->name = strrchr(loc->path, '/');
+    if (loc->name)
+        loc->name++;
+
+    return 0;
+}
+
+/* Resolve a path (for name under parent when both are given, otherwise for
+ * inode itself) and fill loc from it with marker_loc_fill.  When no
+ * parent is passed, the inode's parent is looked up for the duration of
+ * the call and released again.
+ * Returns a negative value on failure, including NULL inode or loc.
+ */
+int
+_marker_inode_loc_fill(inode_t *inode, inode_t *parent, char *name, loc_t *loc)
+{
+    char *resolved = NULL;
+    gf_boolean_t looked_up = _gf_false;
+    int ret = -1;
+
+    if (!inode || !loc)
+        return ret;
+
+    if (parent && name)
+        ret = inode_path(parent, name, &resolved);
+    else
+        ret = inode_path(inode, NULL, &resolved);
+
+    if (ret >= 0) {
+        if (!parent) {
+            parent = inode_parent(inode, NULL, NULL);
+            looked_up = _gf_true;
+        }
+        ret = marker_loc_fill(loc, inode, parent, resolved);
+    }
+
+    if (looked_up)
+        inode_unref(parent);
+    GF_FREE(resolved);
+    return ret;
+}
+
+int
+marker_inode_loc_fill(inode_t *inode, loc_t *loc)
+{ /* fill loc from inode alone: no parent or name is supplied */
+    return _marker_inode_loc_fill(inode, NULL, NULL, loc);
+}
+
+/* Repoint local->loc at the parent of the inode it currently refers to.
+ * The parent comes from local->loc.parent or, failing that, from an
+ * inode_parent() lookup whose reference is dropped before returning.
+ * local->loc is only replaced when the new loc could be filled.
+ * Returns -1 on failure.
+ */
+int32_t
+marker_trav_parent(marker_local_t *local)
+{
+    loc_t parent_loc = {0};
+    inode_t *parent = local->loc.parent;
+    inode_t *looked_up = NULL;
+    int32_t ret = 0;
+
+    if (!parent) {
+        looked_up = inode_parent(local->loc.inode, NULL, NULL);
+        parent = looked_up;
+    }
+
+    ret = marker_inode_loc_fill(parent, &parent_loc);
+    if (ret < 0) {
+        ret = -1;
+    } else {
+        /* the old loc is wiped first, then parent_loc is copied over it */
+        loc_wipe(&local->loc);
+        local->loc = parent_loc;
+    }
+
+    if (looked_up)
+        inode_unref(looked_up);
+
+    return ret;
+}
+
+/* Log the corruption at CRITICAL level and unlink priv->timestamp_file. */
+void
+marker_error_handler(xlator_t *this, marker_local_t *local, int32_t op_errno)
+{
+    marker_conf_t *conf = this->private;
+    const char *where = "<nul>";
+    if (local) /* prefer the path, fall back to the textual gfid */
+        where = local->loc.path ? local->loc.path : uuid_utoa(local->loc.gfid);
+    gf_log(this->name, GF_LOG_CRITICAL,
+           "Indexing gone corrupt at %s (reason: %s)."
+           " Geo-replication slave content needs to be revalidated",
+           where, strerror(op_errno));
+    sys_unlink(conf->timestamp_file);
+}
+
+/*
+ * Drop one reference on @local and release it when the count reaches zero.
+ *
+ * The counter is decremented under local->lock. Once it hits zero both locs
+ * are wiped, the xdata dict and the lock frame's stack (if present) are
+ * released, the reference held on local->oplocal is dropped and the
+ * structure is returned to its memory pool.
+ *
+ * Returns -1 when @local is NULL, 0 otherwise.
+ */
+int32_t
+marker_local_unref(marker_local_t *local)
+{
+    int32_t remaining = 0;
+
+    if (!local)
+        return -1;
+
+    LOCK(&local->lock);
+    {
+        local->ref--;
+        remaining = local->ref;
+    }
+    UNLOCK(&local->lock);
+
+    if (remaining == 0) {
+        loc_wipe(&local->loc);
+        loc_wipe(&local->parent_loc);
+
+        if (local->xdata)
+            dict_unref(local->xdata);
+
+        if (local->lk_frame) {
+            STACK_DESTROY(local->lk_frame->root);
+            local->lk_frame = NULL;
+        }
+
+        if (local->oplocal) {
+            marker_local_unref(local->oplocal);
+            local->oplocal = NULL;
+        }
+
+        mem_put(local);
+    }
+
+    return 0;
+}
+
+/*
+ * Build a volume_mark describing the geo-replication timestamp file.
+ *
+ * A zeroed struct volume_mark is allocated and filled with version 1.0 and
+ * the 16-byte volume uuid from @priv. If priv->timestamp_file can be
+ * stat()ed, retval is 0 and sec/usec carry the file's mtime in network byte
+ * order; otherwise retval is 1 and the time fields stay zero.
+ *
+ * On success *status receives the allocation and 0 is returned. If the
+ * allocation fails, *status is set to NULL and -1 is returned.
+ */
+int32_t
+stat_stampfile(xlator_t *this, marker_conf_t *priv, struct volume_mark **status)
+{
+    struct stat buf = {
+        0,
+    };
+    struct volume_mark *vol_mark = NULL;
+
+    vol_mark = GF_CALLOC(1, sizeof(struct volume_mark),
+                         gf_marker_mt_volume_mark);
+    if (vol_mark == NULL) {
+        /* Hand back NULL instead of dereferencing a failed allocation. */
+        *status = NULL;
+        return -1;
+    }
+
+    vol_mark->major = 1;
+    vol_mark->minor = 0;
+
+    GF_ASSERT(sizeof(priv->volume_uuid_bin) == 16);
+    memcpy(vol_mark->uuid, priv->volume_uuid_bin, 16);
+
+    if (sys_stat(priv->timestamp_file, &buf) != -1) {
+        vol_mark->retval = 0;
+        vol_mark->sec = htonl(buf.st_mtime);
+        vol_mark->usec = htonl(ST_MTIM_NSEC(&buf) / 1000);
+    } else
+        vol_mark->retval = 1;
+
+    *status = vol_mark;
+
+    return 0;
+}
+
+/*
+ * Answer a getxattr for the volume-mark key with the contents of @vol_mark.
+ *
+ * @vol_mark is stored in a freshly created dict under @name and the dict is
+ * unwound as a successful getxattr reply. If @vol_mark is NULL, the dict
+ * cannot be created, or the value cannot be stored, the fop is unwound with
+ * -1/ENOMEM instead; @vol_mark is freed here whenever it was not handed over
+ * to the dict.
+ *
+ * Always returns 0.
+ */
+int32_t
+marker_getxattr_stampfile_cbk(call_frame_t *frame, xlator_t *this,
+                              const char *name, struct volume_mark *vol_mark,
+                              dict_t *xdata)
+{
+    int32_t ret = -1;
+    dict_t *dict = NULL;
+
+    if (vol_mark == NULL)
+        goto err;
+
+    dict = dict_new();
+    if (dict == NULL) {
+        GF_FREE(vol_mark);
+        goto err;
+    }
+
+    ret = dict_set_bin(dict, (char *)name, vol_mark,
+                       sizeof(struct volume_mark));
+    if (ret) {
+        /* The value was not stored in the dict, so it is still ours to free. */
+        GF_FREE(vol_mark);
+        gf_log(this->name, GF_LOG_WARNING, "failed to set key %s", name);
+        dict_unref(dict);
+        goto err;
+    }
+
+    STACK_UNWIND_STRICT(getxattr, frame, 0, 0, dict, xdata);
+
+    dict_unref(dict);
+
+    return 0;
+
+err:
+    /* Never report success without the requested value in the reply. */
+    STACK_UNWIND_STRICT(getxattr, frame, -1, ENOMEM, NULL, NULL);
+
+    return 0;
+}
+
+/*
+ * Serve a volume-mark getxattr issued by gsyncd.
+ *
+ * The request qualifies only when the frame's pid is GF_CLIENT_PID_GSYNCD
+ * and @name equals MARKER_XATTR_PREFIX "." VOLUME_MARK. In that case the
+ * timestamp file is examined and the reply is unwound from here.
+ *
+ * Returns _gf_true if the request was answered here, _gf_false if the
+ * caller still has to handle it.
+ */
+gf_boolean_t
+call_from_special_client(call_frame_t *frame, xlator_t *this, const char *name)
+{
+    marker_conf_t *conf = this->private;
+    struct volume_mark *mark = NULL;
+
+    if (frame->root->pid != GF_CLIENT_PID_GSYNCD)
+        return _gf_false;
+
+    if (name == NULL)
+        return _gf_false;
+
+    if (strcmp(name, MARKER_XATTR_PREFIX "." VOLUME_MARK) != 0)
+        return _gf_false;
+
+    stat_stampfile(this, conf, &mark);
+
+    marker_getxattr_stampfile_cbk(frame, this, name, mark, NULL);
+
+    return _gf_true;
+}
+
+/*
+ * dict_foreach_match() predicate: is key @k a quota-internal xattr?
+ *
+ * @data is an optional NULL-terminated array of keys that must never be
+ * treated as internal. Any other key matching "trusted.glusterfs.quota*" or
+ * PGFID_XATTR_KEY_PREFIX "*" is reported as internal.
+ */
+static gf_boolean_t
+_is_quota_internal_xattr(dict_t *d, char *k, data_t *v, void *data)
+{
+    char **keep = data;
+
+    if (keep != NULL) {
+        for (; *keep != NULL; keep++) {
+            if (strcmp(k, *keep) == 0)
+                return _gf_false;
+        }
+    }
+
+    /* It would be nice if posix filtered the pgfid xattrs. But since marker
+     * also takes responsibility for cleaning these up, they are filtered
+     * here as well (check 'quota_xattr_cleaner').
+     */
+    if (fnmatch("trusted.glusterfs.quota*", k, 0) == 0 ||
+        fnmatch(PGFID_XATTR_KEY_PREFIX "*", k, 0) == 0)
+        return _gf_true;
+
+    return _gf_false;
+}
+
+/*
+ * Remove quota-internal xattrs from @xattrs.
+ *
+ * When the quota feature is enabled the keys listed in mq_ext_xattrs are
+ * exempt from removal; otherwise no key is exempt.
+ */
+static void
+marker_filter_internal_xattrs(xlator_t *this, dict_t *xattrs)
+{
+    marker_conf_t *conf = this->private;
+    char **exempt = NULL;
+
+    if (conf->feature_enabled & GF_QUOTA) {
+        exempt = mq_ext_xattrs;
+    }
+
+    dict_foreach_match(xattrs, _is_quota_internal_xattr, exempt,
+                       dict_remove_foreach_fn, NULL);
+}
+
+/*
+ * Strip xattrs matching GF_XATTR_XTIME_PATTERN from @xattrs unless the
+ * request comes from gsyncd (frame pid GF_CLIENT_PID_GSYNCD). A NULL
+ * @xattrs is left alone.
+ */
+static void
+marker_filter_gsyncd_xattrs(call_frame_t *frame, xlator_t *this, dict_t *xattrs)
+{
+    marker_conf_t *priv = this->private;
+
+    GF_ASSERT(priv);
+    GF_ASSERT(frame);
+
+    if (xattrs == NULL)
+        return;
+
+    if (frame->root->pid == GF_CLIENT_PID_GSYNCD)
+        return;
+
+    GF_REMOVE_INTERNAL_XATTR(GF_XATTR_XTIME_PATTERN, xattrs);
+}
+
+/*
+ * Callback for the getxattr wound by marker_getxattr().
+ *
+ * A failed reply is unwound unchanged. A successful reply is first passed
+ * through marker_key_set_ver(); if that fails the fop is failed with ENOMEM.
+ * Otherwise quota-internal xattrs are stripped when a non-NULL @cookie was
+ * supplied, xtime xattrs are stripped for non-gsyncd clients, and the
+ * filtered reply is unwound.
+ */
+int32_t
+marker_getxattr_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+                    int32_t op_ret, int32_t op_errno, dict_t *dict,
+                    dict_t *xdata)
+{
+    if (op_ret >= 0) {
+        if (marker_key_set_ver(this, dict) < 0) {
+            op_ret = -1;
+            op_errno = ENOMEM;
+        } else {
+            if (cookie) {
+                gf_log(this->name, GF_LOG_DEBUG,
+                       "Filtering the quota extended attributes");
+
+                /* For a getxattr from a non-special client, do not copy
+                 * the quota related xattrs of directories, with the
+                 * exception of the quota limit key
+                 * (trusted.glusterfs.quota.limit-set), which glusterd sets
+                 * on the directory carrying the limit. Healing of the
+                 * other xattrs is left to lookup.
+                 *
+                 * NOTE: limit-set is currently set by glusterd; it should
+                 * move to quotad. It stays on the directory until quota is
+                 * removed from it or the limit changes, so other xlators
+                 * are allowed to heal it during directory healing.
+                 *
+                 * Everything except limit-set is maintained by the quota
+                 * xlator and must not be exposed to other xlators; this
+                 * filter keeps quota xattrs out of metadata self-heal.
+                 */
+                marker_filter_internal_xattrs(frame->this, dict);
+            }
+
+            /* Non-gsyncd clients must not see the gsyncd xtime xattr. */
+            marker_filter_gsyncd_xattrs(frame, frame->this, dict);
+        }
+    }
+
+    MARKER_STACK_UNWIND(getxattr, frame, op_ret, op_errno, dict, xdata);
+    return 0;
+}
+
+/*
+ * getxattr fop entry point of the marker translator.
+ *
+ * If @name equals one of the entries of mq_ext_xattrs it is replaced by the
+ * key that GET_QUOTA_KEY builds for it before being passed down. When the
+ * GF_XTIME feature is enabled, call_from_special_client() gets the chance
+ * to answer the request itself; if it does, nothing is wound. Every other
+ * request is wound to the first child with marker_getxattr_cbk as callback.
+ * The cookie is 1 for a list-all request (@name == NULL), which makes the
+ * callback filter quota xattrs out of the reply.
+ *
+ * Any failure before winding unwinds the fop with -1/ENOMEM.
+ * Always returns 0.
+ */
+int32_t
+marker_getxattr(call_frame_t *frame, xlator_t *this, loc_t *loc,
+                const char *name, dict_t *xdata)
+{
+    gf_boolean_t is_true = _gf_false;
+    marker_conf_t *priv = NULL;
+    unsigned long cookie = 0;
+    marker_local_t *local = NULL;
+    char key[QUOTA_KEY_MAX] = {
+        0,
+    };
+    int32_t ret = -1;
+    int32_t i = 0;
+
+    priv = this->private;
+
+    if (name) {
+        /* Translate an externally visible quota key into the key built by
+         * GET_QUOTA_KEY; only the first matching entry is considered.
+         */
+        for (i = 0; mq_ext_xattrs[i]; i++) {
+            if (strcmp(name, mq_ext_xattrs[i]))
+                continue;
+
+            GET_QUOTA_KEY(this, key, mq_ext_xattrs[i], ret);
+            if (ret < 0)
+                goto out;
+            name = key;
+            break;
+        }
+    }
+
+    frame->local = mem_get0(this->local_pool);
+    local = frame->local;
+    if (local == NULL)
+        goto out;
+
+    MARKER_INIT_LOCAL(frame, local);
+
+    if ((loc_copy(&local->loc, loc)) < 0)
+        goto out;
+
+    gf_log(this->name, GF_LOG_DEBUG, "USER:PID = %d", frame->root->pid);
+
+    /* NOTE(review): when this returns true the reply has already been
+     * unwound by the helper while frame->local is still set - confirm the
+     * local is released on that path.
+     */
+    if (priv && priv->feature_enabled & GF_XTIME)
+        is_true = call_from_special_client(frame, this, name);
+
+    if (is_true == _gf_false) {
+        if (name == NULL) {
+            /* Tells the callback that the marker translator has to
+             * filter quota's xattrs out of the reply; this prevents
+             * afr from self-healing the marker-quota xattrs.
+             */
+            cookie = 1;
+        }
+        STACK_WIND_COOKIE(frame, marker_getxattr_cbk, (void *)cookie,
+                          FIRST_CHILD(this), FIRST_CHILD(this)->fops->getxattr,
+                          loc, name, xdata);
+    }
+
+    return 0;
+out:
+    MARKER_STACK_UNWIND(getxattr, frame, -1, ENOMEM, NULL, NULL);
+    return 0;
+}
+
+/*
+ * Finish an xtime-marking chain: destroy the frame's call stack and drop the
+ * reference the frame held on its marker local.
+ *
+ * Always returns 0.
+ */
+int32_t
+marker_setxattr_done(call_frame_t *frame)
+{
+    marker_local_t *held = (marker_local_t *)frame->local;
+
+    /* Detach the local before the stack goes away; it is released
+     * separately below.
+     */
+    frame->local = NULL;
+    STACK_DESTROY(frame->root);
+
+    marker_local_unref(held);
+
+    return 0;
+}
+
+/*
+ * Callback of the setxattr wound by marker_start_setxattr().
+ *
+ * After the marker xattr has been set on one object, the chain moves on to
+ * its parent and sets the xattr there, until the root is reached ("/" path
+ * or root gfid), the parent cannot be resolved, or the setxattr failed with
+ * ENOSPC (which additionally triggers marker_error_handler()).
+ *
+ * The frame and its local are released through marker_setxattr_done()
+ * whenever the chain stops, including the case where the next setxattr
+ * could not be started. Always returns 0.
+ */
+int
+marker_specific_setxattr_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+                             int32_t op_ret, int32_t op_errno, dict_t *xdata)
+{
+    int32_t ret = 0;
+    int32_t done = 1;
+    marker_local_t *local = NULL;
+
+    local = (marker_local_t *)frame->local;
+
+    if (op_ret == -1 && op_errno == ENOSPC) {
+        marker_error_handler(this, local, op_errno);
+        goto out;
+    }
+
+    if (local) {
+        /* The root has just been marked: nothing further up to visit. */
+        if (local->loc.path && strcmp(local->loc.path, "/") == 0) {
+            goto out;
+        }
+        if (__is_root_gfid(local->loc.gfid)) {
+            goto out;
+        }
+    }
+
+    ret = (local) ? marker_trav_parent(local) : -1;
+
+    if (ret == -1) {
+        gf_log(this->name, GF_LOG_DEBUG,
+               "Error occurred "
+               "while traversing to the parent, stopping marker");
+        goto out;
+    }
+
+    /* The frame is only handed over when the next setxattr was really
+     * wound. If starting it failed nothing will ever call back, so the
+     * frame must be torn down here instead of being leaked.
+     */
+    if (marker_start_setxattr(frame, this) == 0)
+        done = 0;
+out:
+    if (done) {
+        marker_setxattr_done(frame);
+    }
+
+    return 0;
+}
+
+/*
+ * Wind a setxattr that stores local->timebuf (8 bytes) under the marker
+ * xattr key of this xlator on the object described by local->loc.
+ *
+ * If local->loc.gfid is unset it is first copied from local->loc.inode.
+ * The reply is delivered to marker_specific_setxattr_cbk().
+ *
+ * Returns 0 once the call has been wound, -1 if the frame has no local or
+ * the dict could not be created, or the dict_set_static_bin() error if the
+ * value could not be stored. Nothing is wound on failure.
+ */
+int32_t
+marker_start_setxattr(call_frame_t *frame, xlator_t *this)
+{
+    marker_conf_t *conf = this->private;
+    marker_local_t *local = (marker_local_t *)frame->local;
+    dict_t *req = NULL;
+    int32_t rc = -1;
+
+    if (!local)
+        return -1;
+
+    req = dict_new();
+    if (!req)
+        return -1;
+
+    if (local->loc.inode && gf_uuid_is_null(local->loc.gfid))
+        gf_uuid_copy(local->loc.gfid, local->loc.inode->gfid);
+
+    GF_UUID_ASSERT(local->loc.gfid);
+
+    rc = dict_set_static_bin(req, conf->marker_xattr, (void *)local->timebuf,
+                             8);
+    if (rc) {
+        gf_log(this->name, GF_LOG_WARNING, "failed to set marker xattr (%s)",
+               local->loc.path);
+        dict_unref(req);
+        return rc;
+    }
+
+    STACK_WIND(frame, marker_specific_setxattr_cbk, FIRST_CHILD(this),
+               FIRST_CHILD(this)->fops->setxattr, &local->loc, req, 0, NULL);
+
+    dict_unref(req);
+
+    return 0;
+}
+
+/*
+ * Record the current wall-clock time in local->timebuf, converted with
+ * htonl(): element 0 holds the seconds, element 1 the microseconds.
+ */
+void
+marker_gettimeofday(marker_local_t *local)
+{
+    struct timeval now = {
+        0,
+    };
+
+    gettimeofday(&now, NULL);
+
+    local->timebuf[0] = htonl(now.tv_sec);
+    local->timebuf[1] = htonl(now.tv_usec);
+}
+
+/*
+ * Create a new call frame that owns @local and start the xtime setxattr
+ * chain on it.
+ *
+ * Returns -1 if no frame could be created; the caller then still owns any
+ * reference it took on @local for the frame. Returns 0 once the frame has
+ * taken over @local. If the first setxattr cannot be started, the frame is
+ * destroyed and its reference on @local is dropped here, and 0 is still
+ * returned because nothing is left for the caller to clean up.
+ */
+int32_t
+marker_create_frame(xlator_t *this, marker_local_t *local)
+{
+    call_frame_t *frame = NULL;
+
+    frame = create_frame(this, this->ctx->pool);
+
+    if (!frame)
+        return -1;
+
+    frame->local = (void *)local;
+
+    if (marker_start_setxattr(frame, this) != 0) {
+        /* Nothing was wound, so no callback will ever release the frame
+         * and the local reference it carries; do it now.
+         */
+        marker_setxattr_done(frame);
+    }
+
+    return 0;
+}
+
+/*
+ * Start an asynchronous xtime update for the object described by @local.
+ *
+ * Nothing is done for requests from the defrag client, nor for requests from
+ * gsyncd unless GF_XTIME_GSYNC_FORCE is enabled. Otherwise the current time
+ * is stamped into @local, an extra reference is taken on it and a new frame
+ * is created to carry the setxattr chain.
+ *
+ * Always returns 0.
+ */
+int32_t
+marker_xtime_update_marks(xlator_t *this, marker_local_t *local)
+{
+    marker_conf_t *priv = NULL;
+
+    GF_VALIDATE_OR_GOTO("marker", this, out);
+    GF_VALIDATE_OR_GOTO(this->name, local, out);
+
+    priv = this->private;
+
+    if ((local->pid == GF_CLIENT_PID_GSYNCD &&
+         !(priv->feature_enabled & GF_XTIME_GSYNC_FORCE)) ||
+        (local->pid == GF_CLIENT_PID_DEFRAG))
+        goto out;
+
+    marker_gettimeofday(local);
+
+    /* This reference belongs to the frame created below. */
+    marker_local_ref(local);
+
+    if (marker_create_frame(this, local) < 0) {
+        /* No frame exists to release the reference later; drop it here so
+         * the local is not leaked.
+         */
+        marker_local_unref(local);
+    }
+out:
+    return 0;
+}
+
+/*
+ * mkdir callback.
+ *
+ * With quota enabled a quota inode context is created for the new inode
+ * first; if that fails the reply is turned into -1/ENOMEM. The reply is then
+ * unwound, and only afterwards - for a successful mkdir with a local - the
+ * quota xattr creation and the xtime update are started. The local
+ * reference taken by marker_mkdir() is dropped last.
+ */
+int32_t
+marker_mkdir_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+                 int32_t op_ret, int32_t op_errno, inode_t *inode,
+                 struct iatt *buf, struct iatt *preparent,
+                 struct iatt *postparent, dict_t *xdata)
+{
+    marker_local_t *local = (marker_local_t *)frame->local;
+    marker_conf_t *priv = this->private;
+
+    if (op_ret == -1) {
+        gf_log(this->name, GF_LOG_TRACE,
+               "error occurred "
+               "while creating directory %s",
+               strerror(op_errno));
+    }
+
+    frame->local = NULL;
+
+    if (op_ret >= 0 && inode && (priv->feature_enabled & GF_QUOTA) &&
+        mq_inode_ctx_new(inode, this) == NULL) {
+        gf_log(this->name, GF_LOG_WARNING,
+               "mq_inode_ctx_new "
+               "failed for %s",
+               uuid_utoa(inode->gfid));
+        op_ret = -1;
+        op_errno = ENOMEM;
+    }
+
+    STACK_UNWIND_STRICT(mkdir, frame, op_ret, op_errno, inode, buf, preparent,
+                        postparent, xdata);
+
+    if (op_ret != -1 && local != NULL) {
+        if (gf_uuid_is_null(local->loc.gfid))
+            gf_uuid_copy(local->loc.gfid, buf->ia_gfid);
+
+        if (priv->feature_enabled & GF_QUOTA)
+            mq_create_xattrs_txn(this, &local->loc, NULL);
+
+        if (priv->feature_enabled & GF_XTIME)
+            marker_xtime_update_marks(this, local);
+    }
+
+    marker_local_unref(local);
+
+    return 0;
+}
+
+/*
+ * mkdir fop entry point.
+ *
+ * When no marker feature is enabled the call is wound straight through
+ * without a local. Otherwise a local is allocated, initialised and given a
+ * copy of @loc so that marker_mkdir_cbk() can account for the new directory.
+ *
+ * If the local cannot be allocated or @loc cannot be copied, the fop is
+ * unwound with -1/ENOMEM. Always returns 0.
+ */
+int
+marker_mkdir(call_frame_t *frame, xlator_t *this, loc_t *loc, mode_t mode,
+             mode_t umask, dict_t *xdata)
+{
+    int32_t ret = 0;
+    marker_local_t *local = NULL;
+    marker_conf_t *priv = NULL;
+
+    priv = this->private;
+
+    if (priv->feature_enabled == 0)
+        goto wind;
+
+    local = mem_get0(this->local_pool);
+    if (local == NULL)
+        /* Pool exhausted: fail the fop rather than touch a NULL local. */
+        goto err;
+
+    MARKER_INIT_LOCAL(frame, local);
+
+    ret = loc_copy(&local->loc, loc);
+
+    if (ret == -1)
+        goto err;
+wind:
+    STACK_WIND(frame, marker_mkdir_cbk, FIRST_CHILD(this),
+               FIRST_CHILD(this)->fops->mkdir, loc, mode, umask, xdata);
+
+    return 0;
+err:
+    MARKER_STACK_UNWIND(mkdir, frame, -1, ENOMEM, NULL, NULL, NULL, NULL, NULL);
+
+    return 0;
+}
+
+/*
+ * create callback.
+ *
+ * With quota enabled a quota inode context is created for the new inode
+ * first; if that fails the reply is turned into -1/ENOMEM. The reply is then
+ * unwound, and only afterwards - for a successful create with a local - the
+ * quota xattr creation and the xtime update are started. The local
+ * reference taken by marker_create() is dropped last.
+ */
+int32_t
+marker_create_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+                  int32_t op_ret, int32_t op_errno, fd_t *fd, inode_t *inode,
+                  struct iatt *buf, struct iatt *preparent,
+                  struct iatt *postparent, dict_t *xdata)
+{
+    marker_local_t *local = (marker_local_t *)frame->local;
+    marker_conf_t *priv = this->private;
+
+    if (op_ret == -1) {
+        gf_log(this->name, GF_LOG_TRACE,
+               "error occurred "
+               "while creating file %s",
+               strerror(op_errno));
+    }
+
+    frame->local = NULL;
+
+    if (op_ret >= 0 && inode && (priv->feature_enabled & GF_QUOTA) &&
+        mq_inode_ctx_new(inode, this) == NULL) {
+        gf_log(this->name, GF_LOG_WARNING,
+               "mq_inode_ctx_new "
+               "failed for %s",
+               uuid_utoa(inode->gfid));
+        op_ret = -1;
+        op_errno = ENOMEM;
+    }
+
+    STACK_UNWIND_STRICT(create, frame, op_ret, op_errno, fd, inode, buf,
+                        preparent, postparent, xdata);
+
+    if (op_ret != -1 && local != NULL) {
+        if (gf_uuid_is_null(local->loc.gfid))
+            gf_uuid_copy(local->loc.gfid, buf->ia_gfid);
+
+        if (priv->feature_enabled & GF_QUOTA)
+            mq_create_xattrs_txn(this, &local->loc, buf);
+
+        if (priv->feature_enabled & GF_XTIME)
+            marker_xtime_update_marks(this, local);
+    }
+
+    marker_local_unref(local);
+
+    return 0;
+}
+
+/*
+ * create fop entry point.
+ *
+ * When no marker feature is enabled the call is wound straight through
+ * without a local. Otherwise a local is allocated, initialised and given a
+ * copy of @loc so that marker_create_cbk() can account for the new file.
+ *
+ * If the local cannot be allocated or @loc cannot be copied, the fop is
+ * unwound with -1/ENOMEM. Always returns 0.
+ */
+int32_t
+marker_create(call_frame_t *frame, xlator_t *this, loc_t *loc, int32_t flags,
+              mode_t mode, mode_t umask, fd_t *fd, dict_t *xdata)
+{
+    int32_t ret = 0;
+    marker_local_t *local = NULL;
+    marker_conf_t *priv = NULL;
+
+    priv = this->private;
+
+    if (priv->feature_enabled == 0)
+        goto wind;
+
+    local = mem_get0(this->local_pool);
+    if (local == NULL)
+        /* Pool exhausted: fail the fop rather than touch a NULL local. */
+        goto err;
+
+    MARKER_INIT_LOCAL(frame, local);
+
+    ret = loc_copy(&local->loc, loc);
+
+    if (ret == -1)
+        goto err;
+wind:
+    STACK_WIND(frame, marker_create_cbk, FIRST_CHILD(this),
+               FIRST_CHILD(this)->fops->create, loc, flags, mode, umask, fd,
+               xdata);
+    return 0;
+err:
+    MARKER_STACK_UNWIND(create, frame, -1, ENOMEM, NULL, NULL, NULL, NULL, NULL,
+                        NULL);
+
+    return 0;
+}
+
+/*
+ * writev callback.
+ *
+ * The reply is unwound first. For a successful write that carries a local,
+ * a quota transaction (with the post-write iatt) and the xtime update are
+ * then started, each only if its feature is enabled. The local reference
+ * taken by marker_writev() is dropped last.
+ */
+int32_t
+marker_writev_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+                  int32_t op_ret, int32_t op_errno, struct iatt *prebuf,
+                  struct iatt *postbuf, dict_t *xdata)
+{
+    marker_local_t *local = (marker_local_t *)frame->local;
+    marker_conf_t *priv = NULL;
+
+    if (op_ret == -1) {
+        gf_log(this->name, GF_LOG_TRACE,
+               "error occurred "
+               "while write, %s",
+               strerror(op_errno));
+    }
+
+    frame->local = NULL;
+
+    STACK_UNWIND_STRICT(writev, frame, op_ret, op_errno, prebuf, postbuf,
+                        xdata);
+
+    if (op_ret != -1 && local != NULL) {
+        priv = this->private;
+
+        if (priv->feature_enabled & GF_QUOTA)
+            mq_initiate_quota_txn(this, &local->loc, postbuf);
+
+        if (priv->feature_enabled & GF_XTIME)
+            marker_xtime_update_marks(this, local);
+    }
+
+    marker_local_unref(local);
+
+    return 0;
+}
+
+/*
+ * writev fop entry point.
+ *
+ * When no marker feature is enabled the call is wound straight through
+ * without a local. Otherwise a local is allocated and its loc is filled from
+ * the fd's inode so that marker_writev_cbk() can account for the write.
+ *
+ * If the local cannot be allocated or the loc cannot be filled, the fop is
+ * unwound with -1/ENOMEM. Always returns 0.
+ */
+int32_t
+marker_writev(call_frame_t *frame, xlator_t *this, fd_t *fd,
+              struct iovec *vector, int32_t count, off_t offset, uint32_t flags,
+              struct iobref *iobref, dict_t *xdata)
+{
+    int32_t ret = 0;
+    marker_local_t *local = NULL;
+    marker_conf_t *priv = NULL;
+
+    priv = this->private;
+
+    if (priv->feature_enabled == 0)
+        goto wind;
+
+    local = mem_get0(this->local_pool);
+    if (local == NULL)
+        /* Pool exhausted: fail the fop rather than touch a NULL local. */
+        goto err;
+
+    MARKER_INIT_LOCAL(frame, local);
+
+    ret = marker_inode_loc_fill(fd->inode, &local->loc);
+
+    /* The fill helper hands back whatever negative value its callees
+     * produced, not just -1, so treat every negative result as failure.
+     */
+    if (ret < 0)
+        goto err;
+wind:
+    STACK_WIND(frame, marker_writev_cbk, FIRST_CHILD(this),
+               FIRST_CHILD(this)->fops->writev, fd, vector, count, offset,
+               flags, iobref, xdata);
+    return 0;
+err:
+    MARKER_STACK_UNWIND(writev, frame, -1, ENOMEM, NULL, NULL, NULL);
+
+    return 0;
+}
+
+/*
+ * rmdir callback.
+ *
+ * On failure, or when there is no local, the reply is simply unwound.
+ * Otherwise the xtime update is started (if enabled) and, with quota
+ * enabled, the reply is packed into a stub that is handed to
+ * mq_reduce_parent_size_txn() so the unwind happens only after the parent's
+ * size has been reduced. If the stub could not be created the reply is
+ * unwound from here as usual.
+ *
+ * The local reference taken by marker_rmdir() is dropped on every path.
+ */
+int32_t
+marker_rmdir_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+                 int32_t op_ret, int32_t op_errno, struct iatt *preparent,
+                 struct iatt *postparent, dict_t *xdata)
+{
+    marker_conf_t *priv = NULL;
+    marker_local_t *local = NULL;
+    call_stub_t *stub = NULL;
+
+    if (op_ret == -1) {
+        gf_log(this->name, GF_LOG_TRACE,
+               "error occurred "
+               "rmdir %s",
+               strerror(op_errno));
+    }
+
+    local = (marker_local_t *)frame->local;
+
+    frame->local = NULL;
+    priv = this->private;
+
+    if (op_ret == -1 || local == NULL)
+        goto out;
+
+    if (priv->feature_enabled & GF_XTIME)
+        marker_xtime_update_marks(this, local);
+
+    if (priv->feature_enabled & GF_QUOTA) {
+        /* If a 'rm -rf' is performed by a client, rmdir can be faster
+           than marker's background mq_reduce_parent_size_txn.
+           In that case the parent-child association is removed in the
+           server protocol as part of the rmdir, which can make
+           mq_reduce_parent_size_txn fail.
+
+           So perform mq_reduce_parent_size_txn in the foreground
+           and unwind to the server once the txn is complete.
+         */
+
+        stub = fop_rmdir_cbk_stub(frame, default_rmdir_cbk, op_ret, op_errno,
+                                  preparent, postparent, xdata);
+        mq_reduce_parent_size_txn(this, &local->loc, NULL, 1, stub);
+
+        /* With a stub the reply is no longer unwound from this function. */
+        if (stub) {
+            marker_local_unref(local);
+            return 0;
+        }
+    }
+
+out:
+    STACK_UNWIND_STRICT(rmdir, frame, op_ret, op_errno, preparent, postparent,
+                        xdata);
+
+    marker_local_unref(local);
+
+    return 0;
+}
+
+/*
+ * rmdir fop entry point.
+ *
+ * When no marker feature is enabled the call is wound straight through
+ * without a local. Otherwise a local is allocated, initialised and given a
+ * copy of @loc so that marker_rmdir_cbk() can account for the removal.
+ *
+ * If the local cannot be allocated or @loc cannot be copied, the fop is
+ * unwound with -1/ENOMEM. Always returns 0.
+ */
+int32_t
+marker_rmdir(call_frame_t *frame, xlator_t *this, loc_t *loc, int flags,
+             dict_t *xdata)
+{
+    int32_t ret = 0;
+    marker_local_t *local = NULL;
+    marker_conf_t *priv = NULL;
+
+    priv = this->private;
+
+    if (priv->feature_enabled == 0)
+        goto wind;
+
+    local = mem_get0(this->local_pool);
+    if (local == NULL)
+        /* Pool exhausted: fail the fop rather than touch a NULL local. */
+        goto err;
+
+    MARKER_INIT_LOCAL(frame, local);
+
+    ret = loc_copy(&local->loc, loc);
+
+    if (ret == -1)
+        goto err;
+wind:
+    STACK_WIND(frame, marker_rmdir_cbk, FIRST_CHILD(this),
+               FIRST_CHILD(this)->fops->rmdir, loc, flags, xdata);
+    return 0;
+err:
+    MARKER_STACK_UNWIND(rmdir, frame, -1, ENOMEM, NULL, NULL, NULL);
+
+    return 0;
+}
+
+/*
+ * unlink callback.
+ *
+ * On failure, or when there is no local, the reply is simply unwound.
+ * Otherwise the xtime update is started (if enabled). With quota enabled
+ * and local->skip_txn unset, the link count reported in @xdata under
+ * GF_RESPONSE_LINK_COUNT_XDATA is read and the reply is packed into a stub
+ * that is handed to mq_reduce_parent_size_txn(), so the unwind happens only
+ * after the parent's size has been reduced. If the stub could not be created
+ * the reply is unwound from here as usual.
+ *
+ * The local reference taken by marker_unlink() is dropped on every path.
+ */
+int32_t
+marker_unlink_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+                  int32_t op_ret, int32_t op_errno, struct iatt *preparent,
+                  struct iatt *postparent, dict_t *xdata)
+{
+    marker_conf_t *priv = NULL;
+    marker_local_t *local = NULL;
+    /* Stays at (uint32_t)-1 when the reply carries no link count. */
+    uint32_t nlink = -1;
+    GF_UNUSED int32_t ret = 0;
+    call_stub_t *stub = NULL;
+
+    if (op_ret == -1) {
+        gf_log(this->name, GF_LOG_TRACE, "%s occurred in unlink",
+               strerror(op_errno));
+    }
+
+    local = (marker_local_t *)frame->local;
+
+    frame->local = NULL;
+    priv = this->private;
+
+    if (op_ret == -1 || local == NULL)
+        goto out;
+
+    if (priv->feature_enabled & GF_XTIME)
+        marker_xtime_update_marks(this, local);
+
+    if (priv->feature_enabled & GF_QUOTA) {
+        /* The caller asked for this unlink not to be accounted. */
+        if (local->skip_txn)
+            goto out;
+
+        if (xdata) {
+            ret = dict_get_uint32(xdata, GF_RESPONSE_LINK_COUNT_XDATA, &nlink);
+            if (ret) {
+                gf_log(this->name, GF_LOG_TRACE, "dict get failed %s ",
+                       strerror(-ret));
+            }
+        }
+
+        /* If a 'rm -rf' is performed by a client, unlink can be faster
+           than marker's background mq_reduce_parent_size_txn.
+           In that case the parent-child association is removed in the
+           server protocol as part of the unlink, which can make
+           mq_reduce_parent_size_txn fail.
+
+           So perform mq_reduce_parent_size_txn in the foreground
+           and unwind to the server once the txn is complete.
+        */
+
+        stub = fop_unlink_cbk_stub(frame, default_unlink_cbk, op_ret, op_errno,
+                                   preparent, postparent, xdata);
+        mq_reduce_parent_size_txn(this, &local->loc, NULL, nlink, stub);
+
+        /* With a stub the reply is no longer unwound from this function. */
+        if (stub) {
+            marker_local_unref(local);
+            return 0;
+        }
+    }
+
+out:
+    STACK_UNWIND_STRICT(unlink, frame, op_ret, op_errno, preparent, postparent,
+                        xdata);
+
+    marker_local_unref(local);
+
+    return 0;
+}
+
+/*
+ * unlink fop entry point.
+ *
+ * When no marker feature is enabled the call is wound straight through
+ * without a local. Otherwise a local is allocated that records @xflag, a
+ * reference to @xdata (if any) and a copy of @loc.
+ *
+ * If @xdata carries GLUSTERFS_MARKER_DONT_ACCOUNT_KEY the local is flagged
+ * with skip_txn and the call is wound unchanged. Otherwise the link count is
+ * requested from the lower layers by setting GF_REQUEST_LINK_COUNT_XDATA in
+ * @xdata; a temporary dict is created for this when the caller passed none
+ * and released again before returning.
+ *
+ * Any allocation or dict failure unwinds the fop with -1/ENOMEM.
+ * Always returns 0.
+ */
+int32_t
+marker_unlink(call_frame_t *frame, xlator_t *this, loc_t *loc, int xflag,
+              dict_t *xdata)
+{
+    int32_t ret = 0;
+    marker_local_t *local = NULL;
+    marker_conf_t *priv = NULL;
+    gf_boolean_t dict_free = _gf_false;
+
+    priv = this->private;
+
+    if (priv->feature_enabled == 0)
+        goto unlink_wind;
+
+    local = mem_get0(this->local_pool);
+    if (local == NULL)
+        /* Pool exhausted: fail the fop rather than write through NULL. */
+        goto err;
+
+    local->xflag = xflag;
+    if (xdata)
+        local->xdata = dict_ref(xdata);
+    MARKER_INIT_LOCAL(frame, local);
+
+    ret = loc_copy(&local->loc, loc);
+
+    if (ret == -1)
+        goto err;
+
+    if (xdata && dict_get(xdata, GLUSTERFS_MARKER_DONT_ACCOUNT_KEY)) {
+        local->skip_txn = 1;
+        goto unlink_wind;
+    }
+
+    if (xdata == NULL) {
+        xdata = dict_new();
+        if (xdata == NULL)
+            goto err;
+        /* Only a dict created here is ours to release at the end. */
+        dict_free = _gf_true;
+    }
+
+    ret = dict_set_int32(xdata, GF_REQUEST_LINK_COUNT_XDATA, 1);
+    if (ret < 0)
+        goto err;
+
+unlink_wind:
+    STACK_WIND(frame, marker_unlink_cbk, FIRST_CHILD(this),
+               FIRST_CHILD(this)->fops->unlink, loc, xflag, xdata);
+    goto out;
+
+err:
+    MARKER_STACK_UNWIND(unlink, frame, -1, ENOMEM, NULL, NULL, NULL);
+
+out:
+    if (dict_free)
+        dict_unref(xdata);
+    return 0;
+}
+
+/*
+ * link callback.
+ *
+ * The reply is unwound first. For a successful link that carries a local,
+ * the quota xattr creation (unless local->skip_txn is set) and the xtime
+ * update are then started, each only if its feature is enabled. The local
+ * reference taken by marker_link() is dropped last.
+ */
+int32_t
+marker_link_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+                int32_t op_ret, int32_t op_errno, inode_t *inode,
+                struct iatt *buf, struct iatt *preparent,
+                struct iatt *postparent, dict_t *xdata)
+{
+    marker_local_t *local = (marker_local_t *)frame->local;
+    marker_conf_t *priv = NULL;
+
+    if (op_ret == -1) {
+        gf_log(this->name, GF_LOG_TRACE,
+               "%s occurred while "
+               "linking a file ",
+               strerror(op_errno));
+    }
+
+    frame->local = NULL;
+
+    STACK_UNWIND_STRICT(link, frame, op_ret, op_errno, inode, buf, preparent,
+                        postparent, xdata);
+
+    if (op_ret != -1 && local != NULL) {
+        priv = this->private;
+
+        if ((priv->feature_enabled & GF_QUOTA) && !local->skip_txn)
+            mq_create_xattrs_txn(this, &local->loc, buf);
+
+        if (priv->feature_enabled & GF_XTIME)
+            marker_xtime_update_marks(this, local);
+    }
+
+    marker_local_unref(local);
+
+    return 0;
+}
+
+/*
+ * link fop entry point.
+ *
+ * When no marker feature is enabled the call is wound straight through
+ * without a local. Otherwise a local is allocated, initialised and given a
+ * copy of @newloc; it is flagged with skip_txn when @xdata carries
+ * GLUSTERFS_MARKER_DONT_ACCOUNT_KEY.
+ *
+ * If the local cannot be allocated or @newloc cannot be copied, the fop is
+ * unwound with -1/ENOMEM. Always returns 0.
+ */
+int32_t
+marker_link(call_frame_t *frame, xlator_t *this, loc_t *oldloc, loc_t *newloc,
+            dict_t *xdata)
+{
+    int32_t ret = 0;
+    marker_local_t *local = NULL;
+    marker_conf_t *priv = NULL;
+
+    priv = this->private;
+
+    if (priv->feature_enabled == 0)
+        goto wind;
+
+    local = mem_get0(this->local_pool);
+    if (local == NULL)
+        /* Pool exhausted: fail the fop rather than touch a NULL local. */
+        goto err;
+
+    MARKER_INIT_LOCAL(frame, local);
+
+    ret = loc_copy(&local->loc, newloc);
+
+    if (ret == -1)
+        goto err;
+
+    if (xdata && dict_get(xdata, GLUSTERFS_MARKER_DONT_ACCOUNT_KEY))
+        local->skip_txn = 1;
+wind:
+    STACK_WIND(frame, marker_link_cbk, FIRST_CHILD(this),
+               FIRST_CHILD(this)->fops->link, oldloc, newloc, xdata);
+    return 0;
+err:
+    MARKER_STACK_UNWIND(link, frame, -1, ENOMEM, NULL, NULL, NULL, NULL, NULL);
+
+    return 0;
+}
+
+/*
+ * Callback of the inodelk (unlock) wound on the old parent once a rename
+ * has been unwound; performs the quota and xtime bookkeeping of the rename.
+ *
+ * A failed unlock is only logged. If the rename itself recorded an error in
+ * local->err nothing is accounted. Otherwise:
+ *   - the old parent's size is reduced by the source's contribution,
+ *   - if a destination inode is recorded in local->loc, its parent's size
+ *     is reduced too,
+ *   - quota xattrs are created for the moved inode at its new path,
+ *   - with GF_XTIME enabled, marks are updated for both old and new loc.
+ *
+ * If the new path cannot be duplicated, the xattr creation for the new
+ * location is skipped with a warning.
+ *
+ * The references on both locals are dropped before returning 0.
+ */
+int32_t
+marker_rename_done(call_frame_t *frame, void *cookie, xlator_t *this,
+                   int32_t op_ret, int32_t op_errno, dict_t *xdata)
+{
+    marker_local_t *local = NULL, *oplocal = NULL;
+    loc_t newloc = {
+        0,
+    };
+    marker_conf_t *priv = NULL;
+
+    local = frame->local;
+    oplocal = local->oplocal;
+
+    priv = this->private;
+
+    frame->local = NULL;
+
+    if (op_ret < 0) {
+        gf_log(this->name, GF_LOG_WARNING,
+               "inodelk (UNLOCK) failed on path:%s (gfid:%s) (%s)",
+               oplocal->parent_loc.path,
+               uuid_utoa(oplocal->parent_loc.inode->gfid), strerror(op_errno));
+    }
+
+    if (local->err != 0)
+        goto err;
+
+    mq_reduce_parent_size_txn(this, &oplocal->loc, &oplocal->contribution, -1,
+                              NULL);
+
+    if (local->loc.inode != NULL) {
+        /* If the destination file exists before the rename, it would have
+         * been unlinked while renaming the file
+         */
+        mq_reduce_parent_size_txn(this, &local->loc, NULL, local->ia_nlink,
+                                  NULL);
+    }
+
+    /* Describe the moved inode at its new location. */
+    newloc.inode = inode_ref(oplocal->loc.inode);
+    newloc.parent = inode_ref(local->loc.parent);
+    newloc.path = gf_strdup(local->loc.path);
+    if (newloc.path == NULL) {
+        /* strrchr() on a NULL path would crash; give up on this step. */
+        gf_log(this->name, GF_LOG_WARNING,
+               "could not duplicate destination path; "
+               "skipping quota xattr creation after rename");
+    } else {
+        newloc.name = strrchr(newloc.path, '/');
+        if (newloc.name)
+            newloc.name++;
+
+        mq_create_xattrs_txn(this, &newloc, &local->buf);
+    }
+
+    loc_wipe(&newloc);
+
+    if (priv->feature_enabled & GF_XTIME) {
+        if (!local->loc.inode)
+            local->loc.inode = inode_ref(oplocal->loc.inode);
+        // update marks on oldpath
+        gf_uuid_copy(local->loc.gfid, oplocal->loc.inode->gfid);
+        marker_xtime_update_marks(this, oplocal);
+        marker_xtime_update_marks(this, local);
+    }
+
+err:
+    marker_local_unref(local);
+    marker_local_unref(oplocal);
+
+    return 0;
+}
+
+/*
+ * Release the inodelk held on the old parent during a rename.
+ *
+ * An F_UNLCK request covering the whole range is wound on local->lk_frame
+ * for oplocal->parent_loc; marker_rename_done() receives the reply. If there
+ * is no lock frame, the references on both locals are dropped right away.
+ */
+void
+marker_rename_release_oldp_lock(marker_local_t *local, xlator_t *this)
+{
+    marker_local_t *oplocal = local->oplocal;
+    call_frame_t *lk_frame = local->lk_frame;
+    struct gf_flock unlock = {
+        0,
+    };
+
+    if (lk_frame == NULL) {
+        marker_local_unref(local);
+        marker_local_unref(oplocal);
+        return;
+    }
+
+    unlock.l_type = F_UNLCK;
+    unlock.l_whence = SEEK_SET;
+    unlock.l_start = 0;
+    unlock.l_len = 0;
+    unlock.l_pid = 0;
+
+    STACK_WIND(lk_frame, marker_rename_done, FIRST_CHILD(this),
+               FIRST_CHILD(this)->fops->inodelk, this->name,
+               &oplocal->parent_loc, F_SETLKW, &unlock, NULL);
+}
+
+/*
+ * Deliver the rename reply to the caller and then release the old-parent
+ * lock.
+ *
+ * Used as callback of the removexattr wound by marker_rename_cbk() and also
+ * called directly from its error path. A negative @op_ret is recorded in
+ * local->err (EINVAL when @op_errno is 0).
+ *
+ *   - If a continuation stub was saved in local->stub, the in-memory
+ *     contribution node of the old parent is dropped, the stub is resumed
+ *     and local->err is cleared.
+ *   - Otherwise, if an error is recorded, the rename is unwound with it.
+ *   - Otherwise nothing can be unwound and a critical message is logged.
+ *
+ * Always returns 0.
+ */
+int32_t
+marker_rename_unwind(call_frame_t *frame, void *cookie, xlator_t *this,
+                     int32_t op_ret, int32_t op_errno, dict_t *xdata)
+{
+    marker_local_t *local = NULL;
+    marker_local_t *oplocal = NULL;
+    quota_inode_ctx_t *ctx = NULL;
+    inode_contribution_t *contri = NULL;
+
+    local = frame->local;
+    oplocal = local->oplocal;
+    frame->local = NULL;
+
+    // Reset frame uid and gid if set.
+    if (cookie == (void *)_GF_UID_GID_CHANGED)
+        MARKER_RESET_UID_GID(frame, frame->root, local);
+
+    if (op_ret < 0)
+        local->err = op_errno ? op_errno : EINVAL;
+
+    if (local->stub != NULL) {
+        /* Remove the contribution node from memory even if the
+         * remove-xattr has failed: local->stub being set means the
+         * rename itself was already performed successfully
+         */
+        (void)mq_inode_ctx_get(oplocal->loc.inode, this, &ctx);
+        if (ctx) {
+            contri = mq_get_contribution_node(oplocal->loc.parent, ctx);
+            if (contri) {
+                QUOTA_FREE_CONTRIBUTION_NODE(ctx, contri);
+                GF_REF_PUT(contri);
+            }
+        }
+
+        call_resume(local->stub);
+        local->stub = NULL;
+        local->err = 0;
+    } else if (local->err != 0) {
+        STACK_UNWIND_STRICT(rename, frame, -1, local->err, NULL, NULL, NULL,
+                            NULL, NULL, NULL);
+    } else {
+        gf_log(this->name, GF_LOG_CRITICAL,
+               "continuation stub to unwind the call is absent, hence "
+               "call will be hung (call-stack id = %" PRIu64 ")",
+               frame->root->unique);
+    }
+
+    /* If there are in-progress writes on the old path during the rename
+     * operation, the update txn would update the wrong path if the lock
+     * were released before the rename unwind.
+     * So release the lock only after the rename has been unwound
+     */
+    marker_rename_release_oldp_lock(local, this);
+
+    return 0;
+}
+
+/*
+ * rename callback.
+ *
+ * With quota enabled the reply is not unwound here. It is saved in a stub
+ * (local->stub) and a removexattr of the old parent's contribution key is
+ * wound on the moved inode at its new path; marker_rename_unwind() resumes
+ * the stub when that completes. If the rename failed, there is no local, or
+ * any preparation step fails, marker_rename_unwind() is called directly.
+ *
+ * Without quota the reply is unwound immediately and, for a successful
+ * rename with GF_XTIME enabled, marks are updated on both the old and the
+ * new loc; the references on both locals are dropped here.
+ *
+ * Always returns 0.
+ */
+int32_t
+marker_rename_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+                  int32_t op_ret, int32_t op_errno, struct iatt *buf,
+                  struct iatt *preoldparent, struct iatt *postoldparent,
+                  struct iatt *prenewparent, struct iatt *postnewparent,
+                  dict_t *xdata)
+{
+    marker_conf_t *priv = NULL;
+    marker_local_t *local = NULL;
+    marker_local_t *oplocal = NULL;
+    call_stub_t *stub = NULL;
+    int32_t ret = 0;
+    char contri_key[QUOTA_KEY_MAX] = {
+        0,
+    };
+    loc_t newloc = {
+        0,
+    };
+
+    local = (marker_local_t *)frame->local;
+
+    if (local != NULL) {
+        oplocal = local->oplocal;
+    }
+
+    priv = this->private;
+
+    if (op_ret < 0) {
+        if (local != NULL) {
+            local->err = op_errno;
+        }
+
+        gf_log(this->name, GF_LOG_TRACE,
+               "%s occurred while "
+               "renaming a file ",
+               strerror(op_errno));
+    }
+
+    if (priv->feature_enabled & GF_QUOTA) {
+        if ((op_ret < 0) || (local == NULL)) {
+            goto quota_err;
+        }
+
+        local->ia_nlink = 0;
+        if (xdata)
+            ret = dict_get_uint32(xdata, GF_RESPONSE_LINK_COUNT_XDATA,
+                                  &local->ia_nlink);
+
+        local->buf = *buf;
+        stub = fop_rename_cbk_stub(frame, default_rename_cbk, op_ret, op_errno,
+                                   buf, preoldparent, postoldparent,
+                                   prenewparent, postnewparent, xdata);
+        if (stub == NULL) {
+            local->err = ENOMEM;
+            goto quota_err;
+        }
+
+        local->stub = stub;
+
+        GET_CONTRI_KEY(this, contri_key, oplocal->loc.parent->gfid, ret);
+        if (ret < 0) {
+            local->err = ENOMEM;
+            goto quota_err;
+        }
+
+        /* Build the loc of the moved inode at its new path before
+         * touching the frame's credentials, so that a failure here can
+         * take the same error path as the steps above.
+         */
+        newloc.inode = inode_ref(oplocal->loc.inode);
+        newloc.parent = inode_ref(local->loc.parent);
+        gf_uuid_copy(newloc.gfid, oplocal->loc.inode->gfid);
+        newloc.path = gf_strdup(local->loc.path);
+        if (newloc.path == NULL) {
+            /* strrchr() on a NULL path would crash. */
+            loc_wipe(&newloc);
+            local->err = ENOMEM;
+            goto quota_err;
+        }
+        newloc.name = strrchr(newloc.path, '/');
+        if (newloc.name)
+            newloc.name++;
+
+        /* Removexattr requires uid and gid to be 0,
+         * reset them in the callback.
+         */
+        MARKER_SET_UID_GID(frame, local, frame->root);
+
+        STACK_WIND_COOKIE(
+            frame, marker_rename_unwind, frame->cookie, FIRST_CHILD(this),
+            FIRST_CHILD(this)->fops->removexattr, &newloc, contri_key, NULL);
+
+        loc_wipe(&newloc);
+    } else {
+        frame->local = NULL;
+
+        STACK_UNWIND_STRICT(rename, frame, op_ret, op_errno, buf, preoldparent,
+                            postoldparent, prenewparent, postnewparent, xdata);
+
+        if ((op_ret < 0) || (local == NULL)) {
+            goto out;
+        }
+
+        if (priv->feature_enabled & GF_XTIME) {
+            // update marks on oldpath
+            if (!local->loc.inode)
+                local->loc.inode = inode_ref(oplocal->loc.inode);
+            gf_uuid_copy(local->loc.gfid, oplocal->loc.inode->gfid);
+            marker_xtime_update_marks(this, oplocal);
+            marker_xtime_update_marks(this, local);
+        }
+    }
+
+out:
+    /* With quota the locals are released later in the rename chain. */
+    if (!(priv->feature_enabled & GF_QUOTA)) {
+        marker_local_unref(local);
+        marker_local_unref(oplocal);
+    }
+
+    return 0;
+
+quota_err:
+    marker_rename_unwind(frame, NULL, this, 0, 0, NULL);
+    return 0;
+}
+
+/* getxattr callback of the rename sequence: stores the contribution read
+ * from @dict for the old parent in oplocal->contribution and then winds the
+ * actual rename with marker_rename_cbk() as callback.
+ *
+ * A getxattr failure other than ENOATTR/ENODATA, or a failure to build the
+ * contribution key, records an errno in local->err and finishes through
+ * marker_rename_unwind(). Always returns 0.
+ */
+int32_t
+marker_do_rename(call_frame_t *frame, void *cookie, xlator_t *this,
+                 int32_t op_ret, int32_t op_errno, dict_t *dict, dict_t *xdata)
+{
+    marker_local_t *dst_local = frame->local;
+    marker_local_t *src_local = dst_local->oplocal;
+    quota_meta_t fetched = {0};
+    char key[QUOTA_KEY_MAX] = {0};
+    int key_len = 0;
+
+    // Reset frame uid and gid if set.
+    if (cookie == (void *)_GF_UID_GID_CHANGED)
+        MARKER_RESET_UID_GID(frame, frame->root, dst_local);
+
+    /* ENOATTR / ENODATA are tolerated; any other failure aborts. */
+    if (op_ret < 0 && !(op_errno == ENOATTR || op_errno == ENODATA)) {
+        if (op_errno)
+            dst_local->err = op_errno;
+        else
+            dst_local->err = EINVAL;
+
+        gf_log(this->name, GF_LOG_WARNING,
+               "fetching contribution values from %s (gfid:%s) "
+               "failed (%s)",
+               src_local->loc.path, uuid_utoa(src_local->loc.inode->gfid),
+               strerror(op_errno));
+        goto unwind;
+    }
+
+    GET_CONTRI_KEY(this, key, src_local->loc.parent->gfid, key_len);
+    if (key_len < 0) {
+        if (errno)
+            dst_local->err = errno;
+        else
+            dst_local->err = ENOMEM;
+        goto unwind;
+    }
+
+    /* Remember the contribution on the source local for later steps. */
+    quota_dict_get_meta(dict, key, key_len, &fetched);
+    src_local->contribution = fetched;
+
+    STACK_WIND(frame, marker_rename_cbk, FIRST_CHILD(this),
+               FIRST_CHILD(this)->fops->rename, &src_local->loc,
+               &dst_local->loc, dst_local->xdata);
+    return 0;
+
+unwind:
+    marker_rename_unwind(frame, NULL, this, 0, 0, NULL);
+    return 0;
+}
+
+/* inodelk callback of the rename sequence (runs on the lock frame).
+ *
+ * On a successful lock, winds a getxattr for the contribution key of the old
+ * parent on the source loc, using the original fop frame and
+ * marker_do_rename() as callback. If the lock failed, the lock frame is
+ * destroyed; on any failure an errno is recorded in local->err and the
+ * sequence finishes through marker_rename_unwind(). Always returns 0.
+ */
+int32_t
+marker_get_oldpath_contribution(call_frame_t *lk_frame, void *cookie,
+                                xlator_t *this, int32_t op_ret,
+                                int32_t op_errno, dict_t *xdata)
+{
+    marker_local_t *dst_local = lk_frame->local;
+    marker_local_t *src_local = dst_local->oplocal;
+    call_frame_t *frame = dst_local->frame;
+    char key[QUOTA_KEY_MAX] = {0};
+    int32_t key_ret = 0;
+
+    if (op_ret < 0) {
+        if (op_errno)
+            dst_local->err = op_errno;
+        else
+            dst_local->err = EINVAL;
+
+        gf_log(this->name, GF_LOG_WARNING,
+               "cannot hold inodelk on %s (gfid:%s) (%s)", src_local->loc.path,
+               uuid_utoa(src_local->loc.inode->gfid), strerror(op_errno));
+
+        /* No lock was taken, so the lock frame is no longer needed. */
+        if (dst_local->lk_frame) {
+            STACK_DESTROY(dst_local->lk_frame->root);
+            dst_local->lk_frame = NULL;
+        }
+        goto unwind;
+    }
+
+    GET_CONTRI_KEY(this, key, src_local->loc.parent->gfid, key_ret);
+    if (key_ret < 0) {
+        if (errno)
+            dst_local->err = errno;
+        else
+            dst_local->err = ENOMEM;
+        goto unwind;
+    }
+
+    /* getxattr requires uid and gid to be 0,
+     * reset them in the callback.
+     */
+    MARKER_SET_UID_GID(frame, dst_local, frame->root);
+
+    /* Make sure the source loc carries a gfid before winding. */
+    if (gf_uuid_is_null(src_local->loc.gfid))
+        gf_uuid_copy(src_local->loc.gfid, src_local->loc.inode->gfid);
+
+    GF_UUID_ASSERT(src_local->loc.gfid);
+
+    STACK_WIND_COOKIE(frame, marker_do_rename, frame->cookie, FIRST_CHILD(this),
+                      FIRST_CHILD(this)->fops->getxattr, &src_local->loc, key,
+                      NULL);
+    return 0;
+
+unwind:
+    marker_rename_unwind(frame, NULL, this, 0, 0, NULL);
+    return 0;
+}
+
+/* For a marker_rename FOP, following is the algorithm used for Quota
+ * accounting. The use-case considered is:
+ * 1. rename (src, dst)
+ * 2. both src and dst exist
+ * 3. there are parallel operations on src and dst (lets say through fds
+ *    opened on them before rename was initiated).
+ *
+ * PS: We've not thought through whether this algo works in the presence of
+ *     hardlinks to src and/or dst.
+ *
+ * Algorithm:
+ * ==========
+ *
+ * 1) set inodelk on src-parent
+ *    As part of rename operation, parent can change for the file.
+ *    We need to remove contribution (both on disk xattr and in-memory one)
+ *    to src-parent (and its ancestors) and add the contribution to dst-parent
+ *    (and its ancestors). While we are doing these operations, contribution of
+ *    the file/directory shouldn't be changing as we want to be sure that
+ *      a) what we subtract from src-parent is exactly what we add to dst-parent
+ *      b) we should subtract from src-parent exactly what we contributed to
+ *         src-parent
+ *    So, we hold a lock on src-parent to block any parallel transactions on
+ *    src-inode (since that's the one which survives rename).
+ *
+ *    If there are any parallel transactions on dst-inode they keep succeeding
+ *    till the association of dst-inode with dst-parent is broken because of an
+ *    inode_rename after unwind of rename fop from marker. Only after unwind
+ *    (and hence inode_rename), we delete and subtract the contribution of
+ *    dst-inode to dst-parent. That way we are making sure we subtract exactly
+ *    what dst-inode contributed to dst-parent.
+ *
+ * 2) lookup contribution to src-parent on src-inode.
+ *    We need to save the contribution info for use at step-8.
+ *
+ * 3) wind rename
+ *    Perform rename on disk
+ *
+ * 4) remove xattr on src-loc
+ *    After rename, parent can change, so
+ *    need to remove xattrs storing contribution to src-parent.
+ *
+ * 5) remove contribution node corresponding to src-parent from the in-memory
+ *    list.
+ *    After rename, contri gfid can change and we have
+ *    also removed xattr from file.
+ *    We need to remove the in-memory contribution node to prevent updates to
+ *    src-parent even after a successful rename.
+ *
+ * 6) unwind rename
+ *    This will ensure that rename is done in the server
+ *    inode table. An inode_rename disassociates src-inode from src-parent and
+ *    associates it with dst-parent. It also disassociates dst-inode from
+ *    dst-parent. After inode_rename, inode_parent on src-inode will give
+ *    dst-parent and inode_parent on dst-inode will return NULL (assuming
+ *    dst-inode doesn't have any hardlinks).
+ *
+ * 7) release inodelk on src-parent
+ *    Lock on src-parent should be released only after
+ *    rename on disk, remove xattr and rename_unwind (and hence inode_rename)
+ *    operations. If lock is released before inode_rename, a parallel
+ *    transaction on src-inode can still update src-parent (as inode_parent on
+ *    src-inode can still return src-parent). This would make the
+ *    contribution from src-inode to src-parent stored in step-2 stale.
+ *
+ * 8) Initiate mq_reduce_parent_size_txn on src-parent to remove contribution
+ *    of src-inode to src-parent. We use the contribution stored in step-2.
+ *    Since, we had acquired the lock on src-parent all along step-2 through
+ *    inode_rename, we can be sure that a parallel transaction wouldn't have
+ *    added a delta to src-parent.
+ *
+ * 9) Initiate mq_reduce_parent_size_txn on dst-parent if dst-inode exists.
+ *    The size reduced from dst-parent and its ancestors is the
+ *    size stored as contribution to dst-parent in dst-inode.
+ *    If the destination file had existed, rename will unlink the
+ *    destination file as part of its operation.
+ *    We need to reduce the size on the dest parent similarly to
+ *    unlink. Since, we are initiating reduce-parent-size transaction after
+ *    inode_rename, we can be sure that a parallel transaction wouldn't add
+ *    delta to dst-parent while we are reducing the contribution of dst-inode
+ *    from its ancestors before rename.
+ *
+ * 10) create contribution xattr to dst-parent on src-inode.
+ */
+/* Rename fop entry point.
+ *
+ * With no marker feature enabled the rename is wound straight through.
+ * Otherwise two locals are set up: `local` holds a copy of @newloc and
+ * `oplocal` a copy of @oldloc. Without quota the rename is then wound
+ * directly; with quota an inodelk on the old parent is requested first on a
+ * separate lock frame, and marker_get_oldpath_contribution() continues the
+ * sequence. Any setup failure unwinds the fop with ENOMEM.
+ *
+ * Always returns 0.
+ */
+int32_t
+marker_rename(call_frame_t *frame, xlator_t *this, loc_t *oldloc, loc_t *newloc,
+              dict_t *xdata)
+{
+    int32_t ret = 0;
+    marker_local_t *local = NULL;
+    marker_local_t *oplocal = NULL;
+    marker_conf_t *priv = NULL;
+    struct gf_flock lock = {
+        0,
+    };
+
+    priv = this->private;
+
+    if (priv->feature_enabled == 0)
+        goto rename_wind;
+
+    /* mem_get0() can fail; never hand a NULL local to MARKER_INIT_LOCAL. */
+    local = mem_get0(this->local_pool);
+    if (local == NULL)
+        goto err;
+
+    MARKER_INIT_LOCAL(frame, local);
+
+    oplocal = mem_get0(this->local_pool);
+    if (oplocal == NULL)
+        goto err;
+
+    MARKER_INIT_LOCAL(frame, oplocal);
+
+    /* The frame carries the destination local; the source local hangs
+     * off it with its own reference.
+     */
+    frame->local = local;
+
+    local->oplocal = marker_local_ref(oplocal);
+
+    ret = loc_copy(&local->loc, newloc);
+    if (ret < 0)
+        goto err;
+
+    ret = loc_copy(&oplocal->loc, oldloc);
+    if (ret < 0)
+        goto err;
+
+    if (!(priv->feature_enabled & GF_QUOTA)) {
+        goto rename_wind;
+    }
+
+    ret = mq_inode_loc_fill(NULL, newloc->parent, &local->parent_loc);
+    if (ret < 0)
+        goto err;
+
+    ret = mq_inode_loc_fill(NULL, oldloc->parent, &oplocal->parent_loc);
+    if (ret < 0)
+        goto err;
+
+    /* Write lock requested with start 0 and length 0. */
+    lock.l_len = 0;
+    lock.l_start = 0;
+    lock.l_type = F_WRLCK;
+    lock.l_whence = SEEK_SET;
+
+    /* Ask for the link count in the rename reply. */
+    local->xdata = xdata ? dict_ref(xdata) : dict_new();
+    if (local->xdata == NULL)
+        goto err;
+
+    ret = dict_set_int32(local->xdata, GF_REQUEST_LINK_COUNT_XDATA, 1);
+    if (ret < 0)
+        goto err;
+
+    local->frame = frame;
+    local->lk_frame = create_frame(this, this->ctx->pool);
+    if (local->lk_frame == NULL)
+        goto err;
+
+    /* The lock is taken as uid/gid 0 with the lock frame's root as owner. */
+    local->lk_frame->root->uid = 0;
+    local->lk_frame->root->gid = 0;
+    local->lk_frame->local = local;
+    set_lk_owner_from_ptr(&local->lk_frame->root->lk_owner,
+                          local->lk_frame->root);
+
+    STACK_WIND(local->lk_frame, marker_get_oldpath_contribution,
+               FIRST_CHILD(this), FIRST_CHILD(this)->fops->inodelk, this->name,
+               &oplocal->parent_loc, F_SETLKW, &lock, NULL);
+
+    return 0;
+
+rename_wind:
+    STACK_WIND(frame, marker_rename_cbk, FIRST_CHILD(this),
+               FIRST_CHILD(this)->fops->rename, oldloc, newloc, xdata);
+
+    return 0;
+err:
+    MARKER_STACK_UNWIND(rename, frame, -1, ENOMEM, NULL, NULL, NULL, NULL, NULL,
+                        NULL);
+    marker_local_unref(oplocal);
+
+    return 0;
+}
+
+/* Truncate callback: unwinds the reply first, then runs marker accounting
+ * on the local saved by marker_truncate().
+ *
+ * On success with a local present, quota (if enabled) initiates a quota
+ * transaction and xtime (if enabled) updates the marks. The reference on the
+ * local is dropped in every case. Always returns 0.
+ */
+int32_t
+marker_truncate_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+                    int32_t op_ret, int32_t op_errno, struct iatt *prebuf,
+                    struct iatt *postbuf, dict_t *xdata)
+{
+    marker_local_t *mlocal = (marker_local_t *)frame->local;
+    marker_conf_t *conf = NULL;
+    struct iatt *txn_buf = NULL;
+
+    /* Detach the local from the frame before unwinding. */
+    frame->local = NULL;
+
+    if (op_ret == -1) {
+        gf_log(this->name, GF_LOG_TRACE,
+               "%s occurred while truncating a file ", strerror(op_errno));
+    }
+
+    STACK_UNWIND_STRICT(truncate, frame, op_ret, op_errno, prebuf, postbuf,
+                        xdata);
+
+    if (op_ret != -1 && mlocal != NULL) {
+        conf = this->private;
+
+        if (conf->feature_enabled & GF_QUOTA) {
+            /* At the end of a migration the DHT rebalance process first
+             * turns the src file into a linkto file and then truncates
+             * it, which leaves the already accounted contri behind. The
+             * linkto file therefore has to be accounted on truncate:
+             * passing NULL instead of postbuf makes mq_prevalidate skip
+             * its linkto check. ftruncate is handled the same way.
+             */
+            if (postbuf && IS_DHT_LINKFILE_MODE(postbuf))
+                txn_buf = NULL;
+            else
+                txn_buf = postbuf;
+
+            mq_initiate_quota_txn(this, &mlocal->loc, txn_buf);
+        }
+
+        if (conf->feature_enabled & GF_XTIME)
+            marker_xtime_update_marks(this, mlocal);
+    }
+
+    marker_local_unref(mlocal);
+
+    return 0;
+}
+
+/* Truncate fop entry point.
+ *
+ * When any marker feature is enabled, a local holding a copy of @loc is
+ * initialised on the frame for marker_truncate_cbk(); otherwise the call is
+ * wound straight through. A failed allocation or loc copy unwinds the fop
+ * with ENOMEM. Always returns 0.
+ */
+int32_t
+marker_truncate(call_frame_t *frame, xlator_t *this, loc_t *loc, off_t offset,
+                dict_t *xdata)
+{
+    marker_local_t *local = NULL;
+    marker_conf_t *priv = this->private;
+
+    if (priv->feature_enabled != 0) {
+        /* mem_get0() can fail; never hand a NULL local to
+         * MARKER_INIT_LOCAL.
+         */
+        local = mem_get0(this->local_pool);
+        if (local == NULL)
+            goto err;
+
+        MARKER_INIT_LOCAL(frame, local);
+
+        if (loc_copy(&local->loc, loc) == -1)
+            goto err;
+    }
+
+    STACK_WIND(frame, marker_truncate_cbk, FIRST_CHILD(this),
+               FIRST_CHILD(this)->fops->truncate, loc, offset, xdata);
+    return 0;
+err:
+    MARKER_STACK_UNWIND(truncate, frame, -1, ENOMEM, NULL, NULL, NULL);
+
+    return 0;
+}
+
+/* Ftruncate callback: unwinds the reply first, then runs marker accounting
+ * on the local saved by marker_ftruncate().
+ *
+ * On success with a local present, quota (if enabled) initiates a quota
+ * transaction (with a NULL buf when postbuf has the DHT linkfile mode) and
+ * xtime (if enabled) updates the marks. The reference on the local is
+ * dropped in every case. Always returns 0.
+ */
+int32_t
+marker_ftruncate_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+                     int32_t op_ret, int32_t op_errno, struct iatt *prebuf,
+                     struct iatt *postbuf, dict_t *xdata)
+{
+    marker_local_t *mlocal = (marker_local_t *)frame->local;
+    marker_conf_t *conf = NULL;
+    struct iatt *txn_buf = NULL;
+
+    /* Detach the local from the frame before unwinding. */
+    frame->local = NULL;
+
+    if (op_ret == -1) {
+        gf_log(this->name, GF_LOG_TRACE,
+               "%s occurred while truncating a file ", strerror(op_errno));
+    }
+
+    STACK_UNWIND_STRICT(ftruncate, frame, op_ret, op_errno, prebuf, postbuf,
+                        xdata);
+
+    if (op_ret != -1 && mlocal != NULL) {
+        conf = this->private;
+
+        if (conf->feature_enabled & GF_QUOTA) {
+            if (postbuf && IS_DHT_LINKFILE_MODE(postbuf))
+                txn_buf = NULL;
+            else
+                txn_buf = postbuf;
+
+            mq_initiate_quota_txn(this, &mlocal->loc, txn_buf);
+        }
+
+        if (conf->feature_enabled & GF_XTIME)
+            marker_xtime_update_marks(this, mlocal);
+    }
+
+    marker_local_unref(mlocal);
+
+    return 0;
+}
+
+/* Ftruncate fop entry point.
+ *
+ * When any marker feature is enabled, a local whose loc is filled from
+ * fd->inode is initialised on the frame for marker_ftruncate_cbk();
+ * otherwise the call is wound straight through. A failed allocation or loc
+ * fill unwinds the fop with ENOMEM. Always returns 0.
+ */
+int32_t
+marker_ftruncate(call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset,
+                 dict_t *xdata)
+{
+    marker_local_t *local = NULL;
+    marker_conf_t *priv = this->private;
+
+    if (priv->feature_enabled != 0) {
+        /* mem_get0() can fail; never hand a NULL local to
+         * MARKER_INIT_LOCAL.
+         */
+        local = mem_get0(this->local_pool);
+        if (local == NULL)
+            goto err;
+
+        MARKER_INIT_LOCAL(frame, local);
+
+        if (marker_inode_loc_fill(fd->inode, &local->loc) == -1)
+            goto err;
+    }
+
+    STACK_WIND(frame, marker_ftruncate_cbk, FIRST_CHILD(this),
+               FIRST_CHILD(this)->fops->ftruncate, fd, offset, xdata);
+    return 0;
+err:
+    MARKER_STACK_UNWIND(ftruncate, frame, -1, ENOMEM, NULL, NULL, NULL);
+
+    return 0;
+}
+
+/* Symlink callback.
+ *
+ * With quota enabled and a successful reply carrying an inode, a quota inode
+ * context is created first; if that fails the reply is turned into a
+ * failure with ENOMEM. The reply is then unwound. On success with a local
+ * present, the local's gfid is filled from @buf if still unset, quota (if
+ * enabled) creates its xattrs and xtime (if enabled) updates the marks.
+ * The reference on the local is dropped in every case. Always returns 0.
+ */
+int32_t
+marker_symlink_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+                   int32_t op_ret, int32_t op_errno, inode_t *inode,
+                   struct iatt *buf, struct iatt *preparent,
+                   struct iatt *postparent, dict_t *xdata)
+{
+    marker_local_t *mlocal = (marker_local_t *)frame->local;
+    marker_conf_t *conf = this->private;
+    int quota_on = (conf->feature_enabled & GF_QUOTA) ? 1 : 0;
+
+    /* Detach the local from the frame before unwinding. */
+    frame->local = NULL;
+
+    if (op_ret == -1) {
+        gf_log(this->name, GF_LOG_TRACE,
+               "%s occurred while creating symlinks ", strerror(op_errno));
+    }
+
+    if (op_ret >= 0 && inode && quota_on) {
+        if (mq_inode_ctx_new(inode, this) == NULL) {
+            gf_log(this->name, GF_LOG_WARNING, "mq_inode_ctx_new failed for %s",
+                   uuid_utoa(inode->gfid));
+            op_ret = -1;
+            op_errno = ENOMEM;
+        }
+    }
+
+    STACK_UNWIND_STRICT(symlink, frame, op_ret, op_errno, inode, buf, preparent,
+                        postparent, xdata);
+
+    if (op_ret != -1 && mlocal != NULL) {
+        if (gf_uuid_is_null(mlocal->loc.gfid))
+            gf_uuid_copy(mlocal->loc.gfid, buf->ia_gfid);
+
+        if (quota_on)
+            mq_create_xattrs_txn(this, &mlocal->loc, buf);
+
+        if (conf->feature_enabled & GF_XTIME)
+            marker_xtime_update_marks(this, mlocal);
+    }
+
+    marker_local_unref(mlocal);
+
+    return 0;
+}
+
+/* Symlink fop entry point.
+ *
+ * When any marker feature is enabled, a local holding a copy of @loc is
+ * initialised on the frame for marker_symlink_cbk(); otherwise the call is
+ * wound straight through. A failed allocation or loc copy unwinds the fop
+ * with ENOMEM. Always returns 0.
+ */
+int
+marker_symlink(call_frame_t *frame, xlator_t *this, const char *linkpath,
+               loc_t *loc, mode_t umask, dict_t *xdata)
+{
+    marker_local_t *local = NULL;
+    marker_conf_t *priv = this->private;
+
+    if (priv->feature_enabled != 0) {
+        /* mem_get0() can fail; never hand a NULL local to
+         * MARKER_INIT_LOCAL.
+         */
+        local = mem_get0(this->local_pool);
+        if (local == NULL)
+            goto err;
+
+        MARKER_INIT_LOCAL(frame, local);
+
+        if (loc_copy(&local->loc, loc) == -1)
+            goto err;
+    }
+
+    STACK_WIND(frame, marker_symlink_cbk, FIRST_CHILD(this),
+               FIRST_CHILD(this)->fops->symlink, linkpath, loc, umask, xdata);
+    return 0;
+err:
+    MARKER_STACK_UNWIND(symlink, frame, -1, ENOMEM, NULL, NULL, NULL, NULL,
+                        NULL);
+
+    return 0;
+}
+
+/* Mknod callback.
+ *
+ * With quota enabled and a successful reply carrying an inode, a quota inode
+ * context is created first; if that fails the reply is turned into a
+ * failure with ENOMEM. The reply is then unwound. On success with a local
+ * present, the local's gfid is filled from @buf if still unset, quota (if
+ * enabled, and only when the saved mode is a regular file) creates its
+ * xattrs and xtime (if enabled) updates the marks. The reference on the
+ * local is dropped in every case. Always returns 0.
+ */
+int32_t
+marker_mknod_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+                 int32_t op_ret, int32_t op_errno, inode_t *inode,
+                 struct iatt *buf, struct iatt *preparent,
+                 struct iatt *postparent, dict_t *xdata)
+{
+    marker_local_t *mlocal = (marker_local_t *)frame->local;
+    marker_conf_t *conf = this->private;
+    int quota_on = (conf->feature_enabled & GF_QUOTA) ? 1 : 0;
+
+    /* Detach the local from the frame before unwinding. */
+    frame->local = NULL;
+
+    if (op_ret == -1) {
+        gf_log(this->name, GF_LOG_TRACE, "%s occurred with mknod ",
+               strerror(op_errno));
+    }
+
+    if (op_ret >= 0 && inode && quota_on) {
+        if (mq_inode_ctx_new(inode, this) == NULL) {
+            gf_log(this->name, GF_LOG_WARNING, "mq_inode_ctx_new failed for %s",
+                   uuid_utoa(inode->gfid));
+            op_ret = -1;
+            op_errno = ENOMEM;
+        }
+    }
+
+    STACK_UNWIND_STRICT(mknod, frame, op_ret, op_errno, inode, buf, preparent,
+                        postparent, xdata);
+
+    if (op_ret != -1 && mlocal != NULL) {
+        if (gf_uuid_is_null(mlocal->loc.gfid))
+            gf_uuid_copy(mlocal->loc.gfid, buf->ia_gfid);
+
+        if (quota_on && S_ISREG(mlocal->mode))
+            mq_create_xattrs_txn(this, &mlocal->loc, buf);
+
+        if (conf->feature_enabled & GF_XTIME)
+            marker_xtime_update_marks(this, mlocal);
+    }
+
+    marker_local_unref(mlocal);
+
+    return 0;
+}
+
+/* Mknod fop entry point.
+ *
+ * When any marker feature is enabled, a local holding @mode and a copy of
+ * @loc is initialised on the frame for marker_mknod_cbk(); otherwise the
+ * call is wound straight through. A failed allocation or loc copy unwinds
+ * the fop with ENOMEM. Always returns 0.
+ */
+int
+marker_mknod(call_frame_t *frame, xlator_t *this, loc_t *loc, mode_t mode,
+             dev_t rdev, mode_t umask, dict_t *xdata)
+{
+    marker_local_t *local = NULL;
+    marker_conf_t *priv = this->private;
+
+    if (priv->feature_enabled != 0) {
+        /* mem_get0() can fail; never hand a NULL local to
+         * MARKER_INIT_LOCAL.
+         */
+        local = mem_get0(this->local_pool);
+        if (local == NULL)
+            goto err;
+
+        MARKER_INIT_LOCAL(frame, local);
+
+        /* The callback reads the mode back from the local. */
+        local->mode = mode;
+
+        if (loc_copy(&local->loc, loc) == -1)
+            goto err;
+    }
+
+    STACK_WIND(frame, marker_mknod_cbk, FIRST_CHILD(this),
+               FIRST_CHILD(this)->fops->mknod, loc, mode, rdev, umask, xdata);
+    return 0;
+err:
+    MARKER_STACK_UNWIND(mknod, frame, -1, ENOMEM, NULL, NULL, NULL, NULL, NULL);
+
+    return 0;
+}
+
+/* Fallocate callback: unwinds the reply first, then runs marker accounting
+ * on the local saved by marker_fallocate().
+ *
+ * On success with a local present, quota (if enabled) initiates a quota
+ * transaction with @postbuf and xtime (if enabled) updates the marks. The
+ * reference on the local is dropped in every case. Always returns 0.
+ */
+int32_t
+marker_fallocate_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+                     int32_t op_ret, int32_t op_errno, struct iatt *prebuf,
+                     struct iatt *postbuf, dict_t *xdata)
+{
+    marker_local_t *mlocal = (marker_local_t *)frame->local;
+    marker_conf_t *conf = NULL;
+
+    /* Detach the local from the frame before unwinding. */
+    frame->local = NULL;
+
+    if (op_ret == -1) {
+        gf_log(this->name, GF_LOG_TRACE,
+               "%s occurred while fallocating a file ", strerror(op_errno));
+    }
+
+    STACK_UNWIND_STRICT(fallocate, frame, op_ret, op_errno, prebuf, postbuf,
+                        xdata);
+
+    if (op_ret != -1 && mlocal != NULL) {
+        conf = this->private;
+
+        if (conf->feature_enabled & GF_QUOTA)
+            mq_initiate_quota_txn(this, &mlocal->loc, postbuf);
+
+        if (conf->feature_enabled & GF_XTIME)
+            marker_xtime_update_marks(this, mlocal);
+    }
+
+    marker_local_unref(mlocal);
+
+    return 0;
+}
+
+/* Fallocate fop entry point.
+ *
+ * When any marker feature is enabled, a local whose loc is filled from
+ * fd->inode is initialised on the frame for marker_fallocate_cbk();
+ * otherwise the call is wound straight through. A failed allocation or loc
+ * fill unwinds the fop with ENOMEM. Always returns 0.
+ */
+int32_t
+marker_fallocate(call_frame_t *frame, xlator_t *this, fd_t *fd, int32_t mode,
+                 off_t offset, size_t len, dict_t *xdata)
+{
+    marker_local_t *local = NULL;
+    marker_conf_t *priv = this->private;
+
+    if (priv->feature_enabled != 0) {
+        /* mem_get0() can fail; never hand a NULL local to
+         * MARKER_INIT_LOCAL.
+         */
+        local = mem_get0(this->local_pool);
+        if (local == NULL)
+            goto err;
+
+        MARKER_INIT_LOCAL(frame, local);
+
+        if (marker_inode_loc_fill(fd->inode, &local->loc) == -1)
+            goto err;
+    }
+
+    STACK_WIND(frame, marker_fallocate_cbk, FIRST_CHILD(this),
+               FIRST_CHILD(this)->fops->fallocate, fd, mode, offset, len,
+               xdata);
+    return 0;
+err:
+    MARKER_STACK_UNWIND(fallocate, frame, -1, ENOMEM, NULL, NULL, NULL);
+
+    return 0;
+}
+
+/* Discard callback: unwinds the reply first, then runs marker accounting on
+ * the local saved by marker_discard().
+ *
+ * On success with a local present, quota (if enabled) initiates a quota
+ * transaction with @postbuf and xtime (if enabled) updates the marks. The
+ * reference on the local is dropped in every case. Always returns 0.
+ */
+int32_t
+marker_discard_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+                   int32_t op_ret, int32_t op_errno, struct iatt *prebuf,
+                   struct iatt *postbuf, dict_t *xdata)
+{
+    marker_local_t *mlocal = (marker_local_t *)frame->local;
+    marker_conf_t *conf = NULL;
+
+    /* Detach the local from the frame before unwinding. */
+    frame->local = NULL;
+
+    if (op_ret == -1) {
+        gf_log(this->name, GF_LOG_TRACE, "%s occurred during discard",
+               strerror(op_errno));
+    }
+
+    STACK_UNWIND_STRICT(discard, frame, op_ret, op_errno, prebuf, postbuf,
+                        xdata);
+
+    if (op_ret != -1 && mlocal != NULL) {
+        conf = this->private;
+
+        if (conf->feature_enabled & GF_QUOTA)
+            mq_initiate_quota_txn(this, &mlocal->loc, postbuf);
+
+        if (conf->feature_enabled & GF_XTIME)
+            marker_xtime_update_marks(this, mlocal);
+    }
+
+    marker_local_unref(mlocal);
+
+    return 0;
+}
+
+/* Discard fop entry point.
+ *
+ * When any marker feature is enabled, a local whose loc is filled from
+ * fd->inode is initialised on the frame for marker_discard_cbk(); otherwise
+ * the call is wound straight through. A failed allocation or loc fill
+ * unwinds the fop with ENOMEM. Always returns 0.
+ */
+int32_t
+marker_discard(call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset,
+               size_t len, dict_t *xdata)
+{
+    marker_local_t *local = NULL;
+    marker_conf_t *priv = this->private;
+
+    if (priv->feature_enabled != 0) {
+        /* mem_get0() can fail; never hand a NULL local to
+         * MARKER_INIT_LOCAL.
+         */
+        local = mem_get0(this->local_pool);
+        if (local == NULL)
+            goto err;
+
+        MARKER_INIT_LOCAL(frame, local);
+
+        if (marker_inode_loc_fill(fd->inode, &local->loc) == -1)
+            goto err;
+    }
+
+    STACK_WIND(frame, marker_discard_cbk, FIRST_CHILD(this),
+               FIRST_CHILD(this)->fops->discard, fd, offset, len, xdata);
+    return 0;
+err:
+    MARKER_STACK_UNWIND(discard, frame, -1, ENOMEM, NULL, NULL, NULL);
+
+    return 0;
+}
+
+/* Zerofill callback: unwinds the reply first, then runs marker accounting
+ * on the local saved by marker_zerofill().
+ *
+ * On success with a local present, quota (if enabled) initiates a quota
+ * transaction with @postbuf and xtime (if enabled) updates the marks. The
+ * reference on the local is dropped in every case. Always returns 0.
+ */
+int32_t
+marker_zerofill_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+                    int32_t op_ret, int32_t op_errno, struct iatt *prebuf,
+                    struct iatt *postbuf, dict_t *xdata)
+{
+    marker_local_t *mlocal = (marker_local_t *)frame->local;
+    marker_conf_t *conf = NULL;
+
+    /* Detach the local from the frame before unwinding. */
+    frame->local = NULL;
+
+    if (op_ret == -1) {
+        gf_log(this->name, GF_LOG_TRACE, "%s occurred during zerofill",
+               strerror(op_errno));
+    }
+
+    STACK_UNWIND_STRICT(zerofill, frame, op_ret, op_errno, prebuf, postbuf,
+                        xdata);
+
+    if (op_ret != -1 && mlocal != NULL) {
+        conf = this->private;
+
+        if (conf->feature_enabled & GF_QUOTA)
+            mq_initiate_quota_txn(this, &mlocal->loc, postbuf);
+
+        if (conf->feature_enabled & GF_XTIME)
+            marker_xtime_update_marks(this, mlocal);
+    }
+
+    marker_local_unref(mlocal);
+
+    return 0;
+}
+
+/* Zerofill fop entry point.
+ *
+ * When any marker feature is enabled, a local whose loc is filled from
+ * fd->inode is initialised on the frame for marker_zerofill_cbk();
+ * otherwise the call is wound straight through. A failed allocation or loc
+ * fill unwinds the fop with ENOMEM. Always returns 0.
+ */
+int32_t
+marker_zerofill(call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset,
+                off_t len, dict_t *xdata)
+{
+    marker_local_t *local = NULL;
+    marker_conf_t *priv = this->private;
+
+    if (priv->feature_enabled != 0) {
+        /* mem_get0() can fail; never hand a NULL local to
+         * MARKER_INIT_LOCAL.
+         */
+        local = mem_get0(this->local_pool);
+        if (local == NULL)
+            goto err;
+
+        MARKER_INIT_LOCAL(frame, local);
+
+        if (marker_inode_loc_fill(fd->inode, &local->loc) == -1)
+            goto err;
+    }
+
+    STACK_WIND(frame, marker_zerofill_cbk, FIRST_CHILD(this),
+               FIRST_CHILD(this)->fops->zerofill, fd, offset, len, xdata);
+    return 0;
+err:
+    MARKER_STACK_UNWIND(zerofill, frame, -1, ENOMEM, NULL, NULL, NULL);
+
+    return 0;
+}
+
+/* Handles a setxattr on the key trusted.glusterfs.volume-mark coming from
+ * the special (gsyncd) client: when the value is "RESET" or zero-length,
+ * the timestamp file is opened with O_TRUNC so that its access and
+ * modification times get updated.
+ *
+ * Returns -1 without unwinding when an argument is NULL or the key is not
+ * present in @dict. Otherwise the setxattr is unwound here and 0 is
+ * returned:
+ *   - EPERM  if the caller's pid is not GF_CLIENT_PID_GSYNCD,
+ *   - EINVAL if the value is neither empty nor "RESET",
+ *   - the open() errno if the file could not be opened (ENOENT is treated
+ *     as success),
+ *   - success otherwise.
+ */
+int32_t
+call_from_sp_client_to_reset_tmfile(call_frame_t *frame, xlator_t *this,
+                                    dict_t *dict)
+{
+    marker_conf_t *conf = NULL;
+    data_t *mark = NULL;
+    int32_t tmfd = 0;
+    int32_t op_ret = 0;
+    int32_t op_errno = 0;
+
+    if (!frame || !this || !dict)
+        return -1;
+
+    mark = dict_get(dict, "trusted.glusterfs.volume-mark");
+    if (!mark)
+        return -1;
+
+    conf = this->private;
+
+    if (frame->root->pid != GF_CLIENT_PID_GSYNCD) {
+        op_ret = -1;
+        op_errno = EPERM;
+    } else if (mark->len != 0 &&
+               !(mark->len == 5 && memcmp(mark->data, "RESET", 5) == 0)) {
+        op_ret = -1;
+        op_errno = EINVAL;
+    } else {
+        tmfd = open(conf->timestamp_file, O_WRONLY | O_TRUNC);
+        if (tmfd != -1) {
+            /* TODO check whether the O_TRUNC would update the
+             * timestamps on a zero length file on all machines.
+             */
+            sys_close(tmfd);
+        } else if (errno != ENOENT) {
+            /* A missing timestamp file is not an error. */
+            op_ret = -1;
+            op_errno = errno;
+        }
+    }
+
+    STACK_UNWIND_STRICT(setxattr, frame, op_ret, op_errno, NULL);
+
+    return 0;
+}
+
+/* Setxattr callback: unwinds the reply first; on success with a local
+ * present and xtime enabled, the marks are updated. The reference on the
+ * local is dropped in every case. Always returns 0.
+ */
+int32_t
+marker_setxattr_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+                    int32_t op_ret, int32_t op_errno, dict_t *xdata)
+{
+    marker_local_t *mlocal = (marker_local_t *)frame->local;
+    marker_conf_t *conf = NULL;
+
+    /* Detach the local from the frame before unwinding. */
+    frame->local = NULL;
+
+    if (op_ret == -1) {
+        gf_log(this->name, GF_LOG_TRACE, "%s occurred in setxattr ",
+               strerror(op_errno));
+    }
+
+    STACK_UNWIND_STRICT(setxattr, frame, op_ret, op_errno, xdata);
+
+    if (op_ret != -1 && mlocal != NULL) {
+        conf = this->private;
+
+        if (conf->feature_enabled & GF_XTIME)
+            marker_xtime_update_marks(this, mlocal);
+    }
+
+    marker_local_unref(mlocal);
+
+    return 0;
+}
+
+/* Per-key callback used by quota_xattr_cleaner(): removes the xattr @k from
+ * the loc stored in the local of the frame passed as @data.
+ *
+ * Returns 0 when the key was removed or deliberately kept, -1 when the
+ * removexattr failed.
+ */
+int
+remove_quota_keys(dict_t *dict, char *k, data_t *v, void *data)
+{
+    call_frame_t *cleanup_frame = data;
+    marker_local_t *mlocal = cleanup_frame->local;
+    xlator_t *this = cleanup_frame->this;
+    marker_conf_t *conf = this->private;
+    char suffix[NAME_MAX] = {0};
+    char *last_dot = NULL;
+
+    /* If quota is enabled again right after a disable, quota healing
+     * starts creating new xattrs before this cleanup has finished. Keys
+     * whose version suffix equals the current version are therefore new
+     * and must not be removed.
+     */
+    if ((conf->feature_enabled & GF_QUOTA) && conf->version > 0) {
+        snprintf(suffix, sizeof(suffix), ".%d", conf->version);
+        last_dot = strrchr(k, '.');
+        if (last_dot != NULL && strcmp(last_dot, suffix) == 0)
+            return 0;
+    }
+
+    if (syncop_removexattr(FIRST_CHILD(this), &mlocal->loc, k, 0, NULL) != 0) {
+        gf_log(this->name, GF_LOG_ERROR,
+               "%s: Failed to remove "
+               "extended attribute: %s",
+               mlocal->loc.path, k);
+        return -1;
+    }
+
+    return 0;
+}
+
+/* Completion callback of the quota xattr cleanup synctask: unwinds the
+ * pending setxattr with the task result.
+ *
+ * @ret is the task's return value; a negative value is reported as failure
+ * and its negation is passed on as op_errno. @args is the xdata handed back
+ * in the unwind. Returns @ret unchanged.
+ */
+int
+quota_xattr_cleaner_cbk(int ret, call_frame_t *frame, void *args)
+{
+    dict_t *reply_xdata = args;
+    int status = 0;
+    int err = -ret;
+
+    if (ret < 0)
+        status = -1;
+
+    MARKER_STACK_UNWIND(setxattr, frame, status, err, reply_xdata);
+    return ret;
+}
+
+/* Synctask body for quota xattr cleanup.
+ *
+ * Lists the xattrs of the loc stored in the task frame's local and calls
+ * remove_quota_keys() for every key matching "trusted.glusterfs.quota.*"
+ * and PGFID_XATTR_KEY_PREFIX "*".
+ *
+ * Returns 0 on success, -1 if no synctask is running, and otherwise a
+ * negative errno; quota_xattr_cleaner_cbk() negates it into op_errno.
+ */
+int
+quota_xattr_cleaner(void *args)
+{
+    struct synctask *task = NULL;
+    call_frame_t *frame = NULL;
+    xlator_t *this = NULL;
+    marker_local_t *local = NULL;
+    dict_t *xdata = NULL;
+    int ret = -1;
+
+    task = synctask_get();
+    if (!task)
+        goto out;
+
+    frame = task->frame;
+    this = frame->this;
+    local = frame->local;
+
+    /* syncop calls report failure as a negative errno, not as -1 with
+     * errno set, so any negative value is an error and is passed on as is.
+     */
+    ret = syncop_listxattr(FIRST_CHILD(this), &local->loc, &xdata, NULL, NULL);
+    if (ret < 0)
+        goto out;
+
+    /* errno is not guaranteed to be set when the per-key callback fails;
+     * fall back to EIO so that a failure is never reported as success.
+     */
+    ret = dict_foreach_fnmatch(xdata, "trusted.glusterfs.quota.*",
+                               remove_quota_keys, frame);
+    if (ret < 0) {
+        ret = errno ? -errno : -EIO;
+        goto out;
+    }
+    ret = dict_foreach_fnmatch(xdata, PGFID_XATTR_KEY_PREFIX "*",
+                               remove_quota_keys, frame);
+    if (ret < 0) {
+        ret = errno ? -errno : -EIO;
+        goto out;
+    }
+
+    ret = 0;
+out:
+    if (xdata)
+        dict_unref(xdata);
+
+    return ret;
+}
+
+/* Starts the asynchronous quota xattr cleanup for @loc.
+ *
+ * A local holding a copy of @loc is initialised on @frame and a synctask
+ * running quota_xattr_cleaner() is created; quota_xattr_cleaner_cbk()
+ * unwinds the setxattr when the task completes. If the local cannot be
+ * allocated, the loc cannot be copied or the task cannot be created, the
+ * setxattr is unwound here with ENOMEM.
+ *
+ * Returns 0 when the task was created, non-zero otherwise.
+ */
+int
+marker_do_xattr_cleanup(call_frame_t *frame, xlator_t *this, dict_t *xdata,
+                        loc_t *loc)
+{
+    int ret = -1;
+    marker_local_t *local = NULL;
+
+    local = mem_get0(this->local_pool);
+    if (!local)
+        goto out;
+
+    MARKER_INIT_LOCAL(frame, local);
+
+    /* The task works on local->loc, so a failed copy must abort here. */
+    ret = loc_copy(&local->loc, loc);
+    if (ret < 0)
+        goto out;
+
+    ret = synctask_new(this->ctx->env, quota_xattr_cleaner,
+                       quota_xattr_cleaner_cbk, frame, xdata);
+    if (ret) {
+        gf_log(this->name, GF_LOG_ERROR,
+               "Failed to create synctask "
+               "for cleaning up quota extended attributes");
+        goto out;
+    }
+
+    ret = 0;
+out:
+    if (ret)
+        MARKER_STACK_UNWIND(setxattr, frame, -1, ENOMEM, xdata);
+
+    return ret;
+}
+
+/* Tells whether @dict carries the virtual quota xattr cleanup key. */
+static gf_boolean_t
+marker_xattr_cleanup_cmd(dict_t *dict)
+{
+    gf_boolean_t requested;
+
+    requested = (dict_get(dict, VIRTUAL_QUOTA_XATTR_CLEANUP_KEY) != NULL);
+
+    return requested;
+}
+
+/* Setxattr fop entry point.
+ *
+ * Handled in this order:
+ *  1. If @dict carries the quota xattr cleanup key, the request is only
+ *     accepted from uid 0 / gid 0 (EPERM otherwise) and is handed to
+ *     marker_do_xattr_cleanup(), which unwinds the call itself.
+ *  2. marker_key_replace_with_ver() is applied to @dict; a failure unwinds
+ *     with ENOMEM.
+ *  3. With no marker feature enabled the call is wound straight through.
+ *  4. call_from_sp_client_to_reset_tmfile() gets a chance to consume the
+ *     request; a return of 0 means it has already unwound the call.
+ *  5. Otherwise a local holding a copy of @loc is initialised on the frame
+ *     for marker_setxattr_cbk() and the call is wound. A failed allocation
+ *     or loc copy unwinds with ENOMEM.
+ *
+ * Always returns 0.
+ */
+int32_t
+marker_setxattr(call_frame_t *frame, xlator_t *this, loc_t *loc, dict_t *dict,
+                int32_t flags, dict_t *xdata)
+{
+    int32_t ret = 0;
+    marker_local_t *local = NULL;
+    marker_conf_t *priv = NULL;
+    int op_errno = ENOMEM;
+
+    priv = this->private;
+
+    if (marker_xattr_cleanup_cmd(dict)) {
+        if (frame->root->uid != 0 || frame->root->gid != 0) {
+            op_errno = EPERM;
+            ret = -1;
+            goto err;
+        }
+
+        /* The following function does the cleanup and then unwinds the
+         * corresponding call*/
+        loc_path(loc, NULL);
+        marker_do_xattr_cleanup(frame, this, xdata, loc);
+        return 0;
+    }
+
+    ret = marker_key_replace_with_ver(this, dict);
+    if (ret < 0)
+        goto err;
+
+    if (priv->feature_enabled == 0)
+        goto wind;
+
+    ret = call_from_sp_client_to_reset_tmfile(frame, this, dict);
+    if (ret == 0)
+        return 0;
+
+    /* mem_get0() can fail; never hand a NULL local to MARKER_INIT_LOCAL. */
+    local = mem_get0(this->local_pool);
+    if (local == NULL)
+        goto err;
+
+    MARKER_INIT_LOCAL(frame, local);
+
+    ret = loc_copy(&local->loc, loc);
+
+    if (ret == -1)
+        goto err;
+wind:
+    STACK_WIND(frame, marker_setxattr_cbk, FIRST_CHILD(this),
+               FIRST_CHILD(this)->fops->setxattr, loc, dict, flags, xdata);
+    return 0;
+err:
+    MARKER_STACK_UNWIND(setxattr, frame, -1, op_errno, NULL);
+
+    return 0;
+}
+
+/* Callback of the fsetxattr wind. Passes the result back to the caller and,
+ * if the call succeeded and a local was attached, refreshes the xtime marks
+ * when GF_XTIME is enabled. */
+int32_t
+marker_fsetxattr_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+                     int32_t op_ret, int32_t op_errno, dict_t *xdata)
+{
+    marker_conf_t *conf = NULL;
+    marker_local_t *mlocal = NULL;
+
+    if (op_ret == -1) {
+        gf_log(this->name, GF_LOG_TRACE, "%s occurred in fsetxattr",
+               strerror(op_errno));
+    }
+
+    /* Detach local from the frame before unwinding; it is unref'd below. */
+    mlocal = (marker_local_t *)frame->local;
+    frame->local = NULL;
+
+    STACK_UNWIND_STRICT(fsetxattr, frame, op_ret, op_errno, xdata);
+
+    /* Marks are only touched for a successful op that carries a local. */
+    if (op_ret != -1 && mlocal != NULL) {
+        conf = this->private;
+        if (conf->feature_enabled & GF_XTIME)
+            marker_xtime_update_marks(this, mlocal);
+    }
+
+    marker_local_unref(mlocal);
+
+    return 0;
+}
+
+/* fsetxattr fop: when a marker feature is enabled, fills a fresh local with
+ * the loc of fd->inode for the callback, then winds to the first child. */
+int32_t
+marker_fsetxattr(call_frame_t *frame, xlator_t *this, fd_t *fd, dict_t *dict,
+                 int32_t flags, dict_t *xdata)
+{
+    int32_t ret = 0;
+    marker_local_t *local = NULL;
+    marker_conf_t *priv = this->private;
+
+    if (priv->feature_enabled == 0)
+        goto wind;
+
+    ret = call_from_sp_client_to_reset_tmfile(frame, this, dict);
+    if (ret == 0)
+        return 0;
+
+    local = mem_get0(this->local_pool);
+    if (local == NULL) /* MARKER_INIT_LOCAL would dereference it */
+        goto err;
+
+    MARKER_INIT_LOCAL(frame, local);
+
+    ret = marker_inode_loc_fill(fd->inode, &local->loc);
+    if (ret == -1)
+        goto err;
+wind:
+    STACK_WIND(frame, marker_fsetxattr_cbk, FIRST_CHILD(this),
+               FIRST_CHILD(this)->fops->fsetxattr, fd, dict, flags, xdata);
+    return 0;
+err:
+    MARKER_STACK_UNWIND(fsetxattr, frame, -1, ENOMEM, NULL);
+    return 0;
+}
+
+/* Callback of the fsetattr wind. Passes the result back to the caller and,
+ * if the call succeeded and a local was attached, refreshes the xtime marks
+ * when GF_XTIME is enabled. */
+int32_t
+marker_fsetattr_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+                    int32_t op_ret, int32_t op_errno, struct iatt *statpre,
+                    struct iatt *statpost, dict_t *xdata)
+{
+    marker_conf_t *conf = NULL;
+    marker_local_t *mlocal = NULL;
+
+    if (op_ret == -1) {
+        gf_log(this->name, GF_LOG_TRACE, "%s occurred in fsetattr ",
+               strerror(op_errno));
+    }
+
+    /* Detach local from the frame before unwinding; it is unref'd below. */
+    mlocal = (marker_local_t *)frame->local;
+    frame->local = NULL;
+
+    STACK_UNWIND_STRICT(fsetattr, frame, op_ret, op_errno, statpre, statpost,
+                        xdata);
+
+    /* Marks are only touched for a successful op that carries a local. */
+    if (op_ret != -1 && mlocal != NULL) {
+        conf = this->private;
+        if (conf->feature_enabled & GF_XTIME)
+            marker_xtime_update_marks(this, mlocal);
+    }
+
+    marker_local_unref(mlocal);
+
+    return 0;
+}
+
+/* fsetattr fop: when a marker feature is enabled, fills a fresh local with
+ * the loc of fd->inode for the callback, then winds to the first child. */
+int32_t
+marker_fsetattr(call_frame_t *frame, xlator_t *this, fd_t *fd,
+                struct iatt *stbuf, int32_t valid, dict_t *xdata)
+{
+    int32_t ret = 0;
+    marker_local_t *local = NULL;
+    marker_conf_t *priv = this->private;
+
+    if (priv->feature_enabled == 0)
+        goto wind;
+
+    local = mem_get0(this->local_pool);
+    if (local == NULL) /* MARKER_INIT_LOCAL would dereference it */
+        goto err;
+
+    MARKER_INIT_LOCAL(frame, local);
+
+    ret = marker_inode_loc_fill(fd->inode, &local->loc);
+    if (ret == -1)
+        goto err;
+wind:
+    STACK_WIND(frame, marker_fsetattr_cbk, FIRST_CHILD(this),
+               FIRST_CHILD(this)->fops->fsetattr, fd, stbuf, valid, xdata);
+    return 0;
+err:
+    MARKER_STACK_UNWIND(fsetattr, frame, -1, ENOMEM, NULL, NULL, NULL);
+    return 0;
+}
+
+/* Callback of the setattr wind. Passes the result back to the caller and,
+ * if the call succeeded and a local was attached, refreshes the xtime marks
+ * when GF_XTIME is enabled. */
+int32_t
+marker_setattr_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+                   int32_t op_ret, int32_t op_errno, struct iatt *statpre,
+                   struct iatt *statpost, dict_t *xdata)
+{
+    marker_conf_t *conf = NULL;
+    marker_local_t *mlocal = (marker_local_t *)frame->local;
+
+    frame->local = NULL;
+
+    if (op_ret == -1) {
+        gf_log(this->name, GF_LOG_TRACE, "%s occurred during setattr of %s",
+               strerror(op_errno), (mlocal ? mlocal->loc.path : "<nul>"));
+    }
+
+    STACK_UNWIND_STRICT(setattr, frame, op_ret, op_errno, statpre, statpost,
+                        xdata);
+
+    /* Marks are only touched for a successful op that carries a local. */
+    if (op_ret != -1 && mlocal != NULL) {
+        conf = this->private;
+        if (conf->feature_enabled & GF_XTIME)
+            marker_xtime_update_marks(this, mlocal);
+    }
+
+    marker_local_unref(mlocal);
+
+    return 0;
+}
+
+/* setattr fop: when a marker feature is enabled, copies @loc into a fresh
+ * local for the callback, then winds to the first child. */
+int32_t
+marker_setattr(call_frame_t *frame, xlator_t *this, loc_t *loc,
+               struct iatt *stbuf, int32_t valid, dict_t *xdata)
+{
+    int32_t ret = 0;
+    marker_local_t *local = NULL;
+    marker_conf_t *priv = this->private;
+
+    if (priv->feature_enabled == 0)
+        goto wind;
+
+    local = mem_get0(this->local_pool);
+    if (local == NULL) /* MARKER_INIT_LOCAL would dereference it */
+        goto err;
+
+    MARKER_INIT_LOCAL(frame, local);
+
+    ret = loc_copy(&local->loc, loc);
+    if (ret == -1)
+        goto err;
+wind:
+    STACK_WIND(frame, marker_setattr_cbk, FIRST_CHILD(this),
+               FIRST_CHILD(this)->fops->setattr, loc, stbuf, valid, xdata);
+    return 0;
+err:
+    MARKER_STACK_UNWIND(setattr, frame, -1, ENOMEM, NULL, NULL, NULL);
+    return 0;
+}
+
+/* Callback of the removexattr wind: returns the result to the caller and, on
+ * success with a local attached, refreshes xtime marks if GF_XTIME is set. */
+int32_t
+marker_removexattr_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+                       int32_t op_ret, int32_t op_errno, dict_t *xdata)
+{
+    marker_conf_t *conf = NULL;
+    marker_local_t *mlocal = NULL;
+
+    if (op_ret == -1) {
+        gf_log(this->name, GF_LOG_TRACE,
+               "%s occurred while removing extended attribute",
+               strerror(op_errno));
+    }
+
+    /* Detach local from the frame before unwinding; it is unref'd below. */
+    mlocal = (marker_local_t *)frame->local;
+    frame->local = NULL;
+
+    STACK_UNWIND_STRICT(removexattr, frame, op_ret, op_errno, xdata);
+
+    /* Marks are only touched for a successful op that carries a local. */
+    if (op_ret != -1 && mlocal != NULL) {
+        conf = this->private;
+        if (conf->feature_enabled & GF_XTIME)
+            marker_xtime_update_marks(this, mlocal);
+    }
+
+    marker_local_unref(mlocal);
+
+    return 0;
+}
+
+/* removexattr fop: a @name listed in mq_ext_xattrs is replaced by the key
+ * that GET_QUOTA_KEY builds for it before the call is wound to the child. */
+int32_t
+marker_removexattr(call_frame_t *frame, xlator_t *this, loc_t *loc,
+                   const char *name, dict_t *xdata)
+{
+    int32_t ret = -1;
+    int32_t i = 0;
+    marker_local_t *local = NULL;
+    marker_conf_t *priv = this->private;
+    char key[QUOTA_KEY_MAX] = {
+        0,
+    };
+
+    if (name) {
+        for (i = 0; mq_ext_xattrs[i]; i++) {
+            if (strcmp(name, mq_ext_xattrs[i]))
+                continue;
+
+            GET_QUOTA_KEY(this, key, mq_ext_xattrs[i], ret);
+            if (ret < 0)
+                goto err;
+            name = key;
+            break;
+        }
+    }
+
+    if (priv->feature_enabled == 0)
+        goto wind;
+
+    local = mem_get0(this->local_pool);
+    if (local == NULL) /* MARKER_INIT_LOCAL would dereference it */
+        goto err;
+
+    MARKER_INIT_LOCAL(frame, local);
+
+    ret = loc_copy(&local->loc, loc);
+    if (ret == -1)
+        goto err;
+wind:
+    STACK_WIND(frame, marker_removexattr_cbk, FIRST_CHILD(this),
+               FIRST_CHILD(this)->fops->removexattr, loc, name, xdata);
+    return 0;
+err:
+    MARKER_STACK_UNWIND(removexattr, frame, -1, ENOMEM, NULL);
+    return 0;
+}
+
+/* True if dict_foreach_match() with _is_quota_internal_xattr reports > 0. */
+static gf_boolean_t
+__has_quota_xattrs(dict_t *xattrs)
+{
+    int hits = dict_foreach_match(xattrs, _is_quota_internal_xattr, NULL,
+                                  dict_null_foreach_fn, NULL);
+
+    return (hits > 0) ? _gf_true : _gf_false;
+}
+
+/* lookup callback. On success unwinds with @dict (or, when it holds quota-
+ * internal xattrs, a copy run through marker_filter_internal_xattrs()), then
+ * passes the unfiltered @dict to mq_xattr_state() when GF_QUOTA is enabled. */
+int32_t
+marker_lookup_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+                  int32_t op_ret, int32_t op_errno, inode_t *inode,
+                  struct iatt *buf, dict_t *dict, struct iatt *postparent)
+{
+    marker_conf_t *priv = NULL;
+    marker_local_t *local = NULL;
+    dict_t *xattrs = NULL;
+    quota_inode_ctx_t *ctx = NULL;
+    int32_t ret = -1;
+
+    priv = this->private;
+    local = (marker_local_t *)frame->local;
+    frame->local = NULL;
+
+    if (op_ret == -1) {
+        gf_log(this->name, GF_LOG_TRACE, "lookup failed with %s",
+               strerror(op_errno));
+        goto unwind;
+    }
+
+    ret = marker_key_set_ver(this, dict);
+    if (ret < 0) {
+        op_ret = -1;
+        op_errno = ENOMEM;
+        goto unwind;
+    }
+    /* Reply through xattrs: a filtered copy, or simply another ref on dict. */
+    if (dict && __has_quota_xattrs(dict)) {
+        xattrs = dict_copy_with_ref(dict, NULL);
+        if (!xattrs) {
+            op_ret = -1;
+            op_errno = ENOMEM;
+        } else {
+            marker_filter_internal_xattrs(this, xattrs);
+        }
+    } else if (dict) {
+        xattrs = dict_ref(dict);
+    }
+    /* With quota on, failing to create the inode ctx fails the lookup. */
+    if (op_ret >= 0 && inode && (priv->feature_enabled & GF_QUOTA)) {
+        ctx = mq_inode_ctx_new(inode, this);
+        if (ctx == NULL) {
+            gf_log(this->name, GF_LOG_WARNING,
+                   "mq_inode_ctx_new "
+                   "failed for %s",
+                   uuid_utoa(inode->gfid));
+            op_ret = -1;
+            op_errno = ENOMEM;
+        }
+    }
+
+unwind:
+    STACK_UNWIND_STRICT(lookup, frame, op_ret, op_errno, inode, buf, xattrs,
+                        postparent);
+    /* The rest runs after the reply and needs a successful op and a local. */
+    if (op_ret == -1 || local == NULL)
+        goto out;
+
+    /* copy the gfid from the stat structure instead of inode,
+     * since if the lookup is fresh lookup, then the inode
+     * would have not yet linked to the inode table which happens
+     * in protocol/server.
+     */
+    if (gf_uuid_is_null(local->loc.gfid))
+        gf_uuid_copy(local->loc.gfid, buf->ia_gfid);
+    if (priv->feature_enabled & GF_QUOTA) {
+        mq_xattr_state(this, &local->loc, dict, buf);
+    }
+out:
+    marker_local_unref(local);
+    if (xattrs)
+        dict_unref(xattrs);
+    return 0;
+}
+
+/* lookup fop: applies marker_key_replace_with_ver() to the request dict and,
+ * when features are enabled, records @loc in a local before winding. */
+int32_t
+marker_lookup(call_frame_t *frame, xlator_t *this, loc_t *loc,
+              dict_t *xattr_req)
+{
+    marker_conf_t *conf = this->private;
+    marker_local_t *mlocal = NULL;
+    int32_t rc = 0;
+
+    /* Hold our own reference (or a new dict) for the duration of the call. */
+    if (xattr_req)
+        xattr_req = dict_ref(xattr_req);
+    else
+        xattr_req = dict_new();
+    if (!xattr_req)
+        goto err;
+
+    rc = marker_key_replace_with_ver(this, xattr_req);
+    if (rc < 0)
+        goto err;
+
+    if (conf->feature_enabled != 0) {
+        mlocal = mem_get0(this->local_pool);
+        if (mlocal == NULL)
+            goto err;
+
+        MARKER_INIT_LOCAL(frame, mlocal);
+
+        rc = loc_copy(&mlocal->loc, loc);
+        if (rc == -1)
+            goto err;
+
+        if (conf->feature_enabled & GF_QUOTA)
+            mq_req_xattr(this, loc, xattr_req, NULL, NULL);
+    }
+
+    STACK_WIND(frame, marker_lookup_cbk, FIRST_CHILD(this),
+               FIRST_CHILD(this)->fops->lookup, loc, xattr_req);
+
+    dict_unref(xattr_req);
+    return 0;
+err:
+    MARKER_STACK_UNWIND(lookup, frame, -1, ENOMEM, NULL, NULL, NULL, NULL);
+
+    if (xattr_req)
+        dict_unref(xattr_req);
+    return 0;
+}
+
+/* readdirp callback used for ancestry requests: for each entry with an inode
+ * it runs marker_key_set_ver() on the entry dict and creates the quota inode
+ * ctx, then unwinds the readdirp. */
+int
+marker_build_ancestry_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+                          int op_ret, int op_errno, gf_dirent_t *entries,
+                          dict_t *xdata)
+{
+    gf_dirent_t *dent = NULL;
+    quota_inode_ctx_t *qctx = NULL;
+    int rc = -1;
+
+    if (op_ret > 0 && entries != NULL) {
+        list_for_each_entry(dent, &entries->list, list)
+        {
+            if (!dent->inode)
+                continue;
+
+            rc = marker_key_set_ver(this, dent->dict);
+            if (rc < 0) {
+                op_ret = -1;
+                op_errno = ENOMEM;
+                break;
+            }
+
+            /* A missing quota ctx is only logged; the loop carries on. */
+            qctx = mq_inode_ctx_new(dent->inode, this);
+            if (!qctx)
+                gf_log(this->name, GF_LOG_WARNING,
+                       "mq_inode_ctx_new failed for %s",
+                       uuid_utoa(dent->inode->gfid));
+        }
+    }
+
+    STACK_UNWIND_STRICT(readdirp, frame, op_ret, op_errno, entries, xdata);
+    return 0;
+}
+
+/* readdirp callback (non-ancestry path). When GF_QUOTA is enabled and a local
+ * exists, builds a loc for every entry other than "." / "..", creates its
+ * quota inode ctx and hands the entry's dict and stat to mq_xattr_state(). */
+int
+marker_readdirp_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+                    int op_ret, int op_errno, gf_dirent_t *entries,
+                    dict_t *xdata)
+{
+    gf_dirent_t *entry = NULL;
+    marker_conf_t *priv = NULL;
+    marker_local_t *local = NULL;
+    loc_t loc = {
+        0,
+    };
+    int ret = -1;
+    char *resolvedpath = NULL;
+    quota_inode_ctx_t *ctx = NULL;
+
+    if (op_ret <= 0)
+        goto unwind;
+
+    priv = this->private;
+    local = frame->local;
+
+    if (!(priv->feature_enabled & GF_QUOTA) || (local == NULL)) {
+        goto unwind;
+    }
+
+    list_for_each_entry(entry, &entries->list, list)
+    {
+        if ((strcmp(entry->d_name, ".") == 0) ||
+            (strcmp(entry->d_name, "..") == 0) || entry->inode == NULL)
+            continue;
+        /* loc for the entry: its parent is the inode saved in local->loc. */
+        loc.parent = inode_ref(local->loc.inode);
+        loc.inode = inode_ref(entry->inode);
+        ret = inode_path(loc.parent, entry->d_name, &resolvedpath);
+        if (ret < 0) {
+            gf_log(this->name, GF_LOG_ERROR,
+                   "failed to get the "
+                   "path for the entry %s",
+                   entry->d_name);
+            loc_wipe(&loc);
+            continue;
+        }
+        loc.path = resolvedpath;
+        resolvedpath = NULL;
+
+        ctx = mq_inode_ctx_new(loc.inode, this);
+        if (ctx == NULL)
+            gf_log(this->name, GF_LOG_WARNING,
+                   "mq_inode_ctx_new "
+                   "failed for %s",
+                   uuid_utoa(loc.inode->gfid));
+        mq_xattr_state(this, &loc, entry->dict, &entry->d_stat);
+        loc_wipe(&loc);
+        /* NOTE(review): the dict is versioned only after mq_xattr_state(). */
+        ret = marker_key_set_ver(this, entry->dict);
+        if (ret < 0) {
+            op_ret = -1;
+            op_errno = ENOMEM;
+            goto unwind;
+        }
+    }
+
+unwind:
+    MARKER_STACK_UNWIND(readdirp, frame, op_ret, op_errno, entries, xdata);
+    return 0;
+}
+
+/* readdirp fop. Requests carrying GET_ANCESTRY_DENTRY_KEY are wound with
+ * marker_build_ancestry_cbk; all others with marker_readdirp_cbk. */
+int
+marker_readdirp(call_frame_t *frame, xlator_t *this, fd_t *fd, size_t size,
+                off_t offset, dict_t *dict)
+{
+    marker_conf_t *priv = this->private;
+    loc_t loc = {
+        0,
+    };
+    marker_local_t *local = NULL;
+    int ret = -1;
+
+    dict = dict ? dict_ref(dict) : dict_new();
+    if (!dict)
+        goto unwind;
+
+    ret = marker_key_replace_with_ver(this, dict);
+    if (ret < 0)
+        goto unwind;
+
+    if (dict_get(dict, GET_ANCESTRY_DENTRY_KEY)) {
+        STACK_WIND(frame, marker_build_ancestry_cbk, FIRST_CHILD(this),
+                   FIRST_CHILD(this)->fops->readdirp, fd, size, offset, dict);
+    } else {
+        if (priv->feature_enabled & GF_QUOTA) {
+            local = mem_get0(this->local_pool);
+            if (local == NULL) /* MARKER_INIT_LOCAL would dereference it */
+                goto unwind;
+
+            MARKER_INIT_LOCAL(frame, local);
+            loc.parent = local->loc.inode = inode_ref(fd->inode);
+            mq_req_xattr(this, &loc, dict, NULL, NULL);
+        }
+        STACK_WIND(frame, marker_readdirp_cbk, FIRST_CHILD(this),
+                   FIRST_CHILD(this)->fops->readdirp, fd, size, offset, dict);
+    }
+    dict_unref(dict);
+    return 0;
+unwind:
+    MARKER_STACK_UNWIND(readdirp, frame, -1, ENOMEM, NULL, NULL);
+    if (dict) /* drop the reference taken above; it used to leak here */
+        dict_unref(dict);
+    return 0;
+}
+
+/* Set up memory accounting for this xlator with gf_marker_mt_end + 1 types.
+ * Returns -1 when @this is NULL, otherwise whatever xlator_mem_acct_init()
+ * returned (a failure is also logged). */
+int32_t
+mem_acct_init(xlator_t *this)
+{
+    int rc = -1;
+
+    if (this == NULL)
+        return rc;
+
+    rc = xlator_mem_acct_init(this, gf_marker_mt_end + 1);
+
+    if (rc != 0) {
+        gf_log(this->name, GF_LOG_ERROR, "Memory accounting init failed");
+    }
+
+    return rc;
+}
+
+/* Fill the xtime fields of this->private from @options: volume_uuid (plus its
+ * binary form), marker_xattr and timestamp_file. Returns 0, or a negative
+ * value on failure; strings allocated so far stay in priv for the cleanup. */
+int32_t
+init_xtime_priv(xlator_t *this, dict_t *options)
+{
+    int32_t ret = -1;
+    marker_conf_t *priv = NULL;
+    char *tmp_opt = NULL;
+
+    GF_VALIDATE_OR_GOTO("marker", this, out);
+    GF_VALIDATE_OR_GOTO(this->name, options, out);
+    GF_VALIDATE_OR_GOTO(this->name, this->private, out);
+
+    priv = this->private;
+
+    ret = dict_get_str(options, "volume-uuid", &tmp_opt);
+    if (ret) {
+        priv->volume_uuid = NULL;
+        gf_log(this->name, GF_LOG_ERROR,
+               "please specify the volume-uuid "
+               "in the translator options");
+        ret = -1;
+        goto out;
+    }
+
+    /* A failed copy must not be handed to gf_uuid_parse() below. */
+    ret = gf_asprintf(&priv->volume_uuid, "%s", tmp_opt);
+    if (ret == -1) {
+        priv->volume_uuid = NULL;
+        goto out;
+    }
+
+    ret = gf_uuid_parse(priv->volume_uuid, priv->volume_uuid_bin);
+    if (ret == -1) {
+        gf_log(this->name, GF_LOG_ERROR, "invalid volume uuid %s",
+               priv->volume_uuid);
+        goto out;
+    }
+
+    ret = gf_asprintf(&(priv->marker_xattr), "%s.%s.%s", MARKER_XATTR_PREFIX,
+                      priv->volume_uuid, XTIME);
+    if (ret == -1) {
+        priv->marker_xattr = NULL;
+        goto out;
+    }
+
+    gf_log(this->name, GF_LOG_DEBUG, "volume-uuid = %s", priv->volume_uuid);
+
+    ret = dict_get_str(options, "timestamp-file", &tmp_opt);
+    if (ret) {
+        priv->timestamp_file = NULL;
+        gf_log(this->name, GF_LOG_ERROR,
+               "please specify the timestamp-file "
+               "in the translator options");
+        goto out;
+    }
+
+    ret = gf_asprintf(&priv->timestamp_file, "%s", tmp_opt);
+    if (ret == -1) {
+        priv->timestamp_file = NULL;
+        goto out;
+    }
+
+    gf_log(this->name, GF_LOG_DEBUG, "the timestamp-file is = %s",
+           priv->timestamp_file);
+    ret = 0;
+out:
+    return ret;
+}
+
+/* Free the xtime strings of this->private and reset the pointers, so that a
+ * later cleanup after a failed re-init cannot free the same memory twice. */
+void
+marker_xtime_priv_cleanup(xlator_t *this)
+{
+    marker_conf_t *priv = NULL;
+
+    GF_VALIDATE_OR_GOTO("marker", this, out);
+    priv = (marker_conf_t *)this->private;
+    GF_VALIDATE_OR_GOTO(this->name, priv, out);
+    GF_FREE(priv->volume_uuid);
+    priv->volume_uuid = NULL;
+    GF_FREE(priv->timestamp_file);
+    priv->timestamp_file = NULL;
+    GF_FREE(priv->marker_xattr);
+    priv->marker_xattr = NULL;
+out:
+    return;
+}
+
+/* Release what init() set up: the xtime strings, the priv lock, priv itself
+ * and the local pool. this->private is cleared afterwards so that a later
+ * call bails out at the validation instead of freeing priv a second time. */
+void
+marker_priv_cleanup(xlator_t *this)
+{
+    marker_conf_t *priv = NULL;
+
+    GF_VALIDATE_OR_GOTO("marker", this, out);
+    priv = (marker_conf_t *)this->private;
+    GF_VALIDATE_OR_GOTO(this->name, priv, out);
+
+    marker_xtime_priv_cleanup(this);
+    LOCK_DESTROY(&priv->lock);
+
+    GF_FREE(priv);
+    this->private = NULL;
+
+    if (this->local_pool) {
+        mem_pool_destroy(this->local_pool);
+        this->local_pool = NULL;
+    }
+out:
+    return;
+}
+
+/* Re-read the marker options: the feature flags are rebuilt from scratch, the
+ * quota version is updated when valid and the xtime state is re-initialised.
+ * Returns the result of the last option conversion / xtime init performed. */
+int32_t
+reconfigure(xlator_t *this, dict_t *options)
+{
+    int32_t ret = 0;
+    data_t *data = NULL;
+    gf_boolean_t flag = _gf_false;
+    marker_conf_t *priv = NULL;
+    int32_t version = 0;
+
+    GF_ASSERT(this);
+    GF_ASSERT(this->private);
+    priv = this->private;
+    priv->feature_enabled = 0;
+
+    GF_VALIDATE_OR_GOTO(this->name, options, out);
+
+    data = dict_get(options, "quota");
+    if (data) {
+        ret = gf_string2boolean(data->data, &flag);
+        if (ret == 0 && flag == _gf_true)
+            priv->feature_enabled |= GF_QUOTA;
+    }
+
+    data = dict_get(options, "inode-quota");
+    if (data) {
+        ret = gf_string2boolean(data->data, &flag);
+        if (ret == 0 && flag == _gf_true)
+            priv->feature_enabled |= GF_INODE_QUOTA;
+    }
+
+    data = dict_get(options, "quota-version");
+    if (data)
+        ret = gf_string2int32(data->data, &version);
+
+    if (priv->feature_enabled) {
+        if (version >= 0)
+            priv->version = version;
+        else /* report the rejected value, not the version still in use */
+            gf_log(this->name, GF_LOG_ERROR,
+                   "Invalid quota "
+                   "version %d",
+                   version);
+    }
+
+    data = dict_get(options, "xtime");
+    if (data) {
+        ret = gf_string2boolean(data->data, &flag);
+        if (ret == 0 && flag == _gf_true) {
+            marker_xtime_priv_cleanup(this);
+            ret = init_xtime_priv(this, options);
+            if (ret < 0) {
+                gf_log(this->name, GF_LOG_WARNING,
+                       "failed to initialize xtime private, "
+                       "xtime updation will fail");
+            } else {
+                priv->feature_enabled |= GF_XTIME;
+                data = dict_get(options, "gsync-force-xtime");
+                if (!data)
+                    goto out;
+                ret = gf_string2boolean(data->data, &flag);
+                if (ret == 0 && flag)
+                    priv->feature_enabled |= GF_XTIME_GSYNC_FORCE;
+            }
+        }
+    }
+out:
+    return ret;
+}
+
+/* xlator init: requires a child and a parent, allocates the private conf,
+ * derives the feature flags from this->options, initialises xtime state when
+ * requested and creates the local pool. Returns 0 on success, -1 otherwise. */
+int32_t
+init(xlator_t *this)
+{
+    dict_t *opts = NULL;
+    data_t *opt = NULL;
+    int32_t rc = 0;
+    gf_boolean_t on = _gf_false;
+    marker_conf_t *conf = NULL;
+
+    if (!this->children) {
+        gf_log(this->name, GF_LOG_ERROR,
+               "marker translator needs subvolume defined.");
+        return -1;
+    }
+
+    if (!this->parents) {
+        gf_log(this->name, GF_LOG_WARNING, "Volume is dangling.");
+        return -1;
+    }
+
+    opts = this->options;
+
+    ALLOCATE_OR_GOTO(this->private, marker_conf_t, err);
+    conf = this->private;
+    conf->feature_enabled = 0;
+    conf->version = 0;
+    LOCK_INIT(&conf->lock);
+
+    opt = dict_get(opts, "quota");
+    if (opt) {
+        rc = gf_string2boolean(opt->data, &on);
+        if (rc == 0 && on == _gf_true)
+            conf->feature_enabled |= GF_QUOTA;
+    }
+
+    opt = dict_get(opts, "inode-quota");
+    if (opt) {
+        rc = gf_string2boolean(opt->data, &on);
+        if (rc == 0 && on == _gf_true)
+            conf->feature_enabled |= GF_INODE_QUOTA;
+    }
+
+    opt = dict_get(opts, "quota-version");
+    if (opt)
+        rc = gf_string2int32(opt->data, &conf->version);
+
+    if (rc == 0 && conf->feature_enabled && conf->version < 0) {
+        gf_log(this->name, GF_LOG_ERROR, "Invalid quota version %d",
+               conf->version);
+        goto err;
+    }
+
+    opt = dict_get(opts, "xtime");
+    if (opt) {
+        rc = gf_string2boolean(opt->data, &on);
+        if (rc == 0 && on == _gf_true) {
+            rc = init_xtime_priv(this, opts);
+            if (rc < 0)
+                goto err;
+
+            conf->feature_enabled |= GF_XTIME;
+            /* gsync-force-xtime is only looked at once xtime is enabled. */
+            opt = dict_get(opts, "gsync-force-xtime");
+            if (opt) {
+                rc = gf_string2boolean(opt->data, &on);
+                if (rc == 0 && on)
+                    conf->feature_enabled |= GF_XTIME_GSYNC_FORCE;
+            }
+        }
+    }
+
+    this->local_pool = mem_pool_new(marker_local_t, 128);
+    if (!this->local_pool) {
+        gf_log(this->name, GF_LOG_ERROR,
+               "failed to create local_t's memory pool");
+        goto err;
+    }
+
+    return 0;
+err:
+    marker_priv_cleanup(this);
+
+    return -1;
+}
+
+/* forget callback: removes this xlator's ctx from @inode, hands its quota
+ * ctx to mq_forget() and frees the marker ctx itself. Always returns 0,
+ * whether or not a ctx was found. */
+int32_t
+marker_forget(xlator_t *this, inode_t *inode)
+{
+    uint64_t raw = 0;
+    marker_inode_ctx_t *mctx = NULL;
+
+    /* Nothing to release when no ctx was stored or the stored value is 0. */
+    if (inode_ctx_del(inode, this, &raw) == 0) {
+        mctx = (marker_inode_ctx_t *)(unsigned long)raw;
+        if (mctx != NULL) {
+            mq_forget(this, mctx->quota_ctx);
+            GF_FREE(mctx);
+        }
+    }
+
+    return 0;
+}
+
+void
+fini(xlator_t *this) /* teardown hook, registered as xlator_api.fini */
+{
+    marker_priv_cleanup(this); /* frees priv, its lock and the local pool */
+}
+
+struct xlator_fops fops = { /* fops that marker intercepts */
+    .lookup = marker_lookup,
+    .create = marker_create,
+    .mkdir = marker_mkdir,
+    .writev = marker_writev,
+    .truncate = marker_truncate,
+    .ftruncate = marker_ftruncate,
+    .symlink = marker_symlink,
+    .link = marker_link,
+    .unlink = marker_unlink,
+    .rmdir = marker_rmdir,
+    .rename = marker_rename,
+    .mknod = marker_mknod,
+    .setxattr = marker_setxattr,
+    .fsetxattr = marker_fsetxattr,
+    .setattr = marker_setattr,
+    .fsetattr = marker_fsetattr,
+    .removexattr = marker_removexattr,
+    .getxattr = marker_getxattr,
+    .readdirp = marker_readdirp, /* also serves ancestry-building requests */
+    .fallocate = marker_fallocate,
+    .discard = marker_discard,
+    .zerofill = marker_zerofill,
+};
+
+struct xlator_cbks cbks = {.forget = marker_forget}; /* inode forget hook */
+
+struct volume_options options[] = { /* keys read by init()/reconfigure() */
+    {.key = {"volume-uuid"}, .default_value = "{{ volume.id }}"},
+    {.key = {"timestamp-file"}}, /* no default: init_xtime_priv() fails if unset */
+    {
+        .key = {"quota"},
+        .op_version = {1},
+        .flags = OPT_FLAG_NONE,
+        .tags = {},
+    },
+    {
+        .key = {"inode-quota"},
+        .op_version = {1},
+        .flags = OPT_FLAG_NONE,
+        .tags = {},
+    },
+    {
+        .key = {"xtime"},
+        .op_version = {1},
+        .flags = OPT_FLAG_SETTABLE | OPT_FLAG_FORCE,
+        .tags = {},
+    },
+    {
+        .key = {"gsync-force-xtime"}, /* only honoured when xtime is enabled */
+        .op_version = {2},
+        .flags = OPT_FLAG_SETTABLE | OPT_FLAG_FORCE,
+        .tags = {},
+    },
+    {
+        .key = {"quota-version"}, /* negative values are rejected */
+        .flags = OPT_FLAG_NONE,
+    },
+    {.key = {NULL}}};
+
+xlator_api_t xlator_api = { /* bundles this xlator's hooks and tables */
+    .init = init,
+    .fini = fini,
+    .reconfigure = reconfigure,
+    .mem_acct_init = mem_acct_init,
+    .op_version = {1}, /* Present from the initial version */
+    .fops = &fops,
+    .cbks = &cbks,
+    .options = options,
+    .identifier = "marker",
+    .category = GF_MAINTAINED,
+};
diff --git a/xlators/features/marker/src/marker.h b/xlators/features/marker/src/marker.h
new file mode 100644
index 00000000000..4821094c14b
--- /dev/null
+++ b/xlators/features/marker/src/marker.h
@@ -0,0 +1,148 @@
+/*
+   Copyright (c) 2008-2012 Red Hat, Inc. <http://www.redhat.com>
+   This file is part of GlusterFS.
+
+   This file is licensed to you under your choice of the GNU Lesser
+   General Public License, version 3 or any later version (LGPLv3 or
+   later), or the GNU General Public License, version 2 (GPLv2), in all
+   cases as published by the Free Software Foundation.
+*/
+#ifndef _MARKER_H
+#define _MARKER_H
+
+#include "marker-quota.h"
+#include <glusterfs/xlator.h>
+#include <glusterfs/defaults.h>
+#include <glusterfs/compat-uuid.h>
+#include <glusterfs/call-stub.h>
+
+#define MARKER_XATTR_PREFIX "trusted.glusterfs"
+#define XTIME "xtime"
+#define VOLUME_MARK "volume-mark"
+#define VOLUME_UUID "volume-uuid"
+#define TIMESTAMP_FILE "timestamp-file"
+
+enum { /* bit flags stored in marker_conf.feature_enabled */
+    GF_QUOTA = 1,             /* "quota" option is on */
+    GF_XTIME = 2,             /* "xtime" is on and init_xtime_priv() passed */
+    GF_XTIME_GSYNC_FORCE = 4, /* "gsync-force-xtime" is on (needs xtime) */
+    GF_INODE_QUOTA = 8,       /* "inode-quota" option is on */
+};
+
+/* Attach _local to _frame and reset its fields; _local must not be NULL. */
+#define MARKER_INIT_LOCAL(_frame, _local)                                      \
+    do {                                                                       \
+        (_frame)->local = (_local);                                            \
+        (_local)->pid = (_frame)->root->pid;                                   \
+        memset(&(_local)->loc, 0, sizeof(loc_t));                              \
+        (_local)->ref = 1;                                                     \
+        (_local)->uid = -1;                                                    \
+        (_local)->gid = -1;                                                    \
+        LOCK_INIT(&(_local)->lock);                                            \
+        (_local)->oplocal = NULL;                                              \
+    } while (0)
+
+/* calloc one <type> into var; on failure log via this->name and goto label */
+#define ALLOCATE_OR_GOTO(var, type, label)                                     \
+    do {                                                                       \
+        var = GF_CALLOC(1, sizeof(type), gf_marker_mt_##type);                 \
+        if (!var) {                                                            \
+            gf_log(this->name, GF_LOG_ERROR, "out of memory :(");              \
+            goto label;                                                        \
+        }                                                                      \
+    } while (0)
+
+#define _MARKER_SET_UID_GID(dest, src)                                         \
+    do { /* copy uid/gid only when src has both set (neither is -1) */         \
+        if (src->uid != -1 && src->gid != -1) {                                \
+            dest->uid = src->uid;                                              \
+            dest->gid = src->gid;                                              \
+        }                                                                      \
+    } while (0)
+
+#define MARKER_SET_UID_GID(frame, dest, src)                                   \
+    do { /* copy ids, zero frame root uid/gid, flag via cookie */              \
+        _MARKER_SET_UID_GID(dest, src);                                        \
+        frame->root->uid = 0;                                                  \
+        frame->root->gid = 0;                                                  \
+        frame->cookie = (void *)_GF_UID_GID_CHANGED;                           \
+    } while (0)
+
+#define MARKER_RESET_UID_GID(frame, dest, src)                                 \
+    do { /* copy ids and clear the frame cookie */                             \
+        _MARKER_SET_UID_GID(dest, src);                                        \
+        frame->cookie = NULL;                                                  \
+    } while (0)
+
+#define MARKER_STACK_UNWIND(fop, frame, params...)                             \
+    do { /* detach frame->local, unwind, then unref the local */               \
+        quota_local_t *_local = NULL;                                          \
+        if (frame) {                                                           \
+            _local = frame->local;                                             \
+            frame->local = NULL;                                               \
+        }                                                                      \
+        STACK_UNWIND_STRICT(fop, frame, params);                               \
+        if (_local)                                                            \
+            marker_local_unref(_local);                                        \
+    } while (0)
+
+struct marker_local { /* per-call state stored in frame->local */
+    uint32_t timebuf[2];
+    pid_t pid; /* copied from frame->root->pid by MARKER_INIT_LOCAL */
+    loc_t loc; /* zeroed by MARKER_INIT_LOCAL, then filled by the fop */
+    loc_t parent_loc;
+    uid_t uid; /* -1 after MARKER_INIT_LOCAL */
+    gid_t gid; /* -1 after MARKER_INIT_LOCAL */
+    int32_t ref; /* starts at 1; dropped through marker_local_unref() */
+    uint32_t ia_nlink;
+    struct iatt buf;
+    gf_lock_t lock; /* initialised by MARKER_INIT_LOCAL */
+    mode_t mode;
+    int32_t err;
+    call_stub_t *stub;
+    call_frame_t *lk_frame;
+    quota_meta_t contribution;
+    struct marker_local *oplocal; /* NULL after MARKER_INIT_LOCAL */
+
+    /* marker quota specific */
+    int64_t delta;
+    int64_t d_off;
+    int64_t sum;
+    int64_t size;
+    int32_t hl_count;
+    int32_t dentry_child_count;
+
+    fd_t *fd;
+    call_frame_t *frame;
+
+    quota_inode_ctx_t *ctx;
+    inode_contribution_t *contri;
+
+    int xflag;
+    dict_t *xdata;
+    gf_boolean_t skip_txn;
+};
+typedef struct marker_local marker_local_t;
+
+#define quota_local_t marker_local_t
+
+struct marker_inode_ctx { /* per-inode ctx; freed in marker_forget() */
+    struct quota_inode_ctx *quota_ctx; /* passed to mq_forget() on forget */
+};
+typedef struct marker_inode_ctx marker_inode_ctx_t;
+
+struct marker_conf { /* xlator-wide configuration kept in this->private */
+    char feature_enabled; /* bitmask of GF_QUOTA, GF_XTIME, ... */
+    char *size_key;
+    char *dirty_key;
+    char *volume_uuid; /* copy of the "volume-uuid" option */
+    uuid_t volume_uuid_bin; /* volume_uuid parsed by gf_uuid_parse() */
+    char *timestamp_file; /* copy of the "timestamp-file" option */
+    char *marker_xattr; /* MARKER_XATTR_PREFIX "." volume_uuid "." XTIME */
+    uint64_t quota_lk_owner;
+    gf_lock_t lock; /* LOCK_INIT in init(), LOCK_DESTROY in cleanup */
+    int32_t version; /* value of the "quota-version" option */
+};
+typedef struct marker_conf marker_conf_t;
+
+#endif
diff --git a/xlators/features/metadisp/Makefile.am b/xlators/features/metadisp/Makefile.am
new file mode 100644
index 00000000000..a985f42a877
--- /dev/null
+++ b/xlators/features/metadisp/Makefile.am
@@ -0,0 +1,3 @@
+SUBDIRS = src
+
+CLEANFILES =
diff --git a/xlators/features/metadisp/src/Makefile.am b/xlators/features/metadisp/src/Makefile.am
new file mode 100644
index 00000000000..1520ad8c424
--- /dev/null
+++ b/xlators/features/metadisp/src/Makefile.am
@@ -0,0 +1,38 @@
+noinst_PYTHON = gen-fops.py
+
+EXTRA_DIST = fops-tmpl.c
+
+xlator_LTLIBRARIES = metadisp.la
+xlatordir = $(libdir)/glusterfs/$(PACKAGE_VERSION)/xlator/features
+
+nodist_metadisp_la_SOURCES = fops.c
+
+BUILT_SOURCES = fops.c
+
+metadisp_la_LDFLAGS = -module $(GF_XLATOR_DEFAULT_LDFLAGS)
+
+metadisp_la_SOURCES = metadisp.c \
+		metadisp-unlink.c \
+		metadisp-stat.c \
+		metadisp-lookup.c \
+		metadisp-readdir.c \
+		metadisp-create.c \
+		metadisp-open.c \
+		metadisp-fsync.c \
+		metadisp-setattr.c \
+		backend.c
+
+metadisp_la_LIBADD = $(top_builddir)/libglusterfs/src/libglusterfs.la
+
+noinst_HEADERS = metadisp.h metadisp-fops.h
+
+AM_CPPFLAGS = $(GF_CPPFLAGS) -I$(top_srcdir)/libglusterfs/src \
+	-I$(top_srcdir)/rpc/xdr/src -I$(top_builddir)/rpc/xdr/src
+
+AM_CFLAGS = -Wall $(GF_CFLAGS)
+
+fops.c: fops-tmpl.c $(top_srcdir)/libglusterfs/src/generator.py gen-fops.py
+	PYTHONPATH=$(top_srcdir)/libglusterfs/src \
+	$(PYTHON) $(srcdir)/gen-fops.py $(srcdir)/fops-tmpl.c > $@
+
+CLEANFILES = $(nodist_metadisp_la_SOURCES)
diff --git a/xlators/features/metadisp/src/backend.c b/xlators/features/metadisp/src/backend.c
new file mode 100644
index 00000000000..ee2c25bfaa7
--- /dev/null
+++ b/xlators/features/metadisp/src/backend.c
@@ -0,0 +1,58 @@
+#define GFID_STR_LEN 37
+
+#include "metadisp.h"
+
+/*
+ * backend.c
+ *
+ * functions responsible for converting user-facing paths to backend-style
+ * "/$GFID" paths.
+ */
+
+/*
+ * build_backend_loc
+ *
+ * Copies src_loc into dst_loc, then rewrites the copy to address the backend
+ * object: pargfid is set to the root gfid and path is replaced by
+ * "/<gfid as string>". When src_loc has a name, dst_loc->name points just
+ * past the leading '/'.
+ *
+ * Returns 0 on success; -1 if src_loc or dst_loc is NULL or the path buffer
+ * cannot be allocated (dst_loc is not modified in that case).
+ */
+int32_t
+build_backend_loc(uuid_t gfid, loc_t *src_loc, loc_t *dst_loc)
+{
+    static uuid_t root = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1};
+    char gfid_buf[GFID_STR_LEN + 1] = {
+        0,
+    };
+    char *path = NULL;
+
+    GF_VALIDATE_OR_GOTO("metadisp", src_loc, out);
+    GF_VALIDATE_OR_GOTO("metadisp", dst_loc, out);
+
+    // allocate up front so that a failure leaves dst_loc untouched
+    path = GF_CALLOC(GFID_STR_LEN + 1, sizeof(char),
+                     gf_common_mt_char);  // freed via loc_wipe
+    if (!path)
+        goto out;
+
+    uuid_utoa_r(gfid, gfid_buf);
+    path[0] = '/';
+    strncpy(path + 1, gfid_buf, GFID_STR_LEN);
+    path[GFID_STR_LEN] = 0;
+
+    loc_copy(dst_loc, src_loc);
+    memcpy(dst_loc->pargfid, root, sizeof(root));
+    GF_FREE((char *)dst_loc->path);  // we are overwriting path so nuke
+                                     // whatever loc_copy gave us
+    dst_loc->path = path;
+    if (src_loc->name)
+        dst_loc->name = strrchr(dst_loc->path, '/');
+    if (dst_loc->name)
+        dst_loc->name++;
+    return 0;
+out:
+    return -1;
+}
diff --git a/xlators/features/metadisp/src/fops-tmpl.c b/xlators/features/metadisp/src/fops-tmpl.c
new file mode 100644
index 00000000000..4385b7dd5b7
--- /dev/null
+++ b/xlators/features/metadisp/src/fops-tmpl.c
@@ -0,0 +1,10 @@
+#ifndef _CONFIG_H
+#define _CONFIG_H
+#include "config.h"
+#endif
+
+#include <glusterfs/xlator.h>
+#include "metadisp.h"
+#include "metadisp-fops.h"
+
+#pragma generate
diff --git a/xlators/features/metadisp/src/gen-fops.py b/xlators/features/metadisp/src/gen-fops.py
new file mode 100644
index 00000000000..8b5e120fdec
--- /dev/null
+++ b/xlators/features/metadisp/src/gen-fops.py
@@ -0,0 +1,160 @@
+#!/usr/bin/python
+
+import sys
+from generator import fop_subs, generate
+
+FN_METADATA_CHILD_GENERIC = """
+int32_t
+metadisp_@NAME@ (call_frame_t *frame, xlator_t *this,
+                 @LONG_ARGS@)
+{
+  METADISP_TRACE("@NAME@ metadata");
+  STACK_WIND (frame, default_@NAME@_cbk,
+              METADATA_CHILD(this), METADATA_CHILD(this)->fops->@NAME@,
+              @SHORT_ARGS@);
+  return 0;
+}
+"""
+
+FN_GENERIC_TEMPLATE = """
+int32_t
+metadisp_@NAME@ (call_frame_t *frame, xlator_t *this,
+                          @LONG_ARGS@)
+{
+  METADISP_TRACE("@NAME@ generic");
+  STACK_WIND (frame, default_@NAME@_cbk,
+                          DATA_CHILD(this), DATA_CHILD(this)->fops->@NAME@,
+                          @SHORT_ARGS@);
+  return 0;
+}
+"""
+
+FN_DATAFD_TEMPLATE = """
+int32_t
+metadisp_@NAME@ (call_frame_t *frame, xlator_t *this,
+                          @LONG_ARGS@)
+{
+  METADISP_TRACE("@NAME@ datafd");
+  xlator_t *child = NULL;
+  child = DATA_CHILD(this);
+  STACK_WIND (frame, default_@NAME@_cbk,
+                          child, child->fops->@NAME@,
+                          @SHORT_ARGS@);
+  return 0;
+}
+"""
+
+FN_DATALOC_TEMPLATE = """
+int32_t
+metadisp_@NAME@ (call_frame_t *frame, xlator_t *this,
+                          @LONG_ARGS@)
+{
+  METADISP_TRACE("@NAME@ dataloc");
+  loc_t backend_loc = {
+      0,
+  };
+  if (build_backend_loc(loc->gfid, loc, &backend_loc)) {
+      goto unwind;
+  }
+  xlator_t *child = NULL;
+  child = DATA_CHILD(this);
+  STACK_WIND (frame, default_@NAME@_cbk,
+                          child, child->fops->@NAME@,
+                          @SHORT_ARGS@);
+  return 0;
+
+unwind:
+  STACK_UNWIND_STRICT(lookup, frame, -1, EINVAL, NULL, NULL, NULL, NULL);
+  return 0;
+}
+"""
+
+FOPS_LINE_TEMPLATE = "\t.@NAME@ = metadisp_@NAME@,"
+
+skipped = [
+    "readdir",
+    "readdirp",
+    "lookup",
+    "fsync",
+    "stat",
+    "open",
+    "create",
+    "unlink",
+    "setattr",
+    # TODO: implement "inodelk",
+]
+
+
+def gen_fops():
+    done = skipped
+
+    #
+    # these are fops that wind to the DATA_CHILD
+    #
+    # NOTE: re-written in order from google doc:
+    #          https://docs.google.com/document/d/1KEwVtSNvDhs4qb63gWx2ulCp5GJjge77NGJk4p_Ms4Q
+    for name in [
+        "writev",
+        "readv",
+        "ftruncate",
+        "zerofill",
+        "discard",
+        "seek",
+        "fstat",
+    ]:
+        done = done + [name]
+        print(generate(FN_DATAFD_TEMPLATE, name, fop_subs))
+
+    for name in ["truncate"]:
+        done = done + [name]
+        print(generate(FN_DATALOC_TEMPLATE, name, fop_subs))
+
+    # these are fops that operate solely on dentries, folders,
+    # or extended attributes. Therefore, they must always
+    # wind to METADATA_CHILD and should never perform
+    # any path rewriting
+    #
+    # NOTE: re-written in order from google doc:
+    #          https://docs.google.com/document/d/1KEwVtSNvDhs4qb63gWx2ulCp5GJjge77NGJk4p_Ms4Q
+    for name in [
+        "mkdir",
+        "symlink",
+        "link",
+        "rename",
+        "mknod",
+        "opendir",
+        # "readdir,  # special-cased
+        # "readdirp, # special-cased
+        "fsyncdir",
+        # "setattr", # special-cased
+        "readlink",
+        "fentrylk",
+        "access",
+        # TODO: these wind to both,
+        # data for backend-attributes and metadata for the rest
+        "xattrop",
+        "setxattr",
+        "getxattr",
+        "removexattr",
+        "fgetxattr",
+        "fsetxattr",
+        "fremovexattr",
+    ]:
+
+        done = done + [name]
+        print(generate(FN_METADATA_CHILD_GENERIC, name, fop_subs))
+
+    print("struct xlator_fops fops = {")
+    for name in done:
+        print(generate(FOPS_LINE_TEMPLATE, name, fop_subs))
+
+    print("};")
+
+
+for l in open(sys.argv[1], "r").readlines():
+    if l.find("#pragma generate") != -1:
+        print("/* BEGIN GENERATED CODE - DO NOT MODIFY */")
+        gen_fops()
+        print("/* END GENERATED CODE */")
+    else:
+        print(l[:-1])
diff --git a/xlators/features/metadisp/src/metadisp-create.c b/xlators/features/metadisp/src/metadisp-create.c
new file mode 100644
index 00000000000..f8c9798dd59
--- /dev/null
+++ b/xlators/features/metadisp/src/metadisp-create.c
@@ -0,0 +1,116 @@
+#include "metadisp.h"
+#include <glusterfs/call-stub.h>
+
+/**
+ * Create, like stat, is a two-step process. We send a create
+ * to the METADATA_CHILD, then send another create to the DATA_CHILD.
+ *
+ * We do the metadata child first to ensure that the ACLs are enforced.
+ */
+
+/* cbk of the DATA_CHILD create: passes the result straight to the caller */
+int32_t
+metadisp_create_dentry_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+                           int32_t op_ret, int32_t op_errno, fd_t *fd,
+                           inode_t *inode, struct iatt *buf,
+                           struct iatt *preparent, struct iatt *postparent,
+                           dict_t *xdata)
+{
+    STACK_UNWIND_STRICT(create, frame, op_ret, op_errno, fd, inode, buf,
+                        preparent, postparent, xdata);
+    return 0;
+}
+
+/* second step, run from the stub: loc is the backend loc stored in the stub */
+int32_t
+metadisp_create_resume(call_frame_t *frame, xlator_t *this, loc_t *loc,
+                       int32_t flags, mode_t mode, mode_t umask, fd_t *fd,
+                       dict_t *xdata)
+{
+    // create the backend data inode
+    STACK_WIND(frame, metadisp_create_dentry_cbk, DATA_CHILD(this),
+               DATA_CHILD(this)->fops->create, loc, flags, mode, umask, fd,
+               xdata);
+    return 0;
+}
+
+/*
+ * cbk of the METADATA_CHILD create. cookie is the stub for the second step:
+ * it is resumed on success and destroyed when the metadata create failed.
+ */
+int32_t
+metadisp_create_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+                    int32_t op_ret, int32_t op_errno, fd_t *fd, inode_t *inode,
+                    struct iatt *buf, struct iatt *preparent,
+                    struct iatt *postparent, dict_t *xdata)
+{
+    METADISP_TRACE("%d %d", op_ret, op_errno);
+    call_stub_t *stub = cookie;
+    if (op_ret != 0) {
+        // the second step never runs, so the stub must not be leaked
+        if (stub) {
+            call_stub_destroy(stub);
+        }
+        STACK_UNWIND_STRICT(create, frame, op_ret, op_errno, fd, inode, buf,
+                            preparent, postparent, xdata);
+        return 0;
+    }
+
+    if (stub == NULL) {
+        goto unwind;
+    }
+
+    if (stub->poison) {
+        call_stub_destroy(stub);
+        return 0;
+    }
+
+    call_resume(stub);
+    return 0;
+
+unwind:
+    STACK_UNWIND_STRICT(create, frame, -1, EINVAL, NULL, NULL, NULL, NULL, NULL,
+                        NULL);
+    return 0;
+}
+
+/*
+ * create fop: winds the create to the METADATA_CHILD and prepares a stub
+ * that repeats it on the DATA_CHILD for the "/<gfid>" backend loc. Every
+ * failure before the wind is unwound with EINVAL.
+ */
+int32_t
+metadisp_create(call_frame_t *frame, xlator_t *this, loc_t *loc, int32_t flags,
+                mode_t mode, mode_t umask, fd_t *fd, dict_t *xdata)
+{
+    METADISP_TRACE(".");
+
+    loc_t backend_loc = {
+        0,
+    };
+    call_stub_t *stub = NULL;
+    uuid_t *gfid_req = NULL;
+
+    RESOLVE_GFID_REQ(xdata, gfid_req, out);
+
+    if (build_backend_loc(*gfid_req, loc, &backend_loc)) {
+        goto unwind;
+    }
+
+    stub = fop_create_stub(frame, metadisp_create_resume, &backend_loc, flags,
+                           mode, umask, fd, xdata);
+    // the stub keeps its own copy of the loc (stub->args.loc)
+    loc_wipe(&backend_loc);
+
+    STACK_WIND_COOKIE(frame, metadisp_create_cbk, stub, METADATA_CHILD(this),
+                      METADATA_CHILD(this)->fops->create, loc, flags, mode,
+                      umask, fd, xdata);
+    return 0;
+
+out:
+    // RESOLVE_GFID_REQ bailed out: the caller still needs an answer
+unwind:
+    STACK_UNWIND_STRICT(create, frame, -1, EINVAL, NULL, NULL, NULL, NULL, NULL,
+                        NULL);
+    return 0;
+}
diff --git a/xlators/features/metadisp/src/metadisp-fops.h b/xlators/features/metadisp/src/metadisp-fops.h
new file mode 100644
index 00000000000..56dd427cf34
--- /dev/null
+++ b/xlators/features/metadisp/src/metadisp-fops.h
@@ -0,0 +1,51 @@
+#ifndef GF_METADISP_FOPS_H_
+#define GF_METADISP_FOPS_H_
+
+#include <glusterfs/xlator.h>
+#include <glusterfs/dict.h>
+#include <glusterfs/glusterfs.h>
+
+#include <sys/types.h>
+
+/* The fops declared here are each implemented in their own source file.
+ * Every other fop is generated into fops.c by gen-fops.py. */
+
+int
+metadisp_readdir(call_frame_t *frame, xlator_t *this, fd_t *fd, size_t size,
+                 off_t off, dict_t *xdata);
+
+int
+metadisp_readdirp(call_frame_t *frame, xlator_t *this, fd_t *fd, size_t size,
+                  off_t off, dict_t *dict);
+
+int
+metadisp_lookup(call_frame_t *frame, xlator_t *this, loc_t *loc, dict_t *xdata);
+
+int
+metadisp_create(call_frame_t *frame, xlator_t *this, loc_t *loc, int32_t flags,
+                mode_t mode, mode_t umask, fd_t *fd, dict_t *xdata);
+
+int
+metadisp_open(call_frame_t *frame, xlator_t *this, loc_t *loc, int32_t flags,
+              fd_t *fd, dict_t *xdata);
+
+int
+metadisp_stat(call_frame_t *frame, xlator_t *this, loc_t *loc, dict_t *xdata);
+
+int
+metadisp_inodelk(call_frame_t *frame, xlator_t *this, const char *volume,
+                 loc_t *loc, int32_t cmd, struct gf_flock *lock, dict_t *xdata);
+
+int
+metadisp_fsync(call_frame_t *frame, xlator_t *this, fd_t *fd, int32_t flags,
+               dict_t *xdata);
+
+int
+metadisp_unlink(call_frame_t *frame, xlator_t *this, loc_t *loc, int xflag,
+                dict_t *xdata);
+
+int
+metadisp_setattr(call_frame_t *frame, xlator_t *this, loc_t *loc,
+                 struct iatt *stbuf, int32_t valid, dict_t *xdata);
+
+#endif
diff --git a/xlators/features/metadisp/src/metadisp-fsync.c b/xlators/features/metadisp/src/metadisp-fsync.c
new file mode 100644
index 00000000000..2e46fa84eac
--- /dev/null
+++ b/xlators/features/metadisp/src/metadisp-fsync.c
@@ -0,0 +1,65 @@
+
+#include "metadisp.h"
+#include <glusterfs/call-stub.h>
+
+/* second step, run from the stub: fsync the DATA_CHILD */
+int32_t
+metadisp_fsync_resume(call_frame_t *frame, xlator_t *this, fd_t *fd,
+                      int32_t flags, dict_t *xdata)
+{
+    STACK_WIND(frame, default_fsync_cbk, DATA_CHILD(this),
+               DATA_CHILD(this)->fops->fsync, fd, flags, xdata);
+    return 0;
+}
+
+/*
+ * cbk of the METADATA_CHILD fsync. cookie is the stub for the second step
+ * and may be NULL: on success the stub is resumed, on failure it is
+ * destroyed and the error is unwound.
+ */
+int32_t
+metadisp_fsync_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+                   int32_t op_ret, int32_t op_errno, struct iatt *prebuf,
+                   struct iatt *postbuf, dict_t *xdata)
+{
+    call_stub_t *stub = cookie;
+
+    if (op_ret != 0) {
+        goto unwind;
+    }
+
+    if (!stub) {
+        // the data fsync cannot be issued: do not report success
+        op_ret = -1;
+        op_errno = EINVAL;
+        goto unwind;
+    }
+
+    if (stub->poison) {
+        call_stub_destroy(stub);
+        stub = NULL;
+        return 0;
+    }
+
+    call_resume(stub);
+    return 0;
+
+unwind:
+    if (stub) {
+        call_stub_destroy(stub);
+    }
+    STACK_UNWIND_STRICT(fsync, frame, op_ret, op_errno, prebuf, postbuf, xdata);
+    return 0;
+}
+
+/* fsync fop: fsync the METADATA_CHILD first, then (via the stub) the data */
+int32_t
+metadisp_fsync(call_frame_t *frame, xlator_t *this, fd_t *fd, int32_t flags,
+               dict_t *xdata)
+{
+    call_stub_t *stub = NULL;
+    stub = fop_fsync_stub(frame, metadisp_fsync_resume, fd, flags, xdata);
+    STACK_WIND_COOKIE(frame, metadisp_fsync_cbk, stub, METADATA_CHILD(this),
+                      METADATA_CHILD(this)->fops->fsync, fd, flags, xdata);
+    return 0;
+}
diff --git a/xlators/features/metadisp/src/metadisp-lookup.c b/xlators/features/metadisp/src/metadisp-lookup.c
new file mode 100644
index 00000000000..27d90c9f746
--- /dev/null
+++ b/xlators/features/metadisp/src/metadisp-lookup.c
@@ -0,0 +1,103 @@
+#include "metadisp.h"
+#include <glusterfs/call-stub.h>
+
+/**
+ * Lookup, like stat, is a two-step process for grabbing the metadata details
+ * as well as the data details.
+ */
+
+/* cbk of the DATA_CHILD lookup: a missing backend object becomes ENODATA */
+int32_t
+metadisp_backend_lookup_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+                            int32_t op_ret, int32_t op_errno, inode_t *inode,
+                            struct iatt *buf, dict_t *xdata,
+                            struct iatt *postparent)
+{
+    METADISP_TRACE("backend_lookup_cbk");
+    if (op_errno == ENOENT) {
+        op_errno = ENODATA;
+        op_ret = -1;
+    }
+    STACK_UNWIND_STRICT(lookup, frame, op_ret, op_errno, inode, buf, xdata,
+                        postparent);
+    return 0;
+}
+
+/* second step, run from the stub: look the object up on the DATA_CHILD under
+ * its backend loc */
+int32_t
+metadisp_backend_lookup_resume(call_frame_t *frame, xlator_t *this, loc_t *loc,
+                               dict_t *xdata)
+{
+    METADISP_TRACE("backend_lookup_resume");
+    loc_t backend_loc = {
+        0,
+    };
+    if (build_backend_loc(loc->gfid, loc, &backend_loc)) {
+        goto unwind;
+    }
+
+    STACK_WIND(frame, metadisp_backend_lookup_cbk, DATA_CHILD(this),
+               DATA_CHILD(this)->fops->lookup, &backend_loc, xdata);
+    // release what build_backend_loc acquired; not needed past the wind
+    loc_wipe(&backend_loc);
+    return 0;
+
+unwind:
+    STACK_UNWIND_STRICT(lookup, frame, -1, EINVAL, NULL, NULL, NULL, NULL);
+    return 0;
+}
+
+/*
+ * cbk of the METADATA_CHILD lookup. Only regular files continue to the
+ * DATA_CHILD (by resuming the stub passed as cookie); everything else, and
+ * every failure, is unwound from here.
+ */
+int32_t
+metadisp_lookup_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+                    int32_t op_ret, int32_t op_errno, inode_t *inode,
+                    struct iatt *buf, dict_t *xdata, struct iatt *postparent)
+{
+    METADISP_TRACE("%d %d", op_ret, op_errno);
+    call_stub_t *stub = NULL;
+    stub = cookie;
+
+    if (op_ret != 0) {
+        goto unwind;
+    }
+
+    if (!IA_ISREG(buf->ia_type)) {
+        goto unwind;
+    } else if (!stub) {
+        // a failure needs op_ret -1, not just an errno
+        op_ret = -1;
+        op_errno = EINVAL;
+        goto unwind;
+    }
+
+    METADISP_TRACE("resuming stub");
+
+    // memcpy(stub->args.loc.gfid, buf->ia_gfid, sizeof(uuid_t));
+    call_resume(stub);
+    return 0;
+unwind:
+    METADISP_TRACE("unwinding %d %d", op_ret, op_errno);
+    STACK_UNWIND_STRICT(lookup, frame, op_ret, op_errno, inode, buf, xdata,
+                        postparent);
+    if (stub) {
+        call_stub_destroy(stub);
+    }
+    return 0;
+}
+
+/* lookup fop: ask the METADATA_CHILD first; see metadisp_lookup_cbk */
+int32_t
+metadisp_lookup(call_frame_t *frame, xlator_t *this, loc_t *loc, dict_t *xdata)
+{
+    METADISP_TRACE("lookup");
+    call_stub_t *stub = NULL;
+    stub = fop_lookup_stub(frame, metadisp_backend_lookup_resume, loc, xdata);
+    STACK_WIND_COOKIE(frame, metadisp_lookup_cbk, stub, METADATA_CHILD(this),
+                      METADATA_CHILD(this)->fops->lookup, loc, xdata);
+    return 0;
+}
diff --git a/xlators/features/metadisp/src/metadisp-open.c b/xlators/features/metadisp/src/metadisp-open.c
new file mode 100644
index 00000000000..64814afe636
--- /dev/null
+++ b/xlators/features/metadisp/src/metadisp-open.c
@@ -0,0 +1,87 @@
+#include <glusterfs/call-stub.h>
+#include "metadisp.h"
+
+/*
+ * cbk shared by both opens. For the METADATA_CHILD open the cookie is the
+ * stub that opens the DATA_CHILD next; for the DATA_CHILD open the cookie is
+ * NULL and the result is unwound to the caller.
+ */
+int32_t
+metadisp_open_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+                  int32_t op_ret, int32_t op_errno, fd_t *fd, dict_t *xdata)
+{
+    METADISP_TRACE("got open results %d %d", op_ret, op_errno);
+
+    call_stub_t *stub = NULL;
+    if (cookie) {
+        stub = cookie;
+    }
+
+    if (op_ret != 0) {
+        goto unwind;
+    }
+
+    if (!stub) {
+        goto unwind;
+    }
+
+    if (stub->poison) {
+        call_stub_destroy(stub);
+        stub = NULL;
+        return 0;
+    }
+
+    call_resume(stub);
+    return 0;
+
+unwind:
+    if (stub) {
+        call_stub_destroy(stub);
+    }
+    STACK_UNWIND_STRICT(open, frame, op_ret, op_errno, fd, xdata);
+    return 0;
+}
+
+/* second step, run from the stub: open the backend loc on the DATA_CHILD */
+int32_t
+metadisp_open_resume(call_frame_t *frame, xlator_t *this, loc_t *loc,
+                     int32_t flags, fd_t *fd, dict_t *xdata)
+{
+    STACK_WIND_COOKIE(frame, metadisp_open_cbk, NULL, DATA_CHILD(this),
+                      DATA_CHILD(this)->fops->open, loc, flags, fd, xdata);
+    return 0;
+}
+
+/* open fop: open on the METADATA_CHILD, then on the DATA_CHILD */
+int32_t
+metadisp_open(call_frame_t *frame, xlator_t *this, loc_t *loc, int32_t flags,
+              fd_t *fd, dict_t *xdata)
+{
+    call_stub_t *stub = NULL;
+    int32_t op_errno = EINVAL;
+    loc_t backend_loc = {
+        0,
+    };
+
+    if (build_backend_loc(loc->gfid, loc, &backend_loc)) {
+        goto unwind;
+    }
+
+    stub = fop_open_stub(frame, metadisp_open_resume, &backend_loc, flags, fd,
+                         xdata);
+    // the stub keeps its own copy of the loc (stub->args.loc)
+    loc_wipe(&backend_loc);
+    if (!stub) {
+        // metadisp_open_cbk treats a NULL cookie as "data open done", so
+        // winding without a stub would skip the DATA_CHILD silently
+        op_errno = ENOMEM;
+        goto unwind;
+    }
+
+    STACK_WIND_COOKIE(frame, metadisp_open_cbk, stub, METADATA_CHILD(this),
+                      METADATA_CHILD(this)->fops->open, loc, flags, fd, xdata);
+    return 0;
+unwind:
+    STACK_UNWIND_STRICT(open, frame, -1, op_errno, NULL, NULL);
+    return 0;
+}
diff --git a/xlators/features/metadisp/src/metadisp-readdir.c b/xlators/features/metadisp/src/metadisp-readdir.c
new file mode 100644
index 00000000000..5f840b1e88f
--- /dev/null
+++ b/xlators/features/metadisp/src/metadisp-readdir.c
@@ -0,0 +1,78 @@
+#include "metadisp.h"
+
+/**
+ * With a change to the posix xlator, readdir and readdirp are shockingly
+ * simple.
+ *
+ * The issue with separating the backend data of the files
+ * with the metadata is that readdirs must now read from multiple sources
+ * to coalesce the directory entries.
+ *
+ * The way we do this is to tell the METADATA_CHILD that when it's
+ * running readdirp, each file entry should have a stat wound to
+ * 'stat-source-of-truth'.
+ *
+ * see metadisp_stat for how it handles winds _from_posix.
+ */
+
+/*
+ * Shared body of readdir and readdirp: sets "stat-source-of-truth" to this
+ * xlator in xdata and winds a readdirp to the METADATA_CHILD. When the caller
+ * passed no xdata a dict is created for the wind and unref'd afterwards; if
+ * that dict cannot be created the fop is unwound with ENOMEM.
+ */
+static int32_t
+metadisp_wind_readdirp(call_frame_t *frame, xlator_t *this, fd_t *fd,
+                       size_t size, off_t off, dict_t *xdata)
+{
+    dict_t *own_xdata = NULL;
+    GF_UNUSED int32_t ret;
+
+    if (!xdata) {
+        own_xdata = dict_new();
+        if (!own_xdata) {
+            STACK_UNWIND_STRICT(readdirp, frame, -1, ENOMEM, NULL, NULL);
+            return 0;
+        }
+        xdata = own_xdata;
+    }
+
+    // ret = dict_set_int32 (xdata, "list-xattr", 1);
+
+    // I'm my own source of truth!
+    ret = dict_set_static_ptr(xdata, "stat-source-of-truth", (void *)this);
+
+    STACK_WIND(frame, default_readdirp_cbk, METADATA_CHILD(this),
+               METADATA_CHILD(this)->fops->readdirp, fd, size, off, xdata);
+
+    // drop only the reference taken by dict_new() above
+    if (own_xdata) {
+        dict_unref(own_xdata);
+    }
+    return 0;
+}
+
+int32_t
+metadisp_readdir(call_frame_t *frame, xlator_t *this, fd_t *fd, size_t size,
+                 off_t off, dict_t *xdata)
+{
+    METADISP_TRACE(".");
+    /*
+     * Always use readdirp, even if the original was readdir. Why? Because NFS.
+     * There are multiple translations between Gluster, UNIX, and NFS stat
+     * structures in that path. One of them uses the type etc. from the stat
+     * structure, which is only filled in by readdirp. If we use readdir, the
+     * entries do actually go all the way back to the client and are visible in
+     * getdents, but then the readdir throws them away because of the
+     * uninitialized type.
+     */
+    return metadisp_wind_readdirp(frame, this, fd, size, off, xdata);
+}
+
+int32_t
+metadisp_readdirp(call_frame_t *frame, xlator_t *this, fd_t *fd, size_t size,
+                  off_t off, dict_t *xdata)
+{
+    METADISP_TRACE(".");
+    return metadisp_wind_readdirp(frame, this, fd, size, off, xdata);
+}
diff --git a/xlators/features/metadisp/src/metadisp-setattr.c b/xlators/features/metadisp/src/metadisp-setattr.c
new file mode 100644
index 00000000000..6991cf644f3
--- /dev/null
+++ b/xlators/features/metadisp/src/metadisp-setattr.c
@@ -0,0 +1,103 @@
+#include "metadisp.h"
+#include <glusterfs/call-stub.h>
+
+/* cbk of the DATA_CHILD setattr: a missing backend object becomes ENODATA */
+int32_t
+metadisp_backend_setattr_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+                             int32_t op_ret, int32_t op_errno,
+                             struct iatt *statpre, struct iatt *statpost,
+                             dict_t *xdata)
+
+{
+    METADISP_TRACE("backend_setattr_cbk");
+    if (op_errno == ENOENT) {
+        op_errno = ENODATA;
+        op_ret = -1;
+    }
+    STACK_UNWIND_STRICT(setattr, frame, op_ret, op_errno, statpre, statpost,
+                        xdata);
+    return 0;
+}
+
+/* second step, run from the stub: repeat the setattr on the DATA_CHILD under
+ * the object's backend loc */
+int32_t
+metadisp_backend_setattr_resume(call_frame_t *frame, xlator_t *this, loc_t *loc,
+                                struct iatt *stbuf, int32_t valid,
+                                dict_t *xdata)
+
+{
+    METADISP_TRACE("backend_setattr_resume");
+    loc_t backend_loc = {
+        0,
+    };
+    if (build_backend_loc(loc->gfid, loc, &backend_loc)) {
+        goto unwind;
+    }
+
+    STACK_WIND(frame, metadisp_backend_setattr_cbk, DATA_CHILD(this),
+               DATA_CHILD(this)->fops->setattr, &backend_loc, stbuf, valid,
+               xdata);
+    // release what build_backend_loc acquired; not needed past the wind
+    loc_wipe(&backend_loc);
+    return 0;
+
+unwind:
+    STACK_UNWIND_STRICT(setattr, frame, -1, EINVAL, NULL, NULL, NULL);
+    return 0;
+}
+
+/*
+ * cbk of the METADATA_CHILD setattr. Only regular files continue to the
+ * DATA_CHILD (by resuming the stub passed as cookie); everything else, and
+ * every failure, is unwound from here.
+ */
+int32_t
+metadisp_setattr_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+                     int32_t op_ret, int32_t op_errno, struct iatt *statpre,
+                     struct iatt *statpost, dict_t *xdata)
+{
+    METADISP_TRACE("%d %d", op_ret, op_errno);
+    call_stub_t *stub = NULL;
+    stub = cookie;
+
+    if (op_ret != 0) {
+        goto unwind;
+    }
+
+    if (!IA_ISREG(statpost->ia_type)) {
+        goto unwind;
+    } else if (!stub) {
+        // a failure needs op_ret -1, not just an errno
+        op_ret = -1;
+        op_errno = EINVAL;
+        goto unwind;
+    }
+
+    METADISP_TRACE("resuming stub");
+    call_resume(stub);
+    return 0;
+unwind:
+    METADISP_TRACE("unwinding %d %d", op_ret, op_errno);
+    STACK_UNWIND_STRICT(setattr, frame, op_ret, op_errno, statpre, statpost,
+                        xdata);
+    if (stub) {
+        call_stub_destroy(stub);
+    }
+    return 0;
+}
+
+/* setattr fop: ask the METADATA_CHILD first; see metadisp_setattr_cbk */
+int32_t
+metadisp_setattr(call_frame_t *frame, xlator_t *this, loc_t *loc,
+                 struct iatt *stbuf, int32_t valid, dict_t *xdata)
+{
+    METADISP_TRACE("setattr");
+    call_stub_t *stub = NULL;
+    stub = fop_setattr_stub(frame, metadisp_backend_setattr_resume, loc, stbuf,
+                            valid, xdata);
+    STACK_WIND_COOKIE(frame, metadisp_setattr_cbk, stub, METADATA_CHILD(this),
+                      METADATA_CHILD(this)->fops->setattr, loc, stbuf, valid,
+                      xdata);
+    return 0;
+}
diff --git a/xlators/features/metadisp/src/metadisp-stat.c b/xlators/features/metadisp/src/metadisp-stat.c
new file mode 100644
index 00000000000..b06d0dbcddd
--- /dev/null
+++ b/xlators/features/metadisp/src/metadisp-stat.c
@@ -0,0 +1,146 @@
+#include "metadisp.h"
+#include <glusterfs/call-stub.h>
+
+/**
+ * The stat flow in METADISP is complicated because we must
+ * ensure a few things:
+ *    1. stat, on the path within the metadata layer,
+ *       MUST get the backend FD of the data layer.
+ *        --- we wind to the metadata layer, then the data layer.
+ *
+ *    2. the metadata layer MUST be able to ask the data
+ *       layer for stat information.
+ *        --- this is 'syncop-internal-from-posix'
+ *
+ *    3. when the metadata exists BUT the data is missing,
+ *       we MUST mark the backend file as bad and heal it.
+ */
+
+/* cbk of the DATA_CHILD stat: a missing backend object becomes ENODATA */
+int32_t
+metadisp_stat_backend_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+                          int32_t op_ret, int32_t op_errno, struct iatt *buf,
+                          dict_t *xdata)
+{
+    METADISP_TRACE("got backend stat results %d %d", op_ret, op_errno);
+    if (op_errno == ENOENT) {
+        // this frame carries a stat, so it must be unwound as a stat
+        STACK_UNWIND_STRICT(stat, frame, -1, ENODATA, NULL, NULL);
+        return 0;
+    }
+    STACK_UNWIND_STRICT(stat, frame, op_ret, op_errno, buf, xdata);
+    return 0;
+}
+
+/* second step, run from the stub: loc is the backend loc stored in the stub */
+int32_t
+metadisp_stat_resume(call_frame_t *frame, xlator_t *this, loc_t *loc,
+                     dict_t *xdata)
+{
+    METADISP_TRACE("winding stat to path %s", loc->path);
+    if (gf_uuid_is_null(loc->gfid)) {
+        METADISP_TRACE("bad object, sending EUCLEAN");
+        STACK_UNWIND_STRICT(stat, frame, -1, EUCLEAN, NULL, NULL);
+        return 0;
+    }
+
+    STACK_WIND(frame, metadisp_stat_backend_cbk, DATA_CHILD(this),
+               DATA_CHILD(this)->fops->stat, loc, xdata);
+    return 0;
+}
+
+/*
+ * cbk of the METADATA_CHILD stat. Only regular files continue to the
+ * DATA_CHILD (by resuming the stub passed as cookie); everything else, and
+ * every failure, is unwound from here.
+ */
+int32_t
+metadisp_stat_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+                  int32_t op_ret, int32_t op_errno, struct iatt *buf,
+                  dict_t *xdata)
+{
+    call_stub_t *stub = NULL;
+
+    METADISP_TRACE("got stat results %d %d", op_ret, op_errno);
+
+    if (cookie) {
+        stub = cookie;
+    }
+
+    if (op_ret != 0) {
+        goto unwind;
+    }
+
+    // only use the stub for the files
+    if (!IA_ISREG(buf->ia_type)) {
+        goto unwind;
+    }
+
+    if (!stub) {
+        // the data stat cannot be issued: do not report success
+        op_ret = -1;
+        op_errno = EINVAL;
+        goto unwind;
+    }
+
+    if (stub->poison) {
+        call_stub_destroy(stub);
+        stub = NULL;
+        return 0;
+    }
+
+    call_resume(stub);
+    return 0;
+
+unwind:
+    if (stub) {
+        call_stub_destroy(stub);
+    }
+    STACK_UNWIND_STRICT(stat, frame, op_ret, op_errno, buf, xdata);
+    return 0;
+}
+
+/*
+ * stat fop: requests tagged "syncop-internal-from-posix" go straight to the
+ * DATA_CHILD; all others are wound to the METADATA_CHILD first.
+ */
+int32_t
+metadisp_stat(call_frame_t *frame, xlator_t *this, loc_t *loc, dict_t *xdata)
+{
+    call_stub_t *stub = NULL;
+    int32_t ret = 0;
+    loc_t backend_loc = {
+        0,
+    };
+    METADISP_FILTER_ROOT(stat, loc, xdata);
+
+    if (build_backend_loc(loc->gfid, loc, &backend_loc)) {
+        goto unwind;
+    }
+
+    if (dict_get_int32(xdata, "syncop-internal-from-posix", &ret) == 0) {
+        // if we've just been sent a stat from posix, then we know
+        // that we must send down a stat for a file to the second child.
+        //
+        // that means we can skip the stat for the first child and just
+        // send to the data disk.
+        METADISP_TRACE("got syncop-internal-from-posix");
+        STACK_WIND(frame, default_stat_cbk, DATA_CHILD(this),
+                   DATA_CHILD(this)->fops->stat, &backend_loc, xdata);
+        loc_wipe(&backend_loc);
+        return 0;
+    }
+
+    // we do not know if the request is for a file, folder, etc. wind
+    // to first child to find out.
+    stub = fop_stat_stub(frame, metadisp_stat_resume, &backend_loc, xdata);
+    // the stub keeps its own copy of the loc (stub->args.loc)
+    loc_wipe(&backend_loc);
+    METADISP_TRACE("winding stat to first child %s", loc->path);
+    STACK_WIND_COOKIE(frame, metadisp_stat_cbk, stub, METADATA_CHILD(this),
+                      METADATA_CHILD(this)->fops->stat, loc, xdata);
+    return 0;
+unwind:
+    STACK_UNWIND_STRICT(stat, frame, -1, EINVAL, NULL, NULL);
+    return 0;
+}
diff --git a/xlators/features/metadisp/src/metadisp-unlink.c b/xlators/features/metadisp/src/metadisp-unlink.c
new file mode 100644
index 00000000000..1f6a8eb35ce
--- /dev/null
+++ b/xlators/features/metadisp/src/metadisp-unlink.c
@@ -0,0 +1,204 @@
+
+#include "metadisp.h"
+#include <glusterfs/call-stub.h>
+
+/**
+ * The unlink flow in metadisp is complicated because we must
+ * ensure that UNLINK causes both the metadata objects
+ * to get removed and the data objects to get removed.
+ */
+
+/* second step, run from the stub: loc is the backend loc stored in the stub */
+int32_t
+metadisp_unlink_resume(call_frame_t *frame, xlator_t *this, loc_t *loc,
+                       int xflag, dict_t *xdata)
+{
+    METADISP_TRACE("winding backend unlink to path %s", loc->path);
+    STACK_WIND(frame, default_unlink_cbk, DATA_CHILD(this),
+               DATA_CHILD(this)->fops->unlink, loc, xflag, xdata);
+    return 0;
+}
+
+/*
+ * cbk of the METADATA_CHILD unlink. The backend object is removed (by
+ * resuming the stub passed as cookie) only when the reported link count is
+ * at most 1; otherwise the metadata result is unwound as is.
+ */
+int32_t
+metadisp_unlink_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+                    int32_t op_ret, int32_t op_errno, struct iatt *preparent,
+                    struct iatt *postparent, dict_t *xdata)
+{
+    METADISP_TRACE(". %d %d", op_ret, op_errno);
+
+    int ret = 0;
+    call_stub_t *stub = NULL;
+    uint32_t nlink = 0;
+
+    if (cookie) {
+        stub = cookie;
+    }
+
+    if (op_ret != 0) {
+        goto unwind;
+    }
+
+    if (stub && stub->poison) {
+        call_stub_destroy(stub);
+        stub = NULL;
+        return 0;
+    }
+
+    ret = dict_get_uint32(xdata, GF_RESPONSE_LINK_COUNT_XDATA, &nlink);
+    if (ret != 0) {
+        op_errno = EINVAL;
+        op_ret = -1;
+        goto unwind;
+    }
+    METADISP_TRACE("frontend hardlink count %d %u", ret, nlink);
+    if (nlink > 1) {
+        goto unwind;
+    }
+
+    if (!stub) {
+        // the backend object cannot be removed: do not report success
+        op_ret = -1;
+        op_errno = EINVAL;
+        goto unwind;
+    }
+
+    call_resume(stub);
+    return 0;
+
+unwind:
+    if (stub) {
+        call_stub_destroy(stub);
+    }
+    STACK_UNWIND_STRICT(unlink, frame, op_ret, op_errno, preparent, postparent,
+                        xdata);
+    return 0;
+}
+
+/*
+ * cbk of the lookup issued when unlink was called without a gfid: copies the
+ * gfids from the lookup result into the stub's loc and resumes the stub.
+ */
+int32_t
+metadisp_unlink_lookup_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+                           int32_t op_ret, int32_t op_errno, inode_t *inode,
+                           struct iatt *buf, dict_t *xdata,
+                           struct iatt *postparent)
+{
+    call_stub_t *stub = NULL;
+
+    if (cookie) {
+        stub = cookie;
+    }
+
+    if (op_ret != 0) {
+        goto unwind;
+    }
+
+    // fail fast on empty gfid so we don't loop forever
+    if (gf_uuid_is_null(buf->ia_gfid)) {
+        op_ret = -1;
+        op_errno = ENODATA;
+        goto unwind;
+    }
+
+    if (!stub) {
+        // nothing to fill in or resume
+        op_ret = -1;
+        op_errno = EINVAL;
+        goto unwind;
+    }
+
+    // fill gfid since the stub is incomplete
+    memcpy(stub->args.loc.gfid, buf->ia_gfid, sizeof(uuid_t));
+    memcpy(stub->args.loc.pargfid, postparent->ia_gfid, sizeof(uuid_t));
+
+    if (stub->poison) {
+        call_stub_destroy(stub);
+        stub = NULL;
+        return 0;
+    }
+
+    call_resume(stub);
+    return 0;
+
+unwind:
+    if (stub) {
+        call_stub_destroy(stub);
+    }
+    STACK_UNWIND_STRICT(unlink, frame, op_ret, op_errno, NULL, NULL, NULL);
+    return 0;
+}
+
+/*
+ * unlink fop. Without a gfid in loc a lookup is wound first and this
+ * function is re-entered through a stub. With a gfid the unlink is wound to
+ * the METADATA_CHILD, asking for the link count in the response so that
+ * metadisp_unlink_cbk can decide whether the backend object goes too.
+ */
+int32_t
+metadisp_unlink(call_frame_t *frame, xlator_t *this, loc_t *loc, int xflag,
+                dict_t *xdata)
+{
+    call_stub_t *stub = NULL;
+    dict_t *own_xdata = NULL;
+    int32_t op_errno = EINVAL;
+    loc_t backend_loc = {
+        0,
+    };
+
+    if (gf_uuid_is_null(loc->gfid)) {
+        METADISP_TRACE("winding lookup for unlink to path %s", loc->path);
+
+        // loop back to ourselves after a lookup
+        stub = fop_unlink_stub(frame, metadisp_unlink, loc, xflag, xdata);
+        STACK_WIND_COOKIE(frame, metadisp_unlink_lookup_cbk, stub,
+                          METADATA_CHILD(this),
+                          METADATA_CHILD(this)->fops->lookup, loc, xdata);
+        return 0;
+    }
+
+    if (build_backend_loc(loc->gfid, loc, &backend_loc)) {
+        goto unwind;
+    }
+
+    //
+    // ensure we get the link count on the unlink response, so we can
+    // account for hardlinks before winding to the backend.
+    // NOTE:
+    //   multiple xlators use GF_REQUEST_LINK_COUNT_XDATA. confirmation
+    //   is needed to ensure that multiple requests will work in the same
+    //   xlator stack.
+    //
+    if (!xdata) {
+        own_xdata = dict_new();
+        if (!own_xdata) {
+            loc_wipe(&backend_loc);
+            op_errno = ENOMEM;
+            goto unwind;
+        }
+        xdata = own_xdata;
+    }
+    dict_set_int32(xdata, GF_REQUEST_LINK_COUNT_XDATA, 1);
+
+    METADISP_TRACE("winding frontend unlink to path %s", loc->path);
+    stub = fop_unlink_stub(frame, metadisp_unlink_resume, &backend_loc, xflag,
+                           xdata);
+    // the stub keeps its own copy of the loc (stub->args.loc)
+    loc_wipe(&backend_loc);
+
+    STACK_WIND_COOKIE(frame, metadisp_unlink_cbk, stub, METADATA_CHILD(this),
+                      METADATA_CHILD(this)->fops->unlink, loc, xflag, xdata);
+    // drop only the reference taken by dict_new() above
+    if (own_xdata) {
+        dict_unref(own_xdata);
+    }
+    return 0;
+unwind:
+    STACK_UNWIND_STRICT(unlink, frame, -1, op_errno, NULL, NULL, NULL);
+    return 0;
+}
diff --git a/xlators/features/metadisp/src/metadisp.c b/xlators/features/metadisp/src/metadisp.c
new file mode 100644
index 00000000000..3c8f150cebc
--- /dev/null
+++ b/xlators/features/metadisp/src/metadisp.c
@@ -0,0 +1,46 @@
+#include <glusterfs/call-stub.h>
+
+#include "metadisp.h"
+#include "metadisp-fops.h"
+
+int32_t
+init(xlator_t *this)
+{
+    /* xlator init. DATA_CHILD() is SECOND_CHILD(): two subvolumes required. */
+    if (!this->children || !this->children->next) {
+        gf_log(this->name, GF_LOG_ERROR,
+               "not configured with two children. exiting");
+        return -1;
+    }
+    /* Having no parent is only worth a warning; init still succeeds. */
+    if (!this->parents) {
+        gf_log(this->name, GF_LOG_WARNING, "dangling volume. check volfile ");
+    }
+    return 0;
+}
+
+void
+fini(xlator_t *this)
+{
+    /* Teardown hook: intentionally empty, init() allocates nothing to free. */
+}
+
+/* fop table of this translator; its contents are defined in fops.c */
+struct xlator_fops fops;
+/* No callbacks are registered: the table is empty. */
+struct xlator_cbks cbks = {};
+/* No tunables: the table holds nothing but its NULL-key entry. */
+struct volume_options options[] = {
+    {.key = {NULL}},
+};
+/* Descriptor that ties the hooks and tables above together as "metadisp". */
+xlator_api_t xlator_api = {
+    .init = init,
+    .fini = fini,
+    .fops = &fops,
+    .cbks = &cbks,
+    .options = options,
+    .op_version = {1},
+    .identifier = "metadisp",
+    .category = GF_EXPERIMENTAL,
+};
diff --git a/xlators/features/metadisp/src/metadisp.h b/xlators/features/metadisp/src/metadisp.h
new file mode 100644
index 00000000000..c8fd7a13c04
--- /dev/null
+++ b/xlators/features/metadisp/src/metadisp.h
@@ -0,0 +1,45 @@
+/*
+   Copyright (c) 2013 Red Hat, Inc. <http://www.redhat.com>
+   This file is part of GlusterFS.
+
+   This file is licensed to you under your choice of the GNU Lesser
+   General Public License, version 3 or any later version (LGPLv3 or
+   later), or the GNU General Public License, version 2 (GPLv2), in all
+   cases as published by the Free Software Foundation.
+*/
+#ifndef GF_METADISP_H_
+#define GF_METADISP_H_
+
+#include <glusterfs/glusterfs.h>
+#include <glusterfs/logging.h>
+#include <glusterfs/dict.h>
+#include <glusterfs/xlator.h>
+#include <glusterfs/defaults.h>
+
+#define METADATA_CHILD(_this) FIRST_CHILD(_this) /* first subvolume */
+#define DATA_CHILD(_this) SECOND_CHILD(_this) /* second subvolume */
+/* Fills dst_loc from gfid and src_loc; callers treat non-zero as failure. */
+int32_t
+build_backend_loc(uuid_t gfid, loc_t *src_loc, loc_t *dst_loc);
+/* Logs under the fixed "metadisp" domain at INFO level, despite the name. */
+#define METADISP_TRACE(_args...) gf_log("metadisp", GF_LOG_INFO, _args)
+/* If loc->path is "/": wind _op to the metadata child, return 0 from caller. */
+#define METADISP_FILTER_ROOT(_op, _args...)                                    \
+    if (strcmp(loc->path, "/") == 0) {                                         \
+        STACK_WIND(frame, default_##_op##_cbk, METADATA_CHILD(this),           \
+                   METADATA_CHILD(this)->fops->_op, _args);                    \
+        return 0;                                                              \
+    }
+/* Same short-circuit, keyed on a root gfid instead of loc->path. */
+#define METADISP_FILTER_ROOT_BY_GFID(_op, _gfid, _args...)                     \
+    if (__is_root_gfid(_gfid)) {                                               \
+        STACK_WIND(frame, default_##_op##_cbk, METADATA_CHILD(this),           \
+                   METADATA_CHILD(this)->fops->_op, _args);                    \
+        return 0;                                                              \
+    }
+/* Reads the "gfid-req" pointer of _dict into _dest, else jumps to _lbl. */
+#define RESOLVE_GFID_REQ(_dict, _dest, _lbl)                                   \
+    VALIDATE_OR_GOTO(dict_get_ptr(_dict, "gfid-req", (void **)&_dest) == 0,    \
+                     _lbl)
+
+#endif /* GF_METADISP_H_ */
diff --git a/xlators/features/namespace/Makefile.am b/xlators/features/namespace/Makefile.am
new file mode 100644
index 00000000000..a985f42a877
--- /dev/null
+++ b/xlators/features/namespace/Makefile.am
@@ -0,0 +1,3 @@
+SUBDIRS = src
+# No extra files to clean at this level: the list is left empty.
+CLEANFILES =
diff --git a/xlators/features/namespace/src/Makefile.am b/xlators/features/namespace/src/Makefile.am
new file mode 100644
index 00000000000..e355d42cf4e
--- /dev/null
+++ b/xlators/features/namespace/src/Makefile.am
@@ -0,0 +1,17 @@
+xlator_LTLIBRARIES = namespace.la
+xlatordir = $(libdir)/glusterfs/$(PACKAGE_VERSION)/xlator/features
+# -module: namespace.la is linked as a loadable module.
+namespace_la_LDFLAGS = -module $(GF_XLATOR_DEFAULT_LDFLAGS)
+# One source file, linked against libglusterfs.
+namespace_la_SOURCES = namespace.c
+namespace_la_LIBADD = $(top_builddir)/libglusterfs/src/libglusterfs.la
+# namespace.h is listed as noinst, i.e. it is not installed.
+noinst_HEADERS = namespace.h
+# Include paths: libglusterfs, rpc/xdr (source and build tree), xlators/lib.
+AM_CPPFLAGS = $(GF_CPPFLAGS) -I$(top_srcdir)/libglusterfs/src \
+	-I$(top_srcdir)/rpc/xdr/src -I$(top_builddir)/rpc/xdr/src \
+	-I$(top_srcdir)/xlators/lib/src
+
+AM_CFLAGS = -Wall $(GF_CFLAGS)
+
+CLEANFILES =
diff --git a/xlators/features/namespace/src/namespace.c b/xlators/features/namespace/src/namespace.c
new file mode 100644
index 00000000000..86c5ebee900
--- /dev/null
+++ b/xlators/features/namespace/src/namespace.c
@@ -0,0 +1,1344 @@
+/*
+ * Copyright (c) 2008-2012 Red Hat, Inc. <http://www.redhat.com>
+ * This file is part of GlusterFS.
+ *
+ * This file is licensed to you under your choice of the GNU Lesser
+ * General Public License, version 3 or any later version (LGPLv3 or
+ * later), or the GNU General Public License, version 2 (GPLv2), in all
+ * cases as published by the Free Software Foundation.
+ *
+ * xlators/features/namespace:
+ *      This translator tags each request with a namespace hash,
+ *      which then can be used in later translators to track and
+ *      throttle fops per namespace.
+ */
+
+#include <sys/types.h>
+
+#include <glusterfs/defaults.h>
+#include <glusterfs/hashfn.h>
+#include <glusterfs/logging.h>
+#include "namespace.h"
+
+/* Return codes shared by parse_path() and the set_ns_from_*() helpers. */
+enum _path_parse_result {
+    PATH_PARSE_RESULT_NO_PATH = 0, /* nothing usable to derive a namespace */
+    PATH_PARSE_RESULT_FOUND = 1,   /* namespace hash is available in ns_info */
+    PATH_PARSE_RESULT_IS_GFID = 2, /* path starts with '<': gfid, not a path */
+};
+
+typedef enum _path_parse_result path_parse_result_t;
+
+/* Release an ns_local_t: wipe its embedded loc, then free the struct itself.
+ * A NULL local is accepted and ignored. */
+static inline void
+ns_local_cleanup(ns_local_t *local)
+{
+    if (local != NULL) {
+        /* The loc lives inside the struct, so wipe it before the free. */
+        loc_wipe(&local->loc);
+        GF_FREE(local);
+    }
+    /* local->stub is not touched by this function. */
+}
+
+/* Create a new ns_local: ref the inode, fake a gfid-only loc around it and
+ * stash the given stub. Returns NULL on bad arguments or failed allocation. */
+static inline ns_local_t *
+ns_local_new(call_stub_t *stub, inode_t *inode)
+{
+    ns_local_t *local = NULL;
+    loc_t loc = {
+        0,
+    };
+
+    if (!stub || !inode) {
+        goto out;
+    }
+
+    local = GF_CALLOC(1, sizeof(ns_local_t), 0);
+    if (local == NULL) {
+        goto out;
+    }
+
+    /* Set up a fake loc_t struct to give to the getxattr call. */
+    gf_uuid_copy(loc.gfid, inode->gfid);
+    loc.inode = inode_ref(inode);
+    /* If inode_ref() fails, give up. The pointer is reset after the free so
+     * the caller gets NULL rather than a dangling ns_local_t. */
+    if (!loc.inode) {
+        GF_FREE(local);
+        local = NULL;
+        goto out;
+    }
+
+    local->stub = stub;
+    local->loc = loc;
+out:
+    return local;
+}
+
+/* Hash the top-level component of `path` into `info`.
+ * Returns PATH_PARSE_RESULT_NO_PATH for a NULL or empty path and
+ * PATH_PARSE_RESULT_IS_GFID for a path starting with '<'; `info` is untouched
+ * in both cases. Anything else, "/" included, yields PATH_PARSE_RESULT_FOUND. */
+static path_parse_result_t
+parse_path(ns_info_t *info, const char *path)
+{
+    const char *top = NULL;   /* first character after the leading '/'s */
+    const char *slash = NULL; /* '/' that ends the top-level component */
+    int len = 0;
+
+    if (path == NULL || path[0] == '\0') {
+        return PATH_PARSE_RESULT_NO_PATH;
+    }
+
+    if (path[0] == '<') {
+        return PATH_PARSE_RESULT_IS_GFID;
+    }
+
+    /* Only the top-level directory matters, so step over every leading '/'. */
+    for (top = path; *top == '/'; top++) {
+    }
+
+    /* Without another '/' (as in "/directory" or the bare "/") the component
+     * runs to the end of the string. */
+    slash = strchr(top, '/');
+    if (slash != NULL) {
+        len = slash - top;
+    } else {
+        len = strlen(top);
+    }
+
+    if (len == 0) {
+        /* Nothing left after the slashes: hash "/" itself, the name the
+         * namespace config uses for the top-level namespace. */
+        info->hash = SuperFastHash("/", 1);
+    } else {
+        info->hash = SuperFastHash(top, len);
+    }
+    info->found = _gf_true;
+    return PATH_PARSE_RESULT_FOUND;
+}
+
+/* Cache namespace info stored in the stack (info) into the inode.
+ *
+ * A heap copy of *info is stored in the inode context of `inode` for `this`.
+ * Returns 0 on success, -1 if inode or this is missing, -ENOMEM if the copy
+ * cannot be allocated, otherwise the non-zero result of inode_ctx_put(). */
+static int
+ns_inode_ctx_put(inode_t *inode, xlator_t *this, ns_info_t *info)
+{
+    ns_info_t *copy = NULL;
+    uint64_t slot = 0;
+    int status = 0;
+
+    /* `this` may be NULL here, hence the fallback log domain. */
+    if (!(inode && this)) {
+        gf_log(this ? this->name : "namespace", GF_LOG_WARNING,
+               "Need a valid inode and xlator to cache ns_info.");
+        return -1;
+    }
+
+    copy = GF_CALLOC(1, sizeof(ns_info_t), 0);
+    /* If we've run out of memory, report ENOMEM. */
+    if (copy == NULL) {
+        gf_log(this->name, GF_LOG_WARNING, "No memory to cache ns_info.");
+        return -(ENOMEM);
+    }
+
+    /* The context slot takes a 64-bit value, so the pointer to the copy is
+     * converted and stored as one. */
+    *copy = *info;
+    slot = (uint64_t)(uintptr_t)copy;
+
+    status = inode_ctx_put(inode, this, slot);
+    if (status != 0) {
+        /* Storing failed, so the copy never reached the inode context:
+         * free it here. */
+        GF_FREE(copy);
+    }
+
+    return status;
+}
+
+/* Copy the namespace info cached in the inode into `info`, for use in later
+ * translators. Returns -ENOENT for a NULL inode, otherwise the result of
+ * inode_ctx_get(); `info` is only written when that result is 0. */
+static int
+ns_inode_ctx_get(inode_t *inode, xlator_t *this, ns_info_t *info)
+{
+    uint64_t slot = 0;
+    int status = -ENOENT;
+
+    /* Without an inode there is nothing to look up. */
+    if (inode == NULL) {
+        return status;
+    }
+
+    status = inode_ctx_get(inode, this, &slot);
+    if (status != 0) {
+        /* Nothing retrieved for this xlator: `info` is left as it was. */
+        return status;
+    }
+
+    /* The slot holds the pointer stored by ns_inode_ctx_put(). */
+    *info = *(ns_info_t *)(uintptr_t)slot;
+    return 0;
+}
+
+/* Callback for the getxattr(GET_ANCESTRY_PATH_KEY) wound by
+ * GET_ANCESTRY_PATH_WIND. On success the returned path is parsed into the
+ * ns_info of the parked fop's frame and cached on the inode. Whatever happens,
+ * the getxattr frame is destroyed, its local freed and the parked stub
+ * resumed (when they exist). cookie/op_errno/xdata are unused. Returns 0. */
+int32_t
+get_path_resume_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+                    int32_t op_ret, int32_t op_errno, dict_t *dict,
+                    dict_t *xdata)
+{
+    path_parse_result_t ret = PATH_PARSE_RESULT_NO_PATH;
+    call_frame_t *resume_frame = NULL;
+    ns_local_t *local = NULL;
+    call_stub_t *stub = NULL;
+    ns_info_t *info = NULL;
+    char *path = NULL;
+
+    GF_VALIDATE_OR_GOTO(this->name, frame, out);
+    local = frame->local;
+
+    GF_VALIDATE_OR_GOTO(this->name, local, out);
+    stub = local->stub;
+
+    GF_VALIDATE_OR_GOTO(this->name, stub, out);
+    /* Get the ns_info from the frame that we will eventually resume,
+     * not the frame that we're going to destroy (frame). */
+    resume_frame = stub->frame;
+
+    GF_VALIDATE_OR_GOTO(this->name, resume_frame, out);
+    GF_VALIDATE_OR_GOTO(this->name, resume_frame->root, out);
+    info = &resume_frame->root->ns_info;
+
+    GF_VALIDATE_OR_GOTO(this->name, dict, out);
+    /* A NULL dict skips straight to cleanup. If the reply does carry a value
+     * for GET_ANCESTRY_PATH_KEY (and op_ret is 0), fetch it and parse it like
+     * a path; otherwise ret stays PATH_PARSE_RESULT_NO_PATH. */
+    if (!op_ret && !dict_get_str(dict, GET_ANCESTRY_PATH_KEY, &path)) {
+        gf_log(this->name, GF_LOG_DEBUG, "G>P %s retrieved path %s",
+               uuid_utoa(local->loc.gfid), path);
+        /* Now let's parse a path, finally. */
+        ret = parse_path(info, path);
+    }
+
+    if (ret == PATH_PARSE_RESULT_FOUND) {
+        /* If we finally found namespace, then stash it. */
+        ns_inode_ctx_put(local->loc.inode, this, info);
+
+        gf_log(this->name, GF_LOG_DEBUG, "G>P %s %10u namespace found %s",
+               uuid_utoa(local->loc.inode->gfid), info->hash, path);
+    } else if (ret == PATH_PARSE_RESULT_NO_PATH) {
+        gf_log(this->name, GF_LOG_WARNING, "G>P %s has no path",
+               uuid_utoa(local->loc.inode->gfid));
+    } else if (ret == PATH_PARSE_RESULT_IS_GFID) {
+        gf_log(this->name, GF_LOG_WARNING,
+               "G>P %s winding failed, still have gfid",
+               uuid_utoa(local->loc.inode->gfid));
+    }
+
+out:
+    /* Tear down in this order: detach and destroy the getxattr frame, free
+     * the local, and only then resume the parked fop through its stub. */
+    if (frame) {
+        frame->local = NULL;
+        STACK_DESTROY(frame->root);
+    }
+
+    if (local) {
+        ns_local_cleanup(local);
+    }
+
+    if (stub) {
+        call_resume(stub);
+    }
+
+    return 0;
+}
+
+/* Set the namespace in frame->root->ns_info from a `loc_t`. Order of attempts:
+ * the namespace cached in the inode, the loc's own path string, and, if that
+ * string is a gfid path, the path that inode_path() builds. `fn` is only used
+ * in log messages. Returns the parse result; NO_PATH when tagging is off. */
+static path_parse_result_t
+set_ns_from_loc(const char *fn, call_frame_t *frame, xlator_t *this, loc_t *loc)
+{
+    path_parse_result_t ret = PATH_PARSE_RESULT_NO_PATH;
+    ns_private_t *priv = (ns_private_t *)this->private;
+    ns_info_t *info = &frame->root->ns_info;
+    char *path = NULL;
+
+    info->hash = 0;
+    info->found = _gf_false;
+
+    if (!priv->tag_namespaces) {
+        return ret;
+    }
+
+    /* This is our first pass at trying to get a path. Try getting
+     * from the inode context, then from the loc's path itself. */
+    if (!loc || !loc->path || !loc->inode) {
+        ret = PATH_PARSE_RESULT_NO_PATH;
+    } else if (!ns_inode_ctx_get(loc->inode, this, info)) {
+        ret = PATH_PARSE_RESULT_FOUND;
+    } else {
+        ret = parse_path(info, loc->path);
+        gf_log(this->name, GF_LOG_DEBUG, "%s: LOC retrieved path %s", fn,
+               loc->path);
+
+        if (ret == PATH_PARSE_RESULT_FOUND) {
+            ns_inode_ctx_put(loc->inode, this, info);
+        }
+    }
+
+    /* Keep trying by calling inode_path next, making sure to copy the loc's
+     * gfid into its inode if necessary. Parse the path it returns. */
+    if (ret == PATH_PARSE_RESULT_IS_GFID) {
+        if (gf_uuid_is_null(loc->inode->gfid)) {
+            gf_uuid_copy(loc->inode->gfid, loc->gfid);
+        }
+
+        if (inode_path(loc->inode, NULL, &path) >= 0 && path) {
+            ret = parse_path(info, path);
+            gf_log(this->name, GF_LOG_DEBUG, "%s: LOC retrieved path %s", fn,
+                   path);
+
+            if (ret == PATH_PARSE_RESULT_FOUND) {
+                ns_inode_ctx_put(loc->inode, this, info);
+            }
+        }
+
+        if (path) {
+            GF_FREE(path);
+        }
+    }
+
+    /* Report our status, and if we have a GFID, we'll eventually try a
+     * GET_ANCESTRY_PATH_KEY wind when we return from this function. */
+    if (ret == PATH_PARSE_RESULT_FOUND) {
+        gf_log(this->name, GF_LOG_DEBUG,
+               "%s: LOC %s %10u namespace found for %s", fn,
+               uuid_utoa(loc->inode->gfid), info->hash, loc->path);
+    } else if (ret == PATH_PARSE_RESULT_NO_PATH) {
+        gf_log(this->name, GF_LOG_WARNING, "%s: LOC has no path", fn);
+    } else if (ret == PATH_PARSE_RESULT_IS_GFID) {
+        /* Make sure to copy the inode's gfid for the eventual wind. */
+        if (gf_uuid_is_null(loc->inode->gfid)) {
+            gf_uuid_copy(loc->inode->gfid, loc->gfid);
+        }
+
+        gf_log(this->name, GF_LOG_DEBUG, "%s: LOC %s winding, looking for path",
+               fn, uuid_utoa(loc->inode->gfid));
+    }
+
+    return ret;
+}
+
+/* Set the namespace in frame->root->ns_info from an `fd_t`: first from the
+ * namespace cached in the fd's inode, then by parsing what inode_path()
+ * builds. `fn` is only used in log messages. Returns the parse result. */
+static path_parse_result_t
+set_ns_from_fd(const char *fn, call_frame_t *frame, xlator_t *this, fd_t *fd)
+{
+    path_parse_result_t ret = PATH_PARSE_RESULT_NO_PATH;
+    ns_private_t *priv = (ns_private_t *)this->private;
+    ns_info_t *info = &frame->root->ns_info;
+    char *path = NULL;
+
+    info->hash = 0;
+    info->found = _gf_false;
+    /* With tagging switched off the cleared ns_info is all the caller gets. */
+    if (!priv->tag_namespaces) {
+        return ret;
+    }
+
+    /* This is our first pass at trying to get a path. Try getting from the
+     * inode context, then inode_path; if that fails too, ret stays NO_PATH. */
+    if (!fd || !fd->inode) {
+        ret = PATH_PARSE_RESULT_NO_PATH;
+    } else if (!ns_inode_ctx_get(fd->inode, this, info)) {
+        ret = PATH_PARSE_RESULT_FOUND;
+    } else if (inode_path(fd->inode, NULL, &path) >= 0 && path) {
+        ret = parse_path(info, path);
+        gf_log(this->name, GF_LOG_DEBUG, "%s: FD  retrieved path %s", fn, path);
+        /* Cache a freshly parsed namespace on the inode for later calls. */
+        if (ret == PATH_PARSE_RESULT_FOUND) {
+            ns_inode_ctx_put(fd->inode, this, info);
+        }
+    }
+    /* path is freed here whenever inode_path() left one behind. */
+    if (path) {
+        GF_FREE(path);
+    }
+
+    /* Report our status, and if we have a GFID, we'll eventually try a
+     * GET_ANCESTRY_PATH_KEY wind when we return from this function. */
+    if (ret == PATH_PARSE_RESULT_FOUND) {
+        gf_log(this->name, GF_LOG_DEBUG, "%s: FD  %s %10u namespace found", fn,
+               uuid_utoa(fd->inode->gfid), info->hash);
+    } else if (ret == PATH_PARSE_RESULT_NO_PATH) {
+        gf_log(this->name, GF_LOG_WARNING, "%s: FD  has no path", fn);
+    } else if (ret == PATH_PARSE_RESULT_IS_GFID) {
+        gf_log(this->name, GF_LOG_DEBUG, "%s: FD  %s winding, looking for path",
+               fn, uuid_utoa(fd->inode->gfid));
+    }
+
+    return ret;
+}
+
+/* This macro winds a `getxattr` for GET_ANCESTRY_PATH_KEY in the case that we
+ * have to retrieve the path manually; the current fop is parked in a stub. It
+ * assumes `frame` and `this` exist plus a label called `wind`, which it jumps
+ * to, after releasing whatever it had already allocated, if a step fails. */
+#define GET_ANCESTRY_PATH_WIND(fop, inode, args...)                            \
+    do {                                                                       \
+        ns_info_t *info = &frame->root->ns_info;                               \
+        call_frame_t *new_frame = NULL;                                        \
+        ns_local_t *local = NULL;                                              \
+        call_stub_t *stub = NULL;                                              \
+                                                                               \
+        gf_log(this->name, GF_LOG_DEBUG, "    %s winding, looking for path",   \
+               uuid_utoa(inode->gfid));                                        \
+                                                                               \
+        new_frame = create_frame(this, this->ctx->pool);                       \
+        if (!new_frame) {                                                      \
+            gf_log(this->name, GF_LOG_ERROR,                                   \
+                   "Cannot allocate new call frame.");                         \
+            goto wind;                                                         \
+        }                                                                      \
+                                                                               \
+        stub = fop_##fop##_stub(frame, default_##fop, args);                   \
+        if (!stub) {                                                           \
+            gf_log(this->name, GF_LOG_ERROR,                                   \
+                   "Cannot allocate function stub.");                          \
+            STACK_DESTROY(new_frame->root); /* do not leak the frame */        \
+            goto wind;                                                         \
+        }                                                                      \
+                                                                               \
+        new_frame->root->uid = 0;                                              \
+        new_frame->root->gid = 0;                                              \
+        /* Put a phony "not found" NS info into this call. */                  \
+        new_frame->root->ns_info = *info;                                      \
+                                                                               \
+        local = ns_local_new(stub, inode);                                     \
+        if (!local) {                                                          \
+            gf_log(this->name, GF_LOG_ERROR,                                   \
+                   "Cannot allocate function local.");                         \
+            call_stub_destroy(stub); /* fop goes through wind: instead */      \
+            STACK_DESTROY(new_frame->root);                                    \
+            goto wind;                                                         \
+        }                                                                      \
+                                                                               \
+        new_frame->local = local;                                              \
+        /* Frame, resume stub and local (loc for getxattr + stub) are ready:   \
+         * call getxattr; its reply unwinds into get_path_resume_cbk. */       \
+        STACK_WIND(new_frame, get_path_resume_cbk, FIRST_CHILD(this),          \
+                   FIRST_CHILD(this)->fops->getxattr, &local->loc,             \
+                   GET_ANCESTRY_PATH_KEY, NULL);                               \
+    } while (0)
+
+int32_t
+ns_rmdir(call_frame_t *frame, xlator_t *this, loc_t *loc, int xflags,
+         dict_t *xdata)
+{
+    path_parse_result_t ret = set_ns_from_loc(__FUNCTION__, frame, this, loc);
+    /* Tag the frame with loc's namespace; IS_GFID means path still unknown. */
+    if (ret == PATH_PARSE_RESULT_IS_GFID) {
+        GET_ANCESTRY_PATH_WIND(rmdir, loc->inode, loc, xflags, xdata);
+        return 0; /* fop parked in a stub; get_path_resume_cbk resumes it */
+    }
+wind: /* direct wind; also where the macro jumps when it cannot allocate */
+    STACK_WIND(frame, default_rmdir_cbk, FIRST_CHILD(this),
+               FIRST_CHILD(this)->fops->rmdir, loc, xflags, xdata);
+    return 0;
+}
+
+int32_t
+ns_unlink(call_frame_t *frame, xlator_t *this, loc_t *loc, int xflags,
+          dict_t *xdata)
+{
+    path_parse_result_t ret = set_ns_from_loc(__FUNCTION__, frame, this, loc);
+    /* Tag the frame with loc's namespace; IS_GFID means path still unknown. */
+    if (ret == PATH_PARSE_RESULT_IS_GFID) {
+        GET_ANCESTRY_PATH_WIND(unlink, loc->inode, loc, xflags, xdata);
+        return 0; /* fop parked in a stub; get_path_resume_cbk resumes it */
+    }
+wind: /* direct wind; also where the macro jumps when it cannot allocate */
+    STACK_WIND(frame, default_unlink_cbk, FIRST_CHILD(this),
+               FIRST_CHILD(this)->fops->unlink, loc, xflags, xdata);
+    return 0;
+}
+
+int32_t
+ns_rename(call_frame_t *frame, xlator_t *this, loc_t *oldloc, loc_t *newloc,
+          dict_t *xdata)
+{
+    path_parse_result_t ret = set_ns_from_loc(__FUNCTION__, frame, this,
+                                              newloc);
+    /* The namespace is taken from newloc; oldloc is not consulted here. */
+    if (ret == PATH_PARSE_RESULT_IS_GFID) {
+        GET_ANCESTRY_PATH_WIND(rename, newloc->inode, oldloc, newloc, xdata);
+        return 0; /* fop parked in a stub; get_path_resume_cbk resumes it */
+    }
+wind: /* direct wind; also where the macro jumps when it cannot allocate */
+    STACK_WIND(frame, default_rename_cbk, FIRST_CHILD(this),
+               FIRST_CHILD(this)->fops->rename, oldloc, newloc, xdata);
+    return 0;
+}
+
+int32_t
+ns_link(call_frame_t *frame, xlator_t *this, loc_t *oldloc, loc_t *newloc,
+        dict_t *xdata)
+{
+    path_parse_result_t ret = set_ns_from_loc(__FUNCTION__, frame, this,
+                                              newloc);
+    /* The namespace is taken from newloc; oldloc is not consulted here. */
+    if (ret == PATH_PARSE_RESULT_IS_GFID) {
+        GET_ANCESTRY_PATH_WIND(link, newloc->inode, oldloc, newloc, xdata);
+        return 0; /* fop parked in a stub; get_path_resume_cbk resumes it */
+    }
+wind: /* direct wind; also where the macro jumps when it cannot allocate */
+    STACK_WIND(frame, default_link_cbk, FIRST_CHILD(this),
+               FIRST_CHILD(this)->fops->link, oldloc, newloc, xdata);
+    return 0;
+}
+
+int32_t
+ns_mkdir(call_frame_t *frame, xlator_t *this, loc_t *loc, mode_t mode,
+         mode_t umask, dict_t *xdata)
+{
+    path_parse_result_t ret = set_ns_from_loc(__FUNCTION__, frame, this, loc);
+    /* Tag the frame with loc's namespace; IS_GFID means path still unknown. */
+    if (ret == PATH_PARSE_RESULT_IS_GFID) {
+        GET_ANCESTRY_PATH_WIND(mkdir, loc->inode, loc, mode, umask, xdata);
+        return 0; /* fop parked in a stub; get_path_resume_cbk resumes it */
+    }
+wind: /* direct wind; also where the macro jumps when it cannot allocate */
+    STACK_WIND(frame, default_mkdir_cbk, FIRST_CHILD(this),
+               FIRST_CHILD(this)->fops->mkdir, loc, mode, umask, xdata);
+    return 0;
+}
+
+int32_t
+ns_symlink(call_frame_t *frame, xlator_t *this, const char *linkname,
+           loc_t *loc, mode_t umask, dict_t *xdata)
+{
+    path_parse_result_t ret = set_ns_from_loc(__FUNCTION__, frame, this, loc);
+    /* Tag the frame with loc's namespace; linkname plays no part in it. */
+    if (ret == PATH_PARSE_RESULT_IS_GFID) {
+        GET_ANCESTRY_PATH_WIND(symlink, loc->inode, linkname, loc, umask,
+                               xdata);
+        return 0; /* fop parked in a stub; get_path_resume_cbk resumes it */
+    }
+wind: /* direct wind; also where the macro jumps when it cannot allocate */
+    STACK_WIND(frame, default_symlink_cbk, FIRST_CHILD(this),
+               FIRST_CHILD(this)->fops->symlink, linkname, loc, umask, xdata);
+    return 0;
+}
+
+int32_t
+ns_mknod(call_frame_t *frame, xlator_t *this, loc_t *loc, mode_t mode,
+         dev_t dev, mode_t umask, dict_t *xdata)
+{
+    path_parse_result_t ret = set_ns_from_loc(__FUNCTION__, frame, this, loc);
+    /* Tag the frame with loc's namespace; IS_GFID means path still unknown. */
+    if (ret == PATH_PARSE_RESULT_IS_GFID) {
+        GET_ANCESTRY_PATH_WIND(mknod, loc->inode, loc, mode, dev, umask, xdata);
+        return 0; /* fop parked in a stub; get_path_resume_cbk resumes it */
+    }
+wind: /* direct wind; also where the macro jumps when it cannot allocate */
+    STACK_WIND(frame, default_mknod_cbk, FIRST_CHILD(this),
+               FIRST_CHILD(this)->fops->mknod, loc, mode, dev, umask, xdata);
+    return 0;
+}
+
+int32_t
+ns_create(call_frame_t *frame, xlator_t *this, loc_t *loc, int32_t flags,
+          mode_t mode, mode_t umask, fd_t *fd, dict_t *xdata)
+{ /* create: the namespace is taken from loc, the fd is only passed along */
+    path_parse_result_t ret = set_ns_from_loc(__FUNCTION__, frame, this, loc);
+    if (ret == PATH_PARSE_RESULT_IS_GFID) { /* path still unknown */
+        GET_ANCESTRY_PATH_WIND(create, loc->inode, loc, flags, mode, umask, fd,
+                               xdata);
+        return 0; /* fop parked in a stub; get_path_resume_cbk resumes it */
+    }
+wind: /* direct wind; also where the macro jumps when it cannot allocate */
+    STACK_WIND(frame, default_create_cbk, FIRST_CHILD(this),
+               FIRST_CHILD(this)->fops->create, loc, flags, mode, umask, fd,
+               xdata);
+    return 0;
+}
+
+int32_t
+ns_fsetattr(call_frame_t *frame, xlator_t *this, fd_t *fd, struct iatt *stbuf,
+            int32_t valid, dict_t *xdata)
+{
+    path_parse_result_t ret = set_ns_from_fd(__FUNCTION__, frame, this, fd);
+    /* Tag the frame with the fd inode's namespace; IS_GFID: path unknown. */
+    if (ret == PATH_PARSE_RESULT_IS_GFID) {
+        GET_ANCESTRY_PATH_WIND(fsetattr, fd->inode, fd, stbuf, valid, xdata);
+        return 0; /* fop parked in a stub; get_path_resume_cbk resumes it */
+    }
+wind: /* direct wind; also where the macro jumps when it cannot allocate */
+    STACK_WIND(frame, default_fsetattr_cbk, FIRST_CHILD(this),
+               FIRST_CHILD(this)->fops->fsetattr, fd, stbuf, valid, xdata);
+    return 0;
+}
+
+int32_t
+ns_setattr(call_frame_t *frame, xlator_t *this, loc_t *loc, struct iatt *stbuf,
+           int32_t valid, dict_t *xdata)
+{
+    path_parse_result_t ret = set_ns_from_loc(__FUNCTION__, frame, this, loc);
+    /* Tag the frame with loc's namespace; IS_GFID means path still unknown. */
+    if (ret == PATH_PARSE_RESULT_IS_GFID) {
+        GET_ANCESTRY_PATH_WIND(setattr, loc->inode, loc, stbuf, valid, xdata);
+        return 0; /* fop parked in a stub; get_path_resume_cbk resumes it */
+    }
+wind: /* direct wind; also where the macro jumps when it cannot allocate */
+    STACK_WIND(frame, default_setattr_cbk, FIRST_CHILD(this),
+               FIRST_CHILD(this)->fops->setattr, loc, stbuf, valid, xdata);
+    return 0;
+}
+
+int32_t
+ns_fremovexattr(call_frame_t *frame, xlator_t *this, fd_t *fd, const char *name,
+                dict_t *xdata)
+{
+    path_parse_result_t ret = set_ns_from_fd(__FUNCTION__, frame, this, fd);
+    /* Tag the frame with the fd inode's namespace; IS_GFID: path unknown. */
+    if (ret == PATH_PARSE_RESULT_IS_GFID) {
+        GET_ANCESTRY_PATH_WIND(fremovexattr, fd->inode, fd, name, xdata);
+        return 0; /* fop parked in a stub; get_path_resume_cbk resumes it */
+    }
+wind: /* direct wind; also where the macro jumps when it cannot allocate */
+    STACK_WIND(frame, default_fremovexattr_cbk, FIRST_CHILD(this),
+               FIRST_CHILD(this)->fops->fremovexattr, fd, name, xdata);
+    return 0;
+}
+
+int32_t
+ns_removexattr(call_frame_t *frame, xlator_t *this, loc_t *loc,
+               const char *name, dict_t *xdata)
+{
+    path_parse_result_t ret = set_ns_from_loc(__FUNCTION__, frame, this, loc);
+    /* Tag the frame with loc's namespace; IS_GFID means path still unknown. */
+    if (ret == PATH_PARSE_RESULT_IS_GFID) {
+        GET_ANCESTRY_PATH_WIND(removexattr, loc->inode, loc, name, xdata);
+        return 0; /* fop parked in a stub; get_path_resume_cbk resumes it */
+    }
+wind: /* direct wind; also where the macro jumps when it cannot allocate */
+    STACK_WIND(frame, default_removexattr_cbk, FIRST_CHILD(this),
+               FIRST_CHILD(this)->fops->removexattr, loc, name, xdata);
+    return 0;
+}
+
+int32_t
+ns_setxattr(call_frame_t *frame, xlator_t *this, loc_t *loc, dict_t *dict,
+            int32_t flags, dict_t *xdata)
+{
+    path_parse_result_t ret = set_ns_from_loc(__FUNCTION__, frame, this, loc);
+    /* Tag the frame with loc's namespace; IS_GFID means path still unknown. */
+    if (ret == PATH_PARSE_RESULT_IS_GFID) {
+        GET_ANCESTRY_PATH_WIND(setxattr, loc->inode, loc, dict, flags, xdata);
+        return 0; /* fop parked in a stub; get_path_resume_cbk resumes it */
+    }
+wind: /* direct wind; also where the macro jumps when it cannot allocate */
+    STACK_WIND(frame, default_setxattr_cbk, FIRST_CHILD(this),
+               FIRST_CHILD(this)->fops->setxattr, loc, dict, flags, xdata);
+    return 0;
+}
+
+int32_t
+ns_fsetxattr(call_frame_t *frame, xlator_t *this, fd_t *fd, dict_t *dict,
+             int32_t flags, dict_t *xdata)
+{
+    path_parse_result_t ret = set_ns_from_fd(__FUNCTION__, frame, this, fd);
+    /* Tag the frame with the fd inode's namespace; IS_GFID: path unknown. */
+    if (ret == PATH_PARSE_RESULT_IS_GFID) {
+        GET_ANCESTRY_PATH_WIND(fsetxattr, fd->inode, fd, dict, flags, xdata);
+        return 0; /* fop parked in a stub; get_path_resume_cbk resumes it */
+    }
+wind: /* direct wind; also where the macro jumps when it cannot allocate */
+    STACK_WIND(frame, default_fsetxattr_cbk, FIRST_CHILD(this),
+               FIRST_CHILD(this)->fops->fsetxattr, fd, dict, flags, xdata);
+    return 0;
+}
+
+int32_t
+ns_truncate(call_frame_t *frame, xlator_t *this, loc_t *loc, off_t offset,
+            dict_t *xdata)
+{
+    path_parse_result_t ret = set_ns_from_loc(__FUNCTION__, frame, this, loc);
+    /* Tag the frame with loc's namespace; IS_GFID means path still unknown. */
+    if (ret == PATH_PARSE_RESULT_IS_GFID) {
+        GET_ANCESTRY_PATH_WIND(truncate, loc->inode, loc, offset, xdata);
+        return 0; /* fop parked in a stub; get_path_resume_cbk resumes it */
+    }
+wind: /* direct wind; also where the macro jumps when it cannot allocate */
+    STACK_WIND(frame, default_truncate_cbk, FIRST_CHILD(this),
+               FIRST_CHILD(this)->fops->truncate, loc, offset, xdata);
+    return 0;
+}
+
+int32_t
+ns_ftruncate(call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset,
+             dict_t *xdata)
+{
+    path_parse_result_t ret = set_ns_from_fd(__FUNCTION__, frame, this, fd);
+    /* Tag the frame with the fd inode's namespace; IS_GFID: path unknown. */
+    if (ret == PATH_PARSE_RESULT_IS_GFID) {
+        GET_ANCESTRY_PATH_WIND(ftruncate, fd->inode, fd, offset, xdata);
+        return 0; /* fop parked in a stub; get_path_resume_cbk resumes it */
+    }
+wind: /* direct wind; also where the macro jumps when it cannot allocate */
+    STACK_WIND(frame, default_ftruncate_cbk, FIRST_CHILD(this),
+               FIRST_CHILD(this)->fops->ftruncate, fd, offset, xdata);
+    return 0;
+}
+
+int32_t
+ns_writev(call_frame_t *frame, xlator_t *this, fd_t *fd, struct iovec *vector,
+          int32_t count, off_t offset, uint32_t flags, struct iobref *iobref,
+          dict_t *xdata)
+{
+    path_parse_result_t ret = set_ns_from_fd(__FUNCTION__, frame, this, fd);
+    /* Tag the frame with the fd inode's namespace; IS_GFID: path unknown. */
+    if (ret == PATH_PARSE_RESULT_IS_GFID) {
+        GET_ANCESTRY_PATH_WIND(writev, fd->inode, fd, vector, count, offset,
+                               flags, iobref, xdata);
+        return 0; /* fop parked in a stub; get_path_resume_cbk resumes it */
+    }
+wind: /* direct wind; also where the macro jumps when it cannot allocate */
+    STACK_WIND(frame, default_writev_cbk, FIRST_CHILD(this),
+               FIRST_CHILD(this)->fops->writev, fd, vector, count, offset,
+               flags, iobref, xdata);
+    return 0;
+}
+
+int32_t
+ns_lookup(call_frame_t *frame, xlator_t *this, loc_t *loc, dict_t *xdata)
+{
+    path_parse_result_t ret = set_ns_from_loc(__FUNCTION__, frame, this, loc);
+    /* Tag the frame with loc's namespace; IS_GFID means path still unknown. */
+    if (ret == PATH_PARSE_RESULT_IS_GFID) {
+        GET_ANCESTRY_PATH_WIND(lookup, loc->inode, loc, xdata);
+        return 0; /* fop parked in a stub; get_path_resume_cbk resumes it */
+    }
+wind: /* direct wind; also where the macro jumps when it cannot allocate */
+    STACK_WIND(frame, default_lookup_cbk, FIRST_CHILD(this),
+               FIRST_CHILD(this)->fops->lookup, loc, xdata);
+    return 0;
+}
+
+int32_t
+ns_stat(call_frame_t *frame, xlator_t *this, loc_t *loc, dict_t *xdata)
+{
+    path_parse_result_t ret = set_ns_from_loc(__FUNCTION__, frame, this, loc);
+    /* Tag the frame with loc's namespace; IS_GFID means path still unknown. */
+    if (ret == PATH_PARSE_RESULT_IS_GFID) {
+        GET_ANCESTRY_PATH_WIND(stat, loc->inode, loc, xdata);
+        return 0; /* fop parked in a stub; get_path_resume_cbk resumes it */
+    }
+wind: /* direct wind; also where the macro jumps when it cannot allocate */
+    STACK_WIND(frame, default_stat_cbk, FIRST_CHILD(this),
+               FIRST_CHILD(this)->fops->stat, loc, xdata);
+    return 0;
+}
+
+int32_t
+ns_fstat(call_frame_t *frame, xlator_t *this, fd_t *fd, dict_t *xdata)
+{
+    path_parse_result_t ret = set_ns_from_fd(__FUNCTION__, frame, this, fd);
+    /* Tag the frame with the fd inode's namespace; IS_GFID: path unknown. */
+    if (ret == PATH_PARSE_RESULT_IS_GFID) {
+        GET_ANCESTRY_PATH_WIND(fstat, fd->inode, fd, xdata);
+        return 0; /* fop parked in a stub; get_path_resume_cbk resumes it */
+    }
+wind: /* direct wind; also where the macro jumps when it cannot allocate */
+    STACK_WIND(frame, default_fstat_cbk, FIRST_CHILD(this),
+               FIRST_CHILD(this)->fops->fstat, fd, xdata);
+    return 0;
+}
+
+int32_t
+ns_readlink(call_frame_t *frame, xlator_t *this, loc_t *loc, size_t size,
+            dict_t *xdata)
+{
+    path_parse_result_t ret = set_ns_from_loc(__FUNCTION__, frame, this, loc);
+    /* Tag the frame with loc's namespace; IS_GFID means path still unknown. */
+    if (ret == PATH_PARSE_RESULT_IS_GFID) {
+        GET_ANCESTRY_PATH_WIND(readlink, loc->inode, loc, size, xdata);
+        return 0; /* fop parked in a stub; get_path_resume_cbk resumes it */
+    }
+wind: /* direct wind; also where the macro jumps when it cannot allocate */
+    STACK_WIND(frame, default_readlink_cbk, FIRST_CHILD(this),
+               FIRST_CHILD(this)->fops->readlink, loc, size, xdata);
+    return 0;
+}
+
+int32_t
+ns_access(call_frame_t *frame, xlator_t *this, loc_t *loc, int32_t mask,
+          dict_t *xdata)
+{
+    path_parse_result_t ret = set_ns_from_loc(__FUNCTION__, frame, this, loc);
+    /* Tag the frame with loc's namespace; IS_GFID means path still unknown. */
+    if (ret == PATH_PARSE_RESULT_IS_GFID) {
+        GET_ANCESTRY_PATH_WIND(access, loc->inode, loc, mask, xdata);
+        return 0; /* fop parked in a stub; get_path_resume_cbk resumes it */
+    }
+wind: /* direct wind; also where the macro jumps when it cannot allocate */
+    STACK_WIND(frame, default_access_cbk, FIRST_CHILD(this),
+               FIRST_CHILD(this)->fops->access, loc, mask, xdata);
+    return 0;
+}
+
+int32_t
+ns_open(call_frame_t *frame, xlator_t *this, loc_t *loc, int32_t flags,
+        fd_t *fd, dict_t *xdata)
+{
+    path_parse_result_t ret = set_ns_from_fd(__FUNCTION__, frame, this, fd);
+    /* NOTE(review): parses fd, not loc (ns_opendir uses loc) - confirm. */
+    if (ret == PATH_PARSE_RESULT_IS_GFID) {
+        GET_ANCESTRY_PATH_WIND(open, fd->inode, loc, flags, fd, xdata);
+        return 0;
+    }
+wind: /* no goto in this body; presumably targeted by the macro above */
+    STACK_WIND(frame, default_open_cbk, FIRST_CHILD(this),
+               FIRST_CHILD(this)->fops->open, loc, flags, fd, xdata);
+    return 0;
+}
+
+int32_t
+ns_readv(call_frame_t *frame, xlator_t *this, fd_t *fd, size_t size,
+         off_t offset, uint32_t flags, dict_t *xdata)
+{
+    path_parse_result_t ret = set_ns_from_fd(__FUNCTION__, frame, this, fd);
+    /* readv: IS_GFID takes the ancestry-path macro branch, else wind. */
+    if (ret == PATH_PARSE_RESULT_IS_GFID) {
+        GET_ANCESTRY_PATH_WIND(readv, fd->inode, fd, size, offset, flags,
+                               xdata);
+        return 0;
+    }
+wind: /* no goto in this body; presumably targeted by the macro above */
+    STACK_WIND(frame, default_readv_cbk, FIRST_CHILD(this),
+               FIRST_CHILD(this)->fops->readv, fd, size, offset, flags, xdata);
+    return 0;
+}
+
+int32_t
+ns_flush(call_frame_t *frame, xlator_t *this, fd_t *fd, dict_t *xdata)
+{
+    path_parse_result_t ret = set_ns_from_fd(__FUNCTION__, frame, this, fd);
+    /* flush: IS_GFID takes the ancestry-path macro branch, else wind. */
+    if (ret == PATH_PARSE_RESULT_IS_GFID) {
+        GET_ANCESTRY_PATH_WIND(flush, fd->inode, fd, xdata);
+        return 0;
+    }
+wind: /* no goto in this body; presumably targeted by the macro above */
+    STACK_WIND(frame, default_flush_cbk, FIRST_CHILD(this),
+               FIRST_CHILD(this)->fops->flush, fd, xdata);
+    return 0;
+}
+
+int32_t
+ns_fsync(call_frame_t *frame, xlator_t *this, fd_t *fd, int32_t datasync,
+         dict_t *xdata)
+{
+    path_parse_result_t ret = set_ns_from_fd(__FUNCTION__, frame, this, fd);
+    /* fsync: IS_GFID takes the ancestry-path macro branch, else wind. */
+    if (ret == PATH_PARSE_RESULT_IS_GFID) {
+        GET_ANCESTRY_PATH_WIND(fsync, fd->inode, fd, datasync, xdata);
+        return 0;
+    }
+wind: /* no goto in this body; presumably targeted by the macro above */
+    STACK_WIND(frame, default_fsync_cbk, FIRST_CHILD(this),
+               FIRST_CHILD(this)->fops->fsync, fd, datasync, xdata);
+    return 0;
+}
+
+int32_t
+ns_opendir(call_frame_t *frame, xlator_t *this, loc_t *loc, fd_t *fd,
+           dict_t *xdata)
+{
+    path_parse_result_t ret = set_ns_from_loc(__FUNCTION__, frame, this, loc);
+    /* opendir: IS_GFID takes the ancestry-path macro branch, else wind. */
+    if (ret == PATH_PARSE_RESULT_IS_GFID) {
+        GET_ANCESTRY_PATH_WIND(opendir, loc->inode, loc, fd, xdata);
+        return 0;
+    }
+wind: /* no goto in this body; presumably targeted by the macro above */
+    STACK_WIND(frame, default_opendir_cbk, FIRST_CHILD(this),
+               FIRST_CHILD(this)->fops->opendir, loc, fd, xdata);
+    return 0;
+}
+
+int32_t
+ns_fsyncdir(call_frame_t *frame, xlator_t *this, fd_t *fd, int32_t datasync,
+            dict_t *xdata)
+
+{
+    path_parse_result_t ret = set_ns_from_fd(__FUNCTION__, frame, this, fd);
+    /* fsyncdir: IS_GFID takes the ancestry-path macro branch, else wind. */
+    if (ret == PATH_PARSE_RESULT_IS_GFID) {
+        GET_ANCESTRY_PATH_WIND(fsyncdir, fd->inode, fd, datasync, xdata);
+        return 0;
+    }
+wind: /* no goto in this body; presumably targeted by the macro above */
+    STACK_WIND(frame, default_fsyncdir_cbk, FIRST_CHILD(this),
+               FIRST_CHILD(this)->fops->fsyncdir, fd, datasync, xdata);
+    return 0;
+}
+
+int32_t
+ns_rchecksum(call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset,
+             int32_t len, dict_t *xdata)
+{
+    path_parse_result_t ret = set_ns_from_fd(__FUNCTION__, frame, this, fd);
+    /* rchecksum: IS_GFID takes the ancestry-path macro branch, else wind. */
+    if (ret == PATH_PARSE_RESULT_IS_GFID) {
+        GET_ANCESTRY_PATH_WIND(rchecksum, fd->inode, fd, offset, len, xdata);
+        return 0;
+    }
+wind: /* no goto in this body; presumably targeted by the macro above */
+    STACK_WIND(frame, default_rchecksum_cbk, FIRST_CHILD(this),
+               FIRST_CHILD(this)->fops->rchecksum, fd, offset, len, xdata);
+    return 0;
+}
+
+int32_t
+ns_statfs(call_frame_t *frame, xlator_t *this, loc_t *loc, dict_t *xdata)
+{
+    path_parse_result_t ret = set_ns_from_loc(__FUNCTION__, frame, this, loc);
+    /* statfs: IS_GFID takes the ancestry-path macro branch, else wind. */
+    if (ret == PATH_PARSE_RESULT_IS_GFID) {
+        GET_ANCESTRY_PATH_WIND(statfs, loc->inode, loc, xdata);
+        return 0;
+    }
+wind: /* no goto in this body; presumably targeted by the macro above */
+    STACK_WIND(frame, default_statfs_cbk, FIRST_CHILD(this),
+               FIRST_CHILD(this)->fops->statfs, loc, xdata);
+    return 0;
+}
+
+int32_t
+ns_inodelk(call_frame_t *frame, xlator_t *this, const char *volume, loc_t *loc,
+           int32_t cmd, struct gf_flock *flock, dict_t *xdata)
+{
+    path_parse_result_t ret = set_ns_from_loc(__FUNCTION__, frame, this, loc);
+    /* inodelk: IS_GFID takes the ancestry-path macro branch, else wind. */
+    if (ret == PATH_PARSE_RESULT_IS_GFID) {
+        GET_ANCESTRY_PATH_WIND(inodelk, loc->inode, volume, loc, cmd, flock,
+                               xdata);
+        return 0;
+    }
+wind: /* no goto in this body; presumably targeted by the macro above */
+    STACK_WIND(frame, default_inodelk_cbk, FIRST_CHILD(this),
+               FIRST_CHILD(this)->fops->inodelk, volume, loc, cmd, flock,
+               xdata);
+    return 0;
+}
+
+int32_t
+ns_finodelk(call_frame_t *frame, xlator_t *this, const char *volume, fd_t *fd,
+            int32_t cmd, struct gf_flock *flock, dict_t *xdata)
+{
+    path_parse_result_t ret = set_ns_from_fd(__FUNCTION__, frame, this, fd);
+    /* finodelk: IS_GFID takes the ancestry-path macro branch, else wind. */
+    if (ret == PATH_PARSE_RESULT_IS_GFID) {
+        GET_ANCESTRY_PATH_WIND(finodelk, fd->inode, volume, fd, cmd, flock,
+                               xdata);
+        return 0;
+    }
+wind: /* no goto in this body; presumably targeted by the macro above */
+    STACK_WIND(frame, default_finodelk_cbk, FIRST_CHILD(this),
+               FIRST_CHILD(this)->fops->finodelk, volume, fd, cmd, flock,
+               xdata);
+    return 0;
+}
+
+int32_t
+ns_entrylk(call_frame_t *frame, xlator_t *this, const char *volume, loc_t *loc,
+           const char *basename, entrylk_cmd cmd, entrylk_type type,
+           dict_t *xdata)
+{
+    path_parse_result_t ret = set_ns_from_loc(__FUNCTION__, frame, this, loc);
+    /* entrylk: IS_GFID takes the ancestry-path macro branch, else wind. */
+    if (ret == PATH_PARSE_RESULT_IS_GFID) {
+        GET_ANCESTRY_PATH_WIND(entrylk, loc->inode, volume, loc, basename, cmd,
+                               type, xdata);
+        return 0;
+    }
+wind: /* no goto in this body; presumably targeted by the macro above */
+    STACK_WIND(frame, default_entrylk_cbk, FIRST_CHILD(this),
+               FIRST_CHILD(this)->fops->entrylk, volume, loc, basename, cmd,
+               type, xdata);
+    return 0;
+}
+
+int32_t
+ns_fentrylk(call_frame_t *frame, xlator_t *this, const char *volume, fd_t *fd,
+            const char *basename, entrylk_cmd cmd, entrylk_type type,
+            dict_t *xdata)
+{
+    path_parse_result_t ret = set_ns_from_fd(__FUNCTION__, frame, this, fd);
+    /* fentrylk: IS_GFID takes the ancestry-path macro branch, else wind. */
+    if (ret == PATH_PARSE_RESULT_IS_GFID) {
+        GET_ANCESTRY_PATH_WIND(fentrylk, fd->inode, volume, fd, basename, cmd,
+                               type, xdata);
+        return 0;
+    }
+wind: /* no goto in this body; presumably targeted by the macro above */
+    STACK_WIND(frame, default_fentrylk_cbk, FIRST_CHILD(this),
+               FIRST_CHILD(this)->fops->fentrylk, volume, fd, basename, cmd,
+               type, xdata);
+    return 0;
+}
+
+int32_t
+ns_fgetxattr(call_frame_t *frame, xlator_t *this, fd_t *fd, const char *name,
+             dict_t *xdata)
+{
+    path_parse_result_t ret = set_ns_from_fd(__FUNCTION__, frame, this, fd);
+    /* fgetxattr: IS_GFID takes the ancestry-path macro branch, else wind. */
+    if (ret == PATH_PARSE_RESULT_IS_GFID) {
+        GET_ANCESTRY_PATH_WIND(fgetxattr, fd->inode, fd, name, xdata);
+        return 0;
+    }
+wind: /* no goto in this body; presumably targeted by the macro above */
+    STACK_WIND(frame, default_fgetxattr_cbk, FIRST_CHILD(this),
+               FIRST_CHILD(this)->fops->fgetxattr, fd, name, xdata);
+    return 0;
+}
+
+int32_t
+ns_getxattr(call_frame_t *frame, xlator_t *this, loc_t *loc, const char *name,
+            dict_t *xdata)
+{
+    path_parse_result_t ret = set_ns_from_loc(__FUNCTION__, frame, this, loc);
+    /* getxattr: IS_GFID takes the ancestry-path macro branch, else wind. */
+    if (ret == PATH_PARSE_RESULT_IS_GFID) {
+        GET_ANCESTRY_PATH_WIND(getxattr, loc->inode, loc, name, xdata);
+        return 0;
+    }
+wind: /* no goto in this body; presumably targeted by the macro above */
+    STACK_WIND(frame, default_getxattr_cbk, FIRST_CHILD(this),
+               FIRST_CHILD(this)->fops->getxattr, loc, name, xdata);
+    return 0;
+}
+
+int32_t
+ns_lk(call_frame_t *frame, xlator_t *this, fd_t *fd, int32_t cmd,
+      struct gf_flock *flock, dict_t *xdata)
+{
+    path_parse_result_t ret = set_ns_from_fd(__FUNCTION__, frame, this, fd);
+    /* lk: IS_GFID takes the ancestry-path macro branch, else wind. */
+    if (ret == PATH_PARSE_RESULT_IS_GFID) {
+        GET_ANCESTRY_PATH_WIND(lk, fd->inode, fd, cmd, flock, xdata);
+        return 0;
+    }
+wind: /* no goto in this body; presumably targeted by the macro above */
+    STACK_WIND(frame, default_lk_cbk, FIRST_CHILD(this),
+               FIRST_CHILD(this)->fops->lk, fd, cmd, flock, xdata);
+    return 0;
+}
+
+int32_t
+ns_readdir(call_frame_t *frame, xlator_t *this, fd_t *fd, size_t size,
+           off_t offset, dict_t *xdata)
+{
+    path_parse_result_t ret = set_ns_from_fd(__FUNCTION__, frame, this, fd);
+    /* readdir: IS_GFID takes the ancestry-path macro branch, else wind. */
+    if (ret == PATH_PARSE_RESULT_IS_GFID) {
+        GET_ANCESTRY_PATH_WIND(readdir, fd->inode, fd, size, offset, xdata);
+        return 0;
+    }
+wind: /* no goto in this body; presumably targeted by the macro above */
+    STACK_WIND(frame, default_readdir_cbk, FIRST_CHILD(this),
+               FIRST_CHILD(this)->fops->readdir, fd, size, offset, xdata);
+
+    return 0;
+}
+
+int32_t
+ns_readdirp(call_frame_t *frame, xlator_t *this, fd_t *fd, size_t size,
+            off_t offset, dict_t *dict)
+{
+    path_parse_result_t ret = set_ns_from_fd(__FUNCTION__, frame, this, fd);
+    /* readdirp: IS_GFID takes the ancestry-path macro branch, else wind. */
+    if (ret == PATH_PARSE_RESULT_IS_GFID) {
+        GET_ANCESTRY_PATH_WIND(readdirp, fd->inode, fd, size, offset, dict);
+        return 0;
+    }
+wind: /* no goto in this body; presumably targeted by the macro above */
+    STACK_WIND(frame, default_readdirp_cbk, FIRST_CHILD(this),
+               FIRST_CHILD(this)->fops->readdirp, fd, size, offset, dict);
+    return 0;
+}
+
+int32_t
+ns_xattrop(call_frame_t *frame, xlator_t *this, loc_t *loc,
+           gf_xattrop_flags_t flags, dict_t *dict, dict_t *xdata)
+{
+    path_parse_result_t ret = set_ns_from_loc(__FUNCTION__, frame, this, loc);
+    /* xattrop: IS_GFID takes the ancestry-path macro branch, else wind. */
+    if (ret == PATH_PARSE_RESULT_IS_GFID) {
+        GET_ANCESTRY_PATH_WIND(xattrop, loc->inode, loc, flags, dict, xdata);
+        return 0;
+    }
+wind: /* no goto in this body; presumably targeted by the macro above */
+    STACK_WIND(frame, default_xattrop_cbk, FIRST_CHILD(this),
+               FIRST_CHILD(this)->fops->xattrop, loc, flags, dict, xdata);
+
+    return 0;
+}
+
+int32_t
+ns_fxattrop(call_frame_t *frame, xlator_t *this, fd_t *fd,
+            gf_xattrop_flags_t flags, dict_t *dict, dict_t *xdata)
+{
+    path_parse_result_t ret = set_ns_from_fd(__FUNCTION__, frame, this, fd);
+    /* fxattrop: IS_GFID takes the ancestry-path macro branch, else wind. */
+    if (ret == PATH_PARSE_RESULT_IS_GFID) {
+        GET_ANCESTRY_PATH_WIND(fxattrop, fd->inode, fd, flags, dict, xdata);
+        return 0;
+    }
+wind: /* no goto in this body; presumably targeted by the macro above */
+    STACK_WIND(frame, default_fxattrop_cbk, FIRST_CHILD(this),
+               FIRST_CHILD(this)->fops->fxattrop, fd, flags, dict, xdata);
+
+    return 0;
+}
+
+int32_t
+ns_getspec(call_frame_t *frame, xlator_t *this, const char *key, int32_t flag)
+{ /* getspec: wound to the first child as-is, no set_ns_* call here */
+    STACK_WIND(frame, default_getspec_cbk, FIRST_CHILD(this),
+               FIRST_CHILD(this)->fops->getspec, key, flag);
+    return 0;
+}
+
+int32_t
+ns_fallocate(call_frame_t *frame, xlator_t *this, fd_t *fd, int32_t keep_size,
+             off_t offset, size_t len, dict_t *xdata)
+{
+    path_parse_result_t ret = set_ns_from_fd(__FUNCTION__, frame, this, fd);
+    /* fallocate: IS_GFID takes the ancestry-path macro branch, else wind. */
+    if (ret == PATH_PARSE_RESULT_IS_GFID) {
+        GET_ANCESTRY_PATH_WIND(fallocate, fd->inode, fd, keep_size, offset, len,
+                               xdata);
+        return 0;
+    }
+wind: /* no goto in this body; presumably targeted by the macro above */
+    STACK_WIND(frame, default_fallocate_cbk, FIRST_CHILD(this),
+               FIRST_CHILD(this)->fops->fallocate, fd, keep_size, offset, len,
+               xdata);
+    return 0;
+}
+
+int32_t
+ns_discard(call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset,
+           size_t len, dict_t *xdata)
+{
+    path_parse_result_t ret = set_ns_from_fd(__FUNCTION__, frame, this, fd);
+    /* discard: IS_GFID takes the ancestry-path macro branch, else wind. */
+    if (ret == PATH_PARSE_RESULT_IS_GFID) {
+        GET_ANCESTRY_PATH_WIND(discard, fd->inode, fd, offset, len, xdata);
+        return 0;
+    }
+wind: /* no goto in this body; presumably targeted by the macro above */
+    STACK_WIND(frame, default_discard_cbk, FIRST_CHILD(this),
+               FIRST_CHILD(this)->fops->discard, fd, offset, len, xdata);
+    return 0;
+}
+
+int32_t
+ns_zerofill(call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset,
+            off_t len, dict_t *xdata)
+{
+    path_parse_result_t ret = set_ns_from_fd(__FUNCTION__, frame, this, fd);
+    /* zerofill: IS_GFID takes the ancestry-path macro branch, else wind. */
+    if (ret == PATH_PARSE_RESULT_IS_GFID) {
+        GET_ANCESTRY_PATH_WIND(zerofill, fd->inode, fd, offset, len, xdata);
+        return 0;
+    }
+wind: /* no goto in this body; presumably targeted by the macro above */
+    STACK_WIND(frame, default_zerofill_cbk, FIRST_CHILD(this),
+               FIRST_CHILD(this)->fops->zerofill, fd, offset, len, xdata);
+    return 0;
+}
+
+int
+ns_forget(xlator_t *this, inode_t *inode)
+{
+    uint64_t ns_as_64 = 0;
+    ns_info_t *info = NULL;
+    /* Drop this xlator's ctx entry on the inode and free what it held. */
+    inode_ctx_del(inode, this, &ns_as_64);
+
+    if (!ns_as_64) {
+        return 0;
+    }
+    /* The ctx value is an ns_info_t pointer carried as a 64-bit integer. */
+    info = (ns_info_t *)(uintptr_t)ns_as_64;
+    GF_FREE(info);
+
+    return 0;
+}
+
+int32_t
+init(xlator_t *this)
+{
+    int32_t ret = -1;
+    ns_private_t *priv = NULL;
+    /* Check the graph wiring, allocate priv, then load "tag-namespaces". */
+    GF_VALIDATE_OR_GOTO(GF_NAMESPACE, this, out);
+
+    if (!this->children || this->children->next) {
+        gf_log(this->name, GF_LOG_ERROR,
+               "translator needs a single subvolume.");
+        goto out;
+    }
+
+    if (!this->parents) {
+        gf_log(this->name, GF_LOG_ERROR,
+               "dangling volume. please check volfile.");
+        goto out;
+    }
+
+    priv = GF_CALLOC(1, sizeof(ns_private_t), 0);
+
+    if (!priv) {
+        gf_log(this->name, GF_LOG_ERROR, "Can't allocate ns_priv structure.");
+        goto out;
+    }
+    /* `out` is presumably the label GF_OPTION_INIT jumps to on error. */
+    GF_OPTION_INIT("tag-namespaces", priv->tag_namespaces, bool, out);
+
+    gf_log(this->name, GF_LOG_INFO, "Namespace xlator loaded");
+    this->private = priv;
+    ret = 0;
+    /* ret is still -1 on every early exit, so GF_FREE(priv) runs below. */
+out:
+    if (ret) {
+        GF_FREE(priv);
+    }
+
+    return ret;
+}
+
+void
+fini(xlator_t *this)
+{
+    GF_FREE(this->private); /* the ns_private_t that init() stored here */
+}
+
+int
+reconfigure(xlator_t *this, dict_t *options)
+{
+    int ret = -1;
+    ns_private_t *priv = NULL;
+    /* Re-read "tag-namespaces" from `options` into the private state. */
+    GF_VALIDATE_OR_GOTO(this->name, this->private, out);
+    GF_VALIDATE_OR_GOTO(this->name, options, out);
+
+    priv = (ns_private_t *)this->private;
+
+    GF_OPTION_RECONF("tag-namespaces", priv->tag_namespaces, options, bool,
+                     out);
+
+    ret = 0; /* only reached when nothing above jumped to out */
+out:
+    return ret;
+}
+
+struct xlator_fops fops = { /* fop table referenced by xlator_api.fops */
+    .lookup = ns_lookup,
+    .stat = ns_stat,
+    .fstat = ns_fstat,
+    .truncate = ns_truncate,
+    .ftruncate = ns_ftruncate,
+    .access = ns_access,
+    .readlink = ns_readlink,
+    .mknod = ns_mknod,
+    .mkdir = ns_mkdir,
+    .unlink = ns_unlink,
+    .rmdir = ns_rmdir,
+    .symlink = ns_symlink,
+    .rename = ns_rename,
+    .link = ns_link,
+    .create = ns_create,
+    .open = ns_open,
+    .readv = ns_readv,
+    .writev = ns_writev,
+    .flush = ns_flush,
+    .fsync = ns_fsync,
+    .opendir = ns_opendir,
+    .readdir = ns_readdir,
+    .readdirp = ns_readdirp,
+    .fsyncdir = ns_fsyncdir,
+    .statfs = ns_statfs,
+    .setxattr = ns_setxattr,
+    .getxattr = ns_getxattr,
+    .fsetxattr = ns_fsetxattr,
+    .fgetxattr = ns_fgetxattr,
+    .removexattr = ns_removexattr,
+    .fremovexattr = ns_fremovexattr,
+    .lk = ns_lk,
+    .inodelk = ns_inodelk,
+    .finodelk = ns_finodelk,
+    .entrylk = ns_entrylk,
+    .fentrylk = ns_fentrylk,
+    .rchecksum = ns_rchecksum,
+    .xattrop = ns_xattrop,
+    .fxattrop = ns_fxattrop,
+    .setattr = ns_setattr,
+    .fsetattr = ns_fsetattr,
+    .getspec = ns_getspec, /* plain pass-through, no namespace parsing */
+    .fallocate = ns_fallocate,
+    .discard = ns_discard,
+    .zerofill = ns_zerofill,
+};
+
+struct xlator_cbks cbks = {
+    .forget = ns_forget, /* frees the ns_info_t kept in the inode ctx */
+};
+
+struct xlator_dumpops dumpops; /* no initializer: every member stays zero */
+
+struct volume_options options[] = {
+    {
+        .key = {"tag-namespaces"}, /* same key init()/reconfigure() read */
+        .type = GF_OPTION_TYPE_BOOL,
+        .default_value = "off",
+        .description = "This option enables this translator's functionality "
+                       "that tags every fop with a namespace hash for later "
+                       "throttling, stats collection, logging, etc.",
+        .op_version = {GD_OP_VERSION_4_1_0},
+        .tags = {"namespace"},
+        .flags = OPT_FLAG_SETTABLE | OPT_FLAG_CLIENT_OPT | OPT_FLAG_DOC,
+    },
+    {.key = {NULL}}, /* presumably the end-of-table sentinel - confirm */
+};
+
+xlator_api_t xlator_api = { /* ties together the hooks and tables above */
+    .init = init,
+    .fini = fini,
+    .reconfigure = reconfigure,
+    .op_version = {GD_OP_VERSION_3_12_0},
+    .dumpops = &dumpops, /* the uninitialized (all-zero) dumpops object */
+    .fops = &fops,
+    .cbks = &cbks,
+    .options = options,
+    .identifier = "namespace",
+    .category = GF_TECH_PREVIEW,
+};
diff --git a/xlators/features/namespace/src/namespace.h b/xlators/features/namespace/src/namespace.h
new file mode 100644
index 00000000000..3a9b84d6426
--- /dev/null
+++ b/xlators/features/namespace/src/namespace.h
@@ -0,0 +1,23 @@
+#ifndef __NAMESPACE_H__
+#define __NAMESPACE_H__
+
+#ifndef _CONFIG_H
+#define _CONFIG_H
+#include "config.h"
+#endif
+
+#include <glusterfs/xlator.h>
+#include <glusterfs/call-stub.h>
+
+#define GF_NAMESPACE "namespace"
+
+typedef struct { /* per-xlator state kept in this->private */
+    gf_boolean_t tag_namespaces; /* value of the "tag-namespaces" option */
+} ns_private_t;
+
+typedef struct { /* NOTE(review): presumably a frame's local data - confirm */
+    loc_t loc;         /* We store a "fake" loc_t for the getxattr wind. */
+    call_stub_t *stub; /* A stub back to the function we're resuming. */
+} ns_local_t;
+
+#endif /* __NAMESPACE_H__ */
diff --git a/xlators/features/path-convertor/src/Makefile.am b/xlators/features/path-convertor/src/Makefile.am
deleted file mode 100644
index 1fde1935238..00000000000
--- a/xlators/features/path-convertor/src/Makefile.am
+++ /dev/null
@@ -1,14 +0,0 @@
-
-xlator_LTLIBRARIES = path-converter.la
-xlatordir = $(libdir)/glusterfs/$(PACKAGE_VERSION)/xlator/features
-
-path_converter_la_LDFLAGS = -module -avoidversion 
-
-path_converter_la_SOURCES = path.c
-path_converter_la_LIBADD = $(top_builddir)/libglusterfs/src/libglusterfs.la 
-
-AM_CFLAGS = -fPIC -D_FILE_OFFSET_BITS=64 -D_GNU_SOURCE -Wall -D$(GF_HOST_OS)\
-	-I$(top_srcdir)/libglusterfs/src -shared -nostartfiles $(GF_CFLAGS)
-
-CLEANFILES = 
-
diff --git a/xlators/features/path-convertor/src/path.c b/xlators/features/path-convertor/src/path.c
deleted file mode 100644
index 293f34177a5..00000000000
--- a/xlators/features/path-convertor/src/path.c
+++ /dev/null
@@ -1,1217 +0,0 @@
-/*
-  Copyright (c) 2008-2009 Z RESEARCH, Inc. <http://www.zresearch.com>
-  This file is part of GlusterFS.
-
-  GlusterFS is free software; you can redistribute it and/or modify
-  it under the terms of the GNU General Public License as published
-  by the Free Software Foundation; either version 3 of the License,
-  or (at your option) any later version.
-
-  GlusterFS is distributed in the hope that it will be useful, but
-  WITHOUT ANY WARRANTY; without even the implied warranty of
-  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
-  General Public License for more details.
-
-  You should have received a copy of the GNU General Public License
-  along with this program.  If not, see
-  <http://www.gnu.org/licenses/>.
-*/
-
-/* TODO: add gf_log to all the cases returning errors */
-
-#ifndef _CONFIG_H
-#define _CONFIG_H
-#include "config.h"
-#endif
-
-/**
- * xlators/features/path-translator:
- *    This translator converts the path it gets into user specified targets.
- */
-
-#include <sys/types.h>
-#include <regex.h>
-#include <time.h>
-#include <errno.h>
-#include "glusterfs.h"
-#include "xlator.h"
-
-typedef struct path_private
-{
-	int32_t this_len;
-	int32_t start_off;
-	int32_t end_off;
-	char *this;
-	char *that;
-	char *path;
-	regex_t *preg;
-} path_private_t;
-
-static char *
-name_this_to_that (xlator_t *xl, const char *path, const char *name)
-{
-	path_private_t *priv = xl->private;
-	char priv_path[ZR_PATH_MAX] = {0,};
-	char *tmp_name = NULL;
-	int32_t path_len = strlen (path);
-	int32_t name_len = strlen (name) - ZR_FILE_CONTENT_STRLEN;
-	int32_t total_len = path_len + name_len;
-	int32_t i = 0, j = 0;
-
-	if (path_len >= priv->end_off)
-		return (char *)name;
-
-	if (priv->end_off && (total_len > priv->end_off)) {
-		j = priv->start_off;
-		tmp_name = CALLOC (1, (total_len + ZR_FILE_CONTENT_STRLEN));
-		ERR_ABORT (tmp_name);
-
-		/* Get the complete path for the file first */
-		strcpy (tmp_name, path);
-		strcat (tmp_name, name + ZR_FILE_CONTENT_STRLEN);
-
-		strncpy (priv_path, tmp_name, priv->start_off);
-		for (i = priv->start_off; i < priv->end_off; i++) {
-			if (tmp_name[i] == '/')
-				continue;
-			priv_path[j++] = tmp_name[i];
-		}
-		memcpy ((priv_path + j), 
-			(tmp_name + priv->end_off), 
-			(total_len - priv->end_off));
-		priv_path[(total_len - (priv->end_off - j))] = '\0';
-
-		strcpy (tmp_name, ZR_FILE_CONTENT_STR);
-		strcat (tmp_name, priv_path);
-
-		return tmp_name;
-	}
-
-	return (char *)name;
-}
-
-/* This function should return 
- *  NULL - 
- *  converted path - if path match
- *  same path - if it doesn't match
- */
-static char *
-path_this_to_that (xlator_t *xl, const char *path)
-{
-	path_private_t *priv = xl->private;
-	char *priv_path = NULL;
-	int32_t path_len = strlen (path);
-	int32_t i = 0, j = 0;
-
-	if (priv->end_off && (path_len > priv->start_off)) {
-		priv_path = CALLOC (1, path_len);
-		ERR_ABORT (priv_path);
-
-		if (priv->start_off && (path_len > priv->start_off))
-			memcpy (priv_path, path, priv->start_off);
-		if (path_len > priv->end_off) {
-			j = priv->start_off;
-			for (i = priv->start_off; i < priv->end_off; i++) {
-				if (path[i] == '/')
-					continue;
-				priv_path[j++] = path[i];
-			}
-			memcpy ((priv_path + j), 
-				(path + priv->end_off), 
-				(path_len - priv->end_off));
-			priv_path[(path_len - (priv->end_off - j))] = '\0';
-		}
-		return priv_path;
-	}
-	return (char *)path;
-}
-
-int32_t 
-path_create_cbk (call_frame_t *frame,
-		 void *cookie,
-		 xlator_t *this,
-		 int32_t op_ret,
-		 int32_t op_errno,
-		 fd_t *fd,
-		 inode_t *inode,
-		 struct stat *buf)
-{
-	STACK_UNWIND (frame, op_ret, op_errno, fd, inode, buf);
-	return 0;
-}
-
-int32_t 
-path_open_cbk (call_frame_t *frame,
-	       void *cookie,
-	       xlator_t *this,
-	       int32_t op_ret,
-	       int32_t op_errno,
-	       fd_t *fd)
-{
-	STACK_UNWIND (frame, op_ret, op_errno, fd);
-	return 0;
-}
-
-int32_t 
-path_getdents_cbk (call_frame_t *frame,
-		   void *cookie,
-		   xlator_t *this,
-		   int32_t op_ret,
-		   int32_t op_errno,
-		   dir_entry_t *entries,
-		   int32_t count)
-{
-	STACK_UNWIND (frame, op_ret, op_errno, entries, count);
-	return 0;
-}
-
-int32_t 
-path_readdir_cbk (call_frame_t *frame,
-		  void *cookie,
-		  xlator_t *this,
-		  int32_t op_ret,
-		  int32_t op_errno,
-		  gf_dirent_t *buf)
-{
-	STACK_UNWIND (frame, op_ret, op_errno, buf);
-	return 0;
-}
-
-
-int32_t 
-path_readlink_cbk (call_frame_t *frame,
-		   void *cookie,
-		   xlator_t *this,
-		   int32_t op_ret,
-		   int32_t op_errno,
-		   const char *buf)
-{
-	STACK_UNWIND (frame, op_ret, op_errno, buf);
-	return 0;
-}
-
-int32_t 
-path_lookup_cbk (call_frame_t *frame,
-		 void *cookie,
-		 xlator_t *this,
-		 int32_t op_ret,
-		 int32_t op_errno,
-		 inode_t *inode,
-		 struct stat *buf,
-		 dict_t *xattr)
-{
-	STACK_UNWIND (frame, op_ret, op_errno, inode, buf, xattr);
-	return 0;
-}
-
-
-int32_t 
-path_symlink_cbk (call_frame_t *frame,
-		  void *cookie,
-		  xlator_t *this,
-		  int32_t op_ret,
-		  int32_t op_errno,
-		  inode_t *inode,
-		  struct stat *buf)
-{
-	STACK_UNWIND (frame, op_ret, op_errno, inode, buf);
-	return 0;
-}
-
-int32_t 
-path_mknod_cbk (call_frame_t *frame,
-		void *cookie,
-		xlator_t *this,
-		int32_t op_ret,
-		int32_t op_errno,
-		inode_t *inode,
-		struct stat *buf)
-{
-	STACK_UNWIND (frame, op_ret, op_errno, inode, buf);
-	return 0;
-}
-  
-
-int32_t 
-path_mkdir_cbk (call_frame_t *frame,
-		void *cookie,
-		xlator_t *this,
-		int32_t op_ret,
-		int32_t op_errno,
-		inode_t *inode,
-		struct stat *buf)
-{
-	STACK_UNWIND (frame, op_ret, op_errno, inode, buf);
-	return 0;
-}
-  
-int32_t 
-path_link_cbk (call_frame_t *frame,
-	       void *cookie,
-	       xlator_t *this,
-	       int32_t op_ret,
-	       int32_t op_errno,
-	       inode_t *inode,
-	       struct stat *buf)
-{
-	STACK_UNWIND (frame, op_ret, op_errno, inode, buf);
-	return 0;
-}
-
-int32_t 
-path_opendir_cbk (call_frame_t *frame,
-		  void *cookie,
-		  xlator_t *this,
-		  int32_t op_ret,
-		  int32_t op_errno,
-		  fd_t *fd)
-{
-	STACK_UNWIND (frame, op_ret, op_errno, fd);
-	return 0;
-}
-
-
-int32_t 
-path_common_buf_cbk (call_frame_t *frame,
-		     void *cookie,
-		     xlator_t *this,
-		     int32_t op_ret,
-		     int32_t op_errno,
-		     struct stat *buf)
-{
-	STACK_UNWIND (frame, op_ret, op_errno, buf);
-	return 0;
-}
-
-int32_t 
-path_common_dict_cbk (call_frame_t *frame,
-		      void *cookie,
-		      xlator_t *this,
-		      int32_t op_ret,
-		      int32_t op_errno,
-		      dict_t *dict)
-{
-	STACK_UNWIND (frame, op_ret, op_errno, dict);
-	return 0;
-}
-
-int32_t 
-path_common_cbk (call_frame_t *frame,
-		 void *cookie,
-		 xlator_t *this,
-		 int32_t op_ret,
-		 int32_t op_errno)
-{
-	STACK_UNWIND (frame, op_ret, op_errno);
-	return 0;
-}
-
-/* */
-int32_t 
-path_lookup (call_frame_t *frame,
-	     xlator_t *this,
-	     loc_t *loc,
-	     dict_t *xattr_req)
-{
-	char *loc_path = (char *)loc->path;
-	char *tmp_path = NULL;
-	
-	if (!(tmp_path = path_this_to_that (this, loc->path))) {
-		STACK_UNWIND (frame, -1, ENOENT, NULL, NULL);
-		return 0;
-	}
-	loc->path = tmp_path;
-
-	STACK_WIND (frame, path_lookup_cbk,
-		    FIRST_CHILD(this), 
-		    FIRST_CHILD(this)->fops->lookup, 
-		    loc, xattr_req);
-
-	loc->path = loc_path;	
-	if (tmp_path != loc_path)
-		FREE (tmp_path);
-
-	return 0;
-}
-
-int32_t 
-path_stat (call_frame_t *frame,
-	   xlator_t *this,
-	   loc_t *loc)
-{
-	char *loc_path = (char *)loc->path;
-	char *tmp_path = NULL;
-	
-	if (!(tmp_path = path_this_to_that (this, loc->path))) {
-		STACK_UNWIND (frame, -1, ENOENT, NULL, NULL);
-		return 0;
-	}
-	loc->path = tmp_path;
-
-	STACK_WIND (frame, 
-		    path_common_buf_cbk, 
-		    FIRST_CHILD(this), 
-		    FIRST_CHILD(this)->fops->stat, 
-		    loc);
-  
-	loc->path = loc_path;	
-	if (tmp_path != loc_path)
-		FREE (tmp_path);
-
-	return 0;
-}
-
-int32_t 
-path_readlink (call_frame_t *frame,
-	       xlator_t *this,
-	       loc_t *loc,
-	       size_t size)
-{
-	char *loc_path = (char *)loc->path;
-	char *tmp_path = NULL;
-	
-	if (!(tmp_path = path_this_to_that (this, loc->path))) {
-		STACK_UNWIND (frame, -1, ENOENT, NULL, NULL);
-		return 0;
-	}
-	loc->path = tmp_path;
-
-	STACK_WIND (frame, 
-		    path_readlink_cbk,
-		    FIRST_CHILD(this), 
-		    FIRST_CHILD(this)->fops->readlink, 
-		    loc, 
-		    size);
-  
-	loc->path = loc_path;	
-	if (tmp_path != loc_path)
-		FREE (tmp_path);
-
-	return 0;
-}
-
-int32_t 
-path_mknod (call_frame_t *frame,
-	    xlator_t *this,
-	    loc_t *loc,
-	    mode_t mode,
-	    dev_t dev)
-{
-	char *loc_path = (char *)loc->path;
-	char *tmp_path = NULL;
-	
-	if (!(tmp_path = path_this_to_that (this, loc->path))) {
-		STACK_UNWIND (frame, -1, ENOENT, NULL, NULL);
-		return 0;
-	}
-	loc->path = tmp_path;
-
-	STACK_WIND (frame, 
-		    path_mknod_cbk,
-		    FIRST_CHILD(this), 
-		    FIRST_CHILD(this)->fops->mknod, 
-		    loc, 
-		    mode, 
-		    dev);
-
-	loc->path = loc_path;	
-	if (tmp_path != loc_path)
-		FREE (tmp_path);
-
-	return 0;
-}
-
-int32_t 
-path_mkdir (call_frame_t *frame,
-	    xlator_t *this,
-	    loc_t *loc,
-	    mode_t mode)
-{
-	char *loc_path = (char *)loc->path;
-	char *tmp_path = NULL;
-	
-	if (!(tmp_path = path_this_to_that (this, loc->path))) {
-		STACK_UNWIND (frame, -1, ENOENT, NULL, NULL);
-		return 0;
-	}
-	loc->path = tmp_path;
-
-	STACK_WIND (frame, 
-		    path_mkdir_cbk,
-		    FIRST_CHILD(this), 
-		    FIRST_CHILD(this)->fops->mkdir, 
-		    loc, 
-		    mode);
-
-	loc->path = loc_path;	
-	if (tmp_path != loc_path)
-		FREE (tmp_path);
-
-	return 0;
-}
-
-int32_t 
-path_unlink (call_frame_t *frame,
-	     xlator_t *this,
-	     loc_t *loc)
-{
-	char *loc_path = (char *)loc->path;
-	char *tmp_path = NULL;
-	
-	if (!(tmp_path = path_this_to_that (this, loc->path))) {
-		STACK_UNWIND (frame, -1, ENOENT, NULL, NULL);
-		return 0;
-	}
-	loc->path = tmp_path;
-
-	STACK_WIND (frame, 
-		    path_common_cbk, 
-		    FIRST_CHILD(this), 
-		    FIRST_CHILD(this)->fops->unlink, 
-		    loc);
-
-	loc->path = loc_path;	
-	if (tmp_path != loc_path)
-		FREE (tmp_path);
-
-	return 0;
-}
-
-int32_t 
-path_rmdir (call_frame_t *frame,
-	    xlator_t *this,
-	    loc_t *loc)
-{
-	char *loc_path = (char *)loc->path;
-	char *tmp_path = NULL;
-	
-	if (!(tmp_path = path_this_to_that (this, loc->path))) {
-		STACK_UNWIND (frame, -1, ENOENT, NULL, NULL);
-		return 0;
-	}
-	loc->path = tmp_path;
-
-	STACK_WIND (frame, 
-		    path_common_cbk,
-		    FIRST_CHILD(this), 
-		    FIRST_CHILD(this)->fops->rmdir, 
-		    loc);
-  
-	loc->path = loc_path;	
-	if (tmp_path != loc_path)
-		FREE (tmp_path);
-
-	return 0;
-}
-
-int32_t 
-path_symlink (call_frame_t *frame,
-	      xlator_t *this,
-	      const char *linkpath,
-	      loc_t *loc)
-{
-	char *loc_path = (char *)loc->path;
-	char *tmp_path = NULL;
-	
-	if (!(tmp_path = path_this_to_that (this, loc->path))) {
-		STACK_UNWIND (frame, -1, ENOENT, NULL, NULL);
-		return 0;
-	}
-	loc->path = tmp_path;
-
-	STACK_WIND (frame, 
-		    path_symlink_cbk,
-		    FIRST_CHILD(this), 
-		    FIRST_CHILD(this)->fops->symlink, 
-		    linkpath,
-		    loc);
-  
-	loc->path = loc_path;	
-	if (tmp_path != loc_path)
-		FREE (tmp_path);
-
-	return 0;
-}
-
-int32_t 
-path_rename (call_frame_t *frame,
-	     xlator_t *this,
-	     loc_t *oldloc,
-	     loc_t *newloc)
-{  
-	char *oldloc_path = (char *)oldloc->path;
-	char *tmp_oldloc_path = NULL;
-
-	char *newloc_path = (char *)newloc->path;
-	char *tmp_newloc_path = NULL;
-	
-	if (!(tmp_oldloc_path = path_this_to_that (this, oldloc->path))) {
-		STACK_UNWIND (frame, -1, ENOENT, NULL, NULL);
-		return 0;
-	}
-	oldloc->path = tmp_oldloc_path;
-
-	if (!(tmp_newloc_path = path_this_to_that (this, newloc->path))) {
-		STACK_UNWIND (frame, -1, ENOENT, NULL, NULL);
-		return 0;
-	}
-	newloc->path = tmp_newloc_path;
-
-	STACK_WIND (frame, 
-		    path_common_buf_cbk,
-		    FIRST_CHILD(this), 
-		    FIRST_CHILD(this)->fops->rename, 
-		    oldloc,
-		    newloc);
-  
-	oldloc->path = oldloc_path;	
-	if (tmp_oldloc_path != oldloc_path)
-		FREE (tmp_oldloc_path);
-
-	newloc->path = newloc_path;	
-	if (tmp_newloc_path != newloc_path)
-		FREE (tmp_newloc_path);
-
-	return 0;
-}
-
-int32_t 
-path_link (call_frame_t *frame,
-	   xlator_t *this,
-	   loc_t *oldloc,
-	   loc_t *newloc)
-{
-	char *oldloc_path = (char *)oldloc->path;
-	char *tmp_oldloc_path = NULL;
-
-	char *newloc_path = (char *)newloc->path;
-	char *tmp_newloc_path = NULL;
-	
-	if (!(tmp_oldloc_path = path_this_to_that (this, oldloc->path))) {
-		STACK_UNWIND (frame, -1, ENOENT, NULL, NULL);
-		return 0;
-	}
-	oldloc->path = tmp_oldloc_path;
-
-	if (!(tmp_newloc_path = path_this_to_that (this, newloc->path))) {
-		STACK_UNWIND (frame, -1, ENOENT, NULL, NULL);
-		return 0;
-	}
-	newloc->path = tmp_newloc_path;
-
-	STACK_WIND (frame, 
-		    path_link_cbk, 
-		    FIRST_CHILD(this), 
-		    FIRST_CHILD(this)->fops->link, 
-		    oldloc, 
-		    newloc);
-
-	oldloc->path = oldloc_path;	
-	if (tmp_oldloc_path != oldloc_path)
-		FREE (tmp_oldloc_path);
-
-	newloc->path = newloc_path;	
-	if (tmp_newloc_path != newloc_path)
-		FREE (tmp_newloc_path);
-
-	return 0;
-}
-
-int32_t 
-path_chmod (call_frame_t *frame,
-	    xlator_t *this,
-	    loc_t *loc,
-	    mode_t mode)
-{
-	char *loc_path = (char *)loc->path;
-	char *tmp_path = NULL;
-	
-	if (!(tmp_path = path_this_to_that (this, loc->path))) {
-		STACK_UNWIND (frame, -1, ENOENT, NULL, NULL);
-		return 0;
-	}
-	loc->path = tmp_path;
-
-	STACK_WIND (frame, 
-		    path_common_buf_cbk, 
-		    FIRST_CHILD(this), 
-		    FIRST_CHILD(this)->fops->chmod, 
-		    loc, 
-		    mode);
-  
-	loc->path = loc_path;	
-	if (tmp_path != loc_path)
-		FREE (tmp_path);
-
-	return 0;
-}
-
-int32_t 
-path_chown (call_frame_t *frame,
-	    xlator_t *this,
-	    loc_t *loc,
-	    uid_t uid,
-	    gid_t gid)
-{
-	char *loc_path = (char *)loc->path;
-	char *tmp_path = NULL;
-	
-	if (!(tmp_path = path_this_to_that (this, loc->path))) {
-		STACK_UNWIND (frame, -1, ENOENT, NULL, NULL);
-		return 0;
-	}
-	loc->path = tmp_path;
-
-	STACK_WIND (frame, 
-		    path_common_buf_cbk, 
-		    FIRST_CHILD(this), 
-		    FIRST_CHILD(this)->fops->chown, 
-		    loc, 
-		    uid,
-		    gid);
-
-	loc->path = loc_path;	
-	if (tmp_path != loc_path)
-		FREE (tmp_path);
-
-	return 0;
-}
-
-int32_t 
-path_truncate (call_frame_t *frame,
-	       xlator_t *this,
-	       loc_t *loc,
-	       off_t offset)
-{
-	char *loc_path = (char *)loc->path;
-	char *tmp_path = NULL;
-	
-	if (!(tmp_path = path_this_to_that (this, loc->path))) {
-		STACK_UNWIND (frame, -1, ENOENT, NULL, NULL);
-		return 0;
-	}
-	loc->path = tmp_path;
-
-	STACK_WIND (frame, 
-		    path_common_buf_cbk, 
-		    FIRST_CHILD(this), 
-		    FIRST_CHILD(this)->fops->truncate, 
-		    loc, 
-		    offset);
-  
-	loc->path = loc_path;	
-	if (tmp_path != loc_path)
-		FREE (tmp_path);
-
-	return 0;
-}
-
-int32_t 
-path_utimens (call_frame_t *frame,
-	      xlator_t *this,
-	      loc_t *loc,
-	      struct timespec tv[2])
-{
-	char *loc_path = (char *)loc->path;
-	char *tmp_path = NULL;
-	
-	if (!(tmp_path = path_this_to_that (this, loc->path))) {
-		STACK_UNWIND (frame, -1, ENOENT, NULL, NULL);
-		return 0;
-	}
-	loc->path = tmp_path;
-
-	STACK_WIND (frame, 
-		    path_common_buf_cbk, 
-		    FIRST_CHILD(this), 
-		    FIRST_CHILD(this)->fops->utimens, 
-		    loc, 
-		    tv);
-
-	loc->path = loc_path;	
-	if (tmp_path != loc_path)
-		FREE (tmp_path);
-
-	return 0;
-}
-
-int32_t 
-path_open (call_frame_t *frame,
-	   xlator_t *this,
-	   loc_t *loc,
-	   int32_t flags,
-	   fd_t *fd)
-{
-	char *loc_path = (char *)loc->path;
-	char *tmp_path = NULL;
-	
-	if (!(tmp_path = path_this_to_that (this, loc->path))) {
-		STACK_UNWIND (frame, -1, ENOENT, NULL, NULL);
-		return 0;
-	}
-	loc->path = tmp_path;
-
-	STACK_WIND (frame, 
-		    path_open_cbk, 
-		    FIRST_CHILD(this), 
-		    FIRST_CHILD(this)->fops->open, 
-		    loc, 
-		    flags,
-		    fd);
-
-	loc->path = loc_path;	
-	if (tmp_path != loc_path)
-		FREE (tmp_path);
-
-	return 0;
-}
-
-int32_t 
-path_create (call_frame_t *frame,
-	     xlator_t *this,
-	     loc_t *loc,
-	     int32_t flags,
-	     mode_t mode,
-	     fd_t *fd)
-{
-	char *loc_path = (char *)loc->path;
-	char *tmp_path = NULL;
-	
-	if (!(tmp_path = path_this_to_that (this, loc->path))) {
-		STACK_UNWIND (frame, -1, ENOENT, NULL, NULL);
-		return 0;
-	}
-	loc->path = tmp_path;
-
-	STACK_WIND (frame, 
-		    path_create_cbk, 
-		    FIRST_CHILD(this), 
-		    FIRST_CHILD(this)->fops->create, 
-		    loc, 
-		    flags,
-		    mode,
-		    fd);
-
-	loc->path = loc_path;	
-	if (tmp_path != loc_path)
-		FREE (tmp_path);
-
-	return 0;
-}
-
-int32_t 
-path_setxattr (call_frame_t *frame,
-	       xlator_t *this,
-	       loc_t *loc,
-	       dict_t *dict,
-	       int32_t flags)
-{
-	char *tmp_name = NULL;
-	data_pair_t *trav = dict->members_list;
-	char *loc_path = (char *)loc->path;
-	char *tmp_path = NULL;
-	
-	if (!(tmp_path = path_this_to_that (this, loc->path))) {
-		STACK_UNWIND (frame, -1, ENOENT, NULL, NULL);
-		return 0;
-	}
-	loc->path = tmp_path;
-
-	if (ZR_FILE_CONTENT_REQUEST(trav->key)) {
-		tmp_name = name_this_to_that (this, loc->path, trav->key);
-		if (tmp_name != trav->key) {
-			trav->key = tmp_name;
-		} else {
-			tmp_name = NULL;
-		}
-	}
-
-	STACK_WIND (frame, 
-		    path_common_cbk, 
-		    FIRST_CHILD(this), 
-		    FIRST_CHILD(this)->fops->setxattr, 
-		    loc, 
-		    dict,
-		    flags);
-
-	loc->path = loc_path;	
-	if (tmp_path != loc_path)
-		FREE (tmp_path);
-
-	if (tmp_name)
-		FREE (tmp_name);
-
-	return 0;
-}
-
-int32_t 
-path_getxattr (call_frame_t *frame,
-	       xlator_t *this,
-	       loc_t *loc,
-	       const char *name)
-{
-	char *tmp_name = (char *)name;
-	char *loc_path = (char *)loc->path;
-	char *tmp_path = NULL;
-	
-	if (!(tmp_path = path_this_to_that (this, loc->path))) {
-		STACK_UNWIND (frame, -1, ENOENT, NULL, NULL);
-		return 0;
-	}
-	loc->path = tmp_path;
-
-	if (ZR_FILE_CONTENT_REQUEST(name)) {
-		tmp_name = name_this_to_that (this, loc->path, name);
-	}
-
-	STACK_WIND (frame, 
-		    path_common_dict_cbk, 
-		    FIRST_CHILD(this), 
-		    FIRST_CHILD(this)->fops->getxattr,
-		    loc, 
-		    tmp_name);
-
-	loc->path = loc_path;	
-	if (tmp_path != loc_path)
-		FREE (tmp_path);
-
-	if (tmp_name != name)
-		FREE (tmp_name);
-
-	return 0;
-}
-
-int32_t 
-path_removexattr (call_frame_t *frame,
-		  xlator_t *this,
-		  loc_t *loc,
-		  const char *name)
-{
-	char *tmp_name = (char *)name;
-	char *loc_path = (char *)loc->path;
-	char *tmp_path = NULL;
-	
-	if (!(tmp_path = path_this_to_that (this, loc->path))) {
-		STACK_UNWIND (frame, -1, ENOENT, NULL, NULL);
-		return 0;
-	}
-	loc->path = tmp_path;
-
-	if (ZR_FILE_CONTENT_REQUEST(name)) {
-		tmp_name = name_this_to_that (this, loc->path, name);
-	}
-
-	STACK_WIND (frame, 
-		    path_common_cbk, 
-		    FIRST_CHILD(this), 
-		    FIRST_CHILD(this)->fops->removexattr, 
-		    loc, 
-		    tmp_name);
-
-	loc->path = loc_path;	
-	if (tmp_path != loc_path)
-		FREE (tmp_path);
-
-	if (tmp_name != name)
-		FREE (tmp_name);
-
-	return 0;
-}
-
-int32_t 
-path_opendir (call_frame_t *frame,
-	      xlator_t *this,
-	      loc_t *loc,
-	      fd_t *fd)
-{
-	char *loc_path = (char *)loc->path;
-	char *tmp_path = NULL;
-	
-	if (!(tmp_path = path_this_to_that (this, loc->path))) {
-		STACK_UNWIND (frame, -1, ENOENT, NULL, NULL);
-		return 0;
-	}
-	loc->path = tmp_path;
-
-	STACK_WIND (frame, 
-		    path_opendir_cbk, 
-		    FIRST_CHILD(this), 
-		    FIRST_CHILD(this)->fops->opendir, 
-		    loc, 
-		    fd);
-
-	loc->path = loc_path;	
-	if (tmp_path != loc_path)
-		FREE (tmp_path);
-
-	return 0;
-}
-
-int32_t 
-path_access (call_frame_t *frame,
-	     xlator_t *this,
-	     loc_t *loc,
-	     int32_t mask)
-{
-	char *loc_path = (char *)loc->path;
-	char *tmp_path = NULL;
-	
-	if (!(tmp_path = path_this_to_that (this, loc->path))) {
-		STACK_UNWIND (frame, -1, ENOENT, NULL, NULL);
-		return 0;
-	}
-	loc->path = tmp_path;
-
-	STACK_WIND (frame, 
-		    path_common_cbk, 
-		    FIRST_CHILD(this), 
-		    FIRST_CHILD(this)->fops->access, 
-		    loc, 
-		    mask);
-
-	loc->path = loc_path;
-	if (tmp_path != loc_path)
-		FREE (tmp_path);
-
-	return 0;
-}
-
-int32_t
-path_checksum_cbk (call_frame_t *frame,
-		   void *cookie,
-		   xlator_t *this,
-		   int32_t op_ret,
-		   int32_t op_errno,
-		   uint8_t *fchecksum,
-		   uint8_t *dchecksum)
-{
-	STACK_UNWIND (frame, op_ret, op_errno, fchecksum, dchecksum);
-	return 0;
-}
-
-int32_t
-path_checksum (call_frame_t *frame,
-	       xlator_t *this,
-	       loc_t *loc,
-	       int32_t flag)
-{
-	char *loc_path = (char *)loc->path;
-	char *tmp_path = NULL;
-	
-	if (!(tmp_path = path_this_to_that (this, loc->path))) {
-		STACK_UNWIND (frame, -1, ENOENT, NULL, NULL);
-		return 0;
-	}
-	loc->path = tmp_path;
-
-	STACK_WIND (frame,
-		    path_checksum_cbk,
-		    FIRST_CHILD(this), 
-		    FIRST_CHILD(this)->fops->checksum, 
-		    loc, 
-		    flag);
-
-	loc->path = loc_path;	
-	if (tmp_path != loc_path)
-		FREE (tmp_path);
-
-	return 0;
-}
-
-
-int32_t
-path_entrylk (call_frame_t *frame, xlator_t *this,
-	      loc_t *loc, const char *basename,
-	      entrylk_cmd cmd, entrylk_type type)
-{
-	char *loc_path = (char *)loc->path;
-	char *tmp_path = NULL;
-	
-	if (!(tmp_path = path_this_to_that (this, loc->path))) {
-		STACK_UNWIND (frame, -1, ENOENT, NULL, NULL);
-		return 0;
-	}
-	loc->path = tmp_path;
-
-	STACK_WIND (frame, path_common_cbk,
-		    FIRST_CHILD(this),
-		    FIRST_CHILD(this)->fops->entrylk,
-		    loc, basename, cmd, type);
-
-	loc->path = loc_path;	
-	if (tmp_path != loc_path)
-		FREE (tmp_path);
-
-	return 0;
-}
-
-int32_t
-path_inodelk (call_frame_t *frame, xlator_t *this,
-		 loc_t *loc, int32_t cmd, struct flock *lock)
-{
-	char *loc_path = (char *)loc->path;
-	char *tmp_path = NULL;
-	
-	if (!(tmp_path = path_this_to_that (this, loc->path))) {
-		STACK_UNWIND (frame, -1, ENOENT, NULL, NULL);
-		return 0;
-	}
-	loc->path = tmp_path;
-
-	STACK_WIND (frame,
-		    path_common_cbk,
-		    FIRST_CHILD(this),
-		    FIRST_CHILD(this)->fops->inodelk,
-		    loc, cmd, lock);
-
-	loc->path = loc_path;	
-	if (tmp_path != loc_path)
-		FREE (tmp_path);
-
-	return 0;
-}
-
-
-int32_t
-path_xattrop (call_frame_t *frame,
-	      xlator_t *this,
-	      loc_t *loc,
-	      gf_xattrop_flags_t flags,
-	      dict_t *dict)
-{
-	char *loc_path = (char *)loc->path;
-	char *tmp_path = NULL;
-	
-	if (!(tmp_path = path_this_to_that (this, loc->path))) {
-		STACK_UNWIND (frame, -1, ENOENT, NULL, NULL);
-		return 0;
-	}
-	loc->path = tmp_path;
-
-	STACK_WIND (frame,
-		    path_common_dict_cbk,
-		    FIRST_CHILD(this),
-		    FIRST_CHILD(this)->fops->xattrop,
-		    loc,
-		    flags,
-		    dict);
-
-	loc->path = loc_path;	
-	if (tmp_path != loc_path)
-		FREE (tmp_path);
-
-	return 0;
-}
-
-
-int32_t 
-init (xlator_t *this)
-{
-	dict_t *options = this->options;
-	path_private_t *priv = NULL;
-
-	if (!this->children || this->children->next) {
-		gf_log (this->name, GF_LOG_ERROR, 
-			"path translator requires exactly one subvolume");
-		return -1;
-	}
-    
-	if (!this->parents) {
-		gf_log (this->name, GF_LOG_WARNING,
-			"dangling volume. check volfile ");
-	}
-  
-	priv = CALLOC (1, sizeof (*priv));
-	ERR_ABORT (priv);
-	if (dict_get (options, "start-offset")) {
-		priv->start_off = data_to_int32 (dict_get (options, 
-							   "start-offset"));
-	}
-	if (dict_get (options, "end-offset")) {
-		priv->end_off = data_to_int32 (dict_get (options, 
-							 "end-offset"));
-	}
-
-	if (dict_get (options, "regex")) {
-		int32_t ret = 0;
-		priv->preg = CALLOC (1, sizeof (regex_t));
-		ERR_ABORT (priv->preg);
-		ret = regcomp (priv->preg, 
-			       data_to_str (dict_get (options, "regex")), 
-			       REG_EXTENDED);
-		if (ret) {
-			gf_log (this->name, GF_LOG_ERROR, 
-				"Failed to compile the 'option regex'");
-			FREE (priv);
-			return -1;
-		}
-		if (dict_get (options, "replace-with")) {
-			priv->that = data_to_str (dict_get (options, 
-							    "replace-with"));
-		} else {
-			priv->that = "";
-		}
-	}
-
-	this->private = priv;
-	return 0;
-}
-
-void
-fini (xlator_t *this)
-{
-	return;
-}
-
-struct xlator_fops fops = {
-	.stat        = path_stat,
-	.readlink    = path_readlink,
-	.mknod       = path_mknod,
-	.mkdir       = path_mkdir,
-	.unlink      = path_unlink,
-	.rmdir       = path_rmdir,
-	.symlink     = path_symlink,
-	.rename      = path_rename,
-	.link        = path_link,
-	.chmod       = path_chmod,
-	.chown       = path_chown,
-	.truncate    = path_truncate,
-	.utimens     = path_utimens,
-	.open        = path_open,
-	.setxattr    = path_setxattr,
-	.getxattr    = path_getxattr,
-	.removexattr = path_removexattr,
-	.opendir     = path_opendir,
-	.access      = path_access,
-	.create      = path_create,
-	.lookup      = path_lookup,
-	.checksum    = path_checksum,
-	.xattrop     = path_xattrop,
-	.entrylk     = path_entrylk,
-	.inodelk     = path_inodelk,
-};
-
-
-struct xlator_mops mops = {
-};
-
-
-struct xlator_cbks cbks = {
-};
-
-struct volume_options options[] = { 
-	{ .key  = {"start-offset"}, 
-	  .type = GF_OPTION_TYPE_INT, 
-	  .min  = 0, 
-	  .max  = 4095 
-	},
-	{ .key  = {"end-offset"}, 
-	  .type = GF_OPTION_TYPE_INT, 
-	  .min  = 1, 
-	  .max  = 4096 
-	},
-	{ .key  = {"replace-with"}, 
-	  .type = GF_OPTION_TYPE_ANY 
-	},
-	{ .key  = {NULL} },
-};
diff --git a/xlators/features/quiesce/Makefile.am b/xlators/features/quiesce/Makefile.am
new file mode 100644
index 00000000000..a985f42a877
--- /dev/null
+++ b/xlators/features/quiesce/Makefile.am
@@ -0,0 +1,3 @@
+SUBDIRS = src
+
+CLEANFILES =
diff --git a/xlators/features/quiesce/src/Makefile.am b/xlators/features/quiesce/src/Makefile.am
new file mode 100644
index 00000000000..74ea999c045
--- /dev/null
+++ b/xlators/features/quiesce/src/Makefile.am
@@ -0,0 +1,16 @@
+xlator_LTLIBRARIES = quiesce.la
+xlatordir = $(libdir)/glusterfs/$(PACKAGE_VERSION)/xlator/features
+
+quiesce_la_LDFLAGS = -module $(GF_XLATOR_DEFAULT_LDFLAGS)
+
+quiesce_la_SOURCES = quiesce.c
+quiesce_la_LIBADD = $(top_builddir)/libglusterfs/src/libglusterfs.la
+
+noinst_HEADERS = quiesce.h quiesce-mem-types.h quiesce-messages.h
+
+AM_CPPFLAGS = $(GF_CPPFLAGS) -I$(top_srcdir)/libglusterfs/src \
+	-I$(top_srcdir)/rpc/xdr/src -I$(top_builddir)/rpc/xdr/src
+
+AM_CFLAGS = -Wall $(GF_CFLAGS)
+
+CLEANFILES =
diff --git a/xlators/features/quiesce/src/quiesce-mem-types.h b/xlators/features/quiesce/src/quiesce-mem-types.h
new file mode 100644
index 00000000000..416456b13af
--- /dev/null
+++ b/xlators/features/quiesce/src/quiesce-mem-types.h
@@ -0,0 +1,21 @@
+/*
+   Copyright (c) 2010-2012 Red Hat, Inc. <http://www.redhat.com>
+   This file is part of GlusterFS.
+
+   This file is licensed to you under your choice of the GNU Lesser
+   General Public License, version 3 or any later version (LGPLv3 or
+   later), or the GNU General Public License, version 2 (GPLv2), in all
+   cases as published by the Free Software Foundation.
+*/
+
+#ifndef __QUIESCE_MEM_TYPES_H__
+#define __QUIESCE_MEM_TYPES_H__
+
+#include <glusterfs/mem-types.h>
+
+enum gf_quiesce_mem_types_ {
+    gf_quiesce_mt_priv_t = gf_common_mt_end + 1,
+    gf_quiesce_mt_failover_hosts,
+    gf_quiesce_mt_end
+};
+#endif
diff --git a/xlators/features/quiesce/src/quiesce-messages.h b/xlators/features/quiesce/src/quiesce-messages.h
new file mode 100644
index 00000000000..32ffd409807
--- /dev/null
+++ b/xlators/features/quiesce/src/quiesce-messages.h
@@ -0,0 +1,28 @@
+/*
+ *   Copyright (c) 2016 Red Hat, Inc. <http://www.redhat.com>
+ *   This file is part of GlusterFS.
+ *
+ *   This file is licensed to you under your choice of the GNU Lesser
+ *   General Public License, version 3 or any later version (LGPLv3 or
+ *   later), or the GNU General Public License, version 2 (GPLv2), in all
+ *   cases as published by the Free Software Foundation.
+ */
+
+#ifndef __QUIESCE_MESSAGES_H__
+#define __QUIESCE_MESSAGES_H__
+
+#include <glusterfs/glfs-message-id.h>
+
+/* To add new message IDs, append new identifiers at the end of the list.
+ *
+ * Never remove a message ID. If it's not used anymore, you can rename it or
+ * leave it as it is, but not delete it. This is to prevent reutilization of
+ * IDs by other messages.
+ *
+ * The component name must match one of the entries defined in
+ * glfs-message-id.h.
+ */
+
+GLFS_MSGID(QUIESCE, QUIESCE_MSG_INVAL_HOST, QUIESCE_MSG_FAILOVER_FAILED);
+
+#endif /* __QUIESCE_MESSAGES_H__ */
diff --git a/xlators/features/quiesce/src/quiesce.c b/xlators/features/quiesce/src/quiesce.c
new file mode 100644
index 00000000000..0e5eb60a16f
--- /dev/null
+++ b/xlators/features/quiesce/src/quiesce.c
@@ -0,0 +1,2704 @@
+/*
+   Copyright (c) 2010-2012 Red Hat, Inc. <http://www.redhat.com>
+   This file is part of GlusterFS.
+
+   This file is licensed to you under your choice of the GNU Lesser
+   General Public License, version 3 or any later version (LGPLv3 or
+   later), or the GNU General Public License, version 2 (GPLv2), in all
+   cases as published by the Free Software Foundation.
+*/
+#include "quiesce.h"
+#include <glusterfs/defaults.h>
+#include <glusterfs/call-stub.h>
+
+/* TODO: */
+/* Think about 'writev/_*_lk/setattr/xattrop/' fops to do re-transmission */
+
+void
+gf_quiesce_timeout(void *data);
+
+/* Quiesce Specific Functions */
+void
+gf_quiesce_local_wipe(xlator_t *this, quiesce_local_t *local)
+{
+    if (!local || !this || !this->private)
+        return;
+
+    if (local->loc.inode)
+        loc_wipe(&local->loc);
+    if (local->fd)
+        fd_unref(local->fd);
+    GF_FREE(local->name);
+    GF_FREE(local->volname);
+    if (local->dict)
+        dict_unref(local->dict);
+    if (local->iobref)
+        iobref_unref(local->iobref);
+    GF_FREE(local->vector);
+
+    mem_put(local);
+}
+
+void
+__gf_quiesce_start_timer(xlator_t *this, quiesce_priv_t *priv)
+{
+    struct timespec timeout = {
+        0,
+    };
+
+    if (!priv->timer) {
+        timeout.tv_sec = priv->timeout;
+        timeout.tv_nsec = 0;
+
+        priv->timer = gf_timer_call_after(this->ctx, timeout,
+                                          gf_quiesce_timeout, (void *)this);
+        if (priv->timer == NULL) {
+            gf_log(this->name, GF_LOG_ERROR, "Cannot create timer");
+        }
+    }
+}
+
+static void
+__gf_quiesce_cleanup_failover_hosts(xlator_t *this, quiesce_priv_t *priv)
+{
+    quiesce_failover_hosts_t *tmp = NULL;
+    quiesce_failover_hosts_t *failover_host = NULL;
+
+    list_for_each_entry_safe(failover_host, tmp, &priv->failover_list, list)
+    {
+        GF_FREE(failover_host->addr);
+        list_del(&failover_host->list);
+        GF_FREE(failover_host);
+    }
+    return;
+}
+
+/* Rebuild priv->failover_list from 'value', a comma-separated address list.
+ * Invalid addresses are logged and skipped; a NULL 'value' is a no-op. */
+void
+gf_quiesce_populate_failover_hosts(xlator_t *this, quiesce_priv_t *priv,
+                                   const char *value)
+{
+    char *dup_val = NULL;
+    char *addr_tok = NULL;
+    char *save_ptr = NULL;
+    quiesce_failover_hosts_t *failover_host = NULL;
+
+    if (!value || !(dup_val = gf_strdup(value)))
+        return;
+
+    LOCK(&priv->lock);
+    {
+        if (!list_empty(&priv->failover_list))
+            __gf_quiesce_cleanup_failover_hosts(this, priv);
+
+        /* Step in the for-header so 'continue' cannot spin on one token. */
+        for (addr_tok = strtok_r(dup_val, ",", &save_ptr); addr_tok;
+             addr_tok = strtok_r(NULL, ",", &save_ptr)) {
+            if (!valid_internet_address(addr_tok, _gf_true, _gf_false)) {
+                gf_msg(this->name, GF_LOG_INFO, 0, QUIESCE_MSG_INVAL_HOST,
+                       "Specified invalid internet address:%s", addr_tok);
+                continue;
+            }
+            failover_host = GF_CALLOC(1, sizeof(*failover_host),
+                                      gf_quiesce_mt_failover_hosts);
+            if (failover_host)
+                failover_host->addr = gf_strdup(addr_tok);
+            if (!failover_host || !failover_host->addr) {
+                GF_FREE(failover_host); /* allocation failed: stop adding */
+                break;
+            }
+            INIT_LIST_HEAD(&failover_host->list);
+            list_add(&failover_host->list, &priv->failover_list);
+        }
+    }
+    UNLOCK(&priv->lock);
+
+    GF_FREE(dup_val);
+}
+
+int32_t
+gf_quiesce_failover_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+                        int32_t op_ret, int32_t op_errno, dict_t *xdata)
+{
+    quiesce_priv_t *priv = NULL;
+
+    if (op_ret < 0) {
+        /* Failure here doesn't mean the failover to another host didn't
+         * succeed, we will know if failover succeeds or not by the
+         * CHILD_UP/CHILD_DOWN event. A failure here indicates something
+         * went wrong with the submission of failover command, hence
+         * just abort the failover attempts without retrying with other
+         * hosts.
+         */
+        gf_msg(this->name, GF_LOG_INFO, op_errno, QUIESCE_MSG_FAILOVER_FAILED,
+               "Initiating failover to host:%s failed:", (char *)cookie);
+    }
+
+    GF_FREE(cookie);
+    STACK_DESTROY(frame->root);
+
+    priv = this->private;
+    __gf_quiesce_start_timer(this, priv);
+
+    return 0;
+}
+
+/* Wind a failover request to the first untried host in priv->failover_list.
+ * Returns -1 when no host is left or the request cannot be built, else 0. */
+int
+__gf_quiesce_perform_failover(xlator_t *this)
+{
+    int ret = 0;
+    call_frame_t *frame = NULL;
+    dict_t *dict = NULL;
+    quiesce_priv_t *priv = this->private;
+    quiesce_failover_hosts_t *failover_host = NULL;
+    quiesce_failover_hosts_t *host = NULL;
+
+    if (priv->pass_through) {
+        gf_msg_trace(this->name, 0,
+                     "child is up, hence not performing any failover");
+        goto out;
+    }
+
+    list_for_each_entry(failover_host, &priv->failover_list, list)
+    {
+        if (failover_host->tried == 0) {
+            host = failover_host;
+            failover_host->tried = 1;
+            break;
+        }
+    }
+    if (!host) {
+        /*TODO: Keep trying until any of the gfproxy comes back up.
+                Currently it tries failing over once for each host,
+                if it doesn't succeed then returns error to mount point
+           list_for_each_entry (failover_host,
+                        &priv->failover_list, list) {
+                failover_host->tried = 0;
+        }*/
+        gf_msg_debug(this->name, 0,
+                     "all the failover hosts have "
+                     "been tried and looks like didn't succeed");
+        ret = -1;
+        goto out;
+    }
+
+    frame = create_frame(this, this->ctx->pool);
+    if (!frame) {
+        gf_msg_debug(this->name, 0, "failed to create the frame");
+        ret = -1;
+        goto out;
+    }
+
+    dict = dict_new();
+    if (!dict || dict_set_dynstr(dict, CLIENT_CMD_CONNECT,
+                                 gf_strdup(host->addr))) {
+        /* Request cannot be built: release the unwound frame, do not wind. */
+        STACK_DESTROY(frame->root);
+        ret = -1;
+        goto out;
+    }
+    gf_msg_trace(this->name, 0, "Initiating failover to:%s", host->addr);
+    STACK_WIND_COOKIE(frame, gf_quiesce_failover_cbk, NULL, FIRST_CHILD(this),
+                      FIRST_CHILD(this)->fops->setxattr, NULL, dict, 0, NULL);
+out:
+    if (dict)
+        dict_unref(dict);
+    return ret;
+}
+
+/* Pop the oldest queued stub from priv->req; NULL when nothing is queued. */
+call_stub_t *
+gf_quiesce_dequeue(xlator_t *this)
+{
+    call_stub_t *stub = NULL;
+    quiesce_priv_t *priv = this->private;
+
+    if (!priv)
+        return NULL;
+    LOCK(&priv->lock);
+    {
+        /* Re-test under the lock: a racing dequeue may have emptied it. */
+        if (!list_empty(&priv->req)) {
+            stub = list_entry(priv->req.next, call_stub_t, list);
+            list_del_init(&stub->list);
+            priv->queue_size--;
+        }
+    }
+    UNLOCK(&priv->lock);
+    return stub;
+}
+
+void *
+gf_quiesce_dequeue_start(void *data)
+{
+    xlator_t *this = NULL;
+    quiesce_priv_t *priv = NULL;
+    call_stub_t *stub = NULL;
+
+    this = data;
+    priv = this->private;
+    THIS = this;
+
+    while (!list_empty(&priv->req)) {
+        stub = gf_quiesce_dequeue(this);
+        if (stub) {
+            call_resume(stub);
+        }
+    }
+
+    return 0;
+}
+
+void
+gf_quiesce_timeout(void *data)
+{
+    xlator_t *this = NULL;
+    quiesce_priv_t *priv = NULL;
+    int ret = -1;
+
+    this = data;
+    priv = this->private;
+    THIS = this;
+
+    LOCK(&priv->lock);
+    {
+        priv->timer = NULL;
+        if (priv->pass_through) {
+            UNLOCK(&priv->lock);
+            goto out;
+        }
+        ret = __gf_quiesce_perform_failover(THIS);
+    }
+    UNLOCK(&priv->lock);
+
+    if (ret < 0) {
+        priv->pass_through = _gf_true;
+        gf_quiesce_dequeue_start(this);
+    }
+
+out:
+    return;
+}
+
+void
+gf_quiesce_enqueue(xlator_t *this, call_stub_t *stub)
+{
+    quiesce_priv_t *priv = NULL;
+
+    priv = this->private;
+    if (!priv) {
+        gf_log_callingfn(this->name, GF_LOG_ERROR, "this->private == NULL");
+        return;
+    }
+
+    LOCK(&priv->lock);
+    {
+        list_add_tail(&stub->list, &priv->req);
+        priv->queue_size++;
+        __gf_quiesce_start_timer(this, priv);
+    }
+    UNLOCK(&priv->lock);
+
+    return;
+}
+
+/* _CBK function section */
+
+int32_t
+quiesce_lookup_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+                   int32_t op_ret, int32_t op_errno, inode_t *inode,
+                   struct iatt *buf, dict_t *dict, struct iatt *postparent)
+{
+    call_stub_t *stub = NULL;
+    quiesce_local_t *local = NULL;
+
+    local = frame->local;
+    frame->local = NULL;
+    if ((op_ret == -1) && (op_errno == ENOTCONN)) {
+        /* Re-transmit (by putting in the queue) */
+        stub = fop_lookup_stub(frame, default_lookup_resume, &local->loc,
+                               local->dict);
+        if (!stub) {
+            STACK_UNWIND_STRICT(lookup, frame, -1, ENOMEM, NULL, NULL, NULL,
+                                NULL);
+            goto out;
+        }
+
+        gf_quiesce_enqueue(this, stub);
+        goto out;
+    }
+
+    STACK_UNWIND_STRICT(lookup, frame, op_ret, op_errno, inode, buf, dict,
+                        postparent);
+out:
+    gf_quiesce_local_wipe(this, local);
+
+    return 0;
+}
+
+int32_t
+quiesce_stat_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+                 int32_t op_ret, int32_t op_errno, struct iatt *buf,
+                 dict_t *xdata)
+{
+    call_stub_t *stub = NULL;
+    quiesce_local_t *local = NULL;
+
+    local = frame->local;
+    frame->local = NULL;
+    if ((op_ret == -1) && (op_errno == ENOTCONN)) {
+        /* Re-transmit (by putting in the queue) */
+        stub = fop_stat_stub(frame, default_stat_resume, &local->loc, xdata);
+        if (!stub) {
+            STACK_UNWIND_STRICT(stat, frame, -1, ENOMEM, NULL, NULL);
+            goto out;
+        }
+
+        gf_quiesce_enqueue(this, stub);
+        goto out;
+    }
+
+    STACK_UNWIND_STRICT(stat, frame, op_ret, op_errno, buf, xdata);
+out:
+    gf_quiesce_local_wipe(this, local);
+
+    return 0;
+}
+
+int32_t
+quiesce_access_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+                   int32_t op_ret, int32_t op_errno, dict_t *xdata)
+{
+    call_stub_t *stub = NULL;
+    quiesce_local_t *local = NULL;
+
+    local = frame->local;
+    frame->local = NULL;
+    if ((op_ret == -1) && (op_errno == ENOTCONN)) {
+        /* Re-transmit (by putting in the queue) */
+        stub = fop_access_stub(frame, default_access_resume, &local->loc,
+                               local->flag, xdata);
+        if (!stub) {
+            STACK_UNWIND_STRICT(access, frame, -1, ENOMEM, NULL);
+            goto out;
+        }
+
+        gf_quiesce_enqueue(this, stub);
+        goto out;
+    }
+
+    STACK_UNWIND_STRICT(access, frame, op_ret, op_errno, xdata);
+out:
+    gf_quiesce_local_wipe(this, local);
+
+    return 0;
+}
+
+int32_t
+quiesce_readlink_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+                     int32_t op_ret, int32_t op_errno, const char *path,
+                     struct iatt *buf, dict_t *xdata)
+{
+    call_stub_t *stub = NULL;
+    quiesce_local_t *local = NULL;
+
+    local = frame->local;
+    frame->local = NULL;
+    if ((op_ret == -1) && (op_errno == ENOTCONN)) {
+        /* Re-transmit (by putting in the queue) */
+        stub = fop_readlink_stub(frame, default_readlink_resume, &local->loc,
+                                 local->size, xdata);
+        if (!stub) {
+            STACK_UNWIND_STRICT(readlink, frame, -1, ENOMEM, NULL, NULL, NULL);
+            goto out;
+        }
+
+        gf_quiesce_enqueue(this, stub);
+        goto out;
+    }
+
+    STACK_UNWIND_STRICT(readlink, frame, op_ret, op_errno, path, buf, xdata);
+out:
+    gf_quiesce_local_wipe(this, local);
+
+    return 0;
+}
+
+int32_t
+quiesce_open_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+                 int32_t op_ret, int32_t op_errno, fd_t *fd, dict_t *xdata)
+{
+    call_stub_t *stub = NULL;
+    quiesce_local_t *local = NULL;
+
+    local = frame->local;
+    frame->local = NULL;
+    if ((op_ret == -1) && (op_errno == ENOTCONN)) {
+        /* Re-transmit (by putting in the queue) */
+        stub = fop_open_stub(frame, default_open_resume, &local->loc,
+                             local->flag, local->fd, xdata);
+        if (!stub) {
+            STACK_UNWIND_STRICT(open, frame, -1, ENOMEM, NULL, NULL);
+            goto out;
+        }
+
+        gf_quiesce_enqueue(this, stub);
+        goto out;
+    }
+
+    STACK_UNWIND_STRICT(open, frame, op_ret, op_errno, fd, xdata);
+out:
+    gf_quiesce_local_wipe(this, local);
+
+    return 0;
+}
+
+int32_t
+quiesce_readv_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+                  int32_t op_ret, int32_t op_errno, struct iovec *vector,
+                  int32_t count, struct iatt *stbuf, struct iobref *iobref,
+                  dict_t *xdata)
+{
+    call_stub_t *stub = NULL;
+    quiesce_local_t *local = NULL;
+
+    local = frame->local;
+    frame->local = NULL;
+    if ((op_ret == -1) && (op_errno == ENOTCONN)) {
+        /* Re-transmit (by putting in the queue) */
+        stub = fop_readv_stub(frame, default_readv_resume, local->fd,
+                              local->size, local->offset, local->io_flag,
+                              xdata);
+        if (!stub) {
+            STACK_UNWIND_STRICT(readv, frame, -1, ENOMEM, NULL, 0, NULL, NULL,
+                                NULL);
+            goto out;
+        }
+
+        gf_quiesce_enqueue(this, stub);
+        goto out;
+    }
+
+    STACK_UNWIND_STRICT(readv, frame, op_ret, op_errno, vector, count, stbuf,
+                        iobref, xdata);
+out:
+    gf_quiesce_local_wipe(this, local);
+
+    return 0;
+}
+
+int32_t
+quiesce_flush_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+                  int32_t op_ret, int32_t op_errno, dict_t *xdata)
+{
+    call_stub_t *stub = NULL;
+    quiesce_local_t *local = NULL;
+
+    local = frame->local;
+    frame->local = NULL;
+    if ((op_ret == -1) && (op_errno == ENOTCONN)) {
+        /* Re-transmit (by putting in the queue) */
+        stub = fop_flush_stub(frame, default_flush_resume, local->fd, xdata);
+        if (!stub) {
+            STACK_UNWIND_STRICT(flush, frame, -1, ENOMEM, NULL);
+            goto out;
+        }
+
+        gf_quiesce_enqueue(this, stub);
+        goto out;
+    }
+
+    STACK_UNWIND_STRICT(flush, frame, op_ret, op_errno, xdata);
+out:
+    gf_quiesce_local_wipe(this, local);
+
+    return 0;
+}
+
+int32_t
+quiesce_fsync_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+                  int32_t op_ret, int32_t op_errno, struct iatt *prebuf,
+                  struct iatt *postbuf, dict_t *xdata)
+{
+    call_stub_t *stub = NULL;
+    quiesce_local_t *local = NULL;
+
+    local = frame->local;
+    frame->local = NULL;
+    if ((op_ret == -1) && (op_errno == ENOTCONN)) {
+        /* Re-transmit (by putting in the queue) */
+        stub = fop_fsync_stub(frame, default_fsync_resume, local->fd,
+                              local->flag, xdata);
+        if (!stub) {
+            STACK_UNWIND_STRICT(fsync, frame, -1, ENOMEM, NULL, NULL, NULL);
+            goto out;
+        }
+
+        gf_quiesce_enqueue(this, stub);
+        goto out;
+    }
+
+    STACK_UNWIND_STRICT(fsync, frame, op_ret, op_errno, prebuf, postbuf, xdata);
+out:
+    gf_quiesce_local_wipe(this, local);
+
+    return 0;
+}
+
+int32_t
+quiesce_fstat_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+                  int32_t op_ret, int32_t op_errno, struct iatt *buf,
+                  dict_t *xdata)
+{
+    call_stub_t *stub = NULL;
+    quiesce_local_t *local = NULL;
+
+    local = frame->local;
+    frame->local = NULL;
+    if ((op_ret == -1) && (op_errno == ENOTCONN)) {
+        /* Re-transmit (by putting in the queue) */
+        stub = fop_fstat_stub(frame, default_fstat_resume, local->fd, xdata);
+        if (!stub) {
+            STACK_UNWIND_STRICT(fstat, frame, -1, ENOMEM, NULL, NULL);
+            goto out;
+        }
+
+        gf_quiesce_enqueue(this, stub);
+        goto out;
+    }
+
+    STACK_UNWIND_STRICT(fstat, frame, op_ret, op_errno, buf, xdata);
+out:
+    gf_quiesce_local_wipe(this, local);
+
+    return 0;
+}
+
+/* opendir reply handler: on -1/ENOTCONN the call is rebuilt from the saved
+ * loc and fd and queued again; every other result is unwound unchanged. */
+int32_t
+quiesce_opendir_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+                    int32_t op_ret, int32_t op_errno, fd_t *fd, dict_t *xdata)
+{
+    quiesce_local_t *saved = frame->local;
+    call_stub_t *retry = NULL;
+
+    /* The frame gives up the local; it is wiped before returning. */
+    frame->local = NULL;
+
+    if ((op_ret != -1) || (op_errno != ENOTCONN)) {
+        /* Success, or a failure other than ENOTCONN. */
+        STACK_UNWIND_STRICT(opendir, frame, op_ret, op_errno, fd, xdata);
+    } else {
+        retry = fop_opendir_stub(frame, default_opendir_resume, &saved->loc,
+                                 saved->fd, xdata);
+        if (retry)
+            gf_quiesce_enqueue(this, retry);
+        else
+            STACK_UNWIND_STRICT(opendir, frame, -1, ENOMEM, NULL, NULL);
+    }
+
+    gf_quiesce_local_wipe(this, saved);
+
+    return 0;
+}
+
+/* fsyncdir reply handler: on -1/ENOTCONN the call is rebuilt from the saved
+ * fd and flag and queued again; any other result is unwound unchanged. */
+int32_t
+quiesce_fsyncdir_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+                     int32_t op_ret, int32_t op_errno, dict_t *xdata)
+{
+    quiesce_local_t *saved = frame->local;
+    call_stub_t *retry = NULL;
+
+    /* Take the local off the frame; it is wiped before returning. */
+    frame->local = NULL;
+
+    if ((op_ret != -1) || (op_errno != ENOTCONN)) {
+        /* Success, or a failure other than ENOTCONN. */
+        STACK_UNWIND_STRICT(fsyncdir, frame, op_ret, op_errno, xdata);
+    } else {
+        retry = fop_fsyncdir_stub(frame, default_fsyncdir_resume, saved->fd,
+                                  saved->flag, xdata);
+        if (retry)
+            gf_quiesce_enqueue(this, retry);
+        else
+            STACK_UNWIND_STRICT(fsyncdir, frame, -1, ENOMEM, NULL);
+    }
+
+    gf_quiesce_local_wipe(this, saved);
+
+    return 0;
+}
+
+/* statfs reply handler: on -1/ENOTCONN the call is rebuilt from the saved
+ * loc and queued again; any other result is unwound unchanged. */
+int32_t
+quiesce_statfs_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+                   int32_t op_ret, int32_t op_errno, struct statvfs *buf,
+                   dict_t *xdata)
+{
+    quiesce_local_t *saved = frame->local;
+    call_stub_t *retry = NULL;
+
+    /* Detach the local from the frame; it is wiped below on every path. */
+    frame->local = NULL;
+
+    if ((op_ret != -1) || (op_errno != ENOTCONN)) {
+        /* Success, or a failure other than ENOTCONN. */
+        STACK_UNWIND_STRICT(statfs, frame, op_ret, op_errno, buf, xdata);
+    } else {
+        retry = fop_statfs_stub(frame, default_statfs_resume, &saved->loc,
+                                xdata);
+        if (retry)
+            gf_quiesce_enqueue(this, retry);
+        else
+            STACK_UNWIND_STRICT(statfs, frame, -1, ENOMEM, NULL, NULL);
+    }
+
+    gf_quiesce_local_wipe(this, saved);
+
+    return 0;
+}
+
+/* fgetxattr reply handler: on -1/ENOTCONN the call is rebuilt from the saved
+ * fd and name and queued again; any other result is unwound unchanged. */
+int32_t
+quiesce_fgetxattr_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+                      int32_t op_ret, int32_t op_errno, dict_t *dict,
+                      dict_t *xdata)
+{
+    quiesce_local_t *saved = frame->local;
+    call_stub_t *retry = NULL;
+
+    /* The frame gives up the local; it is wiped before returning. */
+    frame->local = NULL;
+
+    if ((op_ret != -1) || (op_errno != ENOTCONN)) {
+        /* Success, or a failure other than ENOTCONN. */
+        STACK_UNWIND_STRICT(fgetxattr, frame, op_ret, op_errno, dict, xdata);
+    } else {
+        retry = fop_fgetxattr_stub(frame, default_fgetxattr_resume, saved->fd,
+                                   saved->name, xdata);
+        if (retry)
+            gf_quiesce_enqueue(this, retry);
+        else
+            STACK_UNWIND_STRICT(fgetxattr, frame, -1, ENOMEM, NULL, NULL);
+    }
+
+    gf_quiesce_local_wipe(this, saved);
+
+    return 0;
+}
+
+/* getxattr reply handler: on -1/ENOTCONN the call is rebuilt from the saved
+ * loc and name and queued again; any other result is unwound unchanged. */
+int32_t
+quiesce_getxattr_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+                     int32_t op_ret, int32_t op_errno, dict_t *dict,
+                     dict_t *xdata)
+{
+    quiesce_local_t *saved = frame->local;
+    call_stub_t *retry = NULL;
+
+    /* Take the local off the frame; it is wiped before returning. */
+    frame->local = NULL;
+
+    if ((op_ret != -1) || (op_errno != ENOTCONN)) {
+        /* Success, or a failure other than ENOTCONN. */
+        STACK_UNWIND_STRICT(getxattr, frame, op_ret, op_errno, dict, xdata);
+    } else {
+        retry = fop_getxattr_stub(frame, default_getxattr_resume, &saved->loc,
+                                  saved->name, xdata);
+        if (retry)
+            gf_quiesce_enqueue(this, retry);
+        else
+            STACK_UNWIND_STRICT(getxattr, frame, -1, ENOMEM, NULL, NULL);
+    }
+
+    gf_quiesce_local_wipe(this, saved);
+
+    return 0;
+}
+
+/* rchecksum reply handler: on -1/ENOTCONN the call is rebuilt from the saved
+ * fd, offset and flag and queued again; other results are unwound. */
+int32_t
+quiesce_rchecksum_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+                      int32_t op_ret, int32_t op_errno, uint32_t weak_checksum,
+                      uint8_t *strong_checksum, dict_t *xdata)
+{
+    quiesce_local_t *saved = frame->local;
+    call_stub_t *retry = NULL;
+
+    /* Detach the local from the frame; it is wiped below on every path. */
+    frame->local = NULL;
+
+    if ((op_ret != -1) || (op_errno != ENOTCONN)) {
+        /* Success, or a failure other than ENOTCONN. */
+        STACK_UNWIND_STRICT(rchecksum, frame, op_ret, op_errno, weak_checksum,
+                            strong_checksum, xdata);
+    } else {
+        retry = fop_rchecksum_stub(frame, default_rchecksum_resume, saved->fd,
+                                   saved->offset, saved->flag, xdata);
+        if (retry)
+            gf_quiesce_enqueue(this, retry);
+        else
+            STACK_UNWIND_STRICT(rchecksum, frame, -1, ENOMEM, 0, NULL, NULL);
+    }
+
+    gf_quiesce_local_wipe(this, saved);
+
+    return 0;
+}
+
+/* readdir reply handler: on -1/ENOTCONN the call is rebuilt from the saved
+ * fd, size and offset and queued again; other results are unwound. */
+int32_t
+quiesce_readdir_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+                    int32_t op_ret, int32_t op_errno, gf_dirent_t *entries,
+                    dict_t *xdata)
+{
+    quiesce_local_t *saved = frame->local;
+    call_stub_t *retry = NULL;
+
+    /* The frame gives up the local; it is wiped before returning. */
+    frame->local = NULL;
+
+    if ((op_ret != -1) || (op_errno != ENOTCONN)) {
+        /* Success, or a failure other than ENOTCONN. */
+        STACK_UNWIND_STRICT(readdir, frame, op_ret, op_errno, entries, xdata);
+    } else {
+        retry = fop_readdir_stub(frame, default_readdir_resume, saved->fd,
+                                 saved->size, saved->offset, xdata);
+        if (retry)
+            gf_quiesce_enqueue(this, retry);
+        else
+            STACK_UNWIND_STRICT(readdir, frame, -1, ENOMEM, NULL, NULL);
+    }
+
+    gf_quiesce_local_wipe(this, saved);
+
+    return 0;
+}
+
+/* readdirp reply handler: on -1/ENOTCONN the call is rebuilt from the saved
+ * fd, size, offset and dict (not the reply's xdata) and queued again. */
+int32_t
+quiesce_readdirp_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+                     int32_t op_ret, int32_t op_errno, gf_dirent_t *entries,
+                     dict_t *xdata)
+{
+    quiesce_local_t *saved = frame->local;
+    call_stub_t *retry = NULL;
+
+    /* Take the local off the frame; it is wiped before returning. */
+    frame->local = NULL;
+
+    if ((op_ret != -1) || (op_errno != ENOTCONN)) {
+        /* Success, or a failure other than ENOTCONN: unwind unchanged. */
+        STACK_UNWIND_STRICT(readdirp, frame, op_ret, op_errno, entries, xdata);
+    } else {
+        retry = fop_readdirp_stub(frame, default_readdirp_resume, saved->fd,
+                                  saved->size, saved->offset, saved->dict);
+        if (retry)
+            gf_quiesce_enqueue(this, retry);
+        else
+            STACK_UNWIND_STRICT(readdirp, frame, -1, ENOMEM, NULL, NULL);
+    }
+
+    gf_quiesce_local_wipe(this, saved);
+
+    return 0;
+}
+
+#if 0
+
+int32_t
+quiesce_writev_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+                    int32_t op_ret, int32_t op_errno, struct iatt *prebuf,
+                    struct iatt *postbuf, dict_t *xdata)
+{
+        quiesce_priv_t *priv = NULL;
+        call_stub_t    *stub = NULL;
+        quiesce_local_t *local = NULL;
+
+        priv = this->private;
+
+        local = frame->local;
+        frame->local = NULL;
+        if ((op_ret == -1) && (op_errno == ENOTCONN)) {
+                /* Re-transmit (by putting in the queue) */
+                stub = fop_writev_stub (frame, default_writev_resume,
+                                        local->fd, local->vector, local->flag,
+                                        local->offset, local->io_flags,
+                                        local->iobref, xdata);
+                if (!stub) {
+                        STACK_UNWIND_STRICT (writev, frame, -1, ENOMEM,
+                                             NULL, NULL, NULL);
+                        goto out;
+                }
+
+                gf_quiesce_enqueue (this, stub);
+                goto out;
+        }
+
+        STACK_UNWIND_STRICT (writev, frame, op_ret, op_errno, prebuf, postbuf, xdata);
+out:
+        gf_quiesce_local_wipe (this, local);
+
+        return 0;
+}
+
+int32_t
+quiesce_xattrop_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+                     int32_t op_ret, int32_t op_errno, dict_t *dict, dict_t *xdata)
+{
+        quiesce_priv_t *priv = NULL;
+        call_stub_t    *stub = NULL;
+        quiesce_local_t *local = NULL;
+
+        priv = this->private;
+
+        local = frame->local;
+        frame->local = NULL;
+        if ((op_ret == -1) && (op_errno == ENOTCONN)) {
+                /* Re-transmit (by putting in the queue) */
+                stub = fop_xattrop_stub (frame, default_xattrop_resume,
+                                         &local->loc, local->xattrop_flags,
+                                         local->dict, xdata);
+                if (!stub) {
+                        STACK_UNWIND_STRICT (xattrop, frame, -1, ENOMEM,
+                                             NULL, NULL);
+                        goto out;
+                }
+
+                gf_quiesce_enqueue (this, stub);
+                goto out;
+        }
+
+        STACK_UNWIND_STRICT (xattrop, frame, op_ret, op_errno, dict, xdata);
+out:
+        gf_quiesce_local_wipe (this, local);
+
+        return 0;
+}
+
+int32_t
+quiesce_fxattrop_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+                      int32_t op_ret, int32_t op_errno, dict_t *dict, dict_t *xdata)
+{
+        quiesce_priv_t *priv = NULL;
+        call_stub_t    *stub = NULL;
+        quiesce_local_t *local = NULL;
+
+        priv = this->private;
+
+        local = frame->local;
+        frame->local = NULL;
+        if ((op_ret == -1) && (op_errno == ENOTCONN)) {
+                /* Re-transmit (by putting in the queue) */
+                stub = fop_fxattrop_stub (frame, default_fxattrop_resume,
+                                          local->fd, local->xattrop_flags,
+                                          local->dict, xdata);
+                if (!stub) {
+                        STACK_UNWIND_STRICT (fxattrop, frame, -1, ENOMEM,
+                                             NULL, NULL);
+                        goto out;
+                }
+
+                gf_quiesce_enqueue (this, stub);
+                goto out;
+        }
+
+        STACK_UNWIND_STRICT (fxattrop, frame, op_ret, op_errno, dict, xdata);
+out:
+        gf_quiesce_local_wipe (this, local);
+
+        return 0;
+}
+
+int32_t
+quiesce_lk_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+                int32_t op_ret, int32_t op_errno, struct gf_flock *lock, dict_t *xdata)
+{
+        quiesce_priv_t *priv = NULL;
+        call_stub_t    *stub = NULL;
+        quiesce_local_t *local = NULL;
+
+        priv = this->private;
+
+        local = frame->local;
+        frame->local = NULL;
+        if ((op_ret == -1) && (op_errno == ENOTCONN)) {
+                /* Re-transmit (by putting in the queue) */
+                stub = fop_lk_stub (frame, default_lk_resume,
+                                    local->fd, local->flag, &local->flock, xdata);
+                if (!stub) {
+                        STACK_UNWIND_STRICT (lk, frame, -1, ENOMEM,
+                                             NULL, NULL);
+                        goto out;
+                }
+
+                gf_quiesce_enqueue (this, stub);
+                goto out;
+        }
+
+        STACK_UNWIND_STRICT (lk, frame, op_ret, op_errno, lock, xdata);
+out:
+        gf_quiesce_local_wipe (this, local);
+
+        return 0;
+}
+
+int32_t
+quiesce_inodelk_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+                     int32_t op_ret, int32_t op_errno, dict_t *xdata)
+{
+        quiesce_priv_t *priv = NULL;
+        call_stub_t    *stub = NULL;
+        quiesce_local_t *local = NULL;
+
+        priv = this->private;
+
+        local = frame->local;
+        frame->local = NULL;
+        if ((op_ret == -1) && (op_errno == ENOTCONN)) {
+                /* Re-transmit (by putting in the queue) */
+                stub = fop_inodelk_stub (frame, default_inodelk_resume,
+                                         local->volname, &local->loc,
+                                         local->flag, &local->flock, xdata);
+                if (!stub) {
+                        STACK_UNWIND_STRICT (inodelk, frame, -1, ENOMEM, NULL);
+                        goto out;
+                }
+
+                gf_quiesce_enqueue (this, stub);
+                goto out;
+        }
+
+        STACK_UNWIND_STRICT (inodelk, frame, op_ret, op_errno, xdata);
+out:
+        gf_quiesce_local_wipe (this, local);
+
+        return 0;
+}
+
+
+int32_t
+quiesce_finodelk_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+                      int32_t op_ret, int32_t op_errno, dict_t *xdata)
+{
+        quiesce_priv_t *priv = NULL;
+        call_stub_t    *stub = NULL;
+        quiesce_local_t *local = NULL;
+
+        priv = this->private;
+
+        local = frame->local;
+        frame->local = NULL;
+        if ((op_ret == -1) && (op_errno == ENOTCONN)) {
+                /* Re-transmit (by putting in the queue) */
+                stub = fop_finodelk_stub (frame, default_finodelk_resume,
+                                         local->volname, local->fd,
+                                         local->flag, &local->flock, xdata);
+                if (!stub) {
+                        STACK_UNWIND_STRICT (finodelk, frame, -1, ENOMEM, NULL);
+                        goto out;
+                }
+
+                gf_quiesce_enqueue (this, stub);
+                goto out;
+        }
+
+        STACK_UNWIND_STRICT (finodelk, frame, op_ret, op_errno, xdata);
+out:
+        gf_quiesce_local_wipe (this, local);
+
+        return 0;
+}
+
+int32_t
+quiesce_entrylk_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+                     int32_t op_ret, int32_t op_errno, dict_t *xdata)
+{
+        quiesce_priv_t *priv = NULL;
+        call_stub_t    *stub = NULL;
+        quiesce_local_t *local = NULL;
+
+        priv = this->private;
+
+        local = frame->local;
+        frame->local = NULL;
+        if ((op_ret == -1) && (op_errno == ENOTCONN)) {
+                /* Re-transmit (by putting in the queue) */
+                stub = fop_entrylk_stub (frame, default_entrylk_resume,
+                                         local->volname, &local->loc,
+                                         local->name, local->cmd, local->type, xdata);
+                if (!stub) {
+                        STACK_UNWIND_STRICT (entrylk, frame, -1, ENOMEM, NULL);
+                        goto out;
+                }
+
+                gf_quiesce_enqueue (this, stub);
+                goto out;
+        }
+
+        STACK_UNWIND_STRICT (entrylk, frame, op_ret, op_errno, xdata);
+out:
+        gf_quiesce_local_wipe (this, local);
+
+        return 0;
+}
+
+int32_t
+quiesce_fentrylk_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+                      int32_t op_ret, int32_t op_errno, dict_t *xdata)
+{
+        quiesce_priv_t *priv = NULL;
+        call_stub_t    *stub = NULL;
+        quiesce_local_t *local = NULL;
+
+        priv = this->private;
+
+        local = frame->local;
+        frame->local = NULL;
+        if ((op_ret == -1) && (op_errno == ENOTCONN)) {
+                /* Re-transmit (by putting in the queue) */
+                stub = fop_fentrylk_stub (frame, default_fentrylk_resume,
+                                          local->volname, local->fd,
+                                          local->name, local->cmd, local->type, xdata);
+                if (!stub) {
+                        STACK_UNWIND_STRICT (fentrylk, frame, -1, ENOMEM, NULL);
+                        goto out;
+                }
+
+                gf_quiesce_enqueue (this, stub);
+                goto out;
+        }
+
+        STACK_UNWIND_STRICT (fentrylk, frame, op_ret, op_errno, xdata);
+out:
+        gf_quiesce_local_wipe (this, local);
+
+        return 0;
+}
+
+int32_t
+quiesce_setattr_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+                     int32_t op_ret, int32_t op_errno, struct iatt *statpre,
+                     struct iatt *statpost, dict_t *xdata)
+{
+        quiesce_priv_t *priv = NULL;
+        call_stub_t    *stub = NULL;
+        quiesce_local_t *local = NULL;
+
+        priv = this->private;
+
+        local = frame->local;
+        frame->local = NULL;
+        if ((op_ret == -1) && (op_errno == ENOTCONN)) {
+                /* Re-transmit (by putting in the queue) */
+                stub = fop_setattr_stub (frame, default_setattr_resume,
+                                         &local->loc, &local->stbuf, local->flag, xdata);
+                if (!stub) {
+                        STACK_UNWIND_STRICT (setattr, frame, -1, ENOMEM,
+                                             NULL, NULL, NULL);
+                        goto out;
+                }
+
+                gf_quiesce_enqueue (this, stub);
+                goto out;
+        }
+
+        STACK_UNWIND_STRICT (setattr, frame, op_ret, op_errno, statpre,
+                             statpost, xdata);
+out:
+        gf_quiesce_local_wipe (this, local);
+
+        return 0;
+}
+
+int32_t
+quiesce_fsetattr_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+                      int32_t op_ret, int32_t op_errno, struct iatt *statpre,
+                      struct iatt *statpost, dict_t *xdata)
+{
+        quiesce_priv_t *priv = NULL;
+        call_stub_t    *stub = NULL;
+        quiesce_local_t *local = NULL;
+
+        priv = this->private;
+
+        local = frame->local;
+        frame->local = NULL;
+
+        if ((op_ret == -1) && (op_errno == ENOTCONN)) {
+                /* Re-transmit (by putting in the queue) */
+                stub = fop_fsetattr_stub (frame, default_fsetattr_resume,
+                                          local->fd, &local->stbuf, local->flag, xdata);
+                if (!stub) {
+                        STACK_UNWIND_STRICT (fsetattr, frame, -1, ENOMEM,
+                                             NULL, NULL, NULL);
+                        goto out;
+                }
+
+                gf_quiesce_enqueue (this, stub);
+                goto out;
+        }
+
+        STACK_UNWIND_STRICT (fsetattr, frame, op_ret, op_errno, statpre,
+                             statpost, xdata);
+out:
+        gf_quiesce_local_wipe (this, local);
+
+        return 0;
+}
+
+#endif /* if 0 */
+
+/* FOP */
+
+/* No retransmission */
+
+/* removexattr fop. Wound to the first child while pass_through is set;
+ * otherwise captured as a stub and handed to the quiesce queue. Replies go
+ * through default_removexattr_cbk, so no local state is kept. */
+int32_t
+quiesce_removexattr(call_frame_t *frame, xlator_t *this, loc_t *loc,
+                    const char *name, dict_t *xdata)
+{
+    quiesce_priv_t *conf = this->private;
+    call_stub_t *held = NULL;
+
+    if (conf->pass_through) {
+        /* pass_through is set: send the call down unchanged. */
+        STACK_WIND(frame, default_removexattr_cbk, FIRST_CHILD(this),
+                   FIRST_CHILD(this)->fops->removexattr, loc, name, xdata);
+    } else {
+        /* Capture the call; unwind with ENOMEM if no stub can be built. */
+        held = fop_removexattr_stub(frame, default_removexattr_resume, loc,
+                                    name, xdata);
+        if (held)
+            gf_quiesce_enqueue(this, held);
+        else
+            STACK_UNWIND_STRICT(removexattr, frame, -1, ENOMEM, NULL);
+    }
+
+    return 0;
+}
+
+/* fremovexattr fop. Wound to the first child while pass_through is set;
+ * otherwise captured as a stub and handed to the quiesce queue. Replies go
+ * through default_fremovexattr_cbk, so no local state is kept. */
+int32_t
+quiesce_fremovexattr(call_frame_t *frame, xlator_t *this, fd_t *fd,
+                     const char *name, dict_t *xdata)
+{
+    quiesce_priv_t *conf = this->private;
+    call_stub_t *held = NULL;
+
+    if (conf->pass_through) {
+        /* pass_through is set: send the call down unchanged. */
+        STACK_WIND(frame, default_fremovexattr_cbk, FIRST_CHILD(this),
+                   FIRST_CHILD(this)->fops->fremovexattr, fd, name, xdata);
+    } else {
+        /* Capture the call; unwind with ENOMEM if no stub can be built. */
+        held = fop_fremovexattr_stub(frame, default_fremovexattr_resume, fd,
+                                     name, xdata);
+        if (held)
+            gf_quiesce_enqueue(this, held);
+        else
+            STACK_UNWIND_STRICT(fremovexattr, frame, -1, ENOMEM, NULL);
+    }
+
+    return 0;
+}
+
+/* truncate fop. Wound to the first child while pass_through is set;
+ * otherwise captured as a stub and handed to the quiesce queue. Replies go
+ * through default_truncate_cbk, so no local state is kept. */
+int32_t
+quiesce_truncate(call_frame_t *frame, xlator_t *this, loc_t *loc, off_t offset,
+                 dict_t *xdata)
+{
+    quiesce_priv_t *conf = this->private;
+    call_stub_t *held = NULL;
+
+    if (conf->pass_through) {
+        /* pass_through is set: send the call down unchanged. */
+        STACK_WIND(frame, default_truncate_cbk, FIRST_CHILD(this),
+                   FIRST_CHILD(this)->fops->truncate, loc, offset, xdata);
+    } else {
+        /* Capture the call; unwind with ENOMEM if no stub can be built. */
+        held = fop_truncate_stub(frame, default_truncate_resume, loc, offset,
+                                 xdata);
+        if (held)
+            gf_quiesce_enqueue(this, held);
+        else
+            STACK_UNWIND_STRICT(truncate, frame, -1, ENOMEM, NULL, NULL, NULL);
+    }
+
+    return 0;
+}
+
+/* fsetxattr fop. Wound to the first child while pass_through is set;
+ * otherwise captured as a stub and handed to the quiesce queue. Replies go
+ * through default_fsetxattr_cbk, so no local state is kept. */
+int32_t
+quiesce_fsetxattr(call_frame_t *frame, xlator_t *this, fd_t *fd, dict_t *dict,
+                  int32_t flags, dict_t *xdata)
+{
+    quiesce_priv_t *conf = this->private;
+    call_stub_t *held = NULL;
+
+    if (conf->pass_through) {
+        /* pass_through is set: send the call down unchanged. */
+        STACK_WIND(frame, default_fsetxattr_cbk, FIRST_CHILD(this),
+                   FIRST_CHILD(this)->fops->fsetxattr, fd, dict, flags, xdata);
+    } else {
+        /* Capture the call; unwind with ENOMEM if no stub can be built. */
+        held = fop_fsetxattr_stub(frame, default_fsetxattr_resume, fd, dict,
+                                  flags, xdata);
+        if (held)
+            gf_quiesce_enqueue(this, held);
+        else
+            STACK_UNWIND_STRICT(fsetxattr, frame, -1, ENOMEM, NULL);
+    }
+
+    return 0;
+}
+
+/* setxattr fop. Wound to the first child while pass_through is set;
+ * otherwise captured as a stub and handed to the quiesce queue. Replies go
+ * through default_setxattr_cbk, so no local state is kept. */
+int32_t
+quiesce_setxattr(call_frame_t *frame, xlator_t *this, loc_t *loc, dict_t *dict,
+                 int32_t flags, dict_t *xdata)
+{
+    quiesce_priv_t *conf = this->private;
+    call_stub_t *held = NULL;
+
+    if (conf->pass_through) {
+        /* pass_through is set: send the call down unchanged. */
+        STACK_WIND(frame, default_setxattr_cbk, FIRST_CHILD(this),
+                   FIRST_CHILD(this)->fops->setxattr, loc, dict, flags, xdata);
+    } else {
+        /* Capture the call; unwind with ENOMEM if no stub can be built. */
+        held = fop_setxattr_stub(frame, default_setxattr_resume, loc, dict,
+                                 flags, xdata);
+        if (held)
+            gf_quiesce_enqueue(this, held);
+        else
+            STACK_UNWIND_STRICT(setxattr, frame, -1, ENOMEM, NULL);
+    }
+
+    return 0;
+}
+
+/* create fop. O_APPEND is masked out of flags on both paths. Wound to the
+ * first child while pass_through is set; otherwise captured as a stub and
+ * handed to the quiesce queue. Replies go through default_create_cbk. */
+int32_t
+quiesce_create(call_frame_t *frame, xlator_t *this, loc_t *loc, int32_t flags,
+               mode_t mode, mode_t umask, fd_t *fd, dict_t *xdata)
+{
+    quiesce_priv_t *conf = this->private;
+    call_stub_t *held = NULL;
+
+    if (conf->pass_through) {
+        /* Don't send O_APPEND below, as write() re-transmissions can
+           fail with O_APPEND */
+        STACK_WIND(frame, default_create_cbk, FIRST_CHILD(this),
+                   FIRST_CHILD(this)->fops->create, loc, (flags & ~O_APPEND),
+                   mode, umask, fd, xdata);
+    } else {
+        /* Capture the call (again without O_APPEND); unwind with ENOMEM
+         * if no stub can be built. */
+        held = fop_create_stub(frame, default_create_resume, loc,
+                               (flags & ~O_APPEND), mode, umask, fd, xdata);
+        if (held)
+            gf_quiesce_enqueue(this, held);
+        else
+            STACK_UNWIND_STRICT(create, frame, -1, ENOMEM, NULL, NULL, NULL,
+                                NULL, NULL, NULL);
+    }
+
+    return 0;
+}
+
+/* link fop. Wound to the first child while pass_through is set; otherwise
+ * captured as a stub and handed to the quiesce queue. Replies go through
+ * default_link_cbk, so no local state is kept. */
+int32_t
+quiesce_link(call_frame_t *frame, xlator_t *this, loc_t *oldloc, loc_t *newloc,
+             dict_t *xdata)
+{
+    quiesce_priv_t *conf = this->private;
+    call_stub_t *held = NULL;
+
+    if (conf->pass_through) {
+        /* pass_through is set: send the call down unchanged. */
+        STACK_WIND(frame, default_link_cbk, FIRST_CHILD(this),
+                   FIRST_CHILD(this)->fops->link, oldloc, newloc, xdata);
+    } else {
+        /* Capture the call; unwind with ENOMEM if no stub can be built. */
+        held = fop_link_stub(frame, default_link_resume, oldloc, newloc, xdata);
+        if (held)
+            gf_quiesce_enqueue(this, held);
+        else
+            STACK_UNWIND_STRICT(link, frame, -1, ENOMEM, NULL, NULL, NULL,
+                                NULL, NULL);
+    }
+
+    return 0;
+}
+
+/* rename fop. Wound to the first child (default_rename_cbk) while
+ * pass_through is set; otherwise captured as a stub and queued. */
+int32_t
+quiesce_rename(call_frame_t *frame, xlator_t *this, loc_t *oldloc,
+               loc_t *newloc, dict_t *xdata)
+{
+    quiesce_priv_t *conf = this->private;
+    call_stub_t *held = NULL;
+
+    if (conf->pass_through) {
+        /* pass_through is set: send the call down unchanged. */
+        STACK_WIND(frame, default_rename_cbk, FIRST_CHILD(this),
+                   FIRST_CHILD(this)->fops->rename, oldloc, newloc, xdata);
+    } else {
+        /* Capture the call; unwind with ENOMEM if no stub can be built. */
+        held = fop_rename_stub(frame, default_rename_resume, oldloc, newloc,
+                               xdata);
+        if (held)
+            gf_quiesce_enqueue(this, held);
+        else
+            STACK_UNWIND_STRICT(rename, frame, -1, ENOMEM, NULL, NULL, NULL,
+                                NULL, NULL, NULL);
+    }
+
+    return 0;
+}
+
+/* symlink fop. Wound to the first child while pass_through is set;
+ * otherwise captured as a stub and handed to the quiesce queue. Replies go
+ * through default_symlink_cbk, so no local state is kept. */
+int
+quiesce_symlink(call_frame_t *frame, xlator_t *this, const char *linkpath,
+                loc_t *loc, mode_t umask, dict_t *xdata)
+{
+    quiesce_priv_t *conf = this->private;
+    call_stub_t *held = NULL;
+
+    if (conf->pass_through) {
+        /* pass_through is set: send the call down unchanged. */
+        STACK_WIND(frame, default_symlink_cbk, FIRST_CHILD(this),
+                   FIRST_CHILD(this)->fops->symlink, linkpath, loc, umask,
+                   xdata);
+    } else {
+        /* Capture the call; unwind with ENOMEM if no stub can be built. */
+        held = fop_symlink_stub(frame, default_symlink_resume, linkpath, loc,
+                                umask, xdata);
+        if (held)
+            gf_quiesce_enqueue(this, held);
+        else
+            STACK_UNWIND_STRICT(symlink, frame, -1, ENOMEM, NULL, NULL, NULL,
+                                NULL, NULL);
+    }
+
+    return 0;
+}
+
+/* rmdir fop. Wound to the first child while pass_through is set; otherwise
+ * captured as a stub and handed to the quiesce queue. Replies go through
+ * default_rmdir_cbk, so no local state is kept. */
+int
+quiesce_rmdir(call_frame_t *frame, xlator_t *this, loc_t *loc, int flags,
+              dict_t *xdata)
+{
+    quiesce_priv_t *conf = this->private;
+    call_stub_t *held = NULL;
+
+    if (conf->pass_through) {
+        /* pass_through is set: send the call down unchanged. */
+        STACK_WIND(frame, default_rmdir_cbk, FIRST_CHILD(this),
+                   FIRST_CHILD(this)->fops->rmdir, loc, flags, xdata);
+    } else {
+        /* Capture the call; unwind with ENOMEM if no stub can be built. */
+        held = fop_rmdir_stub(frame, default_rmdir_resume, loc, flags, xdata);
+        if (held)
+            gf_quiesce_enqueue(this, held);
+        else
+            STACK_UNWIND_STRICT(rmdir, frame, -1, ENOMEM, NULL, NULL, NULL);
+    }
+
+    return 0;
+}
+
+/* unlink fop. Wound to the first child while pass_through is set; otherwise
+ * captured as a stub and handed to the quiesce queue. Replies go through
+ * default_unlink_cbk, so no local state is kept. */
+int32_t
+quiesce_unlink(call_frame_t *frame, xlator_t *this, loc_t *loc, int xflag,
+               dict_t *xdata)
+{
+    quiesce_priv_t *conf = this->private;
+    call_stub_t *held = NULL;
+
+    if (conf->pass_through) {
+        /* pass_through is set: send the call down unchanged. */
+        STACK_WIND(frame, default_unlink_cbk, FIRST_CHILD(this),
+                   FIRST_CHILD(this)->fops->unlink, loc, xflag, xdata);
+    } else {
+        /* Capture the call; unwind with ENOMEM if no stub can be built. */
+        held = fop_unlink_stub(frame, default_unlink_resume, loc, xflag, xdata);
+        if (held)
+            gf_quiesce_enqueue(this, held);
+        else
+            STACK_UNWIND_STRICT(unlink, frame, -1, ENOMEM, NULL, NULL, NULL);
+    }
+
+    return 0;
+}
+
+/* mkdir fop. Wound to the first child (default_mkdir_cbk) while
+ * pass_through is set; otherwise captured as a stub and queued. */
+int
+quiesce_mkdir(call_frame_t *frame, xlator_t *this, loc_t *loc, mode_t mode,
+              mode_t umask, dict_t *xdata)
+{
+    quiesce_priv_t *conf = this->private;
+    call_stub_t *held = NULL;
+
+    if (conf->pass_through) {
+        /* pass_through is set: send the call down unchanged. */
+        STACK_WIND(frame, default_mkdir_cbk, FIRST_CHILD(this),
+                   FIRST_CHILD(this)->fops->mkdir, loc, mode, umask, xdata);
+    } else {
+        /* Capture the call; unwind with ENOMEM if no stub can be built. */
+        held = fop_mkdir_stub(frame, default_mkdir_resume, loc, mode, umask,
+                              xdata);
+        if (held)
+            gf_quiesce_enqueue(this, held);
+        else
+            STACK_UNWIND_STRICT(mkdir, frame, -1, ENOMEM, NULL, NULL, NULL,
+                                NULL, NULL);
+    }
+
+    return 0;
+}
+
+/* mknod fop. Wound to the first child while pass_through is set; otherwise
+ * captured as a stub and handed to the quiesce queue. Replies go through
+ * default_mknod_cbk, so no local state is kept. */
+int
+quiesce_mknod(call_frame_t *frame, xlator_t *this, loc_t *loc, mode_t mode,
+              dev_t rdev, mode_t umask, dict_t *xdata)
+{
+    quiesce_priv_t *conf = this->private;
+    call_stub_t *held = NULL;
+
+    if (conf->pass_through) {
+        /* pass_through is set: send the call down unchanged. */
+        STACK_WIND(frame, default_mknod_cbk, FIRST_CHILD(this),
+                   FIRST_CHILD(this)->fops->mknod, loc, mode, rdev, umask,
+                   xdata);
+    } else {
+        /* Capture the call; unwind with ENOMEM if no stub can be built. */
+        held = fop_mknod_stub(frame, default_mknod_resume, loc, mode, rdev,
+                              umask, xdata);
+        if (held)
+            gf_quiesce_enqueue(this, held);
+        else
+            STACK_UNWIND_STRICT(mknod, frame, -1, ENOMEM, NULL, NULL, NULL,
+                                NULL, NULL);
+    }
+
+    return 0;
+}
+
+/* ftruncate fop. Wound to the first child while pass_through is set;
+ * otherwise captured as a stub and handed to the quiesce queue. Replies go
+ * through default_ftruncate_cbk, so no local state is kept. */
+int32_t
+quiesce_ftruncate(call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset,
+                  dict_t *xdata)
+{
+    quiesce_priv_t *conf = this->private;
+    call_stub_t *held = NULL;
+
+    if (conf->pass_through) {
+        /* pass_through is set: send the call down unchanged. */
+        STACK_WIND(frame, default_ftruncate_cbk, FIRST_CHILD(this),
+                   FIRST_CHILD(this)->fops->ftruncate, fd, offset, xdata);
+    } else {
+        /* Capture the call; unwind with ENOMEM if no stub can be built. */
+        held = fop_ftruncate_stub(frame, default_ftruncate_resume, fd, offset,
+                                  xdata);
+        if (held)
+            gf_quiesce_enqueue(this, held);
+        else
+            STACK_UNWIND_STRICT(ftruncate, frame, -1, ENOMEM, NULL, NULL, NULL);
+    }
+
+    return 0;
+}
+
+/* Re-transmission */
+
+/* readlink fop: pass-through saves loc/size in a local for the callback and
+ * winds; otherwise the call is queued. Unwinds ENOMEM if allocation fails. */
+int32_t
+quiesce_readlink(call_frame_t *frame, xlator_t *this, loc_t *loc, size_t size,
+                 dict_t *xdata)
+{
+    quiesce_priv_t *priv = this->private;
+    call_stub_t *stub = NULL;
+    quiesce_local_t *local = NULL;
+
+    if (priv && priv->pass_through) {
+        local = mem_get0(priv->local_pool);
+        if (!local)
+            goto enomem; /* allocation failed: nothing to dereference */
+        loc_dup(loc, &local->loc);
+        local->size = size;
+        frame->local = local;
+        STACK_WIND(frame, quiesce_readlink_cbk, FIRST_CHILD(this),
+                   FIRST_CHILD(this)->fops->readlink, loc, size, xdata);
+        return 0;
+    }
+
+    stub = fop_readlink_stub(frame, default_readlink_resume, loc, size, xdata);
+    if (!stub)
+        goto enomem;
+    gf_quiesce_enqueue(this, stub);
+    return 0;
+enomem:
+    STACK_UNWIND_STRICT(readlink, frame, -1, ENOMEM, NULL, NULL, NULL);
+    return 0;
+}
+
+/* access fop: pass-through saves loc/mask in a local for the callback and
+ * winds; otherwise the call is queued. Unwinds ENOMEM if allocation fails. */
+int32_t
+quiesce_access(call_frame_t *frame, xlator_t *this, loc_t *loc, int32_t mask,
+               dict_t *xdata)
+{
+    quiesce_priv_t *priv = this->private;
+    call_stub_t *stub = NULL;
+    quiesce_local_t *local = NULL;
+
+    if (priv && priv->pass_through) {
+        local = mem_get0(priv->local_pool);
+        if (!local)
+            goto enomem; /* allocation failed: nothing to dereference */
+        loc_dup(loc, &local->loc);
+        local->flag = mask;
+        frame->local = local;
+        STACK_WIND(frame, quiesce_access_cbk, FIRST_CHILD(this),
+                   FIRST_CHILD(this)->fops->access, loc, mask, xdata);
+        return 0;
+    }
+
+    stub = fop_access_stub(frame, default_access_resume, loc, mask, xdata);
+    if (!stub)
+        goto enomem;
+    gf_quiesce_enqueue(this, stub);
+    return 0;
+enomem:
+    STACK_UNWIND_STRICT(access, frame, -1, ENOMEM, NULL);
+    return 0;
+}
+
+/* fgetxattr fop: pass-through saves fd/name in a local for the callback and
+ * winds; otherwise the call is queued. Unwinds ENOMEM if allocation fails. */
+int32_t
+quiesce_fgetxattr(call_frame_t *frame, xlator_t *this, fd_t *fd,
+                  const char *name, dict_t *xdata)
+{
+    quiesce_priv_t *priv = this->private;
+    call_stub_t *stub = NULL;
+    quiesce_local_t *local = NULL;
+
+    if (priv && priv->pass_through) {
+        local = mem_get0(priv->local_pool);
+        if (!local)
+            goto enomem; /* allocation failed: nothing to dereference */
+        local->fd = fd_ref(fd);
+        /* NOTE(review): gf_strdup() result is unchecked - confirm intended. */
+        if (name)
+            local->name = gf_strdup(name);
+        frame->local = local;
+        STACK_WIND(frame, quiesce_fgetxattr_cbk, FIRST_CHILD(this),
+                   FIRST_CHILD(this)->fops->fgetxattr, fd, name, xdata);
+        return 0;
+    }
+
+    stub = fop_fgetxattr_stub(frame, default_fgetxattr_resume, fd, name, xdata);
+    if (!stub)
+        goto enomem;
+    gf_quiesce_enqueue(this, stub);
+    return 0;
+enomem:
+    STACK_UNWIND_STRICT(fgetxattr, frame, -1, ENOMEM, NULL, NULL);
+    return 0;
+}
+
+/* statfs fop: pass-through saves loc in a local for the callback and winds;
+ * otherwise the call is queued. Unwinds ENOMEM if allocation fails. */
+int32_t
+quiesce_statfs(call_frame_t *frame, xlator_t *this, loc_t *loc, dict_t *xdata)
+{
+    quiesce_priv_t *priv = this->private;
+    call_stub_t *stub = NULL;
+    quiesce_local_t *local = NULL;
+
+    if (priv && priv->pass_through) {
+        local = mem_get0(priv->local_pool);
+        if (!local)
+            goto enomem; /* allocation failed: nothing to dereference */
+        loc_dup(loc, &local->loc);
+        frame->local = local;
+        STACK_WIND(frame, quiesce_statfs_cbk, FIRST_CHILD(this),
+                   FIRST_CHILD(this)->fops->statfs, loc, xdata);
+        return 0;
+    }
+
+    stub = fop_statfs_stub(frame, default_statfs_resume, loc, xdata);
+    if (!stub)
+        goto enomem;
+    gf_quiesce_enqueue(this, stub);
+    return 0;
+enomem:
+    STACK_UNWIND_STRICT(statfs, frame, -1, ENOMEM, NULL, NULL);
+    return 0;
+}
+
+/* fsyncdir: wind to the child while it is up, otherwise queue a stub. */
+int32_t
+quiesce_fsyncdir(call_frame_t *frame, xlator_t *this, fd_t *fd, int32_t flags,
+                 dict_t *xdata)
+{
+    quiesce_priv_t *priv = this->private;
+    call_stub_t *stub = NULL;
+    quiesce_local_t *local = NULL;
+
+    if (priv && priv->pass_through) {
+        local = mem_get0(priv->local_pool);
+        if (!local) /* nothing from the pool: fail the fop, do not crash */
+            goto enomem;
+        local->fd = fd_ref(fd);
+        local->flag = flags;
+        frame->local = local; /* travels with the frame to the callback */
+
+        STACK_WIND(frame, quiesce_fsyncdir_cbk, FIRST_CHILD(this),
+                   FIRST_CHILD(this)->fops->fsyncdir, fd, flags, xdata);
+        return 0;
+    }
+
+    stub = fop_fsyncdir_stub(frame, default_fsyncdir_resume, fd, flags, xdata);
+    if (!stub)
+        goto enomem;
+    gf_quiesce_enqueue(this, stub);
+    return 0;
+enomem: /* the local or the stub could not be allocated */
+    STACK_UNWIND_STRICT(fsyncdir, frame, -1, ENOMEM, NULL);
+    return 0;
+}
+
+/* opendir: wind to the child while it is up, otherwise queue a stub. */
+int32_t
+quiesce_opendir(call_frame_t *frame, xlator_t *this, loc_t *loc, fd_t *fd,
+                dict_t *xdata)
+{
+    quiesce_priv_t *priv = this->private;
+    call_stub_t *stub = NULL;
+    quiesce_local_t *local = NULL;
+
+    if (priv && priv->pass_through) {
+        local = mem_get0(priv->local_pool);
+        if (!local) /* nothing from the pool: fail the fop, do not crash */
+            goto enomem;
+        loc_dup(loc, &local->loc);
+        local->fd = fd_ref(fd);
+        frame->local = local; /* travels with the frame to the callback */
+
+        STACK_WIND(frame, quiesce_opendir_cbk, FIRST_CHILD(this),
+                   FIRST_CHILD(this)->fops->opendir, loc, fd, xdata);
+        return 0;
+    }
+
+    stub = fop_opendir_stub(frame, default_opendir_resume, loc, fd, xdata);
+    if (!stub)
+        goto enomem;
+    gf_quiesce_enqueue(this, stub);
+    return 0;
+enomem: /* the local or the stub could not be allocated */
+    STACK_UNWIND_STRICT(opendir, frame, -1, ENOMEM, NULL, NULL);
+    return 0;
+}
+
+/* fstat: wind to the child while it is up, otherwise queue a stub. */
+int32_t
+quiesce_fstat(call_frame_t *frame, xlator_t *this, fd_t *fd, dict_t *xdata)
+{
+    quiesce_priv_t *priv = this->private;
+    call_stub_t *stub = NULL;
+    quiesce_local_t *local = NULL;
+
+    if (priv && priv->pass_through) {
+        local = mem_get0(priv->local_pool);
+        if (!local) /* nothing from the pool: fail the fop, do not crash */
+            goto enomem;
+        local->fd = fd_ref(fd);
+        frame->local = local; /* travels with the frame to the callback */
+
+        STACK_WIND(frame, quiesce_fstat_cbk, FIRST_CHILD(this),
+                   FIRST_CHILD(this)->fops->fstat, fd, xdata);
+        return 0;
+    }
+
+    stub = fop_fstat_stub(frame, default_fstat_resume, fd, xdata);
+    if (!stub)
+        goto enomem;
+    gf_quiesce_enqueue(this, stub);
+    return 0;
+enomem: /* the local or the stub could not be allocated */
+    STACK_UNWIND_STRICT(fstat, frame, -1, ENOMEM, NULL, NULL);
+    return 0;
+}
+
+/* fsync: wind to the child while it is up, otherwise queue a stub. */
+int32_t
+quiesce_fsync(call_frame_t *frame, xlator_t *this, fd_t *fd, int32_t flags,
+              dict_t *xdata)
+{
+    quiesce_priv_t *priv = this->private;
+    call_stub_t *stub = NULL;
+    quiesce_local_t *local = NULL;
+
+    if (priv && priv->pass_through) {
+        local = mem_get0(priv->local_pool);
+        if (!local) /* nothing from the pool: fail the fop, do not crash */
+            goto enomem;
+        local->fd = fd_ref(fd);
+        local->flag = flags;
+        frame->local = local; /* travels with the frame to the callback */
+
+        STACK_WIND(frame, quiesce_fsync_cbk, FIRST_CHILD(this),
+                   FIRST_CHILD(this)->fops->fsync, fd, flags, xdata);
+        return 0;
+    }
+
+    stub = fop_fsync_stub(frame, default_fsync_resume, fd, flags, xdata);
+    if (!stub)
+        goto enomem;
+    gf_quiesce_enqueue(this, stub);
+    return 0;
+enomem: /* the local or the stub could not be allocated */
+    STACK_UNWIND_STRICT(fsync, frame, -1, ENOMEM, NULL, NULL, NULL);
+    return 0;
+}
+
+/* flush: wind to the child while it is up, otherwise queue a stub. */
+int32_t
+quiesce_flush(call_frame_t *frame, xlator_t *this, fd_t *fd, dict_t *xdata)
+{
+    quiesce_priv_t *priv = this->private;
+    call_stub_t *stub = NULL;
+    quiesce_local_t *local = NULL;
+
+    if (priv && priv->pass_through) {
+        local = mem_get0(priv->local_pool);
+        if (!local) /* nothing from the pool: fail the fop, do not crash */
+            goto enomem;
+        local->fd = fd_ref(fd);
+        frame->local = local; /* travels with the frame to the callback */
+
+        STACK_WIND(frame, quiesce_flush_cbk, FIRST_CHILD(this),
+                   FIRST_CHILD(this)->fops->flush, fd, xdata);
+        return 0;
+    }
+
+    stub = fop_flush_stub(frame, default_flush_resume, fd, xdata);
+    if (!stub)
+        goto enomem;
+    gf_quiesce_enqueue(this, stub);
+    return 0;
+enomem: /* the local or the stub could not be allocated */
+    STACK_UNWIND_STRICT(flush, frame, -1, ENOMEM, NULL);
+    return 0;
+}
+
+/* writev: wound unchanged while the child is up, queued as a stub if not. */
+int32_t
+quiesce_writev(call_frame_t *frame, xlator_t *this, fd_t *fd,
+               struct iovec *vector, int32_t count, off_t off, uint32_t flags,
+               struct iobref *iobref, dict_t *xdata)
+{
+    quiesce_priv_t *priv = this->private;
+    call_stub_t *stub = NULL;
+
+    if (priv && priv->pass_through) {
+        /* no local is kept: the reply goes straight to default_writev_cbk */
+        STACK_WIND(frame, default_writev_cbk, FIRST_CHILD(this),
+                   FIRST_CHILD(this)->fops->writev, fd, vector, count, off,
+                   flags, iobref, xdata);
+        return 0;
+    }
+
+    stub = fop_writev_stub(frame, default_writev_resume, fd, vector, count, off,
+                           flags, iobref, xdata);
+    if (stub) {
+        gf_quiesce_enqueue(this, stub);
+    } else {
+        /* stub allocation failed: report ENOMEM to the caller */
+        STACK_UNWIND_STRICT(writev, frame, -1, ENOMEM, NULL, NULL, NULL);
+    }
+
+    return 0;
+}
+
+/* readv: wind to the child while it is up, otherwise queue a stub. */
+int32_t
+quiesce_readv(call_frame_t *frame, xlator_t *this, fd_t *fd, size_t size,
+              off_t offset, uint32_t flags, dict_t *xdata)
+{
+    quiesce_priv_t *priv = this->private;
+    call_stub_t *stub = NULL;
+    quiesce_local_t *local = NULL;
+
+    if (priv && priv->pass_through) {
+        local = mem_get0(priv->local_pool);
+        if (!local) /* nothing from the pool: fail the fop, do not crash */
+            goto enomem;
+        local->fd = fd_ref(fd);
+        local->size = size;
+        local->offset = offset;
+        local->io_flag = flags;
+        frame->local = local; /* travels with the frame to the callback */
+
+        STACK_WIND(frame, quiesce_readv_cbk, FIRST_CHILD(this),
+                   FIRST_CHILD(this)->fops->readv, fd, size, offset, flags,
+                   xdata);
+        return 0;
+    }
+
+    stub = fop_readv_stub(frame, default_readv_resume, fd, size, offset, flags,
+                          xdata);
+    if (!stub)
+        goto enomem;
+    gf_quiesce_enqueue(this, stub);
+    return 0;
+enomem: /* the local or the stub could not be allocated */
+    STACK_UNWIND_STRICT(readv, frame, -1, ENOMEM, NULL, 0, NULL, NULL,
+                        NULL);
+    return 0;
+}
+
+/* open: wind (minus O_APPEND) while the child is up, else queue a stub. */
+int32_t
+quiesce_open(call_frame_t *frame, xlator_t *this, loc_t *loc, int32_t flags,
+             fd_t *fd, dict_t *xdata)
+{
+    quiesce_priv_t *priv = this->private;
+    call_stub_t *stub = NULL;
+    quiesce_local_t *local = NULL;
+
+    if (priv && priv->pass_through) {
+        local = mem_get0(priv->local_pool);
+        if (!local) /* nothing from the pool: fail the fop, do not crash */
+            goto enomem;
+        loc_dup(loc, &local->loc);
+        local->fd = fd_ref(fd);
+
+        /* Don't send O_APPEND below, as write() re-transmissions can
+           fail with O_APPEND */
+        local->flag = (flags & ~O_APPEND);
+        frame->local = local; /* travels with the frame to the callback */
+
+        STACK_WIND(frame, quiesce_open_cbk, FIRST_CHILD(this),
+                   FIRST_CHILD(this)->fops->open, loc, (flags & ~O_APPEND), fd,
+                   xdata);
+        return 0;
+    }
+
+    stub = fop_open_stub(frame, default_open_resume, loc, (flags & ~O_APPEND),
+                         fd, xdata);
+    if (!stub)
+        goto enomem;
+    gf_quiesce_enqueue(this, stub);
+    return 0;
+enomem: /* the local or the stub could not be allocated */
+    STACK_UNWIND_STRICT(open, frame, -1, ENOMEM, NULL, NULL);
+    return 0;
+}
+
+/* getxattr: wind to the child while it is up, otherwise queue a stub. */
+int32_t
+quiesce_getxattr(call_frame_t *frame, xlator_t *this, loc_t *loc,
+                 const char *name, dict_t *xdata)
+{
+    quiesce_priv_t *priv = this->private;
+    call_stub_t *stub = NULL;
+    quiesce_local_t *local = NULL;
+
+    if (priv && priv->pass_through) {
+        local = mem_get0(priv->local_pool);
+        if (!local) /* nothing from the pool: fail the fop, do not crash */
+            goto enomem;
+        loc_dup(loc, &local->loc);
+        if (name)
+            local->name = gf_strdup(name);
+
+        frame->local = local; /* travels with the frame to the callback */
+
+        STACK_WIND(frame, quiesce_getxattr_cbk, FIRST_CHILD(this),
+                   FIRST_CHILD(this)->fops->getxattr, loc, name, xdata);
+        return 0;
+    }
+
+    stub = fop_getxattr_stub(frame, default_getxattr_resume, loc, name, xdata);
+    if (!stub)
+        goto enomem;
+    gf_quiesce_enqueue(this, stub);
+    return 0;
+enomem: /* the local or the stub could not be allocated */
+    STACK_UNWIND_STRICT(getxattr, frame, -1, ENOMEM, NULL, NULL);
+    return 0;
+}
+
+/* xattrop: wound unchanged while the child is up, queued as a stub if not. */
+int32_t
+quiesce_xattrop(call_frame_t *frame, xlator_t *this, loc_t *loc,
+                gf_xattrop_flags_t flags, dict_t *dict, dict_t *xdata)
+{
+    quiesce_priv_t *priv = this->private;
+    call_stub_t *stub = NULL;
+
+    if (priv && priv->pass_through) {
+        /* no local is kept: the reply goes straight to default_xattrop_cbk */
+        STACK_WIND(frame, default_xattrop_cbk, FIRST_CHILD(this),
+                   FIRST_CHILD(this)->fops->xattrop, loc, flags, dict, xdata);
+        return 0;
+    }
+
+    stub = fop_xattrop_stub(frame, default_xattrop_resume, loc, flags, dict,
+                            xdata);
+    if (stub) {
+        gf_quiesce_enqueue(this, stub);
+    } else {
+        /* stub allocation failed: report ENOMEM to the caller */
+        STACK_UNWIND_STRICT(xattrop, frame, -1, ENOMEM, NULL, NULL);
+    }
+
+    return 0;
+}
+
+/* fxattrop: wound unchanged while the child is up, queued as a stub if not. */
+int32_t
+quiesce_fxattrop(call_frame_t *frame, xlator_t *this, fd_t *fd,
+                 gf_xattrop_flags_t flags, dict_t *dict, dict_t *xdata)
+{
+    quiesce_priv_t *priv = this->private;
+    call_stub_t *stub = NULL;
+
+    if (priv && priv->pass_through) {
+        /* no local is kept: the reply goes straight to default_fxattrop_cbk */
+        STACK_WIND(frame, default_fxattrop_cbk, FIRST_CHILD(this),
+                   FIRST_CHILD(this)->fops->fxattrop, fd, flags, dict, xdata);
+        return 0;
+    }
+
+    stub = fop_fxattrop_stub(frame, default_fxattrop_resume, fd, flags, dict,
+                             xdata);
+    if (stub) {
+        gf_quiesce_enqueue(this, stub);
+    } else {
+        /* stub allocation failed: report ENOMEM to the caller */
+        STACK_UNWIND_STRICT(fxattrop, frame, -1, ENOMEM, NULL, NULL);
+    }
+
+    return 0;
+}
+
+/* lk: wound unchanged while the child is up, queued as a stub if not. */
+int32_t
+quiesce_lk(call_frame_t *frame, xlator_t *this, fd_t *fd, int32_t cmd,
+           struct gf_flock *lock, dict_t *xdata)
+{
+    quiesce_priv_t *priv = this->private;
+    call_stub_t *stub = NULL;
+
+    if (priv && priv->pass_through) {
+        /* no local is kept: the reply goes straight to default_lk_cbk */
+        STACK_WIND(frame, default_lk_cbk, FIRST_CHILD(this),
+                   FIRST_CHILD(this)->fops->lk, fd, cmd, lock, xdata);
+        return 0;
+    }
+
+    stub = fop_lk_stub(frame, default_lk_resume, fd, cmd, lock, xdata);
+    if (stub) {
+        gf_quiesce_enqueue(this, stub);
+    } else {
+        /* stub allocation failed: report ENOMEM to the caller */
+        STACK_UNWIND_STRICT(lk, frame, -1, ENOMEM, NULL, NULL);
+    }
+
+    return 0;
+}
+
+/* inodelk: wound unchanged while the child is up, queued as a stub if not. */
+int32_t
+quiesce_inodelk(call_frame_t *frame, xlator_t *this, const char *volume,
+                loc_t *loc, int32_t cmd, struct gf_flock *lock, dict_t *xdata)
+{
+    quiesce_priv_t *priv = this->private;
+    call_stub_t *stub = NULL;
+
+    if (priv && priv->pass_through) {
+        /* no local is kept: the reply goes straight to default_inodelk_cbk */
+        STACK_WIND(frame, default_inodelk_cbk, FIRST_CHILD(this),
+                   FIRST_CHILD(this)->fops->inodelk, volume, loc, cmd, lock,
+                   xdata);
+        return 0;
+    }
+
+    stub = fop_inodelk_stub(frame, default_inodelk_resume, volume, loc, cmd,
+                            lock, xdata);
+    if (stub) {
+        gf_quiesce_enqueue(this, stub);
+    } else {
+        /* stub allocation failed: report ENOMEM to the caller */
+        STACK_UNWIND_STRICT(inodelk, frame, -1, ENOMEM, NULL);
+    }
+
+    return 0;
+}
+
+/* finodelk: wound unchanged while the child is up, queued as a stub if not. */
+int32_t
+quiesce_finodelk(call_frame_t *frame, xlator_t *this, const char *volume,
+                 fd_t *fd, int32_t cmd, struct gf_flock *lock, dict_t *xdata)
+{
+    quiesce_priv_t *priv = this->private;
+    call_stub_t *stub = NULL;
+
+    if (priv && priv->pass_through) {
+        /* no local is kept: the reply goes straight to default_finodelk_cbk */
+        STACK_WIND(frame, default_finodelk_cbk, FIRST_CHILD(this),
+                   FIRST_CHILD(this)->fops->finodelk, volume, fd, cmd, lock,
+                   xdata);
+        return 0;
+    }
+
+    stub = fop_finodelk_stub(frame, default_finodelk_resume, volume, fd, cmd,
+                             lock, xdata);
+    if (stub) {
+        gf_quiesce_enqueue(this, stub);
+    } else {
+        /* stub allocation failed: report ENOMEM to the caller */
+        STACK_UNWIND_STRICT(finodelk, frame, -1, ENOMEM, NULL);
+    }
+
+    return 0;
+}
+
+/* entrylk: wound unchanged while the child is up, queued as a stub if not. */
+int32_t
+quiesce_entrylk(call_frame_t *frame, xlator_t *this, const char *volume,
+                loc_t *loc, const char *basename, entrylk_cmd cmd,
+                entrylk_type type, dict_t *xdata)
+{
+    quiesce_priv_t *priv = this->private;
+    call_stub_t *stub = NULL;
+
+    if (priv && priv->pass_through) {
+        /* no local is kept: the reply goes straight to default_entrylk_cbk */
+        STACK_WIND(frame, default_entrylk_cbk, FIRST_CHILD(this),
+                   FIRST_CHILD(this)->fops->entrylk, volume, loc, basename, cmd,
+                   type, xdata);
+        return 0;
+    }
+
+    stub = fop_entrylk_stub(frame, default_entrylk_resume, volume, loc,
+                            basename, cmd, type, xdata);
+    if (stub) {
+        gf_quiesce_enqueue(this, stub);
+    } else {
+        /* stub allocation failed: report ENOMEM to the caller */
+        STACK_UNWIND_STRICT(entrylk, frame, -1, ENOMEM, NULL);
+    }
+
+    return 0;
+}
+
+/* fentrylk: wound unchanged while the child is up, queued as a stub if not. */
+int32_t
+quiesce_fentrylk(call_frame_t *frame, xlator_t *this, const char *volume,
+                 fd_t *fd, const char *basename, entrylk_cmd cmd,
+                 entrylk_type type, dict_t *xdata)
+{
+    quiesce_priv_t *priv = this->private;
+    call_stub_t *stub = NULL;
+
+    if (priv && priv->pass_through) {
+        /* no local is kept: the reply goes straight to default_fentrylk_cbk */
+        STACK_WIND(frame, default_fentrylk_cbk, FIRST_CHILD(this),
+                   FIRST_CHILD(this)->fops->fentrylk, volume, fd, basename, cmd,
+                   type, xdata);
+        return 0;
+    }
+
+    stub = fop_fentrylk_stub(frame, default_fentrylk_resume, volume, fd,
+                             basename, cmd, type, xdata);
+    if (stub) {
+        gf_quiesce_enqueue(this, stub);
+    } else {
+        /* stub allocation failed: report ENOMEM to the caller */
+        STACK_UNWIND_STRICT(fentrylk, frame, -1, ENOMEM, NULL);
+    }
+
+    return 0;
+}
+
+/* rchecksum: wind to the child while it is up, otherwise queue a stub. */
+int32_t
+quiesce_rchecksum(call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset,
+                  int32_t len, dict_t *xdata)
+{
+    quiesce_priv_t *priv = this->private;
+    call_stub_t *stub = NULL;
+    quiesce_local_t *local = NULL;
+
+    if (priv && priv->pass_through) {
+        local = mem_get0(priv->local_pool);
+        if (!local) /* nothing from the pool: fail the fop, do not crash */
+            goto enomem;
+        local->fd = fd_ref(fd);
+        local->offset = offset;
+        local->flag = len; /* the length is kept in the 'flag' member */
+        frame->local = local;
+
+        STACK_WIND(frame, quiesce_rchecksum_cbk, FIRST_CHILD(this),
+                   FIRST_CHILD(this)->fops->rchecksum, fd, offset, len, xdata);
+        return 0;
+    }
+
+    stub = fop_rchecksum_stub(frame, default_rchecksum_resume, fd, offset, len,
+                              xdata);
+    if (!stub)
+        goto enomem;
+    gf_quiesce_enqueue(this, stub);
+    return 0;
+enomem: /* the local or the stub could not be allocated */
+    STACK_UNWIND_STRICT(rchecksum, frame, -1, ENOMEM, 0, NULL, NULL);
+    return 0;
+}
+
+/* readdir: wind to the child while it is up, otherwise queue a stub. */
+int32_t
+quiesce_readdir(call_frame_t *frame, xlator_t *this, fd_t *fd, size_t size,
+                off_t off, dict_t *xdata)
+{
+    quiesce_priv_t *priv = this->private;
+    call_stub_t *stub = NULL;
+    quiesce_local_t *local = NULL;
+
+    if (priv && priv->pass_through) {
+        local = mem_get0(priv->local_pool);
+        if (!local) /* nothing from the pool: fail the fop, do not crash */
+            goto enomem;
+        local->fd = fd_ref(fd);
+        local->size = size;
+        local->offset = off;
+        frame->local = local; /* travels with the frame to the callback */
+
+        STACK_WIND(frame, quiesce_readdir_cbk, FIRST_CHILD(this),
+                   FIRST_CHILD(this)->fops->readdir, fd, size, off, xdata);
+        return 0;
+    }
+
+    stub = fop_readdir_stub(frame, default_readdir_resume, fd, size, off,
+                            xdata);
+    if (!stub)
+        goto enomem;
+    gf_quiesce_enqueue(this, stub);
+    return 0;
+enomem: /* the local or the stub could not be allocated */
+    STACK_UNWIND_STRICT(readdir, frame, -1, ENOMEM, NULL, NULL);
+    return 0;
+}
+
+/* readdirp: wind to the child while it is up, otherwise queue a stub. */
+int32_t
+quiesce_readdirp(call_frame_t *frame, xlator_t *this, fd_t *fd, size_t size,
+                 off_t off, dict_t *dict)
+{
+    quiesce_priv_t *priv = this->private;
+    call_stub_t *stub = NULL;
+    quiesce_local_t *local = NULL;
+
+    if (priv && priv->pass_through) {
+        local = mem_get0(priv->local_pool);
+        if (!local) /* nothing from the pool: fail the fop, do not crash */
+            goto enomem;
+        local->fd = fd_ref(fd);
+        local->size = size;
+        local->offset = off;
+        local->dict = dict_ref(dict);
+        frame->local = local; /* travels with the frame to the callback */
+
+        STACK_WIND(frame, quiesce_readdirp_cbk, FIRST_CHILD(this),
+                   FIRST_CHILD(this)->fops->readdirp, fd, size, off, dict);
+        return 0;
+    }
+
+    stub = fop_readdirp_stub(frame, default_readdirp_resume, fd, size, off,
+                             dict);
+    if (!stub)
+        goto enomem;
+    gf_quiesce_enqueue(this, stub);
+    return 0;
+enomem: /* the local or the stub could not be allocated */
+    STACK_UNWIND_STRICT(readdirp, frame, -1, ENOMEM, NULL, NULL);
+    return 0;
+}
+
+/* setattr: wound unchanged while the child is up, queued as a stub if not. */
+int32_t
+quiesce_setattr(call_frame_t *frame, xlator_t *this, loc_t *loc,
+                struct iatt *stbuf, int32_t valid, dict_t *xdata)
+{
+    quiesce_priv_t *priv = this->private;
+    call_stub_t *stub = NULL;
+
+    if (priv && priv->pass_through) {
+        /* no local is kept: the reply goes straight to default_setattr_cbk */
+        STACK_WIND(frame, default_setattr_cbk, FIRST_CHILD(this),
+                   FIRST_CHILD(this)->fops->setattr, loc, stbuf, valid, xdata);
+        return 0;
+    }
+
+    stub = fop_setattr_stub(frame, default_setattr_resume, loc, stbuf, valid,
+                            xdata);
+    if (stub) {
+        gf_quiesce_enqueue(this, stub);
+    } else {
+        /* stub allocation failed: report ENOMEM to the caller */
+        STACK_UNWIND_STRICT(setattr, frame, -1, ENOMEM, NULL, NULL, NULL);
+    }
+
+    return 0;
+}
+
+/* stat: wind to the child while it is up, otherwise queue a stub. */
+int32_t
+quiesce_stat(call_frame_t *frame, xlator_t *this, loc_t *loc, dict_t *xdata)
+{
+    quiesce_priv_t *priv = this->private;
+    call_stub_t *stub = NULL;
+    quiesce_local_t *local = NULL;
+
+    if (priv && priv->pass_through) {
+        local = mem_get0(priv->local_pool);
+        if (!local) /* nothing from the pool: fail the fop, do not crash */
+            goto enomem;
+        loc_dup(loc, &local->loc);
+        frame->local = local; /* travels with the frame to the callback */
+
+        STACK_WIND(frame, quiesce_stat_cbk, FIRST_CHILD(this),
+                   FIRST_CHILD(this)->fops->stat, loc, xdata);
+        return 0;
+    }
+
+    stub = fop_stat_stub(frame, default_stat_resume, loc, xdata);
+    if (!stub)
+        goto enomem;
+    gf_quiesce_enqueue(this, stub);
+    return 0;
+enomem: /* the local or the stub could not be allocated */
+    STACK_UNWIND_STRICT(stat, frame, -1, ENOMEM, NULL, NULL);
+    return 0;
+}
+
+/* lookup: wind to the child while it is up, otherwise queue a stub. */
+int32_t
+quiesce_lookup(call_frame_t *frame, xlator_t *this, loc_t *loc,
+               dict_t *xattr_req)
+{
+    quiesce_priv_t *priv = this->private;
+    call_stub_t *stub = NULL;
+    quiesce_local_t *local = NULL;
+
+    if (priv && priv->pass_through) {
+        local = mem_get0(priv->local_pool);
+        if (!local) /* nothing from the pool: fail the fop, do not crash */
+            goto enomem;
+        loc_dup(loc, &local->loc);
+        local->dict = dict_ref(xattr_req);
+        frame->local = local; /* travels with the frame to the callback */
+
+        STACK_WIND(frame, quiesce_lookup_cbk, FIRST_CHILD(this),
+                   FIRST_CHILD(this)->fops->lookup, loc, xattr_req);
+        return 0;
+    }
+
+    stub = fop_lookup_stub(frame, default_lookup_resume, loc, xattr_req);
+    if (!stub)
+        goto enomem;
+    gf_quiesce_enqueue(this, stub);
+    return 0;
+enomem: /* the local or the stub could not be allocated */
+    STACK_UNWIND_STRICT(lookup, frame, -1, ENOMEM, NULL, NULL, NULL, NULL);
+    return 0;
+}
+
+/* fsetattr: wound unchanged while the child is up, queued as a stub if not. */
+int32_t
+quiesce_fsetattr(call_frame_t *frame, xlator_t *this, fd_t *fd,
+                 struct iatt *stbuf, int32_t valid, dict_t *xdata)
+{
+    quiesce_priv_t *priv = this->private;
+    call_stub_t *stub = NULL;
+
+    if (priv && priv->pass_through) {
+        /* no local is kept: the reply goes straight to default_fsetattr_cbk */
+        STACK_WIND(frame, default_fsetattr_cbk, FIRST_CHILD(this),
+                   FIRST_CHILD(this)->fops->fsetattr, fd, stbuf, valid, xdata);
+        return 0;
+    }
+
+    stub = fop_fsetattr_stub(frame, default_fsetattr_resume, fd, stbuf, valid,
+                             xdata);
+    if (stub) {
+        gf_quiesce_enqueue(this, stub);
+    } else {
+        /* stub allocation failed: report ENOMEM to the caller */
+        STACK_UNWIND_STRICT(fsetattr, frame, -1, ENOMEM, NULL, NULL, NULL);
+    }
+
+    return 0;
+}
+
+/* fallocate: wound unchanged while the child is up, else queued as a stub. */
+int32_t
+quiesce_fallocate(call_frame_t *frame, xlator_t *this, fd_t *fd, int32_t mode,
+                  off_t offset, size_t len, dict_t *xdata)
+{
+    quiesce_priv_t *priv = this->private;
+    call_stub_t *stub = NULL;
+
+    if (priv && priv->pass_through) {
+        /* no local is kept: the reply goes to default_fallocate_cbk */
+        STACK_WIND(frame, default_fallocate_cbk, FIRST_CHILD(this),
+                   FIRST_CHILD(this)->fops->fallocate, fd, mode, offset, len,
+                   xdata);
+        return 0;
+    }
+
+    stub = fop_fallocate_stub(frame, default_fallocate_resume, fd, mode, offset,
+                              len, xdata);
+    if (stub) {
+        gf_quiesce_enqueue(this, stub);
+    } else {
+        /* stub allocation failed: report ENOMEM to the caller */
+        STACK_UNWIND_STRICT(fallocate, frame, -1, ENOMEM, NULL, NULL, NULL);
+    }
+
+    return 0;
+}
+
+/* Reply handler for the seek wound by quiesce_seek(). */
+int
+quiesce_seek_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+                 int32_t op_ret, int32_t op_errno, off_t offset, dict_t *xdata)
+{
+    call_stub_t *stub = NULL;
+    quiesce_local_t *local = frame->local;
+
+    /* detach the saved arguments from the frame; they are wiped below */
+    frame->local = NULL;
+
+    if ((op_ret != -1) || (op_errno != ENOTCONN)) {
+        /* anything other than an ENOTCONN failure is passed up as it came */
+        STACK_UNWIND_STRICT(seek, frame, op_ret, op_errno, offset, xdata);
+    } else {
+        /* Re-transmit (by putting in the queue) */
+        stub = fop_seek_stub(frame, default_seek_resume, local->fd,
+                             local->offset, local->what, xdata);
+        if (stub) {
+            gf_quiesce_enqueue(this, stub);
+        } else {
+            STACK_UNWIND_STRICT(seek, frame, -1, ENOMEM, 0, NULL);
+        }
+    }
+
+    gf_quiesce_local_wipe(this, local);
+    return 0;
+}
+
+/* seek: wind to the child while it is up, otherwise queue a stub. */
+int
+quiesce_seek(call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset,
+             gf_seek_what_t what, dict_t *xdata)
+{
+    quiesce_priv_t *priv = this->private;
+    call_stub_t *stub = NULL;
+    quiesce_local_t *local = NULL;
+
+    if (priv && priv->pass_through) {
+        local = mem_get0(priv->local_pool);
+        if (!local) /* nothing from the pool: fail the fop, do not crash */
+            goto enomem;
+        local->fd = fd_ref(fd);
+        local->offset = offset;
+        local->what = what;
+
+        frame->local = local; /* read back by quiesce_seek_cbk */
+
+        STACK_WIND(frame, quiesce_seek_cbk, FIRST_CHILD(this),
+                   FIRST_CHILD(this)->fops->seek, fd, offset, what, xdata);
+        return 0;
+    }
+
+    stub = fop_seek_stub(frame, default_seek_resume, fd, offset, what, xdata);
+    if (!stub)
+        goto enomem;
+    gf_quiesce_enqueue(this, stub);
+    return 0;
+enomem: /* the local or the stub could not be allocated */
+    STACK_UNWIND_STRICT(seek, frame, -1, ENOMEM, 0, NULL);
+    return 0;
+}
+
+/*
+ * Set up memory accounting for this xlator by calling
+ * xlator_mem_acct_init() with gf_quiesce_mt_end + 1; its result is returned.
+ */
+int32_t
+mem_acct_init(xlator_t *this)
+{
+    return xlator_mem_acct_init(this, gf_quiesce_mt_end + 1);
+}
+
+/* Re-read "timeout" and "failover-hosts"; returns 0 on success, else -1. */
+int
+reconfigure(xlator_t *this, dict_t *options)
+{
+    quiesce_priv_t *priv = this->private;
+    int32_t ret = -1;
+
+    /* both macros are given 'out' as the label to leave through on failure */
+    GF_OPTION_RECONF("timeout", priv->timeout, options, time, out);
+    GF_OPTION_RECONF("failover-hosts", priv->failover_hosts, options, str, out);
+    gf_quiesce_populate_failover_hosts(this, priv, priv->failover_hosts);
+
+    ret = 0;
+out:
+    return ret;
+}
+
+/* Check the graph, then allocate and publish priv; frees it on failure. */
+int
+init(xlator_t *this)
+{
+    int ret = -1;
+    quiesce_priv_t *priv = NULL;
+
+    if (!this->children || this->children->next) {
+        gf_log(this->name, GF_LOG_ERROR,
+               "'quiesce' not configured with exactly one child");
+        goto out;
+    }
+    if (!this->parents)
+        gf_log(this->name, GF_LOG_WARNING, "dangling volume. check volfile ");
+
+    priv = GF_CALLOC(1, sizeof(*priv), gf_quiesce_mt_priv_t);
+    if (!priv)
+        goto out;
+    priv->local_pool = mem_pool_new(quiesce_local_t,
+                                    GF_FOPS_EXPECTED_IN_PARALLEL);
+    if (!priv->local_pool) /* every pass-through fop allocates from it */
+        goto out;
+    GF_OPTION_INIT("timeout", priv->timeout, time, out);
+    GF_OPTION_INIT("failover-hosts", priv->failover_hosts, str, out);
+
+    LOCK_INIT(&priv->lock); /* no checked failure from here on */
+    priv->pass_through = _gf_false;
+    INIT_LIST_HEAD(&priv->req);
+    INIT_LIST_HEAD(&priv->failover_list);
+    /* called last so the helper sees an initialised lock and lists */
+    gf_quiesce_populate_failover_hosts(this, priv, priv->failover_hosts);
+    this->private = priv;
+    ret = 0;
+out:
+    if (ret && priv) { /* never published: undo the allocations above */
+        mem_pool_destroy(priv->local_pool);
+        GF_FREE(priv);
+    }
+    return ret;
+}
+
+/* Release what init() attached to this xlator: pool, lock and priv itself. */
+void
+fini(xlator_t *this)
+{
+    quiesce_priv_t *priv = this->private;
+
+    if (priv) {
+        this->private = NULL;
+
+        mem_pool_destroy(priv->local_pool);
+        priv->local_pool = NULL;
+        LOCK_DESTROY(&priv->lock);
+        /* NOTE(review): nothing here releases priv->failover_list entries,
+         * the timer or the dequeue thread - confirm that is intended */
+        GF_FREE(priv);
+    }
+}
+
+/*
+ * Follow the child's connection state, then pass the event on through
+ * default_notify(). If the xlator has no private state the event is not
+ * forwarded and 0 is returned.
+ */
+int
+notify(xlator_t *this, int event, void *data, ...)
+{
+    quiesce_priv_t *priv = this->private;
+    int ret = 0;
+
+    if (!priv)
+        return 0;
+
+    if (event == GF_EVENT_CHILD_UP) {
+        /* start a thread running gf_quiesce_dequeue_start() */
+        ret = gf_thread_create(&priv->thr, NULL, gf_quiesce_dequeue_start,
+                               this, "quiesce");
+        if (ret) {
+            /* logged only; pass_through is still switched on below */
+            gf_log(this->name, GF_LOG_ERROR,
+                   "failed to create the quiesce-dequeue thread");
+        }
+
+        LOCK(&priv->lock);
+        {
+            priv->pass_through = _gf_true;
+        }
+        UNLOCK(&priv->lock);
+    } else if (event == GF_EVENT_CHILD_DOWN) {
+        LOCK(&priv->lock);
+        {
+            /* fops are queued from here on; the timer is started with it */
+            priv->pass_through = _gf_false;
+            __gf_quiesce_start_timer(this, priv);
+        }
+        UNLOCK(&priv->lock);
+    }
+
+    /* the thread-creation result above is not what gets returned */
+    return default_notify(this, event, data);
+}
+
+struct xlator_fops fops = {
+    /* write/modifying fops */
+    .mknod = quiesce_mknod,
+    .create = quiesce_create,
+    .truncate = quiesce_truncate,
+    .ftruncate = quiesce_ftruncate,
+    .setxattr = quiesce_setxattr,
+    .fsetxattr = quiesce_fsetxattr,
+    .removexattr = quiesce_removexattr,
+    .fremovexattr = quiesce_fremovexattr,
+    .symlink = quiesce_symlink,
+    .unlink = quiesce_unlink,
+    .link = quiesce_link,
+    .mkdir = quiesce_mkdir,
+    .rmdir = quiesce_rmdir,
+    .rename = quiesce_rename,
+    .fallocate = quiesce_fallocate,
+
+    /* The calls below are known to change state, hence
+       re-transmission is not advised */
+    .lk = quiesce_lk,
+    .inodelk = quiesce_inodelk,
+    .finodelk = quiesce_finodelk,
+    .entrylk = quiesce_entrylk,
+    .fentrylk = quiesce_fentrylk,
+    .xattrop = quiesce_xattrop,
+    .fxattrop = quiesce_fxattrop,
+    .setattr = quiesce_setattr,
+    .fsetattr = quiesce_fsetattr,
+
+    /* Special case, re-transmission is not harmful *
+     * as offset is properly sent from above layers */
+    /* TODO: not re-transmitted as of now */
+    .writev = quiesce_writev,
+
+    /* re-transmittable fops */
+    .lookup = quiesce_lookup,
+    .stat = quiesce_stat,
+    .fstat = quiesce_fstat,
+    .access = quiesce_access,
+    .readlink = quiesce_readlink,
+    .getxattr = quiesce_getxattr,
+    .fgetxattr = quiesce_fgetxattr,
+    .open = quiesce_open,
+    .readv = quiesce_readv,
+    .flush = quiesce_flush,
+    .fsync = quiesce_fsync,
+    .statfs = quiesce_statfs,
+    .opendir = quiesce_opendir,
+    .readdir = quiesce_readdir,
+    .readdirp = quiesce_readdirp,
+    .fsyncdir = quiesce_fsyncdir,
+    .seek = quiesce_seek,
+};
+
+struct xlator_dumpops dumpops; /* no initializer: every member starts zeroed */
+
+struct xlator_cbks cbks; /* no initializer: every member starts zeroed */
+
+/* Options of this xlator; init() and reconfigure() read both keys. */
+struct volume_options options[] = {
+    {
+        .key = {"timeout"},
+        .type = GF_OPTION_TYPE_TIME,
+        .default_value = "45",
+        .description =
+            "After 'timeout' seconds since the time 'quiesce' "
+            "option was set to \"!pass-through\", acknowledgements to file "
+            "operations are no longer quiesced and previously "
+            "quiesced acknowledgements are sent to the application",
+        .op_version = {GD_OP_VERSION_4_0_0},
+        .flags = OPT_FLAG_CLIENT_OPT | OPT_FLAG_SETTABLE | OPT_FLAG_DOC,
+    },
+    {.key = {"failover-hosts"},
+     .type = GF_OPTION_TYPE_INTERNET_ADDRESS_LIST,
+     .op_version = {GD_OP_VERSION_4_0_0},
+     .flags = OPT_FLAG_CLIENT_OPT | OPT_FLAG_SETTABLE | OPT_FLAG_DOC,
+     .description = "It is a comma separated list of hostname/IP addresses. "
+                    "It specifies the list of hosts where the gfproxy daemons "
+                    "are running, to which the thin clients can failover."},
+    {.key = {NULL}},
+};
+
+xlator_api_t xlator_api = {
+    .init = init,
+    .fini = fini,
+    .notify = notify, /* tracks GF_EVENT_CHILD_UP / GF_EVENT_CHILD_DOWN */
+    .reconfigure = reconfigure,
+    .mem_acct_init = mem_acct_init,
+    .op_version = {GD_OP_VERSION_3_12_0},
+    .dumpops = &dumpops, /* declared above without an initializer */
+    .fops = &fops,       /* the quiesce_* entry points */
+    .cbks = &cbks,       /* declared above without an initializer */
+    .options = options,  /* "timeout" and "failover-hosts" */
+    .identifier = "quiesce",
+    .category = GF_TECH_PREVIEW,
+};
diff --git a/xlators/features/quiesce/src/quiesce.h b/xlators/features/quiesce/src/quiesce.h
new file mode 100644
index 00000000000..6ab2af40a56
--- /dev/null
+++ b/xlators/features/quiesce/src/quiesce.h
@@ -0,0 +1,65 @@
+/*
+   Copyright (c) 2010-2012 Red Hat, Inc. <http://www.redhat.com>
+   This file is part of GlusterFS.
+
+   This file is licensed to you under your choice of the GNU Lesser
+   General Public License, version 3 or any later version (LGPLv3 or
+   later), or the GNU General Public License, version 2 (GPLv2), in all
+   cases as published by the Free Software Foundation.
+*/
+
+#ifndef __QUIESCE_H__
+#define __QUIESCE_H__
+
+#include "quiesce-mem-types.h"
+#include "quiesce-messages.h"
+#include <glusterfs/xlator.h>
+#include <glusterfs/timer.h>
+
+#define GF_FOPS_EXPECTED_IN_PARALLEL 512
+
+typedef struct {
+    struct list_head list; /* presumably links into failover_list - confirm */
+    char *addr; /* presumably one host from "failover-hosts" - confirm */
+    gf_boolean_t tried; /* indicates attempted connecting */
+} quiesce_failover_hosts_t;
+
+typedef struct {
+    gf_timer_t *timer; /* presumably the CHILD_DOWN timeout timer - confirm */
+    gf_boolean_t pass_through; /* true: fops are wound; false: queued */
+    gf_lock_t lock; /* held while notify() changes pass_through */
+    struct list_head req; /* presumably the queued call stubs - confirm */
+    int queue_size; /* presumably the length of 'req' - confirm */
+    pthread_t thr; /* thread created on GF_EVENT_CHILD_UP */
+    struct mem_pool *local_pool; /* pool of quiesce_local_t */
+    uint32_t timeout; /* value of the "timeout" option */
+    char *failover_hosts; /* value of the "failover-hosts" option */
+    struct list_head failover_list; /* filled from failover_hosts by a helper */
+} quiesce_priv_t;
+
+typedef struct {
+    fd_t *fd; /* reference taken with fd_ref() by the fd-based fops */
+    char *name; /* gf_strdup() copy of the xattr name (getxattr/fgetxattr) */
+    char *volname;
+    loc_t loc; /* filled with loc_dup() by the path-based fops */
+    off_t size; /* requested size (readv, readdir, readdirp) */
+    off_t offset; /* requested offset (readv, readdir, rchecksum, seek) */
+    mode_t mode;
+    int32_t flag; /* open/fsync/fsyncdir flags; rchecksum stores len here */
+    struct iatt stbuf;
+    struct iovec *vector;
+    struct iobref *iobref;
+    dict_t *dict; /* dict_ref() of the lookup/readdirp request dict */
+    struct gf_flock flock;
+    entrylk_cmd cmd;
+    entrylk_type type;
+    gf_xattrop_flags_t xattrop_flags;
+    int32_t wbflags;
+    uint32_t io_flag; /* readv flags */
+    /* for fallocate */
+    size_t len;
+    /* for lseek */
+    gf_seek_what_t what;
+} quiesce_local_t;
+
+#endif
diff --git a/xlators/features/quota/src/Makefile.am b/xlators/features/quota/src/Makefile.am
index 886d839643c..1c2dcef0ca3 100644
--- a/xlators/features/quota/src/Makefile.am
+++ b/xlators/features/quota/src/Makefile.am
@@ -1,13 +1,29 @@
-xlator_LTLIBRARIES = quota.la
+if WITH_SERVER
+xlator_LTLIBRARIES = quota.la quotad.la
+endif
 xlatordir = $(libdir)/glusterfs/$(PACKAGE_VERSION)/xlator/features
 
-quota_la_LDFLAGS = -module -avoidversion
+quota_la_LDFLAGS = -module $(GF_XLATOR_DEFAULT_LDFLAGS)
+quotad_la_LDFLAGS = -module $(GF_XLATOR_DEFAULT_LDFLAGS)
 
-quota_la_SOURCES = quota.c
-quota_la_LIBADD = $(top_builddir)/libglusterfs/src/libglusterfs.la 
+quota_la_SOURCES = quota.c quota-enforcer-client.c
+quota_la_LIBADD = $(top_builddir)/libglusterfs/src/libglusterfs.la \
+                  $(top_builddir)/rpc/xdr/src/libgfxdr.la \
+                  $(top_builddir)/rpc/rpc-lib/src/libgfrpc.la
 
-AM_CFLAGS = -fPIC -D_FILE_OFFSET_BITS=64 -D_GNU_SOURCE -Wall -D$(GF_HOST_OS) \
-	-I$(top_srcdir)/libglusterfs/src -shared -nostartfiles $(GF_CFLAGS)
+quotad_la_SOURCES = quotad.c quotad-helpers.c quotad-aggregator.c
+quotad_la_LIBADD = $(top_builddir)/libglusterfs/src/libglusterfs.la \
+                   $(top_builddir)/rpc/xdr/src/libgfxdr.la \
+                   $(top_builddir)/rpc/rpc-lib/src/libgfrpc.la
 
-CLEANFILES = 
+noinst_HEADERS = quota-mem-types.h quota.h quotad-aggregator.h \
+	quotad-helpers.h quota-messages.h
 
+AM_CPPFLAGS = $(GF_CPPFLAGS) -I$(top_srcdir)/libglusterfs/src \
+	-I$(top_srcdir)/rpc/xdr/src/ -I$(top_builddir)/rpc/xdr/src/ \
+	-I$(top_srcdir)/rpc/rpc-lib/src \
+	-I$(top_srcdir)/xlators/cluster/dht/src
+
+AM_CFLAGS = -Wall $(GF_CFLAGS)
+
+CLEANFILES =
diff --git a/xlators/features/quota/src/quota-enforcer-client.c b/xlators/features/quota/src/quota-enforcer-client.c
new file mode 100644
index 00000000000..480d64ade27
--- /dev/null
+++ b/xlators/features/quota/src/quota-enforcer-client.c
@@ -0,0 +1,503 @@
+/*
+   Copyright (c) 2010-2012 Red Hat, Inc. <http://www.redhat.com>
+   This file is part of GlusterFS.
+
+   This file is licensed to you under your choice of the GNU Lesser
+   General Public License, version 3 or any later version (LGPLv3 or
+   later), or the GNU General Public License, version 2 (GPLv2), in all
+   cases as published by the Free Software Foundation.
+*/
+#include <stdio.h>
+#include <string.h>
+#include <netinet/in.h>
+#include <sys/socket.h>
+#include <sys/types.h>
+#include <sys/resource.h>
+#include <sys/file.h>
+#include <netdb.h>
+#include <signal.h>
+#include <libgen.h>
+
+#include <sys/utsname.h>
+
+#include <stdint.h>
+#include <pthread.h>
+#include <sys/stat.h>
+#include <fcntl.h>
+#include <time.h>
+#include <semaphore.h>
+#include <errno.h>
+
+#ifdef HAVE_MALLOC_H
+#include <malloc.h>
+#endif
+
+#include "quota.h"
+#include "quota-messages.h"
+
+extern struct rpc_clnt_program quota_enforcer_clnt;
+
+int32_t
+quota_validate_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+                   int32_t op_ret, int32_t op_errno, inode_t *inode,
+                   struct iatt *buf, dict_t *xdata, struct iatt *postparent);
+
+/* Encode @req with @xdrproc into a new iobuf and submit it on priv->rpc_clnt.
+ * Returns -1 when the request could not be built (nothing was submitted) and
+ * 0 once rpc_clnt_submit() has been called, whatever its result. */
+int
+quota_enforcer_submit_request(void *req, call_frame_t *frame,
+                              rpc_clnt_prog_t *prog, int procnum,
+                              struct iobref *iobref, xlator_t *this,
+                              fop_cbk_fn_t cbkfn, xdrproc_t xdrproc)
+{
+    int ret = -1;
+    int count = 0;
+    struct iovec iov = {0};
+    struct iobuf *iobuf = NULL;
+    char new_iobref = 0;
+    ssize_t xdr_size = 0;
+    quota_priv_t *priv = NULL;
+
+    GF_ASSERT(this);
+
+    priv = this->private;
+
+    if (req) {
+        xdr_size = xdr_sizeof(xdrproc, req);
+        iobuf = iobuf_get2(this->ctx->iobuf_pool, xdr_size);
+        if (!iobuf)
+            goto out;
+
+        if (!iobref) {
+            iobref = iobref_new();
+            if (!iobref)
+                goto out;
+            new_iobref = 1;
+        }
+
+        /* the iobref has to hold the payload: give up if it cannot */
+        if (iobref_add(iobref, iobuf) != 0)
+            goto out;
+
+        iov.iov_base = iobuf->ptr;
+        iov.iov_len = iobuf_size(iobuf);
+
+        /* Create the xdr payload */
+        ret = xdr_serialize_generic(iov, req, xdrproc);
+        if (ret == -1)
+            goto out;
+        iov.iov_len = ret;
+        count = 1;
+    }
+
+    /* Send the msg.  The result is dropped on purpose; NOTE(review): the rpc
+     * layer presumably reports submit failures through cbkfn - confirm. */
+    (void)rpc_clnt_submit(priv->rpc_clnt, prog, procnum, cbkfn, &iov, count,
+                          NULL, 0, iobref, frame, NULL, 0, NULL, 0, NULL);
+    ret = 0;
+
+out:
+    if (new_iobref)
+        iobref_unref(iobref);
+    if (iobuf)
+        iobuf_unref(iobuf);
+
+    return ret;
+}
+
+/*
+ * rpc reply handler for the lookup sent by _quota_enforcer_lookup().
+ *
+ * Decodes the gfs3 lookup response and reports it to local->validate_cbk.
+ * While priv->quotad_conn_status is 0 an ENOTCONN failure is instead retried
+ * from a 5 second timer; the status is set once local->quotad_conn_retry
+ * reaches 12 and cleared by any outcome other than ENOTCONN.
+ */
+int
+quota_enforcer_lookup_cbk(struct rpc_req *req, struct iovec *iov, int count,
+                          void *myframe)
+{
+    quota_local_t *local = NULL;
+    call_frame_t *frame = NULL;
+    int ret = 0;
+    gfs3_lookup_rsp rsp = {0};
+    struct iatt stbuf = {0};
+    struct iatt postparent = {0};
+    int op_errno = EINVAL;
+    dict_t *xdata = NULL;
+    inode_t *inode = NULL;
+    xlator_t *this = NULL;
+    quota_priv_t *priv = NULL;
+    struct timespec retry_delay = {0};
+    gf_timer_t *timer = NULL;
+
+    this = THIS;
+
+    frame = myframe;
+    local = frame->local;
+    inode = local->validate_loc.inode;
+    priv = this->private;
+
+    if (-1 == req->rpc_status) {
+        rsp.op_ret = -1;
+        op_errno = ENOTCONN;
+        goto out;
+    }
+
+    ret = xdr_to_generic(*iov, &rsp, (xdrproc_t)xdr_gfs3_lookup_rsp);
+    if (ret < 0) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, Q_MSG_XDR_DECODING_FAILED,
+               "XDR decoding failed");
+        rsp.op_ret = -1;
+        op_errno = EINVAL;
+        goto out;
+    }
+
+    op_errno = gf_error_to_errno(rsp.op_errno);
+    gf_stat_to_iatt(&rsp.postparent, &postparent);
+
+    if (rsp.op_ret == -1)
+        goto out;
+
+    rsp.op_ret = -1;
+    gf_stat_to_iatt(&rsp.stat, &stbuf);
+
+    GF_PROTOCOL_DICT_UNSERIALIZE(frame->this, xdata, (rsp.xdata.xdata_val),
+                                 (rsp.xdata.xdata_len), rsp.op_ret, op_errno,
+                                 out);
+
+    if ((!gf_uuid_is_null(inode->gfid)) &&
+        (gf_uuid_compare(stbuf.ia_gfid, inode->gfid) != 0)) {
+        gf_msg_debug(frame->this->name, ESTALE, "gfid changed for %s",
+                     local->validate_loc.path);
+        rsp.op_ret = -1;
+        op_errno = ESTALE;
+        goto out;
+    }
+
+    rsp.op_ret = 0;
+
+out:
+    rsp.op_errno = op_errno;
+
+    /* We need to retry connecting to quotad on ENOTCONN error.
+     * Suppose if there are two volumes vol1 and vol2,
+     * and quota is enabled and limit is set on vol1.
+     * Now if IO is happening on vol1 and quota is enabled/disabled
+     * on vol2, quotad gets restarted and client will receive
+     * ENOTCONN in the IO path of vol1
+     */
+    if (rsp.op_ret == -1 && rsp.op_errno == ENOTCONN) {
+        if (local->quotad_conn_retry >= 12) {
+            priv->quotad_conn_status = 1;
+            gf_log(this->name, GF_LOG_WARNING,
+                   "failed to connect "
+                   "to quotad after retry count %d",
+                   local->quotad_conn_retry);
+        } else {
+            local->quotad_conn_retry++;
+        }
+
+        if (priv->quotad_conn_status == 0) {
+            /* retry connecting after 5secs for 12 retries
+             * (up to 60sec).
+             */
+            gf_log(this->name, GF_LOG_DEBUG,
+                   "retry connecting to "
+                   "quotad (retry count %d)",
+                   local->quotad_conn_retry);
+
+            retry_delay.tv_sec = 5;
+            retry_delay.tv_nsec = 0;
+            timer = gf_timer_call_after(this->ctx, retry_delay,
+                                        _quota_enforcer_lookup, (void *)frame);
+            if (timer == NULL) {
+                gf_log(this->name, GF_LOG_WARNING,
+                       "failed to "
+                       "set quota_enforcer_lookup with timer");
+            } else {
+                goto clean;
+            }
+        }
+    } else {
+        priv->quotad_conn_status = 0;
+    }
+
+    if (rsp.op_ret == -1) {
+        /* any error other than ENOENT */
+        if (rsp.op_errno != ENOENT)
+            gf_msg(
+                this->name, GF_LOG_WARNING, rsp.op_errno, Q_MSG_LOOKUP_FAILED,
+                "Getting cluster-wide size of directory failed "
+                "(path: %s gfid:%s)",
+                local->validate_loc.path, loc_gfid_utoa(&local->validate_loc));
+        else
+            gf_msg_trace(this->name, ENOENT, "not found on remote node");
+
+    } else if (local->quotad_conn_retry) {
+        gf_log(this->name, GF_LOG_DEBUG,
+               "connected to quotad after "
+               "retry count %d",
+               local->quotad_conn_retry);
+    }
+
+    local->validate_cbk(frame, NULL, this, rsp.op_ret, rsp.op_errno, inode,
+                        &stbuf, xdata, &postparent);
+
+clean:
+    if (xdata)
+        dict_unref(xdata);
+
+    free(rsp.xdata.xdata_val);
+
+    return 0;
+}
+
+/*
+ * Build a gfs3 lookup request for local->validate_loc and send it to quotad.
+ *
+ * @data is the call frame; the void * signature lets the function double as
+ * the timer callback that quota_enforcer_lookup_cbk() arms to retry after
+ * ENOTCONN.  If the request cannot be sent, local->validate_cbk is invoked
+ * with op_ret -1 so that the frame is always answered.
+ */
+void
+_quota_enforcer_lookup(void *data)
+{
+    call_frame_t *frame = data;
+    quota_local_t *local = frame->local;
+    xlator_t *this = local->this;
+    quota_priv_t *priv = this->private;
+    loc_t *loc = &local->validate_loc;
+    gfs3_lookup_req req = {
+        {
+            0,
+        },
+    };
+    int ret = 0;
+    int op_errno = ESTALE;
+    char *dir_path = NULL;
+
+    if (!(loc && loc->inode))
+        goto unwind;
+
+    /* prefer the gfid of the linked inode, fall back to the one in loc */
+    if (!gf_uuid_is_null(loc->inode->gfid))
+        memcpy(req.gfid, loc->inode->gfid, 16);
+    else
+        memcpy(req.gfid, loc->gfid, 16);
+
+    if (local->validate_xdata) {
+        GF_PROTOCOL_DICT_SERIALIZE(this, local->validate_xdata,
+                                   (&req.xdata.xdata_val), req.xdata.xdata_len,
+                                   op_errno, unwind);
+    }
+
+    req.bname = loc->name ? (char *)loc->name : "";
+    dir_path = loc->path ? (char *)loc->path : "";
+
+    ret = quota_enforcer_submit_request(
+        &req, frame, priv->quota_enforcer, GF_AGGREGATOR_LOOKUP, NULL, this,
+        quota_enforcer_lookup_cbk, (xdrproc_t)xdr_gfs3_lookup_req);
+
+    if (ret) {
+        /* req.gfid holds 16 raw bytes, not a C string: log it via the loc */
+        gf_msg(this->name, GF_LOG_WARNING, 0, Q_MSG_RPC_SUBMIT_FAILED,
+               "Couldn't send the request to "
+               "fetch cluster wide size of directory (path:%s gfid:%s)",
+               dir_path, loc_gfid_utoa(loc));
+
+        /* nothing was submitted, so no reply will ever answer this frame */
+        op_errno = ENOMEM;
+        goto unwind;
+    }
+
+    GF_FREE(req.xdata.xdata_val);
+
+    return;
+
+unwind:
+    local->validate_cbk(frame, NULL, this, -1, op_errno, NULL, NULL, NULL,
+                        NULL);
+
+    GF_FREE(req.xdata.xdata_val);
+
+    return;
+}
+
+/* Stash the validation context in frame->local and query quotad.  Always
+ * returns 0: the outcome is delivered through @validate_cbk. */
+int
+quota_enforcer_lookup(call_frame_t *frame, xlator_t *this, dict_t *xdata,
+                      fop_lookup_cbk_t validate_cbk)
+{
+    quota_local_t *ctx = NULL;
+
+    if (frame == NULL || this == NULL) {
+        /* no usable frame->local: report the failure right away */
+        validate_cbk(frame, NULL, this, -1, ESTALE, NULL, NULL, NULL, NULL);
+        return 0;
+    }
+
+    ctx = frame->local;
+    ctx->this = this;
+    ctx->validate_cbk = validate_cbk;
+    ctx->validate_xdata = dict_ref(xdata);
+
+    _quota_enforcer_lookup(frame);
+
+    return 0;
+}
+
+/*
+ * Notification hook registered on the quotad rpc client; @mydata is the
+ * quota xlator.  Connect and disconnect events are mirrored into
+ * priv->conn_status under priv->conn_mutex, every other event is only
+ * traced.  Always returns 0.
+ */
+int
+quota_enforcer_notify(struct rpc_clnt *rpc, void *mydata,
+                      rpc_clnt_event_t event, void *data)
+{
+    xlator_t *this = mydata;
+    quota_priv_t *priv = this->private;
+
+    if (event == RPC_CLNT_CONNECT) {
+        pthread_mutex_lock(&priv->conn_mutex);
+        {
+            priv->conn_status = _gf_true;
+        }
+        pthread_mutex_unlock(&priv->conn_mutex);
+
+        gf_msg_trace(this->name, 0, "got RPC_CLNT_CONNECT");
+    } else if (event == RPC_CLNT_DISCONNECT) {
+        pthread_mutex_lock(&priv->conn_mutex);
+        {
+            priv->conn_status = _gf_false;
+
+            /* NOTE(review): presumably wakes a thread waiting for the
+             * connection to drop; the waiter is not part of this file */
+            pthread_cond_signal(&priv->conn_cond);
+        }
+        pthread_mutex_unlock(&priv->conn_mutex);
+
+        gf_msg_trace(this->name, 0, "got RPC_CLNT_DISCONNECT");
+    } else {
+        /* nothing to track for the remaining events */
+        gf_msg_trace(this->name, 0, "got some other RPC event %d", event);
+    }
+
+    return 0;
+}
+
+/*
+ * Start @rpc while its transport is reconfigured with non-blocking-io=no,
+ * then switch the transport back to non-blocking-io=yes.
+ * Returns 0 on success, non-zero if the option dict could not be prepared.
+ */
+int
+quota_enforcer_blocking_connect(rpc_clnt_t *rpc)
+{
+    int ret = -1;
+    dict_t *opts = dict_new();
+
+    if (opts == NULL)
+        return ret;
+
+    ret = dict_set_sizen_str_sizen(opts, "non-blocking-io", "no");
+    if (ret != 0)
+        goto unref;
+    rpc->conn.trans->reconfigure(rpc->conn.trans, opts);
+
+    rpc_clnt_start(rpc);
+
+    ret = dict_set_sizen_str_sizen(opts, "non-blocking-io", "yes");
+    if (ret != 0)
+        goto unref;
+    rpc->conn.trans->reconfigure(rpc->conn.trans, opts);
+
+unref:
+    dict_unref(opts);
+
+    return ret;
+}
+
+/*
+ * Return the rpc client used to talk to quotad.
+ *
+ * An existing priv->rpc_clnt is returned as is.  Otherwise @options is
+ * filled with the unix-socket transport settings, a new client is created,
+ * quota_enforcer_notify() is registered on it and it is started through
+ * quota_enforcer_blocking_connect().  Returns NULL on any failure.
+ */
+struct rpc_clnt *
+quota_enforcer_init(xlator_t *this, dict_t *options)
+{
+    quota_priv_t *priv = this->private;
+    struct rpc_clnt *clnt = NULL;
+    int ret = -1;
+
+    /* reuse the client if quota_priv already has one */
+    LOCK(&priv->lock);
+    {
+        clnt = priv->rpc_clnt;
+    }
+    UNLOCK(&priv->lock);
+
+    if (clnt != NULL)
+        return clnt;
+
+    priv->quota_enforcer = &quota_enforcer_clnt;
+
+    /* the connection goes over a unix domain socket */
+    ret = dict_set_sizen_str_sizen(options, "transport.address-family", "unix");
+    if (ret != 0)
+        goto fail;
+
+    ret = dict_set_sizen_str_sizen(options, "transport-type", "socket");
+    if (ret != 0)
+        goto fail;
+
+    ret = dict_set_sizen_str_sizen(options, "transport.socket.connect-path",
+                                   "/var/run/gluster/quotad.socket");
+    if (ret != 0)
+        goto fail;
+
+    clnt = rpc_clnt_new(options, this, this->name, 16);
+    if (clnt == NULL)
+        goto fail;
+
+    ret = rpc_clnt_register_notify(clnt, quota_enforcer_notify, this);
+    if (ret != 0) {
+        gf_msg("quota", GF_LOG_ERROR, 0, Q_MSG_RPCCLNT_REGISTER_NOTIFY_FAILED,
+               "failed to register notify");
+        goto fail;
+    }
+
+    ret = quota_enforcer_blocking_connect(clnt);
+    if (ret != 0)
+        goto fail;
+
+    return clnt;
+
+fail:
+    /* the half-initialised client, if any, is dropped */
+    if (clnt != NULL)
+        rpc_clnt_unref(clnt);
+
+    return NULL;
+}
+
+struct rpc_clnt_procedure quota_enforcer_actors[GF_AGGREGATOR_MAXVALUE] = {
+    [GF_AGGREGATOR_NULL] = {"NULL", NULL},
+    [GF_AGGREGATOR_LOOKUP] = {"LOOKUP", NULL},
+};
+
+struct rpc_clnt_program quota_enforcer_clnt = {
+    .progname = "Quota enforcer",
+    .prognum = GLUSTER_AGGREGATOR_PROGRAM,
+    .progver = GLUSTER_AGGREGATOR_VERSION,
+    .numproc = GF_AGGREGATOR_MAXVALUE,
+    .proctable = quota_enforcer_actors,
+};
diff --git a/xlators/features/quota/src/quota-mem-types.h b/xlators/features/quota/src/quota-mem-types.h
new file mode 100644
index 00000000000..782a7de96bb
--- /dev/null
+++ b/xlators/features/quota/src/quota-mem-types.h
@@ -0,0 +1,30 @@
+/*
+   Copyright (c) 2008-2012 Red Hat, Inc. <http://www.redhat.com>
+   This file is part of GlusterFS.
+
+   This file is licensed to you under your choice of the GNU Lesser
+   General Public License, version 3 or any later version (LGPLv3 or
+   later), or the GNU General Public License, version 2 (GPLv2), in all
+   cases as published by the Free Software Foundation.
+*/
+#ifndef __QUOTA_MEM_TYPES_H__
+#define __QUOTA_MEM_TYPES_H__
+
+#include <glusterfs/mem-types.h>
+
+enum gf_quota_mem_types_ {
+    /* Those are used by QUOTA_ALLOC_OR_GOTO macro */
+    gf_quota_mt_quota_priv_t = gf_common_mt_end + 1,
+    gf_quota_mt_quota_inode_ctx_t,
+    gf_quota_mt_loc_t,
+    gf_quota_mt_char,
+    gf_quota_mt_int64_t,
+    gf_quota_mt_int32_t,
+    gf_quota_mt_limits_t,
+    gf_quota_mt_quota_dentry_t,
+    gf_quota_mt_quota_limits_level_t,
+    gf_quota_mt_qd_vols_conf_t,
+    gf_quota_mt_aggregator_state_t,
+    gf_quota_mt_end
+};
+#endif
diff --git a/xlators/features/quota/src/quota-messages.h b/xlators/features/quota/src/quota-messages.h
new file mode 100644
index 00000000000..d434ed75e76
--- /dev/null
+++ b/xlators/features/quota/src/quota-messages.h
@@ -0,0 +1,39 @@
+/*
+  Copyright (c) 2013 Red Hat, Inc. <http://www.redhat.com>
+  This file is part of GlusterFS.
+
+  This file is licensed to you under your choice of the GNU Lesser
+  General Public License, version 3 or any later version (LGPLv3 or
+  later), or the GNU General Public License, version 2 (GPLv2), in all
+  cases as published by the Free Software Foundation.
+*/
+
+#ifndef _QUOTA_MESSAGES_H_
+#define _QUOTA_MESSAGES_H_
+
+#include <glusterfs/glfs-message-id.h>
+
+/* To add new message IDs, append new identifiers at the end of the list.
+ *
+ * Never remove a message ID. If it's not used anymore, you can rename it or
+ * leave it as it is, but not delete it. This is to prevent reutilization of
+ * IDs by other messages.
+ *
+ * The component name must match one of the entries defined in
+ * glfs-message-id.h.
+ */
+
+GLFS_MSGID(QUOTA, Q_MSG_ENFORCEMENT_FAILED, Q_MSG_ENOMEM, Q_MSG_PARENT_NULL,
+           Q_MSG_CROSSED_SOFT_LIMIT, Q_MSG_QUOTA_ENFORCER_RPC_INIT_FAILED,
+           Q_MSG_REMOTE_OPERATION_FAILED, Q_MSG_FAILED_TO_SEND_FOP,
+           Q_MSG_INVALID_VOLFILE, Q_MSG_INODE_PARENT_NOT_FOUND,
+           Q_MSG_XDR_DECODE_ERROR, Q_MSG_DICT_UNSERIALIZE_FAIL,
+           Q_MSG_DICT_SERIALIZE_FAIL, Q_MSG_RPCSVC_INIT_FAILED,
+           Q_MSG_RPCSVC_LISTENER_CREATION_FAILED, Q_MSG_RPCSVC_REGISTER_FAILED,
+           Q_MSG_XDR_DECODING_FAILED, Q_MSG_RPCCLNT_REGISTER_NOTIFY_FAILED,
+           Q_MSG_ANCESTRY_BUILD_FAILED, Q_MSG_SIZE_KEY_MISSING,
+           Q_MSG_INODE_CTX_GET_FAILED, Q_MSG_INODE_CTX_SET_FAILED,
+           Q_MSG_LOOKUP_FAILED, Q_MSG_RPC_SUBMIT_FAILED,
+           Q_MSG_ENFORCEMENT_SKIPPED, Q_MSG_INTERNAL_FOP_KEY_MISSING);
+
+#endif /* !_QUOTA_MESSAGES_H_ */
diff --git a/xlators/features/quota/src/quota.c b/xlators/features/quota/src/quota.c
index a65050e9924..18df9ae6d19 100644
--- a/xlators/features/quota/src/quota.c
+++ b/xlators/features/quota/src/quota.c
@@ -1,1056 +1,5336 @@
 /*
-  Copyright (c) 2008-2009 Z RESEARCH, Inc. <http://www.zresearch.com>
-  This file is part of GlusterFS.
-
-  GlusterFS is free software; you can redistribute it and/or modify
-  it under the terms of the GNU General Public License as published
-  by the Free Software Foundation; either version 3 of the License,
-  or (at your option) any later version.
-
-  GlusterFS is distributed in the hope that it will be useful, but
-  WITHOUT ANY WARRANTY; without even the implied warranty of
-  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
-  General Public License for more details.
-
-  You should have received a copy of the GNU General Public License
-  along with this program.  If not, see
-  <http://www.gnu.org/licenses/>.
+   Copyright (c) 2008-2012 Red Hat, Inc. <http://www.redhat.com>
+   This file is part of GlusterFS.
+
+   This file is licensed to you under your choice of the GNU Lesser
+   General Public License, version 3 or any later version (LGPLv3 or
+   later), or the GNU General Public License, version 2 (GPLv2), in all
+   cases as published by the Free Software Foundation.
 */
 
-#ifndef _CONFIG_H
-#define _CONFIG_H
-#include "config.h"
-#endif
-
-#include <sys/time.h>
-
-#include "xlator.h"
-#include "defaults.h"
-#include "common-utils.h"
-
-struct quota_local {
-	struct stat    stbuf;
-	inode_t       *inode;
-	char          *path;
-	fd_t          *fd;
-	off_t          offset;
-	int32_t        count;
-	struct iovec  *vector;
-	dict_t        *refs;
-	loc_t          loc;
-};
+#include "quota.h"
+#include <glusterfs/statedump.h>
+#include "quota-messages.h"
+#include <glusterfs/events.h>
+
+struct volume_options options[];
 
+static int32_t
+__quota_init_inode_ctx(inode_t *inode, xlator_t *this,
+                       quota_inode_ctx_t **context)
+{
+    int32_t ret = -1;
+    quota_inode_ctx_t *ctx = NULL;
 
-struct quota_priv {
-	char       only_first_time;          /* Used to make sure a call is done only one time */
-	gf_lock_t  lock;                     /* Used while updating variables */
+    if (inode == NULL) {
+        goto out;
+    }
 
-	uint64_t   disk_usage_limit;         /* Used for Disk usage quota */
-	uint64_t   current_disk_usage;       /* Keep the current usage value */
+    QUOTA_ALLOC_OR_GOTO(ctx, quota_inode_ctx_t, out);
 
-	uint32_t   min_free_disk_limit;        /* user specified limit, in %*/
-	uint32_t   current_free_disk;          /* current free disk space available, in % */
-	uint32_t   refresh_interval;           /* interval in seconds */
-	uint32_t   min_disk_last_updated_time; /* used for interval calculation */	
-};
+    LOCK_INIT(&ctx->lock);
+
+    if (context != NULL) {
+        *context = ctx;
+    }
+
+    INIT_LIST_HEAD(&ctx->parents);
+
+    ret = __inode_ctx_put(inode, this, (uint64_t)(long)ctx);
+    if (ret) {
+        gf_msg(this->name, GF_LOG_WARNING, 0, Q_MSG_INODE_CTX_SET_FAILED,
+               "cannot set quota context "
+               "in inode (gfid:%s)",
+               uuid_utoa(inode->gfid));
+        GF_FREE(ctx);
+    }
+out:
+    return ret;
+}
+
+static int32_t
+quota_inode_ctx_get(inode_t *inode, xlator_t *this, quota_inode_ctx_t **ctx,
+                    char create_if_absent)
+{
+    int32_t ret = 0;
+    uint64_t ctx_int;
 
+    LOCK(&inode->lock);
+    {
+        ret = __inode_ctx_get(inode, this, &ctx_int);
+
+        if ((ret == 0) && (ctx != NULL)) {
+            *ctx = (quota_inode_ctx_t *)(unsigned long)ctx_int;
+        } else if (create_if_absent) {
+            ret = __quota_init_inode_ctx(inode, this, ctx);
+        }
+    }
+    UNLOCK(&inode->lock);
+
+    return ret;
+}
 
 int
-quota_statvfs_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
-		   int32_t op_ret, int32_t op_errno, struct statvfs *stbuf)
+quota_loc_fill(loc_t *loc, inode_t *inode, inode_t *parent, char *path)
 {
-	struct quota_priv *priv = this->private;
-	
-	if (op_ret >= 0) {
-		priv->current_free_disk =
-			(stbuf->f_bavail * 100) / stbuf->f_blocks;
-	}
+    int ret = -1;
+
+    if (!loc || (inode == NULL))
+        return ret;
+
+    if (inode) {
+        loc->inode = inode_ref(inode);
+        gf_uuid_copy(loc->gfid, inode->gfid);
+    }
+
+    if (parent) {
+        loc->parent = inode_ref(parent);
+    }
+
+    if (path != NULL) {
+        loc->path = gf_strdup(path);
+
+        loc->name = strrchr(loc->path, '/');
+        if (loc->name) {
+            loc->name++;
+        }
+    }
 
-	STACK_DESTROY (frame->root);
-	return 0;
+    ret = 0;
+
+    return ret;
 }
 
+/*
+ * Fill @loc for @inode: take the parent from the inode table (none for the
+ * root) and the path from inode_path(), then hand both to quota_loc_fill().
+ * A missing parent or path is only logged at debug level.
+ * Returns quota_loc_fill()'s result, or -1 when @inode or @loc is NULL.
+ */
+int
+quota_inode_loc_fill(inode_t *inode, loc_t *loc)
+{
+    xlator_t *this = NULL;
+    inode_t *dir = NULL;
+    char *path = NULL;
+    int ret = -1;
+
+    if (inode == NULL || loc == NULL)
+        return ret;
+
+    this = THIS;
+
+    if (__is_root_gfid(inode->gfid)) {
+        /* the root has no parent to resolve */
+        loc->parent = NULL;
+    } else {
+        dir = inode_parent(inode, 0, NULL);
+        if (dir == NULL) {
+            gf_msg_debug(this->name, 0,
+                         "cannot find parent for "
+                         "inode (gfid:%s)",
+                         uuid_utoa(inode->gfid));
+        }
+    }
+
+    if (inode_path(inode, NULL, &path) < 0) {
+        gf_msg_debug(this->name, 0,
+                     "cannot construct path for "
+                     "inode (gfid:%s)",
+                     uuid_utoa(inode->gfid));
+    }
+
+    ret = quota_loc_fill(loc, inode, dir, path);
+    if (ret < 0) {
+        gf_msg(this->name, GF_LOG_WARNING, ENOMEM, Q_MSG_ENOMEM,
+               "cannot fill loc");
+    }
+
+    /* release the temporaries: quota_loc_fill() took its own ref and copy */
+    if (dir != NULL)
+        inode_unref(dir);
+    GF_FREE(path);
+
+    return ret;
+}
 
-static void
-build_root_loc (xlator_t *this, loc_t *loc)
+int32_t
+quota_local_cleanup(quota_local_t *local)
 {
-	loc->path = "/";
+    if (local == NULL) {
+        goto out;
+    }
+
+    loc_wipe(&local->loc);
+    loc_wipe(&local->newloc);
+    loc_wipe(&local->oldloc);
+    loc_wipe(&local->validate_loc);
+
+    inode_unref(local->inode);
+
+    if (local->xdata)
+        dict_unref(local->xdata);
+
+    if (local->validate_xdata)
+        dict_unref(local->validate_xdata);
+
+    if (local->stub)
+        call_stub_destroy(local->stub);
+
+    LOCK_DESTROY(&local->lock);
+
+    mem_put(local);
+out:
+    return 0;
+}
+
+/* Get a quota_local_t from the xlator's local pool via mem_get0(), with its
+ * lock initialised and space_available preset to -1.  NULL on failure. */
+static quota_local_t *
+quota_local_new()
+{
+    quota_local_t *fresh = mem_get0(THIS->local_pool);
+
+    if (fresh != NULL) {
+        LOCK_INIT(&fresh->lock);
+        fresh->space_available = -1;
+    }
+
+    return fresh;
+}
 
+/* Allocate a dentry recording @name under parent gfid @par and, when @ctx is
+ * given, append it to ctx->parents.  Returns NULL on allocation failure. */
+quota_dentry_t *
+__quota_dentry_new(quota_inode_ctx_t *ctx, char *name, uuid_t par)
+{
+    quota_dentry_t *dentry = NULL;
+    GF_UNUSED int32_t ret = 0; /* NOTE(review): presumably set by the macro */
+
+    QUOTA_ALLOC_OR_GOTO(dentry, quota_dentry_t, done);
+
+    INIT_LIST_HEAD(&dentry->next);
+
+    dentry->name = gf_strdup(name);
+    if (dentry->name != NULL) {
+        gf_uuid_copy(dentry->par, par);
+        if (ctx != NULL)
+            list_add_tail(&dentry->next, &ctx->parents);
+    } else {
+        GF_FREE(dentry);
+        dentry = NULL;
+    }
+
+done:
+    return dentry;
+}
 
 void
-gf_quota_usage_subtract (xlator_t *this, size_t size)
+__quota_dentry_free(quota_dentry_t *dentry)
 {
-	struct quota_priv *priv = NULL;
+    if (dentry == NULL) {
+        goto out;
+    }
 
-	priv = this->private;
+    list_del_init(&dentry->next);
 
-	LOCK (&priv->lock);
-	{
-		if (priv->current_disk_usage < size)
-			priv->current_disk_usage = 0;
-		else
-			priv->current_disk_usage -= size;
-	}
-	UNLOCK (&priv->lock);
+    GF_FREE(dentry->name);
+    GF_FREE(dentry);
+out:
+    return;
 }
 
+/* Free the first entry of ctx->parents that matches both @name and @par. */
+void
+__quota_dentry_del(quota_inode_ctx_t *ctx, const char *name, uuid_t par)
+{
+    quota_dentry_t *cur = NULL;
+    quota_dentry_t *nxt = NULL;
+
+    list_for_each_entry_safe(cur, nxt, &ctx->parents, next)
+    {
+        if (strcmp(cur->name, name) || gf_uuid_compare(cur->par, par))
+            continue;
+        __quota_dentry_free(cur);
+        break;
+    }
+}
 
 void
-gf_quota_usage_add (xlator_t *this, size_t size)
+quota_dentry_del(quota_inode_ctx_t *ctx, const char *name, uuid_t par)
 {
-	struct quota_priv *priv = this->private;
+    LOCK(&ctx->lock);
+    {
+        __quota_dentry_del(ctx, name, par);
+    }
+    UNLOCK(&ctx->lock);
+}
 
-	LOCK (&priv->lock);
-	{
-		priv->current_disk_usage += size;
-	}
-	UNLOCK (&priv->lock);
+static inode_t *
+__quota_inode_parent(inode_t *inode, uuid_t pargfid, const char *name)
+{
+    inode_t *parent = NULL;
+
+    parent = inode_parent(inode, pargfid, name);
+    inode_unref(inode);
+    return parent;
 }
 
+static inode_t *
+quota_inode_parent(inode_t *inode, uuid_t pargfid, const char *name)
+{
+    inode_t *parent = NULL;
+    uuid_t gfid = {0};
+
+    gf_uuid_copy(gfid, inode->gfid); /* the helper drops our ref on inode */
+    parent = __quota_inode_parent(inode, pargfid, name);
+    if (!parent)
+        gf_msg_callingfn(THIS->name, GF_LOG_ERROR, 0, Q_MSG_PARENT_NULL,
+                         "Failed to find ancestor for inode (%s)",
+                         uuid_utoa(gfid));
+    return parent;
+}
 
-void 
-gf_quota_update_current_free_disk (xlator_t *this)
+int32_t
+quota_inode_depth(inode_t *inode)
 {
-	call_frame_t *frame = NULL;
-	call_pool_t   *pool = NULL;
-	loc_t          loc;
+    int depth = 0;
+    inode_t *cur_inode = NULL;
 
-	pool  = this->ctx->pool;
-	frame = create_frame (this, pool);
-  
-	build_root_loc (this, &loc);
+    cur_inode = inode_ref(inode);
+    while (cur_inode && !__is_root_gfid(cur_inode->gfid)) {
+        depth++;
+        cur_inode = quota_inode_parent(cur_inode, 0, NULL);
+        if (!cur_inode)
+            depth = -1;
+    }
 
-	STACK_WIND (frame, quota_statvfs_cbk,
-		    this->children->xlator,
-		    this->children->xlator->fops->statfs, &loc);
+    if (cur_inode)
+        inode_unref(cur_inode);
 
-	return ;
+    return depth;
 }
 
-
-int
-gf_quota_check_free_disk (xlator_t *this) 
+int32_t
+quota_find_common_ancestor(inode_t *inode1, inode_t *inode2,
+                           uuid_t *common_ancestor)
 {
-        struct quota_priv * priv = NULL;
-	struct timeval tv = {0, 0};
+    int32_t depth1 = 0;
+    int32_t depth2 = 0;
+    int32_t ret = -1;
+    inode_t *cur_inode1 = NULL;
+    inode_t *cur_inode2 = NULL;
+
+    depth1 = quota_inode_depth(inode1);
+    if (depth1 < 0)
+        goto out;
+
+    depth2 = quota_inode_depth(inode2);
+    if (depth2 < 0)
+        goto out;
+
+    cur_inode1 = inode_ref(inode1);
+    cur_inode2 = inode_ref(inode2);
+
+    while (cur_inode1 && depth1 > depth2) {
+        cur_inode1 = quota_inode_parent(cur_inode1, 0, NULL);
+        depth1--;
+    }
+
+    while (cur_inode2 && depth2 > depth1) {
+        cur_inode2 = quota_inode_parent(cur_inode2, 0, NULL);
+        depth2--;
+    }
+
+    while (depth1 && cur_inode1 && cur_inode2 && cur_inode1 != cur_inode2) {
+        cur_inode1 = quota_inode_parent(cur_inode1, 0, NULL);
+        cur_inode2 = quota_inode_parent(cur_inode2, 0, NULL);
+        depth1--;
+    }
+
+    if (cur_inode1 && cur_inode2) {
+        gf_uuid_copy(*common_ancestor, cur_inode1->gfid);
+        ret = 0;
+    }
+out:
+    if (cur_inode1)
+        inode_unref(cur_inode1);
+
+    if (cur_inode2)
+        inode_unref(cur_inode2);
+
+    return ret;
+}
 
-	priv = this->private;
-	if (priv->min_free_disk_limit) {
-		gettimeofday (&tv, NULL);
-		if (tv.tv_sec > (priv->refresh_interval + 
-				 priv->min_disk_last_updated_time)) {
-			priv->min_disk_last_updated_time = tv.tv_sec;
-			gf_quota_update_current_free_disk (this);
-		}
-		if (priv->current_free_disk <= priv->min_free_disk_limit)
-			return -1;
-	}
+void
+check_ancestory_continue(struct list_head *parents, inode_t *inode,
+                         int32_t op_ret, int32_t op_errno, void *data)
+{
+    call_frame_t *frame = NULL;
+    quota_local_t *local = NULL;
+    uint32_t link_count = 0;
+
+    frame = data;
+    local = frame->local;
+
+    if (parents && list_empty(parents)) {
+        gf_msg(THIS->name, GF_LOG_WARNING, EIO, Q_MSG_ANCESTRY_BUILD_FAILED,
+               "Couldn't build ancestry for inode (gfid:%s). "
+               "Without knowing ancestors till root, quota "
+               "cannot be enforced. "
+               "Hence, failing fop with EIO",
+               uuid_utoa(inode->gfid));
+        op_errno = EIO;
+        op_ret = -1;
+    }
+
+    LOCK(&local->lock);
+    {
+        link_count = --local->link_count;
+        if (op_ret < 0) {
+            local->op_ret = op_ret;
+            local->op_errno = op_errno;
+        }
+    }
+    UNLOCK(&local->lock);
 
-	return 0;
+    if (link_count == 0)
+        local->fop_continue_cbk(frame);
 }
 
+/* Walk from @inode towards the root; a missing parent starts ancestry
+ * building, otherwise check_ancestory_continue() is called directly. */
+void
+check_ancestory(call_frame_t *frame, inode_t *inode)
+{
+    inode_t *node = inode_ref(inode);
+    inode_t *up = NULL;
+
+    for (; node && !__is_root_gfid(node->gfid); node = up) {
+        up = inode_parent(node, 0, NULL);
+        if (up == NULL) {
+            quota_build_ancestry(node, check_ancestory_continue, frame);
+            inode_unref(node);
+            return;
+        }
+        inode_unref(node);
+    }
+
+    if (node == NULL) {
+        check_ancestory_continue(NULL, NULL, -1, ESTALE, frame);
+    } else {
+        inode_unref(node);
+        check_ancestory_continue(NULL, NULL, 0, 0, frame);
+    }
+}
 
-int
-quota_truncate_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
-		    int32_t op_ret, int32_t op_errno, struct stat *buf)
+void
+check_ancestory_2_cbk(struct list_head *parents, inode_t *inode, int32_t op_ret,
+                      int32_t op_errno, void *data)
 {
-	struct quota_priv *priv = this->private;
-	struct quota_local *local = NULL;
+    inode_t *this_inode = NULL;
+    quota_inode_ctx_t *ctx = NULL;
+
+    this_inode = data;
+
+    if (op_ret < 0)
+        goto out;
 
-	local = frame->local;
+    if (parents == NULL || list_empty(parents)) {
+        gf_msg(THIS->name, GF_LOG_WARNING, 0, Q_MSG_ENFORCEMENT_FAILED,
+               "Couldn't build ancestry for inode (gfid:%s). "
+               "Without knowing ancestors till root, quota "
+               "cannot be enforced.",
+               uuid_utoa(this_inode->gfid));
+        goto out;
+    }
 
-	if ((op_ret >= 0) && priv->disk_usage_limit) {
-		gf_quota_usage_subtract (this, (local->stbuf.st_blocks -
-						buf->st_blocks) * 512);
-		loc_wipe (&local->loc);
-	}
+    quota_inode_ctx_get(this_inode, THIS, &ctx, 0);
+    if (ctx)
+        ctx->ancestry_built = _gf_true;
 
-	STACK_UNWIND (frame, op_ret, op_errno, buf);
-	return 0;
+out:
+    inode_unref(this_inode);
 }
 
+void
+check_ancestory_2(xlator_t *this, quota_local_t *local, inode_t *inode)
+{
+    inode_t *cur_inode = NULL;
+    inode_t *parent = NULL;
+    quota_inode_ctx_t *ctx = NULL;
+    char *name = NULL;
+    uuid_t pgfid = {0};
+
+    name = (char *)local->loc.name;
+    if (local->loc.parent) {
+        gf_uuid_copy(pgfid, local->loc.parent->gfid);
+    }
+
+    cur_inode = inode_ref(inode);
+    while (cur_inode && !__is_root_gfid(cur_inode->gfid)) {
+        quota_inode_ctx_get(cur_inode, this, &ctx, 0);
+        /* Building ancestry is required only on the first lookup,
+         * so stop crawling once an inode's ctx has ancestry_built set.
+         */
+        if (ctx && ctx->ancestry_built)
+            goto setctx;
+
+        parent = inode_parent(cur_inode, pgfid, name);
+        if (!parent) {
+            quota_build_ancestry(cur_inode, check_ancestory_2_cbk,
+                                 inode_ref(inode));
+            goto out;
+        }
 
-int
-quota_truncate_stat_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
-			 int32_t op_ret, int32_t op_errno, struct stat *buf)
+        if (name != NULL) {
+            name = NULL;
+            gf_uuid_clear(pgfid);
+        }
+
+        inode_unref(cur_inode);
+        cur_inode = parent;
+    }
+
+setctx:
+    if (cur_inode && cur_inode != inode) {
+        quota_inode_ctx_get(inode, this, &ctx, 0);
+        if (ctx)
+            ctx->ancestry_built = _gf_true;
+    }
+out:
+    if (cur_inode)
+        inode_unref(cur_inode);
+}
+
+static void
+quota_link_count_decrement(call_frame_t *frame)
 {
-	struct quota_local *local = NULL;
-	struct quota_priv  *priv = NULL;
+    call_frame_t *tmpframe = NULL;
+    quota_local_t *local = NULL;
+    call_stub_t *stub = NULL;
+    int link_count = -1;
+
+    local = frame->local;
+    if (local && local->par_frame) {
+        local = local->par_frame->local;
+        tmpframe = frame;
+    }
+
+    if (local == NULL)
+        goto out;
+
+    LOCK(&local->lock);
+    {
+        link_count = --local->link_count;
+        if (link_count == 0) {
+            stub = local->stub;
+            local->stub = NULL;
+        }
+    }
+    UNLOCK(&local->lock);
 
-	priv = this->private;
-	local = frame->local;
+    if (stub != NULL) {
+        call_resume(stub);
+    }
 
-	if (op_ret >= 0) {
-		local->stbuf = *buf;
-	}
+out:
+    if (tmpframe) {
+        local = tmpframe->local;
+        tmpframe->local = NULL;
 
-	STACK_WIND (frame, quota_truncate_cbk,
-		    FIRST_CHILD (this), FIRST_CHILD (this)->fops->truncate,
-		    &local->loc, local->offset);
-	return 0;
+        STACK_DESTROY(frame->root);
+        if (local)
+            quota_local_cleanup(local);
+    }
+
+    return;
 }
 
+static void
+quota_handle_validate_error(call_frame_t *frame, int32_t op_ret,
+                            int32_t op_errno)
+{
+    quota_local_t *local;
+
+    local = frame->local;
+    if (local && local->par_frame)
+        local = local->par_frame->local;
 
-int
-quota_truncate (call_frame_t *frame, xlator_t *this,
-		loc_t *loc, off_t offset)
+    if (local == NULL)
+        goto out;
+
+    if (op_ret < 0) {
+        LOCK(&local->lock);
+        {
+            local->op_ret = op_ret;
+            local->op_errno = op_errno;
+        }
+        UNLOCK(&local->lock);
+    }
+    /* we abort checking limits on this path to root */
+    quota_link_count_decrement(frame);
+out:
+    return;
+}
+
+int32_t
+quota_validate_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+                   int32_t op_ret, int32_t op_errno, inode_t *inode,
+                   struct iatt *buf, dict_t *xdata, struct iatt *postparent)
 {
-	struct quota_local *local = NULL;
-	struct quota_priv  *priv = NULL;
+    quota_local_t *local = NULL;
+    int32_t ret = 0;
+    quota_inode_ctx_t *ctx = NULL;
+    uint64_t value = 0;
+    quota_meta_t size = {
+        0,
+    };
+
+    local = frame->local;
+
+    if (op_ret < 0) {
+        goto unwind;
+    }
+
+    GF_ASSERT(local);
+    GF_ASSERT(frame);
+    GF_VALIDATE_OR_GOTO_WITH_ERROR("quota", this, unwind, op_errno, EINVAL);
+    GF_VALIDATE_OR_GOTO_WITH_ERROR(this->name, xdata, unwind, op_errno, EINVAL);
+
+    ret = inode_ctx_get(local->validate_loc.inode, this, &value);
+
+    ctx = (quota_inode_ctx_t *)(unsigned long)value;
+    if ((ret == -1) || (ctx == NULL)) {
+        gf_msg(this->name, GF_LOG_WARNING, EINVAL, Q_MSG_INODE_CTX_GET_FAILED,
+               "quota context is"
+               " not present in  inode (gfid:%s)",
+               uuid_utoa(local->validate_loc.inode->gfid));
+        op_errno = EINVAL;
+        goto unwind;
+    }
+
+    ret = quota_dict_get_meta(xdata, QUOTA_SIZE_KEY, SLEN(QUOTA_SIZE_KEY),
+                              &size);
+    if (ret == -1) {
+        gf_msg(this->name, GF_LOG_WARNING, EINVAL, Q_MSG_SIZE_KEY_MISSING,
+               "quota size key not present "
+               "in dict");
+        op_errno = EINVAL;
+    }
+
+    local->just_validated = 1; /* so that we don't go into infinite
+                                * loop of validation and checking
+                                * limit when timeout is zero.
+                                */
+    LOCK(&ctx->lock);
+    {
+        ctx->size = size.size;
+        ctx->validate_time = gf_time();
+        ctx->file_count = size.file_count;
+        ctx->dir_count = size.dir_count;
+    }
+    UNLOCK(&ctx->lock);
+
+    quota_check_limit(frame, local->validate_loc.inode, this);
+    return 0;
 
-	priv = this->private;
+unwind:
+    quota_handle_validate_error(frame, op_ret, op_errno);
+    return 0;
+}
 
-	if (priv->disk_usage_limit) {
-		local = CALLOC (1, sizeof (struct quota_local));
-		frame->local  = local;
+static inline gf_boolean_t
+quota_timeout(time_t t, uint32_t timeout)
+{
+    return (gf_time() - t) >= timeout;
+}
 
-		loc_copy (&local->loc, loc);
-		local->offset = offset;
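+/* Return: 1 if a new entry for pgfid was added to the list
+ *         0 if pgfid is already in the list (no entry added)
+ *        -1 on errors (the new dentry could not be created)
+ */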
+static int32_t
+quota_add_parent(struct list_head *list, char *name, uuid_t pgfid)
+{
+    quota_dentry_t *entry = NULL;
+    gf_boolean_t found = _gf_false;
+    int ret = 0;
+
+    if (!list_empty(list)) {
+        list_for_each_entry(entry, list, next)
+        {
+            if (gf_uuid_compare(pgfid, entry->par) == 0) {
+                found = _gf_true;
+                goto out;
+            }
+        }
+    }
+
+    entry = __quota_dentry_new(NULL, name, pgfid);
+    if (entry)
+        list_add_tail(&entry->next, list);
+    else
+        ret = -1;
+
+out:
+    if (found)
+        return 0;
+    else if (ret == 0)
+        return 1;
+    else
+        return -1;
+}
 
-		STACK_WIND (frame, quota_truncate_stat_cbk,
-			    FIRST_CHILD(this),
-			    FIRST_CHILD(this)->fops->stat, loc);
-		return 0;
-	}
+/* This function iterates over the parent list in the inode
+ * context and adds each unique parent to the given list.
+ * Returns the number of dentries added to the list, or -1 on errors.
+ */
+static int32_t
+quota_add_parents_from_ctx(quota_inode_ctx_t *ctx, struct list_head *list)
+{
+    int ret = 0;
+    quota_dentry_t *dentry = NULL;
+    int32_t count = 0;
+
+    if (ctx == NULL || list == NULL)
+        goto out;
+
+    LOCK(&ctx->lock);
+    {
+        list_for_each_entry(dentry, &ctx->parents, next)
+        {
+            ret = quota_add_parent(list, dentry->name, dentry->par);
+            if (ret == 1)
+                count++;
+            else if (ret == -1)
+                break;
+        }
+    }
+    UNLOCK(&ctx->lock);
 
-	STACK_WIND (frame, quota_truncate_cbk,
-		    FIRST_CHILD(this),
-		    FIRST_CHILD(this)->fops->truncate,
-		    loc, offset);
-	return 0;
+out:
+    return (ret == -1) ? -1 : count;
 }
 
+int32_t
+quota_build_ancestry_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+                         int32_t op_ret, int32_t op_errno, gf_dirent_t *entries,
+                         dict_t *xdata)
+{
+    inode_t *parent = NULL;
+    inode_t *tmp_parent = NULL;
+    inode_t *linked_inode = NULL;
+    inode_t *tmp_inode = NULL;
+    gf_dirent_t *entry = NULL;
+    loc_t loc = {
+        0,
+    };
+    quota_dentry_t *dentry = NULL;
+    quota_dentry_t *tmp = NULL;
+    quota_inode_ctx_t *ctx = NULL;
+    struct list_head parents;
+    quota_local_t *local = NULL;
+    int ret;
+
+    INIT_LIST_HEAD(&parents);
+
+    local = frame->local;
+    frame->local = NULL;
+
+    if (op_ret < 0)
+        goto err;
+
+    if ((op_ret > 0) && (entries != NULL)) {
+        list_for_each_entry(entry, &entries->list, list)
+        {
+            if (__is_root_gfid(entry->inode->gfid)) {
+                /* The list contains a sub-list for each
+                 * possible path to the target inode. Each
+                 * sub-list starts with the root entry of the
+                 * tree and is followed by the child entries
+                 * for a particular path to the target entry.
+                 * The root entry is an implied sub-list
+                 * delimiter, as it denotes we have started
+                 * processing a new path. Reset the parent
+                 * pointer and continue
+                 */
+
+                tmp_parent = NULL;
+            } else {
+                /* For a non-root entry, link this inode */
+                linked_inode = inode_link(entry->inode, tmp_parent,
+                                          entry->d_name, &entry->d_stat);
+                if (linked_inode) {
+                    tmp_inode = entry->inode;
+                    entry->inode = linked_inode;
+                    inode_unref(tmp_inode);
+                } else {
+                    gf_msg(this->name, GF_LOG_WARNING, EINVAL,
+                           Q_MSG_PARENT_NULL, "inode link failed");
+                    op_errno = EINVAL;
+                    goto err;
+                }
+            }
+
+            gf_uuid_copy(loc.gfid, entry->d_stat.ia_gfid);
+
+            loc.inode = inode_ref(entry->inode);
+            loc.parent = inode_ref(tmp_parent);
+            loc.name = entry->d_name;
+
+            quota_fill_inodectx(this, entry->inode, entry->dict, &loc,
+                                &entry->d_stat, &op_errno);
+
+            /* For non-directory, posix_get_ancestry_non_directory
+             * returns all hard-links that are represented by nodes
+             * adjacent to each other in the dentry-list.
+             * (Unlike the directory case where adjacent nodes
+             * either have a parent/child relationship or belong to
+             * different paths).
+             */
+            if (entry->inode->ia_type == IA_IFDIR)
+                tmp_parent = entry->inode;
+
+            loc_wipe(&loc);
+        }
+    }
+
+    parent = inode_parent(local->loc.inode, 0, NULL);
+    if (parent == NULL) {
+        gf_msg(this->name, GF_LOG_WARNING, EINVAL, Q_MSG_PARENT_NULL,
+               "parent is NULL");
+        op_errno = EINVAL;
+        goto err;
+    }
+
+    quota_inode_ctx_get(local->loc.inode, this, &ctx, 0);
+
+    ret = quota_add_parents_from_ctx(ctx, &parents);
+    if (ret == -1) {
+        op_errno = errno;
+        goto err;
+    }
+
+    if (list_empty(&parents)) {
+        /* we built ancestry for a directory */
+        list_for_each_entry(entry, &entries->list, list)
+        {
+            if (entry->inode == local->loc.inode)
+                break;
+        }
+
+        /* FIXME: the assertion below fires here and still needs to be
+           investigated; it is commented out for now.
+           GF_ASSERT (&entry->list != &entries->list);
+        */
+
+        ret = quota_add_parent(&parents, entry->d_name, parent->gfid);
+        if (ret == -1) {
+            op_errno = errno;
+            goto err;
+        }
+    }
+
+    local->ancestry_cbk(&parents, local->loc.inode, 0, 0, local->ancestry_data);
+    goto cleanup;
+
+err:
+    local->ancestry_cbk(NULL, NULL, -1, op_errno, local->ancestry_data);
+
+cleanup:
+    STACK_DESTROY(frame->root);
+    quota_local_cleanup(local);
+
+    if (parent != NULL) {
+        inode_unref(parent);
+        parent = NULL;
+    }
+
+    if (!list_empty(&parents)) {
+        list_for_each_entry_safe(dentry, tmp, &parents, next)
+        {
+            __quota_dentry_free(dentry);
+        }
+    }
+
+    return 0;
+}
 
 int
-quota_ftruncate_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
-		     int32_t op_ret, int32_t op_errno, struct stat *buf)
+quota_build_ancestry(inode_t *inode, quota_ancestry_built_t ancestry_cbk,
+                     void *data)
 {
-	struct quota_priv  *priv = NULL;
-	struct quota_local *local = NULL;
+    fd_t *fd = NULL;
+    quota_local_t *local = NULL;
+    call_frame_t *new_frame = NULL;
+    int op_errno = ENOMEM;
+    int op_ret = -1;
+    xlator_t *this = NULL;
+    dict_t *xdata_req = NULL;
+
+    this = THIS;
+
+    xdata_req = dict_new();
+    if (xdata_req == NULL)
+        goto err;
+
+    fd = fd_anonymous(inode);
+    if (fd == NULL)
+        goto err;
+
+    new_frame = create_frame(this, this->ctx->pool);
+    if (new_frame == NULL)
+        goto err;
+
+    local = quota_local_new();
+    if (local == NULL)
+        goto err;
+
+    new_frame->root->uid = new_frame->root->gid = 0;
+    new_frame->local = local;
+    local->ancestry_cbk = ancestry_cbk;
+    local->ancestry_data = data;
+    local->loc.inode = inode_ref(inode);
+
+    op_ret = dict_set_int8(xdata_req, QUOTA_LIMIT_KEY, 1);
+    if (op_ret < 0) {
+        op_errno = -op_ret;
+        goto err;
+    }
+
+    op_ret = dict_set_int8(xdata_req, QUOTA_LIMIT_OBJECTS_KEY, 1);
+    if (op_ret < 0) {
+        op_errno = -op_ret;
+        goto err;
+    }
+
+    op_ret = dict_set_int8(xdata_req, GET_ANCESTRY_DENTRY_KEY, 1);
+    if (op_ret < 0) {
+        op_errno = -op_ret;
+        goto err;
+    }
+
+    /* This asks the posix layer to construct the dentry chain till root.
+     * We don't need to do an opendir; we can use the anonymous fd
+     * here for the readdirp.
+     * Avoiding opendir also reduces the window in which another FOP
+     * can be executed before build ancestry completes.
+     */
+    STACK_WIND(new_frame, quota_build_ancestry_cbk, FIRST_CHILD(this),
+               FIRST_CHILD(this)->fops->readdirp, fd, 0, 0, xdata_req);
+
+    op_ret = 0;
+
+err:
+    if (fd)
+        fd_unref(fd);
+
+    if (xdata_req)
+        dict_unref(xdata_req);
+
+    if (op_ret < 0) {
+        ancestry_cbk(NULL, NULL, -1, op_errno, data);
+
+        if (new_frame) {
+            local = new_frame->local;
+            new_frame->local = NULL;
+            STACK_DESTROY(new_frame->root);
+        }
 
-	local = frame->local;
-	priv = this->private;
+        if (local)
+            quota_local_cleanup(local);
+    }
 
-	if ((op_ret >= 0) && priv->disk_usage_limit) {
-		gf_quota_usage_subtract (this, (local->stbuf.st_blocks -
-						buf->st_blocks) * 512);
-		fd_unref (local->fd);
-	}
+    return 0;
+}
 
-	STACK_UNWIND (frame, op_ret, op_errno, buf);
-	return 0;
+int
+quota_validate(call_frame_t *frame, inode_t *inode, xlator_t *this,
+               fop_lookup_cbk_t cbk_fn)
+{
+    quota_local_t *local = NULL;
+    int ret = 0;
+    dict_t *xdata = NULL;
+    quota_priv_t *priv = NULL;
+
+    local = frame->local;
+    priv = this->private;
+
+    LOCK(&local->lock);
+    {
+        loc_wipe(&local->validate_loc);
+
+        ret = quota_inode_loc_fill(inode, &local->validate_loc);
+        if (ret < 0) {
+            gf_msg(this->name, GF_LOG_WARNING, ENOMEM, Q_MSG_ENFORCEMENT_FAILED,
+                   "cannot fill loc for inode (gfid:%s), hence "
+                   "aborting quota-checks and continuing with fop",
+                   uuid_utoa(inode->gfid));
+        }
+    }
+    UNLOCK(&local->lock);
+
+    if (ret < 0) {
+        ret = -ENOMEM;
+        goto err;
+    }
+
+    xdata = dict_new();
+    if (xdata == NULL) {
+        ret = -ENOMEM;
+        goto err;
+    }
+
+    ret = dict_set_int8(xdata, QUOTA_SIZE_KEY, 1);
+    if (ret < 0) {
+        gf_msg(this->name, GF_LOG_WARNING, ENOMEM, Q_MSG_ENOMEM,
+               "dict set failed");
+        ret = -ENOMEM;
+        goto err;
+    }
+
+    ret = dict_set_str(xdata, "volume-uuid", priv->volume_uuid);
+    if (ret < 0) {
+        gf_msg(this->name, GF_LOG_WARNING, ENOMEM, Q_MSG_ENOMEM,
+               "dict set failed");
+        ret = -ENOMEM;
+        goto err;
+    }
+
+    ret = quota_enforcer_lookup(frame, this, xdata, cbk_fn);
+    if (ret < 0) {
+        ret = -ENOTCONN;
+        goto err;
+    }
+
+    ret = 0;
+err:
+    if (xdata)
+        dict_unref(xdata);
+
+    return ret;
 }
 
+void
+quota_check_limit_continuation(struct list_head *parents, inode_t *inode,
+                               int32_t op_ret, int32_t op_errno, void *data)
+{
+    call_frame_t *frame = NULL;
+    xlator_t *this = NULL;
+    quota_local_t *local = NULL;
+    quota_local_t *par_local = NULL;
+    quota_dentry_t *entry = NULL;
+    inode_t *parent = NULL;
+    int parent_count = 0;
+
+    frame = data;
+    local = frame->local;
+    this = THIS;
+
+    if (local->par_frame)
+        par_local = local->par_frame->local;
+    else
+        par_local = local;
+
+    if ((op_ret < 0) || list_empty(parents)) {
+        if (op_ret >= 0) {
+            gf_msg(this->name, GF_LOG_WARNING, EIO, Q_MSG_ANCESTRY_BUILD_FAILED,
+                   "Couldn't build ancestry for inode (gfid:%s). "
+                   "Without knowing ancestors till root, quota"
+                   "cannot be enforced. "
+                   "Hence, failing fop with EIO",
+                   uuid_utoa(inode->gfid));
+            op_errno = EIO;
+        }
 
-int
-quota_ftruncate_fstat_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
-			   int32_t op_ret, int32_t op_errno, struct stat *buf)
+        quota_handle_validate_error(frame, -1, op_errno);
+        goto out;
+    }
+
+    list_for_each_entry(entry, parents, next) { parent_count++; }
+
+    LOCK(&par_local->lock);
+    {
+        par_local->link_count += (parent_count - 1);
+    }
+    UNLOCK(&par_local->lock);
+
+    if (local->par_frame) {
+        list_for_each_entry(entry, parents, next)
+        {
+            parent = inode_find(inode->table, entry->par);
+            quota_check_limit(frame, parent, this);
+            inode_unref(parent);
+        }
+    } else {
+        list_for_each_entry(entry, parents, next)
+        {
+            parent = do_quota_check_limit(frame, inode, this, entry, _gf_true);
+            if (parent)
+                inode_unref(parent);
+            else
+                quota_link_count_decrement(frame);
+        }
+    }
+
+out:
+    return;
+}
+
+int32_t
+quota_check_object_limit(call_frame_t *frame, quota_inode_ctx_t *ctx,
+                         quota_priv_t *priv, inode_t *_inode, xlator_t *this,
+                         int32_t *op_errno, int just_validated,
+                         quota_local_t *local, gf_boolean_t *skip_check)
 {
-	struct quota_local *local = NULL;
-	struct quota_priv  *priv = NULL;
+    int32_t ret = -1;
+    uint32_t timeout = 0;
+    char need_validate = 0;
+    gf_boolean_t hard_limit_exceeded = 0;
+    int64_t object_aggr_count = 0;
+
+    GF_ASSERT(frame);
+    GF_ASSERT(priv);
+    GF_ASSERT(_inode);
+    GF_ASSERT(this);
+    GF_ASSERT(local);
+
+    if (ctx != NULL && (ctx->object_hard_lim > 0 || ctx->object_soft_lim)) {
+        LOCK(&ctx->lock);
+        {
+            timeout = priv->soft_timeout;
+
+            object_aggr_count = ctx->file_count + ctx->dir_count + 1;
+            if (((ctx->object_soft_lim >= 0) &&
+                 (object_aggr_count) > ctx->object_soft_lim)) {
+                timeout = priv->hard_timeout;
+            }
+
+            if (!just_validated && quota_timeout(ctx->validate_time, timeout)) {
+                need_validate = 1;
+            } else if ((object_aggr_count) > ctx->object_hard_lim) {
+                hard_limit_exceeded = 1;
+            }
+        }
+        UNLOCK(&ctx->lock);
+
+        if (need_validate && *skip_check != _gf_true) {
+            *skip_check = _gf_true;
+            ret = quota_validate(frame, _inode, this, quota_validate_cbk);
+            if (ret < 0) {
+                *op_errno = -ret;
+                *skip_check = _gf_false;
+            }
+            goto out;
+        }
 
-	priv = this->private;
-	local = frame->local;
+        if (hard_limit_exceeded) {
+            local->op_ret = -1;
+            local->op_errno = EDQUOT;
+            *op_errno = EDQUOT;
+            goto out;
+        }
 
-	if (op_ret >= 0) {
-		local->stbuf = *buf;
-	}
+        /* We log usage only if a quota limit is configured on
+           that inode.
+        */
+        quota_log_usage(this, ctx, _inode, 0);
+    }
 
-	STACK_WIND (frame, quota_ftruncate_cbk,
-		    FIRST_CHILD (this), FIRST_CHILD (this)->fops->ftruncate,
-		    local->fd, local->offset);
-	return 0;
+    ret = 0;
+
+out:
+    return ret;
 }
 
+int32_t
+quota_check_size_limit(call_frame_t *frame, quota_inode_ctx_t *ctx,
+                       quota_priv_t *priv, inode_t *_inode, xlator_t *this,
+                       int32_t *op_errno, int just_validated, int64_t delta,
+                       quota_local_t *local, gf_boolean_t *skip_check)
+{
+    int32_t ret = -1;
+    uint32_t timeout = 0;
+    char need_validate = 0;
+    gf_boolean_t hard_limit_exceeded = 0;
+    int64_t space_available = 0;
+    int64_t wouldbe_size = 0;
+
+    GF_ASSERT(frame);
+    GF_ASSERT(priv);
+    GF_ASSERT(_inode);
+    GF_ASSERT(this);
+    GF_ASSERT(local);
+
+    if (ctx != NULL && (ctx->hard_lim > 0 || ctx->soft_lim > 0)) {
+        wouldbe_size = ctx->size + delta;
+
+        LOCK(&ctx->lock);
+        {
+            timeout = priv->soft_timeout;
+
+            if ((ctx->soft_lim >= 0) && (wouldbe_size > ctx->soft_lim)) {
+                timeout = priv->hard_timeout;
+            }
+
+            if (!just_validated && quota_timeout(ctx->validate_time, timeout)) {
+                need_validate = 1;
+            } else if (wouldbe_size >= ctx->hard_lim) {
+                hard_limit_exceeded = 1;
+            }
+        }
+        UNLOCK(&ctx->lock);
+
+        if (need_validate && *skip_check != _gf_true) {
+            *skip_check = _gf_true;
+            ret = quota_validate(frame, _inode, this, quota_validate_cbk);
+            if (ret < 0) {
+                *op_errno = -ret;
+                *skip_check = _gf_false;
+            }
+            goto out;
+        }
+
+        if (hard_limit_exceeded) {
+            local->op_ret = -1;
+            local->op_errno = EDQUOT;
 
-int
-quota_ftruncate (call_frame_t *frame, xlator_t *this,
-		 fd_t *fd, off_t offset)
+            space_available = ctx->hard_lim - ctx->size;
+
+            if (space_available < 0)
+                space_available = 0;
+
+            if ((local->space_available < 0) ||
+                (local->space_available > space_available)) {
+                local->space_available = space_available;
+            }
+
+            if (space_available == 0) {
+                *op_errno = EDQUOT;
+                goto out;
+            }
+        }
+
+        /* We log usage only if quota limit is configured on
+           that inode. */
+        quota_log_usage(this, ctx, _inode, delta);
+    }
+
+    ret = 0;
+out:
+    return ret;
+}
+
+int32_t
+quota_check_limit(call_frame_t *frame, inode_t *inode, xlator_t *this)
 {
-	struct quota_local *local = NULL;
-	struct quota_priv  *priv = NULL;
+    int32_t ret = -1, op_errno = EINVAL;
+    inode_t *_inode = NULL, *parent = NULL;
+    quota_inode_ctx_t *ctx = NULL;
+    quota_priv_t *priv = NULL;
+    quota_local_t *local = NULL;
+    quota_local_t *par_local = NULL;
+    char just_validated = 0;
+    int64_t delta = 0;
+    int8_t object_delta = 0;
+    uint64_t value = 0;
+    gf_boolean_t skip_check = _gf_false;
+
+    GF_VALIDATE_OR_GOTO("quota", this, err);
+    GF_VALIDATE_OR_GOTO(this->name, frame, err);
+    GF_VALIDATE_OR_GOTO(this->name, inode, err);
+
+    local = frame->local;
+    GF_VALIDATE_OR_GOTO(this->name, local, err);
+
+    if (local->par_frame) {
+        par_local = local->par_frame->local;
+        GF_VALIDATE_OR_GOTO(this->name, par_local, err);
+    } else {
+        par_local = local;
+    }
+
+    delta = par_local->delta;
+    object_delta = par_local->object_delta;
+
+    GF_VALIDATE_OR_GOTO(this->name, par_local->stub, err);
+    /* Allow all the trusted clients.
+     * Don't subject gluster internal processes like rebalance, gsyncd,
+     * self-heal etc. to the disk quotas.
+     *
+     * Method: allow all clients with a negative PID. This relies on the
+     * assumption that a kernel-assigned pid is never a negative
+     * number.
+     */
+    if (0 > frame->root->pid) {
+        ret = 0;
+        quota_link_count_decrement(frame);
+        goto done;
+    }
+
+    priv = this->private;
+
+    inode_ctx_get(inode, this, &value);
+    ctx = (quota_inode_ctx_t *)(unsigned long)value;
+
+    _inode = inode_ref(inode);
+
+    LOCK(&local->lock);
+    {
+        just_validated = local->just_validated;
+        local->just_validated = 0;
+    }
+    UNLOCK(&local->lock);
+
+    do {
+        /* In a rename operation, enforcement should stop at the common
+           ancestor */
+        if (!gf_uuid_is_null(par_local->common_ancestor) &&
+            !gf_uuid_compare(_inode->gfid, par_local->common_ancestor)) {
+            quota_link_count_decrement(frame);
+            break;
+        }
+
+        if (object_delta <= 0)
+            goto skip_check_object_limit;
+
+        ret = quota_check_object_limit(frame, ctx, priv, _inode, this,
+                                       &op_errno, just_validated, par_local,
+                                       &skip_check);
+        if (skip_check == _gf_true)
+            goto done;
+
+        if (ret) {
+            if (op_errno != EDQUOT)
+                gf_msg(this->name, GF_LOG_ERROR, 0, Q_MSG_ENFORCEMENT_FAILED,
+                       "Failed to "
+                       "check quota object limit");
+            goto err;
+        }
+
+    skip_check_object_limit:
+        ret = quota_check_size_limit(frame, ctx, priv, _inode, this, &op_errno,
+                                     just_validated, delta, par_local,
+                                     &skip_check);
+        if (skip_check == _gf_true)
+            goto done;
+
+        if (ret) {
+            if (op_errno != EDQUOT)
+                gf_msg(this->name, GF_LOG_ERROR, 0, Q_MSG_ENFORCEMENT_FAILED,
+                       "Failed to "
+                       "check quota size limit");
+            goto err;
+        }
+
+        if (__is_root_gfid(_inode->gfid)) {
+            quota_link_count_decrement(frame);
+            break;
+        }
 
+        parent = inode_parent(_inode, 0, NULL);
+        if (parent == NULL) {
+            ret = quota_build_ancestry(_inode, quota_check_limit_continuation,
+                                       frame);
+            if (ret < 0) {
+                op_errno = -ret;
+                goto err;
+            }
 
-	priv = this->private;
+            break;
+        }
+
+        inode_unref(_inode);
+        _inode = parent;
+        just_validated = 0;
 
-	if (priv->disk_usage_limit) {
-		local = CALLOC (1, sizeof (struct quota_local));
-		frame->local  = local;
+        value = 0;
+        inode_ctx_get(_inode, this, &value);
+        ctx = (quota_inode_ctx_t *)(unsigned long)value;
+    } while (1);
 
-		local->fd = fd_ref (fd);
-		local->offset = offset;
+done:
+    if (_inode != NULL) {
+        inode_unref(_inode);
+        _inode = NULL;
+    }
+    return 0;
 
-		STACK_WIND (frame, quota_ftruncate_fstat_cbk,
-			    FIRST_CHILD(this),
-			    FIRST_CHILD(this)->fops->fstat, fd);
-		return 0;
-	}
+err:
+    quota_handle_validate_error(frame, -1, op_errno);
 
-	STACK_WIND (frame, quota_ftruncate_cbk,
-		    FIRST_CHILD(this),
-		    FIRST_CHILD(this)->fops->ftruncate,
-		    fd, offset);
-	return 0;
+    inode_unref(_inode);
+    return 0;
 }
 
+inode_t *
+do_quota_check_limit(call_frame_t *frame, inode_t *inode, xlator_t *this,
+                     quota_dentry_t *dentry, gf_boolean_t force)
+{
+    int32_t ret = -1;
+    inode_t *parent = NULL;
+    call_frame_t *new_frame = NULL;
+    quota_local_t *new_local = NULL;
+
+    parent = inode_parent(inode, dentry->par, dentry->name);
+    if (parent == NULL) {
+        if (force)
+            parent = inode_find(inode->table, dentry->par);
+        else
+            goto out;
+    }
+    if (parent == NULL)
+        goto out;
+
+    new_frame = copy_frame(frame);
+    if (new_frame == NULL)
+        goto out;
+
+    new_local = quota_local_new();
+    if (new_local == NULL)
+        goto out;
+
+    new_frame->local = new_local;
+    new_local->par_frame = frame;
+
+    quota_check_limit(new_frame, parent, this);
+
+    ret = 0;
+out:
+    if (ret < 0) {
+        if (parent) {
+            /* The caller decrements link_count only when parent is
+             * NULL, so handle the failure here when it is not.
+             */
+            quota_handle_validate_error(frame, -1, ENOMEM);
+        }
+
+        if (new_frame) {
+            new_frame->local = NULL;
+            STACK_DESTROY(new_frame->root);
+        }
+    }
+
+    return parent;
+}
+
+static int
+quota_get_limits(xlator_t *this, dict_t *dict, int64_t *hard_lim,
+                 int64_t *soft_lim, int64_t *object_hard_limit,
+                 int64_t *object_soft_limit)
+{
+    quota_limits_t *limit = NULL;
+    quota_limits_t *object_limit = NULL;
+    quota_priv_t *priv = NULL;
+    int64_t soft_lim_percent = 0;
+    int64_t *ptr = NULL;
+    int ret = 0;
+
+    if ((this == NULL) || (dict == NULL) || (hard_lim == NULL) ||
+        (soft_lim == NULL))
+        goto out;
+
+    priv = this->private;
+
+    ret = dict_get_bin(dict, QUOTA_LIMIT_KEY, (void **)&ptr);
+    limit = (quota_limits_t *)ptr;
+
+    if (limit) {
+        *hard_lim = ntoh64(limit->hl);
+        soft_lim_percent = ntoh64(limit->sl);
+    }
+
+    if (soft_lim_percent < 0) {
+        soft_lim_percent = priv->default_soft_lim;
+    }
+
+    if ((*hard_lim > 0) && (soft_lim_percent > 0)) {
+        *soft_lim = (soft_lim_percent * (*hard_lim)) / 100;
+    }
+
+    ret = dict_get_bin(dict, QUOTA_LIMIT_OBJECTS_KEY, (void **)&ptr);
+    if (ret)
+        return 0;
+    object_limit = (quota_limits_t *)ptr;
+
+    if (object_limit) {
+        *object_hard_limit = ntoh64(object_limit->hl);
+        soft_lim_percent = ntoh64(object_limit->sl);
+    }
+
+    if (soft_lim_percent < 0) {
+        soft_lim_percent = priv->default_soft_lim;
+    }
+
+    if ((*object_hard_limit > 0) && (soft_lim_percent > 0)) {
+        *object_soft_limit = (soft_lim_percent * (*object_hard_limit)) / 100;
+    }
+
+out:
+    return 0;
+}
 
 int
-quota_mknod_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
-		 int32_t op_ret, int32_t op_errno,
-		 inode_t *inode, struct stat *buf)
+quota_fill_inodectx(xlator_t *this, inode_t *inode, dict_t *dict, loc_t *loc,
+                    struct iatt *buf, int32_t *op_errno)
 {
-	struct quota_priv *priv = NULL;
+    int32_t ret = -1;
+    char found = 0;
+    quota_inode_ctx_t *ctx = NULL;
+    quota_dentry_t *dentry = NULL;
+    uint64_t value = 0;
+    int64_t hard_lim = 0;
+    int64_t soft_lim = 0;
+    int64_t object_hard_limit = 0;
+    int64_t object_soft_limit = 0;
+
+    quota_get_limits(this, dict, &hard_lim, &soft_lim, &object_hard_limit,
+                     &object_soft_limit);
+
+    inode_ctx_get(inode, this, &value);
+    ctx = (quota_inode_ctx_t *)(unsigned long)value;
+
+    if ((((ctx == NULL) || (ctx->hard_lim == hard_lim)) && (hard_lim < 0) &&
+         !QUOTA_REG_OR_LNK_FILE(buf->ia_type))) {
+        ret = 0;
+        goto out;
+    }
+
+    ret = quota_inode_ctx_get(inode, this, &ctx, 1);
+    if ((ret == -1) || (ctx == NULL)) {
+        gf_msg(this->name, GF_LOG_WARNING, ENOMEM, Q_MSG_INODE_CTX_GET_FAILED,
+               "cannot create quota "
+               "context in inode(gfid:%s)",
+               uuid_utoa(inode->gfid));
+        ret = -1;
+        *op_errno = ENOMEM;
+        goto out;
+    }
+
+    LOCK(&ctx->lock);
+    {
+        ctx->hard_lim = hard_lim;
+        ctx->soft_lim = soft_lim;
+        ctx->object_hard_lim = object_hard_limit;
+        ctx->object_soft_lim = object_soft_limit;
+
+        ctx->buf = *buf;
+
+        if (!QUOTA_REG_OR_LNK_FILE(buf->ia_type)) {
+            goto unlock;
+        }
 
-	priv = this->private;
+        /* do nothing if it is a nameless lookup */
+        if (loc->name == NULL || !loc->parent)
+            goto unlock;
+
+        list_for_each_entry(dentry, &ctx->parents, next)
+        {
+            if ((strcmp(dentry->name, loc->name) == 0) &&
+                (gf_uuid_compare(loc->parent->gfid, dentry->par) == 0)) {
+                found = 1;
+                break;
+            }
+        }
 
-	if ((op_ret >= 0) && priv->disk_usage_limit) {
-		gf_quota_usage_add (this, buf->st_blocks * 512);
-	}
+        if (!found) {
+            dentry = __quota_dentry_new(ctx, (char *)loc->name,
+                                        loc->parent->gfid);
+            if (dentry == NULL) {
+                /*
+                  gf_msg (this->name, GF_LOG_WARNING, ENOMEM,
+                          Q_MSG_ENOMEM,
+                          "cannot create a new dentry (par:%"
+-                                          PRId64", name:%s) for inode(ino:%"
+-                                          PRId64", gfid:%s)",
+-                                          uuid_utoa (local->loc.inode->gfid));
+                */
+                ret = -1;
+                *op_errno = ENOMEM;
+                goto unlock;
+            }
+        }
+    }
+unlock:
+    UNLOCK(&ctx->lock);
 
-	STACK_UNWIND (frame, op_ret, op_errno, inode, buf);
-	return 0;
+out:
+    return ret;
 }
 
+/* Return _gf_false only when fop is GF_FOP_MKDIR and dict_check_flag() reports
+ * GF_DHT_HEAL_DIR set under GF_INTERNAL_CTX_KEY; return _gf_true otherwise
+ * (a -ENOENT result from dict_check_flag() is only logged at debug level). */
+gf_boolean_t
+should_quota_enforce(xlator_t *this, dict_t *dict, glusterfs_fop_t fop)
+{
+    int ret = 0;
+
+    ret = dict_check_flag(dict, GF_INTERNAL_CTX_KEY, GF_DHT_HEAL_DIR);
+
+    if (fop == GF_FOP_MKDIR && ret == DICT_FLAG_SET) {
+        return _gf_false;
+    } else if (ret == -ENOENT) {
+        gf_msg(this->name, GF_LOG_DEBUG, EINVAL, Q_MSG_INTERNAL_FOP_KEY_MISSING,
+               "No internal fop context present");
+        goto out;
+    }
+out:
+    return _gf_true;
+}
 
-int
-quota_mknod (call_frame_t *frame, xlator_t *this,
-	     loc_t *loc, mode_t mode, dev_t rdev)
+int32_t
+quota_lookup_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+                 int32_t op_ret, int32_t op_errno, inode_t *inode,
+                 struct iatt *buf, dict_t *dict, struct iatt *postparent)
 {
-	struct quota_priv *priv = NULL;
+    quota_local_t *local = NULL;
+    inode_t *this_inode = NULL;
 
-	priv = this->private;
+    local = frame->local;
+    frame->local = NULL;
 
-	if (gf_quota_check_free_disk (this) == -1) {
-		gf_log (this->name, GF_LOG_ERROR, 
-			"min-free-disk limit (%u) crossed, current available is %u",
-			priv->min_free_disk_limit, priv->current_free_disk);
-		STACK_UNWIND (frame, -1, ENOSPC, NULL, NULL);
-		return 0;
-	}
+    if (op_ret >= 0 && inode) {
+        this_inode = inode_ref(inode);
 
-        if (priv->current_disk_usage > priv->disk_usage_limit) {
-		gf_log (this->name, GF_LOG_ERROR, 
-			"Disk usage limit (%"PRIu64") crossed, current usage is %"PRIu64"",
-			priv->disk_usage_limit, priv->current_disk_usage);
-		STACK_UNWIND (frame, -1, ENOSPC, NULL, NULL);
-		return 0;
-        }
+        op_ret = quota_fill_inodectx(this, inode, dict, &local->loc, buf,
+                                     &op_errno);
+        if (op_ret < 0)
+            op_errno = ENOMEM;
+    }
 
-	STACK_WIND (frame, quota_mknod_cbk,
-		    FIRST_CHILD(this),
-		    FIRST_CHILD(this)->fops->mknod,
-		    loc, mode, rdev);
-	return 0;
-}
+    QUOTA_STACK_UNWIND(lookup, frame, op_ret, op_errno, inode, buf, dict,
+                       postparent);
 
+    if (op_ret < 0 || this_inode == NULL || gf_uuid_is_null(this_inode->gfid))
+        goto out;
 
-int
-quota_mkdir_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
-		 int32_t op_ret, int32_t op_errno, inode_t *inode,
-		 struct stat *buf)
+    check_ancestory_2(this, local, this_inode);
+
+out:
+    if (this_inode)
+        inode_unref(this_inode);
+
+    quota_local_cleanup(local);
+
+    return 0;
+}
+
+int32_t
+quota_lookup(call_frame_t *frame, xlator_t *this, loc_t *loc, dict_t *xattr_req)
 {
-	struct quota_priv *priv = NULL;
+    quota_priv_t *priv = NULL;
+    int32_t ret = -1;
+    quota_local_t *local = NULL;
+
+    priv = this->private;
+
+    WIND_IF_QUOTAOFF(priv->is_quota_on, off);
+
+    xattr_req = xattr_req ? dict_ref(xattr_req) : dict_new();
+    if (!xattr_req)
+        goto err;
+
+    local = quota_local_new();
+    if (local == NULL) {
+        goto err;
+    }
 
-	priv = this->private;
+    frame->local = local;
+    loc_copy(&local->loc, loc);
 
-	if ((op_ret >= 0) && priv->disk_usage_limit) {
-		gf_quota_usage_subtract (this, buf->st_blocks * 512);
-	}
+    ret = dict_set_int8(xattr_req, QUOTA_LIMIT_KEY, 1);
+    if (ret < 0) {
+        gf_msg(this->name, GF_LOG_WARNING, ENOMEM, Q_MSG_ENOMEM,
+               "dict set of key for "
+               "hard-limit failed");
+        goto err;
+    }
 
-	STACK_UNWIND (frame, op_ret, op_errno, inode, buf);
-	return 0;
+    ret = dict_set_int8(xattr_req, QUOTA_LIMIT_OBJECTS_KEY, 1);
+    if (ret < 0) {
+        gf_msg(this->name, GF_LOG_WARNING, ENOMEM, Q_MSG_ENOMEM,
+               "dict set of key for quota object limit failed");
+        goto err;
+    }
+
+    STACK_WIND(frame, quota_lookup_cbk, FIRST_CHILD(this),
+               FIRST_CHILD(this)->fops->lookup, loc, xattr_req);
+
+    ret = 0;
+
+err:
+    if (xattr_req)
+        dict_unref(xattr_req);
+
+    if (ret < 0) {
+        QUOTA_STACK_UNWIND(lookup, frame, -1, ENOMEM, NULL, NULL, NULL, NULL);
+    }
+
+    return 0;
+
+off:
+    STACK_WIND_TAIL(frame, FIRST_CHILD(this), FIRST_CHILD(this)->fops->lookup,
+                    loc, xattr_req);
+    return 0;
 }
 
+int32_t
+quota_writev_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+                 int32_t op_ret, int32_t op_errno, struct iatt *prebuf,
+                 struct iatt *postbuf, dict_t *xdata)
+{
+    int32_t ret = 0;
+    uint64_t ctx_int = 0;
+    quota_inode_ctx_t *ctx = NULL;
+    quota_local_t *local = NULL;
+
+    local = frame->local;
+
+    if ((op_ret < 0) || (local == NULL) || (postbuf == NULL)) {
+        goto out;
+    }
+
+    ret = inode_ctx_get(local->loc.inode, this, &ctx_int);
+    if (ret) {
+        gf_msg(this->name, GF_LOG_WARNING, 0, Q_MSG_INODE_CTX_GET_FAILED,
+               "%s: failed to get the "
+               "context",
+               local->loc.path);
+        goto out;
+    }
+
+    ctx = (quota_inode_ctx_t *)(unsigned long)ctx_int;
+
+    if (ctx == NULL) {
+        gf_msg(this->name, GF_LOG_WARNING, 0, Q_MSG_INODE_CTX_GET_FAILED,
+               "quota context not set in %s (gfid:%s)", local->loc.path,
+               uuid_utoa(local->loc.inode->gfid));
+        goto out;
+    }
+
+    LOCK(&ctx->lock);
+    {
+        ctx->buf = *postbuf;
+    }
+    UNLOCK(&ctx->lock);
+
+out:
+    QUOTA_STACK_UNWIND(writev, frame, op_ret, op_errno, prebuf, postbuf, xdata);
+
+    return 0;
+}
 
-int
-quota_mkdir (call_frame_t *frame, xlator_t *this, loc_t *loc, mode_t mode)
+static int gf_quota_enforcer_log;
+
+int32_t
+quota_writev_helper(call_frame_t *frame, xlator_t *this, fd_t *fd,
+                    struct iovec *vector, int32_t count, off_t off,
+                    uint32_t flags, struct iobref *iobref, dict_t *xdata)
 {
-	struct quota_priv *priv = NULL;
+    quota_local_t *local = NULL;
+    int32_t op_errno = EINVAL;
+    struct iovec *new_vector = NULL;
+    int32_t new_count = 0;
+
+    local = frame->local;
+
+    GF_VALIDATE_OR_GOTO("quota", local, unwind);
+
+    if (local->op_ret == -1) {
+        op_errno = local->op_errno;
+
+        if ((op_errno == EDQUOT) && (local->space_available > 0)) {
+            new_count = iov_subset(vector, count, 0, local->space_available,
+                                   &new_vector, 0);
+            if (new_count < 0) {
+                local->op_ret = -1;
+                local->op_errno = ENOMEM;
+                goto unwind;
+            }
+
+            vector = new_vector;
+            count = new_count;
+        } else if (op_errno == ENOENT || op_errno == ESTALE) {
+            /* We may get ENOENT/ESTALE in case of below scenario
+             *     fd = open file.txt
+             *     unlink file.txt
+             *     write on fd
+             * Here build_ancestry can fail as the file is removed.
+             * For now ignore ENOENT/ESTALE with writes on active fd
+             * We need to revisit this code once we understand
+             * how other file systems behave in this scenario
+             */
+            gf_msg_debug(this->name, 0,
+                         "quota enforcer failed "
+                         "with ENOENT/ESTALE on %s, cannot check "
+                         "quota limits and allowing writes",
+                         uuid_utoa(fd->inode->gfid));
+        } else if ((op_errno == EINVAL) &&
+                   !inode_parent(local->loc.inode, 0, NULL)) {
+            /* We may get EINVAL with parent == NULL,
+             * in case of below scenario
+             *     1. enable quota
+             *     2. glusterfsd stop/start
+             *     3. nameless lookup
+             *     4. write on fd
+             * Here build_ancestry can fail as the file's pgfid
+             * does not exist.
+             * For now ignore EINVAL with writes on active fd
+             * until the pgfid is created at name lookup
+             */
+            GF_LOG_OCCASIONALLY(gf_quota_enforcer_log, this->name,
+                                GF_LOG_CRITICAL,
+                                "Quota cannot be enforced as "
+                                "parent is not available and writes are being "
+                                "allowed without checking whether they are "
+                                "within quota limits. This can happen if Quota "
+                                "crawl is not complete. If crawl has been "
+                                "completed, please file a bug.");
+        } else {
+            goto unwind;
+        }
+    }
 
-	priv = this->private;
+    STACK_WIND(frame, quota_writev_cbk, FIRST_CHILD(this),
+               FIRST_CHILD(this)->fops->writev, fd, vector, count, off, flags,
+               iobref, xdata);
 
-	if (gf_quota_check_free_disk (this) == -1) {
-		gf_log (this->name, GF_LOG_ERROR, 
-			"min-free-disk limit (%u) crossed, current available is %u",
-			priv->min_free_disk_limit, priv->current_free_disk);
-		STACK_UNWIND (frame, -1, ENOSPC, NULL, NULL);
-		return 0;
-		
-	}
+    if (new_vector != NULL)
+        GF_FREE(new_vector);
 
-        if (priv->current_disk_usage > priv->disk_usage_limit) {
-		gf_log (this->name, GF_LOG_ERROR, 
-			"Disk usage limit (%"PRIu64") crossed, current usage is %"PRIu64"",
-			priv->disk_usage_limit, priv->current_disk_usage);
-		STACK_UNWIND (frame, -1, ENOSPC, NULL, NULL);
-		return 0;
+    return 0;
+
+unwind:
+    QUOTA_STACK_UNWIND(writev, frame, -1, op_errno, NULL, NULL, NULL);
+    return 0;
+}
+
+int32_t
+quota_writev(call_frame_t *frame, xlator_t *this, fd_t *fd,
+             struct iovec *vector, int32_t count, off_t off, uint32_t flags,
+             struct iobref *iobref, dict_t *xdata)
+{
+    quota_priv_t *priv = NULL;
+    int32_t op_errno = EINVAL;
+    int32_t parents = 0;
+    int32_t fail_count = 0;
+    uint64_t size = 0;
+    quota_local_t *local = NULL;
+    quota_inode_ctx_t *ctx = NULL;
+    quota_dentry_t *dentry = NULL, *tmp = NULL;
+    call_stub_t *stub = NULL;
+    struct list_head head;
+    inode_t *par_inode = NULL;
+
+    priv = this->private;
+
+    WIND_IF_QUOTAOFF(priv->is_quota_on, off);
+
+    INIT_LIST_HEAD(&head);
+
+    GF_ASSERT(frame);
+    GF_VALIDATE_OR_GOTO("quota", this, unwind);
+    GF_VALIDATE_OR_GOTO(this->name, fd, unwind);
+
+    local = quota_local_new();
+    if (local == NULL) {
+        goto unwind;
+    }
+
+    frame->local = local;
+    local->loc.inode = inode_ref(fd->inode);
+
+    (void)quota_inode_ctx_get(fd->inode, this, &ctx, 0);
+    if (ctx == NULL) {
+        gf_msg_debug(this->name, 0,
+                     "quota context is NULL on inode"
+                     " (%s). If quota is not enabled recently and "
+                     "crawler has finished crawling, its an error",
+                     uuid_utoa(fd->inode->gfid));
+    }
+
+    stub = fop_writev_stub(frame, quota_writev_helper, fd, vector, count, off,
+                           flags, iobref, xdata);
+    if (stub == NULL) {
+        op_errno = ENOMEM;
+        goto unwind;
+    }
+
+    priv = this->private;
+    GF_VALIDATE_OR_GOTO(this->name, priv, unwind);
+
+    parents = quota_add_parents_from_ctx(ctx, &head);
+    if (parents == -1) {
+        op_errno = errno;
+        goto unwind;
+    }
+
+    size = iov_length(vector, count);
+
+    LOCK(&local->lock);
+    {
+        local->delta = size;
+        local->object_delta = 0;
+        local->link_count = (parents != 0) ? parents : 1;
+        local->stub = stub;
+    }
+    UNLOCK(&local->lock);
+
+    if (parents == 0) {
+        /* nameless lookup on this inode, allow quota to reconstruct
+         * ancestry as part of check_limit.
+         */
+        quota_check_limit(frame, fd->inode, this);
+    } else {
+        list_for_each_entry_safe(dentry, tmp, &head, next)
+        {
+            par_inode = do_quota_check_limit(frame, fd->inode, this, dentry,
+                                             _gf_false);
+            if (par_inode == NULL) {
+                if (ctx) {
+                    /* remove stale entry from inode ctx */
+                    quota_dentry_del(ctx, dentry->name, dentry->par);
+                    parents--;
+                    fail_count++;
+                }
+            } else {
+                inode_unref(par_inode);
+            }
+            __quota_dentry_free(dentry);
         }
 
-	STACK_WIND (frame, quota_mkdir_cbk,
-		    FIRST_CHILD(this),
-		    FIRST_CHILD(this)->fops->mkdir,
-		    loc, mode);
+        if (parents == 0) {
+            LOCK(&local->lock);
+            {
+                local->link_count++;
+            }
+            UNLOCK(&local->lock);
+            quota_check_limit(frame, fd->inode, this);
+        }
 
-	return 0;
+        while (fail_count != 0) {
+            quota_link_count_decrement(frame);
+            fail_count--;
+        }
+    }
+
+    return 0;
+
+unwind:
+    QUOTA_STACK_UNWIND(writev, frame, -1, op_errno, NULL, NULL, NULL);
+    return 0;
+
+off:
+    STACK_WIND_TAIL(frame, FIRST_CHILD(this), FIRST_CHILD(this)->fops->writev,
+                    fd, vector, count, off, flags, iobref, xdata);
+    return 0;
 }
 
+int32_t
+quota_mkdir_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+                int32_t op_ret, int32_t op_errno, inode_t *inode,
+                struct iatt *buf, struct iatt *preparent,
+                struct iatt *postparent, dict_t *xdata)
+{
+    QUOTA_STACK_UNWIND(mkdir, frame, op_ret, op_errno, inode, buf, preparent,
+                       postparent, xdata);
+    return 0;
+}
 
-int
-quota_unlink_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
-		  int32_t op_ret, int32_t op_errno)
+int32_t
+quota_mkdir_helper(call_frame_t *frame, xlator_t *this, loc_t *loc, mode_t mode,
+                   mode_t umask, dict_t *xdata)
 {
-	struct quota_local *local = NULL;
+    quota_local_t *local = NULL;
+    int32_t op_errno = EINVAL;
 
-	local = frame->local;
+    local = frame->local;
 
-	if (local) {
-		if (op_ret >= 0) {
-			gf_quota_usage_subtract (this,
-						 local->stbuf.st_blocks * 512);
-		}
-		loc_wipe (&local->loc);
-	}
+    GF_VALIDATE_OR_GOTO("quota", local, unwind);
 
-	STACK_UNWIND (frame, op_ret, op_errno);
-	return 0;
+    op_errno = local->op_errno;
+
+    if (local->op_ret == -1) {
+        goto unwind;
+    }
+
+    STACK_WIND(frame, quota_mkdir_cbk, FIRST_CHILD(this),
+               FIRST_CHILD(this)->fops->mkdir, loc, mode, umask, xdata);
+
+    return 0;
+
+unwind:
+    QUOTA_STACK_UNWIND(mkdir, frame, -1, op_errno, NULL, NULL, NULL, NULL,
+                       NULL);
+    return 0;
 }
 
+int32_t
+quota_mkdir(call_frame_t *frame, xlator_t *this, loc_t *loc, mode_t mode,
+            mode_t umask, dict_t *xdata)
+{
+    quota_priv_t *priv = NULL;
+    int32_t ret = 0, op_errno = 0;
+    quota_local_t *local = NULL;
+    call_stub_t *stub = NULL;
+
+    priv = this->private;
+    WIND_IF_QUOTAOFF(priv->is_quota_on, off);
+
+    if (!should_quota_enforce(this, xdata, GF_FOP_MKDIR)) {
+        gf_msg(this->name, GF_LOG_DEBUG, 0, Q_MSG_ENFORCEMENT_SKIPPED,
+               "Enforcement has been skipped(internal fop).");
+        goto off;
+    }
+
+    local = quota_local_new();
+    if (local == NULL) {
+        op_errno = ENOMEM;
+        goto err;
+    }
+
+    frame->local = local;
+
+    ret = loc_copy(&local->loc, loc);
+    if (ret) {
+        op_errno = ENOMEM;
+        gf_msg(this->name, GF_LOG_WARNING, ENOMEM, Q_MSG_ENOMEM,
+               "loc_copy failed");
+        goto err;
+    }
+
+    stub = fop_mkdir_stub(frame, quota_mkdir_helper, loc, mode, umask, xdata);
+    if (stub == NULL) {
+        op_errno = ENOMEM;
+        goto err;
+    }
+
+    LOCK(&local->lock);
+    {
+        local->stub = stub;
+        local->delta = 0;
+        local->object_delta = 1;
+        local->link_count = 1;
+    }
+    UNLOCK(&local->lock);
+
+    quota_check_limit(frame, loc->parent, this);
+    return 0;
+
+err:
+    QUOTA_STACK_UNWIND(mkdir, frame, -1, op_errno, NULL, NULL, NULL, NULL,
+                       NULL);
+
+    return 0;
+
+off:
+    STACK_WIND_TAIL(frame, FIRST_CHILD(this), FIRST_CHILD(this)->fops->mkdir,
+                    loc, mode, umask, xdata);
+
+    return 0;
+}
 
-int
-quota_unlink_stat_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
-		       int32_t op_ret, int32_t op_errno, struct stat *buf)
+int32_t
+quota_create_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+                 int32_t op_ret, int32_t op_errno, fd_t *fd, inode_t *inode,
+                 struct iatt *buf, struct iatt *preparent,
+                 struct iatt *postparent, dict_t *xdata)
 {
-	struct quota_local *local = NULL;
+    int32_t ret = -1;
+    quota_local_t *local = NULL;
+    quota_inode_ctx_t *ctx = NULL;
+    quota_dentry_t *dentry = NULL;
+
+    local = frame->local;
+    if (op_ret < 0) {
+        goto unwind;
+    }
+
+    ret = quota_inode_ctx_get(inode, this, &ctx, 1);
+    if ((ret == -1) || (ctx == NULL)) {
+        gf_msg(this->name, GF_LOG_WARNING, ENOMEM, Q_MSG_INODE_CTX_GET_FAILED,
+               "cannot create quota "
+               "context in inode(gfid:%s)",
+               uuid_utoa(inode->gfid));
+        op_ret = -1;
+        op_errno = ENOMEM;
+        goto unwind;
+    }
+
+    LOCK(&ctx->lock);
+    {
+        ctx->buf = *buf;
+
+        dentry = __quota_dentry_new(ctx, (char *)local->loc.name,
+                                    local->loc.parent->gfid);
+        if (dentry == NULL) {
+            gf_msg(this->name, GF_LOG_WARNING, ENOMEM, Q_MSG_ENOMEM,
+                   "cannot create a new dentry "
+                   "(name:%s) for inode(gfid:%s)",
+                   local->loc.name, uuid_utoa(local->loc.inode->gfid));
+            op_ret = -1;
+            op_errno = ENOMEM;
+            goto unlock;
+        }
+    }
+unlock:
+    UNLOCK(&ctx->lock);
 
-	local = frame->local;
+unwind:
+    QUOTA_STACK_UNWIND(create, frame, op_ret, op_errno, fd, inode, buf,
+                       preparent, postparent, xdata);
+    return 0;
+}
+
+int32_t
+quota_create_helper(call_frame_t *frame, xlator_t *this, loc_t *loc,
+                    int32_t flags, mode_t mode, mode_t umask, fd_t *fd,
+                    dict_t *xdata)
+{
+    quota_local_t *local = NULL;
+    int32_t op_errno = EINVAL;
 
-	if (op_ret >= 0) {
-		if (buf->st_nlink == 1) {
-			local->stbuf = *buf;
-		}
-	}
+    local = frame->local;
 
-	STACK_WIND (frame, quota_unlink_cbk,
-		    FIRST_CHILD(this),
-		    FIRST_CHILD(this)->fops->unlink,
-		    &local->loc);
+    GF_VALIDATE_OR_GOTO("quota", local, unwind);
+
+    if (local->op_ret == -1) {
+        op_errno = local->op_errno;
+        goto unwind;
+    }
+
+    STACK_WIND(frame, quota_create_cbk, FIRST_CHILD(this),
+               FIRST_CHILD(this)->fops->create, loc, flags, mode, umask, fd,
+               xdata);
+    return 0;
+
+unwind:
+    QUOTA_STACK_UNWIND(create, frame, -1, op_errno, NULL, NULL, NULL, NULL,
+                       NULL, NULL);
+    return 0;
+}
 
-	return 0;
+int32_t
+quota_create(call_frame_t *frame, xlator_t *this, loc_t *loc, int32_t flags,
+             mode_t mode, mode_t umask, fd_t *fd, dict_t *xdata)
+{
+    quota_priv_t *priv = NULL;
+    int32_t ret = -1;
+    quota_local_t *local = NULL;
+    int32_t op_errno = ENOMEM; /* used when fop_create_stub() fails */
+    call_stub_t *stub = NULL;
+
+    priv = this->private;
+
+    WIND_IF_QUOTAOFF(priv->is_quota_on, off);
+    QUOTA_WIND_FOR_INTERNAL_FOP(xdata, off);
+
+    local = quota_local_new();
+    if (local == NULL) {
+        op_errno = ENOMEM;
+        goto err;
+    }
+
+    frame->local = local;
+
+    ret = loc_copy(&local->loc, loc);
+    if (ret) {
+        gf_msg(this->name, GF_LOG_WARNING, ENOMEM, Q_MSG_ENOMEM,
+               "loc_copy failed");
+        op_errno = ENOMEM;
+        goto err;
+    }
+
+    stub = fop_create_stub(frame, quota_create_helper, loc, flags, mode, umask,
+                           fd, xdata);
+    if (stub == NULL) {
+        goto err;
+    }
+
+    LOCK(&local->lock);
+    {
+        local->link_count = 1;
+        local->stub = stub;
+        local->delta = 0;
+        local->object_delta = 1;
+    }
+    UNLOCK(&local->lock);
+
+    quota_check_limit(frame, loc->parent, this);
+    return 0;
+err:
+    QUOTA_STACK_UNWIND(create, frame, -1, op_errno, NULL, NULL, NULL, NULL,
+                       NULL, NULL);
+
+    return 0;
+
+off:
+    STACK_WIND_TAIL(frame, FIRST_CHILD(this), FIRST_CHILD(this)->fops->create,
+                    loc, flags, mode, umask, fd, xdata);
+    return 0;
 }
 
+int32_t
+quota_unlink_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+                 int32_t op_ret, int32_t op_errno, struct iatt *preparent,
+                 struct iatt *postparent, dict_t *xdata)
+{
+    quota_local_t *local = NULL;
+    quota_inode_ctx_t *ctx = NULL;
+    uint64_t value = 0;
+
+    if (op_ret < 0) {
+        goto out;
+    }
 
-int
-quota_unlink (call_frame_t *frame, xlator_t *this, loc_t *loc)
+    local = (quota_local_t *)frame->local;
+
+    inode_ctx_get(local->loc.inode, this, &value);
+    ctx = (quota_inode_ctx_t *)(unsigned long)value;
+
+    if (ctx == NULL) {
+        gf_msg(this->name, GF_LOG_INFO, EINVAL, Q_MSG_INODE_CTX_GET_FAILED,
+               "quota context not set inode (gfid:%s)",
+               uuid_utoa(local->loc.gfid));
+        goto out;
+    }
+
+    quota_dentry_del(ctx, local->loc.name, local->loc.parent->gfid);
+
+out:
+    QUOTA_STACK_UNWIND(unlink, frame, op_ret, op_errno, preparent, postparent,
+                       xdata);
+    return 0;
+}
+
+int32_t
+quota_unlink(call_frame_t *frame, xlator_t *this, loc_t *loc, int xflag,
+             dict_t *xdata)
 {
-	struct quota_local *local = NULL;
-	struct quota_priv  *priv = NULL;
+    quota_priv_t *priv = NULL;
+    int32_t ret = -1;
+    quota_local_t *local = NULL;
+
+    priv = this->private;
+
+    WIND_IF_QUOTAOFF(priv->is_quota_on, off);
+
+    local = quota_local_new();
+    if (local == NULL) {
+        goto err;
+    }
 
-	priv = this->private;
+    frame->local = local;
 
-	if (priv->disk_usage_limit) {
-		local = CALLOC (1, sizeof (struct quota_local));
-		frame->local = local;
+    ret = loc_copy(&local->loc, loc);
+    if (ret) {
+        gf_msg(this->name, GF_LOG_WARNING, ENOMEM, Q_MSG_ENOMEM,
+               "loc_copy failed");
+        goto err;
+    }
 
-		loc_copy (&local->loc, loc);
+    STACK_WIND(frame, quota_unlink_cbk, FIRST_CHILD(this),
+               FIRST_CHILD(this)->fops->unlink, loc, xflag, xdata);
 
-		STACK_WIND (frame,
-			    quota_unlink_stat_cbk,
-			    FIRST_CHILD(this),
-			    FIRST_CHILD(this)->fops->stat,
-			    loc);
-		return 0;
-	}
+    ret = 0;
 
-	STACK_WIND (frame, quota_unlink_cbk,
-		    FIRST_CHILD(this),
-		    FIRST_CHILD(this)->fops->unlink,
-		    loc);
-	return 0;
+err: /* ret is -1 here only if quota_local_new() failed or loc_copy() gave -1 */
+    if (ret == -1) {
+        QUOTA_STACK_UNWIND(unlink, frame, -1, ENOMEM, NULL, NULL, NULL);
+    }
+
+    return 0;
+
+off:
+    STACK_WIND_TAIL(frame, FIRST_CHILD(this), FIRST_CHILD(this)->fops->unlink,
+                    loc, xflag, xdata);
+    return 0;
 }
 
+int32_t
+quota_link_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+               int32_t op_ret, int32_t op_errno, inode_t *inode,
+               struct iatt *buf, struct iatt *preparent,
+               struct iatt *postparent, dict_t *xdata)
+{
+    int32_t ret = -1;
+    quota_local_t *local = NULL;
+    quota_inode_ctx_t *ctx = NULL;
+    quota_dentry_t *dentry = NULL;
+    char found = 0;
+
+    if (op_ret < 0) {
+        goto out;
+    }
+
+    local = (quota_local_t *)frame->local;
+
+    ret = quota_inode_ctx_get(inode, this, &ctx, 0);
+    if ((ret == -1) || (ctx == NULL)) {
+        gf_msg_debug(this->name, 0,
+                     "quota context is NULL on inode"
+                     " (%s). If quota is not enabled recently and "
+                     "crawler has finished crawling, its an error",
+                     uuid_utoa(inode->gfid));
+        goto out;
+    }
+
+    LOCK(&ctx->lock);
+    {
+        list_for_each_entry(dentry, &ctx->parents, next)
+        {
+            if ((strcmp(dentry->name, local->loc.name) == 0) &&
+                (gf_uuid_compare(local->loc.parent->gfid, dentry->par) == 0)) {
+                found = 1;
+
+                gf_msg_debug(this->name, 0,
+                             "new entry being"
+                             " linked (name:%s) for inode "
+                             "(gfid:%s) is already present "
+                             "in inode-dentry-list",
+                             dentry->name, uuid_utoa(local->loc.inode->gfid));
+                break;
+            }
+        }
+
+        if (!found) {
+            dentry = __quota_dentry_new(ctx, (char *)local->loc.name,
+                                        local->loc.parent->gfid);
+            if (dentry == NULL) {
+                gf_msg(this->name, GF_LOG_WARNING, ENOMEM, Q_MSG_ENOMEM,
+                       "cannot create a new dentry (name:%s)"
+                       "for inode(gfid:%s)",
+                       local->loc.name, uuid_utoa(local->loc.inode->gfid));
+                op_ret = -1;
+                op_errno = ENOMEM;
+                goto unlock;
+            }
+        }
 
-int
-quota_rmdir_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
-		 int32_t op_ret, int32_t op_errno)
+        ctx->buf = *buf;
+    }
+unlock:
+    UNLOCK(&ctx->lock);
+
+out:
+    QUOTA_STACK_UNWIND(link, frame, op_ret, op_errno, inode, buf, preparent,
+                       postparent, xdata);
+
+    return 0;
+}
+
+int32_t
+quota_link_helper(call_frame_t *frame, xlator_t *this, loc_t *oldloc,
+                  loc_t *newloc, dict_t *xdata)
 {
-	struct quota_local *local = NULL;
+    quota_local_t *local = NULL;
+    int32_t op_errno = EINVAL;
+
+    local = frame->local;
+
+    GF_VALIDATE_OR_GOTO("quota", local, unwind);
+
+    op_errno = local->op_errno;
 
-	local = frame->local;
+    if (local->op_ret == -1) {
+        goto unwind;
+    }
 
-	if (local) {
-		if (op_ret >= 0) {
-			gf_quota_usage_subtract (this, local->stbuf.st_blocks * 512);
-		}
-		loc_wipe (&local->loc);
-	}
+    STACK_WIND(frame, quota_link_cbk, FIRST_CHILD(this),
+               FIRST_CHILD(this)->fops->link, oldloc, newloc, xdata);
+    return 0;
 
-	STACK_UNWIND (frame, op_ret, op_errno);
-	return 0;
+unwind:
+    QUOTA_STACK_UNWIND(link, frame, -1, op_errno, NULL, NULL, NULL, NULL, NULL);
+    return 0;
 }
 
+void
+quota_link_continue(call_frame_t *frame)
+{
+    int32_t ret = -1;
+    int32_t op_errno = EIO;
+    quota_local_t *local = NULL;
+    uuid_t common_ancestor = {0};
+    xlator_t *this = NULL;
+    quota_inode_ctx_t *ctx = NULL;
+    inode_t *src_parent = NULL;
+    inode_t *dst_parent = NULL;
+
+    local = frame->local;
+    this = THIS;
+
+    if (local->op_ret < 0) {
+        op_errno = local->op_errno;
+        goto err;
+    }
+
+    if (local->xdata && dict_get(local->xdata, GLUSTERFS_INTERNAL_FOP_KEY)) {
+        /* Treat link as rename, crawl upwards only till common ancestor
+         */
+        ret = quota_find_common_ancestor(
+            local->oldloc.inode, local->newloc.parent, &common_ancestor);
+        if (ret < 0 || gf_uuid_is_null(common_ancestor)) {
+            gf_msg(this->name, GF_LOG_ERROR, ESTALE,
+                   Q_MSG_ANCESTRY_BUILD_FAILED,
+                   "failed to get "
+                   "common_ancestor for %s and %s",
+                   local->oldloc.path, local->newloc.path);
+            op_errno = ESTALE;
+            goto err;
+        }
+    } else {
+        /* Treat link as a new file.
+         * TODO: Currently marker accounts twice for the links created
+         * across directories.
+         * This needs re-visit if marker accounts only once
+         * for the links created across directories
+         */
+        if (local->oldloc.parent)
+            src_parent = inode_ref(local->oldloc.parent);
+        else
+            src_parent = inode_parent(local->oldloc.inode, 0, NULL);
+        dst_parent = local->newloc.parent;
+
+        /* No need to check quota limit if src and dst parents are same
+         */
+        if (src_parent == dst_parent ||
+            gf_uuid_compare(src_parent->gfid, dst_parent->gfid) == 0) {
+            inode_unref(src_parent);
+            goto wind;
+        }
+
+        inode_unref(src_parent);
+    }
+
+    quota_inode_ctx_get(local->oldloc.inode, this, &ctx, 0);
+    if (ctx == NULL) {
+        gf_msg_debug(this->name, 0,
+                     "quota context is NULL on inode"
+                     " (%s). If quota is not enabled recently and "
+                     "crawler has finished crawling, its an error",
+                     uuid_utoa(local->oldloc.inode->gfid));
+    }
+
+    LOCK(&local->lock);
+    {
+        local->link_count = 1;
+        local->delta = (ctx != NULL) ? ctx->buf.ia_blocks * 512 : 0;
+        local->object_delta = 1;
+        gf_uuid_copy(local->common_ancestor, common_ancestor);
+    }
+    UNLOCK(&local->lock);
+
+    quota_check_limit(frame, local->newloc.parent, this);
+    return;
+
+err:
+    QUOTA_STACK_UNWIND(link, frame, -1, op_errno, NULL, NULL, NULL, NULL, NULL);
+    return;
+
+wind:
+    STACK_WIND(frame, quota_link_cbk, FIRST_CHILD(this),
+               FIRST_CHILD(this)->fops->link, &(local->oldloc),
+               &(local->newloc), local->xdata);
+    return;
+}
 
-int
-quota_rmdir_stat_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
-		      int32_t op_ret, int32_t op_errno, struct stat *buf)
+int32_t
+quota_link(call_frame_t *frame, xlator_t *this, loc_t *oldloc, loc_t *newloc,
+           dict_t *xdata)
+{
+    quota_priv_t *priv = NULL;
+    int32_t ret = -1;
+    int32_t op_errno = ENOMEM;
+    quota_local_t *local = NULL;
+    call_stub_t *stub = NULL;
+
+    priv = this->private;
+
+    WIND_IF_QUOTAOFF(priv->is_quota_on, off);
+
+    local = quota_local_new();
+    if (local == NULL) {
+        goto err;
+    }
+
+    frame->local = (void *)local;
+
+    if (xdata)
+        local->xdata = dict_ref(xdata);
+
+    ret = loc_copy(&local->loc, newloc);
+    if (ret == -1) {
+        gf_msg(this->name, GF_LOG_WARNING, ENOMEM, Q_MSG_ENOMEM,
+               "loc_copy failed");
+        goto err;
+    }
+
+    ret = loc_copy(&local->oldloc, oldloc);
+    if (ret < 0) {
+        gf_msg(this->name, GF_LOG_WARNING, ENOMEM, Q_MSG_ENOMEM,
+               "loc_copy failed");
+        goto err;
+    }
+
+    ret = loc_copy(&local->newloc, newloc);
+    if (ret < 0) {
+        gf_msg(this->name, GF_LOG_WARNING, ENOMEM, Q_MSG_ENOMEM,
+               "loc_copy failed");
+        goto err;
+    }
+
+    /* No need to check quota limit if src and dst parents are same */
+    if (oldloc->parent && newloc->parent &&
+        !gf_uuid_compare(oldloc->parent->gfid, newloc->parent->gfid)) {
+        gf_msg_debug(this->name, 0,
+                     "link %s -> %s are "
+                     "in the same directory, so skip check limit",
+                     oldloc->path, newloc->path);
+        goto wind;
+    }
+
+    stub = fop_link_stub(frame, quota_link_helper, oldloc, newloc, xdata);
+    if (stub == NULL) {
+        goto err;
+    }
+
+    LOCK(&local->lock);
+    {
+        local->link_count = 2; /* two check_ancestory() calls follow */
+        local->fop_continue_cbk = quota_link_continue;
+        local->stub = stub;
+    }
+    UNLOCK(&local->lock);
+
+    check_ancestory(frame, newloc->parent);
+
+    /* source parent can be NULL, so do check_ancestry on a file */
+    if (oldloc->parent)
+        check_ancestory(frame, oldloc->parent);
+    else
+        check_ancestory(frame, oldloc->inode);
+
+    return 0;
+
+err:
+    QUOTA_STACK_UNWIND(link, frame, -1, op_errno, NULL, NULL, NULL, NULL, NULL);
+    return 0;
+
+off:
+    STACK_WIND_TAIL(frame, FIRST_CHILD(this), FIRST_CHILD(this)->fops->link,
+                    oldloc, newloc, xdata);
+    return 0;
+
+wind:
+    STACK_WIND(frame, quota_link_cbk, FIRST_CHILD(this),
+               FIRST_CHILD(this)->fops->link, oldloc, newloc, xdata);
+    return 0;
+}
+
+/*
+ * quota_rename_cbk - callback of the rename wound by the quota translator.
+ *
+ * On success, and only when the renamed inode is a regular or link file,
+ * the dentry list kept in the inode's quota context is brought up to date
+ * under ctx->lock: the dentry matching the old name/parent is freed, a
+ * dentry for the new name/parent is added unless one is already listed,
+ * and *buf is cached in ctx->buf.
+ *
+ * The fop is unwound with the values received from the child, except that
+ * a failure to allocate the new dentry replaces op_ret/op_errno with
+ * -1/ENOMEM.  Always returns 0.
+ */
+int32_t
+quota_rename_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+                 int32_t op_ret, int32_t op_errno, struct iatt *buf,
+                 struct iatt *preoldparent, struct iatt *postoldparent,
+                 struct iatt *prenewparent, struct iatt *postnewparent,
+                 dict_t *xdata)
 {
-	struct quota_local *local = NULL;
+    int32_t ret = -1;
+    quota_local_t *local = NULL;
+    quota_inode_ctx_t *ctx = NULL;
+    quota_dentry_t *old_dentry = NULL, *dentry = NULL;
+    char new_dentry_found = 0;
+
+    if (op_ret < 0) {
+        goto out;
+    }
+
+    local = frame->local;
+
+    GF_VALIDATE_OR_GOTO("quota", local, out);
+
+    if (!QUOTA_REG_OR_LNK_FILE(local->oldloc.inode->ia_type))
+        goto out;
+
+    ret = quota_inode_ctx_get(local->oldloc.inode, this, &ctx, 0);
+    if ((ret == -1) || (ctx == NULL)) {
+        gf_msg_debug(this->name, 0,
+                     "quota context is NULL on inode"
+                     " (%s). If quota is not enabled recently and "
+                     "crawler has finished crawling, its an error",
+                     uuid_utoa(local->oldloc.inode->gfid));
+
+        goto out;
+    }
+
+    LOCK(&ctx->lock);
+    {
+        /* one pass locates the old dentry and detects an existing new one */
+        list_for_each_entry(dentry, &ctx->parents, next)
+        {
+            if ((strcmp(dentry->name, local->oldloc.name) == 0) &&
+                (gf_uuid_compare(local->oldloc.parent->gfid, dentry->par) ==
+                 0)) {
+                old_dentry = dentry;
+            } else if ((strcmp(dentry->name, local->newloc.name) == 0) &&
+                       (gf_uuid_compare(local->newloc.parent->gfid,
+                                        dentry->par) == 0)) {
+                new_dentry_found = 1;
+                gf_msg_debug(this->name, 0,
+                             "new entry being "
+                             "linked (name:%s) for inode (gfid:%s) "
+                             "is in inode-dentry-list",
+                             dentry->name,
+                             uuid_utoa(local->oldloc.inode->gfid));
+            }
+
+            if (old_dentry && new_dentry_found)
+                break;
+        }
 
-	local = frame->local;
+        if (old_dentry != NULL) {
+            __quota_dentry_free(old_dentry);
+        } else {
+            gf_msg_debug(this->name, 0,
+                         "dentry corresponding to "
+                         "the path just renamed (name:%s) is not"
+                         " present",
+                         local->oldloc.name);
+        }
 
-	if (op_ret >= 0) {
-		local->stbuf = *buf;
-	}
+        if (!new_dentry_found) {
+            dentry = __quota_dentry_new(ctx, (char *)local->newloc.name,
+                                        local->newloc.parent->gfid);
+            if (dentry == NULL) {
+                gf_msg(this->name, GF_LOG_WARNING, ENOMEM, Q_MSG_ENOMEM,
+                       "cannot create a new dentry (name:%s) "
+                       "for inode(gfid:%s)",
+                       local->newloc.name,
+                       uuid_utoa(local->newloc.inode->gfid));
+                op_ret = -1;
+                op_errno = ENOMEM;
+                goto unlock;
+            }
+        }
 
-	STACK_WIND (frame, quota_rmdir_cbk,
-		    FIRST_CHILD(this),
-		    FIRST_CHILD(this)->fops->rmdir,
-		    &local->loc);
+        ctx->buf = *buf;
+    }
+unlock:
+    UNLOCK(&ctx->lock);
 
-	return 0;
+out:
+    QUOTA_STACK_UNWIND(rename, frame, op_ret, op_errno, buf, preoldparent,
+                       postoldparent, prenewparent, postnewparent, xdata);
+
+    return 0;
 }
 
+/*
+ * quota_rename_helper - function handed to fop_rename_stub() by
+ * quota_rename().
+ *
+ * Reads the verdict stored in frame->local: when local->op_ret is -1 the
+ * rename is unwound with local->op_errno, otherwise it is wound to the
+ * first child with quota_rename_cbk.  A missing local unwinds with EINVAL.
+ * Always returns 0.
+ */
+int32_t
+quota_rename_helper(call_frame_t *frame, xlator_t *this, loc_t *oldloc,
+                    loc_t *newloc, dict_t *xdata)
+{
+    quota_local_t *local = NULL;
+    int32_t op_errno = EINVAL;
 
-int
-quota_rmdir (call_frame_t *frame, xlator_t *this, loc_t *loc)
+    local = frame->local;
+
+    GF_VALIDATE_OR_GOTO("quota", local, unwind);
+
+    /* picked up unconditionally; only used on the unwind path */
+    op_errno = local->op_errno;
+
+    if (local->op_ret == -1) {
+        goto unwind;
+    }
+
+    STACK_WIND(frame, quota_rename_cbk, FIRST_CHILD(this),
+               FIRST_CHILD(this)->fops->rename, oldloc, newloc, xdata);
+
+    return 0;
+
+unwind:
+    QUOTA_STACK_UNWIND(rename, frame, -1, op_errno, NULL, NULL, NULL, NULL,
+                       NULL, NULL);
+    return 0;
+}
+
+/*
+ * quota_rename_get_size_cbk - callback passed to quota_validate() by
+ * quota_rename_continue() when a directory is renamed.
+ *
+ * Extracts the value stored under QUOTA_SIZE_KEY in xdata, converts it with
+ * ntoh64() into local->delta, sets local->object_delta to 1 and starts
+ * quota_check_limit() on the new parent.
+ *
+ * Any failure is reported through quota_handle_validate_error() with
+ * op_ret -1: the errno received from the child when the call itself failed,
+ * EINVAL when `this`, xdata or the size key is missing.  Always returns 0.
+ */
+static int32_t
+quota_rename_get_size_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+                          int32_t op_ret, int32_t op_errno, inode_t *inode,
+                          struct iatt *buf, dict_t *xdata,
+                          struct iatt *postparent)
+{
+    quota_local_t *local = NULL;
+    int32_t ret = 0;
+    int64_t *size = NULL;
+
+    GF_ASSERT(frame);
+    GF_VALIDATE_OR_GOTO_WITH_ERROR("quota", this, out, op_errno, EINVAL);
+    local = frame->local;
+    GF_ASSERT(local);
+    local->link_count = 1;
+
+    /* Check the result first so a failed call keeps its own errno instead
+     * of being masked by the xdata validation below. */
+    if (op_ret < 0)
+        goto out;
+
+    GF_VALIDATE_OR_GOTO_WITH_ERROR(this->name, xdata, out, op_errno, EINVAL);
+
+    ret = dict_get_bin(xdata, QUOTA_SIZE_KEY, (void **)&size);
+    if (ret < 0) {
+        gf_msg(this->name, GF_LOG_WARNING, EINVAL, Q_MSG_SIZE_KEY_MISSING,
+               "size key not present in dict");
+        op_errno = EINVAL;
+        goto out;
+    }
+    local->delta = ntoh64(*size);
+    local->object_delta = 1;
+    quota_check_limit(frame, local->newloc.parent, this);
+    return 0;
+
+out:
+    quota_handle_validate_error(frame, -1, op_errno);
+    return 0;
+}
+
+/*
+ * quota_rename_continue - continuation registered by quota_rename() in
+ * local->fop_continue_cbk.
+ *
+ * Unwinds the rename with local->op_errno when local->op_ret is negative,
+ * or with ESTALE when no common ancestor of the two parents can be found.
+ * Otherwise the common ancestor is stored in local and the size to enforce
+ * is worked out:
+ *   - regular/link file: ctx->buf.ia_blocks * 512 from the inode's quota
+ *     context (0 when there is no context), then quota_check_limit() on
+ *     the new parent;
+ *   - directory: quota_validate() is started with
+ *     quota_rename_get_size_cbk and this function returns;
+ *   - any other type: quota_check_limit() is called without delta being
+ *     set here.
+ */
+void
+quota_rename_continue(call_frame_t *frame)
 {
-	struct quota_local *local = NULL;
-	struct quota_priv  *priv = NULL;
+    int32_t ret = -1;
+    int32_t op_errno = EIO;
+    quota_local_t *local = NULL;
+    uuid_t common_ancestor = {0};
+    xlator_t *this = NULL;
+    quota_inode_ctx_t *ctx = NULL;
+
+    local = frame->local;
+    this = THIS;
+
+    if (local->op_ret < 0) {
+        op_errno = local->op_errno;
+        goto err;
+    }
+
+    ret = quota_find_common_ancestor(local->oldloc.parent, local->newloc.parent,
+                                     &common_ancestor);
+    if (ret < 0 || gf_uuid_is_null(common_ancestor)) {
+        gf_msg(this->name, GF_LOG_ERROR, ESTALE, Q_MSG_ANCESTRY_BUILD_FAILED,
+               "failed to get "
+               "common_ancestor for %s and %s",
+               local->oldloc.path, local->newloc.path);
+        op_errno = ESTALE;
+        goto err;
+    }
+
+    LOCK(&local->lock);
+    {
+        local->link_count = 1;
+        gf_uuid_copy(local->common_ancestor, common_ancestor);
+    }
+    UNLOCK(&local->lock);
+
+    if (QUOTA_REG_OR_LNK_FILE(local->oldloc.inode->ia_type)) {
+        /* only ctx is examined below; the return value is not */
+        ret = quota_inode_ctx_get(local->oldloc.inode, this, &ctx, 0);
+        if (ctx == NULL) {
+            gf_msg(this->name, GF_LOG_WARNING, 0, Q_MSG_INODE_CTX_GET_FAILED,
+                   "quota context not set in inode (gfid:%s), "
+                   "considering file size as zero while enforcing "
+                   "quota on new ancestry",
+                   uuid_utoa(local->oldloc.inode->gfid));
+
+            local->delta = 0;
+            local->object_delta = 1;
+        } else {
+            /* FIXME: We need to account for the size occupied by
+             * this inode on the target directory. To avoid double
+             * accounting, we need to modify enforcer to perform
+             * quota_check_limit only up till the least common
+             * ancestor directory inode*/
+
+            /* FIXME: The following code assumes that regular files
+             * and link files are present, in their entirety, in a
+             * single brick. This *assumption is invalid in the
+             * case of stripe.*/
+
+            local->delta = ctx->buf.ia_blocks * 512;
+            local->object_delta = 1;
+        }
 
-	priv = this->private;
+    } else if (IA_ISDIR(local->oldloc.inode->ia_type)) {
+        ret = quota_validate(frame, local->oldloc.inode, this,
+                             quota_rename_get_size_cbk);
+        if (ret) {
+            /* a non-zero return is negated to obtain the errno */
+            op_errno = -ret;
+            goto err;
+        }
 
-	if (priv->disk_usage_limit) {
-		local = CALLOC (1, sizeof (struct quota_local));
-		frame->local = local;
+        return;
+    }
 
-		loc_copy (&local->loc, loc);
+    quota_check_limit(frame, local->newloc.parent, this);
+    return;
 
-		STACK_WIND (frame, quota_rmdir_stat_cbk,
-			    FIRST_CHILD(this),
-			    FIRST_CHILD(this)->fops->stat, loc);
-		return 0;
-	}
+err:
+    QUOTA_STACK_UNWIND(rename, frame, -1, op_errno, NULL, NULL, NULL, NULL,
+                       NULL, NULL);
+    return;
+}
 
-	STACK_WIND (frame, quota_rmdir_cbk,
-		    FIRST_CHILD(this),
-		    FIRST_CHILD(this)->fops->rmdir,
-		    loc);
-	return 0;
+/*
+ * quota_rename - rename fop entry point of the quota translator.
+ *
+ * Quota off: tail-wind to the first child.  Quota on: attach a fresh
+ * quota_local_t holding copies of both locs.  A rename whose parents share
+ * one gfid is wound directly with quota_rename_cbk; any other rename parks
+ * a stub resuming in quota_rename_helper, registers quota_rename_continue
+ * and requests ancestry for both parents.
+ *
+ * Always returns 0; allocation/copy failures unwind with -1/ENOMEM.
+ */
+int32_t
+quota_rename(call_frame_t *frame, xlator_t *this, loc_t *oldloc, loc_t *newloc,
+             dict_t *xdata)
+{
+    quota_priv_t *conf = this->private;
+    quota_local_t *local = NULL;
+    call_stub_t *resume = NULL;
+
+    WIND_IF_QUOTAOFF(conf->is_quota_on, passthrough);
+
+    local = quota_local_new();
+    if (!local)
+        goto fail;
+
+    frame->local = local;
+
+    /* oldloc is copied first; newloc only if that succeeded */
+    if (loc_copy(&local->oldloc, oldloc) < 0 ||
+        loc_copy(&local->newloc, newloc) < 0) {
+        gf_msg(this->name, GF_LOG_WARNING, ENOMEM, Q_MSG_ENOMEM,
+               "loc_copy failed");
+        goto fail;
+    }
+
+    /* Same source and destination parent: no limit check is needed */
+    if (oldloc->parent && newloc->parent &&
+        gf_uuid_compare(oldloc->parent->gfid, newloc->parent->gfid) == 0) {
+        gf_msg_debug(this->name, 0,
+                     "rename %s -> %s are "
+                     "in the same directory, so skip check limit",
+                     oldloc->path, newloc->path);
+        goto same_dir;
+    }
+
+    resume = fop_rename_stub(frame, quota_rename_helper, oldloc, newloc, xdata);
+    if (!resume)
+        goto fail;
+
+    LOCK(&local->lock);
+    {
+        local->fop_continue_cbk = quota_rename_continue;
+        local->stub = resume;
+        /* number of check_ancestory() requests that must complete
+         * before the fop continues
+         */
+        local->link_count = 2;
+    }
+    UNLOCK(&local->lock);
+
+    check_ancestory(frame, newloc->parent);
+    check_ancestory(frame, oldloc->parent);
+    return 0;
+
+same_dir:
+    STACK_WIND(frame, quota_rename_cbk, FIRST_CHILD(this),
+               FIRST_CHILD(this)->fops->rename, oldloc, newloc, xdata);
+    return 0;
+
+passthrough:
+    STACK_WIND_TAIL(frame, FIRST_CHILD(this), FIRST_CHILD(this)->fops->rename,
+                    oldloc, newloc, xdata);
+    return 0;
+
+fail:
+    QUOTA_STACK_UNWIND(rename, frame, -1, ENOMEM, NULL, NULL, NULL, NULL, NULL,
+                       NULL);
+    return 0;
 }
 
+/*
+ * quota_symlink_cbk - callback of the symlink wound by the quota
+ * translator.
+ *
+ * On success the quota context of local->loc.inode is fetched (the last
+ * argument of quota_inode_ctx_get() is 1 here, 0 in the read-only
+ * callbacks), *buf is cached in it and a dentry for the new name/parent is
+ * added under ctx->lock.
+ *
+ * The fop is unwound with the values received from the child, except that
+ * a failure to allocate the dentry replaces op_ret/op_errno with
+ * -1/ENOMEM.  Always returns 0.
+ */
+int32_t
+quota_symlink_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+                  int32_t op_ret, int32_t op_errno, inode_t *inode,
+                  struct iatt *buf, struct iatt *preparent,
+                  struct iatt *postparent, dict_t *xdata)
+{
+    quota_local_t *local = NULL;
+    quota_inode_ctx_t *ctx = NULL;
+    quota_dentry_t *dentry = NULL;
+    int32_t ret = -1;
+
+    if (op_ret < 0) {
+        goto out;
+    }
+
+    local = frame->local;
+
+    /* same guard as the other callbacks: never dereference a NULL local */
+    GF_VALIDATE_OR_GOTO("quota", local, out);
+
+    ret = quota_inode_ctx_get(local->loc.inode, this, &ctx, 1);
+    if ((ret == -1) || (ctx == NULL)) {
+        gf_msg_debug(this->name, 0,
+                     "quota context is NULL on inode"
+                     " (%s). If quota is not enabled recently and "
+                     "crawler has finished crawling, its an error",
+                     uuid_utoa(local->loc.inode->gfid));
+
+        goto out;
+    }
+
+    LOCK(&ctx->lock);
+    {
+        ctx->buf = *buf;
+
+        dentry = __quota_dentry_new(ctx, (char *)local->loc.name,
+                                    local->loc.parent->gfid);
+        if (dentry == NULL) {
+            gf_msg(this->name, GF_LOG_WARNING, ENOMEM, Q_MSG_ENOMEM,
+                   "cannot create "
+                   "a new dentry (name:%s) for inode(gfid:%s)",
+                   local->loc.name, uuid_utoa(local->loc.inode->gfid));
+            op_ret = -1;
+            op_errno = ENOMEM;
+        }
+    }
+    UNLOCK(&ctx->lock);
+
+out:
+    QUOTA_STACK_UNWIND(symlink, frame, op_ret, op_errno, inode, buf, preparent,
+                       postparent, xdata);
+
+    return 0;
+}
 
+/*
+ * quota_symlink_helper - function handed to fop_symlink_stub() by
+ * quota_symlink().
+ *
+ * When local->op_ret is -1 the symlink is unwound with local->op_errno;
+ * otherwise it is wound to the first child with quota_symlink_cbk.  A
+ * missing local unwinds with EINVAL.  Always returns 0.
+ */
 int
-quota_symlink_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
-		   int32_t op_ret, int32_t op_errno, inode_t *inode,
-		   struct stat *buf)
+quota_symlink_helper(call_frame_t *frame, xlator_t *this, const char *linkpath,
+                     loc_t *loc, mode_t umask, dict_t *xdata)
 {
-	struct quota_priv *priv = NULL;
+    quota_local_t *local = NULL;
+    int32_t op_errno = EINVAL;
 
-	priv = this->private;
+    local = frame->local;
 
-	if ((op_ret >= 0) && priv->disk_usage_limit) {
-		gf_quota_usage_add (this, buf->st_blocks * 512);
-	}
+    GF_VALIDATE_OR_GOTO("quota", local, unwind);
 
-	STACK_UNWIND (frame, op_ret, op_errno, inode, buf);
-	return 0;
-}
+    if (local->op_ret == -1) {
+        op_errno = local->op_errno;
+        goto unwind;
+    }
 
+    STACK_WIND(frame, quota_symlink_cbk, FIRST_CHILD(this),
+               FIRST_CHILD(this)->fops->symlink, linkpath, loc, umask, xdata);
+    return 0;
+
+unwind:
+    QUOTA_STACK_UNWIND(symlink, frame, -1, op_errno, NULL, NULL, NULL, NULL,
+                       NULL);
+    return 0;
+}
 
+/*
+ * quota_symlink - symlink fop entry point of the quota translator.
+ *
+ * Quota off: tail-wind to the first child.  Quota on: attach a
+ * quota_local_t with a copy of loc, park a stub resuming in
+ * quota_symlink_helper and start quota_check_limit() on loc->parent with
+ * delta = strlen(linkpath) and object_delta = 1.
+ *
+ * Always returns 0; allocation/copy failures unwind with -1/ENOMEM.
+ */
 int
-quota_symlink (call_frame_t *frame, xlator_t *this,
-	       const char *linkpath, loc_t *loc)
+quota_symlink(call_frame_t *frame, xlator_t *this, const char *linkpath,
+              loc_t *loc, mode_t umask, dict_t *xdata)
 {
-	struct quota_priv *priv = NULL;
+    quota_priv_t *priv = NULL;
+    int32_t ret = -1;
+    int32_t op_errno = ENOMEM;
+    quota_local_t *local = NULL;
+    call_stub_t *stub = NULL;
+
+    priv = this->private;
+
+    WIND_IF_QUOTAOFF(priv->is_quota_on, off);
+
+    local = quota_local_new();
+    if (local == NULL) {
+        goto err;
+    }
+
+    frame->local = local;
+
+    ret = loc_copy(&local->loc, loc);
+    if (ret < 0) {
+        gf_msg(this->name, GF_LOG_WARNING, ENOMEM, Q_MSG_ENOMEM,
+               "loc_copy failed");
+        goto err;
+    }
+
+    stub = fop_symlink_stub(frame, quota_symlink_helper, linkpath, loc, umask,
+                            xdata);
+    if (stub == NULL) {
+        goto err;
+    }
+
+    LOCK(&local->lock);
+    {
+        local->stub = stub;
+        /* the length of the target path is what gets charged */
+        local->delta = strlen(linkpath);
+        local->object_delta = 1;
+        local->link_count = 1;
+    }
+    UNLOCK(&local->lock);
+
+    quota_check_limit(frame, loc->parent, this);
+    return 0;
+
+err:
+    QUOTA_STACK_UNWIND(symlink, frame, -1, op_errno, NULL, NULL, NULL, NULL,
+                       NULL);
+
+    return 0;
+
+off:
+    STACK_WIND_TAIL(frame, FIRST_CHILD(this), FIRST_CHILD(this)->fops->symlink,
+                    linkpath, loc, umask, xdata);
+    return 0;
+}
 
-	priv = this->private;
+/*
+ * quota_truncate_cbk - callback of the truncate wound by the quota
+ * translator.
+ *
+ * After a successful truncate, *postbuf is cached in the quota context of
+ * local->loc.inode (under ctx->lock) when such a context exists; a missing
+ * context is only logged at debug level.  The received values are then
+ * unwound unchanged.  Always returns 0.
+ */
+int32_t
+quota_truncate_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+                   int32_t op_ret, int32_t op_errno, struct iatt *prebuf,
+                   struct iatt *postbuf, dict_t *xdata)
+{
+    quota_inode_ctx_t *qctx = NULL;
+    quota_local_t *local = NULL;
+
+    if (op_ret >= 0) {
+        local = frame->local;
+        GF_VALIDATE_OR_GOTO("quota", local, unwind);
+
+        quota_inode_ctx_get(local->loc.inode, this, &qctx, 0);
+        if (qctx != NULL) {
+            LOCK(&qctx->lock);
+            {
+                qctx->buf = *postbuf;
+            }
+            UNLOCK(&qctx->lock);
+        } else {
+            gf_msg_debug(this->name, 0,
+                         "quota context is NULL on inode"
+                         " (%s). If quota is not enabled recently and "
+                         "crawler has finished crawling, its an error",
+                         uuid_utoa(local->loc.inode->gfid));
+        }
+    }
+
+unwind:
+    QUOTA_STACK_UNWIND(truncate, frame, op_ret, op_errno, prebuf, postbuf,
+                       xdata);
+    return 0;
+}
 
-	if (gf_quota_check_free_disk (this) == -1) {
-		gf_log (this->name, GF_LOG_ERROR, 
-			"min-free-disk limit (%u) crossed, current available is %u",
-			priv->min_free_disk_limit, priv->current_free_disk);
-		STACK_UNWIND (frame, -1, ENOSPC, NULL, NULL);
-		return 0;
-		
-	}
-        if (priv->current_disk_usage > priv->disk_usage_limit) {
-		gf_log (this->name, GF_LOG_ERROR, 
-			"Disk usage limit (%"PRIu64") crossed, current usage is %"PRIu64"",
-			priv->disk_usage_limit, priv->current_disk_usage);
-		STACK_UNWIND (frame, -1, ENOSPC, NULL, NULL);
-		return 0;
+/*
+ * quota_truncate - truncate fop entry point of the quota translator.
+ *
+ * Quota off: tail-wind to the first child.  Quota on: attach a
+ * quota_local_t with a copy of loc and wind with quota_truncate_cbk.
+ * Always returns 0; allocation/copy failures unwind with -1/ENOMEM.
+ */
+int32_t
+quota_truncate(call_frame_t *frame, xlator_t *this, loc_t *loc, off_t offset,
+               dict_t *xdata)
+{
+    quota_priv_t *conf = this->private;
+    quota_local_t *local = NULL;
+
+    WIND_IF_QUOTAOFF(conf->is_quota_on, passthrough);
+
+    local = quota_local_new();
+    if (!local)
+        goto nomem;
+
+    frame->local = local;
+
+    if (loc_copy(&local->loc, loc) < 0) {
+        gf_msg(this->name, GF_LOG_WARNING, ENOMEM, Q_MSG_ENOMEM,
+               "loc_copy failed");
+        goto nomem;
+    }
+
+    STACK_WIND(frame, quota_truncate_cbk, FIRST_CHILD(this),
+               FIRST_CHILD(this)->fops->truncate, loc, offset, xdata);
+    return 0;
+
+passthrough:
+    STACK_WIND_TAIL(frame, FIRST_CHILD(this), FIRST_CHILD(this)->fops->truncate,
+                    loc, offset, xdata);
+    return 0;
+
+nomem:
+    QUOTA_STACK_UNWIND(truncate, frame, -1, ENOMEM, NULL, NULL, NULL);
+    return 0;
+}
+
+/*
+ * quota_ftruncate_cbk - callback of the ftruncate wound by the quota
+ * translator.
+ *
+ * After a successful ftruncate, *postbuf is cached in the quota context of
+ * local->loc.inode (under ctx->lock) when such a context exists; a missing
+ * context is only logged at debug level.  The received values are then
+ * unwound unchanged.  Always returns 0.
+ */
+int32_t
+quota_ftruncate_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+                    int32_t op_ret, int32_t op_errno, struct iatt *prebuf,
+                    struct iatt *postbuf, dict_t *xdata)
+{
+    quota_inode_ctx_t *qctx = NULL;
+    quota_local_t *local = NULL;
+
+    if (op_ret >= 0) {
+        local = frame->local;
+        GF_VALIDATE_OR_GOTO("quota", local, unwind);
+
+        quota_inode_ctx_get(local->loc.inode, this, &qctx, 0);
+        if (qctx != NULL) {
+            LOCK(&qctx->lock);
+            {
+                qctx->buf = *postbuf;
+            }
+            UNLOCK(&qctx->lock);
+        } else {
+            gf_msg_debug(this->name, 0,
+                         "quota context is NULL on inode"
+                         " (%s). If quota is not enabled recently and "
+                         "crawler has finished crawling, its an error",
+                         uuid_utoa(local->loc.inode->gfid));
+        }
+    }
+
+unwind:
+    QUOTA_STACK_UNWIND(ftruncate, frame, op_ret, op_errno, prebuf, postbuf,
+                       xdata);
+    return 0;
+}
+
+/*
+ * quota_ftruncate - ftruncate fop entry point of the quota translator.
+ *
+ * Quota off: tail-wind to the first child.  Quota on: attach a
+ * quota_local_t whose loc.inode holds a reference on fd->inode and wind
+ * with quota_ftruncate_cbk.  Always returns 0; a failed allocation unwinds
+ * with -1/ENOMEM.
+ */
+int32_t
+quota_ftruncate(call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset,
+                dict_t *xdata)
+{
+    quota_priv_t *conf = this->private;
+    quota_local_t *local = NULL;
+
+    WIND_IF_QUOTAOFF(conf->is_quota_on, passthrough);
+
+    local = quota_local_new();
+    if (!local) {
+        QUOTA_STACK_UNWIND(ftruncate, frame, -1, ENOMEM, NULL, NULL, NULL);
+        return 0;
+    }
+
+    frame->local = local;
+    local->loc.inode = inode_ref(fd->inode);
+
+    STACK_WIND(frame, quota_ftruncate_cbk, FIRST_CHILD(this),
+               FIRST_CHILD(this)->fops->ftruncate, fd, offset, xdata);
+    return 0;
+
+passthrough:
+    STACK_WIND_TAIL(frame, FIRST_CHILD(this),
+                    FIRST_CHILD(this)->fops->ftruncate, fd, offset, xdata);
+    return 0;
+}
+
+/*
+ * quota_send_dir_limit_to_cli - answer a getxattr request with the quota
+ * figures of an inode.
+ *
+ * Builds a one-entry dict keyed by `name` (length `namelen`).  The value is
+ * "<size>,<hard_lim>" taken from the inode's context, or a fixed notice
+ * when quota is disabled, and the getxattr is unwound with that dict.
+ *
+ * Returns 0 once the frame has been unwound; a negative value when nothing
+ * was unwound (no inode context, dict allocation or dict_set failure), in
+ * which case the frame is left untouched for the caller.
+ */
+static int32_t
+quota_send_dir_limit_to_cli(call_frame_t *frame, xlator_t *this, inode_t *inode,
+                            const char *name, const int namelen)
+{
+    quota_priv_t *conf = this->private;
+    quota_inode_ctx_t *qctx = NULL;
+    dict_t *reply = NULL;
+    uint64_t raw = 0;
+    char text[64] = {0};
+    int text_len = 0;
+    int32_t rc = 0;
+
+    if (conf->is_quota_on) {
+        rc = inode_ctx_get(inode, this, &raw);
+        if (rc < 0)
+            return rc;
+
+        qctx = (quota_inode_ctx_t *)(unsigned long)raw;
+        text_len = snprintf(text, sizeof(text), "%" PRId64 ",%" PRId64,
+                            qctx->size, qctx->hard_lim);
+    } else {
+        text_len = snprintf(text, sizeof(text),
+                            "Quota is disabled please turn on");
+    }
+
+    reply = dict_new();
+    if (reply == NULL)
+        return -1;
+
+    rc = dict_set_nstrn(reply, (char *)name, namelen, text, text_len);
+    if (rc >= 0) {
+        gf_msg_debug(this->name, 0, "str = %s", text);
+
+        QUOTA_STACK_UNWIND(getxattr, frame, 0, 0, reply, NULL);
+        rc = 0;
+    }
+
+    /* this function's own reference is dropped on success and failure */
+    dict_unref(reply);
+    return rc;
+}
+
+/*
+ * quota_fgetxattr - fgetxattr fop entry point of the quota translator.
+ *
+ * A request for "trusted.limit.list" (compared case-insensitively) is
+ * answered by quota_send_dir_limit_to_cli() for fd->inode.  Every other
+ * request, and a limit request the helper could not answer, is wound to the
+ * first child with default_fgetxattr_cbk.  Always returns 0.
+ */
+int32_t
+quota_fgetxattr(call_frame_t *frame, xlator_t *this, fd_t *fd, const char *name,
+                dict_t *xdata)
+{
+    int32_t ret = 0;
+
+    if (name && strcasecmp(name, "trusted.limit.list") == 0) {
+        ret = quota_send_dir_limit_to_cli(frame, this, fd->inode,
+                                          "trusted.limit.list",
+                                          SLEN("trusted.limit.list"));
+        /* 0 means the helper has already unwound the frame */
+        if (ret == 0) {
+            return 0;
         }
+    }
 
-	STACK_WIND (frame, quota_symlink_cbk,
-		    FIRST_CHILD(this),
-		    FIRST_CHILD(this)->fops->symlink,
-		    linkpath, loc);
-	return 0;
+    STACK_WIND(frame, default_fgetxattr_cbk, FIRST_CHILD(this),
+               FIRST_CHILD(this)->fops->fgetxattr, fd, name, xdata);
+    return 0;
 }
 
+/*
+ * quota_getxattr - getxattr fop entry point of the quota translator.
+ *
+ * A request for "trusted.limit.list" (compared case-insensitively) is
+ * answered by quota_send_dir_limit_to_cli() for loc->inode.  Every other
+ * request, and a limit request the helper could not answer, is wound to the
+ * first child with default_getxattr_cbk.  Always returns 0.
+ */
+int32_t
+quota_getxattr(call_frame_t *frame, xlator_t *this, loc_t *loc,
+               const char *name, dict_t *xdata)
+{
+    /* the helper returns 0 only after it has unwound the frame itself */
+    if (name && !strcasecmp(name, "trusted.limit.list") &&
+        quota_send_dir_limit_to_cli(frame, this, loc->inode,
+                                    "trusted.limit.list",
+                                    SLEN("trusted.limit.list")) == 0) {
+        return 0;
+    }
+
+    STACK_WIND(frame, default_getxattr_cbk, FIRST_CHILD(this),
+               FIRST_CHILD(this)->fops->getxattr, loc, name, xdata);
+    return 0;
+}
 
-int
-quota_create_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
-		  int32_t op_ret, int32_t op_errno,
-		  fd_t *fd, inode_t *inode, struct stat *buf)
+/*
+ * quota_stat_cbk - callback of the stat wound by the quota translator.
+ *
+ * After a successful stat, *buf is cached in the quota context of
+ * local->loc.inode (under ctx->lock) when both the context and buf exist.
+ * A missing context is logged at debug level unless buf says the inode is a
+ * directory.  The received values are then unwound unchanged.
+ * Always returns 0.
+ */
+int32_t
+quota_stat_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+               int32_t op_ret, int32_t op_errno, struct iatt *buf,
+               dict_t *xdata)
 {
-	struct quota_priv *priv = this->private;
-	int                ret = 0;
+    quota_local_t *local = NULL;
+    quota_inode_ctx_t *ctx = NULL;
+
+    if (op_ret < 0) {
+        goto out;
+    }
+
+    local = frame->local;
 
-	if ((op_ret >= 0) && priv->disk_usage_limit) {
-		gf_quota_usage_add (this, buf->st_blocks * 512);
+    GF_VALIDATE_OR_GOTO("quota", local, out);
 
-		ret = fd_ctx_set (fd, this, 1);
-	}
+    quota_inode_ctx_get(local->loc.inode, this, &ctx, 0);
+    if (ctx == NULL) {
+        /* buf is NULL-checked further down, so guard it here as well */
+        if (!buf || !IA_ISDIR(buf->ia_type)) {
+            gf_msg_debug(this->name, 0,
+                         "quota context is NULL on inode"
+                         " (%s). If quota is not enabled recently and "
+                         "crawler has finished crawling, its an error",
+                         uuid_utoa(local->loc.inode->gfid));
+        }
+
+        goto out;
+    }
+
+    if (buf) {
+        LOCK(&ctx->lock);
+        ctx->buf = *buf;
+        UNLOCK(&ctx->lock);
+    }
 
-	STACK_UNWIND (frame, op_ret, op_errno, fd, inode, buf);
-	return 0;
+out:
+    QUOTA_STACK_UNWIND(stat, frame, op_ret, op_errno, buf, xdata);
+    return 0;
 }
 
+/*
+ * quota_stat - stat fop entry point of the quota translator.
+ *
+ * Quota off: tail-wind to the first child.  Quota on: attach a
+ * quota_local_t with a copy of loc and wind with quota_stat_cbk.
+ * Always returns 0; allocation/copy failures unwind with -1/ENOMEM.
+ */
+int32_t
+quota_stat(call_frame_t *frame, xlator_t *this, loc_t *loc, dict_t *xdata)
+{
+    quota_priv_t *priv = NULL;
+    quota_local_t *local = NULL;
+    int32_t ret = -1;
 
-int
-quota_create (call_frame_t *frame, xlator_t *this,
-	      loc_t *loc, int32_t flags, mode_t mode, fd_t *fd)
+    priv = this->private;
+
+    WIND_IF_QUOTAOFF(priv->is_quota_on, off);
+
+    local = quota_local_new();
+    if (local == NULL) {
+        goto unwind;
+    }
+
+    frame->local = local;
+    /* the callback reads the inode from this copy */
+    ret = loc_copy(&local->loc, loc);
+    if (ret < 0) {
+        gf_msg(this->name, GF_LOG_WARNING, ENOMEM, Q_MSG_ENOMEM,
+               "loc_copy failed");
+        goto unwind;
+    }
+
+    STACK_WIND(frame, quota_stat_cbk, FIRST_CHILD(this),
+               FIRST_CHILD(this)->fops->stat, loc, xdata);
+    return 0;
+
+unwind:
+    QUOTA_STACK_UNWIND(stat, frame, -1, ENOMEM, NULL, NULL);
+    return 0;
+
+off:
+    STACK_WIND_TAIL(frame, FIRST_CHILD(this), FIRST_CHILD(this)->fops->stat,
+                    loc, xdata);
+    return 0;
+}
+
+/*
+ * quota_fstat_cbk - callback of the fstat wound by the quota translator.
+ *
+ * After a successful fstat, *buf is cached in the quota context of
+ * local->loc.inode (under ctx->lock) when both the context and buf exist.
+ * A missing context is logged at debug level unless buf says the inode is a
+ * directory.  The received values are then unwound unchanged.
+ * Always returns 0.
+ */
+int32_t
+quota_fstat_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+                int32_t op_ret, int32_t op_errno, struct iatt *buf,
+                dict_t *xdata)
 {
-	struct quota_priv *priv = NULL;
+    quota_local_t *local = NULL;
+    quota_inode_ctx_t *ctx = NULL;
 
-	priv = this->private;
+    if (op_ret < 0) {
+        goto out;
+    }
 
-	if (gf_quota_check_free_disk (this) == -1) {
-		gf_log (this->name, GF_LOG_ERROR, 
-			"min-free-disk limit (%u) crossed, current available is %u",
-			priv->min_free_disk_limit, priv->current_free_disk);
-		STACK_UNWIND (frame, -1, ENOSPC, NULL, NULL, NULL);
-		return 0;
-		
-	}
-        if (priv->current_disk_usage > priv->disk_usage_limit) {
-		gf_log (this->name, GF_LOG_ERROR, 
-			"Disk usage limit (%"PRIu64") crossed, current usage is %"PRIu64"",
-			priv->disk_usage_limit, priv->current_disk_usage);
-		STACK_UNWIND (frame, -1, ENOSPC, NULL, NULL, NULL);
-		return 0;
+    local = frame->local;
+
+    GF_VALIDATE_OR_GOTO("quota", local, out);
+
+    quota_inode_ctx_get(local->loc.inode, this, &ctx, 0);
+    if (ctx == NULL) {
+        /* buf is NULL-checked further down, so guard it here as well */
+        if (!buf || !IA_ISDIR(buf->ia_type)) {
+            gf_msg_debug(this->name, 0,
+                         "quota context is NULL on inode"
+                         " (%s). If quota is not enabled recently and "
+                         "crawler has finished crawling, its an error",
+                         uuid_utoa(local->loc.inode->gfid));
         }
 
-	STACK_WIND (frame, quota_create_cbk,
-		    FIRST_CHILD(this),
-		    FIRST_CHILD(this)->fops->create,
-		    loc, flags, mode, fd);
-	return 0;
+        goto out;
+    }
+
+    if (buf) {
+        LOCK(&ctx->lock);
+        ctx->buf = *buf;
+        UNLOCK(&ctx->lock);
+    }
+
+out:
+    QUOTA_STACK_UNWIND(fstat, frame, op_ret, op_errno, buf, xdata);
+    return 0;
+}
+
+/*
+ * quota_fstat - fstat fop entry point of the quota translator.
+ *
+ * Quota off: tail-wind to the first child.  Quota on: attach a
+ * quota_local_t whose loc.inode holds a reference on fd->inode and wind
+ * with quota_fstat_cbk.  Always returns 0; a failed allocation unwinds with
+ * -1/ENOMEM.
+ */
+int32_t
+quota_fstat(call_frame_t *frame, xlator_t *this, fd_t *fd, dict_t *xdata)
+{
+    quota_priv_t *conf = this->private;
+    quota_local_t *local = NULL;
+
+    WIND_IF_QUOTAOFF(conf->is_quota_on, passthrough);
+
+    local = quota_local_new();
+    if (!local) {
+        QUOTA_STACK_UNWIND(fstat, frame, -1, ENOMEM, NULL, NULL);
+        return 0;
+    }
+
+    frame->local = local;
+    local->loc.inode = inode_ref(fd->inode);
+
+    STACK_WIND(frame, quota_fstat_cbk, FIRST_CHILD(this),
+               FIRST_CHILD(this)->fops->fstat, fd, xdata);
+    return 0;
+
+passthrough:
+    STACK_WIND_TAIL(frame, FIRST_CHILD(this), FIRST_CHILD(this)->fops->fstat,
+                    fd, xdata);
+    return 0;
 }
 
+/*
+ * quota_readlink_cbk - callback of the readlink wound by the quota
+ * translator.
+ *
+ * After a successful readlink, *buf is cached in the quota context of
+ * local->loc.inode (under ctx->lock) when such a context exists; a missing
+ * context is only logged at debug level.  The received values are then
+ * unwound unchanged.  Always returns 0.
+ */
+int32_t
+quota_readlink_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+                   int32_t op_ret, int32_t op_errno, const char *path,
+                   struct iatt *buf, dict_t *xdata)
+{
+    quota_inode_ctx_t *qctx = NULL;
+    quota_local_t *local = NULL;
+
+    if (op_ret >= 0) {
+        local = frame->local;
+        GF_VALIDATE_OR_GOTO("quota", local, unwind);
+
+        quota_inode_ctx_get(local->loc.inode, this, &qctx, 0);
+        if (qctx != NULL) {
+            LOCK(&qctx->lock);
+            {
+                qctx->buf = *buf;
+            }
+            UNLOCK(&qctx->lock);
+        } else {
+            gf_msg_debug(this->name, 0,
+                         "quota context is NULL on inode"
+                         " (%s). If quota is not enabled recently and "
+                         "crawler has finished crawling, its an error",
+                         uuid_utoa(local->loc.inode->gfid));
+        }
+    }
+
+unwind:
+    QUOTA_STACK_UNWIND(readlink, frame, op_ret, op_errno, path, buf, xdata);
+    return 0;
+}
 
-int
-quota_open_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
-		int32_t op_ret, int32_t op_errno, fd_t *fd)
+/*
+ * quota_readlink - readlink fop entry point of the quota translator.
+ *
+ * Quota off: tail-wind to the first child.  Quota on: attach a
+ * quota_local_t with a copy of loc and wind with quota_readlink_cbk.
+ * Always returns 0; allocation/copy failures unwind with -1/ENOMEM.
+ */
+int32_t
+quota_readlink(call_frame_t *frame, xlator_t *this, loc_t *loc, size_t size,
+               dict_t *xdata)
+{
+    quota_priv_t *conf = this->private;
+    quota_local_t *local = NULL;
+
+    WIND_IF_QUOTAOFF(conf->is_quota_on, passthrough);
+
+    local = quota_local_new();
+    if (!local)
+        goto nomem;
+
+    frame->local = local;
+
+    if (loc_copy(&local->loc, loc) < 0) {
+        gf_msg(this->name, GF_LOG_WARNING, ENOMEM, Q_MSG_ENOMEM,
+               "loc_copy failed");
+        goto nomem;
+    }
+
+    STACK_WIND(frame, quota_readlink_cbk, FIRST_CHILD(this),
+               FIRST_CHILD(this)->fops->readlink, loc, size, xdata);
+    return 0;
+
+passthrough:
+    STACK_WIND_TAIL(frame, FIRST_CHILD(this), FIRST_CHILD(this)->fops->readlink,
+                    loc, size, xdata);
+    return 0;
+
+nomem:
+    QUOTA_STACK_UNWIND(readlink, frame, -1, ENOMEM, NULL, NULL, NULL);
+    return 0;
+}
+
+/*
+ * quota_readv_cbk - callback of the readv wound by the quota translator.
+ *
+ * After a successful read, *buf is cached in the quota context of
+ * local->loc.inode (under ctx->lock) when such a context exists; a missing
+ * context is only logged at debug level.  The received values are then
+ * unwound unchanged.  Always returns 0.
+ */
+int32_t
+quota_readv_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+                int32_t op_ret, int32_t op_errno, struct iovec *vector,
+                int32_t count, struct iatt *buf, struct iobref *iobref,
+                dict_t *xdata)
 {
-	int                ret = 0;
+    quota_local_t *local = NULL;
+    quota_inode_ctx_t *ctx = NULL;
+
+    if (op_ret < 0) {
+        goto out;
+    }
+
+    local = frame->local;
+
+    GF_VALIDATE_OR_GOTO("quota", local, out);
+
+    /* only ctx is examined; the return value is ignored */
+    quota_inode_ctx_get(local->loc.inode, this, &ctx, 0);
+    if (ctx == NULL) {
+        gf_msg_debug(this->name, 0,
+                     "quota context is NULL on inode"
+                     " (%s). If quota is not enabled recently and "
+                     "crawler has finished crawling, its an error",
+                     uuid_utoa(local->loc.inode->gfid));
+        goto out;
+    }
+
+    LOCK(&ctx->lock);
+    {
+        ctx->buf = *buf;
+    }
+    UNLOCK(&ctx->lock);
+
+out:
+    QUOTA_STACK_UNWIND(readv, frame, op_ret, op_errno, vector, count, buf,
+                       iobref, xdata);
+    return 0;
+}
 
-	if (op_ret >= 0)
-		ret = fd_ctx_set (fd, this, 1);
+int32_t
+quota_readv(call_frame_t *frame, xlator_t *this, fd_t *fd, size_t size,
+            off_t offset, uint32_t flags, dict_t *xdata)
+{
+    quota_priv_t *priv = NULL;
+    quota_local_t *local = NULL;
+
+    priv = this->private;
+
+    WIND_IF_QUOTAOFF(priv->is_quota_on, off);
+
+    local = quota_local_new();
+    if (local == NULL) {
+        goto unwind;
+    }
+
+    frame->local = local;
 
-	STACK_UNWIND (frame, op_ret, op_errno, fd);
-	return 0;
+    local->loc.inode = inode_ref(fd->inode);
+
+    STACK_WIND(frame, quota_readv_cbk, FIRST_CHILD(this),
+               FIRST_CHILD(this)->fops->readv, fd, size, offset, flags, xdata);
+    return 0;
+
+unwind:
+    QUOTA_STACK_UNWIND(readv, frame, -1, ENOMEM, NULL, -1, NULL, NULL, NULL);
+    return 0;
+
+off:
+    STACK_WIND_TAIL(frame, FIRST_CHILD(this), FIRST_CHILD(this)->fops->readv,
+                    fd, size, offset, flags, xdata);
+    return 0;
 }
 
+int32_t
+quota_fsync_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+                int32_t op_ret, int32_t op_errno, struct iatt *prebuf,
+                struct iatt *postbuf, dict_t *xdata)
+{
+    quota_local_t *local = NULL;
+    quota_inode_ctx_t *ctx = NULL;
+
+    if (op_ret < 0) {
+        goto out;
+    }
+
+    local = frame->local;
+
+    GF_VALIDATE_OR_GOTO("quota", local, out);
+
+    quota_inode_ctx_get(local->loc.inode, this, &ctx, 0);
+    if (ctx == NULL) {
+        gf_msg_debug(this->name, 0,
+                     "quota context is NULL on inode"
+                     " (%s). If quota is not enabled recently and "
+                     "crawler has finished crawling, its an error",
+                     uuid_utoa(local->loc.inode->gfid));
+        goto out;
+    }
+
+    LOCK(&ctx->lock);
+    {
+        ctx->buf = *postbuf;
+    }
+    UNLOCK(&ctx->lock);
+
+out:
+    QUOTA_STACK_UNWIND(fsync, frame, op_ret, op_errno, prebuf, postbuf, xdata);
+    return 0;
+}
 
-int
-quota_open (call_frame_t *frame, xlator_t *this,
-	    loc_t *loc, int32_t flags, fd_t *fd)
+int32_t
+quota_fsync(call_frame_t *frame, xlator_t *this, fd_t *fd, int32_t flags,
+            dict_t *xdata)
 {
-	STACK_WIND (frame, quota_open_cbk,
-		    FIRST_CHILD(this),
-		    FIRST_CHILD(this)->fops->open,
-		    loc, flags, fd);
-	return 0;
+    quota_priv_t *priv = NULL;
+    quota_local_t *local = NULL;
+
+    priv = this->private;
+
+    WIND_IF_QUOTAOFF(priv->is_quota_on, off);
+
+    local = quota_local_new();
+    if (local == NULL) {
+        goto unwind;
+    }
+
+    local->loc.inode = inode_ref(fd->inode);
+
+    frame->local = local;
+
+    STACK_WIND(frame, quota_fsync_cbk, FIRST_CHILD(this),
+               FIRST_CHILD(this)->fops->fsync, fd, flags, xdata);
+    return 0;
+
+unwind:
+    QUOTA_STACK_UNWIND(fsync, frame, -1, ENOMEM, NULL, NULL, NULL);
+    return 0;
+
+off:
+    STACK_WIND_TAIL(frame, FIRST_CHILD(this), FIRST_CHILD(this)->fops->fsync,
+                    fd, flags, xdata);
+    return 0;
 }
 
+int32_t
+quota_setattr_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+                  int32_t op_ret, int32_t op_errno, struct iatt *statpre,
+                  struct iatt *statpost, dict_t *xdata)
+{
+    quota_local_t *local = NULL;
+    quota_inode_ctx_t *ctx = NULL;
 
-int
-quota_writev_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
-		  int32_t op_ret, int32_t op_errno, struct stat *stbuf)
+    if (op_ret < 0) {
+        goto out;
+    }
+
+    local = frame->local;
+
+    GF_VALIDATE_OR_GOTO("quota", local, out);
+
+    quota_inode_ctx_get(local->loc.inode, this, &ctx, 0);
+    if (ctx == NULL) {
+        if (!IA_ISDIR(statpost->ia_type)) {
+            gf_msg_debug(this->name, 0,
+                         "quota context is NULL on inode"
+                         " (%s). If quota is not enabled recently and "
+                         "crawler has finished crawling, its an error",
+                         uuid_utoa(local->loc.inode->gfid));
+        }
+
+        goto out;
+    }
+
+    if (statpost) {
+        LOCK(&ctx->lock);
+        ctx->buf = *statpost;
+        UNLOCK(&ctx->lock);
+    }
+
+out:
+    QUOTA_STACK_UNWIND(setattr, frame, op_ret, op_errno, statpre, statpost,
+                       xdata);
+    return 0;
+}
+
+int32_t
+quota_setattr(call_frame_t *frame, xlator_t *this, loc_t *loc,
+              struct iatt *stbuf, int32_t valid, dict_t *xdata)
 {
-	struct quota_priv *priv = NULL;
-	struct quota_local *local = NULL;
+    quota_priv_t *priv = NULL;
+    quota_local_t *local = NULL;
+    int32_t ret = -1;
+
+    priv = this->private;
 
+    WIND_IF_QUOTAOFF(priv->is_quota_on, off);
 
-	priv = this->private;
-	local = frame->local;
+    local = quota_local_new();
+    if (local == NULL) {
+        goto unwind;
+    }
 
-	if (priv->disk_usage_limit) {
-		if (op_ret >= 0) { 
-			gf_quota_usage_add (this, (stbuf->st_blocks -
-						   local->stbuf.st_blocks) * 512);
-		}
-		fd_unref (local->fd);
-		dict_unref (local->refs);
-	}
+    frame->local = local;
 
-	STACK_UNWIND (frame, op_ret, op_errno, stbuf);
-	return 0;
+    ret = loc_copy(&local->loc, loc);
+    if (ret < 0) {
+        gf_msg(this->name, GF_LOG_WARNING, ENOMEM, Q_MSG_ENOMEM,
+               "loc_copy failed");
+        goto unwind;
+    }
+
+    STACK_WIND(frame, quota_setattr_cbk, FIRST_CHILD(this),
+               FIRST_CHILD(this)->fops->setattr, loc, stbuf, valid, xdata);
+    return 0;
+
+unwind:
+    QUOTA_STACK_UNWIND(setattr, frame, -1, ENOMEM, NULL, NULL, NULL);
+    return 0;
+
+off:
+    STACK_WIND_TAIL(frame, FIRST_CHILD(this), FIRST_CHILD(this)->fops->setattr,
+                    loc, stbuf, valid, xdata);
+    return 0;
 }
 
+int32_t
+quota_fsetattr_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+                   int32_t op_ret, int32_t op_errno, struct iatt *statpre,
+                   struct iatt *statpost, dict_t *xdata)
+{
+    quota_local_t *local = NULL;
+    quota_inode_ctx_t *ctx = NULL;
 
-int
-quota_writev_fstat_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
-			int32_t op_ret,	int32_t op_errno, struct stat *buf)
+    if (op_ret < 0) {
+        goto out;
+    }
+
+    local = frame->local;
+
+    GF_VALIDATE_OR_GOTO("quota", local, out);
+
+    quota_inode_ctx_get(local->loc.inode, this, &ctx, 0);
+    if (ctx == NULL) {
+        if (!IA_ISDIR(statpost->ia_type)) {
+            gf_msg_debug(this->name, 0,
+                         "quota context is NULL on inode"
+                         " (%s). If quota is not enabled recently and "
+                         "crawler has finished crawling, its an error",
+                         uuid_utoa(local->loc.inode->gfid));
+        }
+
+        goto out;
+    }
+
+    LOCK(&ctx->lock);
+    {
+        ctx->buf = *statpost;
+    }
+    UNLOCK(&ctx->lock);
+
+out:
+    QUOTA_STACK_UNWIND(fsetattr, frame, op_ret, op_errno, statpre, statpost,
+                       xdata);
+    return 0;
+}
+
+int32_t
+quota_fsetattr(call_frame_t *frame, xlator_t *this, fd_t *fd,
+               struct iatt *stbuf, int32_t valid, dict_t *xdata)
 {
-	struct quota_local *local = NULL;
-	struct quota_priv  *priv = NULL;
-	int                 iovlen = 0;
+    quota_priv_t *priv = NULL;
+    quota_local_t *local = NULL;
+
+    priv = this->private;
 
+    WIND_IF_QUOTAOFF(priv->is_quota_on, off);
 
-	local = frame->local;
-	priv = this->private;
+    local = quota_local_new();
+    if (local == NULL) {
+        goto unwind;
+    }
 
-	if (op_ret >= 0) {
-		if (priv->current_disk_usage > priv->disk_usage_limit) {
-			iovlen = iov_length (local->vector, local->count);
+    frame->local = local;
 
-			if (iovlen > (buf->st_blksize - (buf->st_size % buf->st_blksize))) {
-				fd_unref (local->fd);
-				dict_unref (local->refs);
-				STACK_UNWIND (frame, -1, ENOSPC, NULL);
-				return 0;
-			}
-		}
-		local->stbuf = *buf;
-	}
-	
-	STACK_WIND (frame, quota_writev_cbk,
-		    FIRST_CHILD(this),
-		    FIRST_CHILD(this)->fops->writev,
-		    local->fd, local->vector, local->count, local->offset);
+    local->loc.inode = inode_ref(fd->inode);
+
+    STACK_WIND(frame, quota_fsetattr_cbk, FIRST_CHILD(this),
+               FIRST_CHILD(this)->fops->fsetattr, fd, stbuf, valid, xdata);
+    return 0;
+
+unwind:
+    QUOTA_STACK_UNWIND(fsetattr, frame, -1, ENOMEM, NULL, NULL, NULL);
+    return 0;
 
-	return 0;
+off:
+    STACK_WIND_TAIL(frame, FIRST_CHILD(this), FIRST_CHILD(this)->fops->fsetattr,
+                    fd, stbuf, valid, xdata);
+    return 0;
 }
 
+int32_t
+quota_mknod_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+                int32_t op_ret, int32_t op_errno, inode_t *inode,
+                struct iatt *buf, struct iatt *preparent,
+                struct iatt *postparent, dict_t *xdata)
+{
+    int32_t ret = -1;
+    quota_local_t *local = NULL;
+    quota_inode_ctx_t *ctx = NULL;
+    quota_dentry_t *dentry = NULL;
+
+    local = frame->local;
+    if (op_ret < 0) {
+        goto unwind;
+    }
+
+    ret = quota_inode_ctx_get(inode, this, &ctx, 1);
+    if ((ret == -1) || (ctx == NULL)) {
+        gf_msg(this->name, GF_LOG_WARNING, 0, Q_MSG_INODE_CTX_GET_FAILED,
+               "cannot create quota context in "
+               "inode(gfid:%s)",
+               uuid_utoa(inode->gfid));
+        op_ret = -1;
+        op_errno = ENOMEM;
+        goto unwind;
+    }
+
+    LOCK(&ctx->lock);
+    {
+        ctx->buf = *buf;
+
+        dentry = __quota_dentry_new(ctx, (char *)local->loc.name,
+                                    local->loc.parent->gfid);
+        if (dentry == NULL) {
+            gf_msg(this->name, GF_LOG_WARNING, ENOMEM, Q_MSG_ENOMEM,
+                   "cannot create a new dentry "
+                   "(name:%s) for inode(gfid:%s)",
+                   local->loc.name, uuid_utoa(local->loc.inode->gfid));
+            op_ret = -1;
+            op_errno = ENOMEM;
+            goto unlock;
+        }
+    }
+unlock:
+    UNLOCK(&ctx->lock);
+
+unwind:
+    QUOTA_STACK_UNWIND(mknod, frame, op_ret, op_errno, inode, buf, preparent,
+                       postparent, xdata);
+    return 0;
+}
 
 int
-quota_writev (call_frame_t *frame, xlator_t *this, fd_t *fd,
-	      struct iovec *vector, int32_t count, off_t off)
+quota_mknod_helper(call_frame_t *frame, xlator_t *this, loc_t *loc, mode_t mode,
+                   dev_t rdev, mode_t umask, dict_t *xdata)
 {
-	struct quota_local *local = NULL;
-	struct quota_priv  *priv = NULL;
+    quota_local_t *local = NULL;
+    int32_t op_errno = EINVAL;
+
+    local = frame->local;
 
-	priv = this->private;
+    GF_VALIDATE_OR_GOTO("quota", local, unwind);
 
-	if (gf_quota_check_free_disk (this) == -1) {
-		gf_log (this->name, GF_LOG_ERROR, 
-			"min-free-disk limit (%u) crossed, current available is %u",
-			priv->min_free_disk_limit, priv->current_free_disk);
-		STACK_UNWIND (frame, -1, ENOSPC, NULL);
-		return 0;
-	}
+    if (local->op_ret == -1) {
+        op_errno = local->op_errno;
+        goto unwind;
+    }
 
-	if (priv->disk_usage_limit) {
-		local = CALLOC (1, sizeof (struct quota_local));
-		local->fd     = fd_ref (fd);
-		local->refs   = dict_ref (frame->root->req_refs);
-		local->vector = vector;
-		local->count  = count;
-		local->offset = off;
-		frame->local  = local;
+    STACK_WIND(frame, quota_mknod_cbk, FIRST_CHILD(this),
+               FIRST_CHILD(this)->fops->mknod, loc, mode, rdev, umask, xdata);
 
-		STACK_WIND (frame, quota_writev_fstat_cbk,
-			    FIRST_CHILD(this),
-			    FIRST_CHILD(this)->fops->fstat, fd);
-		return 0;
-	}
+    return 0;
 
-	STACK_WIND (frame, quota_writev_cbk,
-		    FIRST_CHILD(this),
-		    FIRST_CHILD(this)->fops->writev,
-		    fd, vector, count, off);
-	return 0;
+unwind:
+    QUOTA_STACK_UNWIND(mknod, frame, -1, op_errno, NULL, NULL, NULL, NULL,
+                       NULL);
+    return 0;
 }
 
+int
+quota_mknod(call_frame_t *frame, xlator_t *this, loc_t *loc, mode_t mode,
+            dev_t rdev, mode_t umask, dict_t *xdata)
+{
+    quota_priv_t *priv = NULL;
+    int32_t ret = -1;
+    quota_local_t *local = NULL;
+    call_stub_t *stub = NULL;
+
+    priv = this->private;
+
+    WIND_IF_QUOTAOFF(priv->is_quota_on, off);
+    QUOTA_WIND_FOR_INTERNAL_FOP(xdata, off);
+
+    local = quota_local_new();
+    if (local == NULL) {
+        goto err;
+    }
+
+    frame->local = local;
+
+    ret = loc_copy(&local->loc, loc);
+    if (ret) {
+        gf_msg(this->name, GF_LOG_WARNING, ENOMEM, Q_MSG_ENOMEM,
+               "loc_copy failed");
+        goto err;
+    }
+
+    stub = fop_mknod_stub(frame, quota_mknod_helper, loc, mode, rdev, umask,
+                          xdata);
+    if (stub == NULL) {
+        goto err;
+    }
+
+    LOCK(&local->lock);
+    {
+        local->link_count = 1;
+        local->stub = stub;
+        local->delta = 0;
+        local->object_delta = 1;
+    }
+    UNLOCK(&local->lock);
+
+    quota_check_limit(frame, loc->parent, this);
+    return 0;
+
+err:
+    QUOTA_STACK_UNWIND(mknod, frame, -1, ENOMEM, NULL, NULL, NULL, NULL, NULL);
+    return 0;
+
+off:
+    STACK_WIND_TAIL(frame, FIRST_CHILD(this), FIRST_CHILD(this)->fops->mknod,
+                    loc, mode, rdev, umask, xdata);
+    return 0;
+}
 
 int
-quota_removexattr_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
-		       int32_t op_ret, int32_t op_errno)
+quota_setxattr_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+                   int op_ret, int op_errno, dict_t *xdata)
 {
-	if (op_ret == -1) {
-		gf_log (this->name, GF_LOG_CRITICAL, 
-			"failed to remove the disk-usage value: %s",
-			strerror (op_errno));
-	} 
-	
-	STACK_DESTROY (frame->root);
-	return 0;
+    quota_local_t *local = NULL;
+    quota_inode_ctx_t *ctx = NULL;
+    int ret = 0;
+
+    if (op_ret < 0) {
+        goto out;
+    }
+
+    local = frame->local;
+    if (!local)
+        goto out;
+
+    ret = quota_inode_ctx_get(local->loc.inode, this, &ctx, 1);
+    if ((ret < 0) || (ctx == NULL)) {
+        op_errno = -1;
+        goto out;
+    }
+
+    LOCK(&ctx->lock);
+    {
+        ctx->hard_lim = local->limit.hl;
+        ctx->soft_lim = local->limit.sl;
+        ctx->object_hard_lim = local->object_limit.hl;
+        ctx->object_soft_lim = local->object_limit.sl;
+    }
+    UNLOCK(&ctx->lock);
+
+out:
+    QUOTA_STACK_UNWIND(setxattr, frame, op_ret, op_errno, xdata);
+    return 0;
 }
 
+int
+quota_setxattr(call_frame_t *frame, xlator_t *this, loc_t *loc, dict_t *dict,
+               int flags, dict_t *xdata)
+{
+    quota_priv_t *priv = NULL;
+    int op_errno = EINVAL;
+    int op_ret = -1;
+    int64_t hard_lim = -1;
+    int64_t soft_lim = -1;
+    int64_t object_hard_limit = -1;
+    int64_t object_soft_limit = -1;
+    quota_local_t *local = NULL;
+    gf_boolean_t internal_fop = _gf_false;
+
+    priv = this->private;
+
+    WIND_IF_QUOTAOFF(priv->is_quota_on, off);
+
+    VALIDATE_OR_GOTO(frame, err);
+    VALIDATE_OR_GOTO(this, err);
+    VALIDATE_OR_GOTO(loc, err);
+
+    if (xdata && dict_get_sizen(xdata, GLUSTERFS_INTERNAL_FOP_KEY))
+        internal_fop = _gf_true;
+
+    if (frame->root->pid >= 0 && internal_fop == _gf_false) {
+        GF_IF_INTERNAL_XATTR_GOTO("trusted.glusterfs.quota*", dict, op_errno,
+                                  err);
+        GF_IF_INTERNAL_XATTR_GOTO("trusted.pgfid*", dict, op_errno, err);
+    }
+
+    quota_get_limits(this, dict, &hard_lim, &soft_lim, &object_hard_limit,
+                     &object_soft_limit);
+
+    if (hard_lim > 0 || object_hard_limit > 0) {
+        local = quota_local_new();
+        if (local == NULL) {
+            op_errno = ENOMEM;
+            goto err;
+        }
+        frame->local = local;
+        loc_copy(&local->loc, loc);
+    }
+
+    if (hard_lim > 0) {
+        local->limit.hl = hard_lim;
+        local->limit.sl = soft_lim;
+    }
+
+    if (object_hard_limit > 0) {
+        local->object_limit.hl = object_hard_limit;
+        local->object_limit.sl = object_soft_limit;
+    }
+
+    STACK_WIND(frame, quota_setxattr_cbk, FIRST_CHILD(this),
+               FIRST_CHILD(this)->fops->setxattr, loc, dict, flags, xdata);
+    return 0;
+err:
+    QUOTA_STACK_UNWIND(setxattr, frame, op_ret, op_errno, NULL);
+    return 0;
+
+off:
+    STACK_WIND_TAIL(frame, FIRST_CHILD(this), FIRST_CHILD(this)->fops->setxattr,
+                    loc, dict, flags, xdata);
+    return 0;
+}
 
 int
-quota_setxattr_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
-		    int32_t op_ret, int32_t op_errno)
+quota_fsetxattr_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+                    int op_ret, int op_errno, dict_t *xdata)
 {
-	if (op_ret == -1) {
-		gf_log (this->name, GF_LOG_CRITICAL, 
-			"failed to set the disk-usage value: %s",
-			strerror (op_errno));
-	} 
+    quota_inode_ctx_t *ctx = NULL;
+    quota_local_t *local = NULL;
+
+    if (op_ret < 0)
+        goto out;
+
+    local = frame->local;
+    if (!local)
+        goto out;
+
+    op_ret = quota_inode_ctx_get(local->loc.inode, this, &ctx, 1);
+    if ((op_ret < 0) || (ctx == NULL)) {
+        op_errno = ENOMEM;
+        goto out;
+    }
+
+    LOCK(&ctx->lock);
+    {
+        ctx->hard_lim = local->limit.hl;
+        ctx->soft_lim = local->limit.sl;
+        ctx->object_hard_lim = local->object_limit.hl;
+        ctx->object_soft_lim = local->object_limit.sl;
+    }
+    UNLOCK(&ctx->lock);
+
+out:
+    QUOTA_STACK_UNWIND(fsetxattr, frame, op_ret, op_errno, xdata);
+    return 0;
+}
 
-	STACK_DESTROY (frame->root);
-	return 0;
+int
+quota_fsetxattr(call_frame_t *frame, xlator_t *this, fd_t *fd, dict_t *dict,
+                int flags, dict_t *xdata)
+{
+    quota_priv_t *priv = NULL;
+    int32_t op_ret = -1;
+    int32_t op_errno = EINVAL;
+    quota_local_t *local = NULL;
+    int64_t hard_lim = -1;
+    int64_t soft_lim = -1;
+    int64_t object_hard_limit = -1;
+    int64_t object_soft_limit = -1;
+
+    priv = this->private;
+
+    WIND_IF_QUOTAOFF(priv->is_quota_on, off);
+
+    VALIDATE_OR_GOTO(frame, err);
+    VALIDATE_OR_GOTO(this, err);
+    VALIDATE_OR_GOTO(fd, err);
+
+    if (0 <= frame->root->pid) {
+        GF_IF_INTERNAL_XATTR_GOTO("trusted.glusterfs.quota*", dict, op_errno,
+                                  err);
+        GF_IF_INTERNAL_XATTR_GOTO("trusted.pgfid*", dict, op_errno, err);
+    }
+
+    quota_get_limits(this, dict, &hard_lim, &soft_lim, &object_hard_limit,
+                     &object_soft_limit);
+
+    if (hard_lim > 0 || object_hard_limit > 0) {
+        local = quota_local_new();
+        if (local == NULL) {
+            op_errno = ENOMEM;
+            goto err;
+        }
+        frame->local = local;
+        local->loc.inode = inode_ref(fd->inode);
+    }
+
+    if (hard_lim > 0) {
+        local->limit.hl = hard_lim;
+        local->limit.sl = soft_lim;
+    }
+
+    if (object_hard_limit > 0) {
+        local->object_limit.hl = object_hard_limit;
+        local->object_limit.sl = object_soft_limit;
+    }
+
+    STACK_WIND(frame, quota_fsetxattr_cbk, FIRST_CHILD(this),
+               FIRST_CHILD(this)->fops->fsetxattr, fd, dict, flags, xdata);
+    return 0;
+err:
+    QUOTA_STACK_UNWIND(fsetxattr, frame, op_ret, op_errno, NULL);
+    return 0;
+
+off:
+    STACK_WIND_TAIL(frame, FIRST_CHILD(this),
+                    FIRST_CHILD(this)->fops->fsetxattr, fd, dict, flags, xdata);
+    return 0;
 }
 
+int
+quota_removexattr_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+                      int32_t op_ret, int32_t op_errno, dict_t *xdata)
+{
+    QUOTA_STACK_UNWIND(removexattr, frame, op_ret, op_errno, xdata);
+    return 0;
+}
 
 int
-quota_statfs_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
-		  int32_t op_ret, int32_t op_errno, struct statvfs *statvfs)
+quota_removexattr(call_frame_t *frame, xlator_t *this, loc_t *loc,
+                  const char *name, dict_t *xdata)
 {
-	struct quota_priv *priv = NULL;
-	uint64_t           f_blocks = 0;
-	int64_t            f_bfree = 0;
-	uint64_t           f_bused = 0;
+    quota_priv_t *priv = NULL;
+    int32_t op_errno = EINVAL;
 
+    priv = this->private;
 
-	priv = this->private;
+    WIND_IF_QUOTAOFF(priv->is_quota_on, off);
 
-	if (op_ret != 0)
-		goto unwind;
+    VALIDATE_OR_GOTO(this, err);
 
-	f_blocks = priv->disk_usage_limit / statvfs->f_frsize;
-	f_bused = priv->current_disk_usage / statvfs->f_frsize;
+    /* all quota xattrs can be cleaned up by doing setxattr on special key.
+     * Hence its ok that we don't allow removexattr on quota keys here.
+     */
+    if (frame->root->pid >= 0) {
+        GF_IF_NATIVE_XATTR_GOTO("trusted.glusterfs.quota*", name, op_errno,
+                                err);
+        GF_IF_NATIVE_XATTR_GOTO("trusted.pgfid*", name, op_errno, err);
+    }
 
-	if (f_blocks && (f_blocks < statvfs->f_blocks))
-		statvfs->f_blocks = f_blocks;
+    VALIDATE_OR_GOTO(frame, err);
+    VALIDATE_OR_GOTO(loc, err);
 
-	f_bfree = (statvfs->f_blocks - f_bused);
+    STACK_WIND(frame, quota_removexattr_cbk, FIRST_CHILD(this),
+               FIRST_CHILD(this)->fops->removexattr, loc, name, xdata);
+    return 0;
 
-	if (f_bfree >= 0)
-		statvfs->f_bfree = statvfs->f_bavail = f_bfree;
-	else
-		statvfs->f_bfree = statvfs->f_bavail = 0;
+err:
+    QUOTA_STACK_UNWIND(removexattr, frame, -1, op_errno, NULL);
+    return 0;
 
-unwind:
-	STACK_UNWIND (frame, op_ret, op_errno, statvfs);
-	return 0;
+off:
+    STACK_WIND_TAIL(frame, FIRST_CHILD(this),
+                    FIRST_CHILD(this)->fops->removexattr, loc, name, xdata);
+    return 0;
 }
 
-
 int
-quota_statfs (call_frame_t *frame, xlator_t *this, loc_t *loc)
+quota_fremovexattr_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+                       int32_t op_ret, int32_t op_errno, dict_t *xdata)
 {
-	STACK_WIND (frame, quota_statfs_cbk,
-		    FIRST_CHILD (this), FIRST_CHILD (this)->fops->statfs, loc);
+    QUOTA_STACK_UNWIND(fremovexattr, frame, op_ret, op_errno, xdata);
+    return 0;
+}
 
-	return 0;
+int
+quota_fremovexattr(call_frame_t *frame, xlator_t *this, fd_t *fd,
+                   const char *name, dict_t *xdata)
+{
+    quota_priv_t *priv = NULL;
+    int32_t op_ret = -1;
+    int32_t op_errno = EINVAL;
+
+    priv = this->private;
+
+    WIND_IF_QUOTAOFF(priv->is_quota_on, off);
+
+    VALIDATE_OR_GOTO(frame, err);
+    VALIDATE_OR_GOTO(this, err);
+    VALIDATE_OR_GOTO(fd, err);
+
+    if (frame->root->pid >= 0) {
+        GF_IF_NATIVE_XATTR_GOTO("trusted.glusterfs.quota*", name, op_errno,
+                                err);
+        GF_IF_NATIVE_XATTR_GOTO("trusted.pgfid*", name, op_errno, err);
+    }
+    STACK_WIND(frame, quota_fremovexattr_cbk, FIRST_CHILD(this),
+               FIRST_CHILD(this)->fops->fremovexattr, fd, name, xdata);
+    return 0;
+err:
+    QUOTA_STACK_UNWIND(fremovexattr, frame, op_ret, op_errno, NULL);
+    return 0;
+
+off:
+    STACK_WIND_TAIL(frame, FIRST_CHILD(this),
+                    FIRST_CHILD(this)->fops->fremovexattr, fd, name, xdata);
+    return 0;
 }
 
+int32_t
+quota_statfs_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+                 int32_t op_ret, int32_t op_errno, struct statvfs *buf,
+                 dict_t *xdata)
+{
+    inode_t *inode = NULL;
+    uint64_t value = 0;
+    int64_t usage = -1;
+    int64_t avail = -1;
+    int64_t blocks = 0;
+    quota_inode_ctx_t *ctx = NULL;
+    int ret = 0;
+
+    inode = cookie;
+
+    /* This fop will fail mostly in case of client disconnect,
+     * which is already logged. Hence, not logging here */
+    if (op_ret == -1)
+        goto unwind;
+    /*
+     * We should never get here unless quota_statfs (below) sent us a
+     * cookie, and it would only do so if the value was non-NULL.  This
+     * check is therefore just routine defensive coding.
+     */
+
+    GF_VALIDATE_OR_GOTO("quota", inode, unwind);
+
+    inode_ctx_get(inode, this, &value);
+    ctx = (quota_inode_ctx_t *)(unsigned long)value;
+    if (!ctx || ctx->hard_lim <= 0)
+        goto unwind;
+
+    { /* statfs is adjusted in this code block */
+        usage = (ctx->size) / buf->f_bsize;
+
+        blocks = ctx->hard_lim / buf->f_bsize;
+        buf->f_blocks = blocks;
+
+        avail = buf->f_blocks - usage;
+        avail = max(avail, 0);
+
+        buf->f_bfree = avail;
+        /*
+         * We have to assume that the total assigned quota
+         * won't cause us to dip into the reserved space,
+         * because dealing with the overcommitted cases is
+         * just too hairy (especially when different bricks
+         * might be using different reserved percentages and
+         * such).
+         */
+        buf->f_bavail = buf->f_bfree;
+    }
+
+    xdata = xdata ? dict_ref(xdata) : dict_new();
+    if (!xdata)
+        goto unwind;
+
+    ret = dict_set_int8(xdata, "quota-deem-statfs", 1);
+    if (-1 == ret)
+        gf_msg(this->name, GF_LOG_ERROR, ENOMEM, Q_MSG_ENOMEM,
+               "Dict set failed, deem-statfs option may "
+               "have no effect");
 
-int
-quota_getxattr_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
-		    int32_t op_ret, int32_t op_errno, dict_t *value)
+unwind:
+    QUOTA_STACK_UNWIND(statfs, frame, op_ret, op_errno, buf, xdata);
+
+    if (xdata)
+        dict_unref(xdata);
+
+    return 0;
+}
+
+int32_t
+quota_statfs_helper(call_frame_t *frame, xlator_t *this, loc_t *loc,
+                    dict_t *xdata)
 {
-	data_t *data = NULL;
-	struct quota_priv *priv = this->private;
-	
-	if (op_ret >= 0) {
-		data = dict_get (value, "trusted.glusterfs-quota-du");
-		if (data) {
-			LOCK (&priv->lock);
-			{
-				priv->current_disk_usage = data_to_uint64 (data);
-			}
-			UNLOCK (&priv->lock);
+    quota_local_t *local = frame->local;
+    int op_errno = EINVAL;
 
-			return 0;
-		}
-	} 
+    GF_VALIDATE_OR_GOTO("quota", local, err);
 
-	STACK_DESTROY (frame->root);
+    if (-1 == local->op_ret) {
+        op_errno = local->op_errno;
+        goto err;
+    }
 
-	return 0;
+    STACK_WIND_COOKIE(frame, quota_statfs_cbk, local->inode, FIRST_CHILD(this),
+                      FIRST_CHILD(this)->fops->statfs, loc, xdata);
+    return 0;
+err:
+    QUOTA_STACK_UNWIND(statfs, frame, -1, op_errno, NULL, NULL);
+
+    return 0;
 }
 
+int32_t
+quota_statfs_validate_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+                          int32_t op_ret, int32_t op_errno, inode_t *inode,
+                          struct iatt *buf, dict_t *xdata,
+                          struct iatt *postparent)
+{
+    quota_local_t *local = NULL;
+    int32_t ret = 0;
+    quota_inode_ctx_t *ctx = NULL;
+    uint64_t value = 0;
+    quota_meta_t size = {
+        0,
+    };
+
+    local = frame->local;
+
+    if (op_ret < 0)
+        goto resume;
+
+    GF_ASSERT(local);
+    GF_ASSERT(frame);
+    GF_VALIDATE_OR_GOTO_WITH_ERROR("quota", this, resume, op_errno, EINVAL);
+    GF_VALIDATE_OR_GOTO_WITH_ERROR(this->name, xdata, resume, op_errno, EINVAL);
+
+    ret = inode_ctx_get(local->validate_loc.inode, this, &value);
+
+    ctx = (quota_inode_ctx_t *)(unsigned long)value;
+    if ((ret == -1) || (ctx == NULL)) {
+        gf_msg(this->name, GF_LOG_WARNING, EINVAL, Q_MSG_INODE_CTX_GET_FAILED,
+               "quota context is not present in inode (gfid:%s)",
+               uuid_utoa(local->validate_loc.inode->gfid));
+        op_errno = EINVAL;
+        goto resume;
+    }
+
+    ret = quota_dict_get_meta(xdata, QUOTA_SIZE_KEY, SLEN(QUOTA_SIZE_KEY),
+                              &size);
+    if (ret == -1) {
+        gf_msg(this->name, GF_LOG_WARNING, EINVAL, Q_MSG_SIZE_KEY_MISSING,
+               "size key not present in "
+               "dict");
+        op_errno = EINVAL;
+    }
+
+    LOCK(&ctx->lock);
+    {
+        ctx->size = size.size;
+        ctx->validate_time = gf_time();
+        ctx->file_count = size.file_count;
+        ctx->dir_count = size.dir_count;
+    }
+    UNLOCK(&ctx->lock);
+
+resume:
+    local->op_errno = op_errno;
+    quota_link_count_decrement(frame);
+    return 0;
+}
 
 void
-gf_quota_get_disk_usage (xlator_t *this)
+quota_get_limit_dir_continuation(struct list_head *parents, inode_t *inode,
+                                 int32_t op_ret, int32_t op_errno, void *data)
 {
-	call_frame_t *frame = NULL;
-	call_pool_t  *pool = NULL;
-	loc_t         loc;
+    call_frame_t *frame = NULL;
+    xlator_t *this = NULL;
+    quota_dentry_t *entry = NULL;
+    inode_t *parent = NULL;
+
+    frame = data;
+    this = THIS;
+
+    if ((op_ret < 0) || list_empty(parents)) {
+        if (op_ret >= 0) {
+            gf_msg(this->name, GF_LOG_WARNING, EIO, Q_MSG_ANCESTRY_BUILD_FAILED,
+                   "Couldn't build ancestry for inode (gfid:%s). "
+                   "Without knowing ancestors till root, quota "
+                   "cannot be enforced. "
+                   "Hence, failing fop with EIO",
+                   uuid_utoa(inode->gfid));
+            op_errno = EIO;
+        }
+
+        quota_handle_validate_error(frame, -1, op_errno);
+        goto out;
+    }
 
-	pool = this->ctx->pool;
-	frame = create_frame (this, pool);
-	build_root_loc (this, &loc);
+    entry = list_entry(parents, quota_dentry_t, next);
+    parent = inode_find(inode->table, entry->par);
 
-	STACK_WIND (frame, quota_getxattr_cbk,
-		    this->children->xlator,
-		    this->children->xlator->fops->getxattr,
-		    &loc,
-		    "trusted.glusterfs-quota-du");
-	return ;
+    quota_get_limit_dir(frame, parent, this);
+
+    inode_unref(parent);
+out:
+    return;
 }
 
+void
+quota_statfs_continue(call_frame_t *frame, xlator_t *this, inode_t *inode)
+{
+    quota_local_t *local = frame->local;
+    int ret = -1;
+
+    LOCK(&local->lock);
+    {
+        local->inode = inode_ref(inode);
+    }
+    UNLOCK(&local->lock);
+
+    ret = quota_validate(frame, local->inode, this, quota_statfs_validate_cbk);
+    if (0 > ret)
+        quota_handle_validate_error(frame, -1, -ret);
+}
 
 void
-gf_quota_cache_sync (xlator_t *this)
+quota_get_limit_dir(call_frame_t *frame, inode_t *cur_inode, xlator_t *this)
+{
+    inode_t *inode = NULL;
+    inode_t *parent = NULL;
+    uint64_t value = 0;
+    quota_inode_ctx_t *ctx = NULL;
+    quota_local_t *local = frame->local;
+
+    if (!cur_inode)
+        goto out;
+
+    inode = inode_ref(cur_inode);
+    while (inode) {
+        value = 0;
+        inode_ctx_get(inode, this, &value);
+
+        if (value) {
+            ctx = (quota_inode_ctx_t *)(unsigned long)value;
+            if (ctx->hard_lim > 0)
+                break;
+        }
+
+        if (__is_root_gfid(inode->gfid))
+            goto off;
+
+        parent = inode_parent(inode, 0, NULL);
+        if (!parent) {
+            (void)quota_build_ancestry(inode, quota_get_limit_dir_continuation,
+                                       frame);
+            goto out;
+        }
+
+        inode_unref(inode);
+        inode = parent;
+    }
+
+    quota_statfs_continue(frame, this, inode);
+    inode_unref(inode);
+    return;
+
+off:
+    gf_msg_debug(this->name, 0, "No limit set on the inode or it's parents.");
+
+    QUOTA_STACK_WIND_TAIL(frame, FIRST_CHILD(this),
+                          FIRST_CHILD(this)->fops->statfs, &local->loc,
+                          local->xdata);
+out:
+    inode_unref(inode);
+
+    return;
+}
+/* statfs fop: when quota and deem-statfs are on, start the limit-dir lookup. */
+int32_t
+quota_statfs(call_frame_t *frame, xlator_t *this, loc_t *loc, dict_t *xdata)
 {
-	struct quota_priv *priv = NULL;
-	call_frame_t      *frame = NULL;
-	dict_t            *dict = get_new_dict ();
-	loc_t              loc;
+    int op_errno = 0;
+    int ret = -1;
+    int8_t ignore_deem_statfs = 0;
+    quota_priv_t *priv = NULL;
+    quota_local_t *local = NULL;
+    call_stub_t *stub = NULL;
+
+    priv = this->private;
+    GF_ASSERT(loc);
+
+    WIND_IF_QUOTAOFF(priv->is_quota_on, off);
+
+    ret = dict_get_int8(xdata, GF_INTERNAL_IGNORE_DEEM_STATFS,
+                        &ignore_deem_statfs);
+    ret = 0; /* the lookup's return value is discarded */
+
+    if (ignore_deem_statfs)
+        goto off;
+
+    if (priv->consider_statfs && loc->inode) {
+        local = quota_local_new();
+        if (!local) {
+            op_errno = ENOMEM;
+            goto err;
+        }
+        frame->local = local;
 
+        ret = loc_copy(&local->loc, loc);
+        if (-1 == ret) {
+            op_errno = ENOMEM;
+            goto err;
+        }
 
-	priv = this->private;
-	build_root_loc (this, &loc);
+        if (xdata)
+            local->xdata = dict_ref(xdata);
 
-	frame = create_frame (this, this->ctx->pool);
-	dict_set (dict, "trusted.glusterfs-quota-du", 
-		  data_from_uint64 (priv->current_disk_usage));
+        stub = fop_statfs_stub(frame, quota_statfs_helper, &local->loc,
+                               local->xdata);
+        if (!stub) {
+            op_errno = ENOMEM;
+            goto err;
+        }
 
-	STACK_WIND (frame, quota_setxattr_cbk,
-		    this->children->xlator,
-		    this->children->xlator->fops->setxattr,
-		    &loc, dict, 0);
+        LOCK(&local->lock);
+        {
+            local->link_count = 1;
+            local->stub = stub;
+        }
+        UNLOCK(&local->lock);
+        /* the walk starts at loc->inode; local->loc holds the copied loc */
+        quota_get_limit_dir(frame, loc->inode, this);
+
+        return 0;
+    }
+
+    /*
+     * We have to make sure that we never get to quota_statfs_cbk
+     * with a cookie that points to something other than an inode,
+     * which is exactly what would happen with STACK_UNWIND using
+     * that as a callback.  Therefore, use default_statfs_cbk in
+     * this case instead.
+     *
+     * Also if the option deem-statfs is not set to "on" don't
+     * bother calculating quota limit on / in statfs_cbk.
+     */
+    if (priv->consider_statfs)
+        gf_log(this->name, GF_LOG_ERROR,
+               "Missing inode, can't adjust for quota");
+
+off:
+    STACK_WIND_TAIL(frame, FIRST_CHILD(this), FIRST_CHILD(this)->fops->statfs,
+                    loc, xdata);
+    return 0;
+
+err:
+    QUOTA_STACK_UNWIND(statfs, frame, -1, op_errno, NULL, NULL);
+
+    return 0;
 }
 
+/* readdirp callback: call quota_fill_inodectx() for every listed entry that
+ * has an inode ("." and ".." are skipped), then unwind to the caller. */
+int
+quota_readdirp_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+                   int op_ret, int op_errno, gf_dirent_t *entries,
+                   dict_t *xdata)
+{
+    gf_dirent_t *dirent = NULL;
+    quota_local_t *local = NULL;
+    loc_t child = {
+        0,
+    };
+
+    if (op_ret <= 0)
+        goto unwind;
+
+    local = frame->local;
+    list_for_each_entry(dirent, &entries->list, list)
+    {
+        if (dirent->inode == NULL || !strcmp(dirent->d_name, ".") ||
+            !strcmp(dirent->d_name, ".."))
+            continue;
+
+        /* child is reused per entry; loc_wipe() below drops the refs taken */
+        child.name = dirent->d_name;
+        child.inode = inode_ref(dirent->inode);
+        child.parent = inode_ref(local->loc.inode);
+        gf_uuid_copy(child.gfid, dirent->d_stat.ia_gfid);
+        gf_uuid_copy(child.pargfid, child.parent->gfid);
+        quota_fill_inodectx(this, dirent->inode, dirent->dict, &child,
+                            &dirent->d_stat, &op_errno);
+        loc_wipe(&child);
+    }
+
+unwind:
+    QUOTA_STACK_UNWIND(readdirp, frame, op_ret, op_errno, entries, xdata);
+
+    return 0;
+}
 
 int
-quota_release (xlator_t *this, fd_t *fd)
+quota_readdirp(call_frame_t *frame, xlator_t *this, fd_t *fd, size_t size,
+               off_t offset, dict_t *dict)
 {
-	gf_quota_cache_sync (this);
+    quota_priv_t *priv = NULL;
+    int ret = 0;
+    gf_boolean_t new_dict = _gf_false;
+    quota_local_t *local = NULL;
+
+    priv = this->private;
+
+    WIND_IF_QUOTAOFF(priv->is_quota_on, off);
+
+    local = quota_local_new();
 
-	return 0;
+    if (local == NULL) {
+        goto err;
+    }
+
+    frame->local = local;
+
+    local->loc.inode = inode_ref(fd->inode);
+
+    if (dict == NULL) {
+        dict = dict_new();
+        new_dict = _gf_true;
+    }
+
+    if (dict) {
+        ret = dict_set_int8(dict, QUOTA_LIMIT_KEY, 1);
+        if (ret < 0) {
+            gf_msg(this->name, GF_LOG_WARNING, ENOMEM, Q_MSG_ENOMEM,
+                   "dict set of key for hard-limit");
+            goto err;
+        }
+    }
+
+    if (dict) {
+        ret = dict_set_int8(dict, QUOTA_LIMIT_OBJECTS_KEY, 1);
+        if (ret < 0) {
+            gf_msg(this->name, GF_LOG_WARNING, ENOMEM, Q_MSG_ENOMEM,
+                   "dict set of key for hard-limit "
+                   "failed");
+            goto err;
+        }
+    }
+
+    STACK_WIND(frame, quota_readdirp_cbk, FIRST_CHILD(this),
+               FIRST_CHILD(this)->fops->readdirp, fd, size, offset, dict);
+
+    if (new_dict) {
+        dict_unref(dict);
+    }
+
+    return 0;
+err:
+    STACK_UNWIND_STRICT(readdirp, frame, -1, EINVAL, NULL, NULL);
+
+    if (new_dict) {
+        dict_unref(dict);
+    }
+
+    return 0;
+
+off:
+    STACK_WIND_TAIL(frame, FIRST_CHILD(this), FIRST_CHILD(this)->fops->readdirp,
+                    fd, size, offset, dict);
+    return 0;
 }
 
+/* fallocate callback: on success, copy the post-op iatt into the quota
+ * context of local->loc.inode (under the context lock). Whatever happens,
+ * the child's result is unwound unchanged; a missing context only produces
+ * a warning. */
+int32_t
+quota_fallocate_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+                    int32_t op_ret, int32_t op_errno, struct iatt *prebuf,
+                    struct iatt *postbuf, dict_t *xdata)
+{
+    quota_local_t *local = frame->local;
+    quota_inode_ctx_t *qctx = NULL;
+    uint64_t raw_ctx = 0;
+
+    /* failed fop, or no local to work with: nothing to cache */
+    if (op_ret < 0 || local == NULL)
+        goto out;
+
+    if (inode_ctx_get(local->loc.inode, this, &raw_ctx) != 0) {
+        gf_msg(this->name, GF_LOG_WARNING, 0, Q_MSG_INODE_CTX_GET_FAILED,
+               "%s: failed to get the context", local->loc.path);
+        goto out;
+    }
+
+    qctx = (quota_inode_ctx_t *)(unsigned long)raw_ctx;
+
+    if (qctx == NULL) {
+        gf_msg(this->name, GF_LOG_WARNING, 0, Q_MSG_INODE_CTX_GET_FAILED,
+               "quota context not set in %s (gfid:%s)", local->loc.path,
+               uuid_utoa(local->loc.inode->gfid));
+        goto out;
+    }
+
+    /* postbuf is only dereferenced here, after op_ret >= 0 has been
+     * established */
+    LOCK(&qctx->lock);
+    qctx->buf = *postbuf;
+    UNLOCK(&qctx->lock);
+
+out:
+    QUOTA_STACK_UNWIND(fallocate, frame, op_ret, op_errno, prebuf, postbuf,
+                       xdata);
+
+    return 0;
+}
 
-/* notify */
 int32_t
-notify (xlator_t *this,
-	int32_t event,
-	void *data,
-	...)
-{
-	struct quota_priv *priv = this->private;
-	
-	switch (event)
-	{
-	case GF_EVENT_CHILD_UP:
-		if (priv->only_first_time) {
-			priv->only_first_time = 0;
-			if (priv->disk_usage_limit) {
-				gf_quota_get_disk_usage (this);
-			}
-		}
-	default:
-		default_notify (this, event, data);
-		break;
-	}
-
-	return 0;
-}
-
-
-int32_t 
-init (xlator_t *this)
-{
-	int     ret  = 0;
-        data_t *data = NULL;
-	struct quota_priv *_private = NULL;
-
-	if (!this->children || this->children->next) {
-		gf_log (this->name, GF_LOG_ERROR, 
-			"FATAL: quota should have exactly one child");
-		return -1;
-	}
-	
-	if (!this->parents) {
-		gf_log (this->name, GF_LOG_WARNING,
-			"dangling volume. check volfile ");
-	}
-
-	_private = CALLOC (1, sizeof (struct quota_priv));
-        _private->disk_usage_limit = 0;
-        data = dict_get (this->options, "disk-usage-limit");
-        if (data) {
-		if (gf_string2bytesize (data->data, &_private->disk_usage_limit) != 0) {
-                        gf_log (this->name, GF_LOG_ERROR, 
-                                "invalid number '%s' for disk-usage limit", data->data);
-			ret = -1;
-			goto out;
-                }
+quota_fallocate_helper(call_frame_t *frame, xlator_t *this, fd_t *fd,
+                       int32_t mode, off_t offset, size_t len, dict_t *xdata)
+{
+    quota_local_t *local = NULL;
+    int32_t op_errno = EINVAL;
+    /* Stub target: winds the real fallocate unless local->op_ret says no. */
+    local = frame->local;
+
+    GF_VALIDATE_OR_GOTO("quota", local, unwind);
+
+    if (local->op_ret == -1) {
+        op_errno = local->op_errno;
+        if (op_errno == ENOENT || op_errno == ESTALE) {
+            /* We may get ENOENT/ESTALE in the scenario below:
+             *     fd = open file.txt
+             *     unlink file.txt
+             *     fallocate on fd
+             * Here build_ancestry can fail as the file is removed.
+             * For now, ignore ENOENT/ESTALE on an active fd.
+             * We need to revisit this code once we understand
+             * how other file systems behave in this scenario.
+             */
+            gf_msg_debug(this->name, 0,
+                         "quota enforcer failed "
+                         "with ENOENT/ESTALE on %s, cannot check "
+                         "quota limits and allowing fallocate",
+                         uuid_utoa(fd->inode->gfid));
+        } else {
+            goto unwind; /* any other recorded error fails the fop */
+        }
+    }
+
+    STACK_WIND(frame, quota_fallocate_cbk, FIRST_CHILD(this),
+               FIRST_CHILD(this)->fops->fallocate, fd, mode, offset, len,
+               xdata);
+    return 0;
 
-		LOCK_INIT (&_private->lock);
-		_private->current_disk_usage = 0;
-	}
-	
-        _private->min_free_disk_limit = 0;
-        data = dict_get (this->options, "min-free-disk-limit");
-        if (data) {
-		if (gf_string2percent (data->data, &_private->min_free_disk_limit) != 0) {
-                        gf_log (this->name, GF_LOG_ERROR, 
-                                "invalid percent '%s' for min-free-disk limit", data->data);
-			ret = -1;
-			goto out;
+unwind:
+    QUOTA_STACK_UNWIND(fallocate, frame, -1, op_errno, NULL, NULL, NULL);
+    return 0;
+}
+/* fallocate fop: package the real wind in a call stub (quota_fallocate_helper),
+ * store it on the frame's local and start the quota limit checks with len as
+ * the size delta. Passes straight through when quota is off. */
+int32_t
+quota_fallocate(call_frame_t *frame, xlator_t *this, fd_t *fd, int32_t mode,
+                off_t offset, size_t len, dict_t *xdata)
+{
+    int32_t op_errno = EINVAL;
+    int32_t parents = 0;
+    int32_t fail_count = 0;
+    quota_local_t *local = NULL;
+    quota_inode_ctx_t *ctx = NULL;
+    quota_priv_t *priv = NULL;
+    quota_dentry_t *dentry = NULL;
+    quota_dentry_t *tmp = NULL;
+    call_stub_t *stub = NULL;
+    struct list_head head = {
+        0,
+    };
+    inode_t *par_inode = NULL;
+
+    priv = this->private;
+    GF_VALIDATE_OR_GOTO(this->name, priv, unwind);
+
+    WIND_IF_QUOTAOFF(priv->is_quota_on, off);
+
+    INIT_LIST_HEAD(&head);
+
+    GF_ASSERT(frame);
+    GF_VALIDATE_OR_GOTO(this->name, fd, unwind);
+
+    local = quota_local_new();
+    if (local == NULL)
+        goto unwind;
+
+    frame->local = local;
+    local->loc.inode = inode_ref(fd->inode);
+
+    (void)quota_inode_ctx_get(fd->inode, this, &ctx, 0);
+    if (ctx == NULL) {
+        gf_msg_debug(this->name, 0,
+                     "quota context is NULL on inode"
+                     " (%s). If quota is not enabled recently and "
+                     "crawler has finished crawling, its an error",
+                     uuid_utoa(local->loc.inode->gfid));
+    }
+
+    stub = fop_fallocate_stub(frame, quota_fallocate_helper, fd, mode, offset,
+                              len, xdata);
+    if (stub == NULL) {
+        op_errno = ENOMEM;
+        goto unwind;
+    }
+
+    parents = quota_add_parents_from_ctx(ctx, &head);
+    if (parents == -1) {
+        op_errno = errno;
+        goto unwind;
+    }
+
+    /*
+     * Note that by using len as the delta we're assuming the range from
+     * offset to offset+len has not already been allocated. This can result
+     * in ENOSPC errors attempting to allocate an already allocated range.
+     */
+    local->delta = len;
+    local->object_delta = 0;
+    local->stub = stub;
+    local->link_count = parents;
+
+    if (parents == 0) {
+        local->link_count = 1;
+        quota_check_limit(frame, fd->inode, this);
+    } else {
+        list_for_each_entry_safe(dentry, tmp, &head, next)
+        {
+            par_inode = do_quota_check_limit(frame, fd->inode, this, dentry,
+                                             _gf_false);
+            if (par_inode == NULL) {
+                /* remove stale entry from inode_ctx */
+                quota_dentry_del(ctx, dentry->name, dentry->par);
+                parents--;
+                fail_count++;
+            } else {
+                inode_unref(par_inode);
+            }
+            __quota_dentry_free(dentry);
+        }
+
+        if (parents == 0) {
+            LOCK(&local->lock);
+            {
+                local->link_count++;
+            }
+            UNLOCK(&local->lock);
+            quota_check_limit(frame, fd->inode, this);
+        }
+
+        while (fail_count != 0) {
+            quota_link_count_decrement(frame);
+            fail_count--;
+        }
+    }
+
+    return 0;
+
+unwind:
+    /* every jump here precedes local->stub = stub, so the stub is still ours */
+    if (stub)
+        call_stub_destroy(stub);
+    QUOTA_STACK_UNWIND(fallocate, frame, -1, op_errno, NULL, NULL, NULL);
+    return 0;
+
+off:
+    STACK_WIND_TAIL(frame, FIRST_CHILD(this),
+                    FIRST_CHILD(this)->fops->fallocate, fd, mode, offset, len,
+                    xdata);
+    return 0;
+}
+/* Fill what quota_log_usage() logs: *usage_str (cur_size in human-readable
+ * form), *path (the inode's path, or a copy of its gfid string when no path
+ * is available) and *cur_time. The caller releases both strings (GF_FREE). */
+void
+quota_log_helper(char **usage_str, int64_t cur_size, inode_t *inode,
+                 char **path, time_t *cur_time)
+{
+    xlator_t *this = THIS;
+
+    if (!usage_str || !inode || !path || !cur_time) {
+        gf_log(this->name, GF_LOG_ERROR, "Received null argument");
+        return;
+    }
+
+    *usage_str = gf_uint64_2human_readable(cur_size);
+    if (!(*usage_str))
+        gf_msg(this->name, GF_LOG_ERROR, ENOMEM, Q_MSG_ENOMEM,
+               "integer to string conversion failed Reason"
+               ":\"Cannot allocate memory\"");
+    inode_path(inode, NULL, path);
+    if (!(*path)) /* duplicate: uuid_utoa()'s buffer must not reach GF_FREE() */
+        *path = gf_strdup(uuid_utoa(inode->gfid));
+    *cur_time = gf_time();
+}
+
+/* Emit an alert log and a gf_event when adding @delta to ctx->size
+ *  i.   makes usage reach or cross the soft limit, or
+ *  ii.  leaves usage above the soft limit and alert-time has elapsed.
+ */
+void
+quota_log_usage(xlator_t *this, quota_inode_ctx_t *ctx, inode_t *inode,
+                int64_t delta)
+{
+    time_t cur_time = 0;
+    char *usage_str = NULL;
+    char *path = NULL;
+    int64_t cur_size = 0;
+    quota_priv_t *priv = NULL;
+
+    priv = this->private;
+    cur_size = ctx->size + delta;
+    /* No soft limit configured, or usage still below it: nothing to log. */
+    if ((ctx->soft_lim <= 0) || cur_size < ctx->soft_lim)
+        return;
+
+    /* Usage crossed/reached soft limit */
+    if (DID_REACH_LIMIT(ctx->soft_lim, ctx->size, cur_size)) {
+        quota_log_helper(&usage_str, cur_size, inode, &path, &cur_time);
+
+        gf_msg(this->name, GF_LOG_ALERT, 0, Q_MSG_CROSSED_SOFT_LIMIT,
+               "Usage crossed soft limit: "
+               "%s used by %s",
+               usage_str, path);
+
+        gf_event(EVENT_QUOTA_CROSSED_SOFT_LIMIT,
+                 "Usage=%s;volume=%s;"
+                 "path=%s",
+                 usage_str, priv->volume_uuid, path);
+
+        ctx->prev_log_time = cur_time;
+
+    }
+    /* Usage is above soft limit */
+    else if (cur_size > ctx->soft_lim &&
+             quota_timeout(ctx->prev_log_time, priv->log_timeout)) {
+        quota_log_helper(&usage_str, cur_size, inode, &path, &cur_time);
+
+        gf_msg(this->name, GF_LOG_ALERT, 0, Q_MSG_CROSSED_SOFT_LIMIT,
+               "Usage is above soft limit: %s used by %s", usage_str, path);
+
+        gf_event(EVENT_QUOTA_CROSSED_SOFT_LIMIT,
+                 "Usage=%s;volume=%s;"
+                 "path=%s",
+                 usage_str, priv->volume_uuid, path);
+
+        ctx->prev_log_time = cur_time; /* restarts the alert-time window */
+    }
+
+    if (path)
+        GF_FREE(path);
+
+    if (usage_str)
+        GF_FREE(usage_str);
+}
+/* Set up memory accounting for this xlator with gf_quota_mt_end + 1 types.
+ * Returns -1 when @this is NULL, otherwise xlator_mem_acct_init()'s result
+ * (a non-zero result is logged as a warning). */
+int32_t
+mem_acct_init(xlator_t *this)
+{
+    int status = -1;
+
+    if (this == NULL)
+        return status;
+
+    status = xlator_mem_acct_init(this, gf_quota_mt_end + 1);
+    if (status == 0)
+        return 0;
+
+    gf_msg(this->name, GF_LOG_WARNING, ENOMEM, Q_MSG_ENOMEM,
+           "Memory accounting init failed");
+    return status;
+}
+/* forget callback: detach this xlator's quota context from @inode and free
+ * it together with every dentry on its parents list. Always returns 0. */
+int32_t
+quota_forget(xlator_t *this, inode_t *inode)
+{
+    uint64_t raw_ctx = 0;
+    quota_inode_ctx_t *qctx = NULL;
+    quota_dentry_t *cur = NULL;
+    quota_dentry_t *next_dentry = NULL;
+
+    /* no context could be removed for this inode: nothing to free */
+    if (inode_ctx_del(inode, this, &raw_ctx) < 0)
+        return 0;
+
+    qctx = (quota_inode_ctx_t *)(long)raw_ctx;
+
+    LOCK(&qctx->lock);
+    {
+        /* entries are released while iterating, hence the _safe variant */
+        list_for_each_entry_safe(cur, next_dentry, &qctx->parents, next)
+        {
+            __quota_dentry_free(cur);
+        }
+    }
+    UNLOCK(&qctx->lock);
+
+    /* the context was already detached from the inode above */
+    LOCK_DESTROY(&qctx->lock);
+    GF_FREE(qctx);
+
+    return 0;
+}
+/* On PARENT_DOWN: disable the enforcer rpc, wait until conn_status clears. */
+int
+notify(xlator_t *this, int event, void *data, ...)
+{
+    quota_priv_t *priv = NULL;
+    int ret = 0;
+    rpc_clnt_t *rpc = NULL;
+    gf_boolean_t conn_status = _gf_true;
+    xlator_t *victim = data;
+
+    priv = this->private;
+    if (!priv || !priv->is_quota_on)
+        goto out;
+
+    if (event == GF_EVENT_PARENT_DOWN) {
+        rpc = priv->rpc_clnt;
+        if (rpc) {
+            rpc_clnt_disable(rpc);
+            pthread_mutex_lock(&priv->conn_mutex);
+            {
+                conn_status = priv->conn_status;
+                while (conn_status) { /* re-read the flag after every wakeup */
+                    (void)pthread_cond_wait(&priv->conn_cond,
+                                            &priv->conn_mutex);
+                    conn_status = priv->conn_status;
                 }
-		_private->refresh_interval = 20; /* 20seconds is default */
-		data = dict_get (this->options, "refresh-interval");
-		if (data) {
-			if (gf_string2time (data->data, 
-					    &_private->refresh_interval)!= 0) {
-				gf_log (this->name, GF_LOG_ERROR, 
-					"invalid time '%s' for refresh "
-					"interval", data->data);
-				ret = -1;
-				goto out;
-			}
-		}
+            }
+            pthread_mutex_unlock(&priv->conn_mutex);
+            gf_log(this->name, GF_LOG_INFO,
+                   "Notify GF_EVENT_PARENT_DOWN for brick %s", victim->name);
         }
+    }
 
-	_private->only_first_time = 1;
-        this->private = (void *)_private;
-	ret = 0;
- out:
-	return ret;
+out:
+    ret = default_notify(this, event, data); /* every event is forwarded */
+    return ret;
 }
 
-void 
-fini (xlator_t *this)
+int32_t
+init(xlator_t *this)
 {
-	struct quota_priv *_private = this->private;
+    int32_t ret = -1;
+    quota_priv_t *priv = NULL;
+    rpc_clnt_t *rpc = NULL;
+    /* Check the graph position, read options, set up local pool and rpc. */
+    if ((this->children == NULL) || this->children->next) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, Q_MSG_INVALID_VOLFILE,
+               "FATAL: quota (%s) not configured with "
+               "exactly one child",
+               this->name);
+        return -1;
+    }
+
+    if (this->parents == NULL) {
+        gf_msg(this->name, GF_LOG_WARNING, 0, Q_MSG_INVALID_VOLFILE,
+               "dangling volume. check volfile");
+    }
+
+    QUOTA_ALLOC_OR_GOTO(priv, quota_priv_t, err);
+
+    LOCK_INIT(&priv->lock);
+
+    this->private = priv;
+    /* Each GF_OPTION_INIT below is given err as its failure label. */
+    GF_OPTION_INIT("deem-statfs", priv->consider_statfs, bool, err);
+    GF_OPTION_INIT("server-quota", priv->is_quota_on, bool, err);
+    GF_OPTION_INIT("default-soft-limit", priv->default_soft_lim, percent, err);
+    GF_OPTION_INIT("soft-timeout", priv->soft_timeout, time, err);
+    GF_OPTION_INIT("hard-timeout", priv->hard_timeout, time, err);
+    GF_OPTION_INIT("alert-time", priv->log_timeout, time, err);
+    GF_OPTION_INIT("volume-uuid", priv->volume_uuid, str, err);
+
+    this->local_pool = mem_pool_new(quota_local_t, 64);
+    if (!this->local_pool) {
+        ret = -1;
+        gf_msg(this->name, GF_LOG_ERROR, ENOMEM, Q_MSG_ENOMEM,
+               "failed to create local_t's memory pool");
+        goto err;
+    }
+
+    pthread_mutex_init(&priv->conn_mutex, NULL);
+    pthread_cond_init(&priv->conn_cond, NULL);
+    priv->conn_status = _gf_false;
+
+    if (priv->is_quota_on) { /* the enforcer rpc is only set up if quota is on */
+        rpc = quota_enforcer_init(this, this->options);
+        if (rpc == NULL) {
+            ret = -1;
+            gf_msg(this->name, GF_LOG_WARNING, 0,
+                   Q_MSG_QUOTA_ENFORCER_RPC_INIT_FAILED,
+                   "quota enforcer rpc init failed");
+            goto err;
+        }
+
+        LOCK(&priv->lock);
+        {
+            priv->rpc_clnt = rpc;
+        }
+        UNLOCK(&priv->lock);
+    }
 
-	if (_private) {
-		gf_quota_cache_sync (this);
-		this->private = NULL;
-	}
-	
-	return ;
+    ret = 0;
+err:
+    return ret; /* NOTE(review): priv is not released here on failure */
+}
+/* Re-read the tunable options; start or drop the enforcer rpc on toggle. */
+int
+reconfigure(xlator_t *this, dict_t *options)
+{
+    int32_t ret = -1;
+    quota_priv_t *priv = NULL;
+    gf_boolean_t quota_on = _gf_false;
+    rpc_clnt_t *rpc = NULL;
+
+    priv = this->private;
+
+    GF_OPTION_RECONF("deem-statfs", priv->consider_statfs, options, bool, out);
+    GF_OPTION_RECONF("server-quota", quota_on, options, bool, out);
+    GF_OPTION_RECONF("default-soft-limit", priv->default_soft_lim, options,
+                     percent, out);
+    GF_OPTION_RECONF("alert-time", priv->log_timeout, options, time, out);
+    GF_OPTION_RECONF("soft-timeout", priv->soft_timeout, options, time, out);
+    GF_OPTION_RECONF("hard-timeout", priv->hard_timeout, options, time, out);
+
+    if (quota_on) { /* NOTE(review): init below gets this->options, not options */
+        priv->rpc_clnt = quota_enforcer_init(this, this->options);
+        if (priv->rpc_clnt == NULL) {
+            ret = -1;
+            gf_msg(this->name, GF_LOG_WARNING, 0,
+                   Q_MSG_QUOTA_ENFORCER_RPC_INIT_FAILED,
+                   "quota enforcer rpc init failed");
+            goto out;
+        }
+
+    } else {
+        LOCK(&priv->lock);
+        {
+            rpc = priv->rpc_clnt;
+            priv->rpc_clnt = NULL;
+        }
+        UNLOCK(&priv->lock);
+
+        if (rpc != NULL) {
+            // Quotad is shut down when there is no started volume
+            // which has quota enabled. So, we should disable the
+            // enforcer client when quota is disabled on a volume,
+            // to avoid spurious reconnect attempts to a service
+            // (quotad) that is known to be down.
+            rpc_clnt_unref(rpc);
+        }
+    }
+
+    priv->is_quota_on = quota_on; /* set only once the rpc handling is done */
+
+    ret = 0;
+out:
+    return ret;
+}
+/* Private-state dump hook: opens the "xlators.features.quota.priv" section
+ * and writes the current settings into it. Nothing is written when the
+ * xlator has no private data or when TRY_LOCK(&priv->lock) fails.
+ * Always returns 0. */
+int32_t
+quota_priv_dump(xlator_t *this)
+{
+    quota_priv_t *priv = NULL;
+
+    GF_ASSERT(this);
+
+    priv = this->private;
+    if (priv == NULL)
+        return 0;
+
+    gf_proc_dump_add_section("xlators.features.quota.priv");
+
+    if (TRY_LOCK(&priv->lock) != 0)
+        return 0;
+
+    /* lock held from here until the UNLOCK below */
+    gf_proc_dump_write("soft-timeout", "%u", priv->soft_timeout);
+    gf_proc_dump_write("hard-timeout", "%u", priv->hard_timeout);
+    gf_proc_dump_write("alert-time", "%u", priv->log_timeout);
+    gf_proc_dump_write("quota-on", "%d", priv->is_quota_on);
+    gf_proc_dump_write("statfs", "%d", priv->consider_statfs);
+    gf_proc_dump_write("volume-uuid", "%s", priv->volume_uuid);
+    gf_proc_dump_write("validation-count", "%" PRIu64,
+                       priv->validation_count);
+    UNLOCK(&priv->lock);
+
+    return 0;
+}
+/* Release the enforcer rpc, the locks, the private struct and the local pool. */
+void
+fini(xlator_t *this)
+{
+    quota_priv_t *priv = NULL;
+    rpc_clnt_t *rpc = NULL;
+
+    priv = this->private;
+    if (!priv)
+        return;
+    rpc = priv->rpc_clnt;
+    priv->rpc_clnt = NULL;
+    if (rpc) {
+        rpc_clnt_connection_cleanup(&rpc->conn);
+        rpc_clnt_unref(rpc);
+    }
+
+    this->private = NULL; /* detached before priv is freed below */
+    LOCK_DESTROY(&priv->lock);
+    pthread_mutex_destroy(&priv->conn_mutex);
+    pthread_cond_destroy(&priv->conn_cond);
+
+    GF_FREE(priv);
+    if (this->local_pool) {
+        mem_pool_destroy(this->local_pool);
+        this->local_pool = NULL;
+    }
+    return;
 }
 
 struct xlator_fops fops = {
-	.create      = quota_create,
-	.open        = quota_open,
-	.truncate    = quota_truncate,
-	.ftruncate   = quota_ftruncate,
-	.writev      = quota_writev,
-	.unlink      = quota_unlink,
-	.rmdir       = quota_rmdir,
-	.mknod       = quota_mknod,
-	.mkdir       = quota_mkdir,
-	.symlink     = quota_symlink,
-	.statfs      = quota_statfs,
+    .statfs = quota_statfs, /* table of fops implemented by this xlator */
+    .lookup = quota_lookup,
+    .writev = quota_writev,
+    .create = quota_create,
+    .mkdir = quota_mkdir,
+    .truncate = quota_truncate,
+    .ftruncate = quota_ftruncate,
+    .unlink = quota_unlink,
+    .symlink = quota_symlink,
+    .link = quota_link,
+    .rename = quota_rename,
+    .getxattr = quota_getxattr,
+    .fgetxattr = quota_fgetxattr,
+    .stat = quota_stat,
+    .fstat = quota_fstat,
+    .readlink = quota_readlink,
+    .readv = quota_readv,
+    .fsync = quota_fsync,
+    .setattr = quota_setattr,
+    .fsetattr = quota_fsetattr,
+    .mknod = quota_mknod,
+    .setxattr = quota_setxattr,
+    .fsetxattr = quota_fsetxattr,
+    .removexattr = quota_removexattr,
+    .fremovexattr = quota_fremovexattr,
+    .readdirp = quota_readdirp,
+    .fallocate = quota_fallocate,
 };
 
-struct xlator_mops mops = {
-};
+struct xlator_cbks cbks = {.forget = quota_forget}; /* inode forget hook */
 
-struct xlator_cbks cbks = {
-	.release     = quota_release
+struct xlator_dumpops dumpops = {
+    .priv = quota_priv_dump, /* dumps the quota_priv_t settings */
 };
-
 struct volume_options options[] = {
-	{ .key  = {"min-free-disk-limit"}, 
-	  .type = GF_OPTION_TYPE_PERCENT
-	},
-	{ .key  = {"refresh-interval"}, 
-	  .type = GF_OPTION_TYPE_TIME
-	},
-	{ .key  = {"disk-usage-limit"}, 
-	  .type = GF_OPTION_TYPE_SIZET 
-	},
-	{ .key = {NULL} },
+    {
+        .key = {"enable"}, /* NOTE(review): not read by init()/reconfigure() */
+        .type = GF_OPTION_TYPE_BOOL,
+        .default_value = "off",
+        .description = "enable is the volume option that can be used "
+                       "to turn on quota.",
+        .op_version = {1},
+        .flags = OPT_FLAG_SETTABLE | OPT_FLAG_DOC,
+        .level = OPT_STATUS_BASIC,
+        .tags = {},
+    },
+    {
+        .key = {"deem-statfs"}, /* read into priv->consider_statfs */
+        .type = GF_OPTION_TYPE_BOOL,
+        .default_value = "on",
+        .description = "If set to on, it takes quota limits into"
+                       " consideration while estimating fs size. (df command)"
+                       " (Default is on).",
+        .op_version = {2},
+        .flags = OPT_FLAG_SETTABLE | OPT_FLAG_DOC,
+        .tags = {},
+    },
+    {
+        .key = {"server-quota"}, /* read into priv->is_quota_on */
+        .type = GF_OPTION_TYPE_BOOL,
+        .default_value = "off",
+        .description = "Skip the quota enforcement if the feature is"
+                       " not turned on. This is not a user exposed option.",
+        .flags = OPT_FLAG_NONE,
+    },
+    {
+        .key = {"default-soft-limit"}, /* read into priv->default_soft_lim */
+        .type = GF_OPTION_TYPE_PERCENT,
+        .default_value = "80%",
+        .op_version = {3},
+        .description = "Soft limit is expressed as a proportion of hard limit."
+                       " Default-soft-limit is the proportion used when the "
+                       " user does not supply any soft limit value.",
+        .flags = OPT_FLAG_SETTABLE | OPT_FLAG_DOC,
+        .tags = {},
+    },
+    {
+        .key = {"soft-timeout"}, /* read into priv->soft_timeout */
+        .type = GF_OPTION_TYPE_TIME,
+        .min = 0,
+        .max = 1800,
+        .default_value = "60",
+        .description = "quota caches the directory sizes on client. "
+                       "soft-timeout indicates the timeout for the validity of"
+                       " cache before soft-limit has been crossed.",
+        .op_version = {3},
+        .flags = OPT_FLAG_SETTABLE | OPT_FLAG_DOC,
+        .tags = {},
+    },
+    {
+        .key = {"hard-timeout"}, /* read into priv->hard_timeout */
+        .type = GF_OPTION_TYPE_TIME,
+        .min = 0,
+        .max = 60,
+        .default_value = "5",
+        .description = "quota caches the directory sizes on client. "
+                       "hard-timeout indicates the timeout for the validity of"
+                       " cache after soft-limit has been crossed.",
+        .op_version = {3},
+        .flags = OPT_FLAG_SETTABLE | OPT_FLAG_DOC,
+        .tags = {},
+    },
+    {.key = {"volume-uuid"}, /* read into priv->volume_uuid by init() only */
+     .type = GF_OPTION_TYPE_STR,
+     .default_value = "{{ volume.id }}",
+     .description = "uuid of the volume this brick is part of."},
+    {
+        .key = {"alert-time"}, /* read into priv->log_timeout */
+        .type = GF_OPTION_TYPE_TIME,
+        .min = 0,
+        .max = 7 * 86400, /* 7 days if the unit is seconds -- TODO confirm */
+        .default_value = "86400",
+        .op_version = {3},
+        .flags = OPT_FLAG_SETTABLE | OPT_FLAG_DOC,
+        .description = "Frequency of limit breach messages in log.",
+        .tags = {},
+    },
+    {.key = {NULL}}};
+xlator_api_t xlator_api = { /* entry points and tables of this translator */
+    .init = init,
+    .fini = fini,
+    .notify = notify,
+    .reconfigure = reconfigure,
+    .mem_acct_init = mem_acct_init,
+    .op_version = {1}, /* Present from the initial version */
+    .fops = &fops,
+    .cbks = &cbks,
+    .dumpops = &dumpops, /* wires up quota_priv_dump; was left unreferenced */
+    .options = options,
+    .identifier = "quota",
+    .category = GF_MAINTAINED,
+};
diff --git a/xlators/features/quota/src/quota.h b/xlators/features/quota/src/quota.h
new file mode 100644
index 00000000000..0395d78c9ef
--- /dev/null
+++ b/xlators/features/quota/src/quota.h
@@ -0,0 +1,266 @@
+/*
+   Copyright (c) 2008-2012 Red Hat, Inc. <http://www.redhat.com>
+   This file is part of GlusterFS.
+
+   This file is licensed to you under your choice of the GNU Lesser
+   General Public License, version 3 or any later version (LGPLv3 or
+   later), or the GNU General Public License, version 2 (GPLv2), in all
+   cases as published by the Free Software Foundation.
+*/
+#ifndef _QUOTA_H
+#define _QUOTA_H
+
+#include <glusterfs/call-stub.h>
+#include "quota-mem-types.h"
+#include <glusterfs/glusterfs.h>
+#include <glusterfs/compat.h>
+#include <glusterfs/logging.h>
+#include <glusterfs/dict.h>
+#include <glusterfs/gf-event.h>
+#include "rpcsvc.h"
+#include "rpc-clnt.h"
+#include <glusterfs/byte-order.h>
+#include "glusterfs3-xdr.h"
+#include "glusterfs3.h"
+#include "xdr-generic.h"
+#include <glusterfs/compat-errno.h>
+#include "protocol-common.h"
+#include <glusterfs/quota-common-utils.h>
+#include "quota-messages.h"
+
+/* String fragments for quota xattr names; DIRTY and CONTRIBUTION are pasted
+ * into key formats by the GET_*_KEY macros in this header. */
+#define DIRTY "dirty"
+#define SIZE "size"
+#define CONTRIBUTION "contri"
+/* NOTE(review): presumably the byte width of a stored value - confirm. */
+#define VAL_LENGTH 8
+/* NOTE(review): presumably a readdir buffer size in bytes - confirm. */
+#define READDIR_BUF 4096
+
+/* Fallback definition, used only when no earlier header provided one. */
+#ifndef UUID_CANONICAL_FORM_LEN
+#define UUID_CANONICAL_FORM_LEN 36
+#endif
+
+/* Jump to `label` when `is_quota_on` evaluates false.  The argument is
+ * parenthesised so that a compound expression is negated as a whole.
+ * NOTE: this expands to a bare `if` statement that already ends in `;`, so it
+ * must not be used as the unbraced body of an if/else. */
+#define WIND_IF_QUOTAOFF(is_quota_on, label)                                   \
+    if (!(is_quota_on))                                                        \
+        goto label;
+
+/* Jump to `label` when `xdata` is non-NULL and contains
+ * GLUSTERFS_INTERNAL_FOP_KEY. */
+#define QUOTA_WIND_FOR_INTERNAL_FOP(xdata, label)                              \
+    do {                                                                       \
+        if (xdata && dict_get_sizen(xdata, GLUSTERFS_INTERNAL_FOP_KEY))        \
+            goto label;                                                        \
+    } while (0)
+
+/* True when the size went from below `lim` (prev_size) to at or above it
+ * (cur_size).  `lim` is evaluated twice. */
+#define DID_REACH_LIMIT(lim, prev_size, cur_size)                              \
+    ((cur_size) >= (lim) && (prev_size) < (lim))
+
+/* Increment `var` between LOCK(lock) and UNLOCK(lock). */
+#define QUOTA_SAFE_INCREMENT(lock, var)                                        \
+    do {                                                                       \
+        LOCK(lock);                                                            \
+        var++;                                                                 \
+        UNLOCK(lock);                                                          \
+    } while (0)
+
+/* Decrement `var` between LOCK(lock) and UNLOCK(lock). */
+#define QUOTA_SAFE_DECREMENT(lock, var)                                        \
+    do {                                                                       \
+        LOCK(lock);                                                            \
+        var--;                                                                 \
+        UNLOCK(lock);                                                          \
+    } while (0)
+
+/* Allocate one zeroed `type` into `var`, accounted under gf_quota_mt_<type>.
+ * On failure it logs ENOMEM, sets the caller's local `ret` to -1 and jumps to
+ * `label`.  There is deliberately no `;` after while (0): the caller's own
+ * semicolon completes the statement, so the macro is safe inside if/else. */
+#define QUOTA_ALLOC_OR_GOTO(var, type, label)                                  \
+    do {                                                                       \
+        var = GF_CALLOC(1, sizeof(type), gf_quota_mt_##type);                  \
+        if (!var) {                                                            \
+            gf_msg("", GF_LOG_ERROR, ENOMEM, Q_MSG_ENOMEM, "out of memory");   \
+            ret = -1;                                                          \
+            goto label;                                                        \
+        }                                                                      \
+    } while (0)
+
+/* Detach frame->local, tail-wind via STACK_WIND_TAIL, then release the
+ * detached local with quota_local_cleanup() if there was one. */
+#define QUOTA_STACK_WIND_TAIL(frame, params...)                                \
+    do {                                                                       \
+        quota_local_t *_local = NULL;                                          \
+                                                                               \
+        if (frame) {                                                           \
+            _local = frame->local;                                             \
+            frame->local = NULL;                                               \
+        }                                                                      \
+                                                                               \
+        STACK_WIND_TAIL(frame, params);                                        \
+                                                                               \
+        if (_local)                                                            \
+            quota_local_cleanup(_local);                                       \
+    } while (0)
+
+/* Detach frame->local, unwind with STACK_UNWIND_STRICT, then hand the detached
+ * local (possibly NULL) to quota_local_cleanup(). */
+#define QUOTA_STACK_UNWIND(fop, frame, params...)                              \
+    do {                                                                       \
+        quota_local_t *_local = NULL;                                          \
+        if (frame) {                                                           \
+            _local = frame->local;                                             \
+            frame->local = NULL;                                               \
+        }                                                                      \
+        STACK_UNWIND_STRICT(fop, frame, params);                               \
+        quota_local_cleanup(_local);                                           \
+    } while (0)
+
+/* Unlink a contribution node from its contri_list and free it.  The argument
+ * is parenthesised so any pointer-valued expression may be passed; note that
+ * it is evaluated twice. */
+#define QUOTA_FREE_CONTRIBUTION_NODE(_contribution)                            \
+    do {                                                                       \
+        list_del(&(_contribution)->contri_list);                               \
+        GF_FREE(_contribution);                                                \
+    } while (0)
+
+/* Build a contribution xattr key into `var` with gf_asprintf():
+ *   QUOTA_XATTR_PREFIX "<vol>.<gfid>." CONTRIBUTION  when _gfid is non-NULL,
+ *   QUOTA_XATTR_PREFIX "<vol>.." CONTRIBUTION        otherwise.
+ * The gf_asprintf() result is stored in `_ret`. */
+#define GET_CONTRI_KEY(var, _vol_name, _gfid, _ret)                            \
+    do {                                                                       \
+        char _gfid_unparsed[40];                                               \
+        if (_gfid != NULL) {                                                   \
+            gf_uuid_unparse(_gfid, _gfid_unparsed);                            \
+            _ret = gf_asprintf(var, QUOTA_XATTR_PREFIX "%s.%s." CONTRIBUTION,  \
+                               _vol_name, _gfid_unparsed);                     \
+        } else {                                                               \
+            _ret = gf_asprintf(var, QUOTA_XATTR_PREFIX "%s.." CONTRIBUTION,    \
+                               _vol_name);                                     \
+        }                                                                      \
+    } while (0)
+
+/* GET_CONTRI_KEY that stores its result in the caller's local `ret` and jumps
+ * to `label` when that result is -1. */
+#define GET_CONTRI_KEY_OR_GOTO(var, _vol_name, _gfid, label)                   \
+    do {                                                                       \
+        GET_CONTRI_KEY(var, _vol_name, _gfid, ret);                            \
+        if (ret == -1)                                                         \
+            goto label;                                                        \
+    } while (0)
+
+/* Build QUOTA_XATTR_PREFIX "<vol>." DIRTY into `var`; the gf_asprintf() result
+ * lands in the caller's local `ret`, and -1 jumps to `label`. */
+#define GET_DIRTY_KEY_OR_GOTO(var, _vol_name, label)                           \
+    do {                                                                       \
+        ret = gf_asprintf(var, QUOTA_XATTR_PREFIX "%s." DIRTY, _vol_name);     \
+        if (ret == -1)                                                         \
+            goto label;                                                        \
+    } while (0)
+
+/* True for regular files and symlinks.  `ia_type` is evaluated twice. */
+#define QUOTA_REG_OR_LNK_FILE(ia_type) (IA_ISREG(ia_type) || IA_ISLNK(ia_type))
+
+/* A (name, parent) pair that can be chained into a list through `next`.
+ * NOTE(review): presumably one entry per link of an inode, kept on
+ * quota_inode_ctx.parents - confirm against quota.c. */
+struct quota_dentry {
+    char *name;
+    uuid_t par; /* presumably the parent directory's gfid - confirm */
+    struct list_head next;
+};
+typedef struct quota_dentry quota_dentry_t;
+
+/* Per-inode quota bookkeeping: cached size/count figures, the hard and soft
+ * limits for both space and object counts, and timestamps.
+ * NOTE(review): field semantics below are inferred from names - confirm
+ * against quota.c before relying on them. */
+struct quota_inode_ctx {
+    int64_t size;
+    int64_t hard_lim;
+    int64_t soft_lim;
+    int64_t file_count;
+    int64_t dir_count;
+    int64_t object_hard_lim;
+    int64_t object_soft_lim;
+    struct iatt buf;
+    struct list_head parents; /* list head; presumably of quota_dentry_t */
+    time_t validate_time;
+    time_t prev_log_time;
+    gf_boolean_t ancestry_built;
+    gf_lock_t lock;
+};
+typedef struct quota_inode_ctx quota_inode_ctx_t;
+
+/* Completion callback type accepted by quota_build_ancestry(); receives the
+ * parents list, the inode, an op_ret/op_errno pair and the opaque `data`. */
+typedef void (*quota_ancestry_built_t)(struct list_head *parents,
+                                       inode_t *inode, int32_t op_ret,
+                                       int32_t op_errno, void *data);
+
+/* Continuation type stored in quota_local.fop_continue_cbk. */
+typedef void (*quota_fop_continue_t)(call_frame_t *frame);
+
+/* Per-call scratch state; the QUOTA_STACK_* macros in this header keep it in
+ * frame->local and release it through quota_local_cleanup().
+ * NOTE(review): individual field roles are not visible from this header. */
+struct quota_local {
+    gf_lock_t lock;
+    uint32_t link_count;
+    loc_t loc;
+    loc_t oldloc;
+    loc_t newloc;
+    loc_t validate_loc;
+    int64_t delta;
+    int8_t object_delta;
+    int32_t op_ret;
+    int32_t op_errno;
+    int64_t size;
+    char just_validated;
+    fop_lookup_cbk_t validate_cbk;
+    quota_fop_continue_t fop_continue_cbk;
+    inode_t *inode;
+    uuid_t common_ancestor; /* Used by quota_rename */
+    call_stub_t *stub;
+    struct iobref *iobref;
+    quota_limits_t limit;
+    quota_limits_t object_limit;
+    int64_t space_available;
+    quota_ancestry_built_t ancestry_cbk;
+    void *ancestry_data; /* opaque pointer kept alongside ancestry_cbk */
+    dict_t *xdata;
+    dict_t *validate_xdata;
+    int32_t quotad_conn_retry;
+    xlator_t *this;
+    call_frame_t *par_frame;
+};
+typedef struct quota_local quota_local_t;
+
+/* Translator-private data (stored in xlator_t.private by the code in this
+ * patch): timeouts, feature switches and the RPC client/server handles. */
+struct quota_priv {
+    /* FIXME: consider time_t for timeouts. */
+    uint32_t soft_timeout;
+    uint32_t hard_timeout;
+    uint32_t log_timeout;
+    double default_soft_lim;
+    gf_boolean_t is_quota_on;
+    gf_boolean_t consider_statfs;
+    gf_lock_t lock;
+    rpc_clnt_prog_t *quota_enforcer;
+    struct rpcsvc_program *quotad_aggregator; /* set by quotad_aggregator_init */
+    struct rpc_clnt *rpc_clnt;
+    rpcsvc_t *rpcsvc; /* non-NULL once the aggregator listener exists */
+    inode_table_t *itable;
+    char *volume_uuid;
+    uint64_t validation_count;
+    int32_t quotad_conn_status;
+    pthread_mutex_t conn_mutex;
+    pthread_cond_t conn_cond;
+    gf_boolean_t conn_status;
+};
+typedef struct quota_priv quota_priv_t;
+
+/*
+ * Functions shared between the quota translator's source files.  Their
+ * definitions are not part of this header; see the corresponding .c files
+ * for the exact contracts.
+ */
+int
+quota_enforcer_lookup(call_frame_t *frame, xlator_t *this, dict_t *xdata,
+                      fop_lookup_cbk_t cbk);
+
+void
+_quota_enforcer_lookup(void *data);
+
+struct rpc_clnt *
+quota_enforcer_init(xlator_t *this, dict_t *options);
+
+void
+quota_log_usage(xlator_t *this, quota_inode_ctx_t *ctx, inode_t *inode,
+                int64_t delta);
+
+/* `ancestry_cbk` has the quota_ancestry_built_t signature; `data` is passed
+ * through to it. */
+int
+quota_build_ancestry(inode_t *inode, quota_ancestry_built_t ancestry_cbk,
+                     void *data);
+
+void
+quota_get_limit_dir(call_frame_t *frame, inode_t *cur_inode, xlator_t *this);
+
+int32_t
+quota_check_limit(call_frame_t *frame, inode_t *inode, xlator_t *this);
+
+inode_t *
+do_quota_check_limit(call_frame_t *frame, inode_t *inode, xlator_t *this,
+                     quota_dentry_t *dentry, gf_boolean_t force);
+int
+quota_fill_inodectx(xlator_t *this, inode_t *inode, dict_t *dict, loc_t *loc,
+                    struct iatt *buf, int32_t *op_errno);
+
+int32_t
+quota_check_size_limit(call_frame_t *frame, quota_inode_ctx_t *ctx,
+                       quota_priv_t *priv, inode_t *_inode, xlator_t *this,
+                       int32_t *op_errno, int just_validated, int64_t delta,
+                       quota_local_t *local, gf_boolean_t *skip_check);
+
+int32_t
+quota_check_object_limit(call_frame_t *frame, quota_inode_ctx_t *ctx,
+                         quota_priv_t *priv, inode_t *_inode, xlator_t *this,
+                         int32_t *op_errno, int just_validated,
+                         quota_local_t *local, gf_boolean_t *skip_check);
+#endif
diff --git a/xlators/features/quota/src/quotad-aggregator.c b/xlators/features/quota/src/quotad-aggregator.c
new file mode 100644
index 00000000000..75d47867b5b
--- /dev/null
+++ b/xlators/features/quota/src/quotad-aggregator.c
@@ -0,0 +1,494 @@
+/*
+   Copyright (c) 2008-2012 Red Hat, Inc. <http://www.redhat.com>
+   This file is part of GlusterFS.
+
+   This file is licensed to you under your choice of the GNU Lesser
+   General Public License, version 3 or any later version (LGPLv3 or
+   later), or the GNU General Public License, version 2 (GPLv2), in all
+   cases as published by the Free Software Foundation.
+*/
+
+#include "cli1-xdr.h"
+#include "quota.h"
+#include "quotad-helpers.h"
+#include "quotad-aggregator.h"
+
+/* NULL-terminated list of xattr keys that quotad_aggregator_lookup() copies
+ * from an incoming request into the xdata of the lookup it issues. */
+static char *qd_ext_xattrs[] = {
+    QUOTA_SIZE_KEY,
+    QUOTA_LIMIT_KEY,
+    QUOTA_LIMIT_OBJECTS_KEY,
+    NULL,
+};
+
+/* Forward declaration; the initialiser is at the end of this file. */
+static struct rpcsvc_program quotad_aggregator_prog;
+
+/*
+ * Encode `arg` with `xdrproc` into a freshly obtained iobuf and describe the
+ * encoded bytes through `outmsg`.
+ *
+ * Returns the iobuf holding the reply, or NULL when `req` is NULL, when there
+ * is nothing to encode (`arg` or `xdrproc` missing) or when no iobuf could be
+ * obtained.  An encoding failure still returns the iobuf, but marks the
+ * request with GARBAGE_ARGS and reports a zero-length message.
+ */
+struct iobuf *
+quotad_serialize_reply(rpcsvc_request_t *req, void *arg, struct iovec *outmsg,
+                       xdrproc_t xdrproc)
+{
+    struct iobuf *reply_buf = NULL;
+    ssize_t encoded_len = 0;
+    ssize_t needed = 0;
+
+    GF_VALIDATE_OR_GOTO("server", req, done);
+
+    if (!arg || !xdrproc) {
+        /* Nothing to serialise: empty message, no buffer. */
+        outmsg->iov_len = encoded_len;
+        goto done;
+    }
+
+    /* Size the io buffer from the XDR-encoded length of the reply. */
+    needed = xdr_sizeof(xdrproc, arg);
+    reply_buf = iobuf_get2(req->svc->ctx->iobuf_pool, needed);
+    if (!reply_buf) {
+        gf_log_callingfn(THIS->name, GF_LOG_ERROR, "Failed to get iobuf");
+        goto done;
+    }
+
+    iobuf_to_iovec(reply_buf, outmsg);
+
+    /* The length is kept signed because the serialiser signals an
+     * encoding error with -1. */
+    encoded_len = xdr_serialize_generic(*outmsg, arg, xdrproc);
+    if (encoded_len == -1) {
+        /* An encoding failure is not an RPC-level failure, but the
+         * client must still hear about it so no frame goes missing. */
+        gf_log_callingfn("", GF_LOG_ERROR, "Failed to encode message");
+        req->rpc_err = GARBAGE_ARGS;
+        encoded_len = 0;
+    }
+
+    outmsg->iov_len = encoded_len;
+done:
+    return reply_buf;
+}
+
+/*
+ * Serialise `arg` with `xdrproc`, submit it as the reply to `req`, and tear
+ * down the per-request resources.
+ *
+ * Regardless of outcome this releases the aggregator state found in
+ * frame->root->state and destroys the frame's stack (when `frame` is given),
+ * and drops the iobref if one had to be created here.
+ *
+ * Returns 0 on success; -1 when `req` is NULL or on allocation/serialisation
+ * failure; otherwise the non-zero result of rpcsvc_submit_generic().
+ */
+int
+quotad_aggregator_submit_reply(call_frame_t *frame, rpcsvc_request_t *req,
+                               void *arg, struct iovec *payload,
+                               int payloadcount, struct iobref *iobref,
+                               xdrproc_t xdrproc)
+{
+    struct iobuf *iob = NULL;
+    int ret = -1;
+    struct iovec rsp = {
+        0,
+    };
+    quotad_aggregator_state_t *state = NULL;
+    char new_iobref = 0;
+
+    /* Detach state and local before validating req, so the cleanup at
+     * the end of the function covers every exit path. */
+    if (frame) {
+        state = frame->root->state;
+        frame->local = NULL;
+    }
+
+    GF_VALIDATE_OR_GOTO("server", req, ret);
+
+    if (!iobref) {
+        iobref = iobref_new();
+        if (!iobref) {
+            goto ret;
+        }
+
+        new_iobref = 1;
+    }
+
+    iob = quotad_serialize_reply(req, arg, &rsp, xdrproc);
+    if (!iob) {
+        gf_msg("", GF_LOG_ERROR, 0, Q_MSG_DICT_SERIALIZE_FAIL,
+               "Failed to serialize reply");
+        goto ret;
+    }
+
+    iobref_add(iobref, iob);
+
+    /* Keep the submit result: it is this function's return value. */
+    ret = rpcsvc_submit_generic(req, &rsp, 1, payload, payloadcount, iobref);
+
+    iobuf_unref(iob);
+
+    if (ret)
+        gf_log_callingfn(THIS->name, GF_LOG_ERROR, "Failed to submit reply");
+ret:
+    if (state) {
+        quotad_aggregator_free_state(state);
+    }
+
+    if (frame)
+        STACK_DESTROY(frame->root);
+
+    if (new_iobref) {
+        iobref_unref(iobref);
+    }
+
+    return ret;
+}
+
+/*
+ * Completion callback for the GETLIMIT procedure: converts the nameless
+ * lookup response into a gf_cli_rsp and submits it for the request stored in
+ * frame->local.
+ *
+ * On success the lookup's xdata, tagged with the "type" value taken from the
+ * original request dictionary, is serialised into the reply.  On any failure
+ * the reply carries op_ret = -1, an errno and an empty (non-NULL) error
+ * string; a NULL string cannot be XDR-encoded.
+ *
+ * `lookup_rsp` may be NULL.  Always returns 0.
+ */
+int
+quotad_aggregator_getlimit_cbk(xlator_t *this, call_frame_t *frame,
+                               void *lookup_rsp)
+{
+    gfs3_lookup_rsp *rsp = lookup_rsp;
+    gf_cli_rsp cli_rsp = {
+        0,
+    };
+    dict_t *xdata = NULL;
+    quotad_aggregator_state_t *state = NULL;
+    int ret = -1;
+    int type = 0;
+
+    /* Start from a well-formed failure reply; only a fully successful
+     * pass through the code below turns it into a success. */
+    cli_rsp.op_ret = -1;
+    cli_rsp.op_errno = EINVAL;
+    cli_rsp.op_errstr = "";
+
+    if (!rsp)
+        goto reply;
+
+    if (rsp->op_ret == -1) {
+        cli_rsp.op_errno = rsp->op_errno;
+        goto reply;
+    }
+
+    GF_PROTOCOL_DICT_UNSERIALIZE(frame->this, xdata, (rsp->xdata.xdata_val),
+                                 (rsp->xdata.xdata_len), rsp->op_ret,
+                                 rsp->op_errno, out);
+
+    if (xdata) {
+        state = frame->root->state;
+        ret = dict_get_int32n(state->req_xdata, "type", SLEN("type"), &type);
+        if (ret < 0)
+            goto out;
+
+        ret = dict_set_int32_sizen(xdata, "type", type);
+        if (ret < 0)
+            goto out;
+    }
+
+    ret = 0;
+out:
+    rsp->op_ret = ret;
+    if (ret) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, Q_MSG_DICT_UNSERIALIZE_FAIL,
+               "failed to unserialize "
+               "nameless lookup rsp");
+        goto reply;
+    }
+
+    cli_rsp.op_errno = rsp->op_errno;
+    if (xdata) {
+        GF_PROTOCOL_DICT_SERIALIZE(frame->this, xdata, (&cli_rsp.dict.dict_val),
+                                   (cli_rsp.dict.dict_len), cli_rsp.op_errno,
+                                   reply);
+    }
+    /* Report success only once the payload has been serialised. */
+    cli_rsp.op_ret = rsp->op_ret;
+
+reply:
+    quotad_aggregator_submit_reply(frame, (frame) ? frame->local : NULL,
+                                   (void *)&cli_rsp, NULL, 0, NULL,
+                                   (xdrproc_t)xdr_gf_cli_rsp);
+
+    /* xdata stays NULL on the early-exit paths. */
+    if (xdata)
+        dict_unref(xdata);
+    GF_FREE(cli_rsp.dict.dict_val);
+    return 0;
+}
+
+/*
+ * RPC actor for GF_AGGREGATOR_GETLIMIT.
+ *
+ * Decodes the gf_cli_req, unserialises its dictionary, reads the "gfid" and
+ * "volume-uuid" entries and issues a nameless lookup that asks for the quota
+ * limit, object-limit, size and ancestry-path keys.  The reply is produced
+ * asynchronously by quotad_aggregator_getlimit_cbk().
+ *
+ * Returns the result of qd_nameless_lookup() on the normal path.  On a
+ * failure the callback is invoked directly with op_ret = -1 and the failing
+ * step's return value is returned.
+ */
+int
+quotad_aggregator_getlimit(rpcsvc_request_t *req)
+{
+    call_frame_t *frame = NULL;
+    gf_cli_req cli_req = {
+        {0},
+    };
+    gf_cli_rsp cli_rsp = {0};
+    quotad_aggregator_state_t *state = NULL;
+    xlator_t *this = NULL;
+    dict_t *dict = NULL;
+    int ret = -1, op_errno = 0;
+    char *gfid_str = NULL;
+    uuid_t gfid = {0};
+    char *volume_uuid = NULL;
+
+    GF_VALIDATE_OR_GOTO("quotad-aggregator", req, err);
+
+    this = THIS;
+
+    /* NOTE(review): the stack buffer is sized by the incoming message
+     * length; presumably bounded by the RPC layer - confirm. */
+    cli_req.dict.dict_val = alloca(req->msg[0].iov_len);
+
+    ret = xdr_to_generic(req->msg[0], &cli_req, (xdrproc_t)xdr_gf_cli_req);
+    if (ret < 0) {
+        // failed to decode msg;
+        gf_msg(this->name, GF_LOG_ERROR, 0, Q_MSG_XDR_DECODE_ERROR,
+               "xdr decoding error");
+        req->rpc_err = GARBAGE_ARGS;
+        goto err;
+    }
+
+    if (cli_req.dict.dict_len) {
+        dict = dict_new();
+        ret = dict_unserialize(cli_req.dict.dict_val, cli_req.dict.dict_len,
+                               &dict);
+        if (ret < 0) {
+            gf_msg(this->name, GF_LOG_ERROR, 0, Q_MSG_DICT_UNSERIALIZE_FAIL,
+                   "Failed to unserialize req-buffer to "
+                   "dictionary");
+            goto err;
+        }
+    }
+
+    ret = dict_get_strn(dict, "gfid", SLEN("gfid"), &gfid_str);
+    if (ret) {
+        goto err;
+    }
+
+    ret = dict_get_strn(dict, "volume-uuid", SLEN("volume-uuid"), &volume_uuid);
+    if (ret) {
+        goto err;
+    }
+
+    gf_uuid_parse((const char *)gfid_str, gfid);
+
+    frame = quotad_aggregator_get_frame_from_req(req);
+    if (frame == NULL) {
+        cli_rsp.op_errno = ENOMEM;
+        goto errx;
+    }
+    state = frame->root->state;
+    /* The request dictionary now belongs to the state; clearing `dict`
+     * keeps the error path from dropping the reference a second time. */
+    state->req_xdata = dict;
+    state->xdata = dict_new();
+    dict = NULL;
+
+    /* Only the presence of these keys is requested; 42 is a dummy value. */
+    ret = dict_set_int32_sizen(state->xdata, QUOTA_LIMIT_KEY, 42);
+    if (ret)
+        goto err;
+
+    ret = dict_set_int32_sizen(state->xdata, QUOTA_LIMIT_OBJECTS_KEY, 42);
+    if (ret) {
+        gf_msg(this->name, GF_LOG_ERROR, ENOMEM, Q_MSG_ENOMEM,
+               "Failed to set QUOTA_LIMIT_OBJECTS_KEY");
+        goto err;
+    }
+
+    ret = dict_set_int32_sizen(state->xdata, QUOTA_SIZE_KEY, 42);
+    if (ret)
+        goto err;
+
+    ret = dict_set_int32_sizen(state->xdata, GET_ANCESTRY_PATH_KEY, 42);
+    if (ret)
+        goto err;
+
+    ret = qd_nameless_lookup(this, frame, (char *)gfid, state->xdata,
+                             volume_uuid, quotad_aggregator_getlimit_cbk);
+    if (ret) {
+        cli_rsp.op_errno = ret;
+        goto errx;
+    }
+
+    return ret;
+
+err:
+    cli_rsp.op_errno = op_errno;
+errx:
+    cli_rsp.op_ret = -1;
+    cli_rsp.op_errstr = "";
+
+    /* NOTE(review): a gf_cli_rsp is passed where the callback expects a
+     * gfs3_lookup_rsp; this relies on both starting with op_ret and
+     * op_errno. */
+    quotad_aggregator_getlimit_cbk(this, frame, &cli_rsp);
+    if (dict)
+        dict_unref(dict);
+    return ret;
+}
+
+/*
+ * Completion callback for the LOOKUP procedure: forwards `rsp` as a
+ * gfs3_lookup_rsp reply to the request kept in frame->local (no request when
+ * `frame` is NULL).  Always returns 0.
+ */
+int
+quotad_aggregator_lookup_cbk(xlator_t *this, call_frame_t *frame, void *rsp)
+{
+    rpcsvc_request_t *request = NULL;
+
+    if (frame)
+        request = frame->local;
+
+    quotad_aggregator_submit_reply(frame, request, rsp, NULL, 0, NULL,
+                                   (xdrproc_t)xdr_gfs3_lookup_rsp);
+
+    return 0;
+}
+
+/*
+ * RPC actor for GF_AGGREGATOR_LOOKUP.
+ *
+ * Decodes the gfs3_lookup_req, unserialises its xdata, reads "volume-uuid"
+ * from it, copies whichever qd_ext_xattrs keys the request carries into a
+ * fresh dictionary and issues a nameless lookup on args.gfid.  The reply is
+ * produced asynchronously by quotad_aggregator_lookup_cbk().
+ *
+ * On failure the callback is invoked directly with op_ret = -1 and the errno
+ * describing the failing step, and the failing step's return value is
+ * returned.
+ */
+int
+quotad_aggregator_lookup(rpcsvc_request_t *req)
+{
+    call_frame_t *frame = NULL;
+    gfs3_lookup_req args = {
+        {
+            0,
+        },
+    };
+    int i = 0, ret = -1, op_errno = 0;
+    gfs3_lookup_rsp rsp = {
+        0,
+    };
+    quotad_aggregator_state_t *state = NULL;
+    xlator_t *this = NULL;
+    dict_t *dict = NULL;
+    char *volume_uuid = NULL;
+
+    GF_VALIDATE_OR_GOTO("quotad-aggregator", req, err);
+
+    this = THIS;
+
+    /* NOTE(review): stack buffers are sized by the incoming message
+     * length; presumably bounded by the RPC layer - confirm. */
+    args.bname = alloca(req->msg[0].iov_len);
+    args.xdata.xdata_val = alloca(req->msg[0].iov_len);
+
+    /* Every failure records its errno in op_errno; the err: path copies
+     * it into the reply, so it must not be stored in rsp directly. */
+    ret = xdr_to_generic(req->msg[0], &args, (xdrproc_t)xdr_gfs3_lookup_req);
+    if (ret < 0) {
+        op_errno = EINVAL;
+        goto err;
+    }
+
+    frame = quotad_aggregator_get_frame_from_req(req);
+    if (frame == NULL) {
+        op_errno = ENOMEM;
+        goto err;
+    }
+
+    state = frame->root->state;
+
+    GF_PROTOCOL_DICT_UNSERIALIZE(this, dict, (args.xdata.xdata_val),
+                                 (args.xdata.xdata_len), ret, op_errno, err);
+
+    ret = dict_get_str(dict, "volume-uuid", &volume_uuid);
+    if (ret) {
+        op_errno = EINVAL;
+        goto err;
+    }
+
+    state->xdata = dict_new();
+    if (!state->xdata) {
+        ret = -1;
+        op_errno = ENOMEM;
+        goto err;
+    }
+
+    for (i = 0; qd_ext_xattrs[i]; i++) {
+        if (dict_get(dict, qd_ext_xattrs[i])) {
+            ret = dict_set_uint32(state->xdata, qd_ext_xattrs[i], 1);
+            if (ret < 0) {
+                op_errno = ENOMEM;
+                goto err;
+            }
+        }
+    }
+
+    ret = qd_nameless_lookup(this, frame, args.gfid, state->xdata, volume_uuid,
+                             quotad_aggregator_lookup_cbk);
+    if (ret) {
+        op_errno = ret;
+        goto err;
+    }
+
+    if (dict)
+        dict_unref(dict);
+
+    return ret;
+
+err:
+    rsp.op_ret = -1;
+    rsp.op_errno = op_errno;
+
+    quotad_aggregator_lookup_cbk(this, frame, &rsp);
+    if (dict)
+        dict_unref(dict);
+
+    return ret;
+}
+
+/*
+ * rpcsvc event hook.  No event needs any action at present; the only effect
+ * is a warning when the hook is invoked without `xl` or `data`.
+ * Always returns 0.
+ */
+int
+quotad_aggregator_rpc_notify(rpcsvc_t *rpc, void *xl, rpcsvc_event_t event,
+                             void *data)
+{
+    if (xl && data) {
+        switch (event) {
+            case RPCSVC_EVENT_ACCEPT:
+            case RPCSVC_EVENT_DISCONNECT:
+            default:
+                /* nothing to do for any event yet */
+                break;
+        }
+    } else {
+        gf_log_callingfn("server", GF_LOG_WARNING,
+                         "Calling rpc_notify without initializing");
+    }
+
+    return 0;
+}
+
+/*
+ * Create the aggregator's RPC service on a unix-domain socket and register
+ * the aggregator program with it.  Does nothing when priv->rpcsvc already
+ * exists.
+ *
+ * Returns 0 on success (or when already initialised); non-zero on failure,
+ * in which case priv->rpcsvc is reset to NULL.
+ */
+int
+quotad_aggregator_init(xlator_t *this)
+{
+    quota_priv_t *priv = NULL;
+    int ret = -1;
+
+    priv = this->private;
+
+    if (priv->rpcsvc) {
+        /* Listener already created */
+        return 0;
+    }
+
+    /* Force the transport options the listener is created from. */
+    ret = dict_set_nstrn(this->options, "transport.address-family",
+                         SLEN("transport.address-family"), "unix",
+                         SLEN("unix"));
+    if (ret)
+        goto out;
+
+    ret = dict_set_nstrn(this->options, "transport-type",
+                         SLEN("transport-type"), "socket", SLEN("socket"));
+    if (ret)
+        goto out;
+
+    ret = dict_set_nstrn(this->options, "transport.socket.listen-path",
+                         SLEN("transport.socket.listen-path"),
+                         "/var/run/gluster/quotad.socket",
+                         SLEN("/var/run/gluster/quotad.socket"));
+    if (ret)
+        goto out;
+
+    /* RPC related */
+    priv->rpcsvc = rpcsvc_init(this, this->ctx, this->options, 0);
+    if (priv->rpcsvc == NULL) {
+        gf_msg(this->name, GF_LOG_WARNING, 0, Q_MSG_RPCSVC_INIT_FAILED,
+               "creation of rpcsvc failed");
+        ret = -1;
+        goto out;
+    }
+
+    /* Anything below 1 is treated as "no listener created". */
+    ret = rpcsvc_create_listeners(priv->rpcsvc, this->options, this->name);
+    if (ret < 1) {
+        gf_msg(this->name, GF_LOG_WARNING, 0,
+               Q_MSG_RPCSVC_LISTENER_CREATION_FAILED,
+               "creation of listener failed");
+        ret = -1;
+        goto out;
+    }
+
+    priv->quotad_aggregator = &quotad_aggregator_prog;
+    quotad_aggregator_prog.options = this->options;
+
+    ret = rpcsvc_program_register(priv->rpcsvc, &quotad_aggregator_prog,
+                                  _gf_false);
+    if (ret) {
+        gf_msg(this->name, GF_LOG_WARNING, 0, Q_MSG_RPCSVC_REGISTER_FAILED,
+               "registration of program (name:%s, prognum:%d, "
+               "progver:%d) failed",
+               quotad_aggregator_prog.progname, quotad_aggregator_prog.prognum,
+               quotad_aggregator_prog.progver);
+        goto out;
+    }
+
+    ret = 0;
+out:
+    /* NOTE(review): the service object is released with a plain GF_FREE;
+     * listeners or other resources it owns may need a dedicated rpcsvc
+     * teardown call - confirm. */
+    if (ret && priv->rpcsvc) {
+        GF_FREE(priv->rpcsvc);
+        priv->rpcsvc = NULL;
+    }
+
+    return ret;
+}
+
+/* Actor table indexed by procedure number.  The fourth initialiser of each
+ * entry (presumably the actor's procedure number - confirm against
+ * rpcsvc_actor_t) names the entry's own procedure. */
+static rpcsvc_actor_t quotad_aggregator_actors[GF_AGGREGATOR_MAXVALUE] = {
+    [GF_AGGREGATOR_NULL] = {"NULL", NULL, NULL, GF_AGGREGATOR_NULL, DRC_NA, 0},
+    [GF_AGGREGATOR_LOOKUP] = {"LOOKUP", quotad_aggregator_lookup, NULL,
+                              GF_AGGREGATOR_LOOKUP, DRC_NA, 0},
+    [GF_AGGREGATOR_GETLIMIT] = {"GETLIMIT", quotad_aggregator_getlimit, NULL,
+                                GF_AGGREGATOR_GETLIMIT, DRC_NA, 0},
+};
+
+/* RPC program descriptor registered by quotad_aggregator_init(); its
+ * `options` member is filled in there before registration. */
+static struct rpcsvc_program quotad_aggregator_prog = {
+    .progname = "GlusterFS 3.3",
+    .prognum = GLUSTER_AGGREGATOR_PROGRAM,
+    .progver = GLUSTER_AGGREGATOR_VERSION,
+    .numactors = GF_AGGREGATOR_MAXVALUE,
+    .actors = quotad_aggregator_actors};
diff --git a/xlators/features/quota/src/quotad-aggregator.h b/xlators/features/quota/src/quotad-aggregator.h
new file mode 100644
index 00000000000..706592c7d50
--- /dev/null
+++ b/xlators/features/quota/src/quotad-aggregator.h
@@ -0,0 +1,38 @@
+/*
+   Copyright (c) 2008-2012 Red Hat, Inc. <http://www.redhat.com>
+   This file is part of GlusterFS.
+
+   This file is licensed to you under your choice of the GNU Lesser
+   General Public License, version 3 or any later version (LGPLv3 or
+   later), or the GNU General Public License, version 2 (GPLv2), in all
+   cases as published by the Free Software Foundation.
+*/
+
+#ifndef _QUOTAD_AGGREGATOR_H
+#define _QUOTAD_AGGREGATOR_H
+
+#include "quota.h"
+#include <glusterfs/stack.h>
+#include "glusterfs3-xdr.h"
+#include <glusterfs/inode.h>
+
+/* Per-request aggregator state.  It is attached to frame->root->state when a
+ * frame is allocated for a request and released together with both
+ * dictionaries by quotad_aggregator_free_state(). */
+typedef struct {
+    void *pool;
+    xlator_t *this;
+    xlator_t *active_subvol;
+    inode_table_t *itable;
+    loc_t loc;
+    dict_t *xdata;     /* xdata passed to the nameless lookup */
+    dict_t *req_xdata; /* dictionary received with a GETLIMIT request */
+} quotad_aggregator_state_t;
+
+/* Completion callback handed to qd_nameless_lookup(); `rsp` is passed as an
+ * untyped pointer to the lookup response. */
+typedef int (*quotad_aggregator_lookup_cbk_t)(xlator_t *this,
+                                              call_frame_t *frame, void *rsp);
+/* Start a nameless lookup on `gfid` for the volume named by `volume_uuid`;
+ * `lookup_cbk` receives the response.  Defined outside this header. */
+int
+qd_nameless_lookup(xlator_t *this, call_frame_t *frame, char *gfid,
+                   dict_t *xdata, char *volume_uuid,
+                   quotad_aggregator_lookup_cbk_t lookup_cbk);
+/* Set up the aggregator's RPC listener; returns 0 on success. */
+int
+quotad_aggregator_init(xlator_t *this);
+
+#endif
diff --git a/xlators/features/quota/src/quotad-helpers.c b/xlators/features/quota/src/quotad-helpers.c
new file mode 100644
index 00000000000..51ff1d7e98d
--- /dev/null
+++ b/xlators/features/quota/src/quotad-helpers.c
@@ -0,0 +1,107 @@
+/*
+   Copyright (c) 2008-2012 Red Hat, Inc. <http://www.redhat.com>
+   This file is part of GlusterFS.
+
+   This file is licensed to you under your choice of the GNU Lesser
+   General Public License, version 3 or any later version (LGPLv3 or
+   later), or the GNU General Public License, version 2 (GPLv2), in all
+   cases as published by the Free Software Foundation.
+*/
+
+#include "quotad-helpers.h"
+
+/*
+ * Allocate and populate the per-request aggregator state for `this`.
+ *
+ * The first child of `this` is recorded as the active subvolume (read under
+ * priv->lock); an inode table of 4096 entries is created for it if it has
+ * none yet.  `req` is currently unused.
+ *
+ * Returns the new state, or NULL when the allocation fails.
+ */
+quotad_aggregator_state_t *
+get_quotad_aggregator_state(xlator_t *this, rpcsvc_request_t *req)
+{
+    quotad_aggregator_state_t *st = NULL;
+    xlator_t *child = NULL;
+    quota_priv_t *conf = NULL;
+
+    st = (void *)GF_CALLOC(1, sizeof(*st), gf_quota_mt_aggregator_state_t);
+    if (st == NULL)
+        return NULL;
+
+    st->this = THIS;
+    st->pool = this->ctx->pool;
+    conf = this->private;
+
+    LOCK(&conf->lock);
+    {
+        child = FIRST_CHILD(this);
+        st->active_subvol = child;
+    }
+    UNLOCK(&conf->lock);
+
+    /* Create the subvolume's inode table on first use. */
+    if (!child->itable)
+        child->itable = inode_table_new(4096, child);
+
+    st->itable = child->itable;
+
+    return st;
+}
+
+/*
+ * Release a state object: drop the references on its two dictionaries (when
+ * set) and free the structure itself.  `state` must not be NULL.
+ */
+void
+quotad_aggregator_free_state(quotad_aggregator_state_t *state)
+{
+    dict_t *lookup_xdata = state->xdata;
+    dict_t *request_xdata = state->req_xdata;
+
+    if (lookup_xdata != NULL)
+        dict_unref(lookup_xdata);
+
+    if (request_xdata != NULL)
+        dict_unref(request_xdata);
+
+    GF_FREE(state);
+}
+
+/*
+ * Create a call frame for `req` on the service's translator and attach a new
+ * aggregator state to frame->root->state.
+ *
+ * Returns the frame, or NULL when `req` (or its transport, service or
+ * context) is missing, or when the frame or the state cannot be allocated.
+ * A frame is never returned without state.
+ */
+call_frame_t *
+quotad_aggregator_alloc_frame(rpcsvc_request_t *req)
+{
+    call_frame_t *frame = NULL;
+    quotad_aggregator_state_t *state = NULL;
+    xlator_t *this = NULL;
+
+    GF_VALIDATE_OR_GOTO("server", req, out);
+    GF_VALIDATE_OR_GOTO("server", req->trans, out);
+    GF_VALIDATE_OR_GOTO("server", req->svc, out);
+    GF_VALIDATE_OR_GOTO("server", req->svc->ctx, out);
+
+    this = req->svc->xl;
+
+    frame = create_frame(this, req->svc->ctx->pool);
+    if (!frame)
+        goto out;
+
+    state = get_quotad_aggregator_state(this, req);
+    if (!state) {
+        /* Callers dereference frame->root->state unconditionally, so
+         * a state-less frame must not escape: destroy it and report
+         * the failure. */
+        STACK_DESTROY(frame->root);
+        frame = NULL;
+        goto out;
+    }
+
+    frame->root->state = state;
+
+    frame->this = this;
+out:
+    return frame;
+}
+
+/*
+ * Allocate a frame for `req` and stamp it with the request's procedure
+ * number, uid/gid/pid and lock owner.  The request itself is kept in
+ * frame->local.
+ *
+ * Returns the frame, or NULL when `req` is NULL or allocation fails.
+ */
+call_frame_t *
+quotad_aggregator_get_frame_from_req(rpcsvc_request_t *req)
+{
+    call_frame_t *new_frame = NULL;
+
+    GF_VALIDATE_OR_GOTO("server", req, out);
+
+    new_frame = quotad_aggregator_alloc_frame(req);
+    if (new_frame) {
+        new_frame->root->op = req->procnum;
+
+        /* credentials of the caller */
+        new_frame->root->uid = req->uid;
+        new_frame->root->gid = req->gid;
+        new_frame->root->pid = req->pid;
+
+        new_frame->root->lk_owner = req->lk_owner;
+
+        new_frame->local = req;
+    }
+out:
+    return new_frame;
+}
diff --git a/xlators/features/quota/src/quotad-helpers.h b/xlators/features/quota/src/quotad-helpers.h
new file mode 100644
index 00000000000..bcb39fe845e
--- /dev/null
+++ b/xlators/features/quota/src/quotad-helpers.h
@@ -0,0 +1,24 @@
+/*
+   Copyright (c) 2008-2012 Red Hat, Inc. <http://www.redhat.com>
+   This file is part of GlusterFS.
+
+   This file is licensed to you under your choice of the GNU Lesser
+   General Public License, version 3 or any later version (LGPLv3 or
+   later), or the GNU General Public License, version 2 (GPLv2), in all
+   cases as published by the Free Software Foundation.
+*/
+
+#ifndef QUOTAD_HELPERS_H
+#define QUOTAD_HELPERS_H
+
+#include "rpcsvc.h"
+#include "quota.h"
+#include "quotad-aggregator.h"
+
+/* Drop the references held by `state` on its dictionaries and free it. */
+void
+quotad_aggregator_free_state(quotad_aggregator_state_t *state);
+
+/* Build a call frame for `req`, carrying the request's credentials and the
+ * request itself in frame->local; NULL on failure. */
+call_frame_t *
+quotad_aggregator_get_frame_from_req(rpcsvc_request_t *req);
+
+#endif
diff --git a/xlators/features/quota/src/quotad.c b/xlators/features/quota/src/quotad.c
new file mode 100644
index 00000000000..643f25c9c2a
--- /dev/null
+++ b/xlators/features/quota/src/quotad.c
@@ -0,0 +1,245 @@
+/*
+   Copyright (c) 2013 Red Hat, Inc. <http://www.redhat.com>
+   This file is part of GlusterFS.
+
+   This file is licensed to you under your choice of the GNU Lesser
+   General Public License, version 3 or any later version (LGPLv3 or
+   later), or the GNU General Public License, version 2 (GPLv2), in all
+   cases as published by the Free Software Foundation.
+*/
+#include "quota.h"
+#include "quotad-aggregator.h"
+
+int
+qd_notify(xlator_t *this, int32_t event, void *data, ...)
+{
+    /* GF_EVENT_PARENT_UP is the only event that starts the aggregator. */
+    if (event == GF_EVENT_PARENT_UP)
+        quotad_aggregator_init(this);
+
+    /* Every event, PARENT_UP included, also gets the default handling. */
+    default_notify(this, event, data);
+    return 0;
+}
+
+/* Set up memory accounting for quotad's allocation types. */
+int32_t
+mem_acct_init(xlator_t *this)
+{
+    int status;
+
+    /* Without a translator instance there is nothing to account against. */
+    if (this == NULL)
+        return -1;
+
+    status = xlator_mem_acct_init(this, gf_quota_mt_end + 1);
+    if (status != 0)
+        gf_log(this->name, GF_LOG_WARNING,
+               "Memory accounting "
+               "init failed");
+
+    /* Success and failure both report xlator_mem_acct_init()'s result. */
+    return status;
+}
+
+int32_t
+qd_lookup_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret,
+              int32_t op_errno, inode_t *inode, struct iatt *buf, dict_t *xdata,
+              struct iatt *postparent)
+{
+    /* The aggregator's reply callback travels as the cookie. */
+    quotad_aggregator_lookup_cbk_t reply_fn = cookie;
+    gfs3_lookup_rsp reply = {
+        .op_ret = op_ret,
+        .op_errno = op_errno,
+    };
+
+    gf_stat_from_iatt(&reply.postparent, postparent);
+
+    /* The macro is handed reply.op_errno and the 'out' label, so it can
+     * bail out early; reply.stat is therefore only filled in when control
+     * falls through to the statement after it. */
+    GF_PROTOCOL_DICT_SERIALIZE(this, xdata, (&reply.xdata.xdata_val),
+                               reply.xdata.xdata_len, reply.op_errno, out);
+
+    gf_stat_from_iatt(&reply.stat, buf);
+
+out:
+    reply_fn(this, frame, &reply);
+
+    /* Free the serialized dict buffer and drop the inode reference. */
+    GF_FREE(reply.xdata.xdata_val);
+    inode_unref(inode);
+
+    return 0;
+}
+
+xlator_t *
+qd_find_subvol(xlator_t *this, char *volume_uuid)
+{
+    xlator_list_t *child = NULL;
+    char key[1024];
+    int keylen = 0;
+    char *optstr = NULL;
+
+    if (!this || !volume_uuid)
+        return NULL;
+
+    /* Return the child whose "<name>.volume-id" option equals volume_uuid. */
+    for (child = this->children; child; child = child->next) {
+        keylen = snprintf(key, sizeof(key), "%s.volume-id",
+                          child->xlator->name);
+        /* Skip on error or truncation so keylen never exceeds key. */
+        if (keylen < 0 || keylen >= (int)sizeof(key))
+            continue;
+        if (dict_get_strn(this->options, key, keylen, &optstr) < 0)
+            continue;
+        if (strcmp(optstr, volume_uuid) == 0)
+            return child->xlator;
+    }
+
+    /* No child is configured with this volume id (or an argument was NULL). */
+    return NULL;
+}
+
+/* Wind a gfid-based LOOKUP to the child that serves volume_uuid; the result
+ * reaches lookup_cbk through qd_lookup_cbk. On any local failure lookup_cbk
+ * is called directly with op_ret = -1 and the errno of the cause.
+ * Always returns 0. */
+int
+qd_nameless_lookup(xlator_t *this, call_frame_t *frame, char *gfid,
+                   dict_t *xdata, char *volume_uuid,
+                   quotad_aggregator_lookup_cbk_t lookup_cbk)
+{
+    gfs3_lookup_rsp rsp = {0};
+    int op_errno = 0, ret = -1;
+    loc_t loc = {0};
+    quotad_aggregator_state_t *state = NULL;
+    xlator_t *subvol = NULL;
+
+    state = frame->root->state;
+    frame->root->op = GF_FOP_LOOKUP;
+
+    loc.inode = inode_new(state->itable);
+    if (loc.inode == NULL) {
+        op_errno = ENOMEM;
+        goto out;
+    }
+
+    memcpy(loc.gfid, gfid, 16);
+
+    ret = dict_set_int8(xdata, QUOTA_READ_ONLY_KEY, 1);
+    if (ret < 0) {
+        gf_msg(this->name, GF_LOG_WARNING, ENOMEM, Q_MSG_ENOMEM,
+               "dict set failed");
+        /* Without this the error reply would carry op_errno == 0. */
+        op_errno = ENOMEM;
+        goto out;
+    }
+
+    subvol = qd_find_subvol(this, volume_uuid);
+    if (subvol == NULL) {
+        op_errno = EINVAL;
+        goto out;
+    }
+
+    STACK_WIND_COOKIE(frame, qd_lookup_cbk, lookup_cbk, subvol,
+                      subvol->fops->lookup, &loc, xdata);
+    return 0;
+
+out:
+    rsp.op_ret = -1;
+    rsp.op_errno = op_errno;
+
+    lookup_cbk(this, frame, &rsp);
+
+    inode_unref(loc.inode);
+    return 0;
+}
+
+int
+qd_reconfigure(xlator_t *this, dict_t *options)
+{
+    /* No-op: as of now quotad is restarted whenever its volfile is altered. */
+    return 0;
+}
+
+/* Tear down quotad's private data: free the rpcsvc pointer, destroy the lock
+ * initialised in qd_init() and release the structure itself. */
+void
+qd_fini(xlator_t *this)
+{
+    quota_priv_t *priv = NULL;
+
+    if (this == NULL || this->private == NULL)
+        return;
+
+    priv = this->private;
+    /* Detach first so this->private never points at freed memory. */
+    this->private = NULL;
+
+    if (priv->rpcsvc)
+        GF_FREE(priv->rpcsvc);
+
+    LOCK_DESTROY(&priv->lock);
+    GF_FREE(priv);
+}
+
+int32_t
+qd_init(xlator_t *this)
+{
+    quota_priv_t *priv = NULL;
+    /* NOTE(review): keep the name 'ret'; the alloc macro may use it - confirm. */
+    int32_t ret = -1;
+
+    /* At least one child translator has to be configured. */
+    if (!this->children) {
+        gf_log(this->name, GF_LOG_ERROR,
+               "FATAL: quota (%s) not configured for min of 1 child",
+               this->name);
+        goto err;
+    }
+
+    QUOTA_ALLOC_OR_GOTO(priv, quota_priv_t, err);
+    LOCK_INIT(&priv->lock);
+    this->private = priv;
+    ret = 0;
+
+err:
+    /* Failure paths arrive before priv is stored in this->private. */
+    if (ret != 0)
+        GF_FREE(priv);
+    return ret;
+}
+
+struct xlator_fops fops = {}; /* empty: quotad.c registers no fop handlers */
+
+struct xlator_cbks cbks = {}; /* empty: no callbacks registered either */
+
+struct volume_options options[] = {
+    {.key = {"transport-type"}, /* string-typed; candidate values in .value */
+     .value = {"rpc", "rpc-over-rdma", "tcp", "socket", "ib-verbs", "unix",
+               "ib-sdp", "tcp/server", "ib-verbs/server", "rdma",
+               "rdma*([ \t]),*([ \t])socket", "rdma*([ \t]),*([ \t])tcp",
+               "tcp*([ \t]),*([ \t])rdma", "socket*([ \t]),*([ \t])rdma"},
+     .type = GF_OPTION_TYPE_STR},
+    {
+        .key = {"transport.*"}, /* wildcard key, declared GF_OPTION_TYPE_ANY */
+        .type = GF_OPTION_TYPE_ANY,
+    },
+    {.key = {NULL}}, /* final entry: NULL key */
+};
+
+xlator_api_t xlator_api = {
+    .init = qd_init, /* qd_* entry points are defined earlier in quotad.c */
+    .fini = qd_fini,
+    .reconfigure = qd_reconfigure, /* currently a no-op that returns 0 */
+    .notify = qd_notify,
+    .mem_acct_init = mem_acct_init,
+    .op_version = {1},
+    .fops = &fops, /* both tables are empty initialisers */
+    .cbks = &cbks,
+    .options = options,
+    .identifier = "quotad",
+    .category = GF_MAINTAINED,
+};
diff --git a/xlators/cluster/map/Makefile.am b/xlators/features/read-only/Makefile.am
index d471a3f9243..d471a3f9243 100644
--- a/xlators/cluster/map/Makefile.am
+++ b/xlators/features/read-only/Makefile.am
diff --git a/xlators/features/read-only/src/Makefile.am b/xlators/features/read-only/src/Makefile.am
new file mode 100644
index 00000000000..e4a2017ef0d
--- /dev/null
+++ b/xlators/features/read-only/src/Makefile.am
@@ -0,0 +1,23 @@
+xlator_LTLIBRARIES = read-only.la worm.la
+
+xlatordir = $(libdir)/glusterfs/$(PACKAGE_VERSION)/xlator/features
+
+noinst_HEADERS = read-only.h read-only-mem-types.h read-only-common.h worm-helper.h
+
+read_only_la_LDFLAGS = -module $(GF_XLATOR_DEFAULT_LDFLAGS)
+
+read_only_la_SOURCES = read-only.c read-only-common.c
+read_only_la_LIBADD = $(top_builddir)/libglusterfs/src/libglusterfs.la 
+
+worm_la_LDFLAGS = -module $(GF_XLATOR_DEFAULT_LDFLAGS)
+
+worm_la_SOURCES = read-only-common.c worm-helper.c worm.c
+worm_la_LIBADD = $(top_builddir)/libglusterfs/src/libglusterfs.la
+
+AM_CPPFLAGS = $(GF_CPPFLAGS) -I$(top_srcdir)/libglusterfs/src \
+	-I$(top_srcdir)/rpc/xdr/src -I$(top_builddir)/rpc/xdr/src
+
+AM_CFLAGS = -Wall $(GF_CFLAGS)
+
+CLEANFILES = 
+
diff --git a/xlators/features/read-only/src/read-only-common.c b/xlators/features/read-only/src/read-only-common.c
new file mode 100644
index 00000000000..9640e7e3eee
--- /dev/null
+++ b/xlators/features/read-only/src/read-only-common.c
@@ -0,0 +1,406 @@
+/*
+   Copyright (c) 2008-2012 Red Hat, Inc. <http://www.redhat.com>
+   This file is part of GlusterFS.
+
+   This file is licensed to you under your choice of the GNU Lesser
+   General Public License, version 3 or any later version (LGPLv3 or
+   later), or the GNU General Public License, version 2 (GPLv2), in all
+   cases as published by the Free Software Foundation.
+*/
+#include "read-only.h"
+#include "read-only-mem-types.h"
+#include <glusterfs/defaults.h>
+
+gf_boolean_t
+is_readonly_or_worm_enabled(call_frame_t *frame, xlator_t *this)
+{
+    read_only_priv_t *priv = this->private;
+    gf_boolean_t enabled = _gf_false;
+
+    GF_ASSERT(priv);
+
+    /* Start from the configured setting ... */
+    enabled = priv->readonly_or_worm_enabled;
+
+    /* ... but never restrict callers whose pid is below GF_CLIENT_PID_MAX. */
+    if (frame->root->pid < GF_CLIENT_PID_MAX)
+        return _gf_false;
+    return enabled;
+}
+
+static int
+_check_key_is_zero_filled(dict_t *d, char *k, data_t *v, void *tmp)
+{
+    int rc = mem_0filled((const char *)v->data, v->len);
+
+    /* A non-zero mem_0filled() result maps to -1, which means no more
+     * iterations (treated as 'break'); otherwise 0 is returned. */
+    return rc ? -1 : 0;
+}
+
+int32_t
+ro_xattrop(call_frame_t *frame, xlator_t *this, loc_t *loc,
+           gf_xattrop_flags_t flags, dict_t *dict, dict_t *xdata)
+{
+    /* allzero is true only when dict_foreach() over the zero-filled checker
+     * returns 0. */
+    gf_boolean_t allzero =
+        (dict_foreach(dict, _check_key_is_zero_filled, NULL) == 0);
+
+    /* Refused only when read-only/WORM applies and allzero is false. */
+    if (!is_readonly_or_worm_enabled(frame, this) || allzero)
+        STACK_WIND_TAIL(frame, FIRST_CHILD(this),
+                        FIRST_CHILD(this)->fops->xattrop, loc, flags, dict,
+                        xdata);
+    else
+        STACK_UNWIND_STRICT(xattrop, frame, -1, EROFS, NULL, xdata);
+
+    return 0;
+}
+
+int32_t
+ro_fxattrop(call_frame_t *frame, xlator_t *this, fd_t *fd,
+            gf_xattrop_flags_t flags, dict_t *dict, dict_t *xdata)
+{
+    /* allzero is true only when dict_foreach() over the zero-filled checker
+     * returns 0. */
+    gf_boolean_t allzero =
+        (dict_foreach(dict, _check_key_is_zero_filled, NULL) == 0);
+
+    /* Refused with EROFS only when read-only/WORM applies to the caller and
+     * allzero is false; every other combination is wound to the child. */
+    if (!is_readonly_or_worm_enabled(frame, this) || allzero)
+        STACK_WIND_TAIL(frame, FIRST_CHILD(this),
+                        FIRST_CHILD(this)->fops->fxattrop, fd, flags, dict,
+                        xdata);
+    else
+        STACK_UNWIND_STRICT(fxattrop, frame, -1, EROFS, NULL, xdata);
+
+    return 0;
+}
+
+int32_t
+ro_entrylk(call_frame_t *frame, xlator_t *this, const char *volume, loc_t *loc,
+           const char *basename, entrylk_cmd cmd, entrylk_type type,
+           dict_t *xdata)
+{
+    STACK_WIND_TAIL(frame, FIRST_CHILD(this), FIRST_CHILD(this)->fops->entrylk,
+                    volume, loc, basename, cmd, type, xdata);
+    /* Unconditional pass-through: no read-only/WORM check for entrylk. */
+    return 0;
+}
+
+int32_t
+ro_fentrylk(call_frame_t *frame, xlator_t *this, const char *volume, fd_t *fd,
+            const char *basename, entrylk_cmd cmd, entrylk_type type,
+            dict_t *xdata)
+{
+    STACK_WIND_TAIL(frame, FIRST_CHILD(this), FIRST_CHILD(this)->fops->fentrylk,
+                    volume, fd, basename, cmd, type, xdata);
+    /* Unconditional pass-through: no read-only/WORM check for fentrylk. */
+    return 0;
+}
+
+int32_t
+ro_inodelk(call_frame_t *frame, xlator_t *this, const char *volume, loc_t *loc,
+           int32_t cmd, struct gf_flock *lock, dict_t *xdata)
+{
+    STACK_WIND_TAIL(frame, FIRST_CHILD(this), FIRST_CHILD(this)->fops->inodelk,
+                    volume, loc, cmd, lock, xdata);
+    /* Unconditional pass-through: no read-only/WORM check for inodelk. */
+    return 0;
+}
+
+int32_t
+ro_finodelk(call_frame_t *frame, xlator_t *this, const char *volume, fd_t *fd,
+            int32_t cmd, struct gf_flock *lock, dict_t *xdata)
+{
+    STACK_WIND_TAIL(frame, FIRST_CHILD(this), FIRST_CHILD(this)->fops->finodelk,
+                    volume, fd, cmd, lock, xdata);
+    /* Unconditional pass-through: no read-only/WORM check for finodelk. */
+    return 0;
+}
+
+int32_t
+ro_lk(call_frame_t *frame, xlator_t *this, fd_t *fd, int cmd,
+      struct gf_flock *flock, dict_t *xdata)
+{
+    STACK_WIND_TAIL(frame, FIRST_CHILD(this), FIRST_CHILD(this)->fops->lk, fd,
+                    cmd, flock, xdata);
+    /* Unconditional pass-through: no read-only/WORM check for lk. */
+    return 0;
+}
+
+int32_t
+ro_setattr(call_frame_t *frame, xlator_t *this, loc_t *loc, struct iatt *stbuf,
+           int32_t valid, dict_t *xdata)
+{
+    /* Wind to the child unless read-only/WORM applies; then fail with EROFS. */
+    if (!is_readonly_or_worm_enabled(frame, this))
+        STACK_WIND_TAIL(frame, FIRST_CHILD(this),
+                        FIRST_CHILD(this)->fops->setattr, loc, stbuf, valid,
+                        xdata);
+    else
+        STACK_UNWIND_STRICT(setattr, frame, -1, EROFS, NULL, NULL, xdata);
+    return 0;
+}
+
+int32_t
+ro_fsetattr(call_frame_t *frame, xlator_t *this, fd_t *fd, struct iatt *stbuf,
+            int32_t valid, dict_t *xdata)
+{
+    /* fsetattr reaches the child only when read-only/WORM does not apply. */
+    if (!is_readonly_or_worm_enabled(frame, this))
+        STACK_WIND_TAIL(frame, FIRST_CHILD(this),
+                        FIRST_CHILD(this)->fops->fsetattr, fd, stbuf, valid,
+                        xdata);
+    else
+        STACK_UNWIND_STRICT(fsetattr, frame, -1, EROFS, NULL, NULL, xdata);
+    return 0;
+}
+
+int32_t
+ro_truncate(call_frame_t *frame, xlator_t *this, loc_t *loc, off_t offset,
+            dict_t *xdata)
+{
+    /* truncate is answered with EROFS while read-only/WORM applies. */
+    if (!is_readonly_or_worm_enabled(frame, this))
+        STACK_WIND_TAIL(frame, FIRST_CHILD(this),
+                        FIRST_CHILD(this)->fops->truncate, loc, offset, xdata);
+    else
+        STACK_UNWIND_STRICT(truncate, frame, -1, EROFS, NULL, NULL, xdata);
+    return 0;
+}
+
+int32_t
+ro_ftruncate(call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset,
+             dict_t *xdata)
+{
+    /* ftruncate is answered with EROFS while read-only/WORM applies. */
+    if (!is_readonly_or_worm_enabled(frame, this))
+        STACK_WIND_TAIL(frame, FIRST_CHILD(this),
+                        FIRST_CHILD(this)->fops->ftruncate, fd, offset, xdata);
+    else
+        STACK_UNWIND_STRICT(ftruncate, frame, -1, EROFS, NULL, NULL, xdata);
+    return 0;
+}
+
+int32_t
+ro_fallocate(call_frame_t *frame, xlator_t *this, fd_t *fd, int32_t mode,
+             off_t offset, size_t len, dict_t *xdata)
+{
+    if (!is_readonly_or_worm_enabled(frame, this))
+        STACK_WIND_TAIL(frame, FIRST_CHILD(this),
+                        FIRST_CHILD(this)->fops->fallocate, fd, mode, offset,
+                        len, xdata);
+    else /* read-only/WORM applies to this caller: refuse with EROFS */
+        STACK_UNWIND_STRICT(fallocate, frame, -1, EROFS, NULL, NULL, xdata);
+    return 0;
+}
+
+int
+ro_mknod(call_frame_t *frame, xlator_t *this, loc_t *loc, mode_t mode,
+         dev_t rdev, mode_t umask, dict_t *xdata)
+{
+    /* mknod goes to the child unless read-only/WORM applies (then EROFS). */
+    if (!is_readonly_or_worm_enabled(frame, this))
+        STACK_WIND_TAIL(frame, FIRST_CHILD(this),
+                        FIRST_CHILD(this)->fops->mknod, loc, mode, rdev, umask,
+                        xdata);
+    else
+        STACK_UNWIND_STRICT(mknod, frame, -1, EROFS, NULL, NULL, NULL, NULL,
+                            xdata);
+    return 0;
+}
+
+int
+ro_mkdir(call_frame_t *frame, xlator_t *this, loc_t *loc, mode_t mode,
+         mode_t umask, dict_t *xdata)
+{
+    /* mkdir goes to the child unless read-only/WORM applies (then EROFS). */
+    if (!is_readonly_or_worm_enabled(frame, this))
+        STACK_WIND_TAIL(frame, FIRST_CHILD(this),
+                        FIRST_CHILD(this)->fops->mkdir, loc, mode, umask,
+                        xdata);
+    else
+        STACK_UNWIND_STRICT(mkdir, frame, -1, EROFS, NULL, NULL, NULL, NULL,
+                            xdata);
+    return 0;
+}
+
+int32_t
+ro_unlink(call_frame_t *frame, xlator_t *this, loc_t *loc, int xflag,
+          dict_t *xdata)
+{
+    /* unlink is answered with EROFS while read-only/WORM applies. */
+    if (!is_readonly_or_worm_enabled(frame, this))
+        STACK_WIND_TAIL(frame, FIRST_CHILD(this),
+                        FIRST_CHILD(this)->fops->unlink, loc, xflag, xdata);
+    else
+        STACK_UNWIND_STRICT(unlink, frame, -1, EROFS, NULL, NULL, xdata);
+    return 0;
+}
+
+int
+ro_rmdir(call_frame_t *frame, xlator_t *this, loc_t *loc, int flags,
+         dict_t *xdata)
+{
+    /* rmdir is answered with EROFS while read-only/WORM applies. */
+    if (!is_readonly_or_worm_enabled(frame, this))
+        STACK_WIND_TAIL(frame, FIRST_CHILD(this),
+                        FIRST_CHILD(this)->fops->rmdir, loc, flags, xdata);
+    else
+        STACK_UNWIND_STRICT(rmdir, frame, -1, EROFS, NULL, NULL, xdata);
+    return 0;
+}
+
+int
+ro_symlink(call_frame_t *frame, xlator_t *this, const char *linkpath,
+           loc_t *loc, mode_t umask, dict_t *xdata)
+{
+    /* symlink goes to the child unless read-only/WORM applies (then EROFS). */
+    if (!is_readonly_or_worm_enabled(frame, this))
+        STACK_WIND_TAIL(frame, FIRST_CHILD(this),
+                        FIRST_CHILD(this)->fops->symlink, linkpath, loc, umask,
+                        xdata);
+    else
+        STACK_UNWIND_STRICT(symlink, frame, -1, EROFS, NULL, NULL, NULL, NULL,
+                            xdata);
+    return 0;
+}
+
+int32_t
+ro_rename(call_frame_t *frame, xlator_t *this, loc_t *oldloc, loc_t *newloc,
+          dict_t *xdata)
+{
+    /* rename goes to the child unless read-only/WORM applies (then EROFS). */
+    if (!is_readonly_or_worm_enabled(frame, this))
+        STACK_WIND_TAIL(frame, FIRST_CHILD(this),
+                        FIRST_CHILD(this)->fops->rename, oldloc, newloc, xdata);
+    else
+        STACK_UNWIND_STRICT(rename, frame, -1, EROFS, NULL, NULL, NULL, NULL,
+                            NULL, xdata);
+    return 0;
+}
+
+int32_t
+ro_link(call_frame_t *frame, xlator_t *this, loc_t *oldloc, loc_t *newloc,
+        dict_t *xdata)
+{
+    /* link goes to the child unless read-only/WORM applies (then EROFS). */
+    if (!is_readonly_or_worm_enabled(frame, this))
+        STACK_WIND_TAIL(frame, FIRST_CHILD(this), FIRST_CHILD(this)->fops->link,
+                        oldloc, newloc, xdata);
+    else
+        STACK_UNWIND_STRICT(link, frame, -1, EROFS, NULL, NULL, NULL, NULL,
+                            xdata);
+    return 0;
+}
+
+int32_t
+ro_create(call_frame_t *frame, xlator_t *this, loc_t *loc, int32_t flags,
+          mode_t mode, mode_t umask, fd_t *fd, dict_t *xdata)
+{
+    /* create goes to the child unless read-only/WORM applies (then EROFS). */
+    if (!is_readonly_or_worm_enabled(frame, this))
+        STACK_WIND_TAIL(frame, FIRST_CHILD(this),
+                        FIRST_CHILD(this)->fops->create, loc, flags, mode,
+                        umask, fd, xdata);
+    else
+        STACK_UNWIND_STRICT(create, frame, -1, EROFS, NULL, NULL, NULL, NULL,
+                            NULL, xdata);
+    return 0;
+}
+
+static int32_t
+ro_open_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret,
+            int32_t op_errno, fd_t *fd, dict_t *xdata)
+{
+    STACK_UNWIND_STRICT(open, frame, op_ret, op_errno, fd, xdata);
+    return 0; /* the child's open result was unwound above unchanged */
+}
+
+int32_t
+ro_open(call_frame_t *frame, xlator_t *this, loc_t *loc, int32_t flags,
+        fd_t *fd, dict_t *xdata)
+{
+    int accmode = flags & O_ACCMODE;
+    gf_boolean_t wants_write = (accmode == O_WRONLY || accmode == O_RDWR);
+
+    /* Only opens asking for O_WRONLY or O_RDWR access can be refused. */
+    if (is_readonly_or_worm_enabled(frame, this) && wants_write)
+        STACK_UNWIND_STRICT(open, frame, -1, EROFS, NULL, xdata);
+    else
+        STACK_WIND(frame, ro_open_cbk, FIRST_CHILD(this),
+                   FIRST_CHILD(this)->fops->open, loc, flags, fd, xdata);
+    return 0;
+}
+
+int32_t
+ro_fsetxattr(call_frame_t *frame, xlator_t *this, fd_t *fd, dict_t *dict,
+             int32_t flags, dict_t *xdata)
+{
+    /* fsetxattr reaches the child only when read-only/WORM does not apply. */
+    if (!is_readonly_or_worm_enabled(frame, this))
+        STACK_WIND_TAIL(frame, FIRST_CHILD(this),
+                        FIRST_CHILD(this)->fops->fsetxattr, fd, dict, flags,
+                        xdata);
+    else
+        STACK_UNWIND_STRICT(fsetxattr, frame, -1, EROFS, xdata);
+    return 0;
+}
+
+int32_t
+ro_fsyncdir(call_frame_t *frame, xlator_t *this, fd_t *fd, int32_t flags,
+            dict_t *xdata)
+{
+    /* fsyncdir is answered with EROFS while read-only/WORM applies. */
+    if (!is_readonly_or_worm_enabled(frame, this))
+        STACK_WIND_TAIL(frame, FIRST_CHILD(this),
+                        FIRST_CHILD(this)->fops->fsyncdir, fd, flags, xdata);
+    else
+        STACK_UNWIND_STRICT(fsyncdir, frame, -1, EROFS, xdata);
+    return 0;
+}
+
+int32_t
+ro_writev(call_frame_t *frame, xlator_t *this, fd_t *fd, struct iovec *vector,
+          int32_t count, off_t off, uint32_t flags, struct iobref *iobref,
+          dict_t *xdata)
+{
+    /* writev reaches the child only when read-only/WORM does not apply. */
+    if (!is_readonly_or_worm_enabled(frame, this))
+        STACK_WIND_TAIL(frame, FIRST_CHILD(this),
+                        FIRST_CHILD(this)->fops->writev, fd, vector, count, off,
+                        flags, iobref, xdata);
+    else
+        STACK_UNWIND_STRICT(writev, frame, -1, EROFS, NULL, NULL, xdata);
+    return 0;
+}
+
+int32_t
+ro_setxattr(call_frame_t *frame, xlator_t *this, loc_t *loc, dict_t *dict,
+            int32_t flags, dict_t *xdata)
+{
+    /* setxattr reaches the child only when read-only/WORM does not apply. */
+    if (!is_readonly_or_worm_enabled(frame, this))
+        STACK_WIND_TAIL(frame, FIRST_CHILD(this),
+                        FIRST_CHILD(this)->fops->setxattr, loc, dict, flags,
+                        xdata);
+    else
+        STACK_UNWIND_STRICT(setxattr, frame, -1, EROFS, xdata);
+    return 0;
+}
+
+int32_t
+ro_removexattr(call_frame_t *frame, xlator_t *this, loc_t *loc,
+               const char *name, dict_t *xdata)
+{
+    /* removexattr is answered with EROFS while read-only/WORM applies. */
+    if (!is_readonly_or_worm_enabled(frame, this))
+        STACK_WIND_TAIL(frame, FIRST_CHILD(this),
+                        FIRST_CHILD(this)->fops->removexattr, loc, name, xdata);
+    else
+        STACK_UNWIND_STRICT(removexattr, frame, -1, EROFS, xdata);
+    return 0;
+}
diff --git a/xlators/features/read-only/src/read-only-common.h b/xlators/features/read-only/src/read-only-common.h
new file mode 100644
index 00000000000..5561961ffa2
--- /dev/null
+++ b/xlators/features/read-only/src/read-only-common.h
@@ -0,0 +1,121 @@
+/*
+   Copyright (c) 2008-2012 Red Hat, Inc. <http://www.redhat.com>
+   This file is part of GlusterFS.
+
+   This file is licensed to you under your choice of the GNU Lesser
+   General Public License, version 3 or any later version (LGPLv3 or
+   later), or the GNU General Public License, version 2 (GPLv2), in all
+   cases as published by the Free Software Foundation.
+*/
+#include <glusterfs/xlator.h>
+#include <glusterfs/defaults.h>
+
+gf_boolean_t
+is_readonly_or_worm_enabled(call_frame_t *frame, xlator_t *this);
+
+int32_t
+ro_xattrop(call_frame_t *frame, xlator_t *this, loc_t *loc,
+           gf_xattrop_flags_t flags, dict_t *dict, dict_t *xdata);
+
+int32_t
+ro_fxattrop(call_frame_t *frame, xlator_t *this, fd_t *fd,
+            gf_xattrop_flags_t flags, dict_t *dict, dict_t *xdata);
+
+int32_t
+ro_entrylk(call_frame_t *frame, xlator_t *this, const char *volume, loc_t *loc,
+           const char *basename, entrylk_cmd cmd, entrylk_type type,
+           dict_t *xdata);
+
+int32_t
+ro_fentrylk(call_frame_t *frame, xlator_t *this, const char *volume, fd_t *fd,
+            const char *basename, entrylk_cmd cmd, entrylk_type type,
+            dict_t *xdata);
+
+int32_t
+ro_inodelk(call_frame_t *frame, xlator_t *this, const char *volume, loc_t *loc,
+           int32_t cmd, struct gf_flock *lock, dict_t *xdata);
+
+int32_t
+ro_finodelk(call_frame_t *frame, xlator_t *this, const char *volume, fd_t *fd,
+            int32_t cmd, struct gf_flock *lock, dict_t *xdata);
+
+int32_t
+ro_lk(call_frame_t *frame, xlator_t *this, fd_t *fd, int cmd,
+      struct gf_flock *flock, dict_t *xdata);
+
+int32_t
+ro_setattr(call_frame_t *frame, xlator_t *this, loc_t *loc, struct iatt *stbuf,
+           int32_t valid, dict_t *xdata);
+
+int32_t
+ro_fsetattr(call_frame_t *frame, xlator_t *this, fd_t *fd, struct iatt *stbuf,
+            int32_t valid, dict_t *xdata);
+
+int32_t
+ro_truncate(call_frame_t *frame, xlator_t *this, loc_t *loc, off_t offset,
+            dict_t *xdata);
+
+int32_t
+ro_ftruncate(call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset,
+             dict_t *xdata);
+
+int
+ro_mknod(call_frame_t *frame, xlator_t *this, loc_t *loc, mode_t mode,
+         dev_t rdev, mode_t umask, dict_t *xdata);
+
+int
+ro_mkdir(call_frame_t *frame, xlator_t *this, loc_t *loc, mode_t mode,
+         mode_t umask, dict_t *xdata);
+
+int32_t
+ro_unlink(call_frame_t *frame, xlator_t *this, loc_t *loc, int xflag,
+          dict_t *xdata);
+
+int
+ro_rmdir(call_frame_t *frame, xlator_t *this, loc_t *loc, int flags,
+         dict_t *xdata);
+
+int
+ro_symlink(call_frame_t *frame, xlator_t *this, const char *linkpath,
+           loc_t *loc, mode_t umask, dict_t *xdata);
+
+int32_t
+ro_rename(call_frame_t *frame, xlator_t *this, loc_t *oldloc, loc_t *newloc,
+          dict_t *xdata);
+
+int32_t
+ro_link(call_frame_t *frame, xlator_t *this, loc_t *oldloc, loc_t *newloc,
+        dict_t *xdata);
+
+int32_t
+ro_create(call_frame_t *frame, xlator_t *this, loc_t *loc, int32_t flags,
+          mode_t mode, mode_t umask, fd_t *fd, dict_t *xdata);
+
+int32_t
+ro_open(call_frame_t *frame, xlator_t *this, loc_t *loc, int32_t flags,
+        fd_t *fd, dict_t *xdata);
+
+int32_t
+ro_fsetxattr(call_frame_t *frame, xlator_t *this, fd_t *fd, dict_t *dict,
+             int32_t flags, dict_t *xdata);
+
+int32_t
+ro_fsyncdir(call_frame_t *frame, xlator_t *this, fd_t *fd, int32_t flags,
+            dict_t *xdata);
+
+int32_t
+ro_writev(call_frame_t *frame, xlator_t *this, fd_t *fd, struct iovec *vector,
+          int32_t count, off_t off, uint32_t flags, struct iobref *iobref,
+          dict_t *xdata);
+
+int32_t
+ro_setxattr(call_frame_t *frame, xlator_t *this, loc_t *loc, dict_t *dict,
+            int32_t flags, dict_t *xdata);
+
+int32_t
+ro_removexattr(call_frame_t *frame, xlator_t *this, loc_t *loc,
+               const char *name, dict_t *xdata);
+
+int32_t
+ro_fallocate(call_frame_t *frame, xlator_t *this, fd_t *fd, int32_t mode,
+             off_t offset, size_t len, dict_t *xdata);
diff --git a/xlators/features/read-only/src/read-only-mem-types.h b/xlators/features/read-only/src/read-only-mem-types.h
new file mode 100644
index 00000000000..c67d6c02cd0
--- /dev/null
+++ b/xlators/features/read-only/src/read-only-mem-types.h
@@ -0,0 +1,20 @@
+/*
+   Copyright (c) 2014 Red Hat, Inc. <http://www.redhat.com>
+   This file is part of GlusterFS.
+
+   This file is licensed to you under your choice of the GNU Lesser
+   General Public License, version 3 or any later version (LGPLv3 or
+   later), or the GNU General Public License, version 2 (GPLv2), in all
+   cases as published by the Free Software Foundation.
+*/
+
+#ifndef __READONLY_MEM_TYPES_H__
+#define __READONLY_MEM_TYPES_H__
+
+#include <glusterfs/mem-types.h>
+
+enum gf_read_only_mem_types_ {
+    gf_read_only_mt_priv_t = gf_common_mt_end + 1, /* tag read-only.c init() allocates its private data with */
+    gf_read_only_mt_end /* read-only.c passes this + 1 to xlator_mem_acct_init() */
+};
+#endif
diff --git a/xlators/features/read-only/src/read-only.c b/xlators/features/read-only/src/read-only.c
new file mode 100644
index 00000000000..48654998e63
--- /dev/null
+++ b/xlators/features/read-only/src/read-only.c
@@ -0,0 +1,144 @@
+/*
+   Copyright (c) 2008-2012 Red Hat, Inc. <http://www.redhat.com>
+   This file is part of GlusterFS.
+
+   This file is licensed to you under your choice of the GNU Lesser
+   General Public License, version 3 or any later version (LGPLv3 or
+   later), or the GNU General Public License, version 2 (GPLv2), in all
+   cases as published by the Free Software Foundation.
+*/
+#include "read-only-common.h"
+#include "read-only-mem-types.h"
+#include "read-only.h"
+
+int32_t
+mem_acct_init(xlator_t *this)
+{
+    /* Set up memory accounting sized by this translator's mem-type enum. */
+    int status = xlator_mem_acct_init(this, gf_read_only_mt_end + 1);
+
+    if (status != 0) {
+        gf_log(this->name, GF_LOG_ERROR,
+               "Memory accounting "
+               "initialization failed.");
+    }
+    return status;
+}
+
+int32_t
+init(xlator_t *this)
+{
+    read_only_priv_t *priv = NULL;
+    int ret = -1;
+
+    /* Exactly one child: none at all, or a second one, is an error. */
+    if (this->children == NULL || this->children->next != NULL) {
+        gf_log(this->name, GF_LOG_ERROR,
+               "translator not configured with exactly one child");
+        return -1;
+    }
+
+    /* A missing parent is only worth a warning. */
+    if (this->parents == NULL)
+        gf_log(this->name, GF_LOG_WARNING, "dangling volume. check volfile ");
+
+    priv = GF_CALLOC(1, sizeof(*priv), gf_read_only_mt_priv_t);
+    if (priv != NULL) {
+        this->private = priv;
+        /* GF_OPTION_INIT is given 'out' as its bail-out label. */
+        GF_OPTION_INIT("read-only", priv->readonly_or_worm_enabled, bool, out);
+        ret = 0;
+    }
+
+out:
+    return ret;
+}
+
+int
+reconfigure(xlator_t *this, dict_t *options)
+{
+    read_only_priv_t *priv = this->private;
+    gf_boolean_t new_value = _gf_false;
+    int ret = -1;
+
+    GF_ASSERT(priv);
+
+    /* priv is updated only if the macro does not bail out to 'out'. */
+    GF_OPTION_RECONF("read-only", new_value, options, bool, out);
+    priv->readonly_or_worm_enabled = new_value;
+    ret = 0;
+out:
+    gf_log(this->name, GF_LOG_DEBUG, "returning %d", ret);
+    return ret;
+}
+
+/* Release the private data attached by init(); does nothing when none is
+ * present. */
+void
+fini(xlator_t *this)
+{
+    read_only_priv_t *old_priv = this->private;
+
+    /* Nothing to release if no private data is attached. */
+    if (old_priv != NULL) {
+        /* Detach before freeing so no stale pointer is left behind. */
+        this->private = NULL;
+        GF_FREE(old_priv);
+    }
+}
+
+struct xlator_fops fops = {
+    .mknod = ro_mknod, /* ro_* handlers are defined in read-only-common.c */
+    .mkdir = ro_mkdir,
+    .unlink = ro_unlink,
+    .rmdir = ro_rmdir,
+    .symlink = ro_symlink,
+    .rename = ro_rename,
+    .link = ro_link,
+    .truncate = ro_truncate,
+    .open = ro_open, /* refuses only O_WRONLY / O_RDWR opens */
+    .writev = ro_writev,
+    .setxattr = ro_setxattr,
+    .fsetxattr = ro_fsetxattr,
+    .removexattr = ro_removexattr,
+    .fsyncdir = ro_fsyncdir,
+    .ftruncate = ro_ftruncate,
+    .create = ro_create,
+    .setattr = ro_setattr,
+    .fsetattr = ro_fsetattr,
+    .xattrop = ro_xattrop, /* the two xattrop handlers let all-zero dicts through */
+    .fxattrop = ro_fxattrop,
+    .inodelk = ro_inodelk, /* the five lock handlers are plain pass-throughs */
+    .finodelk = ro_finodelk,
+    .entrylk = ro_entrylk,
+    .fentrylk = ro_fentrylk,
+    .lk = ro_lk,
+    .fallocate = ro_fallocate,
+};
+
+struct xlator_cbks cbks = {}; /* empty: no callbacks registered */
+
+struct volume_options options[] = {
+    {.key = {"read-only"}, /* read by init() and reconfigure() into priv */
+     .type = GF_OPTION_TYPE_BOOL,
+     .default_value = "off",
+     /*.validate_fn = validate_boolean,*/
+     .op_version = {1},
+     .flags = OPT_FLAG_SETTABLE,
+     .description = "When \"on\", makes a volume read-only. It is turned "
+                    "\"off\" by default."},
+    {.key = {NULL}}, /* final entry: NULL key */
+};
+
+xlator_api_t xlator_api = {
+    .init = init, /* lifecycle hooks are the functions defined above */
+    .fini = fini,
+    .reconfigure = reconfigure, /* re-reads the "read-only" option */
+    .mem_acct_init = mem_acct_init,
+    .op_version = {1}, /* Present from the initial version */
+    .fops = &fops,
+    .cbks = &cbks, /* empty table */
+    .options = options,
+    .identifier = "read-only",
+    .category = GF_TECH_PREVIEW,
+};
diff --git a/xlators/features/read-only/src/read-only.h b/xlators/features/read-only/src/read-only.h
new file mode 100644
index 00000000000..aced5d3c577
--- /dev/null
+++ b/xlators/features/read-only/src/read-only.h
@@ -0,0 +1,37 @@
+/*
+   Copyright (c) 2014 Red Hat, Inc. <http://www.redhat.com>
+   This file is part of GlusterFS.
+
+   This file is licensed to you under your choice of the GNU Lesser
+   General Public License, version 3 or any later version (LGPLv3 or
+   later), or the GNU General Public License, version 2 (GPLv2), in all
+   cases as published by the Free Software Foundation.
+*/
+
+#ifndef __READONLY_H__
+#define __READONLY_H__
+
+#include <stdint.h>               // for uint64_t, uint8_t
+#include <sys/time.h>             // for time_t
+#include "glusterfs/glusterfs.h"  // for gf_boolean_t
+
+typedef struct {
+    uint8_t worm : 1; /* first of four single-bit flags */
+    uint8_t retain : 1;
+    uint8_t legal_hold : 1;
+    uint8_t ret_mode : 1;
+    int64_t ret_period; /* NOTE(review): units not visible here - confirm */
+    int64_t auto_commit_period; /* NOTE(review): units not visible here - confirm */
+} worm_reten_state_t; /* presumably per-file WORM retention state - verify in worm.c */
+
+typedef struct {
+    gf_boolean_t readonly_or_worm_enabled; /* read-only.c sets this from the "read-only" option */
+    gf_boolean_t worm_file; /* NOTE(review): remaining fields are not used by read-only.c; presumably for worm.c */
+    gf_boolean_t worm_files_deletable;
+    int64_t reten_period;
+    int64_t com_period;
+    int reten_mode;
+    time_t start_time;
+} read_only_priv_t; /* translator private data; allocated in read-only.c init() */
+
+#endif
diff --git a/xlators/features/read-only/src/worm-helper.c b/xlators/features/read-only/src/worm-helper.c
new file mode 100644
index 00000000000..df45f2a940b
--- /dev/null
+++ b/xlators/features/read-only/src/worm-helper.c
@@ -0,0 +1,395 @@
+/*
+   Copyright (c) 2016 Red Hat, Inc. <http://www.redhat.com>
+   This file is part of GlusterFS.
+
+   This file is licensed to you under your choice of the GNU Lesser
+   General Public License, version 3 or any later version (LGPLv3 or
+   later), or the GNU General Public License, version 2 (GPLv2), in all
+   cases as published by the Free Software Foundation.
+*/
+#include "read-only-mem-types.h"
+#include "read-only.h"
+#include <glusterfs/xlator.h>
+#include <glusterfs/syncop.h>
+#include "worm-helper.h"
+
+/* Report whether a file is read-only for everybody.
+ * stbuf carries the file attributes; only its owner/group/other write
+ * permission bits are inspected.
+ * Returns _gf_true when all three write bits are clear, else _gf_false. */
+gf_boolean_t
+gf_worm_write_disabled(struct iatt *stbuf)
+{
+    gf_boolean_t write_disabled = _gf_false;
+
+    GF_VALIDATE_OR_GOTO("worm", stbuf, out);
+
+    /* De Morgan form: no class (owner, group, other) may hold a write bit. */
+    write_disabled = !(stbuf->ia_prot.owner.write ||
+                       stbuf->ia_prot.group.write || stbuf->ia_prot.other.write);
+out:
+    return write_disabled;
+}
+
+/* Store the current time in the file's trusted.start_time xattr. */
+int32_t
+worm_init_state(xlator_t *this, gf_boolean_t fop_with_fd, void *file_ptr)
+{
+    dict_t *xattr = NULL;
+    int op_ret = -1;
+
+    GF_VALIDATE_OR_GOTO("worm", this, out);
+    GF_VALIDATE_OR_GOTO(this->name, file_ptr, out);
+
+    xattr = dict_new();
+    if (!xattr) {
+        gf_log(this->name, GF_LOG_ERROR, "Error creating the dict");
+        goto out;
+    }
+    op_ret = dict_set_uint64(xattr, "trusted.start_time", gf_time());
+    if (op_ret) {
+        gf_log(this->name, GF_LOG_ERROR, "Error in setting the dict");
+        goto out;
+    }
+    /* file_ptr is an fd_t when fop_with_fd is set, otherwise a loc_t. */
+    op_ret = fop_with_fd
+                 ? syncop_fsetxattr(this, (fd_t *)file_ptr, xattr, 0, NULL, NULL)
+                 : syncop_setxattr(this, (loc_t *)file_ptr, xattr, 0, NULL, NULL);
+
+out:
+    if (xattr)
+        dict_unref(xattr);
+    return op_ret;
+}
+
+/* Put a file into the WORM-Retained state.
+ * Fills *retention_state from the translator's configured defaults, pushes
+ * the file's atime forward by the retention period and stores the state in
+ * the file's xattr. Returns 0 on success, non-zero on failure. */
+int32_t
+worm_set_state(xlator_t *this, gf_boolean_t fop_with_fd, void *file_ptr,
+               worm_reten_state_t *retention_state, struct iatt *stbuf)
+{
+    struct iatt cur_stat = {0};
+    read_only_priv_t *conf = NULL;
+    int op_ret = -1;
+
+    GF_VALIDATE_OR_GOTO("worm", this, out);
+    GF_VALIDATE_OR_GOTO(this->name, file_ptr, out);
+    GF_VALIDATE_OR_GOTO(this->name, retention_state, out);
+    GF_VALIDATE_OR_GOTO(this->name, stbuf, out);
+
+    conf = this->private;
+    GF_ASSERT(conf);
+    /* The new state is WORM + retained, with the legal hold flag cleared. */
+    retention_state->worm = 1;
+    retention_state->retain = 1;
+    retention_state->legal_hold = 0;
+    retention_state->ret_mode = conf->reten_mode;
+    retention_state->ret_period = conf->reten_period;
+    retention_state->auto_commit_period = conf->com_period;
+
+    op_ret = fop_with_fd
+                 ? syncop_fstat(this, (fd_t *)file_ptr, &cur_stat, NULL, NULL)
+                 : syncop_stat(this, (loc_t *)file_ptr, &cur_stat, NULL, NULL);
+    if (op_ret)
+        goto out;
+
+    /* Only the atime is applied through setattr (GF_SET_ATTR_ATIME); the
+     * current mtime is merely copied into the caller's stbuf. */
+    stbuf->ia_mtime = cur_stat.ia_mtime;
+    stbuf->ia_atime = gf_time() + retention_state->ret_period;
+    if (fop_with_fd)
+        op_ret = syncop_fsetattr(this, (fd_t *)file_ptr, stbuf,
+                                 GF_SET_ATTR_ATIME, NULL, NULL, NULL, NULL);
+    else
+        op_ret = syncop_setattr(this, (loc_t *)file_ptr, stbuf,
+                                GF_SET_ATTR_ATIME, NULL, NULL, NULL, NULL);
+    if (op_ret)
+        goto out;
+
+    op_ret = gf_worm_set_xattr(this, retention_state, fop_with_fd, file_ptr);
+    if (op_ret)
+        gf_log(this->name, GF_LOG_ERROR, "Error setting xattr");
+out:
+    return op_ret;
+}
+
+/* Fetch the trusted.reten_state xattr and decode it into *reten_state.
+ * Returns 0 on success, -1 if the getxattr fails, -2 if the value is missing. */
+int32_t
+worm_get_state(xlator_t *this, gf_boolean_t fop_with_fd, void *file_ptr,
+               worm_reten_state_t *reten_state)
+{
+    dict_t *dict = NULL;
+    char *val = NULL;
+    int ret = -1;
+
+    GF_VALIDATE_OR_GOTO("worm", this, out);
+    GF_VALIDATE_OR_GOTO(this->name, file_ptr, out);
+    GF_VALIDATE_OR_GOTO(this->name, reten_state, out);
+    if (fop_with_fd)
+        ret = syncop_fgetxattr(this, (fd_t *)file_ptr, &dict,
+                               "trusted.reten_state", NULL, NULL);
+    else
+        ret = syncop_getxattr(this, (loc_t *)file_ptr, &dict,
+                              "trusted.reten_state", NULL, NULL);
+    if (ret < 0 || !dict) {
+        ret = -1;
+        goto out;
+    }
+    ret = dict_get_str(dict, "trusted.reten_state", &val);
+    if (ret) {
+        ret = -2;
+        gf_log(this->name, GF_LOG_ERROR, "Empty val");
+        goto out; /* nothing to decode; leave *reten_state untouched */
+    }
+    gf_worm_deserialize_state(val, reten_state);
+out:
+    if (dict)
+        dict_unref(dict);
+    return ret;
+}
+
+/* End the retention period of a file: WORM-Retained -> WORM.
+ * Rewinds stbuf's atime by the stored retention period, clears the retain
+ * flag and both periods, then writes the xattr and the atime back. */
+void
+gf_worm_state_lookup(xlator_t *this, gf_boolean_t fop_with_fd, void *file_ptr,
+                     worm_reten_state_t *reten_state, struct iatt *stbuf)
+{
+    int op_ret = -1;
+
+    GF_VALIDATE_OR_GOTO("worm", this, out);
+    GF_VALIDATE_OR_GOTO(this->name, file_ptr, out);
+    GF_VALIDATE_OR_GOTO(this->name, reten_state, out);
+    GF_VALIDATE_OR_GOTO(this->name, stbuf, out);
+
+    stbuf->ia_atime -= reten_state->ret_period;
+    reten_state->retain = 0;
+    reten_state->ret_period = 0;
+    reten_state->auto_commit_period = 0;
+
+    op_ret = gf_worm_set_xattr(this, reten_state, fop_with_fd, file_ptr);
+    if (op_ret) {
+        gf_log(this->name, GF_LOG_ERROR, "Error setting xattr");
+        goto out;
+    }
+
+    /* Write the rewound atime back to the file. */
+    op_ret = fop_with_fd
+                 ? syncop_fsetattr(this, (fd_t *)file_ptr, stbuf,
+                                   GF_SET_ATTR_ATIME, NULL, NULL, NULL, NULL)
+                 : syncop_setattr(this, (loc_t *)file_ptr, stbuf,
+                                  GF_SET_ATTR_ATIME, NULL, NULL, NULL, NULL);
+    if (!op_ret)
+        gf_log(this->name, GF_LOG_INFO, "Retention state reset");
+out:
+    return;
+}
+
+/* Serialize the WORM/Retention state into val as the string
+ * "<flags>/<ret_period>/<auto_commit_period>", flags packing worm, retain,
+ * legal_hold and ret_mode into bits 0-3. Periods are printed signed. */
+void
+gf_worm_serialize_state(worm_reten_state_t *reten_state, char *val)
+{
+    uint32_t state = 0;
+
+    GF_VALIDATE_OR_GOTO("worm", reten_state, out);
+    GF_VALIDATE_OR_GOTO("worm", val, out);
+
+    state |= reten_state->worm << 0;
+    state |= reten_state->retain << 1;
+    state |= reten_state->legal_hold << 2;
+    state |= reten_state->ret_mode << 3;
+    sprintf(val, "%" PRIu32 "/%" PRId64 "/%" PRId64, state,
+            reten_state->ret_period, reten_state->auto_commit_period);
+out:
+    return;
+}
+
+/* Parse the "<flags>/<ret_period>/<auto_commit_period>" xattr string val into
+ * *reten_state. val is tokenized in place; absent fields stay unchanged. */
+void
+gf_worm_deserialize_state(char *val, worm_reten_state_t *reten_state)
+{
+    char *token = NULL, *saveptr = NULL; /* strtok_r: strtok isn't reentrant */
+    uint32_t state = 0;
+
+    GF_VALIDATE_OR_GOTO("worm", val, out);
+    GF_VALIDATE_OR_GOTO("worm", reten_state, out);
+    if ((token = strtok_r(val, "/", &saveptr)) == NULL)
+        goto out;
+    state = (uint32_t)strtoul(token, NULL, 10);
+    reten_state->worm = (state >> 0) & 1;
+    reten_state->retain = (state >> 1) & 1;
+    reten_state->legal_hold = (state >> 2) & 1;
+    reten_state->ret_mode = (state >> 3) & 1;
+    if ((token = strtok_r(NULL, "/", &saveptr)) == NULL)
+        goto out;
+    reten_state->ret_period = strtoll(token, NULL, 10); /* 64-bit, not atoi */
+    if ((token = strtok_r(NULL, "/", &saveptr)) != NULL)
+        reten_state->auto_commit_period = strtoll(token, NULL, 10);
+out:
+    return;
+}
+
+/* Store the serialized retention state in the file's trusted.reten_state
+ * xattr, replacing any value that is already there. */
+int32_t
+gf_worm_set_xattr(xlator_t *this, worm_reten_state_t *reten_state,
+                  gf_boolean_t fop_with_fd, void *file_ptr)
+{
+    dict_t *xattr = NULL;
+    int op_ret = -1;
+    char state_str[100] = "";
+
+    GF_VALIDATE_OR_GOTO("worm", this, out);
+    GF_VALIDATE_OR_GOTO(this->name, reten_state, out);
+    GF_VALIDATE_OR_GOTO(this->name, file_ptr, out);
+
+    xattr = dict_new();
+    if (!xattr) {
+        gf_log(this->name, GF_LOG_ERROR, "Error creating the dict");
+        goto out;
+    }
+    gf_worm_serialize_state(reten_state, state_str);
+    op_ret = dict_set_str(xattr, "trusted.reten_state", state_str);
+    if (op_ret) {
+        gf_log(this->name, GF_LOG_ERROR, "Error in setting the dict");
+        goto out;
+    }
+    /* file_ptr is an fd_t when fop_with_fd is set, otherwise a loc_t. */
+    op_ret = fop_with_fd
+                 ? syncop_fsetxattr(this, (fd_t *)file_ptr, xattr, 0, NULL, NULL)
+                 : syncop_setxattr(this, (loc_t *)file_ptr, xattr, 0, NULL, NULL);
+out:
+    if (xattr)
+        dict_unref(xattr);
+    return op_ret;
+}
+
+/*This function checks whether a file is due for a state transition and, if
+ * so, performs it: a file without a retention state becomes WORM-Retained once
+ * the auto-commit period has passed since both its start time and its mtime,
+ * and a retained file whose atime has been reached drops back to plain WORM.
+ * It also decides whether to continue or to block the FOP.
+ * Return:
+ * 0      : the FOP should continue (no retention state and the auto-commit
+ *          period not yet over, or an unlink of a deletable, unretained file).
+ * EROFS  : the FOP should be blocked (file is in WORM-Retained/WORM state).
+ * < 0    : getting or setting the state failed; callers map this to EROFS.*/
+int
+gf_worm_state_transition(xlator_t *this, gf_boolean_t fop_with_fd,
+                         void *file_ptr, glusterfs_fop_t op)
+{
+    int op_errno = EROFS;
+    int ret = -1;
+    time_t now = 0;
+    uint64_t com_period = 0;
+    uint64_t start_time = 0;
+    dict_t *dict = NULL;
+    worm_reten_state_t reten_state = {
+        0,
+    };
+    read_only_priv_t *priv = NULL;
+    struct iatt stbuf = {
+        0,
+    };
+
+    priv = this->private;
+    GF_ASSERT(priv);
+
+    if (fop_with_fd)
+        ret = syncop_fgetxattr(this, (fd_t *)file_ptr, &dict,
+                               "trusted.start_time", NULL, NULL);
+    else
+        ret = syncop_getxattr(this, (loc_t *)file_ptr, &dict,
+                              "trusted.start_time", NULL, NULL);
+    if (ret < 0 || !dict) {
+        op_errno = ret; /* NOTE(review): this is 0 if ret == 0 but dict is NULL */
+        gf_msg(this->name, GF_LOG_ERROR, -ret, 0, "Error getting xattr");
+        goto out;
+    }
+    ret = dict_get_uint64(dict, "trusted.start_time", &start_time);
+    if (ret) {
+        op_errno = ret;
+        gf_msg(this->name, GF_LOG_ERROR, -ret, 0, "Error getting start time");
+        goto out;
+    }
+
+    com_period = priv->com_period;
+    if (fop_with_fd)
+        ret = syncop_fstat(this, (fd_t *)file_ptr, &stbuf, NULL, NULL);
+    else
+        ret = syncop_stat(this, (loc_t *)file_ptr, &stbuf, NULL, NULL);
+    if (ret) {
+        op_errno = ret;
+        gf_msg(this->name, GF_LOG_ERROR, -ret, 0, "Error getting file stat");
+        goto out;
+    }
+
+    ret = worm_get_state(this, fop_with_fd, file_ptr, &reten_state);
+    if (ret == -2) {
+        op_errno = ret;
+        gf_msg(this->name, GF_LOG_ERROR, -ret, 0,
+               "Error getting worm/retention state");
+        goto out;
+    }
+
+    now = gf_time();
+
+    if (ret == -1 && (now - start_time) >= com_period) {
+        if ((now - stbuf.ia_mtime) >= com_period) {
+            ret = worm_set_state(this, fop_with_fd, file_ptr, &reten_state,
+                                 &stbuf);
+            if (ret) {
+                op_errno = ret;
+                gf_msg(this->name, GF_LOG_ERROR, -ret, 0,
+                       "Error setting worm/retention state");
+                goto out;
+            }
+            goto out; /* op_errno is still EROFS: file is now WORM-Retained */
+        } else {
+            op_errno = 0;
+            goto out;
+        }
+    } else if (ret == -1 && (now - start_time) < com_period) {
+        op_errno = 0;
+        goto out;
+    } else if (reten_state.retain && ((now >= stbuf.ia_atime))) {
+        gf_worm_state_lookup(this, fop_with_fd, file_ptr, &reten_state, &stbuf);
+    }
+    if (reten_state.worm && !reten_state.retain && priv->worm_files_deletable &&
+        op == GF_FOP_UNLINK) {
+        op_errno = 0;
+        goto out;
+    }
+
+out:
+    if (dict)
+        dict_unref(dict);
+    return op_errno;
+}
+
+/* Probe a file for the trusted.worm_file xattr (file-level WORM marker).
+ * Returns 0 when the getxattr yields a dict, else the getxattr result. */
+int32_t
+is_wormfile(xlator_t *this, gf_boolean_t fop_with_fd, void *file_ptr)
+{
+    dict_t *xattr = NULL;
+    int op_ret = -1;
+
+    op_ret = fop_with_fd
+                 ? syncop_fgetxattr(this, (fd_t *)file_ptr, &xattr,
+                                    "trusted.worm_file", NULL, NULL)
+                 : syncop_getxattr(this, (loc_t *)file_ptr, &xattr,
+                                   "trusted.worm_file", NULL, NULL);
+    if (!xattr)
+        return op_ret;
+
+    /* The value itself is never read; getting a dict back is enough. */
+    dict_unref(xattr);
+    return 0;
+}
diff --git a/xlators/features/read-only/src/worm-helper.h b/xlators/features/read-only/src/worm-helper.h
new file mode 100644
index 00000000000..b42f8d2b40c
--- /dev/null
+++ b/xlators/features/read-only/src/worm-helper.h
@@ -0,0 +1,44 @@
+/*
+   Copyright (c) 2016 Red Hat, Inc. <http://www.redhat.com>
+   This file is part of GlusterFS.
+
+   This file is licensed to you under your choice of the GNU Lesser
+   General Public License, version 3 or any later version (LGPLv3 or
+   later), or the GNU General Public License, version 2 (GPLv2), in all
+   cases as published by the Free Software Foundation.
+*/
+/* _gf_true when the owner, group and other write bits of stbuf are clear. */
+gf_boolean_t
+gf_worm_write_disabled(struct iatt *stbuf);
+/* Stores the current time in the file's trusted.start_time xattr. */
+int32_t
+worm_init_state(xlator_t *this, gf_boolean_t fop_with_fd, void *file_ptr);
+/* Fills retention_state from the configured defaults and applies it. */
+int32_t
+worm_set_state(xlator_t *this, gf_boolean_t fop_with_fd, void *file_ptr,
+               worm_reten_state_t *retention_state, struct iatt *stbuf);
+/* Loads the trusted.reten_state xattr of the file into reten_state. */
+int32_t
+worm_get_state(xlator_t *this, gf_boolean_t fop_with_fd, void *file_ptr,
+               worm_reten_state_t *reten_state);
+/* Clears the retain flag and periods and rewinds the atime in stbuf. */
+void
+gf_worm_state_lookup(xlator_t *this, gf_boolean_t fop_with_fd, void *file_ptr,
+                     worm_reten_state_t *reten_state, struct iatt *stbuf);
+/* Encodes reten_state as a "/"-separated string written to val. */
+void
+gf_worm_serialize_state(worm_reten_state_t *reten_state, char *val);
+/* Decodes a "/"-separated string (tokenized in place) into reten_state. */
+void
+gf_worm_deserialize_state(char *val, worm_reten_state_t *reten_state);
+/* Writes the serialized reten_state to the trusted.reten_state xattr. */
+int32_t
+gf_worm_set_xattr(xlator_t *this, worm_reten_state_t *reten_state,
+                  gf_boolean_t fop_with_fd, void *file_ptr);
+/* Returns 0 to let the FOP continue, non-zero to block it. */
+int
+gf_worm_state_transition(xlator_t *this, gf_boolean_t fop_with_fd,
+                         void *file_ptr, glusterfs_fop_t op);
+/* Returns 0 when the trusted.worm_file xattr is found on the file. */
+int32_t
+is_wormfile(xlator_t *this, gf_boolean_t fop_with_fd, void *file_ptr);
diff --git a/xlators/features/read-only/src/worm.c b/xlators/features/read-only/src/worm.c
new file mode 100644
index 00000000000..1cc5526d5cd
--- /dev/null
+++ b/xlators/features/read-only/src/worm.c
@@ -0,0 +1,722 @@
+/*
+   Copyright (c) 2008-2012, 2016 Red Hat, Inc. <http://www.redhat.com>
+   This file is part of GlusterFS.
+
+   This file is licensed to you under your choice of the GNU Lesser
+   General Public License, version 3 or any later version (LGPLv3 or
+   later), or the GNU General Public License, version 2 (GPLv2), in all
+   cases as published by the Free Software Foundation.
+*/
+#include <glusterfs/xlator.h>
+#include <glusterfs/defaults.h>
+#include "read-only-common.h"
+#include "read-only-mem-types.h"
+#include "read-only.h"
+#include <glusterfs/syncop.h>
+#include "worm-helper.h"
+
+int32_t
+mem_acct_init(xlator_t *this)
+{
+    /* Register gf_read_only_mt_end + 1 memory types for this translator. */
+    int op_ret = xlator_mem_acct_init(this, gf_read_only_mt_end + 1);
+
+    if (op_ret != 0) {
+        gf_log(this->name, GF_LOG_ERROR,
+               "Memory accounting "
+               "initialization failed.");
+    }
+    return op_ret;
+}
+
+static int32_t
+worm_open(call_frame_t *frame, xlator_t *this, loc_t *loc, int32_t flags,
+          fd_t *fd, dict_t *xdata)
+{
+    /* Open flags that are refused with EROFS while the check below is on. */
+    const int32_t write_flags = O_WRONLY | O_RDWR | O_APPEND | O_TRUNC;
+
+    if (!is_readonly_or_worm_enabled(frame, this) || !(flags & write_flags))
+        STACK_WIND_TAIL(frame, FIRST_CHILD(this), FIRST_CHILD(this)->fops->open,
+                        loc, flags, fd, xdata);
+    else
+        STACK_UNWIND_STRICT(open, frame, -1, EROFS, NULL, NULL);
+    return 0;
+}
+
+static int32_t
+worm_link(call_frame_t *frame, xlator_t *this, loc_t *oldloc, loc_t *newloc,
+          dict_t *xdata)
+{
+    read_only_priv_t *conf = this->private;
+    int op_errno = EROFS;
+
+    GF_ASSERT(conf);
+    /* Blocked outright when is_readonly_or_worm_enabled() reports true. */
+    if (is_readonly_or_worm_enabled(frame, this))
+        goto out;
+    op_errno = 0;
+    /* File-level WORM is off, or the caller has a negative pid: allow. */
+    if (!conf->worm_file || (frame->root->pid < 0))
+        goto out;
+
+    gf_uuid_copy(oldloc->gfid, oldloc->inode->gfid);
+    /* is_wormfile() returns 0 when the trusted.worm_file xattr is found. */
+    if (!is_wormfile(this, _gf_false, oldloc))
+        op_errno = gf_worm_state_transition(this, _gf_false, oldloc,
+                                            GF_FOP_LINK);
+
+out:
+    if (op_errno == 0) {
+        STACK_WIND_TAIL(frame, FIRST_CHILD(this), FIRST_CHILD(this)->fops->link,
+                        oldloc, newloc, xdata);
+        return 0;
+    }
+    if (op_errno < 0)
+        op_errno = EROFS;
+    STACK_UNWIND_STRICT(link, frame, -1, op_errno, NULL, NULL, NULL, NULL,
+                        NULL);
+    return 0;
+}
+
+static int32_t
+worm_unlink(call_frame_t *frame, xlator_t *this, loc_t *loc, int32_t flags,
+            dict_t *xdata)
+{
+    read_only_priv_t *conf = this->private;
+    int op_errno = EROFS;
+
+    GF_ASSERT(conf);
+    /* Blocked outright when is_readonly_or_worm_enabled() reports true. */
+    if (is_readonly_or_worm_enabled(frame, this))
+        goto out;
+    op_errno = 0;
+    /* File-level WORM is off, or the caller has a negative pid: allow. */
+    if (!conf->worm_file || (frame->root->pid < 0))
+        goto out;
+
+    gf_uuid_copy(loc->gfid, loc->inode->gfid);
+    /* is_wormfile() returns 0 when the trusted.worm_file xattr is found. */
+    if (!is_wormfile(this, _gf_false, loc))
+        op_errno = gf_worm_state_transition(this, _gf_false, loc,
+                                            GF_FOP_UNLINK);
+
+out:
+    if (op_errno == 0) {
+        STACK_WIND_TAIL(frame, FIRST_CHILD(this),
+                        FIRST_CHILD(this)->fops->unlink, loc, flags, xdata);
+        return 0;
+    }
+    if (op_errno < 0)
+        op_errno = EROFS;
+    STACK_UNWIND_STRICT(unlink, frame, -1, op_errno, NULL, NULL, NULL);
+    return 0;
+}
+
+/* rename FOP: both the source and, when it already has an inode, the
+ * destination must be allowed to change; the first check that objects
+ * decides the errno the FOP is unwound with. */
+static int32_t
+worm_rename(call_frame_t *frame, xlator_t *this, loc_t *oldloc, loc_t *newloc,
+            dict_t *xdata)
+{
+    read_only_priv_t *conf = this->private;
+    int op_errno = EROFS;
+
+    GF_ASSERT(conf);
+    /* Blocked outright when is_readonly_or_worm_enabled() reports true. */
+    if (is_readonly_or_worm_enabled(frame, this))
+        goto out;
+
+    op_errno = 0;
+    /* File-level WORM is off, or the caller has a negative pid: allow. */
+    if (!conf->worm_file || (frame->root->pid < 0))
+        goto out;
+
+    gf_uuid_copy(oldloc->gfid, oldloc->inode->gfid);
+    /* is_wormfile() returns 0 when the trusted.worm_file xattr is found. */
+    if (!is_wormfile(this, _gf_false, oldloc))
+        op_errno = gf_worm_state_transition(this, _gf_false, oldloc,
+                                            GF_FOP_RENAME);
+
+    /* The destination is only examined when the source check passed and
+     * newloc carries an inode. */
+    if (op_errno == 0 && newloc->inode != NULL) {
+        gf_uuid_copy(newloc->gfid, newloc->inode->gfid);
+        if (!is_wormfile(this, _gf_false, newloc))
+            op_errno = gf_worm_state_transition(this, _gf_false, newloc,
+                                                GF_FOP_RENAME);
+    }
+
+out:
+    if (op_errno == 0) {
+        STACK_WIND_TAIL(frame, FIRST_CHILD(this),
+                        FIRST_CHILD(this)->fops->rename, oldloc, newloc, xdata);
+        return 0;
+    }
+    if (op_errno < 0)
+        op_errno = EROFS;
+    STACK_UNWIND_STRICT(rename, frame, -1, op_errno, NULL, NULL, NULL, NULL,
+                        NULL, NULL);
+    return 0;
+}
+
+static int32_t
+worm_truncate(call_frame_t *frame, xlator_t *this, loc_t *loc, off_t offset,
+              dict_t *xdata)
+{
+    read_only_priv_t *conf = this->private;
+    int op_errno = EROFS;
+
+    GF_ASSERT(conf);
+    /* Blocked outright when is_readonly_or_worm_enabled() reports true. */
+    if (is_readonly_or_worm_enabled(frame, this))
+        goto out;
+    op_errno = 0;
+    /* File-level WORM is off, or the caller has a negative pid: allow. */
+    if (!conf->worm_file || (frame->root->pid < 0))
+        goto out;
+
+    /* is_wormfile() returns 0 when the trusted.worm_file xattr is found. */
+    if (!is_wormfile(this, _gf_false, loc))
+        op_errno = gf_worm_state_transition(this, _gf_false, loc,
+                                            GF_FOP_TRUNCATE);
+
+out:
+    if (op_errno == 0) {
+        STACK_WIND_TAIL(frame, FIRST_CHILD(this),
+                        FIRST_CHILD(this)->fops->truncate, loc, offset, xdata);
+        return 0;
+    }
+    if (op_errno < 0)
+        op_errno = EROFS;
+    STACK_UNWIND_STRICT(truncate, frame, -1, op_errno, NULL, NULL, NULL);
+    return 0;
+}
+
+static int32_t
+worm_ftruncate(call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset,
+               dict_t *xdata)
+{
+    read_only_priv_t *conf = this->private;
+    int op_errno = EROFS;
+
+    GF_ASSERT(conf);
+    /* Blocked outright when is_readonly_or_worm_enabled() reports true. */
+    if (is_readonly_or_worm_enabled(frame, this))
+        goto out;
+    op_errno = 0;
+    /* File-level WORM is off, or the caller has a negative pid: allow. */
+    if (!conf->worm_file || (frame->root->pid < 0))
+        goto out;
+
+    /* is_wormfile() returns 0 when the trusted.worm_file xattr is found. */
+    if (!is_wormfile(this, _gf_true, fd))
+        op_errno = gf_worm_state_transition(this, _gf_true, fd,
+                                            GF_FOP_FTRUNCATE);
+
+out:
+    if (op_errno == 0) {
+        STACK_WIND_TAIL(frame, FIRST_CHILD(this),
+                        FIRST_CHILD(this)->fops->ftruncate, fd, offset, xdata);
+        return 0;
+    }
+    if (op_errno < 0)
+        op_errno = EROFS;
+    STACK_UNWIND_STRICT(ftruncate, frame, -1, op_errno, NULL, NULL, NULL);
+    return 0;
+}
+
+static int32_t
+worm_setattr(call_frame_t *frame, xlator_t *this, loc_t *loc,
+             struct iatt *stbuf, int32_t valid, dict_t *xdata)
+{
+    gf_boolean_t rd_only = _gf_false;
+    worm_reten_state_t reten_state = {
+        0,
+    };
+    struct iatt stpre = {
+        0,
+    };
+    read_only_priv_t *priv = NULL;
+    int op_errno = EROFS;
+    int ret = -1;
+
+    priv = this->private;
+    GF_ASSERT(priv);
+    if (!priv->worm_file) { /* file-level WORM is off: pass through */
+        op_errno = 0;
+        goto out;
+    }
+    /* is_wormfile() returns 0 only when trusted.worm_file was found. */
+    if (is_wormfile(this, _gf_false, loc)) {
+        op_errno = 0;
+        goto out;
+    }
+    if (valid & GF_SET_ATTR_MODE) {
+        rd_only = gf_worm_write_disabled(stbuf);
+        if (!rd_only) {
+            op_errno = 0;
+            goto out;
+        }
+        /* The new mode has no write bit left: put the file in WORM state. */
+        ret = worm_set_state(this, _gf_false, loc, &reten_state, stbuf);
+        if (ret) {
+            gf_log(this->name, GF_LOG_ERROR, "Error setting worm state");
+            goto out;
+        }
+    } else if (valid & GF_SET_ATTR_ATIME) {
+        ret = worm_get_state(this, _gf_false, loc, &reten_state);
+        if (ret) {
+            op_errno = 0;
+            goto out;
+        }
+        if (reten_state.retain) {
+            ret = syncop_stat(this, loc, &stpre, NULL, NULL);
+            if (ret)
+                goto out;
+            if (reten_state.ret_mode == 0) { /* "relax" retention mode */
+                if (stbuf->ia_atime < stpre.ia_mtime) {
+                    gf_log(this->name, GF_LOG_ERROR,
+                           "Cannot set atime less than "
+                           "the mtime for a WORM-Retained "
+                           "file");
+                    goto out;
+                }
+            } else {
+                if (stbuf->ia_atime < stpre.ia_atime) {
+                    gf_log(this->name, GF_LOG_ERROR,
+                           "Cannot decrease the atime of a"
+                           " WORM-Retained file in "
+                           "Enterprise mode");
+                    goto out;
+                }
+            }
+            reten_state.ret_period = reten_state.ret_period + stbuf->ia_atime -
+                                     stpre.ia_atime; /* shift by atime delta */
+            ret = gf_worm_set_xattr(this, &reten_state, _gf_false, loc);
+            if (ret) {
+                goto out;
+            }
+            stbuf->ia_mtime = stpre.ia_mtime;
+        }
+    }
+    op_errno = 0;
+
+out:
+    if (op_errno) /* op_errno is either 0 or its initial EROFS here */
+        STACK_UNWIND_STRICT(setattr, frame, -1, EROFS, NULL, NULL, NULL);
+    else
+        STACK_WIND_TAIL(frame, FIRST_CHILD(this),
+                        FIRST_CHILD(this)->fops->setattr, loc, stbuf, valid,
+                        xdata);
+    return 0;
+}
+
+static int32_t
+worm_fsetattr(call_frame_t *frame, xlator_t *this, fd_t *fd, struct iatt *stbuf,
+              int32_t valid, dict_t *xdata)
+{
+    gf_boolean_t rd_only = _gf_false;
+    worm_reten_state_t reten_state = {
+        0,
+    };
+    struct iatt stpre = {
+        0,
+    };
+    read_only_priv_t *priv = NULL;
+    int op_errno = EROFS;
+    int ret = -1;
+
+    priv = this->private;
+    GF_ASSERT(priv);
+    if (!priv->worm_file) { /* file-level WORM is off: pass through */
+        op_errno = 0;
+        goto out;
+    }
+    /* is_wormfile() returns 0 only when trusted.worm_file was found. */
+    if (is_wormfile(this, _gf_true, fd)) {
+        op_errno = 0;
+        goto out;
+    }
+    if (valid & GF_SET_ATTR_MODE) {
+        rd_only = gf_worm_write_disabled(stbuf);
+        if (!rd_only) {
+            op_errno = 0;
+            goto out;
+        }
+        /* The new mode has no write bit left: put the file in WORM state. */
+        ret = worm_set_state(this, _gf_true, fd, &reten_state, stbuf);
+        if (ret) {
+            gf_log(this->name, GF_LOG_ERROR, "Error setting worm state");
+            goto out;
+        }
+    } else if (valid & GF_SET_ATTR_ATIME) {
+        ret = worm_get_state(this, _gf_true, fd, &reten_state);
+        if (ret) {
+            op_errno = 0;
+            goto out;
+        }
+        if (reten_state.retain) {
+            ret = syncop_fstat(this, fd, &stpre, NULL, NULL);
+            if (ret)
+                goto out;
+            if (reten_state.ret_mode == 0) { /* "relax" retention mode */
+                if (stbuf->ia_atime < stpre.ia_mtime) {
+                    gf_log(this->name, GF_LOG_ERROR,
+                           "Cannot set atime less than "
+                           "the mtime for a WORM-Retained "
+                           "file");
+                    goto out;
+                }
+            } else {
+                if (stbuf->ia_atime < stpre.ia_atime) {
+                    gf_log(this->name, GF_LOG_ERROR,
+                           "Cannot decrease the atime of a"
+                           " WORM-Retained file in "
+                           "Enterprise mode");
+                    goto out;
+                }
+            }
+            reten_state.ret_period = reten_state.ret_period + stbuf->ia_atime -
+                                     stpre.ia_atime; /* shift by atime delta */
+            ret = gf_worm_set_xattr(this, &reten_state, _gf_true, fd);
+            if (ret) {
+                goto out;
+            }
+
+            stbuf->ia_mtime = stpre.ia_mtime;
+        }
+    }
+    op_errno = 0;
+
+out:
+    if (op_errno) /* op_errno is either 0 or its initial EROFS here */
+        STACK_UNWIND_STRICT(fsetattr, frame, -1, op_errno, NULL, NULL, NULL);
+    else
+        STACK_WIND_TAIL(frame, FIRST_CHILD(this),
+                        FIRST_CHILD(this)->fops->fsetattr, fd, stbuf, valid,
+                        xdata);
+    return 0;
+}
+
+/* writev FOP: winds the write unless gf_worm_state_transition() objects;
+ * a negative result from it is reported to the caller as EROFS. */
+static int32_t
+worm_writev(call_frame_t *frame, xlator_t *this, fd_t *fd, struct iovec *vector,
+            int32_t count, off_t offset, uint32_t flags, struct iobref *iobref,
+            dict_t *xdata)
+{
+    read_only_priv_t *conf = this->private;
+    int op_errno = 0;
+
+    GF_ASSERT(conf);
+    /* File-level WORM is off, or the caller has a negative pid: allow. */
+    if (!conf->worm_file || (frame->root->pid < 0))
+        goto out;
+
+    /* is_wormfile() returns 0 when the trusted.worm_file xattr is found. */
+    if (!is_wormfile(this, _gf_true, fd))
+        op_errno = gf_worm_state_transition(this, _gf_true, fd, GF_FOP_WRITE);
+
+out:
+    if (op_errno == 0) {
+        STACK_WIND_TAIL(frame, FIRST_CHILD(this),
+                        FIRST_CHILD(this)->fops->writev, fd, vector, count,
+                        offset, flags, iobref, xdata);
+        return 0;
+    }
+    if (op_errno < 0)
+        op_errno = EROFS;
+    STACK_UNWIND_STRICT(writev, frame, -1, op_errno, NULL, NULL, NULL);
+    return 0;
+}
+
+static int32_t
+worm_create_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+                int32_t op_ret, int32_t op_errno, fd_t *fd, inode_t *inode,
+                struct iatt *buf, struct iatt *preparent,
+                struct iatt *postparent, dict_t *xdata)
+{
+    int rc = 0;
+    read_only_priv_t *conf = NULL;
+
+    /* On failure fd can be NULL and an fsetxattr on it would segfault, so
+     * skip the bookkeeping and just unwind. */
+    if (op_ret < 0)
+        goto out;
+
+    conf = this->private;
+    GF_ASSERT(conf);
+    if (!conf->worm_file)
+        goto out;
+
+    /* worm_release() later reads this fd ctx value back. */
+    rc = fd_ctx_set(fd, this, 1);
+    if (rc)
+        gf_log(this->name, GF_LOG_ERROR,
+               "Failed to set the fd ctx "
+               "for gfid:%s . Worm feature may not work for the gfid",
+               uuid_utoa(inode->gfid));
+
+    rc = worm_init_state(this, _gf_true, fd);
+    if (rc)
+        gf_log(this->name, GF_LOG_ERROR, "Error initializing state");
+
+out:
+    STACK_UNWIND_STRICT(create, frame, op_ret, op_errno, fd, inode, buf,
+                        preparent, postparent, xdata);
+    return rc;
+}
+
+static int32_t
+worm_create(call_frame_t *frame, xlator_t *this, loc_t *loc, int32_t flags,
+            mode_t mode, mode_t umask, fd_t *fd, dict_t *xdata)
+{
+    STACK_WIND(frame, worm_create_cbk, FIRST_CHILD(this),
+               FIRST_CHILD(this)->fops->create, loc, flags, mode, umask, fd,
+               xdata); /* worm_create_cbk does the WORM setup on the new fd */
+    return 0;
+}
+
+static void
+set_reten_mode(read_only_priv_t *priv, char *reten_mode)
+{
+    /* "relax" maps to 0; every other mode string maps to 1. */
+    const int is_relax = (strcmp(reten_mode, "relax") == 0);
+
+    priv->reten_mode = is_relax ? 0 : 1;
+}
+
+int32_t
+init(xlator_t *this)
+{
+    int ret = -1;
+    read_only_priv_t *priv = NULL;
+    char *reten_mode = NULL;
+
+    if (!this->children || this->children->next) {
+        gf_log(this->name, GF_LOG_ERROR,
+               "translator not configured with exactly one child");
+        return -1;
+    }
+
+    if (!this->parents) {
+        gf_log(this->name, GF_LOG_WARNING, "dangling volume. check volfile ");
+    }
+    /* priv is taken from a mem pool created with a count of 64. */
+    this->local_pool = mem_pool_new(read_only_priv_t, 64);
+    if (!this->local_pool) {
+        ret = -1;
+        gf_log(this->name, GF_LOG_ERROR,
+               "failed to create read_only_priv_t's memory pool");
+        goto out;
+    }
+
+    priv = mem_get0(this->local_pool);
+    if (!priv) {
+        gf_log(this->name, GF_LOG_ERROR, "Error allocating priv");
+        goto out;
+    }
+
+    this->private = priv;
+    /* Each option macro is handed the out label; ret is still -1 there. */
+    GF_OPTION_INIT("worm", priv->readonly_or_worm_enabled, bool, out);
+    GF_OPTION_INIT("worm-file-level", priv->worm_file, bool, out);
+    GF_OPTION_INIT("default-retention-period", priv->reten_period, int64, out);
+    GF_OPTION_INIT("auto-commit-period", priv->com_period, int64, out);
+    GF_OPTION_INIT("retention-mode", reten_mode, str, out);
+    set_reten_mode(priv, reten_mode);
+    GF_OPTION_INIT("worm-files-deletable", priv->worm_files_deletable, bool,
+                   out);
+
+    ret = 0;
+out: /* NOTE(review): priv and local_pool are not released on failure */
+    return ret;
+}
+
+int
+reconfigure(xlator_t *this, dict_t *options)
+{
+    read_only_priv_t *priv = NULL;
+    char *reten_mode = NULL;
+    int ret = -1;
+
+    priv = this->private;
+    GF_ASSERT(priv);
+    /* Re-read every option from the new dict straight into priv. */
+    GF_OPTION_RECONF("worm", priv->readonly_or_worm_enabled, options, bool,
+                     out);
+    GF_OPTION_RECONF("worm-file-level", priv->worm_file, options, bool, out);
+    GF_OPTION_RECONF("default-retention-period", priv->reten_period, options,
+                     int64, out);
+    GF_OPTION_RECONF("retention-mode", reten_mode, options, str, out);
+    set_reten_mode(priv, reten_mode);
+    GF_OPTION_RECONF("auto-commit-period", priv->com_period, options, int64,
+                     out);
+    GF_OPTION_RECONF("worm-files-deletable", priv->worm_files_deletable,
+                     options, bool, out);
+    ret = 0;
+out:
+    gf_log(this->name, GF_LOG_DEBUG, "returning %d", ret);
+    return ret;
+}
+
+/* Release the private data and the memory pool it was allocated from.
+ * Does nothing when this->private is not set. */
+void
+fini(xlator_t *this)
+{
+    read_only_priv_t *conf = this->private;
+
+    if (conf != NULL) {
+        /* conf came from local_pool: hand it back before destroying it. */
+        mem_put(conf);
+        this->private = NULL;
+        mem_pool_destroy(this->local_pool);
+        this->local_pool = NULL;
+    }
+}
+
+/*
+ * File-operation table of the translator. The first group is served by the
+ * worm_* handlers, the second group by the ro_* handlers; both sets of
+ * handlers are defined outside this table.
+ */
+struct xlator_fops fops = {
+    .open = worm_open,
+    .writev = worm_writev,
+    .setattr = worm_setattr,
+    .fsetattr = worm_fsetattr,
+    .rename = worm_rename,
+    .link = worm_link,
+    .unlink = worm_unlink,
+    .truncate = worm_truncate,
+    .ftruncate = worm_ftruncate,
+    .create = worm_create,
+
+    .rmdir = ro_rmdir,
+    .removexattr = ro_removexattr,
+    .fsyncdir = ro_fsyncdir,
+    .xattrop = ro_xattrop,
+    .inodelk = ro_inodelk,
+    .finodelk = ro_finodelk,
+    .entrylk = ro_entrylk,
+    .fentrylk = ro_fentrylk,
+    .lk = ro_lk,
+};
+
+/*
+ * Release callback for an fd.
+ *
+ * When file-level WORM is enabled and the fd carries a non-zero context
+ * value for this translator, the file is tagged with the
+ * "trusted.worm_file" xattr and gf_worm_state_transition() is invoked for
+ * it. Failures are only logged.
+ *
+ * The scratch dictionary is allocated only once it is known to be needed,
+ * so releases with the feature off (or on unmarked fds) allocate nothing.
+ *
+ * Always returns 0.
+ */
+int32_t
+worm_release(xlator_t *this, fd_t *fd)
+{
+    dict_t *dict = NULL;
+    int ret = -1;
+    uint64_t value = 0;
+    loc_t loc = {
+        0,
+    };
+    read_only_priv_t *priv = NULL;
+
+    priv = this->private;
+
+    if (!priv->worm_file)
+        goto out;
+
+    ret = fd_ctx_get(fd, this, &value);
+    if (ret) {
+        gf_log(this->name, GF_LOG_DEBUG, "Failed to get the fd ctx");
+    }
+    /* 'value' stays 0 when no context was stored on this fd; nothing to
+     * do in that case. */
+    if (!value) {
+        goto out;
+    }
+
+    dict = dict_new();
+    if (!dict) {
+        gf_log(this->name, GF_LOG_ERROR, "Error creating the dict");
+        goto out;
+    }
+
+    ret = dict_set_int8(dict, "trusted.worm_file", 1);
+    if (ret) {
+        gf_log(this->name, GF_LOG_ERROR,
+               "Error in setting "
+               "the dict");
+        goto out;
+    }
+
+    /* Build a gfid-based loc for the inode behind the fd; the reference
+     * taken here is dropped by loc_wipe() below. */
+    loc.inode = inode_ref(fd->inode);
+    gf_uuid_copy(loc.gfid, fd->inode->gfid);
+    ret = syncop_setxattr(this, &loc, dict, 0, NULL, NULL);
+    if (ret) {
+        gf_log(this->name, GF_LOG_ERROR, "Error setting xattr");
+        goto out;
+    }
+
+    gf_worm_state_transition(this, _gf_false, &loc, GF_FOP_WRITE);
+
+out:
+    loc_wipe(&loc);
+    if (dict)
+        dict_unref(dict);
+    return 0;
+}
+
+/* Callback table: only the fd release notification is handled. */
+struct xlator_cbks cbks = {
+    .release = worm_release,
+};
+
+/*
+ * Options understood by the translator. Each entry gives the key, its type,
+ * the default value, the op-version it is tied to and a user-visible
+ * description. The table is terminated by an entry with a NULL key.
+ *
+ * The descriptions are built from adjacent string literals, so each fragment
+ * must carry its own separating space.
+ */
+struct volume_options options[] = {
+    {.key = {"worm"},
+     .type = GF_OPTION_TYPE_BOOL,
+     .default_value = "off",
+     /*.validate_fn = validate_boolean,*/
+     .op_version = {2},
+     .flags = OPT_FLAG_SETTABLE,
+     .description = "When \"on\", makes a volume get write once read many "
+                    "feature. It is turned \"off\" by default."},
+    {.key = {"worm-file-level"},
+     .type = GF_OPTION_TYPE_BOOL,
+     .default_value = "off",
+     /*.validate_fn = validate_boolean,*/
+     .op_version = {GD_OP_VERSION_3_8_0},
+     .flags = OPT_FLAG_SETTABLE,
+     .description = "When \"on\", activates the file level worm. "
+                    "It is turned \"off\" by default."},
+    {.key = {"worm-files-deletable"},
+     .type = GF_OPTION_TYPE_BOOL,
+     .default_value = "on",
+     /*.validate_fn = validate_boolean,*/
+     .op_version = {GD_OP_VERSION_3_13_0},
+     .flags = OPT_FLAG_SETTABLE,
+     .description = "When \"off\", doesn't allow the Worm files "
+                    "to be deleted. It is turned \"on\" by default."},
+    {.key = {"default-retention-period"},
+     .type = GF_OPTION_TYPE_TIME,
+     .default_value = "120",
+     /*.validate_fn = validate_worm_period,*/
+     .op_version = {GD_OP_VERSION_3_8_0},
+     .flags = OPT_FLAG_SETTABLE,
+     .description = "The default retention period for the files."},
+    {.key = {"retention-mode"},
+     .type = GF_OPTION_TYPE_STR,
+     .default_value = "relax",
+     /*.validate_fn = validate_reten_mode,*/
+     .op_version = {GD_OP_VERSION_3_8_0},
+     .flags = OPT_FLAG_SETTABLE,
+     .description = "The mode of retention (relax/enterprise). "
+                    "It is relax by default."},
+    {.key = {"auto-commit-period"},
+     .type = GF_OPTION_TYPE_TIME,
+     .default_value = "180",
+     /*.validate_fn = validate_worm_period,*/
+     .op_version = {GD_OP_VERSION_3_8_0},
+     .flags = OPT_FLAG_SETTABLE,
+     .description = "Auto commit period for the files."},
+    {.key = {NULL}},
+};
+
+/*
+ * Translator descriptor: ties together the lifecycle entry points, the fop
+ * and callback tables and the option table defined in this file, under the
+ * identifier "worm".
+ */
+xlator_api_t xlator_api = {
+    .init = init,
+    .fini = fini,
+    .reconfigure = reconfigure,
+    .mem_acct_init = mem_acct_init,
+    .op_version = {1}, /* Present from the initial version */
+    .fops = &fops,
+    .cbks = &cbks,
+    .options = options,
+    .identifier = "worm",
+    .category = GF_TECH_PREVIEW,
+};
diff --git a/xlators/features/sdfs/Makefile.am b/xlators/features/sdfs/Makefile.am
new file mode 100644
index 00000000000..a985f42a877
--- /dev/null
+++ b/xlators/features/sdfs/Makefile.am
@@ -0,0 +1,3 @@
+SUBDIRS = src
+
+CLEANFILES =
diff --git a/xlators/features/sdfs/src/Makefile.am b/xlators/features/sdfs/src/Makefile.am
new file mode 100644
index 00000000000..6118d46ad22
--- /dev/null
+++ b/xlators/features/sdfs/src/Makefile.am
@@ -0,0 +1,19 @@
+if WITH_SERVER
+xlator_LTLIBRARIES = sdfs.la
+endif
+xlatordir = $(libdir)/glusterfs/$(PACKAGE_VERSION)/xlator/features
+
+sdfs_la_LDFLAGS = -module $(GF_XLATOR_DEFAULT_LDFLAGS)
+
+sdfs_la_SOURCES = sdfs.c
+sdfs_la_LIBADD = $(top_builddir)/libglusterfs/src/libglusterfs.la
+
+noinst_HEADERS = sdfs.h sdfs-messages.h $(top_builddir)/xlators/lib/src/libxlator.h
+
+AM_CPPFLAGS = $(GF_CPPFLAGS) -I$(top_srcdir)/libglusterfs/src \
+                                 -I$(top_srcdir)/xlators/lib/src \
+        -I$(top_srcdir)/rpc/xdr/src/ -I$(top_builddir)/rpc/xdr/src/
+
+AM_CFLAGS = -Wall -fno-strict-aliasing $(GF_CFLAGS)
+
+CLEANFILES =
diff --git a/xlators/features/sdfs/src/sdfs-messages.h b/xlators/features/sdfs/src/sdfs-messages.h
new file mode 100644
index 00000000000..3053efa8935
--- /dev/null
+++ b/xlators/features/sdfs/src/sdfs-messages.h
@@ -0,0 +1,67 @@
+/*
+ Copyright (c) 2016 Red Hat, Inc. <http://www.redhat.com>
+ This file is part of GlusterFS.
+
+ This file is licensed to you under your choice of the GNU Lesser
+ General Public License, version 3 or any later version (LGPLv3 or
+ later), or the GNU General Public License, version 2 (GPLv2), in all
+ cases as published by the Free Software Foundation.
+ */
+
+/* Include guard, named after this header (sdfs-messages.h). */
+#ifndef _SDFS_MESSAGES_H_
+#define _SDFS_MESSAGES_H_
+
+#include <glusterfs/glfs-message-id.h>
+
+/* file sdfs-messages.h
+ * brief SDFS log-message IDs and their descriptions
+ */
+
+/* NOTE: Rules for message additions
+ * 1) Each instance of a message is _better_ left with a unique message ID, even
+ *    if the message format is the same. Reasoning is that, if the message
+ *    format needs to change in one instance, the other instances are not
+ *    impacted or the new change does not change the ID of the instance being
+ *    modified.
+ * 2) Addition of a message,
+ *       - Should increment the GLFS_NUM_MESSAGES
+ *       - Append to the list of messages defined, towards the end
+ *       - Retain macro naming as glfs_msg_X (for readability across developers)
+ * NOTE: Rules for message format modifications
+ * 3) Check across the code if the message ID macro in question is reused
+ *    anywhere. If reused then the modifications should ensure correctness
+ *    everywhere, or needs a new message ID as (1) above was not adhered to. If
+ *    not used anywhere, proceed with the required modification.
+ * NOTE: Rules for message deletion
+ * 4) Check (3) and if used anywhere else, then cannot be deleted. If not used
+ *    anywhere, then can be deleted, but will leave a hole by design, as
+ *    addition rules specify modification to the end of the list and not filling
+ *    holes.
+ */
+
+/* First message ID of the SDFS component and the number of IDs it defines. */
+#define GLFS_SDFS_BASE GLFS_MSGID_COMP_SDFS
+#define GLFS_SDFS_NUM_MESSAGES 2
+#define GLFS_MSGID_END (GLFS_SDFS_BASE + GLFS_SDFS_NUM_MESSAGES + 1)
+/* Messages with message IDs */
+/* Sentinel for the start of the range; it must expand to the SDFS base
+ * defined above. */
+#define glfs_msg_start_x GLFS_SDFS_BASE, "Invalid: Start of messages"
+/*------------*/
+
+#define SDFS_MSG_ENTRYLK_ERROR (GLFS_SDFS_BASE + 1)
+/*!
+ * @messageid
+ * @diagnosis
+ * @recommendedaction
+ *
+ */
+
+#define SDFS_MSG_MKDIR_ERROR (GLFS_SDFS_BASE + 2)
+/*!
+ * @messageid
+ * @diagnosis
+ * @recommendedaction
+ *
+ */
+/*------------*/
+
+#define glfs_msg_end_x GLFS_MSGID_END, "Invalid: End of messages"
+#endif /* !_SDFS_MESSAGES_H_ */
diff --git a/xlators/features/sdfs/src/sdfs.c b/xlators/features/sdfs/src/sdfs.c
new file mode 100644
index 00000000000..aaf13f0852e
--- /dev/null
+++ b/xlators/features/sdfs/src/sdfs.c
@@ -0,0 +1,1479 @@
+/*
+   Copyright (c) 2016 Red Hat, Inc. <http://www.redhat.com>
+   This file is part of GlusterFS.
+
+   This file is licensed to you under your choice of the GNU Lesser
+   General Public License, version 3 or any later version (LGPLv3 or
+   later), or the GNU General Public License, version 2 (GPLv2), in all
+   cases as published by the Free Software Foundation.
+*/
+#include <libgen.h>
+#include "sdfs.h"
+
+/*
+ * Atomically decrement the outstanding-call counter kept in the frame's
+ * local and return the value GF_ATOMIC_DEC yields. Returns -1 when no frame
+ * is supplied.
+ */
+static int
+sdfs_frame_return(call_frame_t *frame)
+{
+    sdfs_local_t *ctx = NULL;
+
+    if (frame == NULL)
+        return -1;
+
+    ctx = frame->local;
+    return GF_ATOMIC_DEC(ctx->call_cnt);
+}
+
+/*
+ * Release what a single entry-lock descriptor owns: its parent loc and the
+ * basename string. The descriptor itself is not freed. NULL is accepted.
+ */
+static void
+sdfs_lock_free(sdfs_entry_lock_t *entrylk)
+{
+    if (!entrylk)
+        return;
+
+    loc_wipe(&entrylk->parent_loc);
+    GF_FREE(entrylk->basename);
+}
+
+/*
+ * Release the contents of the first 'lock_count' entry-lock descriptors
+ * held in 'lock'. The container itself is left for the caller to free.
+ * NULL is accepted.
+ */
+static void
+sdfs_lock_array_free(sdfs_lock_t *lock)
+{
+    int idx = 0;
+
+    if (!lock)
+        return;
+
+    for (idx = 0; idx < lock->lock_count; idx++)
+        sdfs_lock_free(&lock->entrylk[idx]);
+}
+
+/*
+ * Dispose of a per-fop local: wipe both locs, destroy a stub that was never
+ * resumed, free the lock descriptors and their container, and finally hand
+ * the local back to its memory pool. NULL is accepted.
+ */
+static void
+sdfs_local_cleanup(sdfs_local_t *local)
+{
+    if (local == NULL)
+        return;
+
+    loc_wipe(&local->loc);
+    loc_wipe(&local->parent_loc);
+
+    if (local->stub != NULL) {
+        call_stub_destroy(local->stub);
+        local->stub = NULL;
+    }
+
+    /* Descriptors first, then the array that contains them. */
+    sdfs_lock_array_free(local->lock);
+    GF_FREE(local->lock);
+
+    mem_put(local);
+}
+
+/*
+ * Fill 'parent' with a loc describing the parent directory of 'child':
+ * a reference on child->parent, the directory part of child->path and
+ * child->pargfid as gfid.
+ *
+ * Returns 0 on success, -1 when the child has no parent inode or dirname()
+ * yields NULL, and -ENOMEM when the path cannot be duplicated.
+ *
+ * On the failure paths taken after inode_ref(), the inode reference stays
+ * in 'parent'; this function does not drop it.
+ */
+static int
+sdfs_build_parent_loc(loc_t *parent, loc_t *child)
+{
+    int ret = -1;
+    char *path = NULL;
+
+    if (!child->parent) {
+        goto out;
+    }
+    parent->inode = inode_ref(child->parent);
+    path = gf_strdup(child->path);
+    if (!path) {
+        ret = -ENOMEM;
+        goto out;
+    }
+
+    /* dirname() works on the private copy made above; its result is stored
+     * as the parent path.
+     * NOTE(review): dirname() may return a pointer to static storage (e.g.
+     * "." for a path without '/'), which must not be freed later - confirm
+     * that child->path is always absolute here. */
+    parent->path = dirname(path);
+    if (!parent->path) {
+        goto out;
+    }
+
+    gf_uuid_copy(parent->gfid, child->pargfid);
+    return 0;
+
+out:
+    GF_FREE(path);
+    return ret;
+}
+
+/*
+ * Take a zeroed local from the translator's pool and attach it to 'frame'.
+ * Returns the new local, or NULL when the pool cannot provide one (the frame
+ * is left untouched in that case).
+ */
+static sdfs_local_t *
+sdfs_local_init(call_frame_t *frame, xlator_t *this)
+{
+    sdfs_local_t *fresh = mem_get0(this->local_pool);
+
+    if (fresh != NULL)
+        frame->local = fresh;
+
+    return fresh;
+}
+
+/*
+ * Create the private frame used for locking on behalf of 'frame'.
+ *
+ * The new frame is a copy of 'frame', shares (and references) its client,
+ * gets a fresh local pointing back at 'frame' through main_frame, and
+ * receives an lk-owner derived from its own root.
+ *
+ * Returns 0 on success. On failure returns -1, destroys whatever was
+ * created and sets *new_frame to NULL.
+ */
+static int
+sdfs_get_new_frame_common(call_frame_t *frame, call_frame_t **new_frame)
+{
+    int ret = -1;
+    sdfs_local_t *local = NULL;
+    client_t *client = NULL;
+
+    *new_frame = copy_frame(frame);
+    if (!*new_frame) {
+        goto err;
+    }
+
+    /* Reference the caller's client and attach it to the new stack. */
+    client = frame->root->client;
+    gf_client_ref(client);
+    (*new_frame)->root->client = client;
+
+    local = sdfs_local_init(*new_frame, THIS);
+    if (!local) {
+        goto err;
+    }
+
+    local->main_frame = frame;
+    /*Set unique lk-owner for the fop*/
+    set_lk_owner_from_ptr(&(*new_frame)->root->lk_owner, (*new_frame)->root);
+
+    ret = 0;
+err:
+    if ((ret == -1) && (*new_frame)) {
+        SDFS_STACK_DESTROY((*new_frame));
+        *new_frame = NULL;
+    }
+
+    return ret;
+}
+
+/*
+ * Create the private locking frame for a fop on 'loc' and prepare its local:
+ * parent_loc describes the parent directory of 'loc' and loc is a copy of
+ * 'loc'.
+ *
+ * Returns 0 on success. On any failure returns -1 with *new_frame destroyed
+ * and set to NULL.
+ */
+static int
+sdfs_get_new_frame(call_frame_t *frame, loc_t *loc, call_frame_t **new_frame)
+{
+    int ret = -1;
+    sdfs_local_t *local = NULL;
+
+    ret = sdfs_get_new_frame_common(frame, new_frame);
+    if (ret < 0) {
+        goto err;
+    }
+
+    local = (*new_frame)->local;
+
+    ret = sdfs_build_parent_loc(&local->parent_loc, loc);
+    if (ret) {
+        goto err;
+    }
+
+    ret = loc_copy(&local->loc, loc);
+    if (ret == -1) {
+        goto err;
+    }
+
+    ret = 0;
+err:
+    /* Helper failures may carry other non-zero codes (e.g. -ENOMEM); they
+     * are all normalised to -1 for the caller. */
+    if (ret && (*new_frame)) {
+        SDFS_STACK_DESTROY((*new_frame));
+        *new_frame = NULL;
+        ret = -1;
+    }
+
+    return ret;
+}
+
+/*
+ * Create the private locking frame for a directory fd: the local's
+ * parent_loc is set to the directory behind 'fd' (inode reference plus
+ * gfid). Returns 0 on success, or the negative result of
+ * sdfs_get_new_frame_common() on failure.
+ */
+static int
+sdfs_get_new_frame_readdirp(call_frame_t *frame, fd_t *fd,
+                            call_frame_t **new_frame)
+{
+    sdfs_local_t *ctx = NULL;
+    int rc = sdfs_get_new_frame_common(frame, new_frame);
+
+    if (rc < 0)
+        return rc;
+
+    ctx = (*new_frame)->local;
+    ctx->parent_loc.inode = inode_ref(fd->inode);
+    gf_uuid_copy(ctx->parent_loc.gfid, fd->inode->gfid);
+
+    return 0;
+}
+
+/*
+ * Callback for both the lock and the unlock request of the single-lock fops.
+ *
+ * The result is stored in the local. If a stub is pending (lock phase) it is
+ * resumed, which runs the fop's helper. Otherwise (unlock phase) a failure
+ * is logged and the private frame is destroyed.
+ */
+int
+sdfs_entrylk_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+                 int32_t op_ret, int32_t op_errno, dict_t *xdata)
+{
+    sdfs_local_t *local = NULL;
+    call_stub_t *stub = NULL;
+
+    local = frame->local;
+
+    local->op_ret = op_ret;
+    local->op_errno = op_errno;
+
+    if (local->stub) {
+        /* Detach the stub before resuming so it is not destroyed again
+         * when the local is cleaned up. */
+        stub = local->stub;
+        local->stub = NULL;
+        call_resume(stub);
+    } else {
+        if (op_ret < 0)
+            gf_msg(this->name, GF_LOG_ERROR, 0, SDFS_MSG_ENTRYLK_ERROR,
+                   "Unlocking entry lock failed for %s", local->loc.name);
+
+        SDFS_STACK_DESTROY(frame);
+    }
+
+    return 0;
+}
+
+/*
+ * Callback for the mkdir wound by sdfs_mkdir_helper: the result is handed
+ * to the caller's frame first, then the entry lock on the parent directory
+ * is released using the private frame.
+ */
+int
+sdfs_mkdir_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+               int32_t op_ret, int32_t op_errno, inode_t *inode,
+               struct iatt *stbuf, struct iatt *preparent,
+               struct iatt *postparent, dict_t *xdata)
+{
+    sdfs_local_t *local = NULL;
+
+    local = frame->local;
+
+    STACK_UNWIND_STRICT(mkdir, local->main_frame, op_ret, op_errno, inode,
+                        stbuf, preparent, postparent, xdata);
+
+    /* The caller's frame has been unwound and must not be used again. */
+    local->main_frame = NULL;
+    STACK_WIND(frame, sdfs_entrylk_cbk, FIRST_CHILD(this),
+               FIRST_CHILD(this)->fops->entrylk, this->name, &local->parent_loc,
+               local->loc.name, ENTRYLK_UNLOCK, ENTRYLK_WRLCK, xdata);
+    return 0;
+}
+
+/*
+ * Second stage of mkdir, resumed from the stub once the entry-lock request
+ * has returned. If the lock was obtained the mkdir is wound to the child;
+ * otherwise the caller is unwound with the lock's errno and the private
+ * frame is destroyed.
+ */
+int
+sdfs_mkdir_helper(call_frame_t *frame, xlator_t *this, loc_t *loc, mode_t mode,
+                  mode_t umask, dict_t *xdata)
+{
+    sdfs_local_t *local = NULL;
+    char gfid[GF_UUID_BUF_SIZE] = {0};
+    int op_errno = -1;
+
+    local = frame->local;
+
+    /* Textual parent gfid, used only by the error message below. */
+    gf_uuid_unparse(loc->pargfid, gfid);
+
+    if (local->op_ret < 0) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, SDFS_MSG_ENTRYLK_ERROR,
+               "Acquiring entry lock failed for directory %s "
+               "with parent gfid %s",
+               local->loc.name, gfid);
+        op_errno = local->op_errno;
+        goto err;
+    }
+
+    STACK_WIND(frame, sdfs_mkdir_cbk, FIRST_CHILD(this),
+               FIRST_CHILD(this)->fops->mkdir, loc, mode, umask, xdata);
+
+    return 0;
+err:
+    STACK_UNWIND_STRICT(mkdir, local->main_frame, -1, op_errno, NULL, NULL,
+                        NULL, NULL, NULL);
+
+    local->main_frame = NULL;
+    SDFS_STACK_DESTROY(frame);
+    return 0;
+}
+
+/*
+ * mkdir fop entry point. A private frame is created, the actual mkdir is
+ * parked in a stub, and a write entry lock on (parent directory, name) is
+ * requested; sdfs_entrylk_cbk resumes the stub when the lock reply arrives.
+ * If the frame or the stub cannot be set up the caller is unwound with
+ * ENOMEM.
+ */
+int
+sdfs_mkdir(call_frame_t *frame, xlator_t *this, loc_t *loc, mode_t mode,
+           mode_t umask, dict_t *xdata)
+{
+    sdfs_local_t *local = NULL;
+    call_frame_t *new_frame = NULL;
+    call_stub_t *stub = NULL;
+    int op_errno = 0;
+
+    if (-1 == sdfs_get_new_frame(frame, loc, &new_frame)) {
+        op_errno = ENOMEM;
+        goto err;
+    }
+
+    stub = fop_mkdir_stub(new_frame, sdfs_mkdir_helper, loc, mode, umask,
+                          xdata);
+    if (!stub) {
+        op_errno = ENOMEM;
+        goto err;
+    }
+
+    local = new_frame->local;
+    local->stub = stub;
+
+    STACK_WIND(new_frame, sdfs_entrylk_cbk, FIRST_CHILD(this),
+               FIRST_CHILD(this)->fops->entrylk, this->name, &local->parent_loc,
+               local->loc.name, ENTRYLK_LOCK, ENTRYLK_WRLCK, xdata);
+
+    return 0;
+err:
+    STACK_UNWIND_STRICT(mkdir, frame, -1, op_errno, NULL, NULL, NULL, NULL,
+                        NULL);
+
+    if (new_frame)
+        SDFS_STACK_DESTROY(new_frame);
+
+    return 0;
+}
+
+/*
+ * Callback for the rmdir wound by sdfs_rmdir_helper: the result is handed
+ * to the caller's frame first, then the entry lock on the parent directory
+ * is released using the private frame.
+ */
+int
+sdfs_rmdir_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+               int32_t op_ret, int32_t op_errno, struct iatt *preparent,
+               struct iatt *postparent, dict_t *xdata)
+{
+    sdfs_local_t *local = NULL;
+
+    local = frame->local;
+
+    STACK_UNWIND_STRICT(rmdir, local->main_frame, op_ret, op_errno, preparent,
+                        postparent, xdata);
+
+    /* The caller's frame has been unwound and must not be used again. */
+    local->main_frame = NULL;
+    STACK_WIND(frame, sdfs_entrylk_cbk, FIRST_CHILD(this),
+               FIRST_CHILD(this)->fops->entrylk, this->name, &local->parent_loc,
+               local->loc.name, ENTRYLK_UNLOCK, ENTRYLK_WRLCK, xdata);
+    return 0;
+}
+
+/*
+ * Second stage of rmdir, resumed from the stub once the entry-lock request
+ * has returned. If the lock was obtained the rmdir is wound to the child;
+ * otherwise the caller is unwound with the lock's errno and the private
+ * frame is destroyed.
+ */
+int
+sdfs_rmdir_helper(call_frame_t *frame, xlator_t *this, loc_t *loc, int flags,
+                  dict_t *xdata)
+{
+    sdfs_local_t *local = NULL;
+    char gfid[GF_UUID_BUF_SIZE] = {0};
+
+    local = frame->local;
+
+    /* Textual parent gfid, used only by the error message below. */
+    gf_uuid_unparse(loc->pargfid, gfid);
+
+    if (local->op_ret < 0) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, SDFS_MSG_ENTRYLK_ERROR,
+               "Acquiring entry lock failed for directory %s "
+               "with parent gfid %s",
+               local->loc.name, gfid);
+        goto err;
+    }
+
+    STACK_WIND(frame, sdfs_rmdir_cbk, FIRST_CHILD(this),
+               FIRST_CHILD(this)->fops->rmdir, loc, flags, xdata);
+
+    return 0;
+err:
+    STACK_UNWIND_STRICT(rmdir, local->main_frame, -1, local->op_errno, NULL,
+                        NULL, NULL);
+
+    local->main_frame = NULL;
+    SDFS_STACK_DESTROY(frame);
+    return 0;
+}
+
+/*
+ * rmdir fop entry point. A private frame is created, the actual rmdir is
+ * parked in a stub, and a write entry lock on (parent directory, name) is
+ * requested; sdfs_entrylk_cbk resumes the stub when the lock reply arrives.
+ * If the frame or the stub cannot be set up the caller is unwound with
+ * ENOMEM.
+ */
+int
+sdfs_rmdir(call_frame_t *frame, xlator_t *this, loc_t *loc, int flags,
+           dict_t *xdata)
+{
+    sdfs_local_t *local = NULL;
+    call_frame_t *new_frame = NULL;
+    call_stub_t *stub = NULL;
+    int op_errno = 0;
+
+    if (-1 == sdfs_get_new_frame(frame, loc, &new_frame)) {
+        op_errno = ENOMEM;
+        goto err;
+    }
+
+    stub = fop_rmdir_stub(new_frame, sdfs_rmdir_helper, loc, flags, xdata);
+    if (!stub) {
+        op_errno = ENOMEM;
+        goto err;
+    }
+
+    local = new_frame->local;
+    local->stub = stub;
+
+    STACK_WIND(new_frame, sdfs_entrylk_cbk, FIRST_CHILD(this),
+               FIRST_CHILD(this)->fops->entrylk, this->name, &local->parent_loc,
+               local->loc.name, ENTRYLK_LOCK, ENTRYLK_WRLCK, xdata);
+
+    return 0;
+err:
+    STACK_UNWIND_STRICT(rmdir, frame, -1, op_errno, NULL, NULL, NULL);
+
+    if (new_frame)
+        SDFS_STACK_DESTROY(new_frame);
+
+    return 0;
+}
+
+/*
+ * Callback for the create wound by sdfs_create_helper: the result is handed
+ * to the caller's frame first, then the entry lock on the parent directory
+ * is released using the private frame.
+ */
+int
+sdfs_create_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+                int32_t op_ret, int32_t op_errno, fd_t *fd, inode_t *inode,
+                struct iatt *stbuf, struct iatt *preparent,
+                struct iatt *postparent, dict_t *xdata)
+{
+    sdfs_local_t *local = NULL;
+
+    local = frame->local;
+
+    STACK_UNWIND_STRICT(create, local->main_frame, op_ret, op_errno, fd, inode,
+                        stbuf, preparent, postparent, xdata);
+
+    /* The caller's frame has been unwound and must not be used again. */
+    local->main_frame = NULL;
+    STACK_WIND(frame, sdfs_entrylk_cbk, FIRST_CHILD(this),
+               FIRST_CHILD(this)->fops->entrylk, this->name, &local->parent_loc,
+               local->loc.name, ENTRYLK_UNLOCK, ENTRYLK_WRLCK, xdata);
+    return 0;
+}
+
+/*
+ * Second stage of create, resumed from the stub once the entry-lock request
+ * has returned. If the lock was obtained the create is wound to the child;
+ * otherwise the caller is unwound with the lock's errno and the private
+ * frame is destroyed.
+ */
+int
+sdfs_create_helper(call_frame_t *frame, xlator_t *this, loc_t *loc,
+                   int32_t flags, mode_t mode, mode_t umask, fd_t *fd,
+                   dict_t *xdata)
+{
+    sdfs_local_t *local = NULL;
+    char gfid[GF_UUID_BUF_SIZE] = {0};
+
+    local = frame->local;
+
+    /* Textual parent gfid, used only by the error message below. */
+    gf_uuid_unparse(loc->pargfid, gfid);
+
+    if (local->op_ret < 0) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, SDFS_MSG_ENTRYLK_ERROR,
+               "Acquiring entry lock failed for directory %s "
+               "with parent gfid %s",
+               local->loc.name, gfid);
+        goto err;
+    }
+
+    STACK_WIND(frame, sdfs_create_cbk, FIRST_CHILD(this),
+               FIRST_CHILD(this)->fops->create, loc, flags, mode, umask, fd,
+               xdata);
+
+    return 0;
+err:
+    STACK_UNWIND_STRICT(create, local->main_frame, -1, local->op_errno, NULL,
+                        NULL, NULL, NULL, NULL, NULL);
+
+    local->main_frame = NULL;
+    SDFS_STACK_DESTROY(frame);
+    return 0;
+}
+
+/*
+ * create fop entry point. A private frame is created, the actual create is
+ * parked in a stub, and a write entry lock on (parent directory, name) is
+ * requested; sdfs_entrylk_cbk resumes the stub when the lock reply arrives.
+ * If the frame or the stub cannot be set up the caller is unwound with
+ * ENOMEM.
+ */
+int
+sdfs_create(call_frame_t *frame, xlator_t *this, loc_t *loc, int32_t flags,
+            mode_t mode, mode_t umask, fd_t *fd, dict_t *xdata)
+{
+    sdfs_local_t *local = NULL;
+    call_frame_t *new_frame = NULL;
+    call_stub_t *stub = NULL;
+    int op_errno = 0;
+
+    if (-1 == sdfs_get_new_frame(frame, loc, &new_frame)) {
+        op_errno = ENOMEM;
+        goto err;
+    }
+
+    stub = fop_create_stub(new_frame, sdfs_create_helper, loc, flags, mode,
+                           umask, fd, xdata);
+    if (!stub) {
+        op_errno = ENOMEM;
+        goto err;
+    }
+
+    local = new_frame->local;
+    local->stub = stub;
+
+    STACK_WIND(new_frame, sdfs_entrylk_cbk, FIRST_CHILD(this),
+               FIRST_CHILD(this)->fops->entrylk, this->name, &local->parent_loc,
+               local->loc.name, ENTRYLK_LOCK, ENTRYLK_WRLCK, xdata);
+
+    return 0;
+err:
+    STACK_UNWIND_STRICT(create, frame, -1, op_errno, NULL, NULL, NULL, NULL,
+                        NULL, NULL);
+
+    if (new_frame)
+        SDFS_STACK_DESTROY(new_frame);
+
+    return 0;
+}
+
+/*
+ * Callback for the unlink wound by sdfs_unlink_helper: the result is handed
+ * to the caller's frame first, then the entry lock on the parent directory
+ * is released using the private frame.
+ */
+int
+sdfs_unlink_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+                int32_t op_ret, int32_t op_errno, struct iatt *preparent,
+                struct iatt *postparent, dict_t *xdata)
+{
+    sdfs_local_t *local = NULL;
+
+    local = frame->local;
+
+    STACK_UNWIND_STRICT(unlink, local->main_frame, op_ret, op_errno, preparent,
+                        postparent, xdata);
+
+    /* The caller's frame has been unwound and must not be used again. */
+    local->main_frame = NULL;
+    STACK_WIND(frame, sdfs_entrylk_cbk, FIRST_CHILD(this),
+               FIRST_CHILD(this)->fops->entrylk, this->name, &local->parent_loc,
+               local->loc.name, ENTRYLK_UNLOCK, ENTRYLK_WRLCK, xdata);
+    return 0;
+}
+
+/*
+ * Second stage of unlink, resumed from the stub once the entry-lock request
+ * has returned. If the lock was obtained the unlink is wound to the child;
+ * otherwise the caller is unwound with the lock's errno and the private
+ * frame is destroyed.
+ */
+int
+sdfs_unlink_helper(call_frame_t *frame, xlator_t *this, loc_t *loc, int flags,
+                   dict_t *xdata)
+{
+    sdfs_local_t *local = NULL;
+    char gfid[GF_UUID_BUF_SIZE] = {0};
+
+    local = frame->local;
+
+    /* Textual parent gfid, used only by the error message below. */
+    gf_uuid_unparse(loc->pargfid, gfid);
+
+    if (local->op_ret < 0) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, SDFS_MSG_ENTRYLK_ERROR,
+               "Acquiring entry lock failed for directory %s "
+               "with parent gfid %s",
+               local->loc.name, gfid);
+        goto err;
+    }
+
+    STACK_WIND(frame, sdfs_unlink_cbk, FIRST_CHILD(this),
+               FIRST_CHILD(this)->fops->unlink, loc, flags, xdata);
+
+    return 0;
+err:
+    STACK_UNWIND_STRICT(unlink, local->main_frame, -1, local->op_errno, NULL,
+                        NULL, NULL);
+
+    local->main_frame = NULL;
+    SDFS_STACK_DESTROY(frame);
+    return 0;
+}
+
+/*
+ * unlink fop entry point. A private frame is created, the actual unlink is
+ * parked in a stub, and a write entry lock on (parent directory, name) is
+ * requested; sdfs_entrylk_cbk resumes the stub when the lock reply arrives.
+ * If the frame or the stub cannot be set up the caller is unwound with
+ * ENOMEM.
+ */
+int
+sdfs_unlink(call_frame_t *frame, xlator_t *this, loc_t *loc, int flags,
+            dict_t *xdata)
+{
+    sdfs_local_t *local = NULL;
+    call_frame_t *new_frame = NULL;
+    call_stub_t *stub = NULL;
+    int op_errno = 0;
+
+    if (-1 == sdfs_get_new_frame(frame, loc, &new_frame)) {
+        op_errno = ENOMEM;
+        goto err;
+    }
+
+    stub = fop_unlink_stub(new_frame, sdfs_unlink_helper, loc, flags, xdata);
+    if (!stub) {
+        op_errno = ENOMEM;
+        goto err;
+    }
+
+    local = new_frame->local;
+    local->stub = stub;
+
+    STACK_WIND(new_frame, sdfs_entrylk_cbk, FIRST_CHILD(this),
+               FIRST_CHILD(this)->fops->entrylk, this->name, &local->parent_loc,
+               local->loc.name, ENTRYLK_LOCK, ENTRYLK_WRLCK, xdata);
+
+    return 0;
+err:
+    STACK_UNWIND_STRICT(unlink, frame, -1, op_errno, NULL, NULL, NULL);
+
+    if (new_frame)
+        SDFS_STACK_DESTROY(new_frame);
+
+    return 0;
+}
+
+/*
+ * Callback for the symlink wound by sdfs_symlink_helper: the result is
+ * handed to the caller's frame first, then the entry lock on the parent
+ * directory is released using the private frame.
+ */
+int
+sdfs_symlink_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+                 int32_t op_ret, int32_t op_errno, inode_t *inode,
+                 struct iatt *stbuf, struct iatt *preparent,
+                 struct iatt *postparent, dict_t *xdata)
+{
+    sdfs_local_t *local = NULL;
+
+    local = frame->local;
+
+    /* Unwind under the fop name that was actually received (symlink), so
+     * the reply goes through the matching callback type. */
+    STACK_UNWIND_STRICT(symlink, local->main_frame, op_ret, op_errno, inode,
+                        stbuf, preparent, postparent, xdata);
+
+    /* The caller's frame has been unwound and must not be used again. */
+    local->main_frame = NULL;
+    STACK_WIND(frame, sdfs_entrylk_cbk, FIRST_CHILD(this),
+               FIRST_CHILD(this)->fops->entrylk, this->name, &local->parent_loc,
+               local->loc.name, ENTRYLK_UNLOCK, ENTRYLK_WRLCK, xdata);
+    return 0;
+}
+
+/*
+ * Second stage of symlink, resumed from the stub once the entry-lock
+ * request has returned. If the lock was obtained the symlink is wound to
+ * the child; otherwise the caller is unwound with the lock's errno and the
+ * private frame is destroyed.
+ */
+int
+sdfs_symlink_helper(call_frame_t *frame, xlator_t *this, const char *linkname,
+                    loc_t *loc, mode_t umask, dict_t *xdata)
+{
+    sdfs_local_t *local = NULL;
+    char gfid[GF_UUID_BUF_SIZE] = {0};
+
+    local = frame->local;
+
+    /* Textual parent gfid, used only by the error message below. */
+    gf_uuid_unparse(loc->pargfid, gfid);
+
+    if (local->op_ret < 0) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, SDFS_MSG_ENTRYLK_ERROR,
+               "Acquiring entry lock failed for directory %s "
+               "with parent gfid %s",
+               local->loc.name, gfid);
+        goto err;
+    }
+
+    STACK_WIND(frame, sdfs_symlink_cbk, FIRST_CHILD(this),
+               FIRST_CHILD(this)->fops->symlink, linkname, loc, umask, xdata);
+
+    return 0;
+err:
+    /* The failing fop is a symlink, so unwind it as one. */
+    STACK_UNWIND_STRICT(symlink, local->main_frame, -1, local->op_errno, NULL,
+                        NULL, NULL, NULL, NULL);
+
+    local->main_frame = NULL;
+    SDFS_STACK_DESTROY(frame);
+    return 0;
+}
+
+/*
+ * symlink fop entry point. A private frame is created, the actual symlink
+ * is parked in a stub, and a write entry lock on (parent directory, name)
+ * is requested; sdfs_entrylk_cbk resumes the stub when the lock reply
+ * arrives. If the frame or the stub cannot be set up the caller is unwound
+ * with ENOMEM.
+ */
+int
+sdfs_symlink(call_frame_t *frame, xlator_t *this, const char *linkname,
+             loc_t *loc, mode_t umask, dict_t *xdata)
+{
+    sdfs_local_t *local = NULL;
+    call_frame_t *new_frame = NULL;
+    call_stub_t *stub = NULL;
+    int op_errno = 0;
+
+    if (-1 == sdfs_get_new_frame(frame, loc, &new_frame)) {
+        op_errno = ENOMEM;
+        goto err;
+    }
+
+    stub = fop_symlink_stub(new_frame, sdfs_symlink_helper, linkname, loc,
+                            umask, xdata);
+    if (!stub) {
+        op_errno = ENOMEM;
+        goto err;
+    }
+
+    local = new_frame->local;
+    local->stub = stub;
+
+    STACK_WIND(new_frame, sdfs_entrylk_cbk, FIRST_CHILD(this),
+               FIRST_CHILD(this)->fops->entrylk, this->name, &local->parent_loc,
+               local->loc.name, ENTRYLK_LOCK, ENTRYLK_WRLCK, xdata);
+
+    return 0;
+err:
+    /* The failing fop is a symlink, so unwind it as one. */
+    STACK_UNWIND_STRICT(symlink, frame, -1, op_errno, NULL, NULL, NULL, NULL,
+                        NULL);
+
+    if (new_frame)
+        SDFS_STACK_DESTROY(new_frame);
+
+    return 0;
+}
+
+/*
+ * Callback shared by the fops that use the lock array (local->lock).
+ *
+ * 'cookie' is interpreted as the index of the lock this reply belongs to.
+ * A failure is recorded in the local; a success sets the 'locked' flag at
+ * that index (in the first entrylk element). The outstanding-call counter
+ * is then decremented and the function returns while replies are still
+ * pending. After the last reply a pending stub is resumed; without a stub
+ * the private frame is destroyed.
+ */
+int
+sdfs_common_entrylk_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+                        int32_t op_ret, int32_t op_errno, dict_t *xdata)
+{
+    sdfs_local_t *local = NULL;
+    int this_call_cnt = 0;
+    int lk_index = 0;
+    sdfs_lock_t *locks = NULL;
+    call_stub_t *stub = NULL;
+
+    local = frame->local;
+    locks = local->lock;
+    lk_index = (long)cookie;
+
+    if (op_ret < 0) {
+        local->op_ret = op_ret;
+        local->op_errno = op_errno;
+    } else {
+        locks->entrylk->locked[lk_index] = _gf_true;
+    }
+
+    this_call_cnt = sdfs_frame_return(frame);
+    if (this_call_cnt > 0) {
+        gf_log(this->name, GF_LOG_DEBUG,
+               "As there are more callcnt (%d) returning without WIND",
+               this_call_cnt);
+        return 0;
+    }
+
+    if (local->stub) {
+        /* Detach the stub before resuming so it is not destroyed again
+         * when the local is cleaned up. */
+        stub = local->stub;
+        local->stub = NULL;
+        call_resume(stub);
+    } else {
+        if (local->op_ret < 0)
+            gf_msg(this->name, GF_LOG_ERROR, 0, SDFS_MSG_ENTRYLK_ERROR,
+                   "unlocking entry lock failed ");
+        SDFS_STACK_DESTROY(frame);
+    }
+
+    return 0;
+}
+
+/*
+ * Callback for the link wound by sdfs_link_helper: the result is handed to
+ * the caller's frame first, then every entry lock recorded in the lock
+ * array is released through the private frame.
+ */
+int
+sdfs_link_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret,
+              int32_t op_errno, inode_t *inode, struct iatt *stbuf,
+              struct iatt *preparent, struct iatt *postparent, dict_t *xdata)
+{
+    sdfs_local_t *local = NULL;
+    sdfs_lock_t *lock = NULL;
+    int i = 0;
+    int lock_count = 0;
+
+    local = frame->local;
+    lock = local->lock;
+
+    STACK_UNWIND_STRICT(link, local->main_frame, op_ret, op_errno, inode, stbuf,
+                        preparent, postparent, xdata);
+
+    /* The caller's frame has been unwound and must not be used again. */
+    local->main_frame = NULL;
+    lock_count = lock->lock_count;
+
+    /* Re-arm the reply counter before fanning out the unlocks: the shared
+     * callback destroys the frame once the counter is used up, so it must
+     * cover every unlock wound below. This has to happen before the first
+     * wind because a reply may arrive before STACK_WIND returns. */
+    GF_ATOMIC_INIT(local->call_cnt, lock_count);
+
+    for (i = 0; i < lock_count; i++) {
+        STACK_WIND_COOKIE(frame, sdfs_common_entrylk_cbk, (void *)(long)i,
+                          FIRST_CHILD(this), FIRST_CHILD(this)->fops->entrylk,
+                          this->name, &lock->entrylk[i].parent_loc,
+                          lock->entrylk[i].basename, ENTRYLK_UNLOCK,
+                          ENTRYLK_WRLCK, xdata);
+    }
+
+    return 0;
+}
+
+/*
+ * Second stage of link, resumed from the stub once all entry-lock replies
+ * have arrived. If locking succeeded the link is wound to the child.
+ * Otherwise the caller is unwound with the recorded errno, the locks that
+ * were obtained are released, and the private frame is destroyed (directly
+ * when there is nothing to unlock, else by the unlock callback).
+ */
+int
+sdfs_link_helper(call_frame_t *frame, xlator_t *this, loc_t *oldloc,
+                 loc_t *newloc, dict_t *xdata)
+{
+    sdfs_local_t *local = NULL;
+    sdfs_lock_t *locks = NULL;
+    gf_boolean_t stack_destroy = _gf_true;
+    int lock_count = 0;
+    int i = 0;
+
+    local = frame->local;
+    locks = local->lock;
+
+    if (local->op_ret < 0) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, SDFS_MSG_ENTRYLK_ERROR,
+               "Acquiring entry lock failed");
+        goto err;
+    }
+
+    STACK_WIND(frame, sdfs_link_cbk, FIRST_CHILD(this),
+               FIRST_CHILD(this)->fops->link, oldloc, newloc, xdata);
+
+    return 0;
+err:
+    STACK_UNWIND_STRICT(link, local->main_frame, -1, local->op_errno, NULL,
+                        NULL, NULL, NULL, NULL);
+
+    local->main_frame = NULL;
+    /* Count the leading run of locks that were actually obtained; only
+     * those are released below. */
+    for (i = 0; i < locks->lock_count && locks->entrylk->locked[i]; i++) {
+        lock_count++;
+    }
+    GF_ATOMIC_INIT(local->call_cnt, lock_count);
+
+    /* Every index below lock_count is known to be locked (see the count
+     * above), so no per-entry check is needed here. */
+    for (i = 0; i < lock_count; i++) {
+        stack_destroy = _gf_false;
+        /* The shared callback reads the lock index from the cookie, so
+         * the unlock must be wound with an explicit cookie. */
+        STACK_WIND_COOKIE(frame, sdfs_common_entrylk_cbk, (void *)(long)i,
+                          FIRST_CHILD(this), FIRST_CHILD(this)->fops->entrylk,
+                          this->name, &locks->entrylk[i].parent_loc,
+                          locks->entrylk[i].basename, ENTRYLK_UNLOCK,
+                          ENTRYLK_WRLCK, xdata);
+    }
+
+    /* Nothing was wound, so no callback will destroy the frame. */
+    if (stack_destroy)
+        SDFS_STACK_DESTROY(frame);
+
+    return 0;
+}
+
+/*
+ * Fill an entry-lock descriptor for 'loc': parent_loc describes the parent
+ * directory and basename is a private copy of loc->name.
+ * Returns 0 on success and -1 if either part cannot be set up.
+ */
+static int
+sdfs_init_entry_lock(sdfs_entry_lock_t *lock, loc_t *loc)
+{
+    if (sdfs_build_parent_loc(&lock->parent_loc, loc) != 0)
+        return -1;
+
+    lock->basename = gf_strdup(loc->name);
+
+    return (lock->basename != NULL) ? 0 : -1;
+}
+
+/*
+ * Comparison function for entry-lock descriptors: ordered by the gfid of
+ * the parent loc first and by basename second, with a NULL basename sorting
+ * before any other.
+ *
+ * Returns -1 or 1 only; descriptors that compare equal yield -1, so the
+ * function never reports equality.
+ */
+int
+sdfs_entry_lock_cmp(const void *l1, const void *l2)
+{
+    const sdfs_entry_lock_t *r1 = l1;
+    const sdfs_entry_lock_t *r2 = l2;
+    int ret = 0;
+    uuid_t gfid1 = {0};
+    uuid_t gfid2 = {0};
+
+    loc_gfid((loc_t *)&r1->parent_loc, gfid1);
+    loc_gfid((loc_t *)&r2->parent_loc, gfid2);
+    ret = gf_uuid_compare(gfid1, gfid2);
+    /*Entrylks with NULL basename are the 'smallest'*/
+    if (ret == 0) {
+        if (!r1->basename)
+            return -1;
+        if (!r2->basename)
+            return 1;
+        ret = strcmp(r1->basename, r2->basename);
+    }
+
+    /* Collapse the result to -1 / 1; a tie (ret == 0) is reported as -1. */
+    if (ret <= 0)
+        return -1;
+    else
+        return 1;
+}
+
+/*
+ * link fop entry point. A private frame is created with its own lk-owner
+ * and a reference on the caller's client, a lock array holding one entry
+ * lock for 'newloc' is prepared, the actual link is parked in a stub, and
+ * the entry lock is requested; sdfs_common_entrylk_cbk resumes the stub
+ * once the reply has arrived. If any step of the setup fails the caller is
+ * unwound with ENOMEM.
+ */
+int
+sdfs_link(call_frame_t *frame, xlator_t *this, loc_t *oldloc, loc_t *newloc,
+          dict_t *xdata)
+{
+    sdfs_local_t *local = NULL;
+    call_frame_t *new_frame = NULL;
+    call_stub_t *stub = NULL;
+    sdfs_lock_t *lock = NULL;
+    client_t *client = NULL;
+    int ret = 0;
+    int op_errno = ENOMEM;
+
+    new_frame = copy_frame(frame);
+    if (!new_frame) {
+        op_errno = ENOMEM;
+        goto err;
+    }
+    /*Set unique lk-owner for the fop*/
+    set_lk_owner_from_ptr(&new_frame->root->lk_owner, new_frame->root);
+
+    /* Share the caller's client with the new stack, holding a reference
+     * for as long as the private frame lives (same as the other fops do
+     * through sdfs_get_new_frame_common()). */
+    client = frame->root->client;
+    gf_client_ref(client);
+    new_frame->root->client = client;
+    local = sdfs_local_init(new_frame, this);
+    if (!local) {
+        op_errno = ENOMEM;
+        goto err;
+    }
+
+    local->main_frame = frame;
+
+    lock = GF_CALLOC(1, sizeof(*lock), gf_common_mt_char);
+    if (!lock)
+        goto err;
+
+    /* From here on the lock array is owned by the local and released
+     * together with it. */
+    local->lock = lock;
+
+    ret = sdfs_init_entry_lock(&lock->entrylk[0], newloc);
+    if (ret)
+        goto err;
+
+    ++lock->lock_count;
+
+    /* One reply is expected per lock request wound below. */
+    GF_ATOMIC_INIT(local->call_cnt, lock->lock_count);
+
+    ret = loc_copy(&local->loc, newloc);
+    if (ret == -1) {
+        op_errno = ENOMEM;
+        goto err;
+    }
+
+    stub = fop_link_stub(new_frame, sdfs_link_helper, oldloc, newloc, xdata);
+    if (!stub) {
+        op_errno = ENOMEM;
+        goto err;
+    }
+
+    local->stub = stub;
+
+    /* Cookie 0 is the index of the lock within the lock array. */
+    STACK_WIND_COOKIE(new_frame, sdfs_common_entrylk_cbk, 0, FIRST_CHILD(this),
+                      FIRST_CHILD(this)->fops->entrylk, this->name,
+                      &lock->entrylk[0].parent_loc, lock->entrylk[0].basename,
+                      ENTRYLK_LOCK, ENTRYLK_WRLCK, xdata);
+
+    return 0;
+err:
+
+    STACK_UNWIND_STRICT(link, frame, -1, op_errno, NULL, NULL, NULL, NULL,
+                        NULL);
+
+    if (new_frame)
+        SDFS_STACK_DESTROY(new_frame);
+
+    return 0;
+}
+
+int
+sdfs_mknod_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+               int32_t op_ret, int32_t op_errno, inode_t *inode,
+               struct iatt *stbuf, struct iatt *preparent,
+               struct iatt *postparent, dict_t *xdata)
+{
+    sdfs_local_t *local = NULL;
+    /* mknod callback: hand the child's result to the original caller. */
+    local = frame->local;
+
+    STACK_UNWIND_STRICT(mknod, local->main_frame, op_ret, op_errno, inode,
+                        stbuf, preparent, postparent, xdata);
+    /* main_frame has been unwound; clear it, then release the entry lock. */
+    local->main_frame = NULL;
+    STACK_WIND(frame, sdfs_entrylk_cbk, FIRST_CHILD(this),
+               FIRST_CHILD(this)->fops->entrylk, this->name, &local->parent_loc,
+               local->loc.name, ENTRYLK_UNLOCK, ENTRYLK_WRLCK, xdata);
+    return 0;
+}
+
+int
+sdfs_mknod_helper(call_frame_t *frame, xlator_t *this, loc_t *loc, mode_t mode,
+                  dev_t rdev, mode_t umask, dict_t *xdata)
+{
+    sdfs_local_t *local = NULL;
+    char gfid[GF_UUID_BUF_SIZE] = {0};
+    /* Stub body for mknod (see fop_mknod_stub in sdfs_mknod). */
+    local = frame->local;
+
+    gf_uuid_unparse(loc->pargfid, gfid);
+    /* local->op_ret < 0: the entry lock was not acquired; fail the fop. */
+    if (local->op_ret < 0) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, SDFS_MSG_ENTRYLK_ERROR,
+               "Acquiring entry lock failed for directory %s "
+               "with parent gfid %s",
+               local->loc.name, gfid);
+        goto err;
+    }
+    /* Lock held: wind the real mknod; sdfs_mknod_cbk handles the reply. */
+    STACK_WIND(frame, sdfs_mknod_cbk, FIRST_CHILD(this),
+               FIRST_CHILD(this)->fops->mknod, loc, mode, rdev, umask, xdata);
+
+    return 0;
+err:
+    STACK_UNWIND_STRICT(mknod, local->main_frame, -1, local->op_errno, NULL,
+                        NULL, NULL, NULL, NULL);
+
+    local->main_frame = NULL;
+    SDFS_STACK_DESTROY(frame);
+    return 0;
+}
+
+int
+sdfs_mknod(call_frame_t *frame, xlator_t *this, loc_t *loc, mode_t mode,
+           dev_t rdev, mode_t umask, dict_t *xdata)
+{
+    sdfs_local_t *local = NULL;
+    call_frame_t *new_frame = NULL;
+    call_stub_t *stub = NULL;
+    int op_errno = 0;
+    /* mknod fop: take a write entrylk on the entry before the real mknod. */
+    if (-1 == sdfs_get_new_frame(frame, loc, &new_frame)) {
+        op_errno = ENOMEM;
+        goto err;
+    }
+    /* Park the mknod as a stub on the new frame (body: sdfs_mknod_helper). */
+    stub = fop_mknod_stub(new_frame, sdfs_mknod_helper, loc, mode, rdev, umask,
+                          xdata);
+    if (!stub) {
+        op_errno = ENOMEM;
+        goto err;
+    }
+
+    local = new_frame->local;
+    local->stub = stub;
+    /* Lock <parent_loc, loc.name>; the reply arrives in sdfs_entrylk_cbk. */
+    STACK_WIND(new_frame, sdfs_entrylk_cbk, FIRST_CHILD(this),
+               FIRST_CHILD(this)->fops->entrylk, this->name, &local->parent_loc,
+               local->loc.name, ENTRYLK_LOCK, ENTRYLK_WRLCK, xdata);
+
+    return 0;
+err:
+    STACK_UNWIND_STRICT(mknod, frame, -1, op_errno, NULL, NULL, NULL, NULL,
+                        NULL);
+
+    if (new_frame)
+        SDFS_STACK_DESTROY(new_frame);
+
+    return 0;
+}
+
+int
+sdfs_rename_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+                int32_t op_ret, int32_t op_errno, struct iatt *stbuf,
+                struct iatt *preoldparent, struct iatt *postoldparent,
+                struct iatt *prenewparent, struct iatt *postnewparent,
+                dict_t *xdata)
+{
+    sdfs_local_t *local = NULL;
+    sdfs_lock_t *lock = NULL;
+    int i = 0;
+    int call_cnt = 0;
+    /* rename callback: reply to the caller, then unlock every entry lock. */
+    local = frame->local;
+    lock = local->lock;
+    GF_ATOMIC_INIT(local->call_cnt, lock->lock_count);
+
+    STACK_UNWIND_STRICT(rename, local->main_frame, op_ret, op_errno, stbuf,
+                        preoldparent, postoldparent, prenewparent,
+                        postnewparent, xdata);
+
+    local->main_frame = NULL;
+    call_cnt = GF_ATOMIC_GET(local->call_cnt);
+    /* Loop on the local copy of the count; one unlock is wound per lock. */
+    for (i = 0; i < call_cnt; i++) {
+        STACK_WIND_COOKIE(frame, sdfs_common_entrylk_cbk, (void *)(long)i,
+                          FIRST_CHILD(this), FIRST_CHILD(this)->fops->entrylk,
+                          this->name, &lock->entrylk[i].parent_loc,
+                          lock->entrylk[i].basename, ENTRYLK_UNLOCK,
+                          ENTRYLK_WRLCK, xdata);
+    }
+
+    return 0;
+}
+
+/* Resumed once the rename entrylk requests finish: winds the rename, or, if
+ * local->op_ret < 0, fails the caller and unlocks every entry marked locked. */
+int
+sdfs_rename_helper(call_frame_t *frame, xlator_t *this, loc_t *oldloc,
+                   loc_t *newloc, dict_t *xdata)
+{
+    sdfs_local_t *local = frame->local;
+    sdfs_lock_t *lock = local->lock;
+    int held[SDFS_LOCK_COUNT_MAX] = {0};
+    int total = 0;
+    int lock_count = 0;
+    int i = 0;
+
+    if (local->op_ret < 0) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, SDFS_MSG_ENTRYLK_ERROR,
+               "Acquiring entry lock failed ");
+        goto err;
+    }
+
+    STACK_WIND(frame, sdfs_rename_cbk, FIRST_CHILD(this),
+               FIRST_CHILD(this)->fops->rename, oldloc, newloc, xdata);
+    return 0;
+err:
+    STACK_UNWIND_STRICT(rename, local->main_frame, -1, local->op_errno, NULL,
+                        NULL, NULL, NULL, NULL, NULL);
+    local->main_frame = NULL;
+
+    /* Count every held lock (not only a leading run) and snapshot the flags,
+     * so the wind loop never re-reads them after an unlock callback has run. */
+    total = lock->lock_count;
+    for (i = 0; i < total; i++) {
+        held[i] = lock->entrylk->locked[i];
+        if (held[i])
+            lock_count++;
+    }
+    GF_ATOMIC_INIT(local->call_cnt, lock_count);
+
+    /* Skip entries that were never locked; the loop bound stays fixed. */
+    for (i = 0; i < total; i++) {
+        if (!held[i])
+            continue;
+        STACK_WIND(frame, sdfs_common_entrylk_cbk, FIRST_CHILD(this),
+                   FIRST_CHILD(this)->fops->entrylk, this->name,
+                   &lock->entrylk[i].parent_loc, lock->entrylk[i].basename,
+                   ENTRYLK_UNLOCK, ENTRYLK_WRLCK, xdata);
+    }
+    /* Nothing was wound, so no callback will free this frame: do it here. */
+    if (lock_count == 0)
+        SDFS_STACK_DESTROY(frame);
+    return 0;
+}
+
+/* rename fop: on a copied frame, request write entrylks for both oldloc and
+ * newloc; the rename itself is kept as a stub in local->stub. */
+int
+sdfs_rename(call_frame_t *frame, xlator_t *this, loc_t *oldloc, loc_t *newloc,
+            dict_t *xdata)
+{
+    sdfs_local_t *local = NULL;
+    sdfs_lock_t *lock = NULL;
+    call_frame_t *new_frame = NULL;
+    call_stub_t *stub = NULL;
+    client_t *client = NULL;
+    int ret = 0;
+    int op_errno = ENOMEM;
+    int i = 0;
+    int call_cnt = 0;
+
+    new_frame = copy_frame(frame);
+    if (!new_frame) {
+        op_errno = ENOMEM;
+        goto err;
+    }
+    /*Set unique lk-owner for the fop*/
+    set_lk_owner_from_ptr(&new_frame->root->lk_owner, new_frame->root);
+    /* Ref the caller's client (was left NULL); SDFS_STACK_DESTROY unrefs it. */
+    client = frame->root->client;
+    gf_client_ref(client);
+    new_frame->root->client = client;
+    local = sdfs_local_init(new_frame, this);
+    if (!local) {
+        op_errno = ENOMEM;
+        goto err;
+    }
+
+    local->main_frame = frame;
+
+    lock = GF_CALLOC(1, sizeof(*lock), gf_common_mt_char);
+    if (!lock)
+        goto err;
+
+    local->lock = lock;
+
+    ret = sdfs_init_entry_lock(&lock->entrylk[0], oldloc);
+    if (ret)
+        goto err;
+    lock->entrylk->locked[0] = _gf_false;
+    ++lock->lock_count;
+
+    ret = sdfs_init_entry_lock(&lock->entrylk[1], newloc);
+    if (ret)
+        goto err;
+    lock->entrylk->locked[1] = _gf_false;
+    ++lock->lock_count;
+
+    /* Put the two entries in sdfs_entry_lock_cmp order before locking. */
+    qsort(lock->entrylk, lock->lock_count, sizeof(*lock->entrylk),
+          sdfs_entry_lock_cmp);
+
+    GF_ATOMIC_INIT(local->call_cnt, lock->lock_count);
+
+    stub = fop_rename_stub(new_frame, sdfs_rename_helper, oldloc, newloc,
+                           xdata);
+    if (!stub) {
+        op_errno = ENOMEM;
+        goto err;
+    }
+
+    local->stub = stub;
+    call_cnt = GF_ATOMIC_GET(local->call_cnt);
+    for (i = 0; i < call_cnt; i++) {
+        STACK_WIND_COOKIE(new_frame, sdfs_common_entrylk_cbk, (void *)(long)i,
+                          FIRST_CHILD(this), FIRST_CHILD(this)->fops->entrylk,
+                          this->name, &lock->entrylk[i].parent_loc,
+                          lock->entrylk[i].basename, ENTRYLK_LOCK,
+                          ENTRYLK_WRLCK, xdata);
+    }
+
+    return 0;
+err:
+    /* Fail the caller's fop, then tear down the private stack if it exists. */
+    STACK_UNWIND_STRICT(rename, frame, -1, op_errno, NULL, NULL, NULL, NULL,
+                        NULL, NULL);
+    if (new_frame)
+        SDFS_STACK_DESTROY(new_frame);
+
+    return 0;
+}
+
+int
+sdfs_lookup_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+                int32_t op_ret, int32_t op_errno, inode_t *inode,
+                struct iatt *stbuf, dict_t *xdata, struct iatt *postparent)
+{
+    sdfs_local_t *local = NULL;
+    /* lookup callback: reply to the caller, then drop the read entrylk. */
+    local = frame->local;
+    /* No parent in the saved loc: clean up local and unwind this frame. */
+    if (!local->loc.parent) {
+        sdfs_local_cleanup(local);
+        frame->local = NULL;
+        STACK_UNWIND_STRICT(lookup, frame, op_ret, op_errno, inode, stbuf,
+                            xdata, postparent);
+        return 0;
+    }
+
+    STACK_UNWIND_STRICT(lookup, local->main_frame, op_ret, op_errno, inode,
+                        stbuf, xdata, postparent);
+    /* main_frame has been unwound; clear it, then release the entry lock. */
+    local->main_frame = NULL;
+    STACK_WIND(frame, sdfs_entrylk_cbk, FIRST_CHILD(this),
+               FIRST_CHILD(this)->fops->entrylk, this->name, &local->parent_loc,
+               local->loc.name, ENTRYLK_UNLOCK, ENTRYLK_RDLCK, xdata);
+    return 0;
+}
+
+int
+sdfs_lookup_helper(call_frame_t *frame, xlator_t *this, loc_t *loc,
+                   dict_t *xdata)
+{
+    sdfs_local_t *local = NULL;
+    char gfid[GF_UUID_BUF_SIZE] = {0};
+    /* Stub body for lookup (see fop_lookup_stub in sdfs_lookup). */
+    local = frame->local;
+
+    gf_uuid_unparse(loc->pargfid, gfid);
+    /* local->op_ret < 0: the entry lock was not acquired; fail the fop. */
+    if (local->op_ret < 0) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, SDFS_MSG_ENTRYLK_ERROR,
+               "Acquiring entry lock failed for directory %s "
+               "with parent gfid %s",
+               local->loc.name, gfid);
+        goto err;
+    }
+    /* Lock held: wind the real lookup; sdfs_lookup_cbk handles the reply. */
+    STACK_WIND(frame, sdfs_lookup_cbk, FIRST_CHILD(this),
+               FIRST_CHILD(this)->fops->lookup, loc, xdata);
+
+    return 0;
+err:
+    STACK_UNWIND_STRICT(lookup, local->main_frame, -1, local->op_errno, NULL,
+                        NULL, NULL, NULL);
+    local->main_frame = NULL;
+
+    SDFS_STACK_DESTROY(frame);
+    return 0;
+}
+
+int
+sdfs_lookup(call_frame_t *frame, xlator_t *this, loc_t *loc, dict_t *xdata)
+{
+    sdfs_local_t *local = NULL;
+    call_frame_t *new_frame = NULL;
+    call_stub_t *stub = NULL;
+    int op_errno = 0;
+    /* lookup fop: read-lock the entry first, unless loc has no parent. */
+    if (!loc->parent) {
+        local = sdfs_local_init(frame, this);
+        if (!local) {
+            op_errno = ENOMEM;
+            goto err;
+        }
+        /* NOTE(review): tail wind has no cbk here; confirm who frees local. */
+        STACK_WIND_TAIL(frame, FIRST_CHILD(this),
+                        FIRST_CHILD(this)->fops->lookup, loc, xdata);
+        return 0;
+    }
+
+    if (-1 == sdfs_get_new_frame(frame, loc, &new_frame)) {
+        op_errno = ENOMEM;
+        goto err;
+    }
+    /* Park the lookup as a stub on the new frame (body: sdfs_lookup_helper). */
+    stub = fop_lookup_stub(new_frame, sdfs_lookup_helper, loc, xdata);
+    if (!stub) {
+        op_errno = ENOMEM;
+        goto err;
+    }
+
+    local = new_frame->local;
+    local->stub = stub;
+    /* Lock <parent_loc, loc.name>; the reply arrives in sdfs_entrylk_cbk. */
+    STACK_WIND(new_frame, sdfs_entrylk_cbk, FIRST_CHILD(this),
+               FIRST_CHILD(this)->fops->entrylk, this->name, &local->parent_loc,
+               local->loc.name, ENTRYLK_LOCK, ENTRYLK_RDLCK, xdata);
+
+    return 0;
+
+err:
+    STACK_UNWIND_STRICT(lookup, frame, -1, op_errno, NULL, NULL, NULL, NULL);
+
+    if (new_frame)
+        SDFS_STACK_DESTROY(new_frame);
+
+    return 0;
+}
+
+int32_t
+sdfs_readdirp_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+                  int32_t op_ret, int32_t op_errno, gf_dirent_t *entries,
+                  dict_t *xdata)
+{
+    sdfs_local_t *local = NULL;
+    /* readdirp callback: reply to the caller, then drop the read entrylk. */
+    local = frame->local;
+    STACK_UNWIND_STRICT(readdirp, local->main_frame, op_ret, op_errno, entries,
+                        xdata);
+    /* main_frame has been unwound; the unlock passes a NULL basename. */
+    local->main_frame = NULL;
+    STACK_WIND(frame, sdfs_entrylk_cbk, FIRST_CHILD(this),
+               FIRST_CHILD(this)->fops->entrylk, this->name, &local->parent_loc,
+               NULL, ENTRYLK_UNLOCK, ENTRYLK_RDLCK, xdata);
+    return 0;
+}
+
+int32_t
+sdfs_readdirp_helper(call_frame_t *frame, xlator_t *this, fd_t *fd, size_t size,
+                     off_t off, dict_t *xdata)
+{
+    sdfs_local_t *local = NULL;
+    char gfid[GF_UUID_BUF_SIZE] = {0};
+    /* Stub body for readdirp (see fop_readdirp_stub in sdfs_readdirp). */
+    local = frame->local;
+
+    gf_uuid_unparse(fd->inode->gfid, gfid);
+    /* NOTE(review): the log prints local->loc.name; confirm it is set here. */
+    if (local->op_ret < 0) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, SDFS_MSG_ENTRYLK_ERROR,
+               "Acquiring entry lock failed for directory %s "
+               "with parent gfid %s",
+               local->loc.name, gfid);
+        goto err;
+    }
+    /* Lock held: wind the real readdirp; sdfs_readdirp_cbk gets the reply. */
+    STACK_WIND(frame, sdfs_readdirp_cbk, FIRST_CHILD(this),
+               FIRST_CHILD(this)->fops->readdirp, fd, size, off, xdata);
+
+    return 0;
+err:
+    STACK_UNWIND_STRICT(readdirp, local->main_frame, -1, local->op_errno, NULL,
+                        NULL);
+
+    local->main_frame = NULL;
+
+    SDFS_STACK_DESTROY(frame);
+    return 0;
+}
+
+int32_t
+sdfs_readdirp(call_frame_t *frame, xlator_t *this, fd_t *fd, size_t size,
+              off_t off, dict_t *xdata)
+{
+    sdfs_local_t *local = NULL;
+    call_frame_t *new_frame = NULL;
+    call_stub_t *stub = NULL;
+    int op_errno = 0;
+    /* readdirp fop: take a read entrylk (NULL basename) before listing. */
+    if (-1 == sdfs_get_new_frame_readdirp(frame, fd, &new_frame)) {
+        op_errno = ENOMEM;
+        goto err;
+    }
+    /* Park the readdirp as a stub (body: sdfs_readdirp_helper). */
+    stub = fop_readdirp_stub(new_frame, sdfs_readdirp_helper, fd, size, off,
+                             xdata);
+    if (!stub) {
+        op_errno = ENOMEM;
+        goto err;
+    }
+
+    local = new_frame->local;
+    local->stub = stub;
+    /* Lock on parent_loc; the reply arrives in sdfs_entrylk_cbk. */
+    STACK_WIND(new_frame, sdfs_entrylk_cbk, FIRST_CHILD(this),
+               FIRST_CHILD(this)->fops->entrylk, this->name, &local->parent_loc,
+               NULL, ENTRYLK_LOCK, ENTRYLK_RDLCK, xdata);
+
+    return 0;
+
+err:
+    STACK_UNWIND_STRICT(readdirp, frame, -1, op_errno, NULL, NULL);
+
+    if (new_frame)
+        SDFS_STACK_DESTROY(new_frame);
+
+    return 0;
+}
+
+/* Translator init: check the graph, create the pool, read "pass-through". */
+int
+init(xlator_t *this)
+{
+    int ret = -1;
+
+    if (!this->children || this->children->next) {
+        gf_log(this->name, GF_LOG_ERROR,
+               "'dentry-fop-serializer' not configured with exactly one child");
+        goto out;
+    }
+
+    if (!this->parents)
+        gf_log(this->name, GF_LOG_WARNING, "dangling volume. check volfile ");
+
+    this->local_pool = mem_pool_new(sdfs_local_t, 512);
+    if (!this->local_pool)
+        goto out;
+    GF_OPTION_INIT("pass-through", this->pass_through, bool, out);
+    ret = 0;
+out:
+    if (ret) { /* don't leak the pool when a later step fails */
+        mem_pool_destroy(this->local_pool);
+        this->local_pool = NULL;
+    }
+    return ret;
+}
+
+int
+reconfigure(xlator_t *this, dict_t *options)
+{
+    int status = -1; /* stays -1 unless the option is re-read cleanly */
+
+    /* Re-read "pass-through"; 'out' is the label handed to the macro. */
+    GF_OPTION_RECONF("pass-through", this->pass_through, options, bool, out);
+    status = 0;
+out:
+    return status;
+}
+
+void
+fini(xlator_t *this)
+{
+    /* Release the sdfs_local_t pool created by init() and forget it. */
+    mem_pool_destroy(this->local_pool);
+    this->local_pool = NULL;
+}
+
+struct xlator_fops fops = { /* fops this translator implements itself */
+    .mkdir = sdfs_mkdir,
+    .rmdir = sdfs_rmdir,
+    .create = sdfs_create,
+    .unlink = sdfs_unlink,
+    .symlink = sdfs_symlink,
+    .link = sdfs_link,
+    .mknod = sdfs_mknod,
+    .rename = sdfs_rename,
+    .lookup = sdfs_lookup,
+    .readdirp = sdfs_readdirp,
+};
+
+struct xlator_cbks cbks; /* no initialiser: no callbacks are set here */
+
+struct volume_options options[] = { /* list ends with the NULL-key entry */
+    {.key = {"pass-through"},
+     .type = GF_OPTION_TYPE_BOOL,
+     .default_value = "true",
+     .op_version = {GD_OP_VERSION_4_1_0},
+     .flags = OPT_FLAG_SETTABLE | OPT_FLAG_DOC | OPT_FLAG_CLIENT_OPT,
+     .tags = {"sdfs"},
+     .description = "Enable/Disable dentry serialize functionality"},
+    {.key = {NULL}},
+};
+
+xlator_api_t xlator_api = { /* ties together the definitions in this file */
+    .init = init,
+    .fini = fini,
+    .reconfigure = reconfigure,
+    .op_version = {GD_OP_VERSION_4_0_0},
+    .fops = &fops,
+    .cbks = &cbks,
+    .options = options,
+    .identifier = "sdfs",
+    .category = GF_TECH_PREVIEW,
+};
diff --git a/xlators/features/sdfs/src/sdfs.h b/xlators/features/sdfs/src/sdfs.h
new file mode 100644
index 00000000000..dded5a2d7fc
--- /dev/null
+++ b/xlators/features/sdfs/src/sdfs.h
@@ -0,0 +1,49 @@
+/*
+   Copyright (c) 2016 Red Hat, Inc. <http://www.redhat.com>
+   This file is part of GlusterFS.
+
+   This file is licensed to you under your choice of the GNU Lesser
+   General Public License, version 3 or any later version (LGPLv3 or
+   later), or the GNU General Public License, version 2 (GPLv2), in all
+   cases as published by the Free Software Foundation.
+*/
+
+#include <glusterfs/xlator.h>
+#include <glusterfs/call-stub.h>
+#include "sdfs-messages.h"
+#include <glusterfs/atomic.h>
+
+#define SDFS_LOCK_COUNT_MAX 2
+
+typedef struct { /* one entrylk target: a name inside a parent directory */
+    loc_t parent_loc; /* passed as the loc of the entrylk calls */
+    char *basename; /* passed as the basename of the entrylk calls */
+    int locked[SDFS_LOCK_COUNT_MAX]; /* NOTE(review): indexed via entrylk[0] */
+} sdfs_entry_lock_t;
+
+typedef struct { /* set of entry locks taken for a single fop */
+    sdfs_entry_lock_t entrylk[SDFS_LOCK_COUNT_MAX]; /* at most two targets */
+    int lock_count; /* how many entrylk[] slots are in use */
+} sdfs_lock_t;
+
+struct sdfs_local { /* per-fop state hung off frame->local */
+    call_frame_t *main_frame; /* caller's frame; set to NULL once unwound */
+    loc_t loc; /* loc.name is used as the entrylk basename */
+    loc_t parent_loc; /* directory loc the entrylk is wound on */
+    call_stub_t *stub; /* the stubbed fop (fop_*_stub) */
+    sdfs_lock_t *lock; /* multi-entry lock state, where one is allocated */
+    int op_ret; /* checked by the *_helper functions: < 0 means lock failed */
+    int op_errno; /* errno unwound to the caller when op_ret < 0 */
+    gf_atomic_t call_cnt; /* set with GF_ATOMIC_INIT to the lock call count */
+};
+typedef struct sdfs_local sdfs_local_t;
+
+/* Destroy an sdfs-owned frame: unref its client, the stack, then its local. */
+#define SDFS_STACK_DESTROY(frame)                                              \
+    do {                                                                       \
+        sdfs_local_t *__local = (frame)->local;                                \
+        (frame)->local = NULL;                                                 \
+        gf_client_unref((frame)->root->client);                                \
+        STACK_DESTROY((frame)->root);                                          \
+        sdfs_local_cleanup(__local);                                           \
+    } while (0)
diff --git a/xlators/features/selinux/Makefile.am b/xlators/features/selinux/Makefile.am
new file mode 100644
index 00000000000..a985f42a877
--- /dev/null
+++ b/xlators/features/selinux/Makefile.am
@@ -0,0 +1,3 @@
+SUBDIRS = src
+
+CLEANFILES =
diff --git a/xlators/features/selinux/src/Makefile.am b/xlators/features/selinux/src/Makefile.am
new file mode 100644
index 00000000000..4f1e5e149b3
--- /dev/null
+++ b/xlators/features/selinux/src/Makefile.am
@@ -0,0 +1,20 @@
+if WITH_SERVER
+xlator_LTLIBRARIES = selinux.la
+endif
+xlatordir = $(libdir)/glusterfs/$(PACKAGE_VERSION)/xlator/features
+
+selinux_la_LDFLAGS = -module $(GF_XLATOR_DEFAULT_LDFLAGS)
+
+selinux_la_SOURCES = selinux.c
+
+selinux_la_LIBADD = $(top_builddir)/libglusterfs/src/libglusterfs.la
+
+noinst_HEADERS = selinux.h selinux-messages.h selinux-mem-types.h
+
+AM_CPPFLAGS = $(GF_CPPFLAGS) -I$(top_srcdir)/libglusterfs/src \
+	-I$(top_srcdir)/rpc/xdr/src -I$(top_builddir)/rpc/xdr/src
+
+AM_CFLAGS = -Wall $(GF_CFLAGS)
+
+CLEANFILES =
+
diff --git a/xlators/features/selinux/src/selinux-mem-types.h b/xlators/features/selinux/src/selinux-mem-types.h
new file mode 100644
index 00000000000..553e59e5a9d
--- /dev/null
+++ b/xlators/features/selinux/src/selinux-mem-types.h
@@ -0,0 +1,19 @@
+/*
+   Copyright (c) 2017 Red Hat, Inc. <http://www.redhat.com>
+   This file is part of GlusterFS.
+
+   This file is licensed to you under your choice of the GNU Lesser
+   General Public License, version 3 or any later version (LGPLv3 or
+   later), or the GNU General Public License, version 2 (GPLv2), in all
+   cases as published by the Free Software Foundation.
+*/
+#ifndef __SELINUX_MEM_TYPES_H__
+#define __SELINUX_MEM_TYPES_H__
+
+#include <glusterfs/mem-types.h>
+
+enum gf_selinux_mem_types_ { /* allocation type ids, after the common ones */
+    gf_selinux_mt_selinux_priv_t = gf_common_mt_end + 1, /* selinux_priv_t */
+    gf_selinux_mt_end /* end marker, not an allocation type */
+};
+#endif
diff --git a/xlators/features/selinux/src/selinux-messages.h b/xlators/features/selinux/src/selinux-messages.h
new file mode 100644
index 00000000000..f49a54f956c
--- /dev/null
+++ b/xlators/features/selinux/src/selinux-messages.h
@@ -0,0 +1,30 @@
+/*
+  Copyright (c) 2017 Red Hat, Inc. <http://www.redhat.com>
+  This file is part of GlusterFS.
+
+  This file is licensed to you under your choice of the GNU Lesser
+  General Public License, version 3 or any later version (LGPLv3 or
+  later), or the GNU General Public License, version 2 (GPLv2), in all
+  cases as published by the Free Software Foundation.
+*/
+
+#ifndef _SELINUX_MESSAGES_H__
+#define _SELINUX_MESSAGES_H__
+
+#include <glusterfs/glfs-message-id.h>
+
+/* To add new message IDs, append new identifiers at the end of the list.
+ *
+ * Never remove a message ID. If it's not used anymore, you can rename it or
+ * leave it as it is, but not delete it. This is to prevent reutilization of
+ * IDs by other messages.
+ *
+ * The component name must match one of the entries defined in
+ * glfs-message-id.h.
+ */
+
+GLFS_MSGID(SL, SL_MSG_INVALID_VOLFILE, SL_MSG_ENOMEM,
+           SL_MSG_MEM_ACCT_INIT_FAILED, SL_MSG_SELINUX_GLUSTER_XATTR_MISSING,
+           SL_MSG_SELINUX_XATTR_MISSING);
+
+#endif /* _SELINUX_MESSAGES_H__ */
diff --git a/xlators/features/selinux/src/selinux.c b/xlators/features/selinux/src/selinux.c
new file mode 100644
index 00000000000..9b1b4b55e1a
--- /dev/null
+++ b/xlators/features/selinux/src/selinux.c
@@ -0,0 +1,323 @@
+/*
+   Copyright (c) 2017 Red Hat, Inc. <http://www.redhat.com>
+   This file is part of GlusterFS.
+
+   This file is licensed to you under your choice of the GNU Lesser
+   General Public License, version 3 or any later version (LGPLv3 or
+   later), or the GNU General Public License, version 2 (GPLv2), in all
+   cases as published by the Free Software Foundation.
+*/
+
+#include <glusterfs/xlator.h>
+
+#include "selinux.h"
+#include "selinux-messages.h"
+#include "selinux-mem-types.h"
+#include <glusterfs/compat-errno.h>
+
+/* fgetxattr callback: map the gluster SELinux key back to SELINUX_XATTR. */
+static int
+selinux_fgetxattr_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+                      int op_ret, int op_errno, dict_t *dict, dict_t *xdata)
+{
+    int ret = 0;
+    char *name = cookie; /* the xattr name that was wound to the child */
+
+    if (op_errno == 0 && dict && name &&
+        (!strcmp(name, SELINUX_GLUSTER_XATTR))) {
+        ret = dict_rename_key(dict, SELINUX_GLUSTER_XATTR, SELINUX_XATTR);
+        if (ret < 0)
+            gf_msg(this->name, GF_LOG_ERROR, op_errno,
+                   SL_MSG_SELINUX_GLUSTER_XATTR_MISSING,
+                   "getxattr failed for %s", SELINUX_XATTR);
+    }
+    STACK_UNWIND_STRICT(fgetxattr, frame, op_ret, op_errno, dict, xdata);
+    return 0; /* like selinux_getxattr_cbk; a rename failure is only logged */
+}
+
+static int
+selinux_fgetxattr(call_frame_t *frame, xlator_t *this, fd_t *fd,
+                  const char *name, dict_t *xdata)
+{
+    selinux_priv_t *priv = NULL;
+    int32_t op_ret = -1;
+    int32_t op_errno = EINVAL;
+    char *xattr_name = (char *)name;
+    /* fgetxattr: a SELINUX_XATTR request is wound as SELINUX_GLUSTER_XATTR. */
+    priv = this->private;
+
+    GF_VALIDATE_OR_GOTO("selinux", priv, err);
+
+    /* name can be NULL for listxattr calls */
+    if (!priv->selinux_enabled || !name)
+        goto off;
+
+    if (strcmp(name, SELINUX_XATTR) == 0)
+        xattr_name = SELINUX_GLUSTER_XATTR;
+    /* The name actually wound is also the cookie the callback receives. */
+off:
+    STACK_WIND_COOKIE(frame, selinux_fgetxattr_cbk, xattr_name,
+                      FIRST_CHILD(this), FIRST_CHILD(this)->fops->fgetxattr, fd,
+                      xattr_name, xdata);
+    return 0;
+err:
+    STACK_UNWIND_STRICT(fgetxattr, frame, op_ret, op_errno, NULL, xdata);
+
+    return 0;
+}
+
+/* getxattr callback: if the request was wound with the gluster SELinux key
+ * (passed back as cookie), rename it to SELINUX_XATTR before unwinding. */
+static int
+selinux_getxattr_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+                     int op_ret, int op_errno, dict_t *dict, dict_t *xdata)
+{
+    const char *wound_key = cookie;
+
+    if (op_errno != 0 || !dict || !wound_key ||
+        strcmp(wound_key, SELINUX_GLUSTER_XATTR) != 0)
+        goto unwind;
+
+    if (dict_rename_key(dict, SELINUX_GLUSTER_XATTR, SELINUX_XATTR) < 0)
+        gf_msg(this->name, GF_LOG_ERROR, op_errno,
+               SL_MSG_SELINUX_GLUSTER_XATTR_MISSING, "getxattr failed for %s",
+               SELINUX_XATTR);
+unwind:
+    STACK_UNWIND_STRICT(getxattr, frame, op_ret, op_errno, dict, xdata);
+    return 0;
+}
+
+static int
+selinux_getxattr(call_frame_t *frame, xlator_t *this, loc_t *loc,
+                 const char *name, dict_t *xdata)
+{
+    selinux_priv_t *priv = NULL;
+    int32_t op_ret = -1;
+    int32_t op_errno = EINVAL;
+    char *xattr_name = (char *)name;
+    /* getxattr: a SELINUX_XATTR request is wound as SELINUX_GLUSTER_XATTR. */
+    priv = this->private;
+
+    GF_VALIDATE_OR_GOTO("selinux", priv, err);
+
+    /* name can be NULL for listxattr calls */
+    if (!priv->selinux_enabled || !name)
+        goto off;
+
+    if (strcmp(name, SELINUX_XATTR) == 0)
+        xattr_name = SELINUX_GLUSTER_XATTR;
+    /* The name actually wound is also the cookie the callback receives. */
+off:
+    STACK_WIND_COOKIE(frame, selinux_getxattr_cbk, xattr_name,
+                      FIRST_CHILD(this), FIRST_CHILD(this)->fops->getxattr, loc,
+                      xattr_name, xdata);
+    return 0;
+err:
+    STACK_UNWIND_STRICT(getxattr, frame, op_ret, op_errno, NULL, xdata);
+    return 0;
+}
+
+static int
+selinux_fsetxattr_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+                      int op_ret, int op_errno, dict_t *xdata)
+{
+    STACK_UNWIND_STRICT(fsetxattr, frame, op_ret, op_errno, xdata);
+    return 0; /* the child's result is passed up unchanged */
+}
+
+/* fsetxattr fop: rename SELINUX_XATTR to SELINUX_GLUSTER_XATTR in @dict and
+ * wind it on; a rename error other than -ENODATA unwinds with EINVAL. */
+static int
+selinux_fsetxattr(call_frame_t *frame, xlator_t *this, fd_t *fd, dict_t *dict,
+                  int flags, dict_t *xdata)
+{
+    selinux_priv_t *priv = NULL;
+    int32_t op_ret = -1;
+    int32_t op_errno = EINVAL;
+    int32_t ret = -1;
+
+    priv = this->private;
+    GF_VALIDATE_OR_GOTO("selinux", priv, err);
+
+    /* Either test alone means pass-through (was '&&': renamed when disabled). */
+    if (!priv->selinux_enabled || !dict)
+        goto off;
+
+    ret = dict_rename_key(dict, SELINUX_XATTR, SELINUX_GLUSTER_XATTR);
+    if (ret < 0 && ret != -ENODATA)
+        goto err;
+off:
+    STACK_WIND(frame, selinux_fsetxattr_cbk, FIRST_CHILD(this),
+               FIRST_CHILD(this)->fops->fsetxattr, fd, dict, flags, xdata);
+    return 0;
+err:
+    STACK_UNWIND_STRICT(fsetxattr, frame, op_ret, op_errno, xdata);
+    return 0;
+}
+
+static int
+selinux_setxattr_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+                     int op_ret, int op_errno, dict_t *xdata)
+{
+    STACK_UNWIND_STRICT(setxattr, frame, op_ret, op_errno, xdata);
+    return 0; /* the child's result is passed up unchanged */
+}
+
+/* setxattr fop: rename SELINUX_XATTR to SELINUX_GLUSTER_XATTR in @dict and wind
+ * it on. Disabled or no dict means pass-through (this test used to be '&&'). */
+static int
+selinux_setxattr(call_frame_t *frame, xlator_t *this, loc_t *loc, dict_t *dict,
+                 int flags, dict_t *xdata)
+{
+    selinux_priv_t *priv = NULL;
+    int32_t op_ret = -1;
+    int32_t op_errno = EINVAL;
+    int32_t ret = -1;
+
+    priv = this->private;
+    GF_VALIDATE_OR_GOTO("selinux", priv, err);
+
+    if (!priv->selinux_enabled || !dict)
+        goto off;
+
+    ret = dict_rename_key(dict, SELINUX_XATTR, SELINUX_GLUSTER_XATTR);
+    if (ret < 0 && ret != -ENODATA)
+        goto err;
+off:
+    STACK_WIND(frame, selinux_setxattr_cbk, FIRST_CHILD(this),
+               FIRST_CHILD(this)->fops->setxattr, loc, dict, flags, xdata);
+    return 0;
+err:
+    STACK_UNWIND_STRICT(setxattr, frame, op_ret, op_errno, xdata);
+    return 0;
+}
+
+/* Set up memory accounting for this translator. Returns the result of
+ * xlator_mem_acct_init(), or -1 if the GF_VALIDATE_OR_GOTO check bails out. */
+int32_t
+mem_acct_init(xlator_t *this)
+{
+    int status = -1;
+
+    GF_VALIDATE_OR_GOTO("selinux", this, out);
+
+    status = xlator_mem_acct_init(this, gf_selinux_mt_end + 1);
+    if (status == 0)
+        goto out;
+    gf_msg(this->name, GF_LOG_ERROR, 0, SL_MSG_MEM_ACCT_INIT_FAILED,
+           "Memory accounting init failed");
+out:
+    return status;
+}
+
+int32_t
+init(xlator_t *this)
+{
+    int32_t ret = -1;
+    selinux_priv_t *priv = NULL;
+    /* Translator init: check the graph, allocate priv, read "selinux". */
+    GF_VALIDATE_OR_GOTO("selinux", this, out);
+
+    if (!this->children || this->children->next) {
+        gf_msg(this->name, GF_LOG_WARNING, 0, SL_MSG_INVALID_VOLFILE,
+               "Error: SELinux (%s) not configured with exactly one "
+               "child",
+               this->name);
+        return -1;
+    }
+
+    if (this->parents == NULL) {
+        gf_msg(this->name, GF_LOG_WARNING, 0, SL_MSG_INVALID_VOLFILE,
+               "Dangling volume. Please check the volfile");
+    }
+    /* Only a warning above: a translator without parents still loads. */
+    priv = GF_CALLOC(1, sizeof(*priv), gf_selinux_mt_selinux_priv_t);
+    if (!priv) {
+        gf_log(this->name, GF_LOG_ERROR, "out of memory");
+        goto out;
+    }
+    /* 'out' is the label handed to the macro; presumably its failure path. */
+    GF_OPTION_INIT("selinux", priv->selinux_enabled, bool, out);
+
+    this->local_pool = mem_pool_new(selinux_priv_t, 64);
+    if (!this->local_pool) {
+        gf_msg(this->name, GF_LOG_ERROR, ENOMEM, SL_MSG_ENOMEM,
+               "Failed to create local_t's memory pool");
+        goto out;
+    }
+    /* Publish priv only after every step above has succeeded. */
+    this->private = (void *)priv;
+    ret = 0;
+out:
+    if (ret) { /* failure: release whatever was set up so far */
+        GF_FREE(priv);
+        mem_pool_destroy(this->local_pool);
+        this->local_pool = NULL;
+    }
+    return ret;
+}
+
+/* Re-read the "selinux" switch from @options into the private state.
+ * Returns 0 on success; -1 is returned if the 'out' label is taken early. */
+int
+reconfigure(xlator_t *this, dict_t *options)
+{
+    selinux_priv_t *conf = this->private;
+    int32_t status = -1;
+
+    /* 'out' is the label handed to the option macro. */
+    GF_OPTION_RECONF("selinux", conf->selinux_enabled, options, bool, out);
+    status = 0;
+out:
+    return status;
+}
+
+/* Translator teardown: free the private state and the local pool. */
+void
+fini(xlator_t *this)
+{
+    selinux_priv_t *priv = NULL;
+
+    priv = this->private;
+    this->private = NULL; /* don't leave a dangling pointer behind */
+    GF_FREE(priv);
+
+    mem_pool_destroy(this->local_pool);
+    this->local_pool = NULL;
+}
+
+struct xlator_fops fops = { /* only the four xattr fops are implemented */
+    .getxattr = selinux_getxattr,
+    .fgetxattr = selinux_fgetxattr,
+    .setxattr = selinux_setxattr,
+    .fsetxattr = selinux_fsetxattr,
+};
+
+struct xlator_cbks cbks = {}; /* empty: no callbacks are set here */
+
+struct volume_options options[] = { /* list ends with the NULL-key entry */
+    {
+        .key = {"selinux"},
+        .type = GF_OPTION_TYPE_BOOL,
+        .default_value = "on",
+        .description = "Enable/disable selinux translator",
+        .op_version = {GD_OP_VERSION_3_11_0},
+        .flags = OPT_FLAG_SETTABLE,
+        .tags = {"security", "linux"},
+    },
+    {
+        .key = {NULL},
+    }};
+
+xlator_api_t xlator_api = { /* ties together the definitions in this file */
+    .init = init,
+    .fini = fini,
+    .reconfigure = reconfigure,
+    .mem_acct_init = mem_acct_init,
+    .op_version = {1}, /* Present from the initial version */
+    .fops = &fops,
+    .cbks = &cbks,
+    .options = options,
+    .identifier = "selinux",
+    .category = GF_MAINTAINED,
+};
diff --git a/xlators/features/selinux/src/selinux.h b/xlators/features/selinux/src/selinux.h
new file mode 100644
index 00000000000..1bbdad3bb36
--- /dev/null
+++ b/xlators/features/selinux/src/selinux.h
@@ -0,0 +1,24 @@
+/*
+   Copyright (c) 2017 Red Hat, Inc. <http://www.redhat.com>
+   This file is part of GlusterFS.
+
+   This file is licensed to you under your choice of the GNU Lesser
+   General Public License, version 3 or any later version (LGPLv3 or
+   later), or the GNU General Public License, version 2 (GPLv2), in all
+   cases as published by the Free Software Foundation.
+*/
+#ifndef __SELINUX_H__
+#define __SELINUX_H__
+
+#include <glusterfs/common-utils.h>
+
+#define SELINUX_XATTR "security.selinux"
+#define SELINUX_GLUSTER_XATTR "trusted.glusterfs.selinux"
+
+struct selinux_priv { /* per-translator private state (this->private) */
+    gf_boolean_t selinux_enabled; /* value of the "selinux" volume option */
+};
+
+typedef struct selinux_priv selinux_priv_t;
+
+#endif
diff --git a/xlators/features/shard/Makefile.am b/xlators/features/shard/Makefile.am
new file mode 100644
index 00000000000..a985f42a877
--- /dev/null
+++ b/xlators/features/shard/Makefile.am
@@ -0,0 +1,3 @@
+SUBDIRS = src
+
+CLEANFILES =
diff --git a/xlators/features/shard/src/Makefile.am b/xlators/features/shard/src/Makefile.am
new file mode 100644
index 00000000000..bf5700d4bcc
--- /dev/null
+++ b/xlators/features/shard/src/Makefile.am
@@ -0,0 +1,17 @@
+# Build shard.la as a libtool module and install it into the versioned
+# xlator/features directory under $(libdir).
+xlator_LTLIBRARIES = shard.la
+xlatordir = $(libdir)/glusterfs/$(PACKAGE_VERSION)/xlator/features
+
+shard_la_LDFLAGS = -module $(GF_XLATOR_DEFAULT_LDFLAGS)
+
+shard_la_SOURCES = shard.c
+
+# Link against libglusterfs from the build tree.
+shard_la_LIBADD = $(top_builddir)/libglusterfs/src/libglusterfs.la
+
+# Headers used by the sources; they are not installed.
+noinst_HEADERS = shard.h shard-mem-types.h shard-messages.h
+
+# Include paths: libglusterfs sources plus the XDR headers from both the
+# source and the build tree.
+AM_CPPFLAGS = $(GF_CPPFLAGS) -I$(top_srcdir)/libglusterfs/src \
+	-I$(top_srcdir)/rpc/xdr/src -I$(top_builddir)/rpc/xdr/src
+
+AM_CFLAGS = -Wall $(GF_CFLAGS)
+
+# No extra files to remove on "make clean".
+CLEANFILES =
diff --git a/xlators/features/shard/src/shard-mem-types.h b/xlators/features/shard/src/shard-mem-types.h
new file mode 100644
index 00000000000..1fe7e2e2798
--- /dev/null
+++ b/xlators/features/shard/src/shard-mem-types.h
@@ -0,0 +1,24 @@
+/*
+   Copyright (c) 2015 Red Hat, Inc. <http://www.redhat.com>
+   This file is part of GlusterFS.
+
+   This file is licensed to you under your choice of the GNU Lesser
+   General Public License, version 3 or any later version (LGPLv3 or
+   later), or the GNU General Public License, version 2 (GPLv2), in all
+   cases as published by the Free Software Foundation.
+*/
+#ifndef __SHARD_MEM_TYPES_H__
+#define __SHARD_MEM_TYPES_H__
+
+#include <glusterfs/mem-types.h>
+
+/* Memory-accounting type identifiers private to the shard translator.
+ * Numbering continues right after the common range (gf_common_mt_end);
+ * gf_shard_mt_end is the last enumerator of the shard range.
+ */
+enum gf_shard_mem_types_ {
+    gf_shard_mt_priv_t = gf_common_mt_end + 1,
+    gf_shard_mt_inode_list,
+    gf_shard_mt_inode_ctx_t,
+    gf_shard_mt_iovec,
+    gf_shard_mt_int64_t,
+    gf_shard_mt_uint64_t,
+    gf_shard_mt_end
+};
+#endif
diff --git a/xlators/features/shard/src/shard-messages.h b/xlators/features/shard/src/shard-messages.h
new file mode 100644
index 00000000000..2d0867eb136
--- /dev/null
+++ b/xlators/features/shard/src/shard-messages.h
@@ -0,0 +1,39 @@
+/*
+ Copyright (c) 2015 Red Hat, Inc. <http://www.redhat.com>
+ This file is part of GlusterFS.
+
+ This file is licensed to you under your choice of the GNU Lesser
+ General Public License, version 3 or any later version (LGPLv3 or
+ later), or the GNU General Public License, version 2 (GPLv2), in all
+ cases as published by the Free Software Foundation.
+ */
+
+#ifndef _SHARD_MESSAGES_H_
+#define _SHARD_MESSAGES_H_
+
+#include <glusterfs/glfs-message-id.h>
+
+/* To add new message IDs, append new identifiers at the end of the list.
+ *
+ * Never remove a message ID. If it's not used anymore, you can rename it or
+ * leave it as it is, but not delete it. This is to prevent reutilization of
+ * IDs by other messages.
+ *
+ * The component name must match one of the entries defined in
+ * glfs-message-id.h.
+ */
+
+/* Message identifiers of the SHARD component. Per the rules stated above,
+ * new identifiers go at the end of this list and existing ones are never
+ * removed.
+ */
+GLFS_MSGID(SHARD, SHARD_MSG_BASE_FILE_LOOKUP_FAILED, SHARD_MSG_DICT_OP_FAILED,
+           SHARD_MSG_DOT_SHARD_NODIR, SHARD_MSG_FD_CTX_SET_FAILED,
+           SHARD_MSG_INODE_CTX_GET_FAILED, SHARD_MSG_INODE_CTX_SET_FAILED,
+           SHARD_MSG_INODE_PATH_FAILED, SHARD_MSG_INTERNAL_XATTR_MISSING,
+           SHARD_MSG_INVALID_VOLFILE, SHARD_MSG_LOOKUP_SHARD_FAILED,
+           SHARD_MSG_MEM_ACCT_INIT_FAILED, SHARD_MSG_NULL_THIS,
+           SHARD_MSG_SIZE_SET_FAILED, SHARD_MSG_STAT_FAILED,
+           SHARD_MSG_TRUNCATE_LAST_SHARD_FAILED,
+           SHARD_MSG_UPDATE_FILE_SIZE_FAILED, SHARD_MSG_FOP_NOT_SUPPORTED,
+           SHARD_MSG_INVALID_FOP, SHARD_MSG_MEMALLOC_FAILED,
+           SHARD_MSG_FOP_FAILED, SHARD_MSG_SHARDS_DELETION_FAILED,
+           SHARD_MSG_SHARD_DELETION_COMPLETED);
+
+#endif /* !_SHARD_MESSAGES_H_ */
diff --git a/xlators/features/shard/src/shard.c b/xlators/features/shard/src/shard.c
new file mode 100644
index 00000000000..e5f93063943
--- /dev/null
+++ b/xlators/features/shard/src/shard.c
@@ -0,0 +1,7382 @@
+/*
+  Copyright (c) 2015 Red Hat, Inc. <http://www.redhat.com>
+  This file is part of GlusterFS.
+
+  This file is licensed to you under your choice of the GNU Lesser
+  General Public License, version 3 or any later version (LGPLv3 or
+  later), or the GNU General Public License, version 2 (GPLv2), in all
+  cases as published by the Free Software Foundation.
+*/
+
+#include <unistd.h>
+
+#include "shard.h"
+#include "shard-mem-types.h"
+#include <glusterfs/byte-order.h>
+#include <glusterfs/defaults.h>
+#include <glusterfs/statedump.h>
+
+/* Return _gf_true when @gfid equals the dot_shard_gfid recorded in the
+ * private data of the current translator (THIS), _gf_false otherwise.
+ */
+static gf_boolean_t
+__is_shard_dir(uuid_t gfid)
+{
+    shard_priv_t *shard_priv = THIS->private;
+
+    return (gf_uuid_compare(gfid, shard_priv->dot_shard_gfid) == 0)
+               ? _gf_true
+               : _gf_false;
+}
+
+/* Return _gf_true when the request in @frame was issued with the gsyncd
+ * client pid and the parent of @loc (by pargfid, or by the parent inode's
+ * gfid when a parent inode is set) is the shard directory.
+ */
+static gf_boolean_t
+__is_gsyncd_on_shard_dir(call_frame_t *frame, loc_t *loc)
+{
+    if (frame->root->pid != GF_CLIENT_PID_GSYNCD)
+        return _gf_false;
+
+    if (__is_shard_dir(loc->pargfid))
+        return _gf_true;
+
+    if (loc->parent && __is_shard_dir(loc->parent->gfid))
+        return _gf_true;
+
+    return _gf_false;
+}
+
+/* Format the basename of a shard, "<gfid>.<block_num>", into @buf, writing
+ * at most @len bytes (snprintf semantics).
+ */
+void
+shard_make_block_bname(int block_num, uuid_t gfid, char *buf, size_t len)
+{
+    char uuid_text[GF_UUID_BUF_SIZE] = {0};
+
+    gf_uuid_unparse(gfid, uuid_text);
+    snprintf(buf, len, "%s.%d", uuid_text, block_num);
+}
+
+/* Format the absolute path of a shard, "/<GF_SHARD_DIR>/<gfid>.<block_num>",
+ * into @filepath, writing at most @len bytes (snprintf semantics).
+ */
+void
+shard_make_block_abspath(int block_num, uuid_t gfid, char *filepath, size_t len)
+{
+    char uuid_text[GF_UUID_BUF_SIZE] = {0};
+
+    gf_uuid_unparse(gfid, uuid_text);
+    snprintf(filepath, len, "/%s/%s.%d", GF_SHARD_DIR, uuid_text, block_num);
+}
+
+/* Fetch the shard context stored on @inode for @this, creating and
+ * attaching a zeroed one (with both list heads initialised) when none
+ * exists yet. The locked wrapper shard_inode_ctx_get() calls this with
+ * inode->lock held.
+ *
+ * Returns 0 with *ctx set on success. On failure *ctx is left untouched
+ * and the result is either the non-zero code from __inode_ctx_get() (when
+ * the allocation fails) or the negative code from __inode_ctx_set().
+ */
+int
+__shard_inode_ctx_get(inode_t *inode, xlator_t *this, shard_inode_ctx_t **ctx)
+{
+    uint64_t raw = 0;
+    shard_inode_ctx_t *fresh = NULL;
+    int rc = __inode_ctx_get(inode, this, &raw);
+
+    if (rc == 0) {
+        /* A context is already attached; hand it back as is. */
+        *ctx = (shard_inode_ctx_t *)(uintptr_t)raw;
+        return 0;
+    }
+
+    fresh = GF_CALLOC(1, sizeof(*fresh), gf_shard_mt_inode_ctx_t);
+    if (fresh == NULL)
+        return rc;
+
+    INIT_LIST_HEAD(&fresh->ilist);
+    INIT_LIST_HEAD(&fresh->to_fsync_list);
+
+    raw = (uint64_t)(uintptr_t)fresh;
+    rc = __inode_ctx_set(inode, this, &raw);
+    if (rc < 0) {
+        GF_FREE(fresh);
+    } else {
+        *ctx = fresh;
+    }
+
+    return rc;
+}
+
+/* Locked variant of __shard_inode_ctx_get(): runs it while holding
+ * inode->lock and returns its result unchanged.
+ */
+int
+shard_inode_ctx_get(inode_t *inode, xlator_t *this, shard_inode_ctx_t **ctx)
+{
+    int rc;
+
+    LOCK(&inode->lock);
+    rc = __shard_inode_ctx_get(inode, this, ctx);
+    UNLOCK(&inode->lock);
+
+    return rc;
+}
+
+/* Update selected parts of the shard context of @inode.
+ *
+ * @valid is a bitmask of SHARD_MASK_* flags. Each flag selects which field
+ * is taken from @block_size or @stbuf; the time fields are reconciled
+ * through SHARD_TIME_UPDATE, and SHARD_MASK_REFRESH_RESET clears the
+ * context's refresh flag. The context is created on demand.
+ *
+ * Returns 0 on success, or the non-zero result of __shard_inode_ctx_get().
+ */
+int
+__shard_inode_ctx_set(inode_t *inode, xlator_t *this, struct iatt *stbuf,
+                      uint64_t block_size, int32_t valid)
+{
+    shard_inode_ctx_t *ictx = NULL;
+    struct iatt *cached = NULL;
+    int rc = __shard_inode_ctx_get(inode, this, &ictx);
+
+    if (rc != 0)
+        return rc;
+
+    cached = &ictx->stat;
+
+    if (valid & SHARD_MASK_BLOCK_SIZE)
+        ictx->block_size = block_size;
+
+    if (valid & SHARD_MASK_PROT)
+        cached->ia_prot = stbuf->ia_prot;
+
+    if (valid & SHARD_MASK_NLINK)
+        cached->ia_nlink = stbuf->ia_nlink;
+
+    if (valid & SHARD_MASK_UID)
+        cached->ia_uid = stbuf->ia_uid;
+
+    if (valid & SHARD_MASK_GID)
+        cached->ia_gid = stbuf->ia_gid;
+
+    if (valid & SHARD_MASK_SIZE)
+        cached->ia_size = stbuf->ia_size;
+
+    if (valid & SHARD_MASK_BLOCKS)
+        cached->ia_blocks = stbuf->ia_blocks;
+
+    if (valid & SHARD_MASK_TIMES) {
+        SHARD_TIME_UPDATE(cached->ia_mtime, cached->ia_mtime_nsec,
+                          stbuf->ia_mtime, stbuf->ia_mtime_nsec);
+        SHARD_TIME_UPDATE(cached->ia_ctime, cached->ia_ctime_nsec,
+                          stbuf->ia_ctime, stbuf->ia_ctime_nsec);
+        SHARD_TIME_UPDATE(cached->ia_atime, cached->ia_atime_nsec,
+                          stbuf->ia_atime, stbuf->ia_atime_nsec);
+    }
+
+    if (valid & SHARD_MASK_OTHERS) {
+        cached->ia_ino = stbuf->ia_ino;
+        gf_uuid_copy(cached->ia_gfid, stbuf->ia_gfid);
+        cached->ia_dev = stbuf->ia_dev;
+        cached->ia_type = stbuf->ia_type;
+        cached->ia_rdev = stbuf->ia_rdev;
+        cached->ia_blksize = stbuf->ia_blksize;
+    }
+
+    if (valid & SHARD_MASK_REFRESH_RESET)
+        ictx->refresh = _gf_false;
+
+    return 0;
+}
+
+/* Locked variant of __shard_inode_ctx_set(): runs it while holding
+ * inode->lock and returns its result unchanged.
+ */
+int
+shard_inode_ctx_set(inode_t *inode, xlator_t *this, struct iatt *stbuf,
+                    uint64_t block_size, int32_t valid)
+{
+    int rc;
+
+    LOCK(&inode->lock);
+    rc = __shard_inode_ctx_set(inode, this, stbuf, block_size, valid);
+    UNLOCK(&inode->lock);
+
+    return rc;
+}
+
+/* Set the refresh flag in the shard context of @inode, creating the
+ * context if needed. Returns 0 on success, or the non-zero result of
+ * __shard_inode_ctx_get().
+ */
+int
+__shard_inode_ctx_set_refresh_flag(inode_t *inode, xlator_t *this)
+{
+    shard_inode_ctx_t *ictx = NULL;
+    int rc = __shard_inode_ctx_get(inode, this, &ictx);
+
+    if (rc != 0)
+        return rc;
+
+    ictx->refresh = _gf_true;
+    return 0;
+}
+/* Locked variant of __shard_inode_ctx_set_refresh_flag(): runs it while
+ * holding inode->lock and returns its result unchanged.
+ */
+int
+shard_inode_ctx_set_refresh_flag(inode_t *inode, xlator_t *this)
+{
+    int rc;
+
+    LOCK(&inode->lock);
+    rc = __shard_inode_ctx_set_refresh_flag(inode, this);
+    UNLOCK(&inode->lock);
+
+    return rc;
+}
+
+/* Set the refreshed flag in the shard context of @inode, creating the
+ * context if needed. Returns 0 on success, or the non-zero result of
+ * __shard_inode_ctx_get().
+ */
+int
+__shard_inode_ctx_mark_dir_refreshed(inode_t *inode, xlator_t *this)
+{
+    shard_inode_ctx_t *ictx = NULL;
+    int rc = __shard_inode_ctx_get(inode, this, &ictx);
+
+    if (rc != 0)
+        return rc;
+
+    ictx->refreshed = _gf_true;
+    return 0;
+}
+
+/* Locked variant of __shard_inode_ctx_mark_dir_refreshed(): runs it while
+ * holding inode->lock and returns its result unchanged.
+ */
+int
+shard_inode_ctx_mark_dir_refreshed(inode_t *inode, xlator_t *this)
+{
+    int rc;
+
+    LOCK(&inode->lock);
+    rc = __shard_inode_ctx_mark_dir_refreshed(inode, this);
+    UNLOCK(&inode->lock);
+
+    return rc;
+}
+
+/* Queue @shard_inode on the to_fsync_list headed in the context of
+ * @base_inode. The locked wrapper calls this with both inode locks held.
+ *
+ * Returns:
+ *   0  - the shard was newly queued; its context now records both inodes
+ *        and the base context's fsync_count was incremented;
+ *   1  - the shard already had fsync_needed set; only that counter was
+ *        incremented;
+ *   otherwise the non-zero result of __shard_inode_ctx_get().
+ */
+int
+__shard_inode_ctx_add_to_fsync_list(inode_t *base_inode, xlator_t *this,
+                                    inode_t *shard_inode)
+{
+    shard_inode_ctx_t *base_ctx = NULL;
+    shard_inode_ctx_t *shard_ctx = NULL;
+    int rc = -1;
+
+    rc = __shard_inode_ctx_get(base_inode, this, &base_ctx);
+    if (rc != 0)
+        return rc;
+
+    rc = __shard_inode_ctx_get(shard_inode, this, &shard_ctx);
+    if (rc != 0)
+        return rc;
+
+    if (shard_ctx->fsync_needed != 0) {
+        /* Already queued: just note one more pending request. */
+        shard_ctx->fsync_needed++;
+        return 1;
+    }
+
+    list_add_tail(&shard_ctx->to_fsync_list, &base_ctx->to_fsync_list);
+    shard_ctx->inode = shard_inode;
+    shard_ctx->base_inode = base_inode;
+    shard_ctx->fsync_needed++;
+    base_ctx->fsync_count++;
+
+    return 0;
+}
+
+/* Locked variant of __shard_inode_ctx_add_to_fsync_list().
+ *
+ * Takes a ref on both inodes, then runs the helper while holding
+ * base_inode->lock and shard_inode->lock (in that order). The refs are
+ * kept only when the helper returns 0; for any other result they are
+ * dropped again. Returns the helper's result unchanged.
+ */
+int
+shard_inode_ctx_add_to_fsync_list(inode_t *base_inode, xlator_t *this,
+                                  inode_t *shard_inode)
+{
+    int rc = -1;
+
+    /* The ref on the base inode keeps it alive while it holds the head of
+     * the to_fsync_list.
+     */
+    inode_ref(base_inode);
+    inode_ref(shard_inode);
+
+    LOCK(&base_inode->lock);
+    LOCK(&shard_inode->lock);
+    rc = __shard_inode_ctx_add_to_fsync_list(base_inode, this, shard_inode);
+    UNLOCK(&shard_inode->lock);
+    UNLOCK(&base_inode->lock);
+
+    if (rc == 0)
+        return 0;
+
+    /* The shard was not newly queued (it was already on the list, or the
+     * helper failed), so give back the refs taken above.
+     */
+    inode_unref(base_inode);
+    inode_unref(shard_inode);
+    return rc;
+}
+
+/* Report whether @inode still needs a lookup: _gf_true when its shard
+ * context has not been marked refreshed, or when the context cannot be
+ * obtained at all.
+ */
+gf_boolean_t
+__shard_inode_ctx_needs_lookup(inode_t *inode, xlator_t *this)
+{
+    shard_inode_ctx_t *ictx = NULL;
+
+    /* Failing to get the context is treated conservatively as "lookup
+     * needed", even though the failure may be a memory-allocation error.
+     */
+    if (__shard_inode_ctx_get(inode, this, &ictx) != 0)
+        return _gf_true;
+
+    return ictx->refreshed ? _gf_false : _gf_true;
+}
+
+/* Locked variant of __shard_inode_ctx_needs_lookup(): runs it while
+ * holding inode->lock and returns its result unchanged.
+ */
+gf_boolean_t
+shard_inode_ctx_needs_lookup(inode_t *inode, xlator_t *this)
+{
+    gf_boolean_t needed;
+
+    LOCK(&inode->lock);
+    needed = __shard_inode_ctx_needs_lookup(inode, this);
+    UNLOCK(&inode->lock);
+
+    return needed;
+}
+/* Compare the size and block count in @stbuf with the values cached in
+ * the shard context of @inode and set the context's refresh flag when
+ * either differs. Returns 0 on success, or the non-zero result of
+ * __shard_inode_ctx_get().
+ */
+int
+__shard_inode_ctx_invalidate(inode_t *inode, xlator_t *this, struct iatt *stbuf)
+{
+    shard_inode_ctx_t *ictx = NULL;
+    gf_boolean_t size_differs = _gf_false;
+    gf_boolean_t blocks_differ = _gf_false;
+    int rc = __shard_inode_ctx_get(inode, this, &ictx);
+
+    if (rc != 0)
+        return rc;
+
+    size_differs = (stbuf->ia_size != ictx->stat.ia_size);
+    blocks_differ = (stbuf->ia_blocks != ictx->stat.ia_blocks);
+
+    if (size_differs || blocks_differ)
+        ictx->refresh = _gf_true;
+
+    return 0;
+}
+
+/* Locked variant of __shard_inode_ctx_invalidate(): runs it while holding
+ * inode->lock and returns its result unchanged.
+ */
+int
+shard_inode_ctx_invalidate(inode_t *inode, xlator_t *this, struct iatt *stbuf)
+{
+    int rc;
+
+    LOCK(&inode->lock);
+    rc = __shard_inode_ctx_invalidate(inode, this, stbuf);
+    UNLOCK(&inode->lock);
+
+    return rc;
+}
+
+/* Read the block size from the existing shard context of @inode into
+ * *block_size. No context is created here: when __inode_ctx_get() fails,
+ * its negative result is returned and *block_size is left untouched.
+ * Returns 0 on success.
+ */
+int
+__shard_inode_ctx_get_block_size(inode_t *inode, xlator_t *this,
+                                 uint64_t *block_size)
+{
+    uint64_t raw = 0;
+    int rc = __inode_ctx_get(inode, this, &raw);
+
+    if (rc < 0)
+        return rc;
+
+    *block_size = ((shard_inode_ctx_t *)(uintptr_t)raw)->block_size;
+    return 0;
+}
+
+/* Locked variant of __shard_inode_ctx_get_block_size(): runs it while
+ * holding inode->lock and returns its result unchanged.
+ */
+int
+shard_inode_ctx_get_block_size(inode_t *inode, xlator_t *this,
+                               uint64_t *block_size)
+{
+    int rc;
+
+    LOCK(&inode->lock);
+    rc = __shard_inode_ctx_get_block_size(inode, this, block_size);
+    UNLOCK(&inode->lock);
+
+    return rc;
+}
+
+/* Read the fsync_needed counter from the existing shard context of @inode
+ * into *fsync_count. No context is created here: when __inode_ctx_get()
+ * fails, its negative result is returned and *fsync_count is left
+ * untouched. Returns 0 on success.
+ */
+int
+__shard_inode_ctx_get_fsync_count(inode_t *inode, xlator_t *this,
+                                  int *fsync_count)
+{
+    uint64_t raw = 0;
+    int rc = __inode_ctx_get(inode, this, &raw);
+
+    if (rc < 0)
+        return rc;
+
+    *fsync_count = ((shard_inode_ctx_t *)(uintptr_t)raw)->fsync_needed;
+    return 0;
+}
+
+/* Locked variant of __shard_inode_ctx_get_fsync_count(): runs it while
+ * holding inode->lock and returns its result unchanged.
+ */
+int
+shard_inode_ctx_get_fsync_count(inode_t *inode, xlator_t *this,
+                                int *fsync_count)
+{
+    int rc;
+
+    LOCK(&inode->lock);
+    rc = __shard_inode_ctx_get_fsync_count(inode, this, fsync_count);
+    UNLOCK(&inode->lock);
+
+    return rc;
+}
+/* Copy the whole existing shard context of @inode, byte for byte, into
+ * *ctx_out. No context is created here: when __inode_ctx_get() fails, its
+ * negative result is returned and *ctx_out is left untouched. Returns 0
+ * on success.
+ */
+int
+__shard_inode_ctx_get_all(inode_t *inode, xlator_t *this,
+                          shard_inode_ctx_t *ctx_out)
+{
+    uint64_t raw = 0;
+    shard_inode_ctx_t *stored = NULL;
+    int rc = __inode_ctx_get(inode, this, &raw);
+
+    if (rc < 0)
+        return rc;
+
+    stored = (shard_inode_ctx_t *)(uintptr_t)raw;
+    memcpy(ctx_out, stored, sizeof(*ctx_out));
+
+    return 0;
+}
+
+/* Locked variant of __shard_inode_ctx_get_all(): runs it while holding
+ * inode->lock and returns its result unchanged.
+ */
+int
+shard_inode_ctx_get_all(inode_t *inode, xlator_t *this,
+                        shard_inode_ctx_t *ctx_out)
+{
+    int rc;
+
+    LOCK(&inode->lock);
+    rc = __shard_inode_ctx_get_all(inode, this, ctx_out);
+    UNLOCK(&inode->lock);
+
+    return rc;
+}
+
+/* Serve @buf from the iatt cached in the existing shard context of @inode.
+ *
+ * When the context's refresh flag is set, the cache is not used: *buf is
+ * left untouched and *need_refresh is set to _gf_true instead. Otherwise
+ * *buf receives the cached iatt and *need_refresh is left untouched.
+ *
+ * Returns 0 in both cases, or the negative result of __inode_ctx_get()
+ * when the inode has no context.
+ */
+int
+__shard_inode_ctx_fill_iatt_from_cache(inode_t *inode, xlator_t *this,
+                                       struct iatt *buf,
+                                       gf_boolean_t *need_refresh)
+{
+    uint64_t raw = 0;
+    shard_inode_ctx_t *stored = NULL;
+    int rc = __inode_ctx_get(inode, this, &raw);
+
+    if (rc < 0)
+        return rc;
+
+    stored = (shard_inode_ctx_t *)(uintptr_t)raw;
+
+    if (stored->refresh != _gf_false) {
+        *need_refresh = _gf_true;
+        return 0;
+    }
+
+    *buf = stored->stat;
+    return 0;
+}
+
+/* Locked variant of __shard_inode_ctx_fill_iatt_from_cache(): runs it
+ * while holding inode->lock and returns its result unchanged.
+ */
+int
+shard_inode_ctx_fill_iatt_from_cache(inode_t *inode, xlator_t *this,
+                                     struct iatt *buf,
+                                     gf_boolean_t *need_refresh)
+{
+    int rc;
+
+    LOCK(&inode->lock);
+    rc = __shard_inode_ctx_fill_iatt_from_cache(inode, this, buf,
+                                                need_refresh);
+    UNLOCK(&inode->lock);
+
+    return rc;
+}
+
+/* Release everything owned by @local: the sync barrier, all embedded
+ * locs, allocated names, the fd and dict refs, every inode ref held in
+ * inode_list (num_blocks entries), the vector, the iobref, the dirent
+ * list (when list_inited is set) and the auxiliary inodelk/entrylk
+ * frames. @local itself is not freed here.
+ */
+void
+shard_local_wipe(shard_local_t *local)
+{
+    int idx = 0;
+    int nblocks = local->num_blocks;
+
+    syncbarrier_destroy(&local->barrier);
+
+    loc_wipe(&local->loc);
+    loc_wipe(&local->dot_shard_loc);
+    loc_wipe(&local->dot_shard_rm_loc);
+    loc_wipe(&local->loc2);
+    loc_wipe(&local->tmp_loc);
+    loc_wipe(&local->int_inodelk.loc);
+    loc_wipe(&local->int_entrylk.loc);
+    loc_wipe(&local->newloc);
+
+    if (local->name != NULL)
+        GF_FREE(local->name);
+    if (local->int_entrylk.basename != NULL)
+        GF_FREE(local->int_entrylk.basename);
+
+    if (local->fd != NULL)
+        fd_unref(local->fd);
+    if (local->xattr_req != NULL)
+        dict_unref(local->xattr_req);
+    if (local->xattr_rsp != NULL)
+        dict_unref(local->xattr_rsp);
+
+    /* Slots of inode_list may be empty; only drop refs that exist. */
+    if (local->inode_list != NULL) {
+        for (idx = 0; idx < nblocks; idx++) {
+            if (local->inode_list[idx] != NULL)
+                inode_unref(local->inode_list[idx]);
+        }
+    }
+    GF_FREE(local->inode_list);
+    GF_FREE(local->vector);
+
+    if (local->iobref != NULL)
+        iobref_unref(local->iobref);
+    if (local->list_inited)
+        gf_dirent_free(&local->entries_head);
+
+    if (local->inodelk_frame)
+        SHARD_STACK_DESTROY(local->inodelk_frame);
+    if (local->entrylk_frame)
+        SHARD_STACK_DESTROY(local->entrylk_frame);
+}
+
+/* Overwrite ia_size and ia_blocks of @stbuf with the values stored in the
+ * GF_XATTR_SHARD_FILE_SIZE entry of @dict.
+ *
+ * The entry is read as four 64-bit values in network byte order; element
+ * 0 supplies the size and element 2 the block count. When the entry is
+ * missing, an error is logged and the dict_get_ptr() result is returned;
+ * otherwise 0 is returned.
+ */
+int
+shard_modify_size_and_block_count(struct iatt *stbuf, dict_t *dict)
+{
+    uint64_t fields[4];
+    void *raw_attr = NULL;
+    int rc = dict_get_ptr(dict, GF_XATTR_SHARD_FILE_SIZE, &raw_attr);
+
+    if (rc != 0) {
+        gf_msg_callingfn(THIS->name, GF_LOG_ERROR, 0,
+                         SHARD_MSG_INTERNAL_XATTR_MISSING,
+                         "Failed to get " GF_XATTR_SHARD_FILE_SIZE " for %s",
+                         uuid_utoa(stbuf->ia_gfid));
+        return rc;
+    }
+
+    memcpy(fields, raw_attr, sizeof(fields));
+
+    stbuf->ia_size = ntoh64(fields[0]);
+    stbuf->ia_blocks = ntoh64(fields[2]);
+
+    return 0;
+}
+
+/* Decrement the call_count of the local attached to @frame while holding
+ * frame->lock, and return the value after the decrement.
+ */
+int
+shard_call_count_return(call_frame_t *frame)
+{
+    shard_local_t *local = frame->local;
+    int remaining = 0;
+
+    LOCK(&frame->lock);
+    remaining = --local->call_count;
+    UNLOCK(&frame->lock);
+
+    return remaining;
+}
+
+/* Map an internal directory type to its name: GF_SHARD_DIR or
+ * GF_SHARD_REMOVE_ME_DIR. Returns NULL for any other value.
+ */
+static char *
+shard_internal_dir_string(shard_internal_dir_type_t type)
+{
+    switch (type) {
+        case SHARD_INTERNAL_DIR_DOT_SHARD:
+            return GF_SHARD_DIR;
+        case SHARD_INTERNAL_DIR_DOT_SHARD_REMOVE_ME:
+            return GF_SHARD_REMOVE_ME_DIR;
+        default:
+            return NULL;
+    }
+}
+
+/* Initialise the loc inside @local that describes one of the translator's
+ * internal directories.
+ *
+ * For SHARD_INTERNAL_DIR_DOT_SHARD the loc is local->dot_shard_loc and the
+ * parent is the root of this->itable; for
+ * SHARD_INTERNAL_DIR_DOT_SHARD_REMOVE_ME it is local->dot_shard_rm_loc and
+ * the parent is priv->dot_shard_inode. The loc receives a new inode, a ref
+ * on the parent, the path built by inode_path() and a name pointing at the
+ * last path component.
+ *
+ * Returns 0 on success and -1 on failure (NULL @local, unknown @type,
+ * inode allocation failure or path construction failure). Anything already
+ * stored in the loc on failure stays there; shard_local_wipe() wipes both
+ * locs.
+ */
+static int
+shard_init_internal_dir_loc(xlator_t *this, shard_local_t *local,
+                            shard_internal_dir_type_t type)
+{
+    int ret = -1;
+    char *bname = NULL;
+    inode_t *parent = NULL;
+    loc_t *internal_dir_loc = NULL;
+    shard_priv_t *priv = NULL;
+
+    priv = this->private;
+    if (!local)
+        return -1;
+
+    switch (type) {
+        case SHARD_INTERNAL_DIR_DOT_SHARD:
+            internal_dir_loc = &local->dot_shard_loc;
+            bname = GF_SHARD_DIR;
+            parent = inode_ref(this->itable->root);
+            break;
+        case SHARD_INTERNAL_DIR_DOT_SHARD_REMOVE_ME:
+            internal_dir_loc = &local->dot_shard_rm_loc;
+            bname = GF_SHARD_REMOVE_ME_DIR;
+            parent = inode_ref(priv->dot_shard_inode);
+            break;
+        default:
+            /* Unknown type: there is no loc to fill in. Fail here instead
+             * of dereferencing the still-NULL internal_dir_loc below.
+             */
+            goto out;
+    }
+
+    internal_dir_loc->inode = inode_new(this->itable);
+    internal_dir_loc->parent = parent;
+    ret = inode_path(internal_dir_loc->parent, bname,
+                     (char **)&internal_dir_loc->path);
+    if (ret < 0 || !(internal_dir_loc->inode)) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, SHARD_MSG_INODE_PATH_FAILED,
+               "Inode path failed on %s", bname);
+        /* ret may still hold a non-negative inode_path() result when only
+         * inode_new() failed; always report the failure as -1.
+         */
+        ret = -1;
+        goto out;
+    }
+
+    internal_dir_loc->name = strrchr(internal_dir_loc->path, '/');
+    if (internal_dir_loc->name)
+        internal_dir_loc->name++;
+
+    ret = 0;
+out:
+    return ret;
+}
+
+/* Record a use of @linked_inode in the translator-wide list of shard
+ * inodes headed at priv->ilist_head, which is bounded by priv->lru_limit.
+ *
+ * Three cases, selected by whether the inode's context is already on the
+ * list and whether the list has room:
+ *   - not listed, room left: take a ref on the inode (and on @base_inode),
+ *     record base gfid and block number, append to the tail and bump
+ *     priv->inode_count;
+ *   - not listed, list full: evict the entry at the head of the list, then
+ *     add @linked_inode in its place (inode_count is not changed);
+ *   - already listed: move it to the tail.
+ *
+ * The base gfid stored in the context comes from @base_inode when it is
+ * non-NULL and from @gfid otherwise.
+ *
+ * Returns the evicted inode when it was found on a pending-fsync list (it
+ * is then left linked, for the caller to fsync), and NULL in every other
+ * case.
+ *
+ * NOTE(review): the "__" prefix suggests the caller must hold priv->lock
+ * while calling this - confirm against the callers.
+ */
+inode_t *
+__shard_update_shards_inode_list(inode_t *linked_inode, xlator_t *this,
+                                 inode_t *base_inode, int block_num,
+                                 uuid_t gfid)
+{
+    char block_bname[256] = {
+        0,
+    };
+    inode_t *lru_inode = NULL;
+    shard_priv_t *priv = NULL;
+    shard_inode_ctx_t *ctx = NULL;
+    shard_inode_ctx_t *lru_inode_ctx = NULL;
+    shard_inode_ctx_t *lru_base_inode_ctx = NULL;
+    inode_t *fsync_inode = NULL;
+    inode_t *lru_base_inode = NULL;
+    gf_boolean_t do_fsync = _gf_false;
+
+    priv = this->private;
+
+    /* NOTE(review): the return value is ignored and ctx is dereferenced
+     * unconditionally below; a failed context allocation would leave ctx
+     * NULL here.
+     */
+    shard_inode_ctx_get(linked_inode, this, &ctx);
+
+    if (list_empty(&ctx->ilist)) {
+        if (priv->inode_count + 1 <= priv->lru_limit) {
+            /* If this inode was linked here for the first time (indicated
+             * by empty list), and if there is still space in the priv list,
+             * add this ctx to the tail of the list.
+             */
+            /* For as long as an inode is in lru list, we try to
+             * keep it alive by holding a ref on it.
+             */
+            inode_ref(linked_inode);
+            if (base_inode)
+                gf_uuid_copy(ctx->base_gfid, base_inode->gfid);
+            else
+                gf_uuid_copy(ctx->base_gfid, gfid);
+            ctx->block_num = block_num;
+            list_add_tail(&ctx->ilist, &priv->ilist_head);
+            priv->inode_count++;
+            ctx->base_inode = inode_ref(base_inode);
+        } else {
+            /*If on the other hand there is no available slot for this inode
+             * in the list, delete the lru inode from the head of the list,
+             * unlink it. And in its place add this new inode into the list.
+             */
+            lru_inode_ctx = list_first_entry(&priv->ilist_head,
+                                             shard_inode_ctx_t, ilist);
+            GF_ASSERT(lru_inode_ctx->block_num > 0);
+            lru_base_inode = lru_inode_ctx->base_inode;
+            list_del_init(&lru_inode_ctx->ilist);
+            /* NOTE(review): the result of inode_find() is used below
+             * without a NULL check - confirm it cannot fail for an inode
+             * that is still on the list.
+             */
+            lru_inode = inode_find(linked_inode->table,
+                                   lru_inode_ctx->stat.ia_gfid);
+            /* If the lru inode was part of the pending-fsync list,
+             * the base inode needs to be unref'd, the lru inode
+             * deleted from fsync list and fsync'd in a new frame,
+             * and then unlinked in memory and forgotten.
+             */
+            /* Without a base inode there is no fsync list to inspect, so
+             * the locked section is skipped and do_fsync stays false.
+             */
+            if (!lru_base_inode)
+                goto after_fsync_check;
+            LOCK(&lru_base_inode->lock);
+            LOCK(&lru_inode->lock);
+            {
+                if (!list_empty(&lru_inode_ctx->to_fsync_list)) {
+                    list_del_init(&lru_inode_ctx->to_fsync_list);
+                    lru_inode_ctx->fsync_needed = 0;
+                    do_fsync = _gf_true;
+                    __shard_inode_ctx_get(lru_base_inode, this,
+                                          &lru_base_inode_ctx);
+                    lru_base_inode_ctx->fsync_count--;
+                }
+            }
+            UNLOCK(&lru_inode->lock);
+            UNLOCK(&lru_base_inode->lock);
+
+        after_fsync_check:
+            if (!do_fsync) {
+                shard_make_block_bname(lru_inode_ctx->block_num,
+                                       lru_inode_ctx->base_gfid, block_bname,
+                                       sizeof(block_bname));
+                /* The following unref corresponds to the ref held at
+                 * the time the shard was added to the lru list.
+                 */
+                inode_unref(lru_inode);
+                inode_unlink(lru_inode, priv->dot_shard_inode, block_bname);
+                inode_forget(lru_inode, 0);
+            } else {
+                /* The following unref corresponds to the ref
+                 * held when the shard was added to fsync list.
+                 */
+                inode_unref(lru_inode);
+                /* Hand the evicted inode back to the caller for fsync. */
+                fsync_inode = lru_inode;
+                if (lru_base_inode)
+                    inode_unref(lru_base_inode);
+            }
+            /* The following unref corresponds to the ref
+             * held by inode_find() above.
+             */
+            inode_unref(lru_inode);
+
+            /* The following unref corresponds to the ref held on the base shard
+             * at the time of adding shard inode to lru list
+             */
+            if (lru_base_inode)
+                inode_unref(lru_base_inode);
+
+            /* For as long as an inode is in lru list, we try to
+             * keep it alive by holding a ref on it.
+             */
+            inode_ref(linked_inode);
+            if (base_inode)
+                gf_uuid_copy(ctx->base_gfid, base_inode->gfid);
+            else
+                gf_uuid_copy(ctx->base_gfid, gfid);
+            ctx->block_num = block_num;
+            ctx->base_inode = inode_ref(base_inode);
+            list_add_tail(&ctx->ilist, &priv->ilist_head);
+        }
+    } else {
+        /* If this is not the first time this inode is being operated on, move
+         * it to the most recently used end of the list.
+         */
+        list_move_tail(&ctx->ilist, &priv->ilist_head);
+    }
+    return fsync_inode;
+}
+
+/* Unwind @frame for @fop with a failure result: @op_ret and @op_errno are
+ * passed through and every remaining callback argument gets a placeholder
+ * (NULL, or -1 / 0 for the count and offset of readv and seek).
+ *
+ * An unrecognised @fop is only logged as a warning; the frame is not
+ * unwound in that case. Always returns 0.
+ */
+int
+shard_common_failure_unwind(glusterfs_fop_t fop, call_frame_t *frame,
+                            int32_t op_ret, int32_t op_errno)
+{
+    switch (fop) {
+        /* Callbacks with one trailing argument. */
+        case GF_FOP_SETXATTR:
+            SHARD_STACK_UNWIND(setxattr, frame, op_ret, op_errno, NULL);
+            break;
+        case GF_FOP_FSETXATTR:
+            SHARD_STACK_UNWIND(fsetxattr, frame, op_ret, op_errno, NULL);
+            break;
+        case GF_FOP_REMOVEXATTR:
+            SHARD_STACK_UNWIND(removexattr, frame, op_ret, op_errno, NULL);
+            break;
+        case GF_FOP_FREMOVEXATTR:
+            SHARD_STACK_UNWIND(fremovexattr, frame, op_ret, op_errno, NULL);
+            break;
+
+        /* Callbacks with two trailing arguments. */
+        case GF_FOP_STAT:
+            SHARD_STACK_UNWIND(stat, frame, op_ret, op_errno, NULL, NULL);
+            break;
+        case GF_FOP_FSTAT:
+            SHARD_STACK_UNWIND(fstat, frame, op_ret, op_errno, NULL, NULL);
+            break;
+        case GF_FOP_GETXATTR:
+            SHARD_STACK_UNWIND(getxattr, frame, op_ret, op_errno, NULL, NULL);
+            break;
+        case GF_FOP_FGETXATTR:
+            SHARD_STACK_UNWIND(fgetxattr, frame, op_ret, op_errno, NULL, NULL);
+            break;
+        case GF_FOP_SEEK:
+            SHARD_STACK_UNWIND(seek, frame, op_ret, op_errno, 0, NULL);
+            break;
+
+        /* Callbacks with three trailing arguments. */
+        case GF_FOP_TRUNCATE:
+            SHARD_STACK_UNWIND(truncate, frame, op_ret, op_errno, NULL, NULL,
+                               NULL);
+            break;
+        case GF_FOP_FTRUNCATE:
+            SHARD_STACK_UNWIND(ftruncate, frame, op_ret, op_errno, NULL, NULL,
+                               NULL);
+            break;
+        case GF_FOP_UNLINK:
+            SHARD_STACK_UNWIND(unlink, frame, op_ret, op_errno, NULL, NULL,
+                               NULL);
+            break;
+        case GF_FOP_WRITE:
+            SHARD_STACK_UNWIND(writev, frame, op_ret, op_errno, NULL, NULL,
+                               NULL);
+            break;
+        case GF_FOP_FALLOCATE:
+            SHARD_STACK_UNWIND(fallocate, frame, op_ret, op_errno, NULL, NULL,
+                               NULL);
+            break;
+        case GF_FOP_ZEROFILL:
+            SHARD_STACK_UNWIND(zerofill, frame, op_ret, op_errno, NULL, NULL,
+                               NULL);
+            break;
+        case GF_FOP_DISCARD:
+            SHARD_STACK_UNWIND(discard, frame, op_ret, op_errno, NULL, NULL,
+                               NULL);
+            break;
+        case GF_FOP_FSYNC:
+            SHARD_STACK_UNWIND(fsync, frame, op_ret, op_errno, NULL, NULL,
+                               NULL);
+            break;
+        case GF_FOP_SETATTR:
+            SHARD_STACK_UNWIND(setattr, frame, op_ret, op_errno, NULL, NULL,
+                               NULL);
+            break;
+        case GF_FOP_FSETATTR:
+            SHARD_STACK_UNWIND(fsetattr, frame, op_ret, op_errno, NULL, NULL,
+                               NULL);
+            break;
+
+        /* Callbacks with four or more trailing arguments. */
+        case GF_FOP_LOOKUP:
+            SHARD_STACK_UNWIND(lookup, frame, op_ret, op_errno, NULL, NULL,
+                               NULL, NULL);
+            break;
+        case GF_FOP_MKNOD:
+            SHARD_STACK_UNWIND(mknod, frame, op_ret, op_errno, NULL, NULL, NULL,
+                               NULL, NULL);
+            break;
+        case GF_FOP_LINK:
+            SHARD_STACK_UNWIND(link, frame, op_ret, op_errno, NULL, NULL, NULL,
+                               NULL, NULL);
+            break;
+        case GF_FOP_READ:
+            SHARD_STACK_UNWIND(readv, frame, op_ret, op_errno, NULL, -1, NULL,
+                               NULL, NULL);
+            break;
+        case GF_FOP_CREATE:
+            SHARD_STACK_UNWIND(create, frame, op_ret, op_errno, NULL, NULL,
+                               NULL, NULL, NULL, NULL);
+            break;
+        case GF_FOP_RENAME:
+            SHARD_STACK_UNWIND(rename, frame, op_ret, op_errno, NULL, NULL,
+                               NULL, NULL, NULL, NULL);
+            break;
+
+        default:
+            gf_msg(THIS->name, GF_LOG_WARNING, 0, SHARD_MSG_INVALID_FOP,
+                   "Invalid fop id = %d", fop);
+            break;
+    }
+
+    return 0;
+}
+
+/* Unwind @frame for one of the inode-write fops (writev, fallocate,
+ * zerofill, discard) with @op_ret and an errno of 0, passing the prebuf,
+ * postbuf and xattr_rsp kept in the frame's local.
+ *
+ * Any other @fop is only logged as a warning; the frame is not unwound in
+ * that case. Always returns 0.
+ */
+int
+shard_common_inode_write_success_unwind(glusterfs_fop_t fop,
+                                        call_frame_t *frame, int32_t op_ret)
+{
+    shard_local_t *local = frame->local;
+    struct iatt *pre_iatt = NULL;
+    struct iatt *post_iatt = NULL;
+    dict_t *rsp_xdata = NULL;
+
+    /* SHARD_STACK_UNWIND() itself checks local for NULL, which leads
+     * static analyzers to assume local may be NULL and to complain about
+     * direct member access in the calls below. Resolving the three
+     * arguments up front, guarded by the same check, avoids the warnings.
+     */
+    if (local) {
+        pre_iatt = &local->prebuf;
+        post_iatt = &local->postbuf;
+        rsp_xdata = local->xattr_rsp;
+    }
+
+    switch (fop) {
+        case GF_FOP_WRITE:
+            SHARD_STACK_UNWIND(writev, frame, op_ret, 0, pre_iatt, post_iatt,
+                               rsp_xdata);
+            break;
+        case GF_FOP_FALLOCATE:
+            SHARD_STACK_UNWIND(fallocate, frame, op_ret, 0, pre_iatt,
+                               post_iatt, rsp_xdata);
+            break;
+        case GF_FOP_ZEROFILL:
+            SHARD_STACK_UNWIND(zerofill, frame, op_ret, 0, pre_iatt, post_iatt,
+                               rsp_xdata);
+            break;
+        case GF_FOP_DISCARD:
+            SHARD_STACK_UNWIND(discard, frame, op_ret, 0, pre_iatt, post_iatt,
+                               rsp_xdata);
+            break;
+        default:
+            gf_msg(THIS->name, GF_LOG_WARNING, 0, SHARD_MSG_INVALID_FOP,
+                   "Invalid fop id = %d", fop);
+            break;
+    }
+
+    return 0;
+}
+
+/* Completion callback for the fsync wound by
+ * shard_initiate_evicted_inode_fsync(). @cookie is the anonymous fd that
+ * was used for the fsync.
+ *
+ * On success, and only if the shard's context is on neither the
+ * to_fsync_list nor the ilist any more, the shard inode is unlinked from
+ * priv->dot_shard_inode, unref'd and forgotten. In every case the
+ * anonymous fd (when non-NULL) is unref'd and the frame's stack is
+ * destroyed. Always returns 0.
+ */
+int
+shard_evicted_inode_fsync_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+                              int32_t op_ret, int32_t op_errno,
+                              struct iatt *prebuf, struct iatt *postbuf,
+                              dict_t *xdata)
+{
+    char block_bname[256] = {
+        0,
+    };
+    fd_t *anon_fd = cookie;
+    inode_t *shard_inode = NULL;
+    shard_inode_ctx_t *ctx = NULL;
+    shard_priv_t *priv = NULL;
+
+    priv = this->private;
+
+    /* NOTE(review): this path reports an fsync failure but logs it under
+     * SHARD_MSG_MEMALLOC_FAILED; SHARD_MSG_FOP_FAILED may be the better
+     * fit - confirm.
+     */
+    if (anon_fd == NULL || op_ret < 0) {
+        gf_msg(this->name, GF_LOG_WARNING, op_errno, SHARD_MSG_MEMALLOC_FAILED,
+               "fsync failed on shard");
+        goto out;
+    }
+    shard_inode = anon_fd->inode;
+
+    /* Lock order: priv->lock first, then the shard inode's lock. */
+    LOCK(&priv->lock);
+    LOCK(&shard_inode->lock);
+    {
+        /* NOTE(review): the return value is ignored and ctx is
+         * dereferenced unconditionally on the next line.
+         */
+        __shard_inode_ctx_get(shard_inode, this, &ctx);
+        if ((list_empty(&ctx->to_fsync_list)) && (list_empty(&ctx->ilist))) {
+            shard_make_block_bname(ctx->block_num, shard_inode->gfid,
+                                   block_bname, sizeof(block_bname));
+            inode_unlink(shard_inode, priv->dot_shard_inode, block_bname);
+            /* The following unref corresponds to the ref held by
+             * inode_link() at the time the shard was created or
+             * looked up
+             */
+            inode_unref(shard_inode);
+            inode_forget(shard_inode, 0);
+        }
+    }
+    UNLOCK(&shard_inode->lock);
+    UNLOCK(&priv->lock);
+
+out:
+    if (anon_fd)
+        fd_unref(anon_fd);
+    STACK_DESTROY(frame->root);
+    return 0;
+}
+
+/* Issue an fsync on @inode using a freshly created frame and an anonymous
+ * fd. The reply is delivered to shard_evicted_inode_fsync_cbk(), which gets
+ * the anonymous fd as its cookie.
+ *
+ * Returns 0 once the fsync has been wound, or -1 when either the frame or
+ * the anonymous fd could not be created.
+ */
+int
+shard_initiate_evicted_inode_fsync(xlator_t *this, inode_t *inode)
+{
+    call_frame_t *sync_frame = create_frame(this, this->ctx->pool);
+    fd_t *shard_fd = NULL;
+
+    if (sync_frame == NULL) {
+        gf_msg(this->name, GF_LOG_WARNING, ENOMEM, SHARD_MSG_MEMALLOC_FAILED,
+               "Failed to create new frame "
+               "to fsync shard");
+        return -1;
+    }
+
+    shard_fd = fd_anonymous(inode);
+    if (shard_fd == NULL) {
+        gf_msg(this->name, GF_LOG_WARNING, ENOMEM, SHARD_MSG_MEMALLOC_FAILED,
+               "Failed to create anon fd to"
+               " fsync shard");
+        /* Nothing was wound on this frame, so dispose of it here. */
+        STACK_DESTROY(sync_frame->root);
+        return -1;
+    }
+
+    /* The fd doubles as the cookie so the callback can unref it. */
+    STACK_WIND_COOKIE(sync_frame, shard_evicted_inode_fsync_cbk, shard_fd,
+                      FIRST_CHILD(this), FIRST_CHILD(this)->fops->fsync,
+                      shard_fd, 1, NULL);
+    return 0;
+}
+
+/* Forward declaration: used by the fallocate fast path in
+ * shard_common_resolve_shards() below.
+ */
+int
+shard_common_inode_write_post_lookup_shards_handler(call_frame_t *frame,
+                                                    xlator_t *this);
+
+/* Resolve, from the inode table, the shards a fop spans and store their
+ * inodes in local->inode_list.
+ *
+ * Walks block numbers from local->first_block up to
+ * (local->last_block - local->create_count). Block 0 is the base file and is
+ * filled from local->resolver_base_inode. Every other block is looked up by
+ * its absolute path; blocks that are not found are only counted in
+ * local->call_count. @post_res_handler is invoked at the end, also when
+ * local->op_ret is already negative or local->resolve_not is set (in which
+ * case nothing is resolved). Always returns 0.
+ */
+int
+shard_common_resolve_shards(call_frame_t *frame, xlator_t *this,
+                            shard_post_resolve_fop_handler_t post_res_handler)
+{
+    int i = -1;
+    uint32_t shard_idx_iter = 0;
+    char path[PATH_MAX] = {
+        0,
+    };
+    uuid_t gfid = {
+        0,
+    };
+    inode_t *inode = NULL;
+    inode_t *res_inode = NULL;
+    inode_t *fsync_inode = NULL;
+    shard_priv_t *priv = NULL;
+    shard_local_t *local = NULL;
+    uint64_t resolve_count = 0;
+
+    priv = this->private;
+    local = frame->local;
+    local->call_count = 0;
+    shard_idx_iter = local->first_block;
+    res_inode = local->resolver_base_inode;
+
+    if ((local->op_ret < 0) || (local->resolve_not))
+        goto out;
+
+    /* If this prealloc FOP is for fresh file creation, then the size of the
+     * file will be 0. Then there will be no shards associated with this file.
+     * So we can skip the lookup process for the shards which do not exist
+     * and directly issue mknod to create shards.
+     *
+     * In case the prealloc fop is to extend the preallocated file to bigger
+     * size then just lookup and populate inodes of existing shards and
+     * update the create count
+     */
+    if (local->fop == GF_FOP_FALLOCATE) {
+        if (!local->prebuf.ia_size) {
+            local->inode_list[0] = inode_ref(res_inode);
+            local->create_count = local->last_block;
+            shard_common_inode_write_post_lookup_shards_handler(frame, this);
+            return 0;
+        }
+        /* ia_size is non-zero here, so (ia_size - 1) cannot wrap. */
+        if (local->prebuf.ia_size < local->total_size)
+            local->create_count = local->last_block -
+                                  ((local->prebuf.ia_size - 1) /
+                                   local->block_size);
+    }
+
+    /* Highest block number that is looked up rather than created. */
+    resolve_count = local->last_block - local->create_count;
+
+    /* The shard paths are derived from the base file's gfid. */
+    if (res_inode)
+        gf_uuid_copy(gfid, res_inode->gfid);
+    else
+        gf_uuid_copy(gfid, local->base_gfid);
+
+    while (shard_idx_iter <= resolve_count) {
+        /* i indexes local->inode_list; it starts at -1 so the first
+         * visited block lands in slot 0.
+         */
+        i++;
+        if (shard_idx_iter == 0) {
+            local->inode_list[i] = inode_ref(res_inode);
+            shard_idx_iter++;
+            continue;
+        }
+
+        shard_make_block_abspath(shard_idx_iter, gfid, path, sizeof(path));
+
+        inode = NULL;
+        inode = inode_resolve(this->itable, path);
+        if (inode) {
+            gf_msg_debug(this->name, 0,
+                         "Shard %d already "
+                         "present. gfid=%s. Saving inode for future.",
+                         shard_idx_iter, uuid_utoa(inode->gfid));
+            local->inode_list[i] = inode;
+            /* Let the ref on the inodes that are already present
+             * in inode table still be held so that they don't get
+             * forgotten by the time the fop reaches the actual
+             * write stage.
+             */
+            LOCK(&priv->lock);
+            {
+                fsync_inode = __shard_update_shards_inode_list(
+                    inode, this, res_inode, shard_idx_iter, gfid);
+            }
+            UNLOCK(&priv->lock);
+            shard_idx_iter++;
+            /* A non-NULL return is an inode that needs an fsync; it is
+             * issued only after priv->lock has been released.
+             */
+            if (fsync_inode)
+                shard_initiate_evicted_inode_fsync(this, fsync_inode);
+            continue;
+        } else {
+            /* Not in the inode table: only count it here. */
+            local->call_count++;
+            shard_idx_iter++;
+        }
+    }
+out:
+    post_res_handler(frame, this);
+    return 0;
+}
+
+/* Callback for the (f)xattrop wound by shard_update_file_size().
+ *
+ * On failure the error is logged and copied into local. On success the
+ * returned @dict is used to refresh local->postbuf through
+ * shard_modify_size_and_block_count(); if that fails local is set to
+ * -1/ENOMEM. In every case local->post_update_size_handler is invoked.
+ * Always returns 0.
+ */
+int
+shard_update_file_size_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+                           int32_t op_ret, int32_t op_errno, dict_t *dict,
+                           dict_t *xdata)
+{
+    shard_local_t *local = frame->local;
+    inode_t *base_inode = NULL;
+
+    /* Prefer the fd's inode, fall back to the loc's inode. */
+    if (local->fd && local->fd->inode)
+        base_inode = local->fd->inode;
+    else if (local->loc.inode)
+        base_inode = local->loc.inode;
+
+    if (op_ret >= 0) {
+        if (shard_modify_size_and_block_count(&local->postbuf, dict)) {
+            local->op_ret = -1;
+            local->op_errno = ENOMEM;
+        }
+    } else {
+        gf_msg(this->name, GF_LOG_ERROR, op_errno,
+               SHARD_MSG_UPDATE_FILE_SIZE_FAILED,
+               "Update to file size"
+               " xattr failed on %s",
+               uuid_utoa(base_inode->gfid));
+        local->op_ret = op_ret;
+        local->op_errno = op_errno;
+    }
+
+    local->post_update_size_handler(frame, this);
+    return 0;
+}
+
+/* Allocate the four-slot int64 array used as the value of the shard
+ * file-size xattr and fill it in network byte order.
+ *
+ * @size goes into slot 0 and @block_count into slot 2; slots 1 and 3 stay
+ * zero. On success the array is handed back through @size_attr_p and 0 is
+ * returned. Returns -1 when @size_attr_p is NULL or the allocation fails.
+ */
+int
+shard_set_size_attrs(int64_t size, int64_t block_count, int64_t **size_attr_p)
+{
+    int64_t *attrs = NULL;
+
+    if (size_attr_p == NULL)
+        return -1;
+
+    attrs = GF_CALLOC(4, sizeof(int64_t), gf_shard_mt_int64_t);
+    if (attrs == NULL)
+        return -1;
+
+    /* As sharding evolves, it _may_ be necessary to embed more pieces of
+     * information within the same xattr. So allocating slots for them in
+     * advance. For now, only bytes 0-63 and 128-191 which would make up the
+     * current size and block count respectively of the file are valid.
+     */
+    attrs[0] = hton64(size);
+    attrs[2] = hton64(block_count);
+
+    *size_attr_p = attrs;
+    return 0;
+}
+
+/* Apply the size and block-count deltas accumulated in local to the base
+ * file's size xattr with an ADD_ARRAY64 xattrop.
+ *
+ * The xattrop is wound on @fd when it is given, otherwise on @loc, and its
+ * reply goes to shard_update_file_size_cbk(). When there is nothing to add,
+ * or when preparing the request fails (local is then set to -1/ENOMEM),
+ * @handler is called directly instead. Always returns 0.
+ */
+int
+shard_update_file_size(call_frame_t *frame, xlator_t *this, fd_t *fd,
+                       loc_t *loc, shard_post_update_size_fop_handler_t handler)
+{
+    shard_local_t *local = frame->local;
+    dict_t *req = NULL;
+    inode_t *base_inode = NULL;
+    int64_t *attrs = NULL;
+    int64_t blocks_delta = 0;
+    int rc = -1;
+
+    local->post_update_size_handler = handler;
+
+    req = dict_new();
+    if (req == NULL) {
+        local->op_ret = -1;
+        local->op_errno = ENOMEM;
+        goto skip;
+    }
+
+    base_inode = fd ? fd->inode : loc->inode;
+
+    /* Nothing to do when neither the size nor the block count changed. */
+    blocks_delta = GF_ATOMIC_GET(local->delta_blocks);
+    if ((blocks_delta == 0) && (local->delta_size + local->hole_size == 0))
+        goto skip;
+
+    rc = shard_set_size_attrs(local->delta_size + local->hole_size,
+                              blocks_delta, &attrs);
+    if (rc) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, SHARD_MSG_SIZE_SET_FAILED,
+               "Failed to set size attrs for %s",
+               uuid_utoa(base_inode->gfid));
+        local->op_ret = -1;
+        local->op_errno = ENOMEM;
+        goto skip;
+    }
+
+    rc = dict_set_bin(req, GF_XATTR_SHARD_FILE_SIZE, attrs, 8 * 4);
+    if (rc) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, SHARD_MSG_DICT_OP_FAILED,
+               "Failed to set key %s into dict. gfid=%s",
+               GF_XATTR_SHARD_FILE_SIZE, uuid_utoa(base_inode->gfid));
+        /* The array never made it into the dict; release it here. */
+        GF_FREE(attrs);
+        local->op_ret = -1;
+        local->op_errno = ENOMEM;
+        goto skip;
+    }
+
+    if (fd) {
+        STACK_WIND(frame, shard_update_file_size_cbk, FIRST_CHILD(this),
+                   FIRST_CHILD(this)->fops->fxattrop, fd,
+                   GF_XATTROP_ADD_ARRAY64, req, NULL);
+    } else {
+        STACK_WIND(frame, shard_update_file_size_cbk, FIRST_CHILD(this),
+                   FIRST_CHILD(this)->fops->xattrop, loc,
+                   GF_XATTROP_ADD_ARRAY64, req, NULL);
+    }
+
+    dict_unref(req);
+    return 0;
+
+skip:
+    if (req)
+        dict_unref(req);
+    handler(frame, this);
+    return 0;
+}
+
+/* Link @inode into the inode table as the internal directory selected by
+ * @type, bump its lookup count and cache the linked inode in the
+ * translator's private structure.
+ *
+ * GF_SHARD_DIR is linked under the table's root; GF_SHARD_REMOVE_ME_DIR is
+ * linked under priv->dot_shard_inode. Returns the inode handed back by
+ * inode_link(). @local is not used.
+ */
+static inode_t *
+shard_link_internal_dir_inode(shard_local_t *local, inode_t *inode,
+                              struct iatt *buf, shard_internal_dir_type_t type)
+{
+    shard_priv_t *priv = THIS->private;
+    inode_t **cache_slot = NULL;
+    inode_t *parent_dir = NULL;
+    inode_t *linked = NULL;
+    char *dir_name = NULL;
+
+    if (type == SHARD_INTERNAL_DIR_DOT_SHARD) {
+        dir_name = GF_SHARD_DIR;
+        cache_slot = &priv->dot_shard_inode;
+        parent_dir = inode->table->root;
+    } else if (type == SHARD_INTERNAL_DIR_DOT_SHARD_REMOVE_ME) {
+        dir_name = GF_SHARD_REMOVE_ME_DIR;
+        cache_slot = &priv->dot_shard_rm_inode;
+        parent_dir = priv->dot_shard_inode;
+    }
+
+    linked = inode_link(inode, parent_dir, dir_name, buf);
+    inode_lookup(linked);
+    *cache_slot = linked;
+    return linked;
+}
+
+/* Callback for the lookup wound by shard_refresh_internal_dir().
+ *
+ * @cookie carries the internal directory type. On success the inode is
+ * (re)linked and its ctx marked as refreshed; on failure the error is
+ * recorded in local. Shard resolution continues in both cases.
+ * Always returns 0.
+ */
+int
+shard_refresh_internal_dir_cbk(call_frame_t *frame, void *cookie,
+                               xlator_t *this, int32_t op_ret, int32_t op_errno,
+                               inode_t *inode, struct iatt *buf, dict_t *xdata,
+                               struct iatt *postparent)
+{
+    shard_local_t *local = frame->local;
+    shard_internal_dir_type_t dir_type = (shard_internal_dir_type_t)cookie;
+    inode_t *dir_inode = NULL;
+
+    if (op_ret == 0) {
+        /* To-Do: Fix refcount increment per call to
+         * shard_link_internal_dir_inode().
+         */
+        dir_inode = shard_link_internal_dir_inode(local, inode, buf, dir_type);
+        shard_inode_ctx_mark_dir_refreshed(dir_inode, this);
+    } else {
+        local->op_ret = op_ret;
+        local->op_errno = op_errno;
+    }
+
+    shard_common_resolve_shards(frame, this, local->post_res_handler);
+    return 0;
+}
+
+/* Refresh the internal directory selected by @type if its inode ctx says a
+ * lookup is needed.
+ *
+ * The directory's inode is fetched from this->itable by gfid. When no lookup
+ * is needed, local->op_ret is set to 0 and shard resolution continues right
+ * away; otherwise a gfid-based lookup is wound and resolution continues from
+ * shard_refresh_internal_dir_cbk(). Always returns 0.
+ */
+int
+shard_refresh_internal_dir(call_frame_t *frame, xlator_t *this,
+                           shard_internal_dir_type_t type)
+{
+    loc_t loc = {
+        0,
+    };
+    inode_t *inode = NULL;
+    shard_priv_t *priv = NULL;
+    shard_local_t *local = NULL;
+    uuid_t gfid = {
+        0,
+    };
+
+    local = frame->local;
+    priv = this->private;
+
+    switch (type) {
+        case SHARD_INTERNAL_DIR_DOT_SHARD:
+            gf_uuid_copy(gfid, priv->dot_shard_gfid);
+            break;
+        case SHARD_INTERNAL_DIR_DOT_SHARD_REMOVE_ME:
+            gf_uuid_copy(gfid, priv->dot_shard_rm_gfid);
+            break;
+        default:
+            break;
+    }
+
+    inode = inode_find(this->itable, gfid);
+
+    if (!shard_inode_ctx_needs_lookup(inode, this)) {
+        local->op_ret = 0;
+        goto out;
+    }
+
+    /* Plain assignment because the ref is already taken above through
+     * call to inode_find(). From here on the ref belongs to loc and is
+     * given up by loc_wipe() below.
+     */
+    loc.inode = inode;
+    gf_uuid_copy(loc.gfid, gfid);
+
+    STACK_WIND_COOKIE(frame, shard_refresh_internal_dir_cbk, (void *)(long)type,
+                      FIRST_CHILD(this), FIRST_CHILD(this)->fops->lookup, &loc,
+                      NULL);
+    loc_wipe(&loc);
+
+    return 0;
+
+out:
+    /* This path never hands the inode to loc, so the ref taken by
+     * inode_find() has to be dropped explicitly or it would leak on every
+     * call that finds the directory already refreshed.
+     */
+    if (inode)
+        inode_unref(inode);
+    shard_common_resolve_shards(frame, this, local->post_res_handler);
+    return 0;
+}
+
+/* Callback for the lookup wound by shard_lookup_internal_dir().
+ *
+ * @cookie carries the internal directory type. A failed lookup, or an entry
+ * that is not a directory (reported as EIO), is recorded in local and the
+ * post-resolve handler is called directly. Otherwise the inode is linked;
+ * if the table returned a different inode object than the one looked up, a
+ * refresh is started, else the ctx is marked refreshed and shard resolution
+ * continues. Always returns 0.
+ */
+int
+shard_lookup_internal_dir_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+                              int32_t op_ret, int32_t op_errno, inode_t *inode,
+                              struct iatt *buf, dict_t *xdata,
+                              struct iatt *postparent)
+{
+    shard_local_t *local = frame->local;
+    shard_internal_dir_type_t dir_type = (shard_internal_dir_type_t)cookie;
+    inode_t *linked = NULL;
+
+    if (op_ret) {
+        local->op_ret = op_ret;
+        local->op_errno = op_errno;
+        goto fail;
+    }
+
+    if (!IA_ISDIR(buf->ia_type)) {
+        gf_msg(this->name, GF_LOG_CRITICAL, 0, SHARD_MSG_DOT_SHARD_NODIR,
+               "%s already exists and "
+               "is not a directory. Please remove it from all bricks "
+               "and try again",
+               shard_internal_dir_string(dir_type));
+        local->op_ret = -1;
+        local->op_errno = EIO;
+        goto fail;
+    }
+
+    linked = shard_link_internal_dir_inode(local, inode, buf, dir_type);
+    if (linked == inode) {
+        shard_inode_ctx_mark_dir_refreshed(linked, this);
+        shard_common_resolve_shards(frame, this, local->post_res_handler);
+    } else {
+        shard_refresh_internal_dir(frame, this, dir_type);
+    }
+    return 0;
+
+fail:
+    local->post_res_handler(frame, this);
+    return 0;
+}
+
+/* Look up the internal directory selected by @type, asking the lower layers
+ * to use the directory's well-known gfid through the "gfid-req" key.
+ *
+ * @post_res_handler is remembered in local->post_res_handler. The lookup
+ * reply is handled by shard_lookup_internal_dir_cbk(). If the request cannot
+ * be prepared, local is set to -1/ENOMEM and @post_res_handler is called
+ * directly. Always returns 0.
+ */
+int
+shard_lookup_internal_dir(call_frame_t *frame, xlator_t *this,
+                          shard_post_resolve_fop_handler_t post_res_handler,
+                          shard_internal_dir_type_t type)
+{
+    int ret = -1;
+    dict_t *xattr_req = NULL;
+    shard_priv_t *priv = NULL;
+    shard_local_t *local = NULL;
+    uuid_t *gfid = NULL;
+    loc_t *loc = NULL;
+    gf_boolean_t free_gfid = _gf_true;
+
+    local = frame->local;
+    priv = this->private;
+    local->post_res_handler = post_res_handler;
+
+    gfid = GF_MALLOC(sizeof(uuid_t), gf_common_mt_uuid_t);
+    if (!gfid) {
+        /* Record the failure like every other error path here does;
+         * otherwise the handler would run with whatever op_ret local
+         * already held and could treat the lookup as successful.
+         */
+        local->op_ret = -1;
+        local->op_errno = ENOMEM;
+        goto err;
+    }
+
+    xattr_req = dict_new();
+    if (!xattr_req) {
+        local->op_ret = -1;
+        local->op_errno = ENOMEM;
+        goto err;
+    }
+
+    switch (type) {
+        case SHARD_INTERNAL_DIR_DOT_SHARD:
+            gf_uuid_copy(*gfid, priv->dot_shard_gfid);
+            loc = &local->dot_shard_loc;
+            break;
+        case SHARD_INTERNAL_DIR_DOT_SHARD_REMOVE_ME:
+            gf_uuid_copy(*gfid, priv->dot_shard_rm_gfid);
+            loc = &local->dot_shard_rm_loc;
+            break;
+        default:
+            /* NOTE(review): loc stays NULL for an unknown type and is
+             * still passed to the lookup below - confirm callers only
+             * ever pass the two types handled above.
+             */
+            bzero(*gfid, sizeof(uuid_t));
+            break;
+    }
+
+    ret = dict_set_gfuuid(xattr_req, "gfid-req", *gfid, false);
+    if (ret) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, SHARD_MSG_DICT_OP_FAILED,
+               "Failed to set gfid of %s into dict",
+               shard_internal_dir_string(type));
+        local->op_ret = -1;
+        local->op_errno = ENOMEM;
+        goto err;
+    } else {
+        /* The buffer is now stored in the dict; it must no longer be
+         * freed by this function.
+         */
+        free_gfid = _gf_false;
+    }
+
+    STACK_WIND_COOKIE(frame, shard_lookup_internal_dir_cbk, (void *)(long)type,
+                      FIRST_CHILD(this), FIRST_CHILD(this)->fops->lookup, loc,
+                      xattr_req);
+
+    dict_unref(xattr_req);
+    return 0;
+
+err:
+    if (xattr_req)
+        dict_unref(xattr_req);
+    if (free_gfid)
+        GF_FREE(gfid);
+    post_res_handler(frame, this);
+    return 0;
+}
+
+/* Populate the shard inode ctx of @inode from a lookup reply.
+ *
+ * When no block size is stored in the ctx yet (fresh lookup), the block size
+ * is taken from GF_XATTR_SHARD_BLOCK_SIZE in @xdata if present (0 otherwise)
+ * and stored. When the resulting block size is non-zero, the remaining
+ * lookup attributes from @buf are stored as well and the ctx is invalidated.
+ */
+static void
+shard_inode_ctx_update(inode_t *inode, xlator_t *this, dict_t *xdata,
+                       struct iatt *buf)
+{
+    uint64_t block_size = 0;
+    void *raw_bsize = NULL;
+
+    if (shard_inode_ctx_get_block_size(inode, this, &block_size)) {
+        /* Fresh lookup */
+        if (dict_get_ptr(xdata, GF_XATTR_SHARD_BLOCK_SIZE, &raw_bsize) == 0)
+            block_size = ntoh64(*((uint64_t *)raw_bsize));
+
+        /* If the file is sharded, set its block size, otherwise just
+         * set 0.
+         */
+        shard_inode_ctx_set(inode, this, buf, block_size,
+                            SHARD_MASK_BLOCK_SIZE);
+    }
+
+    if (!block_size)
+        return;
+
+    /* If the file is sharded, also set the remaining attributes,
+     * except for ia_size and ia_blocks.
+     */
+    shard_inode_ctx_set(inode, this, buf, 0, SHARD_LOOKUP_MASK);
+    (void)shard_inode_ctx_invalidate(inode, this, buf);
+}
+
+/* Forward declarations: the synctask body and its completion callback used
+ * by shard_start_background_deletion() below.
+ */
+int
+shard_delete_shards(void *opaque);
+
+int
+shard_delete_shards_cbk(int ret, call_frame_t *frame, void *data);
+
+/* Launch the background shard-deletion synctask unless one is already
+ * launching or running.
+ *
+ * Returns 0 when the task was started or when another caller is already
+ * responsible for it, and a negative value when the frame or the synctask
+ * could not be created; in that case priv->bg_del_state is reset to
+ * SHARD_BG_DELETION_NONE.
+ */
+int
+shard_start_background_deletion(xlator_t *this)
+{
+    int ret = 0;
+    gf_boolean_t i_cleanup = _gf_true;
+    shard_priv_t *priv = NULL;
+    call_frame_t *cleanup_frame = NULL;
+
+    priv = this->private;
+
+    /* Decide under priv->lock whether this caller is the one that starts
+     * the task.
+     */
+    LOCK(&priv->lock);
+    {
+        switch (priv->bg_del_state) {
+            case SHARD_BG_DELETION_NONE:
+                i_cleanup = _gf_true;
+                priv->bg_del_state = SHARD_BG_DELETION_LAUNCHING;
+                break;
+            case SHARD_BG_DELETION_LAUNCHING:
+                i_cleanup = _gf_false;
+                break;
+            case SHARD_BG_DELETION_IN_PROGRESS:
+                /* A task is already running: do not start another one,
+                 * but move the state back to LAUNCHING.
+                 * NOTE(review): presumably this tells the running task
+                 * to do another pass - confirm in shard_delete_shards().
+                 */
+                priv->bg_del_state = SHARD_BG_DELETION_LAUNCHING;
+                i_cleanup = _gf_false;
+                break;
+            default:
+                break;
+        }
+    }
+    UNLOCK(&priv->lock);
+    if (!i_cleanup)
+        return 0;
+
+    cleanup_frame = create_frame(this, this->ctx->pool);
+    if (!cleanup_frame) {
+        gf_msg(this->name, GF_LOG_WARNING, ENOMEM, SHARD_MSG_MEMALLOC_FAILED,
+               "Failed to create "
+               "new frame to delete shards");
+        ret = -ENOMEM;
+        goto err;
+    }
+
+    set_lk_owner_from_ptr(&cleanup_frame->root->lk_owner, cleanup_frame->root);
+
+    /* The frame is passed both as the callback's frame and as the opaque
+     * argument of the task.
+     */
+    ret = synctask_new(this->ctx->env, shard_delete_shards,
+                       shard_delete_shards_cbk, cleanup_frame, cleanup_frame);
+    if (ret < 0) {
+        gf_msg(this->name, GF_LOG_WARNING, errno,
+               SHARD_MSG_SHARDS_DELETION_FAILED,
+               "failed to create task to do background "
+               "cleanup of shards");
+        STACK_DESTROY(cleanup_frame->root);
+        goto err;
+    }
+    return 0;
+
+err:
+    /* Launch failed: reset the state so a later call can try again. */
+    LOCK(&priv->lock);
+    {
+        priv->bg_del_state = SHARD_BG_DELETION_NONE;
+    }
+    UNLOCK(&priv->lock);
+    return ret;
+}
+
+/* Callback for the lookup wound by shard_lookup().
+ *
+ * For successful lookups of non-directories it patches size and block count
+ * in @buf from the file-size xattr (unless the client is gsyncd), updates
+ * the shard inode ctx, and - on the first such lookup seen by this
+ * translator instance - starts background deletion of shards. The lookup
+ * result is always unwound to the parent. Always returns 0.
+ */
+int
+shard_lookup_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+                 int32_t op_ret, int32_t op_errno, inode_t *inode,
+                 struct iatt *buf, dict_t *xdata, struct iatt *postparent)
+{
+    shard_priv_t *priv = this->private;
+    gf_boolean_t first_lookup = _gf_false;
+
+    /* Failures and directories are passed through untouched. */
+    if ((op_ret < 0) || IA_ISDIR(buf->ia_type))
+        goto unwind;
+
+    /* Also, if the file is sharded, get the file size and block cnt xattr,
+     * and store them in the stbuf appropriately.
+     */
+    if (dict_get(xdata, GF_XATTR_SHARD_FILE_SIZE) &&
+        frame->root->pid != GF_CLIENT_PID_GSYNCD) {
+        shard_modify_size_and_block_count(buf, xdata);
+    }
+
+    /* If this was a fresh lookup, there are two possibilities:
+     * 1) If the file is sharded (indicated by the presence of block size
+     *    xattr), store this block size, along with rdev and mode in its
+     *    inode ctx.
+     * 2) If the file is not sharded, store size along with rdev and mode
+     *    (which are anyway don't cares) in inode ctx. Since @ctx_tmp is
+     *    already initialised to all zeroes, nothing more needs to be done.
+     */
+    shard_inode_ctx_update(inode, this, xdata, buf);
+
+    /* Only the caller that flips first_lookup_done starts the cleanup. */
+    LOCK(&priv->lock);
+    {
+        if (!priv->first_lookup_done) {
+            priv->first_lookup_done = _gf_true;
+            first_lookup = _gf_true;
+        }
+    }
+    UNLOCK(&priv->lock);
+
+    if (first_lookup && (shard_start_background_deletion(this) < 0)) {
+        /* Could not start it: clear the flag again. */
+        LOCK(&priv->lock);
+        {
+            priv->first_lookup_done = _gf_false;
+        }
+        UNLOCK(&priv->lock);
+    }
+
+unwind:
+    SHARD_STACK_UNWIND(lookup, frame, op_ret, op_errno, inode, buf, xdata,
+                       postparent);
+    return 0;
+}
+
+/* lookup fop of the shard translator.
+ *
+ * Records the inode table in this->itable, runs SHARD_ENTRY_FOP_CHECK for
+ * clients other than gsyncd and glfs-heal, and then winds the lookup with a
+ * request dict that additionally asks for the shard block size (when it is
+ * not in the inode ctx yet) and, unless the client is gsyncd, the shard file
+ * size. GF_CONTENT_KEY is removed from the caller's dict if present.
+ * Failures are unwound through shard_common_failure_unwind().
+ * Always returns 0.
+ */
+int
+shard_lookup(call_frame_t *frame, xlator_t *this, loc_t *loc, dict_t *xattr_req)
+{
+    shard_local_t *local = NULL;
+    uint64_t block_size = 0;
+    int32_t op_errno = ENOMEM;
+    int ret = -1;
+
+    this->itable = loc->inode->table;
+
+    if (!((frame->root->pid == GF_CLIENT_PID_GSYNCD) ||
+          (frame->root->pid == GF_CLIENT_PID_GLFS_HEAL))) {
+        SHARD_ENTRY_FOP_CHECK(loc, op_errno, err);
+    }
+
+    local = mem_get0(this->local_pool);
+    if (local == NULL)
+        goto err;
+
+    frame->local = local;
+    loc_copy(&local->loc, loc);
+
+    /* Reuse the caller's dict when there is one, else start a new one. */
+    if (xattr_req)
+        local->xattr_req = dict_ref(xattr_req);
+    else
+        local->xattr_req = dict_new();
+    if (local->xattr_req == NULL)
+        goto err;
+
+    /* Block size not known from the inode ctx: request it. */
+    if (shard_inode_ctx_get_block_size(loc->inode, this, &block_size)) {
+        ret = dict_set_uint64(local->xattr_req, GF_XATTR_SHARD_BLOCK_SIZE, 0);
+        if (ret != 0) {
+            gf_msg(this->name, GF_LOG_WARNING, 0, SHARD_MSG_DICT_OP_FAILED,
+                   "Failed to set dict"
+                   " value: key:%s for path %s",
+                   GF_XATTR_SHARD_BLOCK_SIZE, loc->path);
+            goto err;
+        }
+    }
+
+    if (frame->root->pid != GF_CLIENT_PID_GSYNCD) {
+        ret = dict_set_uint64(local->xattr_req, GF_XATTR_SHARD_FILE_SIZE,
+                              8 * 4);
+        if (ret != 0) {
+            gf_msg(this->name, GF_LOG_WARNING, 0, SHARD_MSG_DICT_OP_FAILED,
+                   "Failed to set dict value: key:%s for path %s.",
+                   GF_XATTR_SHARD_FILE_SIZE, loc->path);
+            goto err;
+        }
+    }
+
+    if (xattr_req) {
+        if (dict_get(xattr_req, GF_CONTENT_KEY))
+            dict_del(xattr_req, GF_CONTENT_KEY);
+    }
+
+    STACK_WIND(frame, shard_lookup_cbk, FIRST_CHILD(this),
+               FIRST_CHILD(this)->fops->lookup, loc, local->xattr_req);
+    return 0;
+
+err:
+    shard_common_failure_unwind(GF_FOP_LOOKUP, frame, -1, op_errno);
+    return 0;
+}
+
+/* Common tail of the base-file lookup/fstat callbacks.
+ *
+ * On success, copies @buf into local->prebuf, patches its size and block
+ * count from @xdata and stores the result in the inode ctx of @inode (with
+ * SHARD_ALL_MASK when the ctx could not be read, SHARD_INODE_WRITE_MASK
+ * otherwise, plus SHARD_MASK_REFRESH_RESET). Any failure is recorded in
+ * local->op_ret/op_errno. local->handler is invoked in every case.
+ * Always returns 0.
+ */
+int
+shard_set_iattr_invoke_post_handler(call_frame_t *frame, xlator_t *this,
+                                    inode_t *inode, int32_t op_ret,
+                                    int32_t op_errno, struct iatt *buf,
+                                    dict_t *xdata)
+{
+    int ret = -1;
+    int32_t mask = SHARD_INODE_WRITE_MASK;
+    shard_local_t *local = frame->local;
+    shard_inode_ctx_t ctx = {
+        0,
+    };
+
+    if (op_ret < 0) {
+        gf_msg(this->name, GF_LOG_ERROR, op_errno,
+               SHARD_MSG_BASE_FILE_LOOKUP_FAILED,
+               "Lookup on base file"
+               " failed : %s",
+               uuid_utoa(inode->gfid));
+        local->op_ret = op_ret;
+        local->op_errno = op_errno;
+        goto unwind;
+    }
+
+    local->prebuf = *buf;
+    if (shard_modify_size_and_block_count(&local->prebuf, xdata)) {
+        local->op_ret = -1;
+        local->op_errno = EINVAL;
+        goto unwind;
+    }
+
+    /* No usable ctx yet: write every field instead of only the
+     * inode-write subset.
+     */
+    if (shard_inode_ctx_get_all(inode, this, &ctx))
+        mask = SHARD_ALL_MASK;
+
+    ret = shard_inode_ctx_set(inode, this, &local->prebuf, 0,
+                              (mask | SHARD_MASK_REFRESH_RESET));
+    if (ret) {
+        /* gf_msg() takes the errnum before the message id; the two were
+         * previously passed the other way round.
+         */
+        gf_msg(this->name, GF_LOG_ERROR, 0, SHARD_MSG_INODE_CTX_SET_FAILED,
+               "Failed to set inode"
+               " write params into inode ctx for %s",
+               uuid_utoa(buf->ia_gfid));
+        local->op_ret = -1;
+        local->op_errno = ENOMEM;
+        goto unwind;
+    }
+
+unwind:
+    local->handler(frame, this);
+    return 0;
+}
+
+/* fstat callback used by shard_refresh_base_file(): forwards the reply to
+ * shard_set_iattr_invoke_post_handler() together with the inode of the fd
+ * stored in local. Always returns 0.
+ */
+int
+shard_fstat_base_file_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+                          int32_t op_ret, int32_t op_errno, struct iatt *buf,
+                          dict_t *xdata)
+{
+    inode_t *base_inode = ((shard_local_t *)frame->local)->fd->inode;
+
+    shard_set_iattr_invoke_post_handler(frame, this, base_inode, op_ret,
+                                        op_errno, buf, xdata);
+    return 0;
+}
+
+/* lookup callback used by shard_refresh_base_file(): forwards the reply to
+ * shard_set_iattr_invoke_post_handler(). Always returns 0.
+ */
+int
+shard_lookup_base_file_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+                           int32_t op_ret, int32_t op_errno, inode_t *inode,
+                           struct iatt *buf, dict_t *xdata,
+                           struct iatt *postparent)
+{
+    shard_local_t *local = frame->local;
+
+    /* When op_ret < 0 (e.g. op_errno = ENOENT) the inode passed to this
+     * callback is NULL, so the inode saved in local->loc is used instead
+     * of the @inode argument. On success both refer to the same inode, so
+     * ignoring the argument makes no difference.
+     */
+    shard_set_iattr_invoke_post_handler(frame, this, local->loc.inode, op_ret,
+                                        op_errno, buf, xdata);
+    return 0;
+}
+
+/* This function decides whether to make file based lookup or
+ * fd based lookup (fstat) depending on the 3rd and 4th arg.
+ * If fd != NULL and loc == NULL then call is for fstat
+ * If fd == NULL and loc != NULL then call is for file based
+ * lookup. Please pass args based on the requirement.
+ *
+ * @handler is stored in local->handler. When the inode ctx cache can serve
+ * local->prebuf and does not ask for a refresh, or when the request dict
+ * cannot be set up, @handler is called directly; otherwise it is invoked
+ * from the fstat/lookup callback. Always returns 0.
+ */
+int
+shard_refresh_base_file(call_frame_t *frame, xlator_t *this, loc_t *loc,
+                        fd_t *fd, shard_post_fop_handler_t handler)
+{
+    int ret = -1;
+    inode_t *inode = NULL;
+    shard_local_t *local = NULL;
+    dict_t *xattr_req = NULL;
+    gf_boolean_t need_refresh = _gf_false;
+
+    local = frame->local;
+    local->handler = handler;
+    inode = fd ? fd->inode : loc->inode;
+
+    ret = shard_inode_ctx_fill_iatt_from_cache(inode, this, &local->prebuf,
+                                               &need_refresh);
+    /* By this time, inode ctx should have been created either in create,
+     * mknod, readdirp or lookup. If not it is a bug!
+     */
+    if ((ret == 0) && (need_refresh == _gf_false)) {
+        /* The two sentences used to run together right after the gfid;
+         * a separator keeps the debug line readable.
+         */
+        gf_msg_debug(this->name, 0,
+                     "Skipping lookup on base file: %s. "
+                     "Serving prebuf off the inode ctx cache",
+                     uuid_utoa(inode->gfid));
+        goto out;
+    }
+
+    xattr_req = dict_new();
+    if (!xattr_req) {
+        local->op_ret = -1;
+        local->op_errno = ENOMEM;
+        goto out;
+    }
+
+    SHARD_MD_READ_FOP_INIT_REQ_DICT(this, xattr_req, inode->gfid, local, out);
+
+    if (fd)
+        STACK_WIND(frame, shard_fstat_base_file_cbk, FIRST_CHILD(this),
+                   FIRST_CHILD(this)->fops->fstat, fd, xattr_req);
+    else
+        STACK_WIND(frame, shard_lookup_base_file_cbk, FIRST_CHILD(this),
+                   FIRST_CHILD(this)->fops->lookup, loc, xattr_req);
+
+    dict_unref(xattr_req);
+    return 0;
+
+out:
+    if (xattr_req)
+        dict_unref(xattr_req);
+    handler(frame, this);
+    return 0;
+}
+
+/* Final step of fstat: on success store local->prebuf in the inode ctx of
+ * the fd's inode, then unwind fstat with the result held in local.
+ * Always returns 0.
+ */
+int
+shard_post_fstat_handler(call_frame_t *frame, xlator_t *this)
+{
+    shard_local_t *local = frame->local;
+
+    if (!(local->op_ret < 0)) {
+        shard_inode_ctx_set(local->fd->inode, this, &local->prebuf, 0,
+                            SHARD_LOOKUP_MASK);
+    }
+
+    SHARD_STACK_UNWIND(fstat, frame, local->op_ret, local->op_errno,
+                       &local->prebuf, local->xattr_rsp);
+    return 0;
+}
+
+/* Final step of stat: on success store local->prebuf in the inode ctx of
+ * the loc's inode, then unwind stat with the result held in local.
+ * Always returns 0.
+ */
+int
+shard_post_stat_handler(call_frame_t *frame, xlator_t *this)
+{
+    shard_local_t *local = frame->local;
+
+    if (!(local->op_ret < 0)) {
+        shard_inode_ctx_set(local->loc.inode, this, &local->prebuf, 0,
+                            SHARD_LOOKUP_MASK);
+    }
+
+    SHARD_STACK_UNWIND(stat, frame, local->op_ret, local->op_errno,
+                       &local->prebuf, local->xattr_rsp);
+    return 0;
+}
+
+/* Shared callback for the stat and fstat wound by shard_stat() and
+ * shard_fstat().
+ *
+ * On success @buf is copied into local->prebuf, its size and block count are
+ * patched from @xdata, a ref on @xdata is kept in local->xattr_rsp and the
+ * inode ctx is invalidated. Failures are recorded in local. local->handler
+ * is invoked in every case. Always returns 0.
+ */
+int
+shard_common_stat_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+                      int32_t op_ret, int32_t op_errno, struct iatt *buf,
+                      dict_t *xdata)
+{
+    shard_local_t *local = frame->local;
+    inode_t *target = NULL;
+
+    if (op_ret < 0) {
+        gf_msg(this->name, GF_LOG_ERROR, op_errno, SHARD_MSG_STAT_FAILED,
+               "stat failed: %s",
+               uuid_utoa(local->fd ? local->fd->inode->gfid
+                                   : local->loc.inode->gfid));
+        local->op_ret = op_ret;
+        local->op_errno = op_errno;
+        goto done;
+    }
+
+    local->prebuf = *buf;
+    if (shard_modify_size_and_block_count(&local->prebuf, xdata)) {
+        local->op_ret = -1;
+        local->op_errno = EINVAL;
+        goto done;
+    }
+    local->xattr_rsp = dict_ref(xdata);
+
+    /* stat fills local->loc, fstat fills local->fd. */
+    target = local->loc.inode ? local->loc.inode : local->fd->inode;
+    shard_inode_ctx_invalidate(target, this, &local->prebuf);
+
+done:
+    local->handler(frame, this);
+    return 0;
+}
+
+/* stat fop of the shard translator.
+ *
+ * Directories, symlinks, files whose block size in the inode ctx is 0 and
+ * requests from gsyncd are wound straight down with default_stat_cbk.
+ * Everything else is wound with a request dict prepared by
+ * SHARD_MD_READ_FOP_INIT_REQ_DICT and answered through
+ * shard_common_stat_cbk() / shard_post_stat_handler(). Setup failures are
+ * unwound with ENOMEM. Always returns 0.
+ */
+int
+shard_stat(call_frame_t *frame, xlator_t *this, loc_t *loc, dict_t *xdata)
+{
+    shard_local_t *local = NULL;
+    uint64_t block_size = 0;
+    int ret = -1;
+
+    if (IA_ISDIR(loc->inode->ia_type) || IA_ISLNK(loc->inode->ia_type))
+        goto passthrough;
+
+    ret = shard_inode_ctx_get_block_size(loc->inode, this, &block_size);
+    if (ret != 0) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, SHARD_MSG_INODE_CTX_GET_FAILED,
+               "Failed to get block "
+               "size from inode ctx of %s",
+               uuid_utoa(loc->inode->gfid));
+        goto err;
+    }
+
+    if ((block_size == 0) || (frame->root->pid == GF_CLIENT_PID_GSYNCD))
+        goto passthrough;
+
+    local = mem_get0(this->local_pool);
+    if (local == NULL)
+        goto err;
+
+    frame->local = local;
+    local->handler = shard_post_stat_handler;
+    loc_copy(&local->loc, loc);
+
+    if (xdata)
+        local->xattr_req = dict_ref(xdata);
+    else
+        local->xattr_req = dict_new();
+    if (local->xattr_req == NULL)
+        goto err;
+
+    SHARD_MD_READ_FOP_INIT_REQ_DICT(this, local->xattr_req, local->loc.gfid,
+                                    local, err);
+
+    STACK_WIND(frame, shard_common_stat_cbk, FIRST_CHILD(this),
+               FIRST_CHILD(this)->fops->stat, loc, local->xattr_req);
+    return 0;
+
+passthrough:
+    STACK_WIND(frame, default_stat_cbk, FIRST_CHILD(this),
+               FIRST_CHILD(this)->fops->stat, loc, xdata);
+    return 0;
+
+err:
+    shard_common_failure_unwind(GF_FOP_STAT, frame, -1, ENOMEM);
+    return 0;
+}
+
+/* fstat fop of the shard translator.
+ *
+ * Directories, symlinks, files whose block size in the inode ctx is 0 and
+ * requests from gsyncd are wound straight down with default_fstat_cbk.
+ * Everything else is wound with a request dict prepared by
+ * SHARD_MD_READ_FOP_INIT_REQ_DICT and answered through
+ * shard_common_stat_cbk() / shard_post_fstat_handler(). this->itable is
+ * initialised from the fd's inode if it is still unset. Setup failures are
+ * unwound with ENOMEM. Always returns 0.
+ */
+int
+shard_fstat(call_frame_t *frame, xlator_t *this, fd_t *fd, dict_t *xdata)
+{
+    shard_local_t *local = NULL;
+    uint64_t block_size = 0;
+    int ret = -1;
+
+    if (IA_ISDIR(fd->inode->ia_type) || IA_ISLNK(fd->inode->ia_type))
+        goto passthrough;
+
+    ret = shard_inode_ctx_get_block_size(fd->inode, this, &block_size);
+    if (ret != 0) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, SHARD_MSG_INODE_CTX_GET_FAILED,
+               "Failed to get block "
+               "size from inode ctx of %s",
+               uuid_utoa(fd->inode->gfid));
+        goto err;
+    }
+
+    if ((block_size == 0) || (frame->root->pid == GF_CLIENT_PID_GSYNCD))
+        goto passthrough;
+
+    if (this->itable == NULL)
+        this->itable = fd->inode->table;
+
+    local = mem_get0(this->local_pool);
+    if (local == NULL)
+        goto err;
+
+    frame->local = local;
+    local->handler = shard_post_fstat_handler;
+    local->fd = fd_ref(fd);
+
+    if (xdata)
+        local->xattr_req = dict_ref(xdata);
+    else
+        local->xattr_req = dict_new();
+    if (local->xattr_req == NULL)
+        goto err;
+
+    SHARD_MD_READ_FOP_INIT_REQ_DICT(this, local->xattr_req, fd->inode->gfid,
+                                    local, err);
+
+    STACK_WIND(frame, shard_common_stat_cbk, FIRST_CHILD(this),
+               FIRST_CHILD(this)->fops->fstat, fd, local->xattr_req);
+    return 0;
+
+passthrough:
+    STACK_WIND(frame, default_fstat_cbk, FIRST_CHILD(this),
+               FIRST_CHILD(this)->fops->fstat, fd, xdata);
+    return 0;
+
+err:
+    shard_common_failure_unwind(GF_FOP_FSTAT, frame, -1, ENOMEM);
+    return 0;
+}
+
+/*
+ * Final step of a truncate/ftruncate: unwind the fop recorded in local->fop
+ * with the op_ret/op_errno and pre/post iatts accumulated in the local.
+ * Always returns 0.
+ */
+int
+shard_post_update_size_truncate_handler(call_frame_t *frame, xlator_t *this)
+{
+    shard_local_t *local = frame->local;
+
+    if (local->fop != GF_FOP_TRUNCATE) {
+        SHARD_STACK_UNWIND(ftruncate, frame, local->op_ret, local->op_errno,
+                           &local->prebuf, &local->postbuf, NULL);
+        return 0;
+    }
+
+    SHARD_STACK_UNWIND(truncate, frame, local->op_ret, local->op_errno,
+                       &local->prebuf, &local->postbuf, NULL);
+    return 0;
+}
+
+/*
+ * Callback of the truncate wound by shard_truncate_last_shard(), and of the
+ * direct truncate/ftruncate wound by shard_truncate_begin().
+ *
+ * On failure the error is logged, stored in the local and the fop is unwound.
+ * On success the new size and the block-count delta are folded into
+ * local->postbuf, the inode ctx times are refreshed and the file size update
+ * is kicked off. Always returns 0.
+ */
+int
+shard_truncate_last_shard_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+                              int32_t op_ret, int32_t op_errno,
+                              struct iatt *prebuf, struct iatt *postbuf,
+                              dict_t *xdata)
+{
+    shard_local_t *local = frame->local;
+    inode_t *inode = NULL;
+    int64_t delta_blocks = 0;
+
+    SHARD_UNSET_ROOT_FS_ID(frame, local);
+
+    /* truncate carries the base inode in the loc, ftruncate in the fd. */
+    if (local->fop == GF_FOP_TRUNCATE)
+        inode = local->loc.inode;
+    else
+        inode = local->fd->inode;
+
+    if (op_ret < 0) {
+        gf_msg(this->name, GF_LOG_ERROR, op_errno,
+               SHARD_MSG_TRUNCATE_LAST_SHARD_FAILED,
+               "truncate on last"
+               " shard failed : %s",
+               uuid_utoa(inode->gfid));
+        local->op_ret = op_ret;
+        local->op_errno = op_errno;
+        shard_common_failure_unwind(local->fop, frame, local->op_ret,
+                                    local->op_errno);
+        return 0;
+    }
+
+    local->postbuf.ia_size = local->offset;
+    /* The size delta is meant to be negative: the xattrop subtracts it. */
+    local->delta_size = local->postbuf.ia_size - local->prebuf.ia_size;
+    /* Add this shard's block change to the running delta. */
+    delta_blocks = GF_ATOMIC_ADD(local->delta_blocks,
+                                 postbuf->ia_blocks - prebuf->ia_blocks);
+    GF_ASSERT(delta_blocks <= 0);
+    local->postbuf.ia_blocks += delta_blocks;
+    local->hole_size = 0;
+
+    shard_inode_ctx_set(inode, this, &local->postbuf, 0, SHARD_MASK_TIMES);
+    shard_update_file_size(frame, this, NULL, &local->loc,
+                           shard_post_update_size_truncate_handler);
+    return 0;
+}
+
+/*
+ * Truncate the shard that will hold the last byte of the file after the
+ * truncate, to (local->offset % local->block_size) bytes.
+ *
+ * @inode: inode of that shard, or NULL when it could not be resolved.
+ *
+ * With a NULL inode no truncate is wound and the file size update is started
+ * right away. Always returns 0.
+ */
+int
+shard_truncate_last_shard(call_frame_t *frame, xlator_t *this, inode_t *inode)
+{
+    shard_local_t *local = frame->local;
+    size_t tail_size = 0;
+    loc_t shard_loc = {
+        0,
+    };
+
+    if (inode == NULL) {
+        /* The shard may be missing because it lies in a hole region, in
+         * which case only the file size xattr is left to be updated.
+         */
+        gf_msg_debug(this->name, 0,
+                     "Last shard to be truncated absent in backend: %" PRIu64
+                     " of gfid %s. Directly proceeding to update file size",
+                     local->first_block, uuid_utoa(local->loc.inode->gfid));
+        shard_update_file_size(frame, this, NULL, &local->loc,
+                               shard_post_update_size_truncate_handler);
+        return 0;
+    }
+
+    SHARD_SET_ROOT_FS_ID(frame, local);
+
+    /* Address the shard by inode and gfid only. */
+    shard_loc.inode = inode_ref(inode);
+    gf_uuid_copy(shard_loc.gfid, inode->gfid);
+
+    tail_size = (local->offset % local->block_size);
+
+    STACK_WIND(frame, shard_truncate_last_shard_cbk, FIRST_CHILD(this),
+               FIRST_CHILD(this)->fops->truncate, &shard_loc, tail_size, NULL);
+    loc_wipe(&shard_loc);
+    return 0;
+}
+
+void
+shard_unlink_block_inode(shard_local_t *local, int shard_block_num);
+
+/*
+ * Callback of each shard unlink wound by shard_truncate_htol(); the cookie
+ * carries the block number of the shard.
+ *
+ * A failure is recorded in the local. On success the block count reported in
+ * xdata (when present) is subtracted from local->delta_blocks and the shard
+ * inode is handed to shard_unlink_block_inode(). Once the last outstanding
+ * call returns, the truncate of the remaining shard is started.
+ * Always returns 0.
+ */
+int
+shard_truncate_htol_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+                        int32_t op_ret, int32_t op_errno,
+                        struct iatt *preparent, struct iatt *postparent,
+                        dict_t *xdata)
+{
+    shard_local_t *local = frame->local;
+    int shard_block_num = (long)cookie;
+    uint64_t freed_blocks = 0;
+    int pending = 0;
+    int rc = 0;
+
+    if (op_ret >= 0) {
+        rc = dict_get_uint64(xdata, GF_GET_FILE_BLOCK_COUNT, &freed_blocks);
+        if (rc != 0) {
+            /* dict_get failed possibly due to a heterogeneous cluster? */
+            gf_msg(this->name, GF_LOG_WARNING, 0, SHARD_MSG_DICT_OP_FAILED,
+                   "Failed to get key %s from dict during truncate of gfid %s",
+                   GF_GET_FILE_BLOCK_COUNT,
+                   uuid_utoa(local->resolver_base_inode->gfid));
+        } else {
+            GF_ATOMIC_SUB(local->delta_blocks, freed_blocks);
+        }
+
+        shard_unlink_block_inode(local, shard_block_num);
+    } else {
+        local->op_ret = op_ret;
+        local->op_errno = op_errno;
+    }
+
+    pending = shard_call_count_return(frame);
+    if (pending == 0) {
+        SHARD_UNSET_ROOT_FS_ID(frame, local);
+        shard_truncate_last_shard(frame, this, local->inode_list[0]);
+    }
+    return 0;
+}
+
+/*
+ * Unlink the shards that lie beyond the first participant block of a
+ * shrinking truncate ("htol" presumably stands for higher-to-lower size).
+ *
+ * Slot 0 of local->inode_list is not touched here. Every populated slot from
+ * index 1 onward is unlinked under priv->dot_shard_inode, one wind per shard,
+ * with shard_truncate_htol_cbk as callback and the block number as cookie.
+ * When none of those slots is populated, the size bookkeeping in the local is
+ * filled in and the file size update is started directly.
+ *
+ * @inode: base file inode; its gfid is used for shard paths and log messages.
+ *
+ * Always returns 0; failures are reported by unwinding the fop.
+ */
+int
+shard_truncate_htol(call_frame_t *frame, xlator_t *this, inode_t *inode)
+{
+    int i = 1;
+    int ret = -1;
+    int call_count = 0;
+    uint32_t cur_block = 0;
+    uint32_t last_block = 0;
+    char path[PATH_MAX] = {
+        0,
+    };
+    char *bname = NULL;
+    loc_t loc = {
+        0,
+    };
+    gf_boolean_t wind_failed = _gf_false;
+    shard_local_t *local = NULL;
+    shard_priv_t *priv = NULL;
+    dict_t *xdata_req = NULL;
+
+    local = frame->local;
+    priv = this->private;
+
+    /* inode_list[i] corresponds to block (first_block + i), so the walk
+     * below starts at index 1 / block first_block + 1.
+     */
+    cur_block = local->first_block + 1;
+    last_block = local->last_block;
+
+    /* Determine call count */
+    for (i = 1; i < local->num_blocks; i++) {
+        if (!local->inode_list[i])
+            continue;
+        call_count++;
+    }
+
+    if (!call_count) {
+        /* Call count = 0 implies that all of the shards that need to be
+         * unlinked do not exist. So shard xlator would now proceed to
+         * do the final truncate + size updates.
+         */
+        gf_msg_debug(this->name, 0,
+                     "Shards to be unlinked as part of "
+                     "truncate absent in backend: %s. Directly "
+                     "proceeding to update file size",
+                     uuid_utoa(inode->gfid));
+        local->postbuf.ia_size = local->offset;
+        local->postbuf.ia_blocks = local->prebuf.ia_blocks;
+        local->delta_size = local->postbuf.ia_size - local->prebuf.ia_size;
+        GF_ATOMIC_INIT(local->delta_blocks, 0);
+        local->hole_size = 0;
+        shard_update_file_size(frame, this, local->fd, &local->loc,
+                               shard_post_update_size_truncate_handler);
+        return 0;
+    }
+
+    local->call_count = call_count;
+    i = 1;
+    xdata_req = dict_new();
+    if (!xdata_req) {
+        shard_common_failure_unwind(local->fop, frame, -1, ENOMEM);
+        return 0;
+    }
+    /* The callback reads GF_GET_FILE_BLOCK_COUNT back from the reply dict;
+     * presumably this key asks the child to report the block count of the
+     * unlinked shard.
+     */
+    ret = dict_set_uint64(xdata_req, GF_GET_FILE_BLOCK_COUNT, 8 * 8);
+    if (ret) {
+        gf_msg(this->name, GF_LOG_WARNING, 0, SHARD_MSG_DICT_OP_FAILED,
+               "Failed to set key %s into dict during truncate of %s",
+               GF_GET_FILE_BLOCK_COUNT,
+               uuid_utoa(local->resolver_base_inode->gfid));
+        dict_unref(xdata_req);
+        shard_common_failure_unwind(local->fop, frame, -1, ENOMEM);
+        return 0;
+    }
+
+    SHARD_SET_ROOT_FS_ID(frame, local);
+    while (cur_block <= last_block) {
+        /* Unresolved shards were not counted in call_count; skip them. */
+        if (!local->inode_list[i]) {
+            cur_block++;
+            i++;
+            continue;
+        }
+        /* After an earlier failure no further unlinks are wound; the
+         * callback is invoked directly so that the call count still drains.
+         */
+        if (wind_failed) {
+            shard_truncate_htol_cbk(frame, (void *)(long)cur_block, this, -1,
+                                    ENOMEM, NULL, NULL, NULL);
+            goto next;
+        }
+
+        shard_make_block_abspath(cur_block, inode->gfid, path, sizeof(path));
+        bname = strrchr(path, '/') + 1;
+        loc.parent = inode_ref(priv->dot_shard_inode);
+        ret = inode_path(loc.parent, bname, (char **)&(loc.path));
+        if (ret < 0) {
+            gf_msg(this->name, GF_LOG_ERROR, 0, SHARD_MSG_INODE_PATH_FAILED,
+                   "Inode path failed"
+                   " on %s. Base file gfid = %s",
+                   bname, uuid_utoa(inode->gfid));
+            local->op_ret = -1;
+            local->op_errno = ENOMEM;
+            loc_wipe(&loc);
+            wind_failed = _gf_true;
+            shard_truncate_htol_cbk(frame, (void *)(long)cur_block, this, -1,
+                                    ENOMEM, NULL, NULL, NULL);
+            goto next;
+        }
+        loc.name = strrchr(loc.path, '/');
+        if (loc.name)
+            loc.name++;
+        loc.inode = inode_ref(local->inode_list[i]);
+
+        STACK_WIND_COOKIE(frame, shard_truncate_htol_cbk,
+                          (void *)(long)cur_block, FIRST_CHILD(this),
+                          FIRST_CHILD(this)->fops->unlink, &loc, 0, xdata_req);
+        loc_wipe(&loc);
+    next:
+        i++;
+        cur_block++;
+        /* Stop as soon as every populated slot has been handled. */
+        if (!--call_count)
+            break;
+    }
+    dict_unref(xdata_req);
+    return 0;
+}
+
+/*
+ * Pick the truncate strategy once the participant shards are resolved:
+ * with more than one participant block the higher shards are unlinked first
+ * (shard_truncate_htol), otherwise only the last shard is truncated.
+ * Always returns 0.
+ */
+int
+shard_truncate_do(call_frame_t *frame, xlator_t *this)
+{
+    shard_local_t *local = frame->local;
+
+    if (local->num_blocks != 1) {
+        shard_truncate_htol(frame, this, local->loc.inode);
+        return 0;
+    }
+
+    /* A single participant block means there is nothing to unlink: the fop
+     * reduces to truncating the last shard, updating the size and unwinding.
+     */
+    shard_truncate_last_shard(frame, this, local->inode_list[0]);
+    return 0;
+}
+
+/*
+ * Continuation run after the participant shards of a truncate were looked
+ * up: proceed with the truncate, or unwind with the error stored in the
+ * local. Always returns 0.
+ */
+int
+shard_post_lookup_shards_truncate_handler(call_frame_t *frame, xlator_t *this)
+{
+    shard_local_t *local = frame->local;
+
+    if (local->op_ret >= 0)
+        shard_truncate_do(frame, this);
+    else
+        shard_common_failure_unwind(local->fop, frame, local->op_ret,
+                                    local->op_errno);
+
+    return 0;
+}
+
+/*
+ * Link a freshly looked-up shard inode under the .shard directory inode and
+ * record it in local->inode_list.
+ *
+ * @local:     fop state; supplies the base file identity and the inode list.
+ * @block_num: block number of the shard.
+ * @inode:     inode returned for the shard.
+ * @buf:       iatt of the shard, used for the inode ctx and the link.
+ *
+ * The base file gfid is taken from local->loc.inode, else from
+ * local->resolver_base_inode, else from local->base_gfid. If updating the
+ * shard inode list returns an inode, an fsync is initiated on it.
+ */
+void
+shard_link_block_inode(shard_local_t *local, int block_num, inode_t *inode,
+                       struct iatt *buf)
+{
+    xlator_t *this = THIS;
+    shard_priv_t *priv = this->private;
+    inode_t *base_inode = NULL;
+    inode_t *linked_inode = NULL;
+    inode_t *fsync_inode = NULL;
+    int slot = 0;
+    uuid_t gfid = {
+        0,
+    };
+    char block_bname[256] = {
+        0,
+    };
+
+    /* Settle on the base inode first, then derive the gfid from it. */
+    if (local->loc.inode)
+        base_inode = local->loc.inode;
+    else if (local->resolver_base_inode)
+        base_inode = local->resolver_base_inode;
+
+    if (base_inode)
+        gf_uuid_copy(gfid, base_inode->gfid);
+    else
+        gf_uuid_copy(gfid, local->base_gfid);
+
+    shard_make_block_bname(block_num, gfid, block_bname, sizeof(block_bname));
+
+    shard_inode_ctx_set(inode, this, buf, 0, SHARD_LOOKUP_MASK);
+    linked_inode = inode_link(inode, priv->dot_shard_inode, block_bname, buf);
+    inode_lookup(linked_inode);
+
+    /* inode_list is indexed relative to the first participant block. */
+    slot = block_num - local->first_block;
+    local->inode_list[slot] = linked_inode;
+
+    LOCK(&priv->lock);
+    {
+        fsync_inode = __shard_update_shards_inode_list(
+            linked_inode, this, base_inode, block_num, gfid);
+    }
+    UNLOCK(&priv->lock);
+
+    if (fsync_inode != NULL)
+        shard_initiate_evicted_inode_fsync(this, fsync_inode);
+}
+
+/*
+ * Callback of each shard lookup wound by shard_common_lookup_shards(); the
+ * cookie carries the block number of the shard.
+ *
+ * A successful lookup links the shard inode via shard_link_block_inode().
+ * ENOENT is tolerated for some fops (see the switch below); any other failure
+ * is logged and stored in local->op_ret/op_errno.
+ *
+ * Completion is signalled in one of two ways: when
+ * local->lookup_shards_barriered is set the barrier is woken, otherwise the
+ * call count is decremented and the last callback invokes
+ * local->pls_fop_handler. Always returns 0.
+ */
+int
+shard_common_lookup_shards_cbk(call_frame_t *frame, void *cookie,
+                               xlator_t *this, int32_t op_ret, int32_t op_errno,
+                               inode_t *inode, struct iatt *buf, dict_t *xdata,
+                               struct iatt *postparent)
+{
+    int call_count = 0;
+    int shard_block_num = (long)cookie;
+    uuid_t gfid = {
+        0,
+    };
+    shard_local_t *local = NULL;
+
+    local = frame->local;
+    /* gfid of the base file; only used in the failure log below. */
+    if (local->resolver_base_inode)
+        gf_uuid_copy(gfid, local->resolver_base_inode->gfid);
+    else
+        gf_uuid_copy(gfid, local->base_gfid);
+
+    if (op_ret < 0) {
+        /* Ignore absence of shards in the backend for truncate, ftruncate,
+         * rename and unlink.
+         */
+        switch (local->fop) {
+            case GF_FOP_TRUNCATE:
+            case GF_FOP_FTRUNCATE:
+            case GF_FOP_RENAME:
+            case GF_FOP_UNLINK:
+                if (op_errno == ENOENT)
+                    goto done;
+                break;
+            case GF_FOP_WRITE:
+            case GF_FOP_READ:
+            case GF_FOP_ZEROFILL:
+            case GF_FOP_DISCARD:
+            case GF_FOP_FALLOCATE:
+                /* During the first lookup pass a missing shard is only
+                 * counted in create_count (presumably so that it can be
+                 * created later) and is not treated as an error.
+                 */
+                if ((!local->first_lookup_done) && (op_errno == ENOENT)) {
+                    LOCK(&frame->lock);
+                    {
+                        local->create_count++;
+                    }
+                    UNLOCK(&frame->lock);
+                    goto done;
+                }
+                break;
+            default:
+                break;
+        }
+
+        /* else */
+        gf_msg(this->name, GF_LOG_ERROR, op_errno,
+               SHARD_MSG_LOOKUP_SHARD_FAILED,
+               "Lookup on shard %d "
+               "failed. Base file gfid = %s",
+               shard_block_num, uuid_utoa(gfid));
+        local->op_ret = op_ret;
+        local->op_errno = op_errno;
+        goto done;
+    }
+
+    shard_link_block_inode(local, shard_block_num, inode, buf);
+
+done:
+    if (local->lookup_shards_barriered) {
+        /* The issuer of the lookups waits on the barrier and calls the
+         * handler itself.
+         */
+        syncbarrier_wake(&local->barrier);
+        return 0;
+    } else {
+        call_count = shard_call_count_return(frame);
+        if (call_count == 0) {
+            if (!local->first_lookup_done)
+                local->first_lookup_done = _gf_true;
+            local->pls_fop_handler(frame, this);
+        }
+    }
+    return 0;
+}
+
+/*
+ * Build the request dict for a shard lookup: a copy of @dict (made with
+ * dict_copy_with_ref) to which a newly generated gfid is added under the key
+ * "gfid-req".
+ *
+ * Returns the new dict, or NULL on any failure. On failure the copy is
+ * unreferenced and the gfid buffer is freed here; on success the buffer is
+ * left with the dict.
+ */
+dict_t *
+shard_create_gfid_dict(dict_t *dict)
+{
+    unsigned char *req_gfid = NULL;
+    dict_t *copy = dict_copy_with_ref(dict, NULL);
+
+    if (copy == NULL)
+        return NULL;
+
+    req_gfid = GF_MALLOC(sizeof(uuid_t), gf_common_mt_char);
+    if (req_gfid == NULL)
+        goto fail;
+
+    gf_uuid_generate(req_gfid);
+
+    if (dict_set_gfuuid(copy, "gfid-req", req_gfid, false) != 0)
+        goto fail;
+
+    return copy;
+
+fail:
+    dict_unref(copy);
+    GF_FREE(req_gfid);
+    return NULL;
+}
+
+/*
+ * Wind a lookup for every participant shard whose slot in local->inode_list
+ * is still empty, using shard_common_lookup_shards_cbk as callback and the
+ * block number as cookie.
+ *
+ * @inode:   base file inode supplying the gfid for shard names; when NULL,
+ *           local->base_gfid is used instead.
+ * @handler: continuation stored in local->pls_fop_handler and invoked once
+ *           all lookups have completed.
+ *
+ * local->call_count must already hold the number of lookups to issue. When
+ * local->lookup_shards_barriered is set, this function waits on
+ * local->barrier for that many callbacks and then calls the handler itself;
+ * otherwise the last callback calls it. Always returns 0.
+ */
+int
+shard_common_lookup_shards(call_frame_t *frame, xlator_t *this, inode_t *inode,
+                           shard_post_lookup_shards_fop_handler_t handler)
+{
+    int i = 0;
+    int ret = 0;
+    int count = 0;
+    int call_count = 0;
+    int32_t shard_idx_iter = 0;
+    int lookup_count = 0;
+    char path[PATH_MAX] = {
+        0,
+    };
+    char *bname = NULL;
+    uuid_t gfid = {
+        0,
+    };
+    loc_t loc = {
+        0,
+    };
+    shard_local_t *local = NULL;
+    shard_priv_t *priv = NULL;
+    gf_boolean_t wind_failed = _gf_false;
+    dict_t *xattr_req = NULL;
+
+    priv = this->private;
+    local = frame->local;
+    count = call_count = local->call_count;
+    shard_idx_iter = local->first_block;
+    /* Despite its name this is used as the highest block index to visit. */
+    lookup_count = local->last_block - local->create_count;
+    local->pls_fop_handler = handler;
+    if (local->lookup_shards_barriered)
+        local->barrier.waitfor = local->call_count;
+
+    if (inode)
+        gf_uuid_copy(gfid, inode->gfid);
+    else
+        gf_uuid_copy(gfid, local->base_gfid);
+
+    while (shard_idx_iter <= lookup_count) {
+        /* Slot already resolved: nothing to look up for this block. */
+        if (local->inode_list[i]) {
+            i++;
+            shard_idx_iter++;
+            continue;
+        }
+
+        /* After an earlier failure no further lookups are wound; the
+         * callback is invoked directly so that completion is still
+         * accounted for.
+         */
+        if (wind_failed) {
+            shard_common_lookup_shards_cbk(frame, (void *)(long)shard_idx_iter,
+                                           this, -1, ENOMEM, NULL, NULL, NULL,
+                                           NULL);
+            goto next;
+        }
+
+        shard_make_block_abspath(shard_idx_iter, gfid, path, sizeof(path));
+
+        bname = strrchr(path, '/') + 1;
+        loc.inode = inode_new(this->itable);
+        loc.parent = inode_ref(priv->dot_shard_inode);
+        gf_uuid_copy(loc.pargfid, priv->dot_shard_gfid);
+        ret = inode_path(loc.parent, bname, (char **)&(loc.path));
+        if (ret < 0 || !(loc.inode)) {
+            gf_msg(this->name, GF_LOG_ERROR, 0, SHARD_MSG_INODE_PATH_FAILED,
+                   "Inode path failed"
+                   " on %s, base file gfid = %s",
+                   bname, uuid_utoa(gfid));
+            local->op_ret = -1;
+            local->op_errno = ENOMEM;
+            loc_wipe(&loc);
+            wind_failed = _gf_true;
+            shard_common_lookup_shards_cbk(frame, (void *)(long)shard_idx_iter,
+                                           this, -1, ENOMEM, NULL, NULL, NULL,
+                                           NULL);
+            goto next;
+        }
+
+        loc.name = strrchr(loc.path, '/');
+        if (loc.name)
+            loc.name++;
+
+        /* Each lookup gets its own copy of the request dict with a fresh
+         * "gfid-req" entry.
+         */
+        xattr_req = shard_create_gfid_dict(local->xattr_req);
+        if (!xattr_req) {
+            local->op_ret = -1;
+            local->op_errno = ENOMEM;
+            wind_failed = _gf_true;
+            loc_wipe(&loc);
+            shard_common_lookup_shards_cbk(frame, (void *)(long)shard_idx_iter,
+                                           this, -1, ENOMEM, NULL, NULL, NULL,
+                                           NULL);
+            goto next;
+        }
+
+        STACK_WIND_COOKIE(frame, shard_common_lookup_shards_cbk,
+                          (void *)(long)shard_idx_iter, FIRST_CHILD(this),
+                          FIRST_CHILD(this)->fops->lookup, &loc, xattr_req);
+        loc_wipe(&loc);
+        dict_unref(xattr_req);
+    next:
+        shard_idx_iter++;
+        i++;
+
+        /* Stop once the expected number of lookups has been issued. */
+        if (!--call_count)
+            break;
+    }
+    if (local->lookup_shards_barriered) {
+        syncbarrier_wait(&local->barrier, count);
+        local->pls_fop_handler(frame, this);
+    }
+    return 0;
+}
+
+/*
+ * Continuation run after the .shard directory was resolved for a truncate.
+ *
+ * ENOENT is not fatal here: the participant range is reset to a single block
+ * and only the file size is updated. Any other error unwinds the fop. On
+ * success the shards are looked up when local->call_count is non-zero,
+ * otherwise the truncate proceeds immediately. Always returns 0.
+ */
+int
+shard_post_resolve_truncate_handler(call_frame_t *frame, xlator_t *this)
+{
+    shard_local_t *local = frame->local;
+
+    if (local->op_ret < 0) {
+        if (local->op_errno != ENOENT) {
+            shard_common_failure_unwind(local->fop, frame, local->op_ret,
+                                        local->op_errno);
+            return 0;
+        }
+
+        /* ENOENT on the lookup of /.shard means the file was 0-byte in
+         * size, was truncated to a higher size at some point (reflected in
+         * the size xattr only) and is now being truncated to a lower size.
+         * All that is needed is to update the size xattr and unwind.
+         */
+        local->first_block = local->last_block = 0;
+        local->num_blocks = 1;
+        local->call_count = 0;
+        local->op_ret = 0;
+        local->postbuf.ia_size = local->offset;
+        shard_update_file_size(frame, this, local->fd, &local->loc,
+                               shard_post_update_size_truncate_handler);
+        return 0;
+    }
+
+    if (local->call_count)
+        shard_common_lookup_shards(frame, this, local->loc.inode,
+                                   shard_post_lookup_shards_truncate_handler);
+    else
+        shard_truncate_do(frame, this);
+
+    return 0;
+}
+
+/*
+ * Start a shrinking truncate: compute the participant block range and either
+ * truncate the base file directly or begin resolving the .shard directory.
+ *
+ * Always returns 0; allocation and setup failures unwind the fop with ENOMEM.
+ */
+int
+shard_truncate_begin(call_frame_t *frame, xlator_t *this)
+{
+    shard_priv_t *priv = this->private;
+    shard_local_t *local = frame->local;
+    int rc = 0;
+
+    /* Participant blocks:
+     *   first_block - lowest numbered block that will hold the last byte of
+     *                 the file once the truncate has succeeded.
+     *   last_block  - block holding the last byte of the file as it is now.
+     *
+     * first_block == last_block: only that one block needs truncating.
+     * Otherwise every shard from first_block + 1 through last_block is
+     * unlinked, and first_block itself additionally needs a truncate unless
+     * the new size is a multiple of the block size.
+     */
+    if (local->offset == 0)
+        local->first_block = 0;
+    else
+        local->first_block = get_lowest_block(local->offset - 1,
+                                              local->block_size);
+    local->last_block = get_highest_block(0, local->prebuf.ia_size,
+                                          local->block_size);
+
+    local->num_blocks = local->last_block - local->first_block + 1;
+    GF_ASSERT(local->num_blocks > 0);
+
+    if (local->fop == GF_FOP_TRUNCATE)
+        local->resolver_base_inode = local->loc.inode;
+    else
+        local->resolver_base_inode = local->fd->inode;
+
+    /* Only block 0 takes part: wind the truncate on the base file itself. */
+    if ((local->first_block == 0) && (local->num_blocks == 1)) {
+        if (local->fop == GF_FOP_TRUNCATE) {
+            STACK_WIND(frame, shard_truncate_last_shard_cbk, FIRST_CHILD(this),
+                       FIRST_CHILD(this)->fops->truncate, &local->loc,
+                       local->offset, local->xattr_req);
+        } else {
+            STACK_WIND(frame, shard_truncate_last_shard_cbk, FIRST_CHILD(this),
+                       FIRST_CHILD(this)->fops->ftruncate, local->fd,
+                       local->offset, local->xattr_req);
+        }
+        return 0;
+    }
+
+    local->inode_list = GF_CALLOC(local->num_blocks, sizeof(inode_t *),
+                                  gf_shard_mt_inode_list);
+    if (local->inode_list == NULL)
+        goto err;
+
+    local->dot_shard_loc.inode = inode_find(this->itable, priv->dot_shard_gfid);
+    if (local->dot_shard_loc.inode) {
+        /* .shard is already in the inode table: refresh it. */
+        local->post_res_handler = shard_post_resolve_truncate_handler;
+        shard_refresh_internal_dir(frame, this, SHARD_INTERNAL_DIR_DOT_SHARD);
+        return 0;
+    }
+
+    /* .shard is not known yet: build its loc and look it up. */
+    rc = shard_init_internal_dir_loc(this, local, SHARD_INTERNAL_DIR_DOT_SHARD);
+    if (rc)
+        goto err;
+    shard_lookup_internal_dir(frame, this, shard_post_resolve_truncate_handler,
+                              SHARD_INTERNAL_DIR_DOT_SHARD);
+    return 0;
+
+err:
+    shard_common_failure_unwind(local->fop, frame, -1, ENOMEM);
+    return 0;
+}
+
+/*
+ * Continuation run after the base file was refreshed for a truncate or
+ * ftruncate. Compares the requested offset with the current size:
+ *   - equal:   unwind immediately with success;
+ *   - larger:  record the hole size and only update the file size;
+ *   - smaller: record the (negative) size delta and begin the truncate.
+ * Always returns 0.
+ */
+int
+shard_post_lookup_truncate_handler(call_frame_t *frame, xlator_t *this)
+{
+    shard_local_t *local = frame->local;
+    struct iatt tmp_stbuf = {
+        0,
+    };
+
+    if (local->op_ret < 0) {
+        shard_common_failure_unwind(local->fop, frame, local->op_ret,
+                                    local->op_errno);
+        return 0;
+    }
+
+    tmp_stbuf = local->prebuf;
+    local->postbuf = tmp_stbuf;
+
+    if (local->prebuf.ia_size == local->offset) {
+        /* Size already matches the request: nothing to change. */
+        if (local->fop == GF_FOP_TRUNCATE) {
+            SHARD_STACK_UNWIND(truncate, frame, 0, 0, &local->prebuf,
+                               &local->postbuf, NULL);
+        } else {
+            SHARD_STACK_UNWIND(ftruncate, frame, 0, 0, &local->prebuf,
+                               &local->postbuf, NULL);
+        }
+        return 0;
+    }
+
+    /* Common to both remaining cases. */
+    GF_ATOMIC_INIT(local->delta_blocks, 0);
+    tmp_stbuf.ia_size = local->offset;
+
+    if (local->offset > local->prebuf.ia_size) {
+        /* Growing: set the new size xattr and unwind. */
+        local->hole_size = local->offset - local->prebuf.ia_size;
+        local->delta_size = 0;
+        local->postbuf.ia_size = local->offset;
+        shard_inode_ctx_set(local->loc.inode, this, &tmp_stbuf, 0,
+                            SHARD_INODE_WRITE_MASK);
+        shard_update_file_size(frame, this, NULL, &local->loc,
+                               shard_post_update_size_truncate_handler);
+        return 0;
+    }
+
+    /* Shrinking:
+     * i.   unlink all shards that need to be unlinked,
+     * ii.  truncate the last of the shards,
+     * iii. update the new size using setxattr,
+     * and unwind the fop.
+     */
+    local->hole_size = 0;
+    local->delta_size = (local->offset - local->prebuf.ia_size);
+    shard_inode_ctx_set(local->loc.inode, this, &tmp_stbuf, 0,
+                        SHARD_INODE_WRITE_MASK);
+    shard_truncate_begin(frame, this);
+    return 0;
+}
+
+/* TO-DO:
+ * Fix updates to size and block count with racing write(s) and truncate(s).
+ */
+
+/*
+ * truncate fop entry point of the shard translator.
+ *
+ * Inodes whose stored block size is zero and requests issued by gsyncd are
+ * wound to the child untouched. Otherwise the fop state is set up and the
+ * base file is refreshed, continuing in shard_post_lookup_truncate_handler.
+ *
+ * Always returns 0; every failure is reported by unwinding with ENOMEM.
+ */
+int
+shard_truncate(call_frame_t *frame, xlator_t *this, loc_t *loc, off_t offset,
+               dict_t *xdata)
+{
+    shard_local_t *local = NULL;
+    uint64_t shard_size = 0;
+    int rc = -1;
+
+    rc = shard_inode_ctx_get_block_size(loc->inode, this, &shard_size);
+    if (rc != 0) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, SHARD_MSG_INODE_CTX_GET_FAILED,
+               "Failed to get block "
+               "size from inode ctx of %s",
+               uuid_utoa(loc->inode->gfid));
+        goto err;
+    }
+
+    if (shard_size == 0 || frame->root->pid == GF_CLIENT_PID_GSYNCD)
+        goto passthrough;
+
+    /* Remember the inode table the first time one is seen. */
+    if (this->itable == NULL)
+        this->itable = loc->inode->table;
+
+    local = mem_get0(this->local_pool);
+    if (local == NULL)
+        goto err;
+
+    frame->local = local;
+
+    rc = syncbarrier_init(&local->barrier);
+    if (rc != 0)
+        goto err;
+
+    loc_copy(&local->loc, loc);
+    local->offset = offset;
+    local->block_size = shard_size;
+    local->fop = GF_FOP_TRUNCATE;
+
+    /* Reuse the caller's dict when there is one, else start a fresh one. */
+    if (xdata != NULL)
+        local->xattr_req = dict_ref(xdata);
+    else
+        local->xattr_req = dict_new();
+    if (local->xattr_req == NULL)
+        goto err;
+
+    local->resolver_base_inode = loc->inode;
+    GF_ATOMIC_INIT(local->delta_blocks, 0);
+
+    shard_refresh_base_file(frame, this, &local->loc, NULL,
+                            shard_post_lookup_truncate_handler);
+    return 0;
+
+passthrough:
+    STACK_WIND(frame, default_truncate_cbk, FIRST_CHILD(this),
+               FIRST_CHILD(this)->fops->truncate, loc, offset, xdata);
+    return 0;
+
+err:
+    shard_common_failure_unwind(GF_FOP_TRUNCATE, frame, -1, ENOMEM);
+    return 0;
+}
+
+/*
+ * ftruncate fop entry point of the shard translator.
+ *
+ * Inodes whose stored block size is zero and requests issued by gsyncd are
+ * wound to the child untouched. Otherwise the fop state is set up (including
+ * a loc built from the fd's inode) and the base file is refreshed,
+ * continuing in shard_post_lookup_truncate_handler.
+ *
+ * Always returns 0; every failure is reported by unwinding with ENOMEM.
+ */
+int
+shard_ftruncate(call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset,
+                dict_t *xdata)
+{
+    shard_local_t *local = NULL;
+    uint64_t shard_size = 0;
+    int rc = -1;
+
+    rc = shard_inode_ctx_get_block_size(fd->inode, this, &shard_size);
+    if (rc != 0) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, SHARD_MSG_INODE_CTX_GET_FAILED,
+               "Failed to get block "
+               "size from inode ctx of %s",
+               uuid_utoa(fd->inode->gfid));
+        goto err;
+    }
+
+    if (shard_size == 0 || frame->root->pid == GF_CLIENT_PID_GSYNCD)
+        goto passthrough;
+
+    /* Remember the inode table the first time one is seen. */
+    if (this->itable == NULL)
+        this->itable = fd->inode->table;
+
+    local = mem_get0(this->local_pool);
+    if (local == NULL)
+        goto err;
+
+    frame->local = local;
+
+    rc = syncbarrier_init(&local->barrier);
+    if (rc != 0)
+        goto err;
+
+    local->fd = fd_ref(fd);
+    local->offset = offset;
+    local->block_size = shard_size;
+
+    /* Reuse the caller's dict when there is one, else start a fresh one. */
+    if (xdata != NULL)
+        local->xattr_req = dict_ref(xdata);
+    else
+        local->xattr_req = dict_new();
+    if (local->xattr_req == NULL)
+        goto err;
+
+    local->fop = GF_FOP_FTRUNCATE;
+
+    /* Later stages address the base file through local->loc as well. */
+    local->loc.inode = inode_ref(fd->inode);
+    gf_uuid_copy(local->loc.gfid, fd->inode->gfid);
+    local->resolver_base_inode = fd->inode;
+    GF_ATOMIC_INIT(local->delta_blocks, 0);
+
+    shard_refresh_base_file(frame, this, NULL, fd,
+                            shard_post_lookup_truncate_handler);
+    return 0;
+
+passthrough:
+    STACK_WIND(frame, default_ftruncate_cbk, FIRST_CHILD(this),
+               FIRST_CHILD(this)->fops->ftruncate, fd, offset, xdata);
+    return 0;
+
+err:
+    shard_common_failure_unwind(GF_FOP_FTRUNCATE, frame, -1, ENOMEM);
+    return 0;
+}
+
+/*
+ * Callback of the mknod wound by shard_mknod(). When the child did not
+ * return -1, the new inode's ctx is initialised with the iatt and the block
+ * size kept in the local; a failure to do so is only logged. The child's
+ * result is then unwound unchanged. Always returns 0.
+ */
+int
+shard_mknod_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+                int32_t op_ret, int32_t op_errno, inode_t *inode,
+                struct iatt *buf, struct iatt *preparent,
+                struct iatt *postparent, dict_t *xdata)
+{
+    shard_local_t *local = frame->local;
+    int rc = -1;
+
+    if (op_ret != -1) {
+        rc = shard_inode_ctx_set(inode, this, buf, local->block_size,
+                                 SHARD_ALL_MASK);
+        if (rc != 0) {
+            gf_msg(this->name, GF_LOG_WARNING, 0,
+                   SHARD_MSG_INODE_CTX_SET_FAILED,
+                   "Failed to set inode "
+                   "ctx for %s",
+                   uuid_utoa(inode->gfid));
+        }
+    }
+
+    SHARD_STACK_UNWIND(mknod, frame, op_ret, op_errno, inode, buf, preparent,
+                       postparent, xdata);
+
+    return 0;
+}
+
+/*
+ * mknod fop entry point of the shard translator.
+ *
+ * Records the configured block size in the local and, unless
+ * __is_gsyncd_on_shard_dir() reports true for this request, runs
+ * SHARD_INODE_CREATE_INIT on the request dict before winding the mknod.
+ *
+ * Always returns 0; failures are reported by unwinding with ENOMEM.
+ */
+int
+shard_mknod(call_frame_t *frame, xlator_t *this, loc_t *loc, mode_t mode,
+            dev_t rdev, mode_t umask, dict_t *xdata)
+{
+    shard_priv_t *priv = this->private;
+    shard_local_t *local = mem_get0(this->local_pool);
+
+    if (local == NULL)
+        goto err;
+
+    frame->local = local;
+    local->block_size = priv->block_size;
+
+    if (__is_gsyncd_on_shard_dir(frame, loc))
+        goto wind;
+
+    SHARD_INODE_CREATE_INIT(this, local->block_size, xdata, loc, 0, 0, err);
+
+wind:
+    STACK_WIND(frame, shard_mknod_cbk, FIRST_CHILD(this),
+               FIRST_CHILD(this)->fops->mknod, loc, mode, rdev, umask, xdata);
+    return 0;
+
+err:
+    shard_common_failure_unwind(GF_FOP_MKNOD, frame, -1, ENOMEM);
+    return 0;
+}
+
+/*
+ * Callback of the link wound by shard_post_lookup_link_handler(). On success
+ * the inode ctx is refreshed (link count and times) and the size and block
+ * count in the returned iatt are replaced with those kept in local->prebuf
+ * before unwinding. On failure the fop is unwound with the child's error.
+ * Always returns 0.
+ */
+int32_t
+shard_link_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+               int32_t op_ret, int32_t op_errno, inode_t *inode,
+               struct iatt *buf, struct iatt *preparent,
+               struct iatt *postparent, dict_t *xdata)
+{
+    shard_local_t *local = frame->local;
+
+    if (op_ret < 0) {
+        shard_common_failure_unwind(GF_FOP_LINK, frame, op_ret, op_errno);
+        return 0;
+    }
+
+    shard_inode_ctx_set(inode, this, buf, 0,
+                        SHARD_MASK_NLINK | SHARD_MASK_TIMES);
+
+    /* Report the size and block count saved in local->prebuf. */
+    buf->ia_size = local->prebuf.ia_size;
+    buf->ia_blocks = local->prebuf.ia_blocks;
+
+    SHARD_STACK_UNWIND(link, frame, op_ret, op_errno, inode, buf, preparent,
+                       postparent, xdata);
+    return 0;
+}
+
+/*
+ * Continuation run after the base file was refreshed for a link: wind the
+ * link with the locs and request dict kept in the local, or unwind with the
+ * stored error when the refresh failed. Always returns 0.
+ */
+int
+shard_post_lookup_link_handler(call_frame_t *frame, xlator_t *this)
+{
+    shard_local_t *local = frame->local;
+
+    if (local->op_ret >= 0) {
+        STACK_WIND(frame, shard_link_cbk, FIRST_CHILD(this),
+                   FIRST_CHILD(this)->fops->link, &local->loc, &local->loc2,
+                   local->xattr_req);
+        return 0;
+    }
+
+    SHARD_STACK_UNWIND(link, frame, local->op_ret, local->op_errno, NULL,
+                       NULL, NULL, NULL, NULL);
+    return 0;
+}
+
+/*
+ * link fop entry point of the shard translator.
+ *
+ * When the source inode's stored block size is zero the link is tail-wound
+ * to the child. Otherwise both locs and the request dict are saved in a
+ * local and the base file is refreshed, continuing in
+ * shard_post_lookup_link_handler.
+ *
+ * Always returns 0; every failure is reported by unwinding with ENOMEM.
+ */
+int32_t
+shard_link(call_frame_t *frame, xlator_t *this, loc_t *oldloc, loc_t *newloc,
+           dict_t *xdata)
+{
+    shard_local_t *local = NULL;
+    uint64_t shard_size = 0;
+    int rc = -1;
+
+    rc = shard_inode_ctx_get_block_size(oldloc->inode, this, &shard_size);
+    if (rc != 0) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, SHARD_MSG_INODE_CTX_GET_FAILED,
+               "Failed to get block "
+               "size from inode ctx of %s",
+               uuid_utoa(oldloc->inode->gfid));
+        goto err;
+    }
+
+    if (shard_size == 0) {
+        STACK_WIND_TAIL(frame, FIRST_CHILD(this), FIRST_CHILD(this)->fops->link,
+                        oldloc, newloc, xdata);
+        return 0;
+    }
+
+    /* Remember the inode table the first time one is seen. */
+    if (this->itable == NULL)
+        this->itable = oldloc->inode->table;
+
+    local = mem_get0(this->local_pool);
+    if (local == NULL)
+        goto err;
+
+    frame->local = local;
+
+    loc_copy(&local->loc, oldloc);
+    loc_copy(&local->loc2, newloc);
+
+    /* Reuse the caller's dict when there is one, else start a fresh one. */
+    if (xdata != NULL)
+        local->xattr_req = dict_ref(xdata);
+    else
+        local->xattr_req = dict_new();
+    if (local->xattr_req == NULL)
+        goto err;
+
+    shard_refresh_base_file(frame, this, &local->loc, NULL,
+                            shard_post_lookup_link_handler);
+    return 0;
+
+err:
+    shard_common_failure_unwind(GF_FOP_LINK, frame, -1, ENOMEM);
+    return 0;
+}
+
+int
+shard_unlink_shards_do(call_frame_t *frame, xlator_t *this, inode_t *inode);
+
+/*
+ * Continuation run once the shards to be unlinked have been looked up.
+ *
+ * A lookup failure other than ENOENT is logged and the deletion is not
+ * attempted. Otherwise the recorded status is cleared and the shards are
+ * unlinked via shard_unlink_shards_do().
+ *
+ * Always returns 0.
+ */
+int
+shard_post_lookup_shards_unlink_handler(call_frame_t *frame, xlator_t *this)
+{
+    uuid_t gfid = {
+        0,
+    };
+    shard_local_t *local = frame->local;
+
+    if ((local->op_ret < 0) && (local->op_errno != ENOENT)) {
+        /* The gfid is only needed for the log message: prefer the resolved
+         * base inode and fall back to the gfid saved in local. */
+        if (local->resolver_base_inode)
+            gf_uuid_copy(gfid, local->resolver_base_inode->gfid);
+        else
+            gf_uuid_copy(gfid, local->base_gfid);
+
+        gf_msg(this->name, GF_LOG_ERROR, local->op_errno, SHARD_MSG_FOP_FAILED,
+               "failed to delete shards of %s", uuid_utoa(gfid));
+        return 0;
+    }
+
+    /* ENOENT from the lookup is not an error for deletion purposes. */
+    local->op_errno = 0;
+    local->op_ret = 0;
+
+    shard_unlink_shards_do(frame, this, local->resolver_base_inode);
+    return 0;
+}
+
+/*
+ * Continuation run after the shards of a file have been resolved.
+ *
+ * Marks the shard lookup as barriered, then either looks up the shards that
+ * still need it (local->call_count != 0) or goes straight to unlinking them.
+ *
+ * Always returns 0.
+ */
+int
+shard_post_resolve_unlink_handler(call_frame_t *frame, xlator_t *this)
+{
+    shard_local_t *local = frame->local;
+
+    local->lookup_shards_barriered = _gf_true;
+
+    if (local->call_count) {
+        shard_common_lookup_shards(frame, this, local->resolver_base_inode,
+                                   shard_post_lookup_shards_unlink_handler);
+        return 0;
+    }
+
+    shard_unlink_shards_do(frame, this, local->resolver_base_inode);
+    return 0;
+}
+
+/*
+ * Drop the in-memory state kept for one shard after it has been unlinked.
+ *
+ * @local:           holds inode_list[] and first_block for the current batch.
+ * @shard_block_num: block number of the shard; its slot in inode_list[] is
+ *                   (shard_block_num - local->first_block).
+ *
+ * The shard inode is removed from the ilist and from the to-fsync list when
+ * it is on them, the matching references on the shard and base inodes are
+ * released, and the shard's dentry under the .shard inode is unlinked and
+ * the inode forgotten.
+ */
+void
+shard_unlink_block_inode(shard_local_t *local, int shard_block_num)
+{
+    char block_bname[256] = {
+        0,
+    };
+    uuid_t gfid = {
+        0,
+    };
+    inode_t *inode = NULL;
+    inode_t *base_inode = NULL;
+    xlator_t *this = NULL;
+    shard_priv_t *priv = NULL;
+    shard_inode_ctx_t *ctx = NULL;
+    shard_inode_ctx_t *base_ictx = NULL;
+    int unref_base_inode = 0;
+    int unref_shard_inode = 0;
+
+    this = THIS;
+    priv = this->private;
+
+    inode = local->inode_list[shard_block_num - local->first_block];
+    shard_inode_ctx_get(inode, this, &ctx);
+    /* NOTE(review): ctx is dereferenced without a NULL check; this assumes
+     * the shard inode always has a ctx at this point - confirm. */
+    base_inode = ctx->base_inode;
+    /* The base gfid names the shard; take it from the base inode when one is
+     * attached to the ctx, else from the gfid stored in the ctx. */
+    if (base_inode)
+        gf_uuid_copy(gfid, base_inode->gfid);
+    else
+        gf_uuid_copy(gfid, ctx->base_gfid);
+    shard_make_block_bname(shard_block_num, gfid, block_bname,
+                           sizeof(block_bname));
+
+    /* Locks are taken in the order priv -> base inode -> shard inode and
+     * released in reverse, with priv->lock held until the very end. */
+    LOCK(&priv->lock);
+    if (base_inode)
+        LOCK(&base_inode->lock);
+    LOCK(&inode->lock);
+    {
+        __shard_inode_ctx_get(inode, this, &ctx);
+        /* On the ilist: take it off and account for one ref on each of the
+         * base and shard inodes to be dropped below. */
+        if (!list_empty(&ctx->ilist)) {
+            list_del_init(&ctx->ilist);
+            priv->inode_count--;
+            unref_base_inode++;
+            unref_shard_inode++;
+            GF_ASSERT(priv->inode_count >= 0);
+        }
+        /* Pending fsync: take it off the to-fsync list, account for one more
+         * ref on each inode and lower the base inode's fsync count. */
+        if (ctx->fsync_needed) {
+            unref_base_inode++;
+            unref_shard_inode++;
+            list_del_init(&ctx->to_fsync_list);
+            if (base_inode) {
+                __shard_inode_ctx_get(base_inode, this, &base_ictx);
+                base_ictx->fsync_count--;
+            }
+        }
+    }
+    UNLOCK(&inode->lock);
+    if (base_inode)
+        UNLOCK(&base_inode->lock);
+
+    inode_unlink(inode, priv->dot_shard_inode, block_bname);
+    inode_ref_reduce_by_n(inode, unref_shard_inode);
+    inode_forget(inode, 0);
+
+    if (base_inode && unref_base_inode)
+        inode_ref_reduce_by_n(base_inode, unref_base_inode);
+    UNLOCK(&priv->lock);
+}
+
+/*
+ * Unwind a rename fop to the caller using the status, iatts and response
+ * dict accumulated in the frame's local.
+ *
+ * Always returns 0.
+ */
+int
+shard_rename_cbk(call_frame_t *frame, xlator_t *this)
+{
+    shard_local_t *local = frame->local;
+
+    SHARD_STACK_UNWIND(rename, frame, local->op_ret, local->op_errno,
+                       &local->prebuf, &local->preoldparent,
+                       &local->postoldparent, &local->prenewparent,
+                       &local->postnewparent, local->xattr_rsp);
+    return 0;
+}
+
+/*
+ * Unwind an unlink fop to the caller using the status, parent iatts and
+ * response dict accumulated in the frame's local.
+ *
+ * Always returns 0.
+ */
+int32_t
+shard_unlink_cbk(call_frame_t *frame, xlator_t *this)
+{
+    shard_local_t *local = NULL;
+
+    local = frame->local;
+
+    SHARD_STACK_UNWIND(unlink, frame, local->op_ret, local->op_errno,
+                       &local->preoldparent, &local->postoldparent,
+                       local->xattr_rsp);
+    return 0;
+}
+
+/*
+ * Per-shard callback for the unlinks issued by shard_unlink_shards_do().
+ *
+ * @cookie carries the block number of the shard. A failure is recorded in
+ * local; a success drops the shard's in-memory state. In both cases the
+ * barrier the issuer is waiting on is woken.
+ *
+ * Always returns 0.
+ */
+int
+shard_unlink_shards_do_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+                           int32_t op_ret, int32_t op_errno,
+                           struct iatt *preparent, struct iatt *postparent,
+                           dict_t *xdata)
+{
+    shard_local_t *local = frame->local;
+    int shard_block_num = (long)cookie;
+
+    if (op_ret < 0) {
+        local->op_ret = op_ret;
+        local->op_errno = op_errno;
+    } else {
+        shard_unlink_block_inode(local, shard_block_num);
+    }
+
+    syncbarrier_wake(&local->barrier);
+    return 0;
+}
+
+/*
+ * Unlink every shard present in local->inode_list[] and wait for all of the
+ * unlinks to complete.
+ *
+ * @frame: frame whose local describes the batch (first_block, num_blocks,
+ *         inode_list, barrier, xflag, xattr_req).
+ * @this:  shard translator instance.
+ * @inode: base inode, or NULL, in which case local->base_gfid names the
+ *         base file.
+ *
+ * Per-shard results are recorded in local->op_ret/op_errno by
+ * shard_unlink_shards_do_cbk(). Always returns 0.
+ */
+int
+shard_unlink_shards_do(call_frame_t *frame, xlator_t *this, inode_t *inode)
+{
+    int i = 0;
+    int ret = -1;
+    int count = 0;
+    uint32_t cur_block = 0;
+    uint32_t cur_block_idx = 0; /*this is idx into inode_list[] array */
+    char *bname = NULL;
+    char path[PATH_MAX] = {
+        0,
+    };
+    uuid_t gfid = {
+        0,
+    };
+    loc_t loc = {
+        0,
+    };
+    gf_boolean_t wind_failed = _gf_false;
+    shard_local_t *local = NULL;
+    shard_priv_t *priv = NULL;
+
+    priv = this->private;
+    local = frame->local;
+
+    if (inode)
+        gf_uuid_copy(gfid, inode->gfid);
+    else
+        gf_uuid_copy(gfid, local->base_gfid);
+
+    /* Count the shards that actually have an inode; only those are unlinked
+     * and only those are waited for on the barrier. */
+    for (i = 0; i < local->num_blocks; i++) {
+        if (!local->inode_list[i])
+            continue;
+        count++;
+    }
+
+    if (!count) {
+        /* callcount = 0 implies that all of the shards that need to be
+         * unlinked are non-existent (in other words the file is full of
+         * holes).
+         */
+        gf_msg_debug(this->name, 0,
+                     "All shards that need to be "
+                     "unlinked are non-existent: %s",
+                     uuid_utoa(gfid));
+        return 0;
+    }
+
+    SHARD_SET_ROOT_FS_ID(frame, local);
+    local->barrier.waitfor = count;
+    cur_block = cur_block_idx + local->first_block;
+
+    while (cur_block_idx < local->num_blocks) {
+        if (!local->inode_list[cur_block_idx])
+            goto next;
+
+        /* After a failed wind setup the remaining shards are not wound; the
+         * callback is invoked directly so the barrier still receives one
+         * wake per counted shard. */
+        if (wind_failed) {
+            shard_unlink_shards_do_cbk(frame, (void *)(long)cur_block, this, -1,
+                                       ENOMEM, NULL, NULL, NULL);
+            goto next;
+        }
+
+        /* Build the loc of the shard as an entry under the .shard inode. */
+        shard_make_block_abspath(cur_block, gfid, path, sizeof(path));
+        bname = strrchr(path, '/') + 1;
+        loc.parent = inode_ref(priv->dot_shard_inode);
+        ret = inode_path(loc.parent, bname, (char **)&(loc.path));
+        if (ret < 0) {
+            gf_msg(this->name, GF_LOG_ERROR, 0, SHARD_MSG_INODE_PATH_FAILED,
+                   "Inode path failed"
+                   " on %s, base file gfid = %s",
+                   bname, uuid_utoa(gfid));
+            local->op_ret = -1;
+            local->op_errno = ENOMEM;
+            loc_wipe(&loc);
+            wind_failed = _gf_true;
+            shard_unlink_shards_do_cbk(frame, (void *)(long)cur_block, this, -1,
+                                       ENOMEM, NULL, NULL, NULL);
+            goto next;
+        }
+
+        loc.name = strrchr(loc.path, '/');
+        if (loc.name)
+            loc.name++;
+        loc.inode = inode_ref(local->inode_list[cur_block_idx]);
+
+        /* The block number travels as the cookie so the callback can find
+         * the shard's slot in inode_list[]. */
+        STACK_WIND_COOKIE(frame, shard_unlink_shards_do_cbk,
+                          (void *)(long)cur_block, FIRST_CHILD(this),
+                          FIRST_CHILD(this)->fops->unlink, &loc, local->xflag,
+                          local->xattr_req);
+        loc_wipe(&loc);
+    next:
+        cur_block++;
+        cur_block_idx++;
+    }
+    /* Block until every counted shard has reported through the callback. */
+    syncbarrier_wait(&local->barrier, count);
+    SHARD_UNSET_ROOT_FS_ID(frame, local);
+    return 0;
+}
+
+/*
+ * Delete one batch of @now shards, starting at block @first_block, of the
+ * base file whose gfid is the name of the marker @entry.
+ *
+ * @cleanup_frame: background deletion frame; its local is (re)initialised
+ *                 for this batch.
+ * @this:          shard translator instance.
+ * @now:           number of shards in this batch.
+ * @first_block:   block number of the first shard in the batch.
+ * @entry:         marker dirent; d_name is parsed as the base file's gfid.
+ *
+ * Returns 0 on success, -ENOMEM if the inode list cannot be allocated, a
+ * negative errno if the barrier cannot be initialised, or -local->op_errno
+ * when local->op_ret is non-zero after the batch has run.
+ */
+int
+shard_regulated_shards_deletion(call_frame_t *cleanup_frame, xlator_t *this,
+                                int now, int first_block, gf_dirent_t *entry)
+{
+    int i = 0;
+    int ret = 0;
+    shard_local_t *local = NULL;
+    uuid_t gfid = {
+        0,
+    };
+
+    local = cleanup_frame->local;
+
+    local->inode_list = GF_CALLOC(now, sizeof(inode_t *),
+                                  gf_shard_mt_inode_list);
+    if (!local->inode_list)
+        return -ENOMEM;
+
+    local->first_block = first_block;
+    local->last_block = first_block + now - 1;
+    local->num_blocks = now;
+    gf_uuid_parse(entry->d_name, gfid);
+    gf_uuid_copy(local->base_gfid, gfid);
+    local->resolver_base_inode = inode_find(this->itable, gfid);
+    local->call_count = 0;
+    ret = syncbarrier_init(&local->barrier);
+    if (ret) {
+        /* Capture errno before the cleanup calls below can overwrite it,
+         * and never let a zero errno turn this failure into success. */
+        ret = errno ? -errno : -ENOMEM;
+        GF_FREE(local->inode_list);
+        local->inode_list = NULL;
+        inode_unref(local->resolver_base_inode);
+        local->resolver_base_inode = NULL;
+        return ret;
+    }
+    shard_common_resolve_shards(cleanup_frame, this,
+                                shard_post_resolve_unlink_handler);
+
+    /* Release whatever inodes are still held in the batch's list. */
+    for (i = 0; i < local->num_blocks; i++) {
+        if (local->inode_list[i])
+            inode_unref(local->inode_list[i]);
+    }
+    GF_FREE(local->inode_list);
+    local->inode_list = NULL;
+    if (local->op_ret)
+        ret = -local->op_errno;
+    syncbarrier_destroy(&local->barrier);
+    inode_unref(local->resolver_base_inode);
+    local->resolver_base_inode = NULL;
+    /* Reset the frame's stack so it can be reused for the next batch. */
+    STACK_RESET(cleanup_frame->root);
+    return ret;
+}
+
+/*
+ * Delete all shards belonging to the base file named by the marker @entry
+ * and then remove the marker itself.
+ *
+ * @cleanup_frame: background deletion frame; its local supplies xattr_req
+ *                 and deletion_rate.
+ * @this:          shard translator instance.
+ * @entry:         marker dirent under the remove-me directory; d_name is the
+ *                 gfid of the base file.
+ * @inode:         inode of the marker file.
+ *
+ * The block size and file size are read from the marker's xattrs to work out
+ * how many shards exist. If a gfid lookup shows the base file is still
+ * present, the shards are left alone and only the marker is removed.
+ * Otherwise shards are deleted in batches of at most local->deletion_rate.
+ *
+ * Returns 0 on success or a negative errno. A block size of 0 on the marker
+ * is rejected with -EINVAL, since it would otherwise be used as a divisor.
+ */
+int
+__shard_delete_shards_of_entry(call_frame_t *cleanup_frame, xlator_t *this,
+                               gf_dirent_t *entry, inode_t *inode)
+{
+    int ret = 0;
+    int shard_count = 0;
+    int first_block = 0;
+    int now = 0;
+    uint64_t size = 0;
+    uint64_t block_size = 0;
+    uint64_t size_array[4] = {
+        0,
+    };
+    void *bsize = NULL;
+    void *size_attr = NULL;
+    dict_t *xattr_rsp = NULL;
+    loc_t loc = {
+        0,
+    };
+    shard_local_t *local = NULL;
+    shard_priv_t *priv = NULL;
+
+    priv = this->private;
+    local = cleanup_frame->local;
+    ret = dict_reset(local->xattr_req);
+    if (ret) {
+        gf_msg(this->name, GF_LOG_WARNING, 0, SHARD_MSG_DICT_OP_FAILED,
+               "Failed to reset dict");
+        ret = -ENOMEM;
+        goto err;
+    }
+
+    /* Ask the lookup below to return the block-size and file-size xattrs. */
+    ret = dict_set_uint64(local->xattr_req, GF_XATTR_SHARD_BLOCK_SIZE, 0);
+    if (ret) {
+        gf_msg(this->name, GF_LOG_WARNING, 0, SHARD_MSG_DICT_OP_FAILED,
+               "Failed to set dict value: key:%s", GF_XATTR_SHARD_BLOCK_SIZE);
+        ret = -ENOMEM;
+        goto err;
+    }
+
+    ret = dict_set_uint64(local->xattr_req, GF_XATTR_SHARD_FILE_SIZE, 8 * 4);
+    if (ret) {
+        gf_msg(this->name, GF_LOG_WARNING, 0, SHARD_MSG_DICT_OP_FAILED,
+               "Failed to set dict value: key:%s", GF_XATTR_SHARD_FILE_SIZE);
+        ret = -ENOMEM;
+        goto err;
+    }
+
+    loc.inode = inode_ref(inode);
+    loc.parent = inode_ref(priv->dot_shard_rm_inode);
+    ret = inode_path(loc.parent, entry->d_name, (char **)&(loc.path));
+    if (ret < 0) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, SHARD_MSG_INODE_PATH_FAILED,
+               "Inode path  failed on %s", entry->d_name);
+        ret = -ENOMEM;
+        goto err;
+    }
+
+    loc.name = strrchr(loc.path, '/');
+    if (loc.name)
+        loc.name++;
+    ret = syncop_lookup(FIRST_CHILD(this), &loc, NULL, NULL, local->xattr_req,
+                        &xattr_rsp);
+    if (ret)
+        goto err;
+
+    ret = dict_get_ptr(xattr_rsp, GF_XATTR_SHARD_BLOCK_SIZE, &bsize);
+    if (ret) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, SHARD_MSG_DICT_OP_FAILED,
+               "Failed to get dict value: key:%s", GF_XATTR_SHARD_BLOCK_SIZE);
+        goto err;
+    }
+    block_size = ntoh64(*((uint64_t *)bsize));
+    if (!block_size) {
+        /* block_size is used as a divisor below; a zero value can only come
+         * from a bad xattr and must not reach the division. */
+        gf_msg(this->name, GF_LOG_ERROR, EINVAL,
+               SHARD_MSG_SHARDS_DELETION_FAILED,
+               "Invalid shard-block-size 0 on marker file %s", entry->d_name);
+        ret = -EINVAL;
+        goto err;
+    }
+
+    ret = dict_get_ptr(xattr_rsp, GF_XATTR_SHARD_FILE_SIZE, &size_attr);
+    if (ret) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, SHARD_MSG_DICT_OP_FAILED,
+               "Failed to get dict value: key:%s", GF_XATTR_SHARD_FILE_SIZE);
+        goto err;
+    }
+
+    /* The first element of the size xattr is the file size. */
+    memcpy(size_array, size_attr, sizeof(size_array));
+    size = ntoh64(size_array[0]);
+
+    /* Number of shards besides the base file: whole blocks minus one, plus
+     * one more for a trailing partial block. */
+    shard_count = (size / block_size) - 1;
+    if (shard_count < 0) {
+        gf_msg_debug(this->name, 0,
+                     "Size of %s hasn't grown beyond "
+                     "its shard-block-size. Nothing to delete. "
+                     "Returning",
+                     entry->d_name);
+        /* File size < shard-block-size, so nothing to delete */
+        ret = 0;
+        goto delete_marker;
+    }
+    if ((size % block_size) > 0)
+        shard_count++;
+
+    if (shard_count == 0) {
+        gf_msg_debug(this->name, 0,
+                     "Size of %s is exactly equal to "
+                     "its shard-block-size. Nothing to delete. "
+                     "Returning",
+                     entry->d_name);
+        ret = 0;
+        goto delete_marker;
+    }
+    gf_msg_debug(this->name, 0,
+                 "base file = %s, "
+                 "shard-block-size=%" PRIu64 ", file-size=%" PRIu64
+                 ", "
+                 "shard_count=%d",
+                 entry->d_name, block_size, size, shard_count);
+
+    /* Perform a gfid-based lookup to see if gfid corresponding to marker
+     * file's base name exists.
+     */
+    loc_wipe(&loc);
+    loc.inode = inode_new(this->itable);
+    if (!loc.inode) {
+        ret = -ENOMEM;
+        goto err;
+    }
+    gf_uuid_parse(entry->d_name, loc.gfid);
+    ret = syncop_lookup(FIRST_CHILD(this), &loc, NULL, NULL, NULL, NULL);
+    if (!ret) {
+        gf_msg_debug(this->name, 0,
+                     "Base shard corresponding to gfid "
+                     "%s is present. Skipping shard deletion. "
+                     "Returning",
+                     entry->d_name);
+        ret = 0;
+        goto delete_marker;
+    }
+
+    /* Block 0 is the base file; shards start at block 1. */
+    first_block = 1;
+
+    while (shard_count) {
+        /* Delete at most deletion_rate shards per batch. */
+        if (shard_count < local->deletion_rate) {
+            now = shard_count;
+            shard_count = 0;
+        } else {
+            now = local->deletion_rate;
+            shard_count -= local->deletion_rate;
+        }
+
+        gf_msg_debug(this->name, 0,
+                     "deleting %d shards starting from "
+                     "block %d of gfid %s",
+                     now, first_block, entry->d_name);
+        ret = shard_regulated_shards_deletion(cleanup_frame, this, now,
+                                              first_block, entry);
+        if (ret)
+            goto err;
+        first_block += now;
+    }
+
+delete_marker:
+    /* Rebuild the marker's loc (it may have been reused for the gfid lookup)
+     * and remove the marker file. */
+    loc_wipe(&loc);
+    loc.inode = inode_ref(inode);
+    loc.parent = inode_ref(priv->dot_shard_rm_inode);
+    ret = inode_path(loc.parent, entry->d_name, (char **)&(loc.path));
+    if (ret < 0) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, SHARD_MSG_INODE_PATH_FAILED,
+               "Inode path  failed on %s", entry->d_name);
+        ret = -ENOMEM;
+        goto err;
+    }
+    loc.name = strrchr(loc.path, '/');
+    if (loc.name)
+        loc.name++;
+    ret = syncop_unlink(FIRST_CHILD(this), &loc, NULL, NULL);
+    if (ret)
+        gf_msg(this->name, GF_LOG_ERROR, 0, SHARD_MSG_SHARDS_DELETION_FAILED,
+               "Failed to delete %s "
+               "from /%s",
+               entry->d_name, GF_SHARD_REMOVE_ME_DIR);
+err:
+    if (xattr_rsp)
+        dict_unref(xattr_rsp);
+    loc_wipe(&loc);
+    return ret;
+}
+
+/*
+ * Delete the shards of one marker entry while holding a non-blocking write
+ * entrylk on the marker's name under the remove-me directory.
+ *
+ * If the lock attempt fails with -EAGAIN the entry is skipped and 0 is
+ * returned. Any other lock failure is returned as is. Otherwise the result
+ * of __shard_delete_shards_of_entry() is returned after unlocking.
+ */
+int
+shard_delete_shards_of_entry(call_frame_t *cleanup_frame, xlator_t *this,
+                             gf_dirent_t *entry, inode_t *inode)
+{
+    int ret = -1;
+    loc_t loc = {
+        0,
+    };
+    shard_priv_t *priv = this->private;
+
+    loc.inode = inode_ref(priv->dot_shard_rm_inode);
+
+    ret = syncop_entrylk(FIRST_CHILD(this), this->name, &loc, entry->d_name,
+                         ENTRYLK_LOCK_NB, ENTRYLK_WRLCK, NULL, NULL);
+    if (ret < 0) {
+        /* -EAGAIN from the non-blocking lock is not treated as an error. */
+        if (ret == -EAGAIN)
+            ret = 0;
+    } else {
+        ret = __shard_delete_shards_of_entry(cleanup_frame, this, entry, inode);
+        syncop_entrylk(FIRST_CHILD(this), this->name, &loc, entry->d_name,
+                       ENTRYLK_UNLOCK, ENTRYLK_WRLCK, NULL, NULL);
+    }
+
+    loc_wipe(&loc);
+    return ret;
+}
+
+/*
+ * Completion callback for shard_delete_shards(): destroys the frame that
+ * was used for the background deletion. @ret and @data are not used.
+ *
+ * Always returns 0.
+ */
+int
+shard_delete_shards_cbk(int ret, call_frame_t *frame, void *data)
+{
+    SHARD_STACK_DESTROY(frame);
+    return 0;
+}
+
+/*
+ * Make sure the inode of one of shard's internal directories is available
+ * in the loc kept in @local.
+ *
+ * @this:  shard translator instance.
+ * @local: holds dot_shard_loc / dot_shard_rm_loc and xattr_req.
+ * @type:  which internal directory to resolve.
+ *
+ * The inode is first searched in the inode table by gfid. If it is not
+ * there, the loc is initialised and a lookup carrying "gfid-req" is sent to
+ * the first child; on success the returned inode is linked.
+ *
+ * Returns 0 on success, -EINVAL for an unknown @type, -ENOMEM when the
+ * request dict cannot be prepared, or the negative errno of the lookup
+ * (-ENOENT is returned without logging).
+ */
+int
+shard_resolve_internal_dir(xlator_t *this, shard_local_t *local,
+                           shard_internal_dir_type_t type)
+{
+    int ret = 0;
+    char *bname = NULL;
+    loc_t *loc = NULL;
+    shard_priv_t *priv = NULL;
+    uuid_t gfid = {
+        0,
+    };
+    struct iatt stbuf = {
+        0,
+    };
+
+    priv = this->private;
+
+    switch (type) {
+        case SHARD_INTERNAL_DIR_DOT_SHARD:
+            loc = &local->dot_shard_loc;
+            gf_uuid_copy(gfid, priv->dot_shard_gfid);
+            bname = GF_SHARD_DIR;
+            break;
+        case SHARD_INTERNAL_DIR_DOT_SHARD_REMOVE_ME:
+            loc = &local->dot_shard_rm_loc;
+            gf_uuid_copy(gfid, priv->dot_shard_rm_gfid);
+            bname = GF_SHARD_REMOVE_ME_DIR;
+            break;
+        default:
+            /* No loc exists for an unknown type; bail out rather than
+             * dereference a NULL loc below. */
+            ret = -EINVAL;
+            goto err;
+    }
+
+    loc->inode = inode_find(this->itable, gfid);
+    if (!loc->inode) {
+        ret = shard_init_internal_dir_loc(this, local, type);
+        if (ret)
+            goto err;
+        ret = dict_reset(local->xattr_req);
+        if (ret) {
+            gf_msg(this->name, GF_LOG_WARNING, 0, SHARD_MSG_DICT_OP_FAILED,
+                   "Failed to reset "
+                   "dict");
+            ret = -ENOMEM;
+            goto err;
+        }
+        /* The lookup must carry the directory's gfid; do not send it if the
+         * request dict could not be populated. */
+        ret = dict_set_gfuuid(local->xattr_req, "gfid-req", gfid, true);
+        if (ret) {
+            gf_msg(this->name, GF_LOG_WARNING, 0, SHARD_MSG_DICT_OP_FAILED,
+                   "Failed to set gfid-req for %s", bname);
+            ret = -ENOMEM;
+            goto err;
+        }
+        ret = syncop_lookup(FIRST_CHILD(this), loc, &stbuf, NULL,
+                            local->xattr_req, NULL);
+        if (ret < 0) {
+            /* A missing directory is reported to the caller silently. */
+            if (ret != -ENOENT)
+                gf_msg(this->name, GF_LOG_ERROR, -ret,
+                       SHARD_MSG_SHARDS_DELETION_FAILED,
+                       "Lookup on %s failed, exiting", bname);
+            goto err;
+        } else {
+            shard_link_internal_dir_inode(local, loc->inode, &stbuf, type);
+        }
+    }
+    ret = 0;
+err:
+    return ret;
+}
+
+/*
+ * Look up the marker file @entry by name under the directory that
+ * local->fd refers to, and attach the resulting inode to the entry.
+ *
+ * Returns 0 on success with entry->inode holding a new reference, -ENOMEM
+ * when the inode or its path cannot be set up, or the negative errno of
+ * the lookup.
+ */
+int
+shard_lookup_marker_entry(xlator_t *this, shard_local_t *local,
+                          gf_dirent_t *entry)
+{
+    int ret = 0;
+    loc_t loc = {
+        0,
+    };
+
+    loc.inode = inode_new(this->itable);
+    if (loc.inode == NULL) {
+        ret = -ENOMEM;
+        goto out;
+    }
+    loc.parent = inode_ref(local->fd->inode);
+
+    ret = inode_path(loc.parent, entry->d_name, (char **)&(loc.path));
+    if (ret < 0) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, SHARD_MSG_INODE_PATH_FAILED,
+               "Inode path failed on %s", entry->d_name);
+        ret = -ENOMEM;
+        goto out;
+    }
+
+    loc.name = strrchr(loc.path, '/');
+    if (loc.name != NULL)
+        loc.name++;
+
+    ret = syncop_lookup(FIRST_CHILD(this), &loc, NULL, NULL, NULL, NULL);
+    if (ret >= 0) {
+        /* The entry takes its own reference; loc's is dropped below. */
+        entry->inode = inode_ref(loc.inode);
+        ret = 0;
+    }
+out:
+    loc_wipe(&loc);
+    return ret;
+}
+
+/*
+ * Background worker that crawls the remove-me directory and deletes the
+ * shards of every marker entry found there.
+ *
+ * @opaque: the call frame to use for the cleanup; a fresh local is
+ *          allocated and attached to it here.
+ *
+ * Returns 0 once the crawl finishes, or when either internal directory is
+ * absent, and a negative errno on setup failure. On the early-exit paths
+ * priv->bg_del_state is reset to SHARD_BG_DELETION_NONE.
+ */
+int
+shard_delete_shards(void *opaque)
+{
+    int ret = 0;
+    off_t offset = 0;
+    loc_t loc = {
+        0,
+    };
+    inode_t *link_inode = NULL;
+    xlator_t *this = NULL;
+    shard_priv_t *priv = NULL;
+    shard_local_t *local = NULL;
+    gf_dirent_t entries;
+    gf_dirent_t *entry = NULL;
+    call_frame_t *cleanup_frame = NULL;
+    gf_boolean_t done = _gf_false;
+
+    this = THIS;
+    priv = this->private;
+    INIT_LIST_HEAD(&entries.list);
+
+    cleanup_frame = opaque;
+
+    local = mem_get0(this->local_pool);
+    if (!local) {
+        gf_msg(this->name, GF_LOG_WARNING, ENOMEM, SHARD_MSG_MEMALLOC_FAILED,
+               "Failed to create local to "
+               "delete shards");
+        ret = -ENOMEM;
+        goto err;
+    }
+    cleanup_frame->local = local;
+    local->fop = GF_FOP_UNLINK;
+
+    local->xattr_req = dict_new();
+    if (!local->xattr_req) {
+        ret = -ENOMEM;
+        goto err;
+    }
+    local->deletion_rate = priv->deletion_rate;
+
+    /* Both internal directories must exist for there to be anything to
+     * clean up; their absence is not an error. */
+    ret = shard_resolve_internal_dir(this, local, SHARD_INTERNAL_DIR_DOT_SHARD);
+    if (ret == -ENOENT) {
+        gf_msg_debug(this->name, 0,
+                     ".shard absent. Nothing to"
+                     " delete. Exiting");
+        ret = 0;
+        goto err;
+    } else if (ret < 0) {
+        goto err;
+    }
+
+    ret = shard_resolve_internal_dir(this, local,
+                                     SHARD_INTERNAL_DIR_DOT_SHARD_REMOVE_ME);
+    if (ret == -ENOENT) {
+        gf_msg_debug(this->name, 0,
+                     ".remove_me absent. "
+                     "Nothing to delete. Exiting");
+        ret = 0;
+        goto err;
+    } else if (ret < 0) {
+        goto err;
+    }
+
+    /* The crawl reads the remove-me directory through an anonymous fd. */
+    local->fd = fd_anonymous(local->dot_shard_rm_loc.inode);
+    if (!local->fd) {
+        ret = -ENOMEM;
+        goto err;
+    }
+
+    for (;;) {
+        offset = 0;
+        /* State transitions per pass: LAUNCHING -> IN_PROGRESS starts a
+         * crawl; IN_PROGRESS -> NONE ends the worker.
+         * NOTE(review): presumably another thread sets LAUNCHING again to
+         * request a further pass - confirm against the code that sets
+         * bg_del_state. */
+        LOCK(&priv->lock);
+        {
+            if (priv->bg_del_state == SHARD_BG_DELETION_LAUNCHING) {
+                priv->bg_del_state = SHARD_BG_DELETION_IN_PROGRESS;
+            } else if (priv->bg_del_state == SHARD_BG_DELETION_IN_PROGRESS) {
+                priv->bg_del_state = SHARD_BG_DELETION_NONE;
+                done = _gf_true;
+            }
+        }
+        UNLOCK(&priv->lock);
+        if (done)
+            break;
+        /* Read the directory in chunks until readdirp returns 0. */
+        while (
+            (ret = syncop_readdirp(FIRST_CHILD(this), local->fd, 131072, offset,
+                                   &entries, local->xattr_req, NULL))) {
+            if (ret > 0)
+                ret = 0;
+            list_for_each_entry(entry, &entries.list, list)
+            {
+                /* Remember where to resume the next readdirp from. */
+                offset = entry->d_off;
+
+                if (!strcmp(entry->d_name, ".") || !strcmp(entry->d_name, ".."))
+                    continue;
+
+                /* readdirp did not supply an inode: look the entry up, and
+                 * skip it if that fails. */
+                if (!entry->inode) {
+                    ret = shard_lookup_marker_entry(this, local, entry);
+                    if (ret < 0)
+                        continue;
+                }
+                link_inode = inode_link(entry->inode, local->fd->inode,
+                                        entry->d_name, &entry->d_stat);
+
+                gf_msg_debug(this->name, 0,
+                             "Initiating deletion of "
+                             "shards of gfid %s",
+                             entry->d_name);
+                ret = shard_delete_shards_of_entry(cleanup_frame, this, entry,
+                                                   link_inode);
+                inode_unlink(link_inode, local->fd->inode, entry->d_name);
+                inode_unref(link_inode);
+                if (ret) {
+                    gf_msg(this->name, GF_LOG_ERROR, -ret,
+                           SHARD_MSG_SHARDS_DELETION_FAILED,
+                           "Failed to clean up shards of gfid %s",
+                           entry->d_name);
+                    continue;
+                }
+                gf_msg(this->name, GF_LOG_INFO, 0,
+                       SHARD_MSG_SHARD_DELETION_COMPLETED,
+                       "Deleted "
+                       "shards of gfid=%s from backend",
+                       entry->d_name);
+            }
+            gf_dirent_free(&entries);
+            /* ret here is whatever the last entry (or a failed readdirp)
+             * left behind; a non-zero value ends this crawl. */
+            if (ret)
+                break;
+        }
+    }
+    ret = 0;
+    loc_wipe(&loc);
+    return ret;
+
+err:
+    LOCK(&priv->lock);
+    {
+        priv->bg_del_state = SHARD_BG_DELETION_NONE;
+    }
+    UNLOCK(&priv->lock);
+    loc_wipe(&loc);
+    return ret;
+}
+
+/*
+ * Callback for the inodelk unlock wound by shard_unlock_inodelk(): logs a
+ * failed unlock and destroys the lock frame.
+ *
+ * Always returns 0.
+ */
+int
+shard_unlock_inodelk_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+                         int32_t op_ret, int32_t op_errno, dict_t *xdata)
+{
+    if (op_ret != 0) {
+        gf_msg(this->name, GF_LOG_ERROR, op_errno, SHARD_MSG_FOP_FAILED,
+               "Unlock failed. Please check brick logs for "
+               "more details");
+    }
+
+    SHARD_STACK_DESTROY(frame);
+    return 0;
+}
+
+/*
+ * Release the inodelk recorded in the frame's local.
+ *
+ * The unlock is wound on the separate lock frame stored in
+ * local->inodelk_frame (which is detached from local first), using the
+ * domain and flock kept in that frame's local and the loc kept in @frame's
+ * local. The lock is then marked as no longer acquired.
+ *
+ * Always returns 0.
+ */
+int
+shard_unlock_inodelk(call_frame_t *frame, xlator_t *this)
+{
+    shard_local_t *local = frame->local;
+    call_frame_t *lk_frame = local->inodelk_frame;
+    shard_local_t *lk_local = lk_frame->local;
+    shard_inodelk_t *lock = &lk_local->int_inodelk;
+    loc_t *loc = &local->int_inodelk.loc;
+
+    local->inodelk_frame = NULL;
+    lock->flock.l_type = F_UNLCK;
+
+    STACK_WIND(lk_frame, shard_unlock_inodelk_cbk, FIRST_CHILD(this),
+               FIRST_CHILD(this)->fops->inodelk, lock->domain, loc, F_SETLK,
+               &lock->flock, NULL);
+    local->int_inodelk.acquired_lock = _gf_false;
+    return 0;
+}
+
+int
+shard_rename_src_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+                     int32_t op_ret, int32_t op_errno, struct iatt *buf,
+                     struct iatt *preoldparent, struct iatt *postoldparent,
+                     struct iatt *prenewparent, struct iatt *postnewparent,
+                     dict_t *xdata);
+/*
+ * Wind the rename of the source base file to the first child.
+ *
+ * When local->dst_block_size is non-zero the destination loc is rebuilt
+ * from loc2's parent and name only (no inode is set on it); otherwise
+ * local->loc2 is used as is. A failure to build the path is unwound with
+ * -1/ENOMEM.
+ *
+ * Always returns 0.
+ */
+int
+shard_rename_src_base_file(call_frame_t *frame, xlator_t *this)
+{
+    shard_local_t *local = frame->local;
+    loc_t tmp_loc = {
+        0,
+    };
+    loc_t *dst_loc = &local->loc2;
+    int ret = 0;
+
+    if (local->dst_block_size) {
+        tmp_loc.parent = inode_ref(local->loc2.parent);
+        ret = inode_path(tmp_loc.parent, local->loc2.name,
+                         (char **)&tmp_loc.path);
+        if (ret < 0) {
+            gf_msg(this->name, GF_LOG_ERROR, 0, SHARD_MSG_INODE_PATH_FAILED,
+                   "Inode path failed"
+                   " on pargfid=%s bname=%s",
+                   uuid_utoa(tmp_loc.parent->gfid), local->loc2.name);
+            local->op_ret = -1;
+            local->op_errno = ENOMEM;
+            loc_wipe(&tmp_loc);
+            shard_common_failure_unwind(local->fop, frame, local->op_ret,
+                                        local->op_errno);
+            return 0;
+        }
+
+        tmp_loc.name = strrchr(tmp_loc.path, '/');
+        if (tmp_loc.name != NULL)
+            tmp_loc.name++;
+        dst_loc = &tmp_loc;
+    }
+
+    /* To-Do: Request open-fd count on dst base file */
+    STACK_WIND(frame, shard_rename_src_cbk, FIRST_CHILD(this),
+               FIRST_CHILD(this)->fops->rename, &local->loc, dst_loc,
+               local->xattr_req);
+    loc_wipe(&tmp_loc);
+    return 0;
+}
+
+int
+shard_unlink_base_file(call_frame_t *frame, xlator_t *this);
+
+/*
+ * Callback for the xattrop that stores the size attributes on the marker
+ * file.
+ *
+ * On failure the fop is unwound with the child's error. On success the
+ * marker's dentry under the remove-me inode is unlinked from the inode
+ * table and the original fop (unlink or rename) continues on the base file.
+ *
+ * Always returns 0.
+ */
+int
+shard_set_size_attrs_on_marker_file_cbk(call_frame_t *frame, void *cookie,
+                                        xlator_t *this, int32_t op_ret,
+                                        int32_t op_errno, dict_t *dict,
+                                        dict_t *xdata)
+{
+    shard_local_t *local = frame->local;
+    shard_priv_t *priv = this->private;
+
+    if (op_ret < 0) {
+        gf_msg(this->name, GF_LOG_ERROR, op_errno, SHARD_MSG_FOP_FAILED,
+               "Xattrop on marker file failed "
+               "while performing %s; entry gfid=%s",
+               gf_fop_string(local->fop), local->newloc.name);
+        shard_common_failure_unwind(local->fop, frame, op_ret, op_errno);
+        return 0;
+    }
+
+    inode_unlink(local->newloc.inode, priv->dot_shard_rm_inode,
+                 local->newloc.name);
+
+    /* Resume the fop that needed the marker. */
+    switch (local->fop) {
+        case GF_FOP_UNLINK:
+            shard_unlink_base_file(frame, this);
+            break;
+        case GF_FOP_RENAME:
+            shard_rename_src_base_file(frame, this);
+            break;
+        default:
+            break;
+    }
+    return 0;
+}
+
+/*
+ * Store the block size and file size on the marker file through an xattrop.
+ *
+ * The block size comes from local->block_size for unlink and from
+ * local->dst_block_size for rename; the size comes from
+ * local->prebuf.ia_size. Any failure to prepare the request is unwound
+ * with -1/ENOMEM.
+ *
+ * Always returns 0.
+ */
+int
+shard_set_size_attrs_on_marker_file(call_frame_t *frame, xlator_t *this)
+{
+    int op_errno = ENOMEM;
+    uint64_t bs = 0;
+    dict_t *xdata = NULL;
+    shard_local_t *local = frame->local;
+
+    xdata = dict_new();
+    if (xdata == NULL)
+        goto err;
+
+    switch (local->fop) {
+        case GF_FOP_UNLINK:
+            bs = local->block_size;
+            break;
+        case GF_FOP_RENAME:
+            bs = local->dst_block_size;
+            break;
+        default:
+            break;
+    }
+
+    /* Fills xdata with the size attributes; jumps to err on failure. */
+    SHARD_INODE_CREATE_INIT(this, bs, xdata, &local->newloc,
+                            local->prebuf.ia_size, 0, err);
+    STACK_WIND(frame, shard_set_size_attrs_on_marker_file_cbk,
+               FIRST_CHILD(this), FIRST_CHILD(this)->fops->xattrop,
+               &local->newloc, GF_XATTROP_GET_AND_SET, xdata, NULL);
+    dict_unref(xdata);
+    return 0;
+err:
+    if (xdata != NULL)
+        dict_unref(xdata);
+    shard_common_failure_unwind(local->fop, frame, -1, op_errno);
+    return 0;
+}
+
+/*
+ * Callback for the lookup on an already-existing marker file.
+ *
+ * On failure the fop is unwound with the child's error. On success the
+ * returned inode is linked under the remove-me inode, it replaces the
+ * placeholder inode in local->newloc, and the size attributes are then set
+ * on the marker.
+ *
+ * Always returns 0.
+ */
+int
+shard_lookup_marker_file_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+                             int32_t op_ret, int32_t op_errno, inode_t *inode,
+                             struct iatt *buf, dict_t *xdata,
+                             struct iatt *postparent)
+{
+    shard_local_t *local = frame->local;
+    shard_priv_t *priv = this->private;
+    inode_t *linked_inode = NULL;
+
+    if (op_ret < 0) {
+        gf_msg(this->name, GF_LOG_ERROR, op_errno, SHARD_MSG_FOP_FAILED,
+               "Lookup on marker file failed "
+               "while performing %s; entry gfid=%s",
+               gf_fop_string(local->fop), local->newloc.name);
+        shard_common_failure_unwind(local->fop, frame, op_ret, op_errno);
+        return 0;
+    }
+
+    linked_inode = inode_link(inode, priv->dot_shard_rm_inode,
+                              local->newloc.name, buf);
+    /* Swap the placeholder inode in newloc for the linked one. */
+    inode_unref(local->newloc.inode);
+    local->newloc.inode = linked_inode;
+
+    shard_set_size_attrs_on_marker_file(frame, this);
+    return 0;
+}
+
+/*
+ * Wind a lookup on the marker file described by local->newloc, using a
+ * request dict derived from local->xattr_req via shard_create_gfid_dict().
+ *
+ * If the dict cannot be created the fop is unwound with -1/ENOMEM.
+ * Always returns 0.
+ */
+int
+shard_lookup_marker_file(call_frame_t *frame, xlator_t *this)
+{
+    shard_local_t *local = frame->local;
+    dict_t *xattr_req = shard_create_gfid_dict(local->xattr_req);
+
+    if (xattr_req == NULL) {
+        shard_common_failure_unwind(local->fop, frame, -1, ENOMEM);
+        return 0;
+    }
+
+    STACK_WIND(frame, shard_lookup_marker_file_cbk, FIRST_CHILD(this),
+               FIRST_CHILD(this)->fops->lookup, &local->newloc, xattr_req);
+    dict_unref(xattr_req);
+    return 0;
+}
+
+/*
+ * Callback for the creation of the marker file under the remove-me
+ * directory.
+ *
+ * EEXIST and ENODATA are not fatal: the existing marker is looked up
+ * instead. Any other failure is recorded in local and unwound. On success
+ * the new inode is linked, it replaces the placeholder inode in
+ * local->newloc, and the original fop (unlink or rename) continues on the
+ * base file.
+ *
+ * Always returns 0.
+ */
+int
+shard_create_marker_file_under_remove_me_cbk(
+    call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret,
+    int32_t op_errno, inode_t *inode, struct iatt *buf, struct iatt *preparent,
+    struct iatt *postparent, dict_t *xdata)
+{
+    shard_local_t *local = frame->local;
+    shard_priv_t *priv = this->private;
+    inode_t *linked_inode = NULL;
+
+    SHARD_UNSET_ROOT_FS_ID(frame, local);
+
+    if (op_ret < 0) {
+        if ((op_errno == EEXIST) || (op_errno == ENODATA)) {
+            /* The marker is already there; fetch it rather than fail. */
+            shard_lookup_marker_file(frame, this);
+            return 0;
+        }
+
+        local->op_ret = op_ret;
+        local->op_errno = op_errno;
+        gf_msg(this->name, GF_LOG_ERROR, op_errno, SHARD_MSG_FOP_FAILED,
+               "Marker file creation "
+               "failed while performing %s; entry gfid=%s",
+               gf_fop_string(local->fop), local->newloc.name);
+        shard_common_failure_unwind(local->fop, frame, -1, local->op_errno);
+        return 0;
+    }
+
+    linked_inode = inode_link(inode, priv->dot_shard_rm_inode,
+                              local->newloc.name, buf);
+    /* Swap the placeholder inode in newloc for the linked one. */
+    inode_unref(local->newloc.inode);
+    local->newloc.inode = linked_inode;
+
+    /* Resume the fop that needed the marker. */
+    switch (local->fop) {
+        case GF_FOP_UNLINK:
+            shard_unlink_base_file(frame, this);
+            break;
+        case GF_FOP_RENAME:
+            shard_rename_src_base_file(frame, this);
+            break;
+        default:
+            break;
+    }
+    return 0;
+}
+
+/*
+ * Create the marker file for the base file at @loc by winding an MKNOD to
+ * the first child.
+ *
+ * The marker is named after the textual gfid of loc->inode and is created
+ * with priv->dot_shard_rm_inode as its parent; the resulting location is
+ * kept in local->newloc. Any failure before the wind is reported by calling
+ * shard_create_marker_file_under_remove_me_cbk() directly with op_ret = -1
+ * and op_errno = ENOMEM. Always returns 0.
+ */
+int
+shard_create_marker_file_under_remove_me(call_frame_t *frame, xlator_t *this,
+                                         loc_t *loc)
+{
+    int ret = 0;
+    int op_errno = ENOMEM;
+    uint64_t bs = 0;
+    /* Two scratch buffers so both gfids can be formatted in one log call. */
+    char g1[64] = {
+        0,
+    };
+    char g2[64] = {
+        0,
+    };
+    dict_t *xattr_req = NULL;
+    shard_priv_t *priv = NULL;
+    shard_local_t *local = NULL;
+
+    priv = this->private;
+    local = frame->local;
+
+    /* Paired with SHARD_UNSET_ROOT_FS_ID() in the callback. */
+    SHARD_SET_ROOT_FS_ID(frame, local);
+
+    xattr_req = shard_create_gfid_dict(local->xattr_req);
+    if (!xattr_req)
+        goto err;
+
+    /* NOTE(review): the result of inode_new() is not checked here. */
+    local->newloc.inode = inode_new(this->itable);
+    local->newloc.parent = inode_ref(priv->dot_shard_rm_inode);
+    ret = inode_path(local->newloc.parent, uuid_utoa(loc->inode->gfid),
+                     (char **)&local->newloc.path);
+    if (ret < 0) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, SHARD_MSG_INODE_PATH_FAILED,
+               "Inode path failed on "
+               "pargfid=%s bname=%s",
+               uuid_utoa_r(priv->dot_shard_rm_gfid, g1),
+               uuid_utoa_r(loc->inode->gfid, g2));
+        goto err;
+    }
+    /* The name is the component that follows the last '/' of the path. */
+    local->newloc.name = strrchr(local->newloc.path, '/');
+    if (local->newloc.name)
+        local->newloc.name++;
+
+    /* Block size handed to the init macro: the file's own for UNLINK, the
+     * destination's for RENAME; it stays 0 for any other fop. */
+    if (local->fop == GF_FOP_UNLINK)
+        bs = local->block_size;
+    else if (local->fop == GF_FOP_RENAME)
+        bs = local->dst_block_size;
+
+    /* Macro defined elsewhere; it is given the 'err' label as its last
+     * argument - presumably it jumps there on failure (confirm against the
+     * macro definition). */
+    SHARD_INODE_CREATE_INIT(this, bs, xattr_req, &local->newloc,
+                            local->prebuf.ia_size, 0, err);
+
+    /* NOTE(review): the three integer arguments are (0, 0, 0644) here,
+     * whereas shard_common_resume_mknod() passes (mode, rdev, 0) in the
+     * same positions - confirm which position is meant to carry the mode. */
+    STACK_WIND(frame, shard_create_marker_file_under_remove_me_cbk,
+               FIRST_CHILD(this), FIRST_CHILD(this)->fops->mknod,
+               &local->newloc, 0, 0, 0644, xattr_req);
+    dict_unref(xattr_req);
+    return 0;
+
+err:
+    /* xattr_req is still NULL when shard_create_gfid_dict() failed. */
+    if (xattr_req)
+        dict_unref(xattr_req);
+    /* Feed the failure through the regular callback so it performs the
+     * SHARD_UNSET_ROOT_FS_ID() and the unwind. */
+    shard_create_marker_file_under_remove_me_cbk(frame, 0, this, -1, op_errno,
+                                                 NULL, NULL, NULL, NULL, NULL);
+    return 0;
+}
+
+/* Forward declaration: shard_unlock_entrylk() is defined further below but
+ * is already called by shard_unlink_base_file_cbk(). */
+int
+shard_unlock_entrylk(call_frame_t *frame, xlator_t *this);
+
+/*
+ * Callback for the UNLINK of the base file.
+ *
+ * Records the outcome in the frame's local: the error on failure; on
+ * success the parent iatts, a reference to the response xdata and the
+ * refresh flag on the locked inode. Background deletion is started on
+ * success when local->cleanup_required is set. Afterwards the entry lock
+ * (when local->entrylk_frame is set) and the inode lock are released and
+ * control passes to shard_unlink_cbk(). Always returns 0.
+ */
+int
+shard_unlink_base_file_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+                           int32_t op_ret, int32_t op_errno,
+                           struct iatt *preparent, struct iatt *postparent,
+                           dict_t *xdata)
+{
+    shard_local_t *local = frame->local;
+    int unlock_ret = 0;
+
+    if (op_ret >= 0) {
+        shard_inode_ctx_set_refresh_flag(local->int_inodelk.loc.inode, this);
+        local->preoldparent = *preparent;
+        local->postoldparent = *postparent;
+        if (xdata != NULL)
+            local->xattr_rsp = dict_ref(xdata);
+        if (local->cleanup_required)
+            shard_start_background_deletion(this);
+    } else {
+        local->op_ret = op_ret;
+        local->op_errno = op_errno;
+    }
+
+    /* entrylk_frame is set by shard_acquire_entrylk(). */
+    if (local->entrylk_frame != NULL) {
+        unlock_ret = shard_unlock_entrylk(frame, this);
+        if (unlock_ret < 0) {
+            local->op_ret = -1;
+            local->op_errno = -unlock_ret;
+        }
+    }
+
+    unlock_ret = shard_unlock_inodelk(frame, this);
+    if (unlock_ret < 0) {
+        local->op_ret = -1;
+        local->op_errno = -unlock_ret;
+    }
+
+    shard_unlink_cbk(frame, this);
+    return 0;
+}
+
+/*
+ * Wind the UNLINK of the base file (local->loc) to the first child, passing
+ * the xflag and xattr_req saved in the frame's local. The reply goes to
+ * shard_unlink_base_file_cbk(). Always returns 0.
+ */
+int
+shard_unlink_base_file(call_frame_t *frame, xlator_t *this)
+{
+    shard_local_t *local = NULL;
+
+    local = frame->local;
+
+    /* To-Do: the open-fd count on the base file is not requested yet. */
+    STACK_WIND(frame, shard_unlink_base_file_cbk, FIRST_CHILD(this),
+               FIRST_CHILD(this)->fops->unlink, &local->loc, local->xflag,
+               local->xattr_req);
+    return 0;
+}
+
+/*
+ * Callback for the entrylk UNLOCK request. A non-zero op_ret is only
+ * logged; the lock frame is destroyed in either case. Always returns 0.
+ */
+int
+shard_unlock_entrylk_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+                         int32_t op_ret, int32_t op_errno, dict_t *xdata)
+{
+    if (op_ret != 0) {
+        gf_msg(this->name, GF_LOG_ERROR, op_errno, SHARD_MSG_FOP_FAILED,
+               "Unlock failed. Please check brick logs for "
+               "more details");
+    }
+
+    SHARD_STACK_DESTROY(frame);
+    return 0;
+}
+
+/*
+ * Release the entry lock owned by local->entrylk_frame.
+ *
+ * The lock frame is detached from the main frame's local and an
+ * ENTRYLK_UNLOCK for the basename stored in the lock frame's local is wound
+ * on it; the reply is handled by shard_unlock_entrylk_cbk(). The main
+ * local's acquired_lock flag is cleared afterwards. Always returns 0.
+ */
+int
+shard_unlock_entrylk(call_frame_t *frame, xlator_t *this)
+{
+    shard_local_t *local = frame->local;
+    call_frame_t *lk_frame = local->entrylk_frame;
+    shard_local_t *lk_local = lk_frame->local;
+    shard_entrylk_t *entrylk = &lk_local->int_entrylk;
+
+    local->entrylk_frame = NULL;
+
+    STACK_WIND(lk_frame, shard_unlock_entrylk_cbk, FIRST_CHILD(this),
+               FIRST_CHILD(this)->fops->entrylk, this->name, &entrylk->loc,
+               lk_local->int_entrylk.basename, ENTRYLK_UNLOCK, ENTRYLK_WRLCK,
+               NULL);
+    /* Only the main frame's local is touched after the wind. */
+    local->int_entrylk.acquired_lock = _gf_false;
+    return 0;
+}
+
+/*
+ * Continue the fop once the entry lock has been granted: UNLINK and RENAME
+ * proceed to creating the marker file for local->int_inodelk.loc; any other
+ * fop is unexpected here and only produces a warning. Always returns 0.
+ */
+int
+shard_post_entrylk_fop_handler(call_frame_t *frame, xlator_t *this)
+{
+    shard_local_t *local = frame->local;
+
+    if ((local->fop == GF_FOP_UNLINK) || (local->fop == GF_FOP_RENAME)) {
+        shard_create_marker_file_under_remove_me(frame, this,
+                                                 &local->int_inodelk.loc);
+        return 0;
+    }
+
+    gf_msg(this->name, GF_LOG_WARNING, 0, SHARD_MSG_INVALID_FOP,
+           "post-entrylk handler not defined. This case should not"
+           " be hit");
+    return 0;
+}
+
+/*
+ * Callback for the ENTRYLK request wound by shard_acquire_entrylk().
+ *
+ * @frame is the dedicated lock frame; the fop's own frame is reached through
+ * local->main_frame. If the lock request failed, the fop is unwound on the
+ * main frame with the reported op_ret/op_errno. Otherwise the lock is marked
+ * as acquired in the main frame's local and the fop continues in
+ * shard_post_entrylk_fop_handler(). Always returns 0.
+ */
+int
+shard_acquire_entrylk_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+                          int32_t op_ret, int32_t op_errno, dict_t *xdata)
+{
+    call_frame_t *main_frame = NULL;
+    shard_local_t *local = NULL;
+    shard_local_t *main_local = NULL;
+
+    local = frame->local;
+    main_frame = local->main_frame;
+    main_local = main_frame->local;
+
+    /* The outcome of the lock request is the callback's op_ret. The lock
+     * frame's local->op_ret is not set by shard_acquire_entrylk(), so
+     * testing it would let a failed lock be treated as acquired. */
+    if (op_ret < 0) {
+        /* NOTE(review): the lock frame is not destroyed and the inodelk
+         * taken earlier is not released on this path - confirm whether that
+         * cleanup happens elsewhere. */
+        shard_common_failure_unwind(main_local->fop, main_frame, op_ret,
+                                    op_errno);
+        return 0;
+    }
+    main_local->int_entrylk.acquired_lock = _gf_true;
+    shard_post_entrylk_fop_handler(main_frame, this);
+    return 0;
+}
+
+/*
+ * Request a write entry lock on the name <gfid> under @inode.
+ *
+ * A separate frame with its own local is created to own the lock; its
+ * lk-owner is derived from that frame's root and the frame is remembered in
+ * the main frame's local as entrylk_frame. The reply is handled by
+ * shard_acquire_entrylk_cbk(). If the basename, the lock frame or its local
+ * cannot be allocated, the fop is failed with ENOMEM. Always returns 0.
+ */
+int
+shard_acquire_entrylk(call_frame_t *frame, xlator_t *this, inode_t *inode,
+                      uuid_t gfid)
+{
+    char gfid_str[GF_UUID_BUF_SIZE] = {
+        0,
+    };
+    char *lk_basename = NULL;
+    shard_local_t *local = NULL;
+    shard_local_t *entrylk_local = NULL;
+    shard_entrylk_t *int_entrylk = NULL;
+    call_frame_t *entrylk_frame = NULL;
+
+    local = frame->local;
+
+    /* Allocate the basename before any lock state is set up, so that an
+     * allocation failure is caught here and a NULL basename never reaches
+     * the entrylk call below. */
+    gf_uuid_unparse(gfid, gfid_str);
+    lk_basename = gf_strdup(gfid_str);
+    if (!lk_basename)
+        goto err;
+
+    entrylk_frame = create_frame(this, this->ctx->pool);
+    if (!entrylk_frame) {
+        gf_msg(this->name, GF_LOG_WARNING, ENOMEM, SHARD_MSG_MEMALLOC_FAILED,
+               "Failed to create new frame "
+               "to lock marker file");
+        goto err;
+    }
+
+    entrylk_local = mem_get0(this->local_pool);
+    if (!entrylk_local) {
+        STACK_DESTROY(entrylk_frame->root);
+        goto err;
+    }
+
+    entrylk_frame->local = entrylk_local;
+    entrylk_local->main_frame = frame;
+    int_entrylk = &entrylk_local->int_entrylk;
+
+    int_entrylk->loc.inode = inode_ref(inode);
+    /* From here on the lock frame's local owns the basename. */
+    int_entrylk->basename = lk_basename;
+    set_lk_owner_from_ptr(&entrylk_frame->root->lk_owner, entrylk_frame->root);
+    local->entrylk_frame = entrylk_frame;
+
+    STACK_WIND(entrylk_frame, shard_acquire_entrylk_cbk, FIRST_CHILD(this),
+               FIRST_CHILD(this)->fops->entrylk, this->name, &int_entrylk->loc,
+               int_entrylk->basename, ENTRYLK_LOCK, ENTRYLK_WRLCK, NULL);
+    return 0;
+err:
+    /* Only reached before ownership of the basename was handed over. */
+    if (lk_basename)
+        GF_FREE(lk_basename);
+    shard_common_failure_unwind(local->fop, frame, -1, ENOMEM);
+    return 0;
+}
+
+/*
+ * Handler passed to shard_refresh_base_file() for UNLINK and RENAME.
+ *
+ * A failure recorded in local->op_ret unwinds the fop. If the refreshed
+ * iatt shows more than one hard link, the rename/unlink of the base file is
+ * performed directly. Otherwise cleanup is flagged as required and the
+ * entry lock on the file's gfid under priv->dot_shard_rm_inode is requested
+ * first. Always returns 0.
+ */
+int
+shard_post_lookup_base_shard_rm_handler(call_frame_t *frame, xlator_t *this)
+{
+    shard_priv_t *priv = this->private;
+    shard_local_t *local = frame->local;
+
+    if (local->op_ret < 0) {
+        shard_common_failure_unwind(local->fop, frame, -1, local->op_errno);
+        return 0;
+    }
+
+    if (local->prebuf.ia_nlink <= 1) {
+        gf_msg_debug(this->name, 0,
+                     "link count on %s = 1, creating "
+                     "file under .remove_me",
+                     local->int_inodelk.loc.path);
+        local->cleanup_required = _gf_true;
+        shard_acquire_entrylk(frame, this, priv->dot_shard_rm_inode,
+                              local->prebuf.ia_gfid);
+        return 0;
+    }
+
+    gf_msg_debug(this->name, 0,
+                 "link count on %s > 1:%d, "
+                 "performing rename()/unlink()",
+                 local->int_inodelk.loc.path, local->prebuf.ia_nlink);
+    switch (local->fop) {
+        case GF_FOP_RENAME:
+            shard_rename_src_base_file(frame, this);
+            break;
+        case GF_FOP_UNLINK:
+            shard_unlink_base_file(frame, this);
+            break;
+        default:
+            break;
+    }
+    return 0;
+}
+
+/*
+ * Continue the fop once the inode lock has been granted: UNLINK and RENAME
+ * refresh the base file at local->int_inodelk.loc and resume in
+ * shard_post_lookup_base_shard_rm_handler(); any other fop is unexpected
+ * here and only produces a warning. Always returns 0.
+ */
+int
+shard_post_inodelk_fop_handler(call_frame_t *frame, xlator_t *this)
+{
+    shard_local_t *local = frame->local;
+
+    if ((local->fop == GF_FOP_UNLINK) || (local->fop == GF_FOP_RENAME)) {
+        shard_refresh_base_file(frame, this, &local->int_inodelk.loc, NULL,
+                                shard_post_lookup_base_shard_rm_handler);
+        return 0;
+    }
+
+    gf_msg(this->name, GF_LOG_WARNING, 0, SHARD_MSG_INVALID_FOP,
+           "post-inodelk handler not defined. This case should not"
+           " be hit");
+    return 0;
+}
+
+/*
+ * Callback for the INODELK request wound by shard_acquire_inodelk().
+ *
+ * @frame is the dedicated lock frame; the fop's own frame is reached through
+ * local->main_frame. If the lock request failed, the fop is unwound on the
+ * main frame with the reported op_ret/op_errno. Otherwise the lock is marked
+ * as acquired in the main frame's local and the fop continues in
+ * shard_post_inodelk_fop_handler(). Always returns 0.
+ */
+int
+shard_acquire_inodelk_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+                          int32_t op_ret, int32_t op_errno, dict_t *xdata)
+{
+    call_frame_t *main_frame = NULL;
+    shard_local_t *local = NULL;
+    shard_local_t *main_local = NULL;
+
+    local = frame->local;
+    main_frame = local->main_frame;
+    main_local = main_frame->local;
+
+    /* The outcome of the lock request is the callback's op_ret. The lock
+     * frame's local->op_ret is not set by shard_acquire_inodelk(), so
+     * testing it would let a failed lock be treated as acquired. */
+    if (op_ret < 0) {
+        /* NOTE(review): the lock frame is not destroyed on this path -
+         * confirm whether that cleanup happens elsewhere. */
+        shard_common_failure_unwind(main_local->fop, main_frame, op_ret,
+                                    op_errno);
+        return 0;
+    }
+    main_local->int_inodelk.acquired_lock = _gf_true;
+    shard_post_inodelk_fop_handler(main_frame, this);
+    return 0;
+}
+
+/*
+ * Request a write inodelk (F_SETLKW, l_start = 0, l_len = 0) on @loc in the
+ * domain this->name.
+ *
+ * A separate frame with its own local is created to own the lock and is
+ * remembered in the main frame's local as inodelk_frame; @loc is copied
+ * into the main local's int_inodelk.loc, which is also the loc the request
+ * is wound with. The reply is handled by shard_acquire_inodelk_cbk(). If
+ * the lock frame or its local cannot be allocated, the fop is failed with
+ * ENOMEM. Always returns 0.
+ */
+int
+shard_acquire_inodelk(call_frame_t *frame, xlator_t *this, loc_t *loc)
+{
+    shard_local_t *local = frame->local;
+    shard_local_t *lk_local = NULL;
+    shard_inodelk_t *lock = NULL;
+    call_frame_t *lk_frame = create_frame(this, this->ctx->pool);
+
+    if (lk_frame == NULL) {
+        gf_msg(this->name, GF_LOG_WARNING, ENOMEM, SHARD_MSG_MEMALLOC_FAILED,
+               "Failed to create new frame "
+               "to lock base shard");
+        shard_common_failure_unwind(local->fop, frame, -1, ENOMEM);
+        return 0;
+    }
+
+    lk_local = mem_get0(this->local_pool);
+    if (lk_local == NULL) {
+        STACK_DESTROY(lk_frame->root);
+        shard_common_failure_unwind(local->fop, frame, -1, ENOMEM);
+        return 0;
+    }
+
+    /* Tie the lock frame and the main frame together. */
+    lk_local->main_frame = frame;
+    lk_frame->local = lk_local;
+    set_lk_owner_from_ptr(&lk_frame->root->lk_owner, lk_frame->root);
+
+    /* Describe the lock in the lock frame's local. */
+    lock = &lk_local->int_inodelk;
+    lock->domain = this->name;
+    lock->flock.l_type = F_WRLCK;
+    lock->flock.l_start = 0;
+    lock->flock.l_len = 0;
+
+    loc_copy(&local->int_inodelk.loc, loc);
+    local->inodelk_frame = lk_frame;
+
+    STACK_WIND(lk_frame, shard_acquire_inodelk_cbk, FIRST_CHILD(this),
+               FIRST_CHILD(this)->fops->inodelk, lock->domain,
+               &local->int_inodelk.loc, F_SETLKW, &lock->flock, NULL);
+    return 0;
+}
+
+/*
+ * Resume after the internal directory step: on failure the fop is unwound,
+ * otherwise the inode lock is requested on local->loc for UNLINK or on
+ * local->loc2 for RENAME (a NULL loc is passed for any other fop).
+ * Always returns 0.
+ */
+int
+shard_post_mkdir_rm_handler(call_frame_t *frame, xlator_t *this)
+{
+    shard_local_t *local = frame->local;
+    loc_t *target = NULL;
+
+    if (local->op_ret < 0) {
+        shard_common_failure_unwind(local->fop, frame, -1, local->op_errno);
+        return 0;
+    }
+
+    switch (local->fop) {
+        case GF_FOP_UNLINK:
+            target = &local->loc;
+            break;
+        case GF_FOP_RENAME:
+            target = &local->loc2;
+            break;
+        default:
+            break;
+    }
+    shard_acquire_inodelk(frame, this, target);
+    return 0;
+}
+
+/* Forward declaration: shard_mkdir_internal_dir() is called below before its
+ * definition appears (the definition is outside this section of the file). */
+int
+shard_mkdir_internal_dir(call_frame_t *frame, xlator_t *this,
+                         shard_post_resolve_fop_handler_t handler,
+                         shard_internal_dir_type_t type);
+/*
+ * Resume after the SHARD_INTERNAL_DIR_DOT_SHARD step: on failure the fop is
+ * unwound, otherwise the SHARD_INTERNAL_DIR_DOT_SHARD_REMOVE_ME directory is
+ * handed to shard_mkdir_internal_dir() with shard_post_mkdir_rm_handler()
+ * as the follow-up. Always returns 0.
+ */
+int
+shard_pre_mkdir_rm_handler(call_frame_t *frame, xlator_t *this)
+{
+    shard_local_t *local = frame->local;
+
+    if (local->op_ret >= 0) {
+        shard_mkdir_internal_dir(frame, this, shard_post_mkdir_rm_handler,
+                                 SHARD_INTERNAL_DIR_DOT_SHARD_REMOVE_ME);
+    } else {
+        shard_common_failure_unwind(local->fop, frame, -1, local->op_errno);
+    }
+    return 0;
+}
+
+/*
+ * Start resolving the internal directories needed by UNLINK/RENAME.
+ *
+ * Looks for the inode of priv->dot_shard_rm_gfid in this->itable, then, if
+ * absent, for the inode of priv->dot_shard_gfid:
+ *  - remove-me inode found: refresh it, then shard_post_mkdir_rm_handler();
+ *  - only the dot-shard inode found: refresh it, then
+ *    shard_pre_mkdir_rm_handler();
+ *  - neither found: shard_mkdir_internal_dir() for the dot-shard directory,
+ *    with shard_pre_mkdir_rm_handler() as the follow-up.
+ */
+void
+shard_begin_rm_resolution(call_frame_t *frame, xlator_t *this)
+{
+    shard_priv_t *priv = this->private;
+    shard_local_t *local = frame->local;
+
+    local->dot_shard_rm_loc.inode = inode_find(this->itable,
+                                               priv->dot_shard_rm_gfid);
+    if (local->dot_shard_rm_loc.inode) {
+        local->post_res_handler = shard_post_mkdir_rm_handler;
+        shard_refresh_internal_dir(frame, this,
+                                   SHARD_INTERNAL_DIR_DOT_SHARD_REMOVE_ME);
+        return;
+    }
+
+    local->dot_shard_loc.inode = inode_find(this->itable,
+                                            priv->dot_shard_gfid);
+    if (local->dot_shard_loc.inode) {
+        local->post_res_handler = shard_pre_mkdir_rm_handler;
+        shard_refresh_internal_dir(frame, this, SHARD_INTERNAL_DIR_DOT_SHARD);
+        return;
+    }
+
+    shard_mkdir_internal_dir(frame, this, shard_pre_mkdir_rm_handler,
+                             SHARD_INTERNAL_DIR_DOT_SHARD);
+}
+
+/*
+ * UNLINK fop entry point of the shard translator.
+ *
+ * The request is passed straight to the first child when no block size is
+ * recorded for loc->inode or when it originates from GF_CLIENT_PID_GSYNCD.
+ * Otherwise a local is set up for the fop and internal-directory resolution
+ * is started via shard_begin_rm_resolution(). Failure to read the block
+ * size (for anything but a symlink) or to allocate the local or its
+ * xattr_req dict unwinds the fop with ENOMEM. Always returns 0.
+ */
+int
+shard_unlink(call_frame_t *frame, xlator_t *this, loc_t *loc, int xflag,
+             dict_t *xdata)
+{
+    int ret = -1;
+    uint64_t block_size = 0;
+    shard_local_t *local = NULL;
+
+    ret = shard_inode_ctx_get_block_size(loc->inode, this, &block_size);
+    if ((ret) && (!IA_ISLNK(loc->inode->ia_type))) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, SHARD_MSG_INODE_CTX_GET_FAILED,
+               "Failed to get block "
+               "size from inode ctx of %s",
+               uuid_utoa(loc->inode->gfid));
+        goto err;
+    }
+
+    if (!block_size || frame->root->pid == GF_CLIENT_PID_GSYNCD) {
+        STACK_WIND(frame, default_unlink_cbk, FIRST_CHILD(this),
+                   FIRST_CHILD(this)->fops->unlink, loc, xflag, xdata);
+        return 0;
+    }
+
+    local = mem_get0(this->local_pool);
+    if (!local)
+        goto err;
+
+    frame->local = local;
+
+    loc_copy(&local->loc, loc);
+    local->xflag = xflag;
+    local->xattr_req = (xdata) ? dict_ref(xdata) : dict_new();
+    /* dict_new() can fail; bail out here, as shard_rename() does, rather
+     * than carrying a NULL request dict through the rest of the fop. */
+    if (!local->xattr_req)
+        goto err;
+    local->block_size = block_size;
+    local->resolver_base_inode = loc->inode;
+    local->fop = GF_FOP_UNLINK;
+    if (!this->itable)
+        this->itable = (local->loc.inode)->table;
+
+    local->resolve_not = _gf_true;
+    shard_begin_rm_resolution(frame, this);
+    return 0;
+err:
+    shard_common_failure_unwind(GF_FOP_UNLINK, frame, -1, ENOMEM);
+    return 0;
+}
+
+/*
+ * Handler passed to shard_refresh_base_file() by shard_rename_src_cbk();
+ * it simply completes the rename through shard_rename_cbk().
+ * Always returns 0.
+ */
+int
+shard_post_rename_lookup_handler(call_frame_t *frame, xlator_t *this)
+{
+    (void)shard_rename_cbk(frame, this);
+
+    return 0;
+}
+
+/*
+ * Callback for a RENAME fop (presumably the one wound by
+ * shard_rename_src_base_file(), which is defined outside this section).
+ *
+ * On failure the error is recorded and the fop is unwound. On success the
+ * returned iatts and xdata are saved in local; if the destination was
+ * sharded (local->dst_block_size != 0) the entry and inode locks are
+ * released and background deletion is started when cleanup is required.
+ * Finally, if the source is sharded (local->block_size != 0) its base file
+ * is refreshed by gfid before completing, otherwise shard_rename_cbk() is
+ * called right away. Always returns 0.
+ */
+int
+shard_rename_src_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+                     int32_t op_ret, int32_t op_errno, struct iatt *buf,
+                     struct iatt *preoldparent, struct iatt *postoldparent,
+                     struct iatt *prenewparent, struct iatt *postnewparent,
+                     dict_t *xdata)
+{
+    int ret = 0;
+    shard_local_t *local = NULL;
+
+    local = frame->local;
+
+    if (op_ret < 0) {
+        local->op_ret = op_ret;
+        local->op_errno = op_errno;
+        /* NOTE(review): this path goes straight to the unwind; the unlock
+         * calls further down are skipped, unlike in
+         * shard_unlink_base_file_cbk(), which unlocks on failure too -
+         * confirm that locks taken for a sharded dst are released elsewhere. */
+        goto err;
+    }
+    /* Set ctx->refresh to TRUE to force a lookup on disk when
+     * shard_lookup_base_file() is called next to refresh the hard link
+     * count in ctx. Note that this is applicable only to the case where
+     * the rename dst is already existent and sharded.
+     */
+    if ((local->dst_block_size) && (!local->cleanup_required))
+        shard_inode_ctx_set_refresh_flag(local->int_inodelk.loc.inode, this);
+
+    local->prebuf = *buf;
+    local->preoldparent = *preoldparent;
+    local->postoldparent = *postoldparent;
+    local->prenewparent = *prenewparent;
+    local->postnewparent = *postnewparent;
+    if (xdata)
+        local->xattr_rsp = dict_ref(xdata);
+
+    if (local->dst_block_size) {
+        /* entrylk_frame is set by shard_acquire_entrylk(). */
+        if (local->entrylk_frame) {
+            ret = shard_unlock_entrylk(frame, this);
+            if (ret < 0) {
+                local->op_ret = -1;
+                local->op_errno = -ret;
+            }
+        }
+
+        ret = shard_unlock_inodelk(frame, this);
+        if (ret < 0) {
+            local->op_ret = -1;
+            local->op_errno = -ret;
+            goto err;
+        }
+        if (local->cleanup_required)
+            shard_start_background_deletion(this);
+    }
+
+    /* Now the base file of src, if sharded, is looked up to gather ia_size
+     * and ia_blocks.*/
+    if (local->block_size) {
+        /* tmp_loc identifies the source base file by gfid only, with a
+         * newly created inode. */
+        local->tmp_loc.inode = inode_new(this->itable);
+        gf_uuid_copy(local->tmp_loc.gfid, (local->loc.inode)->gfid);
+        shard_refresh_base_file(frame, this, &local->tmp_loc, NULL,
+                                shard_post_rename_lookup_handler);
+    } else {
+        shard_rename_cbk(frame, this);
+    }
+    return 0;
+err:
+    shard_common_failure_unwind(local->fop, frame, local->op_ret,
+                                local->op_errno);
+    return 0;
+}
+
+/*
+ * Resume a rename after the destination base file has been looked up.
+ *
+ * A recorded failure unwinds the fop. Otherwise local->prebuf (the dst
+ * attributes) is copied to local->postbuf so it survives prebuf being
+ * overwritten by the later lookup of the src base file, and the rename of
+ * the src base file is started. Always returns 0.
+ */
+int
+shard_post_lookup_dst_base_file_handler(call_frame_t *frame, xlator_t *this)
+{
+    shard_local_t *local = frame->local;
+
+    if (local->op_ret >= 0) {
+        /* Keep the dst attributes; prebuf is reused for the src lookup. */
+        local->postbuf = local->prebuf;
+        shard_rename_src_base_file(frame, this);
+    } else {
+        shard_common_failure_unwind(local->fop, frame, local->op_ret,
+                                    local->op_errno);
+    }
+    return 0;
+}
+
+/*
+ * RENAME fop entry point of the shard translator.
+ *
+ * Directory renames, renames where neither src nor dst has a recorded
+ * block size, and requests from GF_CLIENT_PID_GSYNCD are passed straight to
+ * the first child. Otherwise a local is set up; when the destination is
+ * sharded, internal-directory resolution is started first, else the src
+ * base file is renamed right away. Failure to read the src block size (for
+ * anything but a symlink) or to allocate the local or its xattr_req dict
+ * unwinds the fop with ENOMEM. Always returns 0.
+ */
+int
+shard_rename(call_frame_t *frame, xlator_t *this, loc_t *oldloc, loc_t *newloc,
+             dict_t *xdata)
+{
+    int ret = -1;
+    uint64_t src_bs = 0;
+    uint64_t dst_bs = 0;
+    shard_local_t *local = NULL;
+
+    if (IA_ISDIR(oldloc->inode->ia_type)) {
+        STACK_WIND(frame, default_rename_cbk, FIRST_CHILD(this),
+                   FIRST_CHILD(this)->fops->rename, oldloc, newloc, xdata);
+        return 0;
+    }
+
+    ret = shard_inode_ctx_get_block_size(oldloc->inode, this, &src_bs);
+    if ((ret) && (!IA_ISLNK(oldloc->inode->ia_type))) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, SHARD_MSG_INODE_CTX_GET_FAILED,
+               "Failed to get block "
+               "size from inode ctx of %s",
+               uuid_utoa(oldloc->inode->gfid));
+        goto err;
+    }
+
+    /* The dst may not exist; its block size then stays 0. */
+    if (newloc->inode)
+        ret = shard_inode_ctx_get_block_size(newloc->inode, this, &dst_bs);
+
+    /* Plain pass-through when:
+     * a. the src file is not sharded and dst doesn't exist, OR
+     * b. the src and dst both exist but are not sharded,
+     * or when the request comes from gsyncd.
+     */
+    if (!(src_bs || dst_bs) || frame->root->pid == GF_CLIENT_PID_GSYNCD) {
+        STACK_WIND(frame, default_rename_cbk, FIRST_CHILD(this),
+                   FIRST_CHILD(this)->fops->rename, oldloc, newloc, xdata);
+        return 0;
+    }
+
+    local = mem_get0(this->local_pool);
+    if (!local)
+        goto err;
+
+    frame->local = local;
+    loc_copy(&local->loc, oldloc);
+    loc_copy(&local->loc2, newloc);
+    local->resolver_base_inode = newloc->inode;
+    local->fop = GF_FOP_RENAME;
+    if (xdata)
+        local->xattr_req = dict_ref(xdata);
+    else
+        local->xattr_req = dict_new();
+    if (!local->xattr_req)
+        goto err;
+
+    local->block_size = src_bs;
+    local->dst_block_size = dst_bs;
+    if (!this->itable)
+        this->itable = (local->loc.inode)->table;
+    local->resolve_not = _gf_true;
+
+    if (!local->dst_block_size) {
+        /* The dst either doesn't exist or is NOT sharded but the src is
+         * sharded. The src is renamed to dst right away; once done, the
+         * base shard of src is looked up to get the ia_size and ia_blocks
+         * xattr values.
+         */
+        shard_rename_src_base_file(frame, this);
+    } else {
+        /* The dst file exists and is sharded. */
+        shard_begin_rm_resolution(frame, this);
+    }
+    return 0;
+
+err:
+    shard_common_failure_unwind(GF_FOP_RENAME, frame, -1, ENOMEM);
+    return 0;
+}
+
+/*
+ * Callback for CREATE. Unless op_ret is -1, the new inode's shard context
+ * is set from stbuf and local->block_size (a failure to do so is only
+ * logged). The reply is then unwound unchanged. Always returns 0.
+ */
+int
+shard_create_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+                 int32_t op_ret, int32_t op_errno, fd_t *fd, inode_t *inode,
+                 struct iatt *stbuf, struct iatt *preparent,
+                 struct iatt *postparent, dict_t *xdata)
+{
+    shard_local_t *local = frame->local;
+    int ctx_ret = -1;
+
+    if (op_ret != -1) {
+        ctx_ret = shard_inode_ctx_set(inode, this, stbuf, local->block_size,
+                                      SHARD_ALL_MASK);
+        if (ctx_ret != 0)
+            gf_msg(this->name, GF_LOG_WARNING, 0,
+                   SHARD_MSG_INODE_CTX_SET_FAILED,
+                   "Failed to set inode "
+                   "ctx for %s",
+                   uuid_utoa(inode->gfid));
+    }
+
+    SHARD_STACK_UNWIND(create, frame, op_ret, op_errno, fd, inode, stbuf,
+                       preparent, postparent, xdata);
+    return 0;
+}
+
+/*
+ * CREATE fop entry point of the shard translator.
+ *
+ * Allocates the fop's local, records priv->block_size in it and, unless
+ * __is_gsyncd_on_shard_dir() holds for this request, runs
+ * SHARD_INODE_CREATE_INIT on xdata before winding CREATE to the first
+ * child. Failures unwind the fop with ENOMEM. Always returns 0.
+ */
+int
+shard_create(call_frame_t *frame, xlator_t *this, loc_t *loc, int32_t flags,
+             mode_t mode, mode_t umask, fd_t *fd, dict_t *xdata)
+{
+    shard_priv_t *priv = this->private;
+    shard_local_t *local = mem_get0(this->local_pool);
+
+    if (local == NULL)
+        goto err;
+
+    local->block_size = priv->block_size;
+    frame->local = local;
+
+    if (!__is_gsyncd_on_shard_dir(frame, loc)) {
+        /* The macro is handed the 'err' label for its failure handling. */
+        SHARD_INODE_CREATE_INIT(this, local->block_size, xdata, loc, 0, 0, err);
+    }
+
+    STACK_WIND(frame, shard_create_cbk, FIRST_CHILD(this),
+               FIRST_CHILD(this)->fops->create, loc, flags, mode, umask, fd,
+               xdata);
+    return 0;
+err:
+    shard_common_failure_unwind(GF_FOP_CREATE, frame, -1, ENOMEM);
+    return 0;
+}
+
+/*
+ * Callback for OPEN: the reply is unwound to the caller unchanged.
+ * Always returns 0.
+ */
+int
+shard_open_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+               int32_t op_ret, int32_t op_errno, fd_t *fd, dict_t *xdata)
+{
+    /* To-Do: open with O_TRUNC is not yet handled under locks. */
+    SHARD_STACK_UNWIND(open, frame, op_ret, op_errno, fd, xdata);
+
+    return 0;
+}
+
+/*
+ * OPEN fop entry point: the request is wound to the first child with all
+ * arguments unchanged; the reply is handled by shard_open_cbk().
+ * Always returns 0.
+ */
+int
+shard_open(call_frame_t *frame, xlator_t *this, loc_t *loc, int32_t flags,
+           fd_t *fd, dict_t *xdata)
+{
+    STACK_WIND(frame, shard_open_cbk, FIRST_CHILD(this),
+               FIRST_CHILD(this)->fops->open, loc, flags, fd, xdata);
+
+    return 0;
+}
+
+/*
+ * Callback for each per-shard READV issued by shard_readv_do().
+ *
+ * @cookie is the fd the read was wound on (NULL when shard_readv_do() calls
+ * this function directly for a read that could not be wound); the reference
+ * on it is dropped here. Once a failure has been recorded in local->op_ret,
+ * later replies are not aggregated. Data from a successful read is copied
+ * into local->iobuf at the position derived from the shard's block number.
+ * When shard_call_count_return() reports 0, the aggregated result or the
+ * recorded failure is unwound to the caller. Always returns 0.
+ */
+int
+shard_readv_do_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+                   int32_t op_ret, int32_t op_errno, struct iovec *vector,
+                   int32_t count, struct iatt *stbuf, struct iobref *iobref,
+                   dict_t *xdata)
+{
+    int i = 0;
+    int call_count = 0;
+    char *dst = NULL;
+    uint64_t block_num = 0;
+    size_t copied = 0;
+    struct iovec vec = {
+        0,
+    };
+    shard_local_t *local = NULL;
+    fd_t *anon_fd = cookie;
+    shard_inode_ctx_t *ctx = NULL;
+
+    local = frame->local;
+
+    /* If shard has already seen a failure here before, there is no point
+     * in aggregating subsequent reads, so just go to out.
+     */
+    if (local->op_ret < 0)
+        goto out;
+
+    if (op_ret < 0) {
+        local->op_ret = op_ret;
+        local->op_errno = op_errno;
+        goto out;
+    }
+
+    if (local->op_ret >= 0)
+        local->op_ret += op_ret;
+
+    /* NOTE(review): the return value of shard_inode_ctx_get() is not
+     * checked; ctx is dereferenced unconditionally. */
+    shard_inode_ctx_get(anon_fd->inode, this, &ctx);
+    block_num = ctx->block_num;
+
+    if (block_num == local->first_block) {
+        dst = local->iobuf->ptr;
+    } else {
+        /* else
+         * address to start writing to = beginning of buffer +
+         *                    number of bytes until end of first block +
+         *                    + block_size times number of blocks
+         *                    between the current block and the first
+         */
+        dst = (char *)local->iobuf->ptr +
+              (local->block_size - (local->offset % local->block_size)) +
+              ((block_num - local->first_block - 1) * local->block_size);
+    }
+
+    /* Each vector lands right after the previous one. The running byte
+     * count is applied to the fixed start address of this shard's region;
+     * advancing the address itself by the running count would add the
+     * earlier lengths more than once from the third vector on. */
+    for (i = 0; i < count; i++) {
+        memcpy(dst + copied, vector[i].iov_base, vector[i].iov_len);
+        copied += vector[i].iov_len;
+    }
+
+out:
+    if (anon_fd)
+        fd_unref(anon_fd);
+    call_count = shard_call_count_return(frame);
+    if (call_count == 0) {
+        SHARD_UNSET_ROOT_FS_ID(frame, local);
+        if (local->op_ret < 0) {
+            shard_common_failure_unwind(GF_FOP_READ, frame, local->op_ret,
+                                        local->op_errno);
+        } else {
+            if (xdata)
+                local->xattr_rsp = dict_ref(xdata);
+            vec.iov_base = local->iobuf->ptr;
+            /* Clamp the reported size when the request runs past the size
+             * recorded in prebuf. */
+            if (local->offset + local->req_size > local->prebuf.ia_size)
+                local->total_size = local->prebuf.ia_size - local->offset;
+            vec.iov_len = local->total_size;
+            local->op_ret = local->total_size;
+            SHARD_STACK_UNWIND(readv, frame, local->op_ret, local->op_errno,
+                               &vec, 1, &local->prebuf, local->iobref,
+                               local->xattr_rsp);
+            return 0;
+        }
+    }
+
+    return 0;
+}
+
+/*
+ * Issue one READV per block in [local->first_block, local->last_block].
+ *
+ * Block 0 is read through the caller's fd; every other block is read
+ * through an anonymous fd on the matching entry of local->inode_list. Each
+ * read covers from the current offset within the block up to the end of
+ * the block, capped by the bytes still outstanding. If an anonymous fd
+ * cannot be obtained, ENOMEM is recorded and shard_readv_do_cbk() is called
+ * directly for that block and all remaining ones so the call count still
+ * drains. Always returns 0.
+ */
+int
+shard_readv_do(call_frame_t *frame, xlator_t *this)
+{
+    shard_local_t *local = frame->local;
+    fd_t *fd = local->fd;
+    fd_t *anon_fd = NULL;
+    int idx = 0;
+    int cur_block = local->first_block;
+    int last_block = local->last_block;
+    off_t orig_offset = local->offset;
+    off_t shard_offset = 0;
+    size_t read_size = 0;
+    size_t remaining_size = local->total_size;
+    gf_boolean_t wind_failed = _gf_false;
+
+    local->call_count = local->num_blocks;
+
+    SHARD_SET_ROOT_FS_ID(frame, local);
+
+    if (fd->flags & O_DIRECT)
+        local->flags = O_DIRECT;
+
+    for (; cur_block <= last_block; cur_block++, idx++) {
+        if (wind_failed) {
+            /* An earlier block failed; just account for this one. */
+            shard_readv_do_cbk(frame, NULL, this, -1, ENOMEM, NULL, 0, NULL,
+                               NULL, NULL);
+            continue;
+        }
+
+        shard_offset = orig_offset % local->block_size;
+        read_size = local->block_size - shard_offset;
+        if (read_size > remaining_size)
+            read_size = remaining_size;
+
+        remaining_size -= read_size;
+
+        if (cur_block == 0) {
+            anon_fd = fd_ref(fd);
+        } else {
+            anon_fd = fd_anonymous(local->inode_list[idx]);
+            if (anon_fd == NULL) {
+                local->op_ret = -1;
+                local->op_errno = ENOMEM;
+                wind_failed = _gf_true;
+                shard_readv_do_cbk(frame, NULL, this, -1, ENOMEM, NULL, 0,
+                                   NULL, NULL, NULL);
+                continue;
+            }
+        }
+
+        /* The fd doubles as the cookie so the callback can unref it. */
+        STACK_WIND_COOKIE(frame, shard_readv_do_cbk, anon_fd, FIRST_CHILD(this),
+                          FIRST_CHILD(this)->fops->readv, anon_fd, read_size,
+                          shard_offset, local->flags, local->xattr_req);
+
+        orig_offset += read_size;
+    }
+    return 0;
+}
+
+/*
+ * Callback for each per-shard MKNOD; @cookie carries the shard's block
+ * number.
+ *
+ * On success the new inode is handed to shard_link_block_inode(). An EEXIST
+ * failure only bumps local->eexist_count (under frame->lock); any other
+ * failure is recorded in local->op_ret/op_errno. Failures are also logged
+ * at debug level. When shard_call_count_return() reports 0, the root fs id
+ * is unset, local->create_count is reset and local->post_mknod_handler() is
+ * invoked. Always returns 0.
+ */
+int
+shard_common_mknod_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+                       int32_t op_ret, int32_t op_errno, inode_t *inode,
+                       struct iatt *buf, struct iatt *preparent,
+                       struct iatt *postparent, dict_t *xdata)
+{
+    shard_local_t *local = frame->local;
+    int shard_block_num = (long)cookie;
+    int pending = 0;
+
+    if (op_ret >= 0) {
+        shard_link_block_inode(local, shard_block_num, inode, buf);
+    } else {
+        if (op_errno != EEXIST) {
+            local->op_ret = op_ret;
+            local->op_errno = op_errno;
+        } else {
+            LOCK(&frame->lock);
+            {
+                local->eexist_count++;
+            }
+            UNLOCK(&frame->lock);
+        }
+        gf_msg_debug(this->name, 0,
+                     "mknod of shard %d "
+                     "failed: %s",
+                     shard_block_num, strerror(op_errno));
+    }
+
+    pending = shard_call_count_return(frame);
+    if (pending == 0) {
+        SHARD_UNSET_ROOT_FS_ID(frame, local);
+        local->create_count = 0;
+        local->post_mknod_handler(frame, this);
+    }
+
+    return 0;
+}
+
+/*
+ * Create the shards of the current fop that do not have an inode yet.
+ *
+ * Walks the block range [local->first_block, local->last_block] and, for
+ * every slot of local->inode_list that is still NULL, winds an mknod for the
+ * corresponding shard with priv->dot_shard_inode as parent. The shard index
+ * travels as the cookie to shard_common_mknod_cbk(). local->create_count is
+ * used as the number of callbacks to wait for.
+ *
+ * @post_mknod_handler is saved in local->post_mknod_handler, which the mknod
+ * callback invokes once the last reply is in. It is called directly from
+ * here when the base file's inode ctx cannot be fetched.
+ *
+ * Always returns 0; errors are reported via local->op_ret/op_errno.
+ */
+int
+shard_common_resume_mknod(call_frame_t *frame, xlator_t *this,
+                          shard_post_mknod_fop_handler_t post_mknod_handler)
+{
+    int i = 0;
+    int shard_idx_iter = 0;
+    int last_block = 0;
+    int ret = 0;
+    int call_count = 0;
+    char path[PATH_MAX] = {
+        0,
+    };
+    mode_t mode = 0;
+    char *bname = NULL;
+    shard_priv_t *priv = NULL;
+    shard_inode_ctx_t ctx_tmp = {
+        0,
+    };
+    shard_local_t *local = NULL;
+    gf_boolean_t wind_failed = _gf_false;
+    fd_t *fd = NULL;
+    loc_t loc = {
+        0,
+    };
+    dict_t *xattr_req = NULL;
+
+    local = frame->local;
+    priv = this->private;
+    fd = local->fd;
+    shard_idx_iter = local->first_block;
+    last_block = local->last_block;
+    /* Keep a private copy of the count; the loop below decrements it. */
+    call_count = local->call_count = local->create_count;
+    local->post_mknod_handler = post_mknod_handler;
+
+    SHARD_SET_ROOT_FS_ID(frame, local);
+
+    ret = shard_inode_ctx_get_all(fd->inode, this, &ctx_tmp);
+    if (ret) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, SHARD_MSG_INODE_CTX_GET_FAILED,
+               "Failed to get inode "
+               "ctx for %s",
+               uuid_utoa(fd->inode->gfid));
+        local->op_ret = -1;
+        local->op_errno = ENOMEM;
+        goto err;
+    }
+    /* New shards get the mode derived from the base file's cached stat. */
+    mode = st_mode_from_ia(ctx_tmp.stat.ia_prot, ctx_tmp.stat.ia_type);
+
+    while (shard_idx_iter <= last_block) {
+        /* This block already has an inode; nothing to create. */
+        if (local->inode_list[i]) {
+            shard_idx_iter++;
+            i++;
+            continue;
+        }
+
+        /*
+         * After the first local failure, stop winding and fail every
+         * remaining shard through the callback so the call count is still
+         * accounted for.
+         */
+        if (wind_failed) {
+            shard_common_mknod_cbk(frame, (void *)(long)shard_idx_iter, this,
+                                   -1, ENOMEM, NULL, NULL, NULL, NULL, NULL);
+            goto next;
+        }
+
+        shard_make_block_abspath(shard_idx_iter, fd->inode->gfid, path,
+                                 sizeof(path));
+
+        xattr_req = shard_create_gfid_dict(local->xattr_req);
+        if (!xattr_req) {
+            local->op_ret = -1;
+            local->op_errno = ENOMEM;
+            wind_failed = _gf_true;
+            shard_common_mknod_cbk(frame, (void *)(long)shard_idx_iter, this,
+                                   -1, ENOMEM, NULL, NULL, NULL, NULL, NULL);
+            goto next;
+        }
+
+        /* The shard's name is the last component of its absolute path. */
+        bname = strrchr(path, '/') + 1;
+        loc.inode = inode_new(this->itable);
+        loc.parent = inode_ref(priv->dot_shard_inode);
+        ret = inode_path(loc.parent, bname, (char **)&(loc.path));
+        if (ret < 0 || !(loc.inode)) {
+            /* The message previously lacked the space before "on". */
+            gf_msg(this->name, GF_LOG_ERROR, 0, SHARD_MSG_INODE_PATH_FAILED,
+                   "Inode path failed "
+                   "on %s, base file gfid = %s",
+                   bname, uuid_utoa(fd->inode->gfid));
+            local->op_ret = -1;
+            local->op_errno = ENOMEM;
+            wind_failed = _gf_true;
+            loc_wipe(&loc);
+            dict_unref(xattr_req);
+            shard_common_mknod_cbk(frame, (void *)(long)shard_idx_iter, this,
+                                   -1, ENOMEM, NULL, NULL, NULL, NULL, NULL);
+            goto next;
+        }
+
+        loc.name = strrchr(loc.path, '/');
+        if (loc.name)
+            loc.name++;
+
+        STACK_WIND_COOKIE(frame, shard_common_mknod_cbk,
+                          (void *)(long)shard_idx_iter, FIRST_CHILD(this),
+                          FIRST_CHILD(this)->fops->mknod, &loc, mode,
+                          ctx_tmp.stat.ia_rdev, 0, xattr_req);
+        loc_wipe(&loc);
+        dict_unref(xattr_req);
+
+    next:
+        shard_idx_iter++;
+        i++;
+        /* Stop as soon as every expected mknod has been issued or failed. */
+        if (!--call_count)
+            break;
+    }
+
+    return 0;
+err:
+    /*
+     * This block is for handling failure in shard_inode_ctx_get_all().
+     * Failures in the while-loop are handled within the loop.
+     */
+    SHARD_UNSET_ROOT_FS_ID(frame, local);
+    post_mknod_handler(frame, this);
+    return 0;
+}
+
+/*
+ * Forward declaration: shard_post_lookup_shards_readv_handler() below passes
+ * this handler to shard_common_resume_mknod() before it is defined.
+ */
+int
+shard_post_mknod_readv_handler(call_frame_t *frame, xlator_t *this);
+
+/*
+ * Continuation invoked once the shard lookups issued for a readv are done.
+ *
+ * Fails the read if an error was recorded in local; otherwise creates the
+ * missing shards when local->create_count is non-zero, or goes straight to
+ * the actual read. Always returns 0.
+ */
+int
+shard_post_lookup_shards_readv_handler(call_frame_t *frame, xlator_t *this)
+{
+    shard_local_t *local = frame->local;
+
+    if (local->op_ret < 0) {
+        shard_common_failure_unwind(GF_FOP_READ, frame, local->op_ret,
+                                    local->op_errno);
+    } else if (local->create_count) {
+        shard_common_resume_mknod(frame, this, shard_post_mknod_readv_handler);
+    } else {
+        shard_readv_do(frame, this);
+    }
+
+    return 0;
+}
+
+/*
+ * Continuation invoked after the mknod round of a readv has finished.
+ *
+ * On error the read is failed. If local->eexist_count is non-zero, that
+ * count becomes the new call count and the shards are looked up again;
+ * otherwise the read proceeds. Always returns 0.
+ */
+int
+shard_post_mknod_readv_handler(call_frame_t *frame, xlator_t *this)
+{
+    shard_local_t *local = frame->local;
+
+    if (local->op_ret < 0) {
+        shard_common_failure_unwind(GF_FOP_READ, frame, local->op_ret,
+                                    local->op_errno);
+        return 0;
+    }
+
+    if (local->eexist_count) {
+        local->call_count = local->eexist_count;
+        shard_common_lookup_shards(frame, this, local->loc.inode,
+                                   shard_post_lookup_shards_readv_handler);
+        return 0;
+    }
+
+    shard_readv_do(frame, this);
+    return 0;
+}
+
+/*
+ * Continuation invoked after shard resolution for a readv.
+ *
+ * A failure other than ENOENT fails the read. ENOENT is answered as a
+ * successful read of local->total_size bytes served from local->iobuf.
+ * Without an error, shards are looked up when local->call_count is non-zero,
+ * else the read is issued directly. Always returns 0.
+ */
+int
+shard_post_resolve_readv_handler(call_frame_t *frame, xlator_t *this)
+{
+    shard_local_t *local = frame->local;
+    struct iovec vec = {
+        0,
+    };
+
+    if (local->op_ret >= 0) {
+        if (local->call_count) {
+            shard_common_lookup_shards(frame, this, local->resolver_base_inode,
+                                       shard_post_lookup_shards_readv_handler);
+        } else {
+            shard_readv_do(frame, this);
+        }
+        return 0;
+    }
+
+    if (local->op_errno != ENOENT) {
+        shard_common_failure_unwind(GF_FOP_READ, frame, local->op_ret,
+                                    local->op_errno);
+        return 0;
+    }
+
+    /* ENOENT: hand back the whole request buffer as the read result. */
+    vec.iov_base = local->iobuf->ptr;
+    vec.iov_len = local->total_size;
+    local->op_ret = local->total_size;
+    SHARD_STACK_UNWIND(readv, frame, local->op_ret, 0, &vec, 1, &local->prebuf,
+                       local->iobref, NULL);
+    return 0;
+}
+
+/*
+ * Continuation invoked after the base file has been refreshed for a readv.
+ *
+ * - On a recorded error the read is failed.
+ * - A read starting at or beyond the file size (local->prebuf.ia_size) is
+ *   answered immediately with 0 bytes.
+ * - Otherwise the block range covered by the request is computed, the inode
+ *   list and a zero-filled reply buffer are allocated, and the .shard
+ *   directory is either looked up or refreshed, continuing in
+ *   shard_post_resolve_readv_handler().
+ *
+ * Any allocation failure unwinds the read with ENOMEM. Always returns 0.
+ */
+int
+shard_post_lookup_readv_handler(call_frame_t *frame, xlator_t *this)
+{
+    int ret = 0;
+    struct iobuf *iobuf = NULL;
+    shard_local_t *local = NULL;
+    shard_priv_t *priv = NULL;
+
+    priv = this->private;
+    local = frame->local;
+
+    if (local->op_ret < 0) {
+        shard_common_failure_unwind(GF_FOP_READ, frame, local->op_ret,
+                                    local->op_errno);
+        return 0;
+    }
+
+    if (local->offset >= local->prebuf.ia_size) {
+        /* If the read is being performed past the end of the file,
+         * unwind the FOP with 0 bytes read as status.
+         */
+        struct iovec vec = {
+            0,
+        };
+
+        iobuf = iobuf_get2(this->ctx->iobuf_pool, local->req_size);
+        if (!iobuf)
+            goto err;
+
+        /* Check both steps, exactly as the regular path below does, so the
+         * reply never carries a buffer that no iobref is holding.
+         */
+        local->iobref = iobref_new();
+        if (!local->iobref) {
+            iobuf_unref(iobuf);
+            goto err;
+        }
+
+        if (iobref_add(local->iobref, iobuf) != 0) {
+            iobuf_unref(iobuf);
+            goto err;
+        }
+
+        vec.iov_base = iobuf->ptr;
+        vec.iov_len = 0;
+        iobuf_unref(iobuf);
+
+        SHARD_STACK_UNWIND(readv, frame, 0, 0, &vec, 1, &local->prebuf,
+                           local->iobref, NULL);
+        return 0;
+    }
+
+    local->first_block = get_lowest_block(local->offset, local->block_size);
+
+    local->total_size = local->req_size;
+
+    local->last_block = get_highest_block(local->offset, local->total_size,
+                                          local->block_size);
+
+    local->num_blocks = local->last_block - local->first_block + 1;
+    GF_ASSERT(local->num_blocks > 0);
+    local->resolver_base_inode = local->loc.inode;
+
+    /* One inode slot per block touched by the request. */
+    local->inode_list = GF_CALLOC(local->num_blocks, sizeof(inode_t *),
+                                  gf_shard_mt_inode_list);
+    if (!local->inode_list)
+        goto err;
+
+    iobuf = iobuf_get2(this->ctx->iobuf_pool, local->total_size);
+    if (!iobuf)
+        goto err;
+
+    local->iobref = iobref_new();
+    if (!local->iobref) {
+        iobuf_unref(iobuf);
+        goto err;
+    }
+
+    if (iobref_add(local->iobref, iobuf) != 0) {
+        iobuf_unref(iobuf);
+        goto err;
+    }
+
+    /* Start from an all-zero reply buffer. */
+    memset(iobuf->ptr, 0, local->total_size);
+    /* Drop our own reference; the buffer stays reachable via local->iobref. */
+    iobuf_unref(iobuf);
+    local->iobuf = iobuf;
+
+    local->dot_shard_loc.inode = inode_find(this->itable, priv->dot_shard_gfid);
+    if (!local->dot_shard_loc.inode) {
+        /* .shard is not in the inode table yet: look it up. */
+        ret = shard_init_internal_dir_loc(this, local,
+                                          SHARD_INTERNAL_DIR_DOT_SHARD);
+        if (ret)
+            goto err;
+        shard_lookup_internal_dir(frame, this, shard_post_resolve_readv_handler,
+                                  SHARD_INTERNAL_DIR_DOT_SHARD);
+    } else {
+        local->post_res_handler = shard_post_resolve_readv_handler;
+        shard_refresh_internal_dir(frame, this, SHARD_INTERNAL_DIR_DOT_SHARD);
+    }
+    return 0;
+err:
+    shard_common_failure_unwind(GF_FOP_READ, frame, -1, ENOMEM);
+    return 0;
+}
+
+/*
+ * readv entry point of the shard translator.
+ *
+ * If the file has no shard block size recorded in its inode ctx, or the
+ * request comes from gsyncd (GF_CLIENT_PID_GSYNCD), the read is passed
+ * through unchanged to the first child. Otherwise a shard_local_t is set up
+ * with the request parameters and processing continues in
+ * shard_post_lookup_readv_handler() via shard_refresh_base_file().
+ *
+ * Every failure in this function unwinds the read with ENOMEM.
+ * Always returns 0.
+ */
+int
+shard_readv(call_frame_t *frame, xlator_t *this, fd_t *fd, size_t size,
+            off_t offset, uint32_t flags, dict_t *xdata)
+{
+    int ret = 0;
+    uint64_t block_size = 0;
+    shard_local_t *local = NULL;
+
+    ret = shard_inode_ctx_get_block_size(fd->inode, this, &block_size);
+    if (ret) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, SHARD_MSG_INODE_CTX_GET_FAILED,
+               "Failed to get block "
+               "size for %s from its inode ctx",
+               uuid_utoa(fd->inode->gfid));
+        goto err;
+    }
+
+    if (!block_size || frame->root->pid == GF_CLIENT_PID_GSYNCD) {
+        /* block_size = 0 means that the file was created before
+         * sharding was enabled on the volume.
+         */
+        STACK_WIND(frame, default_readv_cbk, FIRST_CHILD(this),
+                   FIRST_CHILD(this)->fops->readv, fd, size, offset, flags,
+                   xdata);
+        return 0;
+    }
+
+    /* Lazily pick up the inode table from the first fd seen. */
+    if (!this->itable)
+        this->itable = fd->inode->table;
+
+    local = mem_get0(this->local_pool);
+    if (!local)
+        goto err;
+
+    /* Attach local to the frame before anything else that can fail. */
+    frame->local = local;
+
+    ret = syncbarrier_init(&local->barrier);
+    if (ret)
+        goto err;
+    local->fd = fd_ref(fd);
+    local->block_size = block_size;
+    local->offset = offset;
+    local->req_size = size;
+    local->flags = flags;
+    local->fop = GF_FOP_READ;
+    /* Reuse the caller's xdata when given, else start with an empty dict. */
+    local->xattr_req = (xdata) ? dict_ref(xdata) : dict_new();
+    if (!local->xattr_req)
+        goto err;
+
+    local->loc.inode = inode_ref(fd->inode);
+    gf_uuid_copy(local->loc.gfid, fd->inode->gfid);
+
+    shard_refresh_base_file(frame, this, NULL, fd,
+                            shard_post_lookup_readv_handler);
+    return 0;
+err:
+    shard_common_failure_unwind(GF_FOP_READ, frame, -1, ENOMEM);
+    return 0;
+}
+
+/*
+ * Final step of an inode-write fop, run after the file size update.
+ *
+ * Unwinds local->fop as a failure when an error was recorded, otherwise as a
+ * success carrying local->written_size. Always returns 0.
+ */
+int
+shard_common_inode_write_post_update_size_handler(call_frame_t *frame,
+                                                  xlator_t *this)
+{
+    shard_local_t *local = frame->local;
+
+    if (local->op_ret < 0) {
+        shard_common_failure_unwind(local->fop, frame, local->op_ret,
+                                    local->op_errno);
+        return 0;
+    }
+
+    shard_common_inode_write_success_unwind(local->fop, frame,
+                                            local->written_size);
+    return 0;
+}
+
+/*
+ * Tell whether the fop described by @local is a write in append mode, i.e. a
+ * GF_FOP_WRITE with O_APPEND set either in the request flags or in the flags
+ * of the fd it was issued on.
+ */
+static gf_boolean_t
+shard_is_appending_write(shard_local_t *local)
+{
+    if (local->fop == GF_FOP_WRITE &&
+        ((local->flags & O_APPEND) || (local->fd->flags & O_APPEND)))
+        return _gf_true;
+
+    return _gf_false;
+}
+
+/*
+ * Compute how much the write described by @local grows the file and apply
+ * that growth to the size cached in @inode's shard ctx.
+ *
+ * local->delta_size becomes the full write size for appending writes, the
+ * part of the write extending past the cached size otherwise, or 0 when the
+ * write stays within the file. The updated cached stat is copied into
+ * local->postbuf.
+ *
+ * The locked wrapper shard_get_delta_size_from_inode_ctx() calls this with
+ * inode->lock held.
+ *
+ * Returns 0 on success, or the negative result of __inode_ctx_get().
+ */
+int
+__shard_get_delta_size_from_inode_ctx(shard_local_t *local, inode_t *inode,
+                                      xlator_t *this)
+{
+    uint64_t raw_ctx = 0;
+    shard_inode_ctx_t *ictx = NULL;
+    int ret = __inode_ctx_get(inode, this, &raw_ctx);
+
+    if (ret < 0)
+        return ret;
+
+    ictx = (shard_inode_ctx_t *)(uintptr_t)raw_ctx;
+
+    if (shard_is_appending_write(local))
+        local->delta_size = local->total_size;
+    else if (local->offset + local->total_size > ictx->stat.ia_size)
+        local->delta_size = (local->offset + local->total_size) -
+                            ictx->stat.ia_size;
+    else
+        local->delta_size = 0;
+
+    ictx->stat.ia_size += (local->delta_size);
+    local->postbuf = ictx->stat;
+
+    return 0;
+}
+
+/*
+ * Locked wrapper: runs __shard_get_delta_size_from_inode_ctx() while holding
+ * inode->lock and returns its result.
+ */
+int
+shard_get_delta_size_from_inode_ctx(shard_local_t *local, inode_t *inode,
+                                    xlator_t *this)
+{
+    int status;
+
+    LOCK(&inode->lock);
+    status = __shard_get_delta_size_from_inode_ctx(local, inode, this);
+    UNLOCK(&inode->lock);
+
+    return status;
+}
+
+/*
+ * Per-shard callback for the inode-write fops wound by
+ * shard_common_inode_write_wind().
+ *
+ * @cookie is the fd the operation was wound on; it is NULL when this
+ * function is called directly to report a local failure. The reference held
+ * on a non-NULL cookie fd is dropped here.
+ *
+ * Each reply is folded into local under frame->lock. When the last reply
+ * arrives, the fop is either failed or the file size update is started,
+ * finishing in shard_common_inode_write_post_update_size_handler().
+ *
+ * Always returns 0.
+ */
+int
+shard_common_inode_write_do_cbk(call_frame_t *frame, void *cookie,
+                                xlator_t *this, int32_t op_ret,
+                                int32_t op_errno, struct iatt *pre,
+                                struct iatt *post, dict_t *xdata)
+{
+    int call_count = 0;
+    fd_t *anon_fd = cookie;
+    shard_local_t *local = NULL;
+    glusterfs_fop_t fop = 0;
+
+    local = frame->local;
+    fop = local->fop;
+
+    LOCK(&frame->lock);
+    {
+        if (op_ret < 0) {
+            local->op_ret = op_ret;
+            local->op_errno = op_errno;
+        } else {
+            /* Accumulate bytes written and the block/size deltas. */
+            local->written_size += op_ret;
+            GF_ATOMIC_ADD(local->delta_blocks,
+                          post->ia_blocks - pre->ia_blocks);
+            local->delta_size += (post->ia_size - pre->ia_size);
+            shard_inode_ctx_set(local->fd->inode, this, post, 0,
+                                SHARD_MASK_TIMES);
+            /* A shard other than the base file was written: queue its
+             * inode on the base inode's fsync list.
+             */
+            if (local->fd->inode != anon_fd->inode)
+                shard_inode_ctx_add_to_fsync_list(local->fd->inode, this,
+                                                  anon_fd->inode);
+        }
+    }
+    UNLOCK(&frame->lock);
+
+    if (anon_fd)
+        fd_unref(anon_fd);
+
+    call_count = shard_call_count_return(frame);
+    if (call_count == 0) {
+        /* Last reply: finish the fop. */
+        SHARD_UNSET_ROOT_FS_ID(frame, local);
+        if (local->op_ret < 0) {
+            shard_common_failure_unwind(fop, frame, local->op_ret,
+                                        local->op_errno);
+        } else {
+            /* NOTE(review): this overwrites the delta_size accumulated from
+             * the individual replies above -- confirm that is intended.
+             */
+            shard_get_delta_size_from_inode_ctx(local, local->fd->inode, this);
+            local->hole_size = 0;
+            if (xdata)
+                local->xattr_rsp = dict_ref(xdata);
+            shard_update_file_size(
+                frame, this, local->fd, NULL,
+                shard_common_inode_write_post_update_size_handler);
+        }
+    }
+
+    return 0;
+}
+
+/*
+ * Wind one shard's portion of an inode-write fop to the first child.
+ *
+ * The operation is chosen by local->fop (writev, fallocate, zerofill or
+ * discard). @fd is both the fd operated on and the cookie handed to
+ * shard_common_inode_write_do_cbk(), which drops the reference on it.
+ * @vec/@count are only used for writev; @shard_offset and @size describe the
+ * range within the shard.
+ *
+ * For an unrecognised fop nothing can be wound, so the shard is failed
+ * through the callback with EINVAL. Previously this case only logged, which
+ * left the fd reference held and the frame's call count never decremented.
+ *
+ * Always returns 0.
+ */
+int
+shard_common_inode_write_wind(call_frame_t *frame, xlator_t *this, fd_t *fd,
+                              struct iovec *vec, int count, off_t shard_offset,
+                              size_t size)
+{
+    shard_local_t *local = NULL;
+
+    local = frame->local;
+
+    switch (local->fop) {
+        case GF_FOP_WRITE:
+            STACK_WIND_COOKIE(
+                frame, shard_common_inode_write_do_cbk, fd, FIRST_CHILD(this),
+                FIRST_CHILD(this)->fops->writev, fd, vec, count, shard_offset,
+                local->flags, local->iobref, local->xattr_req);
+            break;
+        case GF_FOP_FALLOCATE:
+            STACK_WIND_COOKIE(
+                frame, shard_common_inode_write_do_cbk, fd, FIRST_CHILD(this),
+                FIRST_CHILD(this)->fops->fallocate, fd, local->flags,
+                shard_offset, size, local->xattr_req);
+            break;
+        case GF_FOP_ZEROFILL:
+            STACK_WIND_COOKIE(frame, shard_common_inode_write_do_cbk, fd,
+                              FIRST_CHILD(this),
+                              FIRST_CHILD(this)->fops->zerofill, fd,
+                              shard_offset, size, local->xattr_req);
+            break;
+        case GF_FOP_DISCARD:
+            STACK_WIND_COOKIE(frame, shard_common_inode_write_do_cbk, fd,
+                              FIRST_CHILD(this),
+                              FIRST_CHILD(this)->fops->discard, fd,
+                              shard_offset, size, local->xattr_req);
+            break;
+        default:
+            gf_msg(this->name, GF_LOG_WARNING, 0, SHARD_MSG_INVALID_FOP,
+                   "Invalid fop id = %d", local->fop);
+            /* No wind happened, so no reply will ever arrive for this
+             * shard: report the failure ourselves.
+             */
+            shard_common_inode_write_do_cbk(frame, fd, this, -1, EINVAL, NULL,
+                                            NULL, NULL);
+            break;
+    }
+    return 0;
+}
+
+/*
+ * Split an inode-write fop (local->fop) across the shards it covers and wind
+ * one operation per block in [local->first_block, local->last_block].
+ *
+ * Block 0 is operated on through the original fd; every other block gets an
+ * anonymous fd on the matching entry of local->inode_list. Replies are
+ * collected by shard_common_inode_write_do_cbk(); local->call_count is set
+ * to local->num_blocks.
+ *
+ * After the first local failure (wind_failed) the remaining blocks are not
+ * wound; each is failed through the callback with ENOMEM so the call count
+ * is still accounted for.
+ *
+ * Always returns 0.
+ */
+int
+shard_common_inode_write_do(call_frame_t *frame, xlator_t *this)
+{
+    int i = 0;
+    int count = 0;
+    int call_count = 0;
+    int last_block = 0;
+    uint32_t cur_block = 0;
+    fd_t *fd = NULL;
+    fd_t *anon_fd = NULL;
+    shard_local_t *local = NULL;
+    struct iovec *vec = NULL;
+    gf_boolean_t wind_failed = _gf_false;
+    gf_boolean_t odirect = _gf_false;
+    off_t orig_offset = 0;
+    off_t shard_offset = 0;
+    off_t vec_offset = 0;
+    size_t remaining_size = 0;
+    size_t shard_write_size = 0;
+
+    local = frame->local;
+    fd = local->fd;
+
+    orig_offset = local->offset;
+    remaining_size = local->total_size;
+    cur_block = local->first_block;
+    /* NOTE(review): the local call_count is decremented in the loop but
+     * never read afterwards.
+     */
+    local->call_count = call_count = local->num_blocks;
+    last_block = local->last_block;
+
+    SHARD_SET_ROOT_FS_ID(frame, local);
+
+    if (dict_set_uint32(local->xattr_req, GLUSTERFS_WRITE_UPDATE_ATOMIC, 4)) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, SHARD_MSG_DICT_OP_FAILED,
+               "Failed to set " GLUSTERFS_WRITE_UPDATE_ATOMIC
+               " into "
+               "dict: %s",
+               uuid_utoa(fd->inode->gfid));
+        local->op_ret = -1;
+        local->op_errno = ENOMEM;
+        /* Nothing has been wound: a single failed callback ends the fop. */
+        local->call_count = 1;
+        shard_common_inode_write_do_cbk(frame, (void *)(long)0, this, -1,
+                                        ENOMEM, NULL, NULL, NULL);
+        return 0;
+    }
+
+    if ((fd->flags & O_DIRECT) && (local->fop == GF_FOP_WRITE))
+        odirect = _gf_true;
+
+    while (cur_block <= last_block) {
+        if (wind_failed) {
+            shard_common_inode_write_do_cbk(frame, (void *)(long)0, this, -1,
+                                            ENOMEM, NULL, NULL, NULL);
+            goto next;
+        }
+
+        /* Portion of the request that falls inside the current shard. */
+        shard_offset = orig_offset % local->block_size;
+        shard_write_size = local->block_size - shard_offset;
+        if (shard_write_size > remaining_size)
+            shard_write_size = remaining_size;
+
+        remaining_size -= shard_write_size;
+
+        if (local->fop == GF_FOP_WRITE) {
+            /* Carve this shard's slice out of the caller's iovec. */
+            vec = NULL;
+            count = iov_subset(local->vector, local->count, vec_offset,
+                               shard_write_size, &vec, 0);
+            if (count < 0) {
+                local->op_ret = -1;
+                local->op_errno = ENOMEM;
+                wind_failed = _gf_true;
+                shard_common_inode_write_do_cbk(frame, (void *)(long)0, this,
+                                                -1, ENOMEM, NULL, NULL, NULL);
+                goto next;
+            }
+        }
+
+        if (cur_block == 0) {
+            /* Block 0 is the base file: use the original fd. */
+            anon_fd = fd_ref(fd);
+        } else {
+            anon_fd = fd_anonymous(local->inode_list[i]);
+            if (!anon_fd) {
+                local->op_ret = -1;
+                local->op_errno = ENOMEM;
+                wind_failed = _gf_true;
+                GF_FREE(vec);
+                shard_common_inode_write_do_cbk(frame, (void *)(long)anon_fd,
+                                                this, -1, ENOMEM, NULL, NULL,
+                                                NULL);
+                goto next;
+            }
+
+            /* Writes through an anonymous fd replace the request flags. */
+            if (local->fop == GF_FOP_WRITE) {
+                if (odirect)
+                    local->flags = O_DIRECT;
+                else
+                    local->flags = GF_ANON_FD_FLAGS;
+            }
+        }
+
+        shard_common_inode_write_wind(frame, this, anon_fd, vec, count,
+                                      shard_offset, shard_write_size);
+        if (vec)
+            vec_offset += shard_write_size;
+        orig_offset += shard_write_size;
+        GF_FREE(vec);
+        vec = NULL;
+    next:
+        cur_block++;
+        i++;
+        call_count--;
+    }
+    return 0;
+}
+
+/*
+ * Forward declaration: shard_common_inode_write_post_lookup_shards_handler()
+ * below passes this handler to shard_common_resume_mknod() before it is
+ * defined.
+ */
+int
+shard_common_inode_write_post_mknod_handler(call_frame_t *frame,
+                                            xlator_t *this);
+
+/*
+ * Continuation invoked once the shard lookups of an inode-write fop are
+ * done.
+ *
+ * Fails the fop on a recorded error; otherwise creates the missing shards
+ * when local->create_count is non-zero, or starts the actual write.
+ * Always returns 0.
+ */
+int
+shard_common_inode_write_post_lookup_shards_handler(call_frame_t *frame,
+                                                    xlator_t *this)
+{
+    shard_local_t *local = frame->local;
+
+    if (local->op_ret < 0) {
+        shard_common_failure_unwind(local->fop, frame, local->op_ret,
+                                    local->op_errno);
+    } else if (local->create_count) {
+        shard_common_resume_mknod(frame, this,
+                                  shard_common_inode_write_post_mknod_handler);
+    } else {
+        shard_common_inode_write_do(frame, this);
+    }
+
+    return 0;
+}
+
+/*
+ * Continuation invoked after the mknod round of an inode-write fop.
+ *
+ * On error the fop is failed. If local->eexist_count is non-zero, that count
+ * becomes the new call count and the shards are looked up again; otherwise
+ * the write is started. Always returns 0.
+ */
+int
+shard_common_inode_write_post_mknod_handler(call_frame_t *frame, xlator_t *this)
+{
+    shard_local_t *local = frame->local;
+
+    if (local->op_ret < 0) {
+        shard_common_failure_unwind(local->fop, frame, local->op_ret,
+                                    local->op_errno);
+        return 0;
+    }
+
+    if (local->eexist_count) {
+        local->call_count = local->eexist_count;
+        shard_common_lookup_shards(
+            frame, this, local->loc.inode,
+            shard_common_inode_write_post_lookup_shards_handler);
+        return 0;
+    }
+
+    shard_common_inode_write_do(frame, this);
+    return 0;
+}
+
+/*
+ * Continuation invoked after shard resolution for an inode-write fop.
+ *
+ * On error the fop is failed. Otherwise, in order of precedence: shards are
+ * looked up when local->call_count is non-zero; the post-lookup handler is
+ * entered directly when only local->create_count is non-zero; else the
+ * write is started. Always returns 0.
+ */
+int
+shard_common_inode_write_post_resolve_handler(call_frame_t *frame,
+                                              xlator_t *this)
+{
+    shard_local_t *local = frame->local;
+
+    if (local->op_ret < 0) {
+        shard_common_failure_unwind(local->fop, frame, local->op_ret,
+                                    local->op_errno);
+        return 0;
+    }
+
+    if (local->call_count) {
+        shard_common_lookup_shards(
+            frame, this, local->resolver_base_inode,
+            shard_common_inode_write_post_lookup_shards_handler);
+        return 0;
+    }
+
+    if (local->create_count) {
+        shard_common_inode_write_post_lookup_shards_handler(frame, this);
+        return 0;
+    }
+
+    shard_common_inode_write_do(frame, this);
+    return 0;
+}
+
+/*
+ * Continuation invoked after the base file lookup of an inode-write fop.
+ *
+ * Computes the block range touched by the request (moving the offset to the
+ * current file size first for appending writes), allocates the per-block
+ * inode list, and then either creates the .shard directory or refreshes it,
+ * continuing in shard_common_inode_write_post_resolve_handler().
+ *
+ * Always returns 0; failures unwind local->fop.
+ */
+int
+shard_common_inode_write_post_lookup_handler(call_frame_t *frame,
+                                             xlator_t *this)
+{
+    shard_local_t *local = frame->local;
+    shard_priv_t *priv = this->private;
+
+    if (local->op_ret < 0) {
+        shard_common_failure_unwind(local->fop, frame, local->op_ret,
+                                    local->op_errno);
+        return 0;
+    }
+
+    local->postbuf = local->prebuf;
+
+    /*Adjust offset to EOF so that correct shard is chosen for append*/
+    if (shard_is_appending_write(local))
+        local->offset = local->prebuf.ia_size;
+
+    local->first_block = get_lowest_block(local->offset, local->block_size);
+    local->last_block = get_highest_block(local->offset, local->total_size,
+                                          local->block_size);
+    local->num_blocks = local->last_block - local->first_block + 1;
+    GF_ASSERT(local->num_blocks > 0);
+    /* One inode slot per block touched by the request. */
+    local->inode_list = GF_CALLOC(local->num_blocks, sizeof(inode_t *),
+                                  gf_shard_mt_inode_list);
+    if (!local->inode_list) {
+        shard_common_failure_unwind(local->fop, frame, -1, ENOMEM);
+        return 0;
+    }
+
+    gf_msg_trace(this->name, 0,
+                 "%s: gfid=%s first_block=%" PRIu64
+                 " "
+                 "last_block=%" PRIu64 " num_blocks=%" PRIu64 " offset=%" PRId64
+                 " total_size=%zu flags=%" PRId32 "",
+                 gf_fop_list[local->fop],
+                 uuid_utoa(local->resolver_base_inode->gfid),
+                 local->first_block, local->last_block, local->num_blocks,
+                 local->offset, local->total_size, local->flags);
+
+    local->dot_shard_loc.inode = inode_find(this->itable, priv->dot_shard_gfid);
+
+    if (!local->dot_shard_loc.inode) {
+        /* .shard is not in the inode table: try to create it. */
+        shard_mkdir_internal_dir(frame, this,
+                                 shard_common_inode_write_post_resolve_handler,
+                                 SHARD_INTERNAL_DIR_DOT_SHARD);
+    } else {
+        /* .shard is already known: refresh it and resume in the handler. */
+        local->post_res_handler = shard_common_inode_write_post_resolve_handler;
+        shard_refresh_internal_dir(frame, this, SHARD_INTERNAL_DIR_DOT_SHARD);
+    }
+    return 0;
+}
+
+/*
+ * Callback of the mkdir wound by shard_mkdir_internal_dir().
+ *
+ * @cookie carries the shard_internal_dir_type_t of the directory.
+ *
+ * - EEXIST: the directory is looked up instead.
+ * - Any other failure: the error is recorded in local and processing
+ *   continues in shard_common_resolve_shards() with local->post_res_handler.
+ * - Success: the new inode is linked; if the linked inode differs from the
+ *   one returned by mkdir the directory is refreshed, otherwise it is marked
+ *   refreshed and the shards are resolved.
+ *
+ * Always returns 0.
+ */
+int
+shard_mkdir_internal_dir_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+                             int32_t op_ret, int32_t op_errno, inode_t *inode,
+                             struct iatt *buf, struct iatt *preparent,
+                             struct iatt *postparent, dict_t *xdata)
+{
+    inode_t *link_inode = NULL;
+    shard_local_t *local = NULL;
+    shard_internal_dir_type_t type = (shard_internal_dir_type_t)cookie;
+
+    local = frame->local;
+
+    SHARD_UNSET_ROOT_FS_ID(frame, local);
+
+    if (op_ret == -1) {
+        if (op_errno != EEXIST) {
+            local->op_ret = op_ret;
+            local->op_errno = op_errno;
+            goto unwind;
+        } else {
+            gf_msg_debug(this->name, 0,
+                         "mkdir on %s failed "
+                         "with EEXIST. Attempting lookup now",
+                         shard_internal_dir_string(type));
+            shard_lookup_internal_dir(frame, this, local->post_res_handler,
+                                      type);
+            return 0;
+        }
+    }
+
+    link_inode = shard_link_internal_dir_inode(local, inode, buf, type);
+    if (link_inode != inode) {
+        /* A different inode ended up linked: refresh before using it. */
+        shard_refresh_internal_dir(frame, this, type);
+    } else {
+        shard_inode_ctx_mark_dir_refreshed(link_inode, this);
+        shard_common_resolve_shards(frame, this, local->post_res_handler);
+    }
+    return 0;
+unwind:
+    /* The error recorded in local is carried on to the post-resolve handler. */
+    shard_common_resolve_shards(frame, this, local->post_res_handler);
+    return 0;
+}
+
+/*
+ * Create one of the shard translator's internal directories, selected by
+ * @type, by winding a mkdir (mode 0755) to the first child with the
+ * directory's fixed gfid passed as "gfid-req".
+ *
+ * @handler is stored in local->post_res_handler. The reply is processed by
+ * shard_mkdir_internal_dir_cbk(), which receives @type as cookie. On a local
+ * failure, local->op_ret/op_errno are set to -1/ENOMEM and @handler is
+ * called directly.
+ *
+ * Always returns 0.
+ */
+int
+shard_mkdir_internal_dir(call_frame_t *frame, xlator_t *this,
+                         shard_post_resolve_fop_handler_t handler,
+                         shard_internal_dir_type_t type)
+{
+    int ret = -1;
+    shard_local_t *local = NULL;
+    shard_priv_t *priv = NULL;
+    dict_t *xattr_req = NULL;
+    uuid_t *gfid = NULL;
+    loc_t *loc = NULL;
+    gf_boolean_t free_gfid = _gf_true;
+
+    local = frame->local;
+    priv = this->private;
+
+    local->post_res_handler = handler;
+    gfid = GF_MALLOC(sizeof(uuid_t), gf_common_mt_uuid_t);
+    if (!gfid)
+        goto err;
+
+    /* Pick the gfid and the loc that belong to the requested directory. */
+    switch (type) {
+        case SHARD_INTERNAL_DIR_DOT_SHARD:
+            gf_uuid_copy(*gfid, priv->dot_shard_gfid);
+            loc = &local->dot_shard_loc;
+            break;
+        case SHARD_INTERNAL_DIR_DOT_SHARD_REMOVE_ME:
+            gf_uuid_copy(*gfid, priv->dot_shard_rm_gfid);
+            loc = &local->dot_shard_rm_loc;
+            break;
+        default:
+            /* NOTE(review): loc stays NULL for an unknown type; presumably
+             * shard_init_internal_dir_loc() below rejects it -- confirm.
+             */
+            bzero(*gfid, sizeof(uuid_t));
+            break;
+    }
+
+    xattr_req = dict_new();
+    if (!xattr_req)
+        goto err;
+
+    ret = shard_init_internal_dir_loc(this, local, type);
+    if (ret)
+        goto err;
+
+    ret = dict_set_gfuuid(xattr_req, "gfid-req", *gfid, false);
+    if (ret) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, SHARD_MSG_DICT_OP_FAILED,
+               "Failed to set gfid-req for %s",
+               shard_internal_dir_string(type));
+        goto err;
+    } else {
+        /* From here on gfid is no longer freed by this function;
+         * presumably the dict now owns it -- TODO confirm.
+         */
+        free_gfid = _gf_false;
+    }
+
+    SHARD_SET_ROOT_FS_ID(frame, local);
+
+    STACK_WIND_COOKIE(frame, shard_mkdir_internal_dir_cbk, (void *)(long)type,
+                      FIRST_CHILD(this), FIRST_CHILD(this)->fops->mkdir, loc,
+                      0755, 0, xattr_req);
+    dict_unref(xattr_req);
+    return 0;
+
+err:
+    if (xattr_req)
+        dict_unref(xattr_req);
+    local->op_ret = -1;
+    local->op_errno = ENOMEM;
+    if (free_gfid)
+        GF_FREE(gfid);
+    handler(frame, this);
+    return 0;
+}
+
+/*
+ * flush callback: forwards the child's result to the caller unchanged.
+ */
+int
+shard_flush_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+                int32_t op_ret, int32_t op_errno, dict_t *xdata)
+{
+    /* To-Do: Wind flush on all shards of the file */
+    SHARD_STACK_UNWIND(flush, frame, op_ret, op_errno, xdata);
+
+    return 0;
+}
+
+/*
+ * flush entry point: passes the flush on @fd to the first child, with
+ * shard_flush_cbk() handling the reply. Always returns 0.
+ */
+int
+shard_flush(call_frame_t *frame, xlator_t *this, fd_t *fd, dict_t *xdata)
+{
+    STACK_WIND(frame, shard_flush_cbk, FIRST_CHILD(this),
+               FIRST_CHILD(this)->fops->flush, fd, xdata);
+
+    return 0;
+}
+
+/*
+ * Copy the ctime, atime and mtime (seconds and nanoseconds) cached in
+ * @inode's shard ctx into local->postbuf.
+ *
+ * The locked wrapper shard_get_timestamps_from_inode_ctx() calls this with
+ * inode->lock held.
+ *
+ * Returns 0 on success, or the negative result of __inode_ctx_get().
+ */
+int
+__shard_get_timestamps_from_inode_ctx(shard_local_t *local, inode_t *inode,
+                                      xlator_t *this)
+{
+    uint64_t raw_ctx = 0;
+    shard_inode_ctx_t *ictx = NULL;
+    int ret = __inode_ctx_get(inode, this, &raw_ctx);
+
+    if (ret < 0)
+        return ret;
+
+    ictx = (shard_inode_ctx_t *)(uintptr_t)raw_ctx;
+
+    local->postbuf.ia_atime = ictx->stat.ia_atime;
+    local->postbuf.ia_atime_nsec = ictx->stat.ia_atime_nsec;
+    local->postbuf.ia_mtime = ictx->stat.ia_mtime;
+    local->postbuf.ia_mtime_nsec = ictx->stat.ia_mtime_nsec;
+    local->postbuf.ia_ctime = ictx->stat.ia_ctime;
+    local->postbuf.ia_ctime_nsec = ictx->stat.ia_ctime_nsec;
+
+    return 0;
+}
+
+/*
+ * Locked wrapper: runs __shard_get_timestamps_from_inode_ctx() while holding
+ * inode->lock and returns its result.
+ */
+int
+shard_get_timestamps_from_inode_ctx(shard_local_t *local, inode_t *inode,
+                                    xlator_t *this)
+{
+    int status;
+
+    LOCK(&inode->lock);
+    status = __shard_get_timestamps_from_inode_ctx(local, inode, this);
+    UNLOCK(&inode->lock);
+
+    return status;
+}
+
+/*
+ * Per-fd callback of the fsyncs wound by shard_post_lookup_fsync_handler().
+ *
+ * @cookie is the fd the fsync was wound on (the base fd or an anonymous fd
+ * on a shard), or NULL when this function is called directly because no
+ * anonymous fd could be created. The reference on a non-NULL fd is dropped
+ * here.
+ *
+ * For a shard fd, the shard's pending-fsync counter (ctx->fsync_needed) is
+ * reduced by the count stored in the fd ctx when the fsync succeeded. A
+ * shard that still needs fsync is put back on the base inode's to_fsync
+ * list; otherwise one reference on the shard inode is dropped.
+ *
+ * When the last reply arrives, fsync is unwound with either the recorded
+ * error or the timestamps cached on the base inode. Always returns 0.
+ */
+int
+shard_fsync_shards_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+                       int32_t op_ret, int32_t op_errno, struct iatt *prebuf,
+                       struct iatt *postbuf, dict_t *xdata)
+{
+    int call_count = 0;
+    uint64_t fsync_count = 0;
+    fd_t *anon_fd = cookie;
+    shard_local_t *local = NULL;
+    shard_inode_ctx_t *ctx = NULL;
+    shard_inode_ctx_t *base_ictx = NULL;
+    inode_t *base_inode = NULL;
+    gf_boolean_t unref_shard_inode = _gf_false;
+
+    local = frame->local;
+    base_inode = local->fd->inode;
+
+    /* An earlier reply already failed: skip straight to the bookkeeping. */
+    if (local->op_ret < 0)
+        goto out;
+
+    LOCK(&frame->lock);
+    {
+        if (op_ret < 0) {
+            local->op_ret = op_ret;
+            local->op_errno = op_errno;
+            UNLOCK(&frame->lock);
+            goto out;
+        }
+        shard_inode_ctx_set(local->fd->inode, this, postbuf, 0,
+                            SHARD_MASK_TIMES);
+    }
+    UNLOCK(&frame->lock);
+    /* Number of pending fsyncs recorded on the fd when it was wound. */
+    fd_ctx_get(anon_fd, this, &fsync_count);
+out:
+    if (anon_fd && (base_inode != anon_fd->inode)) {
+        /* Lock order: base inode first, then the shard inode. */
+        LOCK(&base_inode->lock);
+        LOCK(&anon_fd->inode->lock);
+        {
+            __shard_inode_ctx_get(anon_fd->inode, this, &ctx);
+            __shard_inode_ctx_get(base_inode, this, &base_ictx);
+            if (op_ret == 0)
+                ctx->fsync_needed -= fsync_count;
+            GF_ASSERT(ctx->fsync_needed >= 0);
+            if (ctx->fsync_needed != 0) {
+                /* Still dirty: queue the shard for the next fsync. */
+                list_add_tail(&ctx->to_fsync_list, &base_ictx->to_fsync_list);
+                base_ictx->fsync_count++;
+            } else {
+                unref_shard_inode = _gf_true;
+            }
+        }
+        UNLOCK(&anon_fd->inode->lock);
+        UNLOCK(&base_inode->lock);
+    }
+
+    if (unref_shard_inode)
+        inode_unref(anon_fd->inode);
+    if (anon_fd)
+        fd_unref(anon_fd);
+
+    call_count = shard_call_count_return(frame);
+    if (call_count != 0)
+        return 0;
+
+    /* Last reply: answer the original fsync. */
+    if (local->op_ret < 0) {
+        shard_common_failure_unwind(GF_FOP_FSYNC, frame, local->op_ret,
+                                    local->op_errno);
+    } else {
+        shard_get_timestamps_from_inode_ctx(local, base_inode, this);
+        SHARD_STACK_UNWIND(fsync, frame, local->op_ret, local->op_errno,
+                           &local->prebuf, &local->postbuf, local->xattr_rsp);
+    }
+    return 0;
+}
+
+/*
+ * Continuation invoked after the base file lookup of an fsync.
+ *
+ * Takes over the base inode's to_fsync list (under the inode lock), winds an
+ * fsync on the base fd and then one on an anonymous fd for every shard on
+ * the list. The shard's fsync count is stored in the anonymous fd's ctx so
+ * that shard_fsync_shards_cbk() can read it back. Shards for which the fd
+ * could not be prepared are failed through the callback with ENOMEM.
+ *
+ * Always returns 0.
+ */
+int
+shard_post_lookup_fsync_handler(call_frame_t *frame, xlator_t *this)
+{
+    int ret = 0;
+    int call_count = 0;
+    int fsync_count = 0;
+    fd_t *anon_fd = NULL;
+    inode_t *base_inode = NULL;
+    shard_local_t *local = NULL;
+    shard_inode_ctx_t *ctx = NULL;
+    shard_inode_ctx_t *iter = NULL;
+    struct list_head copy = {
+        0,
+    };
+    shard_inode_ctx_t *tmp = NULL;
+
+    local = frame->local;
+    base_inode = local->fd->inode;
+    local->postbuf = local->prebuf;
+    INIT_LIST_HEAD(&copy);
+
+    if (local->op_ret < 0) {
+        shard_common_failure_unwind(GF_FOP_FSYNC, frame, local->op_ret,
+                                    local->op_errno);
+        return 0;
+    }
+
+    LOCK(&base_inode->lock);
+    {
+        /* Move the pending list to a private copy and reset the counter. */
+        __shard_inode_ctx_get(base_inode, this, &ctx);
+        list_splice_init(&ctx->to_fsync_list, &copy);
+        call_count = ctx->fsync_count;
+        ctx->fsync_count = 0;
+    }
+    UNLOCK(&base_inode->lock);
+
+    /* One extra reply is expected for the base file itself. */
+    local->call_count = ++call_count;
+
+    /* Send fsync() on the base shard first */
+    anon_fd = fd_ref(local->fd);
+    STACK_WIND_COOKIE(frame, shard_fsync_shards_cbk, anon_fd, FIRST_CHILD(this),
+                      FIRST_CHILD(this)->fops->fsync, anon_fd, local->datasync,
+                      local->xattr_req);
+    /* NOTE(review): the local call_count is decremented here and in the
+     * loop but never read afterwards.
+     */
+    call_count--;
+    anon_fd = NULL;
+
+    list_for_each_entry_safe(iter, tmp, &copy, to_fsync_list)
+    {
+        list_del_init(&iter->to_fsync_list);
+        fsync_count = 0;
+        shard_inode_ctx_get_fsync_count(iter->inode, this, &fsync_count);
+        GF_ASSERT(fsync_count > 0);
+        anon_fd = fd_anonymous(iter->inode);
+        if (!anon_fd) {
+            local->op_ret = -1;
+            local->op_errno = ENOMEM;
+            gf_msg(this->name, GF_LOG_WARNING, ENOMEM,
+                   SHARD_MSG_MEMALLOC_FAILED,
+                   "Failed to create "
+                   "anon fd to fsync shard");
+            /* Account for this shard's reply without winding. */
+            shard_fsync_shards_cbk(frame, (void *)(long)anon_fd, this, -1,
+                                   ENOMEM, NULL, NULL, NULL);
+            continue;
+        }
+
+        ret = fd_ctx_set(anon_fd, this, fsync_count);
+        if (ret) {
+            gf_msg(this->name, GF_LOG_ERROR, 0, SHARD_MSG_FD_CTX_SET_FAILED,
+                   "Failed to set fd "
+                   "ctx for shard inode gfid=%s",
+                   uuid_utoa(iter->inode->gfid));
+            local->op_ret = -1;
+            local->op_errno = ENOMEM;
+            shard_fsync_shards_cbk(frame, (void *)(long)anon_fd, this, -1,
+                                   ENOMEM, NULL, NULL, NULL);
+            continue;
+        }
+        STACK_WIND_COOKIE(frame, shard_fsync_shards_cbk, anon_fd,
+                          FIRST_CHILD(this), FIRST_CHILD(this)->fops->fsync,
+                          anon_fd, local->datasync, local->xattr_req);
+        call_count--;
+    }
+
+    return 0;
+}
+
+/*
+ * fsync fop entry point.
+ *
+ * When the inode ctx reports a block size of 0, or the caller's pid is
+ * GF_CLIENT_PID_GSYNCD, the call is wound to the child unchanged.  Otherwise
+ * a shard_local_t is set up and shard_refresh_base_file() is asked to run
+ * shard_post_lookup_fsync_handler().
+ *
+ * Always returns 0; every failure in this function is unwound with
+ * -1/ENOMEM.
+ */
+int
+shard_fsync(call_frame_t *frame, xlator_t *this, fd_t *fd, int32_t datasync,
+            dict_t *xdata)
+{
+    int ret = 0;
+    uint64_t block_size = 0;
+    shard_local_t *local = NULL;
+
+    ret = shard_inode_ctx_get_block_size(fd->inode, this, &block_size);
+    if (ret) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, SHARD_MSG_INODE_CTX_GET_FAILED,
+               "Failed to get block "
+               "size for %s from its inode ctx",
+               uuid_utoa(fd->inode->gfid));
+        goto err;
+    }
+
+    /* Pass-through: no local is allocated and the default callback is used. */
+    if (!block_size || frame->root->pid == GF_CLIENT_PID_GSYNCD) {
+        STACK_WIND(frame, default_fsync_cbk, FIRST_CHILD(this),
+                   FIRST_CHILD(this)->fops->fsync, fd, datasync, xdata);
+        return 0;
+    }
+
+    /* Remember the inode table the first time one is seen. */
+    if (!this->itable)
+        this->itable = fd->inode->table;
+
+    local = mem_get0(this->local_pool);
+    if (!local)
+        goto err;
+
+    /* From here on the err path unwinds with frame->local attached. */
+    frame->local = local;
+
+    local->fd = fd_ref(fd);
+    local->fop = GF_FOP_FSYNC;
+    local->datasync = datasync;
+    /* Take a ref on the caller's xdata, or create a fresh dict if none. */
+    local->xattr_req = (xdata) ? dict_ref(xdata) : dict_new();
+    if (!local->xattr_req)
+        goto err;
+
+    /* Describe the base file by inode and gfid only. */
+    local->loc.inode = inode_ref(fd->inode);
+    gf_uuid_copy(local->loc.gfid, fd->inode->gfid);
+
+    shard_refresh_base_file(frame, this, NULL, fd,
+                            shard_post_lookup_fsync_handler);
+    return 0;
+err:
+    shard_common_failure_unwind(GF_FOP_FSYNC, frame, -1, ENOMEM);
+    return 0;
+}
+
+/*
+ * Callback for the follow-up readdir/readdirp that shard_readdir_cbk() winds
+ * when the GF_SHARD_DIR entry was the last one of a listing.
+ *
+ * Moves every returned entry onto local->entries_head, fixes up size and
+ * block count of entries that carry GF_XATTR_SHARD_FILE_SIZE, refreshes the
+ * inode ctx of entries that have an inode, and unwinds to the caller.
+ *
+ * Always returns 0.
+ */
+int
+shard_readdir_past_dot_shard_cbk(call_frame_t *frame, void *cookie,
+                                 xlator_t *this, int32_t op_ret,
+                                 int32_t op_errno, gf_dirent_t *orig_entries,
+                                 dict_t *xdata)
+{
+    gf_dirent_t *entry = NULL;
+    gf_dirent_t *tmp = NULL;
+    shard_local_t *local = NULL;
+
+    local = frame->local;
+
+    if (op_ret < 0)
+        goto unwind;
+
+    list_for_each_entry_safe(entry, tmp, (&orig_entries->list), list)
+    {
+        /* Transfer the entry from the child's list to the list we return. */
+        list_del_init(&entry->list);
+        list_add_tail(&entry->list, &local->entries_head.list);
+
+        if (!entry->dict)
+            continue;
+
+        if (IA_ISDIR(entry->d_stat.ia_type))
+            continue;
+
+        /* NOTE(review): shard_readdir_cbk() skips this size fix-up when the
+         * caller is GF_CLIENT_PID_GSYNCD; no such check is made here -
+         * confirm whether that difference is intended. */
+        if (dict_get(entry->dict, GF_XATTR_SHARD_FILE_SIZE))
+            shard_modify_size_and_block_count(&entry->d_stat, entry->dict);
+        if (!entry->inode)
+            continue;
+
+        shard_inode_ctx_update(entry->inode, this, entry->dict, &entry->d_stat);
+    }
+    /* Add this call's count to the one stored by shard_readdir_cbk(). */
+    local->op_ret += op_ret;
+
+unwind:
+    /* NOTE(review): the readdir branch reports the values kept in local,
+     * while the readdirp branch reports this call's own op_ret/op_errno even
+     * though entries_head may also hold entries from the first call -
+     * confirm the asymmetry is deliberate. */
+    if (local->fop == GF_FOP_READDIR)
+        SHARD_STACK_UNWIND(readdir, frame, local->op_ret, local->op_errno,
+                           &local->entries_head, xdata);
+    else
+        SHARD_STACK_UNWIND(readdirp, frame, op_ret, op_errno,
+                           &local->entries_head, xdata);
+    return 0;
+}
+
+/*
+ * Callback for the readdir/readdirp wound by shard_readdir_do().
+ *
+ * Moves the returned entries onto local->entries_head, except for the
+ * GF_SHARD_DIR entry of a directory whose gfid is the root gfid: that entry
+ * is left on the child's list and op_ret is decremented for it.  If that
+ * skipped entry was the last one, a second readdir/readdirp is wound from the
+ * skipped entry's offset with shard_readdir_past_dot_shard_cbk() as callback;
+ * otherwise the collected entries are unwound to the caller.
+ *
+ * Always returns 0.
+ */
+int32_t
+shard_readdir_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+                  int32_t op_ret, int32_t op_errno, gf_dirent_t *orig_entries,
+                  dict_t *xdata)
+{
+    fd_t *fd = NULL;
+    gf_dirent_t *entry = NULL;
+    gf_dirent_t *tmp = NULL;
+    shard_local_t *local = NULL;
+    gf_boolean_t last_entry = _gf_false;
+
+    local = frame->local;
+    fd = local->fd;
+
+    if (op_ret < 0)
+        goto unwind;
+
+    list_for_each_entry_safe(entry, tmp, (&orig_entries->list), list)
+    {
+        /* Reaching another iteration means the skipped entry was not the
+         * final one, so clear the marker. */
+        if (last_entry)
+            last_entry = _gf_false;
+
+        if (__is_root_gfid(fd->inode->gfid) &&
+            !(strcmp(entry->d_name, GF_SHARD_DIR))) {
+            /* Hide the entry: remember where to resume and do not count it. */
+            local->offset = entry->d_off;
+            op_ret--;
+            last_entry = _gf_true;
+            continue;
+        }
+
+        list_del_init(&entry->list);
+        list_add_tail(&entry->list, &local->entries_head.list);
+
+        if (!entry->dict)
+            continue;
+
+        if (IA_ISDIR(entry->d_stat.ia_type))
+            continue;
+
+        /* Size/block-count fix-up is skipped for GF_CLIENT_PID_GSYNCD. */
+        if (dict_get(entry->dict, GF_XATTR_SHARD_FILE_SIZE) &&
+            frame->root->pid != GF_CLIENT_PID_GSYNCD)
+            shard_modify_size_and_block_count(&entry->d_stat, entry->dict);
+
+        if (!entry->inode)
+            continue;
+
+        shard_inode_ctx_update(entry->inode, this, entry->dict, &entry->d_stat);
+    }
+
+    local->op_ret = op_ret;
+
+    /* The hidden entry ended the batch: fetch more entries from its offset
+     * before answering the caller. */
+    if (last_entry) {
+        if (local->fop == GF_FOP_READDIR)
+            STACK_WIND(frame, shard_readdir_past_dot_shard_cbk,
+                       FIRST_CHILD(this), FIRST_CHILD(this)->fops->readdir,
+                       local->fd, local->readdir_size, local->offset,
+                       local->xattr_req);
+        else
+            STACK_WIND(frame, shard_readdir_past_dot_shard_cbk,
+                       FIRST_CHILD(this), FIRST_CHILD(this)->fops->readdirp,
+                       local->fd, local->readdir_size, local->offset,
+                       local->xattr_req);
+        return 0;
+    }
+
+unwind:
+    if (local->fop == GF_FOP_READDIR)
+        SHARD_STACK_UNWIND(readdir, frame, op_ret, op_errno,
+                           &local->entries_head, xdata);
+    else
+        SHARD_STACK_UNWIND(readdirp, frame, op_ret, op_errno,
+                           &local->entries_head, xdata);
+    return 0;
+}
+
+/*
+ * Common implementation of readdir and readdirp.
+ *
+ * @whichop: GF_FOP_READDIR or GF_FOP_READDIRP; selects the child fop that is
+ *           wound and is stored in local->fop for the callbacks.
+ *
+ * Allocates the per-call local, takes a ref on @fd, initialises the list that
+ * collects the entries handed back to the caller and winds the request with
+ * shard_readdir_cbk() as callback.  For readdirp the request dict is prepared
+ * first (SHARD_MD_READ_FOP_INIT_REQ_DICT plus a GF_XATTR_SHARD_BLOCK_SIZE
+ * key).
+ *
+ * Always returns 0; failures are reported by unwinding with -1/ENOMEM.
+ */
+int
+shard_readdir_do(call_frame_t *frame, xlator_t *this, fd_t *fd, size_t size,
+                 off_t offset, int whichop, dict_t *xdata)
+{
+    int ret = 0;
+    shard_local_t *local = NULL;
+
+    local = mem_get0(this->local_pool);
+    if (!local) {
+        goto err;
+    }
+
+    frame->local = local;
+
+    local->fd = fd_ref(fd);
+    local->fop = whichop;
+    local->readdir_size = size;
+    INIT_LIST_HEAD(&local->entries_head.list);
+    local->list_inited = _gf_true;
+
+    if (whichop == GF_FOP_READDIR) {
+        STACK_WIND(frame, shard_readdir_cbk, FIRST_CHILD(this),
+                   FIRST_CHILD(this)->fops->readdir, fd, size, offset, xdata);
+    } else {
+        local->xattr_req = (xdata) ? dict_ref(xdata) : dict_new();
+        /* Same allocation check the other fops in this file make. */
+        if (!local->xattr_req)
+            goto err;
+        SHARD_MD_READ_FOP_INIT_REQ_DICT(this, local->xattr_req, fd->inode->gfid,
+                                        local, err);
+        ret = dict_set_uint64(local->xattr_req, GF_XATTR_SHARD_BLOCK_SIZE, 0);
+        if (ret) {
+            gf_log(this->name, GF_LOG_WARNING,
+                   "Failed to set "
+                   "dict value: key:%s, directory gfid=%s",
+                   GF_XATTR_SHARD_BLOCK_SIZE, uuid_utoa(fd->inode->gfid));
+            goto err;
+        }
+
+        STACK_WIND(frame, shard_readdir_cbk, FIRST_CHILD(this),
+                   FIRST_CHILD(this)->fops->readdirp, fd, size, offset,
+                   local->xattr_req);
+    }
+
+    return 0;
+
+err:
+    /* Unwind with the fop that was actually requested, and through
+     * SHARD_STACK_UNWIND like the callbacks do: by this point frame->local
+     * may already hold an fd ref and a request dict, which the shard unwind
+     * wrapper is expected to release (a bare STACK_UNWIND_STRICT does not
+     * look at frame->local). */
+    if (whichop == GF_FOP_READDIR)
+        SHARD_STACK_UNWIND(readdir, frame, -1, ENOMEM, NULL, NULL);
+    else
+        SHARD_STACK_UNWIND(readdirp, frame, -1, ENOMEM, NULL, NULL);
+    return 0;
+}
+
+/* readdir fop entry point: hands the request to shard_readdir_do() tagged
+ * as GF_FOP_READDIR. */
+int32_t
+shard_readdir(call_frame_t *frame, xlator_t *this, fd_t *fd, size_t size,
+              off_t offset, dict_t *xdata)
+{
+    /* shard_readdir_do() returns 0 on every path. */
+    return shard_readdir_do(frame, this, fd, size, offset, GF_FOP_READDIR,
+                            xdata);
+}
+
+/* readdirp fop entry point: hands the request to shard_readdir_do() tagged
+ * as GF_FOP_READDIRP. */
+int32_t
+shard_readdirp(call_frame_t *frame, xlator_t *this, fd_t *fd, size_t size,
+               off_t offset, dict_t *xdata)
+{
+    /* shard_readdir_do() returns 0 on every path. */
+    return shard_readdir_do(frame, this, fd, size, offset, GF_FOP_READDIRP,
+                            xdata);
+}
+
+/*
+ * Replace the iatt stored under @key in @xdata with a heap copy whose ia_size
+ * and ia_blocks are taken from local->prebuf.
+ *
+ * Returns 0 when the value was replaced, or when there was nothing to do
+ * (@xdata is NULL or @key is absent).  Returns -1 on failure, with
+ * local->op_ret set to -1 and local->op_errno set to EINVAL (stored value is
+ * not usable as an iatt) or ENOMEM (allocation or dict update failed).
+ */
+int32_t
+shard_modify_and_set_iatt_in_dict(dict_t *xdata, shard_local_t *local,
+                                  char *key)
+{
+    int ret = 0;
+    struct iatt *tmpbuf = NULL;
+    struct iatt *stbuf = NULL;
+    data_t *data = NULL;
+
+    if (!xdata)
+        return 0;
+
+    data = dict_get(xdata, key);
+    if (!data)
+        return 0;
+
+    tmpbuf = data_to_iatt(data, key);
+    if (tmpbuf == NULL) {
+        /* The value could not be read back as an iatt; fail the fop rather
+         * than dereference a NULL pointer in the copy below. */
+        local->op_ret = -1;
+        local->op_errno = EINVAL;
+        return -1;
+    }
+
+    stbuf = GF_MALLOC(sizeof(struct iatt), gf_common_mt_char);
+    if (stbuf == NULL) {
+        local->op_ret = -1;
+        local->op_errno = ENOMEM;
+        goto err;
+    }
+    *stbuf = *tmpbuf;
+    stbuf->ia_size = local->prebuf.ia_size;
+    stbuf->ia_blocks = local->prebuf.ia_blocks;
+    ret = dict_set_iatt(xdata, key, stbuf, false);
+    if (ret < 0) {
+        local->op_ret = -1;
+        local->op_errno = ENOMEM;
+        goto err;
+    }
+    return 0;
+
+err:
+    /* stbuf is either NULL or was not stored in the dict; free it here. */
+    GF_FREE(stbuf);
+    return -1;
+}
+
+/*
+ * Shared callback of removexattr and fremovexattr.
+ *
+ * On success the GF_PRESTAT and GF_POSTSTAT iatts in @xdata (if present) are
+ * rewritten via shard_modify_and_set_iatt_in_dict() and the fop is unwound;
+ * any failure is unwound through shard_common_failure_unwind() with the
+ * values stored in local.  Always returns 0.
+ */
+int32_t
+shard_common_remove_xattr_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+                              int32_t op_ret, int32_t op_errno, dict_t *xdata)
+{
+    shard_local_t *local = frame->local;
+
+    if (op_ret < 0) {
+        local->op_ret = op_ret;
+        local->op_errno = op_errno;
+        goto err;
+    }
+
+    /* PRESTAT is handled first; POSTSTAT only if PRESTAT succeeded. */
+    if ((shard_modify_and_set_iatt_in_dict(xdata, local, GF_PRESTAT) < 0) ||
+        (shard_modify_and_set_iatt_in_dict(xdata, local, GF_POSTSTAT) < 0))
+        goto err;
+
+    /* An fd in local means the request came in as fremovexattr. */
+    if (local->fd) {
+        SHARD_STACK_UNWIND(fremovexattr, frame, local->op_ret, local->op_errno,
+                           xdata);
+    } else {
+        SHARD_STACK_UNWIND(removexattr, frame, local->op_ret, local->op_errno,
+                           xdata);
+    }
+    return 0;
+
+err:
+    shard_common_failure_unwind(local->fop, frame, local->op_ret,
+                                local->op_errno);
+    return 0;
+}
+
+/*
+ * Post-lookup handler for (f)removexattr, registered by
+ * shard_common_remove_xattr().  Unwinds the error stored in local, if any;
+ * otherwise winds fremovexattr (fd present) or removexattr (path based) to
+ * the child.  Always returns 0.
+ */
+int32_t
+shard_post_lookup_remove_xattr_handler(call_frame_t *frame, xlator_t *this)
+{
+    shard_local_t *local = frame->local;
+
+    if (local->op_ret < 0) {
+        shard_common_failure_unwind(local->fop, frame, local->op_ret,
+                                    local->op_errno);
+    } else if (local->fd) {
+        STACK_WIND(frame, shard_common_remove_xattr_cbk, FIRST_CHILD(this),
+                   FIRST_CHILD(this)->fops->fremovexattr, local->fd,
+                   local->name, local->xattr_req);
+    } else {
+        STACK_WIND(frame, shard_common_remove_xattr_cbk, FIRST_CHILD(this),
+                   FIRST_CHILD(this)->fops->removexattr, &local->loc,
+                   local->name, local->xattr_req);
+    }
+
+    return 0;
+}
+
+/*
+ * Common implementation of removexattr (@loc set, @fd NULL) and
+ * fremovexattr (@fd set, @loc NULL); @fop says which one is being served.
+ *
+ * Directories, symlinks, inodes whose ctx reports a block size of 0 and
+ * requests from GF_CLIENT_PID_GSYNCD are tail-wound to the child untouched.
+ * For everything else a shard_local_t is prepared and
+ * shard_refresh_base_file() is asked to run
+ * shard_post_lookup_remove_xattr_handler().
+ *
+ * Always returns 0; failures are unwound with -1 and op_errno.
+ */
+int32_t
+shard_common_remove_xattr(call_frame_t *frame, xlator_t *this,
+                          glusterfs_fop_t fop, loc_t *loc, fd_t *fd,
+                          const char *name, dict_t *xdata)
+{
+    int ret = -1;
+    int op_errno = ENOMEM; /* default error for the err path */
+    uint64_t block_size = 0;
+    shard_local_t *local = NULL;
+    /* Exactly one of loc/fd is supplied by the two wrappers in this file. */
+    inode_t *inode = loc ? loc->inode : fd->inode;
+
+    if ((IA_ISDIR(inode->ia_type)) || (IA_ISLNK(inode->ia_type))) {
+        if (loc)
+            STACK_WIND_TAIL(frame, FIRST_CHILD(this),
+                            FIRST_CHILD(this)->fops->removexattr, loc, name,
+                            xdata);
+        else
+            STACK_WIND_TAIL(frame, FIRST_CHILD(this),
+                            FIRST_CHILD(this)->fops->fremovexattr, fd, name,
+                            xdata);
+        return 0;
+    }
+
+    /* If shard's special xattrs are attempted to be removed,
+     * fail the fop with EPERM (except if the client is gsyncd).
+     * The macro is handed op_errno and the err label for that purpose.
+     */
+    if (frame->root->pid != GF_CLIENT_PID_GSYNCD) {
+        GF_IF_NATIVE_XATTR_GOTO(SHARD_XATTR_PREFIX "*", name, op_errno, err);
+    }
+
+    /* Repeat the same check for bulk-removexattr; here the two shard keys
+     * are dropped from xdata rather than failing the fop. */
+    if (xdata && (frame->root->pid != GF_CLIENT_PID_GSYNCD)) {
+        dict_del(xdata, GF_XATTR_SHARD_BLOCK_SIZE);
+        dict_del(xdata, GF_XATTR_SHARD_FILE_SIZE);
+    }
+
+    ret = shard_inode_ctx_get_block_size(inode, this, &block_size);
+    if (ret) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, SHARD_MSG_INODE_CTX_GET_FAILED,
+               "Failed to get block size from inode ctx of %s",
+               uuid_utoa(inode->gfid));
+        goto err;
+    }
+
+    /* Pass-through when the block size is 0 or the caller is gsyncd. */
+    if (!block_size || frame->root->pid == GF_CLIENT_PID_GSYNCD) {
+        if (loc)
+            STACK_WIND_TAIL(frame, FIRST_CHILD(this),
+                            FIRST_CHILD(this)->fops->removexattr, loc, name,
+                            xdata);
+        else
+            STACK_WIND_TAIL(frame, FIRST_CHILD(this),
+                            FIRST_CHILD(this)->fops->fremovexattr, fd, name,
+                            xdata);
+        return 0;
+    }
+
+    local = mem_get0(this->local_pool);
+    if (!local)
+        goto err;
+
+    /* From here on the err path unwinds with frame->local attached. */
+    frame->local = local;
+    local->fop = fop;
+    if (loc) {
+        if (loc_copy(&local->loc, loc) != 0)
+            goto err;
+    }
+
+    if (fd) {
+        local->fd = fd_ref(fd);
+        local->loc.inode = inode_ref(fd->inode);
+        gf_uuid_copy(local->loc.gfid, fd->inode->gfid);
+    }
+
+    /* Keep a private copy of the name for the post-lookup handler. */
+    if (name) {
+        local->name = gf_strdup(name);
+        if (!local->name)
+            goto err;
+    }
+
+    if (xdata)
+        local->xattr_req = dict_ref(xdata);
+
+    shard_refresh_base_file(frame, this, loc, fd,
+                            shard_post_lookup_remove_xattr_handler);
+    return 0;
+err:
+    shard_common_failure_unwind(fop, frame, -1, op_errno);
+    return 0;
+}
+
+/* removexattr fop entry point (path based): delegates to
+ * shard_common_remove_xattr() with no fd. */
+int32_t
+shard_removexattr(call_frame_t *frame, xlator_t *this, loc_t *loc,
+                  const char *name, dict_t *xdata)
+{
+    /* The common helper returns 0 on every path. */
+    return shard_common_remove_xattr(frame, this, GF_FOP_REMOVEXATTR, loc, NULL,
+                                     name, xdata);
+}
+
+/* fremovexattr fop entry point (fd based): delegates to
+ * shard_common_remove_xattr() with no loc. */
+int32_t
+shard_fremovexattr(call_frame_t *frame, xlator_t *this, fd_t *fd,
+                   const char *name, dict_t *xdata)
+{
+    /* The common helper returns 0 on every path. */
+    return shard_common_remove_xattr(frame, this, GF_FOP_FREMOVEXATTR, NULL, fd,
+                                     name, xdata);
+}
+
+/*
+ * fgetxattr callback: on a successful reply to any client other than gsyncd,
+ * removes the shard block-size and file-size keys from @dict before
+ * unwinding.  Always returns 0.
+ */
+int32_t
+shard_fgetxattr_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+                    int32_t op_ret, int32_t op_errno, dict_t *dict,
+                    dict_t *xdata)
+{
+    /* Failed replies and replies without a dict are passed on untouched. */
+    if ((op_ret >= 0) && dict &&
+        (frame->root->pid != GF_CLIENT_PID_GSYNCD)) {
+        dict_del(dict, GF_XATTR_SHARD_BLOCK_SIZE);
+        dict_del(dict, GF_XATTR_SHARD_FILE_SIZE);
+    }
+
+    SHARD_STACK_UNWIND(fgetxattr, frame, op_ret, op_errno, dict, xdata);
+    return 0;
+}
+
+/*
+ * fgetxattr fop entry point.  A request from a client other than gsyncd for
+ * a name starting with SHARD_XATTR_PREFIX is failed with ENODATA; everything
+ * else is wound to the child with shard_fgetxattr_cbk() as callback.
+ * Always returns 0.
+ */
+int32_t
+shard_fgetxattr(call_frame_t *frame, xlator_t *this, fd_t *fd, const char *name,
+                dict_t *xdata)
+{
+    gf_boolean_t from_gsyncd = (frame->root->pid == GF_CLIENT_PID_GSYNCD);
+
+    if (!from_gsyncd && (name) &&
+        (strncmp(name, SHARD_XATTR_PREFIX, SLEN(SHARD_XATTR_PREFIX)) == 0)) {
+        shard_common_failure_unwind(GF_FOP_FGETXATTR, frame, -1, ENODATA);
+        return 0;
+    }
+
+    STACK_WIND(frame, shard_fgetxattr_cbk, FIRST_CHILD(this),
+               FIRST_CHILD(this)->fops->fgetxattr, fd, name, xdata);
+    return 0;
+}
+
+/*
+ * getxattr callback: on a successful reply to any client other than gsyncd,
+ * removes the shard block-size and file-size keys from @dict before
+ * unwinding.  Always returns 0.
+ */
+int32_t
+shard_getxattr_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+                   int32_t op_ret, int32_t op_errno, dict_t *dict,
+                   dict_t *xdata)
+{
+    /* Failed replies and replies without a dict are passed on untouched. */
+    if ((op_ret >= 0) && dict &&
+        (frame->root->pid != GF_CLIENT_PID_GSYNCD)) {
+        dict_del(dict, GF_XATTR_SHARD_BLOCK_SIZE);
+        dict_del(dict, GF_XATTR_SHARD_FILE_SIZE);
+    }
+
+    SHARD_STACK_UNWIND(getxattr, frame, op_ret, op_errno, dict, xdata);
+    return 0;
+}
+
+/*
+ * getxattr fop entry point.  A request from a client other than gsyncd for a
+ * name starting with SHARD_XATTR_PREFIX is failed with ENODATA; everything
+ * else is wound to the child with shard_getxattr_cbk() as callback.
+ * Always returns 0.
+ */
+int32_t
+shard_getxattr(call_frame_t *frame, xlator_t *this, loc_t *loc,
+               const char *name, dict_t *xdata)
+{
+    gf_boolean_t from_gsyncd = (frame->root->pid == GF_CLIENT_PID_GSYNCD);
+
+    if (!from_gsyncd && (name) &&
+        (strncmp(name, SHARD_XATTR_PREFIX,
+                 sizeof(SHARD_XATTR_PREFIX) - 1) == 0)) {
+        shard_common_failure_unwind(GF_FOP_GETXATTR, frame, -1, ENODATA);
+        return 0;
+    }
+
+    STACK_WIND(frame, shard_getxattr_cbk, FIRST_CHILD(this),
+               FIRST_CHILD(this)->fops->getxattr, loc, name, xdata);
+    return 0;
+}
+
+/*
+ * Shared callback of setxattr and fsetxattr.
+ *
+ * On success the GF_PRESTAT and GF_POSTSTAT iatts in @xdata (if present) are
+ * rewritten via shard_modify_and_set_iatt_in_dict() and the fop is unwound;
+ * any failure is unwound through shard_common_failure_unwind() with the
+ * values stored in local.  Always returns 0.
+ */
+int32_t
+shard_common_set_xattr_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+                           int32_t op_ret, int32_t op_errno, dict_t *xdata)
+{
+    shard_local_t *local = frame->local;
+
+    if (op_ret < 0) {
+        local->op_ret = op_ret;
+        local->op_errno = op_errno;
+        goto err;
+    }
+
+    /* PRESTAT is handled first; POSTSTAT only if PRESTAT succeeded. */
+    if ((shard_modify_and_set_iatt_in_dict(xdata, local, GF_PRESTAT) < 0) ||
+        (shard_modify_and_set_iatt_in_dict(xdata, local, GF_POSTSTAT) < 0))
+        goto err;
+
+    /* An fd in local means the request came in as fsetxattr. */
+    if (local->fd) {
+        SHARD_STACK_UNWIND(fsetxattr, frame, local->op_ret, local->op_errno,
+                           xdata);
+    } else {
+        SHARD_STACK_UNWIND(setxattr, frame, local->op_ret, local->op_errno,
+                           xdata);
+    }
+    return 0;
+
+err:
+    shard_common_failure_unwind(local->fop, frame, local->op_ret,
+                                local->op_errno);
+    return 0;
+}
+
+/*
+ * Post-lookup handler for (f)setxattr, registered by
+ * shard_common_set_xattr().  Unwinds the error stored in local, if any;
+ * otherwise winds fsetxattr (fd present) or setxattr (path based) to the
+ * child.  local->xattr_req carries the xattr dict and local->xattr_rsp the
+ * caller's xdata.  Always returns 0.
+ */
+int32_t
+shard_post_lookup_set_xattr_handler(call_frame_t *frame, xlator_t *this)
+{
+    shard_local_t *local = frame->local;
+
+    if (local->op_ret < 0) {
+        shard_common_failure_unwind(local->fop, frame, local->op_ret,
+                                    local->op_errno);
+    } else if (local->fd) {
+        STACK_WIND(frame, shard_common_set_xattr_cbk, FIRST_CHILD(this),
+                   FIRST_CHILD(this)->fops->fsetxattr, local->fd,
+                   local->xattr_req, local->flags, local->xattr_rsp);
+    } else {
+        STACK_WIND(frame, shard_common_set_xattr_cbk, FIRST_CHILD(this),
+                   FIRST_CHILD(this)->fops->setxattr, &local->loc,
+                   local->xattr_req, local->flags, local->xattr_rsp);
+    }
+
+    return 0;
+}
+
+/*
+ * Common implementation of setxattr (@loc set, @fd NULL) and fsetxattr
+ * (@fd set, @loc NULL); @fop says which one is being served.
+ *
+ * Directories, symlinks, inodes whose ctx reports a block size of 0 and
+ * requests from GF_CLIENT_PID_GSYNCD are tail-wound to the child untouched.
+ * For everything else a shard_local_t is prepared and
+ * shard_refresh_base_file() is asked to run
+ * shard_post_lookup_set_xattr_handler().
+ *
+ * Always returns 0; failures are unwound with -1 and op_errno.
+ */
+int32_t
+shard_common_set_xattr(call_frame_t *frame, xlator_t *this, glusterfs_fop_t fop,
+                       loc_t *loc, fd_t *fd, dict_t *dict, int32_t flags,
+                       dict_t *xdata)
+{
+    int ret = -1;
+    int op_errno = ENOMEM; /* default error for the err path */
+    uint64_t block_size = 0;
+    shard_local_t *local = NULL;
+    /* Exactly one of loc/fd is supplied by the two wrappers in this file. */
+    inode_t *inode = loc ? loc->inode : fd->inode;
+
+    if ((IA_ISDIR(inode->ia_type)) || (IA_ISLNK(inode->ia_type))) {
+        if (loc)
+            STACK_WIND_TAIL(frame, FIRST_CHILD(this),
+                            FIRST_CHILD(this)->fops->setxattr, loc, dict, flags,
+                            xdata);
+        else
+            STACK_WIND_TAIL(frame, FIRST_CHILD(this),
+                            FIRST_CHILD(this)->fops->fsetxattr, fd, dict, flags,
+                            xdata);
+        return 0;
+    }
+
+    /* Sharded or not, if shard's special xattrs are attempted to be set,
+     * fail the fop with EPERM (except if the client is gsyncd).
+     * The macro is handed op_errno and the err label for that purpose.
+     */
+    if (frame->root->pid != GF_CLIENT_PID_GSYNCD) {
+        GF_IF_INTERNAL_XATTR_GOTO(SHARD_XATTR_PREFIX "*", dict, op_errno, err);
+    }
+
+    ret = shard_inode_ctx_get_block_size(inode, this, &block_size);
+    if (ret) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, SHARD_MSG_INODE_CTX_GET_FAILED,
+               "Failed to get block size from inode ctx of %s",
+               uuid_utoa(inode->gfid));
+        goto err;
+    }
+
+    /* Pass-through when the block size is 0 or the caller is gsyncd. */
+    if (!block_size || frame->root->pid == GF_CLIENT_PID_GSYNCD) {
+        if (loc)
+            STACK_WIND_TAIL(frame, FIRST_CHILD(this),
+                            FIRST_CHILD(this)->fops->setxattr, loc, dict, flags,
+                            xdata);
+        else
+            STACK_WIND_TAIL(frame, FIRST_CHILD(this),
+                            FIRST_CHILD(this)->fops->fsetxattr, fd, dict, flags,
+                            xdata);
+        return 0;
+    }
+
+    local = mem_get0(this->local_pool);
+    if (!local)
+        goto err;
+
+    /* From here on the err path unwinds with frame->local attached. */
+    frame->local = local;
+    local->fop = fop;
+    if (loc) {
+        if (loc_copy(&local->loc, loc) != 0)
+            goto err;
+    }
+
+    if (fd) {
+        local->fd = fd_ref(fd);
+        local->loc.inode = inode_ref(fd->inode);
+        gf_uuid_copy(local->loc.gfid, fd->inode->gfid);
+    }
+    local->flags = flags;
+    /* Reusing local->xattr_req and local->xattr_rsp to store the setxattr dict
+     * and the xdata dict
+     */
+    if (dict)
+        local->xattr_req = dict_ref(dict);
+    if (xdata)
+        local->xattr_rsp = dict_ref(xdata);
+
+    shard_refresh_base_file(frame, this, loc, fd,
+                            shard_post_lookup_set_xattr_handler);
+    return 0;
+err:
+    shard_common_failure_unwind(fop, frame, -1, op_errno);
+    return 0;
+}
+
+/* fsetxattr fop entry point (fd based): delegates to
+ * shard_common_set_xattr() with no loc. */
+int32_t
+shard_fsetxattr(call_frame_t *frame, xlator_t *this, fd_t *fd, dict_t *dict,
+                int32_t flags, dict_t *xdata)
+{
+    /* The common helper returns 0 on every path. */
+    return shard_common_set_xattr(frame, this, GF_FOP_FSETXATTR, NULL, fd, dict,
+                                  flags, xdata);
+}
+
+/* setxattr fop entry point (path based): delegates to
+ * shard_common_set_xattr() with no fd. */
+int32_t
+shard_setxattr(call_frame_t *frame, xlator_t *this, loc_t *loc, dict_t *dict,
+               int32_t flags, dict_t *xdata)
+{
+    /* The common helper returns 0 on every path. */
+    return shard_common_set_xattr(frame, this, GF_FOP_SETXATTR, loc, NULL, dict,
+                                  flags, xdata);
+}
+
+/*
+ * Completion handler for setattr/fsetattr, invoked through local->handler by
+ * shard_common_setattr_cbk().  On success the inode ctx is refreshed from
+ * local->postbuf; the fop is then unwound with the values kept in local.
+ * Any other local->fop value is ignored.  Always returns 0.
+ */
+int
+shard_post_setattr_handler(call_frame_t *frame, xlator_t *this)
+{
+    shard_local_t *local = frame->local;
+
+    switch (local->fop) {
+        case GF_FOP_SETATTR:
+            /* Path-based variant: the inode comes from local->loc. */
+            if (local->op_ret >= 0)
+                shard_inode_ctx_set(local->loc.inode, this, &local->postbuf, 0,
+                                    SHARD_LOOKUP_MASK);
+            SHARD_STACK_UNWIND(setattr, frame, local->op_ret, local->op_errno,
+                               &local->prebuf, &local->postbuf,
+                               local->xattr_rsp);
+            break;
+        case GF_FOP_FSETATTR:
+            /* fd-based variant: the inode comes from local->fd. */
+            if (local->op_ret >= 0)
+                shard_inode_ctx_set(local->fd->inode, this, &local->postbuf, 0,
+                                    SHARD_LOOKUP_MASK);
+            SHARD_STACK_UNWIND(fsetattr, frame, local->op_ret, local->op_errno,
+                               &local->prebuf, &local->postbuf,
+                               local->xattr_rsp);
+            break;
+        default:
+            break;
+    }
+
+    return 0;
+}
+
+/*
+ * Shared callback of setattr and fsetattr.
+ *
+ * Records the outcome in local: on failure the error pair, on success the
+ * pre/post iatts with size and block count taken from the value computed by
+ * shard_modify_size_and_block_count() on @prebuf, plus a ref on @xdata.
+ * local->handler is then invoked in every case.  Always returns 0.
+ */
+int
+shard_common_setattr_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+                         int32_t op_ret, int32_t op_errno, struct iatt *prebuf,
+                         struct iatt *postbuf, dict_t *xdata)
+{
+    shard_local_t *local = frame->local;
+
+    if (op_ret < 0) {
+        local->op_ret = op_ret;
+        local->op_errno = op_errno;
+    } else {
+        local->prebuf = *prebuf;
+        if (shard_modify_size_and_block_count(&local->prebuf, xdata)) {
+            /* The size fix-up failed; report it as EINVAL. */
+            local->op_ret = -1;
+            local->op_errno = EINVAL;
+        } else {
+            if (xdata)
+                local->xattr_rsp = dict_ref(xdata);
+            /* postbuf reports the same size/blocks as the adjusted prebuf. */
+            local->postbuf = *postbuf;
+            local->postbuf.ia_size = local->prebuf.ia_size;
+            local->postbuf.ia_blocks = local->prebuf.ia_blocks;
+        }
+    }
+
+    local->handler(frame, this);
+    return 0;
+}
+
+/*
+ * setattr fop entry point.
+ *
+ * Directories, symlinks, inodes whose ctx reports a block size of 0 and
+ * requests from GF_CLIENT_PID_GSYNCD are wound to the child with the default
+ * callback.  Otherwise a shard_local_t is prepared (handler, request dict,
+ * copy of @loc) and the setattr is wound with shard_common_setattr_cbk() as
+ * callback.
+ *
+ * Always returns 0; every failure in this function is unwound with
+ * -1/ENOMEM.
+ */
+int
+shard_setattr(call_frame_t *frame, xlator_t *this, loc_t *loc,
+              struct iatt *stbuf, int32_t valid, dict_t *xdata)
+{
+    int ret = -1;
+    uint64_t block_size = 0;
+    shard_local_t *local = NULL;
+
+    if ((IA_ISDIR(loc->inode->ia_type)) || (IA_ISLNK(loc->inode->ia_type))) {
+        STACK_WIND(frame, default_setattr_cbk, FIRST_CHILD(this),
+                   FIRST_CHILD(this)->fops->setattr, loc, stbuf, valid, xdata);
+        return 0;
+    }
+
+    ret = shard_inode_ctx_get_block_size(loc->inode, this, &block_size);
+    if (ret) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, SHARD_MSG_INODE_CTX_GET_FAILED,
+               "Failed to get block size from inode ctx of %s",
+               uuid_utoa(loc->inode->gfid));
+        goto err;
+    }
+
+    /* Pass-through when the block size is 0 or the caller is gsyncd. */
+    if (!block_size || frame->root->pid == GF_CLIENT_PID_GSYNCD) {
+        STACK_WIND(frame, default_setattr_cbk, FIRST_CHILD(this),
+                   FIRST_CHILD(this)->fops->setattr, loc, stbuf, valid, xdata);
+        return 0;
+    }
+
+    local = mem_get0(this->local_pool);
+    if (!local)
+        goto err;
+
+    /* From here on the err path unwinds with frame->local attached. */
+    frame->local = local;
+
+    local->handler = shard_post_setattr_handler;
+    local->xattr_req = (xdata) ? dict_ref(xdata) : dict_new();
+    if (!local->xattr_req)
+        goto err;
+    local->fop = GF_FOP_SETATTR;
+    /* shard_post_setattr_handler() reads local->loc.inode and the macro
+     * below reads local->loc.gfid, so a failed copy must stop the fop here
+     * (the xattr helpers in this file check loc_copy() the same way). */
+    if (loc_copy(&local->loc, loc) != 0)
+        goto err;
+
+    SHARD_MD_READ_FOP_INIT_REQ_DICT(this, local->xattr_req, local->loc.gfid,
+                                    local, err);
+
+    STACK_WIND(frame, shard_common_setattr_cbk, FIRST_CHILD(this),
+               FIRST_CHILD(this)->fops->setattr, loc, stbuf, valid,
+               local->xattr_req);
+    return 0;
+err:
+    shard_common_failure_unwind(GF_FOP_SETATTR, frame, -1, ENOMEM);
+    return 0;
+}
+
+/*
+ * fsetattr fop entry point.
+ *
+ * Directories, symlinks, inodes whose ctx reports a block size of 0 and
+ * requests from GF_CLIENT_PID_GSYNCD are wound to the child with the default
+ * callback.  Otherwise a shard_local_t is prepared (handler, request dict,
+ * fd ref) and the fsetattr is wound with shard_common_setattr_cbk() as
+ * callback.
+ *
+ * Always returns 0; every failure in this function is unwound with
+ * -1/ENOMEM.
+ */
+int
+shard_fsetattr(call_frame_t *frame, xlator_t *this, fd_t *fd,
+               struct iatt *stbuf, int32_t valid, dict_t *xdata)
+{
+    int ret = -1;
+    uint64_t block_size = 0;
+    shard_local_t *local = NULL;
+
+    if ((IA_ISDIR(fd->inode->ia_type)) || (IA_ISLNK(fd->inode->ia_type))) {
+        STACK_WIND(frame, default_fsetattr_cbk, FIRST_CHILD(this),
+                   FIRST_CHILD(this)->fops->fsetattr, fd, stbuf, valid, xdata);
+        return 0;
+    }
+
+    ret = shard_inode_ctx_get_block_size(fd->inode, this, &block_size);
+    if (ret) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, SHARD_MSG_INODE_CTX_GET_FAILED,
+               "Failed to get block size from inode ctx of %s",
+               uuid_utoa(fd->inode->gfid));
+        goto err;
+    }
+
+    /* Pass-through when the block size is 0 or the caller is gsyncd. */
+    if (!block_size || frame->root->pid == GF_CLIENT_PID_GSYNCD) {
+        STACK_WIND(frame, default_fsetattr_cbk, FIRST_CHILD(this),
+                   FIRST_CHILD(this)->fops->fsetattr, fd, stbuf, valid, xdata);
+        return 0;
+    }
+
+    /* Remember the inode table the first time one is seen. */
+    if (!this->itable)
+        this->itable = fd->inode->table;
+
+    local = mem_get0(this->local_pool);
+    if (!local)
+        goto err;
+
+    /* From here on the err path unwinds with frame->local attached. */
+    frame->local = local;
+
+    local->handler = shard_post_setattr_handler;
+    /* Take a ref on the caller's xdata, or create a fresh dict if none. */
+    local->xattr_req = (xdata) ? dict_ref(xdata) : dict_new();
+    if (!local->xattr_req)
+        goto err;
+    local->fop = GF_FOP_FSETATTR;
+    local->fd = fd_ref(fd);
+
+    /* The macro is handed the err label and may jump there. */
+    SHARD_MD_READ_FOP_INIT_REQ_DICT(this, local->xattr_req, fd->inode->gfid,
+                                    local, err);
+
+    STACK_WIND(frame, shard_common_setattr_cbk, FIRST_CHILD(this),
+               FIRST_CHILD(this)->fops->fsetattr, fd, stbuf, valid,
+               local->xattr_req);
+    return 0;
+err:
+    shard_common_failure_unwind(GF_FOP_FSETATTR, frame, -1, ENOMEM);
+    return 0;
+}
+
+/*
+ * Common first stage of the write-type fops: writev, fallocate, zerofill and
+ * discard (@fop tells which).
+ *
+ * @vector/@count/@iobref are used by writev only; the other fops pass NULL/0
+ * and describe the range with @offset and @len.  @flags carries the writev
+ * flags or the fallocate mode.
+ *
+ * When the inode ctx reports a block size of 0, or the caller is
+ * GF_CLIENT_PID_GSYNCD, the original fop is tail-wound to the child.
+ * Otherwise a shard_local_t describing the request is built and
+ * shard_refresh_base_file() is asked to run
+ * shard_common_inode_write_post_lookup_handler().
+ *
+ * Always returns 0; every failure in this function is unwound with
+ * -1/ENOMEM.
+ */
+int
+shard_common_inode_write_begin(call_frame_t *frame, xlator_t *this,
+                               glusterfs_fop_t fop, fd_t *fd,
+                               struct iovec *vector, int32_t count,
+                               off_t offset, uint32_t flags, size_t len,
+                               struct iobref *iobref, dict_t *xdata)
+{
+    int ret = 0;
+    int i = 0;
+    uint64_t block_size = 0;
+    shard_local_t *local = NULL;
+
+    ret = shard_inode_ctx_get_block_size(fd->inode, this, &block_size);
+    if (ret) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, SHARD_MSG_INODE_CTX_GET_FAILED,
+               "Failed to get block "
+               "size for %s from its inode ctx",
+               uuid_utoa(fd->inode->gfid));
+        goto out;
+    }
+
+    if (!block_size || frame->root->pid == GF_CLIENT_PID_GSYNCD) {
+        /* block_size = 0 means that the file was created before
+         * sharding was enabled on the volume.
+         */
+        switch (fop) {
+            case GF_FOP_WRITE:
+                STACK_WIND_TAIL(frame, FIRST_CHILD(this),
+                                FIRST_CHILD(this)->fops->writev, fd, vector,
+                                count, offset, flags, iobref, xdata);
+                break;
+            case GF_FOP_FALLOCATE:
+                STACK_WIND_TAIL(frame, FIRST_CHILD(this),
+                                FIRST_CHILD(this)->fops->fallocate, fd, flags,
+                                offset, len, xdata);
+                break;
+            case GF_FOP_ZEROFILL:
+                STACK_WIND_TAIL(frame, FIRST_CHILD(this),
+                                FIRST_CHILD(this)->fops->zerofill, fd, offset,
+                                len, xdata);
+                break;
+            case GF_FOP_DISCARD:
+                STACK_WIND_TAIL(frame, FIRST_CHILD(this),
+                                FIRST_CHILD(this)->fops->discard, fd, offset,
+                                len, xdata);
+                break;
+            default:
+                /* NOTE(review): this branch only logs; the frame is neither
+                 * wound nor unwound here - confirm no caller can pass
+                 * another fop id. */
+                gf_msg(this->name, GF_LOG_WARNING, 0, SHARD_MSG_INVALID_FOP,
+                       "Invalid fop id = %d", fop);
+                break;
+        }
+        return 0;
+    }
+
+    /* Remember the inode table the first time one is seen. */
+    if (!this->itable)
+        this->itable = fd->inode->table;
+
+    local = mem_get0(this->local_pool);
+    if (!local)
+        goto out;
+
+    /* From here on the out path unwinds with frame->local attached. */
+    frame->local = local;
+
+    ret = syncbarrier_init(&local->barrier);
+    if (ret)
+        goto out;
+    /* Take a ref on the caller's xdata, or create a fresh dict if none. */
+    local->xattr_req = (xdata) ? dict_ref(xdata) : dict_new();
+    if (!local->xattr_req)
+        goto out;
+
+    /* total_size is the sum of the iovec lengths for writev and @len for the
+     * range-based fops. */
+    if (vector) {
+        local->vector = iov_dup(vector, count);
+        if (!local->vector)
+            goto out;
+        for (i = 0; i < count; i++)
+            local->total_size += vector[i].iov_len;
+        local->count = count;
+    } else {
+        local->total_size = len;
+    }
+
+    local->fop = fop;
+    local->offset = offset;
+    local->flags = flags;
+    if (iobref)
+        local->iobref = iobref_ref(iobref);
+    local->fd = fd_ref(fd);
+    local->block_size = block_size;
+    /* No extra inode ref is taken for resolver_base_inode here. */
+    local->resolver_base_inode = local->fd->inode;
+    GF_ATOMIC_INIT(local->delta_blocks, 0);
+
+    /* Describe the base file by inode and gfid only. */
+    local->loc.inode = inode_ref(fd->inode);
+    gf_uuid_copy(local->loc.gfid, fd->inode->gfid);
+
+    shard_refresh_base_file(frame, this, NULL, fd,
+                            shard_common_inode_write_post_lookup_handler);
+    return 0;
+out:
+    shard_common_failure_unwind(fop, frame, -1, ENOMEM);
+    return 0;
+}
+
+/* writev fop entry point: delegates to shard_common_inode_write_begin()
+ * with the iovec and a zero range length. */
+int
+shard_writev(call_frame_t *frame, xlator_t *this, fd_t *fd,
+             struct iovec *vector, int32_t count, off_t offset, uint32_t flags,
+             struct iobref *iobref, dict_t *xdata)
+{
+    /* The common helper returns 0 on every path. */
+    return shard_common_inode_write_begin(frame, this, GF_FOP_WRITE, fd, vector,
+                                          count, offset, flags, 0, iobref,
+                                          xdata);
+}
+
+/*
+ * fallocate fop entry point.  Only three mode values are accepted: 0,
+ * FALLOC_FL_ZERO_RANGE, and FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE; any
+ * other value is failed with ENOTSUP.  Accepted requests go to
+ * shard_common_inode_write_begin() with the mode passed in its flags
+ * argument.  Always returns 0.
+ */
+int
+shard_fallocate(call_frame_t *frame, xlator_t *this, fd_t *fd,
+                int32_t keep_size, off_t offset, size_t len, dict_t *xdata)
+{
+    gf_boolean_t mode_supported = _gf_false;
+
+    if ((keep_size == 0) || (keep_size == FALLOC_FL_ZERO_RANGE) ||
+        (keep_size == (FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE)))
+        mode_supported = _gf_true;
+
+    if (!mode_supported) {
+        shard_common_failure_unwind(GF_FOP_FALLOCATE, frame, -1, ENOTSUP);
+        return 0;
+    }
+
+    shard_common_inode_write_begin(frame, this, GF_FOP_FALLOCATE, fd, NULL, 0,
+                                   offset, keep_size, len, NULL, xdata);
+    return 0;
+}
+
+int
+shard_zerofill(call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset,
+               off_t len, dict_t *xdata)
+{ /* ZEROFILL fop: no iovec, so the range length is passed as len. */
+    shard_common_inode_write_begin(frame, this, GF_FOP_ZEROFILL, fd, NULL, 0,
+                                   offset, 0, len, NULL, xdata);
+    return 0; /* always 0; the common path unwinds the frame on failure */
+}
+
+int
+shard_discard(call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset,
+              size_t len, dict_t *xdata)
+{ /* DISCARD fop: no iovec, so the range length is passed as len. */
+    shard_common_inode_write_begin(frame, this, GF_FOP_DISCARD, fd, NULL, 0,
+                                   offset, 0, len, NULL, xdata);
+    return 0; /* always 0; the common path unwinds the frame on failure */
+}
+
+int32_t
+shard_seek(call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset,
+           gf_seek_what_t what, dict_t *xdata)
+{
+    /* TBD: seek is not implemented; log the attempt and unwind with ENOTSUP. */
+    gf_msg(this->name, GF_LOG_INFO, ENOTSUP, SHARD_MSG_FOP_NOT_SUPPORTED,
+           "seek called on %s.", uuid_utoa(fd->inode->gfid));
+    shard_common_failure_unwind(GF_FOP_SEEK, frame, -1, ENOTSUP);
+    return 0; /* offset, what and xdata are not consulted */
+}
+
+int32_t
+mem_acct_init(xlator_t *this)
+{
+    int ret = -1;
+
+    if (!this)
+        return ret;
+
+    /* Size the accounting table from shard's mem-type enum terminator. */
+    ret = xlator_mem_acct_init(this, gf_shard_mt_end + 1);
+    if (ret != 0) {
+        /* Adjacent literals are concatenated: keep the separating space. */
+        gf_msg(this->name, GF_LOG_ERROR, 0, SHARD_MSG_MEM_ACCT_INIT_FAILED,
+               "Memory accounting init "
+               "failed");
+    }
+
+    return ret; /* 0 on success, otherwise xlator_mem_acct_init()'s error */
+}
+
+int
+init(xlator_t *this)
+{
+    int ret = -1;
+    shard_priv_t *priv = NULL;
+
+    if (!this) {
+        gf_msg("shard", GF_LOG_ERROR, 0, SHARD_MSG_NULL_THIS,
+               "this is NULL. init() failed");
+        return -1;
+    }
+
+    if (!this->parents) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, SHARD_MSG_INVALID_VOLFILE,
+               "Dangling volume. Check volfile");
+        goto out;
+    }
+
+    if (!this->children || this->children->next) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, SHARD_MSG_INVALID_VOLFILE,
+               "shard not configured with exactly one sub-volume. "
+               "Check volfile");
+        goto out;
+    }
+
+    priv = GF_CALLOC(1, sizeof(shard_priv_t), gf_shard_mt_priv_t);
+    if (!priv)
+        goto out;
+
+    /* Load the configured tunables into the freshly allocated private. */
+    GF_OPTION_INIT("shard-block-size", priv->block_size, size_uint64, out);
+    GF_OPTION_INIT("shard-deletion-rate", priv->deletion_rate, uint32, out);
+    GF_OPTION_INIT("shard-lru-limit", priv->lru_limit, uint64, out);
+
+    this->local_pool = mem_pool_new(shard_local_t, 128);
+    if (!this->local_pool) {
+        ret = -1;
+        goto out;
+    }
+    gf_uuid_parse(SHARD_ROOT_GFID, priv->dot_shard_gfid);
+    gf_uuid_parse(DOT_SHARD_REMOVE_ME_GFID, priv->dot_shard_rm_gfid);
+
+    this->private = priv;
+    LOCK_INIT(&priv->lock);
+    INIT_LIST_HEAD(&priv->ilist_head);
+    ret = 0;
+out:
+    if (ret) {
+        GF_FREE(priv);
+        mem_pool_destroy(this->local_pool);
+        this->local_pool = NULL; /* as in fini(): leave no dangling pointer */
+    }
+
+    return ret;
+}
+
+void
+fini(xlator_t *this)
+{
+    shard_priv_t *shard_priv = NULL;
+
+    GF_VALIDATE_OR_GOTO("shard", this, out);
+
+    /* shard did not create the itable, so only clear the pointer here. */
+    this->itable = NULL;
+
+    mem_pool_destroy(this->local_pool);
+    this->local_pool = NULL;
+
+    /* Detach the private state from the xlator before tearing it down. */
+    shard_priv = this->private;
+    if (shard_priv) {
+        this->private = NULL;
+        LOCK_DESTROY(&shard_priv->lock);
+        GF_FREE(shard_priv);
+    }
+
+out:
+    return;
+}
+
+int
+reconfigure(xlator_t *this, dict_t *options)
+{
+    int ret = -1;
+    shard_priv_t *priv = NULL;
+
+    priv = this->private;
+    /* NOTE(review): init() reads this key as size_uint64, here it is 'size'. */
+    GF_OPTION_RECONF("shard-block-size", priv->block_size, options, size, out);
+    /* "shard-lru-limit" is loaded in init() but is not re-read here. */
+    GF_OPTION_RECONF("shard-deletion-rate", priv->deletion_rate, options,
+                     uint32, out);
+    ret = 0;
+
+out:
+    return ret; /* 0 once both options were re-read, -1 otherwise */
+}
+
+int
+shard_forget(xlator_t *this, inode_t *inode)
+{
+    uint64_t raw_ctx = 0;
+    shard_inode_ctx_t *inode_ctx = NULL;
+    shard_priv_t *shard_priv = this->private;
+
+    if (!shard_priv)
+        return 0;
+
+    /* Remove shard's context from the inode; done if none was stored. */
+    inode_ctx_del(inode, this, &raw_ctx);
+    if (raw_ctx == 0)
+        return 0;
+
+    inode_ctx = (shard_inode_ctx_t *)(uintptr_t)raw_ctx;
+
+    /* An inode forced out of the inode table on reaching the LRU limit
+     * must leave shard's own LRU list too, keeping inode_count in step.
+     */
+    if (!list_empty(&inode_ctx->ilist)) {
+        LOCK(&shard_priv->lock);
+        {
+            list_del_init(&inode_ctx->ilist);
+            shard_priv->inode_count--;
+        }
+        UNLOCK(&shard_priv->lock);
+    }
+    GF_FREE(inode_ctx);
+
+    return 0;
+}
+
+int
+shard_release(xlator_t *this, fd_t *fd)
+{
+    /* TBD: nothing is released for the fd here yet; always reports success. */
+    return 0;
+}
+
+int
+shard_priv_dump(xlator_t *this)
+{
+    shard_priv_t *priv = NULL;
+    char key_prefix[GF_DUMP_MAX_BUF_LEN] = {0};
+    char *str = NULL;
+
+    priv = this->private;
+    if (!priv) /* fini() clears this->private; nothing to dump then */
+        return 0;
+
+    snprintf(key_prefix, GF_DUMP_MAX_BUF_LEN, "%s.%s", this->type, this->name);
+    gf_proc_dump_add_section("%s", key_prefix); /* section "<type>.<name>" */
+    str = gf_uint64_2human_readable(priv->block_size);
+    gf_proc_dump_write("shard-block-size", "%s", str);
+    gf_proc_dump_write("inode-count", "%d", priv->inode_count);
+    gf_proc_dump_write("ilist_head", "%p", &priv->ilist_head);
+    gf_proc_dump_write("lru-max-limit", "%" PRIu64, priv->lru_limit);
+
+    GF_FREE(str); /* the human-readable string is owned by this function */
+
+    return 0;
+}
+
+int
+shard_releasedir(xlator_t *this, fd_t *fd)
+{
+    return 0; /* no work is done for directory fds; always reports success */
+}
+
+struct xlator_fops fops = { /* file operations implemented by shard */
+    .lookup = shard_lookup,
+    .open = shard_open,
+    .flush = shard_flush,
+    .fsync = shard_fsync,
+    .stat = shard_stat,
+    .fstat = shard_fstat,
+    .getxattr = shard_getxattr,
+    .fgetxattr = shard_fgetxattr,
+    .readv = shard_readv,
+    .writev = shard_writev,
+    .truncate = shard_truncate,
+    .ftruncate = shard_ftruncate,
+    .setxattr = shard_setxattr,
+    .fsetxattr = shard_fsetxattr,
+    .setattr = shard_setattr,
+    .fsetattr = shard_fsetattr,
+    .removexattr = shard_removexattr,
+    .fremovexattr = shard_fremovexattr,
+    .fallocate = shard_fallocate,
+    .discard = shard_discard,
+    .zerofill = shard_zerofill,
+    .readdir = shard_readdir,
+    .readdirp = shard_readdirp,
+    .create = shard_create,
+    .mknod = shard_mknod,
+    .link = shard_link,
+    .unlink = shard_unlink,
+    .rename = shard_rename,
+    .seek = shard_seek, /* stub: always unwinds with ENOTSUP */
+};
+
+struct xlator_cbks cbks = { /* forget/release callbacks of shard */
+    .forget = shard_forget, /* frees shard's per-inode context */
+    .release = shard_release, /* currently a no-op */
+    .releasedir = shard_releasedir, /* currently a no-op */
+};
+
+struct xlator_dumpops dumpops = { /* state-dump hooks of shard */
+    .priv = shard_priv_dump, /* writes block size, inode count, LRU limit */
+};
+
+struct volume_options options[] = { /* tunables understood by shard */
+    {
+        .key = {"shard"},
+        .type = GF_OPTION_TYPE_BOOL,
+        .default_value = "off",
+        .description = "enable/disable shard",
+        .op_version = {GD_OP_VERSION_6_0},
+        .flags = OPT_FLAG_SETTABLE,
+    },
+    {
+        .key = {"shard-block-size"}, /* loaded as size_uint64 in init() */
+        .type = GF_OPTION_TYPE_SIZET,
+        .op_version = {GD_OP_VERSION_3_7_0},
+        .flags = OPT_FLAG_SETTABLE | OPT_FLAG_CLIENT_OPT | OPT_FLAG_DOC,
+        .tags = {"shard"},
+        .default_value = "64MB",
+        .min = SHARD_MIN_BLOCK_SIZE,
+        .max = SHARD_MAX_BLOCK_SIZE,
+        .description = "The size unit used to break a file into multiple "
+                       "chunks",
+    },
+    {
+        .key = {"shard-deletion-rate"}, /* loaded as uint32 in init() */
+        .type = GF_OPTION_TYPE_INT,
+        .op_version = {GD_OP_VERSION_5_0},
+        .flags = OPT_FLAG_SETTABLE | OPT_FLAG_CLIENT_OPT | OPT_FLAG_DOC,
+        .tags = {"shard"},
+        .default_value = "100",
+        .min = 100,
+        .max = INT_MAX,
+        .description = "The number of shards to send deletes on at a time",
+    },
+    {
+        .key = {"shard-lru-limit"}, /* NOTE(review): init() loads as uint64 */
+        .type = GF_OPTION_TYPE_INT,
+        .op_version = {GD_OP_VERSION_5_0},
+        .flags = OPT_FLAG_SETTABLE | OPT_FLAG_CLIENT_OPT,
+        .tags = {"shard"},
+        .default_value = "16384",
+        .min = 20,
+        .max = INT_MAX,
+        .description = "The number of resolved shard inodes to keep in "
+                       "memory. A higher number means shards that are "
+                       "resolved will remain in memory longer, avoiding "
+                       "frequent lookups on them when they participate in "
+                       "file operations. The option also has a bearing on "
+                       "amount of memory consumed by these inodes and their "
+                       "internal metadata",
+    },
+    {.key = {NULL}}, /* NULL-key entry closing the table */
+};
+
+xlator_api_t xlator_api = { /* entry points and tables exported by shard */
+    .init = init,
+    .fini = fini,
+    .reconfigure = reconfigure,
+    .mem_acct_init = mem_acct_init,
+    .op_version = {1}, /* Present from the initial version */
+    .dumpops = &dumpops,
+    .fops = &fops,
+    .cbks = &cbks,
+    .options = options,
+    .identifier = "shard",
+    .category = GF_MAINTAINED,
+};
diff --git a/xlators/features/shard/src/shard.h b/xlators/features/shard/src/shard.h
new file mode 100644
index 00000000000..4fe181b64d5
--- /dev/null
+++ b/xlators/features/shard/src/shard.h
@@ -0,0 +1,348 @@
+/*
+  Copyright (c) 2015 Red Hat, Inc. <http://www.redhat.com>
+  This file is part of GlusterFS.
+
+  This file is licensed to you under your choice of the GNU Lesser
+  General Public License, version 3 or any later version (LGPLv3 or
+  later), or the GNU General Public License, version 2 (GPLv2), in all
+  cases as published by the Free Software Foundation.
+*/
+
+#ifndef __SHARD_H__
+#define __SHARD_H__
+
+#include <glusterfs/xlator.h>
+#include <glusterfs/compat-errno.h>
+#include "shard-messages.h"
+#include <glusterfs/syncop.h>
+
+#define GF_SHARD_DIR ".shard"
+#define GF_SHARD_REMOVE_ME_DIR ".remove_me"
+#define SHARD_MIN_BLOCK_SIZE (4 * GF_UNIT_MB)
+#define SHARD_MAX_BLOCK_SIZE (4 * GF_UNIT_TB)
+#define SHARD_XATTR_PREFIX "trusted.glusterfs.shard."
+#define GF_XATTR_SHARD_BLOCK_SIZE "trusted.glusterfs.shard.block-size"
+/**
+ *  Bit masks for the valid flag, which is used while updating ctx
+ **/
+#define SHARD_MASK_BLOCK_SIZE (1 << 0)
+#define SHARD_MASK_PROT (1 << 1)
+#define SHARD_MASK_NLINK (1 << 2)
+#define SHARD_MASK_UID (1 << 3)
+#define SHARD_MASK_GID (1 << 4)
+#define SHARD_MASK_SIZE (1 << 6) /* note: bit 5 is not assigned */
+#define SHARD_MASK_BLOCKS (1 << 7)
+#define SHARD_MASK_TIMES (1 << 8)
+#define SHARD_MASK_OTHERS (1 << 9)
+#define SHARD_MASK_REFRESH_RESET (1 << 10) /* not part of SHARD_ALL_MASK */
+
+#define SHARD_INODE_WRITE_MASK                                                 \
+    (SHARD_MASK_SIZE | SHARD_MASK_BLOCKS | SHARD_MASK_TIMES)
+
+#define SHARD_LOOKUP_MASK                                                      \
+    (SHARD_MASK_PROT | SHARD_MASK_NLINK | SHARD_MASK_UID | SHARD_MASK_GID |    \
+     SHARD_MASK_TIMES | SHARD_MASK_OTHERS)
+
+#define SHARD_ALL_MASK                                                         \
+    (SHARD_MASK_BLOCK_SIZE | SHARD_MASK_PROT | SHARD_MASK_NLINK |              \
+     SHARD_MASK_UID | SHARD_MASK_GID | SHARD_MASK_SIZE | SHARD_MASK_BLOCKS |   \
+     SHARD_MASK_TIMES | SHARD_MASK_OTHERS)
+
+#define get_lowest_block(off, shard_size) ((off) / (shard_size)) /* block of off */
+#define get_highest_block(off, len, shard_size) /* last block touched */       \
+    (((((off) + (len)) == 0) ? 0 : ((off) + (len)-1)) / (shard_size))
+
+int
+shard_unlock_inodelk(call_frame_t *frame, xlator_t *this);
+
+int
+shard_unlock_entrylk(call_frame_t *frame, xlator_t *this);
+
+#define SHARD_ENTRY_FOP_CHECK(loc, op_errno, label)                            \
+    do { /* EPERM on ".shard" under root or under a __is_shard_dir() parent */ \
+        if ((loc->name && !strcmp(GF_SHARD_DIR, loc->name)) &&                 \
+            (((loc->parent) && __is_root_gfid(loc->parent->gfid)) ||           \
+             __is_root_gfid(loc->pargfid))) {                                  \
+            op_errno = EPERM;                                                  \
+            goto label;                                                        \
+        }                                                                      \
+                                                                               \
+        if ((loc->parent && __is_shard_dir(loc->parent->gfid)) ||              \
+            __is_shard_dir(loc->pargfid)) {                                    \
+            op_errno = EPERM;                                                  \
+            goto label;                                                        \
+        }                                                                      \
+    } while (0)
+
+#define SHARD_INODE_OP_CHECK(gfid, err, label)                                 \
+    do { /* err = EPERM and jump to label when __is_shard_dir(gfid) */         \
+        if (__is_shard_dir(gfid)) {                                            \
+            err = EPERM;                                                       \
+            goto label;                                                        \
+        }                                                                      \
+    } while (0)
+
+#define SHARD_STACK_UNWIND(fop, frame, params...)                              \
+    do { /* drop held internal locks, unwind, then wipe and free local */      \
+        shard_local_t *__local = NULL;                                         \
+        if (frame) {                                                           \
+            __local = frame->local;                                            \
+            if (__local && __local->int_inodelk.acquired_lock)                 \
+                shard_unlock_inodelk(frame, frame->this);                      \
+            if (__local && __local->int_entrylk.acquired_lock)                 \
+                shard_unlock_entrylk(frame, frame->this);                      \
+            frame->local = NULL;                                               \
+        }                                                                      \
+        STACK_UNWIND_STRICT(fop, frame, params);                               \
+        if (__local) {                                                         \
+            shard_local_wipe(__local);                                         \
+            mem_put(__local);                                                  \
+        }                                                                      \
+    } while (0)
+
+#define SHARD_STACK_DESTROY(frame)                                             \
+    do { /* detach local, destroy the stack, then wipe and free local */       \
+        shard_local_t *__local = NULL;                                         \
+        __local = frame->local;                                                \
+        frame->local = NULL;                                                   \
+        STACK_DESTROY(frame->root);                                            \
+        if (__local) {                                                         \
+            shard_local_wipe(__local);                                         \
+            mem_put(__local);                                                  \
+        }                                                                      \
+    } while (0)
+
+#define SHARD_INODE_CREATE_INIT(this, block_size, xattr_req, loc, size,        \
+                                block_count, label)                            \
+    do { /* add block-size and file-size xattrs to xattr_req */                \
+        int __ret = -1;                                                        \
+        int64_t *__size_attr = NULL;                                           \
+        uint64_t *__bs = 0;                                                    \
+                                                                               \
+        __bs = GF_MALLOC(sizeof(uint64_t), gf_shard_mt_uint64_t);              \
+        if (!__bs)                                                             \
+            goto label;                                                        \
+        *__bs = hton64(block_size);                                            \
+        __ret = dict_set_bin(xattr_req, GF_XATTR_SHARD_BLOCK_SIZE, __bs,       \
+                             sizeof(*__bs));                                   \
+        if (__ret) {                                                           \
+            gf_msg(this->name, GF_LOG_WARNING, 0, SHARD_MSG_DICT_OP_FAILED,    \
+                   "Failed to set key: %s "                                    \
+                   "on path %s",                                               \
+                   GF_XATTR_SHARD_BLOCK_SIZE, (loc)->path);                    \
+            GF_FREE(__bs);                                                     \
+            goto label;                                                        \
+        }                                                                      \
+                                                                               \
+        __ret = shard_set_size_attrs(size, block_count, &__size_attr);         \
+        if (__ret)                                                             \
+            goto label;                                                        \
+                                                                               \
+        __ret = dict_set_bin(xattr_req, GF_XATTR_SHARD_FILE_SIZE, __size_attr, \
+                             8 * 4);                                           \
+        if (__ret) {                                                           \
+            gf_msg(this->name, GF_LOG_WARNING, 0, SHARD_MSG_DICT_OP_FAILED,    \
+                   "Failed to set key: %s "                                    \
+                   "on path %s",                                               \
+                   GF_XATTR_SHARD_FILE_SIZE, (loc)->path);                     \
+            GF_FREE(__size_attr);                                              \
+            goto label;                                                        \
+        }                                                                      \
+    } while (0)
+
+#define SHARD_MD_READ_FOP_INIT_REQ_DICT(this, dict, gfid, local, label)        \
+    do { /* set GF_XATTR_SHARD_FILE_SIZE (value 8 * 4) in dict */              \
+        int __ret = -1;                                                        \
+                                                                               \
+        __ret = dict_set_uint64(dict, GF_XATTR_SHARD_FILE_SIZE, 8 * 4);        \
+        if (__ret) {                                                           \
+            local->op_ret = -1;                                                \
+            local->op_errno = ENOMEM;                                          \
+            gf_msg(this->name, GF_LOG_WARNING, 0, SHARD_MSG_DICT_OP_FAILED,    \
+                   "Failed to set dict value:"                                 \
+                   " key:%s for %s.",                                          \
+                   GF_XATTR_SHARD_FILE_SIZE, uuid_utoa(gfid));                 \
+            goto label;                                                        \
+        }                                                                      \
+    } while (0)
+
+#define SHARD_SET_ROOT_FS_ID(frame, local)                                     \
+    do { /* stash frame uid/gid in local, then switch the frame to 0/0 */      \
+        if (!local->is_set_fsid) {                                             \
+            local->uid = frame->root->uid;                                     \
+            local->gid = frame->root->gid;                                     \
+            frame->root->uid = 0;                                              \
+            frame->root->gid = 0;                                              \
+            local->is_set_fsid = _gf_true;                                     \
+        }                                                                      \
+    } while (0)
+
+#define SHARD_UNSET_ROOT_FS_ID(frame, local)                                   \
+    do { /* put back the uid/gid stashed in local, if any */                   \
+        if (local->is_set_fsid) {                                              \
+            frame->root->uid = local->uid;                                     \
+            frame->root->gid = local->gid;                                     \
+            local->is_set_fsid = _gf_false;                                    \
+        }                                                                      \
+    } while (0)
+
+#define SHARD_TIME_UPDATE(ctx_sec, ctx_nsec, new_sec, new_nsec)                \
+    do { /* leave ctx_* and new_* both holding the later timestamp */          \
+        if (ctx_sec == new_sec)                                                \
+            ctx_nsec = new_nsec = max(new_nsec, ctx_nsec);                     \
+        else if (ctx_sec > new_sec) {                                          \
+            new_sec = ctx_sec;                                                 \
+            new_nsec = ctx_nsec;                                               \
+        } else {                                                               \
+            ctx_sec = new_sec;                                                 \
+            ctx_nsec = new_nsec;                                               \
+        }                                                                      \
+    } while (0)
+
+typedef enum {
+    SHARD_BG_DELETION_NONE = 0,
+    SHARD_BG_DELETION_LAUNCHING,
+    SHARD_BG_DELETION_IN_PROGRESS,
+} shard_bg_deletion_state_t;
+
+/* rm = "remove me" */
+
+typedef struct shard_priv {
+    uint64_t block_size; /* "shard-block-size" option */
+    uuid_t dot_shard_gfid; /* parsed from SHARD_ROOT_GFID in init() */
+    uuid_t dot_shard_rm_gfid; /* parsed from DOT_SHARD_REMOVE_ME_GFID */
+    inode_t *dot_shard_inode;
+    inode_t *dot_shard_rm_inode;
+    gf_lock_t lock; /* held while ilist_head / inode_count are updated */
+    int inode_count; /* decremented when shard_forget() unlinks a ctx */
+    struct list_head ilist_head; /* head of shard's LRU list of inode ctxs */
+    uint32_t deletion_rate; /* "shard-deletion-rate" option */
+    shard_bg_deletion_state_t bg_del_state;
+    gf_boolean_t first_lookup_done;
+    uint64_t lru_limit; /* "shard-lru-limit" option */
+} shard_priv_t;
+
+typedef struct {
+    loc_t loc;
+    char *domain;
+    struct gf_flock flock;
+    gf_boolean_t acquired_lock; /* SHARD_STACK_UNWIND unlocks when set */
+} shard_inodelk_t;
+
+typedef struct {
+    loc_t loc;
+    char *domain;
+    char *basename;
+    entrylk_cmd cmd;
+    entrylk_type type;
+    gf_boolean_t acquired_lock; /* SHARD_STACK_UNWIND unlocks when set */
+} shard_entrylk_t;
+
+typedef int32_t (*shard_post_fop_handler_t)(call_frame_t *frame,
+                                            xlator_t *this);
+typedef int32_t (*shard_post_resolve_fop_handler_t)(call_frame_t *frame,
+                                                    xlator_t *this);
+typedef int32_t (*shard_post_lookup_shards_fop_handler_t)(call_frame_t *frame,
+                                                          xlator_t *this);
+
+typedef int32_t (*shard_post_mknod_fop_handler_t)(call_frame_t *frame,
+                                                  xlator_t *this);
+
+typedef int32_t (*shard_post_update_size_fop_handler_t)(call_frame_t *frame,
+                                                        xlator_t *this);
+
+typedef struct shard_local { /* per-call state stored in frame->local */
+    int op_ret;
+    int op_errno;
+    uint64_t first_block;
+    uint64_t last_block;
+    uint64_t num_blocks;
+    int call_count;
+    int eexist_count;
+    int create_count;
+    int xflag;
+    int count; /* number of iovecs in 'vector' (write path) */
+    uint32_t flags;
+    uint32_t uid; /* caller uid saved by SHARD_SET_ROOT_FS_ID */
+    uint32_t gid; /* caller gid saved by SHARD_SET_ROOT_FS_ID */
+    uint64_t block_size;
+    uint64_t dst_block_size;
+    int32_t datasync;
+    off_t offset;
+    size_t total_size; /* sum of iov_len for writes, else the fop's len */
+    size_t written_size;
+    size_t hole_size;
+    size_t req_size;
+    size_t readdir_size;
+    int64_t delta_size;
+    gf_atomic_t delta_blocks;
+    loc_t loc;
+    loc_t dot_shard_loc;
+    loc_t dot_shard_rm_loc;
+    loc_t loc2;
+    loc_t tmp_loc;
+    fd_t *fd;
+    dict_t *xattr_req;
+    dict_t *xattr_rsp;
+    inode_t **inode_list;
+    glusterfs_fop_t fop;
+    struct iatt prebuf;
+    struct iatt postbuf;
+    struct iatt preoldparent;
+    struct iatt postoldparent;
+    struct iatt prenewparent;
+    struct iatt postnewparent;
+    struct iovec *vector;
+    struct iobref *iobref;
+    struct iobuf *iobuf;
+    gf_dirent_t entries_head;
+    gf_boolean_t is_set_fsid; /* true while the frame runs as uid/gid 0 */
+    gf_boolean_t list_inited;
+    shard_post_fop_handler_t handler;
+    shard_post_lookup_shards_fop_handler_t pls_fop_handler;
+    shard_post_resolve_fop_handler_t post_res_handler;
+    shard_post_mknod_fop_handler_t post_mknod_handler;
+    shard_post_update_size_fop_handler_t post_update_size_handler;
+    shard_inodelk_t int_inodelk; /* released by SHARD_STACK_UNWIND if held */
+    shard_entrylk_t int_entrylk; /* released by SHARD_STACK_UNWIND if held */
+    inode_t *resolver_base_inode;
+    gf_boolean_t first_lookup_done;
+    syncbarrier_t barrier;
+    gf_boolean_t lookup_shards_barriered;
+    gf_boolean_t unlink_shards_barriered;
+    gf_boolean_t resolve_not;
+    loc_t newloc;
+    call_frame_t *main_frame;
+    call_frame_t *inodelk_frame;
+    call_frame_t *entrylk_frame;
+    uint32_t deletion_rate;
+    gf_boolean_t cleanup_required;
+    uuid_t base_gfid;
+    char *name;
+} shard_local_t;
+
+typedef struct shard_inode_ctx {
+    uint64_t block_size; /* The block size with which this inode is
+                            sharded */
+    struct iatt stat;
+    gf_boolean_t refresh;
+    /* The following members of inode ctx will be applicable only to the
+     * individual shards' ctx and never the base file ctx.
+     */
+    struct list_head ilist; /* unlinked under priv->lock in shard_forget() */
+    uuid_t base_gfid;
+    int block_num;
+    gf_boolean_t refreshed;
+    struct list_head to_fsync_list;
+    int fsync_needed;
+    inode_t *inode;
+    int fsync_count;
+    inode_t *base_inode;
+} shard_inode_ctx_t;
+
+typedef enum {
+    SHARD_INTERNAL_DIR_DOT_SHARD = 1,
+    SHARD_INTERNAL_DIR_DOT_SHARD_REMOVE_ME,
+} shard_internal_dir_type_t;
+
+#endif /* __SHARD_H__ */
diff --git a/xlators/performance/stat-prefetch/Makefile.am b/xlators/features/snapview-client/Makefile.am
index af437a64d6d..af437a64d6d 100644
--- a/xlators/performance/stat-prefetch/Makefile.am
+++ b/xlators/features/snapview-client/Makefile.am
diff --git a/xlators/features/snapview-client/src/Makefile.am b/xlators/features/snapview-client/src/Makefile.am
new file mode 100644
index 00000000000..fa08656c537
--- /dev/null
+++ b/xlators/features/snapview-client/src/Makefile.am
@@ -0,0 +1,16 @@
+xlator_LTLIBRARIES = snapview-client.la
+xlatordir = $(libdir)/glusterfs/$(PACKAGE_VERSION)/xlator/features
+
+snapview_client_la_LDFLAGS = -module $(GF_XLATOR_DEFAULT_LDFLAGS)
+
+snapview_client_la_SOURCES = snapview-client.c
+snapview_client_la_LIBADD = $(top_builddir)/libglusterfs/src/libglusterfs.la
+
+noinst_HEADERS = snapview-client.h snapview-client-mem-types.h snapview-client-messages.h
+
+AM_CPPFLAGS = $(GF_CPPFLAGS) -I$(top_srcdir)/libglusterfs/src \
+	-I$(top_srcdir)/rpc/xdr/src -I$(top_builddir)/rpc/xdr/src
+
+AM_CFLAGS = -Wall $(GF_CFLAGS)
+
+CLEANFILES =
diff --git a/xlators/features/snapview-client/src/snapview-client-mem-types.h b/xlators/features/snapview-client/src/snapview-client-mem-types.h
new file mode 100644
index 00000000000..3c3ab555a55
--- /dev/null
+++ b/xlators/features/snapview-client/src/snapview-client-mem-types.h
@@ -0,0 +1,24 @@
+/*
+   Copyright (c) 2014 Red Hat, Inc. <http://www.redhat.com>
+   This file is part of GlusterFS.
+
+   This file is licensed to you under your choice of the GNU Lesser
+   General Public License, version 3 or any later version (LGPLv3 or
+   later), or the GNU General Public License, version 2 (GPLv2), in all
+   cases as published by the Free Software Foundation.
+*/
+
+#ifndef _SVC_MEM_TYPES_H
+#define _SVC_MEM_TYPES_H
+
+#include <glusterfs/mem-types.h>
+
+enum svc_mem_types { /* memory-accounting type ids for snapview-client */
+    gf_svc_mt_svc_private_t = gf_common_mt_end + 1, /* continue after common */
+    gf_svc_mt_svc_local_t,
+    gf_svc_mt_svc_inode_t,
+    gf_svc_mt_svc_fd_t,
+    gf_svc_mt_end /* last enumerator; presumably a count marker - confirm */
+};
+
+#endif
diff --git a/xlators/features/snapview-client/src/snapview-client-messages.h b/xlators/features/snapview-client/src/snapview-client-messages.h
new file mode 100644
index 00000000000..c02fb154930
--- /dev/null
+++ b/xlators/features/snapview-client/src/snapview-client-messages.h
@@ -0,0 +1,71 @@
+/*
+ Copyright (c) 2018 Red Hat, Inc. <http://www.redhat.com>
+ This file is part of GlusterFS.
+
+ This file is licensed to you under your choice of the GNU Lesser
+ General Public License, version 3 or any later version (LGPLv3 or
+ later), or the GNU General Public License, version 2 (GPLv2), in all
+ cases as published by the Free Software Foundation.
+ */
+
+#ifndef _SNAPVIEW_CLIENT_MESSAGES_H_
+#define _SNAPVIEW_CLIENT_MESSAGES_H_
+
+#include <glusterfs/glfs-message-id.h>
+
+/* To add new message IDs, append new identifiers at the end of the list.
+ *
+ * Never remove a message ID. If it's not used anymore, you can rename it or
+ * leave it as it is, but not delete it. This is to prevent reutilization of
+ * IDs by other messages.
+ *
+ * The component name must match one of the entries defined in
+ * glfs-message-id.h.
+ */
+
+GLFS_MSGID(SNAPVIEW_CLIENT, SVC_MSG_NO_MEMORY, SVC_MSG_MEM_ACNT_FAILED,
+           SVC_MSG_SET_INODE_CONTEXT_FAILED, SVC_MSG_GET_INODE_CONTEXT_FAILED,
+           SVC_MSG_DELETE_INODE_CONTEXT_FAILED, SVC_MSG_SET_FD_CONTEXT_FAILED,
+           SVC_MSG_GET_FD_CONTEXT_FAILED, SVC_MSG_DICT_SET_FAILED,
+           SVC_MSG_SUBVOLUME_NULL, SVC_MSG_NO_CHILD_FOR_XLATOR,
+           SVC_MSG_XLATOR_CHILDREN_WRONG, SVC_MSG_NORMAL_GRAPH_LOOKUP_FAIL,
+           SVC_MSG_SNAPVIEW_GRAPH_LOOKUP_FAIL, SVC_MSG_OPENDIR_SPECIAL_DIR,
+           SVC_MSG_RENAME_SNAPSHOT_ENTRY, SVC_MSG_LINK_SNAPSHOT_ENTRY,
+           SVC_MSG_COPY_ENTRY_POINT_FAILED, SVC_MSG_ENTRY_POINT_SPECIAL_DIR,
+           SVC_MSG_STR_LEN, SVC_MSG_INVALID_ENTRY_POINT, SVC_MSG_NULL_PRIV,
+           SVC_MSG_PRIV_DESTROY_FAILED, SVC_MSG_ALLOC_FD_FAILED,
+           SVC_MSG_ALLOC_INODE_FAILED, SVC_MSG_NULL_SPECIAL_DIR,
+           SVC_MSG_MEM_POOL_GET_FAILED);
+
+#define SVC_MSG_ALLOC_FD_FAILED_STR "failed to allocate new fd context"
+#define SVC_MSG_SET_FD_CONTEXT_FAILED_STR "failed to set fd context"
+#define SVC_MSG_STR_LEN_STR                                                    \
+    "destination buffer size is less than the length of entry point name"
+#define SVC_MSG_NORMAL_GRAPH_LOOKUP_FAIL_STR "lookup failed on normal graph"
+#define SVC_MSG_SNAPVIEW_GRAPH_LOOKUP_FAIL_STR "lookup failed on snapview graph"
+#define SVC_MSG_SET_INODE_CONTEXT_FAILED_STR "failed to set inode context"
+#define SVC_MSG_NO_MEMORY_STR "failed to allocate memory"
+#define SVC_MSG_COPY_ENTRY_POINT_FAILED_STR                                    \
+    "failed to copy the entry point string"
+#define SVC_MSG_GET_FD_CONTEXT_FAILED_STR "fd context not found"
+#define SVC_MSG_GET_INODE_CONTEXT_FAILED_STR "failed to get inode context"
+#define SVC_MSG_ALLOC_INODE_FAILED_STR "failed to allocate new inode"
+#define SVC_MSG_DICT_SET_FAILED_STR "failed to set dict"
+#define SVC_MSG_RENAME_SNAPSHOT_ENTRY_STR                                      \
+    "rename happening on a entry residing in snapshot"
+#define SVC_MSG_DELETE_INODE_CONTEXT_FAILED_STR "failed to delete inode context"
+#define SVC_MSG_NULL_PRIV_STR "priv NULL"
+#define SVC_MSG_INVALID_ENTRY_POINT_STR "not a valid entry point"
+#define SVC_MSG_MEM_ACNT_FAILED_STR "Memory accounting init failed"
+#define SVC_MSG_NO_CHILD_FOR_XLATOR_STR "configured without any child"
+#define SVC_MSG_XLATOR_CHILDREN_WRONG_STR                                      \
+    "snap-view-client has got wrong subvolumes. It can have only 2"
+#define SVC_MSG_ENTRY_POINT_SPECIAL_DIR_STR                                    \
+    "entry point directory cannot be part of special directory"
+#define SVC_MSG_NULL_SPECIAL_DIR_STR "null special directory"
+#define SVC_MSG_MEM_POOL_GET_FAILED_STR                                        \
+    "could not get mem pool for frame->local"
+#define SVC_MSG_PRIV_DESTROY_FAILED_STR "failed to destroy private"
+#define SVC_MSG_LINK_SNAPSHOT_ENTRY_STR                                        \
+    "link happening on a entry residing in snapshot"
+#endif /* !_SNAPVIEW_CLIENT_MESSAGES_H_ */
diff --git a/xlators/features/snapview-client/src/snapview-client.c b/xlators/features/snapview-client/src/snapview-client.c
new file mode 100644
index 00000000000..486c5179d5b
--- /dev/null
+++ b/xlators/features/snapview-client/src/snapview-client.c
@@ -0,0 +1,2791 @@
+/*
+  Copyright (c) 2014 Red Hat, Inc. <http://www.redhat.com>
+  This file is part of GlusterFS.
+
+  This file is licensed to you under your choice of the GNU Lesser
+  General Public License, version 3 or any later version (LGPLv3 or
+  later), or the GNU General Public License, version 2 (GPLv2), in all
+  cases as published by the Free Software Foundation.
+*/
+
+#include "snapview-client.h"
+#include <glusterfs/inode.h>
+#include <glusterfs/byte-order.h>
+
+static void
+svc_local_free(svc_local_t *local)
+{
+    if (local) {
+        loc_wipe(&local->loc);
+        if (local->fd)
+            fd_unref(local->fd);
+        if (local->xdata)
+            dict_unref(local->xdata);
+        mem_put(local);
+    }
+}
+
+static xlator_t *
+svc_get_subvolume(xlator_t *this, int inode_type)
+{
+    xlator_t *subvolume = NULL;
+
+    GF_VALIDATE_OR_GOTO("snapview-client", this, out);
+
+    if (inode_type == VIRTUAL_INODE)
+        subvolume = SECOND_CHILD(this);
+    else
+        subvolume = FIRST_CHILD(this);
+
+out:
+    return subvolume;
+}
+
+static int32_t
+__svc_inode_ctx_set(xlator_t *this, inode_t *inode, int inode_type)
+{
+    uint64_t value = 0;
+    int32_t ret = -1;
+
+    GF_VALIDATE_OR_GOTO("snapview-client", this, out);
+    GF_VALIDATE_OR_GOTO(this->name, inode, out);
+
+    value = inode_type;
+
+    ret = __inode_ctx_set(inode, this, &value);
+
+out:
+    return ret;
+}
+
+static int
+__svc_inode_ctx_get(xlator_t *this, inode_t *inode, int *inode_type)
+{
+    uint64_t value = 0;
+    int ret = -1;
+
+    GF_VALIDATE_OR_GOTO("snapview-client", this, out);
+    GF_VALIDATE_OR_GOTO(this->name, inode, out);
+
+    ret = __inode_ctx_get(inode, this, &value);
+    if (ret < 0)
+        goto out;
+
+    *inode_type = (int)(value);
+
+out:
+    return ret;
+}
+
+static int
+svc_inode_ctx_get(xlator_t *this, inode_t *inode, int *inode_type)
+{
+    int ret = -1;
+
+    GF_VALIDATE_OR_GOTO("snapview-client", this, out);
+    GF_VALIDATE_OR_GOTO(this->name, inode, out);
+
+    LOCK(&inode->lock);
+    {
+        ret = __svc_inode_ctx_get(this, inode, inode_type);
+    }
+    UNLOCK(&inode->lock);
+
+out:
+    return ret;
+}
+
+static int32_t
+svc_inode_ctx_set(xlator_t *this, inode_t *inode, int inode_type)
+{
+    int32_t ret = -1;
+
+    GF_VALIDATE_OR_GOTO("snapview-client", this, out);
+    GF_VALIDATE_OR_GOTO(this->name, inode, out);
+
+    LOCK(&inode->lock);
+    {
+        ret = __svc_inode_ctx_set(this, inode, inode_type);
+    }
+    UNLOCK(&inode->lock);
+
+out:
+    return ret;
+}
+
+static svc_fd_t *
+svc_fd_new(void)
+{
+    svc_fd_t *svc_fd = NULL;
+
+    svc_fd = GF_CALLOC(1, sizeof(*svc_fd), gf_svc_mt_svc_fd_t);
+
+    return svc_fd;
+}
+
+static svc_fd_t *
+__svc_fd_ctx_get(xlator_t *this, fd_t *fd)
+{
+    svc_fd_t *svc_fd = NULL;
+    uint64_t value = 0;
+    int ret = -1;
+
+    GF_VALIDATE_OR_GOTO("snapview-client", this, out);
+    GF_VALIDATE_OR_GOTO(this->name, fd, out);
+
+    ret = __fd_ctx_get(fd, this, &value);
+    if (ret)
+        return NULL;
+
+    svc_fd = (svc_fd_t *)((long)value);
+
+out:
+    return svc_fd;
+}
+
+static svc_fd_t *
+svc_fd_ctx_get(xlator_t *this, fd_t *fd)
+{
+    svc_fd_t *svc_fd = NULL;
+
+    GF_VALIDATE_OR_GOTO("snapview-client", this, out);
+    GF_VALIDATE_OR_GOTO(this->name, fd, out);
+
+    LOCK(&fd->lock);
+    {
+        svc_fd = __svc_fd_ctx_get(this, fd);
+    }
+    UNLOCK(&fd->lock);
+
+out:
+    return svc_fd;
+}
+
+static int
+__svc_fd_ctx_set(xlator_t *this, fd_t *fd, svc_fd_t *svc_fd)
+{
+    uint64_t value = 0;
+    int ret = -1;
+
+    GF_VALIDATE_OR_GOTO("snapview-client", this, out);
+    GF_VALIDATE_OR_GOTO(this->name, fd, out);
+    GF_VALIDATE_OR_GOTO(this->name, svc_fd, out);
+
+    value = (uint64_t)(long)svc_fd;
+
+    ret = __fd_ctx_set(fd, this, value);
+
+out:
+    return ret;
+}
+
+static svc_fd_t *
+__svc_fd_ctx_get_or_new(xlator_t *this, fd_t *fd)
+{
+    svc_fd_t *svc_fd = NULL;
+    int ret = -1;
+    inode_t *inode = NULL;
+
+    GF_VALIDATE_OR_GOTO("snapview-client", this, out);
+    GF_VALIDATE_OR_GOTO(this->name, fd, out);
+
+    inode = fd->inode;
+    svc_fd = __svc_fd_ctx_get(this, fd);
+    if (svc_fd) {
+        ret = 0;
+        goto out;
+    }
+
+    svc_fd = svc_fd_new();
+    if (!svc_fd) {
+        gf_smsg(this->name, GF_LOG_ERROR, ENOMEM, SVC_MSG_ALLOC_FD_FAILED,
+                "gfid=%s", uuid_utoa(inode->gfid), NULL);
+        goto out;
+    }
+
+    ret = __svc_fd_ctx_set(this, fd, svc_fd);
+    if (ret) {
+        gf_smsg(this->name, GF_LOG_ERROR, 0, SVC_MSG_SET_FD_CONTEXT_FAILED,
+                "gfid=%s", uuid_utoa(inode->gfid), NULL);
+        ret = -1;
+    }
+
+out:
+    if (ret) {
+        GF_FREE(svc_fd);
+        svc_fd = NULL;
+    }
+
+    return svc_fd;
+}
+
+static svc_fd_t *
+svc_fd_ctx_get_or_new(xlator_t *this, fd_t *fd)
+{
+    svc_fd_t *svc_fd = NULL;
+
+    GF_VALIDATE_OR_GOTO("snapview-client", this, out);
+    GF_VALIDATE_OR_GOTO(this->name, fd, out);
+
+    LOCK(&fd->lock);
+    {
+        svc_fd = __svc_fd_ctx_get_or_new(this, fd);
+    }
+    UNLOCK(&fd->lock);
+
+out:
+    return svc_fd;
+}
+
+/**
+ * @this: xlator
+ * @entry_point: pointer to the buffer provided by consumer
+ *
+ * This function copies the entry point name (stored as a string
+ * in priv->path) into the buffer pointed to by @entry_point while
+ * holding priv->lock. The consumer allocates the buffer and passes
+ * its size in @dest_size; returns 0 on success and -1 otherwise.
+ *
+ * This function is called by all the functions (or fops)
+ * who need to use priv->path for avoiding the race.
+ * For example, either in lookup or in any other fop,
+ * while priv->path is being accessed, a reconfigure can
+ * happen to change priv->path. This ensures that, a lock
+ * is taken before accessing priv->path.
+ **/
+int
+gf_svc_get_entry_point(xlator_t *this, char *entry_point, size_t dest_size)
+{
+    int ret = -1;
+    svc_private_t *priv = NULL;
+
+    GF_VALIDATE_OR_GOTO("snapview-client", this, out);
+    GF_VALIDATE_OR_GOTO(this->name, entry_point, out);
+
+    priv = this->private;
+
+    LOCK(&priv->lock);
+    {
+        if (dest_size <= strlen(priv->path)) {
+            gf_smsg(this->name, GF_LOG_ERROR, 0, SVC_MSG_STR_LEN,
+                    "dest-size=%zu", dest_size, "priv-path-len=%zu",
+                    strlen(priv->path), "path=%s", priv->path, NULL);
+        } else {
+            snprintf(entry_point, dest_size, "%s", priv->path);
+            ret = 0;
+        }
+    }
+    UNLOCK(&priv->lock);
+
+out:
+    return ret;
+}
+
+static int32_t
+gf_svc_lookup_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+                  int32_t op_ret, int32_t op_errno, inode_t *inode,
+                  struct iatt *buf, dict_t *xdata, struct iatt *postparent)
+{
+    svc_local_t *local = NULL;
+    xlator_t *subvolume = NULL;
+    gf_boolean_t do_unwind = _gf_true;
+    int inode_type = -1;
+    int ret = -1;
+
+    local = frame->local;
+    subvolume = local->subvolume;
+    if (!subvolume) {
+        gf_msg_callingfn(this->name, GF_LOG_ERROR, 0, SVC_MSG_SUBVOLUME_NULL,
+                         "path: %s gfid: %s ", local->loc.path,
+                         inode ? uuid_utoa(inode->gfid) : "");
+        GF_ASSERT(0);
+    }
+
+    /* There is a possibility that, the client process just came online
+       and does not have the inode on which the lookup came. In that case,
+       the fresh inode created from fuse for the lookup fop, won't have
+       the inode context set without which svc cannot decide where to
+       STACK_WIND to. So by default it decides to send the fop to the
+       regular subvolume (i.e first child of the xlator). If lookup fails
+       on the regular volume, then there is a possibility that the lookup
+       is happening on a virtual inode (i.e history data residing in snaps).
+       So if lookup fails with ENOENT and the inode context is not there,
+       then send the lookup to the 2nd child of svc.
+
+       If there are any changes in volfile/client-restarted then inode-ctx
+       is lost. In this case if nameless lookup fails with ESTALE,
+       then send the lookup to the 2nd child of svc.
+    */
+    if (op_ret) {
+        if (subvolume == FIRST_CHILD(this)) {
+            gf_smsg(this->name,
+                    (op_errno == ENOENT || op_errno == ESTALE) ? GF_LOG_DEBUG
+                                                               : GF_LOG_ERROR,
+                    op_errno, SVC_MSG_NORMAL_GRAPH_LOOKUP_FAIL, "error=%s",
+                    strerror(op_errno), NULL);
+        } else {
+            gf_smsg(this->name,
+                    (op_errno == ENOENT || op_errno == ESTALE) ? GF_LOG_DEBUG
+                                                               : GF_LOG_ERROR,
+                    op_errno, SVC_MSG_SNAPVIEW_GRAPH_LOOKUP_FAIL, "error=%s",
+                    strerror(op_errno), NULL);
+            goto out;
+        }
+
+        if ((op_errno == ENOENT || op_errno == ESTALE) &&
+            !gf_uuid_is_null(local->loc.gfid)) {
+            if (inode != NULL)
+                ret = svc_inode_ctx_get(this, inode, &inode_type);
+
+            if (ret < 0 || inode == NULL) {
+                gf_msg_debug(this->name, 0,
+                             "Lookup on normal graph failed. "
+                             " Sending lookup to snapview-server");
+                subvolume = SECOND_CHILD(this);
+                local->subvolume = subvolume;
+                STACK_WIND(frame, gf_svc_lookup_cbk, subvolume,
+                           subvolume->fops->lookup, &local->loc, xdata);
+                do_unwind = _gf_false;
+            }
+        }
+
+        goto out;
+    }
+
+    if (subvolume == FIRST_CHILD(this))
+        inode_type = NORMAL_INODE;
+    else
+        inode_type = VIRTUAL_INODE;
+
+    ret = svc_inode_ctx_set(this, inode, inode_type);
+    if (ret)
+        gf_smsg(this->name, GF_LOG_ERROR, 0, SVC_MSG_SET_INODE_CONTEXT_FAILED,
+                "gfid=%s", uuid_utoa(inode->gfid), NULL);
+
+out:
+    if (do_unwind) {
+        SVC_STACK_UNWIND(lookup, frame, op_ret, op_errno, inode, buf, xdata,
+                         postparent);
+    }
+
+    return 0;
+}
+
+static int32_t
+gf_svc_lookup(call_frame_t *frame, xlator_t *this, loc_t *loc, dict_t *xdata)
+{
+    int32_t ret = -1;
+    svc_local_t *local = NULL;
+    xlator_t *subvolume = NULL;
+    int op_ret = -1;
+    int op_errno = EINVAL;
+    inode_t *parent = NULL;
+    dict_t *new_xdata = NULL;
+    int inode_type = -1;
+    int parent_type = -1;
+    gf_boolean_t wind = _gf_false;
+    char entry_point[NAME_MAX + 1] = {
+        0,
+    };
+
+    GF_VALIDATE_OR_GOTO("svc", this, out);
+    GF_VALIDATE_OR_GOTO(this->name, frame, out);
+    GF_VALIDATE_OR_GOTO(this->name, loc, out);
+    GF_VALIDATE_OR_GOTO(this->name, loc->inode, out);
+
+    ret = svc_inode_ctx_get(this, loc->inode, &inode_type);
+    if (!__is_root_gfid(loc->gfid)) {
+        if (loc->parent) {
+            parent = inode_ref(loc->parent);
+            ret = svc_inode_ctx_get(this, loc->parent, &parent_type);
+        } else {
+            parent = inode_parent(loc->inode, loc->pargfid, NULL);
+            if (parent)
+                ret = svc_inode_ctx_get(this, parent, &parent_type);
+        }
+    }
+
+    local = mem_get0(this->local_pool);
+    if (!local) {
+        op_ret = -1;
+        op_errno = ENOMEM;
+        gf_smsg(this->name, GF_LOG_ERROR, op_errno, SVC_MSG_NO_MEMORY, NULL);
+        goto out;
+    }
+
+    frame->local = local;
+    loc_copy(&local->loc, loc);
+
+    if (__is_root_gfid(loc->inode->gfid)) {
+        subvolume = FIRST_CHILD(this);
+        GF_ASSERT(subvolume);
+        local->subvolume = subvolume;
+        wind = _gf_true;
+        goto out;
+    }
+
+    /* nfs sends nameless lookups directly using the gfid. In that case
+       loc->name will be NULL. So check if loc->name is NULL. If so, then
+       try to get the subvolume using inode context. But if the inode has
+       not been looked up yet, then send the lookup call to the first
+       subvolume.
+    */
+
+    if (!loc->name) {
+        if (gf_uuid_is_null(loc->inode->gfid)) {
+            subvolume = FIRST_CHILD(this);
+            local->subvolume = subvolume;
+            wind = _gf_true;
+            goto out;
+        } else {
+            if (inode_type >= 0)
+                subvolume = svc_get_subvolume(this, inode_type);
+            else
+                subvolume = FIRST_CHILD(this);
+            local->subvolume = subvolume;
+            wind = _gf_true;
+            goto out;
+        }
+    }
+
+    if (gf_svc_get_entry_point(this, entry_point, sizeof(entry_point))) {
+        gf_smsg(this->name, GF_LOG_WARNING, op_errno,
+                SVC_MSG_COPY_ENTRY_POINT_FAILED, NULL);
+        goto out;
+    }
+
+    if (strcmp(loc->name, entry_point)) {
+        if (parent_type == VIRTUAL_INODE) {
+            subvolume = SECOND_CHILD(this);
+        } else {
+            /*
+             * Either parent type is normal graph, or the parent
+             * type is uncertain.
+             */
+            subvolume = FIRST_CHILD(this);
+        }
+        local->subvolume = subvolume;
+    } else {
+        subvolume = SECOND_CHILD(this);
+        local->subvolume = subvolume;
+        if (parent_type == NORMAL_INODE) {
+            /* Indication of whether the lookup is happening on the
+               entry point or not, to the snapview-server.
+            */
+            SVC_ENTRY_POINT_SET(this, xdata, op_ret, op_errno, new_xdata, ret,
+                                out);
+        }
+    }
+
+    wind = _gf_true;
+
+out:
+    if (wind)
+        STACK_WIND(frame, gf_svc_lookup_cbk, subvolume, subvolume->fops->lookup,
+                   loc, xdata);
+    else
+        SVC_STACK_UNWIND(lookup, frame, op_ret, op_errno, NULL, NULL, NULL,
+                         NULL);
+    if (new_xdata)
+        dict_unref(new_xdata);
+
+    if (parent)
+        inode_unref(parent);
+
+    return 0;
+}
+
+/* statfs: wind by inode type; a virtual inode whose path ends in the entry
+ * point name is answered by the first child using the root loc instead. */
+static int32_t
+gf_svc_statfs(call_frame_t *frame, xlator_t *this, loc_t *loc, dict_t *xdata)
+{
+    xlator_t *subvolume = NULL;
+    int32_t ret = -1;
+    int inode_type = -1;
+    int32_t op_ret = -1;
+    int32_t op_errno = EINVAL;
+    gf_boolean_t wind = _gf_false;
+    size_t path_len = 0;
+    size_t snap_len = 0;
+    loc_t root_loc = {
+        0,
+    };
+    loc_t *temp_loc = NULL;
+    char entry_point[NAME_MAX + 1] = {
+        0,
+    };
+
+    GF_VALIDATE_OR_GOTO("svc", this, out);
+    GF_VALIDATE_OR_GOTO(this->name, frame, out);
+    GF_VALIDATE_OR_GOTO(this->name, loc, out);
+    GF_VALIDATE_OR_GOTO(this->name, loc->inode, out);
+
+    SVC_GET_SUBVOL_FROM_CTX(this, op_ret, op_errno, inode_type, ret, loc->inode,
+                            subvolume, out);
+    temp_loc = loc;
+
+    if (inode_type == VIRTUAL_INODE && loc->path) {
+        /* priv->path can be changed by a reconfigure, so only use the copy
+         * that gf_svc_get_entry_point() takes under the lock.
+         */
+        if (gf_svc_get_entry_point(this, entry_point, sizeof(entry_point))) {
+            gf_smsg(this->name, GF_LOG_WARNING, op_errno,
+                    SVC_MSG_COPY_ENTRY_POINT_FAILED, NULL);
+            goto out;
+        }
+
+        path_len = strlen(loc->path);
+        snap_len = strlen(entry_point);
+        if (path_len >= snap_len &&
+            !strcmp(&loc->path[path_len - snap_len], entry_point)) {
+            /* statfs call for the virtual snap directory: send the fop to
+             * the parent volume by removing the virtual directory from path.
+             */
+            subvolume = FIRST_CHILD(this);
+            root_loc.path = gf_strdup("/");
+            gf_uuid_clear(root_loc.gfid);
+            root_loc.gfid[15] = 1;
+            root_loc.inode = inode_ref(loc->inode->table->root);
+            temp_loc = &root_loc;
+        }
+    }
+
+    STACK_WIND_TAIL(frame, subvolume, subvolume->fops->statfs, temp_loc, xdata);
+    if (temp_loc == &root_loc)
+        loc_wipe(temp_loc);
+
+    wind = _gf_true;
+out:
+    if (!wind)
+        SVC_STACK_UNWIND(statfs, frame, op_ret, op_errno, NULL, NULL);
+    return 0;
+}
+
+static int32_t
+gf_svc_stat_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+                int32_t op_ret, int32_t op_errno, struct iatt *buf,
+                dict_t *xdata)
+{
+    /* TODO: FIX ME
+     * Consider a testcase:
+     * #mount -t nfs host1:/vol1 /mnt
+     * #ls /mnt
+     * #ls /mnt/.snaps (As expected this fails)
+     * #gluster volume set vol1 features.uss enable
+     * Now `ls /mnt/.snaps` should work, but fails with No such file or
+     * directory. This is because NFS client (gNFS) caches the list of files
+     * in a directory. This cache is updated if there are any changes in the
+     * directory attributes. So, one way to solve this problem is to change
+     * 'ctime' attribute when USS is enabled as below.
+     *
+     * if (op_ret == 0 && IA_ISDIR(buf->ia_type))
+     *     buf->ia_ctime_nsec++;
+     *
+     * But this is not the ideal solution as applications see the unexpected
+     * ctime change causing failures.
+     */
+
+    SVC_STACK_UNWIND(stat, frame, op_ret, op_errno, buf, xdata);
+    return 0;
+}
+
+/* should all the fops be handled like lookup is supposed to be
+   handled? i.e just based on inode type decide where the call should
+   be sent and in the call back update the contexts.
+*/
+static int32_t
+gf_svc_stat(call_frame_t *frame, xlator_t *this, loc_t *loc, dict_t *xdata)
+{
+    int32_t ret = -1;
+    int inode_type = -1;
+    xlator_t *subvolume = NULL;
+    int32_t op_ret = -1;
+    int32_t op_errno = EINVAL;
+    gf_boolean_t wind = _gf_false;
+
+    GF_VALIDATE_OR_GOTO("svc", this, out);
+    GF_VALIDATE_OR_GOTO(this->name, frame, out);
+    GF_VALIDATE_OR_GOTO(this->name, loc, out);
+    GF_VALIDATE_OR_GOTO(this->name, loc->inode, out);
+
+    SVC_GET_SUBVOL_FROM_CTX(this, op_ret, op_errno, inode_type, ret, loc->inode,
+                            subvolume, out);
+
+    STACK_WIND(frame, gf_svc_stat_cbk, subvolume, subvolume->fops->stat, loc,
+               xdata);
+
+    wind = _gf_true;
+
+out:
+    if (!wind)
+        SVC_STACK_UNWIND(stat, frame, op_ret, op_errno, NULL, NULL);
+    return 0;
+}
+
+/* fstat: wind to the subvolume SVC_GET_SUBVOL_FROM_CTX picks for fd->inode. */
+static int32_t
+gf_svc_fstat(call_frame_t *frame, xlator_t *this, fd_t *fd, dict_t *xdata)
+{
+    int32_t ret = -1;
+    int inode_type = -1;
+    xlator_t *subvolume = NULL;
+    int32_t op_ret = -1;
+    int32_t op_errno = EINVAL;
+    gf_boolean_t wind = _gf_false;
+
+    GF_VALIDATE_OR_GOTO("svc", this, out);
+    GF_VALIDATE_OR_GOTO(this->name, frame, out);
+    GF_VALIDATE_OR_GOTO(this->name, fd, out);
+    GF_VALIDATE_OR_GOTO(this->name, fd->inode, out);
+
+    SVC_GET_SUBVOL_FROM_CTX(this, op_ret, op_errno, inode_type, ret, fd->inode,
+                            subvolume, out);
+
+    STACK_WIND_TAIL(frame, subvolume, subvolume->fops->fstat, fd, xdata);
+    wind = _gf_true;
+
+out:
+    if (!wind)
+        SVC_STACK_UNWIND(fstat, frame, op_ret, op_errno, NULL, NULL);
+    /* failures are reported through the unwind; return 0 like the other fops */
+    return 0;
+}
+
+/* opendir callback: tag fds opened on the special directory, then unwind. */
+static int32_t
+gf_svc_opendir_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+                   int32_t op_ret, int32_t op_errno, fd_t *fd, dict_t *xdata)
+{
+    svc_fd_t *svc_fd = NULL;
+    svc_local_t *local = NULL;
+    svc_private_t *priv = NULL;
+    gf_boolean_t special_dir = _gf_false;
+    char path[PATH_MAX] = {
+        0,
+    };
+
+    GF_VALIDATE_OR_GOTO("snapview-client", this, out);
+    GF_VALIDATE_OR_GOTO(this->name, this->private, out);
+    if (op_ret)
+        goto out;
+
+    priv = this->private;
+    local = frame->local;
+
+    if (local->subvolume == FIRST_CHILD(this) && priv->special_dir &&
+        strcmp(priv->special_dir, "")) {
+        if (!__is_root_gfid(fd->inode->gfid))
+            snprintf(path, sizeof(path), "%s/.", priv->special_dir);
+        else
+            snprintf(path, sizeof(path), "/.");
+
+        if (!strcmp(local->loc.path, priv->special_dir) ||
+            !strcmp(local->loc.path, path)) {
+            gf_msg_debug(this->name, 0,
+                         "got opendir on special directory"
+                         " %s (gfid: %s)",
+                         path, uuid_utoa(fd->inode->gfid));
+            special_dir = _gf_true;
+        }
+    }
+
+    if (special_dir) {
+        svc_fd = svc_fd_ctx_get_or_new(this, fd);
+        if (!svc_fd) {
+            gf_smsg(this->name, GF_LOG_ERROR, 0, SVC_MSG_GET_FD_CONTEXT_FAILED,
+                    "gfid=%s", uuid_utoa(fd->inode->gfid), NULL);
+            goto out;
+        }
+
+        svc_fd->last_offset = -1;
+        svc_fd->special_dir = special_dir;
+    }
+
+out:
+    /* frame->local was set in gf_svc_opendir(); unwind like other cbks */
+    SVC_STACK_UNWIND(opendir, frame, op_ret, op_errno, fd, xdata);
+    return 0;
+}
+
+/* If the inode represents a directory which is actually
+   present in a snapshot, then opendir on that directory
+   should be sent to the snap-view-server which opens
+   the directory in the corresponding graph.
+   In fact any opendir call on a virtual directory
+   should be sent to svs. Because if it fakes success
+   here, then later when readdir on that fd comes, there
+   will not be any corresponding fd opened on svs and
+   svc has to do things that open-behind is doing.
+*/
+static int32_t
+gf_svc_opendir(call_frame_t *frame, xlator_t *this, loc_t *loc, fd_t *fd,
+               dict_t *xdata)
+{
+    int32_t ret = -1;
+    int inode_type = -1;
+    xlator_t *subvolume = NULL;
+    int op_ret = -1;
+    int op_errno = EINVAL;
+    gf_boolean_t wind = _gf_false;
+    svc_local_t *local = NULL;
+
+    GF_VALIDATE_OR_GOTO("svc", this, out);
+    GF_VALIDATE_OR_GOTO(this->name, frame, out);
+    GF_VALIDATE_OR_GOTO(this->name, loc, out);
+    GF_VALIDATE_OR_GOTO(this->name, loc->inode, out);
+    GF_VALIDATE_OR_GOTO(this->name, fd, out);
+
+    local = mem_get0(this->local_pool);
+    if (!local) {
+        op_errno = ENOMEM;
+        gf_smsg(this->name, GF_LOG_ERROR, op_errno, SVC_MSG_NO_MEMORY,
+                "path=%s", loc->path, "gfid=%s", uuid_utoa(fd->inode->gfid),
+                NULL);
+        goto out;
+    }
+    loc_copy(&local->loc, loc);
+    frame->local = local;
+
+    SVC_GET_SUBVOL_FROM_CTX(this, op_ret, op_errno, inode_type, ret, loc->inode,
+                            subvolume, out);
+    local->subvolume = subvolume;
+
+    STACK_WIND(frame, gf_svc_opendir_cbk, subvolume, subvolume->fops->opendir,
+               loc, fd, xdata);
+
+    wind = _gf_true;
+
+out:
+    if (!wind)
+        SVC_STACK_UNWIND(opendir, frame, op_ret, op_errno, NULL, NULL);
+
+    return 0;
+}
+
+/* setattr: forwarded to the first child, and only for normal inodes. */
+static int32_t
+gf_svc_setattr(call_frame_t *frame, xlator_t *this, loc_t *loc,
+               struct iatt *stbuf, int32_t valid, dict_t *xdata)
+{
+    int32_t ret = -1;
+    int inode_type = -1;
+    int op_ret = -1;
+    int op_errno = EINVAL;
+    gf_boolean_t wind = _gf_false;
+
+    GF_VALIDATE_OR_GOTO("svc", this, out);
+    GF_VALIDATE_OR_GOTO(this->name, frame, out);
+    GF_VALIDATE_OR_GOTO(this->name, loc, out);
+    GF_VALIDATE_OR_GOTO(this->name, loc->inode, out);
+
+    ret = svc_inode_ctx_get(this, loc->inode, &inode_type);
+    if (ret < 0) {
+        op_ret = -1;
+        op_errno = EINVAL;
+        gf_smsg(this->name, GF_LOG_ERROR, op_errno,
+                SVC_MSG_GET_INODE_CONTEXT_FAILED, "path=%s", loc->path,
+                "gfid=%s", uuid_utoa(loc->inode->gfid), NULL);
+        goto out;
+    }
+    /* only inodes of the normal graph may be modified; the rest get EROFS */
+    if (inode_type == NORMAL_INODE) {
+        STACK_WIND_TAIL(frame, FIRST_CHILD(this),
+                        FIRST_CHILD(this)->fops->setattr, loc, stbuf, valid,
+                        xdata);
+    } else {
+        op_ret = -1;
+        op_errno = EROFS;
+        goto out;
+    }
+
+    wind = _gf_true;
+
+out:
+    if (!wind)
+        SVC_STACK_UNWIND(setattr, frame, op_ret, op_errno, NULL, NULL, NULL);
+    return 0;
+}
+
+/* XXX: This function is currently not used. Remove "#if 0" when required */
+#if 0
+static int32_t
+gf_svc_fsetattr (call_frame_t *frame, xlator_t *this, fd_t *fd,
+                 struct iatt *stbuf, int32_t valid, dict_t *xdata)
+{
+        int32_t      ret        = -1;
+        int          inode_type = -1;
+        int          op_ret     = -1;
+        int          op_errno   = EINVAL;
+        gf_boolean_t wind       = _gf_false;
+
+        GF_VALIDATE_OR_GOTO ("svc", this, out);
+        GF_VALIDATE_OR_GOTO (this->name, frame, out);
+        GF_VALIDATE_OR_GOTO (this->name, fd, out);
+        GF_VALIDATE_OR_GOTO (this->name, fd->inode, out);
+
+        ret = svc_inode_ctx_get (this, fd->inode, &inode_type);
+        if (ret < 0) {
+                op_ret = -1;
+                op_errno = EINVAL;
+                gf_msg (this->name, GF_LOG_ERROR, op_errno,
+                        SVC_MSG_GET_INODE_CONTEXT_FAILED, "failed to "
+                        "get the inode context for %s",
+                        uuid_utoa (fd->inode->gfid));
+                goto out;
+        }
+
+        if (inode_type == NORMAL_INODE) {
+                STACK_WIND_TAIL (frame, FIRST_CHILD (this),
+                                 FIRST_CHILD (this)->fops->fsetattr, fd, stbuf,
+                                 valid, xdata);
+        } else {
+                op_ret = -1;
+                op_errno = EROFS;
+                goto out;
+        }
+
+        wind = _gf_true;
+
+out:
+        if (!wind)
+                SVC_STACK_UNWIND (fsetattr, frame, op_ret, op_errno,
+                                  NULL, NULL, NULL);
+        return 0;
+}
+#endif /* gf_svc_fsetattr() is not used */
+
+static int32_t
+gf_svc_getxattr(call_frame_t *frame, xlator_t *this, loc_t *loc,
+                const char *name, dict_t *xdata)
+{
+    int32_t ret = -1;
+    int inode_type = -1;
+    xlator_t *subvolume = NULL;
+    int op_ret = -1;
+    int op_errno = EINVAL;
+    gf_boolean_t wind = _gf_false;
+    svc_private_t *priv = NULL;
+    const char *value = NULL;
+    size_t key_len = strlen(GF_XATTR_GET_REAL_FILENAME_KEY);
+    size_t value_len = 0;
+    dict_t *dict = NULL;
+    char entry_point[NAME_MAX + 1] = {
+        0,
+    };
+
+    GF_VALIDATE_OR_GOTO("svc", this, out);
+    GF_VALIDATE_OR_GOTO(this->name, frame, out);
+    GF_VALIDATE_OR_GOTO(this->name, loc, out);
+    GF_VALIDATE_OR_GOTO(this->name, loc->inode, out);
+    priv = this->private;
+    GF_VALIDATE_OR_GOTO(this->name, priv, out);
+
+    /*
+     * Samba sends this special key for case insensitive
+     * filename check. This request comes with a parent
+     * path and with a special key GF_XATTR_GET_REAL_FILENAME_KEY.
+     * e.g. "glusterfs.get_real_filename:.snaps".
+     * If the part after the key (up to any '@') matches the entry
+     * point, we have to send back .snaps as the real filename.
+     */
+    if (!name || strncmp(name, GF_XATTR_GET_REAL_FILENAME_KEY, key_len))
+        goto stack_wind;
+
+    /* Match in place: @name is caller supplied, so never copy it unbounded. */
+    value = name + key_len;
+    value_len = strcspn(value, "@");
+    if (gf_svc_get_entry_point(this, entry_point, sizeof(entry_point))) {
+        gf_smsg(this->name, GF_LOG_WARNING, op_errno,
+                SVC_MSG_COPY_ENTRY_POINT_FAILED, NULL);
+        goto out;
+    }
+
+    if (strlen(entry_point) == value_len &&
+        !strncasecmp(value, entry_point, value_len)) {
+        dict = dict_new();
+        if (NULL == dict) {
+            op_errno = ENOMEM;
+            goto out;
+        }
+
+        ret = dict_set_dynstr_with_alloc(dict, (char *)name, entry_point);
+
+        if (ret) {
+            op_errno = ENOMEM;
+            goto out;
+        }
+
+        op_errno = 0;
+        op_ret = strlen(entry_point) + 1;
+        /* We should return from here */
+        goto out;
+    }
+stack_wind:
+    SVC_GET_SUBVOL_FROM_CTX(this, op_ret, op_errno, inode_type, ret, loc->inode,
+                            subvolume, out);
+
+    STACK_WIND_TAIL(frame, subvolume, subvolume->fops->getxattr, loc, name,
+                    xdata);
+
+    wind = _gf_true;
+
+out:
+    if (!wind)
+        SVC_STACK_UNWIND(getxattr, frame, op_ret, op_errno, dict, NULL);
+
+    if (dict)
+        dict_unref(dict);
+
+    return 0;
+}
+
+/* XXX: This function is currently not used. Remove "#if 0" when required */
+#if 0
+static int32_t
+gf_svc_fgetxattr (call_frame_t *frame, xlator_t *this, fd_t *fd,
+                  const char *name, dict_t *xdata)
+{
+        int32_t       ret        = -1;
+        int           inode_type = -1;
+        xlator_t     *subvolume  = NULL;
+        gf_boolean_t  wind       = _gf_false;
+        int           op_ret     = -1;
+        int           op_errno   = EINVAL;
+
+        GF_VALIDATE_OR_GOTO ("svc", this, out);
+        GF_VALIDATE_OR_GOTO (this->name, frame, out);
+        GF_VALIDATE_OR_GOTO (this->name, fd, out);
+        GF_VALIDATE_OR_GOTO (this->name, fd->inode, out);
+
+        SVC_GET_SUBVOL_FROM_CTX (this, op_ret, op_errno, inode_type, ret,
+                                 fd->inode, subvolume, out);
+
+        STACK_WIND_TAIL (frame, subvolume,
+                         subvolume->fops->fgetxattr, fd, name, xdata);
+
+        wind = _gf_true;
+
+out:
+        if (!wind)
+                SVC_STACK_UNWIND (fgetxattr, frame, op_ret, op_errno,
+                                  NULL, NULL);
+        return 0;
+}
+#endif /* gf_svc_fgetxattr() is not used */
+
+static int32_t
+gf_svc_setxattr(call_frame_t *frame, xlator_t *this, loc_t *loc, dict_t *dict,
+                int32_t flags, dict_t *xdata)
+{
+    int32_t ret = -1;
+    int inode_type = -1;
+    int op_ret = -1;
+    int op_errno = EINVAL;
+    gf_boolean_t wind = _gf_false;
+
+    GF_VALIDATE_OR_GOTO("svc", this, out);
+    GF_VALIDATE_OR_GOTO(this->name, frame, out);
+    GF_VALIDATE_OR_GOTO(this->name, loc, out);
+    GF_VALIDATE_OR_GOTO(this->name, loc->inode, out);
+
+    ret = svc_inode_ctx_get(this, loc->inode, &inode_type);
+    if (ret < 0) {
+        op_ret = -1;
+        op_errno = EINVAL;
+        gf_smsg(this->name, GF_LOG_ERROR, op_errno,
+                SVC_MSG_GET_INODE_CONTEXT_FAILED, "name=%s", loc->name,
+                "gfid=%s", uuid_utoa(loc->inode->gfid), NULL);
+        goto out;
+    }
+
+    if (inode_type == NORMAL_INODE) {
+        STACK_WIND_TAIL(frame, FIRST_CHILD(this),
+                        FIRST_CHILD(this)->fops->setxattr, loc, dict, flags,
+                        xdata);
+    } else {
+        op_ret = -1;
+        op_errno = EROFS;
+        goto out;
+    }
+
+    wind = _gf_true;
+
+out:
+    if (!wind)
+        SVC_STACK_UNWIND(setxattr, frame, op_ret, op_errno, NULL);
+
+    return 0;
+}
+
+static int32_t
+gf_svc_fsetxattr(call_frame_t *frame, xlator_t *this, fd_t *fd, dict_t *dict,
+                 int32_t flags, dict_t *xdata)
+{
+    int32_t ret = -1;
+    int inode_type = -1;
+    int op_ret = -1;
+    int op_errno = EINVAL;
+    gf_boolean_t wind = _gf_false;
+
+    GF_VALIDATE_OR_GOTO("svc", this, out);
+    GF_VALIDATE_OR_GOTO(this->name, frame, out);
+    GF_VALIDATE_OR_GOTO(this->name, fd, out);
+    GF_VALIDATE_OR_GOTO(this->name, fd->inode, out);
+
+    ret = svc_inode_ctx_get(this, fd->inode, &inode_type);
+    if (ret < 0) {
+        op_ret = -1;
+        op_errno = EINVAL;
+        gf_smsg(this->name, GF_LOG_ERROR, op_errno,
+                SVC_MSG_GET_INODE_CONTEXT_FAILED, "gfid=%s",
+                uuid_utoa(fd->inode->gfid), NULL);
+        goto out;
+    }
+
+    if (inode_type == NORMAL_INODE) {
+        STACK_WIND_TAIL(frame, FIRST_CHILD(this),
+                        FIRST_CHILD(this)->fops->fsetxattr, fd, dict, flags,
+                        xdata);
+    } else {
+        op_ret = -1;
+        op_errno = EROFS;
+        goto out;
+    }
+
+    wind = _gf_true;
+
+out:
+    if (!wind)
+        STACK_UNWIND_STRICT(fsetxattr, frame, op_ret, op_errno, NULL);
+
+    return 0;
+}
+
+/*
+ * rmdir fop.
+ *
+ * Winds the call to the first child when loc->inode carries a NORMAL_INODE
+ * context. Any other inode type is refused with EROFS, and a failed
+ * context lookup (or an invalid argument) with EINVAL.
+ */
+static int32_t
+gf_svc_rmdir(call_frame_t *frame, xlator_t *this, loc_t *loc, int flags,
+             dict_t *xdata)
+{
+    int ret = -1;
+    int inode_type = -1;
+    int op_errno = EINVAL;
+    int op_ret = -1;
+
+    GF_VALIDATE_OR_GOTO("svc", this, out);
+    GF_VALIDATE_OR_GOTO(this->name, frame, out);
+    GF_VALIDATE_OR_GOTO(this->name, loc, out);
+    GF_VALIDATE_OR_GOTO(this->name, loc->inode, out);
+
+    ret = svc_inode_ctx_get(this, loc->inode, &inode_type);
+    if (ret < 0) {
+        op_ret = -1;
+        op_errno = EINVAL;
+        gf_smsg(this->name, GF_LOG_ERROR, op_errno,
+                SVC_MSG_GET_INODE_CONTEXT_FAILED, "name=%s", loc->name,
+                "gfid=%s", uuid_utoa(loc->inode->gfid), NULL);
+        goto out;
+    }
+
+    /* Only normal inodes may be removed. */
+    if (inode_type != NORMAL_INODE) {
+        op_ret = -1;
+        op_errno = EROFS;
+        goto out;
+    }
+
+    STACK_WIND_TAIL(frame, FIRST_CHILD(this),
+                    FIRST_CHILD(this)->fops->rmdir, loc, flags, xdata);
+    return 0;
+
+out:
+    /* Reached only when the call was not wound. */
+    SVC_STACK_UNWIND(rmdir, frame, op_ret, op_errno, NULL, NULL, NULL);
+    return 0;
+}
+
+/*
+ * mkdir callback.
+ *
+ * On success the new inode is tagged NORMAL_INODE in its inode context; a
+ * failure to set the context is only logged. The result received from the
+ * child is unwound unchanged in every case.
+ */
+static int32_t
+gf_svc_mkdir_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+                 int32_t op_ret, int32_t op_errno, inode_t *inode,
+                 struct iatt *buf, struct iatt *preparent,
+                 struct iatt *postparent, dict_t *xdata)
+{
+    int inode_type = NORMAL_INODE;
+    int ret = -1;
+
+    if (op_ret >= 0) {
+        ret = svc_inode_ctx_set(this, inode, inode_type);
+        if (ret)
+            gf_smsg(this->name, GF_LOG_ERROR, 0,
+                    SVC_MSG_SET_INODE_CONTEXT_FAILED, NULL);
+    }
+
+    SVC_STACK_UNWIND(mkdir, frame, op_ret, op_errno, inode, buf, preparent,
+                     postparent, xdata);
+    return 0;
+}
+
+/*
+ * mkdir fop.
+ *
+ * A directory may be created only when the parent carries a NORMAL_INODE
+ * context and the new name differs from the configured entry point. In
+ * that case the call is wound to the first child with gf_svc_mkdir_cbk as
+ * callback; otherwise it is refused with EROFS. Invalid arguments, a failed
+ * context lookup or a failure to copy the entry point yield EINVAL.
+ */
+static int32_t
+gf_svc_mkdir(call_frame_t *frame, xlator_t *this, loc_t *loc, mode_t mode,
+             mode_t umask, dict_t *xdata)
+{
+    int parent_type = -1;
+    int ret = -1;
+    int op_ret = -1;
+    int op_errno = EINVAL;
+    char entry_point[NAME_MAX + 1] = {0};
+
+    GF_VALIDATE_OR_GOTO("svc", this, out);
+    GF_VALIDATE_OR_GOTO(this->name, frame, out);
+    GF_VALIDATE_OR_GOTO(this->name, loc, out);
+    GF_VALIDATE_OR_GOTO(this->name, loc->inode, out);
+    /*
+     * loc->parent and loc->name are dereferenced below (context lookup,
+     * error log and strcmp), so reject a loc that lacks either of them
+     * instead of crashing.
+     */
+    GF_VALIDATE_OR_GOTO(this->name, loc->parent, out);
+    GF_VALIDATE_OR_GOTO(this->name, loc->name, out);
+
+    ret = svc_inode_ctx_get(this, loc->parent, &parent_type);
+    if (ret < 0) {
+        op_ret = -1;
+        op_errno = EINVAL;
+        gf_smsg(this->name, GF_LOG_ERROR, op_errno,
+                SVC_MSG_GET_INODE_CONTEXT_FAILED, "gfid=%s",
+                uuid_utoa(loc->parent->gfid), NULL);
+        goto out;
+    }
+
+    if (gf_svc_get_entry_point(this, entry_point, sizeof(entry_point))) {
+        gf_smsg(this->name, GF_LOG_WARNING, op_errno,
+                SVC_MSG_COPY_ENTRY_POINT_FAILED, NULL);
+        goto out;
+    }
+
+    /* Refuse to create the entry point itself or anything below a
+     * parent that is not a normal inode. */
+    if (!strcmp(loc->name, entry_point) || parent_type != NORMAL_INODE) {
+        op_ret = -1;
+        op_errno = EROFS;
+        goto out;
+    }
+
+    STACK_WIND(frame, gf_svc_mkdir_cbk, FIRST_CHILD(this),
+               FIRST_CHILD(this)->fops->mkdir, loc, mode, umask, xdata);
+    return 0;
+
+out:
+    /* Reached only when the call was not wound. */
+    SVC_STACK_UNWIND(mkdir, frame, op_ret, op_errno, NULL, NULL, NULL, NULL,
+                     NULL);
+    return 0;
+}
+
+/*
+ * mknod callback.
+ *
+ * On success the new inode is tagged NORMAL_INODE in its inode context; a
+ * failure to set the context is only logged. The result received from the
+ * child is unwound unchanged in every case.
+ */
+static int32_t
+gf_svc_mknod_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+                 int32_t op_ret, int32_t op_errno, inode_t *inode,
+                 struct iatt *buf, struct iatt *preparent,
+                 struct iatt *postparent, dict_t *xdata)
+{
+    int inode_type = NORMAL_INODE;
+    int ret = -1;
+
+    if (op_ret >= 0) {
+        ret = svc_inode_ctx_set(this, inode, inode_type);
+        if (ret)
+            gf_smsg(this->name, GF_LOG_ERROR, 0,
+                    SVC_MSG_SET_INODE_CONTEXT_FAILED, NULL);
+    }
+
+    SVC_STACK_UNWIND(mknod, frame, op_ret, op_errno, inode, buf, preparent,
+                     postparent, xdata);
+    return 0;
+}
+
+/*
+ * mknod fop.
+ *
+ * A node may be created only when the parent carries a NORMAL_INODE
+ * context and the new name differs from the configured entry point. In
+ * that case the call is wound to the first child with gf_svc_mknod_cbk as
+ * callback; otherwise it is refused with EROFS. Invalid arguments, a failed
+ * context lookup or a failure to copy the entry point yield EINVAL.
+ */
+static int32_t
+gf_svc_mknod(call_frame_t *frame, xlator_t *this, loc_t *loc, mode_t mode,
+             dev_t rdev, mode_t umask, dict_t *xdata)
+{
+    int parent_type = -1;
+    int ret = -1;
+    int op_ret = -1;
+    int op_errno = EINVAL;
+    char entry_point[NAME_MAX + 1] = {0};
+
+    GF_VALIDATE_OR_GOTO("svc", this, out);
+    GF_VALIDATE_OR_GOTO(this->name, frame, out);
+    GF_VALIDATE_OR_GOTO(this->name, loc, out);
+    GF_VALIDATE_OR_GOTO(this->name, loc->inode, out);
+    /*
+     * loc->parent and loc->name are dereferenced below (context lookup,
+     * error log and strcmp), so reject a loc that lacks either of them
+     * instead of crashing.
+     */
+    GF_VALIDATE_OR_GOTO(this->name, loc->parent, out);
+    GF_VALIDATE_OR_GOTO(this->name, loc->name, out);
+
+    ret = svc_inode_ctx_get(this, loc->parent, &parent_type);
+    if (ret < 0) {
+        op_ret = -1;
+        op_errno = EINVAL;
+        gf_smsg(this->name, GF_LOG_ERROR, op_errno,
+                SVC_MSG_GET_INODE_CONTEXT_FAILED, "gfid=%s",
+                uuid_utoa(loc->parent->gfid), NULL);
+        goto out;
+    }
+
+    if (gf_svc_get_entry_point(this, entry_point, sizeof(entry_point))) {
+        gf_smsg(this->name, GF_LOG_WARNING, op_errno,
+                SVC_MSG_COPY_ENTRY_POINT_FAILED, NULL);
+        goto out;
+    }
+
+    /* Refuse to create the entry point itself or anything below a
+     * parent that is not a normal inode. */
+    if (!strcmp(loc->name, entry_point) || parent_type != NORMAL_INODE) {
+        op_ret = -1;
+        op_errno = EROFS;
+        goto out;
+    }
+
+    STACK_WIND(frame, gf_svc_mknod_cbk, FIRST_CHILD(this),
+               FIRST_CHILD(this)->fops->mknod, loc, mode, rdev, umask,
+               xdata);
+    return 0;
+
+out:
+    /* Reached only when the call was not wound. */
+    SVC_STACK_UNWIND(mknod, frame, op_ret, op_errno, NULL, NULL, NULL, NULL,
+                     NULL);
+    return 0;
+}
+
+/* If the flags of the open call contain O_WRONLY or O_RDWR and the inode is
+   a virtual inode, then unwind the call back with EROFS. Otherwise simply
+   STACK_WIND the call to the subvolume selected from the inode context
+   (the first child of svc xlator for normal inodes).
+   Invalid arguments or a missing inode context unwind with EINVAL.
+*/
+static int32_t
+gf_svc_open(call_frame_t *frame, xlator_t *this, loc_t *loc, int32_t flags,
+            fd_t *fd, dict_t *xdata)
+{
+    xlator_t *subvolume = NULL;
+    int inode_type = -1;
+    int op_ret = -1;
+    int op_errno = EINVAL;
+    int ret = -1;
+    gf_boolean_t wind = _gf_false;
+
+    GF_VALIDATE_OR_GOTO("svc", this, out);
+    GF_VALIDATE_OR_GOTO(this->name, frame, out);
+    GF_VALIDATE_OR_GOTO(this->name, loc, out);
+    GF_VALIDATE_OR_GOTO(this->name, loc->inode, out);
+    GF_VALIDATE_OR_GOTO(this->name, fd, out);
+
+    /* Another way is to STACK_WIND to normal subvolume, if inode
+       type is not there in the context. If the file actually resides
+       in snapshots, then ENOENT would be returned. Needs more analysis.
+    */
+    SVC_GET_SUBVOL_FROM_CTX(this, op_ret, op_errno, inode_type, ret, loc->inode,
+                            subvolume, out);
+
+    if (((flags & O_ACCMODE) == O_WRONLY) || ((flags & O_ACCMODE) == O_RDWR)) {
+        if (subvolume != FIRST_CHILD(this)) {
+            /* Write access outside the first child is refused with the
+             * same errno the other modifying fops use (EROFS), as the
+             * comment above this function documents. */
+            op_ret = -1;
+            op_errno = EROFS;
+            goto out;
+        }
+    }
+
+    STACK_WIND_TAIL(frame, subvolume, subvolume->fops->open, loc, flags, fd,
+                    xdata);
+
+    wind = _gf_true;
+
+out:
+    if (!wind)
+        SVC_STACK_UNWIND(open, frame, op_ret, op_errno, NULL, NULL);
+    return 0;
+}
+
+/*
+ * create callback.
+ *
+ * On success the new inode is tagged NORMAL_INODE in its inode context; a
+ * failure to set the context is only logged. The result received from the
+ * child is unwound unchanged in every case.
+ */
+static int32_t
+gf_svc_create_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+                  int32_t op_ret, int32_t op_errno, fd_t *fd, inode_t *inode,
+                  struct iatt *stbuf, struct iatt *preparent,
+                  struct iatt *postparent, dict_t *xdata)
+{
+    int inode_type = NORMAL_INODE;
+    int ret = -1;
+
+    if (op_ret >= 0) {
+        ret = svc_inode_ctx_set(this, inode, inode_type);
+        if (ret)
+            gf_smsg(this->name, GF_LOG_ERROR, 0,
+                    SVC_MSG_SET_INODE_CONTEXT_FAILED, NULL);
+    }
+
+    SVC_STACK_UNWIND(create, frame, op_ret, op_errno, fd, inode, stbuf,
+                     preparent, postparent, xdata);
+
+    return 0;
+}
+
+/*
+ * create fop.
+ *
+ * A file may be created only when the parent carries a NORMAL_INODE
+ * context and the new name differs from the configured entry point. In
+ * that case the call is wound to the first child with gf_svc_create_cbk as
+ * callback; otherwise it is refused with EROFS. Invalid arguments, a failed
+ * context lookup or a failure to copy the entry point yield EINVAL.
+ */
+static int32_t
+gf_svc_create(call_frame_t *frame, xlator_t *this, loc_t *loc, int32_t flags,
+              mode_t mode, mode_t umask, fd_t *fd, dict_t *xdata)
+{
+    int parent_type = -1;
+    int ret = -1;
+    int op_ret = -1;
+    int op_errno = EINVAL;
+    char entry_point[NAME_MAX + 1] = {0};
+
+    GF_VALIDATE_OR_GOTO("svc", this, out);
+    GF_VALIDATE_OR_GOTO(this->name, frame, out);
+    GF_VALIDATE_OR_GOTO(this->name, loc, out);
+    GF_VALIDATE_OR_GOTO(this->name, loc->inode, out);
+    GF_VALIDATE_OR_GOTO(this->name, fd, out);
+    /*
+     * loc->parent and loc->name are dereferenced below (context lookup,
+     * error log and strcmp), so reject a loc that lacks either of them
+     * instead of crashing.
+     */
+    GF_VALIDATE_OR_GOTO(this->name, loc->parent, out);
+    GF_VALIDATE_OR_GOTO(this->name, loc->name, out);
+
+    ret = svc_inode_ctx_get(this, loc->parent, &parent_type);
+    if (ret < 0) {
+        op_ret = -1;
+        op_errno = EINVAL;
+        gf_smsg(this->name, GF_LOG_ERROR, op_errno,
+                SVC_MSG_GET_INODE_CONTEXT_FAILED, "gfid=%s",
+                uuid_utoa(loc->parent->gfid), NULL);
+        goto out;
+    }
+
+    if (gf_svc_get_entry_point(this, entry_point, sizeof(entry_point))) {
+        gf_smsg(this->name, GF_LOG_WARNING, op_errno,
+                SVC_MSG_COPY_ENTRY_POINT_FAILED, NULL);
+        goto out;
+    }
+
+    /* Refuse to create the entry point itself or anything below a
+     * parent that is not a normal inode. */
+    if (!strcmp(loc->name, entry_point) || parent_type != NORMAL_INODE) {
+        op_ret = -1;
+        op_errno = EROFS;
+        goto out;
+    }
+
+    STACK_WIND(frame, gf_svc_create_cbk, FIRST_CHILD(this),
+               FIRST_CHILD(this)->fops->create, loc, flags, mode, umask, fd,
+               xdata);
+    return 0;
+
+out:
+    /* Reached only when the call was not wound. */
+    SVC_STACK_UNWIND(create, frame, op_ret, op_errno, NULL, NULL, NULL,
+                     NULL, NULL, NULL);
+    return 0;
+}
+
+/*
+ * symlink callback.
+ *
+ * On success the new inode is tagged NORMAL_INODE in its inode context; a
+ * failure to set the context is only logged. The result received from the
+ * child is unwound unchanged in every case.
+ */
+static int32_t
+gf_svc_symlink_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+                   int32_t op_ret, int32_t op_errno, inode_t *inode,
+                   struct iatt *buf, struct iatt *preparent,
+                   struct iatt *postparent, dict_t *xdata)
+{
+    int inode_type = NORMAL_INODE;
+    int ret = -1;
+
+    if (op_ret >= 0) {
+        ret = svc_inode_ctx_set(this, inode, inode_type);
+        if (ret)
+            gf_smsg(this->name, GF_LOG_ERROR, 0,
+                    SVC_MSG_SET_INODE_CONTEXT_FAILED, NULL);
+    }
+
+    SVC_STACK_UNWIND(symlink, frame, op_ret, op_errno, inode, buf, preparent,
+                     postparent, xdata);
+
+    return 0;
+}
+
+/*
+ * symlink fop.
+ *
+ * A symlink may be created only when the parent carries a NORMAL_INODE
+ * context and the new name differs from the configured entry point. In
+ * that case the call is wound to the first child with gf_svc_symlink_cbk
+ * as callback; otherwise it is refused with EROFS. Invalid arguments, a
+ * failed context lookup or a failure to copy the entry point yield EINVAL.
+ */
+static int32_t
+gf_svc_symlink(call_frame_t *frame, xlator_t *this, const char *linkpath,
+               loc_t *loc, mode_t umask, dict_t *xdata)
+{
+    int parent_type = -1;
+    int op_ret = -1;
+    int op_errno = EINVAL;
+    int ret = -1;
+    char entry_point[NAME_MAX + 1] = {0};
+
+    GF_VALIDATE_OR_GOTO("svc", this, out);
+    GF_VALIDATE_OR_GOTO(this->name, frame, out);
+    GF_VALIDATE_OR_GOTO(this->name, loc, out);
+    GF_VALIDATE_OR_GOTO(this->name, loc->inode, out);
+    /*
+     * loc->parent and loc->name are dereferenced below (context lookup,
+     * error log and strcmp), so reject a loc that lacks either of them
+     * instead of crashing.
+     */
+    GF_VALIDATE_OR_GOTO(this->name, loc->parent, out);
+    GF_VALIDATE_OR_GOTO(this->name, loc->name, out);
+
+    ret = svc_inode_ctx_get(this, loc->parent, &parent_type);
+    if (ret < 0) {
+        op_ret = -1;
+        op_errno = EINVAL;
+        gf_smsg(this->name, GF_LOG_ERROR, op_errno,
+                SVC_MSG_GET_INODE_CONTEXT_FAILED, "gfid=%s",
+                uuid_utoa(loc->parent->gfid), NULL);
+        goto out;
+    }
+
+    if (gf_svc_get_entry_point(this, entry_point, sizeof(entry_point))) {
+        gf_smsg(this->name, GF_LOG_WARNING, op_errno,
+                SVC_MSG_COPY_ENTRY_POINT_FAILED, NULL);
+        goto out;
+    }
+
+    /* Refuse to create the entry point itself or anything below a
+     * parent that is not a normal inode. */
+    if (!strcmp(loc->name, entry_point) || parent_type != NORMAL_INODE) {
+        op_ret = -1;
+        op_errno = EROFS;
+        goto out;
+    }
+
+    STACK_WIND(frame, gf_svc_symlink_cbk, FIRST_CHILD(this),
+               FIRST_CHILD(this)->fops->symlink, linkpath, loc, umask,
+               xdata);
+    return 0;
+
+out:
+    /* Reached only when the call was not wound. */
+    SVC_STACK_UNWIND(symlink, frame, op_ret, op_errno, NULL, NULL, NULL,
+                     NULL, NULL);
+    return 0;
+}
+
+/*
+ * unlink fop.
+ *
+ * Winds the call to the first child when loc->inode carries a NORMAL_INODE
+ * context. Any other inode type is refused with EROFS, and a failed
+ * context lookup (or an invalid argument) with EINVAL.
+ */
+static int32_t
+gf_svc_unlink(call_frame_t *frame, xlator_t *this, loc_t *loc, int flags,
+              dict_t *xdata)
+{
+    int inode_type = -1;
+    int op_ret = -1;
+    int op_errno = EINVAL;
+    int ret = -1;
+
+    GF_VALIDATE_OR_GOTO("svc", this, out);
+    GF_VALIDATE_OR_GOTO(this->name, frame, out);
+    GF_VALIDATE_OR_GOTO(this->name, loc, out);
+    GF_VALIDATE_OR_GOTO(this->name, loc->inode, out);
+
+    ret = svc_inode_ctx_get(this, loc->inode, &inode_type);
+    if (ret < 0) {
+        op_ret = -1;
+        op_errno = EINVAL;
+        /* Log the gfid of the inode whose context lookup failed.
+         * loc->parent is not validated here and must not be
+         * dereferenced. */
+        gf_smsg(this->name, GF_LOG_ERROR, op_errno,
+                SVC_MSG_GET_INODE_CONTEXT_FAILED, "gfid=%s",
+                uuid_utoa(loc->inode->gfid), NULL);
+        goto out;
+    }
+
+    /* Only normal inodes may be removed. */
+    if (inode_type != NORMAL_INODE) {
+        op_ret = -1;
+        op_errno = EROFS;
+        goto out;
+    }
+
+    STACK_WIND_TAIL(frame, FIRST_CHILD(this),
+                    FIRST_CHILD(this)->fops->unlink, loc, flags, xdata);
+    return 0;
+
+out:
+    /* Reached only when the call was not wound. */
+    SVC_STACK_UNWIND(unlink, frame, op_ret, op_errno, NULL, NULL, NULL);
+    return 0;
+}
+
+/*
+ * readv fop.
+ *
+ * Picks the subvolume recorded for fd->inode via SVC_GET_SUBVOL_FROM_CTX
+ * and winds the read to it. If an argument is invalid or the macro jumps
+ * to the out label, the call is unwound with the current op_ret/op_errno
+ * (initially -1/EINVAL).
+ */
+static int32_t
+gf_svc_readv(call_frame_t *frame, xlator_t *this, fd_t *fd, size_t size,
+             off_t offset, uint32_t flags, dict_t *xdata)
+{
+    xlator_t *subvolume = NULL;
+    int inode_type = -1;
+    int op_errno = EINVAL;
+    int op_ret = -1;
+    int ret = -1;
+
+    GF_VALIDATE_OR_GOTO("svc", this, out);
+    GF_VALIDATE_OR_GOTO(this->name, frame, out);
+    GF_VALIDATE_OR_GOTO(this->name, fd, out);
+    GF_VALIDATE_OR_GOTO(this->name, fd->inode, out);
+
+    SVC_GET_SUBVOL_FROM_CTX(this, op_ret, op_errno, inode_type, ret, fd->inode,
+                            subvolume, out);
+
+    STACK_WIND_TAIL(frame, subvolume, subvolume->fops->readv, fd, size, offset,
+                    flags, xdata);
+    return 0;
+
+out:
+    /* Reached only when the call was not wound. */
+    SVC_STACK_UNWIND(readv, frame, op_ret, op_errno, NULL, 0, NULL, NULL,
+                     NULL);
+    return 0;
+}
+
+/*
+ * readlink fop.
+ *
+ * Picks the subvolume recorded for loc->inode via SVC_GET_SUBVOL_FROM_CTX
+ * and winds the call to it. If an argument is invalid or the macro jumps
+ * to the out label, the call is unwound with the current op_ret/op_errno
+ * (initially -1/EINVAL).
+ */
+static int32_t
+gf_svc_readlink(call_frame_t *frame, xlator_t *this, loc_t *loc, size_t size,
+                dict_t *xdata)
+{
+    xlator_t *subvolume = NULL;
+    int inode_type = -1;
+    int op_errno = EINVAL;
+    int op_ret = -1;
+    int ret = -1;
+
+    GF_VALIDATE_OR_GOTO("svc", this, out);
+    GF_VALIDATE_OR_GOTO(this->name, frame, out);
+    GF_VALIDATE_OR_GOTO(this->name, loc, out);
+    GF_VALIDATE_OR_GOTO(this->name, loc->inode, out);
+
+    SVC_GET_SUBVOL_FROM_CTX(this, op_ret, op_errno, inode_type, ret, loc->inode,
+                            subvolume, out);
+
+    STACK_WIND_TAIL(frame, subvolume, subvolume->fops->readlink, loc, size,
+                    xdata);
+    return 0;
+
+out:
+    /* Reached only when the call was not wound. */
+    STACK_UNWIND_STRICT(readlink, frame, op_ret, op_errno, NULL, NULL,
+                        NULL);
+    return 0;
+}
+
+/*
+ * access fop.
+ *
+ * Picks the subvolume recorded for loc->inode via SVC_GET_SUBVOL_FROM_CTX
+ * and winds the call to it. If an argument is invalid or the macro jumps
+ * to the out label, the call is unwound with the current op_ret/op_errno
+ * (initially -1/EINVAL).
+ */
+static int32_t
+gf_svc_access(call_frame_t *frame, xlator_t *this, loc_t *loc, int32_t mask,
+              dict_t *xdata)
+{
+    xlator_t *subvolume = NULL;
+    int inode_type = -1;
+    int op_errno = EINVAL;
+    int op_ret = -1;
+    int ret = -1;
+
+    GF_VALIDATE_OR_GOTO("svc", this, out);
+    GF_VALIDATE_OR_GOTO(this->name, frame, out);
+    GF_VALIDATE_OR_GOTO(this->name, loc, out);
+    GF_VALIDATE_OR_GOTO(this->name, loc->inode, out);
+
+    SVC_GET_SUBVOL_FROM_CTX(this, op_ret, op_errno, inode_type, ret, loc->inode,
+                            subvolume, out);
+
+    STACK_WIND_TAIL(frame, subvolume, subvolume->fops->access, loc, mask,
+                    xdata);
+    return 0;
+
+out:
+    /* Reached only when the call was not wound. */
+    SVC_STACK_UNWIND(access, frame, op_ret, op_errno, NULL);
+
+    return 0;
+}
+
+/*
+ * readdir callback.
+ *
+ * For a successful readdir answered by the first child, every dirent whose
+ * name equals the configured entry point is removed from the list before
+ * the result is unwound. Results from any other subvolume, failed
+ * readdirs, and the case where the entry point cannot be copied are
+ * unwound untouched.
+ */
+int32_t
+gf_svc_readdir_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+                   int32_t op_ret, int32_t op_errno, gf_dirent_t *entries,
+                   dict_t *xdata)
+{
+    svc_local_t *local = NULL;
+    gf_dirent_t *entry = NULL;
+    gf_dirent_t *tmpentry = NULL;
+    char entry_point[NAME_MAX + 1] = {0};
+
+    if (op_ret >= 0) {
+        local = frame->local;
+
+        /* If .snaps pre-exists, then it should not be listed
+         * in the NORMAL INODE directory when USS is enabled,
+         * so filter the .snaps entry if exists.
+         * However it is OK to list .snaps in VIRTUAL world
+         */
+        if (local->subvolume == FIRST_CHILD(this)) {
+            /*
+             * If the entry point cannot be copied, give up on the
+             * filtering: the entry point's dentry may then be sent in
+             * the response, but that avoids racing with reconfigure
+             * changing priv->path while it is being read here.
+             */
+            if (gf_svc_get_entry_point(this, entry_point,
+                                       sizeof(entry_point))) {
+                gf_smsg(this->name, GF_LOG_WARNING, op_errno,
+                        SVC_MSG_COPY_ENTRY_POINT_FAILED, NULL);
+            } else {
+                list_for_each_entry_safe(entry, tmpentry, &entries->list, list)
+                {
+                    if (!strcmp(entry_point, entry->d_name))
+                        gf_dirent_entry_free(entry);
+                }
+            }
+        }
+    }
+
+    SVC_STACK_UNWIND(readdir, frame, op_ret, op_errno, entries, xdata);
+    return 0;
+}
+
+/*
+ * readdir fop.
+ *
+ * If the fd context records that the entry point was already handled and
+ * the requested offset equals the remembered last offset, the call is
+ * unwound at once with an empty list (op_ret 0, ENOENT). Otherwise the
+ * subvolume for fd->inode is selected, remembered in a freshly allocated
+ * local, and the readdir is wound to it with gf_svc_readdir_cbk.
+ *
+ * Failures unwind with op_ret -1: EINVAL for invalid arguments or a
+ * missing inode context, ENOMEM when the local cannot be allocated.
+ */
+static int32_t
+gf_svc_readdir(call_frame_t *frame, xlator_t *this, fd_t *fd, size_t size,
+               off_t off, dict_t *xdata)
+{
+    int inode_type = -1;
+    xlator_t *subvolume = NULL;
+    svc_local_t *local = NULL;
+    int ret = -1;
+    int op_ret = -1;
+    int op_errno = EINVAL;
+    gf_boolean_t wind = _gf_false;
+    svc_fd_t *svc_fd = NULL;
+    gf_dirent_t entries;
+
+    /* Empty list handed to the unwind when the call is not wound. */
+    INIT_LIST_HEAD(&entries.list);
+
+    GF_VALIDATE_OR_GOTO("svc", this, out);
+    GF_VALIDATE_OR_GOTO(this->name, frame, out);
+    GF_VALIDATE_OR_GOTO(this->name, fd, out);
+    GF_VALIDATE_OR_GOTO(this->name, fd->inode, out);
+
+    svc_fd = svc_fd_ctx_get_or_new(this, fd);
+    if (!svc_fd)
+        gf_smsg(this->name, GF_LOG_ERROR, 0, SVC_MSG_GET_FD_CONTEXT_FAILED,
+                "gfid=%s", uuid_utoa(fd->inode->gfid), NULL);
+    else {
+        /* The entry point was already returned for this offset:
+         * signal end of directory. */
+        if (svc_fd->entry_point_handled && off == svc_fd->last_offset) {
+            op_ret = 0;
+            op_errno = ENOENT;
+            goto out;
+        }
+    }
+
+    SVC_GET_SUBVOL_FROM_CTX(this, op_ret, op_errno, inode_type, ret, fd->inode,
+                            subvolume, out);
+
+    local = mem_get0(this->local_pool);
+    if (!local) {
+        /* Report the allocation failure as such instead of the
+         * default EINVAL. */
+        op_ret = -1;
+        op_errno = ENOMEM;
+        gf_smsg(this->name, GF_LOG_ERROR, op_errno, SVC_MSG_NO_MEMORY,
+                "inode-gfid=%s", uuid_utoa(fd->inode->gfid), NULL);
+        goto out;
+    }
+    local->subvolume = subvolume;
+    frame->local = local;
+
+    STACK_WIND(frame, gf_svc_readdir_cbk, subvolume, subvolume->fops->readdir,
+               fd, size, off, xdata);
+
+    wind = _gf_true;
+
+out:
+    if (!wind)
+        SVC_STACK_UNWIND(readdir, frame, op_ret, op_errno, &entries, NULL);
+
+    gf_dirent_free(&entries);
+
+    return 0;
+}
+
+/*
+ * This lookup is mainly for supporting USS for windows.
+ * Since the dentry for the entry-point directory is not sent in
+ * the readdir response, from windows explorer, there is no way
+ * to access the snapshots. If the explicit path of the entry-point
+ * directory is mentioned in the address bar, then windows sends
+ * readdir on the parent directory and compares if the entry point
+ * directory's name is there in readdir response. If it is not there
+ * then access to snapshot world is denied. And windows users cannot
+ * access snapshots via samba.
+ * So, to handle this a new option called special-directory is created,
+ * which if set, snapview-client will send the entry-point's dentry
+ * in readdirp o/p for the special directory, so that it will be
+ * visible from windows explorer.
+ * But to send that virtual entry, the following mechanism is used.
+ * 1) Check if readdir from posix is over.
+ * 2) If so, then send a lookup on entry point directory to snap daemon
+ * (this is needed because in readdirp inodes are linked, so we need to
+ * maintain 1:1 mapping between inodes (gfids) from snapview server to
+ * snapview client).
+ * 3) Once successful lookup response received, send a new entry to
+ * windows.
+ */
+
+/*
+ * Callback of the lookup sent on the entry-point directory while finishing
+ * a readdirp on the special directory.
+ *
+ * On a successful lookup a single dirent named after the entry point is
+ * built from the looked-up inode and iatt, its inode is tagged
+ * VIRTUAL_INODE, and the pending readdirp is unwound with that one entry
+ * (op_ret 1). The offset given to the entry and the fact that the entry
+ * point was handled are remembered in the fd context.
+ *
+ * On a failed lookup with ESTALE one revalidation lookup is attempted;
+ * if that is wound successfully this callback runs again later. Every
+ * other failure unwinds the readdirp with op_ret 0 and ENOENT (ENOMEM if
+ * the dirent cannot be allocated), i.e. without the extra entry.
+ */
+static int32_t
+gf_svc_readdirp_lookup_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+                           int32_t op_ret, int32_t op_errno, inode_t *inode,
+                           struct iatt *buf, dict_t *xdata,
+                           struct iatt *postparent)
+{
+    gf_dirent_t entries;
+    gf_dirent_t *entry = NULL;
+    svc_fd_t *svc_fd = NULL;
+    svc_local_t *local = NULL;
+    int inode_type = -1;
+    int ret = -1;
+    char entry_point[NAME_MAX + 1] = {
+        0,
+    };
+
+    /*
+     * Initialise the list before anything can jump to "out": the unwind
+     * and gf_dirent_free() there both use it.
+     */
+    INIT_LIST_HEAD(&entries.list);
+
+    GF_VALIDATE_OR_GOTO("snapview-client", this, out);
+
+    local = frame->local;
+
+    if (op_ret) {
+        /* Retry once with a fresh inode when the lookup was stale. */
+        if (op_errno == ESTALE && !local->revalidate) {
+            local->revalidate = 1;
+            ret = gf_svc_special_dir_revalidate_lookup(frame, this, xdata);
+
+            if (!ret)
+                return 0;
+        }
+        op_ret = 0;
+        op_errno = ENOENT;
+        goto out;
+    }
+
+    svc_fd = svc_fd_ctx_get(this, local->fd);
+    if (!svc_fd) {
+        gf_smsg(this->name, GF_LOG_ERROR, 0, SVC_MSG_GET_FD_CONTEXT_FAILED,
+                "gfid=%s", uuid_utoa(local->fd->inode->gfid), NULL);
+        op_ret = 0;
+        op_errno = ENOENT;
+        goto out;
+    }
+
+    if (gf_svc_get_entry_point(this, entry_point, sizeof(entry_point))) {
+        gf_smsg(this->name, GF_LOG_WARNING, 0, SVC_MSG_COPY_ENTRY_POINT_FAILED,
+                NULL);
+        op_ret = 0;
+        op_errno = ENOENT;
+        goto out;
+    }
+
+    entry = gf_dirent_for_name(entry_point);
+    if (!entry) {
+        gf_smsg(this->name, GF_LOG_ERROR, 0, SVC_MSG_NO_MEMORY,
+                "entry-point=%s", entry_point, NULL);
+        op_ret = 0;
+        op_errno = ENOMEM;
+        goto out;
+    }
+
+    entry->inode = inode_ref(inode);
+    /* NOTE(review): the entry is placed 22 past the last offset seen;
+     * the reason for this particular constant is not visible here. */
+    entry->d_off = svc_fd->last_offset + 22;
+    entry->d_ino = buf->ia_ino;
+    entry->d_type = DT_DIR;
+    entry->d_stat = *buf;
+    inode_type = VIRTUAL_INODE;
+    ret = svc_inode_ctx_set(this, entry->inode, inode_type);
+    if (ret)
+        gf_smsg(this->name, GF_LOG_ERROR, 0, SVC_MSG_SET_INODE_CONTEXT_FAILED,
+                "entry-name=%s", entry->d_name, NULL);
+
+    list_add_tail(&entry->list, &entries.list);
+    op_ret = 1;
+    svc_fd->last_offset = entry->d_off;
+    svc_fd->entry_point_handled = _gf_true;
+
+out:
+    SVC_STACK_UNWIND(readdirp, frame, op_ret, op_errno, &entries,
+                     local ? local->xdata : NULL);
+
+    gf_dirent_free(&entries);
+
+    return 0;
+}
+
+/*
+ * Re-send the entry-point lookup to the second child with a fresh inode.
+ *
+ * The xdata kept in the frame's local is replaced by the given one, the
+ * inode of local->loc is dropped and replaced by a new inode from the
+ * parent's table, and loc's path/name are rebuilt from the parent and the
+ * configured entry point. The lookup is then wound with
+ * gf_svc_readdirp_lookup_cbk as callback and an "entry-point" key in xdata.
+ *
+ * Returns 0 when the lookup was wound; a non-zero value when it was not
+ * (the caller is then still responsible for unwinding the frame).
+ */
+int
+gf_svc_special_dir_revalidate_lookup(call_frame_t *frame, xlator_t *this,
+                                     dict_t *xdata)
+{
+    svc_local_t *local = NULL;
+    loc_t *loc = NULL;
+    dict_t *tmp_xdata = NULL;
+    char *path = NULL;
+    char *new_path = NULL;
+    int ret = -1;
+    char entry_point[NAME_MAX + 1] = {
+        0,
+    };
+
+    GF_VALIDATE_OR_GOTO("snapview-client", this, out);
+
+    local = frame->local;
+    loc = &local->loc;
+
+    if (local->xdata) {
+        dict_unref(local->xdata);
+        local->xdata = NULL;
+    }
+
+    if (xdata)
+        local->xdata = dict_ref(xdata);
+
+    inode_unref(loc->inode);
+    loc->inode = inode_new(loc->parent->table);
+    if (!loc->inode) {
+        gf_smsg(this->name, GF_LOG_ERROR, ENOMEM, SVC_MSG_ALLOC_INODE_FAILED,
+                NULL);
+        goto out;
+    }
+
+    if (gf_svc_get_entry_point(this, entry_point, sizeof(entry_point))) {
+        gf_smsg(this->name, GF_LOG_WARNING, 0, SVC_MSG_COPY_ENTRY_POINT_FAILED,
+                NULL);
+        goto out;
+    }
+
+    gf_uuid_copy(local->loc.gfid, loc->inode->gfid);
+    ret = inode_path(loc->parent, entry_point, &path);
+    if (ret < 0)
+        goto out;
+
+    /* Do not wind a lookup without a path: fail if it cannot be copied,
+     * leaving the previous path in place. */
+    new_path = gf_strdup(path);
+    if (!new_path) {
+        ret = -1;
+        goto out;
+    }
+
+    /*
+     * loc->name points into loc->path (it is derived from it with
+     * strrchr), so it must be dropped before the old path is freed and
+     * recomputed from the new one; otherwise it would dangle.
+     */
+    loc->name = NULL;
+    GF_FREE((char *)loc->path);
+    loc->path = new_path;
+    loc->name = strrchr(loc->path, '/');
+    if (loc->name)
+        loc->name++;
+
+    tmp_xdata = dict_new();
+    if (!tmp_xdata) {
+        ret = -1;
+        goto out;
+    }
+
+    ret = dict_set_str(tmp_xdata, "entry-point", "true");
+    if (ret) {
+        gf_smsg(this->name, GF_LOG_ERROR, 0, SVC_MSG_DICT_SET_FAILED, NULL);
+        goto out;
+    }
+
+    STACK_WIND(frame, gf_svc_readdirp_lookup_cbk, SECOND_CHILD(this),
+               SECOND_CHILD(this)->fops->lookup, loc, tmp_xdata);
+out:
+    if (tmp_xdata)
+        dict_unref(tmp_xdata);
+
+    GF_FREE(path);
+    return ret;
+}
+
+/*
+ * Decide whether a finished readdirp must be followed by a lookup of the
+ * entry point so that its dentry can be appended to the listing.
+ *
+ * The lookup is sent to the second child only when all of these hold:
+ * show_entry_point is set, the readdirp result signals end of directory
+ * (op_ret 0 with ENOENT), a non-empty special_dir is configured, the fd
+ * context is flagged special_dir, and the readdirp was served by the first
+ * child. local->loc is filled in for that lookup and the readdirp cookie
+ * and xdata are stashed in local.
+ *
+ * Returns _gf_false when the lookup was wound (the frame is then unwound
+ * from gf_svc_readdirp_lookup_cbk) and _gf_true when the caller must
+ * unwind the readdirp itself.
+ */
+static gf_boolean_t
+gf_svc_readdir_on_special_dir(call_frame_t *frame, void *cookie, xlator_t *this,
+                              int32_t op_ret, int32_t op_errno,
+                              gf_dirent_t *entries, dict_t *xdata)
+{
+    svc_local_t *local = NULL;
+    svc_private_t *priv = NULL;
+    inode_t *inode = NULL;
+    fd_t *fd = NULL;
+    char *path = NULL;
+    loc_t *loc = NULL;
+    dict_t *tmp_xdata = NULL;
+    int ret = -1;
+    gf_boolean_t unwind = _gf_true;
+    svc_fd_t *svc_fd = NULL;
+    char entry_point[NAME_MAX + 1] = {
+        0,
+    };
+
+    GF_VALIDATE_OR_GOTO("snapview-client", this, out);
+    GF_VALIDATE_OR_GOTO(this->name, this->private, out);
+
+    priv = this->private;
+    local = frame->local;
+
+    loc = &local->loc;
+    fd = local->fd;
+    svc_fd = svc_fd_ctx_get(this, fd);
+    if (!svc_fd) {
+        gf_smsg(this->name, GF_LOG_ERROR, 0, SVC_MSG_GET_FD_CONTEXT_FAILED,
+                "gfid=%s", uuid_utoa(fd->inode->gfid), NULL);
+        goto out;
+    }
+
+    /*
+     * check if its end of readdir operation from posix, if special_dir
+     * option is set, if readdir is done on special directory and if
+     * readdirp is from normal regular graph.
+     */
+
+    if (!priv->show_entry_point)
+        goto out;
+
+    if (op_ret == 0 && op_errno == ENOENT && priv->special_dir &&
+        strcmp(priv->special_dir, "") && svc_fd->special_dir &&
+        local->subvolume == FIRST_CHILD(this)) {
+        if (gf_svc_get_entry_point(this, entry_point, sizeof(entry_point))) {
+            gf_smsg(this->name, GF_LOG_WARNING, 0,
+                    SVC_MSG_COPY_ENTRY_POINT_FAILED, NULL);
+            goto out;
+        }
+
+        /* Reuse the entry point's inode if it is already known under
+         * this directory, otherwise start from a new one. */
+        inode = inode_grep(fd->inode->table, fd->inode, entry_point);
+        if (!inode) {
+            inode = inode_new(fd->inode->table);
+            if (!inode) {
+                gf_smsg(this->name, GF_LOG_ERROR, 0, SVC_MSG_ALLOC_INODE_FAILED,
+                        NULL);
+                goto out;
+            }
+        }
+
+        gf_uuid_copy(local->loc.pargfid, fd->inode->gfid);
+        gf_uuid_copy(local->loc.gfid, inode->gfid);
+        if (gf_uuid_is_null(inode->gfid))
+            ret = inode_path(fd->inode, entry_point, &path);
+        else
+            ret = inode_path(inode, NULL, &path);
+
+        if (ret < 0) {
+            /* The inode reference has not been handed to loc yet,
+             * so it has to be released here. */
+            inode_unref(inode);
+            goto out;
+        }
+
+        loc->path = gf_strdup(path);
+        if (!loc->path) {
+            /* Do not wind a lookup without a path. */
+            inode_unref(inode);
+            goto out;
+        }
+        if (!loc->name || (loc->name && !strcmp(loc->name, ""))) {
+            loc->name = strrchr(loc->path, '/');
+            if (loc->name)
+                loc->name++;
+        }
+
+        /* From here on loc owns the inode reference. */
+        loc->inode = inode;
+        loc->parent = inode_ref(fd->inode);
+        tmp_xdata = dict_new();
+        if (!tmp_xdata)
+            goto out;
+        ret = dict_set_str(tmp_xdata, "entry-point", "true");
+        if (ret) {
+            gf_smsg(this->name, GF_LOG_ERROR, 0, SVC_MSG_DICT_SET_FAILED, NULL);
+            goto out;
+        }
+
+        local->cookie = cookie;
+        if (local->xdata) {
+            dict_unref(local->xdata);
+            local->xdata = NULL;
+        }
+        if (xdata)
+            local->xdata = dict_ref(xdata);
+
+        STACK_WIND(frame, gf_svc_readdirp_lookup_cbk, SECOND_CHILD(this),
+                   SECOND_CHILD(this)->fops->lookup, loc, tmp_xdata);
+        unwind = _gf_false;
+    }
+
+out:
+    if (tmp_xdata)
+        dict_unref(tmp_xdata);
+
+    GF_FREE(path);
+    return unwind;
+}
+
+/*
+ * readdirp callback.
+ *
+ * For a successful readdirp, every returned entry that carries an inode is
+ * tagged with the inode type of the subvolume that served the call
+ * (NORMAL_INODE for the first child, VIRTUAL_INODE otherwise), and the
+ * offset of each such entry is remembered in the fd context. When the
+ * first child served the call, dirents named like the entry point are
+ * dropped. Finally gf_svc_readdir_on_special_dir() decides whether the
+ * result is unwound here or after an additional entry-point lookup.
+ */
+static int32_t
+gf_svc_readdirp_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+                    int32_t op_ret, int32_t op_errno, gf_dirent_t *entries,
+                    dict_t *xdata)
+{
+    svc_local_t *local = NULL;
+    svc_fd_t *svc_fd = NULL;
+    gf_dirent_t *entry = NULL;
+    gf_dirent_t *tmpentry = NULL;
+    gf_boolean_t unwind = _gf_true;
+    int inode_type = -1;
+    int ret = -1;
+    char entry_point[NAME_MAX + 1] = {0};
+
+    if (op_ret < 0)
+        goto out;
+
+    GF_VALIDATE_OR_GOTO("snapview-client", this, out);
+
+    local = frame->local;
+
+    /* A missing fd context is tolerated: only the offset bookkeeping
+     * below is skipped. */
+    svc_fd = svc_fd_ctx_get(this, local->fd);
+    if (!svc_fd) {
+        gf_smsg(this->name, GF_LOG_WARNING, 0, SVC_MSG_GET_FD_CONTEXT_FAILED,
+                "gfid=%s", uuid_utoa(local->fd->inode->gfid), NULL);
+    }
+
+    inode_type = (local->subvolume == FIRST_CHILD(this)) ? NORMAL_INODE
+                                                         : VIRTUAL_INODE;
+
+    /*
+     * Better to goto out and return whatever is there in the
+     * readdirp response (even if the readdir response contains
+     * a directory entry for the snapshot entry point). Otherwise
+     * if we ignore the error, then there is a chance of race
+     * condition where, priv->path is changed in reconfigure
+     */
+    if (gf_svc_get_entry_point(this, entry_point, sizeof(entry_point))) {
+        gf_smsg(this->name, GF_LOG_WARNING, 0, SVC_MSG_COPY_ENTRY_POINT_FAILED,
+                NULL);
+        goto out;
+    }
+
+    list_for_each_entry_safe(entry, tmpentry, &entries->list, list)
+    {
+        /* If .snaps pre-exists, then it should not be listed
+         * in the NORMAL INODE directory when USS is enabled,
+         * so filter the .snaps entry if exists.
+         * However it is OK to list .snaps in VIRTUAL world
+         */
+        if (inode_type == NORMAL_INODE &&
+            strcmp(entry_point, entry->d_name) == 0) {
+            gf_dirent_entry_free(entry);
+            continue;
+        }
+
+        if (entry->inode) {
+            ret = svc_inode_ctx_set(this, entry->inode, inode_type);
+            if (ret)
+                gf_smsg(this->name, GF_LOG_ERROR, 0,
+                        SVC_MSG_SET_INODE_CONTEXT_FAILED, NULL);
+            if (svc_fd)
+                svc_fd->last_offset = entry->d_off;
+        }
+    }
+
+    unwind = gf_svc_readdir_on_special_dir(frame, cookie, this, op_ret,
+                                           op_errno, entries, xdata);
+
+out:
+    if (unwind)
+        SVC_STACK_UNWIND(readdirp, frame, op_ret, op_errno, entries, xdata);
+
+    return 0;
+}
+
+/*
+ * readdirp fop.
+ *
+ * If the fd context records that the entry point was already handled and
+ * the requested offset equals the remembered last offset, the call is
+ * unwound at once with an empty list (op_ret 0, ENOENT). Otherwise the
+ * subvolume for fd->inode is selected, remembered together with a
+ * reference on the fd in a freshly allocated local, and the readdirp is
+ * wound to it with gf_svc_readdirp_cbk.
+ *
+ * Failures unwind with op_ret -1: EINVAL for invalid arguments or a
+ * missing inode context, ENOMEM when the local cannot be allocated.
+ */
+static int32_t
+gf_svc_readdirp(call_frame_t *frame, xlator_t *this, fd_t *fd, size_t size,
+                off_t off, dict_t *xdata)
+{
+    int inode_type = -1;
+    xlator_t *subvolume = NULL;
+    svc_local_t *local = NULL;
+    int ret = -1;
+    int op_ret = -1;
+    int op_errno = EINVAL;
+    gf_boolean_t wind = _gf_false;
+    svc_fd_t *svc_fd = NULL;
+    gf_dirent_t entries;
+
+    /* Empty list handed to the unwind when the call is not wound. */
+    INIT_LIST_HEAD(&entries.list);
+
+    GF_VALIDATE_OR_GOTO("svc", this, out);
+    GF_VALIDATE_OR_GOTO(this->name, frame, out);
+    GF_VALIDATE_OR_GOTO(this->name, fd, out);
+    GF_VALIDATE_OR_GOTO(this->name, fd->inode, out);
+
+    /*
+     * This is mainly for samba shares (or windows clients). As part of
+     * readdirp on the directory used as samba share, the entry point
+     * directory would have been added at the end. So when a new readdirp
+     * request comes, we have to check if the entry point has been handled
+     * or not in readdirp. That information and the offset used for it
+     * is remembered in fd context. If it has been handled, then simply
+     * unwind indicating end of readdir operation.
+     */
+    svc_fd = svc_fd_ctx_get_or_new(this, fd);
+    if (!svc_fd)
+        gf_smsg(this->name, GF_LOG_ERROR, 0, SVC_MSG_GET_FD_CONTEXT_FAILED,
+                "gfid=%s", uuid_utoa(fd->inode->gfid), NULL);
+    else {
+        if (svc_fd->entry_point_handled && off == svc_fd->last_offset) {
+            op_ret = 0;
+            op_errno = ENOENT;
+            goto out;
+        }
+    }
+
+    SVC_GET_SUBVOL_FROM_CTX(this, op_ret, op_errno, inode_type, ret, fd->inode,
+                            subvolume, out);
+
+    /*
+     * Allocate the local only once no early exit is left and attach it
+     * to the frame right away, so that it cannot be leaked by a path
+     * that bails out before the wind.
+     */
+    local = mem_get0(this->local_pool);
+    if (!local) {
+        op_ret = -1;
+        op_errno = ENOMEM;
+        gf_smsg(this->name, GF_LOG_ERROR, op_errno, SVC_MSG_NO_MEMORY, NULL);
+        goto out;
+    }
+    local->subvolume = subvolume;
+    local->fd = fd_ref(fd);
+    frame->local = local;
+
+    STACK_WIND(frame, gf_svc_readdirp_cbk, subvolume, subvolume->fops->readdirp,
+               fd, size, off, xdata);
+
+    wind = _gf_true;
+
+out:
+    if (!wind)
+        SVC_STACK_UNWIND(readdirp, frame, op_ret, op_errno, &entries, NULL);
+
+    gf_dirent_free(&entries);
+
+    return 0;
+}
+
+/*
+ * rename fop.
+ *
+ * Snapshots are read-only, so a rename is refused with EROFS when the source
+ * is a virtual (snapshot) inode, when an existing destination inode is
+ * virtual, or - if no destination type could be determined - when the
+ * destination's parent is virtual. Anything else is passed on to the first
+ * child.
+ */
+static int32_t
+gf_svc_rename(call_frame_t *frame, xlator_t *this, loc_t *oldloc, loc_t *newloc,
+              dict_t *xdata)
+{
+    int32_t op_ret = -1;
+    int32_t op_errno = 0;
+    int32_t rc = -1;
+    int from_type = -1;
+    int to_type = -1;
+    int to_parent_type = -1;
+    gf_boolean_t to_snapshot = _gf_false;
+    gf_boolean_t wound = _gf_false;
+
+    GF_VALIDATE_OR_GOTO("svc", this, out);
+    GF_VALIDATE_OR_GOTO(this->name, frame, out);
+    GF_VALIDATE_OR_GOTO(this->name, oldloc, out);
+    GF_VALIDATE_OR_GOTO(this->name, oldloc->inode, out);
+    GF_VALIDATE_OR_GOTO(this->name, newloc, out);
+
+    /* The source must have a known type; without it nothing can be decided. */
+    rc = svc_inode_ctx_get(this, oldloc->inode, &from_type);
+    if (rc < 0) {
+        op_errno = EINVAL;
+        gf_smsg(this->name, GF_LOG_ERROR, op_errno,
+                SVC_MSG_GET_INODE_CONTEXT_FAILED, "gfid=%s",
+                uuid_utoa(oldloc->inode->gfid), NULL);
+        goto out;
+    }
+
+    if (from_type == VIRTUAL_INODE) {
+        op_errno = EROFS;
+        gf_smsg(this->name, GF_LOG_ERROR, op_errno,
+                SVC_MSG_RENAME_SNAPSHOT_ENTRY, "name=%s", oldloc->name, NULL);
+        goto out;
+    }
+
+    /* First choice: judge the destination by its own inode, if it has one. */
+    if (newloc->inode) {
+        rc = svc_inode_ctx_get(this, newloc->inode, &to_type);
+        to_snapshot = (rc == 0 && to_type == VIRTUAL_INODE) ? _gf_true
+                                                            : _gf_false;
+    }
+
+    /* Fallback: no destination type available, so judge by its parent. */
+    if (!to_snapshot && to_type < 0) {
+        rc = svc_inode_ctx_get(this, newloc->parent, &to_parent_type);
+        to_snapshot = (rc == 0 && to_parent_type == VIRTUAL_INODE) ? _gf_true
+                                                                   : _gf_false;
+    }
+
+    if (to_snapshot) {
+        op_errno = EROFS;
+        gf_smsg(this->name, GF_LOG_ERROR, op_errno,
+                SVC_MSG_RENAME_SNAPSHOT_ENTRY, "oldloc-name=%s", oldloc->name,
+                "newloc-name=%s", newloc->name, NULL);
+        goto out;
+    }
+
+    STACK_WIND_TAIL(frame, FIRST_CHILD(this), FIRST_CHILD(this)->fops->rename,
+                    oldloc, newloc, xdata);
+
+    wound = _gf_true;
+
+out:
+    if (!wound)
+        SVC_STACK_UNWIND(rename, frame, op_ret, op_errno, NULL, NULL, NULL,
+                         NULL, NULL, NULL);
+    return 0;
+}
+
+/*
+ * link fop.
+ *
+ * A hardlink from or into the snapshot world would amount to a hardlink
+ * across different filesystems, so it is refused with EROFS when either the
+ * source inode or the destination's parent is a virtual inode. A failed
+ * context lookup is not treated as an error here; the call is then passed on
+ * to the first child.
+ */
+static int32_t
+gf_svc_link(call_frame_t *frame, xlator_t *this, loc_t *oldloc, loc_t *newloc,
+            dict_t *xdata)
+{
+    int32_t op_ret = -1;
+    int32_t op_errno = 0;
+    int32_t rc = -1;
+    int from_type = -1;
+    int to_parent_type = -1;
+    gf_boolean_t wound = _gf_false;
+
+    GF_VALIDATE_OR_GOTO("svc", this, out);
+    GF_VALIDATE_OR_GOTO(this->name, frame, out);
+    GF_VALIDATE_OR_GOTO(this->name, oldloc, out);
+    GF_VALIDATE_OR_GOTO(this->name, oldloc->inode, out);
+    GF_VALIDATE_OR_GOTO(this->name, newloc, out);
+
+    rc = svc_inode_ctx_get(this, oldloc->inode, &from_type);
+    if (rc == 0 && from_type == VIRTUAL_INODE) {
+        op_errno = EROFS;
+        gf_smsg(this->name, GF_LOG_ERROR, op_errno, SVC_MSG_LINK_SNAPSHOT_ENTRY,
+                "oldloc-name=%s", oldloc->name, NULL);
+        goto out;
+    }
+
+    rc = svc_inode_ctx_get(this, newloc->parent, &to_parent_type);
+    if (rc == 0 && to_parent_type == VIRTUAL_INODE) {
+        op_errno = EROFS;
+        gf_smsg(this->name, GF_LOG_ERROR, op_errno, SVC_MSG_LINK_SNAPSHOT_ENTRY,
+                "oldloc-name=%s", oldloc->name, "newloc-name=%s", newloc->name,
+                NULL);
+        goto out;
+    }
+
+    STACK_WIND_TAIL(frame, FIRST_CHILD(this), FIRST_CHILD(this)->fops->link,
+                    oldloc, newloc, xdata);
+
+    wound = _gf_true;
+
+out:
+    if (!wound)
+        SVC_STACK_UNWIND(link, frame, op_ret, op_errno, NULL, NULL, NULL, NULL,
+                         NULL);
+    return 0;
+}
+
+/*
+ * removexattr fop.
+ *
+ * Passed to the first child only for NORMAL_INODE; any other inode type is
+ * answered with EROFS. A missing inode context yields EINVAL.
+ */
+static int32_t
+gf_svc_removexattr(call_frame_t *frame, xlator_t *this, loc_t *loc,
+                   const char *name, dict_t *xdata)
+{
+    int op_ret = -1;
+    int op_errno = EINVAL;
+    int rc = -1;
+    int kind = -1;
+    gf_boolean_t wound = _gf_false;
+
+    GF_VALIDATE_OR_GOTO("svc", this, out);
+    GF_VALIDATE_OR_GOTO(this->name, frame, out);
+    GF_VALIDATE_OR_GOTO(this->name, loc, out);
+    GF_VALIDATE_OR_GOTO(this->name, loc->inode, out);
+
+    rc = svc_inode_ctx_get(this, loc->inode, &kind);
+    if (rc < 0) {
+        gf_smsg(this->name, GF_LOG_ERROR, op_errno,
+                SVC_MSG_GET_INODE_CONTEXT_FAILED, "path=%s", loc->path,
+                "gfid=%s", uuid_utoa(loc->inode->gfid), NULL);
+        goto out;
+    }
+
+    /* Only the regular volume accepts modifications. */
+    if (kind != NORMAL_INODE) {
+        op_errno = EROFS;
+        goto out;
+    }
+
+    STACK_WIND_TAIL(frame, FIRST_CHILD(this),
+                    FIRST_CHILD(this)->fops->removexattr, loc, name, xdata);
+
+    wound = _gf_true;
+
+out:
+    if (!wound)
+        SVC_STACK_UNWIND(removexattr, frame, op_ret, op_errno, NULL);
+
+    return 0;
+}
+
+/*
+ * fsync fop.
+ *
+ * Passed to the first child only for NORMAL_INODE; any other inode type is
+ * answered with EROFS. A missing inode context yields EINVAL.
+ */
+static int
+gf_svc_fsync(call_frame_t *frame, xlator_t *this, fd_t *fd, int datasync,
+             dict_t *xdata)
+{
+    int op_ret = -1;
+    int op_errno = EINVAL;
+    int rc = -1;
+    int kind = -1;
+    gf_boolean_t wound = _gf_false;
+
+    GF_VALIDATE_OR_GOTO("svc", this, out);
+    GF_VALIDATE_OR_GOTO(this->name, frame, out);
+    GF_VALIDATE_OR_GOTO(this->name, fd, out);
+    GF_VALIDATE_OR_GOTO(this->name, fd->inode, out);
+
+    rc = svc_inode_ctx_get(this, fd->inode, &kind);
+    if (rc < 0) {
+        gf_smsg(this->name, GF_LOG_ERROR, op_errno,
+                SVC_MSG_GET_INODE_CONTEXT_FAILED, "gfid=%s",
+                uuid_utoa(fd->inode->gfid), NULL);
+        goto out;
+    }
+
+    /* Only the regular volume accepts fsync. */
+    if (kind != NORMAL_INODE) {
+        op_errno = EROFS;
+        goto out;
+    }
+
+    STACK_WIND_TAIL(frame, FIRST_CHILD(this), FIRST_CHILD(this)->fops->fsync,
+                    fd, datasync, xdata);
+
+    wound = _gf_true;
+
+out:
+    if (!wound)
+        SVC_STACK_UNWIND(fsync, frame, op_ret, op_errno, NULL, NULL, NULL);
+
+    return 0;
+}
+
+/*
+ * flush fop: forwarded to whichever subvolume the inode context of fd->inode
+ * selects. If no context is found the call is unwound with the error set by
+ * SVC_GET_SUBVOL_FROM_CTX.
+ */
+static int32_t
+gf_svc_flush(call_frame_t *frame, xlator_t *this, fd_t *fd, dict_t *xdata)
+{
+    xlator_t *target = NULL;
+    gf_boolean_t wound = _gf_false;
+    int kind = -1;
+    int rc = -1;
+    int32_t op_errno = 0;
+    int32_t op_ret = -1;
+
+    GF_VALIDATE_OR_GOTO("svc", this, out);
+    GF_VALIDATE_OR_GOTO(this->name, frame, out);
+    GF_VALIDATE_OR_GOTO(this->name, fd, out);
+    GF_VALIDATE_OR_GOTO(this->name, fd->inode, out);
+
+    SVC_GET_SUBVOL_FROM_CTX(this, op_ret, op_errno, kind, rc, fd->inode, target,
+                            out);
+
+    STACK_WIND_TAIL(frame, target, target->fops->flush, fd, xdata);
+
+    wound = _gf_true;
+
+out:
+    if (!wound)
+        SVC_STACK_UNWIND(flush, frame, op_ret, op_errno, NULL);
+
+    return 0;
+}
+
+/*
+ * releasedir callback: detach this xlator's context from the directory fd
+ * and free it.
+ *
+ * fd_ctx_del() returns the stored context value through tmp_pfd. That value
+ * has to be converted back to the svc_fd_t pointer before freeing; freeing
+ * the still-NULL local pointer would leak the context of every released
+ * directory fd.
+ *
+ * Always returns 0.
+ */
+static int32_t
+gf_svc_releasedir(xlator_t *this, fd_t *fd)
+{
+    svc_fd_t *sfd = NULL;
+    uint64_t tmp_pfd = 0;
+    int ret = 0;
+
+    GF_VALIDATE_OR_GOTO("snapview-client", this, out);
+    GF_VALIDATE_OR_GOTO(this->name, fd, out);
+
+    ret = fd_ctx_del(fd, this, &tmp_pfd);
+    if (ret < 0) {
+        /* No context was ever attached to this fd; nothing to free. */
+        gf_msg_debug(this->name, 0, "pfd from fd=%p is NULL", fd);
+        goto out;
+    }
+
+    sfd = (svc_fd_t *)(long)tmp_pfd;
+    GF_FREE(sfd);
+
+out:
+    return 0;
+}
+
+/*
+ * forget callback: remove this xlator's context from the inode. The removed
+ * value is not used further. A failed removal is logged; the function
+ * returns 0 either way.
+ */
+static int32_t
+gf_svc_forget(xlator_t *this, inode_t *inode)
+{
+    uint64_t discarded = 0;
+
+    GF_VALIDATE_OR_GOTO("svc", this, out);
+    GF_VALIDATE_OR_GOTO(this->name, inode, out);
+
+    if (inode_ctx_del(inode, this, &discarded) != 0)
+        gf_smsg(this->name, GF_LOG_ERROR, 0,
+                SVC_MSG_DELETE_INODE_CONTEXT_FAILED, "gfid=%s",
+                uuid_utoa(inode->gfid), NULL);
+
+out:
+    return 0;
+}
+
+/*
+ * Release everything owned by the private structure: both strings, the
+ * lock, the structure itself and finally the xlator's local pool.
+ *
+ * Returns 0 on success, or -1 (after logging a warning) when priv is NULL.
+ */
+static int
+gf_svc_priv_destroy(xlator_t *this, svc_private_t *priv)
+{
+    if (priv == NULL) {
+        gf_smsg(this->name, GF_LOG_WARNING, 0, SVC_MSG_NULL_PRIV, NULL);
+        return -1;
+    }
+
+    GF_FREE(priv->path);
+    GF_FREE(priv->special_dir);
+
+    LOCK_DESTROY(&priv->lock);
+
+    GF_FREE(priv);
+
+    if (this->local_pool != NULL) {
+        mem_pool_destroy(this->local_pool);
+        this->local_pool = NULL;
+    }
+
+    return 0;
+}
+
+/**
+ * ** NOTE **:
+ * =============
+ * The option "snapdir-entry-path" is NOT reconfigurable.
+ * As of now that option exists only for samba, which needs to
+ * tell glusterfs which directory is shared with windows clients.
+ * In the windows-explorer (GUI) interface the entry point to the
+ * snapshot world (the snapshot-directory option) should be
+ * visible in the shared directory, at least as a hidden entry.
+ * For that to happen, glusterfs has to send that entry in the
+ * readdir response on the directory used as the smb share.
+ * Therefore samba, while initializing the gluster volume (via
+ * gfapi), sets the xlator option "snapdir-entry-path" to the
+ * directory which is to be shared with windows (check the file
+ * vfs_glusterfs.c in the samba source code). To avoid problems
+ * with smb access, snapdir-entry-path is not allowed to be
+ * reconfigured. That option is for those consumers who know
+ * what they are doing.
+ **/
+int
+reconfigure(xlator_t *this, dict_t *options)
+{
+    svc_private_t *priv = this->private;
+    char *path = NULL;
+    char *fresh = NULL;
+    gf_boolean_t show_entry_point = _gf_false;
+
+    GF_OPTION_RECONF("snapshot-directory", path, options, str, out);
+    if (!path || (strlen(path) > NAME_MAX) || path[0] != '.') {
+        gf_smsg(this->name, GF_LOG_ERROR, 0, SVC_MSG_INVALID_ENTRY_POINT,
+                "path=%s", path, NULL);
+        goto out;
+    }
+
+    GF_OPTION_RECONF("show-snapshot-directory", show_entry_point, options, bool,
+                     out);
+
+    /*
+     * priv->path is assumed to be allocated memory (set either in init or
+     * in a previous reconfigure). The old value must survive until the new
+     * one is safely stored, so:
+     *  - duplicate the new value first;
+     *  - only if that succeeds, free the old string and install the copy;
+     *  - if it fails, leave priv->path exactly as it was.
+     * WARNING: before changing how priv->path is allocated or freed, check
+     *          how init sets it. The string members of the snapview-client
+     *          private structure are expected to be heap allocated, either
+     *          in init or here in reconfigure.
+     */
+    LOCK(&priv->lock);
+    {
+        fresh = gf_strdup(path);
+        if (fresh) {
+            GF_FREE(priv->path);
+            priv->path = fresh;
+        } else {
+            gf_log(this->name, GF_LOG_ERROR,
+                   "failed to reconfigure snapshot-directory option to %s",
+                   path);
+        }
+
+        priv->show_entry_point = show_entry_point;
+    }
+    UNLOCK(&priv->lock);
+
+out:
+    return 0;
+}
+
+/*
+ * Set up memory accounting for this xlator with gf_svc_mt_end + 1 types.
+ * Returns -1 when this is NULL, otherwise whatever xlator_mem_acct_init()
+ * returned; a non-zero result is logged as a warning.
+ */
+int32_t
+mem_acct_init(xlator_t *this)
+{
+    int32_t rc = -1;
+
+    if (this == NULL)
+        return rc;
+
+    rc = xlator_mem_acct_init(this, gf_svc_mt_end + 1);
+    if (rc)
+        gf_smsg(this->name, GF_LOG_WARNING, 0, SVC_MSG_MEM_ACNT_FAILED, NULL);
+
+    return rc;
+}
+
+/*
+ * Translator initialisation.
+ *
+ * Requires exactly two children. Allocates the private structure, reads and
+ * validates the "snapshot-directory", "snapdir-entry-path" and
+ * "show-snapshot-directory" options, and creates the pool used for per-call
+ * locals.
+ *
+ * Returns 0 on success and -1 on any failure; on failure whatever was set up
+ * so far is released through gf_svc_priv_destroy().
+ */
+int32_t
+init(xlator_t *this)
+{
+    svc_private_t *private = NULL;
+    int ret = -1;
+    int children = 0;
+    xlator_list_t *xl = NULL;
+    char *path = NULL;
+    char *special_dir = NULL;
+
+    if (!this->children) {
+        gf_smsg(this->name, GF_LOG_ERROR, 0, SVC_MSG_NO_CHILD_FOR_XLATOR, NULL);
+        goto out;
+    }
+
+    xl = this->children;
+    while (xl) {
+        children++;
+        xl = xl->next;
+    }
+
+    if (children != 2) {
+        gf_smsg(this->name, GF_LOG_ERROR, 0, SVC_MSG_XLATOR_CHILDREN_WRONG,
+                "subvol-num=%d", children, NULL);
+        goto out;
+    }
+
+    /* This can be the top of graph in certain cases */
+    if (!this->parents) {
+        gf_msg_debug(this->name, 0,
+                     "dangling volume. Check "
+                     "volfile");
+    }
+
+    private = GF_CALLOC(1, sizeof(*private), gf_svc_mt_svc_private_t);
+    if (!private)
+        goto out;
+
+    LOCK_INIT(&private->lock);
+
+    /* The entry point must be a dot-name no longer than NAME_MAX. */
+    GF_OPTION_INIT("snapshot-directory", path, str, out);
+    if (!path || (strlen(path) > NAME_MAX) || path[0] != '.') {
+        gf_smsg(this->name, GF_LOG_ERROR, 0, SVC_MSG_INVALID_ENTRY_POINT,
+                "path=%s", path, NULL);
+        goto out;
+    }
+
+    private->path = gf_strdup(path);
+    if (!private->path) {
+        gf_smsg(this->name, GF_LOG_ERROR, 0, SVC_MSG_NO_MEMORY,
+                "entry-point-path=%s", path, NULL);
+        goto out;
+    }
+
+    /* The special directory must not contain the entry point name. */
+    GF_OPTION_INIT("snapdir-entry-path", special_dir, str, out);
+    if (!special_dir || strstr(special_dir, path)) {
+        if (special_dir)
+            /* The field list must end with NULL, like every other gf_smsg
+             * call in this file. */
+            gf_smsg(this->name, GF_LOG_ERROR, 0,
+                    SVC_MSG_ENTRY_POINT_SPECIAL_DIR, "path=%s", path,
+                    "special-dir=%s", special_dir, NULL);
+        else
+            gf_smsg(this->name, GF_LOG_ERROR, 0, SVC_MSG_NULL_SPECIAL_DIR,
+                    NULL);
+        goto out;
+    }
+
+    private->special_dir = gf_strdup(special_dir);
+    if (!private->special_dir) {
+        gf_smsg(this->name, GF_LOG_ERROR, 0, SVC_MSG_NO_MEMORY,
+                "special-directory=%s", special_dir, NULL);
+        goto out;
+    }
+
+    GF_OPTION_INIT("show-snapshot-directory", private->show_entry_point, bool,
+                   out);
+
+    this->local_pool = mem_pool_new(svc_local_t, 128);
+    if (!this->local_pool) {
+        gf_smsg(this->name, GF_LOG_ERROR, 0, SVC_MSG_MEM_POOL_GET_FAILED, NULL);
+        goto out;
+    }
+
+    this->private = private;
+
+    ret = 0;
+
+out:
+    if (ret)
+        (void)gf_svc_priv_destroy(this, private);
+
+    return ret;
+}
+
+/*
+ * Translator teardown: destroy the private structure, if there is one, and
+ * clear this->private. Does nothing when this or this->private is NULL.
+ */
+void
+fini(xlator_t *this)
+{
+    svc_private_t *priv = this ? this->private : NULL;
+
+    if (priv == NULL)
+        return;
+
+    /* A failed destroy is only logged; this->private is cleared regardless. */
+    if (gf_svc_priv_destroy(this, priv) != 0)
+        gf_smsg(this->name, GF_LOG_WARNING, 0, SVC_MSG_PRIV_DESTROY_FAILED,
+                NULL);
+
+    this->private = NULL;
+}
+
+/*
+ * Event notification handler.
+ *
+ * snapview-client has two subvolumes, and the snapd subvolume may come up
+ * while the regular one is still down. Propagating its CHILD_UP would tell
+ * the layers above (e.g. fuse) that the volume is usable while the data is
+ * in fact still unavailable. So events whose data is the second child are
+ * swallowed (0 is returned) and everything else goes to default_notify().
+ *
+ * TODO: In future, if required, notifications from the virtual subvolume
+ * may need to be handled as well.
+ */
+int
+notify(xlator_t *this, int event, void *data, ...)
+{
+    xlator_t *source = data;
+
+    if (source == SECOND_CHILD(this))
+        return 0;
+
+    return default_notify(this, event, data);
+}
+
+/* File operations handled by snapview-client. Each member points at the
+ * gf_svc_* handler for that fop. */
+struct xlator_fops fops = {
+    .lookup = gf_svc_lookup,
+    .opendir = gf_svc_opendir,
+    .stat = gf_svc_stat,
+    .fstat = gf_svc_fstat,
+    .statfs = gf_svc_statfs,
+    .rmdir = gf_svc_rmdir,
+    .rename = gf_svc_rename,
+    .mkdir = gf_svc_mkdir,
+    .open = gf_svc_open,
+    .unlink = gf_svc_unlink,
+    .setattr = gf_svc_setattr,
+    .getxattr = gf_svc_getxattr,
+    .setxattr = gf_svc_setxattr,
+    .fsetxattr = gf_svc_fsetxattr,
+    .readv = gf_svc_readv,
+    .readdir = gf_svc_readdir,
+    .readdirp = gf_svc_readdirp,
+    .create = gf_svc_create,
+    .readlink = gf_svc_readlink,
+    .mknod = gf_svc_mknod,
+    .symlink = gf_svc_symlink,
+    .flush = gf_svc_flush,
+    .link = gf_svc_link,
+    .access = gf_svc_access,
+    .removexattr = gf_svc_removexattr,
+    .fsync = gf_svc_fsync,
+};
+
+/* Lifecycle callbacks: gf_svc_forget drops the inode context and
+ * gf_svc_releasedir drops the context of a directory fd. */
+struct xlator_cbks cbks = {
+    .forget = gf_svc_forget,
+    .releasedir = gf_svc_releasedir,
+};
+
+/* Options understood by snapview-client. The table is terminated by an
+ * entry whose key is NULL. */
+struct volume_options options[] = {
+    {
+        /* Name of the entry point; init and reconfigure reject values that
+         * do not start with '.' or are longer than NAME_MAX. */
+        .key = {"snapshot-directory"},
+        .type = GF_OPTION_TYPE_STR,
+        .default_value = ".snaps",
+    },
+    {
+        /* Read in init only; reconfigure does not touch it. */
+        .key = {"snapdir-entry-path"},
+        .type = GF_OPTION_TYPE_STR,
+        .description = "An option to set the path of a directory on which "
+                       "when readdir comes, dentry for the snapshot-directory"
+                       " should be created and added in the readdir response",
+        .default_value = "",
+    },
+    {
+        .key = {"show-snapshot-directory"},
+        .type = GF_OPTION_TYPE_BOOL,
+        .description = "If this option is set, and the option "
+                       "\"snapdir-entry-path\" is set (which is set by samba "
+                       "vfs plugin for glusterfs, then send the entry point "
+                       "when readdir comes on the snapdir-entry-path",
+        .default_value = "off",
+    },
+    {.key = {NULL}},
+};
+
+/* Translator descriptor: ties together the entry points, the fop and
+ * callback tables and the option table defined in this file. */
+xlator_api_t xlator_api = {
+    .init = init,
+    .fini = fini,
+    .notify = notify,
+    .reconfigure = reconfigure,
+    .mem_acct_init = mem_acct_init,
+    .op_version = {1},
+    .fops = &fops,
+    .cbks = &cbks,
+    .options = options,
+    .identifier = "snapview-client",
+    .category = GF_MAINTAINED,
+};
diff --git a/xlators/features/snapview-client/src/snapview-client.h b/xlators/features/snapview-client/src/snapview-client.h
new file mode 100644
index 00000000000..166116a439d
--- /dev/null
+++ b/xlators/features/snapview-client/src/snapview-client.h
@@ -0,0 +1,101 @@
+/*
+  Copyright (c) 2014 Red Hat, Inc. <http://www.redhat.com>
+  This file is part of GlusterFS.
+
+  This file is licensed to you under your choice of the GNU Lesser
+  General Public License, version 3 or any later version (LGPLv3 or
+  later), or the GNU General Public License, version 2 (GPLv2), in all
+  cases as published by the Free Software Foundation.
+*/
+#ifndef __SNAP_VIEW_CLIENT_H__
+#define __SNAP_VIEW_CLIENT_H__
+
+#include <glusterfs/glusterfs.h>
+#include <glusterfs/logging.h>
+#include <glusterfs/dict.h>
+#include <glusterfs/xlator.h>
+#include <glusterfs/defaults.h>
+#include "snapview-client-mem-types.h"
+#include "snapview-client-messages.h"
+
+/* Per-call state of a snapview-client fop. It is stored in frame->local and
+ * released with svc_local_free() by SVC_STACK_UNWIND. */
+struct __svc_local {
+    loc_t loc;
+    xlator_t *subvolume; /* subvolume the call was wound to */
+    fd_t *fd;            /* fd of the call; readdirp stores an fd_ref() here */
+    void *cookie;
+    dict_t *xdata;
+    uint16_t revalidate; /* presumably a revalidate-lookup flag - TODO confirm */
+};
+typedef struct __svc_local svc_local_t;
+
+/* Unwind the given fop and release the frame's local. The local is detached
+ * from the frame (frame->local set to NULL) before STACK_UNWIND_STRICT runs
+ * and is passed to svc_local_free() afterwards. If frame is NULL, the local
+ * passed to svc_local_free() is NULL. */
+#define SVC_STACK_UNWIND(fop, frame, params...)                                \
+    do {                                                                       \
+        svc_local_t *__local = NULL;                                           \
+        if (frame) {                                                           \
+            __local = frame->local;                                            \
+            frame->local = NULL;                                               \
+        }                                                                      \
+        STACK_UNWIND_STRICT(fop, frame, params);                               \
+        svc_local_free(__local);                                               \
+    } while (0)
+
+/* Set the key "entry-point" to "true" in xdata. If xdata is NULL a new dict
+ * is created and assigned to both xdata and new_xdata. On any failure op_ret
+ * is set to -1, op_errno to ENOMEM, and control jumps to label.
+ *
+ * NOTE(review): the definition ends in "while (0);", so the expansion already
+ * carries a semicolon. A call site that adds its own gets an extra empty
+ * statement, which breaks when the macro is used directly before an "else". */
+#define SVC_ENTRY_POINT_SET(this, xdata, op_ret, op_errno, new_xdata, ret,     \
+                            label)                                             \
+    do {                                                                       \
+        if (!xdata) {                                                          \
+            xdata = new_xdata = dict_new();                                    \
+            if (!new_xdata) {                                                  \
+                gf_log(this->name, GF_LOG_ERROR,                               \
+                       "failed to allocate new dict");                         \
+                op_ret = -1;                                                   \
+                op_errno = ENOMEM;                                             \
+                goto label;                                                    \
+            }                                                                  \
+        }                                                                      \
+        ret = dict_set_str(xdata, "entry-point", "true");                      \
+        if (ret) {                                                             \
+            gf_log(this->name, GF_LOG_ERROR, "failed to set dict");            \
+            op_ret = -1;                                                       \
+            op_errno = ENOMEM;                                                 \
+            goto label;                                                        \
+        }                                                                      \
+    } while (0);
+
+/* Read the inode type of inode into inode_type and store the matching
+ * subvolume, as chosen by svc_get_subvolume(), in subvolume. If the inode
+ * context cannot be read (ret < 0), an error is logged, op_ret is set to -1,
+ * op_errno to EINVAL, and control jumps to label. op_ret and op_errno are
+ * left untouched on success.
+ *
+ * NOTE(review): the definition ends in "while (0);", so the expansion already
+ * carries a semicolon; see the effect on if/else call sites. */
+#define SVC_GET_SUBVOL_FROM_CTX(this, op_ret, op_errno, inode_type, ret,       \
+                                inode, subvolume, label)                       \
+    do {                                                                       \
+        ret = svc_inode_ctx_get(this, inode, &inode_type);                     \
+        if (ret < 0) {                                                         \
+            gf_log(this->name, GF_LOG_ERROR,                                   \
+                   "inode context not found for gfid %s",                      \
+                   uuid_utoa(inode->gfid));                                    \
+            op_ret = -1;                                                       \
+            op_errno = EINVAL;                                                 \
+            goto label;                                                        \
+        }                                                                      \
+                                                                               \
+        subvolume = svc_get_subvolume(this, inode_type);                       \
+    } while (0);
+
+/* Private data of a snapview-client instance, filled in from the volume
+ * options. */
+struct svc_private {
+    char *path;        /* value of the "snapshot-directory" option */
+    char *special_dir; /* needed for samba */
+    gf_boolean_t show_entry_point; /* "show-snapshot-directory" option */
+    gf_lock_t lock; /* mainly to guard private->path */
+};
+typedef struct svc_private svc_private_t;
+
+/* Per-fd context used by the readdir path to remember whether the entry
+ * point has already been returned and at which offset. */
+struct svc_fd {
+    off_t last_offset; /* d_off of the last entry seen by readdirp */
+    gf_boolean_t entry_point_handled; /* entry point already sent on this fd */
+    gf_boolean_t special_dir; /* presumably: fd is on the special dir - TODO confirm */
+};
+typedef struct svc_fd svc_fd_t;
+
+/* Which world an inode belongs to: the regular volume (NORMAL_INODE) or the
+ * snapshot view (VIRTUAL_INODE). Values start at 1. */
+typedef enum { NORMAL_INODE = 1, VIRTUAL_INODE } inode_type_t;
+
+int
+gf_svc_special_dir_revalidate_lookup(call_frame_t *frame, xlator_t *this,
+                                     dict_t *xdata);
+
+#endif /* __SNAP_VIEW_CLIENT_H__ */
diff --git a/xlators/features/snapview-server/Makefile.am b/xlators/features/snapview-server/Makefile.am
new file mode 100644
index 00000000000..af437a64d6d
--- /dev/null
+++ b/xlators/features/snapview-server/Makefile.am
@@ -0,0 +1 @@
+SUBDIRS = src
diff --git a/xlators/features/snapview-server/src/Makefile.am b/xlators/features/snapview-server/src/Makefile.am
new file mode 100644
index 00000000000..2935f138a4c
--- /dev/null
+++ b/xlators/features/snapview-server/src/Makefile.am
@@ -0,0 +1,25 @@
+if WITH_SERVER
+xlator_LTLIBRARIES = snapview-server.la
+endif
+xlatordir = $(libdir)/glusterfs/$(PACKAGE_VERSION)/xlator/features
+
+snapview_server_la_LDFLAGS = -module $(GF_XLATOR_DEFAULT_LDFLAGS)
+
+snapview_server_la_SOURCES = snapview-server.c snapview-server-mgmt.c \
+	snapview-server-helpers.c
+
+snapview_server_la_LIBADD = $(top_builddir)/libglusterfs/src/libglusterfs.la \
+	$(top_builddir)/api/src/libgfapi.la \
+	$(RLLIBS) $(top_builddir)/rpc/xdr/src/libgfxdr.la \
+	$(top_builddir)/rpc/rpc-lib/src/libgfrpc.la
+
+noinst_HEADERS = snapview-server.h snapview-server-mem-types.h snapview-server-messages.h
+
+AM_CPPFLAGS = $(GF_CPPFLAGS) -I$(top_srcdir)/libglusterfs/src \
+	-I$(top_srcdir)/api/src -I$(top_srcdir)/rpc/rpc-lib/src \
+	-I$(top_srcdir)/rpc/xdr/src -I$(top_builddir)/rpc/xdr/src \
+	-DDATADIR=\"$(localstatedir)\"
+
+AM_CFLAGS = -Wall $(GF_CFLAGS)
+
+CLEANFILES =
diff --git a/xlators/features/snapview-server/src/snapview-server-helpers.c b/xlators/features/snapview-server/src/snapview-server-helpers.c
new file mode 100644
index 00000000000..62c1ddac49c
--- /dev/null
+++ b/xlators/features/snapview-server/src/snapview-server-helpers.c
@@ -0,0 +1,715 @@
+/*
+   Copyright (c) 2014 Red Hat, Inc. <http://www.redhat.com>
+   This file is part of GlusterFS.
+
+   This file is licensed to you under your choice of the GNU Lesser
+   General Public License, version 3 or any later version (LGPLv3 or
+   later), or the GNU General Public License, version 2 (GPLv2), in all
+   cases as published by the Free Software Foundation.
+*/
+#include "snapview-server.h"
+#include "snapview-server-mem-types.h"
+
+#include <glusterfs/xlator.h>
+#include "rpc-clnt.h"
+#include "xdr-generic.h"
+#include "protocol-common.h"
+#include <pthread.h>
+
+/*
+ * Store svs_inode as this xlator's context on inode, using the unlocked
+ * __inode_ctx_set(). Returns its result, or -1 when an argument is NULL.
+ */
+int
+__svs_inode_ctx_set(xlator_t *this, inode_t *inode, svs_inode_t *svs_inode)
+{
+    int rc = -1;
+    uint64_t ctx_val = 0;
+
+    GF_VALIDATE_OR_GOTO("snapview-server", this, out);
+    GF_VALIDATE_OR_GOTO(this->name, inode, out);
+    GF_VALIDATE_OR_GOTO(this->name, svs_inode, out);
+
+    /* The context slot is a 64-bit integer; keep the pointer in it. */
+    ctx_val = (uint64_t)(long)svs_inode;
+    rc = __inode_ctx_set(inode, this, &ctx_val);
+
+out:
+    return rc;
+}
+
+/*
+ * Fetch this xlator's context from inode using the unlocked
+ * __inode_ctx_get(). Returns NULL when an argument is NULL or the lookup
+ * fails.
+ */
+svs_inode_t *
+__svs_inode_ctx_get(xlator_t *this, inode_t *inode)
+{
+    svs_inode_t *found = NULL;
+    uint64_t ctx_val = 0;
+
+    GF_VALIDATE_OR_GOTO("snapview-server", this, out);
+    GF_VALIDATE_OR_GOTO(this->name, inode, out);
+
+    if (__inode_ctx_get(inode, this, &ctx_val) == 0)
+        found = (svs_inode_t *)((long)ctx_val);
+
+out:
+    return found;
+}
+
+/*
+ * Locked wrapper around __svs_inode_ctx_get(): takes inode->lock for the
+ * duration of the lookup. Returns NULL when an argument is NULL or no
+ * context is found.
+ */
+svs_inode_t *
+svs_inode_ctx_get(xlator_t *this, inode_t *inode)
+{
+    svs_inode_t *found = NULL;
+
+    GF_VALIDATE_OR_GOTO("snapview-server", this, out);
+    GF_VALIDATE_OR_GOTO(this->name, inode, out);
+
+    LOCK(&inode->lock);
+    found = __svs_inode_ctx_get(this, inode);
+    UNLOCK(&inode->lock);
+
+out:
+    return found;
+}
+
+/*
+ * Locked wrapper around __svs_inode_ctx_set(): takes inode->lock while the
+ * context is stored. Returns the wrapped call's result, or -1 when an
+ * argument is NULL.
+ */
+int32_t
+svs_inode_ctx_set(xlator_t *this, inode_t *inode, svs_inode_t *svs_inode)
+{
+    int32_t rc = -1;
+
+    GF_VALIDATE_OR_GOTO("snapview-server", this, out);
+    GF_VALIDATE_OR_GOTO(this->name, inode, out);
+    GF_VALIDATE_OR_GOTO(this->name, svs_inode, out);
+
+    LOCK(&inode->lock);
+    rc = __svs_inode_ctx_set(this, inode, svs_inode);
+    UNLOCK(&inode->lock);
+
+out:
+    return rc;
+}
+
+/* Allocate one zero-filled svs_inode_t; returns whatever GF_CALLOC returns. */
+svs_inode_t *
+svs_inode_new(void)
+{
+    return GF_CALLOC(1, sizeof(svs_inode_t), gf_svs_mt_svs_inode_t);
+}
+
+/*
+ * Return the context attached to inode, creating and attaching a fresh one
+ * when none exists. Lookup and creation happen under inode->lock. Returns
+ * NULL when an argument is NULL, when allocation fails, or when the new
+ * context cannot be attached (it is freed in that case).
+ */
+svs_inode_t *
+svs_inode_ctx_get_or_new(xlator_t *this, inode_t *inode)
+{
+    svs_inode_t *ctx = NULL;
+
+    GF_VALIDATE_OR_GOTO("snapview-server", this, out);
+    GF_VALIDATE_OR_GOTO(this->name, inode, out);
+
+    LOCK(&inode->lock);
+    {
+        ctx = __svs_inode_ctx_get(this, inode);
+        if (ctx)
+            goto unlock;
+
+        ctx = svs_inode_new();
+        if (!ctx)
+            goto unlock;
+
+        if (__svs_inode_ctx_set(this, inode, ctx)) {
+            GF_FREE(ctx);
+            ctx = NULL;
+        }
+    }
+unlock:
+    UNLOCK(&inode->lock);
+
+out:
+    return ctx;
+}
+
+/* Allocate one zero-filled svs_fd_t; returns whatever GF_CALLOC returns. */
+svs_fd_t *
+svs_fd_new(void)
+{
+    return GF_CALLOC(1, sizeof(svs_fd_t), gf_svs_mt_svs_fd_t);
+}
+
+/*
+ * Store svs_fd as this xlator's context on fd, using the unlocked
+ * __fd_ctx_set(). Returns its result, or -1 when an argument is NULL.
+ */
+int
+__svs_fd_ctx_set(xlator_t *this, fd_t *fd, svs_fd_t *svs_fd)
+{
+    int rc = -1;
+    uint64_t ctx_val = 0;
+
+    GF_VALIDATE_OR_GOTO("snapview-server", this, out);
+    GF_VALIDATE_OR_GOTO(this->name, fd, out);
+    GF_VALIDATE_OR_GOTO(this->name, svs_fd, out);
+
+    /* The context slot is a 64-bit integer; keep the pointer in it. */
+    ctx_val = (uint64_t)(long)svs_fd;
+    rc = __fd_ctx_set(fd, this, ctx_val);
+
+out:
+    return rc;
+}
+
+/*
+ * Fetch this xlator's context from fd using the unlocked __fd_ctx_get().
+ * Returns NULL when an argument is NULL or the lookup fails.
+ */
+svs_fd_t *
+__svs_fd_ctx_get(xlator_t *this, fd_t *fd)
+{
+    svs_fd_t *found = NULL;
+    uint64_t ctx_val = 0;
+
+    GF_VALIDATE_OR_GOTO("snapview-server", this, out);
+    GF_VALIDATE_OR_GOTO(this->name, fd, out);
+
+    if (__fd_ctx_get(fd, this, &ctx_val) == 0)
+        found = (svs_fd_t *)((long)ctx_val);
+
+out:
+    return found;
+}
+
+/*
+ * Locked wrapper around __svs_fd_ctx_get(): takes fd->lock for the duration
+ * of the lookup. Returns NULL when an argument is NULL or no context is
+ * found.
+ */
+svs_fd_t *
+svs_fd_ctx_get(xlator_t *this, fd_t *fd)
+{
+    svs_fd_t *found = NULL;
+
+    GF_VALIDATE_OR_GOTO("snapview-server", this, out);
+    GF_VALIDATE_OR_GOTO(this->name, fd, out);
+
+    LOCK(&fd->lock);
+    found = __svs_fd_ctx_get(this, fd);
+    UNLOCK(&fd->lock);
+
+out:
+    return found;
+}
+
+/*
+ * Locked wrapper around __svs_fd_ctx_set(): takes fd->lock while the context
+ * is stored. Returns the wrapped call's result, or -1 when an argument is
+ * NULL.
+ */
+int32_t
+svs_fd_ctx_set(xlator_t *this, fd_t *fd, svs_fd_t *svs_fd)
+{
+    int32_t rc = -1;
+
+    GF_VALIDATE_OR_GOTO("snapview-server", this, out);
+    GF_VALIDATE_OR_GOTO(this->name, fd, out);
+    GF_VALIDATE_OR_GOTO(this->name, svs_fd, out);
+
+    LOCK(&fd->lock);
+    rc = __svs_fd_ctx_set(this, fd, svs_fd);
+    UNLOCK(&fd->lock);
+
+out:
+    return rc;
+}
+
+/*
+ * Return the snapview-server context attached to fd, creating and attaching
+ * a new one when none exists yet.
+ *
+ * This is the unlocked variant; svs_fd_ctx_get_or_new() calls it with
+ * fd->lock held.
+ *
+ * For an anonymous fd the underlying gfapi handle is opened here, using the
+ * fs and object stored in the inode context: glfs_h_opendir() for
+ * directories and glfs_h_open() (read-only) for regular files. The handle
+ * is stored in svs_fd->fd.
+ *
+ * Returns the context, or NULL on any failure. On failure the newly
+ * allocated context is freed, and a gfapi handle that was already opened is
+ * closed again if the context could not be attached to fd.
+ */
+svs_fd_t *
+__svs_fd_ctx_get_or_new(xlator_t *this, fd_t *fd)
+{
+    svs_fd_t *svs_fd = NULL;
+    int ret = -1;
+    glfs_t *fs = NULL;
+    glfs_object_t *object = NULL;
+    svs_inode_t *inode_ctx = NULL;
+    glfs_fd_t *glfd = NULL;
+    inode_t *inode = NULL;
+
+    GF_VALIDATE_OR_GOTO("snapview-server", this, out);
+    GF_VALIDATE_OR_GOTO(this->name, fd, out);
+
+    inode = fd->inode;
+
+    /* fast path: a context is already attached to this fd */
+    svs_fd = __svs_fd_ctx_get(this, fd);
+    if (svs_fd) {
+        ret = 0;
+        goto out;
+    }
+
+    svs_fd = svs_fd_new();
+    if (!svs_fd) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, SVS_MSG_NEW_FD_CTX_FAILED,
+               "failed to allocate new fd "
+               "context for gfid %s",
+               uuid_utoa(inode->gfid));
+        goto out;
+    }
+
+    if (fd_is_anonymous(fd)) {
+        inode_ctx = svs_inode_ctx_get(this, inode);
+        if (!inode_ctx) {
+            gf_msg(this->name, GF_LOG_ERROR, 0,
+                   SVS_MSG_GET_INODE_CONTEXT_FAILED,
+                   "failed to get inode "
+                   "context for %s",
+                   uuid_utoa(inode->gfid));
+            goto out;
+        }
+
+        fs = inode_ctx->fs;
+        object = inode_ctx->object;
+
+        /* an inode has exactly one type, so the two cases are exclusive */
+        if (inode->ia_type == IA_IFDIR) {
+            glfd = glfs_h_opendir(fs, object);
+            if (!glfd) {
+                gf_msg(this->name, GF_LOG_ERROR, errno, SVS_MSG_OPENDIR_FAILED,
+                       "failed to "
+                       "open the directory %s",
+                       uuid_utoa(inode->gfid));
+                goto out;
+            }
+        } else if (inode->ia_type == IA_IFREG) {
+            glfd = glfs_h_open(fs, object, O_RDONLY | O_LARGEFILE);
+            if (!glfd) {
+                gf_msg(this->name, GF_LOG_ERROR, errno, SVS_MSG_OPEN_FAILED,
+                       "failed to "
+                       "open the file %s",
+                       uuid_utoa(inode->gfid));
+                goto out;
+            }
+        }
+
+        svs_fd->fd = glfd;
+    }
+
+    ret = __svs_fd_ctx_set(this, fd, svs_fd);
+    if (ret) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, SVS_MSG_SET_FD_CONTEXT_FAILED,
+               "failed to set fd context "
+               "for gfid %s",
+               uuid_utoa(inode->gfid));
+        /* the context is about to be freed: release the handle it owns */
+        if (svs_fd->fd) {
+            if (inode->ia_type == IA_IFDIR) {
+                ret = glfs_closedir(svs_fd->fd);
+                if (ret)
+                    gf_msg(this->name, GF_LOG_ERROR, errno,
+                           SVS_MSG_CLOSEDIR_FAILED,
+                           "failed to close the fd for %s",
+                           uuid_utoa(inode->gfid));
+            } else if (inode->ia_type == IA_IFREG) {
+                ret = glfs_close(svs_fd->fd);
+                if (ret)
+                    /* report errno, as the closedir branch does */
+                    gf_msg(this->name, GF_LOG_ERROR, errno,
+                           SVS_MSG_CLOSE_FAILED,
+                           "failed to close the fd for %s",
+                           uuid_utoa(inode->gfid));
+            }
+        }
+        /* the close result must not mask the original failure */
+        ret = -1;
+    }
+
+out:
+    if (ret) {
+        GF_FREE(svs_fd);
+        svs_fd = NULL;
+    }
+
+    return svs_fd;
+}
+
+/*
+ * Locked wrapper around __svs_fd_ctx_get_or_new(): holds fd->lock while the
+ * context is looked up or created.
+ *
+ * Returns the fd context, or NULL on failure or if an argument is NULL.
+ */
+svs_fd_t *
+svs_fd_ctx_get_or_new(xlator_t *this, fd_t *fd)
+{
+    svs_fd_t *ctx = NULL;
+
+    GF_VALIDATE_OR_GOTO("snapview-server", this, out);
+    GF_VALIDATE_OR_GOTO(this->name, fd, out);
+
+    LOCK(&fd->lock);
+    ctx = __svs_fd_ctx_get_or_new(this, fd);
+    UNLOCK(&fd->lock);
+
+out:
+    return ctx;
+}
+
+/*
+ * Derive a gfid for a snapshot object from the snapshot name and the gfid
+ * the object has in the origin volume.
+ *
+ * The two are concatenated into a key string which is passed to
+ * gf_gfid_generate_from_xxh64(); the result is copied into gfid.
+ *
+ * Returns 0 on success, -1 if this or snapname is NULL or the generation
+ * failed (gfid is left untouched in that case).
+ */
+int
+svs_uuid_generate(xlator_t *this, uuid_t gfid, char *snapname,
+                  uuid_t origin_gfid)
+{
+    char key[NAME_MAX + 32] = "";
+    uuid_t generated = {
+        0,
+    };
+    int ret = -1;
+
+    GF_VALIDATE_OR_GOTO("snapview-server", this, out);
+    GF_VALIDATE_OR_GOTO(this->name, snapname, out);
+
+    (void)snprintf(key, sizeof(key), "%s%s", snapname,
+                   uuid_utoa(origin_gfid));
+
+    if (gf_gfid_generate_from_xxh64(generated, key) != 0) {
+        gf_msg(this->name, GF_LOG_WARNING, 0, SVS_MSG_GFID_GEN_FAILED,
+               "failed to generate "
+               "gfid for object with actual gfid of %s "
+               "(snapname: %s, key: %s)",
+               uuid_utoa(origin_gfid), snapname, key);
+        goto out;
+    }
+
+    gf_uuid_copy(gfid, generated);
+
+    gf_msg_debug(this->name, 0, "gfid generated is %s ", uuid_utoa(gfid));
+
+    ret = 0;
+
+out:
+    return ret;
+}
+
+/*
+ * Set buf->ia_ino from buf->ia_gfid.
+ *
+ * A null gfid yields an inode number of -1; otherwise the number comes from
+ * gfid_to_ino(). Nothing is done if buf (or THIS) is NULL.
+ */
+void
+svs_fill_ino_from_gfid(struct iatt *buf)
+{
+    xlator_t *this = THIS;
+
+    GF_VALIDATE_OR_GOTO("snapview-server", this, out);
+    GF_VALIDATE_OR_GOTO(this->name, buf, out);
+
+    if (gf_uuid_is_null(buf->ia_gfid))
+        buf->ia_ino = -1;
+    else
+        /* consider least significant 8 bytes of value out of gfid */
+        buf->ia_ino = gfid_to_ino(buf->ia_gfid);
+
+out:
+    return;
+}
+
+/*
+ * Fill buf with synthetic directory attributes for the given gfid.
+ *
+ * The entry is presented as a root-owned directory with mode 0755, two
+ * links, a size of 4096 bytes and 8 blocks. All three timestamps are set to
+ * the current time, and the inode number is derived from the gfid.
+ *
+ * Nothing is done if buf (or THIS) is NULL.
+ */
+void
+svs_iatt_fill(uuid_t gfid, struct iatt *buf)
+{
+    struct timeval tv = {
+        0,
+    };
+    xlator_t *this = NULL;
+
+    this = THIS;
+
+    GF_VALIDATE_OR_GOTO("snapview-server", this, out);
+    GF_VALIDATE_OR_GOTO(this->name, buf, out);
+
+    buf->ia_type = IA_IFDIR;
+    buf->ia_uid = 0;
+    buf->ia_gid = 0;
+    buf->ia_nlink = 2;
+    buf->ia_blocks = 8;
+    buf->ia_size = 4096;
+
+    gf_uuid_copy(buf->ia_gfid, gfid);
+    svs_fill_ino_from_gfid(buf);
+
+    buf->ia_prot = ia_prot_from_st_mode(0755);
+
+    gettimeofday(&tv, NULL);
+
+    buf->ia_mtime = buf->ia_atime = buf->ia_ctime = tv.tv_sec;
+    /* struct timeval carries microseconds; iatt wants nanoseconds */
+    buf->ia_mtime_nsec = buf->ia_atime_nsec = buf->ia_ctime_nsec = (tv.tv_usec *
+                                                                    1000);
+
+out:
+    return;
+}
+
+/*
+ * Look up the snapshot entry whose name equals the given name.
+ *
+ * priv->snaplist_lock should be held before calling this function.
+ *
+ * Returns a pointer into the private dirents array, or NULL if the array is
+ * not allocated or no entry matches.
+ */
+snap_dirent_t *
+__svs_get_snap_dirent(xlator_t *this, const char *name)
+{
+    svs_private_t *priv = this->private;
+    snap_dirent_t *list = priv->dirents;
+    int idx = 0;
+
+    if (!list)
+        return NULL;
+
+    for (idx = 0; idx < priv->num_snaps; idx++) {
+        if (strcmp(list[idx].name, name) == 0)
+            return &list[idx];
+    }
+
+    return NULL;
+}
+
+/*
+ * Return the glfs instance for the snapshot called name, initialising it on
+ * first use.
+ *
+ * This is the unlocked variant; svs_initialise_snapshot_volume() calls it
+ * with priv->snaplist_lock held.
+ *
+ * If the snapshot's entry already has an fs, that instance is returned.
+ * Otherwise a new glfs instance is created, pointed at the volfile server,
+ * given its own log file, initialised with glfs_init() and cached in the
+ * entry's fs field.
+ *
+ * On failure NULL is returned and, when op_errno is non-NULL, *op_errno is
+ * set: ENOENT if no snapshot entry matches name, ENOMEM if glfs_new()
+ * failed, and ESTALE for every other failure. A partially set up glfs
+ * instance is destroyed with glfs_fini().
+ */
+glfs_t *
+__svs_initialise_snapshot_volume(xlator_t *this, const char *name,
+                                 int32_t *op_errno)
+{
+    svs_private_t *priv = NULL;
+    int32_t ret = -1;
+    int32_t local_errno = ESTALE;
+    snap_dirent_t *dirent = NULL;
+    char volname[PATH_MAX] = {
+        0,
+    };
+    glfs_t *fs = NULL;
+    int loglevel = GF_LOG_INFO;
+    char logfile[PATH_MAX] = {
+        0,
+    };
+    char *volfile_server = NULL;
+
+    GF_VALIDATE_OR_GOTO("snapview-server", this, out);
+    GF_VALIDATE_OR_GOTO(this->name, this->private, out);
+    GF_VALIDATE_OR_GOTO(this->name, name, out);
+
+    priv = this->private;
+
+    dirent = __svs_get_snap_dirent(this, name);
+    if (!dirent) {
+        gf_msg_debug(this->name, 0,
+                     "snap entry for "
+                     "name %s not found",
+                     name);
+        local_errno = ENOENT;
+        goto out;
+    }
+
+    /* already initialised on an earlier access: reuse the cached instance */
+    if (dirent->fs) {
+        ret = 0;
+        fs = dirent->fs;
+        goto out;
+    }
+
+    snprintf(volname, sizeof(volname), "/snaps/%s/%s/%s", dirent->name,
+             dirent->snap_volname, dirent->snap_volname);
+
+    fs = glfs_new(volname);
+    if (!fs) {
+        local_errno = ENOMEM;
+        gf_msg(this->name, GF_LOG_ERROR, local_errno, SVS_MSG_GLFS_NEW_FAILED,
+               "glfs instance for snap volume %s "
+               "failed",
+               dirent->name);
+        goto out;
+    }
+
+    /*
+     * Before, localhost was used as the volfile server. But, with that
+     * method, accessing snapshots started giving ENOENT error if a
+     * specific bind address is mentioned in the glusterd volume file.
+     * Check the bug https://bugzilla.redhat.com/show_bug.cgi?id=1725211.
+     * So, the new method is tried below, where, snapview-server first
+     * uses the volfile server used by the snapd (obtained from the
+     * command line arguments saved in the global context of the process).
+     * If the volfile server in global context is NULL, then localhost
+     * is tried (like before).
+     */
+    if (this->ctx->cmd_args.volfile_server) {
+        volfile_server = gf_strdup(this->ctx->cmd_args.volfile_server);
+        if (!volfile_server) {
+            gf_msg(this->name, GF_LOG_WARNING, ENOMEM,
+                   SVS_MSG_VOLFILE_SERVER_GET_FAIL,
+                   "failed to copy volfile server %s. ",
+                   this->ctx->cmd_args.volfile_server);
+            ret = -1;
+            goto out;
+        }
+    } else {
+        /* nothing failed to allocate here, so no errno is reported */
+        gf_msg(this->name, GF_LOG_WARNING, 0,
+               SVS_MSG_VOLFILE_SERVER_GET_FAIL,
+               "volfile server is NULL in cmd args. "
+               "Trying with localhost");
+        volfile_server = gf_strdup("localhost");
+        if (!volfile_server) {
+            gf_msg(this->name, GF_LOG_WARNING, ENOMEM,
+                   SVS_MSG_VOLFILE_SERVER_GET_FAIL,
+                   "failed to copy volfile server localhost.");
+            ret = -1;
+            goto out;
+        }
+    }
+
+    ret = glfs_set_volfile_server(fs, "tcp", volfile_server, 24007);
+    if (ret) {
+        gf_msg(this->name, GF_LOG_ERROR, local_errno,
+               SVS_MSG_SET_VOLFILE_SERVR_FAILED,
+               "setting the "
+               "volfile server %s for snap volume %s "
+               "failed",
+               volfile_server, dirent->name);
+        goto out;
+    }
+
+    snprintf(logfile, sizeof(logfile),
+             DEFAULT_SVD_LOG_FILE_DIRECTORY "/snaps/%s/%s-%s.log",
+             priv->volname, name, dirent->uuid);
+
+    ret = glfs_set_logging(fs, logfile, loglevel);
+    if (ret) {
+        gf_msg(this->name, GF_LOG_ERROR, local_errno,
+               SVS_MSG_SET_LOGGING_FAILED,
+               "failed to set the "
+               "log file path");
+        goto out;
+    }
+
+    ret = glfs_init(fs);
+    if (ret) {
+        gf_msg(this->name, GF_LOG_ERROR, local_errno, SVS_MSG_GLFS_INIT_FAILED,
+               "initing the "
+               "fs for %s failed",
+               dirent->name);
+        goto out;
+    }
+
+    ret = 0;
+
+out:
+    if (ret) {
+        if (op_errno)
+            *op_errno = local_errno;
+
+        /* tear down an instance that was created but not fully set up */
+        if (fs)
+            glfs_fini(fs);
+        fs = NULL;
+    }
+
+    /* fs is only non-NULL here on success, which implies dirent was found */
+    if (fs) {
+        dirent->fs = fs;
+    }
+
+    GF_FREE(volfile_server);
+    return fs;
+}
+
+/*
+ * Locked wrapper around __svs_initialise_snapshot_volume(): holds
+ * priv->snaplist_lock while the snapshot's glfs instance is looked up or
+ * created.
+ *
+ * Returns the glfs instance, or NULL on failure or if this, this->private
+ * or name is NULL.
+ */
+glfs_t *
+svs_initialise_snapshot_volume(xlator_t *this, const char *name,
+                               int32_t *op_errno)
+{
+    glfs_t *snap_fs = NULL;
+    svs_private_t *priv = NULL;
+
+    GF_VALIDATE_OR_GOTO("snapview-server", this, out);
+    GF_VALIDATE_OR_GOTO(this->name, this->private, out);
+    GF_VALIDATE_OR_GOTO(this->name, name, out);
+
+    priv = this->private;
+
+    LOCK(&priv->snaplist_lock);
+    snap_fs = __svs_initialise_snapshot_volume(this, name, op_errno);
+    UNLOCK(&priv->snaplist_lock);
+
+out:
+    return snap_fs;
+}
+
+/*
+ * Return the last entry of the private snapshot list.
+ *
+ * priv->snaplist_lock is held while the list is inspected. The returned
+ * pointer refers into priv->dirents and is handed back after the lock has
+ * been released.
+ *
+ * Returns NULL if this or this->private is NULL, or if the list is
+ * unallocated or empty.
+ */
+snap_dirent_t *
+svs_get_latest_snap_entry(xlator_t *this)
+{
+    svs_private_t *priv = NULL;
+    snap_dirent_t *dirent = NULL;
+
+    GF_VALIDATE_OR_GOTO("svs", this, out);
+    /* priv is dereferenced for the lock below, so it must not be NULL */
+    GF_VALIDATE_OR_GOTO(this->name, this->private, out);
+
+    priv = this->private;
+
+    LOCK(&priv->snaplist_lock);
+    {
+        if (priv->dirents && priv->num_snaps)
+            dirent = &priv->dirents[priv->num_snaps - 1];
+    }
+    UNLOCK(&priv->snaplist_lock);
+
+out:
+    return dirent;
+}
+
+/*
+ * Return the glfs instance of the last entry in the snapshot list.
+ *
+ * The entry is obtained from svs_get_latest_snap_entry(); its fs field is
+ * then read under priv->snaplist_lock.
+ *
+ * Returns NULL if this or this->private is NULL, or if there is no latest
+ * entry. The returned value is also NULL when that entry has no fs set.
+ */
+glfs_t *
+svs_get_latest_snapshot(xlator_t *this)
+{
+    glfs_t *fs = NULL;
+    snap_dirent_t *dirent = NULL;
+    svs_private_t *priv = NULL;
+
+    GF_VALIDATE_OR_GOTO("svs", this, out);
+    /* priv is dereferenced for the lock below, so it must not be NULL */
+    GF_VALIDATE_OR_GOTO(this->name, this->private, out);
+
+    priv = this->private;
+
+    dirent = svs_get_latest_snap_entry(this);
+
+    if (dirent) {
+        LOCK(&priv->snaplist_lock);
+        {
+            fs = dirent->fs;
+        }
+        UNLOCK(&priv->snaplist_lock);
+    }
+
+out:
+    return fs;
+}
+
+/*
+ * Return the glfs instance recorded in inode_ctx->fs.
+ *
+ * NOTE(review): SVS_CHECK_VALID_SNAPSHOT_HANDLE is defined outside this
+ * part of the source; presumably it checks that fs still belongs to a known
+ * snapshot and may reset it otherwise - confirm against the macro
+ * definition.
+ *
+ * Returns NULL if this or inode_ctx is NULL.
+ */
+glfs_t *
+svs_inode_ctx_glfs_mapping(xlator_t *this, svs_inode_t *inode_ctx)
+{
+    glfs_t *fs = NULL;
+
+    GF_VALIDATE_OR_GOTO("svs", this, out);
+    GF_VALIDATE_OR_GOTO(this->name, inode_ctx, out);
+
+    fs = inode_ctx->fs;
+
+    SVS_CHECK_VALID_SNAPSHOT_HANDLE(fs, this);
+
+out:
+    return fs;
+}
+
+/*
+ * Return the glfs instance associated with inode.
+ *
+ * The inode's snapview-server context is fetched and handed to
+ * svs_inode_ctx_glfs_mapping().
+ *
+ * Returns NULL if this or inode is NULL, if the inode has no context, or if
+ * svs_inode_ctx_glfs_mapping() returns NULL.
+ */
+glfs_t *
+svs_inode_glfs_mapping(xlator_t *this, inode_t *inode)
+{
+    svs_inode_t *inode_ctx = NULL;
+    glfs_t *fs = NULL;
+
+    /* both are dereferenced on the error path below */
+    GF_VALIDATE_OR_GOTO("snapview-server", this, out);
+    GF_VALIDATE_OR_GOTO(this->name, inode, out);
+
+    inode_ctx = svs_inode_ctx_get(this, inode);
+    if (!inode_ctx) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, SVS_MSG_GET_INODE_CONTEXT_FAILED,
+               "inode context not found for"
+               " the inode %s",
+               uuid_utoa(inode->gfid));
+        goto out;
+    }
+
+    fs = svs_inode_ctx_glfs_mapping(this, inode_ctx);
+
+out:
+    return fs;
+}
diff --git a/xlators/features/snapview-server/src/snapview-server-mem-types.h b/xlators/features/snapview-server/src/snapview-server-mem-types.h
new file mode 100644
index 00000000000..63456b85323
--- /dev/null
+++ b/xlators/features/snapview-server/src/snapview-server-mem-types.h
@@ -0,0 +1,25 @@
+/*
+   Copyright (c) 2014 Red Hat, Inc. <http://www.redhat.com>
+   This file is part of GlusterFS.
+
+   This file is licensed to you under your choice of the GNU Lesser
+   General Public License, version 3 or any later version (LGPLv3 or
+   later), or the GNU General Public License, version 2 (GPLv2), in all
+   cases as published by the Free Software Foundation.
+*/
+
+#ifndef __SNAP_VIEW_MEM_TYPES_H
+#define __SNAP_VIEW_MEM_TYPES_H
+
+#include <glusterfs/mem-types.h>
+
+/*
+ * Memory accounting types of the snapview-server translator. Numbering
+ * continues after the common types (gf_common_mt_end); gf_svs_mt_end marks
+ * the end of the list.
+ */
+enum snapview_mem_types {
+    gf_svs_mt_priv_t = gf_common_mt_end + 1,
+    gf_svs_mt_svs_inode_t,
+    gf_svs_mt_dirents_t, /* snap_dirent_t arrays holding the snapshot list */
+    gf_svs_mt_svs_fd_t,  /* svs_fd_t fd contexts (see svs_fd_new()) */
+    gf_svs_mt_snaplist_t,
+    gf_svs_mt_end
+};
+
+#endif
diff --git a/xlators/features/snapview-server/src/snapview-server-messages.h b/xlators/features/snapview-server/src/snapview-server-messages.h
new file mode 100644
index 00000000000..f634ab5d2b0
--- /dev/null
+++ b/xlators/features/snapview-server/src/snapview-server-messages.h
@@ -0,0 +1,54 @@
+/*
+ Copyright (c) 2018 Red Hat, Inc. <http://www.redhat.com>
+ This file is part of GlusterFS.
+
+ This file is licensed to you under your choice of the GNU Lesser
+ General Public License, version 3 or any later version (LGPLv3 or
+ later), or the GNU General Public License, version 2 (GPLv2), in all
+ cases as published by the Free Software Foundation.
+ */
+
+#ifndef _SNAPVIEW_SERVER_MESSAGES_H_
+#define _SNAPVIEW_SERVER_MESSAGES_H_
+
+#include <glusterfs/glfs-message-id.h>
+
+/* To add new message IDs, append new identifiers at the end of the list.
+ *
+ * Never remove a message ID. If it's not used anymore, you can rename it or
+ * leave it as it is, but not delete it. This is to prevent reutilization of
+ * IDs by other messages.
+ *
+ * The component name must match one of the entries defined in
+ * glfs-message-id.h.
+ */
+
+/* Message IDs of the SNAPVIEW_SERVER component, passed to gf_msg() calls. */
+GLFS_MSGID(SNAPVIEW_SERVER, SVS_MSG_NO_MEMORY, SVS_MSG_MEM_ACNT_FAILED,
+           SVS_MSG_NULL_GFID, SVS_MSG_GET_LATEST_SNAP_FAILED,
+           SVS_MSG_INVALID_GLFS_CTX, SVS_MSG_LOCK_DESTROY_FAILED,
+           SVS_MSG_SNAPSHOT_LIST_CHANGED, SVS_MSG_MGMT_INIT_FAILED,
+           SVS_MSG_GET_SNAPSHOT_LIST_FAILED, SVS_MSG_GET_GLFS_H_OBJECT_FAILED,
+           SVS_MSG_PARENT_CTX_OR_NAME_NULL, SVS_MSG_SET_INODE_CONTEXT_FAILED,
+           SVS_MSG_GET_INODE_CONTEXT_FAILED, SVS_MSG_NEW_INODE_CTX_FAILED,
+           SVS_MSG_DELETE_INODE_CONTEXT_FAILED, SVS_MSG_SET_FD_CONTEXT_FAILED,
+           SVS_MSG_NEW_FD_CTX_FAILED, SVS_MSG_DELETE_FD_CTX_FAILED,
+           SVS_MSG_GETXATTR_FAILED, SVS_MSG_LISTXATTR_FAILED,
+           SVS_MSG_RELEASEDIR_FAILED, SVS_MSG_RELEASE_FAILED,
+           SVS_MSG_TELLDIR_FAILED, SVS_MSG_STAT_FAILED, SVS_MSG_STATFS_FAILED,
+           SVS_MSG_OPEN_FAILED, SVS_MSG_READ_FAILED, SVS_MSG_READLINK_FAILED,
+           SVS_MSG_ACCESS_FAILED, SVS_MSG_GET_FD_CONTEXT_FAILED,
+           SVS_MSG_DICT_SET_FAILED, SVS_MSG_OPENDIR_FAILED,
+           SVS_MSG_FS_INSTANCE_INVALID, SVS_MSG_SETFSUID_FAIL,
+           SVS_MSG_SETFSGID_FAIL, SVS_MSG_SETFSGRPS_FAIL,
+           SVS_MSG_BUILD_TRNSPRT_OPT_FAILED, SVS_MSG_RPC_INIT_FAILED,
+           SVS_MSG_REG_NOTIFY_FAILED, SVS_MSG_REG_CBK_PRGM_FAILED,
+           SVS_MSG_RPC_CLNT_START_FAILED, SVS_MSG_XDR_PAYLOAD_FAILED,
+           SVS_MSG_NULL_CTX, SVS_MSG_RPC_CALL_FAILED, SVS_MSG_XDR_DECODE_FAILED,
+           SVS_MSG_RSP_DICT_EMPTY, SVS_MSG_DICT_GET_FAILED,
+           SVS_MSG_SNAP_LIST_REFRESH_FAILED, SVS_MSG_RPC_REQ_FAILED,
+           SVS_MSG_CLOSEDIR_FAILED, SVS_MSG_CLOSE_FAILED,
+           SVS_MSG_GFID_GEN_FAILED, SVS_MSG_GLFS_NEW_FAILED,
+           SVS_MSG_SET_VOLFILE_SERVR_FAILED, SVS_MSG_SET_LOGGING_FAILED,
+           SVS_MSG_VOLFILE_SERVER_GET_FAIL, SVS_MSG_GLFS_INIT_FAILED);
+
+#endif /* !_SNAPVIEW_SERVER_MESSAGES_H_ */
diff --git a/xlators/features/snapview-server/src/snapview-server-mgmt.c b/xlators/features/snapview-server/src/snapview-server-mgmt.c
new file mode 100644
index 00000000000..ecf31c3b880
--- /dev/null
+++ b/xlators/features/snapview-server/src/snapview-server-mgmt.c
@@ -0,0 +1,524 @@
+/*
+   Copyright (c) 2014 Red Hat, Inc. <http://www.redhat.com>
+   This file is part of GlusterFS.
+
+   This file is licensed to you under your choice of the GNU Lesser
+   General Public License, version 3 or any later version (LGPLv3 or
+   later), or the GNU General Public License, version 2 (GPLv2), in all
+   cases as published by the Free Software Foundation.
+*/
+#include "snapview-server.h"
+#include "snapview-server-mem-types.h"
+#include <pthread.h>
+
+/*
+ * Callback invoked for GF_CBK_GET_SNAPS: the list of snapshots changed, so
+ * request a fresh list.
+ *
+ * mydata is the translator instance. The result of svs_get_snapshot_list()
+ * is not propagated; this always returns 0.
+ */
+int
+mgmt_cbk_snap(struct rpc_clnt *rpc, void *mydata, void *data)
+{
+    xlator_t *this = mydata;
+
+    GF_ASSERT(this);
+
+    gf_msg("mgmt", GF_LOG_INFO, 0, SVS_MSG_SNAPSHOT_LIST_CHANGED,
+           "list of snapshots changed");
+
+    svs_get_snapshot_list(this);
+
+    return 0;
+}
+
+/*
+ * Callback actor table, indexed by callback procedure number. Only
+ * GF_CBK_GET_SNAPS is handled, by mgmt_cbk_snap().
+ */
+static rpcclnt_cb_actor_t svs_cbk_actors[GF_CBK_MAXVALUE] = {
+    [GF_CBK_GET_SNAPS] = {"GETSNAPS", mgmt_cbk_snap, GF_CBK_GET_SNAPS},
+};
+
+/*
+ * Callback program description built on svs_cbk_actors; registered with the
+ * RPC client in svs_mgmt_init().
+ */
+static struct rpcclnt_cb_program svs_cbk_prog = {
+    .progname = "GlusterFS Callback",
+    .prognum = GLUSTER_CBK_PROGRAM,
+    .progver = GLUSTER_CBK_VERSION,
+    .actors = svs_cbk_actors,
+    .numactors = GF_CBK_MAXVALUE,
+};
+
+/*
+ * Printable names of the handshake procedures, indexed by procedure number.
+ * GF_HNDSK_GET_SNAPSHOT_INFO is named because it is the procedure this file
+ * submits (see svs_get_snapshot_list()); without an entry its name would be
+ * a NULL pointer.
+ */
+static char *clnt_handshake_procs[GF_HNDSK_MAXVALUE] = {
+    [GF_HNDSK_NULL] = "NULL",
+    [GF_HNDSK_EVENT_NOTIFY] = "EVENTNOTIFY",
+    [GF_HNDSK_GET_SNAPSHOT_INFO] = "GET_SNAPSHOT_INFO",
+};
+
+/*
+ * Handshake program description; svs_get_snapshot_list() submits its
+ * GF_HNDSK_GET_SNAPSHOT_INFO request against this program.
+ */
+static rpc_clnt_prog_t svs_clnt_handshake_prog = {
+    .progname = "GlusterFS Handshake",
+    .prognum = GLUSTER_HNDSK_PROGRAM,
+    .progver = GLUSTER_HNDSK_VERSION,
+    .procnames = clnt_handshake_procs,
+};
+
+/*
+ * RPC client notification handler.
+ *
+ * Only RPC_CLNT_CONNECT is acted upon: the snapshot list is requested
+ * again. Every other event is ignored.
+ *
+ * Returns 0, or -1 if requesting the snapshot list failed.
+ */
+static int
+svs_rpc_notify(struct rpc_clnt *rpc, void *mydata, rpc_clnt_event_t event,
+               void *data)
+{
+    xlator_t *this = mydata;
+
+    if (event != RPC_CLNT_CONNECT)
+        return 0;
+
+    if (svs_get_snapshot_list(this)) {
+        gf_msg(this->name, GF_LOG_ERROR, EINVAL,
+               SVS_MSG_GET_SNAPSHOT_LIST_FAILED,
+               "Error in refreshing the snaplist "
+               "infrastructure");
+        return -1;
+    }
+
+    return 0;
+}
+
+/*
+ * Set up the management RPC client of the translator.
+ *
+ * The client connects to the volfile server given on the command line
+ * ("localhost" when none was given) on GF_DEFAULT_BASE_PORT. The notify
+ * handler and the callback program are registered before the client is
+ * started.
+ *
+ * Returns 0 on success and non-zero on failure. On failure an RPC client
+ * that was already created is cleaned up and priv->rpc is reset to NULL.
+ */
+int
+svs_mgmt_init(xlator_t *this)
+{
+    int ret = -1;
+    svs_private_t *priv = NULL;
+    dict_t *options = NULL;
+    int port = GF_DEFAULT_BASE_PORT;
+    char *host = NULL;
+    cmd_args_t *cmd_args = NULL;
+    glusterfs_ctx_t *ctx = NULL;
+    xlator_cmdline_option_t *opt = NULL;
+
+    GF_VALIDATE_OR_GOTO("snapview-server", this, out);
+    GF_VALIDATE_OR_GOTO(this->name, this->private, out);
+    GF_VALIDATE_OR_GOTO(this->name, this->ctx, out);
+
+    priv = this->private;
+
+    ctx = this->ctx;
+    cmd_args = &ctx->cmd_args;
+
+    host = "localhost";
+    if (cmd_args->volfile_server)
+        host = cmd_args->volfile_server;
+
+    options = dict_new();
+    if (!options)
+        goto out;
+
+    opt = find_xlator_option_in_cmd_args_t("address-family", cmd_args);
+    ret = rpc_transport_inet_options_build(options, host, port,
+                                           (opt != NULL ? opt->value : NULL));
+    if (ret) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, SVS_MSG_BUILD_TRNSPRT_OPT_FAILED,
+               "failed to build the "
+               "transport options");
+        goto out;
+    }
+
+    priv->rpc = rpc_clnt_new(options, this, this->name, 8);
+    if (!priv->rpc) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, SVS_MSG_RPC_INIT_FAILED,
+               "failed to initialize RPC");
+        /* ret is still 0 from the options build; report the failure */
+        ret = -1;
+        goto out;
+    }
+
+    ret = rpc_clnt_register_notify(priv->rpc, svs_rpc_notify, this);
+    if (ret) {
+        gf_msg(this->name, GF_LOG_WARNING, 0, SVS_MSG_REG_NOTIFY_FAILED,
+               "failed to register notify function");
+        goto out;
+    }
+
+    ret = rpcclnt_cbk_program_register(priv->rpc, &svs_cbk_prog, this);
+    if (ret) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, SVS_MSG_REG_CBK_PRGM_FAILED,
+               "failed to register callback program");
+        goto out;
+    }
+
+    ret = rpc_clnt_start(priv->rpc);
+    if (ret) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, SVS_MSG_RPC_CLNT_START_FAILED,
+               "failed to start the rpc "
+               "client");
+        goto out;
+    }
+
+    ret = 0;
+
+    gf_msg_debug(this->name, 0, "svs mgmt init successful");
+
+out:
+    if (options)
+        dict_unref(options);
+
+    /*
+     * Failures before rpc_clnt_new() succeeded leave priv->rpc NULL, so
+     * there is nothing to clean up (and nothing safe to dereference).
+     */
+    if (ret && priv && priv->rpc) {
+        rpc_clnt_connection_cleanup(&priv->rpc->conn);
+        rpc_clnt_unref(priv->rpc);
+        priv->rpc = NULL;
+    }
+
+    return ret;
+}
+
+/*
+ * Serialize req with xdrproc and submit it over the management RPC client
+ * (ctx->mgmt) as procedure procnum of prog; cbkfn receives the reply.
+ *
+ * Returns the result of rpc_clnt_submit(), or -1 if an argument is NULL, a
+ * buffer could not be obtained or the XDR serialization failed. The local
+ * references on the iobref and iobuf are dropped before returning.
+ */
+int
+svs_mgmt_submit_request(void *req, call_frame_t *frame, glusterfs_ctx_t *ctx,
+                        rpc_clnt_prog_t *prog, int procnum, fop_cbk_fn_t cbkfn,
+                        xdrproc_t xdrproc)
+{
+    int ret = -1;
+    struct iovec payload = {
+        0,
+    };
+    struct iobuf *buf = NULL;
+    struct iobref *bufref = NULL;
+    ssize_t needed = 0;
+
+    GF_VALIDATE_OR_GOTO("snapview-server", frame, out);
+    GF_VALIDATE_OR_GOTO("snapview-server", req, out);
+    GF_VALIDATE_OR_GOTO("snapview-server", ctx, out);
+    GF_VALIDATE_OR_GOTO("snapview-server", prog, out);
+
+    GF_ASSERT(frame->this);
+
+    bufref = iobref_new();
+    if (!bufref) {
+        gf_msg(frame->this->name, GF_LOG_WARNING, ENOMEM, SVS_MSG_NO_MEMORY,
+               "failed to allocate "
+               "new iobref");
+        goto out;
+    }
+
+    /* req was validated above, so there is always a payload to encode */
+    needed = xdr_sizeof(xdrproc, req);
+
+    buf = iobuf_get2(ctx->iobuf_pool, needed);
+    if (!buf)
+        goto out;
+
+    iobref_add(bufref, buf);
+
+    payload.iov_base = buf->ptr;
+    payload.iov_len = iobuf_pagesize(buf);
+
+    /* Create the xdr payload */
+    ret = xdr_serialize_generic(payload, req, xdrproc);
+    if (ret == -1) {
+        gf_msg(frame->this->name, GF_LOG_WARNING, 0,
+               SVS_MSG_XDR_PAYLOAD_FAILED, "Failed to create XDR payload");
+        goto out;
+    }
+    /* shrink the vector to the number of bytes actually encoded */
+    payload.iov_len = ret;
+
+    ret = rpc_clnt_submit(ctx->mgmt, prog, procnum, cbkfn, &payload, 1, NULL,
+                          0, bufref, frame, NULL, 0, NULL, 0, NULL);
+
+out:
+    if (bufref)
+        iobref_unref(bufref);
+
+    if (buf)
+        iobuf_unref(buf);
+    return ret;
+}
+
+/*
+ * Reply handler for the GF_HNDSK_GET_SNAPSHOT_INFO request sent by
+ * svs_get_snapshot_list().
+ *
+ * The reply carries a serialized dict with "snap-count" and, for each
+ * snapshot i (1-based), "snap-volname.i", "snap-id.i" and "snapname.i".
+ * A new dirents array is built from it and swapped into the private
+ * structure under priv->snaplist_lock. glfs instances of snapshots that are
+ * still present are carried over; those of snapshots that disappeared are
+ * finalized with glfs_fini() after the lock is released.
+ *
+ * req, iov: the RPC request and the reply buffer.
+ * count: unused.
+ * myframe: the call frame created by the requester; destroyed here unless
+ *          one of the arguments fails validation.
+ *
+ * Returns 0 on success and a negative value on failure; on failure the
+ * current snapshot list is left untouched.
+ */
+int
+mgmt_get_snapinfo_cbk(struct rpc_req *req, struct iovec *iov, int count,
+                      void *myframe)
+{
+    gf_getsnap_name_uuid_rsp rsp = {
+        0,
+    };
+    call_frame_t *frame = NULL;
+    glusterfs_ctx_t *ctx = NULL;
+    int ret = -1;
+    dict_t *dict = NULL;
+    char key[32] = {0};
+    int len;
+    int snapcount = 0;
+    svs_private_t *priv = NULL;
+    xlator_t *this = NULL;
+    int i = 0;
+    int j = 0;
+    char *value = NULL;
+    snap_dirent_t *dirents = NULL;
+    snap_dirent_t *old_dirents = NULL;
+    int oldcount = 0;
+
+    GF_VALIDATE_OR_GOTO("snapview-server", req, error_out);
+    GF_VALIDATE_OR_GOTO("snapview-server", myframe, error_out);
+    GF_VALIDATE_OR_GOTO("snapview-server", iov, error_out);
+
+    frame = myframe;
+    this = frame->this;
+    ctx = frame->this->ctx;
+    priv = this->private;
+
+    if (!ctx) {
+        errno = EINVAL;
+        gf_msg(frame->this->name, GF_LOG_ERROR, errno, SVS_MSG_NULL_CTX,
+               "NULL context");
+        goto out;
+    }
+
+    if (-1 == req->rpc_status) {
+        errno = EINVAL;
+        gf_msg(frame->this->name, GF_LOG_ERROR, errno, SVS_MSG_RPC_CALL_FAILED,
+               "RPC call is not successful");
+        goto out;
+    }
+
+    ret = xdr_to_generic(*iov, &rsp, (xdrproc_t)xdr_gf_getsnap_name_uuid_rsp);
+    if (ret < 0) {
+        gf_msg(frame->this->name, GF_LOG_ERROR, 0, SVS_MSG_XDR_DECODE_FAILED,
+               "Failed to decode xdr response, rsp.op_ret = %d", rsp.op_ret);
+        goto out;
+    }
+
+    if (rsp.op_ret == -1) {
+        errno = rsp.op_errno;
+        ret = -1;
+        goto out;
+    }
+
+    if (!rsp.dict.dict_len) {
+        ret = -1;
+        errno = EINVAL;
+        gf_msg(frame->this->name, GF_LOG_ERROR, errno, SVS_MSG_RSP_DICT_EMPTY,
+               "Response dict is not populated");
+        goto out;
+    }
+
+    dict = dict_new();
+    if (!dict) {
+        ret = -1;
+        errno = ENOMEM;
+        goto out;
+    }
+
+    ret = dict_unserialize(rsp.dict.dict_val, rsp.dict.dict_len, &dict);
+    if (ret) {
+        errno = EINVAL;
+        gf_msg(frame->this->name, GF_LOG_ERROR, errno,
+               LG_MSG_DICT_UNSERIAL_FAILED, "Failed to unserialize dictionary");
+        goto out;
+    }
+
+    ret = dict_get_int32(dict, "snap-count", (int32_t *)&snapcount);
+    if (ret) {
+        errno = EINVAL;
+        ret = -1;
+        gf_msg(this->name, GF_LOG_ERROR, errno, SVS_MSG_DICT_GET_FAILED,
+               "Error retrieving snapcount");
+        goto out;
+    }
+
+    if (snapcount > 0) {
+        /*
+         * A fresh array is built on every refresh; it is zero-filled, which
+         * the string copies below rely on for NUL termination.
+         */
+        dirents = GF_CALLOC(snapcount, sizeof(snap_dirent_t),
+                            gf_svs_mt_dirents_t);
+        if (!dirents) {
+            errno = ENOMEM;
+            ret = -1;
+            gf_msg(frame->this->name, GF_LOG_ERROR, errno, SVS_MSG_NO_MEMORY,
+                   "Unable to allocate memory");
+            goto out;
+        }
+    }
+
+    for (i = 0; i < snapcount; i++) {
+        len = snprintf(key, sizeof(key), "snap-volname.%d", i + 1);
+        ret = dict_get_strn(dict, key, len, &value);
+        if (ret) {
+            errno = EINVAL;
+            ret = -1;
+            gf_msg(this->name, GF_LOG_ERROR, errno, SVS_MSG_DICT_GET_FAILED,
+                   "Error retrieving snap volname %d", i + 1);
+            goto out;
+        }
+
+        /*
+         * Copy at most size - 1 bytes: strncpy() does not terminate the
+         * destination when the source fills it, and these fields are later
+         * used with strcmp() and "%s".
+         */
+        strncpy(dirents[i].snap_volname, value,
+                sizeof(dirents[i].snap_volname) - 1);
+
+        len = snprintf(key, sizeof(key), "snap-id.%d", i + 1);
+        ret = dict_get_strn(dict, key, len, &value);
+        if (ret) {
+            errno = EINVAL;
+            ret = -1;
+            gf_msg(this->name, GF_LOG_ERROR, errno, SVS_MSG_DICT_GET_FAILED,
+                   "Error retrieving snap uuid %d", i + 1);
+            goto out;
+        }
+        strncpy(dirents[i].uuid, value, sizeof(dirents[i].uuid) - 1);
+
+        len = snprintf(key, sizeof(key), "snapname.%d", i + 1);
+        ret = dict_get_strn(dict, key, len, &value);
+        if (ret) {
+            errno = EINVAL;
+            ret = -1;
+            gf_msg(this->name, GF_LOG_ERROR, errno, SVS_MSG_DICT_GET_FAILED,
+                   "Error retrieving snap name %d", i + 1);
+            goto out;
+        }
+        strncpy(dirents[i].name, value, sizeof(dirents[i].name) - 1);
+    }
+
+    /*
+     * Got the new snap list populated in dirents
+     * The new snap list is either a subset or a superset of
+     * the existing snaplist old_dirents which has priv->num_snaps
+     * number of entries.
+     *
+     * If subset, then clean up the fs for entries which are
+     * no longer relevant.
+     *
+     * For other overlapping entries set the fs for new dirents
+     * entries which have a fs assigned already in old_dirents
+     *
+     * We do this as we don't want to do new glfs_init()s repeatedly
+     * as the dirents entries for snapshot volumes get repeatedly
+     * cleaned up and allocated. And if we don't then that will lead
+     * to memleaks
+     */
+
+    LOCK(&priv->snaplist_lock);
+    {
+        oldcount = priv->num_snaps;
+        old_dirents = priv->dirents;
+        for (i = 0; i < priv->num_snaps; i++) {
+            for (j = 0; j < snapcount; j++) {
+                if ((!strcmp(old_dirents[i].name, dirents[j].name)) &&
+                    (!strcmp(old_dirents[i].uuid, dirents[j].uuid))) {
+                    /* hand the live instance over to the new entry */
+                    dirents[j].fs = old_dirents[i].fs;
+                    old_dirents[i].fs = NULL;
+                    break;
+                }
+            }
+        }
+
+        priv->dirents = dirents;
+        priv->num_snaps = snapcount;
+    }
+    UNLOCK(&priv->snaplist_lock);
+
+    if (old_dirents) {
+        for (i = 0; i < oldcount; i++) {
+            /* entries whose fs moved to the new list have nothing to close */
+            if (old_dirents[i].fs) {
+                gf_msg_debug(this->name, 0,
+                             "calling glfs_fini on "
+                             "name: %s, snap_volname: %s, uuid: %s",
+                             old_dirents[i].name, old_dirents[i].snap_volname,
+                             old_dirents[i].uuid);
+                glfs_fini(old_dirents[i].fs);
+            }
+        }
+    }
+
+    GF_FREE(old_dirents);
+
+    ret = 0;
+
+out:
+    if (dict) {
+        dict_unref(dict);
+    }
+    /* the XDR decoder allocates these with plain malloc() */
+    free(rsp.dict.dict_val);
+    free(rsp.op_errstr);
+
+    if (ret && dirents) {
+        gf_msg(this->name, GF_LOG_WARNING, 0, SVS_MSG_SNAP_LIST_REFRESH_FAILED,
+               "Could not update dirents with refreshed snap list");
+        GF_FREE(dirents);
+    }
+
+    if (myframe)
+        SVS_STACK_DESTROY(myframe);
+
+error_out:
+    return ret;
+}
+
+/*
+ * Ask the management server for the list of snapshots of priv->volname.
+ *
+ * A dict holding the volume name is serialized into a
+ * GF_HNDSK_GET_SNAPSHOT_INFO request; the reply is processed by
+ * mgmt_get_snapinfo_cbk().
+ *
+ * Returns the result of svs_mgmt_submit_request(), or a non-zero value if
+ * the request could not be prepared. The frame is destroyed here only when
+ * the request was never handed to svs_mgmt_submit_request().
+ */
+int
+svs_get_snapshot_list(xlator_t *this)
+{
+    gf_getsnap_name_uuid_req req = {{
+        0,
+    }};
+    int ret = -1;
+    dict_t *args = NULL;
+    glusterfs_ctx_t *ctx = NULL;
+    call_frame_t *frame = NULL;
+    svs_private_t *priv = NULL;
+    gf_boolean_t destroy_frame = _gf_true;
+
+    GF_VALIDATE_OR_GOTO("snapview-server", this, out);
+
+    ctx = this->ctx;
+    if (ctx == NULL) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, SVS_MSG_NULL_CTX, "ctx is NULL");
+        goto out;
+    }
+
+    frame = create_frame(this, ctx->pool);
+    if (frame == NULL) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, LG_MSG_FRAME_ERROR,
+               "Error allocating frame");
+        goto out;
+    }
+
+    priv = this->private;
+
+    args = dict_new();
+    if (args == NULL) {
+        gf_msg(this->name, GF_LOG_ERROR, ENOMEM, SVS_MSG_NO_MEMORY,
+               "Error allocating dictionary");
+        goto out;
+    }
+
+    ret = dict_set_str(args, "volname", priv->volname);
+    if (ret != 0) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, SVS_MSG_DICT_SET_FAILED,
+               "Error setting volname in dict");
+        goto out;
+    }
+
+    ret = dict_allocate_and_serialize(args, &req.dict.dict_val,
+                                      &req.dict.dict_len);
+    if (ret != 0) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, LG_MSG_DICT_UNSERIAL_FAILED,
+               "Failed to serialize dictionary");
+        ret = -1;
+        goto out;
+    }
+
+    ret = svs_mgmt_submit_request(
+        &req, frame, ctx, &svs_clnt_handshake_prog, GF_HNDSK_GET_SNAPSHOT_INFO,
+        mgmt_get_snapinfo_cbk, (xdrproc_t)xdr_gf_getsnap_name_uuid_req);
+    if (ret != 0) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, SVS_MSG_RPC_REQ_FAILED,
+               "Error sending snapshot names RPC request");
+    }
+
+    /* from here on the frame is left to mgmt_get_snapinfo_cbk */
+    destroy_frame = _gf_false;
+
+out:
+    if (args != NULL) {
+        dict_unref(args);
+    }
+    GF_FREE(req.dict.dict_val);
+
+    /*
+     * Destroy the frame if we encountered an error
+     * Else we need to clean it up in
+     * mgmt_get_snapinfo_cbk
+     */
+    if (destroy_frame && frame) {
+        SVS_STACK_DESTROY(frame);
+    }
+
+    return ret;
+}
diff --git a/xlators/features/snapview-server/src/snapview-server.c b/xlators/features/snapview-server/src/snapview-server.c
new file mode 100644
index 00000000000..76cccae5914
--- /dev/null
+++ b/xlators/features/snapview-server/src/snapview-server.c
@@ -0,0 +1,2720 @@
+/*
+   Copyright (c) 2014 Red Hat, Inc. <http://www.redhat.com>
+   This file is part of GlusterFS.
+
+   This file is licensed to you under your choice of the GNU Lesser
+   General Public License, version 3 or any later version (LGPLv3 or
+   later), or the GNU General Public License, version 2 (GPLv2), in all
+   cases as published by the Free Software Foundation.
+*/
+#include "snapview-server.h"
+#include "snapview-server-mem-types.h"
+#include <glusterfs/compat-errno.h>
+
+#include <glusterfs/xlator.h>
+#include "rpc-clnt.h"
+#include "xdr-generic.h"
+#include "protocol-common.h"
+#include <glusterfs/syscall.h>
+#include <pthread.h>
+
+#include "glfs-internal.h"
+
+/*
+ * Apply the given credentials through the gfapi setfs* calls.
+ *
+ * @uid:    uid to set; skipped when NULL
+ * @gid:    gid to set; skipped when NULL
+ * @ngrps:  number of entries in @groups
+ * @groups: supplementary groups; skipped when NULL or when @ngrps is 0
+ *
+ * Returns 0 when every requested credential was applied. Otherwise returns
+ * the non-zero value of the first gfapi call that failed; the remaining
+ * steps are not attempted.
+ */
+int
+gf_setcredentials(uid_t *uid, gid_t *gid, uint16_t ngrps, uint32_t *groups)
+{
+    int rc = 0;
+
+    if (uid != NULL) {
+        rc = glfs_setfsuid(*uid);
+        if (rc) {
+            gf_msg("snapview-server", GF_LOG_ERROR, 0, SVS_MSG_SETFSUID_FAIL,
+                   "failed to set uid %u in thread context", *uid);
+            goto done;
+        }
+    }
+
+    if (gid != NULL) {
+        rc = glfs_setfsgid(*gid);
+        if (rc) {
+            gf_msg("snapview-server", GF_LOG_ERROR, 0, SVS_MSG_SETFSGID_FAIL,
+                   "failed to set gid %u in thread context", *gid);
+            goto done;
+        }
+    }
+
+    if (groups != NULL && ngrps != 0) {
+        rc = glfs_setfsgroups(ngrps, groups);
+        if (rc) {
+            gf_msg("snapview-server", GF_LOG_ERROR, 0, SVS_MSG_SETFSGRPS_FAIL,
+                   "failed to set groups in thread context");
+            goto done;
+        }
+    }
+
+done:
+    return rc;
+}
+
+/*
+ * Lookup on the entry point directory of the snapshot world.
+ *
+ * @this:       snapview-server xlator
+ * @loc:        location being looked up (loc->inode must be set)
+ * @parent:     parent inode, may be NULL
+ * @buf:        filled with the attributes of the entry point
+ * @postparent: filled with the attributes of the parent
+ * @op_errno:   set to ENOMEM when the inode context cannot be allocated
+ *
+ * Returns 0 on success and -1 on failure.
+ */
+int32_t
+svs_lookup_entry_point(xlator_t *this, loc_t *loc, inode_t *parent,
+                       struct iatt *buf, struct iatt *postparent,
+                       int32_t *op_errno)
+{
+    svs_inode_t *ctx = NULL;
+    int32_t status = -1;
+    uuid_t fresh_gfid;
+
+    GF_VALIDATE_OR_GOTO("snapview-server", this, out);
+    GF_VALIDATE_OR_GOTO(this->name, loc, out);
+    GF_VALIDATE_OR_GOTO(this->name, loc->inode, out);
+    GF_VALIDATE_OR_GOTO(this->name, buf, out);
+    GF_VALIDATE_OR_GOTO(this->name, postparent, out);
+
+    if (!gf_uuid_is_null(loc->inode->gfid)) {
+        /* The inode already carries a gfid: answer from the saved context
+         * when there is one, otherwise synthesise the attributes from the
+         * gfids at hand.
+         */
+        ctx = svs_inode_ctx_get(this, loc->inode);
+        if (ctx) {
+            memcpy(buf, &ctx->buf, sizeof(*buf));
+            svs_iatt_fill(ctx->pargfid, postparent);
+        } else {
+            svs_iatt_fill(loc->inode->gfid, buf);
+            svs_iatt_fill(parent ? parent->gfid : loc->inode->gfid,
+                          postparent);
+        }
+
+        status = 0;
+        goto out;
+    }
+
+    /* First lookup on this inode: hand out a freshly generated gfid. */
+    gf_uuid_generate(fresh_gfid);
+    svs_iatt_fill(fresh_gfid, buf);
+    svs_iatt_fill(parent ? parent->gfid : buf->ia_gfid, postparent);
+
+    /* The context of the entry point directory only records the inode type
+     * and the gfid of the directory from which the entry point was entered;
+     * the glfs object and the fs instance stay NULL.
+     */
+    ctx = svs_inode_ctx_get_or_new(this, loc->inode);
+    if (!ctx) {
+        status = -1;
+        *op_errno = ENOMEM;
+        gf_msg(this->name, GF_LOG_ERROR, *op_errno,
+               SVS_MSG_NEW_INODE_CTX_FAILED,
+               "failed to allocate inode context for entry point directory");
+        goto out;
+    }
+
+    gf_uuid_copy(ctx->pargfid, loc->pargfid);
+    memcpy(&ctx->buf, buf, sizeof(*buf));
+    ctx->type = SNAP_VIEW_ENTRY_POINT_INODE;
+
+    status = 0;
+
+out:
+    return status;
+}
+
+/* When a lookup comes from the client, protocol/server may try to resolve
+   the pargfid by sending just the gfid as part of the lookup, because the
+   inode for the parent gfid was not found. Since that gfid has not been
+   looked up yet, the inode will not have an inode context and the parent is
+   not there (as it is the parent of the entry that is being resolved). So
+   without parent and inode context, svs cannot know which snapshot
+   to look into. In such cases, the ambiguity is handled by looking
+   into the latest snapshot. If the directory is there in the latest
+   snapshot, lookup is successful, otherwise it is a failure. So for
+   any directory created after taking the latest snapshot, entry into
+   snapshot world is denied. i.e. you have to be part of snapshot world
+   to enter it. If the gfid is not found there, then unwind with
+   ESTALE.
+   This gets executed mainly in the situation where the snapshot entry
+   point is entered from a non-root directory and that non-root directory's
+   inode (or gfid) is not yet looked up. And in each case when a gfid has to
+   be looked up (without any inode context and parent context present), the
+   last snapshot is referred to and a random gfid is not generated.
+*/
+/*
+ * Nameless lookup of a gfid in the latest snapshot.
+ *
+ * @this:       snapview-server xlator
+ * @loc:        location to resolve; loc->inode->gfid is preferred over
+ *              loc->gfid when building the handle
+ * @buf:        filled from the stat returned by gfapi, with ia_gfid
+ *              overwritten by loc->gfid (or loc->inode->gfid if that is null)
+ * @postparent: filled from buf->ia_gfid
+ * @op_errno:   EINVAL when no latest snapshot is available, ESTALE when the
+ *              handle cannot be created, ENOMEM when the inode context
+ *              cannot be allocated. Left untouched when both gfids are null.
+ *
+ * Returns 0 on success and -1 on failure. On failure the gfapi handle
+ * created here is closed again, so nothing is leaked.
+ */
+int32_t
+svs_lookup_gfid(xlator_t *this, loc_t *loc, struct iatt *buf,
+                struct iatt *postparent, int32_t *op_errno)
+{
+    int32_t op_ret = -1;
+    unsigned char handle_obj[GFAPI_HANDLE_LENGTH] = {
+        0,
+    };
+    glfs_t *fs = NULL;
+    glfs_object_t *object = NULL;
+    struct stat statbuf = {
+        0,
+    };
+    svs_inode_t *inode_ctx = NULL;
+
+    GF_VALIDATE_OR_GOTO("snapview-server", this, out);
+    GF_VALIDATE_OR_GOTO(this->name, loc, out);
+    GF_VALIDATE_OR_GOTO(this->name, loc->inode, out);
+    GF_VALIDATE_OR_GOTO(this->name, buf, out);
+    GF_VALIDATE_OR_GOTO(this->name, postparent, out);
+
+    if (gf_uuid_is_null(loc->gfid) && gf_uuid_is_null(loc->inode->gfid)) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, SVS_MSG_NULL_GFID, "gfid is NULL");
+        goto out;
+    }
+
+    /* The gfid doubles as the gfapi handle; prefer the one on the inode. */
+    if (!gf_uuid_is_null(loc->inode->gfid))
+        memcpy(handle_obj, loc->inode->gfid, GFAPI_HANDLE_LENGTH);
+    else
+        memcpy(handle_obj, loc->gfid, GFAPI_HANDLE_LENGTH);
+
+    fs = svs_get_latest_snapshot(this);
+    if (!fs) {
+        op_ret = -1;
+        *op_errno = EINVAL;
+        gf_msg(this->name, GF_LOG_ERROR, *op_errno,
+               SVS_MSG_GET_LATEST_SNAP_FAILED,
+               "failed to get the latest "
+               "snapshot");
+        goto out;
+    }
+
+    object = glfs_h_create_from_handle(fs, handle_obj, GFAPI_HANDLE_LENGTH,
+                                       &statbuf);
+    if (!object) {
+        op_ret = -1;
+        *op_errno = ESTALE;
+        /* Log the gfid that was actually used for the handle; loc->gfid
+         * may be null when the handle came from loc->inode->gfid.
+         */
+        gf_msg(this->name, GF_LOG_ERROR, *op_errno,
+               SVS_MSG_GET_GLFS_H_OBJECT_FAILED,
+               "failed to do lookup and get "
+               "the handle on the snapshot %s (path: %s, gfid: %s)",
+               loc->name, loc->path, uuid_utoa(handle_obj));
+        goto out;
+    }
+
+    inode_ctx = svs_inode_ctx_get_or_new(this, loc->inode);
+    if (!inode_ctx) {
+        op_ret = -1;
+        *op_errno = ENOMEM;
+        gf_msg(this->name, GF_LOG_ERROR, *op_errno,
+               SVS_MSG_NEW_INODE_CTX_FAILED,
+               "failed to allocate inode "
+               "context");
+        goto out;
+    }
+
+    iatt_from_stat(buf, &statbuf);
+    if (!gf_uuid_is_null(loc->gfid))
+        gf_uuid_copy(buf->ia_gfid, loc->gfid);
+    else
+        gf_uuid_copy(buf->ia_gfid, loc->inode->gfid);
+
+    inode_ctx->type = SNAP_VIEW_VIRTUAL_INODE;
+    inode_ctx->fs = fs;
+    inode_ctx->object = object;
+    memcpy(&inode_ctx->buf, buf, sizeof(*buf));
+    svs_iatt_fill(buf->ia_gfid, postparent);
+
+    op_ret = 0;
+
+out:
+    /* The handle is only handed over to the inode context on success.
+     * On any failure after it was created it must be released here,
+     * otherwise it is leaked.
+     */
+    if (op_ret && object)
+        glfs_h_close(object);
+
+    return op_ret;
+}
+
+/* If the parent is an entry point inode, then create the handle for the
+   snapshot on which lookup came. i.e. in reality lookup came on
+   the directory from which the entry point directory was entered, but
+   lookup is into the past. So create the handle for it by doing
+   the name-less lookup on the gfid (which can be obtained from the
+   parent's context).
+*/
+/*
+ * Lookup of a snapshot directory directly below the entry point.
+ *
+ * @this:       snapview-server xlator
+ * @loc:        location being looked up; loc->name is used as the snapshot
+ *              name
+ * @buf:        filled from the stat returned by gfapi, with the gfid chosen
+ *              below
+ * @postparent: filled from parent->gfid
+ * @parent:     parent inode (the entry point)
+ * @parent_ctx: inode context of @parent; its pargfid is used as the handle
+ * @op_errno:   ENOENT when the fs instance cannot be obtained, errno when the
+ *              handle cannot be created, ENOMEM on allocation failures
+ *
+ * Returns 0 on success and -1 on failure.
+ */
+int32_t
+svs_lookup_snapshot(xlator_t *this, loc_t *loc, struct iatt *buf,
+                    struct iatt *postparent, inode_t *parent,
+                    svs_inode_t *parent_ctx, int32_t *op_errno)
+{
+    int32_t op_ret = -1;
+    unsigned char handle_obj[GFAPI_HANDLE_LENGTH] = {
+        0,
+    };
+    glfs_t *fs = NULL;
+    glfs_object_t *object = NULL;
+    struct stat statbuf = {
+        0,
+    };
+    svs_inode_t *inode_ctx = NULL;
+    uuid_t gfid;
+
+    GF_VALIDATE_OR_GOTO("snapview-server", this, out);
+    GF_VALIDATE_OR_GOTO(this->name, loc, out);
+    GF_VALIDATE_OR_GOTO(this->name, loc->inode, out);
+    GF_VALIDATE_OR_GOTO(this->name, buf, out);
+    GF_VALIDATE_OR_GOTO(this->name, postparent, out);
+    GF_VALIDATE_OR_GOTO(this->name, parent_ctx, out);
+    GF_VALIDATE_OR_GOTO(this->name, parent, out);
+
+    /* NOTE(review): whatever svs_initialise_snapshot_volume stores in
+     * *op_errno is overwritten with ENOENT on failure below.
+     */
+    fs = svs_initialise_snapshot_volume(this, loc->name, op_errno);
+    if (!fs) {
+        gf_msg_debug(this->name, 0,
+                     "failed to create "
+                     "the fs instance for snap %s",
+                     loc->name);
+        *op_errno = ENOENT;
+        op_ret = -1;
+        goto out;
+    }
+
+    /* The handle is built from the pargfid stored in the parent's context,
+     * i.e. the directory from which the entry point was entered.
+     */
+    memcpy(handle_obj, parent_ctx->pargfid, GFAPI_HANDLE_LENGTH);
+    object = glfs_h_create_from_handle(fs, handle_obj, GFAPI_HANDLE_LENGTH,
+                                       &statbuf);
+    if (!object) {
+        op_ret = -1;
+        *op_errno = errno;
+        /* Should this be in warning or error mode? */
+        gf_msg_debug(this->name, 0,
+                     "failed to do lookup and "
+                     "get the handle on the snapshot %s",
+                     loc->name);
+        goto out;
+    }
+
+    inode_ctx = svs_inode_ctx_get_or_new(this, loc->inode);
+    if (!inode_ctx) {
+        op_ret = -1;
+        *op_errno = ENOMEM;
+        gf_msg(this->name, GF_LOG_ERROR, *op_errno,
+               SVS_MSG_NEW_INODE_CTX_FAILED,
+               "failed to allocate "
+               "inode context");
+        goto out;
+    }
+
+    /* Pick the gfid reported to the caller: a newly generated one when
+     * neither the inode nor the loc has one, otherwise the inode's gfid in
+     * preference to loc->gfid.
+     */
+    if (gf_uuid_is_null(loc->gfid) && gf_uuid_is_null(loc->inode->gfid))
+        gf_uuid_generate(gfid);
+    else {
+        if (!gf_uuid_is_null(loc->inode->gfid))
+            gf_uuid_copy(gfid, loc->inode->gfid);
+        else
+            gf_uuid_copy(gfid, loc->gfid);
+    }
+    iatt_from_stat(buf, &statbuf);
+    gf_uuid_copy(buf->ia_gfid, gfid);
+    svs_fill_ino_from_gfid(buf);
+    inode_ctx->type = SNAP_VIEW_SNAPSHOT_INODE;
+    inode_ctx->fs = fs;
+    inode_ctx->object = object;
+    memcpy(&inode_ctx->buf, buf, sizeof(*buf));
+    svs_iatt_fill(parent->gfid, postparent);
+
+    /* Remember which snapshot this inode belongs to. */
+    SVS_STRDUP(inode_ctx->snapname, loc->name);
+    if (!inode_ctx->snapname) {
+        op_ret = -1;
+        *op_errno = ENOMEM;
+        goto out;
+    }
+    op_ret = 0;
+
+out:
+    /* On failure release the handle and make sure the inode context does
+     * not keep a pointer to the closed object.
+     */
+    if (op_ret) {
+        if (object)
+            glfs_h_close(object);
+
+        if (inode_ctx)
+            inode_ctx->object = NULL;
+    }
+
+    return op_ret;
+}
+
+/* Both parent and entry are from snapshot world */
+int32_t
+svs_lookup_entry(xlator_t *this, loc_t *loc, struct iatt *buf,
+                 struct iatt *postparent, inode_t *parent,
+                 svs_inode_t *parent_ctx, int32_t *op_errno)
+{
+    /*
+     * Named lookup of loc->name below a directory that already lives in the
+     * snapshot world.
+     *
+     * The fs instance and the parent handle are taken from @parent_ctx.
+     * On success @buf is filled from the gfapi stat (with the gfid chosen
+     * below), @postparent is filled from parent->gfid, and the handle is
+     * stored in the inode context of loc->inode.
+     *
+     * Returns 0 on success, -1 on failure with *op_errno set.
+     */
+    int32_t op_ret = -1;
+    glfs_t *fs = NULL;
+    glfs_object_t *object = NULL;
+    struct stat statbuf = {
+        0,
+    };
+    svs_inode_t *inode_ctx = NULL;
+    glfs_object_t *parent_object = NULL;
+    uuid_t gfid = {
+        0,
+    };
+
+    GF_VALIDATE_OR_GOTO("snapview-server", this, out);
+    GF_VALIDATE_OR_GOTO(this->name, loc, out);
+    GF_VALIDATE_OR_GOTO(this->name, loc->inode, out);
+    GF_VALIDATE_OR_GOTO(this->name, buf, out);
+    GF_VALIDATE_OR_GOTO(this->name, postparent, out);
+    GF_VALIDATE_OR_GOTO(this->name, parent_ctx, out);
+    GF_VALIDATE_OR_GOTO(this->name, parent, out);
+
+    parent_object = parent_ctx->object;
+    fs = parent_ctx->fs;
+
+    object = glfs_h_lookupat(fs, parent_object, loc->name, &statbuf, 0);
+    if (!object) {
+        /* Capture errno before logging: the logging call may change it. */
+        op_ret = -1;
+        *op_errno = errno;
+        /* should this be in WARNING or ERROR mode? */
+        gf_msg_debug(this->name, 0,
+                     "failed to do lookup and "
+                     "get the handle for entry %s (path: %s)",
+                     loc->name, loc->path);
+        goto out;
+    }
+
+    if (gf_uuid_is_null(object->gfid)) {
+        /* NOTE(review): glfs_h_lookupat succeeded here, so errno is not
+         * known to describe this failure (it may even be 0). A fixed
+         * error code would probably be more appropriate - confirm.
+         */
+        op_ret = -1;
+        *op_errno = errno;
+        /* should this be in WARNING or ERROR mode? */
+        gf_msg_debug(this->name, 0,
+                     "gfid from glfs handle is "
+                     "NULL for entry %s (path: %s)",
+                     loc->name, loc->path);
+        goto out;
+    }
+
+    inode_ctx = svs_inode_ctx_get_or_new(this, loc->inode);
+    if (!inode_ctx) {
+        op_ret = -1;
+        *op_errno = ENOMEM;
+        gf_msg(this->name, GF_LOG_ERROR, *op_errno,
+               SVS_MSG_NEW_INODE_CTX_FAILED,
+               "failed to allocate "
+               "inode context");
+        goto out;
+    }
+
+    /* Choose the gfid reported to the caller: derive one from the snapshot
+     * name and the real gfid when nothing is known yet, otherwise reuse the
+     * inode's gfid in preference to loc->gfid.
+     */
+    if (gf_uuid_is_null(loc->gfid) && gf_uuid_is_null(loc->inode->gfid)) {
+        if (svs_uuid_generate(this, gfid, parent_ctx->snapname, object->gfid)) {
+            /*
+             * should op_errno be something else such as
+             * EINVAL or ESTALE?
+             */
+            op_ret = -1;
+            *op_errno = EIO;
+            goto out;
+        }
+    } else {
+        if (!gf_uuid_is_null(loc->inode->gfid))
+            gf_uuid_copy(gfid, loc->inode->gfid);
+        else
+            gf_uuid_copy(gfid, loc->gfid);
+    }
+
+    iatt_from_stat(buf, &statbuf);
+    gf_uuid_copy(buf->ia_gfid, gfid);
+    svs_fill_ino_from_gfid(buf);
+    inode_ctx->type = SNAP_VIEW_VIRTUAL_INODE;
+    inode_ctx->fs = fs;
+    inode_ctx->object = object;
+    memcpy(&inode_ctx->buf, buf, sizeof(*buf));
+    svs_iatt_fill(parent->gfid, postparent);
+
+    /* Directories inherit the snapshot name of their parent. */
+    if (IA_ISDIR(buf->ia_type)) {
+        SVS_STRDUP(inode_ctx->snapname, parent_ctx->snapname);
+        if (!inode_ctx->snapname) {
+            op_ret = -1;
+            *op_errno = ENOMEM;
+            goto out;
+        }
+    }
+
+    op_ret = 0;
+
+out:
+    /* On failure release the handle and make sure the inode context does
+     * not keep a pointer to the closed object.
+     */
+    if (op_ret) {
+        if (object)
+            glfs_h_close(object);
+
+        if (inode_ctx)
+            inode_ctx->object = NULL;
+    }
+
+    return op_ret;
+}
+
+/* inode context is there means lookup has come on an object which was
+   built either as part of lookup or as part of readdirp. But in readdirp
+   we would not have got the handle to access the object in the gfapi
+   world.
+   So if inode context contains glfs_t instance for the right
+   gfapi world and glfs_object_t handle for accessing it in the gfapi
+   world, then unwind with success as the snapshots as of now are
+   read-only.
+   If the above condition is not met, then send lookup call again to
+   the gfapi world. It can happen only if both parent context and
+   the name of the entry are present.
+
+   If parent is an entry point to snapshot world:
+   * parent is needed for getting the gfid on which lookup has to be done
+     (the gfid present in the inode is a virtual gfid) in the snapshot
+     world.
+   * name is required to get the right glfs_t instance on which lookup
+     has to be done
+
+   If parent is a directory from snapshot world:
+   * parent context is needed to get the glfs_t instance and to get the
+     handle to parent directory in the snapshot world.
+   * name is needed to do the lookup on the right entry in the snapshot
+     world
+*/
+/*
+ * @this:       snapview-server xlator
+ * @loc:        location being revalidated
+ * @parent:     parent inode, may be NULL
+ * @inode_ctx:  existing inode context of loc->inode
+ * @parent_ctx: inode context of @parent, may be NULL
+ * @buf:        filled with the attributes of the inode
+ * @postparent: filled with the attributes of the parent
+ * @op_errno:   set on failure
+ *
+ * Returns 0 on success and -1 on failure.
+ * NOTE(review): @loc is dereferenced without being validated here; the
+ * caller is presumably expected to have checked it.
+ */
+int32_t
+svs_revalidate(xlator_t *this, loc_t *loc, inode_t *parent,
+               svs_inode_t *inode_ctx, svs_inode_t *parent_ctx,
+               struct iatt *buf, struct iatt *postparent, int32_t *op_errno)
+{
+    int32_t op_ret = -1;
+    int ret = -1;
+    char tmp_uuid[64] = {
+        0,
+    };
+    glfs_t *fs = NULL;
+
+    GF_VALIDATE_OR_GOTO("snapview-server", this, out);
+    GF_VALIDATE_OR_GOTO(this->name, buf, out);
+    GF_VALIDATE_OR_GOTO(this->name, postparent, out);
+    GF_VALIDATE_OR_GOTO(this->name, inode_ctx, out);
+
+    if (inode_ctx->type == SNAP_VIEW_ENTRY_POINT_INODE) {
+        /* The entry point is virtual: its attributes are synthesised from
+         * the gfids, no gfapi call is made.
+         */
+        svs_iatt_fill(loc->inode->gfid, buf);
+        if (parent)
+            svs_iatt_fill(parent->gfid, postparent);
+        else
+            svs_iatt_fill(loc->inode->gfid, postparent);
+        op_ret = 0;
+        goto out;
+    } else {
+        /* Though fs and object are present in the inode context, its
+         * better to check if fs is valid or not before doing anything.
+         * Its for the protection from the following operations.
+         * 1) Create a file on the glusterfs mount point
+         * 2) Create a snapshot (say "snap1")
+         * 3) Access the contents of the snapshot
+         * 4) Delete the file from the mount point
+         * 5) Delete the snapshot "snap1"
+         * 6) Create a new snapshot "snap1"
+         *
+         * Now accessing the new snapshot "snap1" gives problems.
+         * Because the inode and dentry created for snap1 would not be
+         * deleted upon the deletion of the snapshot (as deletion of
+         * snapshot is a gluster cli operation, not a fop). So next time
+         * upon creation of a new snap with same name, the previous
+         * inode and dentry itself will be used. But the inode context
+         * contains old information about the glfs_t instance and the
+         * handle in the gfapi world. Thus the glfs_t instance should
+         * be checked before accessing. If its wrong, then right
+         * instance should be obtained by doing the lookup.
+         */
+        if (inode_ctx->fs && inode_ctx->object) {
+            fs = inode_ctx->fs;
+            /* presumably this macro resets fs to NULL when the instance is
+             * no longer valid; the check below relies on that - confirm
+             * against the macro definition.
+             */
+            SVS_CHECK_VALID_SNAPSHOT_HANDLE(fs, this);
+            if (fs) {
+                /* Cached handle is still usable: answer from the
+                 * attributes saved in the inode context.
+                 */
+                memcpy(buf, &inode_ctx->buf, sizeof(*buf));
+                if (parent)
+                    svs_iatt_fill(parent->gfid, postparent);
+                else
+                    svs_iatt_fill(buf->ia_gfid, postparent);
+                op_ret = 0;
+                goto out;
+            } else {
+                /* Stale instance: drop the cached pointers and try to
+                 * obtain a new handle. On success control falls through
+                 * to the lookup below.
+                 */
+                inode_ctx->fs = NULL;
+                inode_ctx->object = NULL;
+                ret = svs_get_handle(this, loc, inode_ctx, op_errno);
+                if (ret) {
+                    gf_msg(this->name, GF_LOG_ERROR, *op_errno,
+                           SVS_MSG_GET_GLFS_H_OBJECT_FAILED,
+                           "failed to get the handle for "
+                           "%s (gfid %s)",
+                           loc->path, uuid_utoa_r(loc->inode->gfid, tmp_uuid));
+                    op_ret = -1;
+                    goto out;
+                }
+            }
+        }
+
+        /* To send the lookup to gfapi world, both the name of the
+           entry as well as the parent context is needed.
+        */
+        if (!loc->name || !parent_ctx) {
+            *op_errno = ESTALE;
+            /* The message names whichever of the two is missing
+             * (loc->name takes precedence when both are).
+             */
+            gf_msg(this->name, GF_LOG_ERROR, *op_errno,
+                   SVS_MSG_PARENT_CTX_OR_NAME_NULL, "%s is NULL",
+                   loc->name ? "parent context" : "loc->name");
+            goto out;
+        }
+
+        /* Same dispatch as a fresh lookup: a parent that is the entry point
+         * means the entry is a snapshot, anything else is a regular entry
+         * inside a snapshot.
+         */
+        if (parent_ctx->type == SNAP_VIEW_ENTRY_POINT_INODE)
+            op_ret = svs_lookup_snapshot(this, loc, buf, postparent, parent,
+                                         parent_ctx, op_errno);
+        else
+            op_ret = svs_lookup_entry(this, loc, buf, postparent, parent,
+                                      parent_ctx, op_errno);
+
+        goto out;
+    }
+
+out:
+    return op_ret;
+}
+
+/*
+ * Lookup fop of snapview-server.
+ *
+ * Sets the credentials taken from frame->root, resolves the parent inode
+ * and then dispatches to one of svs_lookup_entry_point, svs_revalidate,
+ * svs_lookup_snapshot or svs_lookup_entry, or answers inline when neither
+ * the inode nor its parent has a context.
+ *
+ * The result is always delivered through STACK_UNWIND_STRICT; the function
+ * itself always returns 0.
+ */
+int32_t
+svs_lookup(call_frame_t *frame, xlator_t *this, loc_t *loc, dict_t *xdata)
+{
+    struct iatt buf = {
+        0,
+    };
+    int32_t op_ret = -1;
+    int32_t op_errno = EINVAL;
+    struct iatt postparent = {
+        0,
+    };
+    svs_inode_t *inode_ctx = NULL;
+    svs_inode_t *parent_ctx = NULL;
+    int32_t ret = -1;
+    inode_t *parent = NULL;
+    gf_boolean_t entry_point_key = _gf_false;
+    gf_boolean_t entry_point = _gf_false;
+    call_stack_t *root = NULL;
+
+    GF_VALIDATE_OR_GOTO("svs", this, out);
+    GF_VALIDATE_OR_GOTO(this->name, this->private, out);
+    GF_VALIDATE_OR_GOTO(this->name, frame, out);
+    GF_VALIDATE_OR_GOTO(this->name, loc, out);
+    GF_VALIDATE_OR_GOTO(this->name, loc->inode, out);
+
+    root = frame->root;
+    /* On failure op_errno keeps its initial value (EINVAL). */
+    op_ret = gf_setcredentials(&root->uid, &root->gid, root->ngrps,
+                               root->groups);
+    if (op_ret != 0) {
+        goto out;
+    }
+
+    /* For lookups sent on inodes (i.e not parent inode + basename, but
+       direct inode itself which usually is a nameless lookup or revalidate
+       on the inode), loc->name will not be there. Get it from path if
+       it is there.
+       This is the difference between nameless lookup and revalidate lookup
+       on an inode:
+       nameless lookup: loc->path contains gfid and strrchr on it fails
+       revalidate lookup: loc->path contains the entry name of the inode
+                          and strrchr gives the name of the entry from path
+    */
+    if (loc->path) {
+        if (!loc->name || (loc->name && !strcmp(loc->name, ""))) {
+            loc->name = strrchr(loc->path, '/');
+            if (loc->name)
+                loc->name++;
+        }
+    }
+
+    /* Every branch below yields either NULL or a reference that is dropped
+     * with inode_unref() at the end of the function.
+     */
+    if (loc->parent)
+        parent = inode_ref(loc->parent);
+    else {
+        parent = inode_find(loc->inode->table, loc->pargfid);
+        if (!parent)
+            parent = inode_parent(loc->inode, NULL, NULL);
+    }
+    if (parent)
+        parent_ctx = svs_inode_ctx_get(this, parent);
+
+    inode_ctx = svs_inode_ctx_get(this, loc->inode);
+
+    /* The "entry-point" key is only consulted for inodes that have no
+     * context yet.
+     */
+    if (xdata && !inode_ctx) {
+        ret = dict_get_str_boolean(xdata, "entry-point", _gf_false);
+        if (ret == -1) {
+            gf_msg_debug(this->name, 0,
+                         "failed to get the "
+                         "entry point info");
+            entry_point_key = _gf_false;
+        } else {
+            entry_point_key = ret;
+        }
+
+        if (loc->name && strlen(loc->name)) {
+            /* lookup can come with the entry-point set in the dict
+             * for the parent directory of the entry-point as well.
+             * So consider entry_point only for named lookup
+             */
+            entry_point = entry_point_key;
+        }
+    }
+
+    if (inode_ctx && inode_ctx->type == SNAP_VIEW_ENTRY_POINT_INODE) {
+        /* entry-point may not be set in the dictionary.
+         * This can happen if snap-view client is restarted where
+         * inode-ctx not available and a nameless lookup has come
+         */
+        entry_point = _gf_true;
+    }
+
+    /* lookup is on the entry point to the snapshot world */
+    if (entry_point) {
+        op_ret = svs_lookup_entry_point(this, loc, parent, &buf, &postparent,
+                                        &op_errno);
+        goto out;
+    }
+
+    /* revalidate */
+    if (inode_ctx) {
+        op_ret = svs_revalidate(this, loc, parent, inode_ctx, parent_ctx, &buf,
+                                &postparent, &op_errno);
+        goto out;
+    }
+
+    /* This can happen when entry point directory is entered from non-root
+       directory. (ex: if /mnt/glusterfs is the mount point, then entry
+       point (say .snaps) is entered from /mnt/glusterfs/dir/.snaps). Also
+       it can happen when client sends a nameless lookup on just a gfid and
+       the server does not have the inode in the inode table.
+    */
+    if (!inode_ctx && !parent_ctx) {
+        if (gf_uuid_is_null(loc->gfid) && gf_uuid_is_null(loc->inode->gfid)) {
+            op_ret = -1;
+            op_errno = ESTALE;
+            gf_msg_debug(this->name, 0,
+                         "gfid is NULL. Either the lookup "
+                         "came on missing entry or the "
+                         "entry is stale");
+            goto out;
+        }
+
+        if (!entry_point_key) {
+            /* This can happen when there is no inode_ctx available.
+             * snapview-server might have restarted or
+             * graph change might have happened
+             */
+            op_ret = -1;
+            op_errno = ESTALE;
+            goto out;
+        }
+
+        /* lookup is on the parent directory of entry-point.
+         * this would have already looked up by snap-view client
+         * so return success
+         */
+        if (!gf_uuid_is_null(loc->gfid))
+            gf_uuid_copy(buf.ia_gfid, loc->gfid);
+        else
+            gf_uuid_copy(buf.ia_gfid, loc->inode->gfid);
+
+        svs_iatt_fill(buf.ia_gfid, &buf);
+        svs_iatt_fill(buf.ia_gfid, &postparent);
+
+        op_ret = 0;
+        goto out;
+    }
+
+    /* No inode context but the parent is known: a parent that is the entry
+     * point means the entry is a snapshot, anything else is a regular entry
+     * inside a snapshot.
+     */
+    if (parent_ctx) {
+        if (parent_ctx->type == SNAP_VIEW_ENTRY_POINT_INODE)
+            op_ret = svs_lookup_snapshot(this, loc, &buf, &postparent, parent,
+                                         parent_ctx, &op_errno);
+        else
+            op_ret = svs_lookup_entry(this, loc, &buf, &postparent, parent,
+                                      parent_ctx, &op_errno);
+        goto out;
+    }
+
+out:
+    /* loc may be NULL here when the validation above failed. */
+    STACK_UNWIND_STRICT(lookup, frame, op_ret, op_errno,
+                        loc ? loc->inode : NULL, &buf, xdata, &postparent);
+
+    if (parent)
+        inode_unref(parent);
+
+    return 0;
+}
+
+/*
+ * Opendir fop of snapview-server.
+ *
+ * For the entry point directory success is faked without touching gfapi.
+ * For every other directory the gfapi handle is opened and stored in the
+ * fd context of @fd.
+ *
+ * The result is delivered through STACK_UNWIND_STRICT; the function itself
+ * always returns 0.
+ */
+int32_t
+svs_opendir(call_frame_t *frame, xlator_t *this, loc_t *loc, fd_t *fd,
+            dict_t *xdata)
+{
+    call_stack_t *stack = NULL;
+    svs_inode_t *inode_ctx = NULL;
+    svs_fd_t *fd_ctx = NULL;
+    glfs_fd_t *dir_handle = NULL;
+    glfs_object_t *object = NULL;
+    glfs_t *fs = NULL;
+    int32_t op_errno = EINVAL;
+    int32_t op_ret = -1;
+
+    GF_VALIDATE_OR_GOTO("snap-view-daemon", this, out);
+    GF_VALIDATE_OR_GOTO(this->name, frame, out);
+    GF_VALIDATE_OR_GOTO(this->name, fd, out);
+    GF_VALIDATE_OR_GOTO(this->name, loc, out);
+    GF_VALIDATE_OR_GOTO(this->name, loc->inode, out);
+
+    stack = frame->root;
+    op_ret = gf_setcredentials(&stack->uid, &stack->gid, stack->ngrps,
+                               stack->groups);
+    if (op_ret != 0)
+        goto out;
+
+    inode_ctx = svs_inode_ctx_get(this, loc->inode);
+    if (inode_ctx == NULL) {
+        op_ret = -1;
+        op_errno = ESTALE;
+        gf_msg(this->name, GF_LOG_ERROR, op_errno,
+               SVS_MSG_GET_INODE_CONTEXT_FAILED,
+               "inode context not found for the inode %s",
+               uuid_utoa(loc->inode->gfid));
+        goto out;
+    }
+
+    /* The entry point is a virtual directory: report success without
+     * opening anything in the gfapi world.
+     */
+    if (inode_ctx->type == SNAP_VIEW_ENTRY_POINT_INODE) {
+        op_ret = 0;
+        op_errno = 0;
+        goto out;
+    }
+
+    SVS_GET_INODE_CTX_INFO(inode_ctx, fs, object, this, loc, op_ret, op_errno,
+                           out);
+
+    dir_handle = glfs_h_opendir(fs, object);
+    if (dir_handle == NULL) {
+        op_ret = -1;
+        op_errno = errno;
+        gf_msg(this->name, GF_LOG_ERROR, op_errno, SVS_MSG_OPENDIR_FAILED,
+               "opendir on %s failed (gfid: %s)", loc->name,
+               uuid_utoa(loc->inode->gfid));
+        goto out;
+    }
+
+    fd_ctx = svs_fd_ctx_get_or_new(this, fd);
+    if (fd_ctx == NULL) {
+        op_ret = -1;
+        op_errno = ENOMEM;
+        gf_msg(this->name, GF_LOG_ERROR, op_errno, SVS_MSG_NEW_FD_CTX_FAILED,
+               "failed to allocate fd context for %s (gfid: %s)", loc->name,
+               uuid_utoa(fd->inode->gfid));
+        /* Nothing will own the directory handle, so close it again. */
+        glfs_closedir(dir_handle);
+        goto out;
+    }
+
+    fd_ctx->fd = dir_handle;
+    op_ret = 0;
+    op_errno = 0;
+
+out:
+    STACK_UNWIND_STRICT(opendir, frame, op_ret, op_errno, fd, NULL);
+
+    return 0;
+}
+
+/*
+ * This function adds the xattr keys present in the list (@list) to the dict.
+ * But the list contains only the names of the xattrs (and no value, as
+ * the gfapi functions for the listxattr operations would return only the
+ * names of the xattrs in the buffer provided by the caller, though they had
+ * got the values of those xattrs from posix) as described in the man page of
+ * listxattr. But before unwinding snapview-server has to put those names
+ * back into the dict. But to get the values for those xattrs it has to do the
+ * getxattr operation on each xattr which might turn out to be a costly
+ * operation. So for each of the xattrs present in the list, a 0 byte value
+ * ("") is set into the dict before unwinding. Since ("") is also a valid xattr
+ * value(in a file system) we use an extra key in the same dictionary as an
+ * indicator to other xlators which want to cache the xattrs (as of now,
+ * md-cache which caches acl and selinux related xattrs) to not to cache the
+ * values of the xattrs present in the dict.
+ */
+/*
+ * @this: snapview-server xlator
+ * @dict: dictionary that receives one key per xattr name, each with the
+ *        value ""
+ * @list: buffer of NUL-terminated xattr names laid out back to back
+ * @size: number of valid bytes in @list
+ *
+ * Returns 0 on success and a negative value on failure (invalid arguments,
+ * negative @size, or a failing dict operation).
+ *
+ * The walk never reads beyond @size bytes of @list, and the offset is
+ * always advanced by the length of the entry as it appears in @list,
+ * independent of any truncation or prefixing applied to the key.
+ */
+int32_t
+svs_add_xattrs_to_dict(xlator_t *this, dict_t *dict, char *list, ssize_t size)
+{
+    char keybuffer[4096] = {
+        0,
+    };
+    const char *entry = NULL;
+    const char *terminator = NULL;
+    size_t remaining_size = 0;
+    size_t list_offset = 0;
+    size_t entry_len = 0;
+    size_t copy_len = 0;
+    size_t consumed = 0;
+    int32_t ret = -1;
+#ifdef GF_DARWIN_HOST_OS
+    char *newkey = NULL;
+#endif
+
+    GF_VALIDATE_OR_GOTO("snapview-daemon", this, out);
+    GF_VALIDATE_OR_GOTO(this->name, dict, out);
+    GF_VALIDATE_OR_GOTO(this->name, list, out);
+
+    /* A negative size would wrap to a huge unsigned count and make the
+     * loop run off the end of the buffer.
+     */
+    if (size < 0)
+        goto out;
+
+    remaining_size = (size_t)size;
+    while (remaining_size > 0) {
+        entry = list + list_offset;
+
+        /* Length of this name, bounded by what is left in the list so
+         * that a missing terminator cannot cause an overread.
+         */
+        terminator = memchr(entry, '\0', remaining_size);
+        entry_len = terminator ? (size_t)(terminator - entry)
+                               : remaining_size;
+
+        copy_len = entry_len;
+        if (copy_len > sizeof(keybuffer) - 1)
+            copy_len = sizeof(keybuffer) - 1;
+        memcpy(keybuffer, entry, copy_len);
+        keybuffer[copy_len] = '\0';
+#ifdef GF_DARWIN_HOST_OS
+        /* The protocol expect namespace for now */
+        newkey = NULL;
+        gf_add_prefix(XATTR_USER_PREFIX, keybuffer, &newkey);
+        if (!newkey) {
+            ret = -1;
+            goto out;
+        }
+        snprintf(keybuffer, sizeof(keybuffer), "%s", newkey);
+        GF_FREE(newkey);
+#endif
+        ret = dict_set_str(dict, keybuffer, "");
+        if (ret < 0) {
+            gf_msg(this->name, GF_LOG_ERROR, 0, SVS_MSG_DICT_SET_FAILED,
+                   "dict set operation "
+                   "for the key %s failed.",
+                   keybuffer);
+            goto out;
+        }
+
+        /* Step over the name and its terminator, using the length found
+         * in the list rather than the (possibly truncated or prefixed)
+         * key, and stop instead of underflowing on the last entry.
+         */
+        consumed = entry_len + 1;
+        if (consumed >= remaining_size)
+            break;
+        remaining_size -= consumed;
+        list_offset += consumed;
+    } /* while (remaining_size > 0) */
+
+    /* Add an additional key to indicate that we don't need to cache these
+     * xattrs(with value "") */
+    ret = dict_set_str(dict, "glusterfs.skip-cache", "");
+    if (ret < 0) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, SVS_MSG_DICT_SET_FAILED,
+               "dict set operation for the key glusterfs.skip-cache failed.");
+        goto out;
+    }
+
+    ret = 0;
+
+out:
+    return ret;
+}
+
+/*
+ * getxattr fop of the snapview server.
+ *
+ * With a non-NULL name the value of that extended attribute is fetched for
+ * loc->inode and returned in a dict under the key name. With a NULL name the
+ * buffer returned by gfapi is handed to svs_add_xattrs_to_dict(), which turns
+ * it into one dict key per attribute name.
+ *
+ * The reply is always delivered through STACK_UNWIND_STRICT and the function
+ * itself returns 0. On failure op_ret is non-zero (normally -1) and op_errno
+ * carries the reason: ESTALE when the inode has no context, ENODATA for the
+ * entry-point directory, ENOMEM on allocation failure, otherwise the errno
+ * left behind by the failing call.
+ */
+int32_t
+svs_getxattr(call_frame_t *frame, xlator_t *this, loc_t *loc, const char *name,
+             dict_t *xdata)
+{
+    svs_inode_t *inode_ctx = NULL;
+    int32_t op_ret = -1;
+    int32_t op_errno = EINVAL;
+    glfs_t *fs = NULL;
+    glfs_object_t *object = NULL;
+    char *value = NULL;
+    ssize_t size = 0;
+    dict_t *dict = NULL;
+    call_stack_t *root = NULL;
+
+    GF_VALIDATE_OR_GOTO("snap-view-daemon", this, out);
+    GF_VALIDATE_OR_GOTO("snap-view-daemon", frame, out);
+    GF_VALIDATE_OR_GOTO("snap-view-daemon", loc, out);
+    GF_VALIDATE_OR_GOTO("snap-view-daemon", loc->inode, out);
+
+    root = frame->root;
+    op_ret = gf_setcredentials(&root->uid, &root->gid, root->ngrps,
+                               root->groups);
+    if (op_ret != 0) {
+        goto out;
+    }
+
+    inode_ctx = svs_inode_ctx_get(this, loc->inode);
+    if (!inode_ctx) {
+        op_ret = -1;
+        op_errno = ESTALE;
+        gf_msg(this->name, GF_LOG_ERROR, op_errno,
+               SVS_MSG_GET_INODE_CONTEXT_FAILED,
+               "inode context not found "
+               "for the inode %s",
+               uuid_utoa(loc->inode->gfid));
+        goto out;
+    }
+
+    /* ENODATA is sent if the getxattr is on entry point directory
+       or the inode is SNAP_VIEW_ENTRY_POINT_INODE. Entry point is
+       a virtual directory on which setxattr operations are not
+       allowed. If getxattr has to be faked as success, then a value
+       for the name of the xattr has to be sent which we don't have.
+    */
+    if (inode_ctx->type == SNAP_VIEW_ENTRY_POINT_INODE) {
+        op_ret = -1;
+        op_errno = ENODATA;
+        goto out;
+    } else {
+        /* NOTE(review): the macro presumably loads fs/object from the inode
+         * context and jumps to out with op_ret/op_errno set when they are
+         * unusable - confirm against its definition. */
+        SVS_GET_INODE_CTX_INFO(inode_ctx, fs, object, this, loc, op_ret,
+                               op_errno, out);
+
+        dict = dict_new();
+        if (!dict) {
+            op_ret = -1;
+            op_errno = ENOMEM;
+            gf_msg(this->name, GF_LOG_ERROR, op_errno, SVS_MSG_NO_MEMORY,
+                   "failed to allocate dict");
+            goto out;
+        }
+
+        /* First call with no buffer: only the required size is wanted. */
+        size = glfs_h_getxattrs(fs, object, name, NULL, 0);
+        if (size == -1) {
+            op_ret = -1;
+            op_errno = errno;
+            if (errno == ENODATA) {
+                /* A missing xattr is an expected outcome, keep it quiet. */
+                gf_msg_debug(this->name, 0,
+                             "getxattr on "
+                             "%s failed (key: %s) with %s",
+                             loc->path, name, strerror(errno));
+            } else {
+                gf_msg(this->name, GF_LOG_ERROR, op_errno,
+                       SVS_MSG_GETXATTR_FAILED,
+                       "getxattr on %s failed (key: %s) with %s", loc->path,
+                       name, strerror(errno));
+            }
+            goto out;
+        }
+
+        /* One extra byte so the buffer can always be NUL terminated. */
+        value = GF_CALLOC(size + 1, sizeof(char), gf_common_mt_char);
+        if (!value) {
+            op_ret = -1;
+            op_errno = ENOMEM;
+            gf_msg(this->name, GF_LOG_ERROR, op_errno, SVS_MSG_NO_MEMORY,
+                   "failed to allocate memory for getxattr "
+                   "on %s (key: %s)",
+                   loc->name, name);
+            goto out;
+        }
+
+        size = glfs_h_getxattrs(fs, object, name, value, size);
+        if (size == -1) {
+            op_ret = -1;
+            op_errno = errno;
+            gf_msg(this->name, GF_LOG_ERROR, op_errno, SVS_MSG_GETXATTR_FAILED,
+                   "failed to get the xattr %s for "
+                   "entry %s",
+                   name, loc->name);
+            goto out;
+        }
+        value[size] = '\0';
+
+        if (name) {
+            /* On success the dict owns value; it must not be freed here. */
+            op_ret = dict_set_dynptr(dict, (char *)name, value, size);
+            if (op_ret < 0) {
+                op_errno = -op_ret;
+                /* Unwind the conventional -1 rather than a negative errno;
+                 * value is released at out because op_ret is non-zero. */
+                op_ret = -1;
+                gf_msg(this->name, GF_LOG_ERROR, op_errno,
+                       SVS_MSG_DICT_SET_FAILED,
+                       "dict set operation for %s for "
+                       "the key %s failed.",
+                       loc->path, name);
+                goto out;
+            }
+        } else {
+            op_ret = svs_add_xattrs_to_dict(this, dict, value, size);
+            /* Any negative return is a failure, not only -1. */
+            if (op_ret < 0) {
+                op_ret = -1;
+                op_errno = ENOMEM;
+                gf_msg(this->name, GF_LOG_ERROR, op_errno, SVS_MSG_NO_MEMORY,
+                       "failed to add xattrs from the list to "
+                       "dict for %s (gfid: %s)",
+                       loc->path, uuid_utoa(loc->inode->gfid));
+                goto out;
+            }
+            /* The names were copied into the dict; the raw list is done. */
+            GF_FREE(value);
+            value = NULL;
+        }
+
+        op_ret = 0;
+        op_errno = 0;
+    }
+
+out:
+    if (op_ret && value)
+        GF_FREE(value);
+
+    STACK_UNWIND_STRICT(getxattr, frame, op_ret, op_errno, dict, NULL);
+
+    if (dict)
+        dict_unref(dict);
+
+    return 0;
+}
+
+/*
+ * fgetxattr fop of the snapview server.
+ *
+ * With a non-NULL name the value of that extended attribute is read through
+ * the gfapi fd stored in the fd context and returned in a dict under the key
+ * name. With a NULL name the attribute list is read with glfs_flistxattr()
+ * and converted into dict keys by svs_add_xattrs_to_dict().
+ *
+ * The reply is always delivered through STACK_UNWIND_STRICT and the function
+ * itself returns 0. On failure op_ret is -1 and op_errno is ESTALE (no inode
+ * context), EBADF (snapshot instance gone), EBADFD (no fd context), EINVAL
+ * (entry-point directory), ENOMEM, or the errno of the failing gfapi call.
+ */
+int32_t
+svs_fgetxattr(call_frame_t *frame, xlator_t *this, fd_t *fd, const char *name,
+              dict_t *xdata)
+{
+    svs_inode_t *inode_ctx = NULL;
+    int32_t op_ret = -1;
+    int32_t op_errno = EINVAL;
+    char *value = NULL;
+    ssize_t size = 0;
+    dict_t *dict = NULL;
+    svs_fd_t *sfd = NULL;
+    glfs_fd_t *glfd = NULL;
+
+    GF_VALIDATE_OR_GOTO("snap-view-daemon", this, out);
+    GF_VALIDATE_OR_GOTO("snap-view-daemon", frame, out);
+    GF_VALIDATE_OR_GOTO("snap-view-daemon", fd, out);
+    GF_VALIDATE_OR_GOTO("snap-view-daemon", fd->inode, out);
+
+    inode_ctx = svs_inode_ctx_get(this, fd->inode);
+    if (!inode_ctx) {
+        op_ret = -1;
+        op_errno = ESTALE;
+        gf_msg(this->name, GF_LOG_ERROR, op_errno,
+               SVS_MSG_GET_INODE_CONTEXT_FAILED,
+               "inode context not found "
+               "for the inode %s",
+               uuid_utoa(fd->inode->gfid));
+        goto out;
+    }
+
+    if (!(svs_inode_ctx_glfs_mapping(this, inode_ctx))) {
+        op_ret = -1;
+        op_errno = EBADF;
+        gf_msg(this->name, GF_LOG_ERROR, op_errno, SVS_MSG_FS_INSTANCE_INVALID,
+               "glfs instance %p to which the inode %s "
+               "belongs to does not exist. The snapshot "
+               "corresponding to the instance might have "
+               "been deleted or deactivated",
+               inode_ctx->fs, uuid_utoa(fd->inode->gfid));
+        goto out;
+    }
+
+    sfd = svs_fd_ctx_get_or_new(this, fd);
+    if (!sfd) {
+        op_ret = -1;
+        op_errno = EBADFD;
+        gf_msg(this->name, GF_LOG_ERROR, op_errno,
+               SVS_MSG_GET_FD_CONTEXT_FAILED,
+               "failed to get the fd "
+               "context for %s",
+               uuid_utoa(fd->inode->gfid));
+        goto out;
+    }
+
+    glfd = sfd->fd;
+    /* EINVAL is sent if the getxattr is on entry point directory
+       or the inode is SNAP_VIEW_ENTRY_POINT_INODE. Entry point is
+       a virtual directory on which setxattr operations are not
+       allowed. If getxattr has to be faked as success, then a value
+       for the name of the xattr has to be sent which we don't have.
+    */
+    if (inode_ctx->type == SNAP_VIEW_ENTRY_POINT_INODE) {
+        op_ret = -1;
+        op_errno = EINVAL;
+        goto out;
+    } else {
+        dict = dict_new();
+        if (!dict) {
+            op_ret = -1;
+            op_errno = ENOMEM;
+            gf_msg(this->name, GF_LOG_ERROR, op_errno, SVS_MSG_NO_MEMORY,
+                   "failed to allocate dict "
+                   "(gfid: %s, key: %s)",
+                   uuid_utoa(fd->inode->gfid), name);
+            goto out;
+        }
+
+        if (name) {
+            /* First call with no buffer: only the required size is wanted. */
+            size = glfs_fgetxattr(glfd, name, NULL, 0);
+            if (size == -1) {
+                op_ret = -1;
+                op_errno = errno;
+                gf_msg(this->name, GF_LOG_ERROR, op_errno,
+                       SVS_MSG_GETXATTR_FAILED,
+                       "getxattr on %s failed "
+                       "(key: %s)",
+                       uuid_utoa(fd->inode->gfid), name);
+                goto out;
+            }
+            /* One extra byte so the value can always be NUL terminated. */
+            value = GF_CALLOC(size + 1, sizeof(char), gf_common_mt_char);
+            if (!value) {
+                op_ret = -1;
+                op_errno = ENOMEM;
+                gf_msg(this->name, GF_LOG_ERROR, op_errno, SVS_MSG_NO_MEMORY,
+                       "failed to "
+                       "allocate memory for getxattr on %s "
+                       "(key: %s)",
+                       uuid_utoa(fd->inode->gfid), name);
+                goto out;
+            }
+
+            size = glfs_fgetxattr(glfd, name, value, size);
+            if (size == -1) {
+                op_ret = -1;
+                op_errno = errno;
+                gf_msg(this->name, GF_LOG_ERROR, op_errno,
+                       SVS_MSG_GETXATTR_FAILED,
+                       "failed to get the xattr %s "
+                       "for inode %s",
+                       name, uuid_utoa(fd->inode->gfid));
+                goto out;
+            }
+            value[size] = '\0';
+
+            /* On success the dict owns value; it must not be freed here. */
+            op_ret = dict_set_dynptr(dict, (char *)name, value, size);
+            if (op_ret < 0) {
+                op_errno = -op_ret;
+                /* Unwind the conventional -1; value is released at out. */
+                op_ret = -1;
+                gf_msg(this->name, GF_LOG_ERROR, op_errno,
+                       SVS_MSG_DICT_SET_FAILED,
+                       "dict set operation for gfid %s "
+                       "for the key %s failed.",
+                       uuid_utoa(fd->inode->gfid), name);
+                goto out;
+            }
+        } else {
+            size = glfs_flistxattr(glfd, NULL, 0);
+            if (size == -1) {
+                /* Set explicitly instead of relying on the initial value. */
+                op_ret = -1;
+                op_errno = errno;
+                gf_msg(this->name, GF_LOG_ERROR, op_errno,
+                       SVS_MSG_LISTXATTR_FAILED, "listxattr on %s failed",
+                       uuid_utoa(fd->inode->gfid));
+                goto out;
+            }
+
+            value = GF_CALLOC(size + 1, sizeof(char), gf_common_mt_char);
+            if (!value) {
+                op_ret = -1;
+                op_errno = ENOMEM;
+                gf_msg(this->name, GF_LOG_ERROR, op_errno, SVS_MSG_NO_MEMORY,
+                       "failed to "
+                       "allocate buffer for xattr "
+                       "list (%s)",
+                       uuid_utoa(fd->inode->gfid));
+                goto out;
+            }
+
+            size = glfs_flistxattr(glfd, value, size);
+            if (size == -1) {
+                op_ret = -1;
+                op_errno = errno;
+                gf_msg(this->name, GF_LOG_ERROR, op_errno,
+                       SVS_MSG_LISTXATTR_FAILED, "listxattr on %s failed",
+                       uuid_utoa(fd->inode->gfid));
+                goto out;
+            }
+
+            op_ret = svs_add_xattrs_to_dict(this, dict, value, size);
+            /* Any negative return is a failure. Testing only for -1 let
+             * other error codes fall through to the success path below. */
+            if (op_ret < 0) {
+                op_ret = -1;
+                op_errno = ENOMEM;
+                gf_msg(this->name, GF_LOG_ERROR, op_errno, SVS_MSG_NO_MEMORY,
+                       "failed to add xattrs from the list "
+                       "to dict (gfid: %s)",
+                       uuid_utoa(fd->inode->gfid));
+                goto out;
+            }
+            /* The names were copied into the dict; the raw list is done. */
+            GF_FREE(value);
+            value = NULL;
+        }
+
+        op_ret = 0;
+        op_errno = 0;
+    }
+
+out:
+    if (op_ret)
+        GF_FREE(value);
+
+    STACK_UNWIND_STRICT(fgetxattr, frame, op_ret, op_errno, dict, NULL);
+
+    if (dict)
+        dict_unref(dict);
+
+    return 0;
+}
+
+/*
+ * releasedir callback: drop the translator's context of a directory fd.
+ *
+ * The fd context is removed from fd and freed on every path. The gfapi
+ * directory handle stored in it is closed only while the snapshot instance
+ * of the inode is still considered valid. Always returns 0.
+ */
+int32_t
+svs_releasedir(xlator_t *this, fd_t *fd)
+{
+    svs_fd_t *sfd = NULL;
+    uint64_t tmp_pfd = 0;
+    int ret = 0;
+    svs_inode_t *svs_inode = NULL;
+    glfs_t *fs = NULL;
+    inode_t *inode = NULL;
+
+    GF_VALIDATE_OR_GOTO("snapview-server", this, out);
+    GF_VALIDATE_OR_GOTO(this->name, fd, out);
+
+    ret = fd_ctx_del(fd, this, &tmp_pfd);
+    if (ret < 0) {
+        gf_msg_debug(this->name, 0, "pfd from fd=%p is NULL", fd);
+        goto out;
+    }
+
+    /* The context has just been detached from the fd, so this function is
+     * now its only owner. Pick the pointer up immediately: doing so only
+     * inside the "fs is valid" branch leaked it whenever the inode context
+     * or the snapshot instance was gone. */
+    sfd = (svs_fd_t *)(uintptr_t)tmp_pfd;
+    if (!sfd)
+        goto out;
+
+    inode = fd->inode;
+
+    svs_inode = svs_inode_ctx_get(this, inode);
+    if (svs_inode) {
+        fs = svs_inode->fs; /* should inode->lock be held for this? */
+        /* NOTE(review): the macro presumably resets fs to NULL when the
+         * snapshot instance no longer exists - the test below relies on
+         * that; confirm against its definition. */
+        SVS_CHECK_VALID_SNAPSHOT_HANDLE(fs, this);
+        if (fs && sfd->fd) {
+            ret = glfs_closedir(sfd->fd);
+            if (ret)
+                gf_msg(this->name, GF_LOG_WARNING, errno,
+                       SVS_MSG_RELEASEDIR_FAILED,
+                       "failed to close the glfd for "
+                       "directory %s",
+                       uuid_utoa(fd->inode->gfid));
+        }
+    }
+
+    GF_FREE(sfd);
+
+out:
+    return 0;
+}
+
+/*
+ * flush fop of the snapview server.
+ *
+ * Nothing is flushed; the fop only verifies that the inode has a context
+ * and that, except for the entry-point directory, the fd has a context. The
+ * result is delivered through STACK_UNWIND_STRICT and the function itself
+ * returns 0.
+ */
+int32_t
+svs_flush(call_frame_t *frame, xlator_t *this, fd_t *fd, dict_t *xdata)
+{
+    int32_t op_ret = -1;
+    int32_t op_errno = 0;
+    int ret = -1;
+    uint64_t value = 0;
+    svs_inode_t *inode_ctx = NULL;
+    call_stack_t *root = NULL;
+
+    GF_VALIDATE_OR_GOTO("snapview-server", this, out);
+    GF_VALIDATE_OR_GOTO(this->name, frame, out);
+    GF_VALIDATE_OR_GOTO(this->name, fd, out);
+
+    root = frame->root;
+    op_ret = gf_setcredentials(&root->uid, &root->gid, root->ngrps,
+                               root->groups);
+    if (op_ret != 0) {
+        goto out;
+    }
+
+    inode_ctx = svs_inode_ctx_get(this, fd->inode);
+    if (!inode_ctx) {
+        op_ret = -1;
+        op_errno = EINVAL;
+        gf_msg(this->name, GF_LOG_ERROR, op_errno,
+               SVS_MSG_GET_INODE_CONTEXT_FAILED,
+               "inode context not found for"
+               " the inode %s",
+               uuid_utoa(fd->inode->gfid));
+        goto out;
+    }
+
+    ret = fd_ctx_get(fd, this, &value);
+    if (ret < 0 && inode_ctx->type != SNAP_VIEW_ENTRY_POINT_INODE) {
+        /* op_ret is 0 here because gf_setcredentials() succeeded; without
+         * resetting it this failure would be unwound as a success. */
+        op_ret = -1;
+        op_errno = EINVAL;
+        gf_msg(this->name, GF_LOG_WARNING, op_errno,
+               SVS_MSG_GET_FD_CONTEXT_FAILED, "pfd is NULL on fd=%p", fd);
+        goto out;
+    }
+
+    op_ret = 0;
+
+out:
+    STACK_UNWIND_STRICT(flush, frame, op_ret, op_errno, NULL);
+
+    return 0;
+}
+
+/*
+ * release callback: drop the translator's context of a file fd.
+ *
+ * The fd context is removed from fd and freed on every path. The gfapi fd
+ * stored in it is closed only while the snapshot instance of the inode is
+ * still considered valid. Always returns 0.
+ */
+int32_t
+svs_release(xlator_t *this, fd_t *fd)
+{
+    svs_fd_t *sfd = NULL;
+    uint64_t tmp_pfd = 0;
+    int ret = 0;
+    inode_t *inode = NULL;
+    svs_inode_t *svs_inode = NULL;
+    glfs_t *fs = NULL;
+
+    GF_VALIDATE_OR_GOTO("snapview-server", this, out);
+    GF_VALIDATE_OR_GOTO(this->name, fd, out);
+
+    ret = fd_ctx_del(fd, this, &tmp_pfd);
+    if (ret < 0) {
+        gf_msg_debug(this->name, 0, "pfd from fd=%p is NULL", fd);
+        goto out;
+    }
+
+    /* The context has just been detached from the fd, so this function is
+     * now its only owner. Pick the pointer up immediately: doing so only
+     * inside the "fs is valid" branch leaked it whenever the inode context
+     * or the snapshot instance was gone. */
+    sfd = (svs_fd_t *)(uintptr_t)tmp_pfd;
+    if (!sfd)
+        goto out;
+
+    inode = fd->inode;
+
+    svs_inode = svs_inode_ctx_get(this, inode);
+    if (svs_inode) {
+        fs = svs_inode->fs; /* should inode->lock be held for this? */
+        /* NOTE(review): the macro presumably resets fs to NULL when the
+         * snapshot instance no longer exists - the test below relies on
+         * that; confirm against its definition. */
+        SVS_CHECK_VALID_SNAPSHOT_HANDLE(fs, this);
+        if (fs && sfd->fd) {
+            ret = glfs_close(sfd->fd);
+            if (ret)
+                gf_msg(this->name, GF_LOG_ERROR, errno,
+                       SVS_MSG_RELEASE_FAILED,
+                       "failed to close "
+                       "the glfd for %s",
+                       uuid_utoa(fd->inode->gfid));
+        }
+    }
+
+    GF_FREE(sfd);
+out:
+    return 0;
+}
+
+/*
+ * forget callback: tear down the translator's context of an inode.
+ *
+ * Detaches the context from the inode, frees the snapshot name kept in it,
+ * closes the gfapi object handle when that is still safe and finally frees
+ * the context itself. Always returns 0.
+ */
+int32_t
+svs_forget(xlator_t *this, inode_t *inode)
+{
+    uint64_t raw_ctx = 0;
+    svs_inode_t *ctx = NULL;
+
+    GF_VALIDATE_OR_GOTO("snapview-server", this, out);
+    GF_VALIDATE_OR_GOTO(this->name, inode, out);
+
+    if (inode_ctx_del(inode, this, &raw_ctx)) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, SVS_MSG_DELETE_INODE_CONTEXT_FAILED,
+               "failed to delete the inode "
+               "context of %s",
+               uuid_utoa(inode->gfid));
+        goto out;
+    }
+
+    ctx = (svs_inode_t *)(uintptr_t)raw_ctx;
+    if (ctx == NULL)
+        goto out;
+
+    if (ctx->snapname != NULL)
+        GF_FREE(ctx->snapname);
+
+    /*
+     * Closing the handle makes gfapi unref and forget the inode that
+     * ctx->object refers to. At present a forget can only arrive because
+     * snapdaemon's inode table hit its lru limit and is purging the surplus
+     * inodes, so closing the handle here keeps the memory footprint of
+     * snapdaemon down.
+     *
+     * The handle may only be closed while the gfapi instance it belongs to
+     * still exists. Once the snapshot has been deactivated or deleted the
+     * instance is gone and touching the handle would access freed memory,
+     * hence the mapping check.
+     */
+    if (svs_inode_ctx_glfs_mapping(this, ctx)) {
+        glfs_h_close(ctx->object);
+        ctx->object = NULL;
+    }
+    GF_FREE(ctx);
+
+out:
+    return 0;
+}
+
+/*
+ * Build the directory listing of the entry-point directory.
+ *
+ * Starting at snapshot index off, one gf_dirent_t per snapshot name is
+ * appended to entries until either the snapshot list is exhausted or the
+ * next entry would make the accounted size exceed size bytes.
+ *
+ * Returns the number of entries appended; 0 when nothing could be added,
+ * including when off lies outside the snapshot list.
+ */
+int
+svs_fill_readdir(xlator_t *this, gf_dirent_t *entries, size_t size, off_t off)
+{
+    gf_dirent_t *entry = NULL;
+    svs_private_t *priv = NULL;
+    int i = 0;
+    snap_dirent_t *dirents = NULL;
+    size_t this_size = 0;
+    size_t filled_size = 0;
+    int count = 0;
+
+    GF_VALIDATE_OR_GOTO("snap-view-daemon", this, out);
+    GF_VALIDATE_OR_GOTO("snap-view-daemon", entries, out);
+
+    priv = this->private;
+    GF_ASSERT(priv);
+
+    /* create the dir entries */
+    LOCK(&priv->snaplist_lock);
+    {
+        dirents = priv->dirents;
+
+        /* off is used as an index into dirents[]. Check it as an off_t,
+         * before it is narrowed to int, so that a negative or oversized
+         * offset can never index outside the array. */
+        if (off < 0 || off >= priv->num_snaps)
+            goto unlock;
+
+        for (i = (int)off; i < priv->num_snaps; i++) {
+            this_size = sizeof(gf_dirent_t) + strlen(dirents[i].name) + 1;
+            if (this_size + filled_size > size)
+                goto unlock;
+
+            entry = gf_dirent_for_name(dirents[i].name);
+            if (!entry) {
+                gf_msg(this->name, GF_LOG_ERROR, ENOMEM, SVS_MSG_NO_MEMORY,
+                       "failed to allocate dentry for %s", dirents[i].name);
+                goto unlock;
+            }
+
+            /* Offset of the entry that follows this one. */
+            entry->d_off = i + 1;
+            /*
+             * readdir on the entry-point directory to the snapshot
+             * world, will return elements in the list of the
+             * snapshots as the directory entries. Since the entries
+             * returned are virtual entries which does not exist
+             * physically on the disk, pseudo inode numbers are
+             * generated.
+             */
+            entry->d_ino = i + 2 * 42;
+            entry->d_type = DT_DIR;
+            list_add_tail(&entry->list, &entries->list);
+            count++;
+            filled_size += this_size;
+        }
+    }
+unlock:
+    UNLOCK(&priv->snaplist_lock);
+
+out:
+    return count;
+}
+
+/*
+ * Read directory entries from a gfapi directory handle.
+ *
+ * Entries are read one at a time from glfd and appended to entries until
+ * the accounted size would exceed size bytes, the end of the directory is
+ * reached, or an error occurs. When an entry does not fit any more the
+ * handle is moved back to the position recorded before that entry was read.
+ *
+ * this        - translator instance, used for logging
+ * glfd        - gfapi directory handle to read from
+ * entries     - list head the new entries are appended to
+ * op_errno    - set to ENOENT at end of directory, or to errno when the
+ *               gfapi read fails; left untouched otherwise
+ * buf         - scratch iatt, only used (and required) when readdirplus is
+ *               true
+ * readdirplus - also fetch the stat of every entry into entry->d_stat
+ * size        - byte budget for the reply
+ *
+ * Returns the number of entries appended.
+ */
+int32_t
+svs_glfs_readdir(xlator_t *this, glfs_fd_t *glfd, gf_dirent_t *entries,
+                 int32_t *op_errno, struct iatt *buf, gf_boolean_t readdirplus,
+                 size_t size)
+{
+    size_t filled_size = 0;
+    size_t this_size = 0;
+    int32_t ret = -1;
+    int32_t count = 0;
+    gf_dirent_t *entry = NULL;
+    struct dirent *dirents = NULL;
+    struct dirent de = {
+        0,
+    };
+    struct stat statbuf = {
+        0,
+    };
+    off_t in_case = -1;
+
+    GF_VALIDATE_OR_GOTO("svs", this, out);
+    GF_VALIDATE_OR_GOTO(this->name, glfd, out);
+    GF_VALIDATE_OR_GOTO(this->name, entries, out);
+
+    while (filled_size < size) {
+        /* Remember where this entry starts in case it has to be put back. */
+        in_case = glfs_telldir(glfd);
+        if (in_case == -1) {
+            gf_msg(this->name, GF_LOG_ERROR, errno, SVS_MSG_TELLDIR_FAILED,
+                   "telldir failed");
+            break;
+        }
+
+        if (readdirplus)
+            ret = glfs_readdirplus_r(glfd, &statbuf, &de, &dirents);
+        else
+            ret = glfs_readdir_r(glfd, &de, &dirents);
+
+        if (ret == 0 && dirents != NULL) {
+            if (readdirplus)
+                this_size = max(sizeof(gf_dirent_t), sizeof(gfs3_dirplist)) +
+                            strlen(de.d_name) + 1;
+            else
+                this_size = sizeof(gf_dirent_t) + strlen(de.d_name) + 1;
+
+            if (this_size + filled_size > size) {
+                /* Does not fit: rewind so the next call reads it again. */
+                glfs_seekdir(glfd, in_case);
+                break;
+            }
+
+            entry = gf_dirent_for_name(de.d_name);
+            if (!entry) {
+                /*
+                 * Since gf_dirent_for_name can return
+                 * NULL only when it fails to allocate
+                 * memory for the directory entry,
+                 * SVS_MSG_NO_MEMORY is used as the
+                 * message-id.
+                 *
+                 * entry is NULL here, so the name has to be
+                 * taken from de and not from entry.
+                 */
+                gf_msg(this->name, GF_LOG_ERROR, errno, SVS_MSG_NO_MEMORY,
+                       "could not create gf_dirent "
+                       "for entry %s: (%s)",
+                       de.d_name, strerror(errno));
+                break;
+            }
+            entry->d_off = glfs_telldir(glfd);
+            entry->d_ino = de.d_ino;
+            entry->d_type = de.d_type;
+            if (readdirplus) {
+                iatt_from_stat(buf, &statbuf);
+                entry->d_stat = *buf;
+            }
+            list_add_tail(&entry->list, &entries->list);
+
+            filled_size += this_size;
+            count++;
+        } else if (ret == 0 && dirents == NULL) {
+            /* Successful read without an entry: end of directory. */
+            *op_errno = ENOENT;
+            break;
+        } else if (ret != 0) {
+            *op_errno = errno;
+            break;
+        }
+        dirents = NULL;
+    }
+
+out:
+    return count;
+}
+
+/* readdirp can be of 2 types.
+   1) It can come on entry point directory where the list of snapshots
+      is sent as dirents. In this case, the iatt structure is filled
+      on the fly if the inode is not found for the entry or the inode
+      context is NULL. Otherwise, if inode is found and inode context
+      is there the iatt structure saved in the context is used.
+   2) It can be on a directory in one of the snapshots. In this case,
+      the readdirp call would have sent us a iatt structure. So the same
+      structure is used with the exception that the gfid and the inode
+      numbers will be newly generated and filled in.
+*/
+/*
+ * Attach inode and stat information to one readdirp entry.
+ *
+ * "." and ".." are left untouched. For every other entry there are three
+ * cases:
+ *   - the entry already has an inode in the parent's inode table: that inode
+ *     is attached and the inode number (plus either the complete stat or
+ *     only the gfid, depending on the parent type) is taken from it;
+ *   - no inode yet and the parent is the entry-point directory: a new inode
+ *     with a freshly generated gfid is created for the snapshot directory;
+ *   - no inode yet and the parent is inside a snapshot: no inode is attached
+ *     and the stat is reset to contain nothing but the inode number.
+ */
+void
+svs_readdirp_fill(xlator_t *this, inode_t *parent, svs_inode_t *parent_ctx,
+                  gf_dirent_t *entry)
+{
+    inode_t *linked = NULL;
+    uuid_t fresh_gfid = {
+        0,
+    };
+    struct iatt stbuf = {
+        0,
+    };
+    svs_inode_t *ctx = NULL;
+
+    GF_VALIDATE_OR_GOTO("snapview-server", this, out);
+    GF_VALIDATE_OR_GOTO(this->name, parent, out);
+    GF_VALIDATE_OR_GOTO(this->name, parent_ctx, out);
+    GF_VALIDATE_OR_GOTO(this->name, entry, out);
+
+    if (strcmp(entry->d_name, ".") == 0 || strcmp(entry->d_name, "..") == 0)
+        goto out;
+
+    linked = inode_grep(parent->table, parent, entry->d_name);
+    if (linked != NULL) {
+        /* Known entry: reuse what the inode (context) already holds. */
+        entry->inode = linked;
+        ctx = svs_inode_ctx_get(this, linked);
+        if (ctx != NULL) {
+            stbuf = ctx->buf;
+        } else {
+            gf_uuid_copy(stbuf.ia_gfid, linked->gfid);
+            svs_iatt_fill(linked->gfid, &stbuf);
+            stbuf.ia_type = linked->ia_type;
+        }
+
+        entry->d_ino = stbuf.ia_ino;
+
+        if (parent_ctx->type != SNAP_VIEW_ENTRY_POINT_INODE) {
+            /* Inside a snapshot only inode number and gfid are replaced. */
+            entry->d_stat.ia_ino = stbuf.ia_ino;
+            gf_uuid_copy(entry->d_stat.ia_gfid, stbuf.ia_gfid);
+        } else {
+            entry->d_stat = stbuf;
+        }
+        goto out;
+    }
+
+    if (parent_ctx->type != SNAP_VIEW_ENTRY_POINT_INODE) {
+        /* For files under snapshot world do not set
+         * entry->inode and reset statbuf (except ia_ino),
+         * so that FUSE/Kernel will send an explicit lookup.
+         * entry->d_stat contains the statbuf information
+         * of original file, so for NFS not to cache this
+         * information and to send explicit lookup, it is
+         * required to reset the statbuf.
+         * Virtual gfid for these files will be generated in the
+         * first lookup.
+         */
+        stbuf.ia_ino = entry->d_ino;
+        entry->d_stat = stbuf;
+        goto out;
+    }
+
+    /* Unknown snapshot directory below the entry point. */
+    linked = inode_new(parent->table);
+    entry->inode = linked;
+
+    /* If inode context allocation fails, then do not send
+     * the inode for that particular entry as part of
+     * readdirp response. Fuse and protocol/server will link
+     * the inodes in readdirp only if the entry contains
+     * inode in it.
+     */
+    ctx = svs_inode_ctx_get_or_new(this, linked);
+    if (ctx == NULL) {
+        gf_msg(this->name, GF_LOG_ERROR, ENOMEM, SVS_MSG_NO_MEMORY,
+               "failed to allocate inode "
+               "context for %s",
+               entry->d_name);
+        inode_unref(entry->inode);
+        entry->inode = NULL;
+        goto out;
+    }
+
+    /* Generate virtual gfid for SNAPSHOT dir and
+     * update the statbuf
+     */
+    gf_uuid_generate(fresh_gfid);
+    gf_uuid_copy(stbuf.ia_gfid, fresh_gfid);
+    svs_fill_ino_from_gfid(&stbuf);
+    stbuf.ia_type = IA_IFDIR;
+    entry->d_ino = stbuf.ia_ino;
+    entry->d_stat = stbuf;
+    ctx->buf = stbuf;
+    ctx->type = SNAP_VIEW_SNAPSHOT_INODE;
+
+out:
+    return;
+}
+
+/* In readdirp, though new inode is created along with the generation of
+   new gfid, the inode context created will not contain the glfs_t instance
+   for the filesystem it belongs to and the handle for it in the gfapi
+   world. The handle is obtained only by doing the lookup call on the entry,
+   and doing lookup on each entry received as part of readdir call is a
+   costly operation. So the fs and handle are NULL in the inode context
+   and are filled in when lookup comes on that object.
+*/
+/*
+ * readdirp fop of the snapview server.
+ *
+ * On the entry-point directory the list of snapshots is returned; on a
+ * directory inside a snapshot the entries are read through the gfapi
+ * directory handle kept in the fd context, starting at off. In both cases
+ * every entry is then passed through svs_readdirp_fill().
+ *
+ * The reply is delivered through STACK_UNWIND_STRICT with op_ret set to the
+ * number of entries (or a non-zero error value with op_errno); dict is
+ * handed back unchanged as the xdata of the reply. The function itself
+ * returns 0.
+ */
+int32_t
+svs_readdirp(call_frame_t *frame, xlator_t *this, fd_t *fd, size_t size,
+             off_t off, dict_t *dict)
+{
+    gf_dirent_t entries;
+    gf_dirent_t *entry = NULL;
+    struct iatt buf = {
+        0,
+    };
+    int count = 0;
+    int op_ret = -1;
+    int op_errno = EINVAL;
+    svs_inode_t *parent_ctx = NULL;
+    svs_fd_t *svs_fd = NULL;
+    call_stack_t *root = NULL;
+
+    /* Has to come first: every validation below may jump to unwind, where
+     * the list is handed to the caller and then freed. */
+    INIT_LIST_HEAD(&entries.list);
+
+    GF_VALIDATE_OR_GOTO("snap-view-daemon", this, unwind);
+    GF_VALIDATE_OR_GOTO(this->name, frame, unwind);
+    GF_VALIDATE_OR_GOTO(this->name, fd, unwind);
+    GF_VALIDATE_OR_GOTO(this->name, fd->inode, unwind);
+
+    root = frame->root;
+    op_ret = gf_setcredentials(&root->uid, &root->gid, root->ngrps,
+                               root->groups);
+    if (op_ret != 0) {
+        goto unwind;
+    }
+
+    parent_ctx = svs_inode_ctx_get(this, fd->inode);
+    if (!parent_ctx) {
+        op_ret = -1;
+        op_errno = EINVAL;
+        gf_msg(this->name, GF_LOG_ERROR, op_errno,
+               SVS_MSG_GET_INODE_CONTEXT_FAILED,
+               "failed to get the inode "
+               "context for %s",
+               uuid_utoa(fd->inode->gfid));
+        goto unwind;
+    }
+
+    if (parent_ctx->type == SNAP_VIEW_ENTRY_POINT_INODE) {
+        LOCK(&fd->lock);
+        {
+            count = svs_fill_readdir(this, &entries, size, off);
+        }
+        UNLOCK(&fd->lock);
+    } else {
+        svs_fd = svs_fd_ctx_get_or_new(this, fd);
+        if (!svs_fd) {
+            op_ret = -1;
+            op_errno = EBADFD;
+            gf_msg(this->name, GF_LOG_ERROR, op_errno,
+                   SVS_MSG_GET_FD_CONTEXT_FAILED,
+                   "failed to get the fd context "
+                   "for the inode %s",
+                   uuid_utoa(fd->inode->gfid));
+            goto unwind;
+        }
+
+        LOCK(&fd->lock);
+        {
+            /* Position and read under the same lock so that a concurrent
+             * readdir on this fd cannot move the handle in between. */
+            glfs_seekdir(svs_fd->fd, off);
+            count = svs_glfs_readdir(this, svs_fd->fd, &entries, &op_errno,
+                                     &buf, _gf_true, size);
+        }
+        UNLOCK(&fd->lock);
+    }
+
+    op_ret = count;
+
+    list_for_each_entry(entry, &entries.list, list)
+    {
+        svs_readdirp_fill(this, fd->inode, parent_ctx, entry);
+    }
+
+unwind:
+    STACK_UNWIND_STRICT(readdirp, frame, op_ret, op_errno, &entries, dict);
+
+    gf_dirent_free(&entries);
+
+    return 0;
+}
+
+/*
+ * readdir fop of the snapview server.
+ *
+ * On the entry-point directory the list of snapshots is returned; on a
+ * directory inside a snapshot the entries are read through the gfapi
+ * directory handle kept in the fd context, starting at off.
+ *
+ * The reply is delivered through STACK_UNWIND_STRICT with op_ret set to the
+ * number of entries (or -1 with op_errno on failure); xdata is handed back
+ * unchanged. The function itself returns 0.
+ */
+int32_t
+svs_readdir(call_frame_t *frame, xlator_t *this, fd_t *fd, size_t size,
+            off_t off, dict_t *xdata)
+{
+    gf_dirent_t entries = {
+        {
+            {
+                0,
+            },
+        },
+    };
+    int count = 0;
+    svs_inode_t *inode_ctx = NULL;
+    int op_errno = EINVAL;
+    int op_ret = -1;
+    svs_fd_t *svs_fd = NULL;
+    glfs_fd_t *glfd = NULL;
+
+    /* Initialised before the validations, which may jump to unwind. */
+    INIT_LIST_HEAD(&entries.list);
+
+    GF_VALIDATE_OR_GOTO("snap-view-server", this, unwind);
+    GF_VALIDATE_OR_GOTO(this->name, frame, unwind);
+    GF_VALIDATE_OR_GOTO(this->name, fd, unwind);
+    GF_VALIDATE_OR_GOTO(this->name, fd->inode, unwind);
+
+    inode_ctx = svs_inode_ctx_get(this, fd->inode);
+    if (!inode_ctx) {
+        op_ret = -1;
+        op_errno = EINVAL;
+        gf_msg(this->name, GF_LOG_ERROR, op_errno,
+               SVS_MSG_GET_INODE_CONTEXT_FAILED,
+               "inode context not found in "
+               "the inode %s",
+               uuid_utoa(fd->inode->gfid));
+        goto unwind;
+    }
+
+    if (inode_ctx->type == SNAP_VIEW_ENTRY_POINT_INODE) {
+        LOCK(&fd->lock);
+        {
+            count = svs_fill_readdir(this, &entries, size, off);
+        }
+        UNLOCK(&fd->lock);
+    } else {
+        svs_fd = svs_fd_ctx_get_or_new(this, fd);
+        if (!svs_fd) {
+            op_ret = -1;
+            op_errno = EBADFD;
+            gf_msg(this->name, GF_LOG_ERROR, op_errno,
+                   SVS_MSG_GET_FD_CONTEXT_FAILED,
+                   "failed to get the fd "
+                   "context for %s",
+                   uuid_utoa(fd->inode->gfid));
+            goto unwind;
+        }
+
+        glfd = svs_fd->fd;
+
+        LOCK(&fd->lock);
+        {
+            /* Honour the requested offset, as svs_readdirp does. Without
+             * the seek the read simply continues from wherever the handle
+             * was left by the previous call on this fd. */
+            glfs_seekdir(glfd, off);
+            count = svs_glfs_readdir(this, glfd, &entries, &op_errno, NULL,
+                                     _gf_false, size);
+        }
+        UNLOCK(&fd->lock);
+    }
+
+    op_ret = count;
+
+unwind:
+    STACK_UNWIND_STRICT(readdir, frame, op_ret, op_errno, &entries, xdata);
+
+    gf_dirent_free(&entries);
+
+    return 0;
+}
+
+/*
+ * This function is mainly helpful for NFS. Till now NFS server was not linking
+ * the inodes in readdirp, which caused problems when below operations were
+ * performed.
+ *
+ * 1) ls -l in one of the snapshots (snapview-server would generate gfids for
+ *    each entry on the fly and link the inodes associated with those entries)
+ * 2) NFS server upon getting readdirp reply would not link the inodes of the
+ *    entries. But it used to generate filehandles for each entry and associate
+ *    the gfid of that entry with the filehandle and send it as part of the
+ *    reply to nfs client.
+ * 3) NFS client would send the filehandle of one of those entries when some
+ *    activity is done on it.
+ * 4) NFS server would not be able to find the inode for the gfid present in the
+ *    filehandle (as the inode was not linked) and would go for hard resolution
+ *    by sending a lookup on the gfid by creating a new inode.
+ * 5) snapview-client will not be able to identify whether the inode is a real
+ *    inode existing in the main volume or a virtual inode existing in the
+ *    snapshots as there would not be any inode context.
+ * 6) Since the gfid upon which lookup is sent is a virtual gfid which is not
+ *    present in the disk, lookup would fail and the application would get an
+ *    error.
+ *
+ * The above problem is fixed by the below commit which makes snapview server
+ * more compatible with nfs server (1dea949cb60c3814c9206df6ba8dddec8d471a94).
+ * But the fact that NFS server now does inode linking in readdirp has
+ * introduced the below issue.
+ * In readdirp though snapview-server allocates inode contexts it does not
+ * actually perform lookup on each entry it obtained in readdirp (as doing
+ * a lookup via gfapi over the network for each entry would be costly).
+ *
+ * Till now it was not a problem with NFS server, as NFS was sending a lookup on
+ * the gfid it got from NFS client, for which it was not able to find the right
+ * inode. So snapview-server was able to get the fs instance (glfs_t) of the
+ * snapshot volume to which the entry belongs to, and the handle for the entry
+ * from the corresponding snapshot volume and fill those information in the
+ * inode context.
+ *
+ * But now, since NFS server is able to find the inode from the inode table for
+ * the gfid it got from the NFS client, it won't send lookup. Rather it directly
+ * sends the fop it received from the client. Now this causes problems for
+ * snapview-server. Because for each fop snapview-server assumes that lookup has
+ * been performed on that entry and the entry's inode context contains the
+ * pointers for the fs instance and the handle to the entry in that fs. When NFS
+ * server sends the fop and snapview-server finds that the fs instance and the
+ * handle within the inode context are NULL it unwinds with EINVAL.
+ *
+ * So to handle this, if fs instance or handle within the inode context are
+ * NULL, then do a lookup based on parent inode context's fs instance. And
+ * unwind the results obtained as part of lookup
+ */
+
+/*
+ * svs_get_handle - perform the lookup that was skipped for an inode whose
+ * context was allocated without one (see the explanation above).
+ *
+ * The parent inode is taken from loc->parent, else searched in the inode
+ * table by loc->pargfid, else obtained with inode_parent(). The lookup of
+ * loc is then done through the parent's context: svs_lookup_snapshot() when
+ * the parent is the entry point directory, svs_lookup_entry() otherwise.
+ *
+ * @this:      snapview-server xlator
+ * @loc:       location to resolve; loc->name is derived from loc->path when
+ *             it is missing or empty
+ * @inode_ctx: context of loc->inode (not referenced by this function itself)
+ * @op_errno:  out parameter; set to EINVAL when the parent context cannot be
+ *             found, otherwise handed to the lookup helper
+ *
+ * Returns the value returned by the lookup helper, or -1 when validation
+ * fails or no parent context is available.
+ */
+int32_t
+svs_get_handle(xlator_t *this, loc_t *loc, svs_inode_t *inode_ctx,
+               int32_t *op_errno)
+{
+    svs_inode_t *parent_ctx = NULL;
+    int ret = -1;
+    inode_t *parent = NULL;
+    struct iatt postparent = {
+        0,
+    };
+    struct iatt buf = {
+        0,
+    };
+    char uuid1[64];
+
+    GF_VALIDATE_OR_GOTO("snap-view-daemon", this, out);
+    GF_VALIDATE_OR_GOTO(this->name, loc, out);
+    GF_VALIDATE_OR_GOTO(this->name, loc->inode, out);
+    /* op_errno is dereferenced below, so reject a NULL pointer up front */
+    GF_VALIDATE_OR_GOTO(this->name, op_errno, out);
+
+    /* Derive the basename from the path when the caller gave no name */
+    if (loc->path && (!loc->name || !strcmp(loc->name, ""))) {
+        loc->name = strrchr(loc->path, '/');
+        if (loc->name)
+            loc->name++;
+    }
+
+    /* Every branch that yields a parent also yields a reference on it,
+       which is dropped at 'out'. */
+    if (loc->parent)
+        parent = inode_ref(loc->parent);
+    else {
+        parent = inode_find(loc->inode->table, loc->pargfid);
+        if (!parent)
+            parent = inode_parent(loc->inode, NULL, NULL);
+    }
+
+    if (parent)
+        parent_ctx = svs_inode_ctx_get(this, parent);
+
+    if (!parent_ctx) {
+        *op_errno = EINVAL;
+        gf_msg(this->name, GF_LOG_WARNING, *op_errno,
+               SVS_MSG_GET_INODE_CONTEXT_FAILED,
+               "failed to get the parent "
+               "context for %s (%s)",
+               loc->path, uuid_utoa_r(loc->inode->gfid, uuid1));
+        goto out;
+    }
+
+    /* parent_ctx is known to be non-NULL from here on */
+    if (parent_ctx->type == SNAP_VIEW_ENTRY_POINT_INODE)
+        ret = svs_lookup_snapshot(this, loc, &buf, &postparent, parent,
+                                  parent_ctx, op_errno);
+    else
+        ret = svs_lookup_entry(this, loc, &buf, &postparent, parent,
+                               parent_ctx, op_errno);
+
+out:
+    if (parent)
+        inode_unref(parent);
+
+    return ret;
+}
+
+/*
+ * svs_stat - stat fop of snapview-server.
+ *
+ * For the entry point directory the iatt is synthesised with
+ * svs_iatt_fill(). For any other inode the attributes are fetched with
+ * glfs_h_stat() through the fs instance and object handle kept in the inode
+ * context, and the result is stamped with the inode's own gfid.
+ *
+ * Always returns 0; the outcome is reported through op_ret/op_errno in the
+ * unwind, which happens on every path.
+ */
+int32_t
+svs_stat(call_frame_t *frame, xlator_t *this, loc_t *loc, dict_t *xdata)
+{
+    int32_t op_ret = -1;
+    int32_t op_errno = EINVAL;
+    int rc = -1;
+    call_stack_t *stack = NULL;
+    svs_inode_t *inode_ctx = NULL;
+    glfs_t *fs = NULL;
+    glfs_object_t *object = NULL;
+    struct stat st = {
+        0,
+    };
+    struct iatt iabuf = {
+        0,
+    };
+
+    GF_VALIDATE_OR_GOTO("snap-view-daemon", this, out);
+    GF_VALIDATE_OR_GOTO(this->name, frame, out);
+    GF_VALIDATE_OR_GOTO(this->name, loc, out);
+    GF_VALIDATE_OR_GOTO(this->name, loc->inode, out);
+
+    /* Apply the caller's uid/gid/groups before touching the snapshot */
+    stack = frame->root;
+    op_ret = gf_setcredentials(&stack->uid, &stack->gid, stack->ngrps,
+                               stack->groups);
+    if (op_ret != 0)
+        goto out;
+
+    /* The inode context, not the entry's name, tells whether this is the
+       entry point directory. */
+    inode_ctx = svs_inode_ctx_get(this, loc->inode);
+    if (!inode_ctx) {
+        op_ret = -1;
+        op_errno = EINVAL;
+        gf_msg(this->name, GF_LOG_ERROR, op_errno,
+               SVS_MSG_GET_INODE_CONTEXT_FAILED,
+               "inode context not found for %s", uuid_utoa(loc->inode->gfid));
+        goto out;
+    }
+
+    if (inode_ctx->type == SNAP_VIEW_ENTRY_POINT_INODE) {
+        svs_iatt_fill(loc->inode->gfid, &iabuf);
+        op_ret = 0;
+        goto out;
+    }
+
+    /* Jumps to 'out' on its own when fs/object cannot be obtained */
+    SVS_GET_INODE_CTX_INFO(inode_ctx, fs, object, this, loc, op_ret,
+                           op_errno, out);
+
+    rc = glfs_h_stat(fs, object, &st);
+    if (rc) {
+        op_ret = -1;
+        op_errno = errno;
+        gf_msg(this->name, GF_LOG_ERROR, op_errno, SVS_MSG_STAT_FAILED,
+               "glfs_h_stat on %s (gfid: %s) failed", loc->name,
+               uuid_utoa(loc->inode->gfid));
+        goto out;
+    }
+
+    gf_msg_debug(this->name, 0, "stat on %s (%s) successful", loc->path,
+                 uuid_utoa(loc->inode->gfid));
+
+    /* Report the gfid known to this inode table, not the snapshot's one */
+    iatt_from_stat(&iabuf, &st);
+    gf_uuid_copy(iabuf.ia_gfid, loc->inode->gfid);
+    svs_fill_ino_from_gfid(&iabuf);
+    op_ret = rc;
+
+out:
+    STACK_UNWIND_STRICT(stat, frame, op_ret, op_errno, &iabuf, xdata);
+    return 0;
+}
+
+/*
+ * svs_fstat - fstat fop of snapview-server.
+ *
+ * For the entry point directory the iatt is synthesised with
+ * svs_iatt_fill(). For any other inode the glfs fd kept in the fd context is
+ * queried with glfs_fstat(), provided the fs instance recorded in the inode
+ * context is still valid, and the result is stamped with the inode's gfid.
+ *
+ * Always returns 0; the outcome is reported through op_ret/op_errno in the
+ * unwind, which happens on every path.
+ */
+int32_t
+svs_fstat(call_frame_t *frame, xlator_t *this, fd_t *fd, dict_t *xdata)
+{
+    int32_t op_ret = -1;
+    int32_t op_errno = EINVAL;
+    int rc = -1;
+    call_stack_t *stack = NULL;
+    svs_inode_t *inode_ctx = NULL;
+    svs_fd_t *sfd = NULL;
+    glfs_fd_t *glfd = NULL;
+    struct stat st = {
+        0,
+    };
+    struct iatt iabuf = {
+        0,
+    };
+
+    GF_VALIDATE_OR_GOTO("snap-view-daemon", this, out);
+    GF_VALIDATE_OR_GOTO(this->name, frame, out);
+    GF_VALIDATE_OR_GOTO(this->name, fd, out);
+    GF_VALIDATE_OR_GOTO(this->name, fd->inode, out);
+
+    /* Apply the caller's uid/gid/groups before touching the snapshot */
+    stack = frame->root;
+    op_ret = gf_setcredentials(&stack->uid, &stack->gid, stack->ngrps,
+                               stack->groups);
+    if (op_ret != 0)
+        goto out;
+
+    /* The inode context, not the entry's name, tells whether this is the
+       entry point directory. */
+    inode_ctx = svs_inode_ctx_get(this, fd->inode);
+    if (!inode_ctx) {
+        op_ret = -1;
+        op_errno = EINVAL;
+        gf_msg(this->name, GF_LOG_ERROR, op_errno,
+               SVS_MSG_GET_INODE_CONTEXT_FAILED,
+               "inode context not found for the inode %s",
+               uuid_utoa(fd->inode->gfid));
+        goto out;
+    }
+
+    if (inode_ctx->type == SNAP_VIEW_ENTRY_POINT_INODE) {
+        svs_iatt_fill(fd->inode->gfid, &iabuf);
+        op_ret = 0;
+        goto out;
+    }
+
+    if (!(svs_inode_ctx_glfs_mapping(this, inode_ctx))) {
+        op_ret = -1;
+        op_errno = EBADF;
+        gf_msg(this->name, GF_LOG_ERROR, op_errno, SVS_MSG_FS_INSTANCE_INVALID,
+               "glfs instance %p to which the inode %s "
+               "belongs to does not exist. That snapshot "
+               "corresponding to the fs instance "
+               "might have been deleted or deactivated.",
+               inode_ctx->fs, uuid_utoa(fd->inode->gfid));
+        goto out;
+    }
+
+    sfd = svs_fd_ctx_get_or_new(this, fd);
+    if (!sfd) {
+        op_ret = -1;
+        op_errno = EBADFD;
+        gf_msg(this->name, GF_LOG_ERROR, op_errno,
+               SVS_MSG_GET_FD_CONTEXT_FAILED,
+               "failed to get the fd context for %s",
+               uuid_utoa(fd->inode->gfid));
+        goto out;
+    }
+
+    glfd = sfd->fd;
+    rc = glfs_fstat(glfd, &st);
+    if (rc) {
+        op_ret = -1;
+        op_errno = errno;
+        gf_msg(this->name, GF_LOG_ERROR, op_errno, SVS_MSG_STAT_FAILED,
+               "glfs_fstat on gfid: %s failed", uuid_utoa(fd->inode->gfid));
+        goto out;
+    }
+
+    /* Report the gfid known to this inode table, not the snapshot's one */
+    iatt_from_stat(&iabuf, &st);
+    gf_uuid_copy(iabuf.ia_gfid, fd->inode->gfid);
+    svs_fill_ino_from_gfid(&iabuf);
+    op_ret = rc;
+
+out:
+    STACK_UNWIND_STRICT(fstat, frame, op_ret, op_errno, &iabuf, xdata);
+    return 0;
+}
+
+/*
+ * svs_statfs - statfs fop of snapview-server.
+ *
+ * Resolves the fs instance and object handle from the inode context and
+ * queries them with glfs_h_statfs(). Unlike svs_stat() there is no special
+ * case for the entry point directory in this function.
+ *
+ * Always returns 0; the outcome is reported through op_ret/op_errno in the
+ * unwind, which happens on every path.
+ */
+int32_t
+svs_statfs(call_frame_t *frame, xlator_t *this, loc_t *loc, dict_t *xdata)
+{
+    struct statvfs buf = {
+        0,
+    };
+    int32_t op_errno = EINVAL;
+    int32_t op_ret = -1;
+    svs_inode_t *inode_ctx = NULL;
+    glfs_t *fs = NULL;
+    glfs_object_t *object = NULL;
+    int ret = -1;
+    call_stack_t *root = NULL;
+
+    GF_VALIDATE_OR_GOTO("snap-view-daemon", this, out);
+    GF_VALIDATE_OR_GOTO(this->name, frame, out);
+    GF_VALIDATE_OR_GOTO(this->name, loc, out);
+    GF_VALIDATE_OR_GOTO(this->name, loc->inode, out);
+
+    /* Apply the caller's uid/gid/groups before touching the snapshot */
+    root = frame->root;
+    op_ret = gf_setcredentials(&root->uid, &root->gid, root->ngrps,
+                               root->groups);
+    if (op_ret != 0) {
+        goto out;
+    }
+
+    /* Instead of doing the check of whether it is a entry point directory
+       or not by checking the name of the entry and then deciding what
+       to do, just check the inode context and decide what to be done.
+    */
+    inode_ctx = svs_inode_ctx_get(this, loc->inode);
+    if (!inode_ctx) {
+        op_ret = -1;
+        op_errno = EINVAL;
+        gf_msg(this->name, GF_LOG_ERROR, op_errno,
+               SVS_MSG_GET_INODE_CONTEXT_FAILED,
+               "inode context not found for %s", uuid_utoa(loc->inode->gfid));
+        goto out;
+    }
+
+    /* Jumps to 'out' on its own when fs/object cannot be obtained */
+    SVS_GET_INODE_CTX_INFO(inode_ctx, fs, object, this, loc, op_ret, op_errno,
+                           out);
+
+    ret = glfs_h_statfs(fs, object, &buf);
+    if (ret) {
+        op_ret = -1;
+        op_errno = errno;
+        /* The log names the function that is actually called above */
+        gf_msg(this->name, GF_LOG_ERROR, op_errno, SVS_MSG_STATFS_FAILED,
+               "glfs_h_statfs on %s (gfid: %s) "
+               "failed",
+               loc->name, uuid_utoa(loc->inode->gfid));
+        goto out;
+    }
+    op_ret = ret;
+
+out:
+    STACK_UNWIND_STRICT(statfs, frame, op_ret, op_errno, &buf, xdata);
+    return 0;
+}
+
+/*
+ * svs_open - open fop of snapview-server.
+ *
+ * Opens the object behind loc->inode with glfs_h_open() and stores the
+ * resulting glfs fd in the fd context of @fd. The entry point directory
+ * must never reach this fop (it is opened with opendir); such a request is
+ * asserted on and then failed with EINVAL instead of being passed on.
+ *
+ * Always returns 0; the outcome is reported through op_ret/op_errno in the
+ * unwind, which happens on every path.
+ */
+int32_t
+svs_open(call_frame_t *frame, xlator_t *this, loc_t *loc, int32_t flags,
+         fd_t *fd, dict_t *xdata)
+{
+    svs_inode_t *inode_ctx = NULL;
+    svs_fd_t *sfd = NULL;
+    int32_t op_ret = -1;
+    int32_t op_errno = EINVAL;
+    glfs_fd_t *glfd = NULL;
+    glfs_t *fs = NULL;
+    glfs_object_t *object = NULL;
+    call_stack_t *root = NULL;
+
+    GF_VALIDATE_OR_GOTO("snap-view-daemon", this, out);
+    GF_VALIDATE_OR_GOTO(this->name, frame, out);
+    GF_VALIDATE_OR_GOTO(this->name, fd, out);
+    GF_VALIDATE_OR_GOTO(this->name, loc, out);
+    GF_VALIDATE_OR_GOTO(this->name, loc->inode, out);
+
+    root = frame->root;
+
+    inode_ctx = svs_inode_ctx_get(this, loc->inode);
+    if (!inode_ctx) {
+        gf_msg(this->name, GF_LOG_ERROR, op_errno,
+               SVS_MSG_GET_INODE_CONTEXT_FAILED,
+               "inode context for %s (gfid: %s) "
+               "not found",
+               loc->name, uuid_utoa(loc->inode->gfid));
+        goto out;
+    }
+
+    if (inode_ctx->type == SNAP_VIEW_ENTRY_POINT_INODE) {
+        /* on entry point it should always be opendir */
+        GF_ASSERT(0);
+        /* If the assert does not abort, fail the request (op_ret is still
+           -1 and op_errno still EINVAL here) rather than trying to open
+           the entry point as a file. */
+        goto out;
+    }
+
+    /* Jumps to 'out' on its own when fs/object cannot be obtained */
+    SVS_GET_INODE_CTX_INFO(inode_ctx, fs, object, this, loc, op_ret, op_errno,
+                           out);
+
+    /* Apply the caller's uid/gid/groups before touching the snapshot */
+    op_ret = gf_setcredentials(&root->uid, &root->gid, root->ngrps,
+                               root->groups);
+    if (op_ret != 0) {
+        goto out;
+    }
+
+    glfd = glfs_h_open(fs, object, flags);
+    if (!glfd) {
+        op_ret = -1;
+        op_errno = errno;
+        gf_msg(this->name, GF_LOG_ERROR, op_errno, SVS_MSG_OPEN_FAILED,
+               "glfs_h_open on %s failed (gfid: %s)", loc->name,
+               uuid_utoa(loc->inode->gfid));
+        goto out;
+    }
+
+    sfd = svs_fd_ctx_get_or_new(this, fd);
+    if (!sfd) {
+        op_ret = -1;
+        op_errno = ENOMEM;
+        gf_msg(this->name, GF_LOG_ERROR, op_errno, SVS_MSG_NO_MEMORY,
+               "failed to allocate fd context "
+               "for %s (gfid: %s)",
+               loc->name, uuid_utoa(loc->inode->gfid));
+        /* Nobody owns the freshly opened glfs fd yet, so close it here */
+        glfs_close(glfd);
+        goto out;
+    }
+    sfd->fd = glfd;
+
+    op_ret = 0;
+
+out:
+    STACK_UNWIND_STRICT(open, frame, op_ret, op_errno, fd, NULL);
+    return 0;
+}
+
+/*
+ * svs_readv - readv fop of snapview-server.
+ *
+ * Reads up to @size bytes at @offset from the glfs fd kept in the fd
+ * context into a freshly allocated iobuf and unwinds it as a single iovec.
+ * On success op_ret is the number of bytes read; op_errno is ENOENT when
+ * the read reached the end of the file (see the EOF hack below) and 0
+ * otherwise. @flags and @xdata are not used.
+ *
+ * Always returns 0; the outcome is reported through op_ret/op_errno in the
+ * unwind, which happens on every path.
+ */
+int32_t
+svs_readv(call_frame_t *frame, xlator_t *this, fd_t *fd, size_t size,
+          off_t offset, uint32_t flags, dict_t *xdata)
+{
+    int32_t op_ret = -1;
+    /* Same default as the other fops, so that early failures never unwind
+       with op_ret == -1 and op_errno == 0. */
+    int32_t op_errno = EINVAL;
+    svs_private_t *priv = NULL;
+    struct iobuf *iobuf = NULL;
+    struct iobref *iobref = NULL;
+    struct iovec vec = {
+        0,
+    };
+    svs_fd_t *sfd = NULL;
+    int ret = -1;
+    struct glfs_stat fstatbuf = {
+        0,
+    };
+    glfs_fd_t *glfd = NULL;
+    struct iatt stbuf = {
+        0,
+    };
+    call_stack_t *root = NULL;
+
+    GF_VALIDATE_OR_GOTO("snap-view-daemon", this, out);
+    GF_VALIDATE_OR_GOTO(this->name, frame, out);
+    GF_VALIDATE_OR_GOTO(this->name, fd, out);
+    GF_VALIDATE_OR_GOTO(this->name, fd->inode, out);
+
+    priv = this->private;
+    VALIDATE_OR_GOTO(priv, out);
+
+    /* Apply the caller's uid/gid/groups before touching the snapshot */
+    root = frame->root;
+    op_ret = gf_setcredentials(&root->uid, &root->gid, root->ngrps,
+                               root->groups);
+    if (op_ret != 0) {
+        goto out;
+    }
+
+    if (!svs_inode_glfs_mapping(this, fd->inode)) {
+        op_ret = -1;
+        op_errno = EBADF; /* should this be some other error? */
+        gf_msg(this->name, GF_LOG_ERROR, op_errno, SVS_MSG_FS_INSTANCE_INVALID,
+               "glfs instance to which the inode "
+               "%s receiving read request belongs, "
+               "does not exist anymore",
+               uuid_utoa(fd->inode->gfid));
+        goto out;
+    }
+
+    sfd = svs_fd_ctx_get_or_new(this, fd);
+    if (!sfd) {
+        op_ret = -1;
+        op_errno = EBADFD;
+        /* Same message id as svs_fstat() uses for this failure */
+        gf_msg(this->name, GF_LOG_ERROR, op_errno,
+               SVS_MSG_GET_FD_CONTEXT_FAILED,
+               "failed to get the fd "
+               "context for %s",
+               uuid_utoa(fd->inode->gfid));
+        goto out;
+    }
+
+    glfd = sfd->fd;
+
+    iobuf = iobuf_get2(this->ctx->iobuf_pool, size);
+    if (!iobuf) {
+        op_ret = -1;
+        op_errno = ENOMEM;
+        gf_msg(this->name, GF_LOG_ERROR, op_errno, SVS_MSG_NO_MEMORY,
+               "failed to "
+               "allocate iobuf while reading the "
+               "file with gfid %s",
+               uuid_utoa(fd->inode->gfid));
+        goto out;
+    }
+
+    ret = glfs_pread(glfd, iobuf->ptr, size, offset, 0, &fstatbuf);
+    if (ret < 0) {
+        op_ret = -1;
+        op_errno = errno;
+        gf_msg(this->name, GF_LOG_ERROR, op_errno, SVS_MSG_READ_FAILED,
+               "glfs_read failed on %s (%s)", uuid_utoa(fd->inode->gfid),
+               strerror(op_errno));
+        goto out;
+    }
+
+    /* The reply must carry an iobref holding the iobuf; without it the
+       data buffer is released below while the reply still points at it. */
+    iobref = iobref_new();
+    if (!iobref) {
+        op_ret = -1;
+        op_errno = ENOMEM;
+        gf_msg(this->name, GF_LOG_ERROR, op_errno, SVS_MSG_NO_MEMORY,
+               "failed to allocate iobref while reading the "
+               "file with gfid %s",
+               uuid_utoa(fd->inode->gfid));
+        goto out;
+    }
+
+    if (iobref_add(iobref, iobuf) != 0) {
+        op_ret = -1;
+        op_errno = ENOMEM;
+        gf_msg(this->name, GF_LOG_ERROR, op_errno, SVS_MSG_NO_MEMORY,
+               "failed to add the iobuf to the iobref while reading "
+               "the file with gfid %s",
+               uuid_utoa(fd->inode->gfid));
+        goto out;
+    }
+
+    vec.iov_base = iobuf->ptr;
+    vec.iov_len = ret;
+
+    /* Report the gfid known to this inode table, not the snapshot's one */
+    glfs_iatt_from_statx(&stbuf, &fstatbuf);
+    gf_uuid_copy(stbuf.ia_gfid, fd->inode->gfid);
+    svs_fill_ino_from_gfid(&stbuf);
+
+    op_errno = 0;
+    /* Hack to notify higher layers of EOF. */
+    if (!stbuf.ia_size || (offset + vec.iov_len) >= stbuf.ia_size)
+        op_errno = ENOENT;
+
+    op_ret = vec.iov_len;
+
+out:
+
+    STACK_UNWIND_STRICT(readv, frame, op_ret, op_errno, &vec, 1, &stbuf, iobref,
+                        NULL);
+
+    /* Drop the local references only after the reply has been unwound */
+    if (iobref)
+        iobref_unref(iobref);
+    if (iobuf)
+        iobuf_unref(iobuf);
+
+    return 0;
+}
+
+/*
+ * svs_readlink - readlink fop of snapview-server.
+ *
+ * Fetches the attributes of the link with glfs_h_stat() and its target with
+ * glfs_h_readlink(), through the fs instance and object handle kept in the
+ * inode context. On success op_ret is the length of the target and the
+ * unwound buffer is NUL terminated.
+ *
+ * Always returns 0; the outcome is reported through op_ret/op_errno in the
+ * unwind, which happens on every path (with a NULL buffer when the failure
+ * happens before the buffer is allocated).
+ */
+int32_t
+svs_readlink(call_frame_t *frame, xlator_t *this, loc_t *loc, size_t size,
+             dict_t *xdata)
+{
+    svs_inode_t *inode_ctx = NULL;
+    glfs_t *fs = NULL;
+    glfs_object_t *object = NULL;
+    int op_ret = -1;
+    int op_errno = EINVAL;
+    char *buf = NULL;
+    struct iatt stbuf = {
+        0,
+    };
+    int ret = -1;
+    struct stat stat = {
+        0,
+    };
+    call_stack_t *root = NULL;
+
+    GF_VALIDATE_OR_GOTO("snap-view-daemon", this, out);
+    GF_VALIDATE_OR_GOTO(this->name, frame, out);
+    GF_VALIDATE_OR_GOTO(this->name, loc, out);
+    GF_VALIDATE_OR_GOTO(this->name, loc->inode, out);
+
+    /* Apply the caller's uid/gid/groups before touching the snapshot */
+    root = frame->root;
+    op_ret = gf_setcredentials(&root->uid, &root->gid, root->ngrps,
+                               root->groups);
+    if (op_ret != 0) {
+        goto out;
+    }
+
+    inode_ctx = svs_inode_ctx_get(this, loc->inode);
+    if (!inode_ctx) {
+        op_ret = -1;
+        op_errno = EINVAL;
+        gf_msg(this->name, GF_LOG_ERROR, op_errno,
+               SVS_MSG_GET_INODE_CONTEXT_FAILED,
+               "failed to get inode context "
+               "for %s (gfid: %s)",
+               loc->name, uuid_utoa(loc->inode->gfid));
+        goto out;
+    }
+
+    /* Jumps to 'out' on its own when fs/object cannot be obtained */
+    SVS_GET_INODE_CTX_INFO(inode_ctx, fs, object, this, loc, op_ret, op_errno,
+                           out);
+
+    ret = glfs_h_stat(fs, object, &stat);
+    if (ret) {
+        op_ret = -1;
+        op_errno = errno;
+        gf_msg(this->name, GF_LOG_ERROR, op_errno, SVS_MSG_STAT_FAILED,
+               "glfs_h_stat on %s (gfid: %s) "
+               "failed",
+               loc->name, uuid_utoa(loc->inode->gfid));
+        goto out;
+    }
+
+    /* Report the gfid known to this inode table, not the snapshot's one */
+    iatt_from_stat(&stbuf, &stat);
+    gf_uuid_copy(stbuf.ia_gfid, loc->inode->gfid);
+    svs_fill_ino_from_gfid(&stbuf);
+
+    /* One extra byte for the terminating NUL written below.
+       NOTE(review): size comes from the request and is placed on the
+       stack unbounded - confirm that callers cap it, or move this to the
+       heap. */
+    buf = alloca(size + 1);
+    op_ret = glfs_h_readlink(fs, object, buf, size);
+    /* Treat every negative value as failure so that buf[] is never
+       indexed with a negative length. */
+    if (op_ret < 0) {
+        op_ret = -1;
+        op_errno = errno;
+        gf_msg(this->name, GF_LOG_ERROR, op_errno, SVS_MSG_READLINK_FAILED,
+               "readlink on %s failed (gfid: %s)", loc->name,
+               uuid_utoa(loc->inode->gfid));
+        goto out;
+    }
+
+    buf[op_ret] = 0;
+
+out:
+    STACK_UNWIND_STRICT(readlink, frame, op_ret, op_errno, buf, &stbuf, NULL);
+
+    return 0;
+}
+
+/*
+ * svs_access - access fop of snapview-server.
+ *
+ * The entry point directory is answered locally: success with op_errno 0
+ * for fuse callers, success with op_errno carrying the read and execute
+ * ACL bits for other callers. Every other inode is checked with
+ * glfs_h_access() through the fs instance and object handle kept in the
+ * inode context.
+ *
+ * Always returns 0; the outcome is reported through op_ret/op_errno in the
+ * unwind, which happens on every path.
+ */
+int32_t
+svs_access(call_frame_t *frame, xlator_t *this, loc_t *loc, int mask,
+           dict_t *xdata)
+{
+    int32_t op_ret = -1;
+    int32_t op_errno = EINVAL;
+    int rc = -1;
+    gf_boolean_t from_fuse = 0;
+    call_stack_t *stack = NULL;
+    svs_inode_t *inode_ctx = NULL;
+    glfs_t *fs = NULL;
+    glfs_object_t *object = NULL;
+
+    GF_VALIDATE_OR_GOTO("svs", this, out);
+    GF_VALIDATE_OR_GOTO(this->name, this->private, out);
+    GF_VALIDATE_OR_GOTO(this->name, frame, out);
+    GF_VALIDATE_OR_GOTO(this->name, loc, out);
+    GF_VALIDATE_OR_GOTO(this->name, loc->inode, out);
+
+    /* Apply the caller's uid/gid/groups before touching the snapshot */
+    stack = frame->root;
+    op_ret = gf_setcredentials(&stack->uid, &stack->gid, stack->ngrps,
+                               stack->groups);
+    if (op_ret != 0)
+        goto out;
+
+    inode_ctx = svs_inode_ctx_get(this, loc->inode);
+    if (!inode_ctx) {
+        op_ret = -1;
+        op_errno = EINVAL;
+        gf_msg(this->name, GF_LOG_ERROR, op_errno,
+               SVS_MSG_GET_INODE_CONTEXT_FAILED,
+               "inode context not found for %s", uuid_utoa(loc->inode->gfid));
+        goto out;
+    }
+
+    from_fuse = __is_fuse_call(frame);
+
+    /*
+     * The entry-point directory grants read and execute, never write.
+     * Non-fuse callers get those bits back through op_errno.
+     */
+    if (inode_ctx->type == SNAP_VIEW_ENTRY_POINT_INODE) {
+        op_ret = 0;
+        if (from_fuse)
+            op_errno = 0;
+        else
+            op_errno = POSIX_ACL_READ | POSIX_ACL_EXECUTE;
+        goto out;
+    }
+
+    /* Jumps to 'out' on its own when fs/object cannot be obtained */
+    SVS_GET_INODE_CTX_INFO(inode_ctx, fs, object, this, loc, op_ret, op_errno,
+                           out);
+
+    /* The actual posix_acl xlator does acl checks differently for
+       fuse and nfs. So hand the pid, uid, gid and groups of the call
+       stack to the syncop context if the call did not come from fuse.
+    */
+    if (!from_fuse) {
+        syncopctx_setfspid(&stack->pid);
+        syncopctx_setfsuid(&stack->uid);
+        syncopctx_setfsgid(&stack->gid);
+        syncopctx_setfsgroups(stack->ngrps, stack->groups);
+    }
+
+    rc = glfs_h_access(fs, object, mask);
+    if (rc < 0) {
+        op_ret = -1;
+        op_errno = errno;
+        gf_msg(this->name, GF_LOG_ERROR, op_errno, SVS_MSG_ACCESS_FAILED,
+               "failed to access %s (gfid: %s)", loc->path,
+               uuid_utoa(loc->inode->gfid));
+        goto out;
+    }
+
+    op_ret = 0;
+    op_errno = rc;
+
+out:
+
+    STACK_UNWIND_STRICT(access, frame, op_ret, op_errno, NULL);
+    return 0;
+}
+
+/*
+ * notify - event handler of the snapview-server xlator.
+ *
+ * GF_EVENT_PARENT_UP is answered by sending GF_EVENT_CHILD_UP through
+ * default_notify(); every other event is ignored. Always returns 0.
+ */
+int32_t
+notify(xlator_t *this, int32_t event, void *data, ...)
+{
+    /* Tell the parent that snapview-server xlator is up */
+    if (event == GF_EVENT_PARENT_UP)
+        default_notify(this, GF_EVENT_CHILD_UP, data);
+
+    return 0;
+}
+
+/*
+ * mem_acct_init - set up memory accounting for this xlator with
+ * gf_svs_mt_end + 1 memory types.
+ *
+ * Returns -1 when @this is NULL, otherwise the result of
+ * xlator_mem_acct_init() (a warning is logged when it is non-zero).
+ */
+int32_t
+mem_acct_init(xlator_t *this)
+{
+    int status = -1;
+
+    if (this == NULL)
+        return status;
+
+    status = xlator_mem_acct_init(this, gf_svs_mt_end + 1);
+    if (status != 0)
+        gf_msg(this->name, GF_LOG_WARNING, 0, SVS_MSG_MEM_ACNT_FAILED,
+               "Memory accounting init failed");
+
+    return status;
+}
+
+/*
+ * init - xlator initialisation of snapview-server.
+ *
+ * Allocates the private structure, reads the "volname" option, sets up the
+ * mgmt rpc callback with svs_mgmt_init() and fetches the initial list of
+ * snapshots with svs_get_snapshot_list().
+ *
+ * Returns 0 on success. On failure everything acquired here is released
+ * again and this->private is reset to NULL, so that no later code sees a
+ * pointer to freed memory.
+ */
+int32_t
+init(xlator_t *this)
+{
+    svs_private_t *priv = NULL;
+    int ret = -1;
+
+    /* This can be the top of graph in certain cases */
+    if (!this->parents) {
+        gf_msg_debug(this->name, 0, "dangling volume. check volfile ");
+    }
+
+    priv = GF_CALLOC(1, sizeof(*priv), gf_svs_mt_priv_t);
+    if (!priv) {
+        gf_msg(this->name, GF_LOG_ERROR, ENOMEM, SVS_MSG_NO_MEMORY,
+               "failed to "
+               "allocate memory for this->private ");
+        goto out;
+    }
+
+    /* Initialise the lock before anything that can jump to 'out', so that
+       the failure path never destroys an uninitialised lock. */
+    LOCK_INIT(&priv->snaplist_lock);
+
+    this->private = priv;
+
+    GF_OPTION_INIT("volname", priv->volname, str, out);
+
+    LOCK(&priv->snaplist_lock);
+    {
+        priv->num_snaps = 0;
+    }
+    UNLOCK(&priv->snaplist_lock);
+
+    /* What to do here upon failure? should init be failed or succeed? */
+    /* If succeeded, then dynamic management of snapshots will not */
+    /* happen.*/
+    ret = svs_mgmt_init(this);
+    if (ret) {
+        gf_msg(this->name, GF_LOG_WARNING, EINVAL, SVS_MSG_MGMT_INIT_FAILED,
+               "failed to initiate the "
+               "mgmt rpc callback for svs. Dynamic management of the "
+               "snapshots will not happen");
+        goto out;
+    }
+
+    /* get the list of snaps first to return to client xlator */
+    ret = svs_get_snapshot_list(this);
+    if (ret) {
+        gf_msg(this->name, GF_LOG_ERROR, EINVAL,
+               SVS_MSG_GET_SNAPSHOT_LIST_FAILED,
+               "Error initializing snaplist infrastructure");
+        /* The mgmt rpc was set up successfully above; release it the
+           same way fini() does, as fini() will not see this priv. */
+        if (priv->rpc) {
+            rpc_clnt_connection_cleanup(&priv->rpc->conn);
+            rpc_clnt_unref(priv->rpc);
+            priv->rpc = NULL;
+        }
+        ret = -1;
+        goto out;
+    }
+
+    ret = 0;
+
+out:
+    if (ret && priv) {
+        LOCK_DESTROY(&priv->snaplist_lock);
+        GF_FREE(priv->dirents);
+        GF_FREE(priv);
+        /* Do not leave a dangling pointer behind for fini() or others */
+        this->private = NULL;
+    }
+
+    return ret;
+}
+
+/*
+ * fini - xlator teardown of snapview-server.
+ *
+ * Detaches the private structure from the xlator and releases what it
+ * holds: the snaplist lock, the dirents array and the mgmt rpc client.
+ * Doing nothing more than a log is all that happens when this->ctx is NULL.
+ */
+void
+fini(xlator_t *this)
+{
+    svs_private_t *priv = NULL;
+
+    GF_ASSERT(this);
+
+    /* Unhook the private data first so nothing reaches it via the xlator */
+    priv = this->private;
+    this->private = NULL;
+
+    if (this->ctx == NULL)
+        gf_msg(this->name, GF_LOG_ERROR, 0, SVS_MSG_INVALID_GLFS_CTX,
+               "Invalid ctx found");
+
+    if (priv == NULL)
+        return;
+
+    if (LOCK_DESTROY(&priv->snaplist_lock) != 0)
+        gf_msg(this->name, GF_LOG_WARNING, errno, SVS_MSG_LOCK_DESTROY_FAILED,
+               "Could not destroy mutex snaplist_lock");
+
+    if (priv->dirents)
+        GF_FREE(priv->dirents);
+
+    if (priv->rpc) {
+        /* cleanup the saved-frames before last unref */
+        rpc_clnt_connection_cleanup(&priv->rpc->conn);
+        rpc_clnt_unref(priv->rpc);
+    }
+
+    GF_FREE(priv);
+}
+
+/*
+ * File operations implemented by snapview-server. Only lookup, attribute,
+ * directory-listing and read style operations are wired up here; no
+ * modifying fop (writev, create, unlink, ...) appears in this table.
+ */
+struct xlator_fops fops = {
+    .lookup = svs_lookup,
+    .stat = svs_stat,
+    .statfs = svs_statfs,
+    .opendir = svs_opendir,
+    .readdirp = svs_readdirp,
+    .readdir = svs_readdir,
+    .open = svs_open,
+    .readv = svs_readv,
+    .flush = svs_flush,
+    .fstat = svs_fstat,
+    .getxattr = svs_getxattr,
+    .access = svs_access,
+    .readlink = svs_readlink,
+    /* entry fops */
+};
+
+/*
+ * Callbacks of snapview-server for fd release (files and directories) and
+ * inode forget. NOTE(review): presumably these free the svs fd and inode
+ * contexts - confirm against svs_release/svs_releasedir/svs_forget.
+ */
+struct xlator_cbks cbks = {
+    .release = svs_release,
+    .releasedir = svs_releasedir,
+    .forget = svs_forget,
+};
+
+/*
+ * Options accepted by snapview-server. "volname" is a string option that
+ * init() reads into priv->volname. The entry with a NULL key is presumably
+ * the sentinel terminating the table.
+ */
+struct volume_options options[] = {
+    {
+        .key = {"volname"},
+        .type = GF_OPTION_TYPE_STR,
+    },
+    {.key = {NULL}},
+};
+
+/*
+ * Descriptor that ties together the entry points, fop and callback tables
+ * and the option table defined in this file under the identifier
+ * "snapview-server".
+ */
+xlator_api_t xlator_api = {
+    .init = init,
+    .fini = fini,
+    .notify = notify,
+    .mem_acct_init = mem_acct_init,
+    .op_version = {1},
+    .fops = &fops,
+    .cbks = &cbks,
+    .options = options,
+    .identifier = "snapview-server",
+    .category = GF_MAINTAINED,
+};
diff --git a/xlators/features/snapview-server/src/snapview-server.h b/xlators/features/snapview-server/src/snapview-server.h
new file mode 100644
index 00000000000..6472422e715
--- /dev/null
+++ b/xlators/features/snapview-server/src/snapview-server.h
@@ -0,0 +1,255 @@
+/*
+   Copyright (c) 2014 Red Hat, Inc. <http://www.redhat.com>
+   This file is part of GlusterFS.
+
+   This file is licensed to you under your choice of the GNU Lesser
+   General Public License, version 3 or any later version (LGPLv3 or
+   later), or the GNU General Public License, version 2 (GPLv2), in all
+   cases as published by the Free Software Foundation.
+*/
+#ifndef __SNAP_VIEW_H__
+#define __SNAP_VIEW_H__
+
+#include <glusterfs/dict.h>
+#include <glusterfs/defaults.h>
+#include <glusterfs/mem-types.h>
+#include <glusterfs/call-stub.h>
+#include <glusterfs/byte-order.h>
+#include <glusterfs/iatt.h>
+#include <ctype.h>
+#include <sys/uio.h>
+#include <glusterfs/glusterfs.h>
+#include <glusterfs/logging.h>
+#include "glfs.h"
+#include "glfs-handles.h"
+#include "glfs-internal.h"
+#include "glusterfs3-xdr.h"
+#include <glusterfs/glusterfs-acl.h>
+#include <glusterfs/syncop.h>
+#include <glusterfs/list.h>
+#include <glusterfs/timer.h>
+#include "rpc-clnt.h"
+#include "protocol-common.h"
+#include "xdr-generic.h"
+#include "snapview-server-messages.h"
+
+#define DEFAULT_SVD_LOG_FILE_DIRECTORY DATADIR "/log/glusterfs"
+
+#define SNAP_VIEW_MAX_GLFS_T 256
+#define SNAP_VIEW_MAX_GLFS_FDS 1024
+#define SNAP_VIEW_MAX_GLFS_OBJ_HANDLES 1024
+
+#define SVS_STACK_DESTROY(_frame)                                              \
+    do { /* NULL out frame->local, then destroy frame->root's stack */         \
+        ((call_frame_t *)_frame)->local = NULL;                                \
+        STACK_DESTROY(((call_frame_t *)_frame)->root);                         \
+    } while (0)
+
+#define SVS_CHECK_VALID_SNAPSHOT_HANDLE(fs, this)                              \
+    do { /* keep fs only if it is the fs of an entry in _private->dirents */   \
+        svs_private_t *_private = NULL;                                        \
+        _private = this->private;                                              \
+        int i = 0;                                                             \
+        gf_boolean_t found = _gf_false;                                        \
+        glfs_t *tmp_fs = NULL;                                                 \
+        LOCK(&_private->snaplist_lock);                                        \
+        {                                                                      \
+            for (i = 0; i < _private->num_snaps; i++) {                        \
+                tmp_fs = _private->dirents[i].fs;                              \
+                gf_log(this->name, GF_LOG_DEBUG,                               \
+                       "snap name: %s, snap volume: %s,"                       \
+                       "dirent->fs: %p",                                       \
+                       _private->dirents[i].name,                              \
+                       _private->dirents[i].snap_volname, tmp_fs);             \
+                if (tmp_fs && fs && (tmp_fs == fs)) {                          \
+                    found = _gf_true;                                          \
+                    gf_msg_debug(this->name, 0,                                \
+                                 "found the fs "                               \
+                                 "instance");                                  \
+                    break;                                                     \
+                }                                                              \
+            }                                                                  \
+        }                                                                      \
+        UNLOCK(&_private->snaplist_lock);                                      \
+        /* no dirent matched: log it and reset the caller's fs to NULL */      \
+        if (!found) {                                                          \
+            gf_log(this->name, GF_LOG_WARNING,                                 \
+                   "failed to"                                                 \
+                   " find the fs instance %p",                                 \
+                   fs);                                                        \
+            fs = NULL;                                                         \
+        }                                                                      \
+    } while (0)
+
+#define SVS_GET_INODE_CTX_INFO(inode_ctx, fs, object, this, loc, ret,          \
+                               op_errno, label)                                \
+    do { /* load fs/object from inode_ctx, re-fetching a stale handle */       \
+        fs = inode_ctx->fs;                                                    \
+        object = inode_ctx->object;                                            \
+        SVS_CHECK_VALID_SNAPSHOT_HANDLE(fs, this);                             \
+        if (!fs)                                                               \
+            object = NULL;                                                     \
+        /* handle missing or no longer valid: get one via svs_get_handle */    \
+        if (!fs || !object) {                                                  \
+            int32_t tmp = -1;                                                  \
+            char tmp_uuid[64];                                                 \
+                                                                               \
+            tmp = svs_get_handle(this, loc, inode_ctx, &op_errno);             \
+            if (tmp) {                                                         \
+                gf_log(this->name, GF_LOG_ERROR,                               \
+                       "failed to get the handle for %s "                      \
+                       "(gfid: %s)",                                           \
+                       loc->path, uuid_utoa_r(loc->inode->gfid, tmp_uuid));    \
+                ret = -1;                                                      \
+                goto label;                                                    \
+            }                                                                  \
+            /* re-read inode_ctx now that svs_get_handle() succeeded */        \
+            fs = inode_ctx->fs;                                                \
+            object = inode_ctx->object;                                        \
+        }                                                                      \
+    } while (0)
+
+#define SVS_STRDUP(dst, src)                                                   \
+    do { /* make dst a gf_strdup() copy of src unless it already equals it */  \
+        if (dst && strcmp(src, dst)) {                                         \
+            GF_FREE(dst);                                                      \
+            dst = NULL;                                                        \
+        }                                                                      \
+        /* dst is NULL here if it was unset or has just been freed */          \
+        if (!dst)                                                              \
+            dst = gf_strdup(src);                                              \
+    } while (0)
+
+int
+svs_mgmt_submit_request(void *req, call_frame_t *frame, glusterfs_ctx_t *ctx,
+                        rpc_clnt_prog_t *prog, int procnum, fop_cbk_fn_t cbkfn,
+                        xdrproc_t xdrproc);
+
+int
+svs_get_snapshot_list(xlator_t *this);
+
+int
+mgmt_get_snapinfo_cbk(struct rpc_req *req, struct iovec *iov, int count,
+                      void *myframe);
+
+typedef enum {
+    SNAP_VIEW_ENTRY_POINT_INODE = 0, /* see svs_inode.pargfid for this kind */
+    SNAP_VIEW_SNAPSHOT_INODE,
+    SNAP_VIEW_VIRTUAL_INODE
+} inode_type_t; /* kind tag kept in svs_inode.type */
+
+struct svs_inode {
+    glfs_t *fs; /* re-validated by SVS_GET_INODE_CTX_INFO before it is used */
+    glfs_object_t *object; /* set to NULL by that macro when fs is invalid */
+    inode_type_t type; /* one of the SNAP_VIEW_*_INODE values */
+
+    /* used only for entry point directory where gfid of the directory
+       from where the entry point was entered is saved.
+    */
+    uuid_t pargfid;
+
+    /* This is used to generate gfid for all sub files/dirs under this
+     * snapshot
+     */
+    char *snapname;
+    struct iatt buf; /* NOTE(review): presumably cached attributes - confirm */
+};
+typedef struct svs_inode svs_inode_t;
+
+struct svs_fd {
+    glfs_fd_t *fd; /* presumably the glfs fd behind an fd_t - TODO confirm */
+};
+typedef struct svs_fd svs_fd_t;
+
+struct snap_dirent {
+    char name[NAME_MAX]; /* snapshot name */
+    char uuid[UUID_CANONICAL_FORM_LEN + 1]; /* canonical UUID text plus NUL */
+    char snap_volname[NAME_MAX]; /* snapshot volume name */
+    glfs_t *fs; /* compared with an inode's fs to validate its handle */
+};
+typedef struct snap_dirent snap_dirent_t;
+
+struct svs_private {
+    snap_dirent_t *dirents; /* array holding num_snaps entries */
+    int num_snaps; /* number of valid entries in dirents */
+    char *volname; /* NOTE(review): presumably the "volname" option - confirm */
+    struct list_head snaplist; /* NOTE(review): no user visible in this header */
+    gf_lock_t snaplist_lock; /* held while dirents/num_snaps are walked */
+    struct rpc_clnt *rpc; /* connection cleaned up and unref'd in fini() */
+};
+typedef struct svs_private svs_private_t;
+
+int
+__svs_inode_ctx_set(xlator_t *this, inode_t *inode, svs_inode_t *svs_inode);
+
+svs_inode_t *
+__svs_inode_ctx_get(xlator_t *this, inode_t *inode);
+
+svs_inode_t *
+svs_inode_ctx_get(xlator_t *this, inode_t *inode);
+
+int32_t
+svs_inode_ctx_set(xlator_t *this, inode_t *inode, svs_inode_t *svs_inode);
+
+svs_inode_t *
+svs_inode_ctx_get_or_new(xlator_t *this, inode_t *inode);
+
+int
+__svs_fd_ctx_set(xlator_t *this, fd_t *fd, svs_fd_t *svs_fd);
+
+svs_fd_t *
+__svs_fd_ctx_get(xlator_t *this, fd_t *fd);
+
+svs_fd_t *
+svs_fd_ctx_get(xlator_t *this, fd_t *fd);
+
+int32_t
+svs_fd_ctx_set(xlator_t *this, fd_t *fd, svs_fd_t *svs_fd);
+
+svs_fd_t *
+__svs_fd_ctx_get_or_new(xlator_t *this, fd_t *fd);
+
+svs_fd_t *
+svs_fd_ctx_get_or_new(xlator_t *this, fd_t *fd);
+
+int
+svs_uuid_generate(xlator_t *this, uuid_t gfid, char *snapname,
+                  uuid_t origin_gfid);
+
+void
+svs_fill_ino_from_gfid(struct iatt *buf);
+
+void
+svs_iatt_fill(uuid_t gfid, struct iatt *buf);
+
+snap_dirent_t *
+svs_get_latest_snap_entry(xlator_t *this);
+
+glfs_t *
+svs_get_latest_snapshot(xlator_t *this);
+
+glfs_t *
+svs_initialise_snapshot_volume(xlator_t *this, const char *name,
+                               int32_t *op_errno);
+
+glfs_t *
+__svs_initialise_snapshot_volume(xlator_t *this, const char *name,
+                                 int32_t *op_errno);
+
+snap_dirent_t *
+__svs_get_snap_dirent(xlator_t *this, const char *name);
+
+int
+svs_mgmt_init(xlator_t *this);
+
+int32_t
+svs_get_handle(xlator_t *this, loc_t *loc, svs_inode_t *inode_ctx,
+               int32_t *op_errno);
+
+glfs_t *
+svs_inode_glfs_mapping(xlator_t *this, inode_t *inode);
+
+glfs_t *
+svs_inode_ctx_glfs_mapping(xlator_t *this, svs_inode_t *inode_ctx);
+
+#endif /* __SNAP_VIEW_H__ */
diff --git a/xlators/features/thin-arbiter/Makefile.am b/xlators/features/thin-arbiter/Makefile.am
new file mode 100644
index 00000000000..a985f42a877
--- /dev/null
+++ b/xlators/features/thin-arbiter/Makefile.am
@@ -0,0 +1,3 @@
+SUBDIRS = src
+
+CLEANFILES =
diff --git a/xlators/features/thin-arbiter/src/Makefile.am b/xlators/features/thin-arbiter/src/Makefile.am
new file mode 100644
index 00000000000..a3c133e7798
--- /dev/null
+++ b/xlators/features/thin-arbiter/src/Makefile.am
@@ -0,0 +1,22 @@
+xlator_LTLIBRARIES = thin-arbiter.la
+
+xlatordir = $(libdir)/glusterfs/$(PACKAGE_VERSION)/xlator/features
+
+thin_arbiter_la_LDFLAGS = -module $(GF_XLATOR_DEFAULT_LDFLAGS)
+
+thin_arbiter_la_SOURCES = thin-arbiter.c \
+    $(top_builddir)/xlators/lib/src/libxlator.c
+
+thin_arbiter_la_LIBADD = $(top_builddir)/libglusterfs/src/libglusterfs.la
+
+noinst_HEADERS = thin-arbiter.h thin-arbiter-mem-types.h thin-arbiter-messages.h \
+    $(top_builddir)/xlators/lib/src/libxlator.h
+
+AM_CPPFLAGS = $(GF_CPPFLAGS) \
+    -I$(top_srcdir)/libglusterfs/src -I$(top_srcdir)/xlators/lib/src \
+    -I$(top_srcdir)/rpc/rpc-lib/src \
+    -I$(top_srcdir)/rpc/xdr/src -I$(top_builddir)/rpc/xdr/src
+
+AM_CFLAGS = -Wall $(GF_CFLAGS)
+
+CLEANFILES =
diff --git a/xlators/features/thin-arbiter/src/thin-arbiter-mem-types.h b/xlators/features/thin-arbiter/src/thin-arbiter-mem-types.h
new file mode 100644
index 00000000000..69562d2febc
--- /dev/null
+++ b/xlators/features/thin-arbiter/src/thin-arbiter-mem-types.h
@@ -0,0 +1,19 @@
+/*
+  Copyright (c) 2015 Red Hat, Inc. <http://www.redhat.com>
+  This file is part of GlusterFS.
+  This file is licensed to you under your choice of the GNU Lesser
+  General Public License, version 3 or any later version (LGPLv3 or
+  later), or the GNU General Public License, version 2 (GPLv2), in all
+  cases as published by the Free Software Foundation.
+*/
+
+#ifndef __THIN_ARBITER_MEM_TYPES_H__
+#define __THIN_ARBITER_MEM_TYPES_H__
+#include <glusterfs/mem-types.h>
+
+typedef enum gf_ta_mem_types_ {
+    gf_ta_mt_local_t = gf_common_mt_end + 1, /* ta_fop_t allocations */
+    gf_ta_mt_char, /* zero-filled xattr value buffers */
+    gf_ta_mt_end /* mem_acct_init() registers gf_ta_mt_end + 1 types */
+} gf_ta_mem_types_t;
+#endif
diff --git a/xlators/features/thin-arbiter/src/thin-arbiter-messages.h b/xlators/features/thin-arbiter/src/thin-arbiter-messages.h
new file mode 100644
index 00000000000..81d7491577a
--- /dev/null
+++ b/xlators/features/thin-arbiter/src/thin-arbiter-messages.h
@@ -0,0 +1,28 @@
+/*
+  Copyright (c) 2015 Red Hat, Inc. <http://www.redhat.com>
+  This file is part of GlusterFS.
+
+  This file is licensed to you under your choice of the GNU Lesser
+  General Public License, version 3 or any later version (LGPLv3 or
+  later), or the GNU General Public License, version 2 (GPLv2), in all
+  cases as published by the Free Software Foundation.
+*/
+
+#ifndef _TA_MESSAGES_H_
+#define _TA_MESSAGES_H_
+
+#include <glusterfs/glfs-message-id.h>
+
+/* To add new message IDs, append new identifiers at the end of the list.
+ *
+ * Never remove a message ID. If it's not used anymore, you can rename it or
+ * leave it as it is, but not delete it. This is to prevent reutilization of
+ * IDs by other messages.
+ *
+ * The component name must match one of the entries defined in
+ * glfs-message-id.h.
+ */
+
+GLFS_MSGID(TA, TA_MSG_INVALID_FOP);
+
+#endif /* !_TA_MESSAGES_H_ */
diff --git a/xlators/features/thin-arbiter/src/thin-arbiter.c b/xlators/features/thin-arbiter/src/thin-arbiter.c
new file mode 100644
index 00000000000..ce3008636f1
--- /dev/null
+++ b/xlators/features/thin-arbiter/src/thin-arbiter.c
@@ -0,0 +1,661 @@
+/*
+  Copyright (c) 2015 Red Hat, Inc. <http://www.redhat.com>
+  This file is part of GlusterFS.
+
+  This file is licensed to you under your choice of the GNU Lesser
+  General Public License, version 3 or any later version (LGPLv3 or
+  later), or the GNU General Public License, version 2 (GPLv2), in all
+  cases as published by the Free Software Foundation.
+*/
+
+#include "thin-arbiter.h"
+#include "thin-arbiter-messages.h"
+#include "thin-arbiter-mem-types.h"
+#include <glusterfs/glusterfs.h>
+#include <glusterfs/xlator.h>
+#include <glusterfs/logging.h>
+#include <glusterfs/byte-order.h>
+#include <glusterfs/common-utils.h>
+
+/* dict_foreach() callback: add key to fop->brick_xattr with a zero-filled
+ * value of value->len bytes. Returns 0 or a negative errno. */
+int
+ta_set_incoming_values(dict_t *dict, char *key, data_t *value, void *data)
+{
+    ta_fop_t *fop = (ta_fop_t *)data;
+    int32_t *pending = GF_CALLOC(1, value->len, gf_ta_mt_char);
+    int32_t ret = -ENOMEM;
+
+    if (pending) {
+        ret = dict_set_bin(fop->brick_xattr, key, pending, value->len);
+        if (ret < 0)
+            GF_FREE(pending); /* the dict did not take ownership */
+    }
+    return ret;
+}
+
+/* dict_foreach() callback for the brick's xattrop reply. Sets
+ * fop->on_disk[fop->idx] to 0 when both the brick value and the incoming
+ * value of key are all zeroes, to 1 otherwise, and advances fop->idx.
+ * Returns 0 on success or a negative errno. */
+int
+ta_get_incoming_and_brick_values(dict_t *dict, char *key, data_t *value,
+                                 void *data)
+{
+    ta_fop_t *fop = data;
+    char *source = NULL;
+    char *in_coming = NULL;
+    int32_t len = 0, ret = 0;
+    const int32_t slots = sizeof(fop->on_disk) / sizeof(fop->on_disk[0]);
+
+    /* on_disk[] is fixed-size: reject extra keys instead of overrunning it */
+    if (fop->idx < 0 || fop->idx >= slots)
+        return -EINVAL;
+
+    source = GF_CALLOC(1, value->len, gf_ta_mt_char); /* all-zero reference */
+    if (!source)
+        return -ENOMEM;
+
+    ret = dict_get_ptr_and_len(fop->dict, key, (void **)&in_coming, &len);
+    if (!in_coming || value->len != len) {
+        ret = -EINVAL;
+        goto out;
+    }
+
+    fop->on_disk[fop->idx++] = (memcmp(value->data, source, value->len) ||
+                                memcmp(in_coming, source, len));
+out:
+    GF_FREE(source);
+    return ret;
+}
+
+/* Release everything a ta_fop_t holds (fd ref, loc, both dict refs) and
+ * free the structure itself. A NULL fop is a no-op. */
+void
+ta_release_fop(ta_fop_t *fop)
+{
+    if (fop == NULL)
+        return;
+
+    if (fop->fd != NULL)
+        fd_unref(fop->fd);
+    loc_wipe(&fop->loc);
+
+    if (fop->dict != NULL)
+        dict_unref(fop->dict);
+
+    if (fop->brick_xattr != NULL)
+        dict_unref(fop->brick_xattr);
+
+    GF_FREE(fop);
+}
+
+int32_t
+ta_set_xattrop_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+                   int32_t op_ret, int32_t op_errno, dict_t *dict,
+                   dict_t *xdata)
+{
+    TA_STACK_UNWIND(xattrop, frame, op_ret, op_errno, dict, xdata);
+    return 0; /* the child's result was passed up unchanged */
+}
+
+/*
+case 1 - If brick value is 0 and incoming value is also 0, fine
+case 2 - If brick value is 0 and incoming value is non 0, fine
+case 3 - If brick value is non 0 and incoming value is also 0, fine
+case 4 - If brick value is non 0 and incoming value is non 0, fine
+case 5 - If incoming value is non zero for both bricks, it is wrong
+case 6 - If incoming value is non zero for one brick but the brick value
+for the other brick is also non zero, it is wrong
+*/
+
+/* Classify every key of dict with ta_get_incoming_and_brick_values().
+ * Returns 0 when at most one of the two on_disk slots got set, -EINVAL when
+ * both are set or fop is NULL, or the negative value dict_foreach() gave. */
+int32_t
+ta_verify_on_disk_source(ta_fop_t *fop, dict_t *dict)
+{
+    int rc = -EINVAL;
+
+    if (fop != NULL) {
+        rc = dict_foreach(dict, ta_get_incoming_and_brick_values, fop);
+        if (rc >= 0) {
+            /* both slots set is the combination that gets rejected */
+            rc = (fop->on_disk[0] && fop->on_disk[1]) ? -EINVAL : 0;
+        }
+    }
+
+    return rc;
+}
+
+/* Reply to the zero-valued (f)xattrop wound by ta_xattrop()/ta_fxattrop().
+ * When ta_verify_on_disk_source() accepts the brick's values, the caller's
+ * own dict is wound as the real (f)xattrop; otherwise the fop is failed. */
+int32_t
+ta_get_xattrop_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+                   int32_t op_ret, int32_t op_errno, dict_t *dict,
+                   dict_t *xdata)
+{
+    ta_fop_t *local = frame->local;
+    int verdict = 0;
+
+    if (op_ret)
+        goto unwind;
+
+    verdict = ta_verify_on_disk_source(local, dict);
+    if (verdict < 0) {
+        op_errno = -verdict;
+        goto unwind;
+    }
+
+    if (!local->fd) {
+        STACK_WIND(frame, ta_set_xattrop_cbk, FIRST_CHILD(this),
+                   FIRST_CHILD(this)->fops->xattrop, &local->loc,
+                   local->xattrop_flags, local->dict, NULL);
+    } else {
+        STACK_WIND(frame, ta_set_xattrop_cbk, FIRST_CHILD(this),
+                   FIRST_CHILD(this)->fops->fxattrop, local->fd,
+                   local->xattrop_flags, local->dict, NULL);
+    }
+    return 0;
+
+unwind:
+    TA_STACK_UNWIND(xattrop, frame, -1, op_errno, NULL, NULL);
+    return -1;
+}
+
+/* Build the ta_fop_t for one (f)xattrop: copy loc, take references on fd and
+ * dict, fill brick_xattr with a zero-filled value for every key of dict, and
+ * attach the result to frame->local. Returns NULL on any failure; nothing is
+ * attached to the frame in that case. */
+ta_fop_t *
+ta_prepare_fop(call_frame_t *frame, xlator_t *this, loc_t *loc, fd_t *fd,
+               gf_xattrop_flags_t flags, dict_t *dict, dict_t *xdata)
+{
+    ta_fop_t *fop = NULL;
+    int ret = 0;
+
+    fop = GF_CALLOC(1, sizeof(*fop), gf_ta_mt_local_t);
+    if (!fop)
+        return NULL;
+
+    /* loc_copy() reports failure via its return value; do not ignore it */
+    if (loc && loc_copy(&fop->loc, loc) < 0)
+        goto out;
+
+    if (fd)
+        fop->fd = fd_ref(fd);
+
+    fop->xattrop_flags = flags;
+    fop->idx = 0;
+
+    if (dict != NULL)
+        fop->dict = dict_ref(dict);
+
+    fop->brick_xattr = dict_new();
+    if (fop->brick_xattr == NULL)
+        goto out;
+    ret = dict_foreach(dict, ta_set_incoming_values, (void *)fop);
+    if (ret < 0)
+        goto out;
+    frame->local = fop;
+    return fop;
+
+out:
+    ta_release_fop(fop);
+    return NULL;
+}
+
+/* fxattrop entry point. The child first gets an fxattrop carrying
+ * brick_xattr, the zero-filled twin of dict built by ta_prepare_fop();
+ * ta_get_xattrop_cbk() inspects the reply and decides whether the caller's
+ * dict is then wound for real. */
+int32_t
+ta_fxattrop(call_frame_t *frame, xlator_t *this, fd_t *fd,
+            gf_xattrop_flags_t flags, dict_t *dict, dict_t *xdata)
+{
+    ta_fop_t *prep = NULL;
+
+    prep = ta_prepare_fop(frame, this, NULL, fd, flags, dict, xdata);
+    if (prep == NULL) {
+        /* any preparation failure is reported to the caller as ENOMEM */
+        TA_STACK_UNWIND(xattrop, frame, -1, ENOMEM, NULL, NULL);
+        return 0;
+    }
+
+    /* prep now lives in frame->local and is released when the fop unwinds */
+    STACK_WIND(frame, ta_get_xattrop_cbk, FIRST_CHILD(this),
+               FIRST_CHILD(this)->fops->fxattrop, fd, flags, prep->brick_xattr,
+               xdata);
+    return 0;
+}
+
+/* xattrop entry point. The child first gets an xattrop carrying
+ * brick_xattr, the zero-filled twin of dict built by ta_prepare_fop();
+ * ta_get_xattrop_cbk() inspects the reply and decides whether the caller's
+ * dict is then wound for real. */
+int32_t
+ta_xattrop(call_frame_t *frame, xlator_t *this, loc_t *loc,
+           gf_xattrop_flags_t flags, dict_t *dict, dict_t *xdata)
+{
+    ta_fop_t *prep = NULL;
+
+    prep = ta_prepare_fop(frame, this, loc, NULL, flags, dict, xdata);
+    if (prep == NULL) {
+        /* any preparation failure is reported to the caller as ENOMEM */
+        TA_STACK_UNWIND(xattrop, frame, -1, ENOMEM, NULL, NULL);
+        return 0;
+    }
+
+    /* prep now lives in frame->local and is released when the fop unwinds */
+    STACK_WIND(frame, ta_get_xattrop_cbk, FIRST_CHILD(this),
+               FIRST_CHILD(this)->fops->xattrop, loc, flags, prep->brick_xattr,
+               xdata);
+    return 0;
+}
+
+int32_t
+ta_writev(call_frame_t *frame, xlator_t *this, fd_t *fd, struct iovec *vector,
+          int32_t count, off_t off, uint32_t flags, struct iobref *iobref,
+          dict_t *xdata)
+{
+    TA_FAILED_FOP(writev, frame, EINVAL); /* always fail with EINVAL */
+    return 0;
+}
+
+int32_t
+ta_fsetxattr(call_frame_t *frame, xlator_t *this, fd_t *fd, dict_t *dict,
+             int32_t flags, dict_t *xdata)
+{
+    TA_FAILED_FOP(fsetxattr, frame, EINVAL); /* always fail with EINVAL */
+    return 0;
+}
+
+int32_t
+ta_setxattr(call_frame_t *frame, xlator_t *this, loc_t *loc, dict_t *dict,
+            int32_t flags, dict_t *xdata)
+{
+    TA_FAILED_FOP(setxattr, frame, EINVAL); /* always fail with EINVAL */
+    return 0;
+}
+
+int32_t
+ta_fallocate(call_frame_t *frame, xlator_t *this, fd_t *fd, int32_t keep_size,
+             off_t offset, size_t len, dict_t *xdata)
+{
+    TA_FAILED_FOP(fallocate, frame, EINVAL); /* always fail with EINVAL */
+    return 0;
+}
+
+int32_t
+ta_access(call_frame_t *frame, xlator_t *this, loc_t *loc, int32_t mask,
+          dict_t *xdata)
+{
+    TA_FAILED_FOP(access, frame, EINVAL); /* always fail with EINVAL */
+    return 0;
+}
+
+int32_t
+ta_discard(call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset,
+           size_t len, dict_t *xdata)
+{
+    TA_FAILED_FOP(discard, frame, EINVAL); /* always fail with EINVAL */
+    return 0;
+}
+
+int32_t
+ta_entrylk(call_frame_t *frame, xlator_t *this, const char *volume, loc_t *loc,
+           const char *basename, entrylk_cmd cmd, entrylk_type type,
+           dict_t *xdata)
+{
+    TA_FAILED_FOP(entrylk, frame, EINVAL); /* always fail with EINVAL */
+    return 0;
+}
+
+int32_t
+ta_fentrylk(call_frame_t *frame, xlator_t *this, const char *volume, fd_t *fd,
+            const char *basename, entrylk_cmd cmd, entrylk_type type,
+            dict_t *xdata)
+{
+    TA_FAILED_FOP(fentrylk, frame, EINVAL); /* always fail with EINVAL */
+    return 0;
+}
+
+int32_t
+ta_flush(call_frame_t *frame, xlator_t *this, fd_t *fd, dict_t *xdata)
+{
+    TA_FAILED_FOP(flush, frame, EINVAL); /* always fail with EINVAL */
+    return 0;
+}
+
+int32_t
+ta_fsync(call_frame_t *frame, xlator_t *this, fd_t *fd, int32_t datasync,
+         dict_t *xdata)
+{
+    TA_FAILED_FOP(fsync, frame, EINVAL); /* always fail with EINVAL */
+    return 0;
+}
+int32_t
+ta_fsyncdir(call_frame_t *frame, xlator_t *this, fd_t *fd, int32_t datasync,
+            dict_t *xdata)
+{
+    TA_FAILED_FOP(fsyncdir, frame, EINVAL); /* always fail with EINVAL */
+    return 0;
+}
+
+int32_t
+ta_getxattr(call_frame_t *frame, xlator_t *this, loc_t *loc, const char *name,
+            dict_t *xdata)
+{
+    TA_FAILED_FOP(getxattr, frame, EINVAL); /* always fail with EINVAL */
+    return 0;
+}
+
+int32_t
+ta_fgetxattr(call_frame_t *frame, xlator_t *this, fd_t *fd, const char *name,
+             dict_t *xdata)
+{
+    TA_FAILED_FOP(fgetxattr, frame, EINVAL); /* always fail with EINVAL */
+    return 0;
+}
+
+int32_t
+ta_link(call_frame_t *frame, xlator_t *this, loc_t *oldloc, loc_t *newloc,
+        dict_t *xdata)
+{
+    TA_FAILED_FOP(link, frame, EINVAL); /* always fail with EINVAL */
+    return 0;
+}
+
+int32_t
+ta_lk(call_frame_t *frame, xlator_t *this, fd_t *fd, int32_t cmd,
+      struct gf_flock *flock, dict_t *xdata)
+{
+    TA_FAILED_FOP(lk, frame, EINVAL); /* always fail with EINVAL */
+    return 0;
+}
+
+int32_t
+ta_mkdir(call_frame_t *frame, xlator_t *this, loc_t *loc, mode_t mode,
+         mode_t umask, dict_t *xdata)
+{
+    TA_FAILED_FOP(mkdir, frame, EINVAL); /* always fail with EINVAL */
+    return 0;
+}
+
+int32_t
+ta_mknod(call_frame_t *frame, xlator_t *this, loc_t *loc, mode_t mode,
+         dev_t rdev, mode_t umask, dict_t *xdata)
+{
+    TA_FAILED_FOP(mknod, frame, EINVAL); /* always fail with EINVAL */
+    return 0;
+}
+
+int32_t
+ta_open(call_frame_t *frame, xlator_t *this, loc_t *loc, int32_t flags,
+        fd_t *fd, dict_t *xdata)
+{
+    TA_FAILED_FOP(open, frame, EINVAL); /* always fail with EINVAL */
+    return 0;
+}
+
+int32_t
+ta_opendir(call_frame_t *frame, xlator_t *this, loc_t *loc, fd_t *fd,
+           dict_t *xdata)
+{
+    TA_FAILED_FOP(opendir, frame, EINVAL); /* always fail with EINVAL */
+    return 0;
+}
+
+int32_t
+ta_readdir(call_frame_t *frame, xlator_t *this, fd_t *fd, size_t size,
+           off_t offset, dict_t *xdata)
+{
+    TA_FAILED_FOP(readdir, frame, EINVAL); /* always fail with EINVAL */
+    return 0;
+}
+
+int32_t
+ta_readdirp(call_frame_t *frame, xlator_t *this, fd_t *fd, size_t size,
+            off_t offset, dict_t *xdata)
+{
+    TA_FAILED_FOP(readdirp, frame, EINVAL); /* always fail with EINVAL */
+    return 0;
+}
+
+int32_t
+ta_readlink(call_frame_t *frame, xlator_t *this, loc_t *loc, size_t size,
+            dict_t *xdata)
+{
+    TA_FAILED_FOP(readlink, frame, EINVAL); /* always fail with EINVAL */
+    return 0;
+}
+
+int32_t
+ta_readv(call_frame_t *frame, xlator_t *this, fd_t *fd, size_t size,
+         off_t offset, uint32_t flags, dict_t *xdata)
+{
+    TA_FAILED_FOP(readv, frame, EINVAL); /* always fail with EINVAL */
+    return 0;
+}
+
+int32_t
+ta_removexattr(call_frame_t *frame, xlator_t *this, loc_t *loc,
+               const char *name, dict_t *xdata)
+{
+    TA_FAILED_FOP(removexattr, frame, EINVAL); /* always fail with EINVAL */
+    return 0;
+}
+
+int32_t
+ta_fremovexattr(call_frame_t *frame, xlator_t *this, fd_t *fd, const char *name,
+                dict_t *xdata)
+{
+    TA_FAILED_FOP(fremovexattr, frame, EINVAL); /* always fail with EINVAL */
+    return 0;
+}
+
+int32_t
+ta_rename(call_frame_t *frame, xlator_t *this, loc_t *oldloc, loc_t *newloc,
+          dict_t *xdata)
+{
+    TA_FAILED_FOP(rename, frame, EINVAL); /* always fail with EINVAL */
+    return 0;
+}
+
+int32_t
+ta_rmdir(call_frame_t *frame, xlator_t *this, loc_t *loc, int xflags,
+         dict_t *xdata)
+{
+    TA_FAILED_FOP(rmdir, frame, EINVAL); /* always fail with EINVAL */
+    return 0;
+}
+
+int32_t
+ta_setattr(call_frame_t *frame, xlator_t *this, loc_t *loc, struct iatt *stbuf,
+           int32_t valid, dict_t *xdata)
+{
+    TA_FAILED_FOP(setattr, frame, EINVAL); /* always fail with EINVAL */
+    return 0;
+}
+
+int32_t
+ta_fsetattr(call_frame_t *frame, xlator_t *this, fd_t *fd, struct iatt *stbuf,
+            int32_t valid, dict_t *xdata)
+{
+    TA_FAILED_FOP(fsetattr, frame, EINVAL); /* always fail with EINVAL */
+    return 0;
+}
+
+int32_t
+ta_stat(call_frame_t *frame, xlator_t *this, loc_t *loc, dict_t *xdata)
+{
+    TA_FAILED_FOP(stat, frame, EINVAL); /* always fail with EINVAL */
+    return 0;
+}
+
+int32_t
+ta_fstat(call_frame_t *frame, xlator_t *this, fd_t *fd, dict_t *xdata)
+{
+    TA_FAILED_FOP(fstat, frame, EINVAL); /* always fail with EINVAL */
+    return 0;
+}
+
+int32_t
+ta_statfs(call_frame_t *frame, xlator_t *this, loc_t *loc, dict_t *xdata)
+{
+    TA_FAILED_FOP(statfs, frame, EINVAL); /* always fail with EINVAL */
+    return 0;
+}
+
+int32_t
+ta_symlink(call_frame_t *frame, xlator_t *this, const char *linkname,
+           loc_t *loc, mode_t umask, dict_t *xdata)
+{
+    TA_FAILED_FOP(symlink, frame, EINVAL); /* always fail with EINVAL */
+    return 0;
+}
+
+int32_t
+ta_truncate(call_frame_t *frame, xlator_t *this, loc_t *loc, off_t offset,
+            dict_t *xdata)
+{
+    TA_FAILED_FOP(truncate, frame, EINVAL); /* always fail with EINVAL */
+    return 0;
+}
+
+int32_t
+ta_ftruncate(call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset,
+             dict_t *xdata)
+{
+    TA_FAILED_FOP(ftruncate, frame, EINVAL); /* always fail with EINVAL */
+    return 0;
+}
+
+int32_t
+ta_unlink(call_frame_t *frame, xlator_t *this, loc_t *loc, int xflags,
+          dict_t *xdata)
+{
+    TA_FAILED_FOP(unlink, frame, EINVAL); /* always fail with EINVAL */
+    return 0;
+}
+
+int32_t
+ta_zerofill(call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset,
+            off_t len, dict_t *xdata)
+{
+    TA_FAILED_FOP(zerofill, frame, EINVAL); /* always fail with EINVAL */
+    return 0;
+}
+
+int32_t
+ta_seek(call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset,
+        gf_seek_what_t what, dict_t *xdata)
+{
+    TA_FAILED_FOP(seek, frame, EINVAL); /* always fail with EINVAL */
+    return 0;
+}
+
+/* Register the gf_ta_mt_* types with the memory accounting of this xlator. */
+int32_t
+mem_acct_init(xlator_t *this)
+{
+    int rc = xlator_mem_acct_init(this, gf_ta_mt_end + 1);
+
+    if (rc != 0)
+        gf_log(this->name, GF_LOG_ERROR,
+               "Memory accounting "
+               "initialization failed.");
+    return rc;
+}
+
+int
+reconfigure(xlator_t *this, dict_t *options)
+{
+    return 0; /* nothing to apply: options[] declares no keys */
+}
+
+/* Require exactly one child; a missing parent is only logged. */
+int32_t
+init(xlator_t *this)
+{
+    if (this->children && !this->children->next) {
+        if (!this->parents) {
+            gf_log(this->name, GF_LOG_ERROR, "dangling volume. check volfile ");
+        }
+        return 0;
+    }
+    gf_log(this->name, GF_LOG_ERROR,
+           "'thin_arbiter' not configured with exactly one child");
+    return -1;
+}
+
+void
+fini(xlator_t *this)
+{
+    return; /* init() allocates nothing, so there is nothing to free */
+}
+
+struct xlator_fops fops = {
+    /* handled by thin-arbiter: ta_xattrop()/ta_fxattrop() */
+    .xattrop = ta_xattrop,
+    .fxattrop = ta_fxattrop,
+    /* always failed with EINVAL (listed alphabetically) */
+    .access = ta_access,
+    .discard = ta_discard,
+    .entrylk = ta_entrylk,
+    .fallocate = ta_fallocate,
+    .fentrylk = ta_fentrylk,
+    .fgetxattr = ta_fgetxattr,
+    .flush = ta_flush,
+    .fremovexattr = ta_fremovexattr,
+    .fsetattr = ta_fsetattr,
+    .fsetxattr = ta_fsetxattr,
+    .fstat = ta_fstat,
+    .fsync = ta_fsync,
+    .fsyncdir = ta_fsyncdir,
+    .ftruncate = ta_ftruncate,
+    .getxattr = ta_getxattr,
+    .link = ta_link,
+    .lk = ta_lk,
+    .mkdir = ta_mkdir,
+    .mknod = ta_mknod,
+    .open = ta_open,
+    .opendir = ta_opendir,
+    .readdir = ta_readdir,
+    .readdirp = ta_readdirp,
+    .readlink = ta_readlink,
+    .readv = ta_readv,
+    .removexattr = ta_removexattr,
+    .rename = ta_rename,
+    .rmdir = ta_rmdir,
+    .seek = ta_seek,
+    .setattr = ta_setattr,
+    .setxattr = ta_setxattr,
+    .stat = ta_stat,
+    .statfs = ta_statfs,
+    .symlink = ta_symlink,
+    .truncate = ta_truncate,
+    .unlink = ta_unlink,
+    .writev = ta_writev,
+    .zerofill = ta_zerofill,
+};
+
+struct xlator_cbks cbks = {}; /* no callbacks are registered */
+
+struct volume_options options[] = {
+    {.key = {NULL}}, /* no options: only the NULL-key end entry */
+};
+
+xlator_api_t xlator_api = {
+    .identifier = "thin-arbiter",
+    .category = GF_MAINTAINED,
+    .op_version = {GD_OP_VERSION_6_0},
+    .init = init,
+    .fini = fini,
+    .reconfigure = reconfigure,
+    .mem_acct_init = mem_acct_init,
+    .fops = &fops,
+    .cbks = &cbks,
+    .options = options,
+};
diff --git a/xlators/features/thin-arbiter/src/thin-arbiter.h b/xlators/features/thin-arbiter/src/thin-arbiter.h
new file mode 100644
index 00000000000..e5f914b84bf
--- /dev/null
+++ b/xlators/features/thin-arbiter/src/thin-arbiter.h
@@ -0,0 +1,59 @@
+/*
+  Copyright (c) 2015 Red Hat, Inc. <http://www.redhat.com>
+  This file is part of GlusterFS.
+
+  This file is licensed to you under your choice of the GNU Lesser
+  General Public License, version 3 or any later version (LGPLv3 or
+  later), or the GNU General Public License, version 2 (GPLv2), in all
+  cases as published by the Free Software Foundation.
+*/
+
+#ifndef _THIN_ARBITER_H
+#define _THIN_ARBITER_H
+
+#include <glusterfs/locking.h>
+#include <glusterfs/common-utils.h>
+#include <glusterfs/glusterfs.h>
+#include <glusterfs/xlator.h>
+#include <glusterfs/defaults.h>
+#include <glusterfs/list.h>
+
+#define THIN_ARBITER_SOURCE_XATTR "trusted.ta.source"
+#define THIN_ARBITER_SOURCE_SIZE 2
+
+#define TA_FAILED_FOP(fop, frame, op_errno)                                    \
+    do { /* hand the frame to default_<fop>_failure_cbk(frame, op_errno) */    \
+        default_##fop##_failure_cbk(frame, op_errno);                          \
+    } while (0)
+
+#define TA_STACK_UNWIND(fop, frame, op_ret, op_errno, params...)               \
+    do { /* free frame->local (a ta_fop_t), then unwind fop with the result */ \
+        ta_fop_t *__local = NULL;                                              \
+        int32_t __op_ret = 0;                                                  \
+        int32_t __op_errno = 0;                                                \
+        /* op_ret/op_errno are evaluated once, before local is released */     \
+        __local = frame->local;                                                \
+        __op_ret = op_ret;                                                     \
+        __op_errno = op_errno;                                                 \
+        if (__local) {                                                         \
+            ta_release_fop(__local);                                           \
+            frame->local = NULL;                                               \
+        }                                                                      \
+        STACK_UNWIND_STRICT(fop, frame, __op_ret, __op_errno, params);         \
+                                                                               \
+    } while (0)
+
+struct _ta_fop;
+typedef struct _ta_fop ta_fop_t;
+
+struct _ta_fop {
+    gf_xattrop_flags_t xattrop_flags; /* caller's flags, reused on 2nd wind */
+    loc_t loc; /* copy of the caller's loc (xattrop path) */
+    fd_t *fd; /* ref'd fd on the fxattrop path, NULL for xattrop */
+    dict_t *dict; /* caller's xattrop dict (reference held) */
+    dict_t *brick_xattr; /* keys of dict with zero-filled values */
+    int32_t on_disk[2]; /* per key: 1 if brick or incoming value is non-zero */
+    int32_t idx; /* next on_disk slot to fill */
+};
+
+#endif /* _THIN_ARBITER_H */
diff --git a/xlators/features/trash/src/Makefile.am b/xlators/features/trash/src/Makefile.am
index d61f608aaa8..8557e7171af 100644
--- a/xlators/features/trash/src/Makefile.am
+++ b/xlators/features/trash/src/Makefile.am
@@ -1,13 +1,19 @@
+if WITH_SERVER
 xlator_LTLIBRARIES = trash.la
+endif
 xlatordir = $(libdir)/glusterfs/$(PACKAGE_VERSION)/xlator/features
 
-trash_la_LDFLAGS = -module -avoidversion 
+trash_la_LDFLAGS = -module $(GF_XLATOR_DEFAULT_LDFLAGS)
 
 trash_la_SOURCES = trash.c
 trash_la_LIBADD = $(top_builddir)/libglusterfs/src/libglusterfs.la 
 
-AM_CFLAGS = -fPIC -D_FILE_OFFSET_BITS=64 -D_GNU_SOURCE -Wall -D$(GF_HOST_OS)\
-	-I$(top_srcdir)/libglusterfs/src -shared -nostartfiles $(GF_CFLAGS)
+noinst_HEADERS = trash.h trash-mem-types.h
+
+AM_CPPFLAGS = $(GF_CPPFLAGS) -I$(top_srcdir)/libglusterfs/src \
+	-I$(top_srcdir)/rpc/xdr/src -I$(top_builddir)/rpc/xdr/src
+
+AM_CFLAGS = -Wall $(GF_CFLAGS)
 
 CLEANFILES = 
 
diff --git a/xlators/features/trash/src/trash-mem-types.h b/xlators/features/trash/src/trash-mem-types.h
new file mode 100644
index 00000000000..43353c8f095
--- /dev/null
+++ b/xlators/features/trash/src/trash-mem-types.h
@@ -0,0 +1,22 @@
+/*
+   Copyright (c) 2008-2012 Red Hat, Inc. <http://www.redhat.com>
+   This file is part of GlusterFS.
+
+   This file is licensed to you under your choice of the GNU Lesser
+   General Public License, version 3 or any later version (LGPLv3 or
+   later), or the GNU General Public License, version 2 (GPLv2), in all
+   cases as published by the Free Software Foundation.
+*/
+#ifndef __TRASH_MEM_TYPES_H__
+#define __TRASH_MEM_TYPES_H__
+
+#include <glusterfs/mem-types.h>
+
+enum gf_trash_mem_types_ { /* trash-private tags, numbered after the common ones */
+    gf_trash_mt_trash_private_t = gf_common_mt_end + 1,
+    gf_trash_mt_char,
+    gf_trash_mt_uuid,
+    gf_trash_mt_trash_elim_path, /* trash_elim_path list nodes */
+    gf_trash_mt_end
+};
+#endif
diff --git a/xlators/features/trash/src/trash.c b/xlators/features/trash/src/trash.c
index b2c4a9ab396..7d09cba3e9c 100644
--- a/xlators/features/trash/src/trash.c
+++ b/xlators/features/trash/src/trash.c
@@ -1,596 +1,2653 @@
 /*
-  Copyright (c) 2006-2009 Z RESEARCH, Inc. <http://www.zresearch.com>
-  This file is part of GlusterFS.
-
-  GlusterFS is free software; you can redistribute it and/or modify
-  it under the terms of the GNU General Public License as published
-  by the Free Software Foundation; either version 3 of the License,
-  or (at your option) any later version.
-
-  GlusterFS is distributed in the hope that it will be useful, but
-  WITHOUT ANY WARRANTY; without even the implied warranty of
-  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
-  General Public License for more details.
-
-  You should have received a copy of the GNU General Public License
-  along with this program.  If not, see
-  <http://www.gnu.org/licenses/>.
+   Copyright (c) 2006-2012 Red Hat, Inc. <http://www.redhat.com>
+   This file is part of GlusterFS.
+
+   This file is licensed to you under your choice of the GNU Lesser
+   General Public License, version 3 or any later version (LGPLv3 or
+   later), or the GNU General Public License, version 2 (GPLv2), in all
+   cases as published by the Free Software Foundation.
 */
+#include "trash.h"
+#include "trash-mem-types.h"
+#include <glusterfs/syscall.h>
 
-#ifndef _CONFIG_H
-#define _CONFIG_H
-#include "config.h"
-#endif
+#define root_gfid                                                              \
+    (uuid_t) { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1 } /* used as pargfid of the trash directory */
+#define trash_gfid                                                             \
+    (uuid_t) { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 5 } /* fixed gfid of the trash directory */
+#define internal_op_gfid                                                       \
+    (uuid_t) { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 6 } /* fixed gfid of the internal_op directory */
 
+int32_t
+trash_truncate_writev_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+                          int32_t op_ret, int32_t op_errno, struct iatt *prebuf,
+                          struct iatt *postbuf, dict_t *xdata);
 
-#include "glusterfs.h"
-#include "logging.h"
-#include "dict.h"
-#include "xlator.h"
-#include "defaults.h"
+int32_t
+trash_truncate_mkdir_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+                         int32_t op_ret, int32_t op_errno, inode_t *inode,
+                         struct iatt *stbuf, struct iatt *preparent,
+                         struct iatt *postparent, dict_t *xdata);
 
-#include <libgen.h>
+int32_t
+trash_unlink_rename_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+                        int32_t op_ret, int32_t op_errno, struct iatt *buf,
+                        struct iatt *preoldparent, struct iatt *postoldparent,
+                        struct iatt *prenewparent, struct iatt *postnewparent,
+                        dict_t *xdata);
+/* Common routines used in this translator */
 
-/* TODO: currently it can work only above posix, no other translators 
- *       between them. Not a good thing. Try making more reliable methods.
+/**
+ * Return the mode that sys_stat() reports for path, so that an entry
+ * created under the trash directory can mirror the existing one. Falls
+ * back to 0755 when the stat call fails.
  */
+mode_t
+get_permission(char *path)
+{
+    mode_t mode = 0755; /* default when path cannot be stat'ed */
+    struct stat sbuf = {
+        0,
+    };
+    struct iatt ibuf = {
+        0,
+    };
+    int ret = 0;
 
-struct trash_struct {
-	inode_t *inode;
-	loc_t loc1;
-	loc_t loc2;
-	char origpath[ZR_PATH_MAX];
-	char newpath[ZR_PATH_MAX];
-	char oldpath[ZR_PATH_MAX]; // used only in case of rename
-};
-typedef struct trash_struct trash_local_t;
+    ret = sys_stat(path, &sbuf);
+    if (!ret) {
+        iatt_from_stat(&ibuf, &sbuf);
+        mode = st_mode_from_ia(ibuf.ia_prot, ibuf.ia_type); /* rebuild mode from prot + type */
+    } else
+        gf_log("trash", GF_LOG_DEBUG,
+               "stat on %s failed"
+               " using default",
+               path);
+    return mode;
+}
 
-struct trash_priv {
-	char trash_dir[ZR_PATH_MAX];
-};
-typedef struct trash_priv trash_private_t;
+/**
+ * For normalization, trash directory name is stored inside priv structure as
+ * '/trash_directory/'. This helper strips the leading and the trailing slash
+ * and stores a newly allocated copy, owned by the caller, in *trash_directory.
+ */
+int
+extract_trash_directory(char *priv_value, const char **trash_directory)
+{
+    char *tmp = NULL;
+    size_t len = 0;
+    int ret = EINVAL; /* reported if priv_value fails validation */
+
+    GF_VALIDATE_OR_GOTO("trash", priv_value, out);
+
+    /* skip the leading '/'; an empty value has nothing to skip */
+    tmp = gf_strdup(priv_value[0] ? priv_value + 1 : priv_value);
+    if (!tmp) {
+        ret = ENOMEM;
+        goto out;
+    }
+    /* drop one trailing '/', guarding against an empty remainder */
+    len = strlen(tmp);
+    if (len > 0 && tmp[len - 1] == '/')
+        tmp[len - 1] = '\0';
+    /* ownership of tmp passes to the caller; no second copy is needed */
+    *trash_directory = tmp;
+    ret = 0;
+out:
+    return ret;
+}
+
+/**
+ * The trash directory path must be prepended to a file path for delete or
+ * truncate operations. Normal trashing moves the contents to the trash
+ * directory; trashing done by internal operations goes to the internal_op
+ * directory inside trash.
+ */
+void
+copy_trash_path(const char *priv_value, gf_boolean_t internal, char *path,
+                size_t path_size)
+{
+    char trash_path[PATH_MAX] = {
+        0,
+    };
+
+    strncpy(trash_path, priv_value, sizeof(trash_path));
+    trash_path[sizeof(trash_path) - 1] = 0; /* strncpy may leave it unterminated */
+    if (internal)
+        strncat(trash_path, "internal_op/",
+                sizeof(trash_path) - strlen(trash_path) - 1);
+
+    strncpy(path, trash_path, path_size);
+    path[path_size - 1] = 0; /* NOTE(review): assumes path_size > 0 - confirm callers */
+}
+
+/**
+ * This function performs the reverse operation of copy_trash_path(). It gives
+ * out a pointer into path at the '/' that ends the trash directory component
+ * (and the internal_op component if internal), or NULL if no such '/' exists.
+ */
+void
+remove_trash_path(const char *path, gf_boolean_t internal, char **rem_path)
+{
+    if (rem_path == NULL)
+        return;
+    /* A missing separator yields NULL instead of a wild dereference. */
 
+    *rem_path = (path && path[0]) ? strchr(path + 1, '/') : NULL;
+    if (internal && *rem_path)
+        *rem_path = strchr(*rem_path + 1, '/');
+}
+
+/**
+ * Returns 1 when path begins with any entry of the eliminate list, else 0
+ */
+int
+check_whether_eliminate_path(trash_elim_path *trav, const char *path)
+{
+    trash_elim_path *node = NULL;
+    size_t prefix_len = 0;
+
+    /* the first entry that is a prefix of path decides the outcome */
+    for (node = trav; node != NULL; node = node->next) {
+        prefix_len = strlen(node->path);
+        if (!strncmp(path, node->path, prefix_len))
+            return 1;
+    }
+    return 0;
+}
+
+/**
+ * Splits the comma separated str and pushes each entry, as "/entry/", onto *eliminate
+ */
+int
+store_eliminate_path(char *str, trash_elim_path **eliminate)
+{
+    trash_elim_path *trav = NULL;
+    char *component = NULL;
+    char elm_path[PATH_MAX] = {
+        0,
+    };
+    int ret = 0;
+    char *strtokptr = NULL;
+
+    if ((str == NULL) || (eliminate == NULL)) {
+        ret = EINVAL;
+        goto out;
+    }
+
+    component = strtok_r(str, ",", &strtokptr); /* str is modified in place */
+    while (component) {
+        trav = GF_CALLOC(1, sizeof(*trav), gf_trash_mt_trash_elim_path);
+        if (!trav) {
+            ret = ENOMEM;
+            goto out;
+        }
+        /* Bounded copy (was sprintf): one byte is held back so the
+         * trailing '/' below always fits, even after truncation. */
+        snprintf(elm_path, sizeof(elm_path) - 1, "%s%s",
+                 (component[0] == '/') ? "" : "/", component);
+
+        if (elm_path[strlen(elm_path) - 1] != '/')
+            strncat(elm_path, "/", sizeof(elm_path) - strlen(elm_path) - 1);
+
+        trav->path = gf_strdup(elm_path);
+        if (!trav->path) {
+            ret = ENOMEM;
+            gf_log("trash", GF_LOG_DEBUG, "out of memory");
+            GF_FREE(trav);
+            goto out;
+        }
+        trav->next = *eliminate; /* push at the head of the list */
+        *eliminate = trav;
+        component = strtok_r(NULL, ",", &strtokptr);
+    }
+out:
+    return ret;
+}
+
+/**
+ * Appends "_<timestamp>" to name, never writing past name_size bytes
+ */
+void
+append_time_stamp(char *name, size_t name_size)
+{
+    char *cursor = NULL;
+    char stamp[GF_TIMESTR_SIZE] = {0};
+
+    gf_time_fmt(stamp, sizeof(stamp), gf_time(), gf_timefmt_F_HMS);
+
+    /* blanks inside the formatted time become underscores */
+    for (cursor = stamp; *cursor != '\0'; cursor++) {
+        if (*cursor == ' ')
+            *cursor = '_';
+    }
+
+    /* each append is clipped to the room left in name */
+    strncat(name, "_", name_size - strlen(name) - 1);
+    strncat(name, stamp, name_size - strlen(name) - 1);
+}
+
+/**
+ * Check whether delete/rename operation is permitted on
+ * the inode behind loc (trash and internal_op directories are guarded)
+ */
+
+gf_boolean_t
+check_whether_op_permitted(trash_private_t *priv, loc_t *loc)
+{
+    /* with priv->state set, the trash directory itself is off limits */
+    if (priv->state && !gf_uuid_compare(loc->inode->gfid, trash_gfid))
+        return _gf_false;
+    /* with priv->internal set, so is the internal_op directory */
+    if (priv->internal && !gf_uuid_compare(loc->inode->gfid, internal_op_gfid))
+        return _gf_false;
+    return _gf_true;
+}
+
+/**
+ * Release everything a trash local holds and return it to its pool
+ */
+void
+trash_local_wipe(trash_local_t *local)
+{
+    /* nothing to release for a NULL local */
+    if (local == NULL)
+        return;
+
+    /* drop both locations first ... */
+    loc_wipe(&local->loc);
+    loc_wipe(&local->newloc);
+    /* ... then whichever fd references are held ... */
+    if (local->fd != NULL)
+        fd_unref(local->fd);
+    if (local->newfd != NULL)
+        fd_unref(local->newfd);
+
+    mem_put(local); /* ... and finally hand the object back */
+}
+
+/**
+ * Free every node of the eliminate path list (and each node's
+ * path string), then reset the caller's head pointer to NULL
+ */
+void
+wipe_eliminate_path(trash_elim_path **trav)
+{
+    trash_elim_path *node = NULL;
+    trash_elim_path *following = NULL;
+
+    if (trav == NULL)
+        return;
+
+    for (node = *trav; node != NULL; node = following) {
+        following = node->next; /* read before the node is freed */
+        GF_FREE(node->path);
+        GF_FREE(node);
+    }
+    *trav = NULL;
+}
+
+/**
+ * This is the call back of rename fop initiated using STACK_WIND in
+ * reconfigure/notify function which is used to rename trash directory
+ * in the brick when it is required either in volume start or set.
+ * This frame must be destroyed from this function itself since it was
+ * created by trash xlator
+ */
 int32_t
-trash_unlink_rename_cbk (call_frame_t *frame,
-			 void *cookie,
-			 xlator_t *this,
-			 int32_t op_ret,
-			 int32_t op_errno,
-			 struct stat *buf);
+trash_dir_rename_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+                     int32_t op_ret, int32_t op_errno, struct iatt *buf,
+                     struct iatt *preoldparent, struct iatt *postoldparent,
+                     struct iatt *prenewparent, struct iatt *postnewparent,
+                     dict_t *xdata)
+{
+    trash_private_t *priv = NULL;
+    trash_local_t *local = NULL;
+
+    priv = this->private;
+
+    local = frame->local;
+
+    if (op_ret == -1) {
+        gf_log(this->name, GF_LOG_ERROR,
+               "rename trash directory "
+               "failed: %s",
+               strerror(op_errno));
+        goto out;
+    }
+
+    GF_FREE(priv->oldtrash_dir); /* rename succeeded: the old name is obsolete */
+
+    priv->oldtrash_dir = gf_strdup(priv->newtrash_dir); /* new name becomes current */
+    if (!priv->oldtrash_dir) {
+        op_ret = ENOMEM;
+        gf_log(this->name, GF_LOG_DEBUG, "out of memory");
+    }
+
+out:
+    frame->local = NULL; /* detach first; local is wiped separately below */
+    STACK_DESTROY(frame->root);
+    trash_local_wipe(local);
+    return op_ret;
+}
+
+int
+rename_trash_directory(xlator_t *this)
+{
+    trash_private_t *priv = NULL;
+    int ret = 0;
+    loc_t loc = {
+        0,
+    };
+    loc_t old_loc = {
+        0,
+    };
+    call_frame_t *frame = NULL;
+    trash_local_t *local = NULL;
+
+    priv = this->private;
+
+    frame = create_frame(this, this->ctx->pool); /* xlator-owned frame, destroyed in the callback */
+    if (frame == NULL) {
+        gf_log(this->name, GF_LOG_ERROR, "failed to create frame");
+        ret = ENOMEM;
+        goto out;
+    }
+
+    local = mem_get0(this->local_pool);
+    if (!local) {
+        gf_log(this->name, GF_LOG_DEBUG, "out of memory");
+        ret = ENOMEM;
+        goto out;
+    }
+    frame->local = local;
+
+    /* assign new location values to loc members */
+    gf_uuid_copy(loc.gfid, trash_gfid);
+    gf_uuid_copy(loc.pargfid, root_gfid);
+    ret = extract_trash_directory(priv->newtrash_dir, &loc.name);
+    if (ret) {
+        gf_log(this->name, GF_LOG_DEBUG, "out of memory");
+        goto out;
+    }
+    loc.path = gf_strdup(priv->newtrash_dir);
+    if (!loc.path) {
+        ret = ENOMEM;
+        gf_log(this->name, GF_LOG_DEBUG, "out of memory");
+        goto out;
+    }
+
+    /* assign old location values to old_loc members */
+    gf_uuid_copy(old_loc.gfid, trash_gfid);
+    gf_uuid_copy(old_loc.pargfid, root_gfid);
+    ret = extract_trash_directory(priv->oldtrash_dir, &old_loc.name);
+    if (ret) {
+        gf_log(this->name, GF_LOG_DEBUG, "out of memory");
+        goto out;
+    }
+    old_loc.path = gf_strdup(priv->oldtrash_dir);
+    if (!old_loc.path) {
+        ret = ENOMEM;
+        gf_log(this->name, GF_LOG_DEBUG, "out of memory");
+        goto out;
+    }
+
+    old_loc.inode = inode_ref(priv->trash_inode); /* NOTE(review): assumes trash_inode is set - confirm */
+    gf_uuid_copy(old_loc.inode->gfid, old_loc.gfid);
+
+    loc_copy(&local->loc, &old_loc); /* local keeps its own copies for the callback */
+    loc_copy(&local->newloc, &loc);
+
+    STACK_WIND(frame, trash_dir_rename_cbk, FIRST_CHILD(this),
+               FIRST_CHILD(this)->fops->rename, &old_loc, &loc, NULL);
+    return 0; /* NOTE(review): loc/old_loc names, paths and inode ref are not released here - confirm */
+
+out:
+    if (frame) {
+        frame->local = NULL; /* detach so local is released only by trash_local_wipe */
+        STACK_DESTROY(frame->root);
+    }
+
+    trash_local_wipe(local); /* accepts NULL */
+
+    return ret;
+}
+
 int32_t
-trash_rename_rename_cbk (call_frame_t *frame,
-			 void *cookie,
-			 xlator_t *this,
-			 int32_t op_ret,
-			 int32_t op_errno,
-			 struct stat *buf);
+trash_internal_op_mkdir_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+                            int32_t op_ret, int32_t op_errno, inode_t *inode,
+                            struct iatt *buf, struct iatt *preparent,
+                            struct iatt *postparent, dict_t *xdata)
+{
+    trash_local_t *local = NULL;
+    local = frame->local;
+
+    if (op_ret != 0 && !(op_errno == EEXIST)) /* an existing directory is not an error */
+        gf_log(this->name, GF_LOG_ERROR,
+               "mkdir failed for "
+               "internal op directory : %s",
+               strerror(op_errno));
+
+    frame->local = NULL; /* detach first; local is wiped separately below */
+    STACK_DESTROY(frame->root);
+    trash_local_wipe(local);
+    return op_ret;
+}
 
 /**
- * trash_common_unwind_cbk -
+ * This is the call back of mkdir fop initiated using STACK_WIND in
+ * notify/reconfigure function which is used to create trash directory
+ * in the brick when "trash" is on. The frame of the mkdir must be
+ * destroyed from this function itself since it was created by trash xlator
  */
+
 int32_t
-trash_common_unwind_cbk (call_frame_t *frame,
-			 void *cookie,
-			 xlator_t *this,
-			 int32_t op_ret,
-			 int32_t op_errno)
+trash_dir_mkdir_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+                    int32_t op_ret, int32_t op_errno, inode_t *inode,
+                    struct iatt *buf, struct iatt *preparent,
+                    struct iatt *postparent, dict_t *xdata)
 {
-	trash_local_t *local = frame->local;
+    trash_private_t *priv = NULL;
+    trash_local_t *local = NULL;
 
-	if (local->loc1.path)
-		loc_wipe (&local->loc1);
-	
-	if (local->loc2.path)
-		loc_wipe (&local->loc2);
+    priv = this->private;
 
-	STACK_UNWIND (frame, op_ret, op_errno);
-	return 0;
+    local = frame->local;
+
+    if (op_ret == 0) {
+        priv->oldtrash_dir = gf_strdup(priv->newtrash_dir); /* created: new name is current */
+        if (!priv->oldtrash_dir) {
+            gf_log(this->name, GF_LOG_ERROR, "out of memory");
+            op_ret = ENOMEM;
+        }
+    } else if (op_errno != EEXIST) /* fop error is in op_errno, not the global errno */
+        gf_log(this->name, GF_LOG_ERROR,
+               "mkdir failed for trash"
+               " directory : %s",
+               strerror(op_errno));
+
+    frame->local = NULL; /* detach first; local is wiped separately below */
+    STACK_DESTROY(frame->root);
+    trash_local_wipe(local);
+    return op_ret;
 }
 
 /**
- * trash_common_unwind_buf_cbk -
+ * This getxattr callback reads the existing trash directory path
+ * from the returned dictionary
  */
 int32_t
-trash_common_unwind_buf_cbk (call_frame_t *frame,
-			     void *cookie,
-			     xlator_t *this,
-			     int32_t op_ret,
-			     int32_t op_errno,
-			     struct stat *buf)
+trash_dir_getxattr_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+                       int32_t op_ret, int32_t op_errno, dict_t *dict,
+                       dict_t *xdata)
 {
-	trash_local_t *local = frame->local;
+    data_t *data = NULL;
+    trash_private_t *priv = NULL;
+    int ret = 0;
+    trash_local_t *local = NULL;
+
+    priv = this->private;
+    GF_VALIDATE_OR_GOTO("trash", priv, out);
+
+    local = frame->local;
 
-	if (local->loc1.path)
-		loc_wipe (&local->loc1);
-	
-	if (local->loc2.path)
-		loc_wipe (&local->loc2);
+    data = dict_get(dict, GET_ANCESTRY_PATH_KEY);
+    if (!data || !data->data) /* key absent or no value: nothing to record */
+        goto out;
+    priv->oldtrash_dir = GF_MALLOC(PATH_MAX, gf_common_mt_char);
+    if (!priv->oldtrash_dir) {
+        gf_log(this->name, GF_LOG_ERROR, "out of memory");
+        ret = ENOMEM;
+        goto out;
+    }
+    /* appending '/' if it is not present; bounded by the PATH_MAX buffer */
+    snprintf(priv->oldtrash_dir, PATH_MAX, "%s%s", data->data,
+             (data->data[0] && data->data[strlen(data->data) - 1] == '/')
+                 ? "" : "/");
+    gf_log(this->name, GF_LOG_DEBUG,
+           "old trash directory path "
+           "is %s",
+           priv->oldtrash_dir);
+    if (strcmp(priv->newtrash_dir, priv->oldtrash_dir) != 0) {
+        /* When user set a new name for trash directory, trash
+         * xlator will perform a rename operation on old trash
+         * directory to the new one using a STACK_WIND from here.
+         * This option can be configured only when volume is in
+         * started state
+         */
+        ret = rename_trash_directory(this);
+    }
 
-	STACK_UNWIND (frame, op_ret, op_errno, buf);
-	return 0;
+out:
+    frame->local = NULL; /* detach first; local is wiped separately below */
+    STACK_DESTROY(frame->root);
+    trash_local_wipe(local);
+    return ret;
+}
+/**
+ * This is a nameless look up for internal op directory
+ * The lookup is based on gfid, because internal op directory
+ * has fixed gfid.
+ */
+int32_t
+trash_internalop_dir_lookup_cbk(call_frame_t *frame, void *cookie,
+                                xlator_t *this, int32_t op_ret,
+                                int32_t op_errno, inode_t *inode,
+                                struct iatt *buf, dict_t *xdata,
+                                struct iatt *postparent)
+{
+    trash_private_t *priv = NULL;
+    int ret = 0;
+    uuid_t *gfid_ptr = NULL;
+    loc_t loc = {0};
+    char internal_op_path[PATH_MAX] = {0};
+    dict_t *dict = NULL;
+    trash_local_t *local = NULL;
+
+    priv = this->private;
+    GF_VALIDATE_OR_GOTO("trash", priv, out);
+
+    local = frame->local;
+    if (op_ret != 0 && op_errno == ENOENT) {
+        loc_wipe(&local->loc);
+        gfid_ptr = GF_MALLOC(sizeof(uuid_t), gf_common_mt_uuid_t);
+        if (!gfid_ptr) {
+            ret = ENOMEM;
+            goto out;
+        }
+
+        gf_uuid_copy(*gfid_ptr, internal_op_gfid);
+
+        dict = dict_new();
+        if (!dict) {
+            ret = ENOMEM;
+            goto out;
+        }
+        ret = dict_set_gfuuid(dict, "gfid-req", *gfid_ptr, false);
+        if (ret) {
+            gf_log(this->name, GF_LOG_ERROR, "setting key gfid-req failed");
+            goto out;
+        }
+        /* the dict now owns the buffer; the cleanup at out must not free it */
+        gfid_ptr = NULL;
+        gf_uuid_copy(loc.gfid, internal_op_gfid);
+        gf_uuid_copy(loc.pargfid, trash_gfid);
+
+        loc.inode = inode_new(priv->trash_itable);
+
+        /* The mkdir call for creating internal op directory */
+        loc.name = gf_strdup("internal_op");
+        if (!loc.name) {
+            gf_log(this->name, GF_LOG_DEBUG, "out of memory");
+            ret = ENOMEM;
+            goto out;
+        }
+        snprintf(internal_op_path, sizeof(internal_op_path), "%s%s/",
+                 priv->newtrash_dir, loc.name); /* bounded; was sprintf */
+        loc.path = gf_strdup(internal_op_path);
+        if (!loc.path) {
+            gf_log(this->name, GF_LOG_DEBUG, "out of memory");
+            ret = ENOMEM;
+            goto out;
+        }
+
+        /* NOTE(review): loc (inode ref, name, path) is never released here */
+        loc_copy(&local->loc, &loc);
+        STACK_WIND(frame, trash_internal_op_mkdir_cbk, FIRST_CHILD(this),
+                   FIRST_CHILD(this)->fops->mkdir, &loc, 0755, 0022, dict);
+        dict_unref(dict); /* drop our reference once the fop is wound */
+        return 0;
+    }
+
+out:
+    if (ret && gfid_ptr)
+        GF_FREE(gfid_ptr);
+    if (dict)
+        dict_unref(dict);
+    frame->local = NULL;
+    STACK_DESTROY(frame->root);
+    trash_local_wipe(local);
+    return op_ret;
+}
+
+/**
+ * This is a nameless look up for old trash directory
+ * The lookup is based on gfid, because trash directory
+ * has fixed gfid.
+ */
+int32_t
+trash_dir_lookup_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+                     int32_t op_ret, int32_t op_errno, inode_t *inode,
+                     struct iatt *buf, dict_t *xdata, struct iatt *postparent)
+{
+    trash_private_t *priv = NULL;
+    loc_t loc = {0};
+    int ret = 0;
+    uuid_t *gfid_ptr = NULL;
+    dict_t *dict = NULL;
+    trash_local_t *local = NULL;
+
+    priv = this->private;
+    GF_VALIDATE_OR_GOTO("trash", priv, out);
+
+    local = frame->local;
+
+    loc_wipe(&local->loc);
+    if (op_ret == 0) {
+        gf_log(this->name, GF_LOG_DEBUG, "inode found with gfid %s",
+               uuid_utoa(buf->ia_gfid));
+
+        gf_uuid_copy(loc.gfid, trash_gfid);
+
+        /* Find trash inode using available information */
+        priv->trash_inode = inode_link(inode, NULL, NULL, buf);
+
+        loc.inode = inode_ref(priv->trash_inode);
+        loc_copy(&local->loc, &loc);
+
+        /*Used to find path of old trash directory*/
+        STACK_WIND(frame, trash_dir_getxattr_cbk, FIRST_CHILD(this),
+                   FIRST_CHILD(this)->fops->getxattr, &loc,
+                   GET_ANCESTRY_PATH_KEY, xdata);
+        return 0;
+    }
+
+    /* If there is no old trash directory we set its value to new one,
+     * which is the valid condition for trash directory creation
+     */
+    else {
+        gf_log(this->name, GF_LOG_DEBUG,
+               "Creating trash "
+               "directory %s ",
+               priv->newtrash_dir);
+
+        gfid_ptr = GF_MALLOC(sizeof(uuid_t), gf_common_mt_uuid_t);
+        if (!gfid_ptr) {
+            ret = ENOMEM;
+            goto out;
+        }
+        gf_uuid_copy(*gfid_ptr, trash_gfid);
+
+        gf_uuid_copy(loc.gfid, trash_gfid);
+        gf_uuid_copy(loc.pargfid, root_gfid);
+        ret = extract_trash_directory(priv->newtrash_dir, &loc.name);
+        if (ret) {
+            gf_log(this->name, GF_LOG_DEBUG, "out of memory");
+            goto out;
+        }
+        loc.path = gf_strdup(priv->newtrash_dir);
+        if (!loc.path) {
+            gf_log(this->name, GF_LOG_DEBUG, "out of memory");
+            ret = ENOMEM;
+            goto out;
+        }
+
+        priv->trash_inode = inode_new(priv->trash_itable);
+        priv->trash_inode->ia_type = IA_IFDIR;
+        loc.inode = inode_ref(priv->trash_inode);
+        dict = dict_new();
+        if (!dict) {
+            ret = ENOMEM;
+            goto out;
+        }
+        /* Fixed gfid is set for trash directory with
+         * this function
+         */
+        ret = dict_set_gfuuid(dict, "gfid-req", *gfid_ptr, false);
+        if (ret) {
+            gf_log(this->name, GF_LOG_ERROR, "setting key gfid-req failed");
+            goto out;
+        }
+        loc_copy(&local->loc, &loc);
+
+        /* The mkdir call for creating trash directory */
+        STACK_WIND(frame, trash_dir_mkdir_cbk, FIRST_CHILD(this),
+                   FIRST_CHILD(this)->fops->mkdir, &loc, 0755, 0022, dict);
+        /* drop our reference now that the fop is wound (was leaked) */
+        dict_unref(dict);
+        return 0;
+    }
+out:
+    if (ret && gfid_ptr)
+        GF_FREE(gfid_ptr);
+    if (dict)
+        dict_unref(dict);
+    frame->local = NULL;
+    STACK_DESTROY(frame->root);
+    trash_local_wipe(local);
+    return ret;
+}
+
+int
+create_or_rename_trash_directory(xlator_t *this)
+{
+    trash_private_t *priv = NULL;
+    int ret = 0;
+    loc_t loc = {0};
+    call_frame_t *frame = NULL;
+    trash_local_t *local = NULL;
+
+    priv = this->private;
+
+    frame = create_frame(this, this->ctx->pool);
+    if (frame == NULL) {
+        gf_log(this->name, GF_LOG_ERROR, "failed to create frame");
+        ret = ENOMEM;
+        goto out;
+    }
+
+    local = mem_get0(this->local_pool);
+    if (!local) {
+        gf_log(this->name, GF_LOG_DEBUG, "out of memory");
+        ret = ENOMEM;
+        goto out;
+    }
+    frame->local = local;
+
+    /* nameless lookup: only the fixed trash gfid identifies the target */
+    loc.inode = inode_new(priv->trash_itable);
+    gf_uuid_copy(loc.gfid, trash_gfid);
+    loc_copy(&local->loc, &loc);
+    gf_log(this->name, GF_LOG_DEBUG,
+           "nameless lookup for old trash directory");
+    STACK_WIND(frame, trash_dir_lookup_cbk, FIRST_CHILD(this),
+               FIRST_CHILD(this)->fops->lookup, &loc, NULL);
+out:
+    if (ret && frame) /* failed before the wind: the frame is still ours */
+        STACK_DESTROY(frame->root);
+    return ret;
+}
+
+int
+create_internalop_directory(xlator_t *this)
+{
+    trash_private_t *priv = NULL;
+    int ret = 0;
+    loc_t loc = {0};
+    call_frame_t *frame = NULL;
+    trash_local_t *local = NULL;
+
+    priv = this->private;
+
+    frame = create_frame(this, this->ctx->pool);
+    if (frame == NULL) {
+        gf_log(this->name, GF_LOG_ERROR, "failed to create frame");
+        ret = ENOMEM;
+        goto out;
+    }
+
+    local = mem_get0(this->local_pool);
+    if (!local) {
+        gf_log(this->name, GF_LOG_DEBUG, "out of memory");
+        ret = ENOMEM;
+        goto out;
+    }
+    frame->local = local;
+
+    /* nameless lookup by the fixed internal_op gfid under the trash gfid */
+    gf_uuid_copy(loc.gfid, internal_op_gfid);
+    gf_uuid_copy(loc.pargfid, trash_gfid);
+    loc.inode = inode_new(priv->trash_itable);
+    loc.inode->ia_type = IA_IFDIR;
+
+    loc_copy(&local->loc, &loc);
+    STACK_WIND(frame, trash_internalop_dir_lookup_cbk, FIRST_CHILD(this),
+               FIRST_CHILD(this)->fops->lookup, &loc, NULL);
+out:
+    if (ret && frame) /* failed before the wind: the frame is still ours */
+        STACK_DESTROY(frame->root);
+    return ret;
+}
+
+int32_t
+trash_common_mkdir_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+                       int32_t op_ret, int32_t op_errno, inode_t *inode,
+                       struct iatt *buf, struct iatt *preparent,
+                       struct iatt *postparent, dict_t *xdata)
+{
+    STACK_UNWIND_STRICT(mkdir, frame, op_ret, op_errno, inode, buf, preparent,
+                        postparent, xdata); /* pass the child's mkdir result through unchanged */
+    return 0;
+}
+
+int32_t
+trash_common_rename_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+                        int32_t op_ret, int32_t op_errno, struct iatt *buf,
+                        struct iatt *preoldparent, struct iatt *postoldparent,
+                        struct iatt *prenewparent, struct iatt *postnewparent,
+                        dict_t *xdata)
+{
+    STACK_UNWIND_STRICT(rename, frame, op_ret, op_errno, buf, preoldparent,
+                        postoldparent, prenewparent, postnewparent, xdata); /* pass the rename result through unchanged */
+    return 0;
 }
 
 int32_t
-trash_mkdir_cbk (call_frame_t *frame,
-		 void *cookie,
-		 xlator_t *this,
-		 int32_t op_ret,
-		 int32_t op_errno,
-		 inode_t *inode,
-		 struct stat *stbuf)
-{
-	trash_local_t *local = frame->local;
-	char *tmp_str = strdup (local->newpath);
-	int32_t count = 0;
-	char *tmp_path = NULL;
-	char *tmp_dirname = NULL;
-
-	if (op_ret == -1 && op_errno == ENOENT) {
-		tmp_dirname = strchr (tmp_str, '/');
-		while (tmp_dirname) {
-			count = tmp_dirname - tmp_str;
-			if (count == 0)
-				count = 1;
-			tmp_path = CALLOC (1, count + 1);
-			ERR_ABORT (tmp_path);
-			memcpy (tmp_path, local->newpath, count);
-			loc_t tmp_loc = {
-				.inode = NULL,
-				.path = tmp_path,
-			};
-
-			/* TODO:create the directory with proper permissions */
-			STACK_WIND_COOKIE (frame,
-					   trash_mkdir_cbk,
-					   tmp_path,
-					   this->children->xlator,
-					   this->children->xlator->fops->mkdir,
-					   &tmp_loc,
-					   0777);
-			tmp_dirname = strchr (tmp_str + count + 1, '/');
-		}
-		free (cookie);
-		free (tmp_str);
-		return 0;
-	}
-	char *dir_name = dirname (tmp_str);
-	if (strcmp((char*)cookie, dir_name) == 0) {
-		loc_t new_loc = {
-			.inode = NULL,
-			.path = local->newpath
-		};
-		STACK_WIND (frame,
-			    trash_unlink_rename_cbk,
-			    this->children->xlator,
-			    this->children->xlator->fops->rename,
-			    &local->loc2,
-			    &new_loc);
-
-	}
-	free (cookie); /* strdup (dir_name) was sent here :) */
-	free (tmp_str);
-	return 0;
-}
-
-/**
- * trash_unlink_rename_cbk -
+trash_common_rmdir_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+                       int32_t op_ret, int32_t op_errno, struct iatt *preparent,
+                       struct iatt *postparent, dict_t *xdata)
+{
+    STACK_UNWIND_STRICT(rmdir, frame, op_ret, op_errno, preparent, postparent,
+                        xdata); /* pass the child's rmdir result through unchanged */
+    return 0;
+}
+
+/**
+ * Unwinds the unlink fop from the trash translator back to its caller
  */
 int32_t
-trash_unlink_rename_cbk (call_frame_t *frame,
-			 void *cookie,
-			 xlator_t *this,
-			 int32_t op_ret,
-			 int32_t op_errno,
-			 struct stat *buf)
-{
-	trash_local_t *local = frame->local;
-	if (op_ret == -1 && op_errno == ENOENT) {
-		/* check for the errno, if its ENOENT create directory and call
-		 * rename later
-		 */
-		char *tmp_str = strdup (local->newpath);
-		char *dir_name = dirname (tmp_str);
-		loc_t tmp_loc = {
-			.inode = NULL,
-			.path = dir_name,
-		};
-		/* TODO: create the directory with proper permissions */
-		STACK_WIND_COOKIE (frame,
-				   trash_mkdir_cbk,
-				   strdup (dir_name),
-				   this->children->xlator,
-				   this->children->xlator->fops->mkdir,
-				   &tmp_loc,
-				   0777);
-		free (tmp_str);
-	} else if (op_ret == -1 && op_errno == ENOTDIR) {
-		gf_log (this->name, GF_LOG_WARNING,
-			"Target exists, cannot keep the copy, deleting");
-		STACK_WIND (frame,
-			    trash_common_unwind_cbk,
-			    this->children->xlator,
-			    this->children->xlator->fops->unlink,
-			    &local->loc2);
-	} else if (op_ret == -1 && op_errno == EISDIR) {
-		gf_log (this->name, GF_LOG_WARNING,
-			"Target exists as a directory, cannot keep the copy, "
-			"deleting");
-		STACK_WIND (frame,
-			    trash_common_unwind_cbk,
-			    this->children->xlator,
-			    this->children->xlator->fops->unlink,
-			    &local->loc2);
-	} else {
-		/* */
-		STACK_UNWIND (frame, 0, op_errno);
-	}
-
-	return 0;
-}
-
-
-/**
- * trash_unlink -
+trash_common_unwind_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+                        int32_t op_ret, int32_t op_errno,
+                        struct iatt *preparent, struct iatt *postparent,
+                        dict_t *xdata)
+{
+    TRASH_STACK_UNWIND(unlink, frame, op_ret, op_errno, preparent, postparent,
+                       xdata); /* unlink-typed unwind; TRASH_STACK_UNWIND is defined outside this view */
+    return 0;
+}
+
+/**
+ * If the path is not present in the trash directory, this callback is
+ * invoked repeatedly and the missing directories are created one by
+ * one, starting from the top of the path
  */
 int32_t
-trash_unlink (call_frame_t *frame,
-	      xlator_t *this,
-	      loc_t *loc)
-{
-	trash_private_t *priv = this->private;
-	trash_local_t *local = NULL;
-	time_t       utime = 0;
-	struct tm   *tm = NULL;
-	char         timestr[256];
-
-	if (strncmp (loc->path, priv->trash_dir, 
-		     strlen(priv->trash_dir)) == 0) {
-		/* Trying to rename from the trash can dir, do the
-		   actual unlink */
-		STACK_WIND (frame,
-			    trash_common_unwind_cbk,
-			    this->children->xlator,
-			    this->children->xlator->fops->unlink,
-			    loc);
-	} else {
-		local = CALLOC (1, sizeof (trash_local_t));
-		if (!local) {
-			STACK_UNWIND (frame, -1, ENOMEM);
-			return 0;
-		}
-		frame->local = local;
-		
-		loc_copy (&local->loc2, loc);
-
-		strcpy (local->newpath, priv->trash_dir);
-		strcat (local->newpath, loc->path);
-
-		utime = time (NULL);
-		tm    = localtime (&utime);
-		strftime (timestr, 256, ".%Y%m%d%H%M%S", tm); 
-		strcat (local->newpath, timestr);
-
-		{
-			loc_t new_loc = {
-				.inode = NULL,
-				.path = local->newpath
-			};
-			STACK_WIND (frame,
-				    trash_unlink_rename_cbk,
-				    this->children->xlator,
-				    this->children->xlator->fops->rename,
-				    loc,
-				    &new_loc);
-		}
-	}
-	return 0;
-}
-
-/* */
+trash_unlink_mkdir_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+                       int32_t op_ret, int32_t op_errno, inode_t *inode,
+                       struct iatt *stbuf, struct iatt *preparent,
+                       struct iatt *postparent, dict_t *xdata)
+{
+    trash_local_t *local = NULL;
+    char *tmp_str = NULL;
+    char *tmp_path = NULL;
+    char *tmp_dirname = NULL;
+    char *tmp_stat = NULL;
+    char real_path[PATH_MAX] = {
+        0,
+    };
+    char *dir_name = NULL;
+    size_t count = 0;
+    int32_t loop_count = 0;
+    int i = 0;
+    loc_t tmp_loc = {
+        0,
+    };
+    trash_private_t *priv = NULL;
+    int ret = 0;
+
+    priv = this->private;
+    GF_VALIDATE_OR_GOTO("trash", priv, out);
+
+    local = frame->local;
+    GF_VALIDATE_OR_GOTO("trash", local, out);
+
+    TRASH_UNSET_PID(frame, local);
+
+    tmp_str = gf_strdup(local->newpath);
+    if (!tmp_str) {
+        gf_log(this->name, GF_LOG_ERROR, "out of memory");
+        ret = -1;
+        goto out;
+    }
+    loop_count = local->loop_count;
+
+    /* The directory is not present , need to create it */
+    if ((op_ret == -1) && (op_errno == ENOENT)) {
+        tmp_dirname = strchr(tmp_str, '/');
+        while (tmp_dirname) {
+            count = tmp_dirname - tmp_str;
+            if (count == 0)
+                count = 1;
+            i++;
+            if (i > loop_count)
+                break;
+            tmp_dirname = strchr(tmp_str + count + 1, '/');
+        }
+        tmp_path = gf_memdup(local->newpath, count + 1);
+        if (!tmp_path) {
+            gf_log(this->name, GF_LOG_ERROR, "out of memory");
+            ret = ENOMEM;
+            goto out;
+        }
+        tmp_path[count] = '\0';
+
+        loc_copy(&tmp_loc, &local->loc);
+        tmp_loc.path = gf_strdup(tmp_path);
+        if (!tmp_loc.path) {
+            gf_log(this->name, GF_LOG_ERROR, "out of memory");
+            ret = ENOMEM;
+            goto out;
+        }
+
+        /* Stores the name of the directory to be created */
+        tmp_loc.name = gf_strdup(strrchr(tmp_path, '/') + 1);
+        if (!tmp_loc.name) {
+            gf_log(this->name, GF_LOG_ERROR, "out of memory");
+            ret = ENOMEM;
+            goto out;
+        }
+        strncpy(real_path, priv->brick_path, sizeof(real_path));
+        real_path[sizeof(real_path) - 1] = 0;
+
+        remove_trash_path(tmp_path, (frame->root->pid < 0), &tmp_stat);
+        if (tmp_stat)
+            strncat(real_path, tmp_stat,
+                    sizeof(real_path) - strlen(real_path) - 1);
+
+        TRASH_SET_PID(frame, local);
+
+        STACK_WIND_COOKIE(frame, trash_unlink_mkdir_cbk, tmp_path,
+                          FIRST_CHILD(this), FIRST_CHILD(this)->fops->mkdir,
+                          &tmp_loc, get_permission(real_path), 0022, xdata);
+        loc_wipe(&tmp_loc);
+        goto out;
+    }
+
+    /* Given path is created , comparing to the required path */
+    if (op_ret == 0) {
+        dir_name = dirname(tmp_str);
+        if (strcmp((char *)cookie, dir_name) == 0) {
+            /* File path exists we can rename it*/
+            loc_copy(&tmp_loc, &local->loc);
+            tmp_loc.path = local->newpath;
+            STACK_WIND(frame, trash_unlink_rename_cbk, FIRST_CHILD(this),
+                       FIRST_CHILD(this)->fops->rename, &local->loc, &tmp_loc,
+                       xdata);
+            goto out;
+        }
+    }
+
+    if ((op_ret == -1) && (op_errno != EEXIST)) {
+        gf_log(this->name, GF_LOG_ERROR,
+               "Directory creation failed [%s]. "
+               "Therefore unlinking %s without moving to trash "
+               "directory",
+               strerror(op_errno), local->loc.name);
+        STACK_WIND(frame, trash_common_unwind_cbk, FIRST_CHILD(this),
+                   FIRST_CHILD(this)->fops->unlink, &local->loc, 0, xdata);
+        goto out;
+    }
+
+    LOCK(&frame->lock);
+    {
+        loop_count = ++local->loop_count;
+    }
+    UNLOCK(&frame->lock);
+
+    tmp_dirname = strchr(tmp_str, '/');
+
+    /* Path is not completed , need to create remaining path */
+    while (tmp_dirname) {
+        count = tmp_dirname - tmp_str;
+        if (count == 0)
+            count = 1;
+        i++;
+        if (i > loop_count)
+            break;
+        tmp_dirname = strchr(tmp_str + count + 1, '/');
+    }
+    tmp_path = gf_memdup(local->newpath, count + 1);
+    if (!tmp_path) {
+        gf_log(this->name, GF_LOG_ERROR, "out of memory");
+        ret = -1;
+        goto out;
+    }
+    tmp_path[count] = '\0';
+
+    loc_copy(&tmp_loc, &local->loc);
+    tmp_loc.path = gf_strdup(tmp_path);
+    if (!tmp_loc.path) {
+        gf_log(this->name, GF_LOG_ERROR, "out of memory");
+        ret = -1;
+        goto out;
+    }
+
+    /* Stores the name of the directory to be created */
+    tmp_loc.name = gf_strdup(strrchr(tmp_path, '/') + 1);
+    if (!tmp_loc.name) {
+        gf_log(this->name, GF_LOG_ERROR, "out of memory");
+        ret = -1;
+        goto out;
+    }
+
+    strncpy(real_path, priv->brick_path, sizeof(real_path));
+    real_path[sizeof(real_path) - 1] = 0;
+
+    remove_trash_path(tmp_path, (frame->root->pid < 0), &tmp_stat);
+    if (tmp_stat)
+        strncat(real_path, tmp_stat, sizeof(real_path) - strlen(real_path) - 1);
+
+    TRASH_SET_PID(frame, local);
+
+    STACK_WIND_COOKIE(frame, trash_unlink_mkdir_cbk, tmp_path,
+                      FIRST_CHILD(this), FIRST_CHILD(this)->fops->mkdir,
+                      &tmp_loc, get_permission(real_path), 0022, xdata);
+
+out:
+    if (tmp_path)
+        GF_FREE(tmp_path);
+    if (tmp_str)
+        GF_FREE(tmp_str);
+    return ret;
+}
+
+/**
+ * The file being unlinked should be renamed to a path starting
+ * from the trash directory as mentioned in the mount point
+ */
 int32_t
-trash_rename_mkdir_cbk (call_frame_t *frame,
-			void *cookie,
-			xlator_t *this,
-			int32_t op_ret,
-			int32_t op_errno,
-			inode_t *inode,
-			struct stat *stbuf)
-{
-	trash_local_t *local = frame->local;
-	char *tmp_str = strdup (local->newpath);
-
-	if (op_ret == -1 && op_errno == ENOENT) {
-		int32_t count = 0;
-		char *tmp_path = NULL;
-		char *tmp_dirname = strchr (tmp_str, '/');
-
-		while (tmp_dirname) {
-			count = tmp_dirname - tmp_str;
-			if (count == 0)
-				count = 1;
-			tmp_path = CALLOC (1, count + 2);
-			ERR_ABORT (tmp_path);
-			memcpy (tmp_path, local->newpath, count);
-			loc_t tmp_loc = {
-				.inode = NULL,
-				.path = tmp_path,
-			};
-
-			/* TODO:create the directory with proper permissions */
-			STACK_WIND_COOKIE (frame,
-					   trash_rename_mkdir_cbk,
-					   tmp_path,
-					   this->children->xlator,
-					   this->children->xlator->fops->mkdir,
-					   &tmp_loc,
-					   0777);
-			tmp_dirname = strchr (tmp_str + count + 1, '/');
-		}
-		free (cookie);
-		free (tmp_str);
-		return 0;
-	}
-	char *dir_name = dirname (tmp_str);
-	if (strcmp((char*)cookie, dir_name) == 0) {
-		loc_t new_loc = {
-			.inode = NULL,
-			.path = local->newpath
-		};
-		STACK_WIND (frame,
-			    trash_rename_rename_cbk,
-			    this->children->xlator,
-			    this->children->xlator->fops->rename,
-			    &local->loc2,
-			    &new_loc);
-
-	}
-	free (cookie); /* strdup (dir_name) was sent here :) */
-	free (tmp_str);
-	return 0;
-}
-
-
-/**
- * trash_unlink_rename_cbk -
+trash_unlink_rename_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+                        int32_t op_ret, int32_t op_errno, struct iatt *buf,
+                        struct iatt *preoldparent, struct iatt *postoldparent,
+                        struct iatt *prenewparent, struct iatt *postnewparent,
+                        dict_t *xdata)
+{
+    trash_local_t *local = NULL;
+    trash_private_t *priv = NULL;
+    char *tmp_str = NULL;
+    char *dir_name = NULL;
+    char *tmp_cookie = NULL;
+    loc_t tmp_loc = {
+        0,
+    };
+    dict_t *new_xdata = NULL;
+    char *tmp_stat = NULL;
+    char real_path[PATH_MAX] = {
+        0,
+    };
+    int ret = 0;
+
+    priv = this->private;
+    GF_VALIDATE_OR_GOTO("trash", priv, out);
+
+    local = frame->local;
+    GF_VALIDATE_OR_GOTO("trash", local, out);
+
+    if ((op_ret == -1) && (op_errno == ENOENT)) {
+        /* the file path does not exist we want to create path
+         * for the file
+         */
+        tmp_str = gf_strdup(local->newpath);
+        if (!tmp_str) {
+            gf_log(this->name, GF_LOG_DEBUG, "out of memory");
+            ret = ENOMEM;
+            goto out;
+        }
+        dir_name = dirname(tmp_str); /* stores directory name */
+
+        loc_copy(&tmp_loc, &local->loc);
+        tmp_loc.path = gf_strdup(dir_name);
+        if (!tmp_loc.path) {
+            gf_log(this->name, GF_LOG_ERROR, "out of memory");
+            ret = ENOMEM;
+            goto out;
+        }
+
+        tmp_cookie = gf_strdup(dir_name);
+        if (!tmp_cookie) {
+            gf_log(this->name, GF_LOG_DEBUG, "out of memory");
+            ret = ENOMEM;
+            goto out;
+        }
+        strncpy(real_path, priv->brick_path, sizeof(real_path));
+        real_path[sizeof(real_path) - 1] = 0;
+        remove_trash_path(tmp_str, (frame->root->pid < 0), &tmp_stat);
+        if (tmp_stat)
+            strncat(real_path, tmp_stat,
+                    sizeof(real_path) - strlen(real_path) - 1);
+
+        TRASH_SET_PID(frame, local);
+
+        /* create the directory with proper permissions */
+        STACK_WIND_COOKIE(frame, trash_unlink_mkdir_cbk, tmp_cookie,
+                          FIRST_CHILD(this), FIRST_CHILD(this)->fops->mkdir,
+                          &tmp_loc, get_permission(real_path), 0022, xdata);
+        loc_wipe(&tmp_loc);
+        goto out;
+    }
+
+    if ((op_ret == -1) && (op_errno == ENOTDIR)) {
+        /* if entry is already present in trash directory,
+         * new one is not copied*/
+        gf_log(this->name, GF_LOG_DEBUG,
+               "target(%s) exists, cannot keep the copy, deleting",
+               local->newpath);
+
+        STACK_WIND(frame, trash_common_unwind_cbk, FIRST_CHILD(this),
+                   FIRST_CHILD(this)->fops->unlink, &local->loc, 0, xdata);
+
+        goto out;
+    }
+
+    if ((op_ret == -1) && (op_errno == EISDIR)) {
+        /* if entry is directory,we remove directly */
+        gf_log(this->name, GF_LOG_DEBUG,
+               "target(%s) exists as directory, cannot keep copy, "
+               "deleting",
+               local->newpath);
+
+        STACK_WIND(frame, trash_common_unwind_cbk, FIRST_CHILD(this),
+                   FIRST_CHILD(this)->fops->unlink, &local->loc, 0, xdata);
+        goto out;
+    }
+
+    /**********************************************************************
+     *
+     * CTR Xlator message handling done here!
+     *
+     **********************************************************************/
+    /**
+     * If unlink is handled by trash translator, it should inform the
+     * CTR Xlator. And trash translator only handles the unlink for
+     * the last hardlink.
+     *
+     * Check if there is a GF_REQUEST_LINK_COUNT_XDATA from CTR Xlator
+     *
+     */
+
+    if (local->ctr_link_count_req) {
+        /* Sending back inode link count to ctr_unlink
+         * (changetimerecorder xlator) via
+         * "GF_RESPONSE_LINK_COUNT_XDATA" key using xdata.
+         * */
+        if (xdata) {
+            ret = dict_set_uint32(xdata, GF_RESPONSE_LINK_COUNT_XDATA, 1);
+            if (ret == -1) {
+                gf_log(this->name, GF_LOG_WARNING,
+                       "Failed to set"
+                       " GF_RESPONSE_LINK_COUNT_XDATA");
+            }
+        } else {
+            new_xdata = dict_new();
+            if (!new_xdata) {
+                gf_log(this->name, GF_LOG_WARNING,
+                       "Memory allocation failure while "
+                       "creating new_xdata");
+                goto ctr_out;
+            }
+            ret = dict_set_uint32(new_xdata, GF_RESPONSE_LINK_COUNT_XDATA, 1);
+            if (ret == -1) {
+                gf_log(this->name, GF_LOG_WARNING,
+                       "Failed to set"
+                       " GF_RESPONSE_LINK_COUNT_XDATA");
+            }
+        ctr_out:
+            TRASH_STACK_UNWIND(unlink, frame, 0, op_errno, preoldparent,
+                               postoldparent, new_xdata);
+            goto out;
+        }
+    }
+    /* All other cases, unlink should return success */
+    TRASH_STACK_UNWIND(unlink, frame, 0, op_errno, preoldparent, postoldparent,
+                       xdata);
+out:
+
+    if (tmp_str)
+        GF_FREE(tmp_str);
+    if (tmp_cookie)
+        GF_FREE(tmp_cookie);
+    if (new_xdata)
+        dict_unref(new_xdata);
+
+    return ret;
+}
+
+/**
+ * Unwinds the truncate call back from the trash translator
  */
 int32_t
-trash_rename_rename_cbk (call_frame_t *frame,
-			 void *cookie,
-			 xlator_t *this,
-			 int32_t op_ret,
-			 int32_t op_errno,
-			 struct stat *buf)
-{
-	trash_local_t *local = frame->local;
-	if (op_ret == -1 && op_errno == ENOENT) {
-		/* check for the errno, if its ENOENT create directory and call
-		 * rename later
-		 */
-		char *tmp_str = strdup (local->newpath);
-		char *dir_name = dirname (tmp_str);
-		loc_t tmp_loc = {
-			.inode = NULL,
-			.path = dir_name,
-		};
-		/* TODO: create the directory with proper permissions */
-		STACK_WIND_COOKIE (frame,
-				   trash_rename_mkdir_cbk,
-				   strdup (dir_name),
-				   this->children->xlator,
-				   this->children->xlator->fops->mkdir,
-				   &tmp_loc,
-				   0777);
-		free (tmp_str);
-		return 0;
-	} else if (op_ret == -1 && op_errno == ENOTDIR) {
-		gf_log (this->name, GF_LOG_WARNING,
-			"Target exists, cannot keep the dest entry %s, "
-			"renaming",
-			local->loc2.path);
-	} else if (op_ret == -1 && op_errno == EISDIR) {
-		gf_log (this->name, GF_LOG_WARNING,
-			"Target exists as a directory, cannot keep the "
-			"copy %s, renaming",
-			local->loc2.path);
-	}
-	loc_t new_loc = {
-		.inode = NULL,
-		.parent = local->loc2.parent,
-		.path = local->loc2.path,
-	};
-	STACK_WIND (frame,
-		    trash_common_unwind_buf_cbk,
-		    this->children->xlator,
-		    this->children->xlator->fops->rename,
-		    &local->loc1,
-		    &new_loc);
-
-	return 0;
-}
-
-/**
- * trash_rename_lookup_cbk -
+trash_common_unwind_buf_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+                            int32_t op_ret, int32_t op_errno,
+                            struct iatt *prebuf, struct iatt *postbuf,
+                            dict_t *xdata)
+{
+    TRASH_STACK_UNWIND(truncate, frame, op_ret, op_errno, prebuf, postbuf,
+                       xdata);
+    return 0;
+}
+
+int32_t
+trash_unlink_stat_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+                      int32_t op_ret, int32_t op_errno, struct iatt *buf,
+                      dict_t *xdata)
+{
+    trash_private_t *priv = NULL;
+    trash_local_t *local = NULL;
+    loc_t new_loc = {
+        0,
+    };
+    int ret = 0;
+
+    priv = this->private;
+    GF_VALIDATE_OR_GOTO("trash", priv, out);
+
+    local = frame->local;
+    GF_VALIDATE_OR_GOTO("trash", local, out);
+
+    if (op_ret == -1) {
+        gf_log(this->name, GF_LOG_DEBUG, "%s: %s", local->loc.path,
+               strerror(op_errno));
+        TRASH_STACK_UNWIND(unlink, frame, op_ret, op_errno, buf, NULL, xdata);
+        ret = -1;
+        goto out;
+    }
+
+    /* Only last hardlink will be moved to trash directory */
+    if (buf->ia_nlink > 1) {
+        STACK_WIND(frame, trash_common_unwind_cbk, FIRST_CHILD(this),
+                   FIRST_CHILD(this)->fops->unlink, &local->loc, 0, xdata);
+        goto out;
+    }
+
+    /* if the file is too big  just unlink it */
+    if (buf->ia_size > (priv->max_trash_file_size)) {
+        gf_log(this->name, GF_LOG_DEBUG,
+               "%s: file size too big (%" PRId64
+               ") to "
+               "move into trash directory",
+               local->loc.path, buf->ia_size);
+
+        STACK_WIND(frame, trash_common_unwind_cbk, FIRST_CHILD(this),
+                   FIRST_CHILD(this)->fops->unlink, &local->loc, 0, xdata);
+        goto out;
+    }
+
+    /* Copies new path for renaming */
+    loc_copy(&new_loc, &local->loc);
+    new_loc.path = gf_strdup(local->newpath);
+    if (!new_loc.path) {
+        gf_log(this->name, GF_LOG_DEBUG, "out of memory");
+        ret = ENOMEM;
+        goto out;
+    }
+
+    STACK_WIND(frame, trash_unlink_rename_cbk, FIRST_CHILD(this),
+               FIRST_CHILD(this)->fops->rename, &local->loc, &new_loc, xdata);
+
+out:
+    loc_wipe(&new_loc);
+
+    return ret;
+}
+
+/**
+ * Unlink is called internally by rm system call and also
+ * by internal operations of gluster such as self-heal
  */
 int32_t
-trash_rename_lookup_cbk (call_frame_t *frame,
-			 void *cookie,
-			 xlator_t *this,
-			 int32_t op_ret,
-			 int32_t op_errno,
-			 inode_t *inode,
-			 struct stat *buf,
-			 dict_t *xattr)
-{
-	trash_local_t *local = frame->local;
-
-	if (op_ret == -1) {
-		STACK_WIND (frame,
-			    trash_common_unwind_buf_cbk,
-			    this->children->xlator,
-			    this->children->xlator->fops->rename,
-			    &local->loc1,
-			    &local->loc2);
-		return 0;
-	}
-
-	loc_t oldloc = {
-		.parent = local->loc2.parent,
-		.inode = inode,
-		.path = local->loc2.path,
-	};
-	loc_t newloc = {
-		.inode = NULL,
-		.path = local->newpath
-	};
-	STACK_WIND (frame,
-		    trash_rename_rename_cbk,
-		    this->children->xlator,
-		    this->children->xlator->fops->rename,
-		    &oldloc,
-		    &newloc);
-
-	return 0;
-}
-
-
-/**
- * trash_rename -
+trash_unlink(call_frame_t *frame, xlator_t *this, loc_t *loc, int xflags,
+             dict_t *xdata)
+{
+    trash_private_t *priv = NULL;
+    trash_local_t *local = NULL; /* files inside trash */
+    int32_t match = 0;
+    int32_t ctr_link_req = 0;
+    char *pathbuf = NULL;
+    int ret = 0;
+
+    priv = this->private;
+    GF_VALIDATE_OR_GOTO("trash", priv, out);
+
+    /* If trash is not active or not enabled through cli, then
+     *  we bypass and wind back
+     */
+    if (!priv->state) {
+        STACK_WIND(frame, trash_common_unwind_cbk, FIRST_CHILD(this),
+                   FIRST_CHILD(this)->fops->unlink, loc, 0, xdata);
+        goto out;
+    }
+
+    /* Files removed by gluster internal operations such as self-heal
+     * (negative pid) are moved to the trash directory only when
+     * priv->internal is set; otherwise they are unlinked directly
+     */
+    if ((frame->root->pid < 0) && !priv->internal) {
+        STACK_WIND(frame, trash_common_unwind_cbk, FIRST_CHILD(this),
+                   FIRST_CHILD(this)->fops->unlink, loc, 0, xdata);
+        goto out;
+    }
+    /* loc need some gfid which will be present in inode */
+    gf_uuid_copy(loc->gfid, loc->inode->gfid);
+
+    /* Checking for valid location */
+    if (gf_uuid_is_null(loc->gfid) && gf_uuid_is_null(loc->inode->gfid)) {
+        gf_log(this->name, GF_LOG_DEBUG, "Bad address");
+        STACK_WIND(frame, trash_common_unwind_cbk, FIRST_CHILD(this),
+                   FIRST_CHILD(this)->fops->unlink, loc, 0, xdata);
+        ret = EFAULT;
+        goto out;
+    }
+
+    /* This will be more accurate */
+    inode_path(loc->inode, NULL, &pathbuf);
+    /* Check whether the file is present under eliminate paths or
+     * inside trash directory. In both cases we don't need to move the
+     * file to trash directory. Instead delete it permanently
+     */
+    match = check_whether_eliminate_path(priv->eliminate, pathbuf);
+    if ((strncmp(pathbuf, priv->newtrash_dir, strlen(priv->newtrash_dir)) ==
+         0) ||
+        (match)) {
+        if (match) {
+            gf_log(this->name, GF_LOG_DEBUG,
+                   "%s is a file comes under an eliminate path, "
+                   "so it is not moved to trash",
+                   loc->name);
+        }
+
+        /* Trying to unlink from the trash-dir. So do the
+         * actual unlink without moving to trash-dir.
+         */
+        STACK_WIND(frame, trash_common_unwind_cbk, FIRST_CHILD(this),
+                   FIRST_CHILD(this)->fops->unlink, loc, 0, xdata);
+        goto out;
+    }
+
+    local = mem_get0(this->local_pool);
+    if (!local) {
+        gf_log(this->name, GF_LOG_DEBUG, "out of memory");
+        TRASH_STACK_UNWIND(unlink, frame, -1, ENOMEM, NULL, NULL, xdata);
+        ret = ENOMEM;
+        goto out;
+    }
+    frame->local = local;
+    loc_copy(&local->loc, loc);
+
+    /* rename new location of file as starting from trash directory */
+    copy_trash_path(priv->newtrash_dir, (frame->root->pid < 0), local->newpath,
+                    sizeof(local->newpath));
+    strncat(local->newpath, pathbuf,
+            sizeof(local->newpath) - strlen(local->newpath) - 1);
+
+    /* append timestamp to file name so that we can avoid
+     * name collisions inside trash
+     */
+    append_time_stamp(local->newpath, sizeof(local->newpath));
+    if (strlen(local->newpath) > PATH_MAX) {
+        STACK_WIND(frame, trash_common_unwind_cbk, FIRST_CHILD(this),
+                   FIRST_CHILD(this)->fops->unlink, loc, 0, xdata);
+        goto out;
+    }
+
+    /* To know whether CTR xlator requested for the link count */
+    ret = dict_get_int32(xdata, GF_REQUEST_LINK_COUNT_XDATA, &ctr_link_req);
+    if (ret) {
+        local->ctr_link_count_req = _gf_false;
+        ret = 0;
+    } else
+        local->ctr_link_count_req = _gf_true;
+
+    LOCK_INIT(&frame->lock);
+
+    STACK_WIND(frame, trash_unlink_stat_cbk, FIRST_CHILD(this),
+               FIRST_CHILD(this)->fops->stat, loc, xdata);
+out:
+    return ret;
+}
+
+/**
+ * Use this when a failure occurs, and delete the newly created file
  */
 int32_t
-trash_rename (call_frame_t *frame,
-	      xlator_t *this,
-	      loc_t *oldloc,
-	      loc_t *newloc)
-{
-	trash_private_t *priv = this->private;
-	trash_local_t *local = NULL;
-	time_t       utime = 0;
-	struct tm   *tm = NULL;
-	char         timestr[256];
-
-	if (strncmp (oldloc->path, priv->trash_dir, 
-		     strlen(priv->trash_dir)) == 0) {
-		/* Trying to rename from the trash can dir, 
-		   do the actual rename */
-		STACK_WIND (frame,
-			    trash_common_unwind_buf_cbk,
-			    this->children->xlator,
-			    this->children->xlator->fops->rename,
-			    oldloc,
-			    newloc);
-	} else {
-		/* Trying to rename a regular file from GlusterFS */
-		local = CALLOC (1, sizeof (trash_local_t));
-		if (!local) {
-			STACK_UNWIND (frame, -1, ENOMEM, NULL);
-			return 0;
-		}
-		frame->local = local;
-		loc_copy (&local->loc1, oldloc);
-		loc_copy (&local->loc2, newloc);
-
-		strcpy (local->newpath, priv->trash_dir);
-		strcat (local->newpath, newloc->path);
-
-		utime = time (NULL);
-		tm    = localtime (&utime);
-		strftime (timestr, 256, ".%Y%m%d%H%M%S", tm); 
-		strcat (local->newpath, timestr);
-
-		/* Send a lookup call on newloc, to ensure we are not 
-		   overwriting */
-		STACK_WIND (frame,
-			    trash_rename_lookup_cbk,
-			    this->children->xlator,
-			    this->children->xlator->fops->lookup,
-			    newloc,
-			    0);
-	}
-	return 0;
-}
-
-/**
- * trash_init -
+trash_truncate_unlink_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+                          int32_t op_ret, int32_t op_errno,
+                          struct iatt *preparent, struct iatt *postparent,
+                          dict_t *xdata)
+{
+    trash_local_t *local = NULL;
+
+    local = frame->local;
+    GF_VALIDATE_OR_GOTO("trash", local, out);
+
+    if (op_ret == -1) {
+        gf_log(this->name, GF_LOG_DEBUG, "deleting the newly created file: %s",
+               strerror(op_errno));
+    }
+
+    STACK_WIND(frame, trash_common_unwind_buf_cbk, FIRST_CHILD(this),
+               FIRST_CHILD(this)->fops->truncate, &local->loc,
+               local->fop_offset, xdata);
+out:
+    return 0;
+}
+
+/**
+ * Read from source file
  */
 int32_t
-init (xlator_t *this)
-{
-  	data_t *trash_dir = NULL;
-	xlator_list_t *trav = NULL;
-	trash_private_t *_priv = NULL;
-
-	/* Create .trashcan directory in init */
-	if (!this->children || this->children->next) {
-		gf_log (this->name, GF_LOG_ERROR,
-			"not configured with exactly one child. exiting");
-		return -1;
-	}
-
-	if (!this->parents) {
-		gf_log (this->name, GF_LOG_WARNING,
-			"dangling volume. check volfile ");
-	}
-
-	trav = this->children;
-	while (trav->xlator->children)
-		trav = trav->xlator->children;
-
-	if (strncmp ("storage/", trav->xlator->type, 8))
-	{
-		gf_log (this->name, GF_LOG_ERROR,
-			"'trash' translator not loaded over storage "
-			"translator, not a supported setup");
-		return -1;
-	}
-
-	_priv = CALLOC (1, sizeof (*_priv));
-	ERR_ABORT (_priv);
-
-	trash_dir = dict_get (this->options, "trash-dir");
-	if (!trash_dir) {
-		gf_log (this->name, GF_LOG_WARNING,
-			"no option specified for 'trash-dir', "
-			"using \"/.trashcan/\"");
-		strcpy (_priv->trash_dir, "/.trashcan");
-	} else {
-		/* Need a path with '/' as the first char, if not 
-		   given, append it */
-		if (trash_dir->data[0] == '/') {
-			strcpy (_priv->trash_dir, trash_dir->data);
-		} else {
-			strcpy (_priv->trash_dir, "/");
-			strcat (_priv->trash_dir, trash_dir->data);
-		}
-	}
-
-	this->private = (void *)_priv;
-	return 0;
+trash_truncate_readv_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+                         int32_t op_ret, int32_t op_errno, struct iovec *vector,
+                         int32_t count, struct iatt *stbuf,
+                         struct iobref *iobuf, dict_t *xdata)
+{
+    trash_local_t *local = NULL;
+
+    local = frame->local;
+    GF_VALIDATE_OR_GOTO("trash", local, out);
+
+    if (op_ret == -1) {
+        gf_log(this->name, GF_LOG_DEBUG,
+               "readv on the existing file failed: %s", strerror(op_errno));
+
+        STACK_WIND(frame, trash_truncate_unlink_cbk, FIRST_CHILD(this),
+                   FIRST_CHILD(this)->fops->unlink, &local->newloc, 0, xdata);
+        goto out;
+    }
+
+    local->fsize = stbuf->ia_size;
+    STACK_WIND(frame, trash_truncate_writev_cbk, FIRST_CHILD(this),
+               FIRST_CHILD(this)->fops->writev, local->newfd, vector, count,
+               local->cur_offset, 0, iobuf, xdata);
+
+out:
+    return 0;
 }
 
-void
-fini (xlator_t *this)
+/**
+ * Write to file created in trash directory
+ */
+int32_t
+trash_truncate_writev_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+                          int32_t op_ret, int32_t op_errno, struct iatt *prebuf,
+                          struct iatt *postbuf, dict_t *xdata)
+{
+    trash_local_t *local = NULL;
+
+    local = frame->local;
+    GF_VALIDATE_OR_GOTO("trash", local, out);
+
+    if (op_ret == -1) {
+        /* Let truncate work, but previous copy is not preserved. */
+        gf_log(this->name, GF_LOG_DEBUG,
+               "writev on the existing file failed: %s", strerror(op_errno));
+
+        STACK_WIND(frame, trash_truncate_unlink_cbk, FIRST_CHILD(this),
+                   FIRST_CHILD(this)->fops->unlink, &local->newloc, 0, xdata);
+        goto out;
+    }
+
+    if (local->cur_offset < local->fsize) {
+        local->cur_offset += GF_BLOCK_READV_SIZE;
+        /* Loop back and Read the contents again. */
+        STACK_WIND(frame, trash_truncate_readv_cbk, FIRST_CHILD(this),
+                   FIRST_CHILD(this)->fops->readv, local->fd,
+                   (size_t)GF_BLOCK_READV_SIZE, local->cur_offset, 0, xdata);
+        goto out;
+    }
+
+    /* OOFH.....Finally calling Truncate. */
+    STACK_WIND(frame, trash_common_unwind_buf_cbk, FIRST_CHILD(this),
+               FIRST_CHILD(this)->fops->truncate, &local->loc,
+               local->fop_offset, xdata);
+
+out:
+    return 0;
+}
+
+/**
+ * The source file is opened for reading and writing
+ */
+int32_t
+trash_truncate_open_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+                        int32_t op_ret, int32_t op_errno, fd_t *fd,
+                        dict_t *xdata)
 {
-	trash_private_t *priv = this->private;
-	FREE (priv);
-	return;
+    trash_local_t *local = NULL;
+
+    local = frame->local;
+    GF_VALIDATE_OR_GOTO("trash", local, out);
+
+    if (op_ret == -1) {
+        /* Let truncate work, but previous copy is not preserved. */
+        gf_log(this->name, GF_LOG_DEBUG, "open on the existing file failed: %s",
+               strerror(op_errno));
+
+        STACK_WIND(frame, trash_truncate_unlink_cbk, FIRST_CHILD(this),
+                   FIRST_CHILD(this)->fops->unlink, &local->newloc, 0, xdata);
+        goto out;
+    }
+
+    fd_bind(fd);
+
+    local->cur_offset = 0;
+
+    STACK_WIND(frame, trash_truncate_readv_cbk, FIRST_CHILD(this),
+               FIRST_CHILD(this)->fops->readv, local->fd,
+               (size_t)GF_BLOCK_READV_SIZE, local->cur_offset, 0, xdata);
+
+out:
+    return 0;
 }
 
+/**
+ * Creates new file descriptor for read and write operations,
+ * if the path is present in trash directory
+ *
+ * Callback of the create fop wound on local->newloc (the copy that is to
+ * live inside the trash directory) while handling a truncate:
+ *  - create failed with ENOENT: a mkdir is wound for
+ *    dirname(local->newpath), with trash_truncate_mkdir_cbk as callback;
+ *  - create failed otherwise: the truncate is wound on local->loc without
+ *    preserving a copy;
+ *  - create succeeded: an fd for the source file is created in local->fd
+ *    and an O_RDONLY open is wound, continuing in trash_truncate_open_cbk.
+ *
+ * Always returns 0.
+ */
+int32_t
+trash_truncate_create_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+                          int32_t op_ret, int32_t op_errno, fd_t *fd,
+                          inode_t *inode, struct iatt *buf,
+                          struct iatt *preparent, struct iatt *postparent,
+                          dict_t *xdata)
+{
+    trash_local_t *local = NULL;
+    char *tmp_str = NULL;
+    char *dir_name = NULL;
+    char *tmp_path = NULL;
+    int32_t flags = 0;
+    loc_t tmp_loc = {
+        0,
+    };
+    char *tmp_stat = NULL;
+    char real_path[PATH_MAX] = {
+        0,
+    };
+    trash_private_t *priv = NULL;
 
-struct xlator_fops fops = {
-	.unlink = trash_unlink,
-	.rename = trash_rename,
-};
+    priv = this->private;
+    GF_VALIDATE_OR_GOTO("trash", priv, out);
 
-struct xlator_mops mops = {
+    local = frame->local;
+    GF_VALIDATE_OR_GOTO("trash", local, out);
 
-};
+    TRASH_UNSET_PID(frame, local);
+
+    /* Checks whether path is present in trash directory or not */
+
+    if ((op_ret == -1) && (op_errno == ENOENT)) {
+        /* Creating the directory structure here. */
+        /* NOTE(review): the allocation-failure exits below neither wind nor
+         * unwind the frame — confirm the caller does not wait forever. */
+        tmp_str = gf_strdup(local->newpath);
+        if (!tmp_str) {
+            gf_log(this->name, GF_LOG_DEBUG, "out of memory");
+            goto out;
+        }
+        /* dirname() works on the private copy, not on local->newpath */
+        dir_name = dirname(tmp_str);
+
+        tmp_path = gf_strdup(dir_name);
+        if (!tmp_path) {
+            gf_log(this->name, GF_LOG_DEBUG, "out of memory");
+            goto out;
+        }
+        loc_copy(&tmp_loc, &local->newloc);
+        tmp_loc.path = gf_strdup(tmp_path);
+        if (!tmp_loc.path) {
+            /* NOTE(review): tmp_loc was filled by loc_copy() and is not
+             * wiped on this exit — verify whether that leaks. */
+            gf_log(this->name, GF_LOG_DEBUG, "out of memory");
+            goto out;
+        }
+        /* real_path = brick path + directory path with the trash prefix
+         * removed; it is only used to look up the permissions to apply */
+        strncpy(real_path, priv->brick_path, sizeof(real_path));
+        real_path[sizeof(real_path) - 1] = 0;
+        remove_trash_path(tmp_path, (frame->root->pid < 0), &tmp_stat);
+        if (tmp_stat)
+            strncat(real_path, tmp_stat,
+                    sizeof(real_path) - strlen(real_path) - 1);
+
+        TRASH_SET_PID(frame, local);
+
+        /* create the directory with proper permissions */
+        /* tmp_path is passed as the cookie of the mkdir callback and is
+         * freed at 'out' right after the wind returns */
+        STACK_WIND_COOKIE(frame, trash_truncate_mkdir_cbk, tmp_path,
+                          FIRST_CHILD(this), FIRST_CHILD(this)->fops->mkdir,
+                          &tmp_loc, get_permission(real_path), 0022, xdata);
+        loc_wipe(&tmp_loc);
+        goto out;
+    }
+
+    if (op_ret == -1) {
+        /* Let truncate work, but previous copy is not preserved.
+         * Deleting the newly created copy.
+         */
+        gf_log(this->name, GF_LOG_DEBUG,
+               "creation of new file in trash-dir failed, "
+               "when truncate was called: %s",
+               strerror(op_errno));
+
+        STACK_WIND(frame, trash_common_unwind_buf_cbk, FIRST_CHILD(this),
+                   FIRST_CHILD(this)->fops->truncate, &local->loc,
+                   local->fop_offset, xdata);
+        goto out;
+    }
+
+    /* The copy inside the trash directory now exists */
+    fd_bind(fd);
+    flags = O_RDONLY;
+
+    /* fd which represents source file for reading and writing from it */
+
+    local->fd = fd_create(local->loc.inode, frame->root->pid);
+
+    STACK_WIND(frame, trash_truncate_open_cbk, FIRST_CHILD(this),
+               FIRST_CHILD(this)->fops->open, &local->loc, flags, local->fd, 0);
+out:
+    if (tmp_str)
+        GF_FREE(tmp_str);
+    if (tmp_path)
+        GF_FREE(tmp_path);
+
+    return 0;
+}
+
+/**
+ * If the path is not present in the trash directory, it will recursively call
+ * this call-back and one by one directories will be created from the
+ * beginning
+ *
+ * Callback of the mkdir fops wound while building the directory chain of
+ * local->newpath. local->loop_count selects how many leading path
+ * components the next mkdir covers:
+ *  - ENOENT: mkdir is re-wound for the prefix selected by the current
+ *    loop_count;
+ *  - success on dirname(local->newpath) (compared against the cookie): the
+ *    create of the trash copy is wound again;
+ *  - failure other than EEXIST: the truncate is wound without keeping a
+ *    copy;
+ *  - otherwise (EEXIST, or success on an intermediate directory):
+ *    loop_count is incremented and mkdir is wound for the longer prefix.
+ *
+ * Returns 0, or ENOMEM when an allocation failed.
+ */
+int32_t
+trash_truncate_mkdir_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+                         int32_t op_ret, int32_t op_errno, inode_t *inode,
+                         struct iatt *stbuf, struct iatt *preparent,
+                         struct iatt *postparent, dict_t *xdata)
+{
+    trash_local_t *local = NULL;
+    trash_private_t *priv = NULL;
+    char *tmp_str = NULL;
+    char *tmp_path = NULL;
+    char *tmp_dirname = NULL;
+    char *dir_name = NULL;
+    char *tmp_stat = NULL;
+    char real_path[PATH_MAX] = {
+        0,
+    };
+    size_t count = 0;
+    int32_t flags = 0;
+    int32_t loop_count = 0;
+    int i = 0;
+    loc_t tmp_loc = {
+        0,
+    };
+    int ret = 0;
+
+    priv = this->private;
+    GF_VALIDATE_OR_GOTO("trash", priv, out);
+
+    local = frame->local;
+    GF_VALIDATE_OR_GOTO("trash", local, out);
+
+    loop_count = local->loop_count;
+
+    TRASH_UNSET_PID(frame, local);
+
+    tmp_str = gf_strdup(local->newpath);
+    if (!tmp_str) {
+        gf_log(this->name, GF_LOG_DEBUG, "out of memory");
+        ret = ENOMEM;
+        goto out;
+    }
+
+    if ((op_ret == -1) && (op_errno == ENOENT)) {
+        /* Walk the '/' separators: count ends up as the length of the
+         * prefix that stops at separator number loop_count + 1 (a leading
+         * '/' counts as a prefix of length 1). */
+        tmp_dirname = strchr(tmp_str, '/');
+        while (tmp_dirname) {
+            count = tmp_dirname - tmp_str;
+            if (count == 0)
+                count = 1;
+            i++;
+            if (i > loop_count)
+                break;
+            tmp_dirname = strchr(tmp_str + count + 1, '/');
+        }
+        tmp_path = gf_memdup(local->newpath, count + 1);
+        if (!tmp_path) {
+            gf_log(this->name, GF_LOG_DEBUG, "out of memory");
+            ret = ENOMEM;
+            goto out;
+        }
+        tmp_path[count] = '\0';
+
+        loc_copy(&tmp_loc, &local->newloc);
+        tmp_loc.path = gf_strdup(tmp_path);
+        if (!tmp_loc.path) {
+            gf_log(this->name, GF_LOG_DEBUG, "out of memory");
+            ret = ENOMEM;
+            goto out;
+        }
+
+        /* Stores the name of directory to be created */
+        tmp_loc.name = gf_strdup(strrchr(tmp_path, '/') + 1);
+        if (!tmp_loc.name) {
+            gf_log(this->name, GF_LOG_DEBUG, "out of memory");
+            ret = ENOMEM;
+            goto out;
+        }
+        /* real_path = brick path + directory path with the trash prefix
+         * removed; only used to look up the permissions to apply */
+        strncpy(real_path, priv->brick_path, sizeof(real_path));
+        real_path[sizeof(real_path) - 1] = 0;
+        remove_trash_path(tmp_path, (frame->root->pid < 0), &tmp_stat);
+        if (tmp_stat)
+            strncat(real_path, tmp_stat,
+                    sizeof(real_path) - strlen(real_path) - 1);
+
+        TRASH_SET_PID(frame, local);
+
+        /* tmp_path is the cookie of the next callback; tmp_loc is wiped
+         * at 'out' */
+        STACK_WIND_COOKIE(frame, trash_truncate_mkdir_cbk, tmp_path,
+                          FIRST_CHILD(this), FIRST_CHILD(this)->fops->mkdir,
+                          &tmp_loc, get_permission(real_path), 0022, xdata);
+        goto out;
+    }
+
+    if (op_ret == 0) {
+        /* dirname() may modify tmp_str in place; the scan further down
+         * keeps using tmp_str exactly as the code always did */
+        dir_name = dirname(tmp_str);
+        if (strcmp((char *)cookie, dir_name) == 0) {
+            flags = O_CREAT | O_EXCL | O_WRONLY;
+            strncpy(real_path, priv->brick_path, sizeof(real_path));
+            real_path[sizeof(real_path) - 1] = 0;
+            strncat(real_path, local->origpath,
+                    sizeof(real_path) - strlen(real_path) - 1);
+            /* Call create again once directory structure
+               is created. */
+
+            TRASH_SET_PID(frame, local);
+
+            STACK_WIND(frame, trash_truncate_create_cbk, FIRST_CHILD(this),
+                       FIRST_CHILD(this)->fops->create, &local->newloc, flags,
+                       get_permission(real_path), 0022, local->newfd, xdata);
+            goto out;
+        }
+    }
+
+    if ((op_ret == -1) && (op_errno != EEXIST)) {
+        gf_log(this->name, GF_LOG_ERROR,
+               "Directory creation failed [%s]. "
+               "Therefore truncating %s without moving the "
+               "original copy to trash directory",
+               strerror(op_errno), local->loc.name);
+        STACK_WIND(frame, trash_common_unwind_buf_cbk, FIRST_CHILD(this),
+                   FIRST_CHILD(this)->fops->truncate, &local->loc,
+                   local->fop_offset, xdata);
+        goto out;
+    }
+
+    /* This level exists now: move on to the next path component */
+    LOCK(&frame->lock);
+    {
+        loop_count = ++local->loop_count;
+    }
+    UNLOCK(&frame->lock);
+
+    tmp_dirname = strchr(tmp_str, '/');
+    while (tmp_dirname) {
+        count = tmp_dirname - tmp_str;
+        if (count == 0)
+            count = 1;
+        i++;
+        if (i > loop_count)
+            break;
+        tmp_dirname = strchr(tmp_str + count + 1, '/');
+    }
+    tmp_path = gf_memdup(local->newpath, count + 1);
+    if (!tmp_path) {
+        gf_log(this->name, GF_LOG_DEBUG, "out of memory");
+        ret = ENOMEM;
+        goto out;
+    }
+    tmp_path[count] = '\0';
+
+    loc_copy(&tmp_loc, &local->newloc);
+    tmp_loc.path = gf_strdup(tmp_path);
+    if (!tmp_loc.path) {
+        gf_log(this->name, GF_LOG_DEBUG, "out of memory");
+        ret = ENOMEM;
+        goto out;
+    }
+
+    /* Stores the name of directory to be created */
+    tmp_loc.name = gf_strdup(strrchr(tmp_path, '/') + 1);
+    if (!tmp_loc.name) {
+        gf_log(this->name, GF_LOG_DEBUG, "out of memory");
+        /* report the failure like every other allocation error here */
+        ret = ENOMEM;
+        goto out;
+    }
+
+    strncpy(real_path, priv->brick_path, sizeof(real_path));
+    real_path[sizeof(real_path) - 1] = 0;
+    remove_trash_path(tmp_path, (frame->root->pid < 0), &tmp_stat);
+    if (tmp_stat)
+        strncat(real_path, tmp_stat, sizeof(real_path) - strlen(real_path) - 1);
+
+    TRASH_SET_PID(frame, local);
+
+    STACK_WIND_COOKIE(frame, trash_truncate_mkdir_cbk, tmp_path,
+                      FIRST_CHILD(this), FIRST_CHILD(this)->fops->mkdir,
+                      &tmp_loc, get_permission(real_path), 0022, xdata);
+
+out:
+    /* Release what loc_copy() took on every exit. tmp_loc is still
+     * zero-filled when no copy was made.
+     * NOTE(review): the path duplicated by loc_copy() is overwritten above
+     * and tmp_loc.name is a separate allocation — confirm whether
+     * loc_wipe() covers them. */
+    loc_wipe(&tmp_loc);
+    if (tmp_str)
+        GF_FREE(tmp_str);
+    if (tmp_path)
+        GF_FREE(tmp_path);
+
+    return ret;
+}
+
+/**
+ * Callback of the stat/fstat wound by trash_truncate()/trash_ftruncate().
+ *
+ * Decides whether the file is copied into the trash directory before it is
+ * truncated. The truncate is wound directly when the file has more than
+ * one hard link, is bigger than priv->max_trash_file_size, or is not
+ * shrunk by the requested offset. Otherwise local->newloc/local->newfd are
+ * prepared for the trash copy and its create is wound, continuing in
+ * trash_truncate_create_cbk.
+ *
+ * When preparing the copy fails nothing has been wound, so the truncate is
+ * unwound with the error instead of leaving the frame pending.
+ *
+ * Returns 0, or the errno value of the failure (ENOMEM, EINVAL).
+ */
+int32_t
+trash_truncate_stat_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+                        int32_t op_ret, int32_t op_errno, struct iatt *buf,
+                        dict_t *xdata)
+{
+    trash_private_t *priv = NULL;
+    trash_local_t *local = NULL;
+    char loc_newname[PATH_MAX] = {
+        0,
+    };
+    int32_t flags = 0;
+    dentry_t *dir_entry = NULL;
+    inode_table_t *table = NULL;
+    int ret = 0;
+
+    priv = this->private;
+    GF_VALIDATE_OR_GOTO("trash", priv, out);
+
+    local = frame->local;
+    GF_VALIDATE_OR_GOTO("trash", local, out);
+
+    table = local->loc.inode->table;
+
+    /* Any dentry of the inode will do; it is used for the parent below */
+    pthread_mutex_lock(&table->lock);
+    {
+        dir_entry = __dentry_search_arbit(local->loc.inode);
+    }
+    pthread_mutex_unlock(&table->lock);
+
+    if (op_ret == -1) {
+        gf_log(this->name, GF_LOG_DEBUG, "fstat on the file failed: %s",
+               strerror(op_errno));
+
+        TRASH_STACK_UNWIND(truncate, frame, op_ret, op_errno, buf, NULL, xdata);
+        goto out;
+    }
+
+    /* Only last hardlink will be moved to trash directory */
+    if (buf->ia_nlink > 1) {
+        STACK_WIND(frame, trash_common_unwind_buf_cbk, FIRST_CHILD(this),
+                   FIRST_CHILD(this)->fops->truncate, &local->loc,
+                   local->fop_offset, xdata);
+        goto out;
+    }
+
+    /**
+     * If the file is too big or if it is extended truncate,
+     * just don't move it to trash directory.
+     */
+    if (buf->ia_size > (priv->max_trash_file_size)) {
+        gf_log(this->name, GF_LOG_DEBUG,
+               "%s: file is too large to move to trash", local->loc.path);
+
+        STACK_WIND(frame, trash_common_unwind_buf_cbk, FIRST_CHILD(this),
+                   FIRST_CHILD(this)->fops->truncate, &local->loc,
+                   local->fop_offset, xdata);
+        goto out;
+    }
+    if (buf->ia_size <= local->fop_offset) {
+        gf_log(this->name, GF_LOG_DEBUG,
+               "%s: truncate does not shrink the file, "
+               "not moving to trash",
+               local->loc.path);
+
+        STACK_WIND(frame, trash_common_unwind_buf_cbk, FIRST_CHILD(this),
+                   FIRST_CHILD(this)->fops->truncate, &local->loc,
+                   local->fop_offset, xdata);
+        goto out;
+    }
+
+    /* Retrieves the name of file from path */
+    local->loc.name = gf_strdup(strrchr(local->loc.path, '/'));
+    if (!local->loc.name) {
+        gf_log(this->name, GF_LOG_DEBUG, "out of memory");
+        ret = ENOMEM;
+        goto unwind;
+    }
+
+    /* Stores new path for source file */
+    copy_trash_path(priv->newtrash_dir, (frame->root->pid < 0), local->newpath,
+                    sizeof(local->newpath));
+    strncat(local->newpath, local->loc.path,
+            sizeof(local->newpath) - strlen(local->newpath) - 1);
+
+    /* append timestamp to file name so that we can avoid
+       name collisions inside trash */
+    append_time_stamp(local->newpath, sizeof(local->newpath));
+    /* NOTE(review): newpath is written with sizeof(local->newpath) as the
+     * bound, so this length test looks unreachable — confirm and replace
+     * it with a real truncation check if so. */
+    if (strlen(local->newpath) > PATH_MAX) {
+        STACK_WIND(frame, trash_common_unwind_buf_cbk, FIRST_CHILD(this),
+                   FIRST_CHILD(this)->fops->truncate, &local->loc,
+                   local->fop_offset, xdata);
+        goto out;
+    }
+
+    strncpy(loc_newname, local->loc.name, sizeof(loc_newname));
+    loc_newname[sizeof(loc_newname) - 1] = 0;
+    append_time_stamp(loc_newname, sizeof(loc_newname));
+    /* local->newloc represents old file(file inside trash),
+       where as local->loc represents truncated file. We need
+       to create new inode and fd for new file*/
+    local->newloc.name = gf_strdup(loc_newname);
+    if (!local->newloc.name) {
+        gf_log(this->name, GF_LOG_DEBUG, "out of memory");
+        ret = ENOMEM;
+        goto unwind;
+    }
+    local->newloc.path = gf_strdup(local->newpath);
+    if (!local->newloc.path) {
+        gf_log(this->name, GF_LOG_DEBUG, "out of memory");
+        ret = ENOMEM;
+        goto unwind;
+    }
+    local->newloc.inode = inode_new(local->loc.inode->table);
+    if (!local->newloc.inode) {
+        gf_log(this->name, GF_LOG_DEBUG, "out of memory");
+        ret = ENOMEM;
+        goto unwind;
+    }
+    local->newfd = fd_create(local->newloc.inode, frame->root->pid);
+    if (!local->newfd) {
+        gf_log(this->name, GF_LOG_DEBUG, "out of memory");
+        ret = ENOMEM;
+        goto unwind;
+    }
+
+    /* Creating valid parent and pargfids for both files */
+
+    if (dir_entry == NULL) {
+        ret = EINVAL;
+        goto unwind;
+    }
+    local->loc.parent = inode_ref(dir_entry->parent);
+    gf_uuid_copy(local->loc.pargfid, dir_entry->parent->gfid);
+
+    local->newloc.parent = inode_ref(dir_entry->parent);
+    gf_uuid_copy(local->newloc.pargfid, dir_entry->parent->gfid);
+
+    flags = O_CREAT | O_EXCL | O_WRONLY;
+
+    TRASH_SET_PID(frame, local);
+
+    STACK_WIND(frame, trash_truncate_create_cbk, FIRST_CHILD(this),
+               FIRST_CHILD(this)->fops->create, &local->newloc, flags,
+               st_mode_from_ia(buf->ia_prot, local->loc.inode->ia_type), 0022,
+               local->newfd, xdata);
+
+out:
+    return ret;
+
+unwind:
+    /* Nothing was wound on these paths: fail the fop so the caller is not
+     * left waiting for a reply. */
+    TRASH_STACK_UNWIND(truncate, frame, -1, ret, NULL, NULL, xdata);
+    return ret;
+}
+
+/**
+ * Truncate can be explicitly called or implicitly by some other applications
+ * like text editors etc..
+ *
+ * The truncate is wound straight to the child when the translator is
+ * disabled, when the request is internal (negative pid) and internal
+ * operations are not trashed, when the path cannot be resolved, or when
+ * the file lives in the trash directory or matches an eliminate path.
+ * Otherwise a trash_local_t is set up and a stat is wound, continuing in
+ * trash_truncate_stat_cbk.
+ *
+ * Returns 0, or ENOMEM when the local could not be allocated.
+ */
+int32_t
+trash_truncate(call_frame_t *frame, xlator_t *this, loc_t *loc, off_t offset,
+               dict_t *xdata)
+{
+    trash_private_t *priv = NULL;
+    trash_local_t *local = NULL;
+    int32_t match = 0;
+    char *pathbuf = NULL;
+    int ret = 0;
+
+    priv = this->private;
+    GF_VALIDATE_OR_GOTO("trash", priv, out);
+    /* If trash is not active or not enabled through cli, then
+     * we bypass and wind back
+     */
+    if (!priv->state) {
+        STACK_WIND(frame, trash_common_unwind_buf_cbk, FIRST_CHILD(this),
+                   FIRST_CHILD(this)->fops->truncate, loc, offset, xdata);
+        goto out;
+    }
+
+    /* The files removed by gluster operations such as self-heal,
+       should moved to trash directory, but files by client should
+       not moved */
+    if ((frame->root->pid < 0) && !priv->internal) {
+        STACK_WIND(frame, trash_common_unwind_buf_cbk, FIRST_CHILD(this),
+                   FIRST_CHILD(this)->fops->truncate, loc, offset, xdata);
+        goto out;
+    }
+    /* This will be more accurate */
+    inode_path(loc->inode, NULL, &pathbuf);
+    if (!pathbuf) {
+        /* Without a path none of the checks below can be made, so the
+         * file is truncated without keeping a copy. */
+        gf_log(this->name, GF_LOG_DEBUG,
+               "unable to resolve the path, truncating without "
+               "moving to trash");
+        STACK_WIND(frame, trash_common_unwind_buf_cbk, FIRST_CHILD(this),
+                   FIRST_CHILD(this)->fops->truncate, loc, offset, xdata);
+        goto out;
+    }
+
+    /* Checks whether file is in trash directory or eliminate path.
+     * In all such cases it does not move to trash directory,
+     * truncate will be performed
+     */
+    match = check_whether_eliminate_path(priv->eliminate, pathbuf);
+
+    if ((strncmp(pathbuf, priv->newtrash_dir, strlen(priv->newtrash_dir)) ==
+         0) ||
+        (match)) {
+        if (match) {
+            gf_log(this->name, GF_LOG_DEBUG,
+                   "%s: file not moved to trash as per option "
+                   "'eliminate path'",
+                   loc->path);
+        }
 
-struct xlator_cbks cbks = {
+        /* Trying to truncate from the trash-dir. So do the
+         * actual truncate without moving to trash-dir.
+         */
+        STACK_WIND(frame, trash_common_unwind_buf_cbk, FIRST_CHILD(this),
+                   FIRST_CHILD(this)->fops->truncate, loc, offset, xdata);
+        goto out;
+    }
+
+    LOCK_INIT(&frame->lock);
+
+    local = mem_get0(this->local_pool);
+    if (!local) {
+        gf_log(this->name, GF_LOG_DEBUG, "out of memory");
+        TRASH_STACK_UNWIND(truncate, frame, -1, ENOMEM, NULL, NULL, xdata);
+        ret = ENOMEM;
+        goto out;
+    }
+
+    strncpy(local->origpath, pathbuf, sizeof(local->origpath));
+    local->origpath[sizeof(local->origpath) - 1] = 0;
+
+    loc_copy(&local->loc, loc);
+    /* NOTE(review): this overwrites the path set by loc_copy() — confirm
+     * whether that copy needs to be released first. */
+    local->loc.path = pathbuf;
+    /* local->loc owns the buffer from here on */
+    pathbuf = NULL;
+    local->fop_offset = offset;
+
+    frame->local = local;
+
+    STACK_WIND(frame, trash_truncate_stat_cbk, FIRST_CHILD(this),
+               FIRST_CHILD(this)->fops->stat, loc, xdata);
+
+out:
+    /* Still set only when the path was not handed over to local->loc */
+    if (pathbuf)
+        GF_FREE(pathbuf);
+    return ret;
+}
+
+/**
+ * When we call truncate from terminal it comes to ftruncate of trash-xlator.
+ * Since truncate internally calls ftruncate and we receive fd of the file,
+ * other than that it also called by Rebalance operation
+ *
+ * The ftruncate is wound straight to the child when the translator is
+ * disabled, when the request is internal (negative pid) and internal
+ * operations are not trashed, when the path of the fd cannot be resolved,
+ * or when the file lives in the trash directory or matches an eliminate
+ * path. Otherwise a trash_local_t describing the file is built from the
+ * fd and an fstat is wound, continuing in trash_truncate_stat_cbk.
+ *
+ * Returns 0, or -1 when the local could not be allocated.
+ */
+int32_t
+trash_ftruncate(call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset,
+                dict_t *xdata)
+{
+    trash_private_t *priv = NULL;
+    trash_local_t *local = NULL; /* file inside trash */
+    char *pathbuf = NULL;        /* path of file from fd */
+    int32_t retval = 0;
+    int32_t match = 0;
+    int ret = 0;
+
+    priv = this->private;
+    GF_VALIDATE_OR_GOTO("trash", priv, out);
+    /* If trash is not active or not enabled through cli, then
+     * we bypass and wind back
+     */
+    if (!priv->state) {
+        STACK_WIND(frame, trash_common_unwind_buf_cbk, FIRST_CHILD(this),
+                   FIRST_CHILD(this)->fops->ftruncate, fd, offset, xdata);
+        goto out;
+    }
+
+    /* The files removed by gluster operations such as self-heal,
+     * should moved to trash directory, but files by client
+     * should not moved
+     */
+    if ((frame->root->pid < 0) && !priv->internal) {
+        STACK_WIND(frame, trash_common_unwind_buf_cbk, FIRST_CHILD(this),
+                   FIRST_CHILD(this)->fops->ftruncate, fd, offset, xdata);
+        goto out;
+    }
+    /* This will be more accurate */
+    retval = inode_path(fd->inode, NULL, &pathbuf);
+    if ((retval <= 0) || !pathbuf) {
+        /* The path has to be validated before it is compared below;
+         * without one the file is truncated without keeping a copy. */
+        STACK_WIND(frame, trash_common_unwind_buf_cbk, FIRST_CHILD(this),
+                   FIRST_CHILD(this)->fops->ftruncate, fd, offset, xdata);
+        goto out;
+    }
+
+    /* Checking  the eliminate path */
+
+    /* Checks whether file is trash directory or eliminate path.
+     * In all such cases it does not move to trash directory,
+     * ftruncate will be performed
+     */
+    match = check_whether_eliminate_path(priv->eliminate, pathbuf);
+    if ((strncmp(pathbuf, priv->newtrash_dir, strlen(priv->newtrash_dir)) ==
+         0) ||
+        match) {
+        if (match) {
+            gf_log(this->name, GF_LOG_DEBUG,
+                   "%s: file matches eliminate path, "
+                   "not moved to trash",
+                   pathbuf);
+        }
+
+        /* Trying to ftruncate from the trash-dir. So do the
+         * actual ftruncate without moving to trash-dir
+         */
+        STACK_WIND(frame, trash_common_unwind_buf_cbk, FIRST_CHILD(this),
+                   FIRST_CHILD(this)->fops->ftruncate, fd, offset, xdata);
+        goto out;
+    }
+
+    local = mem_get0(this->local_pool);
+    if (!local) {
+        gf_log(this->name, GF_LOG_DEBUG, "out of memory");
+        TRASH_STACK_UNWIND(ftruncate, frame, -1, ENOMEM, NULL, NULL, xdata);
+        ret = -1;
+        goto out;
+    }
+
+    strncpy(local->origpath, pathbuf, sizeof(local->origpath));
+    local->origpath[sizeof(local->origpath) - 1] = 0;
+
+    /* To convert fd to location */
+    frame->local = local;
+
+    local->loc.path = pathbuf;
+    /* local->loc owns the buffer from here on */
+    pathbuf = NULL;
+    local->loc.inode = inode_ref(fd->inode);
+    gf_uuid_copy(local->loc.gfid, local->loc.inode->gfid);
+
+    local->fop_offset = offset;
+
+    /* Else remains same to truncate code, so from here flow goes
+     * to truncate_stat
+     */
+    STACK_WIND(frame, trash_truncate_stat_cbk, FIRST_CHILD(this),
+               FIRST_CHILD(this)->fops->fstat, fd, xdata);
+out:
+    /* Still set only when the path was not handed over to local->loc */
+    if (pathbuf)
+        GF_FREE(pathbuf);
+    return ret;
+}
+
+/**
+ * mkdir entry point of the trash translator.
+ *
+ * Users must not be able to create the trash directory from the mount, so
+ * a request rejected by check_whether_op_permitted() is answered with
+ * EPERM; every other request is passed on to the child unchanged.
+ *
+ * Always returns 0.
+ */
+int32_t
+trash_mkdir(call_frame_t *frame, xlator_t *this, loc_t *loc, mode_t mode,
+            mode_t umask, dict_t *xdata)
+{
+    trash_private_t *priv = this->private;
+
+    GF_VALIDATE_OR_GOTO("trash", priv, out);
+
+    if (check_whether_op_permitted(priv, loc)) {
+        /* Allowed: hand the request to the child */
+        STACK_WIND(frame, trash_common_mkdir_cbk, FIRST_CHILD(this),
+                   FIRST_CHILD(this)->fops->mkdir, loc, mode, umask, xdata);
+        goto out;
+    }
+
+    gf_log(this->name, GF_LOG_WARNING,
+           "mkdir issued on %s, which is not permitted", priv->newtrash_dir);
+
+    STACK_UNWIND_STRICT(mkdir, frame, -1, EPERM, NULL, NULL, NULL, NULL,
+                        xdata);
+
+out:
+    return 0;
+}
+
+/**
+ * rename entry point of the trash translator.
+ *
+ * Users must not be able to rename the trash directory from the mount, so
+ * a request whose source is rejected by check_whether_op_permitted() is
+ * answered with EPERM; every other request is passed on to the child
+ * unchanged.
+ *
+ * Always returns 0.
+ */
+int
+trash_rename(call_frame_t *frame, xlator_t *this, loc_t *oldloc, loc_t *newloc,
+             dict_t *xdata)
+{
+    trash_private_t *priv = this->private;
+
+    GF_VALIDATE_OR_GOTO("trash", priv, out);
+
+    if (check_whether_op_permitted(priv, oldloc)) {
+        /* Allowed: hand the request to the child */
+        STACK_WIND(frame, trash_common_rename_cbk, FIRST_CHILD(this),
+                   FIRST_CHILD(this)->fops->rename, oldloc, newloc, xdata);
+        goto out;
+    }
+
+    gf_log(this->name, GF_LOG_WARNING,
+           "rename issued on %s, which is not permitted", priv->newtrash_dir);
+
+    STACK_UNWIND_STRICT(rename, frame, -1, EPERM, NULL, NULL, NULL, NULL, NULL,
+                        xdata);
+
+out:
+    return 0;
+}
+
+/**
+ * rmdir entry point of the trash translator.
+ *
+ * Users must not be able to delete the trash directory from the mount, so
+ * a request rejected by check_whether_op_permitted() is answered with
+ * EPERM; every other request is passed on to the child unchanged.
+ *
+ * Always returns 0.
+ */
+int32_t
+trash_rmdir(call_frame_t *frame, xlator_t *this, loc_t *loc, int flags,
+            dict_t *xdata)
+{
+    trash_private_t *priv = this->private;
+
+    GF_VALIDATE_OR_GOTO("trash", priv, out);
+
+    if (check_whether_op_permitted(priv, loc)) {
+        /* Allowed: hand the request to the child */
+        STACK_WIND(frame, trash_common_rmdir_cbk, FIRST_CHILD(this),
+                   FIRST_CHILD(this)->fops->rmdir, loc, flags, xdata);
+        goto out;
+    }
+
+    gf_log(this->name, GF_LOG_WARNING,
+           "rmdir issued on %s, which is not permitted", priv->newtrash_dir);
+
+    STACK_UNWIND_STRICT(rmdir, frame, -1, EPERM, NULL, NULL, xdata);
+
+out:
+    return 0;
+}
+
+/**
+ * Volume set option is handled by the reconfigure function.
+ * Here we check whether each option is set or not; if it is
+ * set, the corresponding modifications are made.
+ *
+ * Options read: trash-internal-op, trash-dir, trash, trash-max-filesize
+ * and trash-eliminate-path. While the translator is enabled the trash
+ * directory is created or renamed to follow the configured name.
+ *
+ * Returns 0 on success, otherwise the result of the last failing step
+ * (ENOMEM on allocation failure).
+ */
+int
+reconfigure(xlator_t *this, dict_t *options)
+{
+    uint64_t max_fsize = 0;
+    int ret = 0;
+    char *tmp = NULL;
+    char *tmp_str = NULL;
+    char *new_dir = NULL;
+    trash_private_t *priv = NULL;
+    char trash_dir[PATH_MAX] = {
+        0,
+    };
+
+    priv = this->private;
+
+    GF_VALIDATE_OR_GOTO("trash", priv, out);
+
+    GF_OPTION_RECONF("trash-internal-op", priv->internal, options, bool, out);
+    GF_OPTION_RECONF("trash-dir", tmp, options, str, out);
+
+    GF_OPTION_RECONF("trash", priv->state, options, bool, out);
+
+    if (priv->state) {
+        ret = create_or_rename_trash_directory(this);
+
+        /* Bounded writes: the option value has no length guarantee. When
+         * neither the option nor an old directory is known, the current
+         * directory is kept so that no rename is attempted. */
+        if (tmp)
+            snprintf(trash_dir, sizeof(trash_dir), "/%s/", tmp);
+        else if (priv->oldtrash_dir)
+            snprintf(trash_dir, sizeof(trash_dir), "%s", priv->oldtrash_dir);
+        else
+            snprintf(trash_dir, sizeof(trash_dir), "%s", priv->newtrash_dir);
+
+        if (strcmp(priv->newtrash_dir, trash_dir) != 0) {
+            /* When user set a new name for trash directory, trash
+             * xlator will perform a rename operation on old trash
+             * directory to the new one using a STACK_WIND from here.
+             * This option can be configured only when volume is in
+             * started state
+             */
+
+            /* Duplicate first so that priv->newtrash_dir stays valid
+             * when the allocation fails. */
+            new_dir = gf_strdup(trash_dir);
+            if (!new_dir) {
+                ret = ENOMEM;
+                gf_log(this->name, GF_LOG_DEBUG, "out of memory");
+                goto out;
+            }
+            GF_FREE(priv->newtrash_dir);
+            priv->newtrash_dir = new_dir;
+
+            gf_log(this->name, GF_LOG_DEBUG,
+                   "Renaming %s -> %s from reconfigure", priv->oldtrash_dir,
+                   priv->newtrash_dir);
+
+            ret = rename_trash_directory(this);
+        }
+
+        if (priv->internal) {
+            ret = create_internalop_directory(this);
+        }
+    }
+    tmp = NULL;
+
+    GF_OPTION_RECONF("trash-max-filesize", max_fsize, options, size_uint64,
+                     out);
+    if (max_fsize) {
+        priv->max_trash_file_size = max_fsize;
+        gf_log(this->name, GF_LOG_DEBUG, "%" GF_PRI_SIZET " max-size",
+               priv->max_trash_file_size);
+    }
+    GF_OPTION_RECONF("trash-eliminate-path", tmp, options, str, out);
+    if (!tmp) {
+        gf_log(this->name, GF_LOG_DEBUG,
+               "no option specified for 'eliminate', using NULL");
+    } else {
+        if (priv->eliminate)
+            wipe_eliminate_path(&priv->eliminate);
+
+        /* Working copy handed to store_eliminate_path(); released at
+         * 'out' in the same way init() releases its copy. */
+        tmp_str = gf_strdup(tmp);
+        if (!tmp_str) {
+            gf_log(this->name, GF_LOG_DEBUG, "out of memory");
+            ret = ENOMEM;
+            goto out;
+        }
+        ret = store_eliminate_path(tmp_str, &priv->eliminate);
+    }
+
+out:
+    if (tmp_str)
+        GF_FREE(tmp_str);
+
+    return ret;
+}
+
+/**
+ * Event handler of the trash translator.
+ *
+ * On GF_EVENT_CHILD_UP, with the translator enabled, the trash directory
+ * is created, or renamed when its configured name differs from the old
+ * one, followed by the directory for internal operations when those are
+ * enabled. Every event is then forwarded to default_notify(), whose
+ * result is returned.
+ */
+int
+notify(xlator_t *this, int event, void *data, ...)
+{
+    trash_private_t *priv = this->private;
+    int ret = 0;
+
+    GF_VALIDATE_OR_GOTO("trash", priv, out);
+
+    /* Only the child (posix) coming up needs any work here */
+    if (event != GF_EVENT_CHILD_UP)
+        goto out;
+
+    if (!priv->state) {
+        gf_log(this->name, GF_LOG_DEBUG, "trash xlator is off");
+        goto out;
+    }
+
+    /* Two cases: without a known old trash directory it has to be created
+     * (or renamed); with one, a rename is needed only when the configured
+     * name differs from it. */
+    if (!priv->oldtrash_dir) {
+        ret = create_or_rename_trash_directory(this);
+    } else if (strcmp(priv->newtrash_dir, priv->oldtrash_dir) != 0) {
+        ret = rename_trash_directory(this);
+    }
+
+    if (!ret && priv->internal)
+        (void)create_internalop_directory(this);
+
+out:
+    ret = default_notify(this, event, data);
+    if (ret)
+        gf_log(this->name, GF_LOG_INFO, "default notify event failed");
+    return ret;
+}
+
+/**
+ * Initializes memory accounting for the trash translator through
+ * xlator_mem_acct_init(), sized with gf_trash_mt_end + 1.
+ *
+ * Returns the result of xlator_mem_acct_init(), or -1 when this is NULL.
+ */
+int32_t
+mem_acct_init(xlator_t *this)
+{
+    int ret = -1;
+
+    GF_VALIDATE_OR_GOTO("trash", this, out);
+
+    ret = xlator_mem_acct_init(this, gf_trash_mt_end + 1);
+    if (ret != 0) {
+        /* one literal: the former split pieces joined without a space */
+        gf_log(this->name, GF_LOG_ERROR, "Memory accounting init failed");
+    }
+out:
+    return ret;
+}
+
+/**
+ * trash_init
+ *
+ * Allocates the private structure of the translator and fills it from the
+ * volume options (trash, trash-dir, trash-eliminate-path,
+ * trash-max-filesize, trash-internal-op, brick-path), creates the memory
+ * pool for trash_local_t and the inode table of the translator.
+ *
+ * Returns 0 on success and a non-zero value on failure, in which case
+ * everything allocated here is released again.
+ */
+int32_t
+init(xlator_t *this)
+{
+    trash_private_t *priv = NULL;
+    int ret = -1;
+    char *tmp = NULL;
+    char *tmp_str = NULL;
+    char trash_dir[PATH_MAX] = {
+        0,
+    };
+    uint64_t max_trash_file_size64 = 0;
+    data_t *data = NULL;
+
+    GF_VALIDATE_OR_GOTO("trash", this, out);
+
+    if (!this->children || this->children->next) {
+        gf_log(this->name, GF_LOG_ERROR,
+               "not configured with exactly one child. exiting");
+        ret = -1;
+        goto out;
+    }
+
+    if (!this->parents) {
+        gf_log(this->name, GF_LOG_WARNING, "dangling volume. check volfile");
+    }
+
+    priv = GF_CALLOC(1, sizeof(*priv), gf_trash_mt_trash_private_t);
+    if (!priv) {
+        gf_log(this->name, GF_LOG_ERROR, "out of memory");
+        ret = ENOMEM;
+        goto out;
+    }
+
+    /* Trash priv data members are initialized through the following
+     * set of statements
+     */
+    GF_OPTION_INIT("trash", priv->state, bool, out);
+
+    GF_OPTION_INIT("trash-dir", tmp, str, out);
+
+    /* We store trash dir value as path for easier manipulation*/
+    if (!tmp) {
+        gf_log(this->name, GF_LOG_INFO,
+               "no option specified for 'trash-dir', "
+               "using \"/.trashcan/\"");
+        priv->newtrash_dir = gf_strdup("/.trashcan/");
+        if (!priv->newtrash_dir) {
+            ret = ENOMEM;
+            gf_log(this->name, GF_LOG_DEBUG, "out of memory");
+            goto out;
+        }
+    } else {
+        /* Bounded write: the option value has no length guarantee */
+        snprintf(trash_dir, sizeof(trash_dir), "/%s/", tmp);
+        priv->newtrash_dir = gf_strdup(trash_dir);
+        if (!priv->newtrash_dir) {
+            ret = ENOMEM;
+            gf_log(this->name, GF_LOG_DEBUG, "out of memory");
+            goto out;
+        }
+    }
+    tmp = NULL;
+
+    GF_OPTION_INIT("trash-eliminate-path", tmp, str, out);
+    if (!tmp) {
+        gf_log(this->name, GF_LOG_INFO,
+               "no option specified for 'eliminate', using NULL");
+    } else {
+        tmp_str = gf_strdup(tmp);
+        if (!tmp_str) {
+            gf_log(this->name, GF_LOG_ERROR, "out of memory");
+            ret = ENOMEM;
+            goto out;
+        }
+        ret = store_eliminate_path(tmp_str, &priv->eliminate);
+        if (ret) {
+            gf_log(this->name, GF_LOG_ERROR,
+                   "failed to store the eliminate paths");
+            goto out;
+        }
+        /* The option macros below jump to 'out' without being given ret,
+         * so it must not stay 0 here: a later failure would otherwise be
+         * reported as success and skip the cleanup. */
+        ret = -1;
+    }
+    tmp = NULL;
+
+    GF_OPTION_INIT("trash-max-filesize", max_trash_file_size64, size_uint64,
+                   out);
+    if (!max_trash_file_size64) {
+        gf_log(this->name, GF_LOG_ERROR,
+               "no option specified for 'max-trashable-file-size', "
+               "using default = %lld MB",
+               GF_DEFAULT_MAX_FILE_SIZE / GF_UNIT_MB);
+        priv->max_trash_file_size = GF_DEFAULT_MAX_FILE_SIZE;
+    } else {
+        priv->max_trash_file_size = max_trash_file_size64;
+        gf_log(this->name, GF_LOG_DEBUG, "%" GF_PRI_SIZET " max-size",
+               priv->max_trash_file_size);
+    }
+
+    GF_OPTION_INIT("trash-internal-op", priv->internal, bool, out);
+
+    this->local_pool = mem_pool_new(trash_local_t, 64);
+    if (!this->local_pool) {
+        gf_log(this->name, GF_LOG_ERROR,
+               "failed to create local_t's memory pool");
+        ret = ENOMEM;
+        goto out;
+    }
+
+    /* For creating directories inside trash with proper permissions,
+     * we need to perform stat on that directories, for this we use
+     * brick path
+     */
+    data = dict_get(this->options, "brick-path");
+    if (!data) {
+        gf_log(this->name, GF_LOG_ERROR,
+               "no option specified for 'brick-path'");
+        /* a missing option is a configuration error, not an OOM */
+        ret = EINVAL;
+        goto out;
+    }
+    priv->brick_path = gf_strdup(data->data);
+    if (!priv->brick_path) {
+        ret = ENOMEM;
+        gf_log(this->name, GF_LOG_DEBUG, "out of memory");
+        goto out;
+    }
+
+    priv->trash_itable = inode_table_new(0, this);
+    if (!priv->trash_itable) {
+        gf_log(this->name, GF_LOG_ERROR, "failed to create the inode table");
+        ret = ENOMEM;
+        goto out;
+    }
+    gf_log(this->name, GF_LOG_DEBUG, "brick path is%s", priv->brick_path);
+
+    this->private = (void *)priv;
+    ret = 0;
+
+out:
+    if (tmp_str)
+        GF_FREE(tmp_str);
+    if (ret) {
+        if (priv) {
+            if (priv->newtrash_dir)
+                GF_FREE(priv->newtrash_dir);
+            if (priv->oldtrash_dir)
+                GF_FREE(priv->oldtrash_dir);
+            if (priv->brick_path)
+                GF_FREE(priv->brick_path);
+            if (priv->eliminate)
+                wipe_eliminate_path(&priv->eliminate);
+            GF_FREE(priv);
+        }
+        /* 'this' is NULL when the very first validation failed */
+        if (this && this->local_pool) {
+            mem_pool_destroy(this->local_pool);
+            this->local_pool = NULL;
+        }
+    }
+    return ret;
+}
+
+/**
+ * trash_fini
+ *
+ * Counterpart of init(): releases the strings, the eliminate-path list
+ * and the inode table held by the private structure, then the private
+ * structure itself and the local memory pool.
+ */
+void
+fini(xlator_t *this)
+{
+    trash_private_t *priv = NULL;
+    inode_table_t *itable = NULL;
+
+    GF_VALIDATE_OR_GOTO("trash", this, out);
+
+    priv = this->private;
+    if (priv != NULL) {
+        itable = priv->trash_itable;
+
+        if (priv->newtrash_dir != NULL) {
+            GF_FREE(priv->newtrash_dir);
+            priv->newtrash_dir = NULL;
+        }
+
+        if (priv->oldtrash_dir != NULL) {
+            GF_FREE(priv->oldtrash_dir);
+            priv->oldtrash_dir = NULL;
+        }
+
+        if (priv->brick_path != NULL) {
+            GF_FREE(priv->brick_path);
+            priv->brick_path = NULL;
+        }
+
+        if (priv->eliminate != NULL) {
+            wipe_eliminate_path(&priv->eliminate);
+            priv->eliminate = NULL;
+        }
+
+        if (itable != NULL) {
+            inode_table_destroy(itable);
+            priv->trash_itable = NULL;
+        }
+
+        GF_FREE(priv);
+    }
+
+    if (this->local_pool != NULL) {
+        mem_pool_destroy(this->local_pool);
+        this->local_pool = NULL;
+    }
+
+    this->private = NULL;
+
+out:
+    return;
+}
+
+/* File operations intercepted by the trash translator; each entry points
+ * at the trash_* handler defined in this file. */
+struct xlator_fops fops = {
+    .unlink = trash_unlink,
+    .truncate = trash_truncate,
+    .ftruncate = trash_ftruncate,
+    .rmdir = trash_rmdir,
+    .mkdir = trash_mkdir,
+    .rename = trash_rename,
 };
 
+/* No callbacks are registered by this translator. */
+struct xlator_cbks cbks = {};
+
+/* Volume options understood by the trash translator. They are read in
+ * init() and, except for brick-path, again in reconfigure(). */
 struct volume_options options[] = {
-	{ .key  = { "trash-dir" }, 
-	  .type = GF_OPTION_TYPE_PATH 
-	},
-	{ .key  = {NULL} },
+    {
+        .key = {"trash"},
+        .type = GF_OPTION_TYPE_BOOL,
+        .default_value = "off",
+        .description = "Enable/disable trash translator",
+        .op_version = {GD_OP_VERSION_3_7_0},
+        .flags = OPT_FLAG_SETTABLE | OPT_FLAG_DOC,
+        .tags = {"backup"},
+    },
+    {
+        /* init()/reconfigure() store the value wrapped as "/<value>/" */
+        .key = {"trash-dir"},
+        .type = GF_OPTION_TYPE_STR,
+        .default_value = ".trashcan",
+        .description = "Directory for trash files",
+        .op_version = {GD_OP_VERSION_3_7_0},
+        .flags = OPT_FLAG_SETTABLE | OPT_FLAG_DOC,
+        .tags = {"backup"},
+    },
+    {
+        .key = {"trash-eliminate-path"},
+        .type = GF_OPTION_TYPE_STR,
+        .description = "Eliminate paths to be excluded "
+                       "from trashing",
+        .op_version = {GD_OP_VERSION_3_7_0},
+        .flags = OPT_FLAG_SETTABLE | OPT_FLAG_DOC,
+        .tags = {"backup"},
+    },
+    {
+        /* NOTE(review): init() falls back to GF_DEFAULT_MAX_FILE_SIZE when
+         * the parsed value is 0 — confirm that fallback agrees with the
+         * default declared here. */
+        .key = {"trash-max-filesize"},
+        .type = GF_OPTION_TYPE_SIZET,
+        .default_value = "5MB",
+        .description = "Maximum size of file that can be "
+                       "moved to trash",
+        .op_version = {GD_OP_VERSION_3_7_0},
+        .flags = OPT_FLAG_SETTABLE | OPT_FLAG_DOC,
+        .tags = {"backup"},
+    },
+    {
+        .key = {"trash-internal-op"},
+        .type = GF_OPTION_TYPE_BOOL,
+        .default_value = "off",
+        .description = "Enable/disable trash translator for "
+                       "internal operations",
+        .op_version = {GD_OP_VERSION_3_7_0},
+        .flags = OPT_FLAG_SETTABLE | OPT_FLAG_DOC,
+        .tags = {"backup"},
+    },
+    /* Read in init() through dict_get() to build paths on the brick */
+    {.key = {"brick-path"},
+     .type = GF_OPTION_TYPE_PATH,
+     .default_value = "{{ brick.path }}"},
+    /* terminating entry */
+    {.key = {NULL}},
+};
+
+/* Descriptor of the translator: ties together the entry points, fop table,
+ * callback table and option table defined in this file. */
+xlator_api_t xlator_api = {
+    .init = init,
+    .fini = fini,
+    .notify = notify,
+    .reconfigure = reconfigure,
+    .mem_acct_init = mem_acct_init,
+    .op_version = {1}, /* Present from the initial version */
+    .fops = &fops,
+    .cbks = &cbks,
+    .options = options,
+    .identifier = "trash",
+    .category = GF_TECH_PREVIEW,
 };
diff --git a/xlators/features/trash/src/trash.h b/xlators/features/trash/src/trash.h
new file mode 100644
index 00000000000..6671617c2c6
--- /dev/null
+++ b/xlators/features/trash/src/trash.h
@@ -0,0 +1,97 @@
+/*
+   Copyright (c) 2006-2012 Red Hat, Inc. <http://www.redhat.com>
+   This file is part of GlusterFS.
+
+   This file is licensed to you under your choice of the GNU Lesser
+   General Public License, version 3 or any later version (LGPLv3 or
+   later), or the GNU General Public License, version 2 (GPLv2), in all
+   cases as published by the Free Software Foundation.
+*/
+#ifndef __TRASH_H__
+#define __TRASH_H__
+
+#include <glusterfs/glusterfs.h>
+#include <glusterfs/logging.h>
+#include <glusterfs/dict.h>
+#include <glusterfs/xlator.h>
+#include <glusterfs/defaults.h>
+#include "inode.c"
+#include "fnmatch.h"
+
+#include <libgen.h>
+
+/* 128 KB block size; only defined here if nothing defined it earlier.
+ * NOTE(review): presumably the chunk size used when copying file data into
+ * the trash directory -- confirm against trash.c. */
+#ifndef GF_BLOCK_READV_SIZE
+#define GF_BLOCK_READV_SIZE (128 * GF_UNIT_KB)
+#endif
+
+/* 200 MB fallback for the maximum file size; only defined here if nothing
+ * defined it earlier.
+ * NOTE(review): the "trash-max-filesize" option defaults to "5MB" -- confirm
+ * where this constant is still used. */
+#ifndef GF_DEFAULT_MAX_FILE_SIZE
+#define GF_DEFAULT_MAX_FILE_SIZE (200 * GF_UNIT_MB)
+#endif
+
+/* Per-call state of the trash translator. TRASH_STACK_UNWIND reads
+ * frame->local back as a trash_local_t and wipes it after unwinding. */
+struct trash_struct {
+    fd_t *fd;         /* for the fd of existing file */
+    fd_t *newfd;      /* for the newly created file */
+    loc_t loc;        /* to store the location of the existing file */
+    loc_t newloc;     /* to store the location for the new file */
+    size_t fsize;     /* for keeping the size of existing file */
+    off_t cur_offset; /* current offset for read and write ops */
+    off_t fop_offset; /* original offset received with the fop */
+    pid_t pid;        /* frame->root->pid as saved by TRASH_SET_PID */
+    /* NOTE(review): presumably the path of the file being trashed and its
+     * destination inside the trash directory -- confirm in trash.c */
+    char origpath[PATH_MAX];
+    char newpath[PATH_MAX];
+    int32_t loop_count; /* NOTE(review): purpose not visible here -- confirm */
+    gf_boolean_t is_set_pid; /* true between TRASH_SET_PID and TRASH_UNSET_PID */
+    /* NOTE(review): presumably parent attributes before/after the operation */
+    struct iatt preparent;
+    struct iatt postparent;
+    gf_boolean_t ctr_link_count_req; /* NOTE(review): set elsewhere -- confirm */
+};
+typedef struct trash_struct trash_local_t;
+
+/* Node of a singly linked list of path strings.
+ * NOTE(review): presumably filled from the "trash-eliminate-path" option --
+ * confirm in trash.c. */
+struct _trash_elim_path {
+    struct _trash_elim_path *next; /* next node in the list */
+    char *path;                    /* path string held by this node */
+};
+typedef struct _trash_elim_path trash_elim_path;
+
+/* Private (per-xlator) state of the trash translator.
+ * NOTE(review): the field meanings below are inferred from the names and the
+ * option table (trash-dir, brick-path, trash-eliminate-path,
+ * trash-max-filesize, trash-internal-op) -- confirm against trash.c. */
+struct trash_priv {
+    char *oldtrash_dir;         /* presumably the previous trash directory */
+    char *newtrash_dir;         /* presumably the current trash directory */
+    char *brick_path;           /* presumably the "brick-path" option value */
+    trash_elim_path *eliminate; /* head of a trash_elim_path list */
+    size_t max_trash_file_size; /* presumably the "trash-max-filesize" limit */
+    gf_boolean_t state;         /* presumably whether trash is enabled */
+    gf_boolean_t internal;      /* presumably the "trash-internal-op" setting */
+    inode_t *trash_inode;
+    inode_table_t *trash_itable;
+};
+typedef struct trash_priv trash_private_t;
+
+/*
+ * Save the frame's current pid in local->pid and replace it with
+ * GF_SERVER_PID_TRASH. Asserts (and otherwise silently skips) if the pid was
+ * already replaced. Macro arguments are parenthesized so that arbitrary
+ * pointer expressions can be passed safely.
+ */
+#define TRASH_SET_PID(frame, local)                                            \
+    do {                                                                       \
+        GF_ASSERT(!(local)->is_set_pid);                                       \
+        if (!(local)->is_set_pid) {                                            \
+            (local)->pid = (frame)->root->pid;                                 \
+            (frame)->root->pid = GF_SERVER_PID_TRASH;                          \
+            (local)->is_set_pid = _gf_true;                                    \
+        }                                                                      \
+    } while (0)
+
+/*
+ * Restore the pid saved in local->pid into the frame and clear the
+ * is_set_pid marker. Asserts (and otherwise silently skips) if no pid was
+ * saved. Macro arguments are parenthesized so that arbitrary pointer
+ * expressions can be passed safely.
+ */
+#define TRASH_UNSET_PID(frame, local)                                          \
+    do {                                                                       \
+        GF_ASSERT((local)->is_set_pid);                                        \
+        if ((local)->is_set_pid) {                                             \
+            (frame)->root->pid = (local)->pid;                                 \
+            (local)->is_set_pid = _gf_false;                                   \
+        }                                                                      \
+    } while (0)
+
+/*
+ * Unwind the stack for fop `op`. frame->local is detached first so the
+ * unwind cannot touch it, and the detached trash_local_t is wiped once the
+ * unwind has been issued. `frame` is parenthesized where it is dereferenced
+ * so that a pointer expression can be passed safely.
+ */
+#define TRASH_STACK_UNWIND(op, frame, params...)                               \
+    do {                                                                       \
+        trash_local_t *__local = NULL;                                         \
+        __local = (frame)->local;                                              \
+        (frame)->local = NULL;                                                 \
+        STACK_UNWIND_STRICT(op, frame, params);                                \
+        trash_local_wipe(__local);                                             \
+    } while (0)
+
+#endif /* __TRASH_H__ */
diff --git a/xlators/features/upcall/Makefile.am b/xlators/features/upcall/Makefile.am
new file mode 100644
index 00000000000..a985f42a877
--- /dev/null
+++ b/xlators/features/upcall/Makefile.am
@@ -0,0 +1,3 @@
+# The only subdirectory to recurse into is src/.
+SUBDIRS = src
+
+CLEANFILES =
diff --git a/xlators/features/upcall/src/Makefile.am b/xlators/features/upcall/src/Makefile.am
new file mode 100644
index 00000000000..72b7f55ae0a
--- /dev/null
+++ b/xlators/features/upcall/src/Makefile.am
@@ -0,0 +1,23 @@
+# upcall.la is only built when WITH_SERVER is set.
+if WITH_SERVER
+xlator_LTLIBRARIES = upcall.la
+endif
+# Install location: the "features" xlator directory of this package version.
+xlatordir = $(libdir)/glusterfs/$(PACKAGE_VERSION)/xlator/features
+
+upcall_la_LDFLAGS = -module $(GF_XLATOR_DEFAULT_LDFLAGS)
+
+upcall_la_SOURCES = upcall.c upcall-internal.c
+
+# Link against libglusterfs plus the rpc and xdr libraries.
+upcall_la_LIBADD = $(top_builddir)/libglusterfs/src/libglusterfs.la \
+	$(top_builddir)/rpc/rpc-lib/src/libgfrpc.la \
+	$(top_builddir)/rpc/xdr/src/libgfxdr.la
+
+# Headers listed here are not installed.
+noinst_HEADERS = upcall.h upcall-mem-types.h upcall-messages.h \
+	upcall-cache-invalidation.h
+
+# The xdr include path is given for both the source and the build tree.
+AM_CPPFLAGS = $(GF_CPPFLAGS) -I$(top_srcdir)/libglusterfs/src \
+	-I$(top_srcdir)/rpc/rpc-lib/src \
+	-I$(top_srcdir)/rpc/xdr/src -I$(top_builddir)/rpc/xdr/src
+
+AM_CFLAGS = -Wall -fno-strict-aliasing $(GF_CFLAGS)
+
+CLEANFILES =
diff --git a/xlators/features/upcall/src/upcall-cache-invalidation.h b/xlators/features/upcall/src/upcall-cache-invalidation.h
new file mode 100644
index 00000000000..db649b2c9a6
--- /dev/null
+++ b/xlators/features/upcall/src/upcall-cache-invalidation.h
@@ -0,0 +1,18 @@
+/*
+   Copyright (c) 2015 Red Hat, Inc. <http://www.redhat.com>
+   This file is part of GlusterFS.
+
+   This file is licensed to you under your choice of the GNU Lesser
+   General Public License, version 3 or any later version (LGPLv3 or
+   later), or the GNU General Public License, version 2 (GPLv2), in all
+   cases as published by the Free Software Foundation.
+*/
+
+#ifndef __UPCALL_CACHE_INVALIDATION_H__
+#define __UPCALL_CACHE_INVALIDATION_H__
+
+/* The time period for which a client will be notified of cache_invalidation
+ * events post its last access.
+ * The value is a string literal, not a number.
+ * NOTE(review): presumably an option default expressed in seconds -- confirm
+ * against the upcall option table. */
+#define CACHE_INVALIDATION_TIMEOUT "60"
+
+#endif /* __UPCALL_CACHE_INVALIDATION_H__ */
diff --git a/xlators/features/upcall/src/upcall-internal.c b/xlators/features/upcall/src/upcall-internal.c
new file mode 100644
index 00000000000..c641bd6f432
--- /dev/null
+++ b/xlators/features/upcall/src/upcall-internal.c
@@ -0,0 +1,689 @@
+/*
+   Copyright (c) 2015 Red Hat, Inc. <http://www.redhat.com>
+   This file is part of GlusterFS.
+
+   This file is licensed to you under your choice of the GNU Lesser
+   General Public License, version 3 or any later version (LGPLv3 or
+   later), or the GNU General Public License, version 2 (GPLv2), in all
+   cases as published by the Free Software Foundation.
+*/
+
+#include <unistd.h>
+#include <fcntl.h>
+#include <limits.h>
+
+#include <glusterfs/glusterfs.h>
+#include <glusterfs/compat.h>
+#include <glusterfs/xlator.h>
+#include <glusterfs/logging.h>
+#include <glusterfs/common-utils.h>
+
+#include <glusterfs/statedump.h>
+#include <glusterfs/syncop.h>
+
+#include "upcall.h"
+#include "upcall-mem-types.h"
+#include "glusterfs3-xdr.h"
+#include "protocol-common.h"
+#include <glusterfs/defaults.h>
+
+/*
+ * Report whether any upcall feature is switched on for this xlator.
+ * The only feature consulted is cache_invalidation. An xlator without
+ * private data is reported as disabled.
+ */
+gf_boolean_t
+is_upcall_enabled(xlator_t *this)
+{
+    upcall_private_t *conf = (upcall_private_t *)this->private;
+
+    if (!conf)
+        return _gf_false;
+
+    return conf->cache_invalidation_enabled;
+}
+
+/*
+ * Return the configured cache_invalidation_timeout, or 0 when the xlator
+ * has no private data.
+ */
+static int32_t
+get_cache_invalidation_timeout(xlator_t *this)
+{
+    upcall_private_t *conf = (upcall_private_t *)this->private;
+
+    if (!conf)
+        return 0;
+
+    return conf->cache_invalidation_timeout;
+}
+
+/*
+ * Allocate a client entry for @client, stamp it with @now and the current
+ * cache-invalidation timeout of frame->this, and append it to
+ * up_inode_ctx->client_list.
+ *
+ * No lock is taken here; the caller in this file invokes it with
+ * up_inode_ctx->client_list_lock held.
+ *
+ * Returns the new entry, or NULL when either allocation fails. In that case
+ * nothing is linked into the list.
+ */
+static upcall_client_t *
+__add_upcall_client(call_frame_t *frame, client_t *client,
+                    upcall_inode_ctx_t *up_inode_ctx, time_t now)
+{
+    upcall_client_t *up_client_entry = GF_MALLOC(
+        sizeof(*up_client_entry), gf_upcall_mt_upcall_client_entry_t);
+    if (!up_client_entry) {
+        gf_msg("upcall", GF_LOG_WARNING, 0, UPCALL_MSG_NO_MEMORY,
+               "Memory allocation failed");
+        return NULL;
+    }
+    INIT_LIST_HEAD(&up_client_entry->client_list);
+    up_client_entry->client_uid = gf_strdup(client->client_uid);
+    if (!up_client_entry->client_uid) {
+        /* An entry with a NULL uid must never reach the list: list walkers
+         * pass client_uid straight to strcmp(). */
+        gf_msg("upcall", GF_LOG_WARNING, 0, UPCALL_MSG_NO_MEMORY,
+               "Memory allocation failed");
+        GF_FREE(up_client_entry);
+        return NULL;
+    }
+    up_client_entry->access_time = now;
+    up_client_entry->expire_time_attr = get_cache_invalidation_timeout(
+        frame->this);
+
+    list_add_tail(&up_client_entry->client_list, &up_inode_ctx->client_list);
+
+    gf_log(THIS->name, GF_LOG_DEBUG, "upcall_entry_t client added - %s",
+           up_client_entry->client_uid);
+
+    return up_client_entry;
+}
+
+/*
+ * Attach a freshly initialised upcall_inode_ctx_t to @inode, unless one is
+ * already stored, and link it into priv->inode_ctx_list.
+ *
+ * In this file it is reached through __upcall_inode_ctx_get(), which
+ * upcall_inode_ctx_get() calls with inode->lock held.
+ *
+ * Returns 0 when a ctx already exists or was stored, -ENOMEM when the
+ * allocation fails, otherwise the error returned by __inode_ctx_set().
+ */
+static int
+__upcall_inode_ctx_set(inode_t *inode, xlator_t *this)
+{
+    upcall_inode_ctx_t *inode_ctx = NULL;
+    upcall_private_t *priv = NULL;
+    int ret = -1;
+    uint64_t ctx = 0;
+
+    priv = this->private;
+    GF_ASSERT(priv);
+
+    ret = __inode_ctx_get(inode, this, &ctx);
+
+    /* a ctx is already attached to this inode: nothing to do */
+    if (!ret)
+        goto out;
+
+    inode_ctx = GF_MALLOC(sizeof(upcall_inode_ctx_t),
+                          gf_upcall_mt_upcall_inode_ctx_t);
+
+    if (!inode_ctx) {
+        ret = -ENOMEM;
+        goto out;
+    }
+
+    pthread_mutex_init(&inode_ctx->client_list_lock, NULL);
+    INIT_LIST_HEAD(&inode_ctx->inode_ctx_list);
+    INIT_LIST_HEAD(&inode_ctx->client_list);
+    inode_ctx->destroy = 0;
+    gf_uuid_copy(inode_ctx->gfid, inode->gfid);
+
+    ctx = (long)inode_ctx;
+    ret = __inode_ctx_set(inode, this, &ctx);
+    if (ret) {
+        gf_log(this->name, GF_LOG_DEBUG, "failed to set inode ctx (%p)", inode);
+        /* destroy the mutex before the memory that holds it is released */
+        pthread_mutex_destroy(&inode_ctx->client_list_lock);
+        GF_FREE(inode_ctx);
+        goto out;
+    }
+
+    /* add this inode_ctx to the global list */
+    LOCK(&priv->inode_ctx_lk);
+    {
+        list_add_tail(&inode_ctx->inode_ctx_list, &priv->inode_ctx_list);
+    }
+    UNLOCK(&priv->inode_ctx_lk);
+out:
+    return ret;
+}
+
+/*
+ * Fetch the upcall ctx stored on @inode, creating and attaching one first
+ * if none is present. Returns NULL when the ctx can neither be found nor
+ * created. upcall_inode_ctx_get() calls this with inode->lock held.
+ */
+static upcall_inode_ctx_t *
+__upcall_inode_ctx_get(inode_t *inode, xlator_t *this)
+{
+    uint64_t raw_ctx = 0;
+
+    if (__inode_ctx_get(inode, this, &raw_ctx) < 0) {
+        /* nothing stored yet: create the ctx, then read it back */
+        if (__upcall_inode_ctx_set(inode, this) < 0)
+            return NULL;
+
+        if (__inode_ctx_get(inode, this, &raw_ctx) < 0)
+            return NULL;
+    }
+
+    return (upcall_inode_ctx_t *)(long)(raw_ctx);
+}
+
+/*
+ * Locked wrapper around __upcall_inode_ctx_get(): takes inode->lock for the
+ * duration of the lookup (and possible creation) of the upcall ctx.
+ * Returns the ctx, or NULL on failure.
+ */
+upcall_inode_ctx_t *
+upcall_inode_ctx_get(inode_t *inode, xlator_t *this)
+{
+    upcall_inode_ctx_t *found = NULL;
+
+    LOCK(&inode->lock);
+    found = __upcall_inode_ctx_get(inode, this);
+    UNLOCK(&inode->lock);
+
+    return found;
+}
+
+/*
+ * Unlink @up_client from the client list it sits on and release both its
+ * client_uid string and the entry itself. Always returns 0.
+ */
+static int
+__upcall_cleanup_client_entry(upcall_client_t *up_client)
+{
+    /* the uid string is owned by the entry, so it goes first */
+    GF_FREE(up_client->client_uid);
+
+    list_del_init(&up_client->client_list);
+    GF_FREE(up_client);
+
+    return 0;
+}
+
+/*
+ * Drop every client entry of @up_inode_ctx whose last access is older than
+ * twice the cache-invalidation timeout, measured against @now.
+ *
+ * The walk runs under client_list_lock. The lock is always released before
+ * returning, including when an entry cleanup reports failure.
+ *
+ * Returns 0 on success, or the non-zero result of the failing entry cleanup.
+ */
+static int
+upcall_cleanup_expired_clients(xlator_t *this, upcall_inode_ctx_t *up_inode_ctx,
+                               time_t now)
+{
+    upcall_client_t *up_client = NULL;
+    upcall_client_t *tmp = NULL;
+    int ret = 0;
+    time_t timeout = 0;
+    time_t t_expired = 0;
+
+    timeout = get_cache_invalidation_timeout(this);
+
+    pthread_mutex_lock(&up_inode_ctx->client_list_lock);
+    {
+        list_for_each_entry_safe(up_client, tmp, &up_inode_ctx->client_list,
+                                 client_list)
+        {
+            t_expired = now - up_client->access_time;
+
+            if (t_expired <= (2 * timeout))
+                continue;
+
+            gf_log(THIS->name, GF_LOG_TRACE, "Cleaning up client_entry(%s)",
+                   up_client->client_uid);
+
+            ret = __upcall_cleanup_client_entry(up_client);
+
+            if (ret) {
+                gf_msg("upcall", GF_LOG_WARNING, 0,
+                       UPCALL_MSG_INTERNAL_ERROR,
+                       "Client entry cleanup failed (%p)", up_client);
+                /* leave the loop rather than jumping past the unlock */
+                break;
+            }
+        }
+    }
+    pthread_mutex_unlock(&up_inode_ctx->client_list_lock);
+
+    return ret;
+}
+
+/*
+ * Release every client entry linked on inode_ctx->client_list.
+ * No lock is taken here; the caller in this file holds client_list_lock.
+ * Always returns 0.
+ */
+int
+__upcall_cleanup_inode_ctx_client_list(upcall_inode_ctx_t *inode_ctx)
+{
+    upcall_client_t *entry = NULL;
+    upcall_client_t *next = NULL;
+
+    list_for_each_entry_safe(entry, next, &inode_ctx->client_list, client_list)
+    {
+        __upcall_cleanup_client_entry(entry);
+    }
+
+    return 0;
+}
+
+static void
+upcall_cache_forget(xlator_t *this, inode_t *inode,
+                    upcall_inode_ctx_t *up_inode_ctx);
+
+/*
+ * Detach the upcall ctx from @inode and retire it: all tracked clients are
+ * sent a forget notification, the client list is emptied, and the ctx is
+ * flagged for destruction. The ctx memory itself is not freed here; the
+ * reaper thread frees contexts that carry the destroy flag.
+ *
+ * Returns the result of inode_ctx_del() (negative on failure).
+ */
+int
+upcall_cleanup_inode_ctx(xlator_t *this, inode_t *inode)
+{
+    upcall_private_t *priv = this->private;
+    upcall_inode_ctx_t *inode_ctx = NULL;
+    uint64_t ctx = 0;
+    int ret = 0;
+
+    GF_ASSERT(priv);
+
+    ret = inode_ctx_del(inode, this, &ctx);
+    if (ret < 0) {
+        gf_msg("upcall", GF_LOG_WARNING, 0, UPCALL_MSG_INTERNAL_ERROR,
+               "Failed to del upcall_inode_ctx (%p)", inode);
+        return ret;
+    }
+
+    inode_ctx = (upcall_inode_ctx_t *)(long)ctx;
+    if (!inode_ctx)
+        return ret;
+
+    /* Invalidate all the upcall cache entries */
+    upcall_cache_forget(this, inode, inode_ctx);
+
+    /* The lock is required: the reaper thread may be cleaning up the
+     * client entries of this ctx at the same time.
+     */
+    pthread_mutex_lock(&inode_ctx->client_list_lock);
+    if (!list_empty(&inode_ctx->client_list))
+        __upcall_cleanup_inode_ctx_client_list(inode_ctx);
+    pthread_mutex_unlock(&inode_ctx->client_list_lock);
+
+    /* Mark the inode_ctx to be destroyed */
+    inode_ctx->destroy = 1;
+    gf_msg_debug("upcall", 0, "set upcall_inode_ctx (%p) to destroy mode",
+                 inode_ctx);
+
+    return ret;
+}
+
+/*
+ * Reaper thread body; @data is the upcall xlator.
+ *
+ * Until priv->fini is set it repeatedly traverses the list of
+ * upcall_inode_ctx(s), cleans up the expired client entries of each ctx and
+ * frees every ctx which has its destroy bit set. Between passes it sleeps
+ * for half the cache-invalidation timeout, but never less than one second,
+ * so a timeout below 2 cannot turn the loop into a busy spin.
+ *
+ * NOTE(review): the list is walked without priv->inode_ctx_lk, while
+ * __upcall_inode_ctx_set() appends to it under that lock -- confirm this
+ * traversal is safe.
+ *
+ * Always returns NULL.
+ */
+void *
+upcall_reaper_thread(void *data)
+{
+    upcall_private_t *priv = NULL;
+    upcall_inode_ctx_t *inode_ctx = NULL;
+    upcall_inode_ctx_t *tmp = NULL;
+    xlator_t *this = NULL;
+    time_t timeout = 0;
+    time_t time_now;
+    unsigned int nap = 0;
+
+    this = (xlator_t *)data;
+    GF_ASSERT(this);
+
+    priv = this->private;
+    GF_ASSERT(priv);
+
+    time_now = gf_time();
+    while (!priv->fini) {
+        list_for_each_entry_safe(inode_ctx, tmp, &priv->inode_ctx_list,
+                                 inode_ctx_list)
+        {
+            /* cleanup expired clients */
+            upcall_cleanup_expired_clients(this, inode_ctx, time_now);
+
+            if (!inode_ctx->destroy) {
+                continue;
+            }
+
+            /* client list would have been cleaned up */
+            gf_msg_debug("upcall", 0, "Freeing upcall_inode_ctx (%p)",
+                         inode_ctx);
+            LOCK(&priv->inode_ctx_lk);
+            {
+                list_del_init(&inode_ctx->inode_ctx_list);
+                pthread_mutex_destroy(&inode_ctx->client_list_lock);
+            }
+            UNLOCK(&priv->inode_ctx_lk);
+            GF_FREE(inode_ctx);
+            inode_ctx = NULL;
+        }
+
+        /* don't do a very busy loop: sleep(0) would return immediately */
+        timeout = get_cache_invalidation_timeout(this);
+        nap = (timeout >= 2) ? (unsigned int)(timeout / 2) : 1;
+        sleep(nap);
+        time_now = gf_time();
+    }
+
+    return NULL;
+}
+
+/*
+ * Start the upcall reaper thread (named "upreaper"), storing its handle in
+ * priv->reaper_thr. Returns whatever gf_thread_create() returns.
+ */
+int
+upcall_reaper_thread_init(xlator_t *this)
+{
+    upcall_private_t *priv = this->private;
+
+    GF_ASSERT(priv);
+
+    return gf_thread_create(&priv->reaper_thr, NULL, upcall_reaper_thread,
+                            this, "upreaper");
+}
+
+/*
+ * Per-key comparison callback: @tmp is a second dict to compare against.
+ * Returns -1 when @k carries the AFR xattr prefix and its value @v differs
+ * from the value stored under the same key in @tmp; returns 0 otherwise.
+ */
+int
+up_compare_afr_xattr(dict_t *d, char *k, data_t *v, void *tmp)
+{
+    dict_t *other = tmp;
+
+    /* keys outside the AFR namespace never count as a difference */
+    if (strncmp(k, AFR_XATTR_PREFIX, SLEN(AFR_XATTR_PREFIX)) != 0)
+        return 0;
+
+    if (is_data_equal(v, dict_get(other, k)))
+        return 0;
+
+    return -1;
+}
+
+/*
+ * Remove @xattr from @xattrs when it is an AFR pending xattr (its key
+ * carries AFR_XATTR_PREFIX) and its value is entirely zero-filled.
+ *
+ * Ideally this would run only for xattrop and not for setxattr and
+ * removexattr, but set and remove xattr fops do not come with keys
+ * prefixed by AFR_XATTR_PREFIX.
+ */
+static void
+up_filter_afr_xattr(dict_t *xattrs, char *xattr, data_t *v)
+{
+    if (strncmp(xattr, AFR_XATTR_PREFIX, SLEN(AFR_XATTR_PREFIX)) != 0)
+        return;
+
+    if (mem_0filled(v->data, v->len) != 0)
+        return;
+
+    dict_del(xattrs, xattr);
+}
+
+/*
+ * Match callback: treats the registered key @regd_xattr as an fnmatch()
+ * pattern and tests the key passed through @xattr against it.
+ * Returns _gf_true on a match, _gf_false otherwise.
+ */
+static gf_boolean_t
+up_key_is_regd_xattr(dict_t *regd_xattrs, char *regd_xattr, data_t *v,
+                     void *xattr)
+{
+    char *candidate = xattr;
+
+    if (fnmatch(regd_xattr, candidate, 0) != 0)
+        return _gf_false;
+
+    return _gf_true;
+}
+
+/*
+ * Per-key callback used while filtering @xattrs against the registered
+ * patterns in @regd_xattrs. A key that matches no registered pattern is
+ * deleted, so no notification is sent for its change. Any other key is
+ * passed on to the AFR zero-value filter. Always returns 0.
+ */
+int
+up_filter_unregd_xattr(dict_t *xattrs, char *xattr, data_t *v,
+                       void *regd_xattrs)
+{
+    int matched = dict_foreach_match(regd_xattrs, up_key_is_regd_xattr, xattr,
+                                     dict_null_foreach_fn, NULL);
+
+    if (matched == 0) {
+        /* not registered by anyone: drop it from the set to notify */
+        dict_del(xattrs, xattr);
+    } else {
+        up_filter_afr_xattr(xattrs, xattr, v);
+    }
+
+    return 0;
+}
+
+/*
+ * Run up_filter_unregd_xattr() over every key of @xattr, using
+ * @regd_xattrs as the set of registered patterns.
+ * Returns the result of dict_foreach().
+ */
+int
+up_filter_xattr(dict_t *xattr, dict_t *regd_xattrs)
+{
+    return dict_foreach(xattr, up_filter_unregd_xattr, regd_xattrs);
+}
+
+static void
+upcall_client_cache_invalidate(xlator_t *this, uuid_t gfid,
+                               upcall_client_t *up_client_entry, uint32_t flags,
+                               struct iatt *stbuf, struct iatt *p_stbuf,
+                               struct iatt *oldp_stbuf, dict_t *xattr,
+                               time_t now);
+
+/*
+ * Tell whether an invalidation is worth sending for @xattrs: it is needed
+ * as long as the dict still holds at least one key. An empty dict is
+ * traced and reported as "nothing to invalidate".
+ */
+gf_boolean_t
+up_invalidate_needed(dict_t *xattrs)
+{
+    if (dict_key_count(xattrs) != 0)
+        return _gf_true;
+
+    gf_msg_trace("upcall", 0,
+                 "None of xattrs requested for"
+                 " invalidation, were changed. Nothing to "
+                 "invalidate");
+    return _gf_false;
+}
+
+/*
+ * Record that @client accessed @inode and notify the other clients tracked
+ * for that inode.
+ *
+ * The inode's upcall ctx is looked up first. When @inode is still IA_INVAL,
+ * an already linked inode with the gfid from @stbuf is preferred. The
+ * client list of the ctx is then walked under client_list_lock:
+ *   - the entry whose client_uid equals client->client_uid only gets its
+ *     access_time refreshed;
+ *   - every other entry is handed to upcall_client_cache_invalidate(),
+ *     unless @flags carries nothing besides UP_ATIME;
+ *   - if no entry matched this client, a new one is added.
+ *
+ * Nothing is done when upcall is disabled or @client is NULL.
+ *
+ * Since sending notifications for cache_invalidation is a best effort,
+ * any errors during the process are logged and ignored.
+ */
+void
+upcall_cache_invalidate(call_frame_t *frame, xlator_t *this, client_t *client,
+                        inode_t *inode, uint32_t flags, struct iatt *stbuf,
+                        struct iatt *p_stbuf, struct iatt *oldp_stbuf,
+                        dict_t *xattr)
+{
+    upcall_client_t *up_client_entry = NULL;
+    upcall_client_t *tmp = NULL;
+    upcall_inode_ctx_t *up_inode_ctx = NULL;
+    gf_boolean_t found = _gf_false;
+    time_t time_now;
+    inode_t *linked_inode = NULL;
+
+    if (!is_upcall_enabled(this))
+        return;
+
+    /* server-side generated fops like quota/marker will not have any
+     * client associated with them. Ignore such fops.
+     */
+    if (!client) {
+        gf_msg_debug("upcall", 0, "Internal fop - client NULL");
+        return;
+    }
+
+    /* For nameless LOOKUPs, inode created shall always be
+     * invalid. Hence check if there is any already linked inode.
+     * If yes, update the inode_ctx of that valid inode
+     */
+    if (inode && (inode->ia_type == IA_INVAL) && stbuf) {
+        linked_inode = inode_find(inode->table, stbuf->ia_gfid);
+        if (linked_inode) {
+            gf_log("upcall", GF_LOG_DEBUG,
+                   "upcall_inode_ctx_get of linked inode (%p)", linked_inode);
+            up_inode_ctx = upcall_inode_ctx_get(linked_inode, this);
+        }
+    }
+
+    if (inode && !up_inode_ctx)
+        up_inode_ctx = upcall_inode_ctx_get(inode, this);
+
+    if (!up_inode_ctx) {
+        gf_msg("upcall", GF_LOG_WARNING, 0, UPCALL_MSG_INTERNAL_ERROR,
+               "upcall_inode_ctx_get failed (%p)", inode);
+        /* linked_inode may hold a ref from inode_find(): release it */
+        goto out;
+    }
+
+    /* In case of LOOKUP, if first time, inode created shall be
+     * invalid till it gets linked to inode table. Read gfid from
+     * the stat returned in such cases.
+     */
+    if (gf_uuid_is_null(up_inode_ctx->gfid) && stbuf) {
+        /* That means inode must have been invalid when this inode_ctx
+         * is created. Copy the gfid value from stbuf instead.
+         */
+        gf_uuid_copy(up_inode_ctx->gfid, stbuf->ia_gfid);
+    }
+
+    if (gf_uuid_is_null(up_inode_ctx->gfid)) {
+        gf_msg_debug(this->name, 0,
+                     "up_inode_ctx->gfid and "
+                     "stbuf->ia_gfid is NULL, fop:%s",
+                     gf_fop_list[frame->root->op]);
+        goto out;
+    }
+
+    time_now = gf_time();
+    pthread_mutex_lock(&up_inode_ctx->client_list_lock);
+    {
+        list_for_each_entry_safe(up_client_entry, tmp,
+                                 &up_inode_ctx->client_list, client_list)
+        {
+            /* Do not send UPCALL event if same client. */
+            if (!strcmp(client->client_uid, up_client_entry->client_uid)) {
+                up_client_entry->access_time = time_now;
+                found = _gf_true;
+                continue;
+            }
+
+            /*
+             * Ignore sending notifications in case of only UP_ATIME
+             */
+            if (!(flags & ~(UP_ATIME))) {
+                if (found)
+                    break;
+                else /* we still need to find current client entry */
+                    continue;
+            }
+
+            /* any other client */
+
+            /* XXX: Send notifications asynchronously
+             * instead of in the I/O path - BZ 1200264
+             *  Also if the file is frequently accessed, set
+             *  expire_time_attr to 0.
+             */
+            upcall_client_cache_invalidate(
+                this, up_inode_ctx->gfid, up_client_entry, flags, stbuf,
+                p_stbuf, oldp_stbuf, xattr, time_now);
+        }
+
+        /* first access by this client: start tracking it (a failure is
+         * logged inside and otherwise ignored) */
+        if (!found)
+            __add_upcall_client(frame, client, up_inode_ctx, time_now);
+    }
+    pthread_mutex_unlock(&up_inode_ctx->client_list_lock);
+out:
+    /* release the ref from inode_find */
+    if (linked_inode)
+        inode_unref(linked_inode);
+    return;
+}
+
+/*
+ * Notify one tracked client about a change to the object @gfid.
+ *
+ * If the upcall_client_t has recently accessed the file (i.e, less than the
+ * cache-invalidation timeout before @now), a GF_UPCALL_CACHE_INVALIDATION
+ * request carrying @flags, the supplied stats and @xattr is passed to
+ * this->notify(). A failing notify drops the client entry. Otherwise no
+ * notification is sent, and the entry is dropped once it has been idle for
+ * more than twice the timeout.
+ *
+ * Nothing is done when @gfid is null.
+ */
+static void
+upcall_client_cache_invalidate(xlator_t *this, uuid_t gfid,
+                               upcall_client_t *up_client_entry, uint32_t flags,
+                               struct iatt *stbuf, struct iatt *p_stbuf,
+                               struct iatt *oldp_stbuf, dict_t *xattr,
+                               time_t now)
+{
+    struct gf_upcall up_req = {
+        0,
+    };
+    struct gf_upcall_cache_invalidation ca_req = {
+        0,
+    };
+    time_t idle = now - up_client_entry->access_time;
+    time_t timeout = 0;
+
+    GF_VALIDATE_OR_GOTO("upcall_client_cache_invalidate",
+                        !(gf_uuid_is_null(gfid)), out);
+    timeout = get_cache_invalidation_timeout(this);
+
+    if (idle >= timeout) {
+        gf_log(THIS->name, GF_LOG_TRACE,
+               "Cache invalidation notification NOT sent to %s",
+               up_client_entry->client_uid);
+
+        /* Cleanup the entry once it is stale for over twice the timeout */
+        if (idle > (2 * timeout))
+            __upcall_cleanup_client_entry(up_client_entry);
+
+        goto out;
+    }
+
+    /* Payload of the cache-invalidation event */
+    ca_req.flags = flags;
+    ca_req.expire_time_attr = up_client_entry->expire_time_attr;
+    if (stbuf)
+        ca_req.stat = *stbuf;
+    if (p_stbuf)
+        ca_req.p_stat = *p_stbuf;
+    if (oldp_stbuf)
+        ca_req.oldp_stat = *oldp_stbuf;
+    ca_req.dict = xattr;
+
+    /* Envelope addressed to the client that owns this entry */
+    up_req.client_uid = up_client_entry->client_uid;
+    gf_uuid_copy(up_req.gfid, gfid);
+    up_req.data = &ca_req;
+    up_req.event_type = GF_UPCALL_CACHE_INVALIDATION;
+
+    gf_log(THIS->name, GF_LOG_TRACE,
+           "Cache invalidation notification sent to %s",
+           up_client_entry->client_uid);
+
+    /*
+     * notify may fail as the client could have been
+     * dis(re)connected. Cleanup the client entry.
+     */
+    if (this->notify(this, GF_EVENT_UPCALL, &up_req) < 0)
+        __upcall_cleanup_client_entry(up_client_entry);
+
+out:
+    return;
+}
+
+/*
+ * Called while an upcall_inode_ctx is being cleaned up. Sends UP_FORGET to
+ * every client tracked on @up_inode_ctx so that they invalidate their cache
+ * entry and do a fresh lookup on their next I/O. A NULL ctx is a no-op.
+ */
+static void
+upcall_cache_forget(xlator_t *this, inode_t *inode,
+                    upcall_inode_ctx_t *up_inode_ctx)
+{
+    upcall_client_t *entry = NULL;
+    upcall_client_t *next = NULL;
+    time_t stamp;
+
+    if (up_inode_ctx == NULL)
+        return;
+
+    stamp = gf_time();
+
+    pthread_mutex_lock(&up_inode_ctx->client_list_lock);
+    list_for_each_entry_safe(entry, next, &up_inode_ctx->client_list,
+                             client_list)
+    {
+        /* Refresh the access time so the entry counts as recently
+         * active and the notification is actually sent */
+        entry->access_time = stamp;
+
+        upcall_client_cache_invalidate(this, up_inode_ctx->gfid, entry,
+                                       UP_FORGET, NULL, NULL, NULL, NULL,
+                                       stamp);
+    }
+    pthread_mutex_unlock(&up_inode_ctx->client_list_lock);
+}
diff --git a/xlators/features/upcall/src/upcall-mem-types.h b/xlators/features/upcall/src/upcall-mem-types.h
new file mode 100644
index 00000000000..f9883d9d72c
--- /dev/null
+++ b/xlators/features/upcall/src/upcall-mem-types.h
@@ -0,0 +1,23 @@
+/*
+   Copyright (c) 2015 Red Hat, Inc. <http://www.redhat.com>
+   This file is part of GlusterFS.
+
+   This file is licensed to you under your choice of the GNU Lesser
+   General Public License, version 3 or any later version (LGPLv3 or
+   later), or the GNU General Public License, version 2 (GPLv2), in all
+   cases as published by the Free Software Foundation.
+*/
+
+#ifndef __UPCALL_MEM_TYPES_H__
+#define __UPCALL_MEM_TYPES_H__
+
+#include <glusterfs/mem-types.h>
+
+/* Memory type ids of the upcall xlator, passed as the type argument of
+ * GF_MALLOC(). Numbering continues right after gf_common_mt_end. */
+enum gf_upcall_mem_types_ {
+    gf_upcall_mt_conf_t = gf_common_mt_end + 1,
+    gf_upcall_mt_private_t,
+    gf_upcall_mt_upcall_inode_ctx_t,    /* upcall_inode_ctx_t allocations */
+    gf_upcall_mt_upcall_client_entry_t, /* upcall_client_t allocations */
+    gf_upcall_mt_end
+};
+#endif
diff --git a/xlators/features/upcall/src/upcall-messages.h b/xlators/features/upcall/src/upcall-messages.h
new file mode 100644
index 00000000000..4095a34c200
--- /dev/null
+++ b/xlators/features/upcall/src/upcall-messages.h
@@ -0,0 +1,29 @@
+/*
+ Copyright (c) 2015 Red Hat, Inc. <http://www.redhat.com>
+ This file is part of GlusterFS.
+
+ This file is licensed to you under your choice of the GNU Lesser
+ General Public License, version 3 or any later version (LGPLv3 or
+ later), or the GNU General Public License, version 2 (GPLv2), in all
+ cases as published by the Free Software Foundation.
+ */
+
+#ifndef _UPCALL_MESSAGES_H_
+#define _UPCALL_MESSAGES_H_
+
+#include <glusterfs/glfs-message-id.h>
+
+/* To add new message IDs, append new identifiers at the end of the list.
+ *
+ * Never remove a message ID. If it's not used anymore, you can rename it or
+ * leave it as it is, but not delete it. This is to prevent reutilization of
+ * IDs by other messages.
+ *
+ * The component name must match one of the entries defined in
+ * glfs-message-id.h.
+ */
+
+/* Message identifiers of the UPCALL component, as passed to gf_msg(). */
+GLFS_MSGID(UPCALL, UPCALL_MSG_NO_MEMORY, UPCALL_MSG_INTERNAL_ERROR,
+           UPCALL_MSG_NOTIFY_FAILED);
+
+#endif /* !_UPCALL_MESSAGES_H_ */
diff --git a/xlators/features/upcall/src/upcall.c b/xlators/features/upcall/src/upcall.c
new file mode 100644
index 00000000000..0795f58059d
--- /dev/null
+++ b/xlators/features/upcall/src/upcall.c
@@ -0,0 +1,2505 @@
+/*
+   Copyright (c) 2015 Red Hat, Inc. <http://www.redhat.com>
+   This file is part of GlusterFS.
+
+   This file is licensed to you under your choice of the GNU Lesser
+   General Public License, version 3 or any later version (LGPLv3 or
+   later), or the GNU General Public License, version 2 (GPLv2), in all
+   cases as published by the Free Software Foundation.
+*/
+
+#include <unistd.h>
+#include <fcntl.h>
+#include <limits.h>
+#include <pthread.h>
+
+#include <glusterfs/glusterfs.h>
+#include <glusterfs/compat.h>
+#include <glusterfs/xlator.h>
+#include <glusterfs/logging.h>
+#include <glusterfs/common-utils.h>
+
+#include <glusterfs/statedump.h>
+
+#include "upcall.h"
+#include "upcall-mem-types.h"
+#include "glusterfs3-xdr.h"
+#include "protocol-common.h"
+#include <glusterfs/defaults.h>
+
+/* open() callback: on success, calls upcall_cache_invalidate() for
+ * local->inode with UP_UPDATE_CLIENT; the reply is always unwound as-is. */
+static int32_t
+up_open_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret,
+            int32_t op_errno, fd_t *fd, dict_t *xdata)
+{
+    upcall_local_t *local = NULL;
+    client_t *client = NULL;
+
+    EXIT_IF_UPCALL_OFF(this, out);
+
+    local = frame->local;
+    client = frame->root->client;
+
+    /* Skip upcall handling for failed fops or when frame->local is unset. */
+    if (local && (op_ret >= 0)) {
+        upcall_cache_invalidate(frame, this, client, local->inode,
+                                UP_UPDATE_CLIENT, NULL, NULL, NULL, NULL);
+    }
+
+out:
+    UPCALL_STACK_UNWIND(open, frame, op_ret, op_errno, fd, xdata);
+
+    return 0;
+}
+
+/* open() fop: winds to the first child with up_open_cbk as callback.
+ * Unless upcall is off, the upcall local for fd->inode is set up first; if
+ * that fails, the fop is unwound with ENOMEM instead of being wound.
+ */
+static int32_t
+up_open(call_frame_t *frame, xlator_t *this, loc_t *loc, int32_t flags,
+        fd_t *fd, dict_t *xdata)
+{
+    upcall_local_t *local = NULL;
+
+    EXIT_IF_UPCALL_OFF(this, out);
+
+    local = upcall_local_init(frame, this, NULL, NULL, fd->inode, NULL);
+    if (local == NULL) {
+        /* upcall_local_init() failed: fail the fop instead of winding it. */
+        UPCALL_STACK_UNWIND(open, frame, -1, ENOMEM, NULL, NULL);
+        return 0;
+    }
+
+out:
+    STACK_WIND(frame, up_open_cbk, FIRST_CHILD(this),
+               FIRST_CHILD(this)->fops->open, loc, flags, fd, xdata);
+
+    return 0;
+}
+
+/* writev() callback: on success, calls upcall_cache_invalidate() for
+ * local->inode with UP_WRITE_FLAGS and postbuf; the reply is always unwound. */
+static int32_t
+up_writev_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int op_ret,
+              int op_errno, struct iatt *prebuf, struct iatt *postbuf,
+              dict_t *xdata)
+{
+    upcall_local_t *local = NULL;
+    client_t *client = NULL;
+
+    /* Like every other callback in this file, do nothing if upcall is off. */
+    EXIT_IF_UPCALL_OFF(this, out);
+    local = frame->local;
+    client = frame->root->client;
+    if (local && (op_ret >= 0)) {
+        upcall_cache_invalidate(frame, this, client, local->inode,
+                                UP_WRITE_FLAGS, postbuf, NULL, NULL, NULL);
+    }
+
+out:
+    UPCALL_STACK_UNWIND(writev, frame, op_ret, op_errno, prebuf, postbuf,
+                        xdata);
+
+    return 0;
+}
+
+/* writev() fop: winds to the first child with up_writev_cbk as callback.
+ * Unless upcall is off, the upcall local for fd->inode is set up first; if
+ * that fails, the fop is unwound with ENOMEM instead of being wound.
+ */
+static int32_t
+up_writev(call_frame_t *frame, xlator_t *this, fd_t *fd, struct iovec *vector,
+          int count, off_t off, uint32_t flags, struct iobref *iobref,
+          dict_t *xdata)
+{
+    upcall_local_t *local = NULL;
+
+    EXIT_IF_UPCALL_OFF(this, out);
+
+    local = upcall_local_init(frame, this, NULL, NULL, fd->inode, NULL);
+    if (local == NULL) {
+        /* upcall_local_init() failed: fail the fop instead of winding it. */
+        UPCALL_STACK_UNWIND(writev, frame, -1, ENOMEM, NULL, NULL, NULL);
+        return 0;
+    }
+
+out:
+    STACK_WIND(frame, up_writev_cbk, FIRST_CHILD(this),
+               FIRST_CHILD(this)->fops->writev, fd, vector, count, off, flags,
+               iobref, xdata);
+
+    return 0;
+}
+
+/* readv() callback: on success, calls upcall_cache_invalidate() for
+ * local->inode with UP_UPDATE_CLIENT and stbuf; the reply is always unwound. */
+static int32_t
+up_readv_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int op_ret,
+             int op_errno, struct iovec *vector, int count, struct iatt *stbuf,
+             struct iobref *iobref, dict_t *xdata)
+{
+    upcall_local_t *local = NULL;
+    client_t *client = NULL;
+
+    EXIT_IF_UPCALL_OFF(this, out);
+
+    local = frame->local;
+    client = frame->root->client;
+
+    /* Skip upcall handling for failed fops or when frame->local is unset. */
+    if (local && (op_ret >= 0)) {
+        upcall_cache_invalidate(frame, this, client, local->inode,
+                                UP_UPDATE_CLIENT, stbuf, NULL, NULL, NULL);
+    }
+
+out:
+    UPCALL_STACK_UNWIND(readv, frame, op_ret, op_errno, vector, count, stbuf,
+                        iobref, xdata);
+
+    return 0;
+}
+
+/* readv() fop: winds to the first child with up_readv_cbk as callback.
+ * Unless upcall is off, the upcall local for fd->inode is set up first; if
+ * that fails, the fop is unwound with ENOMEM instead of being wound. */
+static int32_t
+up_readv(call_frame_t *frame, xlator_t *this, fd_t *fd, size_t size,
+         off_t offset, uint32_t flags, dict_t *xdata)
+{
+    upcall_local_t *local = NULL;
+
+    EXIT_IF_UPCALL_OFF(this, out);
+
+    local = upcall_local_init(frame, this, NULL, NULL, fd->inode, NULL);
+    if (local == NULL) {
+        /* upcall_local_init() failed: fail the fop instead of winding it. */
+        UPCALL_STACK_UNWIND(readv, frame, -1, ENOMEM, NULL, 0, NULL, NULL,
+                            NULL);
+        return 0;
+    }
+
+out:
+    STACK_WIND(frame, up_readv_cbk, FIRST_CHILD(this),
+               FIRST_CHILD(this)->fops->readv, fd, size, offset, flags, xdata);
+
+    return 0;
+}
+
+/* lk() callback: on success, calls upcall_cache_invalidate() for
+ * local->inode with UP_UPDATE_CLIENT; the reply is always unwound as-is. */
+static int32_t
+up_lk_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret,
+          int32_t op_errno, struct gf_flock *lock, dict_t *xdata)
+{
+    upcall_local_t *local = NULL;
+    client_t *client = NULL;
+
+    EXIT_IF_UPCALL_OFF(this, out);
+
+    local = frame->local;
+    client = frame->root->client;
+
+    /* Skip upcall handling for failed fops or when frame->local is unset. */
+    if (local && (op_ret >= 0)) {
+        upcall_cache_invalidate(frame, this, client, local->inode,
+                                UP_UPDATE_CLIENT, NULL, NULL, NULL, NULL);
+    }
+
+out:
+    UPCALL_STACK_UNWIND(lk, frame, op_ret, op_errno, lock, xdata);
+
+    return 0;
+}
+
+/* lk() fop: winds to the first child with up_lk_cbk as callback.
+ * Unless upcall is off, the upcall local for fd->inode is set up first; if
+ * that fails, the fop is unwound with ENOMEM instead of being wound. */
+static int32_t
+up_lk(call_frame_t *frame, xlator_t *this, fd_t *fd, int32_t cmd,
+      struct gf_flock *flock, dict_t *xdata)
+{
+    upcall_local_t *local = NULL;
+
+    EXIT_IF_UPCALL_OFF(this, out);
+
+    local = upcall_local_init(frame, this, NULL, NULL, fd->inode, NULL);
+    if (local == NULL) {
+        /* upcall_local_init() failed: fail the fop instead of winding it. */
+        UPCALL_STACK_UNWIND(lk, frame, -1, ENOMEM, NULL, NULL);
+        return 0;
+    }
+
+out:
+    STACK_WIND(frame, up_lk_cbk, FIRST_CHILD(this), FIRST_CHILD(this)->fops->lk,
+               fd, cmd, flock, xdata);
+
+    return 0;
+}
+
+/* truncate()/ftruncate() callback (up_ftruncate winds with it too): on
+ * success, invalidates local->inode with UP_WRITE_FLAGS and postbuf. */
+static int32_t
+up_truncate_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int op_ret,
+                int op_errno, struct iatt *prebuf, struct iatt *postbuf,
+                dict_t *xdata)
+{
+    upcall_local_t *local = NULL;
+    client_t *client = NULL;
+
+    EXIT_IF_UPCALL_OFF(this, out);
+
+    local = frame->local;
+    client = frame->root->client;
+
+    /* Skip upcall handling for failed fops or when frame->local is unset. */
+    if (local && (op_ret >= 0)) {
+        upcall_cache_invalidate(frame, this, client, local->inode,
+                                UP_WRITE_FLAGS, postbuf, NULL, NULL, NULL);
+    }
+
+out:
+    UPCALL_STACK_UNWIND(truncate, frame, op_ret, op_errno, prebuf, postbuf,
+                        xdata);
+
+    return 0;
+}
+
+/* truncate() fop: winds to the first child with up_truncate_cbk as callback.
+ * Unless upcall is off, the upcall local for loc->inode is set up first; if
+ * that fails, the fop is unwound with ENOMEM instead of being wound.
+ */
+static int32_t
+up_truncate(call_frame_t *frame, xlator_t *this, loc_t *loc, off_t offset,
+            dict_t *xdata)
+{
+    upcall_local_t *local = NULL;
+
+    EXIT_IF_UPCALL_OFF(this, out);
+
+    local = upcall_local_init(frame, this, NULL, NULL, loc->inode, NULL);
+    if (local == NULL) {
+        /* upcall_local_init() failed: fail the fop instead of winding it. */
+        UPCALL_STACK_UNWIND(truncate, frame, -1, ENOMEM, NULL, NULL, NULL);
+        return 0;
+    }
+
+out:
+    STACK_WIND(frame, up_truncate_cbk, FIRST_CHILD(this),
+               FIRST_CHILD(this)->fops->truncate, loc, offset, xdata);
+
+    return 0;
+}
+
+/* setattr() callback: on success, invalidates local->inode with UP_ATTR_FLAGS
+ * (plus UP_XATTR when the mode bits changed), then unwinds the reply as-is. */
+static int32_t
+up_setattr_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int op_ret,
+               int op_errno, struct iatt *statpre, struct iatt *statpost,
+               dict_t *xdata)
+{
+    upcall_local_t *local = NULL;
+    client_t *client = NULL;
+    uint32_t flags = UP_ATTR_FLAGS;
+
+    EXIT_IF_UPCALL_OFF(this, out);
+
+    local = frame->local;
+    client = frame->root->client;
+
+    if (!local || (op_ret < 0)) {
+        goto out;
+    }
+
+    /* XXX (Bug1200271): UP_ATTR_FLAGS is sent whatever attribute changed.
+     * We need to check which one did and pass UP_SIZE, UP_OWN, UP_MODE,
+     * UP_TIMES or INODE_UPDATE (or UP_PERM, esp. for ACLs -> INODE_INVALIDATE).
+     *
+     * A mode change must also invalidate xattrs: posix-acl and others store
+     * permission information there, so clients have to forget those xattrs.
+     * TODO: Invalidate the xattr system.posix_acl_access alone.
+     */
+    if (is_same_mode(statpre->ia_prot, statpost->ia_prot) != 0) {
+        flags |= UP_XATTR;
+    }
+
+    upcall_cache_invalidate(frame, this, client, local->inode, flags, statpost,
+                            NULL, NULL, NULL);
+
+out:
+    UPCALL_STACK_UNWIND(setattr, frame, op_ret, op_errno, statpre, statpost,
+                        xdata);
+
+    return 0;
+}
+
+/* setattr() fop: winds to the first child with up_setattr_cbk as callback.
+ * Unless upcall is off, the upcall local for loc->inode is set up first; if
+ * that fails, the fop is unwound with ENOMEM instead of being wound.
+ */
+static int32_t
+up_setattr(call_frame_t *frame, xlator_t *this, loc_t *loc, struct iatt *stbuf,
+           int32_t valid, dict_t *xdata)
+{
+    upcall_local_t *local = NULL;
+
+    EXIT_IF_UPCALL_OFF(this, out);
+
+    local = upcall_local_init(frame, this, NULL, NULL, loc->inode, NULL);
+    if (local == NULL) {
+        /* upcall_local_init() failed: fail the fop instead of winding it. */
+        UPCALL_STACK_UNWIND(setattr, frame, -1, ENOMEM, NULL, NULL, NULL);
+        return 0;
+    }
+
+out:
+    STACK_WIND(frame, up_setattr_cbk, FIRST_CHILD(this),
+               FIRST_CHILD(this)->fops->setattr, loc, stbuf, valid, xdata);
+
+    return 0;
+}
+
+/* rename() callback: on success, invalidates the renamed inode and then the
+ * old parent (and the new parent too, when it is a different inode). */
+static int32_t
+up_rename_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret,
+              int32_t op_errno, struct iatt *stbuf, struct iatt *preoldparent,
+              struct iatt *postoldparent, struct iatt *prenewparent,
+              struct iatt *postnewparent, dict_t *xdata)
+{
+    upcall_local_t *local = NULL;
+    client_t *client = NULL;
+
+    EXIT_IF_UPCALL_OFF(this, out);
+
+    local = frame->local;
+    client = frame->root->client;
+
+    if (!local || (op_ret < 0)) {
+        goto out;
+    }
+    /* local->inode, with the post-op attrs of both parents attached. */
+    upcall_cache_invalidate(frame, this, client, local->inode,
+                            UP_RENAME_FLAGS | UP_PARENT_DENTRY_FLAGS, stbuf,
+                            postnewparent, postoldparent, NULL);
+
+    /* Old parent always; the new parent only if it differs from the old. */
+    upcall_cache_invalidate(frame, this, client, local->rename_oldloc.parent,
+                            UP_UPDATE_CLIENT, postoldparent, NULL, NULL, NULL);
+    if (local->loc.parent != local->rename_oldloc.parent) {
+        upcall_cache_invalidate(frame, this, client, local->loc.parent,
+                                UP_UPDATE_CLIENT, postnewparent, NULL, NULL,
+                                NULL);
+    }
+
+out:
+    UPCALL_STACK_UNWIND(rename, frame, op_ret, op_errno, stbuf, preoldparent,
+                        postoldparent, prenewparent, postnewparent, xdata);
+
+    return 0;
+}
+
+/* rename() fop: winds to the first child with up_rename_cbk as callback.
+ * Unless upcall is off, an upcall local is first set up from newloc and
+ * oldloc->inode (ENOMEM unwind on failure) and oldloc is copied into it. */
+static int32_t
+up_rename(call_frame_t *frame, xlator_t *this, loc_t *oldloc, loc_t *newloc,
+          dict_t *xdata)
+{
+    upcall_local_t *local = NULL;
+
+    EXIT_IF_UPCALL_OFF(this, out);
+
+    local = upcall_local_init(frame, this, newloc, NULL, oldloc->inode, NULL);
+    if (local == NULL) {
+        UPCALL_STACK_UNWIND(rename, frame, -1, ENOMEM, NULL, NULL, NULL, NULL,
+                            NULL, NULL);
+        return 0;
+    }
+
+    /* Keep a copy of oldloc; up_rename_cbk reads rename_oldloc.parent.
+     * NOTE(review): the loc_copy() return value is not checked here. */
+    loc_copy(&local->rename_oldloc, oldloc);
+
+out:
+    STACK_WIND(frame, up_rename_cbk, FIRST_CHILD(this),
+               FIRST_CHILD(this)->fops->rename, oldloc, newloc, xdata);
+
+    return 0;
+}
+
+/* unlink() callback: on success, invalidates local->inode and then
+ * local->loc.parent; the reply is always unwound unchanged. */
+static int32_t
+up_unlink_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int op_ret,
+              int op_errno, struct iatt *preparent, struct iatt *postparent,
+              dict_t *xdata)
+{
+    upcall_local_t *local = NULL;
+    client_t *client = NULL;
+
+    EXIT_IF_UPCALL_OFF(this, out);
+
+    local = frame->local;
+    client = frame->root->client;
+
+    if (local && (op_ret >= 0)) {
+        /* The inode itself, with the parent's post-op attrs attached. */
+        upcall_cache_invalidate(frame, this, client, local->inode,
+                                UP_NLINK_FLAGS | UP_PARENT_DENTRY_FLAGS, NULL,
+                                postparent, NULL, NULL);
+        /* The parent: UP_UPDATE_CLIENT with its post-op attrs. */
+        upcall_cache_invalidate(frame, this, client, local->loc.parent,
+                                UP_UPDATE_CLIENT, postparent, NULL, NULL, NULL);
+    }
+
+out:
+    UPCALL_STACK_UNWIND(unlink, frame, op_ret, op_errno, preparent, postparent,
+                        xdata);
+
+    return 0;
+}
+
+/* unlink() fop: winds to the first child with up_unlink_cbk as callback.
+ * Unless upcall is off, an upcall local is first set up from loc and
+ * loc->inode; if that fails, the fop is unwound with ENOMEM instead.
+ */
+static int32_t
+up_unlink(call_frame_t *frame, xlator_t *this, loc_t *loc, int xflag,
+          dict_t *xdata)
+{
+    upcall_local_t *local = NULL;
+
+    EXIT_IF_UPCALL_OFF(this, out);
+
+    local = upcall_local_init(frame, this, loc, NULL, loc->inode, NULL);
+    if (local == NULL) {
+        /* upcall_local_init() failed: fail the fop instead of winding it. */
+        UPCALL_STACK_UNWIND(unlink, frame, -1, ENOMEM, NULL, NULL, NULL);
+        return 0;
+    }
+
+out:
+    STACK_WIND(frame, up_unlink_cbk, FIRST_CHILD(this),
+               FIRST_CHILD(this)->fops->unlink, loc, xflag, xdata);
+
+    return 0;
+}
+
+/* link() callback: on success, invalidates local->inode and then
+ * local->loc.parent; the reply is always unwound unchanged. */
+static int32_t
+up_link_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int op_ret,
+            int op_errno, inode_t *inode, struct iatt *stbuf,
+            struct iatt *preparent, struct iatt *postparent, dict_t *xdata)
+{
+    upcall_local_t *local = NULL;
+    client_t *client = NULL;
+
+    EXIT_IF_UPCALL_OFF(this, out);
+
+    local = frame->local;
+    client = frame->root->client;
+
+    if (local && (op_ret >= 0)) {
+        /* The inode itself, with stbuf and the parent's post-op attrs. */
+        upcall_cache_invalidate(frame, this, client, local->inode,
+                                UP_NLINK_FLAGS | UP_PARENT_DENTRY_FLAGS, stbuf,
+                                postparent, NULL, NULL);
+        /* The parent: UP_UPDATE_CLIENT with its post-op attrs. */
+        upcall_cache_invalidate(frame, this, client, local->loc.parent,
+                                UP_UPDATE_CLIENT, postparent, NULL, NULL, NULL);
+    }
+
+out:
+    UPCALL_STACK_UNWIND(link, frame, op_ret, op_errno, inode, stbuf, preparent,
+                        postparent, xdata);
+
+    return 0;
+}
+
+/* link() fop: winds to the first child with up_link_cbk as callback.
+ * Unless upcall is off, an upcall local is first set up from newloc and
+ * oldloc->inode; if that fails, the fop is unwound with ENOMEM instead.
+ */
+static int32_t
+up_link(call_frame_t *frame, xlator_t *this, loc_t *oldloc, loc_t *newloc,
+        dict_t *xdata)
+{
+    upcall_local_t *local = NULL;
+
+    EXIT_IF_UPCALL_OFF(this, out);
+
+    local = upcall_local_init(frame, this, newloc, NULL, oldloc->inode, NULL);
+    if (local == NULL) {
+        /* upcall_local_init() failed: fail the fop instead of winding it. */
+        UPCALL_STACK_UNWIND(link, frame, -1, ENOMEM, NULL, NULL, NULL, NULL,
+                            NULL);
+        return 0;
+    }
+
+out:
+    STACK_WIND(frame, up_link_cbk, FIRST_CHILD(this),
+               FIRST_CHILD(this)->fops->link, oldloc, newloc, xdata);
+
+    return 0;
+}
+
+/* rmdir() callback: on success, invalidates local->inode and then
+ * local->loc.parent; the reply is always unwound unchanged. */
+static int32_t
+up_rmdir_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int op_ret,
+             int op_errno, struct iatt *preparent, struct iatt *postparent,
+             dict_t *xdata)
+{
+    upcall_local_t *local = NULL;
+    client_t *client = NULL;
+
+    EXIT_IF_UPCALL_OFF(this, out);
+
+    local = frame->local;
+    client = frame->root->client;
+
+    if (local && (op_ret >= 0)) {
+        /* The inode itself, with the parent's post-op attrs attached. */
+        upcall_cache_invalidate(frame, this, client, local->inode,
+                                UP_NLINK_FLAGS | UP_PARENT_DENTRY_FLAGS, NULL,
+                                postparent, NULL, NULL);
+
+        /* The parent: UP_UPDATE_CLIENT with its post-op attrs. */
+        upcall_cache_invalidate(frame, this, client, local->loc.parent,
+                                UP_UPDATE_CLIENT, postparent, NULL, NULL, NULL);
+    }
+
+out:
+    UPCALL_STACK_UNWIND(rmdir, frame, op_ret, op_errno, preparent, postparent,
+                        xdata);
+
+    return 0;
+}
+
+/* rmdir() fop: winds to the first child with up_rmdir_cbk as callback.
+ * Unless upcall is off, an upcall local is first set up from loc and
+ * loc->inode; if that fails, the fop is unwound with ENOMEM instead.
+ */
+static int32_t
+up_rmdir(call_frame_t *frame, xlator_t *this, loc_t *loc, int flags,
+         dict_t *xdata)
+{
+    upcall_local_t *local = NULL;
+
+    EXIT_IF_UPCALL_OFF(this, out);
+
+    local = upcall_local_init(frame, this, loc, NULL, loc->inode, NULL);
+    if (local == NULL) {
+        /* upcall_local_init() failed: fail the fop instead of winding it. */
+        UPCALL_STACK_UNWIND(rmdir, frame, -1, ENOMEM, NULL, NULL, NULL);
+        return 0;
+    }
+
+out:
+    STACK_WIND(frame, up_rmdir_cbk, FIRST_CHILD(this),
+               FIRST_CHILD(this)->fops->rmdir, loc, flags, xdata);
+
+    return 0;
+}
+
+/* mkdir() callback: on success, calls upcall_cache_invalidate() twice - for
+ * local->inode (the parent, going by the postparent attrs passed) with
+ * UP_TIMES, and for local->loc.inode with UP_UPDATE_CLIENT and stbuf. */
+static int32_t
+up_mkdir_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int op_ret,
+             int op_errno, inode_t *inode, struct iatt *stbuf,
+             struct iatt *preparent, struct iatt *postparent, dict_t *xdata)
+{
+    upcall_local_t *local = NULL;
+    client_t *client = NULL;
+
+    EXIT_IF_UPCALL_OFF(this, out);
+
+    local = frame->local;
+    client = frame->root->client;
+
+    if (local && (op_ret >= 0)) {
+        /* invalidate parent's entry too */
+        upcall_cache_invalidate(frame, this, client, local->inode, UP_TIMES,
+                                postparent, NULL, NULL, NULL);
+
+        /* Presumably records this client's access to the new entry, as the
+         * comment in up_create_cbk describes for the same flag. */
+        upcall_cache_invalidate(frame, this, client, local->loc.inode,
+                                UP_UPDATE_CLIENT, stbuf, NULL, NULL, NULL);
+    }
+
+out:
+    UPCALL_STACK_UNWIND(mkdir, frame, op_ret, op_errno, inode, stbuf, preparent,
+                        postparent, xdata);
+
+    return 0;
+}
+
+/* mkdir() fop: winds to the first child with up_mkdir_cbk as callback.
+ * Unless upcall is off, an upcall local is first set up from loc and
+ * loc->parent; if that fails, the fop is unwound with ENOMEM instead.
+ */
+static int32_t
+up_mkdir(call_frame_t *frame, xlator_t *this, loc_t *loc, mode_t mode,
+         mode_t umask, dict_t *params)
+{
+    upcall_local_t *local = NULL;
+
+    EXIT_IF_UPCALL_OFF(this, out);
+
+    local = upcall_local_init(frame, this, loc, NULL, loc->parent, NULL);
+    if (local == NULL) {
+        /* upcall_local_init() failed: fail the fop instead of winding it. */
+        UPCALL_STACK_UNWIND(mkdir, frame, -1, ENOMEM, NULL, NULL, NULL, NULL,
+                            NULL);
+        return 0;
+    }
+
+out:
+    STACK_WIND(frame, up_mkdir_cbk, FIRST_CHILD(this),
+               FIRST_CHILD(this)->fops->mkdir, loc, mode, umask, params);
+
+    return 0;
+}
+
+/* create() callback: on success, calls upcall_cache_invalidate() twice - for
+ * local->inode (the parent, going by the postparent attrs passed) with
+ * UP_TIMES, and for local->loc.inode with UP_UPDATE_CLIENT and stbuf. */
+static int32_t
+up_create_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int op_ret,
+              int op_errno, fd_t *fd, inode_t *inode, struct iatt *stbuf,
+              struct iatt *preparent, struct iatt *postparent, dict_t *xdata)
+{
+    upcall_local_t *local = NULL;
+    client_t *client = NULL;
+
+    EXIT_IF_UPCALL_OFF(this, out);
+
+    local = frame->local;
+    client = frame->root->client;
+
+    if (local && (op_ret >= 0)) {
+        /* The file is newly created, so no notification is needed for it.
+         * The parent's entry, however, still has to be invalidated ...
+         */
+        upcall_cache_invalidate(frame, this, client, local->inode, UP_TIMES,
+                                postparent, NULL, NULL, NULL);
+
+        /* ... and the fact that this client has accessed the newly created
+         * entry has to be recorded. */
+        upcall_cache_invalidate(frame, this, client, local->loc.inode,
+                                UP_UPDATE_CLIENT, stbuf, NULL, NULL, NULL);
+    }
+
+out:
+    UPCALL_STACK_UNWIND(create, frame, op_ret, op_errno, fd, inode, stbuf,
+                        preparent, postparent, xdata);
+
+    return 0;
+}
+
+/* create() fop: winds to the first child with up_create_cbk as callback.
+ * Unless upcall is off, an upcall local is first set up from loc and
+ * loc->parent; if that fails, the fop is unwound with ENOMEM instead.
+ */
+static int32_t
+up_create(call_frame_t *frame, xlator_t *this, loc_t *loc, int32_t flags,
+          mode_t mode, mode_t umask, fd_t *fd, dict_t *params)
+{
+    upcall_local_t *local = NULL;
+
+    EXIT_IF_UPCALL_OFF(this, out);
+
+    local = upcall_local_init(frame, this, loc, NULL, loc->parent, NULL);
+    if (local == NULL) {
+        /* upcall_local_init() failed: fail the fop instead of winding it. */
+        UPCALL_STACK_UNWIND(create, frame, -1, ENOMEM, NULL, NULL, NULL, NULL,
+                            NULL, NULL);
+        return 0;
+    }
+
+out:
+    STACK_WIND(frame, up_create_cbk, FIRST_CHILD(this),
+               FIRST_CHILD(this)->fops->create, loc, flags, mode, umask, fd,
+               params);
+
+    return 0;
+}
+
+/* lookup() callback: on success, calls upcall_cache_invalidate() for
+ * local->inode with UP_UPDATE_CLIENT and stbuf; the reply is always unwound. */
+static int32_t
+up_lookup_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int op_ret,
+              int op_errno, inode_t *inode, struct iatt *stbuf, dict_t *xattr,
+              struct iatt *postparent)
+{
+    upcall_local_t *local = NULL;
+    client_t *client = NULL;
+
+    EXIT_IF_UPCALL_OFF(this, out);
+
+    local = frame->local;
+    client = frame->root->client;
+
+    /* Skip upcall handling for failed fops or when frame->local is unset. */
+    if (local && (op_ret >= 0)) {
+        upcall_cache_invalidate(frame, this, client, local->inode,
+                                UP_UPDATE_CLIENT, stbuf, NULL, NULL, NULL);
+    }
+
+out:
+    UPCALL_STACK_UNWIND(lookup, frame, op_ret, op_errno, inode, stbuf, xattr,
+                        postparent);
+
+    return 0;
+}
+
+/* lookup() fop: winds to the first child with up_lookup_cbk as callback.
+ * Unless upcall is off, the upcall local for loc->inode is set up first; if
+ * that fails, the fop is unwound with ENOMEM instead of being wound.
+ */
+static int32_t
+up_lookup(call_frame_t *frame, xlator_t *this, loc_t *loc, dict_t *xattr_req)
+{
+    upcall_local_t *local = NULL;
+
+    EXIT_IF_UPCALL_OFF(this, out);
+
+    local = upcall_local_init(frame, this, NULL, NULL, loc->inode, NULL);
+    if (local == NULL) {
+        /* upcall_local_init() failed: fail the fop instead of winding it. */
+        UPCALL_STACK_UNWIND(lookup, frame, -1, ENOMEM, NULL, NULL, NULL, NULL);
+        return 0;
+    }
+
+out:
+    STACK_WIND(frame, up_lookup_cbk, FIRST_CHILD(this),
+               FIRST_CHILD(this)->fops->lookup, loc, xattr_req);
+
+    return 0;
+}
+
+/* stat()/fstat() callback (up_fstat winds with it too): on success, calls
+ * upcall_cache_invalidate() for local->inode with UP_UPDATE_CLIENT and buf. */
+static int32_t
+up_stat_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret,
+            int32_t op_errno, struct iatt *buf, dict_t *xdata)
+{
+    upcall_local_t *local = NULL;
+    client_t *client = NULL;
+
+    EXIT_IF_UPCALL_OFF(this, out);
+
+    local = frame->local;
+    client = frame->root->client;
+
+    /* Skip upcall handling for failed fops or when frame->local is unset. */
+    if (local && (op_ret >= 0)) {
+        upcall_cache_invalidate(frame, this, client, local->inode,
+                                UP_UPDATE_CLIENT, buf, NULL, NULL, NULL);
+    }
+
+out:
+    UPCALL_STACK_UNWIND(stat, frame, op_ret, op_errno, buf, xdata);
+
+    return 0;
+}
+
+/* stat() fop: winds to the first child with up_stat_cbk as callback.
+ * Unless upcall is off, the upcall local for loc->inode is set up first; if
+ * that fails, the fop is unwound with ENOMEM instead of being wound.
+ */
+static int32_t
+up_stat(call_frame_t *frame, xlator_t *this, loc_t *loc, dict_t *xdata)
+{
+    upcall_local_t *local = NULL;
+
+    EXIT_IF_UPCALL_OFF(this, out);
+
+    local = upcall_local_init(frame, this, NULL, NULL, loc->inode, NULL);
+    if (local == NULL) {
+        /* upcall_local_init() failed: fail the fop instead of winding it. */
+        UPCALL_STACK_UNWIND(stat, frame, -1, ENOMEM, NULL, NULL);
+        return 0;
+    }
+
+out:
+    STACK_WIND(frame, up_stat_cbk, FIRST_CHILD(this),
+               FIRST_CHILD(this)->fops->stat, loc, xdata);
+
+    return 0;
+}
+
+/* fstat() fop: winds to the first child, reusing up_stat_cbk as callback.
+ * Unless upcall is off, the upcall local for fd->inode is set up first; if
+ * that fails, the fop is unwound with ENOMEM instead of being wound.
+ */
+static int32_t
+up_fstat(call_frame_t *frame, xlator_t *this, fd_t *fd, dict_t *xdata)
+{
+    upcall_local_t *local = NULL;
+
+    EXIT_IF_UPCALL_OFF(this, out);
+
+    local = upcall_local_init(frame, this, NULL, NULL, fd->inode, NULL);
+    if (local == NULL) {
+        /* upcall_local_init() failed: fail the fop instead of winding it. */
+        UPCALL_STACK_UNWIND(fstat, frame, -1, ENOMEM, NULL, NULL);
+        return 0;
+    }
+
+out:
+    STACK_WIND(frame, up_stat_cbk, FIRST_CHILD(this),
+               FIRST_CHILD(this)->fops->fstat, fd, xdata);
+
+    return 0;
+}
+
+/* ftruncate() fop: winds to the first child, reusing up_truncate_cbk.
+ * Unless upcall is off, the upcall local for fd->inode is set up first; if
+ * that fails, the fop is unwound with ENOMEM instead of being wound.
+ */
+static int32_t
+up_ftruncate(call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset,
+             dict_t *xdata)
+{
+    upcall_local_t *local = NULL;
+
+    EXIT_IF_UPCALL_OFF(this, out);
+
+    local = upcall_local_init(frame, this, NULL, NULL, fd->inode, NULL);
+    if (local == NULL) {
+        /* upcall_local_init() failed: fail the fop instead of winding it. */
+        UPCALL_STACK_UNWIND(ftruncate, frame, -1, ENOMEM, NULL, NULL, NULL);
+        return 0;
+    }
+
+out:
+    STACK_WIND(frame, up_truncate_cbk, FIRST_CHILD(this),
+               FIRST_CHILD(this)->fops->ftruncate, fd, offset, xdata);
+
+    return 0;
+}
+
+/* access() callback: on success, calls upcall_cache_invalidate() for
+ * local->inode with UP_UPDATE_CLIENT; the reply is always unwound as-is. */
+static int32_t
+up_access_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int op_ret,
+              int op_errno, dict_t *xdata)
+{
+    upcall_local_t *local = NULL;
+    client_t *client = NULL;
+
+    EXIT_IF_UPCALL_OFF(this, out);
+
+    local = frame->local;
+    client = frame->root->client;
+
+    /* Skip upcall handling for failed fops or when frame->local is unset. */
+    if (local && (op_ret >= 0)) {
+        upcall_cache_invalidate(frame, this, client, local->inode,
+                                UP_UPDATE_CLIENT, NULL, NULL, NULL, NULL);
+    }
+
+out:
+    UPCALL_STACK_UNWIND(access, frame, op_ret, op_errno, xdata);
+
+    return 0;
+}
+
+/* access() fop: winds to the first child with up_access_cbk as callback.
+ * Unless upcall is off, the upcall local for loc->inode is set up first; if
+ * that fails, the fop is unwound with ENOMEM instead of being wound.
+ */
+static int32_t
+up_access(call_frame_t *frame, xlator_t *this, loc_t *loc, int32_t mask,
+          dict_t *xdata)
+{
+    upcall_local_t *local = NULL;
+
+    EXIT_IF_UPCALL_OFF(this, out);
+
+    local = upcall_local_init(frame, this, NULL, NULL, loc->inode, NULL);
+    if (local == NULL) {
+        /* upcall_local_init() failed: fail the fop instead of winding it. */
+        UPCALL_STACK_UNWIND(access, frame, -1, ENOMEM, NULL);
+        return 0;
+    }
+
+out:
+    STACK_WIND(frame, up_access_cbk, FIRST_CHILD(this),
+               FIRST_CHILD(this)->fops->access, loc, mask, xdata);
+
+    return 0;
+}
+
+/* readlink() callback: on success, calls upcall_cache_invalidate() for
+ * local->inode with UP_UPDATE_CLIENT and stbuf; the reply is always unwound. */
+static int32_t
+up_readlink_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int op_ret,
+                int op_errno, const char *path, struct iatt *stbuf,
+                dict_t *xdata)
+{
+    upcall_local_t *local = NULL;
+    client_t *client = NULL;
+
+    EXIT_IF_UPCALL_OFF(this, out);
+
+    local = frame->local;
+    client = frame->root->client;
+
+    /* Skip upcall handling for failed fops or when frame->local is unset. */
+    if (local && (op_ret >= 0)) {
+        upcall_cache_invalidate(frame, this, client, local->inode,
+                                UP_UPDATE_CLIENT, stbuf, NULL, NULL, NULL);
+    }
+
+out:
+    UPCALL_STACK_UNWIND(readlink, frame, op_ret, op_errno, path, stbuf, xdata);
+
+    return 0;
+}
+
+/* readlink() fop: winds to the first child with up_readlink_cbk as callback.
+ * Unless upcall is off, the upcall local for loc->inode is set up first; if
+ * that fails, the fop is unwound with ENOMEM instead of being wound.
+ */
+static int32_t
+up_readlink(call_frame_t *frame, xlator_t *this, loc_t *loc, size_t size,
+            dict_t *xdata)
+{
+    upcall_local_t *local = NULL;
+
+    EXIT_IF_UPCALL_OFF(this, out);
+
+    local = upcall_local_init(frame, this, NULL, NULL, loc->inode, NULL);
+    if (local == NULL) {
+        /* upcall_local_init() failed: fail the fop instead of winding it. */
+        UPCALL_STACK_UNWIND(readlink, frame, -1, ENOMEM, NULL, NULL, NULL);
+        return 0;
+    }
+
+out:
+    STACK_WIND(frame, up_readlink_cbk, FIRST_CHILD(this),
+               FIRST_CHILD(this)->fops->readlink, loc, size, xdata);
+
+    return 0;
+}
+
+/* mknod() callback: on success, calls upcall_cache_invalidate() twice - for
+ * local->inode (the parent, going by the postparent attrs passed) with
+ * UP_TIMES, and for local->loc.inode with UP_UPDATE_CLIENT and buf. */
+static int32_t
+up_mknod_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret,
+             int32_t op_errno, inode_t *inode, struct iatt *buf,
+             struct iatt *preparent, struct iatt *postparent, dict_t *xdata)
+{
+    upcall_local_t *local = NULL;
+    client_t *client = NULL;
+
+    EXIT_IF_UPCALL_OFF(this, out);
+
+    local = frame->local;
+    client = frame->root->client;
+
+    if (local && (op_ret >= 0)) {
+        /* invalidate parent's entry too */
+        upcall_cache_invalidate(frame, this, client, local->inode, UP_TIMES,
+                                postparent, NULL, NULL, NULL);
+
+        /* Presumably records this client's access to the new entry, as the
+         * comment in up_create_cbk describes for the same flag. */
+        upcall_cache_invalidate(frame, this, client, local->loc.inode,
+                                UP_UPDATE_CLIENT, buf, NULL, NULL, NULL);
+    }
+
+out:
+    UPCALL_STACK_UNWIND(mknod, frame, op_ret, op_errno, inode, buf, preparent,
+                        postparent, xdata);
+
+    return 0;
+}
+
+/* mknod() fop: winds to the first child with up_mknod_cbk as callback.
+ * Unless upcall is off, an upcall local is first set up from loc and
+ * loc->parent; if that fails, the fop is unwound with ENOMEM instead.
+ */
+static int32_t
+up_mknod(call_frame_t *frame, xlator_t *this, loc_t *loc, mode_t mode,
+         dev_t rdev, mode_t umask, dict_t *xdata)
+{
+    upcall_local_t *local = NULL;
+
+    EXIT_IF_UPCALL_OFF(this, out);
+
+    local = upcall_local_init(frame, this, loc, NULL, loc->parent, NULL);
+    if (local == NULL) {
+        /* upcall_local_init() failed: fail the fop instead of winding it. */
+        UPCALL_STACK_UNWIND(mknod, frame, -1, ENOMEM, NULL, NULL, NULL, NULL,
+                            NULL);
+        return 0;
+    }
+
+out:
+    STACK_WIND(frame, up_mknod_cbk, FIRST_CHILD(this),
+               FIRST_CHILD(this)->fops->mknod, loc, mode, rdev, umask, xdata);
+
+    return 0;
+}
+
+/* symlink() callback: on success, calls upcall_cache_invalidate() twice - for
+ * local->inode (the parent, going by the postparent attrs passed) with
+ * UP_TIMES, and for local->loc.inode with UP_UPDATE_CLIENT and buf. */
+static int32_t
+up_symlink_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+               int32_t op_ret, int32_t op_errno, inode_t *inode,
+               struct iatt *buf, struct iatt *preparent,
+               struct iatt *postparent, dict_t *xdata)
+{
+    upcall_local_t *local = NULL;
+    client_t *client = NULL;
+
+    EXIT_IF_UPCALL_OFF(this, out);
+
+    local = frame->local;
+    client = frame->root->client;
+
+    if (local && (op_ret >= 0)) {
+        /* invalidate parent's entry too */
+        upcall_cache_invalidate(frame, this, client, local->inode, UP_TIMES,
+                                postparent, NULL, NULL, NULL);
+
+        /* Presumably records this client's access to the new entry, as the
+         * comment in up_create_cbk describes for the same flag. */
+        upcall_cache_invalidate(frame, this, client, local->loc.inode,
+                                UP_UPDATE_CLIENT, buf, NULL, NULL, NULL);
+    }
+
+out:
+    UPCALL_STACK_UNWIND(symlink, frame, op_ret, op_errno, inode, buf, preparent,
+                        postparent, xdata);
+
+    return 0;
+}
+
+/* symlink() fop: winds to the first child with up_symlink_cbk as callback.
+ * Unless upcall is off, an upcall local is first set up from loc and
+ * loc->parent; if that fails, the fop is unwound with ENOMEM instead.
+ */
+static int32_t
+up_symlink(call_frame_t *frame, xlator_t *this, const char *linkpath,
+           loc_t *loc, mode_t umask, dict_t *xdata)
+{
+    upcall_local_t *local = NULL;
+
+    EXIT_IF_UPCALL_OFF(this, out);
+
+    local = upcall_local_init(frame, this, loc, NULL, loc->parent, NULL);
+    if (local == NULL) {
+        /* upcall_local_init() failed: fail the fop instead of winding it. */
+        UPCALL_STACK_UNWIND(symlink, frame, -1, ENOMEM, NULL, NULL, NULL, NULL,
+                            NULL);
+        return 0;
+    }
+
+out:
+    STACK_WIND(frame, up_symlink_cbk, FIRST_CHILD(this),
+               FIRST_CHILD(this)->fops->symlink, linkpath, loc, umask, xdata);
+
+    return 0;
+}
+
+/* opendir() callback: on success, calls upcall_cache_invalidate() for
+ * local->inode with UP_UPDATE_CLIENT; the reply is always unwound as-is. */
+static int32_t
+up_opendir_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+               int32_t op_ret, int32_t op_errno, fd_t *fd, dict_t *xdata)
+{
+    upcall_local_t *local = NULL;
+    client_t *client = NULL;
+
+    EXIT_IF_UPCALL_OFF(this, out);
+
+    local = frame->local;
+    client = frame->root->client;
+
+    /* Skip upcall handling for failed fops or when frame->local is unset. */
+    if (local && (op_ret >= 0)) {
+        upcall_cache_invalidate(frame, this, client, local->inode,
+                                UP_UPDATE_CLIENT, NULL, NULL, NULL, NULL);
+    }
+
+out:
+    UPCALL_STACK_UNWIND(opendir, frame, op_ret, op_errno, fd, xdata);
+
+    return 0;
+}
+
+/* opendir() fop: winds to the first child with up_opendir_cbk as callback.
+ * Unless upcall is off, the upcall local for loc->inode is set up first; if
+ * that fails, the fop is unwound with ENOMEM instead of being wound.
+ */
+static int32_t
+up_opendir(call_frame_t *frame, xlator_t *this, loc_t *loc, fd_t *fd,
+           dict_t *xdata)
+{
+    upcall_local_t *local = NULL;
+
+    EXIT_IF_UPCALL_OFF(this, out);
+
+    local = upcall_local_init(frame, this, NULL, NULL, loc->inode, NULL);
+    if (local == NULL) {
+        /* upcall_local_init() failed: fail the fop instead of winding it. */
+        UPCALL_STACK_UNWIND(opendir, frame, -1, ENOMEM, NULL, NULL);
+        return 0;
+    }
+
+out:
+    STACK_WIND(frame, up_opendir_cbk, FIRST_CHILD(this),
+               FIRST_CHILD(this)->fops->opendir, loc, fd, xdata);
+
+    return 0;
+}
+
+/* statfs() callback: on success, calls upcall_cache_invalidate() for
+ * local->inode with UP_UPDATE_CLIENT; the reply is always unwound as-is. */
+static int32_t
+up_statfs_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret,
+              int32_t op_errno, struct statvfs *buf, dict_t *xdata)
+{
+    upcall_local_t *local = NULL;
+    client_t *client = NULL;
+
+    EXIT_IF_UPCALL_OFF(this, out);
+
+    local = frame->local;
+    client = frame->root->client;
+
+    /* Skip upcall handling for failed fops or when frame->local is unset. */
+    if (local && (op_ret >= 0)) {
+        upcall_cache_invalidate(frame, this, client, local->inode,
+                                UP_UPDATE_CLIENT, NULL, NULL, NULL, NULL);
+    }
+
+out:
+    UPCALL_STACK_UNWIND(statfs, frame, op_ret, op_errno, buf, xdata);
+
+    return 0;
+}
+
+/* statfs() fop: winds to the first child with up_statfs_cbk as callback.
+ * Unless upcall is off, the upcall local for loc->inode is set up first; if
+ * that fails, the fop is unwound with ENOMEM instead of being wound.
+ */
+static int32_t
+up_statfs(call_frame_t *frame, xlator_t *this, loc_t *loc, dict_t *xdata)
+{
+    upcall_local_t *local = NULL;
+
+    EXIT_IF_UPCALL_OFF(this, out);
+
+    local = upcall_local_init(frame, this, NULL, NULL, loc->inode, NULL);
+    if (local == NULL) {
+        /* upcall_local_init() failed: fail the fop instead of winding it. */
+        UPCALL_STACK_UNWIND(statfs, frame, -1, ENOMEM, NULL, NULL);
+        return 0;
+    }
+
+out:
+    STACK_WIND(frame, up_statfs_cbk, FIRST_CHILD(this),
+               FIRST_CHILD(this)->fops->statfs, loc, xdata);
+
+    return 0;
+}
+
+/* Callback for readdir. On success, and when the frame carries an upcall
+ * local, calls upcall_cache_invalidate() with UP_UPDATE_CLIENT for the
+ * directory inode saved in that local. The result is unwound unchanged. */
+static int32_t
+up_readdir_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+               int32_t op_ret, int32_t op_errno, gf_dirent_t *entries,
+               dict_t *xdata)
+{
+    upcall_local_t *up_local = NULL;
+
+    EXIT_IF_UPCALL_OFF(this, unwind);
+
+    up_local = frame->local;
+    if (up_local && (op_ret >= 0)) {
+        upcall_cache_invalidate(frame, this, frame->root->client,
+                                up_local->inode, UP_UPDATE_CLIENT, NULL, NULL,
+                                NULL, NULL);
+    }
+
+unwind:
+    UPCALL_STACK_UNWIND(readdir, frame, op_ret, op_errno, entries, xdata);
+    return 0;
+}
+
+/* readdir fop. When upcall is enabled, attaches a local holding fd->inode
+ * to the frame and winds to the first child with up_readdir_cbk as the
+ * callback; unwinds with -1/ENOMEM if the local cannot be set up.
+ * With upcall disabled the fop is wound straight through. */
+static int32_t
+up_readdir(call_frame_t *frame, xlator_t *this, fd_t *fd, size_t size,
+           off_t off, dict_t *xdata)
+{
+    EXIT_IF_UPCALL_OFF(this, wind);
+
+    if (!upcall_local_init(frame, this, NULL, NULL, fd->inode, NULL)) {
+        UPCALL_STACK_UNWIND(readdir, frame, -1, ENOMEM, NULL, NULL);
+        return 0;
+    }
+
+wind:
+    STACK_WIND(frame, up_readdir_cbk, FIRST_CHILD(this),
+               FIRST_CHILD(this)->fops->readdir, fd, size, off, xdata);
+    return 0;
+}
+
+/* Callback for readdirp. On success, and when the frame carries an upcall
+ * local, calls upcall_cache_invalidate() with UP_UPDATE_CLIENT first for
+ * the directory inode saved in that local and then for every returned
+ * entry that has an inode (passing that entry's d_stat). The result is
+ * unwound unchanged. */
+static int32_t
+up_readdirp_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+                int32_t op_ret, int32_t op_errno, gf_dirent_t *entries,
+                dict_t *xdata)
+{
+    upcall_local_t *up_local = NULL;
+    client_t *caller = NULL;
+    gf_dirent_t *dent = NULL;
+
+    EXIT_IF_UPCALL_OFF(this, unwind);
+
+    up_local = frame->local;
+    if (!up_local || (op_ret < 0)) {
+        goto unwind;
+    }
+
+    caller = frame->root->client;
+    upcall_cache_invalidate(frame, this, caller, up_local->inode,
+                            UP_UPDATE_CLIENT, NULL, NULL, NULL, NULL);
+
+    list_for_each_entry(dent, &entries->list, list)
+    {
+        /* Entries that carry no inode are skipped. */
+        if (dent->inode != NULL) {
+            upcall_cache_invalidate(frame, this, caller, dent->inode,
+                                    UP_UPDATE_CLIENT, &dent->d_stat, NULL,
+                                    NULL, NULL);
+        }
+    }
+
+unwind:
+    UPCALL_STACK_UNWIND(readdirp, frame, op_ret, op_errno, entries, xdata);
+    return 0;
+}
+
+/* readdirp fop. When upcall is enabled, attaches a local holding fd->inode
+ * to the frame and winds to the first child with up_readdirp_cbk as the
+ * callback; unwinds with -1/ENOMEM if the local cannot be set up.
+ * With upcall disabled the fop is wound straight through. */
+static int32_t
+up_readdirp(call_frame_t *frame, xlator_t *this, fd_t *fd, size_t size,
+            off_t off, dict_t *dict)
+{
+    EXIT_IF_UPCALL_OFF(this, wind);
+
+    if (!upcall_local_init(frame, this, NULL, NULL, fd->inode, NULL)) {
+        UPCALL_STACK_UNWIND(readdirp, frame, -1, ENOMEM, NULL, NULL);
+        return 0;
+    }
+
+wind:
+    STACK_WIND(frame, up_readdirp_cbk, FIRST_CHILD(this),
+               FIRST_CHILD(this)->fops->readdirp, fd, size, off, dict);
+    return 0;
+}
+
+/* fsetattr fop. When upcall is enabled, attaches a local holding fd->inode
+ * to the frame and winds to the first child. The callback is
+ * up_setattr_cbk, i.e. the same one registered for the path-based
+ * variant. Unwinds with -1/ENOMEM if the local cannot be set up.
+ * With upcall disabled the fop is wound straight through. */
+static int32_t
+up_fsetattr(call_frame_t *frame, xlator_t *this, fd_t *fd, struct iatt *stbuf,
+            int32_t valid, dict_t *xdata)
+{
+    EXIT_IF_UPCALL_OFF(this, wind);
+
+    if (!upcall_local_init(frame, this, NULL, NULL, fd->inode, NULL)) {
+        UPCALL_STACK_UNWIND(fsetattr, frame, -1, ENOMEM, NULL, NULL, NULL);
+        return 0;
+    }
+
+wind:
+    STACK_WIND(frame, up_setattr_cbk, FIRST_CHILD(this),
+               FIRST_CHILD(this)->fops->fsetattr, fd, stbuf, valid, xdata);
+    return 0;
+}
+
+/* Callback for fallocate. On success, and when the frame carries an upcall
+ * local, calls upcall_cache_invalidate() with UP_WRITE_FLAGS and the
+ * post-op iatt for the inode saved in that local. The result is unwound
+ * unchanged. */
+static int32_t
+up_fallocate_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+                 int32_t op_ret, int32_t op_errno, struct iatt *pre,
+                 struct iatt *post, dict_t *xdata)
+{
+    upcall_local_t *up_local = NULL;
+
+    EXIT_IF_UPCALL_OFF(this, unwind);
+
+    up_local = frame->local;
+    if (up_local && (op_ret >= 0)) {
+        upcall_cache_invalidate(frame, this, frame->root->client,
+                                up_local->inode, UP_WRITE_FLAGS, post, NULL,
+                                NULL, NULL);
+    }
+
+unwind:
+    UPCALL_STACK_UNWIND(fallocate, frame, op_ret, op_errno, pre, post, xdata);
+    return 0;
+}
+
+/* fallocate fop. When upcall is enabled, attaches a local holding
+ * fd->inode to the frame and winds to the first child with
+ * up_fallocate_cbk as the callback; unwinds with -1/ENOMEM if the local
+ * cannot be set up. With upcall disabled the fop is wound straight
+ * through. */
+static int32_t
+up_fallocate(call_frame_t *frame, xlator_t *this, fd_t *fd, int32_t mode,
+             off_t offset, size_t len, dict_t *xdata)
+{
+    EXIT_IF_UPCALL_OFF(this, wind);
+
+    if (!upcall_local_init(frame, this, NULL, NULL, fd->inode, NULL)) {
+        UPCALL_STACK_UNWIND(fallocate, frame, -1, ENOMEM, NULL, NULL, NULL);
+        return 0;
+    }
+
+wind:
+    STACK_WIND(frame, up_fallocate_cbk, FIRST_CHILD(this),
+               FIRST_CHILD(this)->fops->fallocate, fd, mode, offset, len,
+               xdata);
+    return 0;
+}
+
+/* Callback for discard. On success, and when the frame carries an upcall
+ * local, calls upcall_cache_invalidate() with UP_WRITE_FLAGS and the
+ * post-op iatt for the inode saved in that local. The result is unwound
+ * unchanged. */
+static int32_t
+up_discard_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+               int32_t op_ret, int32_t op_errno, struct iatt *pre,
+               struct iatt *post, dict_t *xdata)
+{
+    upcall_local_t *up_local = NULL;
+
+    EXIT_IF_UPCALL_OFF(this, unwind);
+
+    up_local = frame->local;
+    if (up_local && (op_ret >= 0)) {
+        upcall_cache_invalidate(frame, this, frame->root->client,
+                                up_local->inode, UP_WRITE_FLAGS, post, NULL,
+                                NULL, NULL);
+    }
+
+unwind:
+    UPCALL_STACK_UNWIND(discard, frame, op_ret, op_errno, pre, post, xdata);
+    return 0;
+}
+
+/* discard fop. When upcall is enabled, attaches a local holding fd->inode
+ * to the frame and winds to the first child with up_discard_cbk as the
+ * callback; unwinds with -1/ENOMEM if the local cannot be set up.
+ * With upcall disabled the fop is wound straight through. */
+static int32_t
+up_discard(call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset,
+           size_t len, dict_t *xdata)
+{
+    EXIT_IF_UPCALL_OFF(this, wind);
+
+    if (!upcall_local_init(frame, this, NULL, NULL, fd->inode, NULL)) {
+        UPCALL_STACK_UNWIND(discard, frame, -1, ENOMEM, NULL, NULL, NULL);
+        return 0;
+    }
+
+wind:
+    STACK_WIND(frame, up_discard_cbk, FIRST_CHILD(this),
+               FIRST_CHILD(this)->fops->discard, fd, offset, len, xdata);
+    return 0;
+}
+
+/* Callback for zerofill. On success, and when the frame carries an upcall
+ * local, calls upcall_cache_invalidate() with UP_WRITE_FLAGS and the
+ * post-op iatt for the inode saved in that local. The result is unwound
+ * unchanged. */
+static int32_t
+up_zerofill_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+                int32_t op_ret, int32_t op_errno, struct iatt *pre,
+                struct iatt *post, dict_t *xdata)
+{
+    upcall_local_t *up_local = NULL;
+
+    EXIT_IF_UPCALL_OFF(this, unwind);
+
+    up_local = frame->local;
+    if (up_local && (op_ret >= 0)) {
+        upcall_cache_invalidate(frame, this, frame->root->client,
+                                up_local->inode, UP_WRITE_FLAGS, post, NULL,
+                                NULL, NULL);
+    }
+
+unwind:
+    UPCALL_STACK_UNWIND(zerofill, frame, op_ret, op_errno, pre, post, xdata);
+    return 0;
+}
+
+/* zerofill fop. When upcall is enabled, attaches a local holding fd->inode
+ * to the frame and winds to the first child with up_zerofill_cbk as the
+ * callback; unwinds with -1/ENOMEM if the local cannot be set up.
+ * With upcall disabled the fop is wound straight through. */
+static int
+up_zerofill(call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset,
+            off_t len, dict_t *xdata)
+{
+    EXIT_IF_UPCALL_OFF(this, wind);
+
+    if (!upcall_local_init(frame, this, NULL, NULL, fd->inode, NULL)) {
+        UPCALL_STACK_UNWIND(zerofill, frame, -1, ENOMEM, NULL, NULL, NULL);
+        return 0;
+    }
+
+wind:
+    STACK_WIND(frame, up_zerofill_cbk, FIRST_CHILD(this),
+               FIRST_CHILD(this)->fops->zerofill, fd, offset, len, xdata);
+    return 0;
+}
+
+/* Callback for seek. On success, and when the frame carries an upcall
+ * local, calls upcall_cache_invalidate() with UP_UPDATE_CLIENT for the
+ * inode saved in that local. The result is unwound unchanged. */
+static int32_t
+up_seek_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int op_ret,
+            int op_errno, off_t offset, dict_t *xdata)
+{
+    upcall_local_t *up_local = NULL;
+
+    EXIT_IF_UPCALL_OFF(this, unwind);
+
+    up_local = frame->local;
+    if (up_local && (op_ret >= 0)) {
+        upcall_cache_invalidate(frame, this, frame->root->client,
+                                up_local->inode, UP_UPDATE_CLIENT, NULL, NULL,
+                                NULL, NULL);
+    }
+
+unwind:
+    UPCALL_STACK_UNWIND(seek, frame, op_ret, op_errno, offset, xdata);
+    return 0;
+}
+
+/* seek fop. When upcall is enabled, attaches a local holding fd->inode to
+ * the frame and winds to the first child with up_seek_cbk as the
+ * callback; unwinds with -1/ENOMEM (offset 0) if the local cannot be set
+ * up. With upcall disabled the fop is wound straight through. */
+static int32_t
+up_seek(call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset,
+        gf_seek_what_t what, dict_t *xdata)
+{
+    EXIT_IF_UPCALL_OFF(this, wind);
+
+    if (!upcall_local_init(frame, this, NULL, NULL, fd->inode, NULL)) {
+        UPCALL_STACK_UNWIND(seek, frame, -1, ENOMEM, 0, NULL);
+        return 0;
+    }
+
+wind:
+    STACK_WIND(frame, up_seek_cbk, FIRST_CHILD(this),
+               FIRST_CHILD(this)->fops->seek, fd, offset, what, xdata);
+    return 0;
+}
+
+/* Callback for setxattr.
+ *
+ * On success, and when the frame carries an upcall local, the xattr dict
+ * saved in that local is run through up_filter_xattr() against
+ * priv->xattrs. If up_invalidate_needed() then reports true for it,
+ * upcall_cache_invalidate() is called with UP_XATTR (plus UP_TIMES when
+ * xdata yields a GF_POSTSTAT iatt) and the saved xattr dict.
+ *
+ * A negative return from up_filter_xattr() replaces op_ret in the unwind.
+ * NOTE(review): op_errno is left as received in that case - confirm that
+ * callers cope with op_ret < 0 and an unrelated op_errno. */
+static int32_t
+up_setxattr_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+                int32_t op_ret, int32_t op_errno, dict_t *xdata)
+{
+    upcall_private_t *priv = NULL;
+    upcall_local_t *up_local = NULL;
+    uint32_t up_flags = UP_XATTR;
+    struct iatt poststat = {
+        0,
+    };
+    int rc = 0;
+
+    EXIT_IF_UPCALL_OFF(this, unwind);
+
+    priv = this->private;
+    GF_VALIDATE_OR_GOTO(this->name, priv, unwind);
+
+    up_local = frame->local;
+    if (!up_local || (op_ret < 0)) {
+        goto unwind;
+    }
+
+    rc = up_filter_xattr(up_local->xattr, priv->xattrs);
+    if (rc < 0) {
+        op_ret = rc;
+        goto unwind;
+    }
+
+    if (up_invalidate_needed(up_local->xattr)) {
+        if (dict_get_iatt(xdata, GF_POSTSTAT, &poststat) == 0) {
+            up_flags |= UP_TIMES;
+        }
+
+        upcall_cache_invalidate(frame, this, frame->root->client,
+                                up_local->inode, up_flags, &poststat, NULL,
+                                NULL, up_local->xattr);
+    }
+
+unwind:
+    UPCALL_STACK_UNWIND(setxattr, frame, op_ret, op_errno, xdata);
+    return 0;
+}
+
+/* setxattr fop. When upcall is enabled, attaches a local to the frame that
+ * records loc, loc->inode and the xattr dict being set (the callback reads
+ * the xattrs back from it), then winds to the first child with
+ * up_setxattr_cbk. Unwinds with -1/ENOMEM if the local cannot be set up.
+ * With upcall disabled the fop is wound straight through. */
+static int32_t
+up_setxattr(call_frame_t *frame, xlator_t *this, loc_t *loc, dict_t *dict,
+            int32_t flags, dict_t *xdata)
+{
+    EXIT_IF_UPCALL_OFF(this, wind);
+
+    if (!upcall_local_init(frame, this, loc, NULL, loc->inode, dict)) {
+        UPCALL_STACK_UNWIND(setxattr, frame, -1, ENOMEM, NULL);
+        return 0;
+    }
+
+wind:
+    STACK_WIND(frame, up_setxattr_cbk, FIRST_CHILD(this),
+               FIRST_CHILD(this)->fops->setxattr, loc, dict, flags, xdata);
+    return 0;
+}
+
+/* Callback for fsetxattr.
+ *
+ * On success, and when the frame carries an upcall local, the xattr dict
+ * saved in that local is run through up_filter_xattr() against
+ * priv->xattrs. If up_invalidate_needed() then reports true for it,
+ * upcall_cache_invalidate() is called with UP_XATTR (plus UP_TIMES when
+ * xdata yields a GF_POSTSTAT iatt) and the saved xattr dict.
+ *
+ * A negative return from up_filter_xattr() replaces op_ret in the unwind.
+ * NOTE(review): op_errno is left as received in that case - confirm that
+ * callers cope with op_ret < 0 and an unrelated op_errno. */
+static int32_t
+up_fsetxattr_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+                 int32_t op_ret, int32_t op_errno, dict_t *xdata)
+{
+    upcall_private_t *priv = NULL;
+    upcall_local_t *up_local = NULL;
+    uint32_t up_flags = UP_XATTR;
+    struct iatt poststat = {
+        0,
+    };
+    int rc = 0;
+
+    EXIT_IF_UPCALL_OFF(this, unwind);
+
+    priv = this->private;
+    GF_VALIDATE_OR_GOTO(this->name, priv, unwind);
+
+    up_local = frame->local;
+    if (!up_local || (op_ret < 0)) {
+        goto unwind;
+    }
+
+    rc = up_filter_xattr(up_local->xattr, priv->xattrs);
+    if (rc < 0) {
+        op_ret = rc;
+        goto unwind;
+    }
+
+    if (up_invalidate_needed(up_local->xattr)) {
+        if (dict_get_iatt(xdata, GF_POSTSTAT, &poststat) == 0) {
+            up_flags |= UP_TIMES;
+        }
+
+        upcall_cache_invalidate(frame, this, frame->root->client,
+                                up_local->inode, up_flags, &poststat, NULL,
+                                NULL, up_local->xattr);
+    }
+
+unwind:
+    UPCALL_STACK_UNWIND(fsetxattr, frame, op_ret, op_errno, xdata);
+    return 0;
+}
+
+/* fsetxattr fop. When upcall is enabled, attaches a local to the frame
+ * that records fd, fd->inode and the xattr dict being set (the callback
+ * reads the xattrs back from it), then winds to the first child with
+ * up_fsetxattr_cbk. Unwinds with -1/ENOMEM if the local cannot be set up.
+ * With upcall disabled the fop is wound straight through. */
+static int32_t
+up_fsetxattr(call_frame_t *frame, xlator_t *this, fd_t *fd, dict_t *dict,
+             int32_t flags, dict_t *xdata)
+{
+    EXIT_IF_UPCALL_OFF(this, wind);
+
+    if (!upcall_local_init(frame, this, NULL, fd, fd->inode, dict)) {
+        UPCALL_STACK_UNWIND(fsetxattr, frame, -1, ENOMEM, NULL);
+        return 0;
+    }
+
+wind:
+    STACK_WIND(frame, up_fsetxattr_cbk, FIRST_CHILD(this),
+               FIRST_CHILD(this)->fops->fsetxattr, fd, dict, flags, xdata);
+    return 0;
+}
+
+/* Callback for fremovexattr.
+ *
+ * On success, and when the frame carries an upcall local, the xattr dict
+ * saved in that local (built from the removed key by up_fremovexattr) is
+ * run through up_filter_xattr() against priv->xattrs. If
+ * up_invalidate_needed() then reports true for it,
+ * upcall_cache_invalidate() is called with UP_XATTR_RM (plus UP_TIMES when
+ * xdata yields a GF_POSTSTAT iatt) and the saved xattr dict.
+ *
+ * A negative return from up_filter_xattr() replaces op_ret in the unwind.
+ * NOTE(review): op_errno is left as received in that case - confirm that
+ * callers cope with op_ret < 0 and an unrelated op_errno. */
+static int32_t
+up_fremovexattr_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+                    int32_t op_ret, int32_t op_errno, dict_t *xdata)
+{
+    upcall_private_t *priv = NULL;
+    upcall_local_t *up_local = NULL;
+    uint32_t up_flags = UP_XATTR_RM;
+    struct iatt poststat = {
+        0,
+    };
+    int rc = 0;
+
+    EXIT_IF_UPCALL_OFF(this, unwind);
+
+    priv = this->private;
+    GF_VALIDATE_OR_GOTO(this->name, priv, unwind);
+
+    up_local = frame->local;
+    if (!up_local || (op_ret < 0)) {
+        goto unwind;
+    }
+
+    rc = up_filter_xattr(up_local->xattr, priv->xattrs);
+    if (rc < 0) {
+        op_ret = rc;
+        goto unwind;
+    }
+
+    if (up_invalidate_needed(up_local->xattr)) {
+        if (dict_get_iatt(xdata, GF_POSTSTAT, &poststat) == 0) {
+            up_flags |= UP_TIMES;
+        }
+
+        upcall_cache_invalidate(frame, this, frame->root->client,
+                                up_local->inode, up_flags, &poststat, NULL,
+                                NULL, up_local->xattr);
+    }
+
+unwind:
+    UPCALL_STACK_UNWIND(fremovexattr, frame, op_ret, op_errno, xdata);
+    return 0;
+}
+
+/* fremovexattr fop.
+ *
+ * When upcall is enabled, the xattr name being removed is wrapped into a
+ * temporary dict (key = name, empty value) and handed to
+ * upcall_local_init() together with fd and fd->inode, so the callback can
+ * inspect which xattr was removed. upcall_local_init() stores its own
+ * dict_copy_with_ref() copy, so the temporary dict is released here
+ * whether or not the local could be created.
+ *
+ * Failure to build the dict or the local unwinds with -1/ENOMEM.
+ * With upcall disabled the fop is wound straight through. */
+static int32_t
+up_fremovexattr(call_frame_t *frame, xlator_t *this, fd_t *fd, const char *name,
+                dict_t *xdata)
+{
+    upcall_local_t *up_local = NULL;
+    dict_t *name_dict = NULL;
+
+    EXIT_IF_UPCALL_OFF(this, wind);
+
+    name_dict = dict_for_key_value(name, "", 1, _gf_true);
+    if (name_dict) {
+        up_local = upcall_local_init(frame, this, NULL, fd, fd->inode,
+                                     name_dict);
+        dict_unref(name_dict);
+    }
+
+    if (!up_local) {
+        UPCALL_STACK_UNWIND(fremovexattr, frame, -1, ENOMEM, NULL);
+        return 0;
+    }
+
+wind:
+    STACK_WIND(frame, up_fremovexattr_cbk, FIRST_CHILD(this),
+               FIRST_CHILD(this)->fops->fremovexattr, fd, name, xdata);
+    return 0;
+}
+
+/* Callback for removexattr.
+ *
+ * On success, and when the frame carries an upcall local, the xattr dict
+ * saved in that local (built from the removed key by up_removexattr) is
+ * run through up_filter_xattr() against priv->xattrs. If
+ * up_invalidate_needed() then reports true for it,
+ * upcall_cache_invalidate() is called with UP_XATTR_RM (plus UP_TIMES when
+ * xdata yields a GF_POSTSTAT iatt) and the saved xattr dict.
+ *
+ * A negative return from up_filter_xattr() replaces op_ret in the unwind.
+ * NOTE(review): op_errno is left as received in that case - confirm that
+ * callers cope with op_ret < 0 and an unrelated op_errno. */
+static int32_t
+up_removexattr_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+                   int32_t op_ret, int32_t op_errno, dict_t *xdata)
+{
+    upcall_private_t *priv = NULL;
+    upcall_local_t *up_local = NULL;
+    uint32_t up_flags = UP_XATTR_RM;
+    struct iatt poststat = {
+        0,
+    };
+    int rc = 0;
+
+    EXIT_IF_UPCALL_OFF(this, unwind);
+
+    priv = this->private;
+    GF_VALIDATE_OR_GOTO(this->name, priv, unwind);
+
+    up_local = frame->local;
+    if (!up_local || (op_ret < 0)) {
+        goto unwind;
+    }
+
+    rc = up_filter_xattr(up_local->xattr, priv->xattrs);
+    if (rc < 0) {
+        op_ret = rc;
+        goto unwind;
+    }
+
+    if (up_invalidate_needed(up_local->xattr)) {
+        if (dict_get_iatt(xdata, GF_POSTSTAT, &poststat) == 0) {
+            up_flags |= UP_TIMES;
+        }
+
+        upcall_cache_invalidate(frame, this, frame->root->client,
+                                up_local->inode, up_flags, &poststat, NULL,
+                                NULL, up_local->xattr);
+    }
+
+unwind:
+    UPCALL_STACK_UNWIND(removexattr, frame, op_ret, op_errno, xdata);
+    return 0;
+}
+
+/* removexattr fop.
+ *
+ * When upcall is enabled, the xattr name being removed is wrapped into a
+ * temporary dict (key = name, empty value) and handed to
+ * upcall_local_init() together with loc and loc->inode, so the callback
+ * can inspect which xattr was removed. upcall_local_init() stores its own
+ * dict_copy_with_ref() copy, so the temporary dict is released here
+ * whether or not the local could be created.
+ *
+ * Failure to build the dict or the local unwinds with -1/ENOMEM.
+ * With upcall disabled the fop is wound straight through. */
+static int32_t
+up_removexattr(call_frame_t *frame, xlator_t *this, loc_t *loc,
+               const char *name, dict_t *xdata)
+{
+    upcall_local_t *up_local = NULL;
+    dict_t *name_dict = NULL;
+
+    EXIT_IF_UPCALL_OFF(this, wind);
+
+    name_dict = dict_for_key_value(name, "", 1, _gf_true);
+    if (name_dict) {
+        up_local = upcall_local_init(frame, this, loc, NULL, loc->inode,
+                                     name_dict);
+        dict_unref(name_dict);
+    }
+
+    if (!up_local) {
+        UPCALL_STACK_UNWIND(removexattr, frame, -1, ENOMEM, NULL);
+        return 0;
+    }
+
+wind:
+    STACK_WIND(frame, up_removexattr_cbk, FIRST_CHILD(this),
+               FIRST_CHILD(this)->fops->removexattr, loc, name, xdata);
+    return 0;
+}
+
+/* Callback for fgetxattr. On success, and when the frame carries an upcall
+ * local, calls upcall_cache_invalidate() with UP_UPDATE_CLIENT for the
+ * inode saved in that local. The result is unwound unchanged. */
+static int32_t
+up_fgetxattr_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+                 int32_t op_ret, int32_t op_errno, dict_t *dict, dict_t *xdata)
+{
+    upcall_local_t *up_local = NULL;
+
+    EXIT_IF_UPCALL_OFF(this, unwind);
+
+    up_local = frame->local;
+    if (up_local && (op_ret >= 0)) {
+        upcall_cache_invalidate(frame, this, frame->root->client,
+                                up_local->inode, UP_UPDATE_CLIENT, NULL, NULL,
+                                NULL, NULL);
+    }
+
+unwind:
+    UPCALL_STACK_UNWIND(fgetxattr, frame, op_ret, op_errno, dict, xdata);
+    return 0;
+}
+
+/* fgetxattr fop. When upcall is enabled, attaches a local holding
+ * fd->inode to the frame and winds to the first child with
+ * up_fgetxattr_cbk as the callback; unwinds with -1/ENOMEM if the local
+ * cannot be set up. With upcall disabled the fop is wound straight
+ * through. */
+static int32_t
+up_fgetxattr(call_frame_t *frame, xlator_t *this, fd_t *fd, const char *name,
+             dict_t *xdata)
+{
+    EXIT_IF_UPCALL_OFF(this, wind);
+
+    if (!upcall_local_init(frame, this, NULL, NULL, fd->inode, NULL)) {
+        UPCALL_STACK_UNWIND(fgetxattr, frame, -1, ENOMEM, NULL, NULL);
+        return 0;
+    }
+
+wind:
+    STACK_WIND(frame, up_fgetxattr_cbk, FIRST_CHILD(this),
+               FIRST_CHILD(this)->fops->fgetxattr, fd, name, xdata);
+    return 0;
+}
+
+/* Callback for getxattr. On success, and when the frame carries an upcall
+ * local, calls upcall_cache_invalidate() with UP_UPDATE_CLIENT for the
+ * inode saved in that local. The result is unwound unchanged. */
+static int32_t
+up_getxattr_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+                int32_t op_ret, int32_t op_errno, dict_t *dict, dict_t *xdata)
+{
+    upcall_local_t *up_local = NULL;
+
+    EXIT_IF_UPCALL_OFF(this, unwind);
+
+    up_local = frame->local;
+    if (up_local && (op_ret >= 0)) {
+        upcall_cache_invalidate(frame, this, frame->root->client,
+                                up_local->inode, UP_UPDATE_CLIENT, NULL, NULL,
+                                NULL, NULL);
+    }
+
+unwind:
+    UPCALL_STACK_UNWIND(getxattr, frame, op_ret, op_errno, dict, xdata);
+    return 0;
+}
+
+/* getxattr fop. When upcall is enabled, attaches a local holding
+ * loc->inode to the frame and winds to the first child with
+ * up_getxattr_cbk as the callback; unwinds with -1/ENOMEM if the local
+ * cannot be set up. With upcall disabled the fop is wound straight
+ * through. */
+static int32_t
+up_getxattr(call_frame_t *frame, xlator_t *this, loc_t *loc, const char *name,
+            dict_t *xdata)
+{
+    EXIT_IF_UPCALL_OFF(this, wind);
+
+    if (!upcall_local_init(frame, this, NULL, NULL, loc->inode, NULL)) {
+        UPCALL_STACK_UNWIND(getxattr, frame, -1, ENOMEM, NULL, NULL);
+        return 0;
+    }
+
+wind:
+    STACK_WIND(frame, up_getxattr_cbk, FIRST_CHILD(this),
+               FIRST_CHILD(this)->fops->getxattr, loc, name, xdata);
+    return 0;
+}
+
+/* xattrop handling here mainly tracks changes to the afr pending xattrs.
+ *    1. An xattrop carries no information on whether it is a pre-op or a
+ *       post-op.
+ *    2. A pre-op xattrop has the value 0 for all pending xattrs; the cbk
+ *       of the pre-op carries the on-disk xattr value. A non-zero on-disk
+ *       xattr indicates pending healing.
+ *    3. A post-op xattrop has either 0 or 1 as the value of the pending
+ *       xattrs: 0 on success, 1 on failure. The post-op cbk, however, may
+ *       carry 0, 1 or any higher value:
+ *       0 - no healing is required
+ *       1 - this is the first time the pending xattr is being set
+ *       n - a pending xattr was already set; the on-disk value has been
+ *           incremented and is returned in the cbk.
+ * The aim is to send an invalidation only the first time a pending xattr
+ * is set on a file. This leads to the following special handling of
+ * xattrop:
+ * - Unregistered xattrs are filtered in the call path, not in the cbk.
+ *   Otherwise every pre-op would invalidate whenever the file already has
+ *   a pending xattr set. Filtering on the fop path ensures that we
+ *   invalidate only in a post-op, each time a post-op comes with pending
+ *   xattr value 1.
+ * - While a brick is down, every post-op sets pending xattrs. We do not
+ *   want to invalidate each time, only the first time a pending xattr is
+ *   set on a file. To identify the first time, the values of the pending
+ *   xattrs sent in the post-op are compared with those returned in the
+ *   post-op cbk; if they are the same, it is the first time.
+ *
+ * This callback is shared by xattrop and fxattrop: frame->root->op selects
+ * which of the two unwind signatures is used. The comparison itself is
+ * delegated to up_compare_afr_xattr() via dict_foreach(); a negative
+ * result from dict_foreach() suppresses the invalidation.
+ */
+static int32_t
+up_xattrop_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+               int32_t op_ret, int32_t op_errno, dict_t *dict, dict_t *xdata)
+{
+    upcall_local_t *up_local = NULL;
+
+    EXIT_IF_UPCALL_OFF(this, unwind);
+
+    up_local = frame->local;
+    if (!up_local || (op_ret < 0)) {
+        goto unwind;
+    }
+
+    if (!up_invalidate_needed(up_local->xattr)) {
+        goto unwind;
+    }
+
+    if (dict_foreach(up_local->xattr, up_compare_afr_xattr, dict) >= 0) {
+        upcall_cache_invalidate(frame, this, frame->root->client,
+                                up_local->inode, UP_XATTR, NULL, NULL, NULL,
+                                up_local->xattr);
+    }
+
+unwind:
+    if (frame->root->op != GF_FOP_FXATTROP) {
+        UPCALL_STACK_UNWIND(xattrop, frame, op_ret, op_errno, dict, xdata);
+    } else {
+        UPCALL_STACK_UNWIND(fxattrop, frame, op_ret, op_errno, dict, xdata);
+    }
+    return 0;
+}
+
+/* xattrop fop.
+ *
+ * When upcall is enabled (and this->private is set), attaches a local to
+ * the frame that records loc, loc->inode and a copy of the xattr dict,
+ * then filters that copy with up_filter_xattr() against priv->xattrs
+ * before winding. The filtering is done here, on the call path, rather
+ * than in the callback.
+ *
+ * Unwinds with -1/ENOMEM if the local cannot be set up and with
+ * -1/EINVAL if up_filter_xattr() fails. With upcall disabled, or without
+ * private data, the fop is wound straight through. */
+static int32_t
+up_xattrop(call_frame_t *frame, xlator_t *this, loc_t *loc,
+           gf_xattrop_flags_t optype, dict_t *xattr, dict_t *xdata)
+{
+    upcall_private_t *priv = NULL;
+    upcall_local_t *up_local = NULL;
+    int32_t op_errno = ENOMEM;
+
+    EXIT_IF_UPCALL_OFF(this, wind);
+
+    priv = this->private;
+    GF_VALIDATE_OR_GOTO(this->name, priv, wind);
+
+    up_local = upcall_local_init(frame, this, loc, NULL, loc->inode, xattr);
+    if (!up_local) {
+        goto fail;
+    }
+
+    if (up_filter_xattr(up_local->xattr, priv->xattrs) < 0) {
+        op_errno = EINVAL;
+        goto fail;
+    }
+
+wind:
+    STACK_WIND(frame, up_xattrop_cbk, FIRST_CHILD(this),
+               FIRST_CHILD(this)->fops->xattrop, loc, optype, xattr, xdata);
+    return 0;
+
+fail:
+    UPCALL_STACK_UNWIND(xattrop, frame, -1, op_errno, NULL, NULL);
+    return 0;
+}
+
+/* fxattrop fop.
+ *
+ * When upcall is enabled (and this->private is set), attaches a local to
+ * the frame that records fd, fd->inode and a copy of the xattr dict, then
+ * filters that copy with up_filter_xattr() against priv->xattrs before
+ * winding with up_xattrop_cbk (shared with xattrop).
+ *
+ * Unwinds with -1/ENOMEM if the local cannot be set up and with
+ * -1/EINVAL if up_filter_xattr() fails. With upcall disabled, or without
+ * private data, the fop is wound straight through. */
+static int32_t
+up_fxattrop(call_frame_t *frame, xlator_t *this, fd_t *fd,
+            gf_xattrop_flags_t optype, dict_t *xattr, dict_t *xdata)
+{
+    int32_t op_errno = EINVAL;
+    upcall_local_t *local = NULL;
+    int ret = 0;
+    upcall_private_t *priv = NULL;
+
+    EXIT_IF_UPCALL_OFF(this, out);
+
+    priv = this->private;
+    GF_VALIDATE_OR_GOTO(this->name, priv, out);
+
+    local = upcall_local_init(frame, this, NULL, fd, fd->inode, xattr);
+    if (!local) {
+        op_errno = ENOMEM;
+        goto err;
+    }
+
+    ret = up_filter_xattr(local->xattr, priv->xattrs);
+    if (ret < 0) {
+        goto err;
+    }
+
+out:
+    STACK_WIND(frame, up_xattrop_cbk, FIRST_CHILD(this),
+               FIRST_CHILD(this)->fops->fxattrop, fd, optype, xattr, xdata);
+    return 0;
+err:
+    /* Use the upcall-aware unwind: when up_filter_xattr() fails the local
+     * is already attached to the frame, and only UPCALL_STACK_UNWIND runs
+     * upcall_local_wipe() to drop its inode/fd/dict references. */
+    UPCALL_STACK_UNWIND(fxattrop, frame, -1, op_errno, NULL, NULL);
+    return 0;
+}
+
+/* Sets up memory accounting for this xlator with gf_upcall_mt_end + 1
+ * memory types. Returns -1 when `this` is NULL, otherwise the result of
+ * xlator_mem_acct_init(); a non-zero result is logged as a warning. */
+int32_t
+mem_acct_init(xlator_t *this)
+{
+    int ret = 0;
+
+    if (this == NULL) {
+        return -1;
+    }
+
+    ret = xlator_mem_acct_init(this, gf_upcall_mt_end + 1);
+    if (ret) {
+        gf_msg("upcall", GF_LOG_WARNING, 0, UPCALL_MSG_NO_MEMORY,
+               "Memory allocation failed");
+    }
+
+    return ret;
+}
+
+/* Releases everything an upcall local holds - the inode reference, the
+ * optional xattr dict, both saved locs and the optional fd reference -
+ * and returns the local to its memory pool. A NULL local is a no-op.
+ * The `this` argument is not used. */
+void
+upcall_local_wipe(xlator_t *this, upcall_local_t *local)
+{
+    if (!local) {
+        return;
+    }
+
+    inode_unref(local->inode);
+
+    if (local->xattr) {
+        dict_unref(local->xattr);
+    }
+
+    loc_wipe(&local->rename_oldloc);
+    loc_wipe(&local->loc);
+
+    if (local->fd) {
+        fd_unref(local->fd);
+    }
+
+    mem_put(local);
+}
+
+/* Allocates an upcall local, fills it in and attaches it to frame->local.
+ *
+ * The local always takes a reference on `inode`. Optionally it also takes
+ * a dict_copy_with_ref() copy of `xattr`, a copy of `loc` and a reference
+ * on `fd`; each of these is skipped when the corresponding argument is
+ * NULL.
+ *
+ * Returns the new local, or NULL when `this`, `frame` or `inode` is NULL
+ * or the allocation fails; frame->local is only assigned on success.
+ *
+ * NOTE(review): the pool is taken from THIS rather than from the `this`
+ * argument - presumably both name the same xlator on the fop path;
+ * confirm. The results of dict_copy_with_ref() and loc_copy() are not
+ * checked here. */
+upcall_local_t *
+upcall_local_init(call_frame_t *frame, xlator_t *this, loc_t *loc, fd_t *fd,
+                  inode_t *inode, dict_t *xattr)
+{
+    upcall_local_t *local = NULL;
+
+    GF_VALIDATE_OR_GOTO("upcall", this, out);
+    GF_VALIDATE_OR_GOTO(this->name, frame, out);
+    GF_VALIDATE_OR_GOTO(this->name, inode, out);
+
+    local = mem_get0(THIS->local_pool);
+    if (local == NULL) {
+        goto out;
+    }
+
+    local->inode = inode_ref(inode);
+
+    if (xattr != NULL) {
+        local->xattr = dict_copy_with_ref(xattr, NULL);
+    }
+
+    if (loc != NULL) {
+        loc_copy(&local->loc, loc);
+    }
+
+    if (fd != NULL) {
+        local->fd = fd_ref(fd);
+    }
+
+    frame->local = local;
+
+out:
+    return local;
+}
+
+/* dict_foreach() callback used by up_ipc: registers `key` in the dict
+ * passed through `data` by setting it to an int8 value of 0. The source
+ * dict and the incoming value are not looked at. Returns the result of
+ * dict_set_int8(). */
+static int32_t
+update_xattrs(dict_t *dict, char *key, data_t *value, void *data)
+{
+    return dict_set_int8((dict_t *)data, key, 0);
+}
+
+/* ipc fop.
+ *
+ * Requests whose op is not GF_IPC_TARGET_UPCALL are wound to the first
+ * child with default_ipc_cbk. For GF_IPC_TARGET_UPCALL, every key of
+ * xdata is added to priv->xattrs (via update_xattrs) and the call is
+ * unwound here with the dict_foreach() result as op_ret and 0 as
+ * op_errno. If this->private is NULL the call is unwound with op_ret 0
+ * regardless of op. */
+int32_t
+up_ipc(call_frame_t *frame, xlator_t *this, int32_t op, dict_t *xdata)
+{
+    upcall_private_t *priv = this->private;
+    int ret = 0;
+
+    GF_VALIDATE_OR_GOTO(this->name, priv, unwind);
+
+    if (op != GF_IPC_TARGET_UPCALL) {
+        STACK_WIND(frame, default_ipc_cbk, FIRST_CHILD(this),
+                   FIRST_CHILD(this)->fops->ipc, op, xdata);
+        return 0;
+    }
+
+    /* TODO: Bz-1371622 Along with the xattrs, also store the list of
+     * clients that are interested in notifications, so that the
+     * notification can be sent only to the clients that have registered.
+     * Once this is implemented, xattrs can be unregistered from
+     * notifications. Until then there is no unregister of xattrs. */
+    if (xdata && priv->xattrs) {
+        ret = dict_foreach(xdata, update_xattrs, priv->xattrs);
+    }
+
+unwind:
+    STACK_UNWIND_STRICT(ipc, frame, ret, 0, NULL);
+    return 0;
+}
+
+/* Re-reads the "cache-invalidation" and "cache-invalidation-timeout"
+ * options into the private data. If cache invalidation is (now) enabled
+ * and the reaper thread has not been set up yet, it is started here.
+ *
+ * Returns -1 when this->private is NULL or an option cannot be
+ * reconfigured, the result of upcall_reaper_thread_init() when the thread
+ * is started here, and 0 otherwise.
+ *
+ * NOTE(review): reaper_init_done is set even when the thread could not be
+ * created, so a later reconfigure will not retry, and despite the log
+ * text cache_invalidation_enabled is not cleared here - confirm that this
+ * is intended. */
+int
+reconfigure(xlator_t *this, dict_t *options)
+{
+    upcall_private_t *priv = this->private;
+    int ret = -1;
+
+    GF_VALIDATE_OR_GOTO(this->name, priv, out);
+
+    GF_OPTION_RECONF("cache-invalidation", priv->cache_invalidation_enabled,
+                     options, bool, out);
+    GF_OPTION_RECONF("cache-invalidation-timeout",
+                     priv->cache_invalidation_timeout, options, int32, out);
+
+    if (!priv->cache_invalidation_enabled || priv->reaper_init_done) {
+        /* Nothing to start. */
+        ret = 0;
+        goto out;
+    }
+
+    ret = upcall_reaper_thread_init(this);
+    if (ret) {
+        gf_msg("upcall", GF_LOG_WARNING, 0, UPCALL_MSG_INTERNAL_ERROR,
+               "reaper_thread creation failed (%s)."
+               " Disabling cache_invalidation",
+               strerror(errno));
+    }
+    priv->reaper_init_done = _gf_true;
+
+out:
+    return ret;
+}
+
+/* Translator init: allocates the private data and its xattr dict, reads
+ * the "cache-invalidation" and "cache-invalidation-timeout" options,
+ * initialises the inode-ctx lock and list, creates the pool used for
+ * upcall locals and, when cache invalidation is enabled, starts the
+ * reaper thread.
+ *
+ * Returns 0 on success. On any failure a non-zero value is returned and
+ * everything set up so far is torn down again; in particular
+ * this->private and this->local_pool are reset so that nothing is left
+ * pointing at freed memory. */
+int
+init(xlator_t *this)
+{
+    int ret = -1;
+    upcall_private_t *priv = NULL;
+    gf_boolean_t lock_ready = _gf_false;
+
+    priv = GF_CALLOC(1, sizeof(*priv), gf_upcall_mt_private_t);
+    if (!priv)
+        goto out;
+
+    priv->xattrs = dict_new();
+    if (!priv->xattrs)
+        goto out;
+
+    GF_OPTION_INIT("cache-invalidation", priv->cache_invalidation_enabled, bool,
+                   out);
+    GF_OPTION_INIT("cache-invalidation-timeout",
+                   priv->cache_invalidation_timeout, int32, out);
+
+    LOCK_INIT(&priv->inode_ctx_lk);
+    lock_ready = _gf_true;
+    INIT_LIST_HEAD(&priv->inode_ctx_list);
+
+    priv->fini = 0;
+    priv->reaper_init_done = _gf_false;
+
+    this->private = priv;
+    this->local_pool = mem_pool_new(upcall_local_t, 512);
+    if (!this->local_pool) {
+        /* Without the pool no fop could ever get an upcall local. */
+        gf_msg("upcall", GF_LOG_WARNING, 0, UPCALL_MSG_NO_MEMORY,
+               "Memory allocation failed");
+        goto out;
+    }
+    ret = 0;
+
+    if (priv->cache_invalidation_enabled) {
+        ret = upcall_reaper_thread_init(this);
+
+        if (ret) {
+            gf_msg("upcall", GF_LOG_WARNING, 0, UPCALL_MSG_INTERNAL_ERROR,
+                   "reaper_thread creation failed (%s)."
+                   " Disabling cache_invalidation",
+                   strerror(errno));
+        }
+        priv->reaper_init_done = _gf_true;
+    }
+out:
+    if (ret && priv) {
+        /* priv is about to be freed: make sure the xlator no longer
+         * references it, and release what was created alongside it. */
+        this->private = NULL;
+
+        if (this->local_pool) {
+            mem_pool_destroy(this->local_pool);
+            this->local_pool = NULL;
+        }
+
+        if (lock_ready)
+            LOCK_DESTROY(&priv->inode_ctx_lk);
+
+        if (priv->xattrs)
+            dict_unref(priv->xattrs);
+
+        GF_FREE(priv);
+    }
+
+    return ret;
+}
+
+/* Translator teardown. Does nothing when there is no private data.
+ * Otherwise it detaches the private data from the xlator, flags it as
+ * finishing, cleans up the reaper thread if one was started, and then
+ * releases the xattr dict, the inode-ctx lock, the private structure and
+ * finally the pool used for upcall locals. */
+void
+fini(xlator_t *this)
+{
+    upcall_private_t *priv = this->private;
+
+    if (priv == NULL) {
+        return;
+    }
+
+    this->private = NULL;
+    priv->fini = 1;
+
+    if (priv->reaper_thr) {
+        gf_thread_cleanup_xint(priv->reaper_thr);
+        priv->reaper_thr = 0;
+        priv->reaper_init_done = _gf_false;
+    }
+
+    dict_unref(priv->xattrs);
+    LOCK_DESTROY(&priv->inode_ctx_lk);
+
+    /* Do we need to clean up the inode_ctxs? IMO not required, as
+     * inode_forget would have been done on all the inodes before
+     * xlator_fini is called. */
+    GF_FREE(priv);
+
+    if (this->local_pool != NULL) {
+        mem_pool_destroy(this->local_pool);
+        this->local_pool = NULL;
+    }
+}
+
+/* forget callback: cleans up the upcall context of `inode` via
+ * upcall_cleanup_inode_ctx(), unless the xlator has no private data.
+ * Always returns 0. */
+int
+upcall_forget(xlator_t *this, inode_t *inode)
+{
+    if (this->private != NULL) {
+        upcall_cleanup_inode_ctx(this, inode);
+    }
+
+    return 0;
+}
+
+/* release callback: nothing is done for a released fd; always returns 0. */
+int
+upcall_release(xlator_t *this, fd_t *fd)
+{
+    return 0;
+}
+
+/* Event handler.
+ *
+ * GF_EVENT_UPCALL: `data` is taken as a struct gf_upcall and passed on
+ * through default_notify(). Returns -1 when `data` is NULL, the non-zero
+ * default_notify() result (after logging a warning naming the client uid)
+ * when forwarding fails, and 0 on success.
+ *
+ * Any other event is passed to default_notify() and 0 is returned,
+ * whatever default_notify() reports. */
+int
+notify(xlator_t *this, int32_t event, void *data, ...)
+{
+    struct gf_upcall *up_req = NULL;
+    int ret = -1;
+
+    if (event != GF_EVENT_UPCALL) {
+        default_notify(this, event, data);
+        return 0;
+    }
+
+    gf_log(this->name, GF_LOG_DEBUG, "Upcall Notify event = %d", event);
+
+    up_req = (struct gf_upcall *)data;
+    GF_VALIDATE_OR_GOTO(this->name, up_req, out);
+
+    ret = default_notify(this, event, up_req);
+    if (ret) {
+        gf_msg(this->name, GF_LOG_WARNING, 0, UPCALL_MSG_NOTIFY_FAILED,
+               "Failed to notify cache invalidation"
+               " to client(%s)",
+               up_req->client_uid);
+    }
+
+out:
+    return ret;
+}
+
+/* File-operation table of the upcall translator. Each listed fop is
+ * routed to its up_* handler; the entries are grouped by the kind of
+ * change the fop makes. The handlers under NOT_SUPPORTED are only
+ * registered when that macro is defined. */
+struct xlator_fops fops = {
+    .ipc = up_ipc,
+    /* fops which change only "ATIME" do not result
+     * in any cache invalidation. Hence upcall
+     * notifications are not sent in this case.
+     * But however, we need to store/update the
+     * client info in the upcall state to be able
+     * to notify them in case of any changes done
+     * to the data.
+     *
+     * Below such fops do not trigger upcall
+     * notifications but will add/update
+     * clients info in the upcall inode ctx.*/
+    .lookup = up_lookup,
+    .open = up_open,
+    .statfs = up_statfs,
+    .opendir = up_opendir,
+    .readdir = up_readdir,
+    .readdirp = up_readdirp,
+    .stat = up_stat,
+    .fstat = up_fstat,
+    .access = up_access,
+    .readlink = up_readlink,
+    .readv = up_readv,
+    .lk = up_lk,
+    .seek = up_seek,
+
+    /* fops doing  write */
+    .truncate = up_truncate,
+    .ftruncate = up_ftruncate,
+    .writev = up_writev,
+    .zerofill = up_zerofill,
+    .fallocate = up_fallocate,
+    .discard = up_discard,
+
+    /* fops changing attributes */
+    .fsetattr = up_fsetattr,
+    .setattr = up_setattr,
+
+    /* fops affecting parent dirent */
+    .mknod = up_mknod,
+    .create = up_create,
+    .symlink = up_symlink,
+    .mkdir = up_mkdir,
+
+    /* fops affecting both file and parent
+     * cache entries */
+    .unlink = up_unlink,
+    .link = up_link,
+    .rmdir = up_rmdir,
+    .rename = up_rename,
+
+    /* fops reading or changing extended attributes */
+    .setxattr = up_setxattr,
+    .fsetxattr = up_fsetxattr,
+    .getxattr = up_getxattr,
+    .fgetxattr = up_fgetxattr,
+    .fremovexattr = up_fremovexattr,
+    .removexattr = up_removexattr,
+    .xattrop = up_xattrop,
+    .fxattrop = up_fxattrop,
+
+#ifdef NOT_SUPPORTED
+    /* internal lk fops */
+    .inodelk = up_inodelk,
+    .finodelk = up_finodelk,
+    .entrylk = up_entrylk,
+    .fentrylk = up_fentrylk,
+
+    /* Below fops follow 'WRITE' which
+     * would have already sent upcall
+     * notifications */
+    .flush = up_flush,
+    .fsync = up_fsync,
+    .fsyncdir = up_fsyncdir,
+#endif
+};
+
+/* Callback table: inode forget and fd release are routed to
+ * upcall_forget and upcall_release. */
+struct xlator_cbks cbks = {
+    .forget = upcall_forget,
+    .release = upcall_release,
+};
+
+/* Volume options understood by the upcall translator; the table is
+ * terminated by an entry with a NULL key. */
+struct volume_options options[] = {
+    {
+        .key = {"cache-invalidation"},
+        .type = GF_OPTION_TYPE_BOOL,
+        .default_value = "off",
+        .description = "When \"on\", sends cache-invalidation"
+                       " notifications.",
+        .op_version = {GD_OP_VERSION_3_7_0},
+        .flags = OPT_FLAG_SETTABLE | OPT_FLAG_DOC,
+        .tags = {"cache", "cacheconsistency", "upcall"},
+    },
+    {
+        .key = {"cache-invalidation-timeout"},
+        .type = GF_OPTION_TYPE_INT,
+        .default_value = CACHE_INVALIDATION_TIMEOUT,
+        .description = "After 'timeout' seconds since the time"
+                       " client accessed any file, cache-invalidation"
+                       " notifications are no longer sent to that client.",
+        .op_version = {GD_OP_VERSION_3_7_0},
+        .flags = OPT_FLAG_SETTABLE | OPT_FLAG_DOC,
+        .tags = {"cache", "cachetimeout", "upcall"},
+    },
+    {
+        .key = {NULL},
+    },
+};
+
+/* Descriptor of the upcall translator: ties together the lifecycle entry
+ * points (init, fini, notify, reconfigure, mem_acct_init), the fop and
+ * callback tables and the option table defined in this file. */
+xlator_api_t xlator_api = {
+    .init = init,
+    .fini = fini,
+    .notify = notify,
+    .reconfigure = reconfigure,
+    .mem_acct_init = mem_acct_init,
+    .op_version = {1}, /* Present from the initial version */
+    .fops = &fops,
+    .cbks = &cbks,
+    .options = options,
+    .identifier = "upcall",
+    .category = GF_MAINTAINED,
+};
diff --git a/xlators/features/upcall/src/upcall.h b/xlators/features/upcall/src/upcall.h
new file mode 100644
index 00000000000..aa535088ad7
--- /dev/null
+++ b/xlators/features/upcall/src/upcall.h
@@ -0,0 +1,131 @@
+/*
+   Copyright (c) 2015 Red Hat, Inc. <http://www.redhat.com>
+   This file is part of GlusterFS.
+
+   This file is licensed to you under your choice of the GNU Lesser
+   General Public License, version 3 or any later version (LGPLv3 or
+   later), or the GNU General Public License, version 2 (GPLv2), in all
+   cases as published by the Free Software Foundation.
+*/
+#ifndef __UPCALL_H__
+#define __UPCALL_H__
+
+#include <glusterfs/compat-errno.h>
+#include "upcall-mem-types.h"
+#include <glusterfs/client_t.h>
+#include "upcall-messages.h"
+#include "upcall-cache-invalidation.h"
+#include <glusterfs/upcall-utils.h>
+
+#define EXIT_IF_UPCALL_OFF(this, label)                                        \
+    do {                                                                       \
+        if (!is_upcall_enabled(this))                                          \
+            goto label;                                                        \
+    } while (0)
+
+#define UPCALL_STACK_UNWIND(fop, frame, params...)                             \
+    do {                                                                       \
+        upcall_local_t *__local = NULL;                                        \
+        xlator_t *__xl = NULL;                                                 \
+        if (frame) {                                                           \
+            __xl = frame->this;                                                \
+            __local = frame->local;                                            \
+            frame->local = NULL;                                               \
+        }                                                                      \
+        STACK_UNWIND_STRICT(fop, frame, params);                               \
+        upcall_local_wipe(__xl, __local);                                      \
+    } while (0)
+
+#define UPCALL_STACK_DESTROY(frame)                                            \
+    do {                                                                       \
+        upcall_local_t *__local = NULL;                                        \
+        xlator_t *__xl = NULL;                                                 \
+        __xl = frame->this;                                                    \
+        __local = frame->local;                                                \
+        frame->local = NULL;                                                   \
+        STACK_DESTROY(frame->root);                                            \
+        upcall_local_wipe(__xl, __local);                                      \
+    } while (0)
+
+struct _upcall_private {
+    gf_boolean_t cache_invalidation_enabled;
+    int32_t cache_invalidation_timeout;
+    struct list_head inode_ctx_list;
+    gf_lock_t inode_ctx_lk;
+    gf_boolean_t reaper_init_done;
+    pthread_t reaper_thr;
+    int32_t fini;
+    dict_t *xattrs; /* list of xattrs registered by clients
+                       for receiving invalidation */
+};
+typedef struct _upcall_private upcall_private_t;
+
+struct _upcall_client {
+    struct list_head client_list; /* list linkage for this client entry */
+    /* client_uid is a strdup()'d copy; it must be freed explicitly */
+    char *client_uid;
+    time_t access_time; /* time last accessed */
+    /* the amount of time for which the client can cache this entry */
+    uint32_t expire_time_attr;
+};
+typedef struct _upcall_client upcall_client_t;
+
+/* Upcall entries are maintained in inode_ctx */
+struct _upcall_inode_ctx {
+    struct list_head inode_ctx_list;
+    struct list_head client_list;
+    pthread_mutex_t client_list_lock; /* mutex for clients list
+                                         of this upcall entry */
+    int destroy;
+    uuid_t gfid; /* gfid of the entry */
+};
+typedef struct _upcall_inode_ctx upcall_inode_ctx_t;
+
+struct upcall_local {
+    /* XXX: need to check if we can store
+     * pointers in 'local' which may get freed
+     * in future by other thread
+     */
+    inode_t *inode;
+    loc_t rename_oldloc;
+    loc_t loc; /* required for stat in *xattr_cbk */
+    fd_t *fd;  /* required for fstat in *xattr_cbk */
+    dict_t *xattr;
+};
+typedef struct upcall_local upcall_local_t;
+
+void
+upcall_local_wipe(xlator_t *this, upcall_local_t *local);
+upcall_local_t *
+upcall_local_init(call_frame_t *frame, xlator_t *this, loc_t *loc, fd_t *fd,
+                  inode_t *inode, dict_t *xattr);
+
+upcall_inode_ctx_t *
+upcall_inode_ctx_get(inode_t *inode, xlator_t *this);
+int
+upcall_cleanup_inode_ctx(xlator_t *this, inode_t *inode);
+
+void *
+upcall_reaper_thread(void *data);
+int
+upcall_reaper_thread_init(xlator_t *this);
+
+/* Xlator options */
+gf_boolean_t
+is_upcall_enabled(xlator_t *this);
+
+/* Cache invalidation specific */
+void
+upcall_cache_invalidate(call_frame_t *frame, xlator_t *this, client_t *client,
+                        inode_t *inode, uint32_t flags, struct iatt *stbuf,
+                        struct iatt *p_stbuf, struct iatt *oldp_stbuf,
+                        dict_t *xattr);
+int
+up_filter_xattr(dict_t *xattr, dict_t *regd_xattrs);
+
+int
+up_compare_afr_xattr(dict_t *d, char *k, data_t *v, void *tmp);
+
+gf_boolean_t
+up_invalidate_needed(dict_t *xattrs);
+#endif /* __UPCALL_H__ */
diff --git a/xlators/features/utime/Makefile.am b/xlators/features/utime/Makefile.am
new file mode 100644
index 00000000000..a985f42a877
--- /dev/null
+++ b/xlators/features/utime/Makefile.am
@@ -0,0 +1,3 @@
+SUBDIRS = src
+
+CLEANFILES =
diff --git a/xlators/features/utime/src/Makefile.am b/xlators/features/utime/src/Makefile.am
new file mode 100644
index 00000000000..7c3adbc2195
--- /dev/null
+++ b/xlators/features/utime/src/Makefile.am
@@ -0,0 +1,41 @@
+xlator_LTLIBRARIES = utime.la
+xlatordir = $(libdir)/glusterfs/$(PACKAGE_VERSION)/xlator/features
+
+UTIME_SRC = $(top_srcdir)/xlators/features/utime/src
+
+utime_sources  = $(UTIME_SRC)/utime-helpers.c
+utime_sources += $(UTIME_SRC)/utime.c
+
+utime_la_SOURCES = $(utime_sources)
+nodist_utime_la_SOURCES = utime-autogen-fops.c utime-autogen-fops.h
+BUILT_SOURCES = utime-autogen-fops.h
+
+utime_la_LDFLAGS = -module $(GF_XLATOR_DEFAULT_LDFLAGS)
+utime_la_LIBADD = $(top_builddir)/libglusterfs/src/libglusterfs.la
+
+noinst_HEADERS_utime  = $(UTIME_SRC)/utime-helpers.h
+noinst_HEADERS_utime += $(UTIME_SRC)/utime.h
+noinst_HEADERS_utime += $(UTIME_SRC)/utime-messages.h
+noinst_HEADERS_utime += $(UTIME_SRC)/utime-mem-types.h
+noinst_HEADERS = $(top_srcdir)/xlators/lib/src/libxlator.h
+noinst_HEADERS += $(noinst_HEADERS_utime)
+
+AM_CPPFLAGS = $(GF_CPPFLAGS) -I$(top_srcdir)/libglusterfs/src \
+	-I$(top_srcdir)/rpc/xdr/src -I$(top_builddir)/rpc/xdr/src \
+	-I$(top_srcdir)/xlators/lib/src
+
+AM_CFLAGS = -Wall $(GF_CFLAGS)
+
+noinst_PYTHON	= utime-gen-fops-c.py utime-gen-fops-h.py
+EXTRA_DIST	= utime-autogen-fops-tmpl.c utime-autogen-fops-tmpl.h
+
+utime-autogen-fops.c: utime-gen-fops-c.py utime-autogen-fops-tmpl.c
+	$(PYTHON) $(UTIME_SRC)/utime-gen-fops-c.py $(UTIME_SRC)/utime-autogen-fops-tmpl.c > $@
+
+utime-autogen-fops.h: utime-gen-fops-h.py utime-autogen-fops-tmpl.h
+	$(PYTHON) $(UTIME_SRC)/utime-gen-fops-h.py $(UTIME_SRC)/utime-autogen-fops-tmpl.h > $@
+
+CLEANFILES = $(nodist_utime_la_SOURCES)
+
+uninstall-local:
+	rm -f $(DESTDIR)$(xlatordir)/utime.so
diff --git a/xlators/features/utime/src/utime-autogen-fops-tmpl.c b/xlators/features/utime/src/utime-autogen-fops-tmpl.c
new file mode 100644
index 00000000000..f2f35322926
--- /dev/null
+++ b/xlators/features/utime/src/utime-autogen-fops-tmpl.c
@@ -0,0 +1,28 @@
+/*
+  Copyright (c) 2018 Red Hat, Inc. <http://www.redhat.com>
+  This file is part of GlusterFS.
+
+  This file is licensed to you under your choice of the GNU Lesser
+  General Public License, version 3 or any later version (LGPLv3 or
+  later), or the GNU General Public License, version 2 (GPLv2), in all
+  cases as published by the Free Software Foundation.
+*/
+
+/* File: utime-autogen-fops-tmpl.c
+ * This file contains the utime autogenerated FOPs. This is run through
+ * the code generator, generator.py to generate the required FOPs.
+ */
+
+#ifndef _CONFIG_H
+#define _CONFIG_H
+#include "config.h"
+#endif
+
+#include <glusterfs/glusterfs.h>
+#include <glusterfs/xlator.h>
+#include <glusterfs/logging.h>
+#include <glusterfs/statedump.h>
+#include "utime-helpers.h"
+#include <glusterfs/timespec.h>
+
+#pragma generate
diff --git a/xlators/features/utime/src/utime-autogen-fops-tmpl.h b/xlators/features/utime/src/utime-autogen-fops-tmpl.h
new file mode 100644
index 00000000000..4e102ffed6c
--- /dev/null
+++ b/xlators/features/utime/src/utime-autogen-fops-tmpl.h
@@ -0,0 +1,22 @@
+/*
+  Copyright (c) 2018 Red Hat, Inc. <http://www.redhat.com>
+  This file is part of GlusterFS.
+
+  This file is licensed to you under your choice of the GNU Lesser
+  General Public License, version 3 or any later version (LGPLv3 or
+  later), or the GNU General Public License, version 2 (GPLv2), in all
+  cases as published by the Free Software Foundation.
+*/
+
+/* File: utime-autogen-fops-tmpl.h
+ * This file contains the utime autogenerated FOPs declarations.
+ */
+
+#ifndef _UTIME_AUTOGEN_FOPS_H
+#define _UTIME_AUTOGEN_FOPS_H
+
+#include <glusterfs/xlator.h>
+
+#pragma generate
+
+#endif /* _UTIME_AUTOGEN_FOPS_H */
diff --git a/xlators/features/utime/src/utime-gen-fops-c.py b/xlators/features/utime/src/utime-gen-fops-c.py
new file mode 100755
index 00000000000..9fb3e1b8b1a
--- /dev/null
+++ b/xlators/features/utime/src/utime-gen-fops-c.py
@@ -0,0 +1,147 @@
+#!/usr/bin/python3
+
+from __future__ import print_function
+import os
+import sys
+
+curdir = os.path.dirname(sys.argv[0])
+gendir = os.path.join(curdir, '../../../../libglusterfs/src')
+sys.path.append(gendir)
+from generator import ops, fop_subs, cbk_subs, generate
+
+FOPS_COMMON_TEMPLATE = """
+int32_t
+gf_utime_@NAME@ (call_frame_t *frame, xlator_t *this,
+                @LONG_ARGS@)
+{
+        gl_timespec_get(&frame->root->ctime);
+
+        (void) utime_update_attribute_flags(frame, this, GF_FOP_@UPNAME@);
+        STACK_WIND (frame, gf_utime_@NAME@_cbk, FIRST_CHILD(this),
+                    FIRST_CHILD(this)->fops->@NAME@, @SHORT_ARGS@);
+        return 0;
+}
+"""
+
+FOPS_CBK_COMMON_TEMPLATE = """
+int32_t
+gf_utime_@NAME@_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+                     int32_t op_ret, int32_t op_errno,
+                    @LONG_ARGS@)
+{
+        STACK_UNWIND_STRICT (@NAME@, frame, op_ret, op_errno, @SHORT_ARGS@);
+        return 0;
+}
+"""
+
+FOPS_READ_TEMPLATE = """
+int32_t
+gf_utime_@NAME@ (call_frame_t *frame, xlator_t *this,
+                @LONG_ARGS@)
+{
+        gl_timespec_get(&frame->root->ctime);
+
+        (void) utime_update_attribute_flags(frame, this, GF_FOP_READ);
+        STACK_WIND (frame, gf_utime_@NAME@_cbk, FIRST_CHILD(this),
+                    FIRST_CHILD(this)->fops->@NAME@, @SHORT_ARGS@);
+        return 0;
+}
+"""
+
+FOPS_WRITE_TEMPLATE = """
+int32_t
+gf_utime_@NAME@ (call_frame_t *frame, xlator_t *this,
+                @LONG_ARGS@)
+{
+        gl_timespec_get(&frame->root->ctime);
+
+        (void) utime_update_attribute_flags(frame, this, GF_FOP_WRITE);
+        STACK_WIND (frame, gf_utime_@NAME@_cbk, FIRST_CHILD(this),
+                    FIRST_CHILD(this)->fops->@NAME@, @SHORT_ARGS@);
+        return 0;
+}
+"""
+
+FOPS_COPY_FILE_RANGE_TEMPLATE = """
+int32_t
+gf_utime_@NAME@ (call_frame_t *frame, xlator_t *this,
+                @LONG_ARGS@)
+{
+        gl_timespec_get(&frame->root->ctime);
+
+        (void) utime_update_attribute_flags(frame, this, GF_FOP_COPY_FILE_RANGE);
+        STACK_WIND (frame, gf_utime_@NAME@_cbk, FIRST_CHILD(this),
+                    FIRST_CHILD(this)->fops->@NAME@, @SHORT_ARGS@);
+        return 0;
+}
+"""
+
+FOPS_SETATTR_TEMPLATE = """
+int32_t
+gf_utime_@NAME@ (call_frame_t *frame, xlator_t *this,
+             @LONG_ARGS@)
+{
+        gl_timespec_get(&frame->root->ctime);
+
+        if (!valid) {
+                frame->root->flags |= MDATA_CTIME;
+        }
+
+        if (valid & (GF_SET_ATTR_UID | GF_SET_ATTR_GID)) {
+                frame->root->flags |= MDATA_CTIME;
+        }
+
+        if (valid & GF_SET_ATTR_MODE) {
+                frame->root->flags |= MDATA_CTIME;
+        }
+
+        if (valid & (GF_SET_ATTR_ATIME | GF_SET_ATTR_MTIME)) {
+            if (valid & GF_ATTR_ATIME_NOW) {
+                frame->root->ctime.tv_sec = stbuf->ia_atime;
+                frame->root->ctime.tv_nsec = stbuf->ia_atime_nsec;
+            } else if (valid & GF_ATTR_MTIME_NOW) {
+                frame->root->ctime.tv_sec = stbuf->ia_mtime;
+                frame->root->ctime.tv_nsec = stbuf->ia_mtime_nsec;
+            }
+        }
+
+        STACK_WIND (frame, gf_utime_@NAME@_cbk, FIRST_CHILD(this),
+                    FIRST_CHILD(this)->fops->@NAME@, @SHORT_ARGS@);
+        return 0;
+}
+"""
+
+utime_ops = ['fallocate', 'zerofill', 'opendir', 'mknod', 'mkdir',
+             'unlink', 'rmdir', 'symlink', 'rename', 'link', 'truncate',
+             'ftruncate', 'create', 'open', 'removexattr', 'fremovexattr']
+
+utime_read_op = ['readv']
+utime_write_op = ['writev']
+utime_setattr_ops = ['setattr', 'fsetattr']
+utime_copy_file_range_ops = ['copy_file_range']
+
+def gen_defaults():
+    """Print the cbk and the fop wrapper for every fop utime handles."""
+    # (fop names, fop template) pairs, tested in this order for each name.
+    variants = (
+        (utime_ops, FOPS_COMMON_TEMPLATE),
+        (utime_read_op, FOPS_READ_TEMPLATE),
+        (utime_write_op, FOPS_WRITE_TEMPLATE),
+        (utime_setattr_ops, FOPS_SETATTR_TEMPLATE),
+        (utime_copy_file_range_ops, FOPS_COPY_FILE_RANGE_TEMPLATE),
+    )
+    for name in ops:
+        for handled, fop_template in variants:
+            if name not in handled:
+                continue
+            # The callback is emitted first, then the fop that winds to it.
+            print(generate(FOPS_CBK_COMMON_TEMPLATE, name, cbk_subs))
+            print(generate(fop_template, name, fop_subs))
+
+for l in open(sys.argv[1], 'r').readlines():  # copy the template through
+    if '#pragma generate' in l:
+        print("/* BEGIN GENERATED CODE - DO NOT MODIFY */")
+        gen_defaults()
+        print("/* END GENERATED CODE */")
+    else:
+        print(l.rstrip('\n'))  # l[:-1] dropped a char when no final newline
diff --git a/xlators/features/utime/src/utime-gen-fops-h.py b/xlators/features/utime/src/utime-gen-fops-h.py
new file mode 100755
index 00000000000..e96274c229a
--- /dev/null
+++ b/xlators/features/utime/src/utime-gen-fops-h.py
@@ -0,0 +1,35 @@
+#!/usr/bin/python3
+
+from __future__ import print_function
+import os
+import sys
+
+curdir = os.path.dirname(sys.argv[0])
+gendir = os.path.join(curdir, '../../../../libglusterfs/src')
+sys.path.append(gendir)
+from generator import ops, fop_subs, generate
+
+OP_FOP_TEMPLATE = """
+int32_t
+gf_utime_@NAME@ (call_frame_t *frame, xlator_t *this,
+                @LONG_ARGS@);
+"""
+
+utime_ops = ['fallocate', 'zerofill', 'opendir', 'mknod', 'mkdir',
+             'unlink', 'rmdir', 'symlink', 'rename', 'link', 'truncate',
+             'ftruncate', 'create', 'open', 'removexattr', 'fremovexattr',
+             'readv', 'writev', 'setattr', 'fsetattr', 'copy_file_range']
+
+def gen_defaults():
+    """Print a prototype for each fop in ops that is listed in utime_ops."""
+    for name in ops:  # only the keys are needed, so skip .items()
+        if name in utime_ops:
+            print(generate(OP_FOP_TEMPLATE, name, fop_subs))
+
+for l in open(sys.argv[1], 'r').readlines():  # copy the template through
+    if '#pragma generate' in l:
+        print("/* BEGIN GENERATED CODE - DO NOT MODIFY */")
+        gen_defaults()
+        print("/* END GENERATED CODE */")
+    else:
+        print(l.rstrip('\n'))  # l[:-1] dropped a char when no final newline
diff --git a/xlators/features/utime/src/utime-helpers.c b/xlators/features/utime/src/utime-helpers.c
new file mode 100644
index 00000000000..29d9ad93561
--- /dev/null
+++ b/xlators/features/utime/src/utime-helpers.c
@@ -0,0 +1,110 @@
+/*
+  Copyright (c) 2018 Red Hat, Inc. <http://www.redhat.com>
+  This file is part of GlusterFS.
+
+  This file is licensed to you under your choice of the GNU Lesser
+  General Public License, version 3 or any later version (LGPLv3 or
+  later), or the GNU General Public License, version 2 (GPLv2), in all
+  cases as published by the Free Software Foundation.
+*/
+
+#include "utime-helpers.h"
+#include "utime.h"
+
+void
+gl_timespec_get(struct timespec *ts)
+{
+#ifdef TIME_UTC
+    timespec_get(ts, TIME_UTC); /* used whenever TIME_UTC is defined */
+#else
+    timespec_now_realtime(ts); /* fallback when TIME_UTC is not defined */
+#endif
+}
+
+/*
+ * OR the MDATA_* bits selected by 'fop' into frame->root->flags.
+ *
+ * frame: call frame whose root->flags is updated
+ * this:  xlator whose private (utime_priv_t) supplies the noatime setting
+ * fop:   fop the flags are chosen for
+ *
+ * MDATA_ATIME is added for open/opendir/read (and for copy_file_range) only
+ * while noatime is off.  A fop that has no case below resets
+ * frame->root->flags to 0.  Nothing happens when 'frame' or 'this' is NULL.
+ */
+void
+utime_update_attribute_flags(call_frame_t *frame, xlator_t *this,
+                             glusterfs_fop_t fop)
+{
+    utime_priv_t *priv = NULL;
+
+    if (frame == NULL || this == NULL) {
+        return;
+    }
+
+    priv = this->private;
+
+    switch (fop) {
+        /* xattr set/remove: MDATA_CTIME only */
+        case GF_FOP_SETXATTR:
+        case GF_FOP_FSETXATTR:
+        case GF_FOP_REMOVEXATTR:
+        case GF_FOP_FREMOVEXATTR:
+            frame->root->flags |= MDATA_CTIME;
+            break;
+
+        /* MDATA_MTIME and MDATA_ATIME */
+        case GF_FOP_FALLOCATE:
+        case GF_FOP_ZEROFILL:
+            frame->root->flags |= (MDATA_MTIME | MDATA_ATIME);
+            break;
+
+        /* MDATA_ATIME alone, and only while noatime is off */
+        case GF_FOP_OPENDIR:
+        case GF_FOP_OPEN:
+        case GF_FOP_READ:
+            if (!priv->noatime) {
+                frame->root->flags |= MDATA_ATIME;
+            }
+            break;
+
+        /* all five bits used in this function */
+        case GF_FOP_MKNOD:
+        case GF_FOP_MKDIR:
+        case GF_FOP_SYMLINK:
+        case GF_FOP_CREATE:
+            frame->root->flags |= (MDATA_ATIME | MDATA_CTIME | MDATA_MTIME);
+            frame->root->flags |= (MDATA_PAR_CTIME | MDATA_PAR_MTIME);
+            break;
+
+        /* MDATA_CTIME plus both MDATA_PAR_* bits */
+        case GF_FOP_UNLINK:
+        case GF_FOP_RMDIR:
+        case GF_FOP_LINK:
+        case GF_FOP_RENAME:
+            frame->root->flags |= MDATA_CTIME;
+            frame->root->flags |= (MDATA_PAR_CTIME | MDATA_PAR_MTIME);
+            break;
+
+        /* MDATA_MTIME and MDATA_CTIME */
+        case GF_FOP_WRITE:
+        case GF_FOP_TRUNCATE:
+        case GF_FOP_FTRUNCATE:
+            frame->root->flags |= (MDATA_MTIME | MDATA_CTIME);
+            break;
+
+        case GF_FOP_COPY_FILE_RANGE:
+            /* Below 2 are for destination fd */
+            frame->root->flags |= (MDATA_CTIME | MDATA_MTIME);
+            /* Below flag is for the source fd */
+            if (!priv->noatime) {
+                frame->root->flags |= MDATA_ATIME;
+            }
+            break;
+
+        default:
+            /* plain assignment: every bit in flags is cleared */
+            frame->root->flags = 0;
+            break;
+    }
+}
diff --git a/xlators/features/utime/src/utime-helpers.h b/xlators/features/utime/src/utime-helpers.h
new file mode 100644
index 00000000000..2e32d4bece6
--- /dev/null
+++ b/xlators/features/utime/src/utime-helpers.h
@@ -0,0 +1,25 @@
+/*
+  Copyright (c) 2018 Red Hat, Inc. <http://www.redhat.com>
+  This file is part of GlusterFS.
+
+  This file is licensed to you under your choice of the GNU Lesser
+  General Public License, version 3 or any later version (LGPLv3 or
+  later), or the GNU General Public License, version 2 (GPLv2), in all
+  cases as published by the Free Software Foundation.
+*/
+
+#ifndef _UTIME_HELPERS_H
+#define _UTIME_HELPERS_H
+
+#include <glusterfs/stack.h>
+#include <glusterfs/xlator.h>
+#include <glusterfs/timespec.h>
+#include <time.h>
+
+void
+gl_timespec_get(struct timespec *ts);
+void
+utime_update_attribute_flags(call_frame_t *frame, xlator_t *this,
+                             glusterfs_fop_t fop);
+
+#endif /* _UTIME_HELPERS_H */
diff --git a/xlators/features/utime/src/utime-mem-types.h b/xlators/features/utime/src/utime-mem-types.h
new file mode 100644
index 00000000000..ad1255f85f3
--- /dev/null
+++ b/xlators/features/utime/src/utime-mem-types.h
@@ -0,0 +1,21 @@
+/*
+  Copyright (c) 2018 Red Hat, Inc. <http://www.redhat.com>
+  This file is part of GlusterFS.
+
+  This file is licensed to you under your choice of the GNU Lesser
+  General Public License, version 3 or any later version (LGPLv3 or
+  later), or the GNU General Public License, version 2 (GPLv2), in all
+  cases as published by the Free Software Foundation.
+*/
+
+#ifndef __UTIME_MEM_TYPES_H__
+#define __UTIME_MEM_TYPES_H__
+
+#include <glusterfs/mem-types.h>
+
+enum gf_utime_mem_types_ {
+    utime_mt_utime_t = gf_common_mt_end + 1,
+    utime_mt_end
+};
+
+#endif /* __UTIME_MEM_TYPES_H__ */
diff --git a/xlators/features/utime/src/utime-messages.h b/xlators/features/utime/src/utime-messages.h
new file mode 100644
index 00000000000..bd40265abaf
--- /dev/null
+++ b/xlators/features/utime/src/utime-messages.h
@@ -0,0 +1,29 @@
+/*
+  Copyright (c) 2018 Red Hat, Inc. <http://www.redhat.com>
+  This file is part of GlusterFS.
+
+  This file is licensed to you under your choice of the GNU Lesser
+  General Public License, version 3 or any later version (LGPLv3 or
+  later), or the GNU General Public License, version 2 (GPLv2), in all
+  cases as published by the Free Software Foundation.
+*/
+
+#ifndef __UTIME_MESSAGES_H__
+#define __UTIME_MESSAGES_H__
+
+#include <glusterfs/glfs-message-id.h>
+
+/* To add new message IDs, append new identifiers at the end of the list.
+ *
+ * Never remove a message ID. If it's not used anymore, you can rename it or
+ * leave it as it is, but not delete it. This is to prevent reutilization of
+ * IDs by other messages.
+ *
+ * The component name must match one of the entries defined in
+ * glfs-message-id.h.
+ */
+
+GLFS_MSGID(UTIME, UTIME_MSG_NO_MEMORY, UTIME_MSG_SET_MDATA_FAILED,
+           UTIME_MSG_DICT_SET_FAILED);
+
+#endif /* __UTIME_MESSAGES_H__ */
diff --git a/xlators/features/utime/src/utime.c b/xlators/features/utime/src/utime.c
new file mode 100644
index 00000000000..2acc63e6a05
--- /dev/null
+++ b/xlators/features/utime/src/utime.c
@@ -0,0 +1,392 @@
+/*
+  Copyright (c) 2018 Red Hat, Inc. <http://www.redhat.com>
+  This file is part of GlusterFS.
+
+  This file is licensed to you under your choice of the GNU Lesser
+  General Public License, version 3 or any later version (LGPLv3 or
+  later), or the GNU General Public License, version 2 (GPLv2), in all
+  cases as published by the Free Software Foundation.
+*/
+
+#include "utime.h"
+#include "utime-helpers.h"
+#include "utime-messages.h"
+#include "utime-mem-types.h"
+#include <glusterfs/call-stub.h>
+
+int32_t
+gf_utime_invalidate(xlator_t *this, inode_t *inode)
+{
+    return 0;
+}
+
+int32_t
+gf_utime_forget(xlator_t *this, inode_t *inode)
+{
+    return 0;
+}
+
+int32_t
+gf_utime_client_destroy(xlator_t *this, client_t *client)
+{
+    return 0;
+}
+
+void
+gf_utime_ictxmerge(xlator_t *this, fd_t *fd, inode_t *inode,
+                   inode_t *linked_inode)
+{
+    return;
+}
+
+int32_t
+gf_utime_release(xlator_t *this, fd_t *fd)
+{
+    return 0;
+}
+
+int32_t
+gf_utime_releasedir(xlator_t *this, fd_t *fd)
+{
+    return 0;
+}
+
+int32_t
+gf_utime_client_disconnect(xlator_t *this, client_t *client)
+{
+    return 0;
+}
+
+int32_t
+gf_utime_fdctx_to_dict(xlator_t *this, fd_t *fd, dict_t *dict)
+{
+    return 0;
+}
+
+int32_t
+gf_utime_inode(xlator_t *this)
+{
+    return 0;
+}
+
+int32_t
+gf_utime_inode_to_dict(xlator_t *this, dict_t *dict)
+{
+    return 0;
+}
+
+int32_t
+gf_utime_history(xlator_t *this)
+{
+    return 0;
+}
+
+int32_t
+gf_utime_fd(xlator_t *this)
+{
+    return 0;
+}
+
+int32_t
+gf_utime_fd_to_dict(xlator_t *this, dict_t *dict)
+{
+    return 0;
+}
+
+int32_t
+gf_utime_fdctx(xlator_t *this, fd_t *fd)
+{
+    return 0;
+}
+
+int32_t
+gf_utime_inodectx(xlator_t *this, inode_t *ino)
+{
+    return 0;
+}
+
+int32_t
+gf_utime_inodectx_to_dict(xlator_t *this, inode_t *ino, dict_t *dict)
+{
+    return 0;
+}
+
+int32_t
+gf_utime_priv_to_dict(xlator_t *this, dict_t *dict, char *brickname)
+{
+    return 0;
+}
+
+int32_t
+gf_utime_priv(xlator_t *this)
+{
+    return 0;
+}
+
+/* Register utime_mt_end + 1 memory types for accounting; -1 on failure. */
+int32_t
+mem_acct_init(xlator_t *this)
+{
+    if (xlator_mem_acct_init(this, utime_mt_end + 1) == 0)
+        return 0;
+    gf_msg(this->name, GF_LOG_ERROR, ENOMEM, UTIME_MSG_NO_MEMORY,
+           "Memory accounting initialization failed.");
+    return -1;
+}
+
+int32_t
+gf_utime_set_mdata_setxattr_cbk(call_frame_t *frame, void *cookie,
+                                xlator_t *this, int op_ret, int op_errno,
+                                dict_t *xdata)
+{
+    call_stub_t *stub = frame->local; /* lookup_cbk stub parked by the caller */
+    /* Don't fail lookup if mdata setxattr fails */
+    if (op_ret) {
+        gf_msg(this->name, GF_LOG_ERROR, op_errno, UTIME_MSG_SET_MDATA_FAILED,
+               "setxattr of ctime mdata failed");
+    }
+    frame->local = NULL;
+    call_resume(stub); /* resume the parked lookup reply in either case */
+    STACK_DESTROY(frame->root);
+    return 0;
+}
+
+int32_t
+gf_utime_set_mdata_lookup_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+                              int32_t op_ret, int32_t op_errno, inode_t *inode,
+                              struct iatt *stbuf, dict_t *xdata,
+                              struct iatt *postparent)
+{
+    dict_t *dict = NULL;
+    struct mdata_iatt *mdata = NULL;
+    int ret = 0;
+    loc_t loc = {0};
+    call_frame_t *new_frame = NULL;
+
+    /* Lookup succeeded but the reply has no mdata: set it from stbuf first. */
+    if (!op_ret && dict_get(xdata, GF_XATTR_MDATA_KEY) == NULL) {
+        dict = dict_new();
+        if (!dict) {
+            op_errno = ENOMEM;
+            goto err;
+        }
+        mdata = GF_MALLOC(sizeof(struct mdata_iatt), gf_common_mt_char);
+        if (mdata == NULL) {
+            op_errno = ENOMEM;
+            goto err;
+        }
+        iatt_to_mdata(mdata, stbuf);
+        ret = dict_set_mdata(dict, CTIME_MDATA_XDATA_KEY, mdata, _gf_false);
+        if (ret < 0) {
+            gf_msg(this->name, GF_LOG_WARNING, ENOMEM, UTIME_MSG_NO_MEMORY,
+                   "dict set of key for set-ctime-mdata failed");
+            op_errno = ENOMEM; /* was left at the child's success value */
+            goto err;
+        }
+        new_frame = copy_frame(frame);
+        if (!new_frame) {
+            op_errno = ENOMEM;
+            goto stub_err;
+        }
+
+        new_frame->local = fop_lookup_cbk_stub(frame, default_lookup_cbk,
+                                               op_ret, op_errno, inode, stbuf,
+                                               xdata, postparent);
+        if (!new_frame->local) {
+            gf_msg(this->name, GF_LOG_WARNING, ENOMEM, UTIME_MSG_NO_MEMORY,
+                   "lookup_cbk stub allocation failed");
+            op_errno = ENOMEM;
+            STACK_DESTROY(new_frame->root);
+            goto stub_err;
+        }
+
+        loc.inode = inode_ref(inode);
+        gf_uuid_copy(loc.gfid, stbuf->ia_gfid);
+
+        new_frame->root->uid = 0;
+        new_frame->root->gid = 0;
+        new_frame->root->pid = GF_CLIENT_PID_SET_UTIME;
+        STACK_WIND(new_frame, gf_utime_set_mdata_setxattr_cbk,
+                   FIRST_CHILD(this), FIRST_CHILD(this)->fops->setxattr, &loc,
+                   dict, 0, NULL);
+
+        dict_unref(dict);
+        inode_unref(loc.inode);
+        return 0;
+    }
+
+    STACK_UNWIND_STRICT(lookup, frame, op_ret, op_errno, inode, stbuf, xdata,
+                        postparent);
+    return 0;
+
+err:
+    if (mdata) {
+        GF_FREE(mdata);
+    }
+stub_err: /* mdata is not freed here; presumably dict owns it by now */
+    if (dict) {
+        dict_unref(dict);
+    }
+    STACK_UNWIND_STRICT(lookup, frame, -1, op_errno, NULL, NULL, NULL, NULL);
+    return 0;
+}
+
+int
+gf_utime_lookup(call_frame_t *frame, xlator_t *this, loc_t *loc, dict_t *xdata)
+{
+    int op_errno = EINVAL;
+    int ret = -1;
+
+    VALIDATE_OR_GOTO(frame, err);
+    VALIDATE_OR_GOTO(this, err);
+    VALIDATE_OR_GOTO(loc, err);
+    VALIDATE_OR_GOTO(loc->inode, err);
+
+    xdata = xdata ? dict_ref(xdata) : dict_new();
+    if (!xdata) {
+        op_errno = ENOMEM;
+        goto err;
+    }
+
+    ret = dict_set_int8(xdata, GF_XATTR_MDATA_KEY, 1);
+    if (ret < 0) {
+        gf_msg(this->name, GF_LOG_WARNING, -ret, UTIME_MSG_DICT_SET_FAILED,
+               "%s: Unable to set dict value for %s", loc->path,
+               GF_XATTR_MDATA_KEY);
+        op_errno = -ret;
+        dict_unref(xdata);
+        goto err;
+    }
+
+    STACK_WIND(frame, gf_utime_set_mdata_lookup_cbk, FIRST_CHILD(this),
+               FIRST_CHILD(this)->fops->lookup, loc, xdata);
+    dict_unref(xdata);
+    return 0;
+
+err:
+    /* op_ret is always -1 on failure; 'ret' may hold a negative errno. */
+    STACK_UNWIND_STRICT(lookup, frame, -1, op_errno, NULL, NULL, NULL, NULL);
+    return 0;
+}
+
+int32_t
+init(xlator_t *this)
+{
+    utime_priv_t *utime = GF_CALLOC(1, sizeof(*utime), utime_mt_utime_t);
+
+    if (utime == NULL) {
+        gf_msg(this->name, GF_LOG_ERROR, ENOMEM, UTIME_MSG_NO_MEMORY,
+               "Failed to allocate private memory.");
+        return -1;
+    }
+
+    this->private = utime;
+    GF_OPTION_INIT("noatime", utime->noatime, bool, err);
+
+    return 0;
+err:
+    GF_FREE(utime);       /* option parsing failed: do not leak the private */
+    this->private = NULL; /* and do not leave a pointer to freed memory */
+    return -1;
+}
+
+void
+fini(xlator_t *this)
+{
+    utime_priv_t *utime = this->private;
+
+    /* Clear the pointer before freeing so no dangling private is left. */
+    this->private = NULL;
+    GF_FREE(utime);
+}
+
+int32_t
+reconfigure(xlator_t *this, dict_t *options)
+{
+    utime_priv_t *utime = this->private;
+
+    GF_OPTION_RECONF("noatime", utime->noatime, options, bool, err);
+    /* NOTE(review): presumably the macro jumps to 'err' on a bad value */
+    return 0;
+err:
+    return -1;
+}
+
+int
+notify(xlator_t *this, int event, void *data, ...)
+{
+    return default_notify(this, event, data);
+}
+
+struct xlator_fops fops = { /* NOTE(review): no .copy_file_range entry */
+    .rename = gf_utime_rename,
+    .mknod = gf_utime_mknod,
+    .readv = gf_utime_readv,
+    .fremovexattr = gf_utime_fremovexattr,
+    .open = gf_utime_open,
+    .create = gf_utime_create,
+    .mkdir = gf_utime_mkdir,
+    .writev = gf_utime_writev,
+    .rmdir = gf_utime_rmdir,
+    .fallocate = gf_utime_fallocate,
+    .truncate = gf_utime_truncate,
+    .symlink = gf_utime_symlink,
+    .zerofill = gf_utime_zerofill,
+    .link = gf_utime_link,
+    .ftruncate = gf_utime_ftruncate,
+    .unlink = gf_utime_unlink,
+    .setattr = gf_utime_setattr,
+    .fsetattr = gf_utime_fsetattr,
+    .opendir = gf_utime_opendir,
+    .removexattr = gf_utime_removexattr,
+    .lookup = gf_utime_lookup, /* defined in this file; the rest are generated */
+};
+struct xlator_cbks cbks = {
+    .invalidate = gf_utime_invalidate,
+    .forget = gf_utime_forget,
+    .client_destroy = gf_utime_client_destroy,
+    .ictxmerge = gf_utime_ictxmerge,
+    .release = gf_utime_release,
+    .releasedir = gf_utime_releasedir,
+    .client_disconnect = gf_utime_client_disconnect,
+};
+struct xlator_dumpops dumpops = {
+    .fdctx_to_dict = gf_utime_fdctx_to_dict,
+    .inode = gf_utime_inode,
+    .inode_to_dict = gf_utime_inode_to_dict,
+    .history = gf_utime_history,
+    .fd = gf_utime_fd,
+    .fd_to_dict = gf_utime_fd_to_dict,
+    .fdctx = gf_utime_fdctx,
+    .inodectx = gf_utime_inodectx,
+    .inodectx_to_dict = gf_utime_inodectx_to_dict,
+    .priv_to_dict = gf_utime_priv_to_dict,
+    .priv = gf_utime_priv,
+};
+
+struct volume_options options[] = {
+    {.key = {"noatime"},
+     .type = GF_OPTION_TYPE_BOOL,
+     .default_value = "on",
+     .op_version = {GD_OP_VERSION_5_0},
+     .flags = OPT_FLAG_SETTABLE | OPT_FLAG_CLIENT_OPT | OPT_FLAG_DOC,
+     .tags = {"ctime"},
+     .description = "Enable/Disable atime updation when ctime feature is "
+                    "enabled. When noatime is on, atime is not updated with "
+                    "ctime feature enabled and vice versa."},
+    {.key = {NULL}}};
+
+xlator_api_t xlator_api = {
+    .init = init,
+    .fini = fini,
+    .notify = notify,
+    .reconfigure = reconfigure,
+    .mem_acct_init = mem_acct_init,
+    .op_version = {GD_OP_VERSION_5_0},
+    .dumpops = &dumpops,
+    .fops = &fops,
+    .cbks = &cbks,
+    .options = options,
+    .identifier = "utime",
+    .category = GF_MAINTAINED,
+};
diff --git a/xlators/features/utime/src/utime.h b/xlators/features/utime/src/utime.h
new file mode 100644
index 00000000000..ba55eec00de
--- /dev/null
+++ b/xlators/features/utime/src/utime.h
@@ -0,0 +1,23 @@
+/*
+  Copyright (c) 2018 Red Hat, Inc. <http://www.redhat.com>
+  This file is part of GlusterFS.
+
+  This file is licensed to you under your choice of the GNU Lesser
+  General Public License, version 3 or any later version (LGPLv3 or
+  later), or the GNU General Public License, version 2 (GPLv2), in all
+  cases as published by the Free Software Foundation.
+*/
+
+#ifndef __UTIME_H__
+#define __UTIME_H__
+
+#include <glusterfs/glusterfs.h>
+#include <glusterfs/xlator.h>
+#include <glusterfs/defaults.h>
+#include "utime-autogen-fops.h"
+
+/* State structure of the utime xlator. */
+struct utime_priv {
+    /* NOTE(review): presumably mirrors the "noatime" volume option;
+     * confirm where this field is assigned. */
+    gf_boolean_t noatime;
+};
+
+typedef struct utime_priv utime_priv_t;
+
+#endif /* __UTIME_H__ */
diff --git a/xlators/lib/src/libxlator.c b/xlators/lib/src/libxlator.c
new file mode 100644
index 00000000000..8075fa0c29f
--- /dev/null
+++ b/xlators/lib/src/libxlator.c
@@ -0,0 +1,490 @@
+/*
+   Copyright (c) 2008-2012 Red Hat, Inc. <http://www.redhat.com>
+   This file is part of GlusterFS.
+
+   This file is licensed to you under your choice of the GNU Lesser
+   General Public License, version 3 or any later version (LGPLv3 or
+   later), or the GNU General Public License, version 2 (GPLv2), in all
+   cases as published by the Free Software Foundation.
+*/
+
+#include "libxlator.h"
+
+/* Default policy for xtime aggregation, indexed by marker_result_idx_t.
+ * A positive gauge g requires count >= g; a negative gauge -g requires
+ * count < g (see evaluate_marker_results()). */
+int marker_xtime_default_gauge[] = {
+    [MCNT_FOUND] = 1,     /* at least one reply carried the xtime */
+    [MCNT_NOTFOUND] = -1, /* every other counter must stay at zero */
+    [MCNT_ENODATA] = -1,
+    [MCNT_ENOTCONN] = -1,
+    [MCNT_ENOENT] = -1,
+    [MCNT_EOTHER] = -1,
+};
+
+/* Default policy for volume-mark aggregation, indexed by
+ * marker_result_idx_t. A gauge of 0 places no requirement on the
+ * matching counter, so only MCNT_FOUND is constrained here. */
+int marker_uuid_default_gauge[] = {
+    [MCNT_FOUND] = 1, /* at least one reply carried the volume mark */
+    [MCNT_NOTFOUND] = 0,
+    [MCNT_ENODATA] = 0,
+    [MCNT_ENOTCONN] = 0,
+    [MCNT_ENOENT] = 0,
+    [MCNT_EOTHER] = 0,
+};
+
+/* errno reported when the gauge is violated at a given result index.
+ * Indices without a dedicated errno fall back to EINVAL; the extra
+ * MCNT_MAX slot is 0. */
+static int marker_idx_errno_map[] = {
+    [MCNT_FOUND] = EINVAL,
+    [MCNT_NOTFOUND] = EINVAL,
+    [MCNT_ENODATA] = ENODATA,
+    [MCNT_ENOTCONN] = ENOTCONN,
+    [MCNT_ENOENT] = ENOENT,
+    [MCNT_EOTHER] = EINVAL,
+    [MCNT_MAX] = 0,
+};
+
+/* Copy both 32-bit words of oldtimbuf into newtimebuf, unchanged. */
+static void
+update_timebuf(uint32_t *oldtimbuf, uint32_t *newtimebuf)
+{
+    int word = 0;
+
+    for (word = 0; word < 2; word++)
+        newtimebuf[word] = oldtimbuf[word];
+}
+
+/* Convert the two 32-bit words of oldtimbuf from network byte order to
+ * host byte order and store them in newtimebuf. */
+static void
+get_hosttime(uint32_t *oldtimbuf, uint32_t *newtimebuf)
+{
+    int word = 0;
+
+    for (word = 0; word < 2; word++)
+        newtimebuf[word] = ntohl(oldtimbuf[word]);
+}
+
+/* Match the incoming trusted.glusterfs.<uuid>.xtime against the volume uuid.
+ *
+ * Returns 0 when name is exactly MARKER_XATTR_PREFIX "." uuid ".xtime",
+ * and -1 otherwise (including a NULL name or a NULL/empty uuid).
+ */
+int
+match_uuid_local(const char *name, char *uuid)
+{
+    const char *tail = NULL;
+
+    /* name is handed to strtail() below, so reject NULL here rather than
+     * relying on every caller to have checked it. */
+    if (!name || !uuid || !*uuid)
+        return -1;
+
+    /* the name must start with the marker prefix followed by a '.' */
+    tail = strtail((char *)name, MARKER_XATTR_PREFIX);
+    if (!tail || tail[0] != '.')
+        return -1;
+    tail++;
+
+    /* then the uuid, and nothing but ".xtime" after it */
+    tail = strtail((char *)tail, uuid);
+    if (!tail || strcmp(tail, ".xtime") != 0)
+        return -1;
+
+    return 0;
+}
+
+static void
+marker_local_incr_errcount(xl_marker_local_t *local, int op_errno)
+{
+    marker_result_idx_t i = -1;
+
+    if (!local)
+        return;
+
+    switch (op_errno) {
+        case ENODATA:
+            i = MCNT_ENODATA;
+            break;
+        case ENOENT:
+            i = MCNT_ENOENT;
+            break;
+        case ENOTCONN:
+            i = MCNT_ENOTCONN;
+            break;
+        default:
+            i = MCNT_EOTHER;
+            break;
+    }
+
+    local->count[i]++;
+}
+
+/* Check the counters against the gauge policy.
+ *
+ * Returns 0 when no gauge entry is violated. Otherwise returns the errno
+ * mapped to the first violated index, unless that errno is the generic
+ * EINVAL: in that case later indices with a non-zero counter are
+ * consulted and the first more specific errno found wins.
+ */
+static int
+evaluate_marker_results(int *gauge, int *count)
+{
+    int idx = 0;
+    int result = 0;
+
+    /* phase 1: find the first index whose counter breaks the policy */
+    for (idx = 0; idx < MCNT_MAX; idx++) {
+        if (gauge[idx] > 0 && count[idx] < gauge[idx])
+            break;
+        if (gauge[idx] < 0 && count[idx] >= -gauge[idx])
+            break;
+    }
+
+    if (idx == MCNT_MAX)
+        return 0;
+
+    /* generic action: adopt the errno of the violated index */
+    result = marker_idx_errno_map[idx];
+
+    /* phase 2: while the errno is still uninformative, let subsequent
+     * counters that actually fired provide a better one */
+    for (idx++; idx < MCNT_MAX && (!result || result == EINVAL); idx++) {
+        if (count[idx] > 0)
+            result = marker_idx_errno_map[idx];
+    }
+
+    return result;
+}
+
+/* Finish an aggregated marker getxattr and unwind the frame.
+ *
+ * frame: frame whose ->local is the xl_marker_local_t of this request
+ * key:   xattr name under which the aggregated value is returned
+ * value: aggregated value buffer
+ * size:  size of value in bytes
+ * dict:  dict to return the value in; may be NULL, in which case a new
+ *        one is created here (and released here after unwinding)
+ *
+ * The value is only placed in the dict when at least one reply was
+ * counted as MCNT_FOUND. The final op_ret/op_errno come from
+ * evaluate_marker_results(), or are -1/ENOMEM if the dict could not be
+ * created or filled.
+ */
+static void
+cluster_marker_unwind(call_frame_t *frame, char *key, void *value, size_t size,
+                      dict_t *dict)
+{
+    xl_marker_local_t *local = frame->local;
+    int ret = 0;
+    int32_t op_ret = 0;
+    int32_t op_errno = 0;
+    gf_boolean_t unref = _gf_false;
+
+    /* hand the caller's original local back before unwinding */
+    frame->local = local->xl_local;
+
+    if (local->count[MCNT_FOUND]) {
+        if (!dict) {
+            dict = dict_new();
+            if (dict) {
+                /* we own this dict, so we must drop it at the end */
+                unref = _gf_true;
+            } else {
+                op_ret = -1;
+                op_errno = ENOMEM;
+                goto out;
+            }
+        }
+
+        ret = dict_set_static_bin(dict, key, value, size);
+        if (ret) {
+            op_ret = -1;
+            op_errno = ENOMEM;
+            goto out;
+        }
+    }
+
+    /* let the gauge decide between success and a specific errno */
+    op_errno = evaluate_marker_results(local->gauge, local->count);
+    if (op_errno)
+        op_ret = -1;
+
+out:
+    /* use the xlator-specific unwind if one was registered, otherwise
+     * the stock getxattr unwind */
+    if (local->xl_specf_unwind) {
+        local->xl_specf_unwind(frame, op_ret, op_errno, dict, NULL);
+    } else {
+        STACK_UNWIND_STRICT(getxattr, frame, op_ret, op_errno, dict, NULL);
+    }
+
+    /* local is released only after the unwind has been issued */
+    GF_FREE(local);
+    if (unref)
+        dict_unref(dict);
+}
+
+/* Aggregate all the <volid>.xtime attrs of the cluster and send the max.
+ *
+ * getxattr callback invoked once per subvolume. Each call decrements
+ * local->call_count under frame->lock and records the outcome:
+ *  - failed reply: the matching error counter is incremented;
+ *  - reply without the xtime attr: MCNT_NOTFOUND is incremented;
+ *  - otherwise the xtime is kept if it is the first one seen or is not
+ *    older than the one stored so far.
+ * The call that brings call_count to zero unwinds the frame through
+ * cluster_marker_unwind(). Always returns 0.
+ */
+int32_t
+cluster_markerxtime_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+                        int op_ret, int op_errno, dict_t *dict, dict_t *xdata)
+
+{
+    int32_t callcnt = 0;
+    uint32_t *net_timebuf = NULL;
+    uint32_t host_timebuf[2] = {
+        0,
+    };
+    char marker_xattr[128] = {0};
+    xl_marker_local_t *local = NULL;
+
+    local = frame->local;
+
+    /* full attribute name: <prefix>.<volume uuid>.xtime */
+    snprintf(marker_xattr, sizeof(marker_xattr), "%s.%s.%s",
+             MARKER_XATTR_PREFIX, local->vol_uuid, XTIME);
+
+    LOCK(&frame->lock);
+    {
+        callcnt = --local->call_count;
+
+        if (op_ret) {
+            marker_local_incr_errcount(local, op_errno);
+            goto unlock;
+        }
+
+        if (dict_get_ptr(dict, marker_xattr, (void **)&net_timebuf)) {
+            local->count[MCNT_NOTFOUND]++;
+            /* the lock is dropped here so that logging happens outside
+             * it; hence the jump past the regular UNLOCK below */
+            UNLOCK(&frame->lock);
+            gf_log(this->name, GF_LOG_WARNING,
+                   "Unable to get <uuid>.xtime attr");
+            goto post_unlock;
+        }
+
+        if (local->count[MCNT_FOUND]) {
+            /* compare in host order: word 0 first, word 1 as tie-break */
+            get_hosttime(net_timebuf, host_timebuf);
+            if ((host_timebuf[0] > local->host_timebuf[0]) ||
+                (host_timebuf[0] == local->host_timebuf[0] &&
+                 host_timebuf[1] >= local->host_timebuf[1])) {
+                update_timebuf(net_timebuf, local->net_timebuf);
+                update_timebuf(host_timebuf, local->host_timebuf);
+            }
+
+        } else {
+            /* first xtime seen: store it in both byte orders */
+            get_hosttime(net_timebuf, local->host_timebuf);
+            update_timebuf(net_timebuf, local->net_timebuf);
+            local->count[MCNT_FOUND]++;
+        }
+    }
+unlock:
+    UNLOCK(&frame->lock);
+post_unlock:
+    /* last reply: return the network-order xtime (8 bytes, i.e. the two
+     * 32-bit words of net_timebuf) */
+    if (callcnt == 0)
+        cluster_marker_unwind(frame, marker_xattr, local->net_timebuf, 8, dict);
+
+    return 0;
+}
+
+/* Aggregate the volume-mark attrs returned by the subvolumes.
+ *
+ * getxattr callback invoked once per subvolume. Each call decrements
+ * local->call_count under frame->lock and records the outcome:
+ *  - failed reply: the matching error counter is incremented;
+ *  - reply without the volume-mark key: ignored;
+ *  - first volume mark seen: copied into local->volmark, its uuid is
+ *    unparsed into local->vol_uuid and MCNT_FOUND is incremented;
+ *  - later volume marks: ignored on major/minor mismatch or once a
+ *    non-zero retval has been recorded; otherwise they replace the
+ *    stored one when they carry a non-zero retval or are not older.
+ * The call that brings call_count to zero unwinds the frame through
+ * cluster_marker_unwind(). Always returns 0.
+ */
+int32_t
+cluster_markeruuid_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+                       int op_ret, int op_errno, dict_t *dict, dict_t *xdata)
+{
+    int32_t callcnt = 0;
+    struct volume_mark *volmark = NULL;
+    struct volume_mark *copy = NULL;
+    xl_marker_local_t *local = NULL;
+    int32_t ret = -1;
+    char *vol_uuid = NULL;
+
+    local = frame->local;
+
+    LOCK(&frame->lock);
+    {
+        callcnt = --local->call_count;
+        vol_uuid = local->vol_uuid;
+
+        if (op_ret) {
+            marker_local_incr_errcount(local, op_errno);
+            goto unlock;
+        }
+
+        ret = dict_get_bin(dict, GF_XATTR_MARKER_KEY, (void *)&volmark);
+        if (ret)
+            goto unlock;
+
+        if (local->count[MCNT_FOUND]) {
+            if ((local->volmark->major != volmark->major) ||
+                (local->volmark->minor != volmark->minor)) {
+                /* NOTE: these only change this callback's own copies;
+                 * nothing after the unlock label reads them */
+                op_ret = -1;
+                op_errno = EINVAL;
+                goto unlock;
+            }
+
+            if (local->retval) {
+                goto unlock;
+            } else if (volmark->retval) {
+                /* duplicate first: if the allocation fails the stored
+                 * volmark must stay valid, because MCNT_FOUND is already
+                 * set and later replies dereference it */
+                copy = gf_memdup(volmark, sizeof(*volmark));
+                if (!copy)
+                    goto unlock;
+                GF_FREE(local->volmark);
+                local->volmark = copy;
+                local->retval = volmark->retval;
+            } else if ((volmark->sec > local->volmark->sec) ||
+                       ((volmark->sec == local->volmark->sec) &&
+                        (volmark->usec >= local->volmark->usec))) {
+                copy = gf_memdup(volmark, sizeof(*volmark));
+                if (!copy)
+                    goto unlock;
+                GF_FREE(local->volmark);
+                local->volmark = copy;
+            }
+
+        } else {
+            local->volmark = gf_memdup(volmark, sizeof(*volmark));
+            VALIDATE_OR_GOTO(local->volmark, unlock);
+            gf_uuid_unparse(volmark->uuid, vol_uuid);
+            if (volmark->retval)
+                local->retval = volmark->retval;
+            local->count[MCNT_FOUND]++;
+        }
+    }
+unlock:
+    UNLOCK(&frame->lock);
+
+    if (callcnt == 0)
+        cluster_marker_unwind(frame, GF_XATTR_MARKER_KEY, local->volmark,
+                              sizeof(*local->volmark), dict);
+
+    return 0;
+}
+
+/* Fold one stime value into dst[key], keeping the smaller of the two.
+ *
+ * this:  xlator used for log messages
+ * dst:   dict holding the running result under key (network byte order)
+ * key:   stime xattr name
+ * value: candidate stime (network byte order)
+ *
+ * If dst has no entry for key, a zero-filled 8-byte buffer is inserted
+ * first and the candidate is compared against that.
+ * Returns 0 on success and a negative value on failure.
+ */
+int
+gf_get_min_stime(xlator_t *this, dict_t *dst, char *key, data_t *value)
+{
+    uint32_t *dst_net = NULL;
+    uint32_t *val_net = NULL;
+    uint32_t dst_host[2] = {
+        0,
+    };
+    uint32_t val_host[2] = {
+        0,
+    };
+    int rc = -1;
+
+    /* stime should be minimum of all the other nodes */
+    rc = dict_get_bin(dst, key, (void **)&dst_net);
+    if (rc < 0) {
+        dst_net = GF_CALLOC(1, sizeof(int64_t), gf_common_mt_char);
+        if (!dst_net)
+            return rc;
+
+        rc = dict_set_bin(dst, key, dst_net, sizeof(int64_t));
+        if (rc < 0) {
+            gf_log(this->name, GF_LOG_WARNING, "key=%s: dict set failed", key);
+            /* the buffer never made it into the dict, so free it here */
+            GF_FREE(dst_net);
+            return rc;
+        }
+    }
+
+    val_net = data_to_bin(value);
+    if (!val_net) {
+        gf_log(this->name, GF_LOG_WARNING,
+               "key=%s: getting value of stime failed", key);
+        return -1;
+    }
+
+    get_hosttime(val_net, val_host);
+    get_hosttime(dst_net, dst_host);
+
+    /* lexicographic comparison on (word 0, word 1); a plain min() on a
+     * single field would not do */
+    if ((val_host[0] < dst_host[0]) ||
+        ((val_host[0] == dst_host[0]) && (val_host[1] < dst_host[1]))) {
+        update_timebuf(val_net, dst_net);
+    }
+
+    return 0;
+}
+
+/* Fold one stime value into dst[key], keeping the larger of the two.
+ *
+ * this:  xlator used for log messages
+ * dst:   dict holding the running result under key (network byte order)
+ * key:   stime xattr name
+ * value: candidate stime (network byte order)
+ *
+ * If dst has no entry for key, a zero-filled 8-byte buffer is inserted
+ * first and the candidate is compared against that.
+ * Returns 0 on success, -ENOMEM if the buffer cannot be allocated,
+ * -EINVAL if value carries no data, or the negative result of
+ * dict_set_bin() if the buffer cannot be stored.
+ */
+int
+gf_get_max_stime(xlator_t *this, dict_t *dst, char *key, data_t *value)
+{
+    int ret = -ENOMEM;
+    uint32_t *net_timebuf = NULL;
+    uint32_t *value_timebuf = NULL;
+    uint32_t host_timebuf[2] = {
+        0,
+    };
+    uint32_t host_value_timebuf[2] = {
+        0,
+    };
+
+    /* stime should be maximum of all the other nodes */
+    ret = dict_get_bin(dst, key, (void **)&net_timebuf);
+    if (ret < 0) {
+        net_timebuf = GF_CALLOC(1, sizeof(int64_t), gf_common_mt_char);
+        if (!net_timebuf) {
+            /* ret currently holds the dict_get_bin() error; report the
+             * allocation failure instead */
+            ret = -ENOMEM;
+            goto out;
+        }
+
+        ret = dict_set_bin(dst, key, net_timebuf, sizeof(int64_t));
+        if (ret < 0) {
+            gf_log(this->name, GF_LOG_WARNING, "key=%s: dict set failed", key);
+            goto error;
+        }
+    }
+
+    value_timebuf = data_to_bin(value);
+    if (!value_timebuf) {
+        gf_log(this->name, GF_LOG_WARNING,
+               "key=%s: getting value of stime failed", key);
+        ret = -EINVAL;
+        goto out;
+    }
+
+    get_hosttime(value_timebuf, host_value_timebuf);
+    get_hosttime(net_timebuf, host_timebuf);
+
+    /* can't use 'max()' macro here as we need to compare two fields
+       in the array, selectively */
+    if ((host_value_timebuf[0] > host_timebuf[0]) ||
+        ((host_value_timebuf[0] == host_timebuf[0]) &&
+         (host_value_timebuf[1] > host_timebuf[1]))) {
+        update_timebuf(value_timebuf, net_timebuf);
+    }
+
+    ret = 0;
+out:
+    return ret;
+error:
+    /* To be used only when net_timebuf is not set in the dict */
+    if (net_timebuf)
+        GF_FREE(net_timebuf);
+
+    return ret;
+}
+
+/* Return the number of entries in xl's children list. */
+static int
+_get_children_count(xlator_t *xl)
+{
+    xlator_list_t *child = xl->children;
+    int total = 0;
+
+    while (child) {
+        total++;
+        child = child->next;
+    }
+
+    return total;
+}
+
+/* Fan a marker getxattr out to the subvolumes and aggregate the replies.
+ *
+ * frame:         frame of the getxattr call
+ * loc:           location the xattr is requested on
+ * name:          requested xattr name
+ * vol_uuid:      volume uuid buffer; matched against name for xtime
+ *                requests and stored in the request state
+ * unwind:        optional xlator-specific unwind function; when NULL the
+ *                default getxattr unwind is used
+ * populate_args: callback that fills subvols[] (sized for all children of
+ *                frame->this), may adjust the gauge, and returns the
+ *                number of subvolumes to wind to
+ *
+ * Returns -EINVAL without touching the frame when the request does not
+ * come from gsyncd or name is not a marker xattr. Returns 0 once the
+ * request has been taken over, i.e. it was wound to the subvolumes or
+ * unwound with an error (ENOMEM on allocation failure, EINVAL when
+ * populate_args() yields no subvolume).
+ */
+int
+cluster_handle_marker_getxattr(call_frame_t *frame, loc_t *loc,
+                               const char *name, char *vol_uuid,
+                               xlator_specf_unwind_t unwind,
+                               int (*populate_args)(call_frame_t *frame,
+                                                    int type, int *gauge,
+                                                    xlator_t **subvols))
+{
+    xlator_t *this = frame->this;
+    xlator_t **subvols = NULL;
+    int num_subvols = 0;
+    int type = 0;
+    int i = 0;
+    int op_errno = ENOMEM;
+    int gauge[MCNT_MAX] = {0};
+    xl_marker_local_t *local = NULL;
+
+    if (GF_CLIENT_PID_GSYNCD != frame->root->pid)
+        return -EINVAL;
+
+    if (name == NULL)
+        return -EINVAL;
+
+    if (strcmp(GF_XATTR_MARKER_KEY, name) == 0) {
+        type = MARKER_UUID_TYPE;
+        memcpy(gauge, marker_uuid_default_gauge, sizeof(gauge));
+    } else if (match_uuid_local(name, vol_uuid) == 0) {
+        type = MARKER_XTIME_TYPE;
+        memcpy(gauge, marker_xtime_default_gauge, sizeof(gauge));
+    } else {
+        return -EINVAL;
+    }
+
+    num_subvols = _get_children_count(this);
+    subvols = alloca(num_subvols * sizeof(*subvols));
+    num_subvols = populate_args(frame, type, gauge, subvols);
+
+    /* With nothing to wind to, no callback would ever run and the frame
+     * would never be unwound; fail the request right away instead. */
+    if (num_subvols <= 0) {
+        op_errno = EINVAL;
+        goto fail;
+    }
+
+    local = GF_CALLOC(1, sizeof(*local), gf_common_mt_libxl_marker_local);
+
+    if (!local)
+        goto fail;
+
+    /* keep the caller's local; cluster_marker_unwind() restores it */
+    local->xl_local = frame->local;
+    local->call_count = num_subvols;
+    local->xl_specf_unwind = unwind;
+    local->vol_uuid = vol_uuid;
+    memcpy(local->gauge, gauge, sizeof(local->gauge));
+
+    frame->local = local;
+
+    /* local must not be touched inside this loop: the callback of the
+     * last reply frees it */
+    for (i = 0; i < num_subvols; i++) {
+        if (MARKER_UUID_TYPE == type)
+            STACK_WIND(frame, cluster_markeruuid_cbk, subvols[i],
+                       subvols[i]->fops->getxattr, loc, name, NULL);
+        else if (MARKER_XTIME_TYPE == type)
+            STACK_WIND(frame, cluster_markerxtime_cbk, subvols[i],
+                       subvols[i]->fops->getxattr, loc, name, NULL);
+    }
+
+    return 0;
+fail:
+    if (unwind)
+        unwind(frame, -1, op_errno, NULL, NULL);
+    else
+        default_getxattr_failure_cbk(frame, op_errno);
+    return 0;
+}
diff --git a/xlators/lib/src/libxlator.h b/xlators/lib/src/libxlator.h
new file mode 100644
index 00000000000..81da4060d55
--- /dev/null
+++ b/xlators/lib/src/libxlator.h
@@ -0,0 +1,147 @@
+/*
+   Copyright (c) 2008-2012 Red Hat, Inc. <http://www.redhat.com>
+   This file is part of GlusterFS.
+
+   This file is licensed to you under your choice of the GNU Lesser
+   General Public License, version 3 or any later version (LGPLv3 or
+   later), or the GNU General Public License, version 2 (GPLv2), in all
+   cases as published by the Free Software Foundation.
+*/
+#ifndef _LIBXLATOR_H
+#define _LIBXLATOR_H
+
+#include <glusterfs/defaults.h>
+
+#include <stdint.h>             // for int32_t
+#include "glusterfs/dict.h"     // for dict_t, data_t
+#include "glusterfs/globals.h"  // for xlator_t, loc_t
+#include "glusterfs/stack.h"    // for call_frame_t
+#include <glusterfs/compat.h>
+#include <glusterfs/compat-errno.h>
+
+/* Common prefix of the marker xattr names handled by libxlator. */
+#define MARKER_XATTR_PREFIX "trusted.glusterfs"
+/* Last component of the per-volume "<prefix>.<uuid>.xtime" xattr. */
+#define XTIME "xtime"
+/* Last component of the volume-mark xattr. */
+#define VOLUME_MARK "volume-mark"
+/* Full volume-mark xattr name: "trusted.glusterfs.volume-mark". */
+#define GF_XATTR_MARKER_KEY MARKER_XATTR_PREFIX "." VOLUME_MARK
+/* NOTE(review): presumably the length of a textual UUID; not referenced
+ * in libxlator.c, confirm against its users. */
+#define UUID_SIZE 36
+/* Request kinds passed as "type" to the populate_args() callback. */
+#define MARKER_UUID_TYPE 1
+#define MARKER_XTIME_TYPE 2
+
+/* Xlator-specific unwind function. When one is supplied to
+ * cluster_handle_marker_getxattr(), it is called instead of the default
+ * getxattr unwind to deliver op_ret/op_errno and the result dict. */
+typedef int32_t (*xlator_specf_unwind_t)(call_frame_t *frame, int op_ret,
+                                         int op_errno, dict_t *dict,
+                                         dict_t *xdata);
+
+/* Value of the volume-mark xattr as read from a getxattr reply. The
+ * struct is packed, so the fields are laid out without padding. */
+struct volume_mark {
+    /* version pair; replies whose major/minor differ from the stored
+     * mark are ignored by cluster_markeruuid_cbk() */
+    uint8_t major;
+    uint8_t minor;
+    /* binary volume uuid, unparsed into the caller's vol_uuid buffer */
+    uint8_t uuid[16];
+    /* a non-zero retval makes this mark take precedence over marks
+     * whose retval is zero */
+    uint8_t retval;
+    /* timestamp used to pick the most recent mark (sec, then usec) */
+    uint32_t sec;
+    uint32_t usec;
+} __attribute__((__packed__));
+
+/*
+ * The enumerated type here
+ * is used to index two kinds
+ * of integer arrays:
+ * - gauges
+ * - counters
+
+ * A counter is used internally,
+ * in getxattr callbacks, to count
+ * the results, categorized as
+ * the enum names suggest. So values
+ * in the counter are always non-negative.
+
+ * Gauges are part of the API.
+ * The top-level aggregator function,
+ * cluster_handle_marker_getxattr(), seeds
+ * one from a default and lets the caller's
+ * populate_args() callback adjust it. The gauge
+ * defines an evaluation policy for the
+ * counter. That is, at the
+ * end of the aggregation process
+ * the gauge is matched against the
+ * counter, and the policy
+ * represented by the gauge decides
+ * whether to return with success or failure,
+ * and in latter case, what particular failure
+ * case (errno).
+
+ * The rules are the following: for some index i,
+ * - if gauge[i] == 0, no requirement is set
+ *   against counter[i];
+ * - if gauge[i] > 0, counter[i] >= gauge[i]
+ *   is required;
+ * - if gauge[i] < 0, counter[i] < |gauge[i]|
+ *   is required.
+
+ * If the requirement is not met, then i is mapped
+ * to the respective errno (MCNT_ENOENT -> ENOENT),
+ * or in lack of that, EINVAL.
+
+ * Cf. evaluate_marker_results() and marker_idx_errno_map[]
+ * in libxlator.c
+
+ * We provide two default gauges, one intended for xtime
+ * aggregation, other for volume mark aggregation. The
+ * policies they represent agree with the hard-coded
+ * one prior to gauges. Cf. marker_xtime_default_gauge
+ * and marker_uuid_default_gauge in libxlator.c
+ */
+
+/* Index into the gauge[] and count[] arrays; one slot per reply outcome. */
+typedef enum {
+    MCNT_FOUND,    /* reply carried the requested marker xattr */
+    MCNT_NOTFOUND, /* reply succeeded but the xtime xattr was missing */
+    MCNT_ENODATA,  /* reply failed with ENODATA */
+    MCNT_ENOTCONN, /* reply failed with ENOTCONN */
+    MCNT_ENOENT,   /* reply failed with ENOENT */
+    MCNT_EOTHER,   /* reply failed with any other errno */
+    MCNT_MAX       /* number of slots; not an outcome itself */
+} marker_result_idx_t;
+
+/* Default gauges defined in libxlator.c, indexed by marker_result_idx_t:
+ * one for xtime aggregation, one for volume-mark aggregation. */
+extern int marker_xtime_default_gauge[];
+extern int marker_uuid_default_gauge[];
+
+/* Per-request state of an aggregated marker getxattr. It is installed as
+ * frame->local by cluster_handle_marker_getxattr() and freed by
+ * cluster_marker_unwind(). */
+struct marker_str {
+    /* copy of the volume mark selected so far (volume-mark requests) */
+    struct volume_mark *volmark;
+    /* not referenced in libxlator.c */
+    data_t *data;
+
+    /* best xtime seen so far, in host and in network byte order */
+    uint32_t host_timebuf[2];
+    uint32_t net_timebuf[2];
+    /* replies still outstanding; the callback that brings it to zero
+     * unwinds the frame */
+    int32_t call_count;
+    /* evaluation policy and outcome counters, indexed by
+     * marker_result_idx_t */
+    int gauge[MCNT_MAX];
+    int count[MCNT_MAX];
+
+    /* optional xlator-specific unwind; NULL selects the default one */
+    xlator_specf_unwind_t xl_specf_unwind;
+    /* the caller's original frame->local, restored before unwinding */
+    void *xl_local;
+    /* caller-provided volume uuid string buffer */
+    char *vol_uuid;
+    /* first non-zero volume_mark retval recorded */
+    uint8_t retval;
+};
+
+typedef struct marker_str xl_marker_local_t;
+
+/* getxattr callback aggregating the per-volume xtime xattr. */
+int32_t
+cluster_markerxtime_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+                        int op_ret, int op_errno, dict_t *dict, dict_t *xdata);
+
+/* getxattr callback aggregating the volume-mark xattr. */
+int32_t
+cluster_markeruuid_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+                       int op_ret, int op_errno, dict_t *dict, dict_t *xdata);
+
+/* Entry point: winds a marker getxattr to the subvolumes chosen by
+ * populate_args(). Returns -EINVAL if the request is not a marker
+ * request from gsyncd, 0 otherwise. */
+int
+cluster_handle_marker_getxattr(call_frame_t *frame, loc_t *loc,
+                               const char *name, char *vol_uuid,
+                               xlator_specf_unwind_t unwind,
+                               int (*populate_args)(call_frame_t *frame,
+                                                    int type, int *gauge,
+                                                    xlator_t **subvols));
+/* Returns 0 if name is "<MARKER_XATTR_PREFIX>.<uuid>.xtime", -1 otherwise. */
+int
+match_uuid_local(const char *name, char *uuid);
+
+/* Folds value into dst[key], keeping the smaller stime. */
+int
+gf_get_min_stime(xlator_t *this, dict_t *dst, char *key, data_t *value);
+
+/* Folds value into dst[key], keeping the larger stime. */
+int
+gf_get_max_stime(xlator_t *this, dict_t *dst, char *key, data_t *value);
+
+#endif /* !_LIBXLATOR_H */
diff --git a/xlators/meta/Makefile.am b/xlators/meta/Makefile.am
index e1c45f3051c..af437a64d6d 100644
--- a/xlators/meta/Makefile.am
+++ b/xlators/meta/Makefile.am
@@ -1 +1 @@
-SUBDIRS=src
-\ No newline at end of file
+SUBDIRS = src
diff --git a/xlators/meta/src/Makefile.am b/xlators/meta/src/Makefile.am
index 385ff553f59..29b871d7984 100644
--- a/xlators/meta/src/Makefile.am
+++ b/xlators/meta/src/Makefile.am
@@ -1,10 +1,44 @@
-xlator_PROGRAMS = meta.so
-xlatordir = $(libdir)/glusterfs/$(PACKAGE_VERSION)/xlator/
+xlator_LTLIBRARIES = meta.la
+xlatordir = $(libdir)/glusterfs/$(PACKAGE_VERSION)/xlator
 
-meta_so_SOURCES = meta.c tree.c misc.c view.c
-noinst_HEADERS = meta.h tree.h misc.h view.h
+meta_la_LDFLAGS = -module $(GF_XLATOR_DEFAULT_LDFLAGS)
 
-AM_CFLAGS = -fPIC -D_FILE_OFFSET_BITS=64 -D_GNU_SOURCE -Wall \
-	-I$(top_srcdir)/libglusterfs/src -shared -nostartfiles
+meta_la_SOURCES = meta.c meta-helpers.c meta-defaults.c \
+	root-dir.c \
+	graphs-dir.c \
+	frames-file.c \
+	graph-dir.c \
+	active-link.c \
+	xlator-dir.c \
+	top-link.c \
+	logging-dir.c \
+	logfile-link.c \
+	loglevel-file.c \
+	process_uuid-file.c \
+	volfile-file.c \
+	view-dir.c \
+	subvolumes-dir.c \
+	subvolume-link.c \
+	type-file.c \
+	version-file.c \
+	options-dir.c \
+	option-file.c \
+	cmdline-file.c \
+	name-file.c \
+	private-file.c \
+	history-file.c \
+	mallinfo-file.c \
+	meminfo-file.c \
+	measure-file.c \
+	profile-file.c
 
-CLEANFILES = 
+meta_la_LIBADD = $(top_builddir)/libglusterfs/src/libglusterfs.la
+
+noinst_HEADERS = meta.h meta-hooks.h meta-mem-types.h
+
+AM_CPPFLAGS = $(GF_CPPFLAGS) -I$(top_srcdir)/libglusterfs/src \
+	-I$(top_srcdir)/rpc/xdr/src -I$(top_builddir)/rpc/xdr/src
+
+AM_CFLAGS = -Wall $(GF_CFLAGS)
+
+CLEANFILES =
diff --git a/xlators/meta/src/active-link.c b/xlators/meta/src/active-link.c
new file mode 100644
index 00000000000..7ee780d89e9
--- /dev/null
+++ b/xlators/meta/src/active-link.c
@@ -0,0 +1,34 @@
+/*
+   Copyright (c) 2014 Red Hat, Inc. <http://www.redhat.com>
+   This file is part of GlusterFS.
+
+   This file is licensed to you under your choice of the GNU Lesser
+   General Public License, version 3 or any later version (LGPLv3 or
+   later), or the GNU General Public License, version 2 (GPLv2), in all
+   cases as published by the Free Software Foundation.
+*/
+
+#include <glusterfs/xlator.h>
+#include <glusterfs/defaults.h>
+
+#include "meta-mem-types.h"
+#include "meta.h"
+
+/* link_fill handler of the "active" link: writes the graph_uuid of
+ * this->ctx->active into strfd. Always returns 0. */
+static int
+active_link_fill(xlator_t *this, inode_t *inode, strfd_t *strfd)
+{
+    strprintf(strfd, "%s", this->ctx->active->graph_uuid);
+
+    return 0;
+}
+
+/* meta_ops of the "active" link; only link_fill is provided. */
+struct meta_ops active_link_ops = {.link_fill = active_link_fill};
+
+/* Hook for the "active" entry: attaches active_link_ops to loc->inode.
+ * frame and xdata are unused. Always returns 0. */
+int
+meta_active_link_hook(call_frame_t *frame, xlator_t *this, loc_t *loc,
+                      dict_t *xdata)
+{
+    meta_ops_set(loc->inode, this, &active_link_ops);
+
+    return 0;
+}
diff --git a/xlators/meta/src/cmdline-file.c b/xlators/meta/src/cmdline-file.c
new file mode 100644
index 00000000000..eb24e985af9
--- /dev/null
+++ b/xlators/meta/src/cmdline-file.c
@@ -0,0 +1,39 @@
+/*
+   Copyright (c) 2014 Red Hat, Inc. <http://www.redhat.com>
+   This file is part of GlusterFS.
+
+   This file is licensed to you under your choice of the GNU Lesser
+   General Public License, version 3 or any later version (LGPLv3 or
+   later), or the GNU General Public License, version 2 (GPLv2), in all
+   cases as published by the Free Software Foundation.
+*/
+
+#include <glusterfs/xlator.h>
+#include <glusterfs/defaults.h>
+
+#include "meta-mem-types.h"
+#include "meta.h"
+#include <glusterfs/strfd.h>
+#include <glusterfs/lkowner.h>
+
+/* file_fill handler of the cmdline file: when the context has a command
+ * line string, write it into strfd as a small JSON-style object.
+ * Returns strfd->size in either case. */
+static int
+cmdline_file_fill(xlator_t *this, inode_t *file, strfd_t *strfd)
+{
+    /* nothing recorded: leave strfd untouched */
+    if (!this->ctx->cmdlinestr)
+        return strfd->size;
+
+    strprintf(strfd, "{ \n  \"Cmdlinestr\": \"%s\"\n}", this->ctx->cmdlinestr);
+
+    return strfd->size;
+}
+
+/* meta_ops of the cmdline file; only file_fill is provided. */
+static struct meta_ops cmdline_file_ops = {
+    .file_fill = cmdline_file_fill,
+};
+
+/* Hook for the cmdline file: attaches cmdline_file_ops to loc->inode.
+ * frame and xdata are unused. Always returns 0. */
+int
+meta_cmdline_file_hook(call_frame_t *frame, xlator_t *this, loc_t *loc,
+                       dict_t *xdata)
+{
+    meta_ops_set(loc->inode, this, &cmdline_file_ops);
+
+    return 0;
+}
diff --git a/xlators/meta/src/frames-file.c b/xlators/meta/src/frames-file.c
new file mode 100644
index 00000000000..9a13db9a934
--- /dev/null
+++ b/xlators/meta/src/frames-file.c
@@ -0,0 +1,107 @@
+/*
+   Copyright (c) 2014 Red Hat, Inc. <http://www.redhat.com>
+   This file is part of GlusterFS.
+
+   This file is licensed to you under your choice of the GNU Lesser
+   General Public License, version 3 or any later version (LGPLv3 or
+   later), or the GNU General Public License, version 2 (GPLv2), in all
+   cases as published by the Free Software Foundation.
+*/
+
+#include <glusterfs/xlator.h>
+#include <glusterfs/defaults.h>
+
+#include "meta-mem-types.h"
+#include "meta.h"
+#include <glusterfs/strfd.h>
+#include <glusterfs/lkowner.h>
+
+/* file_fill handler of the frames file: writes every call stack of
+ * this->ctx->pool, and every frame of each stack, into strfd as a
+ * JSON-style document. The pool lock is held for the whole walk.
+ *
+ * Returns -1 if any argument is NULL, otherwise strfd->size.
+ */
+static int
+frames_file_fill(xlator_t *this, inode_t *file, strfd_t *strfd)
+{
+    struct call_pool *pool = NULL;
+    call_stack_t *stack = NULL;
+    call_frame_t *frame = NULL;
+    int i = 0; /* number of stacks emitted so far */
+    int j = 1; /* 1-based frame number within the current stack */
+
+    if (!this || !file || !strfd)
+        return -1;
+
+    pool = this->ctx->pool;
+
+    strprintf(strfd, "{ \n\t\"Stack\": [\n");
+
+    LOCK(&pool->lock);
+    {
+        list_for_each_entry(stack, &pool->all_frames, all_frames)
+        {
+            strprintf(strfd, "\t   {\n");
+            strprintf(strfd, "\t\t\"Number\": %d,\n", ++i);
+            strprintf(strfd, "\t\t\"Frame\": [\n");
+            j = 1;
+            list_for_each_entry(frame, &stack->myframes, frames)
+            {
+                strprintf(strfd, "\t\t   {\n");
+                strprintf(strfd, "\t\t\t\"Number\": %d,\n", j++);
+                strprintf(strfd, "\t\t\t\"Xlator\": \"%s\",\n",
+                          frame->this->name);
+                /* the fields below are optional: each one is emitted
+                 * only when the frame has a value for it */
+                if (frame->begin.tv_sec)
+                    strprintf(strfd, "\t\t\t\"Creation_time\": %d.%09d,\n",
+                              (int)frame->begin.tv_sec,
+                              (int)frame->begin.tv_nsec);
+                strprintf(strfd, " \t\t\t\"Refcount\": %d,\n",
+                          frame->ref_count);
+                if (frame->parent)
+                    strprintf(strfd, "\t\t\t\"Parent\": \"%s\",\n",
+                              frame->parent->this->name);
+                if (frame->wind_from)
+                    strprintf(strfd, "\t\t\t\"Wind_from\": \"%s\",\n",
+                              frame->wind_from);
+                if (frame->wind_to)
+                    strprintf(strfd, "\t\t\t\"Wind_to\": \"%s\",\n",
+                              frame->wind_to);
+                if (frame->unwind_from)
+                    strprintf(strfd, "\t\t\t\"Unwind_from\": \"%s\",\n",
+                              frame->unwind_from);
+                if (frame->unwind_to)
+                    strprintf(strfd, "\t\t\t\"Unwind_to\": \"%s\",\n",
+                              frame->unwind_to);
+                strprintf(strfd, "\t\t\t\"Complete\": %d\n", frame->complete);
+                /* no separating comma after the last frame of a stack */
+                if (list_is_last(&frame->frames, &stack->myframes))
+                    strprintf(strfd, "\t\t   }\n");
+                else
+                    strprintf(strfd, "\t\t   },\n");
+            }
+            strprintf(strfd, "\t\t],\n");
+            strprintf(strfd, "\t\t\"Unique\": %" PRId64 ",\n", stack->unique);
+            strprintf(strfd, "\t\t\"Type\": \"%s\",\n", gf_fop_list[stack->op]);
+            strprintf(strfd, "\t\t\"UID\": %d,\n", stack->uid);
+            strprintf(strfd, "\t\t\"GID\": %d,\n", stack->gid);
+            strprintf(strfd, "\t\t\"LK_owner\": \"%s\"\n",
+                      lkowner_utoa(&stack->lk_owner));
+            /* no separating comma once i has reached pool->cnt */
+            if (i == (int)pool->cnt)
+                strprintf(strfd, "\t   }\n");
+            else
+                strprintf(strfd, "\t   },\n");
+        }
+        strprintf(strfd, "\t],\n");
+        strprintf(strfd, "\t\"Call_Count\": %d\n", (int)pool->cnt);
+        strprintf(strfd, "}");
+    }
+    UNLOCK(&pool->lock);
+
+    return strfd->size;
+}
+
+/* meta_ops of the frames file; only file_fill is provided. */
+static struct meta_ops frames_file_ops = {
+    .file_fill = frames_file_fill,
+};
+
+/* Hook for the frames file: attaches frames_file_ops to loc->inode.
+ * frame and xdata are unused. Always returns 0. */
+int
+meta_frames_file_hook(call_frame_t *frame, xlator_t *this, loc_t *loc,
+                      dict_t *xdata)
+{
+    meta_ops_set(loc->inode, this, &frames_file_ops);
+    return 0;
+}
diff --git a/xlators/meta/src/graph-dir.c b/xlators/meta/src/graph-dir.c
new file mode 100644
index 00000000000..a8f4787880d
--- /dev/null
+++ b/xlators/meta/src/graph-dir.c
@@ -0,0 +1,98 @@
+/*
+   Copyright (c) 2014 Red Hat, Inc. <http://www.redhat.com>
+   This file is part of GlusterFS.
+
+   This file is licensed to you under your choice of the GNU Lesser
+   General Public License, version 3 or any later version (LGPLv3 or
+   later), or the GNU General Public License, version 2 (GPLv2), in all
+   cases as published by the Free Software Foundation.
+*/
+
+#include <glusterfs/xlator.h>
+#include <glusterfs/defaults.h>
+
+#include "meta-mem-types.h"
+#include "meta.h"
+#include "meta-hooks.h"
+
+/* Fixed entries of a graph directory: "." and "..", the "top" link and
+ * the "volfile" file. The array ends with an entry whose name is NULL. */
+static struct meta_dirent graph_dir_dirents[] = {
+    DOT_DOTDOT,
+
+    {
+        .name = "top",
+        .type = IA_IFLNK,
+        .hook = meta_top_link_hook,
+    },
+    {
+        .name = "volfile",
+        .type = IA_IFREG,
+        .hook = meta_volfile_file_hook,
+    },
+    {.name = NULL}};
+
+/* dir_fill handler of a graph directory: builds one directory entry per
+ * xlator of the graph stored in the inode's meta context.
+ *
+ * On success *dp receives a newly allocated array (entry names are
+ * gf_strdup() copies of the xlator names) and the number of entries is
+ * returned. Returns -1, leaving *dp untouched, when the inode has no
+ * graph attached or the array cannot be allocated.
+ */
+static int
+graph_dir_fill(xlator_t *this, inode_t *inode, struct meta_dirent **dp)
+{
+    struct meta_dirent *dirents = NULL;
+    glusterfs_graph_t *graph = NULL;
+    int i = 0;
+    int count = 0;
+    xlator_t *xl = NULL;
+
+    graph = meta_ctx_get(inode, this);
+    /* meta_graph_dir_hook() stores whatever glusterfs_graph_lookup()
+     * returned, which is NULL when no graph matched the name */
+    if (!graph)
+        return -1;
+
+    /* first pass: count the xlators to size the array */
+    for (xl = graph->first; xl; xl = xl->next)
+        count++;
+
+    dirents = GF_MALLOC(sizeof(*dirents) * count, gf_meta_mt_dirents_t);
+    if (!dirents)
+        return -1;
+
+    /* second pass: one directory per xlator */
+    i = 0;
+    for (xl = graph->first; xl; xl = xl->next) {
+        dirents[i].name = gf_strdup(xl->name);
+        dirents[i].type = IA_IFDIR;
+        dirents[i].hook = meta_xlator_dir_hook;
+        i++;
+    }
+
+    *dp = dirents;
+    return i;
+}
+
+/* meta_ops of a graph directory: fixed entries plus one generated entry
+ * per xlator of the graph. */
+struct meta_ops graph_dir_ops = {
+    .fixed_dirents = graph_dir_dirents,
+    .dir_fill = graph_dir_fill,
+};
+
+/* Return the graph in this->ctx->graphs whose graph_uuid equals
+ * graph_uuid, or NULL when none matches. */
+static glusterfs_graph_t *
+glusterfs_graph_lookup(xlator_t *this, const char *graph_uuid)
+{
+    glusterfs_graph_t *candidate = NULL;
+
+    list_for_each_entry(candidate, &this->ctx->graphs, list)
+    {
+        if (!strcmp(graph_uuid, candidate->graph_uuid))
+            return candidate;
+    }
+
+    return NULL;
+}
+
+/* Hook for a graph directory: looks up the graph whose uuid equals
+ * loc->name, attaches graph_dir_ops to loc->inode and stores the graph
+ * as the inode's meta context. frame and xdata are unused.
+ * Always returns 0. */
+int
+meta_graph_dir_hook(call_frame_t *frame, xlator_t *this, loc_t *loc,
+                    dict_t *xdata)
+{
+    glusterfs_graph_t *graph = NULL;
+
+    /* NULL when no graph carries this uuid; it is stored as-is below */
+    graph = glusterfs_graph_lookup(this, loc->name);
+
+    meta_ops_set(loc->inode, this, &graph_dir_ops);
+
+    meta_ctx_set(loc->inode, this, (void *)graph);
+
+    return 0;
+}
diff --git a/xlators/meta/src/graphs-dir.c b/xlators/meta/src/graphs-dir.c
new file mode 100644
index 00000000000..a1ffbca7d5a
--- /dev/null
+++ b/xlators/meta/src/graphs-dir.c
@@ -0,0 +1,67 @@
+/*
+   Copyright (c) 2014 Red Hat, Inc. <http://www.redhat.com>
+   This file is part of GlusterFS.
+
+   This file is licensed to you under your choice of the GNU Lesser
+   General Public License, version 3 or any later version (LGPLv3 or
+   later), or the GNU General Public License, version 2 (GPLv2), in all
+   cases as published by the Free Software Foundation.
+*/
+
+#include <glusterfs/xlator.h>
+#include <glusterfs/defaults.h>
+
+#include "meta-mem-types.h"
+#include "meta.h"
+#include "meta-hooks.h"
+
+/*
+ * Fixed entries of the graphs directory, listed after DOT_DOTDOT: the
+ * "active" symlink (set up by meta_active_link_hook).  The table ends with
+ * a NULL-name sentinel.
+ */
+static struct meta_dirent graphs_dir_dirents[] = {
+    DOT_DOTDOT,
+
+    {
+        .name = "active",
+        .type = IA_IFLNK,
+        .hook = meta_active_link_hook,
+    },
+    {.name = NULL}};
+
+/*
+ * Build the dynamic entries of the graphs directory: one IA_IFDIR entry,
+ * named after graph_uuid and hooked to meta_graph_dir_hook, per graph on
+ * this->ctx->graphs.
+ *
+ * On success *dp receives a newly allocated array whose names are
+ * gf_strdup() copies, and the number of entries is returned; the caller
+ * frees every name and then the array.  Returns -1 and leaves *dp
+ * untouched when an allocation fails.
+ */
+static int
+graphs_dir_fill(xlator_t *this, inode_t *dir, struct meta_dirent **dp)
+{
+    glusterfs_graph_t *graph = NULL;
+    int graphs_count = 0;
+    int i = 0;
+    struct meta_dirent *dirents = NULL;
+
+    list_for_each_entry(graph, &this->ctx->graphs, list) { graphs_count++; }
+
+    /* the 3 spare slots are kept from the original allocation */
+    dirents = GF_CALLOC(graphs_count + 3, sizeof(*dirents),
+                        gf_meta_mt_dirents_t);
+    if (!dirents)
+        return -1;
+
+    list_for_each_entry(graph, &this->ctx->graphs, list)
+    {
+        /* never write past the length that was counted above */
+        if (i >= graphs_count)
+            break;
+
+        dirents[i].name = gf_strdup(graph->graph_uuid);
+        /* never hand out an entry with a NULL name: consumers pass the
+         * name straight to strcmp()/strlen() */
+        if (!dirents[i].name)
+            goto err;
+        dirents[i].type = IA_IFDIR;
+        dirents[i].hook = meta_graph_dir_hook;
+        i++;
+    }
+
+    *dp = dirents;
+
+    return i;
+
+err:
+    while (i-- > 0)
+        GF_FREE((void *)dirents[i].name);
+    GF_FREE(dirents);
+    return -1;
+}
+
+/* meta_ops of the graphs directory: fixed entries from graphs_dir_dirents,
+ * dynamic entries produced by graphs_dir_fill(). */
+struct meta_ops graphs_dir_ops = {.fixed_dirents = graphs_dir_dirents,
+                                  .dir_fill = graphs_dir_fill};
+
+/* Lookup hook for the graphs directory: installs graphs_dir_ops on
+ * loc->inode.  frame and xdata are unused.  Always returns 0. */
+int
+meta_graphs_dir_hook(call_frame_t *frame, xlator_t *this, loc_t *loc,
+                     dict_t *xdata)
+{
+    meta_ops_set(loc->inode, this, &graphs_dir_ops);
+
+    return 0;
+}
diff --git a/xlators/meta/src/history-file.c b/xlators/meta/src/history-file.c
new file mode 100644
index 00000000000..7742a635fed
--- /dev/null
+++ b/xlators/meta/src/history-file.c
@@ -0,0 +1,44 @@
+/*
+   Copyright (c) 2014 Red Hat, Inc. <http://www.redhat.com>
+   This file is part of GlusterFS.
+
+   This file is licensed to you under your choice of the GNU Lesser
+   General Public License, version 3 or any later version (LGPLv3 or
+   later), or the GNU General Public License, version 2 (GPLv2), in all
+   cases as published by the Free Software Foundation.
+*/
+
+#include <glusterfs/xlator.h>
+#include <glusterfs/defaults.h>
+
+#include "meta-mem-types.h"
+#include "meta.h"
+#include <glusterfs/strfd.h>
+#include <glusterfs/statedump.h>
+
+/*
+ * Produce the content of a history file: passes the xlator stored as the
+ * inode's meta context to gf_proc_dump_xlator_history(), which writes into
+ * strfd.  Returns strfd->size.
+ */
+static int
+history_file_fill(xlator_t *this, inode_t *file, strfd_t *strfd)
+{
+    xlator_t *target = meta_ctx_get(file, this);
+
+    gf_proc_dump_xlator_history(target, strfd);
+    return strfd->size;
+}
+
+/* meta_ops of a history file: content comes from history_file_fill(). */
+static struct meta_ops history_file_ops = {
+    .file_fill = history_file_fill,
+};
+
+/*
+ * Lookup hook for a history file: installs history_file_ops on loc->inode
+ * and copies the parent's meta context onto it, so the file reports on
+ * whatever object the parent directory carries.  Always returns 0.
+ */
+int
+meta_history_file_hook(call_frame_t *frame, xlator_t *this, loc_t *loc,
+                       dict_t *xdata)
+{
+    void *parent_ctx = NULL;
+
+    meta_ops_set(loc->inode, this, &history_file_ops);
+
+    parent_ctx = meta_ctx_get(loc->parent, this);
+    meta_ctx_set(loc->inode, this, parent_ctx);
+
+    return 0;
+}
diff --git a/xlators/meta/src/logfile-link.c b/xlators/meta/src/logfile-link.c
new file mode 100644
index 00000000000..616a54518c0
--- /dev/null
+++ b/xlators/meta/src/logfile-link.c
@@ -0,0 +1,34 @@
+/*
+   Copyright (c) 2014 Red Hat, Inc. <http://www.redhat.com>
+   This file is part of GlusterFS.
+
+   This file is licensed to you under your choice of the GNU Lesser
+   General Public License, version 3 or any later version (LGPLv3 or
+   later), or the GNU General Public License, version 2 (GPLv2), in all
+   cases as published by the Free Software Foundation.
+*/
+
+#include <glusterfs/xlator.h>
+#include <glusterfs/defaults.h>
+
+#include "meta-mem-types.h"
+#include "meta.h"
+
+/*
+ * Produce the target of the "logfile" symlink: prints
+ * this->ctx->log.filename into strfd.  Nothing is printed when no file
+ * name is set, because passing NULL to "%s" is undefined behaviour.
+ * NOTE(review): with nothing printed the readlink handler presumably sees
+ * an empty strfd and reports ENODATA -- confirm against strfd_open().
+ * Always returns 0.
+ */
+static int
+logfile_link_fill(xlator_t *this, inode_t *inode, strfd_t *strfd)
+{
+    const char *filename = this->ctx->log.filename;
+
+    if (filename)
+        strprintf(strfd, "%s", filename);
+
+    return 0;
+}
+
+/* meta_ops of the "logfile" symlink: target comes from logfile_link_fill(). */
+struct meta_ops logfile_link_ops = {.link_fill = logfile_link_fill};
+
+/* Lookup hook for the "logfile" symlink: installs logfile_link_ops on
+ * loc->inode.  frame and xdata are unused.  Always returns 0. */
+int
+meta_logfile_link_hook(call_frame_t *frame, xlator_t *this, loc_t *loc,
+                       dict_t *xdata)
+{
+    meta_ops_set(loc->inode, this, &logfile_link_ops);
+
+    return 0;
+}
diff --git a/xlators/meta/src/logging-dir.c b/xlators/meta/src/logging-dir.c
new file mode 100644
index 00000000000..46e6f9e95dd
--- /dev/null
+++ b/xlators/meta/src/logging-dir.c
@@ -0,0 +1,44 @@
+/*
+   Copyright (c) 2014 Red Hat, Inc. <http://www.redhat.com>
+   This file is part of GlusterFS.
+
+   This file is licensed to you under your choice of the GNU Lesser
+   General Public License, version 3 or any later version (LGPLv3 or
+   later), or the GNU General Public License, version 2 (GPLv2), in all
+   cases as published by the Free Software Foundation.
+*/
+
+#include <glusterfs/xlator.h>
+#include <glusterfs/defaults.h>
+
+#include "meta-mem-types.h"
+#include "meta.h"
+#include "meta-hooks.h"
+
+/*
+ * Fixed entries of the logging directory, listed after DOT_DOTDOT: the
+ * "logfile" symlink (meta_logfile_link_hook) and the "loglevel" regular
+ * file (meta_loglevel_file_hook).  The table ends with a NULL-name
+ * sentinel.
+ */
+static struct meta_dirent logging_dir_dirents[] = {
+    DOT_DOTDOT,
+
+    {
+        .name = "logfile",
+        .type = IA_IFLNK,
+        .hook = meta_logfile_link_hook,
+    },
+    {
+        .name = "loglevel",
+        .type = IA_IFREG,
+        .hook = meta_loglevel_file_hook,
+    },
+    {.name = NULL}};
+
+/* meta_ops of the logging directory: only fixed entries, no dir_fill. */
+struct meta_ops logging_dir_ops = {
+    .fixed_dirents = logging_dir_dirents,
+};
+
+/* Lookup hook for the logging directory: installs logging_dir_ops on
+ * loc->inode.  frame and xdata are unused.  Always returns 0. */
+int
+meta_logging_dir_hook(call_frame_t *frame, xlator_t *this, loc_t *loc,
+                      dict_t *xdata)
+{
+    meta_ops_set(loc->inode, this, &logging_dir_ops);
+
+    return 0;
+}
diff --git a/xlators/meta/src/loglevel-file.c b/xlators/meta/src/loglevel-file.c
new file mode 100644
index 00000000000..eeeeeaa5907
--- /dev/null
+++ b/xlators/meta/src/loglevel-file.c
@@ -0,0 +1,50 @@
+/*
+   Copyright (c) 2014 Red Hat, Inc. <http://www.redhat.com>
+   This file is part of GlusterFS.
+
+   This file is licensed to you under your choice of the GNU Lesser
+   General Public License, version 3 or any later version (LGPLv3 or
+   later), or the GNU General Public License, version 2 (GPLv2), in all
+   cases as published by the Free Software Foundation.
+*/
+
+#include <glusterfs/xlator.h>
+#include <glusterfs/defaults.h>
+
+#include "meta-mem-types.h"
+#include "meta.h"
+#include <glusterfs/strfd.h>
+
+/* Produce the content of the loglevel file: this->ctx->log.loglevel printed
+ * as a decimal number followed by a newline.  Returns strfd->size. */
+static int
+loglevel_file_fill(xlator_t *this, inode_t *file, strfd_t *strfd)
+{
+    strprintf(strfd, "%d\n", this->ctx->log.loglevel);
+
+    return strfd->size;
+}
+
+/*
+ * Write handler of the loglevel file: parse an integer (base detected by
+ * strtol) from the start of the first iovec and, when it lies within
+ * [GF_LOG_NONE, GF_LOG_TRACE], pass it to gf_log_set_loglevel().
+ * Unparsable or out-of-range input is ignored.  Always returns the total
+ * length of the iovecs, i.e. the whole write is reported as consumed.
+ */
+static int
+loglevel_file_write(xlator_t *this, fd_t *fd, struct iovec *iov, int count)
+{
+    char buf[32] = {0};
+    char *end = NULL;
+    size_t len = 0;
+    long int level = -1;
+
+    if (count > 0 && iov[0].iov_base) {
+        /* the payload is not guaranteed to be NUL-terminated, so parse a
+         * bounded, terminated copy instead of iov_base itself */
+        len = iov[0].iov_len;
+        if (len > sizeof(buf) - 1)
+            len = sizeof(buf) - 1;
+        memcpy(buf, iov[0].iov_base, len);
+        buf[len] = '\0';
+
+        level = strtol(buf, &end, 0);
+        /* end == buf: nothing was parsed, strtol's 0 is not a request.
+         * *end == '\0' on a truncated copy: the number may continue past
+         * the copy, so its real value is unknown. */
+        if (end != buf && !(len < iov[0].iov_len && *end == '\0') &&
+            level >= GF_LOG_NONE && level <= GF_LOG_TRACE)
+            gf_log_set_loglevel(this->ctx, level);
+    }
+
+    return iov_length(iov, count);
+}
+
+/* meta_ops of the loglevel file: readable through loglevel_file_fill(),
+ * writable through loglevel_file_write(). */
+static struct meta_ops loglevel_file_ops = {
+    .file_fill = loglevel_file_fill,
+    .file_write = loglevel_file_write,
+};
+
+/* Lookup hook for the loglevel file: installs loglevel_file_ops on
+ * loc->inode.  frame and xdata are unused.  Always returns 0. */
+int
+meta_loglevel_file_hook(call_frame_t *frame, xlator_t *this, loc_t *loc,
+                        dict_t *xdata)
+{
+    meta_ops_set(loc->inode, this, &loglevel_file_ops);
+
+    return 0;
+}
diff --git a/xlators/meta/src/mallinfo-file.c b/xlators/meta/src/mallinfo-file.c
new file mode 100644
index 00000000000..b4396d72189
--- /dev/null
+++ b/xlators/meta/src/mallinfo-file.c
@@ -0,0 +1,36 @@
+/*
+   Copyright (c) 2014 Red Hat, Inc. <http://www.redhat.com>
+   This file is part of GlusterFS.
+
+   This file is licensed to you under your choice of the GNU Lesser
+   General Public License, version 3 or any later version (LGPLv3 or
+   later), or the GNU General Public License, version 2 (GPLv2), in all
+   cases as published by the Free Software Foundation.
+*/
+
+#include <glusterfs/xlator.h>
+#include <glusterfs/defaults.h>
+
+#include "meta-mem-types.h"
+#include "meta.h"
+#include <glusterfs/statedump.h>
+
+/* Produce the content of the mallinfo file by letting
+ * gf_proc_dump_mallinfo() write into strfd.  Returns strfd->size. */
+static int
+mallinfo_file_fill(xlator_t *this, inode_t *file, strfd_t *strfd)
+{
+    gf_proc_dump_mallinfo(strfd);
+    return strfd->size;
+}
+
+/* meta_ops of the mallinfo file: content comes from mallinfo_file_fill(). */
+static struct meta_ops mallinfo_file_ops = {
+    .file_fill = mallinfo_file_fill,
+};
+
+/* Lookup hook for the mallinfo file: installs mallinfo_file_ops on
+ * loc->inode.  frame and xdata are unused.  Always returns 0. */
+int
+meta_mallinfo_file_hook(call_frame_t *frame, xlator_t *this, loc_t *loc,
+                        dict_t *xdata)
+{
+    meta_ops_set(loc->inode, this, &mallinfo_file_ops);
+
+    return 0;
+}
diff --git a/xlators/meta/src/measure-file.c b/xlators/meta/src/measure-file.c
new file mode 100644
index 00000000000..52e92e48590
--- /dev/null
+++ b/xlators/meta/src/measure-file.c
@@ -0,0 +1,49 @@
+/*
+   Copyright (c) 2014 Red Hat, Inc. <http://www.redhat.com>
+   This file is part of GlusterFS.
+
+   This file is licensed to you under your choice of the GNU Lesser
+   General Public License, version 3 or any later version (LGPLv3 or
+   later), or the GNU General Public License, version 2 (GPLv2), in all
+   cases as published by the Free Software Foundation.
+*/
+
+#include <glusterfs/xlator.h>
+#include <glusterfs/defaults.h>
+
+#include "meta-mem-types.h"
+#include "meta.h"
+#include <glusterfs/strfd.h>
+
+/* Produce the content of the measure file: this->ctx->measure_latency
+ * printed as a decimal number followed by a newline.  Returns strfd->size. */
+static int
+measure_file_fill(xlator_t *this, inode_t *file, strfd_t *strfd)
+{
+    strprintf(strfd, "%d\n", this->ctx->measure_latency);
+
+    return strfd->size;
+}
+
+/*
+ * Write handler of the measure file: parse an integer (base detected by
+ * strtol) from the start of the first iovec and store its truth value
+ * (0 or 1) in this->ctx->measure_latency.  Unparsable input leaves the
+ * setting unchanged.  Always returns the total length of the iovecs, i.e.
+ * the whole write is reported as consumed.
+ */
+static int
+measure_file_write(xlator_t *this, fd_t *fd, struct iovec *iov, int count)
+{
+    char buf[32] = {0};
+    char *end = NULL;
+    size_t len = 0;
+    long int num = -1;
+
+    if (count > 0 && iov[0].iov_base) {
+        /* the payload is not guaranteed to be NUL-terminated, so parse a
+         * bounded, terminated copy instead of iov_base itself */
+        len = iov[0].iov_len;
+        if (len > sizeof(buf) - 1)
+            len = sizeof(buf) - 1;
+        memcpy(buf, iov[0].iov_base, len);
+        buf[len] = '\0';
+
+        num = strtol(buf, &end, 0);
+        /* end == buf: nothing was parsed, strtol's 0 is not a request.
+         * *end == '\0' on a truncated copy: the number may continue past
+         * the copy, so its real value is unknown. */
+        if (end != buf && !(len < iov[0].iov_len && *end == '\0'))
+            this->ctx->measure_latency = !!num;
+    }
+
+    return iov_length(iov, count);
+}
+
+/* meta_ops of the measure file: readable through measure_file_fill(),
+ * writable through measure_file_write(). */
+static struct meta_ops measure_file_ops = {
+    .file_fill = measure_file_fill,
+    .file_write = measure_file_write,
+};
+
+/* Lookup hook for the measure file: installs measure_file_ops on
+ * loc->inode.  frame and xdata are unused.  Always returns 0. */
+int
+meta_measure_file_hook(call_frame_t *frame, xlator_t *this, loc_t *loc,
+                       dict_t *xdata)
+{
+    meta_ops_set(loc->inode, this, &measure_file_ops);
+
+    return 0;
+}
diff --git a/xlators/meta/src/meminfo-file.c b/xlators/meta/src/meminfo-file.c
new file mode 100644
index 00000000000..d889dfb2ae8
--- /dev/null
+++ b/xlators/meta/src/meminfo-file.c
@@ -0,0 +1,44 @@
+/*
+   Copyright (c) 2014 Red Hat, Inc. <http://www.redhat.com>
+   This file is part of GlusterFS.
+
+   This file is licensed to you under your choice of the GNU Lesser
+   General Public License, version 3 or any later version (LGPLv3 or
+   later), or the GNU General Public License, version 2 (GPLv2), in all
+   cases as published by the Free Software Foundation.
+*/
+
+#include <glusterfs/xlator.h>
+#include <glusterfs/defaults.h>
+
+#include "meta-mem-types.h"
+#include "meta.h"
+#include <glusterfs/strfd.h>
+#include <glusterfs/statedump.h>
+
+/*
+ * Produce the content of a meminfo file: passes the xlator stored as the
+ * inode's meta context to gf_proc_dump_xlator_meminfo(), which writes into
+ * strfd.  Returns strfd->size.
+ */
+static int
+meminfo_file_fill(xlator_t *this, inode_t *file, strfd_t *strfd)
+{
+    xlator_t *target = meta_ctx_get(file, this);
+
+    gf_proc_dump_xlator_meminfo(target, strfd);
+    return strfd->size;
+}
+
+/* meta_ops of a meminfo file: content comes from meminfo_file_fill(). */
+static struct meta_ops meminfo_file_ops = {
+    .file_fill = meminfo_file_fill,
+};
+
+/*
+ * Lookup hook for a meminfo file: installs meminfo_file_ops on loc->inode
+ * and copies the parent's meta context onto it, so the file reports on
+ * whatever object the parent directory carries.  Always returns 0.
+ */
+int
+meta_meminfo_file_hook(call_frame_t *frame, xlator_t *this, loc_t *loc,
+                       dict_t *xdata)
+{
+    void *parent_ctx = NULL;
+
+    meta_ops_set(loc->inode, this, &meminfo_file_ops);
+
+    parent_ctx = meta_ctx_get(loc->parent, this);
+    meta_ctx_set(loc->inode, this, parent_ctx);
+
+    return 0;
+}
diff --git a/xlators/meta/src/meta-defaults.c b/xlators/meta/src/meta-defaults.c
new file mode 100644
index 00000000000..91c328473f8
--- /dev/null
+++ b/xlators/meta/src/meta-defaults.c
@@ -0,0 +1,655 @@
+/*
+   Copyright (c) 2014 Red Hat, Inc. <http://www.redhat.com>
+   This file is part of GlusterFS.
+
+   This file is licensed to you under your choice of the GNU Lesser
+   General Public License, version 3 or any later version (LGPLv3 or
+   later), or the GNU General Public License, version 2 (GPLv2), in all
+   cases as published by the Free Software Foundation.
+*/
+
+#include <glusterfs/xlator.h>
+#include <glusterfs/defaults.h>
+
+#include "meta-mem-types.h"
+#include "meta.h"
+
+#include <glusterfs/compat-errno.h>
+
+/* Default fgetxattr handler: not supported; hands the frame to
+ * default_fgetxattr_failure_cbk() with EPERM. */
+int
+meta_default_fgetxattr(call_frame_t *frame, xlator_t *this, fd_t *fd,
+                       const char *name, dict_t *xdata)
+{
+    return default_fgetxattr_failure_cbk(frame, EPERM);
+}
+
+/* Default fsetxattr handler: not supported; hands the frame to
+ * default_fsetxattr_failure_cbk() with EPERM. */
+int
+meta_default_fsetxattr(call_frame_t *frame, xlator_t *this, fd_t *fd,
+                       dict_t *dict, int32_t flags, dict_t *xdata)
+{
+    return default_fsetxattr_failure_cbk(frame, EPERM);
+}
+
+/* Default setxattr handler: not supported; hands the frame to
+ * default_setxattr_failure_cbk() with EPERM. */
+int
+meta_default_setxattr(call_frame_t *frame, xlator_t *this, loc_t *loc,
+                      dict_t *dict, int32_t flags, dict_t *xdata)
+{
+    return default_setxattr_failure_cbk(frame, EPERM);
+}
+
+/* Default statfs handler: not supported; hands the frame to
+ * default_statfs_failure_cbk() with EPERM. */
+int
+meta_default_statfs(call_frame_t *frame, xlator_t *this, loc_t *loc,
+                    dict_t *xdata)
+{
+    return default_statfs_failure_cbk(frame, EPERM);
+}
+
+/* Default fsyncdir handler: not supported; hands the frame to
+ * default_fsyncdir_failure_cbk() with EPERM. */
+int
+meta_default_fsyncdir(call_frame_t *frame, xlator_t *this, fd_t *fd,
+                      int32_t flags, dict_t *xdata)
+{
+    return default_fsyncdir_failure_cbk(frame, EPERM);
+}
+
+/* Default opendir handler: does no work of its own and unwinds at once
+ * with 0/0 (success), handing back the caller's fd and xdata. */
+int
+meta_default_opendir(call_frame_t *frame, xlator_t *this, loc_t *loc, fd_t *fd,
+                     dict_t *xdata)
+{
+    META_STACK_UNWIND(opendir, frame, 0, 0, fd, xdata);
+    return 0;
+}
+
+/* Default fstat handler: fills a zero-initialised iatt through
+ * meta_iatt_fill() using the type recorded on fd->inode, then unwinds with
+ * 0/0 (success). */
+int
+meta_default_fstat(call_frame_t *frame, xlator_t *this, fd_t *fd, dict_t *xdata)
+{
+    struct iatt iatt = {};
+
+    meta_iatt_fill(&iatt, fd->inode, fd->inode->ia_type);
+
+    META_STACK_UNWIND(fstat, frame, 0, 0, &iatt, xdata);
+
+    return 0;
+}
+
+/* Default fsync handler: not supported; hands the frame to
+ * default_fsync_failure_cbk() with EPERM. */
+int
+meta_default_fsync(call_frame_t *frame, xlator_t *this, fd_t *fd, int32_t flags,
+                   dict_t *xdata)
+{
+    return default_fsync_failure_cbk(frame, EPERM);
+}
+
+/* Default flush handler: nothing to flush; unwinds at once with 0/0
+ * (success) and the caller's xdata. */
+int
+meta_default_flush(call_frame_t *frame, xlator_t *this, fd_t *fd, dict_t *xdata)
+{
+    META_STACK_UNWIND(flush, frame, 0, 0, xdata);
+    return 0;
+}
+
+/*
+ * Default writev handler: forwards the vector to the file_write callback
+ * of the inode's meta_ops.
+ *
+ * A non-negative callback result is unwound as op_ret with op_errno 0; a
+ * negative result is unwound as op_ret -1 with the negated result as
+ * op_errno.  Both iatt arguments are an unfilled, zero-initialised iatt.
+ * Inodes without meta_ops or without a file_write callback are handed to
+ * default_writev_failure_cbk() with EPERM.
+ */
+int
+meta_default_writev(call_frame_t *frame, xlator_t *this, fd_t *fd,
+                    struct iovec *vector, int32_t count, off_t off,
+                    uint32_t flags, struct iobref *iobref, dict_t *xdata)
+{
+    struct iatt unused = {};
+    struct meta_ops *ops = meta_ops_get(fd->inode, this);
+    int written = 0;
+
+    if (!ops || !ops->file_write)
+        return default_writev_failure_cbk(frame, EPERM);
+
+    written = ops->file_write(this, fd, vector, count);
+
+    if (written < 0) {
+        META_STACK_UNWIND(writev, frame, -1, -written, &unused, &unused,
+                          xdata);
+    } else {
+        META_STACK_UNWIND(writev, frame, written, 0, &unused, &unused, xdata);
+    }
+
+    return 0;
+}
+
+/*
+ * Default readv handler: serve the read from the buffer kept in the fd's
+ * meta context (meta_fd->data, meta_fd->size).
+ *
+ * When the recorded size is 0, meta_file_fill() is called first.  An iobuf
+ * of the requested size is allocated and wrapped in an iobref; at most
+ * "size" bytes starting at "offset" (clamped to the buffer size) are copied
+ * into it, and the copied length is unwound as op_ret.
+ *
+ * Fails with ENODATA when the fd context cannot be obtained and with ENOMEM
+ * when the iobuf or iobref cannot be set up.
+ */
+int
+meta_default_readv(call_frame_t *frame, xlator_t *this, fd_t *fd, size_t size,
+                   off_t offset, uint32_t flags, dict_t *xdata)
+{
+    meta_fd_t *meta_fd = NULL;
+    struct iovec iov = {};
+    struct iobuf *iobuf = NULL;
+    struct iobref *iobref = NULL;
+    off_t copy_offset = 0;
+    int copy_size = 0;
+    /* unwound as-is: the stat buffer of the reply stays zero-filled */
+    struct iatt iatt = {};
+
+    meta_fd = meta_fd_get(fd, this);
+    if (!meta_fd)
+        return default_readv_failure_cbk(frame, ENODATA);
+
+    /* size 0 means nothing is buffered on this fd (yet) */
+    if (!meta_fd->size)
+        meta_file_fill(this, fd);
+
+    iobuf = iobuf_get2(this->ctx->iobuf_pool, size);
+    if (!iobuf)
+        return default_readv_failure_cbk(frame, ENOMEM);
+
+    iobref = iobref_new();
+    if (!iobref) {
+        iobuf_unref(iobuf);
+        return default_readv_failure_cbk(frame, ENOMEM);
+    }
+
+    if (iobref_add(iobref, iobuf) != 0) {
+        iobref_unref(iobref);
+        iobuf_unref(iobuf);
+        return default_readv_failure_cbk(frame, ENOMEM);
+    }
+
+    iov.iov_base = iobuf_ptr(iobuf);
+
+    /* iobref would have taken a ref */
+    iobuf_unref(iobuf);
+
+    /* clamp the start to the end of the buffer, then the length to what is
+     * left after it; a read at or past the end copies nothing */
+    copy_offset = min(meta_fd->size, offset);
+    copy_size = min(size, (meta_fd->size - copy_offset));
+
+    if (copy_size)
+        memcpy(iov.iov_base, meta_fd->data + copy_offset, copy_size);
+    iov.iov_len = copy_size;
+
+    /* the trailing 0 is passed in the xdata position */
+    META_STACK_UNWIND(readv, frame, copy_size, 0, &iov, 1, &iatt, iobref, 0);
+
+    iobref_unref(iobref);
+
+    return 0;
+}
+
+/*
+ * Default open handler: always succeeds.  The reply xdata is whatever
+ * meta_direct_io_mode() returns for the request xdata and frame.
+ * NOTE(review): presumably that helper asks for direct-I/O on the fd --
+ * confirm against its definition.
+ */
+int
+meta_default_open(call_frame_t *frame, xlator_t *this, loc_t *loc,
+                  int32_t flags, fd_t *fd, dict_t *xdata)
+{
+    dict_t *xdata_rsp = NULL;
+
+    xdata_rsp = meta_direct_io_mode(xdata, frame);
+
+    META_STACK_UNWIND(open, frame, 0, 0, fd, xdata_rsp);
+
+    return 0;
+}
+
+/* Default create handler: not supported; hands the frame to
+ * default_create_failure_cbk() with EPERM. */
+int
+meta_default_create(call_frame_t *frame, xlator_t *this, loc_t *loc,
+                    int32_t flags, mode_t mode, mode_t umask, fd_t *fd,
+                    dict_t *xdata)
+{
+    return default_create_failure_cbk(frame, EPERM);
+}
+
+/* Default link handler: not supported; hands the frame to
+ * default_link_failure_cbk() with EPERM. */
+int
+meta_default_link(call_frame_t *frame, xlator_t *this, loc_t *oldloc,
+                  loc_t *newloc, dict_t *xdata)
+{
+    return default_link_failure_cbk(frame, EPERM);
+}
+
+/* Default rename handler: not supported; hands the frame to
+ * default_rename_failure_cbk() with EPERM. */
+int
+meta_default_rename(call_frame_t *frame, xlator_t *this, loc_t *oldloc,
+                    loc_t *newloc, dict_t *xdata)
+{
+    return default_rename_failure_cbk(frame, EPERM);
+}
+
+/* Default symlink handler: not supported; hands the frame to
+ * default_symlink_failure_cbk() with EPERM. */
+int
+meta_default_symlink(call_frame_t *frame, xlator_t *this, const char *linkpath,
+                     loc_t *loc, mode_t umask, dict_t *xdata)
+{
+    return default_symlink_failure_cbk(frame, EPERM);
+}
+
+/* Default rmdir handler: not supported; hands the frame to
+ * default_rmdir_failure_cbk() with EPERM. */
+int
+meta_default_rmdir(call_frame_t *frame, xlator_t *this, loc_t *loc, int flags,
+                   dict_t *xdata)
+{
+    return default_rmdir_failure_cbk(frame, EPERM);
+}
+
+/* Default unlink handler: not supported; hands the frame to
+ * default_unlink_failure_cbk() with EPERM. */
+int
+meta_default_unlink(call_frame_t *frame, xlator_t *this, loc_t *loc, int xflag,
+                    dict_t *xdata)
+{
+    return default_unlink_failure_cbk(frame, EPERM);
+}
+
+/* Default mkdir handler: not supported; hands the frame to
+ * default_mkdir_failure_cbk() with EPERM. */
+int
+meta_default_mkdir(call_frame_t *frame, xlator_t *this, loc_t *loc, mode_t mode,
+                   mode_t umask, dict_t *xdata)
+{
+    return default_mkdir_failure_cbk(frame, EPERM);
+}
+
+/* Default mknod handler: not supported; hands the frame to
+ * default_mknod_failure_cbk() with EPERM. */
+int
+meta_default_mknod(call_frame_t *frame, xlator_t *this, loc_t *loc, mode_t mode,
+                   dev_t rdev, mode_t umask, dict_t *xdata)
+{
+    return default_mknod_failure_cbk(frame, EPERM);
+}
+
+/*
+ * Default readlink handler: let the link_fill callback of the inode's
+ * meta_ops print the link target into a temporary strfd and unwind with
+ * that string and its strlen() as op_ret.
+ *
+ * Fails with EPERM when the inode has no meta_ops or no link_fill
+ * callback, with ENOMEM when the strfd cannot be opened and with ENODATA
+ * when the callback left strfd->data NULL.
+ */
+int
+meta_default_readlink(call_frame_t *frame, xlator_t *this, loc_t *loc,
+                      size_t size, dict_t *xdata)
+{
+    struct meta_ops *ops = NULL;
+    strfd_t *strfd = NULL;
+    struct iatt iatt = {};
+    int len = -1;
+
+    ops = meta_ops_get(loc->inode, this);
+    if (!ops || !ops->link_fill) {
+        META_STACK_UNWIND(readlink, frame, -1, EPERM, 0, 0, 0);
+        return 0;
+    }
+
+    strfd = strfd_open();
+    if (!strfd) {
+        META_STACK_UNWIND(readlink, frame, -1, ENOMEM, 0, 0, 0);
+        return 0;
+    }
+
+    /* the callback's return value is not consulted; only strfd->data is */
+    ops->link_fill(this, loc->inode, strfd);
+
+    meta_iatt_fill(&iatt, loc->inode, IA_IFLNK);
+
+    if (strfd->data) {
+        len = strlen(strfd->data);
+        META_STACK_UNWIND(readlink, frame, len, 0, strfd->data, &iatt, xdata);
+    } else
+        META_STACK_UNWIND(readlink, frame, -1, ENODATA, 0, 0, 0);
+
+    /* closed only after the unwind, which was handed strfd->data */
+    strfd_close(strfd);
+
+    return 0;
+}
+
+/* Default access handler: not supported; hands the frame to
+ * default_access_failure_cbk() with EPERM. */
+int
+meta_default_access(call_frame_t *frame, xlator_t *this, loc_t *loc,
+                    int32_t mask, dict_t *xdata)
+{
+    return default_access_failure_cbk(frame, EPERM);
+}
+
+/* Default ftruncate handler: changes nothing and reports success.  The
+ * same iatt, filled through meta_iatt_fill() as IA_IFREG, is unwound for
+ * both iatt arguments; "offset" is ignored. */
+int
+meta_default_ftruncate(call_frame_t *frame, xlator_t *this, fd_t *fd,
+                       off_t offset, dict_t *xdata)
+{
+    struct iatt iatt = {};
+
+    meta_iatt_fill(&iatt, fd->inode, IA_IFREG);
+
+    META_STACK_UNWIND(ftruncate, frame, 0, 0, &iatt, &iatt, xdata);
+
+    return 0;
+}
+
+/* Default getxattr handler: not supported; hands the frame to
+ * default_getxattr_failure_cbk() with EPERM. */
+int
+meta_default_getxattr(call_frame_t *frame, xlator_t *this, loc_t *loc,
+                      const char *name, dict_t *xdata)
+{
+    return default_getxattr_failure_cbk(frame, EPERM);
+}
+
+/* Default xattrop handler: not supported; hands the frame to
+ * default_xattrop_failure_cbk() with EPERM. */
+int
+meta_default_xattrop(call_frame_t *frame, xlator_t *this, loc_t *loc,
+                     gf_xattrop_flags_t flags, dict_t *dict, dict_t *xdata)
+{
+    return default_xattrop_failure_cbk(frame, EPERM);
+}
+
+/* Default fxattrop handler: not supported; hands the frame to
+ * default_fxattrop_failure_cbk() with EPERM. */
+int
+meta_default_fxattrop(call_frame_t *frame, xlator_t *this, fd_t *fd,
+                      gf_xattrop_flags_t flags, dict_t *dict, dict_t *xdata)
+{
+    return default_fxattrop_failure_cbk(frame, EPERM);
+}
+
+/* Default removexattr handler: not supported; hands the frame to
+ * default_removexattr_failure_cbk() with EPERM. */
+int
+meta_default_removexattr(call_frame_t *frame, xlator_t *this, loc_t *loc,
+                         const char *name, dict_t *xdata)
+{
+    return default_removexattr_failure_cbk(frame, EPERM);
+}
+
+/* Default fremovexattr handler: not supported; hands the frame to
+ * default_fremovexattr_failure_cbk() with EPERM. */
+int
+meta_default_fremovexattr(call_frame_t *frame, xlator_t *this, fd_t *fd,
+                          const char *name, dict_t *xdata)
+{
+    return default_fremovexattr_failure_cbk(frame, EPERM);
+}
+
+/* Default lk handler: not supported; hands the frame to
+ * default_lk_failure_cbk() with EPERM. */
+int
+meta_default_lk(call_frame_t *frame, xlator_t *this, fd_t *fd, int32_t cmd,
+                struct gf_flock *lock, dict_t *xdata)
+{
+    return default_lk_failure_cbk(frame, EPERM);
+}
+
+/* Default inodelk handler: not supported; hands the frame to
+ * default_inodelk_failure_cbk() with EPERM. */
+int
+meta_default_inodelk(call_frame_t *frame, xlator_t *this, const char *volume,
+                     loc_t *loc, int32_t cmd, struct gf_flock *lock,
+                     dict_t *xdata)
+{
+    return default_inodelk_failure_cbk(frame, EPERM);
+}
+
+/* Default finodelk handler: not supported; hands the frame to
+ * default_finodelk_failure_cbk() with EPERM. */
+int
+meta_default_finodelk(call_frame_t *frame, xlator_t *this, const char *volume,
+                      fd_t *fd, int32_t cmd, struct gf_flock *lock,
+                      dict_t *xdata)
+{
+    return default_finodelk_failure_cbk(frame, EPERM);
+}
+
+/* Default entrylk handler: not supported; hands the frame to
+ * default_entrylk_failure_cbk() with EPERM. */
+int
+meta_default_entrylk(call_frame_t *frame, xlator_t *this, const char *volume,
+                     loc_t *loc, const char *basename, entrylk_cmd cmd,
+                     entrylk_type type, dict_t *xdata)
+{
+    return default_entrylk_failure_cbk(frame, EPERM);
+}
+
+/* Default fentrylk handler: not supported; hands the frame to
+ * default_fentrylk_failure_cbk() with EPERM. */
+int
+meta_default_fentrylk(call_frame_t *frame, xlator_t *this, const char *volume,
+                      fd_t *fd, const char *basename, entrylk_cmd cmd,
+                      entrylk_type type, dict_t *xdata)
+{
+    return default_fentrylk_failure_cbk(frame, EPERM);
+}
+
+/* Default rchecksum handler: not supported; hands the frame to
+ * default_rchecksum_failure_cbk() with EPERM. */
+int
+meta_default_rchecksum(call_frame_t *frame, xlator_t *this, fd_t *fd,
+                       off_t offset, int32_t len, dict_t *xdata)
+{
+    return default_rchecksum_failure_cbk(frame, EPERM);
+}
+
+/*
+ * Default readdir handler: list the directory's fixed entries followed by
+ * the dynamic entries kept in the fd's meta context.
+ *
+ * "off" is the index of the first entry to return, counted over the fixed
+ * table and then the dynamic array; every returned entry carries
+ * d_off = index + 1, so the next call resumes right after it.  Entries are
+ * added until the next one would push the accumulated
+ * sizeof(gf_dirent_t) + strlen(name) + 1 past "size".
+ *
+ * Unwinds with the number of entries returned, or with -1/ENOMEM when the
+ * inode has no meta_ops, the fd context is unavailable, or not even the
+ * first entry could be allocated.
+ */
+int
+meta_default_readdir(call_frame_t *frame, xlator_t *this, fd_t *fd, size_t size,
+                     off_t off, dict_t *xdata)
+{
+    meta_fd_t *meta_fd = NULL;
+    int i = 0;
+    gf_dirent_t head;
+    gf_dirent_t *list = NULL;
+    int ret = 0;
+    int this_size = 0;
+    int filled_size = 0;
+    int fixed_size = 0;
+    int dyn_size = 0;
+    struct meta_dirent *fixed_dirents = NULL;
+    struct meta_dirent *dyn_dirents = NULL;
+    struct meta_dirent *dirents = NULL;
+    struct meta_dirent *end = NULL;
+    struct meta_ops *ops = NULL;
+
+    INIT_LIST_HEAD(&head.list);
+
+    ops = meta_ops_get(fd->inode, this);
+    if (!ops)
+        goto err;
+
+    meta_fd = meta_fd_get(fd, this);
+    if (!meta_fd)
+        goto err;
+
+    meta_dir_fill(this, fd);
+
+    fixed_dirents = ops->fixed_dirents;
+    fixed_size = fixed_dirents_len(fixed_dirents);
+
+    dyn_dirents = meta_fd->dirents;
+    dyn_size = meta_fd->size;
+
+    for (i = off; i < (fixed_size + dyn_size);) {
+        /* pick the table index i falls into; the inner loop walks to the
+         * end of that table and the outer loop then switches tables */
+        if (i >= fixed_size) {
+            dirents = dyn_dirents + (i - fixed_size);
+            end = dyn_dirents + dyn_size;
+        } else {
+            dirents = fixed_dirents + i;
+            end = fixed_dirents + fixed_size;
+        }
+
+        while (dirents < end) {
+            /* a dir_fill whose name duplication failed leaves a NULL name;
+             * skip it but keep the offsets of the other entries stable */
+            if (!dirents->name) {
+                i++;
+                dirents++;
+                continue;
+            }
+
+            this_size = sizeof(gf_dirent_t) + strlen(dirents->name) + 1;
+            if (this_size + filled_size > size)
+                goto unwind;
+
+            list = gf_dirent_for_name(dirents->name);
+            if (!list) {
+                /* leaving only the inner loop would re-enter the outer one
+                 * with the same i and spin for as long as allocation keeps
+                 * failing; return what was collected, or ENOMEM if nothing */
+                if (ret == 0)
+                    goto err;
+                goto unwind;
+            }
+
+            list->d_off = i + 1;
+            list->d_ino = i + 42;
+            switch (dirents->type) {
+                case IA_IFDIR:
+                    list->d_type = DT_DIR;
+                    break;
+                case IA_IFCHR:
+                    list->d_type = DT_CHR;
+                    break;
+                case IA_IFBLK:
+                    list->d_type = DT_BLK;
+                    break;
+                case IA_IFIFO:
+                    list->d_type = DT_FIFO;
+                    break;
+                case IA_IFLNK:
+                    list->d_type = DT_LNK;
+                    break;
+                case IA_IFREG:
+                    list->d_type = DT_REG;
+                    break;
+                case IA_IFSOCK:
+                    list->d_type = DT_SOCK;
+                    break;
+                case IA_INVAL:
+                    list->d_type = DT_UNKNOWN;
+                    break;
+            }
+
+            list_add_tail(&list->list, &head.list);
+            ret++;
+            i++;
+            dirents++;
+            filled_size += this_size;
+        }
+    }
+
+unwind:
+    META_STACK_UNWIND(readdir, frame, ret, 0, &head, xdata);
+
+    gf_dirent_free(&head);
+
+    return 0;
+err:
+    /* only reached while the list is still empty, so nothing to free */
+    META_STACK_UNWIND(readdir, frame, -1, ENOMEM, 0, 0);
+    return 0;
+}
+
+/* Default readdirp handler: forwards unchanged to meta_default_readdir(),
+ * which unwinds the frame as a readdir. */
+int
+meta_default_readdirp(call_frame_t *frame, xlator_t *this, fd_t *fd,
+                      size_t size, off_t off, dict_t *xdata)
+{
+    return meta_default_readdir(frame, this, fd, size, off, xdata);
+}
+
+/* Default setattr handler: not supported; hands the frame to
+ * default_setattr_failure_cbk() with EPERM. */
+int
+meta_default_setattr(call_frame_t *frame, xlator_t *this, loc_t *loc,
+                     struct iatt *stbuf, int32_t valid, dict_t *xdata)
+{
+    return default_setattr_failure_cbk(frame, EPERM);
+}
+
+/* Default truncate handler: changes nothing and reports success.  The same
+ * iatt, filled through meta_iatt_fill() as IA_IFREG, is unwound for both
+ * iatt arguments; "offset" is ignored. */
+int
+meta_default_truncate(call_frame_t *frame, xlator_t *this, loc_t *loc,
+                      off_t offset, dict_t *xdata)
+{
+    struct iatt iatt = {};
+
+    meta_iatt_fill(&iatt, loc->inode, IA_IFREG);
+
+    META_STACK_UNWIND(truncate, frame, 0, 0, &iatt, &iatt, xdata);
+
+    return 0;
+}
+
+/* Default stat handler: fills a zero-initialised iatt through
+ * meta_iatt_fill() using the type recorded on loc->inode, then unwinds
+ * with 0/0 (success). */
+int
+meta_default_stat(call_frame_t *frame, xlator_t *this, loc_t *loc,
+                  dict_t *xdata)
+{
+    struct iatt iatt = {};
+
+    meta_iatt_fill(&iatt, loc->inode, loc->inode->ia_type);
+
+    META_STACK_UNWIND(stat, frame, 0, 0, &iatt, xdata);
+
+    return 0;
+}
+
+/*
+ * Default lookup handler.
+ *
+ * Without loc->name the request is passed to meta_inode_discover().
+ * Otherwise the name is searched first in the parent's fixed entries and
+ * then in the entries produced by the parent's dir_fill callback.  On a
+ * match the entry's hook is run on loc, an iatt is filled for the entry's
+ * type and the lookup unwinds successfully; without a match, or when the
+ * matched entry has no hook, it unwinds with ENOENT.  A parent without
+ * meta_ops is handed to default_lookup_failure_cbk() with EPERM.
+ */
+int
+meta_default_lookup(call_frame_t *frame, xlator_t *this, loc_t *loc,
+                    dict_t *xdata)
+{
+    struct meta_ops *ops = NULL;
+    struct meta_dirent *dirent = NULL;
+    struct meta_dirent *dp = NULL;
+    int i = 0;
+    int ret = 0;
+
+    if (!loc->name)
+        return meta_inode_discover(frame, this, loc, xdata);
+
+    ops = meta_ops_get(loc->parent, this);
+    if (!ops)
+        return default_lookup_failure_cbk(frame, EPERM);
+
+    for (dirent = ops->fixed_dirents; dirent && dirent->name; dirent++) {
+        if (strcmp(dirent->name, loc->name) == 0)
+            goto hook;
+    }
+
+    /* not a fixed entry: the loop above left dirent on the sentinel */
+    dirent = NULL;
+    if (ops->dir_fill)
+        ret = ops->dir_fill(this, loc->parent, &dp);
+
+    /* a negative ret (dir_fill failure) skips this loop and the cleanup */
+    for (i = 0; i < ret; i++) {
+        /* a dir_fill whose name duplication failed leaves a NULL name */
+        if (dp[i].name && strcmp(dp[i].name, loc->name) == 0) {
+            dirent = &dp[i];
+            goto hook;
+        }
+    }
+hook:
+    if (dirent && dirent->hook) {
+        /* the parent iatt is unwound zero-filled */
+        struct iatt parent = {};
+        struct iatt iatt = {};
+
+        dirent->hook(frame, this, loc, xdata);
+
+        meta_iatt_fill(&iatt, loc->inode, dirent->type);
+
+        META_STACK_UNWIND(lookup, frame, 0, 0, loc->inode, &iatt, xdata,
+                          &parent);
+    } else {
+        META_STACK_UNWIND(lookup, frame, -1, ENOENT, 0, 0, 0, 0);
+    }
+
+    /* dirent may point into dp, so dp is released only after its last use */
+    for (i = 0; i < ret; i++)
+        GF_FREE((void *)dp[i].name);
+    GF_FREE(dp);
+
+    return 0;
+}
+
+/* Default fsetattr handler: not supported; hands the frame to
+ * default_fsetattr_failure_cbk() with EPERM. */
+int
+meta_default_fsetattr(call_frame_t *frame, xlator_t *this, fd_t *fd,
+                      struct iatt *stbuf, int32_t valid, dict_t *xdata)
+{
+    return default_fsetattr_failure_cbk(frame, EPERM);
+}
+
+/* Default fallocate handler: not supported; hands the frame to
+ * default_fallocate_failure_cbk() with EPERM. */
+int
+meta_default_fallocate(call_frame_t *frame, xlator_t *this, fd_t *fd,
+                       int32_t keep_size, off_t offset, size_t len,
+                       dict_t *xdata)
+{
+    return default_fallocate_failure_cbk(frame, EPERM);
+}
+
+/* Default discard handler: not supported; hands the frame to
+ * default_discard_failure_cbk() with EPERM. */
+int
+meta_default_discard(call_frame_t *frame, xlator_t *this, fd_t *fd,
+                     off_t offset, size_t len, dict_t *xdata)
+{
+    return default_discard_failure_cbk(frame, EPERM);
+}
+
+/* Default zerofill handler: not supported; hands the frame to
+ * default_zerofill_failure_cbk() with EPERM. */
+int
+meta_default_zerofill(call_frame_t *frame, xlator_t *this, fd_t *fd,
+                      off_t offset, off_t len, dict_t *xdata)
+{
+    return default_zerofill_failure_cbk(frame, EPERM);
+}
+
+/* Install meta_default_<name> into the <name> slot of the fops table f,
+ * unless that slot already holds a handler.  f is parenthesised so that
+ * any pointer expression can be passed. */
+#define SET_META_DEFAULT_FOP(f, name)                                          \
+    do {                                                                       \
+        if (!(f)->name)                                                        \
+            (f)->name = meta_default_##name;                                   \
+    } while (0)
+
+/*
+ * Complete a fops table with the meta defaults: every slot listed below
+ * that is still NULL receives the matching meta_default_<fop> handler;
+ * slots that already hold a handler are left untouched.  Returns fops.
+ */
+struct xlator_fops *
+meta_defaults_init(struct xlator_fops *fops)
+{
+    SET_META_DEFAULT_FOP(fops, create);
+    SET_META_DEFAULT_FOP(fops, open);
+    SET_META_DEFAULT_FOP(fops, stat);
+    SET_META_DEFAULT_FOP(fops, readlink);
+    SET_META_DEFAULT_FOP(fops, mknod);
+    SET_META_DEFAULT_FOP(fops, mkdir);
+    SET_META_DEFAULT_FOP(fops, unlink);
+    SET_META_DEFAULT_FOP(fops, rmdir);
+    SET_META_DEFAULT_FOP(fops, symlink);
+    SET_META_DEFAULT_FOP(fops, rename);
+    SET_META_DEFAULT_FOP(fops, link);
+    SET_META_DEFAULT_FOP(fops, truncate);
+    SET_META_DEFAULT_FOP(fops, readv);
+    SET_META_DEFAULT_FOP(fops, writev);
+    SET_META_DEFAULT_FOP(fops, statfs);
+    SET_META_DEFAULT_FOP(fops, flush);
+    SET_META_DEFAULT_FOP(fops, fsync);
+    SET_META_DEFAULT_FOP(fops, setxattr);
+    SET_META_DEFAULT_FOP(fops, getxattr);
+    SET_META_DEFAULT_FOP(fops, fsetxattr);
+    SET_META_DEFAULT_FOP(fops, fgetxattr);
+    SET_META_DEFAULT_FOP(fops, removexattr);
+    SET_META_DEFAULT_FOP(fops, fremovexattr);
+    SET_META_DEFAULT_FOP(fops, opendir);
+    SET_META_DEFAULT_FOP(fops, readdir);
+    SET_META_DEFAULT_FOP(fops, readdirp);
+    SET_META_DEFAULT_FOP(fops, fsyncdir);
+    SET_META_DEFAULT_FOP(fops, access);
+    SET_META_DEFAULT_FOP(fops, ftruncate);
+    SET_META_DEFAULT_FOP(fops, fstat);
+    SET_META_DEFAULT_FOP(fops, lk);
+    SET_META_DEFAULT_FOP(fops, inodelk);
+    SET_META_DEFAULT_FOP(fops, finodelk);
+    SET_META_DEFAULT_FOP(fops, entrylk);
+    SET_META_DEFAULT_FOP(fops, fentrylk);
+    SET_META_DEFAULT_FOP(fops, lookup);
+    SET_META_DEFAULT_FOP(fops, rchecksum);
+    SET_META_DEFAULT_FOP(fops, xattrop);
+    SET_META_DEFAULT_FOP(fops, fxattrop);
+    SET_META_DEFAULT_FOP(fops, setattr);
+    SET_META_DEFAULT_FOP(fops, fsetattr);
+    SET_META_DEFAULT_FOP(fops, fallocate);
+    SET_META_DEFAULT_FOP(fops, discard);
+    SET_META_DEFAULT_FOP(fops, zerofill);
+
+    return fops;
+}
diff --git a/xlators/meta/src/meta-helpers.c b/xlators/meta/src/meta-helpers.c
new file mode 100644
index 00000000000..cb54f547468
--- /dev/null
+++ b/xlators/meta/src/meta-helpers.c
@@ -0,0 +1,332 @@
+/*
+   Copyright (c) 2014 Red Hat, Inc. <http://www.redhat.com>
+   This file is part of GlusterFS.
+
+   This file is licensed to you under your choice of the GNU Lesser
+   General Public License, version 3 or any later version (LGPLv3 or
+   later), or the GNU General Public License, version 2 (GPLv2), in all
+   cases as published by the Free Software Foundation.
+*/
+
+#include <glusterfs/xlator.h>
+#include <glusterfs/defaults.h>
+
+#include "meta-mem-types.h"
+#include "meta.h"
+
+/* Return the meta_fd_t kept in fd's context for this xlator, allocating one
+ * with GF_CALLOC and attaching it on first use. Returns NULL if that fails. */
+meta_fd_t *
+meta_fd_get(fd_t *fd, xlator_t *this)
+{
+    uint64_t value = 0;
+    meta_fd_t *meta_fd = NULL;
+
+    LOCK(&fd->lock);
+    {
+        if (__fd_ctx_get(fd, this, &value) >= 0) {
+            meta_fd = (void *)(uintptr_t)value;
+        } else {
+            meta_fd = GF_CALLOC(1, sizeof(*meta_fd), gf_meta_mt_fd_t);
+            /* An unattached ctx would leak: release could never find it. */
+            if (meta_fd && __fd_ctx_set(fd, this, (uintptr_t)meta_fd) < 0) {
+                GF_FREE(meta_fd);
+                meta_fd = NULL;
+            }
+        }
+    }
+    UNLOCK(&fd->lock);
+
+    return meta_fd;
+}
+
+/* Free the meta_fd_t attached to fd by this xlator, along with the dirent
+ * names, the dirent array and the data buffer it owns. Always returns 0. */
+int
+meta_fd_release(fd_t *fd, xlator_t *this)
+{
+    uint64_t ctx = 0;
+    meta_fd_t *mfd = NULL;
+    int idx = 0;
+
+    fd_ctx_get(fd, this, &ctx);
+    mfd = (void *)(uintptr_t)ctx;
+    if (!mfd)
+        return 0; /* nothing is attached to this fd */
+    if (mfd->dirents) {
+        for (idx = 0; idx < mfd->size; idx++)
+            GF_FREE((void *)mfd->dirents[idx].name);
+        GF_FREE(mfd->dirents);
+    }
+    GF_FREE(mfd->data);
+    GF_FREE(mfd);
+    return 0;
+}
+
+/* Fetch the meta_ops pointer this xlator keeps in the second context value
+ * of inode (the first value is the one read by meta_ctx_get()). Yields
+ * NULL when the lookup leaves the value at its initial 0. */
+struct meta_ops *
+meta_ops_get(inode_t *inode, xlator_t *this)
+{
+    uint64_t slot = 0;
+
+    inode_ctx_get2(inode, this, NULL, &slot);
+
+    return (void *)(uintptr_t)slot;
+}
+
+/* Select the fop table used for inode: the fops embedded in its meta_ops
+ * when one is attached, otherwise default_fops. */
+struct xlator_fops *
+meta_fops_get(inode_t *inode, xlator_t *this)
+{
+    struct meta_ops *attached = meta_ops_get(inode, this);
+
+    if (attached)
+        return &attached->fops;
+    return default_fops;
+}
+
+/* Attach ops to inode as this xlator's second context value. The fops
+ * table inside ops is first handed to meta_defaults_init(), so ops itself
+ * is modified by this call. Returns the result of inode_ctx_set2(). */
+int
+meta_ops_set(inode_t *inode, xlator_t *this, struct meta_ops *ops)
+{
+    uint64_t slot = 0;
+
+    meta_defaults_init(&ops->fops);
+
+    slot = (long)ops;
+
+    return inode_ctx_set2(inode, this, NULL, &slot);
+}
+
+/* Fetch the opaque pointer this xlator keeps in the first context value of
+ * inode (the second value is the one read by meta_ops_get()). Yields NULL
+ * when the lookup leaves the value at its initial 0. */
+void *
+meta_ctx_get(inode_t *inode, xlator_t *this)
+{
+    uint64_t slot = 0;
+
+    inode_ctx_get2(inode, this, &slot, 0);
+
+    return (void *)(uintptr_t)slot;
+}
+
+/* Store ctx as this xlator's first context value on inode; a null pointer
+ * is passed for the second value. The pointer is widened to uint64_t
+ * through long. Returns the result of inode_ctx_set2(). */
+int
+meta_ctx_set(inode_t *inode, xlator_t *this, void *ctx)
+{
+    uint64_t slot = 0;
+
+    slot = (long)ctx;
+
+    return inode_ctx_set2(inode, this, &slot, 0);
+}
+
+/* Release a frame-local: drop the reference on its xdata dict, if any, and
+ * free the structure. A NULL local is ignored; 'this' is not used. */
+void
+meta_local_cleanup(meta_local_t *local, xlator_t *this)
+{
+    if (local) {
+        if (local->xdata)
+            dict_unref(local->xdata);
+
+        GF_FREE(local);
+    }
+}
+
+/* Return frame->local, allocating and storing one on first use (NULL if the
+ * allocation fails). */
+meta_local_t *
+meta_local(call_frame_t *frame)
+{
+    if (!frame->local)
+        frame->local = GF_CALLOC(1, sizeof(meta_local_t), gf_meta_mt_local_t);
+
+    return frame->local;
+}
+
+/* Set "direct-io-mode"=1 on xdata, or on a dict kept in (and unref'd with)
+ * the frame-local when xdata is NULL. Returns the dict used, NULL on failure. */
+dict_t *
+meta_direct_io_mode(dict_t *xdata, call_frame_t *frame)
+{
+    meta_local_t *local = xdata ? NULL : meta_local(frame);
+
+    if (!xdata) {
+        if (!local)
+            return NULL;
+        /* Reuse a dict left by an earlier call instead of leaking it. */
+        if (!local->xdata)
+            local->xdata = dict_new();
+        xdata = local->xdata;
+    }
+    if (!xdata || dict_set_int8(xdata, "direct-io-mode", 1) != 0)
+        return NULL;
+    return xdata;
+}
+
+static void /* copy src into dst, substituting a generated uuid for null */
+meta_uuid_copy(uuid_t dst, uuid_t src)
+{
+    gf_uuid_copy(dst, src);
+    if (gf_uuid_is_null(dst)) /* null after the copy: generate one instead */
+        gf_uuid_generate(dst);
+}
+
+/* Fill iatt with synthetic attributes for a meta inode of the given type:
+ * uid/gid 0, size 0, gfid taken from the inode (generated when null) and
+ * all three timestamps set to the current time. Directories get 0555 and 2
+ * links, symlinks 0777, anything else 0644 when tunable or 0444 otherwise. */
+static void
+default_meta_iatt_fill(struct iatt *iatt, inode_t *inode, ia_type_t type,
+                       gf_boolean_t is_tunable)
+{
+    struct timeval now = {};
+    mode_t mode = is_tunable ? 0644 : 0444;
+    int links = 1;
+
+    if (type == IA_IFDIR) {
+        mode = 0555;
+        links = 2;
+    } else if (type == IA_IFLNK) {
+        mode = 0777;
+    }
+
+    iatt->ia_type = type;
+    iatt->ia_prot = ia_prot_from_st_mode(mode);
+    iatt->ia_nlink = links;
+    iatt->ia_uid = 0;
+    iatt->ia_gid = 0;
+    iatt->ia_size = 0;
+
+    meta_uuid_copy(iatt->ia_gfid, inode->gfid);
+    iatt->ia_ino = gfid_to_ino(iatt->ia_gfid);
+
+    gettimeofday(&now, 0);
+    iatt->ia_mtime = iatt->ia_ctime = iatt->ia_atime = now.tv_sec;
+    iatt->ia_mtime_nsec = iatt->ia_ctime_nsec = iatt->ia_atime_nsec =
+        (now.tv_usec * 1000);
+}
+
+/* Fill iatt for a meta inode. The iatt_fill callback of the inode's
+ * meta_ops is used when set; otherwise default attributes are written,
+ * with the entry treated as tunable when its meta_ops has a file_write
+ * handler. iatt is left untouched if the inode has no meta_ops at all,
+ * so callers are expected to pass in an initialised structure. */
+void
+meta_iatt_fill(struct iatt *iatt, inode_t *inode, ia_type_t type)
+{
+    struct meta_ops *ops = meta_ops_get(inode, THIS);
+
+    if (ops && ops->iatt_fill)
+        ops->iatt_fill(THIS, inode, iatt);
+    else if (ops)
+        default_meta_iatt_fill(iatt, inode, type, !!ops->file_write);
+}
+
+/* Unwind a lookup for loc->inode with freshly filled attributes. */
+int
+meta_inode_discover(call_frame_t *frame, xlator_t *this, loc_t *loc,
+                    dict_t *xdata)
+{
+    struct iatt attrs = {}, parent_attrs = {}; /* parent stays all-zero */
+
+    meta_iatt_fill(&attrs, loc->inode, loc->inode->ia_type);
+
+    META_STACK_UNWIND(lookup, frame, 0, 0, loc->inode, &attrs, xdata,
+                      &parent_attrs);
+    return 0;
+}
+
+/* Fill the per-fd content buffer by running the inode's file_fill callback
+ * into a strfd and taking over its data. The result is cached on the fd.
+ * Returns the content size, or -1 on any failure (including a failed fill). */
+int
+meta_file_fill(xlator_t *this, fd_t *fd)
+{
+    meta_fd_t *meta_fd = NULL;
+    strfd_t *strfd = NULL;
+    struct meta_ops *ops = NULL;
+    int ret = 0;
+
+    meta_fd = meta_fd_get(fd, this);
+    if (!meta_fd)
+        return -1;
+
+    if (meta_fd->data)
+        return meta_fd->size;
+
+    /* Resolve ops before opening the strfd so a miss needs no cleanup. */
+    ops = meta_ops_get(fd->inode, this);
+    if (!ops)
+        return -1;
+
+    strfd = strfd_open();
+    if (!strfd)
+        return -1;
+
+    if (ops->file_fill)
+        ret = ops->file_fill(this, fd->inode, strfd);
+
+    if (ret >= 0) {
+        meta_fd->data = strfd->data;
+        meta_fd->size = strfd->size;
+        strfd->data = NULL; /* buffer is now owned by meta_fd */
+    }
+    strfd_close(strfd);
+
+    return (ret < 0) ? -1 : meta_fd->size;
+}
+
+/* Fill the per-fd directory listing through the inode's dir_fill callback.
+ * The listing is cached on the fd, so later calls just return the stored
+ * size. Returns -1 when no meta_fd or meta_ops is available. */
+int
+meta_dir_fill(xlator_t *this, fd_t *fd)
+{
+    struct meta_dirent *entries = NULL;
+    struct meta_ops *ops = NULL;
+    int count = 0;
+    meta_fd_t *mfd = meta_fd_get(fd, this);
+
+    if (!mfd)
+        return -1;
+
+    if (!mfd->dirents) {
+        ops = meta_ops_get(fd->inode, this);
+        if (!ops)
+            return -1;
+        if (ops->dir_fill)
+            count = ops->dir_fill(this, fd->inode, &entries);
+
+        if (entries) {
+            mfd->dirents = entries;
+            mfd->size = count;
+        }
+    }
+
+    return mfd->size;
+}
+
+/* Count the entries of a dirent array terminated by an entry whose name is
+ * NULL. A NULL array counts as empty. */
+int
+fixed_dirents_len(struct meta_dirent *dirents)
+{
+    int count = 0;
+
+    if (dirents) {
+        while (dirents[count].name)
+            count++;
+    }
+
+    return count;
+}
diff --git a/xlators/meta/src/meta-hooks.h b/xlators/meta/src/meta-hooks.h
new file mode 100644
index 00000000000..7208641398a
--- /dev/null
+++ b/xlators/meta/src/meta-hooks.h
@@ -0,0 +1,48 @@
+/*
+   Copyright (c) 2014 Red Hat, Inc. <http://www.redhat.com>
+   This file is part of GlusterFS.
+
+   This file is licensed to you under your choice of the GNU Lesser
+   General Public License, version 3 or any later version (LGPLv3 or
+   later), or the GNU General Public License, version 2 (GPLv2), in all
+   cases as published by the Free Software Foundation.
+*/
+
+#ifndef __META_HOOKS_H
+#define __META_HOOKS_H
+#include <glusterfs/xlator.h>
+
+#define DECLARE_HOOK(name)                                                     \
+    int meta_##name##_hook(call_frame_t *frame, xlator_t *this, loc_t *loc,    \
+                           dict_t *xdata) /* prototype of meta_<name>_hook() */
+
+DECLARE_HOOK(root_dir); /* meta_root_dir_hook(), called from meta_lookup() */
+DECLARE_HOOK(graphs_dir);
+DECLARE_HOOK(frames_file);
+DECLARE_HOOK(graph_dir);
+DECLARE_HOOK(active_link);
+DECLARE_HOOK(xlator_dir);
+DECLARE_HOOK(top_link);
+DECLARE_HOOK(logging_dir);
+DECLARE_HOOK(logfile_link);
+DECLARE_HOOK(loglevel_file);
+DECLARE_HOOK(process_uuid_file);
+DECLARE_HOOK(volfile_file);
+DECLARE_HOOK(view_dir);
+DECLARE_HOOK(subvolumes_dir);
+DECLARE_HOOK(subvolume_link);
+DECLARE_HOOK(type_file);
+DECLARE_HOOK(version_file);
+DECLARE_HOOK(options_dir);
+DECLARE_HOOK(option_file);
+DECLARE_HOOK(cmdline_file);
+DECLARE_HOOK(name_file);
+DECLARE_HOOK(private_file);
+DECLARE_HOOK(mallinfo_file);
+DECLARE_HOOK(history_file);
+DECLARE_HOOK(master_dir);
+DECLARE_HOOK(meminfo_file);
+DECLARE_HOOK(measure_file);
+DECLARE_HOOK(profile_file);
+
+#endif
diff --git a/xlators/meta/src/meta-mem-types.h b/xlators/meta/src/meta-mem-types.h
new file mode 100644
index 00000000000..033c306682f
--- /dev/null
+++ b/xlators/meta/src/meta-mem-types.h
@@ -0,0 +1,25 @@
+/*
+   Copyright (c) 2014 Red Hat, Inc. <http://www.redhat.com>
+   This file is part of GlusterFS.
+
+   This file is licensed to you under your choice of the GNU Lesser
+   General Public License, version 3 or any later version (LGPLv3 or
+   later), or the GNU General Public License, version 2 (GPLv2), in all
+   cases as published by the Free Software Foundation.
+*/
+
+#ifndef __META_MEM_TYPES_H__
+#define __META_MEM_TYPES_H__
+
+#include <glusterfs/mem-types.h>
+
+enum gf_meta_mem_types_ {
+    gf_meta_mt_priv_t = gf_common_mt_end + 1, /* ids start past common ones */
+    gf_meta_mt_fd_t, /* meta_fd_t, allocated in meta_fd_get() */
+    gf_meta_mt_fd_data_t,
+    gf_meta_mt_strfd_t,
+    gf_meta_mt_dirents_t,
+    gf_meta_mt_local_t, /* meta_local_t, allocated in meta_local() */
+    gf_meta_mt_end /* last enumerator */
+};
+#endif
diff --git a/xlators/meta/src/meta.c b/xlators/meta/src/meta.c
index e0cfe630c24..e1b9a2b6581 100644
--- a/xlators/meta/src/meta.c
+++ b/xlators/meta/src/meta.c
@@ -1,1285 +1,276 @@
 /*
-   Copyright (c) 2006-2009 Z RESEARCH, Inc. <http://www.zresearch.com>
+   Copyright (c) 2014 Red Hat, Inc. <http://www.redhat.com>
    This file is part of GlusterFS.
 
-   GlusterFS is free software; you can redistribute it and/or modify
-   it under the terms of the GNU General Public License as published
-   by the Free Software Foundation; either version 3 of the License,
-   or (at your option) any later version.
-
-   GlusterFS is distributed in the hope that it will be useful, but
-   WITHOUT ANY WARRANTY; without even the implied warranty of
-   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
-   General Public License for more details.
-
-   You should have received a copy of the GNU General Public License
-   along with this program.  If not, see
-   <http://www.gnu.org/licenses/>.
+   This file is licensed to you under your choice of the GNU Lesser
+   General Public License, version 3 or any later version (LGPLv3 or
+   later), or the GNU General Public License, version 2 (GPLv2), in all
+   cases as published by the Free Software Foundation.
 */
 
-#include <sys/types.h>
-#include <sys/stat.h>
-#include <unistd.h>
-
-#ifndef _CONFIG_H
-#define _CONFIG_H
-#include "config.h"
-#endif
-
-#include "glusterfs.h"
-#include "dict.h"
-#include "xlator.h"
+#include <glusterfs/xlator.h>
+#include <glusterfs/defaults.h>
 
+#include "meta-mem-types.h"
 #include "meta.h"
-#include "view.h"
-
-int32_t
-meta_getattr_cbk (call_frame_t *frame,
-		  void *cookie,
-		  xlator_t *this,
-		  int32_t op_ret,
-		  int32_t op_errno,
-		  struct stat *buf)
-{
-  STACK_UNWIND (frame, op_ret, op_errno, buf);
-  return 0;
-}
-
-int32_t
-meta_getattr (call_frame_t *frame,
-	      xlator_t *this,
-	      const char *path)
-{
-  meta_private_t *priv = (meta_private_t *) this->private;
-  meta_dirent_t *root = priv->tree;
-  meta_dirent_t *file = lookup_meta_entry (root, path, NULL);
-  
-  if (file) {
-    if (file->fops && file->fops->getattr) {
-      STACK_WIND (frame, meta_getattr_cbk,
-		  this, file->fops->getattr, path);
-      return 0;
-    }
-    else {
-      STACK_UNWIND (frame, 0, 0, file->stbuf);
-      return 0;
-    }
-  }
-  else {
-    STACK_WIND (frame, meta_getattr_cbk,
-	      FIRST_CHILD(this),
-	      FIRST_CHILD(this)->fops->getattr,
-	      path);
-    return 0;
-  }
-}
-
-int32_t
-meta_chmod_cbk (call_frame_t *frame,
-		void *cookie,
-		xlator_t *this,
-		int32_t op_ret,
-		int32_t op_errno,
-		struct stat *buf)
-{
-  STACK_UNWIND (frame,
-		op_ret,
-		op_errno,
-		buf);
-  return 0;
-}
-
-int32_t
-meta_chmod (call_frame_t *frame,
-	    xlator_t *this,
-	    const char *path,
-	    mode_t mode)
-{
-  STACK_WIND (frame,
-	      meta_chmod_cbk,
-	      FIRST_CHILD(this),
-	      FIRST_CHILD(this)->fops->chmod,
-	      path,
-	      mode);
-  return 0;
-}
-
-int32_t
-meta_chown_cbk (call_frame_t *frame,
-		void *cookie,
-		xlator_t *this,
-		int32_t op_ret,
-		int32_t op_errno,
-		struct stat *buf)
-{
-  STACK_UNWIND (frame,
-		op_ret,
-		op_errno,
-		buf);
-  return 0;
-}
-
-int32_t
-meta_chown (call_frame_t *frame,
-	    xlator_t *this,
-	    const char *path,
-	    uid_t uid,
-	    gid_t gid)
-{
-  STACK_WIND (frame,	      
-	      meta_chown_cbk,
-	      FIRST_CHILD(this),
-	      FIRST_CHILD(this)->fops->chown,
-	      path,
-	      uid,
-	      gid);
-  return 0;
-}
-
-
-int32_t
-meta_truncate_cbk (call_frame_t *frame,
-		   void *cookie,
-		   xlator_t *this,
-		   int32_t op_ret,
-		   int32_t op_errno,
-		   struct stat *buf)
-{
-  STACK_UNWIND (frame,
-		op_ret,
-		op_errno,
-		buf);
-  return 0;
-}
-
-int32_t
-meta_truncate (call_frame_t *frame,
-	       xlator_t *this,
-	       const char *path,
-	       off_t offset)
-{
-  STACK_WIND (frame,
-	      meta_truncate_cbk,
-	      FIRST_CHILD(this),
-	      FIRST_CHILD(this)->fops->truncate,
-	      path,
-	      offset);
-  return 0;
-}
-
-
-int32_t
-meta_ftruncate_cbk (call_frame_t *frame,
-		    void *cookie,
-		    xlator_t *this,
-		    int32_t op_ret,
-		    int32_t op_errno,
-		    struct stat *buf)
-{
-  STACK_UNWIND (frame,
-		op_ret,
-		op_errno,
-		buf);
-  return 0;
-}
-
-int32_t
-meta_ftruncate (call_frame_t *frame,
-		xlator_t *this,
-		dict_t *fd,
-		off_t offset)
-{
-  STACK_WIND (frame,
-	      meta_ftruncate_cbk,
-	      FIRST_CHILD(this),
-	      FIRST_CHILD(this)->fops->ftruncate,
-	      fd,
-	      offset);
-  return 0;
-}
-
-
-int32_t
-meta_utimes_cbk (call_frame_t *frame,
-		 void *cookie,
-		 xlator_t *this,
-		 int32_t op_ret,
-		 int32_t op_errno,
-		 struct stat *buf)
-{
-  STACK_UNWIND (frame,
-		op_ret,
-		op_errno,
-		buf);
-  return 0;
-}
-
-int32_t
-meta_utimes (call_frame_t *frame,
-	     xlator_t *this,
-	     const char *path,
-	     struct timespec *buf)
-{
-  STACK_WIND (frame,
-	      meta_utimes_cbk,
-	      FIRST_CHILD(this),
-	      FIRST_CHILD(this)->fops->utimes,
-	      path,
-	      buf);
-  return 0;
-}
-
 
-int32_t
-meta_access_cbk (call_frame_t *frame,
-		 void *cookie,
-		 xlator_t *this,
-		 int32_t op_ret,
-		 int32_t op_errno)
-{
-  STACK_UNWIND (frame,
-		op_ret,
-		op_errno);
-  return 0;
-}
-
-int32_t
-meta_access (call_frame_t *frame,
-	     xlator_t *this,
-	     const char *path,
-	     mode_t mode)
-{
-  STACK_WIND (frame,
-	      meta_access_cbk,
-	      FIRST_CHILD(this),
-	      FIRST_CHILD(this)->fops->access,
-	      path,
-	      mode);
-  return 0;
-}
-
-int32_t
-meta_readlink_cbk (call_frame_t *frame,
-		   void *cookie,
-		   xlator_t *this,
-		   int32_t op_ret,
-		   int32_t op_errno,
-		   char *dest)
-{
-  STACK_UNWIND (frame,
-		op_ret,
-		op_errno,
-		dest);
-  return 0;
-}
-
-int32_t
-meta_readlink (call_frame_t *frame,
-	       xlator_t *this,
-	       const char *path,
-	       size_t size)
-{
-  STACK_WIND (frame,
-	      meta_readlink_cbk,
-	      FIRST_CHILD(this),
-	      FIRST_CHILD(this)->fops->readlink,
-	      path,
-	      size);
-  return 0;
-}
+#include "meta-hooks.h"
 
-int32_t
-meta_mknod_cbk (call_frame_t *frame,
-		void *cookie,
-		xlator_t *this,
-		int32_t op_ret,
-		int32_t op_errno,
-		struct stat *buf)
+int
+meta_lookup(call_frame_t *frame, xlator_t *this, loc_t *loc, dict_t *xdata)
 {
-  STACK_UNWIND (frame,
-		op_ret,
-		op_errno,
-		buf);
-  return 0;
-}
+    inode_t *inode = NULL;
 
-int32_t
-meta_mknod (call_frame_t *frame,
-	    xlator_t *this,
-	    const char *path,
-	    mode_t mode,
-	    dev_t dev)
-{
-  STACK_WIND (frame,
-	      meta_mknod_cbk,
-	      FIRST_CHILD(this),
-	      FIRST_CHILD(this)->fops->mknod,
-	      path,
-	      mode,
-	      dev);
-  return 0;
-}
+    if (META_HOOK(loc) || IS_META_ROOT_GFID(loc->gfid)) {
+        struct iatt iatt = {};
+        struct iatt parent = {};
 
-int32_t
-meta_mkdir_cbk (call_frame_t *frame,
-		void *cookie,
-		xlator_t *this,
-		int32_t op_ret,
-		int32_t op_errno,
-		struct stat *buf)
-{
-  STACK_UNWIND (frame,
-		op_ret,
-		op_errno,
-		buf);
-  return 0;
-}
+        meta_root_dir_hook(frame, this, loc, xdata);
 
-int32_t
-meta_mkdir (call_frame_t *frame,
-	    xlator_t *this,
-	    const char *path,
-	    mode_t mode)
-{
-  STACK_WIND (frame,
-	      meta_mkdir_cbk,
-	      FIRST_CHILD(this),
-	      FIRST_CHILD(this)->fops->mkdir,
-	      path,
-	      mode);
-  return 0;
-}
-
-int32_t
-meta_unlink_cbk (call_frame_t *frame,
-		 void *cookie,
-		 xlator_t *this,
-		 int32_t op_ret,
-		 int32_t op_errno)
-{
-  STACK_UNWIND (frame,
-		op_ret,
-		op_errno);
-  return 0;
-}
+        meta_iatt_fill(&iatt, loc->inode, IA_IFDIR);
+        gf_uuid_parse(META_ROOT_GFID, iatt.ia_gfid);
 
-int32_t
-meta_unlink (call_frame_t *frame,
-	     xlator_t *this,
-	     const char *path)
-{
-  STACK_WIND (frame,
-	      meta_unlink_cbk,
-	      FIRST_CHILD(this),
-	      FIRST_CHILD(this)->fops->unlink,
-	      path);
-  return 0;
-}
-
-int32_t
-meta_rmdir_cbk (call_frame_t *frame,
-		void *cookie,
-		xlator_t *this,
-		int32_t op_ret,
-		int32_t op_errno)
-{
-  STACK_UNWIND (frame,
-		op_ret,
-		op_errno);
-  return 0;
-}
-
-int32_t
-meta_rmdir (call_frame_t *frame,
-	    xlator_t *this,
-	    const char *path)
-{
-  STACK_WIND (frame,
-	      meta_rmdir_cbk,
-	      FIRST_CHILD(this),
-	      FIRST_CHILD(this)->fops->rmdir,
-	      path);
-  return 0;
-}
-
-int32_t
-meta_symlink_cbk (call_frame_t *frame,
-		  void *cookie,
-		  xlator_t *this,
-		  int32_t op_ret,
-		  int32_t op_errno,
-		  struct stat *buf)
-{
-  STACK_UNWIND (frame,
-		op_ret,
-		op_errno,
-		buf);
-  return 0;
-}
-
-int32_t
-meta_symlink (call_frame_t *frame,
-	      xlator_t *this,
-	      const char *oldpath,
-	      const char *newpath)
-{
-  STACK_WIND (frame,
-	      meta_symlink_cbk,
-	      FIRST_CHILD(this),
-	      FIRST_CHILD(this)->fops->symlink,
-	      oldpath,
-	      newpath);
-  return 0;
-}
-
-int32_t
-meta_rename_cbk (call_frame_t *frame,
-		 void *cookie,
-		 xlator_t *this,
-		 int32_t op_ret,
-		 int32_t op_errno)
-{
-  STACK_UNWIND (frame,
-		op_ret,
-		op_errno);
-  return 0;
-}
-
-int32_t
-meta_rename (call_frame_t *frame,
-	     xlator_t *this,
-	     const char *oldpath,
-	     const char *newpath)
-{
-  STACK_WIND (frame,
-	      meta_rename_cbk,
-	      FIRST_CHILD(this),
-	      FIRST_CHILD(this)->fops->rename,
-	      oldpath,
-	      newpath);
-  return 0;
-}
-
-int32_t
-meta_link_cbk (call_frame_t *frame,
-	       void *cookie,
-	       xlator_t *this,
-	       int32_t op_ret,
-	       int32_t op_errno,
-	       struct stat *buf)
-{
-  STACK_UNWIND (frame,
-		op_ret,
-		op_errno,
-		buf);
-  return 0;
-}
-
-int32_t
-meta_link (call_frame_t *frame,
-	   xlator_t *this,
-	   const char *oldpath,
-	   const char *newpath)
-{
-  STACK_WIND (frame,
-	      meta_link_cbk,
-	      FIRST_CHILD(this),
-	      FIRST_CHILD(this)->fops->link,
-	      oldpath,
-	      newpath);
-  return 0;
-}
-
-struct _open_local {
-  const char *path;
-};
+        META_STACK_UNWIND(lookup, frame, 0, 0, loc->inode, &iatt, xdata,
+                          &parent);
+        return 0;
+    }
 
-int32_t
-meta_open_cbk (call_frame_t *frame, void *cookie,
-	       xlator_t *this, int32_t op_ret, int32_t op_errno,
-	       dict_t *ctx, struct stat *buf)
-{
-  struct _open_local *local = frame->local;
-  if (local)
-    dict_set (ctx, this->name, str_to_data (local->path));
-  STACK_UNWIND (frame, op_ret, op_errno, ctx, buf);
-  return 0;
-}
+    if (loc->parent)
+        inode = loc->parent;
+    else
+        inode = loc->inode;
 
-int32_t
-meta_open (call_frame_t *frame, xlator_t *this,
-	   const char *path, int32_t flags, mode_t mode)
-{
-  meta_private_t *priv = (meta_private_t *) this->private;
-  meta_dirent_t *root = priv->tree;
-  meta_dirent_t *file = lookup_meta_entry (root, path, NULL);
+    META_FOP(inode, lookup, frame, this, loc, xdata);
 
-  if (file) {
-    if (file->fops && file->fops->open) {
-      struct _open_local *local = CALLOC (1, sizeof (struct _open_local));
-      ERR_ABORT (local);
-      local->path = strdup (path);
-      frame->local = local;
-      STACK_WIND (frame, meta_open_cbk,
-		  this, file->fops->open,
-		  path, flags, mode);
-      return 0;
-    }
-    else {
-      dict_t *ctx = get_new_dict ();
-      dict_ref (ctx);
-      dict_set (ctx, this->name, str_to_data (strdup (path)));
-      STACK_UNWIND (frame, 0, 0, ctx, file->stbuf);
-      return 0;
-    }
-  }
-  else {  
-    STACK_WIND (frame, meta_open_cbk,
-		FIRST_CHILD(this), FIRST_CHILD(this)->fops->open,
-		path, flags, mode);
     return 0;
-  }
 }
 
-int32_t
-meta_create (call_frame_t *frame, xlator_t *this,
-	     const char *path, int32_t flags, mode_t mode)
+int
+meta_opendir(call_frame_t *frame, xlator_t *this, loc_t *loc, fd_t *fd,
+             dict_t *xdata)
 {
-  meta_private_t *priv = (meta_private_t *) this->private;
-  meta_dirent_t *root = priv->tree;
-  meta_dirent_t *file = lookup_meta_entry (root, path, NULL);
+    META_FOP(fd->inode, opendir, frame, this, loc, fd, xdata);
 
-  if (file) {
-    if (file->fops && file->fops->create) {
-      struct _open_local *local = CALLOC (1, sizeof (struct _open_local));
-      ERR_ABORT (local);
-      local->path = strdup (path);
-      frame->local = local;
-      STACK_WIND (frame, meta_open_cbk,
-		  this, file->fops->create,
-		  path, flags, mode);
-      return 0;
-    }
-    else {
-      STACK_UNWIND (frame, -1, 0, NULL, NULL);
-      return 0;
-    }
-  }
-  else {
-    STACK_WIND (frame, meta_open_cbk,
-		FIRST_CHILD(this), FIRST_CHILD(this)->fops->create,
-		path, flags, mode);
     return 0;
-  }
 }
 
-int32_t
-meta_readv_cbk (call_frame_t *frame,
-		void *cookie,
-		xlator_t *this,
-		int32_t op_ret,
-		int32_t op_errno,
-		struct iovec *vector,
-		int32_t count)
+int
+meta_open(call_frame_t *frame, xlator_t *this, loc_t *loc, int flags, fd_t *fd,
+          dict_t *xdata)
 {
-  STACK_UNWIND (frame,
-		op_ret,
-		op_errno,
-		vector,
-		count);
-  return 0;
-}
+    META_FOP(fd->inode, open, frame, this, loc, flags, fd, xdata);
 
-int32_t
-meta_readv (call_frame_t *frame,
-	    xlator_t *this,
-	    dict_t *fd,
-	    size_t size,
-	    off_t offset)
-{
-  meta_private_t *priv = (meta_private_t *) this->private;
-  meta_dirent_t *root = priv->tree;
-  data_t *path_data = dict_get (fd, this->name);
-
-  if (path_data) {
-    const char *path = data_to_str (path_data);
-    meta_dirent_t *file = lookup_meta_entry (root, path, NULL);
-  
-    if (file && file->fops && file->fops->readv) {
-      STACK_WIND (frame, meta_readv_cbk, 
-		  this, file->fops->readv,
-		  fd, size, offset);
-      return 0;
-    }
-  }
-  else {
-    STACK_WIND (frame, meta_readv_cbk,
-		FIRST_CHILD(this), FIRST_CHILD(this)->fops->readv,
-		fd, size, offset);
     return 0;
-  }
 }
 
-int32_t
-meta_writev_cbk (call_frame_t *frame, void *cookie,
-		 xlator_t *this, int32_t op_ret,
-		 int32_t op_errno)
-{
-  STACK_UNWIND (frame, op_ret, op_errno);
-  return 0;
-}
-
-int32_t
-meta_writev (call_frame_t *frame, xlator_t *this,
-	     dict_t *fd, 
-	     struct iovec *vector, int32_t count, off_t offset)
+int
+meta_readv(call_frame_t *frame, xlator_t *this, fd_t *fd, size_t size,
+           off_t offset, uint32_t flags, dict_t *xdata)
 {
-  meta_private_t *priv = (meta_private_t *) this->private;
-  meta_dirent_t *root = priv->tree;
-  data_t *path_data = dict_get (fd, this->name);
+    META_FOP(fd->inode, readv, frame, this, fd, size, offset, flags, xdata);
 
-  if (path_data) {
-    const char *path = data_to_str (path_data);
-    meta_dirent_t *file = lookup_meta_entry (root, path, NULL);
-  
-    if (file && file->fops && file->fops->writev) {
-      STACK_WIND (frame, meta_writev_cbk, 
-		  this, file->fops->writev,
-		  fd, vector, count, offset);
-      return 0;
-    }
-  }
-  else {
-    STACK_WIND (frame, meta_readv_cbk,
-		FIRST_CHILD(this), FIRST_CHILD(this)->fops->writev,
-		fd, vector, count, offset);
     return 0;
-  }
 }
 
-int32_t
-meta_flush_cbk (call_frame_t *frame,
-		void *cookie,
-		xlator_t *this,
-		int32_t op_ret,
-		int32_t op_errno)
+int
+meta_flush(call_frame_t *frame, xlator_t *this, fd_t *fd, dict_t *xdata)
 {
-  STACK_UNWIND (frame,
-		op_ret,
-		op_errno);
-  return 0;
-}
+    META_FOP(fd->inode, flush, frame, this, fd, xdata);
 
-int32_t
-meta_flush (call_frame_t *frame,
-	    xlator_t *this,
-	    dict_t *fd)
-{
-  meta_private_t *priv = (meta_private_t *) this->private;
-  meta_dirent_t *root = priv->tree;
-  data_t *path_data = dict_get (fd, this->name);
- 
-  if (path_data) {
-    const char *path = data_to_str (path_data);
-    meta_dirent_t *file = lookup_meta_entry (root, path, NULL);
-  
-    if (file) {
-      if (file->fops && file->fops->flush) {
-	STACK_WIND (frame, meta_flush_cbk,
-		    this, file->fops->flush,
-		    fd);
-	return 0;
-      }
-      else {
-	STACK_UNWIND (frame, 0, 0);
-	return 0;
-      }
-    }
-  }
-  else {
-    STACK_WIND (frame, meta_flush_cbk,
-		FIRST_CHILD(this), FIRST_CHILD(this)->fops->flush,
-		fd);
     return 0;
-  }
 }
 
-int32_t
-meta_release_cbk (call_frame_t *frame,
-		  void *cookie,
-		  xlator_t *this,
-		  int32_t op_ret,
-		  int32_t op_errno)
+int
+meta_stat(call_frame_t *frame, xlator_t *this, loc_t *loc, dict_t *xdata)
 {
-  STACK_UNWIND (frame,
-		op_ret,
-		op_errno);
-  return 0;
-}
+    META_FOP(loc->inode, stat, frame, this, loc, xdata);
 
-int32_t
-meta_release (call_frame_t *frame,
-	      xlator_t *this,
-	      dict_t *fd)
-{
-  meta_private_t *priv = (meta_private_t *) this->private;
-  meta_dirent_t *root = priv->tree;
-  data_t *path_data = dict_get (fd, this->name);
-
-  if (path_data) {
-    const char *path = data_to_str (path_data);
-    meta_dirent_t *file = lookup_meta_entry (root, path, NULL);
-  
-    if (file) {
-      dict_unref (fd);
-      STACK_UNWIND (frame, 0, 0);
-      return 0;
-    }
-  }
-  else {
-    STACK_WIND (frame, meta_release_cbk,
-		FIRST_CHILD(this), FIRST_CHILD(this)->fops->release,
-		fd);
     return 0;
-  }
 }
 
-int32_t
-meta_fsync_cbk (call_frame_t *frame,
-		void *cookie,
-		xlator_t *this,
-		int32_t op_ret,
-		int32_t op_errno)
+int
+meta_fstat(call_frame_t *frame, xlator_t *this, fd_t *fd, dict_t *xdata)
 {
-  STACK_UNWIND (frame,
-		op_ret,
-		op_errno);
-  return 0;
-}
+    META_FOP(fd->inode, fstat, frame, this, fd, xdata);
 
-int32_t
-meta_fsync (call_frame_t *frame,
-	    xlator_t *this,
-	    dict_t *fd,
-	    int32_t flags)
-{
-  STACK_WIND (frame,
-	      meta_fsync_cbk,
-	      FIRST_CHILD(this),
-	      FIRST_CHILD(this)->fops->fsync,
-	      fd,
-	      flags);
-  return 0;
+    return 0;
 }
 
-int32_t
-meta_fgetattr_cbk (call_frame_t *frame,
-		   void *cookie,
-		   xlator_t *this,
-		   int32_t op_ret,
-		   int32_t op_errno,
-		   struct stat *buf)
+int
+meta_readdir(call_frame_t *frame, xlator_t *this, fd_t *fd, size_t size,
+             off_t offset, dict_t *xdata)
 {
-  STACK_UNWIND (frame,
-		op_ret,
-		op_errno,
-		buf);
-  return 0;
-}
+    META_FOP(fd->inode, readdir, frame, this, fd, size, offset, xdata);
 
-int32_t
-meta_fgetattr (call_frame_t *frame,
-	       xlator_t *this,
-	       dict_t *fd)
-{
-  STACK_WIND (frame,
-	      meta_fgetattr_cbk,
-	      FIRST_CHILD(this),
-	      FIRST_CHILD(this)->fops->fgetattr,
-	      fd);
-  return 0;
+    return 0;
 }
 
-int32_t
-meta_opendir_cbk (call_frame_t *frame,
-		  void *cookie,
-		  xlator_t *this,
-		  int32_t op_ret,
-		  int32_t op_errno,
-		  dict_t *fd)
+int
+meta_readdirp(call_frame_t *frame, xlator_t *this, fd_t *fd, size_t size,
+              off_t offset, dict_t *xdata)
 {
-  STACK_UNWIND (frame,
-		op_ret,
-		op_errno,
-		fd);
-  return 0;
-}
+    META_FOP(fd->inode, readdirp, frame, this, fd, size, offset, xdata);
 
-int32_t
-meta_opendir (call_frame_t *frame,
-	      xlator_t *this,
-	      const char *path)
-{
-  meta_private_t *priv = (meta_private_t *) this->private;
-  meta_dirent_t *root = priv->tree;
-  meta_dirent_t *dir = lookup_meta_entry (root, path, NULL);
-  
-  if (dir) {
-    dict_t *ctx = get_new_dict ();
-    dict_set (ctx, this->name, str_to_data (strdup (path)));
-    STACK_UNWIND (frame, 0, 0, ctx);
     return 0;
-  }
-  else {  
-    STACK_WIND (frame, meta_opendir_cbk,
-		FIRST_CHILD(this), FIRST_CHILD(this)->fops->opendir,
-		path);
-    return 0;
-  }
 }
 
-int32_t
-meta_readdir_cbk (call_frame_t *frame,
-		  void *cookie,
-		  xlator_t *this,
-		  int32_t op_ret,
-		  int32_t op_errno,
-		  dir_entry_t *entries,
-		  int32_t count)
+int
+meta_readlink(call_frame_t *frame, xlator_t *this, loc_t *loc, size_t size,
+              dict_t *xdata)
 {
-  meta_private_t *priv = (meta_private_t *)this->private;
-
-  if ((int) cookie == 1) {
-    dir_entry_t *dir = CALLOC (1, sizeof (dir_entry_t));
-    ERR_ABORT (dir);
+    META_FOP(loc->inode, readlink, frame, this, loc, size, xdata);
 
-    dir->name = strdup (".meta");
-    memcpy (&dir->buf, priv->tree->stbuf, sizeof (struct stat));
-    dir->next = entries->next;
-    entries->next = dir;
-
-    STACK_UNWIND (frame, op_ret, op_errno, entries, count+1);
     return 0;
-  }
-  
-  STACK_UNWIND (frame, op_ret, op_errno, entries, count);
-  return 0;
 }
 
-int32_t
-meta_readdir (call_frame_t *frame,
-	      xlator_t *this,
-	      const char *path)
+int
+meta_writev(call_frame_t *frame, xlator_t *this, fd_t *fd, struct iovec *iov,
+            int count, off_t offset, uint32_t flags, struct iobref *iobref,
+            dict_t *xdata)
 {
-  meta_private_t *priv = (meta_private_t *) this->private;
-  meta_dirent_t *root = priv->tree;
-
-  meta_dirent_t *dir = lookup_meta_entry (root, path, NULL);
-  if (dir) {
-    if (dir->fops && dir->fops->readdir) {
-      STACK_WIND (frame, meta_readdir_cbk, 
-		  this, dir->fops->readdir, path);
-      return 0;
-    }
-    else {
-      int count = 0;
-      dir = dir->children;
-      dir_entry_t *entries = NULL;
-
-      while (dir) {
-	dir_entry_t *d = CALLOC (1, sizeof (dir_entry_t));
-	ERR_ABORT (d);
-	d->name = dir->name;
-	d->buf  = *dir->stbuf;
-	d->next = entries;
-	entries = d;
-	count++;
-	dir = dir->next;
-      }
-
-      dir_entry_t *header = CALLOC (1, sizeof (dir_entry_t));
-      ERR_ABORT (header);
-      header->next = entries;
-      STACK_UNWIND (frame, 0, 0, header, count);
-      return 0;
-    }
-  }
-  else {
-    if (!strcmp (path, "/")) {
-      STACK_WIND_COOKIE (frame, meta_readdir_cbk, 
-		   (int) 1, /* cookie to tell _cbk to add .meta entry */
-		   FIRST_CHILD(this), FIRST_CHILD(this)->fops->readdir,
-		   path);
-    }
-    else {
-      STACK_WIND (frame, meta_readdir_cbk, 
-		  FIRST_CHILD(this), FIRST_CHILD(this)->fops->readdir,
-		  path);
-    }
-  }
-  return 0;
+    META_FOP(fd->inode, writev, frame, this, fd, iov, count, offset, flags,
+             iobref, xdata);
+    return 0;
 }
 
-int32_t
-meta_releasedir_cbk (call_frame_t *frame,
-		     void *cookie,
-		     xlator_t *this,
-		     int32_t op_ret,
-		     int32_t op_errno)
+int
+meta_truncate(call_frame_t *frame, xlator_t *this, loc_t *loc, off_t offset,
+              dict_t *xdata)
 {
-  STACK_UNWIND (frame,
-		op_ret,
-		op_errno);
-  return 0;
-}
+    META_FOP(loc->inode, truncate, frame, this, loc, offset, xdata);
 
-int32_t
-meta_releasedir (call_frame_t *frame,
-		 xlator_t *this,
-		 dict_t *fd)
-{
-  STACK_WIND (frame,
-	      meta_releasedir_cbk,
-	      FIRST_CHILD(this),
-	      FIRST_CHILD(this)->fops->releasedir,
-	      fd);
-  return 0;
+    return 0; /* truncate handler is chosen via loc->inode; result is dropped */
 }
 
-int32_t
-meta_fsyncdir_cbk (call_frame_t *frame,
-		   void *cookie,
-		   xlator_t *this,
-		   int32_t op_ret,
-		   int32_t op_errno)
+int
+meta_ftruncate(call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset,
+               dict_t *xdata)
 {
-  STACK_UNWIND (frame,
-		op_ret,
-		op_errno);
-  return 0;
-}
+    META_FOP(fd->inode, ftruncate, frame, this, fd, offset, xdata);
 
-int32_t
-meta_fsyncdir (call_frame_t *frame,
-	       xlator_t *this,
-	       dict_t *fd,
-	       int32_t flags)
-{
-  STACK_WIND (frame,
-	      meta_fsyncdir_cbk,
-	      FIRST_CHILD(this),
-	      FIRST_CHILD(this)->fops->fsyncdir,
-	      fd,
-	      flags);
-  return 0;
+    return 0; /* ftruncate handler is chosen via fd->inode; result is dropped */
 }
 
 int32_t
-meta_statfs_cbk (call_frame_t *frame,
-		 void *cookie,
-		 xlator_t *this,
-		 int32_t op_ret,
-		 int32_t op_errno,
-		 struct statvfs *buf)
+meta_fsync(call_frame_t *frame, xlator_t *this, fd_t *fd, int32_t flags,
+           dict_t *xdata)
 {
-  STACK_UNWIND (frame,
-		op_ret,
-		op_errno,
-		buf);
-  return 0;
-}
+    META_FOP(fd->inode, fsync, frame, this, fd, flags, xdata);
 
-int32_t
-meta_statfs (call_frame_t *frame,
-	     xlator_t *this,
-	     const char *path)
-{
-  STACK_WIND (frame,
-	      meta_statfs_cbk,
-	      FIRST_CHILD(this),
-	      FIRST_CHILD(this)->fops->statfs,
-	      path);
-  return 0;
+    return 0; /* fsync handler is chosen via fd->inode; its result is dropped */
 }
 
 int32_t
-meta_setxattr_cbk (call_frame_t *frame,
-		   void *cookie,
-		   xlator_t *this,
-		   int32_t op_ret,
-		   int32_t op_errno)
+meta_fsyncdir(call_frame_t *frame, xlator_t *this, fd_t *fd, int32_t flags,
+              dict_t *xdata)
 {
-  STACK_UNWIND (frame,
-		op_ret,
-		op_errno);
-  return 0;
-}
+    META_FOP(fd->inode, fsyncdir, frame, this, fd, flags, xdata);
 
-int32_t
-meta_setxattr (call_frame_t *frame,
-	       xlator_t *this,
-	       const char *path,
-	       const char *name,
-	       const char *value,
-	       size_t size,
-	       int32_t flags)
-{
-  STACK_WIND (frame,
-	      meta_setxattr_cbk,
-	      FIRST_CHILD(this),
-	      FIRST_CHILD(this)->fops->setxattr,
-	      path,
-	      name,
-	      value,
-	      size,
-	      flags);
-  return 0;
+    return 0; /* fsyncdir handler is chosen via fd->inode; result is dropped */
 }
 
-int32_t
-meta_getxattr_cbk (call_frame_t *frame,
-		   void *cookie,
-		   xlator_t *this,
-		   int32_t op_ret,
-		   int32_t op_errno,
-		   char *value)
+int
+meta_forget(xlator_t *this, inode_t *inode)
 {
-  STACK_UNWIND (frame,
-		op_ret,
-		op_errno,
-		value);
-  return 0;
+    return 0; /* forget callback: nothing is done for the inode here */
 }
 
-int32_t
-meta_getxattr (call_frame_t *frame,
-	       xlator_t *this,
-	       const char *path,
-	       const char *name,
-	       size_t size)
+int
+meta_release(xlator_t *this, fd_t *fd)
 {
-  STACK_WIND (frame,
-	      meta_getxattr_cbk,
-	      FIRST_CHILD(this),
-	      FIRST_CHILD(this)->fops->getxattr,
-	      path,
-	      name,
-	      size);
-  return 0;
+    return meta_fd_release(fd, this); /* same helper as meta_releasedir() */
 }
 
-int32_t
-meta_listxattr_cbk (call_frame_t *frame,
-		    void *cookie,
-		    xlator_t *this,
-		    int32_t op_ret,
-		    int32_t op_errno,
-		    char *value)
+int
+meta_releasedir(xlator_t *this, fd_t *fd)
 {
-  STACK_UNWIND (frame,
-		op_ret,
-		op_errno,
-		value);
-  return 0;
+    return meta_fd_release(fd, this); /* same helper as meta_release() */
 }
 
-int32_t
-meta_listxattr (call_frame_t *frame,
-		xlator_t *this,
-		const char *path,
-		size_t size)
+int
+mem_acct_init(xlator_t *this)
 {
-  STACK_WIND (frame,
-	      meta_listxattr_cbk,
-	      FIRST_CHILD(this),
-	      FIRST_CHILD(this)->fops->listxattr,
-	      path,
-	      size);
-  return 0;
-}
+    int ret = -1; /* returned unchanged when this is NULL */
 
-int32_t
-meta_removexattr_cbk (call_frame_t *frame,
-		      void *cookie,
-		      xlator_t *this,
-		      int32_t op_ret,
-		      int32_t op_errno)
-{
-  STACK_UNWIND (frame,
-		op_ret,
-		op_errno);
-  return 0;
-}
+    if (!this)
+        return ret;
 
-int32_t
-meta_removexattr (call_frame_t *frame,
-		  xlator_t *this,
-		  const char *path,
-		  const char *name)
-{
-  STACK_WIND (frame,
-	      meta_removexattr_cbk,
-	      FIRST_CHILD(this),
-	      FIRST_CHILD(this)->fops->removexattr,
-	      path,
-	      name);
-  return 0;
-}
+    ret = xlator_mem_acct_init(this, gf_meta_mt_end + 1);
 
-int32_t
-meta_lk_cbk (call_frame_t *frame,
-	     void *cookie,
-	     xlator_t *this,
-	     int32_t op_ret,
-	     int32_t op_errno,
-	     struct flock *lock)
-{
-  STACK_UNWIND (frame,
-		op_ret,
-		op_errno,
-		lock);
-  return 0;
-}
+    if (ret != 0) {
+        gf_log(this->name, GF_LOG_ERROR, "Memory accounting init failed");
+        return ret; /* hand the helper's error code back to the caller */
+    }
 
-int32_t
-meta_lk (call_frame_t *frame,
-	 xlator_t *this,
-	 dict_t *file,
-	 int32_t cmd,
-	 struct flock *lock)
-{
-  STACK_WIND (frame,
-	      meta_lk_cbk,
-	      FIRST_CHILD(this),
-	      FIRST_CHILD(this)->fops->lk,
-	      file,
-	      cmd,
-	      lock);
-  return 0;
+    return ret; /* 0 here: every non-zero result was returned above */
 }
 
-static void
-add_xlator_to_tree (meta_dirent_t *tree, xlator_t *this,
-		    const char *prefix)
+int
+init(xlator_t *this)
 {
-  char *dir;
-  asprintf (&dir, "%s/%s", prefix, this->name);
+    meta_priv_t *priv = NULL;
+    int ret = -1; /* stays -1 unless every step below succeeds */
 
-  char *children;
-  asprintf (&children, "%s/%s", dir, "subvolumes");
+    priv = GF_CALLOC(1, sizeof(*priv), gf_meta_mt_priv_t); /* (nmemb, size) */
+    if (!priv)
+        return ret;
 
-  char *type;
-  asprintf (&type, "%s/%s", dir, "type");
+    GF_OPTION_INIT("meta-dir-name", priv->meta_dir_name, str, out);
 
-  char *view;
-  asprintf (&view, "%s/%s", dir, "view");
+    this->private = priv; /* released again in fini() */
+    ret = 0;
+out:
+    if (ret)
+        GF_FREE(priv); /* ret != 0 here only if GF_OPTION_INIT jumped to out */
 
-  insert_meta_entry (tree, dir, S_IFDIR, NULL, NULL);
-  insert_meta_entry (tree, children, S_IFDIR, NULL, NULL);
-  meta_dirent_t *v = insert_meta_entry (tree, view, S_IFDIR, NULL, 
-					&meta_xlator_view_fops);
-  v->view_xlator = this;
-  meta_dirent_t *t = insert_meta_entry (tree, type, S_IFREG, NULL, 
-					&meta_xlator_type_fops);
-  t->view_xlator = this;
-
-  xlator_list_t *trav = this->children;
-  while (trav) {
-    add_xlator_to_tree (tree, trav->xlator, children);
-    trav = trav->next;
-  }
+    return ret;
 }
 
-static void
-build_meta_tree (xlator_t *this)
+void
+fini(xlator_t *this)
 {
-  meta_private_t *priv = (meta_private_t *) this->private;
-  priv->tree = CALLOC (1, sizeof (meta_dirent_t));
-  ERR_ABORT (priv->tree);
-  priv->tree->name = strdup (".meta");
-  priv->tree->stbuf = new_stbuf ();
-  priv->tree->stbuf->st_mode = S_IFDIR | S_IRUSR | S_IRGRP | S_IROTH |
-    S_IXUSR | S_IXGRP | S_IXOTH;
-
-  insert_meta_entry (priv->tree, "/.meta/version", 
-		     S_IFREG, NULL, &meta_version_fops);
-
-  insert_meta_entry (priv->tree, "/.meta/xlators",
-		     S_IFDIR, NULL, NULL);
-
-  xlator_list_t *trav = this->children;
-  while (trav) {
-    add_xlator_to_tree (priv->tree, trav->xlator, "/.meta/xlators");
-    trav = trav->next;
-  }
+    GF_FREE(this->private); /* the meta_priv_t allocated in init() */
+    this->private = NULL;   /* do not leave a dangling pointer behind */
 }
 
-int32_t
-init (xlator_t *this)
-{
-  if (this->parent != NULL) {
-    gf_log ("meta", GF_LOG_ERROR, "FATAL: meta should be the root of the xlator tree");
-    return -1;
-  }
-  
-  meta_private_t *priv = CALLOC (1, sizeof (meta_private_t));
-  ERR_ABORT (priv);
-  
-  data_t *directory = dict_get (this->options, "directory");
-  if (directory) {
-    priv->directory = strdup (data_to_str (directory));
-  }
-  else {
-    priv->directory = ".meta";
-  }
-  
-  this->private = priv;
-  build_meta_tree (this);
-
-  return 0;
-}
+struct xlator_fops fops = {.lookup = meta_lookup, /* exported via xlator_api */
+                           .opendir = meta_opendir,
+                           .open = meta_open,
+                           .readv = meta_readv,
+                           .flush = meta_flush,
+                           .stat = meta_stat,
+                           .fstat = meta_fstat,
+                           .readdir = meta_readdir,
+                           .readdirp = meta_readdirp,
+                           .readlink = meta_readlink,
+                           .writev = meta_writev,
+                           .truncate = meta_truncate,
+                           .ftruncate = meta_ftruncate,
+                           .fsync = meta_fsync,
+                           .fsyncdir = meta_fsyncdir};
 
-int32_t
-fini (xlator_t *this)
-{
-  return 0;
-}
+struct xlator_cbks cbks = {
+    .forget = meta_forget,         /* returns 0 without doing anything */
+    .release = meta_release,       /* forwards to meta_fd_release() */
+    .releasedir = meta_releasedir, /* forwards to meta_fd_release() */
+};
 
-struct xlator_fops fops = {
-  .getattr     = meta_getattr,
-  .readlink    = meta_readlink,
-  .mknod       = meta_mknod,
-  .mkdir       = meta_mkdir,
-  .unlink      = meta_unlink,
-  .rmdir       = meta_rmdir,
-  .symlink     = meta_symlink,
-  .rename      = meta_rename,
-  .link        = meta_link,
-  .chmod       = meta_chmod,
-  .chown       = meta_chown,
-  .truncate    = meta_truncate,
-  .utimes      = meta_utimes,
-  .open        = meta_open,
-  .readv       = meta_readv,
-  .writev      = meta_writev,
-  .statfs      = meta_statfs,
-  .flush       = meta_flush,
-  .release     = meta_release,
-  .fsync       = meta_fsync,
-  .setxattr    = meta_setxattr,
-  .getxattr    = meta_getxattr,
-  .listxattr   = meta_listxattr,
-  .removexattr = meta_removexattr,
-  .opendir     = meta_opendir,
-  .readdir     = meta_readdir,
-  .releasedir  = meta_releasedir,
-  .fsyncdir    = meta_fsyncdir,
-  .access      = meta_access,
-  .ftruncate   = meta_ftruncate,
-  .fgetattr    = meta_fgetattr,
-  .create      = meta_create,
-  .lk          = meta_lk,
+struct volume_options options[] = {
+    {.key = {"meta-dir-name"}, /* read by init() into priv->meta_dir_name */
+     .type = GF_OPTION_TYPE_STR,
+     .default_value = DEFAULT_META_DIR_NAME,
+     .description = "Name of default meta directory."},
+    {.key = {NULL}}, /* NULL-key entry closes the list */
 };
 
-struct xlator_mops mops = {
+xlator_api_t xlator_api = {
+    .init = init, /* allocates this->private */
+    .fini = fini, /* frees this->private */
+    .mem_acct_init = mem_acct_init,
+    .op_version = {1}, /* Present from the initial version */
+    .fops = &fops,
+    .cbks = &cbks,
+    .options = options,
+    .identifier = "meta",
+    .category = GF_TECH_PREVIEW,
 };
diff --git a/xlators/meta/src/meta.h b/xlators/meta/src/meta.h
index 2c3d37237d5..7f0cf28808a 100644
--- a/xlators/meta/src/meta.h
+++ b/xlators/meta/src/meta.h
@@ -1,48 +1,140 @@
 /*
-   Copyright (c) 2006-2009 Z RESEARCH, Inc. <http://www.zresearch.com>
+   Copyright (c) 2014 Red Hat, Inc. <http://www.redhat.com>
    This file is part of GlusterFS.
 
-   GlusterFS is free software; you can redistribute it and/or modify
-   it under the terms of the GNU General Public License as published
-   by the Free Software Foundation; either version 3 of the License,
-   or (at your option) any later version.
-
-   GlusterFS is distributed in the hope that it will be useful, but
-   WITHOUT ANY WARRANTY; without even the implied warranty of
-   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
-   General Public License for more details.
-
-   You should have received a copy of the GNU General Public License
-   along with this program.  If not, see
-   <http://www.gnu.org/licenses/>.
+   This file is licensed to you under your choice of the GNU Lesser
+   General Public License, version 3 or any later version (LGPLv3 or
+   later), or the GNU General Public License, version 2 (GPLv2), in all
+   cases as published by the Free Software Foundation.
 */
-
 #ifndef __META_H__
 #define __META_H__
 
-#ifndef _CONFIG_H
-#define _CONFIG_H
-#include "config.h"
-#endif
-
-struct _meta_dirent {
-  const char *name;
-  int type;
-  struct _meta_dirent *children;
-  struct _meta_dirent *parent;
-  struct _meta_dirent *next;
-  struct stat *stbuf;
-  xlator_t *view_xlator;
-  struct xlator_fops *fops;
+#include <glusterfs/strfd.h>
+
+#define DEFAULT_META_DIR_NAME ".meta" /* default of option "meta-dir-name" */
+
+#define META_ROOT_GFID "ba926388-bb9c-4eec-ad60-79dba4cc083a"
+/* True when gfid g, rendered by uuid_utoa(), equals META_ROOT_GFID. */
+#define IS_META_ROOT_GFID(g) (strcmp(uuid_utoa(g), META_ROOT_GFID) == 0)
+
+typedef int (*meta_hook_t)(call_frame_t *frame, xlator_t *this, loc_t *loc,
+                           dict_t *xdata); /* hooks set ops/ctx on loc->inode */
+
+typedef struct {
+    dict_t *xdata; /* presumably released by meta_local_cleanup() -- confirm */
+} meta_local_t;
+
+typedef struct {
+    char *meta_dir_name; /* value of option "meta-dir-name", set in init() */
+} meta_priv_t;
+
+struct meta_dirent {
+    const char *name; /* entry name; NULL marks the end of a fixed listing */
+    ia_type_t type;   /* IA_IFDIR, IA_IFREG or IA_IFLNK in the visible users */
+    meta_hook_t hook; /* attaches ops and/or ctx to the entry's inode */
+};
+
+#define DOT_DOTDOT /* expands to two initializers: "." and ".." */             \
+    {.name = ".", .type = IA_IFDIR}, { .name = "..", .type = IA_IFDIR }
+
+struct meta_ops {
+    struct meta_dirent *fixed_dirents; /* static listing ending in .name = NULL */
+    int (*dir_fill)(xlator_t *this, inode_t *dir, struct meta_dirent **entries);
+    int (*file_fill)(xlator_t *this, inode_t *file, strfd_t *strfd);
+    int (*iatt_fill)(xlator_t *this, inode_t *inode, struct iatt *iatt);
+    int (*link_fill)(xlator_t *this, inode_t *inode, strfd_t *strfd);
+    int (*file_write)(xlator_t *this, fd_t *fd, struct iovec *iov, int count);
+    struct xlator_fops fops; /* presumably served by meta_fops_get() -- confirm */
+    struct xlator_cbks cbks; /* presumably served by meta_cbks_get() -- confirm */
 };
-typedef struct _meta_dirent meta_dirent_t;
 
 typedef struct {
-  const char *directory;
-  meta_dirent_t *tree;
-} meta_private_t;
+    char *data; /* presumably content cached by meta_file_fill() -- confirm */
+    struct meta_dirent *dirents; /* presumably from meta_dir_fill() -- confirm */
+    size_t size; /* NOTE(review): bytes in data or entries in dirents? confirm */
+} meta_fd_t;
+
+#define COUNT(arr) (sizeof(arr) / sizeof((arr)[0])) /* element count of array */
+
+#define META_HOOK(loc) /* true for <meta-dir-name> directly under root */      \
+    (__is_root_gfid((loc)->pargfid) &&                                         \
+     !strcmp((loc)->name, META_PRIV(THIS)->meta_dir_name))
+
+#define META_PRIV(t) ((meta_priv_t *)((t)->private)) /* typed view of private */
+
+#define META_STACK_UNWIND(fop, frame, params...)                               \
+    do { /* detach frame->local before the unwind, clean it up after */        \
+        meta_local_t *__local = NULL;                                          \
+        xlator_t *__this = NULL;                                               \
+        if (frame) {                                                           \
+            __local = frame->local;                                            \
+            __this = frame->this;                                              \
+            frame->local = NULL;                                               \
+        }                                                                      \
+        STACK_UNWIND_STRICT(fop, frame, params);                               \
+        if (__local) { /* saved copies are used: frame is not touched here */  \
+            meta_local_cleanup(__local, __this);                               \
+        }                                                                      \
+    } while (0)
+
+#define META_FOP(i, fop, fr, t, params...) /* call inode i's fop handler */    \
+    do {                                                                       \
+        struct xlator_fops *_fops = NULL;                                      \
+                                                                               \
+        /* use the fops table meta_fops_get() returns for inode i */           \
+        _fops = meta_fops_get(i, t);                                           \
+                                                                               \
+        _fops->fop(fr, t, params); /* handler's return value is discarded */   \
+    } while (0)
+
+void
+meta_iatt_fill(struct iatt *iatt, inode_t *inode, ia_type_t type);
+
+int
+meta_inode_discover(call_frame_t *frame, xlator_t *this, loc_t *loc,
+                    dict_t *xdata);
+
+int
+meta_ops_set(inode_t *inode, xlator_t *this, struct meta_ops *ops);
+
+struct xlator_fops *
+meta_fops_get(inode_t *inode, xlator_t *this);
+struct xlator_cbks *
+meta_cbks_get(inode_t *inode, xlator_t *this);
+struct meta_ops *
+meta_ops_get(inode_t *inode, xlator_t *this);
+
+int
+meta_ctx_set(inode_t *inode, xlator_t *this, void *ctx);
+
+void *
+meta_ctx_get(inode_t *inode, xlator_t *this);
+
+void
+meta_local_cleanup(meta_local_t *local, xlator_t *this);
+
+struct xlator_fops *
+meta_defaults_init(struct xlator_fops *fops);
+
+meta_fd_t *
+meta_fd_get(fd_t *fd, xlator_t *this);
+
+int
+meta_fd_release(fd_t *fd, xlator_t *this);
+
+dict_t *
+meta_direct_io_mode(dict_t *xdata, call_frame_t *frame);
+
+meta_local_t *
+meta_local(call_frame_t *frame);
+
+int
+meta_file_fill(xlator_t *this, fd_t *fd);
 
-#include "tree.h"
-#include "misc.h"
+int
+meta_dir_fill(xlator_t *this, fd_t *fd);
 
+int
+fixed_dirents_len(struct meta_dirent *dirents);
 #endif /* __META_H__ */
diff --git a/xlators/meta/src/misc.c b/xlators/meta/src/misc.c
deleted file mode 100644
index a6ea441ee2a..00000000000
--- a/xlators/meta/src/misc.c
+++ /dev/null
@@ -1,67 +0,0 @@
-/*
-   Copyright (c) 2006-2009 Z RESEARCH, Inc. <http://www.zresearch.com>
-   This file is part of GlusterFS.
-
-   GlusterFS is free software; you can redistribute it and/or modify
-   it under the terms of the GNU General Public License as published
-   by the Free Software Foundation; either version 3 of the License,
-   or (at your option) any later version.
-
-   GlusterFS is distributed in the hope that it will be useful, but
-   WITHOUT ANY WARRANTY; without even the implied warranty of
-   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
-   General Public License for more details.
-
-   You should have received a copy of the GNU General Public License
-   along with this program.  If not, see
-   <http://www.gnu.org/licenses/>.
-*/
-
-#include <unistd.h>
-#include <sys/uio.h>
-
-#ifndef _CONFIG_H
-#define _CONFIG_H
-#include "config.h"
-#endif
-
-#include "xlator.h"
-#include "meta.h"
-
-#define min(x,y)   ((x) < (y) ? (x) : (y))
-
-/* /.meta/version */
-static const char *version_str = PACKAGE_NAME " " PACKAGE_VERSION "\n";
-
-int32_t
-meta_version_readv (call_frame_t *frame, xlator_t *this,
-		    dict_t *fd, size_t size, off_t offset)
-{
-  static int version_size;
-  version_size = strlen (version_str);
-  
-  struct iovec vec;
-  vec.iov_base = version_str + offset;
-  vec.iov_len  = min (version_size - offset, size);
-
-  STACK_UNWIND (frame, vec.iov_len, 0, &vec, 1);
-  return 0;
-}
-
-int32_t
-meta_version_getattr (call_frame_t *frame, 
-			      xlator_t *this,
-			      const char *path)
-{
-  meta_private_t *priv = (meta_private_t *) this->private;
-  meta_dirent_t *root = priv->tree;
-  meta_dirent_t *file = lookup_meta_entry (root, path, NULL);
-  file->stbuf->st_size = strlen (version_str);
-  STACK_UNWIND (frame, 0, 0, file->stbuf);
-}
-
-struct xlator_fops meta_version_fops = {
-  .readv   = meta_version_readv,
-  .getattr = meta_version_getattr
-};
-
diff --git a/xlators/meta/src/misc.h b/xlators/meta/src/misc.h
deleted file mode 100644
index 64eabf9a238..00000000000
--- a/xlators/meta/src/misc.h
+++ /dev/null
@@ -1,31 +0,0 @@
-/*
-   Copyright (c) 2006-2009 Z RESEARCH, Inc. <http://www.zresearch.com>
-   This file is part of GlusterFS.
-
-   GlusterFS is free software; you can redistribute it and/or modify
-   it under the terms of the GNU General Public License as published
-   by the Free Software Foundation; either version 3 of the License,
-   or (at your option) any later version.
-
-   GlusterFS is distributed in the hope that it will be useful, but
-   WITHOUT ANY WARRANTY; without even the implied warranty of
-   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
-   General Public License for more details.
-
-   You should have received a copy of the GNU General Public License
-   along with this program.  If not, see
-   <http://www.gnu.org/licenses/>.
-*/
-
-#ifndef __MISC_H__
-#define __MISC_H__
-
-#ifndef _CONFIG_H
-#define _CONFIG_H
-#include "config.h"
-#endif
-
-
-struct xlator_fops meta_version_fops;
-
-#endif /* __MISC_H__ */
diff --git a/xlators/meta/src/name-file.c b/xlators/meta/src/name-file.c
new file mode 100644
index 00000000000..5874a24d78a
--- /dev/null
+++ b/xlators/meta/src/name-file.c
@@ -0,0 +1,44 @@
+/*
+   Copyright (c) 2014 Red Hat, Inc. <http://www.redhat.com>
+   This file is part of GlusterFS.
+
+   This file is licensed to you under your choice of the GNU Lesser
+   General Public License, version 3 or any later version (LGPLv3 or
+   later), or the GNU General Public License, version 2 (GPLv2), in all
+   cases as published by the Free Software Foundation.
+*/
+
+#include <glusterfs/xlator.h>
+#include <glusterfs/defaults.h>
+
+#include "meta-mem-types.h"
+#include "meta.h"
+#include <glusterfs/strfd.h>
+#include <glusterfs/lkowner.h>
+
+/* file_fill callback: write "<xl->name>\n" for the xlator kept in the ctx. */
+static int
+name_file_fill(xlator_t *this, inode_t *file, strfd_t *strfd)
+{
+    xlator_t *owner = meta_ctx_get(file, this);
+
+    /* The ctx was copied from the parent inode by meta_name_file_hook(). */
+    strprintf(strfd, "%s\n", owner->name);
+
+    return strfd->size;
+}
+
+static struct meta_ops name_file_ops = {
+    .file_fill = name_file_fill, /* the only callback this table sets */
+};
+
+/* Hook: install name_file_ops and copy the parent inode's ctx to the entry. */
+int
+meta_name_file_hook(call_frame_t *frame, xlator_t *this, loc_t *loc,
+                    dict_t *xdata)
+{
+    inode_t *self = loc->inode;
+    meta_ops_set(self, this, &name_file_ops);
+    meta_ctx_set(self, this, meta_ctx_get(loc->parent, this));
+    return 0;
+}
diff --git a/xlators/meta/src/option-file.c b/xlators/meta/src/option-file.c
new file mode 100644
index 00000000000..ff55eca592f
--- /dev/null
+++ b/xlators/meta/src/option-file.c
@@ -0,0 +1,45 @@
+/*
+   Copyright (c) 2014 Red Hat, Inc. <http://www.redhat.com>
+   This file is part of GlusterFS.
+
+   This file is licensed to you under your choice of the GNU Lesser
+   General Public License, version 3 or any later version (LGPLv3 or
+   later), or the GNU General Public License, version 2 (GPLv2), in all
+   cases as published by the Free Software Foundation.
+*/
+
+#include <glusterfs/xlator.h>
+#include <glusterfs/defaults.h>
+
+#include "meta-mem-types.h"
+#include "meta.h"
+#include "meta-hooks.h"
+
+/* file_fill callback: write "<value>\n" for the data_t kept in the ctx. */
+static int
+option_file_fill(xlator_t *this, inode_t *inode, strfd_t *strfd)
+{
+    data_t *value = meta_ctx_get(inode, this);
+
+    /* The ctx was set from dict_get() by meta_option_file_hook(). */
+    strprintf(strfd, "%s\n", data_to_str(value));
+
+    return strfd->size;
+}
+
+static struct meta_ops option_file_ops = {.file_fill = option_file_fill}; /* only file_fill is set */
+
+/* Hook for a per-option file: the ctx becomes the option's dict value. */
+int
+meta_option_file_hook(call_frame_t *frame, xlator_t *this, loc_t *loc,
+                      dict_t *xdata)
+{
+    xlator_t *owner = meta_ctx_get(loc->parent, this);
+    data_t *value = NULL;
+
+    /* The entry name doubles as the key into the owning xlator's options. */
+    value = dict_get(owner->options, (char *)loc->name);
+    meta_ctx_set(loc->inode, this, value);
+    meta_ops_set(loc->inode, this, &option_file_ops);
+    return 0;
+}
diff --git a/xlators/meta/src/options-dir.c b/xlators/meta/src/options-dir.c
new file mode 100644
index 00000000000..d68a7eeaffc
--- /dev/null
+++ b/xlators/meta/src/options-dir.c
@@ -0,0 +1,65 @@
+/*
+   Copyright (c) 2014 Red Hat, Inc. <http://www.redhat.com>
+   This file is part of GlusterFS.
+
+   This file is licensed to you under your choice of the GNU Lesser
+   General Public License, version 3 or any later version (LGPLv3 or
+   later), or the GNU General Public License, version 2 (GPLv2), in all
+   cases as published by the Free Software Foundation.
+*/
+
+#include <glusterfs/xlator.h>
+#include <glusterfs/defaults.h>
+
+#include "meta-mem-types.h"
+#include "meta.h"
+#include "meta-hooks.h"
+
+/* dict_foreach() callback: fill the slot at *data, then advance that cursor. */
+static int
+dict_key_add(dict_t *dict, char *key, data_t *value, void *data)
+{
+    struct meta_dirent **cursor = data, *slot = *cursor;
+
+    slot->name = gf_strdup(key);
+    slot->type = IA_IFREG;
+    slot->hook = meta_option_file_hook;
+    *cursor = slot + 1;
+    return 0;
+}
+
+/*
+ * dir_fill callback: one IA_IFREG entry per key of the xlator's options dict.
+ * Returns the entry count with *dp set to the new array, or -1 if alloc fails.
+ */
+static int
+options_dir_fill(xlator_t *this, inode_t *inode, struct meta_dirent **dp)
+{
+    xlator_t *xl = meta_ctx_get(inode, this);
+    int count = xl->options->count; /* read once: sizes array and result */
+    struct meta_dirent *dirent = NULL;
+    struct meta_dirent *direntp = NULL;
+
+    /* GF_CALLOC takes (nmemb, size, type), in that order. */
+    dirent = GF_CALLOC(count, sizeof(*dirent), gf_meta_mt_dirents_t);
+    if (!dirent)
+        return -1;
+
+    direntp = dirent; /* cursor advanced by dict_key_add() */
+    dict_foreach(xl->options, dict_key_add, &direntp);
+    *dp = dirent;
+    return count;
+}
+
+static struct meta_ops options_dir_ops = {.dir_fill = options_dir_fill}; /* only dir_fill is set */
+
+/* Hook: copy the parent inode's ctx to the entry, then install the dir ops. */
+int
+meta_options_dir_hook(call_frame_t *frame, xlator_t *this, loc_t *loc,
+                      dict_t *xdata)
+{
+    inode_t *self = loc->inode;
+    meta_ctx_set(self, this, meta_ctx_get(loc->parent, this));
+    meta_ops_set(self, this, &options_dir_ops);
+    return 0;
+}
diff --git a/xlators/meta/src/private-file.c b/xlators/meta/src/private-file.c
new file mode 100644
index 00000000000..23ec319456b
--- /dev/null
+++ b/xlators/meta/src/private-file.c
@@ -0,0 +1,44 @@
+/*
+   Copyright (c) 2014 Red Hat, Inc. <http://www.redhat.com>
+   This file is part of GlusterFS.
+
+   This file is licensed to you under your choice of the GNU Lesser
+   General Public License, version 3 or any later version (LGPLv3 or
+   later), or the GNU General Public License, version 2 (GPLv2), in all
+   cases as published by the Free Software Foundation.
+*/
+
+#include <glusterfs/xlator.h>
+#include <glusterfs/defaults.h>
+
+#include "meta-mem-types.h"
+#include "meta.h"
+#include <glusterfs/strfd.h>
+#include <glusterfs/statedump.h>
+
+/* file_fill callback: gf_proc_dump_xlator_private() for the ctx xlator. */
+static int
+private_file_fill(xlator_t *this, inode_t *file, strfd_t *strfd)
+{
+    xlator_t *target = meta_ctx_get(file, this);
+
+    /* The helper is handed strfd; its size afterwards is the result. */
+    gf_proc_dump_xlator_private(target, strfd);
+
+    return strfd->size;
+}
+
+static struct meta_ops private_file_ops = {
+    .file_fill = private_file_fill, /* the only callback this table sets */
+};
+
+/* Hook: install private_file_ops and copy the parent inode's ctx. */
+int
+meta_private_file_hook(call_frame_t *frame, xlator_t *this, loc_t *loc,
+                       dict_t *xdata)
+{
+    inode_t *self = loc->inode;
+    meta_ops_set(self, this, &private_file_ops);
+    meta_ctx_set(self, this, meta_ctx_get(loc->parent, this));
+    return 0;
+}
diff --git a/xlators/meta/src/process_uuid-file.c b/xlators/meta/src/process_uuid-file.c
new file mode 100644
index 00000000000..a24c1b57ab3
--- /dev/null
+++ b/xlators/meta/src/process_uuid-file.c
@@ -0,0 +1,37 @@
+/*
+   Copyright (c) 2014 Red Hat, Inc. <http://www.redhat.com>
+   This file is part of GlusterFS.
+
+   This file is licensed to you under your choice of the GNU Lesser
+   General Public License, version 3 or any later version (LGPLv3 or
+   later), or the GNU General Public License, version 2 (GPLv2), in all
+   cases as published by the Free Software Foundation.
+*/
+
+#include <glusterfs/xlator.h>
+#include <glusterfs/defaults.h>
+
+#include "meta-mem-types.h"
+#include "meta.h"
+#include <glusterfs/strfd.h>
+#include <glusterfs/lkowner.h>
+
+static int
+process_uuid_file_fill(xlator_t *this, inode_t *file, strfd_t *strfd)
+{
+    strprintf(strfd, "%s\n", this->ctx->process_uuid); /* file is not consulted */
+    return strfd->size; /* strfd's size field after the write above */
+}
+
+static struct meta_ops process_uuid_file_ops = {
+    .file_fill = process_uuid_file_fill, /* the only callback this table sets */
+};
+
+int
+meta_process_uuid_file_hook(call_frame_t *frame, xlator_t *this, loc_t *loc,
+                            dict_t *xdata)
+{
+    /* No ctx is attached: the fill callback reads this->ctx directly. */
+    meta_ops_set(loc->inode, this, &process_uuid_file_ops);
+    return 0;
+}
diff --git a/xlators/meta/src/profile-file.c b/xlators/meta/src/profile-file.c
new file mode 100644
index 00000000000..829dcb77451
--- /dev/null
+++ b/xlators/meta/src/profile-file.c
@@ -0,0 +1,44 @@
+/*
+   Copyright (c) 2014 Red Hat, Inc. <http://www.redhat.com>
+   This file is part of GlusterFS.
+
+   This file is licensed to you under your choice of the GNU Lesser
+   General Public License, version 3 or any later version (LGPLv3 or
+   later), or the GNU General Public License, version 2 (GPLv2), in all
+   cases as published by the Free Software Foundation.
+*/
+
+#include <glusterfs/xlator.h>
+#include <glusterfs/defaults.h>
+
+#include "meta-mem-types.h"
+#include "meta.h"
+#include <glusterfs/strfd.h>
+#include <glusterfs/statedump.h>
+
+/* file_fill callback: gf_proc_dump_xlator_profile() for the ctx xlator. */
+static int
+profile_file_fill(xlator_t *this, inode_t *file, strfd_t *strfd)
+{
+    xlator_t *target = meta_ctx_get(file, this);
+
+    /* The helper is handed strfd; its size afterwards is the result. */
+    gf_proc_dump_xlator_profile(target, strfd);
+
+    return strfd->size;
+}
+
+static struct meta_ops profile_file_ops = {
+    .file_fill = profile_file_fill, /* the only callback this table sets */
+};
+
+/* Hook: install profile_file_ops and copy the parent inode's ctx. */
+int
+meta_profile_file_hook(call_frame_t *frame, xlator_t *this, loc_t *loc,
+                       dict_t *xdata)
+{
+    inode_t *self = loc->inode;
+    meta_ops_set(self, this, &profile_file_ops);
+    meta_ctx_set(self, this, meta_ctx_get(loc->parent, this));
+    return 0;
+}
diff --git a/xlators/meta/src/root-dir.c b/xlators/meta/src/root-dir.c
new file mode 100644
index 00000000000..80292bd3dda
--- /dev/null
+++ b/xlators/meta/src/root-dir.c
@@ -0,0 +1,77 @@
+/*
+   Copyright (c) 2014 Red Hat, Inc. <http://www.redhat.com>
+   This file is part of GlusterFS.
+
+   This file is licensed to you under your choice of the GNU Lesser
+   General Public License, version 3 or any later version (LGPLv3 or
+   later), or the GNU General Public License, version 2 (GPLv2), in all
+   cases as published by the Free Software Foundation.
+*/
+
+#include <glusterfs/xlator.h>
+#include <glusterfs/defaults.h>
+
+#include "meta-mem-types.h"
+#include "meta.h"
+#include "meta-hooks.h"
+
+static struct meta_dirent root_dir_dirents[] = {
+    DOT_DOTDOT,
+    /* Fixed top-level entries: name, inode type and the hook for each one. */
+    {
+        .name = "graphs",
+        .type = IA_IFDIR,
+        .hook = meta_graphs_dir_hook,
+    },
+    {
+        .name = "frames",
+        .type = IA_IFREG,
+        .hook = meta_frames_file_hook,
+    },
+    {
+        .name = "logging",
+        .type = IA_IFDIR,
+        .hook = meta_logging_dir_hook,
+    },
+    {
+        .name = "process_uuid",
+        .type = IA_IFREG,
+        .hook = meta_process_uuid_file_hook,
+    },
+    {
+        .name = "version",
+        .type = IA_IFREG,
+        .hook = meta_version_file_hook,
+    },
+    {
+        .name = "cmdline",
+        .type = IA_IFREG,
+        .hook = meta_cmdline_file_hook,
+    },
+    {
+        .name = "mallinfo",
+        .type = IA_IFREG,
+        .hook = meta_mallinfo_file_hook,
+    },
+    {
+        .name = "master",
+        .type = IA_IFDIR,
+        .hook = meta_master_dir_hook,
+    },
+    {
+        .name = "measure_latency",
+        .type = IA_IFREG,
+        .hook = meta_measure_file_hook,
+    },
+    {.name = NULL}}; /* NULL name closes the listing */
+
+static struct meta_ops meta_root_dir_ops = {.fixed_dirents = root_dir_dirents}; /* static listing only */
+
+int
+meta_root_dir_hook(call_frame_t *frame, xlator_t *this, loc_t *loc,
+                   dict_t *xdata)
+{
+    /* Only the ops table is attached; no ctx is set for this inode. */
+    meta_ops_set(loc->inode, this, &meta_root_dir_ops);
+    return 0;
+}
diff --git a/xlators/meta/src/subvolume-link.c b/xlators/meta/src/subvolume-link.c
new file mode 100644
index 00000000000..5b1f752efd0
--- /dev/null
+++ b/xlators/meta/src/subvolume-link.c
@@ -0,0 +1,56 @@
+/*
+   Copyright (c) 2014 Red Hat, Inc. <http://www.redhat.com>
+   This file is part of GlusterFS.
+
+   This file is licensed to you under your choice of the GNU Lesser
+   General Public License, version 3 or any later version (LGPLv3 or
+   later), or the GNU General Public License, version 2 (GPLv2), in all
+   cases as published by the Free Software Foundation.
+*/
+
+#include <glusterfs/xlator.h>
+#include <glusterfs/defaults.h>
+
+#include "meta-mem-types.h"
+#include "meta.h"
+
+/* link_fill callback: the link target is "../../<name of the ctx xlator>". */
+static int
+subvolume_link_fill(xlator_t *this, inode_t *inode, strfd_t *strfd)
+{
+    xlator_t *xl = meta_ctx_get(inode, this);
+
+    if (!xl) /* the hook stores NULL when no child matched the entry name */
+        return -1;
+    strprintf(strfd, "../../%s", xl->name);
+    return 0;
+}
+
+struct meta_ops subvolume_link_ops = {.link_fill = subvolume_link_fill}; /* NOTE(review): not static, unlike the sibling ops tables -- confirm it is referenced externally */
+
+/*
+ * Hook for a subvolume link: the entry name is parsed as a position in the
+ * parent xlator's children list, and the child found there becomes the ctx.
+ */
+int
+meta_subvolume_link_hook(call_frame_t *frame, xlator_t *this, loc_t *loc,
+                         dict_t *xdata)
+{
+    int wanted = strtol(loc->name, 0, 0);
+    xlator_t *parent_xl = meta_ctx_get(loc->parent, this);
+    xlator_list_t *node = parent_xl->children;
+    xlator_t *target = NULL;
+    int pos = 0;
+
+    /* Walk to the wanted position; target stays NULL if the list is shorter. */
+    while (node && pos != wanted) {
+        node = node->next;
+        pos++;
+    }
+    if (node)
+        target = node->xlator;
+
+    meta_ctx_set(loc->inode, this, target);
+    meta_ops_set(loc->inode, this, &subvolume_link_ops);
+    return 0;
+}
diff --git a/xlators/meta/src/subvolumes-dir.c b/xlators/meta/src/subvolumes-dir.c
new file mode 100644
index 00000000000..3cb170ea1f4
--- /dev/null
+++ b/xlators/meta/src/subvolumes-dir.c
@@ -0,0 +1,93 @@
+/*
+   Copyright (c) 2014 Red Hat, Inc. <http://www.redhat.com>
+   This file is part of GlusterFS.
+
+   This file is licensed to you under your choice of the GNU Lesser
+   General Public License, version 3 or any later version (LGPLv3 or
+   later), or the GNU General Public License, version 2 (GPLv2), in all
+   cases as published by the Free Software Foundation.
+*/
+
+#include <glusterfs/xlator.h>
+#include <glusterfs/defaults.h>
+
+#include "meta-mem-types.h"
+#include "meta.h"
+#include "meta-hooks.h"
+
+/*
+ * dir_fill callback for a "subvolumes" directory.
+ *
+ * Builds one symlink entry per child of the xlator stored as the directory
+ * inode's meta context; entries are named "0", "1", ... in child-list order
+ * and resolved by meta_subvolume_link_hook().
+ *
+ * On success the newly allocated array is handed over through *dp and the
+ * number of entries is returned. On failure -1 is returned, *dp is left
+ * untouched and everything allocated here has been released again.
+ */
+static int
+subvolumes_dir_fill(xlator_t *this, inode_t *dir, struct meta_dirent **dp)
+{
+    struct meta_dirent *dirents = NULL;
+    xlator_t *xl = NULL;
+    xlator_list_t *subv = NULL;
+    char num[16] = {
+        0,
+    };
+    int count = 0;
+    int i = 0;
+
+    xl = meta_ctx_get(dir, this);
+    if (!xl)
+        return -1;
+
+    for (subv = xl->children; subv; subv = subv->next)
+        count++;
+
+    dirents = GF_MALLOC(sizeof(*dirents) * count, gf_meta_mt_dirents_t);
+    if (!dirents)
+        return -1;
+
+    for (subv = xl->children; subv; subv = subv->next) {
+        snprintf(num, sizeof(num), "%d", i);
+
+        dirents[i].name = gf_strdup(num);
+        if (!dirents[i].name)
+            goto err;
+
+        dirents[i].type = IA_IFLNK;
+        dirents[i].hook = meta_subvolume_link_hook;
+        i++;
+    }
+
+    *dp = dirents;
+
+    return count;
+
+err:
+    /* Entries [0, i) own a duplicated name; free them, then the array. */
+    while (i-- > 0)
+        GF_FREE((void *)dirents[i].name);
+    GF_FREE(dirents);
+
+    return -1;
+}
+
+static struct meta_ops subvolumes_dir_ops = {.dir_fill = subvolumes_dir_fill};
+
+/*
+ * Lookup hook for the "subvolumes" directory of an xlator directory: the new
+ * inode inherits the parent's meta context and gets subvolumes_dir_ops.
+ * frame and xdata are unused. Always returns 0.
+ */
+int
+meta_subvolumes_dir_hook(call_frame_t *frame, xlator_t *this, loc_t *loc,
+                         dict_t *xdata)
+{
+    meta_ctx_set(loc->inode, this, meta_ctx_get(loc->parent, this));
+
+    meta_ops_set(loc->inode, this, &subvolumes_dir_ops);
+
+    return 0;
+}
diff --git a/xlators/meta/src/top-link.c b/xlators/meta/src/top-link.c
new file mode 100644
index 00000000000..33f0d407411
--- /dev/null
+++ b/xlators/meta/src/top-link.c
@@ -0,0 +1,50 @@
+/*
+   Copyright (c) 2014 Red Hat, Inc. <http://www.redhat.com>
+   This file is part of GlusterFS.
+
+   This file is licensed to you under your choice of the GNU Lesser
+   General Public License, version 3 or any later version (LGPLv3 or
+   later), or the GNU General Public License, version 2 (GPLv2), in all
+   cases as published by the Free Software Foundation.
+*/
+
+#include <glusterfs/xlator.h>
+#include <glusterfs/defaults.h>
+
+#include "meta-mem-types.h"
+#include "meta.h"
+
+/*
+ * link_fill callback: the link target is the name of the top xlator of the
+ * graph stored as the inode's meta context. Always returns 0.
+ */
+static int
+top_link_fill(xlator_t *this, inode_t *inode, strfd_t *strfd)
+{
+    glusterfs_graph_t *graph = meta_ctx_get(inode, this);
+    xlator_t *top = (xlator_t *)graph->top;
+
+    strprintf(strfd, "%s", top->name);
+
+    return 0;
+}
+
+struct meta_ops top_link_ops = {.link_fill = top_link_fill};
+
+/*
+ * Lookup hook: install top_link_ops on loc->inode and let it share the
+ * parent inode's meta context. frame and xdata are unused. Returns 0.
+ */
+int
+meta_top_link_hook(call_frame_t *frame, xlator_t *this, loc_t *loc,
+                   dict_t *xdata)
+{
+    void *parent_ctx = NULL;
+
+    meta_ops_set(loc->inode, this, &top_link_ops);
+
+    parent_ctx = meta_ctx_get(loc->parent, this);
+    meta_ctx_set(loc->inode, this, parent_ctx);
+
+    return 0;
+}
diff --git a/xlators/meta/src/tree.c b/xlators/meta/src/tree.c
deleted file mode 100644
index 8eb982a6a49..00000000000
--- a/xlators/meta/src/tree.c
+++ /dev/null
@@ -1,176 +0,0 @@
-/*
-   Copyright (c) 2006-2009 Z RESEARCH, Inc. <http://www.zresearch.com>
-   This file is part of GlusterFS.
-
-   GlusterFS is free software; you can redistribute it and/or modify
-   it under the terms of the GNU General Public License as published
-   by the Free Software Foundation; either version 3 of the License,
-   or (at your option) any later version.
-
-   GlusterFS is distributed in the hope that it will be useful, but
-   WITHOUT ANY WARRANTY; without even the implied warranty of
-   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
-   General Public License for more details.
-
-   You should have received a copy of the GNU General Public License
-   along with this program.  If not, see
-   <http://www.gnu.org/licenses/>.
-*/
-
-#ifndef _CONFIG_H
-#define _CONFIG_H
-#include "config.h"
-#endif
-
-#include <sys/stat.h>
-#include <sys/types.h>
-#include <unistd.h>
-#include <string.h>
-
-#include "glusterfs.h"
-#include "xlator.h"
-
-#include "meta.h"
-
-static int
-is_meta_path (const char *path)
-{
-  while (*path == '/')
-    path++;
-  if (!strncmp (path, ".meta", strlen (".meta")))
-    return 1;
-  return 0;
-}
-
-struct stat *
-new_stbuf (void)
-{
-  static int next_inode = 0;
-  struct stat *stbuf = CALLOC (1, sizeof (struct stat));
-  
-  ERR_ABORT (stbuf);
-
-  stbuf->st_dev = 0;
-  stbuf->st_ino = next_inode++;
-  stbuf->st_mode = S_IRUSR | S_IRGRP | S_IROTH;
-  stbuf->st_nlink = 1;
-  stbuf->st_uid = 0;
-  stbuf->st_gid = 0;
-  stbuf->st_rdev = 0;
-  stbuf->st_size = 0;
-  stbuf->st_blksize = 0;
-  stbuf->st_blocks = 0;
-  stbuf->st_atime = time (NULL);
-  stbuf->st_atim.tv_nsec = 0;
-  stbuf->st_mtime = stbuf->st_atime;
-  stbuf->st_mtim.tv_nsec = 0;
-  stbuf->st_ctime = stbuf->st_ctime;
-  stbuf->st_ctim.tv_nsec = 0;
-
-  return stbuf;
-}
-
-/* find an entry among the siblings of an entry */
-static meta_dirent_t *
-find_entry (meta_dirent_t *node, const char *dir)
-{
-  meta_dirent_t *trav = node;
-  while (trav) {
-    if (!strcmp (trav->name, dir))
-      return trav;
-    trav = trav->next;
-  }
-  return NULL;
-}
-
-/*
- * Return the meta_dirent_t corresponding to the pathname.
- *
- * If pathname does not exist in the meta tree, try to return
- * its highest parent that does exist. The part of the
- * pathname that is left over is returned in the value-result
- * variable {remain}.
- * For example, for "/.meta/xlators/brick1/view/foo/bar/baz",
- * return the entry for "/.meta/xlators/brick1/view"
- * and set remain to "/bar/baz"
- */
-
-meta_dirent_t *
-lookup_meta_entry (meta_dirent_t *root, const char *path,
-		   char **remain)
-{
-  char *_path = strdup (path);
-
-  if (!is_meta_path (path))
-    return NULL;
-
-  meta_dirent_t *trav = root;
-  char *dir = strtok (_path, "/");
-  dir = strtok (NULL, "/");
-
-  while (dir) {
-    meta_dirent_t *ntrav;
-    ntrav = find_entry (trav->children, dir);
-    if (!ntrav) {
-      /* we have reached bottom of the meta tree. 
-         Unknown dragons lie further below */
-      if (remain) {
-	char *piece = dir;
-	while (piece) {
-	  char *tmp = *remain;
-	  if (*remain)
-	    asprintf (remain, "/%s/%s", *remain, piece);
-	  else
-	    asprintf (remain, "/%s", piece);
-	  if (tmp) free (tmp);
-	  piece = strtok (NULL, "/");
-	}
-      }
-      return trav;
-    }
-    dir = strtok (NULL, "/");
-    trav = ntrav;
-  }
-
-  free (_path);
-  return trav;
-}
-
-meta_dirent_t *
-insert_meta_entry (meta_dirent_t *root, const char *path,
-		   int type, struct stat *stbuf, struct xlator_fops *fops)
-{
-  if (!is_meta_path (path))
-    return NULL;
-  char *slashpos = strrchr (path, '/');
-  char *dir = strndup (path, slashpos - path);
-  meta_dirent_t *parent = lookup_meta_entry (root, dir, NULL);
-  if (!dir)
-    return NULL;
-
-  meta_dirent_t *new = CALLOC (1, sizeof (meta_dirent_t));
-  ERR_ABORT (new);
-  new->name        = strdup (slashpos+1);
-  new->type        = type;
-  new->parent      = parent;
-  new->next        = parent->children;
-  parent->children = new;
-  if (stbuf)
-    new->stbuf     = stbuf;
-  else 
-    new->stbuf     = new_stbuf ();
-
-  new->stbuf->st_mode |= type;
-  new->fops        = fops;
-  return new;
-}
-
-int main (void)
-{
-  meta_dirent_t *root = CALLOC (1, sizeof (meta_dirent_t));
-  ERR_ABORT (root);
-  root->name = strdup (".meta");
-
-  insert_meta_entry (root, "/.meta/version", S_IFREG, NULL, NULL);
-  return 0;
-}
diff --git a/xlators/meta/src/tree.h b/xlators/meta/src/tree.h
deleted file mode 100644
index 15a9b3ad399..00000000000
--- a/xlators/meta/src/tree.h
+++ /dev/null
@@ -1,35 +0,0 @@
-/*
-   Copyright (c) 2006-2009 Z RESEARCH, Inc. <http://www.zresearch.com>
-   This file is part of GlusterFS.
-
-   GlusterFS is free software; you can redistribute it and/or modify
-   it under the terms of the GNU General Public License as published
-   by the Free Software Foundation; either version 3 of the License,
-   or (at your option) any later version.
-
-   GlusterFS is distributed in the hope that it will be useful, but
-   WITHOUT ANY WARRANTY; without even the implied warranty of
-   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
-   General Public License for more details.
-
-   You should have received a copy of the GNU General Public License
-   along with this program.  If not, see
-   <http://www.gnu.org/licenses/>.
-*/
-
-#ifndef __TREE_H__
-#define __TREE_H__
-
-#ifndef _CONFIG_H
-#define _CONFIG_H
-#include "config.h"
-#endif
-
-meta_dirent_t *
-insert_meta_entry (meta_dirent_t *root, const char *path,
-		   int type, struct stat *stbuf, struct xlator_fops *fops);
-meta_dirent_t *
-lookup_meta_entry (meta_dirent_t *root, const char *path, 
-		   char **remain);
-
-#endif /* __TREE_H__ */
diff --git a/xlators/meta/src/type-file.c b/xlators/meta/src/type-file.c
new file mode 100644
index 00000000000..ece342a0b2a
--- /dev/null
+++ b/xlators/meta/src/type-file.c
@@ -0,0 +1,52 @@
+/*
+   Copyright (c) 2014 Red Hat, Inc. <http://www.redhat.com>
+   This file is part of GlusterFS.
+
+   This file is licensed to you under your choice of the GNU Lesser
+   General Public License, version 3 or any later version (LGPLv3 or
+   later), or the GNU General Public License, version 2 (GPLv2), in all
+   cases as published by the Free Software Foundation.
+*/
+
+#include <glusterfs/xlator.h>
+#include <glusterfs/defaults.h>
+
+#include "meta-mem-types.h"
+#include "meta.h"
+#include <glusterfs/strfd.h>
+#include <glusterfs/lkowner.h>
+
+/*
+ * file_fill callback: emit the type string of the xlator stored as the
+ * inode's meta context, followed by a newline. Returns strfd->size.
+ */
+static int
+type_file_fill(xlator_t *this, inode_t *file, strfd_t *strfd)
+{
+    xlator_t *target = meta_ctx_get(file, this);
+
+    strprintf(strfd, "%s\n", target->type);
+
+    return strfd->size;
+}
+
+static struct meta_ops type_file_ops = {.file_fill = type_file_fill};
+
+/*
+ * Lookup hook for a "type" file: install type_file_ops on loc->inode and
+ * reuse the parent inode's meta context. frame and xdata are unused.
+ * Always returns 0.
+ */
+int
+meta_type_file_hook(call_frame_t *frame, xlator_t *this, loc_t *loc,
+                    dict_t *xdata)
+{
+    void *parent_ctx = NULL;
+
+    meta_ops_set(loc->inode, this, &type_file_ops);
+
+    parent_ctx = meta_ctx_get(loc->parent, this);
+    meta_ctx_set(loc->inode, this, parent_ctx);
+
+    return 0;
+}
diff --git a/xlators/meta/src/version-file.c b/xlators/meta/src/version-file.c
new file mode 100644
index 00000000000..36276fb810a
--- /dev/null
+++ b/xlators/meta/src/version-file.c
@@ -0,0 +1,44 @@
+/*
+   Copyright (c) 2014 Red Hat, Inc. <http://www.redhat.com>
+   This file is part of GlusterFS.
+
+   This file is licensed to you under your choice of the GNU Lesser
+   General Public License, version 3 or any later version (LGPLv3 or
+   later), or the GNU General Public License, version 2 (GPLv2), in all
+   cases as published by the Free Software Foundation.
+*/
+
+#include <glusterfs/xlator.h>
+#include <glusterfs/defaults.h>
+
+#include "meta-mem-types.h"
+#include "meta.h"
+#include <glusterfs/strfd.h>
+#include <glusterfs/lkowner.h>
+
+/*
+ * file_fill callback: emit a small JSON-style object whose single
+ * "Package Version" member is PACKAGE_VERSION. Returns strfd->size.
+ */
+static int
+version_file_fill(xlator_t *this, inode_t *file, strfd_t *strfd)
+{
+    strprintf(strfd, "{ \n  \"Package Version\": \"%s\"\n}", PACKAGE_VERSION);
+
+    return strfd->size;
+}
+
+static struct meta_ops version_file_ops = {.file_fill = version_file_fill};
+
+/*
+ * Lookup hook for the version file: install version_file_ops on loc->inode.
+ * No meta context is set. frame and xdata are unused. Always returns 0.
+ */
+int
+meta_version_file_hook(call_frame_t *frame, xlator_t *this, loc_t *loc,
+                       dict_t *xdata)
+{
+    meta_ops_set(loc->inode, this, &version_file_ops);
+
+    return 0;
+}
diff --git a/xlators/meta/src/view-dir.c b/xlators/meta/src/view-dir.c
new file mode 100644
index 00000000000..30931061567
--- /dev/null
+++ b/xlators/meta/src/view-dir.c
@@ -0,0 +1,43 @@
+/*
+   Copyright (c) 2014 Red Hat, Inc. <http://www.redhat.com>
+   This file is part of GlusterFS.
+
+   This file is licensed to you under your choice of the GNU Lesser
+   General Public License, version 3 or any later version (LGPLv3 or
+   later), or the GNU General Public License, version 2 (GPLv2), in all
+   cases as published by the Free Software Foundation.
+*/
+
+#include <glusterfs/xlator.h>
+#include <glusterfs/defaults.h>
+
+#include "meta-mem-types.h"
+#include "meta.h"
+#include "meta-hooks.h"
+
+/* Only the DOT_DOTDOT macro's entries; the NULL name ends the list. */
+static struct meta_dirent view_dir_dirents[] = {
+    DOT_DOTDOT,
+
+    {.name = NULL},
+};
+
+static struct meta_ops view_dir_ops = {.fixed_dirents = view_dir_dirents};
+
+/*
+ * Lookup hook for a "view" directory: loc->inode shares the parent inode's
+ * meta context and gets view_dir_ops. frame and xdata are unused.
+ * Always returns 0.
+ */
+int
+meta_view_dir_hook(call_frame_t *frame, xlator_t *this, loc_t *loc,
+                   dict_t *xdata)
+{
+    void *parent_ctx = meta_ctx_get(loc->parent, this);
+
+    meta_ctx_set(loc->inode, this, parent_ctx);
+
+    meta_ops_set(loc->inode, this, &view_dir_ops);
+
+    return 0;
+}
diff --git a/xlators/meta/src/view.c b/xlators/meta/src/view.c
deleted file mode 100644
index 26f1b6e0382..00000000000
--- a/xlators/meta/src/view.c
+++ /dev/null
@@ -1,258 +0,0 @@
-/*
-   Copyright (c) 2006-2009 Z RESEARCH, Inc. <http://www.zresearch.com>
-   This file is part of GlusterFS.
-
-   GlusterFS is free software; you can redistribute it and/or modify
-   it under the terms of the GNU General Public License as published
-   by the Free Software Foundation; either version 3 of the License,
-   or (at your option) any later version.
-
-   GlusterFS is distributed in the hope that it will be useful, but
-   WITHOUT ANY WARRANTY; without even the implied warranty of
-   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
-   General Public License for more details.
-
-   You should have received a copy of the GNU General Public License
-   along with this program.  If not, see
-   <http://www.gnu.org/licenses/>.
-*/
-
-#ifndef _CONFIG_H
-#define _CONFIG_H
-#include "config.h"
-#endif
-
-#include "glusterfs.h"
-#include "xlator.h"
-
-#include "meta.h"
-
-/* 
- * This file contains fops for the files and directories in 
- * an xlator directory
- */
-
-/* /.meta/xlators/.../type */
-
-int32_t
-meta_xlator_type_readv (call_frame_t *frame, xlator_t *this,
-			dict_t *fd, size_t size, off_t offset)
-{
-  meta_private_t *priv = (meta_private_t *) this->private;
-  meta_dirent_t *root = priv->tree;
-  data_t *path_data = dict_get (fd, this->name);
-
-  if (path_data) {
-    const char *path = data_to_str (path_data);
-    meta_dirent_t *file = lookup_meta_entry (root, path, NULL);
-    xlator_t *view_xlator = file->view_xlator;
-
-    int type_size;
-    type_size = strlen (view_xlator->type);
-  
-    struct iovec vec;
-    vec.iov_base = view_xlator->type + offset;
-    vec.iov_len  = min (type_size - offset, size);
-
-    STACK_UNWIND (frame, vec.iov_len, 0, &vec, 1);
-    return 0;
-  }
-}
-
-int32_t
-meta_xlator_type_getattr (call_frame_t *frame, 
-			  xlator_t *this,
-			  const char *path)
-{
-  meta_private_t *priv = (meta_private_t *) this->private;
-  meta_dirent_t *root = priv->tree;
-
-  meta_dirent_t *file = lookup_meta_entry (root, path, NULL);
-  xlator_t *view_xlator = file->view_xlator;
-  file->stbuf->st_size = strlen (view_xlator->type);
-
-  STACK_UNWIND (frame, 0, 0, file->stbuf);
-  return 0;
-}
-
-struct xlator_fops meta_xlator_type_fops = {
-  .readv   = meta_xlator_type_readv,
-  .getattr = meta_xlator_type_getattr
-};
-
-/* 
- * fops for the "view" directory
- * {xlator}/view shows the filesystem as it appears
- * to {xlator}
- */
-
-static int32_t
-meta_xlator_view_getattr_cbk (call_frame_t *frame, void *cookie,
-			      xlator_t *this, int32_t op_ret, int32_t op_errno,
-			      struct stat *buf)
-{
-  STACK_UNWIND (frame, op_ret, op_errno, buf);
-  return 0;
-}
-
-int32_t
-meta_xlator_view_getattr (call_frame_t *frame, 
-			  xlator_t *this,
-			  const char *path)
-{
-  meta_private_t *priv = (meta_private_t *) this->private;
-  meta_dirent_t *root = priv->tree;
-  char *op_path = NULL;
-
-  meta_dirent_t *file = lookup_meta_entry (root, path, &op_path);
-
-  if (op_path) {
-    STACK_WIND (frame, meta_xlator_view_getattr_cbk, file->view_xlator,
-		file->view_xlator->fops->getattr,
-		op_path);
-  }
-  else {
-    STACK_UNWIND (frame, 0, 0, file->stbuf);
-  }
-
-  return 0;
-}
-
-static int32_t
-meta_xlator_view_readdir_cbk (call_frame_t *frame, void *cookie,
-			      xlator_t *this, int32_t op_ret, int32_t op_errno,
-			      dir_entry_t *entries, int32_t count)
-{
-  STACK_UNWIND (frame, op_ret, op_errno, entries, count);
-  return 0;
-}
-
-int32_t
-meta_xlator_view_readdir (call_frame_t *frame,
-			  xlator_t *this,
-			  const char *path)
-{
-  meta_private_t *priv = (meta_private_t *) this->private;
-  meta_dirent_t *root = priv->tree;
-  char *op_path = NULL;
-
-  meta_dirent_t *dir = lookup_meta_entry (root, path, &op_path);
-
-  STACK_WIND (frame, meta_xlator_view_readdir_cbk, 
-	      dir->view_xlator, dir->view_xlator->fops->readdir,
-	      op_path ? op_path : "/");
-  return 0;
-}
-
-static int32_t
-meta_xlator_view_open_cbk (call_frame_t *frame, void *cookie,
-			   xlator_t *this, 
-			   int32_t op_ret, int32_t op_errno,
-			   dict_t *ctx, struct stat *buf)
-{
-  STACK_UNWIND (frame, op_ret, op_errno, ctx, buf);
-  return 0;
-}
-
-int32_t
-meta_xlator_view_open (call_frame_t *frame, xlator_t *this,
-		       const char *path, int32_t flags, mode_t mode)
-{
-  meta_private_t *priv = (meta_private_t *) this->private;
-  meta_dirent_t *root = priv->tree;
-  char *op_path = NULL;
-
-  meta_dirent_t *file = lookup_meta_entry (root, path, &op_path);
-  STACK_WIND (frame, meta_xlator_view_open_cbk,
-	      file->view_xlator, file->view_xlator->fops->open,
-	      op_path, flags, mode);
-  return 0;
-}
-
-int32_t
-meta_xlator_view_create (call_frame_t *frame, xlator_t *this,
-			 const char *path, int32_t flags, mode_t mode)
-{
-  meta_private_t *priv = (meta_private_t *) this->private;
-  meta_dirent_t *root = priv->tree;
-  char *op_path = NULL;
-
-  meta_dirent_t *file = lookup_meta_entry (root, path, &op_path);
-  STACK_WIND (frame, meta_xlator_view_open_cbk,
-	      file->view_xlator, file->view_xlator->fops->create,
-	      op_path, flags, mode);
-  return 0;
-}
-
-static int32_t
-meta_xlator_view_readv_cbk (call_frame_t *frame, void *cookie,
-			    xlator_t *this, int32_t op_ret,
-			    int32_t op_errno, struct iovec *vector,
-			    int32_t count)
-{
-  STACK_UNWIND (frame, op_ret, op_errno, vector, count);
-  return 0;
-}
-
-int32_t
-meta_xlator_view_readv (call_frame_t *frame, xlator_t *this,
-			dict_t *fd, size_t size, off_t offset)
-{
-  meta_private_t *priv = (meta_private_t *) this->private;
-  meta_dirent_t *root = priv->tree;
-  data_t *path_data = dict_get (fd, this->name);
-
-  if (path_data) {
-    const char *path = data_to_str (path_data);
-    meta_dirent_t *file = lookup_meta_entry (root, path, NULL);
-
-    STACK_WIND (frame, meta_xlator_view_readv_cbk,
-		file->view_xlator, file->view_xlator->fops->readv,
-		fd, size, offset);
-    return 0;
-  }
-
-  STACK_UNWIND (frame, -1, EBADFD, NULL, 0);
-  return 0;
-}
-
-static int32_t
-meta_xlator_view_writev_cbk (call_frame_t *frame, void *cookie,
-			     xlator_t *this, int32_t op_ret,
-			     int32_t op_errno)
-{
-  STACK_UNWIND (frame, op_ret, op_errno);
-  return 0;
-}
-
-int32_t
-meta_xlator_view_writev (call_frame_t *frame, xlator_t *this,
-			 dict_t *fd, 
-			 struct iovec *vector, int32_t count, off_t offset)
-{
-  meta_private_t *priv = (meta_private_t *) this->private;
-  meta_dirent_t *root = priv->tree;
-  data_t *path_data = dict_get (fd, this->name);
-
-  if (path_data) {
-    const char *path = data_to_str (path_data);
-    meta_dirent_t *file = lookup_meta_entry (root, path, NULL);
-
-    STACK_WIND (frame, meta_xlator_view_writev_cbk,
-		file->view_xlator, file->view_xlator->fops->writev,
-		fd, vector, count, offset);
-    return 0;
-  }
-
-  STACK_UNWIND (frame, -1, EBADFD, NULL, 0);
-  return 0;
-}
-
-struct xlator_fops meta_xlator_view_fops = {
-  .getattr = meta_xlator_view_getattr,
-  .readdir = meta_xlator_view_readdir,
-  .open    = meta_xlator_view_open,
-  .create  = meta_xlator_view_create,
-  .readv   = meta_xlator_view_readv,
-  .writev  = meta_xlator_view_writev
-};
diff --git a/xlators/meta/src/view.h b/xlators/meta/src/view.h
deleted file mode 100644
index c7c4f923efe..00000000000
--- a/xlators/meta/src/view.h
+++ /dev/null
@@ -1,32 +0,0 @@
-/*
-   Copyright (c) 2006-2009 Z RESEARCH, Inc. <http://www.zresearch.com>
-   This file is part of GlusterFS.
-
-   GlusterFS is free software; you can redistribute it and/or modify
-   it under the terms of the GNU General Public License as published
-   by the Free Software Foundation; either version 3 of the License,
-   or (at your option) any later version.
-
-   GlusterFS is distributed in the hope that it will be useful, but
-   WITHOUT ANY WARRANTY; without even the implied warranty of
-   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
-   General Public License for more details.
-
-   You should have received a copy of the GNU General Public License
-   along with this program.  If not, see
-   <http://www.gnu.org/licenses/>.
-*/
-
-#ifndef __VIEW_H__
-#define __VIEW_H__
-
-#ifndef _CONFIG_H
-#define _CONFIG_H
-#include "config.h"
-#endif
-
-
-struct xlator_fops meta_xlator_type_fops;
-struct xlator_fops meta_xlator_view_fops;
-
-#endif /* __VIEW_H__ */
diff --git a/xlators/meta/src/volfile-file.c b/xlators/meta/src/volfile-file.c
new file mode 100644
index 00000000000..b2e2562ab8b
--- /dev/null
+++ b/xlators/meta/src/volfile-file.c
@@ -0,0 +1,100 @@
+/*
+   Copyright (c) 2014 Red Hat, Inc. <http://www.redhat.com>
+   This file is part of GlusterFS.
+
+   This file is licensed to you under your choice of the GNU Lesser
+   General Public License, version 3 or any later version (LGPLv3 or
+   later), or the GNU General Public License, version 2 (GPLv2), in all
+   cases as published by the Free Software Foundation.
+*/
+
+#include <glusterfs/xlator.h>
+#include <glusterfs/defaults.h>
+
+#include "meta-mem-types.h"
+#include "meta.h"
+#include <glusterfs/strfd.h>
+
+/* dict_foreach() callback: print one "option <key> <value>" line. */
+static int
+xldump_options(dict_t *this, char *key, data_t *value, void *strfd)
+{
+    strprintf(strfd, "    option %s %s\n", key, value->data);
+
+    return 0;
+}
+
+/*
+ * Print the "subvolumes" line naming every child of the given xlator;
+ * nothing is printed for an xlator without children.
+ */
+static void
+xldump_subvolumes(xlator_t *this, void *strfd)
+{
+    xlator_list_t *child = this->children;
+
+    if (child == NULL)
+        return;
+
+    strprintf(strfd, "    subvolumes");
+
+    while (child != NULL) {
+        strprintf(strfd, " %s", child->xlator->name);
+        child = child->next;
+    }
+
+    strprintf(strfd, "\n");
+}
+
+/*
+ * Print one "volume ... end-volume" stanza for an xlator: name, type,
+ * options and subvolumes, followed by an empty line.
+ */
+static void
+xldump(xlator_t *each, void *strfd)
+{
+    strprintf(strfd, "volume %s\n", each->name);
+    strprintf(strfd, "    type %s\n", each->type);
+
+    dict_foreach(each->options, xldump_options, strfd);
+    xldump_subvolumes(each, strfd);
+
+    strprintf(strfd, "end-volume\n");
+    strprintf(strfd, "\n");
+}
+
+/*
+ * file_fill callback: dump every xlator reached by
+ * xlator_foreach_depth_first() from the top of the graph stored as the
+ * inode's meta context. Returns strfd->size.
+ */
+static int
+volfile_file_fill(xlator_t *this, inode_t *file, strfd_t *strfd)
+{
+    glusterfs_graph_t *graph = meta_ctx_get(file, this);
+
+    xlator_foreach_depth_first(graph->top, xldump, strfd);
+
+    return strfd->size;
+}
+
+static struct meta_ops volfile_file_ops = {.file_fill = volfile_file_fill};
+
+/*
+ * Lookup hook for the volfile file: install volfile_file_ops on loc->inode
+ * and reuse the parent inode's meta context. frame and xdata are unused.
+ * Always returns 0.
+ */
+int
+meta_volfile_file_hook(call_frame_t *frame, xlator_t *this, loc_t *loc,
+                       dict_t *xdata)
+{
+    void *parent_ctx = NULL;
+
+    meta_ops_set(loc->inode, this, &volfile_file_ops);
+
+    parent_ctx = meta_ctx_get(loc->parent, this);
+    meta_ctx_set(loc->inode, this, parent_ctx);
+
+    return 0;
+}
diff --git a/xlators/meta/src/xlator-dir.c b/xlators/meta/src/xlator-dir.c
new file mode 100644
index 00000000000..86189715790
--- /dev/null
+++ b/xlators/meta/src/xlator-dir.c
@@ -0,0 +1,71 @@
+/*
+   Copyright (c) 2014 Red Hat, Inc. <http://www.redhat.com>
+   This file is part of GlusterFS.
+
+   This file is licensed to you under your choice of the GNU Lesser
+   General Public License, version 3 or any later version (LGPLv3 or
+   later), or the GNU General Public License, version 2 (GPLv2), in all
+   cases as published by the Free Software Foundation.
+*/
+
+#include <glusterfs/xlator.h>
+#include <glusterfs/defaults.h>
+
+#include "meta-mem-types.h"
+#include "meta.h"
+#include "meta-hooks.h"
+
+/* Fixed entries of every xlator directory; the NULL name ends the list. */
+static struct meta_dirent xlator_dir_dirents[] = {
+    DOT_DOTDOT,
+
+    {.name = "view", .type = IA_IFDIR, .hook = meta_view_dir_hook},
+    {.name = "type", .type = IA_IFREG, .hook = meta_type_file_hook},
+    {.name = "name", .type = IA_IFREG, .hook = meta_name_file_hook},
+    {.name = "subvolumes", .type = IA_IFDIR, .hook = meta_subvolumes_dir_hook},
+    {.name = "options", .type = IA_IFDIR, .hook = meta_options_dir_hook},
+    {.name = "private", .type = IA_IFREG, .hook = meta_private_file_hook},
+    {.name = "history", .type = IA_IFREG, .hook = meta_history_file_hook},
+    {.name = "meminfo", .type = IA_IFREG, .hook = meta_meminfo_file_hook},
+    {.name = "profile", .type = IA_IFREG, .hook = meta_profile_file_hook},
+    {.name = NULL},
+};
+
+static struct meta_ops xlator_dir_ops = {.fixed_dirents = xlator_dir_dirents};
+
+/*
+ * Lookup hook for a per-xlator directory: loc->name is searched, via
+ * xlator_search_by_name(), starting at the first xlator of the graph stored
+ * as the parent inode's meta context. The result becomes the meta context of
+ * loc->inode, which also gets xlator_dir_ops. frame and xdata are unused.
+ * Always returns 0.
+ */
+int
+meta_xlator_dir_hook(call_frame_t *frame, xlator_t *this, loc_t *loc,
+                     dict_t *xdata)
+{
+    glusterfs_graph_t *graph = meta_ctx_get(loc->parent, this);
+    xlator_t *found = xlator_search_by_name(graph->first, loc->name);
+
+    meta_ctx_set(loc->inode, this, found);
+
+    meta_ops_set(loc->inode, this, &xlator_dir_ops);
+
+    return 0;
+}
+
+/*
+ * Lookup hook for the master xlator's directory: same layout as any other
+ * xlator directory, with this->ctx->master as the meta context.
+ * frame and xdata are unused. Always returns 0.
+ */
+int
+meta_master_dir_hook(call_frame_t *frame, xlator_t *this, loc_t *loc,
+                     dict_t *xdata)
+{
+    meta_ctx_set(loc->inode, this, this->ctx->master);
+
+    meta_ops_set(loc->inode, this, &xlator_dir_ops);
+
+    return 0;
+}
diff --git a/xlators/mgmt/Makefile.am b/xlators/mgmt/Makefile.am
new file mode 100644
index 00000000000..bf09b07c309
--- /dev/null
+++ b/xlators/mgmt/Makefile.am
@@ -0,0 +1,3 @@
+SUBDIRS = glusterd
+
+CLEANFILES = 
diff --git a/xlators/cluster/stripe/Makefile.am b/xlators/mgmt/glusterd/Makefile.am
index d471a3f9243..d471a3f9243 100644
--- a/xlators/cluster/stripe/Makefile.am
+++ b/xlators/mgmt/glusterd/Makefile.am
diff --git a/xlators/mgmt/glusterd/src/Makefile.am b/xlators/mgmt/glusterd/src/Makefile.am
new file mode 100644
index 00000000000..685beb42d27
--- /dev/null
+++ b/xlators/mgmt/glusterd/src/Makefile.am
@@ -0,0 +1,79 @@
+if WITH_SERVER
+xlator_LTLIBRARIES = glusterd.la
+endif
+
+xlatordir = $(libdir)/glusterfs/$(PACKAGE_VERSION)/xlator/mgmt
+glusterd_la_CPPFLAGS = $(AM_CPPFLAGS) \
+	-DFILTERDIR=\"$(libdir)/glusterfs/$(PACKAGE_VERSION)/filter\" \
+	-DXLATORDIR=\"$(libdir)/glusterfs/$(PACKAGE_VERSION)/xlator\" \
+	-I$(top_srcdir)/libglusterd/src/
+
+glusterd_la_LDFLAGS = -module $(GF_XLATOR_DEFAULT_LDFLAGS)
+glusterd_la_SOURCES = glusterd.c glusterd-handler.c glusterd-sm.c \
+	glusterd-op-sm.c glusterd-utils.c glusterd-rpc-ops.c \
+	glusterd-store.c glusterd-handshake.c glusterd-pmap.c \
+	glusterd-volgen.c glusterd-rebalance.c \
+	glusterd-quota.c glusterd-bitrot.c glusterd-geo-rep.c \
+	glusterd-replace-brick.c glusterd-log-ops.c \
+	glusterd-volume-ops.c glusterd-brick-ops.c glusterd-mountbroker.c \
+	glusterd-syncop.c glusterd-hooks.c glusterd-volume-set.c \
+	glusterd-locks.c glusterd-snapshot.c glusterd-mgmt-handler.c \
+	glusterd-mgmt.c glusterd-peer-utils.c glusterd-statedump.c \
+	glusterd-snapshot-utils.c glusterd-conn-mgmt.c \
+	glusterd-proc-mgmt.c glusterd-svc-mgmt.c \
+	glusterd-nfs-svc.c glusterd-quotad-svc.c glusterd-svc-helper.c \
+	glusterd-conn-helper.c glusterd-snapd-svc.c glusterd-snapd-svc-helper.c \
+	glusterd-bitd-svc.c glusterd-scrub-svc.c glusterd-server-quorum.c \
+	glusterd-reset-brick.c glusterd-shd-svc.c glusterd-shd-svc-helper.c \
+        glusterd-gfproxyd-svc.c glusterd-gfproxyd-svc-helper.c glusterd-ganesha.c \
+	$(CONTRIBDIR)/mount/mntent.c
+
+glusterd_la_LIBADD = $(top_builddir)/libglusterfs/src/libglusterfs.la \
+	$(top_builddir)/libglusterd/src/libglusterd.la \
+	$(top_builddir)/rpc/xdr/src/libgfxdr.la \
+	$(top_builddir)/rpc/rpc-lib/src/libgfrpc.la \
+	$(XML_LIBS) -lcrypto $(URCU_LIBS) $(URCU_CDS_LIBS) $(LIB_DL) $(GF_XLATOR_MGNT_LIBADD)
+
+noinst_HEADERS = glusterd.h glusterd-utils.h glusterd-op-sm.h \
+	glusterd-sm.h glusterd-store.h glusterd-mem-types.h \
+	glusterd-pmap.h glusterd-volgen.h glusterd-mountbroker.h \
+	glusterd-syncop.h glusterd-hooks.h glusterd-locks.h glusterd-quota.h \
+	glusterd-mgmt.h glusterd-messages.h glusterd-peer-utils.h \
+	glusterd-statedump.h glusterd-snapshot-utils.h glusterd-geo-rep.h \
+	glusterd-conn-mgmt.h glusterd-conn-helper.h glusterd-proc-mgmt.h \
+	glusterd-svc-mgmt.h glusterd-nfs-svc.h \
+	glusterd-quotad-svc.h glusterd-svc-helper.h glusterd-snapd-svc.h \
+	glusterd-snapd-svc-helper.h glusterd-rcu.h glusterd-bitd-svc.h \
+	glusterd-scrub-svc.h glusterd-server-quorum.h glusterd-errno.h \
+        glusterd-shd-svc.h glusterd-shd-svc-helper.h \
+        glusterd-gfproxyd-svc.h glusterd-gfproxyd-svc-helper.h \
+	$(CONTRIBDIR)/userspace-rcu/rculist-extra.h \
+	$(CONTRIBDIR)/mount/mntent_compat.h
+
+AM_CPPFLAGS = $(GF_CPPFLAGS) -I$(top_srcdir)/libglusterfs/src \
+	-I$(top_srcdir)/rpc/xdr/src -I$(top_builddir)/rpc/xdr/src \
+	-I$(CONTRIBDIR)/rbtree -I$(top_srcdir)/rpc/rpc-lib/src \
+	-I$(CONTRIBDIR)/mount -I$(CONTRIBDIR)/userspace-rcu \
+	-DSBIN_DIR=\"$(sbindir)\" -DDATADIR=\"$(localstatedir)\" \
+	-DGSYNCD_PREFIX=\"$(GLUSTERFS_LIBEXECDIR)\" \
+	-DCONFDIR=\"$(localstatedir)/run/gluster/shared_storage/nfs-ganesha\" \
+	-DGANESHA_PREFIX=\"$(libexecdir)/ganesha\" \
+	-DSYNCDAEMON_COMPILE=$(SYNCDAEMON_COMPILE) \
+	-I$(top_srcdir)/libglusterd/src/
+
+
+AM_CFLAGS = -Wall $(GF_CFLAGS) $(URCU_CFLAGS) $(URCU_CDS_CFLAGS) $(XML_CFLAGS)
+
+AM_LDFLAGS = -L$(xlatordir) $(URCU_LIBS) $(URCU_CDS_LIBS)
+
+CLEANFILES =
+# Staged installs: the symlink lives under DESTDIR, its target does not.
+install-data-hook:
+if WITH_SERVER
+if GF_INSTALL_GLUSTERD_WORKDIR
+	$(mkdir_p) $(DESTDIR)$(GLUSTERD_WORKDIR)
+	(stat $(DESTDIR)$(sysconfdir)/glusterd && \
+	    mv $(DESTDIR)$(sysconfdir)/glusterd $(DESTDIR)$(GLUSTERD_WORKDIR)) || true;
+	(ln -sf $(GLUSTERD_WORKDIR) $(DESTDIR)$(sysconfdir)/glusterd) || true;
+endif
+endif
diff --git a/xlators/mgmt/glusterd/src/glusterd-bitd-svc.c b/xlators/mgmt/glusterd/src/glusterd-bitd-svc.c
new file mode 100644
index 00000000000..6adb799b18f
--- /dev/null
+++ b/xlators/mgmt/glusterd/src/glusterd-bitd-svc.c
@@ -0,0 +1,206 @@
+/*
+  Copyright (c) 2006-2012 Red Hat, Inc. <http://www.redhat.com>
+  This file is part of GlusterFS.
+
+  This file is licensed to you under your choice of the GNU Lesser
+  General Public License, version 3 or any later version (LGPLv3 or
+  later), or the GNU General Public License, version 2 (GPLv2), in all
+  cases as published by the Free Software Foundation.
+*/
+
+#include <glusterfs/globals.h>
+#include <glusterfs/run.h>
+#include "glusterd.h"
+#include "glusterd-utils.h"
+#include "glusterd-volgen.h"
+#include "glusterd-bitd-svc.h"
+#include "glusterd-svc-helper.h"
+
+/*
+ * Install the bitd-specific manager/start/stop callbacks on @svc.
+ */
+void
+glusterd_bitdsvc_build(glusterd_svc_t *svc)
+{
+    /* The three hooks are independent of each other. */
+    svc->stop = glusterd_bitdsvc_stop;
+    svc->start = glusterd_bitdsvc_start;
+    svc->manager = glusterd_bitdsvc_manager;
+}
+
+/*
+ * Initialise @svc as the bitd service by delegating to glusterd_svc_init()
+ * with bitd_svc_name. Returns whatever glusterd_svc_init() returns.
+ */
+int
+glusterd_bitdsvc_init(glusterd_svc_t *svc)
+{
+    return glusterd_svc_init(svc, bitd_svc_name);
+}
+
+/*
+ * Build the path of the bitd volfile below the glusterd working directory
+ * and (re)generate the volfile there from build_bitd_graph.
+ *
+ * Returns the status of glusterd_create_global_volfile().
+ */
+static int
+glusterd_bitdsvc_create_volfile()
+{
+    xlator_t *this = THIS;
+    glusterd_conf_t *conf = this->private;
+    char filepath[PATH_MAX] = "";
+    int ret = -1;
+
+    GF_ASSERT(conf);
+
+    glusterd_svc_build_volfile_path(bitd_svc_name, conf->workdir, filepath,
+                                    sizeof(filepath));
+
+    ret = glusterd_create_global_volfile(build_bitd_graph, filepath, NULL);
+    if (ret)
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_VOLFILE_CREATE_FAIL,
+               "Failed to create volfile");
+
+    gf_msg_debug(this->name, 0, "Returning %d", ret);
+    return ret;
+}
+
+/*
+ * Manager callback of the bitd service.
+ *
+ * Initialises @svc on first use. Then, if glusterd_should_i_stop_bitd()
+ * reports that the daemon is not needed, it is stopped with SIGTERM;
+ * otherwise the volfile is regenerated, the service is stopped with SIGKILL,
+ * started again with @flags and its connection is (re)established.
+ * @data is not used by this function.
+ *
+ * Returns the status of the last step executed (0 when all of them
+ * succeeded). Any failure additionally raises EVENT_SVC_MANAGER_FAILED.
+ */
+int
+glusterd_bitdsvc_manager(glusterd_svc_t *svc, void *data, int flags)
+{
+    int ret = 0;
+    xlator_t *this = NULL;
+
+    this = THIS;
+    GF_ASSERT(this);
+
+    /* Lazy one-time initialisation of the service object. */
+    if (!svc->inited) {
+        ret = glusterd_bitdsvc_init(svc);
+        if (ret) {
+            gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_BITD_INIT_FAIL,
+                   "Failed to init "
+                   "bitd service");
+            goto out;
+        } else {
+            svc->inited = _gf_true;
+            gf_msg_debug(this->name, 0,
+                         "BitD service "
+                         "initialized");
+        }
+    }
+
+    if (glusterd_should_i_stop_bitd()) {
+        ret = svc->stop(svc, SIGTERM);
+    } else {
+        /* Restart sequence: new volfile -> stop -> start -> connect. The
+         * first failing step aborts the rest. */
+        ret = glusterd_bitdsvc_create_volfile();
+        if (ret)
+            goto out;
+
+        ret = svc->stop(svc, SIGKILL);
+        if (ret)
+            goto out;
+
+        ret = svc->start(svc, flags);
+        if (ret)
+            goto out;
+
+        ret = glusterd_conn_connect(&(svc->conn));
+        if (ret)
+            goto out;
+    }
+
+out:
+    if (ret)
+        gf_event(EVENT_SVC_MANAGER_FAILED, "svc_name=%s", svc->name);
+
+    gf_msg_debug(THIS->name, 0, "Returning %d", ret);
+
+    return ret;
+}
+
+/*
+ * Start the bitd service with the extra command-line argument
+ * "--global-timer-wheel".
+ *
+ * Returns -1 if the argument dictionary cannot be allocated, the
+ * dict_set_str() error if the argument cannot be stored, otherwise the
+ * result of glusterd_svc_start().
+ */
+int
+glusterd_bitdsvc_start(glusterd_svc_t *svc, int flags)
+{
+    dict_t *cmdline = dict_new();
+    int ret = -1;
+
+    if (!cmdline)
+        return ret;
+
+    ret = dict_set_str(cmdline, "cmdarg0", "--global-timer-wheel");
+    if (!ret)
+        ret = glusterd_svc_start(svc, flags, cmdline);
+
+    /* The dictionary is only needed for the duration of the start call. */
+    dict_unref(cmdline);
+    return ret;
+}
+
+/*
+ * Stop the bitd service by delegating to glusterd_svc_stop() with signal
+ * @sig. Returns whatever glusterd_svc_stop() returns.
+ */
+int
+glusterd_bitdsvc_stop(glusterd_svc_t *svc, int sig)
+{
+    return glusterd_svc_stop(svc, sig);
+}
+
+/*
+ * Re-evaluate the bitd service after a configuration change.
+ *
+ *  - glusterd_should_i_stop_bitd() is true: hand over to the service manager.
+ *  - volfile unchanged: nothing to do, return 0.
+ *  - same topology, different options: rewrite the volfile and call
+ *    glusterd_fetchspec_notify().
+ *  - topology changed: hand over to the service manager with
+ *    PROC_START_NO_WAIT.
+ *
+ * Returns 0 on success, non-zero if any of the steps above failed.
+ */
+int
+glusterd_bitdsvc_reconfigure()
+{
+    int ret = -1;
+    xlator_t *this = NULL;
+    glusterd_conf_t *priv = NULL;
+    gf_boolean_t identical = _gf_false;
+
+    this = THIS;
+    GF_VALIDATE_OR_GOTO("glusterd", this, out);
+
+    priv = this->private;
+    GF_VALIDATE_OR_GOTO(this->name, priv, out);
+
+    /* Skip the volfile comparison when the manager will stop bitd anyway. */
+    if (glusterd_should_i_stop_bitd())
+        goto manager;
+    /*
+     * Check both OLD and NEW volfiles, if they are SAME by size
+     * and cksum i.e. "character-by-character". If YES, then
+     * NOTHING has been changed, just return.
+     */
+    ret = glusterd_svc_check_volfile_identical(priv->bitd_svc.name,
+                                               build_bitd_graph, &identical);
+    if (ret)
+        goto out;
+    if (identical) {
+        ret = 0;
+        goto out;
+    }
+
+    /*
+     * They are not identical. Find out if the topology is changed
+     * OR just the volume options. If just the options which got
+     * changed, then inform the xlator to reconfigure the options.
+     */
+    identical = _gf_false; /* RESET the FLAG */
+    ret = glusterd_svc_check_topology_identical(priv->bitd_svc.name,
+                                                build_bitd_graph, &identical);
+    if (ret)
+        goto out; /*not able to compare due to some corruption */
+
+    /* Topology is not changed, but just the options. But write the
+     * options to bitd volfile, so that bitd will be reconfigured.
+     */
+    if (identical) {
+        ret = glusterd_bitdsvc_create_volfile();
+        if (ret == 0) { /* Only if above PASSES */
+            ret = glusterd_fetchspec_notify(THIS);
+        }
+        goto out;
+    }
+
+manager:
+    /*
+     * Reached either because bitd should be stopped or because the bitd
+     * volfile's topology has been changed. In the latter case the bitd
+     * server needs to be RESTARTED to ACT on the changed volfile.
+     */
+    ret = priv->bitd_svc.manager(&(priv->bitd_svc), NULL, PROC_START_NO_WAIT);
+
+out:
+    gf_msg_debug(this ? this->name : "glusterd", 0, "Returning %d", ret);
+    return ret;
+}
diff --git a/xlators/mgmt/glusterd/src/glusterd-bitd-svc.h b/xlators/mgmt/glusterd/src/glusterd-bitd-svc.h
new file mode 100644
index 00000000000..1bff084a9a8
--- /dev/null
+++ b/xlators/mgmt/glusterd/src/glusterd-bitd-svc.h
@@ -0,0 +1,40 @@
+/*
+  Copyright (c) 2006-2012 Red Hat, Inc. <http://www.redhat.com>
+  This file is part of GlusterFS.
+
+  This file is licensed to you under your choice of the GNU Lesser
+  General Public License, version 3 or any later version (LGPLv3 or
+  later), or the GNU General Public License, version 2 (GPLv2), in all
+  cases as published by the Free Software Foundation.
+*/
+
+#ifndef _GLUSTERD_BITD_SVC_H_
+#define _GLUSTERD_BITD_SVC_H_
+
+#include "glusterd-svc-mgmt.h"
+
+/* Service name handed to the generic svc helpers (e.g. glusterd_svc_init()). */
+#define bitd_svc_name "bitd"
+
+/* Install the bitd manager/start/stop callbacks on @svc. */
+void
+glusterd_bitdsvc_build(glusterd_svc_t *svc);
+
+/* Initialise @svc as the bitd service. */
+int
+glusterd_bitdsvc_init(glusterd_svc_t *svc);
+
+/* Manager callback: decides whether bitd must be stopped or (re)started. */
+int
+glusterd_bitdsvc_manager(glusterd_svc_t *svc, void *data, int flags);
+
+/* Start callback for the bitd service. */
+int
+glusterd_bitdsvc_start(glusterd_svc_t *svc, int flags);
+
+/* Stop callback for the bitd service; @sig is the signal to use. */
+int
+glusterd_bitdsvc_stop(glusterd_svc_t *svc, int sig);
+
+/* Re-evaluate the bitd service after a configuration change. */
+int
+glusterd_bitdsvc_reconfigure();
+
+/* NOTE(review): no definition of this function is visible in
+ * glusterd-bitd-svc.c -- confirm it is still provided/needed. */
+void
+glusterd_bitdsvc_build_volfile_path(char *server, char *workdir, char *volfile,
+                                    size_t len);
+
+#endif
diff --git a/xlators/mgmt/glusterd/src/glusterd-bitrot.c b/xlators/mgmt/glusterd/src/glusterd-bitrot.c
new file mode 100644
index 00000000000..37429fe9214
--- /dev/null
+++ b/xlators/mgmt/glusterd/src/glusterd-bitrot.c
@@ -0,0 +1,822 @@
+/*
+   Copyright (c) 2011-2012 Red Hat, Inc. <http://www.redhat.com>
+   This file is part of GlusterFS.
+
+   This file is licensed to you under your choice of the GNU Lesser
+   General Public License, version 3 or any later version (LGPLv3 or
+   later), or the GNU General Public License, version 2 (GPLv2), in all
+   cases as published by the Free Software Foundation.
+ */
+
+#include <glusterfs/common-utils.h>
+#include "cli1-xdr.h"
+#include "xdr-generic.h"
+#include "glusterd.h"
+#include "glusterd-op-sm.h"
+#include "glusterd-store.h"
+#include "glusterd-utils.h"
+#include "glusterd-volgen.h"
+#include <glusterfs/run.h>
+#include <glusterfs/syscall.h>
+#include <glusterfs/byte-order.h>
+#include <glusterfs/compat-errno.h>
+#include "glusterd-scrub-svc.h"
+#include "glusterd-messages.h"
+
+#include <sys/wait.h>
+#include <dlfcn.h>
+
+/*
+ * Printable names of the bitrot sub-commands, indexed by option type.
+ * Only the GF_BITROT_OPTION_TYPE_* values listed here are populated; any
+ * other index below GF_BITROT_OPTION_TYPE_MAX is left NULL by the designated
+ * initialiser, so callers must not assume every slot holds a string.
+ */
+const char *gd_bitrot_op_list[GF_BITROT_OPTION_TYPE_MAX] = {
+    [GF_BITROT_OPTION_TYPE_NONE] = "none",
+    [GF_BITROT_OPTION_TYPE_ENABLE] = "enable",
+    [GF_BITROT_OPTION_TYPE_DISABLE] = "disable",
+    [GF_BITROT_OPTION_TYPE_SCRUB_THROTTLE] = "scrub-throttle",
+    [GF_BITROT_OPTION_TYPE_SCRUB_FREQ] = "scrub-frequency",
+    [GF_BITROT_OPTION_TYPE_SCRUB] = "scrub",
+    [GF_BITROT_OPTION_TYPE_EXPIRY_TIME] = "expiry-time",
+    [GF_BITROT_OPTION_TYPE_SIGNER_THREADS] = "signer-threads",
+};
+
+/*
+ * CLI RPC handler for bitrot requests (invoked through
+ * glusterd_big_locked_handler() by glusterd_handle_bitrot()).
+ *
+ * Decodes the request into a dictionary, reads "volname" and "type",
+ * enforces the cluster op-version required by the sub-command and then
+ * starts the matching synctask (GD_OP_SCRUB_STATUS, GD_OP_SCRUB_ONDEMAND or
+ * GD_OP_BITROT).
+ *
+ * Returns 0 when the operation was handed off. On failure an error response
+ * is sent to the CLI and the result of sending it is returned.
+ */
+int
+__glusterd_handle_bitrot(rpcsvc_request_t *req)
+{
+    int32_t ret = -1;
+    gf_cli_req cli_req = {{
+        0,
+    }};
+    dict_t *dict = NULL;
+    glusterd_op_t cli_op = GD_OP_BITROT;
+    char *volname = NULL;
+    char *scrub = NULL;
+    const char *op_name = "unknown";
+    int32_t type = 0;
+    char msg[256] = {
+        0,
+    };
+    xlator_t *this = NULL;
+    glusterd_conf_t *conf = NULL;
+
+    GF_ASSERT(req);
+
+    this = THIS;
+    GF_ASSERT(this);
+
+    conf = this->private;
+    GF_ASSERT(conf);
+
+    ret = xdr_to_generic(req->msg[0], &cli_req, (xdrproc_t)xdr_gf_cli_req);
+    if (ret < 0) {
+        req->rpc_err = GARBAGE_ARGS;
+        goto out;
+    }
+
+    if (cli_req.dict.dict_len) {
+        /* Unserialize the dictionary */
+        dict = dict_new();
+
+        ret = dict_unserialize(cli_req.dict.dict_val, cli_req.dict.dict_len,
+                               &dict);
+        if (ret < 0) {
+            gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_DICT_UNSERIALIZE_FAIL,
+                   "failed to "
+                   "unserialize req-buffer to dictionary");
+            snprintf(msg, sizeof(msg),
+                     "Unable to decode the "
+                     "command");
+            goto out;
+        } else {
+            dict->extra_stdfree = cli_req.dict.dict_val;
+        }
+    }
+
+    ret = dict_get_str(dict, "volname", &volname);
+    if (ret) {
+        snprintf(msg, sizeof(msg), "Unable to get volume name");
+        gf_msg(this->name, GF_LOG_ERROR, errno, GD_MSG_DICT_GET_FAILED,
+               "Unable to get volume name, "
+               "while handling bitrot command");
+        goto out;
+    }
+
+    ret = dict_get_int32(dict, "type", &type);
+    if (ret) {
+        snprintf(msg, sizeof(msg), "Unable to get type of command");
+        gf_msg(this->name, GF_LOG_ERROR, errno, GD_MSG_DICT_GET_FAILED,
+               "Unable to get type of cmd, "
+               "while handling bitrot command");
+        goto out;
+    }
+
+    if (conf->op_version < GD_OP_VERSION_3_7_0) {
+        /* "type" comes straight from the request: only index the name
+         * table when it is in range and the slot is populated, otherwise
+         * keep the "unknown" placeholder. */
+        if (type >= 0 && type < GF_BITROT_OPTION_TYPE_MAX &&
+            gd_bitrot_op_list[type])
+            op_name = gd_bitrot_op_list[type];
+
+        snprintf(msg, sizeof(msg),
+                 "Cannot execute command. The "
+                 "cluster is operating at version %d. Bitrot command "
+                 "%s is unavailable in this version",
+                 conf->op_version, op_name);
+        ret = -1;
+        goto out;
+    }
+
+    if (type == GF_BITROT_CMD_SCRUB_STATUS) {
+        /* Backward compatibility handling for scrub status command*/
+        if (conf->op_version < GD_OP_VERSION_3_7_7) {
+            snprintf(msg, sizeof(msg),
+                     "Cannot execute command. "
+                     "The cluster is operating at version %d. "
+                     "Bitrot scrub status command unavailable in "
+                     "this version",
+                     conf->op_version);
+            ret = -1;
+            goto out;
+        }
+
+        ret = dict_get_str(dict, "scrub-value", &scrub);
+        if (ret) {
+            gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_DICT_GET_FAILED,
+                   "Failed to get scrub value.");
+            ret = -1;
+            goto out;
+        }
+
+        if (!strncmp(scrub, "status", SLEN("status"))) {
+            ret = glusterd_op_begin_synctask(req, GD_OP_SCRUB_STATUS, dict);
+            goto out;
+        }
+    }
+
+    if (type == GF_BITROT_CMD_SCRUB_ONDEMAND) {
+        /* Backward compatibility handling for scrub ondemand command */
+        if (conf->op_version < GD_OP_VERSION_3_9_0) {
+            snprintf(msg, sizeof(msg),
+                     "Cannot execute command. "
+                     "The cluster is operating at version %d. "
+                     "Bitrot scrub ondemand command unavailable in "
+                     "this version",
+                     conf->op_version);
+            ret = -1;
+            goto out;
+        }
+
+        ret = dict_get_str(dict, "scrub-value", &scrub);
+        if (ret) {
+            gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_DICT_GET_FAILED,
+                   "Failed to get scrub value.");
+            ret = -1;
+            goto out;
+        }
+
+        if (!strncmp(scrub, "ondemand", SLEN("ondemand"))) {
+            ret = glusterd_op_begin_synctask(req, GD_OP_SCRUB_ONDEMAND, dict);
+            goto out;
+        }
+    }
+
+    /* Every other sub-command goes through the generic bitrot operation. */
+    ret = glusterd_op_begin_synctask(req, GD_OP_BITROT, dict);
+
+out:
+    if (ret) {
+        /* Fall back to a generic message when no specific one was set. */
+        if (msg[0] == '\0')
+            snprintf(msg, sizeof(msg), "Bitrot operation failed");
+        ret = glusterd_op_send_cli_response(cli_op, ret, 0, req, dict, msg);
+    }
+
+    return ret;
+}
+
+/*
+ * RPC entry point for bitrot requests: runs __glusterd_handle_bitrot()
+ * through glusterd_big_locked_handler() and returns its result.
+ */
+int
+glusterd_handle_bitrot(rpcsvc_request_t *req)
+{
+    return glusterd_big_locked_handler(req, __glusterd_handle_bitrot);
+}
+
+/*
+ * Store the "scrub-throttle-value" found in @dict under @key in the volume's
+ * option dictionary and reconfigure the scrub service.
+ *
+ * @op_errstr is not written by this function.
+ * Returns 0 on success, otherwise the status of the step that failed.
+ */
+static int
+glusterd_bitrot_scrub_throttle(glusterd_volinfo_t *volinfo, dict_t *dict,
+                               char *key, char **op_errstr)
+{
+    xlator_t *this = THIS;
+    char *requested = NULL;
+    char *copy = NULL;
+    int32_t ret = -1;
+
+    GF_ASSERT(this);
+
+    ret = dict_get_str(dict, "scrub-throttle-value", &requested);
+    if (ret) {
+        gf_msg(this->name, GF_LOG_ERROR, errno, GD_MSG_DICT_GET_FAILED,
+               "Unable to fetch scrub-throttle value");
+        return ret;
+    }
+
+    /* The volume dictionary takes a private copy of the value. */
+    copy = gf_strdup(requested);
+    ret = dict_set_dynstr(volinfo->dict, key, copy);
+    if (ret) {
+        gf_msg(this->name, GF_LOG_ERROR, errno, GD_MSG_DICT_SET_FAILED,
+               "Failed to set option %s", key);
+        return ret;
+    }
+
+    ret = glusterd_scrubsvc_reconfigure();
+    if (ret)
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_SCRUBSVC_RECONF_FAIL,
+               "Failed to reconfigure scrub services");
+
+    return ret;
+}
+
+/*
+ * Store the "scrub-frequency-value" found in @dict under @key in the
+ * volume's option dictionary and reconfigure the scrub service.
+ *
+ * @op_errstr is not written by this function.
+ * Returns 0 on success, otherwise the status of the step that failed.
+ */
+static int
+glusterd_bitrot_scrub_freq(glusterd_volinfo_t *volinfo, dict_t *dict, char *key,
+                           char **op_errstr)
+{
+    xlator_t *this = THIS;
+    char *requested = NULL;
+    char *copy = NULL;
+    int32_t ret = -1;
+
+    GF_ASSERT(this);
+
+    ret = dict_get_str(dict, "scrub-frequency-value", &requested);
+    if (ret) {
+        gf_msg(this->name, GF_LOG_ERROR, errno, GD_MSG_DICT_GET_FAILED,
+               "Unable to fetch scrub-freq value");
+        return ret;
+    }
+
+    /* The volume dictionary takes a private copy of the value. */
+    copy = gf_strdup(requested);
+    ret = dict_set_dynstr(volinfo->dict, key, copy);
+    if (ret) {
+        gf_msg(this->name, GF_LOG_ERROR, errno, GD_MSG_DICT_SET_FAILED,
+               "Failed to set option %s", key);
+        return ret;
+    }
+
+    ret = glusterd_scrubsvc_reconfigure();
+    if (ret)
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_SCRUBSVC_RECONF_FAIL,
+               "Failed to reconfigure scrub services");
+
+    return ret;
+}
+
+/*
+ * Apply a scrub state change to the volume.
+ *
+ * Reads "scrub-value" from @dict, stores it under @key in the volume's
+ * option dictionary ("resume" is stored as "Active", anything else
+ * verbatim) and reconfigures the scrub service.
+ *
+ * @op_errstr is not written by this function.
+ * Returns 0 on success, otherwise the status of the step that failed.
+ */
+static int
+glusterd_bitrot_scrub(glusterd_volinfo_t *volinfo, dict_t *dict, char *key,
+                      char **op_errstr)
+{
+    int32_t ret = -1;
+    char *scrub_value = NULL;
+    xlator_t *this = NULL;
+    char *option = NULL;
+
+    this = THIS;
+    GF_ASSERT(this);
+
+    ret = dict_get_str(dict, "scrub-value", &scrub_value);
+    if (ret) {
+        /* The two literals used to concatenate to "scrubvalue". */
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_DICT_GET_FAILED,
+               "Unable to fetch scrub "
+               "value");
+        goto out;
+    }
+
+    /* A resumed scrubber is recorded as "Active" in the volume options. */
+    if (!strcmp(scrub_value, "resume")) {
+        option = gf_strdup("Active");
+    } else {
+        option = gf_strdup(scrub_value);
+    }
+
+    ret = dict_set_dynstr(volinfo->dict, key, option);
+    if (ret) {
+        gf_msg(this->name, GF_LOG_ERROR, errno, GD_MSG_DICT_SET_FAILED,
+               "Failed to set option %s", key);
+        goto out;
+    }
+
+    ret = glusterd_scrubsvc_reconfigure();
+    if (ret) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_SCRUBSVC_RECONF_FAIL,
+               "Failed to reconfigure scrub "
+               "services");
+        goto out;
+    }
+
+out:
+    return ret;
+}
+
+/*
+ * Store the bitrot "expiry-time" found in @dict, rendered as a decimal
+ * string, under @key in the volume's option dictionary and reconfigure the
+ * bitd service.
+ *
+ * @op_errstr is not written by this function.
+ * Returns 0 on success, otherwise the status of the step that failed.
+ */
+static int
+glusterd_bitrot_expiry_time(glusterd_volinfo_t *volinfo, dict_t *dict,
+                            char *key, char **op_errstr)
+{
+    int32_t ret = -1;
+    uint32_t expiry_time = 0;
+    xlator_t *this = NULL;
+    char dkey[32] = {
+        0,
+    };
+
+    this = THIS;
+    GF_ASSERT(this);
+
+    ret = dict_get_uint32(dict, "expiry-time", &expiry_time);
+    if (ret) {
+        gf_msg(this->name, GF_LOG_ERROR, errno, GD_MSG_DICT_GET_FAILED,
+               "Unable to get bitrot expiry"
+               " timer value.");
+        goto out;
+    }
+
+    /* The value is unsigned: "%d" would store values above INT_MAX as
+     * negative numbers. */
+    snprintf(dkey, sizeof(dkey), "%u", expiry_time);
+
+    ret = dict_set_dynstr_with_alloc(volinfo->dict, key, dkey);
+    if (ret) {
+        gf_msg(this->name, GF_LOG_ERROR, errno, GD_MSG_DICT_SET_FAILED,
+               "Failed to set option %s", key);
+        goto out;
+    }
+
+    ret = glusterd_bitdsvc_reconfigure();
+    if (ret) {
+        /* The two literals used to concatenate to "bitrotservices". */
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_BITDSVC_RECONF_FAIL,
+               "Failed to reconfigure bitrot "
+               "services");
+        goto out;
+    }
+out:
+    return ret;
+}
+
+/*
+ * Tell whether reconfiguring bitd for @volinfo would be a no-op on this
+ * node.
+ *
+ * Returns _gf_false only when bitrot is enabled on the volume, the volume is
+ * started and at least one of its bricks is local; _gf_true otherwise.
+ */
+static gf_boolean_t
+is_bitd_configure_noop(xlator_t *this, glusterd_volinfo_t *volinfo)
+{
+    glusterd_brickinfo_t *brick = NULL;
+
+    if (!glusterd_is_bitrot_enabled(volinfo))
+        return _gf_true;
+
+    if (volinfo->status != GLUSTERD_STATUS_STARTED)
+        return _gf_true;
+
+    /* The first local brick is enough to make the reconfigure meaningful. */
+    cds_list_for_each_entry(brick, &volinfo->bricks, brick_list)
+    {
+        if (glusterd_is_local_brick(this, volinfo, brick))
+            return _gf_false;
+    }
+
+    return _gf_true;
+}
+
+/*
+ * Update the number of bitrot signer threads for the volume.
+ *
+ * Reads "signer-threads" from @dict. If the volume already stores the same
+ * value under @key nothing else is done. Otherwise the new value is stored
+ * as a decimal string and, unless is_bitd_configure_noop() says there is
+ * nothing to do on this node, the bitd service manager is invoked with
+ * PROC_START_NO_WAIT.
+ *
+ * @op_errstr is not written by this function.
+ * Returns 0 on success, non-zero otherwise.
+ */
+static int
+glusterd_bitrot_signer_threads(glusterd_volinfo_t *volinfo, dict_t *dict,
+                               char *key, char **op_errstr)
+{
+    int32_t ret = -1;
+    uint32_t signer_th_count = 0;
+    uint32_t existing_th_count = 0;
+    xlator_t *this = NULL;
+    glusterd_conf_t *priv = NULL;
+    char dkey[32] = {
+        0,
+    };
+
+    this = THIS;
+    GF_ASSERT(this);
+
+    priv = this->private;
+    GF_VALIDATE_OR_GOTO(this->name, priv, out);
+
+    ret = dict_get_uint32(dict, "signer-threads", &signer_th_count);
+    if (ret) {
+        gf_msg(this->name, GF_LOG_ERROR, errno, GD_MSG_DICT_GET_FAILED,
+               "Unable to get bitrot signer thread count.");
+        goto out;
+    }
+
+    /* Unchanged value: leave with ret == 0 without touching the service. */
+    ret = dict_get_uint32(volinfo->dict, key, &existing_th_count);
+    if (ret == 0 && signer_th_count == existing_th_count) {
+        goto out;
+    }
+
+    /* The count is unsigned: "%d" would store values above INT_MAX as
+     * negative numbers. */
+    snprintf(dkey, sizeof(dkey), "%u", signer_th_count);
+    ret = dict_set_dynstr_with_alloc(volinfo->dict, key, dkey);
+    if (ret) {
+        gf_msg(this->name, GF_LOG_ERROR, errno, GD_MSG_DICT_SET_FAILED,
+               "Failed to set option %s", key);
+        goto out;
+    }
+
+    if (!is_bitd_configure_noop(this, volinfo)) {
+        ret = priv->bitd_svc.manager(&(priv->bitd_svc), NULL,
+                                     PROC_START_NO_WAIT);
+        if (ret) {
+            gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_BITDSVC_RECONF_FAIL,
+                   "Failed to reconfigure bitrot services");
+            goto out;
+        }
+    }
+out:
+    return ret;
+}
+
+/*
+ * Mark bitrot as enabled in the volume's option dictionary.
+ *
+ * Fails when the volume is not started or bitrot is already enabled.
+ * Otherwise sets VKEY_FEATURES_BITROT to "on" and "features.scrub" to
+ * "Active".
+ *
+ * Returns 0 on success, non-zero otherwise. On failure *op_errstr holds an
+ * allocated message (a generic one is generated when no specific message
+ * was set).
+ */
+static int
+glusterd_bitrot_enable(glusterd_volinfo_t *volinfo, char **op_errstr)
+{
+    int32_t ret = -1;
+    xlator_t *this = NULL;
+
+    this = THIS;
+    GF_ASSERT(this);
+
+    GF_VALIDATE_OR_GOTO(this->name, volinfo, out);
+    GF_VALIDATE_OR_GOTO(this->name, op_errstr, out);
+
+    if (glusterd_is_volume_started(volinfo) == 0) {
+        *op_errstr = gf_strdup(
+            "Volume is stopped, start volume "
+            "to enable bitrot.");
+        ret = -1;
+        goto out;
+    }
+
+    ret = glusterd_is_bitrot_enabled(volinfo);
+    if (ret) {
+        *op_errstr = gf_strdup("Bitrot is already enabled");
+        ret = -1;
+        goto out;
+    }
+
+    ret = dict_set_dynstr_with_alloc(volinfo->dict, VKEY_FEATURES_BITROT, "on");
+    if (ret) {
+        gf_msg(this->name, GF_LOG_ERROR, errno, GD_MSG_DICT_SET_FAILED,
+               "dict set failed");
+        goto out;
+    }
+
+    /* Once bitrot is enabled the scrubber should be in Active state */
+    ret = dict_set_dynstr_with_alloc(volinfo->dict, "features.scrub", "Active");
+    if (ret) {
+        gf_msg(this->name, GF_LOG_ERROR, errno, GD_MSG_DICT_SET_FAILED,
+               "Failed to set option "
+               "features.scrub value");
+        goto out;
+    }
+
+    ret = 0;
+out:
+    /* This label is also reached when volinfo failed the NULL validation
+     * above, so it must not be dereferenced unconditionally here. */
+    if (ret && op_errstr && !*op_errstr)
+        gf_asprintf(op_errstr,
+                    "Enabling bitrot on volume %s has been "
+                    "unsuccessful",
+                    volinfo ? volinfo->volname : "(null)");
+    return ret;
+}
+
+/*
+ * Mark bitrot as disabled in the volume's option dictionary: sets
+ * VKEY_FEATURES_BITROT to "off" and "features.scrub" to "Inactive".
+ *
+ * Returns 0 on success, non-zero otherwise. On failure a generic message is
+ * stored in *op_errstr when none was set.
+ */
+static int
+glusterd_bitrot_disable(glusterd_volinfo_t *volinfo, char **op_errstr)
+{
+    int32_t ret = -1;
+    xlator_t *this = NULL;
+
+    this = THIS;
+    GF_VALIDATE_OR_GOTO("glusterd", this, out);
+
+    GF_VALIDATE_OR_GOTO(this->name, volinfo, out);
+    GF_VALIDATE_OR_GOTO(this->name, op_errstr, out);
+
+    ret = dict_set_dynstr_with_alloc(volinfo->dict, VKEY_FEATURES_BITROT,
+                                     "off");
+    if (ret) {
+        gf_msg(this->name, GF_LOG_ERROR, errno, GD_MSG_DICT_SET_FAILED,
+               "dict set failed");
+        goto out;
+    }
+
+    /* Once bitrot is disabled the scrubber should be in Inactive state */
+    ret = dict_set_dynstr_with_alloc(volinfo->dict, "features.scrub",
+                                     "Inactive");
+    if (ret) {
+        gf_msg(this->name, GF_LOG_ERROR, errno, GD_MSG_DICT_SET_FAILED,
+               "Failed to set "
+               "features.scrub value");
+        goto out;
+    }
+
+    ret = 0;
+out:
+    /* This label is also reached when volinfo failed the NULL validation
+     * above, so it must not be dereferenced unconditionally here. */
+    if (ret && op_errstr && !*op_errstr)
+        gf_asprintf(op_errstr,
+                    "Disabling bitrot on volume %s has "
+                    "been unsuccessful",
+                    volinfo ? volinfo->volname : "(null)");
+    return ret;
+}
+
+/*
+ * Decide whether the bitrot/scrubber daemon is unneeded on this node.
+ *
+ * Returns _gf_false as soon as a started volume with bitrot enabled is found
+ * that has a local brick; returns _gf_true when no volume qualifies.
+ */
+gf_boolean_t
+glusterd_should_i_stop_bitd()
+{
+    glusterd_conf_t *conf = THIS->private;
+    xlator_t *this = THIS;
+    glusterd_volinfo_t *vol = NULL;
+    glusterd_brickinfo_t *brick = NULL;
+
+    GF_ASSERT(this);
+
+    cds_list_for_each_entry(vol, &conf->volumes, vol_list)
+    {
+        /* Only started volumes with bitrot enabled are of interest. */
+        if (!glusterd_is_bitrot_enabled(vol) ||
+            vol->status != GLUSTERD_STATUS_STARTED)
+            continue;
+
+        /* No local brick here: keep looking at the remaining volumes
+         * before concluding that the daemon can be stopped. */
+        cds_list_for_each_entry(brick, &vol->bricks, brick_list)
+        {
+            if (glusterd_is_local_brick(this, vol, brick))
+                return _gf_false;
+        }
+    }
+
+    return _gf_true;
+}
+
+/*
+ * Run the bitd and scrub service managers for enable/disable opcodes.
+ *
+ * For GF_BITROT_OPTION_TYPE_ENABLE and GF_BITROT_OPTION_TYPE_DISABLE the
+ * bitd manager is invoked first and, if it succeeds, the scrub manager;
+ * both with PROC_START_NO_WAIT. Every other opcode is a no-op.
+ *
+ * Returns 0 on success or for a no-op, otherwise the failing manager's
+ * status.
+ */
+static int
+glusterd_manage_bitrot(int opcode)
+{
+    xlator_t *this = THIS;
+    glusterd_conf_t *priv = NULL;
+    int ret = -1;
+
+    GF_ASSERT(this);
+
+    priv = this->private;
+    GF_ASSERT(priv);
+
+    if (opcode != GF_BITROT_OPTION_TYPE_ENABLE &&
+        opcode != GF_BITROT_OPTION_TYPE_DISABLE)
+        return 0;
+
+    ret = priv->bitd_svc.manager(&(priv->bitd_svc), NULL, PROC_START_NO_WAIT);
+    if (ret)
+        return ret;
+
+    return priv->scrub_svc.manager(&(priv->scrub_svc), NULL,
+                                   PROC_START_NO_WAIT);
+}
+
+/*
+ * Commit handler of the bitrot operation.
+ *
+ * Reads "volname" and "type" from @dict, applies the requested sub-command
+ * to the volume's options, runs glusterd_manage_bitrot() for that type,
+ * regenerates the volfiles (notifying the services) and stores the volinfo.
+ *
+ * @rsp_dict is not used by this function.
+ * *op_errstr may be set on failure (unknown volume, invalid opcode, or by
+ * the enable/disable helpers).
+ * Returns 0 on success, non-zero otherwise.
+ */
+int
+glusterd_op_bitrot(dict_t *dict, char **op_errstr, dict_t *rsp_dict)
+{
+    glusterd_volinfo_t *volinfo = NULL;
+    int32_t ret = -1;
+    char *volname = NULL;
+    int type = -1;
+    glusterd_conf_t *priv = NULL;
+    xlator_t *this = NULL;
+
+    GF_ASSERT(dict);
+    GF_ASSERT(op_errstr);
+
+    this = THIS;
+    GF_ASSERT(this);
+    priv = this->private;
+    GF_ASSERT(priv);
+
+    ret = dict_get_str(dict, "volname", &volname);
+    if (ret) {
+        gf_msg(this->name, GF_LOG_ERROR, errno, GD_MSG_DICT_GET_FAILED,
+               "Unable to get volume name");
+        goto out;
+    }
+
+    ret = glusterd_volinfo_find(volname, &volinfo);
+    if (ret) {
+        gf_asprintf(op_errstr, FMTSTR_CHECK_VOL_EXISTS, volname);
+        goto out;
+    }
+
+    ret = dict_get_int32(dict, "type", &type);
+    if (ret) {
+        gf_msg(this->name, GF_LOG_ERROR, errno, GD_MSG_DICT_GET_FAILED,
+               "Unable to get type from "
+               "dict");
+        goto out;
+    }
+
+    /* Dispatch on the sub-command; each helper updates volinfo->dict. */
+    switch (type) {
+        case GF_BITROT_OPTION_TYPE_ENABLE:
+            ret = glusterd_bitrot_enable(volinfo, op_errstr);
+            if (ret < 0)
+                goto out;
+            break;
+
+        case GF_BITROT_OPTION_TYPE_DISABLE:
+            ret = glusterd_bitrot_disable(volinfo, op_errstr);
+            if (ret < 0)
+                goto out;
+
+            break;
+
+        case GF_BITROT_OPTION_TYPE_SCRUB_THROTTLE:
+            ret = glusterd_bitrot_scrub_throttle(
+                volinfo, dict, "features.scrub-throttle", op_errstr);
+            if (ret)
+                goto out;
+            break;
+
+        case GF_BITROT_OPTION_TYPE_SCRUB_FREQ:
+            ret = glusterd_bitrot_scrub_freq(volinfo, dict,
+                                             "features.scrub-freq", op_errstr);
+            if (ret)
+                goto out;
+            break;
+
+        case GF_BITROT_OPTION_TYPE_SCRUB:
+            ret = glusterd_bitrot_scrub(volinfo, dict, "features.scrub",
+                                        op_errstr);
+            if (ret)
+                goto out;
+            break;
+
+        case GF_BITROT_OPTION_TYPE_EXPIRY_TIME:
+            ret = glusterd_bitrot_expiry_time(
+                volinfo, dict, "features.expiry-time", op_errstr);
+            if (ret)
+                goto out;
+            break;
+
+        case GF_BITROT_OPTION_TYPE_SIGNER_THREADS:
+            ret = glusterd_bitrot_signer_threads(
+                volinfo, dict, "features.signer-threads", op_errstr);
+            if (ret)
+                goto out;
+            break;
+
+        /* These two change no volume option here; they only fall through
+         * to the common steps below. */
+        case GF_BITROT_CMD_SCRUB_STATUS:
+        case GF_BITROT_CMD_SCRUB_ONDEMAND:
+            break;
+
+        default:
+            gf_asprintf(op_errstr,
+                        "Bitrot command failed. Invalid "
+                        "opcode");
+            ret = -1;
+            goto out;
+    }
+
+    /* Common tail: manage the daemons, regenerate volfiles, persist. */
+    ret = glusterd_manage_bitrot(type);
+    if (ret)
+        goto out;
+
+    ret = glusterd_create_volfiles_and_notify_services(volinfo);
+    if (ret) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_VOLFILE_CREATE_FAIL,
+               "Unable to re-create "
+               "volfiles");
+        ret = -1;
+        goto out;
+    }
+
+    ret = glusterd_store_volinfo(volinfo, GLUSTERD_VOLINFO_VER_AC_INCREMENT);
+    if (ret) {
+        gf_msg_debug(this->name, 0,
+                     "Failed to store volinfo for "
+                     "bitrot");
+        goto out;
+    }
+
+out:
+    return ret;
+}
+
+/*
+ * Staging (validation) handler of the bitrot operation.
+ *
+ * Fails when:
+ *  - "volname" or "type" is missing from @dict,
+ *  - the volume does not exist or is not started,
+ *  - bitrot is not enabled on the volume and the type is anything other
+ *    than GF_BITROT_OPTION_TYPE_ENABLE,
+ *  - for GF_BITROT_OPTION_TYPE_SCRUB: the requested "scrub-value" equals the
+ *    stored "features.scrub" value, or "resume" is requested while the
+ *    stored value is "Active".
+ *
+ * @rsp_dict is not used by this function.
+ * Most failure paths store an allocated message in *op_errstr, which is
+ * then logged. Returns 0 when the request may proceed, non-zero otherwise.
+ */
+int
+glusterd_op_stage_bitrot(dict_t *dict, char **op_errstr, dict_t *rsp_dict)
+{
+    int ret = 0;
+    char *volname = NULL;
+    char *scrub_cmd = NULL;
+    char *scrub_cmd_from_dict = NULL;
+    char msg[2048] = {
+        0,
+    };
+    int type = 0;
+    xlator_t *this = NULL;
+    glusterd_conf_t *priv = NULL;
+    glusterd_volinfo_t *volinfo = NULL;
+
+    this = THIS;
+    GF_ASSERT(this);
+    priv = this->private;
+    GF_ASSERT(priv);
+
+    GF_ASSERT(dict);
+    GF_ASSERT(op_errstr);
+
+    ret = dict_get_str(dict, "volname", &volname);
+    if (ret) {
+        gf_msg(this->name, GF_LOG_ERROR, errno, GD_MSG_DICT_GET_FAILED,
+               "Unable to get volume name");
+        goto out;
+    }
+
+    ret = glusterd_volinfo_find(volname, &volinfo);
+    if (ret) {
+        gf_asprintf(op_errstr, FMTSTR_CHECK_VOL_EXISTS, volname);
+        goto out;
+    }
+
+    if (!glusterd_is_volume_started(volinfo)) {
+        *op_errstr = gf_strdup(
+            "Volume is stopped, start volume "
+            "before executing bit rot command.");
+        ret = -1;
+        goto out;
+    }
+
+    ret = dict_get_int32(dict, "type", &type);
+    if (ret) {
+        gf_msg(this->name, GF_LOG_ERROR, errno, GD_MSG_DICT_GET_FAILED,
+               "Unable to get type for "
+               "operation");
+
+        *op_errstr = gf_strdup(
+            "Staging stage failed for bitrot "
+            "operation.");
+        goto out;
+    }
+
+    /* Everything except "enable" requires bitrot to be enabled already. */
+    if ((GF_BITROT_OPTION_TYPE_ENABLE != type) &&
+        (glusterd_is_bitrot_enabled(volinfo) == 0)) {
+        ret = -1;
+        gf_asprintf(op_errstr, "Bitrot is not enabled on volume %s", volname);
+        goto out;
+    }
+
+    if ((GF_BITROT_OPTION_TYPE_SCRUB == type)) {
+        /* The comparison is only possible when the volume already stores
+         * a "features.scrub" value; otherwise the request is accepted. */
+        ret = dict_get_str(volinfo->dict, "features.scrub",
+                           &scrub_cmd_from_dict);
+        if (!ret) {
+            ret = dict_get_str(dict, "scrub-value", &scrub_cmd);
+            if (ret) {
+                gf_msg(this->name, GF_LOG_ERROR, errno, GD_MSG_DICT_GET_FAILED,
+                       "Unable to "
+                       "get scrub-value");
+                *op_errstr = gf_strdup(
+                    "Staging failed for "
+                    "bitrot operation. "
+                    "Please check log file"
+                    " for more details.");
+                goto out;
+            }
+            /* If scrubber is resume then value of scrubber will be
+             * "Active" in the dictionary. */
+            if (!strcmp(scrub_cmd_from_dict, scrub_cmd) ||
+                (!strncmp("Active", scrub_cmd_from_dict, SLEN("Active")) &&
+                 !strncmp("resume", scrub_cmd, SLEN("resume")))) {
+                snprintf(msg, sizeof(msg),
+                         "Scrub is already"
+                         " %sd for volume %s",
+                         scrub_cmd, volinfo->volname);
+                *op_errstr = gf_strdup(msg);
+                ret = -1;
+                goto out;
+            }
+        }
+        ret = 0;
+    }
+
+out:
+    if (ret && op_errstr && *op_errstr)
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_OP_STAGE_BITROT_FAIL, "%s",
+               *op_errstr);
+    gf_msg_debug(this->name, 0, "Returning %d", ret);
+
+    return ret;
+}
diff --git a/xlators/mgmt/glusterd/src/glusterd-brick-ops.c b/xlators/mgmt/glusterd/src/glusterd-brick-ops.c
new file mode 100644
index 00000000000..e56cd0e6c74
--- /dev/null
+++ b/xlators/mgmt/glusterd/src/glusterd-brick-ops.c
@@ -0,0 +1,2796 @@
+/*
+   Copyright (c) 2011-2012 Red Hat, Inc. <http://www.redhat.com>
+   This file is part of GlusterFS.
+
+   This file is licensed to you under your choice of the GNU Lesser
+   General Public License, version 3 or any later version (LGPLv3 or
+   later), or the GNU General Public License, version 2 (GPLv2), in all
+   cases as published by the Free Software Foundation.
+*/
+#include <glusterfs/common-utils.h>
+#include "cli1-xdr.h"
+#include "xdr-generic.h"
+#include "glusterd.h"
+#include "glusterd-op-sm.h"
+#include "glusterd-geo-rep.h"
+#include "glusterd-store.h"
+#include "glusterd-mgmt.h"
+#include "glusterd-utils.h"
+#include "glusterd-volgen.h"
+#include "glusterd-svc-helper.h"
+#include "glusterd-messages.h"
+#include "glusterd-server-quorum.h"
+#include <glusterfs/run.h>
+#include <glusterfs/syscall.h>
+#include <sys/signal.h>
+
+/* misc */
+
+/* Link 'brickinfo' into volinfo->bricks at the position dictated by the
+   requested stripe/replica layout.
+
+   brickinfo   - the brick being added
+   volinfo     - the volume whose brick list is updated
+   count       - how many of the given bricks are already added
+   stripe_cnt  - requested stripe count; non-zero selects the stripe case
+   replica_cnt - requested replica count, used when stripe_cnt is 0
+
+   The function always returns 0. */
+int
+add_brick_at_right_order(glusterd_brickinfo_t *brickinfo,
+                         glusterd_volinfo_t *volinfo, int count,
+                         int32_t stripe_cnt, int32_t replica_cnt)
+{
+    int target = 0;
+    int pos = 0;
+    int group_size = 0;
+    int group_growth = 0;
+    glusterd_brickinfo_t *cur = NULL;
+
+    /* Both cases below share one formula,
+         target = ((count / group_growth) * group_size) +
+                  (count + group_size);
+
+       and differ only in what a "group" is. They are kept as separate
+       branches to keep the code readable. */
+    if (stripe_cnt) {
+        /* 'stripe_count' is set: a group is the current distribute leaf,
+           growing to (stripe_cnt * replica_count) bricks */
+        group_size = volinfo->dist_leaf_count;
+        group_growth = (stripe_cnt * volinfo->replica_count) - group_size;
+    } else {
+        /* 'replica_count' is set: a group is the current replica set,
+           growing to replica_cnt bricks */
+        group_size = volinfo->replica_count;
+        group_growth = replica_cnt - group_size;
+    }
+
+    /* NOTE(review): group_growth is assumed to be non-zero, i.e. the
+       request really enlarges each group - TODO confirm against callers. */
+    target = ((count / group_growth) * group_size) + (count + group_size);
+
+    /* Find the target-th existing entry (counting from 1) and hand it to
+       cds_list_add() as the anchor for the new brick. When the list has
+       fewer than 'target' entries the new brick is not linked at all. */
+    cds_list_for_each_entry(cur, &volinfo->bricks, brick_list)
+    {
+        pos++;
+        if (pos >= target) {
+            gf_msg_debug(THIS->name, 0, "brick:%s index=%d, count=%d",
+                         cur->path, target, count);
+
+            cds_list_add(&brickinfo->brick_list, &cur->brick_list);
+            break;
+        }
+    }
+
+    return 0;
+}
+
+/* Check a replica count supplied to add-brick against the volume layout;
+ * total_bricks is the brick count the volume would have afterwards.
+ *
+ * Returns 0 when the request is valid and changes the layout (*type is set
+ * to GF_CLUSTER_TYPE_REPLICATE for a plain distribute volume), 1 when
+ * replica_count equals the current replica count and total_bricks is a
+ * multiple of dist_leaf_count, and -1 when the request is rejected. For
+ * the volume types handled below every rejection fills err_str (err_len
+ * bytes) and logs it, so the caller never reports an empty message. */
+static int
+gd_addbr_validate_replica_count(glusterd_volinfo_t *volinfo, int replica_count,
+                                int arbiter_count, int total_bricks, int *type,
+                                char *err_str, int err_len)
+{
+    int ret = -1;
+
+    switch (volinfo->type) {
+        case GF_CLUSTER_TYPE_NONE:
+            if ((volinfo->brick_count * replica_count) != total_bricks)
+                goto bad_brick_count;
+            /* Change the volume type */
+            *type = GF_CLUSTER_TYPE_REPLICATE;
+            gf_msg(THIS->name, GF_LOG_INFO, 0, GD_MSG_VOL_TYPE_CHANGING_INFO,
+                   "Changing the type of volume %s from 'distribute' to "
+                   "'replica'", volinfo->volname);
+            ret = 0;
+            break;
+
+        case GF_CLUSTER_TYPE_REPLICATE:
+            if (replica_count < volinfo->replica_count) {
+                snprintf(err_str, err_len,
+                         "Incorrect replica count (%d) supplied. "
+                         "Volume already has (%d)",
+                         replica_count, volinfo->replica_count);
+                goto log_invalid;
+            }
+            if (replica_count == volinfo->replica_count) {
+                if (arbiter_count && !volinfo->arbiter_count) {
+                    snprintf(err_str, err_len,
+                             "Cannot convert replica 3 volume "
+                             "to arbiter volume.");
+                    goto log_invalid;
+                }
+                /* same replica count: only whole brick groups may be added */
+                if (total_bricks % volinfo->dist_leaf_count)
+                    goto bad_brick_count;
+                ret = 1;
+                break;
+            }
+
+            /* replica_count > volinfo->replica_count: make sure that before
+               and after 'add-brick' the number of subvolumes for
+               distribute remains the same */
+            if ((total_bricks * volinfo->dist_leaf_count) !=
+                (volinfo->brick_count *
+                 (replica_count * volinfo->stripe_count)))
+                goto bad_brick_count;
+
+            gf_msg(THIS->name, GF_LOG_INFO, 0,
+                   GD_MSG_REPLICA_COUNT_CHANGE_INFO,
+                   "Changing the replica count of "
+                   "volume %s from %d to %d",
+                   volinfo->volname, volinfo->replica_count, replica_count);
+            ret = 0;
+            break;
+
+        case GF_CLUSTER_TYPE_DISPERSE:
+            snprintf(err_str, err_len,
+                     "Volume %s cannot be converted from dispersed to "
+                     "replicated-dispersed",
+                     volinfo->volname);
+            gf_msg(THIS->name, GF_LOG_ERROR, EPERM, GD_MSG_OP_NOT_PERMITTED,
+                   "%s", err_str);
+            break;
+    }
+
+    return ret;
+
+bad_brick_count:
+    snprintf(err_str, err_len,
+             "Incorrect number of bricks (%d) supplied for replica count (%d).",
+             (total_bricks - volinfo->brick_count), replica_count);
+log_invalid:
+    gf_msg(THIS->name, GF_LOG_ERROR, EINVAL, GD_MSG_INVALID_ENTRY, "%s",
+           err_str);
+    return -1;
+}
+
+/* Validate the replica count given to remove-brick.
+ *
+ * Returns -1 (with the reason in err_str, which is also logged) when the
+ * request is rejected, 1 when replica_count equals the volume's current
+ * replica count, and 0 otherwise. */
+static int
+gd_rmbr_validate_replica_count(glusterd_volinfo_t *volinfo,
+                               int32_t replica_count, int32_t brick_count,
+                               char *err_str, size_t err_len)
+{
+    xlator_t *this = THIS;
+    int per_step = 0;
+
+    GF_ASSERT(this);
+
+    if ((volinfo->type == GF_CLUSTER_TYPE_NONE) ||
+        (volinfo->type == GF_CLUSTER_TYPE_DISPERSE)) {
+        snprintf(err_str, err_len,
+                 "replica count (%d) option given for non replicate "
+                 "volume %s",
+                 replica_count, volinfo->volname);
+        goto reject;
+    }
+
+    /* nothing to check for any other type that is not replicate */
+    if (volinfo->type != GF_CLUSTER_TYPE_REPLICATE)
+        return 0;
+
+    /* in remove brick, you can only reduce the replica count */
+    if (replica_count > volinfo->replica_count) {
+        snprintf(err_str, err_len,
+                 "given replica count (%d) option is more "
+                 "than volume %s's replica count (%d)",
+                 replica_count, volinfo->volname, volinfo->replica_count);
+        goto reject;
+    }
+
+    if (replica_count == volinfo->replica_count) {
+        /* The 'replica N' option on the CLI was redundant; the bricks
+           given for removal must then come in multiples of
+           'dist_leaf_count'. */
+        if (brick_count % volinfo->dist_leaf_count) {
+            snprintf(err_str, err_len,
+                     "number of bricks provided (%d) is "
+                     "not valid. need at least %d "
+                     "(or %dxN)",
+                     brick_count, volinfo->dist_leaf_count,
+                     volinfo->dist_leaf_count);
+            goto reject;
+        }
+        return 1;
+    }
+
+    /* The replica count is being lowered by (current - requested) in each
+       of the (brick_count / replica_count) replica sets; the bricks given
+       must be a multiple of that total. */
+    per_step = (volinfo->brick_count / volinfo->replica_count) *
+               (volinfo->replica_count - replica_count);
+    if (brick_count % per_step) {
+        snprintf(err_str, err_len,
+                 "need %d(xN) bricks for reducing replica "
+                 "count of the volume from %d to %d",
+                 per_step, volinfo->replica_count, replica_count);
+        goto reject;
+    }
+    return 0;
+
+reject:
+    gf_smsg(this->name, GF_LOG_WARNING, EINVAL, GD_MSG_INVALID_ARGUMENT,
+            err_str, NULL);
+    return -1;
+}
+
+/* Handler functions */
+/* Handle an add-brick request coming from the CLI.
+ *
+ * Decodes the request dictionary, checks the supplied brick count against
+ * the volume layout (including any "replica-count" given) and then starts
+ * the GD_OP_ADD_BRICK transaction. On any failure an error reply carrying
+ * err_str ("Operation failed" if nothing more specific was recorded) is
+ * sent to the CLI and 0 is returned so that no second reply is generated.
+ */
+int
+__glusterd_handle_add_brick(rpcsvc_request_t *req)
+{
+    int32_t ret = -1;
+    gf_cli_req cli_req = {{
+        0,
+    }};
+    dict_t *dict = NULL;
+    char *bricks = NULL;
+    char *volname = NULL;
+    int brick_count = 0;
+    void *cli_rsp = NULL;
+    char err_str[2048] = "";
+    gf_cli_rsp rsp = {
+        0,
+    };
+    glusterd_volinfo_t *volinfo = NULL;
+    xlator_t *this = NULL;
+    int total_bricks = 0;
+    int32_t replica_count = 0;
+    int32_t arbiter_count = 0;
+    int32_t stripe_count = 0;
+    int type = 0;
+    glusterd_conf_t *conf = NULL;
+
+    this = THIS;
+    GF_ASSERT(this);
+
+    GF_ASSERT(req);
+
+    conf = this->private;
+    GF_ASSERT(conf);
+
+    ret = xdr_to_generic(req->msg[0], &cli_req, (xdrproc_t)xdr_gf_cli_req);
+    if (ret < 0) {
+        // failed to decode msg;
+        req->rpc_err = GARBAGE_ARGS;
+        snprintf(err_str, sizeof(err_str), "Garbage args received");
+        gf_smsg(this->name, GF_LOG_ERROR, errno, GD_MSG_GARBAGE_ARGS, NULL);
+        goto out;
+    }
+
+    gf_msg(this->name, GF_LOG_INFO, 0, GD_MSG_ADD_BRICK_REQ_RECVD,
+           "Received add brick req");
+
+    if (cli_req.dict.dict_len) {
+        /* Unserialize the dictionary */
+        dict = dict_new();
+
+        ret = dict_unserialize(cli_req.dict.dict_val, cli_req.dict.dict_len,
+                               &dict);
+        if (ret < 0) {
+            gf_msg(this->name, GF_LOG_ERROR, errno,
+                   GD_MSG_DICT_UNSERIALIZE_FAIL,
+                   "failed to unserialize req-buffer to dictionary");
+            snprintf(err_str, sizeof(err_str), "Unable to decode the command");
+            goto out;
+        }
+    }
+
+    ret = dict_get_strn(dict, "volname", SLEN("volname"), &volname);
+
+    if (ret) {
+        snprintf(err_str, sizeof(err_str), "Unable to get volume name");
+        gf_msg(this->name, GF_LOG_ERROR, errno, GD_MSG_DICT_GET_FAILED, "%s",
+               err_str);
+        goto out;
+    }
+
+    ret = glusterd_volinfo_find(volname, &volinfo);
+    if (ret) {
+        snprintf(err_str, sizeof(err_str),
+                 "Unable to get volinfo for volume name %s", volname);
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_VOLINFO_GET_FAIL, "%s",
+               err_str);
+        goto out;
+    }
+
+    ret = dict_get_int32n(dict, "count", SLEN("count"), &brick_count);
+    if (ret) {
+        snprintf(err_str, sizeof(err_str), "Unable to get volume brick count");
+        gf_msg(this->name, GF_LOG_ERROR, errno, GD_MSG_DICT_GET_FAILED, "%s",
+               err_str);
+        goto out;
+    }
+
+    ret = dict_get_int32n(dict, "replica-count", SLEN("replica-count"),
+                          &replica_count);
+    if (!ret) {
+        gf_msg(this->name, GF_LOG_INFO, errno, GD_MSG_DICT_GET_SUCCESS,
+               "replica-count is %d", replica_count);
+    }
+
+    ret = dict_get_int32n(dict, "arbiter-count", SLEN("arbiter-count"),
+                          &arbiter_count);
+    if (!ret) {
+        gf_msg(this->name, GF_LOG_INFO, errno, GD_MSG_DICT_GET_SUCCESS,
+               "arbiter-count is %d", arbiter_count);
+    }
+
+    ret = dict_get_int32n(dict, "stripe-count", SLEN("stripe-count"),
+                          &stripe_count);
+    if (!ret) {
+        gf_msg(this->name, GF_LOG_INFO, errno, GD_MSG_DICT_GET_SUCCESS,
+               "stripe-count is %d", stripe_count);
+    }
+
+    if (!dict_getn(dict, "force", SLEN("force"))) {
+        gf_msg(this->name, GF_LOG_ERROR, errno, GD_MSG_DICT_GET_FAILED,
+               "Failed to get flag");
+        /* ret still holds the result of the optional lookups above and
+           may be 0; force it so the error reply is actually sent. */
+        ret = -1;
+        goto out;
+    }
+
+    total_bricks = volinfo->brick_count + brick_count;
+
+    if (!stripe_count && !replica_count) {
+        if (volinfo->type == GF_CLUSTER_TYPE_NONE)
+            goto brick_val;
+
+        if ((volinfo->brick_count < volinfo->dist_leaf_count) &&
+            (total_bricks <= volinfo->dist_leaf_count))
+            goto brick_val;
+
+        if ((brick_count % volinfo->dist_leaf_count) != 0) {
+            snprintf(err_str, sizeof(err_str),
+                     "Incorrect number "
+                     "of bricks supplied %d with count %d",
+                     brick_count, volinfo->dist_leaf_count);
+            gf_msg(this->name, GF_LOG_ERROR, EINVAL, GD_MSG_VOL_NOT_REPLICA,
+                   "%s", err_str);
+            ret = -1;
+            goto out;
+        }
+        goto brick_val;
+        /* done with validation.. below section is if stripe|replica
+           count is given */
+    }
+
+    ret = gd_addbr_validate_replica_count(volinfo, replica_count, arbiter_count,
+                                          total_bricks, &type, err_str,
+                                          sizeof(err_str));
+    if (ret == -1) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_COUNT_VALIDATE_FAILED, "%s",
+               err_str);
+        goto out;
+    }
+
+    /* if replica count is same as earlier, set it back to 0 */
+    if (ret == 1)
+        replica_count = 0;
+
+    ret = dict_set_int32n(dict, "replica-count", SLEN("replica-count"),
+                          replica_count);
+    if (ret) {
+        gf_msg(this->name, GF_LOG_ERROR, errno, GD_MSG_DICT_SET_FAILED,
+               "failed to set the replica-count in dict");
+        goto out;
+    }
+
+brick_val:
+    ret = dict_get_strn(dict, "bricks", SLEN("bricks"), &bricks);
+    if (ret) {
+        snprintf(err_str, sizeof(err_str), "Unable to get volume bricks");
+        gf_msg(this->name, GF_LOG_ERROR, errno, GD_MSG_DICT_GET_FAILED, "%s",
+               err_str);
+        goto out;
+    }
+
+    if (type != volinfo->type) {
+        ret = dict_set_int32n(dict, "type", SLEN("type"), type);
+        if (ret) {
+            gf_msg(this->name, GF_LOG_ERROR, errno, GD_MSG_DICT_SET_FAILED,
+                   "failed to set the new type in dict");
+            goto out;
+        }
+    }
+
+    if (conf->op_version <= GD_OP_VERSION_3_7_5) {
+        gf_msg_debug(this->name, 0,
+                     "The cluster is operating at "
+                     "version less than or equal to %d. Falling back "
+                     "to syncop framework.",
+                     GD_OP_VERSION_3_7_5);
+        ret = glusterd_op_begin_synctask(req, GD_OP_ADD_BRICK, dict);
+    } else {
+        ret = glusterd_mgmt_v3_initiate_all_phases(req, GD_OP_ADD_BRICK, dict);
+    }
+
+out:
+    if (ret) {
+        rsp.op_ret = -1;
+        rsp.op_errno = 0;
+        if (err_str[0] == '\0')
+            snprintf(err_str, sizeof(err_str), "Operation failed");
+        rsp.op_errstr = err_str;
+        cli_rsp = &rsp;
+        glusterd_to_cli(req, cli_rsp, NULL, 0, NULL, (xdrproc_t)xdr_gf_cli_rsp,
+                        dict);
+        ret = 0;  // sent error to cli, prevent second reply
+    }
+
+    free(cli_req.dict.dict_val);  // its malloced by xdr
+
+    return ret;
+}
+
+int
+glusterd_handle_add_brick(rpcsvc_request_t *req)
+{ /* passes req and the add-brick handler to glusterd_big_locked_handler */
+    return glusterd_big_locked_handler(req, __glusterd_handle_add_brick);
+}
+
+/* Allocate the per-subvolume counter array used while validating a
+ * remove-brick request: 'count' ints obtained with GF_CALLOC, stored in
+ * *subvols.
+ * Returns 0 on success and -1 when the allocation fails. */
+static int
+subvol_matcher_init(int **subvols, int count)
+{
+    *subvols = GF_CALLOC(count, sizeof(int), gf_gld_mt_int);
+
+    return (*subvols == NULL) ? -1 : 0;
+}
+
+/* Count 'brickinfo' (matched by hostname and path) against the subvolume
+ * it belongs to. No-op when 'subvols' is NULL or the brick is not found. */
+static void
+subvol_matcher_update(int *subvols, glusterd_volinfo_t *volinfo,
+                      glusterd_brickinfo_t *brickinfo)
+{
+    glusterd_brickinfo_t *cur = NULL;
+    int index = 0;
+
+    if (!subvols)
+        return;
+    cds_list_for_each_entry(cur, &volinfo->bricks, brick_list)
+    {
+        if (!strcmp(cur->hostname, brickinfo->hostname) &&
+            !strcmp(cur->path, brickinfo->path)) {
+            gf_msg_debug(THIS->name, 0, LOGSTR_FOUND_BRICK, brickinfo->hostname,
+                         brickinfo->path, volinfo->volname);
+            subvols[index / volinfo->dist_leaf_count]++;
+            return;
+        }
+        index++;
+    }
+}
+
+/* Check the counters filled in by subvol_matcher_update(). With a non-zero
+ * replica_count (and counters present) each one must equal
+ * (volinfo->replica_count - replica_count); otherwise each must be a multiple
+ * of dist_leaf_count. Returns 0 on success, -1 with the reason in err_str. */
+static int
+subvol_matcher_verify(int *subvols, glusterd_volinfo_t *volinfo, char *err_str,
+                      size_t err_len, char *vol_type, int replica_count)
+{
+    xlator_t *this = THIS;
+    int expected = volinfo->replica_count - replica_count;
+    int idx = 0;
+
+    GF_ASSERT(this);
+
+    if (replica_count && subvols) {
+        while (idx < volinfo->subvol_count) {
+            if (subvols[idx++] == expected)
+                continue;
+            snprintf(err_str, err_len,
+                     "Remove exactly %d"
+                     " brick(s) from each subvolume.",
+                     expected);
+            gf_smsg(this->name, GF_LOG_ERROR, errno,
+                    GD_MSG_BRICK_SUBVOL_VERIFY_FAIL, err_str, NULL);
+            return -1;
+        }
+        return 0;
+    }
+
+    /* do-while: entry 0 is examined even if subvol_count is not positive */
+    do {
+        if (!subvols || (subvols[idx] % volinfo->dist_leaf_count != 0)) {
+            snprintf(err_str, err_len, "Bricks not from same subvol for %s",
+                     vol_type);
+            gf_smsg(this->name, GF_LOG_ERROR, errno,
+                    GD_MSG_BRICK_SUBVOL_VERIFY_FAIL, err_str, NULL);
+            return -1;
+        }
+    } while (++idx < volinfo->subvol_count);
+    return 0;
+}
+
+static void
+subvol_matcher_destroy(int *subvols)
+{
+    GF_FREE(subvols); /* array allocated by subvol_matcher_init() */
+}
+
+/* Apply the arbiter rules of a replica-reducing remove-brick. They only
+ * concern replicate volumes with arbiter bricks when replica_count is given:
+ * for replica_count 2 every removed brick must be the last brick of its
+ * group; for replica_count 1 every subvolume must have its group's last
+ * brick among the removed ones. Returns 0 if acceptable (or not applicable),
+ * -1 otherwise, including when the scratch array cannot be allocated. */
+static int
+glusterd_remove_brick_validate_arbiters(glusterd_volinfo_t *volinfo,
+                                        int32_t count, int32_t replica_count,
+                                        glusterd_brickinfo_t **brickinfo_list,
+                                        char *err_str, size_t err_len)
+{
+    xlator_t *this = THIS;
+    glusterd_brickinfo_t *candidate = NULL;
+    char *seen = NULL;
+    int result = 0;
+    int n = 0;
+
+    GF_ASSERT(this);
+
+    if ((volinfo->type != GF_CLUSTER_TYPE_REPLICATE) || !replica_count ||
+        !volinfo->arbiter_count)
+        return 0;
+
+    if (replica_count == 2) {
+        /* arbiter to replica 2: only arbiter bricks may be removed */
+        for (n = 0; n < count; n++) {
+            candidate = brickinfo_list[n];
+            if (candidate == get_last_brick_of_brick_group(volinfo, candidate))
+                continue;
+            snprintf(err_str, err_len,
+                     "Remove arbiter brick(s) only when converting from "
+                     "arbiter to replica 2 subvolume.");
+            gf_smsg(this->name, GF_LOG_ERROR, errno,
+                    GD_MSG_REMOVE_ARBITER_BRICK, err_str, NULL);
+            return -1;
+        }
+        return 0;
+    }
+
+    if (replica_count != 1)
+        return 0;
+
+    /* arbiter to plain distribute: in every replica subvol the arbiter has
+     * to be one of the removed bricks; seen[g] records that for group g */
+    seen = GF_CALLOC(volinfo->subvol_count, sizeof(*seen), gf_common_mt_char);
+    if (!seen)
+        return -1;
+
+    for (n = 0; n < count; n++) {
+        candidate = brickinfo_list[n];
+        if (candidate == get_last_brick_of_brick_group(volinfo, candidate))
+            seen[candidate->group] = 1;
+    }
+
+    for (n = 0; n < volinfo->subvol_count; n++) {
+        if (seen[n])
+            continue;
+        snprintf(err_str, err_len,
+                 "Removed bricks must contain arbiter when converting"
+                 " to plain distribute.");
+        gf_smsg(this->name, GF_LOG_ERROR, errno, GD_MSG_REMOVE_ARBITER_BRICK,
+                err_str, NULL);
+        result = -1;
+        break;
+    }
+    GF_FREE(seen);
+    return result;
+}
+
+/* Handle a remove-brick request coming from the CLI.
+ *
+ * Decodes the request dictionary, validates the optional "replica-count",
+ * the number of bricks and their placement across subvolumes (including
+ * the arbiter rules), and then starts the GD_OP_REMOVE_BRICK transaction.
+ * On any failure an error reply carrying err_str is sent to the CLI and 0
+ * is returned so that no second reply is generated.
+ */
+int
+__glusterd_handle_remove_brick(rpcsvc_request_t *req)
+{
+    int32_t ret = -1;
+    gf_cli_req cli_req = {{
+        0,
+    }};
+    dict_t *dict = NULL;
+    int32_t count = 0;
+    char *brick = NULL;
+    char key[64] = "";
+    int keylen;
+    int i = 1;
+    glusterd_conf_t *conf = NULL;
+    glusterd_volinfo_t *volinfo = NULL;
+    glusterd_brickinfo_t *brickinfo = NULL;
+    glusterd_brickinfo_t **brickinfo_list = NULL;
+    int *subvols = NULL;
+    char err_str[2048] = "";
+    gf_cli_rsp rsp = {
+        0,
+    };
+    void *cli_rsp = NULL;
+    char vol_type[256] = "";
+    int32_t replica_count = 0;
+    char *volname = NULL;
+    xlator_t *this = NULL;
+    int cmd = -1;
+
+    GF_ASSERT(req);
+    this = THIS;
+    GF_ASSERT(this);
+    conf = this->private;
+    GF_ASSERT(conf);
+
+    ret = xdr_to_generic(req->msg[0], &cli_req, (xdrproc_t)xdr_gf_cli_req);
+    if (ret < 0) {
+        // failed to decode msg;
+        req->rpc_err = GARBAGE_ARGS;
+        snprintf(err_str, sizeof(err_str), "Received garbage args");
+        gf_smsg(this->name, GF_LOG_ERROR, errno, GD_MSG_GARBAGE_ARGS, NULL);
+        goto out;
+    }
+
+    gf_msg(this->name, GF_LOG_INFO, 0, GD_MSG_REM_BRICK_REQ_RECVD,
+           "Received rem brick req");
+
+    if (cli_req.dict.dict_len) {
+        /* Unserialize the dictionary */
+        dict = dict_new();
+
+        ret = dict_unserialize(cli_req.dict.dict_val, cli_req.dict.dict_len,
+                               &dict);
+        if (ret < 0) {
+            gf_msg(this->name, GF_LOG_ERROR, errno,
+                   GD_MSG_DICT_UNSERIALIZE_FAIL,
+                   "failed to unserialize req-buffer to dictionary");
+            snprintf(err_str, sizeof(err_str), "Unable to decode the command");
+            goto out;
+        }
+    }
+
+    ret = dict_get_strn(dict, "volname", SLEN("volname"), &volname);
+    if (ret) {
+        snprintf(err_str, sizeof(err_str), "Unable to get volume name");
+        gf_msg(this->name, GF_LOG_ERROR, errno, GD_MSG_DICT_GET_FAILED, "%s",
+               err_str);
+        goto out;
+    }
+
+    ret = dict_get_int32n(dict, "count", SLEN("count"), &count);
+    if (ret) {
+        snprintf(err_str, sizeof(err_str), "Unable to get brick count");
+        gf_msg(this->name, GF_LOG_ERROR, errno, GD_MSG_DICT_GET_FAILED, "%s",
+               err_str);
+        goto out;
+    }
+
+    ret = glusterd_volinfo_find(volname, &volinfo);
+    if (ret) {
+        snprintf(err_str, sizeof(err_str), "Volume %s does not exist", volname);
+        gf_msg(this->name, GF_LOG_ERROR, EINVAL, GD_MSG_VOL_NOT_FOUND, "%s",
+               err_str);
+        goto out;
+    }
+
+    ret = dict_get_int32n(dict, "command", SLEN("command"), &cmd);
+    if (ret) {
+        snprintf(err_str, sizeof(err_str), "Unable to get command");
+        gf_msg(this->name, GF_LOG_ERROR, errno, GD_MSG_DICT_GET_FAILED, "%s",
+               err_str);
+        goto out;
+    }
+
+    ret = dict_get_int32n(dict, "replica-count", SLEN("replica-count"),
+                          &replica_count);
+    if (!ret) {
+        gf_msg(this->name, GF_LOG_INFO, errno, GD_MSG_DICT_GET_SUCCESS,
+               "request to change replica-count to %d", replica_count);
+        ret = gd_rmbr_validate_replica_count(volinfo, replica_count, count,
+                                             err_str, sizeof(err_str));
+        if (ret < 0) {
+            /* logging and error msg are done in above function
+               itself */
+            goto out;
+        }
+        /* ret == 1: the count is unchanged, so the key stays deleted and
+           replica_count is cleared; ret == 0: store the new count again. */
+        dict_deln(dict, "replica-count", SLEN("replica-count"));
+        if (ret) {
+            replica_count = 0;
+        } else {
+            ret = dict_set_int32n(dict, "replica-count", SLEN("replica-count"),
+                                  replica_count);
+            if (ret) {
+                gf_msg(this->name, GF_LOG_WARNING, errno,
+                       GD_MSG_DICT_SET_FAILED,
+                       "failed to set the replica_count in dict");
+                goto out;
+            }
+        }
+    }
+
+    /* 'vol_type' is used for giving the meaning full error msg for user */
+    if (volinfo->type == GF_CLUSTER_TYPE_REPLICATE) {
+        strcpy(vol_type, "replica");
+    } else if (volinfo->type == GF_CLUSTER_TYPE_DISPERSE) {
+        strcpy(vol_type, "disperse");
+    } else {
+        strcpy(vol_type, "distribute");
+    }
+
+    if (!replica_count && (volinfo->type == GF_CLUSTER_TYPE_REPLICATE) &&
+        (volinfo->brick_count == volinfo->dist_leaf_count)) {
+        snprintf(err_str, sizeof(err_str),
+                 "Removing bricks from replicate configuration "
+                 "is not allowed without reducing replica count "
+                 "explicitly.");
+        gf_msg(this->name, GF_LOG_ERROR, EPERM, GD_MSG_OP_NOT_PERMITTED_AC_REQD,
+               "%s", err_str);
+        ret = -1;
+        goto out;
+    }
+
+    /* Do not allow remove-brick if the bricks given is less than
+       the replica count or stripe count */
+    if (!replica_count && (volinfo->type != GF_CLUSTER_TYPE_NONE)) {
+        if (volinfo->dist_leaf_count && (count % volinfo->dist_leaf_count)) {
+            snprintf(err_str, sizeof(err_str),
+                     "Remove brick "
+                     "incorrect brick count of %d for %s %d",
+                     count, vol_type, volinfo->dist_leaf_count);
+            gf_msg(this->name, GF_LOG_ERROR, EINVAL, GD_MSG_INVALID_ENTRY, "%s",
+                   err_str);
+            ret = -1;
+            goto out;
+        }
+    }
+
+    if ((volinfo->type != GF_CLUSTER_TYPE_NONE) &&
+        (volinfo->subvol_count > 1)) {
+        ret = subvol_matcher_init(&subvols, volinfo->subvol_count);
+        if (ret)
+            goto out;
+    }
+
+    brickinfo_list = GF_CALLOC(count, sizeof(*brickinfo_list),
+                               gf_common_mt_pointer);
+    if (!brickinfo_list) {
+        ret = -1;
+        goto out;
+    }
+
+    while (i <= count) {
+        keylen = snprintf(key, sizeof(key), "brick%d", i);
+        ret = dict_get_strn(dict, key, keylen, &brick);
+        if (ret) {
+            snprintf(err_str, sizeof(err_str), "Unable to get %s", key);
+            gf_msg(this->name, GF_LOG_ERROR, errno, GD_MSG_DICT_GET_FAILED,
+                   "%s", err_str);
+            goto out;
+        }
+        gf_msg_debug(this->name, 0,
+                     "Remove brick count %d brick:"
+                     " %s",
+                     i, brick);
+
+        ret = glusterd_volume_brickinfo_get_by_brick(brick, volinfo, &brickinfo,
+                                                     _gf_false);
+
+        if (ret) {
+            snprintf(err_str, sizeof(err_str),
+                     "Incorrect brick "
+                     "%s for volume %s",
+                     brick, volname);
+            gf_msg(this->name, GF_LOG_ERROR, EINVAL, GD_MSG_BRICK_NOT_FOUND,
+                   "%s", err_str);
+            goto out;
+        }
+        brickinfo_list[i - 1] = brickinfo;
+
+        i++;
+        if ((volinfo->type == GF_CLUSTER_TYPE_NONE) ||
+            (volinfo->brick_count <= volinfo->dist_leaf_count))
+            continue;
+
+        subvol_matcher_update(subvols, volinfo, brickinfo);
+    }
+
+    if ((volinfo->type != GF_CLUSTER_TYPE_NONE) &&
+        (volinfo->subvol_count > 1)) {
+        ret = subvol_matcher_verify(subvols, volinfo, err_str, sizeof(err_str),
+                                    vol_type, replica_count);
+        if (ret)
+            goto out;
+    }
+
+    ret = glusterd_remove_brick_validate_arbiters(volinfo, count, replica_count,
+                                                  brickinfo_list, err_str,
+                                                  sizeof(err_str));
+    if (ret)
+        goto out;
+
+    if (conf->op_version < GD_OP_VERSION_8_0) {
+        gf_msg_debug(this->name, 0,
+                     "The cluster is operating at "
+                     "version less than %d. remove-brick operation "
+                     "falling back to syncop framework.",
+                     GD_OP_VERSION_8_0);
+        ret = glusterd_op_begin_synctask(req, GD_OP_REMOVE_BRICK, dict);
+    } else {
+        ret = glusterd_mgmt_v3_initiate_all_phases(req, GD_OP_REMOVE_BRICK,
+                                                   dict);
+    }
+
+out:
+    if (ret) {
+        rsp.op_ret = -1;
+        rsp.op_errno = 0;
+        if (err_str[0] == '\0')
+            snprintf(err_str, sizeof(err_str), "Operation failed");
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_GLUSTERD_OP_FAILED, "%s",
+               err_str);
+        rsp.op_errstr = err_str;
+        cli_rsp = &rsp;
+        glusterd_to_cli(req, cli_rsp, NULL, 0, NULL, (xdrproc_t)xdr_gf_cli_rsp,
+                        dict);
+
+        ret = 0;  // sent error to cli, prevent second reply
+    }
+
+    if (brickinfo_list)
+        GF_FREE(brickinfo_list);
+    subvol_matcher_destroy(subvols);
+    free(cli_req.dict.dict_val);  // its malloced by xdr
+
+    return ret;
+}
+
+int
+glusterd_handle_remove_brick(rpcsvc_request_t *req)
+{ /* passes req and the remove-brick handler to glusterd_big_locked_handler */
+    return glusterd_big_locked_handler(req, __glusterd_handle_remove_brick);
+}
+
+/*
+ * dict_foreach() callback used after add-brick: for one entry of the
+ * volume's gsync_slaves dict, restart the geo-replication (gsyncd) session
+ * if it is currently running on this node.
+ *
+ * @this:  dict being iterated (not used here)
+ * @key:   key of the current entry (not used here)
+ * @value: entry value; the text after the first ':' is taken as the slave
+ * @data:  glusterd_gsync_status_temp_t * carrying volinfo and rsp_dict
+ *
+ * Returns 0 when there is nothing to do (no ':' in the value, session not
+ * running locally, no local brick paths) or when the restart succeeded;
+ * non-zero on failure.
+ */
+static int
+_glusterd_restart_gsync_session(dict_t *this, char *key, data_t *value,
+                                void *data)
+{
+    char *slave = NULL;
+    char *slave_buf = NULL;
+    char *path_list = NULL;
+    char *slave_vol = NULL;
+    char *slave_host = NULL;
+    char *slave_url = NULL;
+    char *conf_path = NULL;
+    /* Real storage for the callee's error string. Previously a NULL
+     * 'char **' was passed, which made the error-message branch below dead
+     * code and handed the callee a NULL out-pointer. */
+    char *errmsg = NULL;
+    int ret = -1;
+    glusterd_gsync_status_temp_t *param = NULL;
+    gf_boolean_t is_running = _gf_false;
+
+    param = (glusterd_gsync_status_temp_t *)data;
+
+    GF_ASSERT(param);
+    GF_ASSERT(param->volinfo);
+
+    slave = strchr(value->data, ':');
+    if (!slave)
+        return 0;
+
+    /* Skip the ':' itself; the slave specification follows it. */
+    slave++;
+    slave_buf = gf_strdup(slave);
+    if (!slave_buf) {
+        gf_msg("glusterd", GF_LOG_ERROR, ENOMEM, GD_MSG_NO_MEMORY,
+               "Failed to gf_strdup");
+        ret = -1;
+        goto out;
+    }
+
+    /* On success the dict takes the duplicated string (dynstr); it is only
+     * freed here when the set fails. */
+    ret = dict_set_dynstrn(param->rsp_dict, "slave", SLEN("slave"), slave_buf);
+    if (ret) {
+        gf_msg("glusterd", GF_LOG_ERROR, errno, GD_MSG_DICT_SET_FAILED,
+               "Unable to store slave");
+        if (slave_buf)
+            GF_FREE(slave_buf);
+        goto out;
+    }
+
+    ret = glusterd_get_slave_details_confpath(param->volinfo, param->rsp_dict,
+                                              &slave_url, &slave_host,
+                                              &slave_vol, &conf_path, &errmsg);
+    if (ret) {
+        if (errmsg)
+            gf_msg("glusterd", GF_LOG_ERROR, 0,
+                   GD_MSG_SLAVE_CONFPATH_DETAILS_FETCH_FAIL, "%s", errmsg);
+        else
+            gf_msg("glusterd", GF_LOG_ERROR, 0,
+                   GD_MSG_SLAVE_CONFPATH_DETAILS_FETCH_FAIL,
+                   "Unable to fetch slave or confpath details.");
+        goto out;
+    }
+
+    /* In cases that gsyncd is not running, we will not invoke it
+     * because of add-brick. */
+    ret = glusterd_check_gsync_running_local(param->volinfo->volname, slave,
+                                             conf_path, &is_running);
+    if (ret) {
+        gf_msg("glusterd", GF_LOG_ERROR, 0, GD_MSG_GSYNC_VALIDATION_FAIL,
+               "gsync running validation failed.");
+        goto out;
+    }
+    if (_gf_false == is_running) {
+        gf_msg_debug("glusterd", 0,
+                     "gsync session for %s and %s is"
+                     " not running on this node. Hence not restarting.",
+                     param->volinfo->volname, slave);
+        ret = 0;
+        goto out;
+    }
+
+    /* The return code is deliberately superseded by the path_list check:
+     * an empty list means this node hosts no brick of the volume. */
+    ret = glusterd_get_local_brickpaths(param->volinfo, &path_list);
+    if (!path_list) {
+        gf_msg_debug("glusterd", 0,
+                     "This node not being part of"
+                     " volume should not be running gsyncd. Hence"
+                     " no gsyncd process to restart.");
+        ret = 0;
+        goto out;
+    }
+
+    ret = glusterd_check_restart_gsync_session(
+        param->volinfo, slave, param->rsp_dict, path_list, conf_path, 0);
+    if (ret)
+        gf_msg("glusterd", GF_LOG_ERROR, 0, GD_MSG_GSYNC_RESTART_FAIL,
+               "Unable to restart gsync session.");
+
+out:
+    /* NOTE(review): assumes the callee allocates the message with the GF
+     * allocator (as the gf_strdup'd op_errstr strings elsewhere in this file
+     * are) - confirm. Ownership of path_list, slave_url, slave_host,
+     * slave_vol and conf_path is not visible from here and is left as is. */
+    GF_FREE(errmsg);
+    gf_msg_debug("glusterd", 0, "Returning %d.", ret);
+    return ret;
+}
+
+/* op-sm */
+
+/*
+ * Commit-phase worker for add-brick.
+ *
+ * Creates a brickinfo for each of the `count` bricks named in `bricks`,
+ * links them into `volinfo`, applies the type / replica / arbiter / stripe
+ * counts found in `dict`, recomputes the derived sub-volume counts and, when
+ * the volume is started, generates brick volfiles, starts the new bricks and
+ * restarts running geo-replication sessions.
+ *
+ * @volinfo: volume being extended (must not be NULL)
+ * @count:   number of bricks contained in `bricks`
+ * @bricks:  brick list; it is tokenised on " \n" starting at its second
+ *           character (presumably the list begins with a separator -
+ *           confirm against the producer of this string)
+ * @dict:    operation dict; optional "stripe-count", "replica-count",
+ *           "arbiter-count" and "type" keys plus "brick<N>.mount_dir"
+ *
+ * Returns 0 on success, non-zero on failure.
+ *
+ * NOTE: bricks that were already linked into volinfo before a later failure
+ * are not rolled back here.
+ */
+int
+glusterd_op_perform_add_bricks(glusterd_volinfo_t *volinfo, int32_t count,
+                               char *bricks, dict_t *dict)
+{
+    char *brick = NULL;
+    int32_t i = 1;
+    char *brick_list = NULL;
+    char *free_ptr1 = NULL;
+    char *free_ptr2 = NULL;
+    char *saveptr = NULL;
+    int32_t ret = -1;
+    int32_t stripe_count = 0;
+    int32_t replica_count = 0;
+    int32_t arbiter_count = 0;
+    int32_t type = 0;
+    glusterd_brickinfo_t *brickinfo = NULL;
+    /* Brick created in the first loop but not yet linked into
+     * volinfo->bricks; released at out: if a later step fails. */
+    glusterd_brickinfo_t *pending_brick = NULL;
+    glusterd_gsync_status_temp_t param = {
+        0,
+    };
+    gf_boolean_t restart_needed = 0;
+    int brickid = 0;
+    char key[64] = "";
+    char *brick_mount_dir = NULL;
+    xlator_t *this = NULL;
+    glusterd_conf_t *conf = NULL;
+    gf_boolean_t is_valid_add_brick = _gf_false;
+    gf_boolean_t restart_shd = _gf_false;
+    struct statvfs brickstat = {
+        0,
+    };
+
+    this = THIS;
+    GF_ASSERT(this);
+    GF_ASSERT(volinfo);
+
+    conf = this->private;
+    GF_ASSERT(conf);
+
+    if (bricks) {
+        brick_list = gf_strdup(bricks);
+        free_ptr1 = brick_list;
+    }
+
+    if (count) {
+        /* Tokenising 'brick_list + 1' with a NULL brick_list would
+         * dereference an invalid pointer. */
+        if (!brick_list) {
+            gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_NO_MEMORY,
+                   "Brick list is missing or could not be duplicated");
+            ret = -1;
+            goto out;
+        }
+        brick = strtok_r(brick_list + 1, " \n", &saveptr);
+    }
+
+    if (dict) {
+        /* All four keys are optional; a failed lookup just leaves the
+         * corresponding value at 0. */
+        ret = dict_get_int32n(dict, "stripe-count", SLEN("stripe-count"),
+                              &stripe_count);
+        if (!ret)
+            gf_msg(THIS->name, GF_LOG_INFO, errno, GD_MSG_DICT_GET_SUCCESS,
+                   "stripe-count is set %d", stripe_count);
+
+        ret = dict_get_int32n(dict, "replica-count", SLEN("replica-count"),
+                              &replica_count);
+        if (!ret)
+            gf_msg(THIS->name, GF_LOG_INFO, errno, GD_MSG_DICT_GET_SUCCESS,
+                   "replica-count is set %d", replica_count);
+        ret = dict_get_int32n(dict, "arbiter-count", SLEN("arbiter-count"),
+                              &arbiter_count);
+        if (!ret)
+            gf_msg(THIS->name, GF_LOG_INFO, errno, GD_MSG_DICT_GET_SUCCESS,
+                   "arbiter-count is set %d", arbiter_count);
+        ret = dict_get_int32n(dict, "type", SLEN("type"), &type);
+        if (!ret)
+            gf_msg(THIS->name, GF_LOG_INFO, errno, GD_MSG_DICT_GET_SUCCESS,
+                   "type is set %d, need to change it", type);
+    }
+
+    brickid = glusterd_get_next_available_brickid(volinfo);
+    if (brickid < 0) {
+        /* ret still holds the result of the last optional dict lookup and
+         * may be 0; make sure the failure is reported to the caller. */
+        ret = -1;
+        goto out;
+    }
+    while (i <= count) {
+        ret = glusterd_brickinfo_new_from_brick(brick, &brickinfo, _gf_true,
+                                                NULL);
+        if (ret)
+            goto out;
+        pending_brick = brickinfo;
+
+        GLUSTERD_ASSIGN_BRICKID_TO_BRICKINFO(brickinfo, volinfo, brickid++);
+
+        /* A bricks mount dir is required only by snapshots which were
+         * introduced in gluster-3.6.0
+         */
+        if (conf->op_version >= GD_OP_VERSION_3_6_0) {
+            brick_mount_dir = NULL;
+
+            snprintf(key, sizeof(key), "brick%d.mount_dir", i);
+            ret = dict_get_str(dict, key, &brick_mount_dir);
+            if (ret) {
+                gf_msg(this->name, GF_LOG_ERROR, errno, GD_MSG_DICT_GET_FAILED,
+                       "%s not present", key);
+                goto out;
+            }
+            strncpy(brickinfo->mount_dir, brick_mount_dir,
+                    SLEN(brickinfo->mount_dir));
+        }
+
+        ret = glusterd_resolve_brick(brickinfo);
+        if (ret)
+            goto out;
+
+        /* Only bricks hosted on this node can be stat'ed locally. */
+        if (!gf_uuid_compare(brickinfo->uuid, MY_UUID)) {
+            ret = sys_statvfs(brickinfo->path, &brickstat);
+            if (ret) {
+                gf_msg(this->name, GF_LOG_ERROR, errno, GD_MSG_STATVFS_FAILED,
+                       "Failed to fetch disk utilization "
+                       "from the brick (%s:%s). Please check the health of "
+                       "the brick. Error code was %s",
+                       brickinfo->hostname, brickinfo->path, strerror(errno));
+
+                goto out;
+            }
+            brickinfo->statfs_fsid = brickstat.f_fsid;
+        }
+        if (stripe_count || replica_count) {
+            add_brick_at_right_order(brickinfo, volinfo, (i - 1), stripe_count,
+                                     replica_count);
+        } else {
+            cds_list_add_tail(&brickinfo->brick_list, &volinfo->bricks);
+        }
+        /* The brick now belongs to the volume's brick list. */
+        pending_brick = NULL;
+
+        brick = strtok_r(NULL, " \n", &saveptr);
+        i++;
+        volinfo->brick_count++;
+    }
+
+    /* Gets changed only if the options are given in add-brick cli */
+    if (type)
+        volinfo->type = type;
+    /* performance.client-io-threads is turned on by default,
+     * however this has adverse effects on replicate volumes due to
+     * replication design issues, till that get addressed
+     * performance.client-io-threads option is turned off for all
+     * replicate volumes if not already explicitly enabled.
+     */
+    if (type && glusterd_is_volume_replicate(volinfo) &&
+        conf->op_version >= GD_OP_VERSION_3_12_2) {
+        ret = dict_set_nstrn(volinfo->dict, "performance.client-io-threads",
+                             SLEN("performance.client-io-threads"), "off",
+                             SLEN("off"));
+        if (ret) {
+            gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_DICT_SET_FAILED,
+                   "Failed to set "
+                   "performance.client-io-threads to off");
+            goto out;
+        }
+    }
+
+    if (replica_count) {
+        volinfo->replica_count = replica_count;
+    }
+    if (arbiter_count) {
+        volinfo->arbiter_count = arbiter_count;
+    }
+    if (stripe_count) {
+        volinfo->stripe_count = stripe_count;
+    }
+    volinfo->dist_leaf_count = glusterd_get_dist_leaf_count(volinfo);
+
+    /* backward compatibility */
+    volinfo->sub_count = ((volinfo->dist_leaf_count == 1)
+                              ? 0
+                              : volinfo->dist_leaf_count);
+
+    volinfo->subvol_count = (volinfo->brick_count / volinfo->dist_leaf_count);
+
+    ret = 0;
+    /* A volume that is not started needs no brick processes or gsyncd
+     * handling; skip straight to volfile generation. */
+    if (GLUSTERD_STATUS_STARTED != volinfo->status)
+        goto generate_volfiles;
+
+    ret = generate_brick_volfiles(volinfo);
+    if (ret)
+        goto out;
+
+    /* strtok_r modified the first copy in place, so walk a fresh one. */
+    brick_list = gf_strdup(bricks);
+    free_ptr2 = brick_list;
+    i = 1;
+
+    if (count) {
+        if (!brick_list) {
+            gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_NO_MEMORY,
+                   "Brick list is missing or could not be duplicated");
+            ret = -1;
+            goto out;
+        }
+        brick = strtok_r(brick_list + 1, " \n", &saveptr);
+    }
+
+    if (glusterd_is_volume_replicate(volinfo)) {
+        if (replica_count && conf->op_version >= GD_OP_VERSION_3_7_10) {
+            is_valid_add_brick = _gf_true;
+            if (volinfo->status == GLUSTERD_STATUS_STARTED) {
+                /* A stop failure is logged only; shd is restarted at out:
+                 * in either case. */
+                ret = volinfo->shd.svc.stop(&(volinfo->shd.svc), SIGTERM);
+                if (ret) {
+                    gf_msg("glusterd", GF_LOG_ERROR, 0,
+                           GD_MSG_GLUSTER_SERVICES_STOP_FAIL,
+                           "Failed to stop shd for %s.", volinfo->volname);
+                }
+                restart_shd = _gf_true;
+            }
+            ret = generate_dummy_client_volfiles(volinfo);
+            if (ret) {
+                gf_msg(THIS->name, GF_LOG_ERROR, 0, GD_MSG_VOLFILE_CREATE_FAIL,
+                       "Failed to create volfile.");
+                goto out;
+            }
+        }
+    }
+
+    while (i <= count) {
+        ret = glusterd_volume_brickinfo_get_by_brick(brick, volinfo, &brickinfo,
+                                                     _gf_true);
+        if (ret)
+            goto out;
+
+        if (gf_uuid_is_null(brickinfo->uuid)) {
+            ret = glusterd_resolve_brick(brickinfo);
+            if (ret) {
+                gf_msg("glusterd", GF_LOG_ERROR, 0, GD_MSG_RESOLVE_BRICK_FAIL,
+                       FMTSTR_RESOLVE_BRICK, brickinfo->hostname,
+                       brickinfo->path);
+                goto out;
+            }
+        }
+
+        /* if the volume is a replicate volume, do: */
+        if (is_valid_add_brick) {
+            if (!gf_uuid_compare(brickinfo->uuid, MY_UUID)) {
+                ret = glusterd_handle_replicate_brick_ops(volinfo, brickinfo,
+                                                          GD_OP_ADD_BRICK);
+                if (ret < 0)
+                    goto out;
+            }
+        }
+        ret = glusterd_brick_start(volinfo, brickinfo, _gf_true, _gf_false);
+        if (ret)
+            goto out;
+        i++;
+        brick = strtok_r(NULL, " \n", &saveptr);
+
+        /* Check if the brick is added in this node, and set
+         * the restart_needed flag. */
+        if ((!gf_uuid_compare(brickinfo->uuid, MY_UUID)) && !restart_needed) {
+            restart_needed = 1;
+            gf_msg_debug("glusterd", 0,
+                         "Restart gsyncd session, if it's already "
+                         "running.");
+        }
+    }
+
+    /* If the restart_needed flag is set, restart gsyncd sessions for that
+     * particular master with all the slaves. */
+    if (restart_needed) {
+        param.rsp_dict = dict;
+        param.volinfo = volinfo;
+        dict_foreach(volinfo->gsync_slaves, _glusterd_restart_gsync_session,
+                     &param);
+    }
+
+generate_volfiles:
+    if (conf->op_version <= GD_OP_VERSION_3_7_5) {
+        ret = glusterd_create_volfiles_and_notify_services(volinfo);
+    } else {
+        /*
+         * The cluster is operating at version greater than
+         * gluster-3.7.5. So no need to sent volfile fetch
+         * request in commit phase, the same will be done
+         * in post validate phase with v3 framework.
+         */
+    }
+
+out:
+    if (pending_brick)
+        glusterd_brickinfo_delete(pending_brick);
+    GF_FREE(free_ptr1);
+    GF_FREE(free_ptr2);
+    if (restart_shd) {
+        if (volinfo->shd.svc.manager(&(volinfo->shd.svc), volinfo,
+                                     PROC_START_NO_WAIT)) {
+            gf_msg("glusterd", GF_LOG_CRITICAL, 0,
+                   GD_MSG_GLUSTER_SERVICE_START_FAIL,
+                   "Failed to start shd for %s.", volinfo->volname);
+        }
+    }
+
+    gf_msg_debug("glusterd", 0, "Returning %d", ret);
+    return ret;
+}
+
+/*
+ * Commit-phase worker for removing a single brick from a volume.
+ *
+ * Looks up and resolves `brick` inside `volinfo`, resets the volume's defrag
+ * statistics and then either stops the brick process (`force` set) or marks
+ * the brick as decommissioned (`force` clear).
+ *
+ * @volinfo:      volume owning the brick
+ * @brick:        brick specification to look up
+ * @force:        non-zero to stop the brick instead of decommissioning it
+ * @need_migrate: optional out flag, set to 1 when the brick belongs to this
+ *                glusterd
+ *
+ * Returns 0 on success, non-zero on failure.
+ */
+int
+glusterd_op_perform_remove_brick(glusterd_volinfo_t *volinfo, char *brick,
+                                 int force, int *need_migrate)
+{
+    glusterd_conf_t *conf = NULL;
+    glusterd_brickinfo_t *target = NULL;
+    int32_t rc = -1;
+
+    GF_ASSERT(volinfo);
+    GF_ASSERT(brick);
+
+    conf = THIS->private;
+    GF_ASSERT(conf);
+
+    rc = glusterd_volume_brickinfo_get_by_brick(brick, volinfo, &target,
+                                                _gf_false);
+    if (rc != 0)
+        goto out;
+
+    rc = glusterd_resolve_brick(target);
+    if (rc != 0)
+        goto out;
+
+    glusterd_volinfo_reset_defrag_stats(volinfo);
+
+    /* Only if the brick is in this glusterd, do the rebalance */
+    if (!gf_uuid_compare(target->uuid, MY_UUID) && need_migrate)
+        *need_migrate = 1;
+
+    if (!force) {
+        /* Non-forced removal only flags the brick; it keeps running. */
+        target->decommissioned = 1;
+        rc = 0;
+        goto out;
+    }
+
+    rc = glusterd_brick_stop(volinfo, target, _gf_true);
+    if (rc != 0) {
+        gf_msg(THIS->name, GF_LOG_ERROR, 0, GD_MSG_BRICK_STOP_FAIL,
+               "Unable to stop "
+               "glusterfs, ret: %d",
+               rc);
+    }
+
+out:
+    gf_msg_debug("glusterd", 0, "Returning %d", rc);
+    return rc;
+}
+
+/*
+ * Stage (validation) phase of add-brick.
+ *
+ * Validates the request in `dict` against the target volume: volume id,
+ * op-version and replica/arbiter constraints, brick ordering (originator
+ * only, unless "force"), state of existing local bricks when the replica
+ * count grows, server quorum, and an in-progress rebalance. Each new brick is
+ * then validated; for bricks hosted on this node the brick path is
+ * validated/created and its mount directory is stored in `rsp_dict` as
+ * "brick<N>.mount_dir". Finally "brick_count" is stored in `rsp_dict`; its
+ * value is the 1-based index of the last brick in the list that is hosted
+ * on this node (0 when there is none).
+ *
+ * @dict:      operation dict ("volname", "count", "bricks", optional
+ *             "replica-count", "arbiter-count", "force")
+ * @op_errstr: out; set to a gf_strdup'd message on most failures
+ * @rsp_dict:  response dict receiving the keys described above
+ *
+ * Returns 0 on success, non-zero on failure.
+ */
+int
+glusterd_op_stage_add_brick(dict_t *dict, char **op_errstr, dict_t *rsp_dict)
+{
+    int ret = 0;
+    char *volname = NULL;
+    int count = 0;
+    int replica_count = 0;
+    int arbiter_count = 0;
+    int i = 0;
+    int32_t local_brick_count = 0;
+    char *bricks = NULL;
+    char *brick_list = NULL;
+    char *saveptr = NULL;
+    char *free_ptr = NULL;
+    char *brick = NULL;
+    glusterd_brickinfo_t *brickinfo = NULL;
+    glusterd_volinfo_t *volinfo = NULL;
+    xlator_t *this = NULL;
+    char msg[4096] = "";
+    char key[64] = "";
+    gf_boolean_t brick_alloc = _gf_false;
+    gf_boolean_t is_force = _gf_false;
+    glusterd_conf_t *conf = NULL;
+    int32_t len = 0;
+
+    this = THIS;
+    GF_ASSERT(this);
+    conf = this->private;
+    GF_ASSERT(conf);
+
+    ret = dict_get_strn(dict, "volname", SLEN("volname"), &volname);
+    if (ret) {
+        gf_msg(this->name, GF_LOG_ERROR, errno, GD_MSG_DICT_GET_FAILED,
+               "Unable to get volume name");
+        goto out;
+    }
+
+    ret = glusterd_volinfo_find(volname, &volinfo);
+    if (ret) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_VOL_NOT_FOUND,
+               "Unable to find volume: %s", volname);
+        goto out;
+    }
+
+    ret = glusterd_validate_volume_id(dict, volinfo);
+    if (ret)
+        goto out;
+
+    /* "replica-count" is optional; its absence leaves replica_count at 0. */
+    ret = dict_get_int32n(dict, "replica-count", SLEN("replica-count"),
+                          &replica_count);
+    if (ret) {
+        gf_msg_debug(this->name, 0, "Unable to get replica count");
+    }
+
+    if (replica_count > 0) {
+        ret = op_version_check(this, GD_OP_VER_PERSISTENT_AFR_XATTRS, msg,
+                               sizeof(msg));
+        if (ret) {
+            gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_OP_VERSION_MISMATCH,
+                   "%s", msg);
+            *op_errstr = gf_strdup(msg);
+            goto out;
+        }
+    }
+
+    glusterd_add_peers_to_auth_list(volname);
+
+    if (replica_count && glusterd_is_volume_replicate(volinfo)) {
+        /* Do not allow add-brick for stopped volumes when replica-count
+         * is being increased.
+         */
+        if (GLUSTERD_STATUS_STOPPED == volinfo->status &&
+            conf->op_version >= GD_OP_VERSION_3_7_10) {
+            ret = -1;
+            snprintf(msg, sizeof(msg),
+                     " Volume must not be in"
+                     " stopped state when replica-count needs to "
+                     " be increased.");
+            gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_BRICK_ADD_FAIL, "%s",
+                   msg);
+            *op_errstr = gf_strdup(msg);
+            goto out;
+        }
+        /* op-version check for replica 2 to arbiter conversion. If we
+         * don't have this check, an older peer added as arbiter brick
+         * will not have the  arbiter xlator in its volfile. */
+        if ((replica_count == 3) && (conf->op_version < GD_OP_VERSION_3_8_0)) {
+            ret = dict_get_int32n(dict, "arbiter-count", SLEN("arbiter-count"),
+                                  &arbiter_count);
+            if (ret) {
+                gf_msg_debug(this->name, 0,
+                             "No arbiter count present in the dict");
+            } else if (arbiter_count == 1) {
+                ret = -1;
+                snprintf(msg, sizeof(msg),
+                         "Cluster op-version must "
+                         "be >= 30800 to add arbiter brick to a "
+                         "replica 2 volume.");
+                gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_BRICK_ADD_FAIL, "%s",
+                       msg);
+                *op_errstr = gf_strdup(msg);
+                goto out;
+            }
+        }
+        /* Do not allow increasing replica count for arbiter volumes. */
+        if (volinfo->arbiter_count) {
+            ret = -1;
+            snprintf(msg, sizeof(msg),
+                     "Increasing replica count "
+                     "for arbiter volumes is not supported.");
+            gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_BRICK_ADD_FAIL, "%s",
+                   msg);
+            *op_errstr = gf_strdup(msg);
+            goto out;
+        }
+    }
+
+    is_force = dict_get_str_boolean(dict, "force", _gf_false);
+
+    /* Check brick order if the volume type is replicate or disperse. If
+     * force at the end of command not given then check brick order.
+     * doing this check at the originator node is sufficient.
+     */
+
+    if (!is_force && is_origin_glusterd(dict)) {
+        ret = 0;
+        if (volinfo->type == GF_CLUSTER_TYPE_REPLICATE) {
+            gf_msg_debug(this->name, 0,
+                         "Replicate cluster type "
+                         "found. Checking brick order.");
+            if (replica_count)
+                ret = glusterd_check_brick_order(dict, msg, volinfo->type,
+                                                 &volname, &bricks, &count,
+                                                 replica_count);
+            else
+                ret = glusterd_check_brick_order(dict, msg, volinfo->type,
+                                                 &volname, &bricks, &count,
+                                                 volinfo->replica_count);
+        } else if (volinfo->type == GF_CLUSTER_TYPE_DISPERSE) {
+            gf_msg_debug(this->name, 0,
+                         "Disperse cluster type"
+                         " found. Checking brick order.");
+            ret = glusterd_check_brick_order(dict, msg, volinfo->type, &volname,
+                                             &bricks, &count,
+                                             volinfo->disperse_count);
+        }
+        if (ret) {
+            gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_BAD_BRKORDER,
+                   "Not adding brick because of "
+                   "bad brick order. %s",
+                   msg);
+            *op_errstr = gf_strdup(msg);
+            goto out;
+        }
+    }
+
+    /* Growing the replica count without "force": every existing brick
+     * hosted on this node must be running. */
+    if (volinfo->replica_count < replica_count && !is_force) {
+        cds_list_for_each_entry(brickinfo, &volinfo->bricks, brick_list)
+        {
+            if (gf_uuid_compare(brickinfo->uuid, MY_UUID))
+                continue;
+            if (brickinfo->status == GF_BRICK_STOPPED) {
+                ret = -1;
+                len = snprintf(msg, sizeof(msg),
+                               "Brick %s "
+                               "is down, changing replica "
+                               "count needs all the bricks "
+                               "to be up to avoid data loss",
+                               brickinfo->path);
+                if (len < 0) {
+                    strcpy(msg, "<error>");
+                }
+                gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_BRICK_ADD_FAIL, "%s",
+                       msg);
+                *op_errstr = gf_strdup(msg);
+                goto out;
+            }
+        }
+    }
+
+    if (conf->op_version > GD_OP_VERSION_3_7_5 && is_origin_glusterd(dict)) {
+        ret = glusterd_validate_quorum(this, GD_OP_ADD_BRICK, dict, op_errstr);
+        if (ret) {
+            gf_msg(this->name, GF_LOG_CRITICAL, 0, GD_MSG_SERVER_QUORUM_NOT_MET,
+                   "Server quorum not met. Rejecting operation.");
+            goto out;
+        }
+    } else {
+        /* Case 1: conf->op_version <= GD_OP_VERSION_3_7_5
+         *         in this case the add-brick is running
+         *         syncop framework that will do a quorum
+         *         check by default
+         * Case 2: We don't need to do quorum check on every
+         *         node, only originator glusterd need to
+         *         check for quorum
+         * So nothing need to be done in else
+         */
+    }
+
+    if (glusterd_is_defrag_on(volinfo)) {
+        snprintf(msg, sizeof(msg),
+                 "Volume name %s rebalance is in "
+                 "progress. Please retry after completion",
+                 volname);
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_OIP_RETRY_LATER, "%s", msg);
+        *op_errstr = gf_strdup(msg);
+        ret = -1;
+        goto out;
+    }
+
+    /* Snapshots only produce a warning; msg is cleared afterwards so the
+     * text does not leak into a later error string. */
+    if (volinfo->snap_count > 0 || !cds_list_empty(&volinfo->snap_volumes)) {
+        snprintf(msg, sizeof(msg),
+                 "Volume %s  has %" PRIu64
+                 " snapshots. "
+                 "Changing the volume configuration will not effect snapshots."
+                 "But the snapshot brick mount should be intact to "
+                 "make them function.",
+                 volname, volinfo->snap_count);
+        gf_msg("glusterd", GF_LOG_WARNING, 0, GD_MSG_SNAP_WARN, "%s", msg);
+        msg[0] = '\0';
+    }
+
+    /* count and bricks may already have been filled in by the brick-order
+     * check above; fetch them from the dict only when they were not. */
+    if (!count) {
+        ret = dict_get_int32n(dict, "count", SLEN("count"), &count);
+        if (ret) {
+            gf_msg(this->name, GF_LOG_ERROR, errno, GD_MSG_DICT_GET_FAILED,
+                   "Unable to get count");
+            goto out;
+        }
+    }
+
+    if (!bricks) {
+        ret = dict_get_strn(dict, "bricks", SLEN("bricks"), &bricks);
+        if (ret) {
+            gf_msg(this->name, GF_LOG_ERROR, errno, GD_MSG_DICT_GET_FAILED,
+                   "Unable to get bricks");
+            goto out;
+        }
+    }
+
+    if (bricks) {
+        brick_list = gf_strdup(bricks);
+        free_ptr = brick_list;
+    }
+
+    if (count) {
+        /* Tokenising 'brick_list + 1' with a NULL brick_list would
+         * dereference an invalid pointer. */
+        if (!brick_list) {
+            ret = -1;
+            snprintf(msg, sizeof(msg), "Failed to duplicate the brick list");
+            gf_msg(this->name, GF_LOG_ERROR, ENOMEM, GD_MSG_NO_MEMORY, "%s",
+                   msg);
+            *op_errstr = gf_strdup(msg);
+            goto out;
+        }
+        /* The list is walked from its second character on (presumably it
+         * starts with a separator - confirm against the producer). */
+        brick = strtok_r(brick_list + 1, " \n", &saveptr);
+    }
+
+    while (i < count) {
+        if (!glusterd_store_is_valid_brickpath(volname, brick) ||
+            !glusterd_is_valid_volfpath(volname, brick)) {
+            snprintf(msg, sizeof(msg),
+                     "brick path %s is "
+                     "too long",
+                     brick);
+            gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_BRKPATH_TOO_LONG, "%s",
+                   msg);
+            *op_errstr = gf_strdup(msg);
+
+            ret = -1;
+            goto out;
+        }
+
+        ret = glusterd_brickinfo_new_from_brick(brick, &brickinfo, _gf_true,
+                                                NULL);
+        if (ret) {
+            gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_BRICK_NOT_FOUND,
+                   "Add-brick: Unable"
+                   " to get brickinfo");
+            goto out;
+        }
+        /* From here until the delete below, brickinfo is a temporary owned
+         * by this function and must be released on every exit path. */
+        brick_alloc = _gf_true;
+
+        ret = glusterd_new_brick_validate(brick, brickinfo, msg, sizeof(msg),
+                                          NULL);
+        if (ret) {
+            *op_errstr = gf_strdup(msg);
+            ret = -1;
+            goto out;
+        }
+
+        if (!gf_uuid_compare(brickinfo->uuid, MY_UUID)) {
+            ret = glusterd_validate_and_create_brickpath(
+                brickinfo, volinfo->volume_id, volinfo->volname, op_errstr,
+                is_force, _gf_false);
+            if (ret)
+                goto out;
+
+            /* A bricks mount dir is required only by snapshots which were
+             * introduced in gluster-3.6.0
+             */
+            if (conf->op_version >= GD_OP_VERSION_3_6_0) {
+                ret = glusterd_get_brick_mount_dir(
+                    brickinfo->path, brickinfo->hostname, brickinfo->mount_dir);
+                if (ret) {
+                    gf_msg(this->name, GF_LOG_ERROR, 0,
+                           GD_MSG_BRICK_MOUNTDIR_GET_FAIL,
+                           "Failed to get brick mount_dir");
+                    goto out;
+                }
+
+                snprintf(key, sizeof(key), "brick%d.mount_dir", i + 1);
+                ret = dict_set_dynstr_with_alloc(rsp_dict, key,
+                                                 brickinfo->mount_dir);
+                if (ret) {
+                    gf_msg(this->name, GF_LOG_ERROR, errno,
+                           GD_MSG_DICT_SET_FAILED, "Failed to set %s", key);
+                    goto out;
+                }
+            }
+
+            local_brick_count = i + 1;
+        }
+
+        glusterd_brickinfo_delete(brickinfo);
+        brick_alloc = _gf_false;
+        brickinfo = NULL;
+        brick = strtok_r(NULL, " \n", &saveptr);
+        i++;
+    }
+
+    ret = dict_set_int32n(rsp_dict, "brick_count", SLEN("brick_count"),
+                          local_brick_count);
+    if (ret) {
+        gf_msg(this->name, GF_LOG_ERROR, errno, GD_MSG_DICT_SET_FAILED,
+               "Failed to set local_brick_count");
+        goto out;
+    }
+
+out:
+    GF_FREE(free_ptr);
+    if (brick_alloc && brickinfo)
+        glusterd_brickinfo_delete(brickinfo);
+
+    gf_msg_debug(this->name, 0, "Returning %d", ret);
+
+    return ret;
+}
+
+/*
+ * Validate the bricks named in a remove-brick request.
+ *
+ * For every "brick<N>" key (N = 1..brick_count) in `dict`:
+ *   - the brick must belong to `volinfo`;
+ *   - for GF_OP_CMD_COMMIT the brick must already be decommissioned;
+ *   - a brick local to this node is checked only for GF_OP_CMD_START, in
+ *     which case it must be in started state and its process must be running;
+ *   - for any other brick, the hosting peer must be known and connected.
+ *
+ * @cmd:         remove-brick sub-command
+ * @brick_count: number of "brick<N>" keys to examine
+ * @dict:        operation dict holding the brick names
+ * @volinfo:     volume the bricks are expected to belong to
+ * @errstr:      out; set to a gf_strdup'd message on failure
+ * @cmd_defrag:  does not influence the outcome
+ *
+ * Returns 0 when every brick passes, non-zero otherwise (including when
+ * brick_count is not positive, in which case *errstr is left untouched).
+ */
+int
+glusterd_remove_brick_validate_bricks(gf1_op_commands cmd, int32_t brick_count,
+                                      dict_t *dict, glusterd_volinfo_t *volinfo,
+                                      char **errstr,
+                                      gf_cli_defrag_type cmd_defrag)
+{
+    xlator_t *this = THIS;
+    glusterd_conf_t *priv = THIS->private;
+    glusterd_brickinfo_t *brickinfo = NULL;
+    glusterd_peerinfo_t *peerinfo = NULL;
+    char *brick = NULL;
+    char msg[2048] = "";
+    char key[64] = "";
+    char pidfile[PATH_MAX + 1] = {
+        0,
+    };
+    int keylen;
+    int idx = 0;
+    int pid = -1;
+    int peer_known = 0;
+    int peer_up = 0;
+    int ret = -1;
+
+    GF_ASSERT(this);
+
+    /* Check whether all the nodes of the bricks to be removed are
+     * up, if not fail the operation */
+    for (idx = 1; idx <= brick_count; idx++) {
+        keylen = snprintf(key, sizeof(key), "brick%d", idx);
+        ret = dict_get_strn(dict, key, keylen, &brick);
+        if (ret) {
+            snprintf(msg, sizeof(msg), "Unable to get %s", key);
+            gf_smsg(this->name, GF_LOG_ERROR, errno, GD_MSG_DICT_GET_FAILED,
+                    "key=%s", key, NULL);
+            *errstr = gf_strdup(msg);
+            goto out;
+        }
+
+        ret = glusterd_volume_brickinfo_get_by_brick(brick, volinfo, &brickinfo,
+                                                     _gf_false);
+        if (ret) {
+            snprintf(msg, sizeof(msg),
+                     "Incorrect brick "
+                     "%s for volume %s",
+                     brick, volinfo->volname);
+            gf_smsg(this->name, GF_LOG_ERROR, errno, GD_MSG_INCORRECT_BRICK,
+                    "Brick=%s, Volume=%s", brick, volinfo->volname, NULL);
+            *errstr = gf_strdup(msg);
+            goto out;
+        }
+
+        /* Do not allow commit if the bricks are not decommissioned
+         * if its a remove brick commit
+         */
+        if (cmd == GF_OP_CMD_COMMIT && !brickinfo->decommissioned) {
+            snprintf(msg, sizeof(msg),
+                     "Brick %s "
+                     "is not decommissioned. "
+                     "Use start or force option",
+                     brick);
+            gf_smsg(this->name, GF_LOG_ERROR, errno, GD_MSG_BRICK_NOT_DECOM,
+                    "Use 'start' or 'force' option, Brick=%s", brick, NULL);
+            *errstr = gf_strdup(msg);
+            ret = -1;
+            goto out;
+        }
+
+        if (glusterd_is_local_brick(THIS, volinfo, brickinfo)) {
+            /* Local bricks need the liveness checks only for "start";
+             * every other command accepts them as they are. */
+            if (cmd != GF_OP_CMD_START)
+                continue;
+
+            if (brickinfo->status != GF_BRICK_STARTED) {
+                snprintf(msg, sizeof(msg),
+                         "Found stopped "
+                         "brick %s. Use force option to "
+                         "remove the offline brick",
+                         brick);
+                gf_smsg(
+                    this->name, GF_LOG_ERROR, errno, GD_MSG_BRICK_STOPPED,
+                    "Use 'force' option to remove the offline brick, Brick=%s",
+                    brick, NULL);
+                *errstr = gf_strdup(msg);
+                ret = -1;
+                goto out;
+            }
+
+            GLUSTERD_GET_BRICK_PIDFILE(pidfile, volinfo, brickinfo, priv);
+            if (!gf_is_service_running(pidfile, &pid)) {
+                snprintf(msg, sizeof(msg),
+                         "Found dead "
+                         "brick %s",
+                         brick);
+                gf_smsg(this->name, GF_LOG_ERROR, errno, GD_MSG_BRICK_DEAD,
+                        "Brick=%s", brick, NULL);
+                *errstr = gf_strdup(msg);
+                ret = -1;
+                goto out;
+            }
+
+            ret = 0;
+            continue;
+        }
+
+        /* Brick on another node: capture the peer's state while holding the
+         * RCU read lock, then report outside of it. */
+        RCU_READ_LOCK;
+        peerinfo = glusterd_peerinfo_find_by_uuid(brickinfo->uuid);
+        peer_known = (peerinfo != NULL);
+        peer_up = (peer_known && peerinfo->connected);
+        RCU_READ_UNLOCK;
+
+        if (!peer_known) {
+            snprintf(msg, sizeof(msg),
+                     "Host node of the "
+                     "brick %s is not in cluster",
+                     brick);
+            gf_smsg(this->name, GF_LOG_ERROR, errno,
+                    GD_MSG_BRICK_HOST_NOT_FOUND, "Brick=%s", brick, NULL);
+            *errstr = gf_strdup(msg);
+            ret = -1;
+            goto out;
+        }
+
+        if (!peer_up) {
+            snprintf(msg, sizeof(msg),
+                     "Host node of the "
+                     "brick %s is down",
+                     brick);
+            gf_smsg(this->name, GF_LOG_ERROR, 0, GD_MSG_BRICK_HOST_DOWN,
+                    "Brick=%s", brick, NULL);
+            *errstr = gf_strdup(msg);
+            ret = -1;
+            goto out;
+        }
+    }
+
+out:
+    return ret;
+}
+
+/*
+ * Stage (pre-validation) handler for "volume remove-brick".
+ *
+ * Reads "volname", "command" and "count" from @dict, looks up the volume and
+ * checks whether the requested sub-command may run in the volume's current
+ * state. For GF_OP_CMD_START on the originating glusterd a task id is
+ * generated and stored in @dict under GF_REMOVE_BRICK_TID_KEY.
+ *
+ * @dict:      request dictionary.
+ * @op_errstr: optional out-parameter; on failure it may receive a
+ *             heap-allocated, human readable error message.
+ *
+ * Returns 0 when the operation may proceed, non-zero otherwise.
+ */
+int
+glusterd_op_stage_remove_brick(dict_t *dict, char **op_errstr)
+{
+    int ret = -1;
+    char *volname = NULL;
+    glusterd_volinfo_t *volinfo = NULL;
+    char *errstr = NULL;
+    int32_t brick_count = 0;
+    char msg[2048] = "";
+    int32_t flag = 0;
+    gf1_op_commands cmd = GF_OP_CMD_NONE;
+    char *task_id_str = NULL;
+    xlator_t *this = NULL;
+    gsync_status_param_t param = {
+        0,
+    };
+
+    this = THIS;
+    GF_ASSERT(this);
+
+    ret = op_version_check(this, GD_OP_VER_PERSISTENT_AFR_XATTRS, msg,
+                           sizeof(msg));
+    if (ret) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_OP_VERSION_MISMATCH, "%s",
+               msg);
+        /* Route the message through errstr so that the common exit path
+         * handles a NULL op_errstr instead of dereferencing it here. */
+        errstr = gf_strdup(msg);
+        goto out;
+    }
+
+    ret = dict_get_strn(dict, "volname", SLEN("volname"), &volname);
+    if (ret) {
+        /* This is a lookup failure, so report it as a GET failure. */
+        gf_msg(this->name, GF_LOG_ERROR, errno, GD_MSG_DICT_GET_FAILED,
+               "Unable to get volume name");
+        goto out;
+    }
+
+    ret = glusterd_volinfo_find(volname, &volinfo);
+
+    if (ret) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_VOL_NOT_FOUND,
+               "Volume %s does not exist", volname);
+        goto out;
+    }
+
+    ret = glusterd_validate_volume_id(dict, volinfo);
+    if (ret)
+        goto out;
+
+    ret = dict_get_int32n(dict, "command", SLEN("command"), &flag);
+    if (ret) {
+        gf_msg(this->name, GF_LOG_ERROR, errno, GD_MSG_DICT_GET_FAILED,
+               "Unable to get brick command");
+        goto out;
+    }
+    cmd = flag;
+
+    ret = dict_get_int32n(dict, "count", SLEN("count"), &brick_count);
+    if (ret) {
+        gf_msg(this->name, GF_LOG_ERROR, errno, GD_MSG_DICT_GET_FAILED,
+               "Unable to get brick count");
+        goto out;
+    }
+
+    ret = 0;
+    if (volinfo->brick_count == brick_count) {
+        errstr = gf_strdup(
+            "Deleting all the bricks of the "
+            "volume is not allowed");
+        gf_smsg(this->name, GF_LOG_ERROR, 0, GD_MSG_BRICK_DELETE, NULL);
+        ret = -1;
+        goto out;
+    }
+
+    /* Every early "goto out" inside the switch is a failure unless the
+     * case explicitly resets ret to 0. */
+    ret = -1;
+    switch (cmd) {
+        case GF_OP_CMD_NONE:
+            errstr = gf_strdup("no remove-brick command issued");
+            gf_smsg(this->name, GF_LOG_ERROR, 0, GD_MSG_BRICK_NO_REMOVE_CMD,
+                    NULL);
+            goto out;
+
+        case GF_OP_CMD_STATUS:
+            ret = 0;
+            goto out;
+        case GF_OP_CMD_START: {
+            if ((volinfo->type == GF_CLUSTER_TYPE_REPLICATE) &&
+                dict_getn(dict, "replica-count", SLEN("replica-count"))) {
+                snprintf(msg, sizeof(msg),
+                         "Migration of data is not "
+                         "needed when reducing replica count. Use the"
+                         " 'force' option");
+                errstr = gf_strdup(msg);
+                gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_USE_THE_FORCE, "%s",
+                       errstr);
+                goto out;
+            }
+
+            if (GLUSTERD_STATUS_STARTED != volinfo->status) {
+                snprintf(msg, sizeof(msg),
+                         "Volume %s needs "
+                         "to be started before remove-brick "
+                         "(you can use 'force' or 'commit' "
+                         "to override this behavior)",
+                         volinfo->volname);
+                errstr = gf_strdup(msg);
+                gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_VOL_NOT_STARTED,
+                       "%s", errstr);
+                goto out;
+            }
+            if (!gd_is_remove_brick_committed(volinfo)) {
+                snprintf(msg, sizeof(msg),
+                         "An earlier remove-brick "
+                         "task exists for volume %s. Either commit it"
+                         " or stop it before starting a new task.",
+                         volinfo->volname);
+                errstr = gf_strdup(msg);
+                gf_msg(this->name, GF_LOG_ERROR, 0,
+                       GD_MSG_OLD_REMOVE_BRICK_EXISTS,
+                       "Earlier remove-brick"
+                       " task exists for volume %s.",
+                       volinfo->volname);
+                goto out;
+            }
+            if (glusterd_is_defrag_on(volinfo)) {
+                errstr = gf_strdup(
+                    "Rebalance is in progress. Please "
+                    "retry after completion");
+                gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_OIP_RETRY_LATER,
+                       "%s", errstr);
+                goto out;
+            }
+
+            /* Check if the connected clients are all of version
+             * glusterfs-3.6 and higher. This is needed to prevent some data
+             * loss issues that could occur when older clients are connected
+             * when rebalance is run.
+             */
+            ret = glusterd_check_client_op_version_support(
+                volname, GD_OP_VERSION_3_6_0, NULL);
+            if (ret) {
+                /* Build the message into errstr and keep an explicit
+                 * failure code: the formatter's return value must not
+                 * replace the stage result. */
+                gf_asprintf(&errstr,
+                            "Volume %s has one or "
+                            "more connected clients of a version"
+                            " lower than GlusterFS-v3.6.0. "
+                            "Starting remove-brick in this state "
+                            "could lead to data loss.\nPlease "
+                            "disconnect those clients before "
+                            "attempting this command again.",
+                            volname);
+                ret = -1;
+                goto out;
+            }
+
+            if (volinfo->snap_count > 0 ||
+                !cds_list_empty(&volinfo->snap_volumes)) {
+                /* Snapshots do not block the operation; only warn. */
+                snprintf(msg, sizeof(msg),
+                         "Volume %s  has %" PRIu64
+                         " snapshots. "
+                         "Changing the volume configuration will not effect "
+                         "snapshots."
+                         "But the snapshot brick mount should be intact to "
+                         "make them function.",
+                         volname, volinfo->snap_count);
+                gf_msg("glusterd", GF_LOG_WARNING, 0, GD_MSG_SNAP_WARN, "%s",
+                       msg);
+                msg[0] = '\0';
+            }
+
+            ret = glusterd_remove_brick_validate_bricks(
+                cmd, brick_count, dict, volinfo, &errstr, GF_DEFRAG_CMD_NONE);
+            if (ret)
+                goto out;
+
+            if (is_origin_glusterd(dict)) {
+                ret = glusterd_generate_and_set_task_id(
+                    dict, GF_REMOVE_BRICK_TID_KEY,
+                    SLEN(GF_REMOVE_BRICK_TID_KEY));
+                if (ret) {
+                    gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_TASKID_GEN_FAIL,
+                           "Failed to generate task-id");
+                    goto out;
+                }
+            } else {
+                /* A missing task id on a non-originating node is only
+                 * worth a warning, not a staging failure. */
+                ret = dict_get_strn(dict, GF_REMOVE_BRICK_TID_KEY,
+                                    SLEN(GF_REMOVE_BRICK_TID_KEY),
+                                    &task_id_str);
+                if (ret) {
+                    gf_msg(this->name, GF_LOG_WARNING, errno,
+                           GD_MSG_DICT_GET_FAILED, "Missing remove-brick-id");
+                    ret = 0;
+                }
+            }
+            break;
+        }
+
+        case GF_OP_CMD_STOP:
+            ret = 0;
+            break;
+
+        case GF_OP_CMD_COMMIT:
+            if (volinfo->decommission_in_progress) {
+                errstr = gf_strdup(
+                    "use 'force' option as migration "
+                    "is in progress");
+                gf_smsg(this->name, GF_LOG_WARNING, 0, GD_MSG_MIGRATION_PROG,
+                        "Use 'force' option", NULL);
+                goto out;
+            }
+
+            if (volinfo->rebal.defrag_status == GF_DEFRAG_STATUS_FAILED) {
+                errstr = gf_strdup(
+                    "use 'force' option as migration "
+                    "has failed");
+                gf_smsg(this->name, GF_LOG_WARNING, 0, GD_MSG_MIGRATION_FAIL,
+                        "Use 'force' option", NULL);
+                goto out;
+            }
+
+            if (volinfo->rebal.defrag_status == GF_DEFRAG_STATUS_COMPLETE) {
+                if (volinfo->rebal.rebalance_failures > 0 ||
+                    volinfo->rebal.skipped_files > 0) {
+                    errstr = gf_strdup(
+                        "use 'force' option as migration "
+                        "of some files might have been skipped or "
+                        "has failed");
+                    gf_smsg(this->name, GF_LOG_WARNING, 0,
+                            GD_MSG_MIGRATION_FAIL,
+                            "Use 'force' option, some files might have been "
+                            "skipped",
+                            NULL);
+                    goto out;
+                }
+            }
+
+            ret = glusterd_remove_brick_validate_bricks(
+                cmd, brick_count, dict, volinfo, &errstr, GF_DEFRAG_CMD_NONE);
+            if (ret)
+                goto out;
+
+            /* If geo-rep is configured, for this volume, it should be
+             * stopped.
+             */
+            param.volinfo = volinfo;
+            ret = glusterd_check_geo_rep_running(&param, op_errstr);
+            if (ret || param.is_active) {
+                ret = -1;
+                goto out;
+            }
+
+            break;
+
+        case GF_OP_CMD_COMMIT_FORCE:
+        case GF_OP_CMD_DETACH_START:
+        case GF_OP_CMD_DETACH_COMMIT:
+        case GF_OP_CMD_DETACH_COMMIT_FORCE:
+        case GF_OP_CMD_STOP_DETACH_TIER:
+            break;
+    }
+    ret = 0;
+
+out:
+    gf_msg_debug(this->name, 0, "Returning %d", ret);
+    if (errstr) {
+        /* Hand the message to the caller only on failure and only when it
+         * asked for one; in every other case release it here. */
+        if (ret && op_errstr)
+            *op_errstr = errstr;
+        else
+            GF_FREE(errstr);
+    }
+    return ret;
+}
+
+/*
+ * Callback handed to glusterd_handle_defrag_start() for remove-brick data
+ * migration.
+ *
+ * With the "#if 0" section compiled out, the only effect is to clear
+ * volinfo->decommission_in_progress; @status is unused and 0 is returned.
+ * The disabled section sketches per-status handling (restore or delete the
+ * decommissioned bricks, regenerate volfiles, store volinfo) and is kept for
+ * reference only.
+ */
+int
+glusterd_remove_brick_migrate_cbk(glusterd_volinfo_t *volinfo,
+                                  gf_defrag_status_t status)
+{
+    int ret = 0;
+
+#if 0 /* TODO: enable this behavior once cluster-wide awareness comes for      \
+         defrag cbk function */
+        glusterd_brickinfo_t *brickinfo = NULL;
+        glusterd_brickinfo_t *tmp = NULL;
+
+        switch (status) {
+        case GF_DEFRAG_STATUS_PAUSED:
+        case GF_DEFRAG_STATUS_FAILED:
+                /* No changes required in the volume file.
+                   everything should remain as is */
+                break;
+        case GF_DEFRAG_STATUS_STOPPED:
+                /* Fall back to the old volume file */
+                cds_list_for_each_entry_safe (brickinfo, tmp, &volinfo->bricks,
+                                              brick_list) {
+                        if (!brickinfo->decommissioned)
+                                continue;
+                        brickinfo->decommissioned = 0;
+                }
+                break;
+
+        case GF_DEFRAG_STATUS_COMPLETE:
+                /* Done with the task, you can remove the brick from the
+                   volume file */
+                cds_list_for_each_entry_safe (brickinfo, tmp, &volinfo->bricks,
+                                              brick_list) {
+                        if (!brickinfo->decommissioned)
+                                continue;
+                        gf_log (THIS->name, GF_LOG_INFO, "removing the brick %s",
+                                brickinfo->path);
+                        brickinfo->decommissioned = 0;
+                        if (GLUSTERD_STATUS_STARTED == volinfo->status) {
+                            /*TODO: use the 'atomic' flavour of brick_stop*/
+                                ret = glusterd_brick_stop (volinfo, brickinfo);
+                                if (ret) {
+                                        gf_log (THIS->name, GF_LOG_ERROR,
+                                                "Unable to stop glusterfs (%d)", ret);
+                                }
+                        }
+                        glusterd_delete_brick (volinfo, brickinfo);
+                }
+                break;
+
+        default:
+                GF_ASSERT (!"cbk function called with wrong status");
+                break;
+        }
+
+        ret = glusterd_create_volfiles_and_notify_services (volinfo);
+        if (ret)
+                gf_log (THIS->name, GF_LOG_ERROR,
+                        "Unable to write volume files (%d)", ret);
+
+        ret = glusterd_store_volinfo (volinfo, GLUSTERD_VOLINFO_VER_AC_INCREMENT);
+        if (ret)
+                gf_log (THIS->name, GF_LOG_ERROR,
+                        "Unable to store volume info (%d)", ret);
+
+
+        if (GLUSTERD_STATUS_STARTED == volinfo->status) {
+                ret = glusterd_check_generate_start_nfs ();
+                if (ret)
+                        gf_log (THIS->name, GF_LOG_ERROR,
+                                "Unable to start nfs process (%d)", ret);
+        }
+
+#endif
+
+    /* Migration is over (whatever the outcome): drop the in-progress flag. */
+    volinfo->decommission_in_progress = 0;
+    return ret;
+}
+
+/*
+ * Commit handler for "volume add-brick".
+ *
+ * Reads "volname", "count" and "bricks" from @dict, adds the bricks to the
+ * volume through glusterd_op_perform_add_bricks() and, when the cluster
+ * op-version is at most GD_OP_VERSION_3_7_5, stores the volinfo right away.
+ * If the volume is started, glusterd_svcs_manager() is run for it.
+ *
+ * @dict:      request dictionary.
+ * @op_errstr: not written by this function.
+ *
+ * Returns 0 on success, non-zero on failure.
+ */
+int
+glusterd_op_add_brick(dict_t *dict, char **op_errstr)
+{
+    int ret = 0;
+    char *volname = NULL;
+    glusterd_conf_t *priv = NULL;
+    glusterd_volinfo_t *volinfo = NULL;
+    xlator_t *this = NULL;
+    char *bricks = NULL;
+    int32_t count = 0;
+
+    this = THIS;
+    GF_ASSERT(this);
+
+    priv = this->private;
+    GF_ASSERT(priv);
+
+    ret = dict_get_strn(dict, "volname", SLEN("volname"), &volname);
+
+    if (ret) {
+        gf_msg("glusterd", GF_LOG_ERROR, errno, GD_MSG_DICT_GET_FAILED,
+               "Unable to get volume name");
+        goto out;
+    }
+
+    ret = glusterd_volinfo_find(volname, &volinfo);
+
+    if (ret) {
+        /* The lookup failed; report the missing volume, not an allocation
+         * failure. */
+        gf_msg("glusterd", GF_LOG_ERROR, EINVAL, GD_MSG_VOL_NOT_FOUND,
+               "Volume %s does not exist", volname);
+        goto out;
+    }
+
+    ret = dict_get_int32n(dict, "count", SLEN("count"), &count);
+    if (ret) {
+        gf_msg("glusterd", GF_LOG_ERROR, errno, GD_MSG_DICT_GET_FAILED,
+               "Unable to get count");
+        goto out;
+    }
+
+    ret = dict_get_strn(dict, "bricks", SLEN("bricks"), &bricks);
+    if (ret) {
+        gf_msg("glusterd", GF_LOG_ERROR, errno, GD_MSG_DICT_GET_FAILED,
+               "Unable to get bricks");
+        goto out;
+    }
+
+    ret = glusterd_op_perform_add_bricks(volinfo, count, bricks, dict);
+    if (ret) {
+        gf_msg("glusterd", GF_LOG_ERROR, 0, GD_MSG_BRICK_ADD_FAIL,
+               "Unable to add bricks");
+        goto out;
+    }
+    if (priv->op_version <= GD_OP_VERSION_3_7_5) {
+        ret = glusterd_store_volinfo(volinfo,
+                                     GLUSTERD_VOLINFO_VER_AC_INCREMENT);
+        if (ret)
+            goto out;
+    } else {
+        /*
+         * The cluster is operating at version greater than
+         * gluster-3.7.5. So no need to store volfiles
+         * in commit phase, the same will be done
+         * in post validate phase with v3 framework.
+         */
+    }
+
+    if (GLUSTERD_STATUS_STARTED == volinfo->status)
+        ret = glusterd_svcs_manager(volinfo);
+
+out:
+    return ret;
+}
+
+/*
+ * Post-commit step of add-brick: fetch "volname" from @dict and pass it to
+ * glusterd_replace_old_auth_allow_list(). @op_errstr is not used.
+ *
+ * Returns the dict lookup error if the volume name is missing, otherwise
+ * whatever glusterd_replace_old_auth_allow_list() returns.
+ */
+int
+glusterd_post_commit_add_brick(dict_t *dict, char **op_errstr)
+{
+    char *vol = NULL;
+    int rc = dict_get_strn(dict, "volname", SLEN("volname"), &vol);
+
+    if (rc != 0) {
+        gf_msg(THIS->name, GF_LOG_ERROR, errno, GD_MSG_DICT_GET_FAILED,
+               "Unable to get volume name");
+        return rc;
+    }
+
+    return glusterd_replace_old_auth_allow_list(vol);
+}
+
+/*
+ * Post-commit step of replace-brick: fetch "volname" from @dict and pass it
+ * to glusterd_replace_old_auth_allow_list(). @op_errstr is not used.
+ *
+ * Returns the dict lookup error if the volume name is missing, otherwise
+ * whatever glusterd_replace_old_auth_allow_list() returns.
+ */
+int
+glusterd_post_commit_replace_brick(dict_t *dict, char **op_errstr)
+{
+    char *vol = NULL;
+    int rc = dict_get_strn(dict, "volname", SLEN("volname"), &vol);
+
+    if (rc != 0) {
+        gf_msg(THIS->name, GF_LOG_ERROR, errno, GD_MSG_DICT_GET_FAILED,
+               "Unable to get volume name");
+        return rc;
+    }
+
+    return glusterd_replace_old_auth_allow_list(vol);
+}
+
+/*
+ * Publish the remove-brick task id in the response dictionary.
+ *
+ * "volname" and "command" are read from @rsp_dict. On the originating
+ * glusterd the task id stored in @req_dict under GF_REMOVE_BRICK_TID_KEY is
+ * parsed into volinfo->rebal.rebalance_id and copied into @rsp_dict. In
+ * addition, whenever the volume already carries a non-null rebalance id for
+ * a GD_OP_REMOVE_BRICK operation, that id is copied into @rsp_dict.
+ *
+ * @req_dict: request dictionary (source of the task id).
+ * @rsp_dict: response dictionary (receives the task id).
+ *
+ * Returns 0 on success, non-zero on failure. A missing task id in @req_dict
+ * is only logged as a warning.
+ */
+int
+glusterd_set_rebalance_id_for_remove_brick(dict_t *req_dict, dict_t *rsp_dict)
+{
+    int ret = -1;
+    char *volname = NULL;
+    glusterd_volinfo_t *volinfo = NULL;
+    char msg[2048] = {0};
+    char *task_id_str = NULL;
+    xlator_t *this = NULL;
+    int32_t cmd = 0;
+
+    this = THIS;
+    GF_ASSERT(this);
+
+    GF_ASSERT(rsp_dict);
+    GF_ASSERT(req_dict);
+
+    ret = dict_get_strn(rsp_dict, "volname", SLEN("volname"), &volname);
+    if (ret) {
+        gf_msg_debug(this->name, 0, "volname not found");
+        goto out;
+    }
+
+    ret = glusterd_volinfo_find(volname, &volinfo);
+    if (ret) {
+        /* The lookup failed; report the missing volume, not an allocation
+         * failure. */
+        gf_msg(this->name, GF_LOG_ERROR, EINVAL, GD_MSG_VOL_NOT_FOUND,
+               "Volume %s does not exist", volname);
+        goto out;
+    }
+
+    /* The value of "command" is not used below; only its presence is
+     * required. */
+    ret = dict_get_int32n(rsp_dict, "command", SLEN("command"), &cmd);
+    if (ret) {
+        gf_msg(this->name, GF_LOG_ERROR, errno, GD_MSG_DICT_GET_FAILED,
+               "Unable to get command");
+        goto out;
+    }
+
+    /* remove brick task id is generated in glusterd_op_stage_remove_brick(),
+     * but rsp_dict is unavailable there. So copying it to rsp_dict from
+     * req_dict here. */
+
+    if (is_origin_glusterd(rsp_dict)) {
+        ret = dict_get_strn(req_dict, GF_REMOVE_BRICK_TID_KEY,
+                            SLEN(GF_REMOVE_BRICK_TID_KEY), &task_id_str);
+        if (ret) {
+            snprintf(msg, sizeof(msg), "Missing rebalance id for remove-brick");
+            gf_msg(this->name, GF_LOG_WARNING, 0, GD_MSG_REBALANCE_ID_MISSING,
+                   "%s", msg);
+            ret = 0;
+        } else {
+            gf_uuid_parse(task_id_str, volinfo->rebal.rebalance_id);
+
+            ret = glusterd_copy_uuid_to_dict(volinfo->rebal.rebalance_id,
+                                             rsp_dict, GF_REMOVE_BRICK_TID_KEY,
+                                             SLEN(GF_REMOVE_BRICK_TID_KEY));
+            if (ret) {
+                gf_msg(this->name, GF_LOG_ERROR, 0,
+                       GD_MSG_REMOVE_BRICK_ID_SET_FAIL,
+                       "Failed to set remove-brick-id");
+                goto out;
+            }
+        }
+    }
+    if (!gf_uuid_is_null(volinfo->rebal.rebalance_id) &&
+        GD_OP_REMOVE_BRICK == volinfo->rebal.op) {
+        ret = glusterd_copy_uuid_to_dict(volinfo->rebal.rebalance_id, rsp_dict,
+                                         GF_REMOVE_BRICK_TID_KEY,
+                                         SLEN(GF_REMOVE_BRICK_TID_KEY));
+        if (ret) {
+            gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_DICT_SET_FAILED,
+                   "Failed to set task-id for volume %s", volname);
+            goto out;
+        }
+    }
+out:
+    return ret;
+}
+/*
+ * Commit handler for "volume remove-brick".
+ *
+ * Reads "volname", "command", "count" and the "brick<N>" entries from @dict
+ * and applies the requested sub-command to the volume:
+ *   - updates the rebalance task id / op / stored brick list on the volinfo,
+ *   - runs glusterd_op_perform_remove_brick() for every listed brick,
+ *   - applies an optional "replica-count" change,
+ *   - regenerates volfiles and stores the volinfo,
+ *   - starts data migration through glusterd_handle_defrag_start() when the
+ *     command is not forced and a rebalance is needed.
+ *
+ * @dict:      request dictionary.
+ * @op_errstr: optional out-parameter; receives a copy of the defrag error
+ *             text when starting the migration fails.
+ *
+ * Returns 0 on success, non-zero on failure.
+ */
+int
+glusterd_op_remove_brick(dict_t *dict, char **op_errstr)
+{
+    int ret = -1;
+    char *volname = NULL;
+    glusterd_volinfo_t *volinfo = NULL;
+    char *brick = NULL;
+    int32_t count = 0;
+    int32_t i = 1;
+    char key[64] = "";
+    int keylen;
+    int32_t flag = 0;
+    int need_rebalance = 0;
+    int force = 0;
+    gf1_op_commands cmd = 0;
+    int32_t replica_count = 0;
+    char *task_id_str = NULL;
+    xlator_t *this = NULL;
+    dict_t *bricks_dict = NULL;
+    char *brick_tmpstr = NULL;
+    int start_remove = 0;
+    uint32_t commit_hash = 0;
+    int defrag_cmd = 0;
+    glusterd_conf_t *conf = NULL;
+
+    this = THIS;
+    GF_ASSERT(this);
+    conf = this->private;
+    GF_VALIDATE_OR_GOTO(this->name, conf, out);
+
+    ret = dict_get_strn(dict, "volname", SLEN("volname"), &volname);
+
+    if (ret) {
+        /* This is a dict lookup failure, not an add-brick failure. */
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_DICT_GET_FAILED,
+               "Unable to get volume name");
+        goto out;
+    }
+
+    ret = glusterd_volinfo_find(volname, &volinfo);
+    if (ret) {
+        /* The lookup failed; report the missing volume, not an allocation
+         * failure. */
+        gf_msg(this->name, GF_LOG_ERROR, EINVAL, GD_MSG_VOL_NOT_FOUND,
+               "Volume %s does not exist", volname);
+        goto out;
+    }
+
+    ret = dict_get_int32n(dict, "command", SLEN("command"), &flag);
+    if (ret) {
+        gf_msg(this->name, GF_LOG_ERROR, errno, GD_MSG_DICT_GET_FAILED,
+               "Unable to get command");
+        goto out;
+    }
+    cmd = flag;
+
+    if (GF_OP_CMD_START == cmd)
+        start_remove = 1;
+
+    /* Set task-id, if available, in ctx dict for operations other than
+     * start
+     */
+
+    if (is_origin_glusterd(dict) && (!start_remove)) {
+        if (!gf_uuid_is_null(volinfo->rebal.rebalance_id)) {
+            ret = glusterd_copy_uuid_to_dict(volinfo->rebal.rebalance_id, dict,
+                                             GF_REMOVE_BRICK_TID_KEY,
+                                             SLEN(GF_REMOVE_BRICK_TID_KEY));
+            if (ret) {
+                gf_msg(this->name, GF_LOG_ERROR, 0,
+                       GD_MSG_REMOVE_BRICK_ID_SET_FAIL,
+                       "Failed to set remove-brick-id");
+                goto out;
+            }
+        }
+    }
+
+    /* Clear task-id, rebal.op and stored bricks on committing/stopping
+     * remove-brick */
+    if ((!start_remove) && (cmd != GF_OP_CMD_STATUS)) {
+        gf_uuid_clear(volinfo->rebal.rebalance_id);
+        volinfo->rebal.op = GD_OP_NONE;
+        /* Only drop a reference that actually exists, mirroring the
+         * NULL check done for bricks_dict on the exit path. */
+        if (volinfo->rebal.dict)
+            dict_unref(volinfo->rebal.dict);
+        volinfo->rebal.dict = NULL;
+    }
+
+    ret = -1;
+    switch (cmd) {
+        case GF_OP_CMD_NONE:
+            goto out;
+
+        case GF_OP_CMD_STATUS:
+            ret = 0;
+            goto out;
+
+        case GF_OP_CMD_STOP:
+        case GF_OP_CMD_START:
+            /* Reset defrag status to 'NOT STARTED' whenever a
+             * remove-brick/rebalance command is issued to remove
+             * stale information from previous run.
+             * Update defrag_cmd as well or it will only be done
+             * for nodes on which the brick to be removed exists.
+             */
+            /* coverity[MIXED_ENUMS] */
+            volinfo->rebal.defrag_cmd = cmd;
+            volinfo->rebal.defrag_status = GF_DEFRAG_STATUS_NOT_STARTED;
+            ret = dict_get_strn(dict, GF_REMOVE_BRICK_TID_KEY,
+                                SLEN(GF_REMOVE_BRICK_TID_KEY), &task_id_str);
+            if (ret) {
+                gf_msg_debug(this->name, errno, "Missing remove-brick-id");
+                ret = 0;
+            } else {
+                gf_uuid_parse(task_id_str, volinfo->rebal.rebalance_id);
+                volinfo->rebal.op = GD_OP_REMOVE_BRICK;
+            }
+            force = 0;
+            break;
+
+        case GF_OP_CMD_COMMIT:
+            force = 1;
+            break;
+
+        case GF_OP_CMD_COMMIT_FORCE:
+
+            if (volinfo->decommission_in_progress) {
+                if (volinfo->rebal.defrag) {
+                    LOCK(&volinfo->rebal.defrag->lock);
+                    /* Fake 'rebalance-complete' so the graph change
+                       happens right away */
+                    volinfo->rebal.defrag_status = GF_DEFRAG_STATUS_COMPLETE;
+
+                    UNLOCK(&volinfo->rebal.defrag->lock);
+                }
+                /* Graph change happens in rebalance _cbk function,
+                   no need to do anything here */
+                /* TODO: '_cbk' function is not doing anything for now */
+            }
+
+            ret = 0;
+            force = 1;
+            break;
+        case GF_OP_CMD_DETACH_START:
+        case GF_OP_CMD_DETACH_COMMIT_FORCE:
+        case GF_OP_CMD_DETACH_COMMIT:
+        case GF_OP_CMD_STOP_DETACH_TIER:
+            break;
+    }
+
+    ret = dict_get_int32n(dict, "count", SLEN("count"), &count);
+    if (ret) {
+        gf_msg("glusterd", GF_LOG_ERROR, errno, GD_MSG_DICT_GET_FAILED,
+               "Unable to get count");
+        goto out;
+    }
+    /* Save the list of bricks for later usage only on starting a
+     * remove-brick. Right now this is required for displaying the task
+     * parameters with task status in volume status.
+     */
+
+    if (start_remove) {
+        bricks_dict = dict_new();
+        if (!bricks_dict) {
+            ret = -1;
+            goto out;
+        }
+        ret = dict_set_int32n(bricks_dict, "count", SLEN("count"), count);
+        if (ret) {
+            gf_msg(this->name, GF_LOG_ERROR, errno, GD_MSG_DICT_SET_FAILED,
+                   "Failed to save remove-brick count");
+            goto out;
+        }
+    }
+
+    /* Brick keys are 1-based: "brick1" .. "brick<count>". */
+    while (i <= count) {
+        keylen = snprintf(key, sizeof(key), "brick%d", i);
+        ret = dict_get_strn(dict, key, keylen, &brick);
+        if (ret) {
+            gf_msg(this->name, GF_LOG_ERROR, errno, GD_MSG_DICT_GET_FAILED,
+                   "Unable to get %s", key);
+            goto out;
+        }
+
+        if (start_remove) {
+            brick_tmpstr = gf_strdup(brick);
+            if (!brick_tmpstr) {
+                ret = -1;
+                gf_msg(this->name, GF_LOG_ERROR, ENOMEM, GD_MSG_NO_MEMORY,
+                       "Failed to duplicate brick name");
+                goto out;
+            }
+            ret = dict_set_dynstrn(bricks_dict, key, keylen, brick_tmpstr);
+            if (ret) {
+                gf_msg(this->name, GF_LOG_ERROR, errno, GD_MSG_DICT_SET_FAILED,
+                       "Failed to add brick to dict");
+                goto out;
+            }
+            /* Stored in bricks_dict now; make sure the exit path does not
+             * free it again. */
+            brick_tmpstr = NULL;
+        }
+
+        ret = glusterd_op_perform_remove_brick(volinfo, brick, force,
+                                               &need_rebalance);
+        if (ret)
+            goto out;
+        i++;
+    }
+
+    if (start_remove)
+        volinfo->rebal.dict = dict_ref(bricks_dict);
+
+    ret = dict_get_int32n(dict, "replica-count", SLEN("replica-count"),
+                          &replica_count);
+    if (!ret) {
+        /* Success path: errno carries no meaningful value here, so log 0.
+         * NOTE(review): GD_MSG_DICT_GET_FAILED looks like the wrong message
+         * id for an informational message - confirm the intended id. */
+        gf_msg(this->name, GF_LOG_INFO, 0, GD_MSG_DICT_GET_FAILED,
+               "changing replica count %d to %d on volume %s",
+               volinfo->replica_count, replica_count, volinfo->volname);
+        volinfo->replica_count = replica_count;
+        /* A reduction in replica count implies an arbiter volume
+         * earlier is now no longer one. */
+        if (volinfo->arbiter_count)
+            volinfo->arbiter_count = 0;
+        volinfo->sub_count = replica_count;
+        volinfo->dist_leaf_count = glusterd_get_dist_leaf_count(volinfo);
+
+        /*
+         * volinfo->type and sub_count have already been set for
+         * volumes undergoing a detach operation, they should not
+         * be modified here.
+         */
+        if (replica_count == 1) {
+            if (volinfo->type == GF_CLUSTER_TYPE_REPLICATE) {
+                volinfo->type = GF_CLUSTER_TYPE_NONE;
+                /* backward compatibility */
+                volinfo->sub_count = 0;
+            }
+        }
+    }
+    volinfo->subvol_count = (volinfo->brick_count / volinfo->dist_leaf_count);
+
+    if (!glusterd_is_volume_replicate(volinfo) &&
+        conf->op_version >= GD_OP_VERSION_3_12_2) {
+        ret = dict_set_nstrn(volinfo->dict, "performance.client-io-threads",
+                             SLEN("performance.client-io-threads"), "on",
+                             SLEN("on"));
+        if (ret) {
+            gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_DICT_SET_FAILED,
+                   "Failed to set "
+                   "performance.client-io-threads to on");
+            goto out;
+        }
+    }
+
+    ret = glusterd_create_volfiles_and_notify_services(volinfo);
+    if (ret) {
+        gf_msg(this->name, GF_LOG_WARNING, 0, GD_MSG_VOLFILE_CREATE_FAIL,
+               "failed to create volfiles");
+        goto out;
+    }
+
+    ret = glusterd_store_volinfo(volinfo, GLUSTERD_VOLINFO_VER_AC_INCREMENT);
+    if (ret) {
+        gf_msg(this->name, GF_LOG_WARNING, 0, GD_MSG_VOLINFO_STORE_FAIL,
+               "failed to store volinfo");
+        goto out;
+    }
+
+    if (start_remove && volinfo->status == GLUSTERD_STATUS_STARTED) {
+        ret = glusterd_svcs_reconfigure(volinfo);
+        if (ret) {
+            gf_msg(this->name, GF_LOG_WARNING, 0, GD_MSG_NFS_RECONF_FAIL,
+                   "Unable to reconfigure NFS-Server");
+            goto out;
+        }
+    }
+
+    /* Need to reset the defrag/rebalance status accordingly */
+    switch (volinfo->rebal.defrag_status) {
+        case GF_DEFRAG_STATUS_FAILED:
+        case GF_DEFRAG_STATUS_COMPLETE:
+            volinfo->rebal.defrag_status = 0;
+        /* FALLTHROUGH */
+        default:
+            break;
+    }
+    if (!force && need_rebalance) {
+        if (dict_get_uint32(dict, "commit-hash", &commit_hash) == 0) {
+            volinfo->rebal.commit_hash = commit_hash;
+        }
+        /* perform the rebalance operations */
+        defrag_cmd = GF_DEFRAG_CMD_START_FORCE;
+        /*
+         * We need to set this *before* we issue commands to the
+         * bricks, or else we might end up setting it after the bricks
+         * have responded.  If we fail to send the request(s) we'll
+         * clear it ourselves because nobody else will.
+         */
+        volinfo->decommission_in_progress = 1;
+        char err_str[4096] = "";
+        ret = glusterd_handle_defrag_start(
+            volinfo, err_str, sizeof(err_str), defrag_cmd,
+            glusterd_remove_brick_migrate_cbk, GD_OP_REMOVE_BRICK);
+
+        if (ret) {
+            gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_REBALANCE_START_FAIL,
+                   "failed to start the rebalance");
+            /* TBD: shouldn't we do more than print a message? */
+            volinfo->decommission_in_progress = 0;
+            if (op_errstr)
+                *op_errstr = gf_strdup(err_str);
+        }
+    } else {
+        if (GLUSTERD_STATUS_STARTED == volinfo->status)
+            ret = glusterd_svcs_manager(volinfo);
+    }
+out:
+    GF_FREE(brick_tmpstr);
+    if (bricks_dict)
+        dict_unref(bricks_dict);
+    gf_msg_debug(this->name, 0, "returning %d ", ret);
+    return ret;
+}
+
+/*
+ * Stage (pre-validation) handler for the barrier operation.
+ *
+ * Checks that "volname" is present in @dict, that the volume exists and is
+ * started, and that the "barrier" value is present in @dict.
+ *
+ * @dict:      request dictionary.
+ * @op_errstr: out-parameter receiving a formatted error message for the
+ *             volume/barrier related failures.
+ *
+ * Returns 0 when the operation may proceed, non-zero otherwise.
+ */
+int
+glusterd_op_stage_barrier(dict_t *dict, char **op_errstr)
+{
+    int ret = -1;
+    xlator_t *this = NULL;
+    char *volname = NULL;
+    glusterd_volinfo_t *vol = NULL;
+    char *barrier_op = NULL;
+
+    GF_ASSERT(dict);
+    this = THIS;
+    GF_ASSERT(this);
+
+    ret = dict_get_strn(dict, "volname", SLEN("volname"), &volname);
+    if (ret) {
+        gf_msg(this->name, GF_LOG_ERROR, errno, GD_MSG_DICT_GET_FAILED,
+               "Volname not present in "
+               "dict");
+        goto out;
+    }
+
+    ret = glusterd_volinfo_find(volname, &vol);
+    if (ret) {
+        gf_asprintf(op_errstr, "Volume %s does not exist", volname);
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_VOL_NOT_FOUND, "%s",
+               *op_errstr);
+        goto out;
+    }
+
+    if (!glusterd_is_volume_started(vol)) {
+        gf_asprintf(op_errstr, "Volume %s is not started", volname);
+        ret = -1;
+        goto out;
+    }
+
+    ret = dict_get_strn(dict, "barrier", SLEN("barrier"), &barrier_op);
+    /* Treat every non-zero return as "missing", as glusterd_op_barrier()
+     * does; comparing against -1 only would let other error codes pass
+     * staging and be reset to success below. */
+    if (ret) {
+        gf_asprintf(op_errstr,
+                    "Barrier op for volume %s not present "
+                    "in dict",
+                    volname);
+        gf_msg(this->name, GF_LOG_ERROR, errno, GD_MSG_DICT_GET_FAILED, "%s",
+               *op_errstr);
+        goto out;
+    }
+    ret = 0;
+out:
+    gf_msg_debug(this->name, 0, "Returning %d", ret);
+    return ret;
+}
+
+/*
+ * Commit handler for the barrier operation.
+ *
+ * Copies the "barrier" value from @dict into the volume's option dictionary
+ * as "features.barrier", refreshes the volume op-versions, regenerates the
+ * volfiles and stores the volinfo.
+ *
+ * @dict:      request dictionary ("volname" and "barrier" are required).
+ * @op_errstr: out-parameter receiving a formatted message when the volume
+ *             or the barrier value cannot be found.
+ *
+ * Returns 0 on success, non-zero on failure.
+ */
+int
+glusterd_op_barrier(dict_t *dict, char **op_errstr)
+{
+    xlator_t *this = NULL;
+    glusterd_volinfo_t *volinfo = NULL;
+    char *name = NULL;
+    char *barrier_val = NULL;
+    int rc = -1;
+
+    GF_ASSERT(dict);
+    this = THIS;
+    GF_ASSERT(this);
+
+    /* Which volume is the request about? */
+    rc = dict_get_strn(dict, "volname", SLEN("volname"), &name);
+    if (rc != 0) {
+        gf_msg(this->name, GF_LOG_ERROR, errno, GD_MSG_DICT_GET_FAILED,
+               "Volname not present in dict");
+        goto out;
+    }
+
+    rc = glusterd_volinfo_find(name, &volinfo);
+    if (rc != 0) {
+        gf_asprintf(op_errstr, "Volume %s does not exist", name);
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_VOL_NOT_FOUND, "%s",
+               *op_errstr);
+        goto out;
+    }
+
+    /* Requested barrier value. */
+    rc = dict_get_strn(dict, "barrier", SLEN("barrier"), &barrier_val);
+    if (rc != 0) {
+        gf_asprintf(op_errstr, "Barrier op for volume %s not present in dict",
+                    name);
+        gf_msg(this->name, GF_LOG_ERROR, errno, GD_MSG_DICT_GET_FAILED, "%s",
+               *op_errstr);
+        goto out;
+    }
+
+    /* Record the value as a volume option. */
+    rc = dict_set_dynstr_with_alloc(volinfo->dict, "features.barrier",
+                                    barrier_val);
+    if (rc != 0) {
+        gf_msg(this->name, GF_LOG_ERROR, errno, GD_MSG_DICT_SET_FAILED,
+               "Failed to set barrier op in volume option dict");
+        goto out;
+    }
+
+    gd_update_volume_op_versions(volinfo);
+
+    rc = glusterd_create_volfiles(volinfo);
+    if (rc != 0) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_VOLFILE_CREATE_FAIL,
+               "Failed to create volfiles");
+        goto out;
+    }
+
+    rc = glusterd_store_volinfo(volinfo, GLUSTERD_VOLINFO_VER_AC_INCREMENT);
+
+out:
+    gf_msg_debug(this->name, 0, "Returning %d", rc);
+    return rc;
+}
+
+/* Stub handler: ignores @req and always reports success (0). */
+int
+glusterd_handle_add_tier_brick(rpcsvc_request_t *req)
+{
+    (void)req;
+
+    return 0;
+}
+
+/* Stub handler: ignores @req and always reports success (0). */
+int
+glusterd_handle_attach_tier(rpcsvc_request_t *req)
+{
+    (void)req;
+
+    return 0;
+}
+
+/* Stub handler: ignores @req and always reports success (0). */
+int
+glusterd_handle_detach_tier(rpcsvc_request_t *req)
+{
+    (void)req;
+
+    return 0;
+}
diff --git a/xlators/mgmt/glusterd/src/glusterd-conn-helper.c b/xlators/mgmt/glusterd/src/glusterd-conn-helper.c
new file mode 100644
index 00000000000..a7f54ec24b7
--- /dev/null
+++ b/xlators/mgmt/glusterd/src/glusterd-conn-helper.c
@@ -0,0 +1,21 @@
+/*
+   Copyright (c) 2014 Red Hat, Inc. <http://www.redhat.com>
+   This file is part of GlusterFS.
+
+   This file is licensed to you under your choice of the GNU Lesser
+   General Public License, version 3 or any later version (LGPLv3 or
+   later), or the GNU General Public License, version 2 (GPLv2), in all
+   cases as published by the Free Software Foundation.
+*/
+
+#include "glusterd-conn-mgmt.h"
+#include "glusterd-svc-mgmt.h"
+
+#define _LGPL_SOURCE
+#include <urcu/rculist.h>
+
+glusterd_svc_t *
+glusterd_conn_get_svc_object(glusterd_conn_t *conn)
+{
+    /* container-of lookup: assumes conn is the 'conn' member of a glusterd_svc_t */
+    return cds_list_entry(conn, glusterd_svc_t, conn);
+}
diff --git a/xlators/mgmt/glusterd/src/glusterd-conn-helper.h b/xlators/mgmt/glusterd/src/glusterd-conn-helper.h
new file mode 100644
index 00000000000..6f500309175
--- /dev/null
+++ b/xlators/mgmt/glusterd/src/glusterd-conn-helper.h
@@ -0,0 +1,21 @@
+/*
+   Copyright (c) 2014 Red Hat, Inc. <http://www.redhat.com>
+   This file is part of GlusterFS.
+
+   This file is licensed to you under your choice of the GNU Lesser
+   General Public License, version 3 or any later version (LGPLv3 or
+   later), or the GNU General Public License, version 2 (GPLv2), in all
+   cases as published by the Free Software Foundation.
+*/
+
+#ifndef _GLUSTERD_CONN_HELPER_H_
+#define _GLUSTERD_CONN_HELPER_H_
+
+#include "rpc-clnt.h"
+
+#include "glusterd-conn-mgmt.h"
+
+glusterd_svc_t *
+glusterd_conn_get_svc_object(glusterd_conn_t *conn);
+
+#endif
diff --git a/xlators/mgmt/glusterd/src/glusterd-conn-mgmt.c b/xlators/mgmt/glusterd/src/glusterd-conn-mgmt.c
new file mode 100644
index 00000000000..5c01f0c70b6
--- /dev/null
+++ b/xlators/mgmt/glusterd/src/glusterd-conn-mgmt.c
@@ -0,0 +1,191 @@
+/*
+   Copyright (c) 2014 Red Hat, Inc. <http://www.redhat.com>
+   This file is part of GlusterFS.
+
+   This file is licensed to you under your choice of the GNU Lesser
+   General Public License, version 3 or any later version (LGPLv3 or
+   later), or the GNU General Public License, version 2 (GPLv2), in all
+   cases as published by the Free Software Foundation.
+*/
+
+#include <glusterfs/xlator.h>
+#include "rpc-clnt.h"
+#include "glusterd.h"
+#include "glusterd-conn-mgmt.h"
+#include "glusterd-conn-helper.h"
+#include "glusterd-utils.h"
+#include "glusterd-messages.h"
+
+int
+glusterd_conn_init(glusterd_conn_t *conn, char *sockpath, int frame_timeout,
+                   glusterd_conn_notify_t notify)
+{
+    int ret = -1;
+    dict_t *options = NULL;
+    struct rpc_clnt *rpc = NULL;
+    xlator_t *this = THIS;
+    glusterd_svc_t *svc = NULL;
+    /* Create the rpc client for conn's unix socket; returns 0 on success. */
+    if (!this) {
+        gf_smsg("glusterd", GF_LOG_ERROR, errno, GD_MSG_XLATOR_NOT_DEFINED,
+                NULL); /* THIS is NULL here, so THIS->name must not be read */
+        goto out;
+    }
+
+    options = dict_new();
+    if (!options) {
+        gf_smsg(this->name, GF_LOG_ERROR, errno, GD_MSG_DICT_CREATE_FAIL, NULL);
+        goto out;
+    }
+
+    svc = glusterd_conn_get_svc_object(conn);
+    if (!svc) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_SVC_GET_FAIL,
+               "Failed to get the service");
+        goto out;
+    }
+
+    ret = rpc_transport_unix_options_build(options, sockpath, frame_timeout);
+    if (ret)
+        goto out;
+
+    ret = dict_set_int32n(options, "transport.socket.ignore-enoent",
+                          SLEN("transport.socket.ignore-enoent"), 1);
+    if (ret) {
+        gf_smsg(this->name, GF_LOG_ERROR, errno, GD_MSG_DICT_SET_FAILED,
+                "Key=transport.socket.ignore-enoent", NULL);
+        goto out;
+    }
+
+    /* @options is free'd by rpc_transport when destroyed */
+    rpc = rpc_clnt_new(options, this, (char *)svc->name, 16);
+    if (!rpc) {
+        ret = -1;
+        goto out;
+    }
+
+    ret = rpc_clnt_register_notify(rpc, glusterd_conn_common_notify, conn);
+    if (ret)
+        goto out;
+
+    ret = snprintf(conn->sockpath, sizeof(conn->sockpath), "%s", sockpath);
+    if (ret < 0 || ret >= (int)sizeof(conn->sockpath)) {
+        gf_smsg(this->name, GF_LOG_ERROR, errno, GD_MSG_COPY_FAIL, NULL);
+        ret = -1; /* output error or truncated path: never report success */
+        goto out;
+    }
+    ret = 0;
+
+    conn->frame_timeout = frame_timeout;
+    conn->rpc = rpc;
+    conn->notify = notify;
+out:
+    if (options)
+        dict_unref(options);
+    if (ret) {
+        if (rpc) {
+            rpc_clnt_unref(rpc); /* conn->rpc was not assigned on this path */
+        }
+    }
+    return ret;
+}
+
+int
+glusterd_conn_term(glusterd_conn_t *conn)
+{
+    rpc_clnt_unref(conn->rpc); /* drop this connection's reference on the client */
+    return 0; /* always 0; the conn->rpc pointer itself is left unchanged */
+}
+
+int
+glusterd_conn_connect(glusterd_conn_t *conn)
+{
+    return rpc_clnt_start(conn->rpc); /* rpc_clnt_start()'s result is passed through */
+}
+
+int
+glusterd_conn_disconnect(glusterd_conn_t *conn)
+{
+    rpc_clnt_disable(conn->rpc); /* any result of the call is not inspected */
+
+    return 0; /* unconditionally */
+}
+
+int
+__glusterd_conn_common_notify(struct rpc_clnt *rpc, void *mydata,
+                              rpc_clnt_event_t event, void *data)
+{
+    glusterd_conn_t *conn = mydata;
+
+    /* No connection attached to this callback: there is nothing to
+     * dispatch to, so report success instead of an error. */
+    if (!conn)
+        return 0;
+
+    return conn->notify(conn, event); /* rpc and data are not forwarded */
+}
+
+int
+glusterd_conn_common_notify(struct rpc_clnt *rpc, void *mydata,
+                            rpc_clnt_event_t event, void *data)
+{
+    /* presumably runs the handler under glusterd's big lock - TODO confirm */
+    return glusterd_big_locked_notify(rpc, mydata, event, data,
+                                      __glusterd_conn_common_notify);
+}
+
+int32_t
+glusterd_conn_build_socket_filepath(char *rundir, uuid_t uuid, char *socketpath,
+                                    int len)
+{
+    char sockfilepath[PATH_MAX] = {
+        0,
+    };
+    /* Intermediate name "<rundir>/run-<uuid>"; truncation is not detected. */
+    snprintf(sockfilepath, sizeof(sockfilepath), "%s/run-%s", rundir,
+             uuid_utoa(uuid));
+    /* socketpath and len are handed on together with that intermediate name. */
+    glusterd_set_socket_filepath(sockfilepath, socketpath, len);
+    return 0; /* unconditionally */
+}
+
+int
+__glusterd_muxsvc_conn_common_notify(struct rpc_clnt *rpc, void *mydata,
+                                     rpc_clnt_event_t event, void *data)
+{
+    glusterd_conf_t *conf = THIS->private;
+    glusterd_svc_proc_t *mux_proc = mydata;
+    int ret = -1;
+
+    /* No mux process attached to this callback: there is nothing to
+     * do, so report success instead of an error. */
+    if (!mux_proc)
+        return 0;
+
+    if (event == RPC_CLNT_DESTROY) {
+        /* RPC_CLNT_DESTROY is only delivered after mux_proc has been detached
+         * from the list, so it is safe to handle it without a lock. Processing
+         * RPC_CLNT_DESTROY under a lock would lead to a deadlock.
+         */
+        if (mux_proc->data) {
+            glusterd_volinfo_unref(mux_proc->data);
+            mux_proc->data = NULL;
+        }
+        GF_FREE(mux_proc); /* mux_proc is gone after this event */
+        ret = 0;
+    } else {
+        pthread_mutex_lock(&conf->attach_lock);
+        {
+            ret = mux_proc->notify(mux_proc, event); /* rpc/data not forwarded */
+        }
+        pthread_mutex_unlock(&conf->attach_lock);
+    }
+    return ret;
+}
+
+int
+glusterd_muxsvc_conn_common_notify(struct rpc_clnt *rpc, void *mydata,
+                                   rpc_clnt_event_t event, void *data)
+{
+    /* presumably runs the handler under glusterd's big lock - TODO confirm */
+    return glusterd_big_locked_notify(rpc, mydata, event, data,
+                                      __glusterd_muxsvc_conn_common_notify);
+}
diff --git a/xlators/mgmt/glusterd/src/glusterd-conn-mgmt.h b/xlators/mgmt/glusterd/src/glusterd-conn-mgmt.h
new file mode 100644
index 00000000000..1b225621ab1
--- /dev/null
+++ b/xlators/mgmt/glusterd/src/glusterd-conn-mgmt.h
@@ -0,0 +1,53 @@
+/*
+   Copyright (c) 2014 Red Hat, Inc. <http://www.redhat.com>
+   This file is part of GlusterFS.
+
+   This file is licensed to you under your choice of the GNU Lesser
+   General Public License, version 3 or any later version (LGPLv3 or
+   later), or the GNU General Public License, version 2 (GPLv2), in all
+   cases as published by the Free Software Foundation.
+*/
+
+#ifndef _GLUSTERD_CONN_MGMT_H_
+#define _GLUSTERD_CONN_MGMT_H_
+
+#include "rpc-clnt.h"
+
+typedef struct glusterd_conn_ glusterd_conn_t;
+
+typedef int (*glusterd_conn_notify_t)(glusterd_conn_t *conn,
+                                      rpc_clnt_event_t event);
+
+struct glusterd_conn_ {
+    struct rpc_clnt *rpc; /* rpc client handle for this connection */
+    /* Existing daemons tend to specialize their respective notify
+     * implementations, so each connection carries its own callback. */
+    glusterd_conn_notify_t notify;
+    int frame_timeout;       /* units are not visible in this header */
+    char sockpath[PATH_MAX]; /* socket path used by the connection */
+};
+
+int
+glusterd_conn_init(glusterd_conn_t *conn, char *sockpath, int frame_timeout,
+                   glusterd_conn_notify_t notify);
+
+int
+glusterd_conn_term(glusterd_conn_t *conn);
+
+int
+glusterd_conn_connect(glusterd_conn_t *conn);
+
+int
+glusterd_conn_disconnect(glusterd_conn_t *conn);
+
+int
+glusterd_conn_common_notify(struct rpc_clnt *rpc, void *mydata,
+                            rpc_clnt_event_t event, void *data);
+int
+glusterd_muxsvc_conn_common_notify(struct rpc_clnt *rpc, void *mydata,
+                                   rpc_clnt_event_t event, void *data);
+
+int32_t
+glusterd_conn_build_socket_filepath(char *rundir, uuid_t uuid, char *socketpath,
+                                    int len);
+#endif
diff --git a/xlators/mgmt/glusterd/src/glusterd-errno.h b/xlators/mgmt/glusterd/src/glusterd-errno.h
new file mode 100644
index 00000000000..c74070e0e8d
--- /dev/null
+++ b/xlators/mgmt/glusterd/src/glusterd-errno.h
@@ -0,0 +1,33 @@
+/*
+   Copyright (c) 2015 Red Hat, Inc. <http://www.redhat.com>
+   This file is part of GlusterFS.
+
+   This file is licensed to you under your choice of the GNU Lesser
+   General Public License, version 3 or any later version (LGPLv3 or
+   later), or the GNU General Public License, version 2 (GPLv2), in all
+   cases as published by the Free Software Foundation.
+*/
+#ifndef _GLUSTERD_ERRNO_H
+#define _GLUSTERD_ERRNO_H
+
+enum glusterd_op_errno {
+    EG_INTRNL = 30800,    /* Internal Error                    */
+    EG_OPNOTSUP = 30801,  /* Gluster Op Not Supported          */
+    EG_ANOTRANS = 30802,  /* Another Transaction in Progress   */
+    EG_BRCKDWN = 30803,   /* One or more brick is down         */
+    EG_NODEDWN = 30804,   /* One or more node is down          */
+    EG_HRDLMT = 30805,    /* Hard Limit is reached             */
+    EG_NOVOL = 30806,     /* Volume does not exist             */
+    EG_NOSNAP = 30807,    /* Snap does not exist               */
+    EG_RBALRUN = 30808,   /* Rebalance is running              */
+    EG_VOLRUN = 30809,    /* Volume is running                 */
+    EG_VOLSTP = 30810,    /* Volume is not running             */
+    EG_VOLEXST = 30811,   /* Volume exists                     */
+    EG_SNAPEXST = 30812,  /* Snapshot exists                   */
+    EG_ISSNAP = 30813,    /* Volume is a snap volume           */
+    EG_GEOREPRUN = 30814, /* Geo-Replication is running        */
+    EG_NOTTHINP = 30815,  /* Bricks are not thinly provisioned */
+    EG_NOGANESHA = 30816, /* Global ganesha is not enabled     */
+};
+
+#endif
diff --git a/xlators/mgmt/glusterd/src/glusterd-ganesha.c b/xlators/mgmt/glusterd/src/glusterd-ganesha.c
new file mode 100644
index 00000000000..f08bd6cebee
--- /dev/null
+++ b/xlators/mgmt/glusterd/src/glusterd-ganesha.c
@@ -0,0 +1,927 @@
+/*
+   Copyright (c) 2015 Red Hat, Inc. <http://www.redhat.com>
+   This file is part of GlusterFS.
+
+   This file is licensed to you under your choice of the GNU Lesser
+   General Public License, version 3 or any later version (LGPLv3 or
+   later), or the GNU General Public License, version 2 (GPLv2), in all
+   cases as published by the Free Software Foundation.
+*/
+
+#include <glusterfs/common-utils.h>
+#include "glusterd.h"
+#include "glusterd-op-sm.h"
+#include "glusterd-store.h"
+#include "glusterd-utils.h"
+#include "glusterd-volgen.h"
+#include "glusterd-messages.h"
+#include <glusterfs/syscall.h>
+
+#include <ctype.h>
+
+int
+start_ganesha(char **op_errstr);
+
+typedef struct service_command {
+    char *binary;  /* path of the service-manager executable */
+    char *service; /* service name handed to that executable */
+    int (*action)(struct service_command *, char *); /* runs binary for a command */
+} service_command;
+
+/* Look up key in GANESHA_HA_CONF and return a copy of its value (quotes
+ * stripped). The copy is allocated here and must be GF_FREE'd by the caller.
+ * Returns NULL if the file cannot be opened, is malformed, or lacks key. */
+static char *
+parsing_ganesha_ha_conf(const char *key)
+{
+#define MAX_LINE 1024
+    char scratch[MAX_LINE * 2] = {
+        0,
+    };
+    char *value = NULL, *pointer = NULL, *end_pointer = NULL;
+    FILE *fp;
+
+    fp = fopen(GANESHA_HA_CONF, "r");
+    if (fp == NULL) {
+        gf_msg(THIS->name, GF_LOG_ERROR, errno, GD_MSG_FILE_OP_FAILED,
+               "couldn't open the file %s", GANESHA_HA_CONF);
+        goto end_ret;
+    }
+    while ((pointer = fgets(scratch, MAX_LINE, fp)) != NULL) {
+        /* Read config file until we get matching "^[[:space:]]*key" */
+        if (*pointer == '#') {
+            continue;
+        }
+        while (isblank((unsigned char)*pointer)) {
+            pointer++;
+        }
+        if (strncmp(pointer, key, strlen(key))) {
+            continue;
+        }
+        pointer += strlen(key);
+        /* key found : if we fail to parse, we'll return an error
+         * rather than trying next one
+         * - supposition : conf file is bash compatible : no space
+         *   around the '=' */
+        if (*pointer != '=') {
+            gf_msg(THIS->name, GF_LOG_ERROR, 0, /* no syscall failed here */
+                   GD_MSG_GET_CONFIG_INFO_FAILED, "Parsing %s failed at key %s",
+                   GANESHA_HA_CONF, key);
+            goto end_close;
+        }
+        pointer++; /* jump the '=' */
+
+        if (*pointer == '"' || *pointer == '\'') {
+            /* dont get the quote */
+            pointer++;
+        }
+        end_pointer = pointer;
+        /* stop at the closing quote, blank/newline or NUL (maybe at once) */
+        while (!(*end_pointer == '\'' || *end_pointer == '"' ||
+                 isspace((unsigned char)*end_pointer) || *end_pointer == '\0')) {
+            end_pointer++; /* test first: an empty value must stay empty */
+        }
+        *end_pointer = '\0';
+
+        /* got it. copy it and return */
+        value = gf_strdup(pointer);
+        break;
+    }
+
+end_close:
+    fclose(fp);
+end_ret:
+    return value;
+}
+
+static int
+sc_systemctl_action(struct service_command *sc, char *command)
+{
+    runner_t runner = {
+        0,
+    };
+    /* Argument order: <binary> <command> <service>. */
+    runinit(&runner);
+    runner_add_args(&runner, sc->binary, command, sc->service, NULL);
+    return runner_run(&runner); /* runner_run()'s result is passed through */
+}
+
+static int
+sc_service_action(struct service_command *sc, char *command)
+{
+    runner_t runner = {
+        0,
+    };
+    /* Argument order: <binary> <service> <command>. */
+    runinit(&runner);
+    runner_add_args(&runner, sc->binary, sc->service, command, NULL);
+    return runner_run(&runner); /* runner_run()'s result is passed through */
+}
+
+static int
+manage_service(char *action)
+{
+    int i = 0;
+    int ret = 0;
+    struct service_command sc_list[] = {{.binary = "/bin/systemctl",
+                                         .service = "nfs-ganesha",
+                                         .action = sc_systemctl_action},
+                                        {.binary = "/sbin/invoke-rc.d",
+                                         .service = "nfs-ganesha",
+                                         .action = sc_service_action},
+                                        {.binary = "/sbin/service",
+                                         .service = "nfs-ganesha",
+                                         .action = sc_service_action},
+                                        {.binary = NULL}};
+    /* Run action through the first service manager whose binary is executable. */
+    while (sc_list[i].binary != NULL) {
+        ret = sys_access(sc_list[i].binary, X_OK);
+        if (ret == 0) {
+            gf_msg_debug(THIS->name, 0, "%s found.", sc_list[i].binary);
+            return sc_list[i].action(&sc_list[i], action);
+        }
+        i++;
+    }
+    gf_msg(THIS->name, GF_LOG_ERROR, 0, GD_MSG_UNRECOGNIZED_SVC_MNGR,
+           "Could not %s NFS-Ganesha. Service manager for distro"
+           " not recognized.",
+           action);
+    return ret; /* result of the last failed sys_access() probe */
+}
+
+/*
+ * Return _gf_true when the global nfs-ganesha option in priv->opts is true.
+ */
+gf_boolean_t
+glusterd_is_ganesha_cluster()
+{
+    int ret = -1;
+    glusterd_conf_t *priv = NULL;
+    xlator_t *this = NULL;
+    gf_boolean_t ret_bool = _gf_false;
+
+    this = THIS;
+    GF_VALIDATE_OR_GOTO("ganesha", this, out);
+    priv = this->private;
+    GF_VALIDATE_OR_GOTO(this->name, priv, out);
+
+    ret = dict_get_str_boolean(priv->opts, GLUSTERD_STORE_KEY_GANESHA_GLOBAL,
+                               _gf_false);
+    if (ret == _gf_true) {
+        ret_bool = _gf_true;
+        gf_msg_debug(this->name, 0, "nfs-ganesha is enabled for the cluster");
+    } else /* any result other than _gf_true counts as disabled */
+        gf_msg_debug(this->name, 0, "nfs-ganesha is disabled for the cluster");
+
+out:
+    return ret_bool; /* _gf_false when this or priv failed validation */
+}
+
+/* Return _gf_true only when the volume's ganesha.enable option is exactly
+ * "on", i.e. the volume is exported via NFS-Ganesha; _gf_false otherwise. */
+gf_boolean_t
+glusterd_check_ganesha_export(glusterd_volinfo_t *volinfo)
+{
+    char *value = NULL;
+    gf_boolean_t is_exported = _gf_false;
+    int ret = 0;
+
+    ret = glusterd_volinfo_get(volinfo, "ganesha.enable", &value);
+    if ((ret == 0) && value) { /* lookup failure or no value => not exported */
+        if (strcmp(value, "on") == 0) {
+            gf_msg_debug(THIS->name, 0,
+                         "ganesha.enable set"
+                         " to %s",
+                         value);
+            is_exported = _gf_true;
+        }
+    }
+    return is_exported;
+}
+
+/*
+ * Called as part of the commit phase for the volume set option
+ * "ganesha.enable". If the value is "on", it creates the export configuration
+ * file and then exports the volume via a dbus command. In case of "off", the
+ * volume will already be unexported during the stage phase, so only the conf
+ * file is removed from shared storage (on the originator glusterd only).
+ */
+int
+glusterd_check_ganesha_cmd(char *key, char *value, char **errstr, dict_t *dict)
+{
+    int ret = 0;
+    char *volname = NULL;
+
+    GF_ASSERT(key);
+    GF_ASSERT(value);
+    GF_ASSERT(dict);
+
+    if ((strcmp(key, "ganesha.enable") == 0)) { /* other keys: return 0 */
+        if ((strcmp(value, "on")) && (strcmp(value, "off"))) {
+            gf_asprintf(errstr,
+                        "Invalid value"
+                        " for volume set command. Use on/off only.");
+            ret = -1;
+            goto out;
+        }
+        if (strcmp(value, "on") == 0) {
+            ret = glusterd_handle_ganesha_op(dict, errstr, key, value);
+
+        } else if (is_origin_glusterd(dict)) {
+            ret = dict_get_str(dict, "volname", &volname);
+            if (ret) {
+                gf_msg("glusterd-ganesha", GF_LOG_ERROR, errno,
+                       GD_MSG_DICT_GET_FAILED, "Unable to get volume name");
+                goto out;
+            }
+            ret = manage_export_config(volname, "off", errstr);
+        }
+    }
+out:
+    if (ret) {
+        gf_msg("glusterd-ganesha", GF_LOG_ERROR, 0,
+               GD_MSG_NFS_GNS_OP_HANDLE_FAIL,
+               "Handling NFS-Ganesha"
+               " op failed.");
+    }
+    return ret;
+}
+
+int
+glusterd_op_stage_set_ganesha(dict_t *dict, char **op_errstr)
+{
+    int ret = -1;
+    char *value = NULL;
+    char *str = NULL;
+    glusterd_conf_t *priv = NULL;
+    xlator_t *this = NULL;
+
+    GF_ASSERT(dict);
+    this = THIS;
+    GF_ASSERT(this);
+    priv = this->private;
+    GF_ASSERT(priv);
+
+    ret = dict_get_str(dict, "value", &value);
+    if (value == NULL) {
+        gf_msg(this->name, GF_LOG_ERROR, errno, GD_MSG_DICT_GET_FAILED,
+               "value not present.");
+        goto out;
+    }
+    /* This dict_get will fail if the user has never set the key before; */
+    /* its return value is ignored and an unset key is treated as "disable". */
+    ret = dict_get_str(priv->opts, GLUSTERD_STORE_KEY_GANESHA_GLOBAL, &str);
+    if (str ? strcmp(value, str) == 0 : strcmp(value, "disable") == 0) {
+        gf_asprintf(op_errstr, "nfs-ganesha is already %sd.", value); /* "enabled" */
+        ret = -1;
+        goto out;
+    }
+
+    if (strcmp(value, "enable") == 0) {
+        ret = start_ganesha(op_errstr);
+        if (ret) {
+            gf_msg(THIS->name, GF_LOG_ERROR, 0, GD_MSG_NFS_GNS_START_FAIL,
+                   "Could not start NFS-Ganesha");
+        }
+    } else { /* every value other than "enable" takes the stop path */
+        ret = stop_ganesha(op_errstr);
+        if (ret)
+            gf_msg_debug(THIS->name, 0,
+                         "Could not stop "
+                         "NFS-Ganesha.");
+    }
+
+out:
+
+    if (ret) {
+        if (!(*op_errstr)) { /* supply a generic message if none was set */
+            *op_errstr = gf_strdup("Error, Validation Failed");
+            gf_msg_debug(this->name, 0, "Error, Cannot Validate option :%s",
+                         GLUSTERD_STORE_KEY_GANESHA_GLOBAL);
+        } else {
+            gf_msg_debug(this->name, 0, "Error, Cannot Validate option");
+        }
+    }
+    return ret;
+}
+
+int
+glusterd_op_set_ganesha(dict_t *dict, char **errstr)
+{
+    int ret = 0;
+    xlator_t *this = NULL;
+    glusterd_conf_t *priv = NULL;
+    char *key = NULL;
+    char *value = NULL;
+    char *next_version = NULL;
+
+    this = THIS;
+    GF_ASSERT(this);
+    GF_ASSERT(dict);
+
+    priv = this->private;
+    GF_ASSERT(priv);
+
+    ret = dict_get_str(dict, "key", &key);
+    if (ret) {
+        gf_msg(this->name, GF_LOG_ERROR, errno, GD_MSG_DICT_GET_FAILED,
+               "Couldn't get key in global option set");
+        goto out;
+    }
+
+    ret = dict_get_str(dict, "value", &value);
+    if (ret) {
+        gf_msg(this->name, GF_LOG_ERROR, errno, GD_MSG_DICT_GET_FAILED,
+               "Couldn't get value in global option set");
+        goto out;
+    }
+    /* Apply the change first; priv->opts is only updated once it succeeded. */
+    ret = glusterd_handle_ganesha_op(dict, errstr, key, value);
+    if (ret) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_NFS_GNS_SETUP_FAIL,
+               "Initial NFS-Ganesha set up failed");
+        ret = -1;
+        goto out;
+    }
+    ret = dict_set_dynstr_with_alloc(priv->opts,
+                                     GLUSTERD_STORE_KEY_GANESHA_GLOBAL, value);
+    if (ret) {
+        gf_msg(this->name, GF_LOG_WARNING, errno, GD_MSG_DICT_SET_FAILED,
+               "Failed to set"
+               " nfs-ganesha in dict.");
+        goto out;
+    }
+    ret = glusterd_get_next_global_opt_version_str(priv->opts, &next_version);
+    if (ret) {
+        gf_msg_debug(THIS->name, 0,
+                     "Could not fetch "
+                     " global op version");
+        goto out;
+    }
+    ret = dict_set_str(priv->opts, GLUSTERD_GLOBAL_OPT_VERSION, next_version);
+    if (ret)
+        goto out;
+    /* NOTE(review): next_version is never freed here - confirm who owns it. */
+    ret = glusterd_store_options(this, priv->opts);
+    if (ret) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_STORE_FAIL,
+               "Failed to store options");
+        goto out;
+    }
+
+out:
+    gf_msg_debug(this->name, 0, "returning %d", ret);
+    return ret;
+}
+
+/* The following function parses GANESHA_HA_CONF.
+ * A sample file looks like this:
+ * HA_NAME="ganesha-ha-360"
+ * HA_VOL_NAME="ha-state"
+ * HA_CLUSTER_NODES="server1,server2"
+ * VIP_rhs_1="10.x.x.x"
+ * VIP_rhs_2="10.x.x.x" */
+
+/* Return _gf_true if a HA_CLUSTER_NODES entry of GANESHA_HA_CONF is local. */
+gf_boolean_t
+check_host_list(void)
+{
+    glusterd_conf_t *priv = NULL;
+    char *hostname, *hostlist, *saveptr = NULL;
+    gf_boolean_t ret = _gf_false;
+    xlator_t *this = NULL;
+
+    this = THIS;
+    priv = THIS->private;
+    GF_ASSERT(priv);
+
+    hostlist = parsing_ganesha_ha_conf("HA_CLUSTER_NODES");
+    if (hostlist == NULL) {
+        gf_msg(this->name, GF_LOG_INFO, errno, GD_MSG_GET_CONFIG_INFO_FAILED,
+               "couldn't get HA_CLUSTER_NODES from file %s", GANESHA_HA_CONF);
+        return _gf_false;
+    }
+
+    /* Comma separated list; strtok_r keeps the scan state out of globals */
+    hostname = strtok_r(hostlist, ",", &saveptr);
+    while (hostname != NULL) {
+        ret = gf_is_local_addr(hostname);
+        if (ret) {
+            gf_msg(this->name, GF_LOG_INFO, 0, GD_MSG_NFS_GNS_HOST_FOUND,
+                   "ganesha host found "
+                   "Hostname is %s",
+                   hostname);
+            break;
+        }
+        hostname = strtok_r(NULL, ",", &saveptr);
+    }
+
+    GF_FREE(hostlist); /* the copy returned by parsing_ganesha_ha_conf() */
+    return ret;
+}
+
+int
+gd_ganesha_send_dbus(char *volname, char *value)
+{
+    runner_t runner = {
+        0,
+    };
+    int ret = -1;
+    runinit(&runner);
+
+    GF_VALIDATE_OR_GOTO("glusterd-ganesha", volname, out);
+    GF_VALIDATE_OR_GOTO("glusterd-ganesha", value, out);
+
+    ret = 0; /* stays 0 if this host is not in the list or ganesha is down */
+    if (check_host_list()) {
+        /* Check whether ganesha is running on this node */
+        if (manage_service("status")) {
+            gf_msg("glusterd-ganesha", GF_LOG_WARNING, 0,
+                   GD_MSG_GANESHA_NOT_RUNNING,
+                   "Export failed, NFS-Ganesha is not running");
+        } else {
+            runner_add_args(&runner, GANESHA_PREFIX "/dbus-send.sh", CONFDIR,
+                            value, volname, NULL);
+            ret = runner_run(&runner); /* the script's result is returned */
+        }
+    }
+out:
+    return ret; /* -1 only when volname or value failed validation */
+}
+
+int
+manage_export_config(char *volname, char *value, char **op_errstr)
+{
+    runner_t runner = {
+        0,
+    };
+    int ret = -1;
+
+    GF_ASSERT(volname);
+    runinit(&runner);
+    runner_add_args(&runner, GANESHA_PREFIX "/create-export-ganesha.sh",
+                    CONFDIR, value, volname, NULL);
+    ret = runner_run(&runner);
+    /* op_errstr may be NULL; the message is the same for every value. */
+    if (ret && op_errstr)
+        gf_asprintf(op_errstr,
+                    "Failed to create"
+                    " NFS-Ganesha export config file.");
+
+    return ret; /* result of running the script */
+}
+
+/* Exports or unexports a volume via NFS-Ganesha; 0 on success, else non-zero */
+int
+ganesha_manage_export(dict_t *dict, char *value,
+                      gf_boolean_t update_cache_invalidation, char **op_errstr)
+{
+    int ret = -1;
+    glusterd_volinfo_t *volinfo = NULL;
+    dict_t *vol_opts = NULL;
+    char *volname = NULL;
+    xlator_t *this = NULL;
+    glusterd_conf_t *priv = NULL;
+    gf_boolean_t option = _gf_false;
+
+    this = THIS;
+    GF_ASSERT(this);
+    priv = this->private;
+
+    GF_ASSERT(value);
+    GF_ASSERT(dict);
+    GF_ASSERT(priv);
+
+    ret = dict_get_str(dict, "volname", &volname);
+    if (ret) {
+        gf_msg(this->name, GF_LOG_ERROR, errno, GD_MSG_DICT_GET_FAILED,
+               "Unable to get volume name");
+        goto out;
+    }
+    ret = gf_string2boolean(value, &option);
+    if (ret == -1) {
+        gf_msg(this->name, GF_LOG_ERROR, EINVAL, GD_MSG_INVALID_ENTRY,
+               "invalid value.");
+        goto out;
+    }
+
+    ret = glusterd_volinfo_find(volname, &volinfo);
+    if (ret) {
+        gf_msg(this->name, GF_LOG_ERROR, EINVAL, GD_MSG_VOL_NOT_FOUND,
+               FMTSTR_CHECK_VOL_EXISTS, volname);
+        goto out;
+    }
+
+    ret = glusterd_check_ganesha_export(volinfo); /* current export state */
+    if (ret && option) {
+        gf_asprintf(op_errstr,
+                    "ganesha.enable "
+                    "is already 'on'.");
+        ret = -1;
+        goto out;
+
+    } else if (!option && !ret) {
+        gf_asprintf(op_errstr,
+                    "ganesha.enable "
+                    "is already 'off'.");
+        ret = -1;
+        goto out;
+    }
+
+    /* Check if global option is enabled, proceed only then */
+    ret = dict_get_str_boolean(priv->opts, GLUSTERD_STORE_KEY_GANESHA_GLOBAL,
+                               _gf_false);
+    if (ret == -1) {
+        gf_msg_debug(this->name, 0,
+                     "Failed to get "
+                     "global option dict.");
+        gf_asprintf(op_errstr,
+                    "The option "
+                    "nfs-ganesha should be "
+                    "enabled before setting ganesha.enable.");
+        goto out;
+    }
+    if (!ret) {
+        gf_asprintf(op_errstr,
+                    "The option "
+                    "nfs-ganesha should be "
+                    "enabled before setting ganesha.enable.");
+        ret = -1;
+        goto out;
+    }
+
+    /* *
+     * Create the export file from the node where ganesha.enable "on"
+     * is executed
+     * */
+    if (option && is_origin_glusterd(dict)) {
+        ret = manage_export_config(volname, "on", op_errstr);
+        if (ret) {
+            gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_EXPORT_FILE_CREATE_FAIL,
+                   "Failed to create "
+                   "export file for NFS-Ganesha\n");
+            goto out;
+        }
+    }
+    ret = gd_ganesha_send_dbus(volname, value);
+    if (ret) {
+        gf_asprintf(op_errstr,
+                    "Dynamic export addition/deletion failed."
+                    " Please see log file for details");
+        goto out;
+    }
+    if (update_cache_invalidation) {
+        vol_opts = volinfo->dict;
+        ret = dict_set_dynstr_with_alloc(vol_opts,
+                                         "features.cache-invalidation", value);
+        if (ret) { /* stop here so the store below cannot mask this failure */
+            gf_asprintf(op_errstr,
+                        "Cache-invalidation could not be set to %s.", value);
+            goto out;
+        }
+        ret = glusterd_store_volinfo(volinfo,
+                                     GLUSTERD_VOLINFO_VER_AC_INCREMENT);
+        if (ret)
+            gf_asprintf(op_errstr, "failed to store volinfo for %s",
+                        volinfo->volname);
+    }
+out:
+    return ret;
+}
+
+int
+tear_down_cluster(gf_boolean_t run_teardown)
+{
+    int ret = 0;
+    runner_t runner = {
+        0,
+    };
+    struct stat st = {
+        0,
+    };
+    DIR *dir = NULL;
+    struct dirent *entry = NULL;
+    struct dirent scratch[2] = {
+        {
+            0,
+        },
+    };
+    char path[PATH_MAX] = {
+        0,
+    };
+    /* Nothing is done (ret stays 0) unless run_teardown is set. */
+    if (run_teardown) {
+        runinit(&runner);
+        runner_add_args(&runner, GANESHA_PREFIX "/ganesha-ha.sh", "teardown",
+                        CONFDIR, NULL);
+        ret = runner_run(&runner);
+        /*
+         * Remove all the entries in CONFDIR except ganesha.conf and
+         * ganesha-ha.conf. The script's result in ret is overwritten below.
+         */
+        dir = sys_opendir(CONFDIR);
+        if (!dir) {
+            gf_msg_debug(THIS->name, 0,
+                         "Failed to open directory %s. "
+                         "Reason : %s",
+                         CONFDIR, strerror(errno));
+            ret = 0; /* an unopenable CONFDIR is not treated as a failure */
+            goto out;
+        }
+
+        while ((entry = sys_readdir(dir, scratch))) {
+            if (gf_irrelevant_entry(entry))
+                continue;
+            snprintf(path, PATH_MAX, "%s/%s", CONFDIR, entry->d_name);
+            ret = sys_lstat(path, &st);
+            if (ret == -1) {
+                gf_msg_debug(THIS->name, 0,
+                             "Failed to stat entry %s :"
+                             " %s",
+                             path, strerror(errno));
+                goto out; /* aborts the cleanup and returns -1 */
+            }
+
+            if (strcmp(entry->d_name, "ganesha.conf") == 0 ||
+                strcmp(entry->d_name, "ganesha-ha.conf") == 0)
+                gf_msg_debug(THIS->name, 0,
+                             " %s is not required"
+                             " to remove",
+                             path);
+            else if (S_ISDIR(st.st_mode))
+                ret = recursive_rmdir(path);
+            else
+                ret = sys_unlink(path);
+
+            if (ret) { /* removal failures are logged; the loop continues */
+                gf_msg_debug(THIS->name, 0,
+                             " Failed to remove %s. "
+                             "Reason : %s",
+                             path, strerror(errno));
+            }
+
+            gf_msg_debug(THIS->name, 0, "%s %s",
+                         ret ? "Failed to remove" : "Removed", entry->d_name);
+        }
+
+        ret = sys_closedir(dir); /* on this path the return value is closedir's */
+        if (ret) {
+            gf_msg_debug(THIS->name, 0,
+                         "Failed to close dir %s. Reason :"
+                         " %s",
+                         CONFDIR, strerror(errno));
+        }
+        goto exit; /* dir is already closed: skip the close under out */
+    }
+
+out:
+    if (dir && sys_closedir(dir)) {
+        gf_msg_debug(THIS->name, 0,
+                     "Failed to close dir %s. Reason :"
+                     " %s",
+                     CONFDIR, strerror(errno));
+    }
+exit:
+    return ret;
+}
+
+int
+setup_cluster(gf_boolean_t run_setup)
+{
+    int ret = 0;
+    runner_t runner = {
+        0,
+    };
+    /* Runs "ganesha-ha.sh setup CONFDIR" only when run_setup is set. */
+    if (run_setup) {
+        runinit(&runner);
+        runner_add_args(&runner, GANESHA_PREFIX "/ganesha-ha.sh", "setup",
+                        CONFDIR, NULL);
+        ret = runner_run(&runner);
+    }
+    return ret; /* 0 when skipped, otherwise runner_run()'s result */
+}
+
+/*
+ * Tear down the NFS-Ganesha HA configuration and flag every volume as no
+ * longer exported.
+ *
+ * Steps, in order:
+ *   1. tear_down_cluster(@run_teardown); a -1 result aborts and stores an
+ *      allocated message in *@op_errstr.
+ *   2. "ganesha-ha.sh cleanup" on CONFDIR (a failure is only logged).
+ *   3. For each volume, set features.cache-invalidation and ganesha.enable
+ *      to "off" and persist the volinfo (failures are logged as warnings).
+ *
+ * Once step 1 has succeeded, the value returned is the result of the last
+ * operation executed: the cleanup script when there are no volumes,
+ * otherwise the final glusterd_store_volinfo() call.
+ */
+static int
+teardown(gf_boolean_t run_teardown, char **op_errstr)
+{
+    glusterd_conf_t *conf = THIS->private;
+    glusterd_volinfo_t *vol = NULL;
+    runner_t cleanup = {
+        0,
+    };
+    int ret = tear_down_cluster(run_teardown);
+
+    if (ret == -1) {
+        gf_asprintf(op_errstr,
+                    "Cleanup of NFS-Ganesha"
+                    " HA config failed.");
+        return ret;
+    }
+
+    runinit(&cleanup);
+    runner_add_args(&cleanup, GANESHA_PREFIX "/ganesha-ha.sh", "cleanup",
+                    CONFDIR, NULL);
+    ret = runner_run(&cleanup);
+    if (ret != 0)
+        gf_msg_debug(THIS->name, 0,
+                     "Could not clean up"
+                     " NFS-Ganesha related config");
+
+    cds_list_for_each_entry(vol, &conf->volumes, vol_list)
+    {
+        /* Every volume exported through NFS-Ganesha is about to be
+         * unexported, so switch the related options off. */
+        ret = dict_set_str(vol->dict, "features.cache-invalidation", "off");
+        if (ret != 0)
+            gf_msg(THIS->name, GF_LOG_WARNING, errno, GD_MSG_DICT_SET_FAILED,
+                   "Could not set features.cache-invalidation "
+                   "to off for %s",
+                   vol->volname);
+
+        ret = dict_set_str(vol->dict, "ganesha.enable", "off");
+        if (ret != 0)
+            gf_msg(THIS->name, GF_LOG_WARNING, errno, GD_MSG_DICT_SET_FAILED,
+                   "Could not set ganesha.enable to off for %s",
+                   vol->volname);
+
+        ret = glusterd_store_volinfo(vol, GLUSTERD_VOLINFO_VER_AC_INCREMENT);
+        if (ret != 0)
+            gf_msg(THIS->name, GF_LOG_WARNING, 0, GD_MSG_VOLINFO_SET_FAIL,
+                   "failed to store volinfo for %s", vol->volname);
+    }
+
+    return ret;
+}
+
+/*
+ * Stop the NFS-Ganesha service on this node.
+ *
+ * Only acts when check_host_list() reports true. It first runs
+ * "ganesha-ha.sh --setup-ganesha-conf-files CONFDIR no" and then asks
+ * manage_service() to stop the service. A failure of the script is reported
+ * through *@op_errstr but does not prevent the stop attempt.
+ *
+ * Returns 0 when nothing had to be done, otherwise the result of
+ * manage_service("stop").
+ */
+int
+stop_ganesha(char **op_errstr)
+{
+    int ret = 0;
+    runner_t runner = {
+        0,
+    };
+
+    if (check_host_list()) {
+        runinit(&runner);
+        runner_add_args(&runner, GANESHA_PREFIX "/ganesha-ha.sh",
+                        "--setup-ganesha-conf-files", CONFDIR, "no", NULL);
+        ret = runner_run(&runner);
+        if (ret) {
+            gf_asprintf(op_errstr,
+                        "removal of symlink ganesha.conf "
+                        "in /etc/ganesha failed");
+        }
+        ret = manage_service("stop");
+        if (ret)
+            /* The split literal needs the trailing space, otherwise the
+             * message reads "could notbe stopped." */
+            gf_asprintf(op_errstr,
+                        "NFS-Ganesha service could not "
+                        "be stopped.");
+    }
+    return ret;
+}
+
+/*
+ * Prepare this node for NFS-Ganesha and start the service.
+ *
+ * For every volume the volinfo is re-stored (and, when built with
+ * BUILD_GNFS, NFS_DISABLE_MAP_KEY is first set to "on"). With BUILD_GNFS an
+ * initialised gluster-nfs service is stopped with SIGKILL. Finally, if
+ * check_host_list() reports true, "ganesha-ha.sh
+ * --setup-ganesha-conf-files CONFDIR yes" is run and manage_service("start")
+ * is invoked.
+ *
+ * Returns the result of the last step executed; on most failures an
+ * allocated message is stored in *@op_errstr.
+ */
+int
+start_ganesha(char **op_errstr)
+{
+    int ret = -1;
+    glusterd_volinfo_t *volinfo = NULL;
+    glusterd_conf_t *priv = NULL;
+    runner_t runner = {
+        0,
+    };
+
+    priv = THIS->private;
+    GF_ASSERT(priv);
+
+    cds_list_for_each_entry(volinfo, &priv->volumes, vol_list)
+    {
+#ifdef BUILD_GNFS
+        /* Gluster-nfs has to be disabled across the trusted pool */
+        /* before attempting to start nfs-ganesha */
+        ret = dict_set_str_sizen(volinfo->dict, NFS_DISABLE_MAP_KEY, "on");
+        if (ret)
+            goto out;
+#endif
+        ret = glusterd_store_volinfo(volinfo,
+                                     GLUSTERD_VOLINFO_VER_AC_INCREMENT);
+        if (ret) {
+            *op_errstr = gf_strdup(
+                "Failed to store the "
+                "Volume information");
+            goto out;
+        }
+    }
+
+    /* If the nfs svc is not initialized it means that the service is not
+     * running, hence we can skip the process of stopping gluster-nfs
+     * service
+     */
+#ifdef BUILD_GNFS
+    if (priv->nfs_svc.inited) {
+        ret = priv->nfs_svc.stop(&(priv->nfs_svc), SIGKILL);
+        if (ret) {
+            ret = -1;
+            /* Trailing space keeps the two literal halves from running
+             * together ("couldnot"). */
+            gf_asprintf(op_errstr,
+                        "Gluster-NFS service could "
+                        "not be stopped, exiting.");
+            goto out;
+        }
+    }
+#endif
+
+    if (check_host_list()) {
+        runinit(&runner);
+        runner_add_args(&runner, GANESHA_PREFIX "/ganesha-ha.sh",
+                        "--setup-ganesha-conf-files", CONFDIR, "yes", NULL);
+        ret = runner_run(&runner);
+        if (ret) {
+            gf_asprintf(op_errstr,
+                        "creation of symlink ganesha.conf "
+                        "in /etc/ganesha failed");
+            goto out;
+        }
+        ret = manage_service("start");
+        if (ret)
+            /* Space after the full stop separates the two sentences. */
+            gf_asprintf(op_errstr,
+                        "NFS-Ganesha failed to start. "
+                        "Please see log file for details");
+    }
+
+out:
+    return ret;
+}
+
+/*
+ * Validate the originating node and set up the HA cluster.
+ *
+ * When @run_setup is true the local node must pass check_host_list();
+ * otherwise -1 is returned with a message in *@op_errstr. The actual work
+ * is delegated to setup_cluster(), whose result is returned (a -1 result
+ * also stores a message in *@op_errstr).
+ */
+static int
+pre_setup(gf_boolean_t run_setup, char **op_errstr)
+{
+    int ret = 0;
+
+    if (run_setup && !check_host_list()) {
+        gf_asprintf(op_errstr,
+                    "Running nfs-ganesha setup command "
+                    "from node which is not part of ganesha cluster");
+        return -1;
+    }
+
+    ret = setup_cluster(run_setup);
+    if (ret != -1)
+        return ret;
+
+    gf_asprintf(op_errstr,
+                "Failed to set up HA "
+                "config for NFS-Ganesha. "
+                "Please check the log file for details");
+    return ret;
+}
+
+/*
+ * Apply one NFS-Ganesha related key/value pair.
+ *
+ * - "ganesha.enable" is forwarded to ganesha_manage_export().
+ * - @value must then parse as a boolean, else -1 is returned with a message
+ *   in *@op_errstr.
+ * - GLUSTERD_STORE_KEY_GANESHA_GLOBAL triggers pre_setup() (value true) or
+ *   teardown() (value false).
+ *
+ * Returns the result of the last step that ran.
+ */
+int
+glusterd_handle_ganesha_op(dict_t *dict, char **op_errstr, char *key,
+                           char *value)
+{
+    int32_t ret = -1;
+    gf_boolean_t enable = _gf_false;
+
+    GF_ASSERT(dict);
+    GF_ASSERT(op_errstr);
+    GF_ASSERT(key);
+    GF_ASSERT(value);
+
+    if (!strcmp(key, "ganesha.enable")) {
+        ret = ganesha_manage_export(dict, value, _gf_true, op_errstr);
+        if (ret < 0)
+            return ret;
+    }
+
+    /* It is possible that the key might not be set */
+    ret = gf_string2boolean(value, &enable);
+    if (ret == -1) {
+        gf_asprintf(op_errstr, "Invalid value in key-value pair.");
+        return ret;
+    }
+
+    if (strcmp(key, GLUSTERD_STORE_KEY_GANESHA_GLOBAL) != 0)
+        return ret;
+
+    /*
+     * The pcs cluster must be set up / torn down only once, on the node
+     * where 'gluster nfs-ganesha <enable/disable>' was executed. That node
+     * therefore has to be part of the ganesha HA cluster.
+     */
+    if (enable)
+        ret = pre_setup(is_origin_glusterd(dict), op_errstr);
+    else
+        ret = teardown(is_origin_glusterd(dict), op_errstr);
+
+    return ret;
+}
diff --git a/xlators/mgmt/glusterd/src/glusterd-geo-rep.c b/xlators/mgmt/glusterd/src/glusterd-geo-rep.c
new file mode 100644
index 00000000000..bf062c87060
--- /dev/null
+++ b/xlators/mgmt/glusterd/src/glusterd-geo-rep.c
@@ -0,0 +1,6782 @@
+/*
+   Copyright (c) 2011-2012 Red Hat, Inc. <http://www.redhat.com>
+   This file is part of GlusterFS.
+
+   This file is licensed to you under your choice of the GNU Lesser
+   General Public License, version 3 or any later version (LGPLv3 or
+   later), or the GNU General Public License, version 2 (GPLv2), in all
+   cases as published by the Free Software Foundation.
+*/
+#include <glusterfs/common-utils.h>
+#include "cli1-xdr.h"
+#include "xdr-generic.h"
+#include "glusterd.h"
+#include "glusterd-op-sm.h"
+#include "glusterd-geo-rep.h"
+#include "glusterd-store.h"
+#include "glusterd-utils.h"
+#include "glusterd-volgen.h"
+#include "glusterd-svc-helper.h"
+#include <glusterfs/run.h>
+#include <glusterfs/syscall.h>
+#include "glusterd-messages.h"
+
+#include <signal.h>
+
+static int
+dict_get_param(dict_t *dict, char *key, char **param);
+
+/*
+ * Per-option value tables: each entry names an option, the number of
+ * strings listed in .values, and whether .case_sensitive is set.
+ * Presumably consulted when validating geo-replication config values (the
+ * consumer is outside this part of the file). The table is terminated by
+ * an entry whose op_name is NULL.
+ */
+struct gsync_config_opt_vals_ gsync_confopt_vals[] = {
+    {
+        .op_name = "change_detector",
+        .no_of_pos_vals = 2,
+        .case_sensitive = _gf_true,
+        .values = {"xsync", "changelog"},
+    },
+    {
+        .op_name = "special_sync_mode",
+        .no_of_pos_vals = 2,
+        .case_sensitive = _gf_true,
+        .values = {"partial", "recover"},
+    },
+    {
+        .op_name = "log-level",
+        .no_of_pos_vals = 5,
+        .case_sensitive = _gf_false,
+        .values = {"critical", "error", "warning", "info", "debug"},
+    },
+    {
+        .op_name = "use-tarssh",
+        .no_of_pos_vals = 6,
+        .case_sensitive = _gf_false,
+        .values = {"true", "false", "0", "1", "yes", "no"},
+    },
+    {
+        .op_name = "ignore_deletes",
+        .no_of_pos_vals = 6,
+        .case_sensitive = _gf_false,
+        .values = {"true", "false", "0", "1", "yes", "no"},
+    },
+    {
+        .op_name = "use_meta_volume",
+        .no_of_pos_vals = 6,
+        .case_sensitive = _gf_false,
+        .values = {"true", "false", "0", "1", "yes", "no"},
+    },
+    {
+        .op_name = "use-meta-volume",
+        .no_of_pos_vals = 6,
+        .case_sensitive = _gf_false,
+        .values = {"true", "false", "0", "1", "yes", "no"},
+    },
+    {
+        .op_name = NULL,
+    },
+};
+
+/* NULL-terminated list of option names treated as reserved. */
+static char *gsync_reserved_opts[] = {
+    "gluster-command",
+    "pid-file",
+    "state-file",
+    "session-owner",
+    "state-socket-unencoded",
+    "socketdir",
+    "local-id",
+    "local-path",
+    "slave-id",
+    NULL,
+};
+
+/* NULL-terminated list of option names; going by the name, changing these
+ * presumably does not require a restart (consumer not shown here). */
+static char *gsync_no_restart_opts[] = {
+    "checkpoint",
+    "log_rsync_performance",
+    "log-rsync-performance",
+    NULL,
+};
+
+/*
+ * Append "--<family>" to @runner, where <family> is the value of the
+ * "transport.address-family" option of the current xlator. Nothing is
+ * appended when the option cannot be read.
+ */
+void
+set_gsyncd_inet6_arg(runner_t *runner)
+{
+    xlator_t *this = THIS;
+    char *family = NULL;
+
+    if (dict_get_str(this->options, "transport.address-family", &family))
+        return;
+
+    runner_argprintf(runner, "--%s", family);
+}
+
+/*
+ * Handle a GD_OP_SYS_EXEC request received from the CLI.
+ *
+ * The request payload is decoded into a dict, the local node's UUID is
+ * stored in it under "host-uuid", and the operation is started through
+ * glusterd_op_begin_synctask(). If any step fails, an error reply carrying
+ * err_str (or "Operation failed") is sent back instead.
+ *
+ * Returns the result of glusterd_op_begin_synctask() on the success path,
+ * otherwise the result of glusterd_op_send_cli_response().
+ */
+int
+__glusterd_handle_sys_exec(rpcsvc_request_t *req)
+{
+    int32_t ret = 0;
+    dict_t *dict = NULL;
+    gf_cli_req cli_req = {
+        {0},
+    };
+    glusterd_op_t cli_op = GD_OP_SYS_EXEC;
+    glusterd_conf_t *priv = NULL;
+    char *host_uuid = NULL;
+    char err_str[64] = {
+        0,
+    };
+    xlator_t *this = NULL;
+
+    GF_ASSERT(req);
+
+    this = THIS;
+    GF_ASSERT(this);
+    priv = this->private;
+    GF_ASSERT(priv);
+
+    ret = xdr_to_generic(req->msg[0], &cli_req, (xdrproc_t)xdr_gf_cli_req);
+    if (ret < 0) {
+        req->rpc_err = GARBAGE_ARGS;
+        snprintf(err_str, sizeof(err_str), "Garbage args received");
+        gf_smsg(this->name, GF_LOG_ERROR, errno, GD_MSG_GARBAGE_ARGS, NULL);
+        goto out;
+    }
+
+    if (cli_req.dict.dict_len) {
+        dict = dict_new();
+        if (!dict) {
+            gf_smsg(this->name, GF_LOG_ERROR, errno, GD_MSG_DICT_CREATE_FAIL,
+                    NULL);
+            /* ret still holds the non-negative decode result here; make
+             * the failure explicit so the error reply is always sent. */
+            ret = -1;
+            goto out;
+        }
+
+        ret = dict_unserialize(cli_req.dict.dict_val, cli_req.dict.dict_len,
+                               &dict);
+        if (ret < 0) {
+            gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_DICT_UNSERIALIZE_FAIL,
+                   "failed to "
+                   "unserialize req-buffer to dictionary");
+            snprintf(err_str, sizeof(err_str),
+                     "Unable to decode "
+                     "the command");
+            goto out;
+        } else {
+            /* hand the decoded buffer over to the dict */
+            dict->extra_stdfree = cli_req.dict.dict_val;
+        }
+
+        host_uuid = gf_strdup(uuid_utoa(MY_UUID));
+        if (host_uuid == NULL) {
+            snprintf(err_str, sizeof(err_str),
+                     "Failed to get "
+                     "the uuid of local glusterd");
+            gf_smsg(this->name, GF_LOG_ERROR, errno, GD_MSG_UUID_GET_FAIL,
+                    NULL);
+            ret = -1;
+            goto out;
+        }
+
+        ret = dict_set_dynstr(dict, "host-uuid", host_uuid);
+        if (ret) {
+            gf_smsg(this->name, GF_LOG_ERROR, errno, GD_MSG_DICT_SET_FAILED,
+                    "Key=host-uuid", NULL);
+            goto out;
+        }
+    }
+
+    ret = glusterd_op_begin_synctask(req, cli_op, dict);
+
+out:
+    if (ret) {
+        if (err_str[0] == '\0')
+            snprintf(err_str, sizeof(err_str), "Operation failed");
+        ret = glusterd_op_send_cli_response(cli_op, ret, 0, req, dict, err_str);
+    }
+    return ret;
+}
+
+/*
+ * Handle a GD_OP_COPY_FILE request received from the CLI.
+ *
+ * The request payload is decoded into a dict, the local node's UUID is
+ * stored in it under "host-uuid", and the operation is started through
+ * glusterd_op_begin_synctask(). If any step fails, an error reply carrying
+ * err_str (or "Operation failed") is sent back instead.
+ *
+ * Returns the result of glusterd_op_begin_synctask() on the success path,
+ * otherwise the result of glusterd_op_send_cli_response().
+ */
+int
+__glusterd_handle_copy_file(rpcsvc_request_t *req)
+{
+    int32_t ret = 0;
+    dict_t *dict = NULL;
+    gf_cli_req cli_req = {
+        {0},
+    };
+    glusterd_op_t cli_op = GD_OP_COPY_FILE;
+    glusterd_conf_t *priv = NULL;
+    char *host_uuid = NULL;
+    char err_str[64] = {
+        0,
+    };
+    xlator_t *this = NULL;
+
+    GF_ASSERT(req);
+
+    this = THIS;
+    GF_ASSERT(this);
+    priv = this->private;
+    GF_ASSERT(priv);
+
+    ret = xdr_to_generic(req->msg[0], &cli_req, (xdrproc_t)xdr_gf_cli_req);
+    if (ret < 0) {
+        req->rpc_err = GARBAGE_ARGS;
+        snprintf(err_str, sizeof(err_str), "Garbage args received");
+        gf_smsg(this->name, GF_LOG_ERROR, errno, GD_MSG_GARBAGE_ARGS, NULL);
+        goto out;
+    }
+
+    if (cli_req.dict.dict_len) {
+        dict = dict_new();
+        if (!dict) {
+            gf_smsg(this->name, GF_LOG_ERROR, errno, GD_MSG_DICT_CREATE_FAIL,
+                    NULL);
+            /* ret still holds the non-negative decode result here; make
+             * the failure explicit so the error reply is always sent. */
+            ret = -1;
+            goto out;
+        }
+
+        ret = dict_unserialize(cli_req.dict.dict_val, cli_req.dict.dict_len,
+                               &dict);
+        if (ret < 0) {
+            /* Trailing space added: the message used to read
+             * "failed tounserialize ...". */
+            gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_DICT_UNSERIALIZE_FAIL,
+                   "failed to "
+                   "unserialize req-buffer to dictionary");
+            snprintf(err_str, sizeof(err_str),
+                     "Unable to decode "
+                     "the command");
+            goto out;
+        } else {
+            /* hand the decoded buffer over to the dict */
+            dict->extra_stdfree = cli_req.dict.dict_val;
+        }
+
+        host_uuid = gf_strdup(uuid_utoa(MY_UUID));
+        if (host_uuid == NULL) {
+            snprintf(err_str, sizeof(err_str),
+                     "Failed to get "
+                     "the uuid of local glusterd");
+            gf_smsg(this->name, GF_LOG_ERROR, errno, GD_MSG_UUID_GET_FAIL,
+                    NULL);
+            ret = -1;
+            goto out;
+        }
+
+        ret = dict_set_dynstr(dict, "host-uuid", host_uuid);
+        if (ret)
+            goto out;
+    }
+
+    ret = glusterd_op_begin_synctask(req, cli_op, dict);
+
+out:
+    if (ret) {
+        if (err_str[0] == '\0')
+            snprintf(err_str, sizeof(err_str), "Operation failed");
+        ret = glusterd_op_send_cli_response(cli_op, ret, 0, req, dict, err_str);
+    }
+    return ret;
+}
+
+/*
+ * Handle a geo-replication request (GD_OP_GSYNC_SET family) from the CLI.
+ *
+ * The request payload is decoded into a dict and tagged with the local
+ * node's UUID under "host-uuid". The mandatory "type" key selects the
+ * sub-command; GF_GSYNC_OPTION_TYPE_CREATE switches the operation to
+ * GD_OP_GSYNC_CREATE, every other type keeps GD_OP_GSYNC_SET. The operation
+ * is then started through glusterd_op_begin_synctask().
+ *
+ * "master" and "slave" are looked up only so that their absence gets
+ * logged, and the textual operation name is built but not used further in
+ * this function.
+ *
+ * Returns the result of glusterd_op_begin_synctask() on the success path,
+ * otherwise the result of glusterd_op_send_cli_response().
+ */
+int
+__glusterd_handle_gsync_set(rpcsvc_request_t *req)
+{
+    int32_t ret = 0;
+    dict_t *dict = NULL;
+    gf_cli_req cli_req = {
+        {0},
+    };
+    glusterd_op_t cli_op = GD_OP_GSYNC_SET;
+    char *master = NULL;
+    char *slave = NULL;
+    char operation[64] = {
+        0,
+    };
+    int type = 0;
+    glusterd_conf_t *priv = NULL;
+    char *host_uuid = NULL;
+    char err_str[64] = {
+        0,
+    };
+    xlator_t *this = NULL;
+
+    GF_ASSERT(req);
+
+    this = THIS;
+    GF_ASSERT(this);
+    priv = this->private;
+    GF_ASSERT(priv);
+
+    ret = xdr_to_generic(req->msg[0], &cli_req, (xdrproc_t)xdr_gf_cli_req);
+    if (ret < 0) {
+        req->rpc_err = GARBAGE_ARGS;
+        snprintf(err_str, sizeof(err_str), "Garbage args received");
+        gf_smsg(this->name, GF_LOG_ERROR, errno, GD_MSG_GARBAGE_ARGS, NULL);
+        goto out;
+    }
+
+    if (cli_req.dict.dict_len) {
+        dict = dict_new();
+        if (!dict) {
+            gf_smsg(this->name, GF_LOG_ERROR, errno, GD_MSG_DICT_CREATE_FAIL,
+                    NULL);
+            /* ret still holds the non-negative decode result here; make
+             * the failure explicit so the error reply is always sent. */
+            ret = -1;
+            goto out;
+        }
+
+        ret = dict_unserialize(cli_req.dict.dict_val, cli_req.dict.dict_len,
+                               &dict);
+        if (ret < 0) {
+            gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_DICT_UNSERIALIZE_FAIL,
+                   "failed to "
+                   "unserialize req-buffer to dictionary");
+            snprintf(err_str, sizeof(err_str),
+                     "Unable to decode "
+                     "the command");
+            goto out;
+        } else {
+            /* hand the decoded buffer over to the dict */
+            dict->extra_stdfree = cli_req.dict.dict_val;
+        }
+
+        host_uuid = gf_strdup(uuid_utoa(MY_UUID));
+        if (host_uuid == NULL) {
+            snprintf(err_str, sizeof(err_str),
+                     "Failed to get "
+                     "the uuid of local glusterd");
+            gf_smsg(this->name, GF_LOG_ERROR, errno, GD_MSG_UUID_GET_FAIL,
+                    NULL);
+            ret = -1;
+            goto out;
+        }
+        ret = dict_set_dynstr(dict, "host-uuid", host_uuid);
+        if (ret)
+            goto out;
+    }
+
+    /* A missing master or slave is only worth an informational log. */
+    ret = dict_get_str(dict, "master", &master);
+    if (ret < 0) {
+        gf_msg(this->name, GF_LOG_INFO, 0, GD_MSG_DICT_GET_FAILED,
+               "master not found, while handling " GEOREP " options");
+        master = "(No Master)";
+    }
+
+    ret = dict_get_str(dict, "slave", &slave);
+    if (ret < 0) {
+        gf_msg(this->name, GF_LOG_INFO, 0, GD_MSG_DICT_GET_FAILED,
+               "slave not found, while handling " GEOREP " options");
+        slave = "(No Slave)";
+    }
+
+    /* The command type, on the other hand, is mandatory. */
+    ret = dict_get_int32(dict, "type", &type);
+    if (ret < 0) {
+        snprintf(err_str, sizeof(err_str),
+                 "Command type not found "
+                 "while handling " GEOREP " options");
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_DICT_GET_FAILED, "%s",
+               err_str);
+        goto out;
+    }
+
+    switch (type) {
+        case GF_GSYNC_OPTION_TYPE_CREATE:
+            snprintf(operation, sizeof(operation), "create");
+            cli_op = GD_OP_GSYNC_CREATE;
+            break;
+
+        case GF_GSYNC_OPTION_TYPE_START:
+            snprintf(operation, sizeof(operation), "start");
+            break;
+
+        case GF_GSYNC_OPTION_TYPE_STOP:
+            snprintf(operation, sizeof(operation), "stop");
+            break;
+
+        case GF_GSYNC_OPTION_TYPE_PAUSE:
+            snprintf(operation, sizeof(operation), "pause");
+            break;
+
+        case GF_GSYNC_OPTION_TYPE_RESUME:
+            snprintf(operation, sizeof(operation), "resume");
+            break;
+
+        case GF_GSYNC_OPTION_TYPE_CONFIG:
+            snprintf(operation, sizeof(operation), "config");
+            break;
+
+        case GF_GSYNC_OPTION_TYPE_STATUS:
+            snprintf(operation, sizeof(operation), "status");
+            break;
+    }
+
+    ret = glusterd_op_begin_synctask(req, cli_op, dict);
+
+out:
+    if (ret) {
+        if (err_str[0] == '\0')
+            snprintf(err_str, sizeof(err_str), "Operation failed");
+        ret = glusterd_op_send_cli_response(cli_op, ret, 0, req, dict, err_str);
+    }
+    return ret;
+}
+
+/* Entry point that runs __glusterd_handle_sys_exec() through
+ * glusterd_big_locked_handler() and returns its result. */
+int
+glusterd_handle_sys_exec(rpcsvc_request_t *req)
+{
+    int ret = glusterd_big_locked_handler(req, __glusterd_handle_sys_exec);
+
+    return ret;
+}
+
+/* Entry point that runs __glusterd_handle_copy_file() through
+ * glusterd_big_locked_handler() and returns its result. */
+int
+glusterd_handle_copy_file(rpcsvc_request_t *req)
+{
+    int ret = glusterd_big_locked_handler(req, __glusterd_handle_copy_file);
+
+    return ret;
+}
+
+/* Entry point that runs __glusterd_handle_gsync_set() through
+ * glusterd_big_locked_handler() and returns its result. */
+int
+glusterd_handle_gsync_set(rpcsvc_request_t *req)
+{
+    int ret = glusterd_big_locked_handler(req, __glusterd_handle_gsync_set);
+
+    return ret;
+}
+
+/*****
+ *
+ * glusterd_urltransform* internal API
+ *
+ *****/
+
+static void
+glusterd_urltransform_init(runner_t *runner, const char *transname)
+{
+    runinit(runner);
+    runner_add_arg(runner, GSYNCD_PREFIX "/gsyncd"); /* program to run */
+    set_gsyncd_inet6_arg(runner); /* optional --<address-family> */
+    runner_argprintf(runner, "--%s-url", transname); /* --<transname>-url */
+}
+
+static void
+glusterd_urltransform_add(runner_t *runner, const char *url)
+{
+    runner_add_arg(runner, url); /* url is passed on as one more argument */
+}
+
+/*
+ * Extract the slave URL from a stored slave entry, in place.
+ *
+ * Entry format: master_node_uuid:ssh://slave_host::slave_vol:slave_voluuid
+ *
+ * On success *@slave points inside @slv_url, just after the first ':', and
+ * the ':' that precedes the slave volume uuid (if present) is overwritten
+ * with NUL so the string ends before the uuid. Returns 0 on success and -1
+ * when the first ':' or the "::" separator is missing.
+ */
+static int32_t
+parse_slave_url(char *slv_url, char **slave)
+{
+    xlator_t *this = THIS;
+    char *uuid_sep = NULL;
+
+    /* step over "master_node_uuid:" */
+    *slave = strchr(slv_url, ':');
+    if (*slave == NULL)
+        return -1;
+    (*slave)++;
+
+    uuid_sep = strstr(*slave, "::");
+    if (uuid_sep == NULL)
+        return -1;
+
+    /* first ':' after "::" starts the slave volume uuid */
+    uuid_sep = strchr(uuid_sep + 2, ':');
+    if (uuid_sep != NULL)
+        *uuid_sep = '\0';
+    else
+        gf_msg_debug(this->name, 0, "old slave: %s!", *slave);
+
+    gf_msg_debug(this->name, 0, "parsed slave: %s!", *slave);
+    return 0;
+}
+
+/*
+ * dict_foreach() callback: parse the slave entry held in @value and append
+ * the resulting slave URL to the runner passed through @data.
+ *
+ * Returns 0 on success and -1 when the entry does not fit the local buffer
+ * or cannot be parsed.
+ */
+static int
+_glusterd_urltransform_add_iter(dict_t *dict, char *key, data_t *value,
+                                void *data)
+{
+    runner_t *runner = data;
+    char url_copy[VOLINFO_SLAVE_URL_MAX] = {0};
+    char *slave_url = NULL;
+    xlator_t *this = THIS;
+    int32_t ret = -1;
+    int written = 0;
+
+    GF_VALIDATE_OR_GOTO("glusterd", this, out);
+
+    gf_msg_debug(this->name, 0, "value->data %s", value->data);
+
+    /* work on a private copy: parsing writes into the buffer */
+    written = snprintf(url_copy, sizeof(url_copy), "%s", value->data);
+    if (written >= sizeof(url_copy)) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_SLAVE_VOL_PARSE_FAIL,
+               "Error in copying slave: %s!", value->data);
+        goto out;
+    }
+
+    if (parse_slave_url(url_copy, &slave_url) == -1) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_SLAVE_VOL_PARSE_FAIL,
+               "Error in parsing slave: %s!", value->data);
+        goto out;
+    }
+
+    runner_add_arg(runner, slave_url);
+    ret = 0;
+out:
+    return ret;
+}
+
+/* Free the first @n lines stored in @linearr, then the array itself. */
+static void
+glusterd_urltransform_free(char **linearr, unsigned n)
+{
+    unsigned idx;
+
+    for (idx = 0; idx != n; ++idx) {
+        GF_FREE(linearr[idx]);
+    }
+
+    GF_FREE(linearr);
+}
+
+/*
+ * Run the prepared @runner and collect its standard output line by line.
+ *
+ * Every line (without its trailing '\n') is stored in a freshly allocated
+ * string; a line that does not end in '\n' within 1023 characters is
+ * treated as an error. On success *@linearrp receives the array of lines,
+ * which the caller releases with glusterd_urltransform_free(), and the
+ * number of lines is returned. On failure -1 is returned, the child is
+ * killed if it was started, and nothing is handed to the caller.
+ *
+ * The runner is always ended (runner_end()) before returning.
+ */
+static int
+glusterd_urltransform(runner_t *runner, char ***linearrp)
+{
+    char **linearr = NULL;
+    char *line = NULL;
+    unsigned arr_len = 32;
+    unsigned arr_idx = 0;
+    gf_boolean_t error = _gf_false;
+    xlator_t *this = NULL;
+
+    this = THIS;
+    GF_ASSERT(this);
+
+    linearr = GF_CALLOC(arr_len, sizeof(char *), gf_gld_mt_linearr);
+    if (!linearr) {
+        error = _gf_true;
+        goto out;
+    }
+
+    runner_redir(runner, STDOUT_FILENO, RUN_PIPE);
+    if (runner_start(runner) != 0) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_SPAWNING_CHILD_FAILED,
+               "spawning child failed");
+
+        error = _gf_true;
+        goto out;
+    }
+
+    arr_idx = 0;
+    for (;;) {
+        size_t len;
+        line = GF_MALLOC(1024, gf_gld_mt_linebuf);
+        if (!line) {
+            error = _gf_true;
+            goto out;
+        }
+
+        if (fgets(line, 1024, runner_chio(runner, STDOUT_FILENO)) == NULL) {
+            GF_FREE(line);
+            break;
+        }
+
+        len = strlen(line);
+        if (len == 0 || line[len - 1] != '\n') {
+            GF_FREE(line);
+            error = _gf_true;
+            goto out;
+        }
+        line[len - 1] = '\0';
+
+        if (arr_idx == arr_len) {
+            /* Double the capacity. The size handed to the allocator is in
+             * bytes, so the element count must be scaled by the element
+             * size; passing the bare count would shrink the array. */
+            unsigned new_len = arr_len << 1;
+            char **grown = GF_REALLOC(linearr, new_len * sizeof(*linearr));
+            if (!grown) {
+                GF_FREE(line);
+                error = _gf_true;
+                goto out;
+            }
+            linearr = grown;
+            arr_len = new_len;
+        }
+        linearr[arr_idx] = line;
+
+        arr_idx++;
+    }
+
+out:
+
+    /* XXX chpid field is not exported by run API
+     * but runner_end() does not abort the invoked
+     * process (ie. it might block in waitpid(2))
+     * so we resort to a manual kill via the private field
+     */
+    if (error && runner->chpid > 0)
+        kill(runner->chpid, SIGKILL);
+
+    if (runner_end(runner) != 0)
+        error = _gf_true;
+
+    if (error) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_READ_CHILD_DATA_FAILED,
+               "reading data from child failed");
+        glusterd_urltransform_free(linearr, arr_idx);
+        return -1;
+    }
+
+    *linearrp = linearr;
+    return arr_idx;
+}
+
+/*
+ * Apply the gsyncd "--<transname>-url" transformation to a single @url.
+ * The output lines and the return value are those of
+ * glusterd_urltransform().
+ */
+static int
+glusterd_urltransform_single(const char *url, const char *transname,
+                             char ***linearrp)
+{
+    runner_t xform = {
+        0,
+    };
+    int nlines = 0;
+
+    glusterd_urltransform_init(&xform, transname);
+    glusterd_urltransform_add(&xform, url);
+    nlines = glusterd_urltransform(&xform, linearrp);
+
+    return nlines;
+}
+
+struct dictidxmark {
+    unsigned isrch; /* position of the entry being looked for */
+    unsigned ithis; /* position of the entry currently being visited */
+    char *ikey; /* key seen at position isrch; stays NULL if never reached */
+};
+
+struct slave_vol_config {
+    char old_slvhost[_POSIX_HOST_NAME_MAX + 1]; /* presumably slave host */
+    char old_slvuser[LOGIN_NAME_MAX]; /* presumably slave login; confirm */
+    unsigned old_slvidx; /* presumably slave index; confirm with users */
+    char slave_voluuid[UUID_CANONICAL_FORM_LEN + 1]; /* uuid text + NUL */
+};
+
+/*
+ * dict_foreach() callback: remember @key when the running position kept in
+ * the struct dictidxmark passed via @data equals the searched position,
+ * then advance the running position. Always returns 0.
+ */
+static int
+_dict_mark_atindex(dict_t *dict, char *key, data_t *value, void *data)
+{
+    struct dictidxmark *mark = data;
+    unsigned current = mark->ithis++;
+
+    if (current == mark->isrch)
+        mark->ikey = key;
+
+    return 0;
+}
+
+/*
+ * Return the key of the @i-th entry visited by dict_foreach() on @dict, or
+ * NULL when the walk never reaches position @i.
+ */
+static char *
+dict_get_by_index(dict_t *dict, unsigned i)
+{
+    struct dictidxmark mark = {
+        .isrch = i,
+        .ithis = 0,
+        .ikey = NULL,
+    };
+
+    dict_foreach(dict, _dict_mark_atindex, &mark);
+
+    return mark.ikey;
+}
+
+/*
+ * Look up @slaveurl among the slaves configured for @vol.
+ *
+ * All stored slave URLs plus @slaveurl are canonicalized by gsyncd in one
+ * run; the canonical form of @slaveurl (last output line) is then compared
+ * with the canonical forms of the stored ones.
+ *
+ * Returns the position of the matching slave (>= 0) and stores its dict
+ * key in *@slavekey, -1 when no stored slave matches, and -2 when the
+ * stored entries cannot be parsed or the transformation fails.
+ */
+static int
+glusterd_get_slave(glusterd_volinfo_t *vol, const char *slaveurl,
+                   char **slavekey)
+{
+    runner_t runner = {
+        0,
+    };
+    int n = 0;
+    int i = 0;
+    char **linearr = NULL;
+    int32_t ret = 0;
+
+    glusterd_urltransform_init(&runner, "canonicalize");
+    ret = dict_foreach(vol->gsync_slaves, _glusterd_urltransform_add_iter,
+                       &runner);
+    if (ret < 0) {
+        /* The runner was initialised but never started; end it so the
+         * arguments collected so far are released. */
+        runner_end(&runner);
+        return -2;
+    }
+
+    glusterd_urltransform_add(&runner, slaveurl);
+
+    /* glusterd_urltransform() ends the runner on every path. */
+    n = glusterd_urltransform(&runner, &linearr);
+    if (n == -1)
+        return -2;
+
+    /* the last line belongs to slaveurl itself */
+    for (i = 0; i < n - 1; i++) {
+        if (strcmp(linearr[i], linearr[n - 1]) == 0)
+            break;
+    }
+    glusterd_urltransform_free(linearr, n);
+
+    if (i < n - 1)
+        *slavekey = dict_get_by_index(vol->gsync_slaves, i);
+    else
+        i = -1;
+
+    return i;
+}
+
+/*
+ * Start @runner with its standard output piped back, let @fcbk consume
+ * that stream using @resbuf (of @blen bytes) and @data, then end the
+ * runner.
+ *
+ * Returns 0 only when the child could be spawned and both @fcbk and
+ * runner_end() reported 0; returns -1 otherwise.
+ */
+static int
+glusterd_query_extutil_generic(char *resbuf, size_t blen, runner_t *runner,
+                               void *data,
+                               int (*fcbk)(char *resbuf, size_t blen, FILE *fp,
+                                           void *data))
+{
+    xlator_t *this = THIS;
+    int failed = 0;
+
+    GF_ASSERT(this);
+
+    runner_redir(runner, STDOUT_FILENO, RUN_PIPE);
+    if (runner_start(runner) != 0) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_SPAWNING_CHILD_FAILED,
+               "spawning child failed");
+
+        return -1;
+    }
+
+    failed = fcbk(resbuf, blen, runner_chio(runner, STDOUT_FILENO), data);
+    failed |= runner_end(runner);
+    if (!failed)
+        return 0;
+
+    gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_READ_CHILD_DATA_FAILED,
+           "reading data from child failed");
+    return -1;
+}
+
+/*
+ * Reader callback: fetch one line from @fp into @resbuf (at most @blen - 1
+ * characters) and drop its trailing newline, if any. @data is unused.
+ *
+ * Returns -1 when errno is non-zero after the read attempt, 0 otherwise.
+ */
+static int
+_fcbk_singleline(char *resbuf, size_t blen, FILE *fp, void *data)
+{
+    size_t used = 0;
+
+    errno = 0;
+    if (fgets(resbuf, blen, fp) != NULL) {
+        used = strlen(resbuf);
+        if (used > 0 && resbuf[used - 1] == '\n')
+            resbuf[used - 1] = '\0'; /* strip off \n */
+    }
+
+    return (errno != 0) ? -1 : 0;
+}
+
+/*
+ * Run @runner and store the first line of its output in @resbuf.
+ * The buffer length handed down is PATH_MAX, so @resbuf must provide at
+ * least that many bytes.
+ */
+static int
+glusterd_query_extutil(char *resbuf, runner_t *runner)
+{
+    int ret = glusterd_query_extutil_generic(resbuf, PATH_MAX, runner, NULL,
+                                             _fcbk_singleline);
+
+    return ret;
+}
+
+/*
+ * Ask gsyncd ("--slavevoluuid-get <slave_host>::<slave_vol>") for the uuid
+ * of the slave volume and store the first output line in @vol_uuid.
+ *
+ * The big lock is released while the external command runs and re-taken
+ * afterwards.
+ *
+ * NOTE(review): the result is read through glusterd_query_extutil(), which
+ * assumes a PATH_MAX sized buffer - confirm that callers size @vol_uuid
+ * accordingly.
+ *
+ * Returns the result of glusterd_query_extutil(), or -1 when the xlator or
+ * its private data is missing.
+ */
+static int
+glusterd_get_slave_voluuid(char *slave_host, char *slave_vol, char *vol_uuid)
+{
+    runner_t gsyncd = {
+        0,
+    };
+    xlator_t *this = THIS;
+    glusterd_conf_t *conf = NULL;
+    int ret = -1;
+
+    GF_VALIDATE_OR_GOTO("glusterd", this, out);
+
+    conf = this->private;
+    GF_VALIDATE_OR_GOTO(this->name, conf, out);
+
+    runinit(&gsyncd);
+    runner_add_arg(&gsyncd, GSYNCD_PREFIX "/gsyncd");
+    set_gsyncd_inet6_arg(&gsyncd);
+    runner_add_arg(&gsyncd, "--slavevoluuid-get");
+    runner_argprintf(&gsyncd, "%s::%s", slave_host, slave_vol);
+
+    synclock_unlock(&conf->big_lock);
+    ret = glusterd_query_extutil(vol_uuid, &gsyncd);
+    synclock_lock(&conf->big_lock);
+
+out:
+    return ret;
+}
+
+/*
+ * Reader callback: parse "key: value" lines from @fp into the dict passed
+ * through @data.
+ *
+ * @resbuf (of @blen bytes) is used as the line buffer. Trailing white space
+ * is stripped from every line; lines that are empty (or only one character
+ * long) after stripping are skipped. The text before the first ':' becomes
+ * the key, the text after it - minus leading white space - is duplicated
+ * and stored as the value.
+ *
+ * Returns 0 on success, -1 when a line has no ':', when memory cannot be
+ * allocated, when the dict rejects the entry, or when errno is set once
+ * reading stops.
+ */
+static int
+_fcbk_conftodict(char *resbuf, size_t blen, FILE *fp, void *data)
+{
+    char *ptr = NULL;
+    dict_t *dict = data;
+    char *v = NULL;
+    size_t len = 0;
+
+    for (;;) {
+        errno = 0;
+        ptr = fgets(resbuf, blen - 2, fp);
+        if (!ptr)
+            break;
+
+        /* Strip trailing space by length, so that a line made only of
+         * white space can never move the cursor in front of resbuf[0]. */
+        len = strlen(resbuf);
+        while (len > 0 && isspace((unsigned char)resbuf[len - 1]))
+            resbuf[--len] = '\0';
+        if (len <= 1)
+            /* skip empty line (a lone character was skipped before, too) */
+            continue;
+
+        v = strchr(resbuf, ':');
+        if (!v)
+            return -1;
+        *v++ = '\0';
+        while (isspace((unsigned char)*v))
+            v++;
+        v = gf_strdup(v);
+        if (!v)
+            return -1;
+        if (dict_set_dynstr(dict, resbuf, v) != 0) {
+            GF_FREE(v);
+            return -1;
+        }
+    }
+
+    return errno ? -1 : 0;
+}
+
+/*
+ * Load the whole gsyncd configuration of a session into @dict.
+ *
+ * Runs "gsyncd -c <conf_path> [--<address-family>] --iprefix=<DATADIR>
+ * :<master> <slave> --config-get-all" and feeds its output to
+ * _fcbk_conftodict().
+ *
+ * Returns the result of glusterd_query_extutil_generic().
+ */
+static int
+glusterd_gsync_get_config(char *master, char *slave, char *conf_path,
+                          dict_t *dict)
+{
+    /* one "key: value" line; the value may be as long as a path */
+    char line[256 + PATH_MAX] = {
+        0,
+    };
+    runner_t gsyncd = {
+        0,
+    };
+    int ret = 0;
+
+    runinit(&gsyncd);
+    runner_add_args(&gsyncd, GSYNCD_PREFIX "/gsyncd", "-c", NULL);
+    runner_argprintf(&gsyncd, "%s", conf_path);
+    set_gsyncd_inet6_arg(&gsyncd);
+    runner_argprintf(&gsyncd, "--iprefix=%s", DATADIR);
+    runner_argprintf(&gsyncd, ":%s", master);
+    runner_add_args(&gsyncd, slave, "--config-get-all", NULL);
+
+    ret = glusterd_query_extutil_generic(line, sizeof(line), &gsyncd, dict,
+                                         _fcbk_conftodict);
+    return ret;
+}
+
+/*
+ * Per-line parser for the "key: value" status output read from fp.  Each
+ * recognised key is copied (truncated to fit, always NUL terminated) into
+ * the field of the same name in the gf_gsync_status_t passed through data;
+ * unknown keys are ignored.  glusterd_gsync_get_status() passes this as the
+ * callback of a "gsyncd ... --status-get" query.
+ *
+ * resbuf/blen is the caller's line buffer.  Returns 0 when the stream is
+ * exhausted, -1 when a line of two or more characters lacks ':' or errno is
+ * set when reading stops.
+ */
+static int
+_fcbk_statustostruct(char *resbuf, size_t blen, FILE *fp, void *data)
+{
+    char *end = NULL;
+    char *v = NULL;
+    const char *k = NULL;
+    gf_gsync_status_t *sts_val = NULL;
+    size_t len = 0;
+
+    sts_val = (gf_gsync_status_t *)data;
+
+    for (;;) {
+        errno = 0;
+        if (!fgets(resbuf, blen - 2, fp))
+            break;
+
+        /* strip trailing space without ever stepping in front of resbuf */
+        end = resbuf + strlen(resbuf);
+        while (end > resbuf && isspace((unsigned char)end[-1]))
+            *--end = '\0';
+        if (end - resbuf <= 1)
+            /* skip empty line (one-character lines were always skipped) */
+            continue;
+        v = strchr(resbuf, ':');
+        if (!v)
+            return -1;
+        *v++ = '\0';
+        while (isspace((unsigned char)*v))
+            v++;
+        k = resbuf; /* key and value are used in place: no copies needed */
+
+        if (strcmp(k, "worker_status") == 0) {
+            len = min(strlen(v), (sizeof(sts_val->worker_status) - 1));
+            memcpy(sts_val->worker_status, v, len);
+            sts_val->worker_status[len] = '\0';
+        } else if (strcmp(k, "slave_node") == 0) {
+            len = min(strlen(v), (sizeof(sts_val->slave_node) - 1));
+            memcpy(sts_val->slave_node, v, len);
+            sts_val->slave_node[len] = '\0';
+        } else if (strcmp(k, "crawl_status") == 0) {
+            len = min(strlen(v), (sizeof(sts_val->crawl_status) - 1));
+            memcpy(sts_val->crawl_status, v, len);
+            sts_val->crawl_status[len] = '\0';
+        } else if (strcmp(k, "last_synced") == 0) {
+            len = min(strlen(v), (sizeof(sts_val->last_synced) - 1));
+            memcpy(sts_val->last_synced, v, len);
+            sts_val->last_synced[len] = '\0';
+        } else if (strcmp(k, "last_synced_utc") == 0) {
+            len = min(strlen(v), (sizeof(sts_val->last_synced_utc) - 1));
+            memcpy(sts_val->last_synced_utc, v, len);
+            sts_val->last_synced_utc[len] = '\0';
+        } else if (strcmp(k, "entry") == 0) {
+            len = min(strlen(v), (sizeof(sts_val->entry) - 1));
+            memcpy(sts_val->entry, v, len);
+            sts_val->entry[len] = '\0';
+        } else if (strcmp(k, "data") == 0) {
+            len = min(strlen(v), (sizeof(sts_val->data) - 1));
+            memcpy(sts_val->data, v, len);
+            sts_val->data[len] = '\0';
+        } else if (strcmp(k, "meta") == 0) {
+            len = min(strlen(v), (sizeof(sts_val->meta) - 1));
+            memcpy(sts_val->meta, v, len);
+            sts_val->meta[len] = '\0';
+        } else if (strcmp(k, "failures") == 0) {
+            len = min(strlen(v), (sizeof(sts_val->failures) - 1));
+            memcpy(sts_val->failures, v, len);
+            sts_val->failures[len] = '\0';
+        } else if (strcmp(k, "checkpoint_time") == 0) {
+            len = min(strlen(v), (sizeof(sts_val->checkpoint_time) - 1));
+            memcpy(sts_val->checkpoint_time, v, len);
+            sts_val->checkpoint_time[len] = '\0';
+        } else if (strcmp(k, "checkpoint_time_utc") == 0) {
+            len = min(strlen(v), (sizeof(sts_val->checkpoint_time_utc) - 1));
+            memcpy(sts_val->checkpoint_time_utc, v, len);
+            sts_val->checkpoint_time_utc[len] = '\0';
+        } else if (strcmp(k, "checkpoint_completed") == 0) {
+            len = min(strlen(v), (sizeof(sts_val->checkpoint_completed) - 1));
+            memcpy(sts_val->checkpoint_completed, v, len);
+            sts_val->checkpoint_completed[len] = '\0';
+        } else if (strcmp(k, "checkpoint_completion_time") == 0) {
+            len = min(strlen(v),
+                      (sizeof(sts_val->checkpoint_completion_time) - 1));
+            memcpy(sts_val->checkpoint_completion_time, v, len);
+            sts_val->checkpoint_completion_time[len] = '\0';
+        } else if (strcmp(k, "checkpoint_completion_time_utc") == 0) {
+            len = min(strlen(v),
+                      (sizeof(sts_val->checkpoint_completion_time_utc) - 1));
+            memcpy(sts_val->checkpoint_completion_time_utc, v, len);
+            sts_val->checkpoint_completion_time_utc[len] = '\0';
+        }
+    }
+
+    return errno ? -1 : 0;
+}
+
+/*
+ * Run "gsyncd -c <conf_path> ... :<master> <slave> --status-get --path
+ * <brick_path>" through glusterd_query_extutil_generic(), which is handed
+ * _fcbk_statustostruct() to fill sts_val; its return value is passed on.
+ */
+static int
+glusterd_gsync_get_status(char *master, char *slave, char *conf_path,
+                          char *brick_path, gf_gsync_status_t *sts_val)
+{
+    /* key + value, where value must be able to accommodate a path */
+    char linebuf[256 + PATH_MAX] = {0};
+    runner_t cmd = {0};
+
+    runinit(&cmd);
+    runner_add_args(&cmd, GSYNCD_PREFIX "/gsyncd", "-c", NULL);
+    runner_argprintf(&cmd, "%s", conf_path);
+    set_gsyncd_inet6_arg(&cmd);
+    runner_argprintf(&cmd, "--iprefix=%s", DATADIR);
+    runner_argprintf(&cmd, ":%s", master);
+    runner_add_args(&cmd, slave, "--status-get", "--path", brick_path, NULL);
+
+    return glusterd_query_extutil_generic(linebuf, sizeof(linebuf), &cmd,
+                                          sts_val, _fcbk_statustostruct);
+}
+
+/* Query gsyncd (--config-get) for the "<param>-file" option of the session;
+ * prmfile is the buffer handed to glusterd_query_extutil() for the result. */
+static int
+glusterd_gsync_get_param_file(char *prmfile, const char *param, char *master,
+                              char *slave, char *conf_path)
+{
+    runner_t cmd = {0};
+
+    runinit(&cmd);
+    runner_add_args(&cmd, GSYNCD_PREFIX "/gsyncd", "-c", NULL);
+    runner_argprintf(&cmd, "%s", conf_path);
+    set_gsyncd_inet6_arg(&cmd);
+    runner_argprintf(&cmd, "--iprefix=%s", DATADIR);
+    runner_argprintf(&cmd, ":%s", master);
+    runner_add_args(&cmd, slave, "--config-get", NULL);
+    runner_argprintf(&cmd, "%s-file", param);
+
+    return glusterd_query_extutil(prmfile, &cmd);
+}
+
+/*
+ * Look up the session's pid-file path into pidfile and open it O_RDWR.
+ * conf_path is used if it exists, else the template under priv->workdir
+ * (also tried once when conf_path yields no pid-file path); choosing the
+ * template sets *is_template_in_use.  Returns the open() result, -2 when
+ * the template gives no pid-file path either, else the failed step's code.
+ */
+static int
+gsyncd_getpidfile(char *master, char *slave, char *pidfile, char *conf_path,
+                  gf_boolean_t *is_template_in_use)
+{
+    xlator_t *this = THIS;
+    glusterd_conf_t *priv = NULL;
+    struct stat st = {0};
+    char template_path[PATH_MAX] = "";
+    char *active_conf = NULL;
+    int32_t n = 0;
+    int ret = -1;
+
+    GF_ASSERT(this);
+    GF_ASSERT(this->private);
+    GF_ASSERT(conf_path);
+
+    priv = this->private;
+
+    GF_VALIDATE_OR_GOTO("gsync", master, out);
+    GF_VALIDATE_OR_GOTO("gsync", slave, out);
+
+    n = snprintf(template_path, sizeof(template_path),
+                 "%s/" GSYNC_CONF_TEMPLATE, priv->workdir);
+    if ((n < 0) || (n >= sizeof(template_path)))
+        goto out;
+
+    if (sys_lstat(conf_path, &st) == 0) {
+        gf_msg_debug(this->name, 0, "Using passed config template(%s).",
+                     conf_path);
+        active_conf = conf_path;
+    } else {
+        gf_msg(this->name, GF_LOG_WARNING, ENOENT, GD_MSG_FILE_OP_FAILED,
+               "Config file (%s) missing. Looking for template "
+               "config file (%s)",
+               conf_path, template_path);
+        ret = sys_lstat(template_path, &st);
+        if (ret) {
+            gf_msg(this->name, GF_LOG_ERROR, ENOENT, GD_MSG_FILE_OP_FAILED,
+                   "Template config file (%s) missing.", template_path);
+            goto out;
+        }
+        gf_msg(this->name, GF_LOG_INFO, 0, GD_MSG_DEFAULT_TEMP_CONFIG,
+               "Using default config template(%s).", template_path);
+        active_conf = template_path;
+        *is_template_in_use = _gf_true;
+    }
+
+fetch_data:
+
+    ret = glusterd_gsync_get_param_file(pidfile, "pid", master, slave,
+                                        active_conf);
+    if ((ret == -1) || strlen(pidfile) == 0) {
+        if (*is_template_in_use == _gf_false) {
+            gf_msg(this->name, GF_LOG_WARNING, 0, GD_MSG_PIDFILE_CREATE_FAILED,
+                   "failed to create the pidfile string. "
+                   "Trying default config template");
+            active_conf = template_path;
+            *is_template_in_use = _gf_true;
+            goto fetch_data;
+        }
+        ret = -2;
+        gf_msg(this->name, GF_LOG_WARNING, 0, GD_MSG_PIDFILE_CREATE_FAILED,
+               "failed to "
+               "create the pidfile string from template "
+               "config");
+        goto out;
+    }
+
+    gf_msg_debug(this->name, 0, "pidfile = %s", pidfile);
+
+    ret = open(pidfile, O_RDWR);
+out:
+    return ret;
+}
+
+/* 0 if lockf() reports the pid-file behind fd as locked, else -1 */
+static int
+gsync_status_byfd(int fd)
+{
+    GF_ASSERT(fd >= -1);
+
+    if (lockf(fd, F_TEST, 0) != -1)
+        return -1;
+    /* gsyncd keeps the pidfile locked */
+    return (errno == EAGAIN || errno == EACCES) ? 0 : -1;
+}
+
+/* Probe whether gsyncd is running for the master/slave session.
+ * *status: 0 when the pid-file is locked (gsyncd running), -1 otherwise
+ * returns: 0 once *status is valid, -1 when no pid-file path was found
+ */
+int
+gsync_status(char *master, char *slave, char *conf_path, int *status,
+             gf_boolean_t *is_template_in_use)
+{
+    char pidfile[PATH_MAX] = {0};
+    int fd = -1;
+
+    fd = gsyncd_getpidfile(master, slave, pidfile, conf_path,
+                           is_template_in_use);
+    if (fd == -2)
+        return -1;
+
+    *status = gsync_status_byfd(fd);
+    /* any other negative fd means the pid-file was not opened: don't close */
+    if (fd >= 0)
+        sys_close(fd);
+
+    return 0;
+}
+
+/*
+ * Store a private copy of value under key in volinfo->dict.
+ * Returns 0 on success, -1 if the copy cannot be allocated, otherwise the
+ * non-zero result of dict_set_dynstr().
+ */
+static int32_t
+glusterd_gsync_volinfo_dict_set(glusterd_volinfo_t *volinfo, char *key,
+                                char *value)
+{
+    xlator_t *this = THIS;
+    char *value_copy = NULL;
+    int32_t rc = 0;
+
+    GF_ASSERT(this);
+
+    value_copy = gf_strdup(value);
+    if (!value_copy) {
+        gf_msg(this->name, GF_LOG_ERROR, ENOMEM, GD_MSG_NO_MEMORY,
+               "Unable to allocate memory");
+        return -1;
+    }
+
+    rc = dict_set_dynstr(volinfo->dict, key, value_copy);
+    if (rc)
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_DICT_SET_FAILED,
+               "Unable to set dict");
+
+    return rc;
+}
+
+/*
+ * Check that gsyncd can be spawned for the session by running
+ * "gsyncd --verify spawning :<master> <slave>" with its stdout redirected
+ * to a pipe.  Returns 0 when runner_start() and runner_end() both report
+ * success, -1 otherwise.
+ */
+static int
+glusterd_verify_gsyncd_spawn(char *master, char *slave)
+{
+    xlator_t *this = THIS;
+    runner_t cmd = {0};
+    int rc = 0;
+
+    GF_ASSERT(this);
+
+    runinit(&cmd);
+    runner_add_args(&cmd, GSYNCD_PREFIX "/gsyncd", "--verify", "spawning",
+                    NULL);
+    runner_argprintf(&cmd, ":%s", master);
+    runner_add_args(&cmd, slave, NULL);
+    runner_redir(&cmd, STDOUT_FILENO, RUN_PIPE);
+
+    if (runner_start(&cmd)) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_SPAWNING_CHILD_FAILED,
+               "spawning child failed");
+        rc = -1;
+    } else if (runner_end(&cmd) != 0) {
+        rc = -1;
+    }
+
+    gf_msg_debug(this->name, 0, "returning %d", rc);
+    return rc;
+}
+
+/*
+ * Validate a geo-replication "config" request held in dict ("subop",
+ * "slave", and depending on the subop "op_name"/"op_value").  The option
+ * name is checked with "gsyncd --config-check" and against
+ * gsync_reserved_opts; a value is checked against gsync_confopt_vals.
+ *
+ * Returns 0 if the request is acceptable (or gsyncd cannot be spawned to
+ * check it), -1 with an allocated message in *op_errstr otherwise.
+ */
+static int
+gsync_verify_config_options(dict_t *dict, char **op_errstr, char *volname)
+{
+    char **resopt = NULL;
+    int i = 0;
+    char *subop = NULL;
+    char *slave = NULL;
+    char *op_name = NULL;
+    char *op_value = NULL;
+    char *t = NULL;
+    char errmsg[PATH_MAX] = "";
+    gf_boolean_t banned = _gf_true;
+    gf_boolean_t op_match = _gf_true;
+    gf_boolean_t val_match = _gf_true;
+    struct gsync_config_opt_vals_ *conf_vals = NULL;
+    xlator_t *this = THIS;
+
+    GF_ASSERT(this);
+
+    if (dict_get_str(dict, "subop", &subop) != 0) {
+        gf_msg(this->name, GF_LOG_WARNING, 0, GD_MSG_DICT_GET_FAILED,
+               "missing subop");
+        *op_errstr = gf_strdup("Invalid config request");
+        return -1;
+    }
+
+    if (dict_get_str(dict, "slave", &slave) != 0) {
+        gf_msg(this->name, GF_LOG_WARNING, 0, GD_MSG_DICT_GET_FAILED,
+               GEOREP " CONFIG: no slave given");
+        *op_errstr = gf_strdup("Slave required");
+        return -1;
+    }
+
+    if (strcmp(subop, "get-all") == 0)
+        return 0;
+
+    if (dict_get_str(dict, "op_name", &op_name) != 0) {
+        gf_msg(this->name, GF_LOG_WARNING, 0, GD_MSG_DICT_GET_FAILED,
+               "option name missing");
+        *op_errstr = gf_strdup("Option name missing");
+        return -1;
+    }
+
+    if (runcmd(GSYNCD_PREFIX "/gsyncd", "--config-check", op_name, NULL)) {
+        if (glusterd_verify_gsyncd_spawn(volname, slave)) {
+            gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_GSYNCD_SPAWN_FAILED,
+                   "Unable to spawn gsyncd");
+            return 0;
+        }
+        gf_msg(this->name, GF_LOG_WARNING, EINVAL, GD_MSG_INVALID_ENTRY,
+               "Invalid option %s", op_name);
+        *op_errstr = gf_strdup("Invalid option");
+        return -1;
+    }
+
+    if (strcmp(subop, "get") == 0)
+        return 0;
+
+    t = strtail(subop, "set");
+    if (!t)
+        t = strtail(subop, "del");
+    if (!t || (t[0] && strcmp(t, "-glob") != 0)) {
+        gf_msg(this->name, GF_LOG_WARNING, 0, GD_MSG_SUBOP_NOT_FOUND,
+               "unknown subop %s", subop);
+        *op_errstr = gf_strdup("Invalid config request");
+        return -1;
+    }
+
+    if (strtail(subop, "set") &&
+        dict_get_str(dict, "op_value", &op_value) != 0) {
+        gf_msg(this->name, GF_LOG_WARNING, 0, GD_MSG_DICT_GET_FAILED,
+               "missing value for set");
+        *op_errstr = gf_strdup("missing value");
+        return -1;
+    }
+
+    /* match option name against reserved options, modulo -/_
+     * difference */
+    for (resopt = gsync_reserved_opts; *resopt; resopt++) {
+        banned = _gf_true;
+        for (i = 0; (*resopt)[i] && op_name[i]; i++) {
+            if ((*resopt)[i] == op_name[i] ||
+                ((*resopt)[i] == '-' && op_name[i] == '_'))
+                continue;
+            banned = _gf_false;
+        }
+
+        if (op_name[i] != '\0')
+            banned = _gf_false;
+
+        if (banned) {
+            gf_msg(this->name, GF_LOG_WARNING, 0, GD_MSG_RESERVED_OPTION,
+                   "Reserved option %s", op_name);
+            *op_errstr = gf_strdup("Reserved option");
+            return -1;
+        }
+    }
+
+    /* Check options in gsync_confopt_vals for invalid values */
+    for (conf_vals = gsync_confopt_vals; conf_vals->op_name; conf_vals++) {
+        op_match = _gf_true;
+        for (i = 0; conf_vals->op_name[i] && op_name[i]; i++) {
+            if (conf_vals->op_name[i] == op_name[i] ||
+                (conf_vals->op_name[i] == '_' && op_name[i] == '-'))
+                continue;
+            op_match = _gf_false;
+        }
+
+        if (op_match) {
+            if (!op_value)
+                goto out;
+            val_match = _gf_false;
+            for (i = 0; i < conf_vals->no_of_pos_vals; i++) {
+                if (conf_vals->case_sensitive) {
+                    if (!strcmp(conf_vals->values[i], op_value))
+                        val_match = _gf_true;
+                } else {
+                    if (!strcasecmp(conf_vals->values[i], op_value))
+                        val_match = _gf_true;
+                }
+            }
+
+            if (!val_match) {
+                /* snprintf() always terminates; a long value is just cut */
+                snprintf(errmsg, sizeof(errmsg),
+                         "Invalid value(%s) for option %s", op_value,
+                         op_name);
+
+                gf_msg(this->name, GF_LOG_ERROR, EINVAL, GD_MSG_INVALID_ENTRY,
+                       "%s", errmsg);
+                *op_errstr = gf_strdup(errmsg);
+                return -1;
+            }
+        }
+    }
+out:
+    return 0;
+}
+
+static int
+glusterd_get_gsync_status_mst_slv(glusterd_volinfo_t *volinfo, char *slave,
+                                  char *conf_path, dict_t *rsp_dict,
+                                  char *node); /* forward declaration */
+
+/*
+ * dict_foreach()-style callback: build the session's gsyncd.conf path for
+ * the slave entry in value and call glusterd_get_gsync_status_mst_slv().
+ */
+static int
+_get_status_mst_slv(dict_t *dict, char *key, data_t *value, void *data)
+{
+    glusterd_gsync_status_temp_t *param = NULL;
+    char *slave = NULL;
+    char *slave_url = NULL;
+    char *slave_vol = NULL;
+    char *slave_host = NULL;
+    char *errmsg = NULL;
+    char conf_path[PATH_MAX] = "";
+    int ret = -1;
+    glusterd_conf_t *priv = NULL;
+    xlator_t *this = NULL;
+    char slv_url[VOLINFO_SLAVE_URL_MAX] = {0};
+
+    this = THIS;
+    GF_VALIDATE_OR_GOTO("glusterd", this, out);
+
+    param = (glusterd_gsync_status_temp_t *)data;
+
+    GF_VALIDATE_OR_GOTO(this->name, param, out);
+    GF_VALIDATE_OR_GOTO(this->name, param->volinfo, out);
+
+    if (this)
+        priv = this->private;
+    GF_VALIDATE_OR_GOTO(this->name, priv, out);
+
+    if (snprintf(slv_url, sizeof(slv_url), "%s", value->data) >=
+        sizeof(slv_url)) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_SLAVE_VOL_PARSE_FAIL,
+               "Error in copying slave: %s!", value->data);
+        goto out;
+    }
+
+    ret = parse_slave_url(slv_url, &slave);
+    if (ret == -1) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_SLAVE_VOL_PARSE_FAIL,
+               "Error in parsing slave: %s!", value->data);
+        goto out;
+    }
+
+    ret = glusterd_get_slave_info(slave, &slave_url, &slave_host, &slave_vol,
+                                  &errmsg);
+    if (ret) {
+        if (errmsg)
+            gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_SLAVEINFO_FETCH_ERROR,
+                   "Unable to fetch slave details. Error: %s", errmsg);
+        else
+            gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_SLAVEINFO_FETCH_ERROR,
+                   "Unable to fetch slave details.");
+        ret = -1;
+        goto out;
+    }
+
+    ret = snprintf(conf_path, sizeof(conf_path),
+                   "%s/" GEOREP "/%s_%s_%s/gsyncd.conf", priv->workdir,
+                   param->volinfo->volname, slave_host, slave_vol);
+    if ((ret < 0) || (ret >= sizeof(conf_path))) {
+        /* a truncated path must not be used, nor ret used as an index */
+        ret = -1;
+        goto out;
+    }
+
+    ret = glusterd_get_gsync_status_mst_slv(param->volinfo, slave, conf_path,
+                                            param->rsp_dict, param->node);
+out:
+    if (errmsg)
+        GF_FREE(errmsg);
+    if (slave_vol)
+        GF_FREE(slave_vol);
+    if (slave_url)
+        GF_FREE(slave_url);
+    if (slave_host)
+        GF_FREE(slave_host);
+
+    gf_msg_debug(this ? this->name : "glusterd", 0, "Returning %d.", ret);
+    return ret;
+}
+
+/* dict_foreach() callback: raise *data (an int) to the N of a "slaveN" key */
+static int
+_get_max_gsync_slave_num(dict_t *dict, char *key, data_t *value, void *data)
+{
+    int *max_idx = data;
+    int idx = 0;
+
+    sscanf(key, "slave%d", &idx);
+    *max_idx = (idx > *max_idx) ? idx : *max_idx;
+
+    return 0;
+}
+
+/*
+ * dict_foreach() callback looking for a stored slave entry whose volume
+ * uuid equals slave_cfg->slave_voluuid.  Returns 0 if this entry does not
+ * match, -1 if it does (its "slaveN" index is saved in old_slvidx), and -2
+ * if the entry is empty or has fewer than five ':' separators.
+ */
+static int
+_get_slave_idx_slave_voluuid(dict_t *dict, char *key, data_t *value, void *data)
+{
+    char *slave_info = NULL;
+    xlator_t *this = THIS;
+    struct slave_vol_config *slave_cfg = NULL;
+    int i = 0;
+    int ret = -1;
+    unsigned tmp_slvnum = 0;
+
+    GF_VALIDATE_OR_GOTO("glusterd", this, out);
+
+    slave_cfg = data;
+
+    if (value)
+        slave_info = value->data;
+
+    if (!(slave_info) || strlen(slave_info) == 0) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_INVALID_SLAVE,
+               "Invalid slave in dict");
+        ret = -2;
+        goto out;
+    }
+
+    /* slave format:
+     * master_node_uuid:ssh://slave_host::slave_vol:slave_voluuid */
+    while (i++ < 5) {
+        slave_info = strchr(slave_info, ':');
+        if (slave_info)
+            slave_info++;
+        else {
+            gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_SLAVE_VOL_PARSE_FAIL,
+                   "slave_info becomes NULL!");
+            ret = -2;
+            goto out;
+        }
+    }
+    if (strcmp(slave_info, slave_cfg->slave_voluuid) == 0) {
+        gf_msg_debug(this->name, 0, "Same slave volume already present %s",
+                     slave_cfg->slave_voluuid);
+        ret = -1;
+
+        /* tmp_slvnum is unsigned, so scan and print it with %u */
+        sscanf(key, "slave%u", &tmp_slvnum);
+        slave_cfg->old_slvidx = tmp_slvnum;
+        gf_msg_debug(this->name, 0, "and its index is: %u", tmp_slvnum);
+        goto out;
+    }
+
+    ret = 0;
+out:
+    return ret;
+}
+
+/*
+ * Delete every gsync_slaves entry that glusterd_get_slave() finds for slave
+ * and store volinfo.  When the very first lookup fails nothing is stored
+ * and the lookup result plus one is returned (0 for -1); otherwise the
+ * result of glusterd_store_volinfo(), with *op_errstr set on failure.
+ */
+static int
+glusterd_remove_slave_in_info(glusterd_volinfo_t *volinfo, char *slave,
+                              char **op_errstr)
+{
+    int zero_slave_entries = _gf_true;
+    int ret = 0;
+    char *slavekey = NULL;
+    xlator_t *this = THIS;
+
+    GF_ASSERT(this);
+    GF_ASSERT(volinfo);
+    GF_ASSERT(slave);
+
+    do {
+        ret = glusterd_get_slave(volinfo, slave, &slavekey);
+        if (ret < 0 && zero_slave_entries) {
+            ret++;
+            goto out;
+        }
+        zero_slave_entries = _gf_false;
+        dict_del(volinfo->gsync_slaves, slavekey);
+    } while (ret >= 0);
+
+    ret = glusterd_store_volinfo(volinfo, GLUSTERD_VOLINFO_VER_AC_INCREMENT);
+    if (ret)
+        *op_errstr = gf_strdup("Failed to store the Volume information");
+out:
+    gf_msg_debug(this->name, 0, "returning %d", ret);
+    return ret;
+}
+
+/* Parse the master node uuid that prefixes slave's gsync_slaves entry into
+ * uuid.  Returns gf_uuid_parse()'s result, or -1 when the entry cannot be
+ * found or has no ':' (checked explicitly so NULL is never dereferenced). */
+static int
+glusterd_gsync_get_uuid(char *slave, glusterd_volinfo_t *vol, uuid_t uuid)
+{
+    int ret = 0;
+    char *slavekey = NULL;
+    char *slaveentry = NULL;
+    char *t = NULL;
+    xlator_t *this = THIS;
+
+    GF_ASSERT(this);
+    GF_ASSERT(vol);
+    GF_ASSERT(slave);
+
+    ret = glusterd_get_slave(vol, slave, &slavekey);
+    if (ret < 0) {
+        /* XXX failure and non-extant slave collide here: callers only
+         * understand -1 and 0; finer codes need the callers fixed too. */
+        ret = -1;
+        goto out;
+    }
+
+    ret = dict_get_str(vol->gsync_slaves, slavekey, &slaveentry);
+    GF_ASSERT(ret == 0);
+    t = ret ? NULL : strchr(slaveentry, ':');
+    GF_ASSERT(t);
+    if (!t) {
+        ret = -1;
+        goto out;
+    }
+    *t = '\0';
+    ret = gf_uuid_parse(slaveentry, uuid);
+    *t = ':';
+
+out:
+    gf_msg_debug(this->name, 0, "Returning %d", ret);
+    return ret;
+}
+
+/*
+ * dict_foreach() callback over volinfo->gsync_slaves: an entry still in the
+ * old format (four ':') gets the slave volume uuid appended and *data (a
+ * gf_boolean_t) is set to _gf_true.  Failing to fetch the remote uuid is
+ * tolerated.  Returns 0, or non-zero (normally -1) on any other failure.
+ */
+static int
+update_slave_voluuid(dict_t *dict, char *key, data_t *value, void *data)
+{
+    char *slave = NULL;
+    char *slave_url = NULL;
+    char *slave_vol = NULL;
+    char *slave_host = NULL;
+    char *errmsg = NULL;
+    xlator_t *this = THIS;
+    int ret = -1;
+    char slv_url[VOLINFO_SLAVE_URL_MAX] = {0};
+    char slave_voluuid[GF_UUID_BUF_SIZE] = {0};
+    char *slave_info = value->data;
+    char *new_value = NULL;
+    char *same_key = NULL;
+    int cnt = 0;
+    gf_boolean_t *voluuid_updated = data;
+
+    gf_msg_debug(this->name, 0, "slave_info: %s!", slave_info);
+
+    /* old slave format:
+     * master_node_uuid:ssh://slave_host::slave_vol
+     * New slave format:
+     * master_node_uuid:ssh://slave_host::slave_vol:slave_voluuid */
+    while (slave_info && (slave_info = strchr(slave_info, ':'))) {
+        cnt++;
+        slave_info++;
+    }
+
+    gf_msg_debug(this->name, 0, "cnt: %d", cnt);
+    /* check whether old slave format and update vol uuid if old format.
+     * With volume uuid, number of ':' is 5 and is 4 without.
+     */
+    if (cnt == 4) {
+        if (snprintf(slv_url, sizeof(slv_url), "%s", value->data) >=
+            sizeof(slv_url)) {
+            gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_SLAVE_VOL_PARSE_FAIL,
+                   "Error in copying slave: %s!", value->data);
+            goto out;
+        }
+
+        ret = parse_slave_url(slv_url, &slave);
+        if (ret == -1) {
+            gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_SLAVE_VOL_PARSE_FAIL,
+                   "Error in parsing slave: %s!", value->data);
+            goto out;
+        }
+
+        ret = glusterd_get_slave_info(slave, &slave_url, &slave_host,
+                                      &slave_vol, &errmsg);
+        if (ret) {
+            if (errmsg)
+                gf_msg(this->name, GF_LOG_ERROR, 0,
+                       GD_MSG_SLAVEINFO_FETCH_ERROR,
+                       "Unable to fetch slave details. Error: %s", errmsg);
+            else
+                gf_msg(this->name, GF_LOG_ERROR, 0,
+                       GD_MSG_SLAVEINFO_FETCH_ERROR,
+                       "Unable to fetch slave details.");
+            ret = -1;
+            goto out;
+        }
+
+        ret = glusterd_get_slave_voluuid(slave_host, slave_vol, slave_voluuid);
+        if ((ret) || (strlen(slave_voluuid) == 0)) {
+            gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_REMOTE_VOL_UUID_FAIL,
+                   "Unable to get remote volume uuid "
+                   "slavehost:%s slavevol:%s",
+                   slave_host, slave_vol);
+            /* Avoiding failure due to remote vol uuid fetch */
+            ret = 0;
+            goto out;
+        }
+        /* build both strings first: key belongs to the entry deleted below */
+        if (gf_asprintf(&new_value, "%s:%s", value->data, slave_voluuid) < 0 ||
+            gf_asprintf(&same_key, "%s", key) < 0) {
+            if (new_value)
+                GF_FREE(new_value);
+            ret = -1;
+            goto out;
+        }
+
+        /* delete old key and add new value */
+        dict_del(dict, key);
+        /* set new value for the same key*/
+        ret = dict_set_dynstr(dict, same_key, new_value);
+        if (ret) {
+            gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_REMOTE_VOL_UUID_FAIL,
+                   "Error in setting dict value for key %s", same_key);
+            goto out;
+        }
+        *voluuid_updated = _gf_true;
+    }
+
+    ret = 0;
+out:
+    if (errmsg)
+        GF_FREE(errmsg);
+    if (slave_url)
+        GF_FREE(slave_url);
+    if (slave_vol)
+        GF_FREE(slave_vol);
+    if (slave_host)
+        GF_FREE(slave_host);
+    /* the dict keeps its own copy of the key, so ours is always released */
+    if (same_key)
+        GF_FREE(same_key);
+
+    gf_msg_debug(this->name, 0, "Returning %d.", ret);
+    return ret;
+}
+
+/* Run update_slave_voluuid() over volinfo->gsync_slaves and store volinfo if
+ * any entry was rewritten.  Returns 0 on success, else the failing call's
+ * result (-1 when this or volinfo is NULL). */
+static int
+glusterd_update_slave_voluuid_slaveinfo(glusterd_volinfo_t *volinfo)
+{
+    int ret = -1;
+    xlator_t *this = NULL;
+    gf_boolean_t voluuid_updated = _gf_false;
+
+    this = THIS;
+    GF_VALIDATE_OR_GOTO("glusterd", this, out);
+    GF_VALIDATE_OR_GOTO(this->name, volinfo, out);
+
+    ret = dict_foreach(volinfo->gsync_slaves, update_slave_voluuid,
+                       &voluuid_updated);
+    if (ret) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_REMOTE_VOL_UUID_FAIL,
+               "Error in updating volinfo");
+        goto out;
+    }
+
+    if (_gf_true == voluuid_updated) {
+        ret = glusterd_store_volinfo(volinfo,
+                                     GLUSTERD_VOLINFO_VER_AC_INCREMENT);
+        if (ret) {
+            gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_VOLINFO_STORE_FAIL,
+                   "Error in storing volinfo");
+            goto out;
+        }
+    }
+    ret = 0;
+out:
+    gf_msg_debug((this ? this->name : "glusterd"), 0, "Returning %d", ret);
+    return ret;
+}
+
+/* Set *is_run to whether gsync_status() reports gsyncd running locally for
+ * the session.  Returns 0, or -1 when gsync_status() itself fails. */
+int
+glusterd_check_gsync_running_local(char *master, char *slave, char *conf_path,
+                                   gf_boolean_t *is_run)
+{
+    xlator_t *this = THIS;
+    gf_boolean_t template_used = _gf_false;
+    int status = 0;
+    int rc = 0;
+
+    GF_ASSERT(this);
+    GF_ASSERT(master);
+    GF_ASSERT(slave);
+    GF_ASSERT(is_run);
+
+    *is_run = _gf_false;
+
+    rc = gsync_status(master, slave, conf_path, &status, &template_used);
+    if (rc == -1) {
+        gf_msg(this->name, GF_LOG_WARNING, 0, GD_MSG_VALIDATE_FAILED,
+               GEOREP " validation failed");
+    } else {
+        if (rc == 0 && status == 0)
+            *is_run = _gf_true;
+        rc = 0;
+    }
+
+    gf_msg_debug(this->name, 0, "Returning %d", rc);
+    return rc;
+}
+
+static int
+glusterd_store_slave_in_info(glusterd_volinfo_t *volinfo, char *slave,
+                             char *host_uuid, char *slave_voluuid,
+                             char **op_errstr, gf_boolean_t is_force)
+{
+    int ret = 0;
+    int maxslv = 0;
+    char **linearr = NULL;
+    char *value = NULL;
+    char *slavekey = NULL;
+    char *slaveentry = NULL;
+    char key[32] = {
+        0,
+    };
+    int keylen;
+    char *t = NULL;
+    xlator_t *this = NULL;
+    struct slave_vol_config slave1 = {
+        {0},
+    };
+    /* Add/replace slave's "host_uuid:url:voluuid" entry and store volinfo */
+    this = THIS;
+    GF_ASSERT(this);
+
+    GF_ASSERT(volinfo);
+    GF_ASSERT(slave);
+    GF_ASSERT(host_uuid);
+    GF_VALIDATE_OR_GOTO(this->name, slave_voluuid, out);
+    /* -2 is a failure, -1 means "go on and add"; else an entry exists */
+    ret = glusterd_get_slave(volinfo, slave, &slavekey);
+    switch (ret) {
+        case -2:
+            ret = -1;
+            goto out;
+        case -1:
+            break;
+        default:
+            if (!is_force)
+                GF_ASSERT(ret > 0);
+            ret = dict_get_str(volinfo->gsync_slaves, slavekey, &slaveentry);
+            GF_ASSERT(ret == 0);
+
+            /* same-name + same-uuid slave entries should have been filtered
+             * out in glusterd_op_verify_gsync_start_options(), so we can
+             * assert an uuid mismatch
+             */
+            t = strtail(slaveentry, host_uuid);
+            if (!is_force)
+                GF_ASSERT(!t || *t != ':');
+
+            if (is_force) {
+                gf_msg_debug(this->name, 0,
+                             GEOREP
+                             " has already "
+                             "been invoked for the %s (master) and "
+                             "%s (slave). Allowing without saving "
+                             "info again due to force command.",
+                             volinfo->volname, slave);
+                ret = 0;
+                goto out;
+            }
+
+            gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_INVOKE_ERROR,
+                   GEOREP
+                   " has already been invoked for "
+                   "the %s (master) and %s (slave) from a different "
+                   "machine",
+                   volinfo->volname, slave);
+            *op_errstr = gf_strdup(GEOREP
+                                   " already running in "
+                                   "another machine");
+            ret = -1;
+            goto out;
+    }
+    /* reached only via case -1: build "<host_uuid>:<url>:<voluuid>" */
+    ret = glusterd_urltransform_single(slave, "normalize", &linearr);
+    if (ret == -1)
+        goto out;
+
+    ret = gf_asprintf(&value, "%s:%s:%s", host_uuid, linearr[0], slave_voluuid);
+
+    glusterd_urltransform_free(linearr, 1);
+    if (ret == -1)
+        goto out;
+
+    /* Given the slave volume uuid, check and get any existing slave */
+    memcpy(slave1.slave_voluuid, slave_voluuid, UUID_CANONICAL_FORM_LEN);
+    ret = dict_foreach(volinfo->gsync_slaves, _get_slave_idx_slave_voluuid,
+                       &slave1);
+    /* callback: 0 = no entry has this uuid, -1 = match, index in old_slvidx */
+    if (ret == 0) { /* New slave */
+        dict_foreach(volinfo->gsync_slaves, _get_max_gsync_slave_num, &maxslv);
+        keylen = snprintf(key, sizeof(key), "slave%d", maxslv + 1);
+
+        ret = dict_set_dynstrn(volinfo->gsync_slaves, key, keylen, value);
+        if (ret) {
+            GF_FREE(value);
+            goto out;
+        }
+    } else if (ret == -1) { /* Existing slave */
+        keylen = snprintf(key, sizeof(key), "slave%d", slave1.old_slvidx);
+
+        gf_msg_debug(this->name, 0,
+                     "Replacing key:%s with new value"
+                     ":%s",
+                     key, value);
+
+        /* Add new slave's value, with the same slave index */
+        ret = dict_set_dynstrn(volinfo->gsync_slaves, key, keylen, value);
+        if (ret) {
+            GF_FREE(value);
+            goto out;
+        }
+    } else {
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_REMOTE_VOL_UUID_FAIL,
+               "_get_slave_idx_slave_voluuid failed!");
+        GF_FREE(value);
+        ret = -1;
+        goto out;
+    }
+
+    ret = glusterd_store_volinfo(volinfo, GLUSTERD_VOLINFO_VER_AC_INCREMENT);
+    if (ret) {
+        *op_errstr = gf_strdup(
+            "Failed to store the Volume "
+            "information");
+        goto out;
+    }
+    ret = 0;
+out:
+    return ret;
+}
+
+/*
+ * glusterd_op_verify_gsync_start_options:
+ *      Pre-checks for starting the geo-replication session between
+ *      @volinfo and @slave.
+ *
+ *      The checks run in this order and stop at the first failure:
+ *        - the volume must be in started state;
+ *        - the directory containing @statefile must exist (the statefile
+ *          itself may be missing during upgrade);
+ *        - glusterd_gsync_get_uuid() must succeed for the slave;
+ *        - gsync_status() must not report the session as already started,
+ *          unless @is_force is set;
+ *        - gsync_status() must not report that the template config is in
+ *          use;
+ *        - glusterd_verify_gsyncd_spawn() must succeed.
+ *
+ * RETURN VALUE:
+ *       0: all checks passed.
+ *  non-0: a check failed. Whenever an error message was prepared,
+ *         *op_errstr holds a gf_strdup()'ed copy of it. A spawn failure
+ *         with @is_force set is returned without a message.
+ */
+static int
+glusterd_op_verify_gsync_start_options(glusterd_volinfo_t *volinfo, char *slave,
+                                       char *conf_path, char *statefile,
+                                       char **op_errstr, gf_boolean_t is_force)
+{
+    int ret = -1;
+    int ret_status = 0;
+    gf_boolean_t is_template_in_use = _gf_false;
+    char msg[2048] = {0};
+    uuid_t uuid = {0};
+    xlator_t *this = NULL;
+    struct stat stbuf = {
+        0,
+    };
+    char statefiledir[PATH_MAX] = {
+        0,
+    };
+    char *statedir = NULL;
+
+    this = THIS;
+    GF_ASSERT(this);
+
+    GF_ASSERT(volinfo);
+    GF_ASSERT(slave);
+    GF_ASSERT(op_errstr);
+    GF_ASSERT(conf_path);
+    GF_ASSERT(this && this->private);
+
+    if (GLUSTERD_STATUS_STARTED != volinfo->status) {
+        snprintf(msg, sizeof(msg),
+                 "Volume %s needs to be started "
+                 "before " GEOREP " start",
+                 volinfo->volname);
+        goto out;
+    }
+
+    /* check session directory as statefile may not present
+     * during upgrade */
+    if (snprintf(statefiledir, sizeof(statefiledir), "%s", statefile) >=
+        sizeof(statefiledir)) {
+        snprintf(msg, sizeof(msg), "statefiledir truncated");
+        gf_msg(this->name, GF_LOG_ERROR, errno, GD_MSG_FILE_OP_FAILED, "%s",
+               msg);
+        /* ret is still -1 here, so *op_errstr is filled from msg exactly
+         * once at the out label (assigning it here as well leaked the
+         * first copy). */
+        goto out;
+    }
+    statedir = dirname(statefiledir);
+
+    ret = sys_lstat(statedir, &stbuf);
+    if (ret) {
+        snprintf(msg, sizeof(msg),
+                 "Session between %s and %s has"
+                 " not been created. Please create session and retry.",
+                 volinfo->volname, slave);
+        gf_msg(this->name, GF_LOG_ERROR, errno, GD_MSG_FILE_OP_FAILED,
+               "%s statefile: %s", msg, statefile);
+        /* *op_errstr is set from msg at the out label. */
+        goto out;
+    }
+
+    /* Check if the gsync slave info is stored. If not
+     * session has not been created */
+    ret = glusterd_gsync_get_uuid(slave, volinfo, uuid);
+    if (ret) {
+        snprintf(msg, sizeof(msg),
+                 "Session between %s and %s has"
+                 " not been created. Please create session and retry.",
+                 volinfo->volname, slave);
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_SESSION_CREATE_ERROR, "%s",
+               msg);
+        goto out;
+    }
+
+    /*Check if the gsync is already started in cmd. inited host
+     * If so initiate add it into the glusterd's priv*/
+    ret = gsync_status(volinfo->volname, slave, conf_path, &ret_status,
+                       &is_template_in_use);
+    if (ret == 0) {
+        if ((ret_status == 0) && !is_force) {
+            snprintf(msg, sizeof(msg),
+                     GEOREP
+                     " session between"
+                     " %s & %s already started",
+                     volinfo->volname, slave);
+            ret = -1;
+            goto out;
+        }
+    } else if (ret == -1) {
+        snprintf(msg, sizeof(msg),
+                 GEOREP
+                 " start option "
+                 "validation failed ");
+        goto out;
+    }
+
+    if (is_template_in_use == _gf_true) {
+        snprintf(msg, sizeof(msg),
+                 GEOREP
+                 " start "
+                 "failed : pid-file entry missing "
+                 "in config file.");
+        ret = -1;
+        goto out;
+    }
+
+    ret = glusterd_verify_gsyncd_spawn(volinfo->volname, slave);
+    if (ret && !is_force) {
+        snprintf(msg, sizeof(msg), "Unable to spawn gsyncd");
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_GSYNCD_SPAWN_FAILED, "%s",
+               msg);
+    }
+out:
+    /* Single place where the prepared message is handed to the caller. */
+    if (ret && (msg[0] != '\0')) {
+        *op_errstr = gf_strdup(msg);
+    }
+    gf_msg_debug(this->name, 0, "Returning %d", ret);
+    return ret;
+}
+
+/* Report through @flag whether @volinfo has at least one entry in its
+ * gsync_slaves dict, i.e. whether any geo-replication slave is recorded
+ * for the volume. */
+void
+glusterd_check_geo_rep_configured(glusterd_volinfo_t *volinfo,
+                                  gf_boolean_t *flag)
+{
+    GF_ASSERT(volinfo);
+    GF_ASSERT(flag);
+
+    *flag = (volinfo->gsync_slaves->count) ? _gf_true : _gf_false;
+}
+
+/*
+ * is_geo_rep_active:
+ *      Looks up the session's state_file through the gsyncd config at
+ *      @conf_path and reports the monitor state through @is_active:
+ *      0 when the recorded status is "Stopped" or "Created", 1 for any
+ *      other status (a state file that cannot be read is treated as
+ *      "defunct", hence active).
+ *
+ * RETURN VALUE:
+ *       0: state_file was located and @is_active has been set.
+ *      -1: error (dict allocation, config fetch or state_file lookup).
+ */
+
+static int
+is_geo_rep_active(glusterd_volinfo_t *volinfo, char *slave, char *conf_path,
+                  int *is_active)
+{
+    xlator_t *this = THIS;
+    char *master = NULL;
+    char *state_path = NULL;
+    dict_t *conf = NULL;
+    char status[PATH_MAX] = "";
+    int inactive = 0;
+    int ret = -1;
+
+    GF_ASSERT(this);
+
+    master = volinfo->volname;
+
+    conf = dict_new();
+    if (!conf) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_DICT_CREATE_FAIL,
+               "Not able to create dict.");
+        goto out;
+    }
+
+    if (glusterd_gsync_get_config(master, slave, conf_path, conf)) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_GET_CONFIG_INFO_FAILED,
+               "Unable to get configuration data "
+               "for %s(master), %s(slave)",
+               master, slave);
+        ret = -1;
+        goto out;
+    }
+
+    if (dict_get_param(conf, "state_file", &state_path)) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_DICT_GET_FAILED,
+               "Unable to get state_file's name "
+               "for %s(master), %s(slave). Please check gsync "
+               "config file.",
+               master, slave);
+        ret = -1;
+        goto out;
+    }
+
+    /* An unreadable or empty state file is not an error for the caller;
+     * the session is simply reported with the placeholder status. */
+    if (glusterd_gsync_read_frm_status(state_path, status, sizeof(status)) <=
+        0) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_STAT_FILE_READ_FAILED,
+               "Unable to read the status file for %s(master), "
+               "%s(slave)",
+               master, slave);
+        snprintf(status, sizeof(status), "defunct");
+    }
+
+    inactive = (strcmp(status, "Stopped") == 0) ||
+               (strcmp(status, "Created") == 0);
+    *is_active = inactive ? 0 : 1;
+    ret = 0;
+out:
+    if (conf)
+        dict_unref(conf);
+    return ret;
+}
+
+/*
+ * _get_slave_status:
+ *      dict_foreach() callback invoked for each slave entry of a volume;
+ *      @data is a gsync_status_param_t. It builds the session's
+ *      gsyncd.conf path and calls is_geo_rep_active() to update
+ *      param->is_active.
+ *
+ * RETURN VALUE:
+ *      0: On successful read of state_file from is_geo_rep_active.
+ *         When geo-rep was already found active by a previous call.
+ *         When the entry's value contains no ':' (no slave).
+ *     -1: On error (missing glusterd priv, slave info fetch failure,
+ *         conf path formatting failure or truncation).
+ */
+
+int
+_get_slave_status(dict_t *dict, char *key, data_t *value, void *data)
+{
+    gsync_status_param_t *param = NULL;
+    char *slave = NULL;
+    char *slave_url = NULL;
+    char *slave_vol = NULL;
+    char *slave_host = NULL;
+    char *errmsg = NULL;
+    char conf_path[PATH_MAX] = "";
+    int ret = -1;
+    glusterd_conf_t *priv = NULL;
+    xlator_t *this = NULL;
+
+    param = (gsync_status_param_t *)data;
+
+    GF_ASSERT(param);
+    GF_ASSERT(param->volinfo);
+    /* One active session is enough; skip the remaining slaves. */
+    if (param->is_active) {
+        ret = 0;
+        goto out;
+    }
+
+    this = THIS;
+    GF_ASSERT(this);
+
+    priv = this->private;
+    if (priv == NULL) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_GLUSTERD_PRIV_NOT_FOUND,
+               "priv of glusterd not present");
+        goto out;
+    }
+
+    /* The slave specification is whatever follows the first ':' of the
+     * stored value. */
+    slave = strchr(value->data, ':');
+    if (!slave) {
+        ret = 0;
+        goto out;
+    }
+    slave++;
+
+    ret = glusterd_get_slave_info(slave, &slave_url, &slave_host, &slave_vol,
+                                  &errmsg);
+    if (ret) {
+        if (errmsg)
+            gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_SLAVEINFO_FETCH_ERROR,
+                   "Unable to fetch"
+                   " slave details. Error: %s",
+                   errmsg);
+        else
+            gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_SLAVEINFO_FETCH_ERROR,
+                   "Unable to fetch slave details.");
+        ret = -1;
+        goto out;
+    }
+
+    /* snprintf() returns the length the full string would have had, so a
+     * value >= sizeof(conf_path) means truncation. Treat that as an error
+     * instead of indexing conf_path with it, which wrote past the buffer. */
+    ret = snprintf(conf_path, sizeof(conf_path),
+                   "%s/" GEOREP "/%s_%s_%s/gsyncd.conf", priv->workdir,
+                   param->volinfo->volname, slave_host, slave_vol);
+    if ((ret < 0) || (ret >= sizeof(conf_path))) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_CONF_PATH_ASSIGN_FAILED,
+               "Unable to assign conf_path.");
+        ret = -1;
+        goto out;
+    }
+
+    ret = is_geo_rep_active(param->volinfo, slave, conf_path,
+                            &param->is_active);
+out:
+    if (errmsg)
+        GF_FREE(errmsg);
+
+    if (slave_vol)
+        GF_FREE(slave_vol);
+
+    if (slave_url)
+        GF_FREE(slave_url);
+    if (slave_host)
+        GF_FREE(slave_host);
+
+    return ret;
+}
+
+/* glusterd_check_geo_rep_running:
+ *          Checks whether any geo-rep session of param->volinfo is active
+ *          by running _get_slave_status() over the volume's slaves.
+ *
+ *    RETURN VALUE:
+ *           0: the check completed; param->is_active tells whether an
+ *              active session was found.
+ *          -1: the status of the sessions could not be determined.
+ *    *op_errstr is set both on error and when a session is active; the
+ *    caller is responsible for freeing it in those cases.
+ */
+
+int
+glusterd_check_geo_rep_running(gsync_status_param_t *param, char **op_errstr)
+{
+    xlator_t *this = THIS;
+    gf_boolean_t configured = _gf_false;
+    char errbuf[2048] = {
+        0,
+    };
+
+    GF_ASSERT(this);
+    GF_ASSERT(param);
+    GF_ASSERT(param->volinfo);
+    GF_ASSERT(op_errstr);
+
+    glusterd_check_geo_rep_configured(param->volinfo, &configured);
+
+    /* Nothing to inspect when the volume has no slaves. */
+    if (!configured)
+        return 0;
+
+    if (dict_foreach(param->volinfo->gsync_slaves, _get_slave_status, param)) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_SLAVEINFO_FETCH_ERROR,
+               "_get_slave_satus failed");
+        snprintf(errbuf, sizeof(errbuf),
+                 GEOREP
+                 " Unable to"
+                 " get the status of active " GEOREP
+                 ""
+                 " session for the volume '%s'.\n"
+                 " Please check the log file for"
+                 " more info.",
+                 param->volinfo->volname);
+        *op_errstr = gf_strdup(errbuf);
+        return -1;
+    }
+
+    if (param->is_active) {
+        snprintf(errbuf, sizeof(errbuf),
+                 GEOREP
+                 " sessions"
+                 " are active for the volume %s.\nStop"
+                 " " GEOREP
+                 " sessions involved in this"
+                 " volume. Use 'volume " GEOREP
+                 " status' command for more info.",
+                 param->volinfo->volname);
+        *op_errstr = gf_strdup(errbuf);
+    }
+
+    return 0;
+}
+
+/*
+ * glusterd_op_verify_gsync_running:
+ *      Verifies that the geo-replication session between @volinfo and
+ *      @slave is running on this node: the volume must be started,
+ *      gsyncd_getpidfile() must not fail with -2, gsync_status_byfd()
+ *      must not return -1 for the pid-file descriptor, and the template
+ *      config must not be in use.
+ *
+ * RETURN VALUE:
+ *       0: the session is running.
+ *      -1: otherwise; *op_errstr gets a gf_strdup()'ed message when one
+ *          was prepared.
+ */
+static int
+glusterd_op_verify_gsync_running(glusterd_volinfo_t *volinfo, char *slave,
+                                 char *conf_path, char **op_errstr)
+{
+    xlator_t *this = THIS;
+    gf_boolean_t is_template_in_use = _gf_false;
+    char pidfile[PATH_MAX] = {
+        0,
+    };
+    char errbuf[2048] = {0};
+    int pfd = -1;
+    /* Stays -1 on every failure path; only the final check clears it. */
+    int ret = -1;
+
+    GF_ASSERT(this);
+
+    GF_ASSERT(THIS && THIS->private);
+    GF_ASSERT(volinfo);
+    GF_ASSERT(slave);
+    GF_ASSERT(conf_path);
+    GF_ASSERT(op_errstr);
+
+    if (volinfo->status != GLUSTERD_STATUS_STARTED) {
+        snprintf(errbuf, sizeof(errbuf),
+                 "Volume %s needs to be started "
+                 "before " GEOREP " start",
+                 volinfo->volname);
+        gf_smsg(this->name, GF_LOG_ERROR, 0, GD_MSG_GEO_REP_START_FAILED,
+                "Volume is not in a started state, Volname=%s",
+                volinfo->volname, NULL);
+        goto out;
+    }
+
+    pfd = gsyncd_getpidfile(volinfo->volname, slave, pidfile, conf_path,
+                            &is_template_in_use);
+    if (pfd == -2) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_VALIDATE_FAILED,
+               GEOREP " stop validation failed for %s & %s", volinfo->volname,
+               slave);
+        goto out;
+    }
+
+    if (gsync_status_byfd(pfd) == -1) {
+        /* monitor gsyncd already dead */
+        snprintf(errbuf, sizeof(errbuf),
+                 GEOREP
+                 " session b/w %s & %s is "
+                 "not running on this node.",
+                 volinfo->volname, slave);
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_SESSION_INACTIVE, "%s",
+               errbuf);
+        goto out;
+    }
+
+    if (is_template_in_use) {
+        snprintf(errbuf, sizeof(errbuf),
+                 "pid-file entry missing in "
+                 "the config file(%s).",
+                 conf_path);
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_PIDFILE_NOT_FOUND, "%s",
+               errbuf);
+        goto out;
+    }
+
+    if (pfd >= 0)
+        ret = 0;
+out:
+    if (ret && (errbuf[0] != '\0')) {
+        *op_errstr = gf_strdup(errbuf);
+    }
+    gf_msg_debug(this->name, 0, "Returning %d", ret);
+    return ret;
+}
+
+/*
+ * glusterd_verify_gsync_status_opts:
+ *      Validates the options of a geo-replication status request held in
+ *      @dict. "master" and "slave" are both optional: when either one is
+ *      absent there is nothing further to verify and 0 is returned. When
+ *      both are present the master volume must exist and the slave
+ *      details / conf path must be resolvable.
+ *
+ * RETURN VALUE:
+ *       0: options are acceptable.
+ *  non-0: glusterd private data missing, master volume not found, or
+ *         slave/confpath lookup failed. *op_errstr is set by this
+ *         function for the first two; for the last it is passed down to
+ *         glusterd_get_slave_details_confpath().
+ *
+ * NOTE(review): slave_url, slave_host and slave_vol returned by
+ * glusterd_get_slave_details_confpath() are not released here; confirm
+ * who owns them.
+ */
+static int
+glusterd_verify_gsync_status_opts(dict_t *dict, char **op_errstr)
+{
+    char *slave = NULL;
+    char *volname = NULL;
+    char errmsg[PATH_MAX] = {
+        0,
+    };
+    glusterd_volinfo_t *volinfo = NULL;
+    int ret = 0;
+    char *conf_path = NULL;
+    char *slave_url = NULL;
+    char *slave_host = NULL;
+    char *slave_vol = NULL;
+    glusterd_conf_t *priv = NULL;
+    xlator_t *this = NULL;
+
+    this = THIS;
+    GF_ASSERT(this);
+
+    if (THIS)
+        priv = THIS->private;
+    if (priv == NULL) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_GLUSTERD_PRIV_NOT_FOUND,
+               "priv of glusterd not present");
+        *op_errstr = gf_strdup("glusterd defunct");
+        /* ret is initialised to 0; without this the failure was reported
+         * to the caller as success. */
+        ret = -1;
+        goto out;
+    }
+
+    /* No master given: nothing to validate. */
+    ret = dict_get_str(dict, "master", &volname);
+    if (ret < 0) {
+        ret = 0;
+        goto out;
+    }
+
+    ret = glusterd_volinfo_find(volname, &volinfo);
+    if (ret) {
+        gf_msg(this->name, GF_LOG_WARNING, 0, GD_MSG_VOL_NOT_FOUND,
+               "volume name does not exist");
+        snprintf(errmsg, sizeof(errmsg),
+                 "Volume name %s does not"
+                 " exist",
+                 volname);
+        *op_errstr = gf_strdup(errmsg);
+        goto out;
+    }
+
+    /* No slave given: nothing further to validate. */
+    ret = dict_get_str(dict, "slave", &slave);
+    if (ret < 0) {
+        ret = 0;
+        goto out;
+    }
+
+    ret = glusterd_get_slave_details_confpath(volinfo, dict, &slave_url,
+                                              &slave_host, &slave_vol,
+                                              &conf_path, op_errstr);
+    if (ret) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_SLAVEINFO_FETCH_ERROR,
+               "Unable to fetch slave  or confpath details.");
+        ret = -1;
+        goto out;
+    }
+
+out:
+    gf_msg_debug(this->name, 0, "Returning %d", ret);
+    return ret;
+}
+
+/*
+ * glusterd_op_gsync_args_get:
+ *      Fetches the "master", "slave" and "host-uuid" strings from @dict.
+ *      Each of @master, @slave and @host_uuid may be NULL, in which case
+ *      that key is not looked up.
+ *
+ * RETURN VALUE:
+ *       0: every requested key was found.
+ *      <0: the dict_get_str() result for the first requested key that is
+ *          missing; *op_errstr then names that key.
+ */
+int
+glusterd_op_gsync_args_get(dict_t *dict, char **op_errstr, char **master,
+                           char **slave, char **host_uuid)
+{
+    xlator_t *this = THIS;
+    int ret = -1;
+
+    GF_ASSERT(this);
+    GF_ASSERT(dict);
+    GF_ASSERT(op_errstr);
+
+    if (master && ((ret = dict_get_str(dict, "master", master)) < 0)) {
+        gf_msg(this->name, GF_LOG_WARNING, 0, GD_MSG_DICT_GET_FAILED,
+               "master not found");
+        *op_errstr = gf_strdup("master not found");
+        goto out;
+    }
+
+    if (slave && ((ret = dict_get_str(dict, "slave", slave)) < 0)) {
+        gf_msg(this->name, GF_LOG_WARNING, 0, GD_MSG_DICT_GET_FAILED,
+               "slave not found");
+        *op_errstr = gf_strdup("slave not found");
+        goto out;
+    }
+
+    if (host_uuid &&
+        ((ret = dict_get_str(dict, "host-uuid", host_uuid)) < 0)) {
+        gf_msg(this->name, GF_LOG_WARNING, 0, GD_MSG_DICT_GET_FAILED,
+               "host_uuid not found");
+        *op_errstr = gf_strdup("host_uuid not found");
+        goto out;
+    }
+
+    ret = 0;
+out:
+    gf_msg_debug(this->name, 0, "Returning %d", ret);
+    return ret;
+}
+
+/*
+ * glusterd_op_stage_sys_exec:
+ *      Staging-phase validation for the sys-exec operation. The "command"
+ *      entry of @dict must not contain a '/', and
+ *      GSYNCD_PREFIX "/peer_<command>" must be an executable regular file.
+ *
+ * RETURN VALUE:
+ *       0: the command is acceptable.
+ *  non-0: validation failed; *op_errstr holds a gf_strdup()'ed message.
+ */
+int
+glusterd_op_stage_sys_exec(dict_t *dict, char **op_errstr)
+{
+    char errmsg[PATH_MAX] = "";
+    char *command = NULL;
+    char command_path[PATH_MAX] = "";
+    struct stat st = {
+        0,
+    };
+    int ret = -1;
+    int len = 0;
+    glusterd_conf_t *conf = NULL;
+    xlator_t *this = NULL;
+
+    this = THIS;
+    GF_ASSERT(this);
+    conf = this->private;
+    GF_ASSERT(conf);
+
+    if (conf->op_version < 2) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_UNSUPPORTED_VERSION,
+               "Op Version not supported.");
+        snprintf(errmsg, sizeof(errmsg),
+                 "One or more nodes do not"
+                 " support the required op version.");
+        /* *op_errstr is filled from errmsg at the out label; assigning it
+         * here as well leaked the first copy. */
+        ret = -1;
+        goto out;
+    }
+
+    ret = dict_get_str(dict, "command", &command);
+    if (ret) {
+        strcpy(errmsg, "internal error");
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_DICT_GET_FAILED,
+               "Unable to get command from dict");
+        goto out;
+    }
+
+    /* enforce local occurrence of the command */
+    if (strchr(command, '/')) {
+        strcpy(errmsg, "invalid command name");
+        ret = -1;
+        goto out;
+    }
+
+    /* The command name comes from the request dict, so its length is not
+     * under our control: format with a bound and reject anything that
+     * does not fit instead of overflowing command_path. */
+    len = snprintf(command_path, sizeof(command_path),
+                   GSYNCD_PREFIX "/peer_%s", command);
+    if ((len < 0) || (len >= sizeof(command_path))) {
+        strcpy(errmsg, "invalid command name");
+        ret = -1;
+        goto out;
+    }
+
+    /* check if it's executable */
+    ret = sys_access(command_path, X_OK);
+    if (!ret)
+        /* check if it's a regular file */
+        ret = sys_stat(command_path, &st);
+    if (!ret && !S_ISREG(st.st_mode))
+        ret = -1;
+
+out:
+    if (ret) {
+        if (errmsg[0] == '\0') {
+            if (command)
+                snprintf(errmsg, sizeof(errmsg),
+                         "gsync peer_%s command not found.", command);
+            else
+                snprintf(errmsg, sizeof(errmsg), "%s",
+                         "gsync peer command was not "
+                         "specified");
+        }
+        *op_errstr = gf_strdup(errmsg);
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_PEER_CMD_ERROR, "%s",
+               errmsg);
+    }
+
+    gf_msg_debug(this->name, 0, "Returning %d", ret);
+    return ret;
+}
+
+/*
+ * glusterd_op_stage_copy_file:
+ *      Staging-phase validation for the copy-file operation.
+ *
+ *      Fails when glusterd's private data is missing or the op-version is
+ *      below 2. The source file is validated only on the node whose UUID
+ *      (MY_UUID) equals the "host-uuid" entry of @dict: the "source" entry
+ *      is taken relative to priv->workdir, its resolved path must lie
+ *      under the resolved workdir, it must exist and it must be a regular
+ *      file.
+ *
+ * RETURN VALUE:
+ *       0: validation passed (or this node is not the source host).
+ *  non-0: validation failed; *op_errstr is set on most, but not all,
+ *         failure paths (not for a missing host-uuid or a truncated path).
+ */
+int
+glusterd_op_stage_copy_file(dict_t *dict, char **op_errstr)
+{
+    char abs_filename[PATH_MAX] = "";
+    char errmsg[PATH_MAX] = "";
+    char *filename = NULL;
+    char *host_uuid = NULL;
+    char uuid_str[64] = {0};
+    int ret = -1;
+    glusterd_conf_t *priv = NULL;
+    struct stat stbuf = {
+        0,
+    };
+    xlator_t *this = NULL;
+    char workdir[PATH_MAX] = {
+        0,
+    };
+    char realpath_filename[PATH_MAX] = {
+        0,
+    };
+    char realpath_workdir[PATH_MAX] = {
+        0,
+    };
+    int32_t len = 0;
+
+    this = THIS;
+    GF_ASSERT(this);
+
+    if (THIS)
+        priv = THIS->private;
+    if (priv == NULL) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_GLUSTERD_PRIV_NOT_FOUND,
+               "priv of glusterd not present");
+        *op_errstr = gf_strdup("glusterd defunct");
+        goto out;
+    }
+
+    if (priv->op_version < 2) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_UNSUPPORTED_VERSION,
+               "Op Version not supported.");
+        snprintf(errmsg, sizeof(errmsg),
+                 "One or more nodes do not"
+                 " support the required op version.");
+        *op_errstr = gf_strdup(errmsg);
+        ret = -1;
+        goto out;
+    }
+
+    ret = dict_get_str(dict, "host-uuid", &host_uuid);
+    if (ret < 0) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_DICT_GET_FAILED,
+               "Unable to fetch host-uuid from dict.");
+        goto out;
+    }
+
+    /* Only the node named by host-uuid checks the source file; every
+     * other node falls through to success. */
+    uuid_utoa_r(MY_UUID, uuid_str);
+    if (!strcmp(uuid_str, host_uuid)) {
+        ret = dict_get_str(dict, "source", &filename);
+        if (ret < 0) {
+            gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_DICT_GET_FAILED,
+                   "Unable to fetch filename from dict.");
+            *op_errstr = gf_strdup("command unsuccessful");
+            goto out;
+        }
+        /* The source name is interpreted relative to the working dir. */
+        len = snprintf(abs_filename, sizeof(abs_filename), "%s/%s",
+                       priv->workdir, filename);
+        if ((len < 0) || (len >= sizeof(abs_filename))) {
+            gf_smsg(this->name, GF_LOG_ERROR, 0, GD_MSG_COPY_FAIL, NULL);
+            ret = -1;
+            goto out;
+        }
+
+        /* Resolve both paths so that the containment check below compares
+         * canonical names (symlinks and ".." components resolved). */
+        if (!realpath(priv->workdir, realpath_workdir)) {
+            len = snprintf(errmsg, sizeof(errmsg),
+                           "Failed to "
+                           "get realpath of %s: %s",
+                           priv->workdir, strerror(errno));
+            if (len < 0) {
+                strcpy(errmsg, "<error>");
+            }
+            gf_smsg(this->name, GF_LOG_ERROR, 0, GD_MSG_REALPATH_GET_FAIL,
+                    "Realpath=%s, Reason=%s", priv->workdir, strerror(errno),
+                    NULL);
+            *op_errstr = gf_strdup(errmsg);
+            ret = -1;
+            goto out;
+        }
+
+        if (!realpath(abs_filename, realpath_filename)) {
+            snprintf(errmsg, sizeof(errmsg),
+                     "Failed to get "
+                     "realpath of %s: %s",
+                     filename, strerror(errno));
+            gf_smsg(this->name, GF_LOG_ERROR, 0, GD_MSG_REALPATH_GET_FAIL,
+                    "Filename=%s, Reason=%s", filename, strerror(errno), NULL);
+            *op_errstr = gf_strdup(errmsg);
+            ret = -1;
+            goto out;
+        }
+
+        /* Add Trailing slash to workdir, without slash strncmp
+           will succeed for /var/lib/glusterd_bad */
+        len = snprintf(workdir, sizeof(workdir), "%s/", realpath_workdir);
+        if ((len < 0) || (len >= sizeof(workdir))) {
+            gf_smsg(this->name, GF_LOG_ERROR, 0, GD_MSG_COPY_FAIL, NULL);
+            ret = -1;
+            goto out;
+        }
+
+        /* Protect against file copy outside $workdir */
+        if (strncmp(workdir, realpath_filename, strlen(workdir))) {
+            len = snprintf(errmsg, sizeof(errmsg),
+                           "Source file"
+                           " is outside of %s directory",
+                           priv->workdir);
+            if (len < 0) {
+                strcpy(errmsg, "<error>");
+            }
+            gf_smsg(this->name, GF_LOG_ERROR, 0, GD_MSG_SRC_FILE_ERROR, errmsg,
+                    NULL);
+            *op_errstr = gf_strdup(errmsg);
+            ret = -1;
+            goto out;
+        }
+
+        /* Stat the unresolved name; presumably sys_lstat() wraps lstat(2)
+         * and so does not follow a final symlink, which the S_ISREG check
+         * below would then reject - TODO confirm. */
+        ret = sys_lstat(abs_filename, &stbuf);
+        if (ret) {
+            len = snprintf(errmsg, sizeof(errmsg),
+                           "Source file"
+                           " does not exist in %s",
+                           priv->workdir);
+            if (len < 0) {
+                strcpy(errmsg, "<error>");
+            }
+            gf_smsg(this->name, GF_LOG_ERROR, 0, GD_MSG_SRC_FILE_ERROR, errmsg,
+                    NULL);
+            *op_errstr = gf_strdup(errmsg);
+            goto out;
+        }
+
+        if (!S_ISREG(stbuf.st_mode)) {
+            snprintf(errmsg, sizeof(errmsg),
+                     "Source file"
+                     " is not a regular file.");
+            gf_smsg(this->name, GF_LOG_ERROR, 0, GD_MSG_SRC_FILE_ERROR, errmsg,
+                    NULL);
+            *op_errstr = gf_strdup(errmsg);
+            ret = -1;
+            goto out;
+        }
+    }
+
+    ret = 0;
+out:
+    gf_msg_debug(this->name, 0, "Returning %d", ret);
+    return ret;
+}
+
+/*
+ * glusterd_get_statefile_name:
+ *      Looks up the "state_file" entry of the gsyncd configuration for
+ *      the session between @volinfo and @slave and returns a
+ *      gf_strdup()'ed copy of it through @statefile.
+ *
+ *      @conf_path is used when it exists; otherwise the template config
+ *      (priv->workdir "/" GSYNC_CONF_TEMPLATE) is used. If reading the
+ *      passed config fails, or it has no state_file entry, the lookup is
+ *      retried once with the template. *is_template_in_use is set to
+ *      _gf_true whenever the template is used; it is never reset to
+ *      _gf_false here.
+ *
+ * RETURN VALUE:
+ *       0: state_file found and copied into *statefile.
+ *  non-0: error (dict allocation, path truncation, missing config files,
+ *         config read failure, missing state_file, or strdup failure).
+ */
+int
+glusterd_get_statefile_name(glusterd_volinfo_t *volinfo, char *slave,
+                            char *conf_path, char **statefile,
+                            gf_boolean_t *is_template_in_use)
+{
+    char *master = NULL;
+    char *buf = NULL;
+    char *working_conf_path = NULL;
+    char temp_conf_path[PATH_MAX] = "";
+    dict_t *confd = NULL;
+    glusterd_conf_t *priv = NULL;
+    int ret = -1;
+    struct stat stbuf = {
+        0,
+    };
+    xlator_t *this = NULL;
+    int32_t len = 0;
+
+    this = THIS;
+    GF_ASSERT(this);
+
+    GF_ASSERT(this->private);
+    GF_ASSERT(volinfo);
+    GF_ASSERT(conf_path);
+    GF_ASSERT(is_template_in_use);
+
+    master = volinfo->volname;
+
+    confd = dict_new();
+    if (!confd) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_DICT_CREATE_FAIL,
+               "Unable to create new dict");
+        goto out;
+    }
+
+    priv = THIS->private;
+
+    /* Path of the template config, used as the fallback below. */
+    len = snprintf(temp_conf_path, sizeof(temp_conf_path),
+                   "%s/" GSYNC_CONF_TEMPLATE, priv->workdir);
+    if ((len < 0) || (len >= sizeof(temp_conf_path))) {
+        goto out;
+    }
+
+    /* Prefer the session's own config; fall back to the template when it
+     * does not exist. */
+    ret = sys_lstat(conf_path, &stbuf);
+    if (!ret) {
+        gf_msg(this->name, GF_LOG_INFO, 0, GD_MSG_CONFIG_INFO,
+               "Using passed config template(%s).", conf_path);
+        working_conf_path = conf_path;
+    } else {
+        gf_msg(this->name, GF_LOG_WARNING, ENOENT, GD_MSG_FILE_OP_FAILED,
+               "Config file (%s) missing. Looking for template config"
+               " file (%s)",
+               conf_path, temp_conf_path);
+        ret = sys_lstat(temp_conf_path, &stbuf);
+        if (ret) {
+            gf_msg(this->name, GF_LOG_ERROR, ENOENT, GD_MSG_FILE_OP_FAILED,
+                   "Template "
+                   "config file (%s) missing.",
+                   temp_conf_path);
+            goto out;
+        }
+        gf_msg(this->name, GF_LOG_INFO, 0, GD_MSG_DEFAULT_TEMP_CONFIG,
+               "Using default config template(%s).", temp_conf_path);
+        working_conf_path = temp_conf_path;
+        *is_template_in_use = _gf_true;
+    }
+
+/* Retry point: reached a second time, with the template config, when the
+ * passed config could not be read or lacked state_file. The retry happens
+ * at most once because *is_template_in_use is set before jumping back. */
+fetch_data:
+    ret = glusterd_gsync_get_config(master, slave, working_conf_path, confd);
+    if (ret) {
+        if (*is_template_in_use == _gf_false) {
+            gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_GET_CONFIG_INFO_FAILED,
+                   "Unable to get configuration data "
+                   "for %s(master), %s(slave). "
+                   "Trying template config.",
+                   master, slave);
+            working_conf_path = temp_conf_path;
+            *is_template_in_use = _gf_true;
+            goto fetch_data;
+        } else {
+            gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_GET_CONFIG_INFO_FAILED,
+                   "Unable to get configuration data "
+                   "for %s(master), %s(slave) from "
+                   "template config",
+                   master, slave);
+            goto out;
+        }
+    }
+
+    ret = dict_get_param(confd, "state_file", &buf);
+    if (ret) {
+        if (*is_template_in_use == _gf_false) {
+            gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_DICT_GET_FAILED,
+                   "Unable to get state_file's name. "
+                   "Trying template config.");
+            working_conf_path = temp_conf_path;
+            *is_template_in_use = _gf_true;
+            goto fetch_data;
+        } else {
+            gf_msg(this->name, GF_LOG_ERROR, 0,
+                   GD_MSG_GET_STATEFILE_NAME_FAILED,
+                   "Unable to get state_file's "
+                   "name from template.");
+            goto out;
+        }
+    }
+
+    ret = 0;
+out:
+    /* buf points into confd, so hand out a private copy before the dict
+     * is released below. */
+    if (buf) {
+        *statefile = gf_strdup(buf);
+        if (!*statefile)
+            ret = -1;
+    }
+
+    if (confd)
+        dict_unref(confd);
+
+    gf_msg_debug(this->name, 0, "Returning %d ", ret);
+    return ret;
+}
+
+/*
+ * glusterd_create_status_file:
+ *      Runs GSYNCD_PREFIX "/gsyncd --create <status>" against the
+ *      session's gsyncd.conf for the given master/slave. glusterd's
+ *      big_lock is released for the duration of the external command.
+ *
+ * RETURN VALUE:
+ *       0: the command succeeded.
+ *      -1: glusterd private data missing, @status is NULL, or the
+ *          command failed.
+ */
+int
+glusterd_create_status_file(char *master, char *slave, char *slave_host,
+                            char *slave_vol, char *status)
+{
+    xlator_t *this = THIS;
+    glusterd_conf_t *conf = NULL;
+    runner_t gsyncd = {
+        0,
+    };
+    int rc = -1;
+
+    GF_ASSERT(this);
+
+    if (THIS)
+        conf = THIS->private;
+    if (!conf) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_GLUSTERD_PRIV_NOT_FOUND,
+               "priv of glusterd not present");
+        goto out;
+    }
+
+    if (status == NULL) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_STATUS_NULL, "Status Empty");
+        goto out;
+    }
+    gf_msg_debug(this->name, 0, "slave = %s", slave);
+
+    /* gsyncd --create <status> -c <conf> --iprefix=<dir> :<master> <slave> */
+    runinit(&gsyncd);
+    runner_add_args(&gsyncd, GSYNCD_PREFIX "/gsyncd", "--create", status, "-c",
+                    NULL);
+    runner_argprintf(&gsyncd, "%s/" GEOREP "/%s_%s_%s/gsyncd.conf",
+                     conf->workdir, master, slave_host, slave_vol);
+    runner_argprintf(&gsyncd, "--iprefix=%s", DATADIR);
+    runner_argprintf(&gsyncd, ":%s", master);
+    runner_add_args(&gsyncd, slave, NULL);
+
+    /* Do not hold the big lock while the external command runs. */
+    synclock_unlock(&conf->big_lock);
+    rc = runner_run(&gsyncd);
+    synclock_lock(&conf->big_lock);
+
+    if (rc) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_STATUSFILE_CREATE_FAILED,
+               "Creating status file failed.");
+        rc = -1;
+    }
+out:
+    gf_msg_debug(this->name, 0, "returning %d", rc);
+    return rc;
+}
+
+/*
+ * glusterd_verify_slave:
+ *      Runs GSYNCD_PREFIX "/gverify.sh" to check that @slave_vol on
+ *      @slave_url is usable as a geo-replication slave for @volname.
+ *      @slave_url is either "host" (user defaults to "root") or
+ *      "user@host". glusterd's big_lock is released while the script runs.
+ *
+ *      When the script fails, its log (priv->logdir "/create_verify_log")
+ *      is read and split on '|': a first token of "FORCE_BLOCKER" sets
+ *      *is_force_blocker to 1 and the next token becomes *op_errstr;
+ *      otherwise *is_force_blocker is 0 and the first token becomes
+ *      *op_errstr. The log file is removed before returning.
+ *
+ * RETURN VALUE:
+ *       0: the slave was verified.
+ *      -1: invalid slave url, allocation failure, or verification failed.
+ */
+static int
+glusterd_verify_slave(char *volname, char *slave_url, char *slave_vol,
+                      int ssh_port, char **op_errstr,
+                      gf_boolean_t *is_force_blocker)
+{
+    int32_t ret = -1;
+    runner_t runner = {
+        0,
+    };
+    char log_file_path[PATH_MAX] = "";
+    char buf[PATH_MAX] = "";
+    char *tmp = NULL;
+    char *slave_url_buf = NULL;
+    char *save_ptr = NULL;
+    char *slave_user = NULL;
+    char *slave_ip = NULL;
+    glusterd_conf_t *priv = NULL;
+    xlator_t *this = NULL;
+    char *af = NULL;
+
+    this = THIS;
+    GF_ASSERT(this);
+    priv = this->private;
+    GF_ASSERT(priv);
+    GF_ASSERT(volname);
+    GF_ASSERT(slave_url);
+    GF_ASSERT(slave_vol);
+
+    /* Fetch the slave_user and slave_ip from the slave_url.
+     * If the slave_user is not present. Use "root"
+     */
+    if (strstr(slave_url, "@")) {
+        /* strtok_r() modifies its input, so tokenize a private copy. */
+        slave_url_buf = gf_strdup(slave_url);
+        if (!slave_url_buf) {
+            gf_smsg(this->name, GF_LOG_ERROR, 0, GD_MSG_STRDUP_FAILED,
+                    "Slave_url=%s", slave_url, NULL);
+            goto out;
+        }
+
+        slave_user = strtok_r(slave_url_buf, "@", &save_ptr);
+        slave_ip = strtok_r(NULL, "@", &save_ptr);
+    } else {
+        slave_user = "root";
+        slave_ip = slave_url;
+    }
+
+    if (!slave_user || !slave_ip) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_SLAVE_URL_INVALID,
+               "Invalid slave url.");
+        goto out;
+    }
+
+    snprintf(log_file_path, sizeof(log_file_path), "%s/create_verify_log",
+             priv->logdir);
+
+    runinit(&runner);
+    runner_add_args(&runner, GSYNCD_PREFIX "/gverify.sh", NULL);
+    runner_argprintf(&runner, "%s", volname);
+    runner_argprintf(&runner, "%s", slave_user);
+    runner_argprintf(&runner, "%s", slave_ip);
+    runner_argprintf(&runner, "%s", slave_vol);
+    runner_argprintf(&runner, "%d", ssh_port);
+    runner_argprintf(&runner, "%s", log_file_path);
+    /* "-" stands in for the address family when none is configured. */
+    ret = dict_get_str(this->options, "transport.address-family", &af);
+    if (ret)
+        af = "-";
+
+    runner_argprintf(&runner, "%s", af);
+
+    /* Exactly eight arguments (argv[0..7]) were added above. */
+    gf_msg_debug(this->name, 0, "gverify Args = %s %s %s %s %s %s %s %s",
+                 runner.argv[0], runner.argv[1], runner.argv[2], runner.argv[3],
+                 runner.argv[4], runner.argv[5], runner.argv[6],
+                 runner.argv[7]);
+    runner_redir(&runner, STDOUT_FILENO, RUN_PIPE);
+    synclock_unlock(&priv->big_lock);
+    ret = runner_run(&runner);
+    synclock_lock(&priv->big_lock);
+    if (ret) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_INVALID_SLAVE,
+               "Not a valid slave");
+        ret = glusterd_gsync_read_frm_status(log_file_path, buf, sizeof(buf));
+        if (ret <= 0) {
+            gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_READ_ERROR,
+                   "Unable to read from %s", log_file_path);
+            /* The script already failed. A read result of 0 must not be
+             * passed on as this function's return value, or the failed
+             * verification would be reported as success. */
+            ret = -1;
+            goto out;
+        }
+
+        /* Tokenize the error message from gverify.sh to figure out
+         * if the error is a force blocker or not. */
+        tmp = strtok_r(buf, "|", &save_ptr);
+        if (!tmp) {
+            ret = -1;
+            goto out;
+        }
+        if (!strcmp(tmp, "FORCE_BLOCKER"))
+            *is_force_blocker = 1;
+        else {
+            /* No FORCE_BLOCKER flag present so all that is
+             * present is the error message. */
+            *is_force_blocker = 0;
+            *op_errstr = gf_strdup(tmp);
+            ret = -1;
+            goto out;
+        }
+
+        /* Copy rest of the error message to op_errstr */
+        tmp = strtok_r(NULL, "|", &save_ptr);
+        if (tmp)
+            *op_errstr = gf_strdup(tmp);
+        ret = -1;
+        goto out;
+    }
+    ret = 0;
+out:
+    GF_FREE(slave_url_buf);
+    sys_unlink(log_file_path);
+    gf_msg_debug(this->name, 0, "Returning %d", ret);
+    return ret;
+}
+
+/*
+ * glusterd_geo_rep_parse_slave:
+ *      Extracts the host part of @slave_url, which is either "host" or
+ *      "user@host". @slave_url remains unmodified; parsing is done on a
+ *      private copy.
+ *
+ *      @hostname may be NULL (validation only). When it is not NULL and
+ *      the call succeeds, *hostname is a gf_strdup()'ed string.
+ *      @op_errstr may be NULL; it is set only when the host part itself
+ *      contains an '@'.
+ *
+ * RETURN VALUE:
+ *       0: @slave_url is well formed.
+ *      -1: allocation failure, more than one '@', or an empty user or
+ *          host part. On failure *hostname is released and set to NULL.
+ */
+int
+glusterd_geo_rep_parse_slave(char *slave_url, char **hostname, char **op_errstr)
+{
+    int ret = -1;
+    int len = 0;
+    char *tmp = NULL;
+    char *save_ptr = NULL;
+    char *host = NULL;
+    char errmsg[PATH_MAX] = "";
+    char *saved_url = NULL;
+    xlator_t *this = NULL;
+
+    this = THIS;
+    GF_ASSERT(this);
+
+    GF_ASSERT(slave_url);
+    GF_ASSERT(*slave_url);
+
+    saved_url = gf_strdup(slave_url);
+    if (!saved_url)
+        goto out;
+
+    /* Checking if hostname has user specified */
+    host = strstr(saved_url, "@");
+    if (!host) { /* no user specified */
+        if (hostname) {
+            *hostname = gf_strdup(saved_url);
+            if (!*hostname)
+                goto out;
+        }
+
+        ret = 0;
+        goto out;
+    } else {
+        /* Moving the host past the '@' and checking if the
+         * actual hostname also has '@' */
+        host++;
+        if (strstr(host, "@")) {
+            gf_msg_debug(this->name, 0, "host = %s", host);
+            /* snprintf() NUL-terminates within the given size, so the
+             * buffer is never indexed with its return value (which can
+             * exceed the buffer on truncation or be negative on error). */
+            len = snprintf(errmsg, sizeof(errmsg), "Invalid Hostname (%s).",
+                           host);
+            if (len < 0) {
+                strcpy(errmsg, "<error>");
+            }
+            gf_msg(this->name, GF_LOG_ERROR, EINVAL, GD_MSG_INVALID_ENTRY, "%s",
+                   errmsg);
+            ret = -1;
+            if (op_errstr)
+                *op_errstr = gf_strdup(errmsg);
+            goto out;
+        }
+
+        ret = -1;
+
+        /**
+         * preliminary check for valid slave format.
+         */
+        /* The first token is the user, the second the host; a missing
+         * second token means the user or host part is empty. */
+        tmp = strtok_r(saved_url, "@", &save_ptr);
+        tmp = strtok_r(NULL, "@", &save_ptr);
+        if (!tmp)
+            goto out;
+        if (hostname) {
+            *hostname = gf_strdup(tmp);
+            if (!*hostname)
+                goto out;
+        }
+    }
+
+    ret = 0;
+out:
+    GF_FREE(saved_url);
+    if (ret && hostname) {
+        GF_FREE(*hostname);
+        /* Do not leave the caller with a dangling pointer. */
+        *hostname = NULL;
+    }
+    gf_msg_debug(this->name, 0, "Returning %d", ret);
+    return ret;
+}
+
+/* dict_foreach() callback run over a volume's gsync_slaves entries (see
+ * glusterd_get_slavehost_from_voluuid()).
+ *
+ * @value  entry whose data has the form
+ *         master_node_uuid:ssh://[user@]slave_host::slave_vol:slave_voluuid
+ * @data   struct slave_vol_config; slave_voluuid is the uuid to look for,
+ *         old_slvuser/old_slvhost are filled in when it is found.
+ *
+ * Returns:
+ *    0  the entry is empty or carries a different slave volume uuid
+ *   -1  the slave volume uuid matches; old_slvuser and old_slvhost are set
+ *   -2  the entry could not be parsed
+ */
+static int
+get_slavehost_from_voluuid(dict_t *dict, char *key, data_t *value, void *data)
+{
+    char *slave_info = NULL;
+    char *tmp = NULL;
+    char *slave_host = NULL;
+    xlator_t *this = NULL;
+    struct slave_vol_config *slave_vol = NULL;
+    int i = 0;
+    int ret = -1;
+
+    this = THIS;
+    GF_VALIDATE_OR_GOTO("glusterd", this, out);
+
+    slave_vol = data;
+    slave_info = value->data;
+
+    gf_msg_debug(this->name, 0, "slave_info:%s !", slave_info);
+
+    if (!(slave_info) || strlen(slave_info) == 0) {
+        /* no slaves present, peace  */
+        ret = 0;
+        goto out;
+    }
+
+    /* slave format:
+     * master_node_uuid:ssh://slave_host::slave_vol:slave_voluuid
+     * Skipping five ':' leaves slave_info at the slave volume uuid. */
+    while (i++ < 5) {
+        slave_info = strchr(slave_info, ':');
+        if (slave_info)
+            slave_info++;
+        else
+            break;
+    }
+
+    if (!(slave_info) || strlen(slave_info) == 0) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_SLAVE_VOL_PARSE_FAIL,
+               "slave_info format is wrong!");
+        ret = -2;
+        goto out;
+    } else {
+        if (strcmp(slave_info, slave_vol->slave_voluuid) == 0) {
+            ret = -1;
+
+            /* get corresponding slave host for reference*/
+            slave_host = value->data;
+            slave_host = strstr(slave_host, "://");
+            if (slave_host) {
+                slave_host += 3;
+            } else {
+                gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_SLAVE_VOL_PARSE_FAIL,
+                       "Invalid slave_host format!");
+                ret = -2;
+                goto out;
+            }
+            /* To go past username in non-root geo-rep session */
+            tmp = strchr(slave_host, '@');
+            if (tmp) {
+                if ((tmp - slave_host) >= LOGIN_NAME_MAX) {
+                    gf_msg(this->name, GF_LOG_ERROR, 0,
+                           GD_MSG_SLAVE_VOL_PARSE_FAIL,
+                           "Invalid slave user length in %s", slave_host);
+                    ret = -2;
+                    goto out;
+                }
+                /* strncpy() copies exactly (tmp - slave_host) bytes and
+                 * adds no terminator here, so the NUL belongs right
+                 * after the last copied byte. */
+                strncpy(slave_vol->old_slvuser, slave_host, (tmp - slave_host));
+                slave_vol->old_slvuser[(tmp - slave_host)] = '\0';
+                slave_host = tmp + 1;
+            } else
+                strcpy(slave_vol->old_slvuser, "root");
+
+            tmp = strchr(slave_host, ':');
+            if (!tmp) {
+                gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_SLAVE_VOL_PARSE_FAIL,
+                       "Invalid slave_host!");
+                ret = -2;
+                goto out;
+            }
+
+            /* NOTE(review): the host length is not checked against the
+             * capacity of old_slvhost, which is declared outside this
+             * function -- confirm it is large enough for any stored
+             * host name. */
+            strncpy(slave_vol->old_slvhost, slave_host, (tmp - slave_host));
+            slave_vol->old_slvhost[(tmp - slave_host)] = '\0';
+
+            goto out;
+        }
+    }
+
+    ret = 0;
+out:
+    return ret;
+}
+
+/* Look through the gsync_slaves entries of @volinfo for a session whose
+ * slave volume uuid equals slave1->slave_voluuid. The per-entry work is
+ * done by get_slavehost_from_voluuid(), which stores the matching
+ * session's slave user and host in @slave1.
+ *
+ * @slave_host and @slave_vol are not used here.
+ *
+ * Returns the result of dict_foreach(), or -1 when @volinfo is NULL. */
+static int
+glusterd_get_slavehost_from_voluuid(glusterd_volinfo_t *volinfo,
+                                    char *slave_host, char *slave_vol,
+                                    struct slave_vol_config *slave1)
+{
+    xlator_t *this = THIS;
+    int ret = -1;
+
+    GF_VALIDATE_OR_GOTO(this->name, volinfo, out);
+
+    ret = dict_foreach(volinfo->gsync_slaves, get_slavehost_from_voluuid,
+                       slave1);
+
+out:
+    return ret;
+}
+
+/* Staging (validation) step of "geo-replication create".
+ *
+ * @dict       operation dictionary. Read: volume/slave arguments, "force",
+ *             "ssh_port", "no_verify", "push_pem". Written: "statefile",
+ *             "slave_voluuid" and, when a session with the same slave
+ *             volume uuid already exists and force is given,
+ *             "old_slavehost" and "existing_session".
+ * @op_errstr  receives a gf_strdup()'d error message on failure.
+ *
+ * Checks performed only on the node whose uuid equals the originator's
+ * host uuid: all peers of the volume are up, the slave passes
+ * glusterd_verify_slave() (unless "no_verify"), and the files needed for
+ * push-pem exist. Checks performed on every node: statefile name can be
+ * derived, the session directory does not already exist (unless force),
+ * the slave volume uuid can be fetched and is not already in use by
+ * another running session, and gsyncd can be spawned.
+ *
+ * Returns 0 on success and a non-zero value on failure.
+ */
+int
+glusterd_op_stage_gsync_create(dict_t *dict, char **op_errstr)
+{
+    char *down_peerstr = NULL;
+    char *slave = NULL;
+    char *volname = NULL;
+    char *host_uuid = NULL;
+    char *statefile = NULL;
+    char *slave_url = NULL;
+    char *slave_host = NULL;
+    char *slave_vol = NULL;
+    char *conf_path = NULL;
+    char errmsg[PATH_MAX] = "";
+    char common_pem_file[PATH_MAX] = "";
+    char hook_script[PATH_MAX] = "";
+    char uuid_str[64] = "";
+    int ret = -1;
+    int is_pem_push = -1;
+    int ssh_port = 22;
+    gf_boolean_t is_force = -1;
+    gf_boolean_t is_no_verify = -1;
+    gf_boolean_t is_force_blocker = -1;
+    gf_boolean_t is_template_in_use = _gf_false;
+    glusterd_conf_t *conf = NULL;
+    glusterd_volinfo_t *volinfo = NULL;
+    struct stat stbuf = {
+        0,
+    };
+    xlator_t *this = NULL;
+    struct slave_vol_config slave1 = {
+        {0},
+    };
+    char old_slave_url[SLAVE_URL_INFO_MAX] = {0};
+    char old_confpath[PATH_MAX] = {0};
+    gf_boolean_t is_running = _gf_false;
+    char *statedir = NULL;
+    char statefiledir[PATH_MAX] = {
+        0,
+    };
+    gf_boolean_t is_different_slavehost = _gf_false;
+    gf_boolean_t is_different_username = _gf_false;
+    char *slave_user = NULL;
+    char *save_ptr = NULL;
+    char *slave_url_buf = NULL;
+    int32_t len = 0;
+
+    this = THIS;
+    GF_ASSERT(this);
+    conf = this->private;
+    GF_ASSERT(conf);
+
+    ret = glusterd_op_gsync_args_get(dict, op_errstr, &volname, &slave,
+                                     &host_uuid);
+    if (ret) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_ARG_FETCH_ERROR,
+               "Unable to fetch arguments");
+        gf_msg_debug(this->name, 0, "Returning %d", ret);
+        return -1;
+    }
+
+    if (conf->op_version < 2) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_UNSUPPORTED_VERSION,
+               "Op Version not supported.");
+        /* errmsg is copied into *op_errstr at 'out'; duplicating it here
+         * as well would leak one of the two copies. */
+        snprintf(errmsg, sizeof(errmsg),
+                 "One or more nodes do not"
+                 " support the required op version.");
+        ret = -1;
+        goto out;
+    }
+
+    ret = glusterd_volinfo_find(volname, &volinfo);
+    if (ret) {
+        gf_msg(this->name, GF_LOG_WARNING, 0, GD_MSG_VOL_NOT_FOUND,
+               "volume name does not exist");
+        snprintf(errmsg, sizeof(errmsg),
+                 "Volume name %s does not"
+                 " exist",
+                 volname);
+        goto out;
+    }
+
+    ret = glusterd_get_slave_details_confpath(volinfo, dict, &slave_url,
+                                              &slave_host, &slave_vol,
+                                              &conf_path, op_errstr);
+    if (ret) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_SLAVEINFO_FETCH_ERROR,
+               "Unable to fetch slave or confpath details.");
+        ret = -1;
+        goto out;
+    }
+
+    is_force = dict_get_str_boolean(dict, "force", _gf_false);
+
+    /* The checks in this branch run only on the node whose uuid matches
+     * host_uuid. */
+    uuid_utoa_r(MY_UUID, uuid_str);
+    if (!strcmp(uuid_str, host_uuid)) {
+        ret = glusterd_are_vol_all_peers_up(volinfo, &conf->peers,
+                                            &down_peerstr);
+        if ((ret == _gf_false) && !is_force) {
+            snprintf(errmsg, sizeof(errmsg),
+                     "Peer %s,"
+                     " which is a part of %s volume, is"
+                     " down. Please bring up the peer and"
+                     " retry.",
+                     down_peerstr, volinfo->volname);
+            gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_PEER_DISCONNECTED, "%s",
+                   errmsg);
+            /* This path returns directly, so the message has to be
+             * handed over here rather than at 'out'. */
+            *op_errstr = gf_strdup(errmsg);
+            GF_FREE(down_peerstr);
+            down_peerstr = NULL;
+            gf_msg_debug(this->name, 0, "Returning %d", ret);
+            return -1;
+        } else if (ret == _gf_false) {
+            gf_msg(this->name, GF_LOG_INFO, 0, GD_MSG_PEER_DISCONNECTED,
+                   "Peer %s, which is a part of %s volume, is"
+                   " down. Force creating geo-rep session."
+                   " On bringing up the peer, re-run"
+                   " \"gluster system:: execute"
+                   " gsec_create\" and \"gluster volume"
+                   " geo-replication %s %s create push-pem"
+                   " force\"",
+                   down_peerstr, volinfo->volname, volinfo->volname, slave);
+            GF_FREE(down_peerstr);
+            down_peerstr = NULL;
+        }
+
+        /* A missing "ssh_port" key is fine: the default of 22 is kept. */
+        ret = dict_get_int32(dict, "ssh_port", &ssh_port);
+        if (ret < 0 && ret != -ENOENT) {
+            snprintf(errmsg, sizeof(errmsg),
+                     "Fetching ssh_port failed while "
+                     "handling " GEOREP " options");
+            gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_DICT_GET_FAILED, "%s",
+                   errmsg);
+            goto out;
+        }
+
+        is_no_verify = dict_get_str_boolean(dict, "no_verify", _gf_false);
+
+        if (!is_no_verify) {
+            /* Checking if slave host is pingable, has proper passwordless
+             * ssh login setup, slave volume is created, slave vol is empty,
+             * and if it has enough memory and bypass in case of force if
+             * the error is not a force blocker */
+            ret = glusterd_verify_slave(volname, slave_url, slave_vol, ssh_port,
+                                        op_errstr, &is_force_blocker);
+            if (ret) {
+                if (is_force && !is_force_blocker) {
+                    gf_msg(this->name, GF_LOG_INFO, 0, GD_MSG_INVALID_SLAVE,
+                           "%s is not a valid slave "
+                           "volume. Error: %s. Force "
+                           "creating geo-rep"
+                           " session.",
+                           slave, *op_errstr);
+                } else {
+                    gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_INVALID_SLAVE,
+                           "%s is not a valid slave "
+                           "volume. Error: %s",
+                           slave, *op_errstr);
+                    ret = -1;
+
+                    goto out;
+                }
+            }
+        }
+
+        ret = dict_get_int32(dict, "push_pem", &is_pem_push);
+        if (!ret && is_pem_push) {
+            ret = snprintf(common_pem_file, sizeof(common_pem_file),
+                           "%s" GLUSTERD_COMMON_PEM_PUB_FILE, conf->workdir);
+            if ((ret < 0) || (ret >= sizeof(common_pem_file))) {
+                ret = -1;
+                goto out;
+            }
+
+            ret = snprintf(hook_script, sizeof(hook_script),
+                           "%s" GLUSTERD_CREATE_HOOK_SCRIPT, conf->workdir);
+            if ((ret < 0) || (ret >= sizeof(hook_script))) {
+                ret = -1;
+                goto out;
+            }
+
+            ret = sys_lstat(common_pem_file, &stbuf);
+            if (ret) {
+                len = snprintf(errmsg, sizeof(errmsg),
+                               "%s"
+                               " required for push-pem is"
+                               " not present. Please run"
+                               " \"gluster system:: execute"
+                               " gsec_create\"",
+                               common_pem_file);
+                if (len < 0) {
+                    strcpy(errmsg, "<error>");
+                }
+                gf_msg(this->name, GF_LOG_ERROR, ENOENT, GD_MSG_FILE_OP_FAILED,
+                       "%s", errmsg);
+                ret = -1;
+                goto out;
+            }
+
+            /* The file type has to be examined now, while stbuf still
+             * describes common_pem_file; the lstat() of the hook script
+             * below reuses the same buffer. */
+            if (!S_ISREG(stbuf.st_mode)) {
+                len = snprintf(errmsg, sizeof(errmsg),
+                               "%s"
+                               " required for push-pem is"
+                               " not a regular file. Please"
+                               " run \"gluster system:: "
+                               "execute gsec_create\"",
+                               common_pem_file);
+                if (len < 0) {
+                    strcpy(errmsg, "<error>");
+                }
+                gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_REG_FILE_MISSING,
+                       "%s", errmsg);
+                ret = -1;
+                goto out;
+            }
+
+            ret = sys_lstat(hook_script, &stbuf);
+            if (ret) {
+                len = snprintf(errmsg, sizeof(errmsg),
+                               "The hook-script (%s) "
+                               "required for push-pem is not "
+                               "present. Please install the "
+                               "hook-script and retry",
+                               hook_script);
+                if (len < 0) {
+                    strcpy(errmsg, "<error>");
+                }
+                gf_msg(this->name, GF_LOG_ERROR, ENOENT, GD_MSG_FILE_OP_FAILED,
+                       "%s", errmsg);
+                ret = -1;
+                goto out;
+            }
+        }
+    }
+
+    ret = glusterd_get_statefile_name(volinfo, slave, conf_path, &statefile,
+                                      &is_template_in_use);
+    if (ret) {
+        if (!strstr(slave, "::"))
+            snprintf(errmsg, sizeof(errmsg), "%s is not a valid slave url.",
+                     slave);
+        else
+            snprintf(errmsg, sizeof(errmsg),
+                     "Please check gsync "
+                     "config file. Unable to get statefile's name");
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_STATEFILE_NAME_NOT_FOUND,
+               "%s", errmsg);
+        ret = -1;
+        goto out;
+    }
+
+    ret = dict_set_str(dict, "statefile", statefile);
+    if (ret) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_DICT_SET_FAILED,
+               "Unable to store statefile path");
+        goto out;
+    }
+
+    /* dirname() may modify its argument, hence the scratch copy. */
+    if (snprintf(statefiledir, sizeof(statefiledir), "%s", statefile) >=
+        sizeof(statefiledir)) {
+        snprintf(errmsg, sizeof(errmsg), "Failed copying statefiledir");
+        /* ret is still 0 from dict_set_str(); without this the
+         * truncation would be reported as success. */
+        ret = -1;
+        goto out;
+    }
+    statedir = dirname(statefiledir);
+
+    /* An existing session directory means the session was created before. */
+    ret = sys_lstat(statedir, &stbuf);
+    if (!ret && !is_force) {
+        snprintf(errmsg, sizeof(errmsg),
+                 "Session between %s"
+                 " and %s is already created.",
+                 volinfo->volname, slave);
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_SESSION_ALREADY_EXIST, "%s",
+               errmsg);
+        ret = -1;
+        goto out;
+    } else if (!ret)
+        gf_msg(this->name, GF_LOG_INFO, 0, GD_MSG_FORCE_CREATE_SESSION,
+               "Session between %s and %s is already created. Force"
+               " creating again.",
+               volinfo->volname, slave);
+
+    ret = glusterd_get_slave_voluuid(slave_host, slave_vol,
+                                     slave1.slave_voluuid);
+    if ((ret) || (strlen(slave1.slave_voluuid) == 0)) {
+        snprintf(errmsg, sizeof(errmsg), "Unable to get remote volume uuid.");
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_REMOTE_VOL_UUID_FAIL, "%s",
+               errmsg);
+        ret = -1;
+        goto out;
+    }
+
+    ret = dict_set_dynstr_with_alloc(dict, "slave_voluuid",
+                                     slave1.slave_voluuid);
+    if (ret) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_DICT_SET_FAILED,
+               "Unable to set slave volume uuid in the dict");
+        goto out;
+    }
+
+    /* Check whether session is already created using slave volume uuid.
+     * -1 means a session with this uuid exists, -2 means a stored entry
+     * could not be parsed. */
+    ret = glusterd_get_slavehost_from_voluuid(volinfo, slave_host, slave_vol,
+                                              &slave1);
+    if (ret == -1) {
+        if (!is_force) {
+            snprintf(errmsg, sizeof(errmsg),
+                     "Session between %s"
+                     " and %s:%s is already created! Cannot create "
+                     "with new slave:%s again!",
+                     volinfo->volname, slave1.old_slvhost, slave_vol,
+                     slave_host);
+            gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_FORCE_CREATE_SESSION,
+                   "Session between"
+                   " %s and %s:%s is already created! "
+                   "Cannot create with new slave:%s again!",
+                   volinfo->volname, slave1.old_slvhost, slave_vol, slave_host);
+            goto out;
+        }
+
+        /* There is a remote possibility that slave_host can be NULL when
+           control reaches here. Add a check so we wouldn't crash in next
+           line */
+        if (!slave_host)
+            goto out;
+
+        /* Now, check whether session is already started.If so, warn!*/
+        is_different_slavehost = (strcmp(slave_host, slave1.old_slvhost) != 0)
+                                     ? _gf_true
+                                     : _gf_false;
+
+        if (strstr(slave_url, "@")) {
+            /* strtok_r() writes into its input; tokenize a copy. */
+            slave_url_buf = gf_strdup(slave_url);
+            if (!slave_url_buf) {
+                gf_msg(this->name, GF_LOG_ERROR, ENOMEM, GD_MSG_NO_MEMORY,
+                       "Unable to allocate memory");
+                ret = -1;
+                goto out;
+            }
+            slave_user = strtok_r(slave_url_buf, "@", &save_ptr);
+        } else
+            slave_user = "root";
+        is_different_username = (strcmp(slave_user, slave1.old_slvuser) != 0)
+                                    ? _gf_true
+                                    : _gf_false;
+
+        /* Do the check, only if different slave host/slave user */
+        if (is_different_slavehost || is_different_username) {
+            len = snprintf(old_confpath, sizeof(old_confpath),
+                           "%s/" GEOREP "/%s_%s_%s/gsyncd.conf", conf->workdir,
+                           volinfo->volname, slave1.old_slvhost, slave_vol);
+            if ((len < 0) || (len >= sizeof(old_confpath))) {
+                ret = -1;
+                goto out;
+            }
+
+            /* construct old slave url with (old) slave host */
+            len = snprintf(old_slave_url, sizeof(old_slave_url), "%s::%s",
+                           slave1.old_slvhost, slave_vol);
+            if ((len < 0) || (len >= sizeof(old_slave_url))) {
+                ret = -1;
+                goto out;
+            }
+
+            ret = glusterd_check_gsync_running_local(
+                volinfo->volname, old_slave_url, old_confpath, &is_running);
+            if (_gf_true == is_running) {
+                (void)snprintf(errmsg, sizeof(errmsg),
+                               "Geo"
+                               "-replication session between %s and %s"
+                               " is still active. Please stop the "
+                               "session and retry.",
+                               volinfo->volname, old_slave_url);
+                ret = -1;
+                goto out;
+            }
+        }
+
+        ret = dict_set_dynstr_with_alloc(dict, "old_slavehost",
+                                         slave1.old_slvhost);
+        if (ret) {
+            gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_DICT_SET_FAILED,
+                   "Unable to set old_slavehost in the dict");
+            goto out;
+        }
+
+        ret = dict_set_int32(dict, "existing_session", _gf_true);
+        if (ret) {
+            gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_DICT_SET_FAILED,
+                   "Unable to set existing_session in the dict");
+            goto out;
+        }
+    } else if (ret == -2) {
+        snprintf(errmsg, sizeof(errmsg),
+                 "get_slavehost_from_voluuid"
+                 " failed for %s::%s. Please check the glusterd logs.",
+                 slave_host, slave_vol);
+        gf_msg(this->name, GF_LOG_INFO, 0, GD_MSG_FORCE_CREATE_SESSION,
+               "get_slavehost_from_voluuid failed %s %s!!", slave_host,
+               slave_vol);
+        goto out;
+    }
+
+    ret = glusterd_verify_gsyncd_spawn(volinfo->volname, slave);
+    if (ret) {
+        snprintf(errmsg, sizeof(errmsg), "Unable to spawn gsyncd.");
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_GSYNCD_SPAWN_FAILED, "%s",
+               errmsg);
+        goto out;
+    }
+
+    ret = 0;
+out:
+
+    /* Single hand-over point for every message built in errmsg. */
+    if (ret && errmsg[0] != '\0')
+        *op_errstr = gf_strdup(errmsg);
+
+    if (slave_url_buf)
+        GF_FREE(slave_url_buf);
+
+    return ret;
+}
+
+/* Pre-condition check for geo-rep pause/resume.
+ *
+ * Reads the monitor status from @statefile and rejects a pause request
+ * when that status contains "Paused", and a resume request when it does
+ * not. On rejection a gf_strdup()'d message is stored in @op_errstr.
+ *
+ * Return: 0 when the requested operation may proceed
+ *        -1 when the status cannot be read or the check fails.
+ */
+static int
+gd_pause_resume_validation(int type, glusterd_volinfo_t *volinfo, char *slave,
+                           char *statefile, char **op_errstr)
+{
+    char status[NAME_MAX] = "";
+    char err[PATH_MAX] = "";
+    gf_boolean_t paused = _gf_false;
+    int rc = -1;
+
+    GF_ASSERT(volinfo);
+    GF_ASSERT(slave);
+    GF_ASSERT(statefile);
+    GF_ASSERT(op_errstr);
+
+    if (glusterd_gsync_read_frm_status(statefile, status, sizeof(status)) <=
+        0) {
+        snprintf(err, sizeof(err),
+                 "Pause check Failed:"
+                 " Geo-rep session is not setup");
+        goto out;
+    }
+
+    paused = strstr(status, "Paused") ? _gf_true : _gf_false;
+
+    if ((type == GF_GSYNC_OPTION_TYPE_PAUSE) && paused) {
+        snprintf(err, sizeof(err),
+                 "Geo-replication"
+                 " session between %s and %s already Paused.",
+                 volinfo->volname, slave);
+        goto out;
+    }
+
+    if ((type == GF_GSYNC_OPTION_TYPE_RESUME) && !paused) {
+        snprintf(err, sizeof(err),
+                 "Geo-replication"
+                 " session between %s and %s is not Paused.",
+                 volinfo->volname, slave);
+        goto out;
+    }
+
+    rc = 0;
+
+out:
+    /* Every failure path above has filled in err. */
+    if (rc && (err[0] != '\0'))
+        *op_errstr = gf_strdup(err);
+
+    return rc;
+}
+
+int
+glusterd_op_stage_gsync_set(dict_t *dict, char **op_errstr)
+{
+    int ret = 0;
+    int type = 0;
+    char *volname = NULL;
+    char *slave = NULL;
+    char *slave_url = NULL;
+    char *slave_host = NULL;
+    char *slave_vol = NULL;
+    char *down_peerstr = NULL;
+    char *statefile = NULL;
+    char statefiledir[PATH_MAX] = {
+        0,
+    };
+    char *statedir = NULL;
+    char *path_list = NULL;
+    char *conf_path = NULL;
+    glusterd_volinfo_t *volinfo = NULL;
+    char errmsg[PATH_MAX] = {
+        0,
+    };
+    dict_t *ctx = NULL;
+    gf_boolean_t is_force = 0;
+    gf_boolean_t is_running = _gf_false;
+    gf_boolean_t is_template_in_use = _gf_false;
+    uuid_t uuid = {0};
+    char uuid_str[64] = {0};
+    char *host_uuid = NULL;
+    xlator_t *this = NULL;
+    glusterd_conf_t *conf = NULL;
+    struct stat stbuf = {
+        0,
+    };
+
+    this = THIS;
+    GF_ASSERT(this);
+    conf = this->private;
+    GF_ASSERT(conf);
+
+    ret = dict_get_int32(dict, "type", &type);
+    if (ret < 0) {
+        gf_msg(this->name, GF_LOG_WARNING, 0, GD_MSG_DICT_GET_FAILED,
+               "command type not found");
+        *op_errstr = gf_strdup("command unsuccessful");
+        goto out;
+    }
+
+    if (type == GF_GSYNC_OPTION_TYPE_STATUS) {
+        ret = glusterd_verify_gsync_status_opts(dict, op_errstr);
+        goto out;
+    }
+
+    ret = glusterd_op_gsync_args_get(dict, op_errstr, &volname, &slave,
+                                     &host_uuid);
+    if (ret)
+        goto out;
+
+    uuid_utoa_r(MY_UUID, uuid_str);
+
+    if (conf->op_version < 2) {
+        snprintf(errmsg, sizeof(errmsg),
+                 "One or more nodes do not"
+                 " support the required op version.");
+        ret = -1;
+        goto out;
+    }
+
+    ret = glusterd_volinfo_find(volname, &volinfo);
+    if (ret) {
+        snprintf(errmsg, sizeof(errmsg),
+                 "Volume name %s does not"
+                 " exist",
+                 volname);
+        goto out;
+    }
+
+    ret = glusterd_get_slave_details_confpath(volinfo, dict, &slave_url,
+                                              &slave_host, &slave_vol,
+                                              &conf_path, op_errstr);
+    if (ret) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_SLAVEINFO_FETCH_ERROR,
+               "Unable to fetch slave or confpath details.");
+        ret = -1;
+        goto out;
+    }
+
+    is_force = dict_get_str_boolean(dict, "force", _gf_false);
+
+    ret = glusterd_get_statefile_name(volinfo, slave, conf_path, &statefile,
+                                      &is_template_in_use);
+    if (ret) {
+        if (!strstr(slave, "::")) {
+            snprintf(errmsg, sizeof(errmsg), "%s is not a valid slave url.",
+                     slave);
+            ret = -1;
+            goto out;
+        } else {
+            gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_SLAVE_URL_INVALID,
+                   "state_file entry missing in config file (%s)", conf_path);
+
+            if ((type == GF_GSYNC_OPTION_TYPE_STOP) && is_force) {
+                gf_msg(this->name, GF_LOG_WARNING, 0, GD_MSG_STOP_FORCE,
+                       "Allowing stop "
+                       "force to bypass missing statefile "
+                       "entry in config file (%s), and "
+                       "template file",
+                       conf_path);
+                ret = 0;
+            } else
+                goto out;
+        }
+    } else {
+        ret = dict_set_str(dict, "statefile", statefile);
+        if (ret) {
+            gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_DICT_SET_FAILED,
+                   "Unable to store statefile path");
+            goto out;
+        }
+    }
+
+    /* Allowing stop force to bypass the statefile check
+     * as this command acts as a fail safe method to stop geo-rep
+     * session. */
+    if (!((type == GF_GSYNC_OPTION_TYPE_STOP) && is_force)) {
+        /* check session directory as statefile may not present
+         * during upgrade */
+        if (snprintf(statefiledir, sizeof(statefiledir), "%s", statefile) >=
+            sizeof(statefiledir)) {
+            snprintf(errmsg, sizeof(errmsg), "Failed copying statefiledir");
+            ret = -1;
+            goto out;
+        }
+        statedir = dirname(statefiledir);
+
+        ret = sys_lstat(statedir, &stbuf);
+        if (ret) {
+            snprintf(errmsg, sizeof(errmsg),
+                     "Geo-replication"
+                     " session between %s and %s does not exist.",
+                     volinfo->volname, slave);
+            gf_msg(this->name, GF_LOG_ERROR, ENOENT, GD_MSG_FILE_OP_FAILED,
+                   "%s. statefile = %s", errmsg, statefile);
+            ret = -1;
+            goto out;
+        }
+    }
+
+    /* Check if all peers that are a part of the volume are up or not */
+    if ((type == GF_GSYNC_OPTION_TYPE_DELETE) ||
+        ((type == GF_GSYNC_OPTION_TYPE_STOP) && !is_force) ||
+        (type == GF_GSYNC_OPTION_TYPE_PAUSE) ||
+        (type == GF_GSYNC_OPTION_TYPE_RESUME)) {
+        if (!strcmp(uuid_str, host_uuid)) {
+            ret = glusterd_are_vol_all_peers_up(volinfo, &conf->peers,
+                                                &down_peerstr);
+            if (ret == _gf_false) {
+                snprintf(errmsg, sizeof(errmsg),
+                         "Peer %s,"
+                         " which is a part of %s volume, is"
+                         " down. Please bring up the peer and"
+                         " retry.",
+                         down_peerstr, volinfo->volname);
+                ret = -1;
+                GF_FREE(down_peerstr);
+                down_peerstr = NULL;
+                goto out;
+            }
+        }
+    }
+
+    switch (type) {
+        case GF_GSYNC_OPTION_TYPE_START:
+            if (is_template_in_use) {
+                snprintf(errmsg, sizeof(errmsg),
+                         "state-file entry "
+                         "missing in the config file(%s).",
+                         conf_path);
+                ret = -1;
+                goto out;
+            }
+
+            ret = glusterd_op_verify_gsync_start_options(
+                volinfo, slave, conf_path, statefile, op_errstr, is_force);
+            if (ret)
+                goto out;
+            ctx = glusterd_op_get_ctx();
+            if (ctx) {
+                /* gsyncd does a fuse mount to start
+                 * the geo-rep session */
+                if (!glusterd_is_fuse_available()) {
+                    gf_msg("glusterd", GF_LOG_ERROR, errno,
+                           GD_MSG_GEO_REP_START_FAILED,
+                           "Unable "
+                           "to open /dev/fuse (%s), "
+                           "geo-replication start failed",
+                           strerror(errno));
+                    snprintf(errmsg, sizeof(errmsg), "fuse unavailable");
+                    ret = -1;
+                    goto out;
+                }
+            }
+            break;
+
+        case GF_GSYNC_OPTION_TYPE_STOP:
+            if (!is_force) {
+                if (is_template_in_use) {
+                    snprintf(errmsg, sizeof(errmsg),
+                             "state-file entry missing in "
+                             "the config file(%s).",
+                             conf_path);
+                    ret = -1;
+                    goto out;
+                }
+
+                ret = glusterd_op_verify_gsync_running(volinfo, slave,
+                                                       conf_path, op_errstr);
+                if (ret) {
+                    ret = glusterd_get_local_brickpaths(volinfo, &path_list);
+                    if (!path_list && ret == -1)
+                        goto out;
+                }
+
+                /* Check for geo-rep session is active or not for
+                 * configured user.*/
+                ret = glusterd_gsync_get_uuid(slave, volinfo, uuid);
+                if (ret) {
+                    snprintf(errmsg, sizeof(errmsg),
+                             "Geo-replication session between %s "
+                             "and %s does not exist.",
+                             volinfo->volname, slave);
+                    ret = -1;
+                    goto out;
+                }
+            }
+            break;
+
+        case GF_GSYNC_OPTION_TYPE_PAUSE:
+        case GF_GSYNC_OPTION_TYPE_RESUME:
+            if (is_template_in_use) {
+                snprintf(errmsg, sizeof(errmsg),
+                         "state-file entry missing in "
+                         "the config file(%s).",
+                         conf_path);
+                ret = -1;
+                goto out;
+            }
+
+            ret = glusterd_op_verify_gsync_running(volinfo, slave, conf_path,
+                                                   op_errstr);
+            if (ret) {
+                ret = glusterd_get_local_brickpaths(volinfo, &path_list);
+                if (!path_list && ret == -1)
+                    goto out;
+            }
+
+            /* Check for geo-rep session is active or not
+             * for configured user.*/
+            ret = glusterd_gsync_get_uuid(slave, volinfo, uuid);
+            if (ret) {
+                snprintf(errmsg, sizeof(errmsg),
+                         "Geo-replication"
+                         " session between %s and %s does not exist.",
+                         volinfo->volname, slave);
+                ret = -1;
+                goto out;
+            }
+
+            if (!is_force) {
+                ret = gd_pause_resume_validation(type, volinfo, slave,
+                                                 statefile, op_errstr);
+                if (ret) {
+                    ret = glusterd_get_local_brickpaths(volinfo, &path_list);
+                    if (!path_list && ret == -1)
+                        goto out;
+                }
+            }
+            break;
+
+        case GF_GSYNC_OPTION_TYPE_CONFIG:
+            if (is_template_in_use) {
+                snprintf(errmsg, sizeof(errmsg),
+                         "state-file entry "
+                         "missing in the config file(%s).",
+                         conf_path);
+                ret = -1;
+                goto out;
+            }
+
+            ret = gsync_verify_config_options(dict, op_errstr, volname);
+            goto out;
+            break;
+
+        case GF_GSYNC_OPTION_TYPE_DELETE:
+            /* Check if the gsync session is still running
+             * If so ask the user to stop geo-replication first.*/
+            if (is_template_in_use) {
+                snprintf(errmsg, sizeof(errmsg),
+                         "state-file entry "
+                         "missing in the config file(%s).",
+                         conf_path);
+                ret = -1;
+                goto out;
+            }
+
+            ret = glusterd_gsync_get_uuid(slave, volinfo, uuid);
+            if (ret) {
+                snprintf(errmsg, sizeof(errmsg),
+                         "Geo-replication"
+                         " session between %s and %s does not exist.",
+                         volinfo->volname, slave);
+                ret = -1;
+                goto out;
+            } else {
+                ret = glusterd_check_gsync_running_local(
+                    volinfo->volname, slave, conf_path, &is_running);
+                if (_gf_true == is_running) {
+                    snprintf(errmsg, sizeof(errmsg),
+                             GEOREP
+                             " session between %s & %s is "
+                             "still active. Please stop the "
+                             "session and retry.",
+                             volinfo->volname, slave);
+                    ret = -1;
+                    goto out;
+                }
+            }
+
+            ret = glusterd_verify_gsyncd_spawn(volinfo->volname, slave);
+            if (ret) {
+                snprintf(errmsg, sizeof(errmsg), "Unable to spawn gsyncd");
+            }
+
+            break;
+    }
+
+out:
+
+    if (path_list)
+        GF_FREE(path_list);
+
+    if (ret && errmsg[0] != '\0') {
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_GSYNCD_ERROR, "%s", errmsg);
+        *op_errstr = gf_strdup(errmsg);
+    }
+
+    gf_msg_debug(this->name, 0, "Returning %d", ret);
+    return ret;
+}
+
+/*
+ * gd_pause_or_resume_gsync:
+ *      Pause (SIGSTOP) or resume (SIGCONT) the gsyncd process group whose
+ *      pid is recorded in the session's pid-file, and update the session
+ *      status file to "Paused" / "Started" accordingly.
+ *
+ *      dict       : op dictionary; the "statefile" key is read from it.
+ *      master     : master volume name.
+ *      slave      : slave of the session.
+ *      slave_host : forwarded to glusterd_create_status_file().
+ *      slave_vol  : forwarded to glusterd_create_status_file().
+ *      conf_path  : gsyncd config file used to locate the pid-file.
+ *      op_errstr  : receives an allocated message on some failures.
+ *      is_pause   : _gf_true to pause, _gf_false to resume.
+ *
+ * RETURN VALUE:
+ *      0 : success, or nothing to signal (gsyncd not running, pid-file
+ *          could not be opened, or pid-file could not be read).
+ *  non-0 : failure.
+ */
+static int
+gd_pause_or_resume_gsync(dict_t *dict, char *master, char *slave,
+                         char *slave_host, char *slave_vol, char *conf_path,
+                         char **op_errstr, gf_boolean_t is_pause)
+{
+    int32_t ret = 0;
+    int32_t rollback_ret = 0;
+    int pfd = -1;
+    long pid = 0;
+    char pidfile[PATH_MAX] = {
+        0,
+    };
+    char errmsg[PATH_MAX] = "";
+    char buf[4096] = {
+        0,
+    };
+    gf_boolean_t is_template_in_use = _gf_false;
+    char monitor_status[NAME_MAX] = {
+        0,
+    };
+    char *statefile = NULL;
+    xlator_t *this = NULL;
+
+    this = THIS;
+    GF_ASSERT(this);
+    GF_ASSERT(dict);
+    GF_ASSERT(master);
+    GF_ASSERT(slave);
+    GF_ASSERT(slave_host);
+    GF_ASSERT(slave_vol);
+    GF_ASSERT(conf_path);
+
+    pfd = gsyncd_getpidfile(master, slave, pidfile, conf_path,
+                            &is_template_in_use);
+    if (pfd == -2) {
+        snprintf(errmsg, sizeof(errmsg),
+                 "pid-file entry mising in config file and "
+                 "template config file.");
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_PIDFILE_NOT_FOUND, "%s",
+               errmsg);
+        *op_errstr = gf_strdup(errmsg);
+        ret = -1;
+        goto out;
+    }
+
+    if (gsync_status_byfd(pfd) == -1) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_GSYNCD_ERROR,
+               "gsyncd b/w %s & %s is not running", master, slave);
+        /* monitor gsyncd already dead; ret is still 0 here */
+        goto out;
+    }
+
+    if (pfd < 0)
+        goto out;
+
+    /* Prepare to update status file*/
+    ret = dict_get_str(dict, "statefile", &statefile);
+    if (ret) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_DICT_GET_FAILED,
+               "Pause/Resume Failed: Unable to fetch statefile path");
+        goto out;
+    }
+    ret = glusterd_gsync_read_frm_status(statefile, monitor_status,
+                                         sizeof(monitor_status));
+    if (ret <= 0) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_STAT_FILE_READ_FAILED,
+               "Pause/Resume Failed: "
+               "Unable to read status file for %s(master)"
+               " %s(slave)",
+               master, slave);
+        /* A return of 0 means the status file was empty; that must not be
+         * reported to the caller as success. */
+        ret = -1;
+        goto out;
+    }
+
+    ret = sys_read(pfd, buf, sizeof(buf) - 1);
+    if (ret <= 0) {
+        /* Nothing could be read from the pid-file, so there is no process
+         * to signal; this is reported as success, as before. */
+        ret = 0;
+        goto out;
+    }
+
+    buf[ret] = '\0';
+    pid = strtol(buf, NULL, 10);
+    if (pid <= 1) {
+        /* kill(-0) would signal glusterd's own process group and kill(-1)
+         * every process we may signal: never act on such a value. */
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_GSYNCD_ERROR,
+               "Pause/Resume Failed: invalid pid (%ld) read from "
+               "pid-file %s",
+               pid, pidfile);
+        ret = -1;
+        goto out;
+    }
+
+    if (is_pause) {
+        ret = kill(-pid, SIGSTOP);
+        if (ret) {
+            gf_msg(this->name, GF_LOG_ERROR, errno, GD_MSG_PID_KILL_FAIL,
+                   "Failed"
+                   " to pause gsyncd. Error: %s",
+                   strerror(errno));
+            goto out;
+        }
+        /*On pause force, if status is already paused
+          do not update status again*/
+        if (strstr(monitor_status, "Paused"))
+            goto out;
+
+        ret = glusterd_create_status_file(master, slave, slave_host,
+                                          slave_vol, "Paused");
+        if (ret) {
+            gf_msg(this->name, GF_LOG_ERROR, 0,
+                   GD_MSG_UPDATE_STATEFILE_FAILED,
+                   "Unable  to update state_file."
+                   " Error : %s",
+                   strerror(errno));
+            /* If status cannot be updated resume back */
+            if (kill(-pid, SIGCONT)) {
+                snprintf(errmsg, sizeof(errmsg),
+                         "Pause successful but could "
+                         "not update status file. "
+                         "Please use 'resume force' to"
+                         " resume back and retry pause"
+                         " to reflect in status");
+                gf_msg(this->name, GF_LOG_ERROR, errno,
+                       GD_MSG_PID_KILL_FAIL,
+                       "Resume back Failed. Error:"
+                       "%s",
+                       strerror(errno));
+                *op_errstr = gf_strdup(errmsg);
+            }
+            goto out;
+        }
+    } else {
+        /* The status file is updated before the signal is sent; it is
+         * rolled back below if SIGCONT cannot be delivered. */
+        ret = glusterd_create_status_file(master, slave, slave_host,
+                                          slave_vol, "Started");
+        if (ret) {
+            gf_msg(this->name, GF_LOG_ERROR, 0,
+                   GD_MSG_UPDATE_STATEFILE_FAILED,
+                   "Resume Failed: Unable to update "
+                   "state_file. Error : %s",
+                   strerror(errno));
+            goto out;
+        }
+        ret = kill(-pid, SIGCONT);
+        if (ret) {
+            gf_msg(this->name, GF_LOG_ERROR, errno, GD_MSG_PID_KILL_FAIL,
+                   "Resumed Failed: Unable to send"
+                   " SIGCONT. Error: %s",
+                   strerror(errno));
+            /* Process can't be resumed, update status
+             * back to paused. */
+            rollback_ret = glusterd_create_status_file(
+                master, slave, slave_host, slave_vol, monitor_status);
+            if (rollback_ret) {
+                snprintf(errmsg, sizeof(errmsg),
+                         "Resume failed!!! Status "
+                         "inconsistent. Please use "
+                         "'resume force' to resume and"
+                         " reach consistent state");
+                gf_msg(this->name, GF_LOG_ERROR, 0,
+                       GD_MSG_STATUS_UPDATE_FAILED,
+                       "Updating status back to paused"
+                       " Failed. Error: %s",
+                       strerror(errno));
+                *op_errstr = gf_strdup(errmsg);
+            }
+            /* The resume itself failed, whatever the rollback outcome. */
+            ret = -1;
+            goto out;
+        }
+    }
+    ret = 0;
+
+out:
+    /* pfd is negative when the pid-file could not be opened */
+    if (pfd >= 0)
+        sys_close(pfd);
+    /* coverity[INTEGER_OVERFLOW] */
+    return ret;
+}
+
+/*
+ * stop_gsync:
+ *      Stop the gsyncd process group whose pid is recorded in the session's
+ *      pid-file: send SIGTERM, poll gsync_status_byfd() up to 20 times,
+ *      then send SIGKILL and unlink the pid-file.
+ *
+ *      master, slave : identify the session.
+ *      msg           : not used by this function.
+ *      conf_path     : gsyncd config file used to locate the pid-file.
+ *      op_errstr     : receives an allocated message when the pid-file
+ *                      entry is missing from the config.
+ *      is_force      : when set, continue even if gsyncd is reported as not
+ *                      running or SIGTERM cannot be delivered.
+ *
+ * RETURN VALUE:
+ *      0 : stopped, or nothing to stop.
+ *  non-0 : failure.
+ */
+static int
+stop_gsync(char *master, char *slave, char **msg, char *conf_path,
+           char **op_errstr, gf_boolean_t is_force)
+{
+    int32_t ret = 0;
+    int pfd = -1;
+    long pid = 0;
+    char pidfile[PATH_MAX] = {
+        0,
+    };
+    char errmsg[PATH_MAX] = "";
+    char buf[4096] = {
+        0,
+    };
+    int i = 0;
+    gf_boolean_t is_template_in_use = _gf_false;
+    xlator_t *this = NULL;
+
+    this = THIS;
+    GF_ASSERT(this);
+
+    GF_ASSERT(this->private);
+
+    pfd = gsyncd_getpidfile(master, slave, pidfile, conf_path,
+                            &is_template_in_use);
+    if (pfd == -2) {
+        snprintf(errmsg, sizeof(errmsg),
+                 "pid-file entry mising in config file and "
+                 "template config file.");
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_PIDFILE_NOT_FOUND, "%s",
+               errmsg);
+        *op_errstr = gf_strdup(errmsg);
+        ret = -1;
+        goto out;
+    }
+    if (gsync_status_byfd(pfd) == -1 && !is_force) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_GSYNCD_ERROR,
+               "gsyncd b/w %s & %s is not running", master, slave);
+        /* monitor gsyncd already dead */
+        goto out;
+    }
+
+    if (pfd < 0)
+        goto out;
+
+    ret = sys_read(pfd, buf, sizeof(buf) - 1);
+    if (ret > 0) {
+        buf[ret] = '\0';
+        pid = strtol(buf, NULL, 10);
+        if (pid > 1) {
+            ret = kill(-pid, SIGTERM);
+            if (ret && !is_force) {
+                gf_msg(this->name, GF_LOG_WARNING, errno,
+                       GD_MSG_PID_KILL_FAIL, "failed to kill gsyncd");
+                goto out;
+            }
+            for (i = 0; i < 20; i++) {
+                if (gsync_status_byfd(pfd) == -1) {
+                    /* monitor gsyncd is dead but worker may
+                     * still be alive, give some more time
+                     * before SIGKILL (hack)
+                     */
+                    gf_nanosleep(50000 * GF_US_IN_NS);
+                    break;
+                }
+                gf_nanosleep(50000 * GF_US_IN_NS);
+            }
+            kill(-pid, SIGKILL);
+        } else {
+            /* kill(-0) would signal glusterd's own process group and
+             * kill(-1) every process we may signal: never act on such a
+             * value read from a damaged pid-file. */
+            gf_msg(this->name, GF_LOG_WARNING, 0, GD_MSG_GSYNCD_ERROR,
+                   "invalid pid (%ld) read from pid-file %s, "
+                   "not signalling",
+                   pid, pidfile);
+            if (!is_force) {
+                ret = -1;
+                goto out;
+            }
+        }
+        sys_unlink(pidfile);
+    }
+    ret = 0;
+
+out:
+    /* pfd is negative when the pid-file could not be opened */
+    if (pfd >= 0)
+        sys_close(pfd);
+    /* coverity[INTEGER_OVERFLOW] */
+    return ret;
+}
+
+/*
+ * Map one of the boolean spellings used in gsyncd.conf to 1 or 0.
+ * Returns -1 when str is not one of "true"/"1"/"yes"/"false"/"0"/"no".
+ */
+static int
+gd_gsync_bool_from_str(const char *str)
+{
+    if (!strcmp(str, "true") || !strcmp(str, "1") || !strcmp(str, "yes"))
+        return 1;
+    if (!strcmp(str, "false") || !strcmp(str, "0") || !strcmp(str, "no"))
+        return 0;
+    return -1;
+}
+
+/*
+ * glusterd_gsync_op_already_set:
+ *      This function checks whether the op_value is the same as in the
+ *      gsyncd.conf file.
+ *
+ *      When both the configured value and op_value are boolean spellings
+ *      they are compared as booleans (so "yes" matches "1"); otherwise
+ *      they are compared as strings. A NULL op_value is treated as
+ *      boolean false when the configured value is a boolean.
+ *
+ * RETURN VALUE:
+ *      0 : op_value matches the conf file.
+ *      1 : op_value does not match the conf file or op_param not
+ *          found in conf file.
+ *     -1 : error
+ */
+
+int
+glusterd_gsync_op_already_set(char *master, char *slave, char *conf_path,
+                              char *op_name, char *op_value)
+{
+    dict_t *confd = NULL;
+    char *op_val_buf = NULL;
+    int32_t op_val_conf = 0;
+    int32_t op_val_cli = 0;
+    int32_t ret = -1;
+    xlator_t *this = NULL;
+
+    this = THIS;
+    GF_ASSERT(this);
+
+    confd = dict_new();
+    if (!confd) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_DICT_CREATE_FAIL,
+               "Not able to create dict.");
+        return -1;
+    }
+
+    ret = glusterd_gsync_get_config(master, slave, conf_path, confd);
+    if (ret) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_GET_CONFIG_INFO_FAILED,
+               "Unable to get configuration data for %s(master), "
+               "%s(slave)",
+               master, slave);
+        goto out;
+    }
+
+    ret = dict_get_param(confd, op_name, &op_val_buf);
+    if (ret) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_DICT_GET_FAILED,
+               "Unable to get op_value for %s(master), %s(slave). "
+               "Please check gsync config file.",
+               master, slave);
+        ret = 1;
+        goto out;
+    }
+
+    gf_msg_debug(this->name, 0, "val_cli:%s  val_conf:%s",
+                 op_value ? op_value : "(null)", op_val_buf);
+
+    op_val_conf = gd_gsync_bool_from_str(op_val_buf);
+    if (op_val_conf >= 0) {
+        op_val_cli = op_value ? gd_gsync_bool_from_str(op_value) : 0;
+        if (op_val_cli >= 0) {
+            ret = (op_val_cli == op_val_conf) ? 0 : 1;
+            goto out;
+        }
+        /* The configured value merely looks boolean (e.g. a numeric option
+         * currently set to "0") while the requested value is not a boolean
+         * (e.g. "5"). Coercing the request to false would wrongly report
+         * it as already set, so fall through to the string comparison. */
+    }
+
+    ret = (op_value && !strcmp(op_val_buf, op_value)) ? 0 : 1;
+
+out:
+    dict_unref(confd);
+    return ret;
+}
+
+/*
+ * glusterd_gsync_configure:
+ *      Apply a geo-replication "config" sub-operation by running gsyncd
+ *      with --config-<subop>, then restart the session unless the option
+ *      is listed in gsync_no_restart_opts.
+ *
+ *      volinfo   : master volume, may be NULL (no master is then passed to
+ *                  gsyncd and no restart is attempted).
+ *      slave     : slave of the session.
+ *      path_list : forwarded to glusterd_check_restart_gsync_session().
+ *      dict      : op dictionary; reads "subop", "op_name", "op_value"
+ *                  (for *set sub-ops), "conf_path" and, when a new
+ *                  state_file has to be created, "slave_host" and
+ *                  "slave_vol".
+ *      resp_dict : forwarded to glusterd_check_restart_gsync_session().
+ *      op_errstr : receives an allocated status/error message.
+ *
+ *      "get" and "get-all" are deferred to the cli and return 0 at once.
+ *
+ * RETURN VALUE:
+ *      0 : success (including "value already set").
+ *  non-0 : failure.
+ */
+static int
+glusterd_gsync_configure(glusterd_volinfo_t *volinfo, char *slave,
+                         char *path_list, dict_t *dict, dict_t *resp_dict,
+                         char **op_errstr)
+{
+    int32_t ret = -1;
+    char *op_name = NULL;
+    char *op_value = NULL;
+    runner_t runner = {
+        0,
+    };
+    glusterd_conf_t *priv = NULL;
+    char *subop = NULL;
+    char *master = NULL;
+    char *conf_path = NULL;
+    char *slave_host = NULL;
+    char *slave_vol = NULL;
+    struct stat stbuf = {
+        0,
+    };
+    gf_boolean_t restart_required = _gf_true;
+    char **resopt = NULL;
+    gf_boolean_t op_already_set = _gf_false;
+    xlator_t *this = NULL;
+
+    this = THIS;
+    GF_ASSERT(this);
+
+    GF_ASSERT(slave);
+    GF_ASSERT(op_errstr);
+    GF_ASSERT(dict);
+    GF_ASSERT(resp_dict);
+
+    ret = dict_get_str(dict, "subop", &subop);
+    if (ret != 0)
+        goto out;
+
+    if (strcmp(subop, "get") == 0 || strcmp(subop, "get-all") == 0) {
+        /* deferred to cli */
+        gf_msg_debug(this->name, 0, "Returning 0");
+        return 0;
+    }
+
+    ret = dict_get_str(dict, "op_name", &op_name);
+    if (ret != 0)
+        goto out;
+
+    if (strtail(subop, "set")) {
+        ret = dict_get_str(dict, "op_value", &op_value);
+        if (ret != 0)
+            goto out;
+    }
+
+    priv = THIS->private;
+    if (priv == NULL) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_GLUSTERD_PRIV_NOT_FOUND,
+               "priv of glusterd not present");
+        *op_errstr = gf_strdup("glusterd defunct");
+        /* ret is 0 from the last successful lookup; without this the
+         * failure would be reported as success and the restart logic
+         * below would run with no conf_path. */
+        ret = -1;
+        goto out;
+    }
+
+    ret = dict_get_str(dict, "conf_path", &conf_path);
+    if (ret) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_DICT_GET_FAILED,
+               "Unable to fetch conf file path.");
+        goto out;
+    }
+
+    master = volinfo ? volinfo->volname : "";
+
+    /* Check for a no-op before the runner is initialised, so that the
+     * early exits below do not leave an initialised runner behind. */
+    if (strcmp(op_name, "checkpoint") != 0 && strtail(subop, "set")) {
+        ret = glusterd_gsync_op_already_set(master, slave, conf_path, op_name,
+                                            op_value);
+        if (ret == -1) {
+            gf_msg(this->name, GF_LOG_WARNING, 0, GD_MSG_GSYNCD_OP_SET_FAILED,
+                   "glusterd_gsync_op_already_set failed.");
+            gf_asprintf(op_errstr,
+                        GEOREP
+                        " config-%s failed for "
+                        "%s %s",
+                        subop, master, slave);
+            goto out;
+        }
+        if (ret == 0) {
+            gf_msg_debug(this->name, 0, "op_value is already set");
+            op_already_set = _gf_true;
+            goto out;
+        }
+    }
+
+    runinit(&runner);
+    runner_add_args(&runner, GSYNCD_PREFIX "/gsyncd", "-c", NULL);
+    runner_argprintf(&runner, "%s", conf_path);
+    runner_argprintf(&runner, "--iprefix=%s", DATADIR);
+    if (volinfo)
+        runner_argprintf(&runner, ":%s", master);
+    runner_add_arg(&runner, slave);
+    runner_argprintf(&runner, "--config-%s", subop);
+    runner_add_arg(&runner, op_name);
+    if (op_value) {
+        runner_argprintf(&runner, "--value=%s", op_value);
+    }
+
+    /* big_lock is released for the duration of the gsyncd run */
+    synclock_unlock(&priv->big_lock);
+    ret = runner_run(&runner);
+    synclock_lock(&priv->big_lock);
+    if (ret) {
+        gf_msg(this->name, GF_LOG_WARNING, 0, GD_MSG_GSYNCD_ERROR,
+               "gsyncd failed to %s %s option for "
+               "%s %s peers",
+               subop, op_name, master, slave);
+
+        gf_asprintf(op_errstr, GEOREP " config-%s failed for %s %s", subop,
+                    master, slave);
+
+        goto out;
+    }
+
+    if ((!strcmp(op_name, "state_file")) && (op_value)) {
+        ret = sys_lstat(op_value, &stbuf);
+        if (ret) {
+            /* the new state file does not exist yet: create it */
+            ret = dict_get_str(dict, "slave_host", &slave_host);
+            if (ret) {
+                gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_DICT_GET_FAILED,
+                       "Unable to fetch slave host.");
+                goto out;
+            }
+
+            ret = dict_get_str(dict, "slave_vol", &slave_vol);
+            if (ret) {
+                gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_DICT_GET_FAILED,
+                       "Unable to fetch slave volume name.");
+                goto out;
+            }
+
+            /* master equals volinfo->volname when volinfo is set and is
+             * "" otherwise, which avoids a NULL dereference here. */
+            ret = glusterd_create_status_file(master, slave, slave_host,
+                                              slave_vol,
+                                              "Switching Status "
+                                              "File");
+            if (ret || sys_lstat(op_value, &stbuf)) {
+                gf_msg(this->name, GF_LOG_ERROR, errno, GD_MSG_FILE_OP_FAILED,
+                       "Unable to "
+                       "create %s. Error : %s",
+                       op_value, strerror(errno));
+                ret = -1;
+                goto out;
+            }
+        }
+    }
+
+    ret = 0;
+    gf_asprintf(op_errstr, "config-%s successful", subop);
+
+out:
+    if (!ret && volinfo && !op_already_set) {
+        /* options listed in gsync_no_restart_opts take effect without
+         * restarting the session */
+        for (resopt = gsync_no_restart_opts; *resopt; resopt++) {
+            if (!strcmp((*resopt), op_name)) {
+                restart_required = _gf_false;
+                break;
+            }
+        }
+
+        if (restart_required) {
+            ret = glusterd_check_restart_gsync_session(
+                volinfo, slave, resp_dict, path_list, conf_path, 0);
+            if (ret)
+                *op_errstr = gf_strdup("internal error");
+        }
+    }
+
+    gf_msg_debug(this->name, 0, "Returning %d", ret);
+    return ret;
+}
+
+/*
+ * glusterd_gsync_read_frm_status:
+ *      Read the gsyncd status file at path into buf (capacity blen) as a
+ *      NUL-terminated string with trailing whitespace removed.
+ *
+ * RETURN VALUE:
+ *     >0 : number of bytes read; buf holds a non-empty status string.
+ *      0 : the status file is empty.
+ *     -1 : the file could not be opened or read, blen is too small, the
+ *          content starts with a NUL byte, fills the whole buffer, or
+ *          consists of whitespace only.
+ */
+int
+glusterd_gsync_read_frm_status(char *path, char *buf, size_t blen)
+{
+    int ret = 0;
+    int status_fd = -1;
+    size_t len = 0;
+    xlator_t *this = NULL;
+
+    this = THIS;
+    GF_ASSERT(this);
+
+    GF_ASSERT(path);
+    GF_ASSERT(buf);
+
+    /* Room is needed for one byte of data plus the terminating NUL;
+     * blen == 0 would also make "blen - 1" wrap around. */
+    if (blen < 2)
+        return -1;
+
+    status_fd = open(path, O_RDONLY);
+    if (status_fd == -1) {
+        gf_msg(this->name, GF_LOG_ERROR, errno, GD_MSG_FILE_OP_FAILED,
+               "Unable to read gsyncd status file %s", path);
+        return -1;
+    }
+    ret = sys_read(status_fd, buf, blen - 1);
+    if (ret > 0) {
+        /* ret <= blen - 1, so this index is in bounds; do not rely on the
+         * caller having zero-filled the buffer. */
+        buf[ret] = '\0';
+        len = strnlen(buf, ret);
+        /* Reject content whose first byte is NUL or that fills the whole
+         * buffer. */
+        if (len == 0 || len == blen - 1) {
+            ret = -1;
+        } else {
+            /* Strip trailing whitespace without stepping before buf. */
+            while (len > 0 && isspace((unsigned char)buf[len - 1]))
+                buf[--len] = '\0';
+            /* Only whitespace: no usable status string. */
+            if (len == 0)
+                ret = -1;
+        }
+    } else if (ret == 0)
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_GSYNCD_ERROR,
+               "Status file of gsyncd is empty");
+    else /* ret < 0 */
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_GSYNCD_ERROR,
+               "Status file of gsyncd is corrupt");
+
+    sys_close(status_fd);
+    return ret;
+}
+
+/*
+ * dict_get_param:
+ *      Look up key in dict. If the key is absent, retry once with a copy of
+ *      the key in which every '-' and '_' from the first such separator
+ *      onwards is replaced by the opposite of that first separator (so
+ *      "state_file" is retried as "state-file").
+ *
+ * RETURN VALUE:
+ *      0 : found, *param set by dict_get_str().
+ *  non-0 : the key copy could not be allocated, the key has no separator,
+ *          or neither spelling is present.
+ */
+static int
+dict_get_param(dict_t *dict, char *key, char **param)
+{
+    char *alt_key = NULL;
+    char *first_sep = NULL;
+    char *cur = NULL;
+    char repl = '\0';
+    int ret = -1;
+
+    if (dict_get_str(dict, key, param) == 0)
+        return 0;
+
+    alt_key = gf_strdup(key);
+    if (!alt_key)
+        return -1;
+
+    first_sep = strpbrk(alt_key, "-_");
+    if (first_sep) {
+        /* the first separator decides which way the rest are flipped */
+        repl = (*first_sep == '-') ? '_' : '-';
+        for (cur = first_sep; *cur != '\0'; cur++) {
+            if (*cur == '-' || *cur == '_')
+                *cur = repl;
+        }
+        ret = dict_get_str(dict, alt_key, param);
+    }
+
+    GF_FREE(alt_key);
+    return ret;
+}
+
+/*
+ * glusterd_fetch_values_from_config:
+ *      Load the gsyncd configuration of the master/slave session from
+ *      confpath into confd, then look up the requested entries in it.
+ *
+ *      statefile, georep_session_wrkng_dir and socketfile are optional
+ *      output parameters: a NULL pointer means that entry is not looked
+ *      up. The keys used are "state_file", "georep_session_working_dir"
+ *      and "state_socket_unencoded".
+ *
+ * RETURN VALUE:
+ *      0 : the config was read and every requested entry was found.
+ *  non-0 : reading the config failed or a requested entry is missing.
+ */
+int
+glusterd_fetch_values_from_config(char *master, char *slave, char *confpath,
+                                  dict_t *confd, char **statefile,
+                                  char **georep_session_wrkng_dir,
+                                  char **socketfile)
+{
+    xlator_t *this = THIS;
+    int ret = 0;
+
+    GF_ASSERT(this);
+
+    /* The lookups run in order and stop at the first failure. */
+    ret = glusterd_gsync_get_config(master, slave, confpath, confd);
+    if (ret != 0) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_GET_CONFIG_INFO_FAILED,
+               "Unable to get configuration data for %s(master), "
+               "%s(slave)",
+               master, slave);
+    } else if (statefile != NULL &&
+               (ret = dict_get_param(confd, "state_file", statefile)) != 0) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_DICT_GET_FAILED,
+               "Unable to get state_file's name "
+               "for %s(master), %s(slave). "
+               "Please check gsync config file.",
+               master, slave);
+    } else if (georep_session_wrkng_dir != NULL &&
+               (ret = dict_get_param(confd, "georep_session_working_dir",
+                                     georep_session_wrkng_dir)) != 0) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_DICT_GET_FAILED,
+               "Unable to get geo-rep session's "
+               "working directory name for %s(master), "
+               "%s(slave). Please check gsync config file.",
+               master, slave);
+    } else if (socketfile != NULL &&
+               (ret = dict_get_param(confd, "state_socket_unencoded",
+                                     socketfile)) != 0) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_DICT_GET_FAILED,
+               "Unable to get socket file's name "
+               "for %s(master), %s(slave). "
+               "Please check gsync config file.",
+               master, slave);
+    } else {
+        ret = 0;
+    }
+
+    gf_msg_debug(this->name, 0, "Returning %d", ret);
+    return ret;
+}
+
+/*
+ * glusterd_read_status_file:
+ *      For every brick of volinfo that belongs to this node, build a
+ *      gf_gsync_status_t describing the geo-replication session with slave
+ *      and store it in dict under "status_value<N>"; "gsync-count" in dict
+ *      is read as the starting N and written back on completion.
+ *
+ *      volinfo   : master volume.
+ *      slave     : slave of the session.
+ *      conf_path : session config file; when it is missing or unusable the
+ *                  template config under priv->workdir is used instead and
+ *                  the worker status is reported as "Config Corrupted".
+ *      dict      : receives the status entries.
+ *      node      : not used by this function.
+ *
+ * RETURN VALUE:
+ *     -1 : the config dict could not be created or the template path does
+ *          not fit its buffer.
+ *      0 : otherwise.
+ *          NOTE(review): failures after that point stop the collection but
+ *          are not propagated to the caller; confirm this is relied upon.
+ */
+int
+glusterd_read_status_file(glusterd_volinfo_t *volinfo, char *slave,
+                          char *conf_path, dict_t *dict, char *node)
+{
+    char temp_conf_path[PATH_MAX] = "";
+    char *working_conf_path = NULL;
+    char *georep_session_wrkng_dir = NULL;
+    char *master = NULL;
+    char sts_val_name[1024] = "";
+    char monitor_status[NAME_MAX] = "";
+    char *statefile = NULL;
+    char *socketfile = NULL;
+    dict_t *confd = NULL;
+    char *slavekey = NULL;
+    char *slaveentry = NULL;
+    char *slaveuser = NULL;
+    char *saveptr = NULL;
+    char *temp = NULL;
+    char *temp_inp = NULL;
+    int gsync_count = 0;
+    int ret = 0;
+    glusterd_brickinfo_t *brickinfo = NULL;
+    gf_gsync_status_t *sts_val = NULL;
+    gf_boolean_t is_template_in_use = _gf_false;
+    glusterd_conf_t *priv = NULL;
+    struct stat stbuf = {
+        0,
+    };
+    xlator_t *this = NULL;
+    int32_t len = 0;
+
+    this = THIS;
+    GF_ASSERT(this);
+
+    GF_ASSERT(this->private);
+    GF_ASSERT(volinfo);
+    GF_ASSERT(conf_path);
+
+    master = volinfo->volname;
+
+    confd = dict_new();
+    if (!confd) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_DICT_CREATE_FAIL,
+               "Not able to create dict.");
+        return -1;
+    }
+
+    priv = THIS->private;
+
+    len = snprintf(temp_conf_path, sizeof(temp_conf_path),
+                   "%s/" GSYNC_CONF_TEMPLATE, priv->workdir);
+    if ((len < 0) || (len >= sizeof(temp_conf_path))) {
+        /* confd was created above and must not be leaked */
+        dict_unref(confd);
+        return -1;
+    }
+
+    ret = sys_lstat(conf_path, &stbuf);
+    if (!ret) {
+        gf_msg(this->name, GF_LOG_INFO, 0, GD_MSG_CONFIG_INFO,
+               "Using passed config template(%s).", conf_path);
+        working_conf_path = conf_path;
+    } else {
+        gf_msg(this->name, GF_LOG_WARNING, ENOENT, GD_MSG_FILE_OP_FAILED,
+               "Config file (%s) missing. Looking for template "
+               "config file (%s)",
+               conf_path, temp_conf_path);
+        ret = sys_lstat(temp_conf_path, &stbuf);
+        if (ret) {
+            gf_msg(this->name, GF_LOG_ERROR, ENOENT, GD_MSG_FILE_OP_FAILED,
+                   "Template "
+                   "config file (%s) missing.",
+                   temp_conf_path);
+            goto out;
+        }
+        gf_msg(this->name, GF_LOG_INFO, 0, GD_MSG_DEFAULT_TEMP_CONFIG,
+               "Using default config template(%s).", temp_conf_path);
+        working_conf_path = temp_conf_path;
+        is_template_in_use = _gf_true;
+    }
+
+fetch_data:
+    ret = glusterd_fetch_values_from_config(
+        master, slave, working_conf_path, confd, &statefile,
+        &georep_session_wrkng_dir, &socketfile);
+    if (ret) {
+        if (is_template_in_use == _gf_false) {
+            /* retry exactly once with the template config */
+            gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_FETCH_CONFIG_VAL_FAILED,
+                   "Unable to fetch config values "
+                   "for %s(master), %s(slave). "
+                   "Trying default config template",
+                   master, slave);
+            working_conf_path = temp_conf_path;
+            is_template_in_use = _gf_true;
+            goto fetch_data;
+        } else {
+            gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_FETCH_CONFIG_VAL_FAILED,
+                   "Unable to "
+                   "fetch config values for %s(master), "
+                   "%s(slave)",
+                   master, slave);
+            goto out;
+        }
+    }
+
+    ret = glusterd_gsync_read_frm_status(statefile, monitor_status,
+                                         sizeof(monitor_status));
+    if (ret <= 0) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_STAT_FILE_READ_FAILED,
+               "Unable to read the status file for %s(master), "
+               "%s(slave) statefile: %s",
+               master, slave, statefile);
+        snprintf(monitor_status, sizeof(monitor_status), "defunct");
+    }
+
+    ret = dict_get_int32(dict, "gsync-count", &gsync_count);
+    if (ret)
+        gsync_count = 0;
+
+    cds_list_for_each_entry(brickinfo, &volinfo->bricks, brick_list)
+    {
+        /* only bricks hosted on this node are reported */
+        if (gf_uuid_compare(brickinfo->uuid, MY_UUID))
+            continue;
+
+        /* sts_val is owned by this function until dict_set_bin() succeeds;
+         * every error path below relies on the free at "out". */
+        sts_val = GF_CALLOC(1, sizeof(gf_gsync_status_t),
+                            gf_common_mt_gsync_status_t);
+        if (!sts_val) {
+            gf_msg(this->name, GF_LOG_ERROR, ENOMEM, GD_MSG_NO_MEMORY,
+                   "Out Of Memory");
+            goto out;
+        }
+
+        /* Slave Key */
+        ret = glusterd_get_slave(volinfo, slave, &slavekey);
+        if (ret < 0)
+            goto out;
+
+        /* All copies below are bounded by the destination field size so an
+         * over-long name is truncated instead of overflowing the struct. */
+        snprintf(sts_val->slavekey, sizeof(sts_val->slavekey), "%s",
+                 slavekey);
+
+        /* Master Volume */
+        snprintf(sts_val->master, sizeof(sts_val->master), "%s", master);
+
+        /* Master Brick Node */
+        snprintf(sts_val->node, sizeof(sts_val->node), "%s",
+                 brickinfo->hostname);
+
+        /* Master Brick Path */
+        snprintf(sts_val->brick, sizeof(sts_val->brick), "%s",
+                 brickinfo->path);
+
+        /* Brick Host UUID */
+        snprintf(sts_val->brick_host_uuid, sizeof(sts_val->brick_host_uuid),
+                 "%s", uuid_utoa(brickinfo->uuid));
+
+        /* Slave */
+        snprintf(sts_val->slave, sizeof(sts_val->slave), "%s", slave);
+
+        snprintf(sts_val->slave_node, sizeof(sts_val->slave_node), "N/A");
+
+        snprintf(sts_val->worker_status, sizeof(sts_val->worker_status), "N/A");
+
+        snprintf(sts_val->crawl_status, sizeof(sts_val->crawl_status), "N/A");
+
+        snprintf(sts_val->last_synced, sizeof(sts_val->last_synced), "N/A");
+
+        snprintf(sts_val->last_synced_utc, sizeof(sts_val->last_synced_utc),
+                 "N/A");
+
+        snprintf(sts_val->entry, sizeof(sts_val->entry), "N/A");
+
+        snprintf(sts_val->data, sizeof(sts_val->data), "N/A");
+
+        snprintf(sts_val->meta, sizeof(sts_val->meta), "N/A");
+
+        snprintf(sts_val->failures, sizeof(sts_val->failures), "N/A");
+
+        snprintf(sts_val->checkpoint_time, sizeof(sts_val->checkpoint_time),
+                 "N/A");
+
+        snprintf(sts_val->checkpoint_time_utc,
+                 sizeof(sts_val->checkpoint_time_utc), "N/A");
+
+        snprintf(sts_val->checkpoint_completed,
+                 sizeof(sts_val->checkpoint_completed), "N/A");
+
+        snprintf(sts_val->checkpoint_completion_time,
+                 sizeof(sts_val->checkpoint_completion_time), "N/A");
+
+        snprintf(sts_val->checkpoint_completion_time_utc,
+                 sizeof(sts_val->checkpoint_completion_time_utc), "N/A");
+
+        /* Get all the other values from Gsyncd */
+        ret = glusterd_gsync_get_status(master, slave, conf_path,
+                                        brickinfo->path, sts_val);
+
+        if (ret) {
+            gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_GET_STATUS_DATA_FAIL,
+                   "Unable to get status data "
+                   "for %s(master), %s(slave), %s(brick)",
+                   master, slave, brickinfo->path);
+            ret = -1;
+            goto out;
+        }
+
+        if (is_template_in_use) {
+            snprintf(sts_val->worker_status, sizeof(sts_val->worker_status),
+                     "Config Corrupted");
+        }
+
+        ret = dict_get_str(volinfo->gsync_slaves, slavekey, &slaveentry);
+        if (ret < 0)
+            goto out;
+
+        snprintf(sts_val->session_slave, sizeof(sts_val->session_slave),
+                 "%s", slaveentry);
+
+        /* strtok_r() modifies its input, so work on a private copy */
+        temp_inp = gf_strdup(slaveentry);
+        if (!temp_inp)
+            goto out;
+
+        if (strstr(temp_inp, "@") == NULL) {
+            slaveuser = "root";
+        } else {
+            /* skip the first '/'-separated token, then take the part of
+             * the second token that precedes '@' as the user name */
+            temp = strtok_r(temp_inp, "//", &saveptr);
+            temp = strtok_r(NULL, "/", &saveptr);
+            slaveuser = temp ? strtok_r(temp, "@", &saveptr) : NULL;
+            if (!slaveuser) {
+                gf_msg(this->name, GF_LOG_ERROR, 0,
+                       GD_MSG_GET_STATUS_DATA_FAIL,
+                       "Unable to parse slave user from %s", slaveentry);
+                ret = -1;
+                goto out;
+            }
+        }
+        snprintf(sts_val->slave_user, sizeof(sts_val->slave_user), "%s",
+                 slaveuser);
+
+        /* slaveuser may point into temp_inp: release the copy only now,
+         * and once per brick so it does not leak across iterations */
+        GF_FREE(temp_inp);
+        temp_inp = NULL;
+
+        snprintf(sts_val_name, sizeof(sts_val_name), "status_value%d",
+                 gsync_count);
+        ret = dict_set_bin(dict, sts_val_name, sts_val,
+                           sizeof(gf_gsync_status_t));
+        if (ret)
+            goto out;
+
+        gsync_count++;
+        /* the dict owns the entry from here on */
+        sts_val = NULL;
+    }
+
+    ret = dict_set_int32(dict, "gsync-count", gsync_count);
+    if (ret)
+        goto out;
+
+out:
+    /* sts_val is non-NULL only when an entry was abandoned half-built */
+    GF_FREE(sts_val);
+    GF_FREE(temp_inp);
+    dict_unref(confd);
+
+    return 0;
+}
+
+/*
+ * Restart the geo-replication session between volinfo's volume and 'slave'
+ * when a gsyncd instance is found running locally.
+ *
+ * The running session is stopped (its status message, if any, is stored in
+ * resp_dict under "gsync-status"), the slave is dropped from
+ * volinfo->gsync_active_slaves, gsyncd is started again and, on success, the
+ * slave is re-added to gsync_active_slaves with the value "running".
+ *
+ * Returns 0 when nothing had to be done or the restart succeeded, otherwise
+ * the non-zero result of the step that failed.
+ */
+int
+glusterd_check_restart_gsync_session(glusterd_volinfo_t *volinfo, char *slave,
+                                     dict_t *resp_dict, char *path_list,
+                                     char *conf_path, gf_boolean_t is_force)
+{
+    xlator_t *this = NULL;
+    glusterd_conf_t *priv = NULL;
+    char *session_key = NULL;
+    char *stop_status = NULL;
+    char *stop_errstr = NULL;
+    gf_boolean_t running = _gf_false;
+    int ret = 0;
+
+    GF_ASSERT(volinfo);
+    GF_ASSERT(slave);
+    this = THIS;
+    GF_ASSERT(this);
+    priv = this->private;
+    GF_ASSERT(priv);
+
+    /* The slave string doubles as the key in gsync_active_slaves. */
+    session_key = slave;
+
+    ret = glusterd_check_gsync_running_local(volinfo->volname, slave, conf_path,
+                                             &running);
+    if ((ret == 0) && (running != _gf_true)) {
+        /* gsyncd not running, nothing to do */
+        goto out;
+    }
+
+    ret = stop_gsync(volinfo->volname, slave, &stop_status, conf_path,
+                     &stop_errstr, is_force);
+    if (ret != 0)
+        goto out;
+
+    if (stop_status) {
+        ret = dict_set_str(resp_dict, "gsync-status", stop_status);
+        if (ret != 0)
+            goto out;
+    }
+
+    /* Session is down now; forget it until the restart succeeds. */
+    dict_del(volinfo->gsync_active_slaves, session_key);
+
+    ret = glusterd_start_gsync(volinfo, slave, path_list, conf_path,
+                               uuid_utoa(MY_UUID), NULL, _gf_false);
+    if (ret != 0)
+        goto out;
+
+    /* Add slave to the dict indicating geo-rep session is running. */
+    ret = dict_set_dynstr_with_alloc(volinfo->gsync_active_slaves, session_key,
+                                     "running");
+    if (ret != 0) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_DICT_SET_FAILED,
+               "Unable to set key:%s value:running in dict. "
+               "But the config succeeded.",
+               session_key);
+    }
+
+out:
+    gf_msg_debug(this->name, 0, "Returning %d", ret);
+    if (stop_errstr)
+        GF_FREE(stop_errstr);
+    return ret;
+}
+
+/*
+ * Regenerate the volfiles for 'volinfo', persist the volinfo with an
+ * incremented version and, if the volume is started, run the service
+ * manager for it.
+ *
+ * Returns 0 on success, -1 if volfile creation fails, otherwise the error
+ * returned by the store or service-manager step.
+ */
+static int32_t
+glusterd_marker_changelog_create_volfile(glusterd_volinfo_t *volinfo)
+{
+    xlator_t *this = THIS;
+    int32_t rc = 0;
+
+    GF_ASSERT(this);
+
+    rc = glusterd_create_volfiles_and_notify_services(volinfo);
+    if (rc != 0) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_VOLFILE_CREATE_FAIL,
+               "Unable to create volfile for setting of marker "
+               "while '" GEOREP " start'");
+        return -1;
+    }
+
+    rc = glusterd_store_volinfo(volinfo, GLUSTERD_VOLINFO_VER_AC_INCREMENT);
+    if (rc != 0)
+        return rc;
+
+    /* Services only need a kick when the volume is actually running. */
+    if (volinfo->status != GLUSTERD_STATUS_STARTED)
+        return 0;
+
+    return glusterd_svcs_manager(volinfo);
+}
+
+/*
+ * Make sure the boolean volume option 'key' is enabled on 'volinfo'.
+ *
+ * If the option is currently off it is set to "on" and *vc is set to 1 so
+ * the caller knows the volfile needs regenerating. If it is already on,
+ * neither the option nor *vc is touched.
+ *
+ * Returns 0 on success, -1 if the current value cannot be read, or the
+ * result of glusterd_gsync_volinfo_dict_set() when the option is turned on.
+ */
+static int
+glusterd_set_gsync_knob(glusterd_volinfo_t *volinfo, char *key, int *vc)
+{
+    xlator_t *this = NULL;
+    int current = _gf_false;
+    int ret = -1;
+
+    this = THIS;
+    GF_ASSERT(this);
+
+    GF_ASSERT(this->private);
+
+    current = glusterd_volinfo_get_boolean(volinfo, key);
+    if (-1 == current) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_GET_KEY_FAILED,
+               "failed to get key %s from volinfo", key);
+        goto out;
+    }
+
+    if (current != _gf_false) {
+        /* Already enabled: nothing to change. */
+        ret = 0;
+        goto out;
+    }
+
+    *vc = 1;
+    ret = glusterd_gsync_volinfo_dict_set(volinfo, key, "on");
+
+out:
+    gf_msg_debug(this->name, 0, "Returning %d", ret);
+    return ret;
+}
+
+/*
+ * Enable the volume options geo-replication relies on (marker xtime, forced
+ * marker xtime and changelog) and regenerate the volfile if any of them had
+ * to be switched on.
+ *
+ * Returns 0 on success or the first non-zero result encountered.
+ */
+static int
+glusterd_set_gsync_confs(glusterd_volinfo_t *volinfo)
+{
+    char *knobs[] = {
+        VKEY_MARKER_XTIME,
+        /*
+         * enable ignore-pid-check blindly as it could be needed for
+         * cascading setups.
+         */
+        VKEY_MARKER_XTIME_FORCE,
+        VKEY_CHANGELOG,
+    };
+    size_t idx = 0;
+    int volfile_changed = 0;
+    int ret = -1;
+
+    for (idx = 0; idx < sizeof(knobs) / sizeof(knobs[0]); idx++) {
+        ret = glusterd_set_gsync_knob(volinfo, knobs[idx], &volfile_changed);
+        if (ret != 0)
+            return ret;
+    }
+
+    if (volfile_changed)
+        ret = glusterd_marker_changelog_create_volfile(volinfo);
+
+    return ret;
+}
+
+/*
+ * Collect geo-replication status for one master volume / slave pair into
+ * rsp_dict.
+ *
+ * When no uuid is registered for the slave the session is treated as
+ * inactive; in that case status is still read if the session's statefile
+ * exists on disk, and the function quietly returns 0 if the statefile name
+ * cannot be determined or the file is missing.
+ *
+ * Returns 0 in those "nothing to report" cases, otherwise the result of
+ * glusterd_read_status_file().
+ */
+static int
+glusterd_get_gsync_status_mst_slv(glusterd_volinfo_t *volinfo, char *slave,
+                                  char *conf_path, dict_t *rsp_dict, char *node)
+{
+    xlator_t *this = NULL;
+    char *statefile = NULL;
+    gf_boolean_t is_template_in_use = _gf_false;
+    struct stat stbuf = {
+        0,
+    };
+    uuid_t uuid = {
+        0,
+    };
+    int ret = 0;
+
+    this = THIS;
+    GF_ASSERT(this);
+
+    GF_ASSERT(volinfo);
+    GF_ASSERT(slave);
+    GF_ASSERT(this->private);
+
+    if (glusterd_gsync_get_uuid(slave, volinfo, uuid) != 0) {
+        gf_msg(this->name, GF_LOG_INFO, 0, GD_MSG_SESSION_INACTIVE,
+               "geo-replication status %s %s : session is not "
+               "active",
+               volinfo->volname, slave);
+
+        ret = glusterd_get_statefile_name(volinfo, slave, conf_path, &statefile,
+                                          &is_template_in_use);
+        if (ret != 0) {
+            if (strstr(slave, "::") != NULL)
+                gf_msg(this->name, GF_LOG_INFO, 0,
+                       GD_MSG_GET_STATEFILE_NAME_FAILED,
+                       "Unable to get statefile's name");
+            else
+                gf_msg(this->name, GF_LOG_INFO, 0, GD_MSG_SLAVE_URL_INVALID,
+                       "%s is not a valid slave url.", slave);
+            /* Not fatal for a status query. */
+            ret = 0;
+            goto out;
+        }
+
+        if (sys_lstat(statefile, &stbuf) != 0) {
+            gf_msg(this->name, GF_LOG_INFO, ENOENT, GD_MSG_FILE_OP_FAILED,
+                   "%s statefile not present.", statefile);
+            ret = 0;
+            goto out;
+        }
+    }
+
+    ret = glusterd_read_status_file(volinfo, slave, conf_path, rsp_dict, node);
+
+out:
+    if (statefile)
+        GF_FREE(statefile);
+
+    gf_msg_debug(this->name, 0, "Returning with %d", ret);
+    return ret;
+}
+
+/*
+ * Collect geo-replication status for every slave configured on 'volinfo' by
+ * running _get_status_mst_slv over volinfo->gsync_slaves.
+ *
+ * The result of the dictionary walk is not propagated; this always
+ * returns 0.
+ */
+int
+glusterd_get_gsync_status_mst(glusterd_volinfo_t *volinfo, dict_t *rsp_dict,
+                              char *node)
+{
+    /* Context handed to the per-slave callback; other members stay zeroed. */
+    glusterd_gsync_status_temp_t walk_ctx = {
+        .rsp_dict = rsp_dict,
+        .volinfo = volinfo,
+        .node = node,
+    };
+
+    GF_ASSERT(volinfo);
+
+    dict_foreach(volinfo->gsync_slaves, _get_status_mst_slv, &walk_ctx);
+
+    return 0;
+}
+
+/*
+ * Collect geo-replication status for every volume known to this glusterd
+ * into rsp_dict, stopping at the first volume that reports an error.
+ *
+ * Returns 0 on success or the first non-zero per-volume result.
+ */
+static int
+glusterd_get_gsync_status_all(dict_t *rsp_dict, char *node)
+{
+    xlator_t *this = NULL;
+    glusterd_conf_t *priv = NULL;
+    glusterd_volinfo_t *cur_vol = NULL;
+    int32_t ret = 0;
+
+    this = THIS;
+    GF_ASSERT(this);
+
+    priv = this->private;
+
+    GF_ASSERT(priv);
+
+    cds_list_for_each_entry(cur_vol, &priv->volumes, vol_list)
+    {
+        ret = glusterd_get_gsync_status_mst(cur_vol, rsp_dict, node);
+        if (ret != 0)
+            break;
+    }
+
+    gf_msg_debug(this->name, 0, "Returning with %d", ret);
+    return ret;
+}
+
+/*
+ * Entry point for the geo-replication "status" operation.
+ *
+ * The scope of the query depends on which keys are present in 'dict':
+ *   - no "master":            status of all sessions of all volumes;
+ *   - "master" only:          status of all sessions of that volume;
+ *   - "master" and "slave":   status of that single session ("conf_path"
+ *                             is then mandatory).
+ *
+ * Results are stored in rsp_dict, tagged with the local host name (or "N/A"
+ * if it cannot be determined). On an unknown volume *op_errstr receives an
+ * allocated message.
+ *
+ * Returns 0 on success, non-zero on failure.
+ */
+static int
+glusterd_get_gsync_status(dict_t *dict, char **op_errstr, dict_t *rsp_dict)
+{
+    char *slave = NULL;
+    char *volname = NULL;
+    char *conf_path = NULL;
+    char errmsg[PATH_MAX] = {
+        0,
+    };
+    glusterd_volinfo_t *volinfo = NULL;
+    int ret = 0;
+    char my_hostname[256] = {
+        0,
+    };
+    xlator_t *this = NULL;
+
+    this = THIS;
+    GF_ASSERT(this);
+
+    /*
+     * gethostname() may leave a truncated name without a terminating NUL.
+     * Handing it one byte less than the zero-initialised buffer guarantees
+     * the last byte stays '\0'.
+     */
+    ret = gethostname(my_hostname, sizeof(my_hostname) - 1);
+    if (ret) {
+        /* stick to N/A */
+        (void)strcpy(my_hostname, "N/A");
+    }
+
+    ret = dict_get_str(dict, "master", &volname);
+    if (ret < 0) {
+        /* No master given: report on every volume. */
+        ret = glusterd_get_gsync_status_all(rsp_dict, my_hostname);
+        goto out;
+    }
+
+    ret = glusterd_volinfo_find(volname, &volinfo);
+    if (ret) {
+        gf_msg(this->name, GF_LOG_WARNING, 0, GD_MSG_VOL_NOT_FOUND,
+               "volume name does not exist");
+        snprintf(errmsg, sizeof(errmsg),
+                 "Volume name %s does not"
+                 " exist",
+                 volname);
+        *op_errstr = gf_strdup(errmsg);
+        goto out;
+    }
+
+    ret = dict_get_str(dict, "slave", &slave);
+    if (ret < 0) {
+        /* No slave given: report on every session of this volume. */
+        ret = glusterd_get_gsync_status_mst(volinfo, rsp_dict, my_hostname);
+        goto out;
+    }
+
+    ret = dict_get_str(dict, "conf_path", &conf_path);
+    if (ret) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_DICT_GET_FAILED,
+               "Unable to fetch conf file path.");
+        goto out;
+    }
+
+    ret = glusterd_get_gsync_status_mst_slv(volinfo, slave, conf_path, rsp_dict,
+                                            my_hostname);
+
+out:
+    gf_msg_debug(this->name, 0, "Returning %d", ret);
+    return ret;
+}
+
+/*
+ * Delete a geo-replication session.
+ *
+ * Runs "gsyncd --delete" for the session (with --reset-sync-time when the
+ * "reset-sync-time" key in 'dict' is non-zero) and then removes the
+ * session's working directory
+ *     <workdir>/GEOREP/<master>_<slave_host>_<slave_vol>.
+ * The glusterd big lock is released while gsyncd runs.
+ *
+ * 'volinfo' may be NULL; in that case gsyncd is invoked without a master
+ * argument and the directory removal is skipped, because the directory name
+ * cannot be derived without the master volume name.
+ *
+ * On success *op_errstr is set to "delete successful"; on a gsyncd failure
+ * it receives an allocated error message.
+ *
+ * Returns 0 on success, non-zero on failure.
+ */
+static int
+glusterd_gsync_delete(glusterd_volinfo_t *volinfo, char *slave,
+                      char *slave_host, char *slave_vol, char *path_list,
+                      dict_t *dict, dict_t *resp_dict, char **op_errstr)
+{
+    int32_t ret = -1;
+    int32_t len = 0;
+    runner_t runner = {
+        0,
+    };
+    glusterd_conf_t *priv = NULL;
+    char *master = NULL;
+    char *gl_workdir = NULL;
+    char geo_rep_dir[PATH_MAX] = "";
+    char *conf_path = NULL;
+    xlator_t *this = NULL;
+    uint32_t reset_sync_time = _gf_false;
+
+    this = THIS;
+    GF_ASSERT(this);
+
+    GF_ASSERT(slave);
+    GF_ASSERT(slave_host);
+    GF_ASSERT(slave_vol);
+    GF_ASSERT(op_errstr);
+    GF_ASSERT(dict);
+    GF_ASSERT(resp_dict);
+
+    if (THIS)
+        priv = THIS->private;
+    if (priv == NULL) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_GLUSTERD_PRIV_NOT_FOUND,
+               "priv of glusterd not present");
+        *op_errstr = gf_strdup("glusterd defunct");
+        goto out;
+    }
+
+    ret = dict_get_str(dict, "conf_path", &conf_path);
+    if (ret) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_DICT_GET_FAILED,
+               "Unable to fetch conf file path.");
+        goto out;
+    }
+
+    gl_workdir = priv->workdir;
+    master = "";
+    runinit(&runner);
+    runner_add_args(&runner, GSYNCD_PREFIX "/gsyncd", "--delete", "-c", NULL);
+    runner_argprintf(&runner, "%s", conf_path);
+    runner_argprintf(&runner, "--iprefix=%s", DATADIR);
+
+    runner_argprintf(&runner, "--path-list=%s", path_list);
+
+    /* The key is optional; a failed lookup simply means "do not reset". */
+    ret = dict_get_uint32(dict, "reset-sync-time", &reset_sync_time);
+    if (!ret && reset_sync_time) {
+        runner_add_args(&runner, "--reset-sync-time", NULL);
+    }
+
+    if (volinfo) {
+        master = volinfo->volname;
+        runner_argprintf(&runner, ":%s", master);
+    }
+    runner_add_arg(&runner, slave);
+    runner_redir(&runner, STDOUT_FILENO, RUN_PIPE);
+    /* Do not hold the big lock while the external command runs. */
+    synclock_unlock(&priv->big_lock);
+    ret = runner_run(&runner);
+    synclock_lock(&priv->big_lock);
+    if (ret) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_SESSION_DEL_FAILED,
+               "gsyncd failed to delete session info for %s and "
+               "%s peers",
+               master, slave);
+
+        gf_asprintf(op_errstr,
+                    "gsyncd failed to "
+                    "delete session info for %s and %s peers",
+                    master, slave);
+
+        goto out;
+    }
+
+    /*
+     * The session directory is named after the master volume. Without a
+     * volinfo there is no name to build, so skip the removal instead of
+     * dereferencing a NULL volinfo.
+     */
+    if (volinfo) {
+        len = snprintf(geo_rep_dir, sizeof(geo_rep_dir),
+                       "%s/" GEOREP "/%s_%s_%s", gl_workdir, master,
+                       slave_host, slave_vol);
+        if ((len < 0) || (len >= (int32_t)sizeof(geo_rep_dir))) {
+            /* Truncated or failed: never act on a partial path. */
+            gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_DIR_OP_FAILED,
+                   "Unable to build Geo Rep Dir path for %s and %s", master,
+                   slave);
+            ret = -1;
+            goto out;
+        }
+
+        ret = sys_rmdir(geo_rep_dir);
+        if (ret) {
+            if (errno == ENOENT)
+                gf_msg_debug(this->name, 0, "Geo Rep Dir(%s) Not Present.",
+                             geo_rep_dir);
+            else {
+                gf_msg(this->name, GF_LOG_ERROR, errno, GD_MSG_DIR_OP_FAILED,
+                       "Unable to delete Geo Rep Dir(%s). Error: %s",
+                       geo_rep_dir, strerror(errno));
+                goto out;
+            }
+        }
+    }
+
+    ret = 0;
+
+    gf_asprintf(op_errstr, "delete successful");
+
+out:
+    gf_msg_debug(this->name, 0, "Returning %d", ret);
+    return ret;
+}
+
+/*
+ * Execute the helper GSYNCD_PREFIX "/peer_<command>" and return its output.
+ *
+ * Input keys read from 'dict':
+ *   "command"          - suffix of the helper to run (mandatory);
+ *   "cmd_args_count"   - number of arguments (optional);
+ *   "cmd_arg_<i>"      - the arguments, i = 1 .. cmd_args_count.
+ *
+ * Every line the helper prints on stdout is stored in rsp_dict as
+ * "output_<n>" (trailing newline stripped), and "output_count" holds the
+ * highest <n> stored so far. If rsp_dict already carries an
+ * "output_count", numbering continues from it.
+ *
+ * The glusterd big lock is dropped while the helper runs and is re-acquired
+ * on every exit path that follows the unlock.
+ *
+ * Returns 0 on success, non-zero on failure; *op_errstr receives an
+ * allocated message when the helper cannot be started or ended.
+ */
+int
+glusterd_op_sys_exec(dict_t *dict, char **op_errstr, dict_t *rsp_dict)
+{
+    char buf[PATH_MAX] = "";
+    char cmd_arg_name[PATH_MAX] = "";
+    char output_name[PATH_MAX] = "";
+    char errmsg[PATH_MAX] = "";
+    char *ptr = NULL;
+    char *bufp = NULL;
+    char *command = NULL;
+    char **cmd_args = NULL;
+    int ret = -1;
+    int i = -1;
+    int cmd_args_count = 0;
+    int output_count = 0;
+    size_t buf_len = 0;
+    glusterd_conf_t *priv = NULL;
+    runner_t runner = {
+        0,
+    };
+    xlator_t *this = NULL;
+
+    this = THIS;
+    GF_ASSERT(this);
+
+    GF_ASSERT(dict);
+    GF_ASSERT(op_errstr);
+    GF_ASSERT(rsp_dict);
+
+    if (THIS)
+        priv = THIS->private;
+    if (priv == NULL) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_GLUSTERD_PRIV_NOT_FOUND,
+               "priv of glusterd not present");
+        *op_errstr = gf_strdup("glusterd defunct");
+        goto out;
+    }
+
+    ret = dict_get_str(dict, "command", &command);
+    if (ret) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_DICT_GET_FAILED,
+               "Unable to get command from dict");
+        goto out;
+    }
+
+    /* Arguments are optional; a missing count just means "no arguments". */
+    ret = dict_get_int32(dict, "cmd_args_count", &cmd_args_count);
+    if (ret)
+        gf_msg(this->name, GF_LOG_INFO, 0, GD_MSG_DICT_GET_FAILED,
+               "No cmd_args_count");
+
+    if (cmd_args_count) {
+        cmd_args = GF_CALLOC(cmd_args_count, sizeof(char *), gf_common_mt_char);
+        if (!cmd_args) {
+            gf_msg(this->name, GF_LOG_ERROR, ENOMEM, GD_MSG_NO_MEMORY,
+                   "Unable to calloc. Errno = %s", strerror(errno));
+            /* ret still holds the successful dict lookup; report failure. */
+            ret = -1;
+            goto out;
+        }
+
+        /* Dict keys are 1-based, the array is 0-based. */
+        for (i = 1; i <= cmd_args_count; i++) {
+            snprintf(cmd_arg_name, sizeof(cmd_arg_name), "cmd_arg_%d", i);
+            ret = dict_get_str(dict, cmd_arg_name, &cmd_args[i - 1]);
+            if (ret) {
+                gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_DICT_GET_FAILED,
+                       "Unable to get"
+                       " %s in dict",
+                       cmd_arg_name);
+                goto out;
+            }
+        }
+    }
+
+    runinit(&runner);
+    runner_argprintf(&runner, GSYNCD_PREFIX "/peer_%s", command);
+    for (i = 0; i < cmd_args_count; i++)
+        runner_add_arg(&runner, cmd_args[i]);
+    runner_redir(&runner, STDOUT_FILENO, RUN_PIPE);
+    synclock_unlock(&priv->big_lock);
+    ret = runner_start(&runner);
+    if (ret == -1) {
+        snprintf(errmsg, sizeof(errmsg),
+                 "Unable to "
+                 "execute command. Error : %s",
+                 strerror(errno));
+        *op_errstr = gf_strdup(errmsg);
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_CMD_EXEC_FAIL, "%s", errmsg);
+        ret = -1;
+        synclock_lock(&priv->big_lock);
+        goto out;
+    }
+
+    do {
+        ptr = fgets(buf, sizeof(buf), runner_chio(&runner, STDOUT_FILENO));
+        if (ptr) {
+            ret = dict_get_int32(rsp_dict, "output_count", &output_count);
+            if (ret)
+                output_count = 1;
+            else
+                output_count++;
+            snprintf(output_name, sizeof(output_name), "output_%d",
+                     output_count);
+            /*
+             * Strip the trailing newline. Guard against an empty string
+             * (fgets can return one if the line starts with a NUL byte) so
+             * we never index buf[-1].
+             */
+            buf_len = strlen(buf);
+            if ((buf_len > 0) && (buf[buf_len - 1] == '\n'))
+                buf[buf_len - 1] = '\0';
+            bufp = gf_strdup(buf);
+            if (!bufp)
+                gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_STRDUP_FAILED,
+                       "gf_strdup failed.");
+            /* NOTE(review): a NULL bufp is still handed to dict_set_dynstr;
+             * presumably it rejects NULL values - confirm. */
+            ret = dict_set_dynstr(rsp_dict, output_name, bufp);
+            if (ret) {
+                GF_FREE(bufp);
+                gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_DICT_SET_FAILED,
+                       "output set "
+                       "failed.");
+            }
+            ret = dict_set_int32(rsp_dict, "output_count", output_count);
+            if (ret)
+                gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_DICT_SET_FAILED,
+                       "output_count "
+                       "set failed.");
+        }
+    } while (ptr);
+
+    ret = runner_end(&runner);
+    if (ret) {
+        snprintf(errmsg, sizeof(errmsg),
+                 "Unable to "
+                 "end. Error : %s",
+                 strerror(errno));
+        *op_errstr = gf_strdup(errmsg);
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_UNABLE_TO_END, "%s", errmsg);
+        ret = -1;
+        synclock_lock(&priv->big_lock);
+        goto out;
+    }
+    synclock_lock(&priv->big_lock);
+
+    ret = 0;
+out:
+    /* Only the pointer array is ours; the strings belong to 'dict'. */
+    if (cmd_args) {
+        GF_FREE(cmd_args);
+        cmd_args = NULL;
+    }
+
+    gf_msg_debug(this->name, 0, "Returning %d", ret);
+    return ret;
+}
+
+/*
+ * Distribute a file that lives under glusterd's working directory.
+ *
+ * 'dict' must carry "host-uuid" (the node that owns the source file) and
+ * "source" (path relative to priv->workdir).
+ *
+ *   - On the node whose uuid matches "host-uuid" the file is read and its
+ *     contents, size and mode are stored in 'dict' under
+ *     "common_pem_contents", "contents_size" and "file_mode".
+ *   - On every other node those three keys are read back from 'dict' and
+ *     the file is (re)created at the same relative path with that content
+ *     and mode.
+ *
+ * Returns 0 on success, non-zero on failure; most failures also store an
+ * allocated message in *op_errstr.
+ */
+int
+glusterd_op_copy_file(dict_t *dict, char **op_errstr)
+{
+    char abs_filename[PATH_MAX] = "";
+    char errmsg[PATH_MAX] = "";
+    char *filename = NULL;
+    char *host_uuid = NULL;
+    char uuid_str[64] = {0};
+    char *contents = NULL;
+    char buf[4096] = "";
+    int ret = -1;
+    int fd = -1;
+    int bytes_writen = 0;
+    int bytes_read = 0;
+    int contents_size = -1;
+    int file_mode = -1;
+    glusterd_conf_t *priv = NULL;
+    struct stat stbuf = {
+        0,
+    };
+    gf_boolean_t free_contents = _gf_true;
+    gf_boolean_t read_overrun = _gf_false;
+    xlator_t *this = NULL;
+    int32_t len = 0;
+
+    this = THIS;
+    GF_ASSERT(this);
+
+    if (THIS)
+        priv = THIS->private;
+    if (priv == NULL) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_GLUSTERD_PRIV_NOT_FOUND,
+               "priv of glusterd not present");
+        *op_errstr = gf_strdup("glusterd defunct");
+        goto out;
+    }
+
+    ret = dict_get_str(dict, "host-uuid", &host_uuid);
+    if (ret < 0)
+        goto out;
+
+    ret = dict_get_str(dict, "source", &filename);
+    if (ret < 0) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_DICT_GET_FAILED,
+               "Unable to fetch filename from dict.");
+        *op_errstr = gf_strdup("command unsuccessful");
+        goto out;
+    }
+    len = snprintf(abs_filename, sizeof(abs_filename), "%s/%s", priv->workdir,
+                   filename);
+    if ((len < 0) || (len >= sizeof(abs_filename))) {
+        ret = -1;
+        goto out;
+    }
+
+    uuid_utoa_r(MY_UUID, uuid_str);
+    if (!strcmp(uuid_str, host_uuid)) {
+        /* This node owns the source file: load it into the dict. */
+        ret = sys_lstat(abs_filename, &stbuf);
+        if (ret) {
+            len = snprintf(errmsg, sizeof(errmsg),
+                           "Source file "
+                           "does not exist in %s",
+                           priv->workdir);
+            if (len < 0) {
+                strcpy(errmsg, "<error>");
+            }
+            *op_errstr = gf_strdup(errmsg);
+            gf_msg(this->name, GF_LOG_ERROR, ENOENT, GD_MSG_FILE_OP_FAILED,
+                   "%s", errmsg);
+            goto out;
+        }
+
+        contents = GF_CALLOC(1, stbuf.st_size + 1, gf_common_mt_char);
+        if (!contents) {
+            snprintf(errmsg, sizeof(errmsg), "Unable to allocate memory");
+            *op_errstr = gf_strdup(errmsg);
+            gf_msg(this->name, GF_LOG_ERROR, ENOMEM, GD_MSG_NO_MEMORY, "%s",
+                   errmsg);
+            ret = -1;
+            goto out;
+        }
+
+        fd = open(abs_filename, O_RDONLY);
+        if (fd < 0) {
+            len = snprintf(errmsg, sizeof(errmsg), "Unable to open %s",
+                           abs_filename);
+            if (len < 0) {
+                strcpy(errmsg, "<error>");
+            }
+            *op_errstr = gf_strdup(errmsg);
+            gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_FILE_OP_FAILED, "%s",
+                   errmsg);
+            ret = -1;
+            goto out;
+        }
+
+        do {
+            ret = sys_read(fd, buf, sizeof(buf));
+            if (ret > 0) {
+                /*
+                 * 'contents' was sized from the earlier lstat. If the file
+                 * grew in the meantime, stop before writing past the end
+                 * of the allocation.
+                 */
+                if (ret > stbuf.st_size - bytes_read) {
+                    read_overrun = _gf_true;
+                    break;
+                }
+                memcpy(contents + bytes_read, buf, ret);
+                bytes_read += ret;
+            }
+        } while (ret > 0);
+
+        if (read_overrun || (bytes_read != stbuf.st_size)) {
+            len = snprintf(errmsg, sizeof(errmsg),
+                           "Unable to read all the data from %s", abs_filename);
+            if (len < 0) {
+                strcpy(errmsg, "<error>");
+            }
+            *op_errstr = gf_strdup(errmsg);
+            gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_READ_ERROR, "%s",
+                   errmsg);
+            ret = -1;
+            goto out;
+        }
+
+        ret = dict_set_int32(dict, "contents_size", stbuf.st_size);
+        if (ret) {
+            snprintf(errmsg, sizeof(errmsg),
+                     "Unable to set"
+                     " contents size in dict.");
+            *op_errstr = gf_strdup(errmsg);
+            gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_DICT_SET_FAILED, "%s",
+                   errmsg);
+            goto out;
+        }
+
+        ret = dict_set_int32(dict, "file_mode", (int32_t)stbuf.st_mode);
+        if (ret) {
+            snprintf(errmsg, sizeof(errmsg),
+                     "Unable to set"
+                     " file mode in dict.");
+            *op_errstr = gf_strdup(errmsg);
+            gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_DICT_SET_FAILED, "%s",
+                   errmsg);
+            goto out;
+        }
+
+        ret = dict_set_bin(dict, "common_pem_contents", contents,
+                           stbuf.st_size);
+        if (ret) {
+            snprintf(errmsg, sizeof(errmsg),
+                     "Unable to set"
+                     " pem contents in dict.");
+            *op_errstr = gf_strdup(errmsg);
+            gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_DICT_SET_FAILED, "%s",
+                   errmsg);
+            goto out;
+        }
+        /* After a successful dict_set_bin we no longer free 'contents'. */
+        free_contents = _gf_false;
+    } else {
+        /* Receiving node: 'contents' points into the dict, never free it. */
+        free_contents = _gf_false;
+        ret = dict_get_bin(dict, "common_pem_contents", (void **)&contents);
+        if (ret) {
+            snprintf(errmsg, sizeof(errmsg),
+                     "Unable to get"
+                     " pem contents in dict.");
+            *op_errstr = gf_strdup(errmsg);
+            gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_DICT_GET_FAILED, "%s",
+                   errmsg);
+            goto out;
+        }
+        ret = dict_get_int32(dict, "contents_size", &contents_size);
+        if (ret) {
+            snprintf(errmsg, sizeof(errmsg),
+                     "Unable to get"
+                     " contents size in dict.");
+            *op_errstr = gf_strdup(errmsg);
+            gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_DICT_GET_FAILED, "%s",
+                   errmsg);
+            goto out;
+        }
+        if (contents_size < 0) {
+            /*
+             * A negative size would be converted to a huge write length,
+             * and a failed write (-1) would compare equal to a size of -1
+             * and be mistaken for success.
+             */
+            snprintf(errmsg, sizeof(errmsg),
+                     "Invalid contents size in dict.");
+            *op_errstr = gf_strdup(errmsg);
+            gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_DICT_GET_FAILED, "%s",
+                   errmsg);
+            ret = -1;
+            goto out;
+        }
+
+        ret = dict_get_int32(dict, "file_mode", &file_mode);
+        if (ret) {
+            snprintf(errmsg, sizeof(errmsg),
+                     "Unable to get"
+                     " file mode in dict.");
+            *op_errstr = gf_strdup(errmsg);
+            gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_DICT_GET_FAILED, "%s",
+                   errmsg);
+            goto out;
+        }
+
+        /* Created owner-only first; the real mode is applied after writing. */
+        fd = open(abs_filename, O_WRONLY | O_TRUNC | O_CREAT, 0600);
+        if (fd < 0) {
+            len = snprintf(errmsg, sizeof(errmsg), "Unable to open %s",
+                           abs_filename);
+            if (len < 0) {
+                strcpy(errmsg, "<error>");
+            }
+            *op_errstr = gf_strdup(errmsg);
+            gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_FILE_OP_FAILED, "%s",
+                   errmsg);
+            ret = -1;
+            goto out;
+        }
+
+        bytes_writen = sys_write(fd, contents, contents_size);
+
+        if (bytes_writen != contents_size) {
+            len = snprintf(errmsg, sizeof(errmsg), "Failed to write to %s",
+                           abs_filename);
+            if (len < 0) {
+                strcpy(errmsg, "<error>");
+            }
+            *op_errstr = gf_strdup(errmsg);
+            gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_FILE_OP_FAILED, "%s",
+                   errmsg);
+            ret = -1;
+            goto out;
+        }
+
+        sys_fchmod(fd, file_mode);
+    }
+
+    ret = 0;
+out:
+    if (fd != -1)
+        sys_close(fd);
+
+    if (free_contents)
+        GF_FREE(contents);
+
+    gf_msg_debug(this->name, 0, "Returning %d", ret);
+    return ret;
+}
+
+/*
+ * Commit handler for geo-replication operations.
+ *
+ * 'dict' carries the request: "type" selects the operation
+ * (GF_GSYNC_OPTION_TYPE_*), "host-uuid" names the originating node and,
+ * for everything except STATUS, "slave", "slave_url", "slave_host",
+ * "slave_vol" and "conf_path" describe the session. "master" is optional
+ * for CONFIG and DELETE and required for the remaining operations.
+ *
+ * Operations:
+ *   STATUS  - delegate to glusterd_get_gsync_status();
+ *   CONFIG  - apply the configuration change, echo conf_path in rsp_dict;
+ *   DELETE  - drop the slave from the volume info and delete the session;
+ *   START   - mark the slave active and start gsyncd;
+ *   STOP / PAUSE / RESUME - act on the local gsyncd and keep
+ *             volinfo->gsync_active_slaves in step with the outcome.
+ *
+ * Returns 0 on success, non-zero on failure; *op_errstr may receive an
+ * allocated message.
+ */
+int
+glusterd_op_gsync_set(dict_t *dict, char **op_errstr, dict_t *rsp_dict)
+{
+    int32_t ret = -1;
+    int32_t type = -1;
+    char *host_uuid = NULL;
+    char *slave = NULL;
+    char *slave_url = NULL;
+    char *slave_vol = NULL;
+    char *slave_host = NULL;
+    char *volname = NULL;
+    char *path_list = NULL;
+    glusterd_volinfo_t *volinfo = NULL;
+    glusterd_conf_t *priv = NULL;
+    gf_boolean_t is_force = _gf_false;
+    char *status_msg = NULL;
+    gf_boolean_t is_running = _gf_false;
+    char *conf_path = NULL;
+    char *key = NULL;
+    xlator_t *this = NULL;
+
+    this = THIS;
+    GF_ASSERT(this);
+    priv = this->private;
+    GF_ASSERT(priv);
+    GF_ASSERT(dict);
+    GF_ASSERT(op_errstr);
+    GF_ASSERT(rsp_dict);
+
+    ret = dict_get_int32(dict, "type", &type);
+    if (ret < 0)
+        goto out;
+
+    ret = dict_get_str(dict, "host-uuid", &host_uuid);
+    if (ret < 0)
+        goto out;
+
+    if (type == GF_GSYNC_OPTION_TYPE_STATUS) {
+        ret = glusterd_get_gsync_status(dict, op_errstr, rsp_dict);
+        goto out;
+    }
+
+    ret = dict_get_str(dict, "slave", &slave);
+    if (ret < 0)
+        goto out;
+
+    /* The slave string is the key used in gsync_active_slaves. */
+    key = slave;
+
+    ret = dict_get_str(dict, "slave_url", &slave_url);
+    if (ret) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_DICT_GET_FAILED,
+               "Unable to fetch slave url.");
+        goto out;
+    }
+
+    ret = dict_get_str(dict, "slave_host", &slave_host);
+    if (ret) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_DICT_GET_FAILED,
+               "Unable to fetch slave hostname.");
+        goto out;
+    }
+
+    ret = dict_get_str(dict, "slave_vol", &slave_vol);
+    if (ret) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_DICT_GET_FAILED,
+               "Unable to fetch slave volume name.");
+        goto out;
+    }
+
+    ret = dict_get_str(dict, "conf_path", &conf_path);
+    if (ret) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_DICT_GET_FAILED,
+               "Unable to fetch conf file path.");
+        goto out;
+    }
+
+    if (dict_get_str(dict, "master", &volname) == 0) {
+        ret = glusterd_volinfo_find(volname, &volinfo);
+        if (ret) {
+            gf_msg(this->name, GF_LOG_WARNING, 0, GD_MSG_DICT_GET_FAILED,
+                   "Volinfo for"
+                   " %s (master) not found",
+                   volname);
+            goto out;
+        }
+
+        ret = glusterd_get_local_brickpaths(volinfo, &path_list);
+        if (!path_list && ret == -1)
+            goto out;
+    }
+
+    if (type == GF_GSYNC_OPTION_TYPE_CONFIG) {
+        ret = glusterd_gsync_configure(volinfo, slave, path_list, dict,
+                                       rsp_dict, op_errstr);
+        if (!ret) {
+            ret = dict_set_str(rsp_dict, "conf_path", conf_path);
+            if (ret) {
+                gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_DICT_SET_FAILED,
+                       "Unable to store conf_file_path.");
+                goto out;
+            }
+        }
+        goto out;
+    }
+
+    /*
+     * Read the force flag before any branch that consults it. The DELETE
+     * branch below tests is_force, so it must not be left at its initial
+     * value there.
+     */
+    is_force = dict_get_str_boolean(dict, "force", _gf_false);
+
+    if (type == GF_GSYNC_OPTION_TYPE_DELETE) {
+        ret = glusterd_remove_slave_in_info(volinfo, slave, op_errstr);
+        if (ret && !is_force && path_list)
+            goto out;
+
+        ret = glusterd_gsync_delete(volinfo, slave, slave_host, slave_vol,
+                                    path_list, dict, rsp_dict, op_errstr);
+        goto out;
+    }
+
+    /* Every remaining operation needs a master volume. */
+    if (!volinfo) {
+        ret = -1;
+        goto out;
+    }
+
+    if (type == GF_GSYNC_OPTION_TYPE_START) {
+        /* Add slave to the dict indicating geo-rep session is running*/
+        ret = dict_set_dynstr_with_alloc(volinfo->gsync_active_slaves, key,
+                                         "running");
+        if (ret) {
+            gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_DICT_SET_FAILED,
+                   "Unable to set key:%s"
+                   " value:running in the dict",
+                   key);
+            goto out;
+        }
+
+        /* If slave volume uuid is not present in gsync_slaves
+         * update it*/
+        ret = glusterd_update_slave_voluuid_slaveinfo(volinfo);
+        if (ret) {
+            gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_REMOTE_VOL_UUID_FAIL,
+                   "Error in updating"
+                   " slave volume uuid for old slave info");
+            goto out;
+        }
+
+        ret = glusterd_start_gsync(volinfo, slave, path_list, conf_path,
+                                   host_uuid, op_errstr, _gf_false);
+
+        /* Delete added slave in the dict if start fails*/
+        if (ret)
+            dict_del(volinfo->gsync_active_slaves, key);
+    }
+
+    if (type == GF_GSYNC_OPTION_TYPE_STOP ||
+        type == GF_GSYNC_OPTION_TYPE_PAUSE ||
+        type == GF_GSYNC_OPTION_TYPE_RESUME) {
+        ret = glusterd_check_gsync_running_local(volinfo->volname, slave,
+                                                 conf_path, &is_running);
+        if (!ret && !is_force && path_list && (_gf_true != is_running)) {
+            gf_msg(this->name, GF_LOG_WARNING, 0, GD_MSG_GSYNCD_OP_SET_FAILED,
+                   GEOREP
+                   " is not "
+                   "set up for %s(master) and %s(slave)",
+                   volname, slave);
+            *op_errstr = gf_strdup(GEOREP " is not set up");
+            /* NOTE(review): ret is 0 here, so this path reports success
+             * while setting *op_errstr - confirm that is intended. */
+            goto out;
+        }
+
+        if (type == GF_GSYNC_OPTION_TYPE_PAUSE) {
+            ret = gd_pause_or_resume_gsync(dict, volname, slave, slave_host,
+                                           slave_vol, conf_path, op_errstr,
+                                           _gf_true);
+            if (ret)
+                gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_PAUSE_FAILED,
+                       GEOREP " Pause Failed");
+            else
+                dict_del(volinfo->gsync_active_slaves, key);
+
+        } else if (type == GF_GSYNC_OPTION_TYPE_RESUME) {
+            /* Add slave to the dict indicating geo-rep session is
+             * running*/
+            ret = dict_set_dynstr_with_alloc(volinfo->gsync_active_slaves, key,
+                                             "running");
+            if (ret) {
+                gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_DICT_SET_FAILED,
+                       "Unable to set "
+                       "key:%s value:running in dict",
+                       key);
+                goto out;
+            }
+
+            ret = gd_pause_or_resume_gsync(dict, volname, slave, slave_host,
+                                           slave_vol, conf_path, op_errstr,
+                                           _gf_false);
+            if (ret) {
+                gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_RESUME_FAILED,
+                       GEOREP " Resume Failed");
+                /* Roll back the "running" marker added above. */
+                dict_del(volinfo->gsync_active_slaves, key);
+            }
+        } else {
+            ret = stop_gsync(volname, slave, &status_msg, conf_path, op_errstr,
+                             is_force);
+
+            if (ret == 0 && status_msg)
+                ret = dict_set_str(rsp_dict, "gsync-status", status_msg);
+            if (!ret) {
+                ret = glusterd_create_status_file(
+                    volinfo->volname, slave, slave_host, slave_vol, "Stopped");
+                if (ret) {
+                    gf_msg(this->name, GF_LOG_ERROR, 0,
+                           GD_MSG_UPDATE_STATEFILE_FAILED,
+                           "Unable to update state_file. "
+                           "Error : %s",
+                           strerror(errno));
+                }
+                dict_del(volinfo->gsync_active_slaves, key);
+            }
+        }
+    }
+
+out:
+    if (path_list) {
+        GF_FREE(path_list);
+        path_list = NULL;
+    }
+
+    gf_msg_debug(this->name, 0, "Returning %d", ret);
+    return ret;
+}
+
+/*
+ * Resolve the "slave" entry of @dict into its URL, host and volume parts,
+ * build the path of the session's gsyncd.conf and store all four values back
+ * into @dict under "slave_url", "slave_host", "slave_vol" and "conf_path".
+ *
+ * @volinfo:    master volume; its volname is part of the conf path.
+ * @dict:       operation dict; must contain "slave".
+ * @slave_url:  out, filled by glusterd_get_slave_info().
+ * @slave_host: out, filled by glusterd_get_slave_info().
+ * @slave_vol:  out, filled by glusterd_get_slave_info().
+ * @conf_path:  out, gf_strdup()'d "<workdir>/GEOREP/<vol>_<host>_<svol>/
+ *              gsyncd.conf".
+ * @op_errstr:  out, may be set by glusterd_get_slave_info() on failure.
+ *
+ * Returns 0 on success and non-zero on failure.
+ */
+int
+glusterd_get_slave_details_confpath(glusterd_volinfo_t *volinfo, dict_t *dict,
+                                    char **slave_url, char **slave_host,
+                                    char **slave_vol, char **conf_path,
+                                    char **op_errstr)
+{
+    int ret = -1;
+    char confpath[PATH_MAX] = "";
+    glusterd_conf_t *priv = NULL;
+    char *slave = NULL;
+    xlator_t *this = NULL;
+
+    this = THIS;
+    GF_ASSERT(this);
+
+    priv = this->private;
+    GF_ASSERT(priv);
+
+    ret = dict_get_str(dict, "slave", &slave);
+    if (ret || !slave) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_DICT_GET_FAILED,
+               "Unable to fetch slave from dict");
+        ret = -1;
+        goto out;
+    }
+
+    ret = glusterd_get_slave_info(slave, slave_url, slave_host, slave_vol,
+                                  op_errstr);
+    if (ret) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_SLAVEINFO_FETCH_ERROR,
+               "Unable to fetch slave details.");
+        ret = -1;
+        goto out;
+    }
+
+    ret = dict_set_str(dict, "slave_url", *slave_url);
+    if (ret) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_DICT_SET_FAILED,
+               "Unable to store slave IP.");
+        goto out;
+    }
+
+    ret = dict_set_str(dict, "slave_host", *slave_host);
+    if (ret) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_DICT_SET_FAILED,
+               "Unable to store slave hostname");
+        goto out;
+    }
+
+    ret = dict_set_str(dict, "slave_vol", *slave_vol);
+    if (ret) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_DICT_SET_FAILED,
+               "Unable to store slave volume name.");
+        goto out;
+    }
+
+    /* snprintf() NUL-terminates on its own; its return value must not be
+     * used as an index because it is negative on error and >= the buffer
+     * size on truncation. Treat both as failure instead of using a
+     * truncated configuration path. */
+    ret = snprintf(confpath, sizeof(confpath),
+                   "%s/" GEOREP "/%s_%s_%s/gsyncd.conf", priv->workdir,
+                   volinfo->volname, *slave_host, *slave_vol);
+    if ((ret < 0) || (ret >= sizeof(confpath))) {
+        ret = -1;
+        goto out;
+    }
+
+    *conf_path = gf_strdup(confpath);
+    if (!(*conf_path)) {
+        gf_msg(this->name, GF_LOG_ERROR, errno, GD_MSG_STRDUP_FAILED,
+               "Unable to gf_strdup. Error: %s", strerror(errno));
+        ret = -1;
+        goto out;
+    }
+
+    ret = dict_set_str(dict, "conf_path", *conf_path);
+    if (ret) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_DICT_SET_FAILED,
+               "Unable to store conf_path");
+        goto out;
+    }
+
+out:
+    gf_msg_debug(this->name, 0, "Returning %d", ret);
+    return ret;
+}
+
+/*
+ * Split a slave specification into its components.
+ *
+ * @slave is first normalized through glusterd_urltransform_single(); the
+ * second '/'-separated token of the normalized URL is then split on ':' into
+ * the slave URL part and the slave volume name. The URL part is additionally
+ * handed to glusterd_geo_rep_parse_slave() to obtain @hostname.
+ *
+ * @slave:     slave specification to parse.
+ * @slave_url: out, gf_strdup()'d copy of the URL part.
+ * @hostname:  out, filled by glusterd_geo_rep_parse_slave().
+ * @slave_vol: out, gf_strdup()'d copy of the slave volume name.
+ * @op_errstr: out, set to an "Invalid Url" message when normalization
+ *             fails; may also be set by glusterd_geo_rep_parse_slave().
+ *
+ * Returns 0 on success and non-zero on failure. When failure is detected
+ * after *slave_url was allocated, it is freed and reset to NULL.
+ */
+int
+glusterd_get_slave_info(char *slave, char **slave_url, char **hostname,
+                        char **slave_vol, char **op_errstr)
+{
+    char *tmp = NULL;
+    char *save_ptr = NULL;
+    char **linearr = NULL;
+    int32_t ret = -1;
+    char errmsg[PATH_MAX] = "";
+    xlator_t *this = NULL;
+
+    this = THIS;
+    GF_ASSERT(this);
+
+    ret = glusterd_urltransform_single(slave, "normalize", &linearr);
+    if ((ret == -1) || (linearr == NULL) || (linearr[0] == NULL)) {
+        /* snprintf() always NUL-terminates; a truncated message is fine
+         * here, but its return value must not be used as an index nor
+         * leak out as the function's return code. */
+        snprintf(errmsg, sizeof(errmsg), "Invalid Url: %s", slave);
+        *op_errstr = gf_strdup(errmsg);
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_NORMALIZE_URL_FAIL,
+               "Failed to normalize url");
+        ret = -1;
+        goto out;
+    }
+
+    /* Skip the first '/'-separated token and work on the second one. */
+    tmp = strtok_r(linearr[0], "/", &save_ptr);
+    tmp = strtok_r(NULL, "/", &save_ptr);
+    slave = NULL;
+    if (tmp != NULL) {
+        slave = strtok_r(tmp, ":", &save_ptr);
+    }
+    if (slave) {
+        ret = glusterd_geo_rep_parse_slave(slave, hostname, op_errstr);
+        if (ret) {
+            gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_SLAVE_URL_INVALID,
+                   "Invalid slave url: %s", *op_errstr);
+            goto out;
+        }
+        gf_msg_debug(this->name, 0, "Hostname : %s", *hostname);
+
+        *slave_url = gf_strdup(slave);
+        if (!*slave_url) {
+            gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_STRDUP_FAILED,
+                   "Failed to gf_strdup");
+            ret = -1;
+            goto out;
+        }
+        gf_msg_debug(this->name, 0, "Slave URL : %s", *slave_url);
+        ret = 0;
+    } else {
+        gf_msg(this->name, GF_LOG_ERROR, EINVAL, GD_MSG_INVALID_ENTRY,
+               "Invalid slave name");
+        ret = -1;
+        goto out;
+    }
+
+    /* Next ':'-separated token is the slave volume name. */
+    slave = strtok_r(NULL, ":", &save_ptr);
+    if (slave) {
+        *slave_vol = gf_strdup(slave);
+        if (!*slave_vol) {
+            gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_STRDUP_FAILED,
+                   "Failed to gf_strdup");
+            ret = -1;
+            GF_FREE(*slave_url);
+            /* Do not leave a dangling pointer in the caller's hands. */
+            *slave_url = NULL;
+            goto out;
+        }
+        gf_msg_debug(this->name, 0, "Slave Vol : %s", *slave_vol);
+        ret = 0;
+    } else {
+        gf_msg(this->name, GF_LOG_ERROR, EINVAL, GD_MSG_INVALID_ENTRY,
+               "Invalid slave name");
+        /* ret is 0 from the URL step above; a missing volume name is an
+         * error and must be reported as one. */
+        ret = -1;
+        GF_FREE(*slave_url);
+        *slave_url = NULL;
+        goto out;
+    }
+
+out:
+    if (linearr)
+        glusterd_urltransform_free(linearr, 1);
+    gf_msg_debug(this->name, 0, "Returning %d", ret);
+    return ret;
+}
+
+/*
+ * Initialise @runner and load the argument prefix shared by every
+ * configuration command issued below:
+ *     <GSYNCD_PREFIX>/gsyncd -c <conf_path> --config-set-rx
+ * Callers append the option name, value and match patterns afterwards.
+ */
+static void
+runinit_gsyncd_setrx(runner_t *runner, char *conf_path)
+{
+    runinit(runner);
+
+    /* binary and config-file switch, one argument at a time */
+    runner_add_arg(runner, GSYNCD_PREFIX "/gsyncd");
+    runner_add_arg(runner, "-c");
+
+    /* config file path, then the sub-command */
+    runner_argprintf(runner, "%s", conf_path);
+    runner_add_arg(runner, "--config-set-rx");
+}
+
+/*
+ * Probe for a usable gsyncd binary by running "gsyncd --version" and looking
+ * for the string "gsyncd" in the first line of its output.
+ *
+ * @valid_state: out; set to 0 when the binary does not exist (ENOENT) and to
+ *               -1 when it exists but does not behave as expected. It is
+ *               left untouched on success.
+ *
+ * Returns 0 when gsyncd answered as expected, -1 otherwise.
+ */
+static int
+glusterd_check_gsync_present(int *valid_state)
+{
+    char version_line[PATH_MAX] = {
+        0,
+    };
+    runner_t runner = {
+        0,
+    };
+    int ret = 0;
+    xlator_t *this = THIS;
+
+    GF_ASSERT(this);
+
+    runinit(&runner);
+    runner_add_args(&runner, GSYNCD_PREFIX "/gsyncd", "--version", NULL);
+    runner_redir(&runner, STDOUT_FILENO, RUN_PIPE);
+
+    ret = runner_start(&runner);
+    if (ret == -1) {
+        if (errno != ENOENT) {
+            gf_msg("glusterd", GF_LOG_ERROR, 0, GD_MSG_MODULE_ERROR,
+                   GEOREP " module not working as desired");
+            *valid_state = -1;
+        } else {
+            /* A missing binary is reported as "not installed". */
+            gf_msg("glusterd", GF_LOG_INFO, ENOENT, GD_MSG_MODULE_NOT_INSTALLED,
+                   GEOREP
+                   " module "
+                   "not installed in the system");
+            *valid_state = 0;
+        }
+        goto out;
+    }
+
+    /* No output at all, or output that does not mention gsyncd, both mean
+     * the module is broken. */
+    if (!fgets(version_line, sizeof(version_line),
+               runner_chio(&runner, STDOUT_FILENO)) ||
+        !strstr(version_line, "gsyncd")) {
+        ret = -1;
+        gf_msg("glusterd", GF_LOG_ERROR, 0, GD_MSG_MODULE_ERROR,
+               GEOREP " module not working as desired");
+        *valid_state = -1;
+        goto out;
+    }
+
+    ret = 0;
+out:
+
+    runner_end(&runner);
+
+    gf_msg_debug("glusterd", 0, "Returning %d", ret);
+    return ret;
+}
+
+/*
+ * Run the command currently loaded in the local `runner`, log and bail out
+ * to the local `out` label on failure, and release the runner either way.
+ * Relies on `ret`, `runner` and `out` being in scope at the point of use.
+ */
+#define RUN_GSYNCD_CMD                                                         \
+    do {                                                                       \
+        ret = runner_run_reuse(&runner);                                       \
+        if (ret == -1) {                                                       \
+            runner_log(&runner, "glusterd", GF_LOG_ERROR, "command failed");   \
+            runner_end(&runner);                                               \
+            goto out;                                                          \
+        }                                                                      \
+        runner_end(&runner);                                                   \
+    } while (0)
+
+/*
+ * Populate the gsyncd configuration file at @conf_path with the default
+ * master and slave settings by invoking
+ * "gsyncd -c <conf_path> --config-set-rx <option> <value> <patterns...>"
+ * once per option.
+ *
+ * @conf:      glusterd configuration; workdir and logdir are used to build
+ *             option values.
+ * @conf_path: path handed to gsyncd via "-c".
+ *
+ * Returns 0 on success, and also 0 when glusterd_check_gsync_present()
+ * reports the module as not installed (valid_state 0), in which case no
+ * command is run. Returns -1 when gsyncd is present but unusable, when the
+ * working-directory path does not fit its buffer, or when any gsyncd
+ * invocation fails.
+ */
+static int
+create_conf_file(glusterd_conf_t *conf, char *conf_path)
+{
+    int ret = 0;
+    runner_t runner = {
+        0,
+    };
+    char georepdir[PATH_MAX] = {
+        0,
+    };
+    int valid_state = 0;
+
+    valid_state = -1;
+    ret = glusterd_check_gsync_present(&valid_state);
+    if (-1 == ret) {
+        ret = valid_state;
+        goto out;
+    }
+
+    /* The snprintf() result must not be used as an array index: it is
+     * negative on error and >= the buffer size on truncation. */
+    ret = snprintf(georepdir, sizeof(georepdir), "%s/" GEOREP, conf->workdir);
+    if ((ret < 0) || (ret >= sizeof(georepdir))) {
+        ret = -1;
+        goto out;
+    }
+
+    /************
+     * master pre-configuration
+     ************/
+
+    /* remote-gsyncd */
+    runinit_gsyncd_setrx(&runner, conf_path);
+    runner_add_args(&runner, "remote-gsyncd", GSYNCD_PREFIX "/gsyncd", ".", ".",
+                    NULL);
+    RUN_GSYNCD_CMD;
+
+    runinit_gsyncd_setrx(&runner, conf_path);
+    runner_add_args(&runner, "remote-gsyncd", "/nonexistent/gsyncd", ".",
+                    "^ssh:", NULL);
+    RUN_GSYNCD_CMD;
+
+    /* gluster-command-dir */
+    runinit_gsyncd_setrx(&runner, conf_path);
+    runner_add_args(&runner, "gluster-command-dir", SBIN_DIR "/", ".", ".",
+                    NULL);
+    RUN_GSYNCD_CMD;
+
+    /* gluster-params */
+    runinit_gsyncd_setrx(&runner, conf_path);
+    runner_add_args(&runner, "gluster-params", "aux-gfid-mount acl", ".", ".",
+                    NULL);
+    RUN_GSYNCD_CMD;
+
+    /* ssh-command */
+    runinit_gsyncd_setrx(&runner, conf_path);
+    runner_add_arg(&runner, "ssh-command");
+    runner_argprintf(&runner,
+                     "ssh -oPasswordAuthentication=no "
+                     "-oStrictHostKeyChecking=no "
+                     "-i %s/secret.pem",
+                     georepdir);
+    runner_add_args(&runner, ".", ".", NULL);
+    RUN_GSYNCD_CMD;
+
+    /* ssh-command tar */
+    runinit_gsyncd_setrx(&runner, conf_path);
+    runner_add_arg(&runner, "ssh-command-tar");
+    runner_argprintf(&runner,
+                     "ssh -oPasswordAuthentication=no "
+                     "-oStrictHostKeyChecking=no "
+                     "-i %s/tar_ssh.pem",
+                     georepdir);
+    runner_add_args(&runner, ".", ".", NULL);
+    RUN_GSYNCD_CMD;
+
+    /* pid-file */
+    runinit_gsyncd_setrx(&runner, conf_path);
+    runner_add_arg(&runner, "pid-file");
+    runner_argprintf(&runner,
+                     "%s/${mastervol}_${remotehost}_${slavevol}/monitor.pid",
+                     georepdir);
+    runner_add_args(&runner, ".", ".", NULL);
+    RUN_GSYNCD_CMD;
+
+    /* geo-rep-working-dir */
+    runinit_gsyncd_setrx(&runner, conf_path);
+    runner_add_arg(&runner, "georep-session-working-dir");
+    runner_argprintf(&runner, "%s/${mastervol}_${remotehost}_${slavevol}/",
+                     georepdir);
+    runner_add_args(&runner, ".", ".", NULL);
+    RUN_GSYNCD_CMD;
+
+    /* state-file */
+    runinit_gsyncd_setrx(&runner, conf_path);
+    runner_add_arg(&runner, "state-file");
+    runner_argprintf(&runner,
+                     "%s/${mastervol}_${remotehost}_${slavevol}/monitor.status",
+                     georepdir);
+    runner_add_args(&runner, ".", ".", NULL);
+    RUN_GSYNCD_CMD;
+
+    /* state-detail-file */
+    runinit_gsyncd_setrx(&runner, conf_path);
+    runner_add_arg(&runner, "state-detail-file");
+    runner_argprintf(
+        &runner,
+        "%s/${mastervol}_${remotehost}_${slavevol}/${eSlave}-detail.status",
+        georepdir);
+    runner_add_args(&runner, ".", ".", NULL);
+    RUN_GSYNCD_CMD;
+
+    /* state-socket */
+    runinit_gsyncd_setrx(&runner, conf_path);
+    runner_add_arg(&runner, "state-socket-unencoded");
+    runner_argprintf(
+        &runner, "%s/${mastervol}_${remotehost}_${slavevol}/${eSlave}.socket",
+        georepdir);
+    runner_add_args(&runner, ".", ".", NULL);
+    RUN_GSYNCD_CMD;
+
+    /* socketdir */
+    runinit_gsyncd_setrx(&runner, conf_path);
+    runner_add_args(&runner, "socketdir", GLUSTERD_SOCK_DIR, ".", ".", NULL);
+    RUN_GSYNCD_CMD;
+
+    /* log-file */
+    runinit_gsyncd_setrx(&runner, conf_path);
+    runner_add_arg(&runner, "log-file");
+    runner_argprintf(&runner, "%s/%s/${mastervol}/${eSlave}.log", conf->logdir,
+                     GEOREP);
+    runner_add_args(&runner, ".", ".", NULL);
+    RUN_GSYNCD_CMD;
+
+    /* changelog-log-file */
+    runinit_gsyncd_setrx(&runner, conf_path);
+    runner_add_arg(&runner, "changelog-log-file");
+    runner_argprintf(&runner,
+                     "%s/%s/${mastervol}/${eSlave}${local_id}-changes.log",
+                     conf->logdir, GEOREP);
+    runner_add_args(&runner, ".", ".", NULL);
+    RUN_GSYNCD_CMD;
+
+    /* gluster-log-file */
+    runinit_gsyncd_setrx(&runner, conf_path);
+    runner_add_arg(&runner, "gluster-log-file");
+    runner_argprintf(&runner,
+                     "%s/%s/${mastervol}/${eSlave}${local_id}.gluster.log",
+                     conf->logdir, GEOREP);
+    runner_add_args(&runner, ".", ".", NULL);
+    RUN_GSYNCD_CMD;
+
+    /* ignore-deletes */
+    runinit_gsyncd_setrx(&runner, conf_path);
+    runner_add_args(&runner, "ignore-deletes", "false", ".", ".", NULL);
+    RUN_GSYNCD_CMD;
+
+    /* special-sync-mode */
+    runinit_gsyncd_setrx(&runner, conf_path);
+    runner_add_args(&runner, "special-sync-mode", "partial", ".", ".", NULL);
+    RUN_GSYNCD_CMD;
+
+    /* change-detector == changelog */
+    runinit_gsyncd_setrx(&runner, conf_path);
+    runner_add_args(&runner, "change-detector", "changelog", ".", ".", NULL);
+    RUN_GSYNCD_CMD;
+
+    /* working-dir */
+    runinit_gsyncd_setrx(&runner, conf_path);
+    runner_add_arg(&runner, "working-dir");
+    runner_argprintf(&runner, "%s/${mastervol}/${eSlave}",
+                     DEFAULT_GLUSTERFSD_MISC_DIRETORY);
+    runner_add_args(&runner, ".", ".", NULL);
+    RUN_GSYNCD_CMD;
+
+    /************
+     * slave pre-configuration
+     ************/
+
+    /* slave-gluster-command-dir */
+    runinit_gsyncd_setrx(&runner, conf_path);
+    runner_add_args(&runner, "slave-gluster-command-dir", SBIN_DIR "/", ".",
+                    NULL);
+    RUN_GSYNCD_CMD;
+
+    /* gluster-params */
+    runinit_gsyncd_setrx(&runner, conf_path);
+    runner_add_args(&runner, "gluster-params", "aux-gfid-mount acl", ".", NULL);
+    RUN_GSYNCD_CMD;
+
+    /* log-file */
+    runinit_gsyncd_setrx(&runner, conf_path);
+    runner_add_arg(&runner, "log-file");
+    runner_argprintf(&runner,
+                     "%s/%s-slaves/"
+                     "${session_owner}:${local_node}${local_id}.${slavevol}."
+                     "log",
+                     conf->logdir, GEOREP);
+    runner_add_args(&runner, ".", ".", NULL);
+    RUN_GSYNCD_CMD;
+
+    /* MountBroker log-file */
+    runinit_gsyncd_setrx(&runner, conf_path);
+    runner_add_arg(&runner, "log-file-mbr");
+    runner_argprintf(&runner,
+                     "%s/%s-slaves/mbr/"
+                     "${session_owner}:${local_node}${local_id}.${slavevol}."
+                     "log",
+                     conf->logdir, GEOREP);
+    runner_add_args(&runner, ".", ".", NULL);
+    RUN_GSYNCD_CMD;
+
+    /* gluster-log-file */
+    runinit_gsyncd_setrx(&runner, conf_path);
+    runner_add_arg(&runner, "gluster-log-file");
+    runner_argprintf(&runner,
+                     "%s/%s-slaves/"
+                     "${session_owner}:${local_node}${local_id}.${slavevol}."
+                     "gluster.log",
+                     conf->logdir, GEOREP);
+    runner_add_args(&runner, ".", ".", NULL);
+    RUN_GSYNCD_CMD;
+
+out:
+    /* Collapse any non-zero status to -1. */
+    return ret ? -1 : 0;
+}
+
+/*
+ * Create the directories and files a geo-replication session needs on this
+ * node: the session working directory under conf->workdir, the per-volume
+ * log directory under conf->logdir, the gsyncd config file and the status
+ * file. The config and status files are only created when lstat() shows they
+ * do not exist yet.
+ *
+ * @volinfo:    master volume.
+ * @dict:       operation dict; must provide "conf_path" and "statefile".
+ * @slave:      slave specification, passed to glusterd_create_status_file().
+ * @slave_host: slave host, part of the working directory name.
+ * @slave_vol:  slave volume, part of the working directory name.
+ * @op_errstr:  out; set on mkdir and status-file failures. The dict lookup
+ *              and config-file failures only log and do not set it.
+ *
+ * Returns the value of the last failing call (non-zero) or 0.
+ */
+static int
+glusterd_create_essential_dir_files(glusterd_volinfo_t *volinfo, dict_t *dict,
+                                    char *slave, char *slave_host,
+                                    char *slave_vol, char **op_errstr)
+{
+    int ret = -1;
+    char *conf_path = NULL;
+    char *statefile = NULL;
+    char buf[PATH_MAX] = "";
+    char errmsg[PATH_MAX] = "";
+    glusterd_conf_t *conf = NULL;
+    struct stat stbuf = {
+        0,
+    };
+    xlator_t *this = NULL;
+    int32_t len = 0;
+
+    this = THIS;
+    GF_ASSERT(this);
+
+    conf = this->private;
+
+    ret = dict_get_str(dict, "conf_path", &conf_path);
+    if (ret) {
+        snprintf(errmsg, sizeof(errmsg), "Unable to fetch conf file path.");
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_DICT_GET_FAILED, "%s",
+               errmsg);
+        goto out;
+    }
+
+    ret = dict_get_str(dict, "statefile", &statefile);
+    if (ret) {
+        snprintf(errmsg, sizeof(errmsg), "Unable to fetch statefile path.");
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_DICT_GET_FAILED, "%s",
+               errmsg);
+        goto out;
+    }
+
+    /* Session working directory: <workdir>/GEOREP/<vol>_<host>_<slavevol> */
+    ret = snprintf(buf, sizeof(buf), "%s/" GEOREP "/%s_%s_%s", conf->workdir,
+                   volinfo->volname, slave_host, slave_vol);
+    if ((ret < 0) || (ret >= sizeof(buf))) {
+        ret = -1;
+        goto out;
+    }
+    ret = mkdir_p(buf, 0755, _gf_true);
+    if (ret) {
+        len = snprintf(errmsg, sizeof(errmsg),
+                       "Unable to create %s"
+                       ". Error : %s",
+                       buf, strerror(errno));
+        if (len < 0) {
+            strcpy(errmsg, "<error>");
+        }
+        *op_errstr = gf_strdup(errmsg);
+        gf_msg(this->name, GF_LOG_ERROR, errno, GD_MSG_DIR_OP_FAILED, "%s",
+               errmsg);
+        goto out;
+    }
+
+    /* Per-volume log directory: <logdir>/GEOREP/<vol> (buf is reused) */
+    ret = snprintf(buf, PATH_MAX, "%s/" GEOREP "/%s", conf->logdir,
+                   volinfo->volname);
+    if ((ret < 0) || (ret >= PATH_MAX)) {
+        ret = -1;
+        goto out;
+    }
+    ret = mkdir_p(buf, 0755, _gf_true);
+    if (ret) {
+        len = snprintf(errmsg, sizeof(errmsg),
+                       "Unable to create %s"
+                       ". Error : %s",
+                       buf, strerror(errno));
+        if (len < 0) {
+            strcpy(errmsg, "<error>");
+        }
+        *op_errstr = gf_strdup(errmsg);
+        gf_msg(this->name, GF_LOG_ERROR, errno, GD_MSG_DIR_OP_FAILED, "%s",
+               errmsg);
+        goto out;
+    }
+
+    /* Only generate the config file when it is not already on disk. */
+    ret = sys_lstat(conf_path, &stbuf);
+    if (!ret) {
+        gf_msg_debug(this->name, 0,
+                     "Session already running."
+                     " Not creating config file again.");
+    } else {
+        ret = create_conf_file(conf, conf_path);
+        /* NOTE(review): when create_conf_file() returns 0 but the file
+         * still does not exist, ret stays 0 and this function returns
+         * success after logging an error; unlike the status-file branch
+         * below, ret is not forced to -1 and *op_errstr is not set.
+         * Confirm whether that is intended. */
+        if (ret || sys_lstat(conf_path, &stbuf)) {
+            snprintf(errmsg, sizeof(errmsg),
+                     "Failed to create"
+                     " config file(%s).",
+                     conf_path);
+            gf_msg(this->name, GF_LOG_ERROR, errno, GD_MSG_FILE_OP_FAILED, "%s",
+                   errmsg);
+            goto out;
+        }
+    }
+
+    /* Same for the status file; a new one starts in state "Created". */
+    ret = sys_lstat(statefile, &stbuf);
+    if (!ret) {
+        gf_msg_debug(this->name, 0,
+                     "Session already running."
+                     " Not creating status file again.");
+        goto out;
+    } else {
+        ret = glusterd_create_status_file(volinfo->volname, slave, slave_host,
+                                          slave_vol, "Created");
+        if (ret || sys_lstat(statefile, &stbuf)) {
+            snprintf(errmsg, sizeof(errmsg),
+                     "Unable to create %s"
+                     ". Error : %s",
+                     statefile, strerror(errno));
+            *op_errstr = gf_strdup(errmsg);
+            gf_msg(this->name, GF_LOG_ERROR, errno, GD_MSG_FILE_OP_FAILED, "%s",
+                   errmsg);
+            ret = -1;
+            goto out;
+        }
+    }
+
+out:
+    gf_msg_debug(this->name, 0, "Returning %d", ret);
+    return ret;
+}
+
+/*
+ * Commit handler for creating a geo-replication session.
+ *
+ * Steps, in order:
+ *   1. read volume, slave and host uuid through glusterd_op_gsync_args_get()
+ *      and the slave details from @dict;
+ *   2. build the "hooks_args" string and store it in @dict (only the node
+ *      whose uuid equals host_uuid gets real arguments);
+ *   3. for an existing session, rename the old session working directory to
+ *      the name derived from the new slave host;
+ *   4. create the session directories, config file and status file;
+ *   5. store the slave in the volume info and enable marker/changelog.
+ *
+ * @dict:      operation dict.
+ * @op_errstr: out; set from the local error message when one was recorded,
+ *             and possibly by the helpers called here.
+ * @rsp_dict:  not used by this function.
+ *
+ * Returns 0 on success and non-zero on failure.
+ */
+int
+glusterd_op_gsync_create(dict_t *dict, char **op_errstr, dict_t *rsp_dict)
+{
+    char common_pem_file[PATH_MAX] = "";
+    char errmsg[PATH_MAX] = {
+        0,
+    };
+    char hooks_args[PATH_MAX] = "";
+    char uuid_str[64] = "";
+    char *host_uuid = NULL;
+    char *slave_url = NULL;
+    char *slave_url_buf = NULL;
+    char *slave_user = NULL;
+    char *slave_ip = NULL;
+    char *save_ptr = NULL;
+    char *slave_host = NULL;
+    char *slave_vol = NULL;
+    char *arg_buf = NULL;
+    char *volname = NULL;
+    char *slave = NULL;
+    int32_t ret = -1;
+    int32_t is_pem_push = -1;
+    int32_t ssh_port = 22;
+    gf_boolean_t is_force = -1;
+    glusterd_conf_t *conf = NULL;
+    glusterd_volinfo_t *volinfo = NULL;
+    xlator_t *this = NULL;
+    char old_working_dir[PATH_MAX] = {0};
+    char new_working_dir[PATH_MAX] = {0};
+    char *slave_voluuid = NULL;
+    char *old_slavehost = NULL;
+    gf_boolean_t is_existing_session = _gf_false;
+    int32_t len = 0;
+
+    this = THIS;
+    GF_ASSERT(this);
+    conf = this->private;
+    GF_ASSERT(conf);
+    GF_ASSERT(dict);
+    GF_ASSERT(op_errstr);
+
+    ret = glusterd_op_gsync_args_get(dict, op_errstr, &volname, &slave,
+                                     &host_uuid);
+    if (ret)
+        goto out;
+
+    len = snprintf(common_pem_file, sizeof(common_pem_file),
+                   "%s" GLUSTERD_COMMON_PEM_PUB_FILE, conf->workdir);
+    if ((len < 0) || (len >= sizeof(common_pem_file))) {
+        ret = -1;
+        goto out;
+    }
+
+    ret = glusterd_volinfo_find(volname, &volinfo);
+    if (ret) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_VOL_NOT_FOUND,
+               "Volinfo for %s (master) not found", volname);
+        goto out;
+    }
+
+    ret = dict_get_str(dict, "slave_vol", &slave_vol);
+    if (ret) {
+        snprintf(errmsg, sizeof(errmsg), "Unable to fetch slave volume name.");
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_DICT_GET_FAILED, "%s",
+               errmsg);
+        goto out;
+    }
+
+    ret = dict_get_str(dict, "slave_url", &slave_url);
+    if (ret) {
+        snprintf(errmsg, sizeof(errmsg), "Unable to fetch slave IP.");
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_DICT_GET_FAILED, "%s",
+               errmsg);
+        ret = -1;
+        goto out;
+    }
+
+    /* Fetch the slave_user and slave_ip from the slave_url.
+     * If the slave_user is not present. Use "root"
+     */
+    if (strstr(slave_url, "@")) {
+        slave_url_buf = gf_strdup(slave_url);
+        if (!slave_url_buf) {
+            ret = -1;
+            goto out;
+        }
+        /* strtok_r() writes NUL bytes into the string it splits, so
+         * tokenise the private copy; slave_url points at the value stored
+         * in the dict and must stay intact. slave_user and slave_ip point
+         * into slave_url_buf, which is only freed at `out`. */
+        slave_user = strtok_r(slave_url_buf, "@", &save_ptr);
+        slave_ip = strtok_r(NULL, "@", &save_ptr);
+    } else {
+        slave_user = "root";
+        slave_ip = slave_url;
+    }
+
+    if (!slave_user || !slave_ip) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_SLAVE_URL_INVALID,
+               "Invalid slave url.");
+        ret = -1;
+        goto out;
+    }
+
+    ret = dict_get_str(dict, "slave_host", &slave_host);
+    if (ret) {
+        snprintf(errmsg, sizeof(errmsg), "Unable to fetch slave host");
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_DICT_GET_FAILED, "%s",
+               errmsg);
+        ret = -1;
+        goto out;
+    }
+
+    /* A missing "ssh_port" key is fine: the default of 22 is kept. */
+    ret = dict_get_int32(dict, "ssh_port", &ssh_port);
+    if (ret < 0 && ret != -ENOENT) {
+        snprintf(errmsg, sizeof(errmsg), "Fetching ssh_port failed");
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_DICT_GET_FAILED, "%s",
+               errmsg);
+        ret = -1;
+        goto out;
+    }
+
+    is_force = dict_get_str_boolean(dict, "force", _gf_false);
+
+    /* Only the node named by host_uuid builds real hook arguments. */
+    uuid_utoa_r(MY_UUID, uuid_str);
+    if (!strcmp(uuid_str, host_uuid)) {
+        ret = dict_get_int32(dict, "push_pem", &is_pem_push);
+        if (!ret && is_pem_push) {
+            gf_msg_debug(this->name, 0,
+                         "Trying to setup"
+                         " pem files in slave");
+            is_pem_push = 1;
+        } else
+            is_pem_push = 0;
+
+        len = snprintf(hooks_args, sizeof(hooks_args),
+                       "is_push_pem=%d,pub_file=%s,slave_user=%s,"
+                       "slave_ip=%s,slave_vol=%s,ssh_port=%d",
+                       is_pem_push, common_pem_file, slave_user, slave_ip,
+                       slave_vol, ssh_port);
+        if ((len < 0) || (len >= sizeof(hooks_args))) {
+            ret = -1;
+            goto out;
+        }
+    } else
+        snprintf(hooks_args, sizeof(hooks_args),
+                 "This argument will stop the hooks script");
+
+    /* With "force", failures to publish hooks_args are tolerated. */
+    arg_buf = gf_strdup(hooks_args);
+    if (!arg_buf) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_STRDUP_FAILED,
+               "Failed to gf_strdup");
+        if (is_force) {
+            ret = 0;
+            goto create_essentials;
+        }
+        ret = -1;
+        goto out;
+    }
+
+    ret = dict_set_str(dict, "hooks_args", arg_buf);
+    if (ret) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_DICT_SET_FAILED,
+               "Failed to set hooks_args in dict.");
+        if (is_force) {
+            ret = 0;
+            goto create_essentials;
+        }
+        goto out;
+    }
+
+create_essentials:
+    /* Fetch slave volume uuid, to get stored in volume info. */
+    ret = dict_get_str(dict, "slave_voluuid", &slave_voluuid);
+    if (ret) {
+        snprintf(errmsg, sizeof(errmsg),
+                 "Unable to fetch slave volume uuid from dict");
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_DICT_GET_FAILED, "%s",
+               errmsg);
+        ret = -1;
+        goto out;
+    }
+
+    is_existing_session = dict_get_str_boolean(dict, "existing_session",
+                                               _gf_false);
+    if (is_existing_session) {
+        ret = dict_get_str(dict, "old_slavehost", &old_slavehost);
+        if (ret) {
+            snprintf(errmsg, sizeof(errmsg), "Unable to fetch old_slavehost");
+            gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_DICT_GET_FAILED, "%s",
+                   errmsg);
+            ret = -1;
+            goto out;
+        }
+
+        /* Rename existing geo-rep session with new Slave Host.
+         * Refuse to rename when either path was truncated, otherwise an
+         * unrelated directory could be moved. */
+        len = snprintf(old_working_dir, sizeof(old_working_dir),
+                       "%s/" GEOREP "/%s_%s_%s", conf->workdir,
+                       volinfo->volname, old_slavehost, slave_vol);
+        if ((len < 0) || (len >= sizeof(old_working_dir))) {
+            snprintf(errmsg, sizeof(errmsg),
+                     "old working dir path is too long");
+            ret = -1;
+            goto out;
+        }
+
+        len = snprintf(new_working_dir, sizeof(new_working_dir),
+                       "%s/" GEOREP "/%s_%s_%s", conf->workdir,
+                       volinfo->volname, slave_host, slave_vol);
+        if ((len < 0) || (len >= sizeof(new_working_dir))) {
+            snprintf(errmsg, sizeof(errmsg),
+                     "new working dir path is too long");
+            ret = -1;
+            goto out;
+        }
+
+        ret = sys_rename(old_working_dir, new_working_dir);
+        if (!ret) {
+            gf_msg(this->name, GF_LOG_INFO, 0, GD_MSG_FORCE_CREATE_SESSION,
+                   "rename of old working dir %s to "
+                   "new working dir %s is done! ",
+                   old_working_dir, new_working_dir);
+        } else {
+            if (errno == ENOENT) {
+                /* log error, but proceed with directory
+                 * creation below */
+                gf_msg_debug(this->name, 0,
+                             "old_working_dir(%s) "
+                             "not present.",
+                             old_working_dir);
+            } else {
+                len = snprintf(errmsg, sizeof(errmsg),
+                               "rename of old working dir %s "
+                               "to new working dir %s "
+                               "failed! Error: %s",
+                               old_working_dir, new_working_dir,
+                               strerror(errno));
+                if (len < 0) {
+                    strcpy(errmsg, "<error>");
+                }
+                gf_msg(this->name, GF_LOG_INFO, 0, GD_MSG_FORCE_CREATE_SESSION,
+                       "rename of old working dir %s to "
+                       "new working dir %s failed! Error: %s!",
+                       old_working_dir, new_working_dir, strerror(errno));
+
+                ret = -1;
+                goto out;
+            }
+        }
+    }
+
+    ret = glusterd_create_essential_dir_files(volinfo, dict, slave, slave_host,
+                                              slave_vol, op_errstr);
+    if (ret)
+        goto out;
+
+    ret = glusterd_store_slave_in_info(volinfo, slave, host_uuid, slave_voluuid,
+                                       op_errstr, is_force);
+    if (ret) {
+        snprintf(errmsg, sizeof(errmsg),
+                 "Unable to store"
+                 " slave info.");
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_SLAVEINFO_STORE_ERROR, "%s",
+               errmsg);
+        goto out;
+    }
+
+    /* Enable marker and changelog */
+    ret = glusterd_set_gsync_confs(volinfo);
+    if (ret != 0) {
+        gf_msg(this->name, GF_LOG_WARNING, 0, GD_MSG_MARKER_START_FAIL,
+               "marker/changelog"
+               " start failed");
+        snprintf(errmsg, sizeof(errmsg), "Index initialization failed");
+
+        ret = -1;
+        goto out;
+    }
+
+out:
+    /* Surface the locally recorded message, if any, to the caller. */
+    if (ret && errmsg[0] != '\0') {
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_GSYNCD_ERROR, "%s", errmsg);
+        *op_errstr = gf_strdup(errmsg);
+    }
+
+    GF_FREE(slave_url_buf);
+    gf_msg_debug(this->name, 0, "Returning %d", ret);
+    return ret;
+}
diff --git a/xlators/mgmt/glusterd/src/glusterd-geo-rep.h b/xlators/mgmt/glusterd/src/glusterd-geo-rep.h
new file mode 100644
index 00000000000..7d1318f522c
--- /dev/null
+++ b/xlators/mgmt/glusterd/src/glusterd-geo-rep.h
@@ -0,0 +1,52 @@
+/*
+   Copyright (c) 2006-2012 Red Hat, Inc. <http://www.redhat.com>
+   This file is part of GlusterFS.
+
+   This file is licensed to you under your choice of the GNU Lesser
+   General Public License, version 3 or any later version (LGPLv3 or
+   later), or the GNU General Public License, version 2 (GPLv2), in all
+   cases as published by the Free Software Foundation.
+*/
+#ifndef _GLUSTERD_GEO_REP_H_
+#define _GLUSTERD_GEO_REP_H_
+
+/* Location of the gsyncd configuration template, relative to the GEOREP
+ * directory name; may be overridden by defining it before this header. */
+#ifndef GSYNC_CONF_TEMPLATE
+#define GSYNC_CONF_TEMPLATE GEOREP "/gsyncd_template.conf"
+#endif
+
+/* <slave host>::<slave volume> */
+/* NOTE(review): the "+ 3" presumably covers the "::" separator plus the
+ * terminating NUL - confirm. */
+#define SLAVE_URL_INFO_MAX (_POSIX_HOST_NAME_MAX + GD_VOLUME_NAME_MAX + 3)
+
+/* slave info format:
+ * <master host uuid>:ssh://{<slave_user>@}<slave host>::<slave volume> \
+ * :<slave volume uuid> */
+#define VOLINFO_SLAVE_URL_MAX                                                  \
+    (LOGIN_NAME_MAX + (2 * GF_UUID_BUF_SIZE) + SLAVE_URL_INFO_MAX + 10)
+
+/* Bundle of a response dict, a volume and a node string.
+ * NOTE(review): presumably the context passed while collecting per-slave
+ * status - verify against the users in glusterd-geo-rep.c. */
+typedef struct glusterd_gsync_status_temp {
+    dict_t *rsp_dict;
+    glusterd_volinfo_t *volinfo;
+    char *node;
+} glusterd_gsync_status_temp_t;
+
+/* Volume plus an integer "is_active" flag; the parameter type of
+ * glusterd_check_geo_rep_running() declared below. */
+typedef struct gsync_status_param {
+    glusterd_volinfo_t *volinfo;
+    int is_active;
+} gsync_status_param_t;
+
+/* The functions below are defined outside this header; only their
+ * signatures are documented here. */
+
+/* Takes master, slave and config path; reports through the out-parameters
+ * @status and @is_template_in_use. */
+int
+gsync_status(char *master, char *slave, char *conf_path, int *status,
+             gf_boolean_t *is_template_in_use);
+
+/* Reports through the out-parameter @flag for @volinfo. */
+void
+glusterd_check_geo_rep_configured(glusterd_volinfo_t *volinfo,
+                                  gf_boolean_t *flag);
+/* NOTE(review): the (dict, key, value, data) signature looks like a dict
+ * iteration callback - confirm at the call sites. */
+int
+_get_slave_status(dict_t *dict, char *key, data_t *value, void *data);
+int
+glusterd_check_geo_rep_running(gsync_status_param_t *param, char **op_errstr);
+
+int
+glusterd_get_gsync_status_mst(glusterd_volinfo_t *volinfo, dict_t *rsp_dict,
+                              char *node);
+#endif
diff --git a/xlators/mgmt/glusterd/src/glusterd-gfproxyd-svc-helper.c b/xlators/mgmt/glusterd/src/glusterd-gfproxyd-svc-helper.c
new file mode 100644
index 00000000000..319bfa140f3
--- /dev/null
+++ b/xlators/mgmt/glusterd/src/glusterd-gfproxyd-svc-helper.c
@@ -0,0 +1,235 @@
+/*
+   Copyright (c) 2016 Red Hat, Inc. <http://www.redhat.com>
+   This file is part of GlusterFS.
+
+   This file is licensed to you under your choice of the GNU Lesser
+   General Public License, version 3 or any later version (LGPLv3 or
+   later), or the GNU General Public License, version 2 (GPLv2), in all
+   cases as published by the Free Software Foundation.
+*/
+
+#include "glusterd.h"
+#include "glusterd-utils.h"
+#include "glusterd-gfproxyd-svc-helper.h"
+#include "glusterd-messages.h"
+#include <glusterfs/syscall.h>
+#include "glusterd-volgen.h"
+
+/* Copy the directory computed by GLUSTERD_GET_VOLUME_PID_DIR for volinfo
+ * into path, truncating to path_len bytes. */
+void
+glusterd_svc_build_gfproxyd_rundir(glusterd_volinfo_t *volinfo, char *path,
+                                   int path_len)
+{
+    glusterd_conf_t *priv = THIS->private;
+    char workdir[PATH_MAX] = {0};
+
+    GLUSTERD_GET_VOLUME_PID_DIR(workdir, volinfo, priv);
+
+    snprintf(path, path_len, "%s", workdir);
+}
+
+/* Build the gfproxyd socket path for volinfo. "<rundir>/run-<MY_UUID>" is
+ * composed first and handed to glusterd_set_socket_filepath(), which fills
+ * path (path_len bytes). An intermediate name that does not fit in PATH_MAX
+ * is replaced by the empty string before the hand-over. */
+void
+glusterd_svc_build_gfproxyd_socket_filepath(glusterd_volinfo_t *volinfo,
+                                            char *path, int path_len)
+{
+    char rundir[PATH_MAX] = {0};
+    char sockfilepath[PATH_MAX] = {0};
+    int32_t written = 0;
+
+    glusterd_svc_build_gfproxyd_rundir(volinfo, rundir, sizeof(rundir));
+
+    written = snprintf(sockfilepath, sizeof(sockfilepath), "%s/run-%s", rundir,
+                       uuid_utoa(MY_UUID));
+    if ((written < 0) || ((size_t)written >= sizeof(sockfilepath)))
+        sockfilepath[0] = '\0';
+
+    glusterd_set_socket_filepath(sockfilepath, path, path_len);
+}
+
+/* Write "<rundir>/<volname>.gfproxyd.pid" into path, where rundir comes from
+ * glusterd_svc_build_gfproxyd_rundir(); output is capped at path_len bytes. */
+void
+glusterd_svc_build_gfproxyd_pidfile(glusterd_volinfo_t *volinfo, char *path,
+                                    int path_len)
+{
+    char rundir[PATH_MAX] = {0};
+
+    glusterd_svc_build_gfproxyd_rundir(volinfo, rundir, sizeof(rundir));
+
+    snprintf(path, path_len, "%s/%s.gfproxyd.pid", rundir, volinfo->volname);
+}
+
+/* Write "<volume dir>/<volname>.gfproxyd.vol" into path, where the volume
+ * dir comes from GLUSTERD_GET_VOLUME_DIR; output is capped at path_len. */
+void
+glusterd_svc_build_gfproxyd_volfile_path(glusterd_volinfo_t *volinfo,
+                                         char *path, int path_len)
+{
+    glusterd_conf_t *priv = THIS->private;
+    char workdir[PATH_MAX] = {0};
+
+    GLUSTERD_GET_VOLUME_DIR(workdir, volinfo, priv);
+
+    snprintf(path, path_len, "%s/%s.gfproxyd.vol", workdir, volinfo->volname);
+}
+
+void
+glusterd_svc_build_gfproxyd_logdir(char *logdir, char *volname, size_t len)
+{ /* logdir <- "<conf->logdir>/gfproxy/<volname>", capped at len bytes */
+    glusterd_conf_t *conf = THIS->private;
+    snprintf(logdir, len, "%s/gfproxy/%s", conf->logdir, volname);
+}
+
+void
+glusterd_svc_build_gfproxyd_logfile(char *logfile, char *logdir, size_t len)
+{ /* logfile <- "<logdir>/gfproxyd.log", capped at len bytes */
+    snprintf(logfile, len, "%s/gfproxyd.log", logdir);
+}
+
+int
+glusterd_is_gfproxyd_enabled(glusterd_volinfo_t *volinfo)
+{ /* Result of the VKEY_CONFIG_GFPROXY boolean lookup on volinfo. */
+    return glusterd_volinfo_get_boolean(volinfo, VKEY_CONFIG_GFPROXY);
+}
+
+/* Fill orgvol with the stored gfproxyd volfile path and build a fresh
+ * volfile in a mkstemp() file whose name is returned through *tmpvol. On
+ * failure (ret < 0) the temp file is unlinked and *tmpvol freed and NULLed. */
+static int
+glusterd_svc_get_gfproxyd_volfile(glusterd_volinfo_t *volinfo, char *svc_name,
+                                  char *orgvol, char **tmpvol, int path_len)
+{
+    int ret = -1;
+    int tmp_fd = -1;
+    int need_unlink = 0;
+
+    glusterd_svc_build_gfproxyd_volfile_path(volinfo, orgvol, path_len);
+
+    ret = gf_asprintf(tmpvol, "/tmp/g%s-XXXXXX", svc_name);
+    if (ret < 0)
+        goto out;
+
+    /* coverity[SECURE_TEMP] mkstemp uses 0600 as the mode and is safe */
+    tmp_fd = mkstemp(*tmpvol);
+    if (tmp_fd < 0) {
+        gf_msg("glusterd", GF_LOG_WARNING, errno, GD_MSG_FILE_OP_FAILED,
+               "Unable to create temp file %s:(%s)", *tmpvol, strerror(errno));
+        ret = -1;
+        goto out;
+    }
+
+    need_unlink = 1;
+    ret = glusterd_build_gfproxyd_volfile(volinfo, *tmpvol);
+out:
+    if (ret < 0) {
+        if (need_unlink)
+            sys_unlink(*tmpvol);
+        if (*tmpvol != NULL) {
+            GF_FREE(*tmpvol);
+            *tmpvol = NULL;
+        }
+    }
+    if (tmp_fd >= 0)
+        sys_close(tmp_fd);
+
+    return ret;
+}
+
+/* Regenerate the gfproxyd volfile into a temp file and compare it with the
+ * stored one via glusterd_check_files_identical(); the verdict is written to
+ * *identical. Returns the status of the step that ran last. */
+int
+glusterd_svc_check_gfproxyd_volfile_identical(char *svc_name,
+                                              glusterd_volinfo_t *volinfo,
+                                              gf_boolean_t *identical)
+{
+    char orgvol[PATH_MAX] = {0};
+    char *tmpvol = NULL;
+    int need_unlink = 0;
+    int ret = -1;
+
+    GF_VALIDATE_OR_GOTO("glusterd", identical, out);
+
+    ret = glusterd_svc_get_gfproxyd_volfile(volinfo, svc_name, orgvol, &tmpvol,
+                                            PATH_MAX);
+    if (ret)
+        goto out;
+
+    /* from here on the temp volfile exists and has to be removed again */
+    need_unlink = 1;
+    ret = glusterd_check_files_identical(orgvol, tmpvol, identical);
+
+out:
+    if (need_unlink)
+        sys_unlink(tmpvol);
+
+    if (tmpvol != NULL)
+        GF_FREE(tmpvol);
+
+    return ret;
+}
+
+/* Generate a fresh gfproxyd volfile in a temp file and compare its topology
+ * with the stored volfile through glusterd_check_topology_identical(); the
+ * verdict lands in *identical. Returns the status of the step that ran last. */
+int
+glusterd_svc_check_gfproxyd_topology_identical(char *svc_name,
+                                               glusterd_volinfo_t *volinfo,
+                                               gf_boolean_t *identical)
+{
+    char orgvol[PATH_MAX] = {0};
+    char *tmpvol = NULL;
+    int tmpclean = 0;
+    int ret = -1;
+
+    GF_VALIDATE_OR_GOTO("glusterd", identical, out);
+
+    ret = glusterd_svc_get_gfproxyd_volfile(volinfo, svc_name, orgvol, &tmpvol,
+                                            PATH_MAX);
+    if (ret)
+        goto out;
+
+    tmpclean = 1; /* temp volfile now exists: unlink() it on the way out */
+    ret = glusterd_check_topology_identical(orgvol, tmpvol, identical);
+
+out:
+    if (tmpclean)
+        sys_unlink(tmpvol);
+
+    if (tmpvol != NULL)
+        GF_FREE(tmpvol);
+
+    return ret;
+}
+
+/* Map a gfproxyd svc object back to the volinfo that embeds it, walking
+ * svc -> glusterd_gfproxydsvc_t -> glusterd_volinfo_t via cds_list_entry().
+ * Returns NULL (after logging) if either step yields a NULL pointer. */
+glusterd_volinfo_t *
+glusterd_gfproxyd_volinfo_from_svc(glusterd_svc_t *svc)
+{
+    glusterd_volinfo_t *volinfo = NULL;
+    glusterd_gfproxydsvc_t *gfproxyd = NULL;
+
+    /* svc is the "svc" member of glusterd_gfproxydsvc_t */
+    gfproxyd = cds_list_entry(svc, glusterd_gfproxydsvc_t, svc);
+    if (!gfproxyd) {
+        gf_msg("glusterd", GF_LOG_ERROR, 0, GD_MSG_SNAPD_OBJ_GET_FAIL,
+               "Failed to get gfproxyd object from gfproxyd service");
+        return NULL;
+    }
+
+    volinfo = cds_list_entry(gfproxyd, glusterd_volinfo_t, gfproxyd);
+    if (!volinfo) {
+        gf_msg("glusterd", GF_LOG_ERROR, 0, GD_MSG_VOLINFO_GET_FAIL,
+               "Failed to get volinfo from gfproxyd");
+        return NULL;
+    }
+
+    return volinfo;
+}
diff --git a/xlators/mgmt/glusterd/src/glusterd-gfproxyd-svc-helper.h b/xlators/mgmt/glusterd/src/glusterd-gfproxyd-svc-helper.h
new file mode 100644
index 00000000000..3aca218a65d
--- /dev/null
+++ b/xlators/mgmt/glusterd/src/glusterd-gfproxyd-svc-helper.h
@@ -0,0 +1,51 @@
+/*
+   Copyright (c) 2016 Red Hat, Inc. <http://www.redhat.com>
+   This file is part of GlusterFS.
+
+   This file is licensed to you under your choice of the GNU Lesser
+   General Public License, version 3 or any later version (LGPLv3 or
+   later), or the GNU General Public License, version 2 (GPLv2), in all
+   cases as published by the Free Software Foundation.
+*/
+
+#ifndef _GLUSTERD_GFPROXYD_SVC_HELPER_H_
+#define _GLUSTERD_GFPROXYD_SVC_HELPER_H_
+
+#include "glusterd.h"
+
+void
+glusterd_svc_build_gfproxyd_rundir(glusterd_volinfo_t *volinfo, char *path,
+                                   int path_len);
+
+void
+glusterd_svc_build_gfproxyd_socket_filepath(glusterd_volinfo_t *volinfo,
+                                            char *path, int path_len);
+
+void
+glusterd_svc_build_gfproxyd_pidfile(glusterd_volinfo_t *volinfo, char *path,
+                                    int path_len);
+
+void
+glusterd_svc_build_gfproxyd_volfile_path(glusterd_volinfo_t *volinfo,
+                                         char *path, int path_len);
+
+void
+glusterd_svc_build_gfproxyd_logdir(char *logdir, char *volname, size_t len);
+
+void
+glusterd_svc_build_gfproxyd_logfile(char *logfile, char *logdir, size_t len);
+
+int
+glusterd_svc_check_gfproxyd_volfile_identical(char *svc_name,
+                                              glusterd_volinfo_t *volinfo,
+                                              gf_boolean_t *identical);
+int
+glusterd_svc_check_gfproxyd_topology_identical(char *svc_name,
+                                               glusterd_volinfo_t *volinfo,
+                                               gf_boolean_t *identical);
+int
+glusterd_is_gfproxyd_enabled(glusterd_volinfo_t *volinfo);
+
+glusterd_volinfo_t *
+glusterd_gfproxyd_volinfo_from_svc(glusterd_svc_t *svc);
+#endif
diff --git a/xlators/mgmt/glusterd/src/glusterd-gfproxyd-svc.c b/xlators/mgmt/glusterd/src/glusterd-gfproxyd-svc.c
new file mode 100644
index 00000000000..a0bfea41f0f
--- /dev/null
+++ b/xlators/mgmt/glusterd/src/glusterd-gfproxyd-svc.c
@@ -0,0 +1,478 @@
+/*
+   Copyright (c) 2014 Red Hat, Inc. <http://www.redhat.com>
+   This file is part of GlusterFS.
+
+   This file is licensed to you under your choice of the GNU Lesser
+   General Public License, version 3 or any later version (LGPLv3 or
+   later), or the GNU General Public License, version 2 (GPLv2), in all
+   cases as published by the Free Software Foundation.
+*/
+
+#include <glusterfs/globals.h>
+#include <glusterfs/run.h>
+#include "glusterd.h"
+#include "glusterd-utils.h"
+#include "glusterd-volgen.h"
+#include "glusterd-gfproxyd-svc.h"
+#include "glusterd-messages.h"
+#include "glusterd-svc-helper.h"
+#include "glusterd-svc-mgmt.h"
+#include "glusterd-gfproxyd-svc-helper.h"
+#include <glusterfs/syscall.h>
+
+void
+glusterd_gfproxydsvc_build(glusterd_svc_t *svc)
+{ /* Install the gfproxyd manager/start/stop/reconfigure handlers on svc. */
+    svc->manager = glusterd_gfproxydsvc_manager;
+    svc->start = glusterd_gfproxydsvc_start;
+    svc->stop = glusterd_gfproxydsvc_stop;
+    svc->reconfigure = glusterd_gfproxydsvc_reconfigure;
+}
+
+/* Stop gfproxyd through glusterd_svc_stop() and, on success, reset the port
+ * recorded on the owning volume. A failed volinfo lookup is tolerated. */
+int
+glusterd_gfproxydsvc_stop(glusterd_svc_t *svc, int sig)
+{
+    glusterd_volinfo_t *volinfo = NULL;
+    int ret = glusterd_svc_stop(svc, sig);
+
+    if (ret)
+        return ret;
+
+    volinfo = glusterd_gfproxyd_volinfo_from_svc(svc);
+    if (volinfo)
+        volinfo->gfproxyd.port = 0;
+    return 0;
+}
+
+/*
+ * One-time setup of the gfproxyd service object embedded in volinfo.
+ *
+ * Names the service, hands its run directory to glusterd_svc_create_rundir(),
+ * initialises the connection object on the service socket path and fills the
+ * process description (pidfile, logdir, logfile, volfile, volfile-id and
+ * volfile server).
+ *
+ * @volinfo: volume whose gfproxyd.svc member is initialised.
+ *
+ * Returns 0 on success; -1 for failed validation or an oversized volfile-id,
+ * otherwise the non-zero status of the helper that failed.
+ */
+int
+glusterd_gfproxydsvc_init(glusterd_volinfo_t *volinfo)
+{
+    int ret = -1;
+    int32_t len = 0;
+    char rundir[PATH_MAX] = {0};
+    char sockpath[PATH_MAX] = {0};
+    char pidfile[PATH_MAX] = {0};
+    char volfile[PATH_MAX] = {0};
+    char logdir[PATH_MAX] = {0};
+    char logfile[PATH_MAX] = {0};
+    char volfileid[256] = {0};
+    char *volfileserver = NULL;
+    glusterd_svc_t *svc = NULL;
+    glusterd_conf_t *priv = NULL;
+    glusterd_conn_notify_t notify = glusterd_svc_common_rpc_notify;
+    xlator_t *this = THIS;
+
+    GF_VALIDATE_OR_GOTO("glusterd", this, out);
+
+    priv = this->private;
+    GF_VALIDATE_OR_GOTO(this->name, priv, out);
+
+    svc = &(volinfo->gfproxyd.svc);
+
+    ret = snprintf(svc->name, sizeof(svc->name), "%s", gfproxyd_svc_name);
+    if (ret < 0)
+        goto out;
+
+    glusterd_svc_build_gfproxyd_rundir(volinfo, rundir, sizeof(rundir));
+    glusterd_svc_create_rundir(rundir);
+
+    /* Connection management: socket used to talk to the gfproxyd process */
+    glusterd_svc_build_gfproxyd_socket_filepath(volinfo, sockpath,
+                                                sizeof(sockpath));
+    ret = glusterd_conn_init(&(svc->conn), sockpath, 600, notify);
+    if (ret)
+        goto out;
+
+    /* Process management: every path the daemon is later started with */
+    glusterd_svc_build_gfproxyd_pidfile(volinfo, pidfile, sizeof(pidfile));
+    glusterd_svc_build_gfproxyd_volfile_path(volinfo, volfile, sizeof(volfile));
+    glusterd_svc_build_gfproxyd_logdir(logdir, volinfo->volname,
+                                       sizeof(logdir));
+
+    ret = mkdir_p(logdir, 0755, _gf_true);
+    if ((ret == -1) && (EEXIST != errno)) {
+        gf_msg(this->name, GF_LOG_ERROR, errno, GD_MSG_CREATE_DIR_FAILED,
+               "Unable to create logdir %s", logdir);
+        goto out;
+    }
+    glusterd_svc_build_gfproxyd_logfile(logfile, logdir, sizeof(logfile));
+
+    len = snprintf(volfileid, sizeof(volfileid), "gfproxyd/%s",
+                   volinfo->volname);
+    if ((len < 0) || (len >= sizeof(volfileid))) {
+        ret = -1;
+        goto out;
+    }
+
+    /* Without an explicit bind address the volfile server is localhost */
+    if (dict_get_strn(this->options, "transport.socket.bind-address",
+                      SLEN("transport.socket.bind-address"),
+                      &volfileserver) != 0) {
+        volfileserver = "localhost";
+    }
+
+    ret = glusterd_proc_init(&(svc->proc), gfproxyd_svc_name, pidfile, logdir,
+                             logfile, volfile, volfileid, volfileserver);
+
+out:
+    gf_msg_debug(this ? this->name : "glusterd", 0, "Returning %d", ret);
+    return ret;
+}
+
+/* Generate the gfproxyd volfile for volinfo through
+ * glusterd_generate_gfproxyd_volfile(), logging GD_MSG_VOLFILE_CREATE_FAIL
+ * when that fails. Returns the generator's status (-1 if THIS is unset). */
+static int
+glusterd_gfproxydsvc_create_volfile(glusterd_volinfo_t *volinfo)
+{
+    xlator_t *this = THIS;
+    int ret = -1;
+
+    GF_VALIDATE_OR_GOTO("glusterd", this, out);
+
+    ret = glusterd_generate_gfproxyd_volfile(volinfo);
+    if (ret)
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_VOLFILE_CREATE_FAIL,
+               "Failed to create volfile");
+
+out:
+    gf_msg_debug(this ? this->name : "glusterd", 0, "Returning %d", ret);
+
+    return ret;
+}
+
+/*
+ * Bring the gfproxyd service of a volume in line with the volume's state.
+ *
+ * @svc:   gfproxyd service object of the volume.
+ * @data:  the owning glusterd_volinfo_t; must not be NULL.
+ * @flags: forwarded to svc->start().
+ *
+ * The service is initialised on first use. With gfproxy enabled and the
+ * volume started, the volfile is regenerated and the daemon is stopped,
+ * started again and connected to. With gfproxy enabled on a stopped volume,
+ * or with gfproxy disabled, a running daemon is stopped.
+ *
+ * Returns 0 on success and non-zero on failure.
+ */
+int
+glusterd_gfproxydsvc_manager(glusterd_svc_t *svc, void *data, int flags)
+{
+    int ret = -1;
+    glusterd_volinfo_t *volinfo = NULL;
+    xlator_t *this = NULL;
+
+    this = THIS;
+    GF_VALIDATE_OR_GOTO("glusterd", this, out);
+
+    volinfo = data;
+    GF_VALIDATE_OR_GOTO(this->name, data, out);
+
+    if (!svc->inited) {
+        ret = glusterd_gfproxydsvc_init(volinfo);
+        if (ret) {
+            gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_FAILED_INIT_QUOTASVC,
+                   "Failed to init gfproxyd service");
+            goto out;
+        }
+        svc->inited = _gf_true;
+        gf_msg_debug(this->name, 0, "gfproxyd service initialized");
+    }
+
+    ret = glusterd_is_gfproxyd_enabled(volinfo);
+    if (ret == -1) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_VOLINFO_GET_FAIL,
+               "Failed to read volume options");
+        goto out;
+    }
+
+    if (ret) {
+        if (!glusterd_is_volume_started(volinfo)) {
+            if (glusterd_proc_is_running(&svc->proc)) {
+                ret = svc->stop(svc, SIGTERM);
+                if (ret)
+                    gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_SNAPD_STOP_FAIL,
+                           "Couldn't stop gfproxyd for volume: %s",
+                           volinfo->volname);
+            } else {
+                /* Since gfproxyd is not running set ret to 0 */
+                ret = 0;
+            }
+            goto out;
+        }
+
+        ret = glusterd_gfproxydsvc_create_volfile(volinfo);
+        if (ret) {
+            gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_SNAPD_CREATE_FAIL,
+                   "Couldn't create gfproxyd volfile for volume: %s",
+                   volinfo->volname);
+            goto out;
+        }
+        ret = svc->stop(svc, SIGTERM);
+        if (ret) {
+            gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_SNAPD_STOP_FAIL,
+                   "Couldn't stop gfproxyd for volume: %s", volinfo->volname);
+            goto out;
+        }
+
+        ret = svc->start(svc, flags);
+        if (ret) {
+            gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_SNAPD_START_FAIL,
+                   "Couldn't start gfproxyd for volume: %s", volinfo->volname);
+            goto out;
+        }
+
+        glusterd_volinfo_ref(volinfo);
+        ret = glusterd_conn_connect(&(svc->conn));
+        if (ret) {
+            /* give the reference back; volinfo is not used past this point */
+            glusterd_volinfo_unref(volinfo);
+            volinfo = NULL;
+            goto out;
+        }
+    } else if (glusterd_proc_is_running(&svc->proc)) {
+        ret = svc->stop(svc, SIGTERM);
+        if (ret) {
+            gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_SNAPD_STOP_FAIL,
+                   "Couldn't stop gfproxyd for volume: %s", volinfo->volname);
+            goto out;
+        }
+    }
+
+out:
+    if (ret && volinfo)
+        gf_event(EVENT_SVC_MANAGER_FAILED, "volume=%s;svc_name=%s",
+                 volinfo->volname, svc->name);
+
+    gf_msg_debug("glusterd", 0, "Returning %d", ret);
+
+    return ret;
+}
+
+/* Start glusterfsd as the gfproxyd daemon of the volume that owns svc. A
+ * missing volfile is generated first, the port comes from pmap_assign_port()
+ * and, unless flags is PROC_START_NO_WAIT, big_lock is released while
+ * runner_run() executes. Returns the runner's status, or non-zero if the
+ * setup fails before the process is spawned. */
+int
+glusterd_gfproxydsvc_start(glusterd_svc_t *svc, int flags)
+{
+    int ret = -1;
+    runner_t runner = {0};
+    glusterd_conf_t *priv = NULL;
+    xlator_t *this = NULL;
+    char valgrind_logfile[PATH_MAX] = {0};
+    int gfproxyd_port = 0;
+    char msg[1024] = {0};
+    char gfproxyd_id[PATH_MAX] = {0};
+    glusterd_volinfo_t *volinfo = NULL;
+    char *localtime_logging = NULL;
+    int32_t len = 0;
+
+    this = THIS;
+    GF_VALIDATE_OR_GOTO("glusterd", this, out);
+
+    priv = this->private;
+    GF_VALIDATE_OR_GOTO(this->name, priv, out);
+
+    volinfo = glusterd_gfproxyd_volinfo_from_svc(svc);
+    if (!volinfo)
+        goto out;
+
+    ret = sys_access(svc->proc.volfile, F_OK);
+    if (ret) {
+        gf_msg(this->name, GF_LOG_DEBUG, 0, GD_MSG_VOLINFO_GET_FAIL,
+               "gfproxyd Volfile %s is not present", svc->proc.volfile);
+        ret = glusterd_gfproxydsvc_create_volfile(volinfo);
+        if (ret) {
+            gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_VOLFILE_CREATE_FAIL,
+                   "Couldn't create gfproxyd volfile for volume: %s",
+                   volinfo->volname);
+            goto out;
+        }
+    }
+    runinit(&runner);
+
+    if (this->ctx->cmd_args.vgtool != _gf_none) {
+        len = snprintf(valgrind_logfile, PATH_MAX, "%s/valgrind-%s",
+                       svc->proc.logdir, svc->proc.logfile);
+        if ((len < 0) || (len >= PATH_MAX)) {
+            /* the runner was initialised but will never run: release it */
+            runner_end(&runner);
+            ret = -1;
+            goto out;
+        }
+
+        if (this->ctx->cmd_args.vgtool == _gf_memcheck)
+            runner_add_args(&runner, "valgrind", "--leak-check=full",
+                            "--trace-children=yes", "--track-origins=yes",
+                            NULL);
+        else
+            runner_add_args(&runner, "valgrind", "--tool=drd", NULL);
+
+        runner_argprintf(&runner, "--log-file=%s", valgrind_logfile);
+    }
+
+    snprintf(gfproxyd_id, sizeof(gfproxyd_id), "gfproxyd-%s", volinfo->volname);
+    runner_add_args(&runner, SBIN_DIR "/glusterfsd", "-s",
+                    svc->proc.volfileserver, "--volfile-id",
+                    svc->proc.volfileid, "-p", svc->proc.pidfile, "-l",
+                    svc->proc.logfile, "--brick-name", gfproxyd_id, "-S",
+                    svc->conn.sockpath, NULL);
+
+    if (volinfo->memory_accounting)
+        runner_add_arg(&runner, "--mem-accounting");
+    if (dict_get_strn(priv->opts, GLUSTERD_LOCALTIME_LOGGING_KEY,
+                      SLEN(GLUSTERD_LOCALTIME_LOGGING_KEY),
+                      &localtime_logging) == 0) {
+        if (strcmp(localtime_logging, "enable") == 0)
+            runner_add_arg(&runner, "--localtime-logging");
+    }
+
+    gfproxyd_port = pmap_assign_port(this, volinfo->gfproxyd.port, gfproxyd_id);
+    volinfo->gfproxyd.port = gfproxyd_port;
+
+    runner_add_arg(&runner, "--brick-port");
+    runner_argprintf(&runner, "%d", gfproxyd_port);
+    runner_add_arg(&runner, "--xlator-option");
+    runner_argprintf(&runner, "%s-server.listen-port=%d", volinfo->volname,
+                     gfproxyd_port);
+
+    snprintf(msg, sizeof(msg), "Starting the gfproxyd service for volume %s",
+             volinfo->volname);
+    runner_log(&runner, this->name, GF_LOG_DEBUG, msg);
+
+    if (flags == PROC_START_NO_WAIT) {
+        ret = runner_run_nowait(&runner);
+    } else {
+        synclock_unlock(&priv->big_lock);
+        {
+            ret = runner_run(&runner);
+        }
+        synclock_lock(&priv->big_lock);
+    }
+
+out:
+    return ret;
+}
+
+/* Run the gfproxyd service manager for every started volume. Returns 0 when
+ * all of them succeed (or no volume is started), else the first failure. */
+int
+glusterd_gfproxydsvc_restart()
+{
+    glusterd_volinfo_t *volinfo = NULL;
+    glusterd_volinfo_t *tmp = NULL;
+    int ret = -1;
+    xlator_t *this = THIS;
+    glusterd_conf_t *conf = NULL;
+    glusterd_svc_t *svc = NULL;
+
+    GF_VALIDATE_OR_GOTO("glusterd", this, out);
+    conf = this->private;
+    GF_VALIDATE_OR_GOTO(this->name, conf, out);
+
+    ret = 0; /* having no started volume is not a failure */
+    cds_list_for_each_entry_safe(volinfo, tmp, &conf->volumes, vol_list)
+    {
+        if (volinfo->status != GLUSTERD_STATUS_STARTED)
+            continue;
+        svc = &(volinfo->gfproxyd.svc);
+        ret = svc->manager(svc, volinfo, PROC_START_NO_WAIT);
+        if (ret) {
+            gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_SNAPD_START_FAIL,
+                   "Couldn't resolve gfproxyd for vol: %s on restart",
+                   volinfo->volname);
+            gf_event(EVENT_SVC_MANAGER_FAILED, "volume=%s;svc_name=%s",
+                     volinfo->volname, svc->name);
+            goto out;
+        }
+    }
+out:
+    return ret;
+}
+
+/*
+ * Re-evaluate gfproxyd for the volume passed in data after a config change.
+ *
+ * An uninitialised, disabled or not running service is left to the service
+ * manager. Otherwise the stored volfile is compared with a freshly generated
+ * one:
+ *  - identical files: nothing to do;
+ *  - same topology, different options: rewrite the volfile and call
+ *    glusterd_fetchspec_notify() so the options get reconfigured;
+ *  - different topology: go through the manager, which restarts gfproxyd.
+ *
+ * @data: glusterd_volinfo_t of the volume; dereferenced without a NULL check.
+ * Returns 0 on success, non-zero on failure.
+ */
+int
+glusterd_gfproxydsvc_reconfigure(void *data)
+{
+    glusterd_volinfo_t *volinfo = data;
+    gf_boolean_t identical = _gf_false;
+    xlator_t *this = THIS;
+    int ret = -1;
+
+    GF_VALIDATE_OR_GOTO("glusterd", this, out);
+
+    if (!volinfo->gfproxyd.svc.inited ||
+        !glusterd_is_gfproxyd_enabled(volinfo) ||
+        !glusterd_proc_is_running(&volinfo->gfproxyd.svc.proc))
+        goto manager;
+
+    /*
+     * Check both OLD and NEW volfiles, if they are SAME by size
+     * and cksum i.e. "character-by-character". If YES, then
+     * NOTHING has been changed, just return.
+     */
+    ret = glusterd_svc_check_gfproxyd_volfile_identical(
+        volinfo->gfproxyd.svc.name, volinfo, &identical);
+    if (ret || identical)
+        goto out;
+
+    /*
+     * They are not identical. Find out if the topology is changed
+     * OR just the volume options. If just the options which got
+     * changed, then inform the xlator to reconfigure the options.
+     */
+    identical = _gf_false; /* RESET the FLAG */
+    ret = glusterd_svc_check_gfproxyd_topology_identical(
+        volinfo->gfproxyd.svc.name, volinfo, &identical);
+    if (ret)
+        goto out;
+
+    if (identical) {
+        /* Only the options changed: refresh the volfile on disk and, if
+         * that works, notify so that gfproxyd picks up the new options. */
+        ret = glusterd_gfproxydsvc_create_volfile(volinfo);
+        if (ret == 0)
+            ret = glusterd_fetchspec_notify(this);
+        goto out;
+    }
+
+manager:
+    /* Either the topology changed or the service is not in a state that can
+     * be reconfigured in place: let the manager start/stop/restart it. */
+    ret = volinfo->gfproxyd.svc.manager(&(volinfo->gfproxyd.svc), volinfo,
+                                        PROC_START_NO_WAIT);
+
+out:
+    gf_msg_debug("glusterd", 0, "Returning %d", ret);
+    return ret;
+}
diff --git a/xlators/mgmt/glusterd/src/glusterd-gfproxyd-svc.h b/xlators/mgmt/glusterd/src/glusterd-gfproxyd-svc.h
new file mode 100644
index 00000000000..d396b4015f3
--- /dev/null
+++ b/xlators/mgmt/glusterd/src/glusterd-gfproxyd-svc.h
@@ -0,0 +1,47 @@
+/*
+  Copyright (c) 2006-2012 Red Hat, Inc. <http://www.redhat.com>
+  This file is part of GlusterFS.
+
+  This file is licensed to you under your choice of the GNU Lesser
+  General Public License, version 3 or any later version (LGPLv3 or
+  later), or the GNU General Public License, version 2 (GPLv2), in all
+  cases as published by the Free Software Foundation.
+*/
+
+#ifndef _GLUSTERD_GFPROXYD_SVC_H_
+#define _GLUSTERD_GFPROXYD_SVC_H_
+
+#include "glusterd-svc-mgmt.h"
+
+#define gfproxyd_svc_name "gfproxyd"
+
+struct glusterd_gfproxydsvc_ {
+    glusterd_svc_t svc;
+    gf_store_handle_t *handle;
+    int port;
+};
+
+typedef struct glusterd_gfproxydsvc_ glusterd_gfproxydsvc_t;
+
+void
+glusterd_gfproxydsvc_build(glusterd_svc_t *svc);
+
+int
+glusterd_gfproxydsvc_manager(glusterd_svc_t *svc, void *data, int flags);
+
+int
+glusterd_gfproxydsvc_start(glusterd_svc_t *svc, int flags);
+
+int
+glusterd_gfproxydsvc_stop(glusterd_svc_t *svc, int sig);
+
+int
+glusterd_gfproxydsvc_reconfigure(void *data); /* data: glusterd_volinfo_t * */
+
+void
+glusterd_gfproxydsvc_build_volfile_path(char *server, char *workdir,
+                                        char *volfile, size_t len);
+
+int
+glusterd_gfproxydsvc_restart();
+#endif
diff --git a/xlators/mgmt/glusterd/src/glusterd-handler.c b/xlators/mgmt/glusterd/src/glusterd-handler.c
new file mode 100644
index 00000000000..1b21c40596d
--- /dev/null
+++ b/xlators/mgmt/glusterd/src/glusterd-handler.c
@@ -0,0 +1,6713 @@
+/*
+   Copyright (c) 2006-2012 Red Hat, Inc. <http://www.redhat.com>
+   This file is part of GlusterFS.
+
+   This file is licensed to you under your choice of the GNU Lesser
+   General Public License, version 3 or any later version (LGPLv3 or
+   later), or the GNU General Public License, version 2 (GPLv2), in all
+   cases as published by the Free Software Foundation.
+*/
+#include <inttypes.h>
+
+#include <glusterfs/glusterfs.h>
+#include <glusterfs/compat.h>
+#include <glusterfs/dict.h>
+#include "protocol-common.h"
+#include <glusterfs/xlator.h>
+#include <glusterfs/logging.h>
+#include <glusterfs/syscall.h>
+#include <glusterfs/timer.h>
+#include <glusterfs/defaults.h>
+#include <glusterfs/compat.h>
+#include <glusterfs/compat-errno.h>
+#include <glusterfs/statedump.h>
+#include <glusterfs/run.h>
+#include "glusterd-mem-types.h"
+#include "glusterd.h"
+#include "glusterd-sm.h"
+#include "glusterd-op-sm.h"
+#include "glusterd-utils.h"
+#include "glusterd-mgmt.h"
+#include "glusterd-server-quorum.h"
+#include "glusterd-store.h"
+#include "glusterd-locks.h"
+#include "glusterd-snapshot-utils.h"
+#include "glusterd-geo-rep.h"
+
+#include "glusterd1-xdr.h"
+#include "cli1-xdr.h"
+#include "xdr-generic.h"
+#include "rpc-clnt.h"
+#include "glusterd-volgen.h"
+#include "glusterd-mountbroker.h"
+#include "glusterd-messages.h"
+#include "glusterd-errno.h"
+
+#include <sys/resource.h>
+#include <inttypes.h>
+
+#include <glusterfs/common-utils.h>
+
+#include "glusterd-syncop.h"
+#include "glusterd-messages.h"
+
+extern glusterd_op_info_t opinfo;
+static int volcount;
+
+/* Call notify_fn with priv->big_lock held; returns notify_fn's result. */
+int
+glusterd_big_locked_notify(struct rpc_clnt *rpc, void *mydata,
+                           rpc_clnt_event_t event, void *data,
+                           rpc_clnt_notify_t notify_fn)
+{
+    int ret = -1;
+    glusterd_conf_t *priv = THIS->private;
+
+    synclock_lock(&priv->big_lock);
+    ret = notify_fn(rpc, mydata, event, data);
+    synclock_unlock(&priv->big_lock);
+    return ret;
+}
+
+/* Call actor_fn(req) with priv->big_lock held; returns actor_fn's result. */
+int
+glusterd_big_locked_handler(rpcsvc_request_t *req, rpcsvc_actor actor_fn)
+{
+    int ret = -1;
+    glusterd_conf_t *priv = THIS->private;
+
+    synclock_lock(&priv->big_lock);
+    ret = actor_fn(req);
+    synclock_unlock(&priv->big_lock);
+    return ret;
+}
+
+/* Handle a friend request: unknown senders get a GF_PROBE_UNKNOWN_PEER
+ * reply, known ones get a GD_FRIEND_EVENT_RCVD_FRIEND_REQ event (with ctx and
+ * the unserialized vols dict) injected. Returns 0, GLUSTERD_CONNECTION_AWAITED
+ * (event queued, peer not connected yet) or a non-zero error. */
+static int
+glusterd_handle_friend_req(rpcsvc_request_t *req, uuid_t uuid, char *hostname,
+                           int port, gd1_mgmt_friend_req *friend_req)
+{
+    int ret = -1;
+    glusterd_peerinfo_t *peerinfo = NULL;
+    glusterd_friend_sm_event_t *event = NULL;
+    glusterd_friend_req_ctx_t *ctx = NULL;
+    char rhost[UNIX_PATH_MAX + 1] = {0};
+    dict_t *dict = NULL;
+    gf_boolean_t discard = _gf_false; /* nothing queued: always clean up */
+
+    if (!port)
+        port = GF_DEFAULT_BASE_PORT;
+
+    ret = glusterd_remote_hostname_get(req, rhost, sizeof(rhost));
+
+    ctx = GF_CALLOC(1, sizeof(*ctx), gf_gld_mt_friend_req_ctx_t);
+    dict = dict_new();
+
+    RCU_READ_LOCK;
+
+    peerinfo = glusterd_peerinfo_find(uuid, rhost);
+    if (peerinfo == NULL) {
+        gf_event(EVENT_PEER_REJECT, "peer=%s", hostname);
+        ret = glusterd_xfer_friend_add_resp(req, hostname, rhost, port, -1,
+                                            GF_PROBE_UNKNOWN_PEER);
+        if (friend_req->vols.vols_val) {
+            free(friend_req->vols.vols_val);
+            friend_req->vols.vols_val = NULL;
+        }
+        /* ctx/dict are owned by nobody here: drop them even if ret == 0 */
+        discard = _gf_true;
+        goto out;
+    }
+
+    ret = glusterd_friend_sm_new_event(GD_FRIEND_EVENT_RCVD_FRIEND_REQ, &event);
+    if (ret) {
+        gf_msg("glusterd", GF_LOG_ERROR, 0, GD_MSG_EVENT_NEW_GET_FAIL,
+               "event generation failed: %d", ret);
+        goto out;
+    }
+
+    event->peername = gf_strdup(peerinfo->hostname);
+    gf_uuid_copy(event->peerid, peerinfo->uuid);
+
+    if (!ctx) {
+        gf_msg("glusterd", GF_LOG_ERROR, ENOMEM, GD_MSG_NO_MEMORY,
+               "Unable to allocate memory");
+        ret = -1;
+        goto out;
+    }
+
+    gf_uuid_copy(ctx->uuid, uuid);
+    if (hostname)
+        ctx->hostname = gf_strdup(hostname);
+    ctx->req = req;
+
+    if (!dict) {
+        gf_smsg("glusterd", GF_LOG_ERROR, errno, GD_MSG_DICT_CREATE_FAIL, NULL);
+        ret = -1;
+        goto out;
+    }
+
+    ret = dict_unserialize(friend_req->vols.vols_val, friend_req->vols.vols_len,
+                           &dict);
+    if (ret) {
+        gf_smsg("glusterd", GF_LOG_ERROR, 0, GD_MSG_DICT_UNSERIALIZE_FAIL,
+                NULL);
+        goto out;
+    }
+    dict->extra_stdfree = friend_req->vols.vols_val;
+
+    ctx->vols = dict;
+    event->ctx = ctx;
+
+    ret = glusterd_friend_sm_inject_event(event);
+    if (ret) {
+        gf_msg("glusterd", GF_LOG_ERROR, 0, GD_MSG_EVENT_INJECT_FAIL,
+               "Unable to inject event %d, ret = %d", event->event, ret);
+        goto out;
+    }
+
+    ret = 0;
+    if (peerinfo && (0 == peerinfo->connected))
+        ret = GLUSTERD_CONNECTION_AWAITED;
+
+out:
+    RCU_READ_UNLOCK;
+
+    if (discard || (ret && (ret != GLUSTERD_CONNECTION_AWAITED))) {
+        if (ctx && ctx->hostname)
+            GF_FREE(ctx->hostname);
+        GF_FREE(ctx);
+        /* vols_val belongs to dict once extra_stdfree points at it */
+        if (!dict || !dict->extra_stdfree)
+            free(friend_req->vols.vols_val);
+        if (dict)
+            dict_unref(dict);
+        if (event)
+            GF_FREE(event->peername);
+        GF_FREE(event);
+    }
+
+    return ret;
+}
+
+/*
+ * Handle a remove-friend request from the peer identified by uuid/hostname.
+ *
+ * Unknown peers are answered directly through
+ * glusterd_xfer_friend_remove_resp(); for known peers a
+ * GD_FRIEND_EVENT_RCVD_REMOVE_FRIEND event carrying the request context is
+ * injected. Returns 0 once the event is queued, otherwise the status of the
+ * step that ended the request (which may be 0 for an answered unknown peer).
+ */
+static int
+glusterd_handle_unfriend_req(rpcsvc_request_t *req, uuid_t uuid, char *hostname,
+                             int port)
+{
+    int ret = -1;
+    glusterd_peerinfo_t *peerinfo = NULL;
+    glusterd_friend_sm_event_t *event = NULL;
+    glusterd_friend_req_ctx_t *ctx = NULL;
+
+    if (!port)
+        port = GF_DEFAULT_BASE_PORT;
+
+    ctx = GF_CALLOC(1, sizeof(*ctx), gf_gld_mt_friend_req_ctx_t);
+
+    RCU_READ_LOCK;
+
+    peerinfo = glusterd_peerinfo_find(uuid, hostname);
+    if (peerinfo == NULL) {
+        RCU_READ_UNLOCK;
+        gf_msg("glusterd", GF_LOG_CRITICAL, 0, GD_MSG_REQ_FROM_UNKNOWN_PEER,
+               "Received remove-friend from unknown peer %s", hostname);
+        ret = glusterd_xfer_friend_remove_resp(req, hostname, port);
+        goto out;
+    }
+
+    ret = glusterd_friend_sm_new_event(GD_FRIEND_EVENT_RCVD_REMOVE_FRIEND,
+                                       &event);
+    if (ret) {
+        RCU_READ_UNLOCK;
+        gf_msg("glusterd", GF_LOG_ERROR, 0, GD_MSG_EVENT_NEW_GET_FAIL,
+               "event generation failed: %d", ret);
+        goto out;
+    }
+
+    if (hostname)
+        event->peername = gf_strdup(hostname);
+    gf_uuid_copy(event->peerid, uuid);
+
+    if (!ctx) {
+        RCU_READ_UNLOCK;
+        ret = -1;
+        gf_msg("glusterd", GF_LOG_ERROR, ENOMEM, GD_MSG_NO_MEMORY,
+               "Unable to allocate memory");
+        goto out;
+    }
+
+    gf_uuid_copy(ctx->uuid, uuid);
+    if (hostname)
+        ctx->hostname = gf_strdup(hostname);
+    ctx->req = req;
+    event->ctx = ctx;
+
+    ret = glusterd_friend_sm_inject_event(event);
+    if (ret) {
+        RCU_READ_UNLOCK;
+        gf_msg("glusterd", GF_LOG_ERROR, 0, GD_MSG_EVENT_INJECT_FAIL,
+               "Unable to inject event %d, ret = %d", event->event, ret);
+        goto out;
+    }
+
+    RCU_READ_UNLOCK;
+
+    return 0;
+
+out:
+    /* reached only when no event was queued, so nothing owns ctx/event */
+    if (ctx && ctx->hostname)
+        GF_FREE(ctx->hostname);
+    GF_FREE(ctx);
+    if (event)
+        GF_FREE(event->peername);
+    GF_FREE(event);
+
+    return ret;
+}
+
+/* State shared with _build_option_key() while it walks an option dict. */
+struct args_pack {
+    /* dictionary that receives the generated "volume%d.option.%s" keys */
+    dict_t *dict;
+    /* volume index substituted into each generated key */
+    int vol_count;
+    /* number of options stored successfully so far */
+    int opt_count;
+};
+
+/*
+ * dict_foreach() callback: copy one volume option into the reply dict.
+ *
+ * The option @k with value @v is stored in pack->dict under the key
+ * "volume<vol_count>.option.<k>", and pack->opt_count is incremented when
+ * the store succeeds. A few keys are skipped on purpose (see below).
+ *
+ * @d:   dictionary being iterated (unused)
+ * @k:   option name
+ * @v:   option value
+ * @tmp: struct args_pack * describing the destination
+ *
+ * Always returns 0, so a single failed option never aborts the walk.
+ */
+static int
+_build_option_key(dict_t *d, char *k, data_t *v, void *tmp)
+{
+    char reconfig_key[256] = {
+        0,
+    };
+    int keylen;
+    struct args_pack *pack = NULL;
+    int ret = -1;
+    xlator_t *this = NULL;
+    glusterd_conf_t *priv = NULL;
+
+    this = THIS;
+    GF_ASSERT(this);
+    priv = this->private;
+    GF_ASSERT(priv);
+
+    pack = tmp;
+    if (strcmp(k, GLUSTERD_GLOBAL_OPT_VERSION) == 0)
+        return 0;
+
+    if (priv->op_version > GD_OP_VERSION_MIN) {
+        if ((strcmp(k, "features.limit-usage") == 0) ||
+            (strcmp(k, "features.soft-limit") == 0))
+            return 0;
+    }
+
+    /* snap-max-hard-limit and snap-max-soft-limit are system   *
+     * options set and managed by snapshot config option. Hence *
+     * they should not be displayed in gluster volume info.     *
+     */
+    if ((strcmp(k, "snap-max-hard-limit") == 0) ||
+        (strcmp(k, "snap-max-soft-limit") == 0))
+        return 0;
+
+    keylen = snprintf(reconfig_key, sizeof(reconfig_key), "volume%d.option.%s",
+                      pack->vol_count, k);
+    /*
+     * snprintf() reports the length the full string would have had. On
+     * truncation that exceeds what is actually in reconfig_key, so it must
+     * not be handed on as the key length; skip the option instead.
+     */
+    if ((keylen < 0) || (keylen >= (int)sizeof(reconfig_key))) {
+        gf_msg_debug(this->name, 0, "option key for %s is too long, skipping",
+                     k);
+        return 0;
+    }
+
+    ret = dict_set_strn(pack->dict, reconfig_key, keylen, v->data);
+    if (0 == ret)
+        pack->opt_count++;
+
+    return 0;
+}
+
+/*
+ * Mark arbiter bricks of @volinfo in the @volumes dictionary.
+ *
+ * Nothing is written unless the volume has replica_count != 1 and
+ * arbiter_count == 1. Otherwise, for every brick whose 1-based index is a
+ * multiple of replica_count, "volume<count>.brick<index>.isArbiter" is set
+ * to 1.
+ *
+ * Returns 0 on success, or the dict_set_int32n() error that stopped the walk.
+ */
+int
+glusterd_add_arbiter_info_to_bricks(glusterd_volinfo_t *volinfo,
+                                    dict_t *volumes, int count)
+{
+    char arbiter_key[64] = {
+        0,
+    };
+    int arbiter_keylen;
+    int brick_idx;
+    int ret = 0;
+
+    if (!(volinfo->replica_count != 1 && volinfo->arbiter_count == 1))
+        return 0;
+
+    for (brick_idx = 1; brick_idx <= volinfo->brick_count; brick_idx++) {
+        if ((brick_idx % volinfo->replica_count) == 0) {
+            arbiter_keylen = snprintf(arbiter_key, sizeof(arbiter_key),
+                                      "volume%d.brick%d.isArbiter", count,
+                                      brick_idx);
+            ret = dict_set_int32n(volumes, arbiter_key, arbiter_keylen, 1);
+            if (ret)
+                break;
+        }
+    }
+
+    return ret;
+}
+
+/* Store @value under "volume<count>.<field>" in @volumes; log on failure. */
+static int
+_vol_detail_set_int32(xlator_t *this, dict_t *volumes, int count,
+                      const char *field, int32_t value)
+{
+    char key[64] = {
+        0,
+    };
+    int keylen;
+    int ret = -1;
+
+    keylen = snprintf(key, sizeof(key), "volume%d.%s", count, field);
+    ret = dict_set_int32n(volumes, key, keylen, value);
+    if (ret)
+        gf_smsg(this->name, GF_LOG_ERROR, errno, GD_MSG_DICT_SET_FAILED,
+                "Key=%s", key, NULL);
+
+    return ret;
+}
+
+/* Duplicate @value and store the copy under @key in @volumes; log failures. */
+static int
+_vol_detail_set_strdup(xlator_t *this, dict_t *volumes, char *key, int keylen,
+                       const char *value)
+{
+    char *copy = NULL;
+    int ret = -1;
+
+    copy = gf_strdup(value);
+    if (!copy) {
+        gf_smsg(this->name, GF_LOG_ERROR, errno, GD_MSG_STRDUP_FAILED, NULL);
+        return -1;
+    }
+
+    ret = dict_set_dynstrn(volumes, key, keylen, copy);
+    if (ret)
+        gf_smsg(this->name, GF_LOG_ERROR, errno, GD_MSG_DICT_SET_FAILED,
+                "Key=%s", key, NULL);
+
+    return ret;
+}
+
+/*
+ * Describe one volume in the @volumes reply dictionary.
+ *
+ * All keys are prefixed with "volume<count>.". Written, in order: the name,
+ * the numeric attributes (type, status, the various brick/replica counts,
+ * transport), the volume id, rebalance command and snapshot count, one
+ * "brick<i>" and "brick<i>.uuid" entry per brick (i starts at 1), the
+ * thin-arbiter brick when thin_arbiter_count == 1, arbiter flags, and
+ * finally the volume and global options plus their count ("opt_count").
+ *
+ * @volinfo: volume to describe
+ * @volumes: destination dictionary
+ * @count:   index of this volume within the reply
+ *
+ * Returns 0 on success and non-zero on failure. A volume without an option
+ * dictionary is logged and reported as success, with no options written.
+ */
+int
+glusterd_add_volume_detail_to_dict(glusterd_volinfo_t *volinfo, dict_t *volumes,
+                                   int count)
+{
+    int ret = -1;
+    char key[64] = {
+        0,
+    };
+    int keylen;
+    glusterd_brickinfo_t *brickinfo = NULL;
+    glusterd_brickinfo_t *ta_brickinfo = NULL;
+    int i = 1;
+    dict_t *dict = NULL;
+    glusterd_conf_t *priv = NULL;
+    struct args_pack pack = {
+        0,
+    };
+    xlator_t *this = NULL;
+    int32_t len = 0;
+
+    char ta_brick[4096] = {
+        0,
+    };
+
+    GF_ASSERT(volinfo);
+    GF_ASSERT(volumes);
+
+    this = THIS;
+    priv = this->private;
+
+    GF_ASSERT(priv);
+
+    /* volname lives in volinfo, so it is stored without duplicating it. */
+    keylen = snprintf(key, sizeof(key), "volume%d.name", count);
+    ret = dict_set_strn(volumes, key, keylen, volinfo->volname);
+    if (ret) {
+        gf_smsg(this->name, GF_LOG_ERROR, errno, GD_MSG_DICT_SET_FAILED,
+                "Key=%s", key, NULL);
+        goto out;
+    }
+
+    ret = _vol_detail_set_int32(this, volumes, count, "type", volinfo->type);
+    if (ret)
+        goto out;
+
+    ret = _vol_detail_set_int32(this, volumes, count, "status",
+                                volinfo->status);
+    if (ret)
+        goto out;
+
+    ret = _vol_detail_set_int32(this, volumes, count, "brick_count",
+                                volinfo->brick_count);
+    if (ret)
+        goto out;
+
+    ret = _vol_detail_set_int32(this, volumes, count, "dist_count",
+                                volinfo->dist_leaf_count);
+    if (ret)
+        goto out;
+
+    ret = _vol_detail_set_int32(this, volumes, count, "stripe_count",
+                                volinfo->stripe_count);
+    if (ret)
+        goto out;
+
+    ret = _vol_detail_set_int32(this, volumes, count, "replica_count",
+                                volinfo->replica_count);
+    if (ret)
+        goto out;
+
+    ret = _vol_detail_set_int32(this, volumes, count, "disperse_count",
+                                volinfo->disperse_count);
+    if (ret)
+        goto out;
+
+    ret = _vol_detail_set_int32(this, volumes, count, "redundancy_count",
+                                volinfo->redundancy_count);
+    if (ret)
+        goto out;
+
+    ret = _vol_detail_set_int32(this, volumes, count, "arbiter_count",
+                                volinfo->arbiter_count);
+    if (ret)
+        goto out;
+
+    ret = _vol_detail_set_int32(this, volumes, count, "transport",
+                                volinfo->transport_type);
+    if (ret)
+        goto out;
+
+    ret = _vol_detail_set_int32(this, volumes, count, "thin_arbiter_count",
+                                volinfo->thin_arbiter_count);
+    if (ret)
+        goto out;
+
+    /*
+     * The helper reports a failed duplication as -1. Previously such a
+     * failure jumped to out with ret still 0 from the preceding store, so
+     * the caller saw success.
+     */
+    keylen = snprintf(key, sizeof(key), "volume%d.volume_id", count);
+    ret = _vol_detail_set_strdup(this, volumes, key, keylen,
+                                 uuid_utoa(volinfo->volume_id));
+    if (ret)
+        goto out;
+
+    ret = _vol_detail_set_int32(this, volumes, count, "rebalance",
+                                volinfo->rebal.defrag_cmd);
+    if (ret)
+        goto out;
+
+    ret = _vol_detail_set_int32(this, volumes, count, "snap_count",
+                                volinfo->snap_count);
+    if (ret)
+        goto out;
+
+    cds_list_for_each_entry(brickinfo, &volinfo->bricks, brick_list)
+    {
+        char brick[1024] = {
+            0,
+        };
+
+        len = snprintf(brick, sizeof(brick), "%s:%s", brickinfo->hostname,
+                       brickinfo->path);
+        if ((len < 0) || (len >= sizeof(brick))) {
+            gf_smsg(this->name, GF_LOG_ERROR, errno, GD_MSG_COPY_FAIL, NULL);
+            ret = -1;
+            goto out;
+        }
+
+        keylen = snprintf(key, sizeof(key), "volume%d.brick%d", count, i);
+        ret = _vol_detail_set_strdup(this, volumes, key, keylen, brick);
+        if (ret)
+            goto out;
+
+        keylen = snprintf(key, sizeof(key), "volume%d.brick%d.uuid", count, i);
+        ret = _vol_detail_set_strdup(this, volumes, key, keylen,
+                                     uuid_utoa(brickinfo->uuid));
+        if (ret)
+            goto out;
+
+        i++;
+    }
+
+    if (volinfo->thin_arbiter_count == 1) {
+        ta_brickinfo = list_first_entry(&volinfo->ta_bricks,
+                                        glusterd_brickinfo_t, brick_list);
+        len = snprintf(ta_brick, sizeof(ta_brick), "%s:%s",
+                       ta_brickinfo->hostname, ta_brickinfo->path);
+        if ((len < 0) || (len >= sizeof(ta_brick))) {
+            gf_smsg(this->name, GF_LOG_ERROR, errno, GD_MSG_COPY_FAIL, NULL);
+            ret = -1;
+            goto out;
+        }
+
+        keylen = snprintf(key, sizeof(key), "volume%d.thin_arbiter_brick",
+                          count);
+        ret = _vol_detail_set_strdup(this, volumes, key, keylen, ta_brick);
+        if (ret)
+            goto out;
+    }
+
+    ret = glusterd_add_arbiter_info_to_bricks(volinfo, volumes, count);
+    if (ret) {
+        gf_smsg(this->name, GF_LOG_ERROR, errno,
+                GD_MSG_ARBITER_BRICK_SET_INFO_FAIL, NULL);
+        goto out;
+    }
+
+    dict = volinfo->dict;
+    if (!dict) {
+        gf_smsg(this->name, GF_LOG_ERROR, errno, GD_MSG_DICT_CREATE_FAIL, NULL);
+        ret = 0;
+        goto out;
+    }
+
+    /* Per-volume options first, then the global ones held in priv->opts. */
+    pack.dict = volumes;
+    pack.vol_count = count;
+    pack.opt_count = 0;
+    dict_foreach(dict, _build_option_key, (void *)&pack);
+    dict_foreach(priv->opts, _build_option_key, &pack);
+
+    keylen = snprintf(key, sizeof(key), "volume%d.opt_count", pack.vol_count);
+    ret = dict_set_int32n(volumes, key, keylen, pack.opt_count);
+out:
+    return ret;
+}
+
+/*
+ * Start an op-state-machine transaction for a CLI-originated operation.
+ *
+ * Generates a transaction id and records the originator uuid in the request
+ * dictionary, takes the local lock (cluster lock for op-versions older than
+ * GD_OP_VERSION_3_6_0, otherwise a mgmt_v3 lock on the volume named in the
+ * dictionary, if any), saves the transaction's opinfo and injects the first
+ * state machine event.
+ *
+ * @req:     originating RPC request
+ * @op:      operation being started
+ * @ctx:     request dictionary (used as a dict_t *)
+ * @err_str: buffer that receives a user-facing message when locking fails
+ * @err_len: size of @err_str
+ *
+ * Returns 0 on success and non-zero on failure. A lock taken here is
+ * released again before a failure is reported.
+ */
+int32_t
+glusterd_op_txn_begin(rpcsvc_request_t *req, glusterd_op_t op, void *ctx,
+                      char *err_str, size_t err_len)
+{
+    int32_t ret = -1;
+    dict_t *dict = NULL;
+    xlator_t *this = NULL;
+    glusterd_conf_t *priv = NULL;
+    int32_t locked = 0;
+    char *tmp = NULL;
+    char *volname = NULL;
+    uuid_t *txn_id = NULL;
+    glusterd_op_info_t txn_op_info = {
+        {0},
+    };
+    glusterd_op_sm_event_type_t event_type = GD_OP_EVENT_NONE;
+    uint32_t op_errno = 0;
+    uint32_t timeout = 0;
+
+    GF_ASSERT(req);
+    GF_ASSERT((op > GD_OP_NONE) && (op < GD_OP_MAX));
+    GF_ASSERT(NULL != ctx);
+
+    this = THIS;
+    GF_ASSERT(this);
+    priv = this->private;
+    GF_ASSERT(priv);
+
+    dict = ctx;
+
+    /* Generate a transaction-id for this operation and
+     * save it in the dict. This transaction id distinguishes
+     * each transaction, and helps separate opinfos in the
+     * op state machine. */
+    ret = glusterd_generate_txn_id(dict, &txn_id);
+    if (ret) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_TRANS_IDGEN_FAIL,
+               "Failed to generate transaction id");
+        goto out;
+    }
+
+    /* Save the MY_UUID as the originator_uuid. This originator_uuid
+     * will be used by is_origin_glusterd() to determine if a node
+     * is the originator node for a command. */
+    ret = glusterd_set_originator_uuid(dict);
+    if (ret) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_UUID_SET_FAIL,
+               "Failed to set originator_uuid.");
+        goto out;
+    }
+
+    /* Based on the op_version, acquire a cluster or mgmt_v3 lock */
+    if (priv->op_version < GD_OP_VERSION_3_6_0) {
+        ret = glusterd_lock(MY_UUID);
+        if (ret) {
+            gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_GLUSTERD_LOCK_FAIL,
+                   "Unable to acquire lock on localhost, ret: %d", ret);
+            snprintf(err_str, err_len,
+                     "Another transaction is in progress. "
+                     "Please try again after some time.");
+            goto out;
+        }
+    } else {
+        /* If no volname is given as a part of the command, locks will
+         * not be held */
+        ret = dict_get_strn(dict, "volname", SLEN("volname"), &tmp);
+        if (ret) {
+            gf_msg(this->name, GF_LOG_INFO, errno, GD_MSG_DICT_GET_FAILED,
+                   "No Volume name present. "
+                   "Locks not being held.");
+            goto local_locking_done;
+        } else {
+            /* Use a copy of volname, as cli response will be
+             * sent before the unlock, and the volname in the
+             * dict, might be removed */
+            volname = gf_strdup(tmp);
+            if (!volname) {
+                /* ret is still 0 from dict_get_strn(); without this the
+                 * allocation failure was reported as success. */
+                ret = -1;
+                gf_msg(this->name, GF_LOG_ERROR, ENOMEM, GD_MSG_NO_MEMORY,
+                       "Unable to allocate memory");
+                goto out;
+            }
+        }
+
+        /* Cli will add timeout key to dict if the default timeout is
+         * other than 2 minutes. Here we use this value to check whether
+         * mgmt_v3_lock_timeout should be set to default value or we
+         * need to change the value according to timeout value
+         * i.e, timeout + 120 seconds. */
+        ret = dict_get_uint32(dict, "timeout", &timeout);
+        if (!ret)
+            priv->mgmt_v3_lock_timeout = timeout + 120;
+
+        ret = glusterd_mgmt_v3_lock(volname, MY_UUID, &op_errno, "vol");
+        if (ret) {
+            gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_MGMTV3_LOCK_GET_FAIL,
+                   "Unable to acquire lock for %s", volname);
+            snprintf(err_str, err_len,
+                     "Another transaction is in progress for %s. "
+                     "Please try again after some time.",
+                     volname);
+            goto out;
+        }
+    }
+
+    locked = 1;
+    gf_msg_debug(this->name, 0, "Acquired lock on localhost");
+
+local_locking_done:
+    /* If no volname is given as a part of the command, locks will
+     * not be held, hence sending stage event. */
+    if (volname || (priv->op_version < GD_OP_VERSION_3_6_0))
+        event_type = GD_OP_EVENT_START_LOCK;
+    else {
+        txn_op_info.state.state = GD_OP_STATE_LOCK_SENT;
+        event_type = GD_OP_EVENT_ALL_ACC;
+    }
+
+    /* Save opinfo for this transaction with the transaction id */
+    glusterd_txn_opinfo_init(&txn_op_info, NULL, &op, ctx, req);
+
+    ret = glusterd_set_txn_opinfo(txn_id, &txn_op_info);
+    if (ret) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_TRANS_OPINFO_SET_FAIL,
+               "Unable to set transaction's opinfo");
+        if (ctx)
+            dict_unref(ctx);
+        goto out;
+    }
+
+    ret = glusterd_op_sm_inject_event(event_type, txn_id, ctx);
+    if (ret) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_EVENT_INJECT_FAIL,
+               "Failed to acquire cluster"
+               " lock.");
+        goto out;
+    }
+
+out:
+    if (locked && ret) {
+        /* Based on the op-version, we release the
+         * cluster or mgmt_v3 lock */
+        if (priv->op_version < GD_OP_VERSION_3_6_0)
+            glusterd_unlock(MY_UUID);
+        else {
+            ret = glusterd_mgmt_v3_unlock(volname, MY_UUID, "vol");
+            if (ret)
+                gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_MGMTV3_UNLOCK_FAIL,
+                       "Unable to release lock for %s", volname);
+            /* The transaction failed either way; report that, not the
+             * result of the unlock. */
+            ret = -1;
+        }
+    }
+
+    if (volname)
+        GF_FREE(volname);
+
+    gf_msg_debug(this->name, 0, "Returning %d", ret);
+    return ret;
+}
+
+/*
+ * Serve a cluster LOCK request sent by a peer.
+ *
+ * Decodes the request, rejects senders that are not known peers, records an
+ * opinfo for the global transaction id and injects GD_OP_EVENT_LOCK into the
+ * op state machine. Both state machines are run before returning, except on
+ * the early return taken when the lock context cannot be allocated.
+ *
+ * Returns 0 on success and non-zero on failure.
+ */
+int
+__glusterd_handle_cluster_lock(rpcsvc_request_t *req)
+{
+    xlator_t *this = THIS;
+    glusterd_conf_t *priv = NULL;
+    uuid_t *txn_id = NULL;
+    glusterd_op_lock_ctx_t *ctx = NULL;
+    dict_t *op_ctx = NULL;
+    int peer_known = 0;
+    int32_t ret = -1;
+    glusterd_op_sm_event_type_t op = GD_OP_EVENT_LOCK;
+    gd1_mgmt_cluster_lock_req lock_req = {
+        {0},
+    };
+    glusterd_op_info_t txn_op_info = {
+        {0},
+    };
+
+    GF_ASSERT(this);
+    priv = this->private;
+    GF_ASSERT(priv);
+    GF_ASSERT(req);
+
+    /* Cluster locks always use the daemon-wide transaction id. */
+    txn_id = &priv->global_txn_id;
+
+    ret = xdr_to_generic(req->msg[0], &lock_req,
+                         (xdrproc_t)xdr_gd1_mgmt_cluster_lock_req);
+    if (ret < 0) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_REQ_DECODE_FAIL,
+               "Failed to decode lock "
+               "request received from peer");
+        req->rpc_err = GARBAGE_ARGS;
+        goto out;
+    }
+
+    gf_msg_debug(this->name, 0, "Received LOCK from uuid: %s",
+                 uuid_utoa(lock_req.uuid));
+
+    /* Only the lookup itself needs the RCU read-side section. */
+    RCU_READ_LOCK;
+    peer_known = (glusterd_peerinfo_find_by_uuid(lock_req.uuid) != NULL);
+    RCU_READ_UNLOCK;
+    if (!peer_known) {
+        gf_msg(this->name, GF_LOG_WARNING, 0, GD_MSG_PEER_NOT_FOUND,
+               "%s doesn't "
+               "belong to the cluster. Ignoring request.",
+               uuid_utoa(lock_req.uuid));
+        ret = -1;
+        goto out;
+    }
+
+    ctx = GF_CALLOC(1, sizeof(*ctx), gf_gld_mt_op_lock_ctx_t);
+    if (ctx == NULL) {
+        // respond here
+        return -1;
+    }
+
+    ctx->req = req;
+    ctx->dict = NULL;
+    gf_uuid_copy(ctx->uuid, lock_req.uuid);
+
+    op_ctx = dict_new();
+    if (op_ctx == NULL) {
+        ret = -1;
+        gf_msg(this->name, GF_LOG_ERROR, ENOMEM, GD_MSG_DICT_CREATE_FAIL,
+               "Unable to set new dict");
+        goto out;
+    }
+
+    glusterd_txn_opinfo_init(&txn_op_info, NULL, &op, op_ctx, req);
+
+    ret = glusterd_set_txn_opinfo(txn_id, &txn_op_info);
+    if (ret != 0) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_TRANS_OPINFO_SET_FAIL,
+               "Unable to set transaction's opinfo");
+        dict_unref(txn_op_info.op_ctx);
+        goto out;
+    }
+
+    ret = glusterd_op_sm_inject_event(GD_OP_EVENT_LOCK, txn_id, ctx);
+    if (ret != 0)
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_EVENT_INJECT_FAIL,
+               "Failed to inject event GD_OP_EVENT_LOCK");
+
+out:
+    gf_msg_debug(this->name, 0, "Returning %d", ret);
+
+    glusterd_friend_sm();
+    glusterd_op_sm();
+
+    /* On failure the lock context was never consumed; release it here. */
+    if (ret != 0)
+        GF_FREE(ctx);
+
+    return ret;
+}
+
+/* RPC entry point: run __glusterd_handle_cluster_lock() via the big-lock
+ * wrapper and return its result. */
+int
+glusterd_handle_cluster_lock(rpcsvc_request_t *req)
+{
+    int ret;
+
+    ret = glusterd_big_locked_handler(req, __glusterd_handle_cluster_lock);
+
+    return ret;
+}
+
+/*
+ * Build the request context handed to the op state machine.
+ *
+ * Allocates a glusterd_req_ctx_t of memory type @mem_type, fills in @uuid,
+ * @op and @rpc_req, and unserializes @buf_val/@buf_len into a new dictionary
+ * attached to the context. The context is returned through @req_ctx_out only
+ * on success; on failure everything allocated here is released.
+ *
+ * Returns 0 on success and non-zero on failure.
+ */
+static int
+glusterd_req_ctx_create(rpcsvc_request_t *rpc_req, int op, uuid_t uuid,
+                        char *buf_val, size_t buf_len,
+                        gf_gld_mem_types_t mem_type,
+                        glusterd_req_ctx_t **req_ctx_out)
+{
+    xlator_t *this = THIS;
+    glusterd_req_ctx_t *new_ctx = NULL;
+    dict_t *req_dict = NULL;
+    int ret = -1;
+    char uuid_str[50] = {
+        0,
+    };
+
+    GF_ASSERT(this);
+
+    gf_uuid_unparse(uuid, uuid_str);
+    gf_msg_debug(this->name, 0, "Received op from uuid %s", uuid_str);
+
+    req_dict = dict_new();
+    if (req_dict == NULL) {
+        gf_smsg(this->name, GF_LOG_ERROR, errno, GD_MSG_DICT_CREATE_FAIL, NULL);
+        goto fail;
+    }
+
+    new_ctx = GF_CALLOC(1, sizeof(*new_ctx), mem_type);
+    if (new_ctx == NULL) {
+        gf_smsg(this->name, GF_LOG_ERROR, errno, GD_MSG_NO_MEMORY, NULL);
+        goto fail;
+    }
+
+    new_ctx->op = op;
+    gf_uuid_copy(new_ctx->uuid, uuid);
+
+    ret = dict_unserialize(buf_val, buf_len, &req_dict);
+    if (ret != 0) {
+        gf_smsg(this->name, GF_LOG_WARNING, 0, GD_MSG_DICT_UNSERIALIZE_FAIL,
+                NULL);
+        goto fail;
+    }
+
+    new_ctx->req = rpc_req;
+    new_ctx->dict = req_dict;
+    *req_ctx_out = new_ctx;
+
+    return 0;
+
+fail:
+    /* ret is non-zero on every path that lands here. */
+    if (req_dict != NULL)
+        dict_unref(req_dict);
+    GF_FREE(new_ctx);
+
+    return ret;
+}
+
+/*
+ * Serve a STAGE_OP request sent by a peer.
+ *
+ * Decodes the request, rejects senders that are not known peers, builds the
+ * request context from the serialized dictionary, makes sure an opinfo exists
+ * for the transaction and injects GD_OP_EVENT_STAGE_OP into the op state
+ * machine. Both state machines are run before returning.
+ *
+ * Returns 0 on success and non-zero on failure.
+ */
+int
+__glusterd_handle_stage_op(rpcsvc_request_t *req)
+{
+    int32_t ret = -1;
+    glusterd_req_ctx_t *req_ctx = NULL;
+    gd1_mgmt_stage_op_req op_req = {
+        {0},
+    };
+    xlator_t *this = NULL;
+    uuid_t *txn_id = NULL;
+    glusterd_op_info_t txn_op_info = {
+        {0},
+    };
+    glusterd_op_sm_state_info_t state = {
+        0,
+    };
+    glusterd_conf_t *priv = NULL;
+
+    this = THIS;
+    GF_ASSERT(this);
+    priv = this->private;
+    GF_ASSERT(priv);
+    GF_ASSERT(req);
+
+    /* Fallback id, used when the request dict carries no transaction_id. */
+    txn_id = &priv->global_txn_id;
+
+    ret = xdr_to_generic(req->msg[0], &op_req,
+                         (xdrproc_t)xdr_gd1_mgmt_stage_op_req);
+    if (ret < 0) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_REQ_DECODE_FAIL,
+               "Failed to decode stage "
+               "request received from peer");
+        req->rpc_err = GARBAGE_ARGS;
+        goto out;
+    }
+
+    /*
+     * Validate the sender before allocating anything for it, as
+     * __glusterd_handle_commit_op() does. Doing this after creating req_ctx
+     * leaked the context and its dictionary for requests from unknown peers.
+     */
+    RCU_READ_LOCK;
+    ret = (glusterd_peerinfo_find_by_uuid(op_req.uuid) == NULL);
+    RCU_READ_UNLOCK;
+    if (ret) {
+        gf_msg(this->name, GF_LOG_WARNING, 0, GD_MSG_PEER_NOT_FOUND,
+               "%s doesn't "
+               "belong to the cluster. Ignoring request.",
+               uuid_utoa(op_req.uuid));
+        ret = -1;
+        goto out;
+    }
+
+    ret = glusterd_req_ctx_create(req, op_req.op, op_req.uuid,
+                                  op_req.buf.buf_val, op_req.buf.buf_len,
+                                  gf_gld_mt_op_stage_ctx_t, &req_ctx);
+    if (ret) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_REQ_CTX_CREATE_FAIL,
+               "Failed to create req_ctx");
+        goto out;
+    }
+
+    /* The result is deliberately not checked: txn_id already points at the
+     * global transaction id set above. */
+    ret = dict_get_bin(req_ctx->dict, "transaction_id", (void **)&txn_id);
+    gf_msg_debug(this->name, 0, "transaction ID = %s", uuid_utoa(*txn_id));
+
+    /* In cases where there is no volname, the receivers won't have a
+     * transaction opinfo created, as for those operations, the locking
+     * phase where the transaction opinfos are created, won't be called.
+     * skip_locking will be true for all such transaction and we clear
+     * the txn_opinfo after the staging phase, except for geo-replication
+     * operations where we need to access txn_opinfo in the later phases also.
+     */
+    ret = glusterd_get_txn_opinfo(txn_id, &txn_op_info);
+    if (ret) {
+        gf_msg_debug(this->name, 0, "No transaction's opinfo set");
+
+        state.state = GD_OP_STATE_LOCKED;
+        glusterd_txn_opinfo_init(&txn_op_info, &state, &op_req.op,
+                                 req_ctx->dict, req);
+
+        if (req_ctx->op != GD_OP_GSYNC_SET)
+            txn_op_info.skip_locking = _gf_true;
+        ret = glusterd_set_txn_opinfo(txn_id, &txn_op_info);
+        if (ret) {
+            gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_TRANS_OPINFO_SET_FAIL,
+                   "Unable to set transaction's opinfo");
+            /* NOTE(review): req_ctx itself is not released on this path or
+             * when the inject below fails - confirm who owns it. */
+            dict_unref(req_ctx->dict);
+            goto out;
+        }
+    }
+
+    ret = glusterd_op_sm_inject_event(GD_OP_EVENT_STAGE_OP, txn_id, req_ctx);
+    if (ret)
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_EVENT_INJECT_FAIL,
+               "Failed to inject event GD_OP_EVENT_STAGE_OP");
+
+out:
+    free(op_req.buf.buf_val);  // malloced by xdr
+    glusterd_friend_sm();
+    glusterd_op_sm();
+    return ret;
+}
+
+/* RPC entry point: run __glusterd_handle_stage_op() via the big-lock wrapper
+ * and return its result. */
+int
+glusterd_handle_stage_op(rpcsvc_request_t *req)
+{
+    int ret;
+
+    ret = glusterd_big_locked_handler(req, __glusterd_handle_stage_op);
+
+    return ret;
+}
+
+/*
+ * Serve a COMMIT_OP request sent by a peer.
+ *
+ * Decodes the request, rejects senders that are not known peers, builds the
+ * request context from the serialized dictionary and injects
+ * GD_OP_EVENT_COMMIT_OP into the op state machine. Both state machines are
+ * run before returning.
+ *
+ * Returns 0 on success and non-zero on failure.
+ */
+int
+__glusterd_handle_commit_op(rpcsvc_request_t *req)
+{
+    xlator_t *this = THIS;
+    glusterd_conf_t *priv = NULL;
+    glusterd_req_ctx_t *req_ctx = NULL;
+    uuid_t *txn_id = NULL;
+    int peer_known = 0;
+    int32_t ret = -1;
+    gd1_mgmt_commit_op_req op_req = {
+        {0},
+    };
+
+    GF_ASSERT(this);
+    priv = this->private;
+    GF_ASSERT(priv);
+    GF_ASSERT(req);
+
+    /* Fallback id, used when the request dict carries no transaction_id. */
+    txn_id = &priv->global_txn_id;
+
+    ret = xdr_to_generic(req->msg[0], &op_req,
+                         (xdrproc_t)xdr_gd1_mgmt_commit_op_req);
+    if (ret < 0) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_REQ_DECODE_FAIL,
+               "Failed to decode commit "
+               "request received from peer");
+        req->rpc_err = GARBAGE_ARGS;
+        goto out;
+    }
+
+    /* Only the lookup itself needs the RCU read-side section. */
+    RCU_READ_LOCK;
+    peer_known = (glusterd_peerinfo_find_by_uuid(op_req.uuid) != NULL);
+    RCU_READ_UNLOCK;
+    if (!peer_known) {
+        gf_msg(this->name, GF_LOG_WARNING, 0, GD_MSG_PEER_NOT_FOUND,
+               "%s doesn't "
+               "belong to the cluster. Ignoring request.",
+               uuid_utoa(op_req.uuid));
+        ret = -1;
+        goto out;
+    }
+
+    // the structures should always be equal
+    GF_ASSERT(sizeof(gd1_mgmt_commit_op_req) == sizeof(gd1_mgmt_stage_op_req));
+    ret = glusterd_req_ctx_create(req, op_req.op, op_req.uuid,
+                                  op_req.buf.buf_val, op_req.buf.buf_len,
+                                  gf_gld_mt_op_commit_ctx_t, &req_ctx);
+    if (ret != 0)
+        goto out;
+
+    /* The lookup result is overwritten by the inject call right below. */
+    ret = dict_get_bin(req_ctx->dict, "transaction_id", (void **)&txn_id);
+    gf_msg_debug(this->name, 0, "transaction ID = %s", uuid_utoa(*txn_id));
+
+    ret = glusterd_op_sm_inject_event(GD_OP_EVENT_COMMIT_OP, txn_id, req_ctx);
+
+out:
+    free(op_req.buf.buf_val);  // malloced by xdr
+    glusterd_friend_sm();
+    glusterd_op_sm();
+    return ret;
+}
+
+/* RPC entry point: run __glusterd_handle_commit_op() via the big-lock wrapper
+ * and return its result. */
+int
+glusterd_handle_commit_op(rpcsvc_request_t *req)
+{
+    int ret;
+
+    ret = glusterd_big_locked_handler(req, __glusterd_handle_commit_op);
+
+    return ret;
+}
+
+/*
+ * Serve a peer-probe request coming from the CLI.
+ *
+ * Decodes the request, reads "hostname" and "port" from its dictionary and
+ * then answers directly in three cases: server quorum is not met, the
+ * hostname refers to this node, or the host is already a peer. Otherwise the
+ * probe is started with glusterd_probe_begin().
+ *
+ * The state machines are run before returning unless the probe is still
+ * waiting for its connection (GLUSTERD_CONNECTION_AWAITED).
+ *
+ * Returns 0 on success and non-zero on failure.
+ */
+int
+__glusterd_handle_cli_probe(rpcsvc_request_t *req)
+{
+    int32_t ret = -1;
+    gf_cli_req cli_req = {
+        {
+            0,
+        },
+    };
+    glusterd_peerinfo_t *peerinfo = NULL;
+    gf_boolean_t run_fsm = _gf_true;
+    xlator_t *this = NULL;
+    char *bind_name = NULL;
+    dict_t *dict = NULL;
+    char *hostname = NULL;
+    int port = 0;
+    int op_errno = 0;
+
+    GF_ASSERT(req);
+    this = THIS;
+
+    ret = xdr_to_generic(req->msg[0], &cli_req, (xdrproc_t)xdr_gf_cli_req);
+    if (ret < 0) {
+        // failed to decode msg;
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_REQ_DECODE_FAIL,
+               "xdr decoding error");
+        req->rpc_err = GARBAGE_ARGS;
+        goto out;
+    }
+
+    /* dict stays NULL when the request carries no serialized dictionary; the
+     * hostname lookup below then fails and the request is rejected. */
+    if (cli_req.dict.dict_len) {
+        /* NOTE(review): the dict_new() result is not checked here. */
+        dict = dict_new();
+
+        ret = dict_unserialize(cli_req.dict.dict_val, cli_req.dict.dict_len,
+                               &dict);
+        if (ret < 0) {
+            gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_DICT_UNSERIALIZE_FAIL,
+                   "Failed to "
+                   "unserialize req-buffer to dictionary");
+            goto out;
+        }
+    }
+
+    ret = dict_get_strn(dict, "hostname", SLEN("hostname"), &hostname);
+    if (ret) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_HOSTNAME_NOTFOUND_IN_DICT,
+               "Failed to get hostname");
+        goto out;
+    }
+
+    ret = dict_get_int32n(dict, "port", SLEN("port"), &port);
+    if (ret) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_PORT_NOTFOUND_IN_DICT,
+               "Failed to get port");
+        goto out;
+    }
+
+    /* The rejection is reported to the CLI in the response, so the handler
+     * itself returns success. */
+    if (glusterd_is_any_volume_in_server_quorum(this) &&
+        !does_gd_meet_server_quorum(this)) {
+        glusterd_xfer_cli_probe_resp(req, -1, GF_PROBE_QUORUM_NOT_MET, NULL,
+                                     hostname, port, dict);
+        gf_msg(this->name, GF_LOG_CRITICAL, 0, GD_MSG_SERVER_QUORUM_NOT_MET,
+               "Server quorum not met. Rejecting operation.");
+        ret = 0;
+        goto out;
+    }
+
+    gf_msg("glusterd", GF_LOG_INFO, 0, GD_MSG_CLI_REQ_RECVD,
+           "Received CLI probe req %s %d", hostname, port);
+
+    /* With a configured bind address the probed host is compared against
+     * that address only; otherwise against any local address. */
+    if (dict_get_strn(this->options, "transport.socket.bind-address",
+                      SLEN("transport.socket.bind-address"), &bind_name) == 0) {
+        gf_msg_debug("glusterd", 0,
+                     "only checking probe address vs. bind address");
+        ret = gf_is_same_address(bind_name, hostname);
+    } else {
+        ret = gf_is_local_addr(hostname);
+    }
+    if (ret) {
+        glusterd_xfer_cli_probe_resp(req, 0, GF_PROBE_LOCALHOST, NULL, hostname,
+                                     port, dict);
+        ret = 0;
+        goto out;
+    }
+
+    /* peerinfo is only used inside the RCU read-side section; ret keeps
+     * the outcome of the check. */
+    RCU_READ_LOCK;
+
+    peerinfo = glusterd_peerinfo_find_by_hostname(hostname);
+    ret = (peerinfo && gd_peer_has_address(peerinfo, hostname));
+
+    RCU_READ_UNLOCK;
+
+    if (ret) {
+        gf_msg_debug("glusterd", 0,
+                     "Probe host %s port %d "
+                     "already a peer",
+                     hostname, port);
+        glusterd_xfer_cli_probe_resp(req, 0, GF_PROBE_FRIEND, NULL, hostname,
+                                     port, dict);
+        ret = 0;
+        goto out;
+    }
+
+    ret = glusterd_probe_begin(req, hostname, port, dict, &op_errno);
+
+    if (ret == GLUSTERD_CONNECTION_AWAITED) {
+        // fsm should be run after connection establishes
+        run_fsm = _gf_false;
+        ret = 0;
+
+    } else if (ret == -1) {
+        glusterd_xfer_cli_probe_resp(req, -1, op_errno, NULL, hostname, port,
+                                     dict);
+        goto out;
+    }
+
+out:
+    /* dict_val is released with plain free(), not GF_FREE(). */
+    free(cli_req.dict.dict_val);
+
+    if (run_fsm) {
+        glusterd_friend_sm();
+        glusterd_op_sm();
+    }
+
+    return ret;
+}
+
+/* RPC entry point: run __glusterd_handle_cli_probe() via the big-lock wrapper
+ * and return its result. */
+int
+glusterd_handle_cli_probe(rpcsvc_request_t *req)
+{
+    int ret;
+
+    ret = glusterd_big_locked_handler(req, __glusterd_handle_cli_probe);
+
+    return ret;
+}
+
+/*
+ * Handle a CLI deprobe request (detaching a peer, per the comments below).
+ *
+ * Decodes the gf_cli_req in req->msg[0], unserializes its dictionary and reads
+ * the "hostname", "port" and "flags" keys. The hostname is resolved to a uuid
+ * and the request is rejected with a specific op_errno when:
+ *   - the hostname cannot be resolved (GF_DEPROBE_NOT_FRIEND),
+ *   - the uuid is this node's own (GF_DEPROBE_LOCALHOST),
+ *   - glusterd_friend_contains_vol_bricks()/..._snap_bricks() returns 1 for
+ *     any volume or snapshot (GF_DEPROBE_BRICK_EXIST / _SNAP_BRICK_EXIST),
+ *   - and, unless GF_CLI_FLAG_OP_FORCE is set, the peer-connected check fails
+ *     (GF_DEPROBE_FRIEND_DOWN) or server quorum is not met
+ *     (GF_DEPROBE_QUORUM_NOT_MET).
+ * Otherwise the work is handed to glusterd_deprobe_begin().
+ *
+ * Whenever ret is non-zero at out:, an error response is sent with
+ * glusterd_xfer_cli_deprobe_resp() and its result becomes the return value;
+ * otherwise 0 is returned. The friend and op state machines are run on every
+ * path.
+ */
+int
+__glusterd_handle_cli_deprobe(rpcsvc_request_t *req)
+{
+    int32_t ret = -1;
+    gf_cli_req cli_req = {
+        {
+            0,
+        },
+    };
+    uuid_t uuid = {0};
+    int op_errno = 0;
+    xlator_t *this = NULL;
+    glusterd_conf_t *priv = NULL;
+    dict_t *dict = NULL;
+    char *hostname = NULL;
+    int port = 0;
+    int flags = 0;
+    glusterd_volinfo_t *volinfo = NULL;
+    glusterd_volinfo_t *tmp = NULL;
+    glusterd_snap_t *snapinfo = NULL;
+    glusterd_snap_t *tmpsnap = NULL;
+    /* true while this function is still responsible for unref'ing dict */
+    gf_boolean_t need_free = _gf_false;
+
+    this = THIS;
+    GF_ASSERT(this);
+    priv = this->private;
+    GF_ASSERT(priv);
+    GF_ASSERT(req);
+
+    ret = xdr_to_generic(req->msg[0], &cli_req, (xdrproc_t)xdr_gf_cli_req);
+    if (ret < 0) {
+        // failed to decode msg;
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_REQ_DECODE_FAIL,
+               "Failed to decode "
+               "request received from cli");
+        req->rpc_err = GARBAGE_ARGS;
+        goto out;
+    }
+
+    if (cli_req.dict.dict_len) {
+        dict = dict_new();
+
+        if (dict) {
+            need_free = _gf_true;
+        } else {
+            ret = -1;
+            goto out;
+        }
+
+        ret = dict_unserialize(cli_req.dict.dict_val, cli_req.dict.dict_len,
+                               &dict);
+        if (ret < 0) {
+            gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_DICT_UNSERIALIZE_FAIL,
+                   "Failed to "
+                   "unserialize req-buffer to dictionary");
+            goto out;
+        }
+    }
+
+    gf_msg("glusterd", GF_LOG_INFO, 0, GD_MSG_CLI_REQ_RECVD,
+           "Received CLI deprobe req");
+
+    /* dict is still NULL here when the request carried no dictionary; the
+     * lookups below are then expected to fail and take the error path. */
+    ret = dict_get_strn(dict, "hostname", SLEN("hostname"), &hostname);
+    if (ret) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_HOSTNAME_NOTFOUND_IN_DICT,
+               "Failed to get hostname");
+        goto out;
+    }
+
+    ret = dict_get_int32n(dict, "port", SLEN("port"), &port);
+    if (ret) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_PORT_NOTFOUND_IN_DICT,
+               "Failed to get port");
+        goto out;
+    }
+    ret = dict_get_int32n(dict, "flags", SLEN("flags"), &flags);
+    if (ret) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_FLAGS_NOTFOUND_IN_DICT,
+               "Failed to get flags");
+        goto out;
+    }
+
+    ret = glusterd_hostname_to_uuid(hostname, uuid);
+    if (ret) {
+        op_errno = GF_DEPROBE_NOT_FRIEND;
+        goto out;
+    }
+
+    /* gf_uuid_compare() == 0 means the target is this node itself */
+    if (!gf_uuid_compare(uuid, MY_UUID)) {
+        op_errno = GF_DEPROBE_LOCALHOST;
+        ret = -1;
+        goto out;
+    }
+
+    if (!(flags & GF_CLI_FLAG_OP_FORCE)) {
+        /* Check if peers are connected, except peer being
+         * detached*/
+        if (!glusterd_chk_peers_connected_befriended(uuid)) {
+            ret = -1;
+            op_errno = GF_DEPROBE_FRIEND_DOWN;
+            goto out;
+        }
+    }
+
+    /* Check for if volumes exist with some bricks on the peer being
+     * detached. It's not a problem if a volume contains none or all
+     * of its bricks on the peer being detached
+     */
+    cds_list_for_each_entry_safe(volinfo, tmp, &priv->volumes, vol_list)
+    {
+        ret = glusterd_friend_contains_vol_bricks(volinfo, uuid);
+        if (ret == 1) {
+            op_errno = GF_DEPROBE_BRICK_EXIST;
+            goto out;
+        }
+    }
+
+    /* Same check for snapshots; only a return value of 1 blocks the detach */
+    cds_list_for_each_entry_safe(snapinfo, tmpsnap, &priv->snapshots, snap_list)
+    {
+        ret = glusterd_friend_contains_snap_bricks(snapinfo, uuid);
+        if (ret == 1) {
+            op_errno = GF_DEPROBE_SNAP_BRICK_EXIST;
+            goto out;
+        }
+    }
+    if (!(flags & GF_CLI_FLAG_OP_FORCE)) {
+        if (glusterd_is_any_volume_in_server_quorum(this) &&
+            !does_gd_meet_server_quorum(this)) {
+            gf_msg(this->name, GF_LOG_CRITICAL, 0, GD_MSG_SERVER_QUORUM_NOT_MET,
+                   "Server quorum not met. Rejecting operation.");
+            ret = -1;
+            op_errno = GF_DEPROBE_QUORUM_NOT_MET;
+            goto out;
+        }
+    }
+
+    if (!gf_uuid_is_null(uuid)) {
+        ret = glusterd_deprobe_begin(req, hostname, port, uuid, dict,
+                                     &op_errno);
+    } else {
+        ret = glusterd_deprobe_begin(req, hostname, port, NULL, dict,
+                                     &op_errno);
+    }
+
+    /* NOTE(review): presumably glusterd_deprobe_begin() has taken over dict,
+     * so the out: path must no longer unref it - confirm against the callee,
+     * in particular for its failure return. */
+    need_free = _gf_false;
+
+out:
+    /* plain free(): the buffer was presumably allocated by the XDR decoder */
+    free(cli_req.dict.dict_val);
+
+    if (ret) {
+        /* ret is overwritten with the result of sending the error response */
+        ret = glusterd_xfer_cli_deprobe_resp(req, ret, op_errno, NULL, hostname,
+                                             dict);
+        if (need_free) {
+            dict_unref(dict);
+        }
+    }
+
+    glusterd_friend_sm();
+    glusterd_op_sm();
+
+    return ret;
+}
+
+/* RPC entry point for the CLI deprobe request: passes req and the
+ * __glusterd_handle_cli_deprobe worker to glusterd_big_locked_handler() and
+ * returns that call's result unchanged. */
+int
+glusterd_handle_cli_deprobe(rpcsvc_request_t *req)
+{
+    int status = -1;
+
+    status = glusterd_big_locked_handler(req, __glusterd_handle_cli_deprobe);
+    return status;
+}
+
+/*
+ * Handle the CLI peer-list request.
+ *
+ * Decodes a gf1_cli_peer_list_req from req->msg[0], unserializes the optional
+ * request dictionary and passes it, together with cli_req.flags, to
+ * glusterd_list_friends(). The friend and op state machines are run before
+ * returning, on success and on failure alike.
+ *
+ * Returns the result of glusterd_list_friends(), or a negative value when the
+ * request could not be decoded or the dictionary could not be built.
+ */
+int
+__glusterd_handle_cli_list_friends(rpcsvc_request_t *req)
+{
+    int32_t ret = -1;
+    gf1_cli_peer_list_req cli_req = {
+        0,
+    };
+    dict_t *dict = NULL;
+
+    GF_ASSERT(req);
+
+    ret = xdr_to_generic(req->msg[0], &cli_req,
+                         (xdrproc_t)xdr_gf1_cli_peer_list_req);
+    if (ret < 0) {
+        // failed to decode msg;
+        gf_msg("glusterd", GF_LOG_ERROR, 0, GD_MSG_REQ_DECODE_FAIL,
+               "Failed to decode "
+               "request received from cli");
+        req->rpc_err = GARBAGE_ARGS;
+        goto out;
+    }
+
+    gf_msg("glusterd", GF_LOG_INFO, 0, GD_MSG_CLI_REQ_RECVD,
+           "Received cli list req");
+
+    if (cli_req.dict.dict_len) {
+        /* Unserialize the dictionary */
+        dict = dict_new();
+        if (!dict) {
+            /* Report the allocation failure as such, matching the other
+             * handlers in this file, instead of passing a NULL dict on to
+             * dict_unserialize(). */
+            gf_smsg("glusterd", GF_LOG_ERROR, errno, GD_MSG_DICT_CREATE_FAIL,
+                    NULL);
+            ret = -1;
+            goto out;
+        }
+
+        ret = dict_unserialize(cli_req.dict.dict_val, cli_req.dict.dict_len,
+                               &dict);
+        if (ret < 0) {
+            gf_msg("glusterd", GF_LOG_ERROR, 0, GD_MSG_DICT_UNSERIALIZE_FAIL,
+                   "failed to "
+                   "unserialize req-buffer to dictionary");
+            goto out;
+        } else {
+            /* presumably lets the dict release the XDR-allocated buffer with
+             * free() when it is destroyed - confirm against dict.h */
+            dict->extra_stdfree = cli_req.dict.dict_val;
+        }
+    }
+
+    ret = glusterd_list_friends(req, dict, cli_req.flags);
+
+out:
+    if (dict)
+        dict_unref(dict);
+
+    glusterd_friend_sm();
+    glusterd_op_sm();
+
+    return ret;
+}
+
+/* RPC entry point for the CLI peer-list request: passes req and the
+ * __glusterd_handle_cli_list_friends worker to glusterd_big_locked_handler()
+ * and returns that call's result unchanged. */
+int
+glusterd_handle_cli_list_friends(rpcsvc_request_t *req)
+{
+    int status = -1;
+
+    status = glusterd_big_locked_handler(req,
+                                         __glusterd_handle_cli_list_friends);
+    return status;
+}
+
+/*
+ * Handle the CLI "get volume" request.
+ *
+ * Decodes the gf_cli_req in req->msg[0], unserializes its dictionary, reads
+ * the mandatory "flags" key and calls glusterd_get_volumes() with it. The
+ * friend and op state machines are run before returning on every path.
+ *
+ * Returns the result of glusterd_get_volumes(), or a non-zero value when the
+ * request could not be decoded, the dictionary could not be built or "flags"
+ * is missing.
+ */
+static int
+__glusterd_handle_cli_get_volume(rpcsvc_request_t *req)
+{
+    int32_t ret = -1;
+    gf_cli_req cli_req = {{
+        0,
+    }};
+    int32_t flags = 0;
+    dict_t *dict = NULL;
+    xlator_t *this = NULL;
+
+    GF_ASSERT(req);
+    this = THIS;
+
+    ret = xdr_to_generic(req->msg[0], &cli_req, (xdrproc_t)xdr_gf_cli_req);
+    if (ret < 0) {
+        // failed to decode msg;
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_REQ_DECODE_FAIL,
+               "Failed to decode "
+               "request received from cli");
+        req->rpc_err = GARBAGE_ARGS;
+        goto out;
+    }
+
+    gf_msg(this->name, GF_LOG_DEBUG, 0, GD_MSG_GET_VOL_REQ_RCVD,
+           "Received get vol req");
+
+    if (cli_req.dict.dict_len) {
+        /* Unserialize the dictionary */
+        dict = dict_new();
+        if (!dict) {
+            /* Report the allocation failure as such, matching the other
+             * handlers in this file, instead of passing a NULL dict on to
+             * dict_unserialize(). */
+            gf_smsg(this->name, GF_LOG_ERROR, errno, GD_MSG_DICT_CREATE_FAIL,
+                    NULL);
+            ret = -1;
+            goto out;
+        }
+
+        ret = dict_unserialize(cli_req.dict.dict_val, cli_req.dict.dict_len,
+                               &dict);
+        if (ret < 0) {
+            gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_DICT_UNSERIALIZE_FAIL,
+                   "failed to "
+                   "unserialize req-buffer to dictionary");
+            goto out;
+        } else {
+            /* presumably lets the dict release the XDR-allocated buffer with
+             * free() when it is destroyed - confirm against dict.h */
+            dict->extra_stdfree = cli_req.dict.dict_val;
+        }
+    }
+
+    ret = dict_get_int32n(dict, "flags", SLEN("flags"), &flags);
+    if (ret) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_FLAGS_NOTFOUND_IN_DICT,
+               "failed to get flags");
+        goto out;
+    }
+    ret = glusterd_get_volumes(req, dict, flags);
+
+out:
+    if (dict)
+        dict_unref(dict);
+
+    glusterd_friend_sm();
+    glusterd_op_sm();
+
+    return ret;
+}
+
+/* RPC entry point for the CLI "get volume" request: passes req and the
+ * __glusterd_handle_cli_get_volume worker to glusterd_big_locked_handler()
+ * and returns that call's result unchanged. */
+int
+glusterd_handle_cli_get_volume(rpcsvc_request_t *req)
+{
+    int status = -1;
+
+    status = glusterd_big_locked_handler(req, __glusterd_handle_cli_get_volume);
+    return status;
+}
+
+/*
+ * Handle the CLI request to reset this node's uuid.
+ *
+ * The reset is refused when priv->volumes or priv->peers is non-empty, and is
+ * reported as failed when the uuid is unchanged after
+ * glusterd_uuid_generate_save(). A gf_cli_rsp is always sent through
+ * glusterd_to_cli(); on failure it carries op_ret = -1 and msg_str (or
+ * "Operation failed" when no specific message was set).
+ *
+ * Always returns 0: a non-zero ret is cleared once the error has been placed
+ * in the response.
+ */
+int
+__glusterd_handle_cli_uuid_reset(rpcsvc_request_t *req)
+{
+    int ret = -1;
+    dict_t *dict = NULL;
+    xlator_t *this = NULL;
+    glusterd_conf_t *priv = NULL;
+    uuid_t uuid = {0};
+    gf_cli_rsp rsp = {
+        0,
+    };
+    gf_cli_req cli_req = {{
+        0,
+    }};
+    char msg_str[128] = {
+        0,
+    };
+
+    GF_ASSERT(req);
+
+    this = THIS;
+    priv = this->private;
+    GF_ASSERT(priv);
+
+    ret = xdr_to_generic(req->msg[0], &cli_req, (xdrproc_t)xdr_gf_cli_req);
+    if (ret < 0) {
+        // failed to decode msg;
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_REQ_DECODE_FAIL,
+               "Failed to decode "
+               "request received from cli");
+        req->rpc_err = GARBAGE_ARGS;
+        goto out;
+    }
+
+    gf_msg_debug("glusterd", 0, "Received uuid reset req");
+
+    if (cli_req.dict.dict_len) {
+        /* Unserialize the dictionary */
+        dict = dict_new();
+        if (!dict) {
+            /* Report the allocation failure as such, matching the other
+             * handlers in this file, instead of passing a NULL dict on to
+             * dict_unserialize(). */
+            gf_smsg(this->name, GF_LOG_ERROR, errno, GD_MSG_DICT_CREATE_FAIL,
+                    NULL);
+            ret = -1;
+            goto out;
+        }
+
+        ret = dict_unserialize(cli_req.dict.dict_val, cli_req.dict.dict_len,
+                               &dict);
+        if (ret < 0) {
+            gf_msg("glusterd", GF_LOG_ERROR, 0, GD_MSG_DICT_UNSERIALIZE_FAIL,
+                   "failed to "
+                   "unserialize req-buffer to dictionary");
+            snprintf(msg_str, sizeof(msg_str),
+                     "Unable to decode "
+                     "the buffer");
+            goto out;
+        } else {
+            /* presumably lets the dict release the XDR-allocated buffer with
+             * free() when it is destroyed - confirm against dict.h */
+            dict->extra_stdfree = cli_req.dict.dict_val;
+        }
+    }
+
+    /* In the above section if dict_unserialize is successful, ret is set
+     * to zero.
+     */
+    ret = -1;
+    // Do not allow peer reset if there are any volumes in the cluster
+    if (!cds_list_empty(&priv->volumes)) {
+        snprintf(msg_str, sizeof(msg_str),
+                 "volumes are already "
+                 "present in the cluster. Resetting uuid is not "
+                 "allowed");
+        gf_msg(this->name, GF_LOG_WARNING, 0, GD_MSG_VOLS_ALREADY_PRESENT, "%s",
+               msg_str);
+        goto out;
+    }
+
+    // Do not allow peer reset if trusted storage pool is already formed
+    if (!cds_list_empty(&priv->peers)) {
+        snprintf(msg_str, sizeof(msg_str),
+                 "trusted storage pool "
+                 "has been already formed. Please detach this peer "
+                 "from the pool and reset its uuid.");
+        gf_msg(this->name, GF_LOG_WARNING, 0, GD_MSG_TSP_ALREADY_FORMED, "%s",
+               msg_str);
+        goto out;
+    }
+
+    /* Remember the current uuid so the regenerated one can be compared */
+    gf_uuid_copy(uuid, priv->uuid);
+    ret = glusterd_uuid_generate_save();
+
+    if (!gf_uuid_compare(uuid, MY_UUID)) {
+        snprintf(msg_str, sizeof(msg_str),
+                 "old uuid and the new uuid"
+                 " are same. Try gluster peer reset again");
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_UUIDS_SAME_RETRY, "%s",
+               msg_str);
+        ret = -1;
+        goto out;
+    }
+
+out:
+    if (ret) {
+        rsp.op_ret = -1;
+        if (msg_str[0] == '\0')
+            snprintf(msg_str, sizeof(msg_str),
+                     "Operation "
+                     "failed");
+        rsp.op_errstr = msg_str;
+        /* the failure travels in rsp; the handler itself reports success */
+        ret = 0;
+    } else {
+        rsp.op_errstr = "";
+    }
+
+    glusterd_to_cli(req, &rsp, NULL, 0, NULL, (xdrproc_t)xdr_gf_cli_rsp, dict);
+
+    return ret;
+}
+
+/* RPC entry point for the CLI uuid-reset request: passes req and the
+ * __glusterd_handle_cli_uuid_reset worker to glusterd_big_locked_handler()
+ * and returns that call's result unchanged. */
+int
+glusterd_handle_cli_uuid_reset(rpcsvc_request_t *req)
+{
+    int status = -1;
+
+    status = glusterd_big_locked_handler(req, __glusterd_handle_cli_uuid_reset);
+    return status;
+}
+
+/*
+ * Handle the CLI request for this node's uuid.
+ *
+ * Decodes the gf_cli_req in req->msg[0] and unserializes its optional
+ * dictionary, then builds a response dictionary holding MY_UUID (formatted
+ * with uuid_utoa_r) under the key "uuid", serializes it into rsp.dict and
+ * sends the gf_cli_rsp through glusterd_to_cli(). On any failure the response
+ * carries op_ret = -1 and err_str (or "Operation failed" when no specific
+ * message was set).
+ *
+ * Always returns 0; failures are reported only inside the response.
+ */
+int
+__glusterd_handle_cli_uuid_get(rpcsvc_request_t *req)
+{
+    int ret = -1;
+    dict_t *dict = NULL;
+    dict_t *rsp_dict = NULL;
+    xlator_t *this = NULL;
+    glusterd_conf_t *priv = NULL;
+    gf_cli_rsp rsp = {
+        0,
+    };
+    gf_cli_req cli_req = {{
+        0,
+    }};
+    char err_str[64] = {
+        0,
+    };
+    char uuid_str[64] = {
+        0,
+    };
+
+    GF_ASSERT(req);
+
+    this = THIS;
+    priv = this->private;
+    GF_ASSERT(priv);
+
+    ret = xdr_to_generic(req->msg[0], &cli_req, (xdrproc_t)xdr_gf_cli_req);
+    if (ret < 0) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_REQ_DECODE_FAIL,
+               "Failed to decode "
+               "request received from cli");
+        req->rpc_err = GARBAGE_ARGS;
+        goto out;
+    }
+
+    gf_msg_debug("glusterd", 0, "Received uuid get req");
+
+    if (cli_req.dict.dict_len) {
+        dict = dict_new();
+        if (!dict) {
+            gf_smsg(this->name, GF_LOG_ERROR, errno, GD_MSG_DICT_CREATE_FAIL,
+                    NULL);
+            ret = -1;
+            goto out;
+        }
+
+        ret = dict_unserialize(cli_req.dict.dict_val, cli_req.dict.dict_len,
+                               &dict);
+        if (ret < 0) {
+            gf_msg("glusterd", GF_LOG_ERROR, 0, GD_MSG_DICT_UNSERIALIZE_FAIL,
+                   "failed to "
+                   "unserialize req-buffer to dictionary");
+            snprintf(err_str, sizeof(err_str),
+                     "Unable to decode "
+                     "the buffer");
+            goto out;
+
+        } else {
+            /* presumably lets the dict release the XDR-allocated buffer with
+             * free() when it is destroyed - confirm against dict.h */
+            dict->extra_stdfree = cli_req.dict.dict_val;
+        }
+    }
+
+    rsp_dict = dict_new();
+    if (!rsp_dict) {
+        gf_smsg(this->name, GF_LOG_ERROR, errno, GD_MSG_DICT_CREATE_FAIL, NULL);
+        ret = -1;
+        goto out;
+    }
+
+    /* uuid_str is a stack buffer; rsp_dict is serialized and unref'd before
+     * this function returns, so it does not outlive the buffer. */
+    uuid_utoa_r(MY_UUID, uuid_str);
+    ret = dict_set_strn(rsp_dict, "uuid", SLEN("uuid"), uuid_str);
+    if (ret) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_DICT_SET_FAILED,
+               "Failed to set uuid in "
+               "dictionary.");
+        goto out;
+    }
+
+    ret = dict_allocate_and_serialize(rsp_dict, &rsp.dict.dict_val,
+                                      &rsp.dict.dict_len);
+    if (ret) {
+        gf_smsg(this->name, GF_LOG_ERROR, errno,
+                GD_MSG_DICT_ALLOC_AND_SERL_LENGTH_GET_FAIL, NULL);
+        goto out;
+    }
+    ret = 0;
+out:
+    if (ret) {
+        rsp.op_ret = -1;
+        if (err_str[0] == '\0')
+            snprintf(err_str, sizeof(err_str),
+                     "Operation "
+                     "failed");
+        rsp.op_errstr = err_str;
+
+    } else {
+        rsp.op_errstr = "";
+    }
+
+    /* The request dict is handed to glusterd_to_cli() and is not unref'd
+     * here. NOTE(review): presumably the callee releases it - confirm. */
+    glusterd_to_cli(req, &rsp, NULL, 0, NULL, (xdrproc_t)xdr_gf_cli_rsp, dict);
+
+    if (rsp_dict)
+        dict_unref(rsp_dict);
+    /* serialized copy of rsp_dict is no longer needed once the reply is out */
+    GF_FREE(rsp.dict.dict_val);
+
+    return 0;
+}
+/* RPC entry point for the CLI uuid-get request: passes req and the
+ * __glusterd_handle_cli_uuid_get worker to glusterd_big_locked_handler() and
+ * returns that call's result unchanged. */
+int
+glusterd_handle_cli_uuid_get(rpcsvc_request_t *req)
+{
+    int status = -1;
+
+    status = glusterd_big_locked_handler(req, __glusterd_handle_cli_uuid_get);
+    return status;
+}
+
+/*
+ * Handle the CLI "list volume" request.
+ *
+ * Builds a dictionary with one "volume<N>" key per entry of priv->volumes
+ * (value: the volume name) plus a "count" key, serializes it into rsp.dict and
+ * replies with glusterd_submit_reply(). rsp.op_ret carries the result of the
+ * build; on failure op_errstr is "Error listing volumes".
+ *
+ * The request payload itself is not decoded here. Always returns 0, since the
+ * outcome is reported in the reply.
+ */
+int
+__glusterd_handle_cli_list_volume(rpcsvc_request_t *req)
+{
+    int ret = -1;
+    dict_t *dict = NULL;
+    glusterd_conf_t *priv = NULL;
+    glusterd_volinfo_t *volinfo = NULL;
+    int count = 0;
+    char key[64] = {
+        0,
+    };
+    int keylen;
+    gf_cli_rsp rsp = {
+        0,
+    };
+
+    GF_ASSERT(req);
+
+    priv = THIS->private;
+    GF_ASSERT(priv);
+
+    dict = dict_new();
+    if (!dict) {
+        gf_smsg("glusterd", GF_LOG_ERROR, errno, GD_MSG_DICT_CREATE_FAIL, NULL);
+        goto out;
+    }
+
+    /* volume0, volume1, ... -> volume names, in list order */
+    cds_list_for_each_entry(volinfo, &priv->volumes, vol_list)
+    {
+        keylen = snprintf(key, sizeof(key), "volume%d", count);
+        ret = dict_set_strn(dict, key, keylen, volinfo->volname);
+        if (ret)
+            goto out;
+        count++;
+    }
+
+    ret = dict_set_int32n(dict, "count", SLEN("count"), count);
+    if (ret) {
+        gf_smsg("glusterd", GF_LOG_ERROR, errno, GD_MSG_DICT_SET_FAILED,
+                "Key=count", NULL);
+        goto out;
+    }
+
+    ret = dict_allocate_and_serialize(dict, &rsp.dict.dict_val,
+                                      &rsp.dict.dict_len);
+    if (ret)
+        goto out;
+
+    ret = 0;
+
+out:
+    rsp.op_ret = ret;
+    if (ret)
+        rsp.op_errstr = "Error listing volumes";
+    else
+        rsp.op_errstr = "";
+
+    glusterd_submit_reply(req, &rsp, NULL, 0, NULL, (xdrproc_t)xdr_gf_cli_rsp);
+    /* a reply has been submitted above, so report success to the caller */
+    ret = 0;
+
+    if (dict)
+        dict_unref(dict);
+
+    GF_FREE(rsp.dict.dict_val);
+
+    glusterd_friend_sm();
+    glusterd_op_sm();
+
+    return ret;
+}
+
+/* RPC entry point for the CLI "list volume" request: passes req and the
+ * __glusterd_handle_cli_list_volume worker to glusterd_big_locked_handler()
+ * and returns that call's result unchanged. */
+int
+glusterd_handle_cli_list_volume(rpcsvc_request_t *req)
+{
+    int status = -1;
+
+    status = glusterd_big_locked_handler(req,
+                                         __glusterd_handle_cli_list_volume);
+    return status;
+}
+
+/* Thin forwarder: starts an operation by calling glusterd_op_txn_begin()
+ * with the same arguments and returns its result. */
+int32_t
+glusterd_op_begin(rpcsvc_request_t *req, glusterd_op_t op, void *ctx,
+                  char *err_str, size_t err_len)
+{
+    return glusterd_op_txn_begin(req, op, ctx, err_str, err_len);
+}
+
+/*
+ * Handle the CLI ganesha command (GD_OP_GANESHA).
+ *
+ * Decodes the gf_cli_req in req->msg[0], unserializes its dictionary and
+ * starts the operation with glusterd_op_begin_synctask(). When any step
+ * fails, an error response with err_str (or "Operation failed") is sent via
+ * glusterd_op_send_cli_response() and its result becomes the return value.
+ *
+ * Returns 0 when the synctask call succeeded, otherwise the result of sending
+ * the error response.
+ */
+int
+__glusterd_handle_ganesha_cmd(rpcsvc_request_t *req)
+{
+    int32_t ret = -1;
+    gf_cli_req cli_req = {{
+        0,
+    }};
+    dict_t *dict = NULL;
+    glusterd_op_t cli_op = GD_OP_GANESHA;
+    char err_str[2048] = {
+        0,
+    };
+    xlator_t *this = NULL;
+
+    this = THIS;
+    GF_ASSERT(this);
+
+    GF_ASSERT(req);
+
+    ret = xdr_to_generic(req->msg[0], &cli_req, (xdrproc_t)xdr_gf_cli_req);
+    if (ret < 0) {
+        snprintf(err_str, sizeof(err_str),
+                 "Failed to decode "
+                 "request received from cli");
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_REQ_DECODE_FAIL, "%s",
+               err_str);
+        req->rpc_err = GARBAGE_ARGS;
+        goto out;
+    }
+
+    if (cli_req.dict.dict_len) {
+        /* Unserialize the dictionary */
+        dict = dict_new();
+        if (!dict) {
+            gf_smsg(this->name, GF_LOG_ERROR, errno, GD_MSG_DICT_CREATE_FAIL,
+                    NULL);
+            ret = -1;
+            goto out;
+        }
+
+        ret = dict_unserialize(cli_req.dict.dict_val, cli_req.dict.dict_len,
+                               &dict);
+        if (ret < 0) {
+            gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_DICT_UNSERIALIZE_FAIL,
+                   "failed to "
+                   "unserialize req-buffer to dictionary");
+            snprintf(err_str, sizeof(err_str),
+                     "Unable to decode "
+                     "the command");
+            goto out;
+        } else {
+            /* presumably lets the dict release the XDR-allocated buffer with
+             * free() when it is destroyed - confirm against dict.h */
+            dict->extra_stdfree = cli_req.dict.dict_val;
+        }
+    }
+
+    gf_msg_trace(this->name, 0, "Received global option request");
+
+    ret = glusterd_op_begin_synctask(req, GD_OP_GANESHA, dict);
+out:
+    if (ret) {
+        if (err_str[0] == '\0')
+            snprintf(err_str, sizeof(err_str), "Operation failed");
+        ret = glusterd_op_send_cli_response(cli_op, ret, 0, req, dict, err_str);
+    }
+    /* NOTE(review): unlike the reset/set volume handlers, this one drops its
+     * dict reference after handing dict to the synctask / response helpers;
+     * confirm those callees do not release the same reference. */
+    if (dict)
+        dict_unref(dict);
+
+    return ret;
+}
+
+/* RPC entry point for the CLI ganesha command: passes req and the
+ * __glusterd_handle_ganesha_cmd worker to glusterd_big_locked_handler() and
+ * returns that call's result unchanged. */
+int
+glusterd_handle_ganesha_cmd(rpcsvc_request_t *req)
+{
+    int status = -1;
+
+    status = glusterd_big_locked_handler(req, __glusterd_handle_ganesha_cmd);
+    return status;
+}
+
+/*
+ * Handle the CLI "volume reset" request (GD_OP_RESET_VOLUME).
+ *
+ * Decodes the gf_cli_req in req->msg[0], unserializes its dictionary, checks
+ * that "volname" is present and starts the operation with
+ * glusterd_op_begin_synctask(). When any step fails, an error response with
+ * err_str (or "Operation failed") is sent via
+ * glusterd_op_send_cli_response() and its result becomes the return value.
+ *
+ * Returns 0 when the synctask call succeeded, otherwise the result of sending
+ * the error response.
+ */
+static int
+__glusterd_handle_reset_volume(rpcsvc_request_t *req)
+{
+    int32_t ret = -1;
+    gf_cli_req cli_req = {{
+        0,
+    }};
+    dict_t *dict = NULL;
+    glusterd_op_t cli_op = GD_OP_RESET_VOLUME;
+    char *volname = NULL;
+    char err_str[64] = {
+        0,
+    };
+    xlator_t *this = NULL;
+
+    GF_ASSERT(req);
+    this = THIS;
+    GF_ASSERT(this);
+
+    gf_msg(this->name, GF_LOG_INFO, 0, 0, "Received reset vol req");
+
+    ret = xdr_to_generic(req->msg[0], &cli_req, (xdrproc_t)xdr_gf_cli_req);
+    if (ret < 0) {
+        snprintf(err_str, sizeof(err_str),
+                 "Failed to decode request "
+                 "received from cli");
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_REQ_DECODE_FAIL, "%s",
+               err_str);
+        req->rpc_err = GARBAGE_ARGS;
+        goto out;
+    }
+
+    if (cli_req.dict.dict_len) {
+        /* Unserialize the dictionary */
+        dict = dict_new();
+        if (!dict) {
+            /* Report the allocation failure as such, matching the other
+             * handlers in this file, instead of passing a NULL dict on to
+             * dict_unserialize(). */
+            gf_smsg(this->name, GF_LOG_ERROR, errno, GD_MSG_DICT_CREATE_FAIL,
+                    NULL);
+            ret = -1;
+            goto out;
+        }
+
+        ret = dict_unserialize(cli_req.dict.dict_val, cli_req.dict.dict_len,
+                               &dict);
+        if (ret < 0) {
+            gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_DICT_UNSERIALIZE_FAIL,
+                   "failed to "
+                   "unserialize req-buffer to dictionary");
+            snprintf(err_str, sizeof(err_str),
+                     "Unable to decode "
+                     "the command");
+            goto out;
+        } else {
+            /* presumably lets the dict release the XDR-allocated buffer with
+             * free() when it is destroyed - confirm against dict.h */
+            dict->extra_stdfree = cli_req.dict.dict_val;
+        }
+    }
+
+    ret = dict_get_strn(dict, "volname", SLEN("volname"), &volname);
+    if (ret) {
+        snprintf(err_str, sizeof(err_str),
+                 "Failed to get volume "
+                 "name");
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_VOLNAME_NOTFOUND_IN_DICT,
+               "%s", err_str);
+        goto out;
+    }
+    gf_msg_debug(this->name, 0,
+                 "Received volume reset request for "
+                 "volume %s",
+                 volname);
+
+    ret = glusterd_op_begin_synctask(req, GD_OP_RESET_VOLUME, dict);
+
+out:
+    if (ret) {
+        if (err_str[0] == '\0')
+            snprintf(err_str, sizeof(err_str), "Operation failed");
+        ret = glusterd_op_send_cli_response(cli_op, ret, 0, req, dict, err_str);
+    }
+
+    return ret;
+}
+
+/* RPC entry point for the CLI "volume reset" request: passes req and the
+ * __glusterd_handle_reset_volume worker to glusterd_big_locked_handler() and
+ * returns that call's result unchanged. */
+int
+glusterd_handle_reset_volume(rpcsvc_request_t *req)
+{
+    int status = -1;
+
+    status = glusterd_big_locked_handler(req, __glusterd_handle_reset_volume);
+    return status;
+}
+
+/*
+ * Handle the CLI "volume set" request (GD_OP_SET_VOLUME).
+ *
+ * Decodes the gf_cli_req in req->msg[0] and unserializes its dictionary.
+ * When "volname" is "help" or "help-xml" the request is answered directly
+ * from glusterd_volset_help(), whatever its result. Otherwise "key1" and
+ * "value1" must be present and the operation is started with
+ * glusterd_op_begin_synctask(). On failure an error response with err_str
+ * (or "Operation failed") is sent.
+ *
+ * Returns the result of glusterd_op_send_cli_response() when a response was
+ * sent from here, otherwise 0.
+ */
+int
+__glusterd_handle_set_volume(rpcsvc_request_t *req)
+{
+    int32_t ret = -1;
+    gf_cli_req cli_req = {{
+        0,
+    }};
+    dict_t *dict = NULL;
+    glusterd_op_t cli_op = GD_OP_SET_VOLUME;
+    char *key = NULL;
+    char *value = NULL;
+    char *volname = NULL;
+    char *op_errstr = NULL;
+    gf_boolean_t help = _gf_false;
+    char err_str[2048] = {
+        0,
+    };
+    xlator_t *this = NULL;
+
+    this = THIS;
+    GF_ASSERT(this);
+
+    GF_ASSERT(req);
+
+    ret = xdr_to_generic(req->msg[0], &cli_req, (xdrproc_t)xdr_gf_cli_req);
+    if (ret < 0) {
+        snprintf(err_str, sizeof(err_str),
+                 "Failed to decode "
+                 "request received from cli");
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_REQ_DECODE_FAIL, "%s",
+               err_str);
+        req->rpc_err = GARBAGE_ARGS;
+        goto out;
+    }
+
+    if (cli_req.dict.dict_len) {
+        /* Unserialize the dictionary */
+        dict = dict_new();
+        if (!dict) {
+            /* Report the allocation failure as such, matching the other
+             * handlers in this file, instead of passing a NULL dict on to
+             * dict_unserialize(). */
+            gf_smsg(this->name, GF_LOG_ERROR, errno, GD_MSG_DICT_CREATE_FAIL,
+                    NULL);
+            ret = -1;
+            goto out;
+        }
+
+        ret = dict_unserialize(cli_req.dict.dict_val, cli_req.dict.dict_len,
+                               &dict);
+        if (ret < 0) {
+            gf_msg(this->name, GF_LOG_ERROR, errno,
+                   GD_MSG_DICT_UNSERIALIZE_FAIL,
+                   "failed to "
+                   "unserialize req-buffer to dictionary");
+            snprintf(err_str, sizeof(err_str),
+                     "Unable to decode "
+                     "the command");
+            goto out;
+        } else {
+            /* presumably lets the dict release the XDR-allocated buffer with
+             * free() when it is destroyed - confirm against dict.h */
+            dict->extra_stdfree = cli_req.dict.dict_val;
+        }
+    }
+
+    ret = dict_get_strn(dict, "volname", SLEN("volname"), &volname);
+    if (ret) {
+        snprintf(err_str, sizeof(err_str),
+                 "Failed to get volume "
+                 "name while handling volume set command");
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_DICT_GET_FAILED, "%s",
+               err_str);
+        goto out;
+    }
+
+    /* "help" / "help-xml" in place of a volume name asks for option help */
+    if (strcmp(volname, "help") == 0 || strcmp(volname, "help-xml") == 0) {
+        ret = glusterd_volset_help(dict, &op_errstr);
+        help = _gf_true;
+        goto out;
+    }
+
+    ret = dict_get_strn(dict, "key1", SLEN("key1"), &key);
+    if (ret) {
+        snprintf(err_str, sizeof(err_str),
+                 "Failed to get key while"
+                 " handling volume set for %s",
+                 volname);
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_DICT_GET_FAILED, "%s",
+               err_str);
+        goto out;
+    }
+
+    ret = dict_get_strn(dict, "value1", SLEN("value1"), &value);
+    if (ret) {
+        snprintf(err_str, sizeof(err_str),
+                 "Failed to get value while"
+                 " handling volume set for %s",
+                 volname);
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_DICT_GET_FAILED, "%s",
+               err_str);
+        goto out;
+    }
+    gf_msg_debug(this->name, 0,
+                 "Received volume set request for "
+                 "volume %s",
+                 volname);
+
+    ret = glusterd_op_begin_synctask(req, GD_OP_SET_VOLUME, dict);
+
+out:
+    /* help requests are always answered here, success or failure */
+    if (help)
+        ret = glusterd_op_send_cli_response(cli_op, ret, 0, req, dict,
+                                            (op_errstr) ? op_errstr : "");
+    else if (ret) {
+        if (err_str[0] == '\0')
+            snprintf(err_str, sizeof(err_str), "Operation failed");
+        ret = glusterd_op_send_cli_response(cli_op, ret, 0, req, dict, err_str);
+    }
+    if (op_errstr)
+        GF_FREE(op_errstr);
+
+    return ret;
+}
+
+/* RPC entry point for the CLI "volume set" request: passes req and the
+ * __glusterd_handle_set_volume worker to glusterd_big_locked_handler() and
+ * returns that call's result unchanged. */
+int
+glusterd_handle_set_volume(rpcsvc_request_t *req)
+{
+    int status = -1;
+
+    status = glusterd_big_locked_handler(req, __glusterd_handle_set_volume);
+    return status;
+}
+
+/*
+ * Handle the CLI "volume sync" request (GD_OP_SYNC_VOLUME).
+ *
+ * Decodes the gf_cli_req in req->msg[0], unserializes its dictionary and
+ * requires "hostname" plus either "volname" or "flags". Syncing from a local
+ * address is rejected. Otherwise the operation is started with
+ * glusterd_op_begin_synctask().
+ *
+ * On failure a gf_cli_rsp with op_ret = -1 and msg (or "Operation failed") is
+ * sent through glusterd_to_cli() and ret is cleared, so this function always
+ * returns 0 once a reply has been sent or the synctask call succeeded.
+ */
+int
+__glusterd_handle_sync_volume(rpcsvc_request_t *req)
+{
+    int32_t ret = -1;
+    gf_cli_req cli_req = {{
+        0,
+    }};
+    dict_t *dict = NULL;
+    /* was "{0.}": a floating-point literal typed in place of "0," */
+    gf_cli_rsp cli_rsp = {
+        0,
+    };
+    char msg[2048] = {
+        0,
+    };
+    char *volname = NULL;
+    gf1_cli_sync_volume flags = 0;
+    char *hostname = NULL;
+    xlator_t *this = NULL;
+
+    GF_ASSERT(req);
+    this = THIS;
+    GF_ASSERT(this);
+
+    ret = xdr_to_generic(req->msg[0], &cli_req, (xdrproc_t)xdr_gf_cli_req);
+    if (ret < 0) {
+        // failed to decode msg;
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_REQ_DECODE_FAIL, "%s",
+               "Failed to decode "
+               "request received from cli");
+        req->rpc_err = GARBAGE_ARGS;
+        goto out;
+    }
+
+    if (cli_req.dict.dict_len) {
+        /* Unserialize the dictionary */
+        dict = dict_new();
+        if (!dict) {
+            /* Report the allocation failure as such, matching the other
+             * handlers in this file, instead of passing a NULL dict on to
+             * dict_unserialize(). */
+            gf_smsg(this->name, GF_LOG_ERROR, errno, GD_MSG_DICT_CREATE_FAIL,
+                    NULL);
+            ret = -1;
+            goto out;
+        }
+
+        ret = dict_unserialize(cli_req.dict.dict_val, cli_req.dict.dict_len,
+                               &dict);
+        if (ret < 0) {
+            gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_DICT_UNSERIALIZE_FAIL,
+                   "failed to "
+                   "unserialize req-buffer to dictionary");
+            snprintf(msg, sizeof(msg),
+                     "Unable to decode the "
+                     "command");
+            goto out;
+        } else {
+            /* presumably lets the dict release the XDR-allocated buffer with
+             * free() when it is destroyed - confirm against dict.h */
+            dict->extra_stdfree = cli_req.dict.dict_val;
+        }
+    }
+
+    ret = dict_get_strn(dict, "hostname", SLEN("hostname"), &hostname);
+    if (ret) {
+        snprintf(msg, sizeof(msg), "Failed to get hostname");
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_HOSTNAME_NOTFOUND_IN_DICT,
+               "%s", msg);
+        goto out;
+    }
+
+    /* Either a volume name or a flags word must be present */
+    ret = dict_get_strn(dict, "volname", SLEN("volname"), &volname);
+    if (ret) {
+        ret = dict_get_int32n(dict, "flags", SLEN("flags"), (int32_t *)&flags);
+        if (ret) {
+            snprintf(msg, sizeof(msg), "Failed to get volume name or flags");
+            gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_FLAGS_NOTFOUND_IN_DICT,
+                   "%s", msg);
+            goto out;
+        }
+    }
+
+    /* volname stays NULL when only "flags" was supplied; never hand a NULL
+     * pointer to a %s conversion. */
+    gf_msg(this->name, GF_LOG_INFO, 0, GD_MSG_VOL_SYNC_REQ_RCVD,
+           "Received volume sync req "
+           "for volume %s",
+           (flags & GF_CLI_SYNC_ALL) ? "all" : (volname ? volname : "(null)"));
+
+    if (gf_is_local_addr(hostname)) {
+        ret = -1;
+        snprintf(msg, sizeof(msg),
+                 "sync from localhost"
+                 " not allowed");
+        gf_msg(this->name, GF_LOG_ERROR, 0,
+               GD_MSG_SYNC_FROM_LOCALHOST_UNALLOWED, "%s", msg);
+        goto out;
+    }
+
+    ret = glusterd_op_begin_synctask(req, GD_OP_SYNC_VOLUME, dict);
+
+out:
+    if (ret) {
+        cli_rsp.op_ret = -1;
+        cli_rsp.op_errstr = msg;
+        if (msg[0] == '\0')
+            snprintf(msg, sizeof(msg), "Operation failed");
+        glusterd_to_cli(req, &cli_rsp, NULL, 0, NULL, (xdrproc_t)xdr_gf_cli_rsp,
+                        dict);
+
+        ret = 0;  // sent error to cli, prevent second reply
+    }
+
+    return ret;
+}
+
+/* RPC entry point for the CLI "volume sync" request: passes req and the
+ * __glusterd_handle_sync_volume worker to glusterd_big_locked_handler() and
+ * returns that call's result unchanged. */
+int
+glusterd_handle_sync_volume(rpcsvc_request_t *req)
+{
+    int status = -1;
+
+    status = glusterd_big_locked_handler(req, __glusterd_handle_sync_volume);
+    return status;
+}
+
+/*
+ * Send the reply to an fsm-log request.
+ *
+ * req       - request being answered
+ * op_ret    - result to report; when 0, dict is serialized into the reply
+ * op_errstr - error text for the reply (must not be NULL)
+ * dict      - log entries to serialize; only read when op_ret is 0
+ *
+ * If serializing dict fails, the reply is still sent, downgraded to an error,
+ * so the request is always answered. Previously this path returned without
+ * replying while the caller reported success to the RPC layer, leaving the
+ * request unanswered.
+ *
+ * Always returns 0; the submit result is only logged.
+ */
+int
+glusterd_fsm_log_send_resp(rpcsvc_request_t *req, int op_ret, char *op_errstr,
+                           dict_t *dict)
+{
+    int ret = -1;
+    gf1_cli_fsm_log_rsp rsp = {0};
+
+    GF_ASSERT(req);
+    GF_ASSERT(op_errstr);
+
+    rsp.op_ret = op_ret;
+    rsp.op_errstr = op_errstr;
+    if (rsp.op_ret == 0) {
+        ret = dict_allocate_and_serialize(dict, &rsp.fsm_log.fsm_log_val,
+                                          &rsp.fsm_log.fsm_log_len);
+        if (ret < 0) {
+            gf_smsg("glusterd", GF_LOG_ERROR, errno,
+                    GD_MSG_DICT_ALLOC_AND_SERL_LENGTH_GET_FAIL, NULL);
+            /* Turn the reply into an error instead of dropping it */
+            rsp.op_ret = -1;
+            rsp.op_errstr = "Failed to serialize fsm log";
+            rsp.fsm_log.fsm_log_len = 0;
+        }
+    }
+
+    ret = glusterd_submit_reply(req, &rsp, NULL, 0, NULL,
+                                (xdrproc_t)xdr_gf1_cli_fsm_log_rsp);
+    GF_FREE(rsp.fsm_log.fsm_log_val);
+
+    gf_msg_debug("glusterd", 0, "Responded, ret: %d", ret);
+
+    return 0;
+}
+
+/*
+ * Handle the CLI fsm-log request.
+ *
+ * Decodes a gf1_cli_fsm_log_req from req->msg[0]. An empty cli_req.name
+ * selects conf->op_sm_log; any other name is looked up as a peer hostname and
+ * that peer's sm_log is used. The selected log is added to a fresh dictionary
+ * with glusterd_sm_tr_log_add_to_dict() and the outcome (ret, msg, dict) is
+ * passed to glusterd_fsm_log_send_resp().
+ *
+ * Always returns 0; the friend and op state machines are run on every path.
+ */
+int
+__glusterd_handle_fsm_log(rpcsvc_request_t *req)
+{
+    int32_t ret = -1;
+    gf1_cli_fsm_log_req cli_req = {
+        0,
+    };
+    dict_t *dict = NULL;
+    xlator_t *this = NULL;
+    glusterd_conf_t *conf = NULL;
+    char msg[2048] = {0};
+    glusterd_peerinfo_t *peerinfo = NULL;
+
+    GF_ASSERT(req);
+
+    this = THIS;
+    GF_VALIDATE_OR_GOTO("xlator", (this != NULL), out);
+
+    ret = xdr_to_generic(req->msg[0], &cli_req,
+                         (xdrproc_t)xdr_gf1_cli_fsm_log_req);
+    if (ret < 0) {
+        // failed to decode msg;
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_REQ_DECODE_FAIL,
+               "Failed to decode "
+               "request received from client.");
+        req->rpc_err = GARBAGE_ARGS;
+        snprintf(msg, sizeof(msg), "Garbage request");
+        goto out;
+    }
+
+    dict = dict_new();
+    if (!dict) {
+        gf_smsg(this->name, GF_LOG_ERROR, errno, GD_MSG_DICT_CREATE_FAIL, NULL);
+        ret = -1;
+        goto out;
+    }
+
+    if (strcmp("", cli_req.name) == 0) {
+        /* no name given: report the op state machine's log */
+        conf = this->private;
+        ret = glusterd_sm_tr_log_add_to_dict(dict, &conf->op_sm_log);
+    } else {
+        /* peerinfo is only dereferenced inside the RCU read-side section */
+        RCU_READ_LOCK;
+
+        peerinfo = glusterd_peerinfo_find_by_hostname(cli_req.name);
+        if (!peerinfo) {
+            RCU_READ_UNLOCK;
+            ret = -1;
+            snprintf(msg, sizeof(msg), "%s is not a peer", cli_req.name);
+        } else {
+            ret = glusterd_sm_tr_log_add_to_dict(dict, &peerinfo->sm_log);
+            RCU_READ_UNLOCK;
+        }
+    }
+
+out:
+    /* msg may still be empty here; it is passed as the error string as-is */
+    (void)glusterd_fsm_log_send_resp(req, ret, msg, dict);
+    free(cli_req.name);  // malloced by xdr
+    if (dict)
+        dict_unref(dict);
+
+    glusterd_friend_sm();
+    glusterd_op_sm();
+
+    return 0;  // send 0 to avoid double reply
+}
+
+/* RPC entry point for the CLI fsm-log request: passes req and the
+ * __glusterd_handle_fsm_log worker to glusterd_big_locked_handler() and
+ * returns that call's result unchanged. */
+int
+glusterd_handle_fsm_log(rpcsvc_request_t *req)
+{
+    int status = -1;
+
+    status = glusterd_big_locked_handler(req, __glusterd_handle_fsm_log);
+    return status;
+}
+
+/*
+ * Reply to a cluster lock request.
+ *
+ * The reply carries the uuid filled in by glusterd_get_uuid() and the
+ * caller-supplied status as op_ret. Always returns 0; the result of
+ * glusterd_submit_reply() is only written to the debug log.
+ */
+int
+glusterd_op_lock_send_resp(rpcsvc_request_t *req, int32_t status)
+{
+    int submit_ret = -1;
+    gd1_mgmt_cluster_lock_rsp lock_rsp = {{0}};
+
+    GF_ASSERT(req);
+
+    glusterd_get_uuid(&lock_rsp.uuid);
+    lock_rsp.op_ret = status;
+
+    submit_ret = glusterd_submit_reply(
+        req, &lock_rsp, NULL, 0, NULL, (xdrproc_t)xdr_gd1_mgmt_cluster_lock_rsp);
+    gf_msg_debug(THIS->name, 0, "Responded to lock, ret: %d", submit_ret);
+
+    return 0;
+}
+
+/*
+ * Reply to a cluster unlock request.
+ *
+ * The reply carries the caller-supplied status as op_ret and the uuid filled
+ * in by glusterd_get_uuid(). Returns the result of glusterd_submit_reply().
+ */
+int
+glusterd_op_unlock_send_resp(rpcsvc_request_t *req, int32_t status)
+{
+    int submit_ret = -1;
+    gd1_mgmt_cluster_unlock_rsp unlock_rsp = {{0}};
+
+    GF_ASSERT(req);
+
+    unlock_rsp.op_ret = status;
+    glusterd_get_uuid(&unlock_rsp.uuid);
+
+    submit_ret = glusterd_submit_reply(
+        req, &unlock_rsp, NULL, 0, NULL,
+        (xdrproc_t)xdr_gd1_mgmt_cluster_unlock_rsp);
+    gf_msg_debug(THIS->name, 0, "Responded to unlock, ret: %d", submit_ret);
+
+    return submit_ret;
+}
+
+/*
+ * Reply to a mgmt_v3 lock request.
+ *
+ * The reply carries the uuid filled in by glusterd_get_uuid(), the
+ * caller-supplied status as op_ret, the current errno as op_errno when status
+ * is non-zero, and a copy of *txn_id. Returns the result of
+ * glusterd_submit_reply().
+ */
+int
+glusterd_op_mgmt_v3_lock_send_resp(rpcsvc_request_t *req, uuid_t *txn_id,
+                                   int32_t status)
+{
+    int submit_ret = -1;
+    gd1_mgmt_v3_lock_rsp lock_rsp = {{0}};
+
+    GF_ASSERT(req);
+    GF_ASSERT(txn_id);
+
+    /* errno is sampled after glusterd_get_uuid(), exactly as before */
+    glusterd_get_uuid(&lock_rsp.uuid);
+    lock_rsp.op_ret = status;
+    if (lock_rsp.op_ret != 0) {
+        lock_rsp.op_errno = errno;
+    }
+    gf_uuid_copy(lock_rsp.txn_id, *txn_id);
+
+    submit_ret = glusterd_submit_reply(req, &lock_rsp, NULL, 0, NULL,
+                                       (xdrproc_t)xdr_gd1_mgmt_v3_lock_rsp);
+    gf_msg_debug(THIS->name, 0, "Responded to mgmt_v3 lock, ret: %d",
+                 submit_ret);
+
+    return submit_ret;
+}
+
+/*
+ * Reply to a mgmt_v3 unlock request.
+ *
+ * The reply carries the caller-supplied status as op_ret, the current errno
+ * as op_errno when status is non-zero, the uuid filled in by
+ * glusterd_get_uuid() and a copy of *txn_id. Returns the result of
+ * glusterd_submit_reply().
+ */
+int
+glusterd_op_mgmt_v3_unlock_send_resp(rpcsvc_request_t *req, uuid_t *txn_id,
+                                     int32_t status)
+{
+    int submit_ret = -1;
+    gd1_mgmt_v3_unlock_rsp unlock_rsp = {{0}};
+
+    GF_ASSERT(req);
+    GF_ASSERT(txn_id);
+
+    /* errno is sampled before glusterd_get_uuid(), exactly as before */
+    unlock_rsp.op_ret = status;
+    if (unlock_rsp.op_ret != 0) {
+        unlock_rsp.op_errno = errno;
+    }
+    glusterd_get_uuid(&unlock_rsp.uuid);
+    gf_uuid_copy(unlock_rsp.txn_id, *txn_id);
+
+    submit_ret = glusterd_submit_reply(req, &unlock_rsp, NULL, 0, NULL,
+                                       (xdrproc_t)xdr_gd1_mgmt_v3_unlock_rsp);
+    gf_msg_debug(THIS->name, 0, "Responded to mgmt_v3 unlock, ret: %d",
+                 submit_ret);
+
+    return submit_ret;
+}
+
+/*
+ * Handle a cluster UNLOCK request received from a peer.
+ *
+ * Decodes the request, verifies that the sending uuid belongs to a known
+ * peer, and injects a GD_OP_EVENT_UNLOCK event (carrying the request in a
+ * freshly allocated lock context) into the op state machine using the
+ * global transaction id.
+ *
+ * @req: the incoming RPC request (asserted non-NULL)
+ *
+ * Returns 0 on success and a negative value on failure. On a decode
+ * failure req->rpc_err is set to GARBAGE_ARGS. Except for the
+ * out-of-memory path, the friend and op state machines are run before
+ * returning.
+ */
+int
+__glusterd_handle_cluster_unlock(rpcsvc_request_t *req)
+{
+    gd1_mgmt_cluster_unlock_req unlock_req = {
+        {0},
+    };
+    int32_t ret = -1;
+    glusterd_op_lock_ctx_t *ctx = NULL;
+    xlator_t *this = NULL;
+    uuid_t *txn_id = NULL;
+    glusterd_conf_t *priv = NULL;
+
+    this = THIS;
+    GF_ASSERT(this);
+    priv = this->private;
+    GF_ASSERT(priv);
+    GF_ASSERT(req);
+
+    txn_id = &priv->global_txn_id;
+
+    ret = xdr_to_generic(req->msg[0], &unlock_req,
+                         (xdrproc_t)xdr_gd1_mgmt_cluster_unlock_req);
+    if (ret < 0) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_REQ_DECODE_FAIL,
+               "Failed to decode unlock "
+               "request received from peer");
+        req->rpc_err = GARBAGE_ARGS;
+        goto out;
+    }
+
+    gf_msg_debug(this->name, 0, "Received UNLOCK from uuid: %s",
+                 uuid_utoa(unlock_req.uuid));
+
+    /* Only the existence of the peer is needed, so the lookup result is
+     * reduced to a flag inside the read-side critical section, which must
+     * be closed again before continuing.
+     */
+    RCU_READ_LOCK;
+    ret = (glusterd_peerinfo_find_by_uuid(unlock_req.uuid) == NULL);
+    RCU_READ_UNLOCK;
+    if (ret) {
+        gf_msg(this->name, GF_LOG_WARNING, 0, GD_MSG_PEER_NOT_FOUND,
+               "%s doesn't "
+               "belong to the cluster. Ignoring request.",
+               uuid_utoa(unlock_req.uuid));
+        ret = -1;
+        goto out;
+    }
+
+    ctx = GF_CALLOC(1, sizeof(*ctx), gf_gld_mt_op_lock_ctx_t);
+
+    if (!ctx) {
+        // respond here
+        gf_msg(this->name, GF_LOG_ERROR, ENOMEM, GD_MSG_NO_MEMORY,
+               "No memory.");
+        return -1;
+    }
+    gf_uuid_copy(ctx->uuid, unlock_req.uuid);
+    ctx->req = req;
+    ctx->dict = NULL;
+
+    ret = glusterd_op_sm_inject_event(GD_OP_EVENT_UNLOCK, txn_id, ctx);
+
+out:
+    glusterd_friend_sm();
+    glusterd_op_sm();
+
+    return ret;
+}
+
+/* RPC entry point for cluster unlock requests: delegates to
+ * glusterd_big_locked_handler(), passing __glusterd_handle_cluster_unlock
+ * as the handler to run, and returns its result.
+ */
+int
+glusterd_handle_cluster_unlock(rpcsvc_request_t *req)
+{
+    return glusterd_big_locked_handler(req, __glusterd_handle_cluster_unlock);
+}
+
+/*
+ * Reply to a stage-op request.
+ *
+ * @req:       the RPC request being answered (asserted non-NULL)
+ * @op:        operation code echoed back in the reply
+ * @status:    value placed in op_ret
+ * @op_errstr: error string for the reply; an empty string is sent when
+ *             this is NULL
+ * @rsp_dict:  dictionary serialized into the reply
+ *
+ * Returns the negative serialization result if the dictionary cannot be
+ * serialized (no reply is submitted in that case), otherwise whatever
+ * glusterd_submit_reply() returns.
+ */
+int
+glusterd_op_stage_send_resp(rpcsvc_request_t *req, int32_t op, int32_t status,
+                            char *op_errstr, dict_t *rsp_dict)
+{
+    xlator_t *this = NULL;
+    int rc = -1;
+    gd1_mgmt_stage_op_rsp reply = {
+        {0},
+    };
+
+    this = THIS;
+    GF_ASSERT(this);
+    GF_ASSERT(req);
+
+    reply.op_ret = status;
+    glusterd_get_uuid(&reply.uuid);
+    reply.op = op;
+    reply.op_errstr = op_errstr ? op_errstr : "";
+
+    rc = dict_allocate_and_serialize(rsp_dict, &reply.dict.dict_val,
+                                     &reply.dict.dict_len);
+    if (rc < 0) {
+        gf_smsg(this->name, GF_LOG_ERROR, errno,
+                GD_MSG_DICT_ALLOC_AND_SERL_LENGTH_GET_FAIL, NULL);
+        return rc;
+    }
+
+    rc = glusterd_submit_reply(req, &reply, NULL, 0, NULL,
+                               (xdrproc_t)xdr_gd1_mgmt_stage_op_rsp);
+    gf_msg_debug(this->name, 0, "Responded to stage, ret: %d", rc);
+
+    /* the serialized buffer is ours to release once the reply is submitted */
+    GF_FREE(reply.dict.dict_val);
+
+    return rc;
+}
+
+/*
+ * Reply to a commit-op request.
+ *
+ * @req:       the RPC request being answered (asserted non-NULL)
+ * @op:        operation code echoed back in the reply
+ * @status:    value placed in op_ret
+ * @op_errstr: error string for the reply; an empty string is sent when
+ *             this is NULL
+ * @rsp_dict:  optional dictionary; serialized into the reply when non-NULL
+ *
+ * Returns the negative serialization result if the dictionary cannot be
+ * serialized (no reply is submitted in that case), otherwise whatever
+ * glusterd_submit_reply() returns.
+ */
+int
+glusterd_op_commit_send_resp(rpcsvc_request_t *req, int32_t op, int32_t status,
+                             char *op_errstr, dict_t *rsp_dict)
+{
+    xlator_t *this = NULL;
+    int rc = -1;
+    gd1_mgmt_commit_op_rsp reply = {
+        {0},
+    };
+
+    this = THIS;
+    GF_ASSERT(this);
+    GF_ASSERT(req);
+
+    reply.op_ret = status;
+    glusterd_get_uuid(&reply.uuid);
+    reply.op = op;
+    reply.op_errstr = op_errstr ? op_errstr : "";
+
+    if (rsp_dict) {
+        rc = dict_allocate_and_serialize(rsp_dict, &reply.dict.dict_val,
+                                         &reply.dict.dict_len);
+        if (rc < 0) {
+            gf_smsg(this->name, GF_LOG_ERROR, errno,
+                    GD_MSG_DICT_ALLOC_AND_SERL_LENGTH_GET_FAIL, NULL);
+            GF_FREE(reply.dict.dict_val);
+            return rc;
+        }
+    }
+
+    rc = glusterd_submit_reply(req, &reply, NULL, 0, NULL,
+                               (xdrproc_t)xdr_gd1_mgmt_commit_op_rsp);
+    gf_msg_debug(this->name, 0, "Responded to commit, ret: %d", rc);
+
+    GF_FREE(reply.dict.dict_val);
+    return rc;
+}
+
+/*
+ * Handle an incoming friend (probe) request from a peer.
+ *
+ * Decodes the request and hands it to glusterd_handle_friend_req(). When
+ * that call reports GLUSTERD_CONNECTION_AWAITED the result is turned into
+ * success and the state machines are not run from here.
+ *
+ * @req: the incoming RPC request (asserted non-NULL)
+ *
+ * Returns 0 on success, a non-zero value otherwise. On a decode failure
+ * req->rpc_err is set to GARBAGE_ARGS.
+ */
+int
+__glusterd_handle_incoming_friend_req(rpcsvc_request_t *req)
+{
+    int32_t ret = -1;
+    gd1_mgmt_friend_req friend_req = {
+        {0},
+    };
+    gf_boolean_t run_fsm = _gf_true;
+
+    GF_ASSERT(req);
+    ret = xdr_to_generic(req->msg[0], &friend_req,
+                         (xdrproc_t)xdr_gd1_mgmt_friend_req);
+    if (ret < 0) {
+        // failed to decode msg;
+        gf_msg("glusterd", GF_LOG_ERROR, 0, GD_MSG_REQ_DECODE_FAIL,
+               "Failed to decode "
+               "request received from friend");
+        req->rpc_err = GARBAGE_ARGS;
+        goto out;
+    }
+
+    gf_msg("glusterd", GF_LOG_INFO, 0, GD_MSG_PROBE_RCVD,
+           "Received probe from uuid: %s", uuid_utoa(friend_req.uuid));
+    ret = glusterd_handle_friend_req(req, friend_req.uuid, friend_req.hostname,
+                                     friend_req.port, &friend_req);
+
+    if (ret == GLUSTERD_CONNECTION_AWAITED) {
+        // fsm should be run after connection establishes
+        run_fsm = _gf_false;
+        ret = 0;
+    }
+
+out:
+    /* NOTE(review): only the hostname is released here, whereas the unfriend
+     * handler also frees vols.vols_val; presumably
+     * glusterd_handle_friend_req() takes over that buffer -- confirm.
+     */
+    free(friend_req.hostname);  // malloced by xdr
+
+    if (run_fsm) {
+        glusterd_friend_sm();
+        glusterd_op_sm();
+    }
+
+    return ret;
+}
+
+/* RPC entry point for incoming friend requests: delegates to
+ * glusterd_big_locked_handler(), passing
+ * __glusterd_handle_incoming_friend_req as the handler to run, and returns
+ * its result.
+ */
+int
+glusterd_handle_incoming_friend_req(rpcsvc_request_t *req)
+{
+    return glusterd_big_locked_handler(req,
+                                       __glusterd_handle_incoming_friend_req);
+}
+
+/*
+ * Handle an incoming unfriend request from a peer.
+ *
+ * Decodes the request, resolves the sender's hostname from the RPC request
+ * itself (the hostname carried in the payload is not used for the lookup)
+ * and passes both to glusterd_handle_unfriend_req().
+ *
+ * @req: the incoming RPC request (asserted non-NULL)
+ *
+ * Returns 0 on success, a non-zero value otherwise. On a decode failure
+ * req->rpc_err is set to GARBAGE_ARGS. The friend and op state machines are
+ * always run before returning.
+ */
+int
+__glusterd_handle_incoming_unfriend_req(rpcsvc_request_t *req)
+{
+    int32_t ret = -1;
+    gd1_mgmt_friend_req friend_req = {
+        {0},
+    };
+    char remote_hostname[UNIX_PATH_MAX + 1] = {
+        0,
+    };
+
+    GF_ASSERT(req);
+    ret = xdr_to_generic(req->msg[0], &friend_req,
+                         (xdrproc_t)xdr_gd1_mgmt_friend_req);
+    if (ret < 0) {
+        // failed to decode msg;
+        gf_msg("glusterd", GF_LOG_ERROR, 0, GD_MSG_REQ_DECODE_FAIL,
+               "Failed to decode "
+               "request received.");
+        req->rpc_err = GARBAGE_ARGS;
+        goto out;
+    }
+
+    gf_msg("glusterd", GF_LOG_INFO, 0, GD_MSG_UNFRIEND_REQ_RCVD,
+           "Received unfriend from uuid: %s", uuid_utoa(friend_req.uuid));
+
+    ret = glusterd_remote_hostname_get(req, remote_hostname,
+                                       sizeof(remote_hostname));
+    if (ret) {
+        gf_msg("glusterd", GF_LOG_ERROR, 0, GD_MSG_HOSTNAME_RESOLVE_FAIL,
+               "Unable to get the remote hostname");
+        goto out;
+    }
+    ret = glusterd_handle_unfriend_req(req, friend_req.uuid, remote_hostname,
+                                       friend_req.port);
+
+out:
+    /* Both xdr-allocated members of the decoded request are released here. */
+    free(friend_req.hostname);       // malloced by xdr
+    free(friend_req.vols.vols_val);  // malloced by xdr
+
+    glusterd_friend_sm();
+    glusterd_op_sm();
+
+    return ret;
+}
+
+/* RPC entry point for incoming unfriend requests: delegates to
+ * glusterd_big_locked_handler(), passing
+ * __glusterd_handle_incoming_unfriend_req as the handler to run, and
+ * returns its result.
+ */
+int
+glusterd_handle_incoming_unfriend_req(rpcsvc_request_t *req)
+{
+    return glusterd_big_locked_handler(req,
+                                       __glusterd_handle_incoming_unfriend_req);
+}
+
+/*
+ * Remove the friend named by the "hostname" key of @dict.
+ *
+ * @dict: dictionary carrying the friend-update payload (asserted non-NULL)
+ *
+ * Returns the dict lookup error if "hostname" is absent, otherwise the
+ * result of glusterd_friend_remove() called with a NULL uuid.
+ */
+int
+glusterd_handle_friend_update_delete(dict_t *dict)
+{
+    int32_t rc = -1;
+    char *peer_host = NULL;
+
+    GF_ASSERT(dict);
+
+    rc = dict_get_strn(dict, "hostname", SLEN("hostname"), &peer_host);
+    if (rc == 0)
+        rc = glusterd_friend_remove(NULL, peer_host);
+
+    gf_msg_debug("glusterd", 0, "Returning %d", rc);
+    return rc;
+}
+
+/*
+ * Add @hostname to the addresses known for @peerinfo and optionally
+ * persist the peer.
+ *
+ * @peerinfo:     peer to update (asserted non-NULL)
+ * @hostname:     address to add (asserted non-NULL)
+ * @store_update: when true, glusterd_store_peerinfo() is called after the
+ *                address has been added successfully
+ *
+ * Returns 0 on success, otherwise the failing call's return value.
+ */
+int
+glusterd_peer_hostname_update(glusterd_peerinfo_t *peerinfo,
+                              const char *hostname, gf_boolean_t store_update)
+{
+    int rc = 0;
+
+    GF_ASSERT(peerinfo);
+    GF_ASSERT(hostname);
+
+    rc = gd_add_address_to_peer(peerinfo, hostname);
+    if (rc != 0) {
+        gf_msg(THIS->name, GF_LOG_ERROR, 0,
+               GD_MSG_HOSTNAME_ADD_TO_PEERLIST_FAIL,
+               "Couldn't add address to the peer info");
+    } else if (store_update) {
+        rc = glusterd_store_peerinfo(peerinfo);
+    }
+
+    gf_msg_debug(THIS->name, 0, "Returning %d", rc);
+    return rc;
+}
+
+/*
+ * Handle a friend-update request from a peer.
+ *
+ * The request carries a serialized dictionary with a "count", an "op" and
+ * per-friend entries under the "friend<N>" prefix (N starting at 1). For
+ * GD_FRIEND_UPDATE_DEL the named friend is removed; otherwise every listed
+ * friend other than ourselves is either created (unknown uuid) or updated
+ * and stored (known uuid).
+ *
+ * @req: the incoming RPC request (asserted non-NULL)
+ *
+ * A friend-update response carrying our uuid is submitted on every path
+ * that reaches `out`, and the function's return value is the result of that
+ * submission -- earlier error codes are overwritten there.
+ */
+int
+__glusterd_handle_friend_update(rpcsvc_request_t *req)
+{
+    int32_t ret = -1;
+    gd1_mgmt_friend_update friend_req = {
+        {0},
+    };
+    glusterd_peerinfo_t *peerinfo = NULL;
+    glusterd_conf_t *priv = NULL;
+    xlator_t *this = NULL;
+    gd1_mgmt_friend_update_rsp rsp = {
+        {0},
+    };
+    dict_t *dict = NULL;
+    char key[32] = {
+        0,
+    };
+    int keylen;
+    char *uuid_buf = NULL;
+    int i = 1;
+    int count = 0;
+    uuid_t uuid = {
+        0,
+    };
+    glusterd_peerctx_args_t args = {0};
+    int32_t op = 0;
+
+    GF_ASSERT(req);
+
+    this = THIS;
+    GF_ASSERT(this);
+    priv = this->private;
+    GF_ASSERT(priv);
+
+    ret = xdr_to_generic(req->msg[0], &friend_req,
+                         (xdrproc_t)xdr_gd1_mgmt_friend_update);
+    if (ret < 0) {
+        // failed to decode msg;
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_REQ_DECODE_FAIL,
+               "Failed to decode "
+               "request received");
+        req->rpc_err = GARBAGE_ARGS;
+        goto out;
+    }
+
+    /* Reject updates from senders that are not already known peers. Only
+     * the existence of the peer is recorded inside the RCU read section.
+     */
+    ret = 0;
+    RCU_READ_LOCK;
+    if (glusterd_peerinfo_find(friend_req.uuid, NULL) == NULL) {
+        ret = -1;
+    }
+    RCU_READ_UNLOCK;
+    if (ret) {
+        gf_msg(this->name, GF_LOG_CRITICAL, 0, GD_MSG_REQ_FROM_UNKNOWN_PEER,
+               "Received friend update request "
+               "from unknown peer %s",
+               uuid_utoa(friend_req.uuid));
+        gf_event(EVENT_UNKNOWN_PEER, "peer=%s", uuid_utoa(friend_req.uuid));
+        goto out;
+    }
+
+    gf_msg("glusterd", GF_LOG_INFO, 0, GD_MSG_FRIEND_UPDATE_RCVD,
+           "Received friend update from uuid: %s", uuid_utoa(friend_req.uuid));
+
+    if (friend_req.friends.friends_len) {
+        /* Unserialize the dictionary */
+        /* NOTE(review): the dict_new() result is not checked before use. */
+        dict = dict_new();
+
+        ret = dict_unserialize(friend_req.friends.friends_val,
+                               friend_req.friends.friends_len, &dict);
+        if (ret < 0) {
+            gf_msg("glusterd", GF_LOG_ERROR, 0, GD_MSG_DICT_UNSERIALIZE_FAIL,
+                   "failed to "
+                   "unserialize req-buffer to dictionary");
+            goto out;
+        } else {
+            /* Record the xdr buffer on the dict; the cleanup at `out`
+             * skips its own free() when this field is set.
+             */
+            dict->extra_stdfree = friend_req.friends.friends_val;
+        }
+    }
+
+    /* When the request carried no payload, dict is still NULL here. */
+    ret = dict_get_int32n(dict, "count", SLEN("count"), &count);
+    if (ret) {
+        gf_smsg(this->name, GF_LOG_ERROR, errno, GD_MSG_DICT_GET_FAILED,
+                "Key=count", NULL);
+        goto out;
+    }
+
+    ret = dict_get_int32n(dict, "op", SLEN("op"), &op);
+    if (ret) {
+        gf_smsg(this->name, GF_LOG_ERROR, errno, GD_MSG_DICT_GET_FAILED,
+                "Key=op", NULL);
+        goto out;
+    }
+
+    if (GD_FRIEND_UPDATE_DEL == op) {
+        /* The result of the delete is deliberately ignored. */
+        (void)glusterd_handle_friend_update_delete(dict);
+        goto out;
+    }
+
+    args.mode = GD_MODE_ON;
+    while (i <= count) {
+        keylen = snprintf(key, sizeof(key), "friend%d.uuid", i);
+        ret = dict_get_strn(dict, key, keylen, &uuid_buf);
+        if (ret)
+            goto out;
+        gf_uuid_parse(uuid_buf, uuid);
+
+        /* Skip the entry that describes this node itself. */
+        if (!gf_uuid_compare(uuid, MY_UUID)) {
+            gf_msg(this->name, GF_LOG_INFO, 0, GD_MSG_UUID_RECEIVED,
+                   "Received my uuid as Friend");
+            i++;
+            continue;
+        }
+
+        /* From here on `key` is the per-friend prefix passed to the
+         * peerinfo dict helpers.
+         */
+        snprintf(key, sizeof(key), "friend%d", i);
+
+        RCU_READ_LOCK;
+        peerinfo = glusterd_peerinfo_find(uuid, NULL);
+        if (peerinfo == NULL) {
+            /* Create a new peer and add it to the list as there is
+             * no existing peer with the uuid
+             */
+            peerinfo = gd_peerinfo_from_dict(dict, key);
+            if (peerinfo == NULL) {
+                ret = -1;
+                gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_PEERINFO_CREATE_FAIL,
+                       "Could not create peerinfo from dict "
+                       "for prefix %s",
+                       key);
+                goto unlock;
+            }
+
+            /* As this is a new peer, it should be added as a
+             * friend.  The friend state machine will take care of
+             * correcting the state as required
+             */
+            peerinfo->state.state = GD_FRIEND_STATE_BEFRIENDED;
+
+            ret = glusterd_friend_add_from_peerinfo(peerinfo, 0, &args);
+        } else {
+            /* As an existing peer was found, update it with the new
+             * information
+             */
+            ret = gd_update_peerinfo_from_dict(peerinfo, dict, key);
+            if (ret) {
+                gf_msg(this->name, GF_LOG_ERROR, 0,
+                       GD_MSG_PEER_INFO_UPDATE_FAIL,
+                       "Failed to "
+                       "update peer %s",
+                       peerinfo->hostname);
+                goto unlock;
+            }
+            ret = glusterd_store_peerinfo(peerinfo);
+            if (ret) {
+                gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_PEERINFO_CREATE_FAIL,
+                       "Failed to store peerinfo");
+                gf_event(EVENT_PEER_STORE_FAILURE, "peer=%s",
+                         peerinfo->hostname);
+            }
+        }
+    unlock:
+        RCU_READ_UNLOCK;
+        /* On failure the loop is left with peerinfo still set, so the
+         * cleanup at `out` sees it.
+         */
+        if (ret)
+            break;
+
+        peerinfo = NULL;
+        i++;
+    }
+
+out:
+    /* The response is sent regardless of how we got here; its result
+     * replaces any earlier value of ret.
+     */
+    gf_uuid_copy(rsp.uuid, MY_UUID);
+    ret = glusterd_submit_reply(req, &rsp, NULL, 0, NULL,
+                                (xdrproc_t)xdr_gd1_mgmt_friend_update_rsp);
+    if (dict) {
+        /* Free the xdr buffer ourselves only if it was not recorded in
+         * dict->extra_stdfree above.
+         */
+        if (!dict->extra_stdfree && friend_req.friends.friends_val)
+            free(friend_req.friends.friends_val);  // malloced by xdr
+        dict_unref(dict);
+    } else {
+        free(friend_req.friends.friends_val);  // malloced by xdr
+    }
+
+    /* NOTE(review): peerinfo is non-NULL here whenever the loop broke out on
+     * an error, including the branch where an existing peer was found --
+     * confirm that cleaning up an already-known peer is intended.
+     */
+    if (peerinfo)
+        glusterd_peerinfo_cleanup(peerinfo);
+
+    glusterd_friend_sm();
+    glusterd_op_sm();
+
+    return ret;
+}
+
+/* RPC entry point for friend-update requests: delegates to
+ * glusterd_big_locked_handler(), passing __glusterd_handle_friend_update as
+ * the handler to run, and returns its result.
+ */
+int
+glusterd_handle_friend_update(rpcsvc_request_t *req)
+{
+    return glusterd_big_locked_handler(req, __glusterd_handle_friend_update);
+}
+
+/*
+ * Handle a probe query from another node.
+ *
+ * Decodes the request, rejects a sender whose uuid equals ours, and
+ * otherwise looks the sender up by uuid and remote hostname. An unknown
+ * sender is refused with GF_PROBE_ANOTHER_CLUSTER when we already have
+ * peers, or added as a new friend in state GD_FRIEND_STATE_PROBE_RCVD when
+ * our peer list is empty.
+ *
+ * @req: the incoming RPC request (asserted non-NULL)
+ *
+ * Returns 0 once a probe response has been submitted; returns a non-zero
+ * value without responding when decoding or remote-hostname resolution
+ * fails. The friend and op state machines are always run before returning.
+ */
+int
+__glusterd_handle_probe_query(rpcsvc_request_t *req)
+{
+    int32_t ret = -1;
+    xlator_t *this = NULL;
+    glusterd_conf_t *conf = NULL;
+    gd1_mgmt_probe_req probe_req = {
+        {0},
+    };
+    gd1_mgmt_probe_rsp rsp = {
+        {0},
+    };
+    glusterd_peerinfo_t *peerinfo = NULL;
+    glusterd_peerctx_args_t args = {0};
+    int port = 0;
+    char remote_hostname[UNIX_PATH_MAX + 1] = {
+        0,
+    };
+
+    GF_ASSERT(req);
+
+    this = THIS;
+    GF_VALIDATE_OR_GOTO("xlator", (this != NULL), out);
+
+    ret = xdr_to_generic(req->msg[0], &probe_req,
+                         (xdrproc_t)xdr_gd1_mgmt_probe_req);
+    if (ret < 0) {
+        // failed to decode msg;
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_REQ_DECODE_FAIL,
+               "Failed to decode probe "
+               "request");
+        req->rpc_err = GARBAGE_ARGS;
+        goto out;
+    }
+
+    conf = this->private;
+    /* A port of 0 in the request means "use the default base port". */
+    if (probe_req.port)
+        port = probe_req.port;
+    else
+        port = GF_DEFAULT_BASE_PORT;
+
+    gf_msg("glusterd", GF_LOG_INFO, 0, GD_MSG_PROBE_RCVD,
+           "Received probe from uuid: %s", uuid_utoa(probe_req.uuid));
+
+    /* Check for uuid collision and handle it in a user friendly way by
+     * sending the error.
+     */
+    if (!gf_uuid_compare(probe_req.uuid, MY_UUID)) {
+        gf_msg(THIS->name, GF_LOG_ERROR, 0, GD_MSG_UUIDS_SAME_RETRY,
+               "Peer uuid %s is same as "
+               "local uuid. Please check the uuid of both the peers "
+               "from %s/%s",
+               uuid_utoa(probe_req.uuid), GLUSTERD_DEFAULT_WORKDIR,
+               GLUSTERD_INFO_FILE);
+        rsp.op_ret = -1;
+        rsp.op_errno = GF_PROBE_SAME_UUID;
+        rsp.port = port;
+        /* remote_hostname has not been resolved yet on this path, so the
+         * "Responded to" log below prints an empty host.
+         */
+        goto respond;
+    }
+
+    ret = glusterd_remote_hostname_get(req, remote_hostname,
+                                       sizeof(remote_hostname));
+    if (ret) {
+        gf_msg("glusterd", GF_LOG_ERROR, 0, GD_MSG_HOSTNAME_RESOLVE_FAIL,
+               "Unable to get the remote hostname");
+        goto out;
+    }
+
+    RCU_READ_LOCK;
+    peerinfo = glusterd_peerinfo_find(probe_req.uuid, remote_hostname);
+    if ((peerinfo == NULL) && (!cds_list_empty(&conf->peers))) {
+        /* Unknown sender while we already have peers: refuse the probe. */
+        rsp.op_ret = -1;
+        rsp.op_errno = GF_PROBE_ANOTHER_CLUSTER;
+    } else if (peerinfo == NULL) {
+        gf_msg("glusterd", GF_LOG_INFO, 0, GD_MSG_PEER_NOT_FOUND,
+               "Unable to find peerinfo"
+               " for host: %s (%d)",
+               remote_hostname, port);
+        args.mode = GD_MODE_ON;
+        ret = glusterd_friend_add(remote_hostname, port,
+                                  GD_FRIEND_STATE_PROBE_RCVD, NULL, &peerinfo,
+                                  0, &args);
+        if (ret) {
+            gf_msg("glusterd", GF_LOG_ERROR, 0, GD_MSG_PEER_ADD_FAIL,
+                   "Failed to add peer %s", remote_hostname);
+            /* NOTE(review): only op_errno is set here; op_ret stays 0 --
+             * confirm the receiver treats this as a failure.
+             */
+            rsp.op_errno = GF_PROBE_ADD_FAILED;
+        }
+    }
+    RCU_READ_UNLOCK;
+
+respond:
+    gf_uuid_copy(rsp.uuid, MY_UUID);
+
+    /* The response echoes the hostname from the request; it is freed at
+     * `out`, after the reply has been submitted.
+     */
+    rsp.hostname = probe_req.hostname;
+    rsp.op_errstr = "";
+
+    glusterd_submit_reply(req, &rsp, NULL, 0, NULL,
+                          (xdrproc_t)xdr_gd1_mgmt_probe_rsp);
+    /* Once a response has been submitted the handler reports success,
+     * whatever op_ret carried.
+     */
+    ret = 0;
+
+    gf_msg("glusterd", GF_LOG_INFO, 0, GD_MSG_RESPONSE_INFO,
+           "Responded to %s, op_ret: %d, "
+           "op_errno: %d, ret: %d",
+           remote_hostname, rsp.op_ret, rsp.op_errno, ret);
+
+out:
+    free(probe_req.hostname);  // malloced by xdr
+
+    glusterd_friend_sm();
+    glusterd_op_sm();
+
+    return ret;
+}
+
+/* RPC entry point for probe queries: delegates to
+ * glusterd_big_locked_handler(), passing __glusterd_handle_probe_query as
+ * the handler to run, and returns its result.
+ */
+int
+glusterd_handle_probe_query(rpcsvc_request_t *req)
+{
+    return glusterd_big_locked_handler(req, __glusterd_handle_probe_query);
+}
+
+/*
+ * Handle a "volume profile" request from the CLI.
+ *
+ * Decodes the request, unserializes its dictionary, checks that "volname"
+ * and "op" are present and then starts the operation: through the op-sm
+ * framework when the cluster op-version is below GD_OP_VERSION_6_0,
+ * otherwise through the mgmt_v3 phases (with brick-op phase).
+ *
+ * @req: the incoming RPC request (asserted non-NULL)
+ *
+ * On any failure an error response is sent to the CLI from here and the
+ * result of sending it is returned; on success the value returned by the
+ * chosen framework (0) is returned.
+ */
+int
+__glusterd_handle_cli_profile_volume(rpcsvc_request_t *req)
+{
+    int32_t ret = -1;
+    gf_cli_req cli_req = {{
+        0,
+    }};
+    dict_t *dict = NULL;
+    glusterd_op_t cli_op = GD_OP_PROFILE_VOLUME;
+    char *volname = NULL;
+    int32_t op = 0;
+    char err_str[64] = {
+        0,
+    };
+    xlator_t *this = NULL;
+    glusterd_conf_t *conf = NULL;
+
+    GF_ASSERT(req);
+    this = THIS;
+    GF_ASSERT(this);
+    conf = this->private;
+    GF_VALIDATE_OR_GOTO(this->name, conf, out);
+
+    ret = xdr_to_generic(req->msg[0], &cli_req, (xdrproc_t)xdr_gf_cli_req);
+    if (ret < 0) {
+        // failed to decode msg;
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_REQ_DECODE_FAIL,
+               "Failed to decode "
+               "request received from cli");
+        req->rpc_err = GARBAGE_ARGS;
+        goto out;
+    }
+
+    if (cli_req.dict.dict_len > 0) {
+        dict = dict_new();
+        if (!dict) {
+            gf_smsg(this->name, GF_LOG_ERROR, errno, GD_MSG_DICT_CREATE_FAIL,
+                    NULL);
+            goto out;
+        }
+        /* NOTE(review): the result of dict_unserialize() is not checked;
+         * a malformed payload surfaces below as a missing "volname".
+         */
+        dict_unserialize(cli_req.dict.dict_val, cli_req.dict.dict_len, &dict);
+    }
+
+    ret = dict_get_strn(dict, "volname", SLEN("volname"), &volname);
+    if (ret) {
+        snprintf(err_str, sizeof(err_str),
+                 "Unable to get volume "
+                 "name");
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_VOLNAME_NOTFOUND_IN_DICT,
+               "%s", err_str);
+        goto out;
+    }
+
+    gf_msg(this->name, GF_LOG_INFO, 0, GD_MSG_VOL_PROFILE_REQ_RCVD,
+           "Received volume profile req "
+           "for volume %s",
+           volname);
+    /* "op" is fetched only to validate that the request carries it. */
+    ret = dict_get_int32n(dict, "op", SLEN("op"), &op);
+    if (ret) {
+        snprintf(err_str, sizeof(err_str), "Unable to get operation");
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_DICT_GET_FAILED, "%s",
+               err_str);
+        goto out;
+    }
+
+    if (conf->op_version < GD_OP_VERSION_6_0) {
+        gf_msg_debug(this->name, 0,
+                     "The cluster is operating at "
+                     "version less than %d. Falling back "
+                     "to op-sm framework.",
+                     GD_OP_VERSION_6_0);
+        ret = glusterd_op_begin(req, cli_op, dict, err_str, sizeof(err_str));
+        /* The state machines are kicked only on the op-sm path. */
+        glusterd_friend_sm();
+        glusterd_op_sm();
+    } else {
+        ret = glusterd_mgmt_v3_initiate_all_phases_with_brickop_phase(
+            req, cli_op, dict);
+    }
+
+out:
+    /* The xdr buffer is always released here; the dict is not unref'd in
+     * this function.
+     */
+    free(cli_req.dict.dict_val);
+
+    if (ret) {
+        /* Fall back to a generic message when no specific one was set. */
+        if (err_str[0] == '\0')
+            snprintf(err_str, sizeof(err_str), "Operation failed");
+        ret = glusterd_op_send_cli_response(cli_op, ret, 0, req, dict, err_str);
+    }
+
+    gf_msg_debug(this->name, 0, "Returning %d", ret);
+    return ret;
+}
+
+/* RPC entry point for CLI volume-profile requests: delegates to
+ * glusterd_big_locked_handler(), passing
+ * __glusterd_handle_cli_profile_volume as the handler to run, and returns
+ * its result.
+ */
+int
+glusterd_handle_cli_profile_volume(rpcsvc_request_t *req)
+{
+    return glusterd_big_locked_handler(req,
+                                       __glusterd_handle_cli_profile_volume);
+}
+
+/*
+ * Handle a CLI "getwd" request by replying with glusterd's working
+ * directory as stored in the xlator's private configuration.
+ *
+ * @req: the incoming RPC request (asserted non-NULL)
+ *
+ * Always returns 0; the result of submitting the reply is not inspected.
+ * The friend and op state machines are run before returning.
+ */
+int
+__glusterd_handle_getwd(rpcsvc_request_t *req)
+{
+    glusterd_conf_t *conf = NULL;
+    gf1_cli_getwd_rsp reply = {
+        0,
+    };
+
+    GF_ASSERT(req);
+
+    conf = THIS->private;
+    GF_ASSERT(conf);
+
+    gf_msg("glusterd", GF_LOG_INFO, 0, GD_MSG_GETWD_REQ_RCVD,
+           "Received getwd req");
+
+    /* The reply borrows the configured path; nothing is copied or freed. */
+    reply.wd = conf->workdir;
+    glusterd_submit_reply(req, &reply, NULL, 0, NULL,
+                          (xdrproc_t)xdr_gf1_cli_getwd_rsp);
+
+    glusterd_friend_sm();
+    glusterd_op_sm();
+
+    return 0;
+}
+
+/* RPC entry point for CLI getwd requests: delegates to
+ * glusterd_big_locked_handler(), passing __glusterd_handle_getwd as the
+ * handler to run, and returns its result.
+ */
+int
+glusterd_handle_getwd(rpcsvc_request_t *req)
+{
+    return glusterd_big_locked_handler(req, __glusterd_handle_getwd);
+}
+
+/*
+ * Handle a CLI mount request.
+ *
+ * Decodes the request, unserializes its option dictionary (if any) and
+ * calls glusterd_do_mount() with the big lock temporarily released. The
+ * outcome is reported to the CLI in the reply's op_ret/op_errno/path.
+ *
+ * @req: the incoming RPC request (asserted non-NULL)
+ *
+ * Always returns 0: failures are conveyed through the reply, which is
+ * submitted on every path. op_errno is always a positive errno value.
+ * The friend and op state machines are run before returning.
+ */
+int
+__glusterd_handle_mount(rpcsvc_request_t *req)
+{
+    gf1_cli_mount_req mnt_req = {
+        0,
+    };
+    gf1_cli_mount_rsp rsp = {
+        0,
+    };
+    dict_t *dict = NULL;
+    int ret = 0;
+    glusterd_conf_t *priv = NULL;
+
+    GF_ASSERT(req);
+    priv = THIS->private;
+
+    ret = xdr_to_generic(req->msg[0], &mnt_req,
+                         (xdrproc_t)xdr_gf1_cli_mount_req);
+    if (ret < 0) {
+        // failed to decode msg;
+        gf_msg("glusterd", GF_LOG_ERROR, 0, GD_MSG_REQ_DECODE_FAIL,
+               "Failed to decode mount "
+               "request received");
+        req->rpc_err = GARBAGE_ARGS;
+        rsp.op_ret = -1;
+        rsp.op_errno = EINVAL;
+        goto out;
+    }
+
+    gf_msg("glusterd", GF_LOG_INFO, 0, GD_MSG_MOUNT_REQ_RCVD,
+           "Received mount req");
+
+    if (mnt_req.dict.dict_len) {
+        /* Unserialize the dictionary */
+        dict = dict_new();
+
+        ret = dict_unserialize(mnt_req.dict.dict_val, mnt_req.dict.dict_len,
+                               &dict);
+        if (ret < 0) {
+            gf_msg("glusterd", GF_LOG_ERROR, 0, GD_MSG_DICT_UNSERIALIZE_FAIL,
+                   "failed to "
+                   "unserialize req-buffer to dictionary");
+            rsp.op_ret = -1;
+            /* positive errno, matching the decode-failure path above */
+            rsp.op_errno = EINVAL;
+            goto out;
+        } else {
+            /* from here on the dict is responsible for the xdr buffer */
+            dict->extra_stdfree = mnt_req.dict.dict_val;
+        }
+    }
+
+    /* The mount itself runs without the big lock held. */
+    synclock_unlock(&priv->big_lock);
+    rsp.op_ret = glusterd_do_mount(mnt_req.label, dict, &rsp.path,
+                                   &rsp.op_errno);
+    synclock_lock(&priv->big_lock);
+
+out:
+    /* The reply needs a non-NULL path string even on failure. */
+    if (!rsp.path)
+        rsp.path = gf_strdup("");
+
+    glusterd_submit_reply(req, &rsp, NULL, 0, NULL,
+                          (xdrproc_t)xdr_gf1_cli_mount_rsp);
+    ret = 0;
+
+    if (dict) {
+        /* Release the xdr buffer ourselves only when ownership was never
+         * handed to the dict (i.e. unserialization failed).
+         */
+        if (!dict->extra_stdfree)
+            free(mnt_req.dict.dict_val);  // malloced by xdr
+        dict_unref(dict);
+    } else {
+        free(mnt_req.dict.dict_val);  // malloced by xdr
+    }
+    free(mnt_req.label);  // malloced by xdr
+
+    GF_FREE(rsp.path);
+
+    glusterd_friend_sm();
+    glusterd_op_sm();
+
+    return ret;
+}
+
+/* RPC entry point for CLI mount requests: delegates to
+ * glusterd_big_locked_handler(), passing __glusterd_handle_mount as the
+ * handler to run, and returns its result.
+ */
+int
+glusterd_handle_mount(rpcsvc_request_t *req)
+{
+    return glusterd_big_locked_handler(req, __glusterd_handle_mount);
+}
+
+/*
+ * Handle a CLI umount request.
+ *
+ * The path in the request is only unmounted when its parent directory is
+ * exactly "<mountbroker-root>/" MB_HIVE. The unmount (lazy or via the
+ * umount binary) runs with the big lock temporarily released; on success
+ * the resolved mount directory is removed and the requested path is
+ * unlinked.
+ *
+ * @req: the incoming RPC request (asserted non-NULL)
+ *
+ * Always returns 0: failures are conveyed through the reply's
+ * op_ret/op_errno, which is submitted on every path. The friend and op
+ * state machines are run before returning.
+ */
+int
+__glusterd_handle_umount(rpcsvc_request_t *req)
+{
+    gf1_cli_umount_req umnt_req = {
+        0,
+    };
+    gf1_cli_umount_rsp rsp = {
+        0,
+    };
+    char *mountbroker_root = NULL;
+    char mntp[PATH_MAX] = {
+        0,
+    };
+    char *path = NULL;
+    runner_t runner = {
+        0,
+    };
+    int ret = 0;
+    xlator_t *this = THIS;
+    gf_boolean_t dir_ok = _gf_false;
+    char *pdir = NULL;
+    char *t = NULL;
+    glusterd_conf_t *priv = NULL;
+
+    GF_ASSERT(req);
+    GF_ASSERT(this);
+    priv = this->private;
+
+    ret = xdr_to_generic(req->msg[0], &umnt_req,
+                         (xdrproc_t)xdr_gf1_cli_umount_req);
+    if (ret < 0) {
+        // failed to decode msg;
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_REQ_DECODE_FAIL,
+               "Failed to decode umount "
+               "request");
+        req->rpc_err = GARBAGE_ARGS;
+        rsp.op_ret = -1;
+        /* report a concrete errno, as the mount handler does */
+        rsp.op_errno = EINVAL;
+        goto out;
+    }
+
+    gf_msg("glusterd", GF_LOG_INFO, 0, GD_MSG_UMOUNT_REQ_RCVD,
+           "Received umount req");
+
+    if (dict_get_strn(this->options, "mountbroker-root",
+                      SLEN("mountbroker-root"), &mountbroker_root) != 0) {
+        rsp.op_errno = ENOENT;
+        goto out;
+    }
+
+    /* check if it is allowed to umount path */
+    /* dirname() may modify its argument, so it works on a private copy */
+    path = gf_strdup(umnt_req.path);
+    if (!path) {
+        gf_smsg(this->name, GF_LOG_ERROR, errno, GD_MSG_STRDUP_FAILED, NULL);
+        rsp.op_errno = ENOMEM;
+        goto out;
+    }
+    dir_ok = _gf_false;
+    pdir = dirname(path);
+    t = strtail(pdir, mountbroker_root);
+    if (t && *t == '/') {
+        t = strtail(++t, MB_HIVE);
+        if (t && !*t)
+            dir_ok = _gf_true;
+    }
+    GF_FREE(path);
+    if (!dir_ok) {
+        rsp.op_errno = EACCES;
+        goto out;
+    }
+
+    /* The unmount itself runs without the big lock held. */
+    synclock_unlock(&priv->big_lock);
+
+    if (umnt_req.lazy) {
+        rsp.op_ret = gf_umount_lazy(this->name, umnt_req.path, 0);
+    } else {
+        runinit(&runner);
+        runner_add_args(&runner, _PATH_UMOUNT, umnt_req.path, NULL);
+        rsp.op_ret = runner_run(&runner);
+    }
+
+    synclock_lock(&priv->big_lock);
+    if (rsp.op_ret == 0) {
+        /* Remove the directory the path resolves to, then the path
+         * entry itself.
+         */
+        if (realpath(umnt_req.path, mntp))
+            sys_rmdir(mntp);
+        else {
+            rsp.op_ret = -1;
+            rsp.op_errno = errno;
+        }
+        if (sys_unlink(umnt_req.path) != 0) {
+            rsp.op_ret = -1;
+            rsp.op_errno = errno;
+        }
+    }
+
+out:
+    /* Any recorded errno implies failure, even if op_ret was left at 0. */
+    if (rsp.op_errno)
+        rsp.op_ret = -1;
+
+    glusterd_submit_reply(req, &rsp, NULL, 0, NULL,
+                          (xdrproc_t)xdr_gf1_cli_umount_rsp);
+    ret = 0;
+
+    free(umnt_req.path);  // malloced by xdr
+
+    glusterd_friend_sm();
+    glusterd_op_sm();
+
+    return ret;
+}
+
+/* RPC entry point for CLI umount requests: delegates to
+ * glusterd_big_locked_handler(), passing __glusterd_handle_umount as the
+ * handler to run, and returns its result.
+ */
+int
+glusterd_handle_umount(rpcsvc_request_t *req)
+{
+    return glusterd_big_locked_handler(req, __glusterd_handle_umount);
+}
+
+/*
+ * Remove a peer identified by @uuid and/or @hostname.
+ *
+ * Looks the peer up, cleans up the volumes associated with its uuid and
+ * then cleans up the peerinfo itself.
+ *
+ * Returns -1 when no matching peer is found, otherwise the result of
+ * glusterd_peerinfo_cleanup(). A failed volume cleanup is only logged as a
+ * warning and does not affect the return value.
+ */
+int
+glusterd_friend_remove(uuid_t uuid, char *hostname)
+{
+    int ret = -1;
+    glusterd_peerinfo_t *peerinfo = NULL;
+
+    RCU_READ_LOCK;
+
+    peerinfo = glusterd_peerinfo_find(uuid, hostname);
+    if (peerinfo == NULL) {
+        RCU_READ_UNLOCK;
+        goto out;
+    }
+
+    ret = glusterd_friend_remove_cleanup_vols(peerinfo->uuid);
+    RCU_READ_UNLOCK;
+    if (ret)
+        gf_msg(THIS->name, GF_LOG_WARNING, 0, GD_MSG_VOL_CLEANUP_FAIL,
+               "Volumes cleanup failed");
+    /* Giving up the critical section here as glusterd_peerinfo_cleanup must
+     * be called from outside a critical section
+     */
+    /* NOTE(review): peerinfo is used below after the read-side section has
+     * been left -- confirm nothing else can free it in between.
+     */
+    ret = glusterd_peerinfo_cleanup(peerinfo);
+out:
+    gf_msg_debug(THIS->name, 0, "returning %d", ret);
+    /* coverity[LOCK] */
+    return ret;
+}
+
+/*
+ * Create and start an RPC client, storing it in *@rpc on success.
+ *
+ * @rpc:         out-parameter receiving the new client; must be non-NULL
+ * @options:     transport options for the client (asserted non-NULL)
+ * @notify_fn:   notification callback registered on the client
+ * @notify_data: opaque data passed to @notify_fn
+ * @force:       when true and *@rpc already holds a client, that client is
+ *               unref'd and *@rpc cleared before the new one is created
+ *
+ * Returns 0 on success. On failure a non-zero value is returned, any
+ * partially set-up client is unref'd and *@rpc is not assigned.
+ */
+int
+glusterd_rpc_create(struct rpc_clnt **rpc, dict_t *options,
+                    rpc_clnt_notify_t notify_fn, void *notify_data,
+                    gf_boolean_t force)
+{
+    xlator_t *this = NULL;
+    struct rpc_clnt *client = NULL;
+    int rc = -1;
+
+    this = THIS;
+    GF_ASSERT(this);
+
+    GF_ASSERT(options);
+    GF_VALIDATE_OR_GOTO(this->name, rpc, out);
+
+    /* rpc is known to be non-NULL past the validation above */
+    if (force && *rpc) {
+        (void)rpc_clnt_unref(*rpc);
+        *rpc = NULL;
+    }
+
+    /* TODO: is 32 enough? or more ? (the value actually passed is 16) */
+    client = rpc_clnt_new(options, this, this->name, 16);
+    if (client) {
+        rc = rpc_clnt_register_notify(client, notify_fn, notify_data);
+        if (!rc)
+            rc = rpc_clnt_start(client);
+    }
+
+out:
+    if (!rc) {
+        *rpc = client;
+    } else if (client) {
+        (void)rpc_clnt_unref(client);
+    }
+
+    gf_msg_debug(this->name, 0, "returning %d", rc);
+    return rc;
+}
+
+/*
+ * Fill @dict with the transport options for an inet connection to a peer.
+ *
+ * @dict:     dictionary receiving the options (asserted non-NULL)
+ * @hostname: remote host (asserted non-NULL)
+ * @port:     remote port; 0 selects GLUSTERD_DEFAULT_PORT
+ * @af:       address family string passed through to
+ *            rpc_transport_inet_options_build()
+ *
+ * Besides the default transport options, "frame-timeout" is set to 600 and,
+ * when a positive keepalive interval or time is configured in the xlator
+ * options, the keepalive options are applied as well.
+ *
+ * Returns 0 on success and a non-zero value if a mandatory step fails. The
+ * keepalive and tcp-user-timeout settings are optional: failing to read
+ * them is logged as a warning and does not make the function fail.
+ */
+int
+glusterd_transport_inet_options_build(dict_t *dict, const char *hostname,
+                                      int port, char *af)
+{
+    xlator_t *this = NULL;
+    int32_t interval = -1;
+    int32_t keepalive_time = -1;
+    int32_t timeout = -1;
+    int ret = 0;
+
+    this = THIS;
+    GF_ASSERT(this);
+    GF_ASSERT(dict);
+    GF_ASSERT(hostname);
+
+    if (!port)
+        port = GLUSTERD_DEFAULT_PORT;
+
+    /* Build default transport options */
+    ret = rpc_transport_inet_options_build(dict, hostname, port, af);
+    if (ret)
+        goto out;
+
+    /* Set frame-timeout to 10mins. Default timeout of 30 mins is too long
+     * when compared to 2 mins for cli timeout. This ensures users don't
+     * wait too long after cli timesout before being able to resume normal
+     * operations
+     */
+    ret = dict_set_int32n(dict, "frame-timeout", SLEN("frame-timeout"), 600);
+    if (ret) {
+        gf_msg("glusterd", GF_LOG_ERROR, 0, GD_MSG_DICT_SET_FAILED,
+               "Failed to set frame-timeout");
+        goto out;
+    }
+
+    /* Set keepalive options */
+    ret = dict_get_int32n(this->options, "transport.socket.keepalive-interval",
+                          SLEN("transport.socket.keepalive-interval"),
+                          &interval);
+    if (ret) {
+        gf_msg("glusterd", GF_LOG_WARNING, 0, GD_MSG_DICT_GET_FAILED,
+               "Failed to get socket keepalive-interval");
+    }
+    ret = dict_get_int32n(this->options, "transport.socket.keepalive-time",
+                          SLEN("transport.socket.keepalive-time"),
+                          &keepalive_time);
+    if (ret) {
+        gf_msg("glusterd", GF_LOG_WARNING, 0, GD_MSG_DICT_GET_FAILED,
+               "Failed to get socket keepalive-time");
+    }
+    ret = dict_get_int32n(this->options, "transport.tcp-user-timeout",
+                          SLEN("transport.tcp-user-timeout"), &timeout);
+    if (ret) {
+        gf_msg("glusterd", GF_LOG_WARNING, 0, GD_MSG_DICT_GET_FAILED,
+               "Failed to get tcp-user-timeout");
+    }
+
+    /* The lookups above are best-effort; a missing option must not be
+     * reported to the caller as a failure to build the options.
+     */
+    ret = 0;
+
+    if ((interval > 0) || (keepalive_time > 0))
+        ret = rpc_transport_keepalive_options_set(dict, interval,
+                                                  keepalive_time, timeout);
+out:
+    gf_msg_debug("glusterd", 0, "Returning %d", ret);
+    return ret;
+}
+
+/*
+ * glusterd_friend_rpc_create() sets up the management RPC connection to a
+ * peer.
+ *
+ * It allocates a peer context (a copy of @args plus the peer's uuid, hostname
+ * and generation number), builds the transport options for the peer's
+ * hostname/port and calls glusterd_rpc_create() with
+ * glusterd_peer_rpc_notify as the notification callback and the peer context
+ * as its data.
+ *
+ * @this:     glusterd xlator; its options are consulted for
+ *            transport.address-family, transport.socket.bind-address and
+ *            ping-timeout.
+ * @peerinfo: peer to connect to; &peerinfo->rpc is handed to
+ *            glusterd_rpc_create().
+ * @args:     optional probe arguments, copied into the peer context when
+ *            non-NULL.
+ *
+ * Returns 0 on success and non-zero on failure. The peer context is released
+ * here only when it was not handed over to glusterd_rpc_create()
+ * successfully.
+ */
+int
+glusterd_friend_rpc_create(xlator_t *this, glusterd_peerinfo_t *peerinfo,
+                           glusterd_peerctx_args_t *args)
+{
+    dict_t *options = NULL;
+    int ret = -1;
+    glusterd_peerctx_t *peerctx = NULL;
+    data_t *data = NULL;
+    char *af = NULL;
+
+    peerctx = GF_CALLOC(1, sizeof(*peerctx), gf_gld_mt_peerctx_t);
+    if (!peerctx) {
+        gf_smsg(this->name, GF_LOG_ERROR, errno, GD_MSG_NO_MEMORY, NULL);
+        goto out;
+    }
+
+    options = dict_new();
+    if (!options) {
+        gf_smsg(this->name, GF_LOG_ERROR, errno, GD_MSG_DICT_CREATE_FAIL, NULL);
+        goto out;
+    }
+
+    if (args)
+        peerctx->args = *args;
+
+    gf_uuid_copy(peerctx->peerid, peerinfo->uuid);
+    peerctx->peername = gf_strdup(peerinfo->hostname);
+    if (!peerctx->peername) {
+        /* ret is still -1 here, so the failure is reported to the caller. */
+        gf_smsg(this->name, GF_LOG_ERROR, errno, GD_MSG_NO_MEMORY, NULL);
+        goto out;
+    }
+    peerctx->peerinfo_gen = peerinfo->generation; /* A peerinfo's generation
+                                                     number can be used to
+                                                     uniquely identify a
+                                                     peerinfo */
+
+    ret = dict_get_str(this->options, "transport.address-family", &af);
+    if (ret)
+        gf_log(this->name, GF_LOG_TRACE,
+               "option transport.address-family is not set in xlator options");
+    ret = glusterd_transport_inet_options_build(options, peerinfo->hostname,
+                                                peerinfo->port, af);
+    if (ret)
+        goto out;
+
+    /*
+     * For simulated multi-node testing, we need to make sure that we
+     * create our RPC endpoint with the same address that the peer would
+     * use to reach us.
+     *
+     * Both settings below are best-effort: their result is not checked and
+     * ret is reassigned before it is next examined.
+     */
+
+    if (this->options) {
+        data = dict_getn(this->options, "transport.socket.bind-address",
+                         SLEN("transport.socket.bind-address"));
+        if (data) {
+            ret = dict_set_sizen(options, "transport.socket.source-addr", data);
+        }
+        data = dict_getn(this->options, "ping-timeout", SLEN("ping-timeout"));
+        if (data) {
+            ret = dict_set_sizen(options, "ping-timeout", data);
+        }
+    }
+
+    /* Enable encryption for the client connection if management encryption
+     * is enabled
+     */
+    if (this->ctx->secure_mgmt) {
+        ret = dict_set_nstrn(options, "transport.socket.ssl-enabled",
+                             SLEN("transport.socket.ssl-enabled"), "on",
+                             SLEN("on"));
+        if (ret) {
+            gf_msg("glusterd", GF_LOG_ERROR, 0, GD_MSG_DICT_SET_FAILED,
+                   "failed to set ssl-enabled in dict");
+            goto out;
+        }
+
+        this->ctx->ssl_cert_depth = glusterfs_read_secure_access_file();
+    }
+
+    ret = glusterd_rpc_create(&peerinfo->rpc, options, glusterd_peer_rpc_notify,
+                              peerctx, _gf_false);
+    if (ret) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_RPC_CREATE_FAIL,
+               "failed to create rpc for"
+               " peer %s",
+               peerinfo->hostname);
+        gf_event(EVENT_PEER_RPC_CREATE_FAILED, "peer=%s", peerinfo->hostname);
+        goto out;
+    }
+    /* The peer context now belongs to the RPC client; do not free it below. */
+    peerctx = NULL;
+    ret = 0;
+out:
+    if (options)
+        dict_unref(options);
+
+    if (peerctx) {
+        /* NOTE(review): assumes glusterd_rpc_create() does not itself release
+         * the notify data when it fails - confirm against its definition. */
+        GF_FREE(peerctx->peername);
+        GF_FREE(peerctx);
+    }
+    return ret;
+}
+
+/*
+ * glusterd_friend_add() creates a peerinfo for @hoststr/@port in state-machine
+ * state @state and appends it to the configuration's peer list.
+ *
+ * Unless @restore is set, the new peerinfo is also persisted with
+ * glusterd_store_peerinfo() and its RPC connection is created. If either of
+ * those steps fails, the peerinfo is cleaned up and *@friend is reset to NULL.
+ *
+ * @friend receives the new peerinfo; @uuid and @args are passed through to
+ * glusterd_peerinfo_new() and glusterd_friend_rpc_create() respectively.
+ *
+ * Returns 0 on success, non-zero on failure.
+ */
+int
+glusterd_friend_add(const char *hoststr, int port,
+                    glusterd_friend_sm_state_t state, uuid_t *uuid,
+                    glusterd_peerinfo_t **friend, gf_boolean_t restore,
+                    glusterd_peerctx_args_t *args)
+{
+    xlator_t *this = THIS;
+    glusterd_conf_t *conf = this->private;
+    glusterd_peerinfo_t *new_peer = NULL;
+    int ret = 0;
+
+    GF_ASSERT(conf);
+    GF_ASSERT(hoststr);
+    GF_ASSERT(friend);
+
+    new_peer = glusterd_peerinfo_new(state, uuid, hoststr, port);
+    *friend = new_peer;
+    if (!new_peer) {
+        ret = -1;
+        gf_smsg(this->name, GF_LOG_ERROR, errno, GD_MSG_PEER_ADD_FAIL, NULL);
+        goto out;
+    }
+
+    /*
+     * The peer has to be on the list before glusterd_friend_rpc_create() is
+     * called: once the RPC exists, the callback that removes and frees the
+     * peer may already have run (notably for an invalid peer name), and
+     * adding a freed object afterwards would likely crash later.
+     */
+    cds_list_add_tail_rcu(&new_peer->uuid_list, &conf->peers);
+
+    /*
+     * During restore the complete peer list is built first and the RPCs are
+     * created afterwards; otherwise rpc_notify would see a partially
+     * constructed friend list and compute quorum wrongly.
+     */
+    if (restore)
+        goto out;
+
+    ret = glusterd_store_peerinfo(new_peer);
+    if (ret) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_PEERINFO_CREATE_FAIL,
+               "Failed to store peerinfo");
+        gf_event(EVENT_PEER_STORE_FAILURE, "peer=%s", new_peer->hostname);
+    } else {
+        ret = glusterd_friend_rpc_create(this, new_peer, args);
+    }
+
+    if (ret) {
+        (void)glusterd_peerinfo_cleanup(new_peer);
+        *friend = NULL;
+    }
+
+out:
+    gf_msg(this->name, GF_LOG_INFO, 0, GD_MSG_CONNECT_RETURNED,
+           "connect returned %d", ret);
+    return ret;
+}
+
+/* glusterd_friend_add_from_peerinfo() adds a new peer into the local friends
+ * list from a pre-created peerinfo object (@friend). It otherwise works
+ * similarly to glusterd_friend_add(), except that @friend is not cleaned up
+ * here when storing it or creating its RPC fails.
+ *
+ * @restore: when set, the peer is only added to the list; it is neither
+ *           stored nor connected.
+ * @args:    passed through to glusterd_friend_rpc_create().
+ *
+ * Returns 0 on success and non-zero on failure, including when @friend is
+ * NULL.
+ */
+int
+glusterd_friend_add_from_peerinfo(glusterd_peerinfo_t *friend,
+                                  gf_boolean_t restore,
+                                  glusterd_peerctx_args_t *args)
+{
+    int ret = -1;
+    xlator_t *this = NULL;
+    glusterd_conf_t *conf = NULL;
+
+    this = THIS;
+    conf = this->private;
+    GF_ASSERT(conf);
+
+    /* ret is still -1 here, so a NULL @friend is reported as a failure. */
+    GF_VALIDATE_OR_GOTO(this->name, (friend != NULL), out);
+
+    /*
+     * We can't add to the list after calling glusterd_friend_rpc_create,
+     * even if it succeeds, because by then the callback to take it back
+     * off and free might have happened already (notably in the case of an
+     * invalid peer name).  That would mean we're adding something that had
+     * just been freed, and we're likely to crash later.
+     */
+    cds_list_add_tail_rcu(&friend->uuid_list, &conf->peers);
+
+    /* Adding to the list cannot fail; only the steps below may. */
+    ret = 0;
+
+    // restore needs to first create the list of peers, then create rpcs
+    // to keep track of quorum in race-free manner. In restore for each peer
+    // rpc-create calls rpc_notify when the friend-list is partially
+    // constructed, leading to wrong quorum calculations.
+    if (!restore) {
+        ret = glusterd_store_peerinfo(friend);
+        if (ret == 0) {
+            ret = glusterd_friend_rpc_create(this, friend, args);
+        } else {
+            gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_PEERINFO_CREATE_FAIL,
+                   "Failed to store peerinfo");
+            gf_event(EVENT_PEER_STORE_FAILURE, "peer=%s", friend->hostname);
+        }
+    }
+
+out:
+    gf_msg(this->name, GF_LOG_INFO, 0, GD_MSG_CONNECT_RETURNED,
+           "connect returned %d", ret);
+    return ret;
+}
+
+/*
+ * glusterd_probe_begin() starts a peer probe for @hoststr/@port on behalf of
+ * the CLI request @req.
+ *
+ *  - Unknown host: a new friend is added with the request details attached;
+ *    GLUSTERD_CONNECTION_AWAITED is returned if the add succeeded but the
+ *    peer is not connected yet.
+ *  - Known, connected and befriended host: its hostname is updated, a
+ *    NEW_NAME event is injected and the CLI is answered with
+ *    GF_PROBE_SUCCESS. If the peer is being detached, -1 is returned and
+ *    *@op_errno (when provided) is set to GF_PROBE_FRIEND_DETACHING.
+ *  - Any other known host: the CLI is answered with GF_PROBE_FRIEND.
+ *
+ * The whole lookup and update runs inside an RCU read-side section.
+ */
+int
+glusterd_probe_begin(rpcsvc_request_t *req, const char *hoststr, int port,
+                     dict_t *dict, int *op_errno)
+{
+    glusterd_friend_sm_event_t *event = NULL;
+    glusterd_peerinfo_t *peerinfo = NULL;
+    glusterd_peerctx_args_t args = {0};
+    int ret = -1;
+
+    GF_ASSERT(hoststr);
+
+    RCU_READ_LOCK;
+    peerinfo = glusterd_peerinfo_find(NULL, hoststr);
+
+    if (!peerinfo) {
+        gf_msg("glusterd", GF_LOG_INFO, 0, GD_MSG_PEER_NOT_FOUND,
+               "Unable to find peerinfo"
+               " for host: %s (%d)",
+               hoststr, port);
+        args.mode = GD_MODE_ON;
+        args.req = req;
+        args.dict = dict;
+        ret = glusterd_friend_add(hoststr, port, GD_FRIEND_STATE_DEFAULT, NULL,
+                                  &peerinfo, 0, &args);
+        if (ret == 0 && !peerinfo->connected)
+            ret = GLUSTERD_CONNECTION_AWAITED;
+        goto out;
+    }
+
+    if (!peerinfo->connected ||
+        (GD_FRIEND_STATE_BEFRIENDED != peerinfo->state.state)) {
+        /* Already in the peer list, but not a connected friend. */
+        glusterd_xfer_cli_probe_resp(req, 0, GF_PROBE_FRIEND, NULL,
+                                     (char *)hoststr, port, dict);
+        ret = 0;
+        goto out;
+    }
+
+    if (peerinfo->detaching) {
+        ret = -1;
+        if (op_errno)
+            *op_errno = GF_PROBE_FRIEND_DETACHING;
+        goto out;
+    }
+
+    ret = glusterd_peer_hostname_update(peerinfo, hoststr, _gf_false);
+    if (ret)
+        goto out;
+
+    /* Inject a NEW_NAME event so that the cluster gets updated. */
+    ret = glusterd_friend_sm_new_event(GD_FRIEND_EVENT_NEW_NAME, &event);
+    if (ret)
+        goto out;
+
+    event->peername = gf_strdup(peerinfo->hostname);
+    gf_uuid_copy(event->peerid, peerinfo->uuid);
+
+    ret = glusterd_friend_sm_inject_event(event);
+    glusterd_xfer_cli_probe_resp(req, 0, GF_PROBE_SUCCESS, NULL,
+                                 (char *)hoststr, port, dict);
+
+out:
+    RCU_READ_UNLOCK;
+    gf_msg_debug("glusterd", 0, "returning %d", ret);
+    return ret;
+}
+
+/*
+ * glusterd_deprobe_begin() starts detaching the peer identified by
+ * @uuid/@hoststr on behalf of the CLI request @req.
+ *
+ * It looks the peer up, and if the peer has an RPC and is not already being
+ * detached, injects an INIT_REMOVE_FRIEND event carrying a probe context
+ * (hostname, port, request and dict) and marks the peer as detaching.
+ *
+ * Returns 0 when the event was injected. Returns non-zero when the peer is
+ * not found, has no RPC, is already detaching (in which case *@op_errno, when
+ * provided, is set to GF_DEPROBE_FRIEND_DETACHING), or when allocating the
+ * context, creating the event or injecting it fails.
+ *
+ * The lookup and the update run inside an RCU read-side section.
+ */
+int
+glusterd_deprobe_begin(rpcsvc_request_t *req, const char *hoststr, int port,
+                       uuid_t uuid, dict_t *dict, int *op_errno)
+{
+    int ret = -1;
+    glusterd_peerinfo_t *peerinfo = NULL;
+    glusterd_friend_sm_event_t *event = NULL;
+    glusterd_probe_ctx_t *ctx = NULL;
+
+    GF_ASSERT(hoststr);
+    GF_ASSERT(req);
+
+    RCU_READ_LOCK;
+
+    peerinfo = glusterd_peerinfo_find(uuid, hoststr);
+    if (peerinfo == NULL) {
+        ret = -1;
+        gf_msg("glusterd", GF_LOG_INFO, 0, GD_MSG_PEER_NOT_FOUND,
+               "Unable to find peerinfo"
+               " for host: %s %d",
+               hoststr, port);
+        goto out;
+    }
+
+    if (!peerinfo->rpc) {
+        // handle this case
+        goto out;
+    }
+
+    if (peerinfo->detaching) {
+        ret = -1;
+        if (op_errno)
+            *op_errno = GF_DEPROBE_FRIEND_DETACHING;
+        goto out;
+    }
+
+    /*
+     * Allocate the probe context before creating the event: if the
+     * allocation fails there is then no event left behind, and the failure
+     * is reported instead of returning the 0 left by a successful event
+     * creation.
+     */
+    ctx = GF_CALLOC(1, sizeof(*ctx), gf_gld_mt_probe_ctx_t);
+    if (!ctx) {
+        ret = -1;
+        gf_smsg("glusterd", GF_LOG_ERROR, errno, GD_MSG_NO_MEMORY, NULL);
+        goto out;
+    }
+
+    ret = glusterd_friend_sm_new_event(GD_FRIEND_EVENT_INIT_REMOVE_FRIEND,
+                                       &event);
+    if (ret) {
+        gf_msg("glusterd", GF_LOG_ERROR, 0, GD_MSG_EVENT_NEW_GET_FAIL,
+               "Unable to get new event");
+        /* Nothing references the context yet, so release it here. */
+        GF_FREE(ctx);
+        goto out;
+    }
+
+    ctx->hostname = gf_strdup(hoststr);
+    ctx->port = port;
+    ctx->req = req;
+    ctx->dict = dict;
+
+    event->ctx = ctx;
+
+    event->peername = gf_strdup(hoststr);
+    gf_uuid_copy(event->peerid, uuid);
+
+    ret = glusterd_friend_sm_inject_event(event);
+
+    if (ret) {
+        gf_msg("glusterd", GF_LOG_ERROR, 0, GD_MSG_EVENT_INJECT_FAIL,
+               "Unable to inject event %d, "
+               "ret = %d",
+               event->event, ret);
+        goto out;
+    }
+    peerinfo->detaching = _gf_true;
+
+out:
+    RCU_READ_UNLOCK;
+    return ret;
+}
+
+/*
+ * glusterd_xfer_friend_remove_resp() answers a friend-remove request.
+ *
+ * The reply carries op_ret 0, this node's uuid and the given @hostname and
+ * @port. Returns whatever glusterd_submit_reply() returns.
+ */
+int
+glusterd_xfer_friend_remove_resp(rpcsvc_request_t *req, char *hostname,
+                                 int port)
+{
+    xlator_t *this = THIS;
+    int32_t ret = -1;
+    gd1_mgmt_friend_rsp rsp = {
+        {0},
+    };
+
+    GF_ASSERT(hostname);
+    GF_ASSERT(this);
+
+    /* Fill in the reply: always success, identified by our own uuid. */
+    rsp.op_ret = 0;
+    rsp.port = port;
+    rsp.hostname = hostname;
+    gf_uuid_copy(rsp.uuid, MY_UUID);
+
+    ret = glusterd_submit_reply(req, &rsp, NULL, 0, NULL,
+                                (xdrproc_t)xdr_gd1_mgmt_friend_rsp);
+
+    gf_msg("glusterd", GF_LOG_INFO, 0, GD_MSG_RESPONSE_INFO,
+           "Responded to %s (%d), ret: %d", hostname, port, ret);
+    return ret;
+}
+
+/*
+ * glusterd_xfer_friend_add_resp() answers a friend-add request.
+ *
+ * The reply carries this node's uuid, @myhostname, @port and the given
+ * @op_ret/@op_errno. @remote_hostname is used for logging only.
+ *
+ * The reply points at @myhostname directly instead of a temporary copy: the
+ * string is only needed until glusterd_submit_reply() returns, and this
+ * avoids an allocation whose failure was never checked.
+ *
+ * Returns whatever glusterd_submit_reply() returns.
+ */
+int
+glusterd_xfer_friend_add_resp(rpcsvc_request_t *req, char *myhostname,
+                              char *remote_hostname, int port, int32_t op_ret,
+                              int32_t op_errno)
+{
+    gd1_mgmt_friend_rsp rsp = {
+        {0},
+    };
+    int32_t ret = -1;
+    xlator_t *this = NULL;
+
+    GF_ASSERT(myhostname);
+
+    this = THIS;
+    GF_ASSERT(this);
+
+    gf_uuid_copy(rsp.uuid, MY_UUID);
+    rsp.op_ret = op_ret;
+    rsp.op_errno = op_errno;
+    rsp.hostname = myhostname;
+    rsp.port = port;
+
+    ret = glusterd_submit_reply(req, &rsp, NULL, 0, NULL,
+                                (xdrproc_t)xdr_gd1_mgmt_friend_rsp);
+
+    gf_msg("glusterd", GF_LOG_INFO, 0, GD_MSG_RESPONSE_INFO,
+           "Responded to %s (%d), ret: %d, op_ret: %d", remote_hostname, port,
+           ret, op_ret);
+    return ret;
+}
+
+/*
+ * set_probe_error_str() writes the human-readable outcome of a peer probe
+ * into @errstr (at most @len bytes).
+ *
+ * A non-empty @op_errstr is copied verbatim. Otherwise the text is chosen
+ * from @op_errno: one set of messages when @op_ret is 0 and another when it
+ * is non-zero; codes without a dedicated message are rendered through
+ * strerror(). @errstr is left untouched when @op_ret and @op_errno are both
+ * 0. @hostname and @port are used only to fill in the messages.
+ */
+static void
+set_probe_error_str(int op_ret, int op_errno, char *op_errstr, char *errstr,
+                    size_t len, char *hostname, int port)
+{
+    /* An explicit message from the caller always wins. */
+    if (op_errstr && op_errstr[0] != '\0') {
+        snprintf(errstr, len, "%s", op_errstr);
+        return;
+    }
+
+    if (op_ret == 0) {
+        if (op_errno == GF_PROBE_LOCALHOST) {
+            snprintf(errstr, len, "Probe on localhost not needed");
+        } else if (op_errno == GF_PROBE_FRIEND) {
+            snprintf(errstr, len, "Host %s port %d already in peer list",
+                     hostname, port);
+        } else if (op_errno == GF_PROBE_FRIEND_DETACHING) {
+            snprintf(errstr, len,
+                     "Peer is already being detached from cluster.\n"
+                     "Check peer status by running gluster peer status");
+        } else if (op_errno != 0) {
+            snprintf(errstr, len, "Probe returned with %s",
+                     strerror(op_errno));
+        }
+        return;
+    }
+
+    /* From here on the probe has failed. */
+    if (op_errno == GF_PROBE_ANOTHER_CLUSTER) {
+        snprintf(errstr, len,
+                 "%s is either already part of another cluster or "
+                 "having volumes configured",
+                 hostname);
+    } else if (op_errno == GF_PROBE_VOLUME_CONFLICT) {
+        snprintf(errstr, len,
+                 "At least one volume on %s conflicts with existing "
+                 "volumes in the cluster",
+                 hostname);
+    } else if (op_errno == GF_PROBE_UNKNOWN_PEER) {
+        snprintf(errstr, len,
+                 "%s responded with 'unknown peer' error, this could "
+                 "happen if %s doesn't have localhost in its peer database",
+                 hostname, hostname);
+    } else if (op_errno == GF_PROBE_ADD_FAILED) {
+        snprintf(errstr, len, "Failed to add peer information on %s",
+                 hostname);
+    } else if (op_errno == GF_PROBE_SAME_UUID) {
+        snprintf(errstr, len, "Peer uuid (host %s) is same as local uuid",
+                 hostname);
+    } else if (op_errno == GF_PROBE_QUORUM_NOT_MET) {
+        snprintf(errstr, len,
+                 "Cluster quorum is not met. Changing peers is not "
+                 "allowed in this state");
+    } else if (op_errno == GF_PROBE_MISSED_SNAP_CONFLICT) {
+        snprintf(errstr, len,
+                 "Failed to update list of missed snapshots from peer %s",
+                 hostname);
+    } else if (op_errno == GF_PROBE_SNAP_CONFLICT) {
+        snprintf(errstr, len,
+                 "Conflict in comparing list of snapshots from peer %s",
+                 hostname);
+    } else {
+        snprintf(errstr, len, "Probe returned with %s", strerror(op_errno));
+    }
+}
+
+/*
+ * glusterd_xfer_cli_probe_resp() sends the result of a peer probe back to the
+ * CLI.
+ *
+ * The error text is derived from @op_ret, @op_errno and @op_errstr via
+ * set_probe_error_str(), the command (the "cmd-str" key of @dict, when
+ * present) is written to the command log together with the outcome, and the
+ * reply is submitted on @req.
+ *
+ * @dict, when non-NULL, is unref'ed before returning.
+ *
+ * Returns whatever glusterd_submit_reply() returns.
+ */
+int
+glusterd_xfer_cli_probe_resp(rpcsvc_request_t *req, int32_t op_ret,
+                             int32_t op_errno, char *op_errstr, char *hostname,
+                             int port, dict_t *dict)
+{
+    gf_cli_rsp rsp = {
+        0,
+    };
+    int32_t ret = -1;
+    char errstr[2048] = {
+        0,
+    };
+    char *cmd_str = NULL;
+    xlator_t *this = THIS;
+
+    GF_ASSERT(req);
+    GF_ASSERT(this);
+
+    set_probe_error_str(op_ret, op_errno, op_errstr, errstr, sizeof(errstr),
+                        hostname, port);
+
+    if (dict) {
+        ret = dict_get_strn(dict, "cmd-str", SLEN("cmd-str"), &cmd_str);
+        if (ret)
+            gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_CMDSTR_NOTFOUND_IN_DICT,
+                   "Failed to get "
+                   "command string");
+    }
+
+    rsp.op_ret = op_ret;
+    rsp.op_errno = op_errno;
+    rsp.op_errstr = (errstr[0] != '\0') ? errstr : "";
+
+    /* cmd_str stays NULL when there is no dict or no "cmd-str" key; a NULL
+     * pointer must not be handed to a %s conversion. */
+    gf_cmd_log("", "%s : %s %s %s", cmd_str ? cmd_str : "(null)",
+               (op_ret) ? "FAILED" : "SUCCESS",
+               (errstr[0] != '\0') ? ":" : " ",
+               (errstr[0] != '\0') ? errstr : " ");
+
+    ret = glusterd_submit_reply(req, &rsp, NULL, 0, NULL,
+                                (xdrproc_t)xdr_gf_cli_rsp);
+
+    if (dict)
+        dict_unref(dict);
+    gf_msg_debug(this->name, 0, "Responded to CLI, ret: %d", ret);
+
+    return ret;
+}
+
+/*
+ * set_deprobe_error_str() writes the human-readable outcome of a peer detach
+ * into @errstr (at most @len bytes).
+ *
+ * A non-empty @op_errstr is copied verbatim. Otherwise, when @op_ret is
+ * non-zero, the text is chosen from @op_errno, with strerror() as the
+ * fallback. When @op_ret is 0 and no @op_errstr is given, @errstr is left
+ * untouched.
+ */
+static void
+set_deprobe_error_str(int op_ret, int op_errno, char *op_errstr, char *errstr,
+                      size_t len, char *hostname)
+{
+    /* An explicit message from the caller always wins. */
+    if (op_errstr && op_errstr[0] != '\0') {
+        snprintf(errstr, len, "%s", op_errstr);
+        return;
+    }
+
+    /* A successful detach has nothing to report. */
+    if (!op_ret)
+        return;
+
+    switch (op_errno) {
+        case GF_DEPROBE_LOCALHOST:
+            snprintf(errstr, len, "%s is localhost", hostname);
+            return;
+
+        case GF_DEPROBE_NOT_FRIEND:
+            snprintf(errstr, len, "%s is not part of cluster", hostname);
+            return;
+
+        case GF_DEPROBE_BRICK_EXIST:
+            snprintf(errstr, len,
+                     "Peer %s hosts one or more bricks. If the peer is in not "
+                     "recoverable state then use either replace-brick or "
+                     "remove-brick command with force to remove all bricks "
+                     "from the peer and attempt the peer detach again.",
+                     hostname);
+            return;
+
+        case GF_DEPROBE_SNAP_BRICK_EXIST:
+            snprintf(errstr, len,
+                     "%s is part of existing snapshot. "
+                     "Remove those snapshots before proceeding ",
+                     hostname);
+            return;
+
+        case GF_DEPROBE_FRIEND_DOWN:
+            snprintf(errstr, len,
+                     "One of the peers is probably down. "
+                     "Check with 'peer status'");
+            return;
+
+        case GF_DEPROBE_QUORUM_NOT_MET:
+            snprintf(errstr, len,
+                     "Cluster quorum is not met. Changing peers is not "
+                     "allowed in this state");
+            return;
+
+        case GF_DEPROBE_FRIEND_DETACHING:
+            snprintf(errstr, len,
+                     "Peer is already being detached from cluster.\n"
+                     "Check peer status by running gluster peer status");
+            return;
+
+        default:
+            snprintf(errstr, len, "Detach returned with %s",
+                     strerror(op_errno));
+            return;
+    }
+}
+
+/*
+ * glusterd_xfer_cli_deprobe_resp() sends the result of a peer detach back to
+ * the CLI.
+ *
+ * The error text is derived from @op_ret, @op_errno and @op_errstr via
+ * set_deprobe_error_str(), the command (the "cmd-str" key of @dict, when
+ * present) is written to the command log together with the outcome, and the
+ * reply is submitted on @req. @dict is only read here; it is not unref'ed.
+ *
+ * Returns whatever glusterd_submit_reply() returns.
+ */
+int
+glusterd_xfer_cli_deprobe_resp(rpcsvc_request_t *req, int32_t op_ret,
+                               int32_t op_errno, char *op_errstr,
+                               char *hostname, dict_t *dict)
+{
+    gf_cli_rsp rsp = {
+        0,
+    };
+    int32_t ret = -1;
+    char *cmd_str = NULL;
+    char errstr[2048] = {
+        0,
+    };
+
+    GF_ASSERT(req);
+
+    set_deprobe_error_str(op_ret, op_errno, op_errstr, errstr, sizeof(errstr),
+                          hostname);
+
+    if (dict) {
+        ret = dict_get_strn(dict, "cmd-str", SLEN("cmd-str"), &cmd_str);
+        if (ret)
+            gf_msg(THIS->name, GF_LOG_ERROR, 0, GD_MSG_CMDSTR_NOTFOUND_IN_DICT,
+                   "Failed to get "
+                   "command string");
+    }
+
+    rsp.op_ret = op_ret;
+    rsp.op_errno = op_errno;
+    rsp.op_errstr = (errstr[0] != '\0') ? errstr : "";
+
+    /* cmd_str stays NULL when there is no dict or no "cmd-str" key; a NULL
+     * pointer must not be handed to a %s conversion. */
+    gf_cmd_log("", "%s : %s %s %s", cmd_str ? cmd_str : "(null)",
+               (op_ret) ? "FAILED" : "SUCCESS",
+               (errstr[0] != '\0') ? ":" : " ",
+               (errstr[0] != '\0') ? errstr : " ");
+
+    ret = glusterd_submit_reply(req, &rsp, NULL, 0, NULL,
+                                (xdrproc_t)xdr_gf_cli_rsp);
+
+    gf_msg_debug(THIS->name, 0, "Responded to CLI, ret: %d", ret);
+
+    return ret;
+}
+
+/*
+ * glusterd_list_friends() answers a CLI peer-list request on @req.
+ *
+ * Every entry of the peer list is added to a dictionary under an increasing
+ * index. When @flags is GF_CLI_LIST_POOL_NODES an extra entry describing this
+ * node (its uuid, hostname "localhost", connected = 1) is appended. The
+ * dictionary, together with the entry count, is serialized into the reply.
+ *
+ * Failures are reported to the CLI through rsp.op_ret; the function itself
+ * always returns 0. @dict is not used.
+ */
+int32_t
+glusterd_list_friends(rpcsvc_request_t *req, dict_t *dict, int32_t flags)
+{
+    xlator_t *this = THIS;
+    glusterd_conf_t *priv = NULL;
+    glusterd_peerinfo_t *peer = NULL;
+    dict_t *friends = NULL;
+    int32_t ret = -1;
+    int32_t count = 0;
+    int keylen;
+    char key[64] = {
+        0,
+    };
+    char my_uuid_str[64] = {
+        0,
+    };
+    gf1_cli_peer_list_rsp rsp = {
+        0,
+    };
+
+    GF_ASSERT(this);
+
+    priv = this->private;
+    GF_ASSERT(priv);
+
+    friends = dict_new();
+    if (friends == NULL) {
+        gf_smsg(this->name, GF_LOG_ERROR, errno, GD_MSG_DICT_CREATE_FAIL, NULL);
+        goto out;
+    }
+
+    /* An empty peer list is not an error, so start from success. */
+    ret = 0;
+    RCU_READ_LOCK;
+    if (!cds_list_empty(&priv->peers)) {
+        cds_list_for_each_entry_rcu(peer, &priv->peers, uuid_list)
+        {
+            count++;
+            ret = gd_add_peer_detail_to_dict(peer, friends, count);
+            if (ret)
+                break;
+        }
+    }
+    RCU_READ_UNLOCK;
+    if (ret)
+        goto out;
+
+    if (flags == GF_CLI_LIST_POOL_NODES) {
+        /* Append this node itself as the last entry. */
+        count++;
+
+        uuid_utoa_r(MY_UUID, my_uuid_str);
+        keylen = snprintf(key, sizeof(key), "friend%d.uuid", count);
+        ret = dict_set_strn(friends, key, keylen, my_uuid_str);
+        if (ret) {
+            gf_smsg(this->name, GF_LOG_ERROR, errno, GD_MSG_DICT_SET_FAILED,
+                    "Key=%s", key, NULL);
+            goto out;
+        }
+
+        keylen = snprintf(key, sizeof(key), "friend%d.hostname", count);
+        ret = dict_set_nstrn(friends, key, keylen, "localhost",
+                             SLEN("localhost"));
+        if (ret) {
+            gf_smsg(this->name, GF_LOG_ERROR, errno, GD_MSG_DICT_SET_FAILED,
+                    "Key=%s", key, NULL);
+            goto out;
+        }
+
+        keylen = snprintf(key, sizeof(key), "friend%d.connected", count);
+        ret = dict_set_int32n(friends, key, keylen, 1);
+        if (ret) {
+            gf_smsg(this->name, GF_LOG_ERROR, errno, GD_MSG_DICT_SET_FAILED,
+                    "Key=%s", key, NULL);
+            goto out;
+        }
+    }
+
+    ret = dict_set_int32n(friends, "count", SLEN("count"), count);
+    if (ret) {
+        gf_smsg(this->name, GF_LOG_ERROR, errno, GD_MSG_DICT_SET_FAILED,
+                "Key=count", NULL);
+        goto out;
+    }
+
+    ret = dict_allocate_and_serialize(friends, &rsp.friends.friends_val,
+                                      &rsp.friends.friends_len);
+
+out:
+    if (friends)
+        dict_unref(friends);
+
+    /* The outcome travels in the reply; the handler itself reports success. */
+    rsp.op_ret = ret;
+
+    glusterd_submit_reply(req, &rsp, NULL, 0, NULL,
+                          (xdrproc_t)xdr_gf1_cli_peer_list_rsp);
+    GF_FREE(rsp.friends.friends_val);
+
+    return 0;
+}
+
+/*
+ * glusterd_get_volumes() answers a CLI "get volume" request on @req.
+ *
+ * Depending on @flags the reply dictionary is filled with
+ *  - GF_CLI_GET_VOLUME_ALL:  the details of every volume,
+ *  - GF_CLI_GET_NEXT_VOLUME: the volume following the one named by the
+ *                            "volname" key of @dict, or the first volume
+ *                            when that key is absent,
+ *  - GF_CLI_GET_VOLUME:      the volume named by "volname".
+ * The number of volumes added is stored under "count".
+ *
+ * A GF_CLI_GET_VOLUME request is answered with "Volume does not exist" when
+ * the volume list is empty or the lookup of the named volume returns -1.
+ * Other failures while collecting volume details do not fail the reply:
+ * whatever was collected so far is sent, and op_ret reflects only the final
+ * "count"/serialize steps.
+ *
+ * The function itself always returns 0.
+ */
+int32_t
+glusterd_get_volumes(rpcsvc_request_t *req, dict_t *dict, int32_t flags)
+{
+    glusterd_conf_t *priv = NULL;
+    glusterd_volinfo_t *vol = NULL;
+    dict_t *volumes = NULL;
+    char *volname = NULL;
+    int32_t ret = -1;
+    int32_t novol = 0; /* -1 once the requested volume is known to be missing */
+    int32_t count = 0;
+    gf_cli_rsp rsp = {
+        0,
+    };
+
+    priv = THIS->private;
+    GF_ASSERT(priv);
+
+    volumes = dict_new();
+    if (volumes == NULL) {
+        gf_msg("glusterd", GF_LOG_ERROR, ENOMEM, GD_MSG_NO_MEMORY,
+               "Out of Memory");
+        goto out;
+    }
+
+    if (cds_list_empty(&priv->volumes)) {
+        if (flags == GF_CLI_GET_VOLUME)
+            novol = -1;
+        goto respond;
+    }
+
+    if (flags == GF_CLI_GET_VOLUME_ALL) {
+        cds_list_for_each_entry(vol, &priv->volumes, vol_list)
+        {
+            ret = glusterd_add_volume_detail_to_dict(vol, volumes, count);
+            if (ret)
+                break;
+
+            count++;
+        }
+    } else if (flags == GF_CLI_GET_NEXT_VOLUME) {
+        if (dict_get_strn(dict, "volname", SLEN("volname"), &volname)) {
+            /* No name given: start with the first entry of the list. */
+            if (priv->volumes.next)
+                vol = cds_list_entry(priv->volumes.next, typeof(*vol),
+                                     vol_list);
+        } else {
+            if (glusterd_volinfo_find(volname, &vol))
+                goto respond;
+            vol = cds_list_entry(vol->vol_list.next, typeof(*vol), vol_list);
+        }
+
+        /* Arriving back at the list head means there is no further volume. */
+        if (&vol->vol_list != &priv->volumes) {
+            ret = glusterd_add_volume_detail_to_dict(vol, volumes, count);
+            if (!ret)
+                count++;
+        }
+    } else if (flags == GF_CLI_GET_VOLUME) {
+        ret = dict_get_strn(dict, "volname", SLEN("volname"), &volname);
+        if (!ret) {
+            ret = glusterd_volinfo_find(volname, &vol);
+            if (ret) {
+                novol = ret;
+            } else {
+                ret = glusterd_add_volume_detail_to_dict(vol, volumes, count);
+                if (!ret)
+                    count++;
+            }
+        }
+    }
+
+respond:
+    /* ret is recomputed here; earlier collection errors are not propagated. */
+    ret = dict_set_int32n(volumes, "count", SLEN("count"), count);
+    if (!ret)
+        ret = dict_allocate_and_serialize(volumes, &rsp.dict.dict_val,
+                                          &rsp.dict.dict_len);
+
+out:
+    if (novol == -1) {
+        rsp.op_ret = novol;
+        rsp.op_errstr = "Volume does not exist";
+        rsp.op_errno = EG_NOVOL;
+    } else {
+        rsp.op_ret = ret;
+        rsp.op_errstr = "";
+    }
+    glusterd_submit_reply(req, &rsp, NULL, 0, NULL, (xdrproc_t)xdr_gf_cli_rsp);
+
+    if (volumes)
+        dict_unref(volumes);
+
+    GF_FREE(rsp.dict.dict_val);
+    return 0;
+}
+
+/*
+ * __glusterd_handle_status_volume() handles a CLI "volume status" request.
+ *
+ * The request is decoded and its dictionary unserialized, the "cmd" flags are
+ * read, and - unless GF_CLI_STATUS_ALL is set - the volume name is required.
+ * Sub-commands that need a newer cluster op-version than the current one
+ * (client-list, quotad, snapd, bitd, scrub) are rejected with an explanatory
+ * message. Otherwise the operation is started via
+ * glusterd_op_begin_synctask().
+ *
+ * On any failure an error response is sent to the CLI, with
+ * "Operation failed" as the default text, and the result of sending that
+ * response is returned. Returns 0 when the operation was started.
+ */
+int
+__glusterd_handle_status_volume(rpcsvc_request_t *req)
+{
+    int32_t ret = -1;
+    uint32_t cmd = 0;
+    dict_t *dict = NULL;
+    char *volname = NULL;
+    gf_cli_req cli_req = {{
+        0,
+    }};
+    glusterd_op_t cli_op = GD_OP_STATUS_VOLUME;
+    char err_str[256] = {
+        0,
+    };
+    xlator_t *this = NULL;
+    glusterd_conf_t *conf = NULL;
+
+    GF_ASSERT(req);
+    this = THIS;
+    GF_ASSERT(this);
+    conf = this->private;
+    GF_ASSERT(conf);
+
+    ret = xdr_to_generic(req->msg[0], &cli_req, (xdrproc_t)xdr_gf_cli_req);
+    if (ret < 0) {
+        // failed to decode msg;
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_REQ_DECODE_FAIL,
+               "Failed to decode "
+               "request received from cli");
+        req->rpc_err = GARBAGE_ARGS;
+        goto out;
+    }
+
+    if (cli_req.dict.dict_len > 0) {
+        dict = dict_new();
+        if (!dict) {
+            gf_smsg(this->name, GF_LOG_ERROR, errno, GD_MSG_DICT_CREATE_FAIL,
+                    NULL);
+            /* ret still holds the non-negative decode result at this point;
+             * make the failure explicit. */
+            ret = -1;
+            goto out;
+        }
+        ret = dict_unserialize(cli_req.dict.dict_val, cli_req.dict.dict_len,
+                               &dict);
+        if (ret < 0) {
+            gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_DICT_UNSERIALIZE_FAIL,
+                   "failed to "
+                   "unserialize buffer");
+            snprintf(err_str, sizeof(err_str),
+                     "Unable to decode "
+                     "the command");
+            goto out;
+        }
+    }
+
+    ret = dict_get_uint32(dict, "cmd", &cmd);
+    if (ret)
+        goto out;
+
+    /* A volume name is only needed when not asking about all volumes. */
+    if (!(cmd & GF_CLI_STATUS_ALL)) {
+        ret = dict_get_strn(dict, "volname", SLEN("volname"), &volname);
+        if (ret) {
+            snprintf(err_str, sizeof(err_str),
+                     "Unable to get "
+                     "volume name");
+            gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_VOL_NOT_FOUND, "%s",
+                   err_str);
+            goto out;
+        }
+        gf_msg(this->name, GF_LOG_INFO, 0, GD_MSG_STATUS_VOL_REQ_RCVD,
+               "Received status volume req for volume %s", volname);
+    }
+
+    /* Reject sub-commands the current cluster op-version does not support. */
+    if ((cmd & GF_CLI_STATUS_CLIENT_LIST) &&
+        (conf->op_version < GD_OP_VERSION_3_13_0)) {
+        snprintf(err_str, sizeof(err_str),
+                 "The cluster is operating "
+                 "at version less than %d. Getting the client-list "
+                 "is not allowed in this state.",
+                 GD_OP_VERSION_3_13_0);
+        ret = -1;
+        goto out;
+    }
+
+    if ((cmd & GF_CLI_STATUS_QUOTAD) &&
+        (conf->op_version == GD_OP_VERSION_MIN)) {
+        snprintf(err_str, sizeof(err_str),
+                 "The cluster is operating "
+                 "at version 1. Getting the status of quotad is not "
+                 "allowed in this state.");
+        ret = -1;
+        goto out;
+    }
+
+    if ((cmd & GF_CLI_STATUS_SNAPD) &&
+        (conf->op_version < GD_OP_VERSION_3_6_0)) {
+        snprintf(err_str, sizeof(err_str),
+                 "The cluster is operating "
+                 "at a lesser version than %d. Getting the status of "
+                 "snapd is not allowed in this state",
+                 GD_OP_VERSION_3_6_0);
+        ret = -1;
+        goto out;
+    }
+
+    if ((cmd & GF_CLI_STATUS_BITD) &&
+        (conf->op_version < GD_OP_VERSION_3_7_0)) {
+        snprintf(err_str, sizeof(err_str),
+                 "The cluster is operating "
+                 "at a lesser version than %d. Getting the status of "
+                 "bitd is not allowed in this state",
+                 GD_OP_VERSION_3_7_0);
+        ret = -1;
+        goto out;
+    }
+
+    if ((cmd & GF_CLI_STATUS_SCRUB) &&
+        (conf->op_version < GD_OP_VERSION_3_7_0)) {
+        snprintf(err_str, sizeof(err_str),
+                 "The cluster is operating "
+                 "at a lesser version than %d. Getting the status of "
+                 "scrub is not allowed in this state",
+                 GD_OP_VERSION_3_7_0);
+        ret = -1;
+        goto out;
+    }
+
+    ret = glusterd_op_begin_synctask(req, GD_OP_STATUS_VOLUME, dict);
+
+out:
+
+    if (ret) {
+        if (err_str[0] == '\0')
+            snprintf(err_str, sizeof(err_str), "Operation failed");
+        ret = glusterd_op_send_cli_response(cli_op, ret, 0, req, dict, err_str);
+    }
+    /* Released with plain free(), exactly as the original code does. */
+    free(cli_req.dict.dict_val);
+
+    return ret;
+}
+
+/*
+ * RPC entry point for "volume status": runs
+ * __glusterd_handle_status_volume() through glusterd_big_locked_handler()
+ * and returns its result.
+ */
+int
+glusterd_handle_status_volume(rpcsvc_request_t *req)
+{
+    int ret = glusterd_big_locked_handler(req,
+                                          __glusterd_handle_status_volume);
+
+    return ret;
+}
+
+/*
+ * __glusterd_handle_cli_clearlocks_volume() handles a CLI "volume clear-locks"
+ * request.
+ *
+ * The request is decoded, its dictionary unserialized (an empty request is
+ * rejected) and the volume name is required. The operation is then started
+ * via glusterd_op_begin_synctask().
+ *
+ * On any failure an error response is sent to the CLI, with
+ * "Operation failed" as the default text, and the result of sending that
+ * response is returned. Returns 0 when the operation was started.
+ */
+int
+__glusterd_handle_cli_clearlocks_volume(rpcsvc_request_t *req)
+{
+    int32_t ret = -1;
+    gf_cli_req cli_req = {{
+        0,
+    }};
+    glusterd_op_t cli_op = GD_OP_CLEARLOCKS_VOLUME;
+    char *volname = NULL;
+    dict_t *dict = NULL;
+    char err_str[64] = {
+        0,
+    };
+    xlator_t *this = NULL;
+
+    GF_ASSERT(req);
+    this = THIS;
+    GF_ASSERT(this);
+
+    ret = xdr_to_generic(req->msg[0], &cli_req, (xdrproc_t)xdr_gf_cli_req);
+    if (ret < 0) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_REQ_DECODE_FAIL,
+               "Failed to decode "
+               "request received from cli");
+        req->rpc_err = GARBAGE_ARGS;
+        goto out;
+    }
+
+    if (cli_req.dict.dict_len) {
+        dict = dict_new();
+        if (!dict) {
+            /* Same handling as the status-volume handler; ret would
+             * otherwise still hold the non-negative decode result. */
+            gf_smsg(this->name, GF_LOG_ERROR, errno, GD_MSG_DICT_CREATE_FAIL,
+                    NULL);
+            ret = -1;
+            goto out;
+        }
+
+        ret = dict_unserialize(cli_req.dict.dict_val, cli_req.dict.dict_len,
+                               &dict);
+        if (ret < 0) {
+            gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_DICT_UNSERIALIZE_FAIL,
+                   "failed to unserialize req-buffer to"
+                   " dictionary");
+            snprintf(err_str, sizeof(err_str),
+                     "unable to decode "
+                     "the command");
+            goto out;
+        }
+
+    } else {
+        ret = -1;
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_CLI_REQ_EMPTY,
+               "Empty cli request.");
+        goto out;
+    }
+
+    ret = dict_get_strn(dict, "volname", SLEN("volname"), &volname);
+    if (ret) {
+        snprintf(err_str, sizeof(err_str),
+                 "Unable to get volume "
+                 "name");
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_VOLNAME_NOTFOUND_IN_DICT,
+               "%s", err_str);
+        goto out;
+    }
+
+    gf_msg(this->name, GF_LOG_INFO, 0, GD_MSG_CLRCLK_VOL_REQ_RCVD,
+           "Received clear-locks volume req "
+           "for volume %s",
+           volname);
+
+    ret = glusterd_op_begin_synctask(req, GD_OP_CLEARLOCKS_VOLUME, dict);
+
+out:
+    if (ret) {
+        if (err_str[0] == '\0')
+            snprintf(err_str, sizeof(err_str), "Operation failed");
+        ret = glusterd_op_send_cli_response(cli_op, ret, 0, req, dict, err_str);
+    }
+    /* Released with plain free(), exactly as the original code does. */
+    free(cli_req.dict.dict_val);
+
+    return ret;
+}
+
+/* Runs __glusterd_handle_cli_clearlocks_volume for req through
+ * glusterd_big_locked_handler() and returns that call's result. */
+int
+glusterd_handle_cli_clearlocks_volume(rpcsvc_request_t *req)
+{
+    return glusterd_big_locked_handler(req,
+                                       __glusterd_handle_cli_clearlocks_volume);
+}
+
+/*
+ * Resolve the volinfo a brick id string refers to.
+ *
+ * brickid must contain a ':'; the text before the first ':' is parsed as
+ * the volume UUID. The UUID is looked up first with
+ * glusterd_volinfo_find_by_volume_id() and, if that fails, with
+ * glusterd_snap_volinfo_find_by_volume_id().
+ *
+ * Returns 0 with *volinfo set by the successful lookup, -1 if the copy of
+ * brickid cannot be allocated or it has no ':', otherwise the failing
+ * snapshot lookup's return value.
+ */
+static int
+get_volinfo_from_brickid(char *brickid, glusterd_volinfo_t **volinfo)
+{
+    xlator_t *this = THIS;
+    char *id_copy = NULL;
+    char *sep = NULL;
+    uuid_t volid = {0};
+    int ret = -1;
+
+    GF_ASSERT(this);
+    GF_ASSERT(brickid);
+
+    /* Work on a private copy because the separator is overwritten below. */
+    id_copy = gf_strdup(brickid);
+    if (id_copy == NULL)
+        goto out;
+
+    sep = strchr(id_copy, ':');
+    if (sep == NULL) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_BRICK_NOT_FOUND,
+               "Invalid brickid");
+        goto out;
+    }
+
+    /* Terminate the string at the separator so only the volume id remains. */
+    *sep = '\0';
+    gf_uuid_parse(id_copy, volid);
+
+    ret = glusterd_volinfo_find_by_volume_id(volid, volinfo);
+    if (ret != 0) {
+        /* Check if it is a snapshot volume */
+        ret = glusterd_snap_volinfo_find_by_volume_id(volid, volinfo);
+    }
+    if (ret != 0) {
+        gf_msg(this->name, GF_LOG_WARNING, 0, GD_MSG_VOLINFO_GET_FAIL,
+               "Failed to find volinfo");
+        goto out;
+    }
+
+    ret = 0;
+out:
+    GF_FREE(id_copy);
+    return ret;
+}
+
+/*
+ * Worker for the CLI barrier request.
+ *
+ * Decodes the XDR payload in req->msg[0], unserializes the request
+ * dictionary, checks that it carries "volname" and starts a GD_OP_BARRIER
+ * synctask.
+ *
+ * Returns 0 when the synctask was started. On any failure a CLI response
+ * with the text "Operation failed" is sent and the result of that send is
+ * returned.
+ */
+static int
+__glusterd_handle_barrier(rpcsvc_request_t *req)
+{
+    xlator_t *this = NULL;
+    dict_t *dict = NULL;
+    char *volname = NULL;
+    gf_cli_req cli_req = {{0}};
+    int ret = -1;
+
+    GF_ASSERT(req);
+    this = THIS;
+    GF_ASSERT(this);
+
+    ret = xdr_to_generic(req->msg[0], &cli_req, (xdrproc_t)xdr_gf_cli_req);
+    if (ret < 0) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_REQ_DECODE_FAIL,
+               "Failed to decode request received from cli");
+        req->rpc_err = GARBAGE_ARGS;
+        goto out;
+    }
+
+    /* Nothing to act on when the request carries no dictionary. */
+    if (cli_req.dict.dict_len == 0) {
+        ret = -1;
+        goto out;
+    }
+
+    dict = dict_new();
+    if (dict == NULL) {
+        gf_smsg(this->name, GF_LOG_ERROR, errno, GD_MSG_DICT_CREATE_FAIL, NULL);
+        ret = -1;
+        goto out;
+    }
+
+    ret = dict_unserialize(cli_req.dict.dict_val, cli_req.dict.dict_len, &dict);
+    if (ret < 0) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_DICT_UNSERIALIZE_FAIL,
+               "Failed to unserialize request dictionary.");
+        goto out;
+    }
+
+    ret = dict_get_strn(dict, "volname", SLEN("volname"), &volname);
+    if (ret != 0) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_VOLNAME_NOTFOUND_IN_DICT,
+               "Volname not present in dict");
+        goto out;
+    }
+
+    gf_msg(this->name, GF_LOG_INFO, 0, GD_MSG_BARRIER_VOL_REQ_RCVD,
+           "Received barrier volume request for volume %s", volname);
+
+    ret = glusterd_op_begin_synctask(req, GD_OP_BARRIER, dict);
+
+out:
+    if (ret != 0) {
+        ret = glusterd_op_send_cli_response(GD_OP_BARRIER, ret, 0, req, dict,
+                                            "Operation failed");
+    }
+    free(cli_req.dict.dict_val);
+    return ret;
+}
+
+/* Runs __glusterd_handle_barrier for req through
+ * glusterd_big_locked_handler() and returns that call's result. */
+int
+glusterd_handle_barrier(rpcsvc_request_t *req)
+{
+    return glusterd_big_locked_handler(req, __glusterd_handle_barrier);
+}
+
+/*
+ * Tell whether opt_key names one of the options this function treats as
+ * global (the keys listed in global_keys below).
+ *
+ * Returns _gf_true on an exact match, _gf_false otherwise or when opt_key
+ * is NULL.
+ */
+static gf_boolean_t
+gd_is_global_option(char *opt_key)
+{
+    const char *const global_keys[] = {
+        GLUSTERD_SHARED_STORAGE_KEY,    GLUSTERD_QUORUM_RATIO_KEY,
+        GLUSTERD_GLOBAL_OP_VERSION_KEY, GLUSTERD_BRICK_MULTIPLEX_KEY,
+        GLUSTERD_LOCALTIME_LOGGING_KEY, GLUSTERD_DAEMON_LOG_LEVEL_KEY,
+        GLUSTERD_MAX_OP_VERSION_KEY,
+    };
+    size_t idx = 0;
+
+    GF_VALIDATE_OR_GOTO(THIS->name, opt_key, out);
+
+    for (idx = 0; idx < sizeof(global_keys) / sizeof(global_keys[0]); idx++) {
+        if (strcmp(opt_key, global_keys[idx]) == 0)
+            return _gf_true;
+    }
+
+out:
+    return _gf_false;
+}
+
+/*
+ * Serve a "volume get" request and submit the reply on req.
+ *
+ * Reads "volname" and "key" from dict. For volname "all" the work is
+ * delegated to glusterd_get_global_options_for_all_vols(). For key "all"
+ * every option of the volume is collected through
+ * glusterd_get_default_val_for_volopt(). Otherwise the single requested
+ * option is stored in dict as "key1"/"value1" and "count" is set to 1.
+ *
+ * The outcome of the lookup is reported to the CLI in rsp.op_ret and
+ * rsp.op_errstr. The function's own return value is the result of
+ * serializing dict for the reply.
+ */
+int32_t
+glusterd_get_volume_opts(rpcsvc_request_t *req, dict_t *dict)
+{
+    int32_t ret = -1;
+    int32_t count = 1;
+    int exists = 0;
+    int msg_len = 0;
+    char *key = NULL;
+    char *orig_key = NULL;
+    char *key_fixed = NULL;
+    char *volname = NULL;
+    char *value = NULL;
+    char err_str[2048] = {
+        0,
+    };
+    char dict_key[50] = {
+        0,
+    };
+    int keylen;
+    xlator_t *this = NULL;
+    glusterd_conf_t *priv = NULL;
+    glusterd_volinfo_t *volinfo = NULL;
+    gf_cli_rsp rsp = {
+        0,
+    };
+    char op_version_buff[10] = {
+        0,
+    };
+
+    this = THIS;
+    GF_ASSERT(this);
+
+    priv = this->private;
+    GF_ASSERT(priv);
+
+    GF_ASSERT(req);
+    GF_ASSERT(dict);
+
+    ret = dict_get_strn(dict, "volname", SLEN("volname"), &volname);
+    if (ret) {
+        snprintf(err_str, sizeof(err_str),
+                 "Failed to get volume "
+                 "name while handling get volume option command");
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_VOLNAME_NOTFOUND_IN_DICT,
+               "%s", err_str);
+        goto out;
+    }
+
+    if (strcasecmp(volname, "all") == 0) {
+        ret = glusterd_get_global_options_for_all_vols(req, dict,
+                                                       &rsp.op_errstr);
+        goto out;
+    }
+
+    ret = dict_get_strn(dict, "key", SLEN("key"), &key);
+    if (ret) {
+        snprintf(err_str, sizeof(err_str),
+                 "Failed to get key "
+                 "while handling get volume option for %s",
+                 volname);
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_DICT_GET_FAILED, "%s",
+               err_str);
+        goto out;
+    }
+    gf_msg_debug(this->name, 0,
+                 "Received get volume opt request for "
+                 "volume %s",
+                 volname);
+
+    ret = glusterd_volinfo_find(volname, &volinfo);
+    if (ret) {
+        snprintf(err_str, sizeof(err_str), FMTSTR_CHECK_VOL_EXISTS, volname);
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_VOL_NOT_FOUND,
+               FMTSTR_CHECK_VOL_EXISTS, volname);
+        goto out;
+    }
+    if (strcmp(key, "all")) {
+        if (fnmatch(GD_HOOKS_SPECIFIC_KEY, key, FNM_NOESCAPE) == 0) {
+            /* Keys matching GD_HOOKS_SPECIFIC_KEY are read straight from
+             * the volume's own dictionary. */
+            keylen = sprintf(dict_key, "key%d", count);
+            ret = dict_set_strn(dict, dict_key, keylen, key);
+            if (ret) {
+                gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_DICT_SET_FAILED,
+                       "Failed to "
+                       "set %s in dictionary",
+                       key);
+                goto out;
+            }
+            ret = dict_get_str(volinfo->dict, key, &value);
+            if (ret) {
+                gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_DICT_GET_FAILED,
+                       "Failed to "
+                       "get %s in dictionary",
+                       key);
+                goto out;
+            }
+            keylen = sprintf(dict_key, "value%d", count);
+            ret = dict_set_strn(dict, dict_key, keylen, value);
+            if (ret) {
+                gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_DICT_SET_FAILED,
+                       "Failed to "
+                       "set %s in dictionary",
+                       key);
+                goto out;
+            }
+        } else {
+            exists = glusterd_check_option_exists(key, &key_fixed);
+            if (!exists) {
+                /* Keep the length of the base message so the suggestion
+                 * below is appended to it instead of overwriting it. */
+                msg_len = snprintf(err_str, sizeof(err_str),
+                                   "Option "
+                                   "with name: %s does not exist",
+                                   key);
+                gf_msg(this->name, GF_LOG_ERROR, EINVAL, GD_MSG_UNKNOWN_KEY,
+                       "%s", err_str);
+                /* snprintf reports the untruncated length, so append only
+                 * when the base message fit and room is left in err_str. */
+                if (key_fixed && msg_len > 0 &&
+                    (size_t)msg_len < sizeof(err_str))
+                    snprintf(err_str + msg_len, sizeof(err_str) - msg_len,
+                             "\nDid you mean %s?", key_fixed);
+                ret = -1;
+                goto out;
+            }
+            if (key_fixed) {
+                orig_key = key;
+                key = key_fixed;
+            }
+
+            if (gd_is_global_option(key)) {
+                char warn_str[] =
+                    "Warning: support to get \
+                                        global option value using volume get \
+                                        <volname>` will be deprecated from \
+                                        next release. Consider using `volume \
+                                        get all` instead for global options";
+
+                ret = dict_set_strn(dict, "warning", SLEN("warning"), warn_str);
+                if (ret) {
+                    gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_DICT_SET_FAILED,
+                           "Failed to set warning "
+                           "message in dictionary");
+                    goto out;
+                }
+            }
+
+            if (strcmp(key, GLUSTERD_MAX_OP_VERSION_KEY) == 0) {
+                ret = glusterd_get_global_max_op_version(req, dict, 1);
+                if (ret)
+                    goto out;
+            } else if (strcmp(key, GLUSTERD_GLOBAL_OP_VERSION_KEY) == 0) {
+                keylen = sprintf(dict_key, "key%d", count);
+                ret = dict_set_strn(dict, dict_key, keylen, key);
+                if (ret) {
+                    gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_DICT_SET_FAILED,
+                           "Failed"
+                           "to set %s in dictionary",
+                           key);
+                    goto out;
+                }
+                keylen = sprintf(dict_key, "value%d", count);
+                sprintf(op_version_buff, "%d", priv->op_version);
+                ret = dict_set_strn(dict, dict_key, keylen, op_version_buff);
+                if (ret) {
+                    gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_DICT_SET_FAILED,
+                           "Failed"
+                           " to set value for key %s in "
+                           "dictionary",
+                           key);
+                    goto out;
+                }
+            } else if (strcmp(key, "config.memory-accounting") == 0) {
+                keylen = sprintf(dict_key, "key%d", count);
+                ret = dict_set_strn(dict, dict_key, keylen, key);
+                if (ret) {
+                    gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_DICT_SET_FAILED,
+                           "Failed"
+                           " to set %s in dictionary",
+                           key);
+                    goto out;
+                }
+                keylen = sprintf(dict_key, "value%d", count);
+
+                if (volinfo->memory_accounting)
+                    ret = dict_set_nstrn(dict, dict_key, keylen, "Enabled",
+                                         SLEN("Enabled"));
+                else
+                    ret = dict_set_nstrn(dict, dict_key, keylen, "Disabled",
+                                         SLEN("Disabled"));
+                if (ret) {
+                    gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_DICT_SET_FAILED,
+                           "Failed"
+                           " to set value for key %s in "
+                           "dictionary",
+                           key);
+                    goto out;
+                }
+            } else if (strcmp(key, "config.transport") == 0) {
+                keylen = sprintf(dict_key, "key%d", count);
+                ret = dict_set_strn(dict, dict_key, keylen, key);
+                if (ret) {
+                    gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_DICT_SET_FAILED,
+                           "Failed to set %s in "
+                           "dictionary",
+                           key);
+                    goto out;
+                }
+                keylen = sprintf(dict_key, "value%d", count);
+
+                if (volinfo->transport_type == GF_TRANSPORT_RDMA)
+                    ret = dict_set_nstrn(dict, dict_key, keylen, "rdma",
+                                         SLEN("rdma"));
+                else if (volinfo->transport_type == GF_TRANSPORT_TCP)
+                    ret = dict_set_nstrn(dict, dict_key, keylen, "tcp",
+                                         SLEN("tcp"));
+                else if (volinfo->transport_type == GF_TRANSPORT_BOTH_TCP_RDMA)
+                    ret = dict_set_nstrn(dict, dict_key, keylen, "tcp,rdma",
+                                         SLEN("tcp,rdma"));
+                else
+                    ret = dict_set_nstrn(dict, dict_key, keylen, "none",
+                                         SLEN("none"));
+
+                if (ret) {
+                    gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_DICT_SET_FAILED,
+                           "Failed to set value for key "
+                           "%s in dictionary",
+                           key);
+                    goto out;
+                }
+            } else {
+                keylen = sprintf(dict_key, "key%d", count);
+                ret = dict_set_strn(dict, dict_key, keylen, key);
+                if (ret) {
+                    gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_DICT_SET_FAILED,
+                           "Failed to set %s in "
+                           "dictionary",
+                           key);
+                    goto out;
+                }
+                keylen = sprintf(dict_key, "value%d", count);
+                /* Prefer a value stored in priv->opts; fall back to the
+                 * default-value lookup when the key is not set there. */
+                ret = dict_get_str(priv->opts, key, &value);
+                if (!ret) {
+                    ret = dict_set_strn(dict, dict_key, keylen, value);
+                    if (ret) {
+                        gf_msg(this->name, GF_LOG_ERROR, 0,
+                               GD_MSG_DICT_SET_FAILED,
+                               "Failed to set %s in "
+                               " dictionary",
+                               key);
+                        goto out;
+                    }
+                } else {
+                    ret = glusterd_get_default_val_for_volopt(
+                        dict, _gf_false, key, orig_key, volinfo,
+                        &rsp.op_errstr);
+                    if (ret && !rsp.op_errstr) {
+                        snprintf(err_str, sizeof(err_str),
+                                 "Failed to fetch the "
+                                 "value of %s, check "
+                                 "log file for more"
+                                 " details",
+                                 key);
+                    }
+                    /* NOTE(review): ret is overwritten by the "count"
+                     * dict_set below, so a failure here is not reported
+                     * in rsp.op_ret - confirm this is intended. */
+                }
+            }
+        }
+        /* Request is for a single option, explicitly set count to 1
+         * in the dictionary.
+         */
+        ret = dict_set_int32n(dict, "count", SLEN("count"), 1);
+        if (ret) {
+            gf_msg(this->name, GF_LOG_ERROR, errno, GD_MSG_DICT_SET_FAILED,
+                   "Failed to set count "
+                   "value in the dictionary");
+            goto out;
+        }
+    } else {
+        /* Handle the "all" volume option request */
+        ret = glusterd_get_default_val_for_volopt(dict, _gf_true, NULL, NULL,
+                                                  volinfo, &rsp.op_errstr);
+        if (ret && !rsp.op_errstr) {
+            snprintf(err_str, sizeof(err_str),
+                     "Failed to fetch the value of all volume "
+                     "options, check log file for more details");
+        }
+    }
+
+out:
+    /* Report the lookup outcome to the CLI; an error string supplied by a
+     * helper takes precedence over the local err_str. */
+    if (ret) {
+        if (!rsp.op_errstr)
+            rsp.op_errstr = err_str;
+        rsp.op_ret = ret;
+    } else {
+        rsp.op_errstr = "";
+        rsp.op_ret = 0;
+    }
+
+    ret = dict_allocate_and_serialize(dict, &rsp.dict.dict_val,
+                                      &rsp.dict.dict_len);
+
+    glusterd_submit_reply(req, &rsp, NULL, 0, NULL, (xdrproc_t)xdr_gf_cli_rsp);
+    GF_FREE(rsp.dict.dict_val);
+    GF_FREE(key_fixed);
+    return ret;
+}
+
+/*
+ * Worker for the CLI "volume get" request.
+ *
+ * Decodes the XDR payload in req->msg[0], unserializes the request
+ * dictionary (when one is present) and passes it to
+ * glusterd_get_volume_opts(), which builds and submits the reply.
+ *
+ * Returns the decode/unserialize error when the request cannot be read,
+ * otherwise the return value of glusterd_get_volume_opts().
+ */
+int
+__glusterd_handle_get_vol_opt(rpcsvc_request_t *req)
+{
+    int32_t ret = -1;
+    gf_cli_req cli_req = {{
+        0,
+    }};
+    dict_t *dict = NULL;
+    char err_str[64] = {
+        0,
+    };
+    xlator_t *this = NULL;
+
+    this = THIS;
+    GF_ASSERT(this);
+
+    GF_ASSERT(req);
+
+    ret = xdr_to_generic(req->msg[0], &cli_req, (xdrproc_t)xdr_gf_cli_req);
+    if (ret < 0) {
+        snprintf(err_str, sizeof(err_str),
+                 "Failed to decode "
+                 "request received from cli");
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_REQ_DECODE_FAIL, "%s",
+               err_str);
+        req->rpc_err = GARBAGE_ARGS;
+        goto out;
+    }
+
+    if (cli_req.dict.dict_len) {
+        /* Unserialize the dictionary */
+        dict = dict_new();
+
+        ret = dict_unserialize(cli_req.dict.dict_val, cli_req.dict.dict_len,
+                               &dict);
+        if (ret < 0) {
+            gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_DICT_UNSERIALIZE_FAIL,
+                   "failed to "
+                   "unserialize req-buffer to dictionary");
+            snprintf(err_str, sizeof(err_str),
+                     "Unable to decode "
+                     "the command");
+            goto out;
+        } else {
+            /* The buffer is handed over to the dictionary via
+             * extra_stdfree; clear the local pointer so the free() at
+             * "out" does not release it a second time. */
+            dict->extra_stdfree = cli_req.dict.dict_val;
+            cli_req.dict.dict_val = NULL;
+        }
+    }
+    ret = glusterd_get_volume_opts(req, dict);
+
+out:
+    if (dict)
+        dict_unref(dict);
+
+    /* Release the decoded buffer on every path where it was not handed to
+     * the dictionary; free(NULL) is a no-op after a hand-over. */
+    free(cli_req.dict.dict_val);
+
+    return ret;
+}
+
+/* Runs __glusterd_handle_get_vol_opt for req through
+ * glusterd_big_locked_handler() and returns that call's result. */
+int
+glusterd_handle_get_vol_opt(rpcsvc_request_t *req)
+{
+    return glusterd_big_locked_handler(req, __glusterd_handle_get_vol_opt);
+}
+
+extern struct rpc_clnt_program gd_brick_prog;
+
+/*
+ * Per-entry callback that writes one "key: value" line to the FILE passed
+ * in data. The entry whose key equals GLUSTERD_GLOBAL_OPT_VERSION is
+ * skipped, as is any entry with a NULL key, val or data.
+ *
+ * Always returns 0.
+ */
+static int
+glusterd_print_global_options(dict_t *opts, char *key, data_t *val, void *data)
+{
+    GF_VALIDATE_OR_GOTO(THIS->name, key, out);
+    GF_VALIDATE_OR_GOTO(THIS->name, val, out);
+    GF_VALIDATE_OR_GOTO(THIS->name, data, out);
+
+    if (strcmp(key, GLUSTERD_GLOBAL_OPT_VERSION) != 0)
+        fprintf((FILE *)data, "%s: %s\n", key, val->data);
+
+out:
+    return 0;
+}
+
+/*
+ * Per-entry callback that writes one
+ * "Volume<volcount>.options.<key>: <value>" line to the FILE passed in
+ * data. Entries with a NULL key, val or data are skipped.
+ *
+ * Always returns 0.
+ */
+static int
+glusterd_print_volume_options(dict_t *opts, char *key, data_t *val, void *data)
+{
+    GF_VALIDATE_OR_GOTO(THIS->name, key, out);
+    GF_VALIDATE_OR_GOTO(THIS->name, val, out);
+    GF_VALIDATE_OR_GOTO(THIS->name, data, out);
+
+    fprintf((FILE *)data, "Volume%d.options.%s: %s\n", volcount, key,
+            val->data);
+
+out:
+    return 0;
+}
+
+/*
+ * Write the geo-replication session details held in gsync_dict to fp.
+ *
+ * Prints "gsync-count" and then, for every "status_value<i>" entry, one
+ * line per field obtained through get_struct_variable(). Pairs are
+ * numbered from 1 in the output.
+ *
+ * Returns 0 when the count is zero or every entry was printed, -1 on a
+ * NULL fp/gsync_dict, otherwise the failing dict_get_bin() result.
+ */
+static int
+glusterd_print_gsync_status(FILE *fp, dict_t *gsync_dict)
+{
+    gf_gsync_status_t *sts = NULL;
+    char entry_key[PATH_MAX] = {0};
+    int gsync_count = 0;
+    int pair_no = 0;
+    int idx = 0;
+    int ret = -1;
+
+    GF_VALIDATE_OR_GOTO(THIS->name, fp, out);
+    GF_VALIDATE_OR_GOTO(THIS->name, gsync_dict, out);
+
+    /* A failed lookup leaves gsync_count at 0, which is then printed. */
+    ret = dict_get_int32n(gsync_dict, "gsync-count", SLEN("gsync-count"),
+                          &gsync_count);
+
+    fprintf(fp, "Volume%d.gsync_count: %d\n", volcount, gsync_count);
+
+    if (gsync_count == 0) {
+        ret = 0;
+        goto out;
+    }
+
+    for (idx = 0; idx < gsync_count; idx++) {
+        pair_no = idx + 1;
+        snprintf(entry_key, sizeof(entry_key), "status_value%d", idx);
+
+        ret = dict_get_bin(gsync_dict, entry_key, (void **)&sts);
+        if (ret != 0)
+            goto out;
+
+        fprintf(fp, "Volume%d.pair%d.session_slave: %s\n", volcount, pair_no,
+                get_struct_variable(21, sts));
+        fprintf(fp, "Volume%d.pair%d.master_node: %s\n", volcount, pair_no,
+                get_struct_variable(0, sts));
+        fprintf(fp, "Volume%d.pair%d.master_volume: %s\n", volcount, pair_no,
+                get_struct_variable(1, sts));
+        fprintf(fp, "Volume%d.pair%d.master_brick: %s\n", volcount, pair_no,
+                get_struct_variable(2, sts));
+        fprintf(fp, "Volume%d.pair%d.slave_user: %s\n", volcount, pair_no,
+                get_struct_variable(3, sts));
+        fprintf(fp, "Volume%d.pair%d.slave: %s\n", volcount, pair_no,
+                get_struct_variable(4, sts));
+        fprintf(fp, "Volume%d.pair%d.slave_node: %s\n", volcount, pair_no,
+                get_struct_variable(5, sts));
+        fprintf(fp, "Volume%d.pair%d.status: %s\n", volcount, pair_no,
+                get_struct_variable(6, sts));
+        fprintf(fp, "Volume%d.pair%d.crawl_status: %s\n", volcount, pair_no,
+                get_struct_variable(7, sts));
+        fprintf(fp, "Volume%d.pair%d.last_synced: %s\n", volcount, pair_no,
+                get_struct_variable(8, sts));
+        fprintf(fp, "Volume%d.pair%d.entry: %s\n", volcount, pair_no,
+                get_struct_variable(9, sts));
+        fprintf(fp, "Volume%d.pair%d.data: %s\n", volcount, pair_no,
+                get_struct_variable(10, sts));
+        fprintf(fp, "Volume%d.pair%d.meta: %s\n", volcount, pair_no,
+                get_struct_variable(11, sts));
+        fprintf(fp, "Volume%d.pair%d.failures: %s\n", volcount, pair_no,
+                get_struct_variable(12, sts));
+        fprintf(fp, "Volume%d.pair%d.checkpoint_time: %s\n", volcount, pair_no,
+                get_struct_variable(13, sts));
+        fprintf(fp, "Volume%d.pair%d.checkpoint_completed: %s\n", volcount,
+                pair_no, get_struct_variable(14, sts));
+        fprintf(fp, "Volume%d.pair%d.checkpoint_completion_time: %s\n",
+                volcount, pair_no, get_struct_variable(15, sts));
+    }
+
+out:
+    return ret;
+}
+
+/*
+ * Collect the geo-replication status of volinfo into a temporary
+ * dictionary via glusterd_get_gsync_status_mst() and print it to fp with
+ * glusterd_print_gsync_status().
+ *
+ * Returns -1 on a NULL argument or when the dictionary cannot be created,
+ * otherwise the result of glusterd_print_gsync_status().
+ */
+static int
+glusterd_print_gsync_status_by_vol(FILE *fp, glusterd_volinfo_t *volinfo)
+{
+    xlator_t *this = THIS;
+    dict_t *status_dict = NULL;
+    char my_hostname[256] = {0};
+    int ret = -1;
+
+    GF_ASSERT(this);
+
+    GF_VALIDATE_OR_GOTO(THIS->name, volinfo, out);
+    GF_VALIDATE_OR_GOTO(THIS->name, fp, out);
+
+    status_dict = dict_new();
+    if (status_dict == NULL) {
+        gf_smsg(this->name, GF_LOG_ERROR, errno, GD_MSG_DICT_CREATE_FAIL, NULL);
+        goto out;
+    }
+
+    if (gethostname(my_hostname, sizeof(my_hostname)) != 0) {
+        /* stick to N/A */
+        (void)strcpy(my_hostname, "N/A");
+    }
+
+    /* Ignoring the result as this function always returns 0 */
+    (void)glusterd_get_gsync_status_mst(volinfo, status_dict, my_hostname);
+
+    ret = glusterd_print_gsync_status(fp, status_dict);
+
+out:
+    if (status_dict != NULL)
+        dict_unref(status_dict);
+    return ret;
+}
+
+/*
+ * Print name, id, time, optional description and status of every entry in
+ * volinfo->snap_volumes to fp. Snapshots are numbered from 1 in the
+ * output and volcount is used as the volume index in each line.
+ *
+ * Returns 0 when all snapshots were printed, -1 on a NULL argument,
+ * otherwise the failing glusterd_get_snap_status_str() result.
+ */
+static int
+glusterd_print_snapinfo_by_vol(FILE *fp, glusterd_volinfo_t *volinfo,
+                               int volcount)
+{
+    glusterd_volinfo_t *cur_vol = NULL;
+    glusterd_volinfo_t *next_vol = NULL;
+    glusterd_snap_t *snap = NULL;
+    char snap_status_str[STATUS_STRLEN] = {0};
+    char timestr[GF_TIMESTR_SIZE] = {0};
+    int snap_no = 0;
+    int ret = -1;
+
+    GF_VALIDATE_OR_GOTO(THIS->name, volinfo, out);
+    GF_VALIDATE_OR_GOTO(THIS->name, fp, out);
+
+    cds_list_for_each_entry_safe(cur_vol, next_vol, &volinfo->snap_volumes,
+                                 snapvol_list)
+    {
+        snap = cur_vol->snapshot;
+        snap_no++;
+
+        ret = glusterd_get_snap_status_str(snap, snap_status_str);
+        if (ret != 0) {
+            gf_msg(THIS->name, GF_LOG_ERROR, 0, GD_MSG_STATE_STR_GET_FAILED,
+                   "Failed to get status for snapshot: %s", snap->snapname);
+
+            goto out;
+        }
+        gf_time_fmt(timestr, sizeof(timestr), snap->time_stamp, gf_timefmt_FT);
+
+        fprintf(fp, "Volume%d.snapshot%d.name: %s\n", volcount, snap_no,
+                snap->snapname);
+        fprintf(fp, "Volume%d.snapshot%d.id: %s\n", volcount, snap_no,
+                uuid_utoa(snap->snap_id));
+        fprintf(fp, "Volume%d.snapshot%d.time: %s\n", volcount, snap_no,
+                timestr);
+
+        /* The description line is emitted only when one is set. */
+        if (snap->description)
+            fprintf(fp, "Volume%d.snapshot%d.description: %s\n", volcount,
+                    snap_no, snap->description);
+        fprintf(fp, "Volume%d.snapshot%d.status: %s\n", volcount, snap_no,
+                snap_status_str);
+    }
+
+    ret = 0;
+out:
+    return ret;
+}
+
+/*
+ * Query a brick for its connected clients and print them to fp.
+ *
+ * Sends a GLUSTERD_BRICK_STATUS brick op (with "cmd" set to
+ * GF_CLI_STATUS_CLIENTS) for brickinfo and prints the client count plus,
+ * per client, hostname, bytesread, byteswrite and opversion. volcount and
+ * brickcount are used only as indices in the printed keys. Clients are
+ * read from the reply with 0-based keys and printed with 1-based keys.
+ *
+ * dict is modified: "brick-name", "cmd" and "volname" are set in it before
+ * it is serialized as the request input.
+ *
+ * Returns 0 on success and also when the brick's uuid differs from MY_UUID
+ * or the brick is not started (nothing is printed in that case); non-zero
+ * on allocation, dictionary or rpc-lookup failures.
+ */
+static int
+glusterd_print_client_details(FILE *fp, dict_t *dict,
+                              glusterd_volinfo_t *volinfo, int volcount,
+                              glusterd_brickinfo_t *brickinfo, int brickcount)
+{
+    int ret = -1;
+    xlator_t *this = NULL;
+    int brick_index = -1;
+    int client_count = 0;
+    char key[64] = {
+        0,
+    };
+    int keylen;
+    char *clientname = NULL;
+    uint64_t bytesread = 0;
+    uint64_t byteswrite = 0;
+    uint32_t opversion = 0;
+
+    glusterd_pending_node_t *pending_node = NULL;
+    rpc_clnt_t *rpc = NULL;
+    struct syncargs args = {
+        0,
+    };
+    gd1_mgmt_brick_op_req *brick_req = NULL;
+
+    this = THIS;
+    GF_VALIDATE_OR_GOTO("glusterd", this, out);
+
+    GF_VALIDATE_OR_GOTO(this->name, dict, out);
+
+    /* Skip bricks whose uuid is not MY_UUID and bricks that are not
+     * started; this is reported as success. */
+    if (gf_uuid_compare(brickinfo->uuid, MY_UUID) ||
+        !glusterd_is_brick_started(brickinfo)) {
+        ret = 0;
+        goto out;
+    }
+
+    /* brick_index starts at -1, so the pending node always gets index 0. */
+    brick_index++;
+    pending_node = GF_CALLOC(1, sizeof(*pending_node),
+                             gf_gld_mt_pending_node_t);
+    if (!pending_node) {
+        ret = -1;
+        gf_msg(this->name, GF_LOG_ERROR, ENOMEM, GD_MSG_NO_MEMORY,
+               "Unable to allocate memory");
+        goto out;
+    }
+
+    pending_node->node = brickinfo;
+    pending_node->type = GD_NODE_BRICK;
+    pending_node->index = brick_index;
+
+    rpc = glusterd_pending_node_get_rpc(pending_node);
+    if (!rpc) {
+        ret = -1;
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_RPC_FAILURE,
+               "Failed to retrieve rpc object");
+        goto out;
+    }
+
+    brick_req = GF_CALLOC(1, sizeof(*brick_req), gf_gld_mt_mop_brick_req_t);
+    if (!brick_req) {
+        ret = -1;
+        gf_msg(this->name, GF_LOG_ERROR, ENOMEM, GD_MSG_NO_MEMORY,
+               "Unable to allocate memory");
+        goto out;
+    }
+
+    brick_req->op = GLUSTERD_BRICK_STATUS;
+    brick_req->name = "";
+    brick_req->dict.dict_val = NULL;
+    brick_req->dict.dict_len = 0;
+
+    /* Fill the request parameters into dict, then serialize it as the
+     * brick op's input. */
+    ret = dict_set_strn(dict, "brick-name", SLEN("brick-name"),
+                        brickinfo->path);
+    if (ret) {
+        gf_smsg(this->name, GF_LOG_ERROR, errno, GD_MSG_DICT_SET_FAILED,
+                "Key=brick-name", NULL);
+        goto out;
+    }
+
+    ret = dict_set_int32n(dict, "cmd", SLEN("cmd"), GF_CLI_STATUS_CLIENTS);
+    if (ret) {
+        gf_smsg(this->name, GF_LOG_ERROR, errno, GD_MSG_DICT_SET_FAILED,
+                "Key=cmd", NULL);
+        goto out;
+    }
+
+    ret = dict_set_strn(dict, "volname", SLEN("volname"), volinfo->volname);
+    if (ret) {
+        gf_smsg(this->name, GF_LOG_ERROR, errno, GD_MSG_DICT_SET_FAILED,
+                "Key=volname", NULL);
+        goto out;
+    }
+
+    ret = dict_allocate_and_serialize(dict, &brick_req->input.input_val,
+                                      &brick_req->input.input_len);
+    if (ret)
+        goto out;
+
+    GD_SYNCOP(rpc, (&args), NULL, gd_syncop_brick_op_cbk, brick_req,
+              &gd_brick_prog, brick_req->op, xdr_gd1_mgmt_brick_op_req);
+
+    /* NOTE(review): ret is 0 here from the serialize call above (unless
+     * GD_SYNCOP modifies it), so a failed brick op appears to be returned
+     * as success - confirm this is intended. */
+    if (args.op_ret)
+        goto out;
+
+    ret = dict_get_int32n(args.dict, "clientcount", SLEN("clientcount"),
+                          &client_count);
+    if (ret) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_DICT_GET_FAILED,
+               "Couldn't get client count");
+        goto out;
+    }
+
+    fprintf(fp, "Volume%d.Brick%d.client_count: %d\n", volcount, brickcount,
+            client_count);
+
+    if (client_count == 0) {
+        ret = 0;
+        goto out;
+    }
+
+    /* Reply keys are "client<i-1>.*" (0-based); printed keys are
+     * "Client<i>.*" (1-based). */
+    int i;
+    for (i = 1; i <= client_count; i++) {
+        keylen = snprintf(key, sizeof(key), "client%d.hostname", i - 1);
+        ret = dict_get_strn(args.dict, key, keylen, &clientname);
+        if (ret) {
+            gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_DICT_GET_FAILED,
+                   "Failed to get client hostname");
+            goto out;
+        }
+
+        snprintf(key, sizeof(key), "Client%d.hostname", i);
+        fprintf(fp, "Volume%d.Brick%d.%s: %s\n", volcount, brickcount, key,
+                clientname);
+
+        snprintf(key, sizeof(key), "client%d.bytesread", i - 1);
+        ret = dict_get_uint64(args.dict, key, &bytesread);
+        if (ret) {
+            gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_DICT_GET_FAILED,
+                   "Failed to get bytesread from client");
+            goto out;
+        }
+
+        snprintf(key, sizeof(key), "Client%d.bytesread", i);
+        fprintf(fp, "Volume%d.Brick%d.%s: %" PRIu64 "\n", volcount, brickcount,
+                key, bytesread);
+
+        snprintf(key, sizeof(key), "client%d.byteswrite", i - 1);
+        ret = dict_get_uint64(args.dict, key, &byteswrite);
+        if (ret) {
+            gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_DICT_GET_FAILED,
+                   "Failed to get byteswrite from client");
+            goto out;
+        }
+
+        snprintf(key, sizeof(key), "Client%d.byteswrite", i);
+        fprintf(fp, "Volume%d.Brick%d.%s: %" PRIu64 "\n", volcount, brickcount,
+                key, byteswrite);
+
+        snprintf(key, sizeof(key), "client%d.opversion", i - 1);
+        ret = dict_get_uint32(args.dict, key, &opversion);
+        if (ret) {
+            gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_DICT_GET_FAILED,
+                   "Failed to get client opversion");
+            goto out;
+        }
+
+        snprintf(key, sizeof(key), "Client%d.opversion", i);
+        fprintf(fp, "Volume%d.Brick%d.%s: %" PRIu32 "\n", volcount, brickcount,
+                key, opversion);
+    }
+
+out:
+    /* Release everything allocated above, including the reply dictionary
+     * and error string held in args. */
+    if (pending_node)
+        GF_FREE(pending_node);
+
+    if (brick_req) {
+        if (brick_req->input.input_val)
+            GF_FREE(brick_req->input.input_val);
+        GF_FREE(brick_req);
+    }
+    if (args.dict)
+        dict_unref(args.dict);
+    if (args.errstr)
+        GF_FREE(args.errstr);
+
+    return ret;
+}
+
+/*
+ * Dump glusterd's in-memory state to a file and reply to the CLI.
+ *
+ * The output location is taken from the "odir" and "filename" keys of @dict;
+ * when absent, "/var/run/gluster/" and "glusterd_state_<timestamp>" are used.
+ * The resulting path is stored back into @dict under "ofilepath".
+ *
+ * "getstate-cmd" in @dict selects the flavour of the dump:
+ *   - GF_CLI_GET_STATE_VOLOPTS: only the [Volume Options] section is written;
+ *   - GF_CLI_GET_STATE_DETAIL: the full dump plus per-brick client details;
+ *   - otherwise: the full dump.
+ *
+ * The status of the dump itself travels in rsp.op_ret / rsp.op_errstr; the
+ * value returned to the caller is the result of serializing @dict for the
+ * reply (see the "out" label).
+ */
+static int
+glusterd_get_state(rpcsvc_request_t *req, dict_t *dict)
+{
+    int32_t ret = -1;
+    gf_cli_rsp rsp = {
+        0,
+    };
+    FILE *fp = NULL;
+    DIR *dp = NULL;
+    char err_str[2048] = {
+        0,
+    };
+    glusterd_conf_t *priv = NULL;
+    glusterd_peerinfo_t *peerinfo = NULL;
+    glusterd_peer_hostname_t *peer_hostname_info = NULL;
+    glusterd_volinfo_t *volinfo = NULL;
+    glusterd_brickinfo_t *brickinfo = NULL;
+    xlator_t *this = NULL;
+    dict_t *vol_all_opts = NULL;
+    struct statvfs brickstat = {0};
+    char *odir = NULL;
+    char *filename = NULL;
+    char *ofilepath = NULL;
+    char *tmp_str = NULL;
+    int count = 0;
+    int count_bkp = 0;
+    int odirlen = 0;
+    time_t now = 0;
+    struct tm tm_now = {0};
+    char timestamp[16] = {
+        0,
+    };
+    uint32_t get_state_cmd = 0;
+    uint64_t memtotal = 0;
+    uint64_t memfree = 0;
+    char id_str[64] = {
+        0,
+    };
+
+    char *vol_type_str = NULL;
+
+    char transport_type_str[STATUS_STRLEN] = {
+        0,
+    };
+    char quorum_status_str[STATUS_STRLEN] = {
+        0,
+    };
+    char rebal_status_str[STATUS_STRLEN] = {
+        0,
+    };
+    char vol_status_str[STATUS_STRLEN] = {
+        0,
+    };
+    char brick_status_str[STATUS_STRLEN] = {
+        0,
+    };
+    this = THIS;
+    GF_VALIDATE_OR_GOTO(THIS->name, this, out);
+
+    priv = THIS->private;
+    GF_VALIDATE_OR_GOTO(this->name, priv, out);
+
+    GF_VALIDATE_OR_GOTO(this->name, dict, out);
+
+    /* Resolve the output directory, falling back to the default. */
+    ret = dict_get_strn(dict, "odir", SLEN("odir"), &tmp_str);
+    if (ret)
+        odirlen = gf_asprintf(&odir, "%s", "/var/run/gluster/");
+    else
+        odirlen = gf_asprintf(&odir, "%s", tmp_str);
+
+    /* Without this check a failed allocation would reach sys_opendir() and
+     * the odir[odirlen - 1] lookup below with a NULL pointer. */
+    if (odirlen < 0 || !odir) {
+        odir = NULL;
+        snprintf(err_str, sizeof(err_str),
+                 "Failed to allocate the output directory path");
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_DICT_SET_FAILED, "%s",
+               err_str);
+        ret = -1;
+        goto out;
+    }
+
+    if (ret)
+        gf_msg(this->name, GF_LOG_INFO, 0, GD_MSG_DICT_GET_FAILED,
+               "Default output directory: %s", odir);
+
+    /* Only probe that the directory can be opened; it is closed right away. */
+    dp = sys_opendir(odir);
+    if (dp) {
+        sys_closedir(dp);
+    } else {
+        if (errno == ENOENT) {
+            snprintf(err_str, sizeof(err_str),
+                     "Output directory %s does not exist.", odir);
+            gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_DICT_GET_FAILED, "%s",
+                   err_str);
+        } else if (errno == ENOTDIR) {
+            snprintf(err_str, sizeof(err_str),
+                     "Output directory "
+                     "does not exist. %s points to a file.",
+                     odir);
+            gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_DICT_GET_FAILED, "%s",
+                   err_str);
+        } else {
+            /* Any other failure (e.g. a permission problem) must not leave
+             * the CLI with an empty error string. */
+            snprintf(err_str, sizeof(err_str),
+                     "Unable to open output directory %s: %s", odir,
+                     strerror(errno));
+            gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_DICT_GET_FAILED, "%s",
+                   err_str);
+        }
+
+        GF_FREE(odir);
+        ret = -1;
+        goto out;
+    }
+
+    /* Resolve the output file name, falling back to a timestamped default. */
+    ret = dict_get_strn(dict, "filename", SLEN("filename"), &tmp_str);
+    if (ret) {
+        now = gf_time();
+        /* localtime_r() keeps the broken-down time in our own buffer instead
+         * of the shared static storage localtime() returns. */
+        localtime_r(&now, &tm_now);
+        strftime(timestamp, sizeof(timestamp), "%Y%m%d_%H%M%S", &tm_now);
+        ret = gf_asprintf(&filename, "%s_%s", "glusterd_state", timestamp);
+        if (ret >= 0)
+            gf_msg(this->name, GF_LOG_INFO, 0, GD_MSG_DICT_GET_FAILED,
+                   "Default filename: %s", filename);
+    } else {
+        ret = gf_asprintf(&filename, "%s", tmp_str);
+    }
+
+    if (ret < 0) {
+        GF_FREE(odir);
+        snprintf(err_str, sizeof(err_str),
+                 "Failed to allocate the output file name");
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_DICT_SET_FAILED, "%s",
+               err_str);
+        ret = -1;
+        goto out;
+    }
+
+    /* Join directory and file name, adding a '/' only when odir lacks one. */
+    ret = gf_asprintf(&ofilepath, "%s%s%s", odir,
+                      ((odir[odirlen - 1] != '/') ? "/" : ""), filename);
+
+    if (ret < 0) {
+        GF_FREE(odir);
+        GF_FREE(filename);
+        snprintf(err_str, sizeof(err_str), "Unable to get the output path");
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_DICT_SET_FAILED,
+               "Unable to get the output path");
+        ret = -1;
+        goto out;
+    }
+    GF_FREE(odir);
+    GF_FREE(filename);
+
+    ret = dict_set_dynstrn(dict, "ofilepath", SLEN("ofilepath"), ofilepath);
+    if (ret) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_DICT_SET_FAILED,
+               "Unable to set output path");
+        goto out;
+    }
+
+    fp = fopen(ofilepath, "w");
+    if (!fp) {
+        snprintf(err_str, sizeof(err_str), "Failed to open file at %s",
+                 ofilepath);
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_DICT_GET_FAILED, "%s",
+               err_str);
+        ret = -1;
+        goto out;
+    }
+
+    /* A missing command type is not an error: get_state_cmd stays 0. */
+    ret = dict_get_uint32(dict, "getstate-cmd", &get_state_cmd);
+    if (ret) {
+        gf_msg_debug(this->name, 0, "get-state command type not set");
+        ret = 0;
+    }
+
+    if (get_state_cmd == GF_CLI_GET_STATE_VOLOPTS) {
+        fprintf(fp, "[Volume Options]\n");
+        cds_list_for_each_entry(volinfo, &priv->volumes, vol_list)
+        {
+            fprintf(fp, "Volume%d.name: %s\n", ++count, volinfo->volname);
+
+            /* volcount is defined outside this function; it is set before
+             * glusterd_print_volume_options runs via dict_foreach. */
+            volcount = count;
+            vol_all_opts = dict_new();
+
+            ret = glusterd_get_default_val_for_volopt(
+                vol_all_opts, _gf_true, NULL, NULL, volinfo, &rsp.op_errstr);
+            if (ret) {
+                gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_VOL_OPTS_IMPORT_FAIL,
+                       "Failed to "
+                       "fetch the value of all volume options "
+                       "for volume %s",
+                       volinfo->volname);
+                if (vol_all_opts)
+                    dict_unref(vol_all_opts);
+                continue;
+            }
+
+            dict_foreach(vol_all_opts, glusterd_print_volume_options, fp);
+
+            if (vol_all_opts)
+                dict_unref(vol_all_opts);
+        }
+        ret = 0;
+        goto out;
+    }
+
+    fprintf(fp, "[Global]\n");
+
+    uuid_utoa_r(priv->uuid, id_str);
+    fprintf(fp, "MYUUID: %s\n", id_str);
+
+    fprintf(fp, "op-version: %d\n", priv->op_version);
+
+    fprintf(fp, "\n[Global options]\n");
+
+    if (priv->opts)
+        dict_foreach(priv->opts, glusterd_print_global_options, fp);
+
+    fprintf(fp, "\n[Peers]\n");
+    RCU_READ_LOCK;
+
+    cds_list_for_each_entry_rcu(peerinfo, &priv->peers, uuid_list)
+    {
+        fprintf(fp, "Peer%d.primary_hostname: %s\n", ++count,
+                peerinfo->hostname);
+        fprintf(fp, "Peer%d.uuid: %s\n", count, gd_peer_uuid_str(peerinfo));
+        fprintf(fp, "Peer%d.state: %s\n", count,
+                glusterd_friend_sm_state_name_get(peerinfo->state.state));
+        fprintf(fp, "Peer%d.connected: %s\n", count,
+                peerinfo->connected ? "Connected" : "Disconnected");
+
+        /* Comma-separated list of every hostname except the primary one;
+         * count_bkp counts the names already printed. */
+        fprintf(fp, "Peer%d.othernames: ", count);
+        count_bkp = 0;
+        cds_list_for_each_entry(peer_hostname_info, &peerinfo->hostnames,
+                                hostname_list)
+        {
+            if (strcmp(peerinfo->hostname, peer_hostname_info->hostname) == 0)
+                continue;
+
+            if (count_bkp > 0)
+                fprintf(fp, ",");
+
+            fprintf(fp, "%s", peer_hostname_info->hostname);
+            count_bkp++;
+        }
+        count_bkp = 0;
+        fprintf(fp, "\n");
+    }
+    RCU_READ_UNLOCK;
+
+    count = 0;
+    fprintf(fp, "\n[Volumes]\n");
+
+    cds_list_for_each_entry(volinfo, &priv->volumes, vol_list)
+    {
+        /* Collect every status string first so that a failure aborts before
+         * anything is written for this volume. */
+        ret = glusterd_volume_get_type_str(volinfo, &vol_type_str);
+        if (ret) {
+            gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_STATE_STR_GET_FAILED,
+                   "Failed to get type for volume: %s", volinfo->volname);
+            goto out;
+        }
+
+        ret = glusterd_volume_get_status_str(volinfo, vol_status_str);
+        if (ret) {
+            gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_STATE_STR_GET_FAILED,
+                   "Failed to get status for volume: %s", volinfo->volname);
+            goto out;
+        }
+
+        ret = glusterd_volume_get_transport_type_str(volinfo,
+                                                     transport_type_str);
+        if (ret) {
+            gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_STATE_STR_GET_FAILED,
+                   "Failed to get transport type for volume: %s",
+                   volinfo->volname);
+            goto out;
+        }
+
+        ret = glusterd_volume_get_quorum_status_str(volinfo, quorum_status_str);
+        if (ret) {
+            gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_STATE_STR_GET_FAILED,
+                   "Failed to get quorum status for volume: %s",
+                   volinfo->volname);
+            goto out;
+        }
+
+        ret = glusterd_volume_get_rebalance_status_str(volinfo,
+                                                       rebal_status_str);
+        if (ret) {
+            gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_STATE_STR_GET_FAILED,
+                   "Failed to get rebalance status for volume: %s",
+                   volinfo->volname);
+            goto out;
+        }
+
+        fprintf(fp, "Volume%d.name: %s\n", ++count, volinfo->volname);
+
+        uuid_utoa_r(volinfo->volume_id, id_str);
+        fprintf(fp, "Volume%d.id: %s\n", count, id_str);
+
+        fprintf(fp, "Volume%d.type: %s\n", count, vol_type_str);
+        fprintf(fp, "Volume%d.transport_type: %s\n", count, transport_type_str);
+        fprintf(fp, "Volume%d.status: %s\n", count, vol_status_str);
+        fprintf(fp, "Volume%d.profile_enabled: %d\n", count,
+                glusterd_is_profile_on(volinfo));
+        fprintf(fp, "Volume%d.brickcount: %d\n", count, volinfo->brick_count);
+
+        /* Inside the brick loop, count_bkp holds the volume index while
+         * count is reused as the brick index. */
+        count_bkp = count;
+        count = 0;
+        cds_list_for_each_entry(brickinfo, &volinfo->bricks, brick_list)
+        {
+            fprintf(fp, "Volume%d.Brick%d.path: %s:%s\n", count_bkp, ++count,
+                    brickinfo->hostname, brickinfo->path);
+            fprintf(fp, "Volume%d.Brick%d.hostname: %s\n", count_bkp, count,
+                    brickinfo->hostname);
+            /* Determine which one is the arbiter brick */
+            if (volinfo->arbiter_count == 1) {
+                if (count % volinfo->replica_count == 0) {
+                    fprintf(fp,
+                            "Volume%d.Brick%d."
+                            "is_arbiter: 1\n",
+                            count_bkp, count);
+                }
+            }
+            /* Add following information only for bricks
+             *  local to current node */
+            if (gf_uuid_compare(brickinfo->uuid, MY_UUID))
+                continue;
+            fprintf(fp, "Volume%d.Brick%d.port: %d\n", count_bkp, count,
+                    brickinfo->port);
+            fprintf(fp, "Volume%d.Brick%d.rdma_port: %d\n", count_bkp, count,
+                    brickinfo->rdma_port);
+            fprintf(fp, "Volume%d.Brick%d.port_registered: %d\n", count_bkp,
+                    count, brickinfo->port_registered);
+            glusterd_brick_get_status_str(brickinfo, brick_status_str);
+            fprintf(fp, "Volume%d.Brick%d.status: %s\n", count_bkp, count,
+                    brick_status_str);
+
+            /* A statvfs failure is logged and reported as zero space; it
+             * does not abort the dump. */
+            ret = sys_statvfs(brickinfo->path, &brickstat);
+            if (ret) {
+                gf_msg(this->name, GF_LOG_ERROR, errno, GD_MSG_FILE_OP_FAILED,
+                       "statfs error: %s ", strerror(errno));
+                memfree = 0;
+                memtotal = 0;
+            } else {
+                memfree = brickstat.f_bfree * brickstat.f_bsize;
+                memtotal = brickstat.f_blocks * brickstat.f_bsize;
+            }
+
+            fprintf(fp, "Volume%d.Brick%d.spacefree: %" PRIu64 "Bytes\n",
+                    count_bkp, count, memfree);
+            fprintf(fp, "Volume%d.Brick%d.spacetotal: %" PRIu64 "Bytes\n",
+                    count_bkp, count, memtotal);
+
+            if (get_state_cmd != GF_CLI_GET_STATE_DETAIL)
+                continue;
+
+            ret = glusterd_print_client_details(fp, dict, volinfo, count_bkp,
+                                                brickinfo, count);
+            if (ret) {
+                gf_msg(this->name, GF_LOG_ERROR, 0,
+                       GD_MSG_CLIENTS_GET_STATE_FAILED,
+                       "Failed to get client details");
+                goto out;
+            }
+        }
+
+        /* Restore the volume index. */
+        count = count_bkp;
+
+        ret = glusterd_print_snapinfo_by_vol(fp, volinfo, count);
+        if (ret)
+            goto out;
+
+        fprintf(fp, "Volume%d.snap_count: %" PRIu64 "\n", count,
+                volinfo->snap_count);
+        fprintf(fp, "Volume%d.stripe_count: %d\n", count,
+                volinfo->stripe_count);
+        fprintf(fp, "Volume%d.replica_count: %d\n", count,
+                volinfo->replica_count);
+        fprintf(fp, "Volume%d.subvol_count: %d\n", count,
+                volinfo->subvol_count);
+        fprintf(fp, "Volume%d.arbiter_count: %d\n", count,
+                volinfo->arbiter_count);
+        fprintf(fp, "Volume%d.disperse_count: %d\n", count,
+                volinfo->disperse_count);
+        fprintf(fp, "Volume%d.redundancy_count: %d\n", count,
+                volinfo->redundancy_count);
+        fprintf(fp, "Volume%d.quorum_status: %s\n", count, quorum_status_str);
+
+        fprintf(fp, "Volume%d.snapd_svc.online_status: %s\n", count,
+                volinfo->snapd.svc.online ? "Online" : "Offline");
+        fprintf(fp, "Volume%d.snapd_svc.inited: %s\n", count,
+                volinfo->snapd.svc.inited ? "True" : "False");
+
+        uuid_utoa_r(volinfo->rebal.rebalance_id, id_str);
+        char *rebal_data = gf_uint64_2human_readable(
+            volinfo->rebal.rebalance_data);
+
+        fprintf(fp, "Volume%d.rebalance.id: %s\n", count, id_str);
+        fprintf(fp, "Volume%d.rebalance.status: %s\n", count, rebal_status_str);
+        fprintf(fp, "Volume%d.rebalance.failures: %" PRIu64 "\n", count,
+                volinfo->rebal.rebalance_failures);
+        fprintf(fp, "Volume%d.rebalance.skipped: %" PRIu64 "\n", count,
+                volinfo->rebal.skipped_files);
+        fprintf(fp, "Volume%d.rebalance.lookedup: %" PRIu64 "\n", count,
+                volinfo->rebal.lookedup_files);
+        fprintf(fp, "Volume%d.rebalance.files: %" PRIu64 "\n", count,
+                volinfo->rebal.rebalance_files);
+        fprintf(fp, "Volume%d.rebalance.data: %s\n", count, rebal_data);
+        fprintf(fp, "Volume%d.time_left: %" PRIu64 "\n", count,
+                volinfo->rebal.time_left);
+
+        GF_FREE(rebal_data);
+
+        fprintf(fp, "Volume%d.shd_svc.online_status: %s\n", count,
+                volinfo->shd.svc.online ? "Online" : "Offline");
+        fprintf(fp, "Volume%d.shd_svc.inited: %s\n", count,
+                volinfo->shd.svc.inited ? "True" : "False");
+
+        if (volinfo->rep_brick.src_brick && volinfo->rep_brick.dst_brick) {
+            fprintf(fp, "Volume%d.replace_brick.src: %s:%s\n", count,
+                    volinfo->rep_brick.src_brick->hostname,
+                    volinfo->rep_brick.src_brick->path);
+            fprintf(fp, "Volume%d.replace_brick.dest: %s:%s\n", count,
+                    volinfo->rep_brick.dst_brick->hostname,
+                    volinfo->rep_brick.dst_brick->path);
+        }
+
+        volcount = count;
+        ret = glusterd_print_gsync_status_by_vol(fp, volinfo);
+        if (ret)
+            goto out;
+
+        if (volinfo->dict)
+            dict_foreach(volinfo->dict, glusterd_print_volume_options, fp);
+
+        fprintf(fp, "\n");
+    }
+
+    count = 0;
+
+    fprintf(fp, "\n[Services]\n");
+#ifdef BUILD_GNFS
+    if (priv->nfs_svc.inited) {
+        fprintf(fp, "svc%d.name: %s\n", ++count, priv->nfs_svc.name);
+        fprintf(fp, "svc%d.online_status: %s\n\n", count,
+                priv->nfs_svc.online ? "Online" : "Offline");
+    }
+#endif
+    if (priv->bitd_svc.inited) {
+        fprintf(fp, "svc%d.name: %s\n", ++count, priv->bitd_svc.name);
+        fprintf(fp, "svc%d.online_status: %s\n\n", count,
+                priv->bitd_svc.online ? "Online" : "Offline");
+    }
+
+    if (priv->scrub_svc.inited) {
+        fprintf(fp, "svc%d.name: %s\n", ++count, priv->scrub_svc.name);
+        fprintf(fp, "svc%d.online_status: %s\n\n", count,
+                priv->scrub_svc.online ? "Online" : "Offline");
+    }
+
+    if (priv->quotad_svc.inited) {
+        fprintf(fp, "svc%d.name: %s\n", ++count, priv->quotad_svc.name);
+        fprintf(fp, "svc%d.online_status: %s\n\n", count,
+                priv->quotad_svc.online ? "Online" : "Offline");
+    }
+
+    fprintf(fp, "\n[Misc]\n");
+    if (priv->pmap) {
+        fprintf(fp, "Base port: %d\n", priv->pmap->base_port);
+        fprintf(fp, "Last allocated port: %d\n", priv->pmap->last_alloc);
+    }
+out:
+
+    if (fp)
+        fclose(fp);
+
+    /* The outcome of the dump is reported to the CLI through the response;
+     * from here on ret tracks the serialization of the reply dict. */
+    rsp.op_ret = ret;
+    if (rsp.op_errstr == NULL)
+        rsp.op_errstr = err_str;
+
+    ret = dict_allocate_and_serialize(dict, &rsp.dict.dict_val,
+                                      &rsp.dict.dict_len);
+    glusterd_to_cli(req, &rsp, NULL, 0, NULL, (xdrproc_t)xdr_gf_cli_rsp, dict);
+    GF_FREE(rsp.dict.dict_val);
+
+    return ret;
+}
+
+/*
+ * RPC handler body for the CLI "get-state" request.
+ *
+ * Decodes the XDR request, unserializes the request dictionary (when one was
+ * sent) and hands it to glusterd_get_state(), which writes the state file and
+ * sends the reply. Returns the value of the last failing step, or the result
+ * of glusterd_get_state().
+ */
+static int
+__glusterd_handle_get_state(rpcsvc_request_t *req)
+{
+    int32_t ret = -1;
+    gf_cli_req cli_req = {
+        {
+            0,
+        },
+    };
+    dict_t *dict = NULL;
+    char err_str[64] = {
+        0,
+    };
+    xlator_t *this = NULL;
+
+    this = THIS;
+    GF_VALIDATE_OR_GOTO(THIS->name, this, out);
+    GF_VALIDATE_OR_GOTO(this->name, req, out);
+
+    gf_msg(this->name, GF_LOG_INFO, 0, GD_MSG_DAEMON_STATE_REQ_RCVD,
+           "Received request to get state for glusterd");
+
+    ret = xdr_to_generic(req->msg[0], &cli_req, (xdrproc_t)xdr_gf_cli_req);
+    if (ret < 0) {
+        snprintf(err_str, sizeof(err_str),
+                 "Failed to decode "
+                 "request received from cli");
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_REQ_DECODE_FAIL, "%s",
+               err_str);
+        req->rpc_err = GARBAGE_ARGS;
+        goto out;
+    }
+
+    if (cli_req.dict.dict_len) {
+        /* Unserialize the dictionary */
+        dict = dict_new();
+        if (!dict) {
+            gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_DICT_UNSERIALIZE_FAIL,
+                   "failed to "
+                   "unserialize req-buffer to dictionary");
+            snprintf(err_str, sizeof(err_str),
+                     "Unable to decode"
+                     " the command");
+            ret = -1;
+            goto out;
+        }
+
+        /* Hand the decoded buffer to the dict before unserializing, so that
+         * it is released together with the dict on the failure path as well
+         * (previously it was only attached on success and leaked otherwise).
+         */
+        dict->extra_stdfree = cli_req.dict.dict_val;
+
+        ret = dict_unserialize(cli_req.dict.dict_val, cli_req.dict.dict_len,
+                               &dict);
+        if (ret < 0) {
+            gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_DICT_UNSERIALIZE_FAIL,
+                   "failed to "
+                   "unserialize req-buffer to dictionary");
+            snprintf(err_str, sizeof(err_str),
+                     "Unable to decode"
+                     " the command");
+            goto out;
+        }
+    }
+
+    ret = glusterd_get_state(req, dict);
+
+out:
+    if (dict && ret) {
+        /*
+         * When glusterd_to_cli (called from glusterd_get_state)
+         * succeeds, it frees the dict for us, so this would be a
+         * double free, but in other cases it's our responsibility.
+         */
+        dict_unref(dict);
+    }
+    return ret;
+}
+
+/*
+ * Public entry point for the "get-state" request: runs
+ * __glusterd_handle_get_state through glusterd_big_locked_handler and
+ * returns whatever that wrapper returns.
+ */
+int
+glusterd_handle_get_state(rpcsvc_request_t *req)
+{
+    int op_ret = glusterd_big_locked_handler(req, __glusterd_handle_get_state);
+
+    return op_ret;
+}
+
+/*
+ * Resolve a brick id of the form "<volume-uuid>:<brick>" to its brickinfo.
+ *
+ * The volume is looked up by uuid among regular volumes first and snapshot
+ * volumes second; the brick is then looked up inside that volume.
+ *
+ * @brickid:   brick id string; it is duplicated, the caller's copy is not
+ *             modified.
+ * @brickinfo: out parameter filled by glusterd_volume_brickinfo_get_by_brick.
+ *
+ * Returns 0 on success, non-zero otherwise.
+ */
+static int
+get_brickinfo_from_brickid(char *brickid, glusterd_brickinfo_t **brickinfo)
+{
+    glusterd_volinfo_t *volinfo = NULL;
+    char *volid_str = NULL;
+    char *brick = NULL;
+    char *brickid_dup = NULL;
+    uuid_t volid = {0};
+    int ret = -1;
+
+    xlator_t *this = THIS;
+    GF_ASSERT(this);
+
+    brickid_dup = gf_strdup(brickid);
+    if (!brickid_dup) {
+        gf_smsg(this->name, GF_LOG_ERROR, errno, GD_MSG_STRDUP_FAILED,
+                "brick_id=%s", brickid, NULL);
+        goto out;
+    }
+
+    /* Split the copy in place at the first ':' into uuid and brick parts. */
+    volid_str = brickid_dup;
+    brick = strchr(brickid_dup, ':');
+    if (!brick) {
+        /* strchr() does not set errno, so none is reported here. */
+        gf_smsg(this->name, GF_LOG_ERROR, 0, GD_MSG_STRCHR_FAIL, NULL);
+        goto out;
+    }
+
+    *brick = '\0';
+    brick++;
+
+    /* Do not look up a volume with a uuid that failed to parse. */
+    if (gf_uuid_parse(volid_str, volid)) {
+        gf_msg_debug(this->name, 0, "Invalid volume id in brick id %s",
+                     brickid);
+        ret = -1;
+        goto out;
+    }
+
+    ret = glusterd_volinfo_find_by_volume_id(volid, &volinfo);
+    if (ret) {
+        /* Check if it a snapshot volume */
+        ret = glusterd_snap_volinfo_find_by_volume_id(volid, &volinfo);
+        if (ret)
+            goto out;
+    }
+
+    ret = glusterd_volume_brickinfo_get_by_brick(brick, volinfo, brickinfo,
+                                                 _gf_false);
+    if (ret)
+        goto out;
+
+    ret = 0;
+out:
+    GF_FREE(brickid_dup);
+    return ret;
+}
+
+/* Counter handed to GF_LOG_OCCASIONALLY for the "disconnect from stale rpc"
+ * warning emitted by __glusterd_brick_rpc_notify. */
+static int gd_stale_rpc_disconnect_log;
+
+/*
+ * RPC notification callback for the glusterd <-> brick connection.
+ *
+ * @rpc:    the rpc client the event was raised on.
+ * @mydata: the brick id string registered with the rpc client; released on
+ *          RPC_CLNT_DESTROY.
+ * @event:  the rpc client event being delivered.
+ * @data:   unused here.
+ *
+ * CONNECT marks the brick as started (or stops it again when a snapshot is
+ * pending on it); DISCONNECT marks the brick (or, with brick multiplexing,
+ * every brick of the same process) as stopped and cleans up port-map entries.
+ *
+ * Returns 0, or the error of the step that failed.
+ */
+int
+__glusterd_brick_rpc_notify(struct rpc_clnt *rpc, void *mydata,
+                            rpc_clnt_event_t event, void *data)
+{
+    char *brickid = NULL;
+    int ret = 0;
+    glusterd_conf_t *conf = NULL;
+    glusterd_brickinfo_t *brickinfo = NULL;
+    glusterd_volinfo_t *volinfo = NULL;
+    xlator_t *this = NULL;
+    int32_t pid = -1;
+    glusterd_brickinfo_t *brickinfo_tmp = NULL;
+    glusterd_brick_proc_t *brick_proc = NULL;
+    char pidfile[PATH_MAX] = {0};
+    char *brickpath = NULL;
+    gf_boolean_t is_service_running = _gf_true;
+
+    brickid = mydata;
+    if (!brickid)
+        return 0;
+
+    /* Release the brick id on DESTROY before any lookup: the brick may
+     * already be gone by then, and bailing out on a failed lookup first
+     * would leak the string. */
+    if (event == RPC_CLNT_DESTROY) {
+        GF_FREE(mydata);
+        mydata = NULL;
+        return 0;
+    }
+
+    ret = get_brickinfo_from_brickid(brickid, &brickinfo);
+    if (ret)
+        return 0;
+
+    this = THIS;
+    GF_ASSERT(this);
+    conf = this->private;
+    GF_ASSERT(conf);
+
+    switch (event) {
+        case RPC_CLNT_CONNECT:
+            ret = get_volinfo_from_brickid(brickid, &volinfo);
+            if (ret) {
+                gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_VOLINFO_GET_FAIL,
+                       "Failed to get volinfo from "
+                       "brickid(%s)",
+                       brickid);
+                goto out;
+            }
+            /* If a node on coming back up, already starts a brick
+             * before the handshake, and the notification comes after
+             * the handshake is done, then we need to check if this
+             * is a restored brick with a snapshot pending. If so, we
+             * need to stop the brick
+             */
+            if (brickinfo->snap_status == -1) {
+                gf_msg(this->name, GF_LOG_INFO, 0, GD_MSG_SNAPSHOT_PENDING,
+                       "Snapshot is pending on %s:%s. "
+                       "Hence not starting the brick",
+                       brickinfo->hostname, brickinfo->path);
+                ret = glusterd_brick_stop(volinfo, brickinfo, _gf_false);
+                if (ret) {
+                    gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_BRICK_STOP_FAIL,
+                           "Unable to stop %s:%s", brickinfo->hostname,
+                           brickinfo->path);
+                    goto out;
+                }
+
+                break;
+            }
+            gf_msg_debug(this->name, 0, "Connected to %s:%s",
+                         brickinfo->hostname, brickinfo->path);
+
+            glusterd_set_brick_status(brickinfo, GF_BRICK_STARTED);
+
+            gf_event(EVENT_BRICK_CONNECTED, "peer=%s;volume=%s;brick=%s",
+                     brickinfo->hostname, volinfo->volname, brickinfo->path);
+
+            ret = default_notify(this, GF_EVENT_CHILD_UP, NULL);
+
+            break;
+
+        case RPC_CLNT_DISCONNECT:
+            if (rpc != brickinfo->rpc) {
+                /*
+                 * There used to be a bunch of races in the volume
+                 * start/stop code that could result in us getting here
+                 * and setting the brick status incorrectly.  Many of
+                 * those have been fixed or avoided, but just in case
+                 * any are still left it doesn't hurt to keep the extra
+                 * check and avoid further damage.
+                 */
+                GF_LOG_OCCASIONALLY(gd_stale_rpc_disconnect_log, this->name,
+                                    GF_LOG_WARNING,
+                                    "got disconnect from stale rpc on "
+                                    "%s",
+                                    brickinfo->path);
+                break;
+            }
+            if (glusterd_is_brick_started(brickinfo)) {
+                gf_msg(this->name, GF_LOG_INFO, 0, GD_MSG_BRICK_DISCONNECTED,
+                       "Brick %s:%s has disconnected from glusterd.",
+                       brickinfo->hostname, brickinfo->path);
+
+                ret = get_volinfo_from_brickid(brickid, &volinfo);
+                if (ret) {
+                    gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_VOLINFO_GET_FAIL,
+                           "Failed to get volinfo from "
+                           "brickid(%s)",
+                           brickid);
+                    goto out;
+                }
+                gf_event(EVENT_BRICK_DISCONNECTED, "peer=%s;volume=%s;brick=%s",
+                         brickinfo->hostname, volinfo->volname,
+                         brickinfo->path);
+                /* In case of an abrupt shutdown of a brick PMAP_SIGNOUT
+                 * event is not received by glusterd which can lead to a
+                 * stale port entry in glusterd, so forcibly clean up
+                 * the same if the process is not running sometime
+                 * gf_is_service_running true so to ensure about brick instance
+                 * call search_brick_path_from_proc
+                 */
+                GLUSTERD_GET_BRICK_PIDFILE(pidfile, volinfo, brickinfo, conf);
+                is_service_running = gf_is_service_running(pidfile, &pid);
+                if (pid > 0)
+                    brickpath = search_brick_path_from_proc(pid,
+                                                            brickinfo->path);
+                if (!is_service_running || !brickpath) {
+                    ret = pmap_registry_remove(
+                        THIS, brickinfo->port, brickinfo->path,
+                        GF_PMAP_PORT_BRICKSERVER, NULL, _gf_true);
+                    if (ret) {
+                        /* gf_msg takes the errnum before the message id. */
+                        gf_msg(this->name, GF_LOG_WARNING, 0,
+                               GD_MSG_PMAP_REGISTRY_REMOVE_FAIL,
+                               "Failed to remove pmap "
+                               "registry for port %d for "
+                               "brick %s",
+                               brickinfo->port, brickinfo->path);
+                        /* Best effort only: a failed cleanup is not an
+                         * error for the notification itself. */
+                        ret = 0;
+                    }
+                }
+            }
+
+            if (brickpath)
+                GF_FREE(brickpath);
+
+            if (is_brick_mx_enabled() && glusterd_is_brick_started(brickinfo)) {
+                brick_proc = brickinfo->brick_proc;
+                if (!brick_proc)
+                    break;
+                cds_list_for_each_entry(brickinfo_tmp, &brick_proc->bricks,
+                                        mux_bricks)
+                {
+                    glusterd_set_brick_status(brickinfo_tmp, GF_BRICK_STOPPED);
+                    brickinfo_tmp->start_triggered = _gf_false;
+                    /* When bricks are stopped, ports also need to
+                     * be cleaned up
+                     */
+                    pmap_registry_remove(
+                        THIS, brickinfo_tmp->port, brickinfo_tmp->path,
+                        GF_PMAP_PORT_BRICKSERVER, NULL, _gf_true);
+                }
+            } else {
+                glusterd_set_brick_status(brickinfo, GF_BRICK_STOPPED);
+                brickinfo->start_triggered = _gf_false;
+            }
+            break;
+
+        default:
+            gf_msg_trace(this->name, 0, "got some other RPC event %d", event);
+            break;
+    }
+
+out:
+    return ret;
+}
+
+/*
+ * Public brick rpc notification callback: forwards all arguments to
+ * __glusterd_brick_rpc_notify through glusterd_big_locked_notify and returns
+ * that wrapper's result.
+ */
+int
+glusterd_brick_rpc_notify(struct rpc_clnt *rpc, void *mydata,
+                          rpc_clnt_event_t event, void *data)
+{
+    int op_ret = glusterd_big_locked_notify(rpc, mydata, event, data,
+                                            __glusterd_brick_rpc_notify);
+
+    return op_ret;
+}
+
+/*
+ * Answer a pending CLI probe request with a failure and inject a
+ * GD_FRIEND_EVENT_REMOVE_FRIEND event for the peer behind @peerctx.
+ *
+ * @peerctx:  peer context; the peer is looked up by its generation number.
+ * @op_errno: error number forwarded in the probe response.
+ *
+ * Returns 0 when the peer no longer exists or no request is pending,
+ * otherwise the result of creating/injecting the state-machine event.
+ * The peer lookup and all uses of peerinfo happen under the RCU read lock.
+ */
+int
+glusterd_friend_remove_notify(glusterd_peerctx_t *peerctx, int32_t op_errno)
+{
+    int ret = -1;
+    glusterd_friend_sm_event_t *new_event = NULL;
+    glusterd_peerinfo_t *peerinfo = NULL;
+    rpcsvc_request_t *req = NULL;
+    char *errstr = NULL;
+    dict_t *dict = NULL;
+
+    GF_ASSERT(peerctx);
+
+    RCU_READ_LOCK;
+    peerinfo = glusterd_peerinfo_find_by_generation(peerctx->peerinfo_gen);
+    if (!peerinfo) {
+        gf_msg_debug(THIS->name, 0,
+                     "Could not find peer %s(%s). "
+                     "Peer could have been deleted.",
+                     peerctx->peername, uuid_utoa(peerctx->peerid));
+        ret = 0;
+        goto out;
+    }
+
+    req = peerctx->args.req;
+    dict = peerctx->args.dict;
+    errstr = peerctx->errstr;
+
+    ret = glusterd_friend_sm_new_event(GD_FRIEND_EVENT_REMOVE_FRIEND,
+                                       &new_event);
+    if (!ret) {
+        if (!req) {
+            gf_msg(THIS->name, GF_LOG_WARNING, 0, GD_MSG_EVENT_NEW_GET_FAIL,
+                   "Unable to find the request for responding "
+                   "to User (%s)",
+                   peerinfo->hostname);
+            /* The event is never injected on this path, so nothing else
+             * will release it. */
+            GF_FREE(new_event);
+            new_event = NULL;
+            goto out;
+        }
+
+        glusterd_xfer_cli_probe_resp(req, -1, op_errno, errstr,
+                                     peerinfo->hostname, peerinfo->port, dict);
+
+        new_event->peername = gf_strdup(peerinfo->hostname);
+        gf_uuid_copy(new_event->peerid, peerinfo->uuid);
+        ret = glusterd_friend_sm_inject_event(new_event);
+
+    } else {
+        gf_msg("glusterd", GF_LOG_ERROR, 0, GD_MSG_EVENT_INJECT_FAIL,
+               "Unable to create event for removing peer %s",
+               peerinfo->hostname);
+    }
+
+out:
+    RCU_READ_UNLOCK;
+    return ret;
+}
+
+int
+__glusterd_peer_rpc_notify(struct rpc_clnt *rpc, void *mydata,
+                           rpc_clnt_event_t event, void *data)
+{
+    xlator_t *this = NULL;
+    glusterd_conf_t *conf = NULL;
+    int ret = 0;
+    int32_t op_errno = ENOTCONN;
+    glusterd_peerinfo_t *peerinfo = NULL;
+    glusterd_peerctx_t *peerctx = NULL;
+    gf_boolean_t quorum_action = _gf_false;
+    glusterd_volinfo_t *volinfo = NULL;
+    glusterfs_ctx_t *ctx = NULL;
+
+    uuid_t uuid;
+
+    peerctx = mydata;
+    if (!peerctx)
+        return 0;
+
+    this = THIS;
+    conf = this->private;
+
+    switch (event) {
+        case RPC_CLNT_DESTROY:
+            GF_FREE(peerctx->errstr);
+            GF_FREE(peerctx->peername);
+            GF_FREE(peerctx);
+            return 0;
+        case RPC_CLNT_PING:
+            return 0;
+        default:
+            break;
+    }
+    ctx = this->ctx;
+    GF_VALIDATE_OR_GOTO(this->name, ctx, out);
+    if (ctx->cleanup_started) {
+        gf_log(this->name, GF_LOG_INFO,
+               "glusterd already received a SIGTERM, "
+               "dropping the event %d for peer %s",
+               event, peerctx->peername);
+        return 0;
+    }
+    RCU_READ_LOCK;
+
+    peerinfo = glusterd_peerinfo_find_by_generation(peerctx->peerinfo_gen);
+    if (!peerinfo) {
+        /* Peerinfo should be available at this point if its a connect
+         * event. Not finding it means that something terrible has
+         * happened. For non-connect event we might end up having a null
+         * peerinfo, so log at debug level.
+         */
+        gf_msg(THIS->name,
+               (RPC_CLNT_CONNECT == event) ? GF_LOG_CRITICAL : GF_LOG_DEBUG,
+               ENOENT, GD_MSG_PEER_NOT_FOUND,
+               "Could not find peer "
+               "%s(%s)",
+               peerctx->peername, uuid_utoa(peerctx->peerid));
+
+        if (RPC_CLNT_CONNECT == event) {
+            gf_event(EVENT_PEER_NOT_FOUND, "peer=%s;uuid=%s", peerctx->peername,
+                     uuid_utoa(peerctx->peerid));
+        }
+        ret = -1;
+        goto out;
+    }
+
+    switch (event) {
+        case RPC_CLNT_CONNECT: {
+            gf_msg_debug(this->name, 0, "got RPC_CLNT_CONNECT");
+            peerinfo->connected = 1;
+            peerinfo->quorum_action = _gf_true;
+            peerinfo->generation = uatomic_add_return(&conf->generation, 1);
+            peerctx->peerinfo_gen = peerinfo->generation;
+            /* EVENT_PEER_CONNECT will only be sent if peerctx->uuid is not
+             * NULL, otherwise it indicates this RPC_CLNT_CONNECT is from a
+             * peer probe trigger and given we already generate an event for
+             * peer probe this would be unnecessary.
+             */
+            if (!gf_uuid_is_null(peerinfo->uuid)) {
+                gf_event(EVENT_PEER_CONNECT, "host=%s;uuid=%s",
+                         peerinfo->hostname, uuid_utoa(peerinfo->uuid));
+            }
+            ret = glusterd_peer_dump_version(this, rpc, peerctx);
+            if (ret)
+                gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_HANDSHAKE_FAILED,
+                       "glusterd handshake failed");
+            break;
+        }
+
+        case RPC_CLNT_DISCONNECT: {
+            /* If DISCONNECT event is already processed, skip the further
+             * ones
+             */
+            if (is_rpc_clnt_disconnected(&rpc->conn))
+                break;
+
+            gf_msg(this->name, GF_LOG_INFO, 0, GD_MSG_PEER_DISCONNECTED,
+                   "Peer <%s> (<%s>), in state <%s>, has disconnected "
+                   "from glusterd.",
+                   peerinfo->hostname, uuid_utoa(peerinfo->uuid),
+                   glusterd_friend_sm_state_name_get(peerinfo->state.state));
+            gf_event(EVENT_PEER_DISCONNECT, "peer=%s;uuid=%s;state=%s",
+                     peerinfo->hostname, uuid_utoa(peerinfo->uuid),
+                     glusterd_friend_sm_state_name_get(peerinfo->state.state));
+
+            if (peerinfo->connected) {
+                if (conf->op_version < GD_OP_VERSION_3_6_0) {
+                    glusterd_get_lock_owner(&uuid);
+                    if (!gf_uuid_is_null(uuid) &&
+                        !gf_uuid_compare(peerinfo->uuid, uuid))
+                        glusterd_unlock(peerinfo->uuid);
+                } else {
+                    cds_list_for_each_entry(volinfo, &conf->volumes, vol_list)
+                    {
+                        ret = glusterd_mgmt_v3_unlock(volinfo->volname,
+                                                      peerinfo->uuid, "vol");
+                        if (ret)
+                            gf_msg(this->name, GF_LOG_WARNING, 0,
+                                   GD_MSG_MGMTV3_UNLOCK_FAIL,
+                                   "Lock not released "
+                                   "for %s",
+                                   volinfo->volname);
+                    }
+                }
+
+                op_errno = GF_PROBE_ANOTHER_CLUSTER;
+                ret = 0;
+            }
+
+            if ((peerinfo->quorum_contrib != QUORUM_DOWN) &&
+                (peerinfo->state.state == GD_FRIEND_STATE_BEFRIENDED)) {
+                peerinfo->quorum_contrib = QUORUM_DOWN;
+                quorum_action = _gf_true;
+                peerinfo->quorum_action = _gf_false;
+            }
+
+            /* Remove peer if it is not a friend and connection/handshake
+             *  fails, and notify cli. Happens only during probe.
+             */
+            if (peerinfo->state.state == GD_FRIEND_STATE_DEFAULT) {
+                glusterd_friend_remove_notify(peerctx, op_errno);
+                goto out;
+            }
+
+            peerinfo->connected = 0;
+            break;
+        }
+
+        default:
+            gf_msg_trace(this->name, 0, "got some other RPC event %d", event);
+            ret = 0;
+            break;
+    }
+
+out:
+    RCU_READ_UNLOCK;
+
+    glusterd_friend_sm();
+    glusterd_op_sm();
+    if (quorum_action)
+        glusterd_do_quorum_action();
+    return ret;
+}
+/* Runs __glusterd_peer_rpc_notify() through glusterd_big_locked_notify(). */
+int
+glusterd_peer_rpc_notify(struct rpc_clnt *rpc, void *mydata,
+                         rpc_clnt_event_t event, void *data)
+{
+    return glusterd_big_locked_notify(rpc, mydata, event, data,
+                                      __glusterd_peer_rpc_notify);
+}
+
+int
+glusterd_null(rpcsvc_request_t *req)
+{
+    return 0; /* no work to do; used as the actor of the "NULL" procedures */
+}
+
+/* Actor table of the glusterd management RPC program, indexed by
+ * GLUSTERD_MGMT_* procedure number. Each entry holds, in order: the
+ * procedure name, its handler, NULL, the procedure number again, DRC_NA
+ * and 0; what each position means is defined by rpcsvc_actor_t.
+ */
+static rpcsvc_actor_t gd_svc_mgmt_actors[GLUSTERD_MGMT_MAXVALUE] = {
+    [GLUSTERD_MGMT_NULL] =
+        {"NULL", glusterd_null, NULL, GLUSTERD_MGMT_NULL, DRC_NA, 0},
+    [GLUSTERD_MGMT_CLUSTER_LOCK] =
+        {"CLUSTER_LOCK", glusterd_handle_cluster_lock, NULL,
+         GLUSTERD_MGMT_CLUSTER_LOCK, DRC_NA, 0},
+    [GLUSTERD_MGMT_CLUSTER_UNLOCK] =
+        {"CLUSTER_UNLOCK", glusterd_handle_cluster_unlock, NULL,
+         GLUSTERD_MGMT_CLUSTER_UNLOCK, DRC_NA, 0},
+    [GLUSTERD_MGMT_STAGE_OP] =
+        {"STAGE_OP", glusterd_handle_stage_op, NULL, GLUSTERD_MGMT_STAGE_OP,
+         DRC_NA, 0},
+    [GLUSTERD_MGMT_COMMIT_OP] =
+        {"COMMIT_OP", glusterd_handle_commit_op, NULL, GLUSTERD_MGMT_COMMIT_OP,
+         DRC_NA, 0},
+};
+
+struct rpcsvc_program gd_svc_mgmt_prog = { /* mgmt RPC program descriptor */
+    .progname = "GlusterD svc mgmt",
+    .prognum = GD_MGMT_PROGRAM,
+    .progver = GD_MGMT_VERSION,
+    .numactors = GLUSTERD_MGMT_MAXVALUE, /* same bound as the actor array */
+    .actors = gd_svc_mgmt_actors,
+    .synctask = _gf_true, /* NOTE(review): effect defined by rpcsvc; confirm */
+};
+/* Actor table of the peer RPC program, indexed by its procedure constants. */
+static rpcsvc_actor_t gd_svc_peer_actors[GLUSTERD_FRIEND_MAXVALUE] = {
+    [GLUSTERD_FRIEND_NULL] = {"NULL", glusterd_null, NULL,
+                              GLUSTERD_FRIEND_NULL, DRC_NA, 0},
+    [GLUSTERD_PROBE_QUERY] = {"PROBE_QUERY", glusterd_handle_probe_query, NULL,
+                              GLUSTERD_PROBE_QUERY, DRC_NA, 0},
+    [GLUSTERD_FRIEND_ADD] = {"FRIEND_ADD", glusterd_handle_incoming_friend_req,
+                             NULL, GLUSTERD_FRIEND_ADD, DRC_NA, 0},
+    [GLUSTERD_FRIEND_REMOVE] = {"FRIEND_REMOVE",
+                                glusterd_handle_incoming_unfriend_req, NULL,
+                                GLUSTERD_FRIEND_REMOVE, DRC_NA, 0},
+    [GLUSTERD_FRIEND_UPDATE] = {"FRIEND_UPDATE", glusterd_handle_friend_update,
+                                NULL, GLUSTERD_FRIEND_UPDATE, DRC_NA, 0},
+};
+
+struct rpcsvc_program gd_svc_peer_prog = { /* peer RPC program descriptor */
+    .progname = "GlusterD svc peer",
+    .prognum = GD_FRIEND_PROGRAM,
+    .progver = GD_FRIEND_VERSION,
+    .numactors = GLUSTERD_FRIEND_MAXVALUE, /* same bound as the actor array */
+    .actors = gd_svc_peer_actors,
+    .synctask = _gf_false, /* the mgmt and cli programs set this to true */
+};
+
+/* Actor table of the CLI RPC program, indexed by GLUSTER_CLI_* procedure.
+ * Every entry lists: name, handler, NULL, its own procedure number, DRC_NA
+ * and a trailing 0/1 flag (1 only for GETWD, MOUNT and UMOUNT). */
+static rpcsvc_actor_t gd_svc_cli_actors[GLUSTER_CLI_MAXVALUE] = {
+    [GLUSTER_CLI_PROBE] = {"CLI_PROBE", glusterd_handle_cli_probe, NULL,
+                           GLUSTER_CLI_PROBE, DRC_NA, 0},
+    [GLUSTER_CLI_CREATE_VOLUME] = {"CLI_CREATE_VOLUME",
+                                   glusterd_handle_create_volume, NULL,
+                                   GLUSTER_CLI_CREATE_VOLUME, DRC_NA, 0},
+    [GLUSTER_CLI_DEFRAG_VOLUME] = {"CLI_DEFRAG_VOLUME",
+                                   glusterd_handle_defrag_volume, NULL,
+                                   GLUSTER_CLI_DEFRAG_VOLUME, DRC_NA, 0},
+    [GLUSTER_CLI_DEPROBE] = {"FRIEND_REMOVE", glusterd_handle_cli_deprobe, NULL,
+                             GLUSTER_CLI_DEPROBE, DRC_NA, 0},
+    [GLUSTER_CLI_LIST_FRIENDS] = {"LIST_FRIENDS",
+                                  glusterd_handle_cli_list_friends, NULL,
+                                  GLUSTER_CLI_LIST_FRIENDS, DRC_NA, 0},
+    [GLUSTER_CLI_UUID_RESET] = {"UUID_RESET", glusterd_handle_cli_uuid_reset,
+                                NULL, GLUSTER_CLI_UUID_RESET, DRC_NA, 0},
+    [GLUSTER_CLI_UUID_GET] = {"UUID_GET", glusterd_handle_cli_uuid_get, NULL,
+                              GLUSTER_CLI_UUID_GET, DRC_NA, 0},
+    [GLUSTER_CLI_START_VOLUME] = {"START_VOLUME",
+                                  glusterd_handle_cli_start_volume, NULL,
+                                  GLUSTER_CLI_START_VOLUME, DRC_NA, 0},
+    [GLUSTER_CLI_STOP_VOLUME] = {"STOP_VOLUME", glusterd_handle_cli_stop_volume,
+                                 NULL, GLUSTER_CLI_STOP_VOLUME, DRC_NA, 0},
+    [GLUSTER_CLI_DELETE_VOLUME] = {"DELETE_VOLUME",
+                                   glusterd_handle_cli_delete_volume, NULL,
+                                   GLUSTER_CLI_DELETE_VOLUME, DRC_NA, 0},
+    [GLUSTER_CLI_GET_VOLUME] = {"GET_VOLUME", glusterd_handle_cli_get_volume,
+                                NULL, GLUSTER_CLI_GET_VOLUME, DRC_NA, 0},
+    [GLUSTER_CLI_ADD_BRICK] = {"ADD_BRICK", glusterd_handle_add_brick, NULL,
+                               GLUSTER_CLI_ADD_BRICK, DRC_NA, 0},
+    [GLUSTER_CLI_ATTACH_TIER] = {"ATTACH_TIER", glusterd_handle_attach_tier,
+                                 NULL, GLUSTER_CLI_ATTACH_TIER, DRC_NA, 0},
+    [GLUSTER_CLI_REPLACE_BRICK] = {"REPLACE_BRICK",
+                                   glusterd_handle_replace_brick, NULL,
+                                   GLUSTER_CLI_REPLACE_BRICK, DRC_NA, 0},
+    [GLUSTER_CLI_REMOVE_BRICK] = {"REMOVE_BRICK", glusterd_handle_remove_brick,
+                                  NULL, GLUSTER_CLI_REMOVE_BRICK, DRC_NA, 0},
+    [GLUSTER_CLI_LOG_ROTATE] = {"LOG FILENAME", glusterd_handle_log_rotate,
+                                NULL, GLUSTER_CLI_LOG_ROTATE, DRC_NA, 0},
+    [GLUSTER_CLI_SET_VOLUME] = {"SET_VOLUME", glusterd_handle_set_volume, NULL,
+                                GLUSTER_CLI_SET_VOLUME, DRC_NA, 0},
+    [GLUSTER_CLI_SYNC_VOLUME] = {"SYNC_VOLUME", glusterd_handle_sync_volume,
+                                 NULL, GLUSTER_CLI_SYNC_VOLUME, DRC_NA, 0},
+    [GLUSTER_CLI_RESET_VOLUME] = {"RESET_VOLUME", glusterd_handle_reset_volume,
+                                  NULL, GLUSTER_CLI_RESET_VOLUME, DRC_NA, 0},
+    [GLUSTER_CLI_FSM_LOG] = {"FSM_LOG", glusterd_handle_fsm_log, NULL,
+                             GLUSTER_CLI_FSM_LOG, DRC_NA, 0},
+    [GLUSTER_CLI_GSYNC_SET] = {"GSYNC_SET", glusterd_handle_gsync_set, NULL,
+                               GLUSTER_CLI_GSYNC_SET, DRC_NA, 0},
+    [GLUSTER_CLI_PROFILE_VOLUME] = {"STATS_VOLUME",
+                                    glusterd_handle_cli_profile_volume, NULL,
+                                    GLUSTER_CLI_PROFILE_VOLUME, DRC_NA, 0},
+    [GLUSTER_CLI_QUOTA] = {"QUOTA", glusterd_handle_quota, NULL,
+                           GLUSTER_CLI_QUOTA, DRC_NA, 0},
+    [GLUSTER_CLI_GETWD] = {"GETWD", glusterd_handle_getwd, NULL,
+                           GLUSTER_CLI_GETWD, DRC_NA, 1},
+    [GLUSTER_CLI_STATUS_VOLUME] = {"STATUS_VOLUME",
+                                   glusterd_handle_status_volume, NULL,
+                                   GLUSTER_CLI_STATUS_VOLUME, DRC_NA, 0},
+    [GLUSTER_CLI_MOUNT] = {"MOUNT", glusterd_handle_mount, NULL,
+                           GLUSTER_CLI_MOUNT, DRC_NA, 1},
+    [GLUSTER_CLI_UMOUNT] = {"UMOUNT", glusterd_handle_umount, NULL,
+                            GLUSTER_CLI_UMOUNT, DRC_NA, 1},
+    [GLUSTER_CLI_HEAL_VOLUME] = {"HEAL_VOLUME", glusterd_handle_cli_heal_volume,
+                                 NULL, GLUSTER_CLI_HEAL_VOLUME, DRC_NA, 0},
+    [GLUSTER_CLI_STATEDUMP_VOLUME] =
+        {"STATEDUMP_VOLUME", glusterd_handle_cli_statedump_volume, NULL,
+         GLUSTER_CLI_STATEDUMP_VOLUME, DRC_NA, 0},
+    [GLUSTER_CLI_LIST_VOLUME] = {"LIST_VOLUME", glusterd_handle_cli_list_volume,
+                                 NULL, GLUSTER_CLI_LIST_VOLUME, DRC_NA, 0},
+    [GLUSTER_CLI_CLRLOCKS_VOLUME] =
+        {"CLEARLOCKS_VOLUME", glusterd_handle_cli_clearlocks_volume, NULL,
+         GLUSTER_CLI_CLRLOCKS_VOLUME, DRC_NA, 0},
+    [GLUSTER_CLI_COPY_FILE] = {"COPY_FILE", glusterd_handle_copy_file, NULL,
+                               GLUSTER_CLI_COPY_FILE, DRC_NA, 0},
+    [GLUSTER_CLI_SYS_EXEC] = {"SYS_EXEC", glusterd_handle_sys_exec, NULL,
+                              GLUSTER_CLI_SYS_EXEC, DRC_NA, 0},
+    [GLUSTER_CLI_SNAP] = {"SNAP", glusterd_handle_snapshot, NULL,
+                          GLUSTER_CLI_SNAP, DRC_NA, 0},
+    [GLUSTER_CLI_BARRIER_VOLUME] = {"BARRIER_VOLUME", glusterd_handle_barrier,
+                                    NULL, GLUSTER_CLI_BARRIER_VOLUME, DRC_NA,
+                                    0},
+    [GLUSTER_CLI_GANESHA] = {"GANESHA", glusterd_handle_ganesha_cmd, NULL,
+                             GLUSTER_CLI_GANESHA, DRC_NA, 0},
+    [GLUSTER_CLI_GET_VOL_OPT] = {"GET_VOL_OPT", glusterd_handle_get_vol_opt,
+                                 NULL, GLUSTER_CLI_GET_VOL_OPT, DRC_NA, 0},
+    [GLUSTER_CLI_BITROT] = {"BITROT", glusterd_handle_bitrot, NULL,
+                            GLUSTER_CLI_BITROT, DRC_NA, 0},
+    [GLUSTER_CLI_GET_STATE] = {"GET_STATE", glusterd_handle_get_state, NULL,
+                               GLUSTER_CLI_GET_STATE, DRC_NA, 0},
+    [GLUSTER_CLI_RESET_BRICK] = {"RESET_BRICK", glusterd_handle_reset_brick,
+                                 NULL, GLUSTER_CLI_RESET_BRICK, DRC_NA, 0},
+    [GLUSTER_CLI_TIER] = {"TIER", glusterd_handle_tier, NULL, GLUSTER_CLI_TIER,
+                          DRC_NA, 0},
+    [GLUSTER_CLI_REMOVE_TIER_BRICK] =
+        {"REMOVE_TIER_BRICK", glusterd_handle_tier, NULL,
+         GLUSTER_CLI_REMOVE_TIER_BRICK, DRC_NA, 0},
+    [GLUSTER_CLI_ADD_TIER_BRICK] = {"ADD_TIER_BRICK",
+                                    glusterd_handle_add_tier_brick, NULL,
+                                    GLUSTER_CLI_ADD_TIER_BRICK, DRC_NA, 0},
+};
+
+struct rpcsvc_program gd_svc_cli_prog = { /* CLI RPC program descriptor */
+    .progname = "GlusterD svc cli",
+    .prognum = GLUSTER_CLI_PROGRAM,
+    .progver = GLUSTER_CLI_VERSION,
+    .numactors = GLUSTER_CLI_MAXVALUE, /* same bound as the actor array */
+    .actors = gd_svc_cli_actors,
+    .synctask = _gf_true, /* NOTE(review): effect defined by rpcsvc; confirm */
+};
+
+/**
+ * Actors of the "trusted" CLI program: read-only queries plus MOUNT/UMOUNT,
+ * which geo-replication needs for unprivileged master -> slave sessions.
+ * NOTE(review): the trailing 1 on GETWD/MOUNT/UMOUNT presumably marks them
+ * as allowed for unprivileged callers - confirm against rpcsvc_actor_t. */
+static rpcsvc_actor_t gd_svc_cli_trusted_actors[GLUSTER_CLI_MAXVALUE] = {
+    [GLUSTER_CLI_LIST_FRIENDS] = {"LIST_FRIENDS",
+                                  glusterd_handle_cli_list_friends, NULL,
+                                  GLUSTER_CLI_LIST_FRIENDS, DRC_NA, 0},
+    [GLUSTER_CLI_UUID_GET] = {"UUID_GET", glusterd_handle_cli_uuid_get, NULL,
+                              GLUSTER_CLI_UUID_GET, DRC_NA, 0},
+    [GLUSTER_CLI_GET_VOLUME] = {"GET_VOLUME", glusterd_handle_cli_get_volume,
+                                NULL, GLUSTER_CLI_GET_VOLUME, DRC_NA, 0},
+    [GLUSTER_CLI_GETWD] = {"GETWD", glusterd_handle_getwd, NULL,
+                           GLUSTER_CLI_GETWD, DRC_NA, 1},
+    [GLUSTER_CLI_STATUS_VOLUME] = {"STATUS_VOLUME",
+                                   glusterd_handle_status_volume, NULL,
+                                   GLUSTER_CLI_STATUS_VOLUME, DRC_NA, 0},
+    [GLUSTER_CLI_LIST_VOLUME] = {"LIST_VOLUME", glusterd_handle_cli_list_volume,
+                                 NULL, GLUSTER_CLI_LIST_VOLUME, DRC_NA, 0},
+    [GLUSTER_CLI_MOUNT] = {"MOUNT", glusterd_handle_mount, NULL,
+                           GLUSTER_CLI_MOUNT, DRC_NA, 1},
+    [GLUSTER_CLI_UMOUNT] = {"UMOUNT", glusterd_handle_umount, NULL,
+                            GLUSTER_CLI_UMOUNT, DRC_NA, 1},
+};
+
+struct rpcsvc_program gd_svc_cli_trusted_progs = {
+    .progname = "GlusterD svc cli read-only",
+    .prognum = GLUSTER_CLI_PROGRAM, /* same program number as gd_svc_cli_prog */
+    .progver = GLUSTER_CLI_VERSION,
+    .numactors = GLUSTER_CLI_MAXVALUE, /* table is sized like the full one */
+    .actors = gd_svc_cli_trusted_actors, /* includes MOUNT/UMOUNT too */
+    .synctask = _gf_true,
+};
+
+/* The tier handlers cannot be removed, so they have been moved into this
+ * file now that gluster-tier.c and the other tier.c files no longer
+ * exist.
+ */
+
+int
+glusterd_handle_tier(rpcsvc_request_t *req)
+{
+    return 0; /* stub: req is ignored and 0 is returned unconditionally */
+}
diff --git a/xlators/mgmt/glusterd/src/glusterd-handshake.c b/xlators/mgmt/glusterd/src/glusterd-handshake.c
new file mode 100644
index 00000000000..d96e35503dd
--- /dev/null
+++ b/xlators/mgmt/glusterd/src/glusterd-handshake.c
@@ -0,0 +1,2580 @@
+/*
+   Copyright (c) 2010-2012 Red Hat, Inc. <http://www.redhat.com>
+   This file is part of GlusterFS.
+
+   This file is licensed to you under your choice of the GNU Lesser
+   General Public License, version 3 or any later version (LGPLv3 or
+   later), or the GNU General Public License, version 2 (GPLv2), in all
+   cases as published by the Free Software Foundation.
+*/
+
+#include <glusterfs/xlator.h>
+#include <glusterfs/defaults.h>
+#include <glusterfs/glusterfs.h>
+#include <glusterfs/syscall.h>
+#include <glusterfs/compat-errno.h>
+
+#include "glusterd.h"
+#include "glusterd-utils.h"
+#include "glusterd-op-sm.h"
+#include "glusterd-store.h"
+#include "glusterd-snapshot-utils.h"
+#include "glusterd-svc-mgmt.h"
+#include "glusterd-snapd-svc-helper.h"
+#include "glusterd-volgen.h"
+#include "glusterd-quotad-svc.h"
+#include "glusterd-messages.h"
+#include "glusterfs3.h"
+#include "protocol-common.h"
+#include "rpcsvc.h"
+#include "rpc-common-xdr.h"
+#include "glusterd-gfproxyd-svc-helper.h"
+#include "glusterd-shd-svc-helper.h"
+
+extern struct rpc_clnt_program gd_peer_prog; /* definitions not visible here */
+extern struct rpc_clnt_program gd_mgmt_prog;
+extern struct rpc_clnt_program gd_mgmt_v3_prog;
+
+#define TRUSTED_PREFIX "trusted-"
+#define GD_PEER_ID_KEY "peer-id"
+/* Callback type: takes the output iovec and opaque data, returns ssize_t. */
+typedef ssize_t (*gfs_serialize_t)(struct iovec outmsg, void *data);
+
+/* Resolve a "/snaps/<snapname>/<volname>[/<volfile>]" path to the snapshot
+ * volume's volinfo and a newly allocated name in *volname (caller frees).
+ * Returns 0 on success; non-zero on failure, leaving *volname NULL. */
+static int
+get_snap_volname_and_volinfo(const char *volpath, char **volname,
+                             glusterd_volinfo_t **volinfo)
+{
+    int ret = -1;
+    char *save_ptr = NULL;
+    char *str_token = NULL;
+    char *snapname = NULL;
+    char *volname_token = NULL;
+    char *vol = NULL;
+    glusterd_snap_t *snap = NULL;
+    xlator_t *this = NULL;
+    char *volfile_token = NULL;
+
+    this = THIS;
+    GF_ASSERT(this);
+    GF_ASSERT(volpath);
+    GF_ASSERT(volinfo);
+
+    /* strtok_r() modifies its input, so tokenize a private copy. */
+    str_token = gf_strdup(volpath);
+    if (!str_token)
+        goto out;
+
+    /* Input volname will have below formats:
+     * /snaps/<snapname>/<volname>.<hostname>
+     * or
+     * /snaps/<snapname>/<parent-volname>
+     * We need to extract snapname and parent_volname */
+
+    /*split string by "/" */
+    strtok_r(str_token, "/", &save_ptr);
+    snapname = strtok_r(NULL, "/", &save_ptr);
+    if (!snapname) {
+        gf_msg(this->name, GF_LOG_ERROR, EINVAL, GD_MSG_INVALID_ENTRY,
+               "Invalid path: %s", volpath);
+        goto out;
+    }
+
+    volname_token = strtok_r(NULL, "/", &save_ptr);
+    if (!volname_token) {
+        gf_msg(this->name, GF_LOG_ERROR, EINVAL, GD_MSG_INVALID_ENTRY,
+               "Invalid path: %s", volpath);
+        goto out;
+    }
+
+    snap = glusterd_find_snap_by_name(snapname);
+    if (!snap) {
+        gf_msg(this->name, GF_LOG_ERROR, EINVAL, GD_MSG_SNAP_NOT_FOUND,
+               "Failed to fetch snap %s", snapname);
+        goto out;
+    }
+
+    /* Find if its a parent volume name or snap volume
+     * name. This function will succeed if volname_token
+     * is a parent volname
+     */
+    ret = glusterd_volinfo_find(volname_token, volinfo);
+    if (ret) {
+        gf_msg(this->name, GF_LOG_WARNING, 0, GD_MSG_VOLINFO_GET_FAIL,
+               "failed to get the volinfo for the volume %s", volname_token);
+
+        /* Get the actual volfile name. */
+        volfile_token = strtok_r(NULL, "/", &save_ptr);
+        if (!volfile_token) {
+            /* No volfile component: don't hand NULL to gf_strdup(). */
+            gf_msg(this->name, GF_LOG_ERROR, EINVAL, GD_MSG_INVALID_ENTRY,
+                   "Invalid path: %s", volpath);
+            ret = -1;
+            goto out;
+        }
+
+        *volname = gf_strdup(volfile_token);
+        if (NULL == *volname) {
+            gf_smsg(this->name, GF_LOG_ERROR, errno, GD_MSG_STRDUP_FAILED,
+                    "Volname=%s", volfile_token, NULL);
+            ret = -1;
+            goto out;
+        }
+
+        /*
+         * Ideally, this should succeed as volname_token now contains
+         * the name of the snap volume (i.e. name of the volume that
+         * represents the snapshot). But, if for some reason, volinfo
+         * for the snap volume is not found, then try to get from the
+         * name of the volfile. Name of the volfile is like this.
+         * <snap volume name>.<hostname>.<brick path>.vol
+         */
+        ret = glusterd_snap_volinfo_find(volname_token, snap, volinfo);
+        if (ret) {
+            /* Split the volume name */
+            vol = strtok_r(volfile_token, ".", &save_ptr);
+            if (!vol) {
+                gf_msg(this->name, GF_LOG_ERROR, EINVAL, GD_MSG_INVALID_ENTRY,
+                       "Invalid volname (%s)", volfile_token);
+                goto out;
+            }
+
+            ret = glusterd_snap_volinfo_find(vol, snap, volinfo);
+            if (ret) {
+                gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_SNAP_INFO_FAIL,
+                       "Failed to fetch snap volume from volname (%s)", vol);
+                goto out;
+            }
+        }
+    } else {
+        /*volname_token is parent volname*/
+        ret = glusterd_snap_volinfo_find_from_parent_volname(volname_token,
+                                                             snap, volinfo);
+        if (ret) {
+            gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_SNAP_INFO_FAIL,
+                   "Failed to fetch snap volume from parent volname (%s)",
+                   volname_token);
+            goto out;
+        }
+
+        /* Since volname_token is a parent volname we should
+         * get the snap volname here*/
+        *volname = gf_strdup((*volinfo)->volname);
+        if (NULL == *volname) {
+            ret = -1;
+            goto out;
+        }
+    }
+
+out:
+    if (ret && NULL != *volname) {
+        GF_FREE(*volname);
+        *volname = NULL;
+    }
+
+    if (str_token)
+        GF_FREE(str_token);
+    return ret;
+}
+
+/* Write "<dir>/<filename>" into path (path_len bytes), where <dir> is what
+ * GLUSTERD_GET_VOLUME_DIR yields for volinfo. Returns 0 on success, -1 if
+ * THIS or its private data is missing or the result does not fit in path. */
+int32_t
+glusterd_get_client_per_brick_volfile(glusterd_volinfo_t *volinfo,
+                                      char *filename, char *path, int path_len)
+{
+    char workdir[PATH_MAX] = {0};
+    glusterd_conf_t *priv = NULL;
+    int32_t ret = -1;
+    int len = 0;
+
+    GF_VALIDATE_OR_GOTO("glusterd", THIS, out);
+    priv = THIS->private;
+    GF_VALIDATE_OR_GOTO(THIS->name, priv, out);
+    GLUSTERD_GET_VOLUME_DIR(workdir, volinfo, priv);
+
+    len = snprintf(path, path_len, "%s/%s", workdir, filename);
+    if ((len >= 0) && (len < path_len)) /* formatted without truncation */
+        ret = 0;
+out:
+    return ret;
+}
+/* Resolve volume_id to a volfile path written into path; 0 means success. */
+size_t
+build_volfile_path(char *volume_id, char *path, size_t path_len,
+                   char *trusted_str, dict_t *dict)
+{
+    struct stat stbuf = {
+        0,
+    };
+    int32_t ret = -1;
+    char *vol = NULL;
+    char *dup_volname = NULL;
+    char *save_ptr = NULL;
+    char *free_ptr = NULL;
+    char *volname = NULL;
+    char *volid_ptr = NULL;
+    char dup_volid[PATH_MAX] = {
+        0,
+    };
+    char path_prefix[PATH_MAX] = {
+        0,
+    };
+    xlator_t *this = NULL;
+    glusterd_volinfo_t *volinfo = NULL;
+    glusterd_conf_t *priv = NULL;
+    int32_t len = 0;
+    /* NOTE(review): ret is int32_t but the return type is size_t: -1 wraps. */
+    this = THIS;
+    GF_ASSERT(this);
+    priv = this->private;
+    GF_ASSERT(priv);
+    GF_ASSERT(volume_id);
+    GF_ASSERT(path);
+    /* "snapd/<volname>": use glusterd_svc_build_snapd_volfile(). */
+    volid_ptr = strstr(volume_id, "snapd/");
+    if (volid_ptr) {
+        volid_ptr = strchr(volid_ptr, '/');
+        if (!volid_ptr) {
+            gf_smsg(this->name, GF_LOG_ERROR, errno, GD_MSG_STRCHR_FAIL, NULL);
+            ret = -1;
+            goto out;
+        }
+        volid_ptr++;
+
+        ret = glusterd_volinfo_find(volid_ptr, &volinfo);
+        if (ret == -1) {
+            gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_VOLINFO_GET_FAIL,
+                   "Couldn't find volinfo");
+            goto out;
+        }
+        glusterd_svc_build_snapd_volfile(volinfo, path, path_len);
+        ret = 0;
+        goto out;
+    }
+    /* "gluster/<name>": use glusterd_svc_build_volfile_path() on <name>. */
+    volid_ptr = strstr(volume_id, "gluster/");
+    if (volid_ptr) {
+        volid_ptr = strchr(volid_ptr, '/');
+        if (!volid_ptr) {
+            gf_smsg(this->name, GF_LOG_ERROR, errno, GD_MSG_STRCHR_FAIL, NULL);
+            ret = -1;
+            goto out;
+        }
+        volid_ptr++;
+
+        glusterd_svc_build_volfile_path(volid_ptr, priv->workdir, path,
+                                        path_len);
+        ret = 0;
+        goto out;
+    }
+    /* "gfproxy-client/<volname>": use glusterd_get_gfproxy_client_volfile(). */
+    volid_ptr = strstr(volume_id, "gfproxy-client/");
+    if (volid_ptr) {
+        volid_ptr = strchr(volid_ptr, '/');
+        if (!volid_ptr) {
+            gf_smsg(this->name, GF_LOG_ERROR, errno, GD_MSG_STRCHR_FAIL, NULL);
+            ret = -1;
+            goto out;
+        }
+        volid_ptr++;
+
+        ret = glusterd_volinfo_find(volid_ptr, &volinfo);
+        if (ret == -1) {
+            gf_log(this->name, GF_LOG_ERROR, "Couldn't find volinfo");
+            goto out;
+        }
+
+        glusterd_get_gfproxy_client_volfile(volinfo, path, path_len);
+
+        ret = 0;
+        goto out;
+    }
+    /* "gfproxyd/<volname>": use glusterd_svc_build_gfproxyd_volfile_path(). */
+    volid_ptr = strstr(volume_id, "gfproxyd/");
+    if (volid_ptr) {
+        volid_ptr = strchr(volid_ptr, '/');
+        if (!volid_ptr) {
+            gf_smsg(this->name, GF_LOG_ERROR, errno, GD_MSG_STRCHR_FAIL, NULL);
+            ret = -1;
+            goto out;
+        }
+        volid_ptr++;
+
+        ret = glusterd_volinfo_find(volid_ptr, &volinfo);
+        if (ret == -1) {
+            gf_log(this->name, GF_LOG_ERROR, "Couldn't find volinfo");
+            goto out;
+        }
+
+        glusterd_svc_build_gfproxyd_volfile_path(volinfo, path, path_len);
+        ret = 0;
+        goto out;
+    }
+    /* "shd/<volname>": shd volfile path, plus the shd pidfile set in dict. */
+    volid_ptr = strstr(volume_id, "shd/");
+    if (volid_ptr) {
+        volid_ptr = strchr(volid_ptr, '/');
+        if (!volid_ptr) {
+            gf_smsg(this->name, GF_LOG_ERROR, errno, GD_MSG_STRCHR_FAIL, NULL);
+            ret = -1;
+            goto out;
+        }
+        volid_ptr++;
+
+        ret = glusterd_volinfo_find(volid_ptr, &volinfo);
+        if (ret == -1) {
+            gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_VOLINFO_GET_FAIL,
+                   "Couldn't find volinfo for volid=%s", volid_ptr);
+            goto out;
+        }
+
+        glusterd_svc_build_shd_volfile_path(volinfo, path, path_len);
+
+        ret = glusterd_svc_set_shd_pidfile(volinfo, dict);
+        if (ret == -1) {
+            gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_DICT_SET_FAILED,
+                   "Couldn't set pidfile in dict for volid=%s", volid_ptr);
+            goto out;
+        }
+        ret = 0;
+        goto out;
+    }
+    /* "/snaps/...": snapshot volume; continues at gotvolinfo, not at out. */
+    volid_ptr = strstr(volume_id, "/snaps/");
+    if (volid_ptr) {
+        ret = get_snap_volname_and_volinfo(volid_ptr, &volname, &volinfo);
+        if (ret) {
+            gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_SNAP_INFO_FAIL,
+                   "Failed to get snap"
+                   " volinfo from path (%s)",
+                   volume_id);
+            ret = -1;
+            goto out;
+        }
+
+        len = snprintf(path_prefix, sizeof(path_prefix), "%s/snaps/%s",
+                       priv->workdir, volinfo->snapshot->snapname);
+        volid_ptr = volname;
+        /* this is to ensure that volname recvd from
+           get_snap_volname_and_volinfo is free'd */
+        free_ptr = volname;
+        if ((len < 0) || (len >= sizeof(path_prefix))) {
+            ret = -1;
+            goto out;
+        }
+
+        goto gotvolinfo;
+    }
+    /* "rebalance/<volname>": use glusterd_get_rebalance_volfile(). */
+    volid_ptr = strstr(volume_id, "rebalance/");
+    if (volid_ptr) {
+        volid_ptr = strchr(volid_ptr, '/');
+        if (!volid_ptr) {
+            gf_smsg(this->name, GF_LOG_ERROR, errno, GD_MSG_STRCHR_FAIL, NULL);
+            ret = -1;
+            goto out;
+        }
+        volid_ptr++;
+
+        ret = glusterd_volinfo_find(volid_ptr, &volinfo);
+        if (ret == -1) {
+            gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_VOLINFO_GET_FAIL,
+                   "Couldn't find volinfo");
+            goto out;
+        }
+        glusterd_get_rebalance_volfile(volinfo, path, path_len);
+        ret = 0;
+        goto out;
+    }
+    /* "client_per_brick/<vol>.<...>": returns whether that volfile exists. */
+    volid_ptr = strstr(volume_id, "client_per_brick/");
+    if (volid_ptr) {
+        volid_ptr = strchr(volid_ptr, '/');
+        if (!volid_ptr) {
+            gf_smsg(this->name, GF_LOG_ERROR, errno, GD_MSG_STRCHR_FAIL, NULL);
+            ret = -1;
+            goto out;
+        }
+        volid_ptr++;
+
+        dup_volname = gf_strdup(volid_ptr);
+        if (!dup_volname) {
+            gf_msg(this->name, GF_LOG_ERROR, ENOMEM, GD_MSG_NO_MEMORY,
+                   "strdup failed");
+            ret = -1;
+            goto out;
+        }
+
+        /* Split the volume name */
+        vol = strtok_r(dup_volname, ".", &save_ptr);
+        if (!vol) {
+            gf_smsg(this->name, GF_LOG_ERROR, errno, GD_MSG_SPLIT_FAIL,
+                    "Volume name=%s", dup_volname, NULL);
+            ret = -1;
+            goto out;
+        }
+        ret = glusterd_volinfo_find(vol, &volinfo);
+        if (ret == -1) {
+            gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_VOLINFO_GET_FAIL,
+                   "Couldn't find volinfo");
+            goto out;
+        }
+        ret = glusterd_get_client_per_brick_volfile(volinfo, volid_ptr, path,
+                                                    path_len);
+        if (ret < 0) {
+            gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_NO_MEMORY,
+                   "failed to get volinfo path");
+            goto out;
+        }
+
+        ret = sys_access(path, F_OK);
+        goto out;
+    }
+    /* No known prefix: plain volume name, with or without a leading '/'. */
+    if (volume_id[0] == '/') {
+        /* Normal behavior */
+        volid_ptr = volume_id;
+        volid_ptr++;
+
+    } else {
+        /* Bringing in NFS like behavior for mount command, */
+        /* With this, one can mount a volume with below cmd */
+        /* bash# mount -t glusterfs server:/volume /mnt/pnt */
+        volid_ptr = volume_id;
+    }
+
+    len = snprintf(path_prefix, sizeof(path_prefix), "%s/vols", priv->workdir);
+    if ((len < 0) || (len >= sizeof(path_prefix))) {
+        ret = -1;
+        goto out;
+    }
+
+    ret = glusterd_volinfo_find(volid_ptr, &volinfo);
+    /* Not a volume name as-is: retry with the part before the first '.'. */
+    if (ret) {
+        dup_volname = gf_strdup(volid_ptr);
+        if (!dup_volname) {
+            gf_smsg(this->name, GF_LOG_ERROR, errno, GD_MSG_STRDUP_FAILED,
+                    "Volume name=%s", volid_ptr, NULL);
+            ret = -1;
+            goto out;
+        }
+        /* Split the volume name */
+        vol = strtok_r(dup_volname, ".", &save_ptr);
+        if (!vol) {
+            gf_smsg(this->name, GF_LOG_ERROR, errno, GD_MSG_SPLIT_FAIL,
+                    "Volume name=%s", dup_volname, NULL);
+            ret = -1;
+            goto out;
+        }
+        ret = glusterd_volinfo_find(vol, &volinfo);
+        if (ret) {
+            gf_smsg(this->name, GF_LOG_ERROR, errno, GD_MSG_VOLINFO_GET_FAIL,
+                    NULL);
+            goto out;
+        }
+    }
+
+gotvolinfo:
+    if (!glusterd_auth_get_username(volinfo))
+        trusted_str = NULL; /* no auth username: drop the prefix */
+
+    ret = snprintf(path, path_len, "%s/%s/%s.vol", path_prefix,
+                   volinfo->volname, volid_ptr);
+    if (ret == -1) {
+        gf_smsg(this->name, GF_LOG_ERROR, errno, GD_MSG_COPY_FAIL, NULL);
+        goto out;
+    }
+    /* NOTE(review): a truncated path (ret >= path_len) is not caught above. */
+    ret = sys_stat(path, &stbuf);
+    /* Exact name missing: try "<trusted_str><volid>[.tcp|.rdma]-fuse.vol". */
+    if ((ret == -1) && (errno == ENOENT)) {
+        if (snprintf(dup_volid, PATH_MAX, "%s", volid_ptr) >= PATH_MAX)
+            goto out;
+        if (!strchr(dup_volid, '.')) {
+            switch (volinfo->transport_type) {
+                case GF_TRANSPORT_TCP:
+                    strcat(dup_volid, ".tcp");
+                    break;
+                case GF_TRANSPORT_RDMA:
+                    strcat(dup_volid, ".rdma");
+                    break;
+                case GF_TRANSPORT_BOTH_TCP_RDMA:
+                    strcat(dup_volid, ".tcp");
+                    break;
+                default:
+                    break;
+            }
+        }
+        snprintf(path, path_len, "%s/%s/%s%s-fuse.vol", path_prefix,
+                 volinfo->volname, (trusted_str ? trusted_str : ""), dup_volid);
+        ret = sys_stat(path, &stbuf);
+    }
+out:
+    if (dup_volname)
+        GF_FREE(dup_volname);
+    if (free_ptr)
+        GF_FREE(free_ptr);
+    return ret;
+}
+
+/* Read the op-version range advertised by a getspec client, and the optional
+ * brick name, out of the serialized xdata of the request.
+ *
+ * Clients of versions <= 3.3 don't send op-versions; the local copies start
+ * at 1 and are written into @peerinfo on every exit path.
+ * When the "brick_name" key is present, *brick_name receives a gf_strdup()ed
+ * copy of it; otherwise *brick_name is left untouched.
+ *
+ * Returns 0 on success (this includes "no xdata" and "no brick name"),
+ * non-zero on failure.
+ */
+int32_t
+glusterd_get_args_from_dict(gf_getspec_req *args, peer_info_t *peerinfo,
+                            char **brick_name)
+{
+    xlator_t *this = THIS;
+    dict_t *xdata = NULL;
+    char *requested_brick = NULL;
+    int min_version = 1;
+    int max_version = 1;
+    int32_t ret = -1;
+
+    GF_ASSERT(this);
+    GF_ASSERT(args);
+    GF_ASSERT(peerinfo);
+
+    /* Nothing to parse: keep the default op-versions and report success */
+    if (args->xdata.xdata_len == 0) {
+        gf_smsg(this->name, GF_LOG_ERROR, errno, GD_MSG_INVALID_ARGUMENT, NULL);
+        ret = 0;
+        goto out;
+    }
+
+    xdata = dict_new();
+    if (xdata == NULL) {
+        gf_smsg(this->name, GF_LOG_ERROR, errno, GD_MSG_DICT_CREATE_FAIL, NULL);
+        ret = -1;
+        goto out;
+    }
+
+    ret = dict_unserialize(args->xdata.xdata_val, args->xdata.xdata_len,
+                           &xdata);
+    if (ret != 0) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_DICT_UNSERIALIZE_FAIL,
+               "Failed to unserialize request dictionary");
+        goto out;
+    }
+
+    ret = dict_get_int32(xdata, "min-op-version", &min_version);
+    if (ret != 0) {
+        gf_msg("glusterd", GF_LOG_ERROR, 0, GD_MSG_DICT_GET_FAILED,
+               "Failed to get client-min-op-version");
+        goto out;
+    }
+
+    ret = dict_get_int32(xdata, "max-op-version", &max_version);
+    if (ret != 0) {
+        gf_msg("glusterd", GF_LOG_ERROR, 0, GD_MSG_DICT_GET_FAILED,
+               "Failed to get client-max-op-version");
+        goto out;
+    }
+
+    /* The brick name is optional; its absence is not an error */
+    ret = dict_get_str(xdata, "brick_name", &requested_brick);
+    if (ret != 0) {
+        gf_msg_debug(this->name, 0, "No brick name present");
+        ret = 0;
+        goto out;
+    }
+
+    *brick_name = gf_strdup(requested_brick);
+    if (*brick_name != NULL) {
+        gf_msg_debug(this->name, 0, "brick_name = %s", *brick_name);
+    } else {
+        gf_smsg(this->name, GF_LOG_ERROR, errno, GD_MSG_STRDUP_FAILED,
+                "Brick_name=%s", requested_brick, NULL);
+        ret = -1;
+    }
+
+out:
+    /* Record whatever was learnt (or the defaults) on the peer */
+    peerinfo->min_op_version = min_version;
+    peerinfo->max_op_version = max_version;
+
+    if (xdata != NULL)
+        dict_unref(xdata);
+
+    return ret;
+}
+
+/* Given the missed_snapinfo and snap_opinfo take the
+ * missed lvm snapshot.
+ *
+ * The snap is looked up by missed_snapinfo->snap_uuid, the snap volume by
+ * snap_opinfo->snap_vol_id and the brick by its 1-based position
+ * snap_opinfo->brick_num in the snap volume's brick list. The brick must
+ * still be pending (snap_status == -1). The snapshot is then taken, the
+ * snap brick is created, mounted and started, and the snap volinfo is stored.
+ *
+ * Returns 0 on success, non-zero on failure.
+ */
+int32_t
+glusterd_create_missed_snap(glusterd_missed_snap_info *missed_snapinfo,
+                            glusterd_snap_op_t *snap_opinfo)
+{
+    char *device = NULL;
+    glusterd_conf_t *priv = NULL;
+    glusterd_snap_t *snap = NULL;
+    glusterd_volinfo_t *snap_vol = NULL;
+    glusterd_volinfo_t *volinfo = NULL;
+    glusterd_brickinfo_t *brickinfo = NULL;
+    glusterd_brickinfo_t *brick_iter = NULL;
+    int32_t ret = -1;
+    int32_t i = 0;
+    uuid_t snap_uuid = {
+        0,
+    };
+    xlator_t *this = NULL;
+    char *mnt_device = NULL;
+
+    this = THIS;
+    GF_ASSERT(this);
+    priv = this->private;
+    GF_ASSERT(priv);
+    GF_ASSERT(missed_snapinfo);
+    GF_ASSERT(snap_opinfo);
+
+    gf_uuid_parse(missed_snapinfo->snap_uuid, snap_uuid);
+
+    /* Find the snap-object */
+    snap = glusterd_find_snap_by_id(snap_uuid);
+    if (!snap) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_SNAP_NOT_FOUND,
+               "Unable to find the snap with snap_uuid %s",
+               missed_snapinfo->snap_uuid);
+        ret = -1;
+        goto out;
+    }
+
+    /* Find the snap_vol */
+    cds_list_for_each_entry(volinfo, &snap->volumes, vol_list)
+    {
+        if (!strcmp(volinfo->volname, snap_opinfo->snap_vol_id)) {
+            snap_vol = volinfo;
+            break;
+        }
+    }
+
+    if (!snap_vol) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_VOL_NOT_FOUND,
+               "Unable to find the snap_vol(%s) "
+               "for snap(%s)",
+               snap_opinfo->snap_vol_id, snap->snapname);
+        ret = -1;
+        goto out;
+    }
+
+    /* Find the missed brick in the snap volume. A separate cursor is used
+     * because, when the loop runs to completion without a match, the cursor
+     * no longer refers to a real brick and must not be dereferenced.
+     */
+    cds_list_for_each_entry(brick_iter, &snap_vol->bricks, brick_list)
+    {
+        i++;
+        if (i == snap_opinfo->brick_num) {
+            brickinfo = brick_iter;
+            break;
+        }
+    }
+
+    if (!brickinfo) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_BRICK_GET_INFO_FAIL,
+               "Unable to find brick number %d in the snap_vol(%s) "
+               "for snap(%s)",
+               (int)snap_opinfo->brick_num, snap_vol->volname,
+               snap->snapname);
+        ret = -1;
+        goto out;
+    }
+
+    if (brickinfo->snap_status != -1) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_SNAP_STATUS_NOT_PENDING,
+               "The snap status of the missed "
+               "brick(%s) is not pending",
+               brickinfo->path);
+        ret = -1;
+        goto out;
+    }
+
+    /* Fetch the device path */
+    mnt_device = glusterd_get_brick_mount_device(snap_opinfo->brick_path);
+    if (!mnt_device) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_BRICK_GET_INFO_FAIL,
+               "Getting device name for the "
+               "brick %s:%s failed",
+               brickinfo->hostname, snap_opinfo->brick_path);
+        ret = -1;
+        goto out;
+    }
+
+    device = glusterd_build_snap_device_path(mnt_device, snap_vol->volname,
+                                             snap_opinfo->brick_num - 1);
+    if (!device) {
+        gf_msg(this->name, GF_LOG_ERROR, ENXIO,
+               GD_MSG_SNAP_DEVICE_NAME_GET_FAIL,
+               "cannot copy the snapshot "
+               "device name (volname: %s, snapname: %s)",
+               snap_vol->volname, snap->snapname);
+        ret = -1;
+        goto out;
+    }
+    if (snprintf(brickinfo->device_path, sizeof(brickinfo->device_path), "%s",
+                 device) >= sizeof(brickinfo->device_path)) {
+        gf_msg(this->name, GF_LOG_ERROR, ENXIO,
+               GD_MSG_SNAP_DEVICE_NAME_GET_FAIL,
+               "cannot copy the device_path "
+               "(device_path: %s)",
+               brickinfo->device_path);
+        ret = -1;
+        goto out;
+    }
+
+    /* Update the backend file-system type of snap brick in
+     * snap volinfo. */
+    ret = glusterd_update_mntopts(snap_opinfo->brick_path, brickinfo);
+    if (ret) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_BRK_MOUNTOPTS_FAIL,
+               "Failed to update "
+               "mount options for %s brick",
+               brickinfo->path);
+        /* We should not fail snapshot operation if we fail to get
+         * the file-system type */
+    }
+
+    ret = glusterd_take_lvm_snapshot(brickinfo, snap_opinfo->brick_path);
+    if (ret) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_SNAPSHOT_OP_FAILED,
+               "Failed to take snapshot of %s", snap_opinfo->brick_path);
+        goto out;
+    }
+
+    /* After the snapshot both the origin brick (LVM brick) and
+     * the snapshot brick will have the same file-system label. This
+     * will cause lot of problems at mount time. Therefore we must
+     * generate a new label for the snapshot brick
+     */
+    ret = glusterd_update_fs_label(brickinfo);
+    if (ret) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_BRICK_SET_INFO_FAIL,
+               "Failed to update "
+               "file-system label for %s brick",
+               brickinfo->path);
+        /* Failing to update label should not cause snapshot failure.
+         * Currently label is updated only for XFS and ext2/ext3/ext4
+         * file-system.
+         */
+    }
+
+    /* Create and mount the snap brick */
+    ret = glusterd_snap_brick_create(snap_vol, brickinfo,
+                                     snap_opinfo->brick_num - 1, 0);
+    if (ret) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_BRICK_CREATION_FAIL,
+               "Failed to "
+               " create and mount the brick(%s) for the snap %s",
+               snap_opinfo->brick_path, snap_vol->snapshot->snapname);
+        goto out;
+    }
+
+    /* The brick is no longer pending once it has been created */
+    brickinfo->snap_status = 0;
+    ret = glusterd_brick_start(snap_vol, brickinfo, _gf_false, _gf_false);
+    if (ret) {
+        gf_msg(this->name, GF_LOG_WARNING, 0, GD_MSG_BRICK_DISCONNECTED,
+               "starting the "
+               "brick %s:%s for the snap %s failed",
+               brickinfo->hostname, brickinfo->path, snap->snapname);
+        goto out;
+    }
+    ret = glusterd_store_volinfo(snap_vol, GLUSTERD_VOLINFO_VER_AC_NONE);
+    if (ret) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_VOLINFO_STORE_FAIL,
+               "Failed to store snapshot "
+               "volinfo (%s) for snap %s",
+               snap_vol->volname, snap->snapname);
+        goto out;
+    }
+
+out:
+    if (mnt_device)
+        GF_FREE(mnt_device);
+    if (device)
+        GF_FREE(device);
+
+    return ret;
+}
+
+/* Look into missed_snap_list, to see if the given brick_name
+ * has any missed snap creates for the local node, and take them.
+ *
+ * Every pending create that matches is attempted and then marked as done,
+ * whether or not the attempt succeeded. The missed snaps list is stored
+ * again only when at least one entry changed.
+ *
+ * Returns 0 on success, or the result of the store when storing fails.
+ */
+int32_t
+glusterd_take_missing_brick_snapshots(char *brick_name)
+{
+    xlator_t *this = THIS;
+    glusterd_conf_t *priv = NULL;
+    glusterd_missed_snap_info *missed = NULL;
+    glusterd_snap_op_t *op = NULL;
+    char *local_uuid = NULL;
+    gf_boolean_t need_store = _gf_false;
+    int32_t ret = -1;
+
+    GF_ASSERT(this);
+    priv = this->private;
+    GF_ASSERT(priv);
+    GF_ASSERT(brick_name);
+
+    local_uuid = uuid_utoa(MY_UUID);
+
+    cds_list_for_each_entry(missed, &priv->missed_snaps_list, missed_snaps)
+    {
+        /* Skip missed snap ops that belong to other nodes */
+        if (strcmp(local_uuid, missed->node_uuid) != 0)
+            continue;
+
+        cds_list_for_each_entry(op, &missed->snap_ops, snap_ops_list)
+        {
+            /* Only a create for the brick in question is of interest */
+            if (op->op != GF_SNAP_OPTION_TYPE_CREATE)
+                continue;
+            if (strcmp(brick_name, op->brick_path) != 0)
+                continue;
+
+            /* Perform the snap create if the op is still pending */
+            if (op->status == GD_MISSED_SNAP_PENDING) {
+                ret = glusterd_create_missed_snap(missed, op);
+                if (ret != 0) {
+                    /* The entry is marked as done even on failure:
+                     * neither other snapshots nor the brick are
+                     * affected by it, only the current snap brick
+                     * will always remain as pending.
+                     */
+                    gf_msg(this->name, GF_LOG_ERROR, 0,
+                           GD_MSG_MISSED_SNAP_CREATE_FAIL,
+                           "Failed to create "
+                           "missed snap for %s",
+                           brick_name);
+                }
+                op->status = GD_MISSED_SNAP_DONE;
+                need_store = _gf_true;
+            }
+
+            /* One snap-id won't have more than one missed create for
+             * the same brick path, so stop searching this snap's ops.
+             */
+            break;
+        }
+    }
+
+    if (need_store) {
+        ret = glusterd_store_update_missed_snaps();
+        if (ret != 0) {
+            gf_msg(this->name, GF_LOG_ERROR, 0,
+                   GD_MSG_MISSED_SNAP_LIST_STORE_FAIL,
+                   "Failed to update missed_snaps_list");
+            return ret;
+        }
+    }
+
+    return 0;
+}
+
+/* Tells whether the client can understand all the options in the volfile
+ * of the volume it asked for.
+ *
+ * Returns _gf_true when peerinfo->volname is not a known volume, or when the
+ * volume's client_op_version lies inside the client's [min, max] op-version
+ * range. Otherwise sets *op_errno to ENOTSUP and returns _gf_false.
+ */
+static gf_boolean_t
+_client_supports_volume(peer_info_t *peerinfo, int32_t *op_errno)
+{
+    glusterd_volinfo_t *vol = NULL;
+
+    GF_ASSERT(peerinfo);
+    GF_ASSERT(op_errno);
+
+    /* Not finding a volinfo implies that the volfile requested for is not
+     * of a gluster volume. A non volume volfile is requested by the local
+     * gluster services like shd and nfs-server. These need not be checked
+     * as they will be running at the same op-version as glusterd and will
+     * be able to support all the features
+     */
+    if (glusterd_volinfo_find(peerinfo->volname, &vol) != 0)
+        return _gf_true;
+
+    if ((peerinfo->min_op_version <= vol->client_op_version) &&
+        (peerinfo->max_op_version >= vol->client_op_version))
+        return _gf_true;
+
+    *op_errno = ENOTSUP;
+    gf_msg("glusterd", GF_LOG_INFO, ENOTSUP, GD_MSG_UNSUPPORTED_VERSION,
+           "Client %s (%d -> %d) doesn't support required "
+           "op-version (%d). Rejecting volfile request.",
+           peerinfo->identifier, peerinfo->min_op_version,
+           peerinfo->max_op_version, vol->client_op_version);
+
+    return _gf_false;
+}
+
+/* Body of the getspec (volfile fetch) request handler; server_getspec()
+ * runs it through glusterd_big_locked_handler().
+ *
+ * Decodes the request, records the requested volume name and the client's
+ * op-versions on the transport's peerinfo, rejects clients that do not
+ * support the volume, resolves the volfile path, reads the volfile into the
+ * reply and, when the request names a brick, takes that brick's missed
+ * snapshots. The list of connected peers is sent back in the reply's xdata.
+ *
+ * A reply is always submitted; rsp.op_ret carries the final value of `ret`
+ * (negative on failure). The function itself always returns 0.
+ */
+int
+__server_getspec(rpcsvc_request_t *req)
+{
+    int32_t ret = -1;
+    int32_t op_ret = -1;
+    int32_t op_errno = 0;
+    int32_t spec_fd = -1;
+    size_t file_len = 0;
+    char filename[PATH_MAX] = {
+        0,
+    };
+    struct stat stbuf = {
+        0,
+    };
+    char *brick_name = NULL;
+    char *volume = NULL;
+    char *tmp = NULL;
+    rpc_transport_t *trans = NULL;
+    gf_getspec_req args = {
+        0,
+    };
+    gf_getspec_rsp rsp = {
+        0,
+    };
+    char addrstr[RPCSVC_PEER_STRLEN] = {0};
+    peer_info_t *peerinfo = NULL;
+    xlator_t *this = NULL;
+    dict_t *dict = NULL;
+    glusterd_peerinfo_t *peer = NULL;
+    glusterd_conf_t *conf = NULL;
+    int peer_cnt = 0;
+    char *peer_hosts = NULL;
+    char *tmp_str = NULL;
+    char portstr[10] = {
+        0,
+    };
+    int len = 0;
+    gf_boolean_t alloc_failed = _gf_false;
+
+    this = THIS;
+    GF_ASSERT(this);
+
+    conf = this->private;
+    ret = xdr_to_generic(req->msg[0], &args, (xdrproc_t)xdr_gf_getspec_req);
+    if (ret < 0) {
+        // failed to decode msg;
+        req->rpc_err = GARBAGE_ARGS;
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_REQ_DECODE_FAIL,
+               "Failed to decode the message");
+        goto fail;
+    }
+
+    peerinfo = &req->trans->peerinfo;
+
+    volume = args.key;
+
+    if (strlen(volume) >= (NAME_MAX)) {
+        /* ret still holds the non-negative decode result here; it must be
+         * turned into a failure or the reply would report success.
+         */
+        ret = -1;
+        op_errno = EINVAL;
+        gf_msg(this->name, GF_LOG_ERROR, EINVAL, GD_MSG_NAME_TOO_LONG,
+               "volume name too long (%s)", volume);
+        goto fail;
+    }
+
+    gf_msg(this->name, GF_LOG_INFO, 0, GD_MSG_MOUNT_REQ_RCVD,
+           "Received mount request for volume %s", volume);
+
+    /* Need to strip leading '/' from volnames. This was introduced to
+     * support nfs style mount parameters for native gluster mount
+     */
+    if (volume[0] == '/')
+        ret = snprintf(peerinfo->volname, sizeof(peerinfo->volname), "%s",
+                       &volume[1]);
+    else
+        ret = snprintf(peerinfo->volname, sizeof(peerinfo->volname), "%s",
+                       volume);
+    if (ret < 0 || ret >= sizeof(peerinfo->volname)) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_VOLINFO_GET_FAIL,
+               "peerinfo->volname %s truncated or error occurred: "
+               "(ret: %d)",
+               peerinfo->volname, ret);
+        ret = -1;
+        goto fail;
+    }
+
+    ret = glusterd_get_args_from_dict(&args, peerinfo, &brick_name);
+    if (ret) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_DICT_GET_FAILED,
+               "Failed to get args from dict");
+        goto fail;
+    }
+
+    if (!_client_supports_volume(peerinfo, &op_errno)) {
+        ret = -1;
+        goto fail;
+    }
+
+    dict = dict_new();
+    if (!dict) {
+        gf_smsg(this->name, GF_LOG_ERROR, errno, GD_MSG_DICT_CREATE_FAIL, NULL);
+        ret = -ENOMEM;
+        goto fail;
+    }
+
+    trans = req->trans;
+    /* addrstr will be empty for cli socket connections */
+    ret = rpcsvc_transport_peername(trans, (char *)&addrstr, sizeof(addrstr));
+    if (ret) {
+        gf_msg(this->name, GF_LOG_ERROR, 0,
+               GD_MSG_RPC_TRANSPORT_GET_PEERNAME_FAIL,
+               "Failed to get the peername");
+        goto fail;
+    }
+
+    /* Cut the string at the last ':' so that only the address is left */
+    tmp = strrchr(addrstr, ':');
+    if (tmp)
+        *tmp = '\0';
+
+    /* The trusted volfiles are given to the glusterd owned process like NFS
+     * server, self-heal daemon etc., so that they are not inadvertently
+     * blocked by a auth.{allow,reject} setting. The trusted volfile is not
+     * meant for external users.
+     * For unix domain socket, address will be empty.
+     */
+    if (strlen(addrstr) == 0 || gf_is_local_addr(addrstr)) {
+        ret = build_volfile_path(volume, filename, sizeof(filename),
+                                 TRUSTED_PREFIX, dict);
+    } else {
+        ret = build_volfile_path(volume, filename, sizeof(filename), NULL,
+                                 dict);
+    }
+
+    /* Build "<host>:<port> <host>:<port> ..." from the connected peers.
+     * `ret` from build_volfile_path() is deliberately left untouched here;
+     * it is examined after the peer list has been put into the dict.
+     */
+    RCU_READ_LOCK;
+    cds_list_for_each_entry_rcu(peer, &conf->peers, uuid_list)
+    {
+        if (!peer->connected)
+            continue;
+
+        /* Format the port of the current peer on every iteration, so that
+         * no peer is listed with the port of a previous one.
+         */
+        snprintf(portstr, sizeof(portstr), "%d",
+                 peer->port ? peer->port : GLUSTERD_DEFAULT_PORT);
+
+        /* +3: the ':' separator, the trailing ' ' and the terminating NUL */
+        len = (peer_hosts ? strlen(peer_hosts) : 0) + strlen(peer->hostname) +
+              strlen(portstr) + 3;
+        tmp_str = GF_CALLOC(1, len, gf_gld_mt_char);
+        if (!tmp_str) {
+            /* Leave the loop first; the error is handled once the RCU
+             * read lock has been dropped.
+             */
+            alloc_failed = _gf_true;
+            break;
+        }
+        snprintf(tmp_str, len, "%s%s%s%s%s", (peer_hosts ? peer_hosts : ""),
+                 peer->hostname, ":", portstr, " ");
+        if (peer_hosts)
+            GF_FREE(peer_hosts);
+        peer_hosts = tmp_str;
+        peer_cnt++;
+    }
+    RCU_READ_UNLOCK;
+
+    if (alloc_failed) {
+        gf_smsg(this->name, GF_LOG_ERROR, ENOMEM, GD_MSG_NO_MEMORY, NULL);
+        ret = -1;
+        op_errno = ENOMEM;
+        goto fail;
+    }
+
+    if (peer_cnt) {
+        op_ret = dict_set_str(dict, GLUSTERD_BRICK_SERVERS, peer_hosts);
+        if (op_ret) {
+            gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_DICT_SET_FAILED,
+                   "failed to set peer_host in dict");
+            ret = op_ret;
+            goto fail;
+        }
+    }
+
+    if (ret == 0) {
+        if (dict->count > 0) {
+            ret = dict_allocate_and_serialize(dict, &rsp.xdata.xdata_val,
+                                              &rsp.xdata.xdata_len);
+            if (ret) {
+                gf_smsg(this->name, GF_LOG_ERROR, errno,
+                        GD_MSG_DICT_ALLOC_AND_SERL_LENGTH_GET_FAIL, NULL);
+                goto fail;
+            }
+        }
+
+        /* to allocate the proper buffer to hold the file data */
+        ret = sys_stat(filename, &stbuf);
+        if (ret < 0) {
+            gf_msg("glusterd", GF_LOG_ERROR, errno, GD_MSG_FILE_OP_FAILED,
+                   "Unable to stat %s (%s)", filename, strerror(errno));
+            goto fail;
+        }
+
+        spec_fd = open(filename, O_RDONLY);
+        if (spec_fd < 0) {
+            gf_msg("glusterd", GF_LOG_ERROR, errno, GD_MSG_FILE_OP_FAILED,
+                   "Unable to open %s (%s)", filename, strerror(errno));
+            goto fail;
+        }
+        ret = file_len = stbuf.st_size;
+    } else {
+        /* build_volfile_path() failed */
+        gf_smsg(this->name, GF_LOG_ERROR, errno, GD_MSG_PEER_NOT_FOUND, NULL);
+        op_errno = ENOENT;
+        goto fail;
+    }
+
+    if (file_len) {
+        /* One extra zeroed byte keeps the spec NUL-terminated */
+        rsp.spec = CALLOC(file_len + 1, sizeof(char));
+        if (!rsp.spec) {
+            gf_smsg(this->name, GF_LOG_ERROR, errno, GD_MSG_NO_MEMORY, NULL);
+            ret = -1;
+            op_errno = ENOMEM;
+            goto fail;
+        }
+        ret = sys_read(spec_fd, rsp.spec, file_len);
+    }
+
+    if (brick_name) {
+        gf_msg_debug(this->name, 0, "Look for missing snap creates for %s",
+                     brick_name);
+        op_ret = glusterd_take_missing_brick_snapshots(brick_name);
+        if (op_ret) {
+            gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_MISSED_SNAP_CREATE_FAIL,
+                   "Failed to take missing brick snapshots");
+            ret = -1;
+            goto fail;
+        }
+    }
+    /* convert to XDR */
+fail:
+    if (spec_fd >= 0)
+        sys_close(spec_fd);
+
+    GF_FREE(brick_name);
+
+    rsp.op_ret = ret;
+    if (rsp.op_ret < 0)
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_MOUNT_REQ_FAIL,
+               "Failed to mount the volume");
+
+    if (op_errno)
+        rsp.op_errno = gf_errno_to_error(op_errno);
+
+    /* The reply always carries a spec string, even if it is empty */
+    if (!rsp.spec)
+        rsp.spec = strdup("");
+
+    glusterd_submit_reply(req, &rsp, NULL, 0, NULL,
+                          (xdrproc_t)xdr_gf_getspec_rsp);
+    free(args.key);  // malloced by xdr
+    free(rsp.spec);
+
+    if (peer_hosts)
+        GF_FREE(peer_hosts);
+    if (dict)
+        dict_unref(dict);
+
+    if (args.xdata.xdata_val)
+        free(args.xdata.xdata_val);
+
+    if (rsp.xdata.xdata_val)
+        GF_FREE(rsp.xdata.xdata_val);
+
+    return 0;
+}
+
+/* RPC entry point for getspec: runs __server_getspec() through
+ * glusterd_big_locked_handler() and hands back its result.
+ */
+int
+server_getspec(rpcsvc_request_t *req)
+{
+    int status = glusterd_big_locked_handler(req, __server_getspec);
+
+    return status;
+}
+
+/* Body of the event-notify request handler; server_event_notify() runs it
+ * through glusterd_big_locked_handler().
+ *
+ * Decodes the request, unserializes its dict when one is attached and
+ * dispatches on args.op. A reply carrying `ret` in op_ret is submitted on
+ * every path, except when a defrag status update with a dict was passed on to
+ * glusterd_defrag_event_notify_handle(). The function always returns 0.
+ */
+int32_t
+__server_event_notify(rpcsvc_request_t *req)
+{
+    int32_t ret = -1;
+    gf_event_notify_req args = {
+        0,
+    };
+    gf_event_notify_rsp rsp = {
+        0,
+    };
+    dict_t *dict = NULL;
+    gf_boolean_t need_rsp = _gf_true;
+
+    ret = xdr_to_generic(req->msg[0], &args,
+                         (xdrproc_t)xdr_gf_event_notify_req);
+    if (ret < 0) {
+        req->rpc_err = GARBAGE_ARGS;
+        gf_smsg("glusterd", GF_LOG_ERROR, errno, GD_MSG_GARBAGE_ARGS, NULL);
+        goto fail;
+    }
+
+    if (args.dict.dict_len) {
+        dict = dict_new();
+        if (!dict) {
+            gf_smsg("glusterd", GF_LOG_ERROR, errno, GD_MSG_DICT_CREATE_FAIL,
+                    NULL);
+            /* Go through the common exit so that the client gets a
+             * failure reply and the xdr-allocated dict buffer is freed.
+             * ret still holds the non-negative decode result here, hence
+             * it is set explicitly.
+             */
+            ret = -1;
+            goto fail;
+        }
+        ret = dict_unserialize(args.dict.dict_val, args.dict.dict_len, &dict);
+        if (ret) {
+            gf_msg("glusterd", GF_LOG_ERROR, 0, GD_MSG_DICT_UNSERIALIZE_FAIL,
+                   "Failed to unserialize req");
+            goto fail;
+        }
+    }
+
+    switch (args.op) {
+        case GF_EN_DEFRAG_STATUS:
+            gf_msg("glusterd", GF_LOG_INFO, 0, GD_MSG_DEFRAG_STATUS_UPDATED,
+                   "received defrag status updated");
+            if (dict) {
+                glusterd_defrag_event_notify_handle(dict);
+                /* No reply is sent for a handled defrag update */
+                need_rsp = _gf_false;
+            }
+            break;
+        default:
+            gf_msg("glusterd", GF_LOG_ERROR, EINVAL, GD_MSG_OP_UNSUPPORTED,
+                   "Unknown op received in event "
+                   "notify");
+            gf_event(EVENT_NOTIFY_UNKNOWN_OP, "op=%d", args.op);
+            ret = -1;
+            break;
+    }
+
+fail:
+    rsp.op_ret = ret;
+
+    if (need_rsp)
+        glusterd_submit_reply(req, &rsp, NULL, 0, NULL,
+                              (xdrproc_t)xdr_gf_event_notify_rsp);
+    if (dict)
+        dict_unref(dict);
+    free(args.dict.dict_val);  // malloced by xdr
+
+    return 0;
+}
+
+/* RPC entry point for event notifications: runs __server_event_notify()
+ * through glusterd_big_locked_handler() and hands back its result.
+ */
+int32_t
+server_event_notify(rpcsvc_request_t *req)
+{
+    int32_t status = glusterd_big_locked_handler(req, __server_event_notify);
+
+    return status;
+}
+
+/* Checks whether this glusterd may switch to @cluster_op_version, as
+ * requested by the peer named @peerid (used for logging only).
+ *
+ * Returns 0 when the version is acceptable and -1 when it is above
+ * GD_OP_VERSION_MAX, or below the current op-version while volumes exist.
+ */
+int
+gd_validate_cluster_op_version(xlator_t *this, int cluster_op_version,
+                               char *peerid)
+{
+    glusterd_conf_t *priv = this->private;
+
+    if (cluster_op_version > GD_OP_VERSION_MAX) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_OP_VERSION_MISMATCH,
+               "operating version %d is more than the maximum "
+               "supported (%d) on the machine (as per peer request "
+               "from %s)",
+               cluster_op_version, GD_OP_VERSION_MAX, peerid);
+        return -1;
+    }
+
+    /* The peer can only reduce its op-version when it doesn't have any
+     * volumes. Reducing op-version when it already contains volumes can
+     * lead to inconsistencies in the cluster
+     */
+    if (cluster_op_version < priv->op_version) {
+        if (!cds_list_empty(&priv->volumes)) {
+            gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_OP_VERS_ADJUST_FAIL,
+                   "cannot reduce operating version to %d from current "
+                   "version %d as volumes exist (as per peer request from "
+                   "%s)",
+                   cluster_op_version, priv->op_version, peerid);
+            return -1;
+        }
+    }
+
+    return 0;
+}
+
+/* Validate if glusterd can serve the management handshake request
+ *
+ * Requests are allowed if,
+ *  - glusterd has no peers & no volumes, or
+ *  - the request came from a known peer
+ * A known peer is identified using the following steps
+ *  - the dict is checked for a peer uuid, which if present is matched with the
+ *  peer list, else
+ *  - the incoming request address is matched with the peer list
+ *
+ * Returns _gf_true when the request may be served, _gf_false otherwise.
+ */
+gf_boolean_t
+gd_validate_mgmt_hndsk_req(rpcsvc_request_t *req, dict_t *dict)
+{
+    xlator_t *this = THIS;
+    char *uuid_str = NULL;
+    int ret = -1;
+    uuid_t peer_uuid = {
+        0,
+    };
+    char hostname[UNIX_PATH_MAX + 1] = {
+        0,
+    };
+
+    GF_ASSERT(this);
+
+    if (!glusterd_have_peers() && !glusterd_have_volumes())
+        return _gf_true;
+
+    /* Try to match uuid only if available, don't fail as older peers will
+     * not send a uuid
+     */
+    if (dict_get_str(dict, GD_PEER_ID_KEY, &uuid_str) == 0) {
+        gf_uuid_parse(uuid_str, peer_uuid);
+        RCU_READ_LOCK;
+        ret = (glusterd_peerinfo_find(peer_uuid, NULL) != NULL);
+        RCU_READ_UNLOCK;
+        if (ret)
+            return _gf_true;
+    }
+
+    /* If you cannot get the hostname, you cannot authenticate */
+    if (glusterd_remote_hostname_get(req, hostname, sizeof(hostname)) != 0)
+        return _gf_false;
+
+    /* If peer object is not found it indicates that request is from an
+     * unknown peer, if its found, validate whether its uuid is also
+     * available in the peerinfo list. There could be a case where hostname
+     * is available in the peerinfo list but the uuid has changed of the
+     * node due to a reinstall, in that case the validation should fail!
+     *
+     * From here on a non-zero `ret` means "reject the request".
+     */
+    RCU_READ_LOCK;
+    if (uuid_str == NULL) {
+        ret = (glusterd_peerinfo_find(NULL, hostname) == NULL);
+    } else if (glusterd_peerinfo_find(NULL, hostname) == NULL) {
+        ret = -1;
+    } else if (glusterd_peerinfo_find(peer_uuid, NULL) != NULL) {
+        ret = 0;
+    } else {
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_HANDSHAKE_REQ_REJECTED,
+               "Request from "
+               "peer %s has an entry in peerinfo, but uuid "
+               "does not match",
+               req->trans->peerinfo.identifier);
+        ret = -1;
+    }
+    RCU_READ_UNLOCK;
+
+    if (!ret)
+        return _gf_true;
+
+    gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_HANDSHAKE_REQ_REJECTED,
+           "Rejecting management "
+           "handshake request from unknown peer %s",
+           req->trans->peerinfo.identifier);
+    gf_event(EVENT_PEER_REJECT, "peer=%s", req->trans->peerinfo.identifier);
+
+    return _gf_false;
+}
+
+/* Body of the management handshake "versions" request handler;
+ * glusterd_mgmt_hndsk_versions() runs it through
+ * glusterd_big_locked_handler().
+ *
+ * Decodes the request, checks via gd_validate_mgmt_hndsk_req() that it may
+ * be served, and replies with a dict holding the current, minimum and maximum
+ * op-versions of this glusterd. A reply carrying `ret` in op_ret is always
+ * submitted; the function itself always returns 0.
+ */
+int
+__glusterd_mgmt_hndsk_versions(rpcsvc_request_t *req)
+{
+    dict_t *dict = NULL;
+    xlator_t *this = NULL;
+    glusterd_conf_t *conf = NULL;
+    int ret = -1;
+    int op_errno = EINVAL;
+    gf_mgmt_hndsk_req args = {
+        {
+            0,
+        },
+    };
+    gf_mgmt_hndsk_rsp rsp = {
+        0,
+    };
+    dict_t *args_dict = NULL;
+
+    this = THIS;
+    conf = this->private;
+
+    ret = xdr_to_generic(req->msg[0], &args, (xdrproc_t)xdr_gf_mgmt_hndsk_req);
+    if (ret < 0) {
+        // failed to decode msg;
+        req->rpc_err = GARBAGE_ARGS;
+        gf_smsg(this->name, GF_LOG_ERROR, errno, GD_MSG_GARBAGE_ARGS, NULL);
+        goto out;
+    }
+
+    GF_PROTOCOL_DICT_UNSERIALIZE(this, args_dict, args.hndsk.hndsk_val,
+                                 (args.hndsk.hndsk_len), ret, op_errno, out);
+
+    /* Check if we can service the request */
+    if (!gd_validate_mgmt_hndsk_req(req, args_dict)) {
+        ret = -1;
+        goto out;
+    }
+
+    dict = dict_new();
+    if (!dict) {
+        gf_smsg(this->name, GF_LOG_ERROR, errno, GD_MSG_DICT_CREATE_FAIL, NULL);
+        /* ret is not a failure value at this point; set it explicitly so
+         * that the reply does not report success without a payload.
+         */
+        ret = -1;
+        op_errno = ENOMEM;
+        goto out;
+    }
+
+    ret = dict_set_int32(dict, GD_OP_VERSION_KEY, conf->op_version);
+    if (ret) {
+        gf_msg(this->name, GF_LOG_WARNING, 0, GD_MSG_DICT_SET_FAILED,
+               "failed to set operating version");
+        goto out;
+    }
+
+    ret = dict_set_int32(dict, GD_MIN_OP_VERSION_KEY, GD_OP_VERSION_MIN);
+    if (ret) {
+        gf_msg(this->name, GF_LOG_WARNING, 0, GD_MSG_DICT_SET_FAILED,
+               "failed to set %s", GD_MIN_OP_VERSION_KEY);
+        goto out;
+    }
+
+    ret = dict_set_int32(dict, GD_MAX_OP_VERSION_KEY, GD_OP_VERSION_MAX);
+    if (ret) {
+        gf_msg(this->name, GF_LOG_WARNING, 0, GD_MSG_DICT_SET_FAILED,
+               "failed to set %s", GD_MAX_OP_VERSION_KEY);
+        goto out;
+    }
+
+    ret = 0;
+
+    GF_PROTOCOL_DICT_SERIALIZE(this, dict, (&rsp.hndsk.hndsk_val),
+                               rsp.hndsk.hndsk_len, op_errno, out);
+out:
+
+    /* Every path, successful or not, reports its outcome in the reply */
+    rsp.op_ret = ret;
+    rsp.op_errno = op_errno;
+
+    glusterd_submit_reply(req, &rsp, NULL, 0, NULL,
+                          (xdrproc_t)xdr_gf_mgmt_hndsk_rsp);
+
+    ret = 0;
+
+    if (dict)
+        dict_unref(dict);
+
+    /* The request buffer is released with free(), the reply buffer with
+     * GF_FREE().
+     */
+    if (args.hndsk.hndsk_val)
+        free(args.hndsk.hndsk_val);
+
+    if (rsp.hndsk.hndsk_val)
+        GF_FREE(rsp.hndsk.hndsk_val);
+
+    if (args_dict)
+        dict_unref(args_dict);
+
+    return ret;
+}
+
+/* RPC entry point for the handshake "versions" request: runs
+ * __glusterd_mgmt_hndsk_versions() through glusterd_big_locked_handler()
+ * and hands back its result.
+ */
+int
+glusterd_mgmt_hndsk_versions(rpcsvc_request_t *req)
+{
+    int status = glusterd_big_locked_handler(req,
+                                             __glusterd_mgmt_hndsk_versions);
+
+    return status;
+}
+
+/* Body of the management handshake "versions ack" request handler;
+ * glusterd_mgmt_hndsk_versions_ack() runs it through
+ * glusterd_big_locked_handler().
+ *
+ * Reads the op-version sent by the peer, validates it with
+ * gd_validate_cluster_op_version(), adopts it as this glusterd's op-version
+ * and stores the global info. A reply carrying `ret` in op_ret is always
+ * submitted; the function itself always returns 0.
+ */
+int
+__glusterd_mgmt_hndsk_versions_ack(rpcsvc_request_t *req)
+{
+    xlator_t *this = THIS;
+    glusterd_conf_t *priv = this->private;
+    dict_t *peer_dict = NULL;
+    int acked_version = 0;
+    int op_errno = EINVAL;
+    int ret = -1;
+    gf_mgmt_hndsk_rsp rsp = {
+        0,
+    };
+    gf_mgmt_hndsk_req args = {
+        {
+            0,
+        },
+    };
+
+    ret = xdr_to_generic(req->msg[0], &args, (xdrproc_t)xdr_gf_mgmt_hndsk_req);
+    if (ret < 0) {
+        /* failed to decode msg */
+        req->rpc_err = GARBAGE_ARGS;
+        gf_smsg(this->name, GF_LOG_ERROR, errno, GD_MSG_GARBAGE_ARGS, NULL);
+        goto out;
+    }
+
+    GF_PROTOCOL_DICT_UNSERIALIZE(this, peer_dict, args.hndsk.hndsk_val,
+                                 (args.hndsk.hndsk_len), ret, op_errno, out);
+
+    ret = dict_get_int32(peer_dict, GD_OP_VERSION_KEY, &acked_version);
+    if (ret != 0) {
+        gf_msg(this->name, GF_LOG_WARNING, 0, GD_MSG_DICT_GET_FAILED,
+               "failed to get the op-version key peer=%s",
+               req->trans->peerinfo.identifier);
+        goto out;
+    }
+
+    ret = gd_validate_cluster_op_version(this, acked_version,
+                                         req->trans->peerinfo.identifier);
+    if (ret != 0)
+        goto out;
+
+    /* As this is ACK from the Cluster for the versions supported,
+       can set the op-version of 'this' glusterd to the one
+       received. */
+    gf_msg(this->name, GF_LOG_INFO, 0, GD_MSG_VERS_INFO,
+           "using the op-version %d", acked_version);
+    priv->op_version = acked_version;
+
+    ret = glusterd_store_global_info(this);
+    if (ret != 0) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_GLOBAL_OP_VERSION_SET_FAIL,
+               "Failed to store op-version");
+    }
+
+out:
+    rsp.op_errno = op_errno;
+    rsp.op_ret = ret;
+
+    glusterd_submit_reply(req, &rsp, NULL, 0, NULL,
+                          (xdrproc_t)xdr_gf_mgmt_hndsk_rsp);
+
+    if (args.hndsk.hndsk_val)
+        free(args.hndsk.hndsk_val);
+
+    if (peer_dict)
+        dict_unref(peer_dict);
+
+    return 0;
+}
+
+/* Actor registered for the "MGMT-VERS-ACK" procedure: hands the request to
+ * __glusterd_mgmt_hndsk_versions_ack by way of glusterd_big_locked_handler().
+ */
+int
+glusterd_mgmt_hndsk_versions_ack(rpcsvc_request_t *req)
+{
+    int status = 0;
+
+    status = glusterd_big_locked_handler(req,
+                                         __glusterd_mgmt_hndsk_versions_ack);
+
+    return status;
+}
+
+/*
+ * Handler body for the "GETVOLUMEINFO" handshake procedure.
+ *
+ * Decodes a gf_get_volume_info_req, reads "flags" and "volname" from the
+ * request dictionary and, when GF_GET_VOLUME_UUID is set in the flags,
+ * answers with a dictionary holding the volume id under "volume_id".
+ *
+ * A reply is always submitted; failures are reported through op_ret and
+ * op_errno of the reply. The function itself always returns 0.
+ */
+int
+__server_get_volume_info(rpcsvc_request_t *req)
+{
+    int ret = -1;
+    int32_t op_errno = ENOENT;
+    gf_get_volume_info_req vol_info_req = {{
+        0,
+    }};
+    gf_get_volume_info_rsp vol_info_rsp = {
+        0,
+    };
+    char *volname = NULL;
+    glusterd_volinfo_t *volinfo = NULL;
+    dict_t *dict = NULL;
+    dict_t *dict_rsp = NULL;
+    char *volume_id_str = NULL;
+    int32_t flags = 0;
+
+    xlator_t *this = THIS;
+    GF_ASSERT(this);
+
+    ret = xdr_to_generic(req->msg[0], &vol_info_req,
+                         (xdrproc_t)xdr_gf_get_volume_info_req);
+    if (ret < 0) {
+        /* failed to decode msg */
+        req->rpc_err = GARBAGE_ARGS;
+        gf_smsg(this->name, GF_LOG_ERROR, errno, GD_MSG_GARBAGE_ARGS, NULL);
+        goto out;
+    }
+    gf_smsg(this->name, GF_LOG_INFO, 0, GD_MSG_VOL_INFO_REQ_RECVD, NULL);
+
+    if (vol_info_req.dict.dict_len) {
+        /* Unserialize the dictionary */
+        dict = dict_new();
+        if (!dict) {
+            gf_smsg(this->name, GF_LOG_WARNING, ENOMEM, GD_MSG_DICT_CREATE_FAIL,
+                    NULL);
+            op_errno = ENOMEM;
+            ret = -1;
+            goto out;
+        }
+
+        ret = dict_unserialize(vol_info_req.dict.dict_val,
+                               vol_info_req.dict.dict_len, &dict);
+        if (ret < 0) {
+            gf_smsg(this->name, GF_LOG_ERROR, 0, GD_MSG_DICT_UNSERIALIZE_FAIL,
+                    NULL);
+            op_errno = -ret;
+            ret = -1;
+            goto out;
+        }
+
+        /* The dict takes over the decoded buffer from here on. Clear our
+         * pointer so the cleanup at 'out' only frees it when this hand-over
+         * never happened.
+         */
+        dict->extra_stdfree = vol_info_req.dict.dict_val;
+        vol_info_req.dict.dict_val = NULL;
+    }
+
+    ret = dict_get_int32(dict, "flags", &flags);
+    if (ret) {
+        gf_smsg(this->name, GF_LOG_ERROR, errno, GD_MSG_DICT_GET_FAILED,
+                "Key=flags", NULL);
+        op_errno = -ret;
+        ret = -1;
+        goto out;
+    }
+
+    if (!flags) {
+        /* Nothing to query about. Just return success */
+        gf_smsg(this->name, GF_LOG_ERROR, 0, GD_MSG_NO_FLAG_SET, NULL);
+        ret = 0;
+        goto out;
+    }
+
+    ret = dict_get_str(dict, "volname", &volname);
+    if (ret) {
+        gf_smsg(this->name, GF_LOG_ERROR, errno, GD_MSG_DICT_GET_FAILED,
+                "Key=volname", NULL);
+        op_errno = EINVAL;
+        ret = -1;
+        goto out;
+    }
+
+    ret = glusterd_volinfo_find(volname, &volinfo);
+    if (ret) {
+        gf_smsg(this->name, GF_LOG_ERROR, errno, GD_MSG_VOLINFO_GET_FAIL,
+                "Volname=%s", volname, NULL);
+        op_errno = EINVAL;
+        ret = -1;
+        goto out;
+    }
+
+    if (flags & (int32_t)GF_GET_VOLUME_UUID) {
+        volume_id_str = gf_strdup(uuid_utoa(volinfo->volume_id));
+        if (!volume_id_str) {
+            gf_smsg(this->name, GF_LOG_ERROR, errno, GD_MSG_STRDUP_FAILED,
+                    NULL);
+            op_errno = ENOMEM;
+            ret = -1;
+            goto out;
+        }
+
+        dict_rsp = dict_new();
+        if (!dict_rsp) {
+            gf_smsg(this->name, GF_LOG_WARNING, ENOMEM, GD_MSG_DICT_CREATE_FAIL,
+                    NULL);
+            op_errno = ENOMEM;
+            GF_FREE(volume_id_str);
+            ret = -1;
+            goto out;
+        }
+        ret = dict_set_dynstr(dict_rsp, "volume_id", volume_id_str);
+        if (ret) {
+            gf_smsg(this->name, GF_LOG_ERROR, errno, GD_MSG_DICT_SET_FAILED,
+                    "Key=volume_id", NULL);
+            op_errno = -ret;
+            ret = -1;
+            goto out;
+        }
+    }
+    ret = dict_allocate_and_serialize(dict_rsp, &vol_info_rsp.dict.dict_val,
+                                      &vol_info_rsp.dict.dict_len);
+    if (ret) {
+        gf_smsg(this->name, GF_LOG_ERROR, errno,
+                GD_MSG_DICT_ALLOC_AND_SERL_LENGTH_GET_FAIL, NULL);
+        op_errno = -ret;
+        ret = -1;
+        goto out;
+    }
+
+out:
+    vol_info_rsp.op_ret = ret;
+    vol_info_rsp.op_errno = op_errno;
+    vol_info_rsp.op_errstr = "";
+    glusterd_submit_reply(req, &vol_info_rsp, NULL, 0, NULL,
+                          (xdrproc_t)xdr_gf_get_volume_info_rsp);
+    ret = 0;
+
+    if (dict) {
+        dict_unref(dict);
+    }
+
+    if (dict_rsp) {
+        dict_unref(dict_rsp);
+    }
+
+    if (vol_info_rsp.dict.dict_val) {
+        GF_FREE(vol_info_rsp.dict.dict_val);
+    }
+
+    /* Non-NULL only when the decoded request buffer was never handed to
+     * 'dict' (dict_new() or dict_unserialize() failed); it is released with
+     * plain free() like the other decoded request buffers in this file.
+     */
+    if (vol_info_req.dict.dict_val) {
+        free(vol_info_req.dict.dict_val);
+    }
+    return ret;
+}
+
+/* Actor registered for the "GETVOLUMEINFO" procedure: hands the request to
+ * __server_get_volume_info by way of glusterd_big_locked_handler().
+ */
+int
+server_get_volume_info(rpcsvc_request_t *req)
+{
+    int status = 0;
+
+    status = glusterd_big_locked_handler(req, __server_get_volume_info);
+
+    return status;
+}
+
+/*
+ * glusterd function to get the list of snapshot names and uuids
+ *
+ * Handler body for the "GETSNAPINFO" handshake procedure. Decodes a
+ * gf_getsnap_name_uuid_req, reads "volname" from the request dictionary and
+ * lets glusterd_snapshot_get_volnames_uuids() fill the reply.
+ *
+ * A reply is always submitted; failures are reported through op_ret and
+ * op_errno of the reply. The function itself always returns 0.
+ */
+int
+__server_get_snap_info(rpcsvc_request_t *req)
+{
+    int ret = -1;
+    int op_errno = ENOENT;
+    gf_getsnap_name_uuid_req snap_info_req = {{
+        0,
+    }};
+    gf_getsnap_name_uuid_rsp snap_info_rsp = {
+        0,
+    };
+    dict_t *dict = NULL;
+    dict_t *dict_rsp = NULL;
+    char *volname = NULL;
+
+    GF_ASSERT(req);
+
+    ret = xdr_to_generic(req->msg[0], &snap_info_req,
+                         (xdrproc_t)xdr_gf_getsnap_name_uuid_req);
+    if (ret < 0) {
+        req->rpc_err = GARBAGE_ARGS;
+        /* What failed to decode here is the snapshot info request, not a
+         * management handshake response.
+         */
+        gf_msg("glusterd", GF_LOG_ERROR, 0, GD_MSG_REQ_DECODE_FAIL,
+               "Failed to decode snapshot info request");
+        goto out;
+    }
+
+    if (snap_info_req.dict.dict_len) {
+        dict = dict_new();
+        if (!dict) {
+            gf_smsg("glusterd", GF_LOG_WARNING, ENOMEM, GD_MSG_DICT_CREATE_FAIL,
+                    NULL);
+            op_errno = ENOMEM;
+            ret = -1;
+            goto out;
+        }
+
+        ret = dict_unserialize(snap_info_req.dict.dict_val,
+                               snap_info_req.dict.dict_len, &dict);
+        if (ret < 0) {
+            gf_msg("glusterd", GF_LOG_ERROR, EINVAL,
+                   GD_MSG_DICT_UNSERIALIZE_FAIL,
+                   "Failed to unserialize dictionary");
+            op_errno = EINVAL;
+            ret = -1;
+            goto out;
+        }
+
+        /* The dict takes over the decoded buffer from here on. Clear our
+         * pointer so the cleanup at 'out' only frees it when this hand-over
+         * never happened.
+         */
+        dict->extra_stdfree = snap_info_req.dict.dict_val;
+        snap_info_req.dict.dict_val = NULL;
+    }
+
+    ret = dict_get_str(dict, "volname", &volname);
+    if (ret) {
+        op_errno = EINVAL;
+        gf_msg("glusterd", GF_LOG_ERROR, EINVAL, GD_MSG_DICT_GET_FAILED,
+               "Failed to retrieve volname");
+        ret = -1;
+        goto out;
+    }
+
+    dict_rsp = dict_new();
+    if (!dict_rsp) {
+        gf_smsg("glusterd", GF_LOG_WARNING, ENOMEM, GD_MSG_DICT_CREATE_FAIL,
+                NULL);
+        op_errno = ENOMEM;
+        ret = -1;
+        goto out;
+    }
+
+    ret = glusterd_snapshot_get_volnames_uuids(dict_rsp, volname,
+                                               &snap_info_rsp);
+
+    if (ret) {
+        gf_msg("glusterd", GF_LOG_ERROR, EINVAL, GD_MSG_VOL_NOT_FOUND,
+               "Error getting snapshot volume names and uuids : %s", volname);
+        op_errno = EINVAL;
+    }
+
+out:
+    snap_info_rsp.op_ret = ret;
+    snap_info_rsp.op_errno = op_errno;
+    snap_info_rsp.op_errstr = "";
+    glusterd_submit_reply(req, &snap_info_rsp, NULL, 0, NULL,
+                          (xdrproc_t)xdr_gf_getsnap_name_uuid_rsp);
+
+    if (dict) {
+        dict_unref(dict);
+    }
+
+    if (dict_rsp) {
+        dict_unref(dict_rsp);
+    }
+
+    if (snap_info_rsp.dict.dict_val) {
+        GF_FREE(snap_info_rsp.dict.dict_val);
+    }
+
+    /* Non-NULL only when the decoded request buffer was never handed to
+     * 'dict' (dict_new() or dict_unserialize() failed); it is released with
+     * plain free() like the other decoded request buffers in this file.
+     */
+    if (snap_info_req.dict.dict_val) {
+        free(snap_info_req.dict.dict_val);
+    }
+
+    return 0;
+}
+
+/* Actor registered for the "GETSNAPINFO" procedure: hands the request to
+ * __server_get_snap_info by way of glusterd_big_locked_handler().
+ */
+int
+server_get_snap_info(rpcsvc_request_t *req)
+{
+    int status = 0;
+
+    status = glusterd_big_locked_handler(req, __server_get_snap_info);
+
+    return status;
+}
+
+/* Actor table used by gluster_handshake_prog, indexed by GF_HNDSK_*
+ * procedure number. Each populated entry gives the procedure name, its
+ * handler and the procedure number again; slots that are not listed are
+ * left zero-initialised.
+ */
+static rpcsvc_actor_t gluster_handshake_actors[GF_HNDSK_MAXVALUE] = {
+    [GF_HNDSK_NULL] = {"NULL", NULL, NULL, GF_HNDSK_NULL, DRC_NA, 0},
+    [GF_HNDSK_GETSPEC] = {"GETSPEC", server_getspec, NULL, GF_HNDSK_GETSPEC,
+                          DRC_NA, 0},
+    [GF_HNDSK_EVENT_NOTIFY] = {"EVENTNOTIFY", server_event_notify, NULL,
+                               GF_HNDSK_EVENT_NOTIFY, DRC_NA, 0},
+    [GF_HNDSK_GET_VOLUME_INFO] = {"GETVOLUMEINFO", server_get_volume_info, NULL,
+                                  GF_HNDSK_GET_VOLUME_INFO, DRC_NA, 0},
+    [GF_HNDSK_GET_SNAPSHOT_INFO] = {"GETSNAPINFO", server_get_snap_info, NULL,
+                                    GF_HNDSK_GET_SNAPSHOT_INFO, DRC_NA, 0},
+};
+
+/* Server-side "Gluster Handshake" program descriptor, backed by
+ * gluster_handshake_actors.
+ */
+struct rpcsvc_program gluster_handshake_prog = {
+    .progname = "Gluster Handshake",
+    .prognum = GLUSTER_HNDSK_PROGRAM,
+    .progver = GLUSTER_HNDSK_VERSION,
+    .actors = gluster_handshake_actors,
+    .numactors = GF_HNDSK_MAXVALUE,
+};
+
+/* A minimal RPC program just for the cli getspec command.
+ * Only the GF_HNDSK_GETSPEC slot is populated; every other slot of the
+ * GF_HNDSK_MAXVALUE-sized table is left zero-initialised.
+ */
+static rpcsvc_actor_t gluster_cli_getspec_actors[GF_HNDSK_MAXVALUE] = {
+    [GF_HNDSK_GETSPEC] = {"GETSPEC", server_getspec, NULL, GF_HNDSK_GETSPEC,
+                          DRC_NA, 0},
+};
+
+/* Program descriptor for the getspec-only actor table. It carries the same
+ * program number and version as gluster_handshake_prog but exposes only
+ * gluster_cli_getspec_actors.
+ */
+struct rpcsvc_program gluster_cli_getspec_prog = {
+    .progname = "Gluster Handshake (CLI Getspec)",
+    .prognum = GLUSTER_HNDSK_PROGRAM,
+    .progver = GLUSTER_HNDSK_VERSION,
+    .actors = gluster_cli_getspec_actors,
+    .numactors = GF_HNDSK_MAXVALUE,
+};
+
+/* Procedure names of the dump program, indexed by GF_DUMP_* number. */
+static char *glusterd_dump_proc[GF_DUMP_MAXVALUE] = {
+    [GF_DUMP_NULL] = "NULL",
+    [GF_DUMP_DUMP] = "DUMP",
+    [GF_DUMP_PING] = "PING",
+};
+
+/* Client-side descriptor of the dump program; glusterd_peer_dump_version()
+ * submits its GF_DUMP_DUMP request through it.
+ */
+static rpc_clnt_prog_t glusterd_dump_prog = {
+    .progname = "GLUSTERD-DUMP",
+    .prognum = GLUSTER_DUMP_PROGRAM,
+    .progver = GLUSTER_DUMP_VERSION,
+    .procnames = glusterd_dump_proc,
+};
+
+/* Actor table used by glusterd_mgmt_hndsk_prog, indexed by GD_MGMT_HNDSK_*
+ * procedure number. Each entry gives the procedure name, its handler and
+ * the procedure number again.
+ */
+static rpcsvc_actor_t glusterd_mgmt_hndsk_actors[GD_MGMT_HNDSK_MAXVALUE] = {
+    [GD_MGMT_HNDSK_NULL] = {"NULL", NULL, NULL, GD_MGMT_HNDSK_NULL, DRC_NA, 0},
+    [GD_MGMT_HNDSK_VERSIONS] = {"MGMT-VERS", glusterd_mgmt_hndsk_versions, NULL,
+                                GD_MGMT_HNDSK_VERSIONS, DRC_NA, 0},
+    [GD_MGMT_HNDSK_VERSIONS_ACK] = {"MGMT-VERS-ACK",
+                                    glusterd_mgmt_hndsk_versions_ack, NULL,
+                                    GD_MGMT_HNDSK_VERSIONS_ACK, DRC_NA, 0},
+};
+
+/* Server-side "Gluster MGMT Handshake" program descriptor, backed by
+ * glusterd_mgmt_hndsk_actors.
+ */
+struct rpcsvc_program glusterd_mgmt_hndsk_prog = {
+    .progname = "Gluster MGMT Handshake",
+    .prognum = GD_MGMT_HNDSK_PROGRAM,
+    .progver = GD_MGMT_HNDSK_VERSION,
+    .actors = glusterd_mgmt_hndsk_actors,
+    .numactors = GD_MGMT_HNDSK_MAXVALUE,
+};
+
+/* Procedure names of the management handshake program, indexed by
+ * GD_MGMT_HNDSK_* number; the strings match those in
+ * glusterd_mgmt_hndsk_actors.
+ */
+static char *glusterd_mgmt_hndsk_proc[GD_MGMT_HNDSK_MAXVALUE] = {
+    [GD_MGMT_HNDSK_NULL] = "NULL",
+    [GD_MGMT_HNDSK_VERSIONS] = "MGMT-VERS",
+    [GD_MGMT_HNDSK_VERSIONS_ACK] = "MGMT-VERS-ACK",
+};
+
+/* Client-side descriptor of the management handshake program; the
+ * GD_MGMT_HNDSK_VERSIONS and GD_MGMT_HNDSK_VERSIONS_ACK requests in this
+ * file are submitted through it.
+ */
+static rpc_clnt_prog_t gd_clnt_mgmt_hndsk_prog = {
+    .progname = "Gluster MGMT Handshake",
+    .prognum = GD_MGMT_HNDSK_PROGRAM,
+    .progver = GD_MGMT_HNDSK_VERSION,
+    .procnames = glusterd_mgmt_hndsk_proc,
+};
+
+/*
+ * Build a GD_FRIEND_EVENT_CONNECTED event for the peer described by peerctx
+ * and inject it into the friend state machine.
+ *
+ * The probe context attached to the event carries a copy of the peer's
+ * hostname, its port, and the request and dict taken from peerctx->args.
+ *
+ * Returns 0 on success, non-zero if the event or its context could not be
+ * created, the peer could not be found, or the injection failed. On the
+ * early failure paths both the event and the context are released here.
+ */
+static int
+glusterd_event_connected_inject(glusterd_peerctx_t *peerctx)
+{
+    GF_ASSERT(peerctx);
+
+    glusterd_friend_sm_event_t *event = NULL;
+    glusterd_probe_ctx_t *ctx = NULL;
+    int ret = -1;
+    glusterd_peerinfo_t *peerinfo = NULL;
+
+    ret = glusterd_friend_sm_new_event(GD_FRIEND_EVENT_CONNECTED, &event);
+
+    if (ret) {
+        gf_msg("glusterd", GF_LOG_ERROR, 0, GD_MSG_EVENT_NEW_GET_FAIL,
+               "Unable to get new event");
+        goto out;
+    }
+
+    ctx = GF_CALLOC(1, sizeof(*ctx), gf_gld_mt_probe_ctx_t);
+
+    if (!ctx) {
+        ret = -1;
+        gf_msg("glusterd", GF_LOG_ERROR, ENOMEM, GD_MSG_NO_MEMORY,
+               "Memory not available");
+        /* The event was never injected, so nobody else will release it.
+         * NOTE(review): assumes glusterd_friend_sm_new_event() hands back a
+         * plain GF_CALLOC allocation - confirm against its definition.
+         */
+        GF_FREE(event);
+        goto out;
+    }
+
+    RCU_READ_LOCK;
+
+    peerinfo = glusterd_peerinfo_find_by_generation(peerctx->peerinfo_gen);
+    if (!peerinfo) {
+        RCU_READ_UNLOCK;
+        ret = -1;
+        gf_msg(THIS->name, GF_LOG_ERROR, 0, GD_MSG_PEER_NOT_FOUND,
+               "Could not find peer %s(%s)", peerctx->peername,
+               uuid_utoa(peerctx->peerid));
+        GF_FREE(ctx);
+        /* Same as above: the un-injected event has no other owner. */
+        GF_FREE(event);
+        goto out;
+    }
+    ctx->hostname = gf_strdup(peerinfo->hostname);
+    ctx->port = peerinfo->port;
+    ctx->req = peerctx->args.req;
+    ctx->dict = peerctx->args.dict;
+
+    event->peername = gf_strdup(peerinfo->hostname);
+    gf_uuid_copy(event->peerid, peerinfo->uuid);
+    event->ctx = ctx;
+
+    /* Injected while still inside the RCU read section that protects
+     * peerinfo.
+     */
+    ret = glusterd_friend_sm_inject_event(event);
+
+    RCU_READ_UNLOCK;
+
+    if (ret)
+        gf_msg("glusterd", GF_LOG_ERROR, 0, GD_MSG_EVENT_INJECT_FAIL,
+               "Unable to inject "
+               "EVENT_CONNECTED ret = %d",
+               ret);
+
+out:
+    gf_msg_debug("glusterd", 0, "returning %d", ret);
+    return ret;
+}
+
+/*
+ * Decide whether a peer can operate at this node's op-version.
+ *
+ * Reads GD_OP_VERSION_KEY, GD_MAX_OP_VERSION_KEY and GD_MIN_OP_VERSION_KEY
+ * from 'dict' and accepts the peer when conf->op_version lies within the
+ * peer's [min, max] range. On a range mismatch an explanatory string is
+ * allocated into *errstr.
+ *
+ * Returns 0 when the peer is accepted, -1 on a NULL argument or a range
+ * mismatch, and the dict_get_int32() result when a key cannot be read.
+ */
+int
+gd_validate_peer_op_version(xlator_t *this, glusterd_peerinfo_t *peerinfo,
+                            dict_t *dict, char **errstr)
+{
+    glusterd_conf_t *priv = NULL;
+    int32_t cur_version = 0;
+    int32_t min_version = 0;
+    int32_t max_version = 0;
+    int ret = -1;
+
+    if (!dict) {
+        gf_smsg("glusterd", GF_LOG_WARNING, ENOMEM, GD_MSG_DICT_CREATE_FAIL,
+                NULL);
+        goto out;
+    }
+
+    if (!this) {
+        gf_smsg("glusterd", GF_LOG_ERROR, errno, GD_MSG_XLATOR_NOT_DEFINED,
+                NULL);
+        goto out;
+    }
+
+    if (!peerinfo) {
+        gf_smsg("glusterd", GF_LOG_ERROR, errno, GD_MSG_INVALID_ARGUMENT, NULL);
+        goto out;
+    }
+
+    priv = this->private;
+
+    /* All three keys must be present, even though only min/max are
+     * compared below.
+     */
+    ret = dict_get_int32(dict, GD_OP_VERSION_KEY, &cur_version);
+    if (ret != 0) {
+        gf_smsg("glusterd", GF_LOG_ERROR, errno, GD_MSG_DICT_GET_FAILED,
+                "Key=%s", GD_OP_VERSION_KEY, NULL);
+        goto out;
+    }
+
+    ret = dict_get_int32(dict, GD_MAX_OP_VERSION_KEY, &max_version);
+    if (ret != 0) {
+        gf_smsg("glusterd", GF_LOG_ERROR, errno, GD_MSG_DICT_GET_FAILED,
+                "Key=%s", GD_MAX_OP_VERSION_KEY, NULL);
+        goto out;
+    }
+
+    ret = dict_get_int32(dict, GD_MIN_OP_VERSION_KEY, &min_version);
+    if (ret != 0) {
+        gf_smsg("glusterd", GF_LOG_ERROR, errno, GD_MSG_DICT_GET_FAILED,
+                "Key=%s", GD_MIN_OP_VERSION_KEY, NULL);
+        goto out;
+    }
+
+    /* Our op_version inside the peer's supported range: accept. */
+    if ((max_version >= priv->op_version) &&
+        (min_version <= priv->op_version)) {
+        ret = 0;
+        goto out;
+    }
+
+    /* Out of range: describe the rejection for the caller. */
+    (void)gf_asprintf(errstr,
+                      "Peer %s does not support required "
+                      "op-version",
+                      peerinfo->hostname);
+    ret = -1;
+
+out:
+    if (peerinfo)
+        gf_msg_debug((this ? this->name : "glusterd"), 0, "Peer %s %s",
+                     peerinfo->hostname, ((ret < 0) ? "rejected" : "accepted"));
+    return ret;
+}
+
+/*
+ * Callback for the GD_MGMT_HNDSK_VERSIONS_ACK request.
+ *
+ * On a successful acknowledgement it assigns the management programs to the
+ * peer, notifies GF_EVENT_CHILD_UP and, depending on peerctx->args.mode,
+ * injects the connected event. On any failure it records a message in
+ * peerctx->errstr (where one is available) and disconnects the peer's
+ * transport.
+ *
+ * The frame is always destroyed here and the friend state machine is always
+ * run. Always returns 0.
+ */
+int
+__glusterd_mgmt_hndsk_version_ack_cbk(struct rpc_req *req, struct iovec *iov,
+                                      int count, void *myframe)
+{
+    int ret = -1;
+    gf_mgmt_hndsk_rsp rsp = {
+        0,
+    };
+    xlator_t *this = NULL;
+    call_frame_t *frame = NULL;
+    glusterd_peerinfo_t *peerinfo = NULL;
+    glusterd_peerctx_t *peerctx = NULL;
+    char msg[64] = {
+        0,
+    };
+
+    this = THIS;
+    frame = myframe;
+    peerctx = frame->local;
+
+    /* peerinfo is only used between here and the RCU_READ_UNLOCK at 'out' */
+    RCU_READ_LOCK;
+    peerinfo = glusterd_peerinfo_find_by_generation(peerctx->peerinfo_gen);
+    if (!peerinfo) {
+        gf_msg_debug(this->name, 0, "Could not find peer %s(%s)",
+                     peerctx->peername, uuid_utoa(peerctx->peerid));
+        ret = -1;
+        goto out;
+    }
+
+    /* 'ret' is still -1 from its initialiser on this path */
+    if (-1 == req->rpc_status) {
+        snprintf(msg, sizeof(msg),
+                 "Error through RPC layer, retry again later");
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_RPC_LAYER_ERROR, "%s", msg);
+        peerctx->errstr = gf_strdup(msg);
+        goto out;
+    }
+
+    ret = xdr_to_generic(*iov, &rsp, (xdrproc_t)xdr_gf_mgmt_hndsk_rsp);
+    if (ret < 0) {
+        snprintf(msg, sizeof(msg), "Failed to decode XDR");
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_REQ_DECODE_FAIL, "%s", msg);
+        peerctx->errstr = gf_strdup(msg);
+        goto out;
+    }
+
+    if (-1 == rsp.op_ret) {
+        ret = -1;
+        snprintf(msg, sizeof(msg),
+                 "Failed to get handshake ack from remote server");
+        gf_msg(frame->this->name, GF_LOG_ERROR, 0, GD_MSG_NO_HANDSHAKE_ACK,
+               "%s", msg);
+        peerctx->errstr = gf_strdup(msg);
+        goto out;
+    }
+
+    /* TODO: this is hardcoded as of now, but I don't foresee any problems
+     * with this as long as we are properly handshaking operating versions
+     */
+    peerinfo->mgmt = &gd_mgmt_prog;
+    peerinfo->peer = &gd_peer_prog;
+    peerinfo->mgmt_v3 = &gd_mgmt_v3_prog;
+
+    /* The result of default_notify() is overwritten by 'ret = 0' below. */
+    ret = default_notify(this, GF_EVENT_CHILD_UP, NULL);
+
+    if (GD_MODE_ON == peerctx->args.mode) {
+        (void)glusterd_event_connected_inject(peerctx);
+        peerctx->args.req = NULL;
+    } else if (GD_MODE_SWITCH_ON == peerctx->args.mode) {
+        peerctx->args.mode = GD_MODE_ON;
+    } else {
+        gf_msg(this->name, GF_LOG_WARNING, 0, GD_MSG_UNKNOWN_MODE,
+               "unknown mode %d", peerctx->args.mode);
+    }
+
+    ret = 0;
+out:
+
+    if (ret != 0 && peerinfo)
+        rpc_transport_disconnect(peerinfo->rpc->conn.trans, _gf_false);
+
+    RCU_READ_UNLOCK;
+
+    /* Detach peerctx from the frame before the frame is destroyed. */
+    frame->local = NULL;
+    STACK_DESTROY(frame->root);
+
+    /* Decoded response buffer is released with plain free(). */
+    if (rsp.hndsk.hndsk_val)
+        free(rsp.hndsk.hndsk_val);
+
+    glusterd_friend_sm();
+
+    return 0;
+}
+
+/* RPC callback for the versions-ack request: runs
+ * __glusterd_mgmt_hndsk_version_ack_cbk through glusterd_big_locked_cbk().
+ */
+int
+glusterd_mgmt_hndsk_version_ack_cbk(struct rpc_req *req, struct iovec *iov,
+                                    int count, void *myframe)
+{
+    int status = 0;
+
+    status = glusterd_big_locked_cbk(req, iov, count, myframe,
+                                     __glusterd_mgmt_hndsk_version_ack_cbk);
+
+    return status;
+}
+
+/*
+ * Callback for the GD_MGMT_HNDSK_VERSIONS request sent by
+ * glusterd_mgmt_handshake().
+ *
+ * Decodes the peer's versions dictionary, validates it with
+ * gd_validate_peer_op_version() and, when the peer is acceptable, sends our
+ * op-version back in a GD_MGMT_HNDSK_VERSIONS_ACK request on the same frame.
+ *
+ * On any failure the frame is destroyed here and the peer's transport is
+ * disconnected; on success the frame travels on with the ACK request.
+ * Always returns 0.
+ */
+int
+__glusterd_mgmt_hndsk_version_cbk(struct rpc_req *req, struct iovec *iov,
+                                  int count, void *myframe)
+{
+    int ret = -1;
+    int op_errno = EINVAL;
+    gf_mgmt_hndsk_rsp rsp = {
+        0,
+    };
+    gf_mgmt_hndsk_req arg = {{
+        0,
+    }};
+    xlator_t *this = NULL;
+    call_frame_t *frame = NULL;
+    glusterd_peerinfo_t *peerinfo = NULL;
+    glusterd_peerctx_t *peerctx = NULL;
+    dict_t *dict = NULL;
+    dict_t *rsp_dict = NULL;
+    glusterd_conf_t *conf = NULL;
+    char msg[64] = {
+        0,
+    };
+
+    this = THIS;
+    conf = this->private;
+    frame = myframe;
+    peerctx = frame->local;
+
+    RCU_READ_LOCK;
+
+    peerinfo = glusterd_peerinfo_find_by_generation(peerctx->peerinfo_gen);
+    if (!peerinfo) {
+        ret = -1;
+        gf_msg_debug(this->name, 0, "Could not find peer %s(%s)",
+                     peerctx->peername, uuid_utoa(peerctx->peerid));
+        goto out;
+    }
+
+    if (-1 == req->rpc_status) {
+        ret = -1;
+        snprintf(msg, sizeof(msg),
+                 "Error through RPC layer, retry again later");
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_RPC_LAYER_ERROR, "%s", msg);
+        peerctx->errstr = gf_strdup(msg);
+        goto out;
+    }
+
+    ret = xdr_to_generic(*iov, &rsp, (xdrproc_t)xdr_gf_mgmt_hndsk_rsp);
+    if (ret < 0) {
+        snprintf(msg, sizeof(msg),
+                 "Failed to decode management "
+                 "handshake response");
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_REQ_DECODE_FAIL, "%s", msg);
+        peerctx->errstr = gf_strdup(msg);
+        goto out;
+    }
+
+    GF_PROTOCOL_DICT_UNSERIALIZE(this, dict, rsp.hndsk.hndsk_val,
+                                 rsp.hndsk.hndsk_len, ret, op_errno, out);
+
+    op_errno = rsp.op_errno;
+    if (-1 == rsp.op_ret) {
+        /* 'ret' still holds the result of a successful decode here; force
+         * a failure so the frame/transport cleanup at 'out' is not skipped.
+         */
+        ret = -1;
+        gf_msg(this->name, GF_LOG_ERROR, op_errno, GD_MSG_VERS_GET_FAIL,
+               "failed to get the 'versions' from peer (%s)",
+               req->conn->trans->peerinfo.identifier);
+        goto out;
+    }
+
+    /* Check if peer can be part of cluster */
+    ret = gd_validate_peer_op_version(this, peerinfo, dict, &peerctx->errstr);
+    if (ret < 0) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_OP_VERSION_MISMATCH,
+               "failed to validate the operating version of peer (%s)",
+               peerinfo->hostname);
+        goto out;
+    }
+
+    rsp_dict = dict_new();
+    if (!rsp_dict) {
+        /* validation succeeded above, so 'ret' must be reset to a failure */
+        ret = -1;
+        goto out;
+    }
+
+    ret = dict_set_int32(rsp_dict, GD_OP_VERSION_KEY, conf->op_version);
+    if (ret) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_DICT_SET_FAILED,
+               "failed to set operating version in dict");
+        goto out;
+    }
+
+    /* The serialize macro is handed op_errno and the 'out' label but not
+     * 'ret'; pre-set a failure so a jump to 'out' from inside it still
+     * triggers the cleanup. A successful submit overwrites it below.
+     */
+    ret = -1;
+    GF_PROTOCOL_DICT_SERIALIZE(this, rsp_dict, (&arg.hndsk.hndsk_val),
+                               arg.hndsk.hndsk_len, op_errno, out);
+
+    ret = glusterd_submit_request(
+        peerinfo->rpc, &arg, frame, &gd_clnt_mgmt_hndsk_prog,
+        GD_MGMT_HNDSK_VERSIONS_ACK, NULL, this,
+        glusterd_mgmt_hndsk_version_ack_cbk, (xdrproc_t)xdr_gf_mgmt_hndsk_req);
+
+out:
+    if (ret) {
+        /* Detach peerctx from the frame before the frame is destroyed. */
+        frame->local = NULL;
+        STACK_DESTROY(frame->root);
+        if (peerinfo)
+            rpc_transport_disconnect(peerinfo->rpc->conn.trans, _gf_false);
+    }
+
+    RCU_READ_UNLOCK;
+
+    /* Decoded response buffer: plain free(). Serialized request buffer:
+     * GF_FREE.
+     */
+    if (rsp.hndsk.hndsk_val)
+        free(rsp.hndsk.hndsk_val);
+
+    if (arg.hndsk.hndsk_val)
+        GF_FREE(arg.hndsk.hndsk_val);
+
+    if (dict)
+        dict_unref(dict);
+
+    if (rsp_dict)
+        dict_unref(rsp_dict);
+
+    return 0;
+}
+
+/* RPC callback for the versions request: runs
+ * __glusterd_mgmt_hndsk_version_cbk through glusterd_big_locked_cbk().
+ */
+int
+glusterd_mgmt_hndsk_version_cbk(struct rpc_req *req, struct iovec *iov,
+                                int count, void *myframe)
+{
+    int status = 0;
+
+    status = glusterd_big_locked_cbk(req, iov, count, myframe,
+                                     __glusterd_mgmt_hndsk_version_cbk);
+
+    return status;
+}
+
+/*
+ * Start the management (op-version) handshake with a peer.
+ *
+ * Creates a frame carrying peerctx, puts this node's UUID into a request
+ * dictionary under GD_PEER_ID_KEY and submits a GD_MGMT_HNDSK_VERSIONS
+ * request whose reply is handled by glusterd_mgmt_hndsk_version_cbk().
+ *
+ * Returns 0 when the request was submitted, non-zero otherwise. On failure
+ * the frame created here is destroyed before returning.
+ */
+int
+glusterd_mgmt_handshake(xlator_t *this, glusterd_peerctx_t *peerctx)
+{
+    call_frame_t *frame = NULL;
+    gf_mgmt_hndsk_req req = {
+        {
+            0,
+        },
+    };
+    glusterd_peerinfo_t *peerinfo = NULL;
+    dict_t *req_dict = NULL;
+    int ret = -1;
+
+    frame = create_frame(this, this->ctx->pool);
+    if (!frame) {
+        gf_smsg("glusterd", GF_LOG_WARNING, errno, GD_MSG_FRAME_CREATE_FAIL,
+                NULL);
+        goto out;
+    }
+
+    frame->local = peerctx;
+
+    req_dict = dict_new();
+    if (!req_dict) {
+        gf_smsg("glusterd", GF_LOG_WARNING, ENOMEM, GD_MSG_DICT_CREATE_FAIL,
+                NULL);
+        goto out;
+    }
+
+    ret = dict_set_dynstr(req_dict, GD_PEER_ID_KEY,
+                          gf_strdup(uuid_utoa(MY_UUID)));
+    if (ret) {
+        gf_msg(this->name, GF_LOG_ERROR, errno, GD_MSG_DICT_SET_FAILED,
+               "failed to set peer ID in dict");
+        goto out;
+    }
+
+    /* 'ret' is passed in the macro's errno slot on purpose: a serialize
+     * failure then leaves it non-zero for the cleanup at 'out'.
+     * NOTE(review): relies on the macro storing an error code there before
+     * jumping - confirm against its definition.
+     */
+    GF_PROTOCOL_DICT_SERIALIZE(this, req_dict, (&req.hndsk.hndsk_val),
+                               req.hndsk.hndsk_len, ret, out);
+
+    RCU_READ_LOCK;
+
+    peerinfo = glusterd_peerinfo_find_by_generation(peerctx->peerinfo_gen);
+    if (!peerinfo) {
+        RCU_READ_UNLOCK;
+        gf_msg_debug(THIS->name, 0, "Could not find peer %s(%s)",
+                     peerctx->peername, uuid_utoa(peerctx->peerid));
+        /* Nothing was sent: report failure so the frame is released below
+         * and the caller can react.
+         */
+        ret = -1;
+        goto out;
+    }
+
+    /* The submit result is propagated, as glusterd_peer_dump_version()
+     * does, so a request that was never sent is not reported as success.
+     */
+    ret = glusterd_submit_request(
+        peerinfo->rpc, &req, frame, &gd_clnt_mgmt_hndsk_prog,
+        GD_MGMT_HNDSK_VERSIONS, NULL, this, glusterd_mgmt_hndsk_version_cbk,
+        (xdrproc_t)xdr_gf_mgmt_hndsk_req);
+
+    RCU_READ_UNLOCK;
+
+out:
+    if (req_dict)
+        dict_unref(req_dict);
+
+    /* Serialized request buffer; freed the same way the versions callback
+     * frees the buffer it serializes for the ACK.
+     */
+    if (req.hndsk.hndsk_val)
+        GF_FREE(req.hndsk.hndsk_val);
+
+    if (ret && frame) {
+        /* peerctx belongs to the caller; detach it before destroying the
+         * frame, as the callbacks in this file do.
+         */
+        frame->local = NULL;
+        STACK_DESTROY(frame->root);
+    }
+
+    return ret;
+}
+
+/*
+ * Walk the program list advertised by a peer and attach the management and
+ * peer programs this node knows (gd_mgmt_prog, gd_peer_prog) whenever an
+ * entry matches both program number and version. Unmatched entries are
+ * reported at debug level; the programs finally in use are logged.
+ *
+ * Returns -1 when peerinfo or prog is NULL, 0 otherwise.
+ */
+int
+glusterd_set_clnt_mgmt_program(glusterd_peerinfo_t *peerinfo,
+                               gf_prog_detail *prog)
+{
+    gf_prog_detail *entry = NULL;
+    int matched = 0;
+
+    if (!peerinfo || !prog)
+        return -1;
+
+    for (entry = prog; entry; entry = entry->next) {
+        matched = 0;
+
+        if ((entry->prognum == gd_mgmt_prog.prognum) &&
+            (entry->progver == gd_mgmt_prog.progver)) {
+            peerinfo->mgmt = &gd_mgmt_prog;
+            matched = 1;
+        }
+
+        if ((entry->prognum == gd_peer_prog.prognum) &&
+            (entry->progver == gd_peer_prog.progver)) {
+            peerinfo->peer = &gd_peer_prog;
+            matched = 1;
+        }
+
+        if (!matched) {
+            gf_msg_debug("glusterd", 0,
+                         "%s (%" PRId64 ":%" PRId64 ") not supported",
+                         entry->progname, entry->prognum, entry->progver);
+        }
+    }
+
+    /* Report whichever programs the peer ends up with. */
+    if (peerinfo->mgmt) {
+        gf_msg("glusterd", GF_LOG_INFO, 0, GD_MSG_VERS_INFO,
+               "Using Program %s, Num (%d), Version (%d)",
+               peerinfo->mgmt->progname, peerinfo->mgmt->prognum,
+               peerinfo->mgmt->progver);
+    }
+
+    if (peerinfo->peer) {
+        gf_msg("glusterd", GF_LOG_INFO, 0, GD_MSG_VERS_INFO,
+               "Using Program %s, Num (%d), Version (%d)",
+               peerinfo->peer->progname, peerinfo->peer->prognum,
+               peerinfo->peer->progver);
+    }
+
+    if (peerinfo->mgmt_v3) {
+        gf_msg("glusterd", GF_LOG_INFO, 0, GD_MSG_VERS_INFO,
+               "Using Program %s, Num (%d), Version (%d)",
+               peerinfo->mgmt_v3->progname, peerinfo->mgmt_v3->prognum,
+               peerinfo->mgmt_v3->progver);
+    }
+
+    return 0;
+}
+
+/* Return _gf_true when the program list contains an entry whose number and
+ * version are GD_MGMT_HNDSK_PROGRAM / GD_MGMT_HNDSK_VERSION, _gf_false
+ * otherwise.
+ */
+static gf_boolean_t
+_mgmt_hndsk_prog_present(gf_prog_detail *prog)
+{
+    gf_prog_detail *entry = NULL;
+
+    GF_ASSERT(prog);
+
+    for (entry = prog; entry != NULL; entry = entry->next) {
+        if (entry->prognum != GD_MGMT_HNDSK_PROGRAM)
+            continue;
+
+        if (entry->progver == GD_MGMT_HNDSK_VERSION)
+            return _gf_true;
+    }
+
+    return _gf_false;
+}
+
+/*
+ * Callback for the GF_DUMP_DUMP request sent by glusterd_peer_dump_version().
+ *
+ * If the peer advertises the management handshake program, the op-version
+ * handshake is started with glusterd_mgmt_handshake(). Otherwise, provided
+ * conf->op_version is not above 1, the peer's programs are assigned
+ * directly, GF_EVENT_CHILD_UP is notified and, depending on
+ * peerctx->args.mode, the connected event is injected.
+ *
+ * On failure the peer's transport is disconnected. The frame of this
+ * request is always destroyed here and both state machines are always run.
+ * Always returns 0.
+ */
+int
+__glusterd_peer_dump_version_cbk(struct rpc_req *req, struct iovec *iov,
+                                 int count, void *myframe)
+{
+    int ret = -1;
+    gf_dump_rsp rsp = {
+        0,
+    };
+    xlator_t *this = NULL;
+    gf_prog_detail *trav = NULL;
+    gf_prog_detail *next = NULL;
+    call_frame_t *frame = NULL;
+    glusterd_peerinfo_t *peerinfo = NULL;
+    glusterd_peerctx_t *peerctx = NULL;
+    glusterd_conf_t *conf = NULL;
+    char msg[1024] = {
+        0,
+    };
+
+    this = THIS;
+    conf = this->private;
+    frame = myframe;
+    peerctx = frame->local;
+
+    RCU_READ_LOCK;
+
+    peerinfo = glusterd_peerinfo_find_by_generation(peerctx->peerinfo_gen);
+    if (!peerinfo) {
+        gf_msg_debug(this->name, 0, "Couldn't find peer %s(%s)",
+                     peerctx->peername, uuid_utoa(peerctx->peerid));
+        goto out;
+    }
+
+    /* 'ret' is still -1 from its initialiser on this path */
+    if (-1 == req->rpc_status) {
+        snprintf(msg, sizeof(msg),
+                 "Error through RPC layer, retry again later");
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_RPC_LAYER_ERROR, "%s", msg);
+        peerctx->errstr = gf_strdup(msg);
+        goto out;
+    }
+
+    ret = xdr_to_generic(*iov, &rsp, (xdrproc_t)xdr_gf_dump_rsp);
+    if (ret < 0) {
+        snprintf(msg, sizeof(msg), "Failed to decode XDR");
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_REQ_DECODE_FAIL, "%s", msg);
+        peerctx->errstr = gf_strdup(msg);
+        goto out;
+    }
+    /* NOTE(review): 'ret' is not reset here; the disconnect at 'out' on
+     * this path depends on xdr_to_generic() having returned non-zero -
+     * confirm.
+     */
+    if (-1 == rsp.op_ret) {
+        snprintf(msg, sizeof(msg),
+                 "Failed to get the 'versions' from remote server");
+        gf_msg(frame->this->name, GF_LOG_ERROR, 0, GD_MSG_VERS_GET_FAIL, "%s",
+               msg);
+        peerctx->errstr = gf_strdup(msg);
+        goto out;
+    }
+
+    if (_mgmt_hndsk_prog_present(rsp.prog)) {
+        gf_msg_debug(this->name, 0,
+                     "Proceeding to op-version handshake with peer %s",
+                     peerinfo->hostname);
+        /* The handshake creates its own frame; this one is still destroyed
+         * at the end of the function.
+         */
+        ret = glusterd_mgmt_handshake(this, peerctx);
+        goto out;
+    } else if (conf->op_version > 1) {
+        ret = -1;
+        snprintf(msg, sizeof(msg),
+                 "Peer %s does not support required op-version",
+                 peerinfo->hostname);
+        peerctx->errstr = gf_strdup(msg);
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_VERSION_UNSUPPORTED, "%s",
+               msg);
+        goto out;
+    }
+
+    /* Make sure we assign the proper program to peer */
+    ret = glusterd_set_clnt_mgmt_program(peerinfo, rsp.prog);
+    if (ret) {
+        gf_msg(this->name, GF_LOG_WARNING, 0, GD_MSG_MGMT_PGM_SET_FAIL,
+               "failed to set the mgmt program");
+        goto out;
+    }
+
+    /* The result of default_notify() is overwritten by 'ret = 0' below. */
+    ret = default_notify(this, GF_EVENT_CHILD_UP, NULL);
+
+    if (GD_MODE_ON == peerctx->args.mode) {
+        (void)glusterd_event_connected_inject(peerctx);
+        peerctx->args.req = NULL;
+    } else if (GD_MODE_SWITCH_ON == peerctx->args.mode) {
+        peerctx->args.mode = GD_MODE_ON;
+    } else {
+        gf_msg(this->name, GF_LOG_WARNING, 0, GD_MSG_UNKNOWN_MODE,
+               "unknown mode %d", peerctx->args.mode);
+    }
+
+    ret = 0;
+
+out:
+    if (ret != 0 && peerinfo)
+        rpc_transport_disconnect(peerinfo->rpc->conn.trans, _gf_false);
+
+    RCU_READ_UNLOCK;
+
+    glusterd_friend_sm();
+    glusterd_op_sm();
+
+    /* don't use GF_FREE, buffer was allocated by libc */
+    if (rsp.prog) {
+        trav = rsp.prog;
+        while (trav) {
+            /* remember the successor before this node is released */
+            next = trav->next;
+            free(trav->progname);
+            free(trav);
+            trav = next;
+        }
+    }
+
+    /* Detach peerctx from the frame before the frame is destroyed. */
+    frame->local = NULL;
+    STACK_DESTROY(frame->root);
+
+    return 0;
+}
+
+/* RPC callback for the dump request: runs __glusterd_peer_dump_version_cbk
+ * through glusterd_big_locked_cbk().
+ */
+int
+glusterd_peer_dump_version_cbk(struct rpc_req *req, struct iovec *iov,
+                               int count, void *myframe)
+{
+    int status = 0;
+
+    status = glusterd_big_locked_cbk(req, iov, count, myframe,
+                                     __glusterd_peer_dump_version_cbk);
+
+    return status;
+}
+
+/*
+ * Ask a peer which RPC programs it supports.
+ *
+ * Creates a frame carrying peerctx and submits a GF_DUMP_DUMP request over
+ * the peer's rpc; the reply is handled by glusterd_peer_dump_version_cbk().
+ * The 'rpc' argument is not used by this function.
+ *
+ * Returns the result of glusterd_submit_request(), or -1 when the frame
+ * cannot be created, peerctx is NULL or the peer cannot be found. On
+ * failure the frame created here is destroyed before returning.
+ */
+int
+glusterd_peer_dump_version(xlator_t *this, struct rpc_clnt *rpc,
+                           glusterd_peerctx_t *peerctx)
+{
+    call_frame_t *frame = NULL;
+    gf_dump_req req = {
+        0,
+    };
+    glusterd_peerinfo_t *peerinfo = NULL;
+    int ret = -1;
+
+    frame = create_frame(this, this->ctx->pool);
+    if (!frame) {
+        gf_smsg(this->name, GF_LOG_WARNING, errno, GD_MSG_FRAME_CREATE_FAIL,
+                NULL);
+        goto out;
+    }
+
+    frame->local = peerctx;
+    if (!peerctx) {
+        gf_smsg(this->name, GF_LOG_ERROR, errno, GD_MSG_INVALID_ARGUMENT, NULL);
+        goto out;
+    }
+
+    RCU_READ_LOCK;
+
+    peerinfo = glusterd_peerinfo_find_by_generation(peerctx->peerinfo_gen);
+    if (!peerinfo) {
+        RCU_READ_UNLOCK;
+        gf_msg_debug(this->name, 0, "Couldn't find peer %s(%s)",
+                     peerctx->peername, uuid_utoa(peerctx->peerid));
+        goto out;
+    }
+
+    req.gfs_id = 0xcafe;
+
+    ret = glusterd_submit_request(
+        peerinfo->rpc, &req, frame, &glusterd_dump_prog, GF_DUMP_DUMP, NULL,
+        this, glusterd_peer_dump_version_cbk, (xdrproc_t)xdr_gf_dump_req);
+
+    RCU_READ_UNLOCK;
+out:
+    if (ret && frame) {
+        /* peerctx belongs to the caller; detach it before destroying the
+         * frame, as the callbacks in this file do.
+         */
+        frame->local = NULL;
+        STACK_DESTROY(frame->root);
+    }
+
+    return ret;
+}
diff --git a/xlators/mgmt/glusterd/src/glusterd-hooks.c b/xlators/mgmt/glusterd/src/glusterd-hooks.c
new file mode 100644
index 00000000000..61c0f1c946f
--- /dev/null
+++ b/xlators/mgmt/glusterd/src/glusterd-hooks.c
@@ -0,0 +1,641 @@
+/*
+   Copyright (c) 2007-2012 Red Hat, Inc. <http://www.redhat.com>
+   This file is part of GlusterFS.
+
+   This file is licensed to you under your choice of the GNU Lesser
+   General Public License, version 3 or any later version (LGPLv3 or
+   later), or the GNU General Public License, version 2 (GPLv2), in all
+   cases as published by the Free Software Foundation.
+*/
+
+#include <glusterfs/glusterfs.h>
+#include <glusterfs/dict.h>
+#include <glusterfs/xlator.h>
+#include <glusterfs/logging.h>
+#include <glusterfs/run.h>
+#include <glusterfs/defaults.h>
+#include <glusterfs/syscall.h>
+#include <glusterfs/compat.h>
+#include <glusterfs/compat-errno.h>
+#include "glusterd.h"
+#include "glusterd-sm.h"
+#include "glusterd-op-sm.h"
+#include "glusterd-utils.h"
+#include "glusterd-store.h"
+#include "glusterd-hooks.h"
+#include "glusterd-messages.h"
+
+#include <fnmatch.h>
+
+/*
+ * Hook directory name for each glusterd operation, indexed by the
+ * operation's GD_OP_* value. An empty string marks an operation that has
+ * no hook directory: glusterd_hooks_create_hooks_directory() skips such
+ * entries. EMPTY is a helper macro local to this table and is undefined
+ * right after it.
+ */
+#define EMPTY ""
+char glusterd_hook_dirnames[GD_OP_MAX][256] = {
+    [GD_OP_NONE] = EMPTY,
+    [GD_OP_CREATE_VOLUME] = "create",
+    [GD_OP_START_BRICK] = EMPTY,
+    [GD_OP_STOP_BRICK] = EMPTY,
+    [GD_OP_DELETE_VOLUME] = "delete",
+    [GD_OP_START_VOLUME] = "start",
+    [GD_OP_STOP_VOLUME] = "stop",
+    [GD_OP_DEFRAG_VOLUME] = EMPTY,
+    [GD_OP_ADD_BRICK] = "add-brick",
+    [GD_OP_REMOVE_BRICK] = "remove-brick",
+    [GD_OP_REPLACE_BRICK] = EMPTY,
+    [GD_OP_SET_VOLUME] = "set",
+    [GD_OP_RESET_VOLUME] = "reset",
+    [GD_OP_SYNC_VOLUME] = EMPTY,
+    [GD_OP_LOG_ROTATE] = EMPTY,
+    [GD_OP_GSYNC_CREATE] = "gsync-create",
+    [GD_OP_GSYNC_SET] = EMPTY,
+    [GD_OP_PROFILE_VOLUME] = EMPTY,
+    [GD_OP_QUOTA] = EMPTY,
+    [GD_OP_STATUS_VOLUME] = EMPTY,
+    [GD_OP_REBALANCE] = EMPTY,
+    [GD_OP_HEAL_VOLUME] = EMPTY,
+    [GD_OP_STATEDUMP_VOLUME] = EMPTY,
+    [GD_OP_LIST_VOLUME] = EMPTY,
+    [GD_OP_CLEARLOCKS_VOLUME] = EMPTY,
+    [GD_OP_DEFRAG_BRICK_VOLUME] = EMPTY,
+    [GD_OP_RESET_BRICK] = EMPTY,
+};
+#undef EMPTY
+
+/*
+ * Tell whether a file found in a hooks directory should be executed.
+ * A script is enabled when its name begins with 'S' and it matches
+ * neither "*.rpmsave" nor "*.rpmnew".
+ */
+static gf_boolean_t
+glusterd_is_hook_enabled(char *script)
+{
+    if (script[0] != 'S')
+        return _gf_false;
+
+    if (fnmatch("*.rpmsave", script, 0) == 0)
+        return _gf_false;
+
+    if (fnmatch("*.rpmnew", script, 0) == 0)
+        return _gf_false;
+
+    return _gf_true;
+}
+
+/*
+ * Create the hooks directory tree below basedir:
+ *
+ *   <basedir>/hooks
+ *   <hooks dir for GLUSTERD_HOOK_VER>/<op>/{pre,post}
+ *
+ * where <op> ranges over every operation that has a non-empty entry in
+ * glusterd_hook_dirnames.
+ *
+ * basedir - directory under which "hooks" is created.
+ *
+ * Returns 0 on success and non-zero when a path does not fit into PATH_MAX
+ * or a directory cannot be created.
+ */
+int
+glusterd_hooks_create_hooks_directory(char *basedir)
+{
+    int ret = -1;
+    int op = GD_OP_NONE;
+    int type = GD_COMMIT_HOOK_NONE;
+    char version_dir[PATH_MAX] = {
+        0,
+    };
+    char path[PATH_MAX] = {
+        0,
+    };
+    char *cmd_subdir = NULL;
+    /* Sub-directory name per hook type; GD_COMMIT_HOOK_NONE has none. */
+    char type_subdir[GD_COMMIT_HOOK_MAX][256] = {{
+                                                     0,
+                                                 },
+                                                 "pre",
+                                                 "post"};
+    glusterd_conf_t *priv = NULL;
+    int32_t len = 0;
+
+    xlator_t *this = NULL;
+    this = THIS;
+    GF_ASSERT(this);
+    priv = this->private;
+
+    /* Reject a truncated path here as well, exactly like the snprintf()
+     * calls further down, instead of creating a wrong directory. */
+    len = snprintf(path, sizeof(path), "%s/hooks", basedir);
+    if ((len < 0) || ((size_t)len >= sizeof(path))) {
+        gf_smsg(this->name, GF_LOG_ERROR, errno, GD_MSG_COPY_FAIL, NULL);
+        ret = -1;
+        goto out;
+    }
+    ret = mkdir_p(path, 0755, _gf_true);
+    if (ret) {
+        gf_smsg(this->name, GF_LOG_CRITICAL, errno, GD_MSG_CREATE_DIR_FAILED,
+                "Path=%s", path, NULL);
+        goto out;
+    }
+
+    GLUSTERD_GET_HOOKS_DIR(version_dir, GLUSTERD_HOOK_VER, priv);
+    ret = mkdir_p(version_dir, 0755, _gf_true);
+    if (ret) {
+        gf_smsg(this->name, GF_LOG_CRITICAL, errno, GD_MSG_CREATE_DIR_FAILED,
+                "Directory=%s", version_dir, NULL);
+        goto out;
+    }
+
+    for (op = GD_OP_NONE + 1; op < GD_OP_MAX; op++) {
+        cmd_subdir = glusterd_hooks_get_hooks_cmd_subdir(op);
+        /* An empty name means the operation has no hooks. */
+        if (strlen(cmd_subdir) == 0)
+            continue;
+
+        len = snprintf(path, sizeof(path), "%s/%s", version_dir, cmd_subdir);
+        if ((len < 0) || ((size_t)len >= sizeof(path))) {
+            gf_smsg(this->name, GF_LOG_ERROR, errno, GD_MSG_COPY_FAIL, NULL);
+            ret = -1;
+            goto out;
+        }
+        ret = mkdir_p(path, 0755, _gf_true);
+        if (ret) {
+            gf_smsg(this->name, GF_LOG_CRITICAL, errno,
+                    GD_MSG_CREATE_DIR_FAILED, "Path=%s", path, NULL);
+            goto out;
+        }
+
+        /* One "pre" and one "post" directory per operation. */
+        for (type = GD_COMMIT_HOOK_PRE; type < GD_COMMIT_HOOK_MAX; type++) {
+            len = snprintf(path, sizeof(path), "%s/%s/%s", version_dir,
+                           cmd_subdir, type_subdir[type]);
+            if ((len < 0) || ((size_t)len >= sizeof(path))) {
+                gf_smsg(this->name, GF_LOG_ERROR, errno, GD_MSG_COPY_FAIL,
+                        NULL);
+                ret = -1;
+                goto out;
+            }
+            ret = mkdir_p(path, 0755, _gf_true);
+            if (ret) {
+                gf_smsg(this->name, GF_LOG_CRITICAL, errno,
+                        GD_MSG_CREATE_DIR_FAILED, "Path=%s", path, NULL);
+                goto out;
+            }
+        }
+    }
+
+    ret = 0;
+out:
+    return ret;
+}
+
+/*
+ * Return the hook directory name for an operation.
+ *
+ * op - operation to look up; expected to satisfy
+ *      GD_OP_NONE < op < GD_OP_MAX.
+ *
+ * Returns a pointer into glusterd_hook_dirnames. The string is empty for
+ * operations without hooks and for an out-of-range op.
+ */
+char *
+glusterd_hooks_get_hooks_cmd_subdir(glusterd_op_t op)
+{
+    GF_ASSERT((op > GD_OP_NONE) && (op < GD_OP_MAX));
+
+    /* Never index outside the table in case the assertion above does not
+     * abort (its definition is not visible here): hand back the empty
+     * GD_OP_NONE entry so that callers see "no hook directory". */
+    if ((op <= GD_OP_NONE) || (op >= GD_OP_MAX))
+        return glusterd_hook_dirnames[GD_OP_NONE];
+
+    return glusterd_hook_dirnames[op];
+}
+
+/* Append "--gd-workdir=<priv->workdir>" to the runner's argument list. */
+void
+glusterd_hooks_add_working_dir(runner_t *runner, glusterd_conf_t *priv)
+{
+    runner_argprintf(runner, "--gd-workdir=%s", priv->workdir);
+}
+
+/* Append "--volume-op=<op>" to the runner's argument list. */
+void
+glusterd_hooks_add_op(runner_t *runner, char *op)
+{
+    runner_argprintf(runner, "--volume-op=%s", op);
+}
+
+/* Append "--version=<GLUSTERD_HOOK_VER>" to the runner's argument list. */
+void
+glusterd_hooks_add_hooks_version(runner_t *runner)
+{
+    runner_argprintf(runner, "--version=%d", GLUSTERD_HOOK_VER);
+}
+
+/*
+ * Pass the value stored under "hooks_args" in dict, if any, to the hook
+ * script as one additional argument. Nothing is added when the key is
+ * absent or when this, dict or runner is NULL.
+ */
+static void
+glusterd_hooks_add_custom_args(dict_t *dict, runner_t *runner)
+{
+    xlator_t *this = THIS;
+    char *extra_args = NULL;
+
+    GF_VALIDATE_OR_GOTO("glusterd", this, out);
+    GF_VALIDATE_OR_GOTO(this->name, dict, out);
+    GF_VALIDATE_OR_GOTO(this->name, runner, out);
+
+    if (dict_get_str(dict, "hooks_args", &extra_args) != 0) {
+        gf_msg_debug(this->name, 0, "No Hooks Arguments.");
+    } else {
+        gf_msg_debug(this->name, 0, "Hooks Args = %s", extra_args);
+    }
+
+    /* Decided on the pointer itself, not on the lookup result. */
+    if (extra_args != NULL)
+        runner_argprintf(runner, "%s", extra_args);
+
+out:
+    return;
+}
+
+/*
+ * Build the arguments of a "volume set" hook script.
+ *
+ * Adds "-o" followed by one "<key>=<value>" argument for each consecutive
+ * key<N>/value<N> pair (N = 1, 2, ...) found in dict, then any custom
+ * "hooks_args". When one of the pairs enables shared storage, the
+ * "transport.address-family" option of this xlator is appended too,
+ * provided it is set.
+ *
+ * Returns 0 on success, non-zero when "count" is missing from dict and -1
+ * when it is zero.
+ */
+int
+glusterd_hooks_set_volume_args(dict_t *dict, runner_t *runner)
+{
+    xlator_t *this = THIS;
+    char query[1024] = {
+        0,
+    };
+    char *key = NULL;
+    char *value = NULL;
+    char *inet_family = NULL;
+    int count = 0;
+    int idx = 0;
+    int enables_shared_storage = 0;
+    int ret = -1;
+
+    GF_ASSERT(this);
+
+    ret = dict_get_int32(dict, "count", &count);
+    if (ret) {
+        gf_smsg(this->name, GF_LOG_ERROR, errno, GD_MSG_DICT_GET_FAILED,
+                "Key=count", NULL);
+        goto out;
+    }
+
+    /* This will not happen unless op_ctx
+     * is corrupted*/
+    if (count == 0) {
+        gf_smsg(this->name, GF_LOG_ERROR, errno, GD_MSG_INVALID_ENTRY, "count",
+                NULL);
+        goto out;
+    }
+
+    runner_add_arg(runner, "-o");
+
+    /* Walk key1/value1, key2/value2, ... and stop at the first index for
+     * which either half of the pair is missing. */
+    for (idx = 1;; idx++) {
+        snprintf(query, sizeof(query), "key%d", idx);
+        if (dict_get_str(dict, query, &key) != 0)
+            break;
+
+        snprintf(query, sizeof(query), "value%d", idx);
+        if (dict_get_str(dict, query, &value) != 0)
+            break;
+
+        runner_argprintf(runner, "%s=%s", key, value);
+
+        /* Prefix comparisons: remember whether shared storage is being
+         * switched on by this pair. */
+        if (strncmp(key, "cluster.enable-shared-storage",
+                    SLEN("cluster.enable-shared-storage")) != 0 &&
+            strncmp(key, "enable-shared-storage",
+                    SLEN("enable-shared-storage")) != 0)
+            continue;
+
+        if (strncmp(value, "enable", SLEN("enable")) == 0)
+            enables_shared_storage = 1;
+    }
+
+    glusterd_hooks_add_custom_args(dict, runner);
+
+    if (enables_shared_storage == 1) {
+        ret = dict_get_str_sizen(this->options, "transport.address-family",
+                                 &inet_family);
+        if (ret == 0) {
+            runner_argprintf(runner, "transport.address-family=%s",
+                             inet_family);
+        }
+    }
+
+    ret = 0;
+out:
+    return ret;
+}
+
+/*
+ * Append the operation-specific arguments of a hook script to runner.
+ *
+ * runner - runner whose argument list is extended.
+ * op     - operation the hook belongs to; operations not handled below add
+ *          nothing.
+ * op_ctx - operation context; read for "set" and "gsync-create".
+ * type   - pre- or post-commit hook; decides "--first" / "--last".
+ *
+ * Returns 0, except for GD_OP_SET_VOLUME where the result of
+ * glusterd_hooks_set_volume_args() is returned.
+ */
+static int
+glusterd_hooks_add_op_args(runner_t *runner, glusterd_op_t op, dict_t *op_ctx,
+                           glusterd_commit_hook_type_t type)
+{
+    glusterd_conf_t *priv = THIS->private;
+    glusterd_volinfo_t *vol = NULL;
+    gf_boolean_t truth = _gf_false;
+    int started = 0;
+    int ret = 0;
+
+    /* Number of volumes that are currently started. */
+    cds_list_for_each_entry(vol, &priv->volumes, vol_list)
+    {
+        if (glusterd_is_volume_started(vol))
+            started++;
+    }
+
+    if (op == GD_OP_START_VOLUME) {
+        /* "First" volume: none started before the commit, or exactly one
+         * started after it. */
+        truth = ((type == GD_COMMIT_HOOK_PRE && started == 0) ||
+                 (type == GD_COMMIT_HOOK_POST && started == 1))
+                    ? _gf_true
+                    : _gf_false;
+
+        runner_argprintf(runner, "--first=%s", truth ? "yes" : "no");
+
+        glusterd_hooks_add_hooks_version(runner);
+        glusterd_hooks_add_op(runner, "start");
+        glusterd_hooks_add_working_dir(runner, priv);
+    } else if (op == GD_OP_STOP_VOLUME) {
+        /* "Last" volume: exactly one started before the commit, or none
+         * started after it. */
+        truth = ((type == GD_COMMIT_HOOK_PRE && started == 1) ||
+                 (type == GD_COMMIT_HOOK_POST && started == 0))
+                    ? _gf_true
+                    : _gf_false;
+
+        runner_argprintf(runner, "--last=%s", truth ? "yes" : "no");
+    } else if (op == GD_OP_SET_VOLUME) {
+        ret = glusterd_hooks_set_volume_args(op_ctx, runner);
+        glusterd_hooks_add_working_dir(runner, priv);
+    } else if (op == GD_OP_GSYNC_CREATE) {
+        glusterd_hooks_add_custom_args(op_ctx, runner);
+    } else if (op == GD_OP_ADD_BRICK) {
+        glusterd_hooks_add_hooks_version(runner);
+        glusterd_hooks_add_op(runner, "add-brick");
+        glusterd_hooks_add_working_dir(runner, priv);
+    } else if (op == GD_OP_RESET_VOLUME) {
+        glusterd_hooks_add_hooks_version(runner);
+        glusterd_hooks_add_op(runner, "reset");
+        glusterd_hooks_add_working_dir(runner, priv);
+    }
+
+    return ret;
+}
+
+/*
+ * Run every enabled hook script found in hooks_path, in the order given by
+ * glusterd_compare_lines().
+ *
+ * hooks_path - directory holding the scripts.
+ * op         - operation the hooks belong to; selects the extra arguments.
+ * op_ctx     - operation context; must contain "volname".
+ * type       - pre- or post-commit hook.
+ *
+ * Each script is invoked as "<hooks_path>/<script> --volname=<volname>"
+ * plus the arguments added by glusterd_hooks_add_op_args(). A script that
+ * fails to execute is logged and does not stop the remaining ones.
+ *
+ * Returns 0 when the directory was processed, non-zero when the volume
+ * name is missing, the directory cannot be opened, memory runs out or the
+ * operation-specific arguments cannot be built.
+ */
+int
+glusterd_hooks_run_hooks(char *hooks_path, glusterd_op_t op, dict_t *op_ctx,
+                         glusterd_commit_hook_type_t type)
+{
+    xlator_t *this = NULL;
+    runner_t runner = {
+        0,
+    };
+    DIR *hookdir = NULL;
+    struct dirent *entry = NULL;
+    struct dirent scratch[2] = {
+        {
+            0,
+        },
+    };
+    char *volname = NULL;
+    char **lines = NULL;
+    char **resized = NULL;
+    int N = 8; /*arbitrary*/
+    int lineno = 0;
+    int line_count = 0;
+    int ret = -1;
+
+    this = THIS;
+
+    ret = dict_get_str(op_ctx, "volname", &volname);
+    if (ret) {
+        gf_msg(this->name, GF_LOG_CRITICAL, errno, GD_MSG_DICT_GET_FAILED,
+               "Failed to get volname "
+               "from operation context");
+        goto out;
+    }
+
+    hookdir = sys_opendir(hooks_path);
+    if (!hookdir) {
+        ret = -1;
+        gf_msg(this->name, GF_LOG_ERROR, errno, GD_MSG_DIR_OP_FAILED,
+               "Failed to open dir %s", hooks_path);
+        goto out;
+    }
+
+    lines = GF_CALLOC(1, N * sizeof(*lines), gf_gld_mt_charptr);
+    if (!lines) {
+        gf_smsg(this->name, GF_LOG_ERROR, errno, GD_MSG_NO_MEMORY, NULL);
+        ret = -1;
+        goto out;
+    }
+
+    ret = -1;
+    line_count = 0;
+
+    while ((entry = sys_readdir(hookdir, scratch))) {
+        if (gf_irrelevant_entry(entry))
+            continue;
+        if (line_count == N - 1) {
+            /* Grow through a temporary so that the array and the names
+             * collected so far can still be freed if this fails. */
+            resized = GF_REALLOC(lines, 2 * N * sizeof(char *));
+            if (!resized) {
+                gf_smsg(this->name, GF_LOG_ERROR, errno, GD_MSG_NO_MEMORY,
+                        NULL);
+                goto out;
+            }
+            lines = resized;
+            N *= 2;
+        }
+
+        if (glusterd_is_hook_enabled(entry->d_name)) {
+            lines[line_count] = gf_strdup(entry->d_name);
+            /* Do not count a failed copy: a NULL entry would later be
+             * sorted and executed as a script name. */
+            if (!lines[line_count]) {
+                gf_smsg(this->name, GF_LOG_ERROR, errno, GD_MSG_STRDUP_FAILED,
+                        "name=%s", entry->d_name, NULL);
+                goto out;
+            }
+            line_count++;
+        }
+    }
+
+    lines[line_count] = NULL;
+    /* Shrink to the size actually used; 'lines' stays valid on failure. */
+    resized = GF_REALLOC(lines, (line_count + 1) * sizeof(char *));
+    if (!resized)
+        goto out;
+    lines = resized;
+
+    qsort(lines, line_count, sizeof(*lines), glusterd_compare_lines);
+
+    for (lineno = 0; lineno < line_count; lineno++) {
+        runinit(&runner);
+        runner_argprintf(&runner, "%s/%s", hooks_path, lines[lineno]);
+        /*Add future command line arguments to hook scripts below*/
+        runner_argprintf(&runner, "--volname=%s", volname);
+        ret = glusterd_hooks_add_op_args(&runner, op, op_ctx, type);
+        if (ret) {
+            gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_ADD_OP_ARGS_FAIL,
+                   "Failed to add "
+                   "command specific arguments");
+            /* Release the arguments gathered for this script. */
+            runner_end(&runner);
+            goto out;
+        }
+
+        ret = runner_run_reuse(&runner);
+        if (ret) {
+            runner_log(&runner, this->name, GF_LOG_ERROR,
+                       "Failed to execute script");
+        } else {
+            runner_log(&runner, this->name, GF_LOG_INFO, "Ran script");
+        }
+        runner_end(&runner);
+    }
+
+    ret = 0;
+out:
+    if (lines) {
+        /* Only the first line_count slots own a string; the slot after
+         * them is either NULL or was never initialised. */
+        for (lineno = 0; lineno < line_count; lineno++)
+            GF_FREE(lines[lineno]);
+
+        GF_FREE(lines);
+    }
+
+    if (hookdir)
+        sys_closedir(hookdir);
+
+    return ret;
+}
+
+/*
+ * Queue a post-commit hook run for the hooks worker thread.
+ *
+ * A stub describing the run (script directory, operation, operation
+ * context) is built with glusterd_hooks_stub_init(), appended to the
+ * pending list under the hooks mutex, and the condition variable is
+ * signalled.
+ *
+ * Returns 0 on success or the failure code of glusterd_hooks_stub_init().
+ */
+int
+glusterd_hooks_post_stub_enqueue(char *scriptdir, glusterd_op_t op,
+                                 dict_t *op_ctx)
+{
+    glusterd_conf_t *conf = THIS->private;
+    glusterd_hooks_private_t *hooks_priv = conf->hooks_priv;
+    glusterd_hooks_stub_t *stub = NULL;
+    int ret = -1;
+
+    ret = glusterd_hooks_stub_init(&stub, scriptdir, op, op_ctx);
+    if (ret != 0)
+        return ret;
+
+    pthread_mutex_lock(&hooks_priv->mutex);
+    {
+        hooks_priv->waitcount++;
+        cds_list_add_tail(&stub->all_hooks, &hooks_priv->list);
+        pthread_cond_signal(&hooks_priv->cond);
+    }
+    pthread_mutex_unlock(&hooks_priv->mutex);
+
+    return 0;
+}
+
+/*
+ * Allocate and fill a hooks stub.
+ *
+ * stub      - out parameter; receives the new stub on success.
+ * scriptdir - hooks directory; duplicated into the stub.
+ * op        - operation the hooks belong to.
+ * op_ctx    - operation context; copied into the stub with
+ *             dict_copy_with_ref().
+ *
+ * Returns 0 on success. On failure returns -1, logs the failure and hands
+ * whatever was allocated to glusterd_hooks_stub_cleanup().
+ */
+int
+glusterd_hooks_stub_init(glusterd_hooks_stub_t **stub, char *scriptdir,
+                         glusterd_op_t op, dict_t *op_ctx)
+{
+    xlator_t *this = THIS;
+    glusterd_hooks_stub_t *new_stub = NULL;
+    int ret = -1;
+
+    GF_ASSERT(this);
+    GF_ASSERT(stub);
+    if (stub == NULL)
+        goto out;
+
+    new_stub = GF_CALLOC(1, sizeof(*new_stub), gf_gld_mt_hooks_stub_t);
+    if (new_stub == NULL) {
+        gf_smsg(this->name, GF_LOG_ERROR, errno, GD_MSG_NO_MEMORY, NULL);
+        goto out;
+    }
+
+    CDS_INIT_LIST_HEAD(&new_stub->all_hooks);
+    new_stub->op = op;
+
+    new_stub->scriptdir = gf_strdup(scriptdir);
+    if (new_stub->scriptdir == NULL) {
+        gf_smsg(this->name, GF_LOG_ERROR, errno, GD_MSG_STRDUP_FAILED,
+                "scriptdir=%s", scriptdir, NULL);
+        goto out;
+    }
+
+    new_stub->op_ctx = dict_copy_with_ref(op_ctx, new_stub->op_ctx);
+    if (new_stub->op_ctx == NULL) {
+        gf_smsg(this->name, GF_LOG_ERROR, errno, GD_MSG_DICT_COPY_FAIL, NULL);
+        goto out;
+    }
+
+    *stub = new_stub;
+    ret = 0;
+out:
+    if (ret != 0) {
+        gf_smsg(this->name, GF_LOG_ERROR, 0, GD_MSG_POST_HOOK_STUB_INIT_FAIL,
+                NULL);
+        /* new_stub may still be NULL here; the cleanup routine copes. */
+        glusterd_hooks_stub_cleanup(new_stub);
+    }
+
+    return ret;
+}
+
+/*
+ * Release a hooks stub: drop the reference on its operation context (if
+ * any), then free the script directory string and the stub itself.
+ * A NULL stub only produces a warning.
+ */
+void
+glusterd_hooks_stub_cleanup(glusterd_hooks_stub_t *stub)
+{
+    if (stub != NULL) {
+        if (stub->op_ctx != NULL)
+            dict_unref(stub->op_ctx);
+
+        GF_FREE(stub->scriptdir);
+        GF_FREE(stub);
+        return;
+    }
+
+    gf_msg_callingfn(THIS->name, GF_LOG_WARNING, 0, GD_MSG_HOOK_STUB_NULL,
+                     "hooks_stub is NULL");
+}
+
+/*
+ * Thread body of the post-commit hooks worker started by
+ * glusterd_hooks_spawn_worker().
+ *
+ * args - the glusterd xlator; installed as THIS for this thread.
+ *
+ * Loops forever: waits until the pending list is non-empty, takes the
+ * first stub off the list, runs its post-commit hooks and frees the stub.
+ * The loop has no exit, so the trailing return is never reached.
+ */
+static void *
+hooks_worker(void *args)
+{
+    glusterd_conf_t *conf = NULL;
+    glusterd_hooks_private_t *hooks_priv = NULL;
+    glusterd_hooks_stub_t *stub = NULL;
+
+    THIS = args;
+    conf = THIS->private;
+    hooks_priv = conf->hooks_priv;
+
+    for (;;) {
+        pthread_mutex_lock(&hooks_priv->mutex);
+        {
+            /* Re-check the list after every wake-up. */
+            while (cds_list_empty(&hooks_priv->list)) {
+                pthread_cond_wait(&hooks_priv->cond, &hooks_priv->mutex);
+            }
+            /* Dequeue the oldest stub (entries are added at the tail). */
+            stub = cds_list_entry(hooks_priv->list.next, glusterd_hooks_stub_t,
+                                  all_hooks);
+            cds_list_del_init(&stub->all_hooks);
+            hooks_priv->waitcount--;
+        }
+        pthread_mutex_unlock(&hooks_priv->mutex);
+
+        /* The scripts run outside the mutex; the result is ignored. */
+        glusterd_hooks_run_hooks(stub->scriptdir, stub->op, stub->op_ctx,
+                                 GD_COMMIT_HOOK_POST);
+        glusterd_hooks_stub_cleanup(stub);
+    }
+
+    return NULL;
+}
+
+/*
+ * Allocate and initialise the state shared between the hooks worker and
+ * its producers: mutex, condition variable, empty pending list and a zero
+ * wait count.
+ *
+ * new - out parameter; receives the allocated state on success.
+ *
+ * Returns 0 on success, -1 when new is NULL or the allocation fails.
+ */
+int
+glusterd_hooks_priv_init(glusterd_hooks_private_t **new)
+{
+    xlator_t *this = THIS;
+    glusterd_hooks_private_t *state = NULL;
+
+    GF_ASSERT(this);
+
+    if (new == NULL) {
+        gf_smsg(this->name, GF_LOG_ERROR, errno, GD_MSG_INVALID_ARGUMENT, NULL);
+        return -1;
+    }
+
+    state = GF_CALLOC(1, sizeof(*state), gf_gld_mt_hooks_priv_t);
+    if (state == NULL) {
+        gf_smsg(this->name, GF_LOG_ERROR, errno, GD_MSG_NO_MEMORY, NULL);
+        return -1;
+    }
+
+    pthread_mutex_init(&state->mutex, NULL);
+    pthread_cond_init(&state->cond, NULL);
+    CDS_INIT_LIST_HEAD(&state->list);
+    state->waitcount = 0;
+
+    *new = state;
+    return 0;
+}
+
+/*
+ * Set up the hooks state, publish it in the glusterd configuration of
+ * 'this' and start the "gdhooks" worker thread running hooks_worker().
+ *
+ * Returns 0 on success, otherwise the failure code of
+ * glusterd_hooks_priv_init() or gf_thread_create().
+ */
+int
+glusterd_hooks_spawn_worker(xlator_t *this)
+{
+    glusterd_hooks_private_t *hooks_priv = NULL;
+    glusterd_conf_t *conf = NULL;
+    int ret = -1;
+
+    ret = glusterd_hooks_priv_init(&hooks_priv);
+    if (ret != 0)
+        return ret;
+
+    /* The worker reads conf->hooks_priv, so publish it before starting. */
+    conf = this->private;
+    conf->hooks_priv = hooks_priv;
+
+    ret = gf_thread_create(&hooks_priv->worker, NULL, hooks_worker,
+                           (void *)this, "gdhooks");
+    if (ret != 0) {
+        gf_msg(this->name, GF_LOG_CRITICAL, errno, GD_MSG_SPAWN_THREADS_FAIL,
+               "Failed to spawn post "
+               "hooks worker thread");
+    }
+
+    return ret;
+}
diff --git a/xlators/mgmt/glusterd/src/glusterd-hooks.h b/xlators/mgmt/glusterd/src/glusterd-hooks.h
new file mode 100644
index 00000000000..f8b887b9bd7
--- /dev/null
+++ b/xlators/mgmt/glusterd/src/glusterd-hooks.h
@@ -0,0 +1,88 @@
+/*
+   Copyright (c) 2006-2012 Red Hat, Inc. <http://www.redhat.com>
+   This file is part of GlusterFS.
+
+   This file is licensed to you under your choice of the GNU Lesser
+   General Public License, version 3 or any later version (LGPLv3 or
+   later), or the GNU General Public License, version 2 (GPLv2), in all
+   cases as published by the Free Software Foundation.
+*/
+#ifndef _GLUSTERD_HOOKS_H_
+#define _GLUSTERD_HOOKS_H_
+
+#include <fnmatch.h>
+
+/*
+ * Write "<priv->workdir>/hooks/<version>" into path, which must provide
+ * room for PATH_MAX bytes. When snprintf() reports an error, path is set
+ * to the empty string. A result longer than PATH_MAX is silently
+ * truncated: only len < 0 is checked.
+ */
+#define GLUSTERD_GET_HOOKS_DIR(path, version, priv)                            \
+    do {                                                                       \
+        int32_t len;                                                           \
+        len = snprintf(path, PATH_MAX, "%s/hooks/%d", priv->workdir, version); \
+        if (len < 0) {                                                         \
+            path[0] = 0;                                                       \
+        }                                                                      \
+    } while (0)
+
+/* Version of the hooks layout; used as the directory name below "hooks"
+ * and passed to scripts as "--version". */
+#define GLUSTERD_HOOK_VER 1
+
+/* fnmatch() pattern matched by is_key_glusterd_hooks_friendly(). */
+#define GD_HOOKS_SPECIFIC_KEY "user.*"
+
+/* Kind of hook relative to the commit of an operation. The values are
+ * also used as array indices for the hook sub-directory names. */
+typedef enum glusterd_commit_hook_type {
+    GD_COMMIT_HOOK_NONE = 0,
+    GD_COMMIT_HOOK_PRE,  /* scripts in the "pre" sub-directory */
+    GD_COMMIT_HOOK_POST, /* scripts in the "post" sub-directory */
+    GD_COMMIT_HOOK_MAX   /* number of values; not a hook type */
+} glusterd_commit_hook_type_t;
+
+/* State shared between the hooks worker thread and the code that queues
+ * post-commit hook runs. */
+typedef struct hooks_private {
+    struct cds_list_head list; /* pending stubs, linked via all_hooks */
+    pthread_mutex_t mutex;     /* held while list/waitcount are changed */
+    pthread_cond_t cond;       /* signalled when a stub is queued */
+    pthread_t worker;          /* thread running the queued hooks */
+    int waitcount;  // debug purposes
+} glusterd_hooks_private_t;
+
+/* One queued post-commit hook run. */
+typedef struct hooks_stub {
+    struct cds_list_head all_hooks; /* link in hooks_private.list */
+    char *scriptdir;                /* owned copy of the hooks directory */
+    dict_t *op_ctx;                 /* operation context; unref'd on cleanup */
+    glusterd_op_t op;               /* operation the hooks belong to */
+
+} glusterd_hooks_stub_t;
+
+/*
+ * Tell whether key matches GD_HOOKS_SPECIFIC_KEY ("user.*"), matched with
+ * fnmatch() and FNM_NOESCAPE. A match is also reported in the debug log.
+ */
+static inline gf_boolean_t
+is_key_glusterd_hooks_friendly(char *key)
+{
+    /* This is very specific to hooks friendly behavior */
+    if (fnmatch(GD_HOOKS_SPECIFIC_KEY, key, FNM_NOESCAPE) != 0)
+        return _gf_false;
+
+    gf_msg_debug(THIS->name, 0, "user namespace key %s", key);
+    return _gf_true;
+}
+
+/* Create "<basedir>/hooks" and the per-operation pre/post directories. */
+int
+glusterd_hooks_create_hooks_directory(char *basedir);
+
+/* Hook directory name of an operation; empty when it has no hooks. */
+char *
+glusterd_hooks_get_hooks_cmd_subdir(glusterd_op_t op);
+
+/* Run the enabled scripts found in hooks_path for the given operation. */
+int
+glusterd_hooks_run_hooks(char *hooks_path, glusterd_op_t op, dict_t *op_ctx,
+                         glusterd_commit_hook_type_t type);
+/* Initialise the hooks state and start the worker thread. */
+int
+glusterd_hooks_spawn_worker(xlator_t *this);
+
+/* Allocate a stub describing one post-commit hook run. */
+int
+glusterd_hooks_stub_init(glusterd_hooks_stub_t **stub, char *scriptdir,
+                         glusterd_op_t op, dict_t *op_ctx);
+/* Free a stub and what it owns. */
+void
+glusterd_hooks_stub_cleanup(glusterd_hooks_stub_t *stub);
+
+/* Queue a post-commit hook run for the worker thread. */
+int
+glusterd_hooks_post_stub_enqueue(char *scriptdir, glusterd_op_t op,
+                                 dict_t *op_ctx);
+/* Allocate and initialise the shared hooks state. */
+int
+glusterd_hooks_priv_init(glusterd_hooks_private_t **new);
+#endif
diff --git a/xlators/mgmt/glusterd/src/glusterd-locks.c b/xlators/mgmt/glusterd/src/glusterd-locks.c
new file mode 100644
index 00000000000..11523f2854b
--- /dev/null
+++ b/xlators/mgmt/glusterd/src/glusterd-locks.c
@@ -0,0 +1,870 @@
+/*
+   Copyright (c) 2013-2014 Red Hat, Inc. <http://www.redhat.com>
+   This file is part of GlusterFS.
+
+   This file is licensed to you under your choice of the GNU Lesser
+   General Public License, version 3 or any later version (LGPLv3 or
+   later), or the GNU General Public License, version 2 (GPLv2), in all
+   cases as published by the Free Software Foundation.
+*/
+#include <glusterfs/common-utils.h>
+#include "cli1-xdr.h"
+#include "xdr-generic.h"
+#include "glusterd.h"
+#include "glusterd-op-sm.h"
+#include "glusterd-store.h"
+#include "glusterd-utils.h"
+#include "glusterd-volgen.h"
+#include "glusterd-locks.h"
+#include "glusterd-errno.h"
+#include <glusterfs/run.h>
+#include <glusterfs/syscall.h>
+#include "glusterd-messages.h"
+
+#include <signal.h>
+
+/* Matches the number of entity entries in valid_types below (the NULL
+ * terminator excluded). */
+#define GF_MAX_LOCKING_ENTITIES 3
+
+/* Entities on which an mgmt_v3 lock can be held. Each entry gives the     *
+ * entity type and its default value. To make a new entity lockable, add   *
+ * an entry here; the table must stay terminated by the {NULL} entry,      *
+ * which is where glusterd_mgmt_v3_is_type_valid() stops scanning.         */
+glusterd_valid_entities valid_types[] = {
+    {"vol", _gf_true},
+    {"snap", _gf_false},
+    {"global", _gf_false},
+    {NULL},
+};
+
+/* Returns _gf_true when type names one of the entities listed in
+ * valid_types, _gf_false otherwise */
+static gf_boolean_t
+glusterd_mgmt_v3_is_type_valid(char *type)
+{
+    glusterd_valid_entities *entity = NULL;
+
+    GF_ASSERT(type);
+
+    /* The table ends with an entry whose type is NULL */
+    for (entity = valid_types; entity->type != NULL; entity++) {
+        if (strcmp(type, entity->type) == 0)
+            return _gf_true;
+    }
+
+    return _gf_false;
+}
+
+/* Creates the dict that holds the global mgmt_v3 locks; called when
+ * glusterd is spawned. Returns 0 on success and -1 when the dict cannot
+ * be allocated */
+int32_t
+glusterd_mgmt_v3_lock_init()
+{
+    xlator_t *this = THIS;
+    glusterd_conf_t *priv = NULL;
+
+    GF_ASSERT(this);
+    priv = this->private;
+    GF_ASSERT(priv);
+
+    priv->mgmt_v3_lock = dict_new();
+
+    return (priv->mgmt_v3_lock != NULL) ? 0 : -1;
+}
+
+/* Drops the reference on the global mgmt_v3 lock dict, if one exists;
+ * called when glusterd cleanup is performed */
+void
+glusterd_mgmt_v3_lock_fini()
+{
+    xlator_t *this = THIS;
+    glusterd_conf_t *priv = NULL;
+
+    GF_ASSERT(this);
+    priv = this->private;
+    GF_ASSERT(priv);
+
+    if (priv->mgmt_v3_lock == NULL)
+        return;
+
+    dict_unref(priv->mgmt_v3_lock);
+}
+
+/* Creates the dict that holds the global mgmt_v3 lock timers; called when
+ * glusterd is spawned. Returns 0 on success and -1 when the xlator or its
+ * private data is missing or the dict cannot be allocated */
+int32_t
+glusterd_mgmt_v3_lock_timer_init()
+{
+    xlator_t *this = THIS;
+    glusterd_conf_t *priv = NULL;
+    int32_t ret = -1;
+
+    GF_VALIDATE_OR_GOTO("glusterd", this, out);
+
+    priv = this->private;
+    GF_VALIDATE_OR_GOTO(this->name, priv, out);
+
+    priv->mgmt_v3_lock_timer = dict_new();
+    if (priv->mgmt_v3_lock_timer != NULL)
+        ret = 0;
+
+out:
+    return ret;
+}
+
+/* Drops the reference on the global mgmt_v3 lock timer dict, if one
+ * exists; called when glusterd cleanup is performed */
+void
+glusterd_mgmt_v3_lock_timer_fini()
+{
+    xlator_t *this = THIS;
+    glusterd_conf_t *priv = NULL;
+
+    GF_VALIDATE_OR_GOTO("glusterd", this, out);
+
+    priv = this->private;
+    GF_VALIDATE_OR_GOTO(this->name, priv, out);
+
+    if (priv->mgmt_v3_lock_timer != NULL)
+        dict_unref(priv->mgmt_v3_lock_timer);
+
+out:
+    return;
+}
+
+/* Looks up the owner of the mgmt_v3 lock stored under key. When a lock
+ * object is found, its owner is copied into *uuid; otherwise *uuid is left
+ * untouched. Returns -1 when key or uuid is NULL and 0 in every other
+ * case, including "no lock held" */
+static int32_t
+glusterd_get_mgmt_v3_lock_owner(char *key, uuid_t *uuid)
+{
+    xlator_t *this = THIS;
+    glusterd_conf_t *priv = NULL;
+    glusterd_mgmt_v3_lock_obj *held = NULL;
+    int32_t ret = -1;
+
+    GF_ASSERT(this);
+    priv = this->private;
+    GF_ASSERT(priv);
+
+    if ((key == NULL) || (uuid == NULL)) {
+        gf_msg(this->name, GF_LOG_ERROR, EINVAL, GD_MSG_INVALID_ENTRY,
+               "key or uuid is null.");
+        ret = -1;
+        goto out;
+    }
+
+    /* A missing entry is not an error: it means nobody holds the lock */
+    if (dict_get_bin(priv->mgmt_v3_lock, key, (void **)&held) == 0)
+        gf_uuid_copy(*uuid, held->lock_owner);
+
+    ret = 0;
+out:
+    gf_msg_trace(this->name, 0, "Returning %d", ret);
+    return ret;
+}
+
+/* Releases the locks on the first locked_count elements of the given     *
+ * entity type. The element names are read from dict under the keys       *
+ * <type>name1 ... <type>name<locked_count>. Every element is attempted   *
+ * even if an earlier one fails; the return value is 0 when all succeeded *
+ * and otherwise the error of the last element that failed.               */
+static int32_t
+glusterd_release_multiple_locks_per_entity(dict_t *dict, uuid_t uuid,
+                                           int32_t locked_count, char *type)
+{
+    xlator_t *this = THIS;
+    char name_buf[PATH_MAX] = "";
+    char *name = NULL;
+    int32_t idx = -1;
+    int32_t keylen = -1;
+    int32_t ret = -1;
+    int32_t op_ret = 0;
+
+    GF_ASSERT(this);
+    GF_ASSERT(dict);
+    GF_ASSERT(type);
+
+    if (locked_count == 0) {
+        gf_msg_debug(this->name, 0, "No %s locked as part of this transaction",
+                     type);
+        goto out;
+    }
+
+    /* Release all the locks held */
+    for (idx = 0; idx < locked_count; idx++) {
+        /* Keys are 1-based: volname1, volname2 or snapname1, snapname2 */
+        keylen = snprintf(name_buf, sizeof(name_buf), "%sname%d", type,
+                          idx + 1);
+
+        ret = dict_get_strn(dict, name_buf, keylen, &name);
+        if (ret == 0) {
+            ret = glusterd_mgmt_v3_unlock(name, uuid, type);
+            if (ret) {
+                gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_MGMTV3_UNLOCK_FAIL,
+                       "Failed to release lock for %s.", name);
+                op_ret = ret;
+            }
+        } else {
+            gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_DICT_GET_FAILED,
+                   "Unable to get %s locked_count = %d", name_buf,
+                   locked_count);
+            op_ret = ret;
+        }
+    }
+
+out:
+    gf_msg_trace(this->name, 0, "Returning %d", op_ret);
+    return op_ret;
+}
+
+/* Acquires locks on count elements of one entity type, e.g. on several   *
+ * volumes when type is "vol". The element names are read from dict under *
+ * the keys <type>name1 ... <type>name<count>. The operation is all or     *
+ * nothing: when one element cannot be locked, the locks taken so far are *
+ * released again and -1 is returned. Returns 0 when every element was    *
+ * locked.                                                                 */
+static int32_t
+glusterd_acquire_multiple_locks_per_entity(dict_t *dict, uuid_t uuid,
+                                           uint32_t *op_errno, int32_t count,
+                                           char *type)
+{
+    xlator_t *this = THIS;
+    char name_buf[PATH_MAX] = "";
+    char *name = NULL;
+    int32_t idx = -1;
+    int32_t keylen = -1;
+    int32_t ret = -1;
+    int32_t locked_count = 0;
+
+    GF_ASSERT(this);
+    GF_ASSERT(dict);
+    GF_ASSERT(type);
+
+    /* Locking one element after other */
+    for (idx = 0; idx < count; idx++) {
+        /* Keys are 1-based: volname1, volname2 or snapname1, snapname2 */
+        keylen = snprintf(name_buf, sizeof(name_buf), "%sname%d", type,
+                          idx + 1);
+
+        ret = dict_get_strn(dict, name_buf, keylen, &name);
+        if (ret) {
+            gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_DICT_GET_FAILED,
+                   "Unable to get %s count = %d", name_buf, count);
+            break;
+        }
+
+        ret = glusterd_mgmt_v3_lock(name, uuid, op_errno, type);
+        if (ret) {
+            gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_MGMTV3_LOCK_GET_FAIL,
+                   "Failed to acquire lock for %s %s "
+                   "on behalf of %s. Reversing "
+                   "this transaction",
+                   type, name, uuid_utoa(uuid));
+            break;
+        }
+        locked_count++;
+    }
+
+    if (count != locked_count) {
+        /* One element could not be locked: undo the ones that were and
+         * report failure regardless of how the rollback went */
+        ret = glusterd_release_multiple_locks_per_entity(dict, uuid,
+                                                         locked_count, type);
+        if (ret) {
+            gf_msg(this->name, GF_LOG_ERROR, 0,
+                   GD_MSG_MULTIPLE_LOCK_RELEASE_FAIL,
+                   "Failed to release multiple %s locks", type);
+        }
+        ret = -1;
+    } else {
+        /* All locking ops went successfully */
+        ret = 0;
+    }
+
+    gf_msg_trace(this->name, 0, "Returning %d", ret);
+    return ret;
+}
+
+/* Releases the locks held on one entity type. Nothing is done when       *
+ * "hold_<type>_locks" in dict (default: default_value) says that no locks *
+ * are held. Otherwise, when "<type>count" is present in dict, that many   *
+ * elements are unlocked; when it is absent, the single element named by   *
+ * "<type>name" is unlocked. For example, with type "vol" this unlocks a   *
+ * single volume or multiple volumes. Returns 0 on success.                */
+static int32_t
+glusterd_mgmt_v3_unlock_entity(dict_t *dict, uuid_t uuid, char *type,
+                               gf_boolean_t default_value)
+{
+    xlator_t *this = THIS;
+    char name_buf[PATH_MAX] = "";
+    char *name = NULL;
+    int32_t count = -1;
+    int32_t keylen = -1;
+    int32_t ret = -1;
+    gf_boolean_t hold_locks = _gf_false;
+
+    GF_ASSERT(this);
+    GF_ASSERT(dict);
+    GF_ASSERT(type);
+
+    snprintf(name_buf, sizeof(name_buf), "hold_%s_locks", type);
+    hold_locks = dict_get_str_boolean(dict, name_buf, default_value);
+
+    if (hold_locks == _gf_false) {
+        /* Locks were not held for this particular entity *
+         * Hence nothing to release */
+        ret = 0;
+        goto out;
+    }
+
+    /* Looking for volcount or snapcount in the dict */
+    keylen = snprintf(name_buf, sizeof(name_buf), "%scount", type);
+    if (dict_get_int32n(dict, name_buf, keylen, &count) == 0) {
+        /* Unlocking one element name after another */
+        ret = glusterd_release_multiple_locks_per_entity(dict, uuid, count,
+                                                         type);
+        if (ret) {
+            gf_msg(this->name, GF_LOG_ERROR, 0,
+                   GD_MSG_MULTIPLE_LOCK_RELEASE_FAIL,
+                   "Failed to release all %s locks", type);
+            goto out;
+        }
+    } else {
+        /* count is not present. Only one *
+         * element name needs to be unlocked */
+        keylen = snprintf(name_buf, sizeof(name_buf), "%sname", type);
+        ret = dict_get_strn(dict, name_buf, keylen, &name);
+        if (ret) {
+            gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_DICT_GET_FAILED,
+                   "Unable to fetch %sname", type);
+            goto out;
+        }
+
+        ret = glusterd_mgmt_v3_unlock(name, uuid, type);
+        if (ret) {
+            gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_MGMTV3_UNLOCK_FAIL,
+                   "Failed to release lock for %s %s "
+                   "on behalf of %s.",
+                   type, name, uuid_utoa(uuid));
+            goto out;
+        }
+    }
+
+    ret = 0;
+out:
+    gf_msg_trace(this->name, 0, "Returning %d", ret);
+    return ret;
+}
+
+/* Lock every element of one entity kind (<type>, e.g. "vol") that the
+ * request dict names.
+ *
+ * The dict is consulted as follows:
+ *   - "hold_<type>_locks" (falling back to default_value) decides whether
+ *     this entity kind is locked at all; when it is false nothing is done
+ *     and 0 is returned.
+ *   - "<type>count", when present, means several elements have to be
+ *     locked; that is delegated to
+ *     glusterd_acquire_multiple_locks_per_entity().
+ *   - otherwise "<type>name" must be present and that single element is
+ *     locked through glusterd_mgmt_v3_lock().
+ *
+ * Returns 0 on success, non-zero when a key is missing or a lock could
+ * not be taken. */
+static int32_t
+glusterd_mgmt_v3_lock_entity(dict_t *dict, uuid_t uuid, uint32_t *op_errno,
+                             char *type, gf_boolean_t default_value)
+{
+    xlator_t *this = THIS;
+    char dict_key[PATH_MAX] = "";
+    char *entity_name = NULL;
+    gf_boolean_t want_locks = _gf_false;
+    int32_t entity_count = -1;
+    int32_t key_len = 0;
+    int32_t ret = -1;
+
+    GF_ASSERT(this);
+    GF_ASSERT(dict);
+    GF_ASSERT(type);
+
+    snprintf(dict_key, sizeof(dict_key), "hold_%s_locks", type);
+    want_locks = dict_get_str_boolean(dict, dict_key, default_value);
+    if (want_locks == _gf_false) {
+        /* Not holding locks for this particular entity */
+        ret = 0;
+        goto out;
+    }
+
+    /* A volcount/snapcount style key selects the multi-element path */
+    key_len = snprintf(dict_key, sizeof(dict_key), "%scount", type);
+    if (dict_get_int32n(dict, dict_key, key_len, &entity_count) == 0) {
+        /* Locking one element name after another */
+        ret = glusterd_acquire_multiple_locks_per_entity(dict, uuid, op_errno,
+                                                         entity_count, type);
+        if (ret) {
+            gf_msg(this->name, GF_LOG_ERROR, 0,
+                   GD_MSG_MULTIPLE_LOCK_ACQUIRE_FAIL,
+                   "Failed to acquire all %s locks", type);
+        }
+        goto out;
+    }
+
+    /* No count in the dict: exactly one element name has to be locked */
+    key_len = snprintf(dict_key, sizeof(dict_key), "%sname", type);
+    ret = dict_get_strn(dict, dict_key, key_len, &entity_name);
+    if (ret) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_DICT_GET_FAILED,
+               "Unable to fetch %sname", type);
+        goto out;
+    }
+
+    ret = glusterd_mgmt_v3_lock(entity_name, uuid, op_errno, type);
+    if (ret) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_MGMTV3_LOCK_GET_FAIL,
+               "Failed to acquire lock for %s %s "
+               "on behalf of %s.",
+               type, entity_name, uuid_utoa(uuid));
+    }
+
+out:
+    gf_msg_trace(this->name, 0, "Returning %d", ret);
+    return ret;
+}
+
+/* Release the locks of every entity kind listed in valid_types (volume,
+ * snaps etc.) on behalf of <uuid>.
+ *
+ * All entity kinds are attempted even when one of them fails. The return
+ * value is 0 when everything was released, otherwise the error of the last
+ * entity kind that failed. A NULL dict is rejected with -1. */
+int32_t
+glusterd_multiple_mgmt_v3_unlock(dict_t *dict, uuid_t uuid)
+{
+    xlator_t *this = THIS;
+    int32_t idx = 0;
+    int32_t rc = 0;
+    int32_t ret = -1;
+
+    GF_ASSERT(this);
+
+    if (dict == NULL) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_DICT_EMPTY, "dict is null.");
+        goto out;
+    }
+
+    /* Assume success; any failing entity kind overrides this below */
+    ret = 0;
+    for (idx = 0; valid_types[idx].type != NULL; idx++) {
+        rc = glusterd_mgmt_v3_unlock_entity(dict, uuid, valid_types[idx].type,
+                                            valid_types[idx].default_value);
+        if (rc == 0)
+            continue;
+
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_MULTIPLE_LOCK_RELEASE_FAIL,
+               "Unable to unlock all %s", valid_types[idx].type);
+        ret = rc;
+    }
+
+out:
+    gf_msg_debug(this->name, 0, "Returning %d", ret);
+    return ret;
+}
+
+/* Acquire the locks of every entity kind listed in valid_types (volume,
+ * snaps etc.) on behalf of <uuid>.
+ *
+ * Entity kinds are locked in table order and the first failure stops the
+ * walk. Success (0) is reported only when GF_MAX_LOCKING_ENTITIES entity
+ * kinds were locked; otherwise the kinds locked so far are unlocked again,
+ * in the same order, and -1 is returned. A NULL dict is rejected with -1. */
+int32_t
+glusterd_multiple_mgmt_v3_lock(dict_t *dict, uuid_t uuid, uint32_t *op_errno)
+{
+    xlator_t *this = THIS;
+    int32_t acquired = 0;
+    int32_t idx = 0;
+    int32_t ret = -1;
+
+    GF_ASSERT(this);
+
+    if (dict == NULL) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_DICT_EMPTY, "dict is null.");
+        ret = -1;
+        goto out;
+    }
+
+    /* Locking one entity after other; 'acquired' doubles as table index */
+    while (valid_types[acquired].type != NULL) {
+        ret = glusterd_mgmt_v3_lock_entity(dict, uuid, op_errno,
+                                           valid_types[acquired].type,
+                                           valid_types[acquired].default_value);
+        if (ret != 0) {
+            gf_msg(this->name, GF_LOG_ERROR, 0,
+                   GD_MSG_MULTIPLE_LOCK_ACQUIRE_FAIL, "Unable to lock all %s",
+                   valid_types[acquired].type);
+            break;
+        }
+        acquired++;
+    }
+
+    if (acquired == GF_MAX_LOCKING_ENTITIES) {
+        /* If all locking ops went successfully, return as success */
+        ret = 0;
+        goto out;
+    }
+
+    /* If we failed to lock one entity, unlock others and return failure */
+    for (idx = 0; idx < acquired; idx++) {
+        ret = glusterd_mgmt_v3_unlock_entity(dict, uuid, valid_types[idx].type,
+                                             valid_types[idx].default_value);
+        if (ret != 0) {
+            gf_msg(this->name, GF_LOG_ERROR, 0,
+                   GD_MSG_MULTIPLE_LOCK_RELEASE_FAIL, "Unable to unlock all %s",
+                   valid_types[idx].type);
+        }
+    }
+    ret = -1;
+
+out:
+    gf_msg_debug(this->name, 0, "Returning %d", ret);
+    return ret;
+}
+
+/* Acquire the mgmt_v3 lock on entity <name> of kind <type> for <uuid>.
+ *
+ * The lock owner is stored in priv->mgmt_v3_lock under "<name>_<type>".
+ * A timer is then armed that calls gd_mgmt_v3_unlock_timer_cbk() after
+ * priv->mgmt_v3_lock_timeout seconds (reset to GF_LOCK_TIMER here), and its
+ * bookkeeping is stored in priv->mgmt_v3_lock_timer under the same key.
+ * The timer's data is a heap copy of the key, freed by whoever tears the
+ * timer down.
+ *
+ * Acquisition is all-or-nothing: if anything fails after the owner has been
+ * stored, the owner entry is removed again so that a failed call never
+ * leaves a lock behind that nobody will release.
+ *
+ * @name      entity name (e.g. a volume name); must not be NULL
+ * @uuid      identity the lock is taken for
+ * @op_errno  set to EG_ANOTRANS when the lock is already held; may be NULL
+ * @type      entity kind, validated by glusterd_mgmt_v3_is_type_valid()
+ *
+ * Returns 0 on success and a non-zero value on failure. */
+int32_t
+glusterd_mgmt_v3_lock(const char *name, uuid_t uuid, uint32_t *op_errno,
+                      char *type)
+{
+    char key[PATH_MAX] = "";
+    int keylen = 0;
+    int32_t ret = -1;
+    glusterd_mgmt_v3_lock_obj *lock_obj = NULL;
+    glusterd_mgmt_v3_lock_timer *mgmt_lock_timer = NULL;
+    glusterd_conf_t *priv = NULL;
+    gf_boolean_t is_valid = _gf_true;
+    uuid_t owner = {0};
+    xlator_t *this = NULL;
+    char *bt = NULL;
+    struct timespec delay = {0};
+    char *key_dup = NULL;
+    glusterfs_ctx_t *mgmt_lock_timer_ctx = NULL;
+    xlator_t *mgmt_lock_timer_xl = NULL;
+
+    this = THIS;
+    GF_ASSERT(this);
+
+    priv = this->private;
+    GF_ASSERT(priv);
+
+    if (!name || !type) {
+        gf_msg(this->name, GF_LOG_ERROR, EINVAL, GD_MSG_INVALID_ENTRY,
+               "name or type is null.");
+        ret = -1;
+        goto out;
+    }
+
+    is_valid = glusterd_mgmt_v3_is_type_valid(type);
+    if (is_valid != _gf_true) {
+        gf_msg_callingfn(this->name, GF_LOG_ERROR, EINVAL, GD_MSG_INVALID_ENTRY,
+                         "Invalid entity. Cannot perform locking "
+                         "operation on %s types",
+                         type);
+        ret = -1;
+        goto out;
+    }
+
+    /* A length mismatch means snprintf failed or the key was truncated */
+    keylen = snprintf(key, sizeof(key), "%s_%s", name, type);
+    if (keylen != strlen(name) + 1 + strlen(type)) {
+        ret = -1;
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_CREATE_KEY_FAIL,
+               "Unable to create key");
+        goto out;
+    }
+
+    gf_msg_debug(this->name, 0, "Trying to acquire lock of %s for %s", key,
+                 uuid_utoa(uuid));
+
+    ret = glusterd_get_mgmt_v3_lock_owner(key, &owner);
+    if (ret) {
+        gf_msg_debug(this->name, 0, "Unable to get mgmt_v3 lock owner");
+        goto out;
+    }
+
+    /* If the lock has already been held for the given volume
+     * we fail */
+    if (!gf_uuid_is_null(owner)) {
+        gf_msg_callingfn(this->name, GF_LOG_WARNING, 0,
+                         GD_MSG_LOCK_ALREADY_HELD, "Lock for %s held by %s",
+                         name, uuid_utoa(owner));
+        ret = -1;
+        if (op_errno)
+            *op_errno = EG_ANOTRANS;
+        goto out;
+    }
+
+    lock_obj = GF_MALLOC(sizeof(glusterd_mgmt_v3_lock_obj),
+                         gf_common_mt_mgmt_v3_lock_obj_t);
+    if (!lock_obj) {
+        ret = -1;
+        goto out;
+    }
+
+    gf_uuid_copy(lock_obj->lock_owner, uuid);
+
+    ret = dict_set_bin(priv->mgmt_v3_lock, key, lock_obj,
+                       sizeof(glusterd_mgmt_v3_lock_obj));
+    if (ret) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_DICT_SET_FAILED,
+               "Unable to set lock owner in mgmt_v3 lock");
+        GF_FREE(lock_obj);
+        goto out;
+    }
+
+    /* The lock is now visible in priv->mgmt_v3_lock (which owns lock_obj).
+     * Every failure below must go through undo_lock so the entry is
+     * removed again. */
+
+    mgmt_lock_timer = GF_CALLOC(1, sizeof(glusterd_mgmt_v3_lock_timer),
+                                gf_common_mt_mgmt_v3_lock_timer_t);
+    if (!mgmt_lock_timer)
+        goto undo_lock;
+
+    mgmt_lock_timer->xl = THIS;
+    /*changing to default timeout value*/
+    priv->mgmt_v3_lock_timeout = GF_LOCK_TIMER;
+
+    mgmt_lock_timer_xl = mgmt_lock_timer->xl;
+    if (!mgmt_lock_timer_xl) {
+        GF_FREE(mgmt_lock_timer);
+        goto undo_lock;
+    }
+
+    mgmt_lock_timer_ctx = mgmt_lock_timer_xl->ctx;
+    if (!mgmt_lock_timer_ctx) {
+        GF_FREE(mgmt_lock_timer);
+        goto undo_lock;
+    }
+
+    /* The timer callback gets its own heap copy of the key; without it
+     * the callback would have nothing to work on. */
+    key_dup = gf_strdup(key);
+    if (!key_dup) {
+        GF_FREE(mgmt_lock_timer);
+        goto undo_lock;
+    }
+
+    delay.tv_sec = priv->mgmt_v3_lock_timeout;
+    delay.tv_nsec = 0;
+
+    mgmt_lock_timer->timer = gf_timer_call_after(
+        mgmt_lock_timer_ctx, delay, gd_mgmt_v3_unlock_timer_cbk, key_dup);
+    if (!mgmt_lock_timer->timer) {
+        /* Without a timer nobody would ever free key_dup or expire a
+         * stale lock, so treat this as a failed acquisition. */
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_NO_MEMORY,
+               "Unable to arm unlock timer for %s", key);
+        GF_FREE(key_dup);
+        GF_FREE(mgmt_lock_timer);
+        goto undo_lock;
+    }
+
+    ret = dict_set_bin(priv->mgmt_v3_lock_timer, key, mgmt_lock_timer,
+                       sizeof(glusterd_mgmt_v3_lock_timer));
+    if (ret) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_DICT_SET_FAILED,
+               "Unable to set timer in mgmt_v3 lock");
+        /* The timer is already armed and points at key_dup. Only free
+         * the key once the timer is known to be cancelled; otherwise the
+         * callback could still dereference it. */
+        if (gf_timer_call_cancel(mgmt_lock_timer_ctx,
+                                 mgmt_lock_timer->timer) == 0)
+            GF_FREE(key_dup);
+        GF_FREE(mgmt_lock_timer);
+        goto undo_lock;
+    }
+
+    /* Saving the backtrace into the pre-allocated buffer, ctx->btbuf.
+     * 'key' is reused for the backtrace key from here on; key_dup keeps
+     * the lock key. Failure to save the backtrace is not fatal. */
+    if ((bt = gf_backtrace_save(NULL))) {
+        snprintf(key, sizeof(key), "debug.last-success-bt-%s", key_dup);
+        ret = dict_set_dynstr_with_alloc(priv->mgmt_v3_lock, key, bt);
+        if (ret)
+            gf_msg(this->name, GF_LOG_WARNING, 0, GD_MSG_DICT_SET_FAILED,
+                   "Failed to save "
+                   "the back trace for lock %s granted to %s",
+                   key_dup, uuid_utoa(uuid));
+    }
+
+    gf_msg_debug(this->name, 0, "Lock for %s successfully held by %s", key_dup,
+                 uuid_utoa(uuid));
+
+    ret = 0;
+    goto out;
+
+undo_lock:
+    /* 'key' still holds the lock key here: all jumps to this label happen
+     * before it is reused for the backtrace key. */
+    dict_deln(priv->mgmt_v3_lock, key, keylen);
+    ret = -1;
+out:
+    gf_msg_trace(this->name, 0, "Returning %d", ret);
+    return ret;
+}
+
+/*
+ * Timer callback armed by glusterd_mgmt_v3_lock(). It removes a lock that
+ * was never released explicitly, so that a failed unlock does not leave a
+ * stale lock behind.
+ *
+ * @data is the heap-allocated lock key ("<name>_<type>") that was handed to
+ * gf_timer_call_after(). The callback:
+ *   - deletes the lock owner and its "debug.last-success-bt-<key>" backtrace
+ *     entry from conf->mgmt_v3_lock,
+ *   - looks up the timer bookkeeping in conf->mgmt_v3_lock_timer and, when a
+ *     timer is recorded there, cancels it, removes the bookkeeping entry
+ *     and frees the key.
+ */
+void
+gd_mgmt_v3_unlock_timer_cbk(void *data)
+{
+    xlator_t *this = NULL;
+    glusterd_conf_t *conf = NULL;
+    glusterd_mgmt_v3_lock_timer *mgmt_lock_timer = NULL;
+    char *key = NULL;
+    int keylen = 0;
+    char bt_key[PATH_MAX] = "";
+    int bt_key_len = 0;
+    int32_t ret = -1;
+    glusterfs_ctx_t *mgmt_lock_timer_ctx = NULL;
+    xlator_t *mgmt_lock_timer_xl = NULL;
+    void *timer_data = NULL;
+
+    this = THIS;
+    GF_VALIDATE_OR_GOTO("glusterd", this, out);
+
+    conf = this->private;
+    GF_VALIDATE_OR_GOTO(this->name, conf, out);
+
+    /* Bail out instead of handing NULL to strlen() below */
+    GF_VALIDATE_OR_GOTO(this->name, data, out);
+    key = (char *)data;
+
+    keylen = strlen(key);
+    dict_deln(conf->mgmt_v3_lock, key, keylen);
+
+    bt_key_len = snprintf(bt_key, PATH_MAX, "debug.last-success-bt-%s", key);
+    if (bt_key_len != SLEN("debug.last-success-bt-") + keylen) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_CREATE_KEY_FAIL,
+               "Unable to create backtrace "
+               "key");
+        goto out;
+    }
+
+    dict_deln(conf->mgmt_v3_lock, bt_key, bt_key_len);
+
+    ret = dict_get_bin(conf->mgmt_v3_lock_timer, key,
+                       (void **)&mgmt_lock_timer);
+    if (ret) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_DICT_GET_FAILED,
+               "Unable to get lock owner in mgmt_v3 lock");
+    }
+
+out:
+    if (mgmt_lock_timer && mgmt_lock_timer->timer) {
+        mgmt_lock_timer_xl = mgmt_lock_timer->xl;
+        GF_VALIDATE_OR_GOTO(this->name, mgmt_lock_timer_xl, ret_function);
+
+        mgmt_lock_timer_ctx = mgmt_lock_timer_xl->ctx;
+        GF_VALIDATE_OR_GOTO(this->name, mgmt_lock_timer_ctx, ret_function);
+
+        /* Remember the key buffer first: cancelling may release the timer
+         * object, and deleting the dict entry releases mgmt_lock_timer. */
+        timer_data = mgmt_lock_timer->timer->data;
+
+        /* Log while the key is still valid; it is freed at the end */
+        gf_log(this->name, GF_LOG_INFO,
+               "unlock timer is cancelled for volume_type"
+               " %s",
+               key);
+
+        gf_timer_call_cancel(mgmt_lock_timer_ctx, mgmt_lock_timer->timer);
+
+        /* The bookkeeping entry is stored under the lock key, not under
+         * the backtrace key. mgmt_lock_timer must not be touched after
+         * this call. */
+        dict_deln(conf->mgmt_v3_lock_timer, key, keylen);
+
+        GF_FREE(timer_data);
+    }
+
+ret_function:
+
+    return;
+}
+
+/* Release the mgmt_v3 lock on entity <name> of kind <type> held by <uuid>.
+ *
+ * The lock must exist and be owned by <uuid>, otherwise the call fails.
+ * On release the owner entry and the "debug.last-success-bt-<key>" entry
+ * are removed from priv->mgmt_v3_lock, the expiry timer armed at lock time
+ * is cancelled and its bookkeeping removed from priv->mgmt_v3_lock_timer.
+ * If a volume called <name> exists and is flagged stage_deleted, the flag
+ * is cleared again.
+ *
+ * @name  entity name; must not be NULL
+ * @uuid  identity that is expected to own the lock
+ * @type  entity kind, validated by glusterd_mgmt_v3_is_type_valid()
+ *
+ * Returns 0 on success and a non-zero value on failure. */
+int32_t
+glusterd_mgmt_v3_unlock(const char *name, uuid_t uuid, char *type)
+{
+    char key[PATH_MAX] = "";
+    char key_dup[PATH_MAX] = "";
+    int keylen;
+    int32_t ret = -1;
+    gf_boolean_t is_valid = _gf_true;
+    glusterd_conf_t *priv = NULL;
+    glusterd_volinfo_t *volinfo = NULL;
+    glusterd_mgmt_v3_lock_timer *mgmt_lock_timer = NULL;
+    uuid_t owner = {0};
+    xlator_t *this = NULL;
+    glusterfs_ctx_t *mgmt_lock_timer_ctx = NULL;
+    xlator_t *mgmt_lock_timer_xl = NULL;
+    gf_timer_t *timer = NULL;
+    void *timer_data = NULL;
+
+    this = THIS;
+    GF_ASSERT(this);
+
+    priv = this->private;
+    GF_ASSERT(priv);
+
+    if (!name || !type) {
+        gf_msg(this->name, GF_LOG_ERROR, EINVAL, GD_MSG_INVALID_ENTRY,
+               "name or type is null.");
+        ret = -1;
+        goto out;
+    }
+
+    is_valid = glusterd_mgmt_v3_is_type_valid(type);
+    if (is_valid != _gf_true) {
+        gf_msg_callingfn(this->name, GF_LOG_ERROR, EINVAL, GD_MSG_INVALID_ENTRY,
+                         "Invalid entity. Cannot perform unlocking "
+                         "operation on %s types",
+                         type);
+        ret = -1;
+        goto out;
+    }
+
+    /* A length mismatch means snprintf failed or the key was truncated */
+    keylen = snprintf(key, sizeof(key), "%s_%s", name, type);
+    if (keylen != strlen(name) + 1 + strlen(type)) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_CREATE_KEY_FAIL,
+               "Unable to create key");
+        ret = -1;
+        goto out;
+    }
+
+    gf_msg_debug(this->name, 0, "Trying to release lock of %s %s for %s as %s",
+                 type, name, uuid_utoa(uuid), key);
+
+    ret = glusterd_get_mgmt_v3_lock_owner(key, &owner);
+    if (ret) {
+        gf_msg_debug(this->name, 0, "Unable to get mgmt_v3 lock owner");
+        goto out;
+    }
+
+    if (gf_uuid_is_null(owner)) {
+        gf_msg_callingfn(this->name, GF_LOG_WARNING, 0, GD_MSG_LOCK_NOT_HELD,
+                         "Lock for %s %s not held", type, name);
+        ret = -1;
+        goto out;
+    }
+
+    /* Only the owner may release the lock */
+    ret = gf_uuid_compare(uuid, owner);
+    if (ret) {
+        gf_msg_callingfn(this->name, GF_LOG_WARNING, 0,
+                         GD_MSG_LOCK_OWNER_MISMATCH,
+                         "Lock owner mismatch. "
+                         "Lock for %s %s held by %s",
+                         type, name, uuid_utoa(owner));
+        goto out;
+    }
+
+    /* Removing the mgmt_v3 lock from the global list */
+    dict_deln(priv->mgmt_v3_lock, key, keylen);
+
+    ret = dict_get_bin(priv->mgmt_v3_lock_timer, key,
+                       (void **)&mgmt_lock_timer);
+    if (ret) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_DICT_GET_FAILED,
+               "Unable to get mgmt lock key in mgmt_v3 lock");
+        goto out;
+    }
+
+    /* 'key' is about to be reused for the backtrace key; keep the lock
+     * key around for the timer bookkeeping below. */
+    (void)snprintf(key_dup, sizeof(key_dup), "%s", key);
+
+    /* Remove the backtrace key as well */
+    ret = snprintf(key, sizeof(key), "debug.last-success-bt-%s", key_dup);
+    if (ret != SLEN("debug.last-success-bt-") + keylen) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_CREATE_KEY_FAIL,
+               "Unable to create backtrace "
+               "key");
+        ret = -1;
+        goto out;
+    }
+    dict_deln(priv->mgmt_v3_lock, key, ret);
+
+    gf_msg_debug(this->name, 0, "Lock for %s %s successfully released", type,
+                 name);
+
+    /* Release owner reference which was held during lock */
+    if (mgmt_lock_timer && mgmt_lock_timer->timer) {
+        ret = -1;
+        mgmt_lock_timer_xl = mgmt_lock_timer->xl;
+        GF_VALIDATE_OR_GOTO(this->name, mgmt_lock_timer_xl, out);
+
+        mgmt_lock_timer_ctx = mgmt_lock_timer_xl->ctx;
+        GF_VALIDATE_OR_GOTO(this->name, mgmt_lock_timer_ctx, out);
+        ret = 0;
+
+        /* Cancel first, free the key afterwards. Freeing the key while
+         * the timer is still armed would let the timer thread run the
+         * callback on a dangling pointer. When the cancel does not
+         * succeed the callback may already be using the key, so it is
+         * left alone. */
+        timer = mgmt_lock_timer->timer;
+        timer_data = timer->data;
+        if (gf_timer_call_cancel(mgmt_lock_timer_ctx, timer) == 0)
+            GF_FREE(timer_data);
+        dict_deln(priv->mgmt_v3_lock_timer, key_dup, keylen);
+    }
+
+    /* The lookup result is deliberately ignored: only a volume that is
+     * actually found is inspected below. */
+    (void)glusterd_volinfo_find(name, &volinfo);
+    if (volinfo && volinfo->stage_deleted) {
+        /* this indicates a volume still exists and the volume delete
+         * operation has failed in some of the phases, need to ensure
+         * stage_deleted flag is set back to false
+         */
+        volinfo->stage_deleted = _gf_false;
+        gf_log(this->name, GF_LOG_INFO,
+               "Volume %s still exist, setting "
+               "stage deleted flag to false for the volume",
+               volinfo->volname);
+    }
+    ret = 0;
+out:
+
+    gf_msg_trace(this->name, 0, "Returning %d", ret);
+    return ret;
+}
diff --git a/xlators/mgmt/glusterd/src/glusterd-locks.h b/xlators/mgmt/glusterd/src/glusterd-locks.h
new file mode 100644
index 00000000000..44667cebd3d
--- /dev/null
+++ b/xlators/mgmt/glusterd/src/glusterd-locks.h
@@ -0,0 +1,57 @@
+/*
+   Copyright (c) 2013-2014 Red Hat, Inc. <http://www.redhat.com>
+   This file is part of GlusterFS.
+
+   This file is licensed to you under your choice of the GNU Lesser
+   General Public License, version 3 or any later version (LGPLv3 or
+   later), or the GNU General Public License, version 2 (GPLv2), in all
+   cases as published by the Free Software Foundation.
+*/
+#ifndef _GLUSTERD_LOCKS_H_
+#define _GLUSTERD_LOCKS_H_
+
+/* Value stored per held mgmt_v3 lock: records which uuid the lock was
+ * taken for. */
+typedef struct glusterd_mgmt_v3_lock_object_ {
+    uuid_t lock_owner; /* uuid the lock was acquired on behalf of */
+} glusterd_mgmt_v3_lock_obj;
+
+/* Bookkeeping for the expiry timer that accompanies a held mgmt_v3 lock. */
+typedef struct glusterd_mgmt_v3_lock_timer_ {
+    gf_timer_t *timer; /* timer that fires gd_mgmt_v3_unlock_timer_cbk() */
+    xlator_t *xl;      /* xlator whose ctx the timer is registered with */
+} glusterd_mgmt_v3_lock_timer;
+
+/* One row of the table of entity kinds that mgmt_v3 locking understands.
+ * The table is walked until a row with a NULL type is reached. */
+typedef struct glusterd_mgmt_v3_lock_valid_entities {
+    char *type;                 /* Entity type like vol, snap */
+    gf_boolean_t default_value; /* The default value that  *
+                                 * determines if the locks *
+                                 * should be held for that *
+                                 * entity */
+} glusterd_valid_entities;
+
+int32_t
+glusterd_mgmt_v3_lock_init();
+
+void
+glusterd_mgmt_v3_lock_fini();
+
+int32_t
+glusterd_mgmt_v3_lock_timer_init();
+
+void
+glusterd_mgmt_v3_lock_timer_fini();
+
+int32_t
+glusterd_mgmt_v3_lock(const char *key, uuid_t uuid, uint32_t *op_errno,
+                      char *type);
+
+int32_t
+glusterd_mgmt_v3_unlock(const char *key, uuid_t uuid, char *type);
+
+int32_t
+glusterd_multiple_mgmt_v3_lock(dict_t *dict, uuid_t uuid, uint32_t *op_errno);
+
+int32_t
+glusterd_multiple_mgmt_v3_unlock(dict_t *dict, uuid_t uuid);
+
+void
+gd_mgmt_v3_unlock_timer_cbk(void *data);
+#endif
diff --git a/xlators/mgmt/glusterd/src/glusterd-log-ops.c b/xlators/mgmt/glusterd/src/glusterd-log-ops.c
new file mode 100644
index 00000000000..34abf35cb00
--- /dev/null
+++ b/xlators/mgmt/glusterd/src/glusterd-log-ops.c
@@ -0,0 +1,290 @@
+/*
+   Copyright (c) 2011-2012 Red Hat, Inc. <http://www.redhat.com>
+   This file is part of GlusterFS.
+
+   This file is licensed to you under your choice of the GNU Lesser
+   General Public License, version 3 or any later version (LGPLv3 or
+   later), or the GNU General Public License, version 2 (GPLv2), in all
+   cases as published by the Free Software Foundation.
+*/
+#include <glusterfs/common-utils.h>
+#include "cli1-xdr.h"
+#include "xdr-generic.h"
+#include "glusterd.h"
+#include "glusterd-op-sm.h"
+#include "glusterd-store.h"
+#include "glusterd-utils.h"
+#include "glusterd-volgen.h"
+#include "glusterd-messages.h"
+#include <glusterfs/syscall.h>
+
+#include <signal.h>
+
+/* Handle a log-rotate request coming from the CLI.
+ *
+ * The request is XDR-decoded, its serialized dictionary (if any) is turned
+ * back into a dict, and "volname" is required to be present. A "rotate-key"
+ * holding the current time is added to the dict before the operation is
+ * started through glusterd_op_begin_synctask().
+ *
+ * On any failure an error response is sent back to the CLI; when no more
+ * specific text was prepared it reads "Operation failed". The XDR-allocated
+ * dictionary buffer is always released with free(). */
+int
+__glusterd_handle_log_rotate(rpcsvc_request_t *req)
+{
+    xlator_t *this = NULL;
+    dict_t *dict = NULL;
+    char *volname = NULL;
+    glusterd_op_t cli_op = GD_OP_LOG_ROTATE;
+    gf_cli_req cli_req = {{
+        0,
+    }};
+    char err_str[64] = {
+        0,
+    };
+    int32_t ret = -1;
+
+    GF_ASSERT(req);
+    this = THIS;
+    GF_ASSERT(this);
+
+    ret = xdr_to_generic(req->msg[0], &cli_req, (xdrproc_t)xdr_gf_cli_req);
+    if (ret < 0) {
+        /* the payload could not be decoded */
+        req->rpc_err = GARBAGE_ARGS;
+        gf_smsg(this->name, GF_LOG_ERROR, errno, GD_MSG_GARBAGE_ARGS, NULL);
+        goto out;
+    }
+
+    if (cli_req.dict.dict_len != 0) {
+        /* Rebuild the dictionary from its wire form */
+        dict = dict_new();
+
+        ret = dict_unserialize(cli_req.dict.dict_val, cli_req.dict.dict_len,
+                               &dict);
+        if (ret < 0) {
+            gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_DICT_UNSERIALIZE_FAIL,
+                   "failed to "
+                   "unserialize req-buffer to dictionary");
+            snprintf(err_str, sizeof(err_str),
+                     "Unable to decode the "
+                     "command");
+            goto out;
+        }
+    }
+
+    ret = dict_get_str(dict, "volname", &volname);
+    if (ret != 0) {
+        snprintf(err_str, sizeof(err_str), "Failed to get volume name");
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_DICT_GET_FAILED, "%s",
+               err_str);
+        goto out;
+    }
+
+    gf_msg(this->name, GF_LOG_INFO, 0, GD_MSG_LOG_ROTATE_REQ_RECVD,
+           "Received log rotate req "
+           "for volume %s",
+           volname);
+
+    /* The timestamp later becomes the suffix of the rotated log file */
+    ret = dict_set_uint64(dict, "rotate-key", (uint64_t)gf_time());
+    if (ret != 0)
+        goto out;
+
+    ret = glusterd_op_begin_synctask(req, cli_op, dict);
+
+out:
+    if (ret != 0) {
+        if (!*err_str)
+            snprintf(err_str, sizeof(err_str), "Operation failed");
+        ret = glusterd_op_send_cli_response(cli_op, ret, 0, req, dict,
+                                            err_str);
+    }
+
+    /* dict_val was allocated by the XDR decoder, hence plain free() */
+    free(cli_req.dict.dict_val);
+    return ret;
+}
+
+/* RPC entry point for log rotate: hands the request and the real handler,
+ * __glusterd_handle_log_rotate(), to glusterd_big_locked_handler() and
+ * returns its result.
+ * NOTE(review): presumably the wrapper runs the handler while holding
+ * glusterd's big lock - confirm against glusterd_big_locked_handler(). */
+int
+glusterd_handle_log_rotate(rpcsvc_request_t *req)
+{
+    return glusterd_big_locked_handler(req, __glusterd_handle_log_rotate);
+}
+
+/* op-sm */
+/* Staging (validation) step of log rotate.
+ *
+ * Checks that "volname" names an existing, started volume. When the dict
+ * also carries "brick", that brick must belong to the volume; when it does
+ * not, staging succeeds because the rotate then applies to all bricks.
+ *
+ * Returns 0 when the request is acceptable. On the volume/brick validation
+ * failures *op_errstr is set to a gf_strdup()'ed description. */
+int
+glusterd_op_stage_log_rotate(dict_t *dict, char **op_errstr)
+{
+    glusterd_volinfo_t *volinfo = NULL;
+    char *volname = NULL;
+    char *brick = NULL;
+    char errbuf[2048] = {0};
+    int ret = -1;
+
+    ret = dict_get_str(dict, "volname", &volname);
+    if (ret != 0) {
+        gf_msg("glusterd", GF_LOG_ERROR, 0, GD_MSG_DICT_GET_FAILED,
+               "Unable to get volume name");
+        goto out;
+    }
+
+    ret = glusterd_volinfo_find(volname, &volinfo);
+    if (ret != 0) {
+        snprintf(errbuf, sizeof(errbuf), "Volume %s does not exist", volname);
+        gf_msg("glusterd", GF_LOG_ERROR, 0, GD_MSG_VOL_NOT_FOUND, "%s",
+               errbuf);
+        *op_errstr = gf_strdup(errbuf);
+        goto out;
+    }
+
+    if (glusterd_is_volume_started(volinfo) == _gf_false) {
+        snprintf(errbuf, sizeof(errbuf),
+                 "Volume %s needs to be started before"
+                 " log rotate.",
+                 volname);
+        gf_msg("glusterd", GF_LOG_ERROR, 0, GD_MSG_VOL_NOT_STARTED, "%s",
+               errbuf);
+        *op_errstr = gf_strdup(errbuf);
+        ret = -1;
+        goto out;
+    }
+
+    /* If no brick is specified, do log-rotate for
+       all the bricks in the volume */
+    ret = dict_get_str(dict, "brick", &brick);
+    if (ret != 0) {
+        gf_smsg("glusterd", GF_LOG_ERROR, errno, GD_MSG_DICT_GET_FAILED,
+                "Key=brick", NULL);
+        ret = 0;
+        goto out;
+    }
+
+    /* A brick was named: it has to be part of this volume */
+    ret = glusterd_volume_brickinfo_get_by_brick(brick, volinfo, NULL,
+                                                 _gf_false);
+    if (ret != 0) {
+        snprintf(errbuf, sizeof(errbuf),
+                 "Incorrect brick %s "
+                 "for volume %s",
+                 brick, volname);
+        gf_msg("glusterd", GF_LOG_ERROR, EINVAL, GD_MSG_INVALID_ENTRY, "%s",
+               errbuf);
+        *op_errstr = gf_strdup(errbuf);
+    }
+
+out:
+    gf_msg_debug("glusterd", 0, "Returning %d", ret);
+
+    return ret;
+}
+
+/* Commit step of log rotate.
+ *
+ * For every brick of volume "volname" that belongs to this node (or only
+ * the one named by "brick", when given) the brick log file is renamed to
+ * "<logfile>.<rotate-key>" and the brick process, whose pid is read from
+ * its pidfile, is sent SIGHUP.
+ *
+ * A failed rename is only logged as a warning; the signal is still sent.
+ * A pidfile that cannot be opened or read, or that does not hold a
+ * positive pid, aborts the operation: pid 0 and negative pids have
+ * process-group / broadcast meaning for kill() and must never be signalled.
+ *
+ * Returns 0 on success, and also when no brick of this node matched;
+ * non-zero on failure. */
+int
+glusterd_op_log_rotate(dict_t *dict)
+{
+    int ret = -1;
+    int read_errno = 0;
+    glusterd_conf_t *priv = NULL;
+    glusterd_volinfo_t *volinfo = NULL;
+    glusterd_brickinfo_t *brickinfo = NULL;
+    xlator_t *this = NULL;
+    char *volname = NULL;
+    char *brick = NULL;
+    char logfile[PATH_MAX] = {
+        0,
+    };
+    char pidfile[PATH_MAX] = {
+        0,
+    };
+    FILE *file = NULL;
+    pid_t pid = 0;
+    uint64_t key = 0;
+    int valid_brick = 0;
+    glusterd_brickinfo_t *tmpbrkinfo = NULL;
+
+    this = THIS;
+    GF_ASSERT(this);
+    priv = this->private;
+    GF_ASSERT(priv);
+
+    ret = dict_get_str(dict, "volname", &volname);
+    if (ret) {
+        gf_msg("glusterd", GF_LOG_ERROR, 0, GD_MSG_DICT_GET_FAILED,
+               "volname not found");
+        goto out;
+    }
+
+    ret = dict_get_uint64(dict, "rotate-key", &key);
+    if (ret) {
+        gf_msg("glusterd", GF_LOG_ERROR, 0, GD_MSG_DICT_GET_FAILED,
+               "rotate key not found");
+        goto out;
+    }
+
+    ret = dict_get_str(dict, "brick", &brick);
+    /* If no brick is specified, do log-rotate for
+       all the bricks in the volume */
+    if (ret) {
+        gf_smsg("glusterd", GF_LOG_ERROR, errno, GD_MSG_DICT_GET_FAILED,
+                "Key=brick", NULL);
+        goto cont;
+    }
+
+    /* Parsed form of the requested brick, used for matching below */
+    ret = glusterd_brickinfo_new_from_brick(brick, &tmpbrkinfo, _gf_false,
+                                            NULL);
+    if (ret) {
+        gf_msg("glusterd", GF_LOG_ERROR, 0, GD_MSG_BRICK_NOT_FOUND,
+               "cannot get brickinfo from brick");
+        goto out;
+    }
+
+cont:
+    ret = glusterd_volinfo_find(volname, &volinfo);
+    if (ret)
+        goto out;
+
+    ret = -1;
+    cds_list_for_each_entry(brickinfo, &volinfo->bricks, brick_list)
+    {
+        /* Only bricks hosted by this node are handled here */
+        if (gf_uuid_compare(brickinfo->uuid, MY_UUID))
+            continue;
+
+        if (tmpbrkinfo && brick &&
+            (strcmp(tmpbrkinfo->hostname, brickinfo->hostname) ||
+             strcmp(tmpbrkinfo->path, brickinfo->path)))
+            continue;
+
+        valid_brick = 1;
+
+        GLUSTERD_GET_BRICK_PIDFILE(pidfile, volinfo, brickinfo, priv);
+        file = fopen(pidfile, "r+");
+        if (!file) {
+            gf_msg("glusterd", GF_LOG_ERROR, errno, GD_MSG_FILE_OP_FAILED,
+                   "Unable to open pidfile: %s", pidfile);
+            ret = -1;
+            goto out;
+        }
+
+        ret = fscanf(file, "%d", &pid);
+        /* fclose() may overwrite errno, so keep the read error */
+        read_errno = errno;
+        fclose(file);
+        file = NULL;
+        if (ret <= 0) {
+            gf_msg("glusterd", GF_LOG_ERROR, read_errno,
+                   GD_MSG_FILE_OP_FAILED, "Unable to read pidfile: %s",
+                   pidfile);
+            ret = -1;
+            goto out;
+        }
+
+        /* kill(0, ...) targets the caller's process group and negative
+         * pids target a group or every process; refuse such values. */
+        if (pid <= 0) {
+            gf_msg("glusterd", GF_LOG_ERROR, EINVAL, GD_MSG_INVALID_ENTRY,
+                   "Invalid pid %d in pidfile: %s", pid, pidfile);
+            ret = -1;
+            goto out;
+        }
+
+        snprintf(logfile, PATH_MAX, "%s.%" PRIu64, brickinfo->logfile, key);
+
+        /* Best effort: the brick is still asked to reopen its log */
+        ret = sys_rename(brickinfo->logfile, logfile);
+        if (ret)
+            gf_msg("glusterd", GF_LOG_WARNING, errno, GD_MSG_FILE_OP_FAILED,
+                   "rename failed");
+
+        ret = kill(pid, SIGHUP);
+        if (ret) {
+            gf_msg("glusterd", GF_LOG_ERROR, errno, GD_MSG_PID_KILL_FAIL,
+                   "Unable to SIGHUP to %d", pid);
+            goto out;
+        }
+        ret = 0;
+
+        /* If request was for brick, only one iteration is enough */
+        if (brick)
+            break;
+    }
+
+    /* No brick of this node matched: nothing to do, not an error */
+    if (ret && !valid_brick)
+        ret = 0;
+
+out:
+    if (tmpbrkinfo)
+        glusterd_brickinfo_delete(tmpbrkinfo);
+
+    return ret;
+}
diff --git a/xlators/mgmt/glusterd/src/glusterd-mem-types.h b/xlators/mgmt/glusterd/src/glusterd-mem-types.h
new file mode 100644
index 00000000000..d7257e1a7b5
--- /dev/null
+++ b/xlators/mgmt/glusterd/src/glusterd-mem-types.h
@@ -0,0 +1,58 @@
+/*
+   Copyright (c) 2008-2012 Red Hat, Inc. <http://www.redhat.com>
+   This file is part of GlusterFS.
+
+   This file is licensed to you under your choice of the GNU Lesser
+   General Public License, version 3 or any later version (LGPLv3 or
+   later), or the GNU General Public License, version 2 (GPLv2), in all
+   cases as published by the Free Software Foundation.
+*/
+
+#ifndef __GLUSTERD_MEM_TYPES_H__
+#define __GLUSTERD_MEM_TYPES_H__
+
+#include <glusterfs/mem-types.h>
+
+/* Memory type identifiers specific to glusterd. Numbering starts right
+ * after the common types (gf_common_mt_end) and gf_gld_mt_end marks the end
+ * of the glusterd range.
+ * NOTE(review): presumably these are the type tags passed to the
+ * GF_MALLOC/GF_CALLOC family for memory accounting, and the values should
+ * stay stable by appending new entries just before gf_gld_mt_end - confirm
+ * against glusterfs/mem-types.h. */
+typedef enum gf_gld_mem_types_ {
+    gf_gld_mt_glusterd_conf_t = gf_common_mt_end + 1,
+    gf_gld_mt_char,
+    gf_gld_mt_peerinfo_t,
+    gf_gld_mt_friend_sm_event_t,
+    gf_gld_mt_friend_req_ctx_t,
+    gf_gld_mt_friend_update_ctx_t,
+    gf_gld_mt_op_sm_event_t,
+    gf_gld_mt_op_lock_ctx_t,
+    gf_gld_mt_op_stage_ctx_t,
+    gf_gld_mt_op_commit_ctx_t,
+    gf_gld_mt_mop_stage_req_t,
+    gf_gld_mt_probe_ctx_t,
+    gf_gld_mt_glusterd_volinfo_t,
+    gf_gld_mt_volinfo_dict_data_t,
+    gf_gld_mt_glusterd_brickinfo_t,
+    gf_gld_mt_peer_hostname_t,
+    gf_gld_mt_defrag_info,
+    gf_gld_mt_peerctx_t,
+    gf_gld_mt_sm_tr_log_t,
+    gf_gld_mt_pending_node_t,
+    gf_gld_mt_brick_rsp_ctx_t,
+    gf_gld_mt_mop_brick_req_t,
+    gf_gld_mt_op_allack_ctx_t,
+    gf_gld_mt_linearr,
+    gf_gld_mt_linebuf,
+    gf_gld_mt_mount_pattern,
+    gf_gld_mt_mount_comp_container,
+    gf_gld_mt_mount_spec,
+    gf_gld_mt_georep_meet_spec,
+    gf_gld_mt_charptr,
+    gf_gld_mt_hooks_stub_t,
+    gf_gld_mt_hooks_priv_t,
+    gf_gld_mt_mop_commit_req_t,
+    gf_gld_mt_int,
+    gf_gld_mt_snap_t,
+    gf_gld_mt_missed_snapinfo_t,
+    gf_gld_mt_snap_create_args_t,
+    gf_gld_mt_glusterd_brick_proc_t,
+    gf_gld_mt_glusterd_svc_proc_t,
+    gf_gld_mt_end, /* sentinel: keep last */
+} gf_gld_mem_types_t;
+#endif
diff --git a/xlators/mgmt/glusterd/src/glusterd-messages.h b/xlators/mgmt/glusterd/src/glusterd-messages.h
new file mode 100644
index 00000000000..3a1e600fb03
--- /dev/null
+++ b/xlators/mgmt/glusterd/src/glusterd-messages.h
@@ -0,0 +1,451 @@
+/*
+  Copyright (c) 2014 Red Hat, Inc. <http://www.redhat.com>
+  This file is part of GlusterFS.
+
+  This file is licensed to you under your choice of the GNU Lesser
+  General Public License, version 3 or any later version (LGPLv3 or
+  later), or the GNU General Public License, version 2 (GPLv2), in all
+  cases as published by the Free Software Foundation.
+*/
+
+#ifndef _GLUSTERD_MESSAGES_H_
+#define _GLUSTERD_MESSAGES_H_
+
+#include <glusterfs/glfs-message-id.h>
+
+/* To add new message IDs, append new identifiers at the end of the list.
+ *
+ * Never remove a message ID. If it's not used anymore, you can rename it or
+ * leave it as it is, but not delete it. This is to prevent reuse of IDs by
+ * other messages. NOTE(review): some names are misspelled, for example
+ * GD_MSG_UNOUNT_FAILED and GD_MSD_BRICK_DISCONNECT_FAIL.
+ *
+ * The component name must match an entry defined in glfs-message-id.h.
+ */
+
+GLFS_MSGID(
+    GLUSTERD, GD_MSG_SERVER_QUORUM_NOT_MET,
+    GD_MSG_SERVER_QUORUM_LOST_STOPPING_BRICKS,
+    GD_MSG_SERVER_QUORUM_MET_STARTING_BRICKS, GD_MSG_PEER_DISCONNECTED,
+    GD_MSG_BRICK_DISCONNECTED, GD_MSG_NODE_DISCONNECTED,
+    GD_MSG_REBALANCE_DISCONNECTED, GD_MSG_VOL_CLEANUP_FAIL,
+    GD_MSG_VOL_VERS_MISMATCH, GD_MSG_CKSUM_VERS_MISMATCH,
+    GD_MSG_QUOTA_CONFIG_VERS_MISMATCH, GD_MSG_QUOTA_CONFIG_CKSUM_MISMATCH,
+    GD_MSG_BRICK_STOP_FAIL, GD_MSG_SVC_KILL_FAIL, GD_MSG_PID_KILL_FAIL,
+    GD_MSG_REBAL_NO_SOCK_FILE, GD_MSG_UNIX_OP_BUILD_FAIL,
+    GD_MSG_RPC_CREATE_FAIL, GD_MSG_FAIL_DEFAULT_OPT_SET,
+    GD_MSG_CLUSTER_UNLOCK_FAILED, GD_MSG_NO_MEMORY, GD_MSG_UNSUPPORTED_VERSION,
+    GD_MSG_COMMAND_NOT_FOUND, GD_MSG_SNAPSHOT_OP_FAILED, GD_MSG_INVALID_ENTRY,
+    GD_MSG_VOL_NOT_FOUND, GD_MSG_REG_COMPILE_FAILED, GD_MSG_FILE_OP_FAILED,
+    GD_MSG_SNAP_CREATION_FAIL, GD_MSG_VOL_OP_FAILED, GD_MSG_CREATE_DIR_FAILED,
+    GD_MSG_DIR_OP_FAILED, GD_MSG_VOL_STOP_FAILED, GD_MSG_NO_CLI_RESP,
+    GD_MSG_LOCK_INIT_FAILED, GD_MSG_SNAP_LIST_GET_FAIL, GD_MSG_UNOUNT_FAILED,
+    GD_MSG_LOCK_DESTROY_FAILED, GD_MSG_SNAP_CLEANUP_FAIL,
+    GD_MSG_SNAP_ACTIVATE_FAIL, GD_MSG_SNAP_DEACTIVATE_FAIL,
+    GD_MSG_SNAP_RESTORE_FAIL, GD_MSG_SNAP_REMOVE_FAIL, GD_MSG_SNAP_CONFIG_FAIL,
+    GD_MSG_SNAP_STATUS_FAIL, GD_MSG_SNAP_INIT_FAIL, GD_MSG_VOLINFO_SET_FAIL,
+    GD_MSG_VOLINFO_GET_FAIL, GD_MSG_BRICK_CREATION_FAIL,
+    GD_MSG_BRICK_GET_INFO_FAIL, GD_MSG_BRICK_NEW_INFO_FAIL, GD_MSG_LVS_FAIL,
+    GD_MSG_SET_XATTR_FAIL, GD_MSG_UMOUNTING_SNAP_BRICK, GD_MSG_OP_UNSUPPORTED,
+    GD_MSG_SNAP_NOT_FOUND, GD_MSG_FS_LABEL_UPDATE_FAIL, GD_MSG_LVM_MOUNT_FAILED,
+    GD_MSG_DICT_SET_FAILED, GD_MSG_CANONICALIZE_FAIL, GD_MSG_DICT_GET_FAILED,
+    GD_MSG_SNAP_INFO_FAIL, GD_MSG_SNAP_VOL_CONFIG_FAIL,
+    GD_MSG_SNAP_OBJECT_STORE_FAIL, GD_MSG_DICT_UNSERIALIZE_FAIL,
+    GD_MSG_SNAP_RESTORE_REVERT_FAIL, GD_MSG_SNAP_LIST_SET_FAIL,
+    GD_MSG_VOLFILE_CREATE_FAIL, GD_MSG_VOLINFO_REMOVE_FAIL,
+    GD_MSG_VOL_DELETE_FAIL, GD_MSG_SNAPSHOT_PENDING,
+    GD_MSG_BRICK_PATH_UNMOUNTED, GD_MSG_BRICK_ADD_FAIL,
+    GD_MSG_BRICK_SET_INFO_FAIL, GD_MSG_LVCREATE_FAIL, GD_MSG_VG_GET_FAIL,
+    GD_MSG_TPOOL_GET_FAIL, GD_MSG_LVM_REMOVE_FAILED,
+    GD_MSG_MISSEDSNAP_INFO_SET_FAIL, GD_MSG_BRK_MOUNTOPTS_FAIL,
+    GD_MSG_MISSED_SNAP_LIST_STORE_FAIL, GD_MSG_INVALID_MISSED_SNAP_ENTRY,
+    GD_MSG_MISSED_SNAP_GET_FAIL, GD_MSG_MISSED_SNAP_CREATE_FAIL,
+    GD_MSG_DUP_ENTRY, GD_MSG_MISSED_SNAP_STATUS_DONE, GD_MSG_NO_EXEC_PERMS,
+    GD_MSG_GLOBAL_OP_VERSION_SET_FAIL, GD_MSG_HARD_LIMIT_SET_FAIL,
+    GD_MSG_OP_SUCCESS, GD_MSG_STORE_FAIL, GD_MSG_GLOBAL_OP_VERSION_GET_FAIL,
+    GD_MSG_GEOREP_GET_FAILED, GD_MSG_GLUSTERD_UMOUNT_FAIL,
+    GD_MSG_QUORUM_CHECK_FAIL, GD_MSG_QUORUM_COUNT_IGNORED,
+    GD_MSG_SNAP_MOUNT_FAIL, GD_MSG_RSP_DICT_USE_FAIL, GD_MSG_SNAP_IMPORT_FAIL,
+    GD_MSG_SNAP_CONFLICT, GD_MSG_MISSED_SNAP_DELETE,
+    GD_MSG_QUOTA_CONFIG_IMPORT_FAIL, GD_MSG_SNAPDIR_CREATE_FAIL,
+    GD_MSG_MISSED_SNAP_PRESENT, GD_MSG_UUID_NULL, GD_MSG_TSTAMP_SET_FAIL,
+    GD_MSG_RESP_AGGR_FAIL, GD_MSG_DICT_EMPTY, GD_MSG_DICT_CREATE_FAIL,
+    GD_MSG_SNAPD_STOP_FAIL, GD_MSG_SOFT_LIMIT_REACHED, GD_MSG_SNAPD_START_FAIL,
+    GD_MSG_SNAPD_CREATE_FAIL, GD_MSG_SNAPD_INIT_FAIL, GD_MSG_MGMTV3_OP_FAIL,
+    GD_MSG_MGMTV3_PAYLOAD_BUILD_FAIL, GD_MSG_MGMTV3_UNLOCK_FAIL,
+    GD_MSG_MGMTV3_LOCK_GET_FAIL, GD_MSG_MGMTV3_LOCKDOWN_FAIL,
+    GD_MSG_POST_VALIDATION_FAIL, GD_MSG_PRE_VALIDATION_FAIL,
+    GD_MSG_COMMIT_OP_FAIL, GD_MSG_PEER_LIST_CREATE_FAIL, GD_MSG_BRICK_OP_FAIL,
+    GD_MSG_OPINFO_SET_FAIL, GD_MSG_OP_EVENT_UNLOCK_FAIL,
+    GD_MSG_MGMTV3_OP_RESP_FAIL, GD_MSG_PEER_NOT_FOUND, GD_MSG_REQ_DECODE_FAIL,
+    GD_MSG_DICT_ALLOC_AND_SERL_LENGTH_GET_FAIL, GD_MSG_ALREADY_STOPPED,
+    GD_MSG_PRE_VALD_RESP_FAIL, GD_MSG_SVC_GET_FAIL, GD_MSG_VOLFILE_NOT_FOUND,
+    GD_MSG_OP_EVENT_LOCK_FAIL, GD_MSG_NON_STRIPE_VOL, GD_MSG_SNAPD_OBJ_GET_FAIL,
+    GD_MSG_QUOTA_DISABLED, GD_MSG_CACHE_MINMAX_SIZE_INVALID,
+    GD_MSG_QUOTA_GET_STAT_FAIL, GD_MSG_SUBVOLUMES_EXCEED, GD_MSG_BRICK_ADD,
+    GD_MSG_BRICK_REMOVE, GD_MSG_CREATE_KEY_FAIL,
+    GD_MSG_MULTIPLE_LOCK_ACQUIRE_FAIL, GD_MSG_MULTIPLE_LOCK_RELEASE_FAIL,
+    GD_MSG_RESP_FROM_UNKNOWN_PEER, GD_MSG_BRICK_MOUNDIRS_AGGR_FAIL,
+    GD_MSG_GFID_VALIDATE_SET_FAIL, GD_MSG_PEER_LOCK_FAIL,
+    GD_MSG_PEER_UNLOCK_FAIL, GD_MSG_MGMT_OP_FAIL,
+    GD_MSG_TRANS_OPINFO_CLEAR_FAIL, GD_MSG_GLUSTERD_LOCK_FAIL,
+    GD_MSG_TRANS_OPINFO_SET_FAIL, GD_MSG_TRANS_IDGEN_FAIL, GD_MSG_RPC_FAILURE,
+    GD_MSG_OP_VERS_ADJUST_FAIL, GD_MSG_SNAP_DEVICE_NAME_GET_FAIL,
+    GD_MSG_SNAP_STATUS_NOT_PENDING, GD_MSG_MGMT_PGM_SET_FAIL,
+    GD_MSG_EVENT_INJECT_FAIL, GD_MSG_VERS_INFO, GD_MSG_VOL_INFO_REQ_RECVD,
+    GD_MSG_VERS_GET_FAIL, GD_MSG_EVENT_NEW_GET_FAIL, GD_MSG_RPC_LAYER_ERROR,
+    GD_MSG_NO_HANDSHAKE_ACK, GD_MSG_OP_VERSION_MISMATCH,
+    GD_MSG_HANDSHAKE_REQ_REJECTED, GD_MSG_UNKNOWN_MODE,
+    GD_MSG_DEFRAG_STATUS_UPDATED, GD_MSG_NO_FLAG_SET,
+    GD_MSG_VERSION_UNSUPPORTED, GD_MSG_UUID_SET_FAIL, GD_MSG_MOUNT_REQ_FAIL,
+    GD_MSG_GLUSTERD_GLOBAL_INFO_STORE_FAIL, GD_MSG_OP_VERS_STORE_FAIL,
+    GD_MSG_SNAP_AUTOMIC_UPDATE_FAIL, GD_MSG_SNAPINFO_WRITE_FAIL,
+    GD_MSG_SNAPINFO_CREATE_FAIL, GD_MSG_SNAPD_INFO_STORE_FAIL,
+    GD_MSG_BRK_MNTPATH_MOUNT_FAIL, GD_MSG_BRK_MNTPATH_GET_FAIL,
+    GD_MSG_SNAP_BRK_MNT_RECREATE_FAIL, GD_MSG_SNAP_RESOLVE_BRICK_FAIL,
+    GD_MSG_RESOLVE_BRICK_FAIL, GD_MSG_BRK_MNT_RECREATE_FAIL,
+    GD_MSG_TMP_FILE_UNLINK_FAIL, GD_MSG_VOL_VALS_WRITE_FAIL,
+    GD_MSG_STORE_HANDLE_GET_FAIL, GD_MSG_STORE_HANDLE_WRITE_FAIL,
+    GD_MSG_MISSED_SNAP_LIST_STORE_HANDLE_GET_FAIL,
+    GD_MSG_MISSED_SNAP_LIST_EMPTY, GD_MSG_SNAP_VOL_RETRIEVE_FAIL,
+    GD_MSG_SNAPSHOT_UPDATE_FAIL, GD_MSG_SNAPD_PORT_STORE_FAIL,
+    GD_MSG_CKSUM_STORE_FAIL, GD_MSG_STORE_HANDLE_CREATE_FAIL,
+    GD_MSG_HANDLE_NULL, GD_MSG_VOL_RESTORE_FAIL, GD_MSG_NAME_TOO_LONG,
+    GD_MSG_UUID_PARSE_FAIL, GD_MSG_UNKNOWN_KEY, GD_MSG_STORE_ITER_DESTROY_FAIL,
+    GD_MSG_STORE_ITER_GET_FAIL, GD_MSG_VOLINFO_UPDATE_FAIL,
+    GD_MSG_PARSE_BRICKINFO_FAIL, GD_MSG_VERS_STORE_FAIL, GD_MSG_HEADER_ADD_FAIL,
+    GD_MSG_QUOTA_CONF_WRITE_FAIL, GD_MSG_QUOTA_CONF_CORRUPT, GD_MSG_FORK_FAIL,
+    GD_MSG_CKSUM_COMPUTE_FAIL, GD_MSG_VERS_CKSUM_STORE_FAIL,
+    GD_MSG_GET_XATTR_FAIL, GD_MSG_CONVERSION_FAILED, GD_MSG_VOL_NOT_DISTRIBUTE,
+    GD_MSG_VOL_STOPPED, GD_MSG_OPCTX_GET_FAIL, GD_MSG_TASKID_GEN_FAIL,
+    GD_MSG_REBALANCE_ID_MISSING, GD_MSG_NO_REBALANCE_PFX_IN_VOLNAME,
+    GD_MSG_DEFRAG_STATUS_UPDATE_FAIL, GD_MSG_UUID_GEN_STORE_FAIL,
+    GD_MSG_UUID_STORE_FAIL, GD_MSG_NO_INIT, GD_MSG_MODULE_NOT_INSTALLED,
+    GD_MSG_MODULE_NOT_WORKING, GD_MSG_WRITE_ACCESS_GRANT_FAIL,
+    GD_MSG_DIRPATH_TOO_LONG, GD_MSG_LOGGROUP_INVALID, GD_MSG_DIR_PERM_LIBERAL,
+    GD_MSG_DIR_PERM_STRICT, GD_MSG_MOUNT_SPEC_INSTALL_FAIL,
+    GD_MSG_GLUSTERD_SOCK_LISTENER_START_FAIL, GD_MSG_DIR_NOT_FOUND,
+    GD_MSG_FAILED_INIT_SHDSVC, GD_MSG_FAILED_INIT_NFSSVC,
+    GD_MSG_FAILED_INIT_QUOTASVC, GD_MSG_RPC_INIT_FAIL,
+    GD_MSG_RPCSVC_REG_NOTIFY_RETURNED, GD_MSG_RPC_TRANSPORT_COUNT_GET_FAIL,
+    GD_MSG_RPC_LISTENER_CREATE_FAIL, GD_MSG_OP_VERS_RESTORE_FAIL,
+    GD_MSG_SELF_HEALD_DISABLED, GD_MSG_PRIV_NULL, GD_MSG_GSYNC_VALIDATION_FAIL,
+    GD_MSG_SLAVE_CONFPATH_DETAILS_FETCH_FAIL, GD_MSG_OP_NOT_PERMITTED_AC_REQD,
+    GD_MSG_OP_NOT_PERMITTED, GD_MSG_REBALANCE_START_FAIL,
+    GD_MSG_NFS_RECONF_FAIL, GD_MSG_REMOVE_BRICK_ID_SET_FAIL,
+    GD_MSG_BRICK_MOUNTDIR_GET_FAIL, GD_MSG_BRICK_NOT_FOUND,
+    GD_MSG_BRKPATH_TOO_LONG, GD_MSG_CLRLOCKS_CLNT_UMOUNT_FAIL,
+    GD_MSG_CLRLOCKS_CLNT_MOUNT_FAIL, GD_MSG_CLRLOCKS_MOUNTDIR_CREATE_FAIL,
+    GD_MSG_BRK_PORT_NUM_GET_FAIL, GD_MSG_BRK_STATEDUMP_FAIL,
+    GD_MSG_VOL_GRAPH_CHANGE_NOTIFY_FAIL, GD_MSG_INVALID_VG,
+    GD_MSG_GLUSTERD_OP_FAILED, GD_MSG_HOSTNAME_ADD_TO_PEERLIST_FAIL,
+    GD_MSG_STALE_PEERINFO_REMOVE_FAIL, GD_MSG_TRANS_ID_GET_FAIL,
+    GD_MSG_RES_DECODE_FAIL, GD_MSG_VOL_ALREADY_EXIST, GD_MSG_BAD_BRKORDER,
+    GD_MSG_BAD_BRKORDER_CHECK_FAIL, GD_MSG_BRICK_SELECT_FAIL,
+    GD_MSG_NO_LOCK_RESP_FROM_PEER, GD_MSG_MGMTV3_LOCK_FROM_UUID_REJCT,
+    GD_MSG_STAGE_FROM_UUID_REJCT, GD_MSG_UNLOCK_FROM_UUID_REJCT,
+    GD_MSG_MGMTV3_UNLOCK_FROM_UUID_REJCT, GD_MSG_COMMIT_FROM_UUID_REJCT,
+    GD_MSG_VOL_NOT_STARTED, GD_MSG_VOL_NOT_REPLICA, GD_MSG_VOL_NOT_DISPERSE,
+    GD_MSG_OLD_REMOVE_BRICK_EXISTS, GD_MSG_USE_THE_FORCE, GD_MSG_OIP,
+    GD_MSG_OIP_RETRY_LATER, GD_MSG_GSYNC_RESTART_FAIL,
+    GD_MSG_LOCK_FROM_UUID_REJCT, GD_MSG_BRICK_OP_PAYLOAD_BUILD_FAIL,
+    GD_MSG_HOSTNAME_RESOLVE_FAIL, GD_MSG_COUNT_VALIDATE_FAILED,
+    GD_MSG_SPAWNING_CHILD_FAILED, GD_MSG_READ_CHILD_DATA_FAILED,
+    GD_MSG_DEFAULT_TEMP_CONFIG, GD_MSG_PIDFILE_CREATE_FAILED,
+    GD_MSG_GSYNCD_SPAWN_FAILED, GD_MSG_SUBOP_NOT_FOUND, GD_MSG_RESERVED_OPTION,
+    GD_MSG_GLUSTERD_PRIV_NOT_FOUND, GD_MSG_SLAVEINFO_FETCH_ERROR,
+    GD_MSG_VALIDATE_FAILED, GD_MSG_INVOKE_ERROR, GD_MSG_SESSION_CREATE_ERROR,
+    GD_MSG_STOP_FORCE, GD_MSG_GET_CONFIG_INFO_FAILED,
+    GD_MSG_STAT_FILE_READ_FAILED, GD_MSG_CONF_PATH_ASSIGN_FAILED,
+    GD_MSG_SESSION_INACTIVE, GD_MSG_PIDFILE_NOT_FOUND, GD_MSG_PEER_CMD_ERROR,
+    GD_MSG_SRC_FILE_ERROR, GD_MSG_GET_STATEFILE_NAME_FAILED, GD_MSG_STATUS_NULL,
+    GD_MSG_STATUSFILE_CREATE_FAILED, GD_MSG_SLAVE_URL_INVALID,
+    GD_MSG_INVALID_SLAVE, GD_MSG_READ_ERROR, GD_MSG_ARG_FETCH_ERROR,
+    GD_MSG_REG_FILE_MISSING, GD_MSG_STATEFILE_NAME_NOT_FOUND,
+    GD_MSG_GEO_REP_START_FAILED, GD_MSG_GSYNCD_ERROR,
+    GD_MSG_UPDATE_STATEFILE_FAILED, GD_MSG_STATUS_UPDATE_FAILED,
+    GD_MSG_GSYNCD_OP_SET_FAILED, GD_MSG_BUFFER_EMPTY, GD_MSG_CONFIG_INFO,
+    GD_MSG_FETCH_CONFIG_VAL_FAILED, GD_MSG_GSYNCD_PARSE_ERROR,
+    GD_MSG_SESSION_ALREADY_EXIST, GD_MSG_FORCE_CREATE_SESSION,
+    GD_MSG_GET_KEY_FAILED, GD_MSG_SESSION_DEL_FAILED, GD_MSG_CMD_EXEC_FAIL,
+    GD_MSG_STRDUP_FAILED, GD_MSG_UNABLE_TO_END, GD_MSG_PAUSE_FAILED,
+    GD_MSG_NORMALIZE_URL_FAIL, GD_MSG_MODULE_ERROR,
+    GD_MSG_SLAVEINFO_STORE_ERROR, GD_MSG_MARKER_START_FAIL,
+    GD_MSG_RESUME_FAILED, GD_MSG_GLUSTERFS_START_FAIL,
+    GD_MSG_GLUSTERFS_STOP_FAIL, GD_MSG_RBOP_STATE_STORE_FAIL,
+    GD_MSG_PUMP_XLATOR_DISABLED, GD_MSG_ABORT_OP_FAIL, GD_MSG_PAUSE_OP_FAIL,
+    GD_MSG_GLUSTER_SERVICE_START_FAIL, GD_MSG_HANDSHAKE_FAILED,
+    GD_MSG_CLI_REQ_EMPTY, GD_MSG_PEER_ADD_FAIL,
+    GD_MSG_SYNC_FROM_LOCALHOST_UNALLOWED, GD_MSG_UUIDS_SAME_RETRY,
+    GD_MSG_TSP_ALREADY_FORMED, GD_MSG_VOLS_ALREADY_PRESENT,
+    GD_MSG_REQ_CTX_CREATE_FAIL, GD_MSG_PEER_INFO_UPDATE_FAIL,
+    GD_MSG_PEERINFO_CREATE_FAIL, GD_MSG_REQ_FROM_UNKNOWN_PEER,
+    GD_MSG_STATUS_REPLY_STRING_CREATE_FAIL, GD_MSG_TOKENIZE_FAIL,
+    GD_MSG_LAZY_UMOUNT_FAIL, GD_MSG_NFS_SERVER_START_FAIL,
+    GD_MSG_GLUSTER_SERVICES_STOP_FAIL, GD_MSG_BRK_CLEANUP_FAIL,
+    GD_MSG_RB_ALREADY_STARTED, GD_MSG_RB_BRICKINFO_GET_FAIL, GD_MSG_BAD_FORMAT,
+    GD_MSG_RB_CMD_FAIL, GD_MSG_RB_NOT_STARTED_OR_PAUSED, GD_MSG_RB_NOT_STARTED,
+    GD_MSG_RB_PAUSED_ALREADY, GD_MSG_NO_FREE_PORTS,
+    GD_MSG_EVENT_STATE_TRANSITION_FAIL, GD_MSG_HANDLER_RETURNED,
+    GD_MSG_SNAP_COMPARE_CONFLICT, GD_MSG_PEER_DETACH_CLEANUP_FAIL,
+    GD_MSG_STALE_VOL_REMOVE_FAIL, GD_MSG_AC_ERROR, GD_MSG_LOCK_FAIL,
+    GD_MSG_MGMTV3_LOCK_REQ_SEND_FAIL, GD_MSG_GLUSTERD_UNLOCK_FAIL,
+    GD_MSG_RBOP_START_FAIL, GD_MSG_UNKNOWN_RESPONSE,
+    GD_MSG_COMMIT_REQ_SEND_FAIL, GD_MSG_OPCTX_UPDATE_FAIL, GD_MSG_OPCTX_NULL,
+    GD_MSG_DICT_COPY_FAIL, GD_MSG_SHD_STATUS_SET_FAIL,
+    GD_MSG_REPLICA_INDEX_GET_FAIL, GD_MSG_NFS_SERVER_NOT_RUNNING,
+    GD_MSG_STAGE_REQ_SEND_FAIL, GD_MSG_LOCK_REQ_SEND_FAIL,
+    GD_MSG_VOLNAMES_GET_FAIL, GD_MSG_NO_TASK_ID, GD_MSG_ADD_REMOVE_BRICK_FAIL,
+    GD_MSG_SVC_RESTART_FAIL, GD_MSG_VOL_SET_FAIL, GD_MSG_QUOTAD_NOT_RUNNING,
+    GD_MSG_XLATOR_COUNT_GET_FAIL, GD_MSG_TRANS_OPINFO_GET_FAIL,
+    GD_MSG_TRANS_ID_INVALID, GD_MSG_NO_OPTIONS_GIVEN, GD_MSG_SNAPD_NOT_RUNNING,
+    GD_MSG_ADD_ADDRESS_TO_PEER_FAIL, GD_MSG_PEER_ADDRESS_GET_FAIL,
+    GD_MSG_GETADDRINFO_FAIL, GD_MSG_PEERINFO_DELETE_FAIL, GD_MSG_KEY_NULL,
+    GD_MSG_SPAWN_SVCS_FAIL, GD_MSG_DICT_ITER_FAIL,
+    GD_MSG_TASK_STATUS_UPDATE_FAIL, GD_MSG_VOL_ID_MISMATCH,
+    GD_MSG_STR_TO_BOOL_FAIL, GD_MSG_RB_MNT_BRICKS_MISMATCH,
+    GD_MSG_RB_SRC_BRICKS_MISMATCH, GD_MSG_MNTENTRY_GET_FAIL,
+    GD_MSG_INODE_SIZE_GET_FAIL, GD_MSG_NO_STATEFILE_ENTRY,
+    GD_MSG_PMAP_UNSET_FAIL, GD_MSG_GLOBAL_OPT_IMPORT_FAIL,
+    GD_MSD_BRICK_DISCONNECT_FAIL, GD_MSG_SNAP_DETAILS_IMPORT_FAIL,
+    GD_MSG_BRICKINFO_CREATE_FAIL, GD_MSG_QUOTA_CKSUM_VER_STORE_FAIL,
+    GD_MSG_CKSUM_GET_FAIL, GD_MSG_BRICKPATH_ROOT_GET_FAIL,
+    GD_MSG_HOSTNAME_TO_UUID_FAIL, GD_MSG_REPLY_SUBMIT_FAIL,
+    GD_MSG_SERIALIZE_MSG_FAIL, GD_MSG_ENCODE_FAIL,
+    GD_MSG_RB_DST_BRICKS_MISMATCH, GD_MSG_XLATOR_VOLOPT_DYNLOAD_ERROR,
+    GD_MSG_VOLNAME_NOTFOUND_IN_DICT, GD_MSG_FLAGS_NOTFOUND_IN_DICT,
+    GD_MSG_HOSTNAME_NOTFOUND_IN_DICT, GD_MSG_PORT_NOTFOUND_IN_DICT,
+    GD_MSG_CMDSTR_NOTFOUND_IN_DICT, GD_MSG_SNAP_OBJ_NEW_FAIL,
+    GD_MSG_SNAP_BACKEND_MAKE_FAIL, GD_MSG_SNAP_CLONE_FAILED,
+    GD_MSG_SNAP_CLONE_PREVAL_FAILED, GD_MSG_SNAP_CLONE_POSTVAL_FAILED,
+    GD_MSG_VOLINFO_STORE_FAIL, GD_MSG_NEW_FRIEND_SM_EVENT_GET_FAIL,
+    GD_MSG_VOL_TYPE_CHANGING_INFO, GD_MSG_BRKPATH_MNTPNT_MISMATCH,
+    GD_MSG_TASKS_COUNT_MISMATCH, GD_MSG_WRONG_OPTS_SETTING,
+    GD_MSG_PATH_ALREADY_PART_OF_VOL, GD_MSG_BRICK_VALIDATE_FAIL,
+    GD_MSG_READIN_FILE_FAILED, GD_MSG_IMPORT_PRDICT_DICT,
+    GD_MSG_VOL_OPTS_IMPORT_FAIL, GD_MSG_BRICK_IMPORT_FAIL,
+    GD_MSG_VOLINFO_IMPORT_FAIL, GD_MSG_BRICK_ID_GEN_FAILED,
+    GD_MSG_GET_STATUS_DATA_FAIL, GD_MSG_BITROT_NOT_RUNNING,
+    GD_MSG_SCRUBBER_NOT_RUNNING, GD_MSG_SRC_BRICK_PORT_UNAVAIL,
+    GD_MSG_BITD_INIT_FAIL, GD_MSG_SCRUB_INIT_FAIL, GD_MSG_VAR_RUN_DIR_INIT_FAIL,
+    GD_MSG_VAR_RUN_DIR_FIND_FAIL, GD_MSG_SCRUBSVC_RECONF_FAIL,
+    GD_MSG_BITDSVC_RECONF_FAIL, GD_MSG_NFS_GNS_START_FAIL,
+    GD_MSG_NFS_GNS_SETUP_FAIL, GD_MSG_UNRECOGNIZED_SVC_MNGR,
+    GD_MSG_NFS_GNS_OP_HANDLE_FAIL, GD_MSG_EXPORT_FILE_CREATE_FAIL,
+    GD_MSG_NFS_GNS_HOST_FOUND, GD_MSG_REBALANCE_CMD_IN_TIER_VOL,
+    GD_MSG_INCOMPATIBLE_VALUE, GD_MSG_GENERATED_UUID,
+    GD_MSG_FILE_DESC_LIMIT_SET, GD_MSG_CURR_WORK_DIR_INFO,
+    GD_MSG_STRIPE_COUNT_CHANGE_INFO, GD_MSG_REPLICA_COUNT_CHANGE_INFO,
+    GD_MSG_ADD_BRICK_REQ_RECVD, GD_MSG_VOL_ALREADY_TIER,
+    GD_MSG_REM_BRICK_REQ_RECVD, GD_MSG_VOL_NOT_TIER,
+    GD_MSG_LOG_ROTATE_REQ_RECVD, GD_MSG_CLI_REQ_RECVD, GD_MSG_GET_VOL_REQ_RCVD,
+    GD_MSG_VOL_SYNC_REQ_RCVD, GD_MSG_PROBE_RCVD, GD_MSG_UNFRIEND_REQ_RCVD,
+    GD_MSG_FRIEND_UPDATE_RCVD, GD_MSG_RESPONSE_INFO,
+    GD_MSG_VOL_PROFILE_REQ_RCVD, GD_MSG_GETWD_REQ_RCVD, GD_MSG_MOUNT_REQ_RCVD,
+    GD_MSG_UMOUNT_REQ_RCVD, GD_MSG_CONNECT_RETURNED, GD_MSG_STATUS_VOL_REQ_RCVD,
+    GD_MSG_CLRCLK_VOL_REQ_RCVD, GD_MSG_BARRIER_VOL_REQ_RCVD,
+    GD_MSG_UUID_RECEIVED, GD_MSG_REPLACE_BRK_COMMIT_FORCE_REQ_RCVD,
+    GD_MSG_BRK_PORT_NO_ADD_INDO, GD_MSG_REPLACE_BRK_REQ_RCVD,
+    GD_MSG_ADD_OP_ARGS_FAIL, GD_MSG_POST_HOOK_STUB_INIT_FAIL,
+    GD_MSG_HOOK_STUB_NULL, GD_MSG_SPAWN_THREADS_FAIL,
+    GD_MSG_STALE_VOL_DELETE_INFO, GD_MSG_PROBE_REQ_RESP_RCVD,
+    GD_MSG_HOST_PRESENT_ALREADY, GD_MSG_OP_VERS_INFO, GD_MSG_OP_VERS_SET_INFO,
+    GD_MSG_NEW_NODE_STATE_CREATION, GD_MSG_ALREADY_MOUNTED,
+    GD_MSG_SHARED_STRG_VOL_OPT_VALIDATE_FAIL, GD_MSG_NFS_GNS_STOP_FAIL,
+    GD_MSG_NFS_GNS_RESET_FAIL, GD_MSG_SHARED_STRG_SET_FAIL,
+    GD_MSG_VOL_TRANSPORT_TYPE_CHANGE, GD_MSG_PEER_COUNT_GET_FAIL,
+    GD_MSG_INSUFFICIENT_UP_NODES, GD_MSG_OP_STAGE_STATS_VOL_FAIL,
+    GD_MSG_VOL_ID_SET_FAIL, GD_MSG_OP_STAGE_RESET_VOL_FAIL,
+    GD_MSG_OP_STAGE_BITROT_FAIL, GD_MSG_OP_STAGE_QUOTA_FAIL,
+    GD_MSG_OP_STAGE_DELETE_VOL_FAIL, GD_MSG_HANDLE_HEAL_CMD_FAIL,
+    GD_MSG_CLRCLK_SND_CMD_FAIL, GD_MSG_DISPERSE_CLUSTER_FOUND,
+    GD_MSG_HEAL_VOL_REQ_RCVD, GD_MSG_STATEDUMP_VOL_REQ_RCVD,
+    GD_MSG_THINPOOLS_FOR_THINLVS, GD_MSG_OP_STAGE_CREATE_VOL_FAIL,
+    GD_MSG_OP_STAGE_START_VOL_FAIL, GD_MSG_NFS_GNS_UNEXPRT_VOL_FAIL,
+    GD_MSG_TASK_ID_INFO, GD_MSG_DEREGISTER_SUCCESS, GD_MSG_STATEDUMP_OPTS_RCVD,
+    GD_MSG_STATEDUMP_INFO, GD_MSG_RECOVERING_CORRUPT_CONF,
+    GD_MSG_RETRIEVED_UUID, GD_MSG_XLATOR_CREATE_FAIL,
+    GD_MSG_GRAPH_ENTRY_ADD_FAIL, GD_MSG_ERROR_ENCOUNTERED,
+    GD_MSG_FILTER_RUN_FAILED, GD_MSG_DEFAULT_OPT_INFO,
+    GD_MSG_MARKER_STATUS_GET_FAIL, GD_MSG_MARKER_DISABLE_FAIL,
+    GD_MSG_GRAPH_FEATURE_ADD_FAIL, GD_MSG_XLATOR_SET_OPT_FAIL,
+    GD_MSG_BUILD_GRAPH_FAILED, GD_MSG_XML_TEXT_WRITE_FAIL,
+    GD_MSG_XML_DOC_START_FAIL, GD_MSG_XML_ELE_CREATE_FAIL,
+    GD_MSG_VOLUME_INCONSISTENCY, GD_MSG_XLATOR_LINK_FAIL,
+    GD_MSG_REMOTE_HOST_GET_FAIL, GD_MSG_GRAPH_SET_OPT_FAIL,
+    GD_MSG_ROOT_SQUASH_ENABLED, GD_MSG_ROOT_SQUASH_FAILED,
+    GD_MSG_LOCK_OWNER_MISMATCH, GD_MSG_LOCK_NOT_HELD, GD_MSG_LOCK_ALREADY_HELD,
+    GD_MSG_SVC_START_SUCCESS, GD_MSG_SVC_STOP_SUCCESS, GD_MSG_PARAM_NULL,
+    GD_MSG_SVC_STOP_FAIL, GD_MSG_SHARED_STORAGE_DOES_NOT_EXIST,
+    GD_MSG_SNAP_PAUSE_TIER_FAIL, GD_MSG_SNAP_RESUME_TIER_FAIL,
+    GD_MSG_FILE_NOT_FOUND, GD_MSG_RETRY_WITH_NEW_PORT,
+    GD_MSG_REMOTE_VOL_UUID_FAIL, GD_MSG_SLAVE_VOL_PARSE_FAIL,
+    GD_MSG_DICT_GET_SUCCESS, GD_MSG_PMAP_REGISTRY_REMOVE_FAIL,
+    GD_MSG_MNTBROKER_LABEL_NULL, GD_MSG_MNTBROKER_LABEL_MISS,
+    GD_MSG_MNTBROKER_SPEC_MISMATCH, GD_MSG_SYSCALL_FAIL,
+    GD_MSG_DAEMON_STATE_REQ_RCVD, GD_MSG_BRICK_CLEANUP_SUCCESS,
+    GD_MSG_STATE_STR_GET_FAILED, GD_MSG_RESET_BRICK_COMMIT_FORCE_REQ_RCVD,
+    GD_MSG_RESET_BRICK_CMD_FAIL, GD_MSG_TIERD_STOP_FAIL,
+    GD_MSG_TIERD_CREATE_FAIL, GD_MSG_TIERD_START_FAIL,
+    GD_MSG_TIERD_OBJ_GET_FAIL, GD_MSG_TIERD_NOT_RUNNING, GD_MSG_TIERD_INIT_FAIL,
+    GD_MSG_BRICK_MX_SET_FAIL, GD_MSG_NO_SIG_TO_PID_ZERO,
+    GD_MSG_TIER_WATERMARK_RESET_FAIL, GD_MSG_CLIENTS_GET_STATE_FAILED,
+    GD_MSG_GNFS_XLATOR_NOT_INSTALLED, GD_MSG_PIDFILE_UNLINKING,
+    GD_MSG_VOL_SET_VALIDATION_INFO, GD_MSG_NO_MUX_LIMIT,
+    GD_MSG_BRICKPROC_REM_BRICK_FAILED, GD_MSG_BRICKPROC_ADD_BRICK_FAILED,
+    GD_MSG_BRICKPROC_NEW_FAILED, GD_MSG_STATVFS_FAILED, GD_MSG_GARBAGE_ARGS,
+    GD_MSG_LOCALTIME_LOGGING_VOL_OPT_VALIDATE_FAIL,
+    GD_MSG_LOCALTIME_LOGGING_ENABLE, GD_MSG_LOCALTIME_LOGGING_DISABLE,
+    GD_MSG_PORTS_EXHAUSTED, GD_MSG_CHANGELOG_GET_FAIL,
+    GD_MSG_MANAGER_FUNCTION_FAILED,
+    GD_MSG_DAEMON_LOG_LEVEL_VOL_OPT_VALIDATE_FAIL, GD_MSG_SHD_START_FAIL,
+    GD_MSG_SHD_OBJ_GET_FAIL, GD_MSG_SVC_ATTACH_FAIL, GD_MSG_ATTACH_INFO,
+    GD_MSG_DETACH_INFO, GD_MSG_SVC_DETACH_FAIL,
+    GD_MSG_RPC_TRANSPORT_GET_PEERNAME_FAIL, GD_MSG_CLUSTER_RC_ENABLE,
+    GD_MSG_NFS_GANESHA_DISABLED, GD_MSG_GANESHA_NOT_RUNNING, GD_MSG_SNAP_WARN,
+    GD_MSG_BRICK_SUBVOL_VERIFY_FAIL, GD_MSG_REMOVE_ARBITER_BRICK,
+    GD_MSG_BRICK_NOT_DECOM, GD_MSG_BRICK_STOPPED, GD_MSG_BRICK_DEAD,
+    GD_MSG_BRICK_HOST_NOT_FOUND, GD_MSG_BRICK_HOST_DOWN, GD_MSG_BRICK_DELETE,
+    GD_MSG_BRICK_NO_REMOVE_CMD, GD_MSG_MIGRATION_PROG, GD_MSG_MIGRATION_FAIL,
+    GD_MSG_COPY_FAIL, GD_MSG_REALPATH_GET_FAIL,
+    GD_MSG_ARBITER_BRICK_SET_INFO_FAIL, GD_MSG_STRCHR_FAIL, GD_MSG_SPLIT_FAIL,
+    GD_MSG_ALLOC_AND_COPY_UUID_FAIL, GD_MSG_VOL_SHD_NOT_COMP,
+    GD_MSG_BITROT_NOT_ENABLED, GD_MSG_CREATE_BRICK_DIR_FAILED,
+    GD_MSG_CREATE_GLUSTER_DIR_FAILED, GD_MSG_BRICK_CREATE_MNTPNT,
+    GD_MSG_BRICK_CREATE_ROOT, GD_MSG_SET_XATTR_BRICK_FAIL,
+    GD_MSG_REMOVE_XATTR_FAIL, GD_MSG_XLATOR_NOT_DEFINED,
+    GD_MSG_BRICK_NOT_RUNNING, GD_MSG_INCORRECT_BRICK, GD_MSG_UUID_GET_FAIL,
+    GD_MSG_INVALID_ARGUMENT, GD_MSG_FRAME_CREATE_FAIL,
+    GD_MSG_SNAPSHOT_NOT_THIN_PROVISIONED, GD_MSG_VOL_STOP_ARGS_GET_FAILED,
+    GD_MSG_LSTAT_FAIL, GD_MSG_VOLUME_NOT_IMPORTED,
+    GD_MSG_ADD_BRICK_MNT_INFO_FAIL, GD_MSG_GET_MNT_ENTRY_INFO_FAIL,
+    GD_MSG_QUORUM_CLUSTER_COUNT_GET_FAIL, GD_MSG_POST_COMMIT_OP_FAIL,
+    GD_MSG_POST_COMMIT_FROM_UUID_REJCT, GD_MSG_POST_COMMIT_REQ_SEND_FAIL);
+
+#define GD_MSG_INVALID_ENTRY_STR "Invalid data entry"
+#define GD_MSG_INVALID_ARGUMENT_STR                                            \
+    "Invalid arguments have been given to function"
+#define GD_MSG_GARBAGE_ARGS_STR "Garbage args received"
+#define GD_MSG_BRICK_SUBVOL_VERIFY_FAIL_STR "Brick's subvol verification fail"
+#define GD_MSG_REMOVE_ARBITER_BRICK_STR "Failed to remove arbiter bricks"
+#define GD_MSG_DICT_GET_FAILED_STR "Dict get failed"
+#define GD_MSG_DICT_SET_FAILED_STR "Dict set failed"
+#define GD_MSG_BRICK_NOT_FOUND_STR "Brick not found in volume"
+#define GD_MSG_BRICK_NOT_DECOM_STR "Brick is not decommissioned"
+#define GD_MSG_BRICK_STOPPED_STR "Found stopped brick"
+#define GD_MSG_BRICK_DEAD_STR "Found dead brick"
+#define GD_MSG_BRICK_HOST_NOT_FOUND_STR                                        \
+    "Host node of the brick is not a part of cluster"
+#define GD_MSG_BRICK_HOST_DOWN_STR "Host node of the brick is down"
+#define GD_MSG_BRICK_DELETE_STR                                                \
+    "Deleting all the bricks of the volume is not allowed"
+#define GD_MSG_BRICK_NO_REMOVE_CMD_STR "No remove-brick command issued"
+#define GD_MSG_INCORRECT_BRICK_STR "Incorrect brick for volume"
+#define GD_MSG_MIGRATION_PROG_STR "Migration is in progress"
+#define GD_MSG_MIGRATION_FAIL_STR "Migration has failed"
+#define GD_MSG_XLATOR_NOT_DEFINED_STR "Xlator not defined"
+#define GD_MSG_DICT_CREATE_FAIL_STR "Failed to create dictionary"
+#define GD_MSG_COPY_FAIL_STR "Failed to copy"
+#define GD_MSG_UUID_GET_FAIL_STR "Failed to get the uuid of local glusterd"
+#define GD_MSG_GEO_REP_START_FAILED_STR "Georep start failed for volume"
+#define GD_MSG_REALPATH_GET_FAIL_STR "Failed to get realpath"
+#define GD_MSG_FILE_NOT_FOUND_STR "File not found in directory"
+#define GD_MSG_SRC_FILE_ERROR_STR "Error in source file"
+#define GD_MSG_DICT_UNSERIALIZE_FAIL_STR "Failed to unserialize dict"
+#define GD_MSG_VOL_ID_SET_FAIL_STR "Failed to set volume id"
+#define GD_MSG_ARBITER_BRICK_SET_INFO_FAIL_STR                                 \
+    "Failed to add arbiter info to brick"
+#define GD_MSG_NO_MEMORY_STR "Out of memory"
+#define GD_MSG_GLUSTERD_UMOUNT_FAIL_STR "Failed to unmount path"
+#define GD_MSG_PEER_ADD_FAIL_STR "Failed to add new peer"
+#define GD_MSG_BRICK_GET_INFO_FAIL_STR "Failed to get brick info"
+#define GD_MSG_STRCHR_FAIL_STR "Failed to get the character"
+#define GD_MSG_SPLIT_FAIL_STR "Failed to split"
+#define GD_MSG_VOLINFO_GET_FAIL_STR "Failed to get volinfo"
+#define GD_MSG_PEER_NOT_FOUND_STR "Failed to find peer info"
+#define GD_MSG_DICT_COPY_FAIL_STR "Failed to copy values from dictionary"
+#define GD_MSG_ALLOC_AND_COPY_UUID_FAIL_STR                                    \
+    "Failed to allocate memory or copy uuid"
+#define GD_MSG_VOL_NOT_FOUND_STR "Volume not found"
+#define GD_MSG_PEER_DISCONNECTED_STR "Peer is disconnected"
+#define GD_MSG_QUOTA_GET_STAT_FAIL_STR "Failed to get quota status"
+#define GD_MSG_SNAP_STATUS_FAIL_STR "Failed to get status of snapd"
+#define GD_MSG_VALIDATE_FAILED_STR "Failed to validate volume"
+#define GD_MSG_VOL_NOT_STARTED_STR "Volume is not started"
+#define GD_MSG_VOL_SHD_NOT_COMP_STR "Volume is not Self-heal compatible"
+#define GD_MSG_SELF_HEALD_DISABLED_STR "Self-heal daemon is disabled"
+#define GD_MSG_NFS_GANESHA_DISABLED_STR "NFS server is disabled"
+#define GD_MSG_QUOTA_DISABLED_STR "Quota is disabled"
+#define GD_MSG_BITROT_NOT_RUNNING_STR "Bitrot is not enabled"
+#define GD_MSG_BITROT_NOT_ENABLED_STR "Volume does not have bitrot enabled"
+#define GD_MSG_SNAPD_NOT_RUNNING_STR "Snapd is not enabled"
+#define GD_MSG_STRDUP_FAILED_STR "Strdup operation failed"
+#define GD_MSG_QUORUM_CLUSTER_COUNT_GET_FAIL_STR                               \
+    "Failed to get quorum cluster counts"
+#define GD_MSG_GLUSTER_SERVICE_START_FAIL_STR "Failed to start glusterd service"
+#define GD_MSG_PEER_ADDRESS_GET_FAIL_STR "Failed to get the address of peer"
+#define GD_MSG_INVALID_SLAVE_STR "Volume is not a slave volume"
+#define GD_MSG_BRICK_NOT_RUNNING_STR "One or more bricks are not running"
+#define GD_MSG_BRK_MNTPATH_GET_FAIL_STR "Failed to get brick mount device"
+#define GD_MSG_SNAPSHOT_NOT_THIN_PROVISIONED_STR                               \
+    "Snapshot is supported only for thin provisioned LV."
+#define GD_MSG_SNAP_DEVICE_NAME_GET_FAIL_STR                                   \
+    "Failed to copy snapshot device name"
+#define GD_MSG_SNAP_NOT_FOUND_STR "Snapshot does not exist"
+#define GD_MSG_CREATE_BRICK_DIR_FAILED_STR "Failed to create brick directory"
+#define GD_MSG_LSTAT_FAIL_STR "Lstat operation failed"
+#define GD_MSG_DIR_OP_FAILED_STR                                               \
+    "The provided path is already present. It is not a directory"
+#define GD_MSG_BRICK_CREATION_FAIL_STR                                         \
+    "Brick isn't allowed to be created inside glusterd's working directory."
+#define GD_MSG_BRICK_CREATE_ROOT_STR                                           \
+    "The brick is being created in the root partition. It is recommended "     \
+    "that you don't use the system's root partition for storage backend."
+#define GD_MSG_BRICK_CREATE_MNTPNT_STR                                         \
+    "The brick is a mount point. Please create a sub-directory under the "     \
+    "mount point and use that as the brick directory."
+#define GD_MSG_CREATE_GLUSTER_DIR_FAILED_STR                                   \
+    "Failed to create glusterfs directory"
+#define GD_MSG_VOLINFO_IMPORT_FAIL_STR "Volume is not yet imported"
+#define GD_MSG_BRICK_SET_INFO_FAIL_STR                                         \
+    "Failed to add brick mount details to dict"
+#define GD_MSG_SET_XATTR_BRICK_FAIL_STR                                        \
+    "Glusterfs is not supported on brick. Setting extended attribute failed"
+#define GD_MSG_SET_XATTR_FAIL_STR "Failed to set extended attribute"
+#define GD_MSG_REMOVE_XATTR_FAIL_STR "Failed to remove extended attribute"
+#define GD_MSG_XLATOR_SET_OPT_FAIL_STR "Failed to set xlator type"
+#define GD_MSG_XLATOR_LINK_FAIL_STR                                            \
+    "Failed to do the link of xlator with children"
+#define GD_MSG_READ_ERROR_STR "Failed to read directory"
+#define GD_MSG_INCOMPATIBLE_VALUE_STR "Incompatible transport type"
+#define GD_MSG_VOL_STOP_ARGS_GET_FAILED_STR "Failed to get volume stop args"
+#define GD_MSG_FRAME_CREATE_FAIL_STR "Failed to create frame"
+#define GD_MSG_VOLUME_NOT_IMPORTED_STR "Volume has not been imported"
+#define GD_MSG_ADD_BRICK_MNT_INFO_FAIL_STR                                     \
+    "Failed to add brick mount details to dict"
+#define GD_MSG_GET_MNT_ENTRY_INFO_FAIL_STR "Failed to get mount entry details"
+#define GD_MSG_BRICKPATH_ROOT_GET_FAIL_STR "Failed to get brick root details"
+#define GD_MSG_VOL_INFO_REQ_RECVD_STR "Received get volume info req"
+#define GD_MSG_NO_FLAG_SET_STR "No flags set"
+#define GD_MSG_CREATE_DIR_FAILED_STR "Failed to create directory"
+#define GD_MSG_POST_HOOK_STUB_INIT_FAIL_STR                                    \
+    "Failed to initialize post hooks stub"
+#define GD_MSG_FILE_OP_FAILED_STR "File operation failed"
+#define GD_MSG_INODE_SIZE_GET_FAIL_STR "Failed to get inode size"
+#define GD_MSG_CMD_EXEC_FAIL_STR "Command execution failed"
+#define GD_MSG_XLATOR_CREATE_FAIL_STR "Failed to create xlator"
+#define GD_MSG_CLRCLK_VOL_REQ_RCVD_STR "Received clear-locks request for volume"
+#define GD_MSG_BRK_PORT_NUM_GET_FAIL_STR                                       \
+    "Couldn't get port number of local bricks"
+#define GD_MSG_CLRLOCKS_MOUNTDIR_CREATE_FAIL_STR                               \
+    "Creating mount directory for clear-locks failed"
+#define GD_MSG_CLRLOCKS_CLNT_MOUNT_FAIL_STR                                    \
+    "Failed to mount clear-locks maintenance client"
+#define GD_MSG_CLRLOCKS_CLNT_UMOUNT_FAIL_STR                                   \
+    "Failed to unmount clear-locks mount point"
+#define GD_MSG_CLRCLK_SND_CMD_FAIL_STR "Failed to send command for clear-locks"
+#define GD_MSG_DICT_ALLOC_AND_SERL_LENGTH_GET_FAIL_STR                         \
+    "Failed to allocate memory or get serialized length of dict"
+#define GD_MSG_GET_XATTR_FAIL_STR "Failed to get extended attribute"
+
+#endif /* !_GLUSTERD_MESSAGES_H_ */
diff --git a/xlators/mgmt/glusterd/src/glusterd-mgmt-handler.c b/xlators/mgmt/glusterd/src/glusterd-mgmt-handler.c
new file mode 100644
index 00000000000..1069688a89d
--- /dev/null
+++ b/xlators/mgmt/glusterd/src/glusterd-mgmt-handler.c
@@ -0,0 +1,1144 @@
+/*
+   Copyright (c) 2013-2014 Red Hat, Inc. <http://www.redhat.com>
+   This file is part of GlusterFS.
+
+   This file is licensed to you under your choice of the GNU Lesser
+   General Public License, version 3 or any later version (LGPLv3 or
+   later), or the GNU General Public License, version 2 (GPLv2), in all
+   cases as published by the Free Software Foundation.
+*/
+/* rpc related syncops */
+#include "rpc-clnt.h"
+#include "protocol-common.h"
+#include "xdr-generic.h"
+#include "glusterd1-xdr.h"
+#include "glusterd-syncop.h"
+
+#include "glusterd.h"
+#include "glusterd-utils.h"
+#include "glusterd-locks.h"
+#include "glusterd-mgmt.h"
+#include "glusterd-op-sm.h"
+#include "glusterd-messages.h"
+
+static int
+glusterd_mgmt_v3_null(rpcsvc_request_t *req)
+{
+    return 0;
+}
+
+static int
+glusterd_mgmt_v3_lock_send_resp(rpcsvc_request_t *req, int32_t status,
+                                uint32_t op_errno)
+{
+    gd1_mgmt_v3_lock_rsp rsp = {
+        {0},
+    };
+    int ret = -1;
+    xlator_t *this = NULL;
+
+    this = THIS;
+    GF_ASSERT(this);
+    GF_ASSERT(req);
+
+    rsp.op_ret = status;
+    if (rsp.op_ret)
+        rsp.op_errno = op_errno;
+
+    glusterd_get_uuid(&rsp.uuid);
+
+    ret = glusterd_submit_reply(req, &rsp, NULL, 0, NULL,
+                                (xdrproc_t)xdr_gd1_mgmt_v3_lock_rsp);
+
+    gf_msg_debug(this->name, 0, "Responded to mgmt_v3 lock, ret: %d", ret);
+
+    return ret;
+}
+
+static int
+glusterd_synctasked_mgmt_v3_lock(rpcsvc_request_t *req,
+                                 gd1_mgmt_v3_lock_req *lock_req,
+                                 glusterd_op_lock_ctx_t *ctx)
+{
+    int32_t ret = -1;
+    xlator_t *this = NULL;
+    uint32_t op_errno = 0;
+
+    this = THIS;
+    GF_ASSERT(this);
+    GF_ASSERT(req);
+    GF_ASSERT(ctx);
+    GF_ASSERT(ctx->dict);
+
+    /* Trying to acquire multiple mgmt_v3 locks */
+    ret = glusterd_multiple_mgmt_v3_lock(ctx->dict, ctx->uuid, &op_errno);
+    if (ret)
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_MGMTV3_LOCK_GET_FAIL,
+               "Failed to acquire mgmt_v3 locks for %s", uuid_utoa(ctx->uuid));
+
+    ret = glusterd_mgmt_v3_lock_send_resp(req, ret, op_errno);
+
+    gf_msg_trace(this->name, 0, "Returning %d", ret);
+    return ret;
+}
+
+static int
+glusterd_op_state_machine_mgmt_v3_lock(rpcsvc_request_t *req,
+                                       gd1_mgmt_v3_lock_req *lock_req,
+                                       glusterd_op_lock_ctx_t *ctx)
+{
+    int32_t ret = -1;
+    xlator_t *this = NULL;
+    glusterd_op_info_t txn_op_info = {
+        {0},
+    };
+
+    this = THIS;
+    GF_ASSERT(this);
+    GF_ASSERT(req);
+
+    glusterd_txn_opinfo_init(&txn_op_info, NULL, &lock_req->op, ctx->dict, req);
+
+    ret = glusterd_set_txn_opinfo(&lock_req->txn_id, &txn_op_info);
+    if (ret) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_OPINFO_SET_FAIL,
+               "Unable to set transaction's opinfo");
+        goto out;
+    }
+
+    ret = glusterd_op_sm_inject_event(GD_OP_EVENT_LOCK, &lock_req->txn_id, ctx);
+    if (ret)
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_OP_EVENT_LOCK_FAIL,
+               "Failed to inject event GD_OP_EVENT_LOCK");
+
+out:
+    glusterd_friend_sm();
+    glusterd_op_sm();
+
+    gf_msg_trace(this->name, 0, "Returning %d", ret);
+    return ret;
+}
+
+static int
+glusterd_handle_mgmt_v3_lock_fn(rpcsvc_request_t *req)
+{
+    gd1_mgmt_v3_lock_req lock_req = {
+        {0},
+    };
+    int32_t ret = -1;
+    glusterd_op_lock_ctx_t *ctx = NULL;
+    xlator_t *this = NULL;
+    gf_boolean_t is_synctasked = _gf_false;
+    gf_boolean_t free_ctx = _gf_false;
+    glusterd_conf_t *conf = NULL;
+    uint32_t timeout = 0;
+
+    this = THIS;
+    GF_ASSERT(this);
+    conf = this->private;
+    GF_ASSERT(conf);
+    GF_ASSERT(req);
+
+    ret = xdr_to_generic(req->msg[0], &lock_req,
+                         (xdrproc_t)xdr_gd1_mgmt_v3_lock_req);
+    if (ret < 0) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_REQ_DECODE_FAIL,
+               "Failed to decode lock "
+               "request received from peer");
+        req->rpc_err = GARBAGE_ARGS;
+        goto out;
+    }
+
+    gf_msg_debug(this->name, 0,
+                 "Received mgmt_v3 lock req "
+                 "from uuid: %s",
+                 uuid_utoa(lock_req.uuid));
+
+    if (glusterd_peerinfo_find_by_uuid(lock_req.uuid) == NULL) {
+        gf_msg(this->name, GF_LOG_WARNING, 0, GD_MSG_PEER_NOT_FOUND,
+               "%s doesn't "
+               "belong to the cluster. Ignoring request.",
+               uuid_utoa(lock_req.uuid));
+        ret = -1;
+        goto out;
+    }
+
+    ctx = GF_CALLOC(1, sizeof(*ctx), gf_gld_mt_op_lock_ctx_t);
+    if (!ctx) {
+        gf_smsg(this->name, GF_LOG_ERROR, errno, GD_MSG_NO_MEMORY, NULL);
+        ret = -1;
+        goto out;
+    }
+
+    gf_uuid_copy(ctx->uuid, lock_req.uuid);
+    ctx->req = req;
+
+    ctx->dict = dict_new();
+    if (!ctx->dict) {
+        gf_smsg(this->name, GF_LOG_ERROR, errno, GD_MSG_DICT_CREATE_FAIL, NULL);
+        ret = -1;
+        goto out;
+    }
+
+    ret = dict_unserialize(lock_req.dict.dict_val, lock_req.dict.dict_len,
+                           &ctx->dict);
+    if (ret) {
+        gf_smsg(this->name, GF_LOG_WARNING, 0, GD_MSG_DICT_UNSERIALIZE_FAIL,
+                NULL);
+        goto out;
+    }
+
+    /* The CLI adds a "timeout" key to the dict only when the requested
+     * timeout differs from the 2-minute default. When the key is present,
+     * mgmt_v3_lock_timeout is set to that value plus 120 seconds;
+     * otherwise (the lookup fails) the currently configured value is
+     * left untouched. */
+    ret = dict_get_uint32(ctx->dict, "timeout", &timeout);
+    if (!ret)
+        conf->mgmt_v3_lock_timeout = timeout + 120;
+
+    is_synctasked = dict_get_str_boolean(ctx->dict, "is_synctasked", _gf_false);
+    if (is_synctasked) {
+        ret = glusterd_synctasked_mgmt_v3_lock(req, &lock_req, ctx);
+        if (ret) {
+            gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_MGMTV3_LOCK_GET_FAIL,
+                   "Failed to acquire mgmt_v3_locks");
+            /* Ignore the return code, as it shouldn't be propagated
+             * from the handler function so as to avoid double
+             * deletion of the req
+             */
+            ret = 0;
+        }
+
+        /* The above function does not take ownership of ctx.
+         * Therefore we need to free the ctx explicitly. */
+        free_ctx = _gf_true;
+    } else {
+        /* Shouldn't ignore the return code here, and it should
+         * be propagated from the handler function as in failure
+         * case it doesn't delete the req object
+         */
+        ret = glusterd_op_state_machine_mgmt_v3_lock(req, &lock_req, ctx);
+        if (ret)
+            gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_MGMTV3_LOCK_GET_FAIL,
+                   "Failed to acquire mgmt_v3_locks");
+    }
+
+out:
+
+    if (ctx && (ret || free_ctx)) {
+        if (ctx->dict)
+            dict_unref(ctx->dict);
+
+        GF_FREE(ctx);
+    }
+
+    free(lock_req.dict.dict_val);
+
+    gf_msg_trace(this->name, 0, "Returning %d", ret);
+    return ret;
+}
+
+static int
+glusterd_mgmt_v3_pre_validate_send_resp(rpcsvc_request_t *req, int32_t op,
+                                        int32_t status, char *op_errstr,
+                                        dict_t *rsp_dict, uint32_t op_errno)
+{
+    gd1_mgmt_v3_pre_val_rsp rsp = {
+        {0},
+    };
+    int ret = -1;
+    xlator_t *this = NULL;
+
+    this = THIS;
+    GF_ASSERT(this);
+    GF_ASSERT(req);
+
+    rsp.op_ret = status;
+    glusterd_get_uuid(&rsp.uuid);
+    rsp.op = op;
+    rsp.op_errno = op_errno;
+    if (op_errstr)
+        rsp.op_errstr = op_errstr;
+    else
+        rsp.op_errstr = "";
+
+    ret = dict_allocate_and_serialize(rsp_dict, &rsp.dict.dict_val,
+                                      &rsp.dict.dict_len);
+    if (ret < 0) {
+        gf_smsg(this->name, GF_LOG_ERROR, 0,
+                GD_MSG_DICT_ALLOC_AND_SERL_LENGTH_GET_FAIL, NULL);
+        goto out;
+    }
+
+    ret = glusterd_submit_reply(req, &rsp, NULL, 0, NULL,
+                                (xdrproc_t)xdr_gd1_mgmt_v3_pre_val_rsp);
+
+    GF_FREE(rsp.dict.dict_val);
+out:
+    gf_msg_debug(this->name, 0, "Responded to pre validation, ret: %d", ret);
+    return ret;
+}
+
+static int
+glusterd_handle_pre_validate_fn(rpcsvc_request_t *req)
+{
+    int32_t ret = -1;
+    gd1_mgmt_v3_pre_val_req op_req = {
+        {0},
+    };
+    xlator_t *this = NULL;
+    char *op_errstr = NULL;
+    dict_t *dict = NULL;
+    dict_t *rsp_dict = NULL;
+    uint32_t op_errno = 0;
+
+    this = THIS;
+    GF_ASSERT(this);
+    GF_ASSERT(req);
+
+    ret = xdr_to_generic(req->msg[0], &op_req,
+                         (xdrproc_t)xdr_gd1_mgmt_v3_pre_val_req);
+    if (ret < 0) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_REQ_DECODE_FAIL,
+               "Failed to decode pre validation "
+               "request received from peer");
+        req->rpc_err = GARBAGE_ARGS;
+        goto out;
+    }
+
+    if (glusterd_peerinfo_find_by_uuid(op_req.uuid) == NULL) {
+        gf_msg(this->name, GF_LOG_WARNING, 0, GD_MSG_PEER_NOT_FOUND,
+               "%s doesn't "
+               "belong to the cluster. Ignoring request.",
+               uuid_utoa(op_req.uuid));
+        ret = -1;
+        goto out;
+    }
+
+    dict = dict_new();
+    if (!dict) {
+        gf_smsg(this->name, GF_LOG_ERROR, 0, GD_MSG_DICT_CREATE_FAIL, NULL);
+        goto out;
+    }
+
+    ret = dict_unserialize(op_req.dict.dict_val, op_req.dict.dict_len, &dict);
+    if (ret) {
+        gf_smsg(this->name, GF_LOG_WARNING, 0, GD_MSG_DICT_UNSERIALIZE_FAIL,
+                NULL);
+        goto out;
+    }
+
+    rsp_dict = dict_new();
+    if (!rsp_dict) {
+        gf_smsg(this->name, GF_LOG_ERROR, 0, GD_MSG_DICT_CREATE_FAIL, NULL);
+        goto out;
+    }
+
+    ret = gd_mgmt_v3_pre_validate_fn(op_req.op, dict, &op_errstr, rsp_dict,
+                                     &op_errno);
+    if (ret) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_PRE_VALIDATION_FAIL,
+               "Pre Validation failed on operation %s", gd_op_list[op_req.op]);
+    }
+
+    ret = glusterd_mgmt_v3_pre_validate_send_resp(
+        req, op_req.op, ret, op_errstr, rsp_dict, op_errno);
+    if (ret) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_MGMTV3_OP_RESP_FAIL,
+               "Failed to send Pre Validation "
+               "response for operation %s",
+               gd_op_list[op_req.op]);
+        goto out;
+    }
+
+out:
+    if (op_errstr && (strcmp(op_errstr, "")))
+        GF_FREE(op_errstr);
+
+    free(op_req.dict.dict_val);
+
+    if (dict)
+        dict_unref(dict);
+
+    if (rsp_dict)
+        dict_unref(rsp_dict);
+
+    /* Return 0 from handler to avoid double deletion of req obj */
+    return 0;
+}
+
+static int
+glusterd_mgmt_v3_brick_op_send_resp(rpcsvc_request_t *req, int32_t op,
+                                    int32_t status, char *op_errstr,
+                                    dict_t *rsp_dict)
+{
+    gd1_mgmt_v3_brick_op_rsp rsp = {
+        {0},
+    };
+    int ret = -1;
+    xlator_t *this = NULL;
+
+    this = THIS;
+    GF_ASSERT(this);
+    GF_ASSERT(req);
+
+    rsp.op_ret = status;
+    glusterd_get_uuid(&rsp.uuid);
+    rsp.op = op;
+    if (op_errstr)
+        rsp.op_errstr = op_errstr;
+    else
+        rsp.op_errstr = "";
+
+    ret = dict_allocate_and_serialize(rsp_dict, &rsp.dict.dict_val,
+                                      &rsp.dict.dict_len);
+    if (ret < 0) {
+        gf_smsg(this->name, GF_LOG_ERROR, 0,
+                GD_MSG_DICT_ALLOC_AND_SERL_LENGTH_GET_FAIL, NULL);
+        goto out;
+    }
+
+    ret = glusterd_submit_reply(req, &rsp, NULL, 0, NULL,
+                                (xdrproc_t)xdr_gd1_mgmt_v3_brick_op_rsp);
+
+    GF_FREE(rsp.dict.dict_val);
+out:
+    gf_msg_debug(this->name, 0, "Responded to brick op, ret: %d", ret);
+    return ret;
+}
+
+static int
+glusterd_handle_brick_op_fn(rpcsvc_request_t *req)
+{
+    int32_t ret = -1;
+    gd1_mgmt_v3_brick_op_req op_req = {
+        {0},
+    };
+    xlator_t *this = NULL;
+    char *op_errstr = NULL;
+    dict_t *dict = NULL;
+    dict_t *rsp_dict = NULL;
+
+    this = THIS;
+    GF_ASSERT(this);
+    GF_ASSERT(req);
+
+    ret = xdr_to_generic(req->msg[0], &op_req,
+                         (xdrproc_t)xdr_gd1_mgmt_v3_brick_op_req);
+    if (ret < 0) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_REQ_DECODE_FAIL,
+               "Failed to decode brick op "
+               "request received from peer");
+        req->rpc_err = GARBAGE_ARGS;
+        goto out;
+    }
+
+    if (glusterd_peerinfo_find_by_uuid(op_req.uuid) == NULL) {
+        gf_msg(this->name, GF_LOG_WARNING, 0, GD_MSG_PEER_NOT_FOUND,
+               "%s doesn't "
+               "belong to the cluster. Ignoring request.",
+               uuid_utoa(op_req.uuid));
+        ret = -1;
+        goto out;
+    }
+
+    dict = dict_new();
+    if (!dict) {
+        gf_smsg(this->name, GF_LOG_ERROR, 0, GD_MSG_DICT_CREATE_FAIL, NULL);
+        goto out;
+    }
+
+    ret = dict_unserialize(op_req.dict.dict_val, op_req.dict.dict_len, &dict);
+    if (ret) {
+        gf_smsg(this->name, GF_LOG_WARNING, 0, GD_MSG_DICT_UNSERIALIZE_FAIL,
+                NULL);
+        goto out;
+    }
+
+    rsp_dict = dict_new();
+    if (!rsp_dict) {
+        gf_smsg(this->name, GF_LOG_ERROR, 0, GD_MSG_DICT_CREATE_FAIL, NULL);
+        goto out;
+    }
+
+    ret = gd_mgmt_v3_brick_op_fn(op_req.op, dict, &op_errstr, rsp_dict);
+
+    if (ret) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_BRICK_OP_FAIL,
+               "Brick Op failed on operation %s", gd_op_list[op_req.op]);
+    }
+
+    ret = glusterd_mgmt_v3_brick_op_send_resp(req, op_req.op, ret, op_errstr,
+                                              rsp_dict);
+    if (ret) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_PRE_VALD_RESP_FAIL,
+               "Failed to send brick op "
+               "response for operation %s",
+               gd_op_list[op_req.op]);
+        goto out;
+    }
+
+out:
+    if (op_errstr && (strcmp(op_errstr, "")))
+        GF_FREE(op_errstr);
+
+    free(op_req.dict.dict_val);
+
+    if (dict)
+        dict_unref(dict);
+
+    if (rsp_dict)
+        dict_unref(rsp_dict);
+
+    /* Return 0 from handler to avoid double deletion of req obj */
+    return 0;
+}
+
+static int
+glusterd_mgmt_v3_commit_send_resp(rpcsvc_request_t *req, int32_t op,
+                                  int32_t status, char *op_errstr,
+                                  uint32_t op_errno, dict_t *rsp_dict)
+{
+    gd1_mgmt_v3_commit_rsp rsp = {
+        {0},
+    };
+    int ret = -1;
+    xlator_t *this = NULL;
+
+    this = THIS;
+    GF_ASSERT(this);
+    GF_ASSERT(req);
+
+    rsp.op_ret = status;
+    glusterd_get_uuid(&rsp.uuid);
+    rsp.op = op;
+    rsp.op_errno = op_errno;
+    if (op_errstr)
+        rsp.op_errstr = op_errstr;
+    else
+        rsp.op_errstr = "";
+
+    ret = dict_allocate_and_serialize(rsp_dict, &rsp.dict.dict_val,
+                                      &rsp.dict.dict_len);
+    if (ret < 0) {
+        gf_smsg(this->name, GF_LOG_ERROR, 0,
+                GD_MSG_DICT_ALLOC_AND_SERL_LENGTH_GET_FAIL, NULL);
+        goto out;
+    }
+
+    ret = glusterd_submit_reply(req, &rsp, NULL, 0, NULL,
+                                (xdrproc_t)xdr_gd1_mgmt_v3_commit_rsp);
+
+    GF_FREE(rsp.dict.dict_val);
+out:
+    gf_msg_debug(this->name, 0, "Responded to commit, ret: %d", ret);
+    return ret;
+}
+
+static int
+glusterd_handle_commit_fn(rpcsvc_request_t *req)
+{
+    int32_t ret = -1;
+    gd1_mgmt_v3_commit_req op_req = {
+        {0},
+    };
+    xlator_t *this = NULL;
+    char *op_errstr = NULL;
+    dict_t *dict = NULL;
+    dict_t *rsp_dict = NULL;
+    uint32_t op_errno = 0;
+
+    this = THIS;
+    GF_ASSERT(this);
+    GF_ASSERT(req);
+
+    ret = xdr_to_generic(req->msg[0], &op_req,
+                         (xdrproc_t)xdr_gd1_mgmt_v3_commit_req);
+    if (ret < 0) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_REQ_DECODE_FAIL,
+               "Failed to decode commit "
+               "request received from peer");
+        req->rpc_err = GARBAGE_ARGS;
+        goto out;
+    }
+
+    if (glusterd_peerinfo_find_by_uuid(op_req.uuid) == NULL) {
+        gf_msg(this->name, GF_LOG_WARNING, 0, GD_MSG_PEER_NOT_FOUND,
+               "%s doesn't "
+               "belong to the cluster. Ignoring request.",
+               uuid_utoa(op_req.uuid));
+        ret = -1;
+        goto out;
+    }
+
+    dict = dict_new();
+    if (!dict) {
+        gf_smsg(this->name, GF_LOG_ERROR, 0, GD_MSG_DICT_CREATE_FAIL, NULL);
+        goto out;
+    }
+
+    ret = dict_unserialize(op_req.dict.dict_val, op_req.dict.dict_len, &dict);
+    if (ret) {
+        gf_smsg(this->name, GF_LOG_WARNING, 0, GD_MSG_DICT_UNSERIALIZE_FAIL,
+                NULL);
+        goto out;
+    }
+
+    rsp_dict = dict_new();
+    if (!rsp_dict) {
+        gf_smsg(this->name, GF_LOG_ERROR, 0, GD_MSG_DICT_CREATE_FAIL, NULL);
+        goto out;
+    }
+
+    ret = gd_mgmt_v3_commit_fn(op_req.op, dict, &op_errstr, &op_errno,
+                               rsp_dict);
+
+    if (ret) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_COMMIT_OP_FAIL,
+               "commit failed on operation %s", gd_op_list[op_req.op]);
+    }
+
+    ret = glusterd_mgmt_v3_commit_send_resp(req, op_req.op, ret, op_errstr,
+                                            op_errno, rsp_dict);
+    if (ret) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_MGMTV3_OP_RESP_FAIL,
+               "Failed to send commit "
+               "response for operation %s",
+               gd_op_list[op_req.op]);
+        goto out;
+    }
+
+out:
+    if (op_errstr && (strcmp(op_errstr, "")))
+        GF_FREE(op_errstr);
+
+    free(op_req.dict.dict_val);
+
+    if (dict)
+        dict_unref(dict);
+
+    if (rsp_dict)
+        dict_unref(rsp_dict);
+
+    /* Return 0 from handler to avoid double deletion of req obj */
+    return 0;
+}
+
+static int
+glusterd_mgmt_v3_post_commit_send_resp(rpcsvc_request_t *req, int32_t op,
+                                       int32_t status, char *op_errstr,
+                                       uint32_t op_errno, dict_t *rsp_dict)
+{
+    gd1_mgmt_v3_post_commit_rsp rsp = {
+        {0},
+    };
+    int ret = -1;
+    xlator_t *this = NULL;
+
+    this = THIS;
+    GF_ASSERT(this);
+    GF_ASSERT(req);
+
+    rsp.op_ret = status;
+    glusterd_get_uuid(&rsp.uuid);
+    rsp.op = op;
+    rsp.op_errno = op_errno;
+    if (op_errstr)
+        rsp.op_errstr = op_errstr;
+    else
+        rsp.op_errstr = "";
+
+    ret = dict_allocate_and_serialize(rsp_dict, &rsp.dict.dict_val,
+                                      &rsp.dict.dict_len);
+    if (ret < 0) {
+        gf_smsg(this->name, GF_LOG_ERROR, 0,
+                GD_MSG_DICT_ALLOC_AND_SERL_LENGTH_GET_FAIL, NULL);
+        goto out;
+    }
+
+    ret = glusterd_submit_reply(req, &rsp, NULL, 0, NULL,
+                                (xdrproc_t)xdr_gd1_mgmt_v3_post_commit_rsp);
+
+    GF_FREE(rsp.dict.dict_val);
+out:
+    gf_msg_debug(this->name, 0, "Responded to post commit, ret: %d", ret);
+    return ret;
+}
+
+static int
+glusterd_handle_post_commit_fn(rpcsvc_request_t *req)
+{
+    int32_t ret = -1;
+    gd1_mgmt_v3_post_commit_req op_req = {
+        {0},
+    };
+    xlator_t *this = NULL;
+    char *op_errstr = NULL;
+    dict_t *dict = NULL;
+    dict_t *rsp_dict = NULL;
+    uint32_t op_errno = 0;
+
+    this = THIS;
+    GF_ASSERT(this);
+    GF_ASSERT(req);
+
+    ret = xdr_to_generic(req->msg[0], &op_req,
+                         (xdrproc_t)xdr_gd1_mgmt_v3_post_commit_req);
+    if (ret < 0) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_REQ_DECODE_FAIL,
+               "Failed to decode post commit "
+               "request received from peer");
+        req->rpc_err = GARBAGE_ARGS;
+        goto out;
+    }
+
+    if (glusterd_peerinfo_find_by_uuid(op_req.uuid) == NULL) {
+        gf_msg(this->name, GF_LOG_WARNING, 0, GD_MSG_PEER_NOT_FOUND,
+               "%s doesn't "
+               "belong to the cluster. Ignoring request.",
+               uuid_utoa(op_req.uuid));
+        ret = -1;
+        goto out;
+    }
+
+    dict = dict_new();
+    if (!dict) {
+        gf_smsg(this->name, GF_LOG_ERROR, 0, GD_MSG_DICT_CREATE_FAIL, NULL);
+        goto out;
+    }
+
+    ret = dict_unserialize(op_req.dict.dict_val, op_req.dict.dict_len, &dict);
+    if (ret) {
+        gf_smsg(this->name, GF_LOG_WARNING, 0, GD_MSG_DICT_UNSERIALIZE_FAIL,
+                NULL);
+        goto out;
+    }
+
+    rsp_dict = dict_new();
+    if (!rsp_dict) {
+        gf_smsg(this->name, GF_LOG_ERROR, 0, GD_MSG_DICT_CREATE_FAIL, NULL);
+        goto out;
+    }
+
+    ret = gd_mgmt_v3_post_commit_fn(op_req.op, dict, &op_errstr, &op_errno,
+                                    rsp_dict);
+
+    if (ret) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_POST_COMMIT_OP_FAIL,
+               "post commit failed on operation %s", gd_op_list[op_req.op]);
+    }
+
+    ret = glusterd_mgmt_v3_post_commit_send_resp(req, op_req.op, ret, op_errstr,
+                                                 op_errno, rsp_dict);
+    if (ret) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_MGMTV3_OP_RESP_FAIL,
+               "Failed to send post commit "
+               "response for operation %s",
+               gd_op_list[op_req.op]);
+        goto out;
+    }
+
+out:
+    if (op_errstr && (strcmp(op_errstr, "")))
+        GF_FREE(op_errstr);
+
+    free(op_req.dict.dict_val);
+
+    if (dict)
+        dict_unref(dict);
+
+    if (rsp_dict)
+        dict_unref(rsp_dict);
+
+    /* Return 0 from handler to avoid double deletion of req obj */
+    return 0;
+}
+
+static int
+glusterd_mgmt_v3_post_validate_send_resp(rpcsvc_request_t *req, int32_t op,
+                                         int32_t status, char *op_errstr,
+                                         dict_t *rsp_dict)
+{
+    gd1_mgmt_v3_post_val_rsp rsp = {
+        {0},
+    };
+    int ret = -1;
+    xlator_t *this = NULL;
+
+    this = THIS;
+    GF_ASSERT(this);
+    GF_ASSERT(req);
+
+    rsp.op_ret = status;
+    glusterd_get_uuid(&rsp.uuid);
+    rsp.op = op;
+    if (op_errstr)
+        rsp.op_errstr = op_errstr;
+    else
+        rsp.op_errstr = "";
+
+    ret = dict_allocate_and_serialize(rsp_dict, &rsp.dict.dict_val,
+                                      &rsp.dict.dict_len);
+    if (ret < 0) {
+        gf_smsg(this->name, GF_LOG_ERROR, 0,
+                GD_MSG_DICT_ALLOC_AND_SERL_LENGTH_GET_FAIL, NULL);
+        goto out;
+    }
+
+    ret = glusterd_submit_reply(req, &rsp, NULL, 0, NULL,
+                                (xdrproc_t)xdr_gd1_mgmt_v3_post_val_rsp);
+
+    GF_FREE(rsp.dict.dict_val);
+out:
+    gf_msg_debug(this->name, 0, "Responded to post validation, ret: %d", ret);
+    return ret;
+}
+
+static int
+glusterd_handle_post_validate_fn(rpcsvc_request_t *req)
+{
+    int32_t ret = -1;
+    gd1_mgmt_v3_post_val_req op_req = {
+        {0},
+    };
+    xlator_t *this = NULL;
+    char *op_errstr = NULL;
+    dict_t *dict = NULL;
+    dict_t *rsp_dict = NULL;
+
+    this = THIS;
+    GF_ASSERT(this);
+    GF_ASSERT(req);
+
+    ret = xdr_to_generic(req->msg[0], &op_req,
+                         (xdrproc_t)xdr_gd1_mgmt_v3_post_val_req);
+    if (ret < 0) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_REQ_DECODE_FAIL,
+               "Failed to decode post validation "
+               "request received from peer");
+        req->rpc_err = GARBAGE_ARGS;
+        goto out;
+    }
+
+    if (glusterd_peerinfo_find_by_uuid(op_req.uuid) == NULL) {
+        gf_msg(this->name, GF_LOG_WARNING, 0, GD_MSG_PEER_NOT_FOUND,
+               "%s doesn't "
+               "belong to the cluster. Ignoring request.",
+               uuid_utoa(op_req.uuid));
+        ret = -1;
+        goto out;
+    }
+
+    dict = dict_new();
+    if (!dict) {
+        gf_smsg(this->name, GF_LOG_ERROR, 0, GD_MSG_DICT_CREATE_FAIL, NULL);
+        goto out;
+    }
+
+    ret = dict_unserialize(op_req.dict.dict_val, op_req.dict.dict_len, &dict);
+    if (ret) {
+        gf_smsg(this->name, GF_LOG_WARNING, 0, GD_MSG_DICT_UNSERIALIZE_FAIL,
+                NULL);
+        goto out;
+    }
+
+    rsp_dict = dict_new();
+    if (!rsp_dict) {
+        gf_smsg(this->name, GF_LOG_ERROR, 0, GD_MSG_DICT_CREATE_FAIL, NULL);
+        goto out;
+    }
+
+    ret = gd_mgmt_v3_post_validate_fn(op_req.op, op_req.op_ret, dict,
+                                      &op_errstr, rsp_dict);
+
+    if (ret) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_POST_VALIDATION_FAIL,
+               "Post Validation failed on operation %s", gd_op_list[op_req.op]);
+    }
+
+    ret = glusterd_mgmt_v3_post_validate_send_resp(req, op_req.op, ret,
+                                                   op_errstr, rsp_dict);
+    if (ret) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_MGMTV3_OP_RESP_FAIL,
+               "Failed to send Post Validation "
+               "response for operation %s",
+               gd_op_list[op_req.op]);
+        goto out;
+    }
+
+out:
+    if (op_errstr && (strcmp(op_errstr, "")))
+        GF_FREE(op_errstr);
+
+    free(op_req.dict.dict_val);
+
+    if (dict)
+        dict_unref(dict);
+
+    if (rsp_dict)
+        dict_unref(rsp_dict);
+
+    /* Return 0 from handler to avoid double deletion of req obj */
+    return 0;
+}
+
+static int
+glusterd_mgmt_v3_unlock_send_resp(rpcsvc_request_t *req, int32_t status)
+{
+    gd1_mgmt_v3_unlock_rsp rsp = {
+        {0},
+    };
+    int ret = -1;
+    xlator_t *this = NULL;
+
+    this = THIS;
+    GF_ASSERT(this);
+    GF_ASSERT(req);
+
+    rsp.op_ret = status;
+    if (rsp.op_ret)
+        rsp.op_errno = errno;
+
+    glusterd_get_uuid(&rsp.uuid);
+
+    ret = glusterd_submit_reply(req, &rsp, NULL, 0, NULL,
+                                (xdrproc_t)xdr_gd1_mgmt_v3_unlock_rsp);
+
+    gf_msg_debug(this->name, 0, "Responded to mgmt_v3 unlock, ret: %d", ret);
+
+    return ret;
+}
+
+static int
+glusterd_syctasked_mgmt_v3_unlock(rpcsvc_request_t *req,
+                                  gd1_mgmt_v3_unlock_req *unlock_req,
+                                  glusterd_op_lock_ctx_t *ctx)
+{
+    int32_t ret = -1;
+    xlator_t *this = NULL;
+
+    this = THIS;
+    GF_ASSERT(this);
+    GF_ASSERT(req);
+    GF_ASSERT(ctx);
+
+    /* Trying to release multiple mgmt_v3 locks */
+    ret = glusterd_multiple_mgmt_v3_unlock(ctx->dict, ctx->uuid);
+    if (ret) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_MGMTV3_UNLOCK_FAIL,
+               "Failed to release mgmt_v3 locks for %s", uuid_utoa(ctx->uuid));
+    }
+
+    ret = glusterd_mgmt_v3_unlock_send_resp(req, ret);
+
+    gf_msg_trace(this->name, 0, "Returning %d", ret);
+    return ret;
+}
+
+static int
+glusterd_op_state_machine_mgmt_v3_unlock(rpcsvc_request_t *req,
+                                         gd1_mgmt_v3_unlock_req *lock_req,
+                                         glusterd_op_lock_ctx_t *ctx)
+{
+    int32_t ret = -1;
+    xlator_t *this = NULL;
+
+    this = THIS;
+    GF_ASSERT(this);
+    GF_ASSERT(req);
+
+    ret = glusterd_op_sm_inject_event(GD_OP_EVENT_UNLOCK, &lock_req->txn_id,
+                                      ctx);
+    if (ret)
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_OP_EVENT_UNLOCK_FAIL,
+               "Failed to inject event GD_OP_EVENT_UNLOCK");
+
+    glusterd_friend_sm();
+    glusterd_op_sm();
+
+    gf_msg_trace(this->name, 0, "Returning %d", ret);
+    return ret;
+}
+
+static int
+glusterd_handle_mgmt_v3_unlock_fn(rpcsvc_request_t *req)
+{
+    gd1_mgmt_v3_unlock_req lock_req = {
+        {0},
+    };
+    int32_t ret = -1;
+    glusterd_op_lock_ctx_t *ctx = NULL;
+    xlator_t *this = NULL;
+    gf_boolean_t is_synctasked = _gf_false;
+    gf_boolean_t free_ctx = _gf_false;
+
+    this = THIS;
+    GF_ASSERT(this);
+    GF_ASSERT(req);
+
+    ret = xdr_to_generic(req->msg[0], &lock_req,
+                         (xdrproc_t)xdr_gd1_mgmt_v3_unlock_req);
+    if (ret < 0) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_REQ_DECODE_FAIL,
+               "Failed to decode unlock "
+               "request received from peer");
+        req->rpc_err = GARBAGE_ARGS;
+        goto out;
+    }
+
+    gf_msg_debug(this->name, 0,
+                 "Received volume unlock req "
+                 "from uuid: %s",
+                 uuid_utoa(lock_req.uuid));
+
+    if (glusterd_peerinfo_find_by_uuid(lock_req.uuid) == NULL) {
+        gf_msg(this->name, GF_LOG_WARNING, 0, GD_MSG_PEER_NOT_FOUND,
+               "%s doesn't "
+               "belong to the cluster. Ignoring request.",
+               uuid_utoa(lock_req.uuid));
+        ret = -1;
+        goto out;
+    }
+
+    ctx = GF_CALLOC(1, sizeof(*ctx), gf_gld_mt_op_lock_ctx_t);
+    if (!ctx) {
+        gf_smsg(this->name, GF_LOG_ERROR, 0, GD_MSG_NO_MEMORY, NULL);
+        ret = -1;
+        goto out;
+    }
+
+    gf_uuid_copy(ctx->uuid, lock_req.uuid);
+    ctx->req = req;
+
+    ctx->dict = dict_new();
+    if (!ctx->dict) {
+        gf_smsg(this->name, GF_LOG_ERROR, 0, GD_MSG_DICT_CREATE_FAIL, NULL);
+        ret = -1;
+        goto out;
+    }
+
+    ret = dict_unserialize(lock_req.dict.dict_val, lock_req.dict.dict_len,
+                           &ctx->dict);
+    if (ret) {
+        gf_smsg(this->name, GF_LOG_WARNING, 0, GD_MSG_DICT_UNSERIALIZE_FAIL,
+                NULL);
+        goto out;
+    }
+
+    is_synctasked = dict_get_str_boolean(ctx->dict, "is_synctasked", _gf_false);
+    if (is_synctasked) {
+        ret = glusterd_syctasked_mgmt_v3_unlock(req, &lock_req, ctx);
+        if (ret) {
+            gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_MGMTV3_UNLOCK_FAIL,
+                   "Failed to release mgmt_v3_locks");
+            /* Ignore the return code, as it shouldn't be propagated
+             * from the handler function so as to avoid double
+             * deletion of the req
+             */
+            ret = 0;
+        }
+
+        /* The above function does not take ownership of ctx.
+         * Therefore we need to free the ctx explicitly. */
+        free_ctx = _gf_true;
+    } else {
+        /* Shouldn't ignore the return code here, and it should
+         * be propagated from the handler function as in failure
+         * case it doesn't delete the req object
+         */
+        ret = glusterd_op_state_machine_mgmt_v3_unlock(req, &lock_req, ctx);
+        if (ret)
+            gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_MGMTV3_UNLOCK_FAIL,
+                   "Failed to release mgmt_v3_locks");
+    }
+
+out:
+
+    if (ctx && (ret || free_ctx)) {
+        if (ctx->dict)
+            dict_unref(ctx->dict);
+
+        GF_FREE(ctx);
+    }
+
+    free(lock_req.dict.dict_val);
+
+    gf_msg_trace(this->name, 0, "Returning %d", ret);
+    return ret;
+}
+
+int
+glusterd_handle_mgmt_v3_lock(rpcsvc_request_t *req)
+{
+    return glusterd_big_locked_handler(req, glusterd_handle_mgmt_v3_lock_fn);
+}
+
+static int
+glusterd_handle_pre_validate(rpcsvc_request_t *req)
+{
+    return glusterd_big_locked_handler(req, glusterd_handle_pre_validate_fn);
+}
+
+static int
+glusterd_handle_brick_op(rpcsvc_request_t *req)
+{
+    return glusterd_big_locked_handler(req, glusterd_handle_brick_op_fn);
+}
+
+static int
+glusterd_handle_commit(rpcsvc_request_t *req)
+{
+    return glusterd_big_locked_handler(req, glusterd_handle_commit_fn);
+}
+
+static int
+glusterd_handle_post_commit(rpcsvc_request_t *req)
+{
+    return glusterd_big_locked_handler(req, glusterd_handle_post_commit_fn);
+}
+
+static int
+glusterd_handle_post_validate(rpcsvc_request_t *req)
+{
+    return glusterd_big_locked_handler(req, glusterd_handle_post_validate_fn);
+}
+
+int
+glusterd_handle_mgmt_v3_unlock(rpcsvc_request_t *req)
+{
+    return glusterd_big_locked_handler(req, glusterd_handle_mgmt_v3_unlock_fn);
+}
+
+static rpcsvc_actor_t gd_svc_mgmt_v3_actors[GLUSTERD_MGMT_V3_MAXVALUE] = {
+    [GLUSTERD_MGMT_V3_NULL] = {"NULL", glusterd_mgmt_v3_null, NULL,
+                               GLUSTERD_MGMT_V3_NULL, DRC_NA, 0},
+    [GLUSTERD_MGMT_V3_LOCK] = {"MGMT_V3_LOCK", glusterd_handle_mgmt_v3_lock,
+                               NULL, GLUSTERD_MGMT_V3_LOCK, DRC_NA, 0},
+    [GLUSTERD_MGMT_V3_PRE_VALIDATE] = {"PRE_VAL", glusterd_handle_pre_validate,
+                                       NULL, GLUSTERD_MGMT_V3_PRE_VALIDATE,
+                                       DRC_NA, 0},
+    [GLUSTERD_MGMT_V3_BRICK_OP] = {"BRCK_OP", glusterd_handle_brick_op, NULL,
+                                   GLUSTERD_MGMT_V3_BRICK_OP, DRC_NA, 0},
+    [GLUSTERD_MGMT_V3_COMMIT] = {"COMMIT", glusterd_handle_commit, NULL,
+                                 GLUSTERD_MGMT_V3_COMMIT, DRC_NA, 0},
+    [GLUSTERD_MGMT_V3_POST_COMMIT] = {"POST_COMMIT",
+                                      glusterd_handle_post_commit, NULL,
+                                      GLUSTERD_MGMT_V3_POST_COMMIT, DRC_NA, 0},
+    [GLUSTERD_MGMT_V3_POST_VALIDATE] = {"POST_VAL",
+                                        glusterd_handle_post_validate, NULL,
+                                        GLUSTERD_MGMT_V3_POST_VALIDATE, DRC_NA,
+                                        0},
+    [GLUSTERD_MGMT_V3_UNLOCK] = {"MGMT_V3_UNLOCK",
+                                 glusterd_handle_mgmt_v3_unlock, NULL,
+                                 GLUSTERD_MGMT_V3_UNLOCK, DRC_NA, 0},
+};
+
+struct rpcsvc_program gd_svc_mgmt_v3_prog = {
+    .progname = "GlusterD svc mgmt v3",
+    .prognum = GD_MGMT_PROGRAM,
+    .progver = GD_MGMT_V3_VERSION,
+    .numactors = GLUSTERD_MGMT_V3_MAXVALUE,
+    .actors = gd_svc_mgmt_v3_actors,
+    .synctask = _gf_true,
+};
diff --git a/xlators/mgmt/glusterd/src/glusterd-mgmt.c b/xlators/mgmt/glusterd/src/glusterd-mgmt.c
new file mode 100644
index 00000000000..bca7221062b
--- /dev/null
+++ b/xlators/mgmt/glusterd/src/glusterd-mgmt.c
@@ -0,0 +1,3114 @@
+/*
+   Copyright (c) 2013-2014 Red Hat, Inc. <http://www.redhat.com>
+   This file is part of GlusterFS.
+
+   This file is licensed to you under your choice of the GNU Lesser
+   General Public License, version 3 or any later version (LGPLv3 or
+   later), or the GNU General Public License, version 2 (GPLv2), in all
+   cases as published by the Free Software Foundation.
+*/
+/* rpc related syncops */
+#include "rpc-clnt.h"
+#include "protocol-common.h"
+#include "xdr-generic.h"
+#include "glusterd1-xdr.h"
+#include "glusterd-syncop.h"
+
+#include "glusterd.h"
+#include "glusterd-utils.h"
+#include "glusterd-locks.h"
+#include "glusterd-mgmt.h"
+#include "glusterd-op-sm.h"
+#include "glusterd-server-quorum.h"
+#include "glusterd-volgen.h"
+#include "glusterd-store.h"
+#include "glusterd-snapshot-utils.h"
+#include "glusterd-messages.h"
+#include "glusterd-errno.h"
+#include "glusterd-hooks.h"
+
+extern struct rpc_clnt_program gd_mgmt_v3_prog;
+
+/*
+ * Record a failed mgmt v3 phase reply from one peer in @args.
+ *
+ * Does nothing when @op_ret is 0. Otherwise:
+ *   - stores @op_ret / @op_errno in @args;
+ *   - builds "<phase> failed on <peer>. <detail>", where <peer> is the
+ *     hostname found for @peerid or, if no peerinfo is found, the textual
+ *     form of @uuid, and <detail> is @op_errstr when it is non-empty or a
+ *     generic "check the log" hint otherwise;
+ *   - logs that message and appends it (newline separated) to any message
+ *     already held in @args->errstr, replacing @args->errstr with a freshly
+ *     allocated string.
+ *
+ * @op_code selects the phase wording (GLUSTERD_MGMT_V3_*); unknown codes are
+ * reported as "Unknown error!".
+ */
+void
+gd_mgmt_v3_collate_errors(struct syncargs *args, int op_ret, int op_errno,
+                          char *op_errstr, int op_code, uuid_t peerid,
+                          u_char *uuid)
+{
+    char *peer_str = NULL;
+    /* Used only if the peer name could not be duplicated (allocation
+     * failure), so that a NULL pointer is never handed to "%s". */
+    const char *peer_name = "<unknown peer>";
+    const char *phase = NULL;
+    const char *detail = NULL;
+    char err_str[PATH_MAX] = "Please check log file for details.";
+    char op_err[PATH_MAX] = "";
+    xlator_t *this = NULL;
+    glusterd_peerinfo_t *peerinfo = NULL;
+    int32_t len = 0;
+
+    this = THIS;
+    GF_ASSERT(this);
+    GF_ASSERT(args);
+    GF_ASSERT(uuid);
+
+    /* Successful replies leave @args untouched. */
+    if (!op_ret)
+        return;
+
+    args->op_ret = op_ret;
+    args->op_errno = op_errno;
+
+    /* peerinfo is only dereferenced inside the RCU read section; the name
+     * is copied out so it can be used after the unlock. */
+    RCU_READ_LOCK;
+    peerinfo = glusterd_peerinfo_find(peerid, NULL);
+    if (peerinfo)
+        peer_str = gf_strdup(peerinfo->hostname);
+    else
+        peer_str = gf_strdup(uuid_utoa(uuid));
+    RCU_READ_UNLOCK;
+
+    if (peer_str)
+        peer_name = peer_str;
+
+    /* err_str still holds the generic hint at this point; it is only
+     * overwritten after op_err has been formatted below. */
+    detail = (op_errstr && strcmp(op_errstr, "")) ? op_errstr : err_str;
+
+    switch (op_code) {
+        case GLUSTERD_MGMT_V3_LOCK:
+            phase = "Locking failed";
+            break;
+        case GLUSTERD_MGMT_V3_PRE_VALIDATE:
+            phase = "Pre Validation failed";
+            break;
+        case GLUSTERD_MGMT_V3_BRICK_OP:
+            phase = "Brick ops failed";
+            break;
+        case GLUSTERD_MGMT_V3_COMMIT:
+            phase = "Commit failed";
+            break;
+        case GLUSTERD_MGMT_V3_POST_COMMIT:
+            phase = "Post commit failed";
+            break;
+        case GLUSTERD_MGMT_V3_POST_VALIDATE:
+            phase = "Post Validation failed";
+            break;
+        case GLUSTERD_MGMT_V3_UNLOCK:
+            phase = "Unlocking failed";
+            break;
+        default:
+            phase = "Unknown error!";
+            break;
+    }
+
+    snprintf(op_err, sizeof(op_err), "%s on %s. %s", phase, peer_name,
+             detail);
+
+    if (args->errstr) {
+        /* Append this peer's failure to the ones collected so far. */
+        len = snprintf(err_str, sizeof(err_str), "%s\n%s", args->errstr,
+                       op_err);
+        if (len < 0) {
+            strcpy(err_str, "<error>");
+        }
+        GF_FREE(args->errstr);
+        args->errstr = NULL;
+    } else {
+        snprintf(err_str, sizeof(err_str), "%s", op_err);
+    }
+
+    gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_MGMTV3_OP_FAIL, "%s", op_err);
+    args->errstr = gf_strdup(err_str);
+
+    GF_FREE(peer_str);
+
+    return;
+}
+
+/*
+ * Run the pre-validation step of operation @op on the local node.
+ *
+ * Dispatches to the per-operation stage/prevalidate routine, passing @dict,
+ * @op_errstr and (where the routine takes them) @rsp_dict and @op_errno.
+ * Operations without a pre-validation routine, including
+ * GD_OP_MAX_OPVERSION, succeed trivially.
+ *
+ * Returns 0 on success, the routine's non-zero result on failure (a warning
+ * is logged), or -1 when @op_errno is NULL.
+ */
+int32_t
+gd_mgmt_v3_pre_validate_fn(glusterd_op_t op, dict_t *dict, char **op_errstr,
+                           dict_t *rsp_dict, uint32_t *op_errno)
+{
+    int32_t ret = -1;
+    xlator_t *this = NULL;
+    /* Message logged if the routine selected below fails. */
+    const char *fail_msg = NULL;
+
+    this = THIS;
+    GF_ASSERT(this);
+    GF_ASSERT(dict);
+    GF_ASSERT(op_errstr);
+    GF_ASSERT(rsp_dict);
+    GF_VALIDATE_OR_GOTO(this->name, op_errno, out);
+
+    switch (op) {
+        case GD_OP_SNAP:
+            ret = glusterd_snapshot_prevalidate(dict, op_errstr, rsp_dict,
+                                                op_errno);
+            fail_msg = "Snapshot Prevalidate Failed";
+            break;
+
+        case GD_OP_REPLACE_BRICK:
+            ret = glusterd_op_stage_replace_brick(dict, op_errstr, rsp_dict);
+            fail_msg = "Replace-brick prevalidation failed.";
+            break;
+
+        case GD_OP_ADD_BRICK:
+            ret = glusterd_op_stage_add_brick(dict, op_errstr, rsp_dict);
+            fail_msg = "ADD-brick prevalidation failed.";
+            break;
+
+        case GD_OP_START_VOLUME:
+            ret = glusterd_op_stage_start_volume(dict, op_errstr, rsp_dict);
+            fail_msg = "Volume start prevalidation failed.";
+            break;
+
+        case GD_OP_STOP_VOLUME:
+            ret = glusterd_op_stage_stop_volume(dict, op_errstr);
+            fail_msg = "Volume stop prevalidation failed.";
+            break;
+
+        case GD_OP_REMOVE_BRICK:
+            ret = glusterd_op_stage_remove_brick(dict, op_errstr);
+            fail_msg = "Remove brick prevalidation failed.";
+            break;
+
+        case GD_OP_RESET_BRICK:
+            ret = glusterd_reset_brick_prevalidate(dict, op_errstr, rsp_dict);
+            fail_msg = "Reset brick prevalidation failed.";
+            break;
+
+        case GD_OP_PROFILE_VOLUME:
+            ret = glusterd_op_stage_stats_volume(dict, op_errstr);
+            fail_msg = "prevalidation failed for profile operation.";
+            break;
+
+        case GD_OP_REBALANCE:
+        case GD_OP_DEFRAG_BRICK_VOLUME:
+            ret = glusterd_mgmt_v3_op_stage_rebalance(dict, op_errstr);
+            fail_msg = "Rebalance Prevalidate Failed";
+            break;
+
+        case GD_OP_MAX_OPVERSION:
+        default:
+            /* Nothing to validate for these operations. */
+            ret = 0;
+            break;
+    }
+
+    /* Single log site so every operation reports the failure with the same
+     * level and message id. */
+    if (ret)
+        gf_msg(this->name, GF_LOG_WARNING, 0, GD_MSG_PRE_VALIDATION_FAIL,
+               "%s", fail_msg);
+
+out:
+    gf_msg_debug(this->name, 0, "OP = %d. Returning %d", op, ret);
+    return ret;
+}
+
+/*
+ * Run the brick-op step of operation @op on the local node.
+ *
+ * Snapshot operations go through glusterd_snapshot_brickop(); profile,
+ * rebalance and defrag-brick operations go through gd_brick_op_phase().
+ * Every other operation has no brick-op step and succeeds immediately.
+ *
+ * Returns 0 on success or the non-zero result of the failing routine.
+ */
+int32_t
+gd_mgmt_v3_brick_op_fn(glusterd_op_t op, dict_t *dict, char **op_errstr,
+                       dict_t *rsp_dict)
+{
+    xlator_t *this = THIS;
+    int32_t ret = -1;
+
+    GF_ASSERT(this);
+    GF_ASSERT(dict);
+    GF_ASSERT(op_errstr);
+    GF_ASSERT(rsp_dict);
+
+    if (op == GD_OP_SNAP) {
+        ret = glusterd_snapshot_brickop(dict, op_errstr, rsp_dict);
+        if (ret)
+            gf_msg(this->name, GF_LOG_WARNING, 0, GD_MSG_BRICK_OP_FAIL,
+                   "snapshot brickop failed");
+    } else if (op == GD_OP_PROFILE_VOLUME || op == GD_OP_REBALANCE ||
+               op == GD_OP_DEFRAG_BRICK_VOLUME) {
+        ret = gd_brick_op_phase(op, rsp_dict, dict, op_errstr);
+        if (ret)
+            gf_log(this->name, GF_LOG_WARNING, "%s brickop failed",
+                   gd_op_list[op]);
+    } else {
+        /* No brick-op work for this operation. */
+        ret = 0;
+    }
+
+    gf_msg_trace(this->name, 0, "OP = %d. Returning %d", op, ret);
+    return ret;
+}
+
+/*
+ * Run the commit step of operation @op on the local node.
+ *
+ * The pre-commit hook is invoked first, then the per-operation commit
+ * routine. Operations without a commit routine succeed trivially.
+ *
+ * Returns 0 on success, the commit routine's non-zero result on failure
+ * (logged), or -1 when @op_errno is NULL.
+ */
+int32_t
+gd_mgmt_v3_commit_fn(glusterd_op_t op, dict_t *dict, char **op_errstr,
+                     uint32_t *op_errno, dict_t *rsp_dict)
+{
+    xlator_t *this = NULL;
+    int32_t ret = -1;
+    /* Level and text used when the selected commit routine fails; only the
+     * snapshot commit is reported as a warning. */
+    int fail_level = GF_LOG_ERROR;
+    const char *fail_msg = NULL;
+
+    this = THIS;
+    GF_ASSERT(this);
+    GF_ASSERT(dict);
+    GF_ASSERT(op_errstr);
+    GF_VALIDATE_OR_GOTO(this->name, op_errno, out);
+    GF_ASSERT(rsp_dict);
+
+    glusterd_op_commit_hook(op, dict, GD_COMMIT_HOOK_PRE);
+
+    switch (op) {
+        case GD_OP_SNAP:
+            ret = glusterd_snapshot(dict, op_errstr, op_errno, rsp_dict);
+            fail_level = GF_LOG_WARNING;
+            fail_msg = "Snapshot Commit Failed";
+            break;
+
+        case GD_OP_REPLACE_BRICK:
+            ret = glusterd_op_replace_brick(dict, rsp_dict);
+            fail_msg = "Replace-brick commit failed.";
+            break;
+
+        case GD_OP_ADD_BRICK:
+            ret = glusterd_op_add_brick(dict, op_errstr);
+            fail_msg = "Add-brick commit failed.";
+            break;
+
+        case GD_OP_START_VOLUME:
+            ret = glusterd_op_start_volume(dict, op_errstr);
+            fail_msg = "Volume start commit failed.";
+            break;
+
+        case GD_OP_STOP_VOLUME:
+            ret = glusterd_op_stop_volume(dict);
+            fail_msg = "Volume stop commit failed.";
+            break;
+
+        case GD_OP_REMOVE_BRICK:
+            ret = glusterd_op_remove_brick(dict, op_errstr);
+            fail_msg = "Remove-brick commit failed.";
+            break;
+
+        case GD_OP_RESET_BRICK:
+            ret = glusterd_op_reset_brick(dict, rsp_dict);
+            fail_msg = "Reset-brick commit failed.";
+            break;
+
+        case GD_OP_MAX_OPVERSION:
+            ret = glusterd_op_get_max_opversion(op_errstr, rsp_dict);
+            fail_msg = "Commit failed.";
+            break;
+
+        case GD_OP_PROFILE_VOLUME:
+            ret = glusterd_op_stats_volume(dict, op_errstr, rsp_dict);
+            fail_msg = "commit failed for volume profile operation.";
+            break;
+
+        case GD_OP_REBALANCE:
+        case GD_OP_DEFRAG_BRICK_VOLUME:
+            ret = glusterd_mgmt_v3_op_rebalance(dict, op_errstr, rsp_dict);
+            fail_msg = "Rebalance Commit Failed";
+            break;
+
+        default:
+            /* Nothing to commit for this operation. */
+            ret = 0;
+            break;
+    }
+
+    if (ret)
+        gf_msg(this->name, fail_level, 0, GD_MSG_COMMIT_OP_FAIL, "%s",
+               fail_msg);
+
+out:
+    gf_msg_debug(this->name, 0, "OP = %d. Returning %d", op, ret);
+    return ret;
+}
+
+/*
+ * Run the post-commit step of operation @op on the local node.
+ *
+ * Only add-brick and replace-brick have post-commit work; all other
+ * operations succeed immediately.
+ *
+ * Returns 0 on success, the post-commit routine's non-zero result on
+ * failure (logged as an error), or -1 when @op_errno is NULL.
+ */
+int32_t
+gd_mgmt_v3_post_commit_fn(glusterd_op_t op, dict_t *dict, char **op_errstr,
+                          uint32_t *op_errno, dict_t *rsp_dict)
+{
+    xlator_t *this = THIS;
+    int32_t ret = -1;
+
+    GF_ASSERT(this);
+    GF_ASSERT(dict);
+    GF_ASSERT(op_errstr);
+    GF_VALIDATE_OR_GOTO(this->name, op_errno, out);
+    GF_ASSERT(rsp_dict);
+
+    if (op == GD_OP_ADD_BRICK) {
+        ret = glusterd_post_commit_add_brick(dict, op_errstr);
+        if (ret)
+            gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_POST_COMMIT_OP_FAIL,
+                   "Add-brick post commit failed.");
+    } else if (op == GD_OP_REPLACE_BRICK) {
+        ret = glusterd_post_commit_replace_brick(dict, op_errstr);
+        if (ret)
+            gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_POST_COMMIT_OP_FAIL,
+                   "Replace-brick post commit failed.");
+    } else {
+        /* No post-commit work for this operation. */
+        ret = 0;
+    }
+
+out:
+    gf_msg_debug(this->name, 0, "OP = %d. Returning %d", op, ret);
+    return ret;
+}
+
+/*
+ * Run the post-validation step of operation @op on the local node.
+ *
+ * @op_ret is the outcome of the preceding phases: the post-commit hook is
+ * run only when it is 0, and it is forwarded to the snapshot
+ * post-validation routine.
+ *
+ * Per operation:
+ *   - GD_OP_SNAP: glusterd_snapshot_postvalidate().
+ *   - GD_OP_ADD_BRICK / GD_OP_START_VOLUME / GD_OP_STOP_VOLUME: the volume
+ *     named by "volname" in @dict must exist; for add-brick the volfiles
+ *     are additionally regenerated and the volinfo is stored.
+ *   - anything else: no work.
+ *
+ * Returns 0 on success or the non-zero result of the failing step.
+ */
+int32_t
+gd_mgmt_v3_post_validate_fn(glusterd_op_t op, int32_t op_ret, dict_t *dict,
+                            char **op_errstr, dict_t *rsp_dict)
+{
+    int32_t ret = -1;
+    xlator_t *this = NULL;
+    char *volname = NULL;
+    glusterd_volinfo_t *volinfo = NULL;
+
+    this = THIS;
+    GF_ASSERT(this);
+    GF_ASSERT(dict);
+    GF_ASSERT(op_errstr);
+    GF_ASSERT(rsp_dict);
+
+    if (op_ret == 0)
+        glusterd_op_commit_hook(op, dict, GD_COMMIT_HOOK_POST);
+
+    switch (op) {
+        case GD_OP_SNAP: {
+            ret = glusterd_snapshot_postvalidate(dict, op_ret, op_errstr,
+                                                 rsp_dict);
+            if (ret) {
+                gf_msg(this->name, GF_LOG_WARNING, 0,
+                       GD_MSG_POST_VALIDATION_FAIL,
+                       "postvalidate operation failed");
+                goto out;
+            }
+            break;
+        }
+        case GD_OP_ADD_BRICK:
+        case GD_OP_START_VOLUME:
+        case GD_OP_STOP_VOLUME: {
+            /* All three operations require the target volume to exist. */
+            ret = dict_get_strn(dict, "volname", SLEN("volname"), &volname);
+            if (ret) {
+                gf_msg("glusterd", GF_LOG_ERROR, 0, GD_MSG_DICT_GET_FAILED,
+                       "Unable to get volume name");
+                goto out;
+            }
+
+            ret = glusterd_volinfo_find(volname, &volinfo);
+            if (ret) {
+                gf_msg("glusterd", GF_LOG_ERROR, EINVAL, GD_MSG_VOL_NOT_FOUND,
+                       "Unable to find volume %s", volname);
+                goto out;
+            }
+
+            /* Only add-brick has further work after the lookup. */
+            if (op != GD_OP_ADD_BRICK)
+                break;
+
+            ret = glusterd_create_volfiles_and_notify_services(volinfo);
+            if (ret)
+                goto out;
+            ret = glusterd_store_volinfo(volinfo,
+                                         GLUSTERD_VOLINFO_VER_AC_INCREMENT);
+            if (ret)
+                goto out;
+            break;
+        }
+
+        default:
+            break;
+    }
+
+    ret = 0;
+
+out:
+    gf_msg_trace(this->name, 0, "OP = %d. Returning %d", op, ret);
+    return ret;
+}
+
+/*
+ * Reply-callback body for a GLUSTERD_MGMT_V3_LOCK request sent to a peer.
+ *
+ * Takes the syncargs (frame->local) and the heap-allocated peer uuid
+ * (frame->cookie) out of @myframe, decodes the reply in @iov, reports the
+ * outcome through gd_mgmt_v3_collate_errors() and finally wakes the
+ * synctask barrier. @count is not used.
+ *
+ * Always returns 0; failures are conveyed through the syncargs.
+ */
+int32_t
+gd_mgmt_v3_lock_cbk_fn(struct rpc_req *req, struct iovec *iov, int count,
+                       void *myframe)
+{
+    int32_t ret = -1;
+    struct syncargs *args = NULL;
+    gd1_mgmt_v3_lock_rsp rsp = {
+        {0},
+    };
+    call_frame_t *frame = NULL;
+    /* Both default to failure so that any early exit below is collated as
+     * an error. */
+    int32_t op_ret = -1;
+    int32_t op_errno = -1;
+    xlator_t *this = NULL;
+    uuid_t *peerid = NULL;
+
+    this = THIS;
+    GF_ASSERT(this);
+    GF_ASSERT(req);
+    GF_ASSERT(myframe);
+
+    /* Even though the lock command has failed, while collating the errors
+       (gd_mgmt_v3_collate_errors), args->op_ret and args->op_errno will be
+       used. @args is obtained from frame->local. So before checking the
+       status of the request and going out if its a failure, args should be
+       set to frame->local. Otherwise, while collating args will be NULL.
+       This applies to other phases such as prevalidate, brickop, commit and
+       postvalidate also.
+    */
+    frame = myframe;
+    args = frame->local;
+    peerid = frame->cookie;
+    frame->local = NULL;
+    frame->cookie = NULL;
+
+    /* RPC-level failure: there is no reply to decode. */
+    if (-1 == req->rpc_status) {
+        op_errno = ENOTCONN;
+        goto out;
+    }
+
+    GF_VALIDATE_OR_GOTO_WITH_ERROR(this->name, iov, out, op_errno, EINVAL);
+
+    ret = xdr_to_generic(*iov, &rsp, (xdrproc_t)xdr_gd1_mgmt_v3_lock_rsp);
+    if (ret < 0)
+        goto out;
+
+    gf_uuid_copy(args->uuid, rsp.uuid);
+
+    op_ret = rsp.op_ret;
+    op_errno = rsp.op_errno;
+
+out:
+    /* rsp.uuid is still all zeroes here if the reply was never decoded. */
+    gd_mgmt_v3_collate_errors(args, op_ret, op_errno, NULL,
+                              GLUSTERD_MGMT_V3_LOCK, *peerid, rsp.uuid);
+    GF_FREE(peerid);
+
+    /* Released with plain free(), not GF_FREE. */
+    if (rsp.dict.dict_val)
+        free(rsp.dict.dict_val);
+    /* req->rpc_status set to -1 means, STACK_DESTROY will be called from
+     * the caller function.
+     */
+    if (req->rpc_status != -1)
+        STACK_DESTROY(frame->root);
+    synctask_barrier_wake(args);
+    return 0;
+}
+
+/*
+ * RPC reply callback registered for mgmt v3 lock requests: hands the reply
+ * to gd_mgmt_v3_lock_cbk_fn() through glusterd_big_locked_cbk() and returns
+ * that call's result.
+ */
+int32_t
+gd_mgmt_v3_lock_cbk(struct rpc_req *req, struct iovec *iov, int count,
+                    void *myframe)
+{
+    int32_t cbk_ret = 0;
+
+    cbk_ret = glusterd_big_locked_cbk(req, iov, count, myframe,
+                                      gd_mgmt_v3_lock_cbk_fn);
+    return cbk_ret;
+}
+
+/*
+ * Submit a GLUSTERD_MGMT_V3_LOCK request for operation @op to @peerinfo.
+ *
+ * @op_ctx is serialized into the request and @my_uuid is copied in as the
+ * request uuid. A copy of the peer's uuid is allocated and handed to
+ * gd_syncop_submit_request() together with @args; gd_mgmt_v3_lock_cbk is
+ * registered as the reply callback. @recv_uuid is not used here.
+ *
+ * Returns the result of gd_syncop_submit_request(), or the non-zero result
+ * of the serialization / uuid-copy step that failed before it.
+ */
+int
+gd_mgmt_v3_lock(glusterd_op_t op, dict_t *op_ctx, glusterd_peerinfo_t *peerinfo,
+                struct syncargs *args, uuid_t my_uuid, uuid_t recv_uuid)
+{
+    gd1_mgmt_v3_lock_req req = {
+        {0},
+    };
+    int32_t ret = -1;
+    xlator_t *this = NULL;
+    uuid_t *peerid = NULL;
+
+    this = THIS;
+    GF_ASSERT(this);
+    GF_ASSERT(op_ctx);
+    GF_ASSERT(peerinfo);
+    GF_ASSERT(args);
+
+    ret = dict_allocate_and_serialize(op_ctx, &req.dict.dict_val,
+                                      &req.dict.dict_len);
+    if (ret) {
+        gf_smsg(this->name, GF_LOG_ERROR, errno,
+                GD_MSG_DICT_ALLOC_AND_SERL_LENGTH_GET_FAIL, NULL);
+        goto out;
+    }
+
+    gf_uuid_copy(req.uuid, my_uuid);
+    req.op = op;
+
+    /* The callback (gd_mgmt_v3_lock_cbk_fn) frees this copy. */
+    GD_ALLOC_COPY_UUID(peerid, peerinfo->uuid, ret);
+    if (ret) {
+        gf_smsg(this->name, GF_LOG_ERROR, errno,
+                GD_MSG_ALLOC_AND_COPY_UUID_FAIL, NULL);
+        goto out;
+    }
+
+    ret = gd_syncop_submit_request(peerinfo->rpc, &req, args, peerid,
+                                   &gd_mgmt_v3_prog, GLUSTERD_MGMT_V3_LOCK,
+                                   gd_mgmt_v3_lock_cbk,
+                                   (xdrproc_t)xdr_gd1_mgmt_v3_lock_req);
+out:
+    /* The serialized dict buffer is released on every path. */
+    GF_FREE(req.dict.dict_val);
+    gf_msg_trace(this->name, 0, "Returning %d", ret);
+    return ret;
+}
+
+/*
+ * Acquire the mgmt v3 locks for operation @op, first on the local node and
+ * then on every eligible peer.
+ *
+ * @dict            operation dictionary; an optional "timeout" entry
+ *                  raises conf->mgmt_v3_lock_timeout to timeout + 120.
+ * @op_errstr       out: on failure, replaced by a generic "Another
+ *                  transaction is in progress" message (or NULL if that
+ *                  allocation fails).
+ * @op_errno        out: set from the collected peer replies.
+ * @is_acquired     out: set to _gf_true once the local locks are held.
+ * @txn_generation  peers whose generation is newer than this are skipped.
+ *
+ * Peers that are not connected are skipped, as are peers that are not in
+ * the befriended state unless @op is GD_OP_SYNC_VOLUME.
+ *
+ * Returns 0 on success and -1 on any failure.
+ */
+int
+glusterd_mgmt_v3_initiate_lockdown(glusterd_op_t op, dict_t *dict,
+                                   char **op_errstr, uint32_t *op_errno,
+                                   gf_boolean_t *is_acquired,
+                                   uint32_t txn_generation)
+{
+    glusterd_peerinfo_t *peerinfo = NULL;
+    int32_t ret = -1;
+    int32_t peer_cnt = 0;
+    struct syncargs args = {0};
+    uuid_t peer_uuid = {0};
+    xlator_t *this = NULL;
+    glusterd_conf_t *conf = NULL;
+    uint32_t timeout = 0;
+    /* Copy of the collated peer errors allocated by this function. Tracked
+     * separately so it can be released before *op_errstr is overwritten on
+     * the failure path, without touching a string the caller passed in. */
+    char *peer_errstr = NULL;
+
+    this = THIS;
+    GF_ASSERT(this);
+    conf = this->private;
+    GF_ASSERT(conf);
+
+    GF_ASSERT(dict);
+    GF_ASSERT(op_errstr);
+    GF_ASSERT(is_acquired);
+
+    /* Cli will add timeout key to dict if the default timeout is
+     * other than 2 minutes. Here we use this value to check whether
+     * mgmt_v3_lock_timeout should be set to default value or we
+     * need to change the value according to timeout value
+     * i.e, timeout + 120 seconds. */
+    ret = dict_get_uint32(dict, "timeout", &timeout);
+    if (!ret)
+        conf->mgmt_v3_lock_timeout = timeout + 120;
+
+    /* Trying to acquire multiple mgmt_v3 locks on local node */
+    ret = glusterd_multiple_mgmt_v3_lock(dict, MY_UUID, op_errno);
+    if (ret) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_MGMTV3_LOCK_GET_FAIL,
+               "Failed to acquire mgmt_v3 locks on localhost");
+        goto out;
+    }
+
+    *is_acquired = _gf_true;
+
+    /* Sending mgmt_v3 lock req to other nodes in the cluster */
+    gd_syncargs_init(&args, NULL);
+    ret = synctask_barrier_init((&args));
+    if (ret)
+        goto out;
+
+    peer_cnt = 0;
+
+    RCU_READ_LOCK;
+    cds_list_for_each_entry_rcu(peerinfo, &conf->peers, uuid_list)
+    {
+        /* Only send requests to peers who were available before the
+         * transaction started
+         */
+        if (peerinfo->generation > txn_generation)
+            continue;
+
+        if (!peerinfo->connected)
+            continue;
+        if (op != GD_OP_SYNC_VOLUME &&
+            peerinfo->state.state != GD_FRIEND_STATE_BEFRIENDED)
+            continue;
+
+        gd_mgmt_v3_lock(op, dict, peerinfo, &args, MY_UUID, peer_uuid);
+        peer_cnt++;
+    }
+    RCU_READ_UNLOCK;
+
+    if (0 == peer_cnt) {
+        ret = 0;
+        goto out;
+    }
+
+    gd_synctask_barrier_wait((&args), peer_cnt);
+
+    if (args.errstr) {
+        peer_errstr = gf_strdup(args.errstr);
+        *op_errstr = peer_errstr;
+    }
+
+    ret = args.op_ret;
+    *op_errno = args.op_errno;
+
+    gf_msg_debug(this->name, 0,
+                 "Sent lock op req for %s "
+                 "to %d peers. Returning %d",
+                 gd_op_list[op], peer_cnt, ret);
+out:
+    if (ret) {
+        /* The detailed error is only logged; the caller always gets the
+         * generic message below. */
+        if (*op_errstr)
+            gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_MGMTV3_LOCK_GET_FAIL,
+                   "%s", *op_errstr);
+
+        /* Release our own copy before gf_asprintf() overwrites the
+         * pointer; previously it was leaked. No-op when NULL. */
+        GF_FREE(peer_errstr);
+        peer_errstr = NULL;
+
+        ret = gf_asprintf(op_errstr,
+                          "Another transaction is in progress. "
+                          "Please try again after some time.");
+
+        if (ret == -1)
+            *op_errstr = NULL;
+
+        ret = -1;
+    }
+
+    return ret;
+}
+
+/*
+ * Merge one pre-validation response dictionary @rsp into the aggregate
+ * dictionary @aggr, using the aggregation routine that belongs to @op.
+ *
+ * Stop-volume, remove-brick, profile, defrag-brick, rebalance and
+ * max-opversion have nothing to aggregate. Any other unlisted operation is
+ * rejected.
+ *
+ * Returns 0 on success, the aggregation routine's non-zero result on
+ * failure, or -1 for an unsupported @op.
+ */
+int
+glusterd_pre_validate_aggr_rsp_dict(glusterd_op_t op, dict_t *aggr, dict_t *rsp)
+{
+    int32_t ret = 0;
+    xlator_t *this = NULL;
+
+    this = THIS;
+    GF_ASSERT(this);
+    GF_ASSERT(aggr);
+    GF_ASSERT(rsp);
+
+    switch (op) {
+        case GD_OP_SNAP:
+            ret = glusterd_snap_pre_validate_use_rsp_dict(aggr, rsp);
+            if (ret) {
+                gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_PRE_VALIDATION_FAIL,
+                       "Failed to aggregate prevalidate "
+                       "response dictionaries.");
+            }
+            break;
+
+        /* Replace-brick and reset-brick share the same aggregation. */
+        case GD_OP_REPLACE_BRICK:
+        case GD_OP_RESET_BRICK:
+            ret = glusterd_rb_use_rsp_dict(aggr, rsp);
+            if (ret) {
+                gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_PRE_VALIDATION_FAIL,
+                       "Failed to aggregate prevalidate "
+                       "response dictionaries.");
+            }
+            break;
+
+        case GD_OP_START_VOLUME:
+        case GD_OP_ADD_BRICK:
+            ret = glusterd_aggr_brick_mount_dirs(aggr, rsp);
+            if (ret) {
+                gf_msg(this->name, GF_LOG_ERROR, 0,
+                       GD_MSG_BRICK_MOUNDIRS_AGGR_FAIL,
+                       "Failed to "
+                       "aggregate brick mount dirs");
+            }
+            break;
+
+        /* Supported operations with nothing to aggregate. */
+        case GD_OP_STOP_VOLUME:
+        case GD_OP_REMOVE_BRICK:
+        case GD_OP_PROFILE_VOLUME:
+        case GD_OP_DEFRAG_BRICK_VOLUME:
+        case GD_OP_REBALANCE:
+        case GD_OP_MAX_OPVERSION:
+            break;
+
+        default:
+            ret = -1;
+            gf_msg(this->name, GF_LOG_ERROR, EINVAL, GD_MSG_INVALID_ENTRY,
+                   "Invalid op (%s)", gd_op_list[op]);
+            break;
+    }
+
+    return ret;
+}
+
+/*
+ * Reply-callback body for a GLUSTERD_MGMT_V3_PRE_VALIDATE request sent to a
+ * peer.
+ *
+ * Takes the syncargs (frame->local) and the heap-allocated peer uuid
+ * (frame->cookie) out of @myframe, decodes the reply in @iov, unserializes
+ * the response dictionary if one was sent, aggregates it into args->dict
+ * under args->lock_dict, reports the outcome through
+ * gd_mgmt_v3_collate_errors() and wakes the synctask barrier. @count is not
+ * used.
+ *
+ * Always returns 0; failures are conveyed through the syncargs.
+ */
+int32_t
+gd_mgmt_v3_pre_validate_cbk_fn(struct rpc_req *req, struct iovec *iov,
+                               int count, void *myframe)
+{
+    int32_t ret = -1;
+    struct syncargs *args = NULL;
+    gd1_mgmt_v3_pre_val_rsp rsp = {
+        {0},
+    };
+    call_frame_t *frame = NULL;
+    /* Both default to failure so that any early exit below is collated as
+     * an error. */
+    int32_t op_ret = -1;
+    int32_t op_errno = -1;
+    dict_t *rsp_dict = NULL;
+    xlator_t *this = NULL;
+    uuid_t *peerid = NULL;
+
+    this = THIS;
+    GF_ASSERT(this);
+    GF_ASSERT(req);
+    GF_ASSERT(myframe);
+
+    /* args must be fetched before any early exit: it is needed at out:. */
+    frame = myframe;
+    args = frame->local;
+    peerid = frame->cookie;
+    frame->local = NULL;
+    frame->cookie = NULL;
+
+    /* RPC-level failure: there is no reply to decode. */
+    if (-1 == req->rpc_status) {
+        op_errno = ENOTCONN;
+        goto out;
+    }
+
+    GF_VALIDATE_OR_GOTO_WITH_ERROR(this->name, iov, out, op_errno, EINVAL);
+
+    ret = xdr_to_generic(*iov, &rsp, (xdrproc_t)xdr_gd1_mgmt_v3_pre_val_rsp);
+    if (ret < 0)
+        goto out;
+
+    if (rsp.dict.dict_len) {
+        /* Unserialize the dictionary */
+        /* NOTE(review): the dict_new() result is not checked here;
+         * presumably dict_unserialize() rejects a NULL dict -- confirm. */
+        rsp_dict = dict_new();
+
+        ret = dict_unserialize(rsp.dict.dict_val, rsp.dict.dict_len, &rsp_dict);
+        if (ret < 0) {
+            free(rsp.dict.dict_val);
+            goto out;
+        } else {
+            /* Hand the raw buffer to the dict via extra_stdfree;
+             * presumably it is released when the dict is destroyed --
+             * confirm in dict.c. */
+            rsp_dict->extra_stdfree = rsp.dict.dict_val;
+        }
+    }
+
+    gf_uuid_copy(args->uuid, rsp.uuid);
+    /* args->dict is updated under args->lock_dict. */
+    pthread_mutex_lock(&args->lock_dict);
+    {
+        ret = glusterd_pre_validate_aggr_rsp_dict(rsp.op, args->dict, rsp_dict);
+    }
+    pthread_mutex_unlock(&args->lock_dict);
+
+    if (ret) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_RESP_AGGR_FAIL, "%s",
+               "Failed to aggregate response from "
+               " node/brick");
+        /* An aggregation failure turns an otherwise successful reply into
+         * a failure; a failure reported by the peer takes precedence. */
+        if (!rsp.op_ret)
+            op_ret = ret;
+        else {
+            op_ret = rsp.op_ret;
+            op_errno = rsp.op_errno;
+        }
+    } else {
+        op_ret = rsp.op_ret;
+        op_errno = rsp.op_errno;
+    }
+
+out:
+    if (rsp_dict)
+        dict_unref(rsp_dict);
+
+    gd_mgmt_v3_collate_errors(args, op_ret, op_errno, rsp.op_errstr,
+                              GLUSTERD_MGMT_V3_PRE_VALIDATE, *peerid, rsp.uuid);
+
+    /* Released with plain free(), not GF_FREE. */
+    if (rsp.op_errstr)
+        free(rsp.op_errstr);
+    GF_FREE(peerid);
+    /* req->rpc_status set to -1 means, STACK_DESTROY will be called from
+     * the caller function.
+     */
+    if (req->rpc_status != -1)
+        STACK_DESTROY(frame->root);
+    synctask_barrier_wake(args);
+    return 0;
+}
+
+/*
+ * RPC reply callback registered for mgmt v3 pre-validate requests: hands
+ * the reply to gd_mgmt_v3_pre_validate_cbk_fn() through
+ * glusterd_big_locked_cbk() and returns that call's result.
+ */
+int32_t
+gd_mgmt_v3_pre_validate_cbk(struct rpc_req *req, struct iovec *iov, int count,
+                            void *myframe)
+{
+    int32_t cbk_ret = 0;
+
+    cbk_ret = glusterd_big_locked_cbk(req, iov, count, myframe,
+                                      gd_mgmt_v3_pre_validate_cbk_fn);
+    return cbk_ret;
+}
+
+/*
+ * Submit a GLUSTERD_MGMT_V3_PRE_VALIDATE request for operation @op to
+ * @peerinfo.
+ *
+ * @op_ctx is serialized into the request and @my_uuid is copied in as the
+ * request uuid. A copy of the peer's uuid is allocated and handed to
+ * gd_syncop_submit_request() together with @args;
+ * gd_mgmt_v3_pre_validate_cbk is registered as the reply callback.
+ * @recv_uuid is not used here.
+ *
+ * Returns the result of gd_syncop_submit_request(), or the non-zero result
+ * of the serialization / uuid-copy step that failed before it.
+ */
+int
+gd_mgmt_v3_pre_validate_req(glusterd_op_t op, dict_t *op_ctx,
+                            glusterd_peerinfo_t *peerinfo,
+                            struct syncargs *args, uuid_t my_uuid,
+                            uuid_t recv_uuid)
+{
+    int32_t ret = -1;
+    gd1_mgmt_v3_pre_val_req req = {
+        {0},
+    };
+    xlator_t *this = NULL;
+    uuid_t *peerid = NULL;
+
+    this = THIS;
+    GF_ASSERT(this);
+    GF_ASSERT(op_ctx);
+    GF_ASSERT(peerinfo);
+    GF_ASSERT(args);
+
+    ret = dict_allocate_and_serialize(op_ctx, &req.dict.dict_val,
+                                      &req.dict.dict_len);
+    if (ret) {
+        gf_smsg(this->name, GF_LOG_ERROR, errno,
+                GD_MSG_DICT_ALLOC_AND_SERL_LENGTH_GET_FAIL, NULL);
+        goto out;
+    }
+
+    gf_uuid_copy(req.uuid, my_uuid);
+    req.op = op;
+
+    /* The callback (gd_mgmt_v3_pre_validate_cbk_fn) frees this copy. */
+    GD_ALLOC_COPY_UUID(peerid, peerinfo->uuid, ret);
+    if (ret) {
+        gf_smsg(this->name, GF_LOG_ERROR, errno,
+                GD_MSG_ALLOC_AND_COPY_UUID_FAIL, NULL);
+        goto out;
+    }
+
+    ret = gd_syncop_submit_request(
+        peerinfo->rpc, &req, args, peerid, &gd_mgmt_v3_prog,
+        GLUSTERD_MGMT_V3_PRE_VALIDATE, gd_mgmt_v3_pre_validate_cbk,
+        (xdrproc_t)xdr_gd1_mgmt_v3_pre_val_req);
+out:
+    /* The serialized dict buffer is released on every path. */
+    GF_FREE(req.dict.dict_val);
+    gf_msg_trace(this->name, 0, "Returning %d", ret);
+    return ret;
+}
+
+/*
+ * Run the pre-validation phase of a mgmt_v3 transaction.
+ *
+ * For profile, stop-volume, rebalance and remove-brick the server quorum is
+ * checked first. The operation is then validated on the local node and,
+ * unless op is GD_OP_MAX_OPVERSION, the local response is aggregated into
+ * req_dict. Finally the request is sent to every connected peer whose
+ * generation is not newer than txn_generation (and which is befriended,
+ * unless op is GD_OP_SYNC_VOLUME) and the replies are awaited.
+ *
+ * @op:             operation being validated
+ * @req_dict:       request payload; also the dictionary given to
+ *                  gd_syncargs_init() for the peer replies
+ * @op_errstr:      out: error string on failure (may stay NULL)
+ * @op_errno:       out: set from the collated peer replies
+ * @txn_generation: generation recorded when the transaction started
+ *
+ * Returns 0 on success, non-zero on failure.
+ */
+int
+glusterd_mgmt_v3_pre_validate(glusterd_op_t op, dict_t *req_dict,
+                              char **op_errstr, uint32_t *op_errno,
+                              uint32_t txn_generation)
+{
+    int32_t ret = -1;
+    int32_t peer_cnt = 0;
+    dict_t *rsp_dict = NULL;
+    glusterd_peerinfo_t *peerinfo = NULL;
+    struct syncargs args = {0};
+    uuid_t peer_uuid = {0};
+    xlator_t *this = NULL;
+    glusterd_conf_t *conf = NULL;
+
+    this = THIS;
+    GF_ASSERT(this);
+    conf = this->private;
+    GF_ASSERT(conf);
+
+    GF_ASSERT(req_dict);
+    GF_ASSERT(op_errstr);
+    GF_VALIDATE_OR_GOTO(this->name, op_errno, out);
+
+    rsp_dict = dict_new();
+    if (!rsp_dict) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_DICT_CREATE_FAIL,
+               "Failed to create response dictionary");
+        goto out;
+    }
+
+    if (op == GD_OP_PROFILE_VOLUME || op == GD_OP_STOP_VOLUME ||
+        op == GD_OP_REBALANCE || op == GD_OP_REMOVE_BRICK) {
+        ret = glusterd_validate_quorum(this, op, req_dict, op_errstr);
+        if (ret) {
+            gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_SERVER_QUORUM_NOT_MET,
+                   "Server quorum not met. Rejecting operation.");
+            goto out;
+        }
+    }
+
+    /* Pre Validation on local node */
+    ret = gd_mgmt_v3_pre_validate_fn(op, req_dict, op_errstr, rsp_dict,
+                                     op_errno);
+
+    if (ret) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_PRE_VALIDATION_FAIL,
+               "Pre Validation failed for "
+               "operation %s on local node",
+               gd_op_list[op]);
+
+        /* Supply a generic message only if the local step gave none. */
+        if (*op_errstr == NULL) {
+            ret = gf_asprintf(op_errstr,
+                              "Pre-validation failed "
+                              "on localhost. Please "
+                              "check log file for details");
+            if (ret == -1)
+                *op_errstr = NULL;
+
+            ret = -1;
+        }
+        goto out;
+    }
+
+    if (op != GD_OP_MAX_OPVERSION) {
+        ret = glusterd_pre_validate_aggr_rsp_dict(op, req_dict, rsp_dict);
+        if (ret) {
+            gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_PRE_VALIDATION_FAIL,
+                   "%s",
+                   "Failed to aggregate response from "
+                   " node/brick");
+            goto out;
+        }
+
+        dict_unref(rsp_dict);
+        rsp_dict = NULL;
+    }
+
+    /* Sending Pre Validation req to other nodes in the cluster */
+    gd_syncargs_init(&args, req_dict);
+    ret = synctask_barrier_init((&args));
+    if (ret)
+        goto out;
+
+    peer_cnt = 0;
+
+    RCU_READ_LOCK;
+    cds_list_for_each_entry_rcu(peerinfo, &conf->peers, uuid_list)
+    {
+        /* Only send requests to peers who were available before the
+         * transaction started
+         */
+        if (peerinfo->generation > txn_generation)
+            continue;
+
+        if (!peerinfo->connected)
+            continue;
+        if (op != GD_OP_SYNC_VOLUME &&
+            peerinfo->state.state != GD_FRIEND_STATE_BEFRIENDED)
+            continue;
+
+        /* NOTE(review): the return value is ignored and the peer is counted
+         * regardless; confirm a failed submit still wakes the barrier. */
+        gd_mgmt_v3_pre_validate_req(op, req_dict, peerinfo, &args, MY_UUID,
+                                    peer_uuid);
+        peer_cnt++;
+    }
+    RCU_READ_UNLOCK;
+
+    if (0 == peer_cnt) {
+        ret = 0;
+        goto out;
+    }
+
+    gd_synctask_barrier_wait((&args), peer_cnt);
+
+    if (args.op_ret) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_PRE_VALIDATION_FAIL,
+               "Pre Validation failed on peers");
+
+        if (args.errstr)
+            *op_errstr = gf_strdup(args.errstr);
+    }
+
+    ret = args.op_ret;
+    *op_errno = args.op_errno;
+
+    gf_msg_debug(this->name, 0,
+                 "Sent pre valaidation req for %s "
+                 "to %d peers. Returning %d",
+                 gd_op_list[op], peer_cnt, ret);
+out:
+    /* Release the local response dictionary on every path that still holds
+     * it: the error exits above and the GD_OP_MAX_OPVERSION case, which
+     * skips the unref in the aggregation branch.
+     */
+    if (rsp_dict)
+        dict_unref(rsp_dict);
+    return ret;
+}
+
+/*
+ * Build the request dictionary that is sent to peers for a mgmt_v3 operation.
+ *
+ * A new dictionary is created and, for the operations handled below, filled
+ * with a copy of dict. For volume operations the "volname" key must be
+ * present; unless it is "all" (case-insensitive) the volume id is added to
+ * dict before copying. For GD_OP_REBALANCE the commit hash is set first.
+ * Any other operation yields an empty dictionary.
+ *
+ * @req:       out: the newly built dictionary, set only on success
+ * @op_errstr: out: error string, passed to glusterd_dict_set_volid()
+ * @dict:      operation context to copy from (may be modified)
+ * @op:        operation the payload is built for
+ *
+ * Returns 0 on success, non-zero on failure.
+ */
+int
+glusterd_mgmt_v3_build_payload(dict_t **req, char **op_errstr, dict_t *dict,
+                               glusterd_op_t op)
+{
+    int32_t ret = -1;
+    dict_t *req_dict = NULL;
+    xlator_t *this = NULL;
+    char *volname = NULL;
+
+    this = THIS;
+    GF_ASSERT(this);
+    GF_ASSERT(req);
+    GF_ASSERT(op_errstr);
+    GF_ASSERT(dict);
+
+    req_dict = dict_new();
+    if (!req_dict)
+        goto out;
+
+    switch (op) {
+        case GD_OP_MAX_OPVERSION:
+        case GD_OP_SNAP:
+            dict_copy(dict, req_dict);
+            break;
+        case GD_OP_START_VOLUME:
+        case GD_OP_STOP_VOLUME:
+        case GD_OP_ADD_BRICK:
+        case GD_OP_REMOVE_BRICK:
+        case GD_OP_DEFRAG_BRICK_VOLUME:
+        case GD_OP_REPLACE_BRICK:
+        case GD_OP_RESET_BRICK:
+        case GD_OP_PROFILE_VOLUME: {
+            ret = dict_get_strn(dict, "volname", SLEN("volname"), &volname);
+            if (ret) {
+                gf_msg(this->name, GF_LOG_CRITICAL, errno,
+                       GD_MSG_DICT_GET_FAILED,
+                       "volname is not present in "
+                       "operation ctx");
+                goto out;
+            }
+
+            /* strcasecmp() is non-zero when volname is not "all" */
+            if (strcasecmp(volname, "all")) {
+                ret = glusterd_dict_set_volid(dict, volname, op_errstr);
+                if (ret)
+                    goto out;
+            }
+            dict_copy(dict, req_dict);
+        } break;
+
+        case GD_OP_REBALANCE: {
+            if (gd_set_commit_hash(dict) != 0) {
+                ret = -1;
+                goto out;
+            }
+            ret = dict_get_strn(dict, "volname", SLEN("volname"), &volname);
+            if (ret) {
+                gf_msg(this->name, GF_LOG_CRITICAL, errno,
+                       GD_MSG_DICT_GET_FAILED,
+                       "volname is not present in "
+                       "operation ctx");
+                goto out;
+            }
+
+            /* strcasecmp() is non-zero when volname is not "all" */
+            if (strcasecmp(volname, "all")) {
+                ret = glusterd_dict_set_volid(dict, volname, op_errstr);
+                if (ret)
+                    goto out;
+            }
+            dict_copy(dict, req_dict);
+        } break;
+
+        default:
+            break;
+    }
+
+    *req = req_dict;
+    ret = 0;
+out:
+    /* On failure the dictionary was never handed to the caller through
+     * *req, so drop the reference taken by dict_new() here.
+     */
+    if (ret && req_dict)
+        dict_unref(req_dict);
+    return ret;
+}
+
+/*
+ * Handle the reply to a GLUSTERD_MGMT_V3_BRICK_OP request.
+ *
+ * The syncargs and the peer uuid are taken from frame->local and
+ * frame->cookie. The reply is decoded, its dictionary (if any) is
+ * unserialized and, for GD_OP_DEFRAG_BRICK_VOLUME and GD_OP_PROFILE_VOLUME,
+ * aggregated into args->dict under args->lock_dict. The outcome is recorded
+ * with gd_mgmt_v3_collate_errors() and the barrier in args is woken.
+ *
+ * @req:     rpc request; rpc_status == -1 is reported as ENOTCONN
+ * @iov:     reply buffer (may be NULL when the request failed)
+ * @count:   unused here
+ * @myframe: call frame carrying the syncargs and the peer uuid
+ *
+ * Always returns 0; the result is reported through args.
+ */
+int32_t
+gd_mgmt_v3_brick_op_cbk_fn(struct rpc_req *req, struct iovec *iov, int count,
+                           void *myframe)
+{
+    int32_t ret = -1;
+    struct syncargs *args = NULL;
+    gd1_mgmt_v3_brick_op_rsp rsp = {
+        {0},
+    };
+    call_frame_t *frame = NULL;
+    int32_t op_ret = -1;
+    int32_t op_errno = -1;
+    dict_t *rsp_dict = NULL;
+    xlator_t *this = NULL;
+    uuid_t *peerid = NULL;
+
+    this = THIS;
+    GF_ASSERT(this);
+    GF_ASSERT(req);
+    GF_ASSERT(myframe);
+
+    frame = myframe;
+    args = frame->local;
+    peerid = frame->cookie;
+    frame->local = NULL;
+    frame->cookie = NULL;
+
+    /* If the operation failed, then iov can be NULL. So better check the
+       status of the operation and then worry about iov (if the status of
+       the command is success)
+    */
+    if (-1 == req->rpc_status) {
+        op_errno = ENOTCONN;
+        goto out;
+    }
+
+    GF_VALIDATE_OR_GOTO_WITH_ERROR(this->name, iov, out, op_errno, EINVAL);
+
+    ret = xdr_to_generic(*iov, &rsp, (xdrproc_t)xdr_gd1_mgmt_v3_brick_op_rsp);
+    if (ret < 0)
+        goto out;
+
+    if (rsp.dict.dict_len) {
+        /* Unserialize the dictionary */
+        rsp_dict = dict_new();
+
+        ret = dict_unserialize(rsp.dict.dict_val, rsp.dict.dict_len, &rsp_dict);
+        if (ret < 0) {
+            /* The buffer was not attached to rsp_dict, so free it here,
+             * as the commit and post-commit callbacks do.
+             */
+            free(rsp.dict.dict_val);
+            goto out;
+        } else {
+            rsp_dict->extra_stdfree = rsp.dict.dict_val;
+        }
+    }
+
+    /* From here on ret only reports the aggregation result. Clear it so a
+     * non-negative value left by the decode step is not mistaken for an
+     * aggregation failure when no aggregation is attempted.
+     */
+    ret = 0;
+
+    gf_uuid_copy(args->uuid, rsp.uuid);
+    pthread_mutex_lock(&args->lock_dict);
+    {
+        if (rsp.op == GD_OP_DEFRAG_BRICK_VOLUME ||
+            rsp.op == GD_OP_PROFILE_VOLUME)
+            ret = glusterd_syncop_aggr_rsp_dict(rsp.op, args->dict, rsp_dict);
+    }
+    pthread_mutex_unlock(&args->lock_dict);
+
+    if (ret) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_RESP_AGGR_FAIL, "%s",
+               "Failed to aggregate response from "
+               " node/brick");
+        /* Prefer the peer's own failure over the aggregation failure. */
+        if (!rsp.op_ret)
+            op_ret = ret;
+        else {
+            op_ret = rsp.op_ret;
+            op_errno = rsp.op_errno;
+        }
+    } else {
+        op_ret = rsp.op_ret;
+        op_errno = rsp.op_errno;
+    }
+
+out:
+
+    gd_mgmt_v3_collate_errors(args, op_ret, op_errno, rsp.op_errstr,
+                              GLUSTERD_MGMT_V3_BRICK_OP, *peerid, rsp.uuid);
+
+    if (rsp.op_errstr)
+        free(rsp.op_errstr);
+
+    if (rsp_dict)
+        dict_unref(rsp_dict);
+
+    GF_FREE(peerid);
+    /* req->rpc_status set to -1 means, STACK_DESTROY will be called from
+     * the caller function.
+     */
+    if (req->rpc_status != -1)
+        STACK_DESTROY(frame->root);
+    synctask_barrier_wake(args);
+    return 0;
+}
+
+/*
+ * RPC callback registered for GLUSTERD_MGMT_V3_BRICK_OP requests.
+ * Hands the reply to glusterd_big_locked_cbk() together with
+ * gd_mgmt_v3_brick_op_cbk_fn, which does the actual processing.
+ */
+int32_t
+gd_mgmt_v3_brick_op_cbk(struct rpc_req *req, struct iovec *iov, int count,
+                        void *myframe)
+{
+    int32_t status = glusterd_big_locked_cbk(req, iov, count, myframe,
+                                             gd_mgmt_v3_brick_op_cbk_fn);
+
+    return status;
+}
+
+/*
+ * Serialize op_ctx and submit a GLUSTERD_MGMT_V3_BRICK_OP request to a peer.
+ *
+ * @op:        operation code placed in the request
+ * @op_ctx:    dictionary serialized into the request
+ * @peerinfo:  target peer; its rpc handle and uuid are used
+ * @args:      syncargs handed to gd_syncop_submit_request()
+ * @my_uuid:   uuid copied into the request as the sender
+ * @recv_uuid: not used by this function
+ *
+ * Returns the result of gd_syncop_submit_request(), or non-zero if the
+ * request could not be prepared.
+ */
+int
+gd_mgmt_v3_brick_op_req(glusterd_op_t op, dict_t *op_ctx,
+                        glusterd_peerinfo_t *peerinfo, struct syncargs *args,
+                        uuid_t my_uuid, uuid_t recv_uuid)
+{
+    int32_t ret = -1;
+    gd1_mgmt_v3_brick_op_req req = {
+        {0},
+    };
+    xlator_t *this = NULL;
+    /* Plain NULL, matching the other mgmt_v3 request functions. */
+    uuid_t *peerid = NULL;
+
+    this = THIS;
+    GF_ASSERT(this);
+    GF_ASSERT(op_ctx);
+    GF_ASSERT(peerinfo);
+    GF_ASSERT(args);
+
+    ret = dict_allocate_and_serialize(op_ctx, &req.dict.dict_val,
+                                      &req.dict.dict_len);
+    if (ret) {
+        gf_smsg(this->name, GF_LOG_ERROR, errno,
+                GD_MSG_DICT_ALLOC_AND_SERL_LENGTH_GET_FAIL, NULL);
+        goto out;
+    }
+
+    gf_uuid_copy(req.uuid, my_uuid);
+    req.op = op;
+
+    /* Heap copy of the peer uuid; it is passed along with the request and
+     * the brick-op callback reads it back from frame->cookie and frees it.
+     */
+    GD_ALLOC_COPY_UUID(peerid, peerinfo->uuid, ret);
+    if (ret) {
+        gf_smsg(this->name, GF_LOG_ERROR, errno,
+                GD_MSG_ALLOC_AND_COPY_UUID_FAIL, NULL);
+        goto out;
+    }
+
+    ret = gd_syncop_submit_request(peerinfo->rpc, &req, args, peerid,
+                                   &gd_mgmt_v3_prog, GLUSTERD_MGMT_V3_BRICK_OP,
+                                   gd_mgmt_v3_brick_op_cbk,
+                                   (xdrproc_t)xdr_gd1_mgmt_v3_brick_op_req);
+out:
+    /* The serialized dictionary is only needed while submitting. */
+    GF_FREE(req.dict.dict_val);
+    gf_msg_trace(this->name, 0, "Returning %d", ret);
+    return ret;
+}
+
+/*
+ * Run the brick-op phase of a mgmt_v3 transaction.
+ *
+ * The brick op is performed on the local node first; for
+ * GD_OP_DEFRAG_BRICK_VOLUME and GD_OP_PROFILE_VOLUME the local response is
+ * aggregated into op_ctx. The request is then sent to every connected peer
+ * whose generation is not newer than txn_generation (and which is
+ * befriended, unless op is GD_OP_SYNC_VOLUME) and the replies are awaited.
+ *
+ * @op:             operation being performed
+ * @op_ctx:         operation context; receives aggregated responses
+ * @req_dict:       request payload sent to the peers
+ * @op_errstr:      out: error string on failure (may stay NULL)
+ * @txn_generation: generation recorded when the transaction started
+ *
+ * Returns 0 on success, non-zero on failure.
+ */
+int
+glusterd_mgmt_v3_brick_op(glusterd_op_t op, dict_t *op_ctx, dict_t *req_dict,
+                          char **op_errstr, uint32_t txn_generation)
+{
+    int32_t ret = -1;
+    int32_t peer_cnt = 0;
+    dict_t *rsp_dict = NULL;
+    glusterd_peerinfo_t *peerinfo = NULL;
+    struct syncargs args = {0};
+    uuid_t peer_uuid = {0};
+    xlator_t *this = NULL;
+    glusterd_conf_t *conf = NULL;
+
+    this = THIS;
+    GF_ASSERT(this);
+    conf = this->private;
+    GF_ASSERT(conf);
+
+    GF_ASSERT(req_dict);
+    GF_ASSERT(op_errstr);
+
+    rsp_dict = dict_new();
+    if (!rsp_dict) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_DICT_CREATE_FAIL,
+               "Failed to create response dictionary");
+        goto out;
+    }
+
+    /* Perform brick op on local node */
+    ret = gd_mgmt_v3_brick_op_fn(op, req_dict, op_errstr, rsp_dict);
+
+    if (ret) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_BRICK_OP_FAIL,
+               "Brick ops failed for "
+               "operation %s on local node",
+               gd_op_list[op]);
+
+        /* Supply a generic message only if the local step gave none. */
+        if (*op_errstr == NULL) {
+            ret = gf_asprintf(op_errstr,
+                              "Brick ops failed "
+                              "on localhost. Please "
+                              "check log file for details");
+            if (ret == -1)
+                *op_errstr = NULL;
+
+            ret = -1;
+        }
+        goto out;
+    }
+    if (op == GD_OP_DEFRAG_BRICK_VOLUME || op == GD_OP_PROFILE_VOLUME) {
+        ret = glusterd_syncop_aggr_rsp_dict(op, op_ctx, rsp_dict);
+        if (ret) {
+            gf_log(this->name, GF_LOG_ERROR, "%s",
+                   "Failed to aggregate response from "
+                   " node/brick");
+            goto out;
+        }
+    }
+
+    dict_unref(rsp_dict);
+    rsp_dict = NULL;
+
+    /* Sending brick op req to other nodes in the cluster */
+    gd_syncargs_init(&args, op_ctx);
+    ret = synctask_barrier_init((&args));
+    if (ret)
+        goto out;
+
+    peer_cnt = 0;
+
+    RCU_READ_LOCK;
+    cds_list_for_each_entry_rcu(peerinfo, &conf->peers, uuid_list)
+    {
+        /* Only send requests to peers who were available before the
+         * transaction started
+         */
+        if (peerinfo->generation > txn_generation)
+            continue;
+
+        if (!peerinfo->connected)
+            continue;
+        if (op != GD_OP_SYNC_VOLUME &&
+            peerinfo->state.state != GD_FRIEND_STATE_BEFRIENDED)
+            continue;
+
+        /* NOTE(review): the return value is ignored and the peer is counted
+         * regardless; confirm a failed submit still wakes the barrier. */
+        gd_mgmt_v3_brick_op_req(op, req_dict, peerinfo, &args, MY_UUID,
+                                peer_uuid);
+        peer_cnt++;
+    }
+    RCU_READ_UNLOCK;
+
+    if (0 == peer_cnt) {
+        ret = 0;
+        goto out;
+    }
+
+    gd_synctask_barrier_wait((&args), peer_cnt);
+
+    if (args.op_ret) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_BRICK_OP_FAIL,
+               "Brick ops failed on peers");
+
+        if (args.errstr)
+            *op_errstr = gf_strdup(args.errstr);
+    }
+
+    ret = args.op_ret;
+
+    gf_msg_debug(this->name, 0,
+                 "Sent brick op req for %s "
+                 "to %d peers. Returning %d",
+                 gd_op_list[op], peer_cnt, ret);
+out:
+    /* Error exits taken before the unref above still hold the local
+     * response dictionary; release it here.
+     */
+    if (rsp_dict)
+        dict_unref(rsp_dict);
+    return ret;
+}
+
+/*
+ * Handle the reply to a GLUSTERD_MGMT_V3_COMMIT request.
+ *
+ * The syncargs and the peer uuid are taken from frame->local and
+ * frame->cookie. The reply is decoded, its dictionary (if any) is
+ * unserialized and aggregated into args->dict under args->lock_dict. The
+ * outcome is recorded with gd_mgmt_v3_collate_errors() and the barrier in
+ * args is woken.
+ *
+ * Always returns 0; the result is reported through args.
+ */
+int32_t
+gd_mgmt_v3_commit_cbk_fn(struct rpc_req *req, struct iovec *iov, int count,
+                         void *myframe)
+{
+    int32_t ret = -1;
+    struct syncargs *args = NULL;
+    gd1_mgmt_v3_commit_rsp rsp = {
+        {0},
+    };
+    call_frame_t *frame = NULL;
+    int32_t op_ret = -1;
+    int32_t op_errno = -1;
+    dict_t *rsp_dict = NULL;
+    xlator_t *this = NULL;
+    uuid_t *peerid = NULL;
+
+    this = THIS;
+    GF_ASSERT(this);
+    GF_ASSERT(req);
+    GF_ASSERT(myframe);
+
+    /* Take over the syncargs and peer uuid and detach them from the frame. */
+    frame = myframe;
+    args = frame->local;
+    peerid = frame->cookie;
+    frame->local = NULL;
+    frame->cookie = NULL;
+
+    /* A request that failed at the rpc level is reported as ENOTCONN. */
+    if (-1 == req->rpc_status) {
+        op_errno = ENOTCONN;
+        goto out;
+    }
+
+    GF_VALIDATE_OR_GOTO_WITH_ERROR(this->name, iov, out, op_errno, EINVAL);
+
+    ret = xdr_to_generic(*iov, &rsp, (xdrproc_t)xdr_gd1_mgmt_v3_commit_rsp);
+    if (ret < 0)
+        goto out;
+
+    if (rsp.dict.dict_len) {
+        /* Unserialize the dictionary */
+        rsp_dict = dict_new();
+
+        ret = dict_unserialize(rsp.dict.dict_val, rsp.dict.dict_len, &rsp_dict);
+        if (ret < 0) {
+            free(rsp.dict.dict_val);
+            goto out;
+        } else {
+            /* Attach the buffer to rsp_dict; presumably it is released
+             * together with the dictionary. */
+            rsp_dict->extra_stdfree = rsp.dict.dict_val;
+        }
+    }
+
+    gf_uuid_copy(args->uuid, rsp.uuid);
+    pthread_mutex_lock(&args->lock_dict);
+    {
+        ret = glusterd_syncop_aggr_rsp_dict(rsp.op, args->dict, rsp_dict);
+    }
+    pthread_mutex_unlock(&args->lock_dict);
+
+    if (ret) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_RESP_AGGR_FAIL, "%s",
+               "Failed to aggregate response from "
+               " node/brick");
+        /* Prefer the peer's own failure over the aggregation failure. */
+        if (!rsp.op_ret)
+            op_ret = ret;
+        else {
+            op_ret = rsp.op_ret;
+            op_errno = rsp.op_errno;
+        }
+    } else {
+        op_ret = rsp.op_ret;
+        op_errno = rsp.op_errno;
+    }
+
+out:
+    if (rsp_dict)
+        dict_unref(rsp_dict);
+
+    /* NOTE(review): peerid is dereferenced without a NULL check. */
+    gd_mgmt_v3_collate_errors(args, op_ret, op_errno, rsp.op_errstr,
+                              GLUSTERD_MGMT_V3_COMMIT, *peerid, rsp.uuid);
+    GF_FREE(peerid);
+
+    if (rsp.op_errstr)
+        free(rsp.op_errstr);
+
+    /* req->rpc_status set to -1 means, STACK_DESTROY will be called from
+     * the caller function.
+     */
+    if (req->rpc_status != -1)
+        STACK_DESTROY(frame->root);
+    synctask_barrier_wake(args);
+    return 0;
+}
+
+/*
+ * RPC callback registered for GLUSTERD_MGMT_V3_COMMIT requests.
+ * Hands the reply to glusterd_big_locked_cbk() together with
+ * gd_mgmt_v3_commit_cbk_fn, which does the actual processing.
+ */
+int32_t
+gd_mgmt_v3_commit_cbk(struct rpc_req *req, struct iovec *iov, int count,
+                      void *myframe)
+{
+    int32_t status = glusterd_big_locked_cbk(req, iov, count, myframe,
+                                             gd_mgmt_v3_commit_cbk_fn);
+
+    return status;
+}
+
+/*
+ * Serialize op_ctx and submit a GLUSTERD_MGMT_V3_COMMIT request to a peer.
+ *
+ * @op:        operation code placed in the request
+ * @op_ctx:    dictionary serialized into the request
+ * @peerinfo:  target peer; its rpc handle and uuid are used
+ * @args:      syncargs handed to gd_syncop_submit_request()
+ * @my_uuid:   uuid copied into the request as the sender
+ * @recv_uuid: not used by this function
+ *
+ * Returns the result of gd_syncop_submit_request(), or non-zero if the
+ * request could not be prepared.
+ */
+int
+gd_mgmt_v3_commit_req(glusterd_op_t op, dict_t *op_ctx,
+                      glusterd_peerinfo_t *peerinfo, struct syncargs *args,
+                      uuid_t my_uuid, uuid_t recv_uuid)
+{
+    xlator_t *this = THIS;
+    uuid_t *peerid = NULL;
+    int32_t ret = -1;
+    gd1_mgmt_v3_commit_req req = {
+        {0},
+    };
+
+    GF_ASSERT(this);
+    GF_ASSERT(op_ctx);
+    GF_ASSERT(peerinfo);
+    GF_ASSERT(args);
+
+    /* Step 1: flatten the operation context into the request. */
+    ret = dict_allocate_and_serialize(op_ctx, &req.dict.dict_val,
+                                      &req.dict.dict_len);
+    if (ret != 0) {
+        gf_smsg(this->name, GF_LOG_ERROR, errno,
+                GD_MSG_DICT_ALLOC_AND_SERL_LENGTH_GET_FAIL, NULL);
+        goto out;
+    }
+
+    /* Step 2: fill in the fixed request fields. */
+    req.op = op;
+    gf_uuid_copy(req.uuid, my_uuid);
+
+    /* Step 3: heap copy of the peer uuid that travels with the request. */
+    GD_ALLOC_COPY_UUID(peerid, peerinfo->uuid, ret);
+    if (ret != 0) {
+        gf_smsg(this->name, GF_LOG_ERROR, errno,
+                GD_MSG_ALLOC_AND_COPY_UUID_FAIL, NULL);
+        goto out;
+    }
+
+    /* Step 4: hand everything to the rpc layer. */
+    ret = gd_syncop_submit_request(peerinfo->rpc, &req, args, peerid,
+                                   &gd_mgmt_v3_prog, GLUSTERD_MGMT_V3_COMMIT,
+                                   gd_mgmt_v3_commit_cbk,
+                                   (xdrproc_t)xdr_gd1_mgmt_v3_commit_req);
+
+out:
+    GF_FREE(req.dict.dict_val);
+    gf_msg_trace(this->name, 0, "Returning %d", ret);
+    return ret;
+}
+
+/*
+ * Run the commit phase of a mgmt_v3 transaction.
+ *
+ * For rebalance/defrag and remove-brick the rebalance id is first set in
+ * op_ctx (a failure there is only logged). The operation is then committed
+ * on the local node and its response aggregated into op_ctx. Finally the
+ * request is sent to every connected peer whose generation is not newer
+ * than txn_generation (and which is befriended, unless op is
+ * GD_OP_SYNC_VOLUME) and the replies are awaited.
+ * glusterd_op_modify_op_ctx() is called on every exit path.
+ *
+ * @op:             operation being committed
+ * @op_ctx:         operation context; receives aggregated responses
+ * @req_dict:       request payload sent to the peers
+ * @op_errstr:      out: error string on failure (may stay NULL)
+ * @op_errno:       out: set from the collated peer replies
+ * @txn_generation: generation recorded when the transaction started
+ *
+ * Returns 0 on success, non-zero on failure.
+ */
+int
+glusterd_mgmt_v3_commit(glusterd_op_t op, dict_t *op_ctx, dict_t *req_dict,
+                        char **op_errstr, uint32_t *op_errno,
+                        uint32_t txn_generation)
+{
+    int32_t ret = -1;
+    int32_t peer_cnt = 0;
+    dict_t *rsp_dict = NULL;
+    glusterd_peerinfo_t *peerinfo = NULL;
+    struct syncargs args = {0};
+    uuid_t peer_uuid = {0};
+    xlator_t *this = NULL;
+    glusterd_conf_t *conf = NULL;
+
+    this = THIS;
+    GF_ASSERT(this);
+    conf = this->private;
+    GF_ASSERT(conf);
+
+    GF_ASSERT(op_ctx);
+    GF_ASSERT(req_dict);
+    GF_ASSERT(op_errstr);
+    GF_VALIDATE_OR_GOTO(this->name, op_errno, out);
+
+    switch (op) {
+        case GD_OP_REBALANCE:
+        case GD_OP_DEFRAG_BRICK_VOLUME:
+
+            ret = glusterd_set_rebalance_id_in_rsp_dict(req_dict, op_ctx);
+            if (ret) {
+                gf_log(this->name, GF_LOG_WARNING,
+                       "Failed to set rebalance id in dict.");
+            }
+            break;
+        case GD_OP_REMOVE_BRICK:
+            ret = glusterd_set_rebalance_id_for_remove_brick(req_dict, op_ctx);
+            if (ret) {
+                gf_log(this->name, GF_LOG_WARNING,
+                       "Failed to set rebalance id for remove-brick in dict.");
+            }
+            break;
+        default:
+            break;
+    }
+    rsp_dict = dict_new();
+    if (!rsp_dict) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_DICT_CREATE_FAIL,
+               "Failed to create response dictionary");
+        goto out;
+    }
+
+    /* Commit on local node */
+    ret = gd_mgmt_v3_commit_fn(op, req_dict, op_errstr, op_errno, rsp_dict);
+
+    if (ret) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_COMMIT_OP_FAIL,
+               "Commit failed for "
+               "operation %s on local node",
+               gd_op_list[op]);
+
+        /* Supply a generic message only if the local step gave none. */
+        if (*op_errstr == NULL) {
+            ret = gf_asprintf(op_errstr,
+                              "Commit failed "
+                              "on localhost. Please "
+                              "check log file for details.");
+            if (ret == -1)
+                *op_errstr = NULL;
+
+            ret = -1;
+        }
+        goto out;
+    }
+
+    ret = glusterd_syncop_aggr_rsp_dict(op, op_ctx, rsp_dict);
+    if (ret) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_RESP_AGGR_FAIL, "%s",
+               "Failed to aggregate response from "
+               " node/brick");
+        goto out;
+    }
+
+    dict_unref(rsp_dict);
+    rsp_dict = NULL;
+
+    /* Sending commit req to other nodes in the cluster */
+    gd_syncargs_init(&args, op_ctx);
+    ret = synctask_barrier_init((&args));
+    if (ret)
+        goto out;
+    peer_cnt = 0;
+
+    RCU_READ_LOCK;
+    cds_list_for_each_entry_rcu(peerinfo, &conf->peers, uuid_list)
+    {
+        /* Only send requests to peers who were available before the
+         * transaction started
+         */
+        if (peerinfo->generation > txn_generation)
+            continue;
+        if (!peerinfo->connected)
+            continue;
+
+        if (op != GD_OP_SYNC_VOLUME &&
+            peerinfo->state.state != GD_FRIEND_STATE_BEFRIENDED)
+            continue;
+
+        /* NOTE(review): the return value is ignored and the peer is counted
+         * regardless; confirm a failed submit still wakes the barrier. */
+        gd_mgmt_v3_commit_req(op, req_dict, peerinfo, &args, MY_UUID,
+                              peer_uuid);
+        peer_cnt++;
+    }
+    RCU_READ_UNLOCK;
+
+    if (0 == peer_cnt) {
+        ret = 0;
+        goto out;
+    }
+
+    gd_synctask_barrier_wait((&args), peer_cnt);
+
+    if (args.op_ret) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_COMMIT_OP_FAIL,
+               "Commit failed on peers");
+
+        if (args.errstr)
+            *op_errstr = gf_strdup(args.errstr);
+    }
+
+    ret = args.op_ret;
+    *op_errno = args.op_errno;
+
+    gf_msg_debug(this->name, 0,
+                 "Sent commit req for %s to %d "
+                 "peers. Returning %d",
+                 gd_op_list[op], peer_cnt, ret);
+out:
+    /* Error exits taken before the unref above still hold the local
+     * response dictionary; release it here.
+     */
+    if (rsp_dict)
+        dict_unref(rsp_dict);
+    glusterd_op_modify_op_ctx(op, op_ctx);
+    return ret;
+}
+
+/*
+ * Handle the reply to a GLUSTERD_MGMT_V3_POST_COMMIT request.
+ *
+ * The syncargs and the peer uuid are taken from frame->local and
+ * frame->cookie. The reply is decoded, its dictionary (if any) is
+ * unserialized and aggregated into args->dict under args->lock_dict. The
+ * outcome is recorded with gd_mgmt_v3_collate_errors() and the barrier in
+ * args is woken.
+ *
+ * Always returns 0; the result is reported through args.
+ */
+int32_t
+gd_mgmt_v3_post_commit_cbk_fn(struct rpc_req *req, struct iovec *iov, int count,
+                              void *myframe)
+{
+    int32_t ret = -1;
+    struct syncargs *args = NULL;
+    gd1_mgmt_v3_post_commit_rsp rsp = {
+        {0},
+    };
+    call_frame_t *frame = NULL;
+    int32_t op_ret = -1;
+    int32_t op_errno = -1;
+    dict_t *rsp_dict = NULL;
+    xlator_t *this = NULL;
+    uuid_t *peerid = NULL;
+
+    this = THIS;
+    GF_ASSERT(this);
+    GF_ASSERT(req);
+    GF_ASSERT(myframe);
+
+    /* Take over the syncargs and peer uuid and detach them from the frame. */
+    frame = myframe;
+    args = frame->local;
+    peerid = frame->cookie;
+    frame->local = NULL;
+    frame->cookie = NULL;
+
+    /* A request that failed at the rpc level is reported as ENOTCONN. */
+    if (-1 == req->rpc_status) {
+        op_errno = ENOTCONN;
+        goto out;
+    }
+
+    GF_VALIDATE_OR_GOTO_WITH_ERROR(this->name, iov, out, op_errno, EINVAL);
+
+    ret = xdr_to_generic(*iov, &rsp,
+                         (xdrproc_t)xdr_gd1_mgmt_v3_post_commit_rsp);
+    if (ret < 0)
+        goto out;
+
+    if (rsp.dict.dict_len) {
+        /* Unserialize the dictionary */
+        rsp_dict = dict_new();
+
+        ret = dict_unserialize(rsp.dict.dict_val, rsp.dict.dict_len, &rsp_dict);
+        if (ret < 0) {
+            free(rsp.dict.dict_val);
+            goto out;
+        } else {
+            /* Attach the buffer to rsp_dict; presumably it is released
+             * together with the dictionary. */
+            rsp_dict->extra_stdfree = rsp.dict.dict_val;
+        }
+    }
+
+    gf_uuid_copy(args->uuid, rsp.uuid);
+    pthread_mutex_lock(&args->lock_dict);
+    {
+        ret = glusterd_syncop_aggr_rsp_dict(rsp.op, args->dict, rsp_dict);
+    }
+    pthread_mutex_unlock(&args->lock_dict);
+
+    if (ret) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_RESP_AGGR_FAIL, "%s",
+               "Failed to aggregate response from "
+               " node/brick");
+        /* Prefer the peer's own failure over the aggregation failure. */
+        if (!rsp.op_ret)
+            op_ret = ret;
+        else {
+            op_ret = rsp.op_ret;
+            op_errno = rsp.op_errno;
+        }
+    } else {
+        op_ret = rsp.op_ret;
+        op_errno = rsp.op_errno;
+    }
+
+out:
+    if (rsp_dict)
+        dict_unref(rsp_dict);
+
+    /* NOTE(review): peerid is dereferenced without a NULL check. */
+    gd_mgmt_v3_collate_errors(args, op_ret, op_errno, rsp.op_errstr,
+                              GLUSTERD_MGMT_V3_POST_COMMIT, *peerid, rsp.uuid);
+    GF_FREE(peerid);
+
+    if (rsp.op_errstr)
+        free(rsp.op_errstr);
+
+    /* req->rpc_status set to -1 means, STACK_DESTROY will be called from
+     * the caller function.
+     */
+    if (req->rpc_status != -1)
+        STACK_DESTROY(frame->root);
+    synctask_barrier_wake(args);
+    return 0;
+}
+
+/*
+ * RPC callback registered for GLUSTERD_MGMT_V3_POST_COMMIT requests.
+ * Hands the reply to glusterd_big_locked_cbk() together with
+ * gd_mgmt_v3_post_commit_cbk_fn, which does the actual processing.
+ */
+int32_t
+gd_mgmt_v3_post_commit_cbk(struct rpc_req *req, struct iovec *iov, int count,
+                           void *myframe)
+{
+    int32_t status = glusterd_big_locked_cbk(req, iov, count, myframe,
+                                             gd_mgmt_v3_post_commit_cbk_fn);
+
+    return status;
+}
+
+/*
+ * Serialize op_ctx and submit a GLUSTERD_MGMT_V3_POST_COMMIT request to a
+ * peer.
+ *
+ * @op:        operation code placed in the request
+ * @op_ctx:    dictionary serialized into the request
+ * @peerinfo:  target peer; its rpc handle and uuid are used
+ * @args:      syncargs handed to gd_syncop_submit_request()
+ * @my_uuid:   uuid copied into the request as the sender
+ * @recv_uuid: not used by this function
+ *
+ * Returns the result of gd_syncop_submit_request(), or non-zero if the
+ * request could not be prepared.
+ */
+int
+gd_mgmt_v3_post_commit_req(glusterd_op_t op, dict_t *op_ctx,
+                           glusterd_peerinfo_t *peerinfo, struct syncargs *args,
+                           uuid_t my_uuid, uuid_t recv_uuid)
+{
+    xlator_t *this = THIS;
+    uuid_t *peerid = NULL;
+    int32_t ret = -1;
+    gd1_mgmt_v3_post_commit_req req = {
+        {0},
+    };
+
+    GF_ASSERT(this);
+    GF_ASSERT(op_ctx);
+    GF_ASSERT(peerinfo);
+    GF_ASSERT(args);
+
+    /* Step 1: flatten the operation context into the request. */
+    ret = dict_allocate_and_serialize(op_ctx, &req.dict.dict_val,
+                                      &req.dict.dict_len);
+    if (ret != 0) {
+        gf_smsg(this->name, GF_LOG_ERROR, errno,
+                GD_MSG_DICT_ALLOC_AND_SERL_LENGTH_GET_FAIL, NULL);
+        goto out;
+    }
+
+    /* Step 2: fill in the fixed request fields. */
+    req.op = op;
+    gf_uuid_copy(req.uuid, my_uuid);
+
+    /* Step 3: heap copy of the peer uuid that travels with the request. */
+    GD_ALLOC_COPY_UUID(peerid, peerinfo->uuid, ret);
+    if (ret != 0) {
+        gf_smsg(this->name, GF_LOG_ERROR, errno,
+                GD_MSG_ALLOC_AND_COPY_UUID_FAIL, NULL);
+        goto out;
+    }
+
+    /* Step 4: hand everything to the rpc layer. */
+    ret = gd_syncop_submit_request(
+        peerinfo->rpc, &req, args, peerid, &gd_mgmt_v3_prog,
+        GLUSTERD_MGMT_V3_POST_COMMIT, gd_mgmt_v3_post_commit_cbk,
+        (xdrproc_t)xdr_gd1_mgmt_v3_post_commit_req);
+
+out:
+    GF_FREE(req.dict.dict_val);
+    gf_msg_trace(this->name, 0, "Returning %d", ret);
+    return ret;
+}
+
+/*
+ * Run the post-commit phase of a mgmt_v3 transaction.
+ *
+ * The post-commit step is executed on the local node and its response
+ * aggregated into op_ctx. The request is then sent to every connected peer
+ * whose generation is not newer than txn_generation (and which is
+ * befriended, unless op is GD_OP_SYNC_VOLUME) and the replies are awaited.
+ * glusterd_op_modify_op_ctx() is called on every exit path.
+ *
+ * @op:             operation being post-committed
+ * @op_ctx:         operation context; receives aggregated responses
+ * @req_dict:       request payload sent to the peers
+ * @op_errstr:      out: error string on failure (may stay NULL)
+ * @op_errno:       out: set from the collated peer replies
+ * @txn_generation: generation recorded when the transaction started
+ *
+ * Returns 0 on success, non-zero on failure.
+ */
+int
+glusterd_mgmt_v3_post_commit(glusterd_op_t op, dict_t *op_ctx, dict_t *req_dict,
+                             char **op_errstr, uint32_t *op_errno,
+                             uint32_t txn_generation)
+{
+    int32_t ret = -1;
+    int32_t peer_cnt = 0;
+    dict_t *rsp_dict = NULL;
+    glusterd_peerinfo_t *peerinfo = NULL;
+    struct syncargs args = {0};
+    uuid_t peer_uuid = {0};
+    xlator_t *this = NULL;
+    glusterd_conf_t *conf = NULL;
+
+    this = THIS;
+    GF_ASSERT(this);
+    conf = this->private;
+    GF_ASSERT(conf);
+
+    GF_ASSERT(op_ctx);
+    GF_ASSERT(req_dict);
+    GF_ASSERT(op_errstr);
+    GF_VALIDATE_OR_GOTO(this->name, op_errno, out);
+
+    rsp_dict = dict_new();
+    if (!rsp_dict) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_DICT_CREATE_FAIL,
+               "Failed to create response dictionary");
+        goto out;
+    }
+
+    /* Post commit on local node */
+    ret = gd_mgmt_v3_post_commit_fn(op, req_dict, op_errstr, op_errno,
+                                    rsp_dict);
+
+    if (ret) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_POST_COMMIT_OP_FAIL,
+               "Post commit failed for "
+               "operation %s on local node",
+               gd_op_list[op]);
+
+        /* Supply a generic message only if the local step gave none. */
+        if (*op_errstr == NULL) {
+            ret = gf_asprintf(op_errstr,
+                              "Post commit failed "
+                              "on localhost. Please "
+                              "check log file for details.");
+            if (ret == -1)
+                *op_errstr = NULL;
+
+            ret = -1;
+        }
+        goto out;
+    }
+
+    ret = glusterd_syncop_aggr_rsp_dict(op, op_ctx, rsp_dict);
+    if (ret) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_RESP_AGGR_FAIL, "%s",
+               "Failed to aggregate response from "
+               " node/brick");
+        goto out;
+    }
+
+    dict_unref(rsp_dict);
+    rsp_dict = NULL;
+
+    /* Sending post commit req to other nodes in the cluster */
+    gd_syncargs_init(&args, op_ctx);
+    ret = synctask_barrier_init((&args));
+    if (ret)
+        goto out;
+    peer_cnt = 0;
+
+    RCU_READ_LOCK;
+    cds_list_for_each_entry_rcu(peerinfo, &conf->peers, uuid_list)
+    {
+        /* Only send requests to peers who were available before the
+         * transaction started
+         */
+        if (peerinfo->generation > txn_generation)
+            continue;
+        if (!peerinfo->connected)
+            continue;
+
+        if (op != GD_OP_SYNC_VOLUME &&
+            peerinfo->state.state != GD_FRIEND_STATE_BEFRIENDED)
+            continue;
+
+        /* NOTE(review): the return value is ignored and the peer is counted
+         * regardless; confirm a failed submit still wakes the barrier. */
+        gd_mgmt_v3_post_commit_req(op, req_dict, peerinfo, &args, MY_UUID,
+                                   peer_uuid);
+        peer_cnt++;
+    }
+    RCU_READ_UNLOCK;
+
+    if (0 == peer_cnt) {
+        ret = 0;
+        goto out;
+    }
+
+    gd_synctask_barrier_wait((&args), peer_cnt);
+
+    if (args.op_ret) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_POST_COMMIT_OP_FAIL,
+               "Post commit failed on peers");
+
+        if (args.errstr)
+            *op_errstr = gf_strdup(args.errstr);
+    }
+
+    ret = args.op_ret;
+    *op_errno = args.op_errno;
+
+    gf_msg_debug(this->name, 0,
+                 "Sent post commit req for %s to %d "
+                 "peers. Returning %d",
+                 gd_op_list[op], peer_cnt, ret);
+out:
+    /* Error exits taken before the unref above still hold the local
+     * response dictionary; release it here.
+     */
+    if (rsp_dict)
+        dict_unref(rsp_dict);
+    glusterd_op_modify_op_ctx(op, op_ctx);
+    return ret;
+}
+
+/*
+ * Handle the reply to a GLUSTERD_MGMT_V3_POST_VALIDATE request.
+ *
+ * The syncargs and the peer uuid are taken from frame->local and
+ * frame->cookie. The reply is decoded and its op_ret/op_errno are recorded
+ * with gd_mgmt_v3_collate_errors(); the reply dictionary is not used, only
+ * freed. The barrier in args is woken before returning.
+ *
+ * Always returns 0; the result is reported through args.
+ */
+int32_t
+gd_mgmt_v3_post_validate_cbk_fn(struct rpc_req *req, struct iovec *iov,
+                                int count, void *myframe)
+{
+    int32_t ret = -1;
+    struct syncargs *args = NULL;
+    gd1_mgmt_v3_post_val_rsp rsp = {
+        {0},
+    };
+    call_frame_t *frame = NULL;
+    int32_t op_ret = -1;
+    int32_t op_errno = -1;
+    xlator_t *this = NULL;
+    uuid_t *peerid = NULL;
+
+    this = THIS;
+    GF_ASSERT(this);
+    GF_ASSERT(req);
+    GF_ASSERT(myframe);
+
+    /* Take over the syncargs and peer uuid and detach them from the frame. */
+    frame = myframe;
+    args = frame->local;
+    peerid = frame->cookie;
+    frame->local = NULL;
+    frame->cookie = NULL;
+
+    /* A request that failed at the rpc level is reported as ENOTCONN. */
+    if (-1 == req->rpc_status) {
+        op_errno = ENOTCONN;
+        goto out;
+    }
+
+    GF_VALIDATE_OR_GOTO_WITH_ERROR(this->name, iov, out, op_errno, EINVAL);
+
+    ret = xdr_to_generic(*iov, &rsp, (xdrproc_t)xdr_gd1_mgmt_v3_post_val_rsp);
+    if (ret < 0)
+        goto out;
+
+    gf_uuid_copy(args->uuid, rsp.uuid);
+
+    op_ret = rsp.op_ret;
+    op_errno = rsp.op_errno;
+
+out:
+    /* NOTE(review): peerid is dereferenced without a NULL check. */
+    gd_mgmt_v3_collate_errors(args, op_ret, op_errno, rsp.op_errstr,
+                              GLUSTERD_MGMT_V3_POST_VALIDATE, *peerid,
+                              rsp.uuid);
+    if (rsp.op_errstr)
+        free(rsp.op_errstr);
+
+    /* The decoded dictionary buffer is never unserialized here. */
+    if (rsp.dict.dict_val)
+        free(rsp.dict.dict_val);
+    GF_FREE(peerid);
+    /* req->rpc_status set to -1 means, STACK_DESTROY will be called from
+     * the caller function.
+     */
+    if (req->rpc_status != -1)
+        STACK_DESTROY(frame->root);
+    synctask_barrier_wake(args);
+    return 0;
+}
+
+/*
+ * RPC callback registered for GLUSTERD_MGMT_V3_POST_VALIDATE requests.
+ * Hands the reply to glusterd_big_locked_cbk() together with
+ * gd_mgmt_v3_post_validate_cbk_fn, which does the actual processing.
+ */
+int32_t
+gd_mgmt_v3_post_validate_cbk(struct rpc_req *req, struct iovec *iov, int count,
+                             void *myframe)
+{
+    int32_t status = glusterd_big_locked_cbk(req, iov, count, myframe,
+                                             gd_mgmt_v3_post_validate_cbk_fn);
+
+    return status;
+}
+
+/*
+ * Serialize op_ctx and submit a GLUSTERD_MGMT_V3_POST_VALIDATE request to a
+ * peer.
+ *
+ * @op:        operation code placed in the request
+ * @op_ret:    result value placed in the request
+ * @op_ctx:    dictionary serialized into the request
+ * @peerinfo:  target peer; its rpc handle and uuid are used
+ * @args:      syncargs handed to gd_syncop_submit_request()
+ * @my_uuid:   uuid copied into the request as the sender
+ * @recv_uuid: not used by this function
+ *
+ * Returns the result of gd_syncop_submit_request(), or non-zero if the
+ * request could not be prepared.
+ */
+int
+gd_mgmt_v3_post_validate_req(glusterd_op_t op, int32_t op_ret, dict_t *op_ctx,
+                             glusterd_peerinfo_t *peerinfo,
+                             struct syncargs *args, uuid_t my_uuid,
+                             uuid_t recv_uuid)
+{
+    xlator_t *this = THIS;
+    uuid_t *peerid = NULL;
+    int32_t ret = -1;
+    gd1_mgmt_v3_post_val_req req = {
+        {0},
+    };
+
+    GF_ASSERT(this);
+    GF_ASSERT(op_ctx);
+    GF_ASSERT(peerinfo);
+    GF_ASSERT(args);
+
+    /* Step 1: flatten the operation context into the request. */
+    ret = dict_allocate_and_serialize(op_ctx, &req.dict.dict_val,
+                                      &req.dict.dict_len);
+    if (ret != 0) {
+        gf_smsg(this->name, GF_LOG_ERROR, errno,
+                GD_MSG_DICT_ALLOC_AND_SERL_LENGTH_GET_FAIL, NULL);
+        goto out;
+    }
+
+    /* Step 2: fill in the fixed request fields. */
+    req.op = op;
+    req.op_ret = op_ret;
+    gf_uuid_copy(req.uuid, my_uuid);
+
+    /* Step 3: heap copy of the peer uuid that travels with the request. */
+    GD_ALLOC_COPY_UUID(peerid, peerinfo->uuid, ret);
+    if (ret != 0) {
+        gf_smsg(this->name, GF_LOG_ERROR, errno,
+                GD_MSG_ALLOC_AND_COPY_UUID_FAIL, NULL);
+        goto out;
+    }
+
+    /* Step 4: hand everything to the rpc layer. */
+    ret = gd_syncop_submit_request(
+        peerinfo->rpc, &req, args, peerid, &gd_mgmt_v3_prog,
+        GLUSTERD_MGMT_V3_POST_VALIDATE, gd_mgmt_v3_post_validate_cbk,
+        (xdrproc_t)xdr_gd1_mgmt_v3_post_val_req);
+
+out:
+    GF_FREE(req.dict.dict_val);
+    gf_msg_trace(this->name, 0, "Returning %d", ret);
+    return ret;
+}
+
+/* Run post-validation for 'op' on this node and then on every eligible peer,
+ * waiting for the peer replies. Returns 0 on success (also when no peer
+ * qualified), else non-zero; *op_errstr may get an allocated message. */
+int
+glusterd_mgmt_v3_post_validate(glusterd_op_t op, int32_t op_ret, dict_t *dict,
+                               dict_t *req_dict, char **op_errstr,
+                               uint32_t txn_generation)
+{
+    int32_t ret = -1;
+    int32_t peer_cnt = 0;
+    dict_t *rsp_dict = NULL;
+    glusterd_peerinfo_t *peerinfo = NULL;
+    struct syncargs args = {0};
+    uuid_t peer_uuid = {0};
+    xlator_t *this = NULL;
+    glusterd_conf_t *conf = NULL;
+
+    this = THIS;
+    GF_ASSERT(this);
+    conf = this->private;
+    GF_ASSERT(conf);
+
+    GF_ASSERT(dict);
+    GF_VALIDATE_OR_GOTO(this->name, req_dict, out);
+    GF_ASSERT(op_errstr);
+
+    rsp_dict = dict_new();
+    if (!rsp_dict) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_DICT_CREATE_FAIL,
+               "Failed to create response dictionary");
+        goto out;
+    }
+
+    /* Post Validation on local node */
+    ret = gd_mgmt_v3_post_validate_fn(op, op_ret, req_dict, op_errstr,
+                                      rsp_dict);
+    if (ret) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_POST_VALIDATION_FAIL,
+               "Post Validation failed for operation %s on local node",
+               gd_op_list[op]);
+
+        if (*op_errstr == NULL) {
+            ret = gf_asprintf(op_errstr,
+                              "Post-validation failed on localhost. "
+                              "Please check log file for details");
+            if (ret == -1)
+                *op_errstr = NULL;
+
+            ret = -1;
+        }
+        goto out;
+    }
+
+    dict_unref(rsp_dict);
+    rsp_dict = NULL;
+
+    /* Sending Post Validation req to other nodes in the cluster */
+    gd_syncargs_init(&args, req_dict);
+    ret = synctask_barrier_init((&args));
+    if (ret)
+        goto out;
+
+    RCU_READ_LOCK;
+    cds_list_for_each_entry_rcu(peerinfo, &conf->peers, uuid_list)
+    {
+        /* Only send requests to peers who were available before the
+         * transaction started
+         */
+        if (peerinfo->generation > txn_generation)
+            continue;
+
+        if (!peerinfo->connected)
+            continue;
+        if (op != GD_OP_SYNC_VOLUME &&
+            peerinfo->state.state != GD_FRIEND_STATE_BEFRIENDED)
+            continue;
+
+        gd_mgmt_v3_post_validate_req(op, op_ret, req_dict, peerinfo, &args,
+                                     MY_UUID, peer_uuid);
+        peer_cnt++;
+    }
+    RCU_READ_UNLOCK;
+
+    if (0 == peer_cnt) {
+        ret = 0;
+        goto out;
+    }
+
+    gd_synctask_barrier_wait((&args), peer_cnt);
+
+    if (args.op_ret) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_POST_VALIDATION_FAIL,
+               "Post Validation failed on peers");
+
+        if (args.errstr)
+            *op_errstr = gf_strdup(args.errstr);
+    }
+
+    ret = args.op_ret;
+
+    gf_msg_debug(this->name, 0,
+                 "Sent post validation req for %s to %d peers. Returning %d",
+                 gd_op_list[op], peer_cnt, ret);
+out:
+    /* Still set only when local post-validation failed; drop that ref. */
+    if (rsp_dict)
+        dict_unref(rsp_dict);
+    return ret;
+}
+/* Handle one peer's mgmt_v3 unlock reply: collate result, wake the waiter. */
+int32_t
+gd_mgmt_v3_unlock_cbk_fn(struct rpc_req *req, struct iovec *iov, int count,
+                         void *myframe)
+{
+    int32_t ret = -1;
+    struct syncargs *args = NULL;
+    gd1_mgmt_v3_unlock_rsp rsp = {
+        {0},
+    };
+    call_frame_t *frame = NULL;
+    int32_t op_ret = -1; /* stays -1 unless the reply is decoded */
+    int32_t op_errno = -1;
+    xlator_t *this = NULL;
+    uuid_t *peerid = NULL;
+
+    this = THIS;
+    GF_ASSERT(this);
+    GF_ASSERT(req);
+    GF_ASSERT(myframe);
+
+    frame = myframe;
+    args = frame->local;
+    peerid = frame->cookie; /* freed below with GF_FREE() */
+    frame->local = NULL; /* detach both pointers from the frame */
+    frame->cookie = NULL;
+
+    if (-1 == req->rpc_status) {
+        op_errno = ENOTCONN; /* rpc-level failure: nothing to decode */
+        goto out;
+    }
+
+    GF_VALIDATE_OR_GOTO_WITH_ERROR(this->name, iov, out, op_errno, EINVAL);
+
+    ret = xdr_to_generic(*iov, &rsp, (xdrproc_t)xdr_gd1_mgmt_v3_unlock_rsp);
+    if (ret < 0)
+        goto out; /* op_ret and op_errno both remain -1 */
+
+    gf_uuid_copy(args->uuid, rsp.uuid);
+
+    op_ret = rsp.op_ret;
+    op_errno = rsp.op_errno;
+
+out:
+    gd_mgmt_v3_collate_errors(args, op_ret, op_errno, NULL,
+                              GLUSTERD_MGMT_V3_UNLOCK, *peerid, rsp.uuid);
+    if (rsp.dict.dict_val)
+        free(rsp.dict.dict_val); /* libc free(): presumably XDR-allocated */
+    GF_FREE(peerid);
+    /* req->rpc_status set to -1 means, STACK_DESTROY will be called from
+     * the caller function.
+     */
+    if (req->rpc_status != -1)
+        STACK_DESTROY(frame->root);
+    synctask_barrier_wake(args);
+    return 0; /* the outcome is reported through args, not the return value */
+}
+/* RPC callback: forwards to the _cbk_fn via glusterd_big_locked_cbk(). */
+int32_t
+gd_mgmt_v3_unlock_cbk(struct rpc_req *req, struct iovec *iov, int count,
+                      void *myframe)
+{
+    return glusterd_big_locked_cbk(req, iov, count, myframe,
+                                   gd_mgmt_v3_unlock_cbk_fn);
+}
+
+/* Queue a mgmt_v3 unlock request for one peer: op_ctx is serialized into the
+ * request and a copy of the peer's uuid travels along for the callback.
+ * Returns the status of the first failing step, else that of the submit. */
+int
+gd_mgmt_v3_unlock(glusterd_op_t op, dict_t *op_ctx,
+                  glusterd_peerinfo_t *peerinfo, struct syncargs *args,
+                  uuid_t my_uuid, uuid_t recv_uuid)
+{
+    xlator_t *this = THIS;
+    gd1_mgmt_v3_unlock_req unlock_req = {{0}};
+    uuid_t *peer_key = NULL;
+    int32_t rc = -1;
+
+    GF_ASSERT(this);
+    GF_ASSERT(op_ctx);
+    GF_ASSERT(peerinfo);
+    GF_ASSERT(args);
+
+    rc = dict_allocate_and_serialize(op_ctx, &unlock_req.dict.dict_val,
+                                     &unlock_req.dict.dict_len);
+    if (rc != 0) {
+        gf_smsg(this->name, GF_LOG_ERROR, errno,
+                GD_MSG_DICT_ALLOC_AND_SERL_LENGTH_GET_FAIL, NULL);
+        goto out;
+    }
+
+    unlock_req.op = op;
+    gf_uuid_copy(unlock_req.uuid, my_uuid);
+
+    GD_ALLOC_COPY_UUID(peer_key, peerinfo->uuid, rc);
+    if (rc != 0) {
+        gf_smsg(this->name, GF_LOG_ERROR, errno,
+                GD_MSG_ALLOC_AND_COPY_UUID_FAIL, NULL);
+        goto out;
+    }
+
+    rc = gd_syncop_submit_request(peerinfo->rpc, &unlock_req, args, peer_key,
+                                  &gd_mgmt_v3_prog, GLUSTERD_MGMT_V3_UNLOCK,
+                                  gd_mgmt_v3_unlock_cbk,
+                                  (xdrproc_t)xdr_gd1_mgmt_v3_unlock_req);
+out:
+    GF_FREE(unlock_req.dict.dict_val);
+    gf_msg_trace(this->name, 0, "Returning %d", rc);
+    return rc;
+}
+/* Send mgmt_v3 unlock to every eligible peer and wait for their replies. */
+int
+glusterd_mgmt_v3_release_peer_locks(glusterd_op_t op, dict_t *dict,
+                                    int32_t op_ret, char **op_errstr,
+                                    gf_boolean_t is_acquired,
+                                    uint32_t txn_generation)
+{
+    int32_t ret = -1;
+    int32_t peer_cnt = 0; /* number of peers an unlock was attempted on */
+    uuid_t peer_uuid = {0};
+    xlator_t *this = NULL;
+    glusterd_peerinfo_t *peerinfo = NULL;
+    struct syncargs args = {0};
+    glusterd_conf_t *conf = NULL;
+
+    this = THIS;
+    GF_ASSERT(this);
+    conf = this->private;
+    GF_ASSERT(conf);
+
+    GF_ASSERT(dict);
+    GF_ASSERT(op_errstr);
+
+    /* If the lock has not been held during this
+     * transaction, do not send unlock requests */
+    if (!is_acquired)
+        goto out; /* NOTE(review): ret is still the initial -1 on this path */
+
+    /* Sending mgmt_v3 unlock req to other nodes in the cluster */
+    gd_syncargs_init(&args, NULL);
+    ret = synctask_barrier_init((&args));
+    if (ret)
+        goto out;
+    peer_cnt = 0;
+    RCU_READ_LOCK;
+    cds_list_for_each_entry_rcu(peerinfo, &conf->peers, uuid_list)
+    {
+        /* Only send requests to peers who were available before the
+         * transaction started
+         */
+        if (peerinfo->generation > txn_generation)
+            continue;
+
+        if (!peerinfo->connected)
+            continue;
+        if (op != GD_OP_SYNC_VOLUME &&
+            peerinfo->state.state != GD_FRIEND_STATE_BEFRIENDED)
+            continue;
+
+        gd_mgmt_v3_unlock(op, dict, peerinfo, &args, MY_UUID, peer_uuid);
+        peer_cnt++; /* counted whether or not the submit call succeeded */
+    }
+    RCU_READ_UNLOCK;
+
+    if (0 == peer_cnt) {
+        ret = 0; /* no eligible peer: nothing to wait for */
+        goto out;
+    }
+
+    gd_synctask_barrier_wait((&args), peer_cnt);
+
+    if (args.op_ret) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_MGMTV3_UNLOCK_FAIL,
+               "Unlock failed on peers");
+
+        /* Only report the unlock error when the caller's op_ret is 0. */
+        if (!op_ret && args.errstr)
+            *op_errstr = gf_strdup(args.errstr);
+    }
+
+    ret = args.op_ret;
+
+    gf_msg_debug(this->name, 0,
+                 "Sent unlock op req for %s "
+                 "to %d peers. Returning %d",
+                 gd_op_list[op], peer_cnt, ret);
+
+out:
+    return ret;
+}
+
+/* Run a full mgmt_v3 transaction for 'op', including the brick-op phase. */
+int32_t
+glusterd_mgmt_v3_initiate_all_phases_with_brickop_phase(rpcsvc_request_t *req,
+                                                        glusterd_op_t op,
+                                                        dict_t *dict)
+{
+    int32_t ret = -1;
+    int32_t op_ret = -1;
+    dict_t *req_dict = NULL;
+    dict_t *tmp_dict = NULL;
+    glusterd_conf_t *conf = NULL;
+    char *op_errstr = NULL;
+    xlator_t *this = NULL;
+    gf_boolean_t is_acquired = _gf_false;
+    uuid_t *originator_uuid = NULL;
+    uint32_t txn_generation = 0;
+    uint32_t op_errno = 0;
+
+    this = THIS;
+    GF_ASSERT(this);
+    GF_ASSERT(req);
+    GF_ASSERT(dict);
+    conf = this->private;
+    GF_ASSERT(conf);
+
+    /* Save the peer list generation */
+    txn_generation = conf->generation;
+    cmm_smp_rmb();
+    /* This read memory barrier makes sure that this assignment happens here
+     * only and is not reordered and optimized by either the compiler or the
+     * processor. */
+
+    /* Save the MY_UUID as the originator_uuid. This originator_uuid
+     * will be used by is_origin_glusterd() to determine if a node
+     * is the originator node for a command. */
+    originator_uuid = GF_MALLOC(sizeof(uuid_t), gf_common_mt_uuid_t);
+    if (!originator_uuid) {
+        ret = -1;
+        goto out;
+    }
+
+    gf_uuid_copy(*originator_uuid, MY_UUID);
+    ret = dict_set_bin(dict, "originator_uuid", originator_uuid,
+                       sizeof(uuid_t));
+    if (ret) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_DICT_SET_FAILED,
+               "Failed to set originator_uuid.");
+        GF_FREE(originator_uuid);
+        goto out;
+    }
+
+    /* Marking the operation as complete synctasked */
+    ret = dict_set_int32(dict, "is_synctasked", _gf_true);
+    if (ret) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_DICT_SET_FAILED,
+               "Failed to set synctasked flag.");
+        goto out;
+    }
+
+    /* Use a copy at local unlock as cli response will be sent before
+     * the unlock and the volname in the dict might be removed */
+    tmp_dict = dict_new();
+    if (!tmp_dict) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_DICT_CREATE_FAIL,
+               "Unable to create dict");
+        goto out;
+    }
+    dict_copy(dict, tmp_dict);
+
+    /* LOCKDOWN PHASE - Acquire mgmt_v3 locks */
+    ret = glusterd_mgmt_v3_initiate_lockdown(op, dict, &op_errstr, &op_errno,
+                                             &is_acquired, txn_generation);
+    if (ret) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_MGMTV3_LOCKDOWN_FAIL,
+               "mgmt_v3 lockdown failed.");
+        goto out;
+    }
+
+    /* BUILD PAYLOAD */
+    ret = glusterd_mgmt_v3_build_payload(&req_dict, &op_errstr, dict, op);
+    if (ret) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_MGMTV3_PAYLOAD_BUILD_FAIL,
+               LOGSTR_BUILD_PAYLOAD, gd_op_list[op]);
+        if (op_errstr == NULL)
+            gf_asprintf(&op_errstr, OPERRSTR_BUILD_PAYLOAD);
+        goto out;
+    }
+
+    /* PRE-COMMIT VALIDATE PHASE */
+    ret = glusterd_mgmt_v3_pre_validate(op, req_dict, &op_errstr, &op_errno,
+                                        txn_generation);
+    if (ret) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_PRE_VALIDATION_FAIL,
+               "Pre Validation Failed");
+        goto out;
+    }
+
+    /* BRICK-OPS */
+    ret = glusterd_mgmt_v3_brick_op(op, dict, req_dict, &op_errstr,
+                                    txn_generation);
+    if (ret) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_BRICK_OP_FAIL,
+               "Brick Op Failed");
+        goto out;
+    }
+
+    /* COMMIT OP PHASE */
+    ret = glusterd_mgmt_v3_commit(op, dict, req_dict, &op_errstr, &op_errno,
+                                  txn_generation);
+    if (ret) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_COMMIT_OP_FAIL,
+               "Commit Op Failed");
+        goto out;
+    }
+
+    /* POST-COMMIT VALIDATE PHASE */
+    /* As of now, post_validate is not trying to cleanup any failed
+       commands. So as of now, I am sending 0 (op_ret as 0). */
+    ret = glusterd_mgmt_v3_post_validate(op, 0, dict, req_dict, &op_errstr,
+                                         txn_generation);
+    if (ret) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_POST_VALIDATION_FAIL,
+               "Post Validation Failed");
+        goto out;
+    }
+
+    ret = 0;
+out:
+    op_ret = ret; /* status of the first failing phase, or 0 */
+    /* UNLOCK PHASE FOR PEERS*/
+    (void)glusterd_mgmt_v3_release_peer_locks(op, dict, op_ret, &op_errstr,
+                                              is_acquired, txn_generation);
+
+    /* LOCAL VOLUME(S) UNLOCK */
+    if (is_acquired) {
+        /* Trying to release multiple mgmt_v3 locks */
+        ret = glusterd_multiple_mgmt_v3_unlock(tmp_dict, MY_UUID);
+        if (ret) {
+            gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_MGMTV3_UNLOCK_FAIL,
+                   "Failed to release mgmt_v3 locks on localhost");
+            op_ret = ret; /* a failed local unlock fails the operation */
+        }
+    }
+
+    if (op_ret && (op_errno == 0))
+        op_errno = EG_INTRNL; /* failure without a specific errno */
+
+    if (op != GD_OP_MAX_OPVERSION) {
+        /* SEND CLI RESPONSE */
+        glusterd_op_send_cli_response(op, op_ret, op_errno, req, dict,
+                                      op_errstr);
+    }
+
+    if (req_dict)
+        dict_unref(req_dict);
+
+    if (tmp_dict)
+        dict_unref(tmp_dict);
+
+    if (op_errstr) {
+        GF_FREE(op_errstr);
+        op_errstr = NULL;
+    }
+
+    return 0; /* always 0; op_ret only travels in the CLI response */
+}
+/* Drive 'op' through all mgmt_v3 phases: lock, validate, commit, unlock. */
+int32_t
+glusterd_mgmt_v3_initiate_all_phases(rpcsvc_request_t *req, glusterd_op_t op,
+                                     dict_t *dict)
+{
+    int32_t ret = -1;
+    int32_t op_ret = -1;
+    dict_t *req_dict = NULL;
+    dict_t *tmp_dict = NULL;
+    glusterd_conf_t *conf = NULL;
+    char *op_errstr = NULL;
+    xlator_t *this = NULL;
+    gf_boolean_t is_acquired = _gf_false;
+    uuid_t *originator_uuid = NULL;
+    uint32_t txn_generation = 0;
+    uint32_t op_errno = 0;
+
+    this = THIS;
+    GF_ASSERT(this);
+    GF_ASSERT(req);
+    GF_ASSERT(dict);
+    conf = this->private;
+    GF_ASSERT(conf);
+
+    /* Save the peer list generation */
+    txn_generation = conf->generation;
+    cmm_smp_rmb();
+    /* This read memory barrier makes sure that this assignment happens here
+     * only and is not reordered and optimized by either the compiler or the
+     * processor.
+     */
+
+    /* Save the MY_UUID as the originator_uuid. This originator_uuid
+     * will be used by is_origin_glusterd() to determine if a node
+     * is the originator node for a command. */
+    originator_uuid = GF_MALLOC(sizeof(uuid_t), gf_common_mt_uuid_t);
+    if (!originator_uuid) {
+        ret = -1;
+        goto out;
+    }
+
+    gf_uuid_copy(*originator_uuid, MY_UUID);
+    ret = dict_set_bin(dict, "originator_uuid", originator_uuid,
+                       sizeof(uuid_t));
+    if (ret) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_DICT_SET_FAILED,
+               "Failed to set originator_uuid.");
+        GF_FREE(originator_uuid);
+        goto out;
+    }
+
+    /* Marking the operation as complete synctasked */
+    ret = dict_set_int32(dict, "is_synctasked", _gf_true);
+    if (ret) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_DICT_SET_FAILED,
+               "Failed to set synctasked flag.");
+        goto out;
+    }
+
+    /* Use a copy at local unlock as cli response will be sent before
+     * the unlock and the volname in the dict might be removed */
+    tmp_dict = dict_new();
+    if (!tmp_dict) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_DICT_CREATE_FAIL,
+               "Unable to create dict");
+        goto out;
+    }
+    dict_copy(dict, tmp_dict);
+
+    /* LOCKDOWN PHASE - Acquire mgmt_v3 locks */
+    ret = glusterd_mgmt_v3_initiate_lockdown(op, dict, &op_errstr, &op_errno,
+                                             &is_acquired, txn_generation);
+    if (ret) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_MGMTV3_LOCKDOWN_FAIL,
+               "mgmt_v3 lockdown failed.");
+        goto out;
+    }
+
+    /* BUILD PAYLOAD */
+    ret = glusterd_mgmt_v3_build_payload(&req_dict, &op_errstr, dict, op);
+    if (ret) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_MGMTV3_PAYLOAD_BUILD_FAIL,
+               LOGSTR_BUILD_PAYLOAD, gd_op_list[op]);
+        if (op_errstr == NULL)
+            gf_asprintf(&op_errstr, OPERRSTR_BUILD_PAYLOAD);
+        goto out;
+    }
+
+    /* PRE-COMMIT VALIDATE PHASE */
+    ret = glusterd_mgmt_v3_pre_validate(op, req_dict, &op_errstr, &op_errno,
+                                        txn_generation);
+    if (ret) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_PRE_VALIDATION_FAIL,
+               "Pre Validation Failed");
+        goto out;
+    }
+
+    /* COMMIT OP PHASE */
+    ret = glusterd_mgmt_v3_commit(op, dict, req_dict, &op_errstr, &op_errno,
+                                  txn_generation);
+    if (ret) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_COMMIT_OP_FAIL,
+               "Commit Op Failed");
+        goto out;
+    }
+
+    /* POST COMMIT OP PHASE */
+    ret = glusterd_mgmt_v3_post_commit(op, dict, req_dict, &op_errstr,
+                                       &op_errno, txn_generation);
+    if (ret) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_POST_COMMIT_OP_FAIL,
+               "Post commit Op Failed");
+        goto out;
+    }
+
+    /* POST-COMMIT VALIDATE PHASE */
+    /* As of now, post_validate is not trying to cleanup any failed
+       commands. So as of now, I am sending 0 (op_ret as 0).
+    */
+    ret = glusterd_mgmt_v3_post_validate(op, 0, dict, req_dict, &op_errstr,
+                                         txn_generation);
+    if (ret) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_POST_VALIDATION_FAIL,
+               "Post Validation Failed");
+        goto out;
+    }
+
+    ret = 0;
+out:
+    op_ret = ret; /* status of the first failing phase, or 0 */
+    /* UNLOCK PHASE FOR PEERS*/
+    (void)glusterd_mgmt_v3_release_peer_locks(op, dict, op_ret, &op_errstr,
+                                              is_acquired, txn_generation);
+
+    /* LOCAL VOLUME(S) UNLOCK */
+    if (is_acquired) {
+        /* Trying to release multiple mgmt_v3 locks */
+        ret = glusterd_multiple_mgmt_v3_unlock(tmp_dict, MY_UUID);
+        if (ret) {
+            gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_MGMTV3_UNLOCK_FAIL,
+                   "Failed to release mgmt_v3 locks on localhost");
+            op_ret = ret; /* a failed local unlock fails the operation */
+        }
+    }
+
+    if (op_ret && (op_errno == 0))
+        op_errno = EG_INTRNL; /* failure without a specific errno */
+
+    if (op != GD_OP_MAX_OPVERSION) {
+        /* SEND CLI RESPONSE */
+        glusterd_op_send_cli_response(op, op_ret, op_errno, req, dict,
+                                      op_errstr);
+    }
+
+    if (req_dict)
+        dict_unref(req_dict);
+
+    if (tmp_dict)
+        dict_unref(tmp_dict);
+
+    if (op_errstr) {
+        GF_FREE(op_errstr);
+        op_errstr = NULL;
+    }
+
+    return 0; /* always 0; op_ret only travels in the CLI response */
+}
+
+/* Apply barrier setting 'option' to the volume named by "volname1" in dict:
+ * store it in the request dict and in the volume's option dict, regenerate
+ * the volfiles and persist the volinfo.
+ * Returns the first failing step's status, else that of the final store. */
+int32_t
+glusterd_set_barrier_value(dict_t *dict, char *option)
+{
+    xlator_t *this = THIS;
+    glusterd_volinfo_t *volinfo = NULL;
+    char *name = NULL;
+    int32_t rc = -1;
+
+    GF_ASSERT(this);
+    GF_ASSERT(dict);
+    GF_ASSERT(option);
+
+    /* TODO : Change this when we support multiple volume.
+     * As of now only snapshot of single volume is supported,
+     * Hence volname1 is directly fetched
+     */
+    rc = dict_get_strn(dict, "volname1", SLEN("volname1"), &name);
+    if (rc != 0) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_DICT_GET_FAILED,
+               "Volname not present in dict");
+        goto out;
+    }
+
+    rc = glusterd_volinfo_find(name, &volinfo);
+    if (rc != 0) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_VOL_NOT_FOUND,
+               "Volume %s not found ", name);
+        goto out;
+    }
+
+    /* The same value goes into the request dict and the volume options. */
+    rc = dict_set_dynstr_with_alloc(dict, "barrier", option);
+    if (rc != 0) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_DICT_SET_FAILED,
+               "Failed to set barrier op in request dictionary");
+        goto out;
+    }
+
+    rc = dict_set_dynstr_with_alloc(volinfo->dict, "features.barrier", option);
+    if (rc != 0) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_DICT_SET_FAILED,
+               "Failed to set barrier op in volume option dict");
+        goto out;
+    }
+
+    gd_update_volume_op_versions(volinfo);
+
+    rc = glusterd_create_volfiles(volinfo);
+    if (rc != 0) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_VOLFILE_CREATE_FAIL,
+               "Failed to create volfiles");
+        goto out;
+    }
+
+    rc = glusterd_store_volinfo(volinfo, GLUSTERD_VOLINFO_VER_AC_INCREMENT);
+
+out:
+    gf_msg_debug(this->name, 0, "Returning %d", rc);
+    return rc;
+}
+/* Drive a snapshot 'op' through the mgmt_v3 phases with barrier brick-ops. */
+int32_t
+glusterd_mgmt_v3_initiate_snap_phases(rpcsvc_request_t *req, glusterd_op_t op,
+                                      dict_t *dict)
+{
+    int32_t ret = -1;
+    int32_t op_ret = -1;
+    dict_t *req_dict = NULL;
+    dict_t *tmp_dict = NULL;
+    glusterd_conf_t *conf = NULL;
+    char *op_errstr = NULL;
+    xlator_t *this = NULL;
+    gf_boolean_t is_acquired = _gf_false;
+    uuid_t *originator_uuid = NULL;
+    gf_boolean_t success = _gf_false; /* set once the commit phase succeeds */
+    char *cli_errstr = NULL; /* commit-phase error kept for the CLI reply */
+    uint32_t txn_generation = 0;
+    uint32_t op_errno = 0;
+
+    this = THIS;
+    GF_ASSERT(this);
+    GF_ASSERT(req);
+    GF_ASSERT(dict);
+    conf = this->private;
+    GF_ASSERT(conf);
+
+    /* Save the peer list generation */
+    txn_generation = conf->generation;
+    cmm_smp_rmb();
+    /* This read memory barrier makes sure that this assignment happens here
+     * only and is not reordered and optimized by either the compiler or the
+     * processor.
+     */
+
+    /* Save the MY_UUID as the originator_uuid. This originator_uuid
+     * will be used by is_origin_glusterd() to determine if a node
+     * is the originator node for a command. */
+    originator_uuid = GF_MALLOC(sizeof(uuid_t), gf_common_mt_uuid_t);
+    if (!originator_uuid) {
+        ret = -1;
+        goto out;
+    }
+
+    gf_uuid_copy(*originator_uuid, MY_UUID);
+    ret = dict_set_bin(dict, "originator_uuid", originator_uuid,
+                       sizeof(uuid_t));
+    if (ret) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_DICT_SET_FAILED,
+               "Failed to set originator_uuid.");
+        GF_FREE(originator_uuid);
+        goto out;
+    }
+
+    /* Marking the operation as complete synctasked */
+    ret = dict_set_int32n(dict, "is_synctasked", SLEN("is_synctasked"),
+                          _gf_true);
+    if (ret) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_DICT_SET_FAILED,
+               "Failed to set synctasked flag.");
+        goto out;
+    }
+
+    /* Use a copy at local unlock as cli response will be sent before
+     * the unlock and the volname in the dict might be removed */
+    tmp_dict = dict_new();
+    if (!tmp_dict) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_DICT_CREATE_FAIL,
+               "Unable to create dict");
+        goto out;
+    }
+    dict_copy(dict, tmp_dict);
+
+    /* LOCKDOWN PHASE - Acquire mgmt_v3 locks */
+    ret = glusterd_mgmt_v3_initiate_lockdown(op, dict, &op_errstr, &op_errno,
+                                             &is_acquired, txn_generation);
+    if (ret) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_MGMTV3_LOCKDOWN_FAIL,
+               "mgmt_v3 lockdown failed.");
+        goto out;
+    }
+
+    /* BUILD PAYLOAD */
+    ret = glusterd_mgmt_v3_build_payload(&req_dict, &op_errstr, dict, op);
+    if (ret) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_MGMTV3_PAYLOAD_BUILD_FAIL,
+               LOGSTR_BUILD_PAYLOAD, gd_op_list[op]);
+        if (op_errstr == NULL)
+            gf_asprintf(&op_errstr, OPERRSTR_BUILD_PAYLOAD);
+        goto out;
+    }
+
+    /* PRE-COMMIT VALIDATE PHASE */
+    ret = glusterd_mgmt_v3_pre_validate(op, req_dict, &op_errstr, &op_errno,
+                                        txn_generation);
+    if (ret) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_PRE_VALIDATION_FAIL,
+               "Pre Validation Failed");
+        goto out;
+    }
+
+    /* quorum check of the volume is done here */
+    ret = glusterd_snap_quorum_check(req_dict, _gf_false, &op_errstr,
+                                     &op_errno);
+    if (ret) {
+        gf_msg(this->name, GF_LOG_WARNING, 0, GD_MSG_QUORUM_CHECK_FAIL,
+               "Volume quorum check failed");
+        goto out;
+    }
+
+    /* Set the operation type as pre, so that differentiation can be
+     * made whether the brickop is sent during pre-commit or post-commit
+     */
+    ret = dict_set_dynstr_with_alloc(req_dict, "operation-type", "pre");
+    if (ret) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_DICT_SET_FAILED,
+               "Failed to set "
+               "operation-type in dictionary");
+        goto out;
+    }
+
+    ret = glusterd_mgmt_v3_brick_op(op, dict, req_dict, &op_errstr,
+                                    txn_generation);
+    if (ret) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_BRICK_OP_FAIL,
+               "Brick Ops Failed");
+        goto unbarrier; /* still send the "post" brick-op below */
+    }
+
+    /* COMMIT OP PHASE */
+    /* TODO: As of now, the plan is to do quorum check before sending the
+       commit fop and if the quorum succeeds, then commit is sent to all
+       the other glusterds.
+       snap create functionality now creates the in memory and on disk
+       objects for the snapshot (marking them as incomplete), takes the lvm
+       snapshot and then updates the status of the in memory and on disk
+       snap objects as complete. Suppose one of the glusterds goes down
+       after taking the lvm snapshot, but before updating the snap object,
+       then treat it as a snapshot create failure and trigger cleanup.
+       i.e the number of commit responses received by the originator
+       glusterd should be the same as the number of peers it has sent the
+       request to (i.e npeers variable). If not, then originator glusterd
+       will initiate cleanup in post-validate fop.
+       Question: What if one of the other glusterds goes down as explained
+       above and along with it the originator glusterd also goes down?
+       Who will initiate the cleanup?
+    */
+    ret = dict_set_int32n(req_dict, "cleanup", SLEN("cleanup"), 1);
+    if (ret) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_DICT_SET_FAILED,
+               "failed to set dict");
+        goto unbarrier;
+    }
+
+    ret = glusterd_mgmt_v3_commit(op, dict, req_dict, &op_errstr, &op_errno,
+                                  txn_generation);
+    if (ret) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_COMMIT_OP_FAIL,
+               "Commit Op Failed");
+        /* If the main op fails, we should save the error string.
+           Because, op_errstr will be used for unbarrier and
+           unlock ops also. We might lose the actual error that
+           caused the failure.
+        */
+        cli_errstr = op_errstr;
+        op_errstr = NULL;
+        goto unbarrier;
+    }
+
+    success = _gf_true;
+unbarrier:
+    /* Set the operation type as post, so that differentiation can be
+     * made whether the brickop is sent during pre-commit or post-commit
+     */
+    ret = dict_set_dynstr_with_alloc(req_dict, "operation-type", "post");
+    if (ret) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_DICT_SET_FAILED,
+               "Failed to set "
+               "operation-type in dictionary");
+        goto out;
+    }
+
+    ret = glusterd_mgmt_v3_brick_op(op, dict, req_dict, &op_errstr,
+                                    txn_generation);
+
+    if (ret) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_BRICK_OP_FAIL,
+               "Brick Ops Failed");
+        goto out;
+    }
+
+    /* Do a quorum check if the commit phase is successful */
+    if (success) {
+        /* quorum check of the snapshot volume */
+        ret = glusterd_snap_quorum_check(dict, _gf_true, &op_errstr, &op_errno);
+        if (ret) {
+            gf_msg(this->name, GF_LOG_WARNING, 0, GD_MSG_QUORUM_CHECK_FAIL,
+                   "Snapshot Volume quorum check failed");
+            goto out;
+        }
+    }
+
+    ret = 0;
+
+out:
+    op_ret = ret;
+
+    if (success == _gf_false)
+        op_ret = -1; /* anything short of a successful commit is a failure */
+
+    /* POST-COMMIT VALIDATE PHASE (also runs on failure paths, with op_ret) */
+    ret = glusterd_mgmt_v3_post_validate(op, op_ret, dict, req_dict, &op_errstr,
+                                         txn_generation);
+    if (ret) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_POST_VALIDATION_FAIL,
+               "Post Validation Failed");
+        op_ret = -1;
+    }
+
+    /* UNLOCK PHASE FOR PEERS*/
+    (void)glusterd_mgmt_v3_release_peer_locks(op, dict, op_ret, &op_errstr,
+                                              is_acquired, txn_generation);
+
+    /* If the commit op (snapshot taking) failed, then the error is stored
+       in cli_errstr and unbarrier is called. Suppose, if unbarrier also
+       fails, then the error happened in unbarrier is logged and freed.
+       The error happened in commit op, which is stored in cli_errstr
+       is sent to cli.
+    */
+    if (cli_errstr) {
+        GF_FREE(op_errstr);
+        op_errstr = NULL; /* NOTE(review): redundant, overwritten just below */
+        op_errstr = cli_errstr;
+    }
+
+    /* LOCAL VOLUME(S) UNLOCK */
+    if (is_acquired) {
+        /* Trying to release multiple mgmt_v3 locks */
+        ret = glusterd_multiple_mgmt_v3_unlock(tmp_dict, MY_UUID);
+        if (ret) {
+            gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_MGMTV3_UNLOCK_FAIL,
+                   "Failed to release mgmt_v3 locks on localhost");
+            op_ret = ret; /* a failed local unlock fails the operation */
+        }
+    }
+
+    if (op_ret && (op_errno == 0))
+        op_errno = EG_INTRNL; /* failure without a specific errno */
+
+    /* SEND CLI RESPONSE */
+    glusterd_op_send_cli_response(op, op_ret, op_errno, req, dict, op_errstr);
+
+    if (req_dict)
+        dict_unref(req_dict);
+
+    if (tmp_dict)
+        dict_unref(tmp_dict);
+
+    if (op_errstr) {
+        GF_FREE(op_errstr);
+        op_errstr = NULL;
+    }
+
+    return 0; /* always 0; op_ret only travels in the CLI response */
+}
diff --git a/xlators/mgmt/glusterd/src/glusterd-mgmt.h b/xlators/mgmt/glusterd/src/glusterd-mgmt.h
new file mode 100644
index 00000000000..27dd1849519
--- /dev/null
+++ b/xlators/mgmt/glusterd/src/glusterd-mgmt.h
@@ -0,0 +1,97 @@
+/*
+   Copyright (c) 2013-2014 Red Hat, Inc. <http://www.redhat.com>
+   This file is part of GlusterFS.
+
+   This file is licensed to you under your choice of the GNU Lesser
+   General Public License, version 3 or any later version (LGPLv3 or
+   later), or the GNU General Public License, version 2 (GPLv2), in all
+   cases as published by the Free Software Foundation.
+*/
+#ifndef _GLUSTERD_MGMT_H_
+#define _GLUSTERD_MGMT_H_
+
+void
+gd_mgmt_v3_collate_errors(struct syncargs *args, int op_ret, int op_errno,
+                          char *op_errstr, int op_code, uuid_t peerid,
+                          u_char *uuid);
+
+int32_t
+gd_mgmt_v3_pre_validate_fn(glusterd_op_t op, dict_t *dict, char **op_errstr,
+                           dict_t *rsp_dict, uint32_t *op_errno);
+
+int32_t
+gd_mgmt_v3_brick_op_fn(glusterd_op_t op, dict_t *dict, char **op_errstr,
+                       dict_t *rsp_dict);
+
+int32_t
+gd_mgmt_v3_commit_fn(glusterd_op_t op, dict_t *dict, char **op_errstr,
+                     uint32_t *op_errno, dict_t *rsp_dict);
+
+int32_t
+gd_mgmt_v3_post_commit_fn(glusterd_op_t op, dict_t *dict, char **op_errstr,
+                          uint32_t *op_errno, dict_t *rsp_dict);
+
+int32_t
+gd_mgmt_v3_post_validate_fn(glusterd_op_t op, int32_t op_ret, dict_t *dict,
+                            char **op_errstr, dict_t *rsp_dict);
+
+int32_t
+glusterd_mgmt_v3_initiate_all_phases(rpcsvc_request_t *req, glusterd_op_t op,
+                                     dict_t *dict);
+
+int32_t
+glusterd_mgmt_v3_initiate_all_phases_with_brickop_phase(rpcsvc_request_t *req,
+                                                        glusterd_op_t op,
+                                                        dict_t *dict);
+
+int32_t
+glusterd_mgmt_v3_initiate_snap_phases(rpcsvc_request_t *req, glusterd_op_t op,
+                                      dict_t *dict);
+
+int
+glusterd_snap_pre_validate_use_rsp_dict(dict_t *dst, dict_t *src);
+
+int32_t
+glusterd_set_barrier_value(dict_t *dict, char *option);
+
+int
+glusterd_mgmt_v3_initiate_lockdown(glusterd_op_t op, dict_t *dict,
+                                   char **op_errstr, uint32_t *op_errno,
+                                   gf_boolean_t *is_acquired,
+                                   uint32_t txn_generation);
+
+int
+glusterd_mgmt_v3_build_payload(dict_t **req, char **op_errstr, dict_t *dict,
+                               glusterd_op_t op);
+
+int
+glusterd_mgmt_v3_pre_validate(glusterd_op_t op, dict_t *req_dict,
+                              char **op_errstr, uint32_t *op_errno,
+                              uint32_t txn_generation);
+
+int
+glusterd_mgmt_v3_commit(glusterd_op_t op, dict_t *op_ctx, dict_t *req_dict,
+                        char **op_errstr, uint32_t *op_errno,
+                        uint32_t txn_generation);
+
+int
+glusterd_mgmt_v3_release_peer_locks(glusterd_op_t op, dict_t *dict,
+                                    int32_t op_ret, char **op_errstr,
+                                    gf_boolean_t is_acquired,
+                                    uint32_t txn_generation);
+
+int32_t
+glusterd_multiple_mgmt_v3_unlock(dict_t *dict, uuid_t uuid);
+
+int
+glusterd_reset_brick_prevalidate(dict_t *dict, char **op_errstr,
+                                 dict_t *rsp_dict);
+int
+glusterd_op_reset_brick(dict_t *dict, dict_t *rsp_dict);
+
+int
+glusterd_post_commit_add_brick(dict_t *dict, char **op_errstr);
+
+int
+glusterd_post_commit_replace_brick(dict_t *dict, char **op_errstr);
+#endif /* _GLUSTERD_MGMT_H_ */
diff --git a/xlators/mgmt/glusterd/src/glusterd-mountbroker.c b/xlators/mgmt/glusterd/src/glusterd-mountbroker.c
new file mode 100644
index 00000000000..645d845ee76
--- /dev/null
+++ b/xlators/mgmt/glusterd/src/glusterd-mountbroker.c
@@ -0,0 +1,721 @@
+/*
+   Copyright (c) 2011-2012 Red Hat, Inc. <http://www.redhat.com>
+   This file is part of GlusterFS.
+
+   This file is licensed to you under your choice of the GNU Lesser
+   General Public License, version 3 or any later version (LGPLv3 or
+   later), or the GNU General Public License, version 2 (GPLv2), in all
+   cases as published by the Free Software Foundation.
+*/
+#include <inttypes.h>
+#include <fnmatch.h>
+#include <pwd.h>
+
+#include <glusterfs/globals.h>
+#include <glusterfs/glusterfs.h>
+#include <glusterfs/compat.h>
+#include <glusterfs/dict.h>
+#include <glusterfs/list.h>
+#include <glusterfs/logging.h>
+#include <glusterfs/syscall.h>
+#include <glusterfs/defaults.h>
+#include <glusterfs/compat.h>
+#include <glusterfs/compat-errno.h>
+#include <glusterfs/run.h>
+#include "glusterd-mem-types.h"
+#include "glusterd.h"
+#include "glusterd-utils.h"
+#include <glusterfs/common-utils.h>
+#include "glusterd-mountbroker.h"
+#include "glusterd-op-sm.h"
+#include "glusterd-messages.h"
+
+static int
+seq_dict_foreach(dict_t *dict, int (*fn)(char *str, void *data), void *data)
+{
+    char key[] = "4294967296"; /* room for any decimal int index + NUL */
+    char *value = NULL;
+    int rc = 0;
+    int n = 0;
+    /* visit keys "0", "1", ...; -ENOENT ends the walk with 0, fn() may stop it */
+    while (1) {
+        snprintf(key, sizeof(key), "%d", n++);
+        rc = dict_get_str(dict, key, &value);
+        if (rc != 0)
+            return (rc == -ENOENT) ? 0 : rc;
+        rc = fn(value, data);
+        if (rc != 0)
+            return rc;
+    }
+}
+
+int
+parse_mount_pattern_desc(gf_mount_spec_t *mspec, char *pdesc)
+#define SYNTAX_ERR -2 /* internal marker for a malformed description */
+{ /* fills mspec->patterns/len from pdesc (edited in place); 0 or -1 */
+    char *curs = NULL;
+    char *c2 = NULL;
+    char sc = '\0';
+    char **cc = NULL;
+    gf_mount_pattern_t *pat = NULL;
+    int pnum = 0;
+    int ret = 0;
+    int lastsup = -1; /* index of the most recent SUP( pattern, -1 if none */
+    int incl = -1;    /* pattern whose components SUB+( inherits, -1 if none */
+    char **pcc = NULL;
+    int pnc = 0;
+
+    skipwhite(&pdesc);
+
+    /* an empty description is accepted: there is nothing to parse */
+    if (!*pdesc)
+        return 0;
+
+    /* count the patterns: each one is closed by exactly one ')' */
+    mspec->len = 0;
+    for (curs = pdesc; *curs; curs++) {
+        if (*curs == ')')
+            mspec->len++;
+    }
+
+    mspec->patterns = GF_CALLOC(mspec->len, sizeof(*mspec->patterns),
+                                gf_gld_mt_mount_pattern);
+    if (!mspec->patterns) {
+        gf_smsg("glusterd", GF_LOG_ERROR, errno, GD_MSG_NO_MEMORY, NULL);
+        ret = -1;
+        goto out;
+    }
+
+    pat = mspec->patterns;
+    curs = pdesc;
+    skipwhite(&curs);
+    for (;;) {
+        incl = -1;
+
+        /* check for pattern signedness modifier */
+        if (*curs == '-') {
+            pat->negative = _gf_true;
+            curs++;
+        }
+
+        /* now should come condition specifier,
+         * then opening paren
+         */
+        c2 = nwstrtail(curs, "SUB(");
+        if (c2) {
+            pat->condition = SET_SUB;
+            goto got_cond;
+        }
+        c2 = nwstrtail(curs, "SUP(");
+        if (c2) {
+            pat->condition = SET_SUPER;
+            lastsup = pat - mspec->patterns;
+            goto got_cond;
+        }
+        c2 = nwstrtail(curs, "EQL(");
+        if (c2) {
+            pat->condition = SET_EQUAL;
+            goto got_cond;
+        }
+        c2 = nwstrtail(curs, "MEET(");
+        if (c2) {
+            pat->condition = SET_INTERSECT;
+            goto got_cond;
+        }
+        c2 = nwstrtail(curs, "SUB+(");
+        if (c2) {
+            pat->condition = SET_SUB;
+            incl = lastsup;
+            goto got_cond;
+        }
+
+        ret = SYNTAX_ERR;
+        goto out;
+
+    got_cond:
+        curs = c2;
+        skipwhite(&curs);
+        /* count the number of components for pattern */
+        pnum = *curs == ')' ? 0 : 1;
+        for (c2 = curs; *c2 != ')';) {
+            if (strchr("&|", *c2)) {
+                ret = SYNTAX_ERR;
+                goto out;
+            }
+            while (!strchr("|&)", *c2) && !isspace(*c2))
+                c2++;
+            skipwhite(&c2);
+            switch (*c2) {
+                case ')':
+                    break;
+                case '\0':
+                case '&':
+                    ret = SYNTAX_ERR;
+                    goto out;
+                case '|':
+                    *c2 = ' ';
+                    skipwhite(&c2);
+                    /* fall through */
+                default:
+                    pnum++;
+            }
+        }
+        if (incl >= 0) {
+            pnc = 0;
+            for (pcc = mspec->patterns[incl].components; *pcc; pcc++)
+                pnc++;
+            pnum += pnc;
+        }
+        pat->components = GF_CALLOC(pnum + 1, sizeof(*pat->components),
+                                    gf_gld_mt_mount_comp_container);
+        if (!pat->components) {
+            ret = -1;
+            goto out;
+        }
+
+        cc = pat->components;
+        /* copy over included component set */
+        if (incl >= 0) {
+            memcpy(pat->components, mspec->patterns[incl].components,
+                   pnc * sizeof(*pat->components));
+            cc += pnc;
+        }
+        /* parse and add components */
+        /* test curs itself: an empty "()" must not consume the NULL slot */
+        while (*curs != ')') {
+            c2 = curs;
+            while (!isspace(*c2) && *c2 != ')')
+                c2++;
+            sc = *c2;
+            *c2 = '\0';
+            /* the component is NUL-terminated in place just for the copy */
+            *cc = gf_strdup(curs);
+            if (!*cc) {
+                ret = -1;
+                goto out;
+            }
+            *c2 = sc;
+            skipwhite(&c2);
+            curs = c2;
+            cc++;
+        }
+
+        curs++;
+        skipwhite(&curs);
+        if (*curs == '&') {
+            curs++;
+            skipwhite(&curs);
+        }
+
+        if (!*curs)
+            break;
+        pat++;
+    }
+
+out:
+    if (ret == SYNTAX_ERR) {
+        gf_msg("glusterd", GF_LOG_ERROR, EINVAL, GD_MSG_INVALID_ENTRY,
+               "cannot parse mount patterns %s", pdesc);
+    }
+
+    /* We've allocated a lot of stuff here but don't bother with freeing
+     * on error, in that case we'll terminate anyway
+     */
+    return ret ? -1 : 0;
+}
+#undef SYNTAX_ERR
+
+const char *georep_mnt_desc_template = /* format used by make_georep_mountspec */
+    "SUP("
+    "aux-gfid-mount "
+    "acl "
+    "volfile-server=localhost "
+    "client-pid=%d "    /* filled with GF_CLIENT_PID_GSYNCD */
+    "user-map-root=%s " /* filled with the user argument */
+    ")"
+    "SUB+("
+    "log-file=%s/" GEOREP /* filled with the logdir argument */
+    "*/* "
+    "log-level=* "
+    "volfile-id=* "
+    ")"
+    "MEET("
+    "%s" /* filled with the "volfile-id=<vol> ..." list (meetspec) */
+    ")";
+
+int
+make_georep_mountspec(gf_mount_spec_t *mspec, const char *volnames, char *user,
+                      char *logdir)
+{ /* formats georep_mnt_desc_template for volnames and parses it into mspec */
+    char *georep_mnt_desc = NULL;
+    char *meetspec = NULL;
+    char *vols = NULL;
+    char *vol = NULL;
+    char *p = NULL;
+    char *savetok = NULL;
+    char *fa[3] = {
+        0,
+    };
+    size_t siz = 0;
+    int vc = 0;
+    int i = 0;
+    int ret = 0;
+
+    vols = gf_strdup((char *)volnames); /* private copy: strtok_r() edits it */
+    if (!vols) {
+        gf_smsg(THIS->name, GF_LOG_ERROR, errno, GD_MSG_STRDUP_FAILED,
+                "Volume name=%s", volnames, NULL);
+        goto out;
+    }
+
+    for (vc = 1, p = vols; *p; p++) { /* vc = number of comma-separated fields */
+        if (*p == ',')
+            vc++;
+    }
+    siz = strlen(volnames) + vc * SLEN("volfile-id="); /* commas become spaces */
+    meetspec = GF_CALLOC(1, siz + 1, gf_gld_mt_georep_meet_spec);
+    if (!meetspec) {
+        gf_smsg(THIS->name, GF_LOG_ERROR, errno, GD_MSG_NO_MEMORY, NULL);
+        goto out;
+    }
+
+    for (p = vols;;) { /* append "volfile-id=<vol>" per token, space separated */
+        vol = strtok_r(p, ",", &savetok);
+        if (!vol) {
+            GF_ASSERT(vc == 0); /* every counted field should have been used */
+            break;
+        }
+        p = NULL; /* later strtok_r() calls continue from savetok */
+        strcat(meetspec, "volfile-id=");
+        strcat(meetspec, vol);
+        if (--vc > 0)
+            strcat(meetspec, " ");
+    }
+
+    ret = gf_asprintf(&georep_mnt_desc, georep_mnt_desc_template,
+                      GF_CLIENT_PID_GSYNCD, user, logdir, meetspec);
+    if (ret == -1) {
+        georep_mnt_desc = NULL;
+        goto out;
+    }
+
+    ret = parse_mount_pattern_desc(mspec, georep_mnt_desc);
+
+out:
+    fa[0] = meetspec;
+    fa[1] = vols;
+    fa[2] = georep_mnt_desc;
+
+    for (i = 0; i < 3; i++) {
+        if (fa[i] == NULL) /* a missing buffer means an earlier step failed */
+            ret = -1;
+        else
+            GF_FREE(fa[i]);
+    }
+
+    return ret;
+}
+
+static gf_boolean_t
+match_comp(char *str, char *patcomp)
+{
+    char *c1 = patcomp;
+    char *c2 = str;
+
+    GF_ASSERT(c1);
+    GF_ASSERT(c2);
+
+    /* literal compare up to and including the first '='; fnmatch the rest */
+    for (; *c1 == *c2; c1++, c2++) {
+        if (*c1 == '\0')
+            return _gf_true;
+        if (*c1 == '=') {
+            c1++, c2++;
+            break;
+        }
+    }
+    return (fnmatch(c1, c2, 0) != 0) ? _gf_false : _gf_true;
+}
+
+struct gf_set_descriptor { /* result of relate_sets(): args vs. components */
+    gf_boolean_t priv[2]; /* [0]: some arg matched no component; [1]: converse */
+    gf_boolean_t common;  /* some argument matched some component */
+};
+
+static int
+_gf_set_dict_iter1(char *val, void *data)
+{
+    /* data is { set descriptor, NULL-terminated component list } */
+    void **ctx = data;
+    struct gf_set_descriptor *desc = ctx[0];
+    char **comp = NULL;
+    gf_boolean_t matched = _gf_false;
+
+    for (comp = ctx[1]; *comp != NULL; comp++) {
+        if (!match_comp(val, *comp))
+            continue;
+        matched = _gf_true;
+        desc->common = _gf_true;
+    }
+
+    /* an argument that no component matches is private to the arg set */
+    if (!matched)
+        desc->priv[0] = _gf_true;
+    return 0;
+}
+
+static int
+_gf_set_dict_iter2(char *val, void *data)
+{
+    /* data is { gf_boolean_t hit flag, single pattern component } */
+    void **ctx = data;
+    gf_boolean_t *hit = ctx[0];
+    char *pattern = ctx[1];
+
+    if (match_comp(val, pattern) != _gf_false)
+        *hit = _gf_true;
+    return 0;
+}
+
+static void
+relate_sets(struct gf_set_descriptor *sd, dict_t *argdict, char **complist)
+{
+    void *ctx[] = {NULL, NULL};
+    gf_boolean_t hit = _gf_false;
+    char **comp = NULL;
+
+    memset(sd, 0, sizeof(*sd));
+
+    /* pass 1: classify every argument against the whole component list */
+    ctx[0] = sd;
+    ctx[1] = complist;
+    seq_dict_foreach(argdict, _gf_set_dict_iter1, ctx);
+
+    /* pass 2: classify every component against the whole argument list */
+    ctx[0] = &hit;
+    for (comp = complist; *comp != NULL; comp++) {
+        hit = _gf_false;
+        ctx[1] = *comp;
+        seq_dict_foreach(argdict, _gf_set_dict_iter2, ctx);
+        if (!hit)
+            sd->priv[1] = _gf_true;
+        else
+            sd->common = _gf_true;
+    }
+}
+
+static int
+_arg_parse_uid(char *val, void *data)
+{
+    int *uid_out = data;
+    struct passwd *entry = NULL;
+    char *name = strtail(val, "user-map-root=");
+
+    /* arguments other than user-map-root=<name> are skipped */
+    if (name == NULL)
+        return 0;
+
+    entry = getpwnam(name);
+    /* unknown user, or a second user-map-root after one was resolved */
+    if (entry == NULL || *uid_out >= 0)
+        return -EINVAL;
+
+    *uid_out = entry->pw_uid;
+    return 0;
+}
+
+static int
+evaluate_mount_request(xlator_t *this, gf_mount_spec_t *mspec, dict_t *argdict)
+{ /* checks argdict against every pattern; returns the mapped uid or < 0 */
+    struct gf_set_descriptor sd = {
+        {
+            0,
+        },
+    };
+    int i = 0;
+    int uid = -1; /* stays -1 when no user-map-root= argument is present */
+    int ret = 0;
+    gf_boolean_t match = _gf_false;
+
+    for (i = 0; i < mspec->len; i++) {
+        relate_sets(&sd, argdict, mspec->patterns[i].components);
+        switch (mspec->patterns[i].condition) {
+            case SET_SUB:
+                match = !sd.priv[0]; /* every argument matched a component */
+                break;
+            case SET_SUPER:
+                match = !sd.priv[1]; /* every component matched an argument */
+                break;
+            case SET_EQUAL:
+                match = (!sd.priv[0] && !sd.priv[1]); /* both of the above */
+                break;
+            case SET_INTERSECT:
+                match = sd.common; /* at least one argument/component match */
+                break;
+            default:
+                GF_ASSERT(!"unreached");
+        }
+        if (mspec->patterns[i].negative)
+            match = !match;
+
+        if (!match) { /* all patterns must hold: first failure rejects */
+            gf_msg(this->name, GF_LOG_ERROR, EPERM,
+                   GD_MSG_MNTBROKER_SPEC_MISMATCH,
+                   "Mountbroker spec mismatch!!! SET: %d "
+                   "COMPONENT: %d. Review the mount args passed",
+                   mspec->patterns[i].condition, i);
+            return -EPERM;
+        }
+    }
+
+    ret = seq_dict_foreach(argdict, _arg_parse_uid, &uid);
+    if (ret != 0)
+        return ret;
+
+    return uid;
+}
+
+static int
+_volname_get(char *val, void *data)
+{
+    char **out = data;
+    char *tail = strtail(val, "volfile-id=");
+
+    *out = tail; /* NULL when val is not a volfile-id= argument */
+    return (tail != NULL); /* non-zero stops the seq_dict_foreach() walk */
+}
+
+static int
+_runner_add(char *val, void *data)
+{
+    runner_t *cmd = (runner_t *)data;
+
+    /* forward the mount argument as a long option: --<val> */
+    runner_argprintf(cmd, "--%s", val);
+    return 0;
+}
+
+int
+glusterd_do_mount(char *label, dict_t *argdict, char **path, int *op_errno)
+{ /* on success returns 0 and hands the cookie path to the caller via *path */
+    glusterd_conf_t *priv = NULL;
+    char *mountbroker_root = NULL;
+    gf_mount_spec_t *mspec = NULL;
+    int uid = -ENOENT; /* result when no spec carries the requested label */
+    char *volname = NULL;
+    glusterd_volinfo_t *vol = NULL;
+    char *mtptemp = NULL;
+    char *mntlink = NULL;
+    char *cookieswitch = NULL; /* the '/' in front of "cookie" inside mtptemp */
+    char *cookie = NULL;
+    char *sla = NULL; /* the '/' in front of "mtpt-..." inside mtptemp */
+    struct stat st = {
+        0,
+    };
+    runner_t runner = {
+        0,
+    };
+    int ret = 0;
+    xlator_t *this = THIS;
+    mode_t orig_umask = 0;
+    gf_boolean_t found_label = _gf_false;
+
+    priv = this->private;
+    GF_ASSERT(priv);
+
+    GF_ASSERT(op_errno);
+    *op_errno = 0; /* the out: path treats any non-zero value as failure */
+
+    if (dict_get_strn(this->options, "mountbroker-root",
+                      SLEN("mountbroker-root"), &mountbroker_root) != 0) {
+        *op_errno = ENOENT;
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_DICT_GET_FAILED,
+               "'option mountbroker-root' "
+               "missing in glusterd vol file");
+        goto out;
+    }
+
+    GF_ASSERT(label);
+    if (!*label) {
+        *op_errno = EINVAL;
+        gf_msg(this->name, GF_LOG_ERROR, *op_errno, GD_MSG_MNTBROKER_LABEL_NULL,
+               "label is NULL (%s)", strerror(*op_errno));
+        goto out;
+    }
+
+    /* find the spec for this label and evaluate the request against it */
+    cds_list_for_each_entry(mspec, &priv->mount_specs, speclist)
+    {
+        if (strcmp(mspec->label, label) != 0)
+            continue;
+
+        found_label = _gf_true;
+        uid = evaluate_mount_request(this, mspec, argdict);
+        break;
+    }
+    if (uid < 0) { /* negative values are negated errno codes */
+        *op_errno = -uid;
+        if (!found_label) {
+            gf_msg(this->name, GF_LOG_ERROR, *op_errno,
+                   GD_MSG_MNTBROKER_LABEL_MISS,
+                   "Missing mspec: Check the corresponding option "
+                   "in glusterd vol file for mountbroker user: %s",
+                   label);
+        }
+        goto out;
+    }
+
+    /* sanity check: a volfile-id= argument naming a started volume */
+    seq_dict_foreach(argdict, _volname_get, &volname);
+    if (!volname) {
+        *op_errno = EINVAL;
+        gf_msg(this->name, GF_LOG_ERROR, EINVAL, GD_MSG_DICT_GET_FAILED,
+               "Dict get failed for the key 'volname'");
+        goto out;
+    }
+    if (glusterd_volinfo_find(volname, &vol) != 0 ||
+        !glusterd_is_volume_started(vol)) {
+        *op_errno = ENOENT;
+        gf_msg(this->name, GF_LOG_ERROR, *op_errno, GD_MSG_MOUNT_REQ_FAIL,
+               "Either volume is not started or volinfo not found");
+        goto out;
+    }
+
+    /* go do mount */
+
+    /** create actual mount dir */
+
+    /*** the buffer also holds a trailing "/cookie" so that the same string
+         can later name the cookie symlink, see below */
+    ret = gf_asprintf(&mtptemp, "%s/user%d/mtpt-%s-XXXXXX/cookie",
+                      mountbroker_root, uid, label);
+    if (ret == -1) {
+        mtptemp = NULL;
+        *op_errno = ENOMEM;
+        goto out;
+    }
+    /*** hide cookie part */
+    cookieswitch = strrchr(mtptemp, '/');
+    *cookieswitch = '\0';
+
+    sla = strrchr(mtptemp, '/');
+    *sla = '\0'; /* mtptemp now names the per-user directory */
+    ret = sys_mkdir(mtptemp, 0700);
+    if (ret == 0)
+        ret = sys_chown(mtptemp, uid, 0);
+    else if (errno == EEXIST)
+        ret = 0; /* an existing directory is validated by the lstat below */
+    if (ret == -1) {
+        *op_errno = errno;
+        gf_msg(this->name, GF_LOG_ERROR, *op_errno, GD_MSG_SYSCALL_FAIL,
+               "Mountbroker User directory creation failed");
+        goto out;
+    }
+    ret = sys_lstat(mtptemp, &st);
+    if (ret == -1) {
+        *op_errno = errno;
+        gf_msg(this->name, GF_LOG_ERROR, *op_errno, GD_MSG_SYSCALL_FAIL,
+               "stat on mountbroker user directory failed");
+        goto out;
+    }
+    if (!(S_ISDIR(st.st_mode) && (st.st_mode & ~S_IFMT) == 0700 &&
+          st.st_uid == uid && st.st_gid == 0)) {
+        *op_errno = EACCES;
+        gf_msg(this->name, GF_LOG_ERROR, *op_errno, GD_MSG_MOUNT_REQ_FAIL,
+               "Incorrect mountbroker user directory attributes");
+        goto out;
+    }
+    *sla = '/'; /* mtptemp names the mkdtemp() template again */
+
+    if (!mkdtemp(mtptemp)) {
+        *op_errno = errno;
+        gf_msg(this->name, GF_LOG_ERROR, *op_errno, GD_MSG_SYSCALL_FAIL,
+               "Mountbroker mount directory creation failed");
+        goto out;
+    }
+
+    /** create private "cookie" symlink */
+
+    /*** occupy an entry in the hive dir via mkstemp */
+    ret = gf_asprintf(&cookie, "%s/" MB_HIVE "/mntXXXXXX", mountbroker_root);
+    if (ret == -1) {
+        cookie = NULL;
+        *op_errno = ENOMEM;
+        goto out;
+    }
+    orig_umask = umask(S_IRWXG | S_IRWXO); /* no group/other bits on the file */
+    ret = mkstemp(cookie);
+    umask(orig_umask);
+    if (ret == -1) {
+        *op_errno = errno;
+        gf_msg(this->name, GF_LOG_ERROR, *op_errno, GD_MSG_SYSCALL_FAIL,
+               "Mountbroker cookie file creation failed");
+        goto out;
+    }
+    sys_close(ret);
+
+    /*** assemble the relative path from the cookie to the mountpoint */
+    sla = strchr(sla - 1, '/');
+    GF_ASSERT(sla);
+    ret = gf_asprintf(&mntlink, "../user%d%s", uid, sla);
+    if (ret == -1) {
+        *op_errno = ENOMEM;
+        goto out;
+    }
+
+    /*** create cookie link in (to-be) mountpoint,
+         move it over to the final place */
+    *cookieswitch = '/'; /* mtptemp temporarily names <mountpoint>/cookie */
+    ret = sys_symlink(mntlink, mtptemp);
+    if (ret != -1)
+        ret = sys_rename(mtptemp, cookie);
+    *cookieswitch = '\0';
+    if (ret == -1) {
+        *op_errno = errno;
+        gf_msg(this->name, GF_LOG_ERROR, *op_errno, GD_MSG_SYSCALL_FAIL,
+               "symlink or rename failed");
+        goto out;
+    }
+
+    /** invoke glusterfs on the mountpoint */
+
+    runinit(&runner);
+    runner_add_arg(&runner, SBIN_DIR "/glusterfs");
+    seq_dict_foreach(argdict, _runner_add, &runner);
+    runner_add_arg(&runner, mtptemp);
+    ret = runner_run_reuse(&runner);
+    if (ret == -1) {
+        *op_errno = EIO; /* XXX hacky fake */
+        runner_log(&runner, "", GF_LOG_ERROR, "command failed");
+    }
+    runner_end(&runner);
+
+out:
+
+    if (*op_errno) {
+        ret = -1;
+        gf_msg(this->name, GF_LOG_WARNING, *op_errno, GD_MSG_MOUNT_REQ_FAIL,
+               "unsuccessful mount request");
+        if (mtptemp) {
+            *cookieswitch = '/';
+            sys_unlink(mtptemp);
+            *cookieswitch = '\0';
+            sys_rmdir(mtptemp); /* NOTE(review): names the user dir if *sla is 0 */
+        }
+        if (cookie) {
+            sys_unlink(cookie);
+            GF_FREE(cookie);
+        }
+
+    } else {
+        ret = 0;
+        *path = cookie; /* not freed here: the caller receives the buffer */
+    }
+
+    if (mtptemp)
+        GF_FREE(mtptemp);
+    if (mntlink)
+        GF_FREE(mntlink);
+
+    return ret;
+}
diff --git a/xlators/mgmt/glusterd/src/glusterd-mountbroker.h b/xlators/mgmt/glusterd/src/glusterd-mountbroker.h
new file mode 100644
index 00000000000..20c1347f52f
--- /dev/null
+++ b/xlators/mgmt/glusterd/src/glusterd-mountbroker.h
@@ -0,0 +1,37 @@
+/*
+   Copyright (c) 2011-2012 Red Hat, Inc. <http://www.redhat.com>
+   This file is part of GlusterFS.
+
+   This file is licensed to you under your choice of the GNU Lesser
+   General Public License, version 3 or any later version (LGPLv3 or
+   later), or the GNU General Public License, version 2 (GPLv2), in all
+   cases as published by the Free Software Foundation.
+*/
+#define MB_HIVE "mb_hive"
+
+typedef enum { SET_SUB = 1, SET_SUPER, SET_EQUAL, SET_INTERSECT } gf_setrel_t;
+
+struct gf_mount_pattern {
+    char **components;     /* NULL-terminated array of component strings */
+    gf_setrel_t condition; /* set relation the mount arguments must satisfy */
+    gf_boolean_t negative; /* when true, the relation's result is inverted */
+};
+typedef struct gf_mount_pattern gf_mount_pattern_t;
+
+struct gf_mount_spec {
+    struct cds_list_head speclist; /* link used to walk priv->mount_specs */
+    char *label;                   /* compared with the requested label */
+    gf_mount_pattern_t *patterns;  /* array of len patterns */
+    size_t len;                    /* number of entries in patterns */
+};
+typedef struct gf_mount_spec gf_mount_spec_t;
+
+int
+parse_mount_pattern_desc(gf_mount_spec_t *mspec, char *pdesc);
+
+int
+make_georep_mountspec(gf_mount_spec_t *mspec, const char *volname, char *user,
+                      char *logdir);
+
+int
+glusterd_do_mount(char *label, dict_t *argdict, char **path, int *op_errno);
diff --git a/xlators/mgmt/glusterd/src/glusterd-nfs-svc.c b/xlators/mgmt/glusterd/src/glusterd-nfs-svc.c
new file mode 100644
index 00000000000..4908dbbc213
--- /dev/null
+++ b/xlators/mgmt/glusterd/src/glusterd-nfs-svc.c
@@ -0,0 +1,228 @@
+/*
+   Copyright (c) 2014 Red Hat, Inc. <http://www.redhat.com>
+   This file is part of GlusterFS.
+
+   This file is licensed to you under your choice of the GNU Lesser
+   General Public License, version 3 or any later version (LGPLv3 or
+   later), or the GNU General Public License, version 2 (GPLv2), in all
+   cases as published by the Free Software Foundation.
+*/
+
+#ifdef BUILD_GNFS
+
+#include <glusterfs/globals.h>
+#include <glusterfs/run.h>
+#include <glusterfs/syscall.h>
+#include "glusterd.h"
+#include "glusterd-utils.h"
+#include "glusterd-volgen.h"
+#include "glusterd-nfs-svc.h"
+#include "glusterd-messages.h"
+#include "glusterd-svc-helper.h"
+
+static gf_boolean_t
+glusterd_nfssvc_need_start()
+{
+    glusterd_conf_t *conf = THIS->private;
+    glusterd_volinfo_t *vol = NULL;
+    gf_boolean_t needed = _gf_false;
+
+    /*
+     * The service is needed as soon as one started volume is found for
+     * which NFS_DISABLE_MAP_KEY does not evaluate to true.
+     */
+    cds_list_for_each_entry(vol, &conf->volumes, vol_list)
+    {
+        if (glusterd_is_volume_started(vol) &&
+            !dict_get_str_boolean(vol->dict, NFS_DISABLE_MAP_KEY, 1)) {
+            needed = _gf_true;
+            break;
+        }
+    }
+
+    return needed;
+}
+
+static int
+glusterd_nfssvc_create_volfile()
+{
+    glusterd_conf_t *priv = THIS->private;
+    char volfile[PATH_MAX] = "";
+
+    /* compute the nfs volfile path, then create it from build_nfs_graph */
+    glusterd_svc_build_volfile_path(priv->nfs_svc.name, priv->workdir, volfile,
+                                    sizeof(volfile));
+
+    return glusterd_create_global_volfile(build_nfs_graph, volfile, NULL);
+}
+
+static int
+glusterd_nfssvc_manager(glusterd_svc_t *svc, void *data, int flags)
+{ /* stops the service, rebuilds its volfile, starts it again if needed */
+    int ret = -1;
+
+    if (!svc->inited) { /* one-time initialisation on first use */
+        ret = glusterd_svc_init(svc, "nfs");
+        if (ret) {
+            gf_msg(THIS->name, GF_LOG_ERROR, 0, GD_MSG_FAILED_INIT_NFSSVC,
+                   "Failed to init nfs service");
+            goto out;
+        } else {
+            svc->inited = _gf_true;
+            gf_msg_debug(THIS->name, 0, "nfs service initialized");
+        }
+    }
+
+    ret = svc->stop(svc, SIGKILL);
+    if (ret)
+        goto out;
+
+    /* a missing xlator is not an error: leave with ret == 0 from the stop */
+    if (sys_access(XLATORDIR "/nfs/server.so", R_OK) != 0) {
+        gf_msg(THIS->name, GF_LOG_INFO, 0, GD_MSG_GNFS_XLATOR_NOT_INSTALLED,
+               "nfs/server.so xlator is not installed");
+        goto out;
+    }
+
+    ret = glusterd_nfssvc_create_volfile();
+    if (ret)
+        goto out;
+
+    if (glusterd_nfssvc_need_start()) { /* otherwise the service stays down */
+        ret = svc->start(svc, flags);
+        if (ret)
+            goto out;
+
+        ret = glusterd_conn_connect(&(svc->conn));
+        if (ret)
+            goto out;
+    }
+out:
+    if (ret)
+        gf_event(EVENT_SVC_MANAGER_FAILED, "svc_name=%s", svc->name);
+
+    gf_msg_debug(THIS->name, 0, "Returning %d", ret);
+
+    return ret;
+}
+
+static int
+glusterd_nfssvc_start(glusterd_svc_t *svc, int flags)
+{ /* start hook: delegates to glusterd_svc_start() with a NULL third argument */
+    return glusterd_svc_start(svc, flags, NULL);
+}
+
+static int
+glusterd_nfssvc_stop(glusterd_svc_t *svc, int sig)
+{
+    int ret = -1;
+    gf_boolean_t was_running = _gf_false;
+
+    /* sample the running state before the stop is attempted */
+    was_running = glusterd_proc_is_running(&(svc->proc)) ? _gf_true
+                                                          : _gf_false;
+
+    ret = glusterd_svc_stop(svc, sig);
+
+    /* deregister only when a running process was stopped successfully */
+    if (ret == 0 && was_running)
+        glusterd_nfs_pmap_deregister();
+
+    gf_msg_debug(THIS->name, 0, "Returning %d", ret);
+
+    return ret;
+}
+
+void
+glusterd_nfssvc_build(glusterd_svc_t *svc)
+{ /* installs the nfs-specific handlers on svc */
+    svc->stop = glusterd_nfssvc_stop;
+    svc->start = glusterd_nfssvc_start;
+    svc->manager = glusterd_nfssvc_manager;
+}
+
+int
+glusterd_nfssvc_reconfigure()
+{ /* applies volfile changes: nothing, an option refresh, or a restart */
+    int ret = -1;
+    xlator_t *this = NULL;
+    glusterd_conf_t *priv = NULL;
+    gf_boolean_t identical = _gf_false;
+    gf_boolean_t vol_started = _gf_false;
+    glusterd_volinfo_t *volinfo = NULL;
+
+    this = THIS;
+    GF_VALIDATE_OR_GOTO("glusterd", this, out);
+
+    priv = this->private;
+    GF_VALIDATE_OR_GOTO(this->name, priv, out);
+
+    /* a missing xlator is not an error: report success and do nothing */
+    if (sys_access(XLATORDIR "/nfs/server.so", R_OK) != 0) {
+        gf_msg(THIS->name, GF_LOG_INFO, 0, GD_MSG_GNFS_XLATOR_NOT_INSTALLED,
+               "nfs/server.so xlator is not installed");
+        ret = 0;
+        goto out;
+    }
+
+    cds_list_for_each_entry(volinfo, &priv->volumes, vol_list)
+    {
+        if (GLUSTERD_STATUS_STARTED == volinfo->status) {
+            vol_started = _gf_true;
+            break;
+        }
+    }
+    if (!vol_started) { /* no started volume: nothing to reconfigure */
+        ret = 0;
+        goto out;
+    }
+
+    /*
+     * Check both OLD and NEW volfiles, if they are SAME by size
+     * and cksum i.e. "character-by-character". If YES, then
+     * NOTHING has been changed, just return.
+     */
+
+    ret = glusterd_svc_check_volfile_identical(priv->nfs_svc.name,
+                                               build_nfs_graph, &identical);
+    if (ret)
+        goto out;
+
+    if (identical) {
+        ret = 0;
+        goto out;
+    }
+
+    /*
+     * They are not identical. Find out if the topology is changed
+     * OR just the volume options. If just the options which got
+     * changed, then inform the xlator to reconfigure the options.
+     */
+    identical = _gf_false; /* RESET the FLAG */
+    ret = glusterd_svc_check_topology_identical(priv->nfs_svc.name,
+                                                build_nfs_graph, &identical);
+    if (ret)
+        goto out;
+
+    /* Topology is not changed, but just the options. But write the
+     * options to NFS volfile, so that NFS will be reconfigured.
+     */
+    if (identical) {
+        ret = glusterd_nfssvc_create_volfile();
+        if (ret == 0) { /* Only if above PASSES */
+            ret = glusterd_fetchspec_notify(THIS);
+        }
+        goto out;
+    }
+
+    /*
+     * NFS volfile's topology has been changed. NFS server needs
+     * to be RESTARTED to ACT on the changed volfile.
+     */
+    ret = priv->nfs_svc.manager(&(priv->nfs_svc), NULL, PROC_START_NO_WAIT);
+
+out: /* this may still be NULL here when the first validation failed */
+    gf_msg_debug(this ? this->name : "glusterd", 0, "Returning %d", ret);
+    return ret;
+}
+#endif
diff --git a/xlators/mgmt/glusterd/src/glusterd-nfs-svc.h b/xlators/mgmt/glusterd/src/glusterd-nfs-svc.h
new file mode 100644
index 00000000000..6bfdde95749
--- /dev/null
+++ b/xlators/mgmt/glusterd/src/glusterd-nfs-svc.h
@@ -0,0 +1,27 @@
+/*
+   Copyright (c) 2014 Red Hat, Inc. <http://www.redhat.com>
+   This file is part of GlusterFS.
+
+   This file is licensed to you under your choice of the GNU Lesser
+   General Public License, version 3 or any later version (LGPLv3 or
+   later), or the GNU General Public License, version 2 (GPLv2), in all
+   cases as published by the Free Software Foundation.
+*/
+
+#ifndef _GLUSTERD_NFS_SVC_H_
+#define _GLUSTERD_NFS_SVC_H_
+
+#include "glusterd-svc-mgmt.h"
+
+#ifdef BUILD_GNFS
+void
+glusterd_nfssvc_build(glusterd_svc_t *svc);
+
+int
+glusterd_nfssvc_init(glusterd_svc_t *svc);
+
+int
+glusterd_nfssvc_reconfigure();
+
+#endif /* BUILD_GNFS */
+#endif
diff --git a/xlators/mgmt/glusterd/src/glusterd-op-sm.c b/xlators/mgmt/glusterd/src/glusterd-op-sm.c
new file mode 100644
index 00000000000..c537fc33a85
--- /dev/null
+++ b/xlators/mgmt/glusterd/src/glusterd-op-sm.c
@@ -0,0 +1,8164 @@
+/*
+   Copyright (c) 2006-2012 Red Hat, Inc. <http://www.redhat.com>
+   This file is part of GlusterFS.
+
+   This file is licensed to you under your choice of the GNU Lesser
+   General Public License, version 3 or any later version (LGPLv3 or
+   later), or the GNU General Public License, version 2 (GPLv2), in all
+   cases as published by the Free Software Foundation.
+*/
+
+#include <time.h>
+#include <sys/uio.h>
+#include <sys/resource.h>
+#include <sys/mount.h>
+
+#include <libgen.h>
+#include <glusterfs/compat-uuid.h>
+
+#include "fnmatch.h"
+#include <glusterfs/xlator.h>
+#include "protocol-common.h"
+#include "glusterd.h"
+#include <glusterfs/call-stub.h>
+#include <glusterfs/list.h>
+#include <glusterfs/dict.h>
+#include <glusterfs/compat.h>
+#include <glusterfs/compat-errno.h>
+#include <glusterfs/statedump.h>
+#include "glusterd-op-sm.h"
+#include "glusterd-utils.h"
+#include "glusterd-store.h"
+#include "glusterd-locks.h"
+#include "glusterd-quota.h"
+#include <glusterfs/syscall.h>
+#include "cli1-xdr.h"
+#include "glusterd-snapshot-utils.h"
+#include "glusterd-svc-mgmt.h"
+#include "glusterd-svc-helper.h"
+#include "glusterd-shd-svc-helper.h"
+#include "glusterd-shd-svc.h"
+#include "glusterd-quotad-svc.h"
+#include "glusterd-server-quorum.h"
+#include <sys/types.h>
+#include <signal.h>
+#include <sys/wait.h>
+#include "glusterd-gfproxyd-svc-helper.h"
+
+/*
+ * True when 'key', whose length the caller has already computed as 'len',
+ * equals 'str'.  The length comparison is a cheap pre-filter that avoids
+ * calling strcmp() for keys of a different size.
+ *
+ * Every parameter is parenthesised so that expression arguments (for
+ * example "a + b" as the length) expand with the intended precedence.
+ * NOTE(review): SLEN() is applied to 'str', so 'str' is presumably expected
+ * to be a string literal -- confirm against the SLEN definition.
+ */
+#define len_strcmp(key, len, str)                                              \
+    (((len) == SLEN(str)) && (strcmp((key), (str)) == 0))
+
+extern char local_node_hostname[PATH_MAX];
+static int
+glusterd_set_shared_storage(dict_t *dict, char *key, char *value,
+                            char **op_errstr);
+
+/*
+ * Valid options for all volumes are listed in the valid_all_vol_opts table.
+ * To add newer options to all volumes, we can just add more entries to this
+ * table.
+ *
+ * It's important that every value have a default, or have a special handler
+ * in glusterd_get_global_options_for_all_vols, or else we might crash there.
+ *
+ * Each entry is an {option key, default value} pair; the table is terminated
+ * by a {NULL} sentinel entry, so new entries must go before it.
+ */
+const glusterd_all_vol_opts valid_all_vol_opts[] = {
+    {GLUSTERD_QUORUM_RATIO_KEY, "51"},
+    {GLUSTERD_SHARED_STORAGE_KEY, "disable"},
+    /* This one actually gets filled in dynamically. */
+    {GLUSTERD_GLOBAL_OP_VERSION_KEY, "BUG_NO_OP_VERSION"},
+    /*
+     * This one should be filled in dynamically, but it was not before the
+     * defaults were added here, so the appropriate value is unclear.
+     *
+     * TBD: add a dynamic handler to set the appropriate value
+     */
+    {GLUSTERD_MAX_OP_VERSION_KEY, "BUG_NO_MAX_OP_VERSION"},
+    {GLUSTERD_BRICK_MULTIPLEX_KEY, "disable"},
+    /* Set this value to 0 by default implying brick-multiplexing
+     * behaviour with no limit set on the number of brick instances that
+     * can be attached per process.
+     * TBD: Discuss the default value for this. Maybe this should be a
+     * dynamic value depending on the memory specifications per node */
+    {GLUSTERD_BRICKMUX_LIMIT_KEY, GLUSTERD_BRICKMUX_LIMIT_DFLT_VALUE},
+    {GLUSTERD_VOL_CNT_PER_THRD, GLUSTERD_VOL_CNT_PER_THRD_DEFAULT_VALUE},
+    {GLUSTERD_LOCALTIME_LOGGING_KEY, "disable"},
+    {GLUSTERD_DAEMON_LOG_LEVEL_KEY, "INFO"},
+    /* Sentinel: marks the end of the table. */
+    {NULL},
+};
+
+/* File-local list head.
+ * NOTE(review): presumably the queue of pending op state-machine events --
+ * confirm against its users further down in this file. */
+static struct cds_list_head gd_op_sm_queue;
+/* NOTE(review): presumably serialises op state-machine processing --
+ * confirm against its users. */
+synclock_t gd_op_sm_lock;
+/* Global opinfo instance, zero-initialised. */
+glusterd_op_info_t opinfo = {
+    {0},
+};
+
+/*
+ * Create the dictionary that stores per-transaction opinfo objects (keyed by
+ * the textual transaction id) and zero out the global transaction id.
+ *
+ * Returns 0 on success and -1 when the dictionary cannot be created.
+ */
+int32_t
+glusterd_txn_opinfo_dict_init()
+{
+    xlator_t *this = THIS;
+    glusterd_conf_t *priv = NULL;
+
+    GF_ASSERT(this);
+    priv = this->private;
+    GF_ASSERT(priv);
+
+    priv->glusterd_txn_opinfo = dict_new();
+    if (priv->glusterd_txn_opinfo == NULL) {
+        gf_smsg(this->name, GF_LOG_ERROR, errno, GD_MSG_DICT_CREATE_FAIL, NULL);
+        return -1;
+    }
+
+    memset(priv->global_txn_id, '\0', sizeof(uuid_t));
+
+    return 0;
+}
+
+/*
+ * Drop glusterd's reference on the transaction-opinfo dictionary, if one
+ * was created.
+ */
+void
+glusterd_txn_opinfo_dict_fini()
+{
+    xlator_t *this = THIS;
+    glusterd_conf_t *priv = NULL;
+
+    GF_ASSERT(this);
+    priv = this->private;
+    GF_ASSERT(priv);
+
+    /* Nothing to release when the dictionary was never set up. */
+    if (priv->glusterd_txn_opinfo == NULL)
+        return;
+
+    dict_unref(priv->glusterd_txn_opinfo);
+}
+
+/*
+ * Fill in a transaction opinfo structure.
+ *
+ * 'state', 'op' and 'req' are optional: when NULL the corresponding field of
+ * 'opinfo' is left untouched.  'op_ctx' is always assigned: a new reference
+ * is taken when it is non-NULL, otherwise the field is reset to NULL.  The
+ * transaction generation is copied from the current glusterd configuration.
+ */
+void
+glusterd_txn_opinfo_init(glusterd_op_info_t *opinfo,
+                         glusterd_op_sm_state_info_t *state, int *op,
+                         dict_t *op_ctx, rpcsvc_request_t *req)
+{
+    glusterd_conf_t *conf = NULL;
+
+    GF_ASSERT(opinfo);
+
+    conf = THIS->private;
+    GF_ASSERT(conf);
+
+    if (state != NULL)
+        opinfo->state = *state;
+
+    if (op != NULL)
+        opinfo->op = *op;
+
+    /* opinfo holds its own reference on the context dictionary. */
+    opinfo->op_ctx = (op_ctx != NULL) ? dict_ref(op_ctx) : NULL;
+
+    if (req != NULL)
+        opinfo->req = req;
+
+    opinfo->txn_generation = conf->generation;
+    cmm_smp_rmb();
+}
+
+/*
+ * Allocate a transaction id, store it in 'dict' under "transaction_id" and
+ * hand it back through '*txn_id'.
+ *
+ * When the cluster op-version is below GD_OP_VERSION_3_6_0 the global
+ * transaction id is copied; otherwise a fresh UUID is generated.
+ *
+ * Returns 0 on success.  On failure a non-zero value is returned and
+ * '*txn_id' is NULL (the allocation, if any, has been released).
+ */
+int32_t
+glusterd_generate_txn_id(dict_t *dict, uuid_t **txn_id)
+{
+    int32_t ret = -1;
+    xlator_t *this = THIS;
+    glusterd_conf_t *priv = NULL;
+
+    GF_ASSERT(this);
+    priv = this->private;
+    GF_ASSERT(priv);
+    GF_ASSERT(dict);
+
+    *txn_id = GF_MALLOC(sizeof(uuid_t), gf_common_mt_uuid_t);
+    if (*txn_id == NULL) {
+        gf_smsg(this->name, GF_LOG_ERROR, errno, GD_MSG_NO_MEMORY, NULL);
+        return ret;
+    }
+
+    if (priv->op_version >= GD_OP_VERSION_3_6_0)
+        gf_uuid_generate(**txn_id);
+    else
+        gf_uuid_copy(**txn_id, priv->global_txn_id);
+
+    ret = dict_set_bin(dict, "transaction_id", *txn_id, sizeof(**txn_id));
+    if (ret != 0) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_DICT_SET_FAILED,
+               "Failed to set transaction id.");
+        /* The id never made it into the dictionary: release it here. */
+        GF_FREE(*txn_id);
+        *txn_id = NULL;
+        return ret;
+    }
+
+    gf_msg_debug(this->name, 0, "Transaction_id = %s", uuid_utoa(**txn_id));
+
+    return ret;
+}
+
+/*
+ * Look up the opinfo stored for transaction '*txn_id' and copy it into
+ * '*opinfo'.
+ *
+ * Returns 0 on success, -1 when either argument is NULL, and the non-zero
+ * result of the dictionary lookup when no entry exists for the transaction.
+ */
+int32_t
+glusterd_get_txn_opinfo(uuid_t *txn_id, glusterd_op_info_t *opinfo)
+{
+    int32_t ret = -1;
+    glusterd_txn_opinfo_obj *stored = NULL;
+    xlator_t *this = THIS;
+    glusterd_conf_t *priv = NULL;
+
+    GF_ASSERT(this);
+    priv = this->private;
+    GF_ASSERT(priv);
+
+    if (txn_id == NULL || opinfo == NULL) {
+        gf_msg_callingfn(this->name, GF_LOG_ERROR, 0, GD_MSG_TRANS_ID_GET_FAIL,
+                         "Empty transaction id or opinfo received.");
+        ret = -1;
+    } else {
+        ret = dict_get_bin(priv->glusterd_txn_opinfo, uuid_utoa(*txn_id),
+                           (void **)&stored);
+        if (ret == 0) {
+            /* Structure copy: the caller gets its own snapshot. */
+            *opinfo = stored->opinfo;
+
+            gf_msg_debug(this->name, 0,
+                         "Successfully got opinfo for transaction ID : %s",
+                         uuid_utoa(*txn_id));
+        }
+    }
+
+    gf_msg_debug(this->name, 0, "Returning %d", ret);
+    return ret;
+}
+
+/*
+ * Store '*opinfo' as the opinfo of transaction '*txn_id'.
+ *
+ * If the transaction already has an entry in the opinfo dictionary that
+ * entry is overwritten in place; otherwise a new entry is allocated and
+ * added to the dictionary first.
+ *
+ * Returns 0 on success.  Returns -1 when an argument is NULL or the entry
+ * cannot be allocated, and the non-zero dictionary error when the new entry
+ * cannot be stored.
+ */
+int32_t
+glusterd_set_txn_opinfo(uuid_t *txn_id, glusterd_op_info_t *opinfo)
+{
+    int32_t ret = -1;
+    glusterd_txn_opinfo_obj *opinfo_obj = NULL;
+    glusterd_conf_t *priv = NULL;
+    xlator_t *this = NULL;
+
+    this = THIS;
+    GF_ASSERT(this);
+    priv = this->private;
+    GF_ASSERT(priv);
+
+    /* Validate both pointers up front, as glusterd_get_txn_opinfo() does;
+     * 'opinfo' is dereferenced unconditionally below. */
+    if (!txn_id || !opinfo) {
+        gf_msg_callingfn(this->name, GF_LOG_ERROR, 0, GD_MSG_TRANS_ID_GET_FAIL,
+                         "Empty transaction id or opinfo received.");
+        ret = -1;
+        goto out;
+    }
+
+    ret = dict_get_bin(priv->glusterd_txn_opinfo, uuid_utoa(*txn_id),
+                       (void **)&opinfo_obj);
+    if (ret) {
+        /* First time this transaction is seen: create its entry. */
+        opinfo_obj = GF_CALLOC(1, sizeof(glusterd_txn_opinfo_obj),
+                               gf_common_mt_txn_opinfo_obj_t);
+        if (!opinfo_obj) {
+            gf_smsg(this->name, GF_LOG_ERROR, errno, GD_MSG_NO_MEMORY, NULL);
+            ret = -1;
+            goto out;
+        }
+
+        ret = dict_set_bin(priv->glusterd_txn_opinfo, uuid_utoa(*txn_id),
+                           opinfo_obj, sizeof(glusterd_txn_opinfo_obj));
+        if (ret) {
+            gf_msg_callingfn(this->name, GF_LOG_ERROR, errno,
+                             GD_MSG_DICT_SET_FAILED,
+                             "Unable to set opinfo for transaction"
+                             " ID : %s",
+                             uuid_utoa(*txn_id));
+            goto out;
+        }
+    }
+
+    opinfo_obj->opinfo = (*opinfo);
+
+    gf_msg_debug(this->name, 0,
+                 "Successfully set opinfo for transaction ID : %s",
+                 uuid_utoa(*txn_id));
+    ret = 0;
+out:
+    /* The only failure reached with a non-NULL object is a failed
+     * dict_set_bin(): the freshly allocated entry was not stored, so it is
+     * released here. */
+    if (ret)
+        if (opinfo_obj)
+            GF_FREE(opinfo_obj);
+
+    gf_msg_debug(this->name, 0, "Returning %d", ret);
+    return ret;
+}
+
+/*
+ * Remove the opinfo stored for transaction '*txn_id'.
+ *
+ * The reference that the stored opinfo holds on its op_ctx dictionary (if
+ * any) is dropped and the entry is deleted from the opinfo dictionary.
+ *
+ * Returns 0 on success, -1 when 'txn_id' is NULL, and the lookup error when
+ * no opinfo exists for the transaction.
+ */
+int32_t
+glusterd_clear_txn_opinfo(uuid_t *txn_id)
+{
+    int32_t ret = -1;
+    glusterd_op_info_t stored = {
+        {0},
+    };
+    xlator_t *this = THIS;
+    glusterd_conf_t *priv = NULL;
+
+    GF_ASSERT(this);
+    priv = this->private;
+    GF_ASSERT(priv);
+
+    if (txn_id == NULL) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_TRANS_ID_GET_FAIL,
+               "Empty transaction id received.");
+        goto done;
+    }
+
+    ret = glusterd_get_txn_opinfo(txn_id, &stored);
+    if (ret != 0) {
+        gf_msg_callingfn(this->name, GF_LOG_ERROR, 0,
+                         GD_MSG_TRANS_OPINFO_GET_FAIL,
+                         "Unable to get transaction opinfo "
+                         "for transaction ID : %s",
+                         uuid_utoa(*txn_id));
+        goto done;
+    }
+
+    if (stored.op_ctx != NULL)
+        dict_unref(stored.op_ctx);
+
+    dict_del(priv->glusterd_txn_opinfo, uuid_utoa(*txn_id));
+
+    gf_msg_debug(this->name, 0,
+                 "Successfully cleared opinfo for transaction ID : %s",
+                 uuid_utoa(*txn_id));
+
+done:
+    gf_msg_debug(this->name, 0, "Returning %d", ret);
+    return ret;
+}
+
+/* Port number, initialised to GLUSTERD_DEFAULT_PORT. */
+static int glusterfs_port = GLUSTERD_DEFAULT_PORT;
+/*
+ * Human-readable op state-machine state names, indexed by state value.
+ * glusterd_op_sm_state_name_get() returns the entry at index
+ * GD_OP_STATE_MAX for out-of-range values, so the final "Invalid" entry is
+ * presumably meant to sit at that index -- keep this table in step with the
+ * state enum.
+ */
+static char *glusterd_op_sm_state_names[] = {
+    "Default",
+    "Lock sent",
+    "Locked",
+    "Stage op sent",
+    "Staged",
+    "Commit op sent",
+    "Committed",
+    "Unlock sent",
+    "Stage op failed",
+    "Commit op failed",
+    "Brick op sent",
+    "Brick op failed",
+    "Brick op Committed",
+    "Brick op Commit failed",
+    "Ack drain",
+    "Invalid",
+};
+
+/*
+ * Op state-machine event names, indexed by event value.
+ * glusterd_op_sm_event_name_get() returns the entry at index
+ * GD_OP_EVENT_MAX for out-of-range values, so the final
+ * "GD_OP_EVENT_INVALID" entry is presumably meant to sit at that index --
+ * keep this table in step with the event enum.
+ */
+static char *glusterd_op_sm_event_names[] = {
+    "GD_OP_EVENT_NONE",       "GD_OP_EVENT_START_LOCK",
+    "GD_OP_EVENT_LOCK",       "GD_OP_EVENT_RCVD_ACC",
+    "GD_OP_EVENT_ALL_ACC",    "GD_OP_EVENT_STAGE_ACC",
+    "GD_OP_EVENT_COMMIT_ACC", "GD_OP_EVENT_RCVD_RJT",
+    "GD_OP_EVENT_STAGE_OP",   "GD_OP_EVENT_COMMIT_OP",
+    "GD_OP_EVENT_UNLOCK",     "GD_OP_EVENT_START_UNLOCK",
+    "GD_OP_EVENT_ALL_ACK",    "GD_OP_EVENT_LOCAL_UNLOCK_NO_RESP",
+    "GD_OP_EVENT_INVALID"};
+
+/*
+ * Return the printable name of an op state-machine state.  Values outside
+ * [0, GD_OP_STATE_MAX) yield the table entry at index GD_OP_STATE_MAX.
+ */
+char *
+glusterd_op_sm_state_name_get(int state)
+{
+    int idx = GD_OP_STATE_MAX;
+
+    if (state >= 0 && state < GD_OP_STATE_MAX)
+        idx = state;
+
+    return glusterd_op_sm_state_names[idx];
+}
+
+/*
+ * Return the printable name of an op state-machine event.  Values outside
+ * [0, GD_OP_EVENT_MAX) yield the table entry at index GD_OP_EVENT_MAX.
+ */
+char *
+glusterd_op_sm_event_name_get(int event)
+{
+    int idx = GD_OP_EVENT_MAX;
+
+    if (event >= 0 && event < GD_OP_EVENT_MAX)
+        idx = event;
+
+    return glusterd_op_sm_event_names[idx];
+}
+
+/* Release a lock context; a NULL context is ignored. */
+static void
+glusterd_destroy_lock_ctx(glusterd_op_lock_ctx_t *ctx)
+{
+    if (ctx != NULL)
+        GF_FREE(ctx);
+}
+
+/*
+ * Record 'status' as the current status of 'volinfo'.  Only the in-memory
+ * field is assigned here; 'volinfo' must not be NULL.
+ */
+void
+glusterd_set_volume_status(glusterd_volinfo_t *volinfo,
+                           glusterd_volume_status status)
+{
+    GF_ASSERT(volinfo);
+    volinfo->status = status;
+}
+
+/*
+ * Inject a GD_OP_EVENT_ALL_ACC event (with no event context) for the given
+ * transaction and return the result of the injection.
+ */
+static int
+glusterd_op_sm_inject_all_acc(uuid_t *txn_id)
+{
+    int ret = glusterd_op_sm_inject_event(GD_OP_EVENT_ALL_ACC, txn_id, NULL);
+
+    gf_msg_debug("glusterd", 0, "Returning %d", ret);
+    return ret;
+}
+
+/*
+ * Reject bitrot/scrub options that must not be changed through
+ * 'gluster volume set'.
+ *
+ * 'key' is the option name and 'keylen' its length.  When the key is one of
+ * the bitrot or scrub options, a message pointing at the matching
+ * 'gluster volume bitrot' command is written to 'errstr' (at most 'size'
+ * bytes) and -1 is returned.  Any other key returns 0 and leaves 'errstr'
+ * untouched.
+ */
+static int
+glusterd_check_bitrot_cmd(char *key, const int keylen, char *errstr,
+                          const size_t size)
+{
+    if (len_strcmp(key, keylen, "bitrot") ||
+        len_strcmp(key, keylen, "features.bitrot")) {
+        snprintf(errstr, size,
+                 " 'gluster volume set <VOLNAME> %s' is invalid command."
+                 " Use 'gluster volume bitrot <VOLNAME> {enable|disable}'"
+                 " instead.",
+                 key);
+        return -1;
+    }
+
+    if (len_strcmp(key, keylen, "scrub-freq") ||
+        len_strcmp(key, keylen, "features.scrub-freq")) {
+        snprintf(errstr, size,
+                 " 'gluster volume set <VOLNAME> %s' is invalid command."
+                 " Use 'gluster volume bitrot <VOLNAME> scrub-frequency"
+                 " {hourly|daily|weekly|biweekly|monthly}' instead.",
+                 key);
+        return -1;
+    }
+
+    if (len_strcmp(key, keylen, "scrub") ||
+        len_strcmp(key, keylen, "features.scrub")) {
+        snprintf(errstr, size,
+                 " 'gluster volume set <VOLNAME> %s' is invalid command."
+                 " Use 'gluster volume bitrot <VOLNAME> scrub {pause|resume}'"
+                 " instead.",
+                 key);
+        return -1;
+    }
+
+    if (len_strcmp(key, keylen, "scrub-throttle") ||
+        len_strcmp(key, keylen, "features.scrub-throttle")) {
+        snprintf(errstr, size,
+                 " 'gluster volume set <VOLNAME> %s' is invalid command."
+                 " Use 'gluster volume bitrot <VOLNAME> scrub-throttle "
+                 " {lazy|normal|aggressive}' instead.",
+                 key);
+        return -1;
+    }
+
+    /* Not a bitrot-related key: nothing to object to. */
+    return 0;
+}
+
+/*
+ * Reject quota options that must not be changed through
+ * 'gluster volume set'.
+ *
+ * For the quota and inode-quota keys, 'value' is first parsed as a boolean;
+ * if parsing fails its error is returned and 'errstr' is left untouched.
+ * Otherwise a deprecation message naming the replacement command is written
+ * to 'errstr' (at most 'size' bytes) and -1 is returned.  Any other key
+ * returns 0.
+ */
+static int
+glusterd_check_quota_cmd(char *key, const int keylen, char *value, char *errstr,
+                         size_t size)
+{
+    int ret = -1;
+    gf_boolean_t enable = _gf_false;
+
+    if (len_strcmp(key, keylen, "quota") ||
+        len_strcmp(key, keylen, "features.quota")) {
+        ret = gf_string2boolean(value, &enable);
+        if (ret)
+            return ret;
+
+        if (enable) {
+            snprintf(errstr, size,
+                     " 'gluster volume set <VOLNAME> %s %s' is deprecated."
+                     " Use 'gluster volume quota <VOLNAME> enable' instead.",
+                     key, value);
+        } else {
+            snprintf(errstr, size,
+                     " 'gluster volume set <VOLNAME> %s %s' is deprecated."
+                     " Use 'gluster volume quota <VOLNAME> disable' instead.",
+                     key, value);
+        }
+        return -1;
+    }
+
+    if (len_strcmp(key, keylen, "inode-quota") ||
+        len_strcmp(key, keylen, "features.inode-quota")) {
+        ret = gf_string2boolean(value, &enable);
+        if (ret)
+            return ret;
+
+        if (enable) {
+            snprintf(
+                errstr, size,
+                " 'gluster volume set <VOLNAME> %s %s' is deprecated."
+                " Use 'gluster volume inode-quota <VOLNAME> enable' instead.",
+                key, value);
+        } else {
+            /* There is no inode-quota disable; the user is pointed at
+             * quota disable instead. */
+            snprintf(errstr, size,
+                     " 'gluster volume set <VOLNAME> %s %s' is deprecated."
+                     " Use 'gluster volume quota <VOLNAME> disable' instead.",
+                     key, value);
+        }
+        return -1;
+    }
+
+    /* Not a quota-related key. */
+    return 0;
+}
+
+/*
+ * Build the gd1_mgmt_brick_op_req for management operation 'op' on the
+ * brick described by 'brickinfo'.
+ *
+ * The request's op code and name depend on 'op':
+ *   - remove-brick / stop-volume : GLUSTERD_BRICK_TERMINATE, brick path
+ *     (the brick is also marked GF_BRICK_STOPPING)
+ *   - profile                    : GLUSTERD_BRICK_XLATOR_INFO, brick path
+ *   - heal                       : GLUSTERD_BRICK_XLATOR_OP, empty name;
+ *     "heal-op" from 'dict' is copied into 'dict' as "xl-op"
+ *   - status                     : GLUSTERD_BRICK_STATUS, empty name;
+ *     "brick-name" is added to 'dict'
+ *   - rebalance / defrag         : GLUSTERD_BRICK_XLATOR_DEFRAG, heap copy
+ *     of "<volname>-dht"
+ *   - snap / barrier             : GLUSTERD_BRICK_BARRIER, brick path
+ * Any other operation is rejected.
+ *
+ * 'dict' is serialized into the request's input buffer.  On success the new
+ * request is returned through '*req' and 0 is returned.  On failure a
+ * non-zero value is returned, '*req' is not written, and everything
+ * allocated here (including the duplicated name) is released.
+ */
+int
+glusterd_brick_op_build_payload(glusterd_op_t op,
+                                glusterd_brickinfo_t *brickinfo,
+                                gd1_mgmt_brick_op_req **req, dict_t *dict)
+{
+    int ret = -1;
+    gd1_mgmt_brick_op_req *brick_req = NULL;
+    char *volname = NULL;
+    char *dup_name = NULL; /* heap copy of the request name, if one is made */
+    char name[1024] = {
+        0,
+    };
+    gf_xl_afr_op_t heal_op = GF_SHD_OP_INVALID;
+    xlator_t *this = NULL;
+    glusterd_volinfo_t *volinfo = NULL;
+
+    this = THIS;
+    GF_ASSERT(this);
+
+    GF_ASSERT(op < GD_OP_MAX);
+    GF_ASSERT(op > GD_OP_NONE);
+    GF_ASSERT(req);
+
+    switch (op) {
+        case GD_OP_REMOVE_BRICK:
+        case GD_OP_STOP_VOLUME:
+            brick_req = GF_CALLOC(1, sizeof(*brick_req),
+                                  gf_gld_mt_mop_brick_req_t);
+            if (!brick_req) {
+                gf_smsg(this->name, GF_LOG_ERROR, errno, GD_MSG_NO_MEMORY,
+                        NULL);
+                goto out;
+            }
+            brick_req->op = GLUSTERD_BRICK_TERMINATE;
+            brick_req->name = brickinfo->path;
+            glusterd_set_brick_status(brickinfo, GF_BRICK_STOPPING);
+            break;
+        case GD_OP_PROFILE_VOLUME:
+            brick_req = GF_CALLOC(1, sizeof(*brick_req),
+                                  gf_gld_mt_mop_brick_req_t);
+
+            if (!brick_req) {
+                gf_smsg(this->name, GF_LOG_ERROR, errno, GD_MSG_NO_MEMORY,
+                        NULL);
+                goto out;
+            }
+
+            brick_req->op = GLUSTERD_BRICK_XLATOR_INFO;
+            brick_req->name = brickinfo->path;
+
+            break;
+        case GD_OP_HEAL_VOLUME: {
+            brick_req = GF_CALLOC(1, sizeof(*brick_req),
+                                  gf_gld_mt_mop_brick_req_t);
+            if (!brick_req) {
+                gf_smsg(this->name, GF_LOG_ERROR, errno, GD_MSG_NO_MEMORY,
+                        NULL);
+                goto out;
+            }
+
+            brick_req->op = GLUSTERD_BRICK_XLATOR_OP;
+            brick_req->name = "";
+            /* Forward the requested heal operation under the key "xl-op". */
+            ret = dict_get_int32n(dict, "heal-op", SLEN("heal-op"),
+                                  (int32_t *)&heal_op);
+            if (ret) {
+                gf_smsg(this->name, GF_LOG_ERROR, errno, GD_MSG_DICT_GET_FAILED,
+                        "Key=heal-op", NULL);
+                goto out;
+            }
+            ret = dict_set_int32n(dict, "xl-op", SLEN("xl-op"), heal_op);
+            if (ret) {
+                gf_smsg(this->name, GF_LOG_ERROR, errno, GD_MSG_DICT_SET_FAILED,
+                        "Key=xl-op", NULL);
+                goto out;
+            }
+        } break;
+        case GD_OP_STATUS_VOLUME: {
+            brick_req = GF_CALLOC(1, sizeof(*brick_req),
+                                  gf_gld_mt_mop_brick_req_t);
+            if (!brick_req) {
+                gf_smsg(this->name, GF_LOG_ERROR, errno, GD_MSG_NO_MEMORY,
+                        NULL);
+                goto out;
+            }
+            brick_req->op = GLUSTERD_BRICK_STATUS;
+            brick_req->name = "";
+            ret = dict_set_strn(dict, "brick-name", SLEN("brick-name"),
+                                brickinfo->path);
+            if (ret) {
+                gf_smsg(this->name, GF_LOG_ERROR, errno, GD_MSG_DICT_SET_FAILED,
+                        "Key=brick-name", NULL);
+                goto out;
+            }
+        } break;
+        case GD_OP_REBALANCE:
+        case GD_OP_DEFRAG_BRICK_VOLUME:
+            brick_req = GF_CALLOC(1, sizeof(*brick_req),
+                                  gf_gld_mt_mop_brick_req_t);
+            if (!brick_req) {
+                gf_smsg(this->name, GF_LOG_ERROR, errno, GD_MSG_NO_MEMORY,
+                        NULL);
+                goto out;
+            }
+
+            brick_req->op = GLUSTERD_BRICK_XLATOR_DEFRAG;
+            ret = dict_get_strn(dict, "volname", SLEN("volname"), &volname);
+            if (ret) {
+                gf_smsg(this->name, GF_LOG_ERROR, errno, GD_MSG_DICT_GET_FAILED,
+                        "Key=volname", NULL);
+                goto out;
+            }
+            /* Only used to confirm that the volume exists. */
+            ret = glusterd_volinfo_find(volname, &volinfo);
+            if (ret) {
+                gf_smsg(this->name, GF_LOG_ERROR, errno,
+                        GD_MSG_VOLINFO_GET_FAIL, "Volume=%s", volname, NULL);
+                goto out;
+            }
+            snprintf(name, sizeof(name), "%s-dht", volname);
+            dup_name = gf_strdup(name);
+            if (!dup_name) {
+                gf_smsg(this->name, GF_LOG_ERROR, errno, GD_MSG_NO_MEMORY,
+                        NULL);
+                /* ret is 0 from the lookup above; report the failure. */
+                ret = -1;
+                goto out;
+            }
+            brick_req->name = dup_name;
+
+            break;
+        case GD_OP_SNAP:
+        case GD_OP_BARRIER:
+            brick_req = GF_CALLOC(1, sizeof(*brick_req),
+                                  gf_gld_mt_mop_brick_req_t);
+            if (!brick_req) {
+                gf_smsg(this->name, GF_LOG_ERROR, errno, GD_MSG_NO_MEMORY,
+                        NULL);
+                goto out;
+            }
+            brick_req->op = GLUSTERD_BRICK_BARRIER;
+            brick_req->name = brickinfo->path;
+            break;
+
+        default:
+            /* Unsupported operation: ret is still -1. */
+            goto out;
+    }
+
+    brick_req->dict.dict_len = 0;
+    brick_req->dict.dict_val = NULL;
+    ret = dict_allocate_and_serialize(dict, &brick_req->input.input_val,
+                                      &brick_req->input.input_len);
+    if (ret) {
+        gf_smsg(this->name, GF_LOG_ERROR, errno,
+                GD_MSG_DICT_ALLOC_AND_SERL_LENGTH_GET_FAIL, NULL);
+        goto out;
+    }
+    *req = brick_req;
+    ret = 0;
+
+out:
+    if (ret) {
+        /* Only the duplicated name is owned by the request; the other
+         * names point at brickinfo->path or a string literal. */
+        if (dup_name)
+            GF_FREE(dup_name);
+        if (brick_req)
+            GF_FREE(brick_req);
+    }
+    gf_msg_debug(this->name, 0, "Returning %d", ret);
+    return ret;
+}
+
+/*
+ * Build the gd1_mgmt_brick_op_req for a node-level operation 'op'.
+ *
+ * The request's op code and name depend on 'op':
+ *   - profile                      : GLUSTERD_NODE_PROFILE, empty name
+ *   - status                       : GLUSTERD_NODE_STATUS, empty name
+ *   - scrub status / scrub ondemand: GLUSTERD_NODE_BITROT, heap copy of the
+ *     "volname" value found in 'dict'
+ * Any other operation is rejected.
+ *
+ * 'dict' is serialized into the request's input buffer.  On success the new
+ * request is returned through '*req' and 0 is returned.  On failure a
+ * non-zero value is returned, '*req' is not written, and everything
+ * allocated here (including the duplicated name) is released.
+ */
+int
+glusterd_node_op_build_payload(glusterd_op_t op, gd1_mgmt_brick_op_req **req,
+                               dict_t *dict)
+{
+    int ret = -1;
+    gd1_mgmt_brick_op_req *brick_req = NULL;
+    char *volname = NULL;
+    char *dup_name = NULL; /* heap copy of the request name, if one is made */
+    xlator_t *this = NULL;
+
+    GF_ASSERT(op < GD_OP_MAX);
+    GF_ASSERT(op > GD_OP_NONE);
+    GF_ASSERT(req);
+    this = THIS;
+    GF_ASSERT(this);
+
+    switch (op) {
+        case GD_OP_PROFILE_VOLUME:
+            brick_req = GF_CALLOC(1, sizeof(*brick_req),
+                                  gf_gld_mt_mop_brick_req_t);
+            if (!brick_req) {
+                gf_smsg(this->name, GF_LOG_ERROR, errno, GD_MSG_NO_MEMORY,
+                        NULL);
+                goto out;
+            }
+
+            brick_req->op = GLUSTERD_NODE_PROFILE;
+            brick_req->name = "";
+
+            break;
+
+        case GD_OP_STATUS_VOLUME:
+            brick_req = GF_CALLOC(1, sizeof(*brick_req),
+                                  gf_gld_mt_mop_brick_req_t);
+            if (!brick_req) {
+                gf_smsg(this->name, GF_LOG_ERROR, errno, GD_MSG_NO_MEMORY,
+                        NULL);
+                goto out;
+            }
+
+            brick_req->op = GLUSTERD_NODE_STATUS;
+            brick_req->name = "";
+
+            break;
+
+        case GD_OP_SCRUB_STATUS:
+        case GD_OP_SCRUB_ONDEMAND:
+            brick_req = GF_CALLOC(1, sizeof(*brick_req),
+                                  gf_gld_mt_mop_brick_req_t);
+            if (!brick_req) {
+                gf_smsg(this->name, GF_LOG_ERROR, errno, GD_MSG_NO_MEMORY,
+                        NULL);
+                goto out;
+            }
+
+            brick_req->op = GLUSTERD_NODE_BITROT;
+
+            ret = dict_get_strn(dict, "volname", SLEN("volname"), &volname);
+            if (ret) {
+                gf_smsg(this->name, GF_LOG_ERROR, errno, GD_MSG_DICT_GET_FAILED,
+                        "Key=volname", NULL);
+                goto out;
+            }
+
+            dup_name = gf_strdup(volname);
+            if (!dup_name) {
+                gf_smsg(this->name, GF_LOG_ERROR, errno, GD_MSG_NO_MEMORY,
+                        NULL);
+                /* ret is 0 from the lookup above; report the failure. */
+                ret = -1;
+                goto out;
+            }
+            brick_req->name = dup_name;
+            break;
+        default:
+            /* Unsupported operation: ret is still -1. */
+            goto out;
+    }
+
+    brick_req->dict.dict_len = 0;
+    brick_req->dict.dict_val = NULL;
+    ret = dict_allocate_and_serialize(dict, &brick_req->input.input_val,
+                                      &brick_req->input.input_len);
+
+    if (ret) {
+        gf_smsg(this->name, GF_LOG_ERROR, errno,
+                GD_MSG_DICT_ALLOC_AND_SERL_LENGTH_GET_FAIL, NULL);
+        goto out;
+    }
+
+    *req = brick_req;
+    ret = 0;
+
+out:
+    if (ret) {
+        /* Only the duplicated name is owned by the request; the empty
+         * names are string literals. */
+        if (dup_name)
+            GF_FREE(dup_name);
+        if (brick_req)
+            GF_FREE(brick_req);
+    }
+    gf_msg_debug(this->name, 0, "Returning %d", ret);
+    return ret;
+}
+
+/*
+ * Validate 'value' for a quorum option.
+ *
+ * Keys that glusterd_is_quorum_option() does not recognise are accepted
+ * without further checks (returns 0).  For quorum keys, the part of
+ * 'fullkey' after the first '.' is looked up among this xlator's options and
+ * the value is validated against it; any explanation is returned through
+ * 'op_errstr' by the option validator.
+ *
+ * Returns 0 when the value is acceptable, -1 when the key has no '.' or the
+ * option is unknown, otherwise the validator's result.
+ */
+static int
+glusterd_validate_quorum_options(xlator_t *this, char *fullkey, char *value,
+                                 char **op_errstr)
+{
+    char *short_key = NULL;
+    volume_option_t *opt = NULL;
+
+    if (!glusterd_is_quorum_option(fullkey))
+        return 0;
+
+    short_key = strchr(fullkey, '.');
+    if (short_key == NULL) {
+        gf_smsg(this->name, GF_LOG_ERROR, errno, GD_MSG_STRCHR_FAIL, NULL);
+        return -1;
+    }
+    short_key++; /* step over the '.' */
+
+    opt = xlator_volume_option_get(this, short_key);
+    if (opt == NULL) {
+        gf_smsg(this->name, GF_LOG_ERROR, errno, GD_MSG_VOLINFO_GET_FAIL, NULL);
+        return -1;
+    }
+
+    return xlator_option_validate(this, short_key, value, opt, op_errstr);
+}
+
+/*
+ * Validation hook that is currently a stub: no argument is inspected and
+ * every key/value pair is accepted.
+ * NOTE(review): judging by the name this is meant for brick-multiplexing
+ * options -- confirm against the caller.
+ */
+static int
+glusterd_validate_brick_mx_options(xlator_t *this, char *fullkey, char *value,
+                                   char **op_errstr)
+{
+    /* Placeholder for now: always report success. */
+    return 0;
+}
+
+/*
+ * Validate a requested value for the shared-storage option.
+ *
+ * 'value' must be exactly "enable" or "disable".  In both cases the hook
+ * script (conf->workdir followed by GLUSTERD_SHRD_STRG_HOOK_SCRIPT) must be
+ * readable and executable.
+ *
+ *  - "disable" is rejected when the option is not present in conf->opts or
+ *    its stored value already starts with "disable".
+ *  - "enable" is rejected when the GLUSTER_SHARED_STORAGE volume already
+ *    exists, when the connected-peer count cannot be obtained, or when that
+ *    count is 1 or less.
+ *
+ * Error text is written to 'errstr' with a limit of PATH_MAX bytes, so the
+ * caller's buffer must be at least that large.
+ *
+ * Returns 0 when the value is acceptable and non-zero otherwise.
+ */
+static int
+glusterd_validate_shared_storage(char *value, char *errstr)
+{
+    int32_t ret = -1;
+    int32_t count = -1;
+    char *op = NULL;
+    char hook_script[PATH_MAX] = "";
+    xlator_t *this = NULL;
+    glusterd_conf_t *conf = NULL;
+    int32_t len = 0;
+    glusterd_volinfo_t *volinfo = NULL;
+
+    this = THIS;
+    GF_VALIDATE_OR_GOTO("glusterd", this, out);
+
+    conf = this->private;
+    GF_VALIDATE_OR_GOTO(this->name, conf, out);
+
+    GF_VALIDATE_OR_GOTO(this->name, value, out);
+    GF_VALIDATE_OR_GOTO(this->name, errstr, out);
+
+    /* Anything other than the two literal keywords is rejected. */
+    if ((strcmp(value, "enable")) && (strcmp(value, "disable"))) {
+        snprintf(errstr, PATH_MAX,
+                 "Invalid option(%s). Valid options "
+                 "are 'enable' and 'disable'",
+                 value);
+        gf_msg(this->name, GF_LOG_ERROR, EINVAL, GD_MSG_INVALID_ENTRY, "%s",
+               errstr);
+        ret = -1;
+        goto out;
+    }
+
+    /* Build the hook-script path; a truncated path counts as a failure. */
+    len = snprintf(hook_script, sizeof(hook_script),
+                   "%s" GLUSTERD_SHRD_STRG_HOOK_SCRIPT, conf->workdir);
+    if ((len < 0) || (len >= sizeof(hook_script))) {
+        ret = -1;
+        goto out;
+    }
+
+    ret = sys_access(hook_script, R_OK | X_OK);
+    if (ret) {
+        len = snprintf(errstr, PATH_MAX,
+                       "The hook-script (%s) required "
+                       "for this operation is not present. "
+                       "Please install the hook-script "
+                       "and retry",
+                       hook_script);
+        if (len < 0) {
+            strncpy(errstr, "<error>", PATH_MAX);
+        }
+        gf_msg(this->name, GF_LOG_ERROR, ENOENT, GD_MSG_FILE_OP_FAILED, "%s",
+               errstr);
+        /* ret keeps the non-zero result of sys_access(). */
+        goto out;
+    }
+
+    if (!strncmp(value, "disable", SLEN("disable"))) {
+        /* Disabling only makes sense when shared storage is currently
+         * recorded as something other than "disable". */
+        ret = dict_get_strn(conf->opts, GLUSTERD_SHARED_STORAGE_KEY,
+                            SLEN(GLUSTERD_SHARED_STORAGE_KEY), &op);
+        if (ret || !strncmp(op, "disable", SLEN("disable"))) {
+            snprintf(errstr, PATH_MAX,
+                     "Shared storage volume "
+                     "does not exist. Please enable shared storage"
+                     " for creating shared storage volume.");
+            gf_msg(this->name, GF_LOG_ERROR, 0,
+                   GD_MSG_SHARED_STORAGE_DOES_NOT_EXIST, "%s", errstr);
+            ret = -1;
+            goto out;
+        }
+        /* ret is 0 here: the disable request is valid. */
+        goto out;
+    }
+
+    /* From here on the request is "enable". */
+    ret = glusterd_volinfo_find(GLUSTER_SHARED_STORAGE, &volinfo);
+    if (!ret) {
+        snprintf(errstr, PATH_MAX,
+                 "Shared storage volume(" GLUSTER_SHARED_STORAGE
+                 ") already exists.");
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_VOL_ALREADY_EXIST, "%s",
+               errstr);
+        ret = -1;
+        goto out;
+    }
+
+    /* The non-zero lookup result above is overwritten by this call. */
+    ret = glusterd_count_connected_peers(&count);
+    if (ret) {
+        snprintf(errstr, PATH_MAX,
+                 "Failed to calculate number of connected peers.");
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_PEER_COUNT_GET_FAIL, "%s",
+               errstr);
+        goto out;
+    }
+
+    if (count <= 1) {
+        snprintf(errstr, PATH_MAX,
+                 "More than one node should "
+                 "be up/present in the cluster to enable this option");
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_INSUFFICIENT_UP_NODES, "%s",
+               errstr);
+        ret = -1;
+        goto out;
+    }
+
+out:
+    return ret;
+}
+
+/*
+ * Validate a requested value for the localtime-logging option.
+ *
+ * Note that this is not a pure check: for "enable" and "disable" the
+ * process-wide localtime logging flag is switched immediately via
+ * gf_log_set_localtime(), and an INFO message is logged when the setting
+ * actually changes.
+ *
+ * Any other value is rejected with a message written to 'errstr' (limit
+ * PATH_MAX bytes).  'errstr' is only required on that rejection path.
+ *
+ * Returns 0 for "enable"/"disable" and -1 otherwise.
+ */
+static int
+glusterd_validate_localtime_logging(char *value, char *errstr)
+{
+    int32_t ret = -1;
+    xlator_t *this = NULL;
+    glusterd_conf_t *conf = NULL;
+    int already_enabled = 0;
+
+    this = THIS;
+    GF_VALIDATE_OR_GOTO("glusterd", this, out);
+
+    conf = this->private;
+    GF_VALIDATE_OR_GOTO(this->name, conf, out);
+    GF_VALIDATE_OR_GOTO(this->name, value, out);
+
+    /* Remember the current setting so that only real changes are logged. */
+    already_enabled = gf_log_get_localtime();
+
+    ret = 0;
+    if (strcmp(value, "enable") == 0) {
+        gf_log_set_localtime(1);
+        if (!already_enabled)
+            gf_msg(this->name, GF_LOG_INFO, 0, GD_MSG_LOCALTIME_LOGGING_ENABLE,
+                   "localtime logging enable");
+    } else if (strcmp(value, "disable") == 0) {
+        gf_log_set_localtime(0);
+        if (already_enabled)
+            gf_msg(this->name, GF_LOG_INFO, 0, GD_MSG_LOCALTIME_LOGGING_DISABLE,
+                   "localtime logging disable");
+    } else {
+        /* ret is set before the macro so that a NULL errstr also fails. */
+        ret = -1;
+        GF_VALIDATE_OR_GOTO(this->name, errstr, out);
+        snprintf(errstr, PATH_MAX,
+                 "Invalid option(%s). Valid options "
+                 "are 'enable' and 'disable'",
+                 value);
+        gf_msg(this->name, GF_LOG_ERROR, EINVAL, GD_MSG_INVALID_ENTRY, "%s",
+               errstr);
+    }
+
+out:
+    return ret;
+}
+
+/*
+ * Validate a requested daemon log level.
+ *
+ * 'value' must be exactly one of "INFO", "WARNING", "DEBUG", "TRACE" or
+ * "ERROR".  Any other value is rejected with a message written to 'errstr'
+ * (limit PATH_MAX bytes); 'errstr' is only required on that path.
+ *
+ * Returns 0 for an accepted level and -1 otherwise.
+ */
+static int
+glusterd_validate_daemon_log_level(char *value, char *errstr)
+{
+    static const char *const levels[] = {"INFO", "WARNING", "DEBUG", "TRACE",
+                                         "ERROR"};
+    int32_t ret = -1;
+    xlator_t *this = NULL;
+    glusterd_conf_t *conf = NULL;
+    size_t i = 0;
+
+    this = THIS;
+    GF_VALIDATE_OR_GOTO("glusterd", this, out);
+
+    conf = this->private;
+    GF_VALIDATE_OR_GOTO(this->name, conf, out);
+
+    GF_VALIDATE_OR_GOTO(this->name, value, out);
+
+    for (i = 0; i < sizeof(levels) / sizeof(levels[0]); i++) {
+        if (strcmp(value, levels[i]) == 0) {
+            ret = 0;
+            goto out;
+        }
+    }
+
+    /* No level matched; ret is still -1 from here on. */
+    GF_VALIDATE_OR_GOTO(this->name, errstr, out);
+    snprintf(errstr, PATH_MAX,
+             "Invalid option(%s). Valid options "
+             "are 'INFO' or 'WARNING' or 'ERROR' or 'DEBUG' or "
+             " 'TRACE'",
+             value);
+    gf_msg(this->name, GF_LOG_ERROR, EINVAL, GD_MSG_INVALID_ENTRY, "%s",
+           errstr);
+
+out:
+    return ret;
+}
+
+/*
+ * Stage (validate) a 'volume set' request before it is committed.
+ *
+ * The request dict is expected to carry "count", "volname" and the numbered
+ * pairs "key<N>" / "value<N>" (N starting at 1). Each pair is validated in
+ * turn; the first failure aborts staging.
+ *
+ * While validating, the function tracks the op-version each option needs:
+ *  - on the originator glusterd it stores "op-version<N>" per option and,
+ *    at the end, "new-op-version" and "check-op-version" in @dict;
+ *  - on other peers, when "check-op-version" is set, it compares the
+ *    locally computed op-version of each option with the one in @dict.
+ *
+ * @dict:      request dictionary (read, and written on the originator).
+ * @op_errstr: out parameter; receives an allocated error string on failure.
+ *
+ * Returns 0 when staging succeeds, non-zero otherwise.
+ */
+static int
+glusterd_op_stage_set_volume(dict_t *dict, char **op_errstr)
+{
+    int ret = -1;
+    char *volname = NULL;
+    int exists = 0;
+    char *key = NULL;
+    char *key_fixed = NULL;
+    char *value = NULL;
+    char *val_dup = NULL;
+    char keystr[100] = {
+        0,
+    };
+    int keystr_len;
+    int keylen;
+    char *trash_path = NULL;
+    int trash_path_len = 0;
+    int count = 0;
+    int dict_count = 0;
+    char errstr[PATH_MAX] = {
+        0,
+    };
+    glusterd_volinfo_t *volinfo = NULL;
+    glusterd_brickinfo_t *brickinfo = NULL;
+    dict_t *val_dict = NULL;
+    gf_boolean_t global_opt = _gf_false;
+    gf_boolean_t key_matched = _gf_false; /* if a key was processed or not*/
+    glusterd_volinfo_t *voliter = NULL;
+    glusterd_conf_t *priv = NULL;
+    xlator_t *this = NULL;
+    uint32_t new_op_version = GD_OP_VERSION_MIN;
+    uint32_t local_new_op_version = GD_OP_VERSION_MIN;
+    uint32_t local_new_client_op_version = GD_OP_VERSION_MIN;
+    uint32_t key_op_version = GD_OP_VERSION_MIN;
+    uint32_t local_key_op_version = GD_OP_VERSION_MIN;
+    gf_boolean_t origin_glusterd = _gf_true;
+    gf_boolean_t check_op_version = _gf_true;
+    gf_boolean_t trash_enabled = _gf_false;
+    gf_boolean_t all_vol = _gf_false;
+    struct volopt_map_entry *vmep = NULL;
+
+    GF_ASSERT(dict);
+    this = THIS;
+    GF_ASSERT(this);
+    priv = this->private;
+    GF_ASSERT(priv);
+
+    /* Check if we can support the required op-version
+     * This check is not done on the originator glusterd. The originator
+     * glusterd sets this value.
+     */
+    origin_glusterd = is_origin_glusterd(dict);
+
+    if (!origin_glusterd) {
+        /* Check for v3.3.x origin glusterd */
+        check_op_version = dict_get_str_boolean(dict, "check-op-version",
+                                                _gf_false);
+
+        if (check_op_version) {
+            ret = dict_get_uint32(dict, "new-op-version", &new_op_version);
+            if (ret) {
+                gf_smsg(this->name, GF_LOG_ERROR, 0, GD_MSG_DICT_GET_FAILED,
+                        "Key=new-op-version", NULL);
+                goto out;
+            }
+
+            if ((new_op_version > GD_OP_VERSION_MAX) ||
+                (new_op_version < GD_OP_VERSION_MIN)) {
+                ret = -1;
+                /* new_op_version is unsigned; print it as such so large
+                 * values are not reported as negative numbers. */
+                snprintf(errstr, sizeof(errstr),
+                         "Required op_version (%" PRIu32
+                         ") is not supported."
+                         " Max supported op version is %d",
+                         new_op_version, priv->op_version);
+                gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_UNSUPPORTED_VERSION,
+                       "%s", errstr);
+                goto out;
+            }
+        }
+    }
+
+    ret = dict_get_int32_sizen(dict, "count", &dict_count);
+    if (ret) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_DICT_GET_FAILED,
+               "Count(dict),not set in Volume-Set");
+        goto out;
+    }
+
+    if (dict_count == 0) {
+        /*No options would be specified of volume set help */
+        if (dict_get_sizen(dict, "help")) {
+            ret = 0;
+            goto out;
+        }
+
+        if (dict_get_sizen(dict, "help-xml")) {
+#if (HAVE_LIB_XML)
+            ret = 0;
+            goto out;
+#else
+            ret = -1;
+            gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_MODULE_NOT_INSTALLED,
+                   "libxml not present in the system");
+            *op_errstr = gf_strdup(
+                "Error: xml libraries not present to produce xml-output");
+            goto out;
+#endif
+        }
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_NO_OPTIONS_GIVEN,
+               "No options received ");
+        *op_errstr = gf_strdup("Options not specified");
+        ret = -1;
+        goto out;
+    }
+
+    ret = dict_get_str_sizen(dict, "volname", &volname);
+    if (ret) {
+        gf_smsg(this->name, GF_LOG_ERROR, 0, GD_MSG_DICT_GET_FAILED,
+                "Key=volname", NULL);
+        goto out;
+    }
+
+    /* "all" addresses cluster-wide options; there is no volinfo then. */
+    if (strcasecmp(volname, "all") != 0) {
+        ret = glusterd_volinfo_find(volname, &volinfo);
+        if (ret) {
+            snprintf(errstr, sizeof(errstr), FMTSTR_CHECK_VOL_EXISTS, volname);
+            gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_VOL_NOT_FOUND,
+                   FMTSTR_CHECK_VOL_EXISTS, volname);
+            goto out;
+        }
+
+        ret = glusterd_validate_volume_id(dict, volinfo);
+        if (ret)
+            goto out;
+
+        local_new_op_version = volinfo->op_version;
+        local_new_client_op_version = volinfo->client_op_version;
+
+    } else {
+        all_vol = _gf_true;
+    }
+
+    /* Scratch dict: holds one key/value at a time for volfile validation. */
+    val_dict = dict_new();
+    if (!val_dict) {
+        gf_smsg(this->name, GF_LOG_ERROR, errno, GD_MSG_DICT_CREATE_FAIL, NULL);
+        goto out;
+    }
+
+    /* Walk key1/value1, key2/value2, ...; the loop is left through the
+     * 'break' below once key<count> is no longer found in the dict. */
+    for (count = 1; ret != 1; count++) {
+        keystr_len = sprintf(keystr, "key%d", count);
+        ret = dict_get_strn(dict, keystr, keystr_len, &key);
+        if (ret) {
+            gf_smsg(this->name, GF_LOG_ERROR, errno, GD_MSG_DICT_GET_FAILED,
+                    "Key=%s", keystr, NULL);
+            break;
+        }
+
+        keystr_len = sprintf(keystr, "value%d", count);
+        ret = dict_get_strn(dict, keystr, keystr_len, &value);
+        if (ret) {
+            gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_DICT_GET_FAILED,
+                   "invalid key,value pair in 'volume set'");
+            ret = -1;
+            goto out;
+        }
+
+        key_matched = _gf_false;
+        keylen = strlen(key);
+        if (len_strcmp(key, keylen, "config.memory-accounting")) {
+            key_matched = _gf_true;
+            gf_msg_debug(this->name, 0,
+                         "enabling memory accounting for volume %s", volname);
+            ret = 0;
+        } else if (len_strcmp(key, keylen, "config.transport")) {
+            key_matched = _gf_true;
+            gf_msg_debug(this->name, 0, "changing transport-type for volume %s",
+                         volname);
+            ret = 0;
+            /* if value is none of 'tcp/rdma/tcp,rdma' error out */
+            if (!((strcasecmp(value, "rdma") == 0) ||
+                  (strcasecmp(value, "tcp") == 0) ||
+                  (strcasecmp(value, "tcp,rdma") == 0) ||
+                  (strcasecmp(value, "rdma,tcp") == 0))) {
+                ret = snprintf(errstr, sizeof(errstr),
+                               "transport-type %s does not exist", value);
+                /* lets not bother about above return value,
+                   its a failure anyways */
+                ret = -1;
+                goto out;
+            }
+        } else if (len_strcmp(key, keylen, "ganesha.enable")) {
+            key_matched = _gf_true;
+            /* Unexport only when the option is being switched off.
+             * Writing this as '!strcmp(value, "off") == 0' parses as
+             * '(!strcmp(...)) == 0' and would trigger the "off" export
+             * handling for every value except "off". */
+            if (strcmp(value, "off") == 0) {
+                ret = ganesha_manage_export(dict, "off", _gf_true, op_errstr);
+                if (ret)
+                    goto out;
+            }
+        }
+
+        if (!key_matched) {
+            ret = glusterd_check_bitrot_cmd(key, keylen, errstr,
+                                            sizeof(errstr));
+            if (ret)
+                goto out;
+            ret = glusterd_check_quota_cmd(key, keylen, value, errstr,
+                                           sizeof(errstr));
+            if (ret)
+                goto out;
+        }
+
+        /* Hook-friendly keys skip all remaining validation. */
+        if (is_key_glusterd_hooks_friendly(key))
+            continue;
+
+        ret = glusterd_volopt_validate(volinfo, dict, key, value, op_errstr);
+        if (ret)
+            goto out;
+
+        exists = glusterd_check_option_exists(key, &key_fixed);
+        if (exists == -1) {
+            ret = -1;
+            goto out;
+        }
+
+        if (!exists) {
+            gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_INVALID_ENTRY,
+                   "Option with name: %s does not exist", key);
+            ret = snprintf(errstr, sizeof(errstr), "option : %s does not exist",
+                           key);
+            /* snprintf() returns the length it wanted to write; append
+             * the hint only if the first part really fit, otherwise the
+             * offset would point past the buffer and the remaining size
+             * would wrap around. */
+            if (key_fixed && ret >= 0 && (size_t)ret < sizeof(errstr))
+                snprintf(errstr + ret, sizeof(errstr) - ret,
+                         "\nDid you mean %s?", key_fixed);
+            ret = -1;
+            goto out;
+        }
+
+        /* From here on work with the canonical option name, if any. */
+        if (key_fixed) {
+            key = key_fixed;
+            keylen = strlen(key_fixed);
+        }
+
+        if (len_strcmp(key, keylen, "cluster.granular-entry-heal")) {
+            /* For granular entry-heal, if the set command was
+             * invoked through volume-set CLI, then allow the
+             * command only if the volume is still in 'Created'
+             * state
+             */
+            if (volinfo && volinfo->status != GLUSTERD_STATUS_NONE &&
+                (dict_get_sizen(dict, "is-special-key") == NULL)) {
+                snprintf(errstr, sizeof(errstr),
+                         " 'gluster volume set <VOLNAME> %s {enable, disable}'"
+                         " is not supported."
+                         " Use 'gluster volume heal <VOLNAME> "
+                         "granular-entry-heal {enable, disable}' instead.",
+                         key);
+                ret = -1;
+                goto out;
+            }
+        } else if (len_strcmp(key, keylen, GLUSTERD_GLOBAL_OP_VERSION_KEY)) {
+            /* Check if the key is cluster.op-version and set
+             * local_new_op_version to the value given if possible.
+             */
+            if (!all_vol) {
+                ret = -1;
+                snprintf(errstr, sizeof(errstr),
+                         "Option \"%s\" is not valid for a single volume", key);
+                goto out;
+            }
+            /* Check if cluster.op-version is the only option being
+             * set
+             */
+            if (count != 1) {
+                ret = -1;
+                snprintf(errstr, sizeof(errstr),
+                         "Option \"%s\" cannot be set along with other options",
+                         key);
+                goto out;
+            }
+            /* Just reusing the variable, but I'm using it for
+             * storing the op-version from value
+             */
+            ret = gf_string2uint(value, &local_key_op_version);
+            if (ret) {
+                snprintf(errstr, sizeof(errstr),
+                         "invalid number format \"%s\" in option \"%s\"", value,
+                         key);
+                gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_INVALID_ENTRY, "%s",
+                       errstr);
+                goto out;
+            }
+
+            if (local_key_op_version > GD_OP_VERSION_MAX ||
+                local_key_op_version < GD_OP_VERSION_MIN) {
+                ret = -1;
+                snprintf(errstr, sizeof(errstr),
+                         "Required op_version (%" PRIu32
+                         ") is not supported."
+                         " Max supported op version is %d",
+                         local_key_op_version, priv->op_version);
+                gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_VERSION_UNSUPPORTED,
+                       "%s", errstr);
+                goto out;
+            }
+            if (local_key_op_version > priv->op_version) {
+                local_new_op_version = local_key_op_version;
+            } else {
+                ret = -1;
+                snprintf(errstr, sizeof(errstr),
+                         "Required op-version (%" PRIu32
+                         ") should"
+                         " not be equal or lower than current"
+                         " cluster op-version (%d).",
+                         local_key_op_version, priv->op_version);
+                gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_VERSION_UNSUPPORTED,
+                       "%s", errstr);
+                goto out;
+            }
+
+            /* cluster.op-version needs none of the per-option checks
+             * below, nor the client op-version check after the loop. */
+            goto cont;
+        }
+
+        ALL_VOLUME_OPTION_CHECK(volname, _gf_false, key, ret, op_errstr, out);
+        ret = glusterd_validate_quorum_options(this, key, value, op_errstr);
+        if (ret)
+            goto out;
+
+        ret = glusterd_validate_brick_mx_options(this, key, value, op_errstr);
+        if (ret)
+            goto out;
+
+        /* Raise the required (client) op-version to what this key needs. */
+        vmep = gd_get_vmep(key);
+        local_key_op_version = glusterd_get_op_version_from_vmep(vmep);
+        if (local_key_op_version > local_new_op_version)
+            local_new_op_version = local_key_op_version;
+        if (gd_is_client_option(vmep) &&
+            (local_key_op_version > local_new_client_op_version))
+            local_new_client_op_version = local_key_op_version;
+
+        sprintf(keystr, "op-version%d", count);
+        if (origin_glusterd) {
+            ret = dict_set_uint32(dict, keystr, local_key_op_version);
+            if (ret) {
+                gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_DICT_SET_FAILED,
+                       "Failed to set key-op-version in dict");
+                goto out;
+            }
+        } else if (check_op_version) {
+            ret = dict_get_uint32(dict, keystr, &key_op_version);
+            if (ret) {
+                gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_DICT_GET_FAILED,
+                       "Failed to get key-op-version from dict");
+                goto out;
+            }
+            if (local_key_op_version != key_op_version) {
+                ret = -1;
+                snprintf(errstr, sizeof(errstr),
+                         "option: %s op-version mismatch", key);
+                gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_OP_VERSION_MISMATCH,
+                       "%s, required op-version = %" PRIu32
+                       ", available op-version = %" PRIu32,
+                       errstr, key_op_version, local_key_op_version);
+                goto out;
+            }
+        }
+
+        global_opt = glusterd_check_globaloption(key);
+
+        if (len_strcmp(key, keylen, GLUSTERD_SHARED_STORAGE_KEY)) {
+            ret = glusterd_validate_shared_storage(value, errstr);
+            if (ret) {
+                gf_msg(this->name, GF_LOG_ERROR, 0,
+                       GD_MSG_SHARED_STRG_VOL_OPT_VALIDATE_FAIL,
+                       "Failed to validate shared storage volume options");
+                goto out;
+            }
+        } else if (len_strcmp(key, keylen, GLUSTERD_LOCALTIME_LOGGING_KEY)) {
+            ret = glusterd_validate_localtime_logging(value, errstr);
+            if (ret) {
+                gf_msg(this->name, GF_LOG_ERROR, 0,
+                       GD_MSG_LOCALTIME_LOGGING_VOL_OPT_VALIDATE_FAIL,
+                       "Failed to validate localtime logging volume options");
+                goto out;
+            }
+        } else if (len_strcmp(key, keylen, GLUSTERD_DAEMON_LOG_LEVEL_KEY)) {
+            ret = glusterd_validate_daemon_log_level(value, errstr);
+            if (ret) {
+                gf_msg(this->name, GF_LOG_ERROR, 0,
+                       GD_MSG_DAEMON_LOG_LEVEL_VOL_OPT_VALIDATE_FAIL,
+                       "Failed to validate daemon-log-level volume options");
+                goto out;
+            }
+        } else if (len_strcmp(key, keylen, "features.trash-dir")) {
+            if (volinfo) {
+                ret = glusterd_volinfo_get(volinfo, VKEY_FEATURES_TRASH,
+                                           &val_dup);
+                if (!ret && val_dup) {
+                    ret = gf_string2boolean(val_dup, &trash_enabled);
+                    if (ret)
+                        goto out;
+                }
+            }
+            /* trash_enabled can only be true when volinfo is set, so the
+             * brick walk below never runs with a NULL volinfo. */
+            if (!trash_enabled) {
+                snprintf(errstr, sizeof(errstr),
+                         "Trash translator is not enabled. "
+                         "Use volume set %s trash on",
+                         volname);
+                gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_VOL_SET_FAIL,
+                       "Unable to set the options in 'volume set': %s", errstr);
+                ret = -1;
+                goto out;
+            }
+            if (strchr(value, '/')) {
+                snprintf(errstr, sizeof(errstr),
+                         "Path is not allowed as option");
+                gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_VOL_SET_FAIL,
+                       "Unable to set the options in 'volume set': %s", errstr);
+                ret = -1;
+                goto out;
+            }
+
+            list_for_each_entry(brickinfo, &volinfo->bricks, brick_list)
+            {
+                /* Check for local brick */
+                if (!gf_uuid_compare(brickinfo->uuid, MY_UUID)) {
+                    /* +2: one byte for the '/' separator, one for NUL. */
+                    trash_path_len = strlen(value) + strlen(brickinfo->path) +
+                                     2;
+                    trash_path = GF_MALLOC(trash_path_len, gf_common_mt_char);
+                    if (!trash_path) {
+                        /* Fail the stage rather than hand a NULL buffer
+                         * to snprintf() and sys_access() below. */
+                        snprintf(errstr, sizeof(errstr),
+                                 "Failed to allocate memory for trash path");
+                        gf_msg(this->name, GF_LOG_ERROR, ENOMEM,
+                               GD_MSG_VOL_SET_FAIL,
+                               "Unable to set the options in 'volume set': %s",
+                               errstr);
+                        ret = -1;
+                        goto out;
+                    }
+                    snprintf(trash_path, trash_path_len, "%s/%s",
+                             brickinfo->path, value);
+
+                    /* Checks whether a directory with
+                       given option exists or not */
+                    if (!sys_access(trash_path, R_OK)) {
+                        snprintf(errstr, sizeof(errstr), "Path %s exists",
+                                 value);
+                        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_VOL_SET_FAIL,
+                               "Unable to set the options in 'volume set': %s",
+                               errstr);
+                        ret = -1;
+                        goto out;
+                    } else {
+                        gf_msg_debug(this->name, 0,
+                                     "Directory with given name does not exist,"
+                                     " continuing");
+                    }
+
+                    if (volinfo->status == GLUSTERD_STATUS_STARTED &&
+                        brickinfo->status != GF_BRICK_STARTED) {
+                        /* If volume is in started state , checks
+                           whether bricks are online */
+                        snprintf(errstr, sizeof(errstr),
+                                 "One or more bricks are down");
+                        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_VOL_SET_FAIL,
+                               "Unable to set the options in 'volume set': %s",
+                               errstr);
+                        ret = -1;
+                        goto out;
+                    }
+                }
+                if (trash_path) {
+                    GF_FREE(trash_path);
+                    trash_path = NULL;
+                }
+            }
+        }
+
+        ret = dict_set_strn(val_dict, key, keylen, value);
+
+        if (ret) {
+            gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_DICT_SET_FAILED,
+                   "Unable to set the options in 'volume set'");
+            ret = -1;
+            goto out;
+        }
+
+        *op_errstr = NULL;
+        if (!global_opt && !all_vol)
+            ret = glusterd_validate_reconfopts(volinfo, val_dict, op_errstr);
+        else if (!all_vol) {
+            /* Global option on a named volume: validate it against
+             * every volume known to this glusterd. */
+            voliter = NULL;
+            cds_list_for_each_entry(voliter, &priv->volumes, vol_list)
+            {
+                ret = glusterd_validate_globalopts(voliter, val_dict,
+                                                   op_errstr);
+                if (ret)
+                    break;
+            }
+        }
+
+        if (ret) {
+            gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_VOLFILE_CREATE_FAIL,
+                   "Could not create temp volfile, some option failed: %s",
+                   *op_errstr);
+            goto out;
+        }
+        /* Keep val_dict limited to the option currently being checked. */
+        dict_deln(val_dict, key, keylen);
+
+        if (key_fixed) {
+            GF_FREE(key_fixed);
+            key_fixed = NULL;
+        }
+    }
+
+    /* Check if all the connected clients support the new client-op-version
+     */
+    ret = glusterd_check_client_op_version_support(
+        volname, local_new_client_op_version, op_errstr);
+    if (ret)
+        goto out;
+cont:
+    if (origin_glusterd) {
+        ret = dict_set_uint32(dict, "new-op-version", local_new_op_version);
+        if (ret) {
+            gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_DICT_SET_FAILED,
+                   "Failed to set new-op-version in dict");
+            goto out;
+        }
+        /* Set this value in dict so other peers know to check for
+         * op-version. This is a hack for 3.3.x compatibility
+         *
+         * TODO: Remove this and the other places this is referred once
+         * 3.3.x compatibility is not required
+         */
+        ret = dict_set_int32_sizen(dict, "check-op-version", 1);
+        if (ret) {
+            gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_DICT_SET_FAILED,
+                   "Failed to set check-op-version in dict");
+            goto out;
+        }
+    }
+
+    ret = 0;
+
+out:
+    if (val_dict)
+        dict_unref(val_dict);
+
+    if (trash_path)
+        GF_FREE(trash_path);
+
+    GF_FREE(key_fixed);
+    /* A message built locally in errstr takes precedence for the caller. */
+    if (errstr[0] != '\0')
+        *op_errstr = gf_strdup(errstr);
+
+    if (ret) {
+        if (!(*op_errstr)) {
+            *op_errstr = gf_strdup("Error, Validation Failed");
+            gf_msg_debug(this->name, 0, "Error, Cannot Validate option :%s",
+                         *op_errstr);
+        } else {
+            gf_msg_debug(this->name, 0, "Error, Cannot Validate option");
+        }
+    }
+    return ret;
+}
+
+/*
+ * Stage (validate) a 'volume reset' request.
+ *
+ * Reads "volname" and "key" from @dict. Unless volname is "all", the volume
+ * must exist and match the volume id in the request. A key other than "all"
+ * must be a known option and must not be one of the deprecated quota
+ * options.
+ *
+ * @dict:      request dictionary.
+ * @op_errstr: out parameter; receives an allocated error string when a
+ *             message was prepared for the failure.
+ *
+ * Returns 0 on success, non-zero on failure.
+ */
+static int
+glusterd_op_stage_reset_volume(dict_t *dict, char **op_errstr)
+{
+    int ret = 0;
+    char *volname = NULL;
+    int exists = 0;
+    char msg[2048] = {0};
+    char *key = NULL;
+    char *key_fixed = NULL;
+    glusterd_volinfo_t *volinfo = NULL;
+    xlator_t *this = NULL;
+    glusterd_conf_t *priv = NULL;
+
+    this = THIS;
+    GF_ASSERT(this);
+
+    priv = this->private;
+    GF_ASSERT(priv);
+
+    ret = dict_get_strn(dict, "volname", SLEN("volname"), &volname);
+
+    if (ret) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_DICT_GET_FAILED,
+               "Unable to get volume name");
+        goto out;
+    }
+
+    /* volinfo stays NULL when the request targets "all". */
+    if (strcasecmp(volname, "all") != 0) {
+        ret = glusterd_volinfo_find(volname, &volinfo);
+        if (ret) {
+            snprintf(msg, sizeof(msg), FMTSTR_CHECK_VOL_EXISTS, volname);
+            goto out;
+        }
+
+        ret = glusterd_validate_volume_id(dict, volinfo);
+        if (ret)
+            goto out;
+    }
+
+    ret = dict_get_strn(dict, "key", SLEN("key"), &key);
+    if (ret) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_DICT_GET_FAILED,
+               "Unable to get option key");
+        goto out;
+    }
+
+    /* *
+     * If key ganesha.enable is set, then volume should be unexported from
+     * ganesha server. Also it is a volume-level option, perform only when
+     * volume name not equal to "all"(in other words if volinfo != NULL)
+     */
+    if (volinfo && (!strcmp(key, "all") || !strcmp(key, "ganesha.enable"))) {
+        if (glusterd_check_ganesha_export(volinfo)) {
+            ret = ganesha_manage_export(dict, "off", _gf_true, op_errstr);
+            /* NOTE(review): the failure is only logged as a warning, yet
+             * ret is not cleared and can still reach the caller when
+             * nothing below overwrites it - confirm this is intended. */
+            if (ret)
+                gf_msg(this->name, GF_LOG_WARNING, 0, GD_MSG_NFS_GNS_RESET_FAIL,
+                       "Could not reset ganesha.enable key");
+        }
+    }
+
+    if (strcmp(key, "all")) {
+        exists = glusterd_check_option_exists(key, &key_fixed);
+        if (exists == -1) {
+            ret = -1;
+            goto out;
+        }
+
+        if (!exists) {
+            ret = snprintf(msg, sizeof(msg), "Option %s does not exist", key);
+            /* snprintf() returns the length it wanted to write; append
+             * the hint only if the first part really fit, otherwise the
+             * offset would point past msg and the remaining size would
+             * wrap around. */
+            if (key_fixed && ret >= 0 && (size_t)ret < sizeof(msg))
+                snprintf(msg + ret, sizeof(msg) - ret, "\nDid you mean %s?",
+                         key_fixed);
+            ret = -1;
+            goto out;
+        } else if (exists > 0) {
+            /* Continue with the canonical option name, if one was found. */
+            if (key_fixed)
+                key = key_fixed;
+
+            /* 'gluster volume set/reset <VOLNAME>
+             * features.quota/features.inode-quota' should
+             * not be allowed as it is deprecated.
+             * Setting and resetting quota/inode-quota features
+             * should be allowed only through 'gluster volume quota
+             * <VOLNAME> enable/disable'.
+             * But, 'gluster volume set features.quota-deem-statfs'
+             * can be turned on/off when quota is enabled.
+             */
+
+            if (strcmp(VKEY_FEATURES_INODE_QUOTA, key) == 0 ||
+                strcmp(VKEY_FEATURES_QUOTA, key) == 0) {
+                snprintf(msg, sizeof(msg),
+                         "'gluster volume "
+                         "reset <VOLNAME> %s' is deprecated. "
+                         "Use 'gluster volume quota <VOLNAME> "
+                         "disable' instead.",
+                         key);
+                ret = -1;
+                goto out;
+            }
+            ALL_VOLUME_OPTION_CHECK(volname, _gf_false, key, ret, op_errstr,
+                                    out);
+        }
+    }
+
+out:
+    GF_FREE(key_fixed);
+
+    /* Any message prepared above is both logged and returned. */
+    if (msg[0] != '\0') {
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_OP_STAGE_RESET_VOL_FAIL,
+               "%s", msg);
+        *op_errstr = gf_strdup(msg);
+    }
+
+    gf_msg_debug(this->name, 0, "Returning %d", ret);
+
+    return ret;
+}
+
+/*
+ * Stage (validate) a 'volume sync' request.
+ *
+ * Reads "hostname" from @dict. If it is a local address, an optional
+ * "volname" must name an existing volume. Otherwise the host must be a
+ * known peer that is currently connected.
+ *
+ * @dict:      request dictionary.
+ * @op_errstr: out parameter; receives an allocated error string on failure.
+ *
+ * Returns 0 on success, non-zero on failure.
+ */
+static int
+glusterd_op_stage_sync_volume(dict_t *dict, char **op_errstr)
+{
+    int ret = -1;
+    char *volname = NULL;
+    char *hostname = NULL;
+    glusterd_peerinfo_t *peerinfo = NULL;
+    char msg[2048] = {
+        0,
+    };
+    glusterd_volinfo_t *volinfo = NULL;
+    xlator_t *this = NULL;
+    this = THIS;
+    GF_ASSERT(this);
+
+    ret = dict_get_strn(dict, "hostname", SLEN("hostname"), &hostname);
+    if (ret) {
+        snprintf(msg, sizeof(msg),
+                 "hostname couldn't be "
+                 "retrieved from msg");
+        gf_smsg(this->name, GF_LOG_ERROR, errno, GD_MSG_DICT_GET_FAILED,
+                "Key=hostname", NULL);
+        *op_errstr = gf_strdup(msg);
+        goto out;
+    }
+
+    if (gf_is_local_addr(hostname)) {
+        // volname is not present in case of sync all
+        ret = dict_get_strn(dict, "volname", SLEN("volname"), &volname);
+        if (!ret) {
+            ret = glusterd_volinfo_find(volname, &volinfo);
+            if (ret) {
+                snprintf(msg, sizeof(msg),
+                         "Volume %s "
+                         "does not exist",
+                         volname);
+                gf_smsg(this->name, GF_LOG_ERROR, errno, GD_MSG_VOL_NOT_FOUND,
+                        "Volume=%s", volname, NULL);
+                *op_errstr = gf_strdup(msg);
+                goto out;
+            }
+        } else {
+            /* A missing volname is the valid "sync all" request, so the
+             * failed lookup must not be reported as a staging failure
+             * (it would also leave *op_errstr unset). */
+            ret = 0;
+        }
+    } else {
+        /* peerinfo is only dereferenced while the RCU read lock is held;
+         * every exit path below releases it first. */
+        RCU_READ_LOCK;
+
+        peerinfo = glusterd_peerinfo_find(NULL, hostname);
+        if (peerinfo == NULL) {
+            RCU_READ_UNLOCK;
+            ret = -1;
+            snprintf(msg, sizeof(msg), "%s, is not a friend", hostname);
+            gf_smsg(this->name, GF_LOG_ERROR, errno, GD_MSG_PEER_NOT_FOUND,
+                    "Peer_name=%s", hostname, NULL);
+            *op_errstr = gf_strdup(msg);
+            goto out;
+
+        } else if (!peerinfo->connected) {
+            RCU_READ_UNLOCK;
+            ret = -1;
+            snprintf(msg, sizeof(msg),
+                     "%s, is not connected at "
+                     "the moment",
+                     hostname);
+            gf_smsg(this->name, GF_LOG_ERROR, errno, GD_MSG_PEER_DISCONNECTED,
+                    "Peer_name=%s", hostname, NULL);
+            *op_errstr = gf_strdup(msg);
+            goto out;
+        }
+
+        RCU_READ_UNLOCK;
+    }
+
+out:
+    gf_msg_debug("glusterd", 0, "Returning %d", ret);
+
+    return ret;
+}
+
+/*
+ * Stage (validate) a 'volume status' request.
+ *
+ * Reads the "cmd" bit mask from @dict. A status-all request passes
+ * immediately. Otherwise the cluster op-version must allow the requested
+ * daemon query, the named volume must exist, match the request's volume id
+ * and be started, and the feature selected by the first matching bit
+ * (self-heal, NFS when built in, quotad, bitd, scrubber, snapd or a single
+ * brick) must be available for that volume.
+ *
+ * @dict:      request dictionary.
+ * @op_errstr: out parameter; always set to an allocated string on failure.
+ *
+ * Returns 0 on success, non-zero on failure.
+ */
+static int
+glusterd_op_stage_status_volume(dict_t *dict, char **op_errstr)
+{
+    xlator_t *this = NULL;
+    glusterd_conf_t *priv = NULL;
+    glusterd_volinfo_t *volinfo = NULL;
+    glusterd_brickinfo_t *brickinfo = NULL;
+    dict_t *opts = NULL;
+    char *volname = NULL;
+    char *brick_name = NULL;
+    char errbuf[2048] = {
+        0,
+    };
+    uint32_t cmd = 0;
+    int ret = -1;
+
+    GF_ASSERT(dict);
+    this = THIS;
+    GF_ASSERT(this);
+    priv = this->private;
+    GF_ASSERT(priv);
+
+    ret = dict_get_uint32(dict, "cmd", &cmd);
+    if (ret) {
+        gf_smsg(this->name, GF_LOG_ERROR, errno, GD_MSG_DICT_GET_FAILED,
+                "Key=cmd", NULL);
+        goto out;
+    }
+
+    /* Status of all volumes needs no per-volume checks; ret is 0 here. */
+    if (cmd & GF_CLI_STATUS_ALL)
+        goto out;
+
+    /* Daemon queries that depend on the cluster op-version. */
+    if ((priv->op_version == GD_OP_VERSION_MIN) &&
+        (cmd & GF_CLI_STATUS_QUOTAD)) {
+        snprintf(errbuf, sizeof(errbuf),
+                 "The cluster is operating at "
+                 "version 1. Getting the status of quotad is not "
+                 "allowed in this state.");
+        gf_smsg(this->name, GF_LOG_ERROR, errno, GD_MSG_QUOTA_GET_STAT_FAIL,
+                errbuf, NULL);
+        ret = -1;
+        goto out;
+    }
+
+    if ((priv->op_version < GD_OP_VERSION_3_6_0) &&
+        (cmd & GF_CLI_STATUS_SNAPD)) {
+        snprintf(errbuf, sizeof(errbuf),
+                 "The cluster is operating at "
+                 "version less than %d. Getting the "
+                 "status of snapd is not allowed in this state.",
+                 GD_OP_VERSION_3_6_0);
+        gf_smsg(this->name, GF_LOG_ERROR, errno, GD_MSG_SNAP_STATUS_FAIL,
+                errbuf, NULL);
+        ret = -1;
+        goto out;
+    }
+
+    /* Resolve and sanity-check the target volume. */
+    ret = dict_get_strn(dict, "volname", SLEN("volname"), &volname);
+    if (ret) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_DICT_GET_FAILED,
+               "Unable to get volume name");
+        goto out;
+    }
+
+    if (glusterd_volinfo_find(volname, &volinfo)) {
+        snprintf(errbuf, sizeof(errbuf), FMTSTR_CHECK_VOL_EXISTS, volname);
+        gf_smsg(this->name, GF_LOG_ERROR, errno, GD_MSG_VOLINFO_GET_FAIL,
+                "Volume=%s", volname, NULL);
+        ret = -1;
+        goto out;
+    }
+
+    ret = glusterd_validate_volume_id(dict, volinfo);
+    if (ret) {
+        gf_smsg(this->name, GF_LOG_ERROR, errno, GD_MSG_VALIDATE_FAILED, NULL);
+        goto out;
+    }
+
+    if (!glusterd_is_volume_started(volinfo)) {
+        snprintf(errbuf, sizeof(errbuf), "Volume %s is not started", volname);
+        gf_smsg(this->name, GF_LOG_ERROR, errno, GD_MSG_VOL_NOT_STARTED,
+                "Volume=%s", volname, NULL);
+        ret = -1;
+        goto out;
+    }
+
+    opts = volinfo->dict;
+
+    /* Only the first matching request bit is examined. */
+    if (cmd & GF_CLI_STATUS_SHD) {
+        if (!glusterd_is_shd_compatible_volume(volinfo)) {
+            snprintf(errbuf, sizeof(errbuf),
+                     "Volume %s is not Self-heal compatible", volname);
+            gf_smsg(this->name, GF_LOG_ERROR, errno, GD_MSG_VOL_SHD_NOT_COMP,
+                    "Volume=%s", volname, NULL);
+            ret = -1;
+            goto out;
+        }
+        if (!gd_is_self_heal_enabled(volinfo, opts)) {
+            snprintf(errbuf, sizeof(errbuf),
+                     "Self-heal Daemon is disabled for volume %s", volname);
+            gf_smsg(this->name, GF_LOG_ERROR, errno, GD_MSG_SELF_HEALD_DISABLED,
+                    "Volume=%s", volname, NULL);
+            ret = -1;
+            goto out;
+        }
+#ifdef BUILD_GNFS
+    } else if (cmd & GF_CLI_STATUS_NFS) {
+        if (dict_get_str_boolean(opts, NFS_DISABLE_MAP_KEY, _gf_false)) {
+            snprintf(errbuf, sizeof(errbuf),
+                     "NFS server is disabled for volume %s", volname);
+            gf_smsg(this->name, GF_LOG_ERROR, errno,
+                    GD_MSG_NFS_GANESHA_DISABLED, "Volume=%s", volname, NULL);
+            ret = -1;
+            goto out;
+        }
+#endif
+    } else if (cmd & GF_CLI_STATUS_QUOTAD) {
+        if (!glusterd_is_volume_quota_enabled(volinfo)) {
+            snprintf(errbuf, sizeof(errbuf),
+                     "Volume %s does not have "
+                     "quota enabled",
+                     volname);
+            gf_smsg(this->name, GF_LOG_ERROR, errno, GD_MSG_QUOTA_DISABLED,
+                    "Volume=%s", volname, NULL);
+            ret = -1;
+            goto out;
+        }
+    } else if (cmd & GF_CLI_STATUS_BITD) {
+        if (!glusterd_is_bitrot_enabled(volinfo)) {
+            snprintf(errbuf, sizeof(errbuf),
+                     "Volume %s does not have "
+                     "bitrot enabled",
+                     volname);
+            gf_smsg(this->name, GF_LOG_ERROR, errno, GD_MSG_BITROT_NOT_ENABLED,
+                    "Volume=%s", volname, NULL);
+            ret = -1;
+            goto out;
+        }
+    } else if (cmd & GF_CLI_STATUS_SCRUB) {
+        if (!glusterd_is_bitrot_enabled(volinfo)) {
+            snprintf(errbuf, sizeof(errbuf),
+                     "Volume %s does not have "
+                     "bitrot enabled. Scrubber will be enabled "
+                     "automatically if bitrot is enabled",
+                     volname);
+            gf_smsg(
+                this->name, GF_LOG_ERROR, errno, GD_MSG_BITROT_NOT_ENABLED,
+                "Scrubber will be enabled automatically if bitrot is enabled",
+                "Volume=%s", volname, NULL);
+            ret = -1;
+            goto out;
+        }
+    } else if (cmd & GF_CLI_STATUS_SNAPD) {
+        if (!glusterd_is_snapd_enabled(volinfo)) {
+            snprintf(errbuf, sizeof(errbuf),
+                     "Volume %s does not have "
+                     "uss enabled",
+                     volname);
+            gf_smsg(this->name, GF_LOG_ERROR, errno, GD_MSG_SNAPD_NOT_RUNNING,
+                    "Volume=%s", volname, NULL);
+            ret = -1;
+            goto out;
+        }
+    } else if (cmd & GF_CLI_STATUS_BRICK) {
+        ret = dict_get_strn(dict, "brick", SLEN("brick"), &brick_name);
+        if (ret) {
+            gf_smsg(this->name, GF_LOG_ERROR, 0, GD_MSG_DICT_GET_FAILED,
+                    "Key=brick", NULL);
+            goto out;
+        }
+
+        if (glusterd_volume_brickinfo_get_by_brick(brick_name, volinfo,
+                                                   &brickinfo, _gf_false)) {
+            snprintf(errbuf, sizeof(errbuf),
+                     "No brick %s in"
+                     " volume %s",
+                     brick_name, volname);
+            gf_smsg(this->name, GF_LOG_ERROR, errno, GD_MSG_BRICK_NOT_FOUND,
+                    "Brick=%s, Volume=%s", brick_name, volname, NULL);
+            ret = -1;
+            goto out;
+        }
+    }
+
+    ret = 0;
+
+out:
+    /* Report the specific message when one was built, else a generic one. */
+    if (ret)
+        *op_errstr = gf_strdup((errbuf[0] != '\0')
+                                   ? errbuf
+                                   : "Validation Failed for Status");
+
+    gf_msg_debug(this->name, 0, "Returning: %d", ret);
+    return ret;
+}
+
+/* Stage-phase validation for the volume stats (profile/top) operation.
+ *
+ * Reads "volname" and "op" from @dict, looks the volume up, validates the
+ * volume id and then rejects the request when:
+ *   - START is requested while profiling is already on,
+ *   - STOP or INFO is requested while profiling is off,
+ *   - TOP or INFO is requested while the volume is not started.
+ *
+ * Whenever a diagnostic was formatted it is logged and a copy is stored in
+ * *op_errstr (a volume-id validation failure produces no message here).
+ *
+ * Returns 0 when the request may proceed, non-zero otherwise.
+ */
+int
+glusterd_op_stage_stats_volume(dict_t *dict, char **op_errstr)
+{
+    glusterd_volinfo_t *volinfo = NULL;
+    char *volname = NULL;
+    int32_t op = GF_CLI_STATS_NONE;
+    char errbuf[2048] = {
+        0,
+    };
+    int ret = -1;
+
+    ret = dict_get_strn(dict, "volname", SLEN("volname"), &volname);
+    if (ret != 0) {
+        snprintf(errbuf, sizeof(errbuf), "Volume name get failed");
+        goto out;
+    }
+
+    ret = glusterd_volinfo_find(volname, &volinfo);
+    if (ret != 0) {
+        snprintf(errbuf, sizeof(errbuf), "Volume %s, doesn't exist", volname);
+        goto out;
+    }
+
+    ret = glusterd_validate_volume_id(dict, volinfo);
+    if (ret != 0)
+        goto out;
+
+    ret = dict_get_int32n(dict, "op", SLEN("op"), &op);
+    if (ret != 0) {
+        snprintf(errbuf, sizeof(errbuf), "Volume profile op get failed");
+        goto out;
+    }
+
+    /* Profiling state must match the requested transition. */
+    if ((op == GF_CLI_STATS_START) &&
+        (glusterd_is_profile_on(volinfo) == _gf_true)) {
+        ret = -1;
+        snprintf(errbuf, sizeof(errbuf),
+                 "Profile on Volume %s is already started", volinfo->volname);
+        goto out;
+    }
+
+    if (((op == GF_CLI_STATS_STOP) || (op == GF_CLI_STATS_INFO)) &&
+        (glusterd_is_profile_on(volinfo) == _gf_false)) {
+        ret = -1;
+        snprintf(errbuf, sizeof(errbuf),
+                 "Profile on Volume %s is not started", volinfo->volname);
+        goto out;
+    }
+
+    /* TOP and INFO additionally need a started volume. */
+    if (((op == GF_CLI_STATS_TOP) || (op == GF_CLI_STATS_INFO)) &&
+        (glusterd_is_volume_started(volinfo) == _gf_false)) {
+        ret = -1;
+        snprintf(errbuf, sizeof(errbuf), "Volume %s is not started.",
+                 volinfo->volname);
+        gf_msg("glusterd", GF_LOG_ERROR, 0, GD_MSG_VOL_NOT_STARTED, "%s",
+               errbuf);
+        goto out;
+    }
+
+    ret = 0;
+
+out:
+    if (errbuf[0] != '\0') {
+        gf_msg("glusterd", GF_LOG_ERROR, 0, GD_MSG_OP_STAGE_STATS_VOL_FAIL,
+               "%s", errbuf);
+        *op_errstr = gf_strdup(errbuf);
+    }
+    gf_msg_debug("glusterd", 0, "Returning %d", ret);
+    return ret;
+}
+
+/* Remove one reconfigurable option from a dictionary during 'volume reset'.
+ *
+ * Used both as a dict_foreach() callback and called directly.
+ *
+ * @this:  dictionary the option is removed from
+ * @key:   option name
+ * @value: current value of the option (only used in the debug log)
+ * @data:  int32_t * in/out flag word. The value 1 means "force"; for any
+ *         other value GD_OP_PROTECTED is OR-ed in when a protected option is
+ *         skipped and GD_OP_UNPROTECTED when an option is actually reset.
+ *
+ * Always returns 0.
+ */
+static int
+_delete_reconfig_opt(dict_t *this, char *key, data_t *value, void *data)
+{
+    int32_t *is_force = NULL;
+    gf_boolean_t is_bitrot_key = _gf_false;
+
+    GF_ASSERT(data);
+    is_force = (int32_t *)data;
+
+    /* Keys which have the flag VOLOPT_FLAG_NEVER_RESET
+     * should not be deleted
+     */
+
+    if (_gf_true ==
+        glusterd_check_voloption_flags(key, VOLOPT_FLAG_NEVER_RESET)) {
+        if (*is_force != 1)
+            *is_force = *is_force | GD_OP_PROTECTED;
+        goto out;
+    }
+
+    if (*is_force != 1) {
+        if (_gf_true ==
+            glusterd_check_voloption_flags(key, VOLOPT_FLAG_FORCE)) {
+            /* indicate to caller that we don't set the option
+             * due to being protected
+             */
+            *is_force = *is_force | GD_OP_PROTECTED;
+            goto out;
+        } else {
+            *is_force = *is_force | GD_OP_UNPROTECTED;
+        }
+    }
+
+    gf_msg_debug("glusterd", 0, "deleting dict with key=%s,value=%s", key,
+                 value->data);
+
+    /* Evaluate the bitrot check before deleting the entry: when this runs
+     * as a dict_foreach() callback, @key may point into the entry that
+     * dict_del() releases, so it must not be read afterwards.
+     */
+    if (!strncmp(key, VKEY_FEATURES_BITROT, strlen(VKEY_FEATURES_BITROT)))
+        is_bitrot_key = _gf_true;
+
+    dict_del(this, key);
+
+    /* Delete scrubber (pause/resume) option from the dictionary if bitrot
+     * option is going to be reset
+     */
+    if (is_bitrot_key) {
+        dict_del_sizen(this, VKEY_FEATURES_SCRUB);
+    }
+out:
+    return 0;
+}
+
+/* Variant of _delete_reconfig_opt() for the global option dictionary: the
+ * GLUSTERD_GLOBAL_OPT_VERSION entry is left untouched, every other key is
+ * handed to _delete_reconfig_opt(). Always returns 0.
+ */
+static int
+_delete_reconfig_global_opt(dict_t *this, char *key, data_t *value, void *data)
+{
+    GF_ASSERT(data);
+
+    /* The version key is bookkeeping, not a resettable option. */
+    if (strcmp(GLUSTERD_GLOBAL_OPT_VERSION, key) != 0)
+        _delete_reconfig_opt(this, key, value, data);
+
+    return 0;
+}
+
+/* Reset one option, or all options, of a volume to defaults and propagate
+ * the change.
+ *
+ * @volinfo:  volume whose option dictionary is modified
+ * @key:      option name, or "all"
+ * @is_force: in/out flag word passed through to _delete_reconfig_opt()
+ *
+ * After removing the option(s) and re-applying default options, this
+ * updates the volume op-versions, runs the snapd manager (non-snapshot
+ * volumes only), reconfigures gfproxyd and shd, regenerates volfiles,
+ * stores the volinfo and, for a started volume, reconfigures services.
+ *
+ * Returns 0 on success (including when @key is not set on the volume),
+ * non-zero on the first failing step.
+ */
+static int
+glusterd_options_reset(glusterd_volinfo_t *volinfo, char *key,
+                       int32_t *is_force)
+{
+    int ret = 0;
+    data_t *value = NULL;
+    xlator_t *this = NULL;
+    glusterd_svc_t *svc = NULL;
+
+    this = THIS;
+    GF_ASSERT(this);
+    GF_ASSERT(volinfo->dict);
+    GF_ASSERT(key);
+
+    /* NOTE(review): this is a prefix match, so any key starting with
+     * "all" takes the reset-everything path; the caller in this file
+     * compares with strcmp(). Confirm whether that is intended.
+     */
+    if (!strncmp(key, "all", 3)) {
+        dict_foreach(volinfo->dict, _delete_reconfig_opt, is_force);
+        ret = glusterd_enable_default_options(volinfo, NULL);
+        if (ret) {
+            gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_FAIL_DEFAULT_OPT_SET,
+                   "Failed to set "
+                   "default options on reset for volume %s",
+                   volinfo->volname);
+            goto out;
+        }
+    } else {
+        value = dict_get(volinfo->dict, key);
+        if (!value) {
+            /* Nothing to reset; ret is still 0 here. */
+            gf_msg_debug(this->name, 0, "no value set for option %s", key);
+            goto out;
+        }
+        _delete_reconfig_opt(volinfo->dict, key, value, is_force);
+        ret = glusterd_enable_default_options(volinfo, key);
+        if (ret) {
+            gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_FAIL_DEFAULT_OPT_SET,
+                   "Failed to set "
+                   "default value for option '%s' on reset for "
+                   "volume %s",
+                   key, volinfo->volname);
+            goto out;
+        }
+    }
+
+    gd_update_volume_op_versions(volinfo);
+    if (!volinfo->is_snap_volume) {
+        svc = &(volinfo->snapd.svc);
+        ret = svc->manager(svc, volinfo, PROC_START_NO_WAIT);
+        if (ret)
+            goto out;
+    }
+    svc = &(volinfo->gfproxyd.svc);
+    ret = svc->reconfigure(volinfo);
+    if (ret)
+        goto out;
+
+    svc = &(volinfo->shd.svc);
+    ret = svc->reconfigure(volinfo);
+    if (ret)
+        goto out;
+
+    ret = glusterd_create_volfiles_and_notify_services(volinfo);
+    if (ret) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_VOLFILE_CREATE_FAIL,
+               "Unable to create volfile for"
+               " 'volume reset'");
+        ret = -1;
+        goto out;
+    }
+
+    ret = glusterd_store_volinfo(volinfo, GLUSTERD_VOLINFO_VER_AC_INCREMENT);
+    if (ret)
+        goto out;
+
+    if (GLUSTERD_STATUS_STARTED == volinfo->status) {
+        ret = glusterd_svcs_reconfigure(volinfo);
+        if (ret)
+            goto out;
+    }
+
+    ret = 0;
+
+out:
+    gf_msg_debug(this->name, 0, "Returning %d", ret);
+    return ret;
+}
+
+/* Reset a cluster-wide option (or all of them) held in conf->opts.
+ *
+ * @this: glusterd xlator; this->private is used as the glusterd_conf_t
+ * @dict: request dictionary; "key" names the option or is "all", "force"
+ *        is optional and defaults to 0
+ *
+ * The new option set is first built in a scratch dictionary and written
+ * with glusterd_store_options(); only after that succeeds is conf->opts
+ * itself modified.
+ *
+ * Returns 0 on success, non-zero on failure.
+ */
+static int
+glusterd_op_reset_all_volume_options(xlator_t *this, dict_t *dict)
+{
+    char *key = NULL;
+    char *key_fixed = NULL;
+    int ret = -1;
+    int32_t is_force = 0;
+    glusterd_conf_t *conf = NULL;
+    dict_t *dup_opt = NULL;
+    gf_boolean_t all = _gf_false;
+    char *next_version = NULL;
+    gf_boolean_t quorum_action = _gf_false;
+
+    conf = this->private;
+    ret = dict_get_strn(dict, "key", SLEN("key"), &key);
+    if (ret) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_DICT_GET_FAILED,
+               "Failed to get key");
+        goto out;
+    }
+
+    /* "force" is optional: a failed lookup just means not forced. */
+    ret = dict_get_int32n(dict, "force", SLEN("force"), &is_force);
+    if (ret)
+        is_force = 0;
+
+    /* Any key other than "all" must name a known option. */
+    if (strcmp(key, "all")) {
+        ret = glusterd_check_option_exists(key, &key_fixed);
+        if (ret <= 0) {
+            gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_INVALID_ENTRY,
+                   "Option %s does not "
+                   "exist",
+                   key);
+            ret = -1;
+            goto out;
+        }
+    } else {
+        all = _gf_true;
+    }
+
+    /* Prefer the name returned by the option lookup when there is one. */
+    if (key_fixed)
+        key = key_fixed;
+
+    ret = -1;
+    dup_opt = dict_new();
+    if (!dup_opt) {
+        gf_smsg(this->name, GF_LOG_ERROR, errno, GD_MSG_DICT_CREATE_FAIL, NULL);
+        goto out;
+    }
+    /* Single-key reset: scratch dict = current options minus that key.
+     * For "all" the scratch dict starts empty.
+     */
+    if (!all) {
+        dict_copy(conf->opts, dup_opt);
+        dict_del(dup_opt, key);
+    }
+    ret = glusterd_get_next_global_opt_version_str(conf->opts, &next_version);
+    if (ret)
+        goto out;
+
+    ret = dict_set_strn(dup_opt, GLUSTERD_GLOBAL_OPT_VERSION,
+                        SLEN(GLUSTERD_GLOBAL_OPT_VERSION), next_version);
+    if (ret) {
+        gf_smsg(this->name, GF_LOG_ERROR, errno, GD_MSG_DICT_SET_FAILED,
+                "Key=%s", GLUSTERD_GLOBAL_OPT_VERSION, NULL);
+        goto out;
+    }
+
+    /* Write the new option set out before touching conf->opts. */
+    ret = glusterd_store_options(this, dup_opt);
+    if (ret)
+        goto out;
+
+    /* Quorum check runs against conf->opts before the key is removed. */
+    if (glusterd_is_quorum_changed(conf->opts, key, NULL))
+        quorum_action = _gf_true;
+
+    ret = dict_set_dynstrn(conf->opts, GLUSTERD_GLOBAL_OPT_VERSION,
+                           SLEN(GLUSTERD_GLOBAL_OPT_VERSION), next_version);
+    if (ret) {
+        gf_smsg(this->name, GF_LOG_ERROR, errno, GD_MSG_DICT_SET_FAILED,
+                "Key=%s", GLUSTERD_GLOBAL_OPT_VERSION, NULL);
+        goto out;
+    } else
+        /* Cleared so the GF_FREE() at out does not free the string;
+         * presumably conf->opts owns it now -- confirm against the
+         * dict_set_dynstrn() contract.
+         */
+        next_version = NULL;
+
+    if (!all) {
+        dict_del(conf->opts, key);
+    } else {
+        dict_foreach(conf->opts, _delete_reconfig_global_opt, &is_force);
+    }
+out:
+    GF_FREE(key_fixed);
+    if (dup_opt)
+        dict_unref(dup_opt);
+
+    gf_msg_debug(this->name, 0, "returning %d", ret);
+    if (quorum_action)
+        glusterd_do_quorum_action();
+    GF_FREE(next_version);
+    return ret;
+}
+
+/* Commit handler for 'volume reset'.
+ *
+ * @dict:      request dictionary ("volname", "key", optional "force")
+ * @op_rspstr: receives a human-readable response string in the failure and
+ *             protected-option cases
+ *
+ * A volname of "all" (case-insensitive) is delegated to
+ * glusterd_op_reset_all_volume_options(). Otherwise the named option (or
+ * "all" options) of the volume is reset via glusterd_options_reset().
+ *
+ * Returns 0 on success, non-zero on failure.
+ */
+static int
+glusterd_op_reset_volume(dict_t *dict, char **op_rspstr)
+{
+    xlator_t *this = THIS;
+    glusterd_volinfo_t *volinfo = NULL;
+    char *volname = NULL;
+    char *key = NULL;
+    char *key_fixed = NULL;
+    int32_t force_flags = 0;
+    gf_boolean_t do_quorum = _gf_false;
+    int ret = -1;
+
+    ret = dict_get_strn(dict, "volname", SLEN("volname"), &volname);
+    if (ret != 0) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_DICT_GET_FAILED,
+               "Unable to get volume name");
+        goto out;
+    }
+
+    if (!strcasecmp(volname, "all")) {
+        ret = glusterd_op_reset_all_volume_options(this, dict);
+        goto out;
+    }
+
+    /* "force" is optional; absence means not forced. */
+    if (dict_get_int32n(dict, "force", SLEN("force"), &force_flags) != 0)
+        force_flags = 0;
+
+    ret = dict_get_strn(dict, "key", SLEN("key"), &key);
+    if (ret != 0) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_DICT_GET_FAILED,
+               "Unable to get option key");
+        goto out;
+    }
+
+    ret = glusterd_volinfo_find(volname, &volinfo);
+    if (ret != 0) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_VOL_NOT_FOUND,
+               FMTSTR_CHECK_VOL_EXISTS, volname);
+        goto out;
+    }
+
+    if ((strcmp(key, "all") != 0) &&
+        (glusterd_check_option_exists(key, &key_fixed) != 1)) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_INVALID_ENTRY,
+               "volinfo dict inconsistency: option %s not found", key);
+        ret = -1;
+        goto out;
+    }
+    if (key_fixed != NULL)
+        key = key_fixed;
+
+    if (glusterd_is_quorum_changed(volinfo->dict, key, NULL))
+        do_quorum = _gf_true;
+
+    ret = glusterd_options_reset(volinfo, key, &force_flags);
+    if (ret == -1) {
+        gf_asprintf(op_rspstr, "Volume reset : failed");
+    } else if ((force_flags & GD_OP_PROTECTED) != 0) {
+        if ((force_flags & GD_OP_UNPROTECTED) == 0) {
+            /* Only protected options were hit: report as a failure. */
+            ret = -1;
+            gf_asprintf(op_rspstr, "'%s' is protected. To reset use 'force'.",
+                        key);
+        } else {
+            gf_asprintf(op_rspstr,
+                        "All unprotected fields were reset. To reset the "
+                        "protected fields, use 'force'.");
+        }
+    }
+
+    /* On the originator, withdraw the ganesha export when it is reset. */
+    if ((!strcmp(key, "ganesha.enable") || !strcmp(key, "all")) &&
+        glusterd_check_ganesha_export(volinfo) && is_origin_glusterd(dict)) {
+        ret = manage_export_config(volname, "off", op_rspstr);
+        if (ret != 0)
+            gf_msg(this->name, GF_LOG_WARNING, 0, GD_MSG_NFS_GNS_RESET_FAIL,
+                   "Could not reset ganesha.enable key");
+    }
+
+out:
+    GF_FREE(key_fixed);
+    if (do_quorum)
+        glusterd_do_quorum_action();
+
+    gf_msg_debug(this->name, 0, "'volume reset' returning %d", ret);
+    return ret;
+}
+
+/* Stop the bricks of @volinfo in list order.
+ *
+ * Stops at the first brick that fails to stop, emits an
+ * EVENT_BRICK_STOP_FAILED event for it and returns -1. Returns 0 when every
+ * brick was stopped.
+ */
+int
+glusterd_stop_bricks(glusterd_volinfo_t *volinfo)
+{
+    glusterd_brickinfo_t *brick = NULL;
+    int ret = 0;
+
+    cds_list_for_each_entry(brick, &volinfo->bricks, brick_list)
+    {
+        /*TODO: Need to change @del_brick in brick_stop to _gf_true
+         * once we enable synctask in peer rpc prog */
+        if (!glusterd_brick_stop(volinfo, brick, _gf_false))
+            continue;
+
+        gf_event(EVENT_BRICK_STOP_FAILED, "peer=%s;volume=%s;brick=%s",
+                 brick->hostname, volinfo->volname, brick->path);
+        ret = -1;
+        break;
+    }
+
+    return ret;
+}
+
+/* Start every brick of @volinfo whose start_triggered flag is not set.
+ *
+ * Each start is performed while holding that brick's restart_mutex. On the
+ * first failure an error is logged, an EVENT_BRICK_START_FAILED event is
+ * emitted and the failing return code is returned. Returns 0 otherwise.
+ */
+int
+glusterd_start_bricks(glusterd_volinfo_t *volinfo)
+{
+    glusterd_brickinfo_t *brick = NULL;
+    int ret = -1;
+
+    GF_ASSERT(volinfo);
+
+    cds_list_for_each_entry(brick, &volinfo->bricks, brick_list)
+    {
+        if (brick->start_triggered)
+            continue;
+
+        pthread_mutex_lock(&brick->restart_mutex);
+        {
+            /* coverity[SLEEP] */
+            ret = glusterd_brick_start(volinfo, brick, _gf_false, _gf_false);
+        }
+        pthread_mutex_unlock(&brick->restart_mutex);
+
+        if (!ret)
+            continue;
+
+        gf_msg(THIS->name, GF_LOG_ERROR, 0, GD_MSG_BRICK_DISCONNECTED,
+               "Failed to start %s:%s for %s", brick->hostname, brick->path,
+               volinfo->volname);
+        gf_event(EVENT_BRICK_START_FAILED, "peer=%s;volume=%s;brick=%s",
+                 brick->hostname, volinfo->volname, brick->path);
+        return ret;
+    }
+
+    return 0;
+}
+
+/* Bring a volume's option dictionary up to date after an op-version bump
+ * and persist the volinfo.
+ *
+ * When the cluster op-version is at least GD_OP_VERSION_3_9_0:
+ *   - NFS_DISABLE_MAP_KEY is set to "on" if it reads as true (absent keys
+ *     default to true here),
+ *   - "transport.address-family" is set to "inet" if it is absent and the
+ *     volume transport type is TCP.
+ *
+ * Returns 0 on success, non-zero if a dictionary update or the final
+ * glusterd_store_volinfo() fails.
+ */
+static int
+glusterd_update_volumes_dict(glusterd_volinfo_t *volinfo)
+{
+    int ret = -1;
+    xlator_t *this = NULL;
+    glusterd_conf_t *conf = NULL;
+    char *address_family_str = NULL;
+
+    this = THIS;
+    GF_VALIDATE_OR_GOTO("glusterd", this, out);
+
+    conf = this->private;
+    GF_VALIDATE_OR_GOTO(this->name, conf, out);
+
+    /* From 3.9.0 onwards gNFS is disabled by default. On an upgrade from
+     * anything older than 3.9.0 to 3.9.x the volume's dictionary will not
+     * have the 'nfs.disable' key set, which means it would not be set to
+     * 'on' until explicitly done. Setting nfs.disable to 'on' in the
+     * op-version bump-up flow is the ideal way here. The same applies to
+     * transport.address-family: if the transport type is tcp then
+     * transport.address-family is defaulted to 'inet'.
+     */
+    if (conf->op_version >= GD_OP_VERSION_3_9_0) {
+        if (dict_get_str_boolean(volinfo->dict, NFS_DISABLE_MAP_KEY, 1)) {
+            ret = dict_set_dynstr_with_alloc(volinfo->dict, NFS_DISABLE_MAP_KEY,
+                                             "on");
+            if (ret) {
+                /* Pass the key as an argument so that its value, not
+                 * the macro's name, ends up in the log.
+                 */
+                gf_msg(this->name, GF_LOG_ERROR, errno, GD_MSG_DICT_SET_FAILED,
+                       "Failed to set option '%s' on volume %s",
+                       NFS_DISABLE_MAP_KEY, volinfo->volname);
+                goto out;
+            }
+        }
+        ret = dict_get_strn(volinfo->dict, "transport.address-family",
+                            SLEN("transport.address-family"),
+                            &address_family_str);
+        if (ret) {
+            /* Key absent: default it only for TCP volumes. */
+            if (volinfo->transport_type == GF_TRANSPORT_TCP) {
+                ret = dict_set_dynstr_with_alloc(
+                    volinfo->dict, "transport.address-family", "inet");
+                if (ret) {
+                    gf_msg(this->name, GF_LOG_ERROR, errno,
+                           GD_MSG_DICT_SET_FAILED,
+                           "failed to set transport."
+                           "address-family on %s",
+                           volinfo->volname);
+                    goto out;
+                }
+            }
+        }
+    }
+    ret = glusterd_store_volinfo(volinfo, GLUSTERD_VOLINFO_VER_AC_INCREMENT);
+
+out:
+    return ret;
+}
+
+/* Record the brick-multiplexing setting in glusterd's global options.
+ *
+ * When @key equals GLUSTERD_BRICK_MULTIPLEX_KEY a copy of @value is stored
+ * under that key in priv->opts; any other key is ignored.
+ *
+ * Returns -1 if an argument fails validation, 0 when the key is ignored,
+ * otherwise the result of the dictionary update.
+ */
+static int
+glusterd_set_brick_mx_opts(dict_t *dict, char *key, char *value,
+                           char **op_errstr)
+{
+    xlator_t *this = THIS;
+    glusterd_conf_t *priv = NULL;
+    int32_t ret = -1;
+
+    GF_VALIDATE_OR_GOTO("glusterd", this, out);
+    GF_VALIDATE_OR_GOTO(this->name, dict, out);
+    GF_VALIDATE_OR_GOTO(this->name, key, out);
+    GF_VALIDATE_OR_GOTO(this->name, value, out);
+    GF_VALIDATE_OR_GOTO(this->name, op_errstr, out);
+
+    priv = this->private;
+    ret = 0;
+
+    /* Only the brick-multiplex key is handled here. */
+    if (strcmp(key, GLUSTERD_BRICK_MULTIPLEX_KEY) != 0)
+        goto out;
+
+    ret = dict_set_dynstrn(priv->opts, GLUSTERD_BRICK_MULTIPLEX_KEY,
+                           SLEN(GLUSTERD_BRICK_MULTIPLEX_KEY),
+                           gf_strdup(value));
+
+out:
+    return ret;
+}
+
+/* Hack: keeps client-io-threads out of the graph when the
+ * cluster-op-version is bumped up from 3.8.x to 3.13.x by marking the
+ * volume dictionary with "skip-CLIOT" = 1. The key is deleted subsequently
+ * in glusterd_create_volfiles().
+ *
+ * Returns the result of the dictionary update.
+ */
+static int
+glusterd_dict_set_skip_cliot_key(glusterd_volinfo_t *volinfo)
+{
+    dict_t *vol_opts = volinfo->dict;
+
+    return dict_set_int32n(vol_opts, "skip-CLIOT", SLEN("skip-CLIOT"), 1);
+}
+
+/* Commit handler for 'volume set all <key> <value>' (cluster-wide options).
+ *
+ * @this:      glusterd xlator; this->private is used as the glusterd_conf_t
+ * @dict:      request dictionary; the option is read from "key1"/"value1"
+ * @op_errstr: passed through to the shared-storage / brick-mux helpers
+ *
+ * For GLUSTERD_GLOBAL_OP_VERSION_KEY the cluster op-version is raised (when
+ * the requested value is not lower than the current one), every volume is
+ * updated and its volfiles regenerated, and the new op-version is stored;
+ * the key itself is not kept in conf->opts.
+ *
+ * For any other key the new option set is first built in a scratch
+ * dictionary and written with glusterd_store_options(); conf->opts is
+ * updated only after that succeeds.
+ *
+ * Returns 0 on success, non-zero on failure.
+ */
+static int
+glusterd_op_set_all_volume_options(xlator_t *this, dict_t *dict,
+                                   char **op_errstr)
+{
+    char *key = NULL;
+    char *key_fixed = NULL;
+    char *value = NULL;
+    char *dup_value = NULL;
+    int ret = -1;
+    glusterd_conf_t *conf = NULL;
+    dict_t *dup_opt = NULL;
+    char *next_version = NULL;
+    gf_boolean_t quorum_action = _gf_false;
+    uint32_t op_version = 0;
+    glusterd_volinfo_t *volinfo = NULL;
+    glusterd_svc_t *svc = NULL;
+    gf_boolean_t svcs_reconfigure = _gf_false;
+
+    conf = this->private;
+    ret = dict_get_strn(dict, "key1", SLEN("key1"), &key);
+    if (ret) {
+        gf_smsg(this->name, GF_LOG_ERROR, errno, GD_MSG_DICT_GET_FAILED,
+                "Key=key1", NULL);
+        goto out;
+    }
+
+    ret = dict_get_strn(dict, "value1", SLEN("value1"), &value);
+    if (ret) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_DICT_GET_FAILED,
+               "invalid key,value pair in 'volume set'");
+        goto out;
+    }
+
+    ret = glusterd_check_option_exists(key, &key_fixed);
+    if (ret <= 0) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_UNKNOWN_KEY,
+               "Invalid key %s", key);
+        ret = -1;
+        goto out;
+    }
+
+    /* Prefer the name returned by the option lookup when there is one. */
+    if (key_fixed)
+        key = key_fixed;
+
+    ret = glusterd_set_shared_storage(dict, key, value, op_errstr);
+    if (ret) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_SHARED_STRG_SET_FAIL,
+               "Failed to set shared storage option");
+        goto out;
+    }
+
+    ret = glusterd_set_brick_mx_opts(dict, key, value, op_errstr);
+    if (ret) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_BRICK_MX_SET_FAIL,
+               "Failed to set brick multiplexing option");
+        goto out;
+    }
+
+    /* If the key is cluster.op-version, set conf->op_version to the value
+     * if needed and save it.
+     */
+    if (strcmp(key, GLUSTERD_GLOBAL_OP_VERSION_KEY) == 0) {
+        ret = gf_string2uint(value, &op_version);
+        if (ret)
+            goto out;
+
+        if (op_version >= conf->op_version) {
+            conf->op_version = op_version;
+
+            /* When a bump up happens, update the quota.conf file
+             * as well. This is because, till 3.7 we had a quota
+             * conf version v1.1 in quota.conf. When inode-quota
+             * feature is introduced, this needs to be changed to
+             * v1.2 in quota.conf and 16 bytes uuid in quota.conf
+             * needs to be changed to 17 bytes. Look
+             * glusterd_store_quota_config for more details.
+             */
+            cds_list_for_each_entry(volinfo, &conf->volumes, vol_list)
+            {
+                ret = glusterd_store_quota_config(
+                    volinfo, NULL, NULL, GF_QUOTA_OPTION_TYPE_UPGRADE, NULL);
+                if (ret)
+                    goto out;
+                ret = glusterd_update_volumes_dict(volinfo);
+                if (ret)
+                    goto out;
+
+                /* Capture the result so that a failure here is
+                 * reported to the caller instead of returning 0.
+                 */
+                ret = glusterd_dict_set_skip_cliot_key(volinfo);
+                if (ret)
+                    goto out;
+
+                if (!volinfo->is_snap_volume) {
+                    svc = &(volinfo->snapd.svc);
+                    ret = svc->manager(svc, volinfo, PROC_START_NO_WAIT);
+                    if (ret)
+                        goto out;
+                }
+
+                svc = &(volinfo->gfproxyd.svc);
+                ret = svc->reconfigure(volinfo);
+                if (ret)
+                    goto out;
+
+                svc = &(volinfo->shd.svc);
+                ret = svc->reconfigure(volinfo);
+                if (ret)
+                    goto out;
+
+                ret = glusterd_create_volfiles_and_notify_services(volinfo);
+                if (ret) {
+                    gf_msg(this->name, GF_LOG_ERROR, 0,
+                           GD_MSG_VOLFILE_CREATE_FAIL,
+                           "Unable to create volfile for"
+                           " 'volume set'");
+                    goto out;
+                }
+                /* One started volume is enough to need a reconfigure. */
+                if (GLUSTERD_STATUS_STARTED == volinfo->status) {
+                    svcs_reconfigure = _gf_true;
+                }
+            }
+            if (svcs_reconfigure) {
+                ret = glusterd_svcs_reconfigure(NULL);
+                if (ret) {
+                    gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_SVC_RESTART_FAIL,
+                           "Unable to restart "
+                           "services");
+                    goto out;
+                }
+            }
+
+            ret = glusterd_store_global_info(this);
+            if (ret) {
+                gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_OP_VERS_STORE_FAIL,
+                       "Failed to store op-version.");
+            }
+        }
+        /* No need to save cluster.op-version in conf->opts
+         */
+        goto out;
+    }
+    ret = -1;
+    dup_opt = dict_new();
+    if (!dup_opt) {
+        gf_smsg(this->name, GF_LOG_ERROR, errno, GD_MSG_DICT_CREATE_FAIL, NULL);
+        goto out;
+    }
+    /* Scratch dict = current global options plus the new key/value. */
+    dict_copy(conf->opts, dup_opt);
+    ret = dict_set_str(dup_opt, key, value);
+    if (ret) {
+        gf_smsg(this->name, GF_LOG_ERROR, errno, GD_MSG_DICT_SET_FAILED,
+                "Key=%s", key, NULL);
+        goto out;
+    }
+
+    ret = glusterd_get_next_global_opt_version_str(conf->opts, &next_version);
+    if (ret)
+        goto out;
+
+    ret = dict_set_strn(dup_opt, GLUSTERD_GLOBAL_OPT_VERSION,
+                        SLEN(GLUSTERD_GLOBAL_OPT_VERSION), next_version);
+    if (ret) {
+        gf_smsg(this->name, GF_LOG_ERROR, errno, GD_MSG_DICT_SET_FAILED,
+                "Key=%s", GLUSTERD_GLOBAL_OPT_VERSION, NULL);
+        goto out;
+    }
+
+    /* Write the new option set out before touching conf->opts. */
+    ret = glusterd_store_options(this, dup_opt);
+    if (ret)
+        goto out;
+
+    if (glusterd_is_quorum_changed(conf->opts, key, value))
+        quorum_action = _gf_true;
+
+    ret = dict_set_dynstrn(conf->opts, GLUSTERD_GLOBAL_OPT_VERSION,
+                           SLEN(GLUSTERD_GLOBAL_OPT_VERSION), next_version);
+    if (ret) {
+        gf_smsg(this->name, GF_LOG_ERROR, errno, GD_MSG_DICT_SET_FAILED,
+                "Key=%s", GLUSTERD_GLOBAL_OPT_VERSION, NULL);
+        goto out;
+    } else
+        next_version = NULL; /* Protect the allocation from GF_FREE */
+
+    dup_value = gf_strdup(value);
+    if (!dup_value) {
+        /* ret is 0 from the call above; report the failed copy. */
+        ret = -1;
+        goto out;
+    }
+
+    ret = dict_set_dynstr(conf->opts, key, dup_value);
+    if (ret) {
+        gf_smsg(this->name, GF_LOG_ERROR, errno, GD_MSG_DICT_SET_FAILED,
+                "Key=%s", key, NULL);
+        goto out;
+    } else
+        dup_value = NULL; /* Protect the allocation from GF_FREE */
+
+out:
+    GF_FREE(dup_value);
+    GF_FREE(key_fixed);
+    if (dup_opt)
+        dict_unref(dup_opt);
+
+    gf_msg_debug(this->name, 0, "returning %d", ret);
+    if (quorum_action)
+        glusterd_do_quorum_action();
+    GF_FREE(next_version);
+    return ret;
+}
+
+/* Publish the maximum op-version supported by this build.
+ *
+ * Stores GD_OP_VERSION_MAX under "max-opversion" in @rsp_dict. @op_errstr
+ * is not used. Returns 0 on success, non-zero if @rsp_dict is NULL or the
+ * dictionary update fails.
+ */
+int
+glusterd_op_get_max_opversion(char **op_errstr, dict_t *rsp_dict)
+{
+    int ret = -1;
+
+    GF_VALIDATE_OR_GOTO(THIS->name, rsp_dict, out);
+
+    ret = dict_set_int32n(rsp_dict, "max-opversion", SLEN("max-opversion"),
+                          GD_OP_VERSION_MAX);
+    if (ret != 0)
+        gf_msg(THIS->name, GF_LOG_ERROR, 0, GD_MSG_DICT_SET_FAILED,
+               "Setting value for max-opversion to dict failed");
+
+out:
+    gf_msg_debug(THIS->name, 0, "Returning %d", ret);
+    return ret;
+}
+
+/* Prepare the local node for a change of the shared-storage option.
+ *
+ * Does nothing (returns 0) unless @key is GLUSTERD_SHARED_STORAGE_KEY.
+ * Otherwise the shared-storage brick directory is removed and re-created,
+ * and a "hooks_args" string carrying the originator flag and the local
+ * node hostname is stored in @dict.
+ *
+ * On a directory failure the formatted reason is copied into *op_errstr.
+ * Returns 0 on success, non-zero on failure.
+ */
+static int
+glusterd_set_shared_storage(dict_t *dict, char *key, char *value,
+                            char **op_errstr)
+{
+    xlator_t *this = THIS;
+    int32_t ret = -1;
+    int32_t len = 0;
+    char errstr[PATH_MAX] = {
+        0,
+    };
+    char hooks_args[PATH_MAX] = {
+        0,
+    };
+
+    GF_VALIDATE_OR_GOTO("glusterd", this, out);
+    GF_VALIDATE_OR_GOTO(this->name, dict, out);
+    GF_VALIDATE_OR_GOTO(this->name, key, out);
+    GF_VALIDATE_OR_GOTO(this->name, value, out);
+    GF_VALIDATE_OR_GOTO(this->name, op_errstr, out);
+
+    ret = 0;
+
+    /* Every other option is none of our business. */
+    if (strcmp(key, GLUSTERD_SHARED_STORAGE_KEY) != 0)
+        goto out;
+
+    /* Re-create the brick path so as to be able to re-use it. */
+    ret = recursive_rmdir(GLUSTER_SHARED_STORAGE_BRICK_DIR);
+    if (ret != 0) {
+        snprintf(errstr, PATH_MAX,
+                 "Failed to remove shared storage brick(%s). Reason: %s",
+                 GLUSTER_SHARED_STORAGE_BRICK_DIR, strerror(errno));
+        gf_msg(this->name, GF_LOG_ERROR, errno, GD_MSG_DIR_OP_FAILED, "%s",
+               errstr);
+        ret = -1;
+        goto out;
+    }
+
+    ret = mkdir_p(GLUSTER_SHARED_STORAGE_BRICK_DIR, 0755, _gf_true);
+    if (ret == -1) {
+        snprintf(errstr, PATH_MAX,
+                 "Failed to create shared storage brick(%s). Reason: %s",
+                 GLUSTER_SHARED_STORAGE_BRICK_DIR, strerror(errno));
+        gf_msg(this->name, GF_LOG_ERROR, errno, GD_MSG_CREATE_DIR_FAILED, "%s",
+               errstr);
+        goto out;
+    }
+
+    if (!is_origin_glusterd(dict)) {
+        len = snprintf(hooks_args, sizeof(hooks_args),
+                       "is_originator=0,local_node_hostname=%s",
+                       local_node_hostname);
+    } else {
+        len = snprintf(hooks_args, sizeof(hooks_args),
+                       "is_originator=1,local_node_hostname=%s",
+                       local_node_hostname);
+    }
+    /* Treat an encoding error or a truncated argument string as failure. */
+    if ((len < 0) || (len >= sizeof(hooks_args))) {
+        ret = -1;
+        goto out;
+    }
+
+    ret = dict_set_dynstr_with_alloc(dict, "hooks_args", hooks_args);
+    if (ret != 0)
+        gf_msg(this->name, GF_LOG_ERROR, errno, GD_MSG_DICT_SET_FAILED,
+               "Failed to set"
+               " hooks_args in dict.");
+
+out:
+    if (ret && (errstr[0] != '\0')) {
+        *op_errstr = gf_strdup(errstr);
+    }
+
+    return ret;
+}
+
+static int
+glusterd_op_set_volume(dict_t *dict, char **errstr)
+{
+    int ret = 0;
+    glusterd_volinfo_t *volinfo = NULL;
+    char *volname = NULL;
+    xlator_t *this = NULL;
+    glusterd_conf_t *priv = NULL;
+    int count = 1;
+    char *key = NULL;
+    char *key_fixed = NULL;
+    char *value = NULL;
+    char keystr[50] = {
+        0,
+    };
+    int keylen;
+    gf_boolean_t global_opt = _gf_false;
+    gf_boolean_t global_opts_set = _gf_false;
+    glusterd_volinfo_t *voliter = NULL;
+    int32_t dict_count = 0;
+    gf_boolean_t check_op_version = _gf_false;
+    uint32_t new_op_version = 0;
+    gf_boolean_t quorum_action = _gf_false;
+    glusterd_svc_t *svc = NULL;
+
+    this = THIS;
+    GF_ASSERT(this);
+
+    priv = this->private;
+    GF_ASSERT(priv);
+
+    ret = dict_get_int32n(dict, "count", SLEN("count"), &dict_count);
+    if (ret) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_DICT_GET_FAILED,
+               "Count(dict),not set in Volume-Set");
+        goto out;
+    }
+
+    if (dict_count == 0) {
+        ret = glusterd_volset_help(NULL, errstr);
+        goto out;
+    }
+
+    ret = dict_get_strn(dict, "volname", SLEN("volname"), &volname);
+    if (ret) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_DICT_GET_FAILED,
+               "Unable to get volume name");
+        goto out;
+    }
+
+    if (strcasecmp(volname, "all") == 0) {
+        ret = glusterd_op_set_all_volume_options(this, dict, errstr);
+        goto out;
+    }
+
+    ret = glusterd_volinfo_find(volname, &volinfo);
+    if (ret) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_VOL_NOT_FOUND,
+               FMTSTR_CHECK_VOL_EXISTS, volname);
+        goto out;
+    }
+
+    /* TODO: Remove this once v3.3 compatibility is not required */
+    check_op_version = dict_get_str_boolean(dict, "check-op-version",
+                                            _gf_false);
+
+    if (check_op_version) {
+        ret = dict_get_uint32(dict, "new-op-version", &new_op_version);
+        if (ret) {
+            gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_DICT_GET_FAILED,
+                   "Unable to get new op-version from dict");
+            goto out;
+        }
+    }
+
+    for (count = 1; ret != -1; count++) {
+        keylen = snprintf(keystr, sizeof(keystr), "key%d", count);
+        ret = dict_get_strn(dict, keystr, keylen, &key);
+        if (ret)
+            break;
+
+        keylen = snprintf(keystr, sizeof(keystr), "value%d", count);
+        ret = dict_get_strn(dict, keystr, keylen, &value);
+        if (ret) {
+            gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_DICT_GET_FAILED,
+                   "invalid key,value pair in 'volume set'");
+            ret = -1;
+            goto out;
+        }
+
+        if (strcmp(key, "config.memory-accounting") == 0) {
+            ret = gf_string2boolean(value, &volinfo->memory_accounting);
+            if (ret == -1) {
+                gf_msg(this->name, GF_LOG_ERROR, EINVAL, GD_MSG_INVALID_ENTRY,
+                       "Invalid value in key-value pair.");
+                goto out;
+            }
+        }
+
+        if (strcmp(key, "config.transport") == 0) {
+            gf_msg(this->name, GF_LOG_INFO, 0, GD_MSG_VOL_TRANSPORT_TYPE_CHANGE,
+                   "changing transport-type for volume %s to %s", volname,
+                   value);
+            ret = 0;
+            if (strcasecmp(value, "rdma") == 0) {
+                volinfo->transport_type = GF_TRANSPORT_RDMA;
+            } else if (strcasecmp(value, "tcp") == 0) {
+                volinfo->transport_type = GF_TRANSPORT_TCP;
+            } else if ((strcasecmp(value, "tcp,rdma") == 0) ||
+                       (strcasecmp(value, "rdma,tcp") == 0)) {
+                volinfo->transport_type = GF_TRANSPORT_BOTH_TCP_RDMA;
+            } else {
+                ret = -1;
+                goto out;
+            }
+        }
+
+        ret = glusterd_check_ganesha_cmd(key, value, errstr, dict);
+        if (ret == -1)
+            goto out;
+
+        if (!is_key_glusterd_hooks_friendly(key)) {
+            ret = glusterd_check_option_exists(key, &key_fixed);
+            GF_ASSERT(ret);
+            if (ret <= 0) {
+                key_fixed = NULL;
+                goto out;
+            }
+        }
+
+        global_opt = _gf_false;
+        if (glusterd_check_globaloption(key)) {
+            global_opt = _gf_true;
+            global_opts_set = _gf_true;
+        }
+
+        if (!global_opt)
+            value = gf_strdup(value);
+
+        if (!value) {
+            gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_VOL_SET_FAIL,
+                   "Unable to set the options in 'volume set'");
+            ret = -1;
+            goto out;
+        }
+
+        if (key_fixed)
+            key = key_fixed;
+
+        if (glusterd_is_quorum_changed(volinfo->dict, key, value))
+            quorum_action = _gf_true;
+
+        if (global_opt) {
+            cds_list_for_each_entry(voliter, &priv->volumes, vol_list)
+            {
+                value = gf_strdup(value);
+                ret = dict_set_dynstr(voliter->dict, key, value);
+                if (ret)
+                    goto out;
+            }
+        } else {
+            ret = dict_set_dynstr(volinfo->dict, key, value);
+            if (ret)
+                goto out;
+        }
+
+        if (key_fixed) {
+            GF_FREE(key_fixed);
+            key_fixed = NULL;
+        }
+    }
+
+    if (count == 1) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_NO_OPTIONS_GIVEN,
+               "No options received ");
+        ret = -1;
+        goto out;
+    }
+
+    /* Update the cluster op-version before regenerating volfiles so that
+     * correct volfiles are generated
+     */
+    if (new_op_version > priv->op_version) {
+        priv->op_version = new_op_version;
+        ret = glusterd_store_global_info(this);
+        if (ret) {
+            gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_OP_VERS_STORE_FAIL,
+                   "Failed to store op-version");
+            goto out;
+        }
+    }
+    if (!global_opts_set) {
+        gd_update_volume_op_versions(volinfo);
+
+        if (!volinfo->is_snap_volume) {
+            svc = &(volinfo->snapd.svc);
+            ret = svc->manager(svc, volinfo, PROC_START_NO_WAIT);
+            if (ret)
+                goto out;
+        }
+        svc = &(volinfo->gfproxyd.svc);
+        ret = svc->reconfigure(volinfo);
+        if (ret)
+            goto out;
+
+        svc = &(volinfo->shd.svc);
+        ret = svc->reconfigure(volinfo);
+        if (ret)
+            goto out;
+
+        ret = glusterd_create_volfiles_and_notify_services(volinfo);
+        if (ret) {
+            gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_VOLFILE_CREATE_FAIL,
+                   "Unable to create volfile for"
+                   " 'volume set'");
+            ret = -1;
+            goto out;
+        }
+
+        ret = glusterd_store_volinfo(volinfo,
+                                     GLUSTERD_VOLINFO_VER_AC_INCREMENT);
+        if (ret)
+            goto out;
+
+        if (GLUSTERD_STATUS_STARTED == volinfo->status) {
+            ret = glusterd_svcs_reconfigure(volinfo);
+            if (ret) {
+                gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_SVC_RESTART_FAIL,
+                       "Unable to restart services");
+                goto out;
+            }
+        }
+
+    } else {
+        cds_list_for_each_entry(voliter, &priv->volumes, vol_list)
+        {
+            volinfo = voliter;
+            gd_update_volume_op_versions(volinfo);
+
+            if (!volinfo->is_snap_volume) {
+                svc = &(volinfo->snapd.svc);
+                ret = svc->manager(svc, volinfo, PROC_START_NO_WAIT);
+                if (ret)
+                    goto out;
+            }
+
+            svc = &(volinfo->gfproxyd.svc);
+            ret = svc->reconfigure(volinfo);
+            if (ret)
+                goto out;
+
+            svc = &(volinfo->shd.svc);
+            ret = svc->reconfigure(volinfo);
+            if (ret)
+                goto out;
+
+            ret = glusterd_create_volfiles_and_notify_services(volinfo);
+            if (ret) {
+                gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_VOLFILE_CREATE_FAIL,
+                       "Unable to create volfile for"
+                       " 'volume set'");
+                ret = -1;
+                goto out;
+            }
+
+            ret = glusterd_store_volinfo(volinfo,
+                                         GLUSTERD_VOLINFO_VER_AC_INCREMENT);
+            if (ret)
+                goto out;
+
+            if (GLUSTERD_STATUS_STARTED == volinfo->status) {
+                ret = glusterd_svcs_reconfigure(volinfo);
+                if (ret) {
+                    gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_SVC_RESTART_FAIL,
+                           "Unable to restart services");
+                    goto out;
+                }
+            }
+        }
+    }
+
+out:
+    GF_FREE(key_fixed);
+    gf_msg_debug(this->name, 0, "returning %d", ret);
+    if (quorum_action)
+        glusterd_do_quorum_action();
+    return ret;
+}
+
+/* Handle a volume-sync operation on this node.
+ *
+ * If the "hostname" in dict is a local address, the requested volume (key
+ * "volname"), or every known volume when no volname is present, is added to
+ * rsp_dict, followed by a "count" entry holding the number of volumes added.
+ *
+ * Returns 0 on success and also when there is nothing to do here (hostname is
+ * not local, or rsp_dict is NULL); non-zero on failure. *op_errstr is set only
+ * when "hostname" cannot be read from dict.
+ */
+static int
+glusterd_op_sync_volume(dict_t *dict, char **op_errstr, dict_t *rsp_dict)
+{
+    xlator_t *this = THIS;
+    glusterd_conf_t *priv = NULL;
+    glusterd_volinfo_t *volinfo = NULL;
+    char *hostname = NULL;
+    char *volname = NULL;
+    char errbuf[2048] = {
+        0,
+    };
+    int next_idx = 1;
+    int added = 0;
+    int ret = -1;
+
+    GF_ASSERT(this);
+    priv = this->private;
+    GF_ASSERT(priv);
+
+    ret = dict_get_strn(dict, "hostname", SLEN("hostname"), &hostname);
+    if (ret != 0) {
+        snprintf(errbuf, sizeof(errbuf),
+                 "hostname couldn't be retrieved from msg");
+        gf_smsg(this->name, GF_LOG_ERROR, errno, GD_MSG_DICT_GET_FAILED,
+                "Key=hostname", NULL);
+        *op_errstr = gf_strdup(errbuf);
+        goto out;
+    }
+
+    /* The request names a host that is not this node: nothing to do. */
+    if (!gf_is_local_addr(hostname)) {
+        ret = 0;
+        goto out;
+    }
+
+    /* "volname" is absent when all volumes are to be synced. */
+    ret = dict_get_strn(dict, "volname", SLEN("volname"), &volname);
+    if (ret == 0) {
+        ret = glusterd_volinfo_find(volname, &volinfo);
+        if (ret != 0) {
+            gf_msg("glusterd", GF_LOG_ERROR, 0, GD_MSG_VOL_NOT_FOUND,
+                   "Volume with name: %s not exists", volname);
+            goto out;
+        }
+    }
+
+    /* Without a response dict there is nowhere to put the volume data;
+     * this should happen only on source.
+     */
+    if (rsp_dict == NULL) {
+        gf_smsg(this->name, GF_LOG_INFO, errno, GD_MSG_INVALID_ARGUMENT, NULL);
+        ret = 0;
+        goto out;
+    }
+
+    if (volname != NULL) {
+        /* Single volume: always exported at index 1. */
+        ret = glusterd_add_volume_to_dict(volinfo, rsp_dict, 1, "volume");
+        if (ret != 0)
+            goto out;
+        added = 1;
+    } else {
+        /* All volumes: exported at consecutive indices starting from 1. */
+        cds_list_for_each_entry(volinfo, &priv->volumes, vol_list)
+        {
+            ret = glusterd_add_volume_to_dict(volinfo, rsp_dict, next_idx,
+                                              "volume");
+            if (ret != 0)
+                goto out;
+
+            added = next_idx;
+            next_idx++;
+        }
+    }
+
+    ret = dict_set_int32n(rsp_dict, "count", SLEN("count"), added);
+
+out:
+    gf_msg_debug("glusterd", 0, "Returning %d", ret);
+
+    return ret;
+}
+
+/* Set the VKEY_DIAG_LAT_MEASUREMENT and VKEY_DIAG_CNT_FOP_HITS options to
+ * "on" in the volume's option dict, stopping at the first failure.
+ *
+ * Returns 0 when both options were set, otherwise the non-zero result of the
+ * dict update that failed.
+ */
+static int
+glusterd_add_profile_volume_options(glusterd_volinfo_t *volinfo)
+{
+    const char *failed_opt = NULL; /* option whose dict update failed */
+    int ret = -1;
+
+    GF_ASSERT(volinfo);
+
+    ret = dict_set_nstrn(volinfo->dict, VKEY_DIAG_LAT_MEASUREMENT,
+                         SLEN(VKEY_DIAG_LAT_MEASUREMENT), "on", SLEN("on"));
+    if (ret != 0) {
+        failed_opt = VKEY_DIAG_LAT_MEASUREMENT;
+        goto out;
+    }
+
+    ret = dict_set_nstrn(volinfo->dict, VKEY_DIAG_CNT_FOP_HITS,
+                         SLEN(VKEY_DIAG_CNT_FOP_HITS), "on", SLEN("on"));
+    if (ret != 0)
+        failed_opt = VKEY_DIAG_CNT_FOP_HITS;
+
+out:
+    if (failed_opt != NULL)
+        gf_msg("glusterd", GF_LOG_ERROR, 0, GD_MSG_DICT_SET_FAILED,
+               "failed to set the volume %s option %s value %s",
+               volinfo->volname, failed_opt, "on");
+    gf_msg_debug("glusterd", 0, "Returning %d", ret);
+    return ret;
+}
+
+/* Delete the VKEY_DIAG_LAT_MEASUREMENT and VKEY_DIAG_CNT_FOP_HITS entries
+ * from the volume's option dict.
+ */
+static void
+glusterd_remove_profile_volume_options(glusterd_volinfo_t *volinfo)
+{
+    dict_t *vol_opts = NULL;
+
+    GF_ASSERT(volinfo);
+
+    vol_opts = volinfo->dict;
+    dict_del_sizen(vol_opts, VKEY_DIAG_LAT_MEASUREMENT);
+    dict_del_sizen(vol_opts, VKEY_DIAG_CNT_FOP_HITS);
+}
+
+/* Apply a volume profile operation to the volume named in dict.
+ *
+ * dict must carry "volname" and the int32 "op" (a GF_CLI_STATS_* value):
+ *   - GF_CLI_STATS_START: turn the profiling options on for the volume.
+ *   - GF_CLI_STATS_STOP:  remove the profiling options from the volume.
+ *   - GF_CLI_STATS_INFO / GF_CLI_STATS_TOP: nothing to do here (the data is
+ *     already collected in the brick op); returns 0 immediately.
+ * For START/STOP the volfiles are regenerated, the volinfo is stored and, if
+ * the volume is started, its services are reconfigured.
+ *
+ * op_errstr and rsp_dict are not used by this function.
+ *
+ * Returns 0 on success, non-zero on failure.
+ */
+int
+glusterd_op_stats_volume(dict_t *dict, char **op_errstr, dict_t *rsp_dict)
+{
+    int ret = -1;
+    char *volname = NULL;
+    glusterd_volinfo_t *volinfo = NULL;
+    int32_t stats_op = GF_CLI_STATS_NONE;
+
+    ret = dict_get_strn(dict, "volname", SLEN("volname"), &volname);
+    if (ret) {
+        gf_msg("glusterd", GF_LOG_ERROR, 0, GD_MSG_DICT_GET_FAILED,
+               "volume name get failed");
+        goto out;
+    }
+
+    ret = glusterd_volinfo_find(volname, &volinfo);
+    if (ret) {
+        gf_msg("glusterd", GF_LOG_ERROR, 0, GD_MSG_VOL_NOT_FOUND,
+               "Volume %s does not exist", volname);
+        goto out;
+    }
+
+    ret = dict_get_int32n(dict, "op", SLEN("op"), &stats_op);
+    if (ret) {
+        gf_msg("glusterd", GF_LOG_ERROR, 0, GD_MSG_DICT_GET_FAILED,
+               "volume profile op get failed");
+        goto out;
+    }
+
+    switch (stats_op) {
+        case GF_CLI_STATS_START:
+            ret = glusterd_add_profile_volume_options(volinfo);
+            if (ret)
+                goto out;
+            break;
+        case GF_CLI_STATS_STOP:
+            glusterd_remove_profile_volume_options(volinfo);
+            break;
+        case GF_CLI_STATS_INFO:
+        case GF_CLI_STATS_TOP:
+            /* info is already collected in brick op; no volume option
+             * changes, so skip the volfile/store/reconfigure steps. */
+            ret = 0;
+            goto out;
+        default:
+            GF_ASSERT(0);
+            gf_msg("glusterd", GF_LOG_ERROR, 0, GD_MSG_INVALID_ENTRY,
+                   "Invalid profile op: %d", stats_op);
+            ret = -1;
+            goto out;
+    }
+
+    /* Only START and STOP reach this point: the volume options changed, so
+     * regenerate the volfiles and persist the new volinfo. */
+    ret = glusterd_create_volfiles_and_notify_services(volinfo);
+    if (ret) {
+        gf_msg("glusterd", GF_LOG_ERROR, 0, GD_MSG_VOLFILE_CREATE_FAIL,
+               "Unable to create volfile for"
+               " 'volume profile'");
+        ret = -1;
+        goto out;
+    }
+
+    ret = glusterd_store_volinfo(volinfo, GLUSTERD_VOLINFO_VER_AC_INCREMENT);
+    if (ret)
+        goto out;
+
+    if (GLUSTERD_STATUS_STARTED == volinfo->status) {
+        ret = glusterd_svcs_reconfigure(volinfo);
+        if (ret)
+            goto out;
+    }
+
+    ret = 0;
+
+out:
+    gf_msg_debug("glusterd", 0, "Returning %d", ret);
+
+    return ret;
+}
+
+/* Copy the brick list stored in volinfo->rebal.dict into dict.
+ *
+ * Reads "count" and "brick1".."brick<count>" from volinfo->rebal.dict and
+ * writes them to dict as "<prefix>.count" and "<prefix>.brick<N>".
+ *
+ * Returns 0 on success; non-zero if a source key is missing, a destination
+ * key does not fit in the local buffer, or a dict update fails.
+ */
+static int
+_add_remove_bricks_to_dict(dict_t *dict, glusterd_volinfo_t *volinfo,
+                           char *prefix)
+{
+    xlator_t *this = NULL;
+    char *brick_val = NULL;
+    char src_key[16] = {
+        0,
+    };
+    char dst_key[64] = {
+        /* dst_key is small as prefix is up to 32 chars */
+        0,
+    };
+    int dst_len;
+    int src_len;
+    int nbricks = 0;
+    int idx = 0;
+    int ret = -1;
+
+    GF_ASSERT(dict);
+    GF_ASSERT(volinfo);
+    GF_ASSERT(prefix);
+
+    this = THIS;
+    GF_ASSERT(this);
+
+    ret = dict_get_int32n(volinfo->rebal.dict, "count", SLEN("count"),
+                          &nbricks);
+    if (ret != 0) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_DICT_GET_FAILED,
+               "Failed to get brick count");
+        return ret;
+    }
+
+    dst_len = snprintf(dst_key, sizeof(dst_key), "%s.count", prefix);
+    ret = dict_set_int32n(dict, dst_key, dst_len, nbricks);
+    if (ret != 0) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_DICT_SET_FAILED,
+               "Failed to set brick count in dict");
+        return ret;
+    }
+
+    /* Brick keys are 1-based in both dictionaries. */
+    for (idx = 1; idx <= nbricks; idx++) {
+        src_len = snprintf(src_key, sizeof(src_key), "brick%d", idx);
+
+        ret = dict_get_strn(volinfo->rebal.dict, src_key, src_len,
+                            &brick_val);
+        if (ret != 0) {
+            gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_DICT_GET_FAILED,
+                   "Unable to get %s", src_key);
+            return ret;
+        }
+
+        dst_len = snprintf(dst_key, sizeof(dst_key), "%s.%s", prefix,
+                           src_key);
+        /* Reject an encoding error or a key truncated by snprintf. */
+        if (dst_len < 0 || (size_t)dst_len >= sizeof(dst_key))
+            return -1;
+
+        ret = dict_set_strn(dict, dst_key, dst_len, brick_val);
+        if (ret != 0) {
+            gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_DICT_SET_FAILED,
+                   "Failed to add brick to dict");
+            return ret;
+        }
+        brick_val = NULL;
+    }
+
+    return ret;
+}
+
+/* Add the details of a rebalance or remove-brick task to dict.
+ *
+ * Sets "task<index>.type" (the gd_op_list name of op), "task<index>.id" (the
+ * string form of volinfo->rebal.rebalance_id) and "task<index>.status"
+ * (volinfo->rebal.defrag_status). For GD_OP_REMOVE_BRICK the brick list is
+ * added first under the "task<index>" prefix via
+ * _add_remove_bricks_to_dict().
+ *
+ * Returns 0 on success; non-zero if op is neither GD_OP_REMOVE_BRICK nor
+ * GD_OP_REBALANCE, if the task id string cannot be allocated, or if a dict
+ * update fails.
+ */
+static int
+_add_task_to_dict(dict_t *dict, glusterd_volinfo_t *volinfo, int op, int index)
+{
+    int ret = -1;
+    char key[32] = {
+        0,
+    };
+    int keylen;
+    char *uuid_str = NULL;
+    int status = 0;
+    xlator_t *this = NULL;
+
+    GF_ASSERT(dict);
+    GF_ASSERT(volinfo);
+
+    this = THIS;
+    GF_ASSERT(this);
+
+    switch (op) {
+        case GD_OP_REMOVE_BRICK:
+            snprintf(key, sizeof(key), "task%d", index);
+            ret = _add_remove_bricks_to_dict(dict, volinfo, key);
+            if (ret) {
+                gf_msg(this->name, GF_LOG_ERROR, 0,
+                       GD_MSG_ADD_REMOVE_BRICK_FAIL,
+                       "Failed to add remove bricks to dict");
+                goto out;
+            }
+            /* The id and status are taken from volinfo->rebal exactly as
+             * for rebalance, so continue into the next case. */
+            /* fall through */
+        case GD_OP_REBALANCE:
+            uuid_str = gf_strdup(uuid_utoa(volinfo->rebal.rebalance_id));
+            status = volinfo->rebal.defrag_status;
+            break;
+
+        default:
+            ret = -1;
+            gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_NO_TASK_ID,
+                   "%s operation doesn't have a"
+                   " task_id",
+                   gd_op_list[op]);
+            goto out;
+    }
+
+    /* gf_strdup() failed: report an error before touching dict instead of
+     * returning success with only the task type recorded. */
+    if (!uuid_str) {
+        ret = -1;
+        goto out;
+    }
+
+    keylen = snprintf(key, sizeof(key), "task%d.type", index);
+    ret = dict_set_strn(dict, key, keylen, (char *)gd_op_list[op]);
+    if (ret) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_DICT_SET_FAILED,
+               "Error setting task type in dict");
+        goto out;
+    }
+
+    keylen = snprintf(key, sizeof(key), "task%d.id", index);
+    ret = dict_set_dynstrn(dict, key, keylen, uuid_str);
+    if (ret) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_DICT_SET_FAILED,
+               "Error setting task id in dict");
+        goto out;
+    }
+    /* Stored in dict now; clear the local pointer so it is not freed at
+     * out. */
+    uuid_str = NULL;
+
+    keylen = snprintf(key, sizeof(key), "task%d.status", index);
+    ret = dict_set_int32n(dict, key, keylen, status);
+    if (ret) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_DICT_SET_FAILED,
+               "Error setting task status in dict");
+        goto out;
+    }
+
+out:
+    GF_FREE(uuid_str);
+    return ret;
+}
+
+/* Add the volume's task, if any, to rsp_dict and record the task count.
+ *
+ * A task is added (at index 0) only when volinfo->rebal.rebalance_id is not
+ * the null uuid. The number of tasks added is stored under "tasks".
+ *
+ * Returns 0 on success, non-zero on failure.
+ */
+static int
+glusterd_aggregate_task_status(dict_t *rsp_dict, glusterd_volinfo_t *volinfo)
+{
+    xlator_t *this = THIS;
+    int task_count = 0;
+    int ret = -1;
+
+    GF_ASSERT(this);
+
+    if (!gf_uuid_is_null(volinfo->rebal.rebalance_id)) {
+        ret = _add_task_to_dict(rsp_dict, volinfo, volinfo->rebal.op,
+                                task_count);
+        if (ret != 0) {
+            gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_DICT_SET_FAILED,
+                   "Failed to add task details to dict");
+            return ret;
+        }
+        task_count++;
+    }
+
+    ret = dict_set_int32n(rsp_dict, "tasks", SLEN("tasks"), task_count);
+    if (ret != 0)
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_DICT_SET_FAILED,
+               "Error setting tasks count in dict");
+
+    return ret;
+}
+
+/* Fill rsp_dict with this node's part of a volume status response.
+ *
+ * dict supplies the uint32 "cmd" bitmask (GF_CLI_STATUS_* flags) and, unless
+ * GF_CLI_STATUS_ALL is set, the "volname" (plus "brick" for a single-brick
+ * query). Depending on cmd, one service, one brick, the task list, or all
+ * local bricks and enabled services of the volume are added to rsp_dict.
+ *
+ * Except for the GF_CLI_STATUS_ALL and GF_CLI_STATUS_TASKS paths, and the
+ * single-brick path when the brick is not local, the keys "type",
+ * "brick-index-max", "other-count" and "count" are also set.
+ *
+ * op_errstr is not used by this function.
+ *
+ * Returns 0 on success, non-zero on failure.
+ */
+static int
+glusterd_op_status_volume(dict_t *dict, char **op_errstr, dict_t *rsp_dict)
+{
+    int ret = -1;
+    int node_count = 0;   /* entries added to rsp_dict (bricks + others) */
+    int brick_index = -1; /* index of the last brick seen; -1 = none yet */
+    int other_count = 0;  /* non-brick entries added to rsp_dict */
+    int other_index = 0;  /* index used for the next non-brick entry */
+    uint32_t cmd = 0;
+    char *volname = NULL;
+    char *brick = NULL;
+    xlator_t *this = NULL;
+    glusterd_volinfo_t *volinfo = NULL;
+    glusterd_brickinfo_t *brickinfo = NULL;
+    glusterd_conf_t *priv = NULL;
+    dict_t *vol_opts = NULL;
+#ifdef BUILD_GNFS
+    gf_boolean_t nfs_disabled = _gf_false;
+#endif
+    gf_boolean_t shd_enabled = _gf_false;
+    gf_boolean_t origin_glusterd = _gf_false;
+    /* Assigned and read only in the final else branch below. */
+    int snapd_enabled, bitrot_enabled, volume_quota_enabled;
+
+    this = THIS;
+    GF_ASSERT(this);
+    priv = this->private;
+
+    GF_ASSERT(priv);
+
+    GF_ASSERT(dict);
+
+    origin_glusterd = is_origin_glusterd(dict);
+
+    ret = dict_get_uint32(dict, "cmd", &cmd);
+    if (ret)
+        goto out;
+
+    if (origin_glusterd) {
+        ret = 0;
+        if ((cmd & GF_CLI_STATUS_ALL)) {
+            /* NOTE(review): a failure here is only logged; ret is
+             * overwritten by the dict_set_uint32() call below. Confirm
+             * that this best-effort behaviour is intended. */
+            ret = glusterd_get_all_volnames(rsp_dict);
+            if (ret)
+                gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_VOLNAMES_GET_FAIL,
+                       "failed to get all volume "
+                       "names for status");
+        }
+    }
+
+    ret = dict_set_uint32(rsp_dict, "cmd", cmd);
+    if (ret)
+        goto out;
+
+    /* Nothing more is added for a status-all request; ret is 0 here. */
+    if (cmd & GF_CLI_STATUS_ALL)
+        goto out;
+
+    ret = dict_get_strn(dict, "volname", SLEN("volname"), &volname);
+    if (ret)
+        goto out;
+
+    ret = glusterd_volinfo_find(volname, &volinfo);
+    if (ret) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_VOL_NOT_FOUND,
+               "Volume with name: %s "
+               "does not exist",
+               volname);
+        goto out;
+    }
+    vol_opts = volinfo->dict;
+
+    /* The chain below handles exactly one kind of request; the first
+     * matching flag in cmd wins. Single-service requests use index 0
+     * (other_index is still 0 for the snapd and shd cases). */
+    if ((cmd & GF_CLI_STATUS_QUOTAD) != 0) {
+        ret = glusterd_add_node_to_dict(priv->quotad_svc.name, rsp_dict, 0,
+                                        vol_opts);
+        if (ret)
+            goto out;
+        other_count++;
+        node_count++;
+#ifdef BUILD_GNFS
+    } else if ((cmd & GF_CLI_STATUS_NFS) != 0) {
+        ret = glusterd_add_node_to_dict(priv->nfs_svc.name, rsp_dict, 0,
+                                        vol_opts);
+        if (ret)
+            goto out;
+        other_count++;
+        node_count++;
+#endif
+    } else if ((cmd & GF_CLI_STATUS_BITD) != 0) {
+        ret = glusterd_add_node_to_dict(priv->bitd_svc.name, rsp_dict, 0,
+                                        vol_opts);
+        if (ret)
+            goto out;
+        other_count++;
+        node_count++;
+    } else if ((cmd & GF_CLI_STATUS_SCRUB) != 0) {
+        ret = glusterd_add_node_to_dict(priv->scrub_svc.name, rsp_dict, 0,
+                                        vol_opts);
+        if (ret)
+            goto out;
+        other_count++;
+        node_count++;
+    } else if ((cmd & GF_CLI_STATUS_SNAPD) != 0) {
+        ret = glusterd_add_snapd_to_dict(volinfo, rsp_dict, other_index);
+        if (ret)
+            goto out;
+        other_count++;
+        node_count++;
+    } else if ((cmd & GF_CLI_STATUS_SHD) != 0) {
+        ret = glusterd_add_shd_to_dict(volinfo, rsp_dict, other_index);
+        if (ret)
+            goto out;
+        other_count++;
+        node_count++;
+    } else if ((cmd & GF_CLI_STATUS_BRICK) != 0) {
+        /* Status of one specific brick, named by the "brick" key. */
+        ret = dict_get_strn(dict, "brick", SLEN("brick"), &brick);
+        if (ret)
+            goto out;
+
+        ret = glusterd_volume_brickinfo_get_by_brick(brick, volinfo, &brickinfo,
+                                                     _gf_false);
+        if (ret)
+            goto out;
+
+        /* Non-zero compare result: the brick's uuid is not MY_UUID, so
+         * nothing is added on this node. ret is 0 at this point. */
+        if (gf_uuid_compare(brickinfo->uuid, MY_UUID))
+            goto out;
+
+        /* brick_index starts at -1, so the pre-increment yields index 0.
+         * NOTE(review): the results of the two add-to-dict calls below
+         * are not checked. */
+        glusterd_add_brick_to_dict(volinfo, brickinfo, rsp_dict, ++brick_index);
+        if (cmd & GF_CLI_STATUS_DETAIL)
+            glusterd_add_brick_detail_to_dict(volinfo, brickinfo, rsp_dict,
+                                              brick_index);
+        node_count++;
+
+    } else if ((cmd & GF_CLI_STATUS_TASKS) != 0) {
+        /* Task-only request: return straight away with the aggregation
+         * result, skipping the type/index/count keys set further down. */
+        ret = glusterd_aggregate_task_status(rsp_dict, volinfo);
+        goto out;
+
+    } else {
+        /* No single-target flag matched: report all local bricks and,
+         * for a plain status request, the enabled services too. */
+        snapd_enabled = glusterd_is_snapd_enabled(volinfo);
+        shd_enabled = gd_is_self_heal_enabled(volinfo, vol_opts);
+#ifdef BUILD_GNFS
+        nfs_disabled = dict_get_str_boolean(vol_opts, NFS_DISABLE_MAP_KEY,
+                                            _gf_false);
+#endif
+        volume_quota_enabled = glusterd_is_volume_quota_enabled(volinfo);
+        bitrot_enabled = glusterd_is_bitrot_enabled(volinfo);
+
+        cds_list_for_each_entry(brickinfo, &volinfo->bricks, brick_list)
+        {
+            /* brick_index advances for every brick of the volume, but
+             * only bricks whose uuid equals MY_UUID are added. */
+            brick_index++;
+            if (gf_uuid_compare(brickinfo->uuid, MY_UUID))
+                continue;
+
+            glusterd_add_brick_to_dict(volinfo, brickinfo, rsp_dict,
+                                       brick_index);
+
+            if (cmd & GF_CLI_STATUS_DETAIL) {
+                glusterd_add_brick_detail_to_dict(volinfo, brickinfo, rsp_dict,
+                                                  brick_index);
+            }
+            node_count++;
+        }
+
+        if ((cmd & GF_CLI_STATUS_MASK) == GF_CLI_STATUS_NONE) {
+            /* Non-brick entries are indexed right after the last brick. */
+            other_index = brick_index + 1;
+            if (snapd_enabled) {
+                ret = glusterd_add_snapd_to_dict(volinfo, rsp_dict,
+                                                 other_index);
+                if (ret)
+                    goto out;
+                other_count++;
+                other_index++;
+                node_count++;
+            }
+
+            if (glusterd_is_shd_compatible_volume(volinfo)) {
+                if (shd_enabled) {
+                    ret = glusterd_add_shd_to_dict(volinfo, rsp_dict,
+                                                   other_index);
+                    if (ret)
+                        goto out;
+                    other_count++;
+                    other_index++;
+                    node_count++;
+                }
+            }
+#ifdef BUILD_GNFS
+            if (!nfs_disabled) {
+                ret = glusterd_add_node_to_dict(priv->nfs_svc.name, rsp_dict,
+                                                other_index, vol_opts);
+                if (ret)
+                    goto out;
+                other_index++;
+                other_count++;
+                node_count++;
+            }
+#endif
+            if (volume_quota_enabled) {
+                ret = glusterd_add_node_to_dict(priv->quotad_svc.name, rsp_dict,
+                                                other_index, vol_opts);
+                if (ret)
+                    goto out;
+                other_count++;
+                node_count++;
+                other_index++;
+            }
+
+            if (bitrot_enabled) {
+                ret = glusterd_add_node_to_dict(priv->bitd_svc.name, rsp_dict,
+                                                other_index, vol_opts);
+                if (ret)
+                    goto out;
+                other_count++;
+                node_count++;
+                other_index++;
+                /* For handling scrub status. Scrub daemon will be
+                 * running automatically when bitrot is enabled */
+                ret = glusterd_add_node_to_dict(priv->scrub_svc.name, rsp_dict,
+                                                other_index, vol_opts);
+                if (ret)
+                    goto out;
+                other_count++;
+                node_count++;
+            }
+        }
+    }
+
+    /* Summary keys describing what was added above. */
+    ret = dict_set_int32n(rsp_dict, "type", SLEN("type"), volinfo->type);
+    if (ret) {
+        gf_smsg(this->name, GF_LOG_ERROR, errno, GD_MSG_DICT_SET_FAILED,
+                "Key=type", NULL);
+        goto out;
+    }
+
+    ret = dict_set_int32n(rsp_dict, "brick-index-max", SLEN("brick-index-max"),
+                          brick_index);
+    if (ret) {
+        gf_smsg(this->name, GF_LOG_ERROR, 0, GD_MSG_DICT_SET_FAILED,
+                "Key=brick-index-max", NULL);
+        goto out;
+    }
+    ret = dict_set_int32n(rsp_dict, "other-count", SLEN("other-count"),
+                          other_count);
+    if (ret) {
+        gf_smsg(this->name, GF_LOG_ERROR, 0, GD_MSG_DICT_SET_FAILED,
+                "Key=other-count", NULL);
+        goto out;
+    }
+    ret = dict_set_int32n(rsp_dict, "count", SLEN("count"), node_count);
+    if (ret) {
+        gf_smsg(this->name, GF_LOG_ERROR, 0, GD_MSG_DICT_SET_FAILED,
+                "Key=count", NULL);
+        goto out;
+    }
+
+    /* Active tasks */
+    /* Tasks are added only for normal volume status request for either a
+     * single volume or all volumes
+     */
+    if (!glusterd_status_has_tasks(cmd))
+        goto out;
+
+    ret = glusterd_aggregate_task_status(rsp_dict, volinfo);
+    if (ret)
+        goto out;
+    ret = 0;
+
+out:
+    gf_msg_debug(this->name, 0, "Returning %d", ret);
+
+    return ret;
+}
+
+/* No-op action: logs a debug message and returns 0. Neither event nor ctx is
+ * used.
+ */
+static int
+glusterd_op_ac_none(glusterd_op_sm_event_t *event, void *ctx)
+{
+    const int ret = 0;
+
+    (void)event;
+    (void)ctx;
+
+    gf_msg_debug(THIS->name, 0, "Returning with %d", ret);
+
+    return ret;
+}
+
+/* Record a locking failure for the transaction txn_id and trigger unlocking.
+ *
+ * Marks the global opinfo as failed with an error string, stores it for the
+ * transaction (a failure to store is only logged) and then injects
+ * GD_OP_EVENT_RCVD_RJT.
+ *
+ * Returns the result of injecting the reject event.
+ */
+static int
+glusterd_op_sm_locking_failed(uuid_t *txn_id)
+{
+    opinfo.op_ret = -1;
+    opinfo.op_errstr = gf_strdup("locking failed for one of the peer.");
+
+    if (glusterd_set_txn_opinfo(txn_id, &opinfo) != 0)
+        gf_msg(THIS->name, GF_LOG_ERROR, 0, GD_MSG_TRANS_OPINFO_SET_FAIL,
+               "Unable to set "
+               "transaction's opinfo");
+
+    /* Inject a reject event such that unlocking gets triggered right away */
+    return glusterd_op_sm_inject_event(GD_OP_EVENT_RCVD_RJT, txn_id, NULL);
+}
+
+/* Send a lock request for the current transaction to every eligible peer.
+ *
+ * A peer is skipped when its generation is newer than the transaction's, when
+ * it is not connected or has no mgmt program, or when it is not befriended
+ * (unless the current op is GD_OP_SYNC_VOLUME). If priv->op_version is below
+ * GD_OP_VERSION_3_6_0 the cluster lock procedure is used, otherwise the
+ * mgmt_v3 lock procedure.
+ *
+ * The number of requests sent is stored in opinfo.pending_count; when it is
+ * zero, glusterd_op_sm_inject_all_acc() is called right away. On any failure
+ * glusterd_op_sm_locking_failed() is invoked and its result is returned.
+ *
+ * ctx is not used by this function.
+ */
+static int
+glusterd_op_ac_send_lock(glusterd_op_sm_event_t *event, void *ctx)
+{
+    int ret = 0;
+    rpc_clnt_procedure_t *proc = NULL;
+    glusterd_conf_t *priv = NULL;
+    xlator_t *this = NULL;
+    glusterd_peerinfo_t *peerinfo = NULL;
+    uint32_t pending_count = 0;
+    dict_t *dict = NULL;
+
+    this = THIS;
+    GF_ASSERT(this);
+    priv = this->private;
+    GF_ASSERT(priv);
+
+    RCU_READ_LOCK;
+    cds_list_for_each_entry_rcu(peerinfo, &priv->peers, uuid_list)
+    {
+        /* Only send requests to peers who were available before the
+         * transaction started
+         */
+        if (peerinfo->generation > opinfo.txn_generation)
+            continue;
+
+        if (!peerinfo->connected || !peerinfo->mgmt)
+            continue;
+        if ((peerinfo->state.state != GD_FRIEND_STATE_BEFRIENDED) &&
+            (glusterd_op_get_op() != GD_OP_SYNC_VOLUME))
+            continue;
+
+        /* Based on the op_version, acquire a cluster or mgmt_v3 lock */
+        if (priv->op_version < GD_OP_VERSION_3_6_0) {
+            proc = &peerinfo->mgmt->proctable[GLUSTERD_MGMT_CLUSTER_LOCK];
+            if (proc->fn) {
+                ret = proc->fn(NULL, this, peerinfo);
+                if (ret) {
+                    /* Log before leaving the RCU read-side critical
+                     * section: peerinfo must not be dereferenced once
+                     * RCU_READ_UNLOCK has run. */
+                    gf_msg(this->name, GF_LOG_WARNING, 0,
+                           GD_MSG_LOCK_REQ_SEND_FAIL,
+                           "Failed to send lock request "
+                           "for operation 'Volume %s' to "
+                           "peer %s",
+                           gd_op_list[opinfo.op], peerinfo->hostname);
+                    RCU_READ_UNLOCK;
+                    goto out;
+                }
+                /* Mark the peer as locked*/
+                peerinfo->locked = _gf_true;
+                pending_count++;
+            }
+        } else {
+            /* NOTE(review): the reference taken here is released in this
+             * function only on the failure paths; presumably proc->fn
+             * consumes it on success - confirm. */
+            dict = glusterd_op_get_ctx();
+            dict_ref(dict);
+
+            proc = &peerinfo->mgmt_v3->proctable[GLUSTERD_MGMT_V3_LOCK];
+            if (proc->fn) {
+                ret = dict_set_static_ptr(dict, "peerinfo", peerinfo);
+                if (ret) {
+                    RCU_READ_UNLOCK;
+                    gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_DICT_SET_FAILED,
+                           "failed to set peerinfo");
+                    dict_unref(dict);
+                    goto out;
+                }
+
+                ret = proc->fn(NULL, this, dict);
+                if (ret) {
+                    /* As above: read peerinfo->hostname while the RCU
+                     * read lock is still held. */
+                    gf_msg(this->name, GF_LOG_WARNING, 0,
+                           GD_MSG_MGMTV3_LOCK_REQ_SEND_FAIL,
+                           "Failed to send mgmt_v3 lock "
+                           "request for operation "
+                           "'Volume %s' to peer %s",
+                           gd_op_list[opinfo.op], peerinfo->hostname);
+                    RCU_READ_UNLOCK;
+                    dict_unref(dict);
+                    goto out;
+                }
+                /* Mark the peer as locked*/
+                peerinfo->locked = _gf_true;
+                pending_count++;
+            }
+        }
+    }
+    RCU_READ_UNLOCK;
+
+    opinfo.pending_count = pending_count;
+
+    ret = glusterd_set_txn_opinfo(&event->txn_id, &opinfo);
+    if (ret)
+        gf_msg(THIS->name, GF_LOG_ERROR, 0, GD_MSG_TRANS_OPINFO_SET_FAIL,
+               "Unable to set "
+               "transaction's opinfo");
+
+    /* No lock request is outstanding, so move the state machine on now. */
+    if (!opinfo.pending_count)
+        ret = glusterd_op_sm_inject_all_acc(&event->txn_id);
+
+out:
+    if (ret)
+        ret = glusterd_op_sm_locking_failed(&event->txn_id);
+
+    gf_msg_debug(this->name, 0, "Returning with %d", ret);
+    return ret;
+}
+
+/* State-machine action: ask every eligible peer to release its lock.
+ *
+ * Walks priv->peers under the RCU read lock.  For each peer that passes the
+ * filters below, either the GLUSTERD_MGMT_CLUSTER_UNLOCK procedure or the
+ * GLUSTERD_MGMT_V3_UNLOCK procedure is invoked, depending on
+ * priv->op_version.  The number of requests for which proc->fn returned 0 is
+ * stored in opinfo.pending_count and persisted with
+ * glusterd_set_txn_opinfo(); when that number is zero,
+ * glusterd_op_sm_inject_all_acc() is called right away.
+ *
+ * @event: state-machine event; only event->txn_id is read here.
+ * @ctx:   not used by this function.
+ *
+ * Returns the result of glusterd_set_txn_opinfo(), or the result of
+ * glusterd_op_sm_inject_all_acc() when no request is pending.  Per-peer
+ * send failures are logged and skipped; they are not returned.
+ */
+static int
+glusterd_op_ac_send_unlock(glusterd_op_sm_event_t *event, void *ctx)
+{
+    int ret = 0;
+    rpc_clnt_procedure_t *proc = NULL;
+    glusterd_conf_t *priv = NULL;
+    xlator_t *this = NULL;
+    glusterd_peerinfo_t *peerinfo = NULL;
+    uint32_t pending_count = 0;
+    dict_t *dict = NULL;
+
+    this = THIS;
+    priv = this->private;
+    GF_ASSERT(priv);
+
+    RCU_READ_LOCK;
+    cds_list_for_each_entry_rcu(peerinfo, &priv->peers, uuid_list)
+    {
+        /* Only send requests to peers who were available before the
+         * transaction started
+         */
+        if (peerinfo->generation > opinfo.txn_generation)
+            continue;
+
+        /* Skip peers that are not connected, have no mgmt program, or are
+         * not marked as locked. */
+        if (!peerinfo->connected || !peerinfo->mgmt || !peerinfo->locked)
+            continue;
+        /* Non-befriended peers are only contacted for GD_OP_SYNC_VOLUME. */
+        if ((peerinfo->state.state != GD_FRIEND_STATE_BEFRIENDED) &&
+            (glusterd_op_get_op() != GD_OP_SYNC_VOLUME))
+            continue;
+        /* Based on the op_version,
+         * release the cluster or mgmt_v3 lock */
+        if (priv->op_version < GD_OP_VERSION_3_6_0) {
+            proc = &peerinfo->mgmt->proctable[GLUSTERD_MGMT_CLUSTER_UNLOCK];
+            if (proc->fn) {
+                ret = proc->fn(NULL, this, peerinfo);
+                if (ret) {
+                    /* NOTE(review): any previous opinfo.op_errstr is
+                     * overwritten without being freed - confirm ownership. */
+                    opinfo.op_errstr = gf_strdup(
+                        "Unlocking failed for one of "
+                        "the peer.");
+                    gf_msg(this->name, GF_LOG_ERROR, 0,
+                           GD_MSG_CLUSTER_UNLOCK_FAILED,
+                           "Unlocking failed for operation"
+                           " volume %s on peer %s",
+                           gd_op_list[opinfo.op], peerinfo->hostname);
+                    continue;
+                }
+                /* Request handed off: count it and clear the peer's flag. */
+                pending_count++;
+                peerinfo->locked = _gf_false;
+            }
+        } else {
+            /* NOTE(review): a reference is taken for every peer but is only
+             * dropped on the two failure paths below; confirm whether
+             * proc->fn consumes it on success and what should happen when
+             * proc->fn is NULL. */
+            dict = glusterd_op_get_ctx();
+            dict_ref(dict);
+
+            /* NOTE(review): the filter above checks peerinfo->mgmt but not
+             * peerinfo->mgmt_v3 - presumably always set when op_version is
+             * at least 3.6.0; verify. */
+            proc = &peerinfo->mgmt_v3->proctable[GLUSTERD_MGMT_V3_UNLOCK];
+            if (proc->fn) {
+                /* The target peer is passed to proc->fn through the dict. */
+                ret = dict_set_static_ptr(dict, "peerinfo", peerinfo);
+                if (ret) {
+                    opinfo.op_errstr = gf_strdup(
+                        "Unlocking failed for one of the "
+                        "peer.");
+                    gf_msg(this->name, GF_LOG_ERROR, 0,
+                           GD_MSG_CLUSTER_UNLOCK_FAILED,
+                           "Unlocking failed for operation"
+                           " volume %s on peer %s",
+                           gd_op_list[opinfo.op], peerinfo->hostname);
+                    dict_unref(dict);
+                    continue;
+                }
+
+                ret = proc->fn(NULL, this, dict);
+                if (ret) {
+                    opinfo.op_errstr = gf_strdup(
+                        "Unlocking failed for one of the "
+                        "peer.");
+                    gf_msg(this->name, GF_LOG_ERROR, 0,
+                           GD_MSG_CLUSTER_UNLOCK_FAILED,
+                           "Unlocking failed for operation"
+                           " volume %s on peer %s",
+                           gd_op_list[opinfo.op], peerinfo->hostname);
+                    dict_unref(dict);
+                    continue;
+                }
+                pending_count++;
+                peerinfo->locked = _gf_false;
+            }
+        }
+    }
+    RCU_READ_UNLOCK;
+
+    opinfo.pending_count = pending_count;
+
+    /* Persist the updated opinfo for this transaction; a failure is only
+     * logged, and ret may be overwritten just below. */
+    ret = glusterd_set_txn_opinfo(&event->txn_id, &opinfo);
+    if (ret)
+        gf_msg(THIS->name, GF_LOG_ERROR, 0, GD_MSG_TRANS_OPINFO_SET_FAIL,
+               "Unable to set "
+               "transaction's opinfo");
+
+    /* Nothing outstanding: move the state machine on immediately. */
+    if (!opinfo.pending_count)
+        ret = glusterd_op_sm_inject_all_acc(&event->txn_id);
+
+    gf_msg_debug(this->name, 0, "Returning with %d", ret);
+    return ret;
+}
+
+/* Account for one received acknowledgement.
+ *
+ * Decrements opinfo.pending_count (never below zero), persists opinfo for
+ * the transaction and, once the counter is zero, injects
+ * GD_OP_EVENT_ALL_ACK for event->txn_id.
+ *
+ * @event: state-machine event; only event->txn_id is read.
+ * @ctx:   not used.
+ *
+ * Returns the result of glusterd_op_sm_inject_event() when the counter is
+ * zero, otherwise the result of glusterd_set_txn_opinfo().
+ */
+static int
+glusterd_op_ac_ack_drain(glusterd_op_sm_event_t *event, void *ctx)
+{
+    int rc = 0;
+
+    if (opinfo.pending_count > 0) {
+        opinfo.pending_count--;
+    }
+
+    rc = glusterd_set_txn_opinfo(&event->txn_id, &opinfo);
+    if (rc != 0) {
+        /* Best effort: a failed store is logged but does not stop us. */
+        gf_msg(THIS->name, GF_LOG_ERROR, 0, GD_MSG_TRANS_OPINFO_SET_FAIL,
+               "Unable to set transaction's opinfo");
+    }
+
+    if (opinfo.pending_count == 0) {
+        rc = glusterd_op_sm_inject_event(GD_OP_EVENT_ALL_ACK, &event->txn_id,
+                                         NULL);
+    }
+
+    gf_msg_debug(THIS->name, 0, "Returning with %d", rc);
+
+    return rc;
+}
+
+/* Thin alias: forwards both arguments to glusterd_op_ac_ack_drain() and
+ * returns its result unchanged.
+ */
+static int
+glusterd_op_ac_send_unlock_drain(glusterd_op_sm_event_t *event, void *ctx)
+{
+    int rc = glusterd_op_ac_ack_drain(event, ctx);
+
+    return rc;
+}
+
+/* State-machine action: take a lock on behalf of a requesting node and send
+ * the response.
+ *
+ * @event: state-machine event; event->txn_id is passed to the mgmt_v3
+ *         response.
+ * @ctx:   a glusterd_op_lock_ctx_t carrying the requester uuid, the request
+ *         handle and, optionally, a dict.
+ *
+ * Without a dict the cluster lock (glusterd_lock) is taken.  With a dict, a
+ * mgmt_v3 lock is taken on "volname" if that key is present, otherwise on
+ * "globalname" if that one is present.  The dict is unref'd before
+ * returning.
+ *
+ * Returns the result of the last lock or dict lookup performed.
+ */
+static int
+glusterd_op_ac_lock(glusterd_op_sm_event_t *event, void *ctx)
+{
+    glusterd_op_lock_ctx_t *lock_ctx = NULL;
+    glusterd_conf_t *conf = NULL;
+    xlator_t *this = NULL;
+    char *volname = NULL;
+    char *globalname = NULL;
+    uint32_t op_errno = 0;
+    uint32_t timeout = 0;
+    int32_t ret = 0;
+
+    GF_ASSERT(event);
+    GF_ASSERT(ctx);
+
+    this = THIS;
+    GF_ASSERT(this);
+    conf = this->private;
+    GF_ASSERT(conf);
+
+    lock_ctx = (glusterd_op_lock_ctx_t *)ctx;
+
+    /* A request from a node on an older op_version carries no dict; such
+     * requests get the cluster lock and the matching response. */
+    if (lock_ctx->dict == NULL) {
+        ret = glusterd_lock(lock_ctx->uuid);
+        glusterd_op_lock_send_resp(lock_ctx->req, ret);
+
+        gf_msg_debug(THIS->name, 0, "Lock Returned %d", ret);
+        return ret;
+    }
+
+    /* The CLI adds a "timeout" key when a non-default timeout was asked
+     * for; in that case the mgmt_v3 lock timeout becomes timeout + 120
+     * seconds.  When the key is absent the configured value is left alone. */
+    ret = dict_get_uint32(lock_ctx->dict, "timeout", &timeout);
+    if (ret == 0) {
+        conf->mgmt_v3_lock_timeout = timeout + 120;
+    }
+
+    ret = dict_get_strn(lock_ctx->dict, "volname", SLEN("volname"), &volname);
+    if (ret == 0) {
+        /* Volume-scoped lock; "globalname" is not consulted in this case. */
+        ret = glusterd_mgmt_v3_lock(volname, lock_ctx->uuid, &op_errno, "vol");
+        if (ret != 0) {
+            gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_MGMTV3_LOCK_GET_FAIL,
+                   "Unable to acquire lock for %s", volname);
+        }
+    } else {
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_DICT_GET_FAILED,
+               "Unable to acquire volname");
+
+        /* No volume name: fall back to a global lock when one is named. */
+        ret = dict_get_strn(lock_ctx->dict, "globalname", SLEN("globalname"),
+                            &globalname);
+        if (ret == 0) {
+            ret = glusterd_mgmt_v3_lock(globalname, lock_ctx->uuid, &op_errno,
+                                        "global");
+            if (ret != 0) {
+                gf_msg(this->name, GF_LOG_ERROR, 0,
+                       GD_MSG_MGMTV3_LOCK_GET_FAIL,
+                       "Unable to acquire lock for %s", globalname);
+            }
+        }
+    }
+
+    glusterd_op_mgmt_v3_lock_send_resp(lock_ctx->req, &event->txn_id, ret);
+
+    dict_unref(lock_ctx->dict);
+
+    gf_msg_debug(THIS->name, 0, "Lock Returned %d", ret);
+    return ret;
+}
+
+/* State-machine action: release a lock on behalf of a requesting node and
+ * send the response.
+ *
+ * @event: state-machine event; event->txn_id is passed to the mgmt_v3
+ *         response.
+ * @ctx:   a glusterd_op_lock_ctx_t carrying the requester uuid, the request
+ *         handle and, optionally, a dict.
+ *
+ * Without a dict the cluster lock is released (glusterd_unlock).  With a
+ * dict, the mgmt_v3 lock on "volname" is released if that key is present,
+ * otherwise the one on "globalname" if that key is present.  The dict is
+ * unref'd afterwards.  In every case glusterd_do_quorum_action() runs at the
+ * end when priv->pending_quorum_action is set.
+ *
+ * Returns the result of the last unlock or dict lookup performed.
+ */
+static int
+glusterd_op_ac_unlock(glusterd_op_sm_event_t *event, void *ctx)
+{
+    glusterd_op_lock_ctx_t *lock_ctx = NULL;
+    glusterd_conf_t *priv = NULL;
+    xlator_t *this = NULL;
+    char *volname = NULL;
+    char *globalname = NULL;
+    int32_t ret = 0;
+
+    GF_ASSERT(event);
+    GF_ASSERT(ctx);
+
+    this = THIS;
+    priv = this->private;
+
+    lock_ctx = (glusterd_op_lock_ctx_t *)ctx;
+
+    if (lock_ctx->dict != NULL) {
+        ret = dict_get_strn(lock_ctx->dict, "volname", SLEN("volname"),
+                            &volname);
+        if (ret == 0) {
+            /* Volume-scoped unlock; "globalname" is not consulted here. */
+            ret = glusterd_mgmt_v3_unlock(volname, lock_ctx->uuid, "vol");
+            if (ret != 0) {
+                gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_MGMTV3_UNLOCK_FAIL,
+                       "Unable to release lock for %s", volname);
+            }
+        } else {
+            gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_DICT_GET_FAILED,
+                   "Unable to acquire volname");
+
+            /* No volume name: try the global lock name instead. */
+            ret = dict_get_strn(lock_ctx->dict, "globalname",
+                                SLEN("globalname"), &globalname);
+            if (ret == 0) {
+                ret = glusterd_mgmt_v3_unlock(globalname, lock_ctx->uuid,
+                                              "global");
+                if (ret != 0) {
+                    gf_msg(this->name, GF_LOG_ERROR, 0,
+                           GD_MSG_MGMTV3_UNLOCK_FAIL,
+                           "Unable to release lock for %s", globalname);
+                }
+            }
+        }
+
+        glusterd_op_mgmt_v3_unlock_send_resp(lock_ctx->req, &event->txn_id,
+                                             ret);
+
+        dict_unref(lock_ctx->dict);
+    } else {
+        /* A request from a node on an older op_version carries no dict;
+         * release the cluster lock for it. */
+        ret = glusterd_unlock(lock_ctx->uuid);
+        glusterd_op_unlock_send_resp(lock_ctx->req, ret);
+    }
+
+    gf_msg_debug(this->name, 0, "Unlock Returned %d", ret);
+
+    if (priv->pending_quorum_action) {
+        glusterd_do_quorum_action();
+    }
+
+    return ret;
+}
+
+/* State-machine action: release the cluster lock held for the uuid that ctx
+ * points to.
+ *
+ * @event: state-machine event; only asserted to be non-NULL.
+ * @ctx:   pointer to the uuid_t passed to glusterd_unlock().
+ *
+ * Returns whatever glusterd_unlock() returns.
+ */
+static int
+glusterd_op_ac_local_unlock(glusterd_op_sm_event_t *event, void *ctx)
+{
+    uuid_t *owner = NULL;
+    int rc = 0;
+
+    GF_ASSERT(event);
+    GF_ASSERT(ctx);
+
+    owner = (uuid_t *)ctx;
+    rc = glusterd_unlock(*owner);
+
+    gf_msg_debug(THIS->name, 0, "Unlock Returned %d", rc);
+
+    return rc;
+}
+
+/* State-machine action: one peer accepted the lock request.
+ *
+ * Decrements opinfo.pending_count (never below zero) and persists opinfo.
+ * Once the counter is zero, GD_OP_EVENT_ALL_ACC is injected for
+ * event->txn_id; while acceptances are still outstanding nothing else is
+ * done.
+ *
+ * @event: state-machine event; only event->txn_id is read.
+ * @ctx:   not used.
+ *
+ * Returns the result of glusterd_set_txn_opinfo() while acceptances are
+ * still pending, otherwise the result of glusterd_op_sm_inject_event().
+ */
+static int
+glusterd_op_ac_rcvd_lock_acc(glusterd_op_sm_event_t *event, void *ctx)
+{
+    int rc = 0;
+
+    GF_ASSERT(event);
+
+    if (opinfo.pending_count > 0) {
+        opinfo.pending_count--;
+    }
+
+    rc = glusterd_set_txn_opinfo(&event->txn_id, &opinfo);
+    if (rc != 0) {
+        gf_msg(THIS->name, GF_LOG_ERROR, 0, GD_MSG_TRANS_OPINFO_SET_FAIL,
+               "Unable to set transaction's opinfo");
+    }
+
+    /* Only the last acceptance advances the state machine (and logs). */
+    if (!(opinfo.pending_count > 0)) {
+        rc = glusterd_op_sm_inject_event(GD_OP_EVENT_ALL_ACC, &event->txn_id,
+                                         NULL);
+
+        gf_msg_debug(THIS->name, 0, "Returning %d", rc);
+    }
+
+    return rc;
+}
+
+/* Store the id of volume @volname in @dict under the key "vol-id".
+ *
+ * @dict:      dictionary that receives the "vol-id" string.
+ * @volname:   name of the volume to look up with glusterd_volinfo_find().
+ * @op_errstr: out parameter; when a human-readable error message was
+ *             produced it receives a gf_strdup'd copy.  May be NULL, in
+ *             which case the message is only logged.
+ *
+ * Returns 0 on success and non-zero on failure (invalid arguments, unknown
+ * volume, allocation failure or dict failure).
+ */
+int
+glusterd_dict_set_volid(dict_t *dict, char *volname, char **op_errstr)
+{
+    int ret = -1;
+    glusterd_volinfo_t *volinfo = NULL;
+    char *volid = NULL;
+    char msg[1024] = {
+        0,
+    };
+    xlator_t *this = NULL;
+
+    this = THIS;
+    GF_ASSERT(this);
+
+    if (!dict || !volname) {
+        gf_smsg(this->name, GF_LOG_ERROR, errno, GD_MSG_INVALID_ARGUMENT, NULL);
+        goto out;
+    }
+
+    ret = glusterd_volinfo_find(volname, &volinfo);
+    if (ret) {
+        snprintf(msg, sizeof(msg), FMTSTR_CHECK_VOL_EXISTS, volname);
+        goto out;
+    }
+    /* The dict takes a dynamically allocated string, so work on a copy of
+     * the textual uuid returned by uuid_utoa(). */
+    volid = gf_strdup(uuid_utoa(volinfo->volume_id));
+    if (!volid) {
+        ret = -1;
+        goto out;
+    }
+    ret = dict_set_dynstrn(dict, "vol-id", SLEN("vol-id"), volid);
+    if (ret) {
+        snprintf(msg, sizeof(msg),
+                 "Failed to set volume id of volume"
+                 " %s",
+                 volname);
+        GF_FREE(volid);
+        goto out;
+    }
+out:
+    if (msg[0] != '\0') {
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_VOL_ID_SET_FAIL, "%s", msg);
+        /* Callers that do not want the message may pass NULL. */
+        if (op_errstr)
+            *op_errstr = gf_strdup(msg);
+    }
+    return ret;
+}
+
+/* Store a fresh, non-zero "commit-hash" value in @dict.
+ *
+ * @dict: dictionary that receives the uint32 key "commit-hash".
+ *
+ * Returns the result of dict_set_uint32().
+ */
+int
+gd_set_commit_hash(dict_t *dict)
+{
+    /* Zero-initialised so the value is well defined even if gettimeofday()
+     * were to fail; the low-order bit set below keeps it non-zero. */
+    struct timeval tv = {
+        0,
+    };
+    uint32_t hash;
+
+    /*
+     * We need a commit hash that won't conflict with others we might have
+     * set, or zero which is the implicit value if we never have.  Using
+     * seconds<<3 like this ensures that we'll only get a collision if two
+     * consecutive rebalances are separated by exactly 2^29 seconds - about
+     * 17 years - and even then there's only a 1/8 chance of a collision in
+     * the low order bits.  It's far more likely that this code will have
+     * changed completely by then.  If not, call me in 2031.
+     *
+     * P.S. Time zone changes?  Yeah, right.
+     */
+    gettimeofday(&tv, NULL);
+    /* Convert to uint32_t BEFORE shifting: shifting the signed time_t
+     * directly overflows (undefined behaviour) where time_t is 32 bits
+     * wide.  The low 32 bits of the result are the same either way. */
+    hash = (uint32_t)tv.tv_sec << 3;
+
+    /*
+     * Make sure at least one of those low-order bits is set.  The extra
+     * shifting is because not all machines have sub-millisecond time
+     * resolution.
+     */
+    hash |= 1U << ((tv.tv_usec >> 10) % 3);
+
+    return dict_set_uint32(dict, "commit-hash", hash);
+}
+
+/* Build the request dictionary that is sent to peers for an operation.
+ *
+ * @req:       out parameter; on success receives a dict the caller must
+ *             unref.  It is left untouched on failure.
+ * @op_errstr: out parameter handed to the helpers that can produce a
+ *             human-readable error message.
+ * @op_ctx:    operation context.  When NULL, the op and context come from
+ *             glusterd_op_get_op() / glusterd_op_get_ctx(); otherwise the
+ *             op is read from the "sync-mgmt-operation" key of @op_ctx and
+ *             @op_ctx itself is the context.
+ *
+ * Depending on the op, the context dict is augmented (port, volume id,
+ * commit hash) and then either copied into a new dict or returned with an
+ * extra reference.
+ *
+ * Returns 0 on success and non-zero on failure.
+ */
+int
+glusterd_op_build_payload(dict_t **req, char **op_errstr, dict_t *op_ctx)
+{
+    int ret = -1;
+    void *ctx = NULL;
+    dict_t *dict = NULL;
+    dict_t *req_dict = NULL;
+    glusterd_op_t op = GD_OP_NONE;
+    char *volname = NULL;
+    uint32_t status_cmd = GF_CLI_STATUS_NONE;
+    xlator_t *this = NULL;
+    gf_boolean_t do_common = _gf_false;
+
+    GF_ASSERT(req);
+
+    this = THIS;
+    GF_ASSERT(this);
+
+    req_dict = dict_new();
+    if (!req_dict)
+        goto out;
+
+    if (!op_ctx) {
+        op = glusterd_op_get_op();
+        ctx = (void *)glusterd_op_get_ctx();
+        if (!ctx) {
+            gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_NO_OPTIONS_GIVEN,
+                   "Null Context for "
+                   "op %d",
+                   op);
+            ret = -1;
+            goto out;
+        }
+
+    } else {
+#define GD_SYNC_OPCODE_KEY "sync-mgmt-operation"
+        ret = dict_get_int32(op_ctx, GD_SYNC_OPCODE_KEY, (int32_t *)&op);
+        if (ret) {
+            gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_DICT_GET_FAILED,
+                   "Failed to get volume"
+                   " operation");
+            goto out;
+        }
+        ctx = op_ctx;
+#undef GD_SYNC_OPCODE_KEY
+    }
+
+    dict = ctx;
+    switch (op) {
+        case GD_OP_CREATE_VOLUME: {
+            ++glusterfs_port;
+            ret = dict_set_int32n(dict, "port", SLEN("port"), glusterfs_port);
+            if (ret) {
+                gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_DICT_SET_FAILED,
+                       "Failed to set port in "
+                       "dictionary");
+                goto out;
+            }
+            dict_copy(dict, req_dict);
+        } break;
+
+        case GD_OP_GSYNC_CREATE:
+        case GD_OP_GSYNC_SET: {
+            /* The volume id is only added when the gsync arguments could
+             * be read; otherwise the context is copied as it is. */
+            ret = glusterd_op_gsync_args_get(dict, op_errstr, &volname, NULL,
+                                             NULL);
+            if (ret == 0) {
+                ret = glusterd_dict_set_volid(dict, volname, op_errstr);
+                if (ret)
+                    goto out;
+            }
+            dict_copy(dict, req_dict);
+        } break;
+
+        case GD_OP_SET_VOLUME: {
+            ret = dict_get_strn(dict, "volname", SLEN("volname"), &volname);
+            if (ret) {
+                gf_msg(this->name, GF_LOG_CRITICAL, 0, GD_MSG_DICT_GET_FAILED,
+                       "volname is not present in "
+                       "operation ctx");
+                goto out;
+            }
+            /* "help", "help-xml" and "all" are not real volume names. */
+            if (strcmp(volname, "help") && strcmp(volname, "help-xml") &&
+                strcasecmp(volname, "all")) {
+                ret = glusterd_dict_set_volid(dict, volname, op_errstr);
+                if (ret)
+                    goto out;
+            }
+            /* Hand out the context itself instead of a copy. */
+            dict_unref(req_dict);
+            req_dict = dict_ref(dict);
+        } break;
+
+        case GD_OP_REMOVE_BRICK: {
+            ret = dict_get_strn(dict, "volname", SLEN("volname"), &volname);
+            if (ret) {
+                gf_msg(this->name, GF_LOG_CRITICAL, 0, GD_MSG_DICT_GET_FAILED,
+                       "volname is not present in "
+                       "operation ctx");
+                goto out;
+            }
+
+            ret = glusterd_dict_set_volid(dict, volname, op_errstr);
+            if (ret)
+                goto out;
+
+            /* Capture the result: ret is 0 at this point, so jumping to
+             * out without it would report success with *req unset. */
+            ret = gd_set_commit_hash(dict);
+            if (ret)
+                goto out;
+
+            dict_unref(req_dict);
+            req_dict = dict_ref(dict);
+        } break;
+
+        case GD_OP_STATUS_VOLUME: {
+            ret = dict_get_uint32(dict, "cmd", &status_cmd);
+            if (ret) {
+                gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_DICT_GET_FAILED,
+                       "Status command not present "
+                       "in op ctx");
+                goto out;
+            }
+            /* "status all" carries no single volume name to resolve. */
+            if (GF_CLI_STATUS_ALL & status_cmd) {
+                dict_copy(dict, req_dict);
+                break;
+            }
+            do_common = _gf_true;
+        } break;
+
+        case GD_OP_DELETE_VOLUME:
+        case GD_OP_START_VOLUME:
+        case GD_OP_STOP_VOLUME:
+        case GD_OP_ADD_BRICK:
+        case GD_OP_REPLACE_BRICK:
+        case GD_OP_RESET_VOLUME:
+        case GD_OP_LOG_ROTATE:
+        case GD_OP_QUOTA:
+        case GD_OP_PROFILE_VOLUME:
+        case GD_OP_HEAL_VOLUME:
+        case GD_OP_STATEDUMP_VOLUME:
+        case GD_OP_CLEARLOCKS_VOLUME:
+        case GD_OP_DEFRAG_BRICK_VOLUME:
+        case GD_OP_BARRIER:
+        case GD_OP_BITROT:
+        case GD_OP_SCRUB_STATUS:
+        case GD_OP_SCRUB_ONDEMAND:
+        case GD_OP_RESET_BRICK: {
+            do_common = _gf_true;
+        } break;
+
+        case GD_OP_REBALANCE: {
+            /* As for remove-brick: do not leave a stale ret behind. */
+            ret = gd_set_commit_hash(dict);
+            if (ret)
+                goto out;
+            do_common = _gf_true;
+        } break;
+
+        case GD_OP_SYNC_VOLUME:
+        case GD_OP_COPY_FILE:
+        case GD_OP_SYS_EXEC:
+        case GD_OP_GANESHA: {
+            dict_copy(dict, req_dict);
+        } break;
+
+        default:
+            break;
+    }
+
+    /*
+     * This has been moved out of the switch so that multiple ops with
+     * other special needs can all "fall through" to it.
+     */
+    if (do_common) {
+        ret = dict_get_strn(dict, "volname", SLEN("volname"), &volname);
+        if (ret) {
+            gf_msg(this->name, GF_LOG_CRITICAL, -ret, GD_MSG_DICT_GET_FAILED,
+                   "volname is not present in "
+                   "operation ctx");
+            goto out;
+        }
+
+        if (strcasecmp(volname, "all")) {
+            ret = glusterd_dict_set_volid(dict, volname, op_errstr);
+            if (ret)
+                goto out;
+        }
+        dict_copy(dict, req_dict);
+    }
+
+    *req = req_dict;
+    ret = 0;
+
+out:
+    /* On failure the dict was never handed to the caller; drop our
+     * reference so it is not leaked. */
+    if (ret && req_dict)
+        dict_unref(req_dict);
+    return ret;
+}
+
+/* State-machine action: validate the operation locally, then send a
+ * stage-op request to every eligible peer.
+ *
+ * Steps, in order: build the request payload with
+ * glusterd_op_build_payload(), check server quorum with
+ * glusterd_validate_quorum(), run glusterd_op_stage_validate() locally, and
+ * finally walk priv->peers under the RCU read lock calling each peer's
+ * GLUSTERD_MGMT_STAGE_OP procedure.  The number of requests for which
+ * proc->fn returned 0 becomes opinfo.pending_count.
+ *
+ * On any failure opinfo.op_ret is set and GD_OP_EVENT_RCVD_RJT is injected.
+ * Independently of that, glusterd_op_sm_inject_all_acc() is called whenever
+ * opinfo.pending_count is zero at the end.
+ *
+ * @event: state-machine event; only event->txn_id is read here.
+ * @ctx:   not used by this function.
+ *
+ * Returns the last error seen, or the result of
+ * glusterd_op_sm_inject_all_acc() when nothing is pending.
+ */
+static int
+glusterd_op_ac_send_stage_op(glusterd_op_sm_event_t *event, void *ctx)
+{
+    int ret = 0;
+    int ret1 = 0;
+    rpc_clnt_procedure_t *proc = NULL;
+    glusterd_conf_t *priv = NULL;
+    xlator_t *this = NULL;
+    glusterd_peerinfo_t *peerinfo = NULL;
+    dict_t *dict = NULL;
+    dict_t *rsp_dict = NULL;
+    char *op_errstr = NULL;
+    glusterd_op_t op = GD_OP_NONE;
+    uint32_t pending_count = 0;
+
+    this = THIS;
+    GF_ASSERT(this);
+    priv = this->private;
+    GF_ASSERT(priv);
+
+    op = glusterd_op_get_op();
+
+    /* Scratch dict for glusterd_op_stage_validate(); it is released at the
+     * end without being read in this function. */
+    rsp_dict = dict_new();
+    if (!rsp_dict) {
+        gf_msg(this->name, GF_LOG_ERROR, ENOMEM, GD_MSG_DICT_CREATE_FAIL,
+               "Failed to create rsp_dict");
+        ret = -1;
+        goto out;
+    }
+
+    ret = glusterd_op_build_payload(&dict, &op_errstr, NULL);
+    if (ret) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_BRICK_OP_PAYLOAD_BUILD_FAIL,
+               LOGSTR_BUILD_PAYLOAD, gd_op_list[op]);
+        /* Fall back to a generic message when the builder gave none. */
+        if (op_errstr == NULL)
+            gf_asprintf(&op_errstr, OPERRSTR_BUILD_PAYLOAD);
+        opinfo.op_errstr = op_errstr;
+        goto out;
+    }
+
+    ret = glusterd_validate_quorum(this, op, dict, &op_errstr);
+    if (ret) {
+        gf_msg(this->name, GF_LOG_CRITICAL, 0, GD_MSG_SERVER_QUORUM_NOT_MET,
+               "Server quorum not met. Rejecting operation.");
+        opinfo.op_errstr = op_errstr;
+        goto out;
+    }
+
+    ret = glusterd_op_stage_validate(op, dict, &op_errstr, rsp_dict);
+    if (ret) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_VALIDATE_FAILED,
+               LOGSTR_STAGE_FAIL, gd_op_list[op], "localhost",
+               (op_errstr) ? ":" : " ", (op_errstr) ? op_errstr : " ");
+        if (op_errstr == NULL)
+            gf_asprintf(&op_errstr, OPERRSTR_STAGE_FAIL, "localhost");
+        opinfo.op_errstr = op_errstr;
+        goto out;
+    }
+
+    RCU_READ_LOCK;
+    cds_list_for_each_entry_rcu(peerinfo, &priv->peers, uuid_list)
+    {
+        /* Only send requests to peers who were available before the
+         * transaction started
+         */
+        if (peerinfo->generation > opinfo.txn_generation)
+            continue;
+
+        if (!peerinfo->connected || !peerinfo->mgmt)
+            continue;
+        /* Non-befriended peers are only contacted for GD_OP_SYNC_VOLUME. */
+        if ((peerinfo->state.state != GD_FRIEND_STATE_BEFRIENDED) &&
+            (glusterd_op_get_op() != GD_OP_SYNC_VOLUME))
+            continue;
+
+        proc = &peerinfo->mgmt->proctable[GLUSTERD_MGMT_STAGE_OP];
+        GF_ASSERT(proc);
+        if (proc->fn) {
+            /* The target peer is passed to proc->fn through the dict. */
+            ret = dict_set_static_ptr(dict, "peerinfo", peerinfo);
+            if (ret) {
+                /* Leave the RCU read-side section before bailing out. */
+                RCU_READ_UNLOCK;
+                gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_DICT_SET_FAILED,
+                       "failed to "
+                       "set peerinfo");
+                goto out;
+            }
+
+            ret = proc->fn(NULL, this, dict);
+            if (ret) {
+                gf_msg(this->name, GF_LOG_WARNING, 0,
+                       GD_MSG_STAGE_REQ_SEND_FAIL,
+                       "Failed to "
+                       "send stage request for operation "
+                       "'Volume %s' to peer %s",
+                       gd_op_list[op], peerinfo->hostname);
+                continue;
+            }
+            pending_count++;
+        }
+    }
+    RCU_READ_UNLOCK;
+
+    /* NOTE(review): ret still holds the result of the last proc->fn call
+     * here, so a send failure to the last peer actually contacted reaches
+     * the out: label as a failure even though earlier sends succeeded -
+     * confirm this is intended. */
+    opinfo.pending_count = pending_count;
+out:
+    if (ret)
+        opinfo.op_ret = ret;
+
+    /* Persist opinfo; a failure here is logged but kept separate (ret1) so
+     * it does not mask the primary result in ret. */
+    ret1 = glusterd_set_txn_opinfo(&event->txn_id, &opinfo);
+    if (ret1)
+        gf_msg(THIS->name, GF_LOG_ERROR, 0, GD_MSG_TRANS_OPINFO_SET_FAIL,
+               "Unable to set "
+               "transaction's opinfo");
+
+    if (rsp_dict)
+        dict_unref(rsp_dict);
+
+    if (dict)
+        dict_unref(dict);
+    if (ret) {
+        glusterd_op_sm_inject_event(GD_OP_EVENT_RCVD_RJT, &event->txn_id, NULL);
+        opinfo.op_ret = ret;
+    }
+
+    gf_msg_debug(this->name, 0,
+                 "Sent stage op request for "
+                 "'Volume %s' to %d peers",
+                 gd_op_list[op], opinfo.pending_count);
+
+    /* NOTE(review): on the early failure paths opinfo.pending_count was not
+     * updated by this function, so this test uses whatever value it already
+     * had; when that is zero, ALL_ACC is injected in addition to RCVD_RJT
+     * and ret is overwritten - verify against the state machine. */
+    if (!opinfo.pending_count)
+        ret = glusterd_op_sm_inject_all_acc(&event->txn_id);
+
+    gf_msg_debug(this->name, 0, "Returning with %d", ret);
+
+    return ret;
+}
+
+/* Replace uuid strings stored in @dict with the matching hostnames.
+ *
+ * @dict:    dictionary to rewrite in place.
+ * @key_fmt: printf-style format taking one int, used to build each key.
+ * @idx_min: first index to process (inclusive).
+ * @idx_max: end of the index range (exclusive).
+ *
+ * Keys that are missing, values that do not parse as a uuid, and uuids for
+ * which glusterd_uuid_to_hostname() returns NULL are left untouched.
+ *
+ * Returns 0 on success - including when the index range is empty - and
+ * non-zero when a hostname could not be stored in the dict.
+ */
+static int
+glusterd_op_volume_dict_uuid_to_hostname(dict_t *dict, const char *key_fmt,
+                                         int idx_min, int idx_max)
+{
+    /* Starts at 0: an empty index range means there is nothing to convert,
+     * which is not a failure. */
+    int ret = 0;
+    int i = 0;
+    char key[128];
+    int keylen;
+    char *uuid_str = NULL;
+    uuid_t uuid = {
+        0,
+    };
+    char *hostname = NULL;
+    xlator_t *this = NULL;
+
+    this = THIS;
+    GF_ASSERT(this);
+
+    GF_ASSERT(dict);
+    GF_ASSERT(key_fmt);
+
+    for (i = idx_min; i < idx_max; i++) {
+        keylen = snprintf(key, sizeof(key), key_fmt, i);
+        ret = dict_get_strn(dict, key, keylen, &uuid_str);
+        if (ret) {
+            /* A missing key is not an error; move on to the next index. */
+            ret = 0;
+            continue;
+        }
+
+        gf_msg_debug(this->name, 0, "Got uuid %s", uuid_str);
+
+        ret = gf_uuid_parse(uuid_str, uuid);
+        /* if parsing fails don't error out
+         * let the original value be retained
+         */
+        if (ret) {
+            ret = 0;
+            continue;
+        }
+
+        hostname = glusterd_uuid_to_hostname(uuid);
+        if (hostname) {
+            gf_msg_debug(this->name, 0, "%s -> %s", uuid_str, hostname);
+            ret = dict_set_dynstrn(dict, key, keylen, hostname);
+            if (ret) {
+                gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_DICT_SET_FAILED,
+                       "Error setting hostname %s to dict", hostname);
+                GF_FREE(hostname);
+                goto out;
+            }
+        }
+    }
+
+out:
+    gf_msg_debug(this->name, 0, "Returning %d", ret);
+    return ret;
+}
+
+/* Map a plain defrag status onto its LAYOUT_FIX counterpart and write the
+ * result back to @dict.
+ *
+ * @dict:   dictionary holding the status value.
+ * @key:    key under which the status is stored.
+ * @keylen: length of @key.
+ * @status: in/out; STARTED, STOPPED, COMPLETE and FAILED are replaced by the
+ *          matching GF_DEFRAG_STATUS_LAYOUT_FIX_* value, any other value is
+ *          kept.
+ *
+ * A zero status is left alone and nothing is written.  Otherwise the
+ * (possibly unchanged) status is stored with dict_set_int32n().
+ *
+ * Returns 0 when nothing had to be done, else the dict_set_int32n() result.
+ */
+static int
+reassign_defrag_status(dict_t *dict, char *key, int keylen,
+                       gf_defrag_status_t *status)
+{
+    int rc = 0;
+
+    if (*status == 0) {
+        return rc;
+    }
+
+    if (*status == GF_DEFRAG_STATUS_STARTED) {
+        *status = GF_DEFRAG_STATUS_LAYOUT_FIX_STARTED;
+    } else if (*status == GF_DEFRAG_STATUS_STOPPED) {
+        *status = GF_DEFRAG_STATUS_LAYOUT_FIX_STOPPED;
+    } else if (*status == GF_DEFRAG_STATUS_COMPLETE) {
+        *status = GF_DEFRAG_STATUS_LAYOUT_FIX_COMPLETE;
+    } else if (*status == GF_DEFRAG_STATUS_FAILED) {
+        *status = GF_DEFRAG_STATUS_LAYOUT_FIX_FAILED;
+    }
+
+    rc = dict_set_int32n(dict, key, keylen, *status);
+    if (rc != 0) {
+        gf_msg(THIS->name, GF_LOG_WARNING, 0, GD_MSG_DICT_SET_FAILED,
+               "failed to reset defrag %s in dict", key);
+    }
+
+    return rc;
+}
+
+/* Check and reassign the defrag_status enum got from the rebalance process
+ * of all peers so that the rebalance-status CLI command can display if a
+ * full-rebalance or just a fix-layout was carried out.
+ *
+ * @dict:  dictionary holding "volname" and the keys "status-1" ..
+ *         "status-<count>".
+ * @count: number of status entries to process.
+ *
+ * Nothing is rewritten unless the volume's rebal.defrag_cmd is
+ * GF_DEFRAG_CMD_START_LAYOUT_FIX.  A @count below 1 means there are no
+ * entries to rewrite and is treated as success.
+ *
+ * Returns 0 on success and non-zero when the volume or one of the status
+ * keys cannot be read, or a status cannot be written back.
+ */
+static int
+glusterd_op_check_peer_defrag_status(dict_t *dict, int count)
+{
+    glusterd_volinfo_t *volinfo = NULL;
+    gf_defrag_status_t status = GF_DEFRAG_STATUS_NOT_STARTED;
+    char key[64] = {
+        0,
+    };
+    int keylen;
+    char *volname = NULL;
+    int ret = -1;
+    int i = 1;
+
+    ret = dict_get_strn(dict, "volname", SLEN("volname"), &volname);
+    if (ret) {
+        gf_msg(THIS->name, GF_LOG_WARNING, 0, GD_MSG_DICT_GET_FAILED,
+               "Unable to get volume name");
+        goto out;
+    }
+
+    ret = glusterd_volinfo_find(volname, &volinfo);
+    if (ret) {
+        gf_msg(THIS->name, GF_LOG_WARNING, 0, GD_MSG_VOL_NOT_FOUND,
+               FMTSTR_CHECK_VOL_EXISTS, volname);
+        goto out;
+    }
+
+    if (volinfo->rebal.defrag_cmd != GF_DEFRAG_CMD_START_LAYOUT_FIX) {
+        /* Fix layout was not issued; we don't need to reassign
+           the status */
+        ret = 0;
+        goto out;
+    }
+
+    /* A pre-checked loop, so that a count below 1 does not look up a
+     * "status-1" entry that was never promised to exist. */
+    for (i = 1; i <= count; i++) {
+        keylen = snprintf(key, sizeof(key), "status-%d", i);
+        ret = dict_get_int32n(dict, key, keylen, (int32_t *)&status);
+        if (ret) {
+            gf_msg(THIS->name, GF_LOG_WARNING, 0, GD_MSG_DICT_GET_FAILED,
+                   "failed to get defrag %s", key);
+            goto out;
+        }
+        ret = reassign_defrag_status(dict, key, keylen, &status);
+        if (ret)
+            goto out;
+    }
+
+    ret = 0;
+out:
+    return ret;
+}
+
+/* Decide whether the op_ctx of a "volume status" command has to be
+   modified before it is sent back.  The check is needed because the
+   dictionary of some status commands does not carry the keys that the
+   modification relies on.
+
+   The answer is yes only when no bit of GF_CLI_STATUS_MASK is set in cmd
+   and neither GF_CLI_STATUS_BRICK nor GF_CLI_STATUS_ALL is set.
+
+   Special Cases:
+   - volume status all
+   - volume status
+
+   Regular Cases:
+   - volume status <volname> <brick>
+   - volume status <volname> mem
+   - volume status <volname> clients
+   - volume status <volname> inode
+   - volume status <volname> fd
+   - volume status <volname> callpool
+   - volume status <volname> tasks
+*/
+
+static gf_boolean_t
+glusterd_is_volume_status_modify_op_ctx(uint32_t cmd)
+{
+    /* Any sub-command selected through the mask: leave op_ctx alone. */
+    if ((cmd & GF_CLI_STATUS_MASK) != GF_CLI_STATUS_NONE) {
+        return _gf_false;
+    }
+
+    /* Single-brick and "all" requests are left alone as well. */
+    if (cmd & (GF_CLI_STATUS_BRICK | GF_CLI_STATUS_ALL)) {
+        return _gf_false;
+    }
+
+    return _gf_true;
+}
+
+/* Move rdma port values from the old "brick<N>.port" key to
+ * "brick<N>.rdma_port" for bricks 0 .. brick_index_max.
+ *
+ * @op_ctx:          dictionary to rewrite in place.
+ * @brick_index_max: highest brick index to process (inclusive).
+ *
+ * For every brick that has no "brick<N>.rdma_port" entry, the value of
+ * "brick<N>.port" is copied to the rdma key and the old key is reset to
+ * "\0".  Bricks that already have the rdma key are skipped.
+ *
+ * Returns 0 on success (also when there are no bricks to process) and
+ * non-zero when the old key is missing, memory cannot be allocated or a
+ * dict update fails.
+ */
+int
+glusterd_op_modify_port_key(dict_t *op_ctx, int brick_index_max)
+{
+    char *port = NULL;
+    char *port_copy = NULL;
+    int i = 0;
+    /* Starts at 0: with a negative brick_index_max there is nothing to do,
+     * which is not a failure. */
+    int ret = 0;
+    char key[64] = {0};
+    int keylen;
+    char old_key[64] = {0};
+    int old_keylen;
+
+    for (i = 0; i <= brick_index_max; i++) {
+        keylen = snprintf(key, sizeof(key), "brick%d.rdma_port", i);
+        ret = dict_get_strn(op_ctx, key, keylen, &port);
+        if (!ret)
+            continue;
+
+        old_keylen = snprintf(old_key, sizeof(old_key), "brick%d.port", i);
+        ret = dict_get_strn(op_ctx, old_key, old_keylen, &port);
+        if (ret)
+            goto out;
+
+        /* port points into the entry stored under old_key, which is
+         * replaced just below.  Store a private copy under the new key so
+         * that it cannot be left referring to the replaced entry's
+         * storage (dict_set_strn() presumably keeps the pointer it is
+         * given rather than copying it). */
+        port_copy = gf_strdup(port);
+        if (!port_copy) {
+            ret = -1;
+            goto out;
+        }
+        ret = dict_set_dynstrn(op_ctx, key, keylen, port_copy);
+        if (ret) {
+            GF_FREE(port_copy);
+            goto out;
+        }
+
+        ret = dict_set_nstrn(op_ctx, old_key, old_keylen, "\0", SLEN("\0"));
+        if (ret)
+            goto out;
+    }
+out:
+    return ret;
+}
+
+/* This function is used to modify the op_ctx dict before sending it back
+ * to cli. This is useful in situations like changing the peer uuids to
+ * hostnames etc.
+ */
+void
+glusterd_op_modify_op_ctx(glusterd_op_t op, void *ctx)
+{
+    int ret = -1;
+    dict_t *op_ctx = NULL;
+    int brick_index_max = -1;
+    int other_count = 0;
+    int count = 0;
+    uint32_t cmd = GF_CLI_STATUS_NONE;
+    xlator_t *this = NULL;
+    glusterd_conf_t *conf = NULL;
+    char *volname = NULL;
+    glusterd_volinfo_t *volinfo = NULL;
+    char *port = 0;
+    int i = 0;
+    char key[64] = {
+        0,
+    };
+    int keylen;
+
+    this = THIS;
+    GF_ASSERT(this);
+    conf = this->private;
+
+    if (ctx)
+        op_ctx = ctx;
+    else
+        op_ctx = glusterd_op_get_ctx();
+
+    if (!op_ctx) {
+        gf_msg(this->name, GF_LOG_CRITICAL, 0, GD_MSG_OPCTX_NULL,
+               "Operation context is not present.");
+        goto out;
+    }
+
+    switch (op) {
+        case GD_OP_STATUS_VOLUME:
+            ret = dict_get_uint32(op_ctx, "cmd", &cmd);
+            if (ret) {
+                gf_msg_debug(this->name, 0, "Failed to get status cmd");
+                goto out;
+            }
+
+            if (!glusterd_is_volume_status_modify_op_ctx(cmd)) {
+                gf_msg_debug(this->name, 0,
+                             "op_ctx modification not required for status "
+                             "operation being performed");
+                goto out;
+            }
+
+            ret = dict_get_int32n(op_ctx, "brick-index-max",
+                                  SLEN("brick-index-max"), &brick_index_max);
+            if (ret) {
+                gf_msg_debug(this->name, 0, "Failed to get brick-index-max");
+                goto out;
+            }
+
+            ret = dict_get_int32n(op_ctx, "other-count", SLEN("other-count"),
+                                  &other_count);
+            if (ret) {
+                gf_msg_debug(this->name, 0, "Failed to get other-count");
+                goto out;
+            }
+
+            count = brick_index_max + other_count + 1;
+
+            /*
+             * a glusterd lesser than version 3.7 will be sending the
+             * rdma port in older key. Changing that value from here
+             * to support backward compatibility
+             */
+            ret = dict_get_strn(op_ctx, "volname", SLEN("volname"), &volname);
+            if (ret)
+                goto out;
+
+            for (i = 0; i <= brick_index_max; i++) {
+                keylen = snprintf(key, sizeof(key), "brick%d.rdma_port", i);
+                ret = dict_get_strn(op_ctx, key, keylen, &port);
+                if (ret) {
+                    ret = dict_set_nstrn(op_ctx, key, keylen, "\0", SLEN("\0"));
+                    if (ret)
+                        goto out;
+                }
+            }
+            ret = glusterd_volinfo_find(volname, &volinfo);
+            if (ret)
+                goto out;
+            if (conf->op_version < GD_OP_VERSION_3_7_0 &&
+                volinfo->transport_type == GF_TRANSPORT_RDMA) {
+                ret = glusterd_op_modify_port_key(op_ctx, brick_index_max);
+                if (ret)
+                    goto out;
+            }
+            /* add 'brick%d.peerid' into op_ctx with value of 'brick%d.path'.
+               nfs/sshd like services have this additional uuid */
+            {
+                char *uuid_str = NULL;
+                char *uuid = NULL;
+                int i;
+
+                for (i = brick_index_max + 1; i < count; i++) {
+                    keylen = snprintf(key, sizeof(key), "brick%d.path", i);
+                    ret = dict_get_strn(op_ctx, key, keylen, &uuid_str);
+                    if (!ret) {
+                        keylen = snprintf(key, sizeof(key), "brick%d.peerid",
+                                          i);
+                        uuid = gf_strdup(uuid_str);
+                        if (!uuid) {
+                            gf_msg_debug(this->name, 0,
+                                         "unable to create dup of"
+                                         " uuid_str");
+                            continue;
+                        }
+                        ret = dict_set_dynstrn(op_ctx, key, keylen, uuid);
+                        if (ret != 0) {
+                            GF_FREE(uuid);
+                        }
+                    }
+                }
+            }
+
+            ret = glusterd_op_volume_dict_uuid_to_hostname(
+                op_ctx, "brick%d.path", 0, count);
+            if (ret)
+                gf_msg(this->name, GF_LOG_WARNING, 0, GD_MSG_CONVERSION_FAILED,
+                       "Failed uuid to hostname conversion");
+
+            break;
+
+        case GD_OP_PROFILE_VOLUME:
+            ret = dict_get_str_boolean(op_ctx, "nfs", _gf_false);
+            if (!ret)
+                goto out;
+
+            ret = dict_get_int32n(op_ctx, "count", SLEN("count"), &count);
+            if (ret) {
+                gf_msg_debug(this->name, 0, "Failed to get brick count");
+                goto out;
+            }
+
+            ret = glusterd_op_volume_dict_uuid_to_hostname(op_ctx, "%d-brick",
+                                                           1, (count + 1));
+            if (ret)
+                gf_msg(this->name, GF_LOG_WARNING, 0, GD_MSG_CONVERSION_FAILED,
+                       "Failed uuid to hostname conversion");
+
+            break;
+
+        /* For both rebalance and remove-brick status, the glusterd op is the
+         * same
+         */
+        case GD_OP_DEFRAG_BRICK_VOLUME:
+        case GD_OP_SCRUB_STATUS:
+        case GD_OP_SCRUB_ONDEMAND:
+            ret = dict_get_int32n(op_ctx, "count", SLEN("count"), &count);
+            if (ret) {
+                gf_msg_debug(this->name, 0, "Failed to get count");
+                goto out;
+            }
+
+            /* add 'node-name-%d' into op_ctx with value uuid_str.
+               this will be used to convert to hostname later */
+            {
+                char *uuid_str = NULL;
+                char *uuid = NULL;
+                int i;
+
+                for (i = 1; i <= count; i++) {
+                    keylen = snprintf(key, sizeof(key), "node-uuid-%d", i);
+                    ret = dict_get_strn(op_ctx, key, keylen, &uuid_str);
+                    if (!ret) {
+                        keylen = snprintf(key, sizeof(key), "node-name-%d", i);
+                        uuid = gf_strdup(uuid_str);
+                        if (!uuid) {
+                            gf_msg_debug(this->name, 0,
+                                         "unable to create dup of"
+                                         " uuid_str");
+                            continue;
+                        }
+                        ret = dict_set_dynstrn(op_ctx, key, keylen, uuid);
+                        if (ret != 0) {
+                            GF_FREE(uuid);
+                        }
+                    }
+                }
+            }
+
+            ret = glusterd_op_volume_dict_uuid_to_hostname(
+                op_ctx, "node-name-%d", 1, (count + 1));
+            if (ret)
+                gf_msg(this->name, GF_LOG_WARNING, 0, GD_MSG_CONVERSION_FAILED,
+                       "Failed uuid to hostname conversion");
+
+            /* Since Both rebalance and bitrot scrub status/ondemand
+             * are going to use same code path till here, we should
+             * break in case of scrub status.
+             */
+            if (op == GD_OP_SCRUB_STATUS || op == GD_OP_SCRUB_ONDEMAND) {
+                break;
+            }
+
+            ret = glusterd_op_check_peer_defrag_status(op_ctx, count);
+            if (ret)
+                gf_msg(this->name, GF_LOG_ERROR, 0,
+                       GD_MSG_DEFRAG_STATUS_UPDATE_FAIL,
+                       "Failed to reset defrag status for fix-layout");
+            break;
+
+        default:
+            ret = 0;
+            gf_msg_debug(this->name, 0, "op_ctx modification not required");
+            break;
+    }
+
+out:
+    if (ret)
+        gf_msg(this->name, GF_LOG_WARNING, 0, GD_MSG_OPCTX_UPDATE_FAIL,
+               "op_ctx modification failed");
+    return;
+}
+
+/* Run (pre) or queue (post) the hook scripts registered for a volume op.
+ *
+ * The script directory is "<hookdir>/<cmd_subdir>/<pre|post>". Returns -1
+ * when the op has no hook sub-directory, when the path does not fit in the
+ * PATH_MAX sized buffer, or when 'type' is neither PRE nor POST; otherwise
+ * returns whatever the hook runner / post-stub enqueue helper returned.
+ */
+int
+glusterd_op_commit_hook(glusterd_op_t op, dict_t *op_ctx,
+                        glusterd_commit_hook_type_t type)
+{
+    glusterd_conf_t *conf = NULL;
+    char hooks_root[PATH_MAX] = {
+        0,
+    };
+    char script_path[PATH_MAX] = {
+        0,
+    };
+    char *phase = "";
+    char *op_subdir = NULL;
+    int32_t written = 0;
+
+    conf = THIS->private;
+
+    /* NONE and MAX are not expected here; they keep the empty phase name. */
+    if (type == GD_COMMIT_HOOK_PRE)
+        phase = "pre";
+    else if (type == GD_COMMIT_HOOK_POST)
+        phase = "post";
+
+    op_subdir = glusterd_hooks_get_hooks_cmd_subdir(op);
+    if (strlen(op_subdir) == 0)
+        return -1;
+
+    GLUSTERD_GET_HOOKS_DIR(hooks_root, GLUSTERD_HOOK_VER, conf);
+    written = snprintf(script_path, sizeof(script_path), "%s/%s/%s",
+                       hooks_root, op_subdir, phase);
+    if ((written < 0) || (written >= sizeof(script_path)))
+        return -1;
+
+    /* Pre hooks run synchronously; post hooks are only enqueued here. */
+    if (type == GD_COMMIT_HOOK_PRE)
+        return glusterd_hooks_run_hooks(script_path, op, op_ctx, type);
+    if (type == GD_COMMIT_HOOK_POST)
+        return glusterd_hooks_post_stub_enqueue(script_path, op, op_ctx);
+
+    return -1;
+}
+
+/* Op-sm action that drives the commit phase for the current op.
+ *
+ * Builds the op payload, performs the commit locally ("localhost"), then
+ * calls the GLUSTERD_MGMT_COMMIT_OP procedure of every eligible peer and
+ * stores the number of requests sent in opinfo.pending_count. If ret is
+ * non-zero at the end, GD_OP_EVENT_RCVD_RJT is injected; if no request is
+ * pending, the all-acc event is injected immediately.
+ *
+ * 'ctx' is unused. Returns the last value assigned to ret.
+ */
+static int
+glusterd_op_ac_send_commit_op(glusterd_op_sm_event_t *event, void *ctx)
+{
+    int ret = 0;
+    int ret1 = 0;
+    rpc_clnt_procedure_t *proc = NULL;
+    glusterd_conf_t *priv = NULL;
+    xlator_t *this = NULL;
+    dict_t *dict = NULL;
+    glusterd_peerinfo_t *peerinfo = NULL;
+    char *op_errstr = NULL;
+    glusterd_op_t op = GD_OP_NONE;
+    uint32_t pending_count = 0;
+
+    this = THIS;
+    GF_ASSERT(this);
+    priv = this->private;
+    GF_ASSERT(priv);
+
+    op = glusterd_op_get_op();
+
+    ret = glusterd_op_build_payload(&dict, &op_errstr, NULL);
+    if (ret) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_BRICK_OP_PAYLOAD_BUILD_FAIL,
+               LOGSTR_BUILD_PAYLOAD, gd_op_list[op]);
+        /* Fall back to a generic message when the helper gave none. */
+        if (op_errstr == NULL)
+            gf_asprintf(&op_errstr, OPERRSTR_BUILD_PAYLOAD);
+        opinfo.op_errstr = op_errstr;
+        goto out;
+    }
+
+    ret = glusterd_op_commit_perform(op, dict, &op_errstr,
+                                     NULL);  // rsp_dict invalid for source
+    if (ret) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_COMMIT_OP_FAIL,
+               LOGSTR_COMMIT_FAIL, gd_op_list[op], "localhost",
+               (op_errstr) ? ":" : " ", (op_errstr) ? op_errstr : " ");
+        if (op_errstr == NULL)
+            gf_asprintf(&op_errstr, OPERRSTR_COMMIT_FAIL, "localhost");
+        opinfo.op_errstr = op_errstr;
+        goto out;
+    }
+
+    /* The peer list is walked under the RCU read lock; every exit from the
+     * loop body that leaves the function must drop it first.
+     */
+    RCU_READ_LOCK;
+    cds_list_for_each_entry_rcu(peerinfo, &priv->peers, uuid_list)
+    {
+        /* Only send requests to peers who were available before the
+         * transaction started
+         */
+        if (peerinfo->generation > opinfo.txn_generation)
+            continue;
+
+        if (!peerinfo->connected || !peerinfo->mgmt)
+            continue;
+        /* Non-befriended peers are skipped, except for sync-volume. */
+        if ((peerinfo->state.state != GD_FRIEND_STATE_BEFRIENDED) &&
+            (glusterd_op_get_op() != GD_OP_SYNC_VOLUME))
+            continue;
+
+        proc = &peerinfo->mgmt->proctable[GLUSTERD_MGMT_COMMIT_OP];
+        GF_ASSERT(proc);
+        if (proc->fn) {
+            /* The target peer is passed to proc->fn through the dict. */
+            ret = dict_set_static_ptr(dict, "peerinfo", peerinfo);
+            if (ret) {
+                RCU_READ_UNLOCK;
+                gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_DICT_SET_FAILED,
+                       "failed to set peerinfo");
+                goto out;
+            }
+            ret = proc->fn(NULL, this, dict);
+            if (ret) {
+                gf_msg(this->name, GF_LOG_WARNING, 0,
+                       GD_MSG_COMMIT_REQ_SEND_FAIL,
+                       "Failed to "
+                       "send commit request for operation "
+                       "'Volume %s' to peer %s",
+                       gd_op_list[op], peerinfo->hostname);
+                /* NOTE(review): ret stays non-zero if this was the last
+                 * peer visited, which triggers the RJT path below even
+                 * though earlier sends were counted - confirm intended.
+                 */
+                continue;
+            }
+            pending_count++;
+        }
+    }
+    RCU_READ_UNLOCK;
+
+    opinfo.pending_count = pending_count;
+    gf_msg_debug(this->name, 0,
+                 "Sent commit op req for 'Volume %s' "
+                 "to %d peers",
+                 gd_op_list[op], opinfo.pending_count);
+out:
+    if (dict)
+        dict_unref(dict);
+
+    if (ret)
+        opinfo.op_ret = ret;
+
+    /* Persist opinfo for this transaction; a failure here is only logged
+     * (ret1 is kept separate so it does not mask ret).
+     */
+    ret1 = glusterd_set_txn_opinfo(&event->txn_id, &opinfo);
+    if (ret1)
+        gf_msg(THIS->name, GF_LOG_ERROR, 0, GD_MSG_TRANS_OPINFO_SET_FAIL,
+               "Unable to set "
+               "transaction's opinfo");
+
+    if (ret) {
+        glusterd_op_sm_inject_event(GD_OP_EVENT_RCVD_RJT, &event->txn_id, NULL);
+        opinfo.op_ret = ret;
+    }
+
+    /* Nothing to wait for: move the state machine on right away. Every op
+     * except replace-brick has its op_ctx post-processed first.
+     */
+    if (!opinfo.pending_count) {
+        if (op == GD_OP_REPLACE_BRICK) {
+            ret = glusterd_op_sm_inject_all_acc(&event->txn_id);
+        } else {
+            glusterd_op_modify_op_ctx(op, NULL);
+            ret = glusterd_op_sm_inject_all_acc(&event->txn_id);
+        }
+        goto err;
+    }
+
+err:
+    gf_msg_debug(this->name, 0, "Returning with %d", ret);
+
+    return ret;
+}
+
+/* Op-sm action for a stage-op accept: decrement the pending reply count,
+ * store opinfo for the transaction and, once nothing is pending, inject
+ * GD_OP_EVENT_STAGE_ACC. 'ctx' is unused.
+ */
+static int
+glusterd_op_ac_rcvd_stage_op_acc(glusterd_op_sm_event_t *event, void *ctx)
+{
+    int rc = 0;
+
+    GF_ASSERT(event);
+
+    if (opinfo.pending_count > 0)
+        opinfo.pending_count--;
+
+    rc = glusterd_set_txn_opinfo(&event->txn_id, &opinfo);
+    if (rc)
+        gf_msg(THIS->name, GF_LOG_ERROR, 0, GD_MSG_TRANS_OPINFO_SET_FAIL,
+               "Unable to set transaction's opinfo");
+
+    /* While replies are outstanding the store result is what gets returned. */
+    if (!(opinfo.pending_count > 0))
+        rc = glusterd_op_sm_inject_event(GD_OP_EVENT_STAGE_ACC,
+                                         &event->txn_id, NULL);
+
+    gf_msg_debug(THIS->name, 0, "Returning %d", rc);
+
+    return rc;
+}
+
+/* Op-sm action for a failed stage-op reply: decrement the pending reply
+ * count, store opinfo for the transaction and, once nothing is pending,
+ * inject GD_OP_EVENT_ALL_ACK. 'ctx' is unused.
+ */
+static int
+glusterd_op_ac_stage_op_failed(glusterd_op_sm_event_t *event, void *ctx)
+{
+    int rc = 0;
+
+    GF_ASSERT(event);
+
+    if (opinfo.pending_count > 0)
+        opinfo.pending_count--;
+
+    rc = glusterd_set_txn_opinfo(&event->txn_id, &opinfo);
+    if (rc)
+        gf_msg(THIS->name, GF_LOG_ERROR, 0, GD_MSG_TRANS_OPINFO_SET_FAIL,
+               "Unable to set transaction's opinfo");
+
+    /* While replies are outstanding the store result is what gets returned. */
+    if (!(opinfo.pending_count > 0))
+        rc = glusterd_op_sm_inject_event(GD_OP_EVENT_ALL_ACK, &event->txn_id,
+                                         NULL);
+
+    gf_msg_debug(THIS->name, 0, "Returning %d", rc);
+
+    return rc;
+}
+
+/* Op-sm action for a failed commit-op reply: decrement the pending reply
+ * count, store opinfo for the transaction and, once nothing is pending,
+ * inject GD_OP_EVENT_ALL_ACK. 'ctx' is unused.
+ */
+static int
+glusterd_op_ac_commit_op_failed(glusterd_op_sm_event_t *event, void *ctx)
+{
+    int rc = 0;
+
+    GF_ASSERT(event);
+
+    if (opinfo.pending_count > 0)
+        opinfo.pending_count--;
+
+    rc = glusterd_set_txn_opinfo(&event->txn_id, &opinfo);
+    if (rc)
+        gf_msg(THIS->name, GF_LOG_ERROR, 0, GD_MSG_TRANS_OPINFO_SET_FAIL,
+               "Unable to set transaction's opinfo");
+
+    /* While replies are outstanding the store result is what gets returned. */
+    if (!(opinfo.pending_count > 0))
+        rc = glusterd_op_sm_inject_event(GD_OP_EVENT_ALL_ACK, &event->txn_id,
+                                         NULL);
+
+    gf_msg_debug(THIS->name, 0, "Returning %d", rc);
+
+    return rc;
+}
+
+/* Op-sm action for a failed brick-op response.
+ *
+ * 'ctx' is a glusterd_op_brick_rsp_ctx_t. Its pending node is removed from
+ * opinfo.pending_bricks, the brick pending count is decremented and the
+ * first non-zero op_ret / first error string are recorded in opinfo. When
+ * no brick response is pending any more, GD_OP_EVENT_ALL_ACK is injected
+ * with the response's commit_ctx.
+ *
+ * On every path the response dict is unref'd and 'ctx' itself is freed.
+ * Returns -1 for a response from an unknown node, otherwise the last value
+ * assigned to ret.
+ */
+static int
+glusterd_op_ac_brick_op_failed(glusterd_op_sm_event_t *event, void *ctx)
+{
+    int ret = 0;
+    glusterd_op_brick_rsp_ctx_t *ev_ctx = NULL;
+    gf_boolean_t free_errstr = _gf_false;
+    xlator_t *this = NULL;
+
+    this = THIS;
+    GF_ASSERT(this);
+
+    GF_ASSERT(event);
+    GF_ASSERT(ctx);
+    ev_ctx = ctx;
+
+    ret = glusterd_remove_pending_entry(&opinfo.pending_bricks,
+                                        ev_ctx->pending_node->node);
+    if (ret) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_UNKNOWN_RESPONSE,
+               "unknown response received ");
+        ret = -1;
+        /* The error string is not handed to opinfo, so free it below. */
+        free_errstr = _gf_true;
+        goto out;
+    }
+    if (opinfo.brick_pending_count > 0)
+        opinfo.brick_pending_count--;
+    /* Keep the first failure code seen; later ones do not overwrite it. */
+    if (opinfo.op_ret == 0)
+        opinfo.op_ret = ev_ctx->op_ret;
+
+    /* opinfo takes over the error string only if it has none yet;
+     * otherwise this response's string is freed at 'out'.
+     */
+    if (opinfo.op_errstr == NULL)
+        opinfo.op_errstr = ev_ctx->op_errstr;
+    else
+        free_errstr = _gf_true;
+
+    ret = glusterd_set_txn_opinfo(&event->txn_id, &opinfo);
+    if (ret)
+        gf_msg(THIS->name, GF_LOG_ERROR, 0, GD_MSG_TRANS_OPINFO_SET_FAIL,
+               "Unable to set "
+               "transaction's opinfo");
+
+    if (opinfo.brick_pending_count > 0)
+        goto out;
+
+    ret = glusterd_op_sm_inject_event(GD_OP_EVENT_ALL_ACK, &event->txn_id,
+                                      ev_ctx->commit_ctx);
+
+out:
+    if (ev_ctx->rsp_dict)
+        dict_unref(ev_ctx->rsp_dict);
+    if (free_errstr && ev_ctx->op_errstr)
+        GF_FREE(ev_ctx->op_errstr);
+    GF_FREE(ctx);
+    gf_msg_debug(this->name, 0, "Returning %d", ret);
+
+    return ret;
+}
+
+/* Op-sm action for a commit-op accept from a peer.
+ *
+ * Decrements the pending reply count and stores opinfo. Once nothing is
+ * pending, replace-brick injects the all-acc event directly; every other op
+ * has its op_ctx post-processed and GD_OP_EVENT_COMMIT_ACC injected. A
+ * non-zero ret at the decision point injects GD_OP_EVENT_RCVD_RJT instead.
+ * 'ctx' is unused.
+ */
+static int
+glusterd_op_ac_rcvd_commit_op_acc(glusterd_op_sm_event_t *event, void *ctx)
+{
+    xlator_t *this = NULL;
+    glusterd_op_t op = GD_OP_NONE;
+    int ret = 0;
+
+    this = THIS;
+    GF_ASSERT(this);
+    op = glusterd_op_get_op();
+    GF_ASSERT(event);
+
+    if (opinfo.pending_count > 0)
+        opinfo.pending_count--;
+
+    ret = glusterd_set_txn_opinfo(&event->txn_id, &opinfo);
+    if (ret)
+        gf_msg(THIS->name, GF_LOG_ERROR, 0, GD_MSG_TRANS_OPINFO_SET_FAIL,
+               "Unable to set transaction's opinfo");
+
+    if (!(opinfo.pending_count > 0) && op == GD_OP_REPLACE_BRICK) {
+        ret = glusterd_op_sm_inject_all_acc(&event->txn_id);
+        /* Success: replace-brick needs no commit-ack event of its own. */
+        if (!ret)
+            return ret;
+
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_RBOP_START_FAIL,
+               "Couldn't start replace-brick operation.");
+    }
+
+    if (ret) {
+        ret = glusterd_op_sm_inject_event(GD_OP_EVENT_RCVD_RJT,
+                                          &event->txn_id, NULL);
+    } else if (!opinfo.pending_count) {
+        glusterd_op_modify_op_ctx(op, NULL);
+        ret = glusterd_op_sm_inject_event(GD_OP_EVENT_COMMIT_ACC,
+                                          &event->txn_id, NULL);
+    }
+    /* otherwise replies are still outstanding: nothing to inject yet */
+
+    return ret;
+}
+
+/* Op-sm action for an unlock accept: decrement the pending reply count,
+ * store opinfo for the transaction and, once nothing is pending, inject
+ * GD_OP_EVENT_ALL_ACC. The debug trace is emitted only on that last path.
+ * 'ctx' is unused.
+ */
+static int
+glusterd_op_ac_rcvd_unlock_acc(glusterd_op_sm_event_t *event, void *ctx)
+{
+    int rc = 0;
+
+    GF_ASSERT(event);
+
+    if (opinfo.pending_count > 0)
+        opinfo.pending_count--;
+
+    rc = glusterd_set_txn_opinfo(&event->txn_id, &opinfo);
+    if (rc)
+        gf_msg(THIS->name, GF_LOG_ERROR, 0, GD_MSG_TRANS_OPINFO_SET_FAIL,
+               "Unable to set transaction's opinfo");
+
+    /* Still waiting for other peers: hand back the store result as is. */
+    if (opinfo.pending_count > 0)
+        return rc;
+
+    rc = glusterd_op_sm_inject_event(GD_OP_EVENT_ALL_ACC, &event->txn_id,
+                                     NULL);
+    gf_msg_debug(THIS->name, 0, "Returning %d", rc);
+
+    return rc;
+}
+
+/* Forget the error string recorded in the global opinfo.
+ *
+ * Only the pointer is reset; the string itself is not freed here.
+ * Always returns 0.
+ */
+int32_t
+glusterd_op_clear_errstr(void)
+{
+    opinfo.op_errstr = NULL;
+    return 0;
+}
+
+/* Point the global opinfo at the given op context. Always returns 0. */
+int32_t
+glusterd_op_set_ctx(void *ctx)
+{
+    opinfo.op_ctx = ctx;
+    return 0;
+}
+
+/* Detach the op context from the global opinfo (sets it to NULL).
+ *
+ * The context object itself is not released here. Always returns 0.
+ */
+int32_t
+glusterd_op_reset_ctx(void)
+{
+    glusterd_op_set_ctx(NULL);
+
+    return 0;
+}
+
+/* Finish the transaction identified by txn_id.
+ *
+ * Snapshots the op, its context, result codes, request and error string
+ * from the global opinfo, resets those fields, releases the cluster lock
+ * (op-version < 3.6.0) or the mgmt_v3 volume lock, sends the CLI response,
+ * frees the error string, runs any pending quorum action and finally clears
+ * the transaction's stored opinfo.
+ *
+ * Returns the result of glusterd_clear_txn_opinfo(); unlock and CLI
+ * response failures are only logged.
+ */
+int32_t
+glusterd_op_txn_complete(uuid_t *txn_id)
+{
+    int32_t ret = -1;
+    glusterd_conf_t *priv = NULL;
+    int32_t op = -1;
+    int32_t op_ret = 0;
+    int32_t op_errno = 0;
+    rpcsvc_request_t *req = NULL;
+    void *ctx = NULL;
+    char *op_errstr = NULL;
+    char *volname = NULL;
+    xlator_t *this = NULL;
+
+    this = THIS;
+    GF_ASSERT(this);
+
+    priv = this->private;
+    GF_ASSERT(priv);
+
+    /* Take local copies first: the global opinfo is reset just below. */
+    op = glusterd_op_get_op();
+    ctx = glusterd_op_get_ctx();
+    op_ret = opinfo.op_ret;
+    op_errno = opinfo.op_errno;
+    req = opinfo.req;
+    if (opinfo.op_errstr)
+        op_errstr = opinfo.op_errstr;
+
+    opinfo.op_ret = 0;
+    opinfo.op_errno = 0;
+    glusterd_op_clear_op();
+    glusterd_op_reset_ctx();
+    glusterd_op_clear_errstr();
+
+    /* Based on the op-version, we release the cluster or mgmt_v3 lock */
+    if (priv->op_version < GD_OP_VERSION_3_6_0) {
+        ret = glusterd_unlock(MY_UUID);
+        /* unlock can't/shouldn't fail here!! */
+        if (ret)
+            gf_msg(this->name, GF_LOG_CRITICAL, 0, GD_MSG_GLUSTERD_UNLOCK_FAIL,
+                   "Unable to clear local lock, ret: %d", ret);
+        else
+            gf_msg_debug(this->name, 0, "Cleared local lock");
+    } else {
+        /* A missing volname is treated as "no lock was taken". */
+        ret = dict_get_strn(ctx, "volname", SLEN("volname"), &volname);
+        if (ret)
+            gf_msg(this->name, GF_LOG_INFO, 0, GD_MSG_DICT_GET_FAILED,
+                   "No Volume name present. "
+                   "Locks have not been held.");
+
+        if (volname) {
+            ret = glusterd_mgmt_v3_unlock(volname, MY_UUID, "vol");
+            if (ret)
+                gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_MGMTV3_UNLOCK_FAIL,
+                       "Unable to release lock for %s", volname);
+        }
+    }
+
+    ret = glusterd_op_send_cli_response(op, op_ret, op_errno, req, ctx,
+                                        op_errstr);
+
+    if (ret) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_NO_CLI_RESP,
+               "Responding to cli failed, "
+               "ret: %d",
+               ret);
+        // Ignore this error, else state machine blocks
+        ret = 0;
+    }
+
+    /* An empty error string is never freed here.
+     * NOTE(review): presumably "" marks a static literal - confirm.
+     */
+    if (op_errstr && (strcmp(op_errstr, "")))
+        GF_FREE(op_errstr);
+
+    if (priv->pending_quorum_action)
+        glusterd_do_quorum_action();
+
+    /* Clearing the transaction opinfo */
+    ret = glusterd_clear_txn_opinfo(txn_id);
+    if (ret)
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_TRANS_OPINFO_CLEAR_FAIL,
+               "Unable to clear transaction's opinfo");
+
+    gf_msg_debug(this->name, 0, "Returning %d", ret);
+    return ret;
+}
+
+/* Op-sm action run once everything is unlocked: completes the transaction
+ * carried by the event and returns that result. 'ctx' is unused.
+ */
+static int
+glusterd_op_ac_unlocked_all(glusterd_op_sm_event_t *event, void *ctx)
+{
+    int rc;
+
+    GF_ASSERT(event);
+
+    rc = glusterd_op_txn_complete(&event->txn_id);
+    gf_msg_debug(THIS->name, 0, "Returning %d", rc);
+
+    return rc;
+}
+
+/* Op-sm action that runs the stage (validation) step for a request.
+ *
+ * 'ctx' is a glusterd_req_ctx_t. The op is validated with
+ * glusterd_op_stage_validate(), a heap copy of the transaction id is
+ * attached to the response dict as "transaction_id", and the validation
+ * status is sent back with glusterd_op_stage_send_resp().
+ *
+ * Returns -1 if the response dict or the txn id copy cannot be allocated;
+ * otherwise the last value assigned to ret (which may come from
+ * glusterd_clear_txn_opinfo() for skip_locking transactions).
+ */
+static int
+glusterd_op_ac_stage_op(glusterd_op_sm_event_t *event, void *ctx)
+{
+    int ret = -1;
+    glusterd_req_ctx_t *req_ctx = NULL;
+    int32_t status = 0;
+    dict_t *rsp_dict = NULL;
+    char *op_errstr = NULL;
+    dict_t *dict = NULL;
+    xlator_t *this = NULL;
+    uuid_t *txn_id = NULL;
+    glusterd_op_info_t txn_op_info = {
+        {0},
+    };
+    glusterd_conf_t *priv = NULL;
+
+    this = THIS;
+    GF_ASSERT(this);
+
+    priv = this->private;
+    GF_ASSERT(priv);
+
+    GF_ASSERT(ctx);
+
+    req_ctx = ctx;
+
+    dict = req_ctx->dict;
+
+    rsp_dict = dict_new();
+    if (!rsp_dict) {
+        gf_msg(this->name, GF_LOG_ERROR, ENOMEM, GD_MSG_DICT_CREATE_FAIL,
+               "Failed to get new dictionary");
+        return -1;
+    }
+
+    /* A validation failure is not fatal here: 'status' is reported to the
+     * requester through the response below.
+     */
+    status = glusterd_op_stage_validate(req_ctx->op, dict, &op_errstr,
+                                        rsp_dict);
+
+    if (status) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_VALIDATE_FAILED,
+               "Stage failed on operation"
+               " 'Volume %s', Status : %d",
+               gd_op_list[req_ctx->op], status);
+    }
+
+    txn_id = GF_MALLOC(sizeof(uuid_t), gf_common_mt_uuid_t);
+
+    if (txn_id)
+        gf_uuid_copy(*txn_id, event->txn_id);
+    else {
+        ret = -1;
+        goto out;
+    }
+    /* NOTE(review): this result is overwritten by dict_set_bin() below, so
+     * a lookup failure goes unnoticed and txn_op_info keeps its zeroed
+     * initial value. The commit action checks the same call - confirm
+     * whether ignoring it here is deliberate.
+     */
+    ret = glusterd_get_txn_opinfo(&event->txn_id, &txn_op_info);
+
+    ret = dict_set_bin(rsp_dict, "transaction_id", txn_id, sizeof(*txn_id));
+    if (ret) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_DICT_SET_FAILED,
+               "Failed to set transaction id.");
+        /* The dict did not take txn_id; free it and skip the cleanup. */
+        GF_FREE(txn_id);
+        txn_id = NULL;
+        goto out;
+    }
+
+    ret = glusterd_op_stage_send_resp(req_ctx->req, req_ctx->op, status,
+                                      op_errstr, rsp_dict);
+
+out:
+    if (op_errstr && (strcmp(op_errstr, "")))
+        GF_FREE(op_errstr);
+
+    gf_msg_debug(this->name, 0, "Returning with %d", ret);
+
+    /* for no volname transactions, the txn_opinfo needs to be cleaned up
+     * as there's no unlock event triggered. However if the originator node of
+     * this transaction is still running with a version lower than 60000,
+     * txn_opinfo can't be cleared as that'll lead to a race of referring op_ctx
+     * after it's being freed.
+     */
+    if (txn_op_info.skip_locking && priv->op_version >= GD_OP_VERSION_6_0 &&
+        txn_id)
+        ret = glusterd_clear_txn_opinfo(txn_id);
+
+    /* Dropped last: txn_id was handed to rsp_dict and is used just above. */
+    if (rsp_dict)
+        dict_unref(rsp_dict);
+
+    return ret;
+}
+
+/* Tell whether 'op' is one of the ops that involve a brick op: profile,
+ * status, defrag-brick, heal, scrub status and scrub on-demand.
+ */
+static gf_boolean_t
+glusterd_need_brick_op(glusterd_op_t op)
+{
+    GF_ASSERT(GD_OP_NONE < op && op < GD_OP_MAX);
+
+    if (op == GD_OP_PROFILE_VOLUME || op == GD_OP_STATUS_VOLUME ||
+        op == GD_OP_DEFRAG_BRICK_VOLUME || op == GD_OP_HEAL_VOLUME ||
+        op == GD_OP_SCRUB_STATUS || op == GD_OP_SCRUB_ONDEMAND)
+        return _gf_true;
+
+    return _gf_false;
+}
+
+/* Pick the response dict for the commit phase of 'op'.
+ *
+ * Ops that need a brick op get a new reference on the current op context;
+ * every other op gets a freshly created dict (the dict_new() result is
+ * returned as is).
+ */
+dict_t *
+glusterd_op_init_commit_rsp_dict(glusterd_op_t op)
+{
+    dict_t *cur_ctx = NULL;
+
+    GF_ASSERT(GD_OP_NONE < op && op < GD_OP_MAX);
+
+    if (!glusterd_need_brick_op(op))
+        return dict_new();
+
+    cur_ctx = glusterd_op_get_ctx();
+    GF_ASSERT(cur_ctx);
+
+    return dict_ref(cur_ctx);
+}
+
+/* Op-sm action that runs the commit step for a request.
+ *
+ * 'ctx' is a glusterd_req_ctx_t. The op is committed with
+ * glusterd_op_commit_perform() (skipped for clear-locks, which leaves
+ * status at 0), a heap copy of the transaction id is attached to the
+ * response dict as "transaction_id", and the commit status is sent back
+ * with glusterd_op_commit_send_resp().
+ *
+ * Returns -1 if the response dict or the txn id copy cannot be allocated;
+ * otherwise the last value assigned to ret (which may come from
+ * glusterd_clear_txn_opinfo() for skip_locking transactions).
+ */
+static int
+glusterd_op_ac_commit_op(glusterd_op_sm_event_t *event, void *ctx)
+{
+    int ret = 0;
+    glusterd_req_ctx_t *req_ctx = NULL;
+    int32_t status = 0;
+    char *op_errstr = NULL;
+    dict_t *dict = NULL;
+    dict_t *rsp_dict = NULL;
+    xlator_t *this = NULL;
+    uuid_t *txn_id = NULL;
+    glusterd_op_info_t txn_op_info = {
+        {0},
+    };
+    gf_boolean_t need_cleanup = _gf_true;
+
+    this = THIS;
+    GF_ASSERT(this);
+    GF_ASSERT(ctx);
+
+    req_ctx = ctx;
+
+    dict = req_ctx->dict;
+
+    rsp_dict = glusterd_op_init_commit_rsp_dict(req_ctx->op);
+    if (NULL == rsp_dict)
+        return -1;
+
+    if (GD_OP_CLEARLOCKS_VOLUME == req_ctx->op) {
+        /*clear locks should be run only on
+         * originator glusterd*/
+        status = 0;
+
+    } else {
+        status = glusterd_op_commit_perform(req_ctx->op, dict, &op_errstr,
+                                            rsp_dict);
+    }
+
+    /* A commit failure is not fatal here: 'status' is reported to the
+     * requester through the response below.
+     */
+    if (status)
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_COMMIT_OP_FAIL,
+               "Commit of operation "
+               "'Volume %s' failed: %d",
+               gd_op_list[req_ctx->op], status);
+
+    txn_id = GF_MALLOC(sizeof(uuid_t), gf_common_mt_uuid_t);
+
+    if (txn_id)
+        gf_uuid_copy(*txn_id, event->txn_id);
+    else {
+        ret = -1;
+        goto out;
+    }
+    ret = glusterd_get_txn_opinfo(&event->txn_id, &txn_op_info);
+    if (ret) {
+        gf_msg_callingfn(this->name, GF_LOG_ERROR, 0,
+                         GD_MSG_TRANS_OPINFO_GET_FAIL,
+                         "Unable to get transaction opinfo "
+                         "for transaction ID : %s",
+                         uuid_utoa(event->txn_id));
+        /* txn_id has not been handed to rsp_dict yet, so nothing else
+         * would release it: free it here instead of leaking it.
+         */
+        GF_FREE(txn_id);
+        txn_id = NULL;
+        goto out;
+    }
+
+    ret = dict_set_bin(rsp_dict, "transaction_id", txn_id, sizeof(*txn_id));
+    if (ret) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_DICT_SET_FAILED,
+               "Failed to set transaction id.");
+        if (txn_op_info.skip_locking)
+            ret = glusterd_clear_txn_opinfo(txn_id);
+        /* The dict did not take txn_id; free it and skip the cleanup. */
+        need_cleanup = _gf_false;
+        GF_FREE(txn_id);
+        txn_id = NULL;
+        goto out;
+    }
+
+    ret = glusterd_op_commit_send_resp(req_ctx->req, req_ctx->op, status,
+                                       op_errstr, rsp_dict);
+
+out:
+    if (op_errstr && (strcmp(op_errstr, "")))
+        GF_FREE(op_errstr);
+
+    /* for no volname transactions, the txn_opinfo needs to be cleaned up
+     * as there's no unlock event triggered.
+     *
+     * This must happen before rsp_dict is released: after a successful
+     * dict_set_bin() the dict holds txn_id, so dropping the last reference
+     * first could leave txn_id dangling while it is still being read.
+     */
+    if (need_cleanup && txn_id && txn_op_info.skip_locking)
+        ret = glusterd_clear_txn_opinfo(txn_id);
+
+    if (rsp_dict)
+        dict_unref(rsp_dict);
+
+    gf_msg_debug(this->name, 0, "Returning with %d", ret);
+
+    return ret;
+}
+
+/* Op-sm action that reports a failed commit back to the requester.
+ *
+ * 'ctx' is a glusterd_req_ctx_t. The response carries opinfo.op_ret and
+ * opinfo.op_errstr together with the current op context; afterwards a
+ * non-empty error string is freed and opinfo is stored for the
+ * transaction. Returns the result of glusterd_set_txn_opinfo().
+ */
+static int
+glusterd_op_ac_send_commit_failed(glusterd_op_sm_event_t *event, void *ctx)
+{
+    glusterd_req_ctx_t *rctx = NULL;
+    dict_t *cur_ctx = NULL;
+    int rc = 0;
+
+    GF_ASSERT(ctx);
+    rctx = ctx;
+
+    cur_ctx = glusterd_op_get_ctx();
+
+    /* The send result is overwritten below and never reaches the caller. */
+    rc = glusterd_op_commit_send_resp(rctx->req, rctx->op, opinfo.op_ret,
+                                      opinfo.op_errstr, cur_ctx);
+
+    if (opinfo.op_errstr && (strcmp(opinfo.op_errstr, ""))) {
+        GF_FREE(opinfo.op_errstr);
+        opinfo.op_errstr = NULL;
+    }
+
+    rc = glusterd_set_txn_opinfo(&event->txn_id, &opinfo);
+    if (rc)
+        gf_msg(THIS->name, GF_LOG_ERROR, 0, GD_MSG_TRANS_OPINFO_SET_FAIL,
+               "Unable to set transaction's opinfo");
+
+    gf_msg_debug(THIS->name, 0, "Returning with %d", rc);
+    return rc;
+}
+
+/* Move 'opinfo' to the state that 'state' (a table indexed by event type)
+ * names for 'event_type', recording the transition in the op-sm log first.
+ * A failure to log is ignored. Always returns 0.
+ */
+static int
+glusterd_op_sm_transition_state(glusterd_op_info_t *opinfo,
+                                glusterd_op_sm_t *state,
+                                glusterd_op_sm_event_type_t event_type)
+{
+    glusterd_conf_t *priv = NULL;
+    glusterd_op_sm_t *entry = NULL;
+
+    GF_ASSERT(state);
+    GF_ASSERT(opinfo);
+
+    priv = THIS->private;
+    GF_ASSERT(priv);
+
+    entry = &state[event_type];
+
+    (void)glusterd_sm_tr_log_transition_add(&priv->op_sm_log,
+                                            opinfo->state.state,
+                                            entry->next_state, event_type);
+
+    opinfo->state.state = entry->next_state;
+    return 0;
+}
+
+/* Dispatch the stage (validation) step of 'op' to its per-op handler.
+ *
+ * dict      - request dict passed to the handler
+ * op_errstr - out parameter the handler may fill with an error string
+ * rsp_dict  - response dict; only some handlers take it
+ *
+ * Returns the handler's result. For an op without a case below, ret keeps
+ * its initial value of -1 and an error is logged.
+ */
+int32_t
+glusterd_op_stage_validate(glusterd_op_t op, dict_t *dict, char **op_errstr,
+                           dict_t *rsp_dict)
+{
+    int ret = -1;
+    xlator_t *this = THIS;
+
+    switch (op) {
+        case GD_OP_CREATE_VOLUME:
+            ret = glusterd_op_stage_create_volume(dict, op_errstr, rsp_dict);
+            break;
+
+        case GD_OP_START_VOLUME:
+            ret = glusterd_op_stage_start_volume(dict, op_errstr, rsp_dict);
+            break;
+
+        case GD_OP_STOP_VOLUME:
+            ret = glusterd_op_stage_stop_volume(dict, op_errstr);
+            break;
+
+        case GD_OP_DELETE_VOLUME:
+            ret = glusterd_op_stage_delete_volume(dict, op_errstr);
+            break;
+
+        case GD_OP_ADD_BRICK:
+            ret = glusterd_op_stage_add_brick(dict, op_errstr, rsp_dict);
+            break;
+
+        case GD_OP_REPLACE_BRICK:
+            ret = glusterd_op_stage_replace_brick(dict, op_errstr, rsp_dict);
+            break;
+
+        case GD_OP_SET_VOLUME:
+            ret = glusterd_op_stage_set_volume(dict, op_errstr);
+            break;
+
+        case GD_OP_GANESHA:
+            ret = glusterd_op_stage_set_ganesha(dict, op_errstr);
+            break;
+
+        case GD_OP_RESET_VOLUME:
+            ret = glusterd_op_stage_reset_volume(dict, op_errstr);
+            break;
+        case GD_OP_REMOVE_BRICK:
+            ret = glusterd_op_stage_remove_brick(dict, op_errstr);
+            break;
+
+        case GD_OP_LOG_ROTATE:
+            ret = glusterd_op_stage_log_rotate(dict, op_errstr);
+            break;
+
+        case GD_OP_SYNC_VOLUME:
+            ret = glusterd_op_stage_sync_volume(dict, op_errstr);
+            break;
+
+        case GD_OP_GSYNC_CREATE:
+            ret = glusterd_op_stage_gsync_create(dict, op_errstr);
+            break;
+
+        case GD_OP_GSYNC_SET:
+            ret = glusterd_op_stage_gsync_set(dict, op_errstr);
+            break;
+
+        case GD_OP_PROFILE_VOLUME:
+            ret = glusterd_op_stage_stats_volume(dict, op_errstr);
+            break;
+
+        case GD_OP_QUOTA:
+            ret = glusterd_op_stage_quota(dict, op_errstr, rsp_dict);
+            break;
+
+        case GD_OP_STATUS_VOLUME:
+            ret = glusterd_op_stage_status_volume(dict, op_errstr);
+            break;
+
+        /* Rebalance and defrag-brick share one stage handler. */
+        case GD_OP_REBALANCE:
+        case GD_OP_DEFRAG_BRICK_VOLUME:
+            ret = glusterd_op_stage_rebalance(dict, op_errstr);
+            break;
+
+        case GD_OP_HEAL_VOLUME:
+            ret = glusterd_op_stage_heal_volume(dict, op_errstr);
+            break;
+
+        case GD_OP_STATEDUMP_VOLUME:
+            ret = glusterd_op_stage_statedump_volume(dict, op_errstr);
+            break;
+        case GD_OP_CLEARLOCKS_VOLUME:
+            ret = glusterd_op_stage_clearlocks_volume(dict, op_errstr);
+            break;
+
+        case GD_OP_COPY_FILE:
+            ret = glusterd_op_stage_copy_file(dict, op_errstr);
+            break;
+
+        case GD_OP_SYS_EXEC:
+            ret = glusterd_op_stage_sys_exec(dict, op_errstr);
+            break;
+
+        case GD_OP_BARRIER:
+            ret = glusterd_op_stage_barrier(dict, op_errstr);
+            break;
+
+        /* Bitrot and both scrub ops share one stage handler. */
+        case GD_OP_BITROT:
+        case GD_OP_SCRUB_STATUS:
+        case GD_OP_SCRUB_ONDEMAND:
+            ret = glusterd_op_stage_bitrot(dict, op_errstr, rsp_dict);
+            break;
+
+        default:
+            /* NOTE(review): gd_op_list[op] is indexed without a range
+             * check - assumes op is always within the table; confirm.
+             */
+            gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_INVALID_ENTRY,
+                   "Unknown op %s", gd_op_list[op]);
+    }
+
+    gf_msg_debug(this->name, 0, "OP = %d. Returning %d", op, ret);
+    return ret;
+}
+
+/* Wait until priv->blockers reads as zero, sleeping on cond_blockers
+ * (paired with big_lock) between checks.
+ */
+static void
+glusterd_wait_for_blockers(glusterd_conf_t *priv)
+{
+    while (GF_ATOMIC_GET(priv->blockers))
+        synccond_wait(&priv->cond_blockers, &priv->big_lock);
+}
+
+/* Dispatch the commit step of 'op' to its per-op handler.
+ *
+ * The pre-commit hook is invoked first (its result is ignored). After the
+ * handler runs, the post-commit hook is invoked only if the handler
+ * returned 0.
+ *
+ * dict      - request dict passed to the handler
+ * op_errstr - out parameter the handler may fill with an error string
+ * rsp_dict  - response dict; only some handlers take it
+ *
+ * Returns the handler's result. For an op without a case below, ret keeps
+ * its initial value of -1 and an error is logged.
+ */
+int32_t
+glusterd_op_commit_perform(glusterd_op_t op, dict_t *dict, char **op_errstr,
+                           dict_t *rsp_dict)
+{
+    int ret = -1;
+    xlator_t *this = THIS;
+
+    glusterd_op_commit_hook(op, dict, GD_COMMIT_HOOK_PRE);
+    switch (op) {
+        case GD_OP_CREATE_VOLUME:
+            ret = glusterd_op_create_volume(dict, op_errstr);
+            break;
+
+        case GD_OP_START_VOLUME:
+            ret = glusterd_op_start_volume(dict, op_errstr);
+            break;
+
+        case GD_OP_STOP_VOLUME:
+            ret = glusterd_op_stop_volume(dict);
+            break;
+
+        /* Delete-volume, add-brick, replace-brick and remove-brick first
+         * wait for the blockers count to drop to zero.
+         */
+        case GD_OP_DELETE_VOLUME:
+            glusterd_wait_for_blockers(this->private);
+            ret = glusterd_op_delete_volume(dict);
+            break;
+
+        case GD_OP_ADD_BRICK:
+            glusterd_wait_for_blockers(this->private);
+            ret = glusterd_op_add_brick(dict, op_errstr);
+            break;
+
+        case GD_OP_REPLACE_BRICK:
+            glusterd_wait_for_blockers(this->private);
+            ret = glusterd_op_replace_brick(dict, rsp_dict);
+            break;
+
+        case GD_OP_SET_VOLUME:
+            ret = glusterd_op_set_volume(dict, op_errstr);
+            break;
+        case GD_OP_GANESHA:
+            ret = glusterd_op_set_ganesha(dict, op_errstr);
+            break;
+        case GD_OP_RESET_VOLUME:
+            ret = glusterd_op_reset_volume(dict, op_errstr);
+            break;
+
+        case GD_OP_REMOVE_BRICK:
+            glusterd_wait_for_blockers(this->private);
+            ret = glusterd_op_remove_brick(dict, op_errstr);
+            break;
+
+        case GD_OP_LOG_ROTATE:
+            ret = glusterd_op_log_rotate(dict);
+            break;
+
+        case GD_OP_SYNC_VOLUME:
+            ret = glusterd_op_sync_volume(dict, op_errstr, rsp_dict);
+            break;
+
+        case GD_OP_GSYNC_CREATE:
+            ret = glusterd_op_gsync_create(dict, op_errstr, rsp_dict);
+            break;
+
+        case GD_OP_GSYNC_SET:
+            ret = glusterd_op_gsync_set(dict, op_errstr, rsp_dict);
+            break;
+
+        case GD_OP_PROFILE_VOLUME:
+            ret = glusterd_op_stats_volume(dict, op_errstr, rsp_dict);
+            break;
+
+        case GD_OP_QUOTA:
+            ret = glusterd_op_quota(dict, op_errstr, rsp_dict);
+            break;
+
+        case GD_OP_STATUS_VOLUME:
+            ret = glusterd_op_status_volume(dict, op_errstr, rsp_dict);
+            break;
+
+        /* Rebalance and defrag-brick share one commit handler. */
+        case GD_OP_REBALANCE:
+        case GD_OP_DEFRAG_BRICK_VOLUME:
+            ret = glusterd_op_rebalance(dict, op_errstr, rsp_dict);
+            break;
+
+        case GD_OP_HEAL_VOLUME:
+            ret = glusterd_op_heal_volume(dict, op_errstr);
+            break;
+
+        case GD_OP_STATEDUMP_VOLUME:
+            ret = glusterd_op_statedump_volume(dict, op_errstr);
+            break;
+
+        case GD_OP_CLEARLOCKS_VOLUME:
+            ret = glusterd_op_clearlocks_volume(dict, op_errstr, rsp_dict);
+            break;
+
+        case GD_OP_COPY_FILE:
+            ret = glusterd_op_copy_file(dict, op_errstr);
+            break;
+
+        case GD_OP_SYS_EXEC:
+            ret = glusterd_op_sys_exec(dict, op_errstr, rsp_dict);
+            break;
+
+        case GD_OP_BARRIER:
+            ret = glusterd_op_barrier(dict, op_errstr);
+            break;
+
+        /* Bitrot and both scrub ops share one commit handler. */
+        case GD_OP_BITROT:
+        case GD_OP_SCRUB_STATUS:
+        case GD_OP_SCRUB_ONDEMAND:
+            ret = glusterd_op_bitrot(dict, op_errstr, rsp_dict);
+            break;
+
+        default:
+            /* NOTE(review): gd_op_list[op] is indexed without a range
+             * check - assumes op is always within the table; confirm.
+             */
+            gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_INVALID_ENTRY,
+                   "Unknown op %s", gd_op_list[op]);
+            break;
+    }
+
+    if (ret == 0)
+        glusterd_op_commit_hook(op, dict, GD_COMMIT_HOOK_POST);
+
+    gf_msg_debug(this->name, 0, "Returning %d", ret);
+    return ret;
+}
+
+/*
+ * Collect the started bricks of the volume named in a stop-volume request.
+ *
+ * The volume name and flags are extracted from @dict through
+ * glusterd_op_stop_volume_args_get().  Each started brick gets a
+ * GD_NODE_BRICK pending node appended to @selected and is immediately
+ * marked GF_BRICK_STOPPED.
+ *
+ * Returns 0 on success, the helper's non-zero code when argument or volume
+ * lookup fails (in the latter case @op_errstr is filled in), and -1 when a
+ * pending node cannot be allocated.
+ */
+static int
+glusterd_bricks_select_stop_volume(dict_t *dict, char **op_errstr,
+                                   struct cds_list_head *selected)
+{
+    glusterd_pending_node_t *entry = NULL;
+    glusterd_brickinfo_t *brick = NULL;
+    glusterd_volinfo_t *vol = NULL;
+    char *name = NULL;
+    int stop_flags = 0;
+    int rc;
+
+    rc = glusterd_op_stop_volume_args_get(dict, &name, &stop_flags);
+    if (rc != 0)
+        return rc;
+
+    rc = glusterd_volinfo_find(name, &vol);
+    if (rc != 0) {
+        gf_msg(THIS->name, GF_LOG_ERROR, 0, GD_MSG_VOL_NOT_FOUND,
+               FMTSTR_CHECK_VOL_EXISTS, name);
+        gf_asprintf(op_errstr, FMTSTR_CHECK_VOL_EXISTS, name);
+        return rc;
+    }
+
+    cds_list_for_each_entry(brick, &vol->bricks, brick_list)
+    {
+        if (!glusterd_is_brick_started(brick))
+            continue;
+
+        entry = GF_CALLOC(1, sizeof(*entry), gf_gld_mt_pending_node_t);
+        if (entry == NULL)
+            return -1;
+
+        entry->node = brick;
+        entry->type = GD_NODE_BRICK;
+        cds_list_add_tail(&entry->list, selected);
+
+        /*
+         * This is not really the right place to do it, but
+         * it's the most convenient.
+         * TBD: move this to *after* the RPC
+         */
+        brick->status = GF_BRICK_STOPPED;
+    }
+
+    return 0;
+}
+
+/*
+ * Build the list of bricks that must be contacted for a remove-brick
+ * operation.
+ *
+ * Reads "volname", "count", "command" and "force" from @dict, then walks
+ * the "brick1".."brick<count>" keys.  Every named brick that is currently
+ * started gets a GD_NODE_BRICK pending node appended to @selected and is
+ * marked GF_BRICK_STOPPED.
+ *
+ * "command" must be present but its value is not otherwise used here.
+ * When the "force" key is absent the function logs at INFO level and
+ * returns 0 without selecting any brick.
+ *
+ * Returns 0 on success, the failing helper's non-zero code on a
+ * dict/volume/brick lookup failure, and -1 on allocation failure.
+ * @op_errstr is not written by this function.
+ */
+static int
+glusterd_bricks_select_remove_brick(dict_t *dict, char **op_errstr,
+                                    struct cds_list_head *selected)
+{
+    int ret = -1;
+    char *volname = NULL;
+    glusterd_volinfo_t *volinfo = NULL;
+    glusterd_brickinfo_t *brickinfo = NULL;
+    char *brick = NULL;
+    int32_t count = 0;
+    int32_t i = 1;
+    char key[64] = {
+        0,
+    };
+    int keylen;
+    glusterd_pending_node_t *pending_node = NULL;
+    int32_t command = 0;
+    int32_t force = 0;
+
+    ret = dict_get_strn(dict, "volname", SLEN("volname"), &volname);
+
+    if (ret) {
+        gf_msg("glusterd", GF_LOG_ERROR, 0, GD_MSG_DICT_GET_FAILED,
+               "Unable to get volume name");
+        goto out;
+    }
+
+    ret = glusterd_volinfo_find(volname, &volinfo);
+
+    if (ret) {
+        /* The lookup failed; this is not an allocation error. */
+        gf_msg("glusterd", GF_LOG_ERROR, 0, GD_MSG_VOL_NOT_FOUND,
+               "Volume %s does not exist", volname);
+        goto out;
+    }
+
+    ret = dict_get_int32n(dict, "count", SLEN("count"), &count);
+    if (ret) {
+        gf_msg("glusterd", GF_LOG_ERROR, -ret, GD_MSG_DICT_GET_FAILED,
+               "Unable to get count");
+        goto out;
+    }
+
+    ret = dict_get_int32n(dict, "command", SLEN("command"), &command);
+    if (ret) {
+        gf_msg("glusterd", GF_LOG_ERROR, -ret, GD_MSG_DICT_GET_FAILED,
+               "Unable to get command");
+        goto out;
+    }
+
+    ret = dict_get_int32n(dict, "force", SLEN("force"), &force);
+    if (ret) {
+        /* Without "force" no brick is selected and the call succeeds. */
+        gf_msg(THIS->name, GF_LOG_INFO, 0, GD_MSG_DICT_GET_FAILED,
+               "force flag is not set");
+        ret = 0;
+        goto out;
+    }
+
+    /* Brick keys are 1-based: "brick1" .. "brick<count>". */
+    while (i <= count) {
+        keylen = snprintf(key, sizeof(key), "brick%d", i);
+
+        ret = dict_get_strn(dict, key, keylen, &brick);
+        if (ret) {
+            gf_msg("glusterd", GF_LOG_ERROR, 0, GD_MSG_DICT_GET_FAILED,
+                   "Unable to get brick");
+            goto out;
+        }
+
+        ret = glusterd_volume_brickinfo_get_by_brick(brick, volinfo, &brickinfo,
+                                                     _gf_false);
+
+        if (ret)
+            goto out;
+
+        if (glusterd_is_brick_started(brickinfo)) {
+            pending_node = GF_CALLOC(1, sizeof(*pending_node),
+                                     gf_gld_mt_pending_node_t);
+            if (!pending_node) {
+                ret = -1;
+                goto out;
+            } else {
+                pending_node->node = brickinfo;
+                pending_node->type = GD_NODE_BRICK;
+                cds_list_add_tail(&pending_node->list, selected);
+                pending_node = NULL;
+            }
+            /*
+             * This is not really the right place to do it, but
+             * it's the most convenient.
+             * TBD: move this to *after* the RPC
+             */
+            brickinfo->status = GF_BRICK_STOPPED;
+        }
+        i++;
+    }
+
+out:
+    return ret;
+}
+
+/*
+ * Select the nodes that must be contacted for a volume profile/top
+ * request.
+ *
+ * Reads "volname" and "op" from @dict and dispatches on the stats op:
+ *   - GF_CLI_STATS_START / GF_CLI_STATS_STOP: nothing is selected.
+ *   - GF_CLI_STATS_INFO: every started brick whose pidfile reports a
+ *     running service is appended to @selected.
+ *   - GF_CLI_STATS_TOP: if @dict carries a "brick" key only that brick is
+ *     selected (and only when it is started); otherwise every started
+ *     brick is selected.
+ * When built with BUILD_GNFS and the "nfs" boolean is set in @dict, the
+ * INFO and TOP ops select the NFS service instead of bricks and fail with
+ * -1 if that service is not online.
+ *
+ * On an unknown volume, @op_errstr receives a duplicated error message.
+ * Returns 0 on success and non-zero on failure (-1 for allocation
+ * failures and invalid ops).
+ */
+static int
+glusterd_bricks_select_profile_volume(dict_t *dict, char **op_errstr,
+                                      struct cds_list_head *selected)
+{
+    int ret = -1;
+    char *volname = NULL;
+    char msg[2048] = {
+        0,
+    };
+    glusterd_conf_t *priv = NULL;
+    glusterd_volinfo_t *volinfo = NULL;
+    xlator_t *this = NULL;
+    int32_t stats_op = GF_CLI_STATS_NONE;
+    glusterd_brickinfo_t *brickinfo = NULL;
+    glusterd_pending_node_t *pending_node = NULL;
+    char *brick = NULL;
+    int32_t pid = -1;
+    char pidfile[PATH_MAX] = {0};
+
+    this = THIS;
+    GF_ASSERT(this);
+    priv = this->private;
+    GF_ASSERT(priv);
+
+    ret = dict_get_strn(dict, "volname", SLEN("volname"), &volname);
+    if (ret) {
+        gf_msg("glusterd", GF_LOG_ERROR, 0, GD_MSG_DICT_GET_FAILED,
+               "volume name get failed");
+        goto out;
+    }
+
+    ret = glusterd_volinfo_find(volname, &volinfo);
+    if (ret) {
+        snprintf(msg, sizeof(msg), "Volume %s does not exists", volname);
+
+        *op_errstr = gf_strdup(msg);
+        gf_msg("glusterd", GF_LOG_ERROR, 0, GD_MSG_VOL_NOT_FOUND, "%s", msg);
+        goto out;
+    }
+
+    ret = dict_get_int32n(dict, "op", SLEN("op"), &stats_op);
+    if (ret) {
+        gf_msg("glusterd", GF_LOG_ERROR, 0, GD_MSG_DICT_GET_FAILED,
+               "volume profile op get failed");
+        goto out;
+    }
+
+    switch (stats_op) {
+        case GF_CLI_STATS_START:
+        case GF_CLI_STATS_STOP:
+            /* Nothing to select; ret is still 0 from the lookup above. */
+            goto out;
+            break;
+        case GF_CLI_STATS_INFO:
+#ifdef BUILD_GNFS
+            /* A truthy "nfs" value redirects the request to the NFS service. */
+            ret = dict_get_str_boolean(dict, "nfs", _gf_false);
+            if (ret) {
+                if (!priv->nfs_svc.online) {
+                    ret = -1;
+                    gf_msg(this->name, GF_LOG_ERROR, 0,
+                           GD_MSG_NFS_SERVER_NOT_RUNNING,
+                           "NFS server"
+                           " is not running");
+                    goto out;
+                }
+                pending_node = GF_CALLOC(1, sizeof(*pending_node),
+                                         gf_gld_mt_pending_node_t);
+                if (!pending_node) {
+                    ret = -1;
+                    goto out;
+                }
+                pending_node->node = &(priv->nfs_svc);
+                pending_node->type = GD_NODE_NFS;
+                cds_list_add_tail(&pending_node->list, selected);
+                pending_node = NULL;
+
+                ret = 0;
+                goto out;
+            }
+#endif
+            cds_list_for_each_entry(brickinfo, &volinfo->bricks, brick_list)
+            {
+                if (glusterd_is_brick_started(brickinfo)) {
+                    /*
+                     * In normal use, glusterd_is_brick_started
+                     * will give us the answer we need.  However,
+                     * in our tests the brick gets detached behind
+                     * our back, so we need to double-check this
+                     * way.
+                     */
+                    GLUSTERD_GET_BRICK_PIDFILE(pidfile, volinfo, brickinfo,
+                                               priv);
+                    if (!gf_is_service_running(pidfile, &pid)) {
+                        continue;
+                    }
+                    pending_node = GF_CALLOC(1, sizeof(*pending_node),
+                                             gf_gld_mt_pending_node_t);
+                    if (!pending_node) {
+                        ret = -1;
+                        goto out;
+                    } else {
+                        pending_node->node = brickinfo;
+                        pending_node->type = GD_NODE_BRICK;
+                        cds_list_add_tail(&pending_node->list, selected);
+                        pending_node = NULL;
+                    }
+                }
+            }
+            break;
+
+        case GF_CLI_STATS_TOP:
+#ifdef BUILD_GNFS
+            /* Same NFS redirection as in the GF_CLI_STATS_INFO case. */
+            ret = dict_get_str_boolean(dict, "nfs", _gf_false);
+            if (ret) {
+                if (!priv->nfs_svc.online) {
+                    ret = -1;
+                    gf_msg(this->name, GF_LOG_ERROR, 0,
+                           GD_MSG_NFS_SERVER_NOT_RUNNING,
+                           "NFS server"
+                           " is not running");
+                    goto out;
+                }
+                pending_node = GF_CALLOC(1, sizeof(*pending_node),
+                                         gf_gld_mt_pending_node_t);
+                if (!pending_node) {
+                    ret = -1;
+                    goto out;
+                }
+                pending_node->node = &(priv->nfs_svc);
+                pending_node->type = GD_NODE_NFS;
+                cds_list_add_tail(&pending_node->list, selected);
+                pending_node = NULL;
+
+                ret = 0;
+                goto out;
+            }
+#endif
+            /* An explicit "brick" key narrows the selection to that brick. */
+            ret = dict_get_strn(dict, "brick", SLEN("brick"), &brick);
+            if (!ret) {
+                ret = glusterd_volume_brickinfo_get_by_brick(
+                    brick, volinfo, &brickinfo, _gf_true);
+                if (ret)
+                    goto out;
+
+                /* A brick that is not started is skipped, returning 0. */
+                if (!glusterd_is_brick_started(brickinfo))
+                    goto out;
+
+                pending_node = GF_CALLOC(1, sizeof(*pending_node),
+                                         gf_gld_mt_pending_node_t);
+                if (!pending_node) {
+                    ret = -1;
+                    goto out;
+                } else {
+                    pending_node->node = brickinfo;
+                    pending_node->type = GD_NODE_BRICK;
+                    cds_list_add_tail(&pending_node->list, selected);
+                    pending_node = NULL;
+                    goto out;
+                }
+            }
+            /* No "brick" key: not an error, fall back to all started bricks. */
+            ret = 0;
+            cds_list_for_each_entry(brickinfo, &volinfo->bricks, brick_list)
+            {
+                if (glusterd_is_brick_started(brickinfo)) {
+                    pending_node = GF_CALLOC(1, sizeof(*pending_node),
+                                             gf_gld_mt_pending_node_t);
+                    if (!pending_node) {
+                        ret = -1;
+                        goto out;
+                    } else {
+                        pending_node->node = brickinfo;
+                        pending_node->type = GD_NODE_BRICK;
+                        cds_list_add_tail(&pending_node->list, selected);
+                        pending_node = NULL;
+                    }
+                }
+            }
+            break;
+
+        default:
+            GF_ASSERT(0);
+            gf_msg("glusterd", GF_LOG_ERROR, 0, GD_MSG_INVALID_ENTRY,
+                   "Invalid profile op: %d", stats_op);
+            ret = -1;
+            goto out;
+            break;
+    }
+
+out:
+    gf_msg_debug("glusterd", 0, "Returning %d", ret);
+
+    return ret;
+}
+
+/*
+ * Return the number of bricks per heal-xlator group of @volinfo: the
+ * disperse count for disperse volumes, the replica count otherwise.
+ */
+int
+_get_hxl_children_count(glusterd_volinfo_t *volinfo)
+{
+    return (volinfo->type == GF_CLUSTER_TYPE_DISPERSE)
+               ? volinfo->disperse_count
+               : volinfo->replica_count;
+}
+
+/*
+ * Register one heal xlator in @dict.
+ *
+ * Stores the xlator name "<volname>-<disperse|replicate>-<index>" under
+ * the key "xl-<count>", then stores @index under the xlator name itself.
+ *
+ * Returns 0 on success, -1 if the name cannot be formatted, or the
+ * non-zero result of the failing dict setter.
+ */
+static int
+_add_hxlator_to_dict(dict_t *dict, glusterd_volinfo_t *volinfo, int index,
+                     int count)
+{
+    char slot_key[64] = {
+        0,
+    };
+    int slot_keylen;
+    char *xl_name = NULL;
+    char *kind = NULL;
+    int rc;
+
+    kind = (volinfo->type == GF_CLUSTER_TYPE_DISPERSE) ? "disperse"
+                                                       : "replicate";
+
+    slot_keylen = snprintf(slot_key, sizeof(slot_key), "xl-%d", count);
+
+    if (gf_asprintf(&xl_name, "%s-%s-%d", volinfo->volname, kind, index) == -1)
+        return -1;
+
+    rc = dict_set_dynstrn(dict, slot_key, slot_keylen, xl_name);
+    if (rc != 0)
+        return rc;
+
+    return dict_set_int32(dict, xl_name, index);
+}
+
+/*
+ * Find the replica group index of the brick named by the
+ * "per-replica-cmd-hostname" / "per-replica-cmd-path" keys in @dict.
+ *
+ * Bricks of @volinfo are scanned in list order; the position of the first
+ * brick whose path and hostname both match is divided by the volume's
+ * replica count to obtain the group index.
+ *
+ * Returns the group index, or -1 when @dict is NULL, either key is
+ * missing, or no brick matches.
+ */
+int
+get_replica_index_for_per_replica_cmd(glusterd_volinfo_t *volinfo, dict_t *dict)
+{
+    glusterd_brickinfo_t *brick = NULL;
+    char *want_host = NULL;
+    char *want_path = NULL;
+    int group_size;
+    int position = 0;
+
+    if (dict == NULL)
+        return -1;
+
+    if (dict_get_strn(dict, "per-replica-cmd-hostname",
+                      SLEN("per-replica-cmd-hostname"), &want_host) != 0)
+        return -1;
+
+    if (dict_get_strn(dict, "per-replica-cmd-path",
+                      SLEN("per-replica-cmd-path"), &want_path) != 0)
+        return -1;
+
+    group_size = volinfo->replica_count;
+
+    cds_list_for_each_entry(brick, &volinfo->bricks, brick_list)
+    {
+        if (gf_uuid_is_null(brick->uuid))
+            (void)glusterd_resolve_brick(brick);
+
+        if (strcmp(brick->path, want_path) == 0 &&
+            strcmp(brick->hostname, want_host) == 0)
+            return position / group_size;
+
+        position++;
+    }
+
+    return -1;
+}
+
+/*
+ * Select the heal xlator that owns the brick named by the
+ * "per-replica-cmd-path" key of @dict.
+ *
+ * @index is a running 1-based brick counter shared with the caller; a
+ * value of 0 is bumped to 1 before the scan and it is advanced once for
+ * every non-matching brick visited.  A brick matches when its uuid equals
+ * MY_UUID and its path starts with the requested path (the comparison is
+ * limited to the length of the requested path).
+ *
+ * Returns 1 when a matching brick was found (its xlator is added to @dict
+ * as "xl-0"), 0 when none matched, and -1 when @dict is NULL or the key is
+ * missing.
+ */
+int
+_select_hxlator_with_matching_brick(xlator_t *this, glusterd_volinfo_t *volinfo,
+                                    dict_t *dict, int *index)
+{
+    glusterd_brickinfo_t *brick = NULL;
+    char *wanted_path = NULL;
+    int group_size;
+
+    if (dict == NULL)
+        return -1;
+
+    if (dict_get_strn(dict, "per-replica-cmd-path",
+                      SLEN("per-replica-cmd-path"), &wanted_path) != 0)
+        return -1;
+
+    group_size = _get_hxl_children_count(volinfo);
+
+    if (*index == 0)
+        *index = 1;
+
+    cds_list_for_each_entry(brick, &volinfo->bricks, brick_list)
+    {
+        if (gf_uuid_is_null(brick->uuid))
+            (void)glusterd_resolve_brick(brick);
+
+        if (gf_uuid_compare(MY_UUID, brick->uuid) == 0 &&
+            strncmp(brick->path, wanted_path, strlen(wanted_path)) == 0) {
+            _add_hxlator_to_dict(dict, volinfo, (*index - 1) / group_size, 0);
+            return 1;
+        }
+
+        *index += 1;
+    }
+
+    return 0;
+}
+/*
+ * Add to @dict every heal xlator that has at least one brick whose uuid
+ * equals MY_UUID.
+ *
+ * Bricks are walked in list order and grouped by the count returned from
+ * _get_hxl_children_count().  @index is a running 1-based brick counter
+ * (0 is bumped to 1 first); whenever it reaches a multiple of the group
+ * size, the group is emitted if a local brick was seen in it, and
+ * @hxlator_count is incremented for each emitted group.
+ */
+void
+_select_hxlators_with_local_bricks(xlator_t *this, glusterd_volinfo_t *volinfo,
+                                   dict_t *dict, int *index, int *hxlator_count)
+{
+    glusterd_brickinfo_t *brick = NULL;
+    gf_boolean_t group_has_local = _gf_false;
+    gf_boolean_t group_complete = _gf_false;
+    int group_size;
+
+    group_size = _get_hxl_children_count(volinfo);
+
+    if (*index == 0)
+        *index = 1;
+
+    cds_list_for_each_entry(brick, &volinfo->bricks, brick_list)
+    {
+        if (gf_uuid_is_null(brick->uuid))
+            (void)glusterd_resolve_brick(brick);
+
+        if (gf_uuid_compare(MY_UUID, brick->uuid) == 0)
+            group_has_local = _gf_true;
+
+        group_complete = ((*index) % group_size == 0) ? _gf_true : _gf_false;
+
+        if (group_complete && group_has_local) {
+            _add_hxlator_to_dict(dict, volinfo, (*index - 1) / group_size,
+                                 *hxlator_count);
+            *hxlator_count += 1;
+        }
+
+        /* Start the next group with a clean slate. */
+        if (group_complete)
+            group_has_local = _gf_false;
+
+        *index += 1;
+    }
+}
+
+/*
+ * Choose the heal xlators this node should drive for a full self-heal
+ * and add them to @dict.
+ *
+ * @index is a running 1-based brick counter shared with the caller (0 is
+ * bumped to 1); it is advanced once per brick.  @hxlator_count is
+ * incremented for every xlator added and its final value is returned.
+ *
+ * The first pass records in candidate_max the greatest brick uuid that is
+ * either MY_UUID or belongs to a connected peer.  The second pass examines
+ * one brick per group of hxl_children bricks (the position within the
+ * group shifts by one for each successive group, wrapping at
+ * hxl_children) and adds the group's xlator when that brick is local, or
+ * when its peer is known but not connected and candidate_max is MY_UUID.
+ */
+int
+_select_hxlators_for_full_self_heal(xlator_t *this, glusterd_volinfo_t *volinfo,
+                                    dict_t *dict, int *index,
+                                    int *hxlator_count)
+{
+    glusterd_brickinfo_t *brickinfo = NULL;
+    int hxl_children = 0;
+    uuid_t candidate = {0};
+    int brick_index = 0;
+    glusterd_peerinfo_t *peerinfo = NULL;
+    int delta = 0;
+    uuid_t candidate_max = {0};
+
+    if ((*index) == 0)
+        (*index)++;
+    if (volinfo->type == GF_CLUSTER_TYPE_DISPERSE) {
+        hxl_children = volinfo->disperse_count;
+    } else {
+        hxl_children = volinfo->replica_count;
+    }
+
+    /* Pass 1: greatest uuid among local bricks and connected peers. */
+    cds_list_for_each_entry(brickinfo, &volinfo->bricks, brick_list)
+    {
+        if (gf_uuid_compare(brickinfo->uuid, candidate_max) > 0) {
+            if (!gf_uuid_compare(MY_UUID, brickinfo->uuid)) {
+                gf_uuid_copy(candidate_max, brickinfo->uuid);
+            } else {
+                peerinfo = glusterd_peerinfo_find(brickinfo->uuid, NULL);
+                if (peerinfo && peerinfo->connected) {
+                    gf_uuid_copy(candidate_max, brickinfo->uuid);
+                }
+            }
+        }
+    }
+
+    /* Pass 2: inspect one designated brick per group. */
+    cds_list_for_each_entry(brickinfo, &volinfo->bricks, brick_list)
+    {
+        if (gf_uuid_is_null(brickinfo->uuid))
+            (void)glusterd_resolve_brick(brickinfo);
+
+        delta %= hxl_children;
+        /* True only for the designated brick of the current group. */
+        if ((*index + delta) == (brick_index + hxl_children)) {
+            if (!gf_uuid_compare(MY_UUID, brickinfo->uuid)) {
+                gf_uuid_copy(candidate, brickinfo->uuid);
+            } else {
+                peerinfo = glusterd_peerinfo_find(brickinfo->uuid, NULL);
+                if (peerinfo && peerinfo->connected) {
+                    gf_uuid_copy(candidate, brickinfo->uuid);
+                } else if (peerinfo &&
+                           (!gf_uuid_compare(candidate_max, MY_UUID))) {
+                    /*
+                     * The designated peer is known but not connected and
+                     * this node holds the greatest reachable uuid, so it
+                     * takes the group.
+                     */
+                    _add_hxlator_to_dict(dict, volinfo,
+                                         ((*index) - 1) / hxl_children,
+                                         (*hxlator_count));
+                    (*hxlator_count)++;
+                }
+            }
+
+            /* The designated brick is local: this node owns the group. */
+            if (!gf_uuid_compare(MY_UUID, candidate)) {
+                _add_hxlator_to_dict(dict, volinfo,
+                                     ((*index) - 1) / hxl_children,
+                                     (*hxlator_count));
+                (*hxlator_count)++;
+            }
+            gf_uuid_clear(candidate);
+            brick_index += hxl_children;
+            delta++;
+        }
+
+        (*index)++;
+    }
+    return *hxlator_count;
+}
+
+/*
+ * Select, for a snapshot brick operation, the started bricks of the
+ * volume named by "volname" in @dict whose uuid equals MY_UUID.
+ *
+ * Each selected brick is appended to @selected as a GD_NODE_BRICK pending
+ * node whose index is the brick's 0-based position in the volume's brick
+ * list.
+ *
+ * Returns 0 on success, the helper's non-zero code if the volume name or
+ * volume cannot be obtained, and -1 on allocation failure.
+ */
+static int
+glusterd_bricks_select_snap(dict_t *dict, char **op_errstr,
+                            struct cds_list_head *selected)
+{
+    glusterd_pending_node_t *entry = NULL;
+    glusterd_brickinfo_t *brick = NULL;
+    glusterd_volinfo_t *vol = NULL;
+    glusterd_conf_t *priv = NULL;
+    xlator_t *this = NULL;
+    char *name = NULL;
+    int next_slot = 0;
+    int slot = 0;
+    int ret = -1;
+
+    this = THIS;
+    GF_ASSERT(this);
+    priv = this->private;
+    GF_ASSERT(priv);
+
+    ret = dict_get_strn(dict, "volname", SLEN("volname"), &name);
+    if (ret != 0) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_DICT_GET_FAILED,
+               "Unable to get volname");
+        goto out;
+    }
+
+    ret = glusterd_volinfo_find(name, &vol);
+    if (ret != 0)
+        goto out;
+
+    cds_list_for_each_entry(brick, &vol->bricks, brick_list)
+    {
+        /* Every brick consumes a slot, whether or not it is selected. */
+        slot = next_slot++;
+
+        if (gf_uuid_compare(brick->uuid, MY_UUID) != 0)
+            continue;
+        if (!glusterd_is_brick_started(brick))
+            continue;
+
+        entry = GF_CALLOC(1, sizeof(*entry), gf_gld_mt_pending_node_t);
+        if (entry == NULL) {
+            ret = -1;
+            goto out;
+        }
+
+        entry->node = brick;
+        entry->type = GD_NODE_BRICK;
+        entry->index = slot;
+        cds_list_add_tail(&entry->list, selected);
+    }
+
+    ret = 0;
+
+out:
+    gf_msg_debug(THIS->name, 0, "Returning ret %d", ret);
+    return ret;
+}
+
+/*
+ * Record "self-heal-daemon is not running" entries in @dict for the
+ * bricks of @volinfo whose uuid equals MY_UUID.
+ *
+ * For each such brick two keys are set, using the running brick counter
+ * @index: "<index>-status" (the message followed by this node's uuid
+ * string) and "<index>-shd-status" ("off").  @index is advanced once per
+ * brick visited, selected or not.
+ *
+ * When @type is PER_HEAL_XL only bricks in the replica group returned by
+ * get_replica_index_for_per_replica_cmd(@volinfo, @req_dict) are
+ * reported; failure to determine that group returns -1.
+ *
+ * Returns 0 on success or the non-zero code of the failing dict setter.
+ */
+static int
+fill_shd_status_for_local_bricks(dict_t *dict, glusterd_volinfo_t *volinfo,
+                                 cli_cmd_type type, int *index,
+                                 dict_t *req_dict)
+{
+    glusterd_brickinfo_t *brickinfo = NULL;
+    static char *msg = "self-heal-daemon is not running on";
+    char key[32] = {
+        0,
+    };
+    int keylen;
+    char value[128] = {
+        0,
+    };
+    int ret = 0;
+    xlator_t *this = NULL;
+    int cmd_replica_index = -1;
+
+    this = THIS;
+
+    if (type == PER_HEAL_XL) {
+        cmd_replica_index = get_replica_index_for_per_replica_cmd(volinfo,
+                                                                  req_dict);
+        if (cmd_replica_index == -1) {
+            gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_REPLICA_INDEX_GET_FAIL,
+                   "Could not find the "
+                   "replica index for per replica type command");
+            ret = -1;
+            goto out;
+        }
+    }
+
+    cds_list_for_each_entry(brickinfo, &volinfo->bricks, brick_list)
+    {
+        if (gf_uuid_is_null(brickinfo->uuid))
+            (void)glusterd_resolve_brick(brickinfo);
+
+        /* Skip bricks that do not belong to this node. */
+        if (gf_uuid_compare(MY_UUID, brickinfo->uuid)) {
+            (*index)++;
+            continue;
+        }
+
+        /* Per-replica commands only report the requested replica group. */
+        if (type == PER_HEAL_XL) {
+            if (cmd_replica_index != ((*index) / volinfo->replica_count)) {
+                (*index)++;
+                continue;
+            }
+        }
+        keylen = snprintf(key, sizeof(key), "%d-status", (*index));
+        snprintf(value, sizeof(value), "%s %s", msg, uuid_utoa(MY_UUID));
+        ret = dict_set_dynstrn(dict, key, keylen, gf_strdup(value));
+        if (ret) {
+            /* Fixed: the split literal used to log "Unable toset ...". */
+            gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_DICT_SET_FAILED,
+                   "Unable to"
+                   " set the dictionary for shd status msg");
+            goto out;
+        }
+        keylen = snprintf(key, sizeof(key), "%d-shd-status", (*index));
+        ret = dict_set_nstrn(dict, key, keylen, "off", SLEN("off"));
+        if (ret) {
+            gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_DICT_SET_FAILED,
+                   "Unable to"
+                   " set dictionary for shd status msg");
+            goto out;
+        }
+
+        (*index)++;
+    }
+
+out:
+    return ret;
+}
+/*
+ * Select the heal xlators relevant to @heal_op and add them to @dict.
+ *
+ * For the index-summary, heal-count and per-replica heal-count ops, if
+ * the volume's self-heal daemon service is not online nothing is
+ * selected; instead @rsp_dict is filled with "shd not running" status
+ * entries through fill_shd_status_for_local_bricks() and that call's
+ * result is returned (-1 if @rsp_dict is NULL).
+ *
+ * Otherwise the selection helper matching @heal_op is run and the
+ * resulting *@hxlator_count is returned.
+ */
+int
+glusterd_shd_select_brick_xlator(dict_t *dict, gf_xl_afr_op_t heal_op,
+                                 glusterd_volinfo_t *volinfo, int *index,
+                                 int *hxlator_count, dict_t *rsp_dict)
+{
+    glusterd_conf_t *priv = NULL;
+    glusterd_svc_t *shd = NULL;
+    xlator_t *this = NULL;
+    int rc = -1;
+
+    this = THIS;
+    GF_ASSERT(this);
+    priv = this->private;
+    GF_ASSERT(priv);
+    shd = &(volinfo->shd.svc);
+
+    if (heal_op == GF_SHD_OP_INDEX_SUMMARY ||
+        heal_op == GF_SHD_OP_STATISTICS_HEAL_COUNT) {
+        if (!shd->online) {
+            if (rsp_dict == NULL) {
+                gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_OPCTX_NULL,
+                       "Received empty ctx.");
+                return rc;
+            }
+
+            rc = fill_shd_status_for_local_bricks(rsp_dict, volinfo,
+                                                  ALL_HEAL_XL, index, dict);
+            if (rc != 0)
+                gf_msg(this->name, GF_LOG_ERROR, 0,
+                       GD_MSG_SHD_STATUS_SET_FAIL,
+                       "Unable to fill the shd status for the local bricks");
+            return rc;
+        }
+    } else if (heal_op == GF_SHD_OP_STATISTICS_HEAL_COUNT_PER_REPLICA) {
+        if (!shd->online) {
+            if (rsp_dict == NULL) {
+                gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_OPCTX_NULL,
+                       "Received empty ctx.");
+                return rc;
+            }
+
+            rc = fill_shd_status_for_local_bricks(rsp_dict, volinfo,
+                                                  PER_HEAL_XL, index, dict);
+            if (rc != 0)
+                gf_msg(this->name, GF_LOG_ERROR, 0,
+                       GD_MSG_SHD_STATUS_SET_FAIL,
+                       "Unable to fill the shd status for the local bricks.");
+            return rc;
+        }
+    }
+
+    /* The daemon is usable (or the op does not need it): pick xlators. */
+    if (heal_op == GF_SHD_OP_HEAL_FULL) {
+        _select_hxlators_for_full_self_heal(this, volinfo, dict, index,
+                                            hxlator_count);
+    } else if (heal_op == GF_SHD_OP_STATISTICS_HEAL_COUNT_PER_REPLICA) {
+        (*hxlator_count) += _select_hxlator_with_matching_brick(this, volinfo,
+                                                                dict, index);
+    } else {
+        _select_hxlators_with_local_bricks(this, volinfo, dict, index,
+                                           hxlator_count);
+    }
+
+    return *hxlator_count;
+}
+
+/*
+ * Select the self-heal daemon of a volume as the node to contact for a
+ * heal operation.
+ *
+ * Reads "volname" and "heal-op" from @dict, asks
+ * glusterd_shd_select_brick_xlator() which heal xlators are involved,
+ * stores their number under "count" in @dict and appends one GD_NODE_SHD
+ * pending node to @selected.  When no xlator is selected nothing is
+ * appended and the selector's non-negative result is returned.
+ *
+ * On an unknown volume @op_errstr receives a duplicated error message.
+ * Returns 0 on success and non-zero on failure; a missing or invalid
+ * "heal-op" is always reported as a failure.
+ */
+static int
+glusterd_bricks_select_heal_volume(dict_t *dict, char **op_errstr,
+                                   struct cds_list_head *selected,
+                                   dict_t *rsp_dict)
+{
+    int ret = -1;
+    char *volname = NULL;
+    glusterd_conf_t *priv = NULL;
+    glusterd_volinfo_t *volinfo = NULL;
+    xlator_t *this = NULL;
+    char msg[2048] = {
+        0,
+    };
+    glusterd_pending_node_t *pending_node = NULL;
+    gf_xl_afr_op_t heal_op = GF_SHD_OP_INVALID;
+    int hxlator_count = 0;
+    int index = 0;
+
+    this = THIS;
+    GF_ASSERT(this);
+    priv = this->private;
+    GF_ASSERT(priv);
+
+    ret = dict_get_strn(dict, "volname", SLEN("volname"), &volname);
+    if (ret) {
+        gf_msg("glusterd", GF_LOG_ERROR, 0, GD_MSG_DICT_GET_FAILED,
+               "volume name get failed");
+        goto out;
+    }
+
+    ret = glusterd_volinfo_find(volname, &volinfo);
+    if (ret) {
+        snprintf(msg, sizeof(msg), "Volume %s does not exist", volname);
+
+        *op_errstr = gf_strdup(msg);
+        gf_msg("glusterd", GF_LOG_ERROR, 0, GD_MSG_VOL_NOT_FOUND, "%s", msg);
+        goto out;
+    }
+
+    ret = dict_get_int32n(dict, "heal-op", SLEN("heal-op"),
+                          (int32_t *)&heal_op);
+    if (ret || (heal_op == GF_SHD_OP_INVALID)) {
+        gf_msg("glusterd", GF_LOG_ERROR, 0, GD_MSG_DICT_GET_FAILED,
+               "heal op invalid");
+        /*
+         * The key may be present yet hold GF_SHD_OP_INVALID, in which
+         * case ret is 0; make sure that is still reported as a failure.
+         */
+        if (!ret)
+            ret = -1;
+        goto out;
+    }
+    ret = glusterd_shd_select_brick_xlator(dict, heal_op, volinfo, &index,
+                                           &hxlator_count, rsp_dict);
+    if (ret < 0) {
+        goto out;
+    }
+
+    /* Nothing selected: no node to contact. */
+    if (!hxlator_count)
+        goto out;
+    if (hxlator_count == -1) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_XLATOR_COUNT_GET_FAIL,
+               "Could not determine the"
+               " translator count");
+        ret = -1;
+        goto out;
+    }
+
+    ret = dict_set_int32n(dict, "count", SLEN("count"), hxlator_count);
+    if (ret)
+        goto out;
+    pending_node = GF_CALLOC(1, sizeof(*pending_node),
+                             gf_gld_mt_pending_node_t);
+    if (!pending_node) {
+        ret = -1;
+        goto out;
+    } else {
+        pending_node->node = &(volinfo->shd.svc);
+        pending_node->type = GD_NODE_SHD;
+        cds_list_add_tail(&pending_node->list, selected);
+        pending_node = NULL;
+    }
+
+out:
+    gf_msg_debug(THIS->name, 0, "Returning ret %d", ret);
+    return ret;
+}
+
+/*
+ * Select the volume named by "volname" in @dict as the single
+ * GD_NODE_REBALANCE pending node for a rebalance operation.
+ *
+ * On an unknown volume @op_errstr receives a duplicated error message.
+ * Returns 0 on success, the helper's non-zero code on lookup failure and
+ * -1 on allocation failure.
+ */
+static int
+glusterd_bricks_select_rebalance_volume(dict_t *dict, char **op_errstr,
+                                        struct cds_list_head *selected)
+{
+    glusterd_pending_node_t *entry = NULL;
+    glusterd_volinfo_t *vol = NULL;
+    xlator_t *this = NULL;
+    char *name = NULL;
+    char errbuf[2048] = {
+        0,
+    };
+    int rc = -1;
+
+    this = THIS;
+    GF_ASSERT(this);
+
+    rc = dict_get_strn(dict, "volname", SLEN("volname"), &name);
+    if (rc != 0) {
+        gf_msg("glusterd", GF_LOG_ERROR, 0, GD_MSG_DICT_GET_FAILED,
+               "volume name get failed");
+        return rc;
+    }
+
+    rc = glusterd_volinfo_find(name, &vol);
+    if (rc != 0) {
+        snprintf(errbuf, sizeof(errbuf), "Volume %s does not exist", name);
+
+        *op_errstr = gf_strdup(errbuf);
+        gf_msg("glusterd", GF_LOG_ERROR, 0, GD_MSG_VOL_NOT_FOUND, "%s",
+               errbuf);
+        return rc;
+    }
+
+    entry = GF_CALLOC(1, sizeof(*entry), gf_gld_mt_pending_node_t);
+    if (entry == NULL)
+        return -1;
+
+    entry->node = vol;
+    entry->type = GD_NODE_REBALANCE;
+    cds_list_add_tail(&entry->list, selected);
+
+    return rc;
+}
+
+/* Allocate a pending-node entry for @node and append it to @selected.
+ *
+ * @selected: list the new entry is queued on
+ * @node:     object stored in the entry's node field
+ * @type:     GD_NODE_* tag stored in the entry's type field
+ * @index:    value stored in the entry's index field
+ *
+ * Returns 0 on success, -1 when the allocation fails (nothing is queued).
+ */
+static int
+gd_status_queue_pending_node(struct cds_list_head *selected, void *node,
+                             gd_node_type type, int index)
+{
+    glusterd_pending_node_t *pending_node = NULL;
+
+    pending_node = GF_CALLOC(1, sizeof(*pending_node),
+                             gf_gld_mt_pending_node_t);
+    if (!pending_node)
+        return -1;
+
+    pending_node->node = node;
+    pending_node->type = type;
+    pending_node->index = index;
+    cds_list_add_tail(&pending_node->list, selected);
+
+    return 0;
+}
+
+/* Pick the node(s) a volume-status brick-op has to be sent to.
+ *
+ * @dict:      request dict; "cmd" is always read, "volname" and "brick" are
+ *             read only when the command needs them
+ * @op_errstr: not written by this function
+ * @selected:  list that receives one glusterd_pending_node_t per target
+ *
+ * Nothing is selected, and 0 is returned, when GF_CLI_STATUS_ALL is set or
+ * when the masked command is not one of the detail types listed in the
+ * switch below. Otherwise the target is chosen in this order: a single named
+ * brick, NFS (only when built with BUILD_GNFS), self-heal daemon, quotad,
+ * bitd, scrubber, snapd, and finally every brick of the volume that belongs
+ * to this node and is started.
+ *
+ * Returns 0 on success and non-zero on failure (missing dict key, unknown
+ * volume or brick, requested daemon not online, allocation failure).
+ */
+static int
+glusterd_bricks_select_status_volume(dict_t *dict, char **op_errstr,
+                                     struct cds_list_head *selected)
+{
+    int ret = -1;
+    int cmd = 0;
+    int brick_index = -1;
+    char *volname = NULL;
+    char *brickname = NULL;
+    glusterd_volinfo_t *volinfo = NULL;
+    glusterd_brickinfo_t *brickinfo = NULL;
+    xlator_t *this = NULL;
+    glusterd_conf_t *priv = NULL;
+    glusterd_svc_t *svc = NULL;
+
+    GF_ASSERT(dict);
+
+    this = THIS;
+    GF_ASSERT(this);
+    priv = this->private;
+    GF_ASSERT(priv);
+
+    ret = dict_get_int32n(dict, "cmd", SLEN("cmd"), &cmd);
+    if (ret) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_DICT_GET_FAILED,
+               "Unable to get status type");
+        goto out;
+    }
+
+    /* "status all" needs no brick-op; ret is 0 at this point. */
+    if (cmd & GF_CLI_STATUS_ALL)
+        goto out;
+
+    /* Only these detail requests involve a brick-op; anything else returns
+     * 0 with an untouched list. */
+    switch (cmd & GF_CLI_STATUS_MASK) {
+        case GF_CLI_STATUS_MEM:
+        case GF_CLI_STATUS_CLIENTS:
+        case GF_CLI_STATUS_INODE:
+        case GF_CLI_STATUS_FD:
+        case GF_CLI_STATUS_CALLPOOL:
+        case GF_CLI_STATUS_NFS:
+        case GF_CLI_STATUS_SHD:
+        case GF_CLI_STATUS_QUOTAD:
+        case GF_CLI_STATUS_SNAPD:
+        case GF_CLI_STATUS_BITD:
+        case GF_CLI_STATUS_SCRUB:
+        case GF_CLI_STATUS_CLIENT_LIST:
+            break;
+        default:
+            goto out;
+    }
+    ret = dict_get_strn(dict, "volname", SLEN("volname"), &volname);
+    if (ret) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_DICT_GET_FAILED,
+               "Unable to get volname");
+        goto out;
+    }
+    ret = glusterd_volinfo_find(volname, &volinfo);
+    if (ret) {
+        goto out;
+    }
+
+    if ((cmd & GF_CLI_STATUS_BRICK) != 0) {
+        ret = dict_get_strn(dict, "brick", SLEN("brick"), &brickname);
+        if (ret) {
+            gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_DICT_GET_FAILED,
+                   "Unable to get brick");
+            goto out;
+        }
+        ret = glusterd_volume_brickinfo_get_by_brick(brickname, volinfo,
+                                                     &brickinfo, _gf_false);
+        if (ret)
+            goto out;
+
+        /* A brick owned by another node, or one that is not started, is not
+         * an error: leave the list untouched and return 0. */
+        if (gf_uuid_compare(brickinfo->uuid, MY_UUID) ||
+            !glusterd_is_brick_started(brickinfo))
+            goto out;
+
+        ret = gd_status_queue_pending_node(selected, brickinfo, GD_NODE_BRICK,
+                                           0);
+#ifdef BUILD_GNFS
+    } else if ((cmd & GF_CLI_STATUS_NFS) != 0) {
+        if (!priv->nfs_svc.online) {
+            ret = -1;
+            gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_NFS_SERVER_NOT_RUNNING,
+                   "NFS server is not running");
+            goto out;
+        }
+        ret = gd_status_queue_pending_node(selected, &(priv->nfs_svc),
+                                           GD_NODE_NFS, 0);
+#endif
+    } else if ((cmd & GF_CLI_STATUS_SHD) != 0) {
+        svc = &(volinfo->shd.svc);
+        if (!svc->online) {
+            ret = -1;
+            gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_SELF_HEALD_DISABLED,
+                   "Self-heal daemon is not running");
+            goto out;
+        }
+        ret = gd_status_queue_pending_node(selected, svc, GD_NODE_SHD, 0);
+    } else if ((cmd & GF_CLI_STATUS_QUOTAD) != 0) {
+        if (!priv->quotad_svc.online) {
+            gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_QUOTAD_NOT_RUNNING,
+                   "Quotad is not running");
+            ret = -1;
+            goto out;
+        }
+        ret = gd_status_queue_pending_node(selected, &(priv->quotad_svc),
+                                           GD_NODE_QUOTAD, 0);
+    } else if ((cmd & GF_CLI_STATUS_BITD) != 0) {
+        if (!priv->bitd_svc.online) {
+            gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_BITROT_NOT_RUNNING,
+                   "Bitrot is not running");
+            ret = -1;
+            goto out;
+        }
+        ret = gd_status_queue_pending_node(selected, &(priv->bitd_svc),
+                                           GD_NODE_BITD, 0);
+    } else if ((cmd & GF_CLI_STATUS_SCRUB) != 0) {
+        if (!priv->scrub_svc.online) {
+            gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_SCRUBBER_NOT_RUNNING,
+                   "Scrubber is not running");
+            ret = -1;
+            goto out;
+        }
+        ret = gd_status_queue_pending_node(selected, &(priv->scrub_svc),
+                                           GD_NODE_SCRUB, 0);
+    } else if ((cmd & GF_CLI_STATUS_SNAPD) != 0) {
+        if (!volinfo->snapd.svc.online) {
+            gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_SNAPD_NOT_RUNNING,
+                   "snapd is not running");
+            ret = -1;
+            goto out;
+        }
+        ret = gd_status_queue_pending_node(selected,
+                                           (void *)(&volinfo->snapd),
+                                           GD_NODE_SNAPD, 0);
+        if (ret)
+            gf_msg(this->name, GF_LOG_ERROR, ENOMEM, GD_MSG_NO_MEMORY,
+                   "failed to allocate memory for pending node");
+    } else {
+        /* No specific target: queue every started brick owned by this node.
+         * brick_index counts all bricks of the volume, including skipped
+         * ones, so it is the brick's position in volinfo->bricks. */
+        cds_list_for_each_entry(brickinfo, &volinfo->bricks, brick_list)
+        {
+            brick_index++;
+            if (gf_uuid_compare(brickinfo->uuid, MY_UUID) ||
+                !glusterd_is_brick_started(brickinfo)) {
+                continue;
+            }
+            ret = gd_status_queue_pending_node(selected, brickinfo,
+                                               GD_NODE_BRICK, brick_index);
+            if (ret) {
+                gf_msg(this->name, GF_LOG_ERROR, ENOMEM, GD_MSG_NO_MEMORY,
+                       "Unable to allocate memory");
+                goto out;
+            }
+        }
+    }
+out:
+    return ret;
+}
+
+/* Select the scrubber service as the target of a scrub brick-op.
+ *
+ * @dict:      request dict; "volname" is read from it
+ * @op_errstr: set to an allocated message when the volume does not exist
+ * @selected:  list that receives a single GD_NODE_SCRUB pending node
+ *
+ * When the scrubber is not online nothing is queued and 0 is returned.
+ * Returns 0 on success, non-zero on failure (missing "volname", unknown
+ * volume, allocation failure).
+ */
+static int
+glusterd_bricks_select_scrub(dict_t *dict, char **op_errstr,
+                             struct cds_list_head *selected)
+{
+    int ret = -1;
+    char *volname = NULL;
+    char msg[2048] = {
+        0,
+    };
+    xlator_t *this = NULL;
+    glusterd_conf_t *priv = NULL;
+    glusterd_volinfo_t *volinfo = NULL;
+    glusterd_pending_node_t *pending_node = NULL;
+
+    /* Check 'this' before reading this->private, not after. */
+    this = THIS;
+    GF_ASSERT(this);
+    priv = this->private;
+    GF_ASSERT(priv);
+
+    GF_ASSERT(dict);
+
+    ret = dict_get_strn(dict, "volname", SLEN("volname"), &volname);
+    if (ret) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_DICT_GET_FAILED,
+               "Unable to get volname");
+        goto out;
+    }
+
+    ret = glusterd_volinfo_find(volname, &volinfo);
+    if (ret) {
+        snprintf(msg, sizeof(msg), "Volume %s does not exist", volname);
+
+        *op_errstr = gf_strdup(msg);
+        gf_msg(this->name, GF_LOG_ERROR, EINVAL, GD_MSG_VOL_NOT_FOUND, "%s",
+               msg);
+        goto out;
+    }
+
+    /* An offline scrubber is not an error: there is simply no target. */
+    if (!priv->scrub_svc.online) {
+        ret = 0;
+        gf_msg_debug(this->name, 0, "Scrubber daemon is not running");
+        goto out;
+    }
+
+    pending_node = GF_CALLOC(1, sizeof(*pending_node),
+                             gf_gld_mt_pending_node_t);
+    if (!pending_node) {
+        ret = -1;
+        goto out;
+    }
+
+    pending_node->node = &(priv->scrub_svc);
+    pending_node->type = GD_NODE_SCRUB;
+    cds_list_add_tail(&pending_node->list, selected);
+    pending_node = NULL;
+out:
+    gf_msg_debug(this->name, 0, "Returning %d", ret);
+    return ret;
+}
+/* Build the list of bricks a barrier request must be sent to.
+ *
+ * Reads "volname" from @dict, looks the volume up, and queues on @selected
+ * one GD_NODE_BRICK pending node for every brick of that volume that belongs
+ * to this peer and is currently started.
+ *
+ * Returns 0 on success, non-zero when "volname" is missing, the volume is
+ * unknown, or an allocation fails.
+ */
+static int
+glusterd_bricks_select_barrier(dict_t *dict, struct cds_list_head *selected)
+{
+    glusterd_pending_node_t *entry = NULL;
+    glusterd_brickinfo_t *brick = NULL;
+    glusterd_volinfo_t *vol = NULL;
+    char *name = NULL;
+    int rc = -1;
+
+    GF_ASSERT(dict);
+
+    rc = dict_get_strn(dict, "volname", SLEN("volname"), &name);
+    if (rc != 0) {
+        gf_msg(THIS->name, GF_LOG_ERROR, 0, GD_MSG_DICT_GET_FAILED,
+               "Failed to get volname");
+        goto out;
+    }
+
+    rc = glusterd_volinfo_find(name, &vol);
+    if (rc != 0) {
+        gf_msg(THIS->name, GF_LOG_ERROR, 0, GD_MSG_VOL_NOT_FOUND,
+               "Failed to find volume %s", name);
+        goto out;
+    }
+
+    cds_list_for_each_entry(brick, &vol->bricks, brick_list)
+    {
+        /* Skip bricks owned by other peers. */
+        if (gf_uuid_compare(brick->uuid, MY_UUID))
+            continue;
+
+        /* Skip local bricks that are not running. */
+        if (!glusterd_is_brick_started(brick))
+            continue;
+
+        entry = GF_CALLOC(1, sizeof(*entry), gf_gld_mt_pending_node_t);
+        if (entry == NULL) {
+            rc = -1;
+            goto out;
+        }
+
+        entry->type = GD_NODE_BRICK;
+        entry->node = brick;
+        cds_list_add_tail(&entry->list, selected);
+        entry = NULL;
+    }
+
+out:
+    gf_msg_debug(THIS->name, 0, "Returning %d", rc);
+    return rc;
+}
+
+/* State-machine action: send the brick-op of the current transaction.
+ *
+ * @event: event being handled; its txn_id is used when ALL_ACK is injected
+ * @ctx:   request context to send; when NULL a fresh one is allocated and
+ *         filled from the current op, MY_UUID and a newly built payload
+ *
+ * The request is handed to the GLUSTERD_BRICK_OP procedure of the mgmt
+ * program. If no acknowledgement is pending afterwards, the pending brick
+ * list is cleared and GD_OP_EVENT_ALL_ACK is injected right away.
+ *
+ * Returns 0 on success and non-zero on failure. A context allocated here is
+ * freed on failure.
+ * NOTE(review): req_ctx->dict is not released on that failure path; confirm
+ * who owns it before adding a dict_unref.
+ */
+static int
+glusterd_op_ac_send_brick_op(glusterd_op_sm_event_t *event, void *ctx)
+{
+    int ret = 0;
+    rpc_clnt_procedure_t *proc = NULL;
+    glusterd_conf_t *priv = NULL;
+    xlator_t *this = NULL;
+    glusterd_op_t op = GD_OP_NONE;
+    glusterd_req_ctx_t *req_ctx = NULL;
+    char *op_errstr = NULL;
+    gf_boolean_t free_req_ctx = _gf_false;
+
+    this = THIS;
+    priv = this->private;
+
+    if (ctx) {
+        req_ctx = ctx;
+    } else {
+        req_ctx = GF_CALLOC(1, sizeof(*req_ctx), gf_gld_mt_op_allack_ctx_t);
+        if (!req_ctx) {
+            /* Report the failure; leaving ret at 0 would make the caller
+             * believe the brick-op had been sent. */
+            ret = -1;
+            gf_msg(this->name, GF_LOG_ERROR, ENOMEM, GD_MSG_NO_MEMORY,
+                   "Unable to allocate memory");
+            goto out;
+        }
+        free_req_ctx = _gf_true;
+        op = glusterd_op_get_op();
+        req_ctx->op = op;
+        gf_uuid_copy(req_ctx->uuid, MY_UUID);
+        ret = glusterd_op_build_payload(&req_ctx->dict, &op_errstr, NULL);
+        if (ret) {
+            gf_msg(this->name, GF_LOG_ERROR, 0,
+                   GD_MSG_BRICK_OP_PAYLOAD_BUILD_FAIL, LOGSTR_BUILD_PAYLOAD,
+                   gd_op_list[op]);
+            /* Fall back to the generic message when none was produced. */
+            if (op_errstr == NULL)
+                gf_asprintf(&op_errstr, OPERRSTR_BUILD_PAYLOAD);
+            opinfo.op_errstr = op_errstr;
+            goto out;
+        }
+    }
+
+    proc = &priv->gfs_mgmt->proctable[GLUSTERD_BRICK_OP];
+    if (proc->fn) {
+        ret = proc->fn(NULL, this, req_ctx);
+        if (ret)
+            goto out;
+    }
+
+    /* Nothing left to wait for: move the state machine on immediately. */
+    if (!opinfo.pending_count && !opinfo.brick_pending_count) {
+        glusterd_clear_pending_nodes(&opinfo.pending_bricks);
+        ret = glusterd_op_sm_inject_event(GD_OP_EVENT_ALL_ACK, &event->txn_id,
+                                          req_ctx);
+    }
+
+out:
+    /* Only a context created by this call is ours to free. */
+    if (ret && free_req_ctx)
+        GF_FREE(req_ctx);
+    gf_msg_debug(this->name, 0, "Returning with %d", ret);
+
+    return ret;
+}
+
+/* State-machine action: account for one accepted brick-op response.
+ *
+ * @event: event being handled; its txn_id identifies the transaction
+ * @ctx:   glusterd_op_brick_rsp_ctx_t describing the response; it is freed
+ *         (and its rsp_dict unreferenced) before returning
+ *
+ * The responding node is removed from opinfo.pending_bricks, the pending
+ * counter is decremented, the transaction opinfo is stored, and the response
+ * is passed to glusterd_handle_node_rsp(). Once no brick response is pending,
+ * GD_OP_EVENT_ALL_ACK is injected with the commit context.
+ *
+ * Returns 0 on success, non-zero otherwise.
+ */
+static int
+glusterd_op_ac_rcvd_brick_op_acc(glusterd_op_sm_event_t *event, void *ctx)
+{
+    xlator_t *this = THIS;
+    glusterd_op_brick_rsp_ctx_t *ev_ctx = NULL;
+    glusterd_req_ctx_t *req_ctx = NULL;
+    dict_t *txn_ctx = NULL;
+    void *node = NULL;
+    char *errstr = NULL;
+    gd_node_type node_type = GD_NODE_NONE;
+    glusterd_op_t cur_op = GD_OP_NONE;
+    int ret = -1;
+
+    GF_VALIDATE_OR_GOTO("glusterd", this, out);
+    GF_VALIDATE_OR_GOTO(this->name, event, out);
+    GF_VALIDATE_OR_GOTO(this->name, ctx, out);
+    ev_ctx = ctx;
+    GF_VALIDATE_OR_GOTO(this->name, ev_ctx, out);
+
+    req_ctx = ev_ctx->commit_ctx;
+    GF_VALIDATE_OR_GOTO(this->name, req_ctx, out);
+
+    cur_op = req_ctx->op;
+    txn_ctx = glusterd_op_get_ctx();
+    node = ev_ctx->pending_node->node;
+    node_type = ev_ctx->pending_node->type;
+
+    /* A response from a node we are not waiting on is rejected. */
+    if (glusterd_remove_pending_entry(&opinfo.pending_bricks, node)) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_UNKNOWN_RESPONSE,
+               "unknown response received ");
+        ret = -1;
+        goto out;
+    }
+
+    if (opinfo.brick_pending_count > 0)
+        opinfo.brick_pending_count--;
+
+    /* A failure to store the opinfo is logged but does not stop handling. */
+    ret = glusterd_set_txn_opinfo(&event->txn_id, &opinfo);
+    if (ret != 0) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_TRANS_OPINFO_SET_FAIL,
+               "Unable to set transaction's opinfo");
+    }
+
+    glusterd_handle_node_rsp(req_ctx->dict, node, cur_op, ev_ctx->rsp_dict,
+                             txn_ctx, &errstr, node_type);
+
+    /* Last outstanding response: tell the state machine everything is in. */
+    if (!(opinfo.brick_pending_count > 0)) {
+        ret = glusterd_op_sm_inject_event(GD_OP_EVENT_ALL_ACK, &event->txn_id,
+                                          ev_ctx->commit_ctx);
+    }
+
+out:
+    if (ev_ctx) {
+        if (ev_ctx->rsp_dict)
+            dict_unref(ev_ctx->rsp_dict);
+    }
+    GF_FREE(ev_ctx);
+    gf_msg_debug(this ? this->name : "glusterd", 0, "Returning %d", ret);
+    return ret;
+}
+
+/* Dispatch brick selection to the selector that matches @op.
+ *
+ * @op:        operation being run; must lie strictly between GD_OP_NONE and
+ *             GD_OP_MAX
+ * @dict:      request dict handed to the selector
+ * @op_errstr: error-string out parameter handed to the selector
+ * @selected:  list the selector fills with pending nodes
+ * @rsp_dict:  only passed on for GD_OP_HEAL_VOLUME
+ *
+ * Operations without a selector leave @selected untouched and yield 0.
+ * Returns whatever the chosen selector returns.
+ */
+int32_t
+glusterd_op_bricks_select(glusterd_op_t op, dict_t *dict, char **op_errstr,
+                          struct cds_list_head *selected, dict_t *rsp_dict)
+{
+    int rc = 0;
+
+    GF_ASSERT(dict);
+    GF_ASSERT(op_errstr);
+    GF_ASSERT(op > GD_OP_NONE);
+    GF_ASSERT(op < GD_OP_MAX);
+
+    switch (op) {
+        /* Volume life-cycle and layout operations. */
+        case GD_OP_STOP_VOLUME:
+            rc = glusterd_bricks_select_stop_volume(dict, op_errstr, selected);
+            break;
+        case GD_OP_REMOVE_BRICK:
+            rc = glusterd_bricks_select_remove_brick(dict, op_errstr, selected);
+            break;
+        case GD_OP_DEFRAG_BRICK_VOLUME:
+            rc = glusterd_bricks_select_rebalance_volume(dict, op_errstr,
+                                                         selected);
+            break;
+
+        /* Information and maintenance operations. */
+        case GD_OP_PROFILE_VOLUME:
+            rc = glusterd_bricks_select_profile_volume(dict, op_errstr,
+                                                       selected);
+            break;
+        case GD_OP_HEAL_VOLUME:
+            rc = glusterd_bricks_select_heal_volume(dict, op_errstr, selected,
+                                                    rsp_dict);
+            break;
+        case GD_OP_STATUS_VOLUME:
+            rc = glusterd_bricks_select_status_volume(dict, op_errstr,
+                                                      selected);
+            break;
+
+        /* Barrier, snapshot and scrubber operations. */
+        case GD_OP_BARRIER:
+            rc = glusterd_bricks_select_barrier(dict, selected);
+            break;
+        case GD_OP_SNAP:
+            rc = glusterd_bricks_select_snap(dict, op_errstr, selected);
+            break;
+        case GD_OP_SCRUB_STATUS:
+        case GD_OP_SCRUB_ONDEMAND:
+            rc = glusterd_bricks_select_scrub(dict, op_errstr, selected);
+            break;
+
+        default:
+            /* No brick-op for this operation. */
+            break;
+    }
+
+    gf_msg_debug(THIS->name, 0, "Returning %d", rc);
+
+    return rc;
+}
+
+/* Event-indexed action table for the GD_OP_STATE_DEFAULT state (going by the
+ * table name). Each row is {state, action}; the trailing comment names the
+ * event the row answers. NOTE(review): the state column looks like the state
+ * entered once the action has run -- confirm against glusterd_op_sm_t.
+ * Only START_LOCK, LOCK and UNLOCK run a real action here. */
+glusterd_op_sm_t glusterd_op_state_default[] = {
+    {GD_OP_STATE_DEFAULT, glusterd_op_ac_none},         // EVENT_NONE
+    {GD_OP_STATE_LOCK_SENT, glusterd_op_ac_send_lock},  // EVENT_START_LOCK
+    {GD_OP_STATE_LOCKED, glusterd_op_ac_lock},          // EVENT_LOCK
+    {GD_OP_STATE_DEFAULT, glusterd_op_ac_none},         // EVENT_RCVD_ACC
+    {GD_OP_STATE_DEFAULT, glusterd_op_ac_none},         // EVENT_ALL_ACC
+    {GD_OP_STATE_DEFAULT, glusterd_op_ac_none},         // EVENT_STAGE_ACC
+    {GD_OP_STATE_DEFAULT, glusterd_op_ac_none},         // EVENT_COMMIT_ACC
+    {GD_OP_STATE_DEFAULT, glusterd_op_ac_none},         // EVENT_RCVD_RJT
+    {GD_OP_STATE_DEFAULT, glusterd_op_ac_none},         // EVENT_STAGE_OP
+    {GD_OP_STATE_DEFAULT, glusterd_op_ac_none},         // EVENT_COMMIT_OP
+    {GD_OP_STATE_DEFAULT, glusterd_op_ac_unlock},       // EVENT_UNLOCK
+    {GD_OP_STATE_DEFAULT, glusterd_op_ac_none},         // EVENT_START_UNLOCK
+    {GD_OP_STATE_DEFAULT, glusterd_op_ac_none},         // EVENT_ALL_ACK
+    {GD_OP_STATE_DEFAULT, glusterd_op_ac_none},  // EVENT_LOCAL_UNLOCK_NO_RESP
+    {GD_OP_STATE_DEFAULT, glusterd_op_ac_none},  // EVENT_MAX
+};
+
+/* Event-indexed action table for the GD_OP_STATE_LOCK_SENT state (going by
+ * the table name). Each row is {state, action}; the trailing comment names
+ * the event the row answers. NOTE(review): the state column looks like the
+ * state entered once the action has run -- confirm against glusterd_op_sm_t.
+ * ALL_ACC leads to STAGE_OP_SENT via send_stage_op; RCVD_RJT and START_UNLOCK
+ * lead to ACK_DRAIN. */
+glusterd_op_sm_t glusterd_op_state_lock_sent[] = {
+    {GD_OP_STATE_LOCK_SENT, glusterd_op_ac_none},           // EVENT_NONE
+    {GD_OP_STATE_LOCK_SENT, glusterd_op_ac_none},           // EVENT_START_LOCK
+    {GD_OP_STATE_LOCK_SENT, glusterd_op_ac_lock},           // EVENT_LOCK
+    {GD_OP_STATE_LOCK_SENT, glusterd_op_ac_rcvd_lock_acc},  // EVENT_RCVD_ACC
+    {GD_OP_STATE_STAGE_OP_SENT, glusterd_op_ac_send_stage_op},  // EVENT_ALL_ACC
+    {GD_OP_STATE_LOCK_SENT, glusterd_op_ac_none},  // EVENT_STAGE_ACC
+    {GD_OP_STATE_LOCK_SENT, glusterd_op_ac_none},  // EVENT_COMMIT_ACC
+    {GD_OP_STATE_ACK_DRAIN,
+     glusterd_op_ac_send_unlock_drain},            // EVENT_RCVD_RJT
+    {GD_OP_STATE_LOCK_SENT, glusterd_op_ac_none},  // EVENT_STAGE_OP
+    {GD_OP_STATE_LOCK_SENT, glusterd_op_ac_none},  // EVENT_COMMIT_OP
+    {GD_OP_STATE_DEFAULT, glusterd_op_ac_unlock},  // EVENT_UNLOCK
+    {GD_OP_STATE_ACK_DRAIN, glusterd_op_ac_none},  // EVENT_START_UNLOCK
+    {GD_OP_STATE_LOCK_SENT, glusterd_op_ac_none},  // EVENT_ALL_ACK
+    {GD_OP_STATE_LOCK_SENT, glusterd_op_ac_none},  // EVENT_LOCAL_UNLOCK_NO_RESP
+    {GD_OP_STATE_LOCK_SENT, glusterd_op_ac_none},  // EVENT_MAX
+};
+
+/* Event-indexed action table for the GD_OP_STATE_LOCKED state (going by the
+ * table name). Each row is {state, action}; the trailing comment names the
+ * event the row answers. NOTE(review): the state column looks like the state
+ * entered once the action has run -- confirm against glusterd_op_sm_t.
+ * STAGE_OP leads to STAGED via stage_op; UNLOCK and LOCAL_UNLOCK_NO_RESP lead
+ * back to DEFAULT. */
+glusterd_op_sm_t glusterd_op_state_locked[] = {
+    {GD_OP_STATE_LOCKED, glusterd_op_ac_none},      // EVENT_NONE
+    {GD_OP_STATE_LOCKED, glusterd_op_ac_none},      // EVENT_START_LOCK
+    {GD_OP_STATE_LOCKED, glusterd_op_ac_lock},      // EVENT_LOCK
+    {GD_OP_STATE_LOCKED, glusterd_op_ac_none},      // EVENT_RCVD_ACC
+    {GD_OP_STATE_LOCKED, glusterd_op_ac_none},      // EVENT_ALL_ACC
+    {GD_OP_STATE_LOCKED, glusterd_op_ac_none},      // EVENT_STAGE_ACC
+    {GD_OP_STATE_LOCKED, glusterd_op_ac_none},      // EVENT_COMMIT_ACC
+    {GD_OP_STATE_LOCKED, glusterd_op_ac_none},      // EVENT_RCVD_RJT
+    {GD_OP_STATE_STAGED, glusterd_op_ac_stage_op},  // EVENT_STAGE_OP
+    {GD_OP_STATE_LOCKED, glusterd_op_ac_none},      // EVENT_COMMIT_OP
+    {GD_OP_STATE_DEFAULT, glusterd_op_ac_unlock},   // EVENT_UNLOCK
+    {GD_OP_STATE_LOCKED, glusterd_op_ac_none},      // EVENT_START_UNLOCK
+    {GD_OP_STATE_LOCKED, glusterd_op_ac_none},      // EVENT_ALL_ACK
+    {GD_OP_STATE_DEFAULT,
+     glusterd_op_ac_local_unlock},              // EVENT_LOCAL_UNLOCK_NO_RESP
+    {GD_OP_STATE_LOCKED, glusterd_op_ac_none},  // EVENT_MAX
+};
+
+/* Event-indexed action table for the GD_OP_STATE_STAGE_OP_SENT state (going
+ * by the table name). Each row is {state, action}; the trailing comment names
+ * the event the row answers. NOTE(review): the state column looks like the
+ * state entered once the action has run -- confirm against glusterd_op_sm_t.
+ * ALL_ACC and STAGE_ACC both lead to BRICK_OP_SENT via send_brick_op;
+ * RCVD_RJT leads to STAGE_OP_FAILED. */
+glusterd_op_sm_t glusterd_op_state_stage_op_sent[] = {
+    {GD_OP_STATE_STAGE_OP_SENT, glusterd_op_ac_none},  // EVENT_NONE
+    {GD_OP_STATE_STAGE_OP_SENT, glusterd_op_ac_none},  // EVENT_START_LOCK
+    {GD_OP_STATE_STAGE_OP_SENT, glusterd_op_ac_lock},  // EVENT_LOCK
+    {GD_OP_STATE_STAGE_OP_SENT,
+     glusterd_op_ac_rcvd_stage_op_acc},  // EVENT_RCVD_ACC
+    {GD_OP_STATE_BRICK_OP_SENT, glusterd_op_ac_send_brick_op},  // EVENT_ALL_ACC
+    {GD_OP_STATE_BRICK_OP_SENT,
+     glusterd_op_ac_send_brick_op},                    // EVENT_STAGE_ACC
+    {GD_OP_STATE_STAGE_OP_SENT, glusterd_op_ac_none},  // EVENT_COMMIT_ACC
+    {GD_OP_STATE_STAGE_OP_FAILED,
+     glusterd_op_ac_stage_op_failed},                  // EVENT_RCVD_RJT
+    {GD_OP_STATE_STAGE_OP_SENT, glusterd_op_ac_none},  // EVENT_STAGE_OP
+    {GD_OP_STATE_STAGE_OP_SENT, glusterd_op_ac_none},  // EVENT_COMMIT_OP
+    {GD_OP_STATE_DEFAULT, glusterd_op_ac_unlock},      // EVENT_UNLOCK
+    {GD_OP_STATE_ACK_DRAIN, glusterd_op_ac_none},      // EVENT_START_UNLOCK
+    {GD_OP_STATE_STAGE_OP_SENT, glusterd_op_ac_none},  // EVENT_ALL_ACK
+    {GD_OP_STATE_STAGE_OP_SENT,
+     glusterd_op_ac_none},  // EVENT_LOCAL_UNLOCK_NO_RESP
+    {GD_OP_STATE_STAGE_OP_SENT, glusterd_op_ac_none},  // EVENT_MAX
+};
+
+/* Event-indexed action table for the GD_OP_STATE_STAGE_OP_FAILED state (going
+ * by the table name). Each row is {state, action}; the trailing comment names
+ * the event the row answers. NOTE(review): the state column looks like the
+ * state entered once the action has run -- confirm against glusterd_op_sm_t.
+ * RCVD_ACC and RCVD_RJT both run stage_op_failed; ALL_ACK leads to
+ * UNLOCK_SENT via send_unlock. */
+glusterd_op_sm_t glusterd_op_state_stage_op_failed[] = {
+    {GD_OP_STATE_STAGE_OP_FAILED, glusterd_op_ac_none},  // EVENT_NONE
+    {GD_OP_STATE_STAGE_OP_FAILED, glusterd_op_ac_none},  // EVENT_START_LOCK
+    {GD_OP_STATE_STAGE_OP_FAILED, glusterd_op_ac_lock},  // EVENT_LOCK
+    {GD_OP_STATE_STAGE_OP_FAILED,
+     glusterd_op_ac_stage_op_failed},                    // EVENT_RCVD_ACC
+    {GD_OP_STATE_STAGE_OP_FAILED, glusterd_op_ac_none},  // EVENT_ALL_ACC
+    {GD_OP_STATE_STAGE_OP_FAILED, glusterd_op_ac_none},  // EVENT_STAGE_ACC
+    {GD_OP_STATE_STAGE_OP_FAILED, glusterd_op_ac_none},  // EVENT_COMMIT_ACC
+    {GD_OP_STATE_STAGE_OP_FAILED,
+     glusterd_op_ac_stage_op_failed},                    // EVENT_RCVD_RJT
+    {GD_OP_STATE_STAGE_OP_FAILED, glusterd_op_ac_none},  // EVENT_STAGE_OP
+    {GD_OP_STATE_STAGE_OP_FAILED, glusterd_op_ac_none},  // EVENT_COMMIT_OP
+    {GD_OP_STATE_DEFAULT, glusterd_op_ac_unlock},        // EVENT_UNLOCK
+    {GD_OP_STATE_ACK_DRAIN, glusterd_op_ac_none},        // EVENT_START_UNLOCK
+    {GD_OP_STATE_UNLOCK_SENT, glusterd_op_ac_send_unlock},  // EVENT_ALL_ACK
+    {GD_OP_STATE_STAGE_OP_FAILED,
+     glusterd_op_ac_none},  // EVENT_LOCAL_UNLOCK_NO_RESP
+    {GD_OP_STATE_STAGE_OP_FAILED, glusterd_op_ac_none},  // EVENT_MAX
+};
+
+/* Event-indexed action table for the GD_OP_STATE_STAGED state (going by the
+ * table name). Each row is {state, action}; the trailing comment names the
+ * event the row answers. NOTE(review): the state column looks like the state
+ * entered once the action has run -- confirm against glusterd_op_sm_t.
+ * COMMIT_OP leads to BRICK_COMMITTED via send_brick_op; UNLOCK and
+ * LOCAL_UNLOCK_NO_RESP lead back to DEFAULT. */
+glusterd_op_sm_t glusterd_op_state_staged[] = {
+    {GD_OP_STATE_STAGED, glusterd_op_ac_none},  // EVENT_NONE
+    {GD_OP_STATE_STAGED, glusterd_op_ac_none},  // EVENT_START_LOCK
+    {GD_OP_STATE_STAGED, glusterd_op_ac_lock},  // EVENT_LOCK
+    {GD_OP_STATE_STAGED, glusterd_op_ac_none},  // EVENT_RCVD_ACC
+    {GD_OP_STATE_STAGED, glusterd_op_ac_none},  // EVENT_ALL_ACC
+    {GD_OP_STATE_STAGED, glusterd_op_ac_none},  // EVENT_STAGE_ACC
+    {GD_OP_STATE_STAGED, glusterd_op_ac_none},  // EVENT_COMMIT_ACC
+    {GD_OP_STATE_STAGED, glusterd_op_ac_none},  // EVENT_RCVD_RJT
+    {GD_OP_STATE_STAGED, glusterd_op_ac_none},  // EVENT_STAGE_OP
+    {GD_OP_STATE_BRICK_COMMITTED,
+     glusterd_op_ac_send_brick_op},                // EVENT_COMMIT_OP
+    {GD_OP_STATE_DEFAULT, glusterd_op_ac_unlock},  // EVENT_UNLOCK
+    {GD_OP_STATE_STAGED, glusterd_op_ac_none},     // EVENT_START_UNLOCK
+    {GD_OP_STATE_STAGED, glusterd_op_ac_none},     // EVENT_ALL_ACK
+    {GD_OP_STATE_DEFAULT,
+     glusterd_op_ac_local_unlock},              // EVENT_LOCAL_UNLOCK_NO_RESP
+    {GD_OP_STATE_STAGED, glusterd_op_ac_none},  // EVENT_MAX
+};
+
+/* Event-indexed action table for the GD_OP_STATE_BRICK_OP_SENT state (going
+ * by the table name). Each row is {state, action}; the trailing comment names
+ * the event the row answers. NOTE(review): the state column looks like the
+ * state entered once the action has run -- confirm against glusterd_op_sm_t.
+ * RCVD_ACC runs rcvd_brick_op_acc; RCVD_RJT leads to BRICK_OP_FAILED; ALL_ACK
+ * leads to COMMIT_OP_SENT via send_commit_op. */
+glusterd_op_sm_t glusterd_op_state_brick_op_sent[] = {
+    {GD_OP_STATE_BRICK_OP_SENT, glusterd_op_ac_none},  // EVENT_NONE
+    {GD_OP_STATE_BRICK_OP_SENT, glusterd_op_ac_none},  // EVENT_START_LOCK
+    {GD_OP_STATE_BRICK_OP_SENT, glusterd_op_ac_lock},  // EVENT_LOCK
+    {GD_OP_STATE_BRICK_OP_SENT,
+     glusterd_op_ac_rcvd_brick_op_acc},                // EVENT_RCVD_ACC
+    {GD_OP_STATE_BRICK_OP_SENT, glusterd_op_ac_none},  // EVENT_ALL_ACC
+    {GD_OP_STATE_BRICK_OP_SENT, glusterd_op_ac_none},  // EVENT_STAGE_ACC
+    {GD_OP_STATE_BRICK_OP_SENT, glusterd_op_ac_none},  // EVENT_COMMIT_ACC
+    {GD_OP_STATE_BRICK_OP_FAILED,
+     glusterd_op_ac_brick_op_failed},                  // EVENT_RCVD_RJT
+    {GD_OP_STATE_BRICK_OP_SENT, glusterd_op_ac_none},  // EVENT_STAGE_OP
+    {GD_OP_STATE_BRICK_OP_SENT, glusterd_op_ac_none},  // EVENT_COMMIT_OP
+    {GD_OP_STATE_DEFAULT, glusterd_op_ac_unlock},      // EVENT_UNLOCK
+    {GD_OP_STATE_ACK_DRAIN, glusterd_op_ac_none},      // EVENT_START_UNLOCK
+    {GD_OP_STATE_COMMIT_OP_SENT,
+     glusterd_op_ac_send_commit_op},  // EVENT_ALL_ACK
+    {GD_OP_STATE_BRICK_OP_SENT,
+     glusterd_op_ac_none},  // EVENT_LOCAL_UNLOCK_NO_RESP
+    {GD_OP_STATE_BRICK_OP_SENT, glusterd_op_ac_none},  // EVENT_MAX
+};
+
+/* Event-indexed action table for the GD_OP_STATE_BRICK_OP_FAILED state (going
+ * by the table name). Each row is {state, action}; the trailing comment names
+ * the event the row answers. NOTE(review): the state column looks like the
+ * state entered once the action has run -- confirm against glusterd_op_sm_t.
+ * RCVD_ACC and RCVD_RJT both run brick_op_failed; ALL_ACK leads to
+ * UNLOCK_SENT via send_unlock. */
+glusterd_op_sm_t glusterd_op_state_brick_op_failed[] = {
+    {GD_OP_STATE_BRICK_OP_FAILED, glusterd_op_ac_none},  // EVENT_NONE
+    {GD_OP_STATE_BRICK_OP_FAILED, glusterd_op_ac_none},  // EVENT_START_LOCK
+    {GD_OP_STATE_BRICK_OP_FAILED, glusterd_op_ac_lock},  // EVENT_LOCK
+    {GD_OP_STATE_BRICK_OP_FAILED,
+     glusterd_op_ac_brick_op_failed},                    // EVENT_RCVD_ACC
+    {GD_OP_STATE_BRICK_OP_FAILED, glusterd_op_ac_none},  // EVENT_ALL_ACC
+    {GD_OP_STATE_BRICK_OP_FAILED, glusterd_op_ac_none},  // EVENT_STAGE_ACC
+    {GD_OP_STATE_BRICK_OP_FAILED, glusterd_op_ac_none},  // EVENT_COMMIT_ACC
+    {GD_OP_STATE_BRICK_OP_FAILED,
+     glusterd_op_ac_brick_op_failed},                    // EVENT_RCVD_RJT
+    {GD_OP_STATE_BRICK_OP_FAILED, glusterd_op_ac_none},  // EVENT_STAGE_OP
+    {GD_OP_STATE_BRICK_OP_FAILED, glusterd_op_ac_none},  // EVENT_COMMIT_OP
+    {GD_OP_STATE_DEFAULT, glusterd_op_ac_unlock},        // EVENT_UNLOCK
+    {GD_OP_STATE_ACK_DRAIN, glusterd_op_ac_none},        // EVENT_START_UNLOCK
+    {GD_OP_STATE_UNLOCK_SENT, glusterd_op_ac_send_unlock},  // EVENT_ALL_ACK
+    {GD_OP_STATE_BRICK_OP_FAILED,
+     glusterd_op_ac_none},  // EVENT_LOCAL_UNLOCK_NO_RESP
+    {GD_OP_STATE_BRICK_OP_FAILED, glusterd_op_ac_none},  // EVENT_MAX
+};
+
+/* Event-indexed action table for the GD_OP_STATE_BRICK_COMMITTED state (going
+ * by the table name). Each row is {state, action}; the trailing comment names
+ * the event the row answers. NOTE(review): the state column looks like the
+ * state entered once the action has run -- confirm against glusterd_op_sm_t.
+ * RCVD_ACC runs rcvd_brick_op_acc; RCVD_RJT leads to BRICK_COMMIT_FAILED;
+ * ALL_ACK leads to COMMITED via commit_op. */
+glusterd_op_sm_t glusterd_op_state_brick_committed[] = {
+    {GD_OP_STATE_BRICK_COMMITTED, glusterd_op_ac_none},  // EVENT_NONE
+    {GD_OP_STATE_BRICK_COMMITTED, glusterd_op_ac_none},  // EVENT_START_LOCK
+    {GD_OP_STATE_BRICK_COMMITTED, glusterd_op_ac_lock},  // EVENT_LOCK
+    {GD_OP_STATE_BRICK_COMMITTED,
+     glusterd_op_ac_rcvd_brick_op_acc},                  // EVENT_RCVD_ACC
+    {GD_OP_STATE_BRICK_COMMITTED, glusterd_op_ac_none},  // EVENT_ALL_ACC
+    {GD_OP_STATE_BRICK_COMMITTED, glusterd_op_ac_none},  // EVENT_STAGE_ACC
+    {GD_OP_STATE_BRICK_COMMITTED, glusterd_op_ac_none},  // EVENT_COMMIT_ACC
+    {GD_OP_STATE_BRICK_COMMIT_FAILED,
+     glusterd_op_ac_brick_op_failed},                    // EVENT_RCVD_RJT
+    {GD_OP_STATE_BRICK_COMMITTED, glusterd_op_ac_none},  // EVENT_STAGE_OP
+    {GD_OP_STATE_BRICK_COMMITTED, glusterd_op_ac_none},  // EVENT_COMMIT_OP
+    {GD_OP_STATE_DEFAULT, glusterd_op_ac_unlock},        // EVENT_UNLOCK
+    {GD_OP_STATE_BRICK_COMMITTED, glusterd_op_ac_none},  // EVENT_START_UNLOCK
+    {GD_OP_STATE_COMMITED, glusterd_op_ac_commit_op},    // EVENT_ALL_ACK
+    {GD_OP_STATE_DEFAULT,
+     glusterd_op_ac_local_unlock},  // EVENT_LOCAL_UNLOCK_NO_RESP
+    {GD_OP_STATE_BRICK_COMMITTED, glusterd_op_ac_none},  // EVENT_MAX
+};
+
+/* Event-indexed action table for the GD_OP_STATE_BRICK_COMMIT_FAILED state
+ * (going by the table name). Each row is {state, action}; the trailing
+ * comment names the event the row answers. NOTE(review): the state column
+ * looks like the state entered once the action has run -- confirm against
+ * glusterd_op_sm_t.
+ * RCVD_ACC and RCVD_RJT both run brick_op_failed; ALL_ACK runs
+ * send_commit_failed and keeps the same state. */
+glusterd_op_sm_t glusterd_op_state_brick_commit_failed[] = {
+    {GD_OP_STATE_BRICK_COMMIT_FAILED, glusterd_op_ac_none},  // EVENT_NONE
+    {GD_OP_STATE_BRICK_COMMIT_FAILED, glusterd_op_ac_none},  // EVENT_START_LOCK
+    {GD_OP_STATE_BRICK_COMMIT_FAILED, glusterd_op_ac_lock},  // EVENT_LOCK
+    {GD_OP_STATE_BRICK_COMMIT_FAILED,
+     glusterd_op_ac_brick_op_failed},                        // EVENT_RCVD_ACC
+    {GD_OP_STATE_BRICK_COMMIT_FAILED, glusterd_op_ac_none},  // EVENT_ALL_ACC
+    {GD_OP_STATE_BRICK_COMMIT_FAILED, glusterd_op_ac_none},  // EVENT_STAGE_ACC
+    {GD_OP_STATE_BRICK_COMMIT_FAILED, glusterd_op_ac_none},  // EVENT_COMMIT_ACC
+    {GD_OP_STATE_BRICK_COMMIT_FAILED,
+     glusterd_op_ac_brick_op_failed},                        // EVENT_RCVD_RJT
+    {GD_OP_STATE_BRICK_COMMIT_FAILED, glusterd_op_ac_none},  // EVENT_STAGE_OP
+    {GD_OP_STATE_BRICK_COMMIT_FAILED, glusterd_op_ac_none},  // EVENT_COMMIT_OP
+    {GD_OP_STATE_DEFAULT, glusterd_op_ac_unlock},            // EVENT_UNLOCK
+    {GD_OP_STATE_BRICK_COMMIT_FAILED,
+     glusterd_op_ac_none},  // EVENT_START_UNLOCK
+    {GD_OP_STATE_BRICK_COMMIT_FAILED,
+     glusterd_op_ac_send_commit_failed},  // EVENT_ALL_ACK
+    {GD_OP_STATE_DEFAULT,
+     glusterd_op_ac_local_unlock},  // EVENT_LOCAL_UNLOCK_NO_RESP
+    {GD_OP_STATE_BRICK_COMMIT_FAILED, glusterd_op_ac_none},  // EVENT_MAX
+};
+
+/* Event-indexed action table for the GD_OP_STATE_COMMIT_OP_FAILED state
+ * (going by the table name). Each row is {state, action}; the trailing
+ * comment names the event the row answers. NOTE(review): the state column
+ * looks like the state entered once the action has run -- confirm against
+ * glusterd_op_sm_t.
+ * RCVD_ACC and RCVD_RJT both run commit_op_failed; ALL_ACK leads to
+ * UNLOCK_SENT via send_unlock. */
+glusterd_op_sm_t glusterd_op_state_commit_op_failed[] = {
+    {GD_OP_STATE_COMMIT_OP_FAILED, glusterd_op_ac_none},  // EVENT_NONE
+    {GD_OP_STATE_COMMIT_OP_FAILED, glusterd_op_ac_none},  // EVENT_START_LOCK
+    {GD_OP_STATE_COMMIT_OP_FAILED, glusterd_op_ac_lock},  // EVENT_LOCK
+    {GD_OP_STATE_COMMIT_OP_FAILED,
+     glusterd_op_ac_commit_op_failed},                    // EVENT_RCVD_ACC
+    {GD_OP_STATE_COMMIT_OP_FAILED, glusterd_op_ac_none},  // EVENT_ALL_ACC
+    {GD_OP_STATE_COMMIT_OP_FAILED, glusterd_op_ac_none},  // EVENT_STAGE_ACC
+    {GD_OP_STATE_COMMIT_OP_FAILED, glusterd_op_ac_none},  // EVENT_COMMIT_ACC
+    {GD_OP_STATE_COMMIT_OP_FAILED,
+     glusterd_op_ac_commit_op_failed},                    // EVENT_RCVD_RJT
+    {GD_OP_STATE_COMMIT_OP_FAILED, glusterd_op_ac_none},  // EVENT_STAGE_OP
+    {GD_OP_STATE_COMMIT_OP_FAILED, glusterd_op_ac_none},  // EVENT_COMMIT_OP
+    {GD_OP_STATE_DEFAULT, glusterd_op_ac_unlock},         // EVENT_UNLOCK
+    {GD_OP_STATE_ACK_DRAIN, glusterd_op_ac_none},         // EVENT_START_UNLOCK
+    {GD_OP_STATE_UNLOCK_SENT, glusterd_op_ac_send_unlock},  // EVENT_ALL_ACK
+    {GD_OP_STATE_COMMIT_OP_FAILED,
+     glusterd_op_ac_none},  // EVENT_LOCAL_UNLOCK_NO_RESP
+    {GD_OP_STATE_COMMIT_OP_FAILED, glusterd_op_ac_none},  // EVENT_MAX
+};
+
+/* Event-indexed action table for the GD_OP_STATE_COMMIT_OP_SENT state (going
+ * by the table name). Each row is {state, action}; the trailing comment names
+ * the event the row answers. NOTE(review): the state column looks like the
+ * state entered once the action has run -- confirm against glusterd_op_sm_t.
+ * RCVD_ACC runs rcvd_commit_op_acc; ALL_ACC and COMMIT_ACC both lead to
+ * UNLOCK_SENT via send_unlock; RCVD_RJT leads to COMMIT_OP_FAILED. */
+glusterd_op_sm_t glusterd_op_state_commit_op_sent[] = {
+    {GD_OP_STATE_COMMIT_OP_SENT, glusterd_op_ac_none},  // EVENT_NONE
+    {GD_OP_STATE_COMMIT_OP_SENT, glusterd_op_ac_none},  // EVENT_START_LOCK
+    {GD_OP_STATE_COMMIT_OP_SENT, glusterd_op_ac_lock},  // EVENT_LOCK
+    {GD_OP_STATE_COMMIT_OP_SENT,
+     glusterd_op_ac_rcvd_commit_op_acc},                    // EVENT_RCVD_ACC
+    {GD_OP_STATE_UNLOCK_SENT, glusterd_op_ac_send_unlock},  // EVENT_ALL_ACC
+    {GD_OP_STATE_COMMIT_OP_SENT, glusterd_op_ac_none},      // EVENT_STAGE_ACC
+    {GD_OP_STATE_UNLOCK_SENT, glusterd_op_ac_send_unlock},  // EVENT_COMMIT_ACC
+    {GD_OP_STATE_COMMIT_OP_FAILED,
+     glusterd_op_ac_commit_op_failed},                  // EVENT_RCVD_RJT
+    {GD_OP_STATE_COMMIT_OP_SENT, glusterd_op_ac_none},  // EVENT_STAGE_OP
+    {GD_OP_STATE_COMMIT_OP_SENT, glusterd_op_ac_none},  // EVENT_COMMIT_OP
+    {GD_OP_STATE_DEFAULT, glusterd_op_ac_unlock},       // EVENT_UNLOCK
+    {GD_OP_STATE_ACK_DRAIN, glusterd_op_ac_none},       // EVENT_START_UNLOCK
+    {GD_OP_STATE_COMMIT_OP_SENT, glusterd_op_ac_none},  // EVENT_ALL_ACK
+    {GD_OP_STATE_COMMIT_OP_SENT,
+     glusterd_op_ac_none},  // EVENT_LOCAL_UNLOCK_NO_RESP
+    {GD_OP_STATE_COMMIT_OP_SENT, glusterd_op_ac_none},  // EVENT_MAX
+};
+
+/* Event-indexed action table for the GD_OP_STATE_COMMITED state (going by the
+ * table name). Each row is {state, action}; the trailing comment names the
+ * event the row answers. NOTE(review): the state column looks like the state
+ * entered once the action has run -- confirm against glusterd_op_sm_t.
+ * Only LOCK, UNLOCK and LOCAL_UNLOCK_NO_RESP run a real action here; the two
+ * unlock events lead back to DEFAULT. */
+glusterd_op_sm_t glusterd_op_state_committed[] = {
+    {GD_OP_STATE_COMMITED, glusterd_op_ac_none},   // EVENT_NONE
+    {GD_OP_STATE_COMMITED, glusterd_op_ac_none},   // EVENT_START_LOCK
+    {GD_OP_STATE_COMMITED, glusterd_op_ac_lock},   // EVENT_LOCK
+    {GD_OP_STATE_COMMITED, glusterd_op_ac_none},   // EVENT_RCVD_ACC
+    {GD_OP_STATE_COMMITED, glusterd_op_ac_none},   // EVENT_ALL_ACC
+    {GD_OP_STATE_COMMITED, glusterd_op_ac_none},   // EVENT_STAGE_ACC
+    {GD_OP_STATE_COMMITED, glusterd_op_ac_none},   // EVENT_COMMIT_ACC
+    {GD_OP_STATE_COMMITED, glusterd_op_ac_none},   // EVENT_RCVD_RJT
+    {GD_OP_STATE_COMMITED, glusterd_op_ac_none},   // EVENT_STAGE_OP
+    {GD_OP_STATE_COMMITED, glusterd_op_ac_none},   // EVENT_COMMIT_OP
+    {GD_OP_STATE_DEFAULT, glusterd_op_ac_unlock},  // EVENT_UNLOCK
+    {GD_OP_STATE_COMMITED, glusterd_op_ac_none},   // EVENT_START_UNLOCK
+    {GD_OP_STATE_COMMITED, glusterd_op_ac_none},   // EVENT_ALL_ACK
+    {GD_OP_STATE_DEFAULT,
+     glusterd_op_ac_local_unlock},                // EVENT_LOCAL_UNLOCK_NO_RESP
+    {GD_OP_STATE_COMMITED, glusterd_op_ac_none},  // EVENT_MAX
+};
+
+/* Transitions taken while in GD_OP_STATE_UNLOCK_SENT. Every slot is keyed by
+ * the event type it serves.
+ */
+glusterd_op_sm_t glusterd_op_state_unlock_sent[] = {
+    [GD_OP_EVENT_NONE] = {GD_OP_STATE_UNLOCK_SENT, glusterd_op_ac_none},
+    [GD_OP_EVENT_START_LOCK] = {GD_OP_STATE_UNLOCK_SENT, glusterd_op_ac_none},
+    [GD_OP_EVENT_LOCK] = {GD_OP_STATE_UNLOCK_SENT, glusterd_op_ac_lock},
+    [GD_OP_EVENT_RCVD_ACC] = {GD_OP_STATE_UNLOCK_SENT,
+                              glusterd_op_ac_rcvd_unlock_acc},
+    [GD_OP_EVENT_ALL_ACC] = {GD_OP_STATE_DEFAULT, glusterd_op_ac_unlocked_all},
+    [GD_OP_EVENT_STAGE_ACC] = {GD_OP_STATE_UNLOCK_SENT, glusterd_op_ac_none},
+    [GD_OP_EVENT_COMMIT_ACC] = {GD_OP_STATE_UNLOCK_SENT, glusterd_op_ac_none},
+    [GD_OP_EVENT_RCVD_RJT] = {GD_OP_STATE_UNLOCK_SENT,
+                              glusterd_op_ac_rcvd_unlock_acc},
+    [GD_OP_EVENT_STAGE_OP] = {GD_OP_STATE_UNLOCK_SENT, glusterd_op_ac_none},
+    [GD_OP_EVENT_COMMIT_OP] = {GD_OP_STATE_UNLOCK_SENT, glusterd_op_ac_none},
+    [GD_OP_EVENT_UNLOCK] = {GD_OP_STATE_DEFAULT, glusterd_op_ac_unlock},
+    [GD_OP_EVENT_START_UNLOCK] = {GD_OP_STATE_ACK_DRAIN, glusterd_op_ac_none},
+    [GD_OP_EVENT_ALL_ACK] = {GD_OP_STATE_UNLOCK_SENT, glusterd_op_ac_none},
+    [GD_OP_EVENT_LOCAL_UNLOCK_NO_RESP] = {GD_OP_STATE_UNLOCK_SENT,
+                                          glusterd_op_ac_none},
+    [GD_OP_EVENT_MAX] = {GD_OP_STATE_UNLOCK_SENT, glusterd_op_ac_none},
+};
+
+/* Transitions taken while in GD_OP_STATE_ACK_DRAIN. Every slot is keyed by
+ * the event type it serves.
+ */
+glusterd_op_sm_t glusterd_op_state_ack_drain[] = {
+    [GD_OP_EVENT_NONE] = {GD_OP_STATE_ACK_DRAIN, glusterd_op_ac_none},
+    [GD_OP_EVENT_START_LOCK] = {GD_OP_STATE_ACK_DRAIN, glusterd_op_ac_none},
+    [GD_OP_EVENT_LOCK] = {GD_OP_STATE_ACK_DRAIN, glusterd_op_ac_lock},
+    [GD_OP_EVENT_RCVD_ACC] = {GD_OP_STATE_ACK_DRAIN,
+                              glusterd_op_ac_send_unlock_drain},
+    [GD_OP_EVENT_ALL_ACC] = {GD_OP_STATE_ACK_DRAIN, glusterd_op_ac_none},
+    [GD_OP_EVENT_STAGE_ACC] = {GD_OP_STATE_ACK_DRAIN, glusterd_op_ac_none},
+    [GD_OP_EVENT_COMMIT_ACC] = {GD_OP_STATE_ACK_DRAIN, glusterd_op_ac_none},
+    [GD_OP_EVENT_RCVD_RJT] = {GD_OP_STATE_ACK_DRAIN,
+                              glusterd_op_ac_send_unlock_drain},
+    [GD_OP_EVENT_STAGE_OP] = {GD_OP_STATE_ACK_DRAIN, glusterd_op_ac_none},
+    [GD_OP_EVENT_COMMIT_OP] = {GD_OP_STATE_ACK_DRAIN, glusterd_op_ac_none},
+    [GD_OP_EVENT_UNLOCK] = {GD_OP_STATE_DEFAULT, glusterd_op_ac_unlock},
+    [GD_OP_EVENT_START_UNLOCK] = {GD_OP_STATE_ACK_DRAIN, glusterd_op_ac_none},
+    [GD_OP_EVENT_ALL_ACK] = {GD_OP_STATE_UNLOCK_SENT,
+                             glusterd_op_ac_send_unlock},
+    [GD_OP_EVENT_LOCAL_UNLOCK_NO_RESP] = {GD_OP_STATE_ACK_DRAIN,
+                                          glusterd_op_ac_none},
+    [GD_OP_EVENT_MAX] = {GD_OP_STATE_ACK_DRAIN, glusterd_op_ac_none},
+};
+
+/* Maps each op state to its per-event transition array. glusterd_op_sm()
+ * indexes this table with the current state and the result with the event
+ * type.
+ */
+glusterd_op_sm_t *glusterd_op_state_table[] = {
+    [GD_OP_STATE_DEFAULT] = glusterd_op_state_default,
+    [GD_OP_STATE_LOCK_SENT] = glusterd_op_state_lock_sent,
+    [GD_OP_STATE_LOCKED] = glusterd_op_state_locked,
+    [GD_OP_STATE_STAGE_OP_SENT] = glusterd_op_state_stage_op_sent,
+    [GD_OP_STATE_STAGED] = glusterd_op_state_staged,
+    [GD_OP_STATE_COMMIT_OP_SENT] = glusterd_op_state_commit_op_sent,
+    [GD_OP_STATE_COMMITED] = glusterd_op_state_committed,
+    [GD_OP_STATE_UNLOCK_SENT] = glusterd_op_state_unlock_sent,
+    [GD_OP_STATE_STAGE_OP_FAILED] = glusterd_op_state_stage_op_failed,
+    [GD_OP_STATE_COMMIT_OP_FAILED] = glusterd_op_state_commit_op_failed,
+    [GD_OP_STATE_BRICK_OP_SENT] = glusterd_op_state_brick_op_sent,
+    [GD_OP_STATE_BRICK_OP_FAILED] = glusterd_op_state_brick_op_failed,
+    [GD_OP_STATE_BRICK_COMMITTED] = glusterd_op_state_brick_committed,
+    [GD_OP_STATE_BRICK_COMMIT_FAILED] = glusterd_op_state_brick_commit_failed,
+    [GD_OP_STATE_ACK_DRAIN] = glusterd_op_state_ack_drain,
+};
+
+/* Allocate a zeroed op state machine event of type @event_type with an
+ * initialised (unlinked) list node and hand it back through @new_event.
+ * Returns 0 on success, -1 if the allocation fails (in which case
+ * @new_event is left untouched).
+ */
+int
+glusterd_op_sm_new_event(glusterd_op_sm_event_type_t event_type,
+                         glusterd_op_sm_event_t **new_event)
+{
+    glusterd_op_sm_event_t *fresh = NULL;
+
+    GF_ASSERT(new_event);
+    GF_ASSERT(GD_OP_EVENT_NONE <= event_type && GD_OP_EVENT_MAX > event_type);
+
+    fresh = GF_CALLOC(1, sizeof(*fresh), gf_gld_mt_op_sm_event_t);
+    if (fresh == NULL)
+        return -1;
+
+    fresh->event = event_type;
+    CDS_INIT_LIST_HEAD(&fresh->list);
+    *new_event = fresh;
+
+    return 0;
+}
+
+/* Create an event of type @event_type carrying @ctx and append it to the
+ * tail of gd_op_sm_queue. When @txn_id is non-NULL the transaction id is
+ * copied into the event.
+ * Returns 0 on success, or the failure code of glusterd_op_sm_new_event().
+ */
+int
+glusterd_op_sm_inject_event(glusterd_op_sm_event_type_t event_type,
+                            uuid_t *txn_id, void *ctx)
+{
+    int32_t ret = -1;
+    glusterd_op_sm_event_t *queued = NULL;
+
+    GF_ASSERT(event_type < GD_OP_EVENT_MAX && event_type >= GD_OP_EVENT_NONE);
+
+    ret = glusterd_op_sm_new_event(event_type, &queued);
+    if (ret != 0)
+        return ret;
+
+    queued->ctx = ctx;
+    if (txn_id != NULL)
+        gf_uuid_copy(queued->txn_id, *txn_id);
+
+    gf_msg_debug(THIS->name, 0, "Enqueue event: '%s'",
+                 glusterd_op_sm_event_name_get(queued->event));
+    cds_list_add_tail(&queued->list, &gd_op_sm_queue);
+
+    return ret;
+}
+
+/* Release a request context: drop its reference on the dict (if any) and
+ * free the context itself. A NULL @ctx is ignored.
+ */
+void
+glusterd_destroy_req_ctx(glusterd_req_ctx_t *ctx)
+{
+    if (ctx != NULL) {
+        if (ctx->dict != NULL)
+            dict_unref(ctx->dict);
+        GF_FREE(ctx);
+    }
+}
+
+/* Free the uuid carried as context by a LOCAL_UNLOCK_NO_RESP event.
+ * A NULL @ctx is ignored.
+ */
+void
+glusterd_destroy_local_unlock_ctx(uuid_t *ctx)
+{
+    if (ctx != NULL) {
+        GF_FREE(ctx);
+    }
+}
+
+/* Destroy the context attached to @event using the destructor that matches
+ * the event type. Event types not listed here have their ctx left alone.
+ * A NULL @event is ignored; the event itself is not freed.
+ */
+void
+glusterd_destroy_op_event_ctx(glusterd_op_sm_event_t *event)
+{
+    glusterd_op_sm_event_type_t type = GD_OP_EVENT_NONE;
+
+    if (!event)
+        return;
+
+    type = event->event;
+
+    if (type == GD_OP_EVENT_LOCK || type == GD_OP_EVENT_UNLOCK) {
+        glusterd_destroy_lock_ctx(event->ctx);
+    } else if (type == GD_OP_EVENT_STAGE_OP || type == GD_OP_EVENT_ALL_ACK) {
+        glusterd_destroy_req_ctx(event->ctx);
+    } else if (type == GD_OP_EVENT_LOCAL_UNLOCK_NO_RESP) {
+        glusterd_destroy_local_unlock_ctx(event->ctx);
+    }
+}
+
+/* Drain the op state machine event queue.
+ *
+ * gd_op_sm_lock is taken with a trylock; when that fails the function logs
+ * and returns the trylock result without touching the queue. Otherwise each
+ * queued event is removed from gd_op_sm_queue and processed:
+ *   1. the transaction's opinfo is fetched and copied into the file-level
+ *      'opinfo',
+ *   2. the handler registered for (current state, event type) is invoked,
+ *   3. the state is advanced with glusterd_op_sm_transition_state(),
+ *   4. the transaction opinfo is cleared (UNLOCK leading to DEFAULT) or
+ *      written back.
+ * An event whose opinfo lookup or handler fails is dropped and processing
+ * moves on to the next event. A failed state transition stops the run.
+ *
+ * Returns 0 once the queue is empty, otherwise the non-zero result of the
+ * failed trylock or state transition.
+ */
+int
+glusterd_op_sm()
+{
+    glusterd_op_sm_event_t *event = NULL;
+    glusterd_op_sm_event_t *tmp = NULL;
+    int ret = -1;
+    int lock_err = 0;
+    glusterd_op_sm_ac_fn handler = NULL;
+    glusterd_op_sm_t *state = NULL;
+    glusterd_op_sm_event_type_t event_type = GD_OP_EVENT_NONE;
+    xlator_t *this = NULL;
+    glusterd_op_info_t txn_op_info;
+    glusterd_conf_t *priv = NULL;
+
+    this = THIS;
+    GF_ASSERT(this);
+    priv = this->private;
+    GF_ASSERT(priv);
+
+    ret = synclock_trylock(&gd_op_sm_lock);
+    if (ret) {
+        /* Save errno once and log from the saved copy, so the reported
+         * code and the strerror() text always agree.
+         */
+        lock_err = errno;
+        gf_msg(this->name, GF_LOG_ERROR, lock_err, GD_MSG_LOCK_FAIL,
+               "lock failed due to %s", strerror(lock_err));
+        goto lock_failed;
+    }
+
+    /* The outer loop re-checks the queue after each full pass.
+     * NOTE(review): presumably because handlers can inject new events
+     * while the pass is running - confirm.
+     */
+    while (!cds_list_empty(&gd_op_sm_queue)) {
+        cds_list_for_each_entry_safe(event, tmp, &gd_op_sm_queue, list)
+        {
+            cds_list_del_init(&event->list);
+            event_type = event->event;
+            gf_msg_debug(this->name, 0,
+                         "Dequeued event of "
+                         "type: '%s'",
+                         glusterd_op_sm_event_name_get(event_type));
+
+            gf_msg_debug(this->name, 0, "transaction ID = %s",
+                         uuid_utoa(event->txn_id));
+
+            ret = glusterd_get_txn_opinfo(&event->txn_id, &txn_op_info);
+            if (ret) {
+                gf_msg_callingfn(this->name, GF_LOG_ERROR, 0,
+                                 GD_MSG_TRANS_OPINFO_GET_FAIL,
+                                 "Unable to get transaction "
+                                 "opinfo for transaction ID :"
+                                 "%s",
+                                 uuid_utoa(event->txn_id));
+                glusterd_destroy_op_event_ctx(event);
+                GF_FREE(event);
+                continue;
+            } else
+                opinfo = txn_op_info;
+
+            state = glusterd_op_state_table[opinfo.state.state];
+
+            GF_ASSERT(state);
+
+            handler = state[event_type].handler;
+            GF_ASSERT(handler);
+
+            ret = handler(event, event->ctx);
+
+            if (ret) {
+                gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_HANDLER_RETURNED,
+                       "handler returned: %d", ret);
+                glusterd_destroy_op_event_ctx(event);
+                GF_FREE(event);
+                continue;
+            }
+
+            ret = glusterd_op_sm_transition_state(&opinfo, state, event_type);
+
+            if (ret) {
+                gf_msg(this->name, GF_LOG_ERROR, 0,
+                       GD_MSG_EVENT_STATE_TRANSITION_FAIL,
+                       "Unable to transition "
+                       "state from '%s' to '%s'",
+                       glusterd_op_sm_state_name_get(opinfo.state.state),
+                       glusterd_op_sm_state_name_get(
+                           state[event_type].next_state));
+                /* The event is already off the queue, so nobody else can
+                 * reach it: release it here or it is leaked.
+                 */
+                glusterd_destroy_op_event_ctx(event);
+                GF_FREE(event);
+                (void)synclock_unlock(&gd_op_sm_lock);
+                return ret;
+            }
+
+            if ((state[event_type].next_state == GD_OP_STATE_DEFAULT) &&
+                (event_type == GD_OP_EVENT_UNLOCK)) {
+                /* Clearing the transaction opinfo */
+                ret = glusterd_clear_txn_opinfo(&event->txn_id);
+                if (ret)
+                    gf_msg(this->name, GF_LOG_ERROR, 0,
+                           GD_MSG_TRANS_OPINFO_CLEAR_FAIL,
+                           "Unable to clear "
+                           "transaction's opinfo");
+            } else {
+                /* Write the opinfo back, except for op-version >= 6.0
+                 * when a STAGE_OP event is handled in STAGED state with
+                 * skip_locking set.
+                 */
+                if ((priv->op_version < GD_OP_VERSION_6_0) ||
+                    !(event_type == GD_OP_EVENT_STAGE_OP &&
+                      opinfo.state.state == GD_OP_STATE_STAGED &&
+                      opinfo.skip_locking)) {
+                    ret = glusterd_set_txn_opinfo(&event->txn_id, &opinfo);
+                    if (ret)
+                        gf_msg(this->name, GF_LOG_ERROR, 0,
+                               GD_MSG_TRANS_OPINFO_SET_FAIL,
+                               "Unable to set "
+                               "transaction's opinfo");
+                }
+            }
+
+            glusterd_destroy_op_event_ctx(event);
+            GF_FREE(event);
+        }
+    }
+
+    (void)synclock_unlock(&gd_op_sm_lock);
+    ret = 0;
+
+lock_failed:
+
+    return ret;
+}
+
+/* Record @op as the operation of the file-level opinfo. @op is expected to
+ * lie strictly between GD_OP_NONE and GD_OP_MAX. Always returns 0.
+ */
+int32_t
+glusterd_op_set_op(glusterd_op_t op)
+{
+    const int32_t status = 0;
+
+    GF_ASSERT(op < GD_OP_MAX);
+    GF_ASSERT(op > GD_OP_NONE);
+
+    opinfo.op = op;
+    return status;
+}
+
+/* Return the operation currently recorded in the file-level opinfo. */
+int32_t
+glusterd_op_get_op()
+{
+    int32_t current_op = opinfo.op;
+
+    return current_op;
+}
+
+/* Record @req as the RPC request of the file-level opinfo.
+ * Always returns 0.
+ */
+int32_t
+glusterd_op_set_req(rpcsvc_request_t *req)
+{
+    const int32_t status = 0;
+
+    GF_ASSERT(req);
+
+    opinfo.req = req;
+
+    return status;
+}
+
+/* Reset the operation of the file-level opinfo to GD_OP_NONE.
+ * The @op argument is not consulted. Always returns 0.
+ */
+int32_t
+glusterd_op_clear_op(glusterd_op_t op)
+{
+    (void)op;
+
+    opinfo.op = GD_OP_NONE;
+    return 0;
+}
+
+/* Release the context of operation @op and reset the op context.
+ *
+ * For the operations listed below a non-NULL @ctx is treated as a dict and
+ * unreferenced; any other operation with a non-NULL @ctx trips an assertion.
+ * glusterd_op_reset_ctx() is called in every case. Always returns 0.
+ */
+int32_t
+glusterd_op_free_ctx(glusterd_op_t op, void *ctx)
+{
+    if (ctx == NULL)
+        goto reset;
+
+    switch (op) {
+        case GD_OP_CREATE_VOLUME:
+        case GD_OP_DELETE_VOLUME:
+        case GD_OP_STOP_VOLUME:
+        case GD_OP_ADD_BRICK:
+        case GD_OP_REMOVE_BRICK:
+        case GD_OP_REPLACE_BRICK:
+        case GD_OP_LOG_ROTATE:
+        case GD_OP_SYNC_VOLUME:
+        case GD_OP_SET_VOLUME:
+        case GD_OP_START_VOLUME:
+        case GD_OP_RESET_VOLUME:
+        case GD_OP_GSYNC_SET:
+        case GD_OP_QUOTA:
+        case GD_OP_PROFILE_VOLUME:
+        case GD_OP_STATUS_VOLUME:
+        case GD_OP_REBALANCE:
+        case GD_OP_HEAL_VOLUME:
+        case GD_OP_STATEDUMP_VOLUME:
+        case GD_OP_CLEARLOCKS_VOLUME:
+        case GD_OP_DEFRAG_BRICK_VOLUME:
+        case GD_OP_MAX_OPVERSION:
+            dict_unref(ctx);
+            break;
+        default:
+            GF_ASSERT(0);
+            break;
+    }
+
+reset:
+    glusterd_op_reset_ctx();
+    return 0;
+}
+
+/* Return the op context pointer stored in the file-level opinfo. */
+void *
+glusterd_op_get_ctx()
+{
+    void *current_ctx = opinfo.op_ctx;
+
+    return current_ctx;
+}
+
+/* Prepare the op state machine for use: initialise the lock that
+ * serialises glusterd_op_sm() and the (empty) event queue.
+ * Always returns 0.
+ */
+int
+glusterd_op_sm_init()
+{
+    synclock_init(&gd_op_sm_lock, SYNC_LOCK_DEFAULT);
+    CDS_INIT_LIST_HEAD(&gd_op_sm_queue);
+
+    return 0;
+}
diff --git a/xlators/mgmt/glusterd/src/glusterd-op-sm.h b/xlators/mgmt/glusterd/src/glusterd-op-sm.h
new file mode 100644
index 00000000000..8a24b16612a
--- /dev/null
+++ b/xlators/mgmt/glusterd/src/glusterd-op-sm.h
@@ -0,0 +1,313 @@
+/*
+   Copyright (c) 2006-2012 Red Hat, Inc. <http://www.redhat.com>
+   This file is part of GlusterFS.
+
+   This file is licensed to you under your choice of the GNU Lesser
+   General Public License, version 3 or any later version (LGPLv3 or
+   later), or the GNU General Public License, version 2 (GPLv2), in all
+   cases as published by the Free Software Foundation.
+*/
+#ifndef _GLUSTERD_OP_SM_H_
+#define _GLUSTERD_OP_SM_H_
+
+#include <pthread.h>
+#include <glusterfs/compat-uuid.h>
+
+#include <glusterfs/glusterfs.h>
+#include <glusterfs/xlator.h>
+#include <glusterfs/logging.h>
+#include <glusterfs/call-stub.h>
+#include <glusterfs/byte-order.h>
+#include "glusterd.h"
+#include "protocol-common.h"
+#include "glusterd-hooks.h"
+
+#define GD_OP_PROTECTED (0x02)
+#define GD_OP_UNPROTECTED (0x04)
+
+/* States of the glusterd op state machine.
+ *
+ * glusterd_op_sm() uses the numeric value directly as an index into
+ * glusterd_op_state_table[], so the enumerator order has to match the order
+ * of that table. GD_OP_STATE_MAX has no table entry of its own.
+ */
+typedef enum glusterd_op_sm_state_ {
+    GD_OP_STATE_DEFAULT = 0,
+    GD_OP_STATE_LOCK_SENT,
+    GD_OP_STATE_LOCKED,
+    GD_OP_STATE_STAGE_OP_SENT,
+    GD_OP_STATE_STAGED,
+    GD_OP_STATE_COMMIT_OP_SENT,
+    GD_OP_STATE_COMMITED,
+    GD_OP_STATE_UNLOCK_SENT,
+    GD_OP_STATE_STAGE_OP_FAILED,
+    GD_OP_STATE_COMMIT_OP_FAILED,
+    GD_OP_STATE_BRICK_OP_SENT,
+    GD_OP_STATE_BRICK_OP_FAILED,
+    GD_OP_STATE_BRICK_COMMITTED,
+    GD_OP_STATE_BRICK_COMMIT_FAILED,
+    GD_OP_STATE_ACK_DRAIN,
+    GD_OP_STATE_MAX,
+} glusterd_op_sm_state_t;
+
+/* Event types understood by the op state machine.
+ *
+ * glusterd_op_sm() uses the numeric value as an index into the per-state
+ * transition arrays, so the enumerator order has to match the row order of
+ * those arrays. The arrays carry a row for GD_OP_EVENT_MAX as well, but
+ * glusterd_op_sm_new_event() asserts that a new event's type is below it.
+ */
+typedef enum glusterd_op_sm_event_type_ {
+    GD_OP_EVENT_NONE = 0,
+    GD_OP_EVENT_START_LOCK,
+    GD_OP_EVENT_LOCK,
+    GD_OP_EVENT_RCVD_ACC,
+    GD_OP_EVENT_ALL_ACC,
+    GD_OP_EVENT_STAGE_ACC,
+    GD_OP_EVENT_COMMIT_ACC,
+    GD_OP_EVENT_RCVD_RJT,
+    GD_OP_EVENT_STAGE_OP,
+    GD_OP_EVENT_COMMIT_OP,
+    GD_OP_EVENT_UNLOCK,
+    GD_OP_EVENT_START_UNLOCK,
+    GD_OP_EVENT_ALL_ACK,
+    GD_OP_EVENT_LOCAL_UNLOCK_NO_RESP,
+    GD_OP_EVENT_MAX
+} glusterd_op_sm_event_type_t;
+
+/* One queued op state machine event. */
+struct glusterd_op_sm_event_ {
+    /* Links the event into gd_op_sm_queue */
+    struct cds_list_head list;
+    /* Opaque payload, passed to the handler as its second argument; which
+     * destructor applies depends on the event type
+     * (see glusterd_destroy_op_event_ctx()) */
+    void *ctx;
+    /* Event type; selects the row of the current state's transition array */
+    glusterd_op_sm_event_type_t event;
+    /* Transaction the event belongs to; used to look up its opinfo */
+    uuid_t txn_id;
+};
+
+typedef struct glusterd_op_sm_event_ glusterd_op_sm_event_t;
+
+/* Action handler of a transition. glusterd_op_sm() calls it with the event
+ * and the event's ctx; a non-zero return makes it drop the event without
+ * changing state.
+ */
+typedef int (*glusterd_op_sm_ac_fn)(glusterd_op_sm_event_t *, void *);
+
+/* One cell of a transition array: what to run and where to go next. */
+typedef struct glusterd_op_sm_ {
+    /* State to move to after the handler succeeds */
+    glusterd_op_sm_state_t next_state;
+    /* Action to run for this (state, event) pair */
+    glusterd_op_sm_ac_fn handler;
+} glusterd_op_sm_t;
+
+/* Current state of a transaction's state machine. */
+typedef struct glusterd_op_sm_state_info_ {
+    /* Index into glusterd_op_state_table[] */
+    glusterd_op_sm_state_t state;
+    /* NOTE(review): presumably the time of the last transition - confirm */
+    struct timeval time;
+} glusterd_op_sm_state_info_t;
+
+/* Per-transaction operation state. glusterd_op_sm() loads it for each event
+ * with glusterd_get_txn_opinfo() and writes it back with
+ * glusterd_set_txn_opinfo().
+ */
+struct glusterd_op_info_ {
+    /* Current state machine state of the transaction */
+    glusterd_op_sm_state_info_t state;
+    int32_t pending_count;
+    int32_t brick_pending_count;
+    int32_t op_count;
+    /* op is an enum, glusterd_op_t or glusterd_op_sm_state_info_t */
+    int op;
+    struct cds_list_head op_peers;
+    /* Operation context, returned by glusterd_op_get_ctx() */
+    void *op_ctx;
+    /* Request stored by glusterd_op_set_req() */
+    rpcsvc_request_t *req;
+    int32_t op_ret;
+    int32_t op_errno;
+    char *op_errstr;
+    struct cds_list_head pending_bricks;
+    uint32_t txn_generation;
+    /* When set (together with op-version >= 6.0), glusterd_op_sm() does not
+     * write the opinfo back after a STAGE_OP event in STAGED state */
+    gf_boolean_t skip_locking;
+};
+
+typedef struct glusterd_op_info_ glusterd_op_info_t;
+
+/* Fixed-size buffers naming a volume, a brick and a path.
+ * NOTE(review): presumably the context of a log-filename operation, as the
+ * name suggests - confirm at the use sites.
+ */
+struct glusterd_op_log_filename_ctx_ {
+    char volume_name[GD_VOLUME_NAME_MAX];
+    char brick[GD_VOLUME_NAME_MAX];
+    char path[PATH_MAX];
+};
+typedef struct glusterd_op_log_filename_ctx_ glusterd_op_log_filename_ctx_t;
+
+/* Groups a uuid, a dict and an RPC request.
+ * NOTE(review): presumably the ctx of LOCK/UNLOCK events, which
+ * glusterd_destroy_op_event_ctx() releases via glusterd_destroy_lock_ctx()
+ * - confirm.
+ */
+struct glusterd_op_lock_ctx_ {
+    uuid_t uuid;
+    dict_t *dict;
+    rpcsvc_request_t *req;
+};
+
+typedef struct glusterd_op_lock_ctx_ glusterd_op_lock_ctx_t;
+
+/* Request context released by glusterd_destroy_req_ctx(); used as the ctx
+ * of STAGE_OP and ALL_ACK events (see glusterd_destroy_op_event_ctx()).
+ */
+struct glusterd_req_ctx_ {
+    rpcsvc_request_t *req;
+    u_char uuid[16];
+    int op;
+    /* Unreferenced when the context is destroyed */
+    dict_t *dict;
+};
+
+typedef struct glusterd_req_ctx_ glusterd_req_ctx_t;
+
+/* Groups a result code, error string and response dict with the commit
+ * context and pending node they relate to.
+ * NOTE(review): presumably the response context of a brick op, as the name
+ * suggests - confirm at the use sites.
+ */
+typedef struct glusterd_op_brick_rsp_ctx_ {
+    int op_ret;
+    char *op_errstr;
+    dict_t *rsp_dict;
+    glusterd_req_ctx_t *commit_ctx;
+    glusterd_pending_node_t *pending_node;
+} glusterd_op_brick_rsp_ctx_t;
+
+/* Pairs a counter with a dict.
+ * NOTE(review): role not visible in this header - confirm at the use sites.
+ */
+typedef struct glusterd_pr_brick_rsp_conv_t {
+    int count;
+    dict_t *dict;
+} glusterd_pr_brick_rsp_conv_t;
+
+/* Bundles a dict with a volinfo and an xlator.
+ * NOTE(review): role not visible in this header - confirm at the use sites.
+ */
+typedef struct glusterd_heal_rsp_conv_ {
+    dict_t *dict;
+    glusterd_volinfo_t *volinfo;
+    xlator_t *this;
+} glusterd_heal_rsp_conv_t;
+
+/* Bundles three counters with a dict.
+ * NOTE(review): role not visible in this header - confirm at the use sites.
+ */
+typedef struct glusterd_status_rsp_conv_ {
+    int count;
+    int brick_index_max;
+    int other_count;
+    dict_t *dict;
+} glusterd_status_rsp_conv_t;
+
+/* Wrapper object holding one glusterd_op_info_t by value. */
+typedef struct glusterd_txn_opinfo_object_ {
+    glusterd_op_info_t opinfo;
+} glusterd_txn_opinfo_obj;
+
+/* NOTE(review): presumably selects whether a heal command targets one heal
+ * xlator or all of them - confirm at the use sites.
+ */
+typedef enum cli_cmd_type_ {
+    PER_HEAL_XL,
+    ALL_HEAL_XL,
+} cli_cmd_type;
+
+/* An option name paired with a default value string. */
+typedef struct glusterd_all_volume_options {
+    char *option;
+    char *dflt_val;
+} glusterd_all_vol_opts;
+
+int
+glusterd_op_commit_hook(glusterd_op_t op, dict_t *op_ctx,
+                        glusterd_commit_hook_type_t type);
+
+/* Allocate an event of @event_type and return it through @new_event.
+ * Returns 0 on success, -1 on allocation failure. */
+int
+glusterd_op_sm_new_event(glusterd_op_sm_event_type_t event_type,
+                         glusterd_op_sm_event_t **new_event);
+/* Create an event carrying @ctx (and @txn_id, when non-NULL) and append it
+ * to the op state machine queue. Returns 0 on success. */
+int
+glusterd_op_sm_inject_event(glusterd_op_sm_event_type_t event_type,
+                            uuid_t *txn_id, void *ctx);
+
+/* Initialise the op state machine's event queue and lock. Returns 0. */
+int
+glusterd_op_sm_init();
+
+/* Process every queued op state machine event. Returns 0 once the queue is
+ * drained, non-zero if the lock could not be taken or a state transition
+ * failed. */
+int
+glusterd_op_sm();
+
+int32_t
+glusterd_op_set_ctx(void *ctx);
+
+int32_t
+glusterd_op_set_op(glusterd_op_t op);
+
+int
+glusterd_op_build_payload(dict_t **req, char **op_errstr, dict_t *op_ctx);
+
+int32_t
+glusterd_op_stage_validate(glusterd_op_t op, dict_t *req, char **op_errstr,
+                           dict_t *rsp_dict);
+
+int32_t
+glusterd_op_commit_perform(glusterd_op_t op, dict_t *req, char **op_errstr,
+                           dict_t *dict);
+
+int32_t
+glusterd_op_txn_begin(rpcsvc_request_t *req, glusterd_op_t op, void *ctx,
+                      char *err_str, size_t err_len);
+
+int32_t
+glusterd_op_txn_complete();
+
+void *
+glusterd_op_get_ctx();
+
+int32_t
+glusterd_op_set_req(rpcsvc_request_t *req);
+
+int32_t
+glusterd_op_send_cli_response(glusterd_op_t op, int32_t op_ret,
+                              int32_t op_errno, rpcsvc_request_t *req,
+                              void *ctx, char *op_errstr);
+int32_t
+glusterd_op_get_op();
+
+int32_t
+glusterd_op_clear_op();
+
+int32_t
+glusterd_op_free_ctx(glusterd_op_t op, void *ctx);
+
+int
+glusterd_check_option_exists(char *optstring, char **completion);
+
+int
+set_xlator_option(dict_t *dict, char *key, char *value);
+
+char *
+glusterd_op_sm_state_name_get(int state);
+
+char *
+glusterd_op_sm_event_name_get(int event);
+int32_t
+glusterd_op_bricks_select(glusterd_op_t op, dict_t *dict, char **op_errstr,
+                          struct cds_list_head *selected, dict_t *rsp_dict);
+int
+glusterd_brick_op_build_payload(glusterd_op_t op,
+                                glusterd_brickinfo_t *brickinfo,
+                                gd1_mgmt_brick_op_req **req, dict_t *dict);
+int
+glusterd_node_op_build_payload(glusterd_op_t op, gd1_mgmt_brick_op_req **req,
+                               dict_t *dict);
+int32_t
+glusterd_handle_brick_rsp(void *pending_entry, glusterd_op_t op,
+                          dict_t *rsp_dict, dict_t *ctx_dict, char **op_errstr,
+                          gd_node_type type);
+
+dict_t *
+glusterd_op_init_commit_rsp_dict(glusterd_op_t op);
+
+void
+glusterd_op_modify_op_ctx(glusterd_op_t op, void *op_ctx);
+
+int
+glusterd_set_detach_bricks(dict_t *dict, glusterd_volinfo_t *volinfo);
+
+int32_t
+glusterd_volume_stats_read_perf(char *brick_path, int32_t blk_size,
+                                int32_t blk_count, double *throughput,
+                                double *time);
+int32_t
+glusterd_volume_stats_write_perf(char *brick_path, int32_t blk_size,
+                                 int32_t blk_count, double *throughput,
+                                 double *time);
+gf_boolean_t
+glusterd_is_volume_started(glusterd_volinfo_t *volinfo);
+
+int
+glusterd_start_bricks(glusterd_volinfo_t *volinfo);
+
+gf_boolean_t
+glusterd_are_all_volumes_stopped();
+int
+glusterd_stop_bricks(glusterd_volinfo_t *volinfo);
+int
+glusterd_defrag_volume_node_rsp(dict_t *req_dict, dict_t *rsp_dict,
+                                dict_t *op_ctx);
+
+int32_t
+glusterd_get_txn_opinfo(uuid_t *txn_id, glusterd_op_info_t *opinfo);
+
+int32_t
+glusterd_set_txn_opinfo(uuid_t *txn_id, glusterd_op_info_t *opinfo);
+
+int32_t
+glusterd_clear_txn_opinfo(uuid_t *txn_id);
+
+int32_t
+glusterd_generate_txn_id(dict_t *dict, uuid_t **txn_id);
+
+void
+glusterd_set_opinfo(char *errstr, int32_t op_errno, int32_t op_ret);
+
+int
+glusterd_dict_set_volid(dict_t *dict, char *volname, char **op_errstr);
+
+int
+glusterd_op_stats_volume(dict_t *dict, char **op_errstr, dict_t *rsp_dict);
+
+int
+glusterd_op_stage_stats_volume(dict_t *dict, char **op_errstr);
+
+int
+gd_set_commit_hash(dict_t *dict);
+#endif
diff --git a/xlators/mgmt/glusterd/src/glusterd-peer-utils.c b/xlators/mgmt/glusterd/src/glusterd-peer-utils.c
new file mode 100644
index 00000000000..18d355cb186
--- /dev/null
+++ b/xlators/mgmt/glusterd/src/glusterd-peer-utils.c
@@ -0,0 +1,1058 @@
+/*
+   Copyright (c) 2014 Red Hat, Inc. <http://www.redhat.com>
+   This file is part of GlusterFS.
+
+   This file is licensed to you under your choice of the GNU Lesser
+   General Public License, version 3 or any later version (LGPLv3 or
+   later), or the GNU General Public License, version 2 (GPLv2), in all
+   cases as published by the Free Software Foundation.
+*/
+
+#include "glusterd-peer-utils.h"
+#include "glusterd-store.h"
+#include "glusterd-server-quorum.h"
+#include "glusterd-messages.h"
+#include <glusterfs/common-utils.h>
+#include "glusterd-utils.h"
+
+/* RCU callback that tears down a peerinfo object.
+ *
+ * glusterd_peerinfo_cleanup() registers it through call_rcu(). @head is the
+ * rcu_head embedded in the peerinfo. The function removes the peer's stored
+ * information, frees its hostname(s) and state machine log, then releases
+ * and destroys delete_lock before freeing the object itself.
+ */
+void
+glusterd_peerinfo_destroy(struct rcu_head *head)
+{
+    int32_t ret = -1;
+    glusterd_peerinfo_t *peerinfo = NULL;
+    glusterd_peer_hostname_t *hostname = NULL;
+    glusterd_peer_hostname_t *tmp = NULL;
+
+    /* This works as rcu_head is the first member of gd_rcu_head */
+    peerinfo = caa_container_of((gd_rcu_head *)head, glusterd_peerinfo_t,
+                                rcu_head);
+
+    /* Set THIS to the saved this. Needed by some functions below */
+    THIS = peerinfo->rcu_head.this;
+
+    CDS_INIT_LIST_HEAD(&peerinfo->uuid_list);
+
+    /* A failure to delete the stored peer info is only logged; the
+     * in-memory teardown continues regardless */
+    ret = glusterd_store_delete_peerinfo(peerinfo);
+    if (ret) {
+        gf_msg("glusterd", GF_LOG_ERROR, errno, GD_MSG_PEERINFO_DELETE_FAIL,
+               "Deleting peer info failed");
+    }
+
+    GF_FREE(peerinfo->hostname);
+    peerinfo->hostname = NULL;
+
+    cds_list_for_each_entry_safe(hostname, tmp, &peerinfo->hostnames,
+                                 hostname_list)
+    {
+        glusterd_peer_hostname_free(hostname);
+    }
+
+    glusterd_sm_tr_log_delete(&peerinfo->sm_log);
+    /* delete_lock was taken (trylock) by glusterd_peerinfo_cleanup() and is
+     * only released here, just before it is destroyed */
+    pthread_mutex_unlock(&peerinfo->delete_lock);
+    pthread_mutex_destroy(&peerinfo->delete_lock);
+    GF_FREE(peerinfo);
+
+    peerinfo = NULL;
+
+    return;
+}
+
+/* Start the deletion of @peerinfo.
+ *
+ * Takes the peer's delete_lock with a trylock so that only one caller
+ * performs the deletion, drops the RPC client reference, unlinks the peer
+ * from the RCU-protected peer list and schedules
+ * glusterd_peerinfo_destroy() through call_rcu(). If the peer contributed
+ * to quorum, the quorum action is run afterwards.
+ *
+ * Always returns 0, including when another caller already holds delete_lock.
+ */
+int32_t
+glusterd_peerinfo_cleanup(glusterd_peerinfo_t *peerinfo)
+{
+    GF_ASSERT(peerinfo);
+    gf_boolean_t quorum_action = _gf_false;
+    glusterd_conf_t *priv = THIS->private;
+
+    /* The lock is deliberately not released in this function; it is
+     * unlocked and destroyed by glusterd_peerinfo_destroy() */
+    if (pthread_mutex_trylock(&peerinfo->delete_lock)) {
+        /* Someone else is already deleting the peer, so give up */
+        return 0;
+    }
+
+    /* Decide about the quorum action before the peer is unlinked */
+    if (peerinfo->quorum_contrib != QUORUM_NONE)
+        quorum_action = _gf_true;
+    if (peerinfo->rpc) {
+        peerinfo->rpc = glusterd_rpc_clnt_unref(priv, peerinfo->rpc);
+        peerinfo->rpc = NULL;
+    }
+
+    cds_list_del_rcu(&peerinfo->uuid_list);
+    /* Saving THIS, as it is needed by the callback function */
+    peerinfo->rcu_head.this = THIS;
+    call_rcu(&peerinfo->rcu_head.head, glusterd_peerinfo_destroy);
+
+    if (quorum_action)
+        /* coverity[SLEEP] */
+        glusterd_do_quorum_action();
+    return 0;
+}
+
+/* gd_peerinfo_find_from_hostname compares @hoststr, case-insensitively and
+ * over at most 1024 bytes, against every address saved for every peer.
+ * Returns the first peer owning a matching address, or NULL when nothing
+ * matches or an argument fails validation.
+ */
+static glusterd_peerinfo_t *
+gd_peerinfo_find_from_hostname(const char *hoststr)
+{
+    xlator_t *this = THIS;
+    glusterd_conf_t *priv = NULL;
+    glusterd_peerinfo_t *peer = NULL;
+    glusterd_peerinfo_t *match = NULL;
+    glusterd_peer_hostname_t *saved = NULL;
+
+    GF_ASSERT(this != NULL);
+    priv = this->private;
+    GF_VALIDATE_OR_GOTO(this->name, (priv != NULL), out);
+
+    GF_VALIDATE_OR_GOTO(this->name, (hoststr != NULL), out);
+
+    RCU_READ_LOCK;
+    cds_list_for_each_entry_rcu(peer, &priv->peers, uuid_list)
+    {
+        cds_list_for_each_entry_rcu(saved, &peer->hostnames, hostname_list)
+        {
+            if (strncasecmp(saved->hostname, hoststr, 1024) != 0)
+                continue;
+
+            gf_msg_debug(this->name, 0, "Friend %s found.. state: %d",
+                         saved->hostname, peer->state.state);
+            match = peer; /* Probably needs to be dereferenced */
+            break;
+        }
+        /* A hit in the inner loop ends the whole search */
+        if (match != NULL)
+            break;
+    }
+    RCU_READ_UNLOCK;
+out:
+    return match;
+}
+
+/* gd_peerinfo_find_from_addrinfo resolves every address saved for every
+ * peer and compares each resolved socket address with @addr.
+ *
+ * NOTE: getaddrinfo is a blocking call and is made once per saved address,
+ * so the calling thread may be blocked for a significant amount of time.
+ *
+ * Returns the first peer with a matching address, or NULL if there is none.
+ */
+static glusterd_peerinfo_t *
+gd_peerinfo_find_from_addrinfo(const struct addrinfo *addr)
+{
+    xlator_t *this = THIS;
+    glusterd_conf_t *conf = NULL;
+    glusterd_peerinfo_t *peer = NULL;
+    glusterd_peerinfo_t *match = NULL;
+    glusterd_peer_hostname_t *address = NULL;
+    int rc = 0;
+    struct addrinfo *resolved = NULL;
+    struct addrinfo *cursor = NULL;
+
+    GF_ASSERT(this != NULL);
+    conf = this->private;
+    GF_VALIDATE_OR_GOTO(this->name, (conf != NULL), out);
+
+    RCU_READ_LOCK;
+    cds_list_for_each_entry_rcu(peer, &conf->peers, uuid_list)
+    {
+        cds_list_for_each_entry_rcu(address, &peer->hostnames, hostname_list)
+        {
+            /* TODO: Cache the resolved addrinfos to improve
+             * performance
+             */
+            rc = getaddrinfo(address->hostname, NULL, NULL, &resolved);
+            if (rc != 0) {
+                /* A resolution failure is not fatal: move on to the
+                 * next saved address
+                 */
+                gf_msg_trace(this->name, 0, "getaddrinfo for %s failed (%s)",
+                             address->hostname, gai_strerror(rc));
+                continue;
+            }
+
+            cursor = resolved;
+            while (cursor != NULL && match == NULL) {
+                if (gf_compare_sockaddr(addr->ai_addr, cursor->ai_addr))
+                    match = peer; /* (de)referenced? */
+                cursor = cursor->ai_next;
+            }
+
+            freeaddrinfo(resolved);
+            if (match != NULL)
+                break;
+        }
+        /* A hit in the inner loop ends the whole search */
+        if (match != NULL)
+            break;
+    }
+    RCU_READ_UNLOCK;
+out:
+    return match;
+}
+
+/* glusterd_peerinfo_find_by_hostname searches for a peer which matches the
+ * hostname @hoststr and if found returns the pointer to peerinfo object.
+ * Returns NULL otherwise.
+ *
+ * It first attempts a quick search by string matching @hoststr. If that fails,
+ * it'll attempt a more thorough match by resolving the addresses and matching
+ * the resolved addrinfos.
+ */
+glusterd_peerinfo_t *
+glusterd_peerinfo_find_by_hostname(const char *hoststr)
+{
+    int ret = -1;
+    struct addrinfo *addr = NULL;
+    struct addrinfo *p = NULL;
+    xlator_t *this = THIS;
+    glusterd_peerinfo_t *peerinfo = NULL;
+
+    GF_ASSERT(hoststr);
+
+    peerinfo = gd_peerinfo_find_from_hostname(hoststr);
+    if (peerinfo)
+        return peerinfo;
+
+    ret = getaddrinfo(hoststr, NULL, NULL, &addr);
+    if (ret != 0) {
+        /* getaddrinfo() reports EAI_* codes, which are not errno values,
+         * so no errno is passed to the logger; gai_strerror() already
+         * supplies the description.
+         */
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_GETADDRINFO_FAIL,
+               "error in getaddrinfo: %s\n", gai_strerror(ret));
+        goto out;
+    }
+
+    /* Try every resolved address until one of them belongs to a peer */
+    for (p = addr; p != NULL; p = p->ai_next) {
+        peerinfo = gd_peerinfo_find_from_addrinfo(p);
+        if (peerinfo) {
+            freeaddrinfo(addr);
+            return peerinfo;
+        }
+    }
+
+out:
+    gf_msg_debug(this->name, 0, "Unable to find friend: %s", hoststr);
+    if (addr)
+        freeaddrinfo(addr);
+    return NULL;
+}
+
+/* Translate @hostname into a uuid, written to @uuid.
+ *
+ * A known peer yields that peer's uuid; otherwise, if @hostname is a local
+ * address, this node's own uuid is used.
+ * Returns 0 when @uuid was filled in, -1 when the hostname matched neither.
+ */
+int
+glusterd_hostname_to_uuid(char *hostname, uuid_t uuid)
+{
+    GF_ASSERT(hostname);
+    GF_ASSERT(uuid);
+
+    glusterd_peerinfo_t *peer = NULL;
+    glusterd_conf_t *priv = NULL;
+    int ret = -1;
+    xlator_t *this = NULL;
+
+    this = THIS;
+    GF_ASSERT(this);
+    priv = this->private;
+    GF_ASSERT(priv);
+
+    peer = glusterd_peerinfo_find_by_hostname(hostname);
+    if (peer != NULL) {
+        ret = 0;
+        gf_uuid_copy(uuid, peer->uuid);
+    } else if (gf_is_local_addr(hostname)) {
+        gf_uuid_copy(uuid, MY_UUID);
+        ret = 0;
+    } else {
+        ret = -1;
+    }
+
+    gf_msg_debug(this->name, 0, "returning %d", ret);
+    return ret;
+}
+
+/* glusterd_peerinfo_find_by_uuid walks the peer list looking for the peer
+ * whose uuid equals @uuid.
+ * Returns the peerinfo object when found; NULL when @uuid is the null uuid
+ * or no peer carries it.
+ */
+glusterd_peerinfo_t *
+glusterd_peerinfo_find_by_uuid(uuid_t uuid)
+{
+    glusterd_conf_t *priv = NULL;
+    glusterd_peerinfo_t *entry = NULL;
+    glusterd_peerinfo_t *match = NULL;
+    xlator_t *this = THIS;
+    glusterd_friend_sm_state_t state;
+
+    GF_ASSERT(this);
+
+    if (gf_uuid_is_null(uuid))
+        return NULL;
+
+    priv = this->private;
+
+    GF_ASSERT(priv);
+
+    RCU_READ_LOCK;
+    cds_list_for_each_entry_rcu(entry, &priv->peers, uuid_list)
+    {
+        if (gf_uuid_compare(entry->uuid, uuid) != 0)
+            continue;
+
+        match = entry; /* Probably should be rcu_dereferenced */
+        /* Snapshot the state inside the read-side section for the log */
+        state = match->state.state;
+        break;
+    }
+    RCU_READ_UNLOCK;
+
+    if (match == NULL)
+        gf_msg_debug(this->name, 0, "Friend with uuid: %s, not found",
+                     uuid_utoa(uuid));
+    else
+        gf_msg_debug(this->name, 0, "Friend found... state: %s",
+                     glusterd_friend_sm_state_name_get(state));
+    return match;
+}
+
+/* glusterd_peerinfo_find looks a peer up by @uuid first and, failing that,
+ * by @hostname. Either argument may be NULL, in which case that lookup is
+ * skipped.
+ * Returns the peerinfo object, or NULL if neither lookup succeeds.
+ */
+glusterd_peerinfo_t *
+glusterd_peerinfo_find(uuid_t uuid, const char *hostname)
+{
+    glusterd_peerinfo_t *match = NULL;
+    xlator_t *this = THIS;
+
+    GF_ASSERT(this);
+
+    if (uuid) {
+        match = glusterd_peerinfo_find_by_uuid(uuid);
+        if (match != NULL)
+            return match;
+
+        gf_msg_debug(this->name, 0, "Unable to find peer by uuid: %s",
+                     uuid_utoa(uuid));
+    }
+
+    if (hostname) {
+        match = glusterd_peerinfo_find_by_hostname(hostname);
+        if (match != NULL)
+            return match;
+
+        gf_msg_debug(this->name, 0, "Unable to find hostname: %s",
+                     hostname);
+    }
+
+    return NULL;
+}
+
+/* glusterd_peerinfo_new will create a new peerinfo object and set its
+ * members' values using the passed parameters.
+ * @hostname is added as the first entry in peerinfo->hostnames list and also
+ * set to peerinfo->hostname.
+ * It returns a pointer to peerinfo object if successful and returns NULL
+ * otherwise (allocation failure, failure to record @hostname, or failure to
+ * initialise the state machine log). The caller should take care of freeing
+ * the created peerinfo object.
+ */
+glusterd_peerinfo_t *
+glusterd_peerinfo_new(glusterd_friend_sm_state_t state, uuid_t *uuid,
+                      const char *hostname, int port)
+{
+    glusterd_peerinfo_t *new_peer = NULL;
+    int ret = -1;
+    xlator_t *this = NULL;
+    glusterd_conf_t *conf = NULL;
+
+    this = THIS;
+    GF_ASSERT(this);
+    conf = this->private;
+    GF_ASSERT(conf);
+
+    new_peer = GF_CALLOC(1, sizeof(*new_peer), gf_gld_mt_peerinfo_t);
+    if (!new_peer) {
+        gf_smsg(this->name, GF_LOG_ERROR, errno, GD_MSG_NO_MEMORY, NULL);
+        goto out;
+    }
+
+    /* Initialise delete_lock before anything below can fail: the error
+     * path goes through glusterd_peerinfo_cleanup(), which trylocks this
+     * mutex, and glusterd_peerinfo_destroy() later unlocks and destroys it.
+     */
+    pthread_mutex_init(&new_peer->delete_lock, NULL);
+
+    CDS_INIT_LIST_HEAD(&new_peer->uuid_list);
+
+    new_peer->state.state = state;
+
+    CDS_INIT_LIST_HEAD(&new_peer->hostnames);
+    if (hostname) {
+        ret = gd_add_address_to_peer(new_peer, hostname);
+        if (ret)
+            goto out;
+        /* Also set it to peerinfo->hostname. Doing this as we use
+         * peerinfo->hostname in a lot of places and is really hard to
+         * get everything right
+         */
+        new_peer->hostname = gf_strdup(hostname);
+        if (!new_peer->hostname) {
+            /* Do not hand out a peer whose hostname is silently NULL */
+            gf_smsg(this->name, GF_LOG_ERROR, errno, GD_MSG_NO_MEMORY, NULL);
+            ret = -1;
+            goto out;
+        }
+    }
+
+    if (uuid) {
+        gf_uuid_copy(new_peer->uuid, *uuid);
+    }
+
+    ret = glusterd_sm_tr_log_init(
+        &new_peer->sm_log, glusterd_friend_sm_state_name_get,
+        glusterd_friend_sm_event_name_get, GLUSTERD_TR_LOG_SIZE);
+    if (ret)
+        goto out;
+
+    if (new_peer->state.state == GD_FRIEND_STATE_BEFRIENDED)
+        new_peer->quorum_contrib = QUORUM_WAITING;
+    new_peer->port = port;
+
+    new_peer->generation = uatomic_add_return(&conf->generation, 1);
+out:
+    if (ret && new_peer) {
+        glusterd_peerinfo_cleanup(new_peer);
+        new_peer = NULL;
+    }
+    return new_peer;
+}
+
+/* Report whether every peer, other than the one identified by @skip_uuid
+ * (the peer being detached), is both connected and befriended.
+ * A null @skip_uuid means that no peer is skipped.
+ */
+gf_boolean_t
+glusterd_chk_peers_connected_befriended(uuid_t skip_uuid)
+{
+    glusterd_conf_t *priv = THIS->private;
+    glusterd_peerinfo_t *peer = NULL;
+    gf_boolean_t all_ok = _gf_true;
+
+    GF_ASSERT(priv);
+
+    RCU_READ_LOCK;
+    cds_list_for_each_entry_rcu(peer, &priv->peers, uuid_list)
+    {
+        /* The peer being detached does not take part in the check. */
+        if (!gf_uuid_is_null(skip_uuid) &&
+            (gf_uuid_compare(skip_uuid, peer->uuid) == 0))
+            continue;
+
+        if ((peer->state.state == GD_FRIEND_STATE_BEFRIENDED) &&
+            peer->connected)
+            continue;
+
+        /* First peer that is not befriended or not connected decides. */
+        all_ok = _gf_false;
+        break;
+    }
+    RCU_READ_UNLOCK;
+
+    gf_msg_debug(THIS->name, 0, "Returning %s", (all_ok ? "TRUE" : "FALSE"));
+    return all_ok;
+}
+
+/* Look up the hostname that belongs to @uuid.
+ * Returns a gf_strdup()'d string: "localhost" when @uuid is MY_UUID, or the
+ * hostname of the first peer with a matching uuid. Returns NULL when no peer
+ * matches.
+ */
+char *
+glusterd_uuid_to_hostname(uuid_t uuid)
+{
+    glusterd_conf_t *priv = THIS->private;
+    glusterd_peerinfo_t *peer = NULL;
+    char *result = NULL;
+
+    GF_ASSERT(priv);
+
+    if (gf_uuid_compare(MY_UUID, uuid) == 0)
+        return gf_strdup("localhost");
+
+    RCU_READ_LOCK;
+    if (!cds_list_empty(&priv->peers)) {
+        cds_list_for_each_entry_rcu(peer, &priv->peers, uuid_list)
+        {
+            if (gf_uuid_compare(peer->uuid, uuid) != 0)
+                continue;
+
+            /* Copy while still inside the RCU read-side section. */
+            result = gf_strdup(peer->hostname);
+            break;
+        }
+    }
+    RCU_READ_UNLOCK;
+
+    return result;
+}
+
+/* Return the string form of @peerinfo's uuid, formatting it into
+ * peerinfo->uuid_str the first time it is requested.
+ * Returns NULL when @peerinfo is NULL or its uuid is null.
+ */
+char *
+gd_peer_uuid_str(glusterd_peerinfo_t *peerinfo)
+{
+    if (peerinfo == NULL)
+        return NULL;
+
+    if (gf_uuid_is_null(peerinfo->uuid))
+        return NULL;
+
+    /* An empty buffer means the string has not been generated yet. */
+    if (*peerinfo->uuid_str == '\0')
+        uuid_utoa_r(peerinfo->uuid, peerinfo->uuid_str);
+
+    return peerinfo->uuid_str;
+}
+
+/* Check whether every peer in conf->peers is currently connected.
+ * Returns _gf_true only when no peer has its connected flag cleared.
+ * Returns _gf_false when a disconnected peer is found or when validation of
+ * the xlator or its private configuration fails.
+ */
+gf_boolean_t
+glusterd_are_all_peers_up(void)
+{
+    glusterd_peerinfo_t *peerinfo = NULL;
+    xlator_t *this = NULL;
+    glusterd_conf_t *conf = NULL;
+    gf_boolean_t peers_up = _gf_false;
+
+    this = THIS;
+    GF_VALIDATE_OR_GOTO("glusterd", this, out);
+
+    conf = this->private;
+    GF_VALIDATE_OR_GOTO(this->name, conf, out);
+
+    /* Assume all peers are up until a disconnected one is seen. Leaving the
+     * loop with break keeps a single RCU_READ_UNLOCK for the section.
+     */
+    peers_up = _gf_true;
+
+    RCU_READ_LOCK;
+    cds_list_for_each_entry_rcu(peerinfo, &conf->peers, uuid_list)
+    {
+        if (!peerinfo->connected) {
+            peers_up = _gf_false;
+            break;
+        }
+    }
+    RCU_READ_UNLOCK;
+
+out:
+    return peers_up;
+}
+
+/* Check that every peer owning a brick of @volinfo is connected and
+ * befriended. Bricks whose uuid is MY_UUID are not checked.
+ *
+ * @peers is the peer list that is searched for each brick's owner. For the
+ * first owner found that is disconnected or not befriended, a gf_strdup()'d
+ * copy of its hostname is stored in *@down_peerstr and _gf_false is returned.
+ * Otherwise _gf_true is returned and *@down_peerstr is left untouched.
+ */
+gf_boolean_t
+glusterd_are_vol_all_peers_up(glusterd_volinfo_t *volinfo,
+                              struct cds_list_head *peers, char **down_peerstr)
+{
+    glusterd_brickinfo_t *brick = NULL;
+    glusterd_peerinfo_t *peer = NULL;
+    gf_boolean_t all_up = _gf_true;
+
+    cds_list_for_each_entry(brick, &volinfo->bricks, brick_list)
+    {
+        if (gf_uuid_compare(brick->uuid, MY_UUID) == 0)
+            continue;
+
+        RCU_READ_LOCK;
+        cds_list_for_each_entry_rcu(peer, peers, uuid_list)
+        {
+            if (gf_uuid_compare(peer->uuid, brick->uuid) != 0)
+                continue;
+
+            /* Found a peer that owns the brick; it is fine only when it is
+             * both connected and a friend. */
+            if (peer->connected &&
+                (peer->state.state == GD_FRIEND_STATE_BEFRIENDED))
+                continue;
+
+            *down_peerstr = gf_strdup(peer->hostname);
+            all_up = _gf_false;
+            break;
+        }
+        RCU_READ_UNLOCK;
+
+        if (!all_up) {
+            gf_msg_debug(THIS->name, 0, "Peer %s is down. ", *down_peerstr);
+            break;
+        }
+    }
+
+    gf_msg_debug("glusterd", 0, "Returning %d", all_up);
+    return all_up;
+}
+
+/* Allocate a glusterd_peer_hostname_t that holds a copy of @hostname and
+ * hand it back through @name. The entry's list node is initialised but not
+ * linked into any list.
+ *
+ * Returns 0 on success. Returns -1 when either allocation fails; in that
+ * case *@name is not modified and nothing allocated here is left behind.
+ */
+int32_t
+glusterd_peer_hostname_new(const char *hostname,
+                           glusterd_peer_hostname_t **name)
+{
+    glusterd_peer_hostname_t *peer_hostname = NULL;
+    int32_t ret = -1;
+    xlator_t *this = THIS;
+
+    GF_ASSERT(hostname);
+    GF_ASSERT(name);
+    GF_ASSERT(this);
+
+    peer_hostname = GF_CALLOC(1, sizeof(*peer_hostname),
+                              gf_gld_mt_peer_hostname_t);
+
+    if (!peer_hostname) {
+        gf_smsg(this->name, GF_LOG_ERROR, errno, GD_MSG_NO_MEMORY, NULL);
+        goto out;
+    }
+
+    peer_hostname->hostname = gf_strdup(hostname);
+    if (!peer_hostname->hostname) {
+        /* An entry without a hostname would later be handed to strcmp()
+         * by gd_peer_has_address(); fail instead of publishing it.
+         */
+        gf_smsg(this->name, GF_LOG_ERROR, errno, GD_MSG_NO_MEMORY, NULL);
+        GF_FREE(peer_hostname);
+        goto out;
+    }
+    CDS_INIT_LIST_HEAD(&peer_hostname->hostname_list);
+
+    *name = peer_hostname;
+    ret = 0;
+
+out:
+    gf_msg_debug("glusterd", 0, "Returning %d", ret);
+    return ret;
+}
+
+/* Unlink @name from the hostname list it is on and release both the
+ * hostname string and the entry itself. A NULL @name is a no-op.
+ */
+void
+glusterd_peer_hostname_free(glusterd_peer_hostname_t *name)
+{
+    if (name != NULL) {
+        cds_list_del_init(&name->hostname_list);
+
+        GF_FREE(name->hostname);
+        name->hostname = NULL;
+
+        GF_FREE(name);
+    }
+}
+
+/* Tell whether @address is already present in @peerinfo's hostname list.
+ * The comparison is an exact strcmp() match. Returns _gf_false when either
+ * argument is NULL.
+ */
+gf_boolean_t
+gd_peer_has_address(glusterd_peerinfo_t *peerinfo, const char *address)
+{
+    glusterd_peer_hostname_t *entry = NULL;
+    gf_boolean_t present = _gf_false;
+
+    GF_VALIDATE_OR_GOTO("glusterd", (peerinfo != NULL), out);
+    GF_VALIDATE_OR_GOTO("glusterd", (address != NULL), out);
+
+    cds_list_for_each_entry(entry, &peerinfo->hostnames, hostname_list)
+    {
+        if (strcmp(entry->hostname, address) != 0)
+            continue;
+
+        present = _gf_true;
+        break;
+    }
+
+out:
+    return present;
+}
+
+/* Append @address to the tail of @peerinfo's hostname list unless it is
+ * already there.
+ * Returns 0 on success, including when the address was already present, and
+ * non-zero when an argument is NULL or the new entry cannot be created.
+ */
+int
+gd_add_address_to_peer(glusterd_peerinfo_t *peerinfo, const char *address)
+{
+    glusterd_peer_hostname_t *new_entry = NULL;
+    int ret = -1;
+
+    GF_VALIDATE_OR_GOTO("glusterd", (peerinfo != NULL), out);
+    GF_VALIDATE_OR_GOTO("glusterd", (address != NULL), out);
+
+    if (!gd_peer_has_address(peerinfo, address)) {
+        ret = glusterd_peer_hostname_new(address, &new_entry);
+        if (ret)
+            goto out;
+
+        cds_list_add_tail_rcu(&new_entry->hostname_list,
+                              &peerinfo->hostnames);
+    }
+
+    ret = 0;
+out:
+    return ret;
+}
+
+/* gd_add_friend_to_dict() adds details of @friend into @dict with the given
+ * @prefix. All the parameters are compulsory.
+ *
+ * Keys written: "<prefix>.uuid", "<prefix>.hostname" (the first address of
+ * the peer) and, only if the cluster op-version is >= GD_OP_VERSION_3_6_0,
+ * the complete address list as "<prefix>.hostname<N>" together with
+ * "<prefix>.address-count".
+ *
+ * Returns 0 on success and non-zero on failure.
+ */
+int
+gd_add_friend_to_dict(glusterd_peerinfo_t *friend, dict_t *dict,
+                      const char *prefix)
+{
+    int ret = -1;
+    xlator_t *this = NULL;
+    glusterd_conf_t *conf = NULL;
+    char key[100] = {
+        0,
+    };
+    glusterd_peer_hostname_t *address = NULL;
+    char *first_hostname = NULL;
+    int count = 0;
+
+    this = THIS;
+    GF_VALIDATE_OR_GOTO("glusterd", (this != NULL), out);
+
+    conf = this->private;
+    GF_VALIDATE_OR_GOTO(this->name, (conf != NULL), out);
+
+    GF_VALIDATE_OR_GOTO(this->name, (friend != NULL), out);
+    GF_VALIDATE_OR_GOTO(this->name, (dict != NULL), out);
+    GF_VALIDATE_OR_GOTO(this->name, (prefix != NULL), out);
+
+    snprintf(key, sizeof(key), "%s.uuid", prefix);
+    ret = dict_set_dynstr_with_alloc(dict, key, uuid_utoa(friend->uuid));
+    if (ret) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_DICT_SET_FAILED,
+               "Failed to set key %s in dict", key);
+        goto out;
+    }
+
+    /* Setting the first hostname from the list with this key for backward
+     * compatibility.
+     *
+     * The first entry is the node after the list head. Applying
+     * cds_list_entry() to the head itself would treat the peerinfo as a
+     * glusterd_peer_hostname_t. When the list is empty, fall back to
+     * friend->hostname.
+     */
+    snprintf(key, sizeof(key), "%s.hostname", prefix);
+    if (!cds_list_empty(&friend->hostnames)) {
+        address = cds_list_entry(friend->hostnames.next,
+                                 glusterd_peer_hostname_t, hostname_list);
+        first_hostname = address->hostname;
+    } else {
+        first_hostname = friend->hostname;
+    }
+    ret = dict_set_dynstr_with_alloc(dict, key, first_hostname);
+    if (ret) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_DICT_SET_FAILED,
+               "Failed to set key %s in dict", key);
+        goto out;
+    }
+
+    /* Older clusters only understand the single hostname key. */
+    if (conf->op_version < GD_OP_VERSION_3_6_0) {
+        ret = 0;
+        goto out;
+    }
+
+    address = NULL;
+    count = 0;
+    cds_list_for_each_entry(address, &friend->hostnames, hostname_list)
+    {
+        snprintf(key, sizeof(key), "%s.hostname%d", prefix, count);
+        ret = dict_set_dynstr_with_alloc(dict, key, address->hostname);
+        if (ret) {
+            gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_DICT_SET_FAILED,
+                   "Failed to set key %s in dict", key);
+            goto out;
+        }
+        count++;
+    }
+    /* snprintf()'s return value is the key length dict_set_int32n() needs. */
+    ret = snprintf(key, sizeof(key), "%s.address-count", prefix);
+    ret = dict_set_int32n(dict, key, ret, count);
+    if (ret)
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_DICT_SET_FAILED,
+               "Failed to set key %s in dict", key);
+
+out:
+    gf_msg_debug(this ? this->name : "glusterd", 0, "Returning %d", ret);
+    return ret;
+}
+
+/* gd_update_peerinfo_from_dict will update the hostnames for @peerinfo from
+ * peer details with @prefix in @dict.
+ *
+ * "<prefix>.hostname" is added to the peer's address list and also becomes
+ * peerinfo->hostname. If the cluster op-version is >= GD_OP_VERSION_3_6_0,
+ * "<prefix>.address-count" addresses named "<prefix>.hostname<N>" are added
+ * as well.
+ *
+ * Returns 0 on success and a non-zero value on failure. If copying the
+ * hostname fails, the previous peerinfo->hostname is left in place.
+ */
+int
+gd_update_peerinfo_from_dict(glusterd_peerinfo_t *peerinfo, dict_t *dict,
+                             const char *prefix)
+{
+    int ret = -1;
+    xlator_t *this = NULL;
+    glusterd_conf_t *conf = NULL;
+    char key[100] = {
+        0,
+    };
+    char *hostname = NULL;
+    char *hostname_copy = NULL;
+    int count = 0;
+    int i = 0;
+
+    this = THIS;
+    GF_ASSERT(this != NULL);
+
+    conf = this->private;
+    GF_VALIDATE_OR_GOTO(this->name, (conf != NULL), out);
+
+    GF_VALIDATE_OR_GOTO(this->name, (peerinfo != NULL), out);
+    GF_VALIDATE_OR_GOTO(this->name, (dict != NULL), out);
+    GF_VALIDATE_OR_GOTO(this->name, (prefix != NULL), out);
+
+    ret = snprintf(key, sizeof(key), "%s.hostname", prefix);
+    ret = dict_get_strn(dict, key, ret, &hostname);
+    if (ret) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_DICT_GET_FAILED,
+               "Key %s not present in "
+               "dictionary",
+               key);
+        goto out;
+    }
+    ret = gd_add_address_to_peer(peerinfo, hostname);
+    if (ret) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_ADD_ADDRESS_TO_PEER_FAIL,
+               "Could not add address to peer");
+        goto out;
+    }
+    /* Also set peerinfo->hostname to the first address. Duplicate before
+     * releasing the old value so that an allocation failure neither loses
+     * the existing hostname nor gets reported as success.
+     */
+    hostname_copy = gf_strdup(hostname);
+    if (hostname_copy == NULL) {
+        gf_smsg(this->name, GF_LOG_ERROR, errno, GD_MSG_NO_MEMORY, NULL);
+        ret = -1;
+        goto out;
+    }
+    if (peerinfo->hostname != NULL)
+        GF_FREE(peerinfo->hostname);
+    peerinfo->hostname = hostname_copy;
+
+    if (conf->op_version < GD_OP_VERSION_3_6_0) {
+        ret = 0;
+        goto out;
+    }
+
+    ret = snprintf(key, sizeof(key), "%s.address-count", prefix);
+    ret = dict_get_int32n(dict, key, ret, &count);
+    if (ret) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_DICT_GET_FAILED,
+               "Key %s not present in "
+               "dictionary",
+               key);
+        goto out;
+    }
+    hostname = NULL;
+    for (i = 0; i < count; i++) {
+        ret = snprintf(key, sizeof(key), "%s.hostname%d", prefix, i);
+        ret = dict_get_strn(dict, key, ret, &hostname);
+        if (ret) {
+            gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_DICT_GET_FAILED,
+                   "Key %s not present "
+                   "in dictionary",
+                   key);
+            goto out;
+        }
+        /* Addresses already known to the peer are skipped by the callee. */
+        ret = gd_add_address_to_peer(peerinfo, hostname);
+        if (ret) {
+            gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_ADD_ADDRESS_TO_PEER_FAIL,
+                   "Could not add address to peer");
+            goto out;
+        }
+
+        hostname = NULL;
+    }
+
+out:
+    gf_msg_debug(this->name, 0, "Returning %d", ret);
+    return ret;
+}
+
+/* gd_peerinfo_from_dict creates a peerinfo object from details of peer with
+ * @prefix in @dict.
+ *
+ * The uuid is read from "<prefix>.uuid" and must parse as a valid uuid; the
+ * hostnames are filled in by gd_update_peerinfo_from_dict().
+ *
+ * Returns a pointer to the created peerinfo object on success, and NULL on
+ * failure. On failure the partially built object is cleaned up here.
+ */
+glusterd_peerinfo_t *
+gd_peerinfo_from_dict(dict_t *dict, const char *prefix)
+{
+    int ret = -1;
+    xlator_t *this = NULL;
+    glusterd_conf_t *conf = NULL;
+    glusterd_peerinfo_t *new_peer = NULL;
+    char key[64] = {
+        0,
+    };
+    char *uuid_str = NULL;
+
+    this = THIS;
+    GF_VALIDATE_OR_GOTO("glusterd", (this != NULL), out);
+
+    conf = this->private;
+    GF_VALIDATE_OR_GOTO(this->name, (conf != NULL), out);
+
+    GF_VALIDATE_OR_GOTO(this->name, (dict != NULL), out);
+    GF_VALIDATE_OR_GOTO(this->name, (prefix != NULL), out);
+
+    new_peer = glusterd_peerinfo_new(GD_FRIEND_STATE_DEFAULT, NULL, NULL, 0);
+    if (new_peer == NULL) {
+        ret = -1;
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_PEERINFO_CREATE_FAIL,
+               "Could not create peerinfo "
+               "object");
+        goto out;
+    }
+
+    ret = snprintf(key, sizeof(key), "%s.uuid", prefix);
+    ret = dict_get_strn(dict, key, ret, &uuid_str);
+    if (ret) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_DICT_GET_FAILED,
+               "Key %s not present in "
+               "dictionary",
+               key);
+        goto out;
+    }
+    /* Reject a malformed uuid; otherwise the peer would keep the zeroed
+     * uuid it was allocated with and still be reported as created.
+     */
+    if (gf_uuid_parse(uuid_str, new_peer->uuid) != 0) {
+        ret = -1;
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_PEERINFO_CREATE_FAIL,
+               "Invalid uuid %s for key %s", uuid_str, key);
+        goto out;
+    }
+
+    ret = gd_update_peerinfo_from_dict(new_peer, dict, prefix);
+
+out:
+    if ((ret != 0) && (new_peer != NULL)) {
+        glusterd_peerinfo_cleanup(new_peer);
+        new_peer = NULL;
+    }
+
+    return new_peer;
+}
+
+/* Store every address of @peerinfo in @dict as "<prefix>.hostname<N>" and
+ * the number of addresses as "<prefix>.hostname_count".
+ * Writes nothing and returns 0 when the cluster op-version is below
+ * GD_OP_VERSION_3_6_0. Returns 0 on success and non-zero on failure.
+ */
+static int
+gd_add_peer_hostnames_to_dict(glusterd_peerinfo_t *peerinfo, dict_t *dict,
+                              const char *prefix)
+{
+    xlator_t *this = THIS;
+    glusterd_conf_t *conf = NULL;
+    glusterd_peer_hostname_t *entry = NULL;
+    char key[64] = {
+        0,
+    };
+    int keylen = 0;
+    int nr_addrs = 0;
+    int ret = -1;
+
+    GF_ASSERT(this != NULL);
+
+    conf = this->private;
+    GF_VALIDATE_OR_GOTO(this->name, (conf != NULL), out);
+
+    /* The op-version gate comes before the argument checks. */
+    if (conf->op_version < GD_OP_VERSION_3_6_0)
+        return 0;
+
+    GF_VALIDATE_OR_GOTO(this->name, (peerinfo != NULL), out);
+    GF_VALIDATE_OR_GOTO(this->name, (dict != NULL), out);
+    GF_VALIDATE_OR_GOTO(this->name, (prefix != NULL), out);
+
+    cds_list_for_each_entry(entry, &peerinfo->hostnames, hostname_list)
+    {
+        snprintf(key, sizeof(key), "%s.hostname%d", prefix, nr_addrs);
+        ret = dict_set_dynstr_with_alloc(dict, key, entry->hostname);
+        if (ret != 0) {
+            gf_smsg(this->name, GF_LOG_ERROR, 0, GD_MSG_DICT_SET_FAILED,
+                    "Key=%s", key, NULL);
+            goto out;
+        }
+        nr_addrs++;
+    }
+
+    keylen = snprintf(key, sizeof(key), "%s.hostname_count", prefix);
+    ret = dict_set_int32n(dict, key, keylen, nr_addrs);
+
+out:
+    return ret;
+}
+
+/* Record the details of @peerinfo in @friends under keys that start with
+ * "friend<count>": ".uuid", ".hostname", ".port", ".stateId", ".state" and
+ * ".connected", followed by the peer's hostname list via
+ * gd_add_peer_hostnames_to_dict().
+ * Returns 0 on success and non-zero as soon as one key cannot be set.
+ */
+int
+gd_add_peer_detail_to_dict(glusterd_peerinfo_t *peerinfo, dict_t *friends,
+                           int count)
+{
+    xlator_t *this = THIS;
+    char key[32] = {
+        0,
+    };
+    int len = 0;
+    int ret = -1;
+
+    GF_ASSERT(this);
+    GF_ASSERT(peerinfo);
+    GF_ASSERT(friends);
+
+    len = snprintf(key, sizeof(key), "friend%d.uuid", count);
+    ret = dict_set_strn(friends, key, len, gd_peer_uuid_str(peerinfo));
+    if (ret != 0) {
+        gf_smsg(this->name, GF_LOG_ERROR, 0, GD_MSG_DICT_SET_FAILED, "Key=%s",
+                key, NULL);
+        goto out;
+    }
+
+    len = snprintf(key, sizeof(key), "friend%d.hostname", count);
+    ret = dict_set_strn(friends, key, len, peerinfo->hostname);
+    if (ret != 0) {
+        gf_smsg(this->name, GF_LOG_ERROR, 0, GD_MSG_DICT_SET_FAILED, "Key=%s",
+                key, NULL);
+        goto out;
+    }
+
+    len = snprintf(key, sizeof(key), "friend%d.port", count);
+    ret = dict_set_int32n(friends, key, len, peerinfo->port);
+    if (ret != 0) {
+        gf_smsg(this->name, GF_LOG_ERROR, 0, GD_MSG_DICT_SET_FAILED, "Key=%s",
+                key, NULL);
+        goto out;
+    }
+
+    /* The state is stored twice: numerically and as its printable name. */
+    len = snprintf(key, sizeof(key), "friend%d.stateId", count);
+    ret = dict_set_int32n(friends, key, len, peerinfo->state.state);
+    if (ret != 0) {
+        gf_smsg(this->name, GF_LOG_ERROR, 0, GD_MSG_DICT_SET_FAILED,
+                "Key=%s in dict", key, NULL);
+        goto out;
+    }
+
+    len = snprintf(key, sizeof(key), "friend%d.state", count);
+    ret = dict_set_strn(
+        friends, key, len,
+        glusterd_friend_sm_state_name_get(peerinfo->state.state));
+    if (ret != 0) {
+        gf_smsg(this->name, GF_LOG_ERROR, 0, GD_MSG_DICT_SET_FAILED, "key=%s",
+                key, NULL);
+        goto out;
+    }
+
+    len = snprintf(key, sizeof(key), "friend%d.connected", count);
+    ret = dict_set_int32n(friends, key, len, (int32_t)peerinfo->connected);
+    if (ret != 0) {
+        gf_smsg(this->name, GF_LOG_ERROR, 0, GD_MSG_DICT_SET_FAILED, "Key=%s",
+                key, NULL);
+        goto out;
+    }
+
+    /* "friend<count>" is the prefix for the per-address keys. */
+    snprintf(key, sizeof(key), "friend%d", count);
+    ret = gd_add_peer_hostnames_to_dict(peerinfo, friends, key);
+
+out:
+    return ret;
+}
+
+/* glusterd_peerinfo_find_by_generation walks the peer list looking for the
+ * peer whose generation number equals @generation.
+ * Returns the matching peerinfo, or NULL when there is none. The outcome is
+ * reported through a debug message either way.
+ */
+glusterd_peerinfo_t *
+glusterd_peerinfo_find_by_generation(uint32_t generation)
+{
+    xlator_t *this = THIS;
+    glusterd_conf_t *priv = NULL;
+    glusterd_peerinfo_t *entry = NULL;
+    glusterd_peerinfo_t *found = NULL;
+    glusterd_friend_sm_state_t state;
+
+    GF_ASSERT(this);
+
+    priv = this->private;
+
+    GF_ASSERT(priv);
+
+    RCU_READ_LOCK;
+    cds_list_for_each_entry_rcu(entry, &priv->peers, uuid_list)
+    {
+        if (entry->generation != generation)
+            continue;
+
+        found = entry; /* Probably should be rcu_dereferenced */
+        /* Capture the state inside the read-side section for the log. */
+        state = found->state.state;
+        break;
+    }
+    RCU_READ_UNLOCK;
+
+    if (!found) {
+        gf_msg_debug(this->name, 0,
+                     "Friend with generation: %" PRIu32 ", not found",
+                     generation);
+    } else {
+        gf_msg_debug(this->name, 0, "Friend found... state: %s",
+                     glusterd_friend_sm_state_name_get(state));
+    }
+
+    return found;
+}
+
+/* Count the entries currently in conf->peers.
+ * Returns the number of peers, or 0 when validation of the xlator or its
+ * private configuration fails.
+ */
+int
+glusterd_get_peers_count(void)
+{
+    int count = 0;
+    xlator_t *this = NULL;
+    glusterd_conf_t *conf = NULL;
+    glusterd_peerinfo_t *peer = NULL;
+
+    this = THIS;
+    GF_VALIDATE_OR_GOTO("glusterd", this, out);
+
+    conf = this->private;
+    GF_VALIDATE_OR_GOTO(this->name, conf, out);
+
+    RCU_READ_LOCK;
+    cds_list_for_each_entry_rcu(peer, &conf->peers, uuid_list)
+    {
+        count++;
+    }
+    RCU_READ_UNLOCK;
+
+out:
+    return count;
+}
diff --git a/xlators/mgmt/glusterd/src/glusterd-peer-utils.h b/xlators/mgmt/glusterd/src/glusterd-peer-utils.h
new file mode 100644
index 00000000000..fd254d57391
--- /dev/null
+++ b/xlators/mgmt/glusterd/src/glusterd-peer-utils.h
@@ -0,0 +1,82 @@
+/*
+   Copyright (c) 2014 Red Hat, Inc. <http://www.redhat.com>
+   This file is part of GlusterFS.
+
+   This file is licensed to you under your choice of the GNU Lesser
+   General Public License, version 3 or any later version (LGPLv3 or
+   later), or the GNU General Public License, version 2 (GPLv2), in all
+   cases as published by the Free Software Foundation.
+*/
+
+#ifndef _GLUSTERD_PEER_UTILS_H
+#define _GLUSTERD_PEER_UTILS_H
+
+#include "glusterd.h"
+
+int32_t
+glusterd_peerinfo_cleanup(glusterd_peerinfo_t *peerinfo);
+
+glusterd_peerinfo_t *
+glusterd_peerinfo_find_by_hostname(const char *hoststr);
+
+int
+glusterd_hostname_to_uuid(char *hostname, uuid_t uuid);
+
+glusterd_peerinfo_t *
+glusterd_peerinfo_find_by_uuid(uuid_t uuid);
+
+glusterd_peerinfo_t *
+glusterd_peerinfo_find(uuid_t uuid, const char *hostname);
+
+/* Create a peerinfo; returns NULL on failure. */
+glusterd_peerinfo_t *
+glusterd_peerinfo_new(glusterd_friend_sm_state_t state, uuid_t *uuid,
+                      const char *hostname, int port);
+
+/* True when every peer except @skip_uuid is connected and befriended. */
+gf_boolean_t
+glusterd_chk_peers_connected_befriended(uuid_t skip_uuid);
+
+/* Returns an allocated hostname for @uuid, or NULL when it is unknown. */
+char *
+glusterd_uuid_to_hostname(uuid_t uuid);
+
+/* Returns peerinfo->uuid_str, or NULL for a NULL peer or a null uuid. */
+char *
+gd_peer_uuid_str(glusterd_peerinfo_t *peerinfo);
+
+/* True when every known peer is connected. */
+gf_boolean_t
+glusterd_are_all_peers_up(void);
+
+/* True when every peer owning a brick of @volinfo is connected and
+ * befriended; otherwise *@down_peerstr receives a copy of the hostname of
+ * the first such peer found down.
+ */
+gf_boolean_t
+glusterd_are_vol_all_peers_up(glusterd_volinfo_t *volinfo,
+                              struct cds_list_head *peers, char **down_peerstr);
+
+/* Allocate a hostname list entry for @hostname; returns 0 on success. */
+int32_t
+glusterd_peer_hostname_new(const char *hostname,
+                           glusterd_peer_hostname_t **name);
+/* Unlink and free a hostname list entry; NULL is accepted. */
+void
+glusterd_peer_hostname_free(glusterd_peer_hostname_t *name);
+
+/* True when @address is already in @peerinfo's hostname list. */
+gf_boolean_t
+gd_peer_has_address(glusterd_peerinfo_t *peerinfo, const char *address);
+
+/* Add @address to @peerinfo's hostname list if it is not there yet. */
+int
+gd_add_address_to_peer(glusterd_peerinfo_t *peerinfo, const char *address);
+
+/* Serialise @friend into @dict using keys that start with @prefix. */
+int
+gd_add_friend_to_dict(glusterd_peerinfo_t *friend, dict_t *dict,
+                      const char *prefix);
+
+/* Update @peerinfo's hostnames from the @prefix keys found in @dict. */
+int
+gd_update_peerinfo_from_dict(glusterd_peerinfo_t *peerinfo, dict_t *dict,
+                             const char *prefix);
+
+/* Build a peerinfo from the @prefix keys in @dict; NULL on failure. */
+glusterd_peerinfo_t *
+gd_peerinfo_from_dict(dict_t *dict, const char *prefix);
+
+/* Store @peerinfo's details in @friends under "friend<count>" keys. */
+int
+gd_add_peer_detail_to_dict(glusterd_peerinfo_t *peerinfo, dict_t *friends,
+                           int count);
+/* Find the peer with generation number @generation; NULL if none. */
+glusterd_peerinfo_t *
+glusterd_peerinfo_find_by_generation(uint32_t generation);
+
+/* Number of peers currently in the peer list. */
+int
+glusterd_get_peers_count(void);
+#endif /* _GLUSTERD_PEER_UTILS_H */
diff --git a/xlators/mgmt/glusterd/src/glusterd-pmap.c b/xlators/mgmt/glusterd/src/glusterd-pmap.c
new file mode 100644
index 00000000000..16ac628ab82
--- /dev/null
+++ b/xlators/mgmt/glusterd/src/glusterd-pmap.c
@@ -0,0 +1,666 @@
+/*
+   Copyright (c) 2010-2013 Red Hat, Inc. <http://www.redhat.com>
+   This file is part of GlusterFS.
+
+   This file is licensed to you under your choice of the GNU Lesser
+   General Public License, version 3 or any later version (LGPLv3 or
+   later), or the GNU General Public License, version 2 (GPLv2), in all
+   cases as published by the Free Software Foundation.
+*/
+
+#include <glusterfs/xlator.h>
+#include <glusterfs/glusterfs.h>
+#include <glusterfs/syscall.h>
+#include <glusterfs/compat-errno.h>
+
+#include "glusterd.h"
+#include "glusterd-utils.h"
+
+#include "portmap-xdr.h"
+#include "xdr-generic.h"
+#include "protocol-common.h"
+#include "glusterd-messages.h"
+#include "rpcsvc.h"
+
+#include <sys/socket.h>
+#include <sys/types.h>
+#include <netinet/in.h>
+
+/* Probe whether @port can currently be bound with an IPv4 stream socket.
+ *
+ * Returns 1 when bind() succeeds, 0 when bind() fails and -1 when the probe
+ * socket cannot be created. The callers visible in this file only test the
+ * result for truth, so for them -1 counts the same as 1.
+ */
+static int
+pmap_port_isfree(int port)
+{
+    struct sockaddr_in addr;
+    int fd = -1;
+    int bind_ret = 0;
+
+    memset(&addr, 0, sizeof(addr));
+    addr.sin_family = PF_INET;
+    addr.sin_port = hton16(port);
+
+    fd = socket(PF_INET, SOCK_STREAM, 0);
+    if (fd == -1)
+        return -1;
+
+    /* The socket only exists for the probe; close it straight away. */
+    bind_ret = bind(fd, (struct sockaddr *)&addr, sizeof(addr));
+    sys_close(fd);
+
+    if (bind_ret != 0)
+        return 0;
+
+    return 1;
+}
+
+/* Allocate a port-map registry for @this and classify every port between
+ * the configured base_port and max_port as GF_PMAP_PORT_FREE or
+ * GF_PMAP_PORT_FOREIGN according to pmap_port_isfree().
+ * Returns NULL when the allocation fails.
+ */
+static struct pmap_registry *
+pmap_registry_new(xlator_t *this)
+{
+    glusterd_conf_t *conf = this->private;
+    struct pmap_registry *pmap = NULL;
+    int port = 0;
+
+    pmap = CALLOC(sizeof(*pmap), 1);
+    if (pmap == NULL)
+        return NULL;
+
+    /* Nothing has been handed out yet, so last_alloc starts at the base. */
+    pmap->last_alloc = conf->base_port;
+    pmap->base_port = pmap->last_alloc;
+    pmap->max_port = conf->max_port;
+
+    for (port = pmap->base_port; port <= pmap->max_port; port++) {
+        pmap->ports[port].type = pmap_port_isfree(port)
+                                     ? GF_PMAP_PORT_FREE
+                                     : GF_PMAP_PORT_FOREIGN;
+    }
+
+    return pmap;
+}
+
+/* Return the port-map registry stored in @this's private data, creating it
+ * on first use. Returns NULL when the registry cannot be created.
+ */
+struct pmap_registry *
+pmap_registry_get(xlator_t *this)
+{
+    glusterd_conf_t *priv = this->private;
+
+    /* A failed creation leaves priv->pmap NULL, so a later call retries. */
+    if (priv->pmap == NULL)
+        priv->pmap = pmap_registry_new(this);
+
+    return priv->pmap;
+}
+
+/*
+ * Search the registry, from last_alloc down to base_port, for a port of the
+ * given @type whose whitespace-separated brick name list contains exactly
+ * @brickname. Returns the port number, or 0 when nothing matches or the
+ * registry is unavailable.
+ *
+ * The "destroy" argument avoids a double search in pmap_registry_remove - one
+ * to find the entry in the table, and the other to find the particular
+ * brickname within that entry (which might cover multiple bricks).  We do the
+ * actual deletion here by "whiting out" the brick name with spaces.  It's up
+ * to pmap_registry_remove to figure out what to do from there.
+ */
+int
+pmap_registry_search(xlator_t *this, const char *brickname,
+                     gf_pmap_port_type_t type, gf_boolean_t destroy)
+{
+    struct pmap_registry *pmap = NULL;
+    int p = 0;
+    char *brck = NULL;
+    size_t i;
+
+    pmap = pmap_registry_get(this);
+    if (!pmap)
+        return 0; /* registry could not be created: nothing to find */
+
+    for (p = pmap->last_alloc; p >= pmap->base_port; p--) {
+        if (!pmap->ports[p].brickname || pmap->ports[p].type != type)
+            continue;
+
+        brck = pmap->ports[p].brickname;
+        for (;;) {
+            /* i becomes the length of the current name. The cast keeps
+             * isspace() defined for bytes with the high bit set. */
+            for (i = 0; brck[i] && !isspace((unsigned char)brck[i]); ++i)
+                ;
+            if (i == 0 && brck[i] == '\0')
+                break;
+
+            if (strncmp(brck, brickname, i) == 0) {
+                /*
+                 * Without this check, we'd break when brck
+                 * is merely a substring of brickname.
+                 */
+                if (brickname[i] == '\0') {
+                    /* memset() is a no-op for i == 0, whereas a
+                     * do/while (--i) loop would wrap around. */
+                    if (destroy)
+                        memset(brck, ' ', i);
+                    return p;
+                }
+            }
+
+            brck += i;
+
+            /*
+             * Skip over *any* amount of whitespace, including
+             * none (if we're already at the end of the string).
+             */
+            while (isspace((unsigned char)*brck))
+                ++brck;
+            /*
+             * We're either at the end of the string (which will be
+             * handled above strncmp on the next iteration) or at
+             * the next non-whitespace substring (which will be
+             * handled by strncmp itself).
+             */
+        }
+    }
+
+    return 0;
+}
+
+/* Search the registry, from last_alloc down to base_port, for the port that
+ * is bound to transport @xprt and whose type equals @type (any type when
+ * @type is GF_PMAP_PORT_ANY).
+ * Returns the port number, or 0 when there is no match or the registry is
+ * unavailable.
+ */
+static int
+pmap_registry_search_by_xprt(xlator_t *this, void *xprt,
+                             gf_pmap_port_type_t type)
+{
+    struct pmap_registry *pmap = NULL;
+    int p = 0;
+    int port = 0;
+
+    pmap = pmap_registry_get(this);
+    if (!pmap)
+        return 0; /* registry could not be created: nothing to find */
+
+    for (p = pmap->last_alloc; p >= pmap->base_port; p--) {
+        /* Entries without a transport never match, even for a NULL xprt. */
+        if (!pmap->ports[p].xprt)
+            continue;
+        if (pmap->ports[p].xprt == xprt) {
+            if (pmap->ports[p].type == type || type == GF_PMAP_PORT_ANY) {
+                port = p;
+                break;
+            }
+        }
+    }
+
+    return port;
+}
+
+/* Return the brick name list registered on @port when that port is of type
+ * GF_PMAP_PORT_BRICKSERVER.
+ * Returns NULL for any other port type, for a port outside 0..max_port, or
+ * when the registry is unavailable. The returned pointer is the registry's
+ * own string, not a copy.
+ */
+static char *
+pmap_registry_search_by_port(xlator_t *this, int port)
+{
+    struct pmap_registry *pmap = NULL;
+    char *brickname = NULL;
+    int max_port = 0;
+
+    max_port = ((glusterd_conf_t *)(this->private))->max_port;
+    /* A negative port would index before the start of the ports table. */
+    if (port < 0 || port > max_port)
+        goto out;
+
+    pmap = pmap_registry_get(this);
+    if (!pmap)
+        goto out;
+
+    if (pmap->ports[port].type == GF_PMAP_PORT_BRICKSERVER)
+        brickname = pmap->ports[port].brickname;
+
+out:
+    return brickname;
+}
+
+/* Lease the lowest port between base_port and max_port that is marked
+ * GF_PMAP_PORT_FREE or GF_PMAP_PORT_FOREIGN and that pmap_port_isfree()
+ * currently reports as available. The port is marked GF_PMAP_PORT_LEASED and
+ * last_alloc is raised if needed.
+ * Returns the leased port, or 0 when no port could be leased or the
+ * registry is unavailable.
+ */
+int
+pmap_registry_alloc(xlator_t *this)
+{
+    struct pmap_registry *pmap = NULL;
+    int p = 0;
+    int port = 0;
+
+    pmap = pmap_registry_get(this);
+    if (!pmap)
+        return 0; /* registry could not be created: no port to hand out */
+
+    for (p = pmap->base_port; p <= pmap->max_port; p++) {
+        /* GF_PMAP_PORT_FOREIGN may be freed up ? */
+        if ((pmap->ports[p].type == GF_PMAP_PORT_FREE) ||
+            (pmap->ports[p].type == GF_PMAP_PORT_FOREIGN)) {
+            if (pmap_port_isfree(p)) {
+                pmap->ports[p].type = GF_PMAP_PORT_LEASED;
+                port = p;
+                break;
+            }
+        }
+    }
+
+    if (port > pmap->last_alloc)
+        pmap->last_alloc = port;
+
+    return port;
+}
+
+/* pmap_assign_port does a pmap_registry_remove followed by pmap_registry_alloc,
+ * the reason for the former is to ensure we don't end up with stale ports
+ *
+ * The removal is only attempted when @old_port is non-zero and is keyed on
+ * @path. Returns the newly allocated port as reported by
+ * pmap_registry_alloc().
+ */
+int
+pmap_assign_port(xlator_t *this, int old_port, const char *path)
+{
+    int ret = -1;
+    int new_port = 0;
+
+    if (old_port) {
+        ret = pmap_registry_remove(this, 0, path, GF_PMAP_PORT_BRICKSERVER,
+                                   NULL, _gf_false);
+        if (ret) {
+            /* gf_msg() takes the errno value before the message id. */
+            gf_msg(this->name, GF_LOG_WARNING, 0,
+                   GD_MSG_PMAP_REGISTRY_REMOVE_FAIL,
+                   "Failed to remove pmap registry for older signin for "
+                   "path %s",
+                   path);
+        }
+    }
+    new_port = pmap_registry_alloc(this);
+    return new_port;
+}
+
+/* Register @brickname on @port with the given @type and transport @xprt.
+ * If the port already carries brick names, the new name is appended to the
+ * space-separated list.
+ *
+ * Returns 0 when the brick was registered and also when the request is
+ * ignored (port out of range, or port currently marked GF_PMAP_PORT_FREE).
+ * Returns -1 when the registry is unavailable or memory for the name cannot
+ * be allocated; the existing entry is left untouched in that case.
+ */
+int
+pmap_registry_bind(xlator_t *this, int port, const char *brickname,
+                   gf_pmap_port_type_t type, void *xprt)
+{
+    struct pmap_registry *pmap = NULL;
+    int p = 0;
+    char *old_names = NULL;
+    char *new_names = NULL;
+
+    pmap = pmap_registry_get(this);
+    if (!pmap)
+        return -1;
+
+    /* A negative port would index before the start of the ports table. */
+    if (port < 0 || port > pmap->max_port)
+        goto out;
+
+    p = port;
+    if (pmap->ports[p].type == GF_PMAP_PORT_FREE) {
+        /* Because of some crazy race in volume start code path because
+         * of friend handshaking with volumes with quorum enabled we
+         * might end up into a situation where glusterd would start a
+         * brick and get a disconnect and then immediately try to start
+         * the same brick instance based on another friend update
+         * request. And then if for the very first brick even if the
+         * process doesn't come up at the end sign in event gets sent
+         * and we end up having two duplicate portmap entries for the
+         * same brick. Since in brick start we mark the previous port as
+         * free, its better to consider a sign in request as no op if
+         * the corresponding port type is marked as free
+         */
+        goto out;
+    }
+
+    /* Build the new name list first and only swap it in on success.
+     * asprintf() leaves its output pointer undefined on failure, so it must
+     * not write straight into the registry slot.
+     */
+    old_names = pmap->ports[p].brickname;
+    if (old_names) {
+        if (asprintf(&new_names, "%s %s", old_names, brickname) < 0) {
+            gf_smsg(this->name, GF_LOG_ERROR, errno, GD_MSG_NO_MEMORY, NULL);
+            return -1;
+        }
+    } else {
+        new_names = strdup(brickname);
+        if (!new_names) {
+            gf_smsg(this->name, GF_LOG_ERROR, errno, GD_MSG_NO_MEMORY, NULL);
+            return -1;
+        }
+    }
+    pmap->ports[p].brickname = new_names;
+    free(old_names);
+
+    pmap->ports[p].type = type;
+    pmap->ports[p].xprt = xprt;
+
+    gf_msg("pmap", GF_LOG_INFO, 0, GD_MSG_BRICK_ADD,
+           "adding brick %s on port %d", brickname, port);
+
+    if (pmap->last_alloc < p)
+        pmap->last_alloc = p;
+out:
+    return 0;
+}
+
+/* Add @brickname to the space-separated brick name list of @port, unless it
+ * is already present as a whole word. Only ports of type
+ * GF_PMAP_PORT_LEASED or GF_PMAP_PORT_BRICKSERVER can be extended.
+ *
+ * Returns 0 when the name is present after the call. Returns -1 when the
+ * registry is unavailable, the port is out of range or of another type, or
+ * memory cannot be allocated; the existing list is kept in that case.
+ */
+int
+pmap_registry_extend(xlator_t *this, int port, const char *brickname)
+{
+    struct pmap_registry *pmap = NULL;
+    char *old_bn;
+    char *new_bn = NULL;
+    size_t bn_len;
+    char *entry;
+    int found = 0;
+
+    pmap = pmap_registry_get(this);
+    if (!pmap) {
+        return -1;
+    }
+
+    /* A negative port would index before the start of the ports table. */
+    if (port < 0 || port > pmap->max_port) {
+        return -1;
+    }
+
+    switch (pmap->ports[port].type) {
+        case GF_PMAP_PORT_LEASED:
+        case GF_PMAP_PORT_BRICKSERVER:
+            break;
+        default:
+            return -1;
+    }
+
+    old_bn = pmap->ports[port].brickname;
+    if (old_bn) {
+        bn_len = strlen(brickname);
+        /* NOTE(review): an empty @brickname makes strstr() return the same
+         * position forever; presumably callers never pass one - confirm.
+         */
+        entry = strstr(old_bn, brickname);
+        while (entry) {
+            /* A hit only counts when it is delimited by the start/end of
+             * the string or by spaces on both sides. */
+            found = 1;
+            if ((entry != old_bn) && (entry[-1] != ' ')) {
+                found = 0;
+            }
+            if ((entry[bn_len] != ' ') && (entry[bn_len] != '\0')) {
+                found = 0;
+            }
+            if (found) {
+                return 0;
+            }
+            entry = strstr(entry + bn_len, brickname);
+        }
+        /* On failure asprintf() leaves new_bn undefined, so test the return
+         * value rather than the pointer. */
+        if (asprintf(&new_bn, "%s %s", old_bn, brickname) < 0) {
+            return -1;
+        }
+    } else {
+        new_bn = strdup(brickname);
+    }
+
+    if (!new_bn) {
+        return -1;
+    }
+
+    pmap->ports[port].brickname = new_bn;
+    free(old_bn);
+
+    return 0;
+}
+
+/* Remove a brick registration from the port-map registry.
+ *
+ * The entry is located by @brickname first (which also whites the name out
+ * in the entry's list) and, failing that, by transport @xprt. A non-zero
+ * @port is only checked against max_port. When the entry's transport equals
+ * @xprt it is cleared. The entry is then released and marked
+ * GF_PMAP_PORT_FREE if @brick_disconnect is set, or if it has no transport
+ * left and its name list is NULL or consists of spaces only.
+ *
+ * Always returns 0.
+ */
+int
+pmap_registry_remove(xlator_t *this, int port, const char *brickname,
+                     gf_pmap_port_type_t type, void *xprt,
+                     gf_boolean_t brick_disconnect)
+{
+    glusterd_conf_t *priv = this->private;
+    struct pmap_registry *pmap = priv->pmap;
+    const char *names = NULL;
+    int slot = 0;
+
+    if (pmap == NULL)
+        return 0;
+
+    if (port && (port > pmap->max_port))
+        return 0;
+
+    if (brickname)
+        slot = pmap_registry_search(this, brickname, type, _gf_true);
+
+    if (!slot && xprt)
+        slot = pmap_registry_search_by_xprt(this, xprt, type);
+
+    if (!slot)
+        return 0;
+
+    gf_msg("pmap", GF_LOG_INFO, 0, GD_MSG_BRICK_REMOVE,
+           "removing brick %s on port %d", brickname, slot);
+
+    if (xprt && (pmap->ports[slot].xprt == xprt)) {
+        pmap->ports[slot].xprt = NULL;
+    }
+
+    /*
+     * This is where we garbage-collect.  A brick disconnect cleans up all
+     * the bricks on the port (in case of brick mux).  Otherwise the entry
+     * is only deleted when there is no xprt and every brick name has been
+     * "whited out" by pmap_registry_search(...,destroy=_gf_true), i.e.
+     * nothing worth saving is left.
+     */
+    if (!brick_disconnect) {
+        if (pmap->ports[slot].xprt)
+            return 0;
+
+        names = pmap->ports[slot].brickname;
+        if (names) {
+            for (; *names != '\0'; names++) {
+                if (*names != ' ')
+                    return 0;
+            }
+        }
+    }
+
+    free(pmap->ports[slot].brickname);
+    pmap->ports[slot].brickname = NULL;
+    pmap->ports[slot].type = GF_PMAP_PORT_FREE;
+
+    return 0;
+}
+
+/*
+ * PORTBYBRICK handler body: decode the request, look the brick up in the
+ * port registry and reply with its port. A lookup result of 0 is reported
+ * as op_ret = -1. Always returns 0.
+ * Invoked through glusterd_big_locked_handler() by gluster_pmap_portbybrick().
+ */
+int
+__gluster_pmap_portbybrick(rpcsvc_request_t *req)
+{
+    pmap_port_by_brick_req request = {0};
+    pmap_port_by_brick_rsp reply = {0};
+    xlator_t *this = THIS;
+    int decoded = -1;
+    int port = 0;
+
+    GF_ASSERT(this);
+
+    decoded = xdr_to_generic(req->msg[0], &request,
+                             (xdrproc_t)xdr_pmap_port_by_brick_req);
+    if (decoded >= 0) {
+        /* Non-destructive lookup (destroy = _gf_false). */
+        port = pmap_registry_search(this, request.brick,
+                                    GF_PMAP_PORT_BRICKSERVER, _gf_false);
+        if (port == 0)
+            reply.op_ret = -1;
+
+        reply.port = port;
+    } else {
+        /* Undecodable request: flag it and still send the zeroed reply. */
+        req->rpc_err = GARBAGE_ARGS;
+        gf_smsg(this->name, GF_LOG_ERROR, errno, GD_MSG_GARBAGE_ARGS, NULL);
+    }
+
+    glusterd_submit_reply(req, &reply, NULL, 0, NULL,
+                          (xdrproc_t)xdr_pmap_port_by_brick_rsp);
+    /* The XDR decoder malloc'ed request.brick. */
+    free(request.brick);
+
+    return 0;
+}
+
+int
+gluster_pmap_portbybrick(rpcsvc_request_t *req)
+{
+    return glusterd_big_locked_handler(req, __gluster_pmap_portbybrick);
+}
+
+/*
+ * BRICKBYPORT handler body: decode the request and reply with the brick
+ * name registered on the requested port; when nothing is registered the
+ * reply carries op_ret = -1 and an empty string. Always returns 0.
+ */
+int
+__gluster_pmap_brickbyport(rpcsvc_request_t *req)
+{
+    pmap_brick_by_port_req request = {0};
+    pmap_brick_by_port_rsp reply = {0};
+    xlator_t *this = THIS;
+    int decoded = -1;
+    GF_ASSERT(this);
+
+    decoded = xdr_to_generic(req->msg[0], &request,
+                             (xdrproc_t)xdr_pmap_brick_by_port_req);
+    if (decoded < 0) {
+        /* NOTE(review): reply.brick is still NULL here; check the encoder. */
+        req->rpc_err = GARBAGE_ARGS;
+        gf_smsg(this->name, GF_LOG_ERROR, errno, GD_MSG_GARBAGE_ARGS, NULL);
+    } else {
+        reply.brick = pmap_registry_search_by_port(THIS, request.port);
+        if (reply.brick == NULL) {
+            reply.op_ret = -1;
+            reply.brick = "";
+        }
+    }
+
+    glusterd_submit_reply(req, &reply, NULL, 0, NULL,
+                          (xdrproc_t)xdr_pmap_brick_by_port_rsp);
+
+    return 0;
+}
+
+int
+gluster_pmap_brickbyport(rpcsvc_request_t *req)
+{
+    return glusterd_big_locked_handler(req, __gluster_pmap_brickbyport);
+}
+
+/*
+ * SIGNIN handler body: bind the announced brick to its port in the
+ * registry (the bind result becomes op_ret) and mark the matching
+ * brickinfo, if one is found, as port-registered. Always returns 0.
+ */
+int
+__gluster_pmap_signin(rpcsvc_request_t *req)
+{
+    pmap_signin_req request = {0};
+    pmap_signin_rsp reply = {0};
+    glusterd_brickinfo_t *brick = NULL;
+    xlator_t *this = THIS;
+    int rc = -1;
+    GF_ASSERT(this);
+
+    rc = xdr_to_generic(req->msg[0], &request,
+                        (xdrproc_t)xdr_pmap_signin_req);
+    if (rc >= 0) {
+        reply.op_ret = pmap_registry_bind(THIS, request.port, request.brick,
+                                          GF_PMAP_PORT_BRICKSERVER,
+                                          req->trans);
+        /* The lookup status itself is unused; only 'brick' matters. */
+        rc = glusterd_get_brickinfo(THIS, request.brick, request.port, &brick);
+        if (brick != NULL)
+            brick->port_registered = _gf_true;
+    } else {
+        req->rpc_err = GARBAGE_ARGS;
+        gf_smsg(this->name, GF_LOG_ERROR, errno, GD_MSG_GARBAGE_ARGS, NULL);
+    }
+
+    glusterd_submit_reply(req, &reply, NULL, 0, NULL,
+                          (xdrproc_t)xdr_pmap_signin_rsp);
+    free(request.brick); /* malloc'ed by the XDR decoder */
+
+    return 0;
+}
+
+int
+gluster_pmap_signin(rpcsvc_request_t *req)
+{
+    return glusterd_big_locked_handler(req, __gluster_pmap_signin);
+}
+
+/*
+ * SIGNOUT handler body: drop the brick (and its ".rdma" twin when an rdma
+ * port is given) from the port registry and clear port_registered on the
+ * matching brickinfo. When both volinfo and brickinfo are found, also
+ * unlink the brick's pid file, mark the brick stopped and detach it from
+ * its brick process. Always returns 0.
+ */
+int
+__gluster_pmap_signout(rpcsvc_request_t *req)
+{
+    pmap_signout_req request = {0};
+    pmap_signout_rsp reply = {0};
+    int ret = -1;
+    xlator_t *this = NULL;
+    glusterd_conf_t *conf = NULL;
+    glusterd_volinfo_t *volinfo = NULL;
+    glusterd_brickinfo_t *brickinfo = NULL;
+    char pidfile[PATH_MAX] = {0};
+    char rdma_brick[PATH_MAX] = {0};
+
+    this = THIS;
+    GF_VALIDATE_OR_GOTO("glusterd", this, fail);
+    conf = this->private;
+    GF_VALIDATE_OR_GOTO(this->name, conf, fail);
+
+    ret = xdr_to_generic(req->msg[0], &request,
+                         (xdrproc_t)xdr_pmap_signout_req);
+    if (ret < 0) {
+        /* Undecodable request. */
+        req->rpc_err = GARBAGE_ARGS;
+        gf_smsg(this->name, GF_LOG_ERROR, errno, GD_MSG_GARBAGE_ARGS, NULL);
+        goto fail;
+    }
+
+    reply.op_ret = pmap_registry_remove(THIS, request.port, request.brick,
+                                        GF_PMAP_PORT_BRICKSERVER, req->trans,
+                                        _gf_false);
+
+    ret = glusterd_get_brickinfo(THIS, request.brick, request.port,
+                                 &brickinfo);
+    if (request.rdma_port) {
+        snprintf(rdma_brick, PATH_MAX, "%s.rdma", request.brick);
+        reply.op_ret = pmap_registry_remove(
+            THIS, request.rdma_port, rdma_brick, GF_PMAP_PORT_BRICKSERVER,
+            req->trans, _gf_false);
+    }
+
+    /* Update portmap status on brickinfo */
+    if (brickinfo)
+        brickinfo->port_registered = _gf_false;
+
+    /* glusterfsd no longer removes its own pid file, so do it here;
+     * otherwise a brick killed from the backend would leave a stale one.
+     */
+    ret = glusterd_get_volinfo_from_brick(request.brick, &volinfo);
+    if ((ret == 0) && volinfo && brickinfo) {
+        GLUSTERD_GET_BRICK_PIDFILE(pidfile, volinfo, brickinfo, conf);
+        sys_unlink(pidfile);
+
+        /* Record the brick as stopped so that glusterd's view stays
+         * correct when the brick was killed from the backend.
+         */
+        brickinfo->status = GF_BRICK_STOPPED;
+
+        /* Detach the brick from its brick process unless the brick-op
+         * phase already did; this is needed when the brick was killed
+         * explicitly from the backend. */
+        ret = glusterd_brick_process_remove_brick(brickinfo, NULL);
+        if (ret) {
+            gf_msg_debug(this->name, 0,
+                         "Couldn't remove "
+                         "brick %s:%s from brick process",
+                         brickinfo->hostname, brickinfo->path);
+            /* Ignore 'ret' here since the brick might
+             * have already been deleted in brick op phase
+             */
+            ret = 0;
+        }
+    }
+
+fail:
+    glusterd_submit_reply(req, &reply, NULL, 0, NULL,
+                          (xdrproc_t)xdr_pmap_signout_rsp);
+    free(request.brick); /* malloc'ed by the XDR decoder */
+
+    return 0;
+}
+
+int
+gluster_pmap_signout(rpcsvc_request_t *req)
+{
+    return glusterd_big_locked_handler(req, __gluster_pmap_signout);
+}
+
+static rpcsvc_actor_t gluster_pmap_actors[GF_PMAP_MAXVALUE] = {
+    [GF_PMAP_NULL] = {"NULL", NULL, NULL, GF_PMAP_NULL, DRC_NA, 0},
+    [GF_PMAP_PORTBYBRICK] = {"PORTBYBRICK", gluster_pmap_portbybrick, NULL,
+                             GF_PMAP_PORTBYBRICK, DRC_NA, 0},
+    [GF_PMAP_BRICKBYPORT] = {"BRICKBYPORT", gluster_pmap_brickbyport, NULL,
+                             GF_PMAP_BRICKBYPORT, DRC_NA, 0},
+    [GF_PMAP_SIGNIN] = {"SIGNIN", gluster_pmap_signin, NULL, GF_PMAP_SIGNIN,
+                        DRC_NA, 0},
+    [GF_PMAP_SIGNOUT] = {"SIGNOUT", gluster_pmap_signout, NULL, GF_PMAP_SIGNOUT,
+                         DRC_NA, 0},
+};
+
+struct rpcsvc_program gluster_pmap_prog = {
+    .progname = "Gluster Portmap",
+    .prognum = GLUSTER_PMAP_PROGRAM,
+    .progver = GLUSTER_PMAP_VERSION,
+    .actors = gluster_pmap_actors,
+    .numactors = GF_PMAP_MAXVALUE,
+};
diff --git a/xlators/mgmt/glusterd/src/glusterd-pmap.h b/xlators/mgmt/glusterd/src/glusterd-pmap.h
new file mode 100644
index 00000000000..51d75361431
--- /dev/null
+++ b/xlators/mgmt/glusterd/src/glusterd-pmap.h
@@ -0,0 +1,57 @@
+/*
+   Copyright (c) 2010-2012 Red Hat, Inc. <http://www.redhat.com>
+   This file is part of GlusterFS.
+
+   This file is licensed to you under your choice of the GNU Lesser
+   General Public License, version 3 or any later version (LGPLv3 or
+   later), or the GNU General Public License, version 2 (GPLv2), in all
+   cases as published by the Free Software Foundation.
+*/
+#ifndef _GLUSTERD_PMAP_H_
+#define _GLUSTERD_PMAP_H_
+
+#include <pthread.h>
+#include <glusterfs/compat-uuid.h>
+
+#include <glusterfs/glusterfs.h>
+#include <glusterfs/xlator.h>
+#include <glusterfs/logging.h>
+#include <glusterfs/call-stub.h>
+#include <glusterfs/byte-order.h>
+#include "rpcsvc.h"
+
+struct pmap_port_status {
+    char *brickname; /* brick name(s) on this port, space-separated */
+    void *xprt;      /* transport the port was bound with, if any */
+    gf_pmap_port_type_t type;
+};
+
+struct pmap_registry {
+    struct pmap_port_status ports[GF_PORT_MAX + 1]; /* indexed by port */
+    int base_port;
+    int max_port;   /* ports above this are rejected by extend/remove */
+    int last_alloc; /* raised by pmap_registry_bind to the highest port bound */
+};
+
+int
+pmap_assign_port(xlator_t *this, int port, const char *path);
+int
+pmap_mark_port_leased(xlator_t *this, int port);
+int
+pmap_registry_alloc(xlator_t *this);
+int
+pmap_registry_bind(xlator_t *this, int port, const char *brickname,
+                   gf_pmap_port_type_t type, void *xprt);
+int
+pmap_registry_extend(xlator_t *this, int port, const char *brickname);
+int
+pmap_registry_remove(xlator_t *this, int port, const char *brickname,
+                     gf_pmap_port_type_t type, void *xprt,
+                     gf_boolean_t brick_disconnect);
+int
+pmap_registry_search(xlator_t *this, const char *brickname,
+                     gf_pmap_port_type_t type, gf_boolean_t destroy);
+struct pmap_registry *
+pmap_registry_get(xlator_t *this);
+
+#endif
diff --git a/xlators/mgmt/glusterd/src/glusterd-proc-mgmt.c b/xlators/mgmt/glusterd/src/glusterd-proc-mgmt.c
new file mode 100644
index 00000000000..a05c90d7b10
--- /dev/null
+++ b/xlators/mgmt/glusterd/src/glusterd-proc-mgmt.c
@@ -0,0 +1,152 @@
+/*
+   Copyright (c) 2014 Red Hat, Inc. <http://www.redhat.com>
+   This file is part of GlusterFS.
+
+   This file is licensed to you under your choice of the GNU Lesser
+   General Public License, version 3 or any later version (LGPLv3 or
+   later), or the GNU General Public License, version 2 (GPLv2), in all
+   cases as published by the Free Software Foundation.
+*/
+
+#include <stdio.h>
+#include <limits.h>
+#include <signal.h>
+
+#include "glusterd.h"
+#include "glusterd-utils.h"
+#include <glusterfs/common-utils.h>
+#include <glusterfs/xlator.h>
+#include <glusterfs/logging.h>
+#include "glusterd-messages.h"
+#include "glusterd-proc-mgmt.h"
+
+/*
+ * Populate @proc from the given strings.
+ *
+ * Each argument is copied into the same-named fixed-size member with
+ * snprintf(); copying stops at the first snprintf() error.
+ * @proc is dereferenced unconditionally and must not be NULL.
+ *
+ * Returns 0 on success, or the negative snprintf() result on error.
+ * NOTE(review): a truncated copy is not treated as an error here.
+ */
+int
+glusterd_proc_init(glusterd_proc_t *proc, char *name, char *pidfile,
+                   char *logdir, char *logfile, char *volfile, char *volfileid,
+                   char *volfileserver)
+{
+    int rc = -1;
+
+    rc = snprintf(proc->name, sizeof(proc->name), "%s", name);
+
+    if (rc >= 0)
+        rc = snprintf(proc->pidfile, sizeof(proc->pidfile), "%s", pidfile);
+
+    if (rc >= 0)
+        rc = snprintf(proc->logdir, sizeof(proc->logdir), "%s", logdir);
+
+    if (rc >= 0)
+        rc = snprintf(proc->logfile, sizeof(proc->logfile), "%s", logfile);
+
+    if (rc >= 0)
+        rc = snprintf(proc->volfile, sizeof(proc->volfile), "%s", volfile);
+
+    if (rc >= 0)
+        rc = snprintf(proc->volfileid, sizeof(proc->volfileid), "%s",
+                      volfileid);
+
+    if (rc >= 0)
+        rc = snprintf(proc->volfileserver, sizeof(proc->volfileserver), "%s",
+                      volfileserver);
+
+    /* snprintf() lengths are collapsed to 0; errors pass through as-is. */
+    return (rc > 0) ? 0 : rc;
+}
+
+int
+glusterd_proc_stop(glusterd_proc_t *proc, int sig, int flags)
+{
+    /* Send 'sig' to the pid gf_is_service_running() reports for the pidfile;
+     * PROC_STOP_FORCE adds a re-check after synctask_sleep(1) and a SIGKILL.
+     * NB: copy-paste of glusterd_service_stop, which may go away later. */
+    int32_t ret = -1;
+    pid_t pid = -1;
+    xlator_t *this = NULL;
+    glusterd_conf_t *conf = NULL;
+
+    this = THIS;
+    GF_ASSERT(this);
+
+    conf = this->private;
+    GF_ASSERT(conf);
+
+    if (!gf_is_service_running(proc->pidfile, &pid)) {
+        ret = 0;
+        gf_msg(this->name, GF_LOG_INFO, 0, GD_MSG_ALREADY_STOPPED,
+               "%s already stopped", proc->name);
+        goto out;
+    }
+    gf_msg(this->name, GF_LOG_INFO, 0, GD_MSG_SVC_STOP_SUCCESS,
+           "Stopping %s daemon running in pid: "
+           "%d",
+           proc->name, pid);
+
+    ret = kill(pid, sig);
+    if (ret) {
+        switch (errno) {
+            case ESRCH: /* no such process: treated as already stopped */
+                gf_msg_debug(this->name, 0,
+                             "%s is already "
+                             "stopped",
+                             proc->name);
+                ret = 0;
+                goto out;
+            default: /* logged only; 'ret' keeps kill()'s failure value */
+                gf_msg(this->name, GF_LOG_ERROR, errno, GD_MSG_SVC_KILL_FAIL,
+                       "Unable to kill %s "
+                       "service, reason:%s",
+                       proc->name, strerror(errno));
+        }
+    } else {
+        (void)glusterd_unlink_file(proc->pidfile);
+    }
+    if (flags != PROC_STOP_FORCE)
+        goto out;
+    /* NOTE(review): pidfile may have been unlinked above - confirm re-check. */
+    synclock_unlock(&conf->big_lock); /* big_lock is dropped across the sleep */
+    synctask_sleep(1);
+    synclock_lock(&conf->big_lock);
+    if (gf_is_service_running(proc->pidfile, &pid)) {
+        ret = kill(pid, SIGKILL);
+        if (ret) {
+            gf_msg(this->name, GF_LOG_ERROR, errno, GD_MSG_PID_KILL_FAIL,
+                   "Unable to kill pid:%d, "
+                   "reason:%s",
+                   pid, strerror(errno));
+            goto out;
+        }
+        ret = glusterd_unlink_file(proc->pidfile);
+        if (ret)
+            goto out;
+    }
+
+    ret = 0;
+out:
+    return ret;
+}
+
+int
+glusterd_proc_get_pid(glusterd_proc_t *proc)
+{
+    int pid = -1; /* returned unchanged if the helper does not set it */
+    (void)gf_is_service_running(proc->pidfile, &pid);
+    return pid;
+}
+
+int
+glusterd_proc_is_running(glusterd_proc_t *proc)
+{
+    int pid = -1;
+    /* Only the helper's result is returned; the pid it fills is discarded. */
+    return gf_is_service_running(proc->pidfile, &pid);
+}
diff --git a/xlators/mgmt/glusterd/src/glusterd-proc-mgmt.h b/xlators/mgmt/glusterd/src/glusterd-proc-mgmt.h
new file mode 100644
index 00000000000..e8e9ffc5082
--- /dev/null
+++ b/xlators/mgmt/glusterd/src/glusterd-proc-mgmt.h
@@ -0,0 +1,44 @@
+/*
+   Copyright (c) 2014 Red Hat, Inc. <http://www.redhat.com>
+   This file is part of GlusterFS.
+
+   This file is licensed to you under your choice of the GNU Lesser
+   General Public License, version 3 or any later version (LGPLv3 or
+   later), or the GNU General Public License, version 2 (GPLv2), in all
+   cases as published by the Free Software Foundation.
+*/
+
+#ifndef _GLUSTERD_PROC_MGMT_H_
+#define _GLUSTERD_PROC_MGMT_H_
+
+typedef struct glusterd_proc_ glusterd_proc_t;
+
+enum proc_flags {
+    PROC_NONE = 0,
+    PROC_START,
+    PROC_START_NO_WAIT,
+    PROC_STOP,
+    PROC_STOP_FORCE
+};
+
+struct glusterd_proc_ {
+    char name[NAME_MAX];
+    char pidfile[PATH_MAX];
+    char logdir[PATH_MAX];
+    char logfile[PATH_MAX];
+    char volfile[PATH_MAX];
+    char volfileserver[PATH_MAX];
+    char volfileid[256];
+};
+
+int
+glusterd_proc_init(glusterd_proc_t *proc, char *name, char *pidfile,
+                   char *logdir, char *logfile, char *volfile, char *volfileid,
+                   char *volfileserver);
+
+int
+glusterd_proc_stop(glusterd_proc_t *proc, int sig, int flags);
+
+int
+glusterd_proc_is_running(glusterd_proc_t *proc);
+#endif
diff --git a/xlators/mgmt/glusterd/src/glusterd-quota.c b/xlators/mgmt/glusterd/src/glusterd-quota.c
new file mode 100644
index 00000000000..8370c174ce3
--- /dev/null
+++ b/xlators/mgmt/glusterd/src/glusterd-quota.c
@@ -0,0 +1,2259 @@
+/*
+   Copyright (c) 2011-2012 Red Hat, Inc. <http://www.redhat.com>
+   This file is part of GlusterFS.
+
+   This file is licensed to you under your choice of the GNU Lesser
+   General Public License, version 3 or any later version (LGPLv3 or
+   later), or the GNU General Public License, version 2 (GPLv2), in all
+   cases as published by the Free Software Foundation.
+*/
+#include <glusterfs/common-utils.h>
+#include "cli1-xdr.h"
+#include "xdr-generic.h"
+#include "glusterd.h"
+#include "glusterd-op-sm.h"
+#include "glusterd-store.h"
+#include "glusterd-utils.h"
+#include "glusterd-quotad-svc.h"
+#include "glusterd-volgen.h"
+#include "glusterd-messages.h"
+#include <glusterfs/run.h>
+#include <glusterfs/syscall.h>
+#include <glusterfs/byte-order.h>
+#include <glusterfs/compat-errno.h>
+#include <glusterfs/quota-common-utils.h>
+#include "glusterd-quota.h"
+
+#include <sys/wait.h>
+#include <dlfcn.h>
+
+#ifndef _PATH_SETFATTR
+#ifdef GF_LINUX_HOST_OS
+#define _PATH_SETFATTR "setfattr"
+#endif
+#ifdef __NetBSD__
+#define _PATH_SETFATTR "/usr/pkg/bin/setfattr"
+#endif
+#endif
+
+/* A negative client pid marks the crawler mount as a special client */
+#define QUOTA_CRAWL_PID "-100"
+
+#define GLUSTERFS_GET_QUOTA_LIMIT_MOUNT_PIDFILE(pidfile, volname)              \
+    {                                                                          \
+        snprintf(pidfile, PATH_MAX - 1,                                        \
+                 DEFAULT_VAR_RUN_DIRECTORY "/%s_quota_limit.pid", volname);    \
+    }
+
+#define GLUSTERFS_GET_QUOTA_LIST_MOUNT_PIDFILE(pidfile, volname)               \
+    {                                                                          \
+        snprintf(pidfile, PATH_MAX - 1,                                        \
+                 DEFAULT_VAR_RUN_DIRECTORY "/%s_quota_list.pid", volname);     \
+    }
+
+#define GLUSTERD_GET_QUOTA_CRAWL_PIDDIR(piddir, volinfo, type)                 \
+    do {                                                                       \
+        char _volpath[PATH_MAX] = {                                            \
+            0,                                                                 \
+        };                                                                     \
+        int32_t _crawl_pid_len;                                                \
+        GLUSTERD_GET_VOLUME_DIR(_volpath, volinfo, priv);                      \
+        if (type == GF_QUOTA_OPTION_TYPE_ENABLE ||                             \
+            type == GF_QUOTA_OPTION_TYPE_ENABLE_OBJECTS)                       \
+            _crawl_pid_len = snprintf(piddir, PATH_MAX, "%s/run/quota/enable", \
+                                      _volpath);                               \
+        else                                                                   \
+            _crawl_pid_len = snprintf(piddir, PATH_MAX,                        \
+                                      "%s/run/quota/disable", _volpath);       \
+        if ((_crawl_pid_len < 0) || (_crawl_pid_len >= PATH_MAX)) {            \
+            piddir[0] = 0;                                                     \
+        }                                                                      \
+    } while (0)
+
+#define GLUSTERD_GET_TMP_PATH(abspath, path)                                   \
+    do {                                                                       \
+        snprintf(abspath, sizeof(abspath) - 1,                                 \
+                 DEFAULT_VAR_RUN_DIRECTORY "/tmp%s", path);                    \
+    } while (0)
+
+#define GLUSTERD_GET_QUOTA_LIST_MOUNT_PATH(abspath, volname, path)             \
+    do {                                                                       \
+        snprintf(abspath, sizeof(abspath) - 1,                                 \
+                 DEFAULT_VAR_RUN_DIRECTORY "/%s_quota_list%s", volname, path); \
+    } while (0)
+
+const char *gd_quota_op_list[GF_QUOTA_OPTION_TYPE_MAX + 1] = {
+    [GF_QUOTA_OPTION_TYPE_NONE] = "none",
+    [GF_QUOTA_OPTION_TYPE_ENABLE] = "enable",
+    [GF_QUOTA_OPTION_TYPE_DISABLE] = "disable",
+    [GF_QUOTA_OPTION_TYPE_LIMIT_USAGE] = "limit-usage",
+    [GF_QUOTA_OPTION_TYPE_REMOVE] = "remove",
+    [GF_QUOTA_OPTION_TYPE_LIST] = "list",
+    [GF_QUOTA_OPTION_TYPE_VERSION] = "version",
+    [GF_QUOTA_OPTION_TYPE_ALERT_TIME] = "alert-time",
+    [GF_QUOTA_OPTION_TYPE_SOFT_TIMEOUT] = "soft-timeout",
+    [GF_QUOTA_OPTION_TYPE_HARD_TIMEOUT] = "hard-timeout",
+    [GF_QUOTA_OPTION_TYPE_DEFAULT_SOFT_LIMIT] = "default-soft-limit",
+    [GF_QUOTA_OPTION_TYPE_LIMIT_OBJECTS] = "limit-objects",
+    [GF_QUOTA_OPTION_TYPE_LIST_OBJECTS] = "list-objects",
+    [GF_QUOTA_OPTION_TYPE_REMOVE_OBJECTS] = "remove-objects",
+    [GF_QUOTA_OPTION_TYPE_ENABLE_OBJECTS] = "enable-objects",
+    [GF_QUOTA_OPTION_TYPE_UPGRADE] = "upgrade",
+    [GF_QUOTA_OPTION_TYPE_MAX] = NULL};
+
+/*
+ * Tell whether quota command 'type' may run at the cluster's op-version; on
+ * refusal *op_errstr (if non-NULL, conf known) gets an allocated message.
+ */
+gf_boolean_t
+glusterd_is_quota_supported(int32_t type, char **op_errstr)
+{
+    xlator_t *this = THIS;
+    glusterd_conf_t *conf = NULL;
+    gf_boolean_t supported = _gf_false;
+    const char *op_name = "unknown";
+
+    GF_VALIDATE_OR_GOTO("glusterd", this, out);
+    conf = this->private;
+    GF_VALIDATE_OR_GOTO(this->name, conf, out);
+
+    /* 'type' may come straight from a request: check before indexing. */
+    if ((type < 0) || (type > GF_QUOTA_OPTION_TYPE_MAX))
+        goto out;
+    if (gd_quota_op_list[type])
+        op_name = gd_quota_op_list[type];
+
+    if ((conf->op_version == GD_OP_VERSION_MIN) &&
+        (type > GF_QUOTA_OPTION_TYPE_VERSION))
+        goto out;
+    if ((conf->op_version < GD_OP_VERSION_3_7_0) &&
+        (type > GF_QUOTA_OPTION_TYPE_VERSION_OBJECTS))
+        goto out;
+
+    /* quota.conf's format changed in 3.7: refuse ops that rewrite it. */
+    if ((conf->op_version < GD_OP_VERSION_3_7_0) &&
+        (type == GF_QUOTA_OPTION_TYPE_ENABLE ||
+         type == GF_QUOTA_OPTION_TYPE_LIMIT_USAGE ||
+         type == GF_QUOTA_OPTION_TYPE_REMOVE))
+        goto out;
+
+    /* Quota xattr versioning (3.7.6) and the enable/disable rework (3.7.12)
+     * make enable/disable unsafe in a mixed-version cluster mid-upgrade. */
+    if ((conf->op_version < GD_OP_VERSION_3_7_12) &&
+        (type == GF_QUOTA_OPTION_TYPE_ENABLE ||
+         type == GF_QUOTA_OPTION_TYPE_ENABLE_OBJECTS ||
+         type == GF_QUOTA_OPTION_TYPE_DISABLE))
+        goto out;
+
+    supported = _gf_true;
+
+out:
+    if (!supported && op_errstr != NULL && conf)
+        gf_asprintf(op_errstr,
+                    "Volume quota failed. The cluster is "
+                    "operating at version %d. Quota command"
+                    " %s is unavailable in this version.",
+                    conf->op_version, op_name);
+
+    return supported;
+}
+
+/*
+ * CLI "volume quota" handler body: decode the request dict, validate the
+ * volume name and command type, then start the GD_OP_QUOTA op. On any
+ * failure an error response is sent to the CLI instead.
+ */
+int
+__glusterd_handle_quota(rpcsvc_request_t *req)
+{
+    int32_t ret = -1;
+    gf_cli_req cli_req = {{0}};
+    dict_t *dict = NULL;
+    glusterd_op_t cli_op = GD_OP_QUOTA;
+    char *volname = NULL;
+    int32_t type = 0;
+    char msg[2048] = {0};
+    const char *op_name = "unknown";
+    gf_boolean_t type_in_range = _gf_false;
+    xlator_t *this = NULL;
+    glusterd_conf_t *conf = NULL;
+
+    GF_ASSERT(req);
+    this = THIS;
+    GF_ASSERT(this);
+    conf = this->private;
+    GF_ASSERT(conf);
+
+    ret = xdr_to_generic(req->msg[0], &cli_req, (xdrproc_t)xdr_gf_cli_req);
+    if (ret < 0) {
+        // failed to decode msg;
+        req->rpc_err = GARBAGE_ARGS;
+        goto out;
+    }
+
+    if (cli_req.dict.dict_len) {
+        /* Unserialize the dictionary */
+        dict = dict_new();
+        ret = dict_unserialize(cli_req.dict.dict_val, cli_req.dict.dict_len,
+                               &dict);
+        if (ret < 0) {
+            gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_DICT_UNSERIALIZE_FAIL,
+                   "failed to unserialize req-buffer to dictionary");
+            snprintf(msg, sizeof(msg), "Unable to decode the command");
+            goto out;
+        }
+        dict->extra_stdfree = cli_req.dict.dict_val;
+    }
+
+    ret = dict_get_strn(dict, "volname", SLEN("volname"), &volname);
+    if (ret) {
+        snprintf(msg, sizeof(msg), "Unable to get volume name");
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_DICT_GET_FAILED,
+               "Unable to get volume name, while handling quota command");
+        goto out;
+    }
+
+    ret = dict_get_int32n(dict, "type", SLEN("type"), &type);
+    if (ret) {
+        snprintf(msg, sizeof(msg), "Unable to get type of command");
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_DICT_GET_FAILED,
+               "Unable to get type of cmd, while handling quota command");
+        goto out;
+    }
+
+    /* 'type' is client-supplied: bounds-check before indexing the table. */
+    type_in_range = (type >= 0) && (type <= GF_QUOTA_OPTION_TYPE_MAX);
+    if (type_in_range && gd_quota_op_list[type])
+        op_name = gd_quota_op_list[type];
+    if (!type_in_range || !glusterd_is_quota_supported(type, NULL)) {
+        snprintf(msg, sizeof(msg),
+                 "Volume quota failed. The cluster "
+                 "is operating at version %d. Quota command"
+                 " %s is unavailable in this version.",
+                 conf->op_version, op_name);
+        ret = -1;
+        goto out;
+    }
+
+    ret = glusterd_op_begin_synctask(req, GD_OP_QUOTA, dict);
+
+out:
+    if (ret) {
+        if (msg[0] == '\0')
+            snprintf(msg, sizeof(msg), "Operation failed");
+        ret = glusterd_op_send_cli_response(cli_op, ret, 0, req, dict, msg);
+    }
+
+    return ret;
+}
+
+int
+glusterd_handle_quota(rpcsvc_request_t *req)
+{
+    return glusterd_big_locked_handler(req, __glusterd_handle_quota);
+}
+
+/*
+ * Return 0 when the volume's VKEY_FEATURES_QUOTA option reads as true,
+ * -1 when it is false or cannot be read (the latter is also logged).
+ */
+int32_t
+glusterd_check_if_quota_trans_enabled(glusterd_volinfo_t *volinfo)
+{
+    int quota_on = _gf_false;
+
+    quota_on = glusterd_volinfo_get_boolean(volinfo, VKEY_FEATURES_QUOTA);
+    if (quota_on == -1) {
+        gf_msg("glusterd", GF_LOG_ERROR, 0, GD_MSG_QUOTA_GET_STAT_FAIL,
+               "failed to get the quota status");
+        return -1;
+    }
+
+    /* Any other value that is not _gf_false counts as enabled. */
+    if (quota_on == _gf_false)
+        return -1;
+
+    return 0;
+}
+
+/*
+ * Mount the volume through a per-brick client volfile on a fresh temporary
+ * directory and start a detached "find" crawl over it: enable-type commands
+ * stat every entry, disable sets VIRTUAL_QUOTA_XATTR_CLEANUP_KEY on each.
+ * The crawler pid (runner.chpid) is written to <pid_dir>/<brickpath>.pid.
+ *
+ * Returns 0 when the intermediate child exits successfully, -1 otherwise.
+ * priv->big_lock is released while the mount command runs.
+ */
+int32_t
+_glusterd_quota_initiate_fs_crawl(glusterd_conf_t *priv,
+                                  glusterd_volinfo_t *volinfo,
+                                  glusterd_brickinfo_t *brick, int type,
+                                  char *pid_dir)
+{
+    pid_t pid;
+    int32_t ret = -1;
+    int status = 0;
+    char mountdir[PATH_MAX] = {0};
+    char logfile[PATH_MAX] = {0};
+    char brickpath[PATH_MAX] = {0};
+    char vol_id[PATH_MAX] = {0};
+    char pidfile[PATH_MAX] = {0};
+    runner_t runner = {0};
+    char *volfileserver = NULL;
+    FILE *pidfp = NULL;
+    int32_t len = 0;
+
+    GF_VALIDATE_OR_GOTO("glusterd", THIS, out);
+
+    GLUSTERD_GET_TMP_PATH(mountdir, "/");
+    ret = sys_mkdir(mountdir, 0755);
+    if (ret && errno != EEXIST) {
+        gf_msg(THIS->name, GF_LOG_WARNING, errno, GD_MSG_MOUNT_REQ_FAIL,
+               "failed to create temporary "
+               "directory %s",
+               mountdir);
+        ret = -1;
+        goto out;
+    }
+
+    strcat(mountdir, "mntXXXXXX");
+    if (mkdtemp(mountdir) == NULL) {
+        gf_msg(THIS->name, GF_LOG_WARNING, errno, GD_MSG_MOUNT_REQ_FAIL,
+               "failed to create a temporary "
+               "mount directory: %s",
+               mountdir);
+        ret = -1;
+        goto out;
+    }
+
+    GLUSTERD_REMOVE_SLASH_FROM_PATH(brick->path, brickpath);
+    len = snprintf(logfile, sizeof(logfile),
+                   DEFAULT_QUOTA_CRAWL_LOG_DIRECTORY "/%s.log", brickpath);
+    if ((len < 0) || (len >= sizeof(logfile))) {
+        ret = -1;
+        goto out;
+    }
+
+    if (dict_get_strn(THIS->options, "transport.socket.bind-address",
+                      SLEN("transport.socket.bind-address"),
+                      &volfileserver) != 0)
+        volfileserver = "localhost";
+
+    len = snprintf(vol_id, sizeof(vol_id), "client_per_brick/%s.%s.%s.%s.vol",
+                   volinfo->volname, "client", brick->hostname, brickpath);
+    if ((len < 0) || (len >= sizeof(vol_id))) {
+        ret = -1;
+        goto out;
+    }
+
+    runinit(&runner);
+
+    if (type == GF_QUOTA_OPTION_TYPE_ENABLE ||
+        type == GF_QUOTA_OPTION_TYPE_ENABLE_OBJECTS)
+        runner_add_args(&runner, SBIN_DIR "/glusterfs", "-s", volfileserver,
+                        "--volfile-id", vol_id, "--use-readdirp=yes",
+                        "--client-pid", QUOTA_CRAWL_PID, "-l", logfile,
+                        mountdir, NULL);
+    else
+        runner_add_args(&runner, SBIN_DIR "/glusterfs", "-s", volfileserver,
+                        "--volfile-id", vol_id, "--use-readdirp=no",
+                        "--client-pid", QUOTA_CRAWL_PID, "-l", logfile,
+                        mountdir, NULL);
+
+    synclock_unlock(&priv->big_lock);
+    ret = runner_run_reuse(&runner);
+    synclock_lock(&priv->big_lock);
+    if (ret == -1) {
+        runner_log(&runner, "glusterd", GF_LOG_DEBUG, "command failed");
+        runner_end(&runner);
+        goto out;
+    }
+    runner_end(&runner);
+
+    if ((pid = fork()) < 0) {
+        gf_msg(THIS->name, GF_LOG_WARNING, 0, GD_MSG_FORK_FAIL,
+               "fork from parent failed");
+        gf_umount_lazy("glusterd", mountdir, 1);
+        ret = -1;
+        goto out;
+    } else if (pid == 0) {  // first child
+        /* fork one more to not hold back main process on
+         * blocking call below
+         */
+        pid = fork();
+        if (pid < 0) {
+            gf_umount_lazy("glusterd", mountdir, 1);
+            _exit(EXIT_FAILURE);
+        } else if (pid > 0) {
+            _exit(EXIT_SUCCESS);
+        }
+
+        ret = chdir(mountdir);
+        if (ret == -1) {
+            gf_msg(THIS->name, GF_LOG_WARNING, errno, GD_MSG_DIR_OP_FAILED,
+                   "chdir %s failed", mountdir);
+            gf_umount_lazy("glusterd", mountdir, 1);
+            /* _exit(), like every other exit path of the forked children. */
+            _exit(EXIT_FAILURE);
+        }
+        runinit(&runner);
+
+        if (type == GF_QUOTA_OPTION_TYPE_ENABLE ||
+            type == GF_QUOTA_OPTION_TYPE_ENABLE_OBJECTS)
+            runner_add_args(&runner, "/usr/bin/find", ".", "-exec",
+                            "/usr/bin/stat", "{}", "\\", ";", NULL);
+
+        else if (type == GF_QUOTA_OPTION_TYPE_DISABLE) {
+#if defined(GF_DARWIN_HOST_OS)
+            runner_add_args(
+                &runner, "/usr/bin/find", ".", "-exec", "/usr/bin/xattr", "-w",
+                VIRTUAL_QUOTA_XATTR_CLEANUP_KEY, "1", "{}", "\\", ";", NULL);
+#elif defined(__FreeBSD__)
+            runner_add_args(&runner, "/usr/bin/find", ".", "-exec",
+                            "/usr/sbin/setextattr", EXTATTR_NAMESPACE_USER,
+                            VIRTUAL_QUOTA_XATTR_CLEANUP_KEY, "1", "{}", "\\",
+                            ";", NULL);
+#else
+            runner_add_args(&runner, "find", ".", "-exec", _PATH_SETFATTR, "-n",
+                            VIRTUAL_QUOTA_XATTR_CLEANUP_KEY, "-v", "1", "{}",
+                            "\\", ";", NULL);
+#endif
+        }
+
+        if (runner_start(&runner) == -1) {
+            gf_umount_lazy("glusterd", mountdir, 1);
+            _exit(EXIT_FAILURE);
+        }
+
+        len = snprintf(pidfile, sizeof(pidfile), "%s/%s.pid", pid_dir,
+                       brickpath);
+        if ((len >= 0) && (len < sizeof(pidfile))) {
+            pidfp = fopen(pidfile, "w");
+            if (pidfp != NULL) {
+                fprintf(pidfp, "%d\n", runner.chpid);
+                fflush(pidfp);
+                fclose(pidfp);
+            }
+        }
+
+#ifndef GF_LINUX_HOST_OS
+        runner_end(&runner); /* blocks in waitpid */
+#endif
+        gf_umount_lazy("glusterd", mountdir, 1);
+
+        _exit(EXIT_SUCCESS);
+    }
+    ret = (waitpid(pid, &status, 0) == pid && WIFEXITED(status) &&
+           WEXITSTATUS(status) == EXIT_SUCCESS)
+              ? 0
+              : -1;
+
+out:
+    return ret;
+}
+
+/* Stop every quota crawl process recorded for @volinfo and crawl @type.
+ *
+ * Each relevant entry of the crawl pid directory is taken to be a pidfile:
+ * it is handed to glusterd_service_stop_nolock() with SIGKILL and is then
+ * unlinked. The function returns silently if the directory cannot be opened.
+ */
+void
+glusterd_stop_all_quota_crawl_service(glusterd_conf_t *priv,
+                                      glusterd_volinfo_t *volinfo, int type)
+{
+    DIR *crawl_dir = NULL;
+    struct dirent *dent = NULL;
+    struct dirent scratch[2] = {{0}};
+    char pid_dir[PATH_MAX] = {0};
+    char pidfile[PATH_MAX] = {0};
+    int32_t path_len = 0;
+
+    GLUSTERD_GET_QUOTA_CRAWL_PIDDIR(pid_dir, volinfo, type);
+
+    crawl_dir = sys_opendir(pid_dir);
+    if (!crawl_dir)
+        return;
+
+    while ((dent = sys_readdir(crawl_dir, scratch)) != NULL) {
+        if (gf_irrelevant_entry(dent))
+            continue;
+
+        path_len = snprintf(pidfile, sizeof(pidfile), "%s/%s", pid_dir,
+                            dent->d_name);
+        /* Skip entries whose path failed to format or was truncated. */
+        if ((path_len < 0) || (path_len >= sizeof(pidfile)))
+            continue;
+
+        glusterd_service_stop_nolock("quota_crawl", pidfile, SIGKILL,
+                                     _gf_true);
+        sys_unlink(pidfile);
+    }
+    sys_closedir(crawl_dir);
+}
+
+/* Start quota crawlers for the bricks of @volinfo whose uuid matches MY_UUID.
+ *
+ * In order: regenerates the per-brick client volfiles, creates the crawl log
+ * directory and the crawl pid directory for @type, stops the crawlers that
+ * must not keep running, and finally calls
+ * _glusterd_quota_initiate_fs_crawl() for every matching brick.
+ *
+ * Returns 0 on success, otherwise the non-zero result of the failing step
+ * (-1 if THIS is not set).
+ */
+int32_t
+glusterd_quota_initiate_fs_crawl(glusterd_conf_t *priv,
+                                 glusterd_volinfo_t *volinfo, int type)
+{
+    int32_t ret = -1;
+    glusterd_brickinfo_t *brickinfo = NULL;
+    char pid_dir[PATH_MAX] = {0};
+
+    GF_VALIDATE_OR_GOTO("glusterd", THIS, out);
+
+    ret = glusterd_generate_client_per_brick_volfile(volinfo);
+    if (ret != 0) {
+        gf_msg(THIS->name, GF_LOG_ERROR, 0, GD_MSG_GLUSTERD_OP_FAILED,
+               "failed to generate client volume file");
+        goto out;
+    }
+
+    ret = mkdir_p(DEFAULT_QUOTA_CRAWL_LOG_DIRECTORY, 0755, _gf_true);
+    if (ret != 0) {
+        gf_msg(THIS->name, GF_LOG_ERROR, errno, GD_MSG_GLUSTERD_OP_FAILED,
+               "failed to create dir %s: %s", DEFAULT_QUOTA_CRAWL_LOG_DIRECTORY,
+               strerror(errno));
+        goto out;
+    }
+
+    GLUSTERD_GET_QUOTA_CRAWL_PIDDIR(pid_dir, volinfo, type);
+    ret = mkdir_p(pid_dir, 0755, _gf_true);
+    if (ret != 0) {
+        gf_msg(THIS->name, GF_LOG_ERROR, errno, GD_MSG_GLUSTERD_OP_FAILED,
+               "failed to create dir %s: %s", pid_dir, strerror(errno));
+        goto out;
+    }
+
+    /* An enable crawl that is already running is always stopped so that a
+     * fresh crawl can replace it. A running disable crawl is left alone on
+     * enable so it can finish cleaning up the older xattrs; on disable it is
+     * stopped as well and a fresh cleanup crawl is started below.
+     */
+    glusterd_stop_all_quota_crawl_service(priv, volinfo,
+                                          GF_QUOTA_OPTION_TYPE_ENABLE);
+    if (GF_QUOTA_OPTION_TYPE_DISABLE == type)
+        glusterd_stop_all_quota_crawl_service(priv, volinfo,
+                                              GF_QUOTA_OPTION_TYPE_DISABLE);
+
+    cds_list_for_each_entry(brickinfo, &volinfo->bricks, brick_list)
+    {
+        /* A non-zero compare means the brick uuid differs from MY_UUID. */
+        if (gf_uuid_compare(brickinfo->uuid, MY_UUID) != 0)
+            continue;
+
+        ret = _glusterd_quota_initiate_fs_crawl(priv, volinfo, brickinfo, type,
+                                                pid_dir);
+        if (ret != 0)
+            goto out;
+    }
+
+    ret = 0;
+out:
+    return ret;
+}
+
+/* Store the volume's default soft limit in @rsp_dict as "default-soft-limit".
+ *
+ * The stored value is a heap copy of whatever glusterd_volinfo_get() yields
+ * for "features.default-soft-limit", or a copy of "80%" when it yields
+ * nothing.
+ *
+ * Returns -1 when @rsp_dict is NULL, the dict-set result when that fails,
+ * and 0 otherwise.
+ */
+int32_t
+glusterd_quota_get_default_soft_limit(glusterd_volinfo_t *volinfo,
+                                      dict_t *rsp_dict)
+{
+    int32_t ret = 0;
+    xlator_t *this = NULL;
+    glusterd_conf_t *conf = NULL;
+    char *default_limit = NULL;
+    char *val = NULL;
+
+    if (!rsp_dict)
+        return -1;
+
+    this = THIS;
+    GF_ASSERT(this);
+    conf = this->private;
+    GF_ASSERT(conf);
+
+    /* The return code is not consulted; only @default_limit is. */
+    ret = glusterd_volinfo_get(volinfo, "features.default-soft-limit",
+                               &default_limit);
+    val = gf_strdup(default_limit ? default_limit : "80%");
+
+    ret = dict_set_dynstr_sizen(rsp_dict, "default-soft-limit", val);
+    if (ret) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_DICT_SET_FAILED,
+               "Failed to set default soft-limit into dict");
+    }
+
+    return ret;
+}
+
+/* Enable inode (object-count) quota on @volinfo.
+ *
+ * Refuses, with a message stored in @op_errstr, when the volume is not
+ * started, when quota itself is not enabled, or when inode quota is already
+ * enabled. Otherwise sets VKEY_FEATURES_INODE_QUOTA to "on" in
+ * volinfo->dict, sets *@crawl to _gf_true and rewrites the quota config
+ * store with GF_QUOTA_OPTION_TYPE_ENABLE_OBJECTS.
+ *
+ * Returns 0 on success and non-zero on failure.
+ */
+int32_t
+glusterd_inode_quota_enable(glusterd_volinfo_t *volinfo, char **op_errstr,
+                            gf_boolean_t *crawl)
+{
+    int32_t ret = -1;
+    xlator_t *this = NULL;
+
+    this = THIS;
+    GF_ASSERT(this);
+
+    GF_VALIDATE_OR_GOTO(this->name, volinfo, out);
+    GF_VALIDATE_OR_GOTO(this->name, crawl, out);
+    GF_VALIDATE_OR_GOTO(this->name, op_errstr, out);
+
+    if (glusterd_is_volume_started(volinfo) == 0) {
+        *op_errstr = gf_strdup(
+            "Volume is stopped, start volume "
+            "to enable inode quota.");
+        ret = -1;
+        goto out;
+    }
+
+    ret = glusterd_check_if_quota_trans_enabled(volinfo);
+    if (ret != 0) {
+        *op_errstr = gf_strdup(
+            "Quota is disabled. Enabling quota "
+            "will enable inode quota");
+        ret = -1;
+        goto out;
+    }
+
+    if (glusterd_is_volume_inode_quota_enabled(volinfo)) {
+        *op_errstr = gf_strdup("Inode Quota is already enabled");
+        ret = -1;
+        goto out;
+    }
+
+    ret = dict_set_dynstr_with_alloc(volinfo->dict, VKEY_FEATURES_INODE_QUOTA,
+                                     "on");
+    if (ret) {
+        gf_msg(this->name, GF_LOG_ERROR, errno, GD_MSG_DICT_SET_FAILED,
+               "dict set failed");
+        goto out;
+    }
+
+    *crawl = _gf_true;
+
+    /* Best effort, as before: a failure to rewrite the store does not fail
+     * the enable, but it is now logged instead of being silently dropped.
+     */
+    ret = glusterd_store_quota_config(
+        volinfo, NULL, NULL, GF_QUOTA_OPTION_TYPE_ENABLE_OBJECTS, op_errstr);
+    if (ret)
+        gf_msg(this->name, GF_LOG_WARNING, 0, GD_MSG_GLUSTERD_OP_FAILED,
+               "failed to store quota config");
+
+    ret = 0;
+out:
+    /* volinfo is NULL here when its validation above failed, so it must not
+     * be dereferenced unconditionally.
+     */
+    if (ret && op_errstr && !*op_errstr)
+        gf_asprintf(op_errstr,
+                    "Enabling inode quota on volume %s has "
+                    "been unsuccessful",
+                    volinfo ? volinfo->volname : "(null)");
+    return ret;
+}
+
+/* Enable quota on @volinfo.
+ *
+ * Refuses, with a message stored in @op_errstr, when the volume is not
+ * started or when quota is already enabled. Otherwise sets
+ * VKEY_FEATURES_QUOTA, VKEY_FEATURES_INODE_QUOTA and
+ * "features.quota-deem-statfs" to "on" in volinfo->dict, sets *@crawl to
+ * _gf_true and rewrites the quota config store with
+ * GF_QUOTA_OPTION_TYPE_ENABLE.
+ *
+ * Returns 0 on success and non-zero on failure.
+ */
+int32_t
+glusterd_quota_enable(glusterd_volinfo_t *volinfo, char **op_errstr,
+                      gf_boolean_t *crawl)
+{
+    int32_t ret = -1;
+    xlator_t *this = NULL;
+
+    this = THIS;
+    GF_ASSERT(this);
+
+    GF_VALIDATE_OR_GOTO(this->name, volinfo, out);
+    GF_VALIDATE_OR_GOTO(this->name, crawl, out);
+    GF_VALIDATE_OR_GOTO(this->name, op_errstr, out);
+
+    if (glusterd_is_volume_started(volinfo) == 0) {
+        *op_errstr = gf_strdup(
+            "Volume is stopped, start volume "
+            "to enable quota.");
+        ret = -1;
+        goto out;
+    }
+
+    /* A zero result means the quota translator is already enabled. */
+    ret = glusterd_check_if_quota_trans_enabled(volinfo);
+    if (ret == 0) {
+        *op_errstr = gf_strdup("Quota is already enabled");
+        ret = -1;
+        goto out;
+    }
+
+    ret = dict_set_dynstr_with_alloc(volinfo->dict, VKEY_FEATURES_QUOTA, "on");
+    if (ret) {
+        gf_msg(this->name, GF_LOG_ERROR, errno, GD_MSG_DICT_SET_FAILED,
+               "dict set failed");
+        goto out;
+    }
+
+    ret = dict_set_dynstr_with_alloc(volinfo->dict, VKEY_FEATURES_INODE_QUOTA,
+                                     "on");
+    if (ret) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_DICT_SET_FAILED,
+               "dict set failed");
+        goto out;
+    }
+
+    ret = dict_set_dynstr_with_alloc(volinfo->dict,
+                                     "features.quota-deem-statfs", "on");
+    if (ret) {
+        gf_msg(this->name, GF_LOG_ERROR, errno, GD_MSG_DICT_SET_FAILED,
+               "setting quota-deem-statfs "
+               "in volinfo failed");
+        goto out;
+    }
+
+    *crawl = _gf_true;
+
+    /* Best effort, as before: a failure to rewrite the store does not fail
+     * the enable, but it is now logged instead of being silently dropped.
+     */
+    ret = glusterd_store_quota_config(volinfo, NULL, NULL,
+                                      GF_QUOTA_OPTION_TYPE_ENABLE, op_errstr);
+    if (ret)
+        gf_msg(this->name, GF_LOG_WARNING, 0, GD_MSG_GLUSTERD_OP_FAILED,
+               "failed to store quota config");
+
+    ret = 0;
+out:
+    /* volinfo is NULL here when its validation above failed, so it must not
+     * be dereferenced unconditionally.
+     */
+    if (ret && op_errstr && !*op_errstr)
+        gf_asprintf(op_errstr,
+                    "Enabling quota on volume %s has been "
+                    "unsuccessful",
+                    volinfo ? volinfo->volname : "(null)");
+    return ret;
+}
+
+/* Disable quota on @volinfo.
+ *
+ * Refuses, with a message stored in @op_errstr, when
+ * glusterd_check_if_quota_trans_enabled() returns -1. Otherwise sets
+ * VKEY_FEATURES_QUOTA and VKEY_FEATURES_INODE_QUOTA to "off" in
+ * volinfo->dict, deletes the quota tunables listed in quota_options[] from
+ * that dict, sets *@crawl to _gf_true and cleans up the quota store.
+ *
+ * Returns 0 on success and non-zero on failure.
+ */
+int32_t
+glusterd_quota_disable(glusterd_volinfo_t *volinfo, char **op_errstr,
+                       gf_boolean_t *crawl)
+{
+    int32_t ret = -1;
+    int i = 0;
+    char *value = NULL;
+    xlator_t *this = NULL;
+    glusterd_conf_t *conf = NULL;
+    char *quota_options[] = {"features.soft-timeout",
+                             "features.hard-timeout",
+                             "features.alert-time",
+                             "features.default-soft-limit",
+                             "features.quota-deem-statfs",
+                             "features.quota-timeout",
+                             NULL};
+
+    this = THIS;
+    GF_ASSERT(this);
+    conf = this->private;
+    GF_ASSERT(conf);
+
+    GF_VALIDATE_OR_GOTO(this->name, volinfo, out);
+    /* @crawl is written below; validate it like the enable paths do. */
+    GF_VALIDATE_OR_GOTO(this->name, crawl, out);
+    GF_VALIDATE_OR_GOTO(this->name, op_errstr, out);
+
+    ret = glusterd_check_if_quota_trans_enabled(volinfo);
+    if (ret == -1) {
+        *op_errstr = gf_strdup("Quota is already disabled");
+        goto out;
+    }
+
+    ret = dict_set_dynstr_with_alloc(volinfo->dict, VKEY_FEATURES_QUOTA, "off");
+    if (ret) {
+        gf_msg(this->name, GF_LOG_ERROR, errno, GD_MSG_DICT_SET_FAILED,
+               "dict set failed");
+        goto out;
+    }
+
+    ret = dict_set_dynstr_with_alloc(volinfo->dict, VKEY_FEATURES_INODE_QUOTA,
+                                     "off");
+    if (ret) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_DICT_SET_FAILED,
+               "dict set failed");
+        goto out;
+    }
+
+    /* Drop each quota tunable that can be looked up; a failed lookup is
+     * only logged at INFO level and does not fail the disable.
+     */
+    for (i = 0; quota_options[i]; i++) {
+        ret = glusterd_volinfo_get(volinfo, quota_options[i], &value);
+        if (ret) {
+            gf_msg(this->name, GF_LOG_INFO, 0, GD_MSG_VOLINFO_GET_FAIL,
+                   "failed to get option"
+                   " %s",
+                   quota_options[i]);
+        } else {
+            dict_del(volinfo->dict, quota_options[i]);
+        }
+    }
+
+    *crawl = _gf_true;
+
+    (void)glusterd_clean_up_quota_store(volinfo);
+
+    ret = 0;
+out:
+    /* volinfo is NULL here when its validation above failed, so it must not
+     * be dereferenced unconditionally.
+     */
+    if (ret && op_errstr && !*op_errstr)
+        gf_asprintf(op_errstr,
+                    "Disabling quota on volume %s has been "
+                    "unsuccessful",
+                    volinfo ? volinfo->volname : "(null)");
+    return ret;
+}
+
+/* Write the quota limit xattr @key on @path of volume @volname.
+ *
+ * The target is the path built by GLUSTERD_GET_QUOTA_LIMIT_MOUNT_PATH. The
+ * xattr payload is a quota_limits_t whose two fields are both passed through
+ * hton64() before being written:
+ *   - hl: @hard_limit parsed by gf_string2bytesize_int64();
+ *   - sl: @soft_limit parsed by gf_string2percent() when it is given;
+ *         otherwise the soft limit read back from the existing xattr, or -1
+ *         when that lookup fails with ENOATTR/ENODATA.
+ *
+ * Returns 0 on success and non-zero on failure. Failures of the directory
+ * check, the xattr read and the xattr write also store a message in
+ * @op_errstr.
+ */
+static int
+glusterd_set_quota_limit(char *volname, char *path, char *hard_limit,
+                         char *soft_limit, char *key, char **op_errstr)
+{
+    int ret = -1;
+    xlator_t *this = NULL;
+    char abspath[PATH_MAX] = {0};
+    glusterd_conf_t *priv = NULL;
+    quota_limits_t stored = {0};
+    quota_limits_t updated = {0};
+    double soft_pct = 0;
+    int64_t hard_bytes = 0;
+
+    this = THIS;
+    GF_ASSERT(this);
+    priv = this->private;
+    GF_ASSERT(priv);
+
+    GLUSTERD_GET_QUOTA_LIMIT_MOUNT_PATH(abspath, volname, path);
+    ret = gf_lstat_dir(abspath, NULL);
+    if (ret) {
+        gf_asprintf(op_errstr,
+                    "Failed to find the directory %s. "
+                    "Reason : %s",
+                    abspath, strerror(errno));
+        goto out;
+    }
+
+    if (soft_limit) {
+        ret = gf_string2percent(soft_limit, &soft_pct);
+        if (ret)
+            goto out;
+        updated.sl = soft_pct;
+    } else {
+        /* No soft limit supplied: carry over the one already on the path. */
+        ret = sys_lgetxattr(abspath, key, (void *)&stored, sizeof(stored));
+        if (ret >= 0) {
+            stored.hl = ntoh64(stored.hl);
+            stored.sl = ntoh64(stored.sl);
+        } else {
+            switch (errno) {
+#if defined(ENOATTR) && (ENOATTR != ENODATA)
+                case ENODATA: /* FALLTHROUGH */
+#endif
+                case ENOATTR:
+                    /* No limit xattr yet: use -1 as the soft limit. */
+                    stored.sl = -1;
+                    break;
+                default:
+                    gf_asprintf(op_errstr,
+                                "Failed to get the "
+                                "xattr %s from %s. Reason : %s",
+                                key, abspath, strerror(errno));
+                    goto out;
+            }
+        }
+        updated.sl = stored.sl;
+    }
+
+    updated.sl = hton64(updated.sl);
+
+    ret = gf_string2bytesize_int64(hard_limit, &hard_bytes);
+    if (ret)
+        goto out;
+
+    updated.hl = hton64(hard_bytes);
+
+    ret = sys_lsetxattr(abspath, key, (char *)(void *)&updated,
+                        sizeof(updated), 0);
+    if (ret == -1) {
+        gf_asprintf(op_errstr,
+                    "setxattr of %s failed on %s."
+                    " Reason : %s",
+                    key, abspath, strerror(errno));
+        goto out;
+    }
+    ret = 0;
+
+out:
+    return ret;
+}
+
+/* Increment @volinfo->quota_conf_version by one. Always returns 0. */
+static int
+glusterd_update_quota_conf_version(glusterd_volinfo_t *volinfo)
+{
+    volinfo->quota_conf_version += 1;
+    return 0;
+}
+
+/* glusterd_find_gfid_match_3_6() searches @buf, which holds @bytes_read
+ * bytes of back-to-back 16-byte gfids, for @gfid. It exists for backward
+ * compatibility with the record layout that has no type byte.
+ *
+ * When the match is found, _gf_true is returned and:
+ *  i. for GF_QUOTA_OPTION_TYPE_REMOVE, the matching record is deleted from
+ *     @buf in memory by shifting the bytes that follow it down, and
+ *     @write_byte_count is set to @bytes_read minus the sixteen bytes that
+ *     were removed;
+ * ii. for every other opcode, @buf is left untouched and @write_byte_count
+ *     is set to @bytes_read.
+ *
+ * When the match is not found in the current buffer, _gf_false is returned
+ * with @write_byte_count set to @bytes_read, which means the caller must
+ * write the entire buffer to the tmp file and continue the search.
+ *
+ * Only whole records are examined: a trailing fragment shorter than one
+ * record is never read as a gfid, so a malformed length cannot push the
+ * scan past the end of @buf.
+ */
+static gf_boolean_t
+glusterd_find_gfid_match_3_6(uuid_t gfid, unsigned char *buf, size_t bytes_read,
+                             int opcode, size_t *write_byte_count)
+{
+    const size_t rec_sz = 16;
+    size_t off = 0;
+    unsigned char rec[17] = {0};
+
+    /* Covers both "not found" and "found, nothing removed". */
+    *write_byte_count = bytes_read;
+
+    for (off = 0; off + rec_sz <= bytes_read; off += rec_sz) {
+        /* Compare through a local copy of the record, not a pointer into
+         * the middle of @buf.
+         */
+        memcpy((void *)rec, (void *)&buf[off], rec_sz);
+        if (gf_uuid_compare(gfid, rec) != 0)
+            continue;
+
+        if (opcode == GF_QUOTA_OPTION_TYPE_REMOVE) {
+            memmove((void *)&buf[off], (void *)&buf[off + rec_sz],
+                    bytes_read - (off + rec_sz));
+            *write_byte_count = bytes_read - rec_sz;
+        }
+        return _gf_true;
+    }
+
+    return _gf_false;
+}
+
+/* Search @buf (@bytes_read bytes of quota.conf records) for the record that
+ * matches both @gfid and @gfid_type.
+ *
+ * When conf->op_version is below GD_OP_VERSION_3_7_0 the search is delegated
+ * to glusterd_find_gfid_match_3_6(). Otherwise each record is 17 bytes: a
+ * 16-byte gfid followed by a one-byte type.
+ *
+ * On a match _gf_true is returned. For GF_QUOTA_OPTION_TYPE_REMOVE and
+ * GF_QUOTA_OPTION_TYPE_REMOVE_OBJECTS the record is deleted from @buf in
+ * place and @write_byte_count is set to @bytes_read - 17; for other opcodes
+ * @write_byte_count is set to @bytes_read.
+ *
+ * Without a match, or when THIS or its private data is missing, _gf_false
+ * is returned with @write_byte_count set to @bytes_read.
+ *
+ * Only whole records are examined, so a malformed length cannot push the
+ * scan past the end of @buf.
+ */
+static gf_boolean_t
+glusterd_find_gfid_match(uuid_t gfid, char gfid_type, unsigned char *buf,
+                         size_t bytes_read, int opcode,
+                         size_t *write_byte_count)
+{
+    const size_t rec_sz = 17;
+    size_t off = 0;
+    unsigned char rec[17] = {0};
+    char type = 0;
+    xlator_t *this = NULL;
+    glusterd_conf_t *conf = NULL;
+
+    /* Set the "write everything" default first so the caller never sees a
+     * stale count, even when one of the validations below bails out.
+     */
+    *write_byte_count = bytes_read;
+
+    this = THIS;
+    GF_VALIDATE_OR_GOTO("glusterd", this, out);
+
+    conf = this->private;
+    GF_VALIDATE_OR_GOTO(this->name, conf, out);
+
+    if (conf->op_version < GD_OP_VERSION_3_7_0)
+        return glusterd_find_gfid_match_3_6(gfid, buf, bytes_read, opcode,
+                                            write_byte_count);
+
+    for (off = 0; off + rec_sz <= bytes_read; off += rec_sz) {
+        /* Compare through a local copy of the gfid, not a pointer into the
+         * middle of @buf.
+         */
+        memcpy((void *)rec, (void *)&buf[off], 16);
+        type = buf[off + 16];
+
+        if (gf_uuid_compare(gfid, rec) != 0 || type != gfid_type)
+            continue;
+
+        if (opcode == GF_QUOTA_OPTION_TYPE_REMOVE ||
+            opcode == GF_QUOTA_OPTION_TYPE_REMOVE_OBJECTS) {
+            memmove((void *)&buf[off], (void *)&buf[off + rec_sz],
+                    bytes_read - (off + rec_sz));
+            *write_byte_count = bytes_read - rec_sz;
+        }
+        return _gf_true;
+    }
+
+out:
+    return _gf_false;
+}
+
+/* The function glusterd_copy_to_tmp_file() reads the "remaining" bytes from
+ * the source fd and writes them to destination fd, at the rate of 1000 entries
+ * a time (qconf_line_sz is the size of an entry).
+ *
+ * Every chunk read must be a whole number of entries, otherwise quota.conf
+ * is reported as corrupted. Returns 0 once end-of-file is reached and -1 on
+ * allocation failure, corruption, read error or write error.
+ */
+
+static int
+glusterd_copy_to_tmp_file(int src_fd, int dst_fd, int qconf_line_sz)
+{
+    int ret = -1;
+    ssize_t bytes_read = 0;
+    ssize_t written = 0;
+    ssize_t off = 0;
+    xlator_t *this = NULL;
+    unsigned char *buf = NULL;
+    int buf_sz = qconf_line_sz * 1000;
+
+    this = THIS;
+    GF_ASSERT(this);
+    GF_ASSERT(buf_sz > 0);
+
+    buf = GF_CALLOC(buf_sz, 1, gf_common_mt_char);
+    if (!buf)
+        goto out;
+
+    while ((bytes_read = sys_read(src_fd, buf, buf_sz)) > 0) {
+        if (bytes_read % qconf_line_sz != 0) {
+            gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_QUOTA_CONF_CORRUPT,
+                   "quota.conf "
+                   "corrupted");
+            goto out;
+        }
+
+        /* A write may accept fewer bytes than requested; keep writing until
+         * the whole chunk has reached @dst_fd.
+         */
+        for (off = 0; off < bytes_read; off += written) {
+            written = sys_write(dst_fd, (void *)(buf + off), bytes_read - off);
+            if (written <= 0) {
+                gf_msg(this->name, GF_LOG_ERROR, errno,
+                       GD_MSG_QUOTA_CONF_WRITE_FAIL,
+                       "write into quota.conf failed.");
+                goto out;
+            }
+        }
+    }
+
+    /* A negative count is a read error, not end-of-file. */
+    if (bytes_read < 0) {
+        gf_msg(this->name, GF_LOG_ERROR, errno, GD_MSG_FILE_OP_FAILED,
+               "read from quota.conf failed.");
+        goto out;
+    }
+
+    ret = 0;
+
+out:
+    if (buf)
+        GF_FREE(buf);
+    return ret;
+}
+
+/* Rewrite the quota.conf of @volinfo in the newer record format.
+ *
+ * Every gfid read from the existing file (after its header) is written to a
+ * temp file behind a fresh header, always tagged GF_QUOTA_CONF_TYPE_USAGE.
+ * On success the temp file is renamed over quota.conf, the checksum is
+ * recomputed and the quota version and checksum are stored. On failure the
+ * temp file is unlinked.
+ *
+ * Returns 0 on success and non-zero on failure.
+ */
+int
+glusterd_store_quota_conf_upgrade(glusterd_volinfo_t *volinfo)
+{
+    int ret = -1;
+    int fd = -1;
+    int conf_fd = -1;
+    unsigned char gfid[17] = {
+        0,
+    };
+    xlator_t *this = NULL;
+    char type = 0;
+
+    this = THIS;
+    GF_ASSERT(this);
+
+    fd = gf_store_mkstemp(volinfo->quota_conf_shandle);
+    if (fd < 0) {
+        ret = -1;
+        goto out;
+    }
+
+    conf_fd = open(volinfo->quota_conf_shandle->path, O_RDONLY);
+    if (conf_fd == -1) {
+        ret = -1;
+        goto out;
+    }
+
+    ret = quota_conf_skip_header(conf_fd);
+    if (ret)
+        goto out;
+
+    ret = glusterd_quota_conf_write_header(fd);
+    if (ret)
+        goto out;
+
+    while (1) {
+        /* A zero result ends the copy; a negative one aborts it. */
+        ret = quota_conf_read_gfid(conf_fd, gfid, &type, 1.1);
+        if (ret == 0)
+            break;
+        else if (ret < 0)
+            goto out;
+
+        ret = glusterd_quota_conf_write_gfid(fd, gfid,
+                                             GF_QUOTA_CONF_TYPE_USAGE);
+        if (ret < 0)
+            goto out;
+    }
+
+out:
+    if (conf_fd != -1)
+        sys_close(conf_fd);
+
+    /* 0 is a valid descriptor, so the temp file exists whenever fd >= 0. */
+    if (ret && (fd >= 0)) {
+        gf_store_unlink_tmppath(volinfo->quota_conf_shandle);
+    } else if (!ret) {
+        ret = gf_store_rename_tmppath(volinfo->quota_conf_shandle);
+        if (ret) {
+            gf_msg(this->name, GF_LOG_ERROR, errno, GD_MSG_FILE_OP_FAILED,
+                   "Failed to rename "
+                   "quota conf file");
+            return ret;
+        }
+
+        ret = glusterd_compute_cksum(volinfo, _gf_true);
+        if (ret) {
+            gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_CKSUM_COMPUTE_FAIL,
+                   "Failed to "
+                   "compute cksum for quota conf file");
+            return ret;
+        }
+
+        ret = glusterd_store_save_quota_version_and_cksum(volinfo);
+        if (ret)
+            gf_msg(this->name, GF_LOG_ERROR, 0,
+                   GD_MSG_QUOTA_CKSUM_VER_STORE_FAIL,
+                   "Failed to "
+                   "store quota version and cksum");
+    }
+
+    return ret;
+}
+
+/* Apply @opcode to the quota.conf store of @volinfo.
+ *
+ * quota.conf is not edited in place: the new content is written to a temp
+ * file obtained from gf_store_mkstemp() and, on success, renamed over the
+ * store. On failure the temp file is unlinked.
+ *
+ *  - GF_QUOTA_OPTION_TYPE_UPGRADE: only converts an old-format file (version
+ *    below 1.2 while conf->op_version >= GD_OP_VERSION_3_7_0) through
+ *    glusterd_store_quota_conf_upgrade(); nothing else is done.
+ *  - GF_QUOTA_OPTION_TYPE_ENABLE / _ENABLE_OBJECTS: writes a file that holds
+ *    just the header.
+ *  - GF_QUOTA_OPTION_TYPE_LIMIT_USAGE / _LIMIT_OBJECTS: copies the existing
+ *    records and appends a record for @gfid_str if none matched.
+ *  - GF_QUOTA_OPTION_TYPE_REMOVE / _REMOVE_OBJECTS: copies the existing
+ *    records minus the one matching @gfid_str; fails, with a message in
+ *    @op_errstr, if the file holds no records or the gfid is not found.
+ *
+ * When the content was modified, the checksum is recomputed and stored
+ * together with the quota version. @path is only used in error messages.
+ *
+ * Returns 0 on success and non-zero on failure.
+ */
+int
+glusterd_store_quota_config(glusterd_volinfo_t *volinfo, char *path,
+                            char *gfid_str, int opcode, char **op_errstr)
+{
+    int ret = -1;
+    int fd = -1;
+    int conf_fd = -1;
+    ssize_t bytes_read = 0;
+    size_t bytes_to_write = 0;
+    uuid_t gfid = {
+        0,
+    };
+    xlator_t *this = NULL;
+    gf_boolean_t found = _gf_false;
+    gf_boolean_t modified = _gf_false;
+    gf_boolean_t is_file_empty = _gf_false;
+    gf_boolean_t is_first_read = _gf_true;
+    glusterd_conf_t *conf = NULL;
+    float version = 0.0f;
+    char type = 0;
+    int quota_conf_line_sz = 16;
+    unsigned char *buf = 0;
+    int buf_sz = 0;
+
+    this = THIS;
+    GF_ASSERT(this);
+    conf = this->private;
+    GF_ASSERT(conf);
+
+    glusterd_store_create_quota_conf_sh_on_absence(volinfo);
+
+    conf_fd = open(volinfo->quota_conf_shandle->path, O_RDONLY);
+    if (conf_fd == -1) {
+        ret = -1;
+        goto out;
+    }
+
+    ret = quota_conf_read_version(conf_fd, &version);
+    if (ret)
+        goto out;
+
+    if (version < 1.2f && conf->op_version >= GD_OP_VERSION_3_7_0) {
+        /* Upgrade quota.conf file to newer format */
+        sys_close(conf_fd);
+        conf_fd = -1;
+
+        ret = glusterd_store_quota_conf_upgrade(volinfo);
+        if (ret)
+            goto out;
+
+        if (GF_QUOTA_OPTION_TYPE_UPGRADE == opcode) {
+            /* Nothing more to be done here */
+            goto out;
+        }
+
+        /* Reopen the upgraded file and position past its header. */
+        conf_fd = open(volinfo->quota_conf_shandle->path, O_RDONLY);
+        if (conf_fd == -1) {
+            ret = -1;
+            goto out;
+        }
+
+        ret = quota_conf_skip_header(conf_fd);
+        if (ret)
+            goto out;
+    } else if (GF_QUOTA_OPTION_TYPE_UPGRADE == opcode) {
+        /* No change to be done in quota_conf */
+        goto out;
+    }
+
+    /* If op-version is at least 3.7, quota.conf is in the upgraded format
+     * and each record is stored as 17 bytes: a 16-byte uuid and a 1-byte
+     * type (usage/object).
+     */
+    if (conf->op_version >= GD_OP_VERSION_3_7_0)
+        quota_conf_line_sz++;
+
+    buf_sz = quota_conf_line_sz * 1000;
+
+    buf = GF_CALLOC(buf_sz, 1, gf_common_mt_char);
+    if (!buf) {
+        ret = -1;
+        goto out;
+    }
+
+    fd = gf_store_mkstemp(volinfo->quota_conf_shandle);
+    if (fd < 0) {
+        ret = -1;
+        goto out;
+    }
+
+    ret = glusterd_quota_conf_write_header(fd);
+    if (ret)
+        goto out;
+
+    /* On enable, just create a quota.conf that holds only the header */
+    if (GF_QUOTA_OPTION_TYPE_ENABLE == opcode ||
+        GF_QUOTA_OPTION_TYPE_ENABLE_OBJECTS == opcode) {
+        modified = _gf_true;
+        goto out;
+    }
+
+    /* Check if gfid_str is given for opts other than ENABLE */
+    if (!gfid_str) {
+        ret = -1;
+        goto out;
+    }
+    gf_uuid_parse(gfid_str, gfid);
+
+    if (opcode > GF_QUOTA_OPTION_TYPE_VERSION_OBJECTS)
+        type = GF_QUOTA_CONF_TYPE_OBJECTS;
+    else
+        type = GF_QUOTA_CONF_TYPE_USAGE;
+
+    for (;;) {
+        bytes_read = sys_read(conf_fd, buf, buf_sz);
+        if (bytes_read < 0) {
+            /* A failed read must not be mistaken for end-of-file,
+             * otherwise a truncated copy would replace quota.conf.
+             */
+            gf_msg(this->name, GF_LOG_ERROR, errno, GD_MSG_FILE_OP_FAILED,
+                   "read from quota.conf failed.");
+            ret = -1;
+            goto out;
+        }
+        if (bytes_read == 0) {
+            /* The flag @is_first_read is TRUE when the loop is
+             * entered, and is set to false once a read has returned
+             * data. The flag is used to detect if quota.conf is an
+             * empty file, but for the header. This is done to log an
+             * appropriate error message when 'quota remove' is
+             * attempted when there are no limits set on the given
+             * volume.
+             */
+            if (is_first_read)
+                is_file_empty = _gf_true;
+            break;
+        }
+        if ((bytes_read % quota_conf_line_sz) != 0) {
+            gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_QUOTA_CONF_CORRUPT,
+                   "quota.conf "
+                   "corrupted");
+            ret = -1;
+            goto out;
+        }
+        found = glusterd_find_gfid_match(gfid, type, buf, bytes_read, opcode,
+                                         &bytes_to_write);
+
+        ret = sys_write(fd, (void *)buf, bytes_to_write);
+        if (ret == -1) {
+            gf_msg(this->name, GF_LOG_ERROR, errno,
+                   GD_MSG_QUOTA_CONF_WRITE_FAIL,
+                   "write into quota.conf failed.");
+            goto out;
+        }
+
+        /* If the match is found in this iteration, copy the rest of
+         * quota.conf into quota.conf.tmp and break.
+         * Else continue with the search.
+         */
+        if (found) {
+            ret = glusterd_copy_to_tmp_file(conf_fd, fd, quota_conf_line_sz);
+            if (ret)
+                goto out;
+            break;
+        }
+        is_first_read = _gf_false;
+    }
+
+    switch (opcode) {
+        case GF_QUOTA_OPTION_TYPE_LIMIT_USAGE:
+            if (!found) {
+                ret = glusterd_quota_conf_write_gfid(fd, gfid,
+                                                     GF_QUOTA_CONF_TYPE_USAGE);
+                if (ret == -1) {
+                    gf_msg(this->name, GF_LOG_ERROR, errno,
+                           GD_MSG_QUOTA_CONF_WRITE_FAIL,
+                           "write into quota.conf failed. ");
+                    goto out;
+                }
+                modified = _gf_true;
+            }
+            break;
+        case GF_QUOTA_OPTION_TYPE_LIMIT_OBJECTS:
+            if (!found) {
+                ret = glusterd_quota_conf_write_gfid(
+                    fd, gfid, GF_QUOTA_CONF_TYPE_OBJECTS);
+                if (ret == -1) {
+                    gf_msg(this->name, GF_LOG_ERROR, errno,
+                           GD_MSG_QUOTA_CONF_WRITE_FAIL,
+                           "write into quota.conf failed. ");
+                    goto out;
+                }
+                modified = _gf_true;
+            }
+            break;
+
+        case GF_QUOTA_OPTION_TYPE_REMOVE:
+        case GF_QUOTA_OPTION_TYPE_REMOVE_OBJECTS:
+            if (is_file_empty) {
+                gf_asprintf(op_errstr,
+                            "Cannot remove limit on"
+                            " %s. The quota configuration file"
+                            " for volume %s is empty.",
+                            path, volinfo->volname);
+                ret = -1;
+                goto out;
+            } else {
+                if (!found) {
+                    gf_asprintf(op_errstr,
+                                "Error. gfid %s"
+                                " for path %s not found in"
+                                " store",
+                                gfid_str, path);
+                    ret = -1;
+                    goto out;
+                } else {
+                    modified = _gf_true;
+                }
+            }
+            break;
+
+        default:
+            ret = 0;
+            break;
+    }
+
+    if (modified)
+        glusterd_update_quota_conf_version(volinfo);
+
+    ret = 0;
+out:
+    if (conf_fd != -1) {
+        sys_close(conf_fd);
+    }
+
+    if (buf)
+        GF_FREE(buf);
+
+    /* 0 is a valid descriptor, so the temp file exists whenever fd >= 0. */
+    if (ret && (fd >= 0)) {
+        gf_store_unlink_tmppath(volinfo->quota_conf_shandle);
+    } else if (!ret && GF_QUOTA_OPTION_TYPE_UPGRADE != opcode) {
+        ret = gf_store_rename_tmppath(volinfo->quota_conf_shandle);
+        if (ret) {
+            /* Do not let the checksum steps below overwrite this error:
+             * the store was not replaced.
+             */
+            gf_msg(this->name, GF_LOG_ERROR, errno, GD_MSG_FILE_OP_FAILED,
+                   "Failed to rename "
+                   "quota conf file");
+            return ret;
+        }
+        if (modified) {
+            ret = glusterd_compute_cksum(volinfo, _gf_true);
+            if (ret) {
+                gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_CKSUM_COMPUTE_FAIL,
+                       "Failed to "
+                       "compute cksum for quota conf file");
+                return ret;
+            }
+
+            ret = glusterd_store_save_quota_version_and_cksum(volinfo);
+            if (ret)
+                gf_msg(this->name, GF_LOG_ERROR, 0,
+                       GD_MSG_VERS_CKSUM_STORE_FAIL,
+                       "Failed to "
+                       "store quota version and cksum");
+        }
+    }
+    return ret;
+}
+
+/* Set a usage or object-count limit on a path of @volinfo.
+ *
+ * Reads "path", "hard-limit", the optional "soft-limit" and "gfid" from
+ * @dict. When is_origin_glusterd(@dict) is true, the limit xattr is written
+ * with glusterd_set_quota_limit(): QUOTA_LIMIT_KEY for
+ * GF_QUOTA_OPTION_TYPE_LIMIT_USAGE, QUOTA_LIMIT_OBJECTS_KEY for any other
+ * @opcode. The gfid is then recorded through glusterd_store_quota_config().
+ *
+ * Returns 0 on success and non-zero on failure. On failure a message is
+ * stored in @op_errstr if it is non-NULL and not already set.
+ */
+int32_t
+glusterd_quota_limit_usage(glusterd_volinfo_t *volinfo, dict_t *dict,
+                           int opcode, char **op_errstr)
+{
+    int32_t ret = -1;
+    char *path = NULL;
+    char *hard_limit = NULL;
+    char *soft_limit = NULL;
+    char *gfid_str = NULL;
+    char *limit_key = NULL;
+    xlator_t *this = NULL;
+
+    this = THIS;
+    GF_ASSERT(this);
+
+    GF_VALIDATE_OR_GOTO(this->name, dict, out);
+    GF_VALIDATE_OR_GOTO(this->name, volinfo, out);
+    GF_VALIDATE_OR_GOTO(this->name, op_errstr, out);
+
+    ret = glusterd_check_if_quota_trans_enabled(volinfo);
+    if (ret == -1) {
+        *op_errstr = gf_strdup(
+            "Quota is disabled, please enable "
+            "quota");
+        goto out;
+    }
+
+    ret = dict_get_strn(dict, "path", SLEN("path"), &path);
+    if (ret) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_DICT_GET_FAILED,
+               "Unable to fetch path");
+        goto out;
+    }
+    ret = gf_canonicalize_path(path);
+    if (ret)
+        goto out;
+
+    ret = dict_get_strn(dict, "hard-limit", SLEN("hard-limit"), &hard_limit);
+    if (ret) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_DICT_GET_FAILED,
+               "Unable to fetch hard limit");
+        goto out;
+    }
+
+    /* The soft limit is optional; it is only fetched when present. */
+    if (dict_getn(dict, "soft-limit", SLEN("soft-limit"))) {
+        ret = dict_get_strn(dict, "soft-limit", SLEN("soft-limit"),
+                            &soft_limit);
+        if (ret) {
+            gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_DICT_GET_FAILED,
+                   "Unable to fetch "
+                   "soft limit");
+            goto out;
+        }
+    }
+
+    if (is_origin_glusterd(dict)) {
+        limit_key = (opcode == GF_QUOTA_OPTION_TYPE_LIMIT_USAGE)
+                        ? QUOTA_LIMIT_KEY
+                        : QUOTA_LIMIT_OBJECTS_KEY;
+        ret = glusterd_set_quota_limit(volinfo->volname, path, hard_limit,
+                                       soft_limit, limit_key, op_errstr);
+        if (ret)
+            goto out;
+    }
+
+    ret = dict_get_strn(dict, "gfid", SLEN("gfid"), &gfid_str);
+    if (ret) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_DICT_GET_FAILED,
+               "Failed to get gfid of path "
+               "%s",
+               path);
+        goto out;
+    }
+
+    ret = glusterd_store_quota_config(volinfo, path, gfid_str, opcode,
+                                      op_errstr);
+    if (ret)
+        goto out;
+
+    ret = 0;
+out:
+
+    /* Both path and volinfo can still be NULL here (failed validation or a
+     * failed "path" lookup), so neither may be used unguarded.
+     */
+    if (ret && op_errstr && !*op_errstr)
+        gf_asprintf(op_errstr,
+                    "Failed to set hard limit on path %s "
+                    "for volume %s",
+                    path ? path : "(null)",
+                    volinfo ? volinfo->volname : "(null)");
+    return ret;
+}
+
+/* Remove a quota limit xattr from @path of volume @volname.
+ *
+ * The target is the path built by GLUSTERD_GET_QUOTA_LIMIT_MOUNT_PATH.
+ * QUOTA_LIMIT_KEY is removed for GF_QUOTA_OPTION_TYPE_REMOVE and
+ * QUOTA_LIMIT_OBJECTS_KEY for GF_QUOTA_OPTION_TYPE_REMOVE_OBJECTS; for any
+ * other @type only the directory check is performed.
+ *
+ * Returns 0 on success; on failure returns non-zero and stores a message in
+ * @op_errstr.
+ */
+static int
+glusterd_remove_quota_limit(char *volname, char *path, char **op_errstr,
+                            int type)
+{
+    int ret = -1;
+    xlator_t *this = NULL;
+    char abspath[PATH_MAX] = {0};
+    glusterd_conf_t *priv = NULL;
+    char *xattr_key = NULL;
+
+    this = THIS;
+    GF_ASSERT(this);
+    priv = this->private;
+    GF_ASSERT(priv);
+
+    GLUSTERD_GET_QUOTA_LIMIT_MOUNT_PATH(abspath, volname, path);
+    ret = gf_lstat_dir(abspath, NULL);
+    if (ret) {
+        gf_asprintf(op_errstr,
+                    "Failed to find the directory %s. "
+                    "Reason : %s",
+                    abspath, strerror(errno));
+        goto out;
+    }
+
+    /* Pick the xattr that corresponds to the kind of limit being removed. */
+    if (type == GF_QUOTA_OPTION_TYPE_REMOVE)
+        xattr_key = QUOTA_LIMIT_KEY;
+    else if (type == GF_QUOTA_OPTION_TYPE_REMOVE_OBJECTS)
+        xattr_key = QUOTA_LIMIT_OBJECTS_KEY;
+
+    if (xattr_key) {
+        ret = sys_lremovexattr(abspath, xattr_key);
+        if (ret) {
+            gf_asprintf(op_errstr,
+                        "removexattr failed on %s. "
+                        "Reason : %s",
+                        abspath, strerror(errno));
+            goto out;
+        }
+    }
+    ret = 0;
+
+out:
+    return ret;
+}
+
+/*
+ * Handle removal of a quota limit for the "path" stored in @dict.
+ *
+ * The path is canonicalized in place. When is_origin_glusterd(dict) holds,
+ * the limit xattr selected by @type is removed through
+ * glusterd_remove_quota_limit(). In all cases the "gfid" from @dict is then
+ * passed to glusterd_store_quota_config() together with @opcode.
+ *
+ * Returns 0 on success and a non-zero value on failure. *op_errstr is set
+ * here only for the "quota disabled" case; the helpers called may set it too.
+ */
+int32_t
+glusterd_quota_remove_limits(glusterd_volinfo_t *volinfo, dict_t *dict,
+                             int opcode, char **op_errstr, int type)
+{
+    xlator_t *this = THIS;
+    char *limit_path = NULL;
+    char *gfid = NULL;
+    int32_t ret = -1;
+
+    GF_ASSERT(this);
+
+    GF_VALIDATE_OR_GOTO(this->name, dict, out);
+    GF_VALIDATE_OR_GOTO(this->name, volinfo, out);
+    GF_VALIDATE_OR_GOTO(this->name, op_errstr, out);
+
+    ret = glusterd_check_if_quota_trans_enabled(volinfo);
+    if (ret == -1) {
+        *op_errstr = gf_strdup(
+            "Quota is disabled, please enable "
+            "quota");
+        return ret;
+    }
+
+    ret = dict_get_strn(dict, "path", SLEN("path"), &limit_path);
+    if (ret) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_DICT_GET_FAILED,
+               "Unable to fetch path");
+        return ret;
+    }
+
+    ret = gf_canonicalize_path(limit_path);
+    if (ret)
+        return ret;
+
+    /* The xattr itself is only removed when this glusterd is the origin. */
+    if (is_origin_glusterd(dict)) {
+        ret = glusterd_remove_quota_limit(volinfo->volname, limit_path,
+                                          op_errstr, type);
+        if (ret)
+            return ret;
+    }
+
+    ret = dict_get_strn(dict, "gfid", SLEN("gfid"), &gfid);
+    if (ret) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_DICT_GET_FAILED,
+               "Failed to get gfid of path "
+               "%s",
+               limit_path);
+        return ret;
+    }
+
+    ret = glusterd_store_quota_config(volinfo, limit_path, gfid, opcode,
+                                      op_errstr);
+
+out:
+    return ret;
+}
+
+/*
+ * Copy the "value" string from @dict into volinfo->dict under @key.
+ *
+ * Fails when glusterd_check_if_quota_trans_enabled() returns -1 for @volinfo
+ * (in which case *op_errstr is set), when "value" is missing from @dict, when
+ * the value cannot be duplicated, or when it cannot be stored.
+ *
+ * Returns 0 on success and -1 on failure.
+ */
+int
+glusterd_set_quota_option(glusterd_volinfo_t *volinfo, dict_t *dict, char *key,
+                          char **op_errstr)
+{
+    int ret = 0;
+    char *value = NULL;
+    xlator_t *this = NULL;
+    char *option = NULL;
+
+    this = THIS;
+    GF_ASSERT(this);
+
+    ret = glusterd_check_if_quota_trans_enabled(volinfo);
+    if (ret == -1) {
+        gf_asprintf(op_errstr,
+                    "Cannot set %s. Quota on volume %s is "
+                    "disabled",
+                    key, volinfo->volname);
+        return -1;
+    }
+
+    ret = dict_get_strn(dict, "value", SLEN("value"), &value);
+    if (ret) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_DICT_GET_FAILED,
+               "Option value absent.");
+        return -1;
+    }
+
+    /* volinfo->dict is handed its own copy of the value. */
+    option = gf_strdup(value);
+    if (!option) {
+        gf_msg(this->name, GF_LOG_ERROR, ENOMEM, GD_MSG_DICT_SET_FAILED,
+               "Failed to copy value of option %s", key);
+        return -1;
+    }
+
+    ret = dict_set_dynstr(volinfo->dict, key, option);
+    if (ret) {
+        /* This is a set failure, so report it with the SET message id. */
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_DICT_SET_FAILED,
+               "Failed to set option %s", key);
+        return -1;
+    }
+
+    return 0;
+}
+
+/*
+ * Stop or (re)start quotad after a quota enable/disable operation.
+ *
+ * For GF_QUOTA_OPTION_TYPE_ENABLE and GF_QUOTA_OPTION_TYPE_DISABLE the
+ * service is stopped with SIGTERM when
+ * glusterd_all_volumes_with_quota_stopped() is true, otherwise its manager is
+ * invoked with PROC_START. Every other @opcode is a no-op that returns 0.
+ */
+static int
+glusterd_quotad_op(int opcode)
+{
+    xlator_t *this = THIS;
+    glusterd_conf_t *conf = NULL;
+
+    GF_ASSERT(this);
+
+    conf = this->private;
+    GF_ASSERT(conf);
+
+    /* Only enable/disable have any effect on the quotad service. */
+    if (opcode != GF_QUOTA_OPTION_TYPE_ENABLE &&
+        opcode != GF_QUOTA_OPTION_TYPE_DISABLE)
+        return 0;
+
+    if (glusterd_all_volumes_with_quota_stopped())
+        return glusterd_svc_stop(&(conf->quotad_svc), SIGTERM);
+
+    return conf->quotad_svc.manager(&(conf->quotad_svc), NULL, PROC_START);
+}
+
+/*
+ * Execute the quota operation described by @dict ("volname" and "type").
+ *
+ * Limit, remove and list types return right after their handler has run.
+ * Enable, disable and the option-setting types continue with:
+ *   - glusterd_quotad_op(), only when priv->op_version > GD_OP_VERSION_MIN,
+ *   - storing the volinfo (quota_xattr_version is bumped for ENABLE and
+ *     rolled back if storing or volfile creation fails),
+ *   - re-creating the volfiles and notifying services,
+ *   - starting the filesystem crawl when a handler requested it and
+ *     @rsp_dict is non-NULL.
+ *
+ * For limit and remove types the quota auxiliary mount is removed on exit.
+ *
+ * Returns 0 on success and a non-zero value on failure; *op_errstr may be set
+ * here or by the handlers.
+ */
+int
+glusterd_op_quota(dict_t *dict, char **op_errstr, dict_t *rsp_dict)
+{
+    glusterd_volinfo_t *volinfo = NULL;
+    int32_t ret = -1;
+    char *volname = NULL;
+    int type = -1;
+    gf_boolean_t start_crawl = _gf_false;
+    glusterd_conf_t *priv = NULL;
+    xlator_t *this = NULL;
+
+    GF_ASSERT(dict);
+    GF_ASSERT(op_errstr);
+
+    this = THIS;
+    GF_ASSERT(this);
+    priv = this->private;
+    GF_ASSERT(priv);
+
+    ret = dict_get_strn(dict, "volname", SLEN("volname"), &volname);
+    if (ret) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_DICT_GET_FAILED,
+               "Unable to get volume name");
+        goto out;
+    }
+
+    ret = glusterd_volinfo_find(volname, &volinfo);
+    if (ret) {
+        gf_asprintf(op_errstr, FMTSTR_CHECK_VOL_EXISTS, volname);
+        goto out;
+    }
+
+    ret = dict_get_int32n(dict, "type", SLEN("type"), &type);
+    if (ret) {
+        /* Report a missing type the same way glusterd_op_stage_quota()
+         * does instead of falling through to "Invalid opcode".
+         */
+        *op_errstr = gf_strdup(
+            "Volume quota failed, internal error, "
+            "unable to get type of operation");
+        goto out;
+    }
+
+    if (!glusterd_is_quota_supported(type, op_errstr)) {
+        ret = -1;
+        goto out;
+    }
+
+    switch (type) {
+        case GF_QUOTA_OPTION_TYPE_ENABLE:
+            ret = glusterd_quota_enable(volinfo, op_errstr, &start_crawl);
+            if (ret < 0)
+                goto out;
+            break;
+
+        case GF_QUOTA_OPTION_TYPE_ENABLE_OBJECTS:
+            ret = glusterd_inode_quota_enable(volinfo, op_errstr, &start_crawl);
+            if (ret < 0)
+                goto out;
+            break;
+
+        case GF_QUOTA_OPTION_TYPE_DISABLE:
+            ret = glusterd_quota_disable(volinfo, op_errstr, &start_crawl);
+            if (ret < 0)
+                goto out;
+
+            break;
+
+        /* Limit/remove/list are complete once their handler returns. */
+        case GF_QUOTA_OPTION_TYPE_LIMIT_USAGE:
+        case GF_QUOTA_OPTION_TYPE_LIMIT_OBJECTS:
+            ret = glusterd_quota_limit_usage(volinfo, dict, type, op_errstr);
+            goto out;
+
+        case GF_QUOTA_OPTION_TYPE_REMOVE:
+        case GF_QUOTA_OPTION_TYPE_REMOVE_OBJECTS:
+            ret = glusterd_quota_remove_limits(volinfo, dict, type, op_errstr,
+                                               type);
+            goto out;
+
+        case GF_QUOTA_OPTION_TYPE_LIST:
+        case GF_QUOTA_OPTION_TYPE_LIST_OBJECTS:
+            ret = glusterd_check_if_quota_trans_enabled(volinfo);
+            if (ret == -1) {
+                *op_errstr = gf_strdup(
+                    "Cannot list limits, "
+                    "quota is disabled");
+                goto out;
+            }
+            ret = glusterd_quota_get_default_soft_limit(volinfo, rsp_dict);
+            goto out;
+
+        case GF_QUOTA_OPTION_TYPE_SOFT_TIMEOUT:
+            ret = glusterd_set_quota_option(volinfo, dict,
+                                            "features.soft-timeout", op_errstr);
+            if (ret)
+                goto out;
+            break;
+
+        case GF_QUOTA_OPTION_TYPE_HARD_TIMEOUT:
+            ret = glusterd_set_quota_option(volinfo, dict,
+                                            "features.hard-timeout", op_errstr);
+            if (ret)
+                goto out;
+            break;
+
+        case GF_QUOTA_OPTION_TYPE_ALERT_TIME:
+            ret = glusterd_set_quota_option(volinfo, dict,
+                                            "features.alert-time", op_errstr);
+            if (ret)
+                goto out;
+            break;
+
+        case GF_QUOTA_OPTION_TYPE_DEFAULT_SOFT_LIMIT:
+            ret = glusterd_set_quota_option(
+                volinfo, dict, "features.default-soft-limit", op_errstr);
+            if (ret)
+                goto out;
+            break;
+
+        default:
+            gf_asprintf(op_errstr,
+                        "Quota command failed. Invalid "
+                        "opcode");
+            ret = -1;
+            goto out;
+    }
+
+    if (priv->op_version > GD_OP_VERSION_MIN) {
+        ret = glusterd_quotad_op(type);
+        if (ret)
+            goto out;
+    }
+
+    if (GF_QUOTA_OPTION_TYPE_ENABLE == type)
+        volinfo->quota_xattr_version++;
+    ret = glusterd_store_volinfo(volinfo, GLUSTERD_VOLINFO_VER_AC_INCREMENT);
+    if (ret) {
+        if (GF_QUOTA_OPTION_TYPE_ENABLE == type)
+            volinfo->quota_xattr_version--;
+        goto out;
+    }
+
+    ret = glusterd_create_volfiles_and_notify_services(volinfo);
+    if (ret) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_VOLFILE_CREATE_FAIL,
+               "Unable to re-create "
+               "volfiles");
+        if (GF_QUOTA_OPTION_TYPE_ENABLE == type) {
+            /* rollback volinfo */
+            volinfo->quota_xattr_version--;
+            ret = glusterd_store_volinfo(volinfo,
+                                         GLUSTERD_VOLINFO_VER_AC_INCREMENT);
+            if (ret) {
+                gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_VOLINFO_SET_FAIL,
+                       "Failed to store volinfo for volume %s",
+                       volinfo->volname);
+            }
+        }
+
+        ret = -1;
+        goto out;
+    }
+
+#if BUILD_GNFS
+    if (GLUSTERD_STATUS_STARTED == volinfo->status) {
+        if (priv->op_version == GD_OP_VERSION_MIN)
+            (void)priv->nfs_svc.manager(&(priv->nfs_svc), NULL, 0);
+    }
+#endif
+
+    if (rsp_dict && start_crawl == _gf_true)
+        glusterd_quota_initiate_fs_crawl(priv, volinfo, type);
+
+    ret = 0;
+out:
+    if (type == GF_QUOTA_OPTION_TYPE_LIMIT_USAGE ||
+        type == GF_QUOTA_OPTION_TYPE_LIMIT_OBJECTS ||
+        type == GF_QUOTA_OPTION_TYPE_REMOVE ||
+        type == GF_QUOTA_OPTION_TYPE_REMOVE_OBJECTS) {
+        /* The list types are deliberately left out: during a list
+         * operation the aux mount must stay accessible until the
+         * listing is done at the cli.
+         */
+        glusterd_remove_auxiliary_mount(volinfo->volname);
+    }
+
+    return ret;
+}
+
+/*
+ * glusterd_get_gfid_from_brick() fetches the 'trusted.gfid' attribute of @path
+ * from each brick in the backend and places the same in the rsp_dict with the
+ * keys being gfid0, gfid1, gfid2 and so on. The absence of @path in the backend
+ * is not treated as error.
+ *
+ * Only bricks whose uuid compares equal to MY_UUID and whose vg field is
+ * empty are examined. The number of gfids stored is placed in rsp_dict under
+ * "count".
+ *
+ * Returns 0 on success and a non-zero value when "path" cannot be read from
+ * @dict, a brick cannot be resolved, the gfid string cannot be duplicated or
+ * rsp_dict cannot be updated. @op_errstr is not written by this function.
+ */
+static int
+glusterd_get_gfid_from_brick(dict_t *dict, glusterd_volinfo_t *volinfo,
+                             dict_t *rsp_dict, char **op_errstr)
+{
+    int ret = -1;
+    int count = 0;
+    char *path = NULL;
+    char backend_path[PATH_MAX] = {
+        0,
+    };
+    xlator_t *this = NULL;
+    glusterd_conf_t *priv = NULL;
+    glusterd_brickinfo_t *brickinfo = NULL;
+    char key[64] = {
+        0,
+    };
+    int keylen;
+    char *gfid_str = NULL;
+    uuid_t gfid;
+
+    this = THIS;
+    GF_ASSERT(this);
+    priv = this->private;
+    GF_ASSERT(priv);
+
+    ret = dict_get_strn(dict, "path", SLEN("path"), &path);
+    if (ret) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_DICT_GET_FAILED,
+               "Failed to get path");
+        goto out;
+    }
+
+    cds_list_for_each_entry(brickinfo, &volinfo->bricks, brick_list)
+    {
+        ret = glusterd_resolve_brick(brickinfo);
+        if (ret) {
+            gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_RESOLVE_BRICK_FAIL,
+                   FMTSTR_RESOLVE_BRICK, brickinfo->hostname, brickinfo->path);
+            goto out;
+        }
+
+        /* Skip bricks whose uuid differs from MY_UUID. */
+        if (gf_uuid_compare(brickinfo->uuid, MY_UUID))
+            continue;
+
+        /* Skip bricks that have a vg set. */
+        if (brickinfo->vg[0])
+            continue;
+
+        snprintf(backend_path, sizeof(backend_path), "%s%s", brickinfo->path,
+                 path);
+
+        /* A missing directory is logged at INFO level and ignored; ret is
+         * reset so that it does not leak out as a failure.
+         */
+        ret = gf_lstat_dir(backend_path, NULL);
+        if (ret) {
+            gf_msg(this->name, GF_LOG_INFO, errno, GD_MSG_DIR_OP_FAILED,
+                   "Failed to find "
+                   "directory %s.",
+                   backend_path);
+            ret = 0;
+            continue;
+        }
+        /* A missing/unreadable xattr is likewise logged and ignored. */
+        ret = sys_lgetxattr(backend_path, GFID_XATTR_KEY, gfid, 16);
+        if (ret < 0) {
+            gf_smsg(this->name, GF_LOG_INFO, errno, GD_MSG_GET_XATTR_FAIL,
+                    "Attribute=%s, Directory=%s", GFID_XATTR_KEY, backend_path,
+                    NULL);
+            ret = 0;
+            continue;
+        }
+        /* count only advances for bricks that yielded a gfid, so the keys
+         * are consecutive: gfid0, gfid1, ...
+         */
+        keylen = snprintf(key, sizeof(key), "gfid%d", count);
+
+        gfid_str = gf_strdup(uuid_utoa(gfid));
+        if (!gfid_str) {
+            ret = -1;
+            goto out;
+        }
+
+        ret = dict_set_dynstrn(rsp_dict, key, keylen, gfid_str);
+        if (ret) {
+            gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_DICT_SET_FAILED,
+                   "Failed to place "
+                   "gfid of %s in dict",
+                   backend_path);
+            /* NOTE(review): confirm dict_set_dynstrn() leaves ownership of
+             * gfid_str with the caller on failure; if it already releases
+             * the string this would be a double free.
+             */
+            GF_FREE(gfid_str);
+            goto out;
+        }
+        count++;
+    }
+
+    ret = dict_set_int32n(rsp_dict, "count", SLEN("count"), count);
+    if (ret) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_DICT_SET_FAILED,
+               "Failed to set count");
+        goto out;
+    }
+
+    ret = 0;
+out:
+    return ret;
+}
+
+/*
+ * Validate the "value" stored in @dict for the quota option selected by
+ * @type.
+ *
+ * The option table is loaded from the "features/quota" xlator, the option
+ * name is taken from gd_quota_op_list[type] and the value is checked with
+ * xlator_option_validate(), which receives @errstr.
+ *
+ * Only the soft-timeout, hard-timeout, alert-time and default-soft-limit
+ * types are accepted; any other @type yields -1. Returns the result of
+ * xlator_option_validate(), or a non-zero value if an earlier step fails.
+ */
+static int
+_glusterd_validate_quota_opts(dict_t *dict, int type, char **errstr)
+{
+    int ret = -1;
+    xlator_t *this = THIS;
+    void *quota_xl = NULL;
+    volume_opt_list_t opt_list = {
+        {0},
+    };
+    volume_option_t *opt = NULL;
+    char *key = NULL;
+    char *value = NULL;
+
+    GF_ASSERT(dict);
+    GF_ASSERT(this);
+
+    ret = xlator_volopt_dynload("features/quota", &quota_xl, &opt_list);
+    if (ret)
+        goto out;
+
+    switch (type) {
+        case GF_QUOTA_OPTION_TYPE_SOFT_TIMEOUT:
+        case GF_QUOTA_OPTION_TYPE_HARD_TIMEOUT:
+        case GF_QUOTA_OPTION_TYPE_ALERT_TIME:
+        case GF_QUOTA_OPTION_TYPE_DEFAULT_SOFT_LIMIT:
+            /* gd_quota_op_list is indexed by the quota option type. */
+            key = (char *)gd_quota_op_list[type];
+            break;
+        default:
+            ret = -1;
+            goto out;
+    }
+
+    opt = xlator_volume_option_get_list(&opt_list, key);
+    if (!opt) {
+        ret = -1;
+        gf_msg(this->name, GF_LOG_ERROR, EINVAL, GD_MSG_UNKNOWN_KEY,
+               "Unknown option: %s", key);
+        goto out;
+    }
+    ret = dict_get_strn(dict, "value", SLEN("value"), &value);
+    if (ret) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_DICT_GET_FAILED,
+               "Value not found for key %s", key);
+        goto out;
+    }
+
+    ret = xlator_option_validate(this, key, value, opt, errstr);
+
+out:
+    /* Release the handle obtained from xlator_volopt_dynload(). */
+    if (quota_xl) {
+        dlclose(quota_xl);
+        quota_xl = NULL;
+    }
+    return ret;
+}
+
+/*
+ * Start a glusterfs client process that mounts @volname on the quota
+ * auxiliary mount directory.
+ *
+ * The list mount path/pidfile are used for GF_QUOTA_OPTION_TYPE_LIST and
+ * GF_QUOTA_OPTION_TYPE_LIST_OBJECTS, the limit mount path/pidfile for every
+ * other @type. If the pidfile can be opened, the mount directory is lazily
+ * unmounted first. priv->big_lock is released while the client is started
+ * and the mount directory is stat'ed, and re-acquired afterwards.
+ *
+ * Returns 0 on success and -1 on failure.
+ */
+static int
+glusterd_create_quota_auxiliary_mount(xlator_t *this, char *volname, int type)
+{
+    int ret = -1;
+    char mountdir[PATH_MAX] = {
+        0,
+    };
+    char pidfile_path[PATH_MAX] = {
+        0,
+    };
+    char logfile[PATH_MAX] = {
+        0,
+    };
+    char qpid[16] = {
+        0,
+    };
+    char *volfileserver = NULL;
+    glusterd_conf_t *priv = NULL;
+    struct stat buf = {
+        0,
+    };
+    FILE *file = NULL;
+
+    GF_VALIDATE_OR_GOTO("glusterd", this, out);
+    priv = this->private;
+    GF_VALIDATE_OR_GOTO(this->name, priv, out);
+
+    if (type == GF_QUOTA_OPTION_TYPE_LIST ||
+        type == GF_QUOTA_OPTION_TYPE_LIST_OBJECTS) {
+        GLUSTERFS_GET_QUOTA_LIST_MOUNT_PIDFILE(pidfile_path, volname);
+        GLUSTERD_GET_QUOTA_LIST_MOUNT_PATH(mountdir, volname, "/");
+    } else {
+        GLUSTERFS_GET_QUOTA_LIMIT_MOUNT_PIDFILE(pidfile_path, volname);
+        GLUSTERD_GET_QUOTA_LIMIT_MOUNT_PATH(mountdir, volname, "/");
+    }
+
+    file = fopen(pidfile_path, "r");
+    if (file) {
+        /* Previous command did not clean up pid file.
+         * remove aux mount if it exists*/
+        gf_umount_lazy(this->name, mountdir, 1);
+        fclose(file);
+    }
+
+    /* An already existing mount directory is fine. */
+    ret = sys_mkdir(mountdir, 0755);
+    if (ret && errno != EEXIST) {
+        gf_msg(this->name, GF_LOG_ERROR, errno, GD_MSG_MOUNT_REQ_FAIL,
+               "Failed to create auxiliary "
+               "mount directory %s",
+               mountdir);
+        goto out;
+    }
+    snprintf(logfile, PATH_MAX - 1, "%s/quota-mount-%s.log", priv->logdir,
+             volname);
+    snprintf(qpid, 15, "%d", GF_CLIENT_PID_QUOTA_MOUNT);
+
+    /* Fall back to "localhost" when no bind-address option is set. */
+    if (dict_get_strn(this->options, "transport.socket.bind-address",
+                      SLEN("transport.socket.bind-address"),
+                      &volfileserver) != 0)
+        volfileserver = "localhost";
+
+    /* big_lock is dropped for the duration of the mount; see the comment
+     * below for why the stat is done before it is taken again.
+     */
+    synclock_unlock(&priv->big_lock);
+    ret = runcmd(SBIN_DIR "/glusterfs", "--volfile-server", volfileserver,
+                 "--volfile-id", volname, "-l", logfile, "-p", pidfile_path,
+                 "--client-pid", qpid, mountdir, NULL);
+    if (ret == 0) {
+        /* Block here till mount process is ready to accept FOPs.
+         * Else, if glusterd acquires biglock below before
+         * mount process is ready, then glusterd and mount process
+         * can get into a deadlock situation.
+         */
+        ret = sys_stat(mountdir, &buf);
+        if (ret < 0)
+            ret = -errno;
+    } else {
+        ret = -errno;
+    }
+
+    synclock_lock(&priv->big_lock);
+
+    /* ret holds 0 or a negated errno here; it is logged as -ret. */
+    if (ret) {
+        gf_msg(this->name, GF_LOG_ERROR, -ret, GD_MSG_MOUNT_REQ_FAIL,
+               "Failed to mount glusterfs "
+               "client. Please check the log file %s for more details",
+               logfile);
+        ret = -1;
+        goto out;
+    }
+
+    ret = 0;
+
+out:
+    return ret;
+}
+
+/*
+ * Validate and prepare the quota operation described by @dict ("volname"
+ * and "type").
+ *
+ * Checks performed, in order: the volume exists and is started, quota (and,
+ * for types above GF_QUOTA_OPTION_TYPE_VERSION_OBJECTS, inode quota) is
+ * enabled unless it is being enabled by this request, and the type is
+ * supported. For list/limit/remove types the quota auxiliary mount is
+ * created when is_origin_glusterd(dict) holds. Then:
+ *   - LIMIT_USAGE: "hard-limit" is parsed as a byte size,
+ *   - timeout/alert-time/default-soft-limit: the value is validated against
+ *     the quota xlator options,
+ *   - limit and remove types: the gfids of the path on the local bricks are
+ *     collected into @rsp_dict.
+ *
+ * Returns 0 on success and a non-zero value on failure. When *op_errstr is
+ * set on failure it is also logged.
+ */
+int
+glusterd_op_stage_quota(dict_t *dict, char **op_errstr, dict_t *rsp_dict)
+{
+    int ret = 0;
+    char *volname = NULL;
+    int type = 0;
+    xlator_t *this = NULL;
+    glusterd_conf_t *priv = NULL;
+    glusterd_volinfo_t *volinfo = NULL;
+    char *hard_limit_str = NULL;
+    int64_t hard_limit = 0;
+    gf_boolean_t get_gfid = _gf_false;
+
+    this = THIS;
+    GF_ASSERT(this);
+    priv = this->private;
+    GF_ASSERT(priv);
+
+    GF_ASSERT(dict);
+    GF_ASSERT(op_errstr);
+
+    ret = dict_get_strn(dict, "volname", SLEN("volname"), &volname);
+    if (ret) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_DICT_GET_FAILED,
+               "Unable to get volume name");
+        goto out;
+    }
+
+    ret = glusterd_volinfo_find(volname, &volinfo);
+    if (ret) {
+        gf_asprintf(op_errstr, FMTSTR_CHECK_VOL_EXISTS, volname);
+        goto out;
+    }
+
+    if (!glusterd_is_volume_started(volinfo)) {
+        *op_errstr = gf_strdup(
+            "Volume is stopped, start volume "
+            "before executing quota command.");
+        ret = -1;
+        goto out;
+    }
+
+    ret = dict_get_int32n(dict, "type", SLEN("type"), &type);
+    if (ret) {
+        *op_errstr = gf_strdup(
+            "Volume quota failed, internal error, "
+            "unable to get type of operation");
+        goto out;
+    }
+
+    /* Everything except "enable" requires quota to be enabled already. */
+    if ((!glusterd_is_volume_quota_enabled(volinfo)) &&
+        (type != GF_QUOTA_OPTION_TYPE_ENABLE)) {
+        *op_errstr = gf_strdup(
+            "Quota is disabled, please enable "
+            "quota");
+        ret = -1;
+        goto out;
+    }
+
+    /* Types above GF_QUOTA_OPTION_TYPE_VERSION_OBJECTS additionally need
+     * inode quota, unless this request is the one enabling it.
+     */
+    if (type > GF_QUOTA_OPTION_TYPE_VERSION_OBJECTS) {
+        if (!glusterd_is_volume_inode_quota_enabled(volinfo) &&
+            type != GF_QUOTA_OPTION_TYPE_ENABLE_OBJECTS) {
+            *op_errstr = gf_strdup(
+                "Inode Quota is disabled, "
+                "please enable inode quota");
+            ret = -1;
+            goto out;
+        }
+    }
+
+    if (!glusterd_is_quota_supported(type, op_errstr)) {
+        ret = -1;
+        goto out;
+    }
+
+    if ((GF_QUOTA_OPTION_TYPE_ENABLE != type) &&
+        (glusterd_check_if_quota_trans_enabled(volinfo) != 0)) {
+        ret = -1;
+        gf_asprintf(op_errstr, "Quota is not enabled on volume %s", volname);
+        goto out;
+    }
+
+    switch (type) {
+        case GF_QUOTA_OPTION_TYPE_LIST:
+        case GF_QUOTA_OPTION_TYPE_LIST_OBJECTS:
+        case GF_QUOTA_OPTION_TYPE_LIMIT_USAGE:
+        case GF_QUOTA_OPTION_TYPE_LIMIT_OBJECTS:
+        case GF_QUOTA_OPTION_TYPE_REMOVE:
+        case GF_QUOTA_OPTION_TYPE_REMOVE_OBJECTS:
+            /* Quota auxiliary mount is needed by CLI
+             * for list command and need by glusterd for
+             * setting/removing limit
+             */
+            if (is_origin_glusterd(dict)) {
+                ret = glusterd_create_quota_auxiliary_mount(this, volname,
+                                                            type);
+                if (ret) {
+                    *op_errstr = gf_strdup(
+                        "Failed to start aux "
+                        "mount");
+                    goto out;
+                }
+            }
+            break;
+    }
+
+    /* Per-type validation; get_gfid marks the types that need the gfids of
+     * the path collected from the bricks afterwards.
+     */
+    switch (type) {
+        case GF_QUOTA_OPTION_TYPE_LIMIT_USAGE:
+            ret = dict_get_strn(dict, "hard-limit", SLEN("hard-limit"),
+                                &hard_limit_str);
+            if (ret) {
+                gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_DICT_GET_FAILED,
+                       "Failed to get hard-limit from dict");
+                goto out;
+            }
+            ret = gf_string2bytesize_int64(hard_limit_str, &hard_limit);
+            if (ret) {
+                /* NOTE(review): the upper bound printed below is the
+                 * parsed hard_limit value itself; confirm that this is
+                 * the intended range. The else branch only logs and
+                 * leaves *op_errstr unset.
+                 */
+                if (errno == ERANGE || hard_limit < 0)
+                    gf_asprintf(op_errstr,
+                                "Hard-limit "
+                                "value out of range (0 - %" PRId64 "): %s",
+                                hard_limit, hard_limit_str);
+                else
+                    gf_msg(this->name, GF_LOG_ERROR, errno,
+                           GD_MSG_CONVERSION_FAILED,
+                           "Failed to convert hard-limit "
+                           "string to value");
+                goto out;
+            }
+            get_gfid = _gf_true;
+            break;
+        case GF_QUOTA_OPTION_TYPE_LIMIT_OBJECTS:
+            get_gfid = _gf_true;
+            break;
+
+        case GF_QUOTA_OPTION_TYPE_REMOVE:
+        case GF_QUOTA_OPTION_TYPE_REMOVE_OBJECTS:
+            get_gfid = _gf_true;
+            break;
+
+        case GF_QUOTA_OPTION_TYPE_SOFT_TIMEOUT:
+        case GF_QUOTA_OPTION_TYPE_HARD_TIMEOUT:
+        case GF_QUOTA_OPTION_TYPE_ALERT_TIME:
+        case GF_QUOTA_OPTION_TYPE_DEFAULT_SOFT_LIMIT:
+            ret = _glusterd_validate_quota_opts(dict, type, op_errstr);
+            if (ret)
+                goto out;
+            break;
+
+        default:
+            break;
+    }
+
+    if (get_gfid == _gf_true) {
+        ret = glusterd_get_gfid_from_brick(dict, volinfo, rsp_dict, op_errstr);
+        if (ret)
+            goto out;
+    }
+
+    ret = 0;
+
+out:
+    /* Mirror any error string into the glusterd log. */
+    if (ret && op_errstr && *op_errstr)
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_OP_STAGE_QUOTA_FAIL, "%s",
+               *op_errstr);
+    gf_msg_debug(this->name, 0, "Returning %d", ret);
+
+    return ret;
+}
diff --git a/xlators/mgmt/glusterd/src/glusterd-quota.h b/xlators/mgmt/glusterd/src/glusterd-quota.h
new file mode 100644
index 00000000000..ab2092a9c6a
--- /dev/null
+++ b/xlators/mgmt/glusterd/src/glusterd-quota.h
@@ -0,0 +1,17 @@
+/*
+   Copyright (c) 2016 Red Hat, Inc. <http://www.redhat.com>
+   This file is part of GlusterFS.
+
+   This file is licensed to you under your choice of the GNU Lesser
+   General Public License, version 3 or any later version (LGPLv3 or
+   later), or the GNU General Public License, version 2 (GPLv2), in all
+   cases as published by the Free Software Foundation.
+*/
+#ifndef _GLUSTERD_QUOTA_
+#define _GLUSTERD_QUOTA_
+
+/*
+ * Store the quota configuration for @path / @gfid_str of @volinfo.
+ * Callers in glusterd-quota.c pass the quota operation type as @opcode and
+ * treat a non-zero return as failure.
+ * NOTE(review): the definition is not part of this header; consult it for
+ * the exact handling of @opcode and @op_errstr.
+ */
+int
+glusterd_store_quota_config(glusterd_volinfo_t *volinfo, char *path,
+                            char *gfid_str, int opcode, char **op_errstr);
+
+#endif
diff --git a/xlators/mgmt/glusterd/src/glusterd-quotad-svc.c b/xlators/mgmt/glusterd/src/glusterd-quotad-svc.c
new file mode 100644
index 00000000000..f26d832a06d
--- /dev/null
+++ b/xlators/mgmt/glusterd/src/glusterd-quotad-svc.c
@@ -0,0 +1,217 @@
+/*
+   Copyright (c) 2014 Red Hat, Inc. <http://www.redhat.com>
+   This file is part of GlusterFS.
+
+   This file is licensed to you under your choice of the GNU Lesser
+   General Public License, version 3 or any later version (LGPLv3 or
+   later), or the GNU General Public License, version 2 (GPLv2), in all
+   cases as published by the Free Software Foundation.
+*/
+
+#include <glusterfs/globals.h>
+#include <glusterfs/run.h>
+#include "glusterd.h"
+#include "glusterd-utils.h"
+#include "glusterd-volgen.h"
+#include "glusterd-quotad-svc.h"
+#include "glusterd-messages.h"
+#include "glusterd-svc-helper.h"
+
+/* Service name of quotad; used to initialise the service and to build the
+ * path of its volfile. */
+char *quotad_svc_name = "quotad";
+
+/*
+ * Wire the quotad callbacks into @svc: the quotad-specific manager and start
+ * routines, and the generic glusterd_svc_stop() for stopping.
+ */
+void
+glusterd_quotadsvc_build(glusterd_svc_t *svc)
+{
+    svc->stop = glusterd_svc_stop;
+    svc->start = glusterd_quotadsvc_start;
+    svc->manager = glusterd_quotadsvc_manager;
+}
+
+/*
+ * Initialise @svc under the quotad service name.
+ * Returns whatever glusterd_svc_init() returns.
+ */
+int
+glusterd_quotadsvc_init(glusterd_svc_t *svc)
+{
+    return glusterd_svc_init(svc, quotad_svc_name);
+}
+
+/*
+ * Generate the quotad volfile from build_quotad_graph at the path derived
+ * from the quotad service name and the glusterd working directory.
+ * Returns the result of glusterd_create_global_volfile().
+ */
+static int
+glusterd_quotadsvc_create_volfile()
+{
+    glusterd_conf_t *priv = THIS->private;
+    char volfile_path[PATH_MAX] = {
+        0,
+    };
+
+    glusterd_svc_build_volfile_path(quotad_svc_name, priv->workdir,
+                                    volfile_path, sizeof(volfile_path));
+
+    return glusterd_create_global_volfile(build_quotad_graph, volfile_path,
+                                          NULL);
+}
+
+/*
+ * Manage the quotad service: initialise @svc on first use, then either stop
+ * quotad or regenerate its volfile and restart it.
+ *
+ * @data is an optional glusterd_volinfo_t; when it is given and quota is not
+ * enabled on that volume, nothing is stopped or started. @flags is passed
+ * on to svc->start().
+ *
+ * Returns 0 on success; on failure a non-zero value is returned and an
+ * EVENT_SVC_MANAGER_FAILED event is raised.
+ */
+int
+glusterd_quotadsvc_manager(glusterd_svc_t *svc, void *data, int flags)
+{
+    int ret = 0;
+    glusterd_volinfo_t *volinfo = NULL;
+
+    if (!svc->inited) {
+        ret = glusterd_quotadsvc_init(svc);
+        if (ret) {
+            gf_msg(THIS->name, GF_LOG_ERROR, 0, GD_MSG_FAILED_INIT_QUOTASVC,
+                   "Failed to init "
+                   "quotad service");
+            goto out;
+        } else {
+            svc->inited = _gf_true;
+            gf_msg_debug(THIS->name, 0,
+                         "quotad service "
+                         "initialized");
+        }
+    }
+
+    volinfo = data;
+
+    /* If all the volumes are stopped or all volumes with quota
+     * are stopped then stop the service if:
+     * - volinfo is NULL or
+     * - volinfo is present and quota is enabled on the volume
+     * Otherwise create volfile and restart service if:
+     * - volinfo is NULL or
+     * - volinfo is present and quota is enabled on the volume
+     */
+    if (glusterd_are_all_volumes_stopped() ||
+        glusterd_all_volumes_with_quota_stopped()) {
+        if (!(volinfo && !glusterd_is_volume_quota_enabled(volinfo))) {
+            ret = svc->stop(svc, SIGTERM);
+        }
+    } else {
+        if (!(volinfo && !glusterd_is_volume_quota_enabled(volinfo))) {
+            /* Restart sequence: new volfile, stop, start, reconnect. */
+            ret = glusterd_quotadsvc_create_volfile();
+            if (ret)
+                goto out;
+
+            ret = svc->stop(svc, SIGTERM);
+            if (ret)
+                goto out;
+
+            ret = svc->start(svc, flags);
+            if (ret)
+                goto out;
+
+            ret = glusterd_conn_connect(&(svc->conn));
+            if (ret)
+                goto out;
+        }
+    }
+out:
+    if (ret)
+        gf_event(EVENT_SVC_MANAGER_FAILED, "svc_name=%s", svc->name);
+
+    gf_msg_debug(THIS->name, 0, "Returning %d", ret);
+
+    return ret;
+}
+
+/*
+ * Start quotad through glusterd_svc_start().
+ *
+ * The extra arguments (svc->name and "--process-name") are placed in a
+ * temporary dict under the keys "arg0", "arg1", ... and handed to
+ * glusterd_svc_start() together with @flags. The dict is released before
+ * returning.
+ *
+ * Returns -1 if the dict cannot be created, otherwise the result of the
+ * first failing dict_set_strn() or of glusterd_svc_start().
+ */
+int
+glusterd_quotadsvc_start(glusterd_svc_t *svc, int flags)
+{
+    char *options[] = {svc->name, "--process-name", NULL};
+    char key[16] = {0};
+    dict_t *cmdline = NULL;
+    int keylen = 0;
+    int ret = -1;
+    int i = 0;
+
+    cmdline = dict_new();
+    if (!cmdline) {
+        gf_smsg(THIS->name, GF_LOG_ERROR, errno, GD_MSG_DICT_CREATE_FAIL, NULL);
+        goto out;
+    }
+
+    /* options[] is NULL-terminated; one "arg<i>" entry per element. */
+    while (options[i]) {
+        keylen = snprintf(key, sizeof(key), "arg%d", i);
+        ret = dict_set_strn(cmdline, key, keylen, options[i]);
+        if (ret)
+            goto out;
+        i++;
+    }
+
+    ret = glusterd_svc_start(svc, flags, cmdline);
+
+out:
+    if (cmdline)
+        dict_unref(cmdline);
+
+    gf_msg_debug(THIS->name, 0, "Returning %d", ret);
+
+    return ret;
+}
+
+/*
+ * Bring quotad in line with the current volume configuration.
+ *
+ * - If all volumes with quota are stopped, the service manager is invoked
+ *   directly.
+ * - If the freshly built volfile is identical to the existing one, nothing
+ *   is done.
+ * - If only options differ (topology identical), the volfile is rewritten
+ *   and glusterd_fetchspec_notify() is called.
+ * - Otherwise the service manager is invoked with PROC_START_NO_WAIT.
+ *
+ * Returns 0 on success and a non-zero value on failure.
+ */
+int
+glusterd_quotadsvc_reconfigure()
+{
+    int ret = -1;
+    xlator_t *this = NULL;
+    glusterd_conf_t *priv = NULL;
+    gf_boolean_t identical = _gf_false;
+
+    this = THIS;
+    GF_VALIDATE_OR_GOTO("glusterd", this, out);
+
+    priv = this->private;
+    GF_VALIDATE_OR_GOTO(this->name, priv, out);
+
+    /* Nothing to compare in this case; let the manager decide. */
+    if (glusterd_all_volumes_with_quota_stopped())
+        goto manager;
+
+    /*
+     * Check both OLD and NEW volfiles, if they are SAME by size
+     * and cksum i.e. "character-by-character". If YES, then
+     * NOTHING has been changed, just return.
+     */
+    ret = glusterd_svc_check_volfile_identical(priv->quotad_svc.name,
+                                               build_quotad_graph, &identical);
+    if (ret)
+        goto out;
+
+    if (identical) {
+        ret = 0;
+        goto out;
+    }
+
+    /*
+     * They are not identical. Find out if the topology is changed
+     * OR just the volume options. If just the options which got
+     * changed, then inform the xlator to reconfigure the options.
+     */
+    identical = _gf_false; /* RESET the FLAG */
+    ret = glusterd_svc_check_topology_identical(priv->quotad_svc.name,
+                                                build_quotad_graph, &identical);
+    if (ret)
+        goto out;
+
+    /* Topology is not changed, but just the options. But write the
+     * options to quotad volfile, so that quotad will be reconfigured.
+     */
+    if (identical) {
+        ret = glusterd_quotadsvc_create_volfile();
+        if (ret == 0) { /* Only if above PASSES */
+            ret = glusterd_fetchspec_notify(THIS);
+        }
+        goto out;
+    }
+manager:
+    /*
+     * quotad volfile's topology has been changed. quotad server needs
+     * to be RESTARTED to ACT on the changed volfile.
+     * This label is also reached when all volumes with quota are stopped.
+     */
+    ret = priv->quotad_svc.manager(&(priv->quotad_svc), NULL,
+                                   PROC_START_NO_WAIT);
+
+out:
+    /* this may be NULL here if the first validation failed. */
+    gf_msg_debug(this ? this->name : "Quotad", 0, "Returning %d", ret);
+    return ret;
+}
diff --git a/xlators/mgmt/glusterd/src/glusterd-quotad-svc.h b/xlators/mgmt/glusterd/src/glusterd-quotad-svc.h
new file mode 100644
index 00000000000..e8d9bbee964
--- /dev/null
+++ b/xlators/mgmt/glusterd/src/glusterd-quotad-svc.h
@@ -0,0 +1,31 @@
+/*
+   Copyright (c) 2014 Red Hat, Inc. <http://www.redhat.com>
+   This file is part of GlusterFS.
+
+   This file is licensed to you under your choice of the GNU Lesser
+   General Public License, version 3 or any later version (LGPLv3 or
+   later), or the GNU General Public License, version 2 (GPLv2), in all
+   cases as published by the Free Software Foundation.
+*/
+
+#ifndef _GLUSTERD_QUOTAD_SVC_H_
+#define _GLUSTERD_QUOTAD_SVC_H_
+
+#include "glusterd-svc-mgmt.h"
+
+/* Install the quotad manager/start/stop callbacks on @svc. */
+void
+glusterd_quotadsvc_build(glusterd_svc_t *svc);
+
+/* Initialise @svc under the quotad service name. */
+int
+glusterd_quotadsvc_init(glusterd_svc_t *svc);
+
+/* Start quotad via glusterd_svc_start() with its command-line arguments. */
+int
+glusterd_quotadsvc_start(glusterd_svc_t *svc, int flags);
+
+/* Initialise @svc if needed, then stop quotad or regenerate its volfile and
+ * restart it. @data is an optional glusterd_volinfo_t. */
+int
+glusterd_quotadsvc_manager(glusterd_svc_t *svc, void *data, int flags);
+
+/* Compare the current and newly built quotad volfiles and either do nothing,
+ * rewrite the volfile and notify, or invoke the service manager. */
+int
+glusterd_quotadsvc_reconfigure();
+
+#endif
diff --git a/xlators/mgmt/glusterd/src/glusterd-rcu.h b/xlators/mgmt/glusterd/src/glusterd-rcu.h
new file mode 100644
index 00000000000..c85f9bea8f8
--- /dev/null
+++ b/xlators/mgmt/glusterd/src/glusterd-rcu.h
@@ -0,0 +1,36 @@
+/*
+   Copyright (c) 2015 Red Hat, Inc. <http://www.redhat.com>
+   This file is part of GlusterFS.
+
+   This file is licensed to you under your choice of the GNU Lesser
+   General Public License, version 3 or any later version (LGPLv3 or
+   later), or the GNU General Public License, version 2 (GPLv2), in all
+   cases as published by the Free Software Foundation.
+*/
+
+#ifndef _GLUSTERD_RCU_H
+#define _GLUSTERD_RCU_H
+
+#include <urcu-bp.h>
+#include <urcu/rculist.h>
+#include <urcu/compiler.h>
+#include <urcu/uatomic.h>
+#include <urcu-call-rcu.h>
+
+#ifdef URCU_OLD
+#include "rculist-extra.h"
+#endif
+
+#include <glusterfs/xlator.h>
+
+/* gd_rcu_head is a composite struct, composed of struct rcu_head and a this
+ * pointer, which is used to pass the THIS pointer to call_rcu callbacks.
+ *
+ * Use this in place of struct rcu_head when embedding into another struct
+ */
+typedef struct glusterd_rcu_head_ {
+    struct rcu_head head; /* embedded RCU head */
+    xlator_t *this;       /* THIS pointer to hand to the call_rcu callback */
+} gd_rcu_head;
+
+#endif /* _GLUSTERD_RCU_H */
diff --git a/xlators/mgmt/glusterd/src/glusterd-rebalance.c b/xlators/mgmt/glusterd/src/glusterd-rebalance.c
new file mode 100644
index 00000000000..458bf168ede
--- /dev/null
+++ b/xlators/mgmt/glusterd/src/glusterd-rebalance.c
@@ -0,0 +1,1422 @@
+/*
+   Copyright (c) 2010-2012 Red Hat, Inc. <http://www.redhat.com>
+   This file is part of GlusterFS.
+
+   This file is licensed to you under your choice of the GNU Lesser
+   General Public License, version 3 or any later version (LGPLv3 or
+   later), or the GNU General Public License, version 2 (GPLv2), in all
+   cases as published by the Free Software Foundation.
+*/
+#include <inttypes.h>
+#include <sys/types.h>
+#include <unistd.h>
+#include <sys/resource.h>
+#include <sys/statvfs.h>
+
+#include <glusterfs/compat.h>
+#include "protocol-common.h"
+#include <glusterfs/xlator.h>
+#include <glusterfs/logging.h>
+#include <glusterfs/timer.h>
+#include "glusterd-mem-types.h"
+#include "glusterd.h"
+#include "glusterd-sm.h"
+#include "glusterd-op-sm.h"
+#include "glusterd-utils.h"
+#include "glusterd-mgmt.h"
+#include "glusterd-messages.h"
+#include "glusterd-store.h"
+#include <glusterfs/run.h>
+#include "glusterd-volgen.h"
+#include "glusterd-messages.h"
+
+#include <glusterfs/syscall.h>
+#include "cli1-xdr.h"
+#include "xdr-generic.h"
+
+/* Build the socket path of a volume's rebalance process into `path`.
+ *
+ * A candidate name under DEFAULT_VAR_RUN_DIRECTORY is formatted from the
+ * volume name and this node's UUID. If snprintf() fails or the name does not
+ * fit in PATH_MAX, `path` is set to the empty string; otherwise the candidate
+ * is handed to glusterd_set_socket_filepath() to produce the final value.
+ *
+ * `path` must be a char array, not a pointer: the macro passes sizeof(path)
+ * as the destination size.
+ */
+#define GLUSTERD_GET_DEFRAG_SOCK_FILE(path, volinfo)                           \
+    do {                                                                       \
+        int32_t _defrag_sockfile_len;                                          \
+        char tmppath[PATH_MAX] = {                                             \
+            0,                                                                 \
+        };                                                                     \
+        _defrag_sockfile_len = snprintf(                                       \
+            tmppath, PATH_MAX,                                                 \
+            DEFAULT_VAR_RUN_DIRECTORY "/gluster-%s-%s-%s.sock", "rebalance",   \
+            volinfo->volname, uuid_utoa(MY_UUID));                             \
+        if ((_defrag_sockfile_len < 0) ||                                      \
+            (_defrag_sockfile_len >= PATH_MAX)) {                              \
+            path[0] = 0;                                                       \
+        } else {                                                               \
+            glusterd_set_socket_filepath(tmppath, path, sizeof(path));         \
+        }                                                                      \
+    } while (0)
+
+int32_t
+glusterd_brick_op_cbk(struct rpc_req *req, struct iovec *iov, int count,
+                      void *myframe);
+/* Decide whether a rebalance may be started on the volume.
+ *
+ * Fails when a remove-brick task is still uncommitted (checked only if `op`
+ * is not GD_OP_REMOVE_BRICK itself) or when a rebalance is already running.
+ * On failure an explanation is written into op_errstr (at most len bytes).
+ *
+ * Returns 0 when the start is allowed, -1 otherwise.
+ */
+int
+glusterd_defrag_start_validate(glusterd_volinfo_t *volinfo, char *op_errstr,
+                               size_t len, glusterd_op_t op)
+{
+    xlator_t *this = THIS;
+    int ret = 0;
+
+    GF_ASSERT(this);
+
+    if ((op != GD_OP_REMOVE_BRICK) && !gd_is_remove_brick_committed(volinfo)) {
+        /* A pending remove-brick blocks every other kind of start. */
+        gf_msg_debug(this->name, 0,
+                     "A remove-brick task on "
+                     "volume %s is not yet committed",
+                     volinfo->volname);
+        snprintf(op_errstr, len,
+                 "A remove-brick task on volume %s is"
+                 " not yet committed. Either commit or stop the "
+                 "remove-brick task.",
+                 volinfo->volname);
+        ret = -1;
+    } else if (glusterd_is_defrag_on(volinfo)) {
+        gf_msg_debug(this->name, 0, "rebalance on volume %s already started",
+                     volinfo->volname);
+        snprintf(op_errstr, len, "Rebalance on %s is already started",
+                 volinfo->volname);
+        ret = -1;
+    }
+
+    gf_msg_debug(this->name, 0, "Returning %d", ret);
+    return ret;
+}
+
+/* RPC event handler for glusterd's connection to a volume's rebalance
+ * process. mydata is the glusterd_volinfo_t registered with the connection.
+ *
+ * Events are ignored when THIS, its private data, the volinfo or
+ * volinfo->rebal.defrag is missing. Returns 0 on every path.
+ *
+ * NOTE(review): because of the early return on a NULL rebal.defrag, an
+ * RPC_CLNT_DESTROY arriving after the disconnect path has cleared that
+ * pointer looks like it would skip glusterd_volinfo_unref() - confirm.
+ */
+int32_t
+__glusterd_defrag_notify(struct rpc_clnt *rpc, void *mydata,
+                         rpc_clnt_event_t event, void *data)
+{
+    glusterd_volinfo_t *volinfo = NULL;
+    glusterd_defrag_info_t *defrag = NULL;
+    int ret = 0;
+    char pidfile[PATH_MAX];
+    glusterd_conf_t *priv = NULL;
+    xlator_t *this = NULL;
+    int pid = -1;
+
+    this = THIS;
+    if (!this)
+        return 0;
+
+    priv = this->private;
+    if (!priv)
+        return 0;
+
+    volinfo = mydata;
+    if (!volinfo)
+        return 0;
+
+    defrag = volinfo->rebal.defrag;
+    if (!defrag)
+        return 0;
+
+    /* For a disconnect of a connected session, detach the defrag info from
+     * the volinfo up front; the local pointer is still used, and finally
+     * freed, in the RPC_CLNT_DISCONNECT case below.
+     */
+    if ((event == RPC_CLNT_DISCONNECT) && defrag->connected)
+        volinfo->rebal.defrag = NULL;
+
+    GLUSTERD_GET_DEFRAG_PID_FILE(pidfile, volinfo, priv);
+
+    switch (event) {
+        case RPC_CLNT_CONNECT: {
+            /* Already marked connected: nothing to do. */
+            if (defrag->connected)
+                return 0;
+
+            LOCK(&defrag->lock);
+            {
+                defrag->connected = 1;
+            }
+            UNLOCK(&defrag->lock);
+
+            gf_msg_debug(this->name, 0, "%s got RPC_CLNT_CONNECT",
+                         rpc->conn.name);
+            break;
+        }
+
+        case RPC_CLNT_DISCONNECT: {
+            /* Never connected (or already handled): nothing to tear down. */
+            if (!defrag->connected)
+                return 0;
+
+            LOCK(&defrag->lock);
+            {
+                defrag->connected = 0;
+            }
+            UNLOCK(&defrag->lock);
+
+            /* The process is gone while the status still says STARTED:
+             * record the run as failed.
+             */
+            if (!gf_is_service_running(pidfile, &pid)) {
+                if (volinfo->rebal.defrag_status == GF_DEFRAG_STATUS_STARTED) {
+                    volinfo->rebal.defrag_status = GF_DEFRAG_STATUS_FAILED;
+                }
+            }
+
+            glusterd_store_perform_node_state_store(volinfo);
+
+            rpc_clnt_disable(defrag->rpc);
+            glusterd_defrag_rpc_put(defrag);
+            if (defrag->cbk_fn)
+                defrag->cbk_fn(volinfo, volinfo->rebal.defrag_status);
+
+            /* defrag was detached from the volinfo above, so this is the
+             * last user of the allocation.
+             */
+            GF_FREE(defrag);
+            gf_msg(this->name, GF_LOG_INFO, 0, GD_MSG_REBALANCE_DISCONNECTED,
+                   "Rebalance process for volume %s has disconnected.",
+                   volinfo->volname);
+            break;
+        }
+        case RPC_CLNT_DESTROY:
+            /* Presumably drops the reference taken in
+             * glusterd_rebalance_rpc_create() before this callback was
+             * registered - verify.
+             */
+            glusterd_volinfo_unref(volinfo);
+            break;
+        default:
+            gf_msg_trace(this->name, 0, "got some other RPC event %d", event);
+            ret = 0;
+            break;
+    }
+
+    return ret;
+}
+
+/* RPC notification entry point for the rebalance connection: runs
+ * __glusterd_defrag_notify() through glusterd_big_locked_notify() and
+ * returns its result.
+ */
+int32_t
+glusterd_defrag_notify(struct rpc_clnt *rpc, void *mydata,
+                       rpc_clnt_event_t event, void *data)
+{
+    int32_t ret = glusterd_big_locked_notify(rpc, mydata, event, data,
+                                             __glusterd_defrag_notify);
+
+    return ret;
+}
+
+/* Start the rebalance process for a volume and connect to it.
+ *
+ * After glusterd_defrag_start_validate() passes, this allocates
+ * volinfo->rebal.defrag if needed, records cmd/op and the STARTED status,
+ * persists the node state, creates the defrag working directory and launches
+ * the glusterfs binary (optionally under valgrind) without waiting for it.
+ * It then sleeps for 5 seconds before calling
+ * glusterd_rebalance_rpc_create().
+ *
+ * volinfo    volume to rebalance
+ * op_errstr  buffer of `len` bytes handed to the start validation for its
+ *            error text
+ * cmd        rebalance command, forwarded as the *dht.rebalance-cmd option
+ * cbk        optional callback stored in defrag->cbk_fn
+ * op         glusterd operation recorded in volinfo->rebal.op
+ *
+ * Returns 0 on success and non-zero on failure.
+ *
+ * NOTE(review): on failure paths after the allocation, the defrag info stays
+ * attached to the volinfo and the status stays STARTED - confirm callers
+ * expect that.
+ */
+int
+glusterd_handle_defrag_start(glusterd_volinfo_t *volinfo, char *op_errstr,
+                             size_t len, int cmd, defrag_cbk_fn_t cbk,
+                             glusterd_op_t op)
+{
+    xlator_t *this = NULL;
+    int ret = -1;
+    glusterd_defrag_info_t *defrag = NULL;
+    runner_t runner = {
+        0,
+    };
+    glusterd_conf_t *priv = NULL;
+    char defrag_path[PATH_MAX];
+    char sockfile[PATH_MAX] = {
+        0,
+    };
+    char pidfile[PATH_MAX] = {
+        0,
+    };
+    char logfile[PATH_MAX] = {
+        0,
+    };
+    char volname[PATH_MAX] = {
+        0,
+    };
+    char valgrind_logfile[PATH_MAX] = {
+        0,
+    };
+    char msg[1024] = {
+        0,
+    };
+    char *volfileserver = NULL;
+    char *localtime_logging = NULL;
+
+    this = THIS;
+    GF_VALIDATE_OR_GOTO("glusterd", this, out);
+
+    priv = this->private;
+    GF_VALIDATE_OR_GOTO("glusterd", priv, out);
+
+    GF_ASSERT(volinfo);
+    GF_ASSERT(op_errstr);
+
+    ret = glusterd_defrag_start_validate(volinfo, op_errstr, len, op);
+    if (ret)
+        goto out;
+    /* Reuse an existing defrag info, otherwise allocate a zeroed one. */
+    if (!volinfo->rebal.defrag)
+        volinfo->rebal.defrag = GF_CALLOC(1, sizeof(*volinfo->rebal.defrag),
+                                          gf_gld_mt_defrag_info);
+    if (!volinfo->rebal.defrag)
+        goto out;
+
+    defrag = volinfo->rebal.defrag;
+
+    defrag->cmd = cmd;
+
+    volinfo->rebal.defrag_cmd = cmd;
+    volinfo->rebal.op = op;
+
+    /* NOTE(review): this also runs when the defrag info already existed -
+     * confirm that re-initializing the lock is intended.
+     */
+    LOCK_INIT(&defrag->lock);
+
+    volinfo->rebal.defrag_status = GF_DEFRAG_STATUS_STARTED;
+
+    glusterd_volinfo_reset_defrag_stats(volinfo);
+    glusterd_store_perform_node_state_store(volinfo);
+
+    GLUSTERD_GET_DEFRAG_DIR(defrag_path, volinfo, priv);
+    ret = mkdir_p(defrag_path, 0755, _gf_true);
+    if (ret) {
+        gf_msg(this->name, GF_LOG_ERROR, errno, GD_MSG_CREATE_DIR_FAILED,
+               "Failed to create "
+               "directory %s",
+               defrag_path);
+        goto out;
+    }
+
+    GLUSTERD_GET_DEFRAG_SOCK_FILE(sockfile, volinfo);
+    GLUSTERD_GET_DEFRAG_PID_FILE(pidfile, volinfo, priv);
+    snprintf(logfile, PATH_MAX, "%s/%s-%s.log", priv->logdir, volinfo->volname,
+             "rebalance");
+    runinit(&runner);
+
+    /* When a valgrind tool is configured, prefix the command line with it. */
+    if (this->ctx->cmd_args.vgtool != _gf_none) {
+        snprintf(valgrind_logfile, PATH_MAX, "%s/valgrind-%s-rebalance.log",
+                 priv->logdir, volinfo->volname);
+
+        if (this->ctx->cmd_args.vgtool == _gf_memcheck)
+            runner_add_args(&runner, "valgrind", "--leak-check=full",
+                            "--trace-children=yes", "--track-origins=yes",
+                            NULL);
+        else
+            runner_add_args(&runner, "valgrind", "--tool=drd", NULL);
+
+        runner_argprintf(&runner, "--log-file=%s", valgrind_logfile);
+    }
+
+    snprintf(volname, sizeof(volname), "rebalance/%s", volinfo->volname);
+
+    /* Fall back to localhost as the volfile server when no bind-address
+     * option is set.
+     */
+    if (dict_get_strn(this->options, "transport.socket.bind-address",
+                      SLEN("transport.socket.bind-address"),
+                      &volfileserver) != 0) {
+        volfileserver = "localhost";
+    }
+
+    runner_add_args(
+        &runner, SBIN_DIR "/glusterfs", "-s", volfileserver, "--volfile-id",
+        volname, "--xlator-option", "*dht.use-readdirp=yes", "--xlator-option",
+        "*dht.lookup-unhashed=yes", "--xlator-option",
+        "*dht.assert-no-child-down=yes", "--xlator-option",
+        "*dht.readdir-optimize=on", "--process-name", "rebalance", NULL);
+
+    runner_add_arg(&runner, "--xlator-option");
+    runner_argprintf(&runner, "*dht.rebalance-cmd=%d", cmd);
+    runner_add_arg(&runner, "--xlator-option");
+    runner_argprintf(&runner, "*dht.node-uuid=%s", uuid_utoa(MY_UUID));
+    runner_add_arg(&runner, "--xlator-option");
+    runner_argprintf(&runner, "*dht.commit-hash=%u",
+                     volinfo->rebal.commit_hash);
+    runner_add_arg(&runner, "--socket-file");
+    runner_argprintf(&runner, "%s", sockfile);
+    runner_add_arg(&runner, "--pid-file");
+    runner_argprintf(&runner, "%s", pidfile);
+    runner_add_arg(&runner, "-l");
+    runner_argprintf(&runner, "%s", logfile);
+    if (volinfo->memory_accounting)
+        runner_add_arg(&runner, "--mem-accounting");
+    if (dict_get_strn(priv->opts, GLUSTERD_LOCALTIME_LOGGING_KEY,
+                      SLEN(GLUSTERD_LOCALTIME_LOGGING_KEY),
+                      &localtime_logging) == 0) {
+        if (strcmp(localtime_logging, "enable") == 0)
+            runner_add_arg(&runner, "--localtime-logging");
+    }
+
+    snprintf(msg, sizeof(msg), "Starting the rebalance service for volume %s",
+             volinfo->volname);
+    runner_log(&runner, this->name, GF_LOG_DEBUG, msg);
+
+    ret = runner_run_nowait(&runner);
+    if (ret) {
+        gf_msg_debug("glusterd", 0, "rebalance command failed");
+        goto out;
+    }
+
+    /* Fixed delay before connecting; presumably gives the child time to
+     * create its socket - TODO confirm.
+     */
+    sleep(5);
+
+    ret = glusterd_rebalance_rpc_create(volinfo);
+
+    // FIXME: this cbk is passed as NULL in all occurrences. May be
+    // we never needed it.
+    if (cbk)
+        defrag->cbk_fn = cbk;
+
+out:
+    gf_msg_debug("glusterd", 0, "Returning %d", ret);
+    return ret;
+}
+
+/* Make sure the volume has a defrag info structure attached.
+ *
+ * If one already exists it is left untouched. Otherwise a zeroed structure
+ * is allocated, its command is copied from volinfo->rebal.defrag_cmd, its
+ * lock is initialized and `cbk` (when non-NULL) is stored as the callback.
+ *
+ * Returns 0 on success (including the already-initialized case) and -1 when
+ * the allocation fails.
+ */
+int
+glusterd_rebalance_defrag_init(glusterd_volinfo_t *volinfo, defrag_cbk_fn_t cbk)
+{
+    glusterd_defrag_info_t *info = NULL;
+
+    /* Already initialized: skip the setup entirely. */
+    if (volinfo->rebal.defrag)
+        return 0;
+
+    info = GF_CALLOC(1, sizeof(*info), gf_gld_mt_defrag_info);
+    volinfo->rebal.defrag = info;
+    if (!info)
+        return -1;
+
+    info->cmd = volinfo->rebal.defrag_cmd;
+    LOCK_INIT(&info->lock);
+    if (cbk)
+        info->cbk_fn = cbk;
+
+    return 0;
+}
+
+/* Create the RPC client glusterd uses to talk to the volume's rebalance
+ * process over its unix-domain socket, with glusterd_defrag_notify() as the
+ * event callback and the volinfo as its data.
+ *
+ * Returns 0 on success. Returns -1 when no defrag info is attached to the
+ * volinfo or the options dict cannot be allocated, and otherwise the
+ * non-zero result of the helper that failed.
+ */
+int
+glusterd_rebalance_rpc_create(glusterd_volinfo_t *volinfo)
+{
+    dict_t *options = NULL;
+    char sockfile[PATH_MAX] = {
+        0,
+    };
+    int ret = -1;
+    glusterd_defrag_info_t *defrag = volinfo->rebal.defrag;
+    glusterd_conf_t *priv = NULL;
+    xlator_t *this = NULL;
+
+    this = THIS;
+    GF_ASSERT(this);
+    priv = this->private;
+    GF_ASSERT(priv);
+
+    // rebalance process is not started
+    if (!defrag)
+        goto out;
+
+    options = dict_new();
+    if (!options) {
+        gf_smsg(this->name, GF_LOG_ERROR, errno, GD_MSG_DICT_CREATE_FAIL, NULL);
+        goto out;
+    }
+
+    GLUSTERD_GET_DEFRAG_SOCK_FILE(sockfile, volinfo);
+
+    /* Setting frame-timeout to 10mins (600seconds).
+     * Unix domain sockets ensures that the connection is reliable. The
+     * default timeout of 30mins used for unreliable network connections is
+     * too long for unix domain socket connections.
+     */
+    ret = rpc_transport_unix_options_build(options, sockfile, 600);
+    if (ret) {
+        gf_msg(THIS->name, GF_LOG_ERROR, 0, GD_MSG_UNIX_OP_BUILD_FAIL,
+               "Unix options build failed");
+        goto out;
+    }
+
+    /* Take a volinfo reference before handing the volinfo to the RPC layer
+     * as notify data.
+     * NOTE(review): the reference is not dropped here when
+     * glusterd_rpc_create() fails - confirm whether a destroy notification
+     * covers that case.
+     */
+    glusterd_volinfo_ref(volinfo);
+    ret = glusterd_rpc_create(&defrag->rpc, options, glusterd_defrag_notify,
+                              volinfo, _gf_true);
+    if (ret) {
+        gf_msg(THIS->name, GF_LOG_ERROR, 0, GD_MSG_RPC_CREATE_FAIL,
+               "Glusterd RPC creation failed");
+        goto out;
+    }
+    ret = 0;
+out:
+    /* The local reference on the options dict is always released. */
+    if (options)
+        dict_unref(options);
+    return ret;
+}
+
+/* Look up the volume and check that a rebalance command makes sense for it.
+ *
+ * The volume must exist, have more bricks than its dist_leaf_count and be in
+ * the started state. On success *volinfo points at the volume; on failure a
+ * description is written into op_errstr (at most len bytes). `cmd` is not
+ * consulted.
+ *
+ * Returns 0 when all checks pass, -1 otherwise.
+ */
+int
+glusterd_rebalance_cmd_validate(int cmd, char *volname,
+                                glusterd_volinfo_t **volinfo, char *op_errstr,
+                                size_t len)
+{
+    int ret = 0;
+
+    if (glusterd_volinfo_find(volname, volinfo) != 0) {
+        gf_msg("glusterd", GF_LOG_ERROR, EINVAL, GD_MSG_VOL_NOT_FOUND,
+               "Received rebalance on invalid"
+               " volname %s",
+               volname);
+        snprintf(op_errstr, len, "Volume %s does not exist", volname);
+        ret = -1;
+    } else if ((*volinfo)->brick_count <= (*volinfo)->dist_leaf_count) {
+        /* Nothing to distribute across. */
+        gf_msg("glusterd", GF_LOG_ERROR, 0, GD_MSG_VOL_NOT_DISTRIBUTE,
+               "Volume %s is not a "
+               "distribute type or contains only 1 brick",
+               volname);
+        snprintf(op_errstr, len,
+                 "Volume %s is not a distribute "
+                 "volume or contains only 1 brick.\n"
+                 "Not performing rebalance",
+                 volname);
+        ret = -1;
+    } else if ((*volinfo)->status != GLUSTERD_STATUS_STARTED) {
+        gf_msg("glusterd", GF_LOG_ERROR, 0, GD_MSG_VOL_STOPPED,
+               "Received rebalance on stopped"
+               " volname %s",
+               volname);
+        snprintf(op_errstr, len,
+                 "Volume %s needs to "
+                 "be started to perform rebalance",
+                 volname);
+        ret = -1;
+    }
+
+    gf_msg_debug("glusterd", 0, "Returning %d", ret);
+    return ret;
+}
+
+/* CLI request handler for rebalance commands; invoked through
+ * glusterd_handle_defrag_volume().
+ *
+ * Decodes the gf_cli_req, unserializes its dictionary, reads "volname" and
+ * "rebalance-command", tags the dict with this node's UUID and dispatches
+ * the operation: status and stop go out as GD_OP_DEFRAG_BRICK_VOLUME,
+ * everything else as GD_OP_REBALANCE. Clusters running below
+ * GD_OP_VERSION_6_0 use the op-sm framework, newer ones the mgmt_v3 phases.
+ *
+ * On failure a CLI response carrying `msg` (or "Operation failed" when no
+ * message was set) is sent and the result of sending it is returned.
+ */
+int
+__glusterd_handle_defrag_volume(rpcsvc_request_t *req)
+{
+    int32_t ret = -1;
+    gf_cli_req cli_req = {{
+        0,
+    }};
+    glusterd_conf_t *priv = NULL;
+    int32_t op = GD_OP_NONE;
+    dict_t *dict = NULL;
+    char *volname = NULL;
+    gf_cli_defrag_type cmd = 0;
+    char msg[2048] = {
+        0,
+    };
+    xlator_t *this = NULL;
+
+    GF_ASSERT(req);
+    this = THIS;
+    GF_ASSERT(this);
+
+    priv = this->private;
+    GF_ASSERT(priv);
+
+    ret = xdr_to_generic(req->msg[0], &cli_req, (xdrproc_t)xdr_gf_cli_req);
+    if (ret < 0) {
+        // failed to decode msg;
+        req->rpc_err = GARBAGE_ARGS;
+        gf_smsg(this->name, GF_LOG_ERROR, errno, GD_MSG_GARBAGE_ARGS, NULL);
+        goto out;
+    }
+
+    /* NOTE(review): the dict_new() result is not checked, and dict stays
+     * NULL when the request carries no dictionary; the lookups below are
+     * relied on to fail in those cases - confirm.
+     */
+    if (cli_req.dict.dict_len) {
+        /* Unserialize the dictionary */
+        dict = dict_new();
+
+        ret = dict_unserialize(cli_req.dict.dict_val, cli_req.dict.dict_len,
+                               &dict);
+        if (ret < 0) {
+            gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_DICT_UNSERIALIZE_FAIL,
+                   "failed to "
+                   "unserialize req-buffer to dictionary");
+            snprintf(msg, sizeof(msg),
+                     "Unable to decode the "
+                     "command");
+            goto out;
+        }
+    }
+
+    ret = dict_get_strn(dict, "volname", SLEN("volname"), &volname);
+    if (ret) {
+        snprintf(msg, sizeof(msg), "Failed to get volume name");
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_DICT_GET_FAILED, "%s", msg);
+        goto out;
+    }
+
+    ret = dict_get_int32n(dict, "rebalance-command", SLEN("rebalance-command"),
+                          (int32_t *)&cmd);
+    if (ret) {
+        snprintf(msg, sizeof(msg), "Failed to get command");
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_DICT_GET_FAILED, "%s", msg);
+        goto out;
+    }
+
+    /* Record the originating node's UUID (16 bytes) in the request. */
+    ret = dict_set_static_bin(dict, "node-uuid", MY_UUID, 16);
+    if (ret)
+        goto out;
+
+    if ((cmd == GF_DEFRAG_CMD_STATUS) || (cmd == GF_DEFRAG_CMD_STOP)) {
+        op = GD_OP_DEFRAG_BRICK_VOLUME;
+    } else
+        op = GD_OP_REBALANCE;
+
+    if (priv->op_version < GD_OP_VERSION_6_0) {
+        gf_msg_debug(this->name, 0,
+                     "The cluster is operating at "
+                     "version less than %d. Falling back "
+                     "to op-sm framework.",
+                     GD_OP_VERSION_6_0);
+        ret = glusterd_op_begin(req, op, dict, msg, sizeof(msg));
+        glusterd_friend_sm();
+        glusterd_op_sm();
+    } else {
+        ret = glusterd_mgmt_v3_initiate_all_phases_with_brickop_phase(req, op,
+                                                                      dict);
+    }
+out:
+    /* The failure response is always sent as GD_OP_REBALANCE, whichever op
+     * was selected above.
+     */
+    if (ret) {
+        if (msg[0] == '\0')
+            snprintf(msg, sizeof(msg), "Operation failed");
+        ret = glusterd_op_send_cli_response(GD_OP_REBALANCE, ret, 0, req, dict,
+                                            msg);
+    }
+
+    free(cli_req.dict.dict_val);  // malloced by xdr
+    gf_msg_debug(this->name, 0, "Returning %d", ret);
+    return ret;
+}
+
+/* RPC service entry point for rebalance CLI requests: runs
+ * __glusterd_handle_defrag_volume() through glusterd_big_locked_handler()
+ * and returns its result.
+ */
+int
+glusterd_handle_defrag_volume(rpcsvc_request_t *req)
+{
+    int ret = 0;
+
+    ret = glusterd_big_locked_handler(req, __glusterd_handle_defrag_volume);
+    return ret;
+}
+
+/* dict_foreach_fnmatch() callback: accept a brick only if it belongs to the
+ * volume passed in `data` and is marked decommissioned. `dict` and `key` are
+ * not consulted.
+ *
+ * Returns 0 for an acceptable brick. Otherwise returns the lookup's error
+ * code, or -1 when the brick exists but is not decommissioned.
+ */
+static int
+glusterd_brick_validation(dict_t *dict, char *key, data_t *value, void *data)
+{
+    glusterd_volinfo_t *volinfo = data;
+    glusterd_brickinfo_t *brickinfo = NULL;
+    xlator_t *this = THIS;
+    int32_t ret = -1;
+
+    GF_ASSERT(this);
+
+    ret = glusterd_volume_brickinfo_get_by_brick(value->data, volinfo,
+                                                 &brickinfo, _gf_false);
+    if ((ret == 0) && brickinfo->decommissioned)
+        return 0;
+
+    /* Both kinds of rejection are reported with the same message. */
+    gf_msg(this->name, GF_LOG_ERROR, EINVAL, GD_MSG_BRICK_NOT_FOUND,
+           "Incorrect brick %s for "
+           "volume %s",
+           value->data, volinfo->volname);
+
+    return ret ? ret : -1;
+}
+
+/* Put the rebalance / remove-brick task id into rsp_dict so that the CLI can
+ * display it.
+ *
+ * "volname" and "rebalance-command" are read from rsp_dict. For the start
+ * commands on the originator glusterd, the task id is taken from req_dict,
+ * stored in volinfo->rebal.rebalance_id and copied into rsp_dict. For status
+ * and stop, a non-null stored id is copied into rsp_dict under the
+ * remove-brick or rebalance key depending on volinfo->rebal.op.
+ *
+ * Returns 0 on success and non-zero on failure. A start command without a
+ * task id in req_dict is only logged as a warning.
+ */
+int
+glusterd_set_rebalance_id_in_rsp_dict(dict_t *req_dict, dict_t *rsp_dict)
+{
+    int ret = -1;
+    int32_t cmd = 0;
+    char *volname = NULL;
+    glusterd_volinfo_t *volinfo = NULL;
+    char msg[2048] = {0};
+    char *task_id_str = NULL;
+    xlator_t *this = NULL;
+
+    this = THIS;
+    GF_ASSERT(this);
+
+    GF_ASSERT(rsp_dict);
+    GF_ASSERT(req_dict);
+
+    ret = dict_get_strn(rsp_dict, "volname", SLEN("volname"), &volname);
+    if (ret) {
+        gf_msg_debug(this->name, 0, "volname not found");
+        goto out;
+    }
+
+    ret = dict_get_int32n(rsp_dict, "rebalance-command",
+                          SLEN("rebalance-command"), &cmd);
+    if (ret) {
+        gf_msg_debug(this->name, 0, "cmd not found");
+        goto out;
+    }
+
+    ret = glusterd_rebalance_cmd_validate(cmd, volname, &volinfo, msg,
+                                          sizeof(msg));
+    if (ret) {
+        gf_msg_debug(this->name, 0, "failed to validate");
+        goto out;
+    }
+
+    /* The rebalance id is generated in glusterd_mgmt_v3_op_stage_rebalance(),
+     * but rsp_dict is unavailable there. So it is copied from req_dict to
+     * rsp_dict here, so that the CLI can display the rebalance id. */
+    if ((cmd == GF_DEFRAG_CMD_START) ||
+        (cmd == GF_DEFRAG_CMD_START_LAYOUT_FIX) ||
+        (cmd == GF_DEFRAG_CMD_START_FORCE)) {
+        if (is_origin_glusterd(rsp_dict)) {
+            ret = dict_get_strn(req_dict, GF_REBALANCE_TID_KEY,
+                                SLEN(GF_REBALANCE_TID_KEY), &task_id_str);
+            if (ret) {
+                /* A missing id is tolerated: warn and carry on. */
+                snprintf(msg, sizeof(msg), "Missing rebalance-id");
+                gf_msg(this->name, GF_LOG_WARNING, 0,
+                       GD_MSG_REBALANCE_ID_MISSING, "%s", msg);
+                ret = 0;
+            } else {
+                gf_uuid_parse(task_id_str, volinfo->rebal.rebalance_id);
+                ret = glusterd_copy_uuid_to_dict(volinfo->rebal.rebalance_id,
+                                                 rsp_dict, GF_REBALANCE_TID_KEY,
+                                                 SLEN(GF_REBALANCE_TID_KEY));
+                if (ret) {
+                    snprintf(msg, sizeof(msg),
+                             "Failed to set rebalance id for volume %s",
+                             volname);
+                    gf_msg(this->name, GF_LOG_WARNING, 0,
+                           GD_MSG_DICT_SET_FAILED, "%s", msg);
+                }
+            }
+        }
+    }
+
+    /* Set task-id, if available, in rsp_dict for operations other than
+     * start. This is needed when we want rebalance id in xml output
+     */
+    if (cmd == GF_DEFRAG_CMD_STATUS || cmd == GF_DEFRAG_CMD_STOP) {
+        if (!gf_uuid_is_null(volinfo->rebal.rebalance_id)) {
+            /* The key depends on which operation owns the stored id. */
+            if (GD_OP_REMOVE_BRICK == volinfo->rebal.op)
+                ret = glusterd_copy_uuid_to_dict(
+                    volinfo->rebal.rebalance_id, rsp_dict,
+                    GF_REMOVE_BRICK_TID_KEY, SLEN(GF_REMOVE_BRICK_TID_KEY));
+            else
+                ret = glusterd_copy_uuid_to_dict(volinfo->rebal.rebalance_id,
+                                                 rsp_dict, GF_REBALANCE_TID_KEY,
+                                                 SLEN(GF_REBALANCE_TID_KEY));
+            if (ret) {
+                gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_DICT_SET_FAILED,
+                       "Failed to set task-id for volume %s", volname);
+                goto out;
+            }
+        }
+    }
+out:
+    return ret;
+}
+
+/* Stage (pre-validate) a rebalance command for the mgmt_v3 framework.
+ *
+ * dict       request dictionary; must carry "volname" and
+ *            "rebalance-command". For start commands on the originator
+ *            glusterd a task id is generated and stored in it.
+ * op_errstr  optional out-parameter; on failure it receives a gf_strdup()'d
+ *            copy of the error text when one was produced.
+ *
+ * Returns 0 when the command may proceed and non-zero otherwise.
+ */
+int
+glusterd_mgmt_v3_op_stage_rebalance(dict_t *dict, char **op_errstr)
+{
+    char *volname = NULL;
+    char *cmd_str = NULL;
+    int ret = 0;
+    int32_t cmd = 0;
+    char msg[2048] = {0};
+    glusterd_volinfo_t *volinfo = NULL;
+    char *task_id_str = NULL;
+    xlator_t *this = NULL;
+
+    this = THIS;
+    GF_ASSERT(this);
+
+    ret = dict_get_strn(dict, "volname", SLEN("volname"), &volname);
+    if (ret) {
+        gf_msg_debug(this->name, 0, "volname not found");
+        goto out;
+    }
+
+    ret = dict_get_int32n(dict, "rebalance-command", SLEN("rebalance-command"),
+                          &cmd);
+    if (ret) {
+        gf_msg_debug(this->name, 0, "cmd not found");
+        goto out;
+    }
+
+    ret = glusterd_rebalance_cmd_validate(cmd, volname, &volinfo, msg,
+                                          sizeof(msg));
+    if (ret) {
+        gf_msg_debug(this->name, 0, "failed to validate");
+        goto out;
+    }
+    switch (cmd) {
+        case GF_DEFRAG_CMD_START:
+        case GF_DEFRAG_CMD_START_LAYOUT_FIX:
+            /* Check if the connected clients are all of version
+             * glusterfs-3.6 and higher. This is needed to prevent some data
+             * loss issues that could occur when older clients are connected
+             * when rebalance is run. This check can be bypassed by using
+             * 'force'
+             */
+            ret = glusterd_check_client_op_version_support(
+                volname, GD_OP_VERSION_3_6_0, NULL);
+            if (ret) {
+                /* Format into msg so that `out` publishes it with the same
+                 * NULL check on op_errstr as every other failure path, and
+                 * return a plain -1 rather than a formatted length.
+                 */
+                snprintf(msg, sizeof(msg),
+                         "Volume %s has one or "
+                         "more connected clients of a version"
+                         " lower than GlusterFS-v3.6.0. "
+                         "Starting rebalance in this state "
+                         "could lead to data loss.\nPlease "
+                         "disconnect those clients before "
+                         "attempting this command again.",
+                         volname);
+                ret = -1;
+                goto out;
+            }
+            /* Fall through */
+        case GF_DEFRAG_CMD_START_FORCE:
+            if (is_origin_glusterd(dict)) {
+                ret = glusterd_generate_and_set_task_id(
+                    dict, GF_REBALANCE_TID_KEY, SLEN(GF_REBALANCE_TID_KEY));
+                if (ret) {
+                    gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_TASKID_GEN_FAIL,
+                           "Failed to generate task-id");
+                    goto out;
+                }
+            } else {
+                /* On peers a missing id is only worth a warning. */
+                ret = dict_get_strn(dict, GF_REBALANCE_TID_KEY,
+                                    SLEN(GF_REBALANCE_TID_KEY), &task_id_str);
+                if (ret) {
+                    snprintf(msg, sizeof(msg), "Missing rebalance-id");
+                    gf_msg(this->name, GF_LOG_WARNING, 0,
+                           GD_MSG_REBALANCE_ID_MISSING, "%s", msg);
+                    ret = 0;
+                }
+            }
+            ret = glusterd_defrag_start_validate(volinfo, msg, sizeof(msg),
+                                                 GD_OP_REBALANCE);
+            if (ret) {
+                gf_msg_debug(this->name, 0,
+                             "defrag start validate "
+                             "failed for volume %s.",
+                             volinfo->volname);
+                goto out;
+            }
+            break;
+        case GF_DEFRAG_CMD_STATUS:
+        case GF_DEFRAG_CMD_STOP:
+
+            ret = dict_get_strn(dict, "cmd-str", SLEN("cmd-str"), &cmd_str);
+            if (ret) {
+                gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_DICT_GET_FAILED,
+                       "Failed to get "
+                       "command string");
+                ret = -1;
+                goto out;
+            }
+            /* The command string tells which kind of task the user asked
+             * about; it must match the operation recorded on the volume.
+             */
+            if ((strstr(cmd_str, "rebalance") != NULL) &&
+                (volinfo->rebal.op != GD_OP_REBALANCE)) {
+                snprintf(msg, sizeof(msg),
+                         "Rebalance not started "
+                         "for volume %s.",
+                         volinfo->volname);
+                ret = -1;
+                goto out;
+            }
+
+            if (strstr(cmd_str, "remove-brick") != NULL) {
+                if (volinfo->rebal.op != GD_OP_REMOVE_BRICK) {
+                    snprintf(msg, sizeof(msg),
+                             "remove-brick not "
+                             "started for volume %s.",
+                             volinfo->volname);
+                    ret = -1;
+                    goto out;
+                }
+
+                /* For remove-brick status/stop command check whether
+                 * given input brick is part of volume or not.*/
+
+                ret = dict_foreach_fnmatch(dict, "brick*",
+                                           glusterd_brick_validation, volinfo);
+                if (ret == -1) {
+                    snprintf(msg, sizeof(msg),
+                             "Incorrect brick"
+                             " for volume %s",
+                             volinfo->volname);
+                    goto out;
+                }
+            }
+            break;
+
+        default:
+            break;
+    }
+
+    ret = 0;
+out:
+    /* Hand the collected message to the caller, if it asked for one. */
+    if (ret && op_errstr && msg[0])
+        *op_errstr = gf_strdup(msg);
+
+    return ret;
+}
+/* Carry out a rebalance start/stop/status command on the volume named in dict. */
+int
+glusterd_mgmt_v3_op_rebalance(dict_t *dict, char **op_errstr, dict_t *rsp_dict)
+{
+    char *volname = NULL;
+    int ret = 0;
+    int32_t cmd = 0;
+    char msg[2048] = {0};
+    glusterd_volinfo_t *volinfo = NULL;
+    glusterd_brickinfo_t *brickinfo = NULL;
+    glusterd_brickinfo_t *tmp = NULL;
+    gf_boolean_t volfile_update = _gf_false;
+    char *task_id_str = NULL;
+    xlator_t *this = NULL;
+    uint32_t commit_hash; /* set only by dict_get_uint32() below */
+    int32_t is_force = 0;
+    /* Returns 0 on success; on error a non-empty msg is copied to *op_errstr. */
+    this = THIS;
+    GF_ASSERT(this);
+    /* NOTE: rsp_dict is accepted but never referenced in this function. */
+    ret = dict_get_strn(dict, "volname", SLEN("volname"), &volname);
+    if (ret) {
+        gf_msg_debug(this->name, 0, "volname not given");
+        goto out;
+    }
+
+    ret = dict_get_int32n(dict, "rebalance-command", SLEN("rebalance-command"),
+                          &cmd);
+    if (ret) {
+        gf_msg_debug(this->name, 0, "command not given");
+        goto out;
+    }
+
+    ret = glusterd_rebalance_cmd_validate(cmd, volname, &volinfo, msg,
+                                          sizeof(msg));
+    if (ret) {
+        gf_msg_debug(this->name, 0, "cmd validate failed");
+        goto out;
+    }
+
+    switch (cmd) {
+        case GF_DEFRAG_CMD_START:
+        case GF_DEFRAG_CMD_START_LAYOUT_FIX:
+        case GF_DEFRAG_CMD_START_FORCE:
+
+            ret = dict_get_int32n(dict, "force", SLEN("force"), &is_force);
+            if (ret)
+                is_force = 0; /* an absent "force" key means not forced */
+            if (!is_force) {
+                /* Reset defrag status to 'NOT STARTED' whenever a
+                 * remove-brick/rebalance command is issued to remove
+                 * stale information from previous run.
+                 */
+                volinfo->rebal.defrag_status = GF_DEFRAG_STATUS_NOT_STARTED;
+
+                ret = dict_get_strn(dict, GF_REBALANCE_TID_KEY,
+                                    SLEN(GF_REBALANCE_TID_KEY), &task_id_str);
+                if (ret) {
+                    gf_msg_debug(this->name, 0,
+                                 "Missing rebalance"
+                                 " id");
+                    ret = 0; /* a missing task id is tolerated */
+                } else {
+                    gf_uuid_parse(task_id_str, volinfo->rebal.rebalance_id);
+                    volinfo->rebal.op = GD_OP_REBALANCE;
+                }
+                if (!gd_should_i_start_rebalance(volinfo)) {
+                    /* Store the rebalance-id and rebalance command
+                     * even if the peer isn't starting a rebalance
+                     * process. On peers where a rebalance process
+                     * is started, glusterd_handle_defrag_start
+                     * performs the storing.
+                     * Storing this is needed for having
+                     * 'volume status' work correctly.
+                     */
+                    glusterd_store_perform_node_state_store(volinfo);
+                    break; /* ret is 0 here; the store result is not checked */
+                }
+                if (dict_get_uint32(dict, "commit-hash", &commit_hash) == 0) {
+                    volinfo->rebal.commit_hash = commit_hash;
+                }
+                ret = glusterd_handle_defrag_start(volinfo, msg, sizeof(msg),
+                                                   cmd, NULL, GD_OP_REBALANCE);
+                break;
+            } else {
+                /* Reset defrag status to 'STARTED' so that the
+                 * pid is checked and restarted accordingly.
+                 * If the pid is not running it executes the
+                 * "NOT_STARTED" case and restarts the process
+                 */
+                volinfo->rebal.defrag_status = GF_DEFRAG_STATUS_STARTED;
+                volinfo->rebal.defrag_cmd = cmd;
+                volinfo->rebal.op = GD_OP_REBALANCE;
+
+                ret = dict_get_strn(dict, GF_REBALANCE_TID_KEY,
+                                    SLEN(GF_REBALANCE_TID_KEY), &task_id_str);
+                if (ret) {
+                    gf_msg_debug(this->name, 0,
+                                 "Missing rebalance"
+                                 " id");
+                    ret = 0; /* a missing task id is tolerated */
+                } else {
+                    gf_uuid_parse(task_id_str, volinfo->rebal.rebalance_id);
+                    volinfo->rebal.op = GD_OP_REBALANCE;
+                }
+                if (dict_get_uint32(dict, "commit-hash", &commit_hash) == 0) {
+                    volinfo->rebal.commit_hash = commit_hash;
+                }
+                ret = glusterd_restart_rebalance_for_volume(volinfo);
+                break;
+            }
+        case GF_DEFRAG_CMD_STOP:
+            /* Clear task-id only on explicitly stopping rebalance.
+             * Also clear the stored operation, so it doesn't cause trouble
+             * with future rebalance/remove-brick starts
+             */
+            gf_uuid_clear(volinfo->rebal.rebalance_id);
+            volinfo->rebal.op = GD_OP_NONE;
+
+            /* Fall back to the old volume file in case of decommission*/
+            cds_list_for_each_entry_safe(brickinfo, tmp, &volinfo->bricks,
+                                         brick_list)
+            {
+                if (!brickinfo->decommissioned)
+                    continue;
+                brickinfo->decommissioned = 0;
+                volfile_update = _gf_true;
+            }
+
+            if (volfile_update == _gf_false) {
+                ret = 0; /* no brick was decommissioned: nothing to rewrite */
+                break;
+            }
+
+            ret = glusterd_create_volfiles_and_notify_services(volinfo);
+            if (ret) {
+                gf_msg(this->name, GF_LOG_WARNING, 0,
+                       GD_MSG_VOLFILE_CREATE_FAIL, "failed to create volfiles");
+                goto out;
+            }
+
+            ret = glusterd_store_volinfo(volinfo,
+                                         GLUSTERD_VOLINFO_VER_AC_INCREMENT);
+            if (ret) {
+                gf_msg(this->name, GF_LOG_WARNING, 0, GD_MSG_VOLINFO_SET_FAIL,
+                       "failed to store volinfo");
+                goto out;
+            }
+
+            ret = 0;
+            break;
+
+        case GF_DEFRAG_CMD_STATUS:
+            break; /* nothing to change for a status query */
+        default:
+            break;
+    }
+
+out:
+    if (ret && op_errstr && msg[0])
+        *op_errstr = gf_strdup(msg);
+
+    return ret;
+}
+
+/*
+ * Validation step for a rebalance start/status/stop request.
+ *
+ * dict must carry "volname" and "rebalance-command"; status and stop also
+ * need "cmd-str". Returns 0 when every check passes and non-zero otherwise;
+ * on failure *op_errstr holds the reason whenever a message was produced
+ * (allocated with gf_asprintf() or gf_strdup()).
+ */
+int
+glusterd_op_stage_rebalance(dict_t *dict, char **op_errstr)
+{
+    char *volname = NULL;
+    char *cmd_str = NULL;
+    int ret = 0;
+    int32_t cmd = 0;
+    char msg[2048] = {0};
+    glusterd_volinfo_t *volinfo = NULL;
+    char *task_id_str = NULL;
+    dict_t *op_ctx = NULL;
+    xlator_t *this = NULL;
+
+    this = THIS;
+    GF_ASSERT(this);
+
+    ret = dict_get_strn(dict, "volname", SLEN("volname"), &volname);
+    if (ret) {
+        gf_msg_debug(this->name, 0, "volname not found");
+        goto out;
+    }
+
+    ret = dict_get_int32n(dict, "rebalance-command", SLEN("rebalance-command"),
+                          &cmd);
+    if (ret) {
+        gf_msg_debug(this->name, 0, "cmd not found");
+        goto out;
+    }
+
+    ret = glusterd_rebalance_cmd_validate(cmd, volname, &volinfo, msg,
+                                          sizeof(msg));
+    if (ret) {
+        gf_msg_debug(this->name, 0, "failed to validate");
+        goto out;
+    }
+    switch (cmd) {
+        case GF_DEFRAG_CMD_START:
+        case GF_DEFRAG_CMD_START_LAYOUT_FIX:
+            /* Check if the connected clients are all of version
+             * glusterfs-3.6 and higher. This is needed to prevent some data
+             * loss issues that could occur when older clients are connected
+             * when rebalance is run. This check can be bypassed by using
+             * 'force'
+             */
+            ret = glusterd_check_client_op_version_support(
+                volname, GD_OP_VERSION_3_6_0, NULL);
+            if (ret) {
+                /* Fail with -1, not gf_asprintf()'s return value. */
+                (void)gf_asprintf(op_errstr,
+                                  "Volume %s has one or "
+                                  "more connected clients of a version"
+                                  " lower than GlusterFS-v3.6.0. "
+                                  "Starting rebalance in this state "
+                                  "could lead to data loss.\nPlease "
+                                  "disconnect those clients before "
+                                  "attempting this command again.",
+                                  volname);
+                ret = -1;
+                goto out;
+            }
+            /* Fall through */
+        case GF_DEFRAG_CMD_START_FORCE:
+            if (is_origin_glusterd(dict)) {
+                op_ctx = glusterd_op_get_ctx();
+                if (!op_ctx) {
+                    ret = -1;
+                    gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_OPCTX_GET_FAIL,
+                           "Failed to get op_ctx");
+                    goto out;
+                }
+
+                ret = glusterd_generate_and_set_task_id(
+                    op_ctx, GF_REBALANCE_TID_KEY, SLEN(GF_REBALANCE_TID_KEY));
+                if (ret) {
+                    gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_TASKID_GEN_FAIL,
+                           "Failed to generate task-id");
+                    goto out;
+                }
+            } else {
+                ret = dict_get_strn(dict, GF_REBALANCE_TID_KEY,
+                                    SLEN(GF_REBALANCE_TID_KEY), &task_id_str);
+                if (ret) /* tolerated: only logged, msg is left untouched */
+                    gf_msg(this->name, GF_LOG_WARNING, 0,
+                           GD_MSG_REBALANCE_ID_MISSING, "Missing rebalance-id");
+            }
+            ret = glusterd_defrag_start_validate(volinfo, msg, sizeof(msg),
+                                                 GD_OP_REBALANCE);
+            if (ret) {
+                gf_msg_debug(this->name, 0,
+                             "defrag start validate failed for volume %s.",
+                             volinfo->volname);
+                goto out;
+            }
+            break;
+        case GF_DEFRAG_CMD_STATUS:
+        case GF_DEFRAG_CMD_STOP:
+            ret = dict_get_strn(dict, "cmd-str", SLEN("cmd-str"), &cmd_str);
+            if (ret) {
+                gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_DICT_GET_FAILED,
+                       "Failed to get command string");
+                ret = -1;
+                goto out;
+            }
+            if ((strstr(cmd_str, "rebalance") != NULL) &&
+                (volinfo->rebal.op != GD_OP_REBALANCE)) {
+                snprintf(msg, sizeof(msg),
+                         "Rebalance not started for volume %s.",
+                         volinfo->volname);
+                ret = -1;
+                goto out;
+            }
+
+            if (strstr(cmd_str, "remove-brick") != NULL) {
+                if (volinfo->rebal.op != GD_OP_REMOVE_BRICK) {
+                    snprintf(msg, sizeof(msg),
+                             "remove-brick not started for volume %s.",
+                             volinfo->volname);
+                    ret = -1;
+                    goto out;
+                }
+
+                /* For remove-brick status/stop command check whether
+                 * given input brick is part of volume or not.*/
+
+                ret = dict_foreach_fnmatch(dict, "brick*",
+                                           glusterd_brick_validation, volinfo);
+                if (ret == -1) {
+                    snprintf(msg, sizeof(msg), "Incorrect brick for volume %s",
+                             volinfo->volname);
+                    goto out;
+                }
+            }
+            break;
+
+        default:
+            break;
+    }
+
+    ret = 0;
+out:
+    if (ret && op_errstr && msg[0])
+        *op_errstr = gf_strdup(msg);
+
+    return ret;
+}
+/* Carry out a rebalance command; status/stop also copy the task id to op_ctx. */
+int
+glusterd_op_rebalance(dict_t *dict, char **op_errstr, dict_t *rsp_dict)
+{
+    char *volname = NULL;
+    int ret = 0;
+    int32_t cmd = 0;
+    char msg[2048] = {0};
+    glusterd_volinfo_t *volinfo = NULL;
+    glusterd_brickinfo_t *brickinfo = NULL;
+    glusterd_brickinfo_t *tmp = NULL;
+    gf_boolean_t volfile_update = _gf_false;
+    char *task_id_str = NULL;
+    dict_t *ctx = NULL;
+    xlator_t *this = NULL;
+    uint32_t commit_hash; /* set only by dict_get_uint32() below */
+    int32_t is_force = 0;
+    /* Returns 0 on success; on error a non-empty msg is copied to *op_errstr. */
+    this = THIS;
+    GF_ASSERT(this);
+    /* NOTE: rsp_dict is accepted but never referenced in this function. */
+    ret = dict_get_strn(dict, "volname", SLEN("volname"), &volname);
+    if (ret) {
+        gf_msg_debug(this->name, 0, "volname not given");
+        goto out;
+    }
+
+    ret = dict_get_int32n(dict, "rebalance-command", SLEN("rebalance-command"),
+                          &cmd);
+    if (ret) {
+        gf_msg_debug(this->name, 0, "command not given");
+        goto out;
+    }
+
+    ret = glusterd_rebalance_cmd_validate(cmd, volname, &volinfo, msg,
+                                          sizeof(msg));
+    if (ret) {
+        gf_msg_debug(this->name, 0, "cmd validate failed");
+        goto out;
+    }
+
+    /* Set task-id, if available, in op_ctx dict for operations other than
+     * start
+     */
+    if (cmd == GF_DEFRAG_CMD_STATUS || cmd == GF_DEFRAG_CMD_STOP) {
+        if (!gf_uuid_is_null(volinfo->rebal.rebalance_id)) {
+            ctx = glusterd_op_get_ctx();
+            if (!ctx) {
+                gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_OPCTX_GET_FAIL,
+                       "Failed to get op_ctx");
+                ret = -1;
+                goto out;
+            }
+
+            if (GD_OP_REMOVE_BRICK == volinfo->rebal.op) /* key depends on op */
+                ret = glusterd_copy_uuid_to_dict(volinfo->rebal.rebalance_id,
+                                                 ctx, GF_REMOVE_BRICK_TID_KEY,
+                                                 SLEN(GF_REMOVE_BRICK_TID_KEY));
+            else
+                ret = glusterd_copy_uuid_to_dict(volinfo->rebal.rebalance_id,
+                                                 ctx, GF_REBALANCE_TID_KEY,
+                                                 SLEN(GF_REBALANCE_TID_KEY));
+            if (ret) {
+                gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_TASKID_GEN_FAIL,
+                       "Failed to set task-id");
+                goto out;
+            }
+        }
+    }
+
+    switch (cmd) {
+        case GF_DEFRAG_CMD_START:
+        case GF_DEFRAG_CMD_START_LAYOUT_FIX:
+        case GF_DEFRAG_CMD_START_FORCE:
+
+            ret = dict_get_int32n(dict, "force", SLEN("force"), &is_force);
+            if (ret)
+                is_force = 0; /* an absent "force" key means not forced */
+            if (!is_force) {
+                /* Reset defrag status to 'NOT STARTED' whenever a
+                 * remove-brick/rebalance command is issued to remove
+                 * stale information from previous run.
+                 */
+                volinfo->rebal.defrag_status = GF_DEFRAG_STATUS_NOT_STARTED;
+
+                ret = dict_get_strn(dict, GF_REBALANCE_TID_KEY,
+                                    SLEN(GF_REBALANCE_TID_KEY), &task_id_str);
+                if (ret) {
+                    gf_msg_debug(this->name, 0,
+                                 "Missing rebalance"
+                                 " id");
+                    ret = 0; /* a missing task id is tolerated */
+                } else {
+                    gf_uuid_parse(task_id_str, volinfo->rebal.rebalance_id);
+                    volinfo->rebal.op = GD_OP_REBALANCE;
+                }
+                if (!gd_should_i_start_rebalance(volinfo)) {
+                    /* Store the rebalance-id and rebalance command
+                     * even if the peer isn't starting a rebalance
+                     * process. On peers where a rebalance process
+                     * is started, glusterd_handle_defrag_start
+                     * performs the storing.
+                     * Storing this is needed for having
+                     * 'volume status' work correctly.
+                     */
+                    glusterd_store_perform_node_state_store(volinfo);
+                    break; /* ret is 0 here; the store result is not checked */
+                }
+                if (dict_get_uint32(dict, "commit-hash", &commit_hash) == 0) {
+                    volinfo->rebal.commit_hash = commit_hash;
+                }
+                ret = glusterd_handle_defrag_start(volinfo, msg, sizeof(msg),
+                                                   cmd, NULL, GD_OP_REBALANCE);
+                break;
+            } else {
+                /* Reset defrag status to 'STARTED' so that the
+                 * pid is checked and restarted accordingly.
+                 * If the pid is not running it executes the
+                 * "NOT_STARTED" case and restarts the process
+                 */
+                volinfo->rebal.defrag_status = GF_DEFRAG_STATUS_STARTED;
+                volinfo->rebal.defrag_cmd = cmd;
+                volinfo->rebal.op = GD_OP_REBALANCE;
+
+                ret = dict_get_strn(dict, GF_REBALANCE_TID_KEY,
+                                    SLEN(GF_REBALANCE_TID_KEY), &task_id_str);
+                if (ret) {
+                    gf_msg_debug(this->name, 0,
+                                 "Missing rebalance"
+                                 " id");
+                    ret = 0; /* a missing task id is tolerated */
+                } else {
+                    gf_uuid_parse(task_id_str, volinfo->rebal.rebalance_id);
+                    volinfo->rebal.op = GD_OP_REBALANCE;
+                }
+                if (dict_get_uint32(dict, "commit-hash", &commit_hash) == 0) {
+                    volinfo->rebal.commit_hash = commit_hash;
+                }
+                ret = glusterd_restart_rebalance_for_volume(volinfo);
+                break;
+            }
+        case GF_DEFRAG_CMD_STOP:
+            /* Clear task-id only on explicitly stopping rebalance.
+             * Also clear the stored operation, so it doesn't cause trouble
+             * with future rebalance/remove-brick starts
+             */
+            gf_uuid_clear(volinfo->rebal.rebalance_id);
+            volinfo->rebal.op = GD_OP_NONE;
+
+            /* Fall back to the old volume file in case of decommission*/
+            cds_list_for_each_entry_safe(brickinfo, tmp, &volinfo->bricks,
+                                         brick_list)
+            {
+                if (!brickinfo->decommissioned)
+                    continue;
+                brickinfo->decommissioned = 0;
+                volfile_update = _gf_true;
+            }
+
+            if (volfile_update == _gf_false) {
+                ret = 0; /* no brick was decommissioned: nothing to rewrite */
+                break;
+            }
+
+            ret = glusterd_create_volfiles_and_notify_services(volinfo);
+            if (ret) {
+                gf_msg(this->name, GF_LOG_WARNING, 0,
+                       GD_MSG_VOLFILE_CREATE_FAIL, "failed to create volfiles");
+                goto out;
+            }
+
+            ret = glusterd_store_volinfo(volinfo,
+                                         GLUSTERD_VOLINFO_VER_AC_INCREMENT);
+            if (ret) {
+                gf_msg(this->name, GF_LOG_WARNING, 0, GD_MSG_VOLINFO_SET_FAIL,
+                       "failed to store volinfo");
+                goto out;
+            }
+
+            ret = 0;
+            break;
+
+        case GF_DEFRAG_CMD_STATUS:
+            break; /* nothing to change for a status query */
+        default:
+            break;
+    }
+
+out:
+    if (ret && op_errstr && msg[0])
+        *op_errstr = gf_strdup(msg);
+
+    return ret;
+}
+
+/*
+ * Handle a defrag event notification: take the text that follows the first
+ * "rebalance/" in the "volname" entry of dict as the volume name, look that
+ * volume up and pass dict to glusterd_defrag_volume_status_update().
+ * Returns 0 on success and the failing step's non-zero result otherwise.
+ */
+int32_t
+glusterd_defrag_event_notify_handle(dict_t *dict)
+{
+    xlator_t *this = THIS;
+    glusterd_volinfo_t *vol = NULL;
+    char *name = NULL;
+    char *prefix = NULL;
+    int32_t rc = -1;
+
+    GF_ASSERT(this);
+    GF_ASSERT(dict);
+
+    rc = dict_get_strn(dict, "volname", SLEN("volname"), &name);
+    if (rc != 0) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_DICT_GET_FAILED,
+               "Failed to get volname");
+        return rc;
+    }
+
+    prefix = strstr(name, "rebalance/");
+    if (prefix == NULL) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_NO_REBALANCE_PFX_IN_VOLNAME,
+               "volname received (%s) is not prefixed with rebalance.", name);
+        return -1;
+    }
+    /* The first '/' at or after the match is the one ending "rebalance/". */
+    name = strchr(prefix, '/') + 1;
+
+    rc = glusterd_volinfo_find(name, &vol);
+    if (rc != 0) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_VOLINFO_GET_FAIL,
+               "Failed to get volinfo for %s", name);
+        return rc;
+    }
+
+    rc = glusterd_defrag_volume_status_update(vol, dict, 0);
+    if (rc != 0) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_DEFRAG_STATUS_UPDATE_FAIL,
+               "Failed to update status");
+        gf_event(EVENT_REBALANCE_STATUS_UPDATE_FAILED, "volume=%s",
+                 vol->volname);
+    }
+
+    return rc;
+}
diff --git a/xlators/mgmt/glusterd/src/glusterd-replace-brick.c b/xlators/mgmt/glusterd/src/glusterd-replace-brick.c
new file mode 100644
index 00000000000..43c2f4373e0
--- /dev/null
+++ b/xlators/mgmt/glusterd/src/glusterd-replace-brick.c
@@ -0,0 +1,716 @@
+/*
+   Copyright (c) 2011-2012 Red Hat, Inc. <http://www.redhat.com>
+   This file is part of GlusterFS.
+
+   This file is licensed to you under your choice of the GNU Lesser
+   General Public License, version 3 or any later version (LGPLv3 or
+   later), or the GNU General Public License, version 2 (GPLv2), in all
+   cases as published by the Free Software Foundation.
+*/
+#include <glusterfs/common-utils.h>
+#include "cli1-xdr.h"
+#include "xdr-generic.h"
+#include <glusterfs/glusterfs.h>
+#include "glusterd.h"
+#include "glusterd-op-sm.h"
+#include "glusterd-geo-rep.h"
+#include "glusterd-store.h"
+#include "glusterd-utils.h"
+#include "glusterd-svc-mgmt.h"
+#include "glusterd-svc-helper.h"
+#include "glusterd-volgen.h"
+#include "glusterd-messages.h"
+#include "glusterd-server-quorum.h"
+#include "glusterd-mgmt.h"
+#include <glusterfs/run.h>
+#include <glusterfs/syscall.h>
+
+#include <signal.h>
+
+int
+glusterd_mgmt_v3_initiate_replace_brick_cmd_phases(rpcsvc_request_t *req,
+                                                   glusterd_op_t op,
+                                                   dict_t *dict);
+/*
+ * Handler for replace-brick and reset-brick requests from the CLI. Decodes the
+ * request, reads "volname", "operation", "src-brick" and (for the commit
+ * operations) "dst-brick" from its dictionary and hands the operation to
+ * glusterd_mgmt_v3_initiate_replace_brick_cmd_phases(). A failure at any step
+ * is answered with glusterd_op_send_cli_response(); the return value is 0.
+ */
+int
+__glusterd_handle_replace_brick(rpcsvc_request_t *req)
+{
+    int32_t ret = -1;
+    gf_cli_req cli_req = {{0}};
+    dict_t *dict = NULL;
+    char *src_brick = NULL;
+    char *dst_brick = NULL;
+    char *cli_op = NULL;
+    glusterd_op_t op = -1;
+    char *volname = NULL;
+    char msg[256] = {0};
+    xlator_t *this = NULL;
+    glusterd_conf_t *conf = NULL;
+
+    GF_ASSERT(req);
+    this = THIS;
+    GF_ASSERT(this);
+    conf = this->private;
+    GF_ASSERT(conf);
+
+    ret = xdr_to_generic(req->msg[0], &cli_req, (xdrproc_t)xdr_gf_cli_req);
+    if (ret < 0) {
+        // failed to decode msg;
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_REQ_DECODE_FAIL,
+               "Failed to decode request received from cli");
+        req->rpc_err = GARBAGE_ARGS;
+        goto out;
+    }
+
+    gf_msg(this->name, GF_LOG_INFO, 0, GD_MSG_REPLACE_BRK_REQ_RCVD,
+           "Received replace brick req");
+
+    if (cli_req.dict.dict_len) {
+        /* Unserialize the dictionary */
+        dict = dict_new();
+
+        ret = dict_unserialize(cli_req.dict.dict_val, cli_req.dict.dict_len,
+                               &dict);
+        if (ret < 0) {
+            gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_DICT_UNSERIALIZE_FAIL,
+                   "failed to unserialize req-buffer to dictionary");
+            snprintf(msg, sizeof(msg), "Unable to decode the command");
+            goto out;
+        }
+    }
+
+    ret = dict_get_strn(dict, "volname", SLEN("volname"), &volname);
+    if (ret) {
+        snprintf(msg, sizeof(msg), "Could not get volume name");
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_DICT_GET_FAILED, "%s", msg);
+        goto out;
+    }
+
+    ret = dict_get_strn(dict, "operation", SLEN("operation"), &cli_op);
+    if (ret) {
+        gf_msg_debug(this->name, 0, "dict_get on operation failed");
+        snprintf(msg, sizeof(msg), "Could not get operation");
+        goto out;
+    }
+
+    op = gd_cli_to_gd_op(cli_op);
+
+    /* Below op-version 3.9.0 only GF_REPLACE_OP_COMMIT_FORCE is accepted. */
+    if (conf->op_version < GD_OP_VERSION_3_9_0 &&
+        strcmp(cli_op, "GF_REPLACE_OP_COMMIT_FORCE")) {
+        snprintf(msg, sizeof(msg),
+                 "Cannot execute command. The "
+                 "cluster is operating at version %d. reset-brick "
+                 "command %s is unavailable in this version.",
+                 conf->op_version, gd_rb_op_to_str(cli_op));
+        ret = -1;
+        goto out;
+    }
+
+    ret = dict_get_strn(dict, "src-brick", SLEN("src-brick"), &src_brick);
+
+    if (ret) {
+        snprintf(msg, sizeof(msg), "Failed to get src brick");
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_DICT_GET_FAILED, "%s", msg);
+        goto out;
+    }
+    gf_msg_debug(this->name, 0, "src brick=%s", src_brick);
+
+    if (!strcmp(cli_op, "GF_RESET_OP_COMMIT") ||
+        !strcmp(cli_op, "GF_RESET_OP_COMMIT_FORCE") ||
+        !strcmp(cli_op, "GF_REPLACE_OP_COMMIT_FORCE")) {
+        ret = dict_get_strn(dict, "dst-brick", SLEN("dst-brick"), &dst_brick);
+
+        if (ret) {
+            snprintf(msg, sizeof(msg), "Failed to get dest brick");
+            gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_DICT_GET_FAILED, "%s",
+                   msg);
+            goto out;
+        }
+
+        gf_msg_debug(this->name, 0, "dst brick=%s", dst_brick);
+    }
+
+    gf_msg(this->name, GF_LOG_INFO, 0,
+           (op == GD_OP_REPLACE_BRICK)
+               ? GD_MSG_REPLACE_BRK_COMMIT_FORCE_REQ_RCVD
+               : GD_MSG_RESET_BRICK_COMMIT_FORCE_REQ_RCVD,
+           "Received %s request.", gd_rb_op_to_str(cli_op));
+
+    ret = glusterd_mgmt_v3_initiate_replace_brick_cmd_phases(req, op, dict);
+
+out:
+    if (ret) {
+        glusterd_op_send_cli_response(op, ret, 0, req, dict, msg);
+    }
+    /* Failures were answered through the reply above; always return 0. */
+    ret = 0;
+    free(cli_req.dict.dict_val);  // malloced by xdr
+
+    return ret;
+}
+/* reset-brick entry point; delegates to the shared replace-brick handler. */
+int
+glusterd_handle_reset_brick(rpcsvc_request_t *req)
+{
+    return glusterd_big_locked_handler(req, __glusterd_handle_replace_brick);
+}
+/* replace-brick entry point; runs the handler via glusterd_big_locked_handler. */
+int
+glusterd_handle_replace_brick(rpcsvc_request_t *req)
+{
+    return glusterd_big_locked_handler(req, __glusterd_handle_replace_brick);
+}
+
+/*
+ * Validation step for replace-brick. Only GF_REPLACE_OP_COMMIT_FORCE on a
+ * volume that is not distribute-only passes; server quorum must hold and the
+ * destination brick must validate (a non-local host must be a connected,
+ * befriended peer). A local destination at op-version >= 3.6.0 also records
+ * "brick_count" in rsp_dict. Returns 0 on success, non-zero on failure.
+ */
+int
+glusterd_op_stage_replace_brick(dict_t *dict, char **op_errstr,
+                                dict_t *rsp_dict)
+{
+    int ret = 0;
+    char *src_brick = NULL;
+    char *dst_brick = NULL;
+    char *volname = NULL;
+    char *op = NULL;
+    glusterd_op_t gd_op = -1;
+    glusterd_volinfo_t *volinfo = NULL;
+    glusterd_brickinfo_t *src_brickinfo = NULL;
+    char *host = NULL;
+    char msg[2048] = {0};
+    glusterd_peerinfo_t *peerinfo = NULL;
+    glusterd_brickinfo_t *dst_brickinfo = NULL;
+    glusterd_conf_t *priv = NULL;
+    char pidfile[PATH_MAX] = {0};
+    xlator_t *this = NULL;
+    gf_boolean_t is_force = _gf_false;
+    char *dup_dstbrick = NULL;
+    gf_boolean_t dst_is_local = _gf_false;
+
+    this = THIS;
+    GF_ASSERT(this);
+
+    priv = this->private;
+    GF_ASSERT(priv);
+
+    ret = glusterd_brick_op_prerequisites(dict, &op, &gd_op, &volname, &volinfo,
+                                          &src_brick, &src_brickinfo, pidfile,
+                                          op_errstr, rsp_dict);
+    if (ret)
+        goto out;
+
+    if (volinfo->type == GF_CLUSTER_TYPE_NONE) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_OP_NOT_PERMITTED,
+               "replace-brick is not permitted on distribute only volumes");
+        gf_asprintf(op_errstr,
+                    "replace-brick is not permitted on "
+                    "distribute only volumes. Please use add-brick "
+                    "and remove-brick operations instead.");
+        ret = -1;
+        goto out;
+    }
+    ret = glusterd_validate_quorum(this, gd_op, dict, op_errstr);
+    if (ret) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_SERVER_QUORUM_NOT_MET,
+               "Server quorum not met. Rejecting operation.");
+        goto out;
+    }
+
+    /* NOTE: any other operation is rejected here without an error message. */
+    if (strcmp(op, "GF_REPLACE_OP_COMMIT_FORCE")) {
+        ret = -1;
+        goto out;
+    } else {
+        is_force = _gf_true;
+    }
+
+    if (volinfo->snap_count > 0 || !cds_list_empty(&volinfo->snap_volumes)) {
+        snprintf(msg, sizeof(msg),
+                 "Volume %s  has %" PRIu64 " snapshots. "
+                 "Changing the volume configuration will not affect snapshots. "
+                 "But the snapshot brick mount should be intact to "
+                 "make them function.",
+                 volname, volinfo->snap_count);
+        gf_msg("glusterd", GF_LOG_WARNING, 0, GD_MSG_SNAP_WARN, "%s", msg);
+        msg[0] = '\0';
+    }
+
+    glusterd_add_peers_to_auth_list(volname);
+
+    ret = glusterd_get_dst_brick_info(&dst_brick, volname, op_errstr,
+                                      &dst_brickinfo, &host, dict,
+                                      &dup_dstbrick);
+    if (ret)
+        goto out;
+
+    ret = glusterd_new_brick_validate(dst_brick, dst_brickinfo, msg,
+                                      sizeof(msg), op);
+    /* fail if brick being replaced with itself */
+    if (ret) {
+        *op_errstr = gf_strdup(msg);
+        ret = -1;
+        /* Log msg itself: *op_errstr is NULL if gf_strdup() failed. */
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_BRICK_VALIDATE_FAIL, "%s",
+               msg);
+        goto out;
+    }
+
+    volinfo->rep_brick.src_brick = src_brickinfo;
+    volinfo->rep_brick.dst_brick = dst_brickinfo;
+
+    if (glusterd_rb_check_bricks(volinfo, src_brickinfo, dst_brickinfo)) {
+        ret = -1;
+        *op_errstr = gf_strdup("Incorrect source or destination brick");
+        if (*op_errstr)
+            gf_msg(this->name, GF_LOG_ERROR, EINVAL, GD_MSG_BRICK_NOT_FOUND,
+                   "%s", *op_errstr);
+        goto out;
+    }
+
+    dst_is_local = gf_is_local_addr(host);
+    if (dst_is_local) {
+        ret = glusterd_validate_and_create_brickpath(
+            dst_brickinfo, volinfo->volume_id, volinfo->volname, op_errstr,
+            is_force, _gf_false);
+        if (ret)
+            goto out;
+    }
+
+    if (!dst_is_local) {
+        RCU_READ_LOCK;
+
+        peerinfo = glusterd_peerinfo_find(NULL, host);
+        if (peerinfo == NULL) {
+            RCU_READ_UNLOCK;
+            ret = -1;
+            snprintf(msg, sizeof(msg), "%s, is not a friend", host);
+            *op_errstr = gf_strdup(msg);
+            goto out;
+        } else if (!peerinfo->connected) {
+            RCU_READ_UNLOCK;
+            ret = -1;
+            snprintf(msg, sizeof(msg), "%s, is not connected at the moment",
+                     host);
+            *op_errstr = gf_strdup(msg);
+            goto out;
+        } else if (GD_FRIEND_STATE_BEFRIENDED != peerinfo->state.state) {
+            RCU_READ_UNLOCK;
+            ret = -1;
+            snprintf(msg, sizeof(msg), "%s, is not befriended at the moment",
+                     host);
+            *op_errstr = gf_strdup(msg);
+            goto out;
+        }
+        RCU_READ_UNLOCK;
+    } else if (priv->op_version >= GD_OP_VERSION_3_6_0) {
+        /* A bricks mount dir is required only by snapshots which were
+         * introduced in gluster-3.6.0
+         */
+
+        if (!(gf_uuid_compare(dst_brickinfo->uuid, MY_UUID))) {
+            ret = glusterd_get_brick_mount_dir(dst_brickinfo->path,
+                                               dst_brickinfo->hostname,
+                                               dst_brickinfo->mount_dir);
+            if (ret) {
+                gf_msg(this->name, GF_LOG_ERROR, 0,
+                       GD_MSG_BRICK_MOUNTDIR_GET_FAIL,
+                       "Failed to get brick mount_dir");
+                goto out;
+            }
+            ret = dict_set_dynstr_with_alloc(rsp_dict, "brick1.mount_dir",
+                                             dst_brickinfo->mount_dir);
+            if (ret) {
+                gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_DICT_SET_FAILED,
+                       "Failed to set brick.mount_dir");
+                goto out;
+            }
+        }
+
+        ret = dict_set_int32n(rsp_dict, "brick_count", SLEN("brick_count"), 1);
+        if (ret) {
+            gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_DICT_SET_FAILED,
+                   "Failed to set local_brick_count");
+            goto out;
+        }
+    }
+
+    ret = 0;
+
+out:
+    GF_FREE(dup_dstbrick);
+    gf_msg_debug(this->name, 0, "Returning %d", ret);
+
+    return ret;
+}
+
+/*
+ * Replace @old_brick with @new_brick inside @volinfo.
+ *
+ * A brickinfo is created for @new_brick and resolved. When that brick belongs
+ * to this node (its uuid equals MY_UUID) the filesystem id reported by
+ * statvfs() is stored in it. The new brickinfo then takes over the old
+ * brick's brick_id and port, is linked into the brick list at the old brick's
+ * position, and the old brick is dropped via
+ * glusterd_op_perform_remove_brick(). Finally the replicate-specific brick
+ * handling is run (local bricks of replicate volumes only), volfiles are
+ * regenerated and, if the volume is started, the new brick is started.
+ *
+ * @dict must contain "brick1.mount_dir" when the cluster op-version is at
+ * least GD_OP_VERSION_3_6_0.
+ *
+ * Returns 0 on success and a non-zero value on failure. A brickinfo that was
+ * created but never linked into the volume is released before returning.
+ */
+int
+glusterd_op_perform_replace_brick(glusterd_volinfo_t *volinfo, char *old_brick,
+                                  char *new_brick, dict_t *dict)
+{
+    char *brick_mount_dir = NULL;
+    glusterd_brickinfo_t *old_brickinfo = NULL;
+    glusterd_brickinfo_t *new_brickinfo = NULL;
+    int32_t ret = -1;
+    xlator_t *this = NULL;
+    glusterd_conf_t *conf = NULL;
+    /* Becomes true once new_brickinfo is linked into the brick list. */
+    gf_boolean_t brick_linked = _gf_false;
+    struct statvfs brickstat = {
+        0,
+    };
+
+    this = THIS;
+    GF_ASSERT(this);
+    GF_ASSERT(dict);
+    GF_ASSERT(volinfo);
+
+    conf = this->private;
+    GF_ASSERT(conf);
+
+    ret = glusterd_brickinfo_new_from_brick(new_brick, &new_brickinfo, _gf_true,
+                                            NULL);
+    if (ret)
+        goto out;
+
+    ret = glusterd_resolve_brick(new_brickinfo);
+    if (ret)
+        goto out;
+
+    /* A zero comparison result means the new brick is hosted on this node. */
+    if (!gf_uuid_compare(new_brickinfo->uuid, MY_UUID)) {
+        ret = sys_statvfs(new_brickinfo->path, &brickstat);
+        if (ret) {
+            gf_msg(this->name, GF_LOG_ERROR, errno, GD_MSG_STATVFS_FAILED,
+                   "Failed to fetch disk utilization "
+                   "from the brick (%s:%s). Please check the health of "
+                   "the brick. Error code was %s",
+                   new_brickinfo->hostname, new_brickinfo->path,
+                   strerror(errno));
+
+            goto out;
+        }
+        new_brickinfo->statfs_fsid = brickstat.f_fsid;
+    }
+
+    ret = glusterd_volume_brickinfo_get_by_brick(old_brick, volinfo,
+                                                 &old_brickinfo, _gf_false);
+    if (ret)
+        goto out;
+
+    /* The replacement keeps the identity (brick_id, port) of the old brick. */
+    (void)snprintf(new_brickinfo->brick_id, sizeof(new_brickinfo->brick_id),
+                   "%s", old_brickinfo->brick_id);
+    new_brickinfo->port = old_brickinfo->port;
+
+    /* A bricks mount dir is required only by snapshots which were
+     * introduced in gluster-3.6.0
+     */
+    if (conf->op_version >= GD_OP_VERSION_3_6_0) {
+        ret = dict_get_strn(dict, "brick1.mount_dir", SLEN("brick1.mount_dir"),
+                            &brick_mount_dir);
+        if (ret) {
+            gf_msg(this->name, GF_LOG_ERROR, errno,
+                   GD_MSG_BRICK_MOUNTDIR_GET_FAIL,
+                   "brick1.mount_dir not present");
+            goto out;
+        }
+        (void)snprintf(new_brickinfo->mount_dir,
+                       sizeof(new_brickinfo->mount_dir), "%s", brick_mount_dir);
+    }
+
+    /* From here on the volume's brick list references new_brickinfo. */
+    cds_list_add(&new_brickinfo->brick_list, &old_brickinfo->brick_list);
+    brick_linked = _gf_true;
+
+    /* Count the new brick; the removal below takes the old one out again. */
+    volinfo->brick_count++;
+
+    ret = glusterd_op_perform_remove_brick(volinfo, old_brick, 1, NULL);
+    if (ret)
+        goto out;
+
+    /* if the volume is a replicate volume, do: */
+    if (glusterd_is_volume_replicate(volinfo)) {
+        if (!gf_uuid_compare(new_brickinfo->uuid, MY_UUID)) {
+            ret = glusterd_handle_replicate_brick_ops(volinfo, new_brickinfo,
+                                                      GD_OP_REPLACE_BRICK);
+            if (ret < 0)
+                goto out;
+        }
+    }
+
+    ret = glusterd_create_volfiles_and_notify_services(volinfo);
+    if (ret)
+        goto out;
+
+    if (GLUSTERD_STATUS_STARTED == volinfo->status) {
+        ret = glusterd_brick_start(volinfo, new_brickinfo, _gf_false,
+                                   _gf_false);
+        if (ret)
+            goto out;
+    }
+
+out:
+    /* A brickinfo that never reached the brick list has no other owner, so
+     * failing before the link would otherwise leak it.
+     */
+    if (ret && new_brickinfo && !brick_linked)
+        (void)glusterd_brickinfo_delete(new_brickinfo);
+
+    gf_msg_debug("glusterd", 0, "Returning %d", ret);
+    return ret;
+}
+
+/*
+ * Perform the replace-brick operation described by @dict.
+ *
+ * Reads "src-brick", "dst-brick", "volname" and "operation" from @dict, looks
+ * up the volume, the source brickinfo and the replace-brick destination
+ * brickinfo, resolves the destination and calls rb_update_dstbrick_port().
+ * Only the operation "GF_REPLACE_OP_COMMIT_FORCE" is accepted; anything else
+ * fails with -1. For that operation the volume's services are stopped, the
+ * bricks are swapped with glusterd_op_perform_replace_brick(), the services
+ * are brought back with glusterd_svcs_manager(), glusterd_fetchspec_notify()
+ * is called and the volinfo is stored.
+ *
+ * Returns 0 on success, non-zero on failure.
+ */
+int
+glusterd_op_replace_brick(dict_t *dict, dict_t *rsp_dict)
+{
+    int ret = 0;
+    char *replace_op = NULL;
+    glusterd_volinfo_t *volinfo = NULL;
+    char *volname = NULL;
+    xlator_t *this = NULL;
+    glusterd_conf_t *priv = NULL;
+    char *src_brick = NULL;
+    char *dst_brick = NULL;
+    glusterd_brickinfo_t *src_brickinfo = NULL;
+    glusterd_brickinfo_t *dst_brickinfo = NULL;
+
+    this = THIS;
+    GF_ASSERT(this);
+
+    priv = this->private;
+    GF_ASSERT(priv);
+
+    ret = dict_get_strn(dict, "src-brick", SLEN("src-brick"), &src_brick);
+    if (ret) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_DICT_GET_FAILED,
+               "Unable to get src brick");
+        goto out;
+    }
+
+    gf_msg_debug(this->name, 0, "src brick=%s", src_brick);
+
+    ret = dict_get_strn(dict, "dst-brick", SLEN("dst-brick"), &dst_brick);
+    if (ret) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_DICT_GET_FAILED,
+               "Unable to get dst brick");
+        goto out;
+    }
+
+    gf_msg_debug(this->name, 0, "dst brick=%s", dst_brick);
+
+    ret = dict_get_strn(dict, "volname", SLEN("volname"), &volname);
+    if (ret) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_DICT_GET_FAILED,
+               "Unable to get volume name");
+        goto out;
+    }
+
+    ret = dict_get_strn(dict, "operation", SLEN("operation"), &replace_op);
+    if (ret) {
+        gf_msg_debug(this->name, 0, "dict_get on operation failed");
+        goto out;
+    }
+
+    ret = glusterd_volinfo_find(volname, &volinfo);
+    if (ret) {
+        /* A failed lookup means the volume is unknown, not out of memory. */
+        gf_msg(this->name, GF_LOG_ERROR, EINVAL, GD_MSG_VOL_NOT_FOUND,
+               "Unable to find volume: %s", volname);
+        goto out;
+    }
+
+    ret = glusterd_volume_brickinfo_get_by_brick(src_brick, volinfo,
+                                                 &src_brickinfo, _gf_false);
+    if (ret) {
+        gf_msg_debug(this->name, 0, "Unable to get src-brickinfo");
+        goto out;
+    }
+
+    ret = glusterd_get_rb_dst_brickinfo(volinfo, &dst_brickinfo);
+    if (ret) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_RB_BRICKINFO_GET_FAIL,
+               "Unable to get "
+               "replace brick destination brickinfo");
+        goto out;
+    }
+
+    ret = glusterd_resolve_brick(dst_brickinfo);
+    if (ret) {
+        gf_msg_debug(this->name, 0, "Unable to resolve dst-brickinfo");
+        goto out;
+    }
+
+    ret = rb_update_dstbrick_port(dst_brickinfo, rsp_dict, dict);
+    if (ret)
+        goto out;
+
+    /* "commit force" is the only replace-brick operation handled here. */
+    if (strcmp(replace_op, "GF_REPLACE_OP_COMMIT_FORCE")) {
+        ret = -1;
+        goto out;
+    }
+
+    /* A failure to stop the services is logged but does not abort the
+     * operation; ret is overwritten by the next call.
+     */
+    ret = glusterd_svcs_stop(volinfo);
+    if (ret) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_GLUSTER_SERVICES_STOP_FAIL,
+               "Unable to stop gluster services, ret: %d", ret);
+    }
+
+    ret = glusterd_op_perform_replace_brick(volinfo, src_brick, dst_brick,
+                                            dict);
+    if (ret) {
+        gf_msg(this->name, GF_LOG_CRITICAL, 0, GD_MSG_BRICK_ADD_FAIL,
+               "Unable to add dst-brick: "
+               "%s to volume: %s",
+               dst_brick, volinfo->volname);
+        /* The services were stopped above; run the manager again before
+         * bailing out.
+         */
+        (void)glusterd_svcs_manager(volinfo);
+        goto out;
+    }
+
+    volinfo->rebal.defrag_status = 0;
+
+    /* Also only logged: ret is replaced by the fetchspec result below. */
+    ret = glusterd_svcs_manager(volinfo);
+    if (ret) {
+        gf_msg(this->name, GF_LOG_CRITICAL, 0,
+               GD_MSG_GLUSTER_SERVICE_START_FAIL,
+               "Failed to start one or more gluster services.");
+    }
+
+    ret = glusterd_fetchspec_notify(THIS);
+    /* The replace-brick bookkeeping is cleared whether or not the notify
+     * succeeded.
+     */
+    glusterd_brickinfo_delete(volinfo->rep_brick.dst_brick);
+    volinfo->rep_brick.src_brick = NULL;
+    volinfo->rep_brick.dst_brick = NULL;
+
+    if (!ret)
+        ret = glusterd_store_volinfo(volinfo,
+                                     GLUSTERD_VOLINFO_VER_AC_INCREMENT);
+    if (ret)
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_RBOP_STATE_STORE_FAIL,
+               "Couldn't store"
+               " replace brick operation's state");
+
+out:
+    return ret;
+}
+
+/*
+ * Drive the mgmt_v3 phases of a replace-brick command and answer the CLI.
+ *
+ * Stores this node's uuid ("originator_uuid") and the "is_synctasked" flag in
+ * @dict, then runs in order: lockdown, payload build, pre-validation and
+ * commit. Whatever the outcome, the peer locks are released, the local
+ * mgmt_v3 locks are dropped if they were acquired, and the result is sent to
+ * the CLI with glusterd_op_send_cli_response().
+ *
+ * The function always returns 0; the operation's outcome travels to the CLI
+ * in op_ret.
+ */
+int
+glusterd_mgmt_v3_initiate_replace_brick_cmd_phases(rpcsvc_request_t *req,
+                                                   glusterd_op_t op,
+                                                   dict_t *dict)
+{
+    int32_t ret = -1;
+    int32_t op_ret = -1;
+    uint32_t txn_generation = 0;
+    uint32_t op_errno = 0;
+    char *op_errstr = NULL;
+    dict_t *req_dict = NULL;
+    dict_t *tmp_dict = NULL;
+    uuid_t *originator_uuid = NULL;
+    xlator_t *this = NULL;
+    glusterd_conf_t *conf = NULL;
+    gf_boolean_t is_acquired = _gf_false;
+
+    this = THIS;
+    GF_ASSERT(this);
+    GF_ASSERT(req);
+    GF_ASSERT(dict);
+    conf = this->private;
+    GF_ASSERT(conf);
+
+    txn_generation = conf->generation;
+    originator_uuid = GF_MALLOC(sizeof(uuid_t), gf_common_mt_uuid_t);
+    if (!originator_uuid) {
+        ret = -1;
+        goto out;
+    }
+
+    gf_uuid_copy(*originator_uuid, MY_UUID);
+    ret = dict_set_bin(dict, "originator_uuid", originator_uuid,
+                       sizeof(uuid_t));
+    if (ret) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_DICT_SET_FAILED,
+               "Failed to set originator_uuid.");
+        /* The buffer is freed here only on failure; after a successful
+         * dict_set_bin() this function never frees it.
+         */
+        GF_FREE(originator_uuid);
+        goto out;
+    }
+
+    ret = dict_set_int32n(dict, "is_synctasked", SLEN("is_synctasked"),
+                          _gf_true);
+    if (ret) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_DICT_SET_FAILED,
+               "Failed to set synctasked flag to true.");
+        goto out;
+    }
+
+    /* Snapshot of dict taken before the lockdown; it is what the local
+     * unlock at "out" operates on.
+     */
+    tmp_dict = dict_new();
+    if (!tmp_dict) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_DICT_CREATE_FAIL,
+               "Unable to create dict");
+        /* ret is still 0 from the dict_set above; without this the CLI
+         * would be told the operation succeeded.
+         */
+        ret = -1;
+        goto out;
+    }
+    dict_copy(dict, tmp_dict);
+
+    ret = glusterd_mgmt_v3_initiate_lockdown(op, dict, &op_errstr, &op_errno,
+                                             &is_acquired, txn_generation);
+    if (ret) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_MGMTV3_LOCKDOWN_FAIL,
+               "mgmt_v3 lockdown failed.");
+        goto out;
+    }
+
+    ret = glusterd_mgmt_v3_build_payload(&req_dict, &op_errstr, dict, op);
+    if (ret) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_MGMTV3_PAYLOAD_BUILD_FAIL,
+               LOGSTR_BUILD_PAYLOAD, gd_op_list[op]);
+        if (op_errstr == NULL)
+            gf_asprintf(&op_errstr, OPERRSTR_BUILD_PAYLOAD);
+        goto out;
+    }
+
+    ret = glusterd_mgmt_v3_pre_validate(op, req_dict, &op_errstr, &op_errno,
+                                        txn_generation);
+    if (ret) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_PRE_VALIDATION_FAIL,
+               "Pre Validation Failed");
+        goto out;
+    }
+
+    ret = glusterd_mgmt_v3_commit(op, dict, req_dict, &op_errstr, &op_errno,
+                                  txn_generation);
+    if (ret) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_COMMIT_OP_FAIL,
+               "Commit Op Failed");
+        goto out;
+    }
+
+    ret = 0;
+
+out:
+    /* Keep the operation's verdict; ret is reused by the unlock below. */
+    op_ret = ret;
+
+    (void)glusterd_mgmt_v3_release_peer_locks(op, dict, op_ret, &op_errstr,
+                                              is_acquired, txn_generation);
+
+    if (is_acquired) {
+        ret = glusterd_multiple_mgmt_v3_unlock(tmp_dict, MY_UUID);
+        if (ret) {
+            gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_MGMTV3_UNLOCK_FAIL,
+                   "Failed to release mgmt_v3 locks on "
+                   "localhost.");
+            /* An unlock failure turns the whole operation into a failure. */
+            op_ret = ret;
+        }
+    }
+    /* SEND CLI RESPONSE */
+    glusterd_op_send_cli_response(op, op_ret, op_errno, req, dict, op_errstr);
+
+    if (req_dict)
+        dict_unref(req_dict);
+
+    if (tmp_dict)
+        dict_unref(tmp_dict);
+
+    if (op_errstr) {
+        GF_FREE(op_errstr);
+        op_errstr = NULL;
+    }
+
+    return 0;
+}
diff --git a/xlators/mgmt/glusterd/src/glusterd-reset-brick.c b/xlators/mgmt/glusterd/src/glusterd-reset-brick.c
new file mode 100644
index 00000000000..e4d247a1d6c
--- /dev/null
+++ b/xlators/mgmt/glusterd/src/glusterd-reset-brick.c
@@ -0,0 +1,376 @@
+/*
+   Copyright (c) 2016 Red Hat, Inc. <http://www.redhat.com>
+   This file is part of GlusterFS.
+
+   This file is licensed to you under your choice of the GNU Lesser
+   General Public License, version 3 or any later version (LGPLv3 or
+   later), or the GNU General Public License, version 2 (GPLv2), in all
+   cases as published by the Free Software Foundation.
+*/
+#include <glusterfs/common-utils.h>
+#include "cli1-xdr.h"
+#include "xdr-generic.h"
+#include <glusterfs/glusterfs.h>
+#include "glusterd.h"
+#include "glusterd-op-sm.h"
+#include "glusterd-geo-rep.h"
+#include "glusterd-store.h"
+#include "glusterd-utils.h"
+#include "glusterd-svc-mgmt.h"
+#include "glusterd-svc-helper.h"
+#include "glusterd-volgen.h"
+#include "glusterd-messages.h"
+#include "glusterd-mgmt.h"
+#include <glusterfs/run.h>
+#include <glusterfs/syscall.h>
+
+#include <signal.h>
+
+/*
+ * Pre-validate a reset-brick request.
+ *
+ * glusterd_brick_op_prerequisites() supplies the operation name, volume,
+ * source brick and its pidfile. "GF_RESET_OP_START" needs nothing further
+ * and succeeds right away. For the commit operations the destination brick
+ * is looked up and checked with glusterd_new_brick_validate():
+ *   - result 0: on the node that owns the destination brick the request is
+ *     rejected with a hint to use replace-brick instead;
+ *   - result 1: the source brick process must not be running, the source
+ *     and destination bricks must carry the same uuid, and a brick that
+ *     already has the volume-id xattr is accepted only for
+ *     "GF_RESET_OP_COMMIT_FORCE";
+ *   - anything else: the validator's message is returned as the error.
+ * After that the source/destination pair is recorded in volinfo->rep_brick,
+ * the brick path is validated (local host) or the peer state is checked
+ * (remote host), and @rsp_dict receives "brick1.mount_dir" (only on the node
+ * owning the destination brick) and "brick_count".
+ *
+ * On failure *op_errstr may be set to an allocated message.
+ * Returns 0 on success, non-zero on failure.
+ */
+int
+glusterd_reset_brick_prevalidate(dict_t *dict, char **op_errstr,
+                                 dict_t *rsp_dict)
+{
+    int ret = 0;
+    char *src_brick = NULL;
+    char *dst_brick = NULL;
+    char *volname = NULL;
+    char *op = NULL;
+    glusterd_op_t gd_op = -1;
+    glusterd_volinfo_t *volinfo = NULL;
+    glusterd_brickinfo_t *src_brickinfo = NULL;
+    char *host = NULL;
+    char msg[2048] = {0};
+    glusterd_peerinfo_t *peerinfo = NULL;
+    glusterd_brickinfo_t *dst_brickinfo = NULL;
+    glusterd_conf_t *priv = NULL;
+    char pidfile[PATH_MAX] = {0};
+    xlator_t *this = NULL;
+    gf_boolean_t is_force = _gf_false;
+    int32_t ignore_partition = 0;
+    pid_t pid = -1;
+    uuid_t volume_id = {
+        0,
+    };
+    char *dup_dstbrick = NULL;
+
+    this = THIS;
+    GF_ASSERT(this);
+
+    priv = this->private;
+    GF_ASSERT(priv);
+
+    ret = glusterd_brick_op_prerequisites(dict, &op, &gd_op, &volname, &volinfo,
+                                          &src_brick, &src_brickinfo, pidfile,
+                                          op_errstr, rsp_dict);
+    if (ret)
+        goto out;
+
+    /* "start" has nothing more to validate. */
+    if (!strcmp(op, "GF_RESET_OP_START"))
+        goto done;
+
+    if (!strcmp(op, "GF_RESET_OP_COMMIT_FORCE"))
+        is_force = _gf_true;
+
+    ret = glusterd_get_dst_brick_info(&dst_brick, volname, op_errstr,
+                                      &dst_brickinfo, &host, dict,
+                                      &dup_dstbrick);
+    if (ret)
+        goto out;
+
+    ret = glusterd_new_brick_validate(dst_brick, dst_brickinfo, msg,
+                                      sizeof(msg), op);
+    /* if bricks are not same and reset brick was used, fail command.
+     * Only replace brick should be used to replace with new bricks
+     * to the volume.
+     */
+    if (ret == 0) {
+        if (!gf_uuid_compare(MY_UUID, dst_brickinfo->uuid)) {
+            ret = -1;
+            *op_errstr = gf_strdup(
+                "When destination brick is new,"
+                " please use"
+                " gluster volume "
+                "replace-brick <volname> "
+                "<src-brick> <dst-brick> "
+                "commit force");
+            if (*op_errstr)
+                gf_msg(this->name, GF_LOG_ERROR, EPERM,
+                       GD_MSG_BRICK_VALIDATE_FAIL, "%s", *op_errstr);
+            goto out;
+        }
+    } else if (ret == 1) {
+        /* The brick being reset must already have been stopped. */
+        if (gf_is_service_running(pidfile, &pid)) {
+            ret = -1;
+            *op_errstr = gf_strdup(
+                "Source brick"
+                " must be stopped."
+                " Please use "
+                "gluster volume "
+                "reset-brick <volname> "
+                "<dst-brick> start.");
+            if (*op_errstr)
+                gf_msg(this->name, GF_LOG_ERROR, EPERM,
+                       GD_MSG_BRICK_VALIDATE_FAIL, "%s", *op_errstr);
+            goto out;
+        }
+        /* A non-negative result means the volume-id xattr is present on
+         * the destination path.
+         */
+        ret = sys_lgetxattr(dst_brickinfo->path, GF_XATTR_VOL_ID_KEY, volume_id,
+                            16);
+        if (gf_uuid_compare(dst_brickinfo->uuid, src_brickinfo->uuid) ||
+            (ret >= 0 && is_force == _gf_false)) {
+            ret = -1;
+            *op_errstr = gf_strdup(
+                "Brick not available."
+                "It may be containing "
+                "or be contained "
+                "by an existing brick."
+                "Use 'force' option to "
+                "override this.");
+            if (*op_errstr)
+                gf_msg(this->name, GF_LOG_ERROR, EPERM,
+                       GD_MSG_BRICK_VALIDATE_FAIL, "%s", *op_errstr);
+            goto out;
+        }
+        ret = 0;
+    } else {
+        *op_errstr = gf_strdup(msg);
+        ret = -1;
+        /* Log from msg: it holds the same text and, unlike *op_errstr, is
+         * never NULL when the duplication fails.
+         */
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_BRICK_VALIDATE_FAIL, "%s",
+               msg);
+        goto out;
+    }
+
+    volinfo->rep_brick.src_brick = src_brickinfo;
+    volinfo->rep_brick.dst_brick = dst_brickinfo;
+
+    /* "ignore-partition" is optional: the lookup result is discarded and
+     * ignore_partition keeps its default of 0 when the key is absent.
+     */
+    ret = dict_get_int32n(dict, "ignore-partition", SLEN("ignore-partition"),
+                          &ignore_partition);
+    ret = 0;
+    if (gf_is_local_addr(host)) {
+        ret = glusterd_validate_and_create_brickpath(
+            dst_brickinfo, volinfo->volume_id, volinfo->volname, op_errstr,
+            is_force, ignore_partition);
+        if (ret)
+            goto out;
+    } else {
+        /* Remote destination: the host must be a connected, befriended
+         * peer. Every exit path drops the RCU read lock first.
+         */
+        RCU_READ_LOCK;
+
+        peerinfo = glusterd_peerinfo_find(NULL, host);
+        if (peerinfo == NULL) {
+            RCU_READ_UNLOCK;
+            ret = -1;
+            snprintf(msg, sizeof(msg), "%s, is not a friend.", host);
+            *op_errstr = gf_strdup(msg);
+            goto out;
+
+        } else if (!peerinfo->connected) {
+            RCU_READ_UNLOCK;
+            ret = -1;
+            snprintf(msg, sizeof(msg),
+                     "%s,"
+                     "is not connected at "
+                     "the moment.",
+                     host);
+            *op_errstr = gf_strdup(msg);
+            goto out;
+
+        } else if (GD_FRIEND_STATE_BEFRIENDED != peerinfo->state.state) {
+            RCU_READ_UNLOCK;
+            ret = -1;
+            snprintf(msg, sizeof(msg),
+                     "%s, is not befriended "
+                     "at the moment.",
+                     host);
+            *op_errstr = gf_strdup(msg);
+            goto out;
+        }
+        RCU_READ_UNLOCK;
+    }
+
+    /* Only the node that owns the destination brick reports its mount dir. */
+    if (!(gf_uuid_compare(dst_brickinfo->uuid, MY_UUID))) {
+        ret = glusterd_get_brick_mount_dir(dst_brickinfo->path,
+                                           dst_brickinfo->hostname,
+                                           dst_brickinfo->mount_dir);
+        if (ret) {
+            gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_BRICK_MOUNTDIR_GET_FAIL,
+                   "Failed to get brick mount_dir");
+            goto out;
+        }
+        ret = dict_set_dynstr_with_alloc(rsp_dict, "brick1.mount_dir",
+                                         dst_brickinfo->mount_dir);
+        if (ret) {
+            gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_DICT_SET_FAILED,
+                   "Failed to set brick.mount_dir");
+            goto out;
+        }
+    }
+
+    ret = dict_set_int32n(rsp_dict, "brick_count", SLEN("brick_count"), 1);
+    if (ret) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_DICT_SET_FAILED,
+               "Failed to set local_brick_count.");
+        goto out;
+    }
+
+done:
+    ret = 0;
+out:
+    GF_FREE(dup_dstbrick);
+    gf_msg_debug(this->name, 0, "Returning %d.", ret);
+
+    return ret;
+}
+
+/*
+ * Perform the reset-brick operation named by "operation" in @dict.
+ *
+ *   GF_RESET_OP_START         stop the source brick process and return the
+ *                             result of that stop.
+ *   GF_RESET_OP_COMMIT,
+ *   GF_RESET_OP_COMMIT_FORCE  stop the volume's services, swap source and
+ *                             destination with
+ *                             glusterd_op_perform_replace_brick(), run the
+ *                             service manager, call
+ *                             glusterd_fetchspec_notify() and store the
+ *                             volinfo.
+ *   anything else             fail with -1.
+ *
+ * Returns 0 on success, non-zero on failure.
+ */
+int
+glusterd_op_reset_brick(dict_t *dict, dict_t *rsp_dict)
+{
+    xlator_t *this = THIS;
+    glusterd_conf_t *priv = NULL;
+    glusterd_volinfo_t *volinfo = NULL;
+    glusterd_brickinfo_t *src_brickinfo = NULL;
+    glusterd_brickinfo_t *dst_brickinfo = NULL;
+    char *op = NULL;
+    char *volname = NULL;
+    char *src_brick = NULL;
+    char *dst_brick = NULL;
+    int ret = 0;
+
+    GF_ASSERT(this);
+
+    priv = this->private;
+    GF_ASSERT(priv);
+
+    /* Gather the request parameters and the objects they refer to. */
+    ret = dict_get_strn(dict, "operation", SLEN("operation"), &op);
+    if (ret) {
+        gf_msg_debug(this->name, 0, "dict_get on operation failed");
+        return ret;
+    }
+
+    ret = dict_get_strn(dict, "volname", SLEN("volname"), &volname);
+    if (ret) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_DICT_GET_FAILED,
+               "Unable to get volume name");
+        return ret;
+    }
+
+    ret = glusterd_volinfo_find(volname, &volinfo);
+    if (ret)
+        return ret;
+
+    ret = dict_get_strn(dict, "src-brick", SLEN("src-brick"), &src_brick);
+    if (ret) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_DICT_GET_FAILED,
+               "Unable to get src brick");
+        return ret;
+    }
+
+    gf_msg_debug(this->name, 0, "src brick=%s", src_brick);
+
+    ret = glusterd_volume_brickinfo_get_by_brick(src_brick, volinfo,
+                                                 &src_brickinfo, _gf_false);
+    if (ret) {
+        gf_msg_debug(this->name, 0, "Unable to get src-brickinfo");
+        return ret;
+    }
+
+    /* "start": only the source brick process is stopped. */
+    if (strcmp(op, "GF_RESET_OP_START") == 0) {
+        ret = glusterd_volume_stop_glusterfs(volinfo, src_brickinfo, _gf_false);
+        if (ret)
+            gf_msg(this->name, GF_LOG_CRITICAL, 0, GD_MSG_BRICK_STOP_FAIL,
+                   "Unable to stop"
+                   " brick: %s:%s",
+                   src_brickinfo->hostname, src_brickinfo->path);
+        return ret;
+    }
+
+    /* Everything that is not one of the two commit flavours is rejected. */
+    if (strcmp(op, "GF_RESET_OP_COMMIT") != 0 &&
+        strcmp(op, "GF_RESET_OP_COMMIT_FORCE") != 0)
+        return -1;
+
+    ret = dict_get_strn(dict, "dst-brick", SLEN("dst-brick"), &dst_brick);
+    if (ret) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_DICT_GET_FAILED,
+               "Unable to get dst brick");
+        return ret;
+    }
+
+    gf_msg_debug(this->name, 0, "dst brick=%s", dst_brick);
+
+    ret = glusterd_get_rb_dst_brickinfo(volinfo, &dst_brickinfo);
+    if (ret) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_RB_BRICKINFO_GET_FAIL,
+               "Unable to get "
+               "reset brick "
+               "destination brickinfo");
+        return ret;
+    }
+
+    ret = glusterd_resolve_brick(dst_brickinfo);
+    if (ret) {
+        gf_msg_debug(this->name, 0, "Unable to resolve dst-brickinfo");
+        return ret;
+    }
+
+    ret = rb_update_dstbrick_port(dst_brickinfo, rsp_dict, dict);
+    if (ret)
+        return ret;
+
+    /* NOTE(review): gf_uuid_compare() is non-zero when the uuids differ, so
+     * this branch runs on nodes that do NOT own the destination brick, which
+     * contradicts the debug message below. Confirm the intended condition.
+     */
+    if (gf_uuid_compare(dst_brickinfo->uuid, MY_UUID) != 0) {
+        gf_msg_debug(this->name, 0, "I AM THE DESTINATION HOST");
+        ret = glusterd_volume_stop_glusterfs(volinfo, src_brickinfo,
+                                             _gf_false);
+        if (ret) {
+            gf_msg(this->name, GF_LOG_CRITICAL, 0, GD_MSG_BRICK_STOP_FAIL,
+                   "Unable to stop brick: %s:%s", src_brickinfo->hostname,
+                   src_brickinfo->path);
+            return ret;
+        }
+    }
+
+    ret = glusterd_svcs_stop(volinfo);
+    if (ret) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_GLUSTER_SERVICES_STOP_FAIL,
+               "Unable to stop gluster services, ret: %d", ret);
+        return ret;
+    }
+
+    ret = glusterd_op_perform_replace_brick(volinfo, src_brick, dst_brick,
+                                            dict);
+    if (ret) {
+        gf_msg(this->name, GF_LOG_CRITICAL, 0, GD_MSG_BRICK_ADD_FAIL,
+               "Unable to add dst-brick: "
+               "%s to volume: %s",
+               dst_brick, volinfo->volname);
+        /* The services were stopped above; run the manager before leaving. */
+        (void)glusterd_svcs_manager(volinfo);
+        return ret;
+    }
+
+    volinfo->rebal.defrag_status = 0;
+
+    /* Logged only: ret is replaced by the fetchspec result right after. */
+    ret = glusterd_svcs_manager(volinfo);
+    if (ret)
+        gf_msg(this->name, GF_LOG_CRITICAL, 0,
+               GD_MSG_GLUSTER_SERVICE_START_FAIL,
+               "Failed to start one or more gluster services.");
+
+    ret = glusterd_fetchspec_notify(THIS);
+
+    /* The source/destination bookkeeping is cleared in every case. */
+    glusterd_brickinfo_delete(volinfo->rep_brick.dst_brick);
+    volinfo->rep_brick.src_brick = NULL;
+    volinfo->rep_brick.dst_brick = NULL;
+
+    if (ret == 0)
+        ret = glusterd_store_volinfo(volinfo,
+                                     GLUSTERD_VOLINFO_VER_AC_INCREMENT);
+    if (ret != 0)
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_RBOP_STATE_STORE_FAIL,
+               "Couldn't store"
+               " reset brick operation's state.");
+
+    return ret;
+}
diff --git a/xlators/mgmt/glusterd/src/glusterd-rpc-ops.c b/xlators/mgmt/glusterd/src/glusterd-rpc-ops.c
new file mode 100644
index 00000000000..88662e3bbae
--- /dev/null
+++ b/xlators/mgmt/glusterd/src/glusterd-rpc-ops.c
@@ -0,0 +1,2448 @@
+/*
+   Copyright (c) 2010-2012 Red Hat, Inc. <http://www.redhat.com>
+   This file is part of GlusterFS.
+
+   This file is licensed to you under your choice of the GNU Lesser
+   General Public License, version 3 or any later version (LGPLv3 or
+   later), or the GNU General Public License, version 2 (GPLv2), in all
+   cases as published by the Free Software Foundation.
+*/
+
+#include "rpc-clnt.h"
+#include "glusterd1-xdr.h"
+#include "cli1-xdr.h"
+
+#include "xdr-generic.h"
+
+#include <glusterfs/compat-errno.h>
+#include "glusterd-op-sm.h"
+#include "glusterd-sm.h"
+#include "glusterd.h"
+#include "protocol-common.h"
+#include "glusterd-utils.h"
+#include <glusterfs/common-utils.h>
+#include "glusterd-messages.h"
+#include "glusterd-snapshot-utils.h"
+#include <sys/uio.h>
+
+#define SERVER_PATH_MAX (16 * 1024)
+/* Reset frame->local to NULL, then pass frame->root to STACK_DESTROY(). */
+#define GLUSTERD_STACK_DESTROY(frame)                                          \
+    do {                                                                       \
+        frame->local = NULL;                                                   \
+        STACK_DESTROY(frame->root);                                            \
+    } while (0)
+/* Declarations only: neither object is defined in this part of the file. */
+extern glusterd_op_info_t opinfo;
+extern uuid_t global_txn_id;
+
+/*
+ * Build a gf_cli_rsp for operation @op and send it to the CLI.
+ *
+ * @op_ctx, when non-NULL, is treated as a dict_t: some operations read an
+ * "errstr" or "status" value from it or add keys to it, and it is serialized
+ * into the response. The error string put in the response is, in order of
+ * preference, the one selected by the per-operation handling, @op_errstr, or
+ * the empty string. A non-zero "status" read for the rebalance operations
+ * replaces @op_errno in the response.
+ *
+ * Always returns 0.
+ */
+int32_t
+glusterd_op_send_cli_response(glusterd_op_t op, int32_t op_ret,
+                              int32_t op_errno, rpcsvc_request_t *req,
+                              void *op_ctx, char *op_errstr)
+{
+    xlator_t *this = THIS;
+    glusterd_conf_t *conf = NULL;
+    dict_t *ctx = op_ctx;
+    char *errstr = NULL;
+    char *serialized = NULL;
+    int32_t status = 0;
+    int32_t count = 0;
+    int32_t ret = -1;
+    gf_cli_rsp rsp = {
+        0,
+    };
+
+    GF_ASSERT(this);
+    conf = this->private;
+
+    GF_ASSERT(conf);
+
+    /* Per-operation preparation. Every glusterd_op_t value is listed so no
+     * operation is handled by accident.
+     */
+    switch (op) {
+        case GD_OP_REMOVE_BRICK:
+        case GD_OP_COPY_FILE:
+            /* These operations may leave an error string in the context. */
+            if (ctx)
+                ret = dict_get_strn(ctx, "errstr", SLEN("errstr"), &errstr);
+            break;
+
+        case GD_OP_RESET_VOLUME:
+            if (op_ret && !op_errstr)
+                errstr = "Error while resetting options";
+            break;
+
+        case GD_OP_REBALANCE:
+        case GD_OP_DEFRAG_BRICK_VOLUME:
+            if (ctx) {
+                ret = dict_get_int32n(ctx, "status", SLEN("status"), &status);
+                if (ret) {
+                    gf_msg_trace(this->name, 0, "failed to get status");
+                }
+            }
+            break;
+
+        case GD_OP_GSYNC_CREATE:
+        case GD_OP_GSYNC_SET:
+        case GD_OP_SYS_EXEC:
+            if (ctx) {
+                ret = dict_get_strn(ctx, "errstr", SLEN("errstr"), &errstr);
+                ret = dict_set_strn(ctx, "glusterd_workdir",
+                                    SLEN("glusterd_workdir"), conf->workdir);
+                /* swallow error here, that will be re-triggered in cli */
+            }
+            break;
+
+        case GD_OP_PROFILE_VOLUME:
+            /* Provide a "count" of 0 when the context carries none. */
+            if (ctx && dict_get_int32n(ctx, "count", SLEN("count"), &count)) {
+                ret = dict_set_int32n(ctx, "count", SLEN("count"), 0);
+                if (ret) {
+                    gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_DICT_SET_FAILED,
+                           "failed to set count in dictionary");
+                }
+            }
+            break;
+
+        case GD_OP_START_BRICK:
+        case GD_OP_STOP_BRICK:
+            gf_msg_debug(this->name, 0, "op '%s' not supported",
+                         gd_op_list[op]);
+            break;
+
+        case GD_OP_NONE:
+        case GD_OP_MAX:
+            gf_msg(this->name, GF_LOG_ERROR, EINVAL, GD_MSG_OP_UNSUPPORTED,
+                   "invalid operation");
+            break;
+
+        case GD_OP_CREATE_VOLUME:
+        case GD_OP_START_VOLUME:
+        case GD_OP_STOP_VOLUME:
+        case GD_OP_DELETE_VOLUME:
+        case GD_OP_DEFRAG_VOLUME:
+        case GD_OP_ADD_BRICK:
+        case GD_OP_LOG_ROTATE:
+        case GD_OP_SYNC_VOLUME:
+        case GD_OP_STATEDUMP_VOLUME:
+        case GD_OP_REPLACE_BRICK:
+        case GD_OP_STATUS_VOLUME:
+        case GD_OP_SET_VOLUME:
+        case GD_OP_LIST_VOLUME:
+        case GD_OP_CLEARLOCKS_VOLUME:
+        case GD_OP_HEAL_VOLUME:
+        case GD_OP_QUOTA:
+        case GD_OP_SNAP:
+        case GD_OP_BARRIER:
+        case GD_OP_BITROT:
+        case GD_OP_SCRUB_STATUS:
+        case GD_OP_SCRUB_ONDEMAND:
+        case GD_OP_RESET_BRICK:
+        case GD_OP_MAX_OPVERSION:
+        case GD_OP_DETACH_NOT_STARTED:
+        case GD_OP_GANESHA:
+        case GD_OP_DETACH_TIER:
+        case GD_OP_TIER_MIGRATE:
+        case GD_OP_TIER_START_STOP:
+        case GD_OP_TIER_STATUS:
+        case GD_OP_DETACH_TIER_STATUS:
+        case GD_OP_REMOVE_TIER_BRICK:
+        case GD_OP_ADD_TIER_BRICK:
+            /*nothing specific to be done*/
+            break;
+    }
+
+    rsp.op_ret = op_ret;
+    rsp.op_errno = op_errno;
+
+    /* Operation-specific string first, then the caller's, never NULL. */
+    rsp.op_errstr = errstr ? errstr : (op_errstr ? op_errstr : "");
+
+    if (ctx) {
+        ret = dict_allocate_and_serialize(ctx, &rsp.dict.dict_val,
+                                          &rsp.dict.dict_len);
+        if (ret < 0)
+            gf_smsg(this->name, GF_LOG_ERROR, errno,
+                    GD_MSG_DICT_ALLOC_AND_SERL_LENGTH_GET_FAIL, NULL);
+        else
+            serialized = rsp.dict.dict_val; /* freed after the send below */
+    }
+
+    /* needed by 'rebalance status' */
+    if (status)
+        rsp.op_errno = status;
+
+    glusterd_to_cli(req, &rsp, NULL, 0, NULL, (xdrproc_t)xdr_gf_cli_rsp, ctx);
+    ret = 0;
+
+    GF_FREE(serialized);
+    gf_msg_debug(this->name, 0, "Returning %d", ret);
+    return ret;
+}
+
+/*
+ * Invoke the RPC callback @fn while holding big_lock from the private
+ * configuration of THIS, and return whatever @fn returns.
+ */
+int
+glusterd_big_locked_cbk(struct rpc_req *req, struct iovec *iov, int count,
+                        void *myframe, fop_cbk_fn_t fn)
+{
+    glusterd_conf_t *conf = THIS->private;
+    int cbk_ret;
+
+    synclock_lock(&conf->big_lock);
+    cbk_ret = fn(req, iov, count, myframe);
+    synclock_unlock(&conf->big_lock);
+
+    return cbk_ret;
+}
+
+/*
+ * Callback for the reply to a peer probe request.
+ *
+ * Decodes a gd1_mgmt_probe_rsp from @iov and then, depending on the reply:
+ *  - on a non-zero rsp.op_ret, relays the failure to the CLI request held in
+ *    the probe context (if any) and calls glusterd_friend_remove();
+ *  - when the cluster op-version is >= GD_OP_VERSION_3_6_0 and the uuid in
+ *    the reply equals the uuid of the peerinfo that was found, adds the
+ *    replied hostname to that peer (CLI-initiated probes only);
+ *  - when the replied hostname differs from the hostname of the peerinfo
+ *    that was found, answers the CLI with GF_PROBE_FRIEND and calls
+ *    glusterd_friend_remove() for the replied hostname;
+ *  - otherwise copies the uuid into the peerinfo and injects
+ *    GD_FRIEND_EVENT_INIT_FRIEND_REQ, handing the probe context over to the
+ *    event.
+ *
+ * The frame is destroyed on every path. The friend and op state machines
+ * are run only when ret is 0 at the end; ret is also the return value.
+ *
+ * glusterd_probe_cbk() invokes this through glusterd_big_locked_cbk().
+ */
+int
+__glusterd_probe_cbk(struct rpc_req *req, struct iovec *iov, int count,
+                     void *myframe)
+{
+    gd1_mgmt_probe_rsp rsp = {
+        {0},
+    };
+    int ret = 0;
+    glusterd_peerinfo_t *peerinfo = NULL;
+    glusterd_friend_sm_event_t *event = NULL;
+    glusterd_probe_ctx_t *ctx = NULL;
+    xlator_t *this = NULL;
+    glusterd_conf_t *conf = NULL;
+
+    /* NOTE: ret is still 0 on this path, so the state machines at the end
+     * of the function are still run after an RPC-level failure. 'this' has
+     * not been assigned yet, but the out path does not use it.
+     */
+    if (-1 == req->rpc_status) {
+        goto out;
+    }
+
+    this = THIS;
+    GF_ASSERT(this != NULL);
+    conf = this->private;
+    GF_VALIDATE_OR_GOTO(this->name, (conf != NULL), out);
+
+    ret = xdr_to_generic(*iov, &rsp, (xdrproc_t)xdr_gd1_mgmt_probe_rsp);
+    if (ret < 0) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_RES_DECODE_FAIL, "error");
+        // rsp.op_ret   = -1;
+        // rsp.op_errno = EINVAL;
+        goto out;
+    }
+
+    gf_msg(this->name, GF_LOG_INFO, 0, GD_MSG_PROBE_REQ_RESP_RCVD,
+           "Received probe resp from uuid: %s, host: %s", uuid_utoa(rsp.uuid),
+           rsp.hostname);
+    /* The remote side rejected the probe: report it and drop the entry. */
+    if (rsp.op_ret != 0) {
+        /* Take the probe context out of the frame; it is destroyed below. */
+        ctx = ((call_frame_t *)myframe)->local;
+        ((call_frame_t *)myframe)->local = NULL;
+
+        GF_ASSERT(ctx);
+
+        if (ctx->req) {
+            glusterd_xfer_cli_probe_resp(ctx->req, rsp.op_ret, rsp.op_errno,
+                                         rsp.op_errstr, ctx->hostname,
+                                         ctx->port, ctx->dict);
+        }
+
+        glusterd_destroy_probe_ctx(ctx);
+        (void)glusterd_friend_remove(rsp.uuid, rsp.hostname);
+        ret = rsp.op_ret;
+        goto out;
+    }
+
+    /* peerinfo is only used inside this RCU read-side section; every exit
+     * below either unlocks explicitly before 'goto out' or goes through the
+     * 'unlock' label.
+     */
+    RCU_READ_LOCK;
+    peerinfo = glusterd_peerinfo_find(rsp.uuid, rsp.hostname);
+    if (peerinfo == NULL) {
+        RCU_READ_UNLOCK
+        ret = -1;
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_PEER_NOT_FOUND,
+               "Could not find peerd %s(%s)", rsp.hostname,
+               uuid_utoa(rsp.uuid));
+        goto out;
+    }
+
+    /*
+     * In the case of a fresh probe rsp.uuid and peerinfo.uuid will not
+     * match, as peerinfo->uuid will be NULL.
+     *
+     * In the case of a peer probe being done to add a new network to a
+     * peer, rsp.uuid will match an existing peerinfo.uuid. If we have this
+     * stage it means that the current address/hostname being used isn't
+     * present in the found peerinfo. If it were, we would have found out
+     * earlier in the probe process and wouldn't even reach till here. So,
+     * we need to add the new hostname to the peer.
+     *
+     * This addition should only be done for cluster op-version >=
+     * GD_OP_VERSION_3_6_0 as address lists are only supported from then on.
+     * Also, this update should only be done when an explicit CLI probe
+     * command was used to begin the probe process.
+     */
+    if ((conf->op_version >= GD_OP_VERSION_3_6_0) &&
+        (gf_uuid_compare(rsp.uuid, peerinfo->uuid) == 0)) {
+        /* NOTE(review): ctx is dereferenced just below without a NULL
+         * check, while the 'reply' path further down does check it.
+         * Confirm that frame->local is always set when this is reached.
+         */
+        ctx = ((call_frame_t *)myframe)->local;
+        /* Presence of ctx->req implies this probe was started by a cli
+         * probe command
+         */
+        if (ctx->req == NULL)
+            goto cont;
+
+        gf_msg_debug(this->name, 0,
+                     "Adding address '%s' to "
+                     "existing peer %s",
+                     rsp.hostname, uuid_utoa(rsp.uuid));
+
+        ret = glusterd_friend_remove(NULL, rsp.hostname);
+        if (ret) {
+            gf_msg(this->name, GF_LOG_ERROR, 0,
+                   GD_MSG_STALE_PEERINFO_REMOVE_FAIL,
+                   "Could not remove "
+                   "stale peerinfo with name %s",
+                   rsp.hostname);
+            goto reply;
+        }
+
+        ret = gd_add_address_to_peer(peerinfo, rsp.hostname);
+        if (ret) {
+            gf_msg(this->name, GF_LOG_ERROR, 0,
+                   GD_MSG_HOSTNAME_ADD_TO_PEERLIST_FAIL,
+                   "Couldn't add hostname to peer list");
+            goto reply;
+        }
+
+        /* Injecting EVENT_NEW_NAME to send update */
+        ret = glusterd_friend_sm_new_event(GD_FRIEND_EVENT_NEW_NAME, &event);
+        if (!ret) {
+            event->peername = gf_strdup(peerinfo->hostname);
+            gf_uuid_copy(event->peerid, peerinfo->uuid);
+
+            ret = glusterd_friend_sm_inject_event(event);
+        }
+        rsp.op_errno = GF_PROBE_FRIEND;
+
+    reply:
+        /* Answer the CLI with the current ret (0 or the failure from the
+         * steps above) and release the probe context.
+         */
+        ctx = ((call_frame_t *)myframe)->local;
+        ((call_frame_t *)myframe)->local = NULL;
+
+        if (!ctx) {
+            ret = -1;
+            goto unlock;
+        }
+
+        if (ctx->req) {
+            glusterd_xfer_cli_probe_resp(ctx->req, ret, rsp.op_errno,
+                                         rsp.op_errstr, ctx->hostname,
+                                         ctx->port, ctx->dict);
+        }
+
+        glusterd_destroy_probe_ctx(ctx);
+
+        goto unlock;
+
+    } else if (strncasecmp(rsp.hostname, peerinfo->hostname, 1024)) {
+        /* The replied hostname differs (case-insensitively) from the
+         * hostname stored in the peerinfo that was found.
+         */
+        gf_msg(THIS->name, GF_LOG_INFO, 0, GD_MSG_HOST_PRESENT_ALREADY,
+               "Host: %s  with uuid: %s "
+               "already present in cluster with alias hostname: %s",
+               rsp.hostname, uuid_utoa(rsp.uuid), peerinfo->hostname);
+
+        ctx = ((call_frame_t *)myframe)->local;
+        ((call_frame_t *)myframe)->local = NULL;
+
+        if (!ctx) {
+            ret = -1;
+            goto unlock;
+        }
+
+        rsp.op_errno = GF_PROBE_FRIEND;
+        if (ctx->req) {
+            glusterd_xfer_cli_probe_resp(ctx->req, rsp.op_ret, rsp.op_errno,
+                                         rsp.op_errstr, ctx->hostname,
+                                         ctx->port, ctx->dict);
+        }
+
+        glusterd_destroy_probe_ctx(ctx);
+        (void)glusterd_friend_remove(NULL, rsp.hostname);
+        ret = rsp.op_ret;
+
+        goto unlock;
+    }
+
+cont:
+    /* Record the uuid from the reply and start the friend handshake. */
+    gf_uuid_copy(peerinfo->uuid, rsp.uuid);
+
+    ret = glusterd_friend_sm_new_event(GD_FRIEND_EVENT_INIT_FRIEND_REQ, &event);
+
+    if (ret) {
+        RCU_READ_UNLOCK;
+        gf_msg("glusterd", GF_LOG_ERROR, 0, GD_MSG_NEW_FRIEND_SM_EVENT_GET_FAIL,
+               "Unable to get event");
+        goto out;
+    }
+
+    event->peername = gf_strdup(peerinfo->hostname);
+    gf_uuid_copy(event->peerid, peerinfo->uuid);
+
+    /* The probe context moves from the frame to the event. */
+    event->ctx = ((call_frame_t *)myframe)->local;
+    ((call_frame_t *)myframe)->local = NULL;
+    ret = glusterd_friend_sm_inject_event(event);
+
+    gf_msg("glusterd", GF_LOG_INFO, 0, GD_MSG_PROBE_REQ_RESP_RCVD,
+           "Received resp to probe req");
+
+unlock:
+    RCU_READ_UNLOCK;
+
+out:
+    free(rsp.hostname);  // malloced by xdr
+    GLUSTERD_STACK_DESTROY(((call_frame_t *)myframe));
+
+    /* Attempt to start the state machine. Needed as no state machine could
+     * be running at time this RPC reply was received
+     */
+    if (!ret) {
+        glusterd_friend_sm();
+        glusterd_op_sm();
+    }
+
+    return ret;
+}
+
+/*
+ * RPC callback entry point for probe replies: runs __glusterd_probe_cbk()
+ * through glusterd_big_locked_cbk() and returns its result.
+ */
+int
+glusterd_probe_cbk(struct rpc_req *req, struct iovec *iov, int count,
+                   void *myframe)
+{
+    int ret = -1;
+
+    ret = glusterd_big_locked_cbk(req, iov, count, myframe,
+                                  __glusterd_probe_cbk);
+    return ret;
+}
+
+/*
+ * Callback for the reply to a friend-add request.
+ *
+ * Decodes a gd1_mgmt_friend_rsp from @iov, looks up the answering peer and
+ * injects GD_FRIEND_EVENT_RCVD_ACC or GD_FRIEND_EVENT_RCVD_RJT (depending on
+ * rsp.op_ret) into the friend state machine, with a friend-update context
+ * carrying the uuid and hostname from the reply.
+ *
+ * Whatever happened before, if the frame holds a probe context with a CLI
+ * request attached, the CLI is answered with op_ret/op_errno and the result
+ * of sending that answer becomes the return value. The probe context and
+ * the frame are always released.
+ *
+ * glusterd_friend_add_cbk() invokes this through glusterd_big_locked_cbk().
+ */
+int
+__glusterd_friend_add_cbk(struct rpc_req *req, struct iovec *iov, int count,
+                          void *myframe)
+{
+    gd1_mgmt_friend_rsp rsp = {
+        {0},
+    };
+    int ret = -1;
+    glusterd_friend_sm_event_t *event = NULL;
+    glusterd_friend_sm_event_type_t event_type = GD_FRIEND_EVENT_NONE;
+    glusterd_peerinfo_t *peerinfo = NULL;
+    int32_t op_ret = -1;
+    int32_t op_errno = EINVAL;
+    glusterd_probe_ctx_t *ctx = NULL;
+    glusterd_friend_update_ctx_t *ev_ctx = NULL;
+
+    if (-1 == req->rpc_status) {
+        rsp.op_ret = -1;
+        rsp.op_errno = EINVAL;
+        goto out;
+    }
+
+    ret = xdr_to_generic(*iov, &rsp, (xdrproc_t)xdr_gd1_mgmt_friend_rsp);
+    if (ret < 0) {
+        gf_msg("glusterd", GF_LOG_ERROR, errno, GD_MSG_RES_DECODE_FAIL,
+               "error");
+        rsp.op_ret = -1;
+        rsp.op_errno = EINVAL;
+        goto out;
+    }
+
+    op_ret = rsp.op_ret;
+    op_errno = rsp.op_errno;
+
+    gf_msg("glusterd", GF_LOG_INFO, 0, GD_MSG_RESPONSE_INFO,
+           "Received %s from uuid: %s, host: %s, port: %d",
+           (op_ret) ? "RJT" : "ACC", uuid_utoa(rsp.uuid), rsp.hostname,
+           rsp.port);
+
+    /* peerinfo is only dereferenced inside this RCU read-side section. */
+    RCU_READ_LOCK;
+
+    peerinfo = glusterd_peerinfo_find(rsp.uuid, rsp.hostname);
+    if (peerinfo == NULL) {
+        RCU_READ_UNLOCK;
+        ret = -1;
+        gf_msg("glusterd", GF_LOG_ERROR, 0, GD_MSG_RESP_FROM_UNKNOWN_PEER,
+               "received friend add response from"
+               " unknown peer uuid: %s",
+               uuid_utoa(rsp.uuid));
+        goto out;
+    }
+
+    if (op_ret)
+        event_type = GD_FRIEND_EVENT_RCVD_RJT;
+    else
+        event_type = GD_FRIEND_EVENT_RCVD_ACC;
+
+    /* Allocate the event context before creating the event. If the
+     * allocation fails there is no event to clean up, and if event creation
+     * fails the context is released right here, so neither object is leaked
+     * on these error paths.
+     */
+    ev_ctx = GF_CALLOC(1, sizeof(*ev_ctx), gf_gld_mt_friend_update_ctx_t);
+    if (!ev_ctx) {
+        ret = -1;
+        goto unlock;
+    }
+
+    ret = glusterd_friend_sm_new_event(event_type, &event);
+    if (ret) {
+        gf_msg("glusterd", GF_LOG_ERROR, 0, GD_MSG_EVENT_NEW_GET_FAIL,
+               "Unable to get event");
+        GF_FREE(ev_ctx);
+        ev_ctx = NULL;
+        goto unlock;
+    }
+
+    gf_uuid_copy(ev_ctx->uuid, rsp.uuid);
+    ev_ctx->hostname = gf_strdup(rsp.hostname);
+
+    event->peername = gf_strdup(peerinfo->hostname);
+    gf_uuid_copy(event->peerid, peerinfo->uuid);
+    event->ctx = ev_ctx;
+    ret = glusterd_friend_sm_inject_event(event);
+
+unlock:
+    RCU_READ_UNLOCK;
+out:
+    /* Take the probe context out of the frame; it is destroyed below. */
+    ctx = ((call_frame_t *)myframe)->local;
+    ((call_frame_t *)myframe)->local = NULL;
+
+    if (ctx && ctx->req) {
+        /*reverse probe doesn't have req*/
+        ret = glusterd_xfer_cli_probe_resp(ctx->req, op_ret, op_errno, NULL,
+                                           ctx->hostname, ctx->port, ctx->dict);
+    }
+    if (!ret) {
+        glusterd_friend_sm();
+        glusterd_op_sm();
+    }
+
+    if (ctx)
+        glusterd_destroy_probe_ctx(ctx);
+    free(rsp.hostname);  // malloced by xdr
+    GLUSTERD_STACK_DESTROY(((call_frame_t *)myframe));
+    return ret;
+}
+
+/*
+ * RPC callback entry point for friend-add replies: runs
+ * __glusterd_friend_add_cbk() through glusterd_big_locked_cbk() and returns
+ * its result.
+ */
+int
+glusterd_friend_add_cbk(struct rpc_req *req, struct iovec *iov, int count,
+                        void *myframe)
+{
+    int ret = -1;
+
+    ret = glusterd_big_locked_cbk(req, iov, count, myframe,
+                                  __glusterd_friend_add_cbk);
+    return ret;
+}
+
+/*
+ * Callback for the reply to a friend-remove request.
+ *
+ * Takes the probe context out of the frame, decodes a gd1_mgmt_friend_rsp
+ * (unless the RPC itself failed), injects GD_FRIEND_EVENT_REMOVE_FRIEND for
+ * the peer found via rsp.uuid / ctx->hostname, and finally sends the
+ * deprobe response for ctx->req, broadcasts the friend deletion and
+ * destroys the context.
+ *
+ * Returns -1 when the frame carries no probe context, otherwise the result
+ * of glusterd_xfer_cli_deprobe_resp().
+ *
+ * glusterd_friend_remove_cbk() invokes this through
+ * glusterd_big_locked_cbk().
+ */
+int
+__glusterd_friend_remove_cbk(struct rpc_req *req, struct iovec *iov, int count,
+                             void *myframe)
+{
+    gd1_mgmt_friend_rsp rsp = {
+        {0},
+    };
+    glusterd_conf_t *conf = NULL;
+    int ret = -1;
+    glusterd_friend_sm_event_t *event = NULL;
+    glusterd_friend_sm_event_type_t event_type = GD_FRIEND_EVENT_NONE;
+    glusterd_peerinfo_t *peerinfo = NULL;
+    int32_t op_ret = -1;
+    int32_t op_errno = 0;
+    glusterd_probe_ctx_t *ctx = NULL;
+    gf_boolean_t move_sm_now = _gf_true;
+
+    conf = THIS->private;
+    GF_ASSERT(conf);
+
+    ctx = ((call_frame_t *)myframe)->local;
+    ((call_frame_t *)myframe)->local = NULL;
+    if (!ctx) {
+        gf_msg(THIS->name, GF_LOG_ERROR, 0, GD_MSG_EVENT_NEW_GET_FAIL,
+               "Unable to get glusterd probe context");
+        goto out;
+    }
+    /* RPC-level failure: still inject the remove event, but do not run the
+     * state machines from here (move_sm_now = false).
+     * NOTE: the values stored in rsp.op_ret / rsp.op_errno are not read
+     * again in this function; the response below uses the locals op_ret /
+     * op_errno, which still hold their initial values on this path.
+     */
+    if (-1 == req->rpc_status) {
+        rsp.op_ret = -1;
+        rsp.op_errno = EINVAL;
+        move_sm_now = _gf_false;
+        goto inject;
+    }
+
+    ret = xdr_to_generic(*iov, &rsp, (xdrproc_t)xdr_gd1_mgmt_friend_rsp);
+    if (ret < 0) {
+        gf_msg("glusterd", GF_LOG_ERROR, errno, GD_MSG_RES_DECODE_FAIL,
+               "error");
+        /* NOTE: as above, these rsp fields are not read again; the CLI is
+         * answered with op_ret = -1 and op_errno = 0 on this path.
+         */
+        rsp.op_ret = -1;
+        rsp.op_errno = EINVAL;
+        goto respond;
+    }
+
+    op_ret = rsp.op_ret;
+    op_errno = rsp.op_errno;
+
+    gf_msg("glusterd", GF_LOG_INFO, 0, GD_MSG_RESPONSE_INFO,
+           "Received %s from uuid: %s, host: %s, port: %d",
+           (op_ret) ? "RJT" : "ACC", uuid_utoa(rsp.uuid), rsp.hostname,
+           rsp.port);
+
+inject:
+    /* peerinfo is only dereferenced inside this RCU read-side section. */
+    RCU_READ_LOCK;
+
+    peerinfo = glusterd_peerinfo_find(rsp.uuid, ctx->hostname);
+    if (peerinfo == NULL) {
+        // can happen as part of rpc clnt connection cleanup
+        // when the frame timeout happens after 30 minutes
+        goto unlock;
+    }
+
+    event_type = GD_FRIEND_EVENT_REMOVE_FRIEND;
+
+    ret = glusterd_friend_sm_new_event(event_type, &event);
+
+    if (ret) {
+        gf_msg("glusterd", GF_LOG_ERROR, 0, GD_MSG_EVENT_NEW_GET_FAIL,
+               "Unable to get event");
+        goto unlock;
+    }
+    event->peername = gf_strdup(peerinfo->hostname);
+    gf_uuid_copy(event->peerid, peerinfo->uuid);
+
+    ret = glusterd_friend_sm_inject_event(event);
+
+    if (ret)
+        goto unlock;
+
+    /*friend_sm would be moved on CLNT_DISCONNECT, consequently
+      cleaning up peerinfo. Else, we run the risk of triggering
+      a clnt_destroy within saved_frames_unwind.
+    */
+    /* Once the remove event is queued the operation is reported as a
+     * success, whatever op_ret held before.
+     */
+    op_ret = 0;
+
+unlock:
+    RCU_READ_UNLOCK;
+
+respond:
+    /* ret is overwritten here: from this point the return value reflects
+     * only whether the deprobe response could be sent.
+     */
+    ret = glusterd_xfer_cli_deprobe_resp(ctx->req, op_ret, op_errno, NULL,
+                                         ctx->hostname, ctx->dict);
+    if (!ret && move_sm_now) {
+        glusterd_friend_sm();
+        glusterd_op_sm();
+    }
+
+    glusterd_broadcast_friend_delete(ctx->hostname, NULL);
+    glusterd_destroy_probe_ctx(ctx);
+out:
+    free(rsp.hostname);  // malloced by xdr
+    GLUSTERD_STACK_DESTROY(((call_frame_t *)myframe));
+    return ret;
+}
+
+/*
+ * RPC callback entry point for friend-remove replies: runs
+ * __glusterd_friend_remove_cbk() through glusterd_big_locked_cbk() and
+ * returns its result.
+ */
+int
+glusterd_friend_remove_cbk(struct rpc_req *req, struct iovec *iov, int count,
+                           void *myframe)
+{
+    int ret = -1;
+
+    ret = glusterd_big_locked_cbk(req, iov, count, myframe,
+                                  __glusterd_friend_remove_cbk);
+    return ret;
+}
+
+/*
+ * Callback for the reply to a friend-update request.
+ *
+ * Checks the RPC status, decodes a gd1_mgmt_friend_update_rsp from @iov,
+ * logs the outcome as ACC (ret == 0) or RJT (ret != 0) together with the
+ * uuid from the reply, and destroys the frame.
+ *
+ * Returns 0 when the reply was received and decoded, a non-zero value
+ * otherwise. Note that rsp.uuid is still zero-initialised in the log line
+ * when the RPC failed or decoding did not get that far.
+ */
+int32_t
+__glusterd_friend_update_cbk(struct rpc_req *req, struct iovec *iov, int count,
+                             void *myframe)
+{
+    int ret = -1;
+    gd1_mgmt_friend_update_rsp rsp = {
+        {0},
+    };
+    xlator_t *this = NULL;
+
+    GF_ASSERT(req);
+    this = THIS;
+
+    if (-1 == req->rpc_status) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_RPC_FAILURE, "RPC Error");
+        goto out;
+    }
+
+    ret = xdr_to_generic(*iov, &rsp, (xdrproc_t)xdr_gd1_mgmt_friend_update_rsp);
+    if (ret < 0) {
+        /* xdr_to_generic() decodes the received buffer, so this is a decode
+         * failure (the message id is GD_MSG_RES_DECODE_FAIL as well).
+         */
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_RES_DECODE_FAIL,
+               "Failed to decode friend"
+               " update response");
+        goto out;
+    }
+
+    ret = 0;
+out:
+    gf_msg(this->name, GF_LOG_INFO, 0, GD_MSG_RESPONSE_INFO,
+           "Received %s from uuid: %s", (ret) ? "RJT" : "ACC",
+           uuid_utoa(rsp.uuid));
+
+    GLUSTERD_STACK_DESTROY(((call_frame_t *)myframe));
+    return ret;
+}
+
+/*
+ * RPC callback entry point for friend-update replies: runs
+ * __glusterd_friend_update_cbk() through glusterd_big_locked_cbk() and
+ * returns its result.
+ */
+int
+glusterd_friend_update_cbk(struct rpc_req *req, struct iovec *iov, int count,
+                           void *myframe)
+{
+    int ret = -1;
+
+    ret = glusterd_big_locked_cbk(req, iov, count, myframe,
+                                  __glusterd_friend_update_cbk);
+    return ret;
+}
+
+/*
+ * Callback for the reply to a cluster lock request.
+ *
+ * Decodes a gd1_mgmt_cluster_lock_rsp from @iov and turns the outcome into
+ * an op state-machine event: GD_OP_EVENT_RCVD_RJT on RPC failure, decode
+ * failure or a non-zero rsp.op_ret, GD_OP_EVENT_RCVD_ACC otherwise. The
+ * global opinfo is stored for the global transaction id
+ * (priv->global_txn_id) and the event is injected for that id.
+ *
+ * Returns the result of glusterd_op_sm_inject_event(); the friend and op
+ * state machines are run when that result is 0. The frame is always
+ * destroyed.
+ *
+ * glusterd_cluster_lock_cbk() invokes this through
+ * glusterd_big_locked_cbk().
+ */
+int32_t
+__glusterd_cluster_lock_cbk(struct rpc_req *req, struct iovec *iov, int count,
+                            void *myframe)
+{
+    gd1_mgmt_cluster_lock_rsp rsp = {
+        {0},
+    };
+    int ret = -1;
+    int32_t op_ret = -1;
+    glusterd_op_sm_event_type_t event_type = GD_OP_EVENT_NONE;
+    xlator_t *this = NULL;
+    uuid_t *txn_id = NULL;
+    glusterd_conf_t *priv = NULL;
+    char *err_str = NULL;
+
+    this = THIS;
+    GF_ASSERT(this);
+    priv = this->private;
+    GF_ASSERT(priv);
+    GF_ASSERT(req);
+
+    txn_id = &priv->global_txn_id;
+
+    if (-1 == req->rpc_status) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_NO_LOCK_RESP_FROM_PEER,
+               "Lock response is not "
+               "received from one of the peer");
+        err_str = "Lock response is not received from one of the peer";
+        glusterd_set_opinfo(err_str, ENETRESET, -1);
+        event_type = GD_OP_EVENT_RCVD_RJT;
+        goto out;
+    }
+
+    ret = xdr_to_generic(*iov, &rsp, (xdrproc_t)xdr_gd1_mgmt_cluster_lock_rsp);
+    if (ret < 0) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_RES_DECODE_FAIL,
+               "Failed to decode "
+               "cluster lock response received from peer");
+        err_str =
+            "Failed to decode cluster lock response received from"
+            " peer";
+        glusterd_set_opinfo(err_str, EINVAL, -1);
+        event_type = GD_OP_EVENT_RCVD_RJT;
+        goto out;
+    }
+
+    op_ret = rsp.op_ret;
+
+    if (op_ret) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_LOCK_FROM_UUID_REJCT,
+               "Received lock RJT from uuid: %s", uuid_utoa(rsp.uuid));
+    } else {
+        gf_msg_debug(this->name, 0, "Received lock ACC from uuid: %s",
+                     uuid_utoa(rsp.uuid));
+    }
+
+    /* Only the existence of the peer is checked; the peerinfo pointer is
+     * not kept beyond the RCU read-side section.
+     */
+    RCU_READ_LOCK;
+    ret = (glusterd_peerinfo_find(rsp.uuid, NULL) == NULL);
+    RCU_READ_UNLOCK;
+
+    if (ret) {
+        gf_msg(this->name, GF_LOG_CRITICAL, 0, GD_MSG_RESP_FROM_UNKNOWN_PEER,
+               "cluster lock response received from unknown peer: %s."
+               "Ignoring response",
+               uuid_utoa(rsp.uuid));
+        /* NOTE(review): err_str is assigned here but not read afterwards in
+         * this function, and event_type is still GD_OP_EVENT_NONE when the
+         * event is injected below. Confirm that this is intended.
+         */
+        err_str = "cluster lock response received from unknown peer";
+        goto out;
+    }
+
+    if (op_ret) {
+        event_type = GD_OP_EVENT_RCVD_RJT;
+        opinfo.op_ret = op_ret;
+        opinfo.op_errstr = gf_strdup(
+            "Another transaction could be in "
+            "progress. Please try again after"
+            " some time.");
+    } else {
+        event_type = GD_OP_EVENT_RCVD_ACC;
+    }
+
+out:
+
+    /* A failure to store opinfo is logged only; the event is still
+     * injected and ret is overwritten by the inject result.
+     */
+    ret = glusterd_set_txn_opinfo(txn_id, &opinfo);
+    if (ret)
+        gf_msg(THIS->name, GF_LOG_ERROR, 0, GD_MSG_TRANS_OPINFO_SET_FAIL,
+               "Unable to set "
+               "transaction's opinfo");
+
+    ret = glusterd_op_sm_inject_event(event_type, txn_id, NULL);
+
+    if (!ret) {
+        glusterd_friend_sm();
+        glusterd_op_sm();
+    }
+
+    GLUSTERD_STACK_DESTROY(((call_frame_t *)myframe));
+    return ret;
+}
+
+/*
+ * RPC callback entry point for cluster lock replies: runs
+ * __glusterd_cluster_lock_cbk() through glusterd_big_locked_cbk() and
+ * returns its result.
+ */
+int32_t
+glusterd_cluster_lock_cbk(struct rpc_req *req, struct iovec *iov, int count,
+                          void *myframe)
+{
+    int32_t ret = -1;
+
+    ret = glusterd_big_locked_cbk(req, iov, count, myframe,
+                                  __glusterd_cluster_lock_cbk);
+    return ret;
+}
+
+/*
+ * Record an operation result in the global opinfo.
+ *
+ * Stores @op_ret and @op_errno and a gf_strdup() copy of @errstr. The
+ * previous opinfo.op_errstr pointer is overwritten here without being
+ * released by this function.
+ */
+void
+glusterd_set_opinfo(char *errstr, int32_t op_errno, int32_t op_ret)
+{
+    char *errstr_copy = gf_strdup(errstr);
+
+    opinfo.op_ret = op_ret;
+    opinfo.op_errno = op_errno;
+    opinfo.op_errstr = errstr_copy;
+}
+
+/*
+ * Callback for the reply to a mgmt_v3 lock request sent to a peer.
+ *
+ * Decodes a gd1_mgmt_v3_lock_rsp from @iov and turns the outcome into an op
+ * state-machine event: GD_OP_EVENT_RCVD_RJT on RPC failure, decode failure
+ * or a non-zero rsp.op_ret, GD_OP_EVENT_RCVD_ACC otherwise. The transaction
+ * id is taken from the frame cookie until the reply has been decoded; from
+ * then on the id carried in the reply is used. The global opinfo is stored
+ * for that transaction and the event is injected.
+ *
+ * Returns the result of glusterd_op_sm_inject_event(); the friend and op
+ * state machines are run when that result is 0. The frame cookie and the
+ * frame are always released.
+ */
+static int32_t
+glusterd_mgmt_v3_lock_peers_cbk_fn(struct rpc_req *req, struct iovec *iov,
+                                   int count, void *myframe)
+{
+    gd1_mgmt_v3_lock_rsp rsp = {
+        {0},
+    };
+    int ret = -1;
+    int32_t op_ret = -1;
+    glusterd_op_sm_event_type_t event_type = GD_OP_EVENT_NONE;
+    xlator_t *this = NULL;
+    call_frame_t *frame = NULL;
+    uuid_t *txn_id = NULL;
+    uuid_t *cookie = NULL;
+    char *err_str = NULL;
+
+    this = THIS;
+    GF_ASSERT(this);
+    GF_ASSERT(req);
+
+    /* Detach the cookie from the frame but remember it: it is the fallback
+     * transaction id and must be freed at the end. Freeing frame->cookie
+     * after it has been reset to NULL would release nothing and leak the
+     * buffer.
+     */
+    frame = myframe;
+    cookie = frame->cookie;
+    frame->cookie = NULL;
+    txn_id = cookie;
+
+    if (-1 == req->rpc_status) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_NO_LOCK_RESP_FROM_PEER,
+               "Lock response is not "
+               "received from one of the peer");
+        err_str = "Lock response is not received from one of the peer";
+        glusterd_set_opinfo(err_str, ENETRESET, -1);
+        event_type = GD_OP_EVENT_RCVD_RJT;
+        goto out;
+    }
+
+    ret = xdr_to_generic(*iov, &rsp, (xdrproc_t)xdr_gd1_mgmt_v3_lock_rsp);
+    if (ret < 0) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_RES_DECODE_FAIL,
+               "Failed to decode "
+               "mgmt_v3 lock response received from peer");
+        err_str =
+            "Failed to decode mgmt_v3 lock response received from"
+            " peer";
+        glusterd_set_opinfo(err_str, EINVAL, -1);
+        event_type = GD_OP_EVENT_RCVD_RJT;
+        goto out;
+    }
+
+    op_ret = rsp.op_ret;
+
+    /* From here on use the transaction id carried in the reply. */
+    txn_id = &rsp.txn_id;
+
+    if (op_ret) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_MGMTV3_LOCK_FROM_UUID_REJCT,
+               "Received mgmt_v3 lock RJT from uuid: %s", uuid_utoa(rsp.uuid));
+    } else {
+        gf_msg_debug(this->name, 0, "Received mgmt_v3 lock ACC from uuid: %s",
+                     uuid_utoa(rsp.uuid));
+    }
+
+    /* Only the existence of the peer is checked; the peerinfo pointer is
+     * not kept beyond the RCU read-side section.
+     */
+    RCU_READ_LOCK;
+    ret = (glusterd_peerinfo_find(rsp.uuid, NULL) == NULL);
+    RCU_READ_UNLOCK;
+
+    if (ret) {
+        gf_msg(this->name, GF_LOG_CRITICAL, 0, GD_MSG_RESP_FROM_UNKNOWN_PEER,
+               "mgmt_v3 lock response received "
+               "from unknown peer: %s. Ignoring response",
+               uuid_utoa(rsp.uuid));
+        /* event_type is still GD_OP_EVENT_NONE on this path. */
+        goto out;
+    }
+
+    if (op_ret) {
+        event_type = GD_OP_EVENT_RCVD_RJT;
+        opinfo.op_ret = op_ret;
+        opinfo.op_errstr = gf_strdup(
+            "Another transaction could be in "
+            "progress. Please try again after"
+            " some time.");
+    } else {
+        event_type = GD_OP_EVENT_RCVD_ACC;
+    }
+
+out:
+
+    /* A failure to store opinfo is logged only; the event is still
+     * injected and ret is overwritten by the inject result.
+     */
+    ret = glusterd_set_txn_opinfo(txn_id, &opinfo);
+    if (ret)
+        gf_msg(THIS->name, GF_LOG_ERROR, 0, GD_MSG_TRANS_OPINFO_SET_FAIL,
+               "Unable to set "
+               "transaction's opinfo");
+
+    ret = glusterd_op_sm_inject_event(event_type, txn_id, NULL);
+    if (!ret) {
+        glusterd_friend_sm();
+        glusterd_op_sm();
+    }
+
+    /* txn_id may still point into the cookie, so release it only now. */
+    GF_FREE(cookie);
+    GLUSTERD_STACK_DESTROY(frame);
+    return ret;
+}
+
+/*
+ * RPC callback entry point for mgmt_v3 lock replies: runs
+ * glusterd_mgmt_v3_lock_peers_cbk_fn() through glusterd_big_locked_cbk()
+ * and returns its result.
+ */
+int32_t
+glusterd_mgmt_v3_lock_peers_cbk(struct rpc_req *req, struct iovec *iov,
+                                int count, void *myframe)
+{
+    int32_t ret = -1;
+
+    ret = glusterd_big_locked_cbk(req, iov, count, myframe,
+                                  glusterd_mgmt_v3_lock_peers_cbk_fn);
+    return ret;
+}
+
+/*
+ * Callback for the reply to a mgmt_v3 unlock request sent to a peer.
+ *
+ * Decodes a gd1_mgmt_v3_unlock_rsp from @iov and turns the outcome into an
+ * op state-machine event: GD_OP_EVENT_RCVD_RJT on RPC failure, decode
+ * failure or a non-zero rsp.op_ret, GD_OP_EVENT_RCVD_ACC otherwise. The
+ * transaction id is taken from the frame cookie until the reply has been
+ * decoded; from then on the id carried in the reply is used. The global
+ * opinfo is stored for that transaction and the event is injected.
+ *
+ * Returns the result of glusterd_op_sm_inject_event(); the friend and op
+ * state machines are run when that result is 0. The frame cookie and the
+ * frame are always released.
+ */
+static int32_t
+glusterd_mgmt_v3_unlock_peers_cbk_fn(struct rpc_req *req, struct iovec *iov,
+                                     int count, void *myframe)
+{
+    gd1_mgmt_v3_unlock_rsp rsp = {
+        {0},
+    };
+    int ret = -1;
+    int32_t op_ret = -1;
+    glusterd_op_sm_event_type_t event_type = GD_OP_EVENT_NONE;
+    xlator_t *this = NULL;
+    call_frame_t *frame = NULL;
+    uuid_t *txn_id = NULL;
+    uuid_t *cookie = NULL;
+    char *err_str = NULL;
+
+    this = THIS;
+    GF_ASSERT(this);
+    GF_ASSERT(req);
+
+    /* Detach the cookie from the frame but remember it: it is the fallback
+     * transaction id and must be freed at the end. Freeing frame->cookie
+     * after it has been reset to NULL would release nothing and leak the
+     * buffer.
+     */
+    frame = myframe;
+    cookie = frame->cookie;
+    frame->cookie = NULL;
+    txn_id = cookie;
+
+    if (-1 == req->rpc_status) {
+        err_str = "Unlock response not received from one of the peer.";
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_CLUSTER_UNLOCK_FAILED,
+               "UnLock response is not received from one of the peer");
+        /* The failure is recorded with op_errno 0 and op_ret 0. */
+        glusterd_set_opinfo(err_str, 0, 0);
+        event_type = GD_OP_EVENT_RCVD_RJT;
+        goto out;
+    }
+
+    ret = xdr_to_generic(*iov, &rsp, (xdrproc_t)xdr_gd1_mgmt_v3_unlock_rsp);
+    if (ret < 0) {
+        /* The two literals are concatenated, so the separating space has
+         * to be part of one of them.
+         */
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_CLUSTER_UNLOCK_FAILED,
+               "Failed to decode mgmt_v3 unlock response received from "
+               "peer");
+        err_str =
+            "Failed to decode mgmt_v3 unlock response received "
+            "from peer";
+        glusterd_set_opinfo(err_str, 0, 0);
+        event_type = GD_OP_EVENT_RCVD_RJT;
+        goto out;
+    }
+
+    op_ret = rsp.op_ret;
+
+    /* From here on use the transaction id carried in the reply. */
+    txn_id = &rsp.txn_id;
+
+    if (op_ret) {
+        gf_msg(
+            this->name, GF_LOG_ERROR, 0, GD_MSG_MGMTV3_UNLOCK_FROM_UUID_REJCT,
+            "Received mgmt_v3 unlock RJT from uuid: %s", uuid_utoa(rsp.uuid));
+    } else {
+        gf_msg_debug(this->name, 0, "Received mgmt_v3 unlock ACC from uuid: %s",
+                     uuid_utoa(rsp.uuid));
+    }
+
+    /* Only the existence of the peer is checked; the peerinfo pointer is
+     * not kept beyond the RCU read-side section.
+     */
+    RCU_READ_LOCK;
+    ret = (glusterd_peerinfo_find(rsp.uuid, NULL) == NULL);
+    RCU_READ_UNLOCK;
+
+    if (ret) {
+        gf_msg(this->name, GF_LOG_CRITICAL, 0, GD_MSG_CLUSTER_UNLOCK_FAILED,
+               "mgmt_v3 unlock response received "
+               "from unknown peer: %s. Ignoring response",
+               uuid_utoa(rsp.uuid));
+        /* event_type is still GD_OP_EVENT_NONE on this path. */
+        goto out;
+    }
+
+    if (op_ret) {
+        event_type = GD_OP_EVENT_RCVD_RJT;
+        opinfo.op_ret = op_ret;
+        opinfo.op_errstr = gf_strdup(
+            "Another transaction could be in "
+            "progress. Please try again after"
+            " some time.");
+    } else {
+        event_type = GD_OP_EVENT_RCVD_ACC;
+    }
+
+out:
+
+    /* A failure to store opinfo is logged only; the event is still
+     * injected and ret is overwritten by the inject result.
+     */
+    ret = glusterd_set_txn_opinfo(txn_id, &opinfo);
+    if (ret)
+        gf_msg(THIS->name, GF_LOG_ERROR, 0, GD_MSG_TRANS_OPINFO_SET_FAIL,
+               "Unable to set "
+               "transaction's opinfo");
+
+    ret = glusterd_op_sm_inject_event(event_type, txn_id, NULL);
+
+    if (!ret) {
+        glusterd_friend_sm();
+        glusterd_op_sm();
+    }
+
+    /* txn_id may still point into the cookie, so release it only now. */
+    GF_FREE(cookie);
+    GLUSTERD_STACK_DESTROY(frame);
+    return ret;
+}
+
+/*
+ * RPC callback entry point for mgmt_v3 unlock replies: runs
+ * glusterd_mgmt_v3_unlock_peers_cbk_fn() through glusterd_big_locked_cbk()
+ * and returns its result.
+ */
+int32_t
+glusterd_mgmt_v3_unlock_peers_cbk(struct rpc_req *req, struct iovec *iov,
+                                  int count, void *myframe)
+{
+    int32_t ret = -1;
+
+    ret = glusterd_big_locked_cbk(req, iov, count, myframe,
+                                  glusterd_mgmt_v3_unlock_peers_cbk_fn);
+    return ret;
+}
+
+/*
+ * Callback for the reply to a cluster unlock request.
+ *
+ * Decodes the reply from @iov and turns the outcome into an op
+ * state-machine event: GD_OP_EVENT_RCVD_RJT on RPC failure, decode failure
+ * or a non-zero rsp.op_ret, GD_OP_EVENT_RCVD_ACC otherwise. The global
+ * opinfo is stored for the global transaction id (priv->global_txn_id) and
+ * the event is injected for that id.
+ *
+ * Returns the result of glusterd_op_sm_inject_event(); the friend and op
+ * state machines are run when that result is 0. The frame is always
+ * destroyed.
+ *
+ * glusterd_cluster_unlock_cbk() invokes this through
+ * glusterd_big_locked_cbk().
+ */
+int32_t
+__glusterd_cluster_unlock_cbk(struct rpc_req *req, struct iovec *iov, int count,
+                              void *myframe)
+{
+    /* NOTE(review): rsp is declared with the *lock* response type but is
+     * decoded below with the *unlock* xdrproc; presumably both types share
+     * the same layout. Confirm against the XDR definitions.
+     */
+    gd1_mgmt_cluster_lock_rsp rsp = {
+        {0},
+    };
+    int ret = -1;
+    int32_t op_ret = -1;
+    glusterd_op_sm_event_type_t event_type = GD_OP_EVENT_NONE;
+    xlator_t *this = NULL;
+    uuid_t *txn_id = NULL;
+    glusterd_conf_t *priv = NULL;
+    char *err_str = NULL;
+
+    this = THIS;
+    GF_ASSERT(this);
+    priv = this->private;
+    GF_ASSERT(priv);
+    GF_ASSERT(req);
+
+    txn_id = &priv->global_txn_id;
+
+    if (-1 == req->rpc_status) {
+        err_str = "Unlock response not received from one of the peer.";
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_CLUSTER_UNLOCK_FAILED,
+               "UnLock response is not received from one of the peer");
+        /* The failure is recorded with op_errno 0 and op_ret 0. */
+        glusterd_set_opinfo(err_str, 0, 0);
+        event_type = GD_OP_EVENT_RCVD_RJT;
+        goto out;
+    }
+
+    ret = xdr_to_generic(*iov, &rsp,
+                         (xdrproc_t)xdr_gd1_mgmt_cluster_unlock_rsp);
+    if (ret < 0) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_CLUSTER_UNLOCK_FAILED,
+               "Failed to decode unlock response received from peer");
+        err_str =
+            "Failed to decode cluster unlock response received "
+            "from peer";
+        glusterd_set_opinfo(err_str, 0, 0);
+        event_type = GD_OP_EVENT_RCVD_RJT;
+        goto out;
+    }
+
+    op_ret = rsp.op_ret;
+
+    if (op_ret) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_UNLOCK_FROM_UUID_REJCT,
+               "Received unlock RJT from uuid: %s", uuid_utoa(rsp.uuid));
+    } else {
+        gf_msg_debug(this->name, 0, "Received unlock ACC from uuid: %s",
+                     uuid_utoa(rsp.uuid));
+    }
+
+    /* Only the existence of the peer is checked; the peerinfo pointer is
+     * not kept beyond the RCU read-side section.
+     */
+    RCU_READ_LOCK;
+    ret = (glusterd_peerinfo_find(rsp.uuid, NULL) == NULL);
+    RCU_READ_UNLOCK;
+
+    if (ret) {
+        gf_msg(this->name, GF_LOG_CRITICAL, 0, GD_MSG_CLUSTER_UNLOCK_FAILED,
+               "Unlock response received from unknown peer %s",
+               uuid_utoa(rsp.uuid));
+        /* event_type is still GD_OP_EVENT_NONE on this path. */
+        goto out;
+    }
+
+    if (op_ret) {
+        event_type = GD_OP_EVENT_RCVD_RJT;
+        opinfo.op_ret = op_ret;
+    } else {
+        event_type = GD_OP_EVENT_RCVD_ACC;
+    }
+
+out:
+
+    /* A failure to store opinfo is logged only; the event is still
+     * injected and ret is overwritten by the inject result.
+     */
+    ret = glusterd_set_txn_opinfo(txn_id, &opinfo);
+    if (ret)
+        gf_msg(THIS->name, GF_LOG_ERROR, 0, GD_MSG_TRANS_OPINFO_SET_FAIL,
+               "Unable to set "
+               "transaction's opinfo");
+
+    ret = glusterd_op_sm_inject_event(event_type, txn_id, NULL);
+
+    if (!ret) {
+        glusterd_friend_sm();
+        glusterd_op_sm();
+    }
+
+    GLUSTERD_STACK_DESTROY(((call_frame_t *)myframe));
+    return ret;
+}
+
+/*
+ * RPC callback entry point for cluster unlock replies: runs
+ * __glusterd_cluster_unlock_cbk() through glusterd_big_locked_cbk() and
+ * returns its result.
+ */
+int32_t
+glusterd_cluster_unlock_cbk(struct rpc_req *req, struct iovec *iov, int count,
+                            void *myframe)
+{
+    int32_t ret = -1;
+
+    ret = glusterd_big_locked_cbk(req, iov, count, myframe,
+                                  __glusterd_cluster_unlock_cbk);
+    return ret;
+}
+
+/*
+ * Callback for the reply to a stage-op request.
+ *
+ * Decodes a gd1_mgmt_stage_op_rsp from @iov (or fabricates a failure reply
+ * when the RPC failed or decoding failed), unserializes the optional
+ * dictionary carried in the reply, and injects GD_OP_EVENT_RCVD_RJT or
+ * GD_OP_EVENT_RCVD_ACC (depending on rsp.op_ret) for the transaction id
+ * held in the frame cookie. On rejection opinfo.op_errstr is set from the
+ * reply's error string or, when that is empty, from OPERRSTR_STAGE_FAIL
+ * formatted with the peer's hostname or uuid.
+ *
+ * Returns the result of glusterd_op_sm_inject_event(); the friend and op
+ * state machines are run when that result is 0. The reply buffers, the
+ * frame cookie and the frame are always released.
+ *
+ * glusterd_stage_op_cbk() invokes this through glusterd_big_locked_cbk().
+ */
+int32_t
+__glusterd_stage_op_cbk(struct rpc_req *req, struct iovec *iov, int count,
+                        void *myframe)
+{
+    gd1_mgmt_stage_op_rsp rsp = {
+        {0},
+    };
+    int ret = -1;
+    int32_t op_ret = -1;
+    glusterd_op_sm_event_type_t event_type = GD_OP_EVENT_NONE;
+    glusterd_peerinfo_t *peerinfo = NULL;
+    dict_t *dict = NULL;
+    char *peer_str = NULL;
+    xlator_t *this = NULL;
+    glusterd_conf_t *priv = NULL;
+    uuid_t *txn_id = NULL;
+    call_frame_t *frame = NULL;
+
+    this = THIS;
+    GF_ASSERT(this);
+    GF_ASSERT(req);
+    priv = this->private;
+    GF_ASSERT(priv);
+    GF_ASSERT(myframe);
+
+    /* The cookie stays attached to the frame and is freed at the end. */
+    frame = myframe;
+    txn_id = frame->cookie;
+
+    if (-1 == req->rpc_status) {
+        rsp.op_ret = -1;
+        rsp.op_errno = EINVAL;
+        /* use standard allocation because to keep uniformity
+           in freeing it */
+        rsp.op_errstr = strdup("error");
+        goto out;
+    }
+
+    ret = xdr_to_generic(*iov, &rsp, (xdrproc_t)xdr_gd1_mgmt_stage_op_rsp);
+    if (ret < 0) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_RES_DECODE_FAIL,
+               "Failed to decode stage "
+               "response received from peer");
+        rsp.op_ret = -1;
+        rsp.op_errno = EINVAL;
+        /* use standard allocation because to keep uniformity
+           in freeing it */
+        rsp.op_errstr = strdup(
+            "Failed to decode stage response "
+            "received from peer.");
+        goto out;
+    }
+
+    if (rsp.dict.dict_len) {
+        /* Unserialize the dictionary */
+        dict = dict_new();
+
+        ret = dict_unserialize(rsp.dict.dict_val, rsp.dict.dict_len, &dict);
+        if (ret < 0) {
+            gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_DICT_UNSERIALIZE_FAIL,
+                   "failed to "
+                   "unserialize rsp-buffer to dictionary");
+            /* NOTE(review): this value is overwritten after 'out:', where
+             * event_type is chosen from rsp.op_ret alone. Confirm whether
+             * an unserialize failure with op_ret == 0 should be an ACC.
+             */
+            event_type = GD_OP_EVENT_RCVD_RJT;
+            goto out;
+        } else {
+            /* Attach the xdr buffer to the dict; the cleanup at the end
+             * then skips freeing it directly (presumably it is released
+             * together with the dict).
+             */
+            dict->extra_stdfree = rsp.dict.dict_val;
+        }
+    }
+
+out:
+    op_ret = rsp.op_ret;
+
+    if (op_ret) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_STAGE_FROM_UUID_REJCT,
+               "Received stage RJT from uuid: %s", uuid_utoa(rsp.uuid));
+    } else {
+        gf_msg_debug(this->name, 0, "Received stage ACC from uuid: %s",
+                     uuid_utoa(rsp.uuid));
+    }
+
+    /* peerinfo->hostname is read below, so the RCU read-side section spans
+     * the whole error-string construction.
+     */
+    RCU_READ_LOCK;
+    peerinfo = glusterd_peerinfo_find(rsp.uuid, NULL);
+    if (peerinfo == NULL) {
+        /* Despite the message, processing continues below. */
+        gf_msg(this->name, GF_LOG_CRITICAL, 0, GD_MSG_RESP_FROM_UNKNOWN_PEER,
+               "Stage response received "
+               "from unknown peer: %s. Ignoring response.",
+               uuid_utoa(rsp.uuid));
+    }
+
+    if (op_ret) {
+        event_type = GD_OP_EVENT_RCVD_RJT;
+        opinfo.op_ret = op_ret;
+        /* NOTE(review): rsp.op_errstr is passed to strcmp() without a NULL
+         * check; it would be NULL if one of the strdup() calls above
+         * failed.
+         */
+        if (strcmp("", rsp.op_errstr)) {
+            opinfo.op_errstr = gf_strdup(rsp.op_errstr);
+        } else {
+            /* Empty error string: build a generic one naming the peer. */
+            if (peerinfo)
+                peer_str = peerinfo->hostname;
+            else
+                peer_str = uuid_utoa(rsp.uuid);
+            char err_str[2048];
+            snprintf(err_str, sizeof(err_str), OPERRSTR_STAGE_FAIL, peer_str);
+            opinfo.op_errstr = gf_strdup(err_str);
+        }
+    } else {
+        event_type = GD_OP_EVENT_RCVD_ACC;
+    }
+
+    RCU_READ_UNLOCK;
+
+    /* A failure to store opinfo is logged only; the event is still
+     * injected and ret is overwritten by the inject result.
+     */
+    ret = glusterd_set_txn_opinfo(txn_id, &opinfo);
+    if (ret)
+        gf_msg(THIS->name, GF_LOG_ERROR, 0, GD_MSG_TRANS_OPINFO_SET_FAIL,
+               "Unable to set "
+               "transaction's opinfo");
+
+    ret = glusterd_op_sm_inject_event(event_type, txn_id, NULL);
+
+    if (!ret) {
+        glusterd_friend_sm();
+        glusterd_op_sm();
+    }
+
+    free(rsp.op_errstr);  // malloced by xdr
+    if (dict) {
+        /* Free the raw buffer here only when it was not attached to the
+         * dict via extra_stdfree.
+         */
+        if (!dict->extra_stdfree && rsp.dict.dict_val)
+            free(rsp.dict.dict_val);  // malloced by xdr
+        dict_unref(dict);
+    } else {
+        free(rsp.dict.dict_val);  // malloced by xdr
+    }
+    GF_FREE(frame->cookie);
+    GLUSTERD_STACK_DESTROY(((call_frame_t *)myframe));
+    return ret;
+}
+
+/*
+ * RPC callback entry point for stage-op replies: runs
+ * __glusterd_stage_op_cbk() through glusterd_big_locked_cbk() and returns
+ * its result.
+ */
+int32_t
+glusterd_stage_op_cbk(struct rpc_req *req, struct iovec *iov, int count,
+                      void *myframe)
+{
+    int32_t ret = -1;
+
+    ret = glusterd_big_locked_cbk(req, iov, count, myframe,
+                                  __glusterd_stage_op_cbk);
+    return ret;
+}
+
+/*
+ * Handle a peer's reply to a GLUSTERD_MGMT_COMMIT_OP request.
+ *
+ * The reply is decoded into a gd1_mgmt_commit_op_rsp and its optional
+ * serialized dictionary is unserialized.  The outcome is stored in opinfo
+ * (not declared in this function), saved with glusterd_set_txn_opinfo() and
+ * injected into the op state machine as GD_OP_EVENT_RCVD_ACC or
+ * GD_OP_EVENT_RCVD_RJT for the transaction id kept in frame->cookie.
+ *
+ * req     - completed RPC request; rpc_status == -1 is handled as a reject
+ * iov     - encoded reply, decoded with xdr_gd1_mgmt_commit_op_rsp
+ * count   - not used by this function
+ * myframe - call frame of the request; its cookie is freed and the stack is
+ *           destroyed before returning
+ *
+ * Returns the result of glusterd_op_sm_inject_event().
+ */
+int32_t
+__glusterd_commit_op_cbk(struct rpc_req *req, struct iovec *iov, int count,
+                         void *myframe)
+{
+    gd1_mgmt_commit_op_rsp rsp = {
+        {0},
+    };
+    int ret = -1;
+    int32_t op_ret = -1;
+    glusterd_op_sm_event_type_t event_type = GD_OP_EVENT_NONE;
+    glusterd_peerinfo_t *peerinfo = NULL;
+    dict_t *dict = NULL;
+    char *peer_str = NULL;
+    xlator_t *this = NULL;
+    glusterd_conf_t *priv = NULL;
+    uuid_t *txn_id = NULL;
+    glusterd_op_info_t txn_op_info = {
+        {0},
+    };
+    call_frame_t *frame = NULL;
+    char err_str[2048];
+
+    this = THIS;
+    GF_ASSERT(this);
+    GF_ASSERT(req);
+    priv = this->private;
+    GF_ASSERT(priv);
+    GF_ASSERT(myframe);
+
+    frame = myframe;
+    txn_id = frame->cookie;
+
+    if (-1 == req->rpc_status) {
+        /* No reply to decode: synthesize a rejection. */
+        rsp.op_ret = -1;
+        rsp.op_errno = EINVAL;
+        /* Use the standard allocator so that op_errstr can always be
+           released with free(), whether it was set here or by xdr. */
+        rsp.op_errstr = strdup("error");
+        event_type = GD_OP_EVENT_RCVD_RJT;
+        goto out;
+    }
+
+    ret = xdr_to_generic(*iov, &rsp, (xdrproc_t)xdr_gd1_mgmt_commit_op_rsp);
+    if (ret < 0) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_RES_DECODE_FAIL,
+               "Failed to decode commit "
+               "response received from peer");
+        rsp.op_ret = -1;
+        rsp.op_errno = EINVAL;
+        /* Use the standard allocator so that op_errstr can always be
+           released with free(), whether it was set here or by xdr. */
+        rsp.op_errstr = strdup(
+            "Failed to decode commit response "
+            "received from peer.");
+        event_type = GD_OP_EVENT_RCVD_RJT;
+        goto out;
+    }
+
+    if (rsp.dict.dict_len) {
+        /* Unserialize the dictionary */
+        dict = dict_new();
+
+        ret = dict_unserialize(rsp.dict.dict_val, rsp.dict.dict_len, &dict);
+        if (ret < 0) {
+            gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_DICT_UNSERIALIZE_FAIL,
+                   "failed to "
+                   "unserialize rsp-buffer to dictionary");
+            event_type = GD_OP_EVENT_RCVD_RJT;
+            goto out;
+        } else {
+            /* Hand the xdr buffer over to the dictionary. */
+            dict->extra_stdfree = rsp.dict.dict_val;
+        }
+    }
+
+    op_ret = rsp.op_ret;
+
+    if (op_ret) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_COMMIT_FROM_UUID_REJCT,
+               "Received commit RJT from uuid: %s", uuid_utoa(rsp.uuid));
+    } else {
+        gf_msg_debug(this->name, 0, "Received commit ACC from uuid: %s",
+                     uuid_utoa(rsp.uuid));
+    }
+
+    /* A lookup failure is only logged; processing continues. */
+    ret = glusterd_get_txn_opinfo(txn_id, &txn_op_info);
+    if (ret) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_TRANS_OPINFO_GET_FAIL,
+               "Failed to get txn_op_info "
+               "for txn_id = %s",
+               uuid_utoa(*txn_id));
+    }
+
+    /* peerinfo is only dereferenced inside this RCU read-side section. */
+    RCU_READ_LOCK;
+    peerinfo = glusterd_peerinfo_find(rsp.uuid, NULL);
+    if (peerinfo == NULL) {
+        gf_msg(this->name, GF_LOG_CRITICAL, 0, GD_MSG_RESP_FROM_UNKNOWN_PEER,
+               "Commit response for "
+               "'Volume %s' received from unknown peer: %s",
+               gd_op_list[opinfo.op], uuid_utoa(rsp.uuid));
+    }
+
+    if (op_ret) {
+        event_type = GD_OP_EVENT_RCVD_RJT;
+        opinfo.op_ret = op_ret;
+        if (strcmp("", rsp.op_errstr)) {
+            opinfo.op_errstr = gf_strdup(rsp.op_errstr);
+        } else {
+            /* The peer sent no message: build one naming the peer. */
+            if (peerinfo)
+                peer_str = peerinfo->hostname;
+            else
+                peer_str = uuid_utoa(rsp.uuid);
+            snprintf(err_str, sizeof(err_str), OPERRSTR_COMMIT_FAIL, peer_str);
+            opinfo.op_errstr = gf_strdup(err_str);
+        }
+        if (!opinfo.op_errstr) {
+            goto unlock;
+        }
+    } else {
+        event_type = GD_OP_EVENT_RCVD_ACC;
+        GF_ASSERT(rsp.op == txn_op_info.op);
+
+        /* Some operations fold the peer's reply into the op context. */
+        switch (rsp.op) {
+            case GD_OP_PROFILE_VOLUME:
+                ret = glusterd_profile_volume_use_rsp_dict(txn_op_info.op_ctx,
+                                                           dict);
+                if (ret)
+                    goto unlock;
+                break;
+
+            case GD_OP_REBALANCE:
+            case GD_OP_DEFRAG_BRICK_VOLUME:
+                ret = glusterd_volume_rebalance_use_rsp_dict(txn_op_info.op_ctx,
+                                                             dict);
+                if (ret)
+                    goto unlock;
+                break;
+
+            default:
+                break;
+        }
+    }
+unlock:
+    RCU_READ_UNLOCK;
+
+out:
+
+    ret = glusterd_set_txn_opinfo(txn_id, &opinfo);
+    if (ret)
+        gf_msg(THIS->name, GF_LOG_ERROR, 0, GD_MSG_TRANS_OPINFO_SET_FAIL,
+               "Unable to set "
+               "transaction's opinfo");
+
+    ret = glusterd_op_sm_inject_event(event_type, txn_id, NULL);
+
+    if (!ret) {
+        glusterd_friend_sm();
+        glusterd_op_sm();
+    }
+
+    free(rsp.op_errstr);  // malloced by xdr
+    /* Same cleanup as __glusterd_stage_op_cbk(): the xdr buffer is freed
+       here unless it was handed to the dictionary through extra_stdfree.
+       This covers a failed dict_unserialize() and a failed dict_new(). */
+    if (dict) {
+        if (!dict->extra_stdfree && rsp.dict.dict_val)
+            free(rsp.dict.dict_val);  // malloced by xdr
+        dict_unref(dict);
+    } else {
+        free(rsp.dict.dict_val);  // malloced by xdr
+    }
+    GF_FREE(frame->cookie);
+    GLUSTERD_STACK_DESTROY(((call_frame_t *)myframe));
+    return ret;
+}
+
+/*
+ * RPC reply callback passed to glusterd_submit_request() by
+ * glusterd_commit_op().  It forwards its arguments, together with the real
+ * handler __glusterd_commit_op_cbk(), to glusterd_big_locked_cbk() and
+ * returns that call's result.
+ */
+int32_t
+glusterd_commit_op_cbk(struct rpc_req *req, struct iovec *iov, int count,
+                       void *myframe)
+{
+    int32_t status = 0;
+
+    status = glusterd_big_locked_cbk(req, iov, count, myframe,
+                                     __glusterd_commit_op_cbk);
+    return status;
+}
+
+/*
+ * Send a GLUSTERD_PROBE_QUERY request to a peer.
+ *
+ * data is used as a dict_t and read for "hostname" (required), "port"
+ * (optional, GF_DEFAULT_BASE_PORT is used when absent) and "peerinfo"
+ * (required pointer to the glusterd_peerinfo_t whose rpc/peer program is
+ * used).  The request carries MY_UUID, a copy of the hostname and the port;
+ * glusterd_probe_cbk is registered as the reply handler.
+ *
+ * Returns -1 when frame, this or data is NULL, the dict lookup result when
+ * a required key is missing, otherwise the result of
+ * glusterd_submit_request().
+ */
+int32_t
+glusterd_rpc_probe(call_frame_t *frame, xlator_t *this, void *data)
+{
+    gd1_mgmt_probe_req probe_req = {
+        {0},
+    };
+    int ret = 0;
+    int peer_port = 0;
+    char *peer_host = NULL;
+    glusterd_peerinfo_t *peer = NULL;
+    glusterd_conf_t *priv = NULL;
+    dict_t *args = NULL;
+
+    if (!(frame && this && data)) {
+        gf_smsg(THIS->name, GF_LOG_ERROR, errno, GD_MSG_INVALID_ARGUMENT, NULL);
+        ret = -1;
+        goto out;
+    }
+
+    args = data;
+    priv = this->private;
+    GF_ASSERT(priv);
+
+    ret = dict_get_strn(args, "hostname", SLEN("hostname"), &peer_host);
+    if (ret) {
+        gf_smsg(this->name, GF_LOG_ERROR, errno, GD_MSG_DICT_GET_FAILED,
+                "Key=hostname", NULL);
+        goto out;
+    }
+
+    /* A missing port is not fatal: fall back to the default base port. */
+    if (dict_get_int32n(args, "port", SLEN("port"), &peer_port)) {
+        gf_smsg(this->name, GF_LOG_DEBUG, errno, GD_MSG_DICT_GET_FAILED,
+                "Key=port", NULL);
+        peer_port = GF_DEFAULT_BASE_PORT;
+    }
+
+    ret = dict_get_ptr(args, "peerinfo", VOID(&peer));
+    if (ret) {
+        gf_smsg(this->name, GF_LOG_ERROR, errno, GD_MSG_DICT_GET_FAILED,
+                "Key=peerinfo", NULL);
+        goto out;
+    }
+
+    gf_uuid_copy(probe_req.uuid, MY_UUID);
+    probe_req.port = peer_port;
+    probe_req.hostname = gf_strdup(peer_host);
+
+    ret = glusterd_submit_request(peer->rpc, &probe_req, frame, peer->peer,
+                                  GLUSTERD_PROBE_QUERY, NULL, this,
+                                  glusterd_probe_cbk,
+                                  (xdrproc_t)xdr_gd1_mgmt_probe_req);
+
+out:
+    /* The hostname copy is owned by this function on every path. */
+    GF_FREE(probe_req.hostname);
+    gf_msg_debug(this ? this->name : "glusterd", 0, "Returning %d", ret);
+    return ret;
+}
+
+/*
+ * Send a GLUSTERD_FRIEND_ADD request to the peer named by a friend state
+ * machine event.
+ *
+ * data is used as a glusterd_friend_sm_event_t; the peer is looked up by
+ * event->peerid / event->peername.  The request carries MY_UUID, the peer's
+ * hostname and port, and in req.vols a serialized peer_data dictionary
+ * holding "hostname_in_cluster", the missed-snapshot and snapshot lists
+ * (only when priv->op_version >= GD_OP_VERSION_3_6_0) and the volumes to
+ * export.  glusterd_friend_add_cbk is registered as the reply handler.
+ *
+ * Returns -1 for NULL arguments, an unknown peer or a failed dictionary
+ * allocation; otherwise the result of the first failing helper or of
+ * glusterd_submit_request().
+ */
+int32_t
+glusterd_rpc_friend_add(call_frame_t *frame, xlator_t *this, void *data)
+{
+    gd1_mgmt_friend_req req = {
+        {0},
+    };
+    int ret = 0;
+    glusterd_peerinfo_t *peerinfo = NULL;
+    glusterd_conf_t *priv = NULL;
+    glusterd_friend_sm_event_t *event = NULL;
+    dict_t *peer_data = NULL;
+
+    if (!frame || !this || !data) {
+        gf_smsg(THIS->name, GF_LOG_ERROR, errno, GD_MSG_INVALID_ARGUMENT, NULL);
+        ret = -1;
+        goto out;
+    }
+
+    event = data;
+    priv = this->private;
+
+    GF_ASSERT(priv);
+
+    RCU_READ_LOCK;
+
+    peerinfo = glusterd_peerinfo_find(event->peerid, event->peername);
+    if (!peerinfo) {
+        RCU_READ_UNLOCK;
+        ret = -1;
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_PEER_NOT_FOUND,
+               "Could not find peer %s(%s)", event->peername,
+               uuid_utoa(event->peerid));
+        goto out;
+    }
+
+    req.hostname = gf_strdup(peerinfo->hostname);
+    req.port = peerinfo->port;
+
+    RCU_READ_UNLOCK;
+
+    /* NOTE(review): peerinfo is still dereferenced below (hostname, rpc,
+     * peer) after the RCU read-side section has ended -- confirm the peer
+     * cannot be released in between. */
+
+    gf_uuid_copy(req.uuid, MY_UUID);
+
+    peer_data = dict_new();
+    if (!peer_data) {
+        gf_smsg(this->name, GF_LOG_ERROR, ENOMEM, GD_MSG_DICT_CREATE_FAIL,
+                NULL);
+        errno = ENOMEM;
+        /* ret is still 0 here; without this the failure would be reported
+         * to the caller as success although nothing was sent. */
+        ret = -1;
+        goto out;
+    }
+
+    ret = dict_set_dynstr_with_alloc(peer_data, "hostname_in_cluster",
+                                     peerinfo->hostname);
+    if (ret) {
+        gf_msg(this->name, GF_LOG_ERROR, errno, GD_MSG_DICT_SET_FAILED,
+               "Unable to add hostname of the peer");
+        goto out;
+    }
+
+    if (priv->op_version >= GD_OP_VERSION_3_6_0) {
+        ret = glusterd_add_missed_snaps_to_export_dict(peer_data);
+        if (ret) {
+            gf_msg(this->name, GF_LOG_ERROR, 0,
+                   GD_MSG_MISSED_SNAP_LIST_STORE_FAIL,
+                   "Unable to add list of missed snapshots "
+                   "in the peer_data dict for handshake");
+            goto out;
+        }
+
+        ret = glusterd_add_snapshots_to_export_dict(peer_data);
+        if (ret) {
+            gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_SNAP_LIST_SET_FAIL,
+                   "Unable to add list of snapshots "
+                   "in the peer_data dict for handshake");
+            goto out;
+        }
+    }
+
+    /* Do not add any key-value pair to the peer_data dictionary after
+     * calling this function.
+     */
+    ret = glusterd_add_volumes_to_export_dict(peer_data, &req.vols.vols_val,
+                                              &req.vols.vols_len);
+    if (ret) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_DICT_SET_FAILED,
+               "Unable to add list of volumes "
+               "in the peer_data dict for handshake");
+        goto out;
+    }
+
+    /* The call above left req.vols empty: serialize peer_data here. */
+    if (!req.vols.vols_len) {
+        ret = dict_allocate_and_serialize(peer_data, &req.vols.vols_val,
+                                          &req.vols.vols_len);
+        if (ret) {
+            gf_smsg(this->name, GF_LOG_ERROR, errno,
+                    GD_MSG_DICT_ALLOC_AND_SERL_LENGTH_GET_FAIL, NULL);
+            goto out;
+        }
+    }
+
+    ret = glusterd_submit_request(
+        peerinfo->rpc, &req, frame, peerinfo->peer, GLUSTERD_FRIEND_ADD, NULL,
+        this, glusterd_friend_add_cbk, (xdrproc_t)xdr_gd1_mgmt_friend_req);
+
+out:
+    GF_FREE(req.vols.vols_val);
+    GF_FREE(req.hostname);
+
+    if (peer_data)
+        dict_unref(peer_data);
+
+    gf_msg_debug(this ? this->name : "glusterd", 0, "Returning %d", ret);
+    return ret;
+}
+
+/*
+ * Send a GLUSTERD_FRIEND_REMOVE request to the peer named by a friend state
+ * machine event.
+ *
+ * data is used as a glusterd_friend_sm_event_t; the peer is looked up by
+ * event->peerid / event->peername and the request is submitted while the
+ * RCU read-side section is still held.  glusterd_friend_remove_cbk is
+ * registered as the reply handler.
+ *
+ * Returns -1 for NULL arguments or an unknown peer, otherwise the result of
+ * glusterd_submit_request().
+ */
+int32_t
+glusterd_rpc_friend_remove(call_frame_t *frame, xlator_t *this, void *data)
+{
+    gd1_mgmt_friend_req remove_req = {
+        {0},
+    };
+    int ret = 0;
+    glusterd_peerinfo_t *peer = NULL;
+    glusterd_conf_t *priv = NULL;
+    glusterd_friend_sm_event_t *ev = NULL;
+
+    if (!(frame && this && data)) {
+        ret = -1;
+        goto out;
+    }
+
+    ev = data;
+    priv = this->private;
+    GF_ASSERT(priv);
+
+    RCU_READ_LOCK;
+
+    peer = glusterd_peerinfo_find(ev->peerid, ev->peername);
+    if (peer) {
+        gf_uuid_copy(remove_req.uuid, MY_UUID);
+        remove_req.hostname = gf_strdup(peer->hostname);
+        remove_req.port = peer->port;
+
+        ret = glusterd_submit_request(peer->rpc, &remove_req, frame,
+                                      peer->peer, GLUSTERD_FRIEND_REMOVE, NULL,
+                                      this, glusterd_friend_remove_cbk,
+                                      (xdrproc_t)xdr_gd1_mgmt_friend_req);
+
+        RCU_READ_UNLOCK;
+    } else {
+        /* Leave the read-side section before logging the failure. */
+        RCU_READ_UNLOCK;
+        ret = -1;
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_PEER_NOT_FOUND,
+               "Could not find peer %s(%s)", ev->peername,
+               uuid_utoa(ev->peerid));
+    }
+
+out:
+    GF_FREE(remove_req.hostname);
+
+    gf_msg_debug(this ? this->name : "glusterd", 0, "Returning %d", ret);
+    return ret;
+}
+
+/*
+ * Send a GLUSTERD_FRIEND_UPDATE request to a peer.
+ *
+ * data is used as a dict_t ("friends").  Its "peerinfo" pointer selects the
+ * destination and is deleted from the dictionary before the dictionary is
+ * serialized into the request.  The frame argument is not used; a frame
+ * created here is passed to glusterd_submit_request() with
+ * glusterd_friend_update_cbk as the reply handler.
+ *
+ * Returns 0 when data is NULL, -1 when no frame could be created, the
+ * failing helper's result when the lookup or serialization fails, otherwise
+ * the result of glusterd_submit_request().
+ */
+int32_t
+glusterd_rpc_friend_update(call_frame_t *frame, xlator_t *this, void *data)
+{
+    gd1_mgmt_friend_update req = {
+        {0},
+    };
+    int ret = 0;
+    glusterd_conf_t *priv = NULL;
+    dict_t *friends = NULL;
+    call_frame_t *dummy_frame = NULL;
+    glusterd_peerinfo_t *peerinfo = NULL;
+
+    priv = this->private;
+    GF_ASSERT(priv);
+
+    friends = data;
+    if (!friends)
+        goto out;
+
+    ret = dict_get_ptr(friends, "peerinfo", VOID(&peerinfo));
+    if (ret)
+        goto out;
+    /* Don't want to send the pointer over */
+    dict_deln(friends, "peerinfo", SLEN("peerinfo"));
+
+    ret = dict_allocate_and_serialize(friends, &req.friends.friends_val,
+                                      &req.friends.friends_len);
+    if (ret)
+        goto out;
+
+    gf_uuid_copy(req.uuid, MY_UUID);
+
+    dummy_frame = create_frame(this, this->ctx->pool);
+    /* Never submit with a NULL frame; glusterd_cluster_lock() and
+     * glusterd_cluster_unlock() guard create_frame() the same way. */
+    if (!dummy_frame) {
+        ret = -1;
+        goto out;
+    }
+
+    ret = glusterd_submit_request(peerinfo->rpc, &req, dummy_frame,
+                                  peerinfo->peer, GLUSTERD_FRIEND_UPDATE, NULL,
+                                  this, glusterd_friend_update_cbk,
+                                  (xdrproc_t)xdr_gd1_mgmt_friend_update);
+
+out:
+    GF_FREE(req.friends.friends_val);
+
+    if (ret && dummy_frame)
+        STACK_DESTROY(dummy_frame->root);
+
+    gf_msg_debug(this ? this->name : "glusterd", 0, "Returning %d", ret);
+    return ret;
+}
+
+/*
+ * Send a GLUSTERD_MGMT_CLUSTER_LOCK request to a peer.
+ *
+ * data is used as the destination glusterd_peerinfo_t.  The frame argument
+ * is not used; a frame created here is passed to glusterd_submit_request()
+ * with glusterd_cluster_lock_cbk as the reply handler, and is destroyed
+ * here when the submission reports failure.
+ *
+ * Returns -1 when this is NULL or no frame could be created, otherwise the
+ * result of glusterd_submit_request().
+ */
+int32_t
+glusterd_cluster_lock(call_frame_t *frame, xlator_t *this, void *data)
+{
+    gd1_mgmt_cluster_lock_req lock_req = {
+        {0},
+    };
+    int ret = -1;
+    glusterd_peerinfo_t *peer = NULL;
+    glusterd_conf_t *priv = NULL;
+    call_frame_t *rpc_frame = NULL;
+
+    if (!this)
+        goto out;
+
+    peer = data;
+    priv = this->private;
+    GF_ASSERT(priv);
+
+    glusterd_get_uuid(&lock_req.uuid);
+
+    /* ret keeps its initial -1 when the frame cannot be created. */
+    rpc_frame = create_frame(this, this->ctx->pool);
+    if (rpc_frame)
+        ret = glusterd_submit_request(
+            peer->rpc, &lock_req, rpc_frame, peer->mgmt,
+            GLUSTERD_MGMT_CLUSTER_LOCK, NULL, this, glusterd_cluster_lock_cbk,
+            (xdrproc_t)xdr_gd1_mgmt_cluster_lock_req);
+
+out:
+    gf_msg_debug(this ? this->name : "glusterd", 0, "Returning %d", ret);
+
+    if (ret && rpc_frame)
+        STACK_DESTROY(rpc_frame->root);
+    return ret;
+}
+
+/*
+ * Send a GLUSTERD_MGMT_V3_LOCK request to a peer.
+ *
+ * data is used as a dict_t.  Its "peerinfo" pointer selects the destination
+ * and is deleted before the dictionary is serialized into the request; its
+ * "transaction_id" is copied into the request and into a newly allocated
+ * frame->cookie.  A frame is created when the caller passes none.
+ * glusterd_mgmt_v3_lock_peers_cbk is registered as the reply handler.
+ *
+ * The dictionary is unref'ed and the serialized buffer freed before
+ * returning, on every path that got past the NULL check on this.
+ *
+ * Returns -1 when this is NULL or an allocation fails, the failing helper's
+ * result otherwise, or the result of glusterd_submit_request().
+ */
+int32_t
+glusterd_mgmt_v3_lock_peers(call_frame_t *frame, xlator_t *this, void *data)
+{
+    gd1_mgmt_v3_lock_req lock_req = {
+        {0},
+    };
+    int ret = -1;
+    glusterd_peerinfo_t *peer = NULL;
+    glusterd_conf_t *priv = NULL;
+    dict_t *payload = NULL;
+    uuid_t *txn = NULL;
+
+    if (!this)
+        goto out;
+
+    payload = data;
+    priv = this->private;
+    GF_ASSERT(priv);
+
+    ret = dict_get_ptr(payload, "peerinfo", VOID(&peer));
+    if (ret) {
+        gf_smsg(this->name, GF_LOG_ERROR, errno, GD_MSG_DICT_GET_FAILED,
+                "Key=peerinfo", NULL);
+        goto out;
+    }
+
+    // peerinfo should not be in payload
+    dict_deln(payload, "peerinfo", SLEN("peerinfo"));
+
+    glusterd_get_uuid(&lock_req.uuid);
+
+    ret = dict_allocate_and_serialize(payload, &lock_req.dict.dict_val,
+                                      &lock_req.dict.dict_len);
+    if (ret) {
+        gf_smsg(this->name, GF_LOG_ERROR, 0,
+                GD_MSG_DICT_ALLOC_AND_SERL_LENGTH_GET_FAIL, NULL);
+        goto out;
+    }
+
+    /* Sending valid transaction ID to peers */
+    ret = dict_get_bin(payload, "transaction_id", (void **)&txn);
+    if (ret) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_TRANS_ID_GET_FAIL,
+               "Failed to get transaction id.");
+        goto out;
+    }
+    gf_msg_debug(this->name, 0, "Transaction_id = %s", uuid_utoa(*txn));
+    gf_uuid_copy(lock_req.txn_id, *txn);
+
+    /* Only create a frame when the caller did not supply one. */
+    if (!frame) {
+        frame = create_frame(this, this->ctx->pool);
+        if (!frame) {
+            ret = -1;
+            goto out;
+        }
+    }
+
+    /* The reply callback reads the transaction id back from the cookie. */
+    frame->cookie = GF_MALLOC(sizeof(uuid_t), gf_common_mt_uuid_t);
+    if (!frame->cookie) {
+        gf_smsg(this->name, GF_LOG_ERROR, errno, GD_MSG_NO_MEMORY, NULL);
+        ret = -1;
+        goto out;
+    }
+    gf_uuid_copy(frame->cookie, lock_req.txn_id);
+
+    ret = glusterd_submit_request(peer->rpc, &lock_req, frame, peer->mgmt_v3,
+                                  GLUSTERD_MGMT_V3_LOCK, NULL, this,
+                                  glusterd_mgmt_v3_lock_peers_cbk,
+                                  (xdrproc_t)xdr_gd1_mgmt_v3_lock_req);
+out:
+    gf_msg_debug(this ? this->name : "glusterd", 0, "Returning %d", ret);
+    if (payload)
+        dict_unref(payload);
+    if (lock_req.dict.dict_val)
+        GF_FREE(lock_req.dict.dict_val);
+    return ret;
+}
+
+/*
+ * Send a GLUSTERD_MGMT_V3_UNLOCK request to a peer.
+ *
+ * data is used as a dict_t.  Its "peerinfo" pointer selects the destination
+ * and is deleted before the dictionary is serialized into the request; its
+ * "transaction_id" is copied into the request and into a newly allocated
+ * frame->cookie.  A frame is created when the caller passes none.
+ * glusterd_mgmt_v3_unlock_peers_cbk is registered as the reply handler.
+ *
+ * The dictionary is unref'ed and the serialized buffer freed before
+ * returning, on every path that got past the NULL check on this.
+ *
+ * Returns -1 when this is NULL or an allocation fails, the failing helper's
+ * result otherwise, or the result of glusterd_submit_request().
+ */
+int32_t
+glusterd_mgmt_v3_unlock_peers(call_frame_t *frame, xlator_t *this, void *data)
+{
+    gd1_mgmt_v3_unlock_req unlock_req = {
+        {0},
+    };
+    int ret = -1;
+    glusterd_peerinfo_t *peer = NULL;
+    glusterd_conf_t *priv = NULL;
+    dict_t *payload = NULL;
+    uuid_t *txn = NULL;
+
+    if (!this)
+        goto out;
+
+    payload = data;
+    priv = this->private;
+    GF_ASSERT(priv);
+
+    ret = dict_get_ptr(payload, "peerinfo", VOID(&peer));
+    if (ret) {
+        gf_smsg(this->name, GF_LOG_ERROR, errno, GD_MSG_DICT_GET_FAILED,
+                "Key=peerinfo", NULL);
+        goto out;
+    }
+
+    // peerinfo should not be in payload
+    dict_deln(payload, "peerinfo", SLEN("peerinfo"));
+
+    glusterd_get_uuid(&unlock_req.uuid);
+
+    ret = dict_allocate_and_serialize(payload, &unlock_req.dict.dict_val,
+                                      &unlock_req.dict.dict_len);
+    if (ret) {
+        gf_smsg(this->name, GF_LOG_ERROR, errno,
+                GD_MSG_DICT_ALLOC_AND_SERL_LENGTH_GET_FAIL, NULL);
+        goto out;
+    }
+
+    /* Sending valid transaction ID to peers */
+    ret = dict_get_bin(payload, "transaction_id", (void **)&txn);
+    if (ret) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_TRANS_ID_GET_FAIL,
+               "Failed to get transaction id.");
+        goto out;
+    }
+    gf_msg_debug(this->name, 0, "Transaction_id = %s", uuid_utoa(*txn));
+    gf_uuid_copy(unlock_req.txn_id, *txn);
+
+    /* Only create a frame when the caller did not supply one. */
+    if (!frame) {
+        frame = create_frame(this, this->ctx->pool);
+        if (!frame) {
+            ret = -1;
+            goto out;
+        }
+    }
+
+    /* The reply callback reads the transaction id back from the cookie. */
+    frame->cookie = GF_MALLOC(sizeof(uuid_t), gf_common_mt_uuid_t);
+    if (!frame->cookie) {
+        gf_smsg(this->name, GF_LOG_ERROR, errno, GD_MSG_NO_MEMORY, NULL);
+        ret = -1;
+        goto out;
+    }
+    gf_uuid_copy(frame->cookie, unlock_req.txn_id);
+
+    ret = glusterd_submit_request(peer->rpc, &unlock_req, frame,
+                                  peer->mgmt_v3, GLUSTERD_MGMT_V3_UNLOCK, NULL,
+                                  this, glusterd_mgmt_v3_unlock_peers_cbk,
+                                  (xdrproc_t)xdr_gd1_mgmt_v3_unlock_req);
+out:
+    gf_msg_debug(this ? this->name : "glusterd", 0, "Returning %d", ret);
+    if (payload)
+        dict_unref(payload);
+
+    if (unlock_req.dict.dict_val)
+        GF_FREE(unlock_req.dict.dict_val);
+    return ret;
+}
+
+/*
+ * Send a GLUSTERD_MGMT_CLUSTER_UNLOCK request to a peer.
+ *
+ * data is used as the destination glusterd_peerinfo_t.  The frame argument
+ * is not used; a frame created here is passed to glusterd_submit_request()
+ * with glusterd_cluster_unlock_cbk as the reply handler, and is destroyed
+ * here when the submission reports failure.
+ *
+ * Returns -1 when this is NULL or no frame could be created, otherwise the
+ * result of glusterd_submit_request().
+ */
+int32_t
+glusterd_cluster_unlock(call_frame_t *frame, xlator_t *this, void *data)
+{
+    /* NOTE(review): the request is declared as gd1_mgmt_cluster_lock_req
+     * but encoded with xdr_gd1_mgmt_cluster_unlock_req -- presumably both
+     * layouts are identical; confirm against the XDR definitions. */
+    gd1_mgmt_cluster_lock_req unlock_req = {
+        {0},
+    };
+    int ret = -1;
+    glusterd_peerinfo_t *peer = NULL;
+    glusterd_conf_t *priv = NULL;
+    call_frame_t *rpc_frame = NULL;
+
+    if (!this) {
+        ret = -1;
+        goto out;
+    }
+
+    peer = data;
+    priv = this->private;
+    GF_ASSERT(priv);
+
+    glusterd_get_uuid(&unlock_req.uuid);
+
+    /* ret keeps its initial -1 when the frame cannot be created. */
+    rpc_frame = create_frame(this, this->ctx->pool);
+    if (rpc_frame)
+        ret = glusterd_submit_request(
+            peer->rpc, &unlock_req, rpc_frame, peer->mgmt,
+            GLUSTERD_MGMT_CLUSTER_UNLOCK, NULL, this,
+            glusterd_cluster_unlock_cbk,
+            (xdrproc_t)xdr_gd1_mgmt_cluster_unlock_req);
+
+out:
+    gf_msg_debug(this ? this->name : "glusterd", 0, "Returning %d", ret);
+
+    if (ret && rpc_frame)
+        STACK_DESTROY(rpc_frame->root);
+
+    return ret;
+}
+
+/*
+ * Send a GLUSTERD_MGMT_STAGE_OP request to a peer.
+ *
+ * data is used as a dict_t.  Its "peerinfo" pointer selects the destination
+ * and is deleted before the dictionary is serialized into req.buf; its
+ * "transaction_id" is copied into a newly allocated frame->cookie.  The
+ * operation code comes from glusterd_op_get_op().  A frame is created when
+ * the caller passes none.  glusterd_stage_op_cbk is registered as the reply
+ * handler.
+ *
+ * Returns -1 when this is NULL or an allocation fails, the failing helper's
+ * result otherwise, or the result of glusterd_submit_request().
+ */
+int32_t
+glusterd_stage_op(call_frame_t *frame, xlator_t *this, void *data)
+{
+    gd1_mgmt_stage_op_req stage_req = {
+        {
+            0,
+        },
+    };
+    int ret = -1;
+    glusterd_peerinfo_t *peer = NULL;
+    glusterd_conf_t *priv = NULL;
+    dict_t *payload = NULL;
+    uuid_t *txn = NULL;
+
+    if (!this)
+        goto out;
+
+    payload = data;
+    priv = this->private;
+    GF_ASSERT(priv);
+
+    ret = dict_get_ptr(payload, "peerinfo", VOID(&peer));
+    if (ret) {
+        gf_smsg(this->name, GF_LOG_ERROR, errno, GD_MSG_DICT_GET_FAILED,
+                "Key=peerinfo", NULL);
+        goto out;
+    }
+
+    // peerinfo should not be in payload
+    dict_deln(payload, "peerinfo", SLEN("peerinfo"));
+
+    glusterd_get_uuid(&stage_req.uuid);
+    stage_req.op = glusterd_op_get_op();
+
+    ret = dict_allocate_and_serialize(payload, &stage_req.buf.buf_val,
+                                      &stage_req.buf.buf_len);
+    if (ret) {
+        gf_smsg(this->name, GF_LOG_ERROR, errno,
+                GD_MSG_DICT_ALLOC_AND_SERL_LENGTH_GET_FAIL, NULL);
+        goto out;
+    }
+
+    /* Sending valid transaction ID to peers */
+    ret = dict_get_bin(payload, "transaction_id", (void **)&txn);
+    if (ret) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_TRANS_ID_GET_FAIL,
+               "Failed to get transaction id.");
+        goto out;
+    }
+    gf_msg_debug(this->name, 0, "Transaction_id = %s", uuid_utoa(*txn));
+
+    /* Only create a frame when the caller did not supply one. */
+    if (!frame) {
+        frame = create_frame(this, this->ctx->pool);
+        if (!frame) {
+            ret = -1;
+            goto out;
+        }
+    }
+
+    /* The reply callback reads the transaction id back from the cookie. */
+    frame->cookie = GF_MALLOC(sizeof(uuid_t), gf_common_mt_uuid_t);
+    if (!frame->cookie) {
+        gf_smsg(this->name, GF_LOG_ERROR, errno, GD_MSG_NO_MEMORY, NULL);
+        ret = -1;
+        goto out;
+    }
+    gf_uuid_copy(frame->cookie, *txn);
+
+    ret = glusterd_submit_request(peer->rpc, &stage_req, frame, peer->mgmt,
+                                  GLUSTERD_MGMT_STAGE_OP, NULL, this,
+                                  glusterd_stage_op_cbk,
+                                  (xdrproc_t)xdr_gd1_mgmt_stage_op_req);
+
+out:
+    if (stage_req.buf.buf_val)
+        GF_FREE(stage_req.buf.buf_val);
+
+    gf_msg_debug(this ? this->name : "glusterd", 0, "Returning %d", ret);
+    return ret;
+}
+
+/*
+ * Send a GLUSTERD_MGMT_COMMIT_OP request to a peer.
+ *
+ * data is used as a dict_t.  Its "peerinfo" pointer selects the destination
+ * and is deleted before the dictionary is serialized into req.buf; its
+ * "transaction_id" is copied into a newly allocated frame->cookie.  The
+ * operation code comes from glusterd_op_get_op().  A frame is created when
+ * the caller passes none.  glusterd_commit_op_cbk is registered as the
+ * reply handler.
+ *
+ * Returns -1 when this is NULL or an allocation fails, the failing helper's
+ * result otherwise, or the result of glusterd_submit_request().
+ */
+int32_t
+glusterd_commit_op(call_frame_t *frame, xlator_t *this, void *data)
+{
+    gd1_mgmt_commit_op_req commit_req = {
+        {
+            0,
+        },
+    };
+    int ret = -1;
+    glusterd_peerinfo_t *peer = NULL;
+    glusterd_conf_t *priv = NULL;
+    dict_t *payload = NULL;
+    uuid_t *txn = NULL;
+
+    if (!this)
+        goto out;
+
+    payload = data;
+    priv = this->private;
+    GF_ASSERT(priv);
+
+    ret = dict_get_ptr(payload, "peerinfo", VOID(&peer));
+    if (ret) {
+        gf_smsg(this->name, GF_LOG_ERROR, errno, GD_MSG_DICT_GET_FAILED,
+                "Key=peerinfo", NULL);
+        goto out;
+    }
+
+    // peerinfo should not be in payload
+    dict_deln(payload, "peerinfo", SLEN("peerinfo"));
+
+    glusterd_get_uuid(&commit_req.uuid);
+    commit_req.op = glusterd_op_get_op();
+
+    ret = dict_allocate_and_serialize(payload, &commit_req.buf.buf_val,
+                                      &commit_req.buf.buf_len);
+    if (ret) {
+        gf_smsg(this->name, GF_LOG_ERROR, errno,
+                GD_MSG_DICT_ALLOC_AND_SERL_LENGTH_GET_FAIL, NULL);
+        goto out;
+    }
+
+    /* Sending valid transaction ID to peers */
+    ret = dict_get_bin(payload, "transaction_id", (void **)&txn);
+    if (ret) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_TRANS_ID_GET_FAIL,
+               "Failed to get transaction id.");
+        goto out;
+    }
+    gf_msg_debug(this->name, 0, "Transaction_id = %s", uuid_utoa(*txn));
+
+    /* Only create a frame when the caller did not supply one. */
+    if (!frame) {
+        frame = create_frame(this, this->ctx->pool);
+        if (!frame) {
+            ret = -1;
+            goto out;
+        }
+    }
+
+    /* The reply callback reads the transaction id back from the cookie. */
+    frame->cookie = GF_MALLOC(sizeof(uuid_t), gf_common_mt_uuid_t);
+    if (!frame->cookie) {
+        gf_smsg(this->name, GF_LOG_ERROR, errno, GD_MSG_NO_MEMORY, NULL);
+        ret = -1;
+        goto out;
+    }
+    gf_uuid_copy(frame->cookie, *txn);
+
+    ret = glusterd_submit_request(peer->rpc, &commit_req, frame, peer->mgmt,
+                                  GLUSTERD_MGMT_COMMIT_OP, NULL, this,
+                                  glusterd_commit_op_cbk,
+                                  (xdrproc_t)xdr_gd1_mgmt_commit_op_req);
+
+out:
+    if (commit_req.buf.buf_val)
+        GF_FREE(commit_req.buf.buf_val);
+
+    gf_msg_debug(this ? this->name : "glusterd", 0, "Returning %d", ret);
+    return ret;
+}
+
+/*
+ * Handle the reply to a brick op request sent by glusterd_brick_op().
+ *
+ * The reply is decoded into a gd1_mgmt_brick_op_rsp and its optional
+ * serialized output is unserialized into a dictionary.  A
+ * glusterd_op_brick_rsp_ctx_t describing the outcome is then injected into
+ * the op state machine as GD_OP_EVENT_RCVD_ACC or GD_OP_EVENT_RCVD_RJT.
+ * The transaction id is read from "transaction_id" in the request context's
+ * dictionary; txn_id starts out pointing at priv->global_txn_id.
+ *
+ * req     - completed RPC request; rpc_status == -1 is handled as a reject
+ * iov     - encoded reply
+ * count   - not used by this function
+ * myframe - frame of the request: local is the glusterd_req_ctx_t, cookie
+ *           the glusterd_pending_node_t; the stack is destroyed on return
+ *
+ * Returns the result of glusterd_op_sm_inject_event(), or -1 when the event
+ * context could not be allocated.
+ */
+int32_t
+__glusterd_brick_op_cbk(struct rpc_req *req, struct iovec *iov, int count,
+                        void *myframe)
+{
+    gd1_mgmt_brick_op_rsp rsp = {0};
+    int ret = -1;
+    int32_t op_ret = -1;
+    glusterd_op_sm_event_type_t event_type = GD_OP_EVENT_NONE;
+    call_frame_t *frame = NULL;
+    glusterd_op_brick_rsp_ctx_t *ev_ctx = NULL;
+    dict_t *dict = NULL;
+    int index = 0;
+    glusterd_req_ctx_t *req_ctx = NULL;
+    glusterd_pending_node_t *node = NULL;
+    xlator_t *this = NULL;
+    uuid_t *txn_id = NULL;
+    glusterd_conf_t *priv = NULL;
+
+    this = THIS;
+    GF_ASSERT(this);
+    priv = this->private;
+    GF_ASSERT(priv);
+    GF_ASSERT(req);
+
+    txn_id = &priv->global_txn_id;
+    frame = myframe;
+    req_ctx = frame->local;
+
+    if (-1 == req->rpc_status) {
+        /* No reply to decode: op_ret keeps its initial -1. */
+        rsp.op_ret = -1;
+        rsp.op_errno = EINVAL;
+        /* Use the standard allocator so that op_errstr can always be
+           released with free(), whether it was set here or by xdr. */
+        rsp.op_errstr = strdup("error");
+        event_type = GD_OP_EVENT_RCVD_RJT;
+        goto out;
+    }
+
+    ret = xdr_to_generic(*iov, &rsp, (xdrproc_t)xdr_gd1_mgmt_brick_op_rsp);
+    if (ret < 0) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_RES_DECODE_FAIL,
+               "Failed to decode brick op "
+               "response received");
+        rsp.op_ret = -1;
+        rsp.op_errno = EINVAL;
+        rsp.op_errstr = strdup("Unable to decode brick op response");
+        event_type = GD_OP_EVENT_RCVD_RJT;
+        goto out;
+    }
+
+    if (rsp.output.output_len) {
+        /* Unserialize the dictionary */
+        dict = dict_new();
+
+        ret = dict_unserialize(rsp.output.output_val, rsp.output.output_len,
+                               &dict);
+        if (ret < 0) {
+            gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_DICT_UNSERIALIZE_FAIL,
+                   "Failed to "
+                   "unserialize rsp-buffer to dictionary");
+            event_type = GD_OP_EVENT_RCVD_RJT;
+            goto out;
+        } else {
+            /* Hand the xdr buffer over to the dictionary. */
+            dict->extra_stdfree = rsp.output.output_val;
+        }
+    }
+
+    op_ret = rsp.op_ret;
+
+    /* Add index to rsp_dict for GD_OP_STATUS_VOLUME */
+    if (GD_OP_STATUS_VOLUME == req_ctx->op) {
+        node = frame->cookie;
+        index = node->index;
+        ret = dict_set_int32n(dict, "index", SLEN("index"), index);
+        if (ret) {
+            gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_DICT_SET_FAILED,
+                   "Error setting index on brick status rsp dict");
+            rsp.op_ret = -1;
+            /* The event type is recomputed from op_ret below, so op_ret
+             * itself must carry the failure or the reject becomes ACC. */
+            op_ret = -1;
+            event_type = GD_OP_EVENT_RCVD_RJT;
+            goto out;
+        }
+    }
+out:
+
+    if (req_ctx && req_ctx->dict) {
+        ret = dict_get_bin(req_ctx->dict, "transaction_id", (void **)&txn_id);
+        gf_msg_debug(this->name, -ret, "transaction ID = %s",
+                     uuid_utoa(*txn_id));
+    }
+
+    ev_ctx = GF_CALLOC(1, sizeof(*ev_ctx), gf_gld_mt_brick_rsp_ctx_t);
+    if (ev_ctx) {
+        if (op_ret) {
+            event_type = GD_OP_EVENT_RCVD_RJT;
+            ev_ctx->op_ret = op_ret;
+            ev_ctx->op_errstr = gf_strdup(rsp.op_errstr);
+        } else {
+            event_type = GD_OP_EVENT_RCVD_ACC;
+        }
+        ev_ctx->pending_node = frame->cookie;
+        ev_ctx->rsp_dict = dict;
+        ev_ctx->commit_ctx = frame->local;
+        ret = glusterd_op_sm_inject_event(event_type, txn_id, ev_ctx);
+    } else {
+        /* Nothing was injected: report failure so that the state machines
+         * are not run and the dictionary is released below. */
+        ret = -1;
+    }
+    if (!ret) {
+        glusterd_friend_sm();
+        glusterd_op_sm();
+    }
+
+    /* On failure nothing else holds dict and ev_ctx: release them here. */
+    if (ret) {
+        if (dict) {
+            dict_unref(dict);
+        }
+        if (ev_ctx) {
+            GF_FREE(ev_ctx->op_errstr);
+            GF_FREE(ev_ctx);
+        }
+    }
+    free(rsp.op_errstr);  // malloced by xdr
+    GLUSTERD_STACK_DESTROY(frame);
+    return ret;
+}
+
+/*
+ * RPC reply callback passed to glusterd_submit_request() by
+ * glusterd_brick_op().  It forwards its arguments, together with the real
+ * handler __glusterd_brick_op_cbk(), to glusterd_big_locked_cbk() and
+ * returns that call's result.
+ */
+int32_t
+glusterd_brick_op_cbk(struct rpc_req *req, struct iovec *iov, int count,
+                      void *myframe)
+{
+    int32_t status = 0;
+
+    status = glusterd_big_locked_cbk(req, iov, count, myframe,
+                                     __glusterd_brick_op_cbk);
+    return status;
+}
+
+/*
+ * Send the current brick operation to every node selected for it.
+ *
+ * data is used as a glusterd_req_ctx_t.  glusterd_op_bricks_select() fills
+ * opinfo.pending_bricks; for each pending node a payload is built, a fresh
+ * frame is created (local = data, cookie = the pending node) and the
+ * request is submitted with glusterd_brick_op_cbk as the reply handler.
+ * The frame argument is not used.  The number of requests submitted
+ * successfully is stored in opinfo.brick_pending_count.
+ *
+ * On any exit opinfo is saved with glusterd_set_txn_opinfo(); when ret is
+ * non-zero a GD_OP_EVENT_RCVD_RJT event is injected as well.
+ *
+ * Returns 0 on success, non-zero on failure.
+ */
+int32_t
+glusterd_brick_op(call_frame_t *frame, xlator_t *this, void *data)
+{
+    gd1_mgmt_brick_op_req *req = NULL;
+    int ret = 0;
+    int ret1 = 0;
+    glusterd_conf_t *priv = NULL;
+    call_frame_t *dummy_frame = NULL;
+    char *op_errstr = NULL;
+    int pending_bricks = 0;
+    glusterd_pending_node_t *pending_node;
+    glusterd_req_ctx_t *req_ctx = NULL;
+    struct rpc_clnt *rpc = NULL;
+    dict_t *op_ctx = NULL;
+    uuid_t *txn_id = NULL;
+
+    if (!this) {
+        ret = -1;
+        goto out;
+    }
+    priv = this->private;
+    GF_ASSERT(priv);
+
+    /* Default to the global transaction id; the dict lookup below replaces
+     * it when the request context carries its own. */
+    txn_id = &priv->global_txn_id;
+
+    req_ctx = data;
+    GF_ASSERT(req_ctx);
+    CDS_INIT_LIST_HEAD(&opinfo.pending_bricks);
+
+    ret = dict_get_bin(req_ctx->dict, "transaction_id", (void **)&txn_id);
+    if (ret) {
+        gf_msg(this->name, GF_LOG_WARNING, 0, GD_MSG_BRICK_SELECT_FAIL,
+               "Could not get transaction ID from dict, global"
+               "transaction ID = %s",
+               uuid_utoa(*txn_id));
+    } else {
+        gf_msg_debug(this->name, 0, "transaction ID = %s", uuid_utoa(*txn_id));
+    }
+    ret = glusterd_op_bricks_select(req_ctx->op, req_ctx->dict, &op_errstr,
+                                    &opinfo.pending_bricks, NULL);
+
+    if (ret) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_BRICK_SELECT_FAIL,
+               "Failed to select bricks "
+               "while performing brick op during 'Volume %s'",
+               gd_op_list[opinfo.op]);
+        opinfo.op_errstr = op_errstr;
+        goto out;
+    }
+
+    cds_list_for_each_entry(pending_node, &opinfo.pending_bricks, list)
+    {
+        dummy_frame = create_frame(this, this->ctx->pool);
+        if (!dummy_frame)
+            continue;
+
+        /* Node-level services use the node payload builder; everything
+         * else goes through the brick payload builder. */
+        if ((pending_node->type == GD_NODE_NFS) ||
+            (pending_node->type == GD_NODE_QUOTAD) ||
+            (pending_node->type == GD_NODE_SNAPD) ||
+            (pending_node->type == GD_NODE_SCRUB) ||
+            ((pending_node->type == GD_NODE_SHD) &&
+             (req_ctx->op == GD_OP_STATUS_VOLUME))) {
+            ret = glusterd_node_op_build_payload(
+                req_ctx->op, (gd1_mgmt_brick_op_req **)&req, req_ctx->dict);
+        } else {
+            ret = glusterd_brick_op_build_payload(
+                req_ctx->op, pending_node->node, (gd1_mgmt_brick_op_req **)&req,
+                req_ctx->dict);
+        }
+        if (ret || !req) {
+            gf_msg(this->name, GF_LOG_ERROR, 0,
+                   GD_MSG_BRICK_OP_PAYLOAD_BUILD_FAIL,
+                   "Failed to "
+                   "build op payload during "
+                   "'Volume %s'",
+                   gd_op_list[req_ctx->op]);
+            /* A missing payload is a failure even when the builder
+             * returned 0; otherwise this function would report success. */
+            if (!ret)
+                ret = -1;
+            /* The frame was never handed to the rpc layer and carries no
+             * local or cookie yet, so release it here. */
+            STACK_DESTROY(dummy_frame->root);
+            goto out;
+        }
+
+        dummy_frame->local = data;
+        dummy_frame->cookie = pending_node;
+
+        rpc = glusterd_pending_node_get_rpc(pending_node);
+        if (!rpc) {
+            /* Nothing will be submitted for this node: release the
+             * payload and the frame on every exit from this branch. */
+            GF_FREE(req->input.input_val);
+            GF_FREE(req);
+            req = NULL;
+            GLUSTERD_STACK_DESTROY(dummy_frame);
+
+            if (pending_node->type == GD_NODE_REBALANCE) {
+                /* A missing rebalance rpc is not an error. */
+                opinfo.brick_pending_count = 0;
+                ret = 0;
+
+                op_ctx = glusterd_op_get_ctx();
+                if (!op_ctx)
+                    goto out;
+                glusterd_defrag_volume_node_rsp(req_ctx->dict, NULL, op_ctx);
+
+                goto out;
+            }
+
+            ret = -1;
+            gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_RPC_FAILURE,
+                   "Brick Op failed "
+                   "due to rpc failure.");
+            goto out;
+        }
+
+        ret = glusterd_submit_request(
+            rpc, req, dummy_frame, priv->gfs_mgmt, req->op, NULL, this,
+            glusterd_brick_op_cbk, (xdrproc_t)xdr_gd1_mgmt_brick_op_req);
+        GF_FREE(req->input.input_val);
+        GF_FREE(req);
+        req = NULL;
+
+        if (!ret)
+            pending_bricks++;
+
+        glusterd_pending_node_put_rpc(pending_node);
+    }
+
+    gf_msg_trace(this->name, 0,
+                 "Sent brick op req for operation "
+                 "'Volume %s' to %d bricks",
+                 gd_op_list[req_ctx->op], pending_bricks);
+    opinfo.brick_pending_count = pending_bricks;
+
+out:
+
+    if (ret)
+        opinfo.op_ret = ret;
+
+    ret1 = glusterd_set_txn_opinfo(txn_id, &opinfo);
+    if (ret1)
+        gf_msg(THIS->name, GF_LOG_ERROR, 0, GD_MSG_TRANS_OPINFO_SET_FAIL,
+               "Unable to set "
+               "transaction's opinfo");
+
+    if (ret) {
+        glusterd_op_sm_inject_event(GD_OP_EVENT_RCVD_RJT, txn_id, data);
+        opinfo.op_ret = ret;
+    }
+
+    gf_msg_debug(this ? this->name : "glusterd", 0, "Returning %d", ret);
+    return ret;
+}
+
+/* Brick program: procedure number -> {name, function that sends it}. */
+struct rpc_clnt_procedure gd_brick_actors[GLUSTERD_BRICK_MAXVALUE] = {
+    [GLUSTERD_BRICK_NULL] = {"NULL", NULL},
+    [GLUSTERD_BRICK_OP] = {"BRICK_OP", glusterd_brick_op},
+};
+
+/* Peer (friend) program: probe, friend add/remove/update. */
+struct rpc_clnt_procedure gd_peer_actors[GLUSTERD_FRIEND_MAXVALUE] = {
+    [GLUSTERD_FRIEND_NULL] = {"NULL", NULL},
+    [GLUSTERD_PROBE_QUERY] = {"PROBE_QUERY", glusterd_rpc_probe},
+    [GLUSTERD_FRIEND_ADD] = {"FRIEND_ADD", glusterd_rpc_friend_add},
+    [GLUSTERD_FRIEND_REMOVE] = {"FRIEND_REMOVE", glusterd_rpc_friend_remove},
+    [GLUSTERD_FRIEND_UPDATE] = {"FRIEND_UPDATE", glusterd_rpc_friend_update},
+};
+
+/* Management program: cluster lock/unlock, stage and commit. */
+struct rpc_clnt_procedure gd_mgmt_actors[GLUSTERD_MGMT_MAXVALUE] = {
+    [GLUSTERD_MGMT_NULL] = {"NULL", NULL},
+    [GLUSTERD_MGMT_CLUSTER_LOCK] = {"CLUSTER_LOCK", glusterd_cluster_lock},
+    [GLUSTERD_MGMT_CLUSTER_UNLOCK] = {"CLUSTER_UNLOCK",
+                                      glusterd_cluster_unlock},
+    [GLUSTERD_MGMT_STAGE_OP] = {"STAGE_OP", glusterd_stage_op},
+    [GLUSTERD_MGMT_COMMIT_OP] = {"COMMIT_OP", glusterd_commit_op},
+};
+
+/* Management v3 program: lock and unlock. */
+struct rpc_clnt_procedure gd_mgmt_v3_actors[GLUSTERD_MGMT_V3_MAXVALUE] = {
+    [GLUSTERD_MGMT_V3_NULL] = {"NULL", NULL},
+    [GLUSTERD_MGMT_V3_LOCK] = {"MGMT_V3_LOCK", glusterd_mgmt_v3_lock_peers},
+    [GLUSTERD_MGMT_V3_UNLOCK] = {"MGMT_V3_UNLOCK",
+                                 glusterd_mgmt_v3_unlock_peers},
+};
+
+/* Program descriptor tying GD_MGMT_PROGRAM / GD_MGMT_VERSION to
+ * gd_mgmt_actors. */
+struct rpc_clnt_program gd_mgmt_prog = {
+    .progname = "glusterd mgmt",
+    .prognum = GD_MGMT_PROGRAM,
+    .progver = GD_MGMT_VERSION,
+    .proctable = gd_mgmt_actors,
+    .numproc = GLUSTERD_MGMT_MAXVALUE,
+};
+
+/* Program descriptor tying GD_BRICK_PROGRAM / GD_BRICK_VERSION to
+ * gd_brick_actors. */
+struct rpc_clnt_program gd_brick_prog = {
+    .progname = "brick operations",
+    .prognum = GD_BRICK_PROGRAM,
+    .progver = GD_BRICK_VERSION,
+    .proctable = gd_brick_actors,
+    .numproc = GLUSTERD_BRICK_MAXVALUE,
+};
+
+/* Program descriptor tying GD_FRIEND_PROGRAM / GD_FRIEND_VERSION to
+ * gd_peer_actors. */
+struct rpc_clnt_program gd_peer_prog = {
+    .progname = "Peer mgmt",
+    .prognum = GD_FRIEND_PROGRAM,
+    .progver = GD_FRIEND_VERSION,
+    .proctable = gd_peer_actors,
+    .numproc = GLUSTERD_FRIEND_MAXVALUE,
+};
+
+/* Program descriptor for gd_mgmt_v3_actors; it uses the same program number
+ * as gd_mgmt_prog (GD_MGMT_PROGRAM) with version GD_MGMT_V3_VERSION. */
+struct rpc_clnt_program gd_mgmt_v3_prog = {
+    .progname = "glusterd mgmt v3",
+    .prognum = GD_MGMT_PROGRAM,
+    .progver = GD_MGMT_V3_VERSION,
+    .proctable = gd_mgmt_v3_actors,
+    .numproc = GLUSTERD_MGMT_V3_MAXVALUE,
+};
diff --git a/xlators/mgmt/glusterd/src/glusterd-scrub-svc.c b/xlators/mgmt/glusterd/src/glusterd-scrub-svc.c
new file mode 100644
index 00000000000..c49a0eefba5
--- /dev/null
+++ b/xlators/mgmt/glusterd/src/glusterd-scrub-svc.c
@@ -0,0 +1,207 @@
+/*
+  Copyright (c) 2006-2012 Red Hat, Inc. <http://www.redhat.com>
+  This file is part of GlusterFS.
+
+  This file is licensed to you under your choice of the GNU Lesser
+  General Public License, version 3 or any later version (LGPLv3 or
+  later), or the GNU General Public License, version 2 (GPLv2), in all
+  cases as published by the Free Software Foundation.
+*/
+
+#include <glusterfs/globals.h>
+#include <glusterfs/run.h>
+#include "glusterd.h"
+#include "glusterd-utils.h"
+#include "glusterd-volgen.h"
+#include "glusterd-scrub-svc.h"
+#include "glusterd-svc-helper.h"
+
+char *scrub_svc_name = "scrub"; /* passed to glusterd_svc_init() and to the volfile path builder */
+
+/* Install the scrub manager/start/stop callbacks on @svc. */
+void glusterd_scrubsvc_build(glusterd_svc_t *svc)
+{
+    svc->stop = glusterd_scrubsvc_stop;
+    svc->start = glusterd_scrubsvc_start;
+    svc->manager = glusterd_scrubsvc_manager;
+}
+
+int glusterd_scrubsvc_init(glusterd_svc_t *svc)
+{
+    /* Generic service initialisation, keyed by the scrub service name. */
+    return glusterd_svc_init(svc, scrub_svc_name);
+}
+
+/*
+ * Regenerate the scrub volfile: the path is built from scrub_svc_name and
+ * conf->workdir, the contents by build_scrub_graph through
+ * glusterd_create_global_volfile(), whose result is returned (a non-zero
+ * result is also logged as GD_MSG_VOLFILE_CREATE_FAIL).
+ */
+static int
+glusterd_scrubsvc_create_volfile()
+{
+    xlator_t *this = THIS;
+    glusterd_conf_t *conf = this->private;
+    char volfile[PATH_MAX] = {0};
+    int rc = -1;
+
+    GF_ASSERT(conf);
+
+    glusterd_svc_build_volfile_path(scrub_svc_name, conf->workdir, volfile,
+                                    sizeof(volfile));
+
+    rc = glusterd_create_global_volfile(build_scrub_graph, volfile, NULL);
+    if (rc != 0) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_VOLFILE_CREATE_FAIL,
+               "Failed to create volfile");
+    }
+
+    gf_msg_debug(this->name, 0, "Returning %d", rc);
+
+    return rc;
+}
+
+/*
+ * Manager callback for the scrub daemon.
+ *
+ * Initialises @svc on first use. Then, if glusterd_should_i_stop_bitd()
+ * holds, the daemon is stopped with SIGTERM; otherwise the volfile is
+ * regenerated and the daemon restarted: stop with SIGKILL, start with
+ * @flags, connect svc->conn. @data is not used.
+ *
+ * Returns 0 on success or the first non-zero result; every failure also
+ * emits EVENT_SVC_MANAGER_FAILED.
+ */
+int
+glusterd_scrubsvc_manager(glusterd_svc_t *svc, void *data, int flags)
+{
+    int ret = -EINVAL;
+
+    if (!svc->inited) {
+        ret = glusterd_scrubsvc_init(svc);
+        if (ret) {
+            gf_msg(THIS->name, GF_LOG_ERROR, 0, GD_MSG_SCRUB_INIT_FAIL,
+                   "Failed to init scrub service");
+            goto out;
+        }
+        svc->inited = _gf_true;
+        gf_msg_debug(THIS->name, 0, "scrub service initialized");
+    }
+
+    if (glusterd_should_i_stop_bitd()) {
+        ret = svc->stop(svc, SIGTERM);
+        goto out;
+    }
+
+    /* Restart sequence: each step runs only if the previous returned 0. */
+    ret = glusterd_scrubsvc_create_volfile();
+    if (!ret)
+        ret = svc->stop(svc, SIGKILL);
+    if (!ret)
+        ret = svc->start(svc, flags);
+    if (!ret)
+        ret = glusterd_conn_connect(&(svc->conn));
+
+out:
+    if (ret)
+        gf_event(EVENT_SVC_MANAGER_FAILED, "svc_name=%s", svc->name);
+    gf_msg_debug(THIS->name, 0, "Returning %d", ret);
+    return ret;
+}
+
+/*
+ * Start the scrub daemon via glusterd_svc_start(), handing it a temporary
+ * dict with "cmdarg0" = "--global-timer-wheel". Returns -1 if the dict cannot
+ * be created, else dict_set_str()'s failure or glusterd_svc_start()'s result.
+ */
+int
+glusterd_scrubsvc_start(glusterd_svc_t *svc, int flags)
+{
+    int ret = -1;
+    dict_t *args = dict_new();
+
+    if (args == NULL) {
+        gf_smsg(THIS->name, GF_LOG_ERROR, errno, GD_MSG_DICT_CREATE_FAIL, NULL);
+        return ret;
+    }
+
+    ret = dict_set_str(args, "cmdarg0", "--global-timer-wheel");
+    if (ret == 0)
+        ret = glusterd_svc_start(svc, flags, args);
+
+    dict_unref(args);
+    return ret;
+}
+
+int glusterd_scrubsvc_stop(glusterd_svc_t *svc, int sig)
+{
+    /* Delegates to the generic glusterd_svc_stop() with @sig unchanged. */
+    return glusterd_svc_stop(svc, sig);
+}
+
+/*
+ * Reconcile the running scrub daemon with the current configuration:
+ *  - glusterd_should_i_stop_bitd() true: hand over to the manager;
+ *  - regenerated volfile identical to the old one: nothing to do;
+ *  - only options changed: rewrite the volfile, then notify;
+ *  - topology changed: run the manager with PROC_START_NO_WAIT.
+ *
+ * Returns 0 on success, else the failing step's result (-1 if THIS or its
+ * private data fails validation).
+ */
+int
+glusterd_scrubsvc_reconfigure()
+{
+    int ret = -1;
+    xlator_t *this = THIS;
+    glusterd_conf_t *priv = NULL;
+    gf_boolean_t same_volfile = _gf_false;
+    gf_boolean_t same_topology = _gf_false;
+
+    GF_VALIDATE_OR_GOTO("glusterd", this, out);
+
+    priv = this->private;
+    GF_VALIDATE_OR_GOTO(this->name, priv, out);
+
+    if (!glusterd_should_i_stop_bitd()) {
+        /*
+         * Compare the OLD and NEW volfiles by size and checksum. If they
+         * match, nothing has changed and there is nothing to do.
+         */
+        ret = glusterd_svc_check_volfile_identical(
+            priv->scrub_svc.name, build_scrub_graph, &same_volfile);
+        if (ret || same_volfile)
+            goto out;
+
+        /*
+         * The volfiles differ. Find out whether the topology changed or
+         * only the volume options did.
+         */
+        ret = glusterd_svc_check_topology_identical(
+            priv->scrub_svc.name, build_scrub_graph, &same_topology);
+        if (ret)
+            goto out;
+
+        if (same_topology) {
+            /*
+             * Options only: write them to the scrub volfile so that scrub
+             * gets reconfigured, and notify only if the write succeeded.
+             */
+            ret = glusterd_scrubsvc_create_volfile();
+            if (ret == 0)
+                ret = glusterd_fetchspec_notify(THIS);
+            goto out;
+        }
+    }
+
+    /*
+     * Either the stop condition holds or the volfile topology changed:
+     * hand over to the service manager.
+     */
+    ret = priv->scrub_svc.manager(&(priv->scrub_svc), NULL, PROC_START_NO_WAIT);
+
+out:
+    gf_msg_debug(this ? this->name : "glusterd", 0, "Returning %d", ret);
+    return ret;
+}
diff --git a/xlators/mgmt/glusterd/src/glusterd-scrub-svc.h b/xlators/mgmt/glusterd/src/glusterd-scrub-svc.h
new file mode 100644
index 00000000000..514b1de96a0
--- /dev/null
+++ b/xlators/mgmt/glusterd/src/glusterd-scrub-svc.h
@@ -0,0 +1,45 @@
+/*
+  Copyright (c) 2006-2012 Red Hat, Inc. <http://www.redhat.com>
+  This file is part of GlusterFS.
+
+  This file is licensed to you under your choice of the GNU Lesser
+  General Public License, version 3 or any later version (LGPLv3 or
+  later), or the GNU General Public License, version 2 (GPLv2), in all
+  cases as published by the Free Software Foundation.
+*/
+
+#ifndef _GLUSTERD_SCRUB_SVC_H_
+#define _GLUSTERD_SCRUB_SVC_H_
+
+#include "glusterd-svc-mgmt.h"
+
+typedef struct glusterd_scrubsvc_ glusterd_scrubsvc_t;
+
+struct glusterd_scrubsvc_ {
+    glusterd_svc_t svc;        /* embedded generic service object */
+    gf_store_handle_t *handle; /* NOTE(review): presumably a store handle for this service - confirm */
+};
+
+void
+glusterd_scrubsvc_build(glusterd_svc_t *svc);
+
+int
+glusterd_scrubsvc_init(glusterd_svc_t *svc);
+
+int
+glusterd_scrubsvc_manager(glusterd_svc_t *svc, void *data, int flags);
+
+int
+glusterd_scrubsvc_start(glusterd_svc_t *svc, int flags);
+
+int
+glusterd_scrubsvc_stop(glusterd_svc_t *svc, int sig);
+
+int
+glusterd_scrubsvc_reconfigure();
+
+void
+glusterd_scrubsvc_build_volfile_path(char *server, char *workdir, char *volfile,
+                                     size_t len);
+
+#endif
diff --git a/xlators/mgmt/glusterd/src/glusterd-server-quorum.c b/xlators/mgmt/glusterd/src/glusterd-server-quorum.c
new file mode 100644
index 00000000000..b0b8a2e4018
--- /dev/null
+++ b/xlators/mgmt/glusterd/src/glusterd-server-quorum.c
@@ -0,0 +1,486 @@
+/*
+   Copyright (c) 2015 Red Hat, Inc. <http://www.redhat.com>
+   This file is part of GlusterFS.
+
+   This file is licensed to you under your choice of the GNU Lesser
+   General Public License, version 3 or any later version (LGPLv3 or
+   later), or the GNU General Public License, version 2 (GPLv2), in all
+   cases as published by the Free Software Foundation.
+*/
+#include <glusterfs/common-utils.h>
+#include "glusterd.h"
+#include "glusterd-utils.h"
+#include "glusterd-messages.h"
+#include "glusterd-server-quorum.h"
+#include "glusterd-store.h"
+#include "glusterd-syncop.h"
+#include "glusterd-op-sm.h"
+/* Ceiling of X as an int; X is expanded (and so evaluated) more than once. */
+#define CEILING_POS(X) (((X) - (int)(X)) > 0 ? (int)((X) + 1) : (int)(X))
+
+/*
+ * Report whether @op is handled as a "get" operation: a volume status
+ * request, or "volume set help" / "help-xml" without a "key1" entry.
+ */
+static gf_boolean_t
+glusterd_is_get_op(xlator_t *this, glusterd_op_t op, dict_t *dict)
+{
+    char *first_key = NULL;
+    char *vol = NULL;
+
+    if (op == GD_OP_STATUS_VOLUME)
+        return _gf_true;
+    if (op != GD_OP_SET_VOLUME)
+        return _gf_false;
+
+    /* Return value unused: only whether vol was filled in matters. */
+    (void)dict_get_str(dict, "volname", &vol);
+    if (!vol || (strcmp(vol, "help") != 0 && strcmp(vol, "help-xml") != 0))
+        return _gf_false;
+
+    return (dict_get_str(dict, "key1", &first_key) < 0) ? _gf_true : _gf_false;
+}
+
+/*
+ * Decide whether server quorum has to be validated before running @op.
+ *
+ * Validation is skipped only for "get" operations and for volume set/reset
+ * requests whose option key names a quorum option; every other path,
+ * including failed lookups, leaves it required.
+ */
+gf_boolean_t
+glusterd_is_quorum_validation_required(xlator_t *this, glusterd_op_t op,
+                                       dict_t *dict)
+{
+    gf_boolean_t required = _gf_true;
+    char *dict_key = NULL;
+    char *opt = NULL;
+    char *opt_fixed = NULL;
+
+    if (glusterd_is_get_op(this, op, dict))
+        required = _gf_false;
+    else if (op == GD_OP_SET_VOLUME)
+        dict_key = "key1";
+    else if (op == GD_OP_RESET_VOLUME)
+        dict_key = "key";
+
+    /* A result <= 0 from the existence check keeps validation required. */
+    if (dict_key && !dict_get_str(dict, dict_key, &opt) &&
+        (glusterd_check_option_exists(opt, &opt_fixed) > 0) &&
+        glusterd_is_quorum_option(opt_fixed ? opt_fixed : opt))
+        required = _gf_false;
+
+    GF_FREE(opt_fixed);
+    return required;
+}
+
+/*
+ * Reject @op when its volume depends on server quorum and quorum is lost.
+ *
+ * Returns 0 when the operation may go ahead:
+ *  - quorum validation is not required for @op;
+ *  - "volname" or the matching volinfo cannot be looked up (logged, then
+ *    treated as success);
+ *  - the volume does not use server quorum, or quorum is currently met.
+ *
+ * Returns -1 otherwise, with *op_errstr pointing at a gf_strdup()'d copy
+ * of the error text.
+ */
+int
+glusterd_validate_quorum(xlator_t *this, glusterd_op_t op, dict_t *dict,
+                         char **op_errstr)
+{
+    char *errstr = "Quorum not met. Volume operation not allowed.";
+    char *name = NULL;
+    glusterd_volinfo_t *vol = NULL;
+
+    if (!glusterd_is_quorum_validation_required(this, op, dict))
+        return 0;
+
+    if (dict_get_str(dict, "volname", &name)) {
+        gf_smsg(this->name, GF_LOG_ERROR, errno, GD_MSG_DICT_GET_FAILED,
+                "Key=volname", NULL);
+        return 0;
+    }
+
+    if (glusterd_volinfo_find(name, &vol)) {
+        gf_smsg(this->name, GF_LOG_ERROR, errno, GD_MSG_VOLINFO_GET_FAIL, NULL);
+        return 0;
+    }
+
+    if (!glusterd_is_volume_in_server_quorum(vol))
+        return 0;
+
+    if (does_gd_meet_server_quorum(this))
+        return 0;
+
+    /* The volume uses server quorum and the cluster does not meet it. */
+    *op_errstr = gf_strdup(errstr);
+    return -1;
+}
+
+/*
+ * Return _gf_true when @option equals GLUSTERD_QUORUM_TYPE_KEY or
+ * GLUSTERD_QUORUM_RATIO_KEY. @option goes straight to strcmp(): not NULL.
+ */
+gf_boolean_t
+glusterd_is_quorum_option(char *option)
+{
+    static const char *const quorum_keys[] = {GLUSTERD_QUORUM_TYPE_KEY,
+                                              GLUSTERD_QUORUM_RATIO_KEY, NULL};
+    const char *const *cur = NULL;
+
+    for (cur = quorum_keys; *cur != NULL; cur++)
+        if (strcmp(option, *cur) == 0)
+            return _gf_true;
+    return _gf_false;
+}
+
+/*
+ * Tell whether setting @option to @value changes the quorum configuration
+ * stored in @options.
+ *
+ * Only "all" and the two quorum keys are examined; any other @option
+ * returns _gf_false. The answer is also _gf_false when a stored quorum
+ * type or ratio exists and equals @value, or when no old or new value is
+ * available at all. Everything else counts as a change.
+ * With "all", @value is compared against both stored settings.
+ */
+gf_boolean_t
+glusterd_is_quorum_changed(dict_t *options, char *option, char *value)
+{
+    xlator_t *this = THIS;
+    gf_boolean_t is_all = (strcmp("all", option) == 0) ? _gf_true : _gf_false;
+    char *old_type = NULL;
+    char *new_type = NULL;
+    char *old_ratio = NULL;
+    char *new_ratio = NULL;
+
+    if (!is_all && !glusterd_is_quorum_option(option))
+        return _gf_false;
+
+    if (is_all || (strcmp(GLUSTERD_QUORUM_TYPE_KEY, option) == 0)) {
+        new_type = value;
+        if (dict_get_str(options, GLUSTERD_QUORUM_TYPE_KEY, &old_type))
+            gf_msg(this->name, GF_LOG_DEBUG, 0, GD_MSG_DICT_GET_FAILED,
+                   "dict_get_str failed on %s", GLUSTERD_QUORUM_TYPE_KEY);
+    }
+
+    if (is_all || (strcmp(GLUSTERD_QUORUM_RATIO_KEY, option) == 0)) {
+        new_ratio = value;
+        if (dict_get_str(options, GLUSTERD_QUORUM_RATIO_KEY, &old_ratio))
+            gf_msg(this->name, GF_LOG_DEBUG, 0, GD_MSG_DICT_GET_FAILED,
+                   "dict_get_str failed on %s", GLUSTERD_QUORUM_RATIO_KEY);
+    }
+
+    /* Not a change if a stored setting already equals the new value... */
+    if (old_type && new_type && (strcmp(old_type, new_type) == 0))
+        return _gf_false;
+    if (old_ratio && new_ratio && (strcmp(old_ratio, new_ratio) == 0))
+        return _gf_false;
+
+    /* ...nor if there is nothing at all to compare. */
+    if (!old_type && !new_type && !old_ratio && !new_ratio)
+        return _gf_false;
+
+    return _gf_true;
+}
+
+/* A peer counts towards the quorum total when it is QUORUM_UP or QUORUM_DOWN. */
+static gf_boolean_t
+_is_contributing_to_quorum(gd_quorum_contrib_t contrib)
+{
+    return ((contrib != QUORUM_UP) && (contrib != QUORUM_DOWN)) ? _gf_false
+                                                                : _gf_true;
+}
+
+/* Quorum holds when the active count reaches (or exceeds) the quorum count. */
+gf_boolean_t does_quorum_meet(int active_count, int quorum_count)
+{
+    return !(active_count < quorum_count);
+}
+
+/*
+ * Count the nodes relevant to server quorum and derive the quorum threshold.
+ *
+ * The local node is always counted. *active_count (optional, may be NULL)
+ * receives 1 + the number of QUORUM_UP peers. *quorum_count (required,
+ * dereferenced unconditionally) receives:
+ *  - the ceiling of GLUSTERD_QUORUM_RATIO_KEY percent of the contributing
+ *    nodes, when that option is present in conf->opts and parses;
+ *  - otherwise half of the contributing nodes (integer division) plus one.
+ *
+ * @this->private is dereferenced without a check.
+ * Always returns 0.
+ */
+int
+glusterd_get_quorum_cluster_counts(xlator_t *this, int *active_count,
+                                   int *quorum_count)
+{
+    glusterd_conf_t *conf = this->private;
+    glusterd_peerinfo_t *peer = NULL;
+    char *ratio_str = NULL;
+    double percent = 0.0;
+    int total = 1;  /* nodes contributing to quorum, self included */
+    int active = 1; /* nodes that are up, self included */
+
+    RCU_READ_LOCK;
+    cds_list_for_each_entry_rcu(peer, &conf->peers, uuid_list)
+    {
+        if (_is_contributing_to_quorum(peer->quorum_contrib))
+            total++;
+        if (peer->quorum_contrib == QUORUM_UP)
+            active++;
+    }
+    RCU_READ_UNLOCK;
+
+    if (active_count)
+        *active_count = active;
+
+    /* Prefer the configured ratio; fall back to "more than half". */
+    if (!dict_get_str(conf->opts, GLUSTERD_QUORUM_RATIO_KEY, &ratio_str) &&
+        !gf_string2percent(ratio_str, &percent))
+        *quorum_count = CEILING_POS(total * percent / 100.0);
+    else
+        *quorum_count = (total * 50 / 100) + 1;
+
+    return 0;
+}
+
+/*
+ * Return _gf_true when @volinfo's GLUSTERD_QUORUM_TYPE_KEY option equals
+ * GLUSTERD_SERVER_QUORUM. A failed lookup is logged and yields _gf_false.
+ */
+gf_boolean_t
+glusterd_is_volume_in_server_quorum(glusterd_volinfo_t *volinfo)
+{
+    char *type = NULL;
+
+    if (dict_get_str(volinfo->dict, GLUSTERD_QUORUM_TYPE_KEY, &type)) {
+        gf_smsg(THIS->name, GF_LOG_ERROR, errno, GD_MSG_DICT_GET_FAILED,
+                "Key=%s", GLUSTERD_QUORUM_TYPE_KEY, NULL);
+        return _gf_false;
+    }
+
+    if (strcmp(type, GLUSTERD_SERVER_QUORUM) != 0)
+        return _gf_false;
+    return _gf_true;
+}
+
+/* Scan conf->volumes; _gf_true as soon as one volume uses server quorum. */
+gf_boolean_t
+glusterd_is_any_volume_in_server_quorum(xlator_t *this)
+{
+    glusterd_conf_t *conf = this->private;
+    glusterd_volinfo_t *vol = NULL;
+
+    list_for_each_entry(vol, &conf->volumes, vol_list)
+    {
+        if (!glusterd_is_volume_in_server_quorum(vol))
+            continue;
+        return _gf_true;
+    }
+    return _gf_false;
+}
+
+/*
+ * Return _gf_true when the number of active nodes reported by
+ * glusterd_get_quorum_cluster_counts() reaches the quorum count.
+ *
+ * Returns _gf_false otherwise, including when the counts cannot be
+ * obtained (that failure is logged).
+ */
+gf_boolean_t
+does_gd_meet_server_quorum(xlator_t *this)
+{
+    int active = 0;
+    int needed = 0;
+
+    if (glusterd_get_quorum_cluster_counts(this, &active, &needed)) {
+        gf_smsg(this->name, GF_LOG_ERROR, errno,
+                GD_MSG_QUORUM_CLUSTER_COUNT_GET_FAIL, NULL);
+        return _gf_false;
+    }
+
+    if (does_quorum_meet(active, needed))
+        return _gf_true;
+
+    return _gf_false;
+}
+
+/*
+ * Bring @volinfo's local bricks in line with the server-quorum state.
+ *
+ * A volume that is not started is marked NOT_APPLICABLE_QUORUM and left
+ * alone. Otherwise the new status is MEETS_QUORUM / DOESNT_MEET_QUORUM
+ * (from @meets_quorum) for volumes that use server quorum and
+ * NOT_APPLICABLE_QUORUM for the rest.
+ *
+ *  - Status unchanged: local bricks are only passed to
+ *    glusterd_brick_start() with its last argument set to _gf_true.
+ *  - Status changed: the transition is logged, local bricks are stopped
+ *    (quorum lost) or started unless a start was already triggered, the new
+ *    status is recorded, and volinfo is stored when quorum is met.
+ *
+ * Failures on individual bricks are logged and do not abort the loop.
+ */
+void
+glusterd_do_volume_quorum_action(xlator_t *this, glusterd_volinfo_t *volinfo,
+                                 gf_boolean_t meets_quorum)
+{
+    glusterd_brickinfo_t *brick = NULL;
+    gd_quorum_status_t new_status = NOT_APPLICABLE_QUORUM;
+    int ret = -1;
+
+    if (volinfo->status != GLUSTERD_STATUS_STARTED) {
+        volinfo->quorum_status = NOT_APPLICABLE_QUORUM;
+        return;
+    }
+
+    if (glusterd_is_volume_in_server_quorum(volinfo))
+        new_status = meets_quorum ? MEETS_QUORUM : DOESNT_MEET_QUORUM;
+
+    /*
+     * The following check prevents spurious brick starts when events occur
+     * that affect quorum.
+     * Example:
+     * There is a cluster of 10 peers. Volume is in quorum. User takes down
+     * one brick from the volume to perform maintenance. Suddenly one of the
+     * peers goes down. Cluster is still in quorum. But because of this
+     * 'peer going down' event, quorum is recalculated and the bricks that
+     * are down would be brought up again - including the brick that was
+     * taken down on purpose.
+     */
+    if (volinfo->quorum_status == new_status) {
+        list_for_each_entry(brick, &volinfo->bricks, brick_list)
+        {
+            if (!glusterd_is_local_brick(this, volinfo, brick))
+                continue;
+            ret = glusterd_brick_start(volinfo, brick, _gf_false, _gf_true);
+            if (ret) {
+                gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_BRICK_DISCONNECTED,
+                       "Failed to connect to %s:%s", brick->hostname,
+                       brick->path);
+            }
+        }
+        return;
+    }
+
+    if (new_status == MEETS_QUORUM) {
+        gf_msg(this->name, GF_LOG_CRITICAL, 0,
+               GD_MSG_SERVER_QUORUM_MET_STARTING_BRICKS,
+               "Server quorum regained for volume %s. Starting local "
+               "bricks.",
+               volinfo->volname);
+        gf_event(EVENT_QUORUM_REGAINED, "volume=%s", volinfo->volname);
+    } else if (new_status == DOESNT_MEET_QUORUM) {
+        gf_msg(this->name, GF_LOG_CRITICAL, 0,
+               GD_MSG_SERVER_QUORUM_LOST_STOPPING_BRICKS,
+               "Server quorum lost for volume %s. Stopping local "
+               "bricks.",
+               volinfo->volname);
+        gf_event(EVENT_QUORUM_LOST, "volume=%s", volinfo->volname);
+    }
+
+    list_for_each_entry(brick, &volinfo->bricks, brick_list)
+    {
+        if (!glusterd_is_local_brick(this, volinfo, brick))
+            continue;
+
+        if (new_status == DOESNT_MEET_QUORUM) {
+            ret = glusterd_brick_stop(volinfo, brick, _gf_false);
+            if (ret) {
+                gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_BRICK_STOP_FAIL,
+                       "Failed to stop brick %s:%s", brick->hostname,
+                       brick->path);
+            }
+            continue;
+        }
+
+        if (brick->start_triggered)
+            continue;
+
+        pthread_mutex_lock(&brick->restart_mutex);
+        {
+            /* coverity[SLEEP] */
+            ret = glusterd_brick_start(volinfo, brick, _gf_false, _gf_false);
+        }
+        pthread_mutex_unlock(&brick->restart_mutex);
+        if (ret) {
+            gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_BRICK_DISCONNECTED,
+                   "Failed to start %s:%s", brick->hostname, brick->path);
+        }
+    }
+
+    volinfo->quorum_status = new_status;
+    if (new_status != MEETS_QUORUM)
+        return;
+
+    /* Bricks may have been restarted, so their ports may have changed. */
+    ret = glusterd_store_volinfo(volinfo, GLUSTERD_VOLINFO_VER_AC_NONE);
+    if (ret) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_VOLINFO_STORE_FAIL,
+               "Failed to write volinfo for volume %s", volinfo->volname);
+    }
+}
+
+/*
+ * Re-evaluate server quorum and apply the per-volume quorum action to every
+ * volume in conf->volumes, under glusterd_lock(conf->uuid).
+ *
+ * conf->pending_quorum_action is raised before the lock is attempted and is
+ * cleared only after the lock has been taken and released again, so it
+ * stays set when the lock cannot be acquired.
+ *
+ * Returns glusterd_lock()'s error, or the result of
+ * glusterd_get_quorum_cluster_counts().
+ */
+int
+glusterd_do_quorum_action()
+{
+    xlator_t *this = THIS;
+    glusterd_conf_t *conf = this->private;
+    glusterd_volinfo_t *vol = NULL;
+    int active = 0;
+    int needed = 0;
+    int ret = 0;
+
+    conf->pending_quorum_action = _gf_true;
+    ret = glusterd_lock(conf->uuid);
+    if (ret)
+        return ret;
+
+    ret = glusterd_get_quorum_cluster_counts(this, &active, &needed);
+    if (ret == 0) {
+        gf_boolean_t meets = does_quorum_meet(active, needed) ? _gf_true
+                                                              : _gf_false;
+        list_for_each_entry(vol, &conf->volumes, vol_list)
+            glusterd_do_volume_quorum_action(this, vol, meets);
+    }
+
+    (void)glusterd_unlock(conf->uuid);
+    conf->pending_quorum_action = _gf_false;
+    return ret;
+}
+
+/*
+ * Classify @volinfo for a brick start, with @node_quorum (supplied by the
+ * caller) telling whether quorum is currently met.
+ *
+ * Return values:
+ *   0 - the volume uses server quorum and quorum is not met
+ *   1 - the volume uses server quorum and quorum is met
+ *   2 - the volume does not use server quorum (quorum not applicable)
+ */
+int
+check_quorum_for_brick_start(glusterd_volinfo_t *volinfo,
+                             gf_boolean_t node_quorum)
+{
+    gf_boolean_t uses_quorum = glusterd_is_volume_in_server_quorum(volinfo);
+
+    /* Quorum is not applicable to volumes that do not use it. */
+    if (!uses_quorum)
+        return 2;
+
+    return node_quorum ? 1 : 0;
+}
diff --git a/xlators/mgmt/glusterd/src/glusterd-server-quorum.h b/xlators/mgmt/glusterd/src/glusterd-server-quorum.h
new file mode 100644
index 00000000000..e11bf1a9206
--- /dev/null
+++ b/xlators/mgmt/glusterd/src/glusterd-server-quorum.h
@@ -0,0 +1,46 @@
+/*
+   Copyright (c) 2015 Red Hat, Inc. <http://www.redhat.com>
+   This file is part of GlusterFS.
+
+   This file is licensed to you under your choice of the GNU Lesser
+   General Public License, version 3 or any later version (LGPLv3 or
+   later), or the GNU General Public License, version 2 (GPLv2), in all
+   cases as published by the Free Software Foundation.
+*/
+#ifndef _GLUSTERD_SERVER_QUORUM_H
+#define _GLUSTERD_SERVER_QUORUM_H
+
+int
+glusterd_validate_quorum(xlator_t *this, glusterd_op_t op, dict_t *dict,
+                         char **op_errstr);
+
+gf_boolean_t
+glusterd_is_quorum_changed(dict_t *options, char *option, char *value);
+
+int
+glusterd_do_quorum_action();
+
+int
+glusterd_get_quorum_cluster_counts(xlator_t *this, int *active_count,
+                                   int *quorum_count);
+
+gf_boolean_t
+glusterd_is_quorum_option(char *option);
+
+gf_boolean_t
+glusterd_is_volume_in_server_quorum(glusterd_volinfo_t *volinfo);
+
+gf_boolean_t
+glusterd_is_any_volume_in_server_quorum(xlator_t *this);
+
+gf_boolean_t
+does_gd_meet_server_quorum(xlator_t *this);
+
+int
+check_quorum_for_brick_start(glusterd_volinfo_t *volinfo,
+                             gf_boolean_t node_quorum);
+
+gf_boolean_t
+does_quorum_meet(int active_count, int quorum_count);
+
+#endif /* _GLUSTERD_SERVER_QUORUM_H */
diff --git a/xlators/mgmt/glusterd/src/glusterd-shd-svc-helper.c b/xlators/mgmt/glusterd/src/glusterd-shd-svc-helper.c
new file mode 100644
index 00000000000..5661e391a9c
--- /dev/null
+++ b/xlators/mgmt/glusterd/src/glusterd-shd-svc-helper.c
@@ -0,0 +1,153 @@
+/*
+   Copyright (c) 2016 Red Hat, Inc. <http://www.redhat.com>
+   This file is part of GlusterFS.
+
+   This file is licensed to you under your choice of the GNU Lesser
+   General Public License, version 3 or any later version (LGPLv3 or
+   later), or the GNU General Public License, version 2 (GPLv2), in all
+   cases as published by the Free Software Foundation.
+*/
+
+#include "glusterd.h"
+#include "glusterd-utils.h"
+#include "glusterd-shd-svc-helper.h"
+#include "glusterd-messages.h"
+#include "glusterd-volgen.h"
+
+/*
+ * Format "<shd rundir>/run-<MY_UUID>" and pass it, together with @path and
+ * @path_len, to glusterd_set_socket_filepath(). An empty string is passed
+ * instead when snprintf() fails or the name does not fit in PATH_MAX.
+ * Returns without doing anything when THIS->private is NULL.
+ */
+void
+glusterd_svc_build_shd_socket_filepath(glusterd_volinfo_t *volinfo, char *path,
+                                       int path_len)
+{
+    glusterd_conf_t *priv = THIS->private;
+    char rundir[PATH_MAX] = {0};
+    char raw[PATH_MAX] = {0};
+    int32_t written = 0;
+
+    if (priv == NULL)
+        return;
+
+    GLUSTERD_GET_SHD_RUNDIR(rundir, volinfo, priv);
+    written = snprintf(raw, sizeof(raw), "%s/run-%s", rundir,
+                       uuid_utoa(MY_UUID));
+    if ((written < 0) || (written >= sizeof(raw)))
+        raw[0] = 0;
+    glusterd_set_socket_filepath(raw, path, path_len);
+}
+
+/*
+ * Write "<shd rundir>/<volname>-shd.pid" into @path (at most @path_len
+ * bytes, truncation is not reported). No-op when THIS->private is NULL.
+ */
+void
+glusterd_svc_build_shd_pidfile(glusterd_volinfo_t *volinfo, char *path,
+                               int path_len)
+{
+    glusterd_conf_t *priv = THIS->private;
+    char rundir[PATH_MAX] = {0};
+
+    if (priv != NULL) {
+        GLUSTERD_GET_SHD_RUNDIR(rundir, volinfo, priv);
+        snprintf(path, path_len, "%s/%s-shd.pid", rundir, volinfo->volname);
+    }
+}
+
+/*
+ * Write "<volume dir>/<volname>-shd.vol" into @path (at most @path_len
+ * bytes, truncation is not reported). No-op when THIS->private is NULL.
+ */
+void
+glusterd_svc_build_shd_volfile_path(glusterd_volinfo_t *volinfo, char *path,
+                                    int path_len)
+{
+    glusterd_conf_t *priv = THIS->private;
+    char workdir[PATH_MAX] = {0};
+
+    if (priv != NULL) {
+        GLUSTERD_GET_VOLUME_DIR(workdir, volinfo, priv);
+        snprintf(path, path_len, "%s/%s-shd.vol", workdir, volinfo->volname);
+    }
+}
+
+/*
+ * Detach @shd from its service process: drop the service's rpc reference,
+ * reset svc_proc/inited, unlink it from the mux list and unlink its
+ * pidfile. A process left without services is taken off the process list
+ * and its rpc is unref'd after conf->attach_lock has been released.
+ */
+void
+glusterd_shd_svcproc_cleanup(glusterd_shdsvc_t *shd)
+{
+    glusterd_conf_t *conf = THIS->private;
+    glusterd_svc_proc_t *proc = NULL;
+    glusterd_svc_t *svc = NULL;
+    rpc_clnt_t *proc_rpc = NULL;
+
+    if (!conf)
+        return;
+    GF_VALIDATE_OR_GOTO(THIS->name, shd, out);
+
+    svc = &shd->svc;
+    shd->attached = _gf_false;
+
+    if (svc->conn.rpc) {
+        rpc_clnt_unref(svc->conn.rpc);
+        svc->conn.rpc = NULL;
+    }
+
+    pthread_mutex_lock(&conf->attach_lock);
+    {
+        proc = svc->svc_proc;
+        svc->svc_proc = NULL;
+        svc->inited = _gf_false;
+        cds_list_del_init(&svc->mux_svc);
+        glusterd_unlink_file(svc->proc.pidfile);
+
+        if (proc && cds_list_empty(&proc->svcs)) {
+            cds_list_del_init(&proc->svc_proc_list);
+            /*
+             * svc_proc itself cannot be freed here: events still pending
+             * on the rpc would access it. Only the rpc reference is taken
+             * over; cleanup happens from notify on RPC_CLNT_DESTROY.
+             */
+            proc_rpc = proc->rpc;
+            proc->rpc = NULL;
+        }
+    }
+    pthread_mutex_unlock(&conf->attach_lock);
+    /* The rpc unref has to be performed outside the lock. */
+    if (proc_rpc)
+        rpc_clnt_unref(proc_rpc);
+out:
+    return;
+}
+
+/*
+ * Store @volinfo's shd pidfile path in @dict under "pidfile", using
+ * dict_set_dynstr_with_alloc(). Returns 0 on success, the setter's result
+ * on failure (logged), or -1 when THIS, @volinfo or @dict fails validation.
+ */
+int
+glusterd_svc_set_shd_pidfile(glusterd_volinfo_t *volinfo, dict_t *dict)
+{
+    int ret = -1;
+    xlator_t *this = THIS;
+    char *pidfile = NULL;
+
+    GF_VALIDATE_OR_GOTO("glusterd", this, out);
+    GF_VALIDATE_OR_GOTO(this->name, volinfo, out);
+    GF_VALIDATE_OR_GOTO(this->name, dict, out);
+
+    pidfile = volinfo->shd.svc.proc.pidfile;
+    ret = dict_set_dynstr_with_alloc(dict, "pidfile", pidfile);
+    if (ret)
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_DICT_SET_FAILED,
+               "Failed to set pidfile %s in dict", pidfile);
+out:
+    return ret;
+}
diff --git a/xlators/mgmt/glusterd/src/glusterd-shd-svc-helper.h b/xlators/mgmt/glusterd/src/glusterd-shd-svc-helper.h
new file mode 100644
index 00000000000..1f0984ba857
--- /dev/null
+++ b/xlators/mgmt/glusterd/src/glusterd-shd-svc-helper.h
@@ -0,0 +1,42 @@
+/*
+   Copyright (c) 2016 Red Hat, Inc. <http://www.redhat.com>
+   This file is part of GlusterFS.
+
+   This file is licensed to you under your choice of the GNU Lesser
+   General Public License, version 3 or any later version (LGPLv3 or
+   later), or the GNU General Public License, version 2 (GPLv2), in all
+   cases as published by the Free Software Foundation.
+*/
+
+#ifndef _GLUSTERD_SHD_SVC_HELPER_H_
+#define _GLUSTERD_SHD_SVC_HELPER_H_
+
+#include "glusterd.h"
+#include "glusterd-svc-mgmt.h"
+
+void
+glusterd_svc_build_shd_socket_filepath(glusterd_volinfo_t *volinfo, char *path,
+                                       int path_len);
+
+void
+glusterd_svc_build_shd_pidfile(glusterd_volinfo_t *volinfo, char *path,
+                               int path_len);
+
+void
+glusterd_svc_build_shd_volfile_path(glusterd_volinfo_t *volinfo, char *path,
+                                    int path_len);
+
+void
+glusterd_shd_svcproc_cleanup(glusterd_shdsvc_t *shd);
+
+int
+glusterd_recover_shd_attach_failure(glusterd_volinfo_t *volinfo,
+                                    glusterd_svc_t *svc, int flags);
+
+int
+glusterd_shdsvc_create_volfile(glusterd_volinfo_t *volinfo);
+
+int
+glusterd_svc_set_shd_pidfile(glusterd_volinfo_t *volinfo, dict_t *dict);
+
+#endif
diff --git a/xlators/mgmt/glusterd/src/glusterd-shd-svc.c b/xlators/mgmt/glusterd/src/glusterd-shd-svc.c
new file mode 100644
index 00000000000..1c56384a14b
--- /dev/null
+++ b/xlators/mgmt/glusterd/src/glusterd-shd-svc.c
@@ -0,0 +1,796 @@
+/*
+   Copyright (c) 2014 Red Hat, Inc. <http://www.redhat.com>
+   This file is part of GlusterFS.
+
+   This file is licensed to you under your choice of the GNU Lesser
+   General Public License, version 3 or any later version (LGPLv3 or
+   later), or the GNU General Public License, version 2 (GPLv2), in all
+   cases as published by the Free Software Foundation.
+*/
+
+#include <glusterfs/globals.h>
+#include <glusterfs/run.h>
+#include "glusterd.h"
+#include "glusterd-utils.h"
+#include "glusterd-volgen.h"
+#include "glusterd-shd-svc.h"
+#include "glusterd-shd-svc-helper.h"
+#include "glusterd-svc-helper.h"
+#include "glusterd-store.h"
+
+#define GD_SHD_PROCESS_NAME "--process-name" /* option preceding svc->name */
+char *shd_svc_name = "glustershd"; /* copied into each shd service's name */
+
+void
+glusterd_shdsvc_build(glusterd_svc_t *svc)
+{
+    /* Name the service; nothing else is set up if formatting fails. */
+    if (snprintf(svc->name, sizeof(svc->name), "%s", shd_svc_name) < 0)
+        return;
+
+    /* mux_svc is the node that links svc into a process's svcs list. */
+    CDS_INIT_LIST_HEAD(&svc->mux_svc);
+    svc->reconfigure = glusterd_shdsvc_reconfigure;
+    svc->stop = glusterd_shdsvc_stop;
+    svc->start = glusterd_shdsvc_start;
+    svc->manager = glusterd_shdsvc_manager;
+}
+
+/*
+ * Initialize the per-volume self-heal daemon service stored in
+ * volinfo->shd.svc: its name, connection and process bookkeeping.
+ *
+ * data     - the glusterd_volinfo_t that owns the service.
+ * mux_conn - connection of an existing shd process to share, or NULL.
+ * mux_svc  - process object the service is being attached to (required).
+ *
+ * When mux_conn is given and mux_svc already holds an rpc, that rpc and
+ * socket path are reused; otherwise a fresh connection is set up.
+ * Returns 0 on success and a non-zero value on failure.
+ */
+int
+glusterd_shdsvc_init(void *data, glusterd_conn_t *mux_conn,
+                     glusterd_svc_proc_t *mux_svc)
+{
+    int ret = -1;
+    char rundir[PATH_MAX] = {0};
+    char sockpath[PATH_MAX] = {0};
+    char pidfile[PATH_MAX] = {0};
+    char volfile[PATH_MAX] = {0};
+    char logdir[PATH_MAX] = {0}; /* never filled in; handed on empty */
+    char logfile[PATH_MAX] = {0};
+    char volfileid[256] = {0};
+    glusterd_svc_t *svc = NULL;
+    glusterd_volinfo_t *volinfo = NULL;
+    glusterd_conf_t *priv = NULL;
+    glusterd_muxsvc_conn_notify_t notify = NULL;
+    xlator_t *this = NULL;
+    char *volfileserver = NULL;
+    int32_t len = 0;
+
+    this = THIS;
+    GF_VALIDATE_OR_GOTO("glusterd", this, out);
+
+    priv = this->private;
+    GF_VALIDATE_OR_GOTO(this->name, priv, out);
+
+    volinfo = data;
+    GF_VALIDATE_OR_GOTO(this->name, data, out);
+    GF_VALIDATE_OR_GOTO(this->name, mux_svc, out);
+
+    svc = &(volinfo->shd.svc);
+
+    ret = snprintf(svc->name, sizeof(svc->name), "%s", shd_svc_name);
+    if (ret < 0)
+        goto out;
+
+    notify = glusterd_muxsvc_common_rpc_notify;
+    glusterd_store_perform_node_state_store(volinfo);
+
+    GLUSTERD_GET_SHD_RUNDIR(rundir, volinfo, priv);
+    glusterd_svc_create_rundir(rundir);
+
+    glusterd_svc_build_logfile_path(shd_svc_name, priv->logdir, logfile,
+                                    sizeof(logfile));
+
+    /* Initialize the connection mgmt */
+    if (mux_conn && mux_svc->rpc) {
+        /* multiplexed svc */
+        svc->conn.frame_timeout = mux_conn->frame_timeout;
+        /* This will be unrefed from glusterd_shd_svcproc_cleanup*/
+        svc->conn.rpc = rpc_clnt_ref(mux_svc->rpc);
+        ret = snprintf(svc->conn.sockpath, sizeof(svc->conn.sockpath), "%s",
+                       mux_conn->sockpath);
+        if (ret < 0)
+            goto out;
+    } else {
+        ret = mkdir_p(priv->logdir, 0755, _gf_true);
+        if ((ret == -1) && (EEXIST != errno)) {
+            gf_msg(this->name, GF_LOG_ERROR, errno, GD_MSG_CREATE_DIR_FAILED,
+                   "Unable to create logdir %s", priv->logdir);
+            goto out;
+        }
+
+        glusterd_svc_build_shd_socket_filepath(volinfo, sockpath,
+                                               sizeof(sockpath));
+        ret = glusterd_muxsvc_conn_init(&(svc->conn), mux_svc, sockpath, 600,
+                                        notify);
+        if (ret)
+            goto out;
+        /* This will be unrefed when the last svcs is detached from the list */
+        if (!mux_svc->rpc)
+            mux_svc->rpc = rpc_clnt_ref(svc->conn.rpc);
+    }
+
+    /* Initialize the process mgmt */
+    glusterd_svc_build_shd_pidfile(volinfo, pidfile, sizeof(pidfile));
+    glusterd_svc_build_shd_volfile_path(volinfo, volfile, PATH_MAX);
+    len = snprintf(volfileid, sizeof(volfileid), "shd/%s", volinfo->volname);
+    if ((len < 0) || (len >= sizeof(volfileid))) {
+        ret = -1;
+        goto out;
+    }
+
+    if (dict_get_strn(this->options, "transport.socket.bind-address",
+                      SLEN("transport.socket.bind-address"),
+                      &volfileserver) != 0) {
+        volfileserver = "localhost";
+    }
+    ret = glusterd_proc_init(&(svc->proc), shd_svc_name, pidfile, logdir,
+                             logfile, volfile, volfileid, volfileserver);
+    if (ret)
+        goto out;
+
+out:
+    gf_msg_debug(this ? this->name : "glusterd", 0, "Returning %d", ret);
+    return ret;
+}
+
+/*
+ * Generate the self-heal daemon volfile for volinfo. When the volume is not
+ * shd compatible, a volfile left at that path is unlinked instead.
+ * Returns 0 on success and non-zero on failure.
+ */
+int
+glusterd_shdsvc_create_volfile(glusterd_volinfo_t *volinfo)
+{
+    int ret = -1;
+    xlator_t *this = THIS;
+    dict_t *overrides = NULL;
+    char volfile_path[PATH_MAX] = {0};
+
+    GF_ASSERT(this);
+    glusterd_svc_build_shd_volfile_path(volinfo, volfile_path, PATH_MAX);
+    if (!glusterd_is_shd_compatible_volume(volinfo)) {
+        /* A volfile can be left behind when a replica/ec volume is
+         * changed to distribute; remove it. */
+        (void)glusterd_unlink_file(volfile_path);
+        ret = 0;
+        goto out;
+    }
+    overrides = dict_new();
+    if (overrides == NULL) {
+        gf_smsg(this->name, GF_LOG_ERROR, errno, GD_MSG_DICT_CREATE_FAIL, NULL);
+        goto out;
+    }
+
+    ret = dict_set_uint32(overrides, "cluster.background-self-heal-count", 0);
+    if (ret != 0) {
+        gf_smsg(this->name, GF_LOG_ERROR, errno, GD_MSG_DICT_SET_FAILED,
+                "Key=cluster.background-self-heal-count", NULL);
+        goto out;
+    }
+
+    ret = dict_set_str(overrides, "cluster.data-self-heal", "on");
+    if (ret != 0) {
+        gf_smsg(this->name, GF_LOG_ERROR, errno, GD_MSG_DICT_SET_FAILED,
+                "Key=cluster.data-self-heal", NULL);
+        goto out;
+    }
+
+    ret = dict_set_str(overrides, "cluster.metadata-self-heal", "on");
+    if (ret != 0) {
+        gf_smsg(this->name, GF_LOG_ERROR, errno, GD_MSG_DICT_SET_FAILED,
+                "Key=cluster.metadata-self-heal", NULL);
+        goto out;
+    }
+
+    ret = dict_set_str(overrides, "cluster.entry-self-heal", "on");
+    if (ret != 0) {
+        gf_smsg(this->name, GF_LOG_ERROR, errno, GD_MSG_DICT_SET_FAILED,
+                "Key=cluster.entry-self-heal", NULL);
+        goto out;
+    }
+
+    ret = glusterd_shdsvc_generate_volfile(volinfo, volfile_path, overrides);
+    if (ret != 0) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_VOLFILE_CREATE_FAIL,
+               "Failed to create volfile");
+        goto out;
+    }
+
+out:
+    if (overrides != NULL)
+        dict_unref(overrides);
+    gf_msg_debug(this->name, 0, "Returning %d", ret);
+    return ret;
+}
+
+/*
+ * Report whether every shd compatible volume multiplexed on svc's
+ * self-heal daemon process is stopped: _gf_true when none of them is in
+ * GLUSTERD_STATUS_STARTED, _gf_false otherwise (also when svc has no
+ * process attached).
+ */
+gf_boolean_t
+glusterd_svcs_shd_compatible_volumes_stopped(glusterd_svc_t *svc)
+{
+    glusterd_svc_proc_t *svc_proc = NULL;
+    glusterd_shdsvc_t *shd = NULL;
+    glusterd_svc_t *temp_svc = NULL;
+    glusterd_volinfo_t *volinfo = NULL;
+    gf_boolean_t comp = _gf_false;
+    glusterd_conf_t *conf = THIS->private;
+
+    GF_VALIDATE_OR_GOTO("glusterd", conf, out);
+    GF_VALIDATE_OR_GOTO("glusterd", svc, out);
+    pthread_mutex_lock(&conf->attach_lock);
+    svc_proc = svc->svc_proc;
+    if (!svc_proc)
+        goto unlock;
+    cds_list_for_each_entry(temp_svc, &svc_proc->svcs, mux_svc)
+    {
+        /* Resolve the listed service (temp_svc), not the caller's svc. */
+        shd = cds_list_entry(temp_svc, glusterd_shdsvc_t, svc);
+        if (!shd) {
+            gf_msg("glusterd", GF_LOG_ERROR, 0, GD_MSG_SHD_OBJ_GET_FAIL,
+                   "Failed to get shd object from shd service");
+            goto unlock;
+        }
+        volinfo = cds_list_entry(shd, glusterd_volinfo_t, shd);
+        if (!volinfo) {
+            gf_msg("glusterd", GF_LOG_ERROR, 0, GD_MSG_VOLINFO_GET_FAIL,
+                   "Failed to get volinfo from from shd");
+            goto unlock;
+        }
+        if (!glusterd_is_shd_compatible_volume(volinfo))
+            continue;
+        if (volinfo->status == GLUSTERD_STATUS_STARTED)
+            goto unlock;
+    }
+    comp = _gf_true;
+unlock:
+    pthread_mutex_unlock(&conf->attach_lock);
+out:
+    return comp;
+}
+
+/*
+ * Bring the self-heal daemon service of one volume in line with the
+ * volume's current state. data is the owning glusterd_volinfo_t and flags
+ * is passed through to svc->start().
+ *
+ * Snapshot volumes are skipped. Callers are serialized on conf->restart_shd.
+ * Returns 0 on success and non-zero on failure.
+ */
+int
+glusterd_shdsvc_manager(glusterd_svc_t *svc, void *data, int flags)
+{
+    int ret = -1;
+    glusterd_volinfo_t *volinfo = NULL;
+    glusterd_conf_t *conf = NULL;
+    gf_boolean_t shd_restart = _gf_false;
+
+    conf = THIS->private;
+    GF_VALIDATE_OR_GOTO("glusterd", conf, out);
+    GF_VALIDATE_OR_GOTO("glusterd", svc, out);
+    volinfo = data;
+    GF_VALIDATE_OR_GOTO("glusterd", volinfo, out);
+    /* Taken before any "goto out" that leaves volinfo set: "out" unrefs. */
+    glusterd_volinfo_ref(volinfo);
+
+    if (volinfo->is_snap_volume) {
+        /* healing of a snap volume is not supported yet*/
+        ret = 0;
+        goto out;
+    }
+
+    while (conf->restart_shd) {
+        synccond_wait(&conf->cond_restart_shd, &conf->big_lock);
+    }
+    conf->restart_shd = _gf_true;
+    shd_restart = _gf_true;
+
+    if (!glusterd_is_shd_compatible_volume(volinfo)) {
+        ret = 0;
+        if (svc->inited) {
+            /* The service was initialized for this volume, which is no
+             * longer shd compatible. So just stop the shd.
+             */
+            ret = svc->stop(svc, SIGTERM);
+        }
+        goto out;
+    }
+    ret = glusterd_shdsvc_create_volfile(volinfo);
+    if (ret)
+        goto out;
+
+    ret = glusterd_shd_svc_mux_init(volinfo, svc);
+    if (ret) {
+        gf_msg(THIS->name, GF_LOG_ERROR, 0, GD_MSG_FAILED_INIT_SHDSVC,
+               "Failed to init shd service");
+        goto out;
+    }
+
+    /* Stop the service when every shd compatible volume on its process is
+     * stopped; otherwise stop or start it according to this volume's status.
+     */
+    if (glusterd_svcs_shd_compatible_volumes_stopped(svc)) {
+        /* TODO: take a lock and detach all svc's to stop the process,
+         * also reset the init flag */
+        ret = svc->stop(svc, SIGTERM);
+    } else if (volinfo) {
+        if (volinfo->status != GLUSTERD_STATUS_STARTED) {
+            ret = svc->stop(svc, SIGTERM);
+            if (ret)
+                goto out;
+        }
+        if (volinfo->status == GLUSTERD_STATUS_STARTED) {
+            ret = svc->start(svc, flags);
+            if (ret)
+                goto out;
+        }
+    }
+out:
+    if (shd_restart) {
+        conf->restart_shd = _gf_false;
+        synccond_broadcast(&conf->cond_restart_shd);
+    }
+    if (volinfo)
+        glusterd_volinfo_unref(volinfo);
+    if (ret) /* svc is NULL when its own validation above failed */
+        gf_event(EVENT_SVC_MANAGER_FAILED, "svc_name=%s",
+                 svc ? svc->name : shd_svc_name);
+    gf_msg_debug(THIS->name, 0, "Returning %d", ret);
+    return ret;
+}
+
+/* Start svc via glusterd_svc_start() with the self-heal daemon arguments and
+ * then connect to it. Returns 0 on success and non-zero on failure. */
+int
+glusterd_new_shd_svc_start(glusterd_svc_t *svc, int flags)
+{
+    xlator_t *this = THIS;
+    dict_t *args = NULL;
+    char pid_opt[32] = {0};
+    char uuid_opt[PATH_MAX] = {0};
+    int ret = -1;
+    GF_ASSERT(this);
+
+    args = dict_new();
+    if (args == NULL)
+        goto out;
+
+    ret = snprintf(uuid_opt, sizeof(uuid_opt), "*replicate*.node-uuid=%s",
+                   uuid_utoa(MY_UUID));
+    if (ret < 0)
+        goto out;
+
+    ret = snprintf(pid_opt, sizeof(pid_opt), "--client-pid=%d",
+                   GF_CLIENT_PID_SELF_HEALD);
+    if (ret < 0)
+        goto out;
+
+    ret = dict_set_str(args, "arg", pid_opt);
+    if (ret < 0) {
+        gf_smsg(this->name, GF_LOG_ERROR, errno, GD_MSG_DICT_SET_FAILED,
+                "Key=arg", NULL);
+        goto out;
+    }
+
+    /* The keys only carry the values and are not used. Since the dictionary
+     * follows LIFO, arg4 down to arg1 are stored in reverse order. */
+    ret = dict_set_str(args, "arg4", svc->name);
+    if (ret != 0) {
+        gf_smsg(this->name, GF_LOG_ERROR, errno, GD_MSG_DICT_SET_FAILED,
+                "Key=arg4", NULL);
+        goto out;
+    }
+
+    ret = dict_set_str(args, "arg3", GD_SHD_PROCESS_NAME);
+    if (ret != 0) {
+        gf_smsg(this->name, GF_LOG_ERROR, errno, GD_MSG_DICT_SET_FAILED,
+                "Key=arg3", NULL);
+        goto out;
+    }
+
+    ret = dict_set_str(args, "arg2", uuid_opt);
+    if (ret != 0) {
+        gf_smsg(this->name, GF_LOG_ERROR, errno, GD_MSG_DICT_SET_FAILED,
+                "Key=arg2", NULL);
+        goto out;
+    }
+
+    ret = dict_set_str(args, "arg1", "--xlator-option");
+    if (ret != 0) {
+        gf_smsg(this->name, GF_LOG_ERROR, errno, GD_MSG_DICT_SET_FAILED,
+                "Key=arg1", NULL);
+        goto out;
+    }
+
+    ret = glusterd_svc_start(svc, flags, args);
+    if (ret != 0) {
+        gf_smsg(this->name, GF_LOG_ERROR, errno,
+                GD_MSG_GLUSTER_SERVICE_START_FAIL, NULL);
+        goto out;
+    }
+    ret = glusterd_conn_connect(&(svc->conn));
+out:
+    if (args != NULL)
+        dict_unref(args);
+    return ret;
+}
+
+/*
+ * Move svc onto a newly created shd process object and start a daemon for
+ * it: the volume's current shd state is cleaned up, the service is
+ * re-initialized against the new process and both are registered under
+ * conf->attach_lock. Returns 0 on success and non-zero on failure.
+ */
+int
+glusterd_recover_shd_attach_failure(glusterd_volinfo_t *volinfo,
+                                    glusterd_svc_t *svc, int flags)
+{
+    glusterd_conf_t *conf = THIS->private;
+    glusterd_svc_proc_t *new_proc = NULL;
+    int ret = -1;
+
+    if (!conf || !volinfo || !svc)
+        return -1;
+    glusterd_shd_svcproc_cleanup(&volinfo->shd);
+    new_proc = glusterd_svcprocess_new();
+    if (new_proc == NULL)
+        return -1;
+    /* NOTE(review): new_proc is not released when init fails - confirm. */
+    if (glusterd_shdsvc_init(volinfo, NULL, new_proc) != 0)
+        return -1;
+
+    pthread_mutex_lock(&conf->attach_lock);
+    cds_list_add_tail(&new_proc->svc_proc_list, &conf->shd_procs);
+    svc->svc_proc = new_proc;
+    cds_list_del_init(&svc->mux_svc);
+    cds_list_add_tail(&svc->mux_svc, &new_proc->svcs);
+    pthread_mutex_unlock(&conf->attach_lock);
+    ret = glusterd_new_shd_svc_start(svc, flags);
+    if (ret == 0)
+        volinfo->shd.attached = _gf_true;
+    return ret;
+}
+
+/*
+ * Start the shd service of the volume embedding svc: attach to an existing
+ * process when shd->attached is set, otherwise start a new one.
+ */
+int
+glusterd_shdsvc_start(glusterd_svc_t *svc, int flags)
+{
+    int ret = -1;
+    glusterd_shdsvc_t *shd = NULL;
+    glusterd_volinfo_t *volinfo = NULL;
+    glusterd_conf_t *conf = NULL;
+
+    GF_VALIDATE_OR_GOTO("glusterd", svc, out);
+    conf = THIS->private;
+    GF_VALIDATE_OR_GOTO("glusterd", conf, out);
+
+    /* Get volinfo->shd from svc object */
+    shd = cds_list_entry(svc, glusterd_shdsvc_t, svc);
+    if (!shd) {
+        gf_msg("glusterd", GF_LOG_ERROR, 0, GD_MSG_SHD_OBJ_GET_FAIL,
+               "Failed to get shd object "
+               "from shd service");
+        return -1;
+    }
+    /* Get volinfo from shd */
+    volinfo = cds_list_entry(shd, glusterd_volinfo_t, shd);
+    if (!volinfo) {
+        gf_msg("glusterd", GF_LOG_ERROR, 0, GD_MSG_VOLINFO_GET_FAIL,
+               "Failed to get volinfo from "
+               "from shd");
+        return -1;
+    }
+    if (volinfo->status != GLUSTERD_STATUS_STARTED)
+        return -1;
+    /* Hold volinfo for this call; the matching unref is at "out". */
+    glusterd_volinfo_ref(volinfo);
+    if (!svc->inited) {
+        ret = glusterd_shd_svc_mux_init(volinfo, svc);
+        if (ret)
+            goto out;
+    }
+    /* NOTE(review): a failed attach drops one of the two refs held - confirm. */
+    if (shd->attached) {
+        glusterd_volinfo_ref(volinfo);
+        /* Unref will happen from glusterd_svc_attach_cbk */
+        ret = glusterd_attach_svc(svc, volinfo, flags);
+        if (ret) {
+            gf_msg("glusterd", GF_LOG_ERROR, 0, GD_MSG_VOLINFO_GET_FAIL,
+                   "Failed to attach shd svc(volume=%s) to pid=%d",
+                   volinfo->volname, glusterd_proc_get_pid(&svc->proc));
+            glusterd_shd_svcproc_cleanup(&volinfo->shd);
+            glusterd_volinfo_unref(volinfo);
+            goto out1; /* cleanup already done here; skip "out" */
+        }
+        goto out;
+    }
+    ret = glusterd_new_shd_svc_start(svc, flags);
+    if (!ret) {
+        shd->attached = _gf_true;
+    }
+out:
+    if (ret && volinfo)
+        glusterd_shd_svcproc_cleanup(&volinfo->shd);
+    if (volinfo)
+        glusterd_volinfo_unref(volinfo);
+out1:
+    gf_msg_debug(THIS->name, 0, "Returning %d", ret);
+    return ret;
+}
+
+/*
+ * Reconfigure the self-heal daemon of volinfo after a configuration change.
+ *
+ * Nothing is done when the regenerated volfile is identical to the current
+ * one. When only options differ, the volfile is rewritten and
+ * glusterd_fetchspec_notify() is called; otherwise the service manager is
+ * run. Returns 0 on success and non-zero on failure.
+ */
+int
+glusterd_shdsvc_reconfigure(glusterd_volinfo_t *volinfo)
+{
+    glusterd_svc_t *svc = NULL;
+    dict_t *overrides = NULL;
+    gf_boolean_t same = _gf_false;
+    xlator_t *this = THIS;
+    int ret = -1;
+
+    GF_VALIDATE_OR_GOTO("glusterd", this, out);
+
+    if (volinfo == NULL) {
+        /* reconfigure will be called separately*/
+        ret = 0;
+        goto out;
+    }
+
+    glusterd_volinfo_ref(volinfo);
+    svc = &(volinfo->shd.svc);
+    if (glusterd_svcs_shd_compatible_volumes_stopped(svc))
+        goto manager;
+
+    if (!glusterd_is_shd_compatible_volume(volinfo)) {
+        if (svc->inited)
+            goto manager;
+
+        /* Nothing to do if not shd compatible */
+        ret = 0;
+        goto out;
+    }
+
+    overrides = dict_new();
+    if (overrides == NULL) {
+        gf_smsg(this->name, GF_LOG_ERROR, errno, GD_MSG_DICT_CREATE_FAIL, NULL);
+        goto out;
+    }
+
+    ret = dict_set_uint32(overrides, "cluster.background-self-heal-count", 0);
+    if (ret != 0) {
+        gf_smsg(this->name, GF_LOG_ERROR, errno, GD_MSG_DICT_SET_FAILED,
+                "Key=cluster.background-self-heal-count", NULL);
+        goto out;
+    }
+
+    ret = dict_set_str(overrides, "cluster.data-self-heal", "on");
+    if (ret != 0) {
+        gf_smsg(this->name, GF_LOG_ERROR, errno, GD_MSG_DICT_SET_FAILED,
+                "Key=cluster.data-self-heal", NULL);
+        goto out;
+    }
+
+    ret = dict_set_str(overrides, "cluster.metadata-self-heal", "on");
+    if (ret != 0) {
+        gf_smsg(this->name, GF_LOG_ERROR, errno, GD_MSG_DICT_SET_FAILED,
+                "Key=cluster.metadata-self-heal", NULL);
+        goto out;
+    }
+
+    ret = dict_set_int32(overrides, "graph-check", 1);
+    if (ret != 0) {
+        gf_smsg(this->name, GF_LOG_ERROR, errno, GD_MSG_DICT_SET_FAILED,
+                "Key=graph-check", NULL);
+        goto out;
+    }
+
+    ret = dict_set_str(overrides, "cluster.entry-self-heal", "on");
+    if (ret != 0) {
+        gf_smsg(this->name, GF_LOG_ERROR, errno, GD_MSG_DICT_SET_FAILED,
+                "Key=cluster.entry-self-heal", NULL);
+        goto out;
+    }
+
+    /*
+     * If the old and the new volfile are the same by size and cksum,
+     * nothing has been changed: just return.
+     */
+    ret = glusterd_volume_svc_check_volfile_identical(
+        "glustershd", overrides, volinfo, glusterd_shdsvc_generate_volfile,
+        &same);
+    if (ret != 0)
+        goto out;
+
+    if (same) {
+        ret = 0;
+        goto out;
+    }
+
+    /*
+     * They differ. Find out whether the topology changed or just the
+     * volume options.
+     */
+    same = _gf_false;
+    ret = glusterd_volume_svc_check_topology_identical(
+        "glustershd", overrides, volinfo, glusterd_shdsvc_generate_volfile,
+        &same);
+    if (ret != 0)
+        goto out;
+
+    if (same) {
+        /* Only options changed: rewrite the shd volfile so that the shd
+         * gets reconfigured. */
+        ret = glusterd_shdsvc_create_volfile(volinfo);
+        if (ret == 0)
+            ret = glusterd_fetchspec_notify(THIS);
+        goto out;
+    }
+manager:
+    /* Let the service manager act on the changed state. */
+    ret = svc->manager(svc, volinfo, PROC_START_NO_WAIT);
+
+out:
+    if (volinfo != NULL)
+        glusterd_volinfo_unref(volinfo);
+    if (overrides != NULL)
+        dict_unref(overrides);
+    gf_msg_debug(this ? this->name : "glusterd", 0, "Returning %d", ret);
+    return ret;
+}
+
+/* Run the shd manager for every started volume. Returns 0 when none fails
+ * (also when no volume is started), else the failing manager's result. */
+int
+glusterd_shdsvc_restart()
+{
+    glusterd_volinfo_t *volinfo = NULL;
+    glusterd_volinfo_t *tmp = NULL;
+    int ret = -1;
+    xlator_t *this = THIS;
+    glusterd_conf_t *conf = NULL;
+    glusterd_svc_t *svc = NULL;
+
+    GF_VALIDATE_OR_GOTO("glusterd", this, out);
+    conf = this->private;
+    GF_VALIDATE_OR_GOTO(this->name, conf, out);
+    pthread_mutex_lock(&conf->volume_lock);
+    cds_list_for_each_entry_safe(volinfo, tmp, &conf->volumes, vol_list)
+    {
+        glusterd_volinfo_ref(volinfo);
+        pthread_mutex_unlock(&conf->volume_lock);
+        /* Start per volume shd svc; volume_lock is not held meanwhile */
+        if (volinfo->status == GLUSTERD_STATUS_STARTED) {
+            svc = &(volinfo->shd.svc);
+            ret = svc->manager(svc, volinfo, PROC_START_NO_WAIT);
+            if (ret) {
+                gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_SHD_START_FAIL,
+                       "Couldn't start shd for vol: %s on restart",
+                       volinfo->volname);
+                gf_event(EVENT_SVC_MANAGER_FAILED, "volume=%s;svc_name=%s",
+                         volinfo->volname, svc->name);
+                glusterd_volinfo_unref(volinfo);
+                goto out;
+            }
+        }
+        glusterd_volinfo_unref(volinfo);
+        pthread_mutex_lock(&conf->volume_lock);
+    }
+    pthread_mutex_unlock(&conf->volume_lock);
+    ret = 0; /* no manager call failed, or nothing was started */
+out:
+    return ret;
+}
+
+/*
+ * Stop the shd service of one volume: unlink svc from its process, then stop
+ * the process if svc was its last service, else detach when pid != -1.
+ */
+int
+glusterd_shdsvc_stop(glusterd_svc_t *svc, int sig)
+{
+    int ret = -1;
+    glusterd_svc_proc_t *svc_proc = NULL;
+    glusterd_shdsvc_t *shd = NULL;
+    glusterd_volinfo_t *volinfo = NULL;
+    gf_boolean_t empty = _gf_false;
+    glusterd_conf_t *conf = NULL;
+    int pid = -1;
+
+    conf = THIS->private;
+    GF_VALIDATE_OR_GOTO("glusterd", conf, out);
+    GF_VALIDATE_OR_GOTO("glusterd", svc, out);
+    svc_proc = svc->svc_proc;
+    if (!svc_proc) {
+        /* This can happen when stop was called on a volume that is not
+         * shd compatible. */
+        gf_msg_debug("glusterd", 0, "svc_proc is null, ie shd already stopped");
+        ret = 0;
+        goto out;
+    }
+    /* Get volinfo->shd from svc object */
+    shd = cds_list_entry(svc, glusterd_shdsvc_t, svc);
+    if (!shd) {
+        gf_msg("glusterd", GF_LOG_ERROR, 0, GD_MSG_SHD_OBJ_GET_FAIL,
+               "Failed to get shd object "
+               "from shd service");
+        return -1;
+    }
+    /* Get volinfo from shd */
+    volinfo = cds_list_entry(shd, glusterd_volinfo_t, shd);
+    if (!volinfo) {
+        gf_msg("glusterd", GF_LOG_ERROR, 0, GD_MSG_VOLINFO_GET_FAIL,
+               "Failed to get volinfo from "
+               "from shd");
+        return -1;
+    }
+    glusterd_volinfo_ref(volinfo);
+    pthread_mutex_lock(&conf->attach_lock);
+    {
+        if (!gf_is_service_running(svc->proc.pidfile, &pid)) {
+            gf_msg_debug(THIS->name, 0, "shd isn't running");
+        }
+        cds_list_del_init(&svc->mux_svc);
+        empty = cds_list_empty(&svc_proc->svcs);
+        if (empty) {
+            svc_proc->status = GF_SVC_STOPPING;
+            cds_list_del_init(&svc_proc->svc_proc_list);
+        }
+    }
+    pthread_mutex_unlock(&conf->attach_lock);
+    if (empty) {
+        /* Unref will happen when destroying the connection */
+        glusterd_volinfo_ref(volinfo);
+        svc_proc->data = volinfo;
+        ret = glusterd_svc_stop(svc, sig);
+        if (ret) {
+            glusterd_volinfo_unref(volinfo);
+            goto out;
+        }
+    }
+    if (!empty && pid != -1) {
+        ret = glusterd_detach_svc(svc, volinfo, sig);
+        if (ret)
+            gf_msg(THIS->name, GF_LOG_ERROR, 0, GD_MSG_SVC_STOP_FAIL,
+                   "shd service is failed to detach volume %s from pid %d",
+                   volinfo->volname, glusterd_proc_get_pid(&svc->proc));
+        else
+            gf_msg(THIS->name, GF_LOG_INFO, 0, GD_MSG_SVC_STOP_SUCCESS,
+                   "Shd service is detached for volume %s from pid %d",
+                   volinfo->volname, glusterd_proc_get_pid(&svc->proc));
+    }
+    /* A failed detach is only logged: ret is set to 0 below regardless. */
+    svc->online = _gf_false;
+    (void)glusterd_unlink_file((char *)svc->proc.pidfile);
+    glusterd_shd_svcproc_cleanup(shd);
+    ret = 0;
+    glusterd_volinfo_unref(volinfo);
+out:
+    gf_msg_debug(THIS->name, 0, "Returning %d", ret);
+    return ret;
+}
diff --git a/xlators/mgmt/glusterd/src/glusterd-shd-svc.h b/xlators/mgmt/glusterd/src/glusterd-shd-svc.h
new file mode 100644
index 00000000000..55b409f4b69
--- /dev/null
+++ b/xlators/mgmt/glusterd/src/glusterd-shd-svc.h
@@ -0,0 +1,45 @@
+/*
+   Copyright (c) 2014 Red Hat, Inc. <http://www.redhat.com>
+   This file is part of GlusterFS.
+
+   This file is licensed to you under your choice of the GNU Lesser
+   General Public License, version 3 or any later version (LGPLv3 or
+   later), or the GNU General Public License, version 2 (GPLv2), in all
+   cases as published by the Free Software Foundation.
+*/
+
+#ifndef _GLUSTERD_SHD_SVC_H_
+#define _GLUSTERD_SHD_SVC_H_
+
+#include "glusterd-svc-mgmt.h"
+#include "glusterd.h"
+
+typedef struct glusterd_shdsvc_ glusterd_shdsvc_t;
+struct glusterd_shdsvc_ {
+    glusterd_svc_t svc;    /* generic service state, used as volinfo->shd.svc */
+    gf_boolean_t attached; /* set after a daemon start for the volume succeeds */
+};
+
+void
+glusterd_shdsvc_build(glusterd_svc_t *svc);
+
+int
+glusterd_shdsvc_init(void *data, glusterd_conn_t *mux_conn,
+                     glusterd_svc_proc_t *svc_proc);
+
+int
+glusterd_shdsvc_manager(glusterd_svc_t *svc, void *data, int flags);
+
+int
+glusterd_shdsvc_start(glusterd_svc_t *svc, int flags);
+
+int
+glusterd_shdsvc_reconfigure();
+
+int
+glusterd_shdsvc_restart();
+
+int
+glusterd_shdsvc_stop(glusterd_svc_t *svc, int sig);
+
+#endif
diff --git a/xlators/mgmt/glusterd/src/glusterd-sm.c b/xlators/mgmt/glusterd/src/glusterd-sm.c
new file mode 100644
index 00000000000..bf2d81b644a
--- /dev/null
+++ b/xlators/mgmt/glusterd/src/glusterd-sm.c
@@ -0,0 +1,1622 @@
+/*
+   Copyright (c) 2006-2012 Red Hat, Inc. <http://www.redhat.com>
+   This file is part of GlusterFS.
+
+   This file is licensed to you under your choice of the GNU Lesser
+   General Public License, version 3 or any later version (LGPLv3 or
+   later), or the GNU General Public License, version 2 (GPLv2), in all
+   cases as published by the Free Software Foundation.
+*/
+
+#include <time.h>
+#include <sys/uio.h>
+#include <sys/resource.h>
+
+#include <libgen.h>
+#include <glusterfs/compat-uuid.h>
+
+#include "fnmatch.h"
+#include <glusterfs/xlator.h>
+#include "protocol-common.h"
+#include "glusterd.h"
+#include <glusterfs/call-stub.h>
+#include <glusterfs/defaults.h>
+#include <glusterfs/list.h>
+#include "glusterd-messages.h"
+#include <glusterfs/dict.h>
+#include <glusterfs/compat.h>
+#include <glusterfs/compat-errno.h>
+#include <glusterfs/statedump.h>
+#include "glusterd-sm.h"
+#include "glusterd-op-sm.h"
+#include "glusterd-utils.h"
+#include "glusterd-store.h"
+#include "glusterd-svc-helper.h"
+#include "glusterd-snapshot-utils.h"
+#include "glusterd-server-quorum.h"
+#include "glusterd-gfproxyd-svc-helper.h"
+
+char local_node_hostname[PATH_MAX] = {
+    0,
+};
+
+static struct cds_list_head gd_friend_sm_queue;
+
+static char *glusterd_friend_sm_state_names[] = { /* indexed by state */
+    "Establishing Connection",
+    "Probe Sent to Peer",
+    "Probe Received from Peer",
+    "Peer in Cluster",
+    "Accepted peer request",
+    "Sent and Received peer request",
+    "Peer Rejected",
+    "Peer detach in progress",
+    "Probe Received from peer",
+    "Connected to Peer",
+    "Peer is connected and Accepted",
+    "Invalid State"};
+
+static char *glusterd_friend_sm_event_names[] = { /* indexed by event */
+    "GD_FRIEND_EVENT_NONE",
+    "GD_FRIEND_EVENT_PROBE",
+    "GD_FRIEND_EVENT_INIT_FRIEND_REQ",
+    "GD_FRIEND_EVENT_RCVD_ACC",
+    "GD_FRIEND_EVENT_LOCAL_ACC",
+    "GD_FRIEND_EVENT_RCVD_RJT",
+    "GD_FRIEND_EVENT_LOCAL_RJT",
+    "GD_FRIEND_EVENT_RCVD_FRIEND_REQ",
+    "GD_FRIEND_EVENT_INIT_REMOVE_FRIEND",
+    "GD_FRIEND_EVENT_RCVD_REMOVE_FRIEND",
+    "GD_FRIEND_EVENT_REMOVE_FRIEND",
+    "GD_FRIEND_EVENT_CONNECTED",
+    "GD_FRIEND_EVENT_NEW_NAME",
+    "GD_FRIEND_EVENT_MAX"};
+
+/* Map a friend state to its printable name; bad values get the last entry. */
+char *
+glusterd_friend_sm_state_name_get(int state)
+{
+    int valid = (state >= 0 && state < GD_FRIEND_STATE_MAX);
+    return glusterd_friend_sm_state_names[valid ? state : GD_FRIEND_STATE_MAX];
+}
+
+/* Map a friend event to its printable name; bad values get the last entry. */
+char *
+glusterd_friend_sm_event_name_get(int event)
+{
+    int valid = (event >= 0 && event < GD_FRIEND_EVENT_MAX);
+    return glusterd_friend_sm_event_names[valid ? event : GD_FRIEND_EVENT_MAX];
+}
+
+/* Free a probe context together with the hostname it owns; NULL is a no-op. */
+void
+glusterd_destroy_probe_ctx(glusterd_probe_ctx_t *ctx)
+{
+    if (ctx != NULL) {
+        GF_FREE(ctx->hostname);
+        GF_FREE(ctx);
+    }
+}
+
+/* Release a friend-request context: unref its vols dict, free the rest. */
+void
+glusterd_destroy_friend_req_ctx(glusterd_friend_req_ctx_t *ctx)
+{
+    if (ctx != NULL) {
+        if (ctx->vols != NULL)
+            dict_unref(ctx->vols);
+        GF_FREE(ctx->hostname);
+        GF_FREE(ctx);
+    }
+}
+
+void
+glusterd_destroy_friend_update_ctx(glusterd_friend_update_ctx_t *ctx)
+{
+    if (ctx != NULL) { /* a NULL context has nothing to release */
+        GF_FREE(ctx->hostname);
+        GF_FREE(ctx);
+    }
+}
+
+/* Tell every connected peer to drop @hostname from its friend list.
+ *
+ * Builds one GD_FRIEND_UPDATE_DEL request dict ("op", "hostname" and a
+ * "count" of 0) and passes it to each connected peer's GLUSTERD_FRIEND_UPDATE
+ * procedure. @uuid is accepted for interface compatibility but is not read.
+ * Returns the result of the last dict or procedure call made (0 if none).
+ */
+int
+glusterd_broadcast_friend_delete(char *hostname, uuid_t uuid)
+{
+    int ret = 0;
+    rpc_clnt_procedure_t *proc = NULL;
+    xlator_t *this = NULL;
+    glusterd_peerinfo_t *peerinfo = NULL;
+    glusterd_conf_t *priv = NULL;
+    dict_t *friends = NULL;
+    char key[64] = {0};
+    int keylen;
+    int32_t count = 0;
+
+    this = THIS;
+    priv = this->private;
+
+    GF_ASSERT(priv);
+
+    friends = dict_new();
+    if (!friends) {
+        gf_smsg(this->name, GF_LOG_ERROR, errno, GD_MSG_DICT_CREATE_FAIL, NULL);
+        goto out;
+    }
+
+    keylen = snprintf(key, sizeof(key), "op");
+    ret = dict_set_int32n(friends, key, keylen, GD_FRIEND_UPDATE_DEL);
+    if (ret) {
+        gf_smsg(this->name, GF_LOG_ERROR, errno, GD_MSG_DICT_SET_FAILED,
+                "Key=%s", key, NULL);
+        goto out;
+    }
+
+    keylen = snprintf(key, sizeof(key), "hostname");
+    ret = dict_set_strn(friends, key, keylen, hostname);
+    if (ret) {
+        gf_smsg(this->name, GF_LOG_ERROR, errno, GD_MSG_DICT_SET_FAILED,
+                "Key=%s", key, NULL);
+        goto out;
+    }
+
+    /* "key" still holds "hostname" here, so name the failing key literally. */
+    ret = dict_set_int32n(friends, "count", SLEN("count"), count);
+    if (ret) {
+        gf_smsg(this->name, GF_LOG_ERROR, errno, GD_MSG_DICT_SET_FAILED,
+                "Key=count", NULL);
+        goto out;
+    }
+
+    RCU_READ_LOCK;
+    cds_list_for_each_entry_rcu(peerinfo, &priv->peers, uuid_list)
+    {
+        if (!peerinfo->connected || !peerinfo->peer)
+            continue;
+
+        /* Setting a direct reference to peerinfo in the dict is okay as
+         * it is only going to be used within this read critical section
+         * (in glusterd_rpc_friend_update)
+         */
+        ret = dict_set_static_ptr(friends, "peerinfo", peerinfo);
+        if (ret) {
+            RCU_READ_UNLOCK;
+            gf_msg("glusterd", GF_LOG_ERROR, 0, GD_MSG_DICT_SET_FAILED,
+                   "failed to set peerinfo");
+            goto out;
+        }
+
+        proc = &peerinfo->peer->proctable[GLUSTERD_FRIEND_UPDATE];
+        if (proc->fn) {
+            ret = proc->fn(NULL, this, friends);
+        }
+    }
+
+    RCU_READ_UNLOCK;
+out:
+    if (friends)
+        dict_unref(friends);
+
+    gf_msg_debug("glusterd", 0, "Returning with %d", ret);
+    return ret;
+}
+
+/* No-op state machine action: logs at debug level and reports success. */
+static int
+glusterd_ac_none(glusterd_friend_sm_event_t *event, void *ctx)
+{
+    const int rc = 0;
+
+    gf_msg_debug("glusterd", 0, "Returning with %d", rc);
+    return rc;
+}
+
+/* Action for events that are not expected in the current state: log the
+ * numeric event type as an error, then return 0. */
+static int
+glusterd_ac_error(glusterd_friend_sm_event_t *event, void *ctx)
+{
+    gf_msg("glusterd", GF_LOG_ERROR, 0, GD_MSG_AC_ERROR, "Received event %d ",
+           event->event);
+
+    return 0;
+}
+
+static int
+glusterd_ac_reverse_probe_begin(glusterd_friend_sm_event_t *event, void *ctx)
+{
+    int ret = 0;
+    glusterd_peerinfo_t *peerinfo = NULL;
+    glusterd_friend_sm_event_t *new_event = NULL;
+    glusterd_probe_ctx_t *new_ev_ctx = NULL;
+    /* Queue a GD_FRIEND_EVENT_PROBE event aimed back at the event's peer. */
+    GF_ASSERT(event);
+    GF_ASSERT(ctx);
+
+    new_ev_ctx = GF_CALLOC(1, sizeof(*new_ev_ctx), gf_gld_mt_probe_ctx_t);
+    /* The allocation result is only checked further down, under the lock. */
+    RCU_READ_LOCK;
+
+    peerinfo = glusterd_peerinfo_find(event->peerid, event->peername);
+    if (!peerinfo) {
+        RCU_READ_UNLOCK;
+        ret = -1;
+        gf_msg(THIS->name, GF_LOG_ERROR, 0, GD_MSG_PEER_NOT_FOUND,
+               "Could not find peer %s(%s)", event->peername,
+               uuid_utoa(event->peerid));
+        goto out;
+    }
+
+    ret = glusterd_friend_sm_new_event(GD_FRIEND_EVENT_PROBE, &new_event);
+
+    if (ret) {
+        RCU_READ_UNLOCK;
+        gf_msg("glusterd", GF_LOG_ERROR, 0, GD_MSG_EVENT_NEW_GET_FAIL,
+               "Unable to get new new_event");
+        ret = -1;
+        goto out;
+    }
+
+    if (!new_ev_ctx) {
+        RCU_READ_UNLOCK;
+        ret = -1;
+        goto out;
+    }
+    /* Copy what the new event needs out of peerinfo while still locked. */
+    new_ev_ctx->hostname = gf_strdup(peerinfo->hostname);
+    new_ev_ctx->port = peerinfo->port;
+    new_ev_ctx->req = NULL;
+
+    new_event->peername = gf_strdup(peerinfo->hostname);
+    gf_uuid_copy(new_event->peerid, peerinfo->uuid);
+    new_event->ctx = new_ev_ctx;
+
+    ret = glusterd_friend_sm_inject_event(new_event);
+
+    RCU_READ_UNLOCK;
+
+    if (ret) {
+        gf_msg("glusterd", GF_LOG_ERROR, 0, GD_MSG_EVENT_INJECT_FAIL,
+               "Unable to inject new_event %d, "
+               "ret = %d",
+               new_event->event, ret);
+    }
+    /* On any failure the event and context built here are freed again. */
+out:
+    if (ret) {
+        if (new_event)
+            GF_FREE(new_event->peername);
+        GF_FREE(new_event);
+        if (new_ev_ctx)
+            GF_FREE(new_ev_ctx->hostname);
+        GF_FREE(new_ev_ctx);
+    }
+    gf_msg_debug("glusterd", 0, "returning with %d", ret);
+    return ret;
+}
+
+static int
+glusterd_ac_friend_add(glusterd_friend_sm_event_t *event, void *ctx)
+{
+    int ret = 0;
+    glusterd_peerinfo_t *peerinfo = NULL;
+    rpc_clnt_procedure_t *proc = NULL;
+    call_frame_t *frame = NULL;
+    glusterd_conf_t *conf = NULL;
+    xlator_t *this = NULL;
+    /* Call the peer's GLUSTERD_FRIEND_ADD procedure with this event. */
+    GF_ASSERT(event);
+
+    this = THIS;
+    conf = this->private;
+
+    GF_ASSERT(conf);
+
+    RCU_READ_LOCK;
+
+    peerinfo = glusterd_peerinfo_find(event->peerid, event->peername);
+    if (!peerinfo) {
+        RCU_READ_UNLOCK;
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_PEER_NOT_FOUND,
+               "Could not find peer %s(%s)", event->peername,
+               uuid_utoa(event->peerid));
+        goto out;
+    }
+    /* NOTE(review): the early exits here leave ret at 0 (reported success). */
+    if (!peerinfo->peer) {
+        RCU_READ_UNLOCK;
+        goto out;
+    }
+    proc = &peerinfo->peer->proctable[GLUSTERD_FRIEND_ADD];
+    if (proc->fn) {
+        frame = create_frame(this, this->ctx->pool);
+        if (!frame) {
+            RCU_READ_UNLOCK;
+            goto out;
+        }
+        frame->local = ctx;
+        ret = proc->fn(frame, this, event);
+    }
+
+    RCU_READ_UNLOCK;
+out:
+    if (ret && frame)
+        STACK_DESTROY(frame->root);
+    /* The frame is destroyed here only when proc->fn reported a failure. */
+    gf_msg_debug("glusterd", 0, "Returning with %d", ret);
+    return ret;
+}
+
+static int
+glusterd_ac_friend_probe(glusterd_friend_sm_event_t *event, void *ctx)
+{
+    int ret = -1;
+    rpc_clnt_procedure_t *proc = NULL;
+    call_frame_t *frame = NULL;
+    glusterd_conf_t *conf = NULL;
+    xlator_t *this = NULL;
+    glusterd_probe_ctx_t *probe_ctx = NULL;
+    glusterd_peerinfo_t *peerinfo = NULL;
+    dict_t *dict = NULL;
+    /* Call GLUSTERD_PROBE_QUERY with the probe ctx's hostname and port. */
+    GF_ASSERT(ctx);
+
+    probe_ctx = ctx;
+
+    this = THIS;
+
+    GF_ASSERT(this);
+
+    conf = this->private;
+
+    GF_ASSERT(conf);
+    /* Everything from the lookup to proc->fn runs under the RCU read lock. */
+    RCU_READ_LOCK;
+    peerinfo = glusterd_peerinfo_find(NULL, probe_ctx->hostname);
+    if (peerinfo == NULL) {
+        // We should not reach this state ideally
+        gf_smsg(this->name, GF_LOG_ERROR, errno, GD_MSG_PEER_NOT_FOUND, NULL);
+        ret = -1;
+        goto unlock;
+    }
+
+    if (!peerinfo->peer) {
+        gf_smsg(this->name, GF_LOG_ERROR, errno, GD_MSG_PEER_ADDRESS_GET_FAIL,
+                NULL);
+        goto unlock;
+    }
+    proc = &peerinfo->peer->proctable[GLUSTERD_PROBE_QUERY];
+    if (proc->fn) {
+        frame = create_frame(this, this->ctx->pool);
+        if (!frame) {
+            gf_smsg(this->name, GF_LOG_ERROR, errno, GD_MSG_FRAME_CREATE_FAIL,
+                    NULL);
+            goto unlock;
+        }
+        frame->local = ctx;
+        dict = dict_new();
+        if (!dict) {
+            gf_smsg(this->name, GF_LOG_ERROR, errno, GD_MSG_DICT_CREATE_FAIL,
+                    NULL);
+            goto unlock;
+        }
+        ret = dict_set_strn(dict, "hostname", SLEN("hostname"),
+                            probe_ctx->hostname);
+        if (ret) {
+            gf_smsg(this->name, GF_LOG_ERROR, errno, GD_MSG_DICT_SET_FAILED,
+                    "Key=hostname", NULL);
+            goto unlock;
+        }
+
+        ret = dict_set_int32n(dict, "port", SLEN("port"), probe_ctx->port);
+        if (ret) {
+            gf_smsg(this->name, GF_LOG_ERROR, errno, GD_MSG_DICT_SET_FAILED,
+                    "Key=port", NULL);
+            goto unlock;
+        }
+
+        /* The peerinfo reference being set here is going to be used
+         * only within this critical section, in glusterd_rpc_probe
+         * (ie. proc->fn).
+         */
+        ret = dict_set_static_ptr(dict, "peerinfo", peerinfo);
+        if (ret) {
+            RCU_READ_UNLOCK;
+            gf_msg("glusterd", GF_LOG_ERROR, 0, GD_MSG_DICT_SET_FAILED,
+                   "failed to set peerinfo");
+            goto out;
+        }
+
+        ret = proc->fn(frame, this, dict);
+        if (ret)
+            goto unlock;
+    }
+unlock:
+    RCU_READ_UNLOCK;
+out:
+
+    if (dict)
+        dict_unref(dict);
+    gf_msg_debug("glusterd", 0, "Returning with %d", ret);
+    /* Destroy the frame only when ret reports a failure. */
+    if (ret && frame)
+        STACK_DESTROY(frame->root);
+
+    return ret;
+}
+
+static int
+glusterd_ac_send_friend_remove_req(glusterd_friend_sm_event_t *event,
+                                   void *data)
+{
+    int ret = 0;
+    glusterd_peerinfo_t *peerinfo = NULL;
+    rpc_clnt_procedure_t *proc = NULL;
+    call_frame_t *frame = NULL;
+    glusterd_conf_t *conf = NULL;
+    xlator_t *this = NULL;
+    glusterd_friend_sm_event_type_t event_type = GD_FRIEND_EVENT_NONE;
+    glusterd_probe_ctx_t *ctx = NULL;
+    glusterd_friend_sm_event_t *new_event = NULL;
+    /* Call GLUSTERD_FRIEND_REMOVE, or inject REMOVE_FRIEND if disconnected. */
+    GF_ASSERT(event);
+
+    this = THIS;
+    conf = this->private;
+
+    GF_ASSERT(conf);
+
+    RCU_READ_LOCK;
+
+    peerinfo = glusterd_peerinfo_find(event->peerid, event->peername);
+    if (!peerinfo) {
+        RCU_READ_UNLOCK;
+        ret = -1;
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_PEER_NOT_FOUND,
+               "Could not find peer %s(%s)", event->peername,
+               uuid_utoa(event->peerid));
+        goto out;
+    }
+    ctx = event->ctx;
+
+    if (!peerinfo->connected) {
+        event_type = GD_FRIEND_EVENT_REMOVE_FRIEND;
+        /* Peer is disconnected: inject the removal event locally instead. */
+        ret = glusterd_friend_sm_new_event(event_type, &new_event);
+        /* NOTE(review): peername is not gf_strdup'd here, unlike elsewhere. */
+        if (!ret) {
+            new_event->peername = peerinfo->hostname;
+            gf_uuid_copy(new_event->peerid, peerinfo->uuid);
+            ret = glusterd_friend_sm_inject_event(new_event);
+        } else {
+            gf_msg("glusterd", GF_LOG_ERROR, 0, GD_MSG_EVENT_NEW_GET_FAIL,
+                   "Unable to get event");
+        }
+        /* With a ctx: send the deprobe response, broadcast, then free ctx. */
+        if (ctx) {
+            ret = glusterd_xfer_cli_deprobe_resp(ctx->req, ret, 0, NULL,
+                                                 ctx->hostname, ctx->dict);
+            glusterd_broadcast_friend_delete(ctx->hostname, NULL);
+            glusterd_destroy_probe_ctx(ctx);
+        }
+        goto unlock;
+    }
+
+    if (!peerinfo->peer) {
+        gf_smsg(this->name, GF_LOG_ERROR, errno, GD_MSG_PEER_ADDRESS_GET_FAIL,
+                NULL);
+        goto unlock;
+    }
+    proc = &peerinfo->peer->proctable[GLUSTERD_FRIEND_REMOVE];
+    if (proc->fn) {
+        frame = create_frame(this, this->ctx->pool);
+        if (!frame) {
+            gf_smsg(this->name, GF_LOG_ERROR, errno, GD_MSG_FRAME_CREATE_FAIL,
+                    NULL);
+            goto unlock;
+        }
+        frame->local = data;
+        ret = proc->fn(frame, this, event);
+    }
+
+unlock:
+    RCU_READ_UNLOCK;
+out:
+
+    gf_msg_debug("glusterd", 0, "Returning with %d", ret);
+    /* Destroy the frame only when ret reports a failure. */
+    if (ret && frame)
+        STACK_DESTROY(frame->root);
+
+    return ret;
+}
+
+/* True for the triggering peer itself and for any BEFRIENDED peer. */
+static gf_boolean_t
+glusterd_should_update_peer(glusterd_peerinfo_t *peerinfo,
+                            glusterd_peerinfo_t *cur_peerinfo)
+{
+    if (peerinfo != cur_peerinfo &&
+        peerinfo->state.state != GD_FRIEND_STATE_BEFRIENDED)
+        return _gf_false;
+    return _gf_true;
+}
+
+static int
+glusterd_ac_send_friend_update(glusterd_friend_sm_event_t *event, void *ctx)
+{
+    int ret = 0;
+    glusterd_peerinfo_t *cur_peerinfo = NULL;
+    glusterd_peerinfo_t *peerinfo = NULL;
+    rpc_clnt_procedure_t *proc = NULL;
+    xlator_t *this = NULL;
+    glusterd_friend_update_ctx_t ev_ctx = {{0}};
+    glusterd_conf_t *priv = NULL;
+    dict_t *friends = NULL;
+    char key[64] = {
+        0,
+    };
+    int keylen;
+    int32_t count = 0;
+    /* Send the friend list to every connected peer that qualifies. */
+    GF_ASSERT(event);
+
+    this = THIS;
+    priv = this->private;
+
+    GF_ASSERT(priv);
+
+    keylen = snprintf(key, sizeof(key), "op");
+    friends = dict_new();
+
+    RCU_READ_LOCK;
+
+    cur_peerinfo = glusterd_peerinfo_find(event->peerid, event->peername);
+    if (!cur_peerinfo) {
+        RCU_READ_UNLOCK;
+        ret = -1;
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_PEER_NOT_FOUND,
+               "Could not find peer %s(%s)", event->peername,
+               uuid_utoa(event->peerid));
+        goto out;
+    }
+    /* NOTE(review): dict_new() failure leaves ret at 0; confirm intended. */
+    if (!friends) {
+        gf_smsg(this->name, GF_LOG_ERROR, errno, GD_MSG_DICT_CREATE_FAIL, NULL);
+        goto unlock;
+    }
+
+    ev_ctx.op = GD_FRIEND_UPDATE_ADD;
+    ret = dict_set_int32n(friends, key, keylen, ev_ctx.op);
+    if (ret) {
+        gf_smsg(this->name, GF_LOG_ERROR, errno, GD_MSG_DICT_SET_FAILED,
+                "Key=%s", key, NULL);
+        goto unlock;
+    }
+    /* First pass: add each qualifying peer to the dict as "friend<N>". */
+    cds_list_for_each_entry_rcu(peerinfo, &priv->peers, uuid_list)
+    {
+        if (!glusterd_should_update_peer(peerinfo, cur_peerinfo))
+            continue;
+
+        count++;
+
+        snprintf(key, sizeof(key), "friend%d", count);
+        ret = gd_add_friend_to_dict(peerinfo, friends, key);
+        if (ret)
+            goto unlock;
+    }
+
+    ret = dict_set_int32n(friends, "count", SLEN("count"), count);
+    if (ret) {
+        gf_smsg(this->name, GF_LOG_ERROR, errno, GD_MSG_DICT_SET_FAILED,
+                "Key=count", NULL);
+        goto unlock;
+    }
+    /* Second pass: send the dict to each connected, qualifying peer. */
+    cds_list_for_each_entry_rcu(peerinfo, &priv->peers, uuid_list)
+    {
+        if (!peerinfo->connected || !peerinfo->peer)
+            continue;
+
+        if (!glusterd_should_update_peer(peerinfo, cur_peerinfo))
+            continue;
+
+        ret = dict_set_static_ptr(friends, "peerinfo", peerinfo);
+        if (ret) {
+            RCU_READ_UNLOCK;
+            gf_msg("glusterd", GF_LOG_ERROR, 0, GD_MSG_DICT_SET_FAILED,
+                   "failed to set peerinfo");
+            goto out;
+        }
+
+        proc = &peerinfo->peer->proctable[GLUSTERD_FRIEND_UPDATE];
+        if (proc->fn) {
+            ret = proc->fn(NULL, this, friends);
+        }
+    }
+
+unlock:
+    RCU_READ_UNLOCK;
+out:
+
+    if (friends)
+        dict_unref(friends);
+
+    gf_msg_debug("glusterd", 0, "Returning with %d", ret);
+    return ret;
+}
+
+/* ac_update_friend only sends friend update to the friend that caused this
+ * event to happen
+ */
+static int
+glusterd_ac_update_friend(glusterd_friend_sm_event_t *event, void *ctx)
+{
+    int ret = 0;
+    glusterd_peerinfo_t *cur_peerinfo = NULL;
+    glusterd_peerinfo_t *peerinfo = NULL;
+    rpc_clnt_procedure_t *proc = NULL;
+    xlator_t *this = NULL;
+    glusterd_friend_update_ctx_t ev_ctx = {{0}};
+    glusterd_conf_t *priv = NULL;
+    dict_t *friends = NULL;
+    char key[64] = {
+        0,
+    };
+    int keylen;
+    int32_t count = 0;
+    /* Build the friend list and send it only to the event's own peer. */
+    GF_ASSERT(event);
+
+    this = THIS;
+    priv = this->private;
+
+    GF_ASSERT(priv);
+
+    friends = dict_new();
+    keylen = snprintf(key, sizeof(key), "op");
+
+    RCU_READ_LOCK;
+
+    cur_peerinfo = glusterd_peerinfo_find(event->peerid, event->peername);
+    if (!cur_peerinfo) {
+        RCU_READ_UNLOCK;
+        ret = -1;
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_PEER_NOT_FOUND,
+               "Could not find peer %s(%s)", event->peername,
+               uuid_utoa(event->peerid));
+        goto out;
+    }
+
+    /* Bail out early if peer is not connected.
+     * We cannot send requests to the peer until we have established our
+     * client connection to it.
+     */
+    if (!cur_peerinfo->connected || !cur_peerinfo->peer) {
+        ret = 0;
+        goto unlock;
+    }
+    /* Still inside the read-side section: must leave through "unlock". */
+    if (!friends) {
+        gf_smsg(this->name, GF_LOG_ERROR, errno, GD_MSG_DICT_CREATE_FAIL, NULL);
+        goto unlock;
+    }
+
+    ev_ctx.op = GD_FRIEND_UPDATE_ADD;
+    ret = dict_set_int32n(friends, key, keylen, ev_ctx.op);
+    if (ret) {
+        gf_smsg(this->name, GF_LOG_ERROR, errno, GD_MSG_DICT_SET_FAILED,
+                "Key=%s", key, NULL);
+        goto unlock;
+    }
+    /* Add each qualifying peer to the dict under the key "friend<N>". */
+    cds_list_for_each_entry_rcu(peerinfo, &priv->peers, uuid_list)
+    {
+        if (!glusterd_should_update_peer(peerinfo, cur_peerinfo))
+            continue;
+
+        count++;
+
+        snprintf(key, sizeof(key), "friend%d", count);
+        ret = gd_add_friend_to_dict(peerinfo, friends, key);
+        if (ret)
+            goto unlock;
+    }
+
+    ret = dict_set_int32n(friends, "count", SLEN("count"), count);
+    if (ret) {
+        gf_smsg(this->name, GF_LOG_ERROR, errno, GD_MSG_DICT_SET_FAILED,
+                "Key=count", NULL);
+        goto unlock;
+    }
+
+    ret = dict_set_static_ptr(friends, "peerinfo", cur_peerinfo);
+    if (ret) {
+        RCU_READ_UNLOCK;
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_DICT_SET_FAILED,
+               "failed to set peerinfo");
+        goto out;
+    }
+
+    proc = &cur_peerinfo->peer->proctable[GLUSTERD_FRIEND_UPDATE];
+    if (proc->fn)
+        ret = proc->fn(NULL, this, friends);
+
+    gf_msg_debug(this->name, 0, "Returning with %d", ret);
+
+unlock:
+    RCU_READ_UNLOCK;
+out:
+
+    if (friends)
+        dict_unref(friends);
+
+    return ret;
+}
+
+/* Clean up stale volumes on the peer being detached. The volumes which have
+ * bricks on other peers are stale with respect to the detached peer.
+ */
+static void
+glusterd_peer_detach_cleanup(glusterd_conf_t *priv)
+{
+    int ret = -1;
+    glusterd_volinfo_t *volinfo = NULL;
+    glusterd_volinfo_t *tmp_volinfo = NULL;
+    glusterd_svc_t *svc = NULL;
+    /* Walk priv->volumes, deleting the stale ones after stopping daemons. */
+    GF_ASSERT(priv);
+
+    cds_list_for_each_entry_safe(volinfo, tmp_volinfo, &priv->volumes, vol_list)
+    {
+        /* The peer detach checks make sure that, at this point in the
+         * detach process, there are only volumes contained completely
+         * within or completely outside the detached peer.
+         * The only stale volumes at this point are the ones
+         * completely outside the peer and can be safely deleted.
+         */
+        if (!glusterd_friend_contains_vol_bricks(volinfo, MY_UUID)) {
+            gf_msg(THIS->name, GF_LOG_INFO, 0, GD_MSG_STALE_VOL_DELETE_INFO,
+                   "Deleting stale volume %s", volinfo->volname);
+            /* Each step below only logs on failure and then carries on. */
+            /*Stop snapd daemon service if snapd daemon is running*/
+            if (!volinfo->is_snap_volume) {
+                svc = &(volinfo->snapd.svc);
+                ret = svc->stop(svc, SIGTERM);
+                if (ret) {
+                    gf_msg(THIS->name, GF_LOG_ERROR, 0, GD_MSG_SVC_STOP_FAIL,
+                           "Failed "
+                           "to stop snapd daemon service");
+                }
+            }
+
+            if (glusterd_is_shd_compatible_volume(volinfo)) {
+                svc = &(volinfo->shd.svc);
+                ret = svc->stop(svc, SIGTERM);
+                if (ret) {
+                    gf_msg(THIS->name, GF_LOG_ERROR, 0, GD_MSG_SVC_STOP_FAIL,
+                           "Failed "
+                           "to stop shd daemon service");
+                }
+            }
+
+            if (glusterd_is_gfproxyd_enabled(volinfo)) {
+                svc = &(volinfo->gfproxyd.svc);
+                ret = svc->stop(svc, SIGTERM);
+                if (ret) {
+                    gf_msg(THIS->name, GF_LOG_ERROR, 0, GD_MSG_SVC_STOP_FAIL,
+                           "Failed "
+                           "to stop gfproxyd daemon service");
+                }
+            }
+
+            ret = glusterd_cleanup_snaps_for_volume(volinfo);
+            if (ret) {
+                gf_msg(THIS->name, GF_LOG_ERROR, 0, GD_MSG_VOL_DELETE_FAIL,
+                       "Error deleting snapshots for volume %s",
+                       volinfo->volname);
+            }
+
+            ret = glusterd_delete_volume(volinfo);
+            if (ret) {
+                gf_msg(THIS->name, GF_LOG_ERROR, 0,
+                       GD_MSG_STALE_VOL_REMOVE_FAIL,
+                       "Error deleting stale volume");
+            }
+        }
+    }
+
+    /*Reconfigure all daemon services upon peer detach*/
+    ret = glusterd_svcs_reconfigure(NULL);
+    if (ret) {
+        gf_msg(THIS->name, GF_LOG_ERROR, 0, GD_MSG_SVC_STOP_FAIL,
+               "Failed to reconfigure all daemon services.");
+    }
+}
+
+static int
+glusterd_ac_handle_friend_remove_req(glusterd_friend_sm_event_t *event,
+                                     void *ctx)
+{
+    int ret = 0;
+    glusterd_peerinfo_t *peerinfo = NULL;
+    glusterd_friend_req_ctx_t *ev_ctx = NULL;
+    glusterd_friend_sm_event_t *new_event = NULL;
+    glusterd_conf_t *priv = NULL;
+    /* Respond to the request, then queue REMOVE_FRIEND for every peer. */
+    GF_ASSERT(ctx);
+    ev_ctx = ctx;
+
+    priv = THIS->private;
+    GF_ASSERT(priv);
+
+    ret = glusterd_xfer_friend_remove_resp(ev_ctx->req, ev_ctx->hostname,
+                                           ev_ctx->port);
+    /* NOTE(review): the response's ret is overwritten inside the loop. */
+    RCU_READ_LOCK;
+    cds_list_for_each_entry_rcu(peerinfo, &priv->peers, uuid_list)
+    {
+        ret = glusterd_friend_sm_new_event(GD_FRIEND_EVENT_REMOVE_FRIEND,
+                                           &new_event);
+        if (ret) {
+            RCU_READ_UNLOCK;
+            goto out;
+        }
+
+        new_event->peername = gf_strdup(peerinfo->hostname);
+        gf_uuid_copy(new_event->peerid, peerinfo->uuid);
+
+        ret = glusterd_friend_sm_inject_event(new_event);
+        if (ret) {
+            RCU_READ_UNLOCK;
+            goto out;
+        }
+        /* Injected: clear the pointer so "out" does not free it. */
+        new_event = NULL;
+    }
+    RCU_READ_UNLOCK;
+    /* Reached only when every event above was injected successfully. */
+    glusterd_peer_detach_cleanup(priv);
+out:
+    if (new_event)
+        GF_FREE(new_event->peername);
+    GF_FREE(new_event);
+
+    gf_msg_debug(THIS->name, 0, "Returning with %d", ret);
+    return ret;
+}
+
+static int
+glusterd_ac_friend_remove(glusterd_friend_sm_event_t *event, void *ctx)
+{
+    int ret = -1;
+    glusterd_peerinfo_t *peerinfo = NULL;
+    /* Clean up the peer's volumes, then its peerinfo; always returns 0. */
+    GF_ASSERT(event);
+
+    RCU_READ_LOCK;
+
+    peerinfo = glusterd_peerinfo_find(event->peerid, event->peername);
+    if (!peerinfo) {
+        RCU_READ_UNLOCK;
+        gf_msg(THIS->name, GF_LOG_ERROR, 0, GD_MSG_PEER_NOT_FOUND,
+               "Could not find peer %s(%s)", event->peername,
+               uuid_utoa(event->peerid));
+        goto out;
+    }
+    ret = glusterd_friend_remove_cleanup_vols(peerinfo->uuid);
+    RCU_READ_UNLOCK;
+    if (ret)
+        gf_msg(THIS->name, GF_LOG_WARNING, 0, GD_MSG_VOL_CLEANUP_FAIL,
+               "Volumes cleanup failed");
+
+    /* Exiting read critical section as glusterd_peerinfo_cleanup calls
+     * synchronize_rcu before freeing the peerinfo
+     */
+    /* NOTE(review): peerinfo is used below after the read lock is dropped. */
+    ret = glusterd_peerinfo_cleanup(peerinfo);
+    if (ret) {
+        gf_msg(THIS->name, GF_LOG_ERROR, 0, GD_MSG_PEER_DETACH_CLEANUP_FAIL,
+               "Cleanup returned: %d", ret);
+    }
+out:
+    return 0;
+}
+
+/*static int
+glusterd_ac_none (void *ctx)
+{
+        int ret = 0;
+
+        gf_log ("", GF_LOG_DEBUG, "Returning with %d", ret);
+
+        return ret;
+}*/
+
+static int
+glusterd_ac_handle_friend_add_req(glusterd_friend_sm_event_t *event, void *ctx)
+{
+    int ret = 0;
+    uuid_t uuid;
+    glusterd_peerinfo_t *peerinfo = NULL;
+    glusterd_friend_req_ctx_t *ev_ctx = NULL;
+    glusterd_friend_update_ctx_t *new_ev_ctx = NULL;
+    glusterd_friend_sm_event_t *new_event = NULL;
+    glusterd_friend_sm_event_type_t event_type = GD_FRIEND_EVENT_NONE;
+    glusterd_conf_t *conf = NULL;
+    int status = 0;
+    int32_t op_ret = -1;
+    int32_t op_errno = 0;
+    xlator_t *this = NULL;
+    char *hostname = NULL;
+    /* Compare the peer's data, queue LOCAL_ACC/LOCAL_RJT, send the reply. */
+    this = THIS;
+    GF_ASSERT(this);
+
+    GF_ASSERT(ctx);
+    ev_ctx = ctx;
+    gf_uuid_copy(uuid, ev_ctx->uuid);
+
+    RCU_READ_LOCK;
+    peerinfo = glusterd_peerinfo_find(event->peerid, event->peername);
+    if (!peerinfo) {
+        RCU_READ_UNLOCK;
+        ret = -1;
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_PEER_NOT_FOUND,
+               "Could not find peer %s(%s)", event->peername,
+               uuid_utoa(event->peerid));
+        goto out;
+    }
+
+    /* TODO: How do you do an atomic copy of uuid_t */
+    /* TODO: Updating within a read-critical section is also invalid
+     *       Update properly with updater synchronization
+     */
+    gf_uuid_copy(peerinfo->uuid, ev_ctx->uuid);
+
+    RCU_READ_UNLOCK;
+
+    conf = this->private;
+    GF_ASSERT(conf);
+
+    /* Passing the peername from the event. glusterd_compare_friend_data
+     * updates volumes and will use synchronize_rcu. If we were to pass
+     * peerinfo->hostname, we would have to do it under a read critical
+     * section which would lead to a deadlock
+     */
+    // Build comparison logic here.
+    pthread_mutex_lock(&conf->import_volumes);
+    {
+        ret = glusterd_compare_friend_data(ev_ctx->vols, &status,
+                                           event->peername);
+        if (ret) {
+            pthread_mutex_unlock(&conf->import_volumes);
+            goto out;
+        }
+
+        if (GLUSTERD_VOL_COMP_RJT != status) {
+            event_type = GD_FRIEND_EVENT_LOCAL_ACC;
+            op_ret = 0;
+        } else {
+            event_type = GD_FRIEND_EVENT_LOCAL_RJT;
+            op_errno = GF_PROBE_VOLUME_CONFLICT;
+            op_ret = -1;
+        }
+
+        /* Compare missed_snapshot list with the peer *
+         * if volume comparison is successful */
+        if ((op_ret == 0) && (conf->op_version >= GD_OP_VERSION_3_6_0)) {
+            ret = glusterd_import_friend_missed_snap_list(ev_ctx->vols);
+            if (ret) {
+                gf_msg(this->name, GF_LOG_ERROR, 0,
+                       GD_MSG_MISSED_SNAP_LIST_STORE_FAIL,
+                       "Failed to import peer's missed_snaps_list.");
+                event_type = GD_FRIEND_EVENT_LOCAL_RJT;
+                op_errno = GF_PROBE_MISSED_SNAP_CONFLICT;
+                op_ret = -1;
+            }
+
+            /* glusterd_compare_friend_snapshots and functions only require
+             * a peers hostname and uuid. It also does updates, which
+             * require use of synchronize_rcu. So we pass the hostname and
+             * id from the event instead of the peerinfo object to prevent
+             * deadlocks as above.
+             */
+            ret = glusterd_compare_friend_snapshots(
+                ev_ctx->vols, event->peername, event->peerid);
+            if (ret) {
+                gf_msg(this->name, GF_LOG_ERROR, 0,
+                       GD_MSG_SNAP_COMPARE_CONFLICT,
+                       "Conflict in comparing peer's snapshots");
+                event_type = GD_FRIEND_EVENT_LOCAL_RJT;
+                op_errno = GF_PROBE_SNAP_CONFLICT;
+                op_ret = -1;
+            }
+        }
+    }
+    pthread_mutex_unlock(&conf->import_volumes);
+    ret = glusterd_friend_sm_new_event(event_type, &new_event);
+    if (ret) {
+        gf_msg(this->name, GF_LOG_ERROR, ENOMEM, GD_MSG_NO_MEMORY,
+               "Out of Memory");
+        goto out;
+    }
+
+    new_event->peername = gf_strdup(event->peername);
+    gf_uuid_copy(new_event->peerid, event->peerid);
+
+    new_ev_ctx = GF_CALLOC(1, sizeof(*new_ev_ctx),
+                           gf_gld_mt_friend_update_ctx_t);
+    if (!new_ev_ctx) {
+        ret = -1;
+        goto out;
+    }
+
+    gf_uuid_copy(new_ev_ctx->uuid, ev_ctx->uuid);
+    new_ev_ctx->hostname = gf_strdup(ev_ctx->hostname);
+    new_ev_ctx->op = GD_FRIEND_UPDATE_ADD;
+
+    new_event->ctx = new_ev_ctx;
+
+    ret = dict_get_strn(ev_ctx->vols, "hostname_in_cluster",
+                        SLEN("hostname_in_cluster"), &hostname);
+    if (ret || !hostname) {
+        gf_msg_debug(this->name, 0, "Unable to fetch local hostname from peer");
+    } else if (snprintf(local_node_hostname, sizeof(local_node_hostname), "%s",
+                        hostname) >= sizeof(local_node_hostname)) {
+        gf_msg_debug(this->name, 0, "local_node_hostname truncated");
+        ret = -1;
+        goto out;
+    }
+    /* Once injected, the event is no longer freed here (pointer cleared). */
+    glusterd_friend_sm_inject_event(new_event);
+    new_event = NULL;
+
+    ret = glusterd_xfer_friend_add_resp(ev_ctx->req, ev_ctx->hostname,
+                                        event->peername, ev_ctx->port, op_ret,
+                                        op_errno);
+
+out:
+    if (new_event) {
+        /* Never injected: free the event's name and its update ctx too. */
+        GF_FREE(new_event->peername);
+        glusterd_destroy_friend_update_ctx(new_ev_ctx);
+    }
+    GF_FREE(new_event);
+
+    gf_msg_debug("glusterd", 0, "Returning with %d", ret);
+    return ret;
+}
+
+static int
+glusterd_friend_sm_transition_state(uuid_t peerid, char *peername,
+                                    glusterd_sm_t *state,
+                                    glusterd_friend_sm_event_type_t event_type)
+{
+    /* Log and apply the transition state[event_type].next_state for the peer
+     * matching peerid/peername; returns 0, or -1 when no such peer is found. */
+    glusterd_peerinfo_t *peer = NULL;
+    glusterd_friend_sm_state_t target = GD_FRIEND_STATE_DEFAULT;
+
+    GF_ASSERT(state);
+    GF_ASSERT(peername);
+
+    RCU_READ_LOCK; /* lookup and update share one read-side section */
+    peer = glusterd_peerinfo_find(peerid, peername);
+    if (peer == NULL) {
+        gf_smsg(THIS->name, GF_LOG_ERROR, errno, GD_MSG_PEER_NOT_FOUND, NULL);
+        RCU_READ_UNLOCK;
+        return -1;
+    }
+
+    target = state[event_type].next_state;
+    (void)glusterd_sm_tr_log_transition_add(&peer->sm_log, peer->state.state,
+                                            target, event_type);
+    uatomic_set(&peer->state.state, target);
+
+    RCU_READ_UNLOCK;
+    return 0;
+}
+
+glusterd_sm_t glusterd_state_default[] = {  // from DEFAULT; row index = event type
+    {GD_FRIEND_STATE_DEFAULT, glusterd_ac_none},  // EVENT_NONE
+    {GD_FRIEND_STATE_DEFAULT, glusterd_ac_friend_probe},  // EVENT_PROBE
+    {GD_FRIEND_STATE_REQ_SENT, glusterd_ac_friend_add},   // EVENT_INIT_FRIEND_REQ
+    {GD_FRIEND_STATE_DEFAULT, glusterd_ac_none},          // EVENT_RCVD_ACC
+    {GD_FRIEND_STATE_DEFAULT, glusterd_ac_none},  // EVENT_LOCAL_ACC
+    {GD_FRIEND_STATE_DEFAULT, glusterd_ac_none},  // EVENT_RCVD_RJT
+    {GD_FRIEND_STATE_DEFAULT, glusterd_ac_none},  // EVENT_LOCAL_RJT
+    {GD_FRIEND_STATE_REQ_RCVD,
+     glusterd_ac_handle_friend_add_req},  // EVENT_RCVD_FRIEND_REQ
+    {GD_FRIEND_STATE_DEFAULT,
+     glusterd_ac_send_friend_remove_req},         // EVENT_INIT_REMOVE_FRIEND
+    {GD_FRIEND_STATE_DEFAULT, glusterd_ac_none},  // EVENT_RCVD_REMOVE_FRIEND
+    {GD_FRIEND_STATE_DEFAULT,
+     glusterd_ac_friend_remove},                          // EVENT_REMOVE_FRIEND
+    {GD_FRIEND_STATE_DEFAULT, glusterd_ac_friend_probe},  // EVENT_CONNECTED
+    {GD_FRIEND_STATE_DEFAULT, glusterd_ac_none},          // EVENT_NEW_NAME
+    {GD_FRIEND_STATE_DEFAULT, glusterd_ac_none},          // EVENT_MAX
+};
+
+glusterd_sm_t glusterd_state_probe_rcvd[] = {  // from PROBE_RCVD; row index = event type
+    {GD_FRIEND_STATE_PROBE_RCVD, glusterd_ac_none},  // EVENT_NONE
+    {GD_FRIEND_STATE_PROBE_RCVD, glusterd_ac_none},  // EVENT_PROBE
+    {GD_FRIEND_STATE_PROBE_RCVD, glusterd_ac_none},  // EVENT_INIT_FRIEND_REQ
+    {GD_FRIEND_STATE_PROBE_RCVD, glusterd_ac_none},  // EVENT_RCVD_ACC
+    {GD_FRIEND_STATE_PROBE_RCVD, glusterd_ac_none},  // EVENT_LOCAL_ACC
+    {GD_FRIEND_STATE_PROBE_RCVD, glusterd_ac_none},  // EVENT_RCVD_RJT
+    {GD_FRIEND_STATE_PROBE_RCVD, glusterd_ac_none},  // EVENT_LOCAL_RJT
+    {GD_FRIEND_STATE_REQ_RCVD,
+     glusterd_ac_handle_friend_add_req},  // EVENT_RCVD_FRIEND_REQ
+    {GD_FRIEND_STATE_DEFAULT,
+     glusterd_ac_send_friend_remove_req},         // EVENT_INIT_REMOVE_FRIEND
+    {GD_FRIEND_STATE_DEFAULT, glusterd_ac_none},  // EVENT_RCVD_REMOVE_FRIEND
+    {GD_FRIEND_STATE_DEFAULT,
+     glusterd_ac_friend_remove},                         // EVENT_REMOVE_FRIEND
+    {GD_FRIEND_STATE_CONNECTED_RCVD, glusterd_ac_none},  // EVENT_CONNECTED
+    {GD_FRIEND_STATE_DEFAULT, glusterd_ac_none},         // EVENT_NEW_NAME
+    {GD_FRIEND_STATE_DEFAULT, glusterd_ac_none},         // EVENT_MAX
+};
+
+glusterd_sm_t glusterd_state_connected_rcvd[] = {  // from CONNECTED_RCVD; row index = event type
+    {GD_FRIEND_STATE_CONNECTED_RCVD, glusterd_ac_none},  // EVENT_NONE
+    {GD_FRIEND_STATE_CONNECTED_RCVD, glusterd_ac_none},  // EVENT_PROBE
+    {GD_FRIEND_STATE_CONNECTED_RCVD, glusterd_ac_none},  // EVENT_INIT_FRIEND_REQ
+    {GD_FRIEND_STATE_CONNECTED_RCVD, glusterd_ac_none},  // EVENT_RCVD_ACC
+    {GD_FRIEND_STATE_CONNECTED_ACCEPTED,
+     glusterd_ac_reverse_probe_begin},                   // EVENT_LOCAL_ACC
+    {GD_FRIEND_STATE_CONNECTED_RCVD, glusterd_ac_none},  // EVENT_RCVD_RJT
+    {GD_FRIEND_STATE_REJECTED, glusterd_ac_none},        // EVENT_LOCAL_RJT
+    {GD_FRIEND_STATE_CONNECTED_RCVD,
+     glusterd_ac_handle_friend_add_req},  // EVENT_RCVD_FRIEND_REQ
+    {GD_FRIEND_STATE_DEFAULT,
+     glusterd_ac_send_friend_remove_req},         // EVENT_INIT_REMOVE_FRIEND
+    {GD_FRIEND_STATE_DEFAULT, glusterd_ac_none},  // EVENT_RCVD_REMOVE_FRIEND
+    {GD_FRIEND_STATE_DEFAULT,
+     glusterd_ac_friend_remove},                         // EVENT_REMOVE_FRIEND
+    {GD_FRIEND_STATE_CONNECTED_RCVD, glusterd_ac_none},  // EVENT_CONNECTED
+    {GD_FRIEND_STATE_CONNECTED_RCVD, glusterd_ac_none},  // EVENT_NEW_NAME
+    {GD_FRIEND_STATE_CONNECTED_RCVD, glusterd_ac_none},  // EVENT_MAX
+};
+
+glusterd_sm_t glusterd_state_connected_accepted[] = {  // from CONNECTED_ACCEPTED; row index = event type
+    {GD_FRIEND_STATE_CONNECTED_ACCEPTED, glusterd_ac_none},  // EVENT_NONE
+    {GD_FRIEND_STATE_CONNECTED_ACCEPTED, glusterd_ac_friend_probe},  // EVENT_PROBE
+    {GD_FRIEND_STATE_REQ_SENT_RCVD,
+     glusterd_ac_friend_add},  // EVENT_INIT_FRIEND_REQ
+    {GD_FRIEND_STATE_CONNECTED_ACCEPTED, glusterd_ac_none},  // EVENT_RCVD_ACC
+    {GD_FRIEND_STATE_CONNECTED_ACCEPTED,
+     glusterd_ac_none},  // EVENT_LOCAL_ACC
+    {GD_FRIEND_STATE_CONNECTED_ACCEPTED, glusterd_ac_none},  // EVENT_RCVD_RJT
+    {GD_FRIEND_STATE_CONNECTED_ACCEPTED,
+     glusterd_ac_none},  // EVENT_LOCAL_RJT
+    {GD_FRIEND_STATE_CONNECTED_ACCEPTED,
+     glusterd_ac_none},  // EVENT_RCVD_FRIEND_REQ
+    {GD_FRIEND_STATE_DEFAULT,
+     glusterd_ac_send_friend_remove_req},         // EVENT_INIT_REMOVE_FRIEND
+    {GD_FRIEND_STATE_DEFAULT, glusterd_ac_none},  // EVENT_RCVD_REMOVE_FRIEND
+    {GD_FRIEND_STATE_DEFAULT,
+     glusterd_ac_friend_remove},  // EVENT_REMOVE_FRIEND
+    {GD_FRIEND_STATE_CONNECTED_ACCEPTED, glusterd_ac_none},  // EVENT_CONNECTED
+    {GD_FRIEND_STATE_CONNECTED_ACCEPTED, glusterd_ac_none},  // EVENT_NEW_NAME
+    {GD_FRIEND_STATE_CONNECTED_ACCEPTED, glusterd_ac_none},  // EVENT_MAX
+};
+
+glusterd_sm_t glusterd_state_req_sent[] = {  // from REQ_SENT; row index = event type
+    {GD_FRIEND_STATE_REQ_SENT, glusterd_ac_none},      // EVENT_NONE
+    {GD_FRIEND_STATE_REQ_SENT, glusterd_ac_none},      // EVENT_PROBE
+    {GD_FRIEND_STATE_REQ_SENT, glusterd_ac_none},      // EVENT_INIT_FRIEND_REQ
+    {GD_FRIEND_STATE_REQ_ACCEPTED, glusterd_ac_none},  // EVENT_RCVD_ACC
+    {GD_FRIEND_STATE_REQ_SENT, glusterd_ac_none},      // EVENT_LOCAL_ACC
+    {GD_FRIEND_STATE_REJECTED, glusterd_ac_none},      // EVENT_RCVD_RJT
+    {GD_FRIEND_STATE_REQ_SENT, glusterd_ac_none},      // EVENT_LOCAL_RJT
+    {GD_FRIEND_STATE_REQ_SENT_RCVD,
+     glusterd_ac_handle_friend_add_req},  // EVENT_RCVD_FRIEND_REQ
+    {GD_FRIEND_STATE_UNFRIEND_SENT,
+     glusterd_ac_send_friend_remove_req},          // EVENT_INIT_REMOVE_FRIEND
+    {GD_FRIEND_STATE_REQ_SENT, glusterd_ac_none},  // EVENT_RCVD_REMOVE_FRIEND
+    {GD_FRIEND_STATE_DEFAULT,
+     glusterd_ac_friend_remove},                   // EVENT_REMOVE_FRIEND
+    {GD_FRIEND_STATE_REQ_SENT, glusterd_ac_none},  // EVENT_CONNECTED
+    {GD_FRIEND_STATE_REQ_SENT, glusterd_ac_none},  // EVENT_NEW_NAME
+    {GD_FRIEND_STATE_REQ_SENT, glusterd_ac_none},  // EVENT_MAX
+};
+
+glusterd_sm_t glusterd_state_req_rcvd[] = {  // from REQ_RCVD; row index = event type
+    {GD_FRIEND_STATE_REQ_RCVD, glusterd_ac_none},  // EVENT_NONE
+    {GD_FRIEND_STATE_REQ_RCVD, glusterd_ac_none},  // EVENT_PROBE
+    {GD_FRIEND_STATE_REQ_SENT_RCVD,
+     glusterd_ac_none},                                // EVENT_INIT_FRIEND_REQ
+    {GD_FRIEND_STATE_REQ_RCVD, glusterd_ac_none},      // EVENT_RCVD_ACC
+    {GD_FRIEND_STATE_REQ_ACCEPTED, glusterd_ac_none},  // EVENT_LOCAL_ACC
+    {GD_FRIEND_STATE_REQ_RCVD, glusterd_ac_none},      // EVENT_RCVD_RJT
+    {GD_FRIEND_STATE_REJECTED, glusterd_ac_none},      // EVENT_LOCAL_RJT
+    {GD_FRIEND_STATE_REQ_RCVD, glusterd_ac_none},      // EVENT_RCVD_FRIEND_REQ
+    {GD_FRIEND_STATE_DEFAULT,
+     glusterd_ac_send_friend_remove_req},  // EVENT_INIT_REMOVE_FRIEND
+    {GD_FRIEND_STATE_DEFAULT,
+     glusterd_ac_handle_friend_remove_req},  // EVENT_RCVD_REMOVE_FRIEND
+    {GD_FRIEND_STATE_DEFAULT,
+     glusterd_ac_friend_remove},                         // EVENT_REMOVE_FRIEND
+    {GD_FRIEND_STATE_CONNECTED_RCVD, glusterd_ac_none},  // EVENT_CONNECTED
+    {GD_FRIEND_STATE_CONNECTED_RCVD, glusterd_ac_none},  // EVENT_NEW_NAME
+    {GD_FRIEND_STATE_REQ_RCVD, glusterd_ac_none},        // EVENT_MAX
+};
+
+glusterd_sm_t glusterd_state_befriended[] = {  // from BEFRIENDED; row index = event type
+    {GD_FRIEND_STATE_BEFRIENDED, glusterd_ac_none},  // EVENT_NONE
+    {GD_FRIEND_STATE_BEFRIENDED, glusterd_ac_none},  // EVENT_PROBE
+    {GD_FRIEND_STATE_BEFRIENDED, glusterd_ac_none},  // EVENT_INIT_FRIEND_REQ
+    {GD_FRIEND_STATE_BEFRIENDED, glusterd_ac_update_friend},  // EVENT_RCVD_ACC
+    {GD_FRIEND_STATE_BEFRIENDED,
+     glusterd_ac_update_friend},                   // EVENT_LOCAL_ACC
+    {GD_FRIEND_STATE_REJECTED, glusterd_ac_none},  // EVENT_RCVD_RJT
+    {GD_FRIEND_STATE_REJECTED, glusterd_ac_none},  // EVENT_LOCAL_RJT
+    {GD_FRIEND_STATE_BEFRIENDED,
+     glusterd_ac_handle_friend_add_req},  // EVENT_RCVD_FRIEND_REQ
+    {GD_FRIEND_STATE_UNFRIEND_SENT,
+     glusterd_ac_send_friend_remove_req},  // EVENT_INIT_REMOVE_FRIEND
+    {GD_FRIEND_STATE_DEFAULT,
+     glusterd_ac_handle_friend_remove_req},  // EVENT_RCVD_REMOVE_FRIEND
+    {GD_FRIEND_STATE_DEFAULT,
+     glusterd_ac_friend_remove},  // EVENT_REMOVE_FRIEND
+    {GD_FRIEND_STATE_BEFRIENDED, glusterd_ac_friend_add},  // EVENT_CONNECTED
+    {GD_FRIEND_STATE_BEFRIENDED,
+     glusterd_ac_send_friend_update},                // EVENT_NEW_NAME
+    {GD_FRIEND_STATE_BEFRIENDED, glusterd_ac_none},  // EVENT_MAX
+};
+
+glusterd_sm_t glusterd_state_req_sent_rcvd[] = {  // from REQ_SENT_RCVD; row index = event type
+    {GD_FRIEND_STATE_REQ_SENT_RCVD, glusterd_ac_none},  // EVENT_NONE
+    {GD_FRIEND_STATE_REQ_SENT_RCVD, glusterd_ac_none},  // EVENT_PROBE
+    {GD_FRIEND_STATE_REQ_SENT_RCVD,
+     glusterd_ac_none},  // EVENT_INIT_FRIEND_REQ
+    {GD_FRIEND_STATE_BEFRIENDED,
+     glusterd_ac_send_friend_update},                   // EVENT_RCVD_ACC
+    {GD_FRIEND_STATE_REQ_SENT_RCVD, glusterd_ac_none},  // EVENT_LOCAL_ACC
+    {GD_FRIEND_STATE_REJECTED, glusterd_ac_none},       // EVENT_RCVD_RJT
+    {GD_FRIEND_STATE_REQ_SENT_RCVD, glusterd_ac_none},  // EVENT_LOCAL_RJT
+    {GD_FRIEND_STATE_REQ_SENT_RCVD, glusterd_ac_none},  // EVENT_RCVD_FRIEND_REQ
+    {GD_FRIEND_STATE_UNFRIEND_SENT,
+     glusterd_ac_send_friend_remove_req},  // EVENT_INIT_REMOVE_FRIEND
+    {GD_FRIEND_STATE_DEFAULT,
+     glusterd_ac_handle_friend_remove_req},  // EVENT_RCVD_REMOVE_FRIEND
+    {GD_FRIEND_STATE_DEFAULT,
+     glusterd_ac_friend_remove},                        // EVENT_REMOVE_FRIEND
+    {GD_FRIEND_STATE_REQ_SENT_RCVD, glusterd_ac_none},  // EVENT_CONNECTED
+    {GD_FRIEND_STATE_REQ_SENT_RCVD, glusterd_ac_none},  // EVENT_NEW_NAME
+    {GD_FRIEND_STATE_REQ_SENT_RCVD, glusterd_ac_none},  // EVENT_MAX
+};
+
+glusterd_sm_t glusterd_state_rejected[] = {  // from REJECTED; row index = event type
+    {GD_FRIEND_STATE_REJECTED, glusterd_ac_none},          // EVENT_NONE
+    {GD_FRIEND_STATE_REJECTED, glusterd_ac_friend_probe},  // EVENT_PROBE
+    {GD_FRIEND_STATE_REQ_SENT,
+     glusterd_ac_friend_add},                        // EVENT_INIT_FRIEND_REQ
+    {GD_FRIEND_STATE_BEFRIENDED, glusterd_ac_none},  // EVENT_RCVD_ACC
+    {GD_FRIEND_STATE_BEFRIENDED, glusterd_ac_none},  // EVENT_LOCAL_ACC
+    {GD_FRIEND_STATE_REJECTED, glusterd_ac_none},    // EVENT_RCVD_RJT
+    {GD_FRIEND_STATE_REJECTED, glusterd_ac_none},    // EVENT_LOCAL_RJT
+    {GD_FRIEND_STATE_REQ_RCVD,
+     glusterd_ac_handle_friend_add_req},  // EVENT_RCVD_FRIEND_REQ
+    {GD_FRIEND_STATE_DEFAULT,
+     glusterd_ac_send_friend_remove_req},  // EVENT_INIT_REMOVE_FRIEND
+    {GD_FRIEND_STATE_DEFAULT,
+     glusterd_ac_handle_friend_remove_req},  // EVENT_RCVD_REMOVE_FRIEND
+    {GD_FRIEND_STATE_DEFAULT,
+     glusterd_ac_friend_remove},                         // EVENT_REMOVE_FRIEND
+    {GD_FRIEND_STATE_REJECTED, glusterd_ac_friend_add},  // EVENT_CONNECTED
+    {GD_FRIEND_STATE_REJECTED, glusterd_ac_none},        // EVENT_NEW_NAME
+    {GD_FRIEND_STATE_REQ_RCVD, glusterd_ac_none},        // EVENT_MAX
+};
+
+glusterd_sm_t glusterd_state_req_accepted[] = {  // from REQ_ACCEPTED; row index = event type
+    {GD_FRIEND_STATE_REQ_ACCEPTED, glusterd_ac_none},  // EVENT_NONE
+    {GD_FRIEND_STATE_REQ_ACCEPTED, glusterd_ac_none},  // EVENT_PROBE
+    {GD_FRIEND_STATE_REQ_ACCEPTED, glusterd_ac_none},  // EVENT_INIT_FRIEND_REQ
+    {GD_FRIEND_STATE_BEFRIENDED,
+     glusterd_ac_send_friend_update},  // EVENT_RCVD_ACC
+    {GD_FRIEND_STATE_BEFRIENDED,
+     glusterd_ac_send_friend_update},              // EVENT_LOCAL_ACC
+    {GD_FRIEND_STATE_REJECTED, glusterd_ac_none},  // EVENT_RCVD_RJT
+    {GD_FRIEND_STATE_REJECTED, glusterd_ac_none},  // EVENT_LOCAL_RJT
+    {GD_FRIEND_STATE_REQ_ACCEPTED,
+     glusterd_ac_handle_friend_add_req},  // EVENT_RCVD_FRIEND_REQ
+    {GD_FRIEND_STATE_REQ_ACCEPTED,
+     glusterd_ac_send_friend_remove_req},  // EVENT_INIT_REMOVE_FRIEND
+    {GD_FRIEND_STATE_DEFAULT,
+     glusterd_ac_handle_friend_remove_req},  // EVENT_RCVD_REMOVE_FRIEND
+    {GD_FRIEND_STATE_DEFAULT,
+     glusterd_ac_friend_remove},  // EVENT_REMOVE_FRIEND
+    {GD_FRIEND_STATE_CONNECTED_ACCEPTED,
+     glusterd_ac_reverse_probe_begin},                 // EVENT_CONNECTED
+    {GD_FRIEND_STATE_REQ_ACCEPTED, glusterd_ac_none},  // EVENT_NEW_NAME
+    {GD_FRIEND_STATE_REQ_SENT, glusterd_ac_none},      // EVENT_MAX
+};
+
+glusterd_sm_t glusterd_state_unfriend_sent[] = {  // from UNFRIEND_SENT; row index = event type
+    {GD_FRIEND_STATE_UNFRIEND_SENT, glusterd_ac_none},   // EVENT_NONE
+    {GD_FRIEND_STATE_UNFRIEND_SENT, glusterd_ac_error},  // EVENT_PROBE
+    {GD_FRIEND_STATE_UNFRIEND_SENT,
+     glusterd_ac_none},  // EVENT_INIT_FRIEND_REQ
+    {GD_FRIEND_STATE_UNFRIEND_SENT, glusterd_ac_none},   // EVENT_RCVD_ACC
+    {GD_FRIEND_STATE_UNFRIEND_SENT, glusterd_ac_none},   // EVENT_LOCAL_ACC
+    {GD_FRIEND_STATE_UNFRIEND_SENT, glusterd_ac_error},  // EVENT_RCVD_RJT
+    {GD_FRIEND_STATE_UNFRIEND_SENT, glusterd_ac_error},  // EVENT_LOCAL_RJT
+    {GD_FRIEND_STATE_UNFRIEND_SENT, glusterd_ac_error},  // EVENT_RCVD_FRIEND_REQ
+    {GD_FRIEND_STATE_UNFRIEND_SENT,
+     glusterd_ac_none},  // EVENT_INIT_REMOVE_FRIEND
+    {GD_FRIEND_STATE_UNFRIEND_SENT,
+     glusterd_ac_none},  // EVENT_RCVD_REMOVE_FRIEND
+    {GD_FRIEND_STATE_DEFAULT,
+     glusterd_ac_friend_remove},                        // EVENT_REMOVE_FRIEND
+    {GD_FRIEND_STATE_UNFRIEND_SENT, glusterd_ac_none},  // EVENT_CONNECTED
+    {GD_FRIEND_STATE_UNFRIEND_SENT, glusterd_ac_none},  // EVENT_NEW_NAME
+    {GD_FRIEND_STATE_UNFRIEND_SENT, glusterd_ac_none},  // EVENT_MAX
+};
+
+glusterd_sm_t *glusterd_friend_state_table[] = {  // indexed by peer state
+    glusterd_state_default,           glusterd_state_req_sent,
+    glusterd_state_req_rcvd,          glusterd_state_befriended,
+    glusterd_state_req_accepted,      glusterd_state_req_sent_rcvd,
+    glusterd_state_rejected,          glusterd_state_unfriend_sent,
+    glusterd_state_probe_rcvd,        glusterd_state_connected_rcvd,
+    glusterd_state_connected_accepted};  // order follows glusterd_friend_sm_state_t
+
+int
+glusterd_friend_sm_new_event(glusterd_friend_sm_event_type_t event_type,
+                             glusterd_friend_sm_event_t **new_event)
+{
+    /* Allocate a zeroed event of the given type with an initialised list
+     * node and hand it back via *new_event. Returns 0, or -1 on alloc failure. */
+    glusterd_friend_sm_event_t *ev = NULL;
+
+    GF_ASSERT(new_event);
+    GF_ASSERT(GD_FRIEND_EVENT_NONE <= event_type &&
+              GD_FRIEND_EVENT_MAX > event_type);
+
+    ev = GF_CALLOC(1, sizeof(*ev), gf_gld_mt_friend_sm_event_t);
+    if (ev == NULL)
+        return -1;
+
+    CDS_INIT_LIST_HEAD(&ev->list);
+    ev->event = event_type;
+    *new_event = ev;
+    return 0;
+}
+
+int
+glusterd_friend_sm_inject_event(glusterd_friend_sm_event_t *event)
+{
+    GF_ASSERT(event);
+    gf_msg_debug("glusterd", 0, "Enqueue event: '%s'",
+                 glusterd_friend_sm_event_name_get(event->event));
+    cds_list_add_tail(&event->list, &gd_friend_sm_queue);
+    /* Only enqueues; glusterd_friend_sm() dequeues the event. Always returns 0. */
+    return 0;
+}
+
+void
+glusterd_destroy_friend_event_context(glusterd_friend_sm_event_t *event)
+{
+    /* Release event->ctx with the destructor matching the event type; other
+     * event types are left untouched, and the event itself is never freed. */
+    glusterd_friend_sm_event_type_t type = GD_FRIEND_EVENT_NONE;
+
+    if (event == NULL)
+        return;
+    type = event->event;
+
+    if (type == GD_FRIEND_EVENT_RCVD_FRIEND_REQ ||
+        type == GD_FRIEND_EVENT_RCVD_REMOVE_FRIEND) {
+        glusterd_destroy_friend_req_ctx(event->ctx);
+    } else if (type == GD_FRIEND_EVENT_LOCAL_ACC ||
+               type == GD_FRIEND_EVENT_LOCAL_RJT ||
+               type == GD_FRIEND_EVENT_RCVD_ACC ||
+               type == GD_FRIEND_EVENT_RCVD_RJT) {
+        glusterd_destroy_friend_update_ctx(event->ctx);
+    }
+}
+
+gf_boolean_t
+gd_does_peer_affect_quorum(glusterd_friend_sm_state_t old_state,
+                           glusterd_friend_sm_event_type_t event_type,
+                           glusterd_peerinfo_t *peerinfo)
+{
+    /* Returns _gf_true only for a connected peer whose current state is
+     * BEFRIENDED, subject to the restart rule below. */
+    gf_boolean_t is_acc_event = (event_type == GD_FRIEND_EVENT_RCVD_ACC) ||
+                                (event_type == GD_FRIEND_EVENT_LOCAL_ACC);
+
+    // When glusterd comes up with friends in BEFRIENDED state in store,
+    // wait until compare-data happens.
+    if ((old_state == GD_FRIEND_STATE_BEFRIENDED) && !is_acc_event)
+        return _gf_false;
+
+    if (peerinfo->state.state != GD_FRIEND_STATE_BEFRIENDED)
+        return _gf_false;
+
+    return peerinfo->connected ? _gf_true : _gf_false;
+}
+
+int
+glusterd_friend_sm()
+{
+    /* Drain gd_friend_sm_queue: per event, run the handler selected by the
+     * peer's state and the event type, apply the transition and store the
+     * peerinfo. Returns 0, or -1 if the transition or peer re-lookup fails. */
+    glusterd_friend_sm_event_t *event = NULL;
+    glusterd_friend_sm_event_t *tmp = NULL;
+    int ret = -1;
+    glusterd_friend_sm_ac_fn handler = NULL;
+    glusterd_sm_t *state = NULL;
+    glusterd_peerinfo_t *peerinfo = NULL;
+    glusterd_friend_sm_event_type_t event_type = 0;
+    gf_boolean_t is_await_conn = _gf_false;
+    gf_boolean_t quorum_action = _gf_false;
+    glusterd_friend_sm_state_t old_state = GD_FRIEND_STATE_DEFAULT;
+    xlator_t *this = NULL;
+    glusterd_conf_t *priv = NULL;
+
+    this = THIS;
+    GF_ASSERT(this);
+    priv = this->private;
+    GF_ASSERT(priv);
+
+    while (!cds_list_empty(&gd_friend_sm_queue)) {
+        cds_list_for_each_entry_safe(event, tmp, &gd_friend_sm_queue, list)
+        {
+            cds_list_del_init(&event->list);
+            event_type = event->event;
+
+            RCU_READ_LOCK;
+            peerinfo = glusterd_peerinfo_find(event->peerid, event->peername);
+            if (!peerinfo) {
+                RCU_READ_UNLOCK;
+                gf_msg("glusterd", GF_LOG_CRITICAL, 0, GD_MSG_PEER_NOT_FOUND,
+                       "Received event %s with empty peer info",
+                       glusterd_friend_sm_event_name_get(event_type));
+                /* NOTE(review): event->ctx is not destroyed here - confirm. */
+                GF_FREE(event);
+                continue;
+            }
+            old_state = peerinfo->state.state;
+            RCU_READ_UNLOCK;
+            gf_msg_debug("glusterd", 0, "Dequeued event of type: '%s'",
+                         glusterd_friend_sm_event_name_get(event_type));
+
+            /* Giving up read-critical section here as we only need
+             * the current state to call the handler.
+             *
+             * We cannot continue into the handler in a read
+             * critical section as there are handlers who do
+             * updates, and could cause deadlocks.
+             */
+            state = glusterd_friend_state_table[old_state];
+            GF_ASSERT(state);
+
+            handler = state[event_type].handler;
+            GF_ASSERT(handler);
+
+            ret = handler(event, event->ctx);
+            if (ret == GLUSTERD_CONNECTION_AWAITED) {
+                is_await_conn = _gf_true;
+                ret = 0;
+            }
+
+            if (ret) {
+                gf_msg("glusterd", GF_LOG_ERROR, 0, GD_MSG_HANDLER_RETURNED,
+                       "handler returned: %d", ret);
+                glusterd_destroy_friend_event_context(event);
+                GF_FREE(event);
+                continue;
+            }
+
+            if ((GD_FRIEND_EVENT_REMOVE_FRIEND == event_type) ||
+                (GD_FRIEND_EVENT_INIT_REMOVE_FRIEND == event_type)) {
+                glusterd_destroy_friend_event_context(event);
+                GF_FREE(event);
+                continue;
+            }
+
+            ret = glusterd_friend_sm_transition_state(
+                event->peerid, event->peername, state, event_type);
+            if (ret) {
+                gf_msg("glusterd", GF_LOG_ERROR, 0,
+                       GD_MSG_EVENT_STATE_TRANSITION_FAIL,
+                       "Unable to transition"
+                       " state from '%s' to '%s' for event '%s'",
+                       glusterd_friend_sm_state_name_get(old_state),
+                       glusterd_friend_sm_state_name_get(
+                           state[event_type].next_state),
+                       glusterd_friend_sm_event_name_get(event_type));
+                /* Already unlinked from the queue: release it or it leaks. */
+                glusterd_destroy_friend_event_context(event);
+                GF_FREE(event);
+                goto out;
+            }
+
+            /* We need to obtain peerinfo reference once again as we
+             * had exited the read critical section above.
+             */
+            RCU_READ_LOCK;
+            peerinfo = glusterd_peerinfo_find(event->peerid, event->peername);
+            if (!peerinfo) {
+                RCU_READ_UNLOCK;
+                /* A peer can only be deleted as an effect of
+                 * this state machine, and two such state
+                 * machines can never run at the same time.
+                 * So if we cannot find the peerinfo here,
+                 * something has gone terribly wrong.
+                 */
+                ret = -1;
+                gf_msg("glusterd", GF_LOG_ERROR, 0, GD_MSG_PEER_NOT_FOUND,
+                       "Cannot find peer %s(%s)", event->peername,
+                       uuid_utoa(event->peerid));
+                /* Release the unlinked event only after logging from it. */
+                glusterd_destroy_friend_event_context(event);
+                GF_FREE(event);
+                goto out;
+            }
+            if (gd_does_peer_affect_quorum(old_state, event_type, peerinfo)) {
+                peerinfo->quorum_contrib = QUORUM_UP;
+                if (peerinfo->quorum_action) {
+                    peerinfo->quorum_action = _gf_false;
+                    quorum_action = _gf_true;
+                }
+            }
+
+            ret = glusterd_store_peerinfo(peerinfo);
+            RCU_READ_UNLOCK;
+            if (ret) {
+                gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_PEERINFO_CREATE_FAIL,
+                       "Failed to store peerinfo");
+            }
+
+            glusterd_destroy_friend_event_context(event);
+            GF_FREE(event);
+            if (is_await_conn)
+                break;
+        }
+        if (is_await_conn)
+            break;
+    }
+
+    ret = 0;
+out:
+    if (quorum_action) {
+        /* When glusterd is restarted, it needs to wait until the 'friends' view
+         * of the volumes settle, before it starts any of the internal daemons.
+         *
+         * Every friend that was part of the cluster, would send its
+         * cluster-view, 'our' way. For every friend, who belongs to
+         * a partition which has a different cluster-view from our
+         * partition, we may update our cluster-view. For subsequent
+         * friends from that partition would agree with us, if the first
+         * friend wasn't rejected. For every first friend, whom we agreed with,
+         * we would need to start internal daemons/bricks belonging to the
+         * new volumes.
+         * glusterd_spawn_daemons calls functions that are idempotent. ie,
+         * the functions spawn process(es) only if they are not started yet.
+         */
+        synclock_unlock(&priv->big_lock);
+        glusterd_launch_synctask(glusterd_spawn_daemons, NULL);
+        synclock_lock(&priv->big_lock);
+        glusterd_do_quorum_action();
+    }
+    return ret;
+}
+
+int
+glusterd_friend_sm_init()
+{
+    CDS_INIT_LIST_HEAD(&gd_friend_sm_queue); /* event queue starts out empty */
+    return 0; /* always succeeds */
+}
diff --git a/xlators/mgmt/glusterd/src/glusterd-sm.h b/xlators/mgmt/glusterd/src/glusterd-sm.h
new file mode 100644
index 00000000000..11cbd85b3e3
--- /dev/null
+++ b/xlators/mgmt/glusterd/src/glusterd-sm.h
@@ -0,0 +1,216 @@
+/*
+   Copyright (c) 2006-2012 Red Hat, Inc. <http://www.redhat.com>
+   This file is part of GlusterFS.
+
+   This file is licensed to you under your choice of the GNU Lesser
+   General Public License, version 3 or any later version (LGPLv3 or
+   later), or the GNU General Public License, version 2 (GPLv2), in all
+   cases as published by the Free Software Foundation.
+*/
+#ifndef _GLUSTERD_SM_H_
+#define _GLUSTERD_SM_H_
+
+#include <pthread.h>
+#include <glusterfs/compat-uuid.h>
+
+#include "rpc-clnt.h"
+#include <glusterfs/glusterfs.h>
+#include <glusterfs/call-stub.h>
+#include <glusterfs/byte-order.h>
+#include "rpcsvc.h"
+#include <glusterfs/store.h>
+
+#include "glusterd-rcu.h"
+
+typedef enum gd_quorum_contribution_ { /* type of peerinfo->quorum_contrib */
+    QUORUM_NONE,
+    QUORUM_WAITING,
+    QUORUM_DOWN,
+    QUORUM_UP /* set by glusterd_friend_sm() for a connected BEFRIENDED peer */
+} gd_quorum_contrib_t;
+
+typedef enum glusterd_friend_sm_state_ {
+    GD_FRIEND_STATE_DEFAULT = 0, /* values index glusterd_friend_state_table[] */
+    GD_FRIEND_STATE_REQ_SENT,
+    GD_FRIEND_STATE_REQ_RCVD,
+    GD_FRIEND_STATE_BEFRIENDED,
+    GD_FRIEND_STATE_REQ_ACCEPTED,
+    GD_FRIEND_STATE_REQ_SENT_RCVD,
+    GD_FRIEND_STATE_REJECTED,
+    GD_FRIEND_STATE_UNFRIEND_SENT,
+    GD_FRIEND_STATE_PROBE_RCVD,
+    GD_FRIEND_STATE_CONNECTED_RCVD,
+    GD_FRIEND_STATE_CONNECTED_ACCEPTED,
+    GD_FRIEND_STATE_MAX /* number of states; has no table of its own */
+} glusterd_friend_sm_state_t;
+
+typedef struct glusterd_peer_state_info_ {
+    glusterd_friend_sm_state_t state; /* current friend state-machine state */
+    struct timeval transition_time; /* presumably time of last transition - confirm */
+} glusterd_peer_state_info_t;
+
+typedef struct glusterd_peer_hostname_ { /* one hostname entry with its list node */
+    char *hostname;
+    struct cds_list_head hostname_list; /* presumably on peerinfo->hostnames - confirm */
+} glusterd_peer_hostname_t;
+
+typedef struct glusterd_sm_transition_ { /* element of glusterd_sm_tr_log_t.transitions */
+    int old_state;
+    int event;
+    int new_state;
+    time_t time; /* presumably when the transition was recorded - confirm */
+} glusterd_sm_transition_t;
+
+typedef struct glusterd_sm_tr_log_ { /* type of peerinfo->sm_log */
+    glusterd_sm_transition_t *transitions;
+    size_t current; /* NOTE(review): index semantics are not visible here */
+    size_t size;
+    size_t count;
+    char *(*state_name_get)(int); /* callback: int state code -> string */
+    char *(*event_name_get)(int); /* callback: int event code -> string */
+} glusterd_sm_tr_log_t;
+
+struct glusterd_peerinfo_ {
+    uuid_t uuid;
+    char uuid_str[50]; /* Retrieve this using
+                        * gd_peer_uuid_str ()
+                        */
+    glusterd_peer_state_info_t state; /* state.state is set on each transition */
+    char *hostname;
+    struct cds_list_head hostnames;
+    int port;
+    struct cds_list_head uuid_list;
+    struct cds_list_head op_peers_list;
+    struct rpc_clnt *rpc;
+    rpc_clnt_prog_t *mgmt;
+    rpc_clnt_prog_t *peer;
+    rpc_clnt_prog_t *mgmt_v3;
+    int connected; /* must be non-zero for the peer to affect quorum */
+    gf_store_handle_t *shandle;
+    glusterd_sm_tr_log_t sm_log; /* receives every friend-sm transition */
+    gf_boolean_t quorum_action; /* reset by glusterd_friend_sm() once consumed */
+    gd_quorum_contrib_t quorum_contrib; /* glusterd_friend_sm() sets QUORUM_UP */
+    gf_boolean_t locked;
+    gf_boolean_t detaching;
+    /* Members required for proper cleanup using RCU */
+    gd_rcu_head rcu_head;
+    pthread_mutex_t delete_lock;
+    uint32_t generation;
+};
+
+typedef struct glusterd_peerinfo_ glusterd_peerinfo_t;
+
+typedef struct glusterd_local_peers_ { /* pairs a peerinfo pointer with a list node */
+    glusterd_peerinfo_t *peerinfo;
+    struct cds_list_head op_peers_list; /* NOTE(review): owning list not visible here */
+} glusterd_local_peers_t;
+
+typedef enum glusterd_ev_gen_mode_ { /* type of glusterd_peerctx_args_t.mode */
+    GD_MODE_OFF,
+    GD_MODE_ON,
+    GD_MODE_SWITCH_ON
+} glusterd_ev_gen_mode_t;
+
+typedef struct glusterd_peer_ctx_args_ { /* embedded as glusterd_peerctx_t.args */
+    rpcsvc_request_t *req;
+    glusterd_ev_gen_mode_t mode;
+    dict_t *dict;
+} glusterd_peerctx_args_t;
+
+typedef struct glusterd_peer_ctx_ { /* peer identity (peerid/peername) plus args */
+    glusterd_peerctx_args_t args;
+    uuid_t peerid;
+    char *peername;
+    uint32_t peerinfo_gen; /* presumably peerinfo->generation - TODO confirm */
+    char *errstr;
+} glusterd_peerctx_t;
+
+typedef enum glusterd_friend_sm_event_type_ {
+    GD_FRIEND_EVENT_NONE = 0, /* values index the rows of each state table */
+    GD_FRIEND_EVENT_PROBE,
+    GD_FRIEND_EVENT_INIT_FRIEND_REQ,
+    GD_FRIEND_EVENT_RCVD_ACC,
+    GD_FRIEND_EVENT_LOCAL_ACC,
+    GD_FRIEND_EVENT_RCVD_RJT,
+    GD_FRIEND_EVENT_LOCAL_RJT,
+    GD_FRIEND_EVENT_RCVD_FRIEND_REQ,
+    GD_FRIEND_EVENT_INIT_REMOVE_FRIEND,
+    GD_FRIEND_EVENT_RCVD_REMOVE_FRIEND,
+    GD_FRIEND_EVENT_REMOVE_FRIEND,
+    GD_FRIEND_EVENT_CONNECTED,
+    GD_FRIEND_EVENT_NEW_NAME,
+    GD_FRIEND_EVENT_MAX /* exclusive bound asserted by glusterd_friend_sm_new_event() */
+} glusterd_friend_sm_event_type_t;
+
+typedef enum glusterd_friend_update_op_ { /* stored in glusterd_friend_update_ctx_t.op */
+    GD_FRIEND_UPDATE_NONE = 0,
+    GD_FRIEND_UPDATE_ADD,
+    GD_FRIEND_UPDATE_DEL,
+} glusterd_friend_update_op_t;
+
+struct glusterd_friend_sm_event_ {
+    struct cds_list_head list; /* node on gd_friend_sm_queue */
+    uuid_t peerid;             /* with peername: key for glusterd_peerinfo_find() */
+    char *peername;
+    void *ctx; /* passed to the handler; its concrete type depends on event */
+    glusterd_friend_sm_event_type_t event;
+};
+
+typedef struct glusterd_friend_sm_event_ glusterd_friend_sm_event_t;
+
+typedef int (*glusterd_friend_sm_ac_fn)(glusterd_friend_sm_event_t *, void *);
+
+typedef struct glusterd_sm_ { /* one (state, event) cell of a transition table */
+    glusterd_friend_sm_state_t next_state; /* applied after handler succeeds */
+    glusterd_friend_sm_ac_fn handler; /* action run by glusterd_friend_sm() */
+} glusterd_sm_t;
+
+typedef struct glusterd_friend_req_ctx_ { /* ctx of RCVD_FRIEND_REQ / RCVD_REMOVE_FRIEND events */
+    uuid_t uuid;
+    char *hostname;
+    rpcsvc_request_t *req;
+    int port;
+    dict_t *vols;
+} glusterd_friend_req_ctx_t;
+
+typedef struct glusterd_friend_update_ctx_ { /* ctx of ACC / RJT events (local and received) */
+    uuid_t uuid;
+    char *hostname;
+    int op; /* holds a glusterd_friend_update_op_t value */
+} glusterd_friend_update_ctx_t;
+
+typedef struct glusterd_probe_ctx_ { /* presumably freed via glusterd_destroy_probe_ctx() */
+    char *hostname;
+    rpcsvc_request_t *req;
+    int port;
+    dict_t *dict;
+} glusterd_probe_ctx_t;
+int
+glusterd_friend_sm_new_event(glusterd_friend_sm_event_type_t event_type,
+                             glusterd_friend_sm_event_t **new_event);
+int
+glusterd_friend_sm_inject_event(glusterd_friend_sm_event_t *event);
+
+int
+glusterd_friend_sm_init();
+
+int
+glusterd_friend_sm();
+
+void
+glusterd_destroy_probe_ctx(glusterd_probe_ctx_t *ctx);
+
+void
+glusterd_destroy_friend_req_ctx(glusterd_friend_req_ctx_t *ctx);
+
+char *
+glusterd_friend_sm_state_name_get(int state);
+
+char *
+glusterd_friend_sm_event_name_get(int event);
+
+int
+glusterd_broadcast_friend_delete(char *hostname, uuid_t uuid);
+void
+glusterd_destroy_friend_update_ctx(glusterd_friend_update_ctx_t *ctx);
+#endif
diff --git a/xlators/mgmt/glusterd/src/glusterd-snapd-svc-helper.c b/xlators/mgmt/glusterd/src/glusterd-snapd-svc-helper.c
new file mode 100644
index 00000000000..42ef51b01b4
--- /dev/null
+++ b/xlators/mgmt/glusterd/src/glusterd-snapd-svc-helper.c
@@ -0,0 +1,75 @@
+/*
+   Copyright (c) 2014 Red Hat, Inc. <http://www.redhat.com>
+   This file is part of GlusterFS.
+
+   This file is licensed to you under your choice of the GNU Lesser
+   General Public License, version 3 or any later version (LGPLv3 or
+   later), or the GNU General Public License, version 2 (GPLv2), in all
+   cases as published by the Free Software Foundation.
+*/
+
+#include "glusterd.h"
+#include "glusterd-utils.h"
+#include "glusterd-snapd-svc-helper.h"
+
+void
+glusterd_svc_build_snapd_rundir(glusterd_volinfo_t *volinfo, char *path,
+                                int path_len)
+{
+    /* Copy the directory produced by GLUSTERD_GET_VOLUME_PID_DIR for this
+     * volume into path; snprintf() bounds the copy to path_len bytes. */
+    glusterd_conf_t *priv = THIS->private;
+    char workdir[PATH_MAX] = {0};
+
+    GLUSTERD_GET_VOLUME_PID_DIR(workdir, volinfo, priv);
+    snprintf(path, path_len, "%s", workdir);
+}
+
+void
+glusterd_svc_build_snapd_socket_filepath(glusterd_volinfo_t *volinfo,
+                                         char *path, int path_len)
+{
+    /* Build "<rundir>/run-<uuid_utoa(MY_UUID)>" and pass it on to
+     * glusterd_set_socket_filepath() to fill path; an empty string is passed
+     * instead when the name does not fit in PATH_MAX. */
+    char rundir[PATH_MAX] = {0};
+    char sockfilepath[PATH_MAX] = {0};
+    int32_t n = 0;
+
+    glusterd_svc_build_snapd_rundir(volinfo, rundir, sizeof(rundir));
+
+    n = snprintf(sockfilepath, sizeof(sockfilepath), "%s/run-%s", rundir,
+                 uuid_utoa(MY_UUID));
+    if ((n < 0) || (n >= sizeof(sockfilepath))) {
+        sockfilepath[0] = '\0';
+    }
+
+    glusterd_set_socket_filepath(sockfilepath, path, path_len);
+}
+
+void
+glusterd_svc_build_snapd_pidfile(glusterd_volinfo_t *volinfo, char *path,
+                                 int path_len)
+{
+    /* Write "<snapd rundir>/<volname>-snapd.pid" into path; snprintf()
+     * bounds the result to path_len bytes. */
+    char rundir[PATH_MAX] = {0};
+
+    glusterd_svc_build_snapd_rundir(volinfo, rundir, sizeof(rundir));
+
+    snprintf(path, path_len, "%s/%s-snapd.pid", rundir, volinfo->volname);
+}
+
+void
+glusterd_svc_build_snapd_volfile(glusterd_volinfo_t *volinfo, char *path,
+                                 int path_len)
+{
+    /* Write "<dir>/<volname>-snapd.vol" into path, where <dir> is whatever
+     * GLUSTERD_GET_VOLUME_DIR produces for this volume; snprintf() bounds
+     * the result to path_len bytes. */
+    glusterd_conf_t *priv = THIS->private;
+    char workdir[PATH_MAX] = {0};
+
+    GLUSTERD_GET_VOLUME_DIR(workdir, volinfo, priv);
+    snprintf(path, path_len, "%s/%s-snapd.vol", workdir, volinfo->volname);
+}
diff --git a/xlators/mgmt/glusterd/src/glusterd-snapd-svc-helper.h b/xlators/mgmt/glusterd/src/glusterd-snapd-svc-helper.h
new file mode 100644
index 00000000000..3e23c2ce942
--- /dev/null
+++ b/xlators/mgmt/glusterd/src/glusterd-snapd-svc-helper.h
@@ -0,0 +1,32 @@
+/*
+   Copyright (c) 2014 Red Hat, Inc. <http://www.redhat.com>
+   This file is part of GlusterFS.
+
+   This file is licensed to you under your choice of the GNU Lesser
+   General Public License, version 3 or any later version (LGPLv3 or
+   later), or the GNU General Public License, version 2 (GPLv2), in all
+   cases as published by the Free Software Foundation.
+*/
+
+#ifndef _GLUSTERD_SNAPD_SVC_HELPER_H_
+#define _GLUSTERD_SNAPD_SVC_HELPER_H_
+
+#include "glusterd.h"
+
+/* Fill 'path' (at most path_len bytes) with the run directory used by the
+ * snapd of 'volinfo'. */
+void
+glusterd_svc_build_snapd_rundir(glusterd_volinfo_t *volinfo, char *path,
+                                int path_len);
+
+/* Fill 'path' (at most path_len bytes) with the socket file path of the
+ * snapd of 'volinfo'. */
+void
+glusterd_svc_build_snapd_socket_filepath(glusterd_volinfo_t *volinfo,
+                                         char *path, int path_len);
+
+/* Fill 'path' with "<snapd rundir>/<volname>-snapd.pid". */
+void
+glusterd_svc_build_snapd_pidfile(glusterd_volinfo_t *volinfo, char *path,
+                                 int path_len);
+
+/* Fill 'path' with "<volume dir>/<volname>-snapd.vol". */
+void
+glusterd_svc_build_snapd_volfile(glusterd_volinfo_t *volinfo, char *path,
+                                 int path_len);
+
+#endif
diff --git a/xlators/mgmt/glusterd/src/glusterd-snapd-svc.c b/xlators/mgmt/glusterd/src/glusterd-snapd-svc.c
new file mode 100644
index 00000000000..d75f249b29e
--- /dev/null
+++ b/xlators/mgmt/glusterd/src/glusterd-snapd-svc.c
@@ -0,0 +1,478 @@
+/*
+   Copyright (c) 2014 Red Hat, Inc. <http://www.redhat.com>
+   This file is part of GlusterFS.
+
+   This file is licensed to you under your choice of the GNU Lesser
+   General Public License, version 3 or any later version (LGPLv3 or
+   later), or the GNU General Public License, version 2 (GPLv2), in all
+   cases as published by the Free Software Foundation.
+*/
+
+#include <glusterfs/globals.h>
+#include <glusterfs/run.h>
+#include "glusterd-utils.h"
+#include "glusterd-volgen.h"
+#include "glusterd-messages.h"
+#include "glusterd-svc-mgmt.h"
+#include "glusterd-svc-helper.h"
+#include "glusterd-conn-mgmt.h"
+#include "glusterd-proc-mgmt.h"
+#include "glusterd-snapd-svc.h"
+#include "glusterd-snapd-svc-helper.h"
+#include "glusterd-snapshot-utils.h"
+#include <glusterfs/syscall.h>
+
+char *snapd_svc_name = "snapd";
+
+/* Write "<glusterd logdir>/snaps/<volname>" into 'logdir' (len bytes). */
+static void
+glusterd_svc_build_snapd_logdir(char *logdir, char *volname, size_t len)
+{
+    glusterd_conf_t *conf = THIS->private;
+
+    snprintf(logdir, len, "%s/snaps/%s", conf->logdir, volname);
+}
+
+/* Write "<logdir>/snapd.log" into 'logfile' (len bytes). */
+static void
+glusterd_svc_build_snapd_logfile(char *logfile, char *logdir, size_t len)
+{
+    (void)snprintf(logfile, len, "%s/snapd.log", logdir);
+}
+
+/* Install the snapd callbacks on 'svc': the snapd-specific manager and
+ * start routines, and the generic glusterd_svc_stop as the stop routine. */
+void
+glusterd_snapdsvc_build(glusterd_svc_t *svc)
+{
+    svc->stop = glusterd_svc_stop;
+    svc->start = glusterd_snapdsvc_start;
+    svc->manager = glusterd_snapdsvc_manager;
+}
+
+/*
+ * Initialise the snapd service object embedded in a volume.
+ *
+ * @param data  the volume, passed as void * and used as a
+ *              glusterd_volinfo_t *
+ *
+ * Sets the service name, creates the run directory, initialises the
+ * connection object on the snapd socket path and initialises the process
+ * object with the pidfile, log directory/file, volfile, volfile-id
+ * ("snapd/<volname>") and volfile server.
+ *
+ * @return 0 on success; a non-zero value (from snprintf, mkdir_p,
+ *         glusterd_conn_init or glusterd_proc_init) on failure
+ */
+int
+glusterd_snapdsvc_init(void *data)
+{
+    int ret = -1;
+    char rundir[PATH_MAX] = {
+        0,
+    };
+    char sockpath[PATH_MAX] = {
+        0,
+    };
+    char pidfile[PATH_MAX] = {
+        0,
+    };
+    char volfile[PATH_MAX] = {
+        0,
+    };
+    char logdir[PATH_MAX] = {
+        0,
+    };
+    char logfile[PATH_MAX] = {
+        0,
+    };
+    char volfileid[256] = {0};
+    glusterd_svc_t *svc = NULL;
+    glusterd_volinfo_t *volinfo = NULL;
+    glusterd_conf_t *priv = NULL;
+    glusterd_conn_notify_t notify = NULL;
+    xlator_t *this = NULL;
+    char *volfileserver = NULL;
+    int32_t len = 0;
+
+    this = THIS;
+    GF_ASSERT(this);
+
+    priv = this->private;
+    GF_ASSERT(priv);
+
+    volinfo = data;
+
+    svc = &(volinfo->snapd.svc);
+
+    ret = snprintf(svc->name, sizeof(svc->name), "%s", snapd_svc_name);
+    if (ret < 0) {
+        gf_smsg(this->name, GF_LOG_ERROR, errno, GD_MSG_COPY_FAIL, NULL);
+        goto out;
+    }
+
+    notify = glusterd_snapdsvc_rpc_notify;
+
+    /* The return value of the rundir creation is not checked here. */
+    glusterd_svc_build_snapd_rundir(volinfo, rundir, sizeof(rundir));
+    glusterd_svc_create_rundir(rundir);
+
+    /* Initialize the connection mgmt */
+    glusterd_svc_build_snapd_socket_filepath(volinfo, sockpath,
+                                             sizeof(sockpath));
+    /* NOTE(review): 600 is presumably a timeout value - confirm against
+     * glusterd_conn_init(). */
+    ret = glusterd_conn_init(&(svc->conn), sockpath, 600, notify);
+    if (ret)
+        goto out;
+
+    /* Initialize the process mgmt */
+    glusterd_svc_build_snapd_pidfile(volinfo, pidfile, sizeof(pidfile));
+    glusterd_svc_build_snapd_volfile(volinfo, volfile, sizeof(volfile));
+    glusterd_svc_build_snapd_logdir(logdir, volinfo->volname, sizeof(logdir));
+    ret = mkdir_p(logdir, 0755, _gf_true);
+    /* An already existing log directory is not an error. */
+    if ((ret == -1) && (EEXIST != errno)) {
+        gf_msg(this->name, GF_LOG_ERROR, errno, GD_MSG_CREATE_DIR_FAILED,
+               "Unable to create logdir %s", logdir);
+        goto out;
+    }
+    glusterd_svc_build_snapd_logfile(logfile, logdir, sizeof(logfile));
+    len = snprintf(volfileid, sizeof(volfileid), "snapd/%s", volinfo->volname);
+    /* Treat a truncated volfile-id as a failure. */
+    if ((len < 0) || (len >= sizeof(volfileid))) {
+        gf_smsg(this->name, GF_LOG_ERROR, errno, GD_MSG_COPY_FAIL, NULL);
+        ret = -1;
+        goto out;
+    }
+
+    /* Use the configured bind address as volfile server when the option is
+     * set, otherwise fall back to "localhost". */
+    if (dict_get_str(this->options, "transport.socket.bind-address",
+                     &volfileserver) != 0) {
+        volfileserver = "localhost";
+    }
+    ret = glusterd_proc_init(&(svc->proc), snapd_svc_name, pidfile, logdir,
+                             logfile, volfile, volfileid, volfileserver);
+    if (ret)
+        goto out;
+
+out:
+    gf_msg_debug(this->name, 0, "Returning %d", ret);
+    return ret;
+}
+
+/*
+ * Bring the snapd of a volume into the state its configuration asks for.
+ *
+ * @param svc    snapd service object of the volume
+ * @param data   the volume, used as a glusterd_volinfo_t *
+ * @param flags  passed through to svc->start()
+ *
+ * The service is initialised on first use. Then:
+ *  - snapd enabled, volume not started: a running snapd is stopped;
+ *  - snapd enabled, volume started: the volfile is (re)generated, snapd is
+ *    started and the connection to it is established;
+ *  - snapd disabled: a running snapd is stopped and its port reset to 0.
+ *
+ * @return 0 on success, non-zero on failure (an EVENT_SVC_MANAGER_FAILED
+ *         event is emitted in that case)
+ */
+int
+glusterd_snapdsvc_manager(glusterd_svc_t *svc, void *data, int flags)
+{
+    int ret = 0;
+    xlator_t *this = THIS;
+    glusterd_volinfo_t *volinfo = NULL;
+
+    volinfo = data;
+
+    if (!svc->inited) {
+        ret = glusterd_snapdsvc_init(volinfo);
+        if (ret) {
+            gf_msg(THIS->name, GF_LOG_ERROR, 0, GD_MSG_SNAPD_INIT_FAIL,
+                   "Failed to initialize "
+                   "snapd service for volume %s",
+                   volinfo->volname);
+            goto out;
+        } else {
+            svc->inited = _gf_true;
+            gf_msg_debug(THIS->name, 0,
+                         "snapd service "
+                         "initialized");
+        }
+    }
+
+    /* -1 signals an error; any other non-zero value means "enabled". */
+    ret = glusterd_is_snapd_enabled(volinfo);
+    if (ret == -1) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_VOLINFO_GET_FAIL,
+               "Failed to read volume "
+               "options");
+        goto out;
+    }
+
+    if (ret) {
+        if (!glusterd_is_volume_started(volinfo)) {
+            if (glusterd_proc_is_running(&svc->proc)) {
+                ret = svc->stop(svc, SIGTERM);
+                if (ret)
+                    gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_SNAPD_STOP_FAIL,
+                           "Couldn't stop snapd for "
+                           "volume: %s",
+                           volinfo->volname);
+            } else {
+                /* Since snapd is not running set ret to 0 */
+                ret = 0;
+            }
+            goto out;
+        }
+
+        ret = glusterd_snapdsvc_create_volfile(volinfo);
+        if (ret) {
+            gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_SNAPD_CREATE_FAIL,
+                   "Couldn't create "
+                   "snapd volfile for volume: %s",
+                   volinfo->volname);
+            goto out;
+        }
+
+        ret = svc->start(svc, flags);
+        if (ret) {
+            gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_SNAPD_START_FAIL,
+                   "Couldn't start "
+                   "snapd for volume: %s",
+                   volinfo->volname);
+            goto out;
+        }
+
+        /* Take a reference on the volume for the connection; it is given
+         * back right away if connecting fails.
+         * NOTE(review): on success the reference is presumably released by
+         * the RPC_CLNT_DESTROY case of glusterd_snapdsvc_rpc_notify() -
+         * confirm. */
+        glusterd_volinfo_ref(volinfo);
+        ret = glusterd_conn_connect(&(svc->conn));
+        if (ret) {
+            glusterd_volinfo_unref(volinfo);
+            goto out;
+        }
+
+    } else if (glusterd_proc_is_running(&svc->proc)) {
+        ret = svc->stop(svc, SIGTERM);
+        if (ret) {
+            gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_SNAPD_STOP_FAIL,
+                   "Couldn't stop snapd for volume: %s", volinfo->volname);
+            goto out;
+        }
+        volinfo->snapd.port = 0;
+    }
+
+out:
+    if (ret) {
+        gf_event(EVENT_SVC_MANAGER_FAILED, "volume=%s;svc_name=%s",
+                 volinfo->volname, svc->name);
+    }
+    gf_msg_debug(THIS->name, 0, "Returning %d", ret);
+
+    return ret;
+}
+
+/*
+ * Spawn the snapd (glusterfsd) process of the volume that owns 'svc'.
+ *
+ * @param svc    snapd service object embedded in a glusterd_snapdsvc_t,
+ *               which in turn is embedded in the volume's volinfo
+ * @param flags  PROC_START_NO_WAIT starts the process without waiting for
+ *               it; any other value runs it synchronously with the big
+ *               lock released for the duration of the run
+ *
+ * Nothing is done if the process is already running. A missing volfile is
+ * regenerated before the process is started. The port obtained from
+ * pmap_assign_port() is stored in volinfo->snapd.port.
+ *
+ * @return 0 on success (or if already running), non-zero on failure
+ */
+int32_t
+glusterd_snapdsvc_start(glusterd_svc_t *svc, int flags)
+{
+    int ret = -1;
+    runner_t runner = {
+        0,
+    };
+    glusterd_conf_t *priv = NULL;
+    xlator_t *this = NULL;
+    char valgrind_logfile[PATH_MAX] = {0};
+    int snapd_port = 0;
+    char msg[1024] = {
+        0,
+    };
+    char snapd_id[PATH_MAX] = {
+        0,
+    };
+    glusterd_volinfo_t *volinfo = NULL;
+    glusterd_snapdsvc_t *snapd = NULL;
+    char *localtime_logging = NULL;
+    int32_t len = 0;
+
+    this = THIS;
+    GF_ASSERT(this);
+
+    priv = this->private;
+    GF_ASSERT(priv);
+
+    if (glusterd_proc_is_running(&svc->proc)) {
+        ret = 0;
+        goto out;
+    }
+
+    /* Get volinfo->snapd from svc object */
+    snapd = cds_list_entry(svc, glusterd_snapdsvc_t, svc);
+    if (!snapd) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_SNAPD_OBJ_GET_FAIL,
+               "Failed to get snapd object "
+               "from snapd service");
+        goto out;
+    }
+
+    /* Get volinfo from snapd */
+    volinfo = cds_list_entry(snapd, glusterd_volinfo_t, snapd);
+    if (!volinfo) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_VOLINFO_GET_FAIL,
+               "Failed to get volinfo "
+               "from snapd");
+        goto out;
+    }
+
+    ret = sys_access(svc->proc.volfile, F_OK);
+    if (ret) {
+        gf_msg(this->name, GF_LOG_DEBUG, 0, GD_MSG_VOLINFO_GET_FAIL,
+               "snapd Volfile %s is not present", svc->proc.volfile);
+        /* If glusterd is down on one of the nodes and during
+         * that time "USS is enabled" for the first time. After some
+         * time when the glusterd which was down comes back it tries
+         * to look for the snapd volfile and it does not find snapd
+         * volfile and because of this starting of snapd fails.
+         * Therefore, if volfile is not present then create a fresh
+         * volfile.
+         */
+        ret = glusterd_snapdsvc_create_volfile(volinfo);
+        if (ret) {
+            gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_VOLFILE_CREATE_FAIL,
+                   "Couldn't create "
+                   "snapd volfile for volume: %s",
+                   volinfo->volname);
+            goto out;
+        }
+    }
+
+    /* From here on the runner owns allocated arguments: every exit path
+     * that does not run it must release it with runner_end(). */
+    runinit(&runner);
+
+    if (this->ctx->cmd_args.vgtool != _gf_none) {
+        len = snprintf(valgrind_logfile, PATH_MAX, "%s/valgrind-snapd.log",
+                       svc->proc.logdir);
+        if ((len < 0) || (len >= PATH_MAX)) {
+            gf_smsg(this->name, GF_LOG_ERROR, errno, GD_MSG_COPY_FAIL, NULL);
+            (void)runner_end(&runner);
+            ret = -1;
+            goto out;
+        }
+
+        if (this->ctx->cmd_args.vgtool == _gf_memcheck)
+            runner_add_args(&runner, "valgrind", "--leak-check=full",
+                            "--trace-children=yes", "--track-origins=yes",
+                            NULL);
+        else
+            runner_add_args(&runner, "valgrind", "--tool=drd", NULL);
+
+        runner_argprintf(&runner, "--log-file=%s", valgrind_logfile);
+    }
+
+    snprintf(snapd_id, sizeof(snapd_id), "snapd-%s", volinfo->volname);
+    runner_add_args(&runner, SBIN_DIR "/glusterfsd", "-s",
+                    svc->proc.volfileserver, "--volfile-id",
+                    svc->proc.volfileid, "-p", svc->proc.pidfile, "-l",
+                    svc->proc.logfile, "--brick-name", snapd_id, "-S",
+                    svc->conn.sockpath, "--process-name", svc->name, NULL);
+    if (dict_get_str(priv->opts, GLUSTERD_LOCALTIME_LOGGING_KEY,
+                     &localtime_logging) == 0) {
+        if (strcmp(localtime_logging, "enable") == 0)
+            runner_add_arg(&runner, "--localtime-logging");
+    }
+
+    snapd_port = pmap_assign_port(THIS, volinfo->snapd.port, snapd_id);
+    if (!snapd_port) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_PORTS_EXHAUSTED,
+               "All the ports in the range are exhausted, can't start "
+               "snapd for volume %s",
+               volinfo->volname);
+        /* The runner is never run on this path; free its arguments. */
+        (void)runner_end(&runner);
+        ret = -1;
+        goto out;
+    }
+
+    volinfo->snapd.port = snapd_port;
+
+    runner_add_arg(&runner, "--brick-port");
+    runner_argprintf(&runner, "%d", snapd_port);
+    runner_add_arg(&runner, "--xlator-option");
+    runner_argprintf(&runner, "%s-server.listen-port=%d", volinfo->volname,
+                     snapd_port);
+    runner_add_arg(&runner, "--no-mem-accounting");
+
+    snprintf(msg, sizeof(msg), "Starting the snapd service for volume %s",
+             volinfo->volname);
+    runner_log(&runner, this->name, GF_LOG_DEBUG, msg);
+
+    if (flags == PROC_START_NO_WAIT) {
+        ret = runner_run_nowait(&runner);
+    } else {
+        /* Drop the big lock while waiting for the child to finish. */
+        synclock_unlock(&priv->big_lock);
+        {
+            ret = runner_run(&runner);
+        }
+        synclock_lock(&priv->big_lock);
+    }
+
+out:
+    return ret;
+}
+
+/*
+ * Run the snapd manager (with PROC_START_NO_WAIT) for every volume whose
+ * status is GLUSTERD_STATUS_STARTED. Stops at the first failure.
+ *
+ * @return 0 if every manager call succeeded, otherwise the return value of
+ *         the first manager call that failed
+ */
+int
+glusterd_snapdsvc_restart()
+{
+    xlator_t *this = THIS;
+    glusterd_conf_t *conf = NULL;
+    glusterd_volinfo_t *volinfo = NULL;
+    glusterd_volinfo_t *next = NULL;
+    glusterd_svc_t *svc = NULL;
+    int ret = 0;
+
+    GF_ASSERT(this);
+
+    conf = this->private;
+    GF_ASSERT(conf);
+
+    cds_list_for_each_entry_safe(volinfo, next, &conf->volumes, vol_list)
+    {
+        /* Only started volumes get a per volume snapd svc */
+        if (volinfo->status != GLUSTERD_STATUS_STARTED)
+            continue;
+
+        svc = &(volinfo->snapd.svc);
+        ret = svc->manager(svc, volinfo, PROC_START_NO_WAIT);
+        if (!ret)
+            continue;
+
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_SNAPD_START_FAIL,
+               "Couldn't resolve snapd for "
+               "vol: %s on restart",
+               volinfo->volname);
+        gf_event(EVENT_SVC_MANAGER_FAILED, "volume=%s;svc_name=%s",
+                 volinfo->volname, svc->name);
+        break;
+    }
+
+    return ret;
+}
+
+/*
+ * RPC notification callback for the glusterd <-> snapd connection.
+ *
+ * @param conn   connection object embedded in the snapd service
+ * @param event  RPC client event being reported
+ *
+ * The owning service, snapd object and volinfo are recovered from 'conn'
+ * through the containing structures. CONNECT marks the service online,
+ * DISCONNECT marks it offline (both emit an event), DESTROY drops a
+ * reference on the volinfo; other events are only traced.
+ *
+ * @return 0, or -1 if one of the containing objects could not be obtained
+ */
+int
+glusterd_snapdsvc_rpc_notify(glusterd_conn_t *conn, rpc_clnt_event_t event)
+{
+    int ret = 0;
+    glusterd_svc_t *svc = NULL;
+    xlator_t *this = NULL;
+    glusterd_volinfo_t *volinfo = NULL;
+    glusterd_snapdsvc_t *snapd = NULL;
+
+    this = THIS;
+    GF_ASSERT(this);
+
+    svc = cds_list_entry(conn, glusterd_svc_t, conn);
+    if (!svc) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_SVC_GET_FAIL,
+               "Failed to get the service");
+        return -1;
+    }
+    snapd = cds_list_entry(svc, glusterd_snapdsvc_t, svc);
+    if (!snapd) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_SNAPD_OBJ_GET_FAIL,
+               "Failed to get the "
+               "snapd object");
+        return -1;
+    }
+
+    volinfo = cds_list_entry(snapd, glusterd_volinfo_t, snapd);
+    if (!volinfo) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_VOLINFO_GET_FAIL,
+               "Failed to get the "
+               "volinfo object");
+        return -1;
+    }
+
+    switch (event) {
+        case RPC_CLNT_CONNECT:
+            gf_msg_debug(this->name, 0,
+                         "%s has connected with "
+                         "glusterd.",
+                         svc->name);
+            gf_event(EVENT_SVC_CONNECTED, "volume=%s;svc_name=%s",
+                     volinfo->volname, svc->name);
+            svc->online = _gf_true;
+            break;
+
+        case RPC_CLNT_DISCONNECT:
+            /* Only report a disconnect for a service that was online. */
+            if (svc->online) {
+                gf_msg(this->name, GF_LOG_INFO, 0, GD_MSG_NODE_DISCONNECTED,
+                       "%s has disconnected "
+                       "from glusterd.",
+                       svc->name);
+                gf_event(EVENT_SVC_DISCONNECTED, "volume=%s;svc_name=%s",
+                         volinfo->volname, svc->name);
+                svc->online = _gf_false;
+            }
+            break;
+
+        case RPC_CLNT_DESTROY:
+            /* NOTE(review): presumably pairs with the reference taken in
+             * glusterd_snapdsvc_manager() before connecting - confirm. */
+            glusterd_volinfo_unref(volinfo);
+            break;
+
+        default:
+            gf_msg_trace(this->name, 0, "got some other RPC event %d", event);
+            break;
+    }
+
+    return ret;
+}
diff --git a/xlators/mgmt/glusterd/src/glusterd-snapd-svc.h b/xlators/mgmt/glusterd/src/glusterd-snapd-svc.h
new file mode 100644
index 00000000000..e15dbf54315
--- /dev/null
+++ b/xlators/mgmt/glusterd/src/glusterd-snapd-svc.h
@@ -0,0 +1,42 @@
+/*
+   Copyright (c) 2014 Red Hat, Inc. <http://www.redhat.com>
+   This file is part of GlusterFS.
+
+   This file is licensed to you under your choice of the GNU Lesser
+   General Public License, version 3 or any later version (LGPLv3 or
+   later), or the GNU General Public License, version 2 (GPLv2), in all
+   cases as published by the Free Software Foundation.
+*/
+
+#ifndef _GLUSTERD_SNAPD_SVC_H_
+#define _GLUSTERD_SNAPD_SVC_H_
+
+#include "glusterd-svc-mgmt.h"
+
+typedef struct glusterd_snapdsvc_ glusterd_snapdsvc_t;
+
+/* Per-volume snapd service state. */
+struct glusterd_snapdsvc_ {
+    /* Generic service object (name, connection, process, callbacks). */
+    glusterd_svc_t svc;
+    /* Store handle. NOTE(review): presumably for persisting snapd info -
+     * confirm against the store code. */
+    gf_store_handle_t *handle;
+    /* Port assigned to snapd; 0 when none is assigned. */
+    int port;
+};
+
+void
+glusterd_snapdsvc_build(glusterd_svc_t *svc);
+
+int
+glusterd_snapdsvc_init(void *data);
+
+int
+glusterd_snapdsvc_manager(glusterd_svc_t *svc, void *data, int flags);
+
+int
+glusterd_snapdsvc_start(glusterd_svc_t *svc, int flags);
+
+int
+glusterd_snapdsvc_restart();
+
+int
+glusterd_snapdsvc_rpc_notify(glusterd_conn_t *conn, rpc_clnt_event_t event);
+
+#endif
diff --git a/xlators/mgmt/glusterd/src/glusterd-snapshot-utils.c b/xlators/mgmt/glusterd/src/glusterd-snapshot-utils.c
new file mode 100644
index 00000000000..995268b796d
--- /dev/null
+++ b/xlators/mgmt/glusterd/src/glusterd-snapshot-utils.c
@@ -0,0 +1,4290 @@
+/*
+   Copyright (c) 2015 Red Hat, Inc. <http://www.redhat.com>
+   This file is part of GlusterFS.
+
+   This file is licensed to you under your choice of the GNU Lesser
+   General Public License, version 3 or any later version (LGPLv3 or
+   later), or the GNU General Public License, version 2 (GPLv2), in all
+   cases as published by the Free Software Foundation.
+*/
+#include <inttypes.h>
+
+#if defined(GF_LINUX_HOST_OS)
+#include <mntent.h>
+#else
+#include "mntent_compat.h"
+#endif
+#include <dlfcn.h>
+
+#include <glusterfs/dict.h>
+#include <glusterfs/syscall.h>
+#include "glusterd-op-sm.h"
+#include "glusterd-utils.h"
+#include "glusterd-messages.h"
+#include "glusterd-store.h"
+#include "glusterd-volgen.h"
+#include "glusterd-snapd-svc.h"
+#include "glusterd-svc-helper.h"
+#include "glusterd-snapd-svc-helper.h"
+#include "glusterd-snapshot-utils.h"
+#include "glusterd-server-quorum.h"
+#include "glusterd-messages.h"
+#include "glusterd-errno.h"
+
+/*
+ *  glusterd_snapobject_delete:
+ *      Unlinks the snap object from the lists it is on, destroys its lock
+ *      and frees its description and the object itself. A failure to
+ *      destroy the lock is only logged.
+ *
+ *      Returns 0 on success and -1 if 'snap' is NULL. After a successful
+ *      return 'snap' must not be used any more.
+ */
+
+int32_t
+glusterd_snapobject_delete(glusterd_snap_t *snap)
+{
+    if (snap == NULL) {
+        gf_msg(THIS->name, GF_LOG_WARNING, 0, GD_MSG_PARAM_NULL,
+               "snap is NULL");
+        return -1;
+    }
+
+    cds_list_del_init(&snap->snap_list);
+    cds_list_del_init(&snap->volumes);
+    if (LOCK_DESTROY(&snap->lock))
+        gf_msg(THIS->name, GF_LOG_WARNING, 0, GD_MSG_LOCK_DESTROY_FAILED,
+               "Failed destroying lock "
+               "of snap %s",
+               snap->snapname);
+
+    GF_FREE(snap->description);
+    GF_FREE(snap);
+
+    return 0;
+}
+
+/*
+ * This function is to be called only from glusterd_peer_detach_cleanup()
+ * as this continues to delete snaps in spite of failure while deleting
+ * one, as we don't want to fail peer_detach in such a case.
+ *
+ * For every snap volume of 'volinfo' it removes the snap from the store,
+ * deletes the snap object, removes the snap volume from the store and
+ * deletes its volinfo. A failing step is logged, the remaining steps for
+ * that snap volume are skipped and the loop moves on to the next one.
+ *
+ * Returns 0 if everything succeeded, otherwise the return value of the
+ * last step that failed.
+ */
+int
+glusterd_cleanup_snaps_for_volume(glusterd_volinfo_t *volinfo)
+{
+    int32_t op_ret = 0;
+    int32_t ret = 0;
+    xlator_t *this = NULL;
+    glusterd_volinfo_t *snap_vol = NULL;
+    glusterd_volinfo_t *dummy_snap_vol = NULL;
+    glusterd_snap_t *snap = NULL;
+
+    this = THIS;
+    GF_ASSERT(this);
+
+    /* The _safe variant is needed: entries are deleted while iterating. */
+    cds_list_for_each_entry_safe(snap_vol, dummy_snap_vol,
+                                 &volinfo->snap_volumes, snapvol_list)
+    {
+        snap = snap_vol->snapshot;
+        ret = glusterd_store_delete_snap(snap);
+        if (ret) {
+            gf_msg(this->name, GF_LOG_WARNING, 0, GD_MSG_VOL_DELETE_FAIL,
+                   "Failed to remove "
+                   "snap %s from store",
+                   snap->snapname);
+            op_ret = ret;
+            continue;
+        }
+
+        /* NOTE(review): glusterd_snapobject_delete() fails only for a NULL
+         * snap, in which case the log below would dereference NULL. */
+        ret = glusterd_snapobject_delete(snap);
+        if (ret) {
+            gf_msg(this->name, GF_LOG_WARNING, 0, GD_MSG_VOL_DELETE_FAIL,
+                   "Failed to delete "
+                   "snap object %s",
+                   snap->snapname);
+            op_ret = ret;
+            continue;
+        }
+
+        ret = glusterd_store_delete_volume(snap_vol);
+        if (ret) {
+            gf_msg(this->name, GF_LOG_WARNING, 0, GD_MSG_VOL_DELETE_FAIL,
+                   "Failed to remove "
+                   "volume %s from store",
+                   snap_vol->volname);
+            op_ret = ret;
+            continue;
+        }
+
+        ret = glusterd_volinfo_delete(snap_vol);
+        if (ret) {
+            gf_msg(this->name, GF_LOG_WARNING, 0, GD_MSG_VOL_DELETE_FAIL,
+                   "Failed to remove "
+                   "volinfo %s ",
+                   snap_vol->volname);
+            op_ret = ret;
+            continue;
+        }
+    }
+
+    return op_ret;
+}
+
+/*
+ * Restore the atime and mtime of the volume's marker.tstamp file from the
+ * marker.tstamp file of the snap volume, if geo-rep indexing
+ * (VKEY_MARKER_XTIME) is enabled on 'new_volinfo'.
+ *
+ * @param snap_volinfo  volinfo of the snap volume (timestamp source)
+ * @param new_volinfo   volinfo of the restored volume (timestamp target)
+ *
+ * @return 0 on success, when indexing is disabled, or when the indexing
+ *         state cannot be determined; the return value of
+ *         gf_set_timestamp() if setting the timestamp fails
+ */
+int
+glusterd_snap_geo_rep_restore(glusterd_volinfo_t *snap_volinfo,
+                              glusterd_volinfo_t *new_volinfo)
+{
+    char vol_tstamp_file[PATH_MAX] = {
+        0,
+    };
+    char snap_tstamp_file[PATH_MAX] = {
+        0,
+    };
+    glusterd_conf_t *priv = NULL;
+    xlator_t *this = NULL;
+    int geo_rep_indexing_on = 0;
+    int ret = 0;
+
+    this = THIS;
+    GF_ASSERT(this);
+    GF_ASSERT(snap_volinfo);
+    GF_ASSERT(new_volinfo);
+
+    priv = this->private;
+    GF_ASSERT(priv);
+
+    /* Check if geo-rep indexing is enabled, if yes, we need restore
+     * back the mtime of 'marker.tstamp' file.
+     */
+    geo_rep_indexing_on = glusterd_volinfo_get_boolean(new_volinfo,
+                                                       VKEY_MARKER_XTIME);
+    if (geo_rep_indexing_on == -1) {
+        /* Not being able to read the option is not treated as an error. */
+        gf_msg_debug(this->name, 0,
+                     "Failed"
+                     " to check whether geo-rep-indexing enabled or not");
+        ret = 0;
+        goto out;
+    }
+
+    if (geo_rep_indexing_on == 1) {
+        /* Append "/marker.tstamp" to both volume directories; the strncat
+         * bound leaves room for the terminating NUL. */
+        GLUSTERD_GET_VOLUME_DIR(vol_tstamp_file, new_volinfo, priv);
+        strncat(vol_tstamp_file, "/marker.tstamp",
+                PATH_MAX - strlen(vol_tstamp_file) - 1);
+        GLUSTERD_GET_VOLUME_DIR(snap_tstamp_file, snap_volinfo, priv);
+        strncat(snap_tstamp_file, "/marker.tstamp",
+                PATH_MAX - strlen(snap_tstamp_file) - 1);
+        ret = gf_set_timestamp(snap_tstamp_file, vol_tstamp_file);
+        if (ret) {
+            gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_TSTAMP_SET_FAIL,
+                   "Unable to set atime and mtime of %s as of %s",
+                   vol_tstamp_file, snap_tstamp_file);
+            goto out;
+        }
+    }
+
+out:
+    return ret;
+}
+
+/* This function will copy snap volinfo to the new
+ * passed volinfo and regenerate backend store files
+ * for the restored snap.
+ *
+ * @param dict          dict that may carry per-brick overrides under the
+ *                      keys "snap<volcount>.brick<N>.{path,snap_status,
+ *                      device_path,fs_type,mnt_opts}"
+ * @param rsp_dict      dict to which missed-snapshot (pending restore)
+ *                      entries are added
+ * @param new_volinfo   new volinfo
+ * @param snap_volinfo  volinfo of snap volume
+ * @param volcount      volume index used to build the dict keys
+ *
+ * @return 0 on success and -1 on failure
+ *
+ * TODO: Duplicate all members of volinfo, e.g. geo-rep sync slaves
+ */
+int32_t
+glusterd_snap_volinfo_restore(dict_t *dict, dict_t *rsp_dict,
+                              glusterd_volinfo_t *new_volinfo,
+                              glusterd_volinfo_t *snap_volinfo,
+                              int32_t volcount)
+{
+    char *value = NULL;
+    char key[64] = "";
+    int32_t brick_count = -1;
+    int32_t ret = -1;
+    xlator_t *this = NULL;
+    glusterd_brickinfo_t *brickinfo = NULL;
+    glusterd_brickinfo_t *new_brickinfo = NULL;
+
+    this = THIS;
+    GF_ASSERT(this);
+    GF_ASSERT(dict);
+    GF_ASSERT(rsp_dict);
+
+    GF_VALIDATE_OR_GOTO(this->name, new_volinfo, out);
+    GF_VALIDATE_OR_GOTO(this->name, snap_volinfo, out);
+
+    brick_count = 0;
+    cds_list_for_each_entry(brickinfo, &snap_volinfo->bricks, brick_list)
+    {
+        /* Brick numbering in the dict keys starts at 1. */
+        brick_count++;
+        ret = glusterd_brickinfo_new(&new_brickinfo);
+        if (ret) {
+            gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_BRICK_NEW_INFO_FAIL,
+                   "Failed to create "
+                   "new brickinfo");
+            goto out;
+        }
+
+        /* Duplicate brickinfo */
+        ret = glusterd_brickinfo_dup(brickinfo, new_brickinfo);
+        if (ret) {
+            gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_BRICK_SET_INFO_FAIL,
+                   "Failed to dup "
+                   "brickinfo");
+            goto out;
+        }
+
+        /* Fetch values if present in dict These values won't
+         * be present in case of a missed restore. In that case
+         * it's fine to use the local node's value
+         */
+        snprintf(key, sizeof(key), "snap%d.brick%d.path", volcount,
+                 brick_count);
+        ret = dict_get_str(dict, key, &value);
+        if (!ret)
+            gf_strncpy(new_brickinfo->path, value, sizeof(new_brickinfo->path));
+
+        /* A missing snap_status key is deliberately ignored. */
+        snprintf(key, sizeof(key), "snap%d.brick%d.snap_status", volcount,
+                 brick_count);
+        ret = dict_get_int32(dict, key, &new_brickinfo->snap_status);
+
+        snprintf(key, sizeof(key), "snap%d.brick%d.device_path", volcount,
+                 brick_count);
+        ret = dict_get_str(dict, key, &value);
+        if (!ret)
+            gf_strncpy(new_brickinfo->device_path, value,
+                       sizeof(new_brickinfo->device_path));
+
+        snprintf(key, sizeof(key), "snap%d.brick%d.fs_type", volcount,
+                 brick_count);
+        ret = dict_get_str(dict, key, &value);
+        if (!ret)
+            gf_strncpy(new_brickinfo->fstype, value,
+                       sizeof(new_brickinfo->fstype));
+
+        snprintf(key, sizeof(key), "snap%d.brick%d.mnt_opts", volcount,
+                 brick_count);
+        ret = dict_get_str(dict, key, &value);
+        if (!ret)
+            gf_strncpy(new_brickinfo->mnt_opts, value,
+                       sizeof(new_brickinfo->mnt_opts));
+
+        /* If the brick is not of this peer, or snapshot is missed *
+         * for the brick do not replace the xattr for it */
+        if ((!gf_uuid_compare(brickinfo->uuid, MY_UUID)) &&
+            (brickinfo->snap_status != -1)) {
+            /* We need to replace the volume id of all the bricks
+             * to the volume id of the origin volume. new_volinfo
+             * has the origin volume's volume id*/
+            ret = sys_lsetxattr(new_brickinfo->path, GF_XATTR_VOL_ID_KEY,
+                                new_volinfo->volume_id,
+                                sizeof(new_volinfo->volume_id), XATTR_REPLACE);
+            if (ret == -1) {
+                gf_smsg(this->name, GF_LOG_ERROR, 0, GD_MSG_SET_XATTR_FAIL,
+                        "Attribute=%s, Path=%s, Reason=%s, Snap=%s",
+                        GF_XATTR_VOL_ID_KEY, new_brickinfo->path,
+                        strerror(errno), new_volinfo->volname, NULL);
+                goto out;
+            }
+        }
+
+        /* If a snapshot is pending for this brick then
+         * restore should also be pending
+         */
+        if (brickinfo->snap_status == -1) {
+            /* Adding missed delete to the dict */
+            ret = glusterd_add_missed_snaps_to_dict(
+                rsp_dict, snap_volinfo, brickinfo, brick_count,
+                GF_SNAP_OPTION_TYPE_RESTORE);
+            if (ret) {
+                gf_msg(this->name, GF_LOG_ERROR, 0,
+                       GD_MSG_MISSEDSNAP_INFO_SET_FAIL,
+                       "Failed to add missed snapshot info "
+                       "for %s:%s in the rsp_dict",
+                       brickinfo->hostname, brickinfo->path);
+                goto out;
+            }
+        }
+
+        cds_list_add_tail(&new_brickinfo->brick_list, &new_volinfo->bricks);
+        /* ownership of new_brickinfo is passed to new_volinfo */
+        new_brickinfo = NULL;
+    }
+
+    /* Regenerate all volfiles */
+    ret = glusterd_create_volfiles_and_notify_services(new_volinfo);
+    if (ret) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_VOLFILE_CREATE_FAIL,
+               "Failed to regenerate volfiles");
+        goto out;
+    }
+
+    /* Restore geo-rep marker.tstamp's timestamp */
+    ret = glusterd_snap_geo_rep_restore(snap_volinfo, new_volinfo);
+    if (ret) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_TSTAMP_SET_FAIL,
+               "Geo-rep: marker.tstamp's timestamp restoration failed");
+        goto out;
+    }
+
+out:
+    /* A brickinfo that was allocated but not yet linked into new_volinfo
+     * is still owned by this function and must be released on failure. */
+    if (ret && (NULL != new_brickinfo)) {
+        (void)glusterd_brickinfo_delete(new_brickinfo);
+    }
+
+    return ret;
+}
+
+/*
+ * Look through the volumes of every snapshot known to glusterd for the one
+ * whose volume id equals 'volume_id'.
+ *
+ * @param volume_id  id to search for; a null uuid is rejected
+ * @param volinfo    set to the matching snap volume on success
+ *
+ * @return 0 if a volume was found, -1 otherwise
+ */
+int
+glusterd_snap_volinfo_find_by_volume_id(uuid_t volume_id,
+                                        glusterd_volinfo_t **volinfo)
+{
+    xlator_t *this = NULL;
+    glusterd_conf_t *priv = NULL;
+    glusterd_snap_t *snap = NULL;
+    glusterd_volinfo_t *candidate = NULL;
+    int32_t ret = -1;
+
+    this = THIS;
+    priv = this->private;
+    GF_ASSERT(priv);
+    GF_ASSERT(volinfo);
+
+    if (gf_uuid_is_null(volume_id)) {
+        gf_msg(this->name, GF_LOG_WARNING, 0, GD_MSG_UUID_NULL,
+               "Volume UUID is NULL");
+        goto out;
+    }
+
+    cds_list_for_each_entry(snap, &priv->snapshots, snap_list)
+    {
+        cds_list_for_each_entry(candidate, &snap->volumes, vol_list)
+        {
+            /* gf_uuid_compare() returns 0 for equal ids */
+            if (gf_uuid_compare(volume_id, candidate->volume_id) == 0) {
+                *volinfo = candidate;
+                ret = 0;
+                goto out;
+            }
+        }
+    }
+
+    gf_msg(this->name, GF_LOG_WARNING, 0, GD_MSG_SNAP_NOT_FOUND,
+           "Snap volume not found");
+out:
+    gf_msg_trace(this->name, 0, "Returning %d", ret);
+    return ret;
+}
+
+/*
+ * Find the volume named 'snap_volname' among the volumes of 'snap'.
+ *
+ * @param snap_volname  name of the snap volume to look for
+ * @param snap          snapshot whose volume list is searched
+ * @param volinfo       set to the matching volume on success
+ *
+ * @return 0 if found, -1 otherwise (a warning is logged)
+ */
+int32_t
+glusterd_snap_volinfo_find(char *snap_volname, glusterd_snap_t *snap,
+                           glusterd_volinfo_t **volinfo)
+{
+    xlator_t *this = NULL;
+    glusterd_conf_t *priv = NULL;
+    glusterd_volinfo_t *candidate = NULL;
+    int32_t ret = -1;
+
+    this = THIS;
+    priv = this->private;
+    GF_ASSERT(priv);
+    GF_ASSERT(snap);
+    GF_ASSERT(snap_volname);
+
+    cds_list_for_each_entry(candidate, &snap->volumes, vol_list)
+    {
+        if (strcmp(candidate->volname, snap_volname) != 0)
+            continue;
+
+        *volinfo = candidate;
+        ret = 0;
+        goto out;
+    }
+
+    gf_msg(this->name, GF_LOG_WARNING, EINVAL, GD_MSG_SNAP_NOT_FOUND,
+           "Snap volume %s not found", snap_volname);
+out:
+    gf_msg_trace(this->name, 0, "Returning %d", ret);
+    return ret;
+}
+
+/*
+ * Find, among the volumes of 'snap', the one whose parent (origin) volume
+ * is named 'origin_volname'.
+ *
+ * @param origin_volname  name of the origin volume
+ * @param snap            snapshot whose volume list is searched
+ * @param volinfo         set to the matching snap volume on success
+ *
+ * @return 0 if found, -1 otherwise (logged at debug level only)
+ */
+int32_t
+glusterd_snap_volinfo_find_from_parent_volname(char *origin_volname,
+                                               glusterd_snap_t *snap,
+                                               glusterd_volinfo_t **volinfo)
+{
+    int32_t ret = -1;
+    xlator_t *this = NULL;
+    glusterd_volinfo_t *snap_vol = NULL;
+    glusterd_conf_t *priv = NULL;
+
+    this = THIS;
+    priv = this->private;
+    GF_ASSERT(priv);
+    GF_ASSERT(snap);
+    GF_ASSERT(origin_volname);
+
+    cds_list_for_each_entry(snap_vol, &snap->volumes, vol_list)
+    {
+        if (!strcmp(snap_vol->parent_volname, origin_volname)) {
+            ret = 0;
+            *volinfo = snap_vol;
+            goto out;
+        }
+    }
+
+    gf_msg_debug(this->name, 0,
+                 "Snap volume not found (snap: %s, "
+                 "origin-volume: %s)",
+                 snap->snapname, origin_volname);
+
+out:
+    gf_msg_trace(this->name, 0, "Returning %d", ret);
+    return ret;
+}
+
+/* Exports a brick's snapshot details only if required.
+ *
+ * Nothing is exported (and 0 is returned) when the cluster op-version is
+ * below GD_OP_VERSION_3_6_0. Otherwise the following keys, each prefixed
+ * with "<prefix>.", are set in dict from brickinfo: snap_status,
+ * device_path, fs_type, mnt_opts and mount_dir.
+ *
+ * Returns 0 on success, -1 if an argument fails validation, or the
+ * non-zero result of the first dict_set_* call that fails.
+ */
+int
+gd_add_brick_snap_details_to_dict(dict_t *dict, char *prefix,
+                                  glusterd_brickinfo_t *brickinfo)
+{
+    int ret = -1;
+    xlator_t *this = NULL;
+    glusterd_conf_t *conf = NULL;
+    char key[256] = {
+        0,
+    };
+
+    this = THIS;
+    GF_ASSERT(this != NULL);
+    conf = this->private;
+    GF_VALIDATE_OR_GOTO(this->name, (conf != NULL), out);
+
+    GF_VALIDATE_OR_GOTO(this->name, (dict != NULL), out);
+    GF_VALIDATE_OR_GOTO(this->name, (prefix != NULL), out);
+    GF_VALIDATE_OR_GOTO(this->name, (brickinfo != NULL), out);
+
+    /* Skipping the export on an older op-version is not an error. */
+    if (conf->op_version < GD_OP_VERSION_3_6_0) {
+        ret = 0;
+        goto out;
+    }
+
+    snprintf(key, sizeof(key), "%s.snap_status", prefix);
+    ret = dict_set_int32(dict, key, brickinfo->snap_status);
+    if (ret) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_SNAP_STATUS_FAIL,
+               "Failed to set snap_status for %s:%s", brickinfo->hostname,
+               brickinfo->path);
+        goto out;
+    }
+
+    snprintf(key, sizeof(key), "%s.device_path", prefix);
+    ret = dict_set_str(dict, key, brickinfo->device_path);
+    if (ret) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_DICT_SET_FAILED,
+               "Failed to set snap_device for %s:%s", brickinfo->hostname,
+               brickinfo->path);
+        goto out;
+    }
+
+    snprintf(key, sizeof(key), "%s.fs_type", prefix);
+    ret = dict_set_str(dict, key, brickinfo->fstype);
+    if (ret) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_DICT_SET_FAILED,
+               "Failed to set fstype for %s:%s", brickinfo->hostname,
+               brickinfo->path);
+        goto out;
+    }
+
+    snprintf(key, sizeof(key), "%s.mnt_opts", prefix);
+    ret = dict_set_str(dict, key, brickinfo->mnt_opts);
+    if (ret) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_BRK_MOUNTOPTS_FAIL,
+               "Failed to set mnt_opts for %s:%s", brickinfo->hostname,
+               brickinfo->path);
+        goto out;
+    }
+
+    snprintf(key, sizeof(key), "%s.mount_dir", prefix);
+    ret = dict_set_str(dict, key, brickinfo->mount_dir);
+    if (ret)
+        /* This is a set failure, so report it with the SET message id
+         * (it was previously logged as GD_MSG_DICT_GET_FAILED).
+         */
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_DICT_SET_FAILED,
+               "Failed to set mount_dir for %s:%s", brickinfo->hostname,
+               brickinfo->path);
+
+out:
+    return ret;
+}
+
+/* Exports a volume's snapshot details only if required.
+ *
+ * Nothing is exported (and 0 is returned) when the cluster op-version is
+ * below GD_OP_VERSION_3_6_0. Otherwise the following keys, each prefixed
+ * with "<prefix>.", are set in dict from volinfo: restored_from_snap,
+ * parent_volname (only when it is a non-empty string), is_snap_volume and
+ * snap-max-hard-limit.
+ *
+ * Returns 0 on success, -1 if an argument fails validation, or the
+ * non-zero result of the first dict_set_* call that fails.
+ */
+int
+gd_add_vol_snap_details_to_dict(dict_t *dict, char *prefix,
+                                glusterd_volinfo_t *volinfo)
+{
+    int ret = -1;
+    xlator_t *this = NULL;
+    glusterd_conf_t *conf = NULL;
+    char key[256] = {
+        0,
+    };
+
+    this = THIS;
+    GF_ASSERT(this != NULL);
+    conf = this->private;
+    GF_VALIDATE_OR_GOTO(this->name, (conf != NULL), out);
+
+    GF_VALIDATE_OR_GOTO(this->name, (dict != NULL), out);
+    GF_VALIDATE_OR_GOTO(this->name, (volinfo != NULL), out);
+    GF_VALIDATE_OR_GOTO(this->name, (prefix != NULL), out);
+
+    /* Skipping the export on an older op-version is not an error. */
+    if (conf->op_version < GD_OP_VERSION_3_6_0) {
+        ret = 0;
+        goto out;
+    }
+
+    snprintf(key, sizeof(key), "%s.restored_from_snap", prefix);
+    ret = dict_set_dynstr_with_alloc(dict, key,
+                                     uuid_utoa(volinfo->restored_from_snap));
+    if (ret) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_DICT_SET_FAILED,
+               "Unable to set %s for volume "
+               "%s",
+               key, volinfo->volname);
+        goto out;
+    }
+
+    /* parent_volname is exported only when it is non-empty. */
+    if (strlen(volinfo->parent_volname) > 0) {
+        snprintf(key, sizeof(key), "%s.parent_volname", prefix);
+        ret = dict_set_dynstr_with_alloc(dict, key, volinfo->parent_volname);
+        if (ret) {
+            gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_DICT_SET_FAILED,
+                   "Unable to set %s "
+                   "for volume %s",
+                   key, volinfo->volname);
+            goto out;
+        }
+    }
+
+    snprintf(key, sizeof(key), "%s.is_snap_volume", prefix);
+    ret = dict_set_uint32(dict, key, volinfo->is_snap_volume);
+    if (ret) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_DICT_SET_FAILED,
+               "Unable to set %s for volume "
+               "%s",
+               key, volinfo->volname);
+        goto out;
+    }
+
+    snprintf(key, sizeof(key), "%s.snap-max-hard-limit", prefix);
+    ret = dict_set_uint64(dict, key, volinfo->snap_max_hard_limit);
+    if (ret) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_DICT_SET_FAILED,
+               "Unable to set %s for volume "
+               "%s",
+               key, volinfo->volname);
+    }
+
+out:
+    return ret;
+}
+
+/* Export every entry of priv->missed_snaps_list into peer_data.
+ *
+ * Each snap op becomes one "missed_snaps_<i>" key (i counts from 0) whose
+ * value is formatted as
+ *   <node_uuid>:<snap_uuid>=<snap_vol_id>:<brick_num>:<brick_path>:<op>:<status>
+ * and the number of exported entries is stored under "missed_snap_count".
+ *
+ * Returns 0 on success or the non-zero result of the failing dict call.
+ */
+int32_t
+glusterd_add_missed_snaps_to_export_dict(dict_t *peer_data)
+{
+    xlator_t *this = THIS;
+    glusterd_conf_t *priv = NULL;
+    glusterd_missed_snap_info *missed = NULL;
+    glusterd_snap_op_t *op_entry = NULL;
+    int32_t exported = 0;
+    int32_t ret = -1;
+    char key[PATH_MAX] = "";
+    char entry[PATH_MAX] = "";
+
+    GF_ASSERT(this);
+    GF_ASSERT(peer_data);
+
+    priv = this->private;
+    GF_ASSERT(priv);
+
+    /* Outer list: one node per (node, snap); inner list: its ops. */
+    cds_list_for_each_entry(missed, &priv->missed_snaps_list, missed_snaps)
+    {
+        cds_list_for_each_entry(op_entry, &missed->snap_ops, snap_ops_list)
+        {
+            snprintf(key, sizeof(key), "missed_snaps_%d", exported);
+            snprintf(entry, sizeof(entry), "%s:%s=%s:%d:%s:%d:%d",
+                     missed->node_uuid, missed->snap_uuid,
+                     op_entry->snap_vol_id, op_entry->brick_num,
+                     op_entry->brick_path, op_entry->op, op_entry->status);
+
+            ret = dict_set_dynstr_with_alloc(peer_data, key, entry);
+            if (ret != 0) {
+                gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_DICT_SET_FAILED,
+                       "Unable to set %s", key);
+                goto out;
+            }
+            ++exported;
+        }
+    }
+
+    ret = dict_set_int32(peer_data, "missed_snap_count", exported);
+    if (ret != 0)
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_DICT_SET_FAILED,
+               "Unable to set missed_snap_count");
+
+out:
+    gf_msg_trace(this->name, 0, "Returning %d", ret);
+    return ret;
+}
+
+/* Serialise one snapshot into peer_data under the key prefix "snap<N>",
+ * where N is snap_count.
+ *
+ * Every volume of the snapshot is added via glusterd_add_volume_to_dict()
+ * (plus its quota conf when quota is enabled on it), followed by the
+ * snapshot's own fields: host_bricks, volcount, snapname, snap_id,
+ * description (only when set), time_stamp, snap_restored and snap_status.
+ *
+ * Returns 0 on success, or the non-zero result of the first helper that
+ * fails.
+ */
+int32_t
+glusterd_add_snap_to_dict(glusterd_snap_t *snap, dict_t *peer_data,
+                          int32_t snap_count)
+{
+    xlator_t *this = THIS;
+    glusterd_volinfo_t *vol = NULL;
+    glusterd_brickinfo_t *brick = NULL;
+    gf_boolean_t has_local_brick = _gf_false;
+    int32_t nvols = 0;
+    int32_t ret = -1;
+    char prefix[32] = "";
+    char key[64] = "";
+
+    GF_ASSERT(this);
+    GF_ASSERT(snap);
+    GF_ASSERT(peer_data);
+
+    snprintf(prefix, sizeof(prefix), "snap%d", snap_count);
+
+    cds_list_for_each_entry(vol, &snap->volumes, vol_list)
+    {
+        /* Volumes are numbered from 1 in the exported keys. */
+        ++nvols;
+        ret = glusterd_add_volume_to_dict(vol, peer_data, nvols, prefix);
+        if (ret != 0) {
+            gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_DICT_SET_FAILED,
+                   "Failed to add snap:%s volume:%s "
+                   "to peer_data dict for handshake",
+                   snap->snapname, vol->volname);
+            goto out;
+        }
+
+        if (glusterd_is_volume_quota_enabled(vol)) {
+            ret = glusterd_vol_add_quota_conf_to_dict(vol, peer_data, nvols,
+                                                      prefix);
+            if (ret != 0) {
+                gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_DICT_SET_FAILED,
+                       "Failed to add quota conf for "
+                       "snap:%s volume:%s to peer_data "
+                       "dict for handshake",
+                       snap->snapname, vol->volname);
+                goto out;
+            }
+        }
+
+        /* Note whether any brick of this volume belongs to this node. */
+        cds_list_for_each_entry(brick, &vol->bricks, brick_list)
+        {
+            if (gf_uuid_compare(brick->uuid, MY_UUID) == 0) {
+                has_local_brick = _gf_true;
+                break;
+            }
+        }
+    }
+
+    snprintf(key, sizeof(key), "%s.host_bricks", prefix);
+    ret = dict_set_int8(peer_data, key, (int8_t)has_local_brick);
+    if (ret != 0) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_DICT_SET_FAILED,
+               "Unable to set host_bricks for snap %s", snap->snapname);
+        goto out;
+    }
+
+    snprintf(key, sizeof(key), "%s.volcount", prefix);
+    ret = dict_set_int32(peer_data, key, nvols);
+    if (ret != 0) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_DICT_SET_FAILED,
+               "Unable to set volcount for snap %s", snap->snapname);
+        goto out;
+    }
+
+    snprintf(key, sizeof(key), "%s.snapname", prefix);
+    ret = dict_set_dynstr_with_alloc(peer_data, key, snap->snapname);
+    if (ret != 0) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_DICT_SET_FAILED,
+               "Unable to set snapname for snap %s", snap->snapname);
+        goto out;
+    }
+
+    snprintf(key, sizeof(key), "%s.snap_id", prefix);
+    ret = dict_set_dynstr_with_alloc(peer_data, key, uuid_utoa(snap->snap_id));
+    if (ret != 0) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_DICT_SET_FAILED,
+               "Unable to set snap_id for snap %s", snap->snapname);
+        goto out;
+    }
+
+    /* The description is optional and only exported when present. */
+    if (snap->description) {
+        snprintf(key, sizeof(key), "%s.description", prefix);
+        ret = dict_set_dynstr_with_alloc(peer_data, key, snap->description);
+        if (ret != 0) {
+            gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_DICT_SET_FAILED,
+                   "Unable to set description for snap %s", snap->snapname);
+            goto out;
+        }
+    }
+
+    snprintf(key, sizeof(key), "%s.time_stamp", prefix);
+    ret = dict_set_int64(peer_data, key, (int64_t)snap->time_stamp);
+    if (ret != 0) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_DICT_SET_FAILED,
+               "Unable to set time_stamp for snap %s", snap->snapname);
+        goto out;
+    }
+
+    snprintf(key, sizeof(key), "%s.snap_restored", prefix);
+    ret = dict_set_int8(peer_data, key, snap->snap_restored);
+    if (ret != 0) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_DICT_SET_FAILED,
+               "Unable to set snap_restored for snap %s", snap->snapname);
+        goto out;
+    }
+
+    snprintf(key, sizeof(key), "%s.snap_status", prefix);
+    ret = dict_set_int32(peer_data, key, snap->snap_status);
+    if (ret != 0)
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_DICT_SET_FAILED,
+               "Unable to set snap_status for snap %s", snap->snapname);
+
+out:
+    gf_msg_trace(this->name, 0, "Returning %d", ret);
+    return ret;
+}
+
+/* Export every snapshot in priv->snapshots into peer_data.
+ *
+ * Snapshots are added with glusterd_add_snap_to_dict() using a running
+ * index that starts at 1; the total is then stored under "snap_count".
+ *
+ * Returns 0 on success or the non-zero result of the failing call.
+ */
+int32_t
+glusterd_add_snapshots_to_export_dict(dict_t *peer_data)
+{
+    xlator_t *this = THIS;
+    glusterd_conf_t *priv = NULL;
+    glusterd_snap_t *cur = NULL;
+    int32_t nsnaps = 0;
+    int32_t ret = -1;
+
+    GF_ASSERT(this);
+    priv = this->private;
+    GF_ASSERT(priv);
+    GF_ASSERT(peer_data);
+
+    cds_list_for_each_entry(cur, &priv->snapshots, snap_list)
+    {
+        ++nsnaps;
+        ret = glusterd_add_snap_to_dict(cur, peer_data, nsnaps);
+        if (ret != 0) {
+            gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_DICT_SET_FAILED,
+                   "Failed to add snap(%s) to the  peer_data dict for "
+                   "handshake",
+                   cur->snapname);
+            goto out;
+        }
+    }
+
+    ret = dict_set_int32(peer_data, "snap_count", nsnaps);
+    if (ret != 0)
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_DICT_SET_FAILED,
+               "Failed to set snap_count");
+
+out:
+    gf_msg_trace(this->name, 0, "Returning %d", ret);
+    return ret;
+}
+
+/* Imports the snapshot details of a brick if required and available.
+ *
+ * Nothing is imported (and 0 is returned) when the cluster op-version is
+ * below GD_OP_VERSION_3_6_0. Otherwise the keys "<prefix>.snap_status",
+ * ".device_path", ".fs_type", ".mnt_opts" and ".mount_dir" are read from
+ * dict, in that order, into brickinfo. Import stops at the first missing
+ * key, leaving the fields read so far already updated.
+ *
+ * Returns 0 on success, -1 if an argument fails validation, or the
+ * non-zero result of the failing dict lookup.
+ */
+int
+gd_import_new_brick_snap_details(dict_t *dict, char *prefix,
+                                 glusterd_brickinfo_t *brickinfo)
+{
+    xlator_t *this = THIS;
+    glusterd_conf_t *conf = NULL;
+    char *device = NULL;
+    char *fstype = NULL;
+    char *mount_opts = NULL;
+    char *mount_point = NULL;
+    char key[512] = {
+        0,
+    };
+    int ret = -1;
+
+    GF_ASSERT(this != NULL);
+    conf = this->private;
+    GF_VALIDATE_OR_GOTO(this->name, (conf != NULL), out);
+
+    GF_VALIDATE_OR_GOTO(this->name, (dict != NULL), out);
+    GF_VALIDATE_OR_GOTO(this->name, (prefix != NULL), out);
+    GF_VALIDATE_OR_GOTO(this->name, (brickinfo != NULL), out);
+
+    /* Skipping the import on an older op-version is not an error. */
+    if (conf->op_version < GD_OP_VERSION_3_6_0) {
+        ret = 0;
+        goto out;
+    }
+
+    snprintf(key, sizeof(key), "%s.snap_status", prefix);
+    ret = dict_get_int32(dict, key, &brickinfo->snap_status);
+    if (ret != 0) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_DICT_GET_FAILED,
+               "%s missing in payload", key);
+        goto out;
+    }
+
+    snprintf(key, sizeof(key), "%s.device_path", prefix);
+    ret = dict_get_str(dict, key, &device);
+    if (ret != 0) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_DICT_GET_FAILED,
+               "%s missing in payload", key);
+        goto out;
+    }
+    gf_strncpy(brickinfo->device_path, device, sizeof(brickinfo->device_path));
+
+    snprintf(key, sizeof(key), "%s.fs_type", prefix);
+    ret = dict_get_str(dict, key, &fstype);
+    if (ret != 0) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_DICT_GET_FAILED,
+               "%s missing in payload", key);
+        goto out;
+    }
+    gf_strncpy(brickinfo->fstype, fstype, sizeof(brickinfo->fstype));
+
+    snprintf(key, sizeof(key), "%s.mnt_opts", prefix);
+    ret = dict_get_str(dict, key, &mount_opts);
+    if (ret != 0) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_DICT_GET_FAILED,
+               "%s missing in payload", key);
+        goto out;
+    }
+    gf_strncpy(brickinfo->mnt_opts, mount_opts, sizeof(brickinfo->mnt_opts));
+
+    snprintf(key, sizeof(key), "%s.mount_dir", prefix);
+    ret = dict_get_str(dict, key, &mount_point);
+    if (ret != 0) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_DICT_GET_FAILED,
+               "%s missing in payload", key);
+        goto out;
+    }
+    gf_strncpy(brickinfo->mount_dir, mount_point, sizeof(brickinfo->mount_dir));
+
+out:
+    return ret;
+}
+
+/*
+ * Imports the snapshot details of a volume if required and available.
+ *
+ * Nothing is imported (and 0 is returned) when cluster.op_version is below
+ * GD_OP_VERSION_3_6_0, the op-version from which volume snapshot is
+ * supported. Otherwise "<prefix>.is_snap_volume",
+ * "<prefix>.restored_from_snap" and "<prefix>.snap-max-hard-limit" are read
+ * from dict into volinfo; volname is only used in log messages.
+ *
+ * Returns 0 on success, -1 if an argument fails validation, or the
+ * non-zero result of the failing dict lookup.
+ */
+int
+gd_import_volume_snap_details(dict_t *dict, glusterd_volinfo_t *volinfo,
+                              char *prefix, char *volname)
+{
+    xlator_t *this = THIS;
+    glusterd_conf_t *conf = NULL;
+    char *restored_from = NULL;
+    uint32_t snap_flag = 0;
+    char key[256] = {
+        0,
+    };
+    int ret = -1;
+
+    GF_ASSERT(this != NULL);
+    conf = this->private;
+    GF_VALIDATE_OR_GOTO(this->name, (conf != NULL), out);
+
+    GF_VALIDATE_OR_GOTO(this->name, (dict != NULL), out);
+    GF_VALIDATE_OR_GOTO(this->name, (volinfo != NULL), out);
+    GF_VALIDATE_OR_GOTO(this->name, (prefix != NULL), out);
+    GF_VALIDATE_OR_GOTO(this->name, (volname != NULL), out);
+
+    /* Skipping the import on an older op-version is not an error. */
+    if (conf->op_version < GD_OP_VERSION_3_6_0) {
+        ret = 0;
+        goto out;
+    }
+
+    snprintf(key, sizeof(key), "%s.is_snap_volume", prefix);
+    ret = dict_get_uint32(dict, key, &snap_flag);
+    if (ret != 0) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_DICT_GET_FAILED,
+               "%s missing in payload for %s", key, volname);
+        goto out;
+    }
+    /* Any non-zero value on the wire means "is a snap volume". */
+    volinfo->is_snap_volume = (snap_flag != 0);
+
+    snprintf(key, sizeof(key), "%s.restored_from_snap", prefix);
+    ret = dict_get_str(dict, key, &restored_from);
+    if (ret != 0) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_DICT_GET_FAILED,
+               "%s missing in payload for %s", key, volname);
+        goto out;
+    }
+
+    /* The parse result is not checked here. */
+    gf_uuid_parse(restored_from, volinfo->restored_from_snap);
+
+    snprintf(key, sizeof(key), "%s.snap-max-hard-limit", prefix);
+    ret = dict_get_uint64(dict, key, &volinfo->snap_max_hard_limit);
+    if (ret != 0)
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_DICT_GET_FAILED,
+               "%s missing in payload for %s", key, volname);
+
+out:
+    return ret;
+}
+
+/* Perform a single missed snapshot operation for snap on this node.
+ *
+ * op selects the operation:
+ *   GF_SNAP_OPTION_TYPE_DELETE  - remove the snapshot via
+ *                                 glusterd_snap_remove().
+ *   GF_SNAP_OPTION_TYPE_RESTORE - for every volume of the snapshot, restore
+ *                                 it over its parent volume, drop the
+ *                                 parent's old volinfo and run the restore
+ *                                 cleanup.
+ * Any other value is rejected as an invalid missed snap entry.
+ *
+ * Returns 0 on success and non-zero on failure.
+ */
+int32_t
+glusterd_perform_missed_op(glusterd_snap_t *snap, int32_t op)
+{
+    dict_t *dict = NULL;
+    int32_t ret = -1;
+    glusterd_conf_t *priv = NULL;
+    glusterd_volinfo_t *snap_volinfo = NULL;
+    glusterd_volinfo_t *volinfo = NULL;
+    glusterd_volinfo_t *tmp = NULL;
+    xlator_t *this = NULL;
+    uuid_t null_uuid = {0};
+    char *parent_volname = NULL;
+
+    this = THIS;
+    GF_ASSERT(this);
+
+    priv = this->private;
+    GF_ASSERT(priv);
+    GF_ASSERT(snap);
+
+    dict = dict_new();
+    if (!dict) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_DICT_CREATE_FAIL,
+               "Unable to create dict");
+        ret = -1;
+        goto out;
+    }
+
+    switch (op) {
+        case GF_SNAP_OPTION_TYPE_DELETE:
+            ret = glusterd_snap_remove(dict, snap, _gf_true, _gf_false,
+                                       _gf_false);
+            if (ret) {
+                gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_SNAP_REMOVE_FAIL,
+                       "Failed to remove snap");
+                goto out;
+            }
+
+            break;
+        case GF_SNAP_OPTION_TYPE_RESTORE:
+            cds_list_for_each_entry_safe(snap_volinfo, tmp, &snap->volumes,
+                                         vol_list)
+            {
+                parent_volname = gf_strdup(snap_volinfo->parent_volname);
+                if (!parent_volname) {
+                    /* ret may still hold 0 from a previous successful
+                     * iteration; make sure the allocation failure is
+                     * reported to the caller.
+                     */
+                    ret = -1;
+                    goto out;
+                }
+
+                ret = glusterd_volinfo_find(parent_volname, &volinfo);
+                if (ret) {
+                    gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_VOLINFO_GET_FAIL,
+                           "Could not get volinfo of %s", parent_volname);
+                    goto out;
+                }
+
+                volinfo->version--;
+                gf_uuid_copy(volinfo->restored_from_snap, null_uuid);
+
+                /* gd_restore_snap_volume() uses the dict and volcount
+                 * to fetch snap brick info from other nodes, which were
+                 * collected during prevalidation. As this is an ad-hoc
+                 * op and only local node's data matter, hence sending
+                 * volcount as 0 and re-using the same dict because we
+                 * need not record any missed creates in the rsp_dict.
+                 */
+                ret = gd_restore_snap_volume(dict, dict, volinfo, snap_volinfo,
+                                             0);
+                if (ret) {
+                    gf_msg(this->name, GF_LOG_ERROR, 0,
+                           GD_MSG_SNAP_RESTORE_FAIL,
+                           "Failed to restore snap for %s", snap->snapname);
+                    /* Undo the version decrement done above. */
+                    volinfo->version++;
+                    goto out;
+                }
+
+                /* Restore is successful therefore delete the original
+                 * volume's volinfo. If the volinfo is already restored
+                 * then we should delete the backend LVMs */
+                if (!gf_uuid_is_null(volinfo->restored_from_snap)) {
+                    ret = glusterd_lvm_snapshot_remove(dict, volinfo);
+                    if (ret) {
+                        gf_msg(this->name, GF_LOG_ERROR, 0,
+                               GD_MSG_SNAP_REMOVE_FAIL,
+                               "Failed to remove LVM backend");
+                        goto out;
+                    }
+                }
+
+                /* Detach the volinfo from priv->volumes, so that no new
+                 * command can ref it any more and then unref it.
+                 */
+                cds_list_del_init(&volinfo->vol_list);
+                glusterd_volinfo_unref(volinfo);
+
+                ret = glusterd_snapshot_restore_cleanup(dict, parent_volname,
+                                                        snap);
+                if (ret) {
+                    gf_msg(this->name, GF_LOG_ERROR, 0,
+                           GD_MSG_SNAP_CLEANUP_FAIL,
+                           "Failed to perform snapshot restore "
+                           "cleanup for %s volume",
+                           parent_volname);
+                    goto out;
+                }
+
+                GF_FREE(parent_volname);
+                parent_volname = NULL;
+            }
+
+            break;
+        default:
+            /* The entry must be a create, delete, or
+             * restore entry
+             */
+            gf_msg(this->name, GF_LOG_ERROR, EINVAL, GD_MSG_INVALID_ENTRY,
+                   "Invalid missed snap entry");
+            ret = -1;
+            goto out;
+    }
+
+out:
+    /* dict is NULL when dict_new() failed; there is nothing to release. */
+    if (dict)
+        dict_unref(dict);
+    if (parent_volname) {
+        GF_FREE(parent_volname);
+        parent_volname = NULL;
+    }
+
+    gf_msg_trace(this->name, 0, "Returning %d", ret);
+    return ret;
+}
+
+/* Perform missed deletes and restores on this node.
+ *
+ * Walks priv->missed_snaps_list and, for each entry addressed to this
+ * node whose snapshot still exists locally, runs the first pending
+ * non-create op through glusterd_perform_missed_op() and then marks every
+ * pending non-create op of that entry as GD_MISSED_SNAP_DONE.
+ *
+ * Returns 0 on success, or the non-zero result of the op that failed.
+ */
+int32_t
+glusterd_perform_missed_snap_ops()
+{
+    xlator_t *this = THIS;
+    glusterd_conf_t *priv = NULL;
+    glusterd_missed_snap_info *missed = NULL;
+    glusterd_snap_op_t *pending_op = NULL;
+    glusterd_snap_t *snap = NULL;
+    gf_boolean_t op_performed = _gf_false;
+    uuid_t snap_uuid = {
+        0,
+    };
+    int32_t ret = -1;
+
+    GF_ASSERT(this);
+
+    priv = this->private;
+    GF_ASSERT(priv);
+
+    cds_list_for_each_entry(missed, &priv->missed_snaps_list, missed_snaps)
+    {
+        /* Entries recorded for other nodes are not ours to execute. */
+        if (strcmp(missed->node_uuid, uuid_utoa(MY_UUID)) != 0)
+            continue;
+
+        /* Resolve the snapshot the entry refers to. */
+        gf_uuid_parse(missed->snap_uuid, snap_uuid);
+        snap = glusterd_find_snap_by_id(snap_uuid);
+        if (snap == NULL) {
+            /* Without a local snap object there cannot be a delete
+             * or a restore pending on that snap_uuid.
+             */
+            gf_msg_debug(this->name, 0, "Not a pending delete or restore op");
+            continue;
+        }
+
+        op_performed = _gf_false;
+        cds_list_for_each_entry(pending_op, &missed->snap_ops, snap_ops_list)
+        {
+            /* Creates and already completed ops are skipped. */
+            if ((pending_op->op == GF_SNAP_OPTION_TYPE_CREATE) ||
+                (pending_op->status == GD_MISSED_SNAP_DONE))
+                continue;
+
+            /* Only the first eligible op of this snap is actually
+             * executed; the remaining ones are just marked done.
+             */
+            if (!op_performed) {
+                ret = glusterd_perform_missed_op(snap, pending_op->op);
+                if (ret != 0) {
+                    gf_msg(this->name, GF_LOG_ERROR, 0,
+                           GD_MSG_SNAPSHOT_OP_FAILED,
+                           "Failed to perform missed snap op");
+                    goto out;
+                }
+                op_performed = _gf_true;
+            }
+
+            pending_op->status = GD_MISSED_SNAP_DONE;
+        }
+    }
+
+    ret = 0;
+out:
+    gf_msg_trace(this->name, 0, "Returning %d", ret);
+    return ret;
+}
+
+/* Import a friend's missed_snap_list from peer_data and act on it.
+ *
+ * If peer_data carries no "missed_snap_count" there is nothing to do and
+ * 0 is returned. Otherwise the entries are merged into the in-memory
+ * list, the ops missed by this node are performed, and the list is
+ * persisted. A failure while performing the ops is logged but does not
+ * stop the list from being stored, and the returned value is then that
+ * of the store step.
+ */
+int32_t
+glusterd_import_friend_missed_snap_list(dict_t *peer_data)
+{
+    xlator_t *this = THIS;
+    glusterd_conf_t *priv = NULL;
+    int32_t count = -1;
+    int32_t ret = -1;
+
+    GF_ASSERT(this);
+    GF_ASSERT(peer_data);
+
+    priv = this->private;
+    GF_ASSERT(priv);
+
+    /* A missing count simply means the peer has no missed snaps. */
+    if (dict_get_int32(peer_data, "missed_snap_count", &count) != 0) {
+        gf_msg(this->name, GF_LOG_INFO, 0, GD_MSG_MISSED_SNAP_GET_FAIL,
+               "No missed snaps");
+        ret = 0;
+        goto out;
+    }
+
+    /* Add the friend's missed_snaps entries to the in-memory list. */
+    ret = glusterd_add_missed_snaps_to_list(peer_data, count);
+    if (ret != 0) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_MISSED_SNAP_LIST_STORE_FAIL,
+               "Failed to add missed snaps to list");
+        goto out;
+    }
+
+    /* Deliberately not bailing out on failure here: some missed ops may
+     * already have been performed, so the current list still has to be
+     * persisted below.
+     */
+    if (glusterd_perform_missed_snap_ops() != 0)
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_SNAPSHOT_OP_FAILED,
+               "Failed to perform snap operations");
+
+    ret = glusterd_store_update_missed_snaps();
+    if (ret != 0)
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_MISSED_SNAP_LIST_STORE_FAIL,
+               "Failed to update missed_snaps_list");
+
+out:
+    gf_msg_trace(this->name, 0, "Returning %d", ret);
+    return ret;
+}
+
+/*
+ * Compare the peer's snap volume versions with the local snap volume.
+ *
+ * For i in 1..volcount the key "<prefix><i>.version" is read from
+ * peer_data and compared with the version of the first volume in
+ * snap->volumes. "conflict" is set to true (and the scan stops) as soon
+ * as a peer version is greater than the local one; otherwise it is set to
+ * false. When volcount is less than 1 "conflict" is left untouched.
+ *
+ * Returns 0 on success and -1 if a version key is missing from peer_data
+ * or the snapshot has no volume.
+ */
+int
+glusterd_check_peer_has_higher_snap_version(dict_t *peer_data,
+                                            char *peer_snap_name, int volcount,
+                                            gf_boolean_t *conflict,
+                                            char *prefix, glusterd_snap_t *snap,
+                                            char *hostname)
+{
+    glusterd_volinfo_t *snap_volinfo = NULL;
+    char key[256] = {0};
+    int version = 0, i = 0;
+    int ret = 0;
+    xlator_t *this = NULL;
+
+    this = THIS;
+    GF_ASSERT(this);
+    GF_ASSERT(snap);
+    GF_ASSERT(peer_data);
+
+    for (i = 1; i <= volcount; i++) {
+        snprintf(key, sizeof(key), "%s%d.version", prefix, i);
+        ret = dict_get_int32(peer_data, key, &version);
+        if (ret) {
+            gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_DICT_GET_FAILED,
+                   "failed to get "
+                   "version of snap volume = %s",
+                   peer_snap_name);
+            return -1;
+        }
+
+        /* cds_list_entry() is plain pointer arithmetic and never yields
+         * NULL, so an empty list has to be detected explicitly before
+         * the first entry is taken.
+         */
+        if (cds_list_empty(&snap->volumes)) {
+            gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_VOLINFO_GET_FAIL,
+                   "Failed to get snap "
+                   "volinfo %s",
+                   snap->snapname);
+            return -1;
+        }
+
+        /* TODO : As of now there is only one volume in snapshot.
+         * Change this when multiple volume snapshot is introduced
+         */
+        snap_volinfo = cds_list_entry(snap->volumes.next, glusterd_volinfo_t,
+                                      vol_list);
+
+        if (version > snap_volinfo->version) {
+            /* Mismatch detected */
+            gf_msg(this->name, GF_LOG_INFO, 0, GD_MSG_VOL_VERS_MISMATCH,
+                   "Version of volume %s differ. "
+                   "local version = %d, remote version = %d "
+                   "on peer %s",
+                   snap_volinfo->volname, snap_volinfo->version, version,
+                   hostname);
+            *conflict = _gf_true;
+            break;
+        } else {
+            *conflict = _gf_false;
+        }
+    }
+    return 0;
+}
+
+/* Look up peer_snap_name among the local snapshots and classify it.
+ *
+ *   - No local snap with that name: *snap is NULL, *conflict is _gf_false.
+ *   - Local snap with the same name and the same snap_id: *snap is the
+ *     existing snap object, *conflict is _gf_false.
+ *   - Local snap with the same name but a different snap_id: *snap is the
+ *     conflicting snap object, *conflict is _gf_true.
+ *
+ * hostname is only used in log messages.
+ */
+void
+glusterd_is_peer_snap_conflicting(char *peer_snap_name, char *peer_snap_id,
+                                  gf_boolean_t *conflict,
+                                  glusterd_snap_t **snap, char *hostname)
+{
+    xlator_t *this = THIS;
+    uuid_t peer_id = {
+        0,
+    };
+
+    GF_ASSERT(this);
+    GF_ASSERT(peer_snap_name);
+    GF_ASSERT(peer_snap_id);
+    GF_ASSERT(conflict);
+    GF_ASSERT(snap);
+    GF_ASSERT(hostname);
+
+    *snap = glusterd_find_snap_by_name(peer_snap_name);
+    if (*snap == NULL) {
+        /* Peer contains snapshots missing on the current node */
+        gf_msg(this->name, GF_LOG_INFO, 0, GD_MSG_MISSED_SNAP_PRESENT,
+               "Snapshot %s from peer %s missing on localhost", peer_snap_name,
+               hostname);
+        *conflict = _gf_false;
+        return;
+    }
+
+    gf_uuid_parse(peer_snap_id, peer_id);
+    if (gf_uuid_compare(peer_id, (*snap)->snap_id) != 0) {
+        /* Same snapname locally, but a different snap_id */
+        gf_msg_debug(this->name, 0,
+                     "Snapshot %s from peer %s conflicts with snapshot in "
+                     "localhost",
+                     peer_snap_name, hostname);
+        *conflict = _gf_true;
+        return;
+    }
+
+    /* Same snapname and same snap_id: it is the very same snapshot */
+    gf_msg_debug(this->name, 0,
+                 "Snapshot %s from peer %s present in localhost",
+                 peer_snap_name, hostname);
+    *conflict = _gf_false;
+}
+
+/* Tell whether this node hosts at least one brick of the given snapshot.
+ *
+ * Returns _gf_true as soon as a brick whose uuid equals MY_UUID is found
+ * in any volume of snap, _gf_false otherwise.
+ */
+gf_boolean_t
+glusterd_are_snap_bricks_local(glusterd_snap_t *snap)
+{
+    xlator_t *this = THIS;
+    glusterd_volinfo_t *vol = NULL;
+    glusterd_brickinfo_t *brick = NULL;
+    gf_boolean_t found = _gf_false;
+
+    GF_ASSERT(this);
+    GF_ASSERT(snap);
+
+    cds_list_for_each_entry(vol, &snap->volumes, vol_list)
+    {
+        cds_list_for_each_entry(brick, &vol->bricks, brick_list)
+        {
+            if (gf_uuid_compare(brick->uuid, MY_UUID) == 0) {
+                found = _gf_true;
+                break;
+            }
+        }
+
+        /* One local brick is enough; stop scanning further volumes. */
+        if (found)
+            break;
+    }
+
+    gf_msg_trace(this->name, 0, "Returning %d", found);
+    return found;
+}
+
+/* Check if the peer has missed any snap delete or restore for the given
+ * snap_id.
+ *
+ * Returns _gf_true when priv->missed_snaps_list holds an entry for peerid
+ * and peer_snap_id with a delete or restore op whose status is still
+ * GD_MISSED_SNAP_PENDING, _gf_false otherwise.
+ */
+gf_boolean_t
+glusterd_peer_has_missed_snap_delete(uuid_t peerid, char *peer_snap_id)
+{
+    xlator_t *this = THIS;
+    glusterd_conf_t *priv = NULL;
+    glusterd_missed_snap_info *missed = NULL;
+    glusterd_snap_op_t *op_entry = NULL;
+    gf_boolean_t pending = _gf_false;
+    char *peer_uuid_str = NULL;
+
+    GF_ASSERT(this);
+    priv = this->private;
+    GF_ASSERT(priv);
+    GF_ASSERT(peer_snap_id);
+
+    peer_uuid_str = uuid_utoa(peerid);
+
+    cds_list_for_each_entry(missed, &priv->missed_snaps_list, missed_snaps)
+    {
+        /* Only entries for the same peer and the same snap_id matter. */
+        if ((strcmp(peer_uuid_str, missed->node_uuid) != 0) ||
+            (strcmp(peer_snap_id, missed->snap_uuid) != 0))
+            continue;
+
+        cds_list_for_each_entry(op_entry, &missed->snap_ops, snap_ops_list)
+        {
+            /* Skip anything that is not a delete or a restore ... */
+            if ((op_entry->op != GF_SNAP_OPTION_TYPE_DELETE) &&
+                (op_entry->op != GF_SNAP_OPTION_TYPE_RESTORE))
+                continue;
+
+            /* ... and anything that is no longer pending. */
+            if (op_entry->status != GD_MISSED_SNAP_PENDING)
+                continue;
+
+            pending = _gf_true;
+            goto out;
+        }
+    }
+
+out:
+    gf_msg_trace(this->name, 0, "Returning %d", pending);
+    return pending;
+}
+
+/* Persist an imported snap volume and generate its volfiles.
+ *
+ * The steps run in this order and the first failure aborts the rest:
+ *   1. store the snap volinfo
+ *   2. generate the brick volfiles
+ *   3. generate the trusted client volfiles
+ *   4. generate the regular client volfiles
+ *   5. look up the parent volume and add the snap volume to it
+ *   6. store the snap volinfo once more
+ *
+ * snap_vol       : volinfo of the imported snap volume
+ * peer_snap_name : snapshot name, used in log messages only
+ *
+ * Returns 0 on success, the failing step's return value otherwise.
+ */
+int32_t
+glusterd_gen_snap_volfiles(glusterd_volinfo_t *snap_vol, char *peer_snap_name)
+{
+    xlator_t *this = THIS;
+    glusterd_volinfo_t *origin_vol = NULL;
+    int32_t ret = -1;
+
+    GF_ASSERT(this);
+    GF_ASSERT(snap_vol);
+    GF_ASSERT(peer_snap_name);
+
+    ret = glusterd_store_volinfo(snap_vol, GLUSTERD_VOLINFO_VER_AC_NONE);
+    if (ret != 0) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_VOLINFO_SET_FAIL,
+               "Failed to store snapshot "
+               "volinfo (%s) for snap %s",
+               snap_vol->volname, peer_snap_name);
+        goto done;
+    }
+
+    ret = generate_brick_volfiles(snap_vol);
+    if (ret != 0) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_VOLFILE_CREATE_FAIL,
+               "generating the brick volfiles for the "
+               "snap %s failed",
+               peer_snap_name);
+        goto done;
+    }
+
+    ret = generate_client_volfiles(snap_vol, GF_CLIENT_TRUSTED);
+    if (ret != 0) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_VOLFILE_CREATE_FAIL,
+               "generating the trusted client volfiles for "
+               "the snap %s failed",
+               peer_snap_name);
+        goto done;
+    }
+
+    ret = generate_client_volfiles(snap_vol, GF_CLIENT_OTHER);
+    if (ret != 0) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_VOLFILE_CREATE_FAIL,
+               "generating the client volfiles for the "
+               "snap %s failed",
+               peer_snap_name);
+        goto done;
+    }
+
+    ret = glusterd_volinfo_find(snap_vol->parent_volname, &origin_vol);
+    if (ret != 0) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_VOLINFO_GET_FAIL,
+               "Parent volinfo "
+               "not found for %s volume of snap %s",
+               snap_vol->volname, peer_snap_name);
+        goto done;
+    }
+
+    glusterd_list_add_snapvol(origin_vol, snap_vol);
+
+    /* Store the volinfo again now that it hangs off its parent volume */
+    ret = glusterd_store_volinfo(snap_vol, GLUSTERD_VOLINFO_VER_AC_NONE);
+    if (ret != 0)
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_VOLINFO_SET_FAIL,
+               "Failed to store snap volinfo");
+
+done:
+    gf_msg_trace(this->name, 0, "Returning %d", ret);
+    return ret;
+}
+
+/* Import one snapshot described in peer_data and add it to this node.
+ *
+ * peer_data      : dict received from the peer; the snapshot's fields are
+ *                  read from keys under the "snap<snap_count>." prefix
+ * snap_count     : index of the snapshot inside peer_data
+ * peer_snap_name : name of the snapshot on the peer
+ * peer_snap_id   : uuid of the snapshot on the peer, in string form
+ *
+ * A snapshot whose status is GD_SNAP_STATUS_DECOMMISSION is not accepted:
+ * it is removed again and 0 is returned. On any failure the partially
+ * imported snapshot is removed before returning.
+ *
+ * Returns 0 on success, non-zero on failure.
+ */
+int32_t
+glusterd_import_friend_snap(dict_t *peer_data, int32_t snap_count,
+                            char *peer_snap_name, char *peer_snap_id)
+{
+    char buf[64] = "";
+    char prefix[32] = "";
+    char *description = NULL;
+    dict_t *dict = NULL;
+    glusterd_snap_t *snap = NULL;
+    glusterd_volinfo_t *snap_vol = NULL;
+    glusterd_conf_t *priv = NULL;
+    int32_t ret = -1;
+    int32_t volcount = -1;
+    int32_t i = -1;
+    xlator_t *this = NULL;
+    int64_t time_stamp = 0;
+    int8_t restored = 0;
+
+    this = THIS;
+    GF_ASSERT(this);
+    priv = this->private;
+    GF_ASSERT(priv);
+    GF_ASSERT(peer_data);
+    GF_ASSERT(peer_snap_name);
+    GF_ASSERT(peer_snap_id);
+
+    snprintf(prefix, sizeof(prefix), "snap%d", snap_count);
+
+    snap = glusterd_new_snap_object();
+    if (!snap) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_SNAP_CREATION_FAIL,
+               "Could not create "
+               "the snap object for snap %s",
+               peer_snap_name);
+        goto out;
+    }
+
+    /* Scratch dict handed to glusterd_snap_remove() on the cleanup paths */
+    dict = dict_new();
+    if (!dict) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_DICT_CREATE_FAIL,
+               "Failed to create dict");
+        ret = -1;
+        goto out;
+    }
+
+    gf_strncpy(snap->snapname, peer_snap_name, sizeof(snap->snapname));
+    gf_uuid_parse(peer_snap_id, snap->snap_id);
+
+    /* The description is optional: a missing key is not an error */
+    snprintf(buf, sizeof(buf), "%s.description", prefix);
+    ret = dict_get_str(peer_data, buf, &description);
+    if (ret == 0 && description) {
+        snap->description = gf_strdup(description);
+        if (snap->description == NULL) {
+            gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_SNAP_CREATION_FAIL,
+                   "Saving the Snapshot Description Failed");
+            ret = -1;
+            goto out;
+        }
+    }
+
+    snprintf(buf, sizeof(buf), "%s.time_stamp", prefix);
+    ret = dict_get_int64(peer_data, buf, &time_stamp);
+    if (ret) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_DICT_GET_FAILED,
+               "Unable to get time_stamp for snap %s", peer_snap_name);
+        goto out;
+    }
+    snap->time_stamp = (time_t)time_stamp;
+
+    /* Read into a real int8_t and convert, instead of letting
+     * dict_get_int8() write one byte through a cast pointer into a field
+     * that may be wider than one byte.
+     */
+    snprintf(buf, sizeof(buf), "%s.snap_restored", prefix);
+    ret = dict_get_int8(peer_data, buf, &restored);
+    if (ret) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_DICT_GET_FAILED,
+               "Unable to get snap_restored for snap %s", peer_snap_name);
+        goto out;
+    }
+    snap->snap_restored = restored ? _gf_true : _gf_false;
+
+    snprintf(buf, sizeof(buf), "%s.snap_status", prefix);
+    ret = dict_get_int32(peer_data, buf, (int32_t *)&snap->snap_status);
+    if (ret) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_DICT_GET_FAILED,
+               "Unable to get snap_status for snap %s", peer_snap_name);
+        goto out;
+    }
+
+    /* If the snap is scheduled to be decommissioned, then
+     * don't accept the snap */
+    if (snap->snap_status == GD_SNAP_STATUS_DECOMMISSION) {
+        gf_msg_debug(this->name, 0,
+                     "The snap(%s) is scheduled to be decommissioned "
+                     "Not accepting the snap.",
+                     peer_snap_name);
+        glusterd_snap_remove(dict, snap, _gf_true, _gf_true, _gf_false);
+        ret = 0;
+        goto out;
+    }
+
+    snprintf(buf, sizeof(buf), "%s.volcount", prefix);
+    ret = dict_get_int32(peer_data, buf, &volcount);
+    if (ret) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_DICT_GET_FAILED,
+               "Unable to get volcount for snap %s", peer_snap_name);
+        goto out;
+    }
+
+    ret = glusterd_store_create_snap_dir(snap);
+    if (ret) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_SNAPDIR_CREATE_FAIL,
+               "Failed to create snap dir");
+        goto out;
+    }
+
+    glusterd_list_add_order(&snap->snap_list, &priv->snapshots,
+                            glusterd_compare_snap_time);
+
+    /* Volumes inside peer_data are numbered starting from 1 */
+    for (i = 1; i <= volcount; i++) {
+        ret = glusterd_import_volinfo(peer_data, i, &snap_vol, prefix);
+        if (ret) {
+            gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_VOLINFO_SET_FAIL,
+                   "Failed to import snap volinfo for "
+                   "snap %s",
+                   peer_snap_name);
+            goto out;
+        }
+
+        snap_vol->snapshot = snap;
+
+        ret = glusterd_gen_snap_volfiles(snap_vol, peer_snap_name);
+        if (ret) {
+            gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_VOLFILE_CREATE_FAIL,
+                   "Failed to generate snap vol files "
+                   "for snap %s",
+                   peer_snap_name);
+            goto out;
+        }
+        /* After taking updates from a friend during handshake, the brick
+         * mount points must exist for an activated snapshot and must not
+         * exist for a deactivated one.
+         */
+        if (glusterd_is_volume_started(snap_vol)) {
+            ret = glusterd_recreate_vol_brick_mounts(this, snap_vol);
+            if (ret) {
+                gf_msg(this->name, GF_LOG_ERROR, 0,
+                       GD_MSG_BRK_MNT_RECREATE_FAIL,
+                       "Failed to recreate brick mounts"
+                       " for %s",
+                       snap->snapname);
+                goto out;
+            }
+
+            (void)glusterd_start_bricks(snap_vol);
+            ret = glusterd_store_volinfo(snap_vol,
+                                         GLUSTERD_VOLINFO_VER_AC_NONE);
+            if (ret) {
+                gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_VOLINFO_STORE_FAIL,
+                       "Failed to "
+                       "write volinfo for volume %s",
+                       snap_vol->volname);
+                goto out;
+            }
+        } else {
+            (void)glusterd_stop_bricks(snap_vol);
+            /* Best effort: an unmount failure is logged but does not
+             * abort the import.
+             */
+            ret = glusterd_snap_unmount(this, snap_vol);
+            if (ret) {
+                gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_GLUSTERD_UMOUNT_FAIL,
+                       "Failed to unmounts for %s", snap->snapname);
+            }
+        }
+
+        ret = glusterd_import_quota_conf(peer_data, i, snap_vol, prefix);
+        if (ret) {
+            gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_QUOTA_CONFIG_IMPORT_FAIL,
+                   "Failed to import quota conf "
+                   "for snap %s",
+                   peer_snap_name);
+            goto out;
+        }
+
+        snap_vol = NULL;
+    }
+
+    ret = glusterd_store_snap(snap);
+    if (ret) {
+        gf_msg(this->name, GF_LOG_WARNING, 0, GD_MSG_SNAP_CREATION_FAIL,
+               "Could not store snap "
+               "object %s",
+               peer_snap_name);
+        goto out;
+    }
+    glusterd_fetchsnap_notify(this);
+
+out:
+    /* Nothing to roll back if the snap object was never created */
+    if (ret && snap)
+        glusterd_snap_remove(dict, snap, _gf_true, _gf_true, _gf_false);
+
+    if (dict)
+        dict_unref(dict);
+
+    gf_msg_trace(this->name, 0, "Returning %d", ret);
+    return ret;
+}
+
+/* Store the per-snap action flag "<prefix>.<flag>" = value in peer_data.
+ * Logs and returns the dict error on failure, 0 on success.
+ */
+static int32_t
+glusterd_snap_set_action_flag(dict_t *peer_data, const char *prefix,
+                              const char *flag, uint32_t value)
+{
+    char key[64] = "";
+    int32_t ret = -1;
+
+    snprintf(key, sizeof(key), "%s.%s", prefix, flag);
+    ret = dict_set_uint32(peer_data, key, value);
+    if (ret)
+        gf_msg(THIS->name, GF_LOG_ERROR, 0, GD_MSG_DICT_SET_FAILED,
+               "Failed to set %s", key);
+
+    return ret;
+}
+
+/* During a peer-handshake, after the volumes have synced, and the list of
+ * missed snapshots has synced, the node will perform the pending deletes
+ * and restores on this list. At this point, the current snapshot list in
+ * the node will be updated, and hence in case of conflicts arising during
+ * snapshot handshake, the peer hosting the bricks will be given precedence.
+ * Likewise, if there is a conflict and both peers are in the same state,
+ * i.e. either both are hosting bricks or both are not hosting bricks, then
+ * a decision can't be taken and a peer-reject will happen.
+ *
+ * glusterd_compare_snap() & glusterd_update_snaps_synctask() implement the
+ * following algorithm to perform the above task. The former looks at one
+ * snap at a time and only records the decision in peer_data, under the keys
+ * "snap<N>.accept_peer_data", "snap<N>.remove_my_data" and
+ * "snap<N>.remove_lvm"; the latter then goes over all the snaps and applies
+ * the recorded decisions in one go as part of a synctask.
+ * Step  1: Start.
+ * Step  2: Check if the peer is missing a delete or restore on the said snap.
+ *          If yes, goto step 7.
+ * Step  3: Check if there is a conflict between the peer's data and the
+ *          local snap. If no, goto step 5.
+ * Step  4: As there is a conflict, check if both the peer and the local nodes
+ *          are hosting bricks. Based on the results perform the following:
+ *          Peer Hosts Bricks    Local Node Hosts Bricks       Action
+ *                Yes                     Yes                Goto Step 8
+ *                No                      No                 Goto Step 8
+ *                Yes                     No                 Goto Step 9
+ *                No                      Yes                Goto Step 7
+ * Step  5: Check if the local node is missing the peer's data.
+ *          If yes, goto step 10.
+ * Step  6: Check if the local snap volume version is lesser than the one in
+ *          peer_data. If yes, goto step 9 (keeping the backend lvm),
+ *          otherwise goto step 7.
+ * Step  7: It's a no-op. Goto step 11
+ * Step  8: Peer Reject. Goto step 11
+ * Step  9: Delete local node's data.
+ * Step 10: Accept Peer Data.
+ * Step 11: Stop
+ *
+ * Returns 0 when a decision was recorded (including the no-op), and
+ * non-zero on a peer reject or when peer_data could not be read or updated.
+ */
+int32_t
+glusterd_compare_snap(dict_t *peer_data, int32_t snap_count, char *peername,
+                      uuid_t peerid)
+{
+    char buf[64] = "";
+    char prefix[32] = "";
+    char *peer_snap_name = NULL;
+    char *peer_snap_id = NULL;
+    glusterd_snap_t *snap = NULL;
+    gf_boolean_t conflict = _gf_false;
+    gf_boolean_t is_local = _gf_false;
+    gf_boolean_t is_hosted = _gf_false;
+    gf_boolean_t missed_delete = _gf_false;
+    int8_t host_bricks = 0;
+    int32_t ret = -1;
+    int32_t volcount = 0;
+    xlator_t *this = NULL;
+
+    this = THIS;
+    GF_ASSERT(this);
+    GF_ASSERT(peer_data);
+    GF_ASSERT(peername);
+
+    snprintf(prefix, sizeof(prefix), "snap%d", snap_count);
+
+    /* Start from "no action" for this snap; the flags are raised below
+     * only on the paths that need them.
+     */
+    ret = glusterd_snap_set_action_flag(peer_data, prefix, "accept_peer_data",
+                                        0);
+    if (ret)
+        goto out;
+    ret = glusterd_snap_set_action_flag(peer_data, prefix, "remove_lvm", 0);
+    if (ret)
+        goto out;
+    ret = glusterd_snap_set_action_flag(peer_data, prefix, "remove_my_data",
+                                        0);
+    if (ret)
+        goto out;
+
+    /* Fetch the peer's snapname */
+    snprintf(buf, sizeof(buf), "%s.snapname", prefix);
+    ret = dict_get_str(peer_data, buf, &peer_snap_name);
+    if (ret) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_DICT_GET_FAILED,
+               "Unable to fetch snapname from peer: %s", peername);
+        goto out;
+    }
+
+    /* Fetch the peer's snap_id */
+    snprintf(buf, sizeof(buf), "%s.snap_id", prefix);
+    ret = dict_get_str(peer_data, buf, &peer_snap_id);
+    if (ret) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_DICT_GET_FAILED,
+               "Unable to fetch snap_id from peer: %s", peername);
+        goto out;
+    }
+
+    snprintf(buf, sizeof(buf), "%s.volcount", prefix);
+    ret = dict_get_int32(peer_data, buf, &volcount);
+    if (ret) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_DICT_GET_FAILED,
+               "Unable to get volcount for snap %s", peer_snap_name);
+        goto out;
+    }
+
+    /* Check if the peer has missed a snap delete or restore
+     * resulting in stale data for the snap in question
+     */
+    missed_delete = glusterd_peer_has_missed_snap_delete(peerid, peer_snap_id);
+    if (missed_delete == _gf_true) {
+        /* Peer has missed delete on the missing/conflicting snap_id */
+        gf_msg(this->name, GF_LOG_INFO, 0, GD_MSG_MISSED_SNAP_DELETE,
+               "Peer %s has missed a delete "
+               "on snap %s",
+               peername, peer_snap_name);
+        ret = 0;
+        goto out;
+    }
+
+    /* Check if there is a conflict, and if the
+     * peer data is already present
+     */
+    glusterd_is_peer_snap_conflicting(peer_snap_name, peer_snap_id, &conflict,
+                                      &snap, peername);
+    if (conflict == _gf_false) {
+        if (!snap) {
+            /* Peer has a snap which the local node doesn't have:
+             * accept it.
+             */
+            ret = glusterd_snap_set_action_flag(peer_data, prefix,
+                                                "accept_peer_data", 1);
+            goto out;
+        }
+        /* Peer has snap with the same snapname
+         * and snap_id. Now check if peer has a
+         * snap with higher snap version than local
+         * node has.
+         */
+        ret = glusterd_check_peer_has_higher_snap_version(
+            peer_data, peer_snap_name, volcount, &conflict, prefix, snap,
+            peername);
+        if (ret) {
+            gf_msg(this->name, GF_LOG_WARNING, 0, GD_MSG_VOL_VERS_MISMATCH,
+                   "Failed "
+                   "to check version of snap volume");
+            goto out;
+        }
+        if (conflict == _gf_true) {
+            /*
+             * Snap version of peer is higher than snap
+             * version of local node.
+             *
+             * Remove data in local node and accept peer data.
+             * We just need to heal snap info of local node, So
+             * When removing data from local node, make sure
+             * we are not removing backend lvm of the snap.
+             */
+            ret = glusterd_snap_set_action_flag(peer_data, prefix,
+                                                "remove_lvm", 0);
+            if (ret)
+                goto out;
+            ret = glusterd_snap_set_action_flag(peer_data, prefix,
+                                                "remove_my_data", 1);
+            if (ret)
+                goto out;
+            ret = glusterd_snap_set_action_flag(peer_data, prefix,
+                                                "accept_peer_data", 1);
+        } else {
+            ret = 0;
+        }
+        goto out;
+    }
+
+    /* There is a conflict. Check if the current node is
+     * hosting bricks for the conflicted snap.
+     */
+    is_local = glusterd_are_snap_bricks_local(snap);
+
+    /* Check if the peer is hosting any bricks for the
+     * conflicting snap. The value is read into a real int8_t and then
+     * normalised, rather than written through a cast gf_boolean_t pointer.
+     */
+    snprintf(buf, sizeof(buf), "%s.host_bricks", prefix);
+    ret = dict_get_int8(peer_data, buf, &host_bricks);
+    if (ret) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_DICT_GET_FAILED,
+               "Unable to fetch host_bricks from peer: %s "
+               "for %s",
+               peername, peer_snap_name);
+        goto out;
+    }
+    is_hosted = host_bricks ? _gf_true : _gf_false;
+
+    /* As there is a conflict at this point of time, the data of the
+     * node that hosts a brick takes precedence. If both the local
+     * node and the peer are in the same state, i.e if both of them
+     * are either hosting or not hosting the bricks, for the snap,
+     * then it's a peer reject
+     */
+    if (is_hosted == is_local) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_SNAP_CONFLICT,
+               "Conflict in snapshot %s with peer %s", peer_snap_name,
+               peername);
+        ret = -1;
+        goto out;
+    }
+
+    if (is_hosted == _gf_false) {
+        /* If there was a conflict, and the peer is not hosting
+         * any brick, then don't accept peer data
+         */
+        gf_msg_debug(this->name, 0,
+                     "Peer doesn't hosts bricks for conflicting "
+                     "snap(%s). Not accepting peer data.",
+                     peer_snap_name);
+        ret = 0;
+        goto out;
+    }
+
+    /* The peer is hosting a brick in case of conflict
+     * And local node isn't. Hence remove local node's
+     * data and accept peer data
+     */
+    gf_msg_debug(this->name, 0,
+                 "Peer hosts bricks for conflicting "
+                 "snap(%s). Removing local data. Accepting peer data.",
+                 peer_snap_name);
+    ret = glusterd_snap_set_action_flag(peer_data, prefix, "remove_lvm", 1);
+    if (ret)
+        goto out;
+    ret = glusterd_snap_set_action_flag(peer_data, prefix, "remove_my_data",
+                                        1);
+    if (ret)
+        goto out;
+    ret = glusterd_snap_set_action_flag(peer_data, prefix, "accept_peer_data",
+                                        1);
+
+out:
+    gf_msg_trace(this->name, 0, "Returning %d", ret);
+    return ret;
+}
+
+/* Read the per-snap action flag "<prefix>.<flag>" from peer_data.
+ * A flag that is absent or cannot be read counts as "not set", so a failed
+ * lookup can never reuse the value of an earlier one.
+ */
+static gf_boolean_t
+glusterd_snap_action_flag_is_set(dict_t *peer_data, const char *prefix,
+                                 const char *flag)
+{
+    char key[64] = "";
+    int32_t val = 0;
+
+    snprintf(key, sizeof(key), "%s.%s", prefix, flag);
+    if (dict_get_int32(peer_data, key, &val))
+        return _gf_false;
+
+    return val ? _gf_true : _gf_false;
+}
+
+/* Synctask body that applies the per-snap decisions stored in peer_data.
+ *
+ * opaque is the peer_data dict. It must carry "snap_count", "peername" and,
+ * for every snap index i in 1..snap_count, "snap<i>.snapname" and
+ * "snap<i>.snap_id", plus the optional action flags "snap<i>.remove_my_data",
+ * "snap<i>.remove_lvm" and "snap<i>.accept_peer_data".
+ *
+ * For each snap: if remove_my_data is set the local snap of that name is
+ * removed (its backend lvm too when remove_lvm is set); if accept_peer_data
+ * is set the snap is then imported from peer_data.
+ *
+ * The function takes conf->big_lock and does not release it itself, waits
+ * until no other brick restart is in progress, and holds the
+ * restart_bricks flag for the duration of the work. It drops the reference
+ * on peer_data before returning.
+ *
+ * Returns 0 on success, non-zero on the first failure.
+ */
+int32_t
+glusterd_update_snaps_synctask(void *opaque)
+{
+    int32_t ret = -1;
+    int32_t snap_count = 0;
+    int i = 1;
+    xlator_t *this = NULL;
+    dict_t *peer_data = NULL;
+    char buf[64] = "";
+    char prefix[32] = "";
+    char *peer_snap_name = NULL;
+    char *peer_snap_id = NULL;
+    char *peername = NULL;
+    gf_boolean_t remove_lvm = _gf_false;
+    glusterd_snap_t *snap = NULL;
+    dict_t *dict = NULL;
+    glusterd_conf_t *conf = NULL;
+
+    this = THIS;
+    GF_ASSERT(this);
+
+    conf = this->private;
+    GF_ASSERT(conf);
+
+    peer_data = (dict_t *)opaque;
+    GF_ASSERT(peer_data);
+
+    synclock_lock(&conf->big_lock);
+
+    /* Wait for any other brick restart to finish, then claim the flag */
+    while (conf->restart_bricks) {
+        synccond_wait(&conf->cond_restart_bricks, &conf->big_lock);
+    }
+    conf->restart_bricks = _gf_true;
+
+    ret = dict_get_int32(peer_data, "snap_count", &snap_count);
+    if (ret) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_DICT_GET_FAILED,
+               "Failed to fetch snap_count");
+        goto out;
+    }
+    ret = dict_get_str(peer_data, "peername", &peername);
+    if (ret) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_DICT_GET_FAILED,
+               "Failed to fetch peername");
+        goto out;
+    }
+
+    for (i = 1; i <= snap_count; i++) {
+        snprintf(prefix, sizeof(prefix), "snap%d", i);
+
+        /* Fetch the peer's snapname */
+        snprintf(buf, sizeof(buf), "%s.snapname", prefix);
+        ret = dict_get_str(peer_data, buf, &peer_snap_name);
+        if (ret) {
+            gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_DICT_GET_FAILED,
+                   "Unable to fetch snapname from peer: %s", peername);
+            goto out;
+        }
+
+        /* Fetch the peer's snap_id */
+        snprintf(buf, sizeof(buf), "%s.snap_id", prefix);
+        ret = dict_get_str(peer_data, buf, &peer_snap_id);
+        if (ret) {
+            gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_DICT_GET_FAILED,
+                   "Unable to fetch snap_id from peer: %s", peername);
+            goto out;
+        }
+
+        if (glusterd_snap_action_flag_is_set(peer_data, prefix,
+                                             "remove_my_data")) {
+            /* remove_lvm only matters when local data is being removed */
+            remove_lvm = glusterd_snap_action_flag_is_set(peer_data, prefix,
+                                                          "remove_lvm");
+
+            dict = dict_new();
+            if (!dict) {
+                gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_DICT_CREATE_FAIL,
+                       "Unable to create dict");
+                ret = -1;
+                goto out;
+            }
+            snap = glusterd_find_snap_by_name(peer_snap_name);
+            if (!snap) {
+                gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_MISSED_SNAP_PRESENT,
+                       "Snapshot %s from peer %s missing on "
+                       "localhost",
+                       peer_snap_name, peername);
+                ret = -1;
+                goto out;
+            }
+
+            ret = glusterd_snap_remove(dict, snap, remove_lvm, _gf_false,
+                                       _gf_false);
+            if (ret) {
+                gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_SNAP_REMOVE_FAIL,
+                       "Failed to remove snap %s", snap->snapname);
+                goto out;
+            }
+
+            dict_unref(dict);
+            dict = NULL;
+        }
+
+        if (glusterd_snap_action_flag_is_set(peer_data, prefix,
+                                             "accept_peer_data")) {
+            /* Accept Peer Data */
+            ret = glusterd_import_friend_snap(peer_data, i, peer_snap_name,
+                                              peer_snap_id);
+            if (ret) {
+                gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_SNAP_IMPORT_FAIL,
+                       "Failed to import snap %s from peer %s", peer_snap_name,
+                       peername);
+                goto out;
+            }
+        }
+    }
+
+out:
+    if (peer_data)
+        dict_unref(peer_data);
+    if (dict)
+        dict_unref(dict);
+    /* Let any task waiting on the restart_bricks flag proceed */
+    conf->restart_bricks = _gf_false;
+    synccond_broadcast(&conf->cond_restart_bricks);
+
+    return ret;
+}
+
+/* Compare the snapshots present in peer_data with the snapshots on the
+ * current node and schedule the resulting updates.
+ *
+ * peer_data : dict received from the peer; must carry "snap_count"
+ * peername  : name of the peer, used for logging and passed on to the task
+ * peerid    : uuid of the peer
+ *
+ * Each snapshot is first compared on its own by glusterd_compare_snap(),
+ * which records its decision in peer_data. If every comparison succeeds, a
+ * copy of peer_data (with "peername" added) is handed to
+ * glusterd_update_snaps_synctask(), which owns and releases that copy.
+ *
+ * Returns 0 on success or when the peer has no snapshots, non-zero if a
+ * comparison fails or the task input cannot be prepared.
+ */
+int32_t
+glusterd_compare_friend_snapshots(dict_t *peer_data, char *peername,
+                                  uuid_t peerid)
+{
+    int32_t ret = -1;
+    int32_t snap_count = 0;
+    int i = 1;
+    xlator_t *this = NULL;
+    dict_t *peer_data_copy = NULL;
+    char *peername_copy = NULL;
+
+    this = THIS;
+    GF_ASSERT(this);
+    GF_ASSERT(peer_data);
+    GF_ASSERT(peername);
+
+    ret = dict_get_int32(peer_data, "snap_count", &snap_count);
+    if (ret) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_DICT_GET_FAILED,
+               "Failed to fetch snap_count");
+        goto out;
+    }
+
+    if (!snap_count)
+        goto out;
+
+    for (i = 1; i <= snap_count; i++) {
+        /* Compare one snapshot from peer_data at a time */
+        ret = glusterd_compare_snap(peer_data, i, peername, peerid);
+        if (ret) {
+            gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_SNAPSHOT_OP_FAILED,
+                   "Failed to compare snapshots with peer %s", peername);
+            goto out;
+        }
+    }
+    /* Update the snaps at one go */
+    peer_data_copy = dict_copy_with_ref(peer_data, NULL);
+    if (!peer_data_copy) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_DICT_CREATE_FAIL,
+               "Failed to copy peer data");
+        ret = -1;
+        goto out;
+    }
+
+    /* Give the dict its own copy of the name so that the task does not
+     * depend on the caller's buffer staying alive.
+     * NOTE(review): assumes the synctask can run after this function has
+     * returned - confirm against glusterd_launch_synctask().
+     */
+    peername_copy = gf_strdup(peername);
+    if (!peername_copy) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_DICT_SET_FAILED,
+               "Failed to set peername into the dict");
+        dict_unref(peer_data_copy);
+        ret = -1;
+        goto out;
+    }
+
+    ret = dict_set_dynstr(peer_data_copy, "peername", peername_copy);
+    if (ret) {
+        /* NOTE(review): peername_copy is deliberately not freed here;
+         * whether dict_set_dynstr() releases it on failure is not visible
+         * from this file - confirm before adding a free.
+         */
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_DICT_SET_FAILED,
+               "Failed to set peername into the dict");
+        dict_unref(peer_data_copy);
+        goto out;
+    }
+    glusterd_launch_synctask(glusterd_update_snaps_synctask, peer_data_copy);
+
+out:
+    gf_msg_trace(this->name, 0, "Returning %d", ret);
+    return ret;
+}
+
+/* Add the status entry of a volume's snapshot daemon to dict.
+ *
+ * The entry is written under the "brick<count>." key prefix and consists of
+ * hostname (the literal "Snapshot Daemon"), path (this node's uuid string),
+ * port (volinfo->snapd.port), pid (-1 when the daemon is not running) and
+ * status (the result of the running check).
+ *
+ * Returns 0 on success, otherwise the result of the dict call that failed.
+ */
+int32_t
+glusterd_add_snapd_to_dict(glusterd_volinfo_t *volinfo, dict_t *dict,
+                           int32_t count)
+{
+    xlator_t *this = NULL;
+    char pidfile[PATH_MAX] = "";
+    char base_key[32] = "";
+    char key[64] = "";
+    int32_t running = -1;
+    int32_t pid = -1;
+    int ret = -1;
+
+    GF_ASSERT(volinfo);
+    GF_ASSERT(dict);
+
+    this = THIS;
+    GF_ASSERT(this);
+
+    snprintf(base_key, sizeof(base_key), "brick%d", count);
+
+    snprintf(key, sizeof(key), "%s.hostname", base_key);
+    ret = dict_set_str(dict, key, "Snapshot Daemon");
+    if (ret != 0)
+        goto set_failed;
+
+    snprintf(key, sizeof(key), "%s.path", base_key);
+    ret = dict_set_dynstr(dict, key, gf_strdup(uuid_utoa(MY_UUID)));
+    if (ret != 0)
+        goto set_failed;
+
+    snprintf(key, sizeof(key), "%s.port", base_key);
+    ret = dict_set_int32(dict, key, volinfo->snapd.port);
+    if (ret != 0)
+        goto set_failed;
+
+    glusterd_svc_build_snapd_pidfile(volinfo, pidfile, sizeof(pidfile));
+
+    /* A daemon that is not running is reported with pid -1 */
+    running = gf_is_service_running(pidfile, &pid);
+    if (!running)
+        pid = -1;
+
+    snprintf(key, sizeof(key), "%s.pid", base_key);
+    ret = dict_set_int32(dict, key, pid);
+    if (ret != 0)
+        goto set_failed;
+
+    /* A failure of this last set is reported through the debug message
+     * below only.
+     */
+    snprintf(key, sizeof(key), "%s.status", base_key);
+    ret = dict_set_int32(dict, key, running);
+    goto out;
+
+set_failed:
+    gf_smsg(this->name, GF_LOG_ERROR, 0, GD_MSG_DICT_SET_FAILED, "Key=%s",
+            key, NULL);
+
+out:
+    if (ret)
+        gf_msg_debug(this->name, 0, "Returning %d", ret);
+
+    return ret;
+}
+
+/* Merge a snapshot-config response (src) into the aggregate dict (dst).
+ *
+ * Only the GF_SNAP_CONFIG_DISPLAY command, read from dst's "config-command"
+ * key, copies anything; every other command is a successful no-op. For
+ * DISPLAY the global hard and soft limits, "voldisplaycount" and, for each
+ * volume index below that count, the volume name and its three per-volume
+ * limits are copied from src to dst. A src without the global hard limit is
+ * treated as a dummy response and skipped.
+ *
+ * Returns 0 on success, non-zero if a dict is missing or a required key
+ * cannot be read or written.
+ */
+int
+glusterd_snap_config_use_rsp_dict(dict_t *dst, dict_t *src)
+{
+    /* Suffixes of the per-volume uint64 keys, in the order they are copied */
+    static const char *const per_vol_limits[] = {
+        "-snap-max-hard-limit",
+        "-active-hard-limit",
+        "-snap-max-soft-limit",
+    };
+    const size_t limit_cnt = sizeof(per_vol_limits) /
+                             sizeof(per_vol_limits[0]);
+    char key[PATH_MAX] = "";
+    char *volname = NULL;
+    int ret = -1;
+    int cmd = 0;
+    size_t k = 0;
+    uint64_t vol = 0;
+    uint64_t vol_total = 0;
+    uint64_t hard_limit = GLUSTERD_SNAPS_MAX_HARD_LIMIT;
+    uint64_t soft_limit = GLUSTERD_SNAPS_DEF_SOFT_LIMIT_PERCENT;
+    uint64_t limit = 0;
+
+    if (!dst || !src) {
+        gf_msg("glusterd", GF_LOG_ERROR, 0, GD_MSG_DICT_EMPTY,
+               "Source or Destination "
+               "dict is empty.");
+        goto out;
+    }
+
+    ret = dict_get_int32(dst, "config-command", &cmd);
+    if (ret != 0) {
+        gf_msg("glusterd", GF_LOG_ERROR, 0, GD_MSG_DICT_GET_FAILED,
+               "failed to get config-command type");
+        goto out;
+    }
+
+    /* Nothing to merge for commands other than display */
+    if (cmd != GF_SNAP_CONFIG_DISPLAY) {
+        ret = 0;
+        goto out;
+    }
+
+    if (dict_get_uint64(src, GLUSTERD_STORE_KEY_SNAP_MAX_HARD_LIMIT,
+                        &hard_limit) != 0) {
+        /* Received dummy response from other nodes */
+        ret = 0;
+        goto out;
+    }
+
+    ret = dict_set_uint64(dst, GLUSTERD_STORE_KEY_SNAP_MAX_HARD_LIMIT,
+                          hard_limit);
+    if (ret != 0) {
+        gf_msg("glusterd", GF_LOG_ERROR, 0, GD_MSG_DICT_SET_FAILED,
+               "Unable to set snap_max_hard_limit");
+        goto out;
+    }
+
+    ret = dict_get_uint64(src, GLUSTERD_STORE_KEY_SNAP_MAX_SOFT_LIMIT,
+                          &soft_limit);
+    if (ret != 0) {
+        gf_msg("glusterd", GF_LOG_ERROR, 0, GD_MSG_DICT_GET_FAILED,
+               "Unable to get snap_max_soft_limit");
+        goto out;
+    }
+
+    ret = dict_set_uint64(dst, GLUSTERD_STORE_KEY_SNAP_MAX_SOFT_LIMIT,
+                          soft_limit);
+    if (ret != 0) {
+        gf_msg("glusterd", GF_LOG_ERROR, 0, GD_MSG_DICT_SET_FAILED,
+               "Unable to set snap_max_soft_limit");
+        goto out;
+    }
+
+    ret = dict_get_uint64(src, "voldisplaycount", &vol_total);
+    if (ret != 0) {
+        gf_msg("glusterd", GF_LOG_ERROR, 0, GD_MSG_DICT_GET_FAILED,
+               "Unable to get voldisplaycount");
+        goto out;
+    }
+
+    ret = dict_set_uint64(dst, "voldisplaycount", vol_total);
+    if (ret != 0) {
+        gf_msg("glusterd", GF_LOG_ERROR, 0, GD_MSG_DICT_SET_FAILED,
+               "Unable to set voldisplaycount");
+        goto out;
+    }
+
+    for (vol = 0; vol < vol_total; vol++) {
+        /* Volume name first ... */
+        snprintf(key, sizeof(key), "volume%" PRIu64 "-volname", vol);
+        ret = dict_get_str(src, key, &volname);
+        if (ret != 0) {
+            gf_msg("glusterd", GF_LOG_ERROR, 0, GD_MSG_DICT_GET_FAILED,
+                   "Unable to get %s", key);
+            goto out;
+        }
+        ret = dict_set_str(dst, key, volname);
+        if (ret != 0) {
+            gf_msg("glusterd", GF_LOG_ERROR, 0, GD_MSG_DICT_SET_FAILED,
+                   "Unable to set %s", key);
+            goto out;
+        }
+
+        /* ... then each of its limits */
+        for (k = 0; k < limit_cnt; k++) {
+            snprintf(key, sizeof(key), "volume%" PRIu64 "%s", vol,
+                     per_vol_limits[k]);
+            ret = dict_get_uint64(src, key, &limit);
+            if (ret != 0) {
+                gf_msg("glusterd", GF_LOG_ERROR, 0, GD_MSG_DICT_GET_FAILED,
+                       "Unable to get %s", key);
+                goto out;
+            }
+            ret = dict_set_uint64(dst, key, limit);
+            if (ret != 0) {
+                gf_msg("glusterd", GF_LOG_ERROR, 0, GD_MSG_DICT_SET_FAILED,
+                       "Unable to set %s", key);
+                goto out;
+            }
+        }
+    }
+
+    ret = 0;
+out:
+    gf_msg_debug("glusterd", 0, "Returning %d", ret);
+    return ret;
+}
+
+/* Merge per-brick online status from @src into @dst.
+ *
+ * For every volume number 1.."volcount" (read from @src) that has a
+ * "snap-vol<N>_brickcount" entry in @src, each brick's
+ * "snap-vol<N>.brick<j>.order" value is looked up and the matching
+ * "<prefix><N>.brick<order>.status" entry is copied from @src to @dst.
+ * <prefix> is "clone" when @dst carries a "clonename" key and "snap-vol"
+ * otherwise. A snapshot delete command ("type" in @dst) merges nothing.
+ *
+ * Returns 0 on success and a non-zero value on failure. */
+int
+glusterd_merge_brick_status(dict_t *dst, dict_t *src)
+{
+    xlator_t *this = NULL;
+    const char *prefix = NULL;
+    char *clonename = NULL;
+    char status_key[64] = "";
+    char count_key[PATH_MAX] = "";
+    char order_key[PATH_MAX] = "";
+    int64_t nr_volumes = 0;
+    int64_t nr_bricks = 0;
+    int64_t order = 0;
+    int64_t vol = 0;
+    int64_t brick = 0;
+    int32_t cmd_type = 0;
+    int32_t online = 0;
+    int ret = -1;
+
+    this = THIS;
+    GF_ASSERT(this);
+
+    if (!(dst && src)) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_DICT_EMPTY,
+               "Source or Destination dict is empty.");
+        goto out;
+    }
+
+    ret = dict_get_int32(dst, "type", &cmd_type);
+    if (ret != 0) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_DICT_GET_FAILED,
+               "unable to get the type of the snapshot command");
+        goto out;
+    }
+
+    if (cmd_type == GF_SNAP_OPTION_TYPE_DELETE) {
+        gf_msg_debug(this->name, 0,
+                     "snapshot delete command. Need not merge the status of "
+                     "the bricks");
+        ret = 0;
+        goto out;
+    }
+
+    /* The presence of "clonename" in dst selects the key prefix; the
+     * fetched value itself is not used. */
+    if (dict_get_str(dst, "clonename", &clonename) != 0)
+        prefix = "snap-vol";
+    else
+        prefix = "clone";
+
+    ret = dict_get_int64(src, "volcount", &nr_volumes);
+    if (ret != 0) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_DICT_GET_FAILED,
+               "failed to get the volume count");
+        goto out;
+    }
+
+    for (vol = 0; vol < nr_volumes; vol++) {
+        /* Keys are numbered from 1, hence vol + 1 everywhere below. */
+        snprintf(count_key, sizeof(count_key) - 1,
+                 "snap-vol%" PRId64 "_brickcount", vol + 1);
+
+        /* A missing brick count is not an error: skip this volume. */
+        if (dict_get_int64(src, count_key, &nr_bricks) != 0) {
+            gf_msg_trace(this->name, 0,
+                         "No bricks for this volume in this dict (%s)",
+                         count_key);
+            continue;
+        }
+
+        for (brick = 0; brick < nr_bricks; brick++) {
+            online = 0;
+
+            /* Fetching data from source dict */
+            snprintf(order_key, sizeof(order_key) - 1,
+                     "snap-vol%" PRId64 ".brick%" PRId64 ".order", vol + 1,
+                     brick);
+
+            ret = dict_get_int64(src, order_key, &order);
+            if (ret != 0) {
+                gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_DICT_GET_FAILED,
+                       "Failed to get brick order (%s)", order_key);
+                goto out;
+            }
+
+            /* The status key is indexed by brick order, not by loop index. */
+            snprintf(status_key, sizeof(status_key),
+                     "%s%" PRId64 ".brick%" PRId64 ".status", prefix, vol + 1,
+                     order);
+
+            ret = dict_get_int32(src, status_key, &online);
+            if (ret != 0) {
+                gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_DICT_GET_FAILED,
+                       "failed to get the brick status (%s)", status_key);
+                goto out;
+            }
+
+            ret = dict_set_int32(dst, status_key, online);
+            if (ret != 0) {
+                gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_DICT_SET_FAILED,
+                       "failed to set the brick status (%s)", status_key);
+                goto out;
+            }
+        }
+    }
+
+    ret = 0;
+
+out:
+    return ret;
+}
+
+/* Aggregate missed_snap_counts from different nodes and save it
+ * in the req_dict of the originator node.
+ *
+ * Steps performed on the response dict @src and the request dict @dst:
+ *  - merge the brick status entries via glusterd_merge_brick_status();
+ *  - copy "snapuuid" from @src to @dst;
+ *  - copy "soft-limit-reach" only when @src carries it;
+ *  - append every "missed_snaps_<i>" entry of @src to @dst, numbering
+ *    them after the entries @dst already holds, and store the new total
+ *    under "missed_snap_count". A @src without "missed_snap_count" is
+ *    treated as success with nothing to append.
+ *
+ * Returns 0 on success and a non-zero value on failure. */
+int
+glusterd_snap_create_use_rsp_dict(dict_t *dst, dict_t *src)
+{
+    char *buf = NULL;
+    char name_buf[PATH_MAX] = "";
+    int32_t i = -1;
+    int32_t ret = -1;
+    int32_t src_missed_snap_count = -1;
+    int32_t dst_missed_snap_count = -1;
+    xlator_t *this = NULL;
+    int8_t soft_limit_flag = -1;
+
+    this = THIS;
+    GF_ASSERT(this);
+
+    if (!dst || !src) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_DICT_EMPTY,
+               "Source or Destination "
+               "dict is empty.");
+        goto out;
+    }
+
+    ret = glusterd_merge_brick_status(dst, src);
+    if (ret) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_BRICK_SET_INFO_FAIL,
+               "failed to merge brick "
+               "status");
+        goto out;
+    }
+
+    ret = dict_get_str(src, "snapuuid", &buf);
+    if (ret) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_DICT_GET_FAILED,
+               "failed to get snap UUID");
+        goto out;
+    }
+
+    ret = dict_set_dynstr_with_alloc(dst, "snapuuid", buf);
+    if (ret) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_DICT_SET_FAILED,
+               "Failed to set snap uuid in dict");
+        goto out;
+    }
+
+    /* set in dst dictionary soft-limit-reach only if soft-limit-reach
+     * is present src dictionary */
+    ret = dict_get_int8(src, "soft-limit-reach", &soft_limit_flag);
+    if (!ret) {
+        ret = dict_set_int8(dst, "soft-limit-reach", soft_limit_flag);
+        if (ret) {
+            gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_DICT_SET_FAILED,
+                   "Failed to set "
+                   "soft_limit_flag");
+            goto out;
+        }
+    }
+
+    ret = dict_get_int32(src, "missed_snap_count", &src_missed_snap_count);
+    if (ret) {
+        gf_msg_debug(this->name, 0, "No missed snaps");
+        ret = 0;
+        goto out;
+    }
+
+    ret = dict_get_int32(dst, "missed_snap_count", &dst_missed_snap_count);
+    if (ret) {
+        /* Initialize dst_missed_count for the first time */
+        dst_missed_snap_count = 0;
+    }
+
+    for (i = 0; i < src_missed_snap_count; i++) {
+        snprintf(name_buf, sizeof(name_buf), "missed_snaps_%d", i);
+        ret = dict_get_str(src, name_buf, &buf);
+        if (ret) {
+            gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_DICT_GET_FAILED,
+                   "Unable to fetch %s", name_buf);
+            goto out;
+        }
+
+        /* Entries are appended after the ones dst already has. */
+        snprintf(name_buf, sizeof(name_buf), "missed_snaps_%d",
+                 dst_missed_snap_count);
+
+        /* Use the same helper as for "snapuuid" above instead of a manual
+         * gf_strdup() + dict_set_dynstr() pair, so this function no longer
+         * has to decide who frees the copy when the set fails.
+         * NOTE(review): presumably the helper stores its own copy of buf
+         * in dst - confirm against the dict API. */
+        ret = dict_set_dynstr_with_alloc(dst, name_buf, buf);
+        if (ret) {
+            gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_DICT_SET_FAILED,
+                   "Unable to set %s", name_buf);
+            goto out;
+        }
+
+        dst_missed_snap_count++;
+    }
+
+    ret = dict_set_int32(dst, "missed_snap_count", dst_missed_snap_count);
+    if (ret) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_DICT_SET_FAILED,
+               "Unable to set dst_missed_snap_count");
+        goto out;
+    }
+
+out:
+    gf_msg_trace(this->name, 0, "Returning %d", ret);
+    return ret;
+}
+
+/* Fold the snapshot response dict @src into @dst according to the
+ * snapshot command stored under "type" in @dst.
+ *
+ * Create, delete and clone go through
+ * glusterd_snap_create_use_rsp_dict(), config goes through
+ * glusterd_snap_config_use_rsp_dict(), and for every other command the
+ * contents of @src are copied into @dst with dict_copy().
+ *
+ * Returns 0 on success and a non-zero value on failure. */
+int
+glusterd_snap_use_rsp_dict(dict_t *dst, dict_t *src)
+{
+    int32_t cmd_type = 0;
+    int ret = -1;
+
+    if (!(dst && src)) {
+        gf_msg("glusterd", GF_LOG_ERROR, 0, GD_MSG_DICT_EMPTY,
+               "Source or Destination dict is empty.");
+        goto out;
+    }
+
+    ret = dict_get_int32(dst, "type", &cmd_type);
+    if (ret != 0) {
+        gf_msg("glusterd", GF_LOG_ERROR, 0, GD_MSG_DICT_GET_FAILED,
+               "unable to get the type of the snapshot command");
+        goto out;
+    }
+
+    switch (cmd_type) {
+        case GF_SNAP_OPTION_TYPE_CREATE:
+        case GF_SNAP_OPTION_TYPE_DELETE:
+        case GF_SNAP_OPTION_TYPE_CLONE:
+            ret = glusterd_snap_create_use_rsp_dict(dst, src);
+            break;
+        case GF_SNAP_OPTION_TYPE_CONFIG:
+            ret = glusterd_snap_config_use_rsp_dict(dst, src);
+            break;
+        default:
+            /* copy the response dictionary's contents to the dict to be
+             * sent back to the cli; the result of the copy is not
+             * inspected */
+            dict_copy(src, dst);
+            ret = 0;
+            break;
+    }
+
+    /* Both handlers report failure the same way. */
+    if (ret != 0)
+        gf_msg("glusterd", GF_LOG_ERROR, 0, GD_MSG_RSP_DICT_USE_FAIL,
+               "Unable to use rsp dict");
+
+out:
+    gf_msg_debug("glusterd", 0, "Returning %d", ret);
+    return ret;
+}
+
+/* Compare two snapshots, given by their snap_list nodes, by time stamp.
+ *
+ * Returns difftime(list1's time_stamp, list2's time_stamp) converted to
+ * int: positive when the first snapshot is newer, negative when it is
+ * older and 0 when both time stamps are equal. */
+int
+glusterd_compare_snap_time(struct cds_list_head *list1,
+                           struct cds_list_head *list2)
+{
+    glusterd_snap_t *lhs = NULL;
+    glusterd_snap_t *rhs = NULL;
+
+    GF_ASSERT(list1);
+    GF_ASSERT(list2);
+
+    /* Recover the owning snapshot objects from their list nodes. */
+    lhs = cds_list_entry(list1, glusterd_snap_t, snap_list);
+    rhs = cds_list_entry(list2, glusterd_snap_t, snap_list);
+
+    return (int)difftime(lhs->time_stamp, rhs->time_stamp);
+}
+
+/* Compare two snapshot volumes, given by their snapvol_list nodes, by
+ * the time stamp of the snapshot each one points to.
+ *
+ * Returns difftime(first time_stamp, second time_stamp) converted to
+ * int: positive when the first is newer, negative when it is older and
+ * 0 when both time stamps are equal. */
+int
+glusterd_compare_snap_vol_time(struct cds_list_head *list1,
+                               struct cds_list_head *list2)
+{
+    glusterd_volinfo_t *lhs = NULL;
+    glusterd_volinfo_t *rhs = NULL;
+
+    GF_ASSERT(list1);
+    GF_ASSERT(list2);
+
+    /* Recover the owning volinfo objects from their list nodes. */
+    lhs = cds_list_entry(list1, glusterd_volinfo_t, snapvol_list);
+    rhs = cds_list_entry(list2, glusterd_volinfo_t, snapvol_list);
+
+    return (int)difftime(lhs->snapshot->time_stamp,
+                         rhs->snapshot->time_stamp);
+}
+
+/* Allocate a zeroed glusterd_missed_snap_info, initialise its
+ * missed_snaps and snap_ops list heads and hand it back through
+ * @missed_snapinfo.
+ *
+ * Returns 0 on success and -1 when the allocation fails, in which case
+ * *@missed_snapinfo is left untouched. */
+int32_t
+glusterd_missed_snapinfo_new(glusterd_missed_snap_info **missed_snapinfo)
+{
+    glusterd_missed_snap_info *info = NULL;
+    xlator_t *this = NULL;
+    int32_t ret = -1;
+
+    this = THIS;
+    GF_ASSERT(this);
+    GF_ASSERT(missed_snapinfo);
+
+    info = GF_CALLOC(1, sizeof(*info), gf_gld_mt_missed_snapinfo_t);
+    if (info) {
+        CDS_INIT_LIST_HEAD(&info->missed_snaps);
+        CDS_INIT_LIST_HEAD(&info->snap_ops);
+
+        *missed_snapinfo = info;
+        ret = 0;
+    } else {
+        gf_smsg(this->name, GF_LOG_ERROR, errno, GD_MSG_NO_MEMORY, NULL);
+    }
+
+    gf_msg_trace(this->name, 0, "Returning %d", ret);
+    return ret;
+}
+
+/* Allocate a zeroed glusterd_snap_op_t, set brick_num, op and status to
+ * -1, initialise its snap_ops_list head and hand it back through
+ * @snap_op.
+ *
+ * Returns 0 on success and -1 when the allocation fails, in which case
+ * *@snap_op is left untouched. */
+int32_t
+glusterd_missed_snap_op_new(glusterd_snap_op_t **snap_op)
+{
+    glusterd_snap_op_t *op = NULL;
+    xlator_t *this = NULL;
+    int32_t ret = -1;
+
+    this = THIS;
+    GF_ASSERT(this);
+    GF_ASSERT(snap_op);
+
+    op = GF_CALLOC(1, sizeof(*op), gf_gld_mt_missed_snapinfo_t);
+    if (op) {
+        /* -1 marks the fields the caller has not filled in yet. */
+        op->brick_num = -1;
+        op->op = -1;
+        op->status = -1;
+        CDS_INIT_LIST_HEAD(&op->snap_ops_list);
+
+        *snap_op = op;
+        ret = 0;
+    } else {
+        gf_smsg(this->name, GF_LOG_ERROR, errno, GD_MSG_NO_MEMORY, NULL);
+    }
+
+    gf_msg_trace(this->name, 0, "Returning %d", ret);
+    return ret;
+}
+
+/* Tell whether the option @opts occurs as a whole entry in the
+ * comma-separated option list @str.
+ *
+ * Returns _gf_true on an exact match of one entry, _gf_false otherwise,
+ * including when @str is NULL or empty or when the working copy of
+ * @str cannot be allocated. */
+gf_boolean_t
+mntopts_exists(const char *str, const char *opts)
+{
+    gf_boolean_t found = _gf_false;
+    char *copy = NULL;
+    char *state = NULL;
+    char *tok = NULL;
+
+    GF_ASSERT(opts);
+
+    /* strtok_r() modifies its input, so tokenise a private copy. */
+    if (str && *str != '\0')
+        copy = gf_strdup(str);
+
+    if (copy) {
+        for (tok = strtok_r(copy, ",", &state); tok;
+             tok = strtok_r(NULL, ",", &state)) {
+            if (strcmp(tok, opts) == 0) {
+                found = _gf_true;
+                break;
+            }
+        }
+    }
+
+    GF_FREE(copy);
+    return found;
+}
+
+/* Mount the snapshot logical device brickinfo->device_path on
+ * @brick_mount_path by running mount(8).
+ *
+ * The mount options are taken from brickinfo->mnt_opts; for an "xfs"
+ * brick "nouuid" is appended unless it is already present. When the
+ * resulting option string is empty, mount is run without "-o".
+ *
+ * Returns 0 on success and a non-zero value on failure, including the
+ * case where "nouuid" does not fit into the local option buffer. */
+int32_t
+glusterd_mount_lvm_snapshot(glusterd_brickinfo_t *brickinfo,
+                            char *brick_mount_path)
+{
+    static const char nouuid_opt[] = "nouuid";
+    char msg[NAME_MAX] = "";
+    char mnt_opts[1024] = "";
+    int32_t ret = -1;
+    runner_t runner = {
+        0,
+    };
+    xlator_t *this = NULL;
+    int32_t len = 0;
+    size_t opts_len = 0;
+    size_t needed = 0;
+
+    this = THIS;
+    GF_ASSERT(this);
+    GF_ASSERT(brick_mount_path);
+    GF_ASSERT(brickinfo);
+
+    /* msg is only used as the text logged together with the command. */
+    len = snprintf(msg, sizeof(msg), "mount %s %s", brickinfo->device_path,
+                   brick_mount_path);
+    if (len < 0) {
+        strcpy(msg, "<error>");
+    }
+
+    gf_strncpy(mnt_opts, brickinfo->mnt_opts, sizeof(mnt_opts));
+
+    /* XFS file-system does not allow to mount file-system with duplicate
+     * UUID. File-system UUID of snapshot and its origin volume is same.
+     * Therefore to mount such a snapshot in XFS we need to pass nouuid
+     * option
+     */
+    if (!strcmp(brickinfo->fstype, "xfs") &&
+        !mntopts_exists(mnt_opts, nouuid_opt)) {
+        opts_len = strlen(mnt_opts);
+
+        /* Room required: existing options, an optional ',' separator and
+         * "nouuid" including its terminating NUL. Appending without this
+         * check could write past the end of mnt_opts; a truncated option
+         * would be wrong as well, so fail instead. */
+        needed = opts_len + (opts_len > 0 ? 1 : 0) + sizeof(nouuid_opt);
+        if (needed > sizeof(mnt_opts)) {
+            gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_SNAP_MOUNT_FAIL,
+                   "mount options of the snapshot "
+                   "logical device %s are too long to append %s",
+                   brickinfo->device_path, nouuid_opt);
+            ret = -1;
+            goto out;
+        }
+
+        snprintf(mnt_opts + opts_len, sizeof(mnt_opts) - opts_len, "%s%s",
+                 opts_len > 0 ? "," : "", nouuid_opt);
+    }
+
+    /* The runner is initialised only once the arguments are known to be
+     * valid, so the early exit above leaves no initialised runner behind. */
+    runinit(&runner);
+
+    if (strlen(mnt_opts) > 0) {
+        runner_add_args(&runner, "mount", "-o", mnt_opts,
+                        brickinfo->device_path, brick_mount_path, NULL);
+    } else {
+        runner_add_args(&runner, "mount", brickinfo->device_path,
+                        brick_mount_path, NULL);
+    }
+
+    runner_log(&runner, this->name, GF_LOG_DEBUG, msg);
+    ret = runner_run(&runner);
+    if (ret) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_SNAP_MOUNT_FAIL,
+               "mounting the snapshot "
+               "logical device %s failed (error: %s)",
+               brickinfo->device_path, strerror(errno));
+        goto out;
+    } else
+        gf_msg_debug(this->name, 0,
+                     "mounting the snapshot "
+                     "logical device %s successful",
+                     brickinfo->device_path);
+
+out:
+    gf_msg_trace(this->name, 0, "Returning with %d", ret);
+    return ret;
+}
+
+/* Decide whether a group of bricks is good enough for a snapshot.
+ *
+ * Quorum is reported as met only when @down_count is zero. Otherwise
+ * *@op_errstr receives a copy of the error text and *@op_errno is set
+ * to EG_BRCKDWN. @volinfo and @dict are only checked for NULL;
+ * @first_brick_on, @snap_force, @quorum_count and @quorum_type are not
+ * consulted by this implementation.
+ *
+ * Returns _gf_true when quorum is met, _gf_false otherwise (also when
+ * @op_errno, @volinfo or @dict is NULL). */
+gf_boolean_t
+glusterd_volume_quorum_calculate(glusterd_volinfo_t *volinfo, dict_t *dict,
+                                 int down_count, gf_boolean_t first_brick_on,
+                                 int8_t snap_force, int quorum_count,
+                                 char *quorum_type, char **op_errstr,
+                                 uint32_t *op_errno)
+{
+    const char err_str[] = "One or more bricks may be down.";
+    gf_boolean_t quorum_met = _gf_false;
+    xlator_t *this = NULL;
+
+    this = THIS;
+    GF_ASSERT(this);
+    GF_VALIDATE_OR_GOTO(this->name, op_errno, out);
+
+    if (!(volinfo && dict)) {
+        gf_msg(this->name, GF_LOG_WARNING, 0, GD_MSG_INVALID_ENTRY,
+               "input parameters NULL");
+        goto out;
+    }
+
+    /* In a n-way replication where n >= 3 we should not take a snapshot
+     * if even one brick is down, irrespective of the quorum being met.
+     * TODO: Remove this restriction once n-way replication is
+     * supported with snapshot.
+     */
+    if (down_count == 0) {
+        quorum_met = _gf_true;
+        goto out;
+    }
+
+    gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_BRICK_DISCONNECTED, "%s",
+           err_str);
+    *op_errstr = gf_strdup(err_str);
+    *op_errno = EG_BRCKDWN;
+
+    /* TODO : Support for n-way replication in snapshot */
+out:
+    return quorum_met;
+}
+
+/* Check the brick status entries of one volume in @dict against the
+ * snapshot quorum rules.
+ *
+ * Status keys have the form "<key_prefix><index>.brick<N>.status"; a
+ * missing key is treated like a brick that is down.
+ *  - Volumes that are neither replica >= 3 nor disperse: every brick
+ *    must be online.
+ *  - Otherwise the bricks are examined per distribute subvolume
+ *    (dist_leaf_count bricks each) and the decision is delegated to
+ *    glusterd_volume_quorum_calculate().
+ * On a quorum failure *@op_errstr / *@op_errno are filled in.
+ *
+ * Returns 0 when the volume is in quorum and a non-zero value
+ * otherwise, including when @op_errno, @volinfo or @dict is NULL. */
+static int32_t
+glusterd_volume_quorum_check(glusterd_volinfo_t *volinfo, int64_t index,
+                             dict_t *dict, const char *key_prefix,
+                             int8_t snap_force, int quorum_count,
+                             char *quorum_type, char **op_errstr,
+                             uint32_t *op_errno)
+{
+    /* Start pessimistic: bailing out on invalid arguments must not be
+     * reported to the caller as "quorum met". */
+    int ret = -1;
+    xlator_t *this = NULL;
+    int64_t i = 0;
+    int64_t j = 0;
+    char key[128] = {
+        0,
+    }; /* key_prefix is passed from above, but is really quite small */
+    int keylen;
+    int down_count = 0;
+    gf_boolean_t first_brick_on = _gf_true;
+    glusterd_conf_t *priv = NULL;
+    gf_boolean_t quorum_met = _gf_false;
+    int distribute_subvols = 0;
+    int32_t brick_online = 0;
+    const char err_str[] = "quorum is not met";
+
+    this = THIS;
+    GF_ASSERT(this);
+    priv = this->private;
+    GF_ASSERT(priv);
+    GF_VALIDATE_OR_GOTO(this->name, op_errno, out);
+
+    if (!volinfo || !dict) {
+        gf_msg(this->name, GF_LOG_WARNING, 0, GD_MSG_INVALID_ENTRY,
+               "input parameters NULL");
+        goto out;
+    }
+
+    /* Arguments are usable; from here on the result is 0 unless one of
+     * the checks below fails, exactly as before. */
+    ret = 0;
+
+    if ((!glusterd_is_volume_replicate(volinfo) ||
+         volinfo->replica_count < 3) &&
+        (GF_CLUSTER_TYPE_DISPERSE != volinfo->type)) {
+        for (i = 0; i < volinfo->brick_count; i++) {
+            /* for a pure distribute volume, and replica volume
+               with replica count 2, quorum is not met if even
+               one of its subvolumes is down
+            */
+            keylen = snprintf(key, sizeof(key),
+                              "%s%" PRId64 ".brick%" PRId64 ".status",
+                              key_prefix, index, i);
+            ret = dict_get_int32n(dict, key, keylen, &brick_online);
+            if (ret || !brick_online) {
+                ret = 1;
+                gf_msg(this->name, GF_LOG_ERROR, 0,
+                       GD_MSG_SERVER_QUORUM_NOT_MET, "%s", err_str);
+                *op_errstr = gf_strdup(err_str);
+                *op_errno = EG_BRCKDWN;
+                goto out;
+            }
+        }
+        ret = 0;
+        quorum_met = _gf_true;
+    } else {
+        distribute_subvols = volinfo->brick_count / volinfo->dist_leaf_count;
+        for (j = 0; j < distribute_subvols; j++) {
+            /* by default assume quorum is not met
+               TODO: Handle distributed striped replicate volumes
+               Currently only distributed replicate volumes are
+               handled.
+            */
+            ret = 1;
+            quorum_met = _gf_false;
+            for (i = 0; i < volinfo->dist_leaf_count; i++) {
+                keylen = snprintf(
+                    key, sizeof(key), "%s%" PRId64 ".brick%" PRId64 ".status",
+                    key_prefix, index, (j * volinfo->dist_leaf_count) + i);
+                ret = dict_get_int32n(dict, key, keylen, &brick_online);
+                if (ret || !brick_online) {
+                    /* i == 0 is the first brick of this subvolume */
+                    if (i == 0)
+                        first_brick_on = _gf_false;
+                    down_count++;
+                }
+            }
+
+            quorum_met = glusterd_volume_quorum_calculate(
+                volinfo, dict, down_count, first_brick_on, snap_force,
+                quorum_count, quorum_type, op_errstr, op_errno);
+            /* goto out if quorum is not met */
+            if (!quorum_met) {
+                ret = -1;
+                goto out;
+            }
+
+            /* reset the per-subvolume counters for the next group */
+            down_count = 0;
+            first_brick_on = _gf_true;
+        }
+    }
+
+    if (quorum_met) {
+        gf_msg_debug(this->name, 0, "volume %s is in quorum", volinfo->volname);
+        ret = 0;
+    }
+
+out:
+    return ret;
+}
+
+/* Work out the quorum count and type for @volinfo and run the brick
+ * quorum check for volume number @index of @dict.
+ *
+ * Default quorum count:
+ *  - replicate: half of replica_count, rounded up for odd counts;
+ *  - disperse:  disperse_count - redundancy_count;
+ *  - otherwise: all bricks of the volume.
+ * When the volume option "cluster.quorum-type" is "fixed" and
+ * "cluster.quorum-count" can be read, that count replaces the default,
+ * except on disperse volumes where a count below the default is
+ * ignored. If the count cannot be used, the quorum type is dropped
+ * (NULL) and the default count is kept.
+ *
+ * Returns 0 when the volume is in quorum and a non-zero value
+ * otherwise, including when @op_errno or @volinfo is NULL. */
+static int32_t
+glusterd_snap_common_quorum_calculate(glusterd_volinfo_t *volinfo, dict_t *dict,
+                                      int64_t index, const char *key_prefix,
+                                      int8_t snap_force,
+                                      gf_boolean_t snap_volume,
+                                      char **op_errstr, uint32_t *op_errno)
+{
+    xlator_t *this = NULL;
+    char *quorum_type = NULL;
+    int quorum_count = 0;
+    int32_t fixed_count = 0;
+    int32_t ret = -1;
+
+    this = THIS;
+    GF_ASSERT(this);
+    GF_VALIDATE_OR_GOTO(this->name, op_errno, out);
+    GF_VALIDATE_OR_GOTO(this->name, volinfo, out);
+
+    switch (volinfo->type) {
+        case GF_CLUSTER_TYPE_REPLICATE:
+            /* n/2 for an even replica count, n/2 + 1 for an odd one */
+            quorum_count = volinfo->replica_count / 2;
+            if (volinfo->replica_count % 2 != 0)
+                quorum_count += 1;
+            break;
+        case GF_CLUSTER_TYPE_DISPERSE:
+            quorum_count = volinfo->disperse_count - volinfo->redundancy_count;
+            break;
+        default:
+            /* non replicate volumes need every brick online */
+            quorum_count = volinfo->brick_count;
+            break;
+    }
+
+    if (dict_get_str_sizen(volinfo->dict, "cluster.quorum-type",
+                           &quorum_type) == 0 &&
+        strcmp(quorum_type, "fixed") == 0) {
+        if (dict_get_int32_sizen(volinfo->dict, "cluster.quorum-count",
+                                 &fixed_count) != 0) {
+            /* no usable count: fall back to the default behaviour */
+            quorum_type = NULL;
+        } else if (GF_CLUSTER_TYPE_DISPERSE == volinfo->type &&
+                   fixed_count < quorum_count) {
+            /* for dispersed volumes, only allow quorums equal or
+               larger than minimum functional value. */
+            gf_msg(this->name, GF_LOG_INFO, 0, GD_MSG_QUORUM_COUNT_IGNORED,
+                   "Ignoring small quorum-count (%d) on dispersed volume",
+                   fixed_count);
+            quorum_type = NULL;
+        } else {
+            quorum_count = fixed_count;
+        }
+    }
+
+    ret = glusterd_volume_quorum_check(volinfo, index, dict, key_prefix,
+                                       snap_force, quorum_count, quorum_type,
+                                       op_errstr, op_errno);
+    if (ret != 0)
+        gf_msg(this->name, GF_LOG_WARNING, 0, GD_MSG_VOL_NOT_FOUND,
+               "volume %s is not in quorum", volinfo->volname);
+
+out:
+    return ret;
+}
+
+/* Quorum check for a snapshot clone command.
+ *
+ * First the glusterd server quorum is verified; on failure *@op_errstr
+ * and *@op_errno (EG_NODEDWN) are filled in. Then, for each volume
+ * number 1.."volcount" of @dict, the brick quorum is checked with key
+ * prefix "vol" (@snap_volume set) or "clone". With @snap_volume set the
+ * volume examined is the last entry of the volume list of the snapshot
+ * named by "snapname"; otherwise it is the volume named by "clonename".
+ *
+ * Returns 0 when every check passes and a non-zero value otherwise. */
+static int32_t
+glusterd_snap_quorum_check_for_clone(dict_t *dict, gf_boolean_t snap_volume,
+                                     char **op_errstr, uint32_t *op_errno)
+{
+    const char err_str[] = "glusterds are not in quorum";
+    char key_prefix[16] = {
+        0,
+    };
+    char *snapname = NULL;
+    glusterd_snap_t *snap = NULL;
+    glusterd_volinfo_t *volinfo = NULL;
+    glusterd_volinfo_t *tmp_volinfo = NULL;
+    char *volname = NULL;
+    int64_t volcount = 0;
+    int64_t i = 0;
+    int32_t ret = -1;
+    xlator_t *this = NULL;
+
+    this = THIS;
+    GF_ASSERT(this);
+    GF_VALIDATE_OR_GOTO(this->name, op_errno, out);
+
+    if (!dict) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_DICT_EMPTY, "dict is NULL");
+        goto out;
+    }
+
+    if (snap_volume) {
+        ret = dict_get_str_sizen(dict, "snapname", &snapname);
+        if (ret) {
+            gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_DICT_GET_FAILED,
+                   "failed to "
+                   "get snapname");
+            goto out;
+        }
+
+        snap = glusterd_find_snap_by_name(snapname);
+        if (!snap) {
+            gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_SNAP_NOT_FOUND,
+                   "failed to "
+                   "get the snapshot %s",
+                   snapname);
+            ret = -1;
+            goto out;
+        }
+    }
+
+    /* Do a quorum check of glusterds also. Because, the missed snapshot
+     * information will be saved by glusterd and if glusterds are not in
+     * quorum, then better fail the snapshot
+     */
+    if (!does_gd_meet_server_quorum(this)) {
+        gf_msg(this->name, GF_LOG_WARNING, 0, GD_MSG_SERVER_QUORUM_NOT_MET,
+               "%s", err_str);
+        *op_errstr = gf_strdup(err_str);
+        *op_errno = EG_NODEDWN;
+        ret = -1;
+        goto out;
+    } else
+        gf_msg_debug(this->name, 0, "glusterds are in quorum");
+
+    ret = dict_get_int64(dict, "volcount", &volcount);
+    if (ret) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_DICT_GET_FAILED,
+               "failed to get "
+               "volcount");
+        goto out;
+    }
+
+    for (i = 1; i <= volcount; i++) {
+        ret = dict_get_str_sizen(dict, "clonename", &volname);
+        if (ret) {
+            gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_DICT_GET_FAILED,
+                   "failed to "
+                   "get clonename");
+            goto out;
+        }
+
+        if (snap_volume && snap) {
+            /* Pick the last volume of the snapshot. The list iterator
+             * never hands out a NULL entry, so an empty list can only be
+             * detected after the walk. */
+            volinfo = NULL;
+            cds_list_for_each_entry(tmp_volinfo, &snap->volumes, vol_list)
+            {
+                volinfo = tmp_volinfo;
+            }
+            if (!volinfo) {
+                gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_SNAP_NOT_FOUND,
+                       "failed to get snap volume "
+                       "for snap %s",
+                       snapname);
+                ret = -1;
+                goto out;
+            }
+        } else {
+            ret = glusterd_volinfo_find(volname, &volinfo);
+            if (ret) {
+                gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_VOL_NOT_FOUND,
+                       "failed to find the volume %s", volname);
+                goto out;
+            }
+        }
+
+        snprintf(key_prefix, sizeof(key_prefix), "%s",
+                 snap_volume ? "vol" : "clone");
+
+        /* snap_force is always 0 for clone */
+        ret = glusterd_snap_common_quorum_calculate(
+            volinfo, dict, i, key_prefix, 0, snap_volume, op_errstr, op_errno);
+        if (ret) {
+            gf_msg(this->name, GF_LOG_WARNING, 0, GD_MSG_VOL_NOT_FOUND,
+                   "volume %s "
+                   "is not in quorum",
+                   volname);
+            goto out;
+        }
+    }
+out:
+    return ret;
+}
+
+/* Quorum check for a snapshot create command.
+ *
+ * First the glusterd server quorum is verified; on failure *@op_errstr
+ * and *@op_errno (EG_NODEDWN) are filled in. Then, for each volume
+ * number 1.."volcount" of @dict, the volume named by "snap-volname<N>"
+ * (@snap_volume set, looked up inside the snapshot named by "snapname")
+ * or "volname<N>" is checked for brick quorum with key prefix
+ * "snap-vol" or "vol" respectively. The force bit of "flags" in @dict
+ * is passed down as snap_force.
+ *
+ * Returns 0 when every check passes and a non-zero value otherwise. */
+static int32_t
+glusterd_snap_quorum_check_for_create(dict_t *dict, gf_boolean_t snap_volume,
+                                      char **op_errstr, uint32_t *op_errno)
+{
+    const char err_str[] = "glusterds are not in quorum";
+    const char *name_key_base = NULL;
+    const char *status_prefix = NULL;
+    xlator_t *this = NULL;
+    glusterd_snap_t *snap = NULL;
+    glusterd_volinfo_t *volinfo = NULL;
+    char *snapname = NULL;
+    char *volname = NULL;
+    char name_key[32] = "";
+    int64_t nr_volumes = 0;
+    int64_t vol = 0;
+    int32_t flags = 0;
+    int32_t ret = -1;
+    int8_t snap_force = 0;
+
+    this = THIS;
+    GF_ASSERT(this);
+    GF_VALIDATE_OR_GOTO(this->name, op_errno, out);
+
+    if (dict == NULL) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_DICT_EMPTY, "dict is NULL");
+        goto out;
+    }
+
+    if (snap_volume) {
+        ret = dict_get_str(dict, "snapname", &snapname);
+        if (ret != 0) {
+            gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_DICT_GET_FAILED,
+                   "failed to get snapname");
+            goto out;
+        }
+
+        snap = glusterd_find_snap_by_name(snapname);
+        if (snap == NULL) {
+            gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_SNAP_NOT_FOUND,
+                   "failed to get the snapshot %s", snapname);
+            ret = -1;
+            goto out;
+        }
+    }
+
+    /* A missing "flags" entry simply means "not forced". */
+    if (dict_get_int32(dict, "flags", &flags) == 0 &&
+        (flags & GF_CLI_FLAG_OP_FORCE))
+        snap_force = 1;
+
+    /* Do a quorum check of glusterds also. Because, the missed snapshot
+     * information will be saved by glusterd and if glusterds are not in
+     * quorum, then better fail the snapshot
+     */
+    if (!does_gd_meet_server_quorum(this)) {
+        gf_msg(this->name, GF_LOG_WARNING, 0, GD_MSG_SERVER_QUORUM_NOT_MET,
+               "%s", err_str);
+        *op_errstr = gf_strdup(err_str);
+        *op_errno = EG_NODEDWN;
+        ret = -1;
+        goto out;
+    }
+    gf_msg_debug(this->name, 0, "glusterds are in quorum");
+
+    ret = dict_get_int64(dict, "volcount", &nr_volumes);
+    if (ret != 0) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_DICT_GET_FAILED,
+               "failed to get volcount");
+        goto out;
+    }
+
+    /* Both key families depend only on snap_volume, so pick them once. */
+    if (snap_volume) {
+        name_key_base = "snap-volname";
+        status_prefix = "snap-vol";
+    } else {
+        name_key_base = "volname";
+        status_prefix = "vol";
+    }
+
+    for (vol = 1; vol <= nr_volumes; vol++) {
+        snprintf(name_key, sizeof(name_key), "%s%" PRId64, name_key_base,
+                 vol);
+        ret = dict_get_str(dict, name_key, &volname);
+        if (ret != 0) {
+            gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_DICT_GET_FAILED,
+                   "failed to get volname");
+            goto out;
+        }
+
+        if (snap_volume) {
+            ret = glusterd_snap_volinfo_find(volname, snap, &volinfo);
+            if (ret != 0) {
+                gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_SNAP_NOT_FOUND,
+                       "failed to get snap volume %s for snap %s", volname,
+                       snapname);
+                goto out;
+            }
+        } else {
+            ret = glusterd_volinfo_find(volname, &volinfo);
+            if (ret != 0) {
+                gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_VOL_NOT_FOUND,
+                       "failed to find the volume %s", volname);
+                goto out;
+            }
+        }
+
+        ret = glusterd_snap_common_quorum_calculate(
+            volinfo, dict, vol, status_prefix, snap_force, snap_volume,
+            op_errstr, op_errno);
+        if (ret != 0) {
+            gf_msg(this->name, GF_LOG_WARNING, 0, GD_MSG_VOL_NOT_FOUND,
+                   "volume %s is not in quorum", volinfo->volname);
+            goto out;
+        }
+    }
+out:
+    return ret;
+}
+
+/* Run the quorum check that applies to the snapshot command found in dict.
+ *
+ * The command is read from the "type" key of dict and dispatched:
+ *   - create:         glusterd_snap_quorum_check_for_create()
+ *   - clone:          glusterd_snap_quorum_check_for_clone(), called with
+ *                     the snap_volume flag inverted
+ *   - delete/restore: only the glusterd server quorum is checked
+ *   - anything else:  no check is made and success is returned
+ *
+ * When the server quorum is not met for delete/restore, *op_errno is set to
+ * EG_NODEDWN and, if op_errstr is non-NULL, *op_errstr receives a
+ * gf_strdup()'d copy of the error text.
+ *
+ * Returns 0 when the check passes (or none applies), non-zero otherwise.
+ */
+int32_t
+glusterd_snap_quorum_check(dict_t *dict, gf_boolean_t snap_volume,
+                           char **op_errstr, uint32_t *op_errno)
+{
+    int32_t ret = -1;
+    xlator_t *this = NULL;
+    int32_t snap_command = 0;
+    const char err_str[] = "glusterds are not in quorum";
+
+    this = THIS;
+    GF_ASSERT(this);
+    GF_VALIDATE_OR_GOTO(this->name, op_errno, out);
+
+    if (!dict) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_DICT_EMPTY, "dict is NULL");
+        goto out;
+    }
+
+    ret = dict_get_int32_sizen(dict, "type", &snap_command);
+    if (ret) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_DICT_GET_FAILED,
+               "unable to get the type of "
+               "the snapshot command");
+        goto out;
+    }
+
+    switch (snap_command) {
+        case GF_SNAP_OPTION_TYPE_CREATE:
+            ret = glusterd_snap_quorum_check_for_create(dict, snap_volume,
+                                                        op_errstr, op_errno);
+            if (ret) {
+                /* the two literals are concatenated: keep the separating
+                 * space at the end of the first one */
+                gf_msg(this->name, GF_LOG_WARNING, 0, GD_MSG_QUORUM_CHECK_FAIL,
+                       "Quorum check "
+                       "failed during snapshot create command");
+                goto out;
+            }
+            break;
+        case GF_SNAP_OPTION_TYPE_CLONE:
+            ret = glusterd_snap_quorum_check_for_clone(dict, !snap_volume,
+                                                       op_errstr, op_errno);
+            if (ret) {
+                gf_msg(this->name, GF_LOG_WARNING, 0, GD_MSG_QUORUM_CHECK_FAIL,
+                       "Quorum check "
+                       "failed during snapshot clone command");
+                goto out;
+            }
+            break;
+        case GF_SNAP_OPTION_TYPE_DELETE:
+        case GF_SNAP_OPTION_TYPE_RESTORE:
+            if (!does_gd_meet_server_quorum(this)) {
+                ret = -1;
+                gf_msg(this->name, GF_LOG_WARNING, 0,
+                       GD_MSG_SERVER_QUORUM_NOT_MET, "%s", err_str);
+                /* op_errstr is optional here; never dereference NULL */
+                if (op_errstr)
+                    *op_errstr = gf_strdup(err_str);
+                *op_errno = EG_NODEDWN;
+                goto out;
+            }
+
+            gf_msg_debug(this->name, 0,
+                         "glusterds are in "
+                         "quorum");
+            break;
+        default:
+            break;
+    }
+
+    ret = 0;
+
+out:
+    return ret;
+}
+
+/* Report whether path appears as a mount directory in /etc/mtab.
+ *
+ * Entries without a filesystem name are ignored. Returns 1 when a matching
+ * entry is found, 0 otherwise (including when /etc/mtab cannot be opened).
+ */
+int
+glusterd_is_path_mounted(const char *path)
+{
+    FILE *mount_table = setmntent("/etc/mtab", "r");
+    struct mntent *entry = NULL;
+    int found = 0;
+
+    if (mount_table == NULL)
+        return 0;
+
+    for (entry = getmntent(mount_table); entry != NULL;
+         entry = getmntent(mount_table)) {
+        if (entry->mnt_fsname == NULL)
+            continue;
+        if (strcmp(entry->mnt_dir, path) != 0)
+            continue;
+
+        found = 1;
+        break;
+    }
+
+    endmntent(mount_table);
+    return found;
+}
+/* Unmount the brick mount points of a snapshot volume on this node.
+ *
+ * Bricks that belong to another node (uuid differs from MY_UUID) and bricks
+ * whose snapshot is still pending (snap_status == -1) are skipped. For every
+ * remaining brick the mount path is looked up and glusterd_umount() is
+ * attempted up to three times, sleeping 3 seconds after each failure.
+ *
+ * Returns the result of the last operation performed: 0 when the last
+ * processed brick was unmounted, non-zero when the mount path lookup failed
+ * or the last brick could not be unmounted. The initial value -1 is returned
+ * when no brick was processed at all.
+ */
+int32_t
+glusterd_snap_unmount(xlator_t *this, glusterd_volinfo_t *volinfo)
+{
+    char *brick_mount_path = NULL;
+    glusterd_brickinfo_t *brickinfo = NULL;
+    int32_t ret = -1;
+    int retry_count = 0;
+
+    GF_ASSERT(this);
+    GF_ASSERT(volinfo);
+
+    cds_list_for_each_entry(brickinfo, &volinfo->bricks, brick_list)
+    {
+        /* If the brick is not of this node, we continue */
+        if (gf_uuid_compare(brickinfo->uuid, MY_UUID)) {
+            continue;
+        }
+        /* If snapshot is pending, we continue */
+        if (brickinfo->snap_status == -1) {
+            continue;
+        }
+
+        /* Release the path obtained for the previous brick before asking
+         * for a new one; otherwise only the last allocation would be freed
+         * at 'out' and every earlier one would leak.
+         */
+        if (brick_mount_path) {
+            GF_FREE(brick_mount_path);
+            brick_mount_path = NULL;
+        }
+
+        ret = glusterd_find_brick_mount_path(brickinfo->path,
+                                             &brick_mount_path);
+        if (ret) {
+            gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_BRK_MNTPATH_GET_FAIL,
+                   "Failed to find brick_mount_path for %s", brickinfo->path);
+            goto out;
+        }
+        /* unmount cannot be done when the brick process is still in
+         * the process of shutdown, so give three re-tries
+         */
+        retry_count = 0;
+        while (retry_count <= 2) {
+            retry_count++;
+            /* umount2 system call doesn't cleanup mtab entry
+             * after un-mount, using external umount command.
+             */
+            ret = glusterd_umount(brick_mount_path);
+            if (!ret)
+                break;
+            gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_GLUSTERD_UMOUNT_FAIL,
+                   "umount failed "
+                   "for path %s (brick: %s): %s. Retry(%d)",
+                   brick_mount_path, brickinfo->path, strerror(errno),
+                   retry_count);
+            sleep(3);
+        }
+    }
+
+out:
+    if (brick_mount_path)
+        GF_FREE(brick_mount_path);
+
+    return ret;
+}
+
+/* Unmount path by running the external umount command with "-f".
+ *
+ * Nothing is done, and 0 is returned, when glusterd_is_path_mounted()
+ * reports that path is not mounted. Otherwise the value returned by
+ * runner_run() is passed back to the caller.
+ */
+int32_t
+glusterd_umount(const char *path)
+{
+    xlator_t *this = THIS;
+    runner_t umount_cmd = {
+        0,
+    };
+    char log_line[NAME_MAX] = "";
+    int32_t status = -1;
+
+    GF_ASSERT(this);
+    GF_ASSERT(path);
+
+    /* not mounted: there is nothing to undo */
+    if (!glusterd_is_path_mounted(path))
+        return 0;
+
+    runinit(&umount_cmd);
+    snprintf(log_line, sizeof(log_line), "umount path %s", path);
+    runner_add_args(&umount_cmd, _PATH_UMOUNT, "-f", path, NULL);
+    runner_log(&umount_cmd, this->name, GF_LOG_DEBUG, log_line);
+
+    status = runner_run(&umount_cmd);
+    if (status != 0) {
+        gf_msg(this->name, GF_LOG_ERROR, errno, GD_MSG_GLUSTERD_UMOUNT_FAIL,
+               "umounting %s failed (%s)", path, strerror(errno));
+    }
+
+    gf_msg_trace(this->name, 0, "Returning with %d", status);
+    return status;
+}
+
+/* Copy the contents of the file source into the file destination.
+ *
+ * destination is created with the permission bits (st_mode & 0777) that
+ * lstat() reports for source. Data is copied in chunks of sizeof(buffer)
+ * bytes; a write that stores fewer bytes than were read is treated as a
+ * failure.
+ *
+ * Returns 0 on success and a non-zero value on failure.
+ */
+int32_t
+glusterd_copy_file(const char *source, const char *destination)
+{
+    int32_t ret = -1;
+    xlator_t *this = NULL;
+    char buffer[1024] = "";
+    int src_fd = -1;
+    int dest_fd = -1;
+    int read_len = -1;
+    struct stat stbuf = {
+        0,
+    };
+    mode_t dest_mode = 0;
+
+    this = THIS;
+    GF_ASSERT(this);
+
+    GF_ASSERT(source);
+    GF_ASSERT(destination);
+
+    /* The stat is made only to get the file permission of the source */
+    ret = sys_lstat(source, &stbuf);
+    if (ret) {
+        gf_msg(this->name, GF_LOG_ERROR, errno, GD_MSG_FILE_OP_FAILED,
+               "%s not found", source);
+        goto out;
+    }
+
+    dest_mode = stbuf.st_mode & 0777;
+
+    src_fd = open(source, O_RDONLY);
+    if (src_fd == -1) {
+        ret = -1;
+        gf_msg(this->name, GF_LOG_ERROR, errno, GD_MSG_FILE_OP_FAILED,
+               "Unable to open file %s", source);
+        goto out;
+    }
+
+    dest_fd = sys_creat(destination, dest_mode);
+    if (dest_fd < 0) {
+        ret = -1;
+        gf_msg(this->name, GF_LOG_ERROR, errno, GD_MSG_FILE_OP_FAILED,
+               "Unable to open a file %s", destination);
+        goto out;
+    }
+
+    for (;;) {
+        ret = sys_read(src_fd, buffer, sizeof(buffer));
+        if (ret == -1) {
+            gf_msg(this->name, GF_LOG_ERROR, errno, GD_MSG_FILE_OP_FAILED,
+                   "Error reading file "
+                   "%s",
+                   source);
+            goto out;
+        }
+        read_len = ret;
+        if (read_len == 0)
+            break; /* end of file: ret is 0, which is the success value */
+
+        ret = sys_write(dest_fd, buffer, read_len);
+        if (ret != read_len) {
+            gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_FILE_OP_FAILED,
+                   "Writing in "
+                   "file %s failed with error %s",
+                   destination, strerror(errno));
+            /* a short write leaves a positive byte count in ret; report
+             * the failure with the same -1 used for every other error */
+            ret = -1;
+            goto out;
+        }
+    }
+out:
+    if (src_fd != -1)
+        sys_close(src_fd);
+
+    /* 0 is a valid descriptor, only negative values mean "not opened" */
+    if (dest_fd >= 0)
+        sys_close(dest_fd);
+    return ret;
+}
+
+/* Copy every entry of the directory source into the directory destination.
+ *
+ * "." and ".." are skipped; every other entry is handed to
+ * glusterd_copy_file(), so the copy is not recursive. destination is not
+ * created here.
+ *
+ * Returns 0 when the whole directory was walked and every entry was copied
+ * (an empty directory counts as success), non-zero when the directory cannot
+ * be opened or read, a path does not fit into PATH_MAX, or a copy fails.
+ */
+int32_t
+glusterd_copy_folder(const char *source, const char *destination)
+{
+    int32_t ret = -1;
+    int readdir_errno = 0;
+    xlator_t *this = NULL;
+    DIR *dir_ptr = NULL;
+    struct dirent *entry = NULL;
+    struct dirent scratch[2] = {
+        {
+            0,
+        },
+    };
+    char src_path[PATH_MAX] = {
+        0,
+    };
+    char dest_path[PATH_MAX] = {
+        0,
+    };
+
+    this = THIS;
+    GF_ASSERT(this);
+
+    GF_ASSERT(source);
+    GF_ASSERT(destination);
+
+    dir_ptr = sys_opendir(source);
+    if (!dir_ptr) {
+        gf_msg(this->name, GF_LOG_ERROR, errno, GD_MSG_DIR_OP_FAILED,
+               "Unable to open %s", source);
+        goto out;
+    }
+
+    for (;;) {
+        /* errno is the only way to tell end-of-directory from an error */
+        errno = 0;
+        entry = sys_readdir(dir_ptr, scratch);
+        readdir_errno = errno;
+        if (!entry || readdir_errno != 0) {
+            if (readdir_errno != 0) {
+                gf_msg(this->name, GF_LOG_ERROR, readdir_errno,
+                       GD_MSG_DIR_OP_FAILED, "Unable to read %s", source);
+                ret = -1;
+            } else {
+                /* clean end of directory, possibly without any entry */
+                ret = 0;
+            }
+            break;
+        }
+
+        if (strcmp(entry->d_name, ".") == 0 || strcmp(entry->d_name, "..") == 0)
+            continue;
+        ret = snprintf(src_path, sizeof(src_path), "%s/%s", source,
+                       entry->d_name);
+        if (ret < 0 || (size_t)ret >= sizeof(src_path)) {
+            gf_smsg(this->name, GF_LOG_ERROR, errno, GD_MSG_COPY_FAIL, NULL);
+            ret = -1;
+            goto out;
+        }
+
+        ret = snprintf(dest_path, sizeof(dest_path), "%s/%s", destination,
+                       entry->d_name);
+        if (ret < 0 || (size_t)ret >= sizeof(dest_path)) {
+            gf_smsg(this->name, GF_LOG_ERROR, errno, GD_MSG_COPY_FAIL, NULL);
+            ret = -1;
+            goto out;
+        }
+
+        ret = glusterd_copy_file(src_path, dest_path);
+        if (ret) {
+            gf_msg(this->name, GF_LOG_ERROR, ENOMEM, GD_MSG_NO_MEMORY,
+                   "Could not copy "
+                   "%s to %s",
+                   src_path, dest_path);
+            goto out;
+        }
+    }
+out:
+    if (dir_ptr)
+        (void)sys_closedir(dir_ptr);
+
+    return ret;
+}
+
+/* Build the geo-replication session name and slave string for slave_key.
+ *
+ * The value stored under slave_key in gsync_slaves_dict is expected to look
+ * like "master_node_uuid:ssh://slave_host::slave_vol:slave_voluuid".
+ * From it the function writes
+ *   session: "<origin_volname>_<slave_host>_<slave_vol>", where a leading
+ *            "user@" part of slave_host is dropped
+ *   slave:   "<slave_host>::<slave_vol>", with slave_host kept as stored
+ * Both output buffers are written with a limit of PATH_MAX bytes.
+ *
+ * Returns 0 on success, non-zero when the key is missing, the value cannot
+ * be parsed, memory cannot be allocated, or a result does not fit.
+ */
+int32_t
+glusterd_get_geo_rep_session(char *slave_key, char *origin_volname,
+                             dict_t *gsync_slaves_dict, char *session,
+                             char *slave)
+{
+    int32_t ret = -1;
+    int32_t len = 0;
+    char *token = NULL;
+    char *tok = NULL;
+    char *temp = NULL;
+    char *ip = NULL;
+    char *ip_i = NULL;
+    char *ip_temp = NULL;
+    char *buffer = NULL;
+    xlator_t *this = NULL;
+    char *slave_temp = NULL;
+    char *save_ptr = NULL;
+    char *ip_save_ptr = NULL;
+
+    this = THIS;
+    GF_ASSERT(this);
+
+    GF_ASSERT(slave_key);
+    GF_ASSERT(origin_volname);
+    GF_ASSERT(gsync_slaves_dict);
+
+    /* both output buffers are written below; ret is still -1 here */
+    if (!session || !slave)
+        goto out;
+
+    ret = dict_get_str(gsync_slaves_dict, slave_key, &buffer);
+    if (ret) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_DICT_GET_FAILED,
+               "Failed to "
+               "get value for key %s",
+               slave_key);
+        goto out;
+    }
+
+    /* strtok_r() modifies its input, so work on a private copy */
+    temp = gf_strdup(buffer);
+    if (!temp) {
+        ret = -1;
+        goto out;
+    }
+
+    /* geo-rep session string format being parsed:
+     * "master_node_uuid:ssh://slave_host::slave_vol:slave_voluuid"
+     */
+    token = strtok_r(temp, "/", &save_ptr);
+    if (!token) {
+        ret = -1;
+        goto out;
+    }
+
+    token = strtok_r(NULL, ":", &save_ptr);
+    if (!token) {
+        ret = -1;
+        goto out;
+    }
+    /* the token still starts with the second '/' of "//": step over it */
+    token++;
+
+    ip = gf_strdup(token);
+    if (!ip) {
+        ret = -1;
+        goto out;
+    }
+    ip_i = ip;
+
+    token = strtok_r(NULL, ":", &save_ptr);
+    if (!token) {
+        ret = -1;
+        goto out;
+    }
+
+    slave_temp = gf_strdup(token);
+    if (!slave_temp) {
+        ret = -1;
+        goto out;
+    }
+
+    /* If 'ip' has 'root@slavehost', point to 'slavehost' as
+     * working directory for root users are created without
+     * 'root@' */
+    ip_temp = gf_strdup(ip);
+    if (!ip_temp) {
+        ret = -1;
+        goto out;
+    }
+    tok = strtok_r(ip_temp, "@", &ip_save_ptr);
+    if (!tok) {
+        /* empty host part: nothing sensible can be built from it */
+        ret = -1;
+        goto out;
+    }
+    len = strlen(tok);
+    tok = strtok_r(NULL, "@", &ip_save_ptr);
+    if (tok != NULL)
+        ip_i = ip + len + 1;
+
+    ret = snprintf(session, PATH_MAX, "%s_%s_%s", origin_volname, ip_i,
+                   slave_temp);
+    if (ret < 0 || ret >= PATH_MAX) { /* error or truncated output */
+        ret = -1;
+        goto out;
+    }
+
+    ret = snprintf(slave, PATH_MAX, "%s::%s", ip, slave_temp);
+    if (ret < 0 || ret >= PATH_MAX) {
+        ret = -1;
+        goto out;
+    }
+
+    ret = 0; /* Success */
+
+out:
+    if (temp)
+        GF_FREE(temp);
+
+    if (ip)
+        GF_FREE(ip);
+
+    if (ip_temp)
+        GF_FREE(ip_temp);
+
+    if (slave_temp)
+        GF_FREE(slave_temp);
+
+    return ret;
+}
+
+/* Copy quota.conf and quota.cksum from the volume directory of src_vol to
+ * the volume directory of dest_vol.
+ *
+ * A quota.conf that cannot be lstat()'ed is treated as "quota not enabled":
+ * 0 is returned and *conf_present is left untouched. Once quota.conf has
+ * been copied, quota.cksum must exist as well, otherwise the call fails.
+ * *conf_present is set to _gf_true only after both files were copied.
+ *
+ * Returns 0 on success, non-zero on failure.
+ */
+int32_t
+glusterd_copy_quota_files(glusterd_volinfo_t *src_vol,
+                          glusterd_volinfo_t *dest_vol,
+                          gf_boolean_t *conf_present)
+{
+    xlator_t *this = THIS;
+    glusterd_conf_t *priv = NULL;
+    struct stat st = {
+        0,
+    };
+    char from_dir[PATH_MAX] = "";
+    char to_dir[PATH_MAX] = "";
+    char from_file[PATH_MAX] = "";
+    char to_file[PATH_MAX] = "";
+    int32_t ret = -1;
+
+    GF_ASSERT(this);
+    priv = this->private;
+    GF_ASSERT(priv);
+
+    GF_ASSERT(src_vol);
+    GF_ASSERT(dest_vol);
+
+    GLUSTERD_GET_VOLUME_DIR(from_dir, src_vol, priv);
+    GLUSTERD_GET_VOLUME_DIR(to_dir, dest_vol, priv);
+
+    /* ---- quota.conf ---- */
+    ret = snprintf(from_file, sizeof(from_file), "%s/quota.conf", from_dir);
+    if (ret < 0) {
+        gf_smsg(this->name, GF_LOG_ERROR, errno, GD_MSG_COPY_FAIL, NULL);
+        goto out;
+    }
+
+    /* The file only exists when quota is enabled, so its absence is
+     * not an error.
+     */
+    ret = sys_lstat(from_file, &st);
+    if (ret != 0) {
+        ret = 0;
+        gf_msg_debug(this->name, 0, "%s not found", from_file);
+        goto out;
+    }
+
+    ret = snprintf(to_file, sizeof(to_file), "%s/quota.conf", to_dir);
+    if (ret < 0) {
+        gf_smsg(this->name, GF_LOG_ERROR, errno, GD_MSG_COPY_FAIL, NULL);
+        goto out;
+    }
+
+    ret = glusterd_copy_file(from_file, to_file);
+    if (ret != 0) {
+        gf_msg(this->name, GF_LOG_ERROR, ENOMEM, GD_MSG_NO_MEMORY,
+               "Failed to copy %s in %s", from_file, to_file);
+        goto out;
+    }
+
+    /* ---- quota.cksum ---- */
+    ret = snprintf(from_file, sizeof(from_file), "%s/quota.cksum", from_dir);
+    if (ret < 0)
+        goto out;
+
+    /* quota.conf was present, so quota.cksum has to be present too;
+     * fail the snapshot operation when it is missing.
+     */
+    ret = sys_lstat(from_file, &st);
+    if (ret != 0) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_FILE_NOT_FOUND,
+               "%s not found", from_file);
+        goto out;
+    }
+
+    ret = snprintf(to_file, sizeof(to_file), "%s/quota.cksum", to_dir);
+    if (ret < 0) {
+        gf_smsg(this->name, GF_LOG_ERROR, errno, GD_MSG_COPY_FAIL, NULL);
+        goto out;
+    }
+
+    ret = glusterd_copy_file(from_file, to_file);
+    if (ret != 0) {
+        gf_msg(this->name, GF_LOG_ERROR, ENOMEM, GD_MSG_NO_MEMORY,
+               "Failed to copy %s in %s", from_file, to_file);
+        goto out;
+    }
+
+    *conf_present = _gf_true;
+out:
+    return ret;
+}
+
+/* Copy the NFS-Ganesha export configuration of src_vol for dest_vol.
+ *
+ * Nothing is done (and 0 is returned) when src_vol is not exported via
+ * NFS-Ganesha. Otherwise there are two possibilities for the destination:
+ *   - snapshot: the export file is copied unchanged into the snapshot
+ *     directory of dest_vol.
+ *   - clone: a new export file is created for the clone in
+ *     GANESHA_EXPORT_DIRECTORY, with every occurrence of the source volume
+ *     name replaced by the clone name. When src_vol is itself a snapshot
+ *     volume, the name searched for is its parent volume name.
+ *
+ * Returns 0 on success and a non-zero value on failure.
+ */
+int
+glusterd_copy_nfs_ganesha_file(glusterd_volinfo_t *src_vol,
+                               glusterd_volinfo_t *dest_vol)
+{
+    int32_t ret = -1;
+    char snap_dir[PATH_MAX] = {
+        0,
+    };
+    char src_path[PATH_MAX] = {
+        0,
+    };
+    char dest_path[PATH_MAX] = {
+        0,
+    };
+    char buffer[BUFSIZ] = {
+        0,
+    };
+    char *find_ptr = NULL;
+    char *buff_ptr = NULL;
+    char *tmp_ptr = NULL;
+    size_t find_len = 0;
+    xlator_t *this = NULL;
+    glusterd_conf_t *priv = NULL;
+    struct stat stbuf = {
+        0,
+    };
+    FILE *src = NULL;
+    FILE *dest = NULL;
+
+    this = THIS;
+    GF_VALIDATE_OR_GOTO("snapshot", this, out);
+    priv = this->private;
+    GF_VALIDATE_OR_GOTO(this->name, priv, out);
+
+    GF_VALIDATE_OR_GOTO(this->name, src_vol, out);
+    GF_VALIDATE_OR_GOTO(this->name, dest_vol, out);
+
+    if (glusterd_check_ganesha_export(src_vol) == _gf_false) {
+        gf_msg_debug(this->name, 0,
+                     "%s is not exported via "
+                     "NFS-Ganesha. Skipping copy of export conf.",
+                     src_vol->volname);
+        ret = 0;
+        goto out;
+    }
+
+    if (src_vol->is_snap_volume) {
+        GLUSTERD_GET_SNAP_DIR(snap_dir, src_vol->snapshot, priv);
+        ret = snprintf(src_path, PATH_MAX, "%s/export.%s.conf", snap_dir,
+                       src_vol->snapshot->snapname);
+    } else {
+        ret = snprintf(src_path, PATH_MAX, "%s/export.%s.conf",
+                       GANESHA_EXPORT_DIRECTORY, src_vol->volname);
+    }
+    if (ret < 0 || ret >= PATH_MAX) {
+        /* a truncated path would otherwise leave a positive length in ret */
+        ret = -1;
+        goto out;
+    }
+
+    ret = sys_lstat(src_path, &stbuf);
+    if (ret) {
+        /*
+         * This code path is hit, only when the src_vol is being *
+         * exported via NFS-Ganesha. So if the conf file is not  *
+         * available, we fail the snapshot operation.            *
+         */
+        gf_msg(this->name, GF_LOG_ERROR, errno, GD_MSG_FILE_OP_FAILED,
+               "Stat on %s failed with %s", src_path, strerror(errno));
+        goto out;
+    }
+
+    if (dest_vol->is_snap_volume) {
+        memset(snap_dir, 0, PATH_MAX);
+        GLUSTERD_GET_SNAP_DIR(snap_dir, dest_vol->snapshot, priv);
+        ret = snprintf(dest_path, sizeof(dest_path), "%s/export.%s.conf",
+                       snap_dir, dest_vol->snapshot->snapname);
+        if (ret < 0 || ret >= PATH_MAX) {
+            ret = -1;
+            goto out;
+        }
+
+        ret = glusterd_copy_file(src_path, dest_path);
+        if (ret) {
+            gf_msg(this->name, GF_LOG_ERROR, ENOMEM, GD_MSG_NO_MEMORY,
+                   "Failed to copy %s in %s", src_path, dest_path);
+            goto out;
+        }
+
+    } else {
+        ret = snprintf(dest_path, sizeof(dest_path), "%s/export.%s.conf",
+                       GANESHA_EXPORT_DIRECTORY, dest_vol->volname);
+        if (ret < 0 || ret >= PATH_MAX) {
+            ret = -1;
+            goto out;
+        }
+
+        src = fopen(src_path, "r");
+        dest = fopen(dest_path, "w");
+
+        if (!src || !dest) {
+            gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_FILE_OP_FAILED,
+                   "Failed to open %s", dest ? src_path : dest_path);
+            ret = -1;
+            goto out;
+        }
+
+        /* *
+         * if the source volume is a snapshot, the export conf file
+         * consists of the original volname
+         */
+        if (src_vol->is_snap_volume)
+            find_ptr = gf_strdup(src_vol->parent_volname);
+        else
+            find_ptr = gf_strdup(src_vol->volname);
+
+        if (!find_ptr) {
+            /* ret still holds the snprintf() length: report a failure */
+            ret = -1;
+            goto out;
+        }
+        find_len = strlen(find_ptr);
+
+        /* Replacing volname with clonename */
+        while (fgets(buffer, BUFSIZ, src)) {
+            buff_ptr = buffer;
+            /* strstr() matches an empty needle at every position without
+             * advancing, so only search for a non-empty name */
+            while (find_len && (tmp_ptr = strstr(buff_ptr, find_ptr))) {
+                while (buff_ptr < tmp_ptr)
+                    fputc((int)*buff_ptr++, dest);
+                fputs(dest_vol->volname, dest);
+                buff_ptr += find_len;
+            }
+            fputs(buff_ptr, dest);
+            memset(buffer, 0, BUFSIZ);
+        }
+
+        /* the export file for the clone has been written */
+        ret = 0;
+    }
+out:
+    if (src)
+        fclose(src);
+    if (dest)
+        fclose(dest);
+    if (find_ptr)
+        GF_FREE(find_ptr);
+
+    return ret;
+}
+
+/* Copy the geo-replication session directories saved with a snapshot back
+ * into "<workdir>/<GEOREP>/<session>".
+ *
+ * One session is restored for every "slave<N>" key, N running from 1 to
+ * snap_vol->gsync_slaves->count. The session name is derived from the
+ * parent (origin) volume of snap_vol, not from the snapshot volume itself.
+ *
+ * Returns 0 on success, non-zero on the first failure.
+ */
+int32_t
+glusterd_restore_geo_rep_files(glusterd_volinfo_t *snap_vol)
+{
+    xlator_t *this = THIS;
+    glusterd_conf_t *priv = NULL;
+    glusterd_volinfo_t *parent_vol = NULL;
+    char *parent_name = NULL;
+    char slave_key[32] = "";
+    char session[PATH_MAX] = "";
+    char slave[PATH_MAX] = "";
+    char geo_rep_dir[PATH_MAX] = "";
+    char saved_session_dir[PATH_MAX] = "";
+    char live_session_dir[PATH_MAX] = "";
+    int32_t ret = -1;
+    int idx = 0;
+
+    GF_ASSERT(this);
+    priv = this->private;
+    GF_ASSERT(priv);
+
+    GF_ASSERT(snap_vol);
+
+    parent_name = gf_strdup(snap_vol->parent_volname);
+    if (parent_name == NULL) {
+        ret = -1;
+        goto out;
+    }
+
+    ret = glusterd_volinfo_find(parent_name, &parent_vol);
+    if (ret != 0) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_VOL_NOT_FOUND,
+               "Unable to fetch volinfo for volname %s", parent_name);
+        goto out;
+    }
+
+    for (idx = 1; idx <= snap_vol->gsync_slaves->count; idx++) {
+        ret = snprintf(slave_key, sizeof(slave_key), "slave%d", idx);
+        if (ret < 0)
+            goto out;
+
+        /* Geo-replication names a session master_ip_slave. The master
+         * has to stay the same after a restore, which is why the name of
+         * the origin volume is passed here.
+         *
+         * snap_vol->gsync_slaves holds the slaves as they were when the
+         * snapshot was taken; all of them are restored.
+         */
+        ret = glusterd_get_geo_rep_session(slave_key, parent_vol->volname,
+                                           snap_vol->gsync_slaves, session,
+                                           slave);
+        if (ret != 0) {
+            gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_GEOREP_GET_FAILED,
+                   "Failed to get geo-rep session");
+            goto out;
+        }
+
+        GLUSTERD_GET_SNAP_GEO_REP_DIR(geo_rep_dir, snap_vol->snapshot, priv);
+        ret = snprintf(saved_session_dir, sizeof(saved_session_dir), "%s/%s",
+                       geo_rep_dir, session);
+        if (ret < 0)
+            goto out;
+
+        ret = snprintf(live_session_dir, sizeof(live_session_dir), "%s/%s/%s",
+                       priv->workdir, GEOREP, session);
+        if (ret < 0)
+            goto out;
+
+        ret = glusterd_copy_folder(saved_session_dir, live_session_dir);
+        if (ret != 0) {
+            gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_DIR_OP_FAILED,
+                   "Could not copy %s to %s", saved_session_dir,
+                   live_session_dir);
+            goto out;
+        }
+    }
+out:
+    if (parent_name != NULL)
+        GF_FREE(parent_name);
+
+    return ret;
+}
+
+/* Copy the export file saved with snap back into GANESHA_EXPORT_DIRECTORY
+ * as "export.<src_vol->volname>.conf".
+ *
+ * A saved export file that does not exist (ENOENT) is not an error and
+ * yields 0; any other lstat() failure is logged and reported.
+ *
+ * Returns 0 on success, non-zero on failure.
+ */
+int
+glusterd_restore_nfs_ganesha_file(glusterd_volinfo_t *src_vol,
+                                  glusterd_snap_t *snap)
+{
+    xlator_t *this = NULL;
+    glusterd_conf_t *priv = NULL;
+    struct stat st = {
+        0,
+    };
+    char snap_dir[PATH_MAX] = "";
+    char saved_conf[PATH_MAX] = "";
+    char live_conf[PATH_MAX] = "";
+    int32_t ret = -1;
+
+    this = THIS;
+    GF_VALIDATE_OR_GOTO("snapshot", this, out);
+    priv = this->private;
+    GF_VALIDATE_OR_GOTO(this->name, priv, out);
+
+    GF_VALIDATE_OR_GOTO(this->name, src_vol, out);
+    GF_VALIDATE_OR_GOTO(this->name, snap, out);
+
+    GLUSTERD_GET_SNAP_DIR(snap_dir, snap, priv);
+
+    ret = snprintf(saved_conf, sizeof(saved_conf), "%s/export.%s.conf",
+                   snap_dir, snap->snapname);
+    if (ret < 0) {
+        gf_smsg(this->name, GF_LOG_ERROR, errno, GD_MSG_COPY_FAIL, NULL);
+        goto out;
+    }
+
+    ret = sys_lstat(saved_conf, &st);
+    if (ret != 0) {
+        if (errno != ENOENT) {
+            gf_msg(this->name, GF_LOG_WARNING, errno, GD_MSG_FILE_OP_FAILED,
+                   "Stat on %s failed with %s", saved_conf, strerror(errno));
+        } else {
+            /* nothing was saved for this snapshot: not a failure */
+            ret = 0;
+            gf_msg_debug(this->name, 0, "%s not found", saved_conf);
+        }
+        goto out;
+    }
+
+    ret = snprintf(live_conf, sizeof(live_conf), "%s/export.%s.conf",
+                   GANESHA_EXPORT_DIRECTORY, src_vol->volname);
+    if (ret < 0) {
+        gf_smsg(this->name, GF_LOG_ERROR, errno, GD_MSG_COPY_FAIL, NULL);
+        goto out;
+    }
+
+    ret = glusterd_copy_file(saved_conf, live_conf);
+    if (ret != 0) {
+        gf_msg(this->name, GF_LOG_ERROR, ENOMEM, GD_MSG_NO_MEMORY,
+               "Failed to copy %s in %s", saved_conf, live_conf);
+    }
+
+out:
+    return ret;
+}
+
+/* Snapd functions */
+/* Read the "features.uss" boolean from the option dict of volinfo.
+ *
+ * Returns the value delivered by dict_get_str_boolean(), except that a
+ * missing key (reported through the -2 default) is turned into 0. A lookup
+ * failure is logged and returned as -1.
+ */
+int
+glusterd_is_snapd_enabled(glusterd_volinfo_t *volinfo)
+{
+    xlator_t *this = THIS;
+    int uss = dict_get_str_boolean(volinfo->dict, "features.uss", -2);
+
+    switch (uss) {
+        case -2:
+            /* the default was handed back: the key is not set */
+            gf_msg_debug(this->name, 0,
+                         "Key features.uss not present in the dict for "
+                         "volume %s",
+                         volinfo->volname);
+            uss = 0;
+            break;
+        case -1:
+            gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_DICT_GET_FAILED,
+                   "Failed to get 'features.uss' from dict for volume %s",
+                   volinfo->volname);
+            break;
+        default:
+            break;
+    }
+
+    return uss;
+}
+
+/* Flag in dict when the snapshot soft limit of volinfo has been reached.
+ *
+ * The effective hard limit is the smaller of the volume's
+ * snap_max_hard_limit and the configured hard limit; the soft limit is the
+ * configured percentage of that value. When snap_count has reached the soft
+ * limit and auto-delete is not enabled, a warning is logged and
+ * "soft-limit-reach" is set in dict.
+ *
+ * Returns 0, unless setting the flag in dict fails, in which case the
+ * result of dict_set_int8() is returned.
+ */
+int32_t
+glusterd_is_snap_soft_limit_reached(glusterd_volinfo_t *volinfo, dict_t *dict)
+{
+    xlator_t *this = NULL;
+    glusterd_conf_t *priv = NULL;
+    uint64_t hard_limit_conf = GLUSTERD_SNAPS_MAX_HARD_LIMIT;
+    uint64_t soft_percent_conf = GLUSTERD_SNAPS_DEF_SOFT_LIMIT_PERCENT;
+    uint64_t effective_hard_limit = 0;
+    uint64_t soft_limit = 0;
+    int auto_delete = 0;
+    int32_t ret = -1;
+
+    GF_ASSERT(volinfo);
+    GF_ASSERT(dict);
+
+    this = THIS;
+    GF_ASSERT(this);
+    priv = this->private;
+    GF_ASSERT(priv);
+
+    /* snap-max-hard-limit and snap-max-soft-limit are optional; when they
+     * are not configured the defaults assigned above stay in place.
+     */
+    gd_get_snap_conf_values_if_present(priv->opts, &hard_limit_conf,
+                                       &soft_percent_conf);
+
+    /* "auto-delete" may not have been set explicitly either; a missing
+     * key simply yields the default (_gf_false).
+     */
+    auto_delete = dict_get_str_boolean(
+        priv->opts, GLUSTERD_STORE_KEY_SNAP_AUTO_DELETE, _gf_false);
+
+    effective_hard_limit = hard_limit_conf;
+    if (volinfo->snap_max_hard_limit < hard_limit_conf)
+        effective_hard_limit = volinfo->snap_max_hard_limit;
+
+    soft_limit = (soft_percent_conf * effective_hard_limit) / 100;
+
+    /* below the soft limit, or auto-delete takes care of it */
+    if (volinfo->snap_count < soft_limit || auto_delete == _gf_true)
+        return 0;
+
+    gf_msg(this->name, GF_LOG_WARNING, 0, GD_MSG_SOFT_LIMIT_REACHED,
+           "Soft-limit (value = %" PRIu64
+           ") of volume %s is reached. "
+           "Snapshot creation is not possible once effective "
+           "hard-limit (value = %" PRIu64 ") is reached.",
+           soft_limit, volinfo->volname, effective_hard_limit);
+
+    ret = dict_set_int8(dict, "soft-limit-reach", _gf_true);
+    if (ret != 0) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_DICT_SET_FAILED,
+               "Failed to set soft limit exceed flag in "
+               "response dictionary");
+    }
+
+    return ret;
+}
+
+/* Fetch the snapshot hard and soft limit settings from dict, if present.
+ *
+ * *sys_hard_limit and *sys_soft_limit are passed to dict_get_uint64() for
+ * the keys GLUSTERD_STORE_KEY_SNAP_MAX_HARD_LIMIT and
+ * GLUSTERD_STORE_KEY_SNAP_MAX_SOFT_LIMIT. A failed lookup is only logged at
+ * debug level, so callers are expected to pre-load both variables with the
+ * defaults they want to fall back to. Because a missing key is not an
+ * error, this function does not return a value.
+ */
+void
+gd_get_snap_conf_values_if_present(dict_t *dict, uint64_t *sys_hard_limit,
+                                   uint64_t *sys_soft_limit)
+{
+    xlator_t *this = NULL;
+
+    this = THIS;
+    GF_ASSERT(this);
+
+    GF_ASSERT(dict);
+
+    /* "snap-max-hard-limit" might not be set by user explicitly,
+     * in that case it's better to consider the default value.
+     * Hence not erroring out if Key is not found.
+     */
+    if (dict_get_uint64(dict, GLUSTERD_STORE_KEY_SNAP_MAX_HARD_LIMIT,
+                        sys_hard_limit)) {
+        gf_msg_debug(this->name, 0,
+                     "%s is not present in "
+                     "dictionary",
+                     GLUSTERD_STORE_KEY_SNAP_MAX_HARD_LIMIT);
+    }
+
+    /* "snap-max-soft-limit" might not be set by user explicitly,
+     * in that case it's better to consider the default value.
+     * Hence not erroring out if Key is not found.
+     */
+    if (dict_get_uint64(dict, GLUSTERD_STORE_KEY_SNAP_MAX_SOFT_LIMIT,
+                        sys_soft_limit)) {
+        gf_msg_debug(this->name, 0,
+                     "%s is not present in "
+                     "dictionary",
+                     GLUSTERD_STORE_KEY_SNAP_MAX_SOFT_LIMIT);
+    }
+}
+
+/* Write the textual name of snapinfo->snap_status into snap_status_str.
+ *
+ * The text is written with sprintf(), so the buffer supplied by the caller
+ * has to be large enough for the longest name plus the terminating NUL.
+ *
+ * Returns 0 on success, -1 when an argument is NULL or the status value is
+ * not one of the known GD_SNAP_STATUS_* values handled below.
+ */
+int
+glusterd_get_snap_status_str(glusterd_snap_t *snapinfo, char *snap_status_str)
+{
+    const char *status_name = NULL;
+    int ret = -1;
+
+    GF_VALIDATE_OR_GOTO(THIS->name, snapinfo, out);
+    GF_VALIDATE_OR_GOTO(THIS->name, snap_status_str, out);
+
+    switch (snapinfo->snap_status) {
+        case GD_SNAP_STATUS_NONE:
+            status_name = "none";
+            break;
+        case GD_SNAP_STATUS_INIT:
+            status_name = "init";
+            break;
+        case GD_SNAP_STATUS_IN_USE:
+            status_name = "in_use";
+            break;
+        case GD_SNAP_STATUS_DECOMMISSION:
+            status_name = "decommissioned";
+            break;
+        case GD_SNAP_STATUS_UNDER_RESTORE:
+            status_name = "under_restore";
+            break;
+        case GD_SNAP_STATUS_RESTORED:
+            status_name = "restored";
+            break;
+        default:
+            break;
+    }
+
+    /* unknown status: leave the output buffer untouched */
+    if (status_name == NULL)
+        goto out;
+
+    sprintf(snap_status_str, "%s", status_name);
+    ret = 0;
+out:
+    return ret;
+}
diff --git a/xlators/mgmt/glusterd/src/glusterd-snapshot-utils.h b/xlators/mgmt/glusterd/src/glusterd-snapshot-utils.h
new file mode 100644
index 00000000000..5762999bba7
--- /dev/null
+++ b/xlators/mgmt/glusterd/src/glusterd-snapshot-utils.h
@@ -0,0 +1,169 @@
+/*
+   Copyright (c) 2015 Red Hat, Inc. <http://www.redhat.com>
+   This file is part of GlusterFS.
+
+   This file is licensed to you under your choice of the GNU Lesser
+   General Public License, version 3 or any later version (LGPLv3 or
+   later), or the GNU General Public License, version 2 (GPLv2), in all
+   cases as published by the Free Software Foundation.
+*/
+#ifndef _GLUSTERD_SNAP_UTILS_H
+#define _GLUSTERD_SNAP_UTILS_H
+
+/* Set path (PATH_MAX bytes) to "<workdir>/snaps/<snapname>", or "" on error */
+#define GLUSTERD_GET_SNAP_DIR(path, snap, priv)                                \
+    do {                                                                       \
+        int32_t _snap_dir_len = snprintf((path), PATH_MAX, "%s/snaps/%s",      \
+                                         (priv)->workdir, (snap)->snapname);   \
+        if ((_snap_dir_len < 0) || (_snap_dir_len >= PATH_MAX)) {              \
+            (path)[0] = 0;                                                     \
+        }                                                                      \
+    } while (0)
+
+int32_t
+glusterd_snap_volinfo_find(char *volname, glusterd_snap_t *snap,
+                           glusterd_volinfo_t **volinfo);
+
+int32_t
+glusterd_snap_volinfo_find_from_parent_volname(char *origin_volname,
+                                               glusterd_snap_t *snap,
+                                               glusterd_volinfo_t **volinfo);
+
+int
+glusterd_snap_volinfo_find_by_volume_id(uuid_t volume_id,
+                                        glusterd_volinfo_t **volinfo);
+
+int32_t
+glusterd_add_snapd_to_dict(glusterd_volinfo_t *volinfo, dict_t *dict,
+                           int32_t count);
+
+int
+glusterd_compare_snap_time(struct cds_list_head *, struct cds_list_head *);
+
+int
+glusterd_compare_snap_vol_time(struct cds_list_head *, struct cds_list_head *);
+
+int32_t
+glusterd_snap_volinfo_restore(dict_t *dict, dict_t *rsp_dict,
+                              glusterd_volinfo_t *new_volinfo,
+                              glusterd_volinfo_t *snap_volinfo,
+                              int32_t volcount);
+int32_t
+glusterd_snapobject_delete(glusterd_snap_t *snap);
+
+int32_t
+glusterd_cleanup_snaps_for_volume(glusterd_volinfo_t *volinfo);
+
+int32_t
+glusterd_missed_snapinfo_new(glusterd_missed_snap_info **missed_snapinfo);
+
+int32_t
+glusterd_missed_snap_op_new(glusterd_snap_op_t **snap_op);
+
+int32_t
+glusterd_add_missed_snaps_to_dict(dict_t *rsp_dict,
+                                  glusterd_volinfo_t *snap_vol,
+                                  glusterd_brickinfo_t *brickinfo,
+                                  int32_t brick_number, int32_t op);
+
+int32_t
+glusterd_add_missed_snaps_to_export_dict(dict_t *peer_data);
+
+int32_t
+glusterd_import_friend_missed_snap_list(dict_t *peer_data);
+
+int
+gd_restore_snap_volume(dict_t *dict, dict_t *rsp_dict,
+                       glusterd_volinfo_t *orig_vol,
+                       glusterd_volinfo_t *snap_vol, int32_t volcount);
+
+int32_t
+glusterd_mount_lvm_snapshot(glusterd_brickinfo_t *brickinfo,
+                            char *brick_mount_path);
+
+int32_t
+glusterd_umount(const char *path);
+
+int32_t
+glusterd_snap_unmount(xlator_t *this, glusterd_volinfo_t *volinfo);
+
+int32_t
+glusterd_add_snapshots_to_export_dict(dict_t *peer_data);
+
+int32_t
+glusterd_compare_friend_snapshots(dict_t *peer_data, char *peername,
+                                  uuid_t peerid);
+
+int32_t
+glusterd_store_create_snap_dir(glusterd_snap_t *snap);
+
+int32_t
+glusterd_copy_file(const char *source, const char *destination);
+
+int32_t
+glusterd_copy_folder(const char *source, const char *destination);
+
+int32_t
+glusterd_get_geo_rep_session(char *slave_key, char *origin_volname,
+                             dict_t *gsync_slaves_dict, char *session,
+                             char *slave);
+
+int32_t
+glusterd_restore_geo_rep_files(glusterd_volinfo_t *snap_vol);
+
+int32_t
+glusterd_copy_quota_files(glusterd_volinfo_t *src_vol,
+                          glusterd_volinfo_t *dest_vol,
+                          gf_boolean_t *conf_present);
+
+int
+glusterd_snap_use_rsp_dict(dict_t *aggr, dict_t *rsp_dict);
+
+int
+gd_add_vol_snap_details_to_dict(dict_t *dict, char *prefix,
+                                glusterd_volinfo_t *volinfo);
+
+int
+gd_add_brick_snap_details_to_dict(dict_t *dict, char *prefix,
+                                  glusterd_brickinfo_t *brickinfo);
+
+int
+gd_import_new_brick_snap_details(dict_t *dict, char *prefix,
+                                 glusterd_brickinfo_t *brickinfo);
+
+int
+gd_import_volume_snap_details(dict_t *dict, glusterd_volinfo_t *volinfo,
+                              char *prefix, char *volname);
+
+int32_t
+glusterd_snap_quorum_check(dict_t *dict, gf_boolean_t snap_volume,
+                           char **op_errstr, uint32_t *op_errno);
+
+int32_t
+glusterd_snap_brick_create(glusterd_volinfo_t *snap_volinfo,
+                           glusterd_brickinfo_t *brickinfo, int32_t brick_count,
+                           int32_t clone);
+
+int
+glusterd_snapshot_restore_cleanup(dict_t *rsp_dict, char *volname,
+                                  glusterd_snap_t *snap);
+
+void
+glusterd_get_snapd_dir(glusterd_volinfo_t *volinfo, char *path, int path_len);
+
+int
+glusterd_is_snapd_enabled(glusterd_volinfo_t *volinfo);
+
+int32_t
+glusterd_check_and_set_config_limit(glusterd_conf_t *priv);
+
+int32_t
+glusterd_is_snap_soft_limit_reached(glusterd_volinfo_t *volinfo, dict_t *dict);
+
+void
+gd_get_snap_conf_values_if_present(dict_t *opts, uint64_t *sys_hard_limit,
+                                   uint64_t *sys_soft_limit);
+int
+glusterd_get_snap_status_str(glusterd_snap_t *snapinfo, char *snap_status_str);
+
+#endif
diff --git a/xlators/mgmt/glusterd/src/glusterd-snapshot.c b/xlators/mgmt/glusterd/src/glusterd-snapshot.c
new file mode 100644
index 00000000000..aeaa8d15214
--- /dev/null
+++ b/xlators/mgmt/glusterd/src/glusterd-snapshot.c
@@ -0,0 +1,10087 @@
+/*
+   Copyright (c) 2013-2014 Red Hat, Inc. <http://www.redhat.com>
+   This file is part of GlusterFS.
+
+   This file is licensed to you under your choice of the GNU Lesser
+   General Public License, version 3 or any later version (LGPLv3 or
+   later), or the GNU General Public License, version 2 (GPLv2), in all
+   cases as published by the Free Software Foundation.
+*/
+#include <inttypes.h>
+#include <sys/types.h>
+#include <unistd.h>
+#include <sys/resource.h>
+#include <sys/statvfs.h>
+#include <sys/mount.h>
+#include <signal.h>
+#include "glusterd-messages.h"
+#include "glusterd-errno.h"
+
+#if defined(GF_LINUX_HOST_OS)
+#include <mntent.h>
+#else
+#include "mntent_compat.h"
+#endif
+
+#ifdef __NetBSD__
+#define umount2(dir, flags) unmount(dir, ((flags) != 0) ? MNT_FORCE : 0)
+#endif
+
+#if defined(GF_DARWIN_HOST_OS) || defined(__FreeBSD__)
+#include <sys/param.h>
+#include <sys/mount.h>
+#define umount2(dir, flags) unmount(dir, ((flags) != 0) ? MNT_FORCE : 0)
+#endif
+
+#include <regex.h>
+
+#include <glusterfs/compat.h>
+#include "protocol-common.h"
+#include <glusterfs/xlator.h>
+#include <glusterfs/logging.h>
+#include <glusterfs/timer.h>
+#include "glusterd-mem-types.h"
+#include "glusterd.h"
+#include "glusterd-sm.h"
+#include "glusterd-op-sm.h"
+#include "glusterd-utils.h"
+#include "glusterd-store.h"
+#include <glusterfs/run.h>
+#include "glusterd-volgen.h"
+#include "glusterd-mgmt.h"
+#include "glusterd-syncop.h"
+#include "glusterd-snapshot-utils.h"
+#include "glusterd-snapd-svc.h"
+
+#include "glusterfs3.h"
+
+#include <glusterfs/syscall.h>
+#include "cli1-xdr.h"
+#include "xdr-generic.h"
+
+#include <glusterfs/lvm-defaults.h>
+#include <glusterfs/events.h>
+
+/* Copy the string returned by uuid_utoa_r() for uuid into ret_string, leaving
+ * out every '-', and NUL-terminate it. No bound is applied to ret_string.
+ */
+#define GLUSTERD_GET_UUID_NOHYPHEN(ret_string, uuid)                           \
+    do {                                                                       \
+        char _nohyphen_buf[64];                                                \
+        char *_nohyphen_dst = ret_string;                                      \
+        char *_nohyphen_src = uuid_utoa_r(uuid, _nohyphen_buf);                \
+        for (; *_nohyphen_src; _nohyphen_src++) {                              \
+            if (*_nohyphen_src != '-')                                         \
+                *_nohyphen_dst++ = *_nohyphen_src;                             \
+        }                                                                      \
+        *_nohyphen_dst = '\0';                                                 \
+    } while (0)
+
+char snap_mount_dir[VALID_GLUSTERD_PATHMAX];
+/* NOTE(review): going by the name, the argument bundle of a snapshot-create
+ * task; confirm against the code that fills it in. */
+typedef struct snap_create_args_ {
+    xlator_t *this;
+    dict_t *dict;
+    dict_t *rsp_dict;
+    glusterd_volinfo_t *snap_vol;
+    glusterd_brickinfo_t *brickinfo;
+    struct syncargs *args;
+    int32_t volcount;
+    int32_t brickcount;
+    int32_t brickorder;
+} snap_create_args_t;
+
+/* This structure is used to store unsupported options and their values
+ * for snapshotted volume.
+ */
+struct gd_snap_unsupported_opt_t {
+    char *key;
+    char *value;
+};
+
+/* Build the device path of the LVM snapshot taken for one brick.
+ * The volume group is read from "lvs --noheadings -o vg_name <device>" and
+ * the result is "/dev/<volgroup>/<snapname>_<brickcount>".
+ * Returns a gf_strdup()'d path, or NULL if an argument is NULL, lvs fails,
+ * the path does not fit in PATH_MAX or the copy cannot be allocated.
+ */
+char *
+glusterd_build_snap_device_path(char *device, char *snapname,
+                                int32_t brickcount)
+{
+    char snap[PATH_MAX] = "";
+    char msg[1024] = "";
+    char volgroup[PATH_MAX] = "";
+    char *snap_device = NULL;
+    xlator_t *this = NULL;
+    runner_t runner = {
+        0,
+    };
+    char *ptr = NULL;
+    int ret = -1;
+
+    this = THIS;
+    GF_ASSERT(this);
+    if (!device) {
+        gf_msg(this->name, GF_LOG_ERROR, EINVAL, GD_MSG_INVALID_ENTRY,
+               "device is NULL");
+        goto out;
+    }
+    if (!snapname) {
+        gf_msg(this->name, GF_LOG_ERROR, EINVAL, GD_MSG_INVALID_ENTRY,
+               "snapname is NULL");
+        goto out;
+    }
+
+    runinit(&runner);
+    runner_add_args(&runner, "lvs", "--noheadings", "-o", "vg_name", device,
+                    NULL);
+    runner_redir(&runner, STDOUT_FILENO, RUN_PIPE);
+    snprintf(msg, sizeof(msg), "Get volume group for device %s", device);
+    runner_log(&runner, this->name, GF_LOG_DEBUG, msg);
+    ret = runner_start(&runner);
+    if (ret == -1) {
+        gf_msg(this->name, GF_LOG_ERROR, errno, GD_MSG_VG_GET_FAIL,
+               "Failed to get volume group for device %s", device);
+        runner_end(&runner);
+        goto out;
+    }
+    ptr = fgets(volgroup, sizeof(volgroup),
+                runner_chio(&runner, STDOUT_FILENO));
+    if (!ptr || !strlen(volgroup)) {
+        gf_msg(this->name, GF_LOG_ERROR, errno, GD_MSG_VG_GET_FAIL,
+               "Failed to get volume group for snap %s", snapname);
+        runner_end(&runner);
+        goto out;
+    }
+    runner_end(&runner);
+
+    /* A silently truncated path would not name the snapshot device */
+    ret = snprintf(snap, sizeof(snap), "/dev/%s/%s_%d", gf_trim(volgroup),
+                   snapname, brickcount);
+    if ((ret < 0) || ((size_t)ret >= sizeof(snap))) {
+        gf_msg(this->name, GF_LOG_ERROR, ENAMETOOLONG, GD_MSG_INVALID_ENTRY,
+               "Snapshot device path for snapname %s is too long", snapname);
+        goto out;
+    }
+    snap_device = gf_strdup(snap);
+    if (!snap_device) {
+        gf_msg(this->name, GF_LOG_WARNING, ENOMEM, GD_MSG_NO_MEMORY,
+               "Cannot copy the snapshot device name for snapname: %s",
+               snapname);
+    }
+out:
+    return snap_device;
+}
+
+/* Look for disconnected peers, for missed snap creates or deletes.
+ * Walks every brick of vol. Bricks owned by this node (MY_UUID) are only
+ * counted. For any other brick the peer with the same uuid is looked up in
+ * peers under the RCU read lock; if that peer is not connected or is not in
+ * GD_FRIEND_STATE_BEFRIENDED, the brick, its 1-based position and op are
+ * handed to glusterd_add_missed_snaps_to_dict() together with rsp_dict.
+ * Returns 0 on success, otherwise that helper's non-zero return value.
+ */
+static int32_t
+glusterd_find_missed_snap(dict_t *rsp_dict, glusterd_volinfo_t *vol,
+                          struct cds_list_head *peers, int32_t op)
+{
+    int32_t brick_number = 0;
+    int32_t ret = -1;
+    xlator_t *this = THIS;
+    glusterd_peerinfo_t *peer = NULL;
+    glusterd_brickinfo_t *brick = NULL;
+
+    GF_ASSERT(this);
+    GF_ASSERT(rsp_dict);
+    GF_ASSERT(peers);
+    GF_ASSERT(vol);
+
+    cds_list_for_each_entry(brick, &vol->bricks, brick_list)
+    {
+        /* Positions are 1-based and include this node's own bricks */
+        brick_number++;
+        if (gf_uuid_compare(brick->uuid, MY_UUID) == 0)
+            continue;
+
+        ret = 0;
+        RCU_READ_LOCK;
+        cds_list_for_each_entry_rcu(peer, peers, uuid_list)
+        {
+            /* Only a disconnected or non-befriended owner is recorded */
+            if (gf_uuid_compare(peer->uuid, brick->uuid) != 0)
+                continue;
+            if (peer->connected &&
+                (peer->state.state == GD_FRIEND_STATE_BEFRIENDED))
+                continue;
+
+            ret = glusterd_add_missed_snaps_to_dict(rsp_dict, vol, brick,
+                                                    brick_number, op);
+            if (ret)
+                break;
+        }
+        RCU_READ_UNLOCK;
+        if (ret) {
+            gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_MISSED_SNAP_CREATE_FAIL,
+                   "Failed to add missed snapshot "
+                   "info for %s:%s in the "
+                   "rsp_dict",
+                   brick->hostname, brick->path);
+            goto out;
+        }
+    }
+
+    ret = 0;
+out:
+    gf_msg_trace(this->name, 0, "Returning %d", ret);
+    return ret;
+}
+
+/* Fill rsp_dict with the snapshot limit configuration to be displayed.
+ *
+ * When volname is NULL every volume in conf->volumes that is not a snapshot
+ * volume is reported, otherwise only the named volume. For each reported
+ * volume N the keys "volume<N>-volname", "volume<N>-snap-max-hard-limit",
+ * "volume<N>-active-hard-limit" and "volume<N>-snap-max-soft-limit" are set,
+ * followed by "voldisplaycount", the system hard and soft limits and the
+ * auto-delete and activate-on-create options.
+ *
+ * On failure the message is logged and copied, NUL-terminated, into
+ * op_errstr; len is treated as the size of that buffer (TODO confirm with
+ * the callers). Returns 0 on success and a non-zero value on failure.
+ */
+int
+snap_max_limits_display_commit(dict_t *rsp_dict, char *volname, char *op_errstr,
+                               int len)
+{
+    char err_str[PATH_MAX] = "";
+    char key[64] = "";
+    int keylen;
+    glusterd_conf_t *conf = NULL;
+    glusterd_volinfo_t *volinfo = NULL;
+    int ret = -1;
+    uint64_t active_hard_limit = 0;
+    uint64_t snap_max_limit = 0;
+    uint64_t soft_limit_value = -1;
+    uint64_t count = 0;
+    xlator_t *this = NULL;
+    uint64_t opt_hard_max = GLUSTERD_SNAPS_MAX_HARD_LIMIT;
+    uint64_t opt_soft_max = GLUSTERD_SNAPS_DEF_SOFT_LIMIT_PERCENT;
+    char *auto_delete = "disable";
+    char *snap_activate = "disable";
+
+    this = THIS;
+
+    GF_ASSERT(this);
+    GF_ASSERT(rsp_dict);
+    GF_ASSERT(op_errstr);
+
+    conf = this->private;
+
+    GF_ASSERT(conf);
+
+    /* config values snap-max-hard-limit and snap-max-soft-limit are
+     * optional and hence we are not erroring out if values are not
+     * present
+     */
+    gd_get_snap_conf_values_if_present(conf->opts, &opt_hard_max,
+                                       &opt_soft_max);
+
+    if (!volname) {
+        /* For system limit */
+        cds_list_for_each_entry(volinfo, &conf->volumes, vol_list)
+        {
+            if (volinfo->is_snap_volume == _gf_true)
+                continue;
+
+            snap_max_limit = volinfo->snap_max_hard_limit;
+            if (snap_max_limit > opt_hard_max)
+                active_hard_limit = opt_hard_max;
+            else
+                active_hard_limit = snap_max_limit;
+
+            soft_limit_value = (opt_soft_max * active_hard_limit) / 100;
+
+            keylen = snprintf(key, sizeof(key), "volume%" PRIu64 "-volname",
+                              count);
+            ret = dict_set_strn(rsp_dict, key, keylen, volinfo->volname);
+            if (ret) {
+                if (snprintf(err_str, PATH_MAX, "Failed to set %s", key) < 0)
+                    strcpy(err_str, "<error>");
+                goto out;
+            }
+
+            snprintf(key, sizeof(key), "volume%" PRIu64 "-snap-max-hard-limit",
+                     count);
+            ret = dict_set_uint64(rsp_dict, key, snap_max_limit);
+            if (ret) {
+                if (snprintf(err_str, PATH_MAX, "Failed to set %s", key) < 0)
+                    strcpy(err_str, "<error>");
+                goto out;
+            }
+
+            snprintf(key, sizeof(key), "volume%" PRIu64 "-active-hard-limit",
+                     count);
+            ret = dict_set_uint64(rsp_dict, key, active_hard_limit);
+            if (ret) {
+                if (snprintf(err_str, PATH_MAX, "Failed to set %s", key) < 0)
+                    strcpy(err_str, "<error>");
+                goto out;
+            }
+
+            snprintf(key, sizeof(key), "volume%" PRIu64 "-snap-max-soft-limit",
+                     count);
+            ret = dict_set_uint64(rsp_dict, key, soft_limit_value);
+            if (ret) {
+                if (snprintf(err_str, PATH_MAX, "Failed to set %s", key) < 0)
+                    strcpy(err_str, "<error>");
+                goto out;
+            }
+            count++;
+        }
+
+        ret = dict_set_uint64(rsp_dict, "voldisplaycount", count);
+        if (ret) {
+            snprintf(err_str, PATH_MAX, "Failed to set voldisplaycount");
+            goto out;
+        }
+    } else {
+        /*  For one volume */
+        ret = glusterd_volinfo_find(volname, &volinfo);
+        if (ret) {
+            snprintf(err_str, PATH_MAX,
+                     "Volume (%s) does not "
+                     "exist",
+                     volname);
+            goto out;
+        }
+
+        snap_max_limit = volinfo->snap_max_hard_limit;
+        if (snap_max_limit > opt_hard_max)
+            active_hard_limit = opt_hard_max;
+        else
+            active_hard_limit = snap_max_limit;
+
+        soft_limit_value = (opt_soft_max * active_hard_limit) / 100;
+
+        keylen = snprintf(key, sizeof(key), "volume%" PRIu64 "-volname", count);
+        ret = dict_set_strn(rsp_dict, key, keylen, volinfo->volname);
+        if (ret) {
+            if (snprintf(err_str, PATH_MAX, "Failed to set %s", key) < 0)
+                strcpy(err_str, "<error>");
+            goto out;
+        }
+
+        snprintf(key, sizeof(key), "volume%" PRIu64 "-snap-max-hard-limit",
+                 count);
+        ret = dict_set_uint64(rsp_dict, key, snap_max_limit);
+        if (ret) {
+            if (snprintf(err_str, PATH_MAX, "Failed to set %s", key) < 0)
+                strcpy(err_str, "<error>");
+            goto out;
+        }
+
+        snprintf(key, sizeof(key), "volume%" PRIu64 "-active-hard-limit",
+                 count);
+        ret = dict_set_uint64(rsp_dict, key, active_hard_limit);
+        if (ret) {
+            if (snprintf(err_str, PATH_MAX, "Failed to set %s", key) < 0)
+                strcpy(err_str, "<error>");
+            goto out;
+        }
+
+        snprintf(key, sizeof(key), "volume%" PRIu64 "-snap-max-soft-limit",
+                 count);
+        ret = dict_set_uint64(rsp_dict, key, soft_limit_value);
+        if (ret) {
+            if (snprintf(err_str, PATH_MAX, "Failed to set %s", key) < 0)
+                strcpy(err_str, "<error>");
+            goto out;
+        }
+
+        count++;
+
+        ret = dict_set_uint64(rsp_dict, "voldisplaycount", count);
+        if (ret) {
+            snprintf(err_str, PATH_MAX, "Failed to set voldisplaycount");
+            goto out;
+        }
+    }
+
+    ret = dict_set_uint64(rsp_dict, GLUSTERD_STORE_KEY_SNAP_MAX_HARD_LIMIT,
+                          opt_hard_max);
+    if (ret) {
+        snprintf(err_str, PATH_MAX, "Failed to set %s in response dictionary",
+                 GLUSTERD_STORE_KEY_SNAP_MAX_HARD_LIMIT);
+        goto out;
+    }
+
+    ret = dict_set_uint64(rsp_dict, GLUSTERD_STORE_KEY_SNAP_MAX_SOFT_LIMIT,
+                          opt_soft_max);
+    if (ret) {
+        snprintf(err_str, PATH_MAX, "Failed to set %s in response dictionary",
+                 GLUSTERD_STORE_KEY_SNAP_MAX_SOFT_LIMIT);
+        goto out;
+    }
+
+    /* "auto-delete" might not be set by user explicitly,
+     * in that case it's better to consider the default value.
+     * Hence not erroring out if Key is not found.
+     */
+    ret = dict_get_strn(conf->opts, GLUSTERD_STORE_KEY_SNAP_AUTO_DELETE,
+                        SLEN(GLUSTERD_STORE_KEY_SNAP_AUTO_DELETE),
+                        &auto_delete);
+
+    ret = dict_set_dynstr_with_alloc(
+        rsp_dict, GLUSTERD_STORE_KEY_SNAP_AUTO_DELETE, auto_delete);
+    if (ret) {
+        snprintf(err_str, PATH_MAX, "Failed to set %s in response dictionary",
+                 GLUSTERD_STORE_KEY_SNAP_AUTO_DELETE);
+        goto out;
+    }
+
+    /* "snap-activate-on-create" might not be set by user explicitly,
+     * in that case it's better to consider the default value.
+     * Hence not erroring out if Key is not found.
+     */
+    ret = dict_get_strn(conf->opts, GLUSTERD_STORE_KEY_SNAP_ACTIVATE,
+                        SLEN(GLUSTERD_STORE_KEY_SNAP_ACTIVATE), &snap_activate);
+
+    ret = dict_set_dynstr_with_alloc(rsp_dict, GLUSTERD_STORE_KEY_SNAP_ACTIVATE,
+                                     snap_activate);
+    if (ret) {
+        snprintf(err_str, PATH_MAX, "Failed to set %s in response dictionary",
+                 GLUSTERD_STORE_KEY_SNAP_ACTIVATE);
+        goto out;
+    }
+
+    ret = 0;
+out:
+    if (ret) {
+        /* len is still the size the caller passed in; snprintf() also
+         * guarantees the NUL terminator that strncpy() could leave out */
+        if (len > 0)
+            snprintf(op_errstr, len, "%s", err_str);
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_DICT_SET_FAILED, "%s",
+               err_str);
+    }
+    return ret;
+}
+
+/* Filter passed as the third argument of scandir() (used in
+ * glusterd_copy_geo_rep_session_files). Returns FALSE for a NULL entry and
+ * for the "." and ".." entries, which excludes them, and TRUE otherwise.
+ */
+int
+file_select(const struct dirent *entry)
+{
+    const char *name = NULL;
+
+    if (!entry)
+        return (FALSE);
+    name = entry->d_name;
+    if (!strcmp(name, ".") || !strcmp(name, ".."))
+        return (FALSE);
+    return (TRUE);
+}
+
+/* Copy the files of one geo-rep session into the store of a snapshot.
+ *
+ * Every entry of <workdir>/<GEOREP>/<session> whose name ends in "status" or
+ * "conf" is copied below <workdir>/<GLUSTERD_VOL_SNAP_DIR_PREFIX>/<snapname>/
+ * <GEOREP>/<session>, which is created first. A session directory without
+ * entries is an error. Returns 0 on success, a non-zero value on failure.
+ */
+int32_t
+glusterd_copy_geo_rep_session_files(char *session, glusterd_volinfo_t *snap_vol)
+{
+    int32_t ret = -1;
+    char snap_session_dir[PATH_MAX] = "";
+    char georep_session_dir[PATH_MAX] = "";
+    regex_t reg_exp;
+    gf_boolean_t reg_compiled = _gf_false;
+    int file_count = -1;
+    struct dirent **files = NULL;
+    xlator_t *this = NULL;
+    int i = 0;
+    char src_path[PATH_MAX] = "";
+    char dest_path[PATH_MAX] = "";
+    glusterd_conf_t *priv = NULL;
+
+    this = THIS;
+    GF_ASSERT(this);
+    priv = this->private;
+    GF_ASSERT(priv);
+
+    GF_ASSERT(session);
+    GF_ASSERT(snap_vol);
+
+    ret = snprintf(georep_session_dir, sizeof(georep_session_dir), "%s/%s/%s",
+                   priv->workdir, GEOREP, session);
+    if ((ret < 0) || (ret >= sizeof(georep_session_dir))) {
+        gf_smsg(this->name, GF_LOG_ERROR, 0, GD_MSG_COPY_FAIL, NULL);
+        ret = -1;
+        goto out;
+    }
+
+    ret = snprintf(snap_session_dir, sizeof(snap_session_dir), "%s/%s/%s/%s/%s",
+                   priv->workdir, GLUSTERD_VOL_SNAP_DIR_PREFIX,
+                   snap_vol->snapshot->snapname, GEOREP, session);
+    if ((ret < 0) || (ret >= sizeof(snap_session_dir))) {
+        gf_smsg(this->name, GF_LOG_ERROR, 0, GD_MSG_COPY_FAIL, NULL);
+        ret = -1;
+        goto out;
+    }
+
+    ret = mkdir_p(snap_session_dir, 0755, _gf_true);
+    if (ret) {
+        gf_msg(this->name, GF_LOG_ERROR, errno, GD_MSG_DIR_OP_FAILED,
+               "Creating directory %s failed", snap_session_dir);
+        goto out;
+    }
+
+    ret = regcomp(&reg_exp, "(.*status$)|(.*conf$)\0", REG_EXTENDED);
+    if (ret) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_REG_COMPILE_FAILED,
+               "Failed to compile the regular expression");
+        goto out;
+    }
+    /* From here on the compiled pattern owns memory regfree() must release */
+    reg_compiled = _gf_true;
+
+    /* If there are no files in a particular session then fail it*/
+    file_count = scandir(georep_session_dir, &files, file_select, alphasort);
+    if (file_count <= 0) {
+        ret = -1;
+        gf_msg(this->name, GF_LOG_ERROR, ENOENT, GD_MSG_FILE_OP_FAILED,
+               "Session files not present in %s", georep_session_dir);
+        goto out;
+    }
+
+    /* Now compare the file name with regular expression to see if
+     * there is a match
+     */
+    for (i = 0; i < file_count; i++) {
+        if (regexec(&reg_exp, files[i]->d_name, 0, NULL, 0))
+            continue;
+
+        ret = snprintf(src_path, sizeof(src_path), "%s/%s", georep_session_dir,
+                       files[i]->d_name);
+        if ((ret < 0) || (ret >= sizeof(src_path))) {
+            gf_smsg(this->name, GF_LOG_ERROR, 0, GD_MSG_COPY_FAIL, NULL);
+            ret = -1;
+            goto out;
+        }
+
+        ret = snprintf(dest_path, sizeof(dest_path), "%s/%s", snap_session_dir,
+                       files[i]->d_name);
+        if ((ret < 0) || (ret >= sizeof(dest_path))) {
+            gf_smsg(this->name, GF_LOG_ERROR, 0, GD_MSG_COPY_FAIL, NULL);
+            ret = -1;
+            goto out;
+        }
+
+        ret = glusterd_copy_file(src_path, dest_path);
+        if (ret) {
+            gf_msg(this->name, GF_LOG_ERROR, ENOMEM, GD_MSG_NO_MEMORY,
+                   "Could not copy file %s of session %s", files[i]->d_name,
+                   session);
+            goto out;
+        }
+    }
+out:
+    /* files are malloc'd by scandir, free them */
+    if (file_count > 0) {
+        while (file_count--)
+            free(files[file_count]);
+        free(files);
+    }
+
+    if (reg_compiled)
+        regfree(&reg_exp);
+
+    return ret;
+}
+
+/* Back up the store of the to-be restored volume so that the operation can
+ * be reverted if it fails: the volume directory is renamed to
+ * <workdir>/<GLUSTERD_TRASH>/vols-<volname>.deleted and an empty volume
+ * directory is created in its place. The rename is undone on a later failure.
+ *
+ * @param volinfo volinfo of the origin volume
+ * @return 0 on success and -1 on failure
+ */
+int
+glusterd_snapshot_backup_vol(glusterd_volinfo_t *volinfo)
+{
+    char pathname[PATH_MAX] = "";
+    int ret = -1;
+    int op_ret = 0;
+    char delete_path[PATH_MAX] = "";
+    char trashdir[PATH_MAX] = "";
+    glusterd_conf_t *priv = NULL;
+    xlator_t *this = NULL;
+    int32_t len = 0;
+    gf_boolean_t moved = _gf_false;
+
+    this = THIS;
+    GF_ASSERT(this);
+    priv = this->private;
+    GF_ASSERT(priv);
+    GF_ASSERT(volinfo);
+
+    GLUSTERD_GET_VOLUME_DIR(pathname, volinfo, priv);
+
+    len = snprintf(delete_path, sizeof(delete_path),
+                   "%s/" GLUSTERD_TRASH "/vols-%s.deleted", priv->workdir,
+                   volinfo->volname);
+    if ((len < 0) || (len >= sizeof(delete_path))) {
+        gf_smsg(this->name, GF_LOG_ERROR, 0, GD_MSG_COPY_FAIL, NULL);
+        goto out;
+    }
+
+    len = snprintf(trashdir, sizeof(trashdir), "%s/" GLUSTERD_TRASH,
+                   priv->workdir);
+    if ((len < 0) || (len >= sizeof(trashdir))) {
+        gf_smsg(this->name, GF_LOG_ERROR, 0, GD_MSG_COPY_FAIL, NULL);
+        goto out;
+    }
+
+    /* Create trash folder if it is not there */
+    ret = sys_mkdir(trashdir, 0755);
+    if (ret && errno != EEXIST) {
+        gf_msg(this->name, GF_LOG_ERROR, errno, GD_MSG_DIR_OP_FAILED,
+               "Failed to create trash directory, reason : %s",
+               strerror(errno));
+        ret = -1;
+        goto out;
+    }
+
+    /* Move the origin volume folder to the backup location */
+    ret = sys_rename(pathname, delete_path);
+    if (ret) {
+        gf_msg(this->name, GF_LOG_ERROR, errno, GD_MSG_FILE_OP_FAILED,
+               "Failed to rename snap directory %s to %s", pathname,
+               delete_path);
+        goto out;
+    }
+    moved = _gf_true;
+
+    /* Re-create an empty origin volume folder so that restore can
+     * happen. */
+    ret = sys_mkdir(pathname, 0755);
+    if (ret && errno != EEXIST) {
+        gf_msg(this->name, GF_LOG_ERROR, errno, GD_MSG_DIR_OP_FAILED,
+               "Failed to create origin volume directory (%s), reason : %s",
+               pathname, strerror(errno));
+        ret = -1;
+        goto out;
+    }
+
+    ret = 0;
+out:
+    /* Save the actual return value */
+    op_ret = ret;
+    if (ret && moved) {
+        /* Undo the move; without it pathname is still the live store */
+        ret = sys_rmdir(pathname);
+        if (ret) {
+            gf_msg_debug(this->name, 0, "Failed to rmdir: %s,err: %s", pathname,
+                         strerror(errno));
+        }
+
+        ret = sys_rename(delete_path, pathname);
+        if (ret) {
+            gf_msg(this->name, GF_LOG_ERROR, errno, GD_MSG_FILE_OP_FAILED,
+                   "Failed to rename directory %s to %s", delete_path,
+                   pathname);
+        }
+    }
+    if (op_ret) {
+        ret = sys_rmdir(trashdir);
+        if (ret) {
+            gf_msg_debug(this->name, 0, "Failed to rmdir: %s, Reason: %s",
+                         trashdir, strerror(errno));
+        }
+    }
+
+    gf_msg_trace(this->name, 0, "Returning %d", op_ret);
+    return op_ret;
+}
+
+/* Copy the geo-rep session files of origin_vol for the snapshot snap_vol.
+ * Creates the snapshot's geo-rep directory, then for each "slave<N>" key
+ * (N = 1..gsync_slaves->count) resolves the session and copies its files.
+ * Returns 0 when origin_vol has no gsync_slaves dict or every session was
+ * copied; otherwise the non-zero result of the failing step.
+ */
+static int32_t
+glusterd_copy_geo_rep_files(glusterd_volinfo_t *origin_vol,
+                            glusterd_volinfo_t *snap_vol, dict_t *rsp_dict)
+{
+    char slave_key[32] = "";
+    char session[PATH_MAX] = "";
+    char slave[PATH_MAX] = "";
+    char snapgeo_dir[PATH_MAX] = "";
+    xlator_t *this = THIS;
+    glusterd_conf_t *priv = NULL;
+    int32_t ret = -1;
+    int idx = 0;
+
+    GF_ASSERT(this);
+    priv = this->private;
+    GF_ASSERT(priv);
+
+    GF_ASSERT(origin_vol);
+    GF_ASSERT(snap_vol);
+    GF_ASSERT(rsp_dict);
+
+    /* Without a gsync_slaves dict there is nothing to copy: log it and
+     * report success. */
+    if (origin_vol->gsync_slaves == NULL) {
+        gf_smsg(this->name, GF_LOG_ERROR, errno, GD_MSG_INVALID_SLAVE, NULL);
+        return 0;
+    }
+
+    GLUSTERD_GET_SNAP_GEO_REP_DIR(snapgeo_dir, snap_vol->snapshot, priv);
+
+    ret = sys_mkdir(snapgeo_dir, 0755);
+    if (ret != 0) {
+        gf_msg(this->name, GF_LOG_ERROR, errno, GD_MSG_DIR_OP_FAILED,
+               "Creating directory %s failed", snapgeo_dir);
+        return ret;
+    }
+
+    for (idx = 1; idx <= origin_vol->gsync_slaves->count; idx++) {
+        ret = snprintf(slave_key, sizeof(slave_key), "slave%d", idx);
+        if (ret < 0) /* Negative value is an error */
+            break;
+
+        ret = glusterd_get_geo_rep_session(slave_key, origin_vol->volname,
+                                           origin_vol->gsync_slaves, session,
+                                           slave);
+        if (ret != 0) {
+            gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_GEOREP_GET_FAILED,
+                   "Failed to get geo-rep session");
+            break;
+        }
+
+        ret = glusterd_copy_geo_rep_session_files(session, snap_vol);
+        if (ret != 0) {
+            gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_FILE_OP_FAILED,
+                   "Failed to copy files related to session %s", session);
+            break;
+        }
+    }
+    return ret;
+}
+
+/* Restore every volume that is part of a snapshot.
+ *
+ * For each snap volume of the snapshot named by "snapname" in @dict this
+ * records "snapuuid", "volname" and "volid" in @rsp_dict, looks for missed
+ * snap restores when running on the originator glusterd, recreates the brick
+ * mounts of the snap volume and calls gd_restore_snap_volume(). Once a volume
+ * has been restored, the parent volinfo is unlinked from its volume list and
+ * unreferenced.
+ *
+ * @param dict          dictionary containing snapshot restore request
+ * @param op_errstr     In case of any failure error message will be returned
+ *                      in this variable
+ * @param rsp_dict      response dictionary
+ * @return              0 on success, non-zero on failure
+ */
+int
+glusterd_snapshot_restore(dict_t *dict, char **op_errstr, dict_t *rsp_dict)
+{
+    int ret = -1;
+    int32_t volcount = -1;
+    char *snapname = NULL;
+    xlator_t *this = NULL;
+    glusterd_volinfo_t *snap_volinfo = NULL;
+    glusterd_volinfo_t *tmp = NULL;
+    glusterd_volinfo_t *parent_volinfo = NULL;
+    glusterd_snap_t *snap = NULL;
+    glusterd_conf_t *priv = NULL;
+
+    this = THIS;
+
+    GF_ASSERT(this);
+    GF_ASSERT(dict);
+    GF_ASSERT(op_errstr);
+    GF_ASSERT(rsp_dict);
+
+    priv = this->private;
+    GF_ASSERT(priv);
+
+    ret = dict_get_strn(dict, "snapname", SLEN("snapname"), &snapname);
+    if (ret) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_DICT_GET_FAILED,
+               "Failed to get snap name");
+        goto out;
+    }
+
+    snap = glusterd_find_snap_by_name(snapname);
+    if (NULL == snap) {
+        ret = gf_asprintf(op_errstr, "Snapshot (%s) does not exist", snapname);
+        if (ret < 0) {
+            goto out;
+        }
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_SNAP_NOT_FOUND, "%s",
+               *op_errstr);
+        /* gf_asprintf() succeeded, so force the failure return value */
+        ret = -1;
+        goto out;
+    }
+
+    /* volcount is the 1-based position of the snap volume being restored;
+     * it is handed to gd_restore_snap_volume() below.
+     */
+    volcount = 0;
+    cds_list_for_each_entry_safe(snap_volinfo, tmp, &snap->volumes, vol_list)
+    {
+        volcount++;
+        ret = glusterd_volinfo_find(snap_volinfo->parent_volname,
+                                    &parent_volinfo);
+        if (ret) {
+            gf_msg(this->name, GF_LOG_ERROR, EINVAL, GD_MSG_VOL_NOT_FOUND,
+                   "Could not get volinfo of %s", snap_volinfo->parent_volname);
+            goto out;
+        }
+
+        ret = dict_set_dynstr_with_alloc(rsp_dict, "snapuuid",
+                                         uuid_utoa(snap->snap_id));
+        if (ret) {
+            gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_DICT_SET_FAILED,
+                   "Failed to set snap "
+                   "uuid in response dictionary for %s snapshot",
+                   snap->snapname);
+            goto out;
+        }
+
+        ret = dict_set_dynstr_with_alloc(rsp_dict, "volname",
+                                         snap_volinfo->parent_volname);
+        if (ret) {
+            gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_DICT_SET_FAILED,
+                   "Failed to set parent "
+                   "volume name in response dictionary for %s snapshot",
+                   snap->snapname);
+            goto out;
+        }
+
+        ret = dict_set_dynstr_with_alloc(rsp_dict, "volid",
+                                         uuid_utoa(parent_volinfo->volume_id));
+        if (ret) {
+            gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_DICT_SET_FAILED,
+                   "Failed to set parent "
+                   "volume id in response dictionary for %s snapshot",
+                   snap->snapname);
+            goto out;
+        }
+
+        if (is_origin_glusterd(dict) == _gf_true) {
+            /* From origin glusterd check if      *
+             * any peers with snap bricks is down */
+            ret = glusterd_find_missed_snap(rsp_dict, snap_volinfo,
+                                            &priv->peers,
+                                            GF_SNAP_OPTION_TYPE_RESTORE);
+            if (ret) {
+                gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_MISSED_SNAP_GET_FAIL,
+                       "Failed to find missed snap restores");
+                goto out;
+            }
+        }
+        /* During snapshot restore, mount point for stopped snap
+         * should exist as it is required to set extended attribute.
+         */
+        ret = glusterd_recreate_vol_brick_mounts(this, snap_volinfo);
+        if (ret) {
+            gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_BRK_MNT_RECREATE_FAIL,
+                   "Failed to recreate brick mounts for %s", snap->snapname);
+            goto out;
+        }
+
+        ret = gd_restore_snap_volume(dict, rsp_dict, parent_volinfo,
+                                     snap_volinfo, volcount);
+        if (ret) {
+            /* No need to update op_errstr because it is assumed
+             * that the called function will do that in case of
+             * failure.
+             */
+            gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_SNAP_RESTORE_FAIL,
+                   "Failed to restore "
+                   "snap for %s",
+                   snapname);
+            goto out;
+        }
+
+        /* Detach the volinfo from priv->volumes, so that no new
+         * command can ref it any more and then unref it.
+         */
+        cds_list_del_init(&parent_volinfo->vol_list);
+        glusterd_volinfo_unref(parent_volinfo);
+    }
+
+    ret = 0;
+
+    /* TODO: Need to check if we need to delete the snap after the
+     * operation is successful or not. Also need to persist the state
+     * of restore operation in the store.
+     */
+out:
+    return ret;
+}
+
+/* This function is called before the actual restore takes place. It checks
+ * that the snapshot exists and has not been restored already, that every
+ * volume named in the request exists and is stopped, takes a backup of each
+ * of those volumes through glusterd_snapshot_backup_vol(), and finally
+ * publishes the details of the locally hosted snap bricks in the response
+ * dictionary.
+ *
+ * @param dict          dictionary containing snapshot restore request
+ * @param op_errstr     In case of any failure error message will be returned
+ *                      in this variable
+ * @param op_errno      receives a glusterd error code for some of the
+ *                      failures (missing snapshot, missing volume, started
+ *                      volume)
+ * @param rsp_dict      response dictionary
+ * @return              0 on success, non-zero on failure
+ */
+int32_t
+glusterd_snapshot_restore_prevalidate(dict_t *dict, char **op_errstr,
+                                      uint32_t *op_errno, dict_t *rsp_dict)
+{
+    int ret = -1;
+    int32_t i = 0;
+    int32_t volcount = 0;
+    int32_t brick_count = 0;
+    gf_boolean_t snap_restored = _gf_false;
+    char key[64] = "";
+    int keylen;
+    char *volname = NULL;
+    char *snapname = NULL;
+    glusterd_volinfo_t *volinfo = NULL;
+    glusterd_brickinfo_t *brickinfo = NULL;
+    glusterd_snap_t *snap = NULL;
+    xlator_t *this = NULL;
+
+    this = THIS;
+
+    GF_ASSERT(this);
+    GF_ASSERT(dict);
+    GF_ASSERT(op_errstr);
+    GF_VALIDATE_OR_GOTO(this->name, op_errno, out);
+    GF_ASSERT(rsp_dict);
+
+    ret = dict_get_strn(dict, "snapname", SLEN("snapname"), &snapname);
+    if (ret) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_DICT_GET_FAILED,
+               "Failed to get "
+               "snap name");
+        goto out;
+    }
+
+    snap = glusterd_find_snap_by_name(snapname);
+    if (NULL == snap) {
+        ret = gf_asprintf(op_errstr, "Snapshot (%s) does not exist", snapname);
+        /* NOTE(review): EG_SNAPEXST is reported although the snapshot was
+         * NOT found - confirm whether this is the intended error code.
+         */
+        *op_errno = EG_SNAPEXST;
+        if (ret < 0) {
+            goto out;
+        }
+        gf_msg(this->name, GF_LOG_ERROR, EINVAL, GD_MSG_SNAP_NOT_FOUND, "%s",
+               *op_errstr);
+        ret = -1;
+        goto out;
+    }
+
+    snap_restored = snap->snap_restored;
+
+    if (snap_restored) {
+        ret = gf_asprintf(op_errstr,
+                          "Snapshot (%s) is already "
+                          "restored",
+                          snapname);
+        if (ret < 0) {
+            goto out;
+        }
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_SNAPSHOT_OP_FAILED, "%s",
+               *op_errstr);
+        ret = -1;
+        goto out;
+    }
+
+    ret = dict_set_strn(rsp_dict, "snapname", SLEN("snapname"), snapname);
+    if (ret) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_DICT_SET_FAILED,
+               "Failed to set "
+               "snap name(%s)",
+               snapname);
+        goto out;
+    }
+
+    ret = dict_get_int32n(dict, "volcount", SLEN("volcount"), &volcount);
+
+    if (ret) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_DICT_GET_FAILED,
+               "Failed to get volume count");
+        goto out;
+    }
+
+    /* Snapshot restore will only work if all the volumes,
+       that are part of the snapshot, are stopped. */
+    for (i = 1; i <= volcount; ++i) {
+        keylen = snprintf(key, sizeof(key), "volname%d", i);
+        ret = dict_get_strn(dict, key, keylen, &volname);
+        if (ret) {
+            gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_DICT_GET_FAILED,
+                   "Failed to "
+                   "get volume name");
+            goto out;
+        }
+
+        ret = glusterd_volinfo_find(volname, &volinfo);
+        if (ret) {
+            ret = gf_asprintf(op_errstr,
+                              "Volume (%s) "
+                              "does not exist",
+                              volname);
+            *op_errno = EG_NOVOL;
+            if (ret < 0) {
+                goto out;
+            }
+            gf_msg(this->name, GF_LOG_ERROR, EINVAL, GD_MSG_VOL_NOT_FOUND, "%s",
+                   *op_errstr);
+            ret = -1;
+            goto out;
+        }
+
+        if (glusterd_is_volume_started(volinfo)) {
+            ret = gf_asprintf(
+                op_errstr,
+                "Volume (%s) has been "
+                "started. Volume needs to be stopped before restoring "
+                "a snapshot.",
+                volname);
+            *op_errno = EG_VOLRUN;
+            if (ret < 0) {
+                goto out;
+            }
+            gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_SNAPSHOT_OP_FAILED, "%s",
+                   *op_errstr);
+            ret = -1;
+            goto out;
+        }
+
+        /* Take backup of the volinfo folder */
+        ret = glusterd_snapshot_backup_vol(volinfo);
+        if (ret) {
+            gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_VOL_OP_FAILED,
+                   "Failed to backup "
+                   "volume backend files for %s volume",
+                   volinfo->volname);
+            goto out;
+        }
+    }
+
+    /* Get brickinfo for snap_volumes. volcount is reused here as the
+     * 1-based index of the snap volume and ends up holding the number of
+     * snap volumes, which is what gets stored under "volcount" below.
+     */
+    volcount = 0;
+    cds_list_for_each_entry(volinfo, &snap->volumes, vol_list)
+    {
+        volcount++;
+        brick_count = 0;
+
+        cds_list_for_each_entry(brickinfo, &volinfo->bricks, brick_list)
+        {
+            /* Count every brick so the indices stay global, but only
+             * publish the bricks whose uuid matches MY_UUID.
+             */
+            brick_count++;
+            if (gf_uuid_compare(brickinfo->uuid, MY_UUID))
+                continue;
+
+            keylen = snprintf(key, sizeof(key), "snap%d.brick%d.path", volcount,
+                              brick_count);
+            ret = dict_set_strn(rsp_dict, key, keylen, brickinfo->path);
+            if (ret) {
+                gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_DICT_SET_FAILED,
+                       "Failed to set %s", key);
+                goto out;
+            }
+
+            keylen = snprintf(key, sizeof(key), "snap%d.brick%d.snap_status",
+                              volcount, brick_count);
+            ret = dict_set_int32n(rsp_dict, key, keylen,
+                                  brickinfo->snap_status);
+            if (ret) {
+                gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_DICT_SET_FAILED,
+                       "Failed to set %s", key);
+                goto out;
+            }
+
+            keylen = snprintf(key, sizeof(key), "snap%d.brick%d.device_path",
+                              volcount, brick_count);
+            ret = dict_set_strn(rsp_dict, key, keylen, brickinfo->device_path);
+            if (ret) {
+                gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_DICT_SET_FAILED,
+                       "Failed to set %s", key);
+                goto out;
+            }
+
+            keylen = snprintf(key, sizeof(key), "snap%d.brick%d.fs_type",
+                              volcount, brick_count);
+            ret = dict_set_strn(rsp_dict, key, keylen, brickinfo->fstype);
+            if (ret) {
+                gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_DICT_SET_FAILED,
+                       "Failed to set %s", key);
+                goto out;
+            }
+
+            keylen = snprintf(key, sizeof(key), "snap%d.brick%d.mnt_opts",
+                              volcount, brick_count);
+            ret = dict_set_strn(rsp_dict, key, keylen, brickinfo->mnt_opts);
+            if (ret) {
+                gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_DICT_SET_FAILED,
+                       "Failed to set %s", key);
+                goto out;
+            }
+        }
+
+        keylen = snprintf(key, sizeof(key), "snap%d.brick_count", volcount);
+        ret = dict_set_int32n(rsp_dict, key, keylen, brick_count);
+        if (ret) {
+            gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_DICT_SET_FAILED,
+                   "Failed to set %s", key);
+            goto out;
+        }
+    }
+
+    ret = dict_set_int32n(rsp_dict, "volcount", SLEN("volcount"), volcount);
+    if (ret) {
+        /* "key" still holds the last snapN.brick_count key at this point,
+         * so name the key that actually failed.
+         */
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_DICT_SET_FAILED,
+               "Failed to set volcount");
+        goto out;
+    }
+
+out:
+    return ret;
+}
+
+/* Check a requested snap-max-hard-limit against the limit currently in force.
+ *
+ * A request that names a snap volume is refused outright. A non-zero @value
+ * for a named volume is compared with the limit read from the glusterd
+ * options dictionary; every other request is compared with
+ * GLUSTERD_SNAPS_MAX_HARD_LIMIT.
+ *
+ * @param dict          request dictionary (only asserted to be non-NULL)
+ * @param volname       volume the limit is meant for, or NULL
+ * @param value         requested snap-max-hard-limit
+ * @param op_errstr     receives a gf_strdup()'d error message on failure
+ * @return              0 when the value is acceptable, -1 otherwise
+ */
+int
+snap_max_hard_limits_validate(dict_t *dict, char *volname, uint64_t value,
+                              char **op_errstr)
+{
+    xlator_t *this = THIS;
+    glusterd_conf_t *conf = NULL;
+    glusterd_volinfo_t *volinfo = NULL;
+    uint64_t system_limit = GLUSTERD_SNAPS_MAX_HARD_LIMIT;
+    uint64_t upper_bound = GLUSTERD_SNAPS_MAX_HARD_LIMIT;
+    char msg[PATH_MAX] = "";
+    int ret = -1;
+
+    GF_ASSERT(this);
+    GF_ASSERT(dict);
+    GF_ASSERT(op_errstr);
+
+    conf = this->private;
+    GF_ASSERT(conf);
+
+    /* A failed lookup is not an error here; only a volume that is found
+     * and turns out to be a snap volume is refused.
+     */
+    if (volname && !glusterd_volinfo_find(volname, &volinfo) &&
+        volinfo->is_snap_volume) {
+        snprintf(msg, sizeof(msg),
+                 "%s is a snap volume. Configuring "
+                 "snap-max-hard-limit for a snap "
+                 "volume is prohibited.",
+                 volname);
+        ret = -1;
+        goto out;
+    }
+
+    /* The user may never have set "snap-max-hard-limit"; a missing key is
+     * therefore tolerated and system_limit keeps its initial value
+     * (presumably untouched by the failed lookup).
+     */
+    if (dict_get_uint64(conf->opts, GLUSTERD_STORE_KEY_SNAP_MAX_HARD_LIMIT,
+                        &system_limit)) {
+        gf_msg_debug(this->name, 0,
+                     "%s is not present in "
+                     "opts dictionary",
+                     GLUSTERD_STORE_KEY_SNAP_MAX_HARD_LIMIT);
+    }
+
+    /* A per-volume limit must not exceed the system-wide limit, so that is
+     * the bound used whenever a volume is named.
+     */
+    if (value && volname)
+        upper_bound = system_limit;
+
+    if (value > upper_bound) {
+        snprintf(msg, sizeof(msg),
+                 "Invalid snap-max-hard-limit "
+                 "%" PRIu64 ". Expected range 1 - %" PRIu64,
+                 value, upper_bound);
+        ret = -1;
+        goto out;
+    }
+
+    ret = 0;
+
+out:
+    if (ret) {
+        *op_errstr = gf_strdup(msg);
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_SNAPSHOT_OP_FAILED, "%s",
+               msg);
+    }
+    return ret;
+}
+
+/* Validate a "snapshot config" request before it is committed.
+ *
+ * Requests other than GF_SNAP_CONFIG_TYPE_SET pass without further checks.
+ * For a set request the named volume (if any) must exist, and then either
+ * the hard/soft limits are range checked or, when neither limit is given,
+ * exactly one of the auto-delete / activate-on-create booleans must be
+ * present, well formed and different from the value currently configured.
+ *
+ * @param dict          dictionary containing the config request
+ * @param op_errstr     receives a gf_strdup()'d error message on failure
+ * @param op_errno      receives an error code on most failures
+ * @return              0 when the request is acceptable, non-zero otherwise
+ */
+int
+glusterd_snapshot_config_prevalidate(dict_t *dict, char **op_errstr,
+                                     uint32_t *op_errno)
+{
+    xlator_t *this = THIS;
+    glusterd_conf_t *conf = NULL;
+    glusterd_volinfo_t *volinfo = NULL;
+    char *volname = NULL;
+    char errmsg[PATH_MAX] = "";
+    uint64_t hard = 0;
+    uint64_t soft = 0;
+    uint64_t soft_max = GLUSTERD_SNAPS_MAX_SOFT_LIMIT_PERCENT;
+    int32_t wanted = 0;
+    int32_t active = 0;
+    int cmd = 0;
+    int ret = -1;
+
+    GF_ASSERT(this);
+    GF_ASSERT(dict);
+    GF_ASSERT(op_errstr);
+    GF_VALIDATE_OR_GOTO(this->name, op_errno, out);
+
+    conf = this->private;
+    GF_ASSERT(conf);
+
+    ret = dict_get_int32n(dict, "config-command", SLEN("config-command"),
+                          &cmd);
+    if (ret) {
+        snprintf(errmsg, sizeof(errmsg), "failed to get config-command type");
+        goto out;
+    }
+
+    /* Only "set" requests carry values that need checking. */
+    if (cmd != GF_SNAP_CONFIG_TYPE_SET) {
+        ret = 0;
+        goto out;
+    }
+
+    /* The volume name is optional; when one is given it must exist. */
+    ret = dict_get_strn(dict, "volname", SLEN("volname"), &volname);
+    if (volname) {
+        ret = glusterd_volinfo_find(volname, &volinfo);
+        if (ret) {
+            *op_errno = EG_NOVOL;
+            snprintf(errmsg, sizeof(errmsg), "Volume (%s) does not exist.",
+                     volname);
+            goto out;
+        }
+    }
+
+    /* snap-max-hard-limit and snap-max-soft-limit are optional, so their
+     * absence is not treated as an error.
+     */
+    gd_get_snap_conf_values_if_present(dict, &hard, &soft);
+
+    if (hard) {
+        /* The callee fills op_errstr itself when it rejects the value. */
+        ret = snap_max_hard_limits_validate(dict, volname, hard, op_errstr);
+        if (ret) {
+            gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_HARD_LIMIT_SET_FAIL,
+                   "snap-max-hard-limit validation failed.");
+            *op_errno = EINVAL;
+            goto out;
+        }
+    }
+
+    if (soft && soft > soft_max) {
+        snprintf(errmsg, sizeof(errmsg),
+                 "Invalid "
+                 "snap-max-soft-limit "
+                 "%" PRIu64 ". Expected range 1 - %" PRIu64,
+                 soft, soft_max);
+        *op_errno = EINVAL;
+        ret = -1;
+        goto out;
+    }
+
+    /* A request that set a limit is complete at this point. */
+    if (hard || soft) {
+        ret = 0;
+        goto out;
+    }
+
+    if (dict_getn(dict, GLUSTERD_STORE_KEY_SNAP_AUTO_DELETE,
+                  SLEN(GLUSTERD_STORE_KEY_SNAP_AUTO_DELETE))) {
+        wanted = dict_get_str_boolean(dict, GLUSTERD_STORE_KEY_SNAP_AUTO_DELETE,
+                                      _gf_false);
+        if (wanted < 0) {
+            snprintf(errmsg, sizeof(errmsg),
+                     "Please enter a "
+                     "valid boolean value for auto-delete");
+            *op_errno = EINVAL;
+            ret = -1;
+            goto out;
+        }
+
+        /* auto-delete is optional and may be missing from the options
+         * dictionary, hence no error check on this lookup.
+         */
+        active = dict_get_str_boolean(
+            conf->opts, GLUSTERD_STORE_KEY_SNAP_AUTO_DELETE, _gf_false);
+
+        if (active == wanted) {
+            if (active != _gf_true)
+                snprintf(errmsg, sizeof(errmsg),
+                         "auto-delete is already disabled");
+            else
+                snprintf(errmsg, sizeof(errmsg),
+                         "auto-delete is already enabled");
+            *op_errno = EINVAL;
+            ret = -1;
+            goto out;
+        }
+    } else if (dict_getn(dict, GLUSTERD_STORE_KEY_SNAP_ACTIVATE,
+                         SLEN(GLUSTERD_STORE_KEY_SNAP_ACTIVATE))) {
+        wanted = dict_get_str_boolean(dict, GLUSTERD_STORE_KEY_SNAP_ACTIVATE,
+                                      _gf_false);
+        if (wanted < 0) {
+            snprintf(errmsg, sizeof(errmsg),
+                     "Please enter a "
+                     "valid boolean value for activate-on-create");
+            *op_errno = EINVAL;
+            ret = -1;
+            goto out;
+        }
+
+        /* activate-on-create is optional and may be missing from the
+         * options dictionary, hence no error check on this lookup.
+         */
+        active = dict_get_str_boolean(
+            conf->opts, GLUSTERD_STORE_KEY_SNAP_ACTIVATE, _gf_false);
+
+        if (active == wanted) {
+            if (active != _gf_true)
+                snprintf(errmsg, sizeof(errmsg),
+                         "activate-on-create is already disabled");
+            else
+                snprintf(errmsg, sizeof(errmsg),
+                         "activate-on-create is already enabled");
+            *op_errno = EINVAL;
+            ret = -1;
+            goto out;
+        }
+    } else {
+        snprintf(errmsg, sizeof(errmsg), "Invalid option");
+        *op_errno = EINVAL;
+        ret = -1;
+        goto out;
+    }
+
+    ret = 0;
+
+out:
+    /* Publish the local message only when this function produced one. */
+    if (ret && errmsg[0] != '\0') {
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_SNAPSHOT_OP_FAILED, "%s",
+               errmsg);
+        *op_errstr = gf_strdup(errmsg);
+    }
+
+    return ret;
+}
+
+/* RPC-handler entry point for "snapshot config".
+ *
+ * A set request is pushed through the mgmt_v3 phases (without holding
+ * volume locks when no volume is named); a display request is answered
+ * from the local node and the reply is sent to the CLI right away.
+ *
+ * @param req      RPC request object. Required for sending a response back.
+ * @param op       glusterd operation. Required for sending a response back.
+ * @param dict     pointer to dictionary which will contain both
+ *                 request and response key-pair values.
+ * @param err_str  buffer that receives an error message
+ * @param len      size of the err_str buffer
+ * @return -1 on error and 0 on success
+ */
+int
+glusterd_handle_snapshot_config(rpcsvc_request_t *req, glusterd_op_t op,
+                                dict_t *dict, char *err_str, size_t len)
+{
+    xlator_t *this = THIS;
+    char *volname = NULL;
+    int cmd = 0;
+    int32_t ret = -1;
+
+    GF_ASSERT(this);
+
+    GF_VALIDATE_OR_GOTO(this->name, req, out);
+    GF_VALIDATE_OR_GOTO(this->name, dict, out);
+
+    /* TODO : Type of lock to be taken when we are setting
+     * limits system wide
+     */
+    ret = dict_get_int32n(dict, "config-command", SLEN("config-command"),
+                          &cmd);
+    if (ret) {
+        snprintf(err_str, len, "Failed to get config-command type");
+        gf_smsg(this->name, GF_LOG_ERROR, 0, GD_MSG_DICT_GET_FAILED,
+                "Key=config-command", NULL);
+        goto out;
+    }
+
+    /* The volume name is optional: the result is overwritten by every
+     * branch below and only volname itself is inspected.
+     */
+    ret = dict_get_strn(dict, "volname", SLEN("volname"), &volname);
+
+    if (cmd == GF_SNAP_CONFIG_TYPE_SET) {
+        if (!volname) {
+            ret = dict_set_int32n(dict, "hold_vol_locks",
+                                  SLEN("hold_vol_locks"), _gf_false);
+            if (ret) {
+                gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_DICT_SET_FAILED,
+                       "Unable to set hold_vol_locks value "
+                       "as _gf_false");
+                goto out;
+            }
+        }
+        ret = glusterd_mgmt_v3_initiate_all_phases(req, op, dict);
+    } else if (cmd == GF_SNAP_CONFIG_DISPLAY) {
+        /* Reading data from local node only */
+        ret = snap_max_limits_display_commit(dict, volname, err_str, len);
+        if (ret) {
+            gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_HARD_LIMIT_SET_FAIL,
+                   "snap-max-limit "
+                   "display commit failed.");
+            goto out;
+        }
+
+        /* Everything went fine, so hand the response back to the cli */
+        ret = glusterd_op_send_cli_response(op, 0, 0, req, dict, err_str);
+        if (ret) {
+            gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_NO_CLI_RESP,
+                   "Failed to send cli "
+                   "response");
+            goto out;
+        }
+    } else {
+        gf_msg(this->name, GF_LOG_ERROR, EINVAL, GD_MSG_COMMAND_NOT_FOUND,
+               "Unknown config type");
+        ret = -1;
+    }
+
+out:
+    return ret;
+}
+
+/* Copy the per-brick snapshot details of a pre-validation response from
+ * @src into @dst.
+ *
+ * Values are read from @src under keys indexed by the position of the brick
+ * in that dictionary and written to @dst under keys indexed by the brick
+ * order stored in "volN.brickM.order"; the brick status is both read and
+ * written under the brick-order key. A volume without a "volN_brickcount"
+ * key is skipped. A brick whose brickdir, fstype or mnt_opts key is missing
+ * is abandoned at that point with a warning and the loop moves on to the next
+ * brick; any other missing key fails the whole call.
+ *
+ * @param dst   dictionary that receives the re-keyed values
+ * @param src   response dictionary to read from
+ * @return      0 on success, non-zero on failure
+ */
+int
+glusterd_snap_create_clone_pre_val_use_rsp_dict(dict_t *dst, dict_t *src)
+{
+    char *snap_brick_dir = NULL;
+    char *snap_device = NULL;
+    char key[64] = "";
+    int keylen;
+    char *value = "";
+    char snapbrckcnt[PATH_MAX] = "";
+    char snapbrckord[PATH_MAX] = "";
+    int ret = -1;
+    int64_t i = -1;
+    int64_t j = -1;
+    int64_t volume_count = 0;
+    int64_t brick_count = 0;
+    int64_t brick_order = 0;
+    xlator_t *this = NULL;
+    int32_t brick_online = 0;
+
+    this = THIS;
+    GF_ASSERT(this);
+    GF_ASSERT(dst);
+    GF_ASSERT(src);
+
+    ret = dict_get_int64(src, "volcount", &volume_count);
+    if (ret) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_DICT_GET_FAILED,
+               "failed to "
+               "get the volume count");
+        goto out;
+    }
+
+    for (i = 0; i < volume_count; i++) {
+        /* Volume indices in the keys are 1-based, brick indices 0-based */
+        snprintf(snapbrckcnt, sizeof(snapbrckcnt), "vol%" PRId64 "_brickcount",
+                 i + 1);
+        ret = dict_get_int64(src, snapbrckcnt, &brick_count);
+        if (ret) {
+            gf_msg_trace(this->name, 0,
+                         "No bricks for this volume in this dict");
+            continue;
+        }
+
+        for (j = 0; j < brick_count; j++) {
+            /* Fetching data from source dict */
+            snprintf(key, sizeof(key), "vol%" PRId64 ".brickdir%" PRId64, i + 1,
+                     j);
+            ret = dict_get_ptr(src, key, (void **)&snap_brick_dir);
+            if (ret) {
+                gf_msg(this->name, GF_LOG_WARNING, 0, GD_MSG_DICT_GET_FAILED,
+                       "Unable to fetch %s", key);
+                continue;
+            }
+
+            /* Fetching brick order from source dict */
+            snprintf(snapbrckord, sizeof(snapbrckord),
+                     "vol%" PRId64 ".brick%" PRId64 ".order", i + 1, j);
+            ret = dict_get_int64(src, snapbrckord, &brick_order);
+            if (ret) {
+                gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_DICT_GET_FAILED,
+                       "Failed to get brick order");
+                goto out;
+            }
+
+            snprintf(key, sizeof(key), "vol%" PRId64 ".brickdir%" PRId64, i + 1,
+                     brick_order);
+            ret = dict_set_dynstr_with_alloc(dst, key, snap_brick_dir);
+            if (ret) {
+                gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_DICT_SET_FAILED,
+                       "Failed to set %s", key);
+                goto out;
+            }
+
+            keylen = snprintf(key, sizeof(key), "vol%" PRId64 ".fstype%" PRId64,
+                              i + 1, j);
+            ret = dict_get_strn(src, key, keylen, &value);
+            if (ret) {
+                gf_msg(this->name, GF_LOG_WARNING, 0, GD_MSG_DICT_GET_FAILED,
+                       "Unable to fetch %s", key);
+                continue;
+            }
+
+            snprintf(key, sizeof(key), "vol%" PRId64 ".fstype%" PRId64, i + 1,
+                     brick_order);
+            ret = dict_set_dynstr_with_alloc(dst, key, value);
+            if (ret) {
+                gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_DICT_SET_FAILED,
+                       "Failed to set %s", key);
+                goto out;
+            }
+
+            keylen = snprintf(key, sizeof(key),
+                              "vol%" PRId64 ".mnt_opts%" PRId64, i + 1, j);
+            ret = dict_get_strn(src, key, keylen, &value);
+            if (ret) {
+                gf_msg(this->name, GF_LOG_WARNING, 0, GD_MSG_DICT_GET_FAILED,
+                       "Unable to fetch %s", key);
+                continue;
+            }
+
+            snprintf(key, sizeof(key), "vol%" PRId64 ".mnt_opts%" PRId64, i + 1,
+                     brick_order);
+            ret = dict_set_dynstr_with_alloc(dst, key, value);
+            if (ret) {
+                gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_DICT_SET_FAILED,
+                       "Failed to set %s", key);
+                goto out;
+            }
+
+            snprintf(key, sizeof(key),
+                     "vol%" PRId64 ".brick_snapdevice%" PRId64, i + 1, j);
+            ret = dict_get_ptr(src, key, (void **)&snap_device);
+            if (ret) {
+                gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_DICT_GET_FAILED,
+                       "Unable to fetch snap_device");
+                goto out;
+            }
+
+            snprintf(key, sizeof(key),
+                     "vol%" PRId64 ".brick_snapdevice%" PRId64, i + 1,
+                     brick_order);
+            ret = dict_set_dynstr_with_alloc(dst, key, snap_device);
+            if (ret) {
+                gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_DICT_SET_FAILED,
+                       "Failed to set %s", key);
+                goto out;
+            }
+
+            /* Unlike the keys above, the status key is indexed by brick
+             * order in the source dictionary as well, so the same key is
+             * used for the lookup and for the store.
+             */
+            keylen = snprintf(key, sizeof(key),
+                              "vol%" PRId64 ".brick%" PRId64 ".status", i + 1,
+                              brick_order);
+            ret = dict_get_int32n(src, key, keylen, &brick_online);
+            if (ret) {
+                gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_DICT_GET_FAILED,
+                       "failed to "
+                       "get the brick status");
+                goto out;
+            }
+
+            ret = dict_set_int32n(dst, key, keylen, brick_online);
+            if (ret) {
+                gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_DICT_SET_FAILED,
+                       "failed to "
+                       "set the brick status");
+                goto out;
+            }
+            brick_online = 0;
+        }
+    }
+    ret = 0;
+out:
+
+    gf_msg_trace(this->name, 0, "Returning %d", ret);
+    return ret;
+}
+
+/* Aggregate brickinfo's of the snap volumes to be restored from.
+ *
+ * For every snap volume counted by "volcount" in @src and every brick counted
+ * by its "snapN.brick_count", the path, snap_status, device_path, fs_type and
+ * mnt_opts entries are copied into @dst under the same keys. A brick whose
+ * path key is absent is skipped (it is expected in another response
+ * dictionary); once the path is present the remaining keys are mandatory.
+ *
+ * @param dst   dictionary that collects the brick details
+ * @param src   response dictionary to read from
+ * @return      0 on success (including when @src has no "volcount"),
+ *              non-zero on failure
+ */
+int32_t
+glusterd_snap_restore_use_rsp_dict(dict_t *dst, dict_t *src)
+{
+    char key[64] = "";
+    int keylen;
+    char *strvalue = NULL;
+    int32_t value = -1;
+    int32_t i = -1;
+    int32_t j = -1;
+    int32_t vol_count = -1;
+    int32_t brickcount = -1;
+    int32_t ret = -1;
+    xlator_t *this = NULL;
+
+    this = THIS;
+    GF_ASSERT(this);
+
+    if (!dst || !src) {
+        gf_msg(this->name, GF_LOG_ERROR, EINVAL, GD_MSG_INVALID_ENTRY,
+               "Source or Destination "
+               "dict is empty.");
+        goto out;
+    }
+
+    ret = dict_get_int32(src, "volcount", &vol_count);
+    if (ret) {
+        gf_msg_debug(this->name, 0, "No volumes");
+        ret = 0;
+        goto out;
+    }
+
+    for (i = 1; i <= vol_count; i++) {
+        keylen = snprintf(key, sizeof(key), "snap%d.brick_count", i);
+        ret = dict_get_int32n(src, key, keylen, &brickcount);
+        if (ret) {
+            gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_DICT_GET_FAILED,
+                   "Failed to get %s", key);
+            goto out;
+        }
+
+        for (j = 1; j <= brickcount; j++) {
+            keylen = snprintf(key, sizeof(key), "snap%d.brick%d.path", i, j);
+            ret = dict_get_strn(src, key, keylen, &strvalue);
+            if (ret) {
+                /* The brickinfo will be present in
+                 * another rsp_dict */
+                gf_msg_debug(this->name, 0, "%s not present", key);
+                ret = 0;
+                continue;
+            }
+            /* A failed store aborts the aggregation, so it is reported
+             * as an error like the failed lookups below.
+             */
+            ret = dict_set_dynstr_with_alloc(dst, key, strvalue);
+            if (ret) {
+                gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_DICT_SET_FAILED,
+                       "Failed to set %s", key);
+                goto out;
+            }
+
+            keylen = snprintf(key, sizeof(key), "snap%d.brick%d.snap_status", i,
+                              j);
+            ret = dict_get_int32n(src, key, keylen, &value);
+            if (ret) {
+                gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_DICT_GET_FAILED,
+                       "Failed to get %s", key);
+                goto out;
+            }
+            ret = dict_set_int32n(dst, key, keylen, value);
+            if (ret) {
+                gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_DICT_SET_FAILED,
+                       "Failed to set %s", key);
+                goto out;
+            }
+
+            keylen = snprintf(key, sizeof(key), "snap%d.brick%d.device_path", i,
+                              j);
+            ret = dict_get_strn(src, key, keylen, &strvalue);
+            if (ret) {
+                gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_DICT_GET_FAILED,
+                       "Failed to get %s", key);
+                goto out;
+            }
+            ret = dict_set_dynstr_with_alloc(dst, key, strvalue);
+            if (ret) {
+                gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_DICT_SET_FAILED,
+                       "Failed to set %s", key);
+                goto out;
+            }
+
+            keylen = snprintf(key, sizeof(key), "snap%d.brick%d.fs_type", i, j);
+            ret = dict_get_strn(src, key, keylen, &strvalue);
+            if (ret) {
+                gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_DICT_GET_FAILED,
+                       "Failed to get %s", key);
+                goto out;
+            }
+            ret = dict_set_dynstr_with_alloc(dst, key, strvalue);
+            if (ret) {
+                gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_DICT_SET_FAILED,
+                       "Failed to set %s", key);
+                goto out;
+            }
+
+            keylen = snprintf(key, sizeof(key), "snap%d.brick%d.mnt_opts", i,
+                              j);
+            ret = dict_get_strn(src, key, keylen, &strvalue);
+            if (ret) {
+                gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_DICT_GET_FAILED,
+                       "Failed to get %s", key);
+                goto out;
+            }
+            ret = dict_set_dynstr_with_alloc(dst, key, strvalue);
+            if (ret) {
+                gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_DICT_SET_FAILED,
+                       "Failed to set %s", key);
+                goto out;
+            }
+        }
+    }
+
+out:
+    gf_msg_trace(this->name, 0, "Returning %d", ret);
+    return ret;
+}
+
+/* Merge a peer's snapshot pre-validation response (src) into the
+ * aggregate dictionary (dst).
+ *
+ * The snapshot sub-command is read from the "type" key of dst and the
+ * work is delegated to the matching merge helper. Sub-commands that have
+ * no helper are accepted without copying anything.
+ *
+ * @param dst   aggregate dictionary; must carry the "type" key
+ * @param src   response dictionary to merge from
+ *
+ * @return      0 on success, non-zero on failure
+ */
+int
+glusterd_snap_pre_validate_use_rsp_dict(dict_t *dst, dict_t *src)
+{
+    int ret = -1;
+    int32_t snap_command = 0;
+    xlator_t *this = NULL;
+
+    this = THIS;
+    GF_ASSERT(this);
+
+    if (!dst || !src) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_INVALID_ENTRY,
+               "Source or Destination "
+               "dict is empty.");
+        goto out;
+    }
+
+    ret = dict_get_int32n(dst, "type", SLEN("type"), &snap_command);
+    if (ret) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_DICT_GET_FAILED,
+               "unable to get the type of "
+               "the snapshot command");
+        goto out;
+    }
+
+    switch (snap_command) {
+        case GF_SNAP_OPTION_TYPE_CREATE:
+        case GF_SNAP_OPTION_TYPE_CLONE:
+            ret = glusterd_snap_create_clone_pre_val_use_rsp_dict(dst, src);
+            if (ret) {
+                /* Same failure as the restore branch below, so it is
+                 * reported under the same message id. */
+                gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_RSP_DICT_USE_FAIL,
+                       "Unable to use "
+                       "rsp dict");
+                goto out;
+            }
+            break;
+        case GF_SNAP_OPTION_TYPE_RESTORE:
+            ret = glusterd_snap_restore_use_rsp_dict(dst, src);
+            if (ret) {
+                gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_RSP_DICT_USE_FAIL,
+                       "Unable to use "
+                       "rsp dict");
+                goto out;
+            }
+            break;
+        default:
+            /* Nothing to merge for the remaining sub-commands. */
+            break;
+    }
+
+    ret = 0;
+out:
+    gf_msg_debug(this->name, 0, "Returning %d", ret);
+    return ret;
+}
+
+/* Store in dict whether the given brick's process is currently running.
+ *
+ * The brick pidfile path is built with GLUSTERD_GET_BRICK_PIDFILE and
+ * probed with gf_is_service_running(); the result is stored as an int32.
+ *
+ * @param dict          dictionary that receives the status
+ * @param volinfo       volume the brick belongs to
+ * @param brickinfo     brick to probe
+ * @param key_prefix    despite its name this is used verbatim as the
+ *                      complete dictionary key; must not be NULL
+ *
+ * @return              0 on success, non-zero on failure
+ */
+int
+glusterd_add_brick_status_to_dict(dict_t *dict, glusterd_volinfo_t *volinfo,
+                                  glusterd_brickinfo_t *brickinfo,
+                                  char *key_prefix)
+{
+    char pidfile[PATH_MAX] = "";
+    int32_t brick_online = 0;
+    pid_t pid = 0;
+    xlator_t *this = NULL;
+    glusterd_conf_t *conf = NULL;
+    int ret = -1;
+
+    GF_ASSERT(dict);
+    GF_ASSERT(volinfo);
+    GF_ASSERT(brickinfo);
+
+    this = THIS;
+    GF_ASSERT(this);
+    conf = this->private;
+    GF_ASSERT(conf);
+
+    if (!key_prefix) {
+        gf_msg(this->name, GF_LOG_ERROR, EINVAL, GD_MSG_INVALID_ENTRY,
+               "key prefix is NULL");
+        goto out;
+    }
+
+    GLUSTERD_GET_BRICK_PIDFILE(pidfile, volinfo, brickinfo, conf);
+
+    /* pid is only an out-parameter of the probe; it is not used here. */
+    brick_online = gf_is_service_running(pidfile, &pid);
+
+    ret = dict_set_int32(dict, key_prefix, brick_online);
+    if (ret) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_DICT_SET_FAILED,
+               "Failed to set %s", key_prefix);
+        goto out;
+    }
+
+    ret = 0;
+
+out:
+    return ret;
+}
+
+/* This function will check whether the given device
+ * is a thinly provisioned LV or not.
+ *
+ * It runs "lvs --noheadings -o pool_lv <device>" and treats a non-blank
+ * first line of output as the name of the thin pool backing the LV.
+ *
+ * @param device        LV device path
+ * @param op_errno      set to EG_NOTTHINP whenever _gf_false is returned,
+ *                      including when the lvs query itself fails; left
+ *                      untouched when the LV is thin
+ *
+ * @return              _gf_true if LV is thin else _gf_false
+ */
+gf_boolean_t
+glusterd_is_thinp_brick(char *device, uint32_t *op_errno)
+{
+    int ret = -1;
+    char msg[1024] = "";
+    char pool_name[PATH_MAX] = "";
+    char *ptr = NULL;
+    xlator_t *this = NULL;
+    runner_t runner = {
+        0,
+    };
+    gf_boolean_t is_thin = _gf_false;
+
+    this = THIS;
+
+    GF_VALIDATE_OR_GOTO("glusterd", this, out);
+    GF_VALIDATE_OR_GOTO(this->name, device, out);
+    GF_VALIDATE_OR_GOTO(this->name, op_errno, out);
+
+    snprintf(msg, sizeof(msg), "Get thin pool name for device %s", device);
+
+    runinit(&runner);
+
+    runner_add_args(&runner, "lvs", "--noheadings", "-o", "pool_lv", device,
+                    NULL);
+    runner_redir(&runner, STDOUT_FILENO, RUN_PIPE);
+    runner_log(&runner, this->name, GF_LOG_DEBUG, msg);
+
+    ret = runner_start(&runner);
+    if (ret == -1) {
+        gf_msg(this->name, GF_LOG_ERROR, errno, GD_MSG_TPOOL_GET_FAIL,
+               "Failed to get thin pool "
+               "name for device %s",
+               device);
+        runner_end(&runner);
+        goto out;
+    }
+
+    ptr = fgets(pool_name, sizeof(pool_name),
+                runner_chio(&runner, STDOUT_FILENO));
+    if (!ptr || !strlen(pool_name)) {
+        gf_msg(this->name, GF_LOG_ERROR, errno, GD_MSG_TPOOL_GET_FAIL,
+               "Failed to get pool name "
+               "for device %s",
+               device);
+        runner_end(&runner);
+        goto out;
+    }
+
+    runner_end(&runner);
+
+    /* Trim all the whitespaces. */
+    ptr = gf_trim(pool_name);
+
+    /* If the LV has thin pool associated with this
+     * then it is a thinly provisioned LV else it is
+     * regular LV */
+    if (0 != ptr[0]) {
+        is_thin = _gf_true;
+    }
+
+out:
+    /* The validation macros above jump here when op_errno itself is NULL,
+     * so it must be checked before it is written. */
+    if (!is_thin && op_errno)
+        *op_errno = EG_NOTTHINP;
+
+    return is_thin;
+}
+
+/* Pre-validate the bricks of one volume for snapshot create or clone and
+ * publish per-brick details in rsp_dict.
+ *
+ * Bricks whose uuid differs from MY_UUID are only counted in brick_order.
+ * For every other brick the function checks that it is started, resolves
+ * its mount device, (create only) checks that the device is a thin LV,
+ * builds the snapshot device path and stores these keys in rsp_dict:
+ *   vol<i>.brick_snapdevice<n>, vol<i>.fstype<n>, vol<i>.mnt_opts<n>,
+ *   vol<i>.brickdir<n>, vol<i>.brick<n>.order, vol<i>.brick<order>.status
+ * and, after the loop, vol<volcount>_brickcount.
+ *
+ * @param rsp_dict      dictionary that receives the keys listed above
+ * @param flags         tested for GF_CLI_FLAG_OP_FORCE; with it a stopped
+ *                      brick is skipped on create instead of failing
+ * @param snapname      must be non-NULL; used in an error message
+ * @param err_str       caller buffer for the error text; written with a
+ *                      PATH_MAX limit
+ * @param snap_volname  passed to glusterd_build_snap_device_path()
+ * @param volcount      used only to build the "_brickcount" key
+ * @param volinfo       volume whose bricks are walked; must be non-NULL
+ * @param loglevel      set to GF_LOG_WARNING when the snapshot device path
+ *                      cannot be built
+ * @param clone         non-zero for snapshot clone, zero for create
+ * @param op_errno      set to EG_BRCKDWN when a brick is not running; also
+ *                      handed to glusterd_is_thinp_brick()
+ *
+ * @return              0 on success, non-zero on failure
+ */
+int
+glusterd_snap_create_clone_common_prevalidate(
+    dict_t *rsp_dict, int flags, char *snapname, char *err_str,
+    char *snap_volname, int64_t volcount, glusterd_volinfo_t *volinfo,
+    gf_loglevel_t *loglevel, int clone, uint32_t *op_errno)
+{
+    char *device = NULL;
+    char *orig_device = NULL;
+    char key[128] = "";
+    int ret = -1;
+    /* NOTE(review): i is never modified, so every per-brick key below is
+     * built as "vol1...." while the brick-count key uses volcount. Confirm
+     * this is intended given the single-volume snapshot limitation. */
+    int64_t i = 1;
+    /* brick_order counts every brick of the volume; brick_count counts
+     * only the bricks handled on this node. */
+    int64_t brick_order = 0;
+    int64_t brick_count = 0;
+    xlator_t *this = NULL;
+    glusterd_conf_t *conf = NULL;
+    glusterd_brickinfo_t *brickinfo = NULL;
+    int32_t len = 0;
+
+    this = THIS;
+    conf = this->private;
+    GF_ASSERT(conf);
+    GF_VALIDATE_OR_GOTO(this->name, op_errno, out);
+
+    if (!snapname || !volinfo) {
+        gf_msg(this->name, GF_LOG_ERROR, EINVAL, GD_MSG_INVALID_ENTRY,
+               "Failed to validate "
+               "snapname or volume information");
+        ret = -1;
+        goto out;
+    }
+
+    cds_list_for_each_entry(brickinfo, &volinfo->bricks, brick_list)
+    {
+        /* Non-zero compare result: the brick's uuid is not MY_UUID
+         * (presumably a brick hosted on another node), so only its
+         * position is counted. */
+        if (gf_uuid_compare(brickinfo->uuid, MY_UUID)) {
+            brick_order++;
+            continue;
+        }
+
+        if (!glusterd_is_brick_started(brickinfo)) {
+            /* Forced snapshot create tolerates a stopped brick: it is
+             * counted but no device details are recorded for it. */
+            if (!clone && (flags & GF_CLI_FLAG_OP_FORCE)) {
+                gf_msg(this->name, GF_LOG_WARNING, 0, GD_MSG_BRICK_DISCONNECTED,
+                       "brick %s:%s is not started", brickinfo->hostname,
+                       brickinfo->path);
+                brick_order++;
+                brick_count++;
+                continue;
+            }
+            if (!clone) {
+                snprintf(err_str, PATH_MAX,
+                         "One or more bricks are not running. "
+                         "Please run volume status command to see "
+                         "brick status.\n"
+                         "Please start the stopped brick "
+                         "and then issue snapshot create "
+                         "command or use [force] option in "
+                         "snapshot create to override this "
+                         "behavior.");
+                gf_smsg(this->name, GF_LOG_ERROR, errno,
+                        GD_MSG_BRICK_NOT_RUNNING,
+                        "Please run volume status command to see brick "
+                        "status.Please start the stopped brick and then issue "
+                        "snapshot create command or use 'force' option in "
+                        "snapshot create to override this behavior.",
+                        NULL);
+            } else {
+                snprintf(err_str, PATH_MAX,
+                         "One or more bricks are not running. "
+                         "Please run snapshot status command to see "
+                         "brick status.\n"
+                         "Please start the stopped brick "
+                         "and then issue snapshot clone "
+                         "command ");
+                gf_smsg(this->name, GF_LOG_ERROR, errno,
+                        GD_MSG_BRICK_NOT_RUNNING,
+                        "Please run snapshot status command to see brick "
+                        "status. Please start the stopped brick and then issue "
+                        "snapshot clone command.",
+                        NULL);
+            }
+            *op_errno = EG_BRCKDWN;
+            ret = -1;
+            goto out;
+        }
+
+        /* orig_device is heap memory: it is released with GF_FREE further
+         * down on success and at the out label on failure. */
+        orig_device = glusterd_get_brick_mount_device(brickinfo->path);
+        if (!orig_device) {
+            len = snprintf(err_str, PATH_MAX,
+                           "getting device name for the brick "
+                           "%s:%s failed",
+                           brickinfo->hostname, brickinfo->path);
+            if (len < 0) {
+                strcpy(err_str, "<error>");
+            }
+            gf_smsg(this->name, GF_LOG_ERROR, errno,
+                    GD_MSG_BRK_MNTPATH_GET_FAIL,
+                    "Brick_hostname=%s, Brick_path=%s", brickinfo->hostname,
+                    brickinfo->path, NULL);
+            ret = -1;
+            goto out;
+        }
+        /* The thin-provisioning requirement is enforced for create only. */
+        if (!clone) {
+            if (!glusterd_is_thinp_brick(orig_device, op_errno)) {
+                snprintf(err_str, PATH_MAX,
+                         "Snapshot is supported only for "
+                         "thin provisioned LV. Ensure that "
+                         "all bricks of %s are thinly "
+                         "provisioned LV.",
+                         volinfo->volname);
+                gf_smsg(this->name, GF_LOG_ERROR, errno,
+                        GD_MSG_SNAPSHOT_NOT_THIN_PROVISIONED,
+                        "Ensure that all bricks of volume are thinly "
+                        "provisioned LV, Volume=%s",
+                        volinfo->volname, NULL);
+                ret = -1;
+                goto out;
+            }
+        }
+
+        device = glusterd_build_snap_device_path(orig_device, snap_volname,
+                                                 brick_count);
+        if (!device) {
+            snprintf(err_str, PATH_MAX,
+                     "cannot copy the snapshot device "
+                     "name (volname: %s, snapname: %s)",
+                     volinfo->volname, snapname);
+            gf_smsg(this->name, GF_LOG_ERROR, errno,
+                    GD_MSG_SNAP_DEVICE_NAME_GET_FAIL, "Volname=%s, Snapname=%s",
+                    volinfo->volname, snapname, NULL);
+            *loglevel = GF_LOG_WARNING;
+            ret = -1;
+            goto out;
+        }
+
+        GF_FREE(orig_device);
+        orig_device = NULL;
+
+        snprintf(key, sizeof(key), "vol%" PRId64 ".brick_snapdevice%" PRId64, i,
+                 brick_count);
+        ret = dict_set_dynstr_with_alloc(rsp_dict, key, device);
+        if (ret) {
+            gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_DICT_SET_FAILED,
+                   "Failed to set %s", key);
+            goto out;
+        }
+
+        /* Best effort: a failure here is only logged; ret is overwritten
+         * by the next dict_set call. */
+        ret = glusterd_update_mntopts(brickinfo->path, brickinfo);
+        if (ret) {
+            gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_BRK_MOUNTOPTS_FAIL,
+                   "Failed to "
+                   "update mount options for %s brick",
+                   brickinfo->path);
+        }
+
+        snprintf(key, sizeof(key), "vol%" PRId64 ".fstype%" PRId64, i,
+                 brick_count);
+        ret = dict_set_dynstr_with_alloc(rsp_dict, key, brickinfo->fstype);
+        if (ret) {
+            gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_DICT_SET_FAILED,
+                   "Failed to set %s", key);
+            goto out;
+        }
+
+        snprintf(key, sizeof(key), "vol%" PRId64 ".mnt_opts%" PRId64, i,
+                 brick_count);
+        ret = dict_set_dynstr_with_alloc(rsp_dict, key, brickinfo->mnt_opts);
+        if (ret) {
+            gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_DICT_SET_FAILED,
+                   "Failed to set %s", key);
+            goto out;
+        }
+
+        snprintf(key, sizeof(key), "vol%" PRId64 ".brickdir%" PRId64, i,
+                 brick_count);
+        ret = dict_set_dynstr_with_alloc(rsp_dict, key, brickinfo->mount_dir);
+        if (ret) {
+            gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_DICT_SET_FAILED,
+                   "Failed to set %s", key);
+            goto out;
+        }
+
+        snprintf(key, sizeof(key) - 1, "vol%" PRId64 ".brick%" PRId64 ".order",
+                 i, brick_count);
+        ret = dict_set_int64(rsp_dict, key, brick_order);
+        if (ret) {
+            gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_DICT_SET_FAILED,
+                   "Failed to set %s", key);
+            goto out;
+        }
+
+        /* NOTE(review): this key is indexed by brick_order, unlike the
+         * keys above which use brick_count - confirm against the code
+         * that reads "vol<i>.brick<n>.status". */
+        snprintf(key, sizeof(key), "vol%" PRId64 ".brick%" PRId64 ".status", i,
+                 brick_order);
+
+        ret = glusterd_add_brick_status_to_dict(rsp_dict, volinfo, brickinfo,
+                                                key);
+        if (ret) {
+            gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_DICT_SET_FAILED,
+                   "failed to "
+                   "add brick status to dict");
+            goto out;
+        }
+        brick_count++;
+        brick_order++;
+        /* Release the device path now so the cleanup at out cannot free
+         * it a second time. */
+        if (device) {
+            GF_FREE(device);
+            device = NULL;
+        }
+    }
+    snprintf(key, sizeof(key) - 1, "vol%" PRId64 "_brickcount", volcount);
+    ret = dict_set_int64(rsp_dict, key, brick_count);
+    if (ret) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_DICT_SET_FAILED,
+               "Failed to set %s", key);
+        goto out;
+    }
+    ret = 0;
+out:
+    /* Free whatever an early exit left allocated. */
+    if (orig_device)
+        GF_FREE(orig_device);
+
+    if (device)
+        GF_FREE(device);
+
+    return ret;
+}
+
+/* Pre-validate a snapshot clone request.
+ *
+ * Checks that no volume named after the clone exists, that the source
+ * snapshot exists, has a volume and is started, then runs the common
+ * brick pre-validation on the snapshot volume and stores "volcount" in
+ * rsp_dict.
+ *
+ * @param dict          request dictionary; read for "clonename",
+ *                      "snapname" and "vol1_volid"
+ * @param op_errstr     receives a gf_strdup()'d copy of the error text
+ *                      when the function fails with a message
+ * @param rsp_dict      response dictionary filled for later phases
+ * @param op_errno      set to EG_VOLEXST or EG_VOLSTP on the matching
+ *                      failures; may also be set by the common
+ *                      pre-validation
+ *
+ * @return              0 on success, non-zero on failure
+ */
+int
+glusterd_snapshot_clone_prevalidate(dict_t *dict, char **op_errstr,
+                                    dict_t *rsp_dict, uint32_t *op_errno)
+{
+    char *clonename = NULL;
+    char *snapname = NULL;
+    char device_name[64] = "";
+    glusterd_snap_t *snap = NULL;
+    char err_str[PATH_MAX] = "";
+    int ret = -1;
+    int64_t volcount = 1;
+    glusterd_volinfo_t *snap_vol = NULL;
+    xlator_t *this = NULL;
+    uuid_t *snap_volid = NULL;
+    gf_loglevel_t loglevel = GF_LOG_ERROR;
+    glusterd_volinfo_t *volinfo = NULL;
+
+    this = THIS;
+    GF_ASSERT(op_errstr);
+    GF_ASSERT(dict);
+    GF_VALIDATE_OR_GOTO(this->name, op_errno, out);
+
+    ret = dict_get_strn(dict, "clonename", SLEN("clonename"), &clonename);
+    if (ret) {
+        snprintf(err_str, sizeof(err_str),
+                 "Failed to "
+                 "get the clone name");
+        goto out;
+    }
+
+    ret = dict_get_strn(dict, "snapname", SLEN("snapname"), &snapname);
+    if (ret) {
+        snprintf(err_str, sizeof(err_str), "Failed to get snapname");
+        goto out;
+    }
+
+    /* A successful lookup means the clone name is already taken. */
+    ret = glusterd_volinfo_find(clonename, &volinfo);
+    if (!ret) {
+        ret = -1;
+        snprintf(err_str, sizeof(err_str),
+                 "Volume with name:%s "
+                 "already exists",
+                 clonename);
+        *op_errno = EG_VOLEXST;
+        goto out;
+    }
+    /* need to find snap volinfo*/
+    snap = glusterd_find_snap_by_name(snapname);
+    if (!snap) {
+        ret = -1;
+        snprintf(err_str, sizeof(err_str),
+                 "Failed to find :%s "
+                 "snap",
+                 snapname);
+        goto out;
+    }
+
+    /* TODO : As of now there is only one volume in snapshot.
+     * Change this when multiple volume snapshot is introduced
+     */
+    /* list_entry() on an empty list yields a pointer derived from the
+     * list head itself, never NULL, so emptiness is tested on the list
+     * before the first entry is taken. */
+    if (snap->volumes.next == &snap->volumes) {
+        ret = -1;
+        gf_msg("glusterd", GF_LOG_ERROR, 0, GD_MSG_VOLINFO_GET_FAIL,
+               "Failed to get snap "
+               "volinfo %s",
+               snap->snapname);
+        goto out;
+    }
+    snap_vol = list_entry(snap->volumes.next, glusterd_volinfo_t, vol_list);
+
+    if (!glusterd_is_volume_started(snap_vol)) {
+        /* Set the failure explicitly instead of relying on the non-zero
+         * value left behind by the volinfo lookup above. */
+        ret = -1;
+        snprintf(err_str, sizeof(err_str),
+                 "Snapshot %s is "
+                 "not activated",
+                 snap->snapname);
+        loglevel = GF_LOG_WARNING;
+        *op_errno = EG_VOLSTP;
+        goto out;
+    }
+
+    ret = dict_get_bin(dict, "vol1_volid", (void **)&snap_volid);
+    if (ret) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_DICT_GET_FAILED,
+               "Unable to fetch snap_volid");
+        goto out;
+    }
+
+    GLUSTERD_GET_UUID_NOHYPHEN(device_name, *snap_volid);
+
+    /* Adding snap bricks mount paths to the dict */
+    ret = glusterd_snap_create_clone_common_prevalidate(
+        rsp_dict, 0, snapname, err_str, device_name, volcount, snap_vol,
+        &loglevel, 1, op_errno);
+    if (ret) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_PRE_VALIDATION_FAIL,
+               "Failed to pre validate");
+        goto out;
+    }
+
+    ret = dict_set_int64(rsp_dict, "volcount", volcount);
+    if (ret) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_DICT_SET_FAILED,
+               "Failed to set volcount");
+        goto out;
+    }
+
+out:
+
+    /* Only failures that produced a message are reported to the caller. */
+    if (ret && err_str[0] != '\0') {
+        gf_msg(this->name, loglevel, 0, GD_MSG_SNAP_CLONE_PREVAL_FAILED, "%s",
+               err_str);
+        *op_errstr = gf_strdup(err_str);
+    }
+
+    gf_msg_trace(this->name, 0, "Returning %d", ret);
+    return ret;
+}
+
+/* Pre-validate a snapshot create request.
+ *
+ * Validates the request-wide keys ("volcount", "snapname", "description",
+ * "flags"), rejects a snapshot name that already exists, and then for
+ * every requested volume checks that it exists, is started, is not being
+ * rebalanced, has no active geo-replication session, is not itself a snap
+ * volume and is below the effective snapshot hard limit, before running
+ * the common brick pre-validation. On success "volcount" is stored in
+ * rsp_dict.
+ *
+ * @param dict          request dictionary
+ * @param op_errstr     receives a gf_strdup()'d copy of the error text
+ *                      when the function fails with a message
+ * @param rsp_dict      response dictionary filled for later phases
+ * @param op_errno      set to an EG_* code on the failures that have one
+ *
+ * @return              0 on success, non-zero on failure
+ */
+int
+glusterd_snapshot_create_prevalidate(dict_t *dict, char **op_errstr,
+                                     dict_t *rsp_dict, uint32_t *op_errno)
+{
+    char *volname = NULL;
+    char *snapname = NULL;
+    char key[64] = "";
+    int keylen;
+    char snap_volname[64] = "";
+    char err_str[PATH_MAX] = "";
+    int ret = -1;
+    int64_t i = 0;
+    int64_t volcount = 0;
+    glusterd_volinfo_t *volinfo = NULL;
+    xlator_t *this = NULL;
+    uuid_t *snap_volid = NULL;
+    gf_loglevel_t loglevel = GF_LOG_ERROR;
+    glusterd_conf_t *conf = NULL;
+    /* Unsigned to match the values it is assigned from, the counter it is
+     * compared with and the PRIu64 conversion it is printed with. */
+    uint64_t effective_max_limit = 0;
+    int flags = 0;
+    uint64_t opt_hard_max = GLUSTERD_SNAPS_MAX_HARD_LIMIT;
+    char *description = NULL;
+
+    this = THIS;
+    GF_ASSERT(op_errstr);
+    GF_ASSERT(dict);
+    conf = this->private;
+    GF_ASSERT(conf);
+    GF_VALIDATE_OR_GOTO(this->name, op_errno, out);
+
+    ret = dict_get_int64(dict, "volcount", &volcount);
+    if (ret) {
+        snprintf(err_str, sizeof(err_str),
+                 "Failed to "
+                 "get the volume count");
+        goto out;
+    }
+    if (volcount <= 0) {
+        snprintf(err_str, sizeof(err_str),
+                 "Invalid volume count %" PRId64 " supplied", volcount);
+        ret = -1;
+        goto out;
+    }
+
+    ret = dict_get_strn(dict, "snapname", SLEN("snapname"), &snapname);
+    if (ret) {
+        snprintf(err_str, sizeof(err_str), "Failed to get snapname");
+        goto out;
+    }
+
+    /* The description is optional, so a failed lookup is not an error;
+     * only a description that is present but empty is rejected. */
+    ret = dict_get_strn(dict, "description", SLEN("description"), &description);
+    if (description && !(*description)) {
+        /* description should have a non-null value */
+        ret = -1;
+        snprintf(err_str, sizeof(err_str),
+                 "Snapshot cannot be "
+                 "created with empty description");
+        goto out;
+    }
+
+    ret = dict_get_int32n(dict, "flags", SLEN("flags"), &flags);
+    if (ret) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_DICT_GET_FAILED,
+               "Unable to get flags");
+        goto out;
+    }
+
+    if (glusterd_find_snap_by_name(snapname)) {
+        ret = -1;
+        snprintf(err_str, sizeof(err_str),
+                 "Snapshot %s already "
+                 "exists",
+                 snapname);
+        *op_errno = EG_SNAPEXST;
+        goto out;
+    }
+
+    for (i = 1; i <= volcount; i++) {
+        keylen = snprintf(key, sizeof(key), "volname%" PRId64, i);
+        ret = dict_get_strn(dict, key, keylen, &volname);
+        if (ret) {
+            snprintf(err_str, sizeof(err_str), "failed to get volume name");
+            goto out;
+        }
+        ret = glusterd_volinfo_find(volname, &volinfo);
+        if (ret) {
+            snprintf(err_str, sizeof(err_str), "Volume (%s) does not exist ",
+                     volname);
+            *op_errno = EG_NOVOL;
+            goto out;
+        }
+
+        /* Every state check below fails the request, so the failure code
+         * is set once up front. */
+        ret = -1;
+        if (!glusterd_is_volume_started(volinfo)) {
+            snprintf(err_str, sizeof(err_str),
+                     "volume %s is "
+                     "not started",
+                     volinfo->volname);
+            loglevel = GF_LOG_WARNING;
+            *op_errno = EG_VOLSTP;
+            goto out;
+        }
+
+        if (glusterd_is_defrag_on(volinfo)) {
+            snprintf(err_str, sizeof(err_str),
+                     "rebalance process is running for the "
+                     "volume %s",
+                     volname);
+            loglevel = GF_LOG_WARNING;
+            *op_errno = EG_RBALRUN;
+            goto out;
+        }
+
+        if (gd_vol_is_geo_rep_active(volinfo)) {
+            snprintf(err_str, sizeof(err_str),
+                     "geo-replication session is running for "
+                     "the volume %s. Session needs to be "
+                     "stopped before taking a snapshot.",
+                     volname);
+            loglevel = GF_LOG_WARNING;
+            *op_errno = EG_GEOREPRUN;
+            goto out;
+        }
+
+        if (volinfo->is_snap_volume == _gf_true) {
+            snprintf(err_str, sizeof(err_str), "Volume %s is a snap volume",
+                     volname);
+            loglevel = GF_LOG_WARNING;
+            *op_errno = EG_ISSNAP;
+            goto out;
+        }
+
+        /* "snap-max-hard-limit" might not be set by user explicitly,
+         * in that case it's better to consider the default value.
+         * Hence not erroring out if Key is not found.
+         */
+        ret = dict_get_uint64(
+            conf->opts, GLUSTERD_STORE_KEY_SNAP_MAX_HARD_LIMIT, &opt_hard_max);
+        if (ret) {
+            ret = 0;
+            gf_msg_debug(this->name, 0,
+                         "%s is not present "
+                         "in opts dictionary",
+                         GLUSTERD_STORE_KEY_SNAP_MAX_HARD_LIMIT);
+        }
+
+        /* The effective limit is the smaller of the per-volume limit and
+         * the limit taken from conf->opts (or its default). */
+        if (volinfo->snap_max_hard_limit < opt_hard_max)
+            effective_max_limit = volinfo->snap_max_hard_limit;
+        else
+            effective_max_limit = opt_hard_max;
+
+        if (volinfo->snap_count >= effective_max_limit) {
+            ret = -1;
+            snprintf(err_str, sizeof(err_str),
+                     "The number of existing snaps has reached "
+                     "the effective maximum limit of %" PRIu64
+                     ", "
+                     "for the volume (%s). Please delete few "
+                     "snapshots before taking further snapshots.",
+                     effective_max_limit, volname);
+            loglevel = GF_LOG_WARNING;
+            *op_errno = EG_HRDLMT;
+            goto out;
+        }
+
+        snprintf(key, sizeof(key), "vol%" PRId64 "_volid", i);
+        ret = dict_get_bin(dict, key, (void **)&snap_volid);
+        if (ret) {
+            gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_DICT_GET_FAILED,
+                   "Unable to fetch snap_volid");
+            goto out;
+        }
+
+        /* snap volume uuid is used as lvm snapshot name.
+           This will avoid restrictions on snapshot names
+           provided by user */
+        GLUSTERD_GET_UUID_NOHYPHEN(snap_volname, *snap_volid);
+
+        ret = glusterd_snap_create_clone_common_prevalidate(
+            rsp_dict, flags, snapname, err_str, snap_volname, i, volinfo,
+            &loglevel, 0, op_errno);
+        if (ret) {
+            gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_PRE_VALIDATION_FAIL,
+                   "Failed to pre validate");
+            goto out;
+        }
+    }
+
+    ret = dict_set_int64(rsp_dict, "volcount", volcount);
+    if (ret) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_DICT_SET_FAILED,
+               "Failed to set volcount");
+        goto out;
+    }
+
+    ret = 0;
+
+out:
+    /* Only failures that produced a message are reported to the caller. */
+    if (ret && err_str[0] != '\0') {
+        gf_msg(this->name, loglevel, 0, GD_MSG_SNAPSHOT_OP_FAILED, "%s",
+               err_str);
+        *op_errstr = gf_strdup(err_str);
+    }
+
+    gf_msg_trace(this->name, 0, "Returning %d", ret);
+    return ret;
+}
+
+/* Allocate and initialise an empty snapshot object.
+ *
+ * The object is zero-allocated, its lock and both list heads are
+ * initialised and its status is set to GD_SNAP_STATUS_INIT.
+ *
+ * @return      the new object, or NULL when the allocation or the lock
+ *              initialisation fails
+ */
+glusterd_snap_t *
+glusterd_new_snap_object()
+{
+    glusterd_snap_t *snap = NULL;
+
+    snap = GF_CALLOC(1, sizeof(*snap), gf_gld_mt_snap_t);
+    if (!snap)
+        return NULL;
+
+    if (LOCK_INIT(&snap->lock)) {
+        gf_msg(THIS->name, GF_LOG_ERROR, 0, GD_MSG_LOCK_INIT_FAILED,
+               "Failed initiating"
+               " snap lock");
+        GF_FREE(snap);
+        return NULL;
+    }
+
+    CDS_INIT_LIST_HEAD(&snap->snap_list);
+    CDS_INIT_LIST_HEAD(&snap->volumes);
+    snap->snapname[0] = 0;
+    snap->snap_status = GD_SNAP_STATUS_INIT;
+
+    return snap;
+}
+
+/* Link a snapshot volume into both lists that track it: the volume list of
+ * the snapshot object it belongs to, and the ordered snap-volume list of
+ * the origin volume (whose snap_count is incremented under its lock).
+ *
+ * Returns 0 on success and -1 when either argument is NULL. */
+int32_t
+glusterd_list_add_snapvol(glusterd_volinfo_t *origin_vol,
+                          glusterd_volinfo_t *snap_vol)
+{
+    glusterd_snap_t *owner = NULL;
+
+    GF_VALIDATE_OR_GOTO("glusterd", origin_vol, out);
+    GF_VALIDATE_OR_GOTO("glusterd", snap_vol, out);
+
+    owner = snap_vol->snapshot;
+    GF_ASSERT(owner);
+
+    /* The snapshot object keeps its volumes in insertion order. */
+    cds_list_add_tail(&snap_vol->vol_list, &owner->volumes);
+
+    /* The origin volume keeps its snapshots ordered by
+     * glusterd_compare_snap_vol_time. */
+    LOCK(&origin_vol->lock);
+    {
+        glusterd_list_add_order(&snap_vol->snapvol_list,
+                                &origin_vol->snap_volumes,
+                                glusterd_compare_snap_vol_time);
+        origin_vol->snap_count++;
+    }
+    UNLOCK(&origin_vol->lock);
+
+    gf_msg_debug(THIS->name, 0, "Snapshot %s added to the list",
+                 owner->snapname);
+    return 0;
+
+out:
+    return -1;
+}
+
+/* Look up a snapshot object by name in the glusterd snapshot list.
+ * Returns the first snapshot whose name equals snapname, or NULL when
+ * there is none. */
+glusterd_snap_t *
+glusterd_find_snap_by_name(char *snapname)
+{
+    glusterd_snap_t *cursor = NULL;
+    glusterd_snap_t *match = NULL;
+    glusterd_conf_t *conf = THIS->private;
+
+    GF_ASSERT(conf);
+    GF_ASSERT(snapname);
+
+    cds_list_for_each_entry(cursor, &conf->snapshots, snap_list)
+    {
+        if (strcmp(cursor->snapname, snapname) != 0)
+            continue;
+
+        gf_msg_debug(THIS->name, 0,
+                     "Found "
+                     "snap %s (%s)",
+                     cursor->snapname, uuid_utoa(cursor->snap_id));
+        match = cursor;
+        break;
+    }
+
+    return match;
+}
+
+/* Look up a snapshot object by uuid in the glusterd snapshot list.
+ * Returns the first snapshot whose snap_id equals snap_id, or NULL when
+ * snap_id is the null uuid or no snapshot matches. */
+glusterd_snap_t *
+glusterd_find_snap_by_id(uuid_t snap_id)
+{
+    glusterd_snap_t *cursor = NULL;
+    glusterd_snap_t *match = NULL;
+    glusterd_conf_t *conf = THIS->private;
+
+    GF_ASSERT(conf);
+
+    /* A null uuid can never identify a snapshot. */
+    if (gf_uuid_is_null(snap_id))
+        return NULL;
+
+    cds_list_for_each_entry(cursor, &conf->snapshots, snap_list)
+    {
+        if (gf_uuid_compare(cursor->snap_id, snap_id) != 0)
+            continue;
+
+        gf_msg_debug(THIS->name, 0,
+                     "Found "
+                     "snap %s (%s)",
+                     cursor->snapname, uuid_utoa(cursor->snap_id));
+        match = cursor;
+        break;
+    }
+
+    return match;
+}
+
+/* Remove the LVM snapshot that backs one brick of a snapshot volume.
+ *
+ * Steps, in order: ask a running brick process to terminate, unmount the
+ * brick (only when it is mounted at mount_pt, with up to three attempts),
+ * then run LVM_REMOVE -f on snap_device.
+ *
+ * @param snap_vol      snapshot volume the brick belongs to
+ * @param brickinfo     brick whose snapshot is removed; NULL is rejected
+ * @param mount_pt      mount point expected for the brick
+ * @param snap_device   device handed to LVM_REMOVE
+ *
+ * @return              0 on success, and also when the unmount keeps
+ *                      failing (the LV is then left in place); -1 when
+ *                      brickinfo is NULL; otherwise the non-zero result of
+ *                      the remove command
+ */
+int
+glusterd_do_lvm_snapshot_remove(glusterd_volinfo_t *snap_vol,
+                                glusterd_brickinfo_t *brickinfo,
+                                const char *mount_pt, const char *snap_device)
+{
+    int ret = -1;
+    xlator_t *this = NULL;
+    glusterd_conf_t *priv = NULL;
+    runner_t runner = {
+        0,
+    };
+    char msg[1024] = "";
+    char pidfile[PATH_MAX] = "";
+    pid_t pid = -1;
+    int retry_count = 0;
+    char *mnt_pt = NULL;
+    gf_boolean_t unmount = _gf_true;
+    int32_t len = 0;
+
+    this = THIS;
+    GF_ASSERT(this);
+    priv = this->private;
+    GF_ASSERT(priv);
+
+    if (!brickinfo) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_INVALID_ENTRY,
+               "brickinfo NULL");
+        goto out;
+    }
+    GF_ASSERT(snap_vol);
+    GF_ASSERT(mount_pt);
+    GF_ASSERT(snap_device);
+
+    /* Ask a still-running brick to terminate; the result of the request
+     * is deliberately ignored and the brick is marked stopped. */
+    GLUSTERD_GET_BRICK_PIDFILE(pidfile, snap_vol, brickinfo, priv);
+    if (gf_is_service_running(pidfile, &pid)) {
+        (void)send_attach_req(this, brickinfo->rpc, brickinfo->path, NULL, NULL,
+                              GLUSTERD_BRICK_TERMINATE);
+        brickinfo->status = GF_BRICK_STOPPED;
+    }
+
+    /* Check if the brick is mounted and then try unmounting the brick */
+    ret = glusterd_get_brick_root(brickinfo->path, &mnt_pt);
+    if (ret) {
+        gf_msg(this->name, GF_LOG_WARNING, 0, GD_MSG_BRICK_PATH_UNMOUNTED,
+               "Getting the root "
+               "of the brick for volume %s (snap %s) failed. "
+               "Removing lv (%s).",
+               snap_vol->volname, snap_vol->snapshot->snapname, snap_device);
+        /* The brick path is already unmounted. Remove the lv only *
+         * Need not fail the operation */
+        ret = 0;
+        unmount = _gf_false;
+    }
+
+    /* mnt_pt is only read when the lookup above succeeded, which the
+     * unmount flag guarantees. */
+    if ((unmount == _gf_true) && (strcmp(mnt_pt, mount_pt))) {
+        gf_msg(this->name, GF_LOG_WARNING, 0, GD_MSG_BRICK_PATH_UNMOUNTED,
+               "Lvm is not mounted for brick %s:%s. "
+               "Removing lv (%s).",
+               brickinfo->hostname, brickinfo->path, snap_device);
+        /* The brick path is already unmounted. Remove the lv only *
+         * Need not fail the operation */
+        unmount = _gf_false;
+    }
+
+    /* umount cannot be done when the brick process is still in the process
+       of shutdown, so give three re-tries */
+    while ((unmount == _gf_true) && (retry_count < 3)) {
+        retry_count++;
+        /*umount2 system call doesn't cleanup mtab entry after un-mount.
+          So use external umount command*/
+        ret = glusterd_umount(mount_pt);
+        if (!ret)
+            break;
+
+        gf_msg_debug(this->name, 0,
+                     "umount failed for "
+                     "path %s (brick: %s): %s. Retry(%d)",
+                     mount_pt, brickinfo->path, strerror(errno), retry_count);
+
+        /*
+         * This used to be one second, but that wasn't long enough
+         * to get past the spurious EPERM errors that prevent some
+         * tests (especially bug-1162462.t) from passing reliably.
+         *
+         * TBD: figure out where that garbage is coming from
+         */
+        sleep(3);
+    }
+    /* ret is non-zero here only when every umount attempt failed: both
+     * paths that skip the unmount leave it at 0. */
+    if (ret) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_UNOUNT_FAILED,
+               "umount failed for "
+               "path %s (brick: %s): %s.",
+               mount_pt, brickinfo->path, strerror(errno));
+        /*
+         * This is cheating, but necessary until we figure out how to
+         * shut down a brick within a still-living brick daemon so that
+         * random translators aren't keeping the mountpoint alive.
+         *
+         * TBD: figure out a real solution
+         */
+        ret = 0;
+        goto out;
+    }
+
+    runinit(&runner);
+    len = snprintf(msg, sizeof(msg),
+                   "remove snapshot of the brick %s:%s, "
+                   "device: %s",
+                   brickinfo->hostname, brickinfo->path, snap_device);
+    if (len < 0) {
+        strcpy(msg, "<error>");
+    }
+    runner_add_args(&runner, LVM_REMOVE, "-f", snap_device, NULL);
+    /* NOTE(review): the log domain is "" here while other runner_log
+     * calls in this file pass this->name - confirm this is intended. */
+    runner_log(&runner, "", GF_LOG_DEBUG, msg);
+
+    ret = runner_run(&runner);
+    if (ret) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_SNAP_REMOVE_FAIL,
+               "removing snapshot of the "
+               "brick (%s:%s) of device %s failed",
+               brickinfo->hostname, brickinfo->path, snap_device);
+        goto out;
+    }
+
+out:
+    /* mnt_pt is filled in by glusterd_get_brick_root() and is released
+     * here on every path. */
+    if (mnt_pt)
+        GF_FREE(mnt_pt);
+
+    return ret;
+}
+
+/* Remove the backend LVM snapshot and the mount directories of every brick
+ * of @snap_vol that belongs to this node.
+ *
+ * @param rsp_dict  response dictionary; may be NULL. When set, bricks whose
+ *                  snapshot was still pending are recorded in it as missed
+ *                  deletes (snap volumes only).
+ * @param snap_vol  snapshot volume (or volume restored from a snapshot)
+ *                  whose bricks are to be cleaned up.
+ * @return 0 on success, non-zero on failure. A failure to remove the LV of
+ *         one brick does not stop the loop; it is remembered in 'err' and
+ *         returned once the remaining work has been attempted.
+ */
+int32_t
+glusterd_lvm_snapshot_remove(dict_t *rsp_dict, glusterd_volinfo_t *snap_vol)
+{
+    int32_t brick_count = -1;
+    int32_t ret = -1;
+    int32_t err = 0;
+    glusterd_brickinfo_t *brickinfo = NULL;
+    xlator_t *this = NULL;
+    char brick_dir[PATH_MAX] = "";
+    char snap_path[PATH_MAX] = "";
+    char *tmp = NULL;
+    char *brick_mount_path = NULL;
+    gf_boolean_t is_brick_dir_present = _gf_false;
+    struct stat stbuf = {
+        0,
+    };
+
+    this = THIS;
+    GF_ASSERT(this);
+    GF_ASSERT(snap_vol);
+
+    if ((snap_vol->is_snap_volume == _gf_false) &&
+        (gf_uuid_is_null(snap_vol->restored_from_snap))) {
+        gf_msg_debug(this->name, 0,
+                     "Not a snap volume, or a restored snap volume.");
+        ret = 0;
+        goto out;
+    }
+
+    brick_count = -1;
+    cds_list_for_each_entry(brickinfo, &snap_vol->bricks, brick_list)
+    {
+        brick_count++;
+        if (gf_uuid_compare(brickinfo->uuid, MY_UUID)) {
+            gf_msg_debug(this->name, 0, "%s:%s belongs to a different node",
+                         brickinfo->hostname, brickinfo->path);
+            continue;
+        }
+
+        /* Fetch the brick mount path from the brickinfo->path */
+        ret = glusterd_find_brick_mount_path(brickinfo->path,
+                                             &brick_mount_path);
+        if (ret) {
+            gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_BRICK_GET_INFO_FAIL,
+                   "Failed to find brick_mount_path for %s", brickinfo->path);
+            ret = 0;
+            continue;
+        }
+
+        /* As deactivated snapshot have no active mount point we
+         * check only for activated snapshot.
+         */
+        if (snap_vol->status == GLUSTERD_STATUS_STARTED) {
+            ret = sys_lstat(brick_mount_path, &stbuf);
+            if (ret) {
+                gf_msg_debug(this->name, 0, "Brick %s:%s already deleted.",
+                             brickinfo->hostname, brickinfo->path);
+                ret = 0;
+                /* Release the path before skipping this brick: the next
+                 * iteration overwrites the pointer and it would leak.
+                 */
+                GF_FREE(brick_mount_path);
+                brick_mount_path = NULL;
+                continue;
+            }
+        }
+
+        if (brickinfo->snap_status == -1) {
+            /* NOTE(review): snap_vol->snapshot is dereferenced here without
+             * the NULL check applied further down - confirm it cannot be
+             * NULL on this path.
+             */
+            gf_msg(this->name, GF_LOG_INFO, 0, GD_MSG_SNAPSHOT_PENDING,
+                   "snapshot was pending. lvm not present "
+                   "for brick %s:%s of the snap %s.",
+                   brickinfo->hostname, brickinfo->path,
+                   snap_vol->snapshot->snapname);
+
+            if (rsp_dict && (snap_vol->is_snap_volume == _gf_true)) {
+                /* Adding missed delete to the dict */
+                ret = glusterd_add_missed_snaps_to_dict(
+                    rsp_dict, snap_vol, brickinfo, brick_count + 1,
+                    GF_SNAP_OPTION_TYPE_DELETE);
+                if (ret) {
+                    gf_msg(this->name, GF_LOG_ERROR, 0,
+                           GD_MSG_MISSED_SNAP_CREATE_FAIL,
+                           "Failed to add missed snapshot "
+                           "info for %s:%s in the "
+                           "rsp_dict",
+                           brickinfo->hostname, brickinfo->path);
+                    /* brick_mount_path is released at 'out' */
+                    goto out;
+                }
+            }
+
+            /* Nothing to remove for this brick; drop the path so it is
+             * not leaked when the next brick is looked up.
+             */
+            GF_FREE(brick_mount_path);
+            brick_mount_path = NULL;
+            continue;
+        }
+
+        /* Check if the brick has a LV associated with it */
+        if (strlen(brickinfo->device_path) == 0) {
+            gf_msg_debug(this->name, 0,
+                         "Brick (%s:%s) does not have a LV "
+                         "associated with it. Removing the brick path",
+                         brickinfo->hostname, brickinfo->path);
+            goto remove_brick_path;
+        }
+
+        /* Verify if the device path exists or not */
+        ret = sys_stat(brickinfo->device_path, &stbuf);
+        if (ret) {
+            gf_msg_debug(this->name, 0,
+                         "LV (%s) for brick (%s:%s) not present. "
+                         "Removing the brick path",
+                         brickinfo->device_path, brickinfo->hostname,
+                         brickinfo->path);
+            /* Making ret = 0 as absence of device path should *
+             * not fail the remove operation */
+            ret = 0;
+            goto remove_brick_path;
+        }
+
+        ret = glusterd_do_lvm_snapshot_remove(
+            snap_vol, brickinfo, brick_mount_path, brickinfo->device_path);
+        if (ret) {
+            gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_SNAP_REMOVE_FAIL,
+                   "Failed to "
+                   "remove the snapshot %s (%s)",
+                   brickinfo->path, brickinfo->device_path);
+            err = -1; /* We need to record this failure */
+        }
+
+    remove_brick_path:
+        /* After removing the brick dir fetch the parent path
+         * i.e /var/run/gluster/snaps/<snap-vol-id>/
+         */
+        if (is_brick_dir_present == _gf_false) {
+            /* Need to fetch brick_dir to be removed from
+             * brickinfo->path, as in a restored volume,
+             * snap_vol won't have the non-hyphenated snap_vol_id
+             */
+            tmp = strstr(brick_mount_path, "brick");
+            if (!tmp) {
+                gf_msg(this->name, GF_LOG_ERROR, EINVAL, GD_MSG_INVALID_ENTRY,
+                       "Invalid brick %s", brickinfo->path);
+                GF_FREE(brick_mount_path);
+                brick_mount_path = NULL;
+                continue;
+            }
+
+            /* brick_dir starts zero-filled, so the copied prefix stays
+             * NUL-terminated. */
+            strncpy(brick_dir, brick_mount_path,
+                    (size_t)(tmp - brick_mount_path));
+
+            /* Peers not hosting bricks will have _gf_false */
+            is_brick_dir_present = _gf_true;
+        }
+
+        GF_FREE(brick_mount_path);
+        brick_mount_path = NULL;
+    }
+
+    if (is_brick_dir_present == _gf_true) {
+        ret = recursive_rmdir(brick_dir);
+        if (ret) {
+            if (errno == ENOTEMPTY) {
+                /* Will occur when multiple glusterds
+                 * are running in the same node
+                 */
+                gf_msg(this->name, GF_LOG_WARNING, errno, GD_MSG_DIR_OP_FAILED,
+                       "Failed to rmdir: %s, err: %s. "
+                       "More than one glusterd running "
+                       "on this node.",
+                       brick_dir, strerror(errno));
+                ret = 0;
+                goto out;
+            } else
+                gf_msg(this->name, GF_LOG_ERROR, errno, GD_MSG_DIR_OP_FAILED,
+                       "Failed to rmdir: %s, err: %s", brick_dir,
+                       strerror(errno));
+            goto out;
+        }
+
+        /* After removing brick_dir, fetch and remove snap path
+         * i.e. /var/run/gluster/snaps/<snap-name>.
+         */
+        if (!snap_vol->snapshot) {
+            gf_msg(this->name, GF_LOG_WARNING, EINVAL, GD_MSG_INVALID_ENTRY,
+                   "snapshot not"
+                   "present in snap_vol");
+            ret = -1;
+            goto out;
+        }
+
+        snprintf(snap_path, sizeof(snap_path), "%s/%s", snap_mount_dir,
+                 snap_vol->snapshot->snapname);
+        ret = recursive_rmdir(snap_path);
+        if (ret) {
+            gf_msg(this->name, GF_LOG_ERROR, errno, GD_MSG_DIR_OP_FAILED,
+                   "Failed to remove "
+                   "%s directory : error : %s",
+                   snap_path, strerror(errno));
+            goto out;
+        }
+    }
+
+    ret = 0;
+out:
+    /* A recorded per-brick LV removal failure overrides the final status */
+    if (err) {
+        ret = err;
+    }
+    GF_FREE(brick_mount_path);
+    gf_msg_trace(this->name, 0, "Returning %d", ret);
+    return ret;
+}
+
+/* Remove one snapshot volume: stop its local bricks, optionally remove the
+ * backend LVM snapshot, delete the volume from the store, update the parent
+ * volume's snap count and drop the volinfo reference.
+ *
+ * @param rsp_dict    response dictionary passed on to the LVM removal
+ * @param snap_vol    snapshot volume to remove
+ * @param remove_lvm  when set, the backend LVM snapshot is removed as well
+ * @param force       when set, a failing step is recorded and the
+ *                    remaining steps are still attempted; otherwise the
+ *                    first failure aborts the removal
+ * @return 0 on success, otherwise the error of the last failing step
+ */
+int32_t
+glusterd_snap_volume_remove(dict_t *rsp_dict, glusterd_volinfo_t *snap_vol,
+                            gf_boolean_t remove_lvm, gf_boolean_t force)
+{
+    int ret = -1;
+    int save_ret = 0;
+    glusterd_brickinfo_t *brickinfo = NULL;
+    glusterd_volinfo_t *origin_vol = NULL;
+    xlator_t *this = NULL;
+
+    this = THIS;
+    GF_ASSERT(this);
+    GF_ASSERT(rsp_dict);
+    GF_ASSERT(snap_vol);
+
+    if (!snap_vol) {
+        gf_msg(this->name, GF_LOG_WARNING, EINVAL, GD_MSG_INVALID_ENTRY,
+               "snap_vol in NULL");
+        ret = -1;
+        goto out;
+    }
+
+    cds_list_for_each_entry(brickinfo, &snap_vol->bricks, brick_list)
+    {
+        /* Only bricks hosted on this node are stopped here */
+        if (gf_uuid_compare(brickinfo->uuid, MY_UUID))
+            continue;
+
+        ret = glusterd_brick_stop(snap_vol, brickinfo, _gf_false);
+        if (ret) {
+            gf_msg(this->name, GF_LOG_WARNING, 0, GD_MSG_BRICK_STOP_FAIL,
+                   "Failed to stop "
+                   "brick for volume %s",
+                   snap_vol->volname);
+            save_ret = ret;
+
+            /* Don't clean up the snap on error when
+               force flag is disabled */
+            if (!force)
+                goto out;
+        }
+    }
+
+    /* Only remove the backend lvm when required */
+    if (remove_lvm) {
+        ret = glusterd_lvm_snapshot_remove(rsp_dict, snap_vol);
+        if (ret) {
+            gf_msg(this->name, GF_LOG_WARNING, 0, GD_MSG_SNAP_REMOVE_FAIL,
+                   "Failed to remove "
+                   "lvm snapshot volume %s",
+                   snap_vol->volname);
+            save_ret = ret;
+            if (!force)
+                goto out;
+        }
+    }
+
+    ret = glusterd_store_delete_volume(snap_vol);
+    if (ret) {
+        gf_msg(this->name, GF_LOG_WARNING, 0, GD_MSG_VOL_DELETE_FAIL,
+               "Failed to remove volume %s "
+               "from store",
+               snap_vol->volname);
+        save_ret = ret;
+        if (!force)
+            goto out;
+    }
+
+    if (!cds_list_empty(&snap_vol->snapvol_list)) {
+        ret = glusterd_volinfo_find(snap_vol->parent_volname, &origin_vol);
+        if (ret) {
+            gf_msg(this->name, GF_LOG_ERROR, EINVAL, GD_MSG_VOL_NOT_FOUND,
+                   "Failed to get "
+                   "parent volinfo %s  for volume  %s",
+                   snap_vol->parent_volname, snap_vol->volname);
+            save_ret = ret;
+            if (!force)
+                goto out;
+        } else if (origin_vol) {
+            /* Only touch the parent when the lookup succeeded: with
+             * force set the failure above is tolerated and origin_vol
+             * is still NULL.
+             */
+            origin_vol->snap_count--;
+        }
+    }
+
+    glusterd_volinfo_unref(snap_vol);
+
+    if (save_ret)
+        ret = save_ret;
+out:
+    gf_msg_trace(this->name, 0, "returning %d", ret);
+    return ret;
+}
+
+/* Tear down a snapshot: remove each of its volumes, then its store entry
+ * (skipped for clones) and finally the in-memory snap object.
+ *
+ * @param rsp_dict    response dictionary handed to the per-volume removal
+ * @param snap        snapshot to remove
+ * @param remove_lvm  passed through to glusterd_snap_volume_remove()
+ * @param force       when set, keep going after a failed step
+ * @param is_clone    when set, the snap store entry is not deleted
+ * @return 0 on success. Without force the error of the first failing step
+ *         is returned right away; with force a store-delete error takes
+ *         precedence over the result of glusterd_snapobject_delete().
+ */
+int32_t
+glusterd_snap_remove(dict_t *rsp_dict, glusterd_snap_t *snap,
+                     gf_boolean_t remove_lvm, gf_boolean_t force,
+                     gf_boolean_t is_clone)
+{
+    xlator_t *this = THIS;
+    glusterd_volinfo_t *vol = NULL;
+    glusterd_volinfo_t *next_vol = NULL;
+    int saved_rc = 0;
+    int ret = -1;
+
+    GF_ASSERT(this);
+    GF_ASSERT(rsp_dict);
+    GF_ASSERT(snap);
+
+    if (snap == NULL) {
+        /* ret still holds its initial -1 */
+        gf_msg(this->name, GF_LOG_WARNING, EINVAL, GD_MSG_INVALID_ENTRY,
+               "snap is NULL");
+        goto out;
+    }
+
+    cds_list_for_each_entry_safe(vol, next_vol, &snap->volumes, vol_list)
+    {
+        ret = glusterd_snap_volume_remove(rsp_dict, vol, remove_lvm, force);
+        if (!ret || force)
+            continue;
+
+        /* Without the force flag a failed volume removal aborts the
+         * whole operation and the snap is left in place. */
+        gf_msg(this->name, GF_LOG_WARNING, 0, GD_MSG_SNAP_REMOVE_FAIL,
+               "Failed to remove volinfo %s for snap %s", vol->volname,
+               snap->snapname);
+        saved_rc = ret;
+        goto out;
+    }
+
+    /* Clones do not persist snap info in /var/lib/glusterd/snaps/, so
+     * there is a store entry to delete only for real snapshots. */
+    if (!is_clone) {
+        ret = glusterd_store_delete_snap(snap);
+        if (ret) {
+            gf_msg(this->name, GF_LOG_WARNING, 0, GD_MSG_SNAP_REMOVE_FAIL,
+                   "Failed to remove snap %s from store", snap->snapname);
+            saved_rc = ret;
+            if (!force)
+                goto out;
+        }
+    }
+
+    ret = glusterd_snapobject_delete(snap);
+    if (ret) {
+        gf_msg(this->name, GF_LOG_WARNING, 0, GD_MSG_SNAP_REMOVE_FAIL,
+               "Failed to delete snap object %s", snap->snapname);
+    }
+
+    /* An earlier recorded failure wins over the object-delete result */
+    if (saved_rc != 0)
+        ret = saved_rc;
+out:
+    gf_msg_trace(THIS->name, 0, "returning %d", ret);
+    return ret;
+}
+
+/* Store the details of one snapshot volume in @dict.
+ *
+ * The following keys are written, each prefixed with "<keyprefix>.":
+ * volname, vol-id, vol-status, snaps-available, snapcount and, only when
+ * @detail is non-zero, origin-volname.
+ *
+ * @param dict       dictionary receiving the values
+ * @param snap_vol   snapshot volume to describe
+ * @param keyprefix  prefix for every key written
+ * @param detail     non-zero to also store the parent volume name
+ * @return 0 on success, non-zero on failure
+ */
+static int
+glusterd_snapshot_get_snapvol_detail(dict_t *dict, glusterd_volinfo_t *snap_vol,
+                                     const char *keyprefix, const int detail)
+{
+    int ret = -1;
+    int snap_limit = 0;
+    char key[64] = ""; /* keyprefix is quite small, up to 32 byts */
+    int keylen;
+    char *value = NULL;
+    glusterd_volinfo_t *origin_vol = NULL;
+    glusterd_conf_t *conf = NULL;
+    xlator_t *this = NULL;
+    uint64_t opt_hard_max = GLUSTERD_SNAPS_MAX_HARD_LIMIT;
+
+    this = THIS;
+    conf = this->private;
+    GF_ASSERT(conf);
+
+    GF_ASSERT(dict);
+    GF_ASSERT(snap_vol);
+    GF_ASSERT(keyprefix);
+
+    /* Volume Name */
+    value = gf_strdup(snap_vol->volname);
+    if (!value) {
+        ret = -1;
+        goto out;
+    }
+
+    keylen = snprintf(key, sizeof(key), "%s.volname", keyprefix);
+    ret = dict_set_dynstrn(dict, key, keylen, value);
+    if (ret) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_DICT_SET_FAILED,
+               "Failed to set "
+               "volume name in dictionary: %s",
+               key);
+        goto out;
+    }
+
+    /* Volume ID */
+    value = gf_strdup(uuid_utoa(snap_vol->volume_id));
+    if (NULL == value) {
+        ret = -1;
+        goto out;
+    }
+
+    keylen = snprintf(key, sizeof(key), "%s.vol-id", keyprefix);
+    ret = dict_set_dynstrn(dict, key, keylen, value);
+    if (ret) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_NO_MEMORY,
+               "Failed to set "
+               "volume id in dictionary: %s",
+               key);
+        goto out;
+    }
+    value = NULL;
+
+    /* volume status */
+    keylen = snprintf(key, sizeof(key), "%s.vol-status", keyprefix);
+    switch (snap_vol->status) {
+        case GLUSTERD_STATUS_STARTED:
+            ret = dict_set_nstrn(dict, key, keylen, "Started", SLEN("Started"));
+            break;
+        case GLUSTERD_STATUS_STOPPED:
+            ret = dict_set_nstrn(dict, key, keylen, "Stopped", SLEN("Stopped"));
+            break;
+        case GD_SNAP_STATUS_NONE:
+            ret = dict_set_nstrn(dict, key, keylen, "None", SLEN("None"));
+            break;
+        default:
+            gf_msg(this->name, GF_LOG_ERROR, EINVAL, GD_MSG_INVALID_ENTRY,
+                   "Invalid volume status");
+            ret = -1;
+            goto out;
+    }
+    if (ret) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_DICT_SET_FAILED,
+               "Failed to set volume status"
+               " in dictionary: %s",
+               key);
+        goto out;
+    }
+
+    ret = glusterd_volinfo_find(snap_vol->parent_volname, &origin_vol);
+    if (ret) {
+        gf_msg(this->name, GF_LOG_ERROR, EINVAL, GD_MSG_VOL_NOT_FOUND,
+               "failed to get the parent "
+               "volinfo for the volume %s",
+               snap_vol->volname);
+        goto out;
+    }
+
+    /* "snap-max-hard-limit" might not be set by user explicitly,
+     * in that case it's better to consider the default value.
+     * Hence not erroring out if Key is not found.
+     */
+    ret = dict_get_uint64(conf->opts, GLUSTERD_STORE_KEY_SNAP_MAX_HARD_LIMIT,
+                          &opt_hard_max);
+    if (ret) {
+        ret = 0;
+        gf_msg_debug(this->name, 0,
+                     "%s is not present in "
+                     "opts dictionary",
+                     GLUSTERD_STORE_KEY_SNAP_MAX_HARD_LIMIT);
+    }
+
+    /* The effective limit is the smaller of the system-wide and the
+     * per-volume hard limit. */
+    if (opt_hard_max < origin_vol->snap_max_hard_limit) {
+        snap_limit = opt_hard_max;
+        gf_msg_debug(this->name, 0,
+                     "system snap-max-hard-limit is"
+                     " lesser than volume snap-max-hard-limit, "
+                     "snap-max-hard-limit value is set to %d",
+                     snap_limit);
+    } else {
+        snap_limit = origin_vol->snap_max_hard_limit;
+        gf_msg_debug(this->name, 0,
+                     "volume snap-max-hard-limit is"
+                     " lesser than system snap-max-hard-limit, "
+                     "snap-max-hard-limit value is set to %d",
+                     snap_limit);
+    }
+
+    /* Remaining snapshots, clamped at zero once the limit is reached */
+    keylen = snprintf(key, sizeof(key), "%s.snaps-available", keyprefix);
+    if (snap_limit > origin_vol->snap_count)
+        ret = dict_set_int32n(dict, key, keylen,
+                              snap_limit - origin_vol->snap_count);
+    else
+        ret = dict_set_int32n(dict, key, keylen, 0);
+    if (ret) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_DICT_SET_FAILED,
+               "Failed to set available snaps");
+        goto out;
+    }
+
+    keylen = snprintf(key, sizeof(key), "%s.snapcount", keyprefix);
+    ret = dict_set_int32n(dict, key, keylen, origin_vol->snap_count);
+    if (ret) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_DICT_SET_FAILED,
+               "Could not save snapcount");
+        goto out;
+    }
+
+    /* ret is 0 here, so the short form returns success */
+    if (!detail)
+        goto out;
+
+    /* Parent volume name */
+    value = gf_strdup(snap_vol->parent_volname);
+    if (!value) {
+        /* ret is still 0 from the previous step; an allocation failure
+         * must not be reported as success. */
+        ret = -1;
+        goto out;
+    }
+
+    keylen = snprintf(key, sizeof(key), "%s.origin-volname", keyprefix);
+    ret = dict_set_dynstrn(dict, key, keylen, value);
+    if (ret) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_DICT_SET_FAILED,
+               "Failed to set parent "
+               "volume name in dictionary: %s",
+               key);
+        goto out;
+    }
+    value = NULL;
+
+    ret = 0;
+out:
+    if (value)
+        GF_FREE(value);
+
+    return ret;
+}
+
+/* Store the details of one snapshot in @dict.
+ *
+ * The following keys are written, each prefixed with "<keyprefix>.":
+ * snapname, snap-id, snap-time, snap-desc (only when the snapshot has a
+ * description), snap-status and vol-count. Volume details are stored under
+ * the prefix "<keyprefix>.vol<N>".
+ *
+ * @param dict       dictionary receiving the values
+ * @param snap       snapshot to describe
+ * @param keyprefix  prefix for every key written
+ * @param volinfo    when non-NULL only this volume is reported (as vol1,
+ *                   without the parent volume name); when NULL every volume
+ *                   of the snapshot is reported in full
+ * @return 0 on success, non-zero on failure
+ */
+static int
+glusterd_snapshot_get_snap_detail(dict_t *dict, glusterd_snap_t *snap,
+                                  const char *keyprefix,
+                                  glusterd_volinfo_t *volinfo)
+{
+    int ret = -1;
+    int volcount = 0;
+    char key[32] = ""; /* keyprefix is quite small, up to 16 bytes */
+    int keylen;
+    char timestr[GF_TIMESTR_SIZE] = "";
+    char *value = NULL;
+    glusterd_volinfo_t *snap_vol = NULL;
+    glusterd_volinfo_t *tmp_vol = NULL;
+    xlator_t *this = NULL;
+
+    this = THIS;
+
+    GF_ASSERT(dict);
+    GF_ASSERT(snap);
+    GF_ASSERT(keyprefix);
+
+    /* Snap Name */
+    value = gf_strdup(snap->snapname);
+    if (!value)
+        goto out;
+
+    keylen = snprintf(key, sizeof(key), "%s.snapname", keyprefix);
+    ret = dict_set_dynstrn(dict, key, keylen, value);
+    if (ret) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_DICT_SET_FAILED,
+               "Failed to set "
+               "snap name in dictionary");
+        goto out;
+    }
+
+    /* Snap ID */
+    value = gf_strdup(uuid_utoa(snap->snap_id));
+    if (NULL == value) {
+        ret = -1;
+        goto out;
+    }
+
+    keylen = snprintf(key, sizeof(key), "%s.snap-id", keyprefix);
+    ret = dict_set_dynstrn(dict, key, keylen, value);
+    if (ret) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_DICT_SET_FAILED,
+               "Failed to set "
+               "snap id in dictionary");
+        goto out;
+    }
+    value = NULL;
+
+    /* Snap creation time, formatted as text */
+    gf_time_fmt(timestr, sizeof timestr, snap->time_stamp, gf_timefmt_FT);
+    value = gf_strdup(timestr);
+
+    if (NULL == value) {
+        ret = -1;
+        goto out;
+    }
+
+    keylen = snprintf(key, sizeof(key), "%s.snap-time", keyprefix);
+    ret = dict_set_dynstrn(dict, key, keylen, value);
+    if (ret) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_DICT_SET_FAILED,
+               "Failed to set "
+               "snap time stamp in dictionary");
+        goto out;
+    }
+    value = NULL;
+
+    /* If snap description is provided then add that into dictionary */
+    if (NULL != snap->description) {
+        value = gf_strdup(snap->description);
+        if (NULL == value) {
+            ret = -1;
+            goto out;
+        }
+
+        keylen = snprintf(key, sizeof(key), "%s.snap-desc", keyprefix);
+        ret = dict_set_dynstrn(dict, key, keylen, value);
+        if (ret) {
+            gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_DICT_SET_FAILED,
+                   "Failed to set "
+                   "snap description in dictionary");
+            goto out;
+        }
+        value = NULL;
+    }
+
+    keylen = snprintf(key, sizeof(key), "%s.snap-status", keyprefix);
+    switch (snap->snap_status) {
+        case GD_SNAP_STATUS_INIT:
+            ret = dict_set_nstrn(dict, key, keylen, "Init", SLEN("Init"));
+            break;
+        case GD_SNAP_STATUS_IN_USE:
+            ret = dict_set_nstrn(dict, key, keylen, "In-use", SLEN("In-use"));
+            break;
+        case GD_SNAP_STATUS_DECOMMISSION:
+            ret = dict_set_nstrn(dict, key, keylen, "Decommisioned",
+                                 SLEN("Decommisioned"));
+            break;
+        case GD_SNAP_STATUS_RESTORED:
+            ret = dict_set_nstrn(dict, key, keylen, "Restored",
+                                 SLEN("Restored"));
+            break;
+        case GD_SNAP_STATUS_NONE:
+            ret = dict_set_nstrn(dict, key, keylen, "None", SLEN("None"));
+            break;
+        default:
+            gf_msg(this->name, GF_LOG_ERROR, EINVAL, GD_MSG_INVALID_ENTRY,
+                   "Invalid snap status");
+            ret = -1;
+            goto out;
+    }
+    if (ret) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_DICT_SET_FAILED,
+               "Failed to set snap status "
+               "in dictionary");
+        goto out;
+    }
+
+    if (volinfo) {
+        /* Caller asked for a single volume: report it as vol1 */
+        volcount = 1;
+        snprintf(key, sizeof(key), "%s.vol%d", keyprefix, volcount);
+        ret = glusterd_snapshot_get_snapvol_detail(dict, volinfo, key, 0);
+        if (ret) {
+            /* snap_vol is still NULL on this path, so the volume name
+             * has to come from volinfo. */
+            gf_msg(this->name, GF_LOG_ERROR, EINVAL, GD_MSG_DICT_GET_FAILED,
+                   "Failed to "
+                   "get volume detail %s for snap %s",
+                   volinfo->volname, snap->snapname);
+            goto out;
+        }
+        goto done;
+    }
+
+    cds_list_for_each_entry_safe(snap_vol, tmp_vol, &snap->volumes, vol_list)
+    {
+        volcount++;
+        snprintf(key, sizeof(key), "%s.vol%d", keyprefix, volcount);
+        ret = glusterd_snapshot_get_snapvol_detail(dict, snap_vol, key, 1);
+        if (ret) {
+            gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_DICT_GET_FAILED,
+                   "Failed to "
+                   "get volume detail %s for snap %s",
+                   snap_vol->volname, snap->snapname);
+            goto out;
+        }
+    }
+
+done:
+    keylen = snprintf(key, sizeof(key), "%s.vol-count", keyprefix);
+    ret = dict_set_int32n(dict, key, keylen, volcount);
+    if (ret) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_DICT_SET_FAILED,
+               "Failed to set %s", key);
+        goto out;
+    }
+
+    ret = 0;
+out:
+    if (value)
+        GF_FREE(value);
+
+    return ret;
+}
+
+/* Fill @dict with the details of every snapshot in this glusterd's
+ * snapshot list.
+ *
+ * Each snapshot is stored under the key prefix "snap<N>", N counting from 1
+ * in list order, and the number of snapshots goes under "snapcount".
+ *
+ * @param dict  dictionary receiving the snapshot details
+ * @return 0 on success, non-zero if a detail or the count could not be set
+ */
+static int
+glusterd_snapshot_get_all_snap_info(dict_t *dict)
+{
+    xlator_t *this = THIS;
+    glusterd_conf_t *priv = this->private;
+    glusterd_snap_t *cur = NULL;
+    glusterd_snap_t *next = NULL;
+    char prefix[16] = "";
+    int nr_snaps = 0;
+    int ret = -1;
+
+    GF_ASSERT(priv);
+
+    /* General parameter validation */
+    GF_ASSERT(dict);
+
+    cds_list_for_each_entry_safe(cur, next, &priv->snapshots, snap_list)
+    {
+        snprintf(prefix, sizeof(prefix), "snap%d", ++nr_snaps);
+        ret = glusterd_snapshot_get_snap_detail(dict, cur, prefix, NULL);
+        if (ret == 0)
+            continue;
+
+        /* Stop at the first snapshot whose details cannot be stored */
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_DICT_GET_FAILED,
+               "Failed to get snapdetail for snap %s", cur->snapname);
+        return ret;
+    }
+
+    ret = dict_set_int32n(dict, "snapcount", SLEN("snapcount"), nr_snaps);
+    if (ret != 0) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_DICT_SET_FAILED,
+               "Failed to set snapcount");
+    }
+
+    return ret;
+}
+
+/* Store in @dict the snapshot information of the volume named @volname.
+ *
+ * Keys written: "snaps-available", "origin-volname", one "snap<N>" group
+ * per snapshot volume of the volume (N counting from 1) and "snapcount".
+ *
+ * @param dict     dictionary receiving the values
+ * @param volname  name of the origin volume
+ * @param err_str  buffer that receives a message when the volume does not
+ *                 exist
+ * @param len      size of @err_str
+ * @return 0 on success, non-zero on failure
+ */
+int
+glusterd_snapshot_get_info_by_volume(dict_t *dict, char *volname, char *err_str,
+                                     size_t len)
+{
+    int ret = -1;
+    int snapcount = 0;
+    int snap_limit = 0;
+    char *value = NULL;
+    char key[16] = "";
+    glusterd_volinfo_t *volinfo = NULL;
+    glusterd_volinfo_t *snap_vol = NULL;
+    glusterd_volinfo_t *tmp_vol = NULL;
+    glusterd_conf_t *conf = NULL;
+    xlator_t *this = NULL;
+    uint64_t opt_hard_max = GLUSTERD_SNAPS_MAX_HARD_LIMIT;
+
+    this = THIS;
+    conf = this->private;
+    GF_ASSERT(conf);
+
+    GF_ASSERT(dict);
+    GF_ASSERT(volname);
+
+    ret = glusterd_volinfo_find(volname, &volinfo);
+    if (ret) {
+        snprintf(err_str, len, "Volume (%s) does not exist", volname);
+        gf_msg(this->name, GF_LOG_ERROR, EINVAL, GD_MSG_VOL_NOT_FOUND, "%s",
+               err_str);
+        goto out;
+    }
+
+    /* "snap-max-hard-limit" might not be set by user explicitly,
+     * in that case it's better to consider the default value.
+     * Hence not erroring out if Key is not found.
+     */
+    ret = dict_get_uint64(conf->opts, GLUSTERD_STORE_KEY_SNAP_MAX_HARD_LIMIT,
+                          &opt_hard_max);
+    if (ret) {
+        ret = 0;
+        gf_msg_debug(this->name, 0,
+                     "%s is not present in "
+                     "opts dictionary",
+                     GLUSTERD_STORE_KEY_SNAP_MAX_HARD_LIMIT);
+    }
+
+    /* The effective limit is the smaller of the system-wide and the
+     * per-volume hard limit. */
+    if (opt_hard_max < volinfo->snap_max_hard_limit) {
+        snap_limit = opt_hard_max;
+        gf_msg_debug(this->name, 0,
+                     "system snap-max-hard-limit is"
+                     " lesser than volume snap-max-hard-limit, "
+                     "snap-max-hard-limit value is set to %d",
+                     snap_limit);
+    } else {
+        snap_limit = volinfo->snap_max_hard_limit;
+        gf_msg_debug(this->name, 0,
+                     "volume snap-max-hard-limit is"
+                     " lesser than system snap-max-hard-limit, "
+                     "snap-max-hard-limit value is set to %d",
+                     snap_limit);
+    }
+
+    /* Remaining snapshots, clamped at zero once the limit is reached */
+    if (snap_limit > volinfo->snap_count)
+        ret = dict_set_int32n(dict, "snaps-available", SLEN("snaps-available"),
+                              snap_limit - volinfo->snap_count);
+    else
+        ret = dict_set_int32n(dict, "snaps-available", SLEN("snaps-available"),
+                              0);
+    if (ret) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_DICT_SET_FAILED,
+               "Failed to set available snaps");
+        goto out;
+    }
+
+    /* Origin volume name */
+    value = gf_strdup(volinfo->volname);
+    if (!value) {
+        /* ret is 0 from the previous step; an allocation failure must
+         * not be reported as success. */
+        ret = -1;
+        goto out;
+    }
+
+    ret = dict_set_dynstrn(dict, "origin-volname", SLEN("origin-volname"),
+                           value);
+    if (ret) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_DICT_SET_FAILED,
+               "Failed to set parent "
+               "volume name in dictionary: %s",
+               value);
+        goto out;
+    }
+    value = NULL;
+
+    cds_list_for_each_entry_safe(snap_vol, tmp_vol, &volinfo->snap_volumes,
+                                 snapvol_list)
+    {
+        snapcount++;
+        snprintf(key, sizeof(key), "snap%d", snapcount);
+        ret = glusterd_snapshot_get_snap_detail(dict, snap_vol->snapshot, key,
+                                                snap_vol);
+        if (ret) {
+            gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_DICT_GET_FAILED,
+                   "Failed to get "
+                   "snapdetail for snap %s",
+                   snap_vol->snapshot->snapname);
+            goto out;
+        }
+    }
+    ret = dict_set_int32n(dict, "snapcount", SLEN("snapcount"), snapcount);
+    if (ret) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_DICT_SET_FAILED,
+               "Failed to set snapcount");
+        goto out;
+    }
+
+    ret = 0;
+out:
+    if (value)
+        GF_FREE(value);
+
+    return ret;
+}
+
+/* This function will be called from RPC handler routine.
+ * This function is responsible for getting the requested
+ * snapshot info into the dictionary.
+ *
+ * The "sub-cmd" value read from dict selects what is reported: all
+ * snapshots, a single snapshot (named by "snapname") or the snapshots of
+ * one volume (named by "volname"). Any other value is rejected.
+ *
+ * @param req      RPC request object. Required for sending a response back.
+ * @param op       glusterd operation. Required for sending a response back.
+ * @param dict     pointer to dictionary which will contain both
+ *                 request and response key-pair values.
+ * @param err_str  buffer that receives an error message for the caller
+ * @param len      size of err_str
+ * @return -1 on error and 0 on success
+ */
+int
+glusterd_handle_snapshot_info(rpcsvc_request_t *req, glusterd_op_t op,
+                              dict_t *dict, char *err_str, size_t len)
+{
+    int ret = -1;
+    int8_t snap_driven = 1;
+    char *volname = NULL;
+    char *snapname = NULL;
+    glusterd_snap_t *snap = NULL;
+    xlator_t *this = NULL;
+    int32_t cmd = GF_SNAP_INFO_TYPE_ALL;
+
+    this = THIS;
+    GF_ASSERT(this);
+
+    GF_VALIDATE_OR_GOTO(this->name, req, out);
+    GF_VALIDATE_OR_GOTO(this->name, dict, out);
+
+    ret = dict_get_int32n(dict, "sub-cmd", SLEN("sub-cmd"), &cmd);
+    if (ret) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_DICT_GET_FAILED,
+               "Failed to get type "
+               "of snapshot info");
+        goto out;
+    }
+
+    switch (cmd) {
+        case GF_SNAP_INFO_TYPE_ALL: {
+            ret = glusterd_snapshot_get_all_snap_info(dict);
+            if (ret) {
+                gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_DICT_GET_FAILED,
+                       "Failed to get info of all snaps");
+                goto out;
+            }
+            break;
+        }
+
+        case GF_SNAP_INFO_TYPE_SNAP: {
+            ret = dict_get_strn(dict, "snapname", SLEN("snapname"), &snapname);
+            if (ret) {
+                gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_DICT_GET_FAILED,
+                       "Failed to get snap name");
+                goto out;
+            }
+
+            ret = dict_set_int32n(dict, "snapcount", SLEN("snapcount"), 1);
+            if (ret) {
+                gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_DICT_SET_FAILED,
+                       "Failed to set snapcount");
+                goto out;
+            }
+
+            snap = glusterd_find_snap_by_name(snapname);
+            if (!snap) {
+                snprintf(err_str, len, "Snapshot (%s) does not exist",
+                         snapname);
+                gf_msg(this->name, GF_LOG_ERROR, EINVAL, GD_MSG_SNAP_NOT_FOUND,
+                       "%s", err_str);
+                ret = -1;
+                goto out;
+            }
+            ret = glusterd_snapshot_get_snap_detail(dict, snap, "snap1", NULL);
+            if (ret) {
+                gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_SNAP_NOT_FOUND,
+                       "Failed to get snap detail of snap "
+                       "%s",
+                       snap->snapname);
+                goto out;
+            }
+            break;
+        }
+
+        case GF_SNAP_INFO_TYPE_VOL: {
+            ret = dict_get_strn(dict, "volname", SLEN("volname"), &volname);
+            if (ret) {
+                gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_VOL_NOT_FOUND,
+                       "Failed to get volname");
+                goto out;
+            }
+            ret = glusterd_snapshot_get_info_by_volume(dict, volname, err_str,
+                                                       len);
+            if (ret) {
+                gf_msg(this->name, GF_LOG_ERROR, EINVAL, GD_MSG_VOL_NOT_FOUND,
+                       "Failed to get volume info of volume "
+                       "%s",
+                       volname);
+                goto out;
+            }
+            /* Volume-based queries are the only ones not snap-driven */
+            snap_driven = 0;
+            break;
+        }
+
+        default: {
+            /* An unknown sub-command must not fall through to a success
+             * response that carries no snapshot information. */
+            snprintf(err_str, len, "Invalid snapshot info type (%d)", cmd);
+            gf_msg(this->name, GF_LOG_ERROR, EINVAL, GD_MSG_INVALID_ENTRY,
+                   "%s", err_str);
+            ret = -1;
+            goto out;
+        }
+    }
+
+    ret = dict_set_int8(dict, "snap-driven", snap_driven);
+    if (ret) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_DICT_SET_FAILED,
+               "Failed to set snap-driven");
+        goto out;
+    }
+
+    /* If everything is successful then send the response back to cli.
+     * In case of failure the caller of this function will take care
+       of the response */
+    ret = glusterd_op_send_cli_response(op, 0, 0, req, dict, err_str);
+    if (ret) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_NO_CLI_RESP,
+               "Failed to send cli "
+               "response");
+        goto out;
+    }
+
+    ret = 0;
+
+out:
+    return ret;
+}
+
+/* Publish the name of every snapshot on priv->snapshots into @dict.
+ *
+ * Each name is duplicated with gf_strdup() and stored under the key
+ * "snapname<N>", N counting from 1; the number of names stored is then
+ * written under "snapcount".
+ *
+ * @param dict  dictionary that receives the names and the count
+ *
+ * @return 0 on success, -1 if a name could not be duplicated, otherwise
+ *         the non-zero value returned by the dict setter that failed
+ */
+int
+glusterd_snapshot_get_all_snapnames(dict_t *dict)
+{
+    xlator_t *this = THIS;
+    glusterd_conf_t *priv = this->private;
+    glusterd_snap_t *entry = NULL;
+    glusterd_snap_t *next = NULL;
+    char *name_copy = NULL;
+    char key[64] = "";
+    int keylen = 0;
+    int total = 0;
+    int ret = -1;
+
+    GF_ASSERT(priv);
+    GF_ASSERT(dict);
+
+    cds_list_for_each_entry_safe(entry, next, &priv->snapshots, snap_list)
+    {
+        total++;
+
+        name_copy = gf_strdup(entry->snapname);
+        if (name_copy == NULL) {
+            gf_msg(this->name, GF_LOG_ERROR, ENOMEM, GD_MSG_NO_MEMORY,
+                   "strdup failed");
+            return -1;
+        }
+
+        keylen = snprintf(key, sizeof(key), "snapname%d", total);
+        ret = dict_set_dynstrn(dict, key, keylen, name_copy);
+        if (ret != 0) {
+            /* the copy was not stored, so release it here */
+            GF_FREE(name_copy);
+            gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_DICT_SET_FAILED,
+                   "Failed to set %s", key);
+            return ret;
+        }
+    }
+
+    ret = dict_set_int32n(dict, "snapcount", SLEN("snapcount"), total);
+    if (ret != 0) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_DICT_SET_FAILED,
+               "Failed to set snapcount");
+    }
+
+    return ret;
+}
+
+/* Store the names of all snapshots taken of @volinfo in @dict.
+ *
+ * Every entry of volinfo->snap_volumes contributes the name of its
+ * snapshot under the key "snapname<N>" (N counts from 1); the number of
+ * entries is then stored under "snapcount".
+ *
+ * @param dict     dictionary that receives the names and the count
+ * @param volinfo  volume whose snapshot volumes are walked
+ *
+ * @return 0 on success, otherwise the non-zero value returned by the
+ *         dict setter that failed
+ */
+int
+glusterd_snapshot_get_vol_snapnames(dict_t *dict, glusterd_volinfo_t *volinfo)
+{
+    int ret = -1;
+    int snapcount = 0;
+    char key[32] = "";
+    glusterd_volinfo_t *snap_vol = NULL;
+    glusterd_volinfo_t *tmp_vol = NULL;
+    xlator_t *this = NULL;
+
+    this = THIS;
+    GF_ASSERT(dict);
+    GF_ASSERT(volinfo);
+
+    cds_list_for_each_entry_safe(snap_vol, tmp_vol, &volinfo->snap_volumes,
+                                 snapvol_list)
+    {
+        snapcount++;
+        snprintf(key, sizeof(key), "snapname%d", snapcount);
+
+        /* The name is passed straight from the snapshot object; this
+         * function allocates nothing of its own for it, so there is
+         * nothing to release on the failure path. */
+        ret = dict_set_dynstr_with_alloc(dict, key,
+                                         snap_vol->snapshot->snapname);
+        if (ret) {
+            gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_DICT_SET_FAILED,
+                   "Failed to "
+                   "set %s",
+                   key);
+            goto out;
+        }
+    }
+
+    ret = dict_set_int32n(dict, "snapcount", SLEN("snapcount"), snapcount);
+    if (ret) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_DICT_SET_FAILED,
+               "Failed to set snapcount");
+        goto out;
+    }
+
+    ret = 0;
+out:
+
+    return ret;
+}
+
+/* Snapshot list handler: fills @dict with snapshot names and replies to
+ * the CLI.
+ *
+ * When @dict carries a "volname", only the snapshots of that volume are
+ * listed; otherwise all snapshots are listed.
+ *
+ * @param req       RPC request object
+ * @param op        gluster operation
+ * @param dict      request dictionary, also used to carry the response
+ * @param err_str   populated when the named volume does not exist
+ * @param len       length of err_str buffer
+ * @param op_errno  set to EG_NOVOL when the named volume does not exist
+ *
+ * @return 0 on success, non-zero on failure (the response is then left to
+ *         the caller)
+ */
+int
+glusterd_handle_snapshot_list(rpcsvc_request_t *req, glusterd_op_t op,
+                              dict_t *dict, char *err_str, size_t len,
+                              uint32_t *op_errno)
+{
+    xlator_t *this = THIS;
+    glusterd_volinfo_t *volinfo = NULL;
+    char *volname = NULL;
+    int ret = -1;
+
+    GF_VALIDATE_OR_GOTO(this->name, req, out);
+    GF_VALIDATE_OR_GOTO(this->name, dict, out);
+    GF_VALIDATE_OR_GOTO(this->name, op_errno, out);
+
+    /* "volname" is optional, so the lookup result is deliberately unused */
+    (void)dict_get_strn(dict, "volname", SLEN("volname"), &volname);
+
+    if (volname != NULL) {
+        ret = glusterd_volinfo_find(volname, &volinfo);
+        if (ret != 0) {
+            snprintf(err_str, len, "Volume (%s) does not exist", volname);
+            gf_msg(this->name, GF_LOG_ERROR, EINVAL, GD_MSG_VOL_NOT_FOUND, "%s",
+                   err_str);
+            *op_errno = EG_NOVOL;
+            goto out;
+        }
+
+        ret = glusterd_snapshot_get_vol_snapnames(dict, volinfo);
+        if (ret != 0) {
+            gf_msg(this->name, GF_LOG_ERROR, EINVAL, GD_MSG_SNAP_LIST_GET_FAIL,
+                   "Failed to get snapshot list for volume %s", volname);
+            goto out;
+        }
+    } else {
+        ret = glusterd_snapshot_get_all_snapnames(dict);
+        if (ret != 0) {
+            gf_msg(this->name, GF_LOG_ERROR, EINVAL, GD_MSG_SNAP_LIST_GET_FAIL,
+                   "Failed to get snapshot list");
+            goto out;
+        }
+    }
+
+    /* Only the success path answers the CLI from here; on any failure
+     * above the caller is expected to send the response. */
+    ret = glusterd_op_send_cli_response(op, 0, 0, req, dict, err_str);
+    if (ret != 0) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_NO_CLI_RESP,
+               "Failed to send cli "
+               "response");
+    }
+
+out:
+    return ret;
+}
+
+/* This is a snapshot create handler function. This function will be
+ * executed in the originator node. This function is responsible for
+ * calling mgmt_v3 framework to do the actual snap creation on all the bricks
+ *
+ * Before handing over to the mgmt_v3 framework it adds to @dict:
+ *   - "snap-time" and, unless "no-timestamp" is set, a "snapname" with a
+ *     "_GMT-<timestamp>" suffix
+ *   - "snap-id", a freshly generated uuid
+ *   - per volume i (1..volcount): "volume<i>_username",
+ *     "volume<i>_password", "vol<i>_volid" and "snap-volname<i>"
+ *
+ * @param req           RPC request object
+ * @param op            gluster operation
+ * @param dict          dictionary containing snapshot create request
+ * @param err_str       In case of an err this string should be populated
+ * @param len           length of err_str buffer
+ *
+ * @return              Non-zero value on Failure and 0 in success
+ */
+int
+glusterd_handle_snapshot_create(rpcsvc_request_t *req, glusterd_op_t op,
+                                dict_t *dict, char *err_str, size_t len)
+{
+    int ret = -1;
+    char *volname = NULL;
+    char *snapname = NULL;
+    int64_t volcount = 0;
+    xlator_t *this = NULL;
+    char key[64] = "";
+    int keylen;
+    char *username = NULL;
+    char *password = NULL;
+    uuid_t *uuid_ptr = NULL;
+    uuid_t tmp_uuid = {0};
+    int i = 0;
+    int timestamp = 0;
+    char snap_volname[GD_VOLUME_NAME_MAX] = "";
+    char new_snapname[GLUSTERD_MAX_SNAP_NAME] = "";
+    char gmt_snaptime[GLUSTERD_MAX_SNAP_NAME] = "";
+    time_t snap_time;
+    this = THIS;
+    GF_ASSERT(this);
+    GF_ASSERT(req);
+    GF_ASSERT(dict);
+    GF_ASSERT(err_str);
+
+    ret = dict_get_int64(dict, "volcount", &volcount);
+    if (ret) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_DICT_GET_FAILED,
+               "failed to "
+               "get the volume count");
+        goto out;
+    }
+    if (volcount <= 0) {
+        gf_msg(this->name, GF_LOG_ERROR, EINVAL, GD_MSG_INVALID_ENTRY,
+               "Invalid volume count %" PRId64 " supplied", volcount);
+        ret = -1;
+        goto out;
+    }
+
+    ret = dict_get_strn(dict, "snapname", SLEN("snapname"), &snapname);
+    if (ret) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_DICT_GET_FAILED,
+               "failed to get the snapname");
+        goto out;
+    }
+
+    timestamp = dict_get_str_boolean(dict, "no-timestamp", _gf_false);
+    if (timestamp == -1) {
+        gf_log(this->name, GF_LOG_ERROR,
+               "Failed to get "
+               "no-timestamp flag ");
+        /* ret still holds 0 from the successful snapname lookup above;
+         * without this the failure would be returned as success. */
+        ret = -1;
+        goto out;
+    }
+
+    snap_time = gf_time();
+    ret = dict_set_int64(dict, "snap-time", (int64_t)snap_time);
+    if (ret) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_DICT_SET_FAILED,
+               "Unable to set snap-time");
+        goto out;
+    }
+
+    if (!timestamp) {
+        strftime(gmt_snaptime, sizeof(gmt_snaptime), "_GMT-%Y.%m.%d-%H.%M.%S",
+                 gmtime(&snap_time));
+        snprintf(new_snapname, sizeof(new_snapname), "%s%s", snapname,
+                 gmt_snaptime);
+        ret = dict_set_dynstr_with_alloc(dict, "snapname", new_snapname);
+        if (ret) {
+            gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_DICT_SET_FAILED,
+                   "Unable to update "
+                   "snap-name");
+            goto out;
+        }
+        /* From here on use the local copy of the suffixed name */
+        snapname = new_snapname;
+    }
+
+    /* NOTE(review): when the timestamp suffix was appended, new_snapname
+     * has already been bounded by snprintf(), so this check can only
+     * trigger on the "no-timestamp" path - confirm that is intended. */
+    if (strlen(snapname) >= GLUSTERD_MAX_SNAP_NAME) {
+        snprintf(err_str, len,
+                 "snapname cannot exceed 255 "
+                 "characters");
+        gf_msg(this->name, GF_LOG_ERROR, EINVAL, GD_MSG_INVALID_ENTRY, "%s",
+               err_str);
+        ret = -1;
+        goto out;
+    }
+
+    uuid_ptr = GF_MALLOC(sizeof(uuid_t), gf_common_mt_uuid_t);
+    if (!uuid_ptr) {
+        gf_msg(this->name, GF_LOG_ERROR, ENOMEM, GD_MSG_NO_MEMORY,
+               "Out Of Memory");
+        ret = -1;
+        goto out;
+    }
+
+    gf_uuid_generate(*uuid_ptr);
+    ret = dict_set_bin(dict, "snap-id", uuid_ptr, sizeof(uuid_t));
+    if (ret) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_DICT_SET_FAILED,
+               "Unable to set snap-id");
+        GF_FREE(uuid_ptr);
+        goto out;
+    }
+    /* The buffer now belongs to the dict; drop our reference to it */
+    uuid_ptr = NULL;
+
+    for (i = 1; i <= volcount; i++) {
+        keylen = snprintf(key, sizeof(key), "volname%d", i);
+        ret = dict_get_strn(dict, key, keylen, &volname);
+        if (ret) {
+            gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_DICT_GET_FAILED,
+                   "Failed to get volume name");
+            goto out;
+        }
+
+        /* generate internal username and password  for the snap*/
+        gf_uuid_generate(tmp_uuid);
+        username = gf_strdup(uuid_utoa(tmp_uuid));
+        keylen = snprintf(key, sizeof(key), "volume%d_username", i);
+        ret = dict_set_dynstrn(dict, key, keylen, username);
+        if (ret) {
+            gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_DICT_SET_FAILED,
+                   "Failed to set snap "
+                   "username for volume %s",
+                   volname);
+            GF_FREE(username);
+            goto out;
+        }
+
+        gf_uuid_generate(tmp_uuid);
+        password = gf_strdup(uuid_utoa(tmp_uuid));
+        keylen = snprintf(key, sizeof(key), "volume%d_password", i);
+        ret = dict_set_dynstrn(dict, key, keylen, password);
+        if (ret) {
+            gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_DICT_SET_FAILED,
+                   "Failed to set snap "
+                   "password for volume %s",
+                   volname);
+            GF_FREE(password);
+            goto out;
+        }
+
+        uuid_ptr = GF_MALLOC(sizeof(uuid_t), gf_common_mt_uuid_t);
+        if (!uuid_ptr) {
+            gf_msg(this->name, GF_LOG_ERROR, ENOMEM, GD_MSG_NO_MEMORY,
+                   "Out Of Memory");
+            ret = -1;
+            goto out;
+        }
+
+        snprintf(key, sizeof(key), "vol%d_volid", i);
+        gf_uuid_generate(*uuid_ptr);
+        ret = dict_set_bin(dict, key, uuid_ptr, sizeof(uuid_t));
+        if (ret) {
+            gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_DICT_SET_FAILED,
+                   "Unable to set snap_volid");
+            GF_FREE(uuid_ptr);
+            goto out;
+        }
+        GLUSTERD_GET_UUID_NOHYPHEN(snap_volname, *uuid_ptr);
+        snprintf(key, sizeof(key), "snap-volname%d", i);
+        ret = dict_set_dynstr_with_alloc(dict, key, snap_volname);
+        if (ret) {
+            gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_DICT_SET_FAILED,
+                   "Unable to set snap volname");
+            /* uuid_ptr was already stored in the dict by the successful
+             * dict_set_bin() above; freeing it here would leave the dict
+             * holding a dangling pointer, so it must not be released. */
+            goto out;
+        }
+    }
+
+    ret = glusterd_mgmt_v3_initiate_snap_phases(req, op, dict);
+    if (ret) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_SNAP_INIT_FAIL,
+               "Failed to initiate snap "
+               "phases");
+    }
+
+out:
+    return ret;
+}
+
+/* Snapshot status handler, executed on the originator node.
+ *
+ * It does no work of its own beyond handing the request to the mgmt v3
+ * framework via glusterd_mgmt_v3_initiate_snap_phases().
+ *
+ * @param req           RPC request object
+ * @param op            gluster operation
+ * @param dict          dictionary containing snapshot status request
+ * @param err_str       In case of an err this string should be populated
+ * @param len           length of err_str buffer
+ *
+ * return :  0  in case of success.
+ *           the non-zero result of the mgmt v3 call in case of failure.
+ *
+ */
+int
+glusterd_handle_snapshot_status(rpcsvc_request_t *req, glusterd_op_t op,
+                                dict_t *dict, char *err_str, size_t len)
+{
+    xlator_t *this = THIS;
+    int ret = -1;
+
+    GF_ASSERT(this);
+    GF_ASSERT(req);
+    GF_ASSERT(dict);
+    GF_ASSERT(err_str);
+
+    ret = glusterd_mgmt_v3_initiate_snap_phases(req, op, dict);
+    if (ret != 0) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_SNAP_INIT_FAIL,
+               "Failed to initiate "
+               "snap phases");
+    }
+
+    return ret;
+}
+
+/* This is a snapshot clone handler function. This function will be
+ * executed in the originator node. This function is responsible for
+ * calling mgmt_v3 framework to do the actual snap clone on all the bricks
+ *
+ * Before handing over to the mgmt_v3 framework it adds to @dict:
+ *   - "volname1", a copy of the clone name (used for volume locking)
+ *   - "clone-id" and "vol1_volid", freshly generated uuids
+ *   - "volume1_username" and "volume1_password", generated credentials
+ *   - "clone-volname0"
+ *
+ * @param req           RPC request object
+ * @param op            gluster operation
+ * @param dict          dictionary containing snapshot clone request
+ * @param err_str       In case of an err this string should be populated
+ * @param len           length of err_str buffer
+ *
+ * @return              Non-zero value on Failure and 0 in success
+ */
+int
+glusterd_handle_snapshot_clone(rpcsvc_request_t *req, glusterd_op_t op,
+                               dict_t *dict, char *err_str, size_t len)
+{
+    int ret = -1;
+    char *clonename = NULL;
+    char *snapname = NULL;
+    xlator_t *this = NULL;
+    char key[64] = "";
+    int keylen;
+    char *username = NULL;
+    char *password = NULL;
+    char *volname = NULL;
+    uuid_t *uuid_ptr = NULL;
+    uuid_t tmp_uuid = {0};
+    int i = 0;
+    char snap_volname[GD_VOLUME_NAME_MAX] = "";
+
+    this = THIS;
+    GF_ASSERT(this);
+    GF_ASSERT(req);
+    GF_ASSERT(dict);
+    GF_ASSERT(err_str);
+
+    ret = dict_get_strn(dict, "clonename", SLEN("clonename"), &clonename);
+    if (ret) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_DICT_GET_FAILED,
+               "failed to "
+               "get the clone name");
+        goto out;
+    }
+    /*We need to take a volume lock on clone name*/
+    volname = gf_strdup(clonename);
+    keylen = snprintf(key, sizeof(key), "volname1");
+    ret = dict_set_dynstrn(dict, key, keylen, volname);
+    if (ret) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_DICT_SET_FAILED,
+               "Failed to set clone "
+               "name for volume locking");
+        GF_FREE(volname);
+        goto out;
+    }
+
+    /* The request must name the snapshot to clone; the value itself is
+     * not used further in this function. */
+    ret = dict_get_strn(dict, "snapname", SLEN("snapname"), &snapname);
+    if (ret) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_DICT_GET_FAILED,
+               "failed to get the snapname");
+        goto out;
+    }
+
+    uuid_ptr = GF_MALLOC(sizeof(uuid_t), gf_common_mt_uuid_t);
+    if (!uuid_ptr) {
+        gf_msg(this->name, GF_LOG_ERROR, ENOMEM, GD_MSG_NO_MEMORY,
+               "Out Of Memory");
+        ret = -1;
+        goto out;
+    }
+
+    gf_uuid_generate(*uuid_ptr);
+    ret = dict_set_bin(dict, "clone-id", uuid_ptr, sizeof(uuid_t));
+    if (ret) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_DICT_SET_FAILED,
+               "Unable to set clone-id");
+        GF_FREE(uuid_ptr);
+        goto out;
+    }
+    /* The buffer now belongs to the dict; drop our reference to it */
+    uuid_ptr = NULL;
+
+    gf_uuid_generate(tmp_uuid);
+    username = gf_strdup(uuid_utoa(tmp_uuid));
+    keylen = snprintf(key, sizeof(key), "volume1_username");
+    ret = dict_set_dynstrn(dict, key, keylen, username);
+    if (ret) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_DICT_SET_FAILED,
+               "Failed to set clone "
+               "username for volume %s",
+               clonename);
+        GF_FREE(username);
+        goto out;
+    }
+
+    gf_uuid_generate(tmp_uuid);
+    password = gf_strdup(uuid_utoa(tmp_uuid));
+    keylen = snprintf(key, sizeof(key), "volume1_password");
+    ret = dict_set_dynstrn(dict, key, keylen, password);
+    if (ret) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_DICT_SET_FAILED,
+               "Failed to set clone "
+               "password for volume %s",
+               clonename);
+        GF_FREE(password);
+        goto out;
+    }
+
+    uuid_ptr = GF_MALLOC(sizeof(uuid_t), gf_common_mt_uuid_t);
+    if (!uuid_ptr) {
+        gf_msg(this->name, GF_LOG_ERROR, ENOMEM, GD_MSG_NO_MEMORY,
+               "Out Of Memory");
+        ret = -1;
+        goto out;
+    }
+
+    snprintf(key, sizeof(key), "vol1_volid");
+    gf_uuid_generate(*uuid_ptr);
+    ret = dict_set_bin(dict, key, uuid_ptr, sizeof(uuid_t));
+    if (ret) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_DICT_SET_FAILED,
+               "Unable to set clone_volid");
+        GF_FREE(uuid_ptr);
+        goto out;
+    }
+    /* NOTE(review): i is never changed from 0 and snap_volname is never
+     * filled in here, so this stores an empty string under
+     * "clone-volname0" - confirm whether any consumer relies on it. */
+    snprintf(key, sizeof(key), "clone-volname%d", i);
+    ret = dict_set_dynstr_with_alloc(dict, key, snap_volname);
+    if (ret) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_DICT_SET_FAILED,
+               "Unable to set snap volname");
+        /* uuid_ptr was already stored in the dict by the successful
+         * dict_set_bin() above; freeing it here would leave the dict
+         * holding a dangling pointer, so it must not be released. */
+        goto out;
+    }
+
+    ret = glusterd_mgmt_v3_initiate_snap_phases(req, op, dict);
+    if (ret) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_SNAP_INIT_FAIL,
+               "Failed to initiate "
+               "snap phases");
+    }
+
+out:
+    return ret;
+}
+
+/* Snapshot restore handler, executed on the originator node.
+ *
+ * Looks up the snapshot named by "snapname" in @dict, records the parent
+ * volume of each of its volumes as "volname<N>" (N counting from 1)
+ * together with "volcount", and then hands the request to the mgmt_v3
+ * framework.
+ *
+ * @param req           RPC request object
+ * @param op            gluster operation
+ * @param dict          dictionary containing snapshot restore request
+ * @param err_str       populated when the snapshot does not exist
+ * @param op_errno      set to EG_NOSNAP when the snapshot does not exist
+ * @param len           length of err_str buffer
+ *
+ * @return              Non-zero value on Failure and 0 in success
+ */
+int
+glusterd_handle_snapshot_restore(rpcsvc_request_t *req, glusterd_op_t op,
+                                 dict_t *dict, char *err_str,
+                                 uint32_t *op_errno, size_t len)
+{
+    xlator_t *this = NULL;
+    glusterd_conf_t *conf = NULL;
+    glusterd_snap_t *snap = NULL;
+    glusterd_volinfo_t *snap_volinfo = NULL;
+    char *snapname = NULL;
+    char *parent_copy = NULL;
+    char key[64] = "";
+    int keylen = 0;
+    int32_t volcount = 0;
+    int ret = -1;
+
+    this = THIS;
+    GF_ASSERT(this);
+    conf = this->private;
+
+    GF_ASSERT(conf);
+    GF_ASSERT(req);
+    GF_ASSERT(dict);
+    GF_ASSERT(err_str);
+
+    ret = dict_get_strn(dict, "snapname", SLEN("snapname"), &snapname);
+    if (ret != 0) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_DICT_GET_FAILED,
+               "Failed to "
+               "get snapname");
+        return ret;
+    }
+
+    snap = glusterd_find_snap_by_name(snapname);
+    if (snap == NULL) {
+        snprintf(err_str, len, "Snapshot (%s) does not exist", snapname);
+        *op_errno = EG_NOSNAP;
+        gf_msg(this->name, GF_LOG_ERROR, EINVAL, GD_MSG_SNAP_NOT_FOUND, "%s",
+               err_str);
+        return -1;
+    }
+
+    list_for_each_entry(snap_volinfo, &snap->volumes, vol_list)
+    {
+        volcount++;
+        keylen = snprintf(key, sizeof(key), "volname%d", volcount);
+
+        parent_copy = gf_strdup(snap_volinfo->parent_volname);
+        if (parent_copy == NULL) {
+            return -1;
+        }
+
+        ret = dict_set_dynstrn(dict, key, keylen, parent_copy);
+        if (ret != 0) {
+            gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_DICT_SET_FAILED,
+                   "Could not set "
+                   "parent volume name %s in the dict",
+                   snap_volinfo->parent_volname);
+            /* the copy was not stored, so release it here */
+            GF_FREE(parent_copy);
+            return ret;
+        }
+        parent_copy = NULL;
+    }
+
+    ret = dict_set_int32n(dict, "volcount", SLEN("volcount"), volcount);
+    if (ret != 0) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_DICT_SET_FAILED,
+               "Could not save volume count");
+        return ret;
+    }
+
+    ret = glusterd_mgmt_v3_initiate_snap_phases(req, op, dict);
+    if (ret != 0) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_SNAP_INIT_FAIL,
+               "Failed to initiate snap phases");
+    }
+
+    return ret;
+}
+
+/* Build a snap object from the request in @dict, store it and add it to
+ * priv->snapshots.
+ *
+ * "snapname", "snap-id" and "snap-time" are required in @dict;
+ * "description" is optional. The request is rejected when an existing
+ * snap has the same name or the same id.
+ *
+ * @param dict      request dictionary the snap attributes are read from
+ * @param rsp_dict  passed to glusterd_snap_remove() when a partially
+ *                  built snap has to be torn down
+ *
+ * @return the new snap object, or NULL on any failure
+ */
+glusterd_snap_t *
+glusterd_create_snap_object(dict_t *dict, dict_t *rsp_dict)
+{
+    xlator_t *this = THIS;
+    glusterd_conf_t *priv = this->private;
+    glusterd_snap_t *snap = NULL;
+    glusterd_snap_t *existing = NULL;
+    char *snapname = NULL;
+    char *description = NULL;
+    uuid_t *snap_id = NULL;
+    int64_t time_stamp = 0;
+    int ret = -1;
+
+    GF_ASSERT(dict);
+    GF_ASSERT(rsp_dict);
+
+    /* Fetch snapname, description, id and time from dict */
+    ret = dict_get_strn(dict, "snapname", SLEN("snapname"), &snapname);
+    if (ret != 0) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_DICT_GET_FAILED,
+               "Unable to fetch snapname");
+        return NULL;
+    }
+
+    /* The description is optional, so a failed lookup is not an error */
+    (void)dict_get_strn(dict, "description", SLEN("description"),
+                        &description);
+
+    ret = dict_get_bin(dict, "snap-id", (void **)&snap_id);
+    if (ret != 0) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_DICT_GET_FAILED,
+               "Unable to fetch snap_id");
+        return NULL;
+    }
+
+    ret = dict_get_int64(dict, "snap-time", &time_stamp);
+    if (ret != 0) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_DICT_GET_FAILED,
+               "Unable to fetch snap-time");
+        return NULL;
+    }
+    if (time_stamp <= 0) {
+        gf_msg(this->name, GF_LOG_ERROR, EINVAL, GD_MSG_INVALID_ENTRY,
+               "Invalid time-stamp: %" PRId64, time_stamp);
+        return NULL;
+    }
+
+    /* Refuse to create a second snap with a known name or id */
+    cds_list_for_each_entry(existing, &priv->snapshots, snap_list)
+    {
+        if (strcmp(existing->snapname, snapname) == 0 ||
+            gf_uuid_compare(existing->snap_id, *snap_id) == 0) {
+            gf_msg(THIS->name, GF_LOG_ERROR, 0, GD_MSG_SNAP_CREATION_FAIL,
+                   "Found duplicate snap %s (%s)", existing->snapname,
+                   uuid_utoa(existing->snap_id));
+            return NULL;
+        }
+    }
+
+    snap = glusterd_new_snap_object();
+    if (snap == NULL) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_SNAP_CREATION_FAIL,
+               "Could not create "
+               "the snap object for snap %s",
+               snapname);
+        return NULL;
+    }
+
+    gf_strncpy(snap->snapname, snapname, sizeof(snap->snapname));
+    gf_uuid_copy(snap->snap_id, *snap_id);
+    snap->time_stamp = (time_t)time_stamp;
+    /* The snap starts out as GD_SNAP_STATUS_INIT and is only switched to
+       GD_SNAP_STATUS_IN_USE once the backend snapshot is taken and the
+       snap is really ready to use. This helps in identifying the
+       incomplete snapshots and cleaning them up.
+    */
+    snap->snap_status = GD_SNAP_STATUS_INIT;
+
+    if (description != NULL) {
+        snap->description = gf_strdup(description);
+        if (snap->description == NULL) {
+            gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_SNAP_CREATION_FAIL,
+                   "Saving the Snapshot Description Failed");
+            goto undo;
+        }
+    }
+
+    ret = glusterd_store_snap(snap);
+    if (ret != 0) {
+        gf_msg(this->name, GF_LOG_WARNING, 0, GD_MSG_SNAP_CREATION_FAIL,
+               "Could not store snap"
+               "object %s",
+               snap->snapname);
+        goto undo;
+    }
+
+    glusterd_list_add_order(&snap->snap_list, &priv->snapshots,
+                            glusterd_compare_snap_time);
+
+    gf_msg_trace(this->name, 0, "Snapshot %s added to the list",
+                 snap->snapname);
+
+    return snap;
+
+undo:
+    /* Tear down the partially built snap object */
+    glusterd_snap_remove(rsp_dict, snap, _gf_true, _gf_true, _gf_false);
+    return NULL;
+}
+
+/* Append one missed-snapshot record for @brickinfo to @rsp_dict.
+ *
+ * The record has the form
+ *   <brick uuid>:<snap uuid>=<snap volname>:<brick number>:<brick path>:<op>:<GD_MISSED_SNAP_PENDING>
+ * and is stored under "missed_snaps_<count>", where <count> is the current
+ * "missed_snap_count" of @rsp_dict (0 when the key is absent). The count
+ * is then incremented and written back.
+ *
+ * @return 0 on success, non-zero on failure
+ */
+int32_t
+glusterd_add_missed_snaps_to_dict(dict_t *rsp_dict,
+                                  glusterd_volinfo_t *snap_vol,
+                                  glusterd_brickinfo_t *brickinfo,
+                                  int32_t brick_number, int32_t op)
+{
+    xlator_t *this = THIS;
+    char entry[PATH_MAX] = "";
+    char entry_key[PATH_MAX] = "";
+    char *snap_uuid = NULL;
+    int32_t entry_count = -1;
+    int32_t written = 0;
+    int32_t ret = -1;
+
+    GF_ASSERT(this);
+    GF_ASSERT(rsp_dict);
+    GF_ASSERT(snap_vol);
+    GF_ASSERT(brickinfo);
+
+    /* Keep a private copy of the snap uuid string, since uuid_utoa() is
+     * called again while the record is formatted below. */
+    snap_uuid = gf_strdup(uuid_utoa(snap_vol->snapshot->snap_id));
+    if (snap_uuid == NULL) {
+        ret = -1;
+        goto out;
+    }
+
+    written = snprintf(entry, sizeof(entry), "%s:%s=%s:%d:%s:%d:%d",
+                       uuid_utoa(brickinfo->uuid), snap_uuid,
+                       snap_vol->volname, brick_number, brickinfo->path, op,
+                       GD_MISSED_SNAP_PENDING);
+    if ((written < 0) || (written >= sizeof(entry))) {
+        gf_smsg(this->name, GF_LOG_ERROR, 0, GD_MSG_COPY_FAIL, NULL);
+        goto out;
+    }
+
+    /* A missing counter means this is the first record: start from 0 */
+    ret = dict_get_int32n(rsp_dict, "missed_snap_count",
+                          SLEN("missed_snap_count"), &entry_count);
+    if (ret != 0) {
+        gf_smsg(this->name, GF_LOG_ERROR, 0, GD_MSG_DICT_GET_FAILED,
+                "Key=missed_snap_count", NULL);
+        entry_count = 0;
+    }
+
+    /* Store the record under the next free slot */
+    snprintf(entry_key, sizeof(entry_key), "missed_snaps_%d", entry_count);
+    ret = dict_set_dynstr_with_alloc(rsp_dict, entry_key, entry);
+    if (ret != 0) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_DICT_SET_FAILED,
+               "Failed to set missed_snap_entry (%s) "
+               "in the rsp_dict.",
+               entry);
+        goto out;
+    }
+    entry_count++;
+
+    /* Publish the updated counter */
+    ret = dict_set_int32n(rsp_dict, "missed_snap_count",
+                          SLEN("missed_snap_count"), entry_count);
+    if (ret != 0) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_DICT_SET_FAILED,
+               "Failed to set missed_snap_count for %s "
+               "in the rsp_dict.",
+               entry);
+        goto out;
+    }
+
+out:
+    if (snap_uuid != NULL)
+        GF_FREE(snap_uuid);
+
+    gf_msg_trace(this->name, 0, "Returning %d", ret);
+    return ret;
+}
+
+/* This function actually calls the command (LVM_CREATE) for taking the
+   snapshot of the backend brick filesystem.
+
+   Steps:
+     1. obtain the device for origin_brick_path from
+        glusterd_get_brick_mount_device()
+     2. run "LVM_CREATE --help" and scan its stdout for the string
+        "setactivationskip"
+     3. run "LVM_CREATE -s <origin device> [--setactivationskip n]
+        --name <brickinfo->device_path>", adding the flag only when
+        step 2 found it
+
+   @param brickinfo          brick whose device_path names the snapshot
+   @param origin_brick_path  path of the brick to take the snapshot of
+
+   @return -1 when the origin device cannot be determined, the result of
+           runner_start() when the help command cannot be started,
+           otherwise the result of runner_run() for the snapshot command
+*/
+int32_t
+glusterd_take_lvm_snapshot(glusterd_brickinfo_t *brickinfo,
+                           char *origin_brick_path)
+{
+    char msg[NAME_MAX] = "";
+    char buf[PATH_MAX] = "";
+    char *ptr = NULL;
+    char *origin_device = NULL;
+    int ret = -1;
+    gf_boolean_t match = _gf_false;
+    runner_t runner = {
+        0,
+    };
+    xlator_t *this = NULL;
+
+    this = THIS;
+    GF_ASSERT(this);
+    GF_ASSERT(brickinfo);
+    GF_ASSERT(origin_brick_path);
+
+    origin_device = glusterd_get_brick_mount_device(origin_brick_path);
+    if (!origin_device) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_BRICK_GET_INFO_FAIL,
+               "getting device name for "
+               "the brick %s failed",
+               origin_brick_path);
+        goto out;
+    }
+
+    /* Figuring out if setactivationskip flag is supported or not:
+     * the help output is redirected to a pipe so it can be read below */
+    runinit(&runner);
+    snprintf(msg, sizeof(msg), "running lvcreate help");
+    runner_add_args(&runner, LVM_CREATE, "--help", NULL);
+    runner_log(&runner, "", GF_LOG_DEBUG, msg);
+    runner_redir(&runner, STDOUT_FILENO, RUN_PIPE);
+    ret = runner_start(&runner);
+    if (ret) {
+        gf_msg(this->name, GF_LOG_ERROR, errno, GD_MSG_LVCREATE_FAIL,
+               "Failed to run lvcreate help");
+        runner_end(&runner);
+        goto out;
+    }
+
+    /* Looking for setactivationskip in lvcreate --help, one line at a
+     * time, stopping at the first match or when fgets() returns NULL */
+    do {
+        ptr = fgets(buf, sizeof(buf), runner_chio(&runner, STDOUT_FILENO));
+        if (ptr) {
+            if (strstr(buf, "setactivationskip")) {
+                match = _gf_true;
+                break;
+            }
+        }
+    } while (ptr != NULL);
+    runner_end(&runner);
+
+    /* Taking the actual snapshot; the runner is re-initialised and
+     * "--setactivationskip n" is passed only if the help text listed it */
+    runinit(&runner);
+    snprintf(msg, sizeof(msg), "taking snapshot of the brick %s",
+             origin_brick_path);
+    if (match == _gf_true)
+        runner_add_args(&runner, LVM_CREATE, "-s", origin_device,
+                        "--setactivationskip", "n", "--name",
+                        brickinfo->device_path, NULL);
+    else
+        runner_add_args(&runner, LVM_CREATE, "-s", origin_device, "--name",
+                        brickinfo->device_path, NULL);
+    runner_log(&runner, this->name, GF_LOG_DEBUG, msg);
+    ret = runner_run(&runner);
+    if (ret) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_SNAP_CREATION_FAIL,
+               "taking snapshot of the "
+               "brick (%s) of device %s failed",
+               origin_brick_path, origin_device);
+    }
+
+out:
+    if (origin_device)
+        GF_FREE(origin_device);
+
+    return ret;
+}
+
+/* Prepare the on-disk brick of a snapshot (or clone) volume: create the
+ * brick mount directory below snap_mount_dir, mount the snapshot device on
+ * it and replace the volume-id extended attribute of the brick path with
+ * the volume-id of @snap_volinfo.
+ *
+ * @param snap_volinfo  volinfo of the snap/clone volume
+ * @param brickinfo     brickinfo of the snap brick (path and device_path
+ *                      are read)
+ * @param brick_count   zero-based brick index; the mount directory is
+ *                      named brick<brick_count + 1>
+ * @param clone         non-zero for a clone: the hyphen-less volume-id is
+ *                      used as directory name instead of the volname
+ *
+ * @return 0 on success, non-zero on failure. On every failure path the
+ *         brick mount path is handed to glusterd_umount() before returning.
+ */
+int32_t
+glusterd_snap_brick_create(glusterd_volinfo_t *snap_volinfo,
+                           glusterd_brickinfo_t *brickinfo, int32_t brick_count,
+                           int32_t clone)
+{
+    xlator_t *this = THIS;
+    struct stat brick_stat = {
+        0,
+    };
+    char mount_path[PATH_MAX] = "";
+    char clone_uuid[64] = "";
+    const char *vol_dir = NULL;
+    int32_t path_len = 0;
+    int32_t ret = -1;
+
+    GF_ASSERT(snap_volinfo);
+    GF_ASSERT(brickinfo);
+
+    /* Pick the per-volume directory name first so that the mount path is
+     * assembled in a single place. */
+    if (clone) {
+        GLUSTERD_GET_UUID_NOHYPHEN(clone_uuid, snap_volinfo->volume_id);
+        vol_dir = clone_uuid;
+    } else {
+        vol_dir = snap_volinfo->volname;
+    }
+
+    path_len = snprintf(mount_path, sizeof(mount_path), "%s/%s/brick%d",
+                        snap_mount_dir, vol_dir, brick_count + 1);
+    if ((path_len < 0) || (path_len >= sizeof(mount_path))) {
+        /* Encoding error or truncated path: ret is still -1 here. */
+        goto out;
+    }
+
+    ret = mkdir_p(mount_path, 0755, _gf_true);
+    if (ret) {
+        gf_msg(this->name, GF_LOG_ERROR, errno, GD_MSG_DIR_OP_FAILED,
+               "creating the brick directory"
+               " %s for the snapshot %s(device: %s) failed",
+               mount_path, snap_volinfo->volname, brickinfo->device_path);
+        goto out;
+    }
+
+    /* Mount the snap logical device on the directory created above.
+     * The mount(2) equivalent would be
+     *     mount (device, mount_path, entry->mnt_type, MS_MGC_VAL, "nouuid");
+     * but for now the mount is done through the runner APIs.
+     */
+    ret = glusterd_mount_lvm_snapshot(brickinfo, mount_path);
+    if (ret) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_LVM_MOUNT_FAILED,
+               "Failed to mount lvm snapshot.");
+        goto out;
+    }
+
+    /* The brick path must be reachable once the device is mounted. */
+    ret = sys_stat(brickinfo->path, &brick_stat);
+    if (ret) {
+        gf_msg(this->name, GF_LOG_WARNING, errno, GD_MSG_FILE_OP_FAILED,
+               "stat of the brick %s"
+               "(brick mount: %s) failed (%s)",
+               brickinfo->path, mount_path, strerror(errno));
+        goto out;
+    }
+
+    /* XATTR_REPLACE: the attribute is expected to exist already and is
+     * overwritten with the 16-byte volume-id of the snap volume. */
+    ret = sys_lsetxattr(brickinfo->path, GF_XATTR_VOL_ID_KEY,
+                        snap_volinfo->volume_id, 16, XATTR_REPLACE);
+    if (-1 == ret) {
+        gf_msg(this->name, GF_LOG_ERROR, errno, GD_MSG_SET_XATTR_FAIL,
+               "Failed to set "
+               "extended attribute %s on %s. Reason: "
+               "%s, snap: %s",
+               GF_XATTR_VOL_ID_KEY, brickinfo->path, strerror(errno),
+               snap_volinfo->volname);
+        goto out;
+    }
+
+out:
+    if (ret) {
+        gf_msg(this->name, GF_LOG_WARNING, 0, GD_MSG_UMOUNTING_SNAP_BRICK,
+               "unmounting the snap brick"
+               " mount %s",
+               mount_path);
+        /* The umount2 system call does not clean up the mtab entry after
+         * the un-mount, so the external umount command is used instead. */
+        glusterd_umount(mount_path);
+    }
+
+    gf_msg_trace(this->name, 0, "Returning %d", ret);
+    return ret;
+}
+
+/* Build the brickinfo of one snapshot brick from the brickinfo of the
+ * matching origin brick and append it to the brick list of @snap_vol.
+ *
+ * The per-brick details (fstype, mount options, brick directory, snapshot
+ * device) are read from @dict using keys of the form
+ * "vol<volcount>.<name><brick_count>". Missing details mark the brick as a
+ * missed snapshot, which is recorded in @rsp_dict.
+ *
+ * @param dict                request dictionary (read, and updated with the
+ *                            origin brick path)
+ * @param rsp_dict            response dictionary receiving missed-snap info
+ * @param snap_vol            snap/clone volume the new brick is added to
+ * @param original_brickinfo  brick of the origin volume; its fstype and
+ *                            mnt_opts are refreshed from @dict when present
+ * @param volcount            volume index used in the dictionary keys
+ * @param brick_count         zero-based brick index used in the keys
+ * @param clone               non-zero when a clone is being created
+ *
+ * @return 0 on success; non-zero on failure, in which case the newly
+ *         allocated brickinfo is released with GF_FREE.
+ */
+static int32_t
+glusterd_add_brick_to_snap_volume(dict_t *dict, dict_t *rsp_dict,
+                                  glusterd_volinfo_t *snap_vol,
+                                  glusterd_brickinfo_t *original_brickinfo,
+                                  int64_t volcount, int32_t brick_count,
+                                  int clone)
+{
+    xlator_t *this = THIS;
+    glusterd_brickinfo_t *new_brick = NULL;
+    gf_boolean_t missed = _gf_false;
+    char key[64] = "";
+    char brick_path[PATH_MAX] = "";
+    char resolved[PATH_MAX] = "";
+    char clone_uuid[64] = "";
+    const char *vol_dir = NULL;
+    char *value = NULL;
+    char *brick_dir = NULL;
+    char *device = NULL;
+    int keylen = 0;
+    int32_t path_len = 0;
+    int32_t ret = -1;
+
+    GF_ASSERT(this);
+    GF_ASSERT(dict);
+    GF_ASSERT(rsp_dict);
+    GF_ASSERT(snap_vol);
+    GF_ASSERT(original_brickinfo);
+
+    /* Remember the origin brick path in the dict for later lookups. */
+    snprintf(key, sizeof(key), "vol%" PRId64 ".origin_brickpath%d", volcount,
+             brick_count);
+    ret = dict_set_dynstr_with_alloc(dict, key, original_brickinfo->path);
+    if (ret) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_DICT_SET_FAILED,
+               "Failed to set %s", key);
+        goto out;
+    }
+
+    ret = glusterd_brickinfo_new(&new_brick);
+    if (ret) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_BRICK_NEW_INFO_FAIL,
+               "initializing the brick for the snap "
+               "volume failed (snapname: %s)",
+               snap_vol->snapshot->snapname);
+        goto out;
+    }
+
+    /* File-system type: a missing entry is only noted as a missed snap on
+     * the originator node. */
+    keylen = snprintf(key, sizeof(key), "vol%" PRId64 ".fstype%d", volcount,
+                      brick_count);
+    ret = dict_get_strn(dict, key, keylen, &value);
+    if (ret) {
+        if (is_origin_glusterd(dict) == _gf_true)
+            missed = _gf_true;
+    } else {
+        /* Keep the origin brickinfo in sync with the snap brickinfo. */
+        gf_strncpy(original_brickinfo->fstype, value,
+                   sizeof(original_brickinfo->fstype));
+        gf_strncpy(new_brick->fstype, value, sizeof(new_brick->fstype));
+    }
+
+    /* Mount options: handled exactly like the file-system type. */
+    keylen = snprintf(key, sizeof(key), "vol%" PRId64 ".mnt_opts%d", volcount,
+                      brick_count);
+    ret = dict_get_strn(dict, key, keylen, &value);
+    if (ret) {
+        if (is_origin_glusterd(dict) == _gf_true)
+            missed = _gf_true;
+    } else {
+        gf_strncpy(original_brickinfo->mnt_opts, value,
+                   sizeof(original_brickinfo->mnt_opts));
+        gf_strncpy(new_brick->mnt_opts, value, sizeof(new_brick->mnt_opts));
+    }
+
+    keylen = snprintf(key, sizeof(key), "vol%" PRId64 ".brickdir%d", volcount,
+                      brick_count);
+    ret = dict_get_strn(dict, key, keylen, &brick_dir);
+    if (ret) {
+        /* The snapshot of this brick is pending. Fall back to the mount
+         * directory of the origin brick, which helps mapping the brick
+         * when the missed snapshot is recreated.
+         */
+        gf_msg(this->name, GF_LOG_WARNING, 0, GD_MSG_SNAP_NOT_FOUND,
+               "Unable to fetch "
+               "snap mount path(%s). Adding to missed_snap_list",
+               key);
+        new_brick->snap_status = -1;
+        brick_dir = original_brickinfo->mount_dir;
+
+        /* On the originator node, snaps missed on other nodes are added
+         * to the dict as well. */
+        if (is_origin_glusterd(dict) == _gf_true)
+            missed = _gf_true;
+    }
+
+    /* A local brick that went down after prevalidate is a missed snap
+     * too. */
+    if (new_brick->snap_status != -1 &&
+        gf_uuid_compare(original_brickinfo->uuid, MY_UUID) == 0 &&
+        glusterd_is_brick_started(original_brickinfo) == 0) {
+        gf_msg(this->name, GF_LOG_WARNING, 0, GD_MSG_BRICK_DISCONNECTED,
+               "brick %s:%s is not"
+               " started (snap: %s)",
+               original_brickinfo->hostname, original_brickinfo->path,
+               snap_vol->snapshot->snapname);
+
+        new_brick->snap_status = -1;
+        missed = _gf_true;
+    }
+
+    if (missed) {
+        ret = glusterd_add_missed_snaps_to_dict(
+            rsp_dict, snap_vol, original_brickinfo, brick_count + 1,
+            GF_SNAP_OPTION_TYPE_CREATE);
+        if (ret) {
+            gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_MISSEDSNAP_INFO_SET_FAIL,
+                   "Failed to add missed"
+                   " snapshot info for %s:%s in the rsp_dict",
+                   original_brickinfo->hostname, original_brickinfo->path);
+            goto out;
+        }
+    }
+
+    /* The brick path has the form
+     *   <snap_mount_dir>/<volume dir>/brick<N><brick dir>
+     * where the volume dir is the hyphen-less volume-id for a clone and
+     * the volname of the snap volume otherwise.
+     */
+    if (clone) {
+        GLUSTERD_GET_UUID_NOHYPHEN(clone_uuid, snap_vol->volume_id);
+        vol_dir = clone_uuid;
+    } else {
+        vol_dir = snap_vol->volname;
+    }
+    path_len = snprintf(brick_path, sizeof(brick_path), "%s/%s/brick%d%s",
+                        snap_mount_dir, vol_dir, brick_count + 1, brick_dir);
+    if ((path_len < 0) || (path_len >= sizeof(brick_path))) {
+        ret = -1;
+        goto out;
+    }
+
+    keylen = snprintf(key, sizeof(key), "vol%" PRId64 ".brick_snapdevice%d",
+                      volcount, brick_count);
+    ret = dict_get_strn(dict, key, keylen, &device);
+    if (!ret) {
+        gf_strncpy(new_brick->device_path, device,
+                   sizeof(new_brick->device_path));
+    } else {
+        /* Not fatal: with an empty device name the brick path is empty
+         * as well, so the missed snap has already been recorded above. */
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_SNAP_NOT_FOUND,
+               "Unable to fetch "
+               "snap device (%s). Leaving empty",
+               key);
+    }
+
+    ret = gf_canonicalize_path(brick_path);
+    if (ret) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_CANONICALIZE_FAIL,
+               "Failed to canonicalize path");
+        goto out;
+    }
+
+    gf_strncpy(new_brick->hostname, original_brickinfo->hostname,
+               sizeof(new_brick->hostname));
+    gf_strncpy(new_brick->path, brick_path, sizeof(new_brick->path));
+
+    /* ENOENT only means that the brick path has not been created yet,
+     * which is a valid scenario; every other error is fatal. */
+    if (!realpath(brick_path, resolved) && errno != ENOENT) {
+        gf_msg(this->name, GF_LOG_CRITICAL, errno,
+               GD_MSG_BRICKINFO_CREATE_FAIL,
+               "realpath () "
+               "failed for brick %s. The underlying filesystem"
+               " may be in bad state",
+               brick_path);
+        ret = -1;
+        goto out;
+    }
+    gf_strncpy(new_brick->real_path, resolved, sizeof(new_brick->real_path));
+
+    gf_strncpy(new_brick->mount_dir, original_brickinfo->mount_dir,
+               sizeof(new_brick->mount_dir));
+    gf_uuid_copy(new_brick->uuid, original_brickinfo->uuid);
+    cds_list_add_tail(&new_brick->brick_list, &snap_vol->bricks);
+
+    /* AFR changelog names are based on brick_id, hence the bricks of a
+     * snap volume must retain the ID of the origin brick. A clone gets a
+     * brick ID of its own. */
+    if (!clone) {
+        gf_strncpy(new_brick->brick_id, original_brickinfo->brick_id,
+                   sizeof(new_brick->brick_id));
+    } else {
+        GLUSTERD_ASSIGN_BRICKID_TO_BRICKINFO(new_brick, snap_vol,
+                                             brick_count);
+    }
+
+out:
+    if (ret && new_brick)
+        GF_FREE(new_brick);
+
+    gf_msg_trace(this->name, 0, "Returning %d", ret);
+    return ret;
+}
+
+/* Give the file-system on the backend device of a snapshot brick a new,
+ * randomly generated label.
+ *
+ * The label is derived from a freshly generated UUID (hyphens removed) and
+ * truncated to the label size of the file-system. Only xfs (xfs_admin) and
+ * ext2/ext3/ext4 (tune2fs) are supported.
+ *
+ * @param brickinfo     brickinfo of the snap volume (fstype, path and
+ *                      device_path are read)
+ *
+ * @return 0 on success, -1 for an unsupported file-system, otherwise the
+ *         non-zero result of running the label tool
+ */
+int
+glusterd_update_fs_label(glusterd_brickinfo_t *brickinfo)
+{
+    xlator_t *this = THIS;
+    runner_t runner = {
+        0,
+    };
+    uuid_t uuid = {
+        0,
+    };
+    char msg[PATH_MAX] = "";
+    char label[NAME_MAX] = "";
+    const char *fs_tool = NULL;
+    size_t label_size = 0;
+    int32_t len = 0;
+    int32_t ret = -1;
+
+    GF_ASSERT(this);
+    GF_ASSERT(brickinfo);
+
+    /* The new label is a fresh UUID without hyphens. */
+    gf_uuid_generate(uuid);
+    GLUSTERD_GET_UUID_NOHYPHEN(label, uuid);
+
+    runinit(&runner);
+
+    /* Select the file-system specific tool and the label size. */
+    if (strcmp(brickinfo->fstype, "xfs") == 0) {
+        /* An XFS label holds 12 bytes. */
+        fs_tool = "xfs_admin";
+        label_size = 12;
+    } else if (strcmp(brickinfo->fstype, "ext4") == 0 ||
+               strcmp(brickinfo->fstype, "ext3") == 0 ||
+               strcmp(brickinfo->fstype, "ext2") == 0) {
+        /* An ext2/ext3/ext4 label holds 16 bytes. */
+        fs_tool = "tune2fs";
+        label_size = 16;
+    } else {
+        gf_msg(this->name, GF_LOG_WARNING, EOPNOTSUPP, GD_MSG_OP_UNSUPPORTED,
+               "Changing file-system "
+               "label of %s file-system is not supported as of now",
+               brickinfo->fstype);
+        runner_end(&runner);
+        return -1;
+    }
+
+    /* Truncate the label to what the file-system can store. */
+    label[label_size] = '\0';
+
+    len = snprintf(msg, sizeof(msg),
+                   "Changing filesystem label "
+                   "of %s brick to %s",
+                   brickinfo->path, label);
+    if (len < 0) {
+        strcpy(msg, "<error>");
+    }
+
+    /* Both tools take the new label through the -L option. */
+    runner_add_args(&runner, fs_tool, "-L", label, brickinfo->device_path,
+                    NULL);
+    runner_log(&runner, this->name, GF_LOG_DEBUG, msg);
+
+    ret = runner_run(&runner);
+    if (ret) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_FS_LABEL_UPDATE_FAIL,
+               "Failed to change "
+               "filesystem label of %s brick to %s",
+               brickinfo->path, label);
+    }
+
+    return ret;
+}
+
+/* Take the LVM snapshot of a single brick and prepare the snapshot brick.
+ *
+ * The origin brick path is looked up in @dict under
+ * "vol<volcount>.origin_brickpath<brick_count>". After the LVM snapshot is
+ * taken, the file-system label of the snapshot device is regenerated
+ * (best effort) and, for clones or when activate-on-create is configured,
+ * the snapshot brick is created and mounted.
+ *
+ * @param dict         dictionary holding the origin brick path
+ * @param snap_vol     snap/clone volume the brick belongs to
+ * @param brickinfo    brickinfo of the snapshot brick; device_path must
+ *                     not be empty
+ * @param volcount     volume index used in the dictionary key
+ * @param brick_count  zero-based brick index used in the dictionary key
+ * @param clone        non-zero when a clone is being created
+ *
+ * @return 0 on success, non-zero on failure. A failure to update the
+ *         file-system label alone is not reported as a failure.
+ */
+static int32_t
+glusterd_take_brick_snapshot(dict_t *dict, glusterd_volinfo_t *snap_vol,
+                             glusterd_brickinfo_t *brickinfo, int32_t volcount,
+                             int32_t brick_count, int32_t clone)
+{
+    char *origin_brick_path = NULL;
+    char key[64] = "";
+    int keylen;
+    int32_t ret = -1;
+    gf_boolean_t snap_activate = _gf_false;
+    xlator_t *this = NULL;
+    glusterd_conf_t *priv = NULL;
+
+    this = THIS;
+    /* Validate 'this' before it is dereferenced for the private data. */
+    GF_ASSERT(this);
+    priv = this->private;
+    GF_ASSERT(priv);
+    GF_ASSERT(dict);
+    GF_ASSERT(snap_vol);
+    GF_ASSERT(brickinfo);
+
+    if (strlen(brickinfo->device_path) == 0) {
+        gf_msg(this->name, GF_LOG_ERROR, EINVAL, GD_MSG_INVALID_ENTRY,
+               "Device path is empty "
+               "brick %s:%s",
+               brickinfo->hostname, brickinfo->path);
+        ret = -1;
+        goto out;
+    }
+
+    keylen = snprintf(key, sizeof(key), "vol%d.origin_brickpath%d", volcount,
+                      brick_count);
+    ret = dict_get_strn(dict, key, keylen, &origin_brick_path);
+    if (ret) {
+        gf_msg(this->name, GF_LOG_WARNING, 0, GD_MSG_DICT_GET_FAILED,
+               "Unable to fetch "
+               "brick path (%s)",
+               key);
+        goto out;
+    }
+
+    ret = glusterd_take_lvm_snapshot(brickinfo, origin_brick_path);
+    if (ret) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_SNAP_CREATION_FAIL,
+               "Failed to take snapshot of "
+               "brick %s:%s",
+               brickinfo->hostname, origin_brick_path);
+        goto out;
+    }
+
+    /* After the snapshot both the origin brick (LVM brick) and
+     * the snapshot brick will have the same file-system label. This
+     * will cause lot of problems at mount time. Therefore we must
+     * generate a new label for the snapshot brick
+     */
+    ret = glusterd_update_fs_label(brickinfo);
+    if (ret) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_FS_LABEL_UPDATE_FAIL,
+               "Failed to update "
+               "file-system label for %s brick",
+               brickinfo->path);
+        /* Failing to update label should not cause snapshot failure.
+         * Currently label is updated only for XFS and ext2/ext3/ext4
+         * file-system. Reset ret so that the label failure is not
+         * returned when no brick has to be created below.
+         */
+        ret = 0;
+    }
+
+    /* create the complete brick here in case of clone and
+     * activate-on-create configuration.
+     */
+    snap_activate = dict_get_str_boolean(
+        priv->opts, GLUSTERD_STORE_KEY_SNAP_ACTIVATE, _gf_false);
+    if (clone || snap_activate) {
+        ret = glusterd_snap_brick_create(snap_vol, brickinfo, brick_count,
+                                         clone);
+        if (ret) {
+            gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_BRICK_CREATION_FAIL,
+                   "not able to "
+                   "create the brick for the snap %s, volume %s",
+                   snap_vol->snapshot->snapname, snap_vol->volname);
+            goto out;
+        }
+    }
+
+out:
+    gf_msg_trace(this->name, 0, "Returning %d", ret);
+    return ret;
+}
+
+/* Remove from the dict of @volinfo every option listed in
+ * @unsupported_opt, keeping a private copy of each removed value in the
+ * matching table entry.
+ *
+ * @param volinfo          volume whose options are cleared
+ * @param unsupported_opt  table of options, terminated by an entry with a
+ *                         NULL key; the value field of each option found
+ *                         on the volume receives a gf_strdup'ed copy
+ *
+ * @return 0 on success, -1 when @volinfo is NULL or a value could not be
+ *         duplicated
+ */
+static int
+glusterd_snap_clear_unsupported_opt(
+    glusterd_volinfo_t *volinfo,
+    struct gd_snap_unsupported_opt_t *unsupported_opt)
+{
+    struct gd_snap_unsupported_opt_t *opt = NULL;
+    int ret = -1;
+
+    GF_VALIDATE_OR_GOTO("glusterd", volinfo, out);
+
+    for (opt = unsupported_opt; opt->key; opt++) {
+        glusterd_volinfo_get(volinfo, opt->key, &opt->value);
+        if (!opt->value)
+            continue;
+
+        /* Take a copy before the entry is deleted from the dict. */
+        opt->value = gf_strdup(opt->value);
+        if (!opt->value) {
+            ret = -1;
+            goto out;
+        }
+        dict_del(volinfo->dict, opt->key);
+    }
+
+    ret = 0;
+out:
+    return ret;
+}
+
+/* Put the option values saved in @unsupported_opt back into the dict of
+ * @volinfo.
+ *
+ * @param volinfo          volume whose options are restored
+ * @param unsupported_opt  table of options, terminated by an entry with a
+ *                         NULL key; entries without a saved value are
+ *                         skipped, and the value field of every restored
+ *                         entry is reset to NULL
+ *
+ * @return 0 on success, -1 when @volinfo is NULL, otherwise the non-zero
+ *         result of the failed dict_set_dynstr()
+ */
+static int
+glusterd_snap_set_unsupported_opt(
+    glusterd_volinfo_t *volinfo,
+    struct gd_snap_unsupported_opt_t *unsupported_opt)
+{
+    struct gd_snap_unsupported_opt_t *opt = NULL;
+    int ret = -1;
+
+    GF_VALIDATE_OR_GOTO("glusterd", volinfo, out);
+
+    for (opt = unsupported_opt; opt->key; opt++) {
+        if (opt->value) {
+            ret = dict_set_dynstr(volinfo->dict, opt->key, opt->value);
+            if (ret) {
+                gf_msg("glusterd", GF_LOG_ERROR, errno, GD_MSG_DICT_SET_FAILED,
+                       "dict set failed");
+                goto out;
+            }
+            /* The value was handed over to the dict; drop our reference. */
+            opt->value = NULL;
+        }
+    }
+
+    ret = 0;
+out:
+    return ret;
+}
+
+/* Create the volinfo of a snapshot (or clone) volume from @origin_vol.
+ *
+ * The origin volinfo is duplicated, the volume-id, username and password
+ * passed in @dict are applied, one snap brick is added per origin brick,
+ * the volinfo is stored and the brick and client volfiles are generated.
+ * For snapshot volumes the quota related options are removed while the
+ * volfiles are generated and restored afterwards.
+ *
+ * @param origin_vol  volume the snapshot/clone is taken from
+ * @param snap        snapshot object the new volume belongs to
+ * @param dict        request dictionary ("volume<N>_username",
+ *                    "volume<N>_password", "vol<N>_volid", "clonename" and
+ *                    the per-brick keys are read)
+ * @param rsp_dict    response dictionary (missed snaps, geo-rep info)
+ * @param volcount    volume index used in the dictionary keys
+ * @param clone       non-zero when a clone is being created
+ *
+ * @return the new volinfo on success; NULL on failure, after the partially
+ *         created snap volume has been removed
+ */
+glusterd_volinfo_t *
+glusterd_do_snap_vol(glusterd_volinfo_t *origin_vol, glusterd_snap_t *snap,
+                     dict_t *dict, dict_t *rsp_dict, int64_t volcount,
+                     int clone)
+{
+    char key[64] = "";
+    int keylen;
+    char *username = NULL;
+    char *password = NULL;
+    glusterd_brickinfo_t *brickinfo = NULL;
+    glusterd_conf_t *priv = NULL;
+    glusterd_volinfo_t *snap_vol = NULL;
+    uuid_t *snap_volid = NULL;
+    int32_t ret = -1;
+    int32_t brick_count = 0;
+    xlator_t *this = NULL;
+    char *clonename = NULL;
+    gf_boolean_t conf_present = _gf_false;
+    int i = 0;
+
+    /* Options that are removed from a snapshot volume while its volfiles
+     * are generated; .value holds the saved setting in the meantime. */
+    struct gd_snap_unsupported_opt_t unsupported_opt[] = {
+        {.key = VKEY_FEATURES_QUOTA, .value = NULL},
+        {.key = VKEY_FEATURES_INODE_QUOTA, .value = NULL},
+        {.key = "feature.deem-statfs", .value = NULL},
+        {.key = "features.quota-deem-statfs", .value = NULL},
+        {.key = NULL, .value = NULL}};
+
+    this = THIS;
+    GF_ASSERT(this);
+
+    priv = this->private;
+    GF_ASSERT(priv);
+    GF_ASSERT(dict);
+    GF_ASSERT(origin_vol);
+    GF_ASSERT(rsp_dict);
+
+    /* fetch username, password and vol_id from dict*/
+    keylen = snprintf(key, sizeof(key), "volume%" PRId64 "_username", volcount);
+    ret = dict_get_strn(dict, key, keylen, &username);
+    if (ret) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_DICT_GET_FAILED,
+               "Failed to get %s for "
+               "snap %s",
+               key, snap->snapname);
+        goto out;
+    }
+    keylen = snprintf(key, sizeof(key), "volume%" PRId64 "_password", volcount);
+    ret = dict_get_strn(dict, key, keylen, &password);
+    if (ret) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_DICT_GET_FAILED,
+               "Failed to get %s for "
+               "snap %s",
+               key, snap->snapname);
+        goto out;
+    }
+
+    snprintf(key, sizeof(key), "vol%" PRId64 "_volid", volcount);
+    ret = dict_get_bin(dict, key, (void **)&snap_volid);
+    if (ret) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_DICT_GET_FAILED,
+               "Unable to fetch snap_volid");
+        goto out;
+    }
+
+    /* We are not setting the username and password here as
+     * we need to set the user name and password passed in
+     * the dictionary
+     */
+    ret = glusterd_volinfo_dup(origin_vol, &snap_vol, _gf_false);
+    if (ret) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_VOL_OP_FAILED,
+               "Failed to duplicate volinfo "
+               "for the snapshot %s",
+               snap->snapname);
+        goto out;
+    }
+
+    /* uuid is used as lvm snapshot name.
+       This will avoid restrictions on snapshot names provided by user */
+    gf_uuid_copy(snap_vol->volume_id, *snap_volid);
+    snap_vol->is_snap_volume = _gf_true;
+    snap_vol->snapshot = snap;
+
+    if (clone) {
+        snap_vol->is_snap_volume = _gf_false;
+        ret = dict_get_strn(dict, "clonename", SLEN("clonename"), &clonename);
+        if (ret) {
+            /* 'key' still holds the volid key at this point, so name the
+             * key that was actually looked up. */
+            gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_DICT_GET_FAILED,
+                   "Failed to get %s "
+                   "for snap %s",
+                   "clonename", snap->snapname);
+            goto out;
+        }
+        cds_list_add_tail(&snap_vol->vol_list, &snap->volumes);
+        gf_strncpy(snap_vol->volname, clonename, sizeof(snap_vol->volname));
+        gf_uuid_copy(snap_vol->restored_from_snap,
+                     origin_vol->snapshot->snap_id);
+
+    } else {
+        GLUSTERD_GET_UUID_NOHYPHEN(snap_vol->volname, *snap_volid);
+        gf_strncpy(snap_vol->parent_volname, origin_vol->volname,
+                   sizeof(snap_vol->parent_volname));
+        ret = glusterd_list_add_snapvol(origin_vol, snap_vol);
+        if (ret) {
+            gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_SNAP_LIST_SET_FAIL,
+                   "could not add the "
+                   "snap volume %s to the list",
+                   snap_vol->volname);
+            goto out;
+        }
+        /* TODO : Sync before taking a snapshot */
+        /* Copy the status and config files of geo-replication before
+         * taking a snapshot. During restore operation these files needs
+         * to be copied back in /var/lib/glusterd/georeplication/
+         */
+        ret = glusterd_copy_geo_rep_files(origin_vol, snap_vol, rsp_dict);
+        if (ret) {
+            gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_VOL_OP_FAILED,
+                   "Failed to copy "
+                   "geo-rep config and status files for volume %s",
+                   origin_vol->volname);
+            goto out;
+        }
+    }
+
+    glusterd_auth_set_username(snap_vol, username);
+    glusterd_auth_set_password(snap_vol, password);
+
+    /* Adding snap brickinfos to the snap volinfo */
+    brick_count = 0;
+    cds_list_for_each_entry(brickinfo, &origin_vol->bricks, brick_list)
+    {
+        ret = glusterd_add_brick_to_snap_volume(
+            dict, rsp_dict, snap_vol, brickinfo, volcount, brick_count, clone);
+        if (ret) {
+            gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_BRICK_ADD_FAIL,
+                   "Failed to add the snap brick for "
+                   "%s:%s to the snap volume",
+                   brickinfo->hostname, brickinfo->path);
+            goto out;
+        }
+        brick_count++;
+    }
+
+    /* During snapshot creation if I/O is in progress,
+     * then barrier value is enabled. Hence during snapshot create
+     * and in-turn snapshot restore the barrier value is set to enable.
+     * Because of this further I/O on the mount point fails.
+     * Hence remove the barrier key from newly created snap volinfo
+     * before storing and generating the brick volfiles. Also update
+     * the snap vol's version after removing the barrier key.
+     */
+    dict_deln(snap_vol->dict, "features.barrier", SLEN("features.barrier"));
+    gd_update_volume_op_versions(snap_vol);
+
+    /* *
+     * Create the export file from the node where ganesha.enable "on"
+     * is executed
+     * */
+    /* NOTE(review): clonename is only fetched when 'clone' is set, so it is
+     * NULL here for a plain snapshot - confirm the callees accept that. */
+    if (glusterd_is_ganesha_cluster() &&
+        glusterd_check_ganesha_export(snap_vol)) {
+        if (is_origin_glusterd(dict)) {
+            ret = manage_export_config(clonename, "on", NULL);
+            if (ret) {
+                gf_msg(this->name, GF_LOG_ERROR, 0,
+                       GD_MSG_EXPORT_FILE_CREATE_FAIL,
+                       "Failed to create"
+                       "export file for NFS-Ganesha\n");
+                goto out;
+            }
+        }
+
+        /* Do not let the next call overwrite a failure of this one. */
+        ret = dict_set_dynstr_with_alloc(snap_vol->dict,
+                                         "features.cache-invalidation", "on");
+        if (ret) {
+            gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_DICT_SET_FAILED,
+                   "Failed to set features.cache-invalidation "
+                   "on volume %s",
+                   snap_vol->volname);
+            goto out;
+        }
+
+        ret = gd_ganesha_send_dbus(clonename, "on");
+        if (ret) {
+            gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_EXPORT_FILE_CREATE_FAIL,
+                   "Dynamic export addition/deletion failed."
+                   " Please see log file for details. Clone name = %s",
+                   clonename);
+            goto out;
+        }
+    }
+    if (!glusterd_is_ganesha_cluster() &&
+        glusterd_check_ganesha_export(snap_vol)) {
+        /* This happens when a snapshot was created when Ganesha was
+         * enabled globally. Then Ganesha disabled from the cluster.
+         * In such cases, we will have the volume level option set
+         * on dict, So we have to disable it as it doesn't make sense
+         * to keep the option.
+         */
+
+        /* The value is a string literal, which must not be handed to
+         * dict_set_dynstr(): the dict would try to free it. Store an
+         * allocated copy instead. */
+        ret = dict_set_dynstr_with_alloc(snap_vol->dict, "ganesha.enable",
+                                         "off");
+        if (ret) {
+            gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_DICT_SET_FAILED,
+                   "Failed to set ganesha.enable on volume %s",
+                   snap_vol->volname);
+            goto out;
+        }
+    }
+
+    ret = glusterd_store_volinfo(snap_vol, GLUSTERD_VOLINFO_VER_AC_INCREMENT);
+    if (ret) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_VOLINFO_SET_FAIL,
+               "Failed to store snapshot "
+               "volinfo (%s) for snap %s",
+               snap_vol->volname, snap->snapname);
+        goto out;
+    }
+
+    ret = glusterd_copy_quota_files(origin_vol, snap_vol, &conf_present);
+    if (ret) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_SNAP_VOL_CONFIG_FAIL,
+               "Failed to copy quota "
+               "config and cksum for volume %s",
+               origin_vol->volname);
+        goto out;
+    }
+
+    /* The unsupported options are taken out only while the volfiles are
+     * generated; they are put back at reset_option. */
+    if (snap_vol->is_snap_volume) {
+        ret = glusterd_snap_clear_unsupported_opt(snap_vol, unsupported_opt);
+        if (ret) {
+            gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_VOL_OP_FAILED,
+                   "Failed to clear quota "
+                   "option for the snap %s (volume: %s)",
+                   snap->snapname, origin_vol->volname);
+            goto out;
+        }
+    }
+
+    ret = generate_brick_volfiles(snap_vol);
+    if (ret) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_VOLFILE_CREATE_FAIL,
+               "generating the brick "
+               "volfiles for the snap %s (volume: %s) failed",
+               snap->snapname, origin_vol->volname);
+        goto reset_option;
+    }
+
+    ret = generate_client_volfiles(snap_vol, GF_CLIENT_TRUSTED);
+    if (ret) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_VOLFILE_CREATE_FAIL,
+               "generating the trusted "
+               "client volfiles for the snap %s (volume: %s) failed",
+               snap->snapname, origin_vol->volname);
+        goto reset_option;
+    }
+
+    ret = generate_client_volfiles(snap_vol, GF_CLIENT_OTHER);
+    if (ret) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_VOLFILE_CREATE_FAIL,
+               "generating the client "
+               "volfiles for the snap %s (volume: %s) failed",
+               snap->snapname, origin_vol->volname);
+        goto reset_option;
+    }
+
+reset_option:
+    if (snap_vol->is_snap_volume) {
+        if (glusterd_snap_set_unsupported_opt(snap_vol, unsupported_opt)) {
+            ret = -1;
+            gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_VOL_OP_FAILED,
+                   "Failed to reset quota "
+                   "option for the snap %s (volume: %s)",
+                   snap->snapname, origin_vol->volname);
+        }
+    }
+out:
+    if (ret) {
+        /* Release the option values that were saved but not handed back
+         * to the dict. */
+        for (i = 0; unsupported_opt[i].key; i++)
+            GF_FREE(unsupported_opt[i].value);
+
+        if (snap_vol) {
+            if (glusterd_is_ganesha_cluster() &&
+                glusterd_check_ganesha_export(snap_vol)) {
+                if (is_origin_glusterd(dict)) {
+                    ret = manage_export_config(clonename, "on", NULL);
+                    if (ret) {
+                        gf_msg(this->name, GF_LOG_ERROR, 0,
+                               GD_MSG_EXPORT_FILE_CREATE_FAIL,
+                               "Failed to create"
+                               "export file for NFS-Ganesha\n");
+                    }
+                }
+
+                ret = gd_ganesha_send_dbus(clonename, "off");
+                if (ret) {
+                    gf_msg(this->name, GF_LOG_ERROR, 0,
+                           GD_MSG_EXPORT_FILE_CREATE_FAIL,
+                           "Dynamic export addition/deletion failed."
+                           " Please see log file for details. Clone name = %s",
+                           clonename);
+                }
+            }
+
+            glusterd_snap_volume_remove(rsp_dict, snap_vol, _gf_true, _gf_true);
+        }
+        /* The failure is reported to the caller through the NULL return. */
+        snap_vol = NULL;
+    }
+
+    return snap_vol;
+}
+
+/* Prevalidate function shared by snapshot activate and deactivate.
+ * For the activate operation pass is_op_activate as _gf_true.
+ * For the deactivate operation pass is_op_activate as _gf_false.
+ *
+ * @param dict            request dictionary; "snapname" is read from it
+ *                        and, for activate only, "flags"
+ * @param op_errstr       on failure receives a gf_strdup()'d copy of the
+ *                        error text, when one was composed
+ * @param op_errno        set to EG_NOSNAP when the snapshot is unknown and
+ *                        to EINVAL when it is already in the requested state
+ * @param rsp_dict        not used by this function
+ * @param is_op_activate  _gf_true for activate, _gf_false for deactivate
+ *
+ * @return 0 when the operation may proceed, non-zero otherwise
+ */
+int
+glusterd_snapshot_activate_deactivate_prevalidate(dict_t *dict,
+                                                  char **op_errstr,
+                                                  uint32_t *op_errno,
+                                                  dict_t *rsp_dict,
+                                                  gf_boolean_t is_op_activate)
+{
+    int32_t ret = -1;
+    char *snapname = NULL;
+    xlator_t *this = NULL;
+    glusterd_snap_t *snap = NULL;
+    glusterd_volinfo_t *snap_volinfo = NULL;
+    char err_str[PATH_MAX] = "";
+    gf_loglevel_t loglevel = GF_LOG_ERROR;
+    glusterd_volume_status volume_status = GLUSTERD_STATUS_STOPPED;
+    int flags = 0;
+
+    this = THIS;
+    GF_VALIDATE_OR_GOTO("glusterd", this, out);
+    GF_VALIDATE_OR_GOTO(this->name, op_errno, out);
+
+    if (!dict || !op_errstr) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_INVALID_ENTRY,
+               "input parameters NULL");
+        goto out;
+    }
+
+    ret = dict_get_strn(dict, "snapname", SLEN("snapname"), &snapname);
+    if (ret) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_DICT_GET_FAILED,
+               "Getting the snap name "
+               "failed");
+        goto out;
+    }
+
+    snap = glusterd_find_snap_by_name(snapname);
+    if (!snap) {
+        snprintf(err_str, sizeof(err_str),
+                 "Snapshot (%s) does not "
+                 "exist.",
+                 snapname);
+        gf_smsg(this->name, GF_LOG_ERROR, 0, GD_MSG_SNAP_NOT_FOUND,
+                "Snapname=%s", snapname, NULL);
+        *op_errno = EG_NOSNAP;
+        ret = -1;
+        goto out;
+    }
+
+    /* The force flag is only meaningful for activation, so only fetch
+     * "flags" in that case. */
+    if (is_op_activate) {
+        ret = dict_get_int32n(dict, "flags", SLEN("flags"), &flags);
+        if (ret) {
+            gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_DICT_GET_FAILED,
+                   "Unable to get flags");
+            goto out;
+        }
+    }
+
+    /* TODO : As of now there is only one volume in a snapshot.
+     * Change this when multiple volume snapshot is introduced.
+     *
+     * cds_list_entry() is pointer arithmetic on the list node and never
+     * yields NULL, so a snapshot without volumes has to be detected on the
+     * list head before the first entry is taken.
+     */
+    if (cds_list_empty(&snap->volumes)) {
+        gf_msg(this->name, GF_LOG_ERROR, EINVAL, GD_MSG_VOLINFO_GET_FAIL,
+               "Unable to fetch snap_volinfo");
+        ret = -1;
+        goto out;
+    }
+    snap_volinfo = cds_list_entry(snap->volumes.next, glusterd_volinfo_t,
+                                  vol_list);
+
+    /* TODO: When multiple snap volumes are involved a cumulative
+     * logic is required to tell whether the snapshot is
+     * started/partially started/stopped */
+    if (is_op_activate) {
+        volume_status = GLUSTERD_STATUS_STARTED;
+    }
+
+    /* volume_status now holds the state the request wants to reach; being
+     * in that state already is an error unless activation is forced. */
+    if (snap_volinfo->status == volume_status) {
+        if (is_op_activate) {
+            /* If GF_CLI_FLAG_OP_FORCE is set, try to start the snap
+             * volume even if its status is GLUSTERD_STATUS_STARTED.
+             * By doing so we try to bring back the brick processes
+             * that are down. ret is still 0 here, so the forced case
+             * leaves through "out" as success. */
+            if (!(flags & GF_CLI_FLAG_OP_FORCE)) {
+                snprintf(err_str, sizeof(err_str),
+                         "Snapshot %s is already activated.", snapname);
+                *op_errno = EINVAL;
+                ret = -1;
+            }
+        } else {
+            snprintf(err_str, sizeof(err_str),
+                     "Snapshot %s is already deactivated.", snapname);
+            *op_errno = EINVAL;
+            ret = -1;
+        }
+        goto out;
+    }
+    ret = 0;
+out:
+
+    /* Only failures that composed a message are logged and handed back. */
+    if (ret && err_str[0] != '\0' && op_errstr) {
+        gf_msg(this->name, loglevel, 0, GD_MSG_SNAPSHOT_OP_FAILED, "%s",
+               err_str);
+        *op_errstr = gf_strdup(err_str);
+    }
+
+    return ret;
+}
+
+/* Handler for the volume flavour of snapshot delete.
+ *
+ * Reads "volname" from dict, resolves it with glusterd_volinfo_find() and
+ * then calls glusterd_snapshot_get_vol_snapnames() with dict and the
+ * volinfo found.
+ *
+ * @param dict      request dictionary, also passed to the snapnames helper
+ * @param err_str   buffer that receives the "does not exist" message
+ * @param op_errno  set to EG_NOVOL when the volume is unknown
+ * @param len       size of the err_str buffer
+ *
+ * @return 0 on success, non-zero on failure
+ */
+int32_t
+glusterd_handle_snapshot_delete_vol(dict_t *dict, char *err_str,
+                                    uint32_t *op_errno, int len)
+{
+    xlator_t *this = THIS;
+    glusterd_volinfo_t *vol = NULL;
+    char *name = NULL;
+    int32_t ret = -1;
+
+    GF_ASSERT(this);
+    GF_ASSERT(dict);
+    GF_VALIDATE_OR_GOTO(this->name, op_errno, out);
+
+    ret = dict_get_strn(dict, "volname", SLEN("volname"), &name);
+    if (ret != 0) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_DICT_GET_FAILED,
+               "Failed to get volume name");
+        goto out;
+    }
+
+    ret = glusterd_volinfo_find(name, &vol);
+    if (ret != 0) {
+        *op_errno = EG_NOVOL;
+        snprintf(err_str, len, "Volume (%s) does not exist", name);
+        gf_msg(this->name, GF_LOG_ERROR, EINVAL, GD_MSG_VOL_NOT_FOUND,
+               "Failed to get volinfo of volume %s", name);
+        goto out;
+    }
+
+    /* The helper's status is the final result of this handler. */
+    ret = glusterd_snapshot_get_vol_snapnames(dict, vol);
+    if (ret != 0)
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_SNAP_LIST_GET_FAIL,
+               "Failed to get snapshot list for volume %s", name);
+
+out:
+    return ret;
+}
+
+/* Handler for the "all" flavour of snapshot delete.
+ *
+ * Walks priv->snapshots and stores every snapshot name in dict under
+ * "snapname1" .. "snapnameN", then stores N under "snapcount".
+ *
+ * @param dict  dictionary that receives the names and the count
+ *
+ * @return 0 on success, non-zero on failure
+ */
+static int32_t
+glusterd_handle_snapshot_delete_all(dict_t *dict)
+{
+    xlator_t *this = THIS;
+    glusterd_conf_t *conf = NULL;
+    glusterd_snap_t *cur = NULL;
+    glusterd_snap_t *next = NULL;
+    char snapkey[32] = "";
+    int32_t count = 0;
+    int32_t ret = -1;
+
+    GF_ASSERT(this);
+    conf = this->private;
+    GF_ASSERT(conf);
+
+    GF_ASSERT(dict);
+
+    cds_list_for_each_entry_safe(cur, next, &conf->snapshots, snap_list)
+    {
+        /* Keys are numbered from 1 to n, to stay uniform with the other
+         * code paths. */
+        count += 1;
+        ret = snprintf(snapkey, sizeof(snapkey), "snapname%d", count);
+        if (ret < 0)
+            goto out;
+
+        ret = dict_set_dynstr_with_alloc(dict, snapkey, cur->snapname);
+        if (ret != 0) {
+            gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_DICT_SET_FAILED,
+                   "Could not save snap name");
+            goto out;
+        }
+    }
+
+    /* The status of this last store is the function's result. */
+    ret = dict_set_int32n(dict, "snapcount", SLEN("snapcount"), count);
+    if (ret != 0)
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_DICT_SET_FAILED,
+               "Could not save snapcount");
+
+out:
+    return ret;
+}
+
+/* Handler for deleting one named snapshot.
+ *
+ * Looks up the snapshot named by "snapname" in dict, stores the parent
+ * volume name of each of its volumes in dict as "volname1" .. "volnameN"
+ * together with "volcount" (needed to take the mgmt_v3 locks), and then
+ * starts the mgmt_v3 snap phases.
+ *
+ * @param req       RPC request object
+ * @param op        gluster operation
+ * @param dict      request dictionary; extended as described above
+ * @param err_str   buffer that receives the "does not exist" message
+ * @param op_errno  set to EG_NOSNAP when the snapshot is unknown
+ * @param len       size of the err_str buffer
+ *
+ * @return 0 on success, non-zero on failure
+ */
+int32_t
+glusterd_handle_snapshot_delete_type_snap(rpcsvc_request_t *req,
+                                          glusterd_op_t op, dict_t *dict,
+                                          char *err_str, uint32_t *op_errno,
+                                          size_t len)
+{
+    int32_t ret = -1;
+    int64_t volcount = 0;
+    char *snapname = NULL;
+    char *volname = NULL;
+    char key[64] = "";
+    int keylen;
+    glusterd_snap_t *snap = NULL;
+    glusterd_volinfo_t *snap_vol = NULL;
+    glusterd_volinfo_t *tmp = NULL;
+    xlator_t *this = NULL;
+
+    this = THIS;
+    GF_ASSERT(this);
+
+    GF_ASSERT(req);
+    GF_ASSERT(dict);
+    GF_ASSERT(err_str);
+    /* op_errno is written below; reject a NULL pointer up front instead of
+     * dereferencing it. */
+    GF_VALIDATE_OR_GOTO(this->name, op_errno, out);
+
+    ret = dict_get_strn(dict, "snapname", SLEN("snapname"), &snapname);
+    if (ret) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_DICT_GET_FAILED,
+               "Failed to get snapname");
+        goto out;
+    }
+
+    snap = glusterd_find_snap_by_name(snapname);
+    if (!snap) {
+        snprintf(err_str, len, "Snapshot (%s) does not exist", snapname);
+        *op_errno = EG_NOSNAP;
+        gf_msg(this->name, GF_LOG_ERROR, EINVAL, GD_MSG_SNAP_NOT_FOUND, "%s",
+               err_str);
+        ret = -1;
+        goto out;
+    }
+
+    /* Set volnames in the dict to get mgmt_v3 lock */
+    cds_list_for_each_entry_safe(snap_vol, tmp, &snap->volumes, vol_list)
+    {
+        volcount++;
+        volname = gf_strdup(snap_vol->parent_volname);
+        if (!volname) {
+            ret = -1;
+            gf_msg(this->name, GF_LOG_ERROR, ENOMEM, GD_MSG_NO_MEMORY,
+                   "strdup failed");
+            goto out;
+        }
+
+        keylen = snprintf(key, sizeof(key), "volname%" PRId64, volcount);
+        ret = dict_set_dynstrn(dict, key, keylen, volname);
+        if (ret) {
+            gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_DICT_SET_FAILED,
+                   "Failed to set "
+                   "volume name in dictionary");
+            /* The copy was not stored in dict, so free it here. */
+            GF_FREE(volname);
+            goto out;
+        }
+        /* The copy was handed to dict; drop the local reference. */
+        volname = NULL;
+    }
+    ret = dict_set_int64(dict, "volcount", volcount);
+    if (ret) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_DICT_SET_FAILED,
+               "Failed to set volcount");
+        goto out;
+    }
+
+    ret = glusterd_mgmt_v3_initiate_snap_phases(req, op, dict);
+    if (ret) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_SNAP_INIT_FAIL,
+               "Failed to initiate snap "
+               "phases");
+        goto out;
+    }
+
+    ret = 0;
+
+out:
+    return ret;
+}
+
+/* This is a snapshot remove handler function. This function will be
+ * executed in the originator node. It dispatches on the "sub-cmd" value in
+ * dict: the SNAP and ITER types call the mgmt v3 framework to do the actual
+ * remove, while the ALL and VOL types only fill dict and answer the CLI
+ * directly from here.
+ *
+ * @param req           RPC request object
+ * @param op            gluster operation
+ * @param dict          dictionary containing snapshot remove request
+ * @param err_str       In case of an err this string should be populated
+ * @param op_errno      set to a specific error code on failure (EINVAL for
+ *                      an unknown delete type)
+ * @param len           length of err_str buffer
+ *
+ * @return              Negative value on Failure and 0 in success
+ */
+int
+glusterd_handle_snapshot_delete(rpcsvc_request_t *req, glusterd_op_t op,
+                                dict_t *dict, char *err_str, uint32_t *op_errno,
+                                size_t len)
+{
+    int ret = -1;
+    xlator_t *this = NULL;
+    int32_t delete_cmd = -1;
+
+    this = THIS;
+
+    GF_ASSERT(this);
+
+    GF_ASSERT(req);
+    GF_ASSERT(dict);
+    GF_ASSERT(err_str);
+    GF_VALIDATE_OR_GOTO(this->name, op_errno, out);
+
+    ret = dict_get_int32n(dict, "sub-cmd", SLEN("sub-cmd"), &delete_cmd);
+    if (ret) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_COMMAND_NOT_FOUND,
+               "Failed to get sub-cmd");
+        goto out;
+    }
+
+    switch (delete_cmd) {
+        case GF_SNAP_DELETE_TYPE_SNAP:
+        case GF_SNAP_DELETE_TYPE_ITER:
+            ret = glusterd_handle_snapshot_delete_type_snap(
+                req, op, dict, err_str, op_errno, len);
+            if (ret) {
+                gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_SNAP_REMOVE_FAIL,
+                       "Failed to handle "
+                       "snapshot delete for type SNAP");
+                goto out;
+            }
+            break;
+
+        case GF_SNAP_DELETE_TYPE_ALL:
+            ret = glusterd_handle_snapshot_delete_all(dict);
+            if (ret) {
+                gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_SNAP_REMOVE_FAIL,
+                       "Failed to handle "
+                       "snapshot delete for type ALL");
+                goto out;
+            }
+            break;
+
+        case GF_SNAP_DELETE_TYPE_VOL:
+            ret = glusterd_handle_snapshot_delete_vol(dict, err_str, op_errno,
+                                                      len);
+            if (ret) {
+                gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_SNAP_REMOVE_FAIL,
+                       "Failed to handle "
+                       "snapshot delete for type VOL");
+                goto out;
+            }
+            break;
+
+        default:
+            /* An unknown delete type must fail the request. ret is still 0
+             * from the successful "sub-cmd" lookup, so it has to be set
+             * explicitly or the caller would see success. */
+            *op_errno = EINVAL;
+            gf_msg(this->name, GF_LOG_ERROR, EINVAL, GD_MSG_INVALID_ENTRY,
+                   "Wrong snapshot delete type");
+            ret = -1;
+            goto out;
+    }
+
+    /* ALL and VOL only collected data into dict, so the CLI response is
+     * sent from here. */
+    if (ret == 0 && (delete_cmd == GF_SNAP_DELETE_TYPE_ALL ||
+                     delete_cmd == GF_SNAP_DELETE_TYPE_VOL)) {
+        ret = glusterd_op_send_cli_response(op, 0, 0, req, dict, err_str);
+        if (ret) {
+            gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_NO_CLI_RESP,
+                   "Failed to send cli "
+                   "response");
+            goto out;
+        }
+    }
+    ret = 0;
+out:
+    return ret;
+}
+
+/* Prevalidation for snapshot remove.
+ *
+ * Checks that the snapshot named by "snapname" in dict exists and stores
+ * its id, formatted by uuid_utoa(), under "snapuuid". Note that the value
+ * is stored in dict; rsp_dict is not touched.
+ *
+ * @param dict       request dictionary
+ * @param op_errstr  only checked for NULL here
+ * @param op_errno   set to EG_NOSNAP when the snapshot is unknown
+ * @param rsp_dict   not used by this function
+ *
+ * @return 0 on success, non-zero on failure
+ */
+int
+glusterd_snapshot_remove_prevalidate(dict_t *dict, char **op_errstr,
+                                     uint32_t *op_errno, dict_t *rsp_dict)
+{
+    xlator_t *this = THIS;
+    glusterd_snap_t *found = NULL;
+    char *name = NULL;
+    int32_t ret = -1;
+
+    GF_VALIDATE_OR_GOTO("glusterd", this, out);
+    GF_VALIDATE_OR_GOTO(this->name, op_errno, out);
+
+    if (dict == NULL || op_errstr == NULL) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_INVALID_ENTRY,
+               "input parameters NULL");
+        goto out;
+    }
+
+    ret = dict_get_strn(dict, "snapname", SLEN("snapname"), &name);
+    if (ret != 0) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_DICT_GET_FAILED,
+               "Getting the snap name failed");
+        goto out;
+    }
+
+    found = glusterd_find_snap_by_name(name);
+    if (found == NULL) {
+        ret = -1;
+        *op_errno = EG_NOSNAP;
+        gf_msg(this->name, GF_LOG_ERROR, EINVAL, GD_MSG_SNAP_NOT_FOUND,
+               "Snapshot (%s) does not exist", name);
+        goto out;
+    }
+
+    /* The status of this store is the function's result. */
+    ret = dict_set_dynstr_with_alloc(dict, "snapuuid",
+                                     uuid_utoa(found->snap_id));
+    if (ret != 0)
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_DICT_SET_FAILED,
+               "Failed to set snap uuid in response dictionary for %s "
+               "snapshot",
+               found->snapname);
+
+out:
+    return ret;
+}
+
+/* Prevalidation for snapshot status.
+ *
+ * Dispatches on the "sub-cmd" value in dict:
+ *   - ALL needs no further checks;
+ *   - ITER and SNAP require that the snapshot named by "snapname" exists;
+ *   - VOL requires that the volume named by "volname" exists;
+ *   - anything else is rejected as an invalid command.
+ *
+ * @param dict       request dictionary
+ * @param op_errstr  receives an allocated "does not exist" message
+ * @param op_errno   set to EG_NOSNAP, EG_NOVOL or EINVAL on failure
+ * @param rsp_dict   not used by this function
+ *
+ * @return 0 on success, non-zero on failure
+ */
+int
+glusterd_snapshot_status_prevalidate(dict_t *dict, char **op_errstr,
+                                     uint32_t *op_errno, dict_t *rsp_dict)
+{
+    int ret = -1;
+    char *snapname = NULL;
+    glusterd_conf_t *conf = NULL;
+    xlator_t *this = NULL;
+    int32_t cmd = -1;
+    glusterd_volinfo_t *volinfo = NULL;
+    char *volname = NULL;
+
+    this = THIS;
+    GF_ASSERT(this);
+    conf = this->private;
+    GF_ASSERT(conf);
+    GF_ASSERT(op_errstr);
+    GF_VALIDATE_OR_GOTO(this->name, op_errno, out);
+
+    if (!dict) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_INVALID_ENTRY,
+               "Input dict is NULL");
+        goto out;
+    }
+
+    ret = dict_get_int32n(dict, "sub-cmd", SLEN("sub-cmd"), &cmd);
+    if (ret) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_DICT_GET_FAILED,
+               "Could not fetch status cmd");
+        goto out;
+    }
+
+    switch (cmd) {
+        case GF_SNAP_STATUS_TYPE_ALL: {
+            break;
+        }
+        case GF_SNAP_STATUS_TYPE_ITER:
+        case GF_SNAP_STATUS_TYPE_SNAP: {
+            ret = dict_get_strn(dict, "snapname", SLEN("snapname"), &snapname);
+            if (ret) {
+                gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_DICT_GET_FAILED,
+                       "Could not fetch snapname");
+                goto out;
+            }
+
+            if (!glusterd_find_snap_by_name(snapname)) {
+                ret = gf_asprintf(op_errstr,
+                                  "Snapshot (%s) "
+                                  "does not exist",
+                                  snapname);
+                *op_errno = EG_NOSNAP;
+                if (ret < 0) {
+                    goto out;
+                }
+                /* gf_asprintf() succeeded with a non-negative result;
+                 * the lookup failure is what has to be reported. */
+                ret = -1;
+                gf_msg(this->name, GF_LOG_ERROR, EINVAL, GD_MSG_SNAP_NOT_FOUND,
+                       "Snapshot (%s) does not exist", snapname);
+                goto out;
+            }
+            break;
+        }
+        case GF_SNAP_STATUS_TYPE_VOL: {
+            ret = dict_get_strn(dict, "volname", SLEN("volname"), &volname);
+            if (ret) {
+                gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_DICT_GET_FAILED,
+                       "Could not fetch volname");
+                goto out;
+            }
+
+            ret = glusterd_volinfo_find(volname, &volinfo);
+            if (ret) {
+                ret = gf_asprintf(op_errstr,
+                                  "Volume (%s) "
+                                  "does not exist",
+                                  volname);
+                *op_errno = EG_NOVOL;
+                if (ret < 0) {
+                    goto out;
+                }
+                /* Same as above: report the lookup failure, not the
+                 * result of gf_asprintf(). */
+                ret = -1;
+                gf_msg(this->name, GF_LOG_ERROR, EINVAL, GD_MSG_VOL_NOT_FOUND,
+                       "Volume "
+                       "%s not present",
+                       volname);
+                goto out;
+            }
+            break;
+        }
+        default: {
+            /* An unknown status command must fail prevalidation instead of
+             * falling through to the success path below. */
+            gf_msg(this->name, GF_LOG_ERROR, EINVAL, GD_MSG_COMMAND_NOT_FOUND,
+                   "Invalid command");
+            *op_errno = EINVAL;
+            ret = -1;
+            goto out;
+        }
+    }
+    ret = 0;
+
+out:
+    return ret;
+}
+
+/* Commit phase of snapshot activate.
+ *
+ * Looks up the snapshot named by "snapname" in dict, calls
+ * glusterd_snap_brick_create() for each brick of the snap volume whose
+ * uuid matches MY_UUID, starts the snap volume with the "flags" read from
+ * dict, and stores the snapshot id under "snapuuid" in rsp_dict.
+ *
+ * @param dict       request dictionary ("snapname", "flags")
+ * @param op_errstr  only checked for NULL here
+ * @param rsp_dict   response dictionary that receives "snapuuid"
+ *
+ * @return 0 on success, non-zero on failure
+ */
+int32_t
+glusterd_snapshot_activate_commit(dict_t *dict, char **op_errstr,
+                                  dict_t *rsp_dict)
+{
+    int32_t ret = -1;
+    char *snapname = NULL;
+    glusterd_snap_t *snap = NULL;
+    glusterd_volinfo_t *snap_volinfo = NULL;
+    glusterd_brickinfo_t *brickinfo = NULL;
+    xlator_t *this = NULL;
+    int flags = 0;
+    int brick_count = -1;
+
+    this = THIS;
+    GF_ASSERT(this);
+    GF_ASSERT(dict);
+    GF_ASSERT(rsp_dict);
+    GF_ASSERT(op_errstr);
+
+    if (!dict || !op_errstr) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_INVALID_ENTRY,
+               "input parameters NULL");
+        goto out;
+    }
+
+    ret = dict_get_strn(dict, "snapname", SLEN("snapname"), &snapname);
+    if (ret) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_DICT_GET_FAILED,
+               "Getting the snap name "
+               "failed");
+        goto out;
+    }
+
+    ret = dict_get_int32n(dict, "flags", SLEN("flags"), &flags);
+    if (ret) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_DICT_GET_FAILED,
+               "Unable to get flags");
+        goto out;
+    }
+
+    snap = glusterd_find_snap_by_name(snapname);
+    if (!snap) {
+        gf_msg(this->name, GF_LOG_ERROR, EINVAL, GD_MSG_SNAP_NOT_FOUND,
+               "Snapshot (%s) does not exist", snapname);
+        ret = -1;
+        goto out;
+    }
+
+    /* TODO : As of now there is only one volume in a snapshot.
+     * Change this when multiple volume snapshot is introduced.
+     *
+     * cds_list_entry() is pointer arithmetic on the list node and never
+     * yields NULL, so a snapshot without volumes has to be detected on the
+     * list head before the first entry is taken.
+     */
+    if (cds_list_empty(&snap->volumes)) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_VOLINFO_GET_FAIL,
+               "Unable to fetch snap_volinfo");
+        ret = -1;
+        goto out;
+    }
+    snap_volinfo = cds_list_entry(snap->volumes.next, glusterd_volinfo_t,
+                                  vol_list);
+
+    /* create the complete brick here */
+    cds_list_for_each_entry(brickinfo, &snap_volinfo->bricks, brick_list)
+    {
+        /* brick_count starts at -1, so it is the zero-based position of
+         * the brick in the list, counting bricks of other nodes too. */
+        brick_count++;
+        /* Skip bricks whose uuid differs from MY_UUID. */
+        if (gf_uuid_compare(brickinfo->uuid, MY_UUID))
+            continue;
+        ret = glusterd_snap_brick_create(snap_volinfo, brickinfo, brick_count,
+                                         _gf_false);
+        if (ret) {
+            gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_BRICK_CREATION_FAIL,
+                   "not able to "
+                   "create the brick for the snap %s, volume %s",
+                   snap_volinfo->snapshot->snapname, snap_volinfo->volname);
+            goto out;
+        }
+    }
+
+    ret = glusterd_start_volume(snap_volinfo, flags, _gf_true);
+
+    if (ret) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_SNAP_ACTIVATE_FAIL,
+               "Failed to activate snap volume %s of the snap %s",
+               snap_volinfo->volname, snap->snapname);
+        goto out;
+    }
+
+    ret = dict_set_dynstr_with_alloc(rsp_dict, "snapuuid",
+                                     uuid_utoa(snap->snap_id));
+    if (ret) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_DICT_SET_FAILED,
+               "Failed to set snap "
+               "uuid in response dictionary for %s snapshot",
+               snap->snapname);
+        goto out;
+    }
+
+    ret = 0;
+out:
+    return ret;
+}
+
+/* Commit phase of snapshot deactivate.
+ *
+ * Looks up the snapshot named by "snapname" in dict, stops its snap
+ * volume, unmounts it, removes the <snap_mount_dir>/<snapname> directory
+ * and stores the snapshot id under "snapuuid" in rsp_dict.
+ *
+ * @param dict       request dictionary ("snapname")
+ * @param op_errstr  only checked for NULL here
+ * @param rsp_dict   response dictionary that receives "snapuuid"
+ *
+ * @return 0 on success, non-zero on failure
+ */
+int32_t
+glusterd_snapshot_deactivate_commit(dict_t *dict, char **op_errstr,
+                                    dict_t *rsp_dict)
+{
+    int32_t ret = -1;
+    char *snapname = NULL;
+    glusterd_snap_t *snap = NULL;
+    glusterd_volinfo_t *snap_volinfo = NULL;
+    xlator_t *this = NULL;
+    char snap_path[PATH_MAX] = "";
+
+    this = THIS;
+    GF_ASSERT(this);
+    GF_ASSERT(dict);
+    GF_ASSERT(rsp_dict);
+    GF_ASSERT(op_errstr);
+
+    if (!dict || !op_errstr) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_INVALID_ENTRY,
+               "input parameters NULL");
+        goto out;
+    }
+
+    ret = dict_get_strn(dict, "snapname", SLEN("snapname"), &snapname);
+    if (ret) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_DICT_GET_FAILED,
+               "Getting the snap name "
+               "failed");
+        goto out;
+    }
+
+    snap = glusterd_find_snap_by_name(snapname);
+    if (!snap) {
+        gf_msg(this->name, GF_LOG_ERROR, EINVAL, GD_MSG_SNAP_NOT_FOUND,
+               "Snapshot (%s) does not exist", snapname);
+        ret = -1;
+        goto out;
+    }
+
+    /* TODO : As of now there is only one volume in a snapshot.
+     * Change this when multiple volume snapshot is introduced.
+     *
+     * cds_list_entry() is pointer arithmetic on the list node and never
+     * yields NULL, so a snapshot without volumes has to be detected on the
+     * list head before the first entry is taken.
+     */
+    if (cds_list_empty(&snap->volumes)) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_VOLINFO_GET_FAIL,
+               "Unable to fetch snap_volinfo");
+        ret = -1;
+        goto out;
+    }
+    snap_volinfo = cds_list_entry(snap->volumes.next, glusterd_volinfo_t,
+                                  vol_list);
+
+    ret = glusterd_stop_volume(snap_volinfo);
+    if (ret) {
+        /* The two literals used to be joined without a separating space,
+         * which logged "Failed to deactivatesnap". */
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_SNAP_DEACTIVATE_FAIL,
+               "Failed to deactivate snap %s", snapname);
+        goto out;
+    }
+
+    /* An unmount failure is only logged; the cleanup below still runs and
+     * ret is overwritten by it. */
+    ret = glusterd_snap_unmount(this, snap_volinfo);
+    if (ret) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_GLUSTERD_UMOUNT_FAIL,
+               "Failed to unmounts for %s", snap->snapname);
+    }
+
+    /*Remove /var/run/gluster/snaps/<snap-name> entry for deactivated snaps.
+     * This entry will be created again during snap activate.
+     */
+    snprintf(snap_path, sizeof(snap_path), "%s/%s", snap_mount_dir, snapname);
+    ret = recursive_rmdir(snap_path);
+    if (ret) {
+        gf_msg(this->name, GF_LOG_ERROR, errno, GD_MSG_DIR_OP_FAILED,
+               "Failed to remove "
+               "%s directory : error : %s",
+               snap_path, strerror(errno));
+        goto out;
+    }
+
+    ret = dict_set_dynstr_with_alloc(rsp_dict, "snapuuid",
+                                     uuid_utoa(snap->snap_id));
+    if (ret) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_DICT_SET_FAILED,
+               "Failed to set snap "
+               "uuid in response dictionary for %s snapshot",
+               snap->snapname);
+        goto out;
+    }
+
+    ret = 0;
+out:
+    return ret;
+}
+
+/* Commit phase of snapshot remove.
+ *
+ * Looks up the snapshot named by "snapname" in dict, records its id in
+ * rsp_dict as "snapuuid", persists it with status
+ * GD_SNAP_STATUS_DECOMMISSION, on the originator glusterd looks for missed
+ * snap deletes, removes the snapshot and finally stores a copy of the
+ * snapshot name in rsp_dict as "snapname".
+ *
+ * @param dict       request dictionary ("snapname")
+ * @param op_errstr  only checked for NULL here
+ * @param rsp_dict   response dictionary ("snapuuid", "snapname")
+ *
+ * @return 0 on success, non-zero on failure
+ */
+int32_t
+glusterd_snapshot_remove_commit(dict_t *dict, char **op_errstr,
+                                dict_t *rsp_dict)
+{
+    int32_t ret = -1;
+    char *snapname = NULL;
+    char *dup_snapname = NULL;
+    glusterd_snap_t *snap = NULL;
+    glusterd_conf_t *priv = NULL;
+    glusterd_volinfo_t *snap_volinfo = NULL;
+    xlator_t *this = NULL;
+
+    this = THIS;
+    GF_ASSERT(this);
+    GF_ASSERT(dict);
+    GF_ASSERT(rsp_dict);
+    GF_ASSERT(op_errstr);
+
+    priv = this->private;
+    GF_ASSERT(priv);
+
+    if (!dict || !op_errstr) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_INVALID_ENTRY,
+               "input parameters NULL");
+        goto out;
+    }
+
+    ret = dict_get_strn(dict, "snapname", SLEN("snapname"), &snapname);
+    if (ret) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_DICT_GET_FAILED,
+               "Getting the snap name "
+               "failed");
+        goto out;
+    }
+
+    snap = glusterd_find_snap_by_name(snapname);
+    if (!snap) {
+        gf_msg(this->name, GF_LOG_ERROR, EINVAL, GD_MSG_SNAP_NOT_FOUND,
+               "Snapshot (%s) does not exist", snapname);
+        ret = -1;
+        goto out;
+    }
+
+    ret = dict_set_dynstr_with_alloc(rsp_dict, "snapuuid",
+                                     uuid_utoa(snap->snap_id));
+    if (ret) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_DICT_SET_FAILED,
+               "Failed to set snap uuid in "
+               "response dictionary for %s snapshot",
+               snap->snapname);
+        goto out;
+    }
+
+    /* Save the snap status as GD_SNAP_STATUS_DECOMMISSION so
+     * that if the node goes down the snap would be removed
+     */
+    snap->snap_status = GD_SNAP_STATUS_DECOMMISSION;
+    ret = glusterd_store_snap(snap);
+    if (ret) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_SNAP_OBJECT_STORE_FAIL,
+               "Failed to "
+               "store snap object %s",
+               snap->snapname);
+        goto out;
+    }
+    gf_msg(this->name, GF_LOG_INFO, 0, GD_MSG_OP_SUCCESS,
+           "Successfully marked "
+           "snap %s for decommission.",
+           snap->snapname);
+
+    if (is_origin_glusterd(dict) == _gf_true) {
+        /* TODO : As of now there is only one volume in a snapshot.
+         * Change this when multiple volume snapshot is introduced.
+         *
+         * cds_list_entry() is pointer arithmetic on the list node and
+         * never yields NULL, so a snapshot without volumes has to be
+         * detected on the list head before the first entry is taken.
+         */
+        if (cds_list_empty(&snap->volumes)) {
+            gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_VOLINFO_GET_FAIL,
+                   "Unable to fetch snap_volinfo");
+            ret = -1;
+            goto out;
+        }
+        snap_volinfo = cds_list_entry(snap->volumes.next, glusterd_volinfo_t,
+                                      vol_list);
+
+        /* From origin glusterd check if      *
+         * any peers with snap bricks is down */
+        ret = glusterd_find_missed_snap(rsp_dict, snap_volinfo, &priv->peers,
+                                        GF_SNAP_OPTION_TYPE_DELETE);
+        if (ret) {
+            gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_MISSED_SNAP_GET_FAIL,
+                   "Failed to find missed snap deletes");
+            goto out;
+        }
+    }
+
+    ret = glusterd_snap_remove(rsp_dict, snap, _gf_true, _gf_false, _gf_false);
+    if (ret) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_SNAP_REMOVE_FAIL,
+               "Failed to remove snap %s", snapname);
+        goto out;
+    }
+
+    /* rsp_dict gets its own copy of the name, taken from the request
+     * dictionary's string. */
+    dup_snapname = gf_strdup(snapname);
+    if (!dup_snapname) {
+        gf_msg(this->name, GF_LOG_ERROR, ENOMEM, GD_MSG_NO_MEMORY,
+               "Strdup failed");
+        ret = -1;
+        goto out;
+    }
+
+    ret = dict_set_dynstr(rsp_dict, "snapname", dup_snapname);
+    if (ret) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_DICT_SET_FAILED,
+               "Failed to set the snapname");
+        /* The copy was not stored in rsp_dict, so free it here. */
+        GF_FREE(dup_snapname);
+        goto out;
+    }
+
+    ret = 0;
+out:
+    return ret;
+}
+
+/* Best-effort cleanup of a snapshot.
+ *
+ * Reads "volname1" and "snapname" from dict and, when a snapshot of that
+ * name exists, removes it with glusterd_snap_remove(). A missing snapshot
+ * and a failed removal are both treated as success.
+ *
+ * @param dict       request dictionary ("volname1", "snapname")
+ * @param op_errstr  only checked for NULL here
+ * @param rsp_dict   passed through to glusterd_snap_remove()
+ *
+ * @return 0 on success, non-zero when the inputs could not be read
+ */
+int32_t
+glusterd_do_snap_cleanup(dict_t *dict, char **op_errstr, dict_t *rsp_dict)
+{
+    xlator_t *this = THIS;
+    glusterd_conf_t *priv = NULL;
+    glusterd_snap_t *victim = NULL;
+    char *snap_name = NULL;
+    char *vol_name = NULL;
+    int32_t ret = -1;
+
+    GF_ASSERT(this);
+    priv = this->private;
+    GF_ASSERT(priv);
+
+    if (dict == NULL || op_errstr == NULL) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_INVALID_ENTRY,
+               "input parameters NULL");
+        goto out;
+    }
+
+    /* As of now snapshot of multiple volumes are not supported */
+    ret = dict_get_strn(dict, "volname1", SLEN("volname1"), &vol_name);
+    if (ret != 0) {
+        gf_msg("glusterd", GF_LOG_ERROR, 0, GD_MSG_DICT_GET_FAILED,
+               "Unable to get volume name");
+        goto out;
+    }
+
+    ret = dict_get_strn(dict, "snapname", SLEN("snapname"), &snap_name);
+    if (ret != 0) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_DICT_GET_FAILED,
+               "getting the snap name failed (volume: %s)", vol_name);
+        goto out;
+    }
+
+    /* From here on the function always reports success. */
+    ret = 0;
+
+    /*
+      If the snapname is not found that means the failure happened at
+      staging, or in commit, before the snap object is created, in which
+      case there is nothing to cleanup.
+    */
+    victim = glusterd_find_snap_by_name(snap_name);
+    if (victim == NULL) {
+        gf_msg(this->name, GF_LOG_INFO, EINVAL, GD_MSG_SNAP_NOT_FOUND,
+               "Snapshot (%s) does not exist", snap_name);
+        goto out;
+    }
+
+    if (glusterd_snap_remove(rsp_dict, victim, _gf_true, _gf_true,
+                             _gf_false)) {
+        /* Ignore failure as this is a cleanup of half cooked
+           snapshot */
+        gf_msg_debug(this->name, 0, "removing the snap %s failed", snap_name);
+    }
+
+out:
+
+    return ret;
+}
+
+/* Post-validate step run after a successful delete or create operation:
+ * when dict carries a "missed_snap_count", add the missed snap operations
+ * to the missed snaps list and update the stored copy of that list. A dict
+ * without that key is not an error. */
+int32_t
+glusterd_snapshot_update_snaps_post_validate(dict_t *dict, char **op_errstr,
+                                             dict_t *rsp_dict)
+{
+    xlator_t *this = THIS;
+    int32_t nr_missed = -1;
+    int32_t ret = -1;
+
+    GF_ASSERT(this);
+    GF_ASSERT(dict);
+    GF_ASSERT(rsp_dict);
+    GF_ASSERT(op_errstr);
+
+    if (dict_get_int32n(dict, "missed_snap_count", SLEN("missed_snap_count"),
+                        &nr_missed) != 0) {
+        gf_msg_debug(this->name, 0, "No missed snaps");
+        ret = 0;
+        goto out;
+    }
+
+    ret = glusterd_add_missed_snaps_to_list(dict, nr_missed);
+    if (ret != 0) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_MISSEDSNAP_INFO_SET_FAIL,
+               "Failed to add missed snaps to list");
+        goto out;
+    }
+
+    /* The status of the store update is the function's result. */
+    ret = glusterd_store_update_missed_snaps();
+    if (ret != 0)
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_MISSEDSNAP_INFO_SET_FAIL,
+               "Failed to update missed_snaps_list");
+
+out:
+    gf_msg_trace(this->name, 0, "Returning %d", ret);
+    return ret;
+}
+
+/* Synctask body that takes the backend snapshot of one brick.
+ *
+ * opaque is a snap_create_args_t. The outcome is recorded in its rsp_dict
+ * under "snap-vol<volcount>.brick<brickorder>.status", or under
+ * "clone<volcount>.brick<brickorder>.status" when the request dict holds a
+ * "clonename"; the stored value is 1 on success and 0 on failure.
+ *
+ * @return the result of glusterd_take_brick_snapshot(), or -1 when the
+ *         status could not be stored in rsp_dict
+ */
+int
+glusterd_take_brick_snapshot_task(void *opaque)
+{
+    snap_create_args_t *task = NULL;
+    char *clonename = NULL;
+    char status_key[64] = "";
+    int status_keylen;
+    int32_t is_clone = 0;
+    int ret = 0;
+
+    GF_ASSERT(opaque);
+
+    task = (snap_create_args_t *)opaque;
+    THIS = task->this;
+
+    /* Try and fetch clonename. If present set status with clonename *
+     * else do so as snap-vol */
+    ret = dict_get_strn(task->dict, "clonename", SLEN("clonename"),
+                        &clonename);
+    if (ret == 0) {
+        status_keylen = snprintf(status_key, sizeof(status_key),
+                                 "clone%d.brick%d.status", task->volcount,
+                                 task->brickorder);
+        is_clone = 1;
+    } else {
+        status_keylen = snprintf(status_key, sizeof(status_key),
+                                 "snap-vol%d.brick%d.status", task->volcount,
+                                 task->brickorder);
+    }
+
+    ret = glusterd_take_brick_snapshot(task->dict, task->snap_vol,
+                                       task->brickinfo, task->volcount,
+                                       task->brickorder, is_clone);
+    if (ret != 0) {
+        gf_msg(THIS->name, GF_LOG_ERROR, 0, GD_MSG_SNAP_CREATION_FAIL,
+               "Failed to take backend snapshot for brick %s:%s volume(%s)",
+               task->brickinfo->hostname, task->brickinfo->path,
+               task->snap_vol->volname);
+    }
+
+    /* Record 1 for success and 0 for failure; a failure to record the
+     * status overrides the snapshot result. */
+    if (dict_set_int32n(task->rsp_dict, status_key, status_keylen, !ret)) {
+        gf_msg(THIS->name, GF_LOG_ERROR, 0, GD_MSG_DICT_SET_FAILED,
+               "failed to add %s to dict", status_key);
+        ret = -1;
+    }
+
+    return ret;
+}
+
+/* Completion callback paired with glusterd_take_brick_snapshot_task().
+ *
+ * Copies a non-zero task result into the shared syncargs, frees the
+ * snap_create_args_t passed as opaque and wakes the barrier.
+ *
+ * @param ret     result of the task
+ * @param frame   not used by this callback
+ * @param opaque  the snap_create_args_t given to the task
+ *
+ * @return always 0
+ */
+int32_t
+glusterd_take_brick_snapshot_cbk(int ret, call_frame_t *frame, void *opaque)
+{
+    struct syncargs *barrier = NULL;
+
+    GF_ASSERT(opaque);
+
+    /* Fetch the syncargs before opaque is released below. */
+    barrier = ((snap_create_args_t *)opaque)->args;
+
+    if (ret != 0)
+        barrier->op_ret = ret;
+
+    GF_FREE(opaque);
+    synctask_barrier_wake(barrier);
+    return 0;
+}
+
+/*
+ * Walk every brick of every volume in @snap, record per-brick bookkeeping
+ * keys in @rsp_dict and spawn one synctask per brick that has to be
+ * snapshotted, then wait on a barrier until all spawned tasks are done.
+ *
+ * @dict:     request dictionary, forwarded to each task.
+ * @rsp_dict: response dictionary; receives "snap-vol%d.brick%d.order",
+ *            "snap-vol%d.brick%d.status" and "snap-vol%d_brickcount" keys.
+ * @snap:     snap object whose volumes list is iterated.
+ *
+ * Returns 0 on success, otherwise the failing call's return value or the
+ * op_ret reported by a task through the shared syncargs.
+ */
+int32_t
+glusterd_schedule_brick_snapshot(dict_t *dict, dict_t *rsp_dict,
+                                 glusterd_snap_t *snap)
+{
+    int ret = -1;
+    int32_t volcount = 0;
+    int32_t brickcount = 0;
+    int32_t brickorder = 0;
+    int32_t taskcount = 0;
+    char key[64] = "";
+    int keylen;
+    xlator_t *this = NULL;
+    glusterd_volinfo_t *snap_vol = NULL;
+    glusterd_brickinfo_t *brickinfo = NULL;
+    struct syncargs args = {0};
+    snap_create_args_t *snap_args = NULL;
+
+    this = THIS;
+    GF_ASSERT(this);
+    GF_ASSERT(dict);
+    GF_ASSERT(snap);
+
+    ret = synctask_barrier_init((&args));
+    if (ret)
+        goto out;
+    cds_list_for_each_entry(snap_vol, &snap->volumes, vol_list)
+    {
+        /* Volume indices in the keys are 1-based; brick counters restart
+         * for every volume. */
+        volcount++;
+        brickcount = 0;
+        brickorder = 0;
+        cds_list_for_each_entry(brickinfo, &snap_vol->bricks, brick_list)
+        {
+            keylen = snprintf(key, sizeof(key), "snap-vol%d.brick%d.order",
+                              volcount, brickcount);
+            ret = dict_set_int32n(rsp_dict, key, keylen, brickorder);
+            if (ret) {
+                gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_DICT_SET_FAILED,
+                       "Failed to set %s", key);
+                goto out;
+            }
+
+            /* No task is spawned when the brick's uuid differs from MY_UUID
+             * or when its snap_status is -1. */
+            if ((gf_uuid_compare(brickinfo->uuid, MY_UUID)) ||
+                (brickinfo->snap_status == -1)) {
+                /* uuid matches MY_UUID, so we got here because of
+                 * snap_status == -1: still count the brick and record a
+                 * status of 0 for it. */
+                if (!gf_uuid_compare(brickinfo->uuid, MY_UUID)) {
+                    brickcount++;
+                    keylen = snprintf(key, sizeof(key),
+                                      "snap-vol%d.brick%d.status", volcount,
+                                      brickorder);
+                    ret = dict_set_int32n(rsp_dict, key, keylen, 0);
+                    if (ret) {
+                        gf_msg(this->name, GF_LOG_ERROR, 0,
+                               GD_MSG_DICT_SET_FAILED,
+                               "failed to add %s to "
+                               "dict",
+                               key);
+                        goto out;
+                    }
+                }
+                brickorder++;
+                continue;
+            }
+
+            /* Per-task context; released by the task's completion callback
+             * (or right below if the task cannot be spawned). */
+            snap_args = GF_CALLOC(1, sizeof(*snap_args),
+                                  gf_gld_mt_snap_create_args_t);
+            if (!snap_args) {
+                ret = -1;
+                goto out;
+            }
+
+            snap_args->this = this;
+            snap_args->dict = dict;
+            snap_args->rsp_dict = rsp_dict;
+            snap_args->snap_vol = snap_vol;
+            snap_args->brickinfo = brickinfo;
+            snap_args->volcount = volcount;
+            snap_args->brickcount = brickcount;
+            snap_args->brickorder = brickorder;
+            snap_args->args = &args;
+
+            ret = synctask_new(
+                this->ctx->env, glusterd_take_brick_snapshot_task,
+                glusterd_take_brick_snapshot_cbk, NULL, snap_args);
+            if (ret) {
+                gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_SNAP_CREATION_FAIL,
+                       "Failed to "
+                       "spawn task for snapshot create");
+                GF_FREE(snap_args);
+                goto out;
+            }
+            /* taskcount tracks how many wake-ups the barrier must see. */
+            taskcount++;
+            brickcount++;
+            brickorder++;
+        }
+
+        snprintf(key, sizeof(key), "snap-vol%d_brickcount", volcount);
+        ret = dict_set_int64(rsp_dict, key, brickcount);
+        if (ret) {
+            gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_DICT_SET_FAILED,
+                   "failed to "
+                   "add %s to dict",
+                   key);
+            goto out;
+        }
+    }
+    synctask_barrier_wait((&args), taskcount);
+    /* Already waited: make sure the error path below does not wait again. */
+    taskcount = 0;
+
+    if (args.op_ret)
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_SNAP_CREATION_FAIL,
+               "Failed to create snapshot");
+
+    ret = args.op_ret;
+out:
+    /* On an early failure, still wait for the tasks spawned so far: they
+     * hold pointers to the stack-allocated 'args'. */
+    if (ret && taskcount)
+        synctask_barrier_wait((&args), taskcount);
+
+    return ret;
+}
+
+/*
+ * Build a fresh snap object for a clone operation.
+ *
+ * Reads "clonename" and "clone-id" from @dict, allocates a new snap object
+ * and copies both values into it.
+ *
+ * Returns the new snap object, or NULL if a key is missing or the object
+ * could not be created.
+ */
+glusterd_snap_t *
+glusterd_create_snap_object_for_clone(dict_t *dict, dict_t *rsp_dict)
+{
+    int ret = -1;
+    xlator_t *this = THIS;
+    char *clonename = NULL;
+    uuid_t *clone_id = NULL;
+    glusterd_snap_t *new_snap = NULL;
+
+    GF_ASSERT(dict);
+    GF_ASSERT(rsp_dict);
+
+    /* Fetch the clone name and id from dict */
+    ret = dict_get_strn(dict, "clonename", SLEN("clonename"), &clonename);
+    if (ret != 0) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_DICT_GET_FAILED,
+               "Unable to fetch clonename");
+        return NULL;
+    }
+
+    ret = dict_get_bin(dict, "clone-id", (void **)&clone_id);
+    if (ret != 0) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_DICT_GET_FAILED,
+               "Unable to fetch clone_id");
+        return NULL;
+    }
+
+    new_snap = glusterd_new_snap_object();
+    if (new_snap == NULL) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_SNAP_OBJ_NEW_FAIL,
+               "Could not create "
+               "the snap object for snap %s",
+               clonename);
+        return NULL;
+    }
+
+    gf_strncpy(new_snap->snapname, clonename, sizeof(new_snap->snapname));
+    gf_uuid_copy(new_snap->snap_id, *clone_id);
+
+    return new_snap;
+}
+
+/*
+ * Commit phase of "snapshot clone".
+ *
+ * Looks up the parent snapshot named by the "snapname" key of @dict, creates
+ * a snap object for the clone (named by "clonename"), creates the clone
+ * volume from the parent's first volume, schedules the backend brick
+ * snapshots and finally moves the new volume onto priv->volumes.
+ *
+ * @dict:      request dictionary ("clonename", "snapname", "clone-id").
+ * @op_errstr: error string out-parameter (only asserted here).
+ * @rsp_dict:  response dictionary; receives "clonename", "volcount" and
+ *             "snapuuid".
+ *
+ * Returns 0 on success and a non-zero value on failure. On failure the
+ * partially created snap object is removed again.
+ */
+int32_t
+glusterd_snapshot_clone_commit(dict_t *dict, char **op_errstr, dict_t *rsp_dict)
+{
+    int ret = -1;
+    int64_t volcount = 0;
+    char *snapname = NULL;
+    char *volname = NULL;
+    char *tmp_name = NULL;
+    xlator_t *this = NULL;
+    glusterd_snap_t *snap_parent = NULL;
+    glusterd_snap_t *snap = NULL;
+    glusterd_volinfo_t *origin_vol = NULL;
+    glusterd_volinfo_t *snap_vol = NULL;
+    glusterd_conf_t *priv = NULL;
+
+    this = THIS;
+    GF_ASSERT(this);
+    GF_ASSERT(dict);
+    GF_ASSERT(op_errstr);
+    GF_ASSERT(rsp_dict);
+    priv = this->private;
+    GF_ASSERT(priv);
+
+    ret = dict_get_strn(dict, "clonename", SLEN("clonename"), &snapname);
+    if (ret) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_DICT_GET_FAILED,
+               "Unable to fetch clonename");
+        goto out;
+    }
+    tmp_name = gf_strdup(snapname);
+    if (!tmp_name) {
+        gf_msg(this->name, GF_LOG_ERROR, ENOMEM, GD_MSG_NO_MEMORY,
+               "Out of memory");
+        ret = -1;
+        goto out;
+    }
+
+    ret = dict_set_dynstr(rsp_dict, "clonename", tmp_name);
+    if (ret) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_DICT_SET_FAILED,
+               "Unable to set clonename in rsp_dict");
+        GF_FREE(tmp_name);
+        goto out;
+    }
+    tmp_name = NULL;
+
+    /* For a clone, the "snapname" key names the parent snapshot. */
+    ret = dict_get_strn(dict, "snapname", SLEN("snapname"), &volname);
+    if (ret) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_DICT_GET_FAILED,
+               "failed to get snap name");
+        goto out;
+    }
+
+    snap_parent = glusterd_find_snap_by_name(volname);
+    if (!snap_parent) {
+        gf_msg(this->name, GF_LOG_ERROR, EINVAL, GD_MSG_SNAP_NOT_FOUND,
+               "Failed to "
+               "fetch snap %s",
+               volname);
+        /* ret is still 0 from the dict lookup above; without this the
+         * commit would report success although nothing was cloned. */
+        ret = -1;
+        goto out;
+    }
+
+    /* TODO : As of now there is only one volume in snapshot.
+     * Change this when multiple volume snapshot is introduced
+     */
+    origin_vol = cds_list_entry(snap_parent->volumes.next, glusterd_volinfo_t,
+                                vol_list);
+    if (!origin_vol) {
+        gf_msg("glusterd", GF_LOG_ERROR, 0, GD_MSG_VOLINFO_GET_FAIL,
+               "Failed to get snap "
+               "volinfo %s",
+               snap_parent->snapname);
+        /* Same as above: do not leak a stale 0 to the caller. */
+        ret = -1;
+        goto out;
+    }
+
+    snap = glusterd_create_snap_object_for_clone(dict, rsp_dict);
+    if (!snap) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_SNAP_OBJ_NEW_FAIL,
+               "creating the "
+               "snap object %s failed",
+               snapname);
+        ret = -1;
+        goto out;
+    }
+
+    snap_vol = glusterd_do_snap_vol(origin_vol, snap, dict, rsp_dict, 1, 1);
+    if (!snap_vol) {
+        ret = -1;
+        gf_msg(this->name, GF_LOG_WARNING, 0, GD_MSG_SNAP_CREATION_FAIL,
+               "taking the "
+               "snapshot of the volume %s failed",
+               volname);
+        goto out;
+    }
+
+    volcount = 1;
+    ret = dict_set_int64(rsp_dict, "volcount", volcount);
+    if (ret) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_DICT_SET_FAILED,
+               "Failed to set volcount");
+        goto out;
+    }
+
+    ret = glusterd_schedule_brick_snapshot(dict, rsp_dict, snap);
+    if (ret) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_SNAP_BACKEND_MAKE_FAIL,
+               "Failed to take backend "
+               "snapshot %s",
+               snap->snapname);
+        goto out;
+    }
+
+    /* Detach the clone volume from the list it is currently linked on; it
+     * is re-linked onto priv->volumes below. */
+    cds_list_del_init(&snap_vol->vol_list);
+    ret = dict_set_dynstr_with_alloc(rsp_dict, "snapuuid",
+                                     uuid_utoa(snap_vol->volume_id));
+    if (ret) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_DICT_SET_FAILED,
+               "Failed to set snap "
+               "uuid in response dictionary for %s snapshot",
+               snap->snapname);
+        goto out;
+    }
+
+    glusterd_list_add_order(&snap_vol->vol_list, &priv->volumes,
+                            glusterd_compare_volume_name);
+
+    ret = 0;
+
+out:
+    if (ret) {
+        if (snap)
+            glusterd_snap_remove(rsp_dict, snap, _gf_true, _gf_true, _gf_true);
+        snap = NULL;
+    }
+
+    gf_msg_trace(this->name, 0, "Returning %d", ret);
+    return ret;
+}
+
+/*
+ * Commit phase of "snapshot create".
+ *
+ * Creates the snap object, creates one snap volume per "volname<i>" entry of
+ * @dict (i = 1.."volcount"), schedules the backend brick snapshots and then
+ * either stores the snap volumes as stopped or starts them, depending on the
+ * GLUSTERD_STORE_KEY_SNAP_ACTIVATE option in priv->opts.
+ *
+ * @dict:      request dictionary ("volcount", "snapname", "volname<i>",
+ *             and "flags" when activate-on-create is enabled).
+ * @op_errstr: error string out-parameter (only asserted here).
+ * @op_errno:  must be non-NULL; otherwise the function bails out with -1.
+ * @rsp_dict:  response dictionary; receives "snapname", "volcount" and
+ *             "snapuuid".
+ *
+ * Returns 0 on success and a non-zero value on failure. On failure the
+ * partially created snap object is removed again.
+ */
+int32_t
+glusterd_snapshot_create_commit(dict_t *dict, char **op_errstr,
+                                uint32_t *op_errno, dict_t *rsp_dict)
+{
+    int ret = -1;
+    int64_t i = 0;
+    int64_t volcount = 0;
+    int32_t snap_activate = 0;
+    int32_t flags = 0;
+    char *snapname = NULL;
+    char *volname = NULL;
+    char *tmp_name = NULL;
+    char key[64] = "";
+    int keylen;
+    xlator_t *this = NULL;
+    glusterd_snap_t *snap = NULL;
+    glusterd_volinfo_t *origin_vol = NULL;
+    glusterd_volinfo_t *snap_vol = NULL;
+    glusterd_conf_t *priv = NULL;
+
+    this = THIS;
+    GF_ASSERT(this);
+    GF_ASSERT(dict);
+    GF_ASSERT(op_errstr);
+    GF_VALIDATE_OR_GOTO(this->name, op_errno, out);
+    GF_ASSERT(rsp_dict);
+    priv = this->private;
+    GF_ASSERT(priv);
+
+    ret = dict_get_int64(dict, "volcount", &volcount);
+    if (ret) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_DICT_GET_FAILED,
+               "failed to "
+               "get the volume count");
+        goto out;
+    }
+
+    ret = dict_get_strn(dict, "snapname", SLEN("snapname"), &snapname);
+    if (ret) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_DICT_GET_FAILED,
+               "Unable to fetch snapname");
+        goto out;
+    }
+    tmp_name = gf_strdup(snapname);
+    if (!tmp_name) {
+        gf_msg(this->name, GF_LOG_ERROR, ENOMEM, GD_MSG_NO_MEMORY,
+               "Out of memory");
+        ret = -1;
+        goto out;
+    }
+
+    ret = dict_set_dynstr(rsp_dict, "snapname", tmp_name);
+    if (ret) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_DICT_SET_FAILED,
+               "Unable to set snapname in rsp_dict");
+        GF_FREE(tmp_name);
+        goto out;
+    }
+    tmp_name = NULL;
+
+    snap = glusterd_create_snap_object(dict, rsp_dict);
+    if (!snap) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_SNAP_CREATION_FAIL,
+               "creating the "
+               "snap object %s failed",
+               snapname);
+        ret = -1;
+        goto out;
+    }
+
+    /* Volume keys in the request are 1-based: volname1 .. volname<volcount> */
+    for (i = 1; i <= volcount; i++) {
+        keylen = snprintf(key, sizeof(key), "volname%" PRId64, i);
+        ret = dict_get_strn(dict, key, keylen, &volname);
+        if (ret) {
+            gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_DICT_GET_FAILED,
+                   "failed to get volume name");
+            goto out;
+        }
+
+        ret = glusterd_volinfo_find(volname, &origin_vol);
+        if (ret) {
+            gf_msg(this->name, GF_LOG_ERROR, EINVAL, GD_MSG_VOL_NOT_FOUND,
+                   "failed to get the volinfo for "
+                   "the volume %s",
+                   volname);
+            goto out;
+        }
+
+        /* The soft-limit check is only done on the originator node. */
+        if (is_origin_glusterd(dict)) {
+            ret = glusterd_is_snap_soft_limit_reached(origin_vol, rsp_dict);
+            if (ret) {
+                gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_SNAPSHOT_OP_FAILED,
+                       "Failed to "
+                       "check soft limit exceeded or not, "
+                       "for volume %s ",
+                       origin_vol->volname);
+                goto out;
+            }
+        }
+
+        snap_vol = glusterd_do_snap_vol(origin_vol, snap, dict, rsp_dict, i, 0);
+        if (!snap_vol) {
+            ret = -1;
+            gf_msg(this->name, GF_LOG_WARNING, 0, GD_MSG_SNAP_CREATION_FAIL,
+                   "taking the "
+                   "snapshot of the volume %s failed",
+                   volname);
+            goto out;
+        }
+    }
+    ret = dict_set_int64(rsp_dict, "volcount", volcount);
+    if (ret) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_DICT_SET_FAILED,
+               "Failed to set volcount");
+        goto out;
+    }
+
+    ret = glusterd_schedule_brick_snapshot(dict, rsp_dict, snap);
+    if (ret) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_SNAP_CREATION_FAIL,
+               "Failed to take backend "
+               "snapshot %s",
+               snap->snapname);
+        goto out;
+    }
+
+    ret = dict_set_dynstr_with_alloc(rsp_dict, "snapuuid",
+                                     uuid_utoa(snap->snap_id));
+    if (ret) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_DICT_SET_FAILED,
+               "Failed to set snap "
+               "uuid in response dictionary for %s snapshot",
+               snap->snapname);
+        goto out;
+    }
+
+    snap_activate = dict_get_str_boolean(
+        priv->opts, GLUSTERD_STORE_KEY_SNAP_ACTIVATE, _gf_false);
+    if (!snap_activate) {
+        /* Activate-on-create is off: persist every snap volume as stopped
+         * and finish. ret is 0 here unless a store call fails. */
+        cds_list_for_each_entry(snap_vol, &snap->volumes, vol_list)
+        {
+            snap_vol->status = GLUSTERD_STATUS_STOPPED;
+            ret = glusterd_store_volinfo(snap_vol,
+                                         GLUSTERD_VOLINFO_VER_AC_INCREMENT);
+            if (ret) {
+                gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_VOLINFO_SET_FAIL,
+                       "Failed to store snap volinfo %s", snap_vol->volname);
+                goto out;
+            }
+        }
+
+        goto out;
+    }
+
+    /* Activate created bricks in case of activate-on-create config. */
+    ret = dict_get_int32n(dict, "flags", SLEN("flags"), &flags);
+    if (ret) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_DICT_GET_FAILED,
+               "Unable to get flags");
+        goto out;
+    }
+
+    cds_list_for_each_entry(snap_vol, &snap->volumes, vol_list)
+    {
+        ret = glusterd_start_volume(snap_vol, flags, _gf_true);
+        if (ret) {
+            gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_SNAP_ACTIVATE_FAIL,
+                   "Failed to activate snap volume %s of the "
+                   "snap %s",
+                   snap_vol->volname, snap->snapname);
+            goto out;
+        }
+    }
+
+    ret = 0;
+
+out:
+    if (ret) {
+        if (snap)
+            glusterd_snap_remove(rsp_dict, snap, _gf_true, _gf_true, _gf_false);
+        snap = NULL;
+    }
+
+    gf_msg_trace(this->name, 0, "Returning %d", ret);
+    return ret;
+}
+
+/*
+ * Commit a new snap-max-hard-limit value.
+ *
+ * @dict:      request dictionary (only asserted here).
+ * @value:     the new hard limit.
+ * @volname:   volume to apply the limit to; NULL means the system-wide
+ *             limit kept in conf->opts.
+ * @op_errstr: on failure receives a newly allocated error message.
+ *
+ * For the system limit the value is written to conf->opts, the global
+ * option version is bumped and the options are stored. For a single volume
+ * the volinfo is updated and stored.
+ *
+ * Returns 0 on success and a non-zero value on failure.
+ */
+int
+snap_max_hard_limit_set_commit(dict_t *dict, uint64_t value, char *volname,
+                               char **op_errstr)
+{
+    char err_str[PATH_MAX] = "";
+    glusterd_conf_t *conf = NULL;
+    glusterd_volinfo_t *volinfo = NULL;
+    int ret = -1;
+    xlator_t *this = NULL;
+    char *next_version = NULL;
+
+    this = THIS;
+
+    GF_ASSERT(this);
+    GF_ASSERT(dict);
+    GF_ASSERT(op_errstr);
+
+    conf = this->private;
+
+    GF_ASSERT(conf);
+
+    /* TODO: Initiate auto deletion when there is a limit change */
+    if (!volname) {
+        /* For system limit */
+        ret = dict_set_uint64(conf->opts,
+                              GLUSTERD_STORE_KEY_SNAP_MAX_HARD_LIMIT, value);
+        if (ret) {
+            gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_DICT_SET_FAILED,
+                   "Failed to store "
+                   "%s in the options",
+                   GLUSTERD_STORE_KEY_SNAP_MAX_HARD_LIMIT);
+            goto out;
+        }
+
+        ret = glusterd_get_next_global_opt_version_str(conf->opts,
+                                                       &next_version);
+        if (ret)
+            goto out;
+
+        ret = dict_set_strn(conf->opts, GLUSTERD_GLOBAL_OPT_VERSION,
+                            SLEN(GLUSTERD_GLOBAL_OPT_VERSION), next_version);
+        if (ret)
+            goto out;
+
+        ret = glusterd_store_options(this, conf->opts);
+        if (ret) {
+            gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_STORE_FAIL,
+                   "Failed to store "
+                   "options");
+            goto out;
+        }
+    } else {
+        /*  For one volume */
+        ret = glusterd_volinfo_find(volname, &volinfo);
+        if (ret) {
+            snprintf(err_str, PATH_MAX,
+                     "Failed to get the"
+                     " volinfo for volume %s",
+                     volname);
+            goto out;
+        }
+
+        volinfo->snap_max_hard_limit = value;
+
+        ret = glusterd_store_volinfo(volinfo,
+                                     GLUSTERD_VOLINFO_VER_AC_INCREMENT);
+        if (ret) {
+            snprintf(err_str, PATH_MAX,
+                     "Failed to store "
+                     "snap-max-hard-limit for volume %s",
+                     volname);
+            goto out;
+        }
+    }
+
+    ret = 0;
+out:
+    if (ret) {
+        /* The system-limit failure paths above never fill err_str; fall
+         * back to a generic message so the caller does not get an empty
+         * error string and the log line below is not blank. */
+        if (err_str[0] == '\0')
+            snprintf(err_str, sizeof(err_str), "Failed to set %s",
+                     GLUSTERD_STORE_KEY_SNAP_MAX_HARD_LIMIT);
+
+        *op_errstr = gf_strdup(err_str);
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_SNAPSHOT_OP_FAILED, "%s",
+               err_str);
+    }
+    return ret;
+}
+
+/*
+ * Commit phase of "snapshot config".
+ *
+ * Only GF_SNAP_CONFIG_TYPE_SET requests change anything; every other
+ * config-command returns 0 right away. A set request carries either limit
+ * values (snap-max-hard-limit and/or snap-max-soft-limit), or the
+ * auto-delete option, or the activate-on-create option. Whenever a
+ * system-wide value in the glusterd options changed, the global option
+ * version is bumped and the options are stored.
+ *
+ * Returns 0 on success and a non-zero value on failure.
+ */
+int
+glusterd_snapshot_config_commit(dict_t *dict, char **op_errstr,
+                                dict_t *rsp_dict)
+{
+    int ret = -1;
+    int config_command = 0;
+    xlator_t *this = THIS;
+    glusterd_conf_t *priv = NULL;
+    char *volname = NULL;
+    char *opt_version = NULL;
+    char *auto_delete_val = NULL;
+    char *activate_val = NULL;
+    uint64_t max_hard = 0;
+    uint64_t max_soft = 0;
+    gf_boolean_t global_changed = _gf_false;
+
+    GF_ASSERT(this);
+    GF_ASSERT(dict);
+    GF_ASSERT(op_errstr);
+
+    priv = this->private;
+
+    GF_ASSERT(priv);
+
+    ret = dict_get_int32n(dict, "config-command", SLEN("config-command"),
+                          &config_command);
+    if (ret != 0) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_COMMAND_NOT_FOUND,
+               "failed to get config-command type");
+        goto out;
+    }
+
+    /* Nothing to commit unless this is a "set" request. */
+    if (config_command != GF_SNAP_CONFIG_TYPE_SET) {
+        ret = 0;
+        goto out;
+    }
+
+    /* volname is optional; a failed lookup simply leaves it NULL. */
+    (void)dict_get_strn(dict, "volname", SLEN("volname"), &volname);
+
+    /* The two limits are optional as well, so their absence is not an
+     * error.
+     */
+    gd_get_snap_conf_values_if_present(dict, &max_hard, &max_soft);
+
+    if (max_hard != 0) {
+        /* Commit ops for snap-max-hard-limit */
+        ret = snap_max_hard_limit_set_commit(dict, max_hard, volname,
+                                             op_errstr);
+        if (ret != 0) {
+            gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_HARD_LIMIT_SET_FAIL,
+                   "snap-max-hard-limit set commit failed.");
+            goto out;
+        }
+    }
+
+    if (max_soft != 0) {
+        /* The soft limit only exists as a system limit */
+        global_changed = _gf_true;
+        ret = dict_set_uint64(priv->opts,
+                              GLUSTERD_STORE_KEY_SNAP_MAX_SOFT_LIMIT,
+                              max_soft);
+        if (ret != 0) {
+            gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_DICT_SET_FAILED,
+                   "Failed to save %s in the dictionary",
+                   GLUSTERD_STORE_KEY_SNAP_MAX_SOFT_LIMIT);
+            goto out;
+        }
+    }
+
+    if (max_hard != 0 || max_soft != 0) {
+        /* A limit was handled; the string options are not looked at. */
+        ret = 0;
+    } else if (dict_get_strn(dict, GLUSTERD_STORE_KEY_SNAP_AUTO_DELETE,
+                             SLEN(GLUSTERD_STORE_KEY_SNAP_AUTO_DELETE),
+                             &auto_delete_val) == 0) {
+        global_changed = _gf_true;
+        ret = dict_set_dynstr_with_alloc(priv->opts,
+                                         GLUSTERD_STORE_KEY_SNAP_AUTO_DELETE,
+                                         auto_delete_val);
+        if (ret != 0) {
+            gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_DICT_SET_FAILED,
+                   "Could not save auto-delete value in conf->opts");
+            goto out;
+        }
+    } else if (dict_get_strn(dict, GLUSTERD_STORE_KEY_SNAP_ACTIVATE,
+                             SLEN(GLUSTERD_STORE_KEY_SNAP_ACTIVATE),
+                             &activate_val) == 0) {
+        global_changed = _gf_true;
+        ret = dict_set_dynstr_with_alloc(
+            priv->opts, GLUSTERD_STORE_KEY_SNAP_ACTIVATE, activate_val);
+        if (ret != 0) {
+            gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_DICT_SET_FAILED,
+                   "Could not save snap-activate-on-create value in "
+                   "conf->opts");
+            goto out;
+        }
+    } else {
+        ret = -1;
+        gf_msg(this->name, GF_LOG_ERROR, EINVAL, GD_MSG_INVALID_ENTRY,
+               "Invalid option");
+        goto out;
+    }
+
+    /* Only a changed system-wide option needs a version bump and store. */
+    if (!global_changed)
+        goto out;
+
+    ret = glusterd_get_next_global_opt_version_str(priv->opts, &opt_version);
+    if (ret != 0) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_GLOBAL_OP_VERSION_GET_FAIL,
+               "Failed to get next global opt-version");
+        goto out;
+    }
+
+    ret = dict_set_strn(priv->opts, GLUSTERD_GLOBAL_OPT_VERSION,
+                        SLEN(GLUSTERD_GLOBAL_OPT_VERSION), opt_version);
+    if (ret != 0) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_GLOBAL_OP_VERSION_SET_FAIL,
+               "Failed to set next global opt-version");
+        goto out;
+    }
+
+    ret = glusterd_store_options(this, priv->opts);
+    if (ret != 0) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_STORE_FAIL,
+               "Failed to store options");
+        goto out;
+    }
+
+out:
+    gf_msg_trace(this->name, 0, "Returning %d", ret);
+    return ret;
+}
+
+/*
+ * Run "lvs" on the device backing a brick and store the reported volume
+ * group name, data percentage and logical volume size in @rsp_dict.
+ *
+ * @rsp_dict:   response dictionary; receives "<key_prefix>.vgname",
+ *              "<key_prefix>.data" and "<key_prefix>.lvsize".
+ * @brickinfo:  brick whose path is used to look up the mount device.
+ * @volname:    only asserted here.
+ * @device:     the incoming value is ignored; the variable is overwritten
+ *              with the looked-up device and freed before returning.
+ * @key_prefix: prefix for the dictionary keys.
+ *
+ * Returns 0 on success and a non-zero value on failure.
+ */
+static int
+glusterd_get_brick_lvm_details(dict_t *rsp_dict,
+                               glusterd_brickinfo_t *brickinfo, char *volname,
+                               char *device, const char *key_prefix)
+{
+    int ret = -1;
+    glusterd_conf_t *priv = NULL;
+    runner_t runner = {
+        0,
+    };
+    xlator_t *this = NULL;
+    char msg[PATH_MAX] = "";
+    char buf[PATH_MAX] = "";
+    char *ptr = NULL;
+    char *token = NULL;
+    char *saveptr = NULL;
+    char key[160] = ""; /* key_prefix is 128 bytes at most */
+    char *value = NULL;
+
+    GF_ASSERT(rsp_dict);
+    GF_ASSERT(brickinfo);
+    GF_ASSERT(volname);
+    this = THIS;
+    GF_ASSERT(this);
+    priv = this->private;
+    GF_ASSERT(priv);
+
+    device = glusterd_get_brick_mount_device(brickinfo->path);
+    if (!device) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_BRICK_GET_INFO_FAIL,
+               "Getting device name for "
+               "the brick %s:%s failed",
+               brickinfo->hostname, brickinfo->path);
+        goto out;
+    }
+    runinit(&runner);
+    snprintf(msg, sizeof(msg),
+             "running lvs command, "
+             "for getting snap status");
+    /* Using lvs command fetch the Volume Group name,
+     * Percentage of data filled and Logical Volume size
+     *
+     * "-o" argument is used to get the desired information,
+     * example : "lvs /dev/VolGroup/thin_vol -o vgname,lv_size",
+     * will get us Volume Group name and Logical Volume size.
+     *
+     * Here separator used is ":",
+     * for the above given command with separator ":",
+     * The output will be "vgname:lvsize"
+     */
+    runner_add_args(&runner, LVS, device, "--noheading", "-o",
+                    "vg_name,data_percent,lv_size", "--separator", ":", NULL);
+    runner_redir(&runner, STDOUT_FILENO, RUN_PIPE);
+    runner_log(&runner, "", GF_LOG_DEBUG, msg);
+    ret = runner_start(&runner);
+    if (ret) {
+        gf_msg(this->name, GF_LOG_ERROR, errno, GD_MSG_LVS_FAIL,
+               "Could not perform lvs action");
+        goto end;
+    }
+    do {
+        ptr = fgets(buf, sizeof(buf), runner_chio(&runner, STDOUT_FILENO));
+
+        if (ptr == NULL)
+            break;
+        /* strtok_r keeps its state in saveptr, so parsing here cannot be
+         * disturbed by (or disturb) tokenising done on another thread. */
+        token = strtok_r(buf, ":", &saveptr);
+        if (token != NULL) {
+            /* lvs pads the first column with leading blanks. */
+            while (token[0] == ' ')
+                token++;
+            value = gf_strdup(token);
+            if (!value) {
+                ret = -1;
+                goto end;
+            }
+            ret = snprintf(key, sizeof(key), "%s.vgname", key_prefix);
+            if (ret < 0) {
+                goto end;
+            }
+
+            ret = dict_set_dynstr(rsp_dict, key, value);
+            if (ret) {
+                gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_DICT_SET_FAILED,
+                       "Could not save vgname ");
+                goto end;
+            }
+            /* The dictionary owns the string now. */
+            value = NULL;
+        }
+
+        token = strtok_r(NULL, ":", &saveptr);
+        if (token != NULL) {
+            value = gf_strdup(token);
+            if (!value) {
+                gf_smsg(this->name, GF_LOG_ERROR, errno, GD_MSG_STRDUP_FAILED,
+                        "token=%s", token, NULL);
+                ret = -1;
+                goto end;
+            }
+            ret = snprintf(key, sizeof(key), "%s.data", key_prefix);
+            if (ret < 0) {
+                gf_smsg(this->name, GF_LOG_ERROR, errno, GD_MSG_COPY_FAIL,
+                        NULL);
+                goto end;
+            }
+
+            ret = dict_set_dynstr(rsp_dict, key, value);
+            if (ret) {
+                gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_DICT_SET_FAILED,
+                       "Could not save data percent ");
+                goto end;
+            }
+            value = NULL;
+        }
+        token = strtok_r(NULL, ":", &saveptr);
+        if (token != NULL) {
+            value = gf_strdup(token);
+            if (!value) {
+                gf_smsg(this->name, GF_LOG_ERROR, errno, GD_MSG_STRDUP_FAILED,
+                        "token=%s", token, NULL);
+                ret = -1;
+                goto end;
+            }
+            ret = snprintf(key, sizeof(key), "%s.lvsize", key_prefix);
+            if (ret < 0) {
+                gf_smsg(this->name, GF_LOG_ERROR, errno, GD_MSG_COPY_FAIL,
+                        NULL);
+                goto end;
+            }
+
+            ret = dict_set_dynstr(rsp_dict, key, value);
+            if (ret) {
+                gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_DICT_SET_FAILED,
+                       "Could not save lv size ");
+                goto end;
+            }
+            value = NULL;
+        }
+
+    } while (ptr != NULL);
+
+    ret = 0;
+
+end:
+    runner_end(&runner);
+
+out:
+    /* Only a string that never made it into the dictionary is freed. */
+    if (ret && value) {
+        GF_FREE(value);
+    }
+
+    if (device)
+        GF_FREE(device);
+
+    return ret;
+}
+
+/*
+ * Fill @rsp_dict with the status of one brick of a snapshot volume.
+ *
+ * @op_errstr:    only asserted here.
+ * @rsp_dict:     response dictionary; receives "<keyprefix>.brick<index>.*"
+ *                keys (path, status, pid, vgname and, through
+ *                glusterd_get_brick_lvm_details(), the LVM details).
+ * @keyprefix:    prefix for all keys.
+ * @index:        brick index used in the key names.
+ * @snap_volinfo: snapshot volume the brick belongs to.
+ * @brickinfo:    the brick to report on.
+ *
+ * A brick with snap_status == -1 is reported as "Pending Snapshot" and a
+ * brick of a stopped snap volume as "N/A (Deactivated Snapshot)"; in both
+ * cases no LVM details are collected.
+ *
+ * Returns 0 on success and a non-zero value on failure.
+ */
+static int
+glusterd_get_single_brick_status(char **op_errstr, dict_t *rsp_dict,
+                                 const char *keyprefix, int index,
+                                 glusterd_volinfo_t *snap_volinfo,
+                                 glusterd_brickinfo_t *brickinfo)
+{
+    int ret = -1;
+    xlator_t *this = NULL;
+    glusterd_conf_t *priv = NULL;
+    char key[128] = ""; /* keyprefix is not longer than 64 bytes */
+    int keylen;
+    char *device = NULL;
+    char *value = NULL;
+    char brick_path[PATH_MAX] = "";
+    char pidfile[PATH_MAX] = "";
+    pid_t pid = -1;
+
+    this = THIS;
+    GF_ASSERT(this);
+    priv = this->private;
+    GF_ASSERT(priv);
+
+    GF_ASSERT(op_errstr);
+    GF_ASSERT(rsp_dict);
+    GF_ASSERT(keyprefix);
+    GF_ASSERT(snap_volinfo);
+    GF_ASSERT(brickinfo);
+
+    keylen = snprintf(key, sizeof(key), "%s.brick%d.path", keyprefix, index);
+    if (keylen < 0) {
+        gf_smsg(this->name, GF_LOG_ERROR, errno, GD_MSG_COPY_FAIL, NULL);
+        ret = -1;
+        goto out;
+    }
+
+    ret = snprintf(brick_path, sizeof(brick_path), "%s:%s", brickinfo->hostname,
+                   brickinfo->path);
+    if (ret < 0) {
+        gf_smsg(this->name, GF_LOG_ERROR, errno, GD_MSG_COPY_FAIL, NULL);
+        goto out;
+    }
+
+    value = gf_strdup(brick_path);
+    if (!value) {
+        gf_smsg(this->name, GF_LOG_ERROR, errno, GD_MSG_STRDUP_FAILED,
+                "brick_path=%s", brick_path, NULL);
+        ret = -1;
+        goto out;
+    }
+
+    ret = dict_set_dynstrn(rsp_dict, key, keylen, value);
+    if (ret) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_DICT_SET_FAILED,
+               "Unable to store "
+               "brick_path %s",
+               brickinfo->path);
+        goto out;
+    }
+
+    if (brickinfo->snap_status == -1) {
+        /* Setting vgname as "Pending Snapshot" */
+        value = gf_strdup("Pending Snapshot");
+        if (!value) {
+            ret = -1;
+            goto out;
+        }
+
+        keylen = snprintf(key, sizeof(key), "%s.brick%d.vgname", keyprefix,
+                          index);
+        ret = dict_set_dynstrn(rsp_dict, key, keylen, value);
+        if (ret) {
+            gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_DICT_SET_FAILED,
+                   "Could not save vgname ");
+            goto out;
+        }
+
+        ret = 0;
+        goto out;
+    }
+    /* The brick path string is owned by the dictionary from here on. */
+    value = NULL;
+
+    keylen = snprintf(key, sizeof(key), "%s.brick%d.status", keyprefix, index);
+    if (keylen < 0) {
+        ret = -1;
+        goto out;
+    }
+
+    /* dict_set_strn() stores the pointer without taking ownership, so the
+     * status is passed as a string literal; a heap copy would never be
+     * freed. */
+    if (brickinfo->status == GF_BRICK_STOPPED) {
+        ret = dict_set_strn(rsp_dict, key, keylen, "No");
+        if (ret) {
+            gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_DICT_SET_FAILED,
+                   "Could not save brick status");
+            goto out;
+        }
+    } else {
+        ret = dict_set_strn(rsp_dict, key, keylen, "Yes");
+        if (ret) {
+            gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_DICT_SET_FAILED,
+                   "Could not save brick status");
+            goto out;
+        }
+
+        GLUSTERD_GET_BRICK_PIDFILE(pidfile, snap_volinfo, brickinfo, priv);
+
+        /* The pid is only reported when the brick process is running. */
+        if (gf_is_service_running(pidfile, &pid)) {
+            keylen = snprintf(key, sizeof(key), "%s.brick%d.pid", keyprefix,
+                              index);
+            if (keylen < 0) {
+                ret = -1;
+                gf_smsg(this->name, GF_LOG_ERROR, errno, GD_MSG_COPY_FAIL,
+                        NULL);
+                goto out;
+            }
+
+            ret = dict_set_int32n(rsp_dict, key, keylen, pid);
+            if (ret) {
+                gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_DICT_SET_FAILED,
+                       "Could not save pid %d", pid);
+                goto out;
+            }
+        }
+    }
+
+    /* "<keyprefix>.brick<index>" is the key prefix handed to the LVM
+     * helper further down. */
+    keylen = snprintf(key, sizeof(key), "%s.brick%d", keyprefix, index);
+    if (keylen < 0) {
+        ret = -1;
+        goto out;
+    }
+    /* While getting snap status we should show relevant information
+     * for deactivated snaps.
+     */
+    if (snap_volinfo->status == GLUSTERD_STATUS_STOPPED) {
+        /* Setting vgname as "Deactivated Snapshot" */
+        value = gf_strdup("N/A (Deactivated Snapshot)");
+        if (!value) {
+            ret = -1;
+            goto out;
+        }
+
+        keylen = snprintf(key, sizeof(key), "%s.brick%d.vgname", keyprefix,
+                          index);
+        ret = dict_set_dynstrn(rsp_dict, key, keylen, value);
+        if (ret) {
+            gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_DICT_SET_FAILED,
+                   "Could not save vgname ");
+            goto out;
+        }
+
+        ret = 0;
+        goto out;
+    }
+
+    ret = glusterd_get_brick_lvm_details(rsp_dict, brickinfo,
+                                         snap_volinfo->volname, device, key);
+    if (ret) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_BRICK_GET_INFO_FAIL,
+               "Failed to get "
+               "brick LVM details");
+        goto out;
+    }
+out:
+    if (ret && value) {
+        GF_FREE(value);
+    }
+
+    return ret;
+}
+
+/*
+ * Collect the brick-level status of every volume attached to @snap and
+ * store it in @rsp_dict.
+ *
+ * For the volume at index V the key "<keyprefix>.vol<V>" is built and handed
+ * to glusterd_get_single_brick_status() for each brick that is local to this
+ * node; bricks that are not local are only counted. The running brick counter
+ * is then saved as "<keyprefix>.vol<V>.brickcount", and once all volumes have
+ * been visited the number of volumes is saved as "<keyprefix>.volcount".
+ *
+ * @op_errstr: passed through to glusterd_get_single_brick_status()
+ * @rsp_dict:  dictionary that receives the status keys
+ * @keyprefix: prefix for every key written (e.g. "status.snap0")
+ * @snap:      snapshot object whose volumes are walked
+ *
+ * Returns 0 on success and a non-zero value on failure.
+ */
+static int
+glusterd_get_single_snap_status(char **op_errstr, dict_t *rsp_dict,
+                                const char *keyprefix, glusterd_snap_t *snap)
+{
+    int ret = -1;
+    xlator_t *this = NULL;
+    char key[64] = ""; /* keyprefix is "status.snap0" */
+    int keylen;
+    char brickkey[PATH_MAX] = "";
+    glusterd_volinfo_t *snap_volinfo = NULL;
+    glusterd_volinfo_t *tmp_volinfo = NULL;
+    glusterd_brickinfo_t *brickinfo = NULL;
+    int volcount = 0;
+    /* NOTE(review): brickcount is not reset between volumes, so with more
+     * than one snap volume the brick indices keep growing across volumes.
+     * Confirm this is what the consumer of the dict expects.
+     */
+    int brickcount = 0;
+
+    this = THIS;
+    GF_ASSERT(this);
+
+    GF_ASSERT(op_errstr);
+    GF_ASSERT(rsp_dict);
+    GF_ASSERT(keyprefix);
+    GF_ASSERT(snap);
+
+    cds_list_for_each_entry_safe(snap_volinfo, tmp_volinfo, &snap->volumes,
+                                 vol_list)
+    {
+        keylen = snprintf(key, sizeof(key), "%s.vol%d", keyprefix, volcount);
+        if (keylen < 0) {
+            gf_smsg(this->name, GF_LOG_ERROR, errno, GD_MSG_COPY_FAIL, NULL);
+            ret = -1;
+            goto out;
+        }
+        cds_list_for_each_entry(brickinfo, &snap_volinfo->bricks, brick_list)
+        {
+            /* Only bricks hosted on this node are reported, but every
+             * brick advances the index so that indices match the brick
+             * order of the volume.
+             */
+            if (!glusterd_is_local_brick(this, snap_volinfo, brickinfo)) {
+                brickcount++;
+                continue;
+            }
+
+            ret = glusterd_get_single_brick_status(
+                op_errstr, rsp_dict, key, brickcount, snap_volinfo, brickinfo);
+
+            if (ret) {
+                gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_SNAP_STATUS_FAIL,
+                       "Getting "
+                       "single snap status failed");
+                goto out;
+            }
+            brickcount++;
+        }
+        keylen = snprintf(brickkey, sizeof(brickkey), "%s.brickcount", key);
+        if (keylen < 0) {
+            gf_smsg(this->name, GF_LOG_ERROR, errno, GD_MSG_COPY_FAIL, NULL);
+            /* ret may still hold 0 from the last successful brick status
+             * call; make sure the failure is reported to the caller.
+             */
+            ret = -1;
+            goto out;
+        }
+
+        ret = dict_set_int32n(rsp_dict, brickkey, keylen, brickcount);
+        if (ret) {
+            gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_DICT_SET_FAILED,
+                   "Could not save brick count");
+            goto out;
+        }
+        volcount++;
+    }
+
+    keylen = snprintf(key, sizeof(key), "%s.volcount", keyprefix);
+    if (keylen < 0) {
+        gf_smsg(this->name, GF_LOG_ERROR, errno, GD_MSG_COPY_FAIL, NULL);
+        ret = -1;
+        goto out;
+    }
+
+    ret = dict_set_int32n(rsp_dict, key, keylen, volcount);
+    if (ret) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_DICT_SET_FAILED,
+               "Could not save volcount");
+        goto out;
+    }
+
+out:
+
+    return ret;
+}
+
+/*
+ * Write the status of a single snapshot object into @rsp_dict.
+ *
+ * The following keys are stored, all prefixed with @keyprefix:
+ *   "<keyprefix>.snapname" - copy of snap->snapname
+ *   "<keyprefix>.uuid"     - textual form of snap->snap_id
+ *   "<keyprefix>.volcount" - written as 1 after the per-volume status has
+ *                            been gathered by
+ *                            glusterd_get_single_snap_status()
+ *
+ * Returns 0 on success and a non-zero value on failure.
+ */
+static int
+glusterd_get_each_snap_object_status(char **op_errstr, dict_t *rsp_dict,
+                                     glusterd_snap_t *snap,
+                                     const char *keyprefix)
+{
+    xlator_t *this = THIS;
+    char *dup = NULL;
+    char key[32] = ""; /* keyprefix is "status.snap0" */
+    int keylen = 0;
+    int ret = -1;
+
+    GF_ASSERT(this);
+    GF_ASSERT(op_errstr);
+    GF_ASSERT(rsp_dict);
+    GF_ASSERT(snap);
+    GF_ASSERT(keyprefix);
+
+    /* TODO : Get all the snap volume info present in snap object,
+     * as of now, There will be only one snapvolinfo per snap object
+     */
+
+    /* --- snapshot name --- */
+    keylen = snprintf(key, sizeof(key), "%s.snapname", keyprefix);
+    if (keylen < 0) {
+        gf_smsg(this->name, GF_LOG_ERROR, errno, GD_MSG_COPY_FAIL, NULL);
+        ret = -1;
+        goto out;
+    }
+
+    dup = gf_strdup(snap->snapname);
+    if (!dup) {
+        ret = -1;
+        goto out;
+    }
+
+    ret = dict_set_dynstrn(rsp_dict, key, keylen, dup);
+    if (ret != 0) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_DICT_SET_FAILED,
+               "Could not save "
+               "snap name");
+        goto out;
+    }
+    /* The string was handed to the dict; do not free it at "out". */
+    dup = NULL;
+
+    /* --- snapshot UUID --- */
+    keylen = snprintf(key, sizeof(key), "%s.uuid", keyprefix);
+    if (keylen < 0) {
+        gf_smsg(this->name, GF_LOG_ERROR, errno, GD_MSG_COPY_FAIL, NULL);
+        ret = -1;
+        goto out;
+    }
+
+    dup = gf_strdup(uuid_utoa(snap->snap_id));
+    if (!dup) {
+        ret = -1;
+        goto out;
+    }
+
+    ret = dict_set_dynstrn(rsp_dict, key, keylen, dup);
+    if (ret != 0) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_DICT_SET_FAILED,
+               "Could not save "
+               "snap UUID");
+        goto out;
+    }
+    dup = NULL;
+
+    /* --- per-volume / per-brick details --- */
+    ret = glusterd_get_single_snap_status(op_errstr, rsp_dict, keyprefix, snap);
+    if (ret != 0) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_SNAP_STATUS_FAIL,
+               "Could not get single snap status");
+        goto out;
+    }
+
+    /* --- volume count, stored as 1 --- */
+    keylen = snprintf(key, sizeof(key), "%s.volcount", keyprefix);
+    if (keylen < 0) {
+        ret = keylen;
+        goto out;
+    }
+
+    ret = dict_set_int32n(rsp_dict, key, keylen, 1);
+    if (ret != 0) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_DICT_SET_FAILED,
+               "Could not save volcount");
+        goto out;
+    }
+
+out:
+    /* Release a duplicate that never made it into the dict. */
+    if (ret && dup)
+        GF_FREE(dup);
+
+    return ret;
+}
+
+/*
+ * List the snapshots of volume @volname in @rsp_dict.
+ *
+ * For the N-th entry of the volume's snap_volumes list the snapshot name is
+ * stored under "status.snap<N>.snapname"; the total number of entries is
+ * stored under "status.snapcount".
+ *
+ * Returns 0 on success and a non-zero value on failure.
+ */
+int
+glusterd_get_snap_status_of_volume(char **op_errstr, dict_t *rsp_dict,
+                                   char *volname, char *keyprefix)
+{
+    xlator_t *this = THIS;
+    glusterd_conf_t *priv = NULL;
+    glusterd_volinfo_t *volinfo = NULL;
+    glusterd_volinfo_t *snap_vol = NULL;
+    glusterd_volinfo_t *next_vol = NULL;
+    char key[64] = "";
+    int snapcount = 0;
+    int ret = -1;
+
+    GF_ASSERT(this);
+    priv = this->private;
+    GF_ASSERT(priv);
+
+    GF_ASSERT(op_errstr);
+    GF_ASSERT(rsp_dict);
+    GF_ASSERT(volname);
+    GF_ASSERT(keyprefix);
+
+    ret = glusterd_volinfo_find(volname, &volinfo);
+    if (ret != 0) {
+        gf_msg(this->name, GF_LOG_ERROR, EINVAL, GD_MSG_VOL_NOT_FOUND,
+               "Failed to get volinfo of "
+               "volume %s",
+               volname);
+        goto out;
+    }
+
+    /* One "status.snap<N>.snapname" entry per snapshot of the volume. */
+    cds_list_for_each_entry_safe(snap_vol, next_vol, &volinfo->snap_volumes,
+                                 snapvol_list)
+    {
+        ret = snprintf(key, sizeof(key), "status.snap%d.snapname", snapcount);
+        if (ret < 0) {
+            gf_smsg(this->name, GF_LOG_ERROR, errno, GD_MSG_COPY_FAIL, NULL);
+            goto out;
+        }
+
+        ret = dict_set_dynstr_with_alloc(rsp_dict, key,
+                                         snap_vol->snapshot->snapname);
+        if (ret != 0) {
+            gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_DICT_SET_FAILED,
+                   "Could not save "
+                   "snap name");
+            goto out;
+        }
+
+        snapcount++;
+    }
+
+    ret = dict_set_int32n(rsp_dict, "status.snapcount",
+                          SLEN("status.snapcount"), snapcount);
+    if (ret != 0) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_DICT_SET_FAILED,
+               "Failed to save snapcount");
+        ret = -1;
+        goto out;
+    }
+
+out:
+    return ret;
+}
+
+/*
+ * List every snapshot known to this glusterd in @rsp_dict.
+ *
+ * For the N-th entry of priv->snapshots the snapshot name is stored under
+ * "status.snap<N>.snapname"; the total number of entries is stored under
+ * "status.snapcount".
+ *
+ * Returns 0 on success and a non-zero value on failure.
+ */
+int
+glusterd_get_all_snapshot_status(dict_t *dict, char **op_errstr,
+                                 dict_t *rsp_dict)
+{
+    xlator_t *this = THIS;
+    glusterd_conf_t *priv = NULL;
+    glusterd_snap_t *cur_snap = NULL;
+    glusterd_snap_t *next_snap = NULL;
+    char key[64] = "";
+    int32_t snapcount = 0;
+    int ret = -1;
+
+    GF_ASSERT(this);
+    priv = this->private;
+    GF_ASSERT(priv);
+
+    GF_ASSERT(dict);
+    GF_ASSERT(op_errstr);
+
+    /* One "status.snap<N>.snapname" entry per snapshot. */
+    cds_list_for_each_entry_safe(cur_snap, next_snap, &priv->snapshots,
+                                 snap_list)
+    {
+        ret = snprintf(key, sizeof(key), "status.snap%d.snapname", snapcount);
+        if (ret < 0) {
+            gf_smsg(this->name, GF_LOG_ERROR, errno, GD_MSG_COPY_FAIL, NULL);
+            goto out;
+        }
+
+        ret = dict_set_dynstr_with_alloc(rsp_dict, key, cur_snap->snapname);
+        if (ret != 0) {
+            gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_DICT_SET_FAILED,
+                   "Could not save "
+                   "snap name");
+            goto out;
+        }
+
+        snapcount++;
+    }
+
+    ret = dict_set_int32n(rsp_dict, "status.snapcount",
+                          SLEN("status.snapcount"), snapcount);
+    if (ret != 0) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_DICT_SET_FAILED,
+               "Could not save snapcount");
+        goto out;
+    }
+
+    ret = 0;
+
+out:
+    return ret;
+}
+
+/*
+ * Commit handler for "snapshot status".
+ *
+ * Reads the "sub-cmd" value from @dict, copies it into @rsp_dict and then
+ * fills @rsp_dict according to the sub-command:
+ *   GF_SNAP_STATUS_TYPE_ALL          - names of all snapshots
+ *   GF_SNAP_STATUS_TYPE_ITER / _SNAP - full status of the snapshot named by
+ *                                      the "snapname" key of @dict
+ *   GF_SNAP_STATUS_TYPE_VOL          - names of the snapshots of the volume
+ *                                      named by the "volname" key of @dict
+ * Any other sub-command value is accepted without further action.
+ *
+ * Returns 0 on success and a non-zero value on failure. When the requested
+ * snapshot does not exist an explanatory message is placed in *op_errstr.
+ */
+int
+glusterd_snapshot_status_commit(dict_t *dict, char **op_errstr,
+                                dict_t *rsp_dict)
+{
+    int ret = -1;
+    int32_t sub_cmd = -1;
+    char *snap_name = NULL;
+    char *vol_name = NULL;
+    glusterd_snap_t *snap_obj = NULL;
+    glusterd_conf_t *conf = NULL;
+    xlator_t *this = THIS;
+
+    GF_ASSERT(this);
+    GF_ASSERT(dict);
+    GF_ASSERT(op_errstr);
+
+    conf = this->private;
+
+    GF_ASSERT(conf);
+
+    ret = dict_get_int32n(dict, "sub-cmd", SLEN("sub-cmd"), &sub_cmd);
+    if (ret != 0) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_DICT_GET_FAILED,
+               "Failed to get status cmd type");
+        goto out;
+    }
+
+    /* Echo the sub-command back in the response. */
+    ret = dict_set_int32n(rsp_dict, "sub-cmd", SLEN("sub-cmd"), sub_cmd);
+    if (ret != 0) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_DICT_SET_FAILED,
+               "Could not save status cmd in rsp dictionary");
+        goto out;
+    }
+
+    if (sub_cmd == GF_SNAP_STATUS_TYPE_ALL) {
+        ret = glusterd_get_all_snapshot_status(dict, op_errstr, rsp_dict);
+        if (ret != 0) {
+            gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_SNAP_STATUS_FAIL,
+                   "Unable to "
+                   "get snapshot status");
+            goto out;
+        }
+    } else if (sub_cmd == GF_SNAP_STATUS_TYPE_ITER ||
+               sub_cmd == GF_SNAP_STATUS_TYPE_SNAP) {
+        ret = dict_get_strn(dict, "snapname", SLEN("snapname"), &snap_name);
+        if (ret != 0) {
+            gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_DICT_GET_FAILED,
+                   "Unable to "
+                   "get snap name");
+            goto out;
+        }
+
+        snap_obj = glusterd_find_snap_by_name(snap_name);
+        if (snap_obj == NULL) {
+            /* Tell the caller which snapshot was missing. */
+            ret = gf_asprintf(op_errstr,
+                              "Snapshot (%s) "
+                              "does not exist",
+                              snap_name);
+            if (ret < 0) {
+                goto out;
+            }
+            ret = -1;
+            gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_VOLINFO_GET_FAIL,
+                   "Unable to "
+                   "get snap volinfo");
+            goto out;
+        }
+
+        ret = glusterd_get_each_snap_object_status(op_errstr, rsp_dict,
+                                                   snap_obj, "status.snap0");
+        if (ret != 0) {
+            gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_SNAP_STATUS_FAIL,
+                   "Unable to "
+                   "get status of snap");
+            goto out;
+        }
+
+        /* Exactly one snapshot is reported for these sub-commands. */
+        ret = dict_set_int32n(rsp_dict, "status.snapcount",
+                              SLEN("status.snapcount"), 1);
+        if (ret != 0) {
+            gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_DICT_SET_FAILED,
+                   "Unable to "
+                   "set snapcount to 1");
+            goto out;
+        }
+    } else if (sub_cmd == GF_SNAP_STATUS_TYPE_VOL) {
+        ret = dict_get_strn(dict, "volname", SLEN("volname"), &vol_name);
+        if (ret != 0) {
+            gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_DICT_GET_FAILED,
+                   "Unable to"
+                   " get volume name");
+            goto out;
+        }
+
+        ret = glusterd_get_snap_status_of_volume(op_errstr, rsp_dict,
+                                                 vol_name, "status.vol0");
+        if (ret != 0) {
+            gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_SNAP_STATUS_FAIL,
+                   "Function :"
+                   " glusterd_get_snap_status_of_volume "
+                   "failed");
+            goto out;
+        }
+    }
+
+    ret = 0;
+
+out:
+    return ret;
+}
+
+/*
+ * Enforce the snapshot soft limit on the volumes listed in @dict.
+ *
+ * @dict is expected to carry "volcount" and the keys "volname1" ..
+ * "volname<volcount>". For each volume the effective hard limit is the
+ * smaller of the volume's snap_max_hard_limit and the configured system-wide
+ * hard limit; the soft limit is opt_max_soft percent of that value. When the
+ * volume's snap_count exceeds the soft limit, the snapshot at the head of the
+ * volume's snap_volumes list is marked GD_SNAP_STATUS_DECOMMISSION, stored,
+ * and removed. On the originator glusterd a SNAPSHOT_DELETED or
+ * SNAPSHOT_DELETE_FAILED event is emitted for it.
+ *
+ * @dict:     request dictionary (volume list, originator information)
+ * @rsp_dict: response dictionary passed on to glusterd_snap_remove()
+ *
+ * Returns 0 on success, otherwise the non-zero result of the step that
+ * failed.
+ */
+int32_t
+glusterd_handle_snap_limit(dict_t *dict, dict_t *rsp_dict)
+{
+    int32_t ret = -1;
+    xlator_t *this = NULL;
+    glusterd_conf_t *priv = NULL;
+    uint64_t effective_max_limit = 0;
+    int64_t volcount = 0;
+    int i = 0;
+    char *volname = NULL;
+    char key[64] = "";
+    int keylen;
+    char msg[PATH_MAX] = "";
+    glusterd_volinfo_t *volinfo = NULL;
+    uint64_t limit = 0;
+    int64_t count = 0;
+    glusterd_snap_t *snap = NULL;
+    glusterd_volinfo_t *tmp_volinfo = NULL;
+    uint64_t opt_max_hard = GLUSTERD_SNAPS_MAX_HARD_LIMIT;
+    uint64_t opt_max_soft = GLUSTERD_SNAPS_DEF_SOFT_LIMIT_PERCENT;
+
+    this = THIS;
+    GF_ASSERT(this);
+    GF_ASSERT(dict);
+    GF_ASSERT(rsp_dict);
+
+    priv = this->private;
+    GF_ASSERT(priv);
+
+    ret = dict_get_int64(dict, "volcount", &volcount);
+    if (ret) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_DICT_GET_FAILED,
+               "failed to get the volcount");
+        goto out;
+    }
+
+    /* Volume names in the dict are numbered starting from 1. */
+    for (i = 1; i <= volcount; i++) {
+        keylen = snprintf(key, sizeof(key), "volname%d", i);
+        ret = dict_get_strn(dict, key, keylen, &volname);
+        if (ret) {
+            gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_DICT_GET_FAILED,
+                   "failed to get the "
+                   "volname");
+            goto out;
+        }
+
+        ret = glusterd_volinfo_find(volname, &volinfo);
+        if (ret) {
+            gf_msg(this->name, GF_LOG_ERROR, EINVAL, GD_MSG_VOL_NOT_FOUND,
+                   "volinfo for %s "
+                   "not found",
+                   volname);
+            goto out;
+        }
+
+        /* config values snap-max-hard-limit and snap-max-soft-limit are
+         * optional and hence we are not erroring out if values are not
+         * present
+         */
+        gd_get_snap_conf_values_if_present(priv->opts, &opt_max_hard,
+                                           &opt_max_soft);
+
+        /* The minimum of the 2 limits i.e system wide limit and
+           volume wide limit will be considered
+        */
+        if (volinfo->snap_max_hard_limit < opt_max_hard)
+            effective_max_limit = volinfo->snap_max_hard_limit;
+        else
+            effective_max_limit = opt_max_hard;
+
+        /* opt_max_soft is a percentage of the effective hard limit. */
+        limit = (opt_max_soft * effective_max_limit) / 100;
+
+        /* Number of snapshots above the soft limit; nothing to delete when
+         * it is not positive.
+         * NOTE(review): "goto out" also skips any remaining volumes in the
+         * list - confirm whether "continue" was intended.
+         */
+        count = volinfo->snap_count - limit;
+        if (count <= 0)
+            goto out;
+
+        /* Pick the snapshot behind the first entry of the volume's
+         * snap_volumes list (presumably the oldest one - TODO confirm).
+         */
+        tmp_volinfo = cds_list_entry(volinfo->snap_volumes.next,
+                                     glusterd_volinfo_t, snapvol_list);
+        snap = tmp_volinfo->snapshot;
+        GF_ASSERT(snap);
+
+        gf_msg(this->name, GF_LOG_WARNING, 0, GD_MSG_SOFT_LIMIT_REACHED,
+               "Soft-limit "
+               "(value = %" PRIu64
+               ") of volume %s is reached. "
+               "Deleting snapshot %s.",
+               limit, volinfo->volname, snap->snapname);
+
+        /* Build the event payload before the snapshot is removed, since it
+         * is used after glusterd_snap_remove() has run.
+         */
+        snprintf(msg, sizeof(msg),
+                 "snapshot_name=%s;"
+                 "snapshot_uuid=%s",
+                 snap->snapname, uuid_utoa(snap->snap_id));
+
+        /* NOTE(review): snap->lock is released after glusterd_snap_remove();
+         * confirm that the snap object is still valid at that point.
+         */
+        LOCK(&snap->lock);
+        {
+            snap->snap_status = GD_SNAP_STATUS_DECOMMISSION;
+            ret = glusterd_store_snap(snap);
+            if (ret) {
+                gf_msg(this->name, GF_LOG_ERROR, 0,
+                       GD_MSG_SNAP_OBJECT_STORE_FAIL,
+                       "could "
+                       "not store snap object %s",
+                       snap->snapname);
+                goto unlock;
+            }
+
+            ret = glusterd_snap_remove(rsp_dict, snap, _gf_true, _gf_true,
+                                       _gf_false);
+            if (ret)
+                gf_msg(this->name, GF_LOG_WARNING, 0, GD_MSG_SNAP_REMOVE_FAIL,
+                       "failed to remove snap %s", snap->snapname);
+        }
+    unlock:
+        UNLOCK(&snap->lock);
+        /* Only the originator node reports the outcome as an event. */
+        if (is_origin_glusterd(dict) == _gf_true) {
+            if (ret)
+                gf_event(EVENT_SNAPSHOT_DELETE_FAILED, "%s", msg);
+            else
+                gf_event(EVENT_SNAPSHOT_DELETED, "%s", msg);
+        }
+    }
+
+out:
+    return ret;
+}
+
+/*
+ * Post-validation step of "snapshot clone".
+ *
+ * Looks up the clone volume named by the "clonename" key of @dict and the
+ * snapshot object attached to it. If the operation failed (@op_ret is
+ * non-zero) and @dict requests "cleanup", the snapshot is removed through
+ * glusterd_snap_remove() and 0 is returned regardless of the outcome. If the
+ * operation succeeded, the snap object is deleted and the clone volume's
+ * snapshot pointer is cleared.
+ *
+ * Returns 0 on success and a non-zero value on failure.
+ */
+int32_t
+glusterd_snapshot_clone_postvalidate(dict_t *dict, int32_t op_ret,
+                                     char **op_errstr, dict_t *rsp_dict)
+{
+    int ret = -1;
+    int32_t do_cleanup = 0;
+    char *clone_name = NULL;
+    glusterd_volinfo_t *clone_vol = NULL;
+    glusterd_snap_t *snap_obj = NULL;
+    glusterd_conf_t *priv = NULL;
+    xlator_t *this = THIS;
+
+    GF_ASSERT(this);
+    GF_ASSERT(dict);
+    GF_ASSERT(rsp_dict);
+
+    priv = this->private;
+    GF_ASSERT(priv);
+
+    ret = dict_get_strn(dict, "clonename", SLEN("clonename"), &clone_name);
+    if (ret != 0) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_DICT_GET_FAILED,
+               "Unable to fetch "
+               "clonename");
+        goto out;
+    }
+
+    ret = glusterd_volinfo_find(clone_name, &clone_vol);
+    if (ret != 0) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_VOL_NOT_FOUND,
+               "unable to find clone "
+               "%s volinfo",
+               clone_name);
+        goto out;
+    }
+
+    if (clone_vol == NULL) {
+        ret = -1;
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_SNAP_NOT_FOUND,
+               "Snapshot volume is null");
+        goto out;
+    }
+    snap_obj = clone_vol->snapshot;
+
+    /* On failure the whole snapshot is torn down (when cleanup was asked
+     * for); on success only the snap object goes away, because a clone
+     * does not need it.
+     */
+    if (op_ret != 0) {
+        ret = dict_get_int32n(dict, "cleanup", SLEN("cleanup"), &do_cleanup);
+        if (ret == 0 && do_cleanup && snap_obj) {
+            glusterd_snap_remove(rsp_dict, snap_obj, _gf_true, _gf_true,
+                                 _gf_true);
+        }
+        /* Whatever the cleanup did, the remaining steps are only meant
+         * for the success path, so report success and leave.
+         */
+        ret = 0;
+        goto out;
+    }
+
+    ret = glusterd_snapobject_delete(snap_obj);
+    if (ret != 0) {
+        gf_msg(this->name, GF_LOG_WARNING, 0, GD_MSG_SNAP_REMOVE_FAIL,
+               "Failed to delete "
+               "snap object %s",
+               snap_obj->snapname);
+        goto out;
+    }
+    clone_vol->snapshot = NULL;
+
+out:
+    return ret;
+}
+
+/*
+ * Post-validation step of "snapshot create".
+ *
+ * When the operation failed (@op_ret is non-zero) the snapshot is cleaned up
+ * if @dict requests "cleanup", and nothing else is done. Otherwise the
+ * snapshot named by the "snapname" key of @dict is marked
+ * GD_SNAP_STATUS_IN_USE and stored, the post-validate bookkeeping is run,
+ * and - on the originator glusterd only - events are raised for snapshot
+ * activation and for reaching the hard or soft snapshot limit of the volume
+ * named by "volname1". Finally, when auto-delete is enabled,
+ * glusterd_handle_snap_limit() is invoked; its errors are logged and ignored.
+ *
+ * @dict:      request dictionary
+ * @op_ret:    result of the preceding phases (non-zero means failure)
+ * @op_errstr: passed on to the helpers that may set an error string
+ * @rsp_dict:  response dictionary
+ *
+ * Returns 0 on success and a non-zero value on failure.
+ */
+int32_t
+glusterd_snapshot_create_postvalidate(dict_t *dict, int32_t op_ret,
+                                      char **op_errstr, dict_t *rsp_dict)
+{
+    xlator_t *this = NULL;
+    glusterd_conf_t *priv = NULL;
+    int ret = -1;
+    int32_t cleanup = 0;
+    glusterd_snap_t *snap = NULL;
+    char *snapname = NULL;
+    char *volname = NULL;
+    glusterd_volinfo_t *volinfo = NULL;
+    uint64_t opt_hard_max = GLUSTERD_SNAPS_MAX_HARD_LIMIT;
+    uint64_t opt_max_soft = GLUSTERD_SNAPS_DEF_SOFT_LIMIT_PERCENT;
+    int64_t effective_max_limit = 0;
+    int64_t soft_limit = 0;
+    int32_t snap_activate = _gf_false;
+
+    this = THIS;
+
+    GF_ASSERT(this);
+    GF_ASSERT(dict);
+    GF_ASSERT(rsp_dict);
+
+    priv = this->private;
+    GF_ASSERT(priv);
+
+    if (op_ret) {
+        ret = dict_get_int32n(dict, "cleanup", SLEN("cleanup"), &cleanup);
+        if (!ret && cleanup) {
+            ret = glusterd_do_snap_cleanup(dict, op_errstr, rsp_dict);
+            if (ret) {
+                gf_msg(this->name, GF_LOG_WARNING, 0, GD_MSG_SNAP_CLEANUP_FAIL,
+                       "cleanup "
+                       "operation failed");
+                goto out;
+            }
+        }
+        /* Irrespective of status of cleanup its better
+         * to return from this function. As the functions
+         * following this block is not required to be
+         * executed in case of failure scenario.
+         */
+        ret = 0;
+        goto out;
+    }
+
+    ret = dict_get_strn(dict, "snapname", SLEN("snapname"), &snapname);
+    if (ret) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_DICT_GET_FAILED,
+               "Unable to fetch "
+               "snapname");
+        goto out;
+    }
+
+    snap = glusterd_find_snap_by_name(snapname);
+    if (!snap) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_SNAP_NOT_FOUND,
+               "unable to find snap "
+               "%s",
+               snapname);
+        /* ret still holds 0 from the successful dict lookup above; a
+         * missing snapshot must be reported as a failure.
+         */
+        ret = -1;
+        goto out;
+    }
+
+    snap->snap_status = GD_SNAP_STATUS_IN_USE;
+    ret = glusterd_store_snap(snap);
+    if (ret) {
+        gf_msg(this->name, GF_LOG_WARNING, 0, GD_MSG_SNAP_OBJECT_STORE_FAIL,
+               "Could not store snap "
+               "object %s",
+               snap->snapname);
+        goto out;
+    }
+
+    ret = glusterd_snapshot_update_snaps_post_validate(dict, op_errstr,
+                                                       rsp_dict);
+    if (ret) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_SNAP_CREATION_FAIL,
+               "Failed to "
+               "create snapshot");
+        goto out;
+    }
+
+    /*
+     * If activate_on_create was enabled, and we have reached this  *
+     * section of the code, that means, that after successfully     *
+     * creating the snapshot, we have also successfully started the *
+     * snapshot bricks on all nodes. So from originator node we can *
+     * send EVENT_SNAPSHOT_ACTIVATED event.                         *
+     *                                                              *
+     * Also check, if hard limit and soft limit is reached in case  *
+     * of successfully creating the snapshot, and generate the event *
+     */
+    if (is_origin_glusterd(dict) == _gf_true) {
+        snap_activate = dict_get_str_boolean(
+            priv->opts, GLUSTERD_STORE_KEY_SNAP_ACTIVATE, _gf_false);
+
+        if (snap_activate == _gf_true) {
+            gf_event(EVENT_SNAPSHOT_ACTIVATED,
+                     "snapshot_name=%s;"
+                     "snapshot_uuid=%s",
+                     snap->snapname, uuid_utoa(snap->snap_id));
+        }
+
+        ret = dict_get_strn(dict, "volname1", SLEN("volname1"), &volname);
+        if (ret) {
+            gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_DICT_GET_FAILED,
+                   "Failed to get volname.");
+            goto out;
+        }
+
+        ret = glusterd_volinfo_find(volname, &volinfo);
+        if (ret) {
+            gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_VOL_NOT_FOUND,
+                   "Failed to get volinfo.");
+            goto out;
+        }
+
+        /* config values snap-max-hard-limit and snap-max-soft-limit are
+         * optional and hence we are not erroring out if values are not
+         * present
+         */
+        gd_get_snap_conf_values_if_present(priv->opts, &opt_hard_max,
+                                           &opt_max_soft);
+
+        /* The effective hard limit is the smaller of the volume-wide and
+         * the system-wide limit.
+         */
+        if (volinfo->snap_max_hard_limit < opt_hard_max)
+            effective_max_limit = volinfo->snap_max_hard_limit;
+        else
+            effective_max_limit = opt_hard_max;
+
+        /*
+         * Check for hard limit. If it is reached after taking *
+         * this snapshot, then generate event for the same. If *
+         * it is not reached, then check for the soft limit,   *
+         * and generate event accordingly.                     *
+         */
+        if (volinfo->snap_count >= effective_max_limit) {
+            gf_event(EVENT_SNAPSHOT_HARD_LIMIT_REACHED,
+                     "volume_name=%s;volume_id=%s", volname,
+                     uuid_utoa(volinfo->volume_id));
+        } else {
+            /* opt_max_soft is a percentage of the effective hard limit. */
+            soft_limit = (opt_max_soft * effective_max_limit) / 100;
+            if (volinfo->snap_count >= soft_limit) {
+                gf_event(EVENT_SNAPSHOT_SOFT_LIMIT_REACHED,
+                         "volume_name=%s;volume_id=%s", volname,
+                         uuid_utoa(volinfo->volume_id));
+            }
+        }
+    }
+
+    /* "auto-delete" might not be set by user explicitly,
+     * in that case it's better to consider the default value.
+     * Hence not erroring out if Key is not found.
+     */
+    ret = dict_get_str_boolean(priv->opts, GLUSTERD_STORE_KEY_SNAP_AUTO_DELETE,
+                               _gf_false);
+    if (_gf_true == ret) {
+        ret = glusterd_handle_snap_limit(dict, rsp_dict);
+        if (ret) {
+            gf_msg(this->name, GF_LOG_WARNING, 0, GD_MSG_SNAP_REMOVE_FAIL,
+                   "failed to remove snap");
+            /* ignore the errors of autodelete */
+            ret = 0;
+        }
+    }
+
+out:
+    return ret;
+}
+
+/*
+ * Commit-phase dispatcher for snapshot commands.
+ *
+ * Reads the command type from the "type" key of @dict and invokes the
+ * matching commit handler (create, clone, config, delete, restore, activate,
+ * deactivate, status). When a delete fails without setting an error string,
+ * a generic "might not be in an usable state" message naming the snapshot is
+ * placed in *op_errstr.
+ *
+ * @dict:      request dictionary
+ * @op_errstr: receives an error string on failure
+ * @op_errno:  passed to the create handler; must not be NULL
+ * @rsp_dict:  response dictionary
+ *
+ * Returns 0 on success and a non-zero value on failure, including when the
+ * command type is not one of the handled values.
+ */
+int32_t
+glusterd_snapshot(dict_t *dict, char **op_errstr, uint32_t *op_errno,
+                  dict_t *rsp_dict)
+{
+    xlator_t *this = NULL;
+    glusterd_conf_t *priv = NULL;
+    int32_t snap_command = 0;
+    char *snap_name = NULL;
+    char temp[PATH_MAX] = "";
+    int ret = -1;
+
+    this = THIS;
+
+    GF_ASSERT(this);
+    GF_ASSERT(dict);
+    GF_ASSERT(rsp_dict);
+    GF_VALIDATE_OR_GOTO(this->name, op_errno, out);
+
+    priv = this->private;
+    GF_ASSERT(priv);
+
+    ret = dict_get_int32n(dict, "type", SLEN("type"), &snap_command);
+    if (ret) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_COMMAND_NOT_FOUND,
+               "unable to get the type of "
+               "the snapshot command");
+        goto out;
+    }
+
+    switch (snap_command) {
+        case (GF_SNAP_OPTION_TYPE_CREATE):
+            ret = glusterd_snapshot_create_commit(dict, op_errstr, op_errno,
+                                                  rsp_dict);
+            if (ret) {
+                gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_SNAP_CREATION_FAIL,
+                       "Failed to "
+                       "create snapshot");
+                goto out;
+            }
+            break;
+
+        case (GF_SNAP_OPTION_TYPE_CLONE):
+            ret = glusterd_snapshot_clone_commit(dict, op_errstr, rsp_dict);
+            if (ret) {
+                gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_SNAP_CLONE_FAILED,
+                       "Failed to "
+                       "clone snapshot");
+                goto out;
+            }
+            break;
+
+        case GF_SNAP_OPTION_TYPE_CONFIG:
+            ret = glusterd_snapshot_config_commit(dict, op_errstr, rsp_dict);
+            if (ret) {
+                gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_SNAP_CONFIG_FAIL,
+                       "snapshot config failed");
+                goto out;
+            }
+            break;
+
+        case GF_SNAP_OPTION_TYPE_DELETE:
+            ret = glusterd_snapshot_remove_commit(dict, op_errstr, rsp_dict);
+            if (ret) {
+                gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_SNAP_REMOVE_FAIL,
+                       "Failed to "
+                       "delete snapshot");
+                if (*op_errstr) {
+                    /* If error string is already set
+                     * then goto out */
+                    goto out;
+                }
+
+                /* No error string yet: build a generic one that names the
+                 * snapshot, falling back to "NA" when the name is not
+                 * available.
+                 */
+                ret = dict_get_strn(dict, "snapname", SLEN("snapname"),
+                                    &snap_name);
+                if (ret) {
+                    gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_DICT_GET_FAILED,
+                           "Failed to get snapname");
+                    snap_name = "NA";
+                }
+
+                snprintf(temp, sizeof(temp),
+                         "Snapshot %s might "
+                         "not be in an usable state.",
+                         snap_name);
+
+                *op_errstr = gf_strdup(temp);
+                ret = -1;
+                goto out;
+            }
+            break;
+
+        case GF_SNAP_OPTION_TYPE_RESTORE:
+            ret = glusterd_snapshot_restore(dict, op_errstr, rsp_dict);
+            if (ret) {
+                gf_msg(this->name, GF_LOG_WARNING, 0, GD_MSG_SNAP_RESTORE_FAIL,
+                       "Failed to "
+                       "restore snapshot");
+                goto out;
+            }
+
+            break;
+        case GF_SNAP_OPTION_TYPE_ACTIVATE:
+            ret = glusterd_snapshot_activate_commit(dict, op_errstr, rsp_dict);
+            if (ret) {
+                gf_msg(this->name, GF_LOG_WARNING, 0, GD_MSG_SNAP_ACTIVATE_FAIL,
+                       "Failed to "
+                       "activate snapshot");
+                goto out;
+            }
+
+            break;
+
+        case GF_SNAP_OPTION_TYPE_DEACTIVATE:
+            ret = glusterd_snapshot_deactivate_commit(dict, op_errstr,
+                                                      rsp_dict);
+            if (ret) {
+                gf_msg(this->name, GF_LOG_WARNING, 0,
+                       GD_MSG_SNAP_DEACTIVATE_FAIL,
+                       "Failed to "
+                       "deactivate snapshot");
+                goto out;
+            }
+
+            break;
+
+        case GF_SNAP_OPTION_TYPE_STATUS:
+            ret = glusterd_snapshot_status_commit(dict, op_errstr, rsp_dict);
+            if (ret) {
+                gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_SNAP_STATUS_FAIL,
+                       "Failed to "
+                       "show snapshot status");
+                goto out;
+            }
+            break;
+
+        default:
+            gf_msg(this->name, GF_LOG_WARNING, EINVAL, GD_MSG_INVALID_ENTRY,
+                   "invalid snap command");
+            /* ret still holds 0 from the successful "type" lookup; an
+             * unknown command must not be reported as success.
+             */
+            ret = -1;
+            goto out;
+    }
+
+    ret = 0;
+
+out:
+    return ret;
+}
+
+/*
+ * Brick-op phase for snapshot commands.
+ *
+ * Only GF_SNAP_OPTION_TYPE_CREATE does any work: depending on the
+ * "operation-type" key of @dict ("pre" or "post") the barrier value is set
+ * to "enable" or "disable", and gd_brick_op_phase() is then run once for
+ * each volume "volname1" .. "volname<volcount>", with the current name
+ * temporarily published under the "volname" key. The "volname" key is
+ * removed again after all volumes were processed successfully.
+ *
+ * @dict:      request dictionary (modified as described above)
+ * @op_errstr: passed on to gd_brick_op_phase()
+ * @rsp_dict:  response dictionary (only validated here)
+ *
+ * Returns 0 on success and a non-zero value on failure.
+ */
+int
+glusterd_snapshot_brickop(dict_t *dict, char **op_errstr, dict_t *rsp_dict)
+{
+    int ret = -1;
+    int64_t vol_count = 0;
+    int64_t count = 1;
+    char key[64] = "";
+    int keylen;
+    char *volname = NULL;
+    int32_t snap_command = 0;
+    xlator_t *this = NULL;
+    char *op_type = NULL;
+
+    this = THIS;
+
+    GF_ASSERT(this);
+    GF_ASSERT(dict);
+    GF_ASSERT(rsp_dict);
+
+    ret = dict_get_int32n(dict, "type", SLEN("type"), &snap_command);
+    if (ret) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_COMMAND_NOT_FOUND,
+               "unable to get the type of "
+               "the snapshot command");
+        goto out;
+    }
+
+    switch (snap_command) {
+        case GF_SNAP_OPTION_TYPE_CREATE:
+
+            /* op_type will tell us whether it is a pre-commit operation
+             * or a post-commit one
+             */
+            ret = dict_get_strn(dict, "operation-type", SLEN("operation-type"),
+                                &op_type);
+            if (ret) {
+                gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_DICT_GET_FAILED,
+                       "Failed to fetch "
+                       "operation type");
+                goto out;
+            }
+
+            if (strcmp(op_type, "pre") == 0) {
+                /* BRICK OP PHASE for enabling barrier, Enable barrier
+                 * if its a pre-commit operation
+                 */
+                ret = glusterd_set_barrier_value(dict, "enable");
+                if (ret) {
+                    gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_DICT_SET_FAILED,
+                           "Failed to "
+                           "set barrier value as enable in dict");
+                    goto out;
+                }
+            } else if (strcmp(op_type, "post") == 0) {
+                /* BRICK OP PHASE for disabling barrier, Disable barrier
+                 * if its a post-commit operation
+                 */
+                ret = glusterd_set_barrier_value(dict, "disable");
+                if (ret) {
+                    /* This is a failed dict *set*, same as in the
+                     * "enable" branch, so log it with the matching id.
+                     */
+                    gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_DICT_SET_FAILED,
+                           "Failed to "
+                           "set barrier value as disable in "
+                           "dict");
+                    goto out;
+                }
+            } else {
+                ret = -1;
+                gf_msg(this->name, GF_LOG_ERROR, EINVAL, GD_MSG_INVALID_ENTRY,
+                       "Invalid op_type");
+                goto out;
+            }
+
+            ret = dict_get_int64(dict, "volcount", &vol_count);
+            if (ret)
+                goto out;
+            /* Volume names in the dict are numbered starting from 1. */
+            while (count <= vol_count) {
+                keylen = snprintf(key, sizeof(key), "volname%" PRId64, count);
+                ret = dict_get_strn(dict, key, keylen, &volname);
+                if (ret) {
+                    gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_DICT_GET_FAILED,
+                           "Unable to get volname");
+                    goto out;
+                }
+                /* Publish the current volume under the plain "volname" key
+                 * before running the brick op for it.
+                 */
+                ret = dict_set_strn(dict, "volname", SLEN("volname"), volname);
+                if (ret)
+                    goto out;
+
+                ret = gd_brick_op_phase(GD_OP_SNAP, NULL, dict, op_errstr);
+                if (ret)
+                    goto out;
+                volname = NULL;
+                count++;
+            }
+
+            /* Drop the temporary "volname" entry again. */
+            dict_deln(dict, "volname", SLEN("volname"));
+            ret = 0;
+            break;
+        case GF_SNAP_OPTION_TYPE_DELETE:
+            break;
+        default:
+            break;
+    }
+
+out:
+    return ret;
+}
+
+/* Dispatch the pre-validation phase of a snapshot command.
+ *
+ * Reads the int32 "type" key from dict and invokes the matching
+ * per-command pre-validation routine.
+ *
+ * @param dict      request dictionary; must carry the "type" key
+ * @param op_errstr error string, forwarded to the per-command routine
+ * @param rsp_dict  response dictionary, forwarded to routines that take one
+ * @param op_errno  error number to report; set to EINVAL here when the
+ *                  command type is not recognised
+ *
+ * @return 0 on success, non-zero on failure
+ */
+int
+glusterd_snapshot_prevalidate(dict_t *dict, char **op_errstr, dict_t *rsp_dict,
+                              uint32_t *op_errno)
+{
+    int snap_command = 0;
+    xlator_t *this = NULL;
+    int ret = -1;
+
+    this = THIS;
+
+    GF_ASSERT(this);
+    GF_ASSERT(dict);
+    GF_ASSERT(rsp_dict);
+    GF_VALIDATE_OR_GOTO(this->name, op_errno, out);
+
+    ret = dict_get_int32n(dict, "type", SLEN("type"), &snap_command);
+    if (ret) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_COMMAND_NOT_FOUND,
+               "unable to get the type of "
+               "the snapshot command");
+        goto out;
+    }
+
+    switch (snap_command) {
+        case (GF_SNAP_OPTION_TYPE_CREATE):
+            ret = glusterd_snapshot_create_prevalidate(dict, op_errstr,
+                                                       rsp_dict, op_errno);
+            if (ret) {
+                gf_msg(this->name, GF_LOG_WARNING, 0, GD_MSG_SNAP_CREATION_FAIL,
+                       "Snapshot create "
+                       "pre-validation failed");
+                goto out;
+            }
+            break;
+
+        case (GF_SNAP_OPTION_TYPE_CLONE):
+            ret = glusterd_snapshot_clone_prevalidate(dict, op_errstr, rsp_dict,
+                                                      op_errno);
+            if (ret) {
+                gf_msg(this->name, GF_LOG_WARNING, 0,
+                       GD_MSG_SNAP_CLONE_PREVAL_FAILED,
+                       "Snapshot clone "
+                       "pre-validation failed");
+                goto out;
+            }
+            break;
+
+        case (GF_SNAP_OPTION_TYPE_CONFIG):
+            ret = glusterd_snapshot_config_prevalidate(dict, op_errstr,
+                                                       op_errno);
+            if (ret) {
+                gf_msg(this->name, GF_LOG_WARNING, 0, GD_MSG_SNAP_CONFIG_FAIL,
+                       "Snapshot config "
+                       "pre-validation failed");
+                goto out;
+            }
+            break;
+
+        case GF_SNAP_OPTION_TYPE_RESTORE:
+            ret = glusterd_snapshot_restore_prevalidate(dict, op_errstr,
+                                                        op_errno, rsp_dict);
+            if (ret) {
+                gf_msg(this->name, GF_LOG_WARNING, 0, GD_MSG_SNAP_RESTORE_FAIL,
+                       "Snapshot restore "
+                       "validation failed");
+                goto out;
+            }
+            break;
+
+        case GF_SNAP_OPTION_TYPE_ACTIVATE:
+            ret = glusterd_snapshot_activate_deactivate_prevalidate(
+                dict, op_errstr, op_errno, rsp_dict, _gf_true);
+            if (ret) {
+                gf_msg(this->name, GF_LOG_WARNING, 0, GD_MSG_SNAP_ACTIVATE_FAIL,
+                       "Snapshot activate "
+                       "validation failed");
+                goto out;
+            }
+            break;
+        case GF_SNAP_OPTION_TYPE_DEACTIVATE:
+            ret = glusterd_snapshot_activate_deactivate_prevalidate(
+                dict, op_errstr, op_errno, rsp_dict, _gf_false);
+            if (ret) {
+                gf_msg(this->name, GF_LOG_WARNING, 0,
+                       GD_MSG_SNAP_DEACTIVATE_FAIL,
+                       "Snapshot deactivate validation failed");
+                goto out;
+            }
+            break;
+        case GF_SNAP_OPTION_TYPE_DELETE:
+            ret = glusterd_snapshot_remove_prevalidate(dict, op_errstr,
+                                                       op_errno, rsp_dict);
+            if (ret) {
+                gf_msg(this->name, GF_LOG_WARNING, 0, GD_MSG_SNAP_REMOVE_FAIL,
+                       "Snapshot remove "
+                       "validation failed");
+                goto out;
+            }
+            break;
+
+        case GF_SNAP_OPTION_TYPE_STATUS:
+            ret = glusterd_snapshot_status_prevalidate(dict, op_errstr,
+                                                       op_errno, rsp_dict);
+            if (ret) {
+                gf_msg(this->name, GF_LOG_WARNING, 0, GD_MSG_SNAP_STATUS_FAIL,
+                       "Snapshot status "
+                       "validation failed");
+                goto out;
+            }
+            break;
+
+        default:
+            gf_msg(this->name, GF_LOG_WARNING, EINVAL, GD_MSG_COMMAND_NOT_FOUND,
+                   "invalid snap command");
+            *op_errno = EINVAL;
+            /* ret is 0 here (the "type" lookup succeeded); an unknown
+             * command must not be reported as a successful validation.
+             */
+            ret = -1;
+            goto out;
+    }
+
+    ret = 0;
+out:
+    return ret;
+}
+
+/* Remove the trash copy of a volume's configuration directory, i.e.
+ * <workdir>/GLUSTERD_TRASH/vols-<volname>.deleted.
+ *
+ * Used when a restore operation succeeded and the backup is no longer
+ * needed, and when a restore failed before commit so the backup does
+ * not have to be reverted. A missing trash directory is not an error.
+ *
+ * @param volname  name of the volume which is being restored
+ *
+ * @return 0 on success, non-zero on failure
+ */
+int
+glusterd_remove_trashpath(char *volname)
+{
+    xlator_t *this = THIS;
+    glusterd_conf_t *priv = NULL;
+    char delete_path[PATH_MAX] = {0};
+    struct stat stbuf = {0};
+    int32_t len = 0;
+    int ret = -1;
+
+    GF_ASSERT(this);
+    priv = this->private;
+
+    GF_ASSERT(volname);
+
+    len = snprintf(delete_path, sizeof(delete_path),
+                   "%s/" GLUSTERD_TRASH "/vols-%s.deleted", priv->workdir,
+                   volname);
+    /* Bail out on an encoding error or a truncated path. */
+    if ((len < 0) || (len >= sizeof(delete_path)))
+        return -1;
+
+    ret = sys_lstat(delete_path, &stbuf);
+    if (ret != 0) {
+        /* Nothing to clean up when the trash dir does not exist. */
+        if (errno == ENOENT)
+            return 0;
+
+        gf_msg(this->name, GF_LOG_ERROR, errno, GD_MSG_DIR_OP_FAILED,
+               "Failed to lstat backup dir (%s)", delete_path);
+        return ret;
+    }
+
+    /* Delete the backup copy of the volume folder. */
+    ret = recursive_rmdir(delete_path);
+    if (ret != 0) {
+        gf_msg(this->name, GF_LOG_ERROR, errno, GD_MSG_DIR_OP_FAILED,
+               "Failed to remove backup dir (%s)", delete_path);
+        return ret;
+    }
+
+    return 0;
+}
+
+/* Clean up after a successful snapshot restore: delete the snap entry
+ * and the backup files created during the restore operation.
+ *
+ * @param rsp_dict Response dictionary
+ * @param volname  name of the volume which is being restored
+ * @param snap     snap object
+ *
+ * @return 0 on success, non-zero on failure
+ */
+int
+glusterd_snapshot_restore_cleanup(dict_t *rsp_dict, char *volname,
+                                  glusterd_snap_t *snap)
+{
+    xlator_t *this = THIS;
+    int ret = -1;
+
+    GF_ASSERT(this);
+    GF_ASSERT(rsp_dict);
+    GF_ASSERT(volname);
+    GF_ASSERT(snap);
+
+    /* First drop the snap entry. */
+    ret = glusterd_snap_remove(rsp_dict, snap, _gf_false, _gf_true, _gf_false);
+    if (ret != 0) {
+        gf_msg(this->name, GF_LOG_WARNING, 0, GD_MSG_SNAP_REMOVE_FAIL,
+               "Failed to delete snap %s", snap->snapname);
+        return ret;
+    }
+
+    /* Then remove the backup copy of the volume folder. */
+    ret = glusterd_remove_trashpath(volname);
+    if (ret != 0) {
+        gf_msg(this->name, GF_LOG_ERROR, errno, GD_MSG_DIR_OP_FAILED,
+               "Failed to remove backup dir");
+        return ret;
+    }
+
+    return 0;
+}
+
+/* This function is called when the snapshot restore operation failed
+ * for some reasons. In such case we revert the restore operation:
+ *   1. remove the volume's current directory under the vols folder,
+ *   2. move the backup copy from the trash directory back in place,
+ *   3. reload the volume from the store,
+ *   4. move the snap volumes of the old volinfo to the reloaded one and
+ *      reset the volume-id xattr on the local snap bricks,
+ *   5. drop a reference on the old volinfo.
+ *
+ * @param volinfo               volinfo of the origin volume
+ *
+ * @return 0 on success and -1 on failure
+ */
+int
+glusterd_snapshot_revert_partial_restored_vol(glusterd_volinfo_t *volinfo)
+{
+    int ret = 0;
+    char pathname[PATH_MAX] = "";
+    char trash_path[PATH_MAX] = "";
+    glusterd_brickinfo_t *brickinfo = NULL;
+    glusterd_volinfo_t *reverted_vol = NULL;
+    glusterd_volinfo_t *snap_vol = NULL;
+    glusterd_volinfo_t *tmp_vol = NULL;
+    glusterd_conf_t *priv = NULL;
+    xlator_t *this = NULL;
+    int32_t len = 0;
+
+    this = THIS;
+    GF_ASSERT(this);
+    priv = this->private;
+    GF_ASSERT(priv);
+    GF_ASSERT(volinfo);
+
+    GLUSTERD_GET_VOLUME_DIR(pathname, volinfo, priv);
+
+    len = snprintf(trash_path, sizeof(trash_path),
+                   "%s/" GLUSTERD_TRASH "/vols-%s.deleted", priv->workdir,
+                   volinfo->volname);
+    if ((len < 0) || (len >= sizeof(trash_path))) {
+        gf_smsg(this->name, GF_LOG_ERROR, errno, GD_MSG_COPY_FAIL, NULL);
+        ret = -1;
+        goto out;
+    }
+
+    /* Since snapshot restore failed we cannot rely on the volume
+     * data stored under vols folder. Therefore delete the origin
+     * volume's backend folder.*/
+    ret = recursive_rmdir(pathname);
+    if (ret) {
+        gf_msg(this->name, GF_LOG_ERROR, errno, GD_MSG_DIR_OP_FAILED,
+               "Failed to remove "
+               "%s directory",
+               pathname);
+        goto out;
+    }
+
+    /* Now move the backup copy of the vols to its original
+     * location.*/
+    ret = sys_rename(trash_path, pathname);
+    if (ret) {
+        gf_msg(this->name, GF_LOG_ERROR, errno, GD_MSG_DIR_OP_FAILED,
+               "Failed to rename folder "
+               "from %s to %s",
+               trash_path, pathname);
+        goto out;
+    }
+
+    /* Retrieve the volume from the store */
+    reverted_vol = glusterd_store_retrieve_volume(volinfo->volname, NULL);
+    if (NULL == reverted_vol) {
+        /* ret is still 0 from the successful rename above; make sure
+         * the failure is reported to the caller.
+         */
+        ret = -1;
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_VOL_OP_FAILED,
+               "Failed to load restored "
+               "%s volume",
+               volinfo->volname);
+        goto out;
+    }
+
+    /* Retrieve the snap_volumes list from the older volinfo */
+    reverted_vol->snap_count = volinfo->snap_count;
+    cds_list_for_each_entry_safe(snap_vol, tmp_vol, &volinfo->snap_volumes,
+                                 snapvol_list)
+    {
+        cds_list_add_tail(&snap_vol->snapvol_list, &reverted_vol->snap_volumes);
+
+        cds_list_for_each_entry(brickinfo, &snap_vol->bricks, brick_list)
+        {
+            /*
+             * If the brick is not of this peer, or snapshot is    *
+             * missed for the brick don't restore the xattr for it *
+             */
+            if ((!gf_uuid_compare(brickinfo->uuid, MY_UUID)) &&
+                (brickinfo->snap_status != -1)) {
+                /*
+                 * We need to restore volume id of all snap *
+                 * bricks to volume id of the snap volume.  *
+                 */
+                ret = sys_lsetxattr(brickinfo->path, GF_XATTR_VOL_ID_KEY,
+                                    snap_vol->volume_id,
+                                    sizeof(snap_vol->volume_id), XATTR_REPLACE);
+                if (ret == -1) {
+                    gf_smsg(this->name, GF_LOG_ERROR, 0, GD_MSG_SET_XATTR_FAIL,
+                            "Attribute=%s, Path=%s, Reason=%s, Snap=%s",
+                            GF_XATTR_VOL_ID_KEY, brickinfo->path,
+                            strerror(errno), snap_vol->volname, NULL);
+                    goto out;
+                }
+            }
+        }
+    }
+
+    /* Since we retrieved the volinfo from store now we don't
+     * want the older volinfo. Therefore delete the older volinfo */
+    glusterd_volinfo_unref(volinfo);
+    ret = 0;
+out:
+    return ret;
+}
+
+/* Revert a failed snapshot restore when glusterd is started.
+ *
+ * Looks up the parent volume of the snapshot's (first) volume and
+ * reverts the partially restored volume.
+ *
+ * @param snap snapshot object of the restored snap
+ *
+ * @return 0 on success, non-zero on failure
+ */
+int
+glusterd_snapshot_revert_restore_from_snap(glusterd_snap_t *snap)
+{
+    xlator_t *this = THIS;
+    glusterd_volinfo_t *snap_volinfo = NULL;
+    glusterd_volinfo_t *parent_vol = NULL;
+    char parent_name[PATH_MAX] = "";
+    int ret = -1;
+
+    GF_ASSERT(this);
+    GF_ASSERT(snap);
+
+    /* TODO : As of now there is only one volume in snapshot.
+     * Change this when multiple volume snapshot is introduced
+     */
+    snap_volinfo = cds_list_entry(snap->volumes.next, glusterd_volinfo_t,
+                                  vol_list);
+
+    gf_strncpy(parent_name, snap_volinfo->parent_volname,
+               sizeof(parent_name));
+
+    ret = glusterd_volinfo_find(parent_name, &parent_vol);
+    if (ret != 0) {
+        gf_msg(this->name, GF_LOG_ERROR, EINVAL, GD_MSG_VOL_NOT_FOUND,
+               "Could not get volinfo of %s", snap_volinfo->parent_volname);
+        return ret;
+    }
+
+    ret = glusterd_snapshot_revert_partial_restored_vol(parent_vol);
+    if (ret != 0) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_SNAP_RESTORE_REVERT_FAIL,
+               "Failed to revert snapshot restore operation for %s volume",
+               parent_name);
+    }
+
+    return ret;
+}
+
+/* This function is called from post-validation. Based on the op_ret
+ * it will take a decision on whether to revert the operation or
+ * perform cleanup.
+ *
+ * The snapshot is looked up through the "snapname" key and the volume
+ * through the "volname1" key of dict. On failure of the restore
+ * (op_ret != 0) the int32 "cleanup" key decides what happens: when it is
+ * missing or 0 only the trash directory is removed, otherwise the
+ * partially restored volume is reverted.
+ *
+ * @param dict          dictionary object
+ * @param op_ret        return value of the restore operation
+ * @param op_errstr     error string (not used by this function)
+ * @param rsp_dict      Response dictionary
+ *
+ * @return 0 on success and non-zero on failure
+ */
+int
+glusterd_snapshot_restore_postop(dict_t *dict, int32_t op_ret, char **op_errstr,
+                                 dict_t *rsp_dict)
+{
+    int ret = -1;
+    char *name = NULL;
+    char *volname = NULL;
+    int cleanup = 0;
+    glusterd_snap_t *snap = NULL;
+    glusterd_volinfo_t *volinfo = NULL;
+    xlator_t *this = NULL;
+
+    this = THIS;
+
+    GF_ASSERT(this);
+    GF_ASSERT(dict);
+    GF_ASSERT(rsp_dict);
+
+    ret = dict_get_strn(dict, "snapname", SLEN("snapname"), &name);
+    if (ret) {
+        /* name was not filled in, so it must not be passed to "%s". */
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_DICT_GET_FAILED,
+               "getting the snap name failed");
+        goto out;
+    }
+
+    snap = glusterd_find_snap_by_name(name);
+    if (!snap) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_SNAP_NOT_FOUND,
+               "Snapshot (%s) does not exist", name);
+        ret = -1;
+        goto out;
+    }
+
+    /* TODO: fix this when multiple volume support will come */
+    ret = dict_get_strn(dict, "volname1", SLEN("volname1"), &volname);
+    if (ret) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_DICT_GET_FAILED,
+               "failed to get volume name");
+        goto out;
+    }
+
+    ret = glusterd_volinfo_find(volname, &volinfo);
+    if (ret) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_VOL_NOT_FOUND,
+               "Volume (%s) does not exist ", volname);
+        goto out;
+    }
+
+    /* On success perform the cleanup operation */
+    if (0 == op_ret) {
+        ret = glusterd_snapshot_restore_cleanup(rsp_dict, volname, snap);
+        if (ret) {
+            gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_SNAP_CLEANUP_FAIL,
+                   "Failed to perform "
+                   "snapshot restore cleanup for %s volume",
+                   volname);
+            goto out;
+        }
+    } else { /* On failure revert snapshot restore */
+        ret = dict_get_int32n(dict, "cleanup", SLEN("cleanup"), &cleanup);
+        /* Perform cleanup only when required */
+        if (ret || (0 == cleanup)) {
+            /* Delete the backup copy of volume folder */
+            ret = glusterd_remove_trashpath(volinfo->volname);
+            if (ret) {
+                gf_msg(this->name, GF_LOG_ERROR, errno, GD_MSG_DIR_OP_FAILED,
+                       "Failed to remove backup dir");
+                goto out;
+            }
+            ret = 0;
+            goto out;
+        }
+
+        ret = glusterd_snapshot_revert_partial_restored_vol(volinfo);
+        if (ret) {
+            gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_SNAP_RESTORE_REVERT_FAIL,
+                   "Failed to revert "
+                   "restore operation for %s volume",
+                   volname);
+            goto out;
+        }
+
+        snap->snap_status = GD_SNAP_STATUS_IN_USE;
+        /* We need to save this in disk */
+        ret = glusterd_store_snap(snap);
+        if (ret) {
+            gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_SNAP_OBJECT_STORE_FAIL,
+                   "Could not store snap object for %s snap", snap->snapname);
+            goto out;
+        }
+
+        /* After restore fails, we have to remove mount point for
+         * deactivated snaps which was created at start of restore op.
+         *
+         * NOTE(review): the revert above calls glusterd_volinfo_unref()
+         * on volinfo when it succeeds, yet volinfo is still dereferenced
+         * here -- confirm another reference keeps it alive.
+         */
+        if (volinfo->status == GLUSTERD_STATUS_STOPPED) {
+            ret = glusterd_snap_unmount(this, volinfo);
+            if (ret) {
+                /* Logged only; an unmount failure does not fail the op. */
+                gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_GLUSTERD_UMOUNT_FAIL,
+                       "Failed to unmounts for %s", snap->snapname);
+            }
+        }
+    }
+
+    ret = 0;
+out:
+    return ret;
+}
+
+/* Dispatch the post-validation phase of a snapshot command.
+ *
+ * Reads the int32 "type" key from dict and runs the matching
+ * post-validation step. For create, clone, delete, restore, activate
+ * and deactivate, glusterd_fetchsnap_notify() is called once the step
+ * has succeeded.
+ *
+ * @param dict      request dictionary; must carry the "type" key
+ * @param op_ret    return value of the preceding operation
+ * @param op_errstr error string, forwarded to the per-command routine
+ * @param rsp_dict  response dictionary
+ *
+ * @return 0 on success, non-zero on failure
+ */
+int
+glusterd_snapshot_postvalidate(dict_t *dict, int32_t op_ret, char **op_errstr,
+                               dict_t *rsp_dict)
+{
+    int snap_command = 0;
+    xlator_t *this = NULL;
+    int ret = -1;
+
+    this = THIS;
+
+    GF_ASSERT(this);
+    GF_ASSERT(dict);
+    GF_ASSERT(rsp_dict);
+
+    ret = dict_get_int32n(dict, "type", SLEN("type"), &snap_command);
+    if (ret) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_COMMAND_NOT_FOUND,
+               "unable to get the type of "
+               "the snapshot command");
+        goto out;
+    }
+
+    switch (snap_command) {
+        case GF_SNAP_OPTION_TYPE_CREATE:
+            ret = glusterd_snapshot_create_postvalidate(dict, op_ret, op_errstr,
+                                                        rsp_dict);
+            if (ret) {
+                gf_msg(this->name, GF_LOG_WARNING, 0, GD_MSG_SNAP_CREATION_FAIL,
+                       "Snapshot create "
+                       "post-validation failed");
+                goto out;
+            }
+            glusterd_fetchsnap_notify(this);
+            break;
+        case GF_SNAP_OPTION_TYPE_CLONE:
+            ret = glusterd_snapshot_clone_postvalidate(dict, op_ret, op_errstr,
+                                                       rsp_dict);
+            if (ret) {
+                /* Name the clone operation, not create, in the log. */
+                gf_msg(this->name, GF_LOG_WARNING, 0,
+                       GD_MSG_SNAP_CLONE_POSTVAL_FAILED,
+                       "Snapshot clone "
+                       "post-validation failed");
+                goto out;
+            }
+            glusterd_fetchsnap_notify(this);
+            break;
+        case GF_SNAP_OPTION_TYPE_DELETE:
+            /* A failed delete leaves nothing to post-validate. */
+            if (op_ret) {
+                gf_msg_debug(this->name, 0,
+                             "op_ret = %d. Not performing delete "
+                             "post_validate",
+                             op_ret);
+                ret = 0;
+                goto out;
+            }
+            ret = glusterd_snapshot_update_snaps_post_validate(dict, op_errstr,
+                                                               rsp_dict);
+            if (ret) {
+                gf_msg(this->name, GF_LOG_ERROR, 0,
+                       GD_MSG_MISSED_SNAP_LIST_STORE_FAIL,
+                       "Failed to "
+                       "update missed snaps list");
+                goto out;
+            }
+            glusterd_fetchsnap_notify(this);
+            break;
+        case GF_SNAP_OPTION_TYPE_RESTORE:
+            ret = glusterd_snapshot_update_snaps_post_validate(dict, op_errstr,
+                                                               rsp_dict);
+            if (ret) {
+                gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_SNAP_RESTORE_FAIL,
+                       "Failed to "
+                       "update missed snaps list");
+                goto out;
+            }
+
+            ret = glusterd_snapshot_restore_postop(dict, op_ret, op_errstr,
+                                                   rsp_dict);
+            if (ret) {
+                gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_SNAP_RESTORE_FAIL,
+                       "Failed to "
+                       "perform snapshot restore post-op");
+                goto out;
+            }
+            glusterd_fetchsnap_notify(this);
+            break;
+        case GF_SNAP_OPTION_TYPE_ACTIVATE:
+        case GF_SNAP_OPTION_TYPE_DEACTIVATE:
+            glusterd_fetchsnap_notify(this);
+            break;
+        case GF_SNAP_OPTION_TYPE_STATUS:
+        case GF_SNAP_OPTION_TYPE_CONFIG:
+        case GF_SNAP_OPTION_TYPE_INFO:
+        case GF_SNAP_OPTION_TYPE_LIST:
+            /*Nothing to be done. But want to
+             * avoid the default case warning*/
+            ret = 0;
+            break;
+        default:
+            /* NOTE(review): ret is 0 here (the "type" lookup succeeded),
+             * so an unknown command is only logged and still returns
+             * success -- confirm this is intended.
+             */
+            gf_msg(this->name, GF_LOG_WARNING, 0, GD_MSG_COMMAND_NOT_FOUND,
+                   "invalid snap command");
+            goto out;
+    }
+
+    ret = 0;
+out:
+    return ret;
+}
+
+/*
+  Verify availability of an lvm command: the given path must be
+  stat-able, be a regular file and have the owner-execute bit set.
+  Returns _gf_true when all checks pass, _gf_false otherwise
+  (including when lvm_cmd is NULL).
+*/
+
+static gf_boolean_t
+glusterd_is_lvm_cmd_available(char *lvm_cmd)
+{
+    struct stat st = {0};
+
+    if (lvm_cmd == NULL)
+        return _gf_false;
+
+    if (sys_stat(lvm_cmd, &st) != 0) {
+        gf_msg(THIS->name, GF_LOG_ERROR, errno, GD_MSG_FILE_OP_FAILED,
+               "stat fails on %s, exiting. (errno = %d (%s))", lvm_cmd, errno,
+               strerror(errno));
+        return _gf_false;
+    }
+
+    /* stat succeeded from here on; inspect the file mode. */
+    if (!S_ISREG(st.st_mode)) {
+        gf_msg(THIS->name, GF_LOG_CRITICAL, EINVAL, GD_MSG_COMMAND_NOT_FOUND,
+               "Provided command %s is not a regular file,"
+               "exiting",
+               lvm_cmd);
+        return _gf_false;
+    }
+
+    if ((st.st_mode & S_IXUSR) == 0) {
+        gf_msg(THIS->name, GF_LOG_CRITICAL, 0, GD_MSG_NO_EXEC_PERMS,
+               "Provided command %s has no exec permissions,"
+               "exiting",
+               lvm_cmd);
+        return _gf_false;
+    }
+
+    return _gf_true;
+}
+
+/* Handle a snapshot request received from the CLI.
+ *
+ * Decodes the request, unserializes its dictionary, adds the local
+ * "host-uuid" to it, checks the cluster op-version and the availability
+ * of the LVM create command, and then dispatches on the int32 "type" key
+ * to the per-command handler.
+ *
+ * When anything fails, an error response carrying err_str (or
+ * "Operation failed" when no message was set) and op_errno (EG_INTRNL
+ * when none was set) is sent back through glusterd_op_send_cli_response().
+ *
+ * @param req  the RPC request
+ *
+ * @return 0 when the selected handler succeeded; otherwise the return
+ *         value of glusterd_op_send_cli_response()
+ */
+int
+glusterd_handle_snapshot_fn(rpcsvc_request_t *req)
+{
+    int32_t ret = 0;
+    dict_t *dict = NULL;
+    gf_cli_req cli_req = {
+        {0},
+    };
+    glusterd_op_t cli_op = GD_OP_SNAP;
+    int type = 0;
+    glusterd_conf_t *conf = NULL;
+    char *host_uuid = NULL;
+    char err_str[2048] = "";
+    xlator_t *this = NULL;
+    uint32_t op_errno = 0;
+
+    GF_ASSERT(req);
+
+    this = THIS;
+    GF_ASSERT(this);
+    conf = this->private;
+    GF_ASSERT(conf);
+
+    ret = xdr_to_generic(req->msg[0], &cli_req, (xdrproc_t)xdr_gf_cli_req);
+    if (ret < 0) {
+        req->rpc_err = GARBAGE_ARGS;
+        gf_smsg(this->name, GF_LOG_ERROR, errno, GD_MSG_GARBAGE_ARGS, NULL);
+        goto out;
+    }
+
+    if (cli_req.dict.dict_len > 0) {
+        dict = dict_new();
+        if (!dict) {
+            /* ret still holds the non-negative decode result; flag the
+             * failure explicitly so that an error response is sent.
+             */
+            ret = -1;
+            goto out;
+        }
+
+        ret = dict_unserialize(cli_req.dict.dict_val, cli_req.dict.dict_len,
+                               &dict);
+        if (ret < 0) {
+            gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_DICT_UNSERIALIZE_FAIL,
+                   "failed to "
+                   "unserialize req-buffer to dictionary");
+            snprintf(err_str, sizeof(err_str),
+                     "Unable to decode "
+                     "the command");
+            goto out;
+        }
+
+        dict->extra_stdfree = cli_req.dict.dict_val;
+
+        host_uuid = gf_strdup(uuid_utoa(MY_UUID));
+        if (host_uuid == NULL) {
+            snprintf(err_str, sizeof(err_str),
+                     "Failed to get "
+                     "the uuid of local glusterd");
+            ret = -1;
+            goto out;
+        }
+        ret = dict_set_dynstrn(dict, "host-uuid", SLEN("host-uuid"), host_uuid);
+        if (ret) {
+            /* The dict did not take the string, so free it here. */
+            GF_FREE(host_uuid);
+            goto out;
+        }
+
+    } else {
+        gf_msg(this->name, GF_LOG_ERROR, EINVAL, GD_MSG_INVALID_ENTRY,
+               "request dict length is %d", cli_req.dict.dict_len);
+        /* Same as above: do not rely on the leftover decode result. */
+        ret = -1;
+        goto out;
+    }
+
+    if (conf->op_version < GD_OP_VERSION_3_6_0) {
+        snprintf(err_str, sizeof(err_str),
+                 "Cluster operating version"
+                 " is lesser than the supported version "
+                 "for a snapshot");
+        op_errno = EG_OPNOTSUP;
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_UNSUPPORTED_VERSION,
+               "%s (%d < %d)", err_str, conf->op_version, GD_OP_VERSION_3_6_0);
+        ret = -1;
+        goto out;
+    }
+
+    ret = dict_get_int32n(dict, "type", SLEN("type"), &type);
+    if (ret < 0) {
+        snprintf(err_str, sizeof(err_str), "Command type not found");
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_COMMAND_NOT_FOUND, "%s",
+               err_str);
+        goto out;
+    }
+
+    if (!glusterd_is_lvm_cmd_available(LVM_CREATE)) {
+        snprintf(err_str, sizeof(err_str),
+                 "LVM commands not found,"
+                 " snapshot functionality is disabled");
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_COMMAND_NOT_FOUND, "%s",
+               err_str);
+        ret = -1;
+        goto out;
+    }
+
+    switch (type) {
+        case GF_SNAP_OPTION_TYPE_CREATE:
+            ret = glusterd_handle_snapshot_create(req, cli_op, dict, err_str,
+                                                  sizeof(err_str));
+            if (ret) {
+                gf_msg(this->name, GF_LOG_WARNING, 0, GD_MSG_SNAP_CREATION_FAIL,
+                       "Snapshot create failed: %s", err_str);
+            }
+            break;
+
+        case GF_SNAP_OPTION_TYPE_CLONE:
+            ret = glusterd_handle_snapshot_clone(req, cli_op, dict, err_str,
+                                                 sizeof(err_str));
+            if (ret) {
+                gf_msg(this->name, GF_LOG_WARNING, 0, GD_MSG_SNAP_CLONE_FAILED,
+                       "Snapshot clone "
+                       "failed: %s",
+                       err_str);
+            }
+            break;
+
+        case GF_SNAP_OPTION_TYPE_RESTORE:
+            ret = glusterd_handle_snapshot_restore(req, cli_op, dict, err_str,
+                                                   &op_errno, sizeof(err_str));
+            if (ret) {
+                gf_msg(this->name, GF_LOG_WARNING, 0, GD_MSG_SNAP_RESTORE_FAIL,
+                       "Snapshot restore failed: %s", err_str);
+            }
+
+            break;
+        case GF_SNAP_OPTION_TYPE_INFO:
+            ret = glusterd_handle_snapshot_info(req, cli_op, dict, err_str,
+                                                sizeof(err_str));
+            if (ret) {
+                gf_msg(this->name, GF_LOG_WARNING, 0, GD_MSG_SNAP_INFO_FAIL,
+                       "Snapshot info failed");
+            }
+            break;
+        case GF_SNAP_OPTION_TYPE_LIST:
+            ret = glusterd_handle_snapshot_list(req, cli_op, dict, err_str,
+                                                sizeof(err_str), &op_errno);
+            if (ret) {
+                gf_msg(this->name, GF_LOG_WARNING, 0, GD_MSG_SNAP_LIST_GET_FAIL,
+                       "Snapshot list failed");
+            }
+            break;
+        case GF_SNAP_OPTION_TYPE_CONFIG:
+            ret = glusterd_handle_snapshot_config(req, cli_op, dict, err_str,
+                                                  sizeof(err_str));
+            if (ret) {
+                gf_msg(this->name, GF_LOG_WARNING, 0, GD_MSG_SNAP_CONFIG_FAIL,
+                       "snapshot config failed");
+            }
+            break;
+        case GF_SNAP_OPTION_TYPE_DELETE:
+            ret = glusterd_handle_snapshot_delete(req, cli_op, dict, err_str,
+                                                  &op_errno, sizeof(err_str));
+            if (ret) {
+                gf_msg(this->name, GF_LOG_WARNING, 0, GD_MSG_SNAP_REMOVE_FAIL,
+                       "Snapshot delete failed: %s", err_str);
+            }
+            break;
+        case GF_SNAP_OPTION_TYPE_ACTIVATE:
+            ret = glusterd_mgmt_v3_initiate_snap_phases(req, cli_op, dict);
+            if (ret) {
+                gf_msg(this->name, GF_LOG_WARNING, 0, GD_MSG_SNAP_ACTIVATE_FAIL,
+                       "Snapshot activate failed: %s", err_str);
+            }
+            break;
+        case GF_SNAP_OPTION_TYPE_DEACTIVATE:
+            ret = glusterd_mgmt_v3_initiate_snap_phases(req, cli_op, dict);
+            if (ret) {
+                gf_msg(this->name, GF_LOG_WARNING, 0,
+                       GD_MSG_SNAP_DEACTIVATE_FAIL,
+                       "Snapshot deactivate failed: %s", err_str);
+            }
+            break;
+        case GF_SNAP_OPTION_TYPE_STATUS:
+            ret = glusterd_handle_snapshot_status(req, cli_op, dict, err_str,
+                                                  sizeof(err_str));
+            if (ret) {
+                gf_msg(this->name, GF_LOG_WARNING, 0, GD_MSG_SNAP_STATUS_FAIL,
+                       "Snapshot status failed: %s", err_str);
+            }
+            break;
+        default:
+            gf_msg(this->name, GF_LOG_ERROR, EINVAL, GD_MSG_COMMAND_NOT_FOUND,
+                   "Unknown snapshot request "
+                   "type (%d)",
+                   type);
+            ret = -1; /* Failure */
+    }
+
+out:
+    if (ret) {
+        /* Fall back to generic values when no specific error was set. */
+        if (err_str[0] == '\0')
+            snprintf(err_str, sizeof(err_str), "Operation failed");
+
+        if (op_errno == 0)
+            op_errno = EG_INTRNL;
+
+        ret = glusterd_op_send_cli_response(cli_op, ret, op_errno, req, dict,
+                                            err_str);
+    }
+
+    return ret;
+}
+
+/* Entry point for snapshot requests: hands the request and
+ * glusterd_handle_snapshot_fn to glusterd_big_locked_handler() and
+ * returns its result.
+ */
+int
+glusterd_handle_snapshot(rpcsvc_request_t *req)
+{
+    return glusterd_big_locked_handler(req, glusterd_handle_snapshot_fn);
+}
+
+/* Free a snap op entry together with its brick_path.
+ * A NULL snap_op is ignored.
+ */
+static void
+glusterd_free_snap_op(glusterd_snap_op_t *snap_op)
+{
+    if (!snap_op)
+        return;
+
+    if (snap_op->brick_path)
+        GF_FREE(snap_op->brick_path);
+
+    GF_FREE(snap_op);
+}
+
+/* Tear down a missed-snapinfo object: every op queued on its snap_ops
+ * list, its node and snap uuid strings, and finally the object itself.
+ * A NULL argument is a no-op.
+ */
+static void
+glusterd_free_missed_snapinfo(glusterd_missed_snap_info *missed_snapinfo)
+{
+    glusterd_snap_op_t *op = NULL;
+    glusterd_snap_op_t *next_op = NULL;
+
+    if (!missed_snapinfo)
+        return;
+
+    /* The _safe iterator is needed because each op is freed while the
+     * list is still being walked.
+     */
+    cds_list_for_each_entry_safe(op, next_op, &missed_snapinfo->snap_ops,
+                                 snap_ops_list)
+    {
+        glusterd_free_snap_op(op);
+    }
+
+    if (missed_snapinfo->snap_uuid)
+        GF_FREE(missed_snapinfo->snap_uuid);
+
+    if (missed_snapinfo->node_uuid)
+        GF_FREE(missed_snapinfo->node_uuid);
+
+    GF_FREE(missed_snapinfo);
+}
+
+/* Look for duplicates and accordingly update the list.
+ *
+ * Merges missed_snap_op into missed_snapinfo->snap_ops. The op is consumed
+ * on every path: it is either linked onto the list or released with
+ * glusterd_free_snap_op(), so the caller must not touch it afterwards.
+ *
+ * Existing entries with a different snap_vol_id are ignored. For the rest:
+ *  - same brick_path and same op: an existing PENDING entry is promoted to
+ *    DONE when the new op is DONE; any other combination is a duplicate.
+ *  - otherwise, same brick_num where the existing entry is a CREATE and the
+ *    new op is a DELETE or RESTORE: the existing CREATE is marked DONE.
+ * If no entry matched, the new op is appended to the list.
+ *
+ * @param missed_snapinfo  container whose snap_ops list is updated
+ * @param missed_snap_op   new op to merge (consumed, see above)
+ *
+ * @return 0 on every path visible in this function
+ */
+int32_t
+glusterd_update_missed_snap_entry(glusterd_missed_snap_info *missed_snapinfo,
+                                  glusterd_snap_op_t *missed_snap_op)
+{
+    int32_t ret = -1;
+    glusterd_snap_op_t *snap_opinfo = NULL;
+    gf_boolean_t match = _gf_false;
+    xlator_t *this = NULL;
+
+    this = THIS;
+    GF_ASSERT(this);
+    GF_ASSERT(missed_snapinfo);
+    GF_ASSERT(missed_snap_op);
+
+    cds_list_for_each_entry(snap_opinfo, &missed_snapinfo->snap_ops,
+                            snap_ops_list)
+    {
+        /* If the entry is not for the same snap_vol_id
+         * then continue
+         */
+        if (strcmp(snap_opinfo->snap_vol_id, missed_snap_op->snap_vol_id))
+            continue;
+
+        if ((!strcmp(snap_opinfo->brick_path, missed_snap_op->brick_path)) &&
+            (snap_opinfo->op == missed_snap_op->op)) {
+            /* If two entries have conflicting status
+             * GD_MISSED_SNAP_DONE takes precedence
+             */
+            if ((snap_opinfo->status == GD_MISSED_SNAP_PENDING) &&
+                (missed_snap_op->status == GD_MISSED_SNAP_DONE)) {
+                snap_opinfo->status = GD_MISSED_SNAP_DONE;
+                gf_msg(this->name, GF_LOG_INFO, 0,
+                       GD_MSG_MISSED_SNAP_STATUS_DONE,
+                       "Updating missed snap status "
+                       "for %s:%s=%s:%d:%s:%d as DONE",
+                       missed_snapinfo->node_uuid, missed_snapinfo->snap_uuid,
+                       snap_opinfo->snap_vol_id, snap_opinfo->brick_num,
+                       snap_opinfo->brick_path, snap_opinfo->op);
+                ret = 0;
+                /* The existing entry now carries the state; the new op
+                 * is redundant.
+                 */
+                glusterd_free_snap_op(missed_snap_op);
+                goto out;
+            }
+            /* Same brick, same op, no status upgrade: plain duplicate.
+             * It is reported and freed after the loop.
+             */
+            match = _gf_true;
+            break;
+        } else if ((snap_opinfo->brick_num == missed_snap_op->brick_num) &&
+                   (snap_opinfo->op == GF_SNAP_OPTION_TYPE_CREATE) &&
+                   ((missed_snap_op->op == GF_SNAP_OPTION_TYPE_DELETE) ||
+                    (missed_snap_op->op == GF_SNAP_OPTION_TYPE_RESTORE))) {
+            /* Optimizing create and delete entries for the same
+             * brick and same node
+             */
+            gf_msg(this->name, GF_LOG_INFO, 0, GD_MSG_MISSED_SNAP_STATUS_DONE,
+                   "Updating missed snap status "
+                   "for %s:%s=%s:%d:%s:%d as DONE",
+                   missed_snapinfo->node_uuid, missed_snapinfo->snap_uuid,
+                   snap_opinfo->snap_vol_id, snap_opinfo->brick_num,
+                   snap_opinfo->brick_path, snap_opinfo->op);
+            snap_opinfo->status = GD_MISSED_SNAP_DONE;
+            ret = 0;
+            /* The DELETE/RESTORE op itself is not recorded. */
+            glusterd_free_snap_op(missed_snap_op);
+            goto out;
+        }
+    }
+
+    if (match == _gf_true) {
+        gf_msg(this->name, GF_LOG_INFO, 0, GD_MSG_DUP_ENTRY,
+               "Duplicate entry. Not updating");
+        glusterd_free_snap_op(missed_snap_op);
+    } else {
+        /* Nothing matched: the list takes ownership of the new op. */
+        cds_list_add_tail(&missed_snap_op->snap_ops_list,
+                          &missed_snapinfo->snap_ops);
+    }
+
+    ret = 0;
+out:
+    gf_msg_trace(this->name, 0, "Returning %d", ret);
+    return ret;
+}
+
+/* Add new missed snap entry to the missed_snaps list.
+ *
+ * Builds a snap op from the given fields and files it under the
+ * priv->missed_snaps_list entry whose "<node_uuid>:<snap_uuid>" string
+ * equals missed_info. When no such entry exists a new one is created,
+ * filled from the two ':'-separated tokens of missed_info and appended to
+ * the list. When one exists the op is handed to
+ * glusterd_update_missed_snap_entry(), which takes ownership of it.
+ *
+ * @param missed_info  "<node_uuid>:<snap_uuid>"; NOTE: tokenised in place
+ *                     with strtok_r() when a new entry is created, so the
+ *                     caller's buffer is modified
+ * @param snap_vol_id  copied into the op with gf_strdup()
+ * @param brick_num    stored in the op as-is
+ * @param brick_path   copied into the op with gf_strdup()
+ * @param snap_op      stored in the op's 'op' field
+ * @param snap_status  stored in the op's 'status' field
+ *
+ * @return 0 on success, non-zero on failure
+ */
+int32_t
+glusterd_add_new_entry_to_list(char *missed_info, char *snap_vol_id,
+                               int32_t brick_num, char *brick_path,
+                               int32_t snap_op, int32_t snap_status)
+{
+    char *buf = NULL;
+    char *save_ptr = NULL;
+    char node_snap_info[PATH_MAX] = "";
+    int32_t ret = -1;
+    glusterd_missed_snap_info *missed_snapinfo = NULL;
+    glusterd_snap_op_t *missed_snap_op = NULL;
+    glusterd_conf_t *priv = NULL;
+    gf_boolean_t match = _gf_false;
+    gf_boolean_t free_missed_snap_info = _gf_false;
+    xlator_t *this = NULL;
+
+    this = THIS;
+    GF_ASSERT(this);
+    GF_ASSERT(missed_info);
+    GF_ASSERT(snap_vol_id);
+    GF_ASSERT(brick_path);
+
+    priv = this->private;
+    GF_ASSERT(priv);
+
+    /* Create the snap_op object consisting of the *
+     * snap id and the op */
+    ret = glusterd_missed_snap_op_new(&missed_snap_op);
+    if (ret) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_MISSED_SNAP_CREATE_FAIL,
+               "Failed to create new missed snap object.");
+        ret = -1;
+        goto out;
+    }
+
+    /* The op keeps private copies of both strings. */
+    missed_snap_op->snap_vol_id = gf_strdup(snap_vol_id);
+    if (!missed_snap_op->snap_vol_id) {
+        ret = -1;
+        goto out;
+    }
+    missed_snap_op->brick_path = gf_strdup(brick_path);
+    if (!missed_snap_op->brick_path) {
+        ret = -1;
+        goto out;
+    }
+    missed_snap_op->brick_num = brick_num;
+    missed_snap_op->op = snap_op;
+    missed_snap_op->status = snap_status;
+
+    /* Look for other entries for the same node and same snap */
+    cds_list_for_each_entry(missed_snapinfo, &priv->missed_snaps_list,
+                            missed_snaps)
+    {
+        snprintf(node_snap_info, sizeof(node_snap_info), "%s:%s",
+                 missed_snapinfo->node_uuid, missed_snapinfo->snap_uuid);
+        if (!strcmp(node_snap_info, missed_info)) {
+            /* Found missed snapshot info for *
+             * the same node and same snap */
+            match = _gf_true;
+            break;
+        }
+    }
+
+    /* When the loop ends without a match, missed_snapinfo is not a usable
+     * entry; it is only valid below after being (re)assigned by
+     * glusterd_missed_snapinfo_new().
+     */
+    if (match == _gf_false) {
+        /* First snap op missed for the brick */
+        ret = glusterd_missed_snapinfo_new(&missed_snapinfo);
+        if (ret) {
+            gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_MISSED_SNAP_CREATE_FAIL,
+                   "Failed to create missed snapinfo");
+            goto out;
+        }
+        /* From here on the snapinfo was allocated by this call and must
+         * be released on failure.
+         */
+        free_missed_snap_info = _gf_true;
+        /* First token: node uuid. This modifies missed_info in place. */
+        buf = strtok_r(missed_info, ":", &save_ptr);
+        if (!buf) {
+            ret = -1;
+            goto out;
+        }
+        missed_snapinfo->node_uuid = gf_strdup(buf);
+        if (!missed_snapinfo->node_uuid) {
+            ret = -1;
+            goto out;
+        }
+
+        /* Second token: snap uuid. */
+        buf = strtok_r(NULL, ":", &save_ptr);
+        if (!buf) {
+            ret = -1;
+            goto out;
+        }
+        missed_snapinfo->snap_uuid = gf_strdup(buf);
+        if (!missed_snapinfo->snap_uuid) {
+            ret = -1;
+            goto out;
+        }
+
+        /* Link the op into the new snapinfo, then publish the snapinfo on
+         * the global list. Both are done only once nothing can fail.
+         */
+        cds_list_add_tail(&missed_snap_op->snap_ops_list,
+                          &missed_snapinfo->snap_ops);
+        cds_list_add_tail(&missed_snapinfo->missed_snaps,
+                          &priv->missed_snaps_list);
+
+        ret = 0;
+        goto out;
+    } else {
+        /* The callee either links or frees missed_snap_op. */
+        ret = glusterd_update_missed_snap_entry(missed_snapinfo,
+                                                missed_snap_op);
+        if (ret) {
+            gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_MISSED_SNAP_CREATE_FAIL,
+                   "Failed to update existing missed snap entry.");
+            goto out;
+        }
+    }
+
+out:
+    if (ret) {
+        glusterd_free_snap_op(missed_snap_op);
+
+        /* free_missed_snap_info is set only for a snapinfo allocated in
+         * this call, so an entry already on the global list is never
+         * freed here.
+         */
+        if (missed_snapinfo && (free_missed_snap_info == _gf_true))
+            glusterd_free_missed_snapinfo(missed_snapinfo);
+    }
+
+    gf_msg_trace(this->name, 0, "Returning %d", ret);
+    return ret;
+}
+
+/* Add missing snap entries to the in-memory conf->missed_snaps_list.
+ *
+ * Reads "missed_snaps_<i>" for i in [0, missed_snap_count) from dict. Each
+ * value is expected to have the form
+ *   <node_uuid>:<snap_uuid>=<snap_vol_id>:<brick_num>:<brick_path>:
+ *   <snap_op>:<snap_status>
+ * and is passed, field by field, to glusterd_add_new_entry_to_list().
+ *
+ * An entry with missing fields is rejected as invalid. The numeric fields
+ * are converted only after their tokens are known to exist, because
+ * atoi(NULL) is undefined behaviour and would crash on a short entry.
+ *
+ * @param dict               dictionary holding the missed snap entries
+ * @param missed_snap_count  number of "missed_snaps_<i>" keys to read
+ *
+ * @return 0 on success; non-zero on the first missing key, malformed
+ *         entry, allocation failure or list update failure
+ */
+int32_t
+glusterd_add_missed_snaps_to_list(dict_t *dict, int32_t missed_snap_count)
+{
+    char *buf = NULL;
+    char *tmp = NULL;
+    char *save_ptr = NULL;
+    char *nodeid = NULL;
+    char *snap_uuid = NULL;
+    char *snap_vol_id = NULL;
+    char *brick_num_str = NULL;
+    char *brick_path = NULL;
+    char *snap_op_str = NULL;
+    char *snap_status_str = NULL;
+    char missed_info[PATH_MAX] = "";
+    char key[64] = "";
+    int keylen;
+    int32_t i = -1;
+    int32_t ret = -1;
+    int32_t brick_num = -1;
+    int32_t snap_op = -1;
+    int32_t snap_status = -1;
+    glusterd_conf_t *priv = NULL;
+    xlator_t *this = NULL;
+
+    this = THIS;
+    GF_ASSERT(this);
+    GF_ASSERT(dict);
+
+    priv = this->private;
+    GF_ASSERT(priv);
+
+    /* We can update the missed_snaps_list without acquiring *
+     * any additional locks as big lock will be held.        */
+    for (i = 0; i < missed_snap_count; i++) {
+        keylen = snprintf(key, sizeof(key), "missed_snaps_%d", i);
+        ret = dict_get_strn(dict, key, keylen, &buf);
+        if (ret) {
+            gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_DICT_GET_FAILED,
+                   "Unable to fetch %s", key);
+            goto out;
+        }
+
+        gf_msg_debug(this->name, 0, "missed_snap_entry = %s", buf);
+
+        /* Need to make a duplicate string coz the same dictionary *
+         * is resent to the non-originator nodes */
+        tmp = gf_strdup(buf);
+        if (!tmp) {
+            ret = -1;
+            goto out;
+        }
+
+        /* Fetch the node-id, snap-id, brick_num,
+         * brick_path, snap_op and snap status
+         */
+        nodeid = strtok_r(tmp, ":", &save_ptr);
+        snap_uuid = strtok_r(NULL, "=", &save_ptr);
+        snap_vol_id = strtok_r(NULL, ":", &save_ptr);
+        brick_num_str = strtok_r(NULL, ":", &save_ptr);
+        brick_path = strtok_r(NULL, ":", &save_ptr);
+        snap_op_str = strtok_r(NULL, ":", &save_ptr);
+        snap_status_str = strtok_r(NULL, ":", &save_ptr);
+
+        /* A missing numeric token maps to -1, which the range check
+         * below rejects together with genuinely invalid values.
+         */
+        brick_num = brick_num_str ? atoi(brick_num_str) : -1;
+        snap_op = snap_op_str ? atoi(snap_op_str) : -1;
+        snap_status = snap_status_str ? atoi(snap_status_str) : -1;
+
+        if (!nodeid || !snap_uuid || !brick_path || !snap_vol_id ||
+            brick_num < 1 || snap_op < 1 || snap_status < 1) {
+            gf_msg(this->name, GF_LOG_ERROR, 0,
+                   GD_MSG_INVALID_MISSED_SNAP_ENTRY,
+                   "Invalid missed_snap_entry");
+            ret = -1;
+            goto out;
+        }
+
+        snprintf(missed_info, sizeof(missed_info), "%s:%s", nodeid, snap_uuid);
+
+        ret = glusterd_add_new_entry_to_list(missed_info, snap_vol_id,
+                                             brick_num, brick_path, snap_op,
+                                             snap_status);
+        if (ret) {
+            gf_msg(this->name, GF_LOG_ERROR, 0,
+                   GD_MSG_MISSED_SNAP_LIST_STORE_FAIL,
+                   "Failed to store missed snaps_list");
+            goto out;
+        }
+
+        /* All tokens point into tmp; the callee made its own copies. */
+        GF_FREE(tmp);
+        tmp = NULL;
+    }
+
+    ret = 0;
+out:
+    if (tmp)
+        GF_FREE(tmp);
+
+    gf_msg_trace(this->name, 0, "Returning %d", ret);
+    return ret;
+}
+
+/* This function will restore origin volume to its snap.
+ * The restore operation will simply replace the Gluster origin
+ * volume with the snap volume.
+ * TODO: Multi-volume delete to be done.
+ *       Cleanup in case of restore failure is pending.
+ *
+ * Steps, in order: mark the snap as under restore and persist it, stop the
+ * snap volume, duplicate its volinfo, overwrite the identity fields
+ * (volname, volume_id, snap_count, version) with those of the origin
+ * volume, restore the snap volinfo details, geo-rep files and quota files,
+ * add the new volinfo to conf->volumes and store it.
+ *
+ * @param dict          request dictionary, passed on to
+ *                      glusterd_snap_volinfo_restore()
+ * @param rsp_dict      response dictionary, passed on to
+ *                      glusterd_snap_volinfo_restore()
+ * @param orig_vol      volinfo of origin volume
+ * @param snap_vol      volinfo of snapshot volume
+ * @param volcount      volume index, passed on to
+ *                      glusterd_snap_volinfo_restore()
+ *
+ * @return 0 on success; on error -1 for invalid arguments, otherwise the
+ *         non-zero value returned by the failing helper
+ */
+int
+gd_restore_snap_volume(dict_t *dict, dict_t *rsp_dict,
+                       glusterd_volinfo_t *orig_vol,
+                       glusterd_volinfo_t *snap_vol, int32_t volcount)
+{
+    int ret = -1;
+    glusterd_volinfo_t *new_volinfo = NULL;
+    glusterd_snap_t *snap = NULL;
+    xlator_t *this = NULL;
+    glusterd_conf_t *conf = NULL;
+    glusterd_volinfo_t *temp_volinfo = NULL;
+    glusterd_volinfo_t *voliter = NULL;
+    gf_boolean_t conf_present = _gf_false;
+
+    this = THIS;
+    GF_ASSERT(this);
+    GF_ASSERT(dict);
+    GF_ASSERT(rsp_dict);
+    conf = this->private;
+    GF_ASSERT(conf);
+
+    GF_VALIDATE_OR_GOTO(this->name, orig_vol, out);
+    GF_VALIDATE_OR_GOTO(this->name, snap_vol, out);
+    snap = snap_vol->snapshot;
+    GF_VALIDATE_OR_GOTO(this->name, snap, out);
+
+    /* Set the status to under restore so that if the
+     * the node goes down during restore and comes back
+     * the state of the volume can be reverted correctly
+     */
+    snap->snap_status = GD_SNAP_STATUS_UNDER_RESTORE;
+
+    /* We need to save this in disk so that if node goes
+     * down the status is in updated state.
+     */
+    ret = glusterd_store_snap(snap);
+    if (ret) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_FILE_OP_FAILED,
+               "Could not store snap "
+               "object for %s snap of %s volume",
+               snap_vol->volname, snap_vol->parent_volname);
+        goto out;
+    }
+
+    /* Snap volume must be stopped before performing the
+     * restore operation.
+     */
+    ret = glusterd_stop_volume(snap_vol);
+    if (ret) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_VOL_STOP_FAILED,
+               "Failed to stop "
+               "snap volume");
+        goto out;
+    }
+
+    /* Create a new volinfo for the restored volume */
+    ret = glusterd_volinfo_dup(snap_vol, &new_volinfo, _gf_true);
+    if (ret) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_VOL_OP_FAILED,
+               "Failed to create volinfo");
+        goto out;
+    }
+
+    /* Following entries need to be derived from origin volume. */
+    gf_strncpy(new_volinfo->volname, orig_vol->volname,
+               sizeof(new_volinfo->volname));
+    gf_uuid_copy(new_volinfo->volume_id, orig_vol->volume_id);
+    new_volinfo->snap_count = orig_vol->snap_count;
+    /* Record which snapshot this volume was restored from. */
+    gf_uuid_copy(new_volinfo->restored_from_snap, snap_vol->snapshot->snap_id);
+
+    /* Use the same version as the original version */
+    new_volinfo->version = orig_vol->version;
+
+    /* Copy the snap vol info to the new_volinfo.*/
+    ret = glusterd_snap_volinfo_restore(dict, rsp_dict, new_volinfo, snap_vol,
+                                        volcount);
+    if (ret) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_SNAP_RESTORE_FAIL,
+               "Failed to restore snap");
+        goto out;
+    }
+
+    /* In case a new node is added to the peer, after a snapshot was
+     * taken, the geo-rep files are not synced to that node. This
+     * leads to the failure of snapshot restore. Hence, ignoring the
+     * missing geo-rep files in the new node, and proceeding with
+     * snapshot restore. Once the restore is successful, the missing
+     * geo-rep files can be generated with "gluster volume geo-rep
+     * <master-vol> <slave-vol> create push-pem force"
+     */
+    ret = glusterd_restore_geo_rep_files(snap_vol);
+    if (ret) {
+        /* Deliberately only a warning: ret is overwritten by the next
+         * call and the restore continues.
+         */
+        gf_msg(this->name, GF_LOG_WARNING, 0, GD_MSG_SNAP_RESTORE_FAIL,
+               "Failed to restore "
+               "geo-rep files for snap %s",
+               snap_vol->snapshot->snapname);
+    }
+
+    /* Need not save cksum, as we will copy cksum file in *
+     * this function                                           *
+     */
+    ret = glusterd_copy_quota_files(snap_vol, orig_vol, &conf_present);
+    if (ret) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_SNAP_RESTORE_FAIL,
+               "Failed to restore "
+               "quota files for snap %s",
+               snap_vol->snapshot->snapname);
+        goto out;
+    }
+
+    /* New volinfo always shows the status as created. Therefore
+     * set the status to the original volume's status. */
+    glusterd_set_volume_status(new_volinfo, orig_vol->status);
+
+    cds_list_add_tail(&new_volinfo->vol_list, &conf->volumes);
+
+    ret = glusterd_store_volinfo(new_volinfo,
+                                 GLUSTERD_VOLINFO_VER_AC_INCREMENT);
+    if (ret) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_VOL_OP_FAILED,
+               "Failed to store volinfo");
+        goto out;
+    }
+
+    ret = 0;
+out:
+    if (ret) {
+        /* In case of any failure we should free new_volinfo. Doing
+         * this will also remove the entry we added in conf->volumes
+         * if it was added there.
+         */
+        if (new_volinfo)
+            (void)glusterd_volinfo_delete(new_volinfo);
+    } else {
+        /* Hand the origin volume's snapshot volumes over to the new
+         * volinfo.
+         * NOTE(review): entries are added to the new list without an
+         * explicit cds_list_del() from orig_vol->snap_volumes; presumably
+         * safe because the caller discards orig_vol afterwards - confirm.
+         */
+        cds_list_for_each_entry_safe(voliter, temp_volinfo,
+                                     &orig_vol->snap_volumes, snapvol_list)
+        {
+            cds_list_add_tail(&voliter->snapvol_list,
+                              &new_volinfo->snap_volumes);
+        }
+    }
+
+    return ret;
+}
+
+/* Collect name, id and volname of every started snapshot of a volume.
+ *
+ * For each snap volume of 'volname' whose status is
+ * GLUSTERD_STATUS_STARTED the keys "snapname.<n>", "snap-id.<n>" and
+ * "snap-volname.<n>" (n counting from 1) are set in dict, followed by
+ * "snap-count". The dictionary is then serialized into
+ * snap_info_rsp->dict, and op_ret / op_errno / op_errstr of the response
+ * are filled in.
+ *
+ * @param dict           dictionary that receives the snapshot details
+ * @param volname        name of the volume whose snapshots are listed
+ * @param snap_info_rsp  response to fill; when NULL the call fails with
+ *                       EINVAL and nothing is written to it
+ *
+ * @return 0 on success, non-zero on failure
+ */
+int
+glusterd_snapshot_get_volnames_uuids(dict_t *dict, char *volname,
+                                     gf_getsnap_name_uuid_rsp *snap_info_rsp)
+{
+    int ret = -1;
+    int snapcount = 0;
+    char key[32] = "";
+    glusterd_volinfo_t *snap_vol = NULL;
+    glusterd_volinfo_t *volinfo = NULL;
+    glusterd_volinfo_t *tmp_vol = NULL;
+    xlator_t *this = NULL;
+    int op_errno = 0;
+
+    this = THIS;
+    GF_ASSERT(this);
+    GF_ASSERT(volname);
+    GF_VALIDATE_OR_GOTO_WITH_ERROR(this->name, dict, out, op_errno, EINVAL);
+    GF_VALIDATE_OR_GOTO_WITH_ERROR(this->name, volname, out, op_errno, EINVAL);
+    GF_VALIDATE_OR_GOTO_WITH_ERROR(this->name, snap_info_rsp, out, op_errno,
+                                   EINVAL);
+
+    ret = glusterd_volinfo_find(volname, &volinfo);
+    if (ret) {
+        gf_msg(this->name, GF_LOG_ERROR, EINVAL, GD_MSG_VOL_NOT_FOUND,
+               "Failed to get volinfo of volume %s", volname);
+        op_errno = EINVAL;
+        goto out;
+    }
+
+    cds_list_for_each_entry_safe(snap_vol, tmp_vol, &volinfo->snap_volumes,
+                                 snapvol_list)
+    {
+        /* Only started (activated) snapshots are reported. */
+        if (GLUSTERD_STATUS_STARTED != snap_vol->status)
+            continue;
+
+        snapcount++;
+
+        /* Set Snap Name */
+        snprintf(key, sizeof(key), "snapname.%d", snapcount);
+        ret = dict_set_dynstr_with_alloc(dict, key,
+                                         snap_vol->snapshot->snapname);
+        if (ret) {
+            gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_DICT_SET_FAILED,
+                   "Failed to set "
+                   "snap name in dictionary");
+            goto out;
+        }
+
+        /* Set Snap ID */
+        snprintf(key, sizeof(key), "snap-id.%d", snapcount);
+        ret = dict_set_dynstr_with_alloc(
+            dict, key, uuid_utoa(snap_vol->snapshot->snap_id));
+        if (ret) {
+            gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_DICT_SET_FAILED,
+                   "Failed to set "
+                   "snap id in dictionary");
+            goto out;
+        }
+
+        /* Snap Volname which is used to activate the snap vol */
+        snprintf(key, sizeof(key), "snap-volname.%d", snapcount);
+        ret = dict_set_dynstr_with_alloc(dict, key, snap_vol->volname);
+        if (ret) {
+            /* Message names the key that actually failed (was a
+             * copy of the "snap id" message).
+             */
+            gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_DICT_SET_FAILED,
+                   "Failed to set "
+                   "snap volname in dictionary");
+            goto out;
+        }
+    }
+
+    ret = dict_set_int32n(dict, "snap-count", SLEN("snap-count"), snapcount);
+    if (ret) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_DICT_SET_FAILED,
+               "Failed to set snapcount");
+        op_errno = -ret;
+        goto out;
+    }
+
+    ret = dict_allocate_and_serialize(dict, &snap_info_rsp->dict.dict_val,
+                                      &snap_info_rsp->dict.dict_len);
+    if (ret) {
+        op_errno = -ret;
+        ret = -1;
+        goto out;
+    }
+
+    ret = 0;
+
+out:
+    /* snap_info_rsp may itself be the argument that failed validation
+     * above, so it must not be dereferenced unconditionally.
+     */
+    if (snap_info_rsp) {
+        snap_info_rsp->op_ret = ret;
+        snap_info_rsp->op_errno = op_errno;
+        snap_info_rsp->op_errstr = "";
+    }
+
+    return ret;
+}
diff --git a/xlators/mgmt/glusterd/src/glusterd-statedump.c b/xlators/mgmt/glusterd/src/glusterd-statedump.c
new file mode 100644
index 00000000000..225d10cc546
--- /dev/null
+++ b/xlators/mgmt/glusterd/src/glusterd-statedump.c
@@ -0,0 +1,243 @@
+/*
+   Copyright (c) 2014 Red Hat, Inc. <http://www.redhat.com>
+   This file is part of GlusterFS.
+
+   This file is licensed to you under your choice of the GNU Lesser
+   General Public License, version 3 or any later version (LGPLv3 or
+   later), or the GNU General Public License, version 2 (GPLv2), in all
+   cases as published by the Free Software Foundation.
+*/
+
+#include <glusterfs/statedump.h>
+#include "glusterd.h"
+#include "glusterd-shd-svc.h"
+#include "glusterd-quotad-svc.h"
+#include "glusterd-locks.h"
+#include "glusterd-messages.h"
+
+/* Write the basic attributes of one peer to the statedump.
+ *
+ * Every field is emitted under the prefix "<input_key><index>", with
+ * input_key cut to what fits a GF_DUMP_MAX_BUF_LEN buffer. xpeers is not
+ * used by this function.
+ */
+static void
+glusterd_dump_peer(glusterd_peerinfo_t *peerinfo, char *input_key, int index,
+                   gf_boolean_t xpeers)
+{
+    char dump_key[GF_DUMP_MAX_BUF_LEN] = "";
+    char peer_prefix[GF_DUMP_MAX_BUF_LEN + 11] = "";
+
+    /* The precision caps input_key at the same length a bounded copy
+     * into a GF_DUMP_MAX_BUF_LEN buffer would keep.
+     */
+    snprintf(peer_prefix, sizeof(peer_prefix), "%.*s%d",
+             (int)(sizeof(dump_key) - 1), input_key, index);
+
+    /* identity */
+    gf_proc_dump_build_key(dump_key, peer_prefix, "uuid");
+    gf_proc_dump_write(dump_key, "%s", uuid_utoa(peerinfo->uuid));
+    gf_proc_dump_build_key(dump_key, peer_prefix, "hostname");
+    gf_proc_dump_write(dump_key, "%s", peerinfo->hostname);
+    gf_proc_dump_build_key(dump_key, peer_prefix, "port");
+    gf_proc_dump_write(dump_key, "%d", peerinfo->port);
+
+    /* state machine and quorum */
+    gf_proc_dump_build_key(dump_key, peer_prefix, "state");
+    gf_proc_dump_write(dump_key, "%d", peerinfo->state.state);
+    gf_proc_dump_build_key(dump_key, peer_prefix, "quorum-action");
+    gf_proc_dump_write(dump_key, "%d", peerinfo->quorum_action);
+    gf_proc_dump_build_key(dump_key, peer_prefix, "quorum-contrib");
+    gf_proc_dump_write(dump_key, "%d", peerinfo->quorum_contrib);
+
+    /* flags */
+    gf_proc_dump_build_key(dump_key, peer_prefix, "detaching");
+    gf_proc_dump_write(dump_key, "%d", peerinfo->detaching);
+    gf_proc_dump_build_key(dump_key, peer_prefix, "locked");
+    gf_proc_dump_write(dump_key, "%d", peerinfo->locked);
+}
+
+/* Write the rpc connection statistics of one peer to the statedump.
+ *
+ * Nothing is written when the peer has no rpc object. Otherwise the
+ * fields are emitted under the prefix "<input_key><index>"; the peer name
+ * is included only when rpcsvc_transport_peername() succeeds.
+ */
+static void
+glusterd_dump_peer_rpcstat(glusterd_peerinfo_t *peerinfo, char *input_key,
+                           int index)
+{
+    char peername[RPCSVC_PEER_STRLEN] = "";
+    char dump_key[GF_DUMP_MAX_BUF_LEN] = "";
+    char peer_prefix[GF_DUMP_MAX_BUF_LEN + 11] = "";
+    rpc_clnt_t *rpc = peerinfo->rpc;
+    rpc_clnt_connection_t *conn = NULL;
+
+    if (!rpc)
+        return;
+
+    conn = &rpc->conn;
+
+    /* The precision caps input_key at the same length a bounded copy
+     * into a GF_DUMP_MAX_BUF_LEN buffer would keep.
+     */
+    snprintf(peer_prefix, sizeof(peer_prefix), "%.*s%d",
+             (int)(sizeof(dump_key) - 1), input_key, index);
+
+    if (rpcsvc_transport_peername(conn->trans, peername,
+                                  sizeof(peername)) == 0) {
+        gf_proc_dump_build_key(dump_key, peer_prefix, "rpc.peername");
+        gf_proc_dump_write(dump_key, "%s", peername);
+    }
+
+    gf_proc_dump_build_key(dump_key, peer_prefix, "rpc.connected");
+    gf_proc_dump_write(dump_key, "%d", conn->connected);
+
+    /* transport byte counters */
+    gf_proc_dump_build_key(dump_key, peer_prefix, "rpc.total-bytes-read");
+    gf_proc_dump_write(dump_key, "%" PRIu64, conn->trans->total_bytes_read);
+    gf_proc_dump_build_key(dump_key, peer_prefix, "rpc.total-bytes-written");
+    gf_proc_dump_write(dump_key, "%" PRIu64, conn->trans->total_bytes_write);
+
+    /* message counters */
+    gf_proc_dump_build_key(dump_key, peer_prefix, "rpc.ping_msgs_sent");
+    gf_proc_dump_write(dump_key, "%" PRIu64, conn->pingcnt);
+    gf_proc_dump_build_key(dump_key, peer_prefix, "rpc.msgs_sent");
+    gf_proc_dump_write(dump_key, "%" PRIu64, conn->msgcnt);
+}
+
+/* Write one statedump record per transport on conf->xprt_list.
+ *
+ * Records are keyed "glusterd.client<n>" with n counting from 1 and carry
+ * the transport's identifier, volname and max/min op-version. The list is
+ * walked with conf->xprt_lock held.
+ */
+static void
+glusterd_dump_client_details(glusterd_conf_t *conf)
+{
+    rpc_transport_t *client = NULL;
+    char client_prefix[50] = "";
+    char dump_key[GF_DUMP_MAX_BUF_LEN] = "";
+    int client_no = 0;
+
+    pthread_mutex_lock(&conf->xprt_lock);
+
+    list_for_each_entry(client, &conf->xprt_list, list)
+    {
+        client_no++;
+        snprintf(client_prefix, sizeof(client_prefix), "glusterd.client%d",
+                 client_no);
+
+        gf_proc_dump_build_key(dump_key, client_prefix, "identifier");
+        gf_proc_dump_write(dump_key, "%s", client->peerinfo.identifier);
+
+        gf_proc_dump_build_key(dump_key, client_prefix, "volname");
+        gf_proc_dump_write(dump_key, "%s", client->peerinfo.volname);
+
+        gf_proc_dump_build_key(dump_key, client_prefix, "max-op-version");
+        gf_proc_dump_write(dump_key, "%u", client->peerinfo.max_op_version);
+
+        gf_proc_dump_build_key(dump_key, client_prefix, "min-op-version");
+        gf_proc_dump_write(dump_key, "%u", client->peerinfo.min_op_version);
+    }
+
+    pthread_mutex_unlock(&conf->xprt_lock);
+}
+
+/* The following function is just for dumping mgmt_v3_lock dictionary, any other
+ * dict passed to this API will not work.
+ *
+ * Every member is rendered as "\n\t<key>:<value>" into one buffer that is
+ * written under the key built from "glusterd" and "mgmt_v3_lock". For keys
+ * containing "debug.last-success-bt" the value is printed as a string; for
+ * all others it is read as a glusterd_mgmt_v3_lock_obj and its lock_owner
+ * uuid is printed.
+ *
+ * snprintf() returns the length it WOULD have written, so a result at or
+ * beyond the remaining space means the entry was truncated. Adding that
+ * value to the offset unchecked would push it past the end of the buffer
+ * and make the next write overflow; instead the entry that does not fit is
+ * dropped and the entries collected so far are dumped.
+ *
+ * @param dict  the mgmt_v3_lock dictionary; NULL is logged and ignored
+ */
+
+static void
+glusterd_dict_mgmt_v3_lock_statedump(dict_t *dict)
+{
+    int ret = 0;
+    size_t dumplen = 0;
+    size_t remaining = 0;
+    data_pair_t *trav = NULL;
+    char key[GF_DUMP_MAX_BUF_LEN] = "";
+    char dump[64 * 1024] = "";
+
+    if (!dict) {
+        gf_msg_callingfn("glusterd", GF_LOG_WARNING, EINVAL, GD_MSG_DICT_EMPTY,
+                         "dict NULL");
+        goto out;
+    }
+    for (trav = dict->members_list; trav; trav = trav->next) {
+        remaining = sizeof(dump) - dumplen;
+
+        if (strstr(trav->key, "debug.last-success-bt") != NULL) {
+            ret = snprintf(&dump[dumplen], remaining, "\n\t%s:%s", trav->key,
+                           trav->value->data);
+        } else {
+            ret = snprintf(
+                &dump[dumplen], remaining, "\n\t%s:%s", trav->key,
+                uuid_utoa(((glusterd_mgmt_v3_lock_obj *)(trav->value->data))
+                              ->lock_owner));
+        }
+        /* Formatting error: give up without dumping anything. */
+        if (ret <= 0)
+            return;
+
+        if ((size_t)ret >= remaining) {
+            /* Buffer full: cut off the partial entry and stop. */
+            dump[dumplen] = '\0';
+            break;
+        }
+        dumplen += (size_t)ret;
+    }
+
+    if (dumplen) {
+        gf_proc_dump_build_key(key, "glusterd", "mgmt_v3_lock");
+        gf_proc_dump_write(key, "%s", dump);
+    }
+
+out:
+    return;
+}
+
+/* Write glusterd's private configuration to the statedump.
+ *
+ * Opens a section keyed from "xlator.glusterd" / "priv" and, with
+ * priv->mutex held, dumps the uuid, working directory, op-versions, ping
+ * timeout, service online flags, peers, the pmap port table, client
+ * transports, the mgmt_v3 lock dictionary and the options dictionary.
+ *
+ * @param this  the glusterd xlator; nothing is dumped when it is NULL or
+ *              has no private data
+ *
+ * @return always 0
+ */
+int
+glusterd_dump_priv(xlator_t *this)
+{
+    glusterd_conf_t *priv = NULL;
+    char key[GF_DUMP_MAX_BUF_LEN] = "";
+    int port = 0;
+    struct pmap_registry *pmap = NULL;
+
+    GF_VALIDATE_OR_GOTO("glusterd", this, out);
+
+    priv = this->private;
+    if (!priv)
+        return 0;
+
+    gf_proc_dump_build_key(key, "xlator.glusterd", "priv");
+    gf_proc_dump_add_section("%s", key);
+
+    /* Everything below reads priv and is done under its mutex. */
+    pthread_mutex_lock(&priv->mutex);
+    {
+        gf_proc_dump_build_key(key, "glusterd", "my-uuid");
+        gf_proc_dump_write(key, "%s", uuid_utoa(priv->uuid));
+
+        gf_proc_dump_build_key(key, "glusterd", "working-directory");
+        gf_proc_dump_write(key, "%s", priv->workdir);
+
+        /* max/min are compile-time constants, current is the live value */
+        gf_proc_dump_build_key(key, "glusterd", "max-op-version");
+        gf_proc_dump_write(key, "%d", GD_OP_VERSION_MAX);
+
+        gf_proc_dump_build_key(key, "glusterd", "min-op-version");
+        gf_proc_dump_write(key, "%d", GD_OP_VERSION_MIN);
+
+        gf_proc_dump_build_key(key, "glusterd", "current-op-version");
+        gf_proc_dump_write(key, "%d", priv->op_version);
+
+        gf_proc_dump_build_key(key, "glusterd", "ping-timeout");
+        gf_proc_dump_write(key, "%d", priv->ping_timeout);
+#ifdef BUILD_GNFS
+        /* nfs_svc is only dumped when gNFS support is compiled in */
+        gf_proc_dump_build_key(key, "glusterd", "nfs.online");
+        gf_proc_dump_write(key, "%d", priv->nfs_svc.online);
+#endif
+        gf_proc_dump_build_key(key, "glusterd", "quotad.online");
+        gf_proc_dump_write(key, "%d", priv->quotad_svc.online);
+
+        gf_proc_dump_build_key(key, "glusterd", "bitd.online");
+        gf_proc_dump_write(key, "%d", priv->bitd_svc.online);
+
+        gf_proc_dump_build_key(key, "glusterd", "scrub.online");
+        gf_proc_dump_write(key, "%d", priv->scrub_svc.online);
+
+        /* Dump peer details */
+        GLUSTERD_DUMP_PEERS(&priv->peers, uuid_list, _gf_false);
+
+        /* Dump pmap data structure from base port to last alloc */
+        pmap = priv->pmap;
+        if (pmap) {
+            /* Both bounds are inclusive; ports[] is indexed by port number */
+            for (port = pmap->base_port; port <= pmap->last_alloc; port++) {
+                gf_proc_dump_build_key(key, "glusterd", "pmap_port");
+                gf_proc_dump_write(key, "%d", port);
+                gf_proc_dump_build_key(key, "glusterd", "pmap[%d].type", port);
+                gf_proc_dump_write(key, "%d", pmap->ports[port].type);
+                gf_proc_dump_build_key(key, "glusterd", "pmap[%d].brickname",
+                                       port);
+                gf_proc_dump_write(key, "%s", pmap->ports[port].brickname);
+            }
+        }
+        /* Dump client details */
+        glusterd_dump_client_details(priv);
+
+        /* Dump mgmt_v3_lock from the dictionary if any */
+        glusterd_dict_mgmt_v3_lock_statedump(priv->mgmt_v3_lock);
+        dict_dump_to_statedump(priv->opts, "options", "glusterd");
+    }
+    pthread_mutex_unlock(&priv->mutex);
+
+out:
+    return 0;
+}
diff --git a/xlators/mgmt/glusterd/src/glusterd-statedump.h b/xlators/mgmt/glusterd/src/glusterd-statedump.h
new file mode 100644
index 00000000000..b5ef1f48e82
--- /dev/null
+++ b/xlators/mgmt/glusterd/src/glusterd-statedump.h
@@ -0,0 +1,18 @@
+/*
+   Copyright (c) 2014 Red Hat, Inc. <http://www.redhat.com>
+   This file is part of GlusterFS.
+
+   This file is licensed to you under your choice of the GNU Lesser
+   General Public License, version 3 or any later version (LGPLv3 or
+   later), or the GNU General Public License, version 2 (GPLv2), in all
+   cases as published by the Free Software Foundation.
+*/
+
+#ifndef _GLUSTERD_STATEDUMP_H_
+#define _GLUSTERD_STATEDUMP_H_
+
+#include <glusterfs/xlator.h>
+
+/* Dump glusterd's private state for the given xlator to the statedump. */
+int
+glusterd_dump_priv(xlator_t *this);
+#endif
diff --git a/xlators/mgmt/glusterd/src/glusterd-store.c b/xlators/mgmt/glusterd/src/glusterd-store.c
new file mode 100644
index 00000000000..d94dceb10b7
--- /dev/null
+++ b/xlators/mgmt/glusterd/src/glusterd-store.c
@@ -0,0 +1,5125 @@
+/*
+   Copyright (c) 2007-2013 Red Hat, Inc. <http://www.redhat.com>
+   This file is part of GlusterFS.
+
+   This file is licensed to you under your choice of the GNU Lesser
+   General Public License, version 3 or any later version (LGPLv3 or
+   later), or the GNU General Public License, version 2 (GPLv2), in all
+   cases as published by the Free Software Foundation.
+*/
+
+#include "glusterd-op-sm.h"
+#include <inttypes.h>
+
+#include <glusterfs/glusterfs.h>
+#include <glusterfs/compat.h>
+#include <glusterfs/dict.h>
+#include "protocol-common.h"
+#include <glusterfs/xlator.h>
+#include <glusterfs/logging.h>
+#include <glusterfs/timer.h>
+#include <glusterfs/syscall.h>
+#include <glusterfs/defaults.h>
+#include <glusterfs/compat.h>
+#include <glusterfs/compat-errno.h>
+#include <glusterfs/statedump.h>
+#include "glusterd-mem-types.h"
+#include "glusterd.h"
+#include "glusterd-sm.h"
+#include "glusterd-op-sm.h"
+#include "glusterd-utils.h"
+#include "glusterd-hooks.h"
+#include <glusterfs/store.h>
+#include "glusterd-store.h"
+#include "glusterd-snapshot-utils.h"
+#include "glusterd-messages.h"
+
+#include "rpc-clnt.h"
+#include <glusterfs/common-utils.h>
+#include <glusterfs/quota-common-utils.h>
+
+#include <sys/resource.h>
+#include <inttypes.h>
+#include <dirent.h>
+
+#if defined(GF_LINUX_HOST_OS)
+#include <mntent.h>
+#else
+#include "mntent_compat.h"
+#endif
+
+/*
+ * Format into @path the directory that holds the brick info files of
+ * @volinfo. At most PATH_MAX bytes are written, so @path must be at least
+ * that large.
+ *
+ * Snapshot volumes resolve to
+ *     <workdir>/snaps/<snapname>/<volname>/<GLUSTERD_BRICK_INFO_DIR>
+ * and every other volume to
+ *     <workdir>/<GLUSTERD_VOLUME_DIR_PREFIX>/<volname>/<GLUSTERD_BRICK_INFO_DIR>
+ *
+ * If snprintf() fails or the result does not fit in PATH_MAX bytes, @path is
+ * reset to the empty string.
+ */
+#define GLUSTERD_GET_BRICK_DIR(path, volinfo, priv)                            \
+    do {                                                                       \
+        int32_t _brick_len;                                                    \
+        if (volinfo->is_snap_volume) {                                         \
+            _brick_len = snprintf(path, PATH_MAX, "%s/snaps/%s/%s/%s",         \
+                                  priv->workdir, volinfo->snapshot->snapname,  \
+                                  volinfo->volname, GLUSTERD_BRICK_INFO_DIR);  \
+        } else {                                                               \
+            _brick_len = snprintf(path, PATH_MAX, "%s/%s/%s/%s",               \
+                                  priv->workdir, GLUSTERD_VOLUME_DIR_PREFIX,   \
+                                  volinfo->volname, GLUSTERD_BRICK_INFO_DIR);  \
+        }                                                                      \
+        if ((_brick_len < 0) || (_brick_len >= PATH_MAX)) {                    \
+            path[0] = 0;                                                       \
+        }                                                                      \
+    } while (0)
+
+/*
+ * Rewrite @str in place so that every '/' becomes '-'.
+ * @str must be a valid, NUL-terminated, writable string.
+ */
+void
+glusterd_replace_slash_with_hyphen(char *str)
+{
+    char *slash = NULL;
+
+    /* Resume each search just past the character that was replaced. */
+    for (slash = strchr(str, '/'); slash != NULL;
+         slash = strchr(slash + 1, '/')) {
+        *slash = '-';
+    }
+}
+
+/*
+ * Create the directory that stores the brick info files of @volinfo.
+ * Returns whatever gf_store_mkdir() returns for that directory.
+ */
+int32_t
+glusterd_store_create_brick_dir(glusterd_volinfo_t *volinfo)
+{
+    glusterd_conf_t *conf = NULL;
+    char brick_dir[PATH_MAX] = {0};
+
+    GF_ASSERT(volinfo);
+
+    conf = THIS->private;
+    GF_ASSERT(conf);
+
+    /* brick_dir is left empty when the path does not fit in PATH_MAX. */
+    GLUSTERD_GET_BRICK_DIR(brick_dir, volinfo, conf);
+
+    return gf_store_mkdir(brick_dir);
+}
+
+/*
+ * Copy the brick's path into @key_vol_brick (capacity @len) and replace
+ * every '/' in the copy with '-'.
+ */
+static void
+glusterd_store_key_vol_brick_set(glusterd_brickinfo_t *brickinfo,
+                                 char *key_vol_brick, size_t len)
+{
+    char *cursor = NULL;
+
+    GF_ASSERT(brickinfo);
+    GF_ASSERT(key_vol_brick);
+    GF_ASSERT(len >= PATH_MAX);
+
+    snprintf(key_vol_brick, len, "%s", brickinfo->path);
+
+    /* Same transformation as glusterd_replace_slash_with_hyphen(). */
+    for (cursor = key_vol_brick; (cursor = strchr(cursor, '/')) != NULL;)
+        *cursor = '-';
+}
+
+/*
+ * Build the brick info file name, "<hostname>:<brick path with '/' replaced
+ * by '-'>", into @brickfname (capacity @len).
+ */
+static void
+glusterd_store_brickinfofname_set(glusterd_brickinfo_t *brickinfo,
+                                  char *brickfname, size_t len)
+{
+    char flat_path[PATH_MAX] = {
+        0,
+    };
+
+    GF_ASSERT(brickfname);
+    GF_ASSERT(brickinfo);
+    GF_ASSERT(len >= PATH_MAX);
+
+    glusterd_store_key_vol_brick_set(brickinfo, flat_path, sizeof(flat_path));
+    snprintf(brickfname, len, "%s:%s", brickinfo->hostname, flat_path);
+}
+
+/*
+ * Build the full path of a brick's info file, "<brick dir>/<brick file name>",
+ * into @brickpath (capacity @len).
+ */
+static void
+glusterd_store_brickinfopath_set(glusterd_volinfo_t *volinfo,
+                                 glusterd_brickinfo_t *brickinfo,
+                                 char *brickpath, size_t len)
+{
+    glusterd_conf_t *conf = NULL;
+    char brick_dir[PATH_MAX] = {0};
+    char brick_file[PATH_MAX] = {
+        0,
+    };
+
+    GF_ASSERT(brickpath);
+    GF_ASSERT(brickinfo);
+    GF_ASSERT(len >= PATH_MAX);
+
+    conf = THIS->private;
+    GF_ASSERT(conf);
+
+    /* Directory part first, then the per-brick file name. */
+    GLUSTERD_GET_BRICK_DIR(brick_dir, volinfo, conf);
+    glusterd_store_brickinfofname_set(brickinfo, brick_file,
+                                      sizeof(brick_file));
+
+    snprintf(brickpath, len, "%s/%s", brick_dir, brick_file);
+}
+
+/*
+ * Build the path of the volume's snapd store file, "<volume dir>/snapd.info",
+ * into @snapd_path (capacity @len).
+ */
+static void
+glusterd_store_snapd_path_set(glusterd_volinfo_t *volinfo, char *snapd_path,
+                              size_t len)
+{
+    glusterd_conf_t *priv = NULL;
+    char vol_dir[PATH_MAX] = {0};
+
+    GF_ASSERT(volinfo);
+    GF_ASSERT(len >= PATH_MAX);
+
+    priv = THIS->private;
+    GF_ASSERT(priv);
+
+    GLUSTERD_GET_VOLUME_DIR(vol_dir, volinfo, priv);
+    snprintf(snapd_path, len, "%s/snapd.info", vol_dir);
+}
+
+/*
+ * Validate the lengths involved in adding @brick to volume @volname.
+ *
+ * @brick is parsed with glusterd_brickinfo_new_from_brick() into a temporary
+ * brickinfo, and a temporary volinfo is created to check that @volname fits
+ * in volinfo->volname. Both temporaries are released before returning.
+ *
+ * Returns 1 when the brick path is shorter than PATH_MAX (one trailing '/'
+ * is not counted) and every '/'-separated component is shorter than
+ * _POSIX_PATH_MAX. Returns 0 otherwise, including when the temporaries
+ * cannot be created or @volname is too long.
+ */
+gf_boolean_t
+glusterd_store_is_valid_brickpath(char *volname, char *brick)
+{
+    glusterd_brickinfo_t *brickinfo = NULL;
+    glusterd_volinfo_t *volinfo = NULL;
+    int32_t ret = 0;
+    size_t volname_len = strlen(volname);
+    xlator_t *this = NULL;
+    size_t bpath_len = 0;
+    size_t max_len = PATH_MAX;
+    const char delim[2] = "/";
+    char *sub_dir = NULL;
+    char *saveptr = NULL;
+    char *brickpath_ptr = NULL;
+
+    this = THIS;
+    GF_ASSERT(this);
+
+    ret = glusterd_brickinfo_new_from_brick(brick, &brickinfo, _gf_false, NULL);
+    if (ret) {
+        gf_msg(this->name, GF_LOG_WARNING, 0, GD_MSG_BRICK_CREATION_FAIL,
+               "Failed to create brick "
+               "info for brick %s",
+               brick);
+        ret = 0;
+        goto out;
+    }
+    ret = glusterd_volinfo_new(&volinfo);
+    if (ret) {
+        gf_msg(this->name, GF_LOG_WARNING, 0, GD_MSG_VOLFILE_CREATE_FAIL,
+               "Failed to create volinfo");
+        ret = 0;
+        goto out;
+    }
+    if (volname_len >= sizeof(volinfo->volname)) {
+        gf_msg(this->name, GF_LOG_WARNING, 0, GD_MSG_NAME_TOO_LONG,
+               "volume name too long");
+        ret = 0;
+        goto out;
+    }
+    memcpy(volinfo->volname, volname, volname_len + 1);
+
+    /* Check whether brickpath is less than PATH_MAX */
+    ret = 1;
+    bpath_len = strlen(brickinfo->path);
+
+    /* A trailing "/" is not counted against the limit. The length guard
+     * keeps an empty path from being indexed at [-1].
+     */
+    if ((bpath_len > 0) && (brickinfo->path[bpath_len - 1] == '/'))
+        max_len = (size_t)PATH_MAX + 1;
+
+    if (bpath_len >= max_len) {
+        ret = 0;
+        goto out;
+    }
+
+    /* The following validation checks whether each sub directory in the
+     * brick path stays below _POSIX_PATH_MAX. strtok_r() modifies the
+     * path in place, which is fine because brickinfo is a temporary.
+     */
+    brickpath_ptr = brickinfo->path;
+    sub_dir = strtok_r(brickpath_ptr, delim, &saveptr);
+
+    while (sub_dir != NULL) {
+        if (strlen(sub_dir) >= _POSIX_PATH_MAX) {
+            ret = 0;
+            goto out;
+        }
+        sub_dir = strtok_r(NULL, delim, &saveptr);
+    }
+
+out:
+    if (brickinfo)
+        glusterd_brickinfo_delete(brickinfo);
+    if (volinfo)
+        glusterd_volinfo_unref(volinfo);
+
+    return ret;
+}
+
+/*
+ * Write the "<key prefix>-<brick_count>=<brick file name>" entry for
+ * @brickinfo to @vol_fd. Thin-arbiter bricks use
+ * GLUSTERD_STORE_KEY_VOL_TA_BRICK as the prefix, all other bricks use
+ * GLUSTERD_STORE_KEY_VOL_BRICK.
+ *
+ * Returns the result of gf_store_save_value().
+ */
+int32_t
+glusterd_store_volinfo_brick_fname_write(int vol_fd,
+                                         glusterd_brickinfo_t *brickinfo,
+                                         int32_t brick_count,
+                                         int is_thin_arbiter)
+{
+    char entry_key[64] = {0};
+    char brick_file[PATH_MAX] = {0};
+    const char *prefix = is_thin_arbiter ? GLUSTERD_STORE_KEY_VOL_TA_BRICK
+                                         : GLUSTERD_STORE_KEY_VOL_BRICK;
+
+    snprintf(entry_key, sizeof(entry_key), "%s-%d", prefix, brick_count);
+    glusterd_store_brickinfofname_set(brickinfo, brick_file,
+                                      sizeof(brick_file));
+
+    return gf_store_save_value(vol_fd, entry_key, brick_file);
+}
+
+/*
+ * Make sure @brickinfo has a store handle for its brick info file, by
+ * handing the file's path to gf_store_handle_create_on_absence().
+ */
+int32_t
+glusterd_store_create_brick_shandle_on_absence(glusterd_volinfo_t *volinfo,
+                                               glusterd_brickinfo_t *brickinfo)
+{
+    char info_path[PATH_MAX] = {0};
+
+    GF_ASSERT(volinfo);
+    GF_ASSERT(brickinfo);
+
+    glusterd_store_brickinfopath_set(volinfo, brickinfo, info_path,
+                                     sizeof(info_path));
+
+    return gf_store_handle_create_on_absence(&brickinfo->shandle, info_path);
+}
+
+/*
+ * Make sure @volinfo has a store handle for its snapd info file, by handing
+ * the file's path to gf_store_handle_create_on_absence().
+ */
+int32_t
+glusterd_store_create_snapd_shandle_on_absence(glusterd_volinfo_t *volinfo)
+{
+    char info_path[PATH_MAX] = {0};
+
+    GF_ASSERT(volinfo);
+
+    glusterd_store_snapd_path_set(volinfo, info_path, sizeof(info_path));
+
+    return gf_store_handle_create_on_absence(&volinfo->snapd.handle,
+                                             info_path);
+}
+
+/* Store the brick's snapshot details only if required.
+ *
+ * Nothing is written (and 0 is returned) while the cluster op-version is
+ * below GD_OP_VERSION_3_6_0.
+ *
+ * Otherwise one "key=value" line is appended to a local buffer for each of
+ * device_path, mount_dir, fstype and mnt_opts that is non-empty, followed
+ * unconditionally by snap_status and statfs_fsid, and the buffer is written
+ * to @fd with a single gf_store_save_items() call.
+ *
+ * Returns 0 on success. Returns -1 when validation fails or the lines do not
+ * fit in the buffer, or the gf_store_save_items() result when the write
+ * fails.
+ */
+static int
+gd_store_brick_snap_details_write(int fd, glusterd_brickinfo_t *brickinfo)
+{
+    int ret = -1;
+    xlator_t *this = NULL;
+    glusterd_conf_t *conf = NULL;
+    char value[5 * PATH_MAX];
+    uint total_len = 0; /* bytes of value[] already filled */
+
+    this = THIS;
+    GF_ASSERT(this != NULL);
+    conf = this->private;
+    GF_VALIDATE_OR_GOTO(this->name, (conf != NULL), out);
+
+    GF_VALIDATE_OR_GOTO(this->name, (fd > 0), out);
+    GF_VALIDATE_OR_GOTO(this->name, (brickinfo != NULL), out);
+
+    if (conf->op_version < GD_OP_VERSION_3_6_0) {
+        ret = 0;
+        goto out;
+    }
+
+    /* Each snprintf() appends at value + total_len. A result that is
+     * negative or does not fit in the remaining space is an error.
+     */
+    if (brickinfo->device_path[0] != '\0') {
+        ret = snprintf(value + total_len, sizeof(value) - total_len, "%s=%s\n",
+                       GLUSTERD_STORE_KEY_BRICK_DEVICE_PATH,
+                       brickinfo->device_path);
+        if (ret < 0 || ret >= sizeof(value) - total_len) {
+            ret = -1;
+            goto err;
+        }
+        total_len += ret;
+    }
+
+    if (brickinfo->mount_dir[0] != '\0') {
+        ret = snprintf(value + total_len, sizeof(value) - total_len, "%s=%s\n",
+                       GLUSTERD_STORE_KEY_BRICK_MOUNT_DIR,
+                       brickinfo->mount_dir);
+        if (ret < 0 || ret >= sizeof(value) - total_len) {
+            ret = -1;
+            goto err;
+        }
+        total_len += ret;
+    }
+
+    if (brickinfo->fstype[0] != '\0') {
+        ret = snprintf(value + total_len, sizeof(value) - total_len, "%s=%s\n",
+                       GLUSTERD_STORE_KEY_BRICK_FSTYPE, brickinfo->fstype);
+        if (ret < 0 || ret >= sizeof(value) - total_len) {
+            ret = -1;
+            goto err;
+        }
+        total_len += ret;
+    }
+
+    if (brickinfo->mnt_opts[0] != '\0') {
+        ret = snprintf(value + total_len, sizeof(value) - total_len, "%s=%s\n",
+                       GLUSTERD_STORE_KEY_BRICK_MNTOPTS, brickinfo->mnt_opts);
+        if (ret < 0 || ret >= sizeof(value) - total_len) {
+            ret = -1;
+            goto err;
+        }
+        total_len += ret;
+    }
+
+    ret = snprintf(value + total_len, sizeof(value) - total_len, "%s=%d\n",
+                   GLUSTERD_STORE_KEY_BRICK_SNAP_STATUS,
+                   brickinfo->snap_status);
+    if (ret < 0 || ret >= sizeof(value) - total_len) {
+        ret = -1;
+        goto err;
+    }
+    total_len += ret;
+
+    /* Last line: total_len is not needed afterwards. */
+    ret = snprintf(value + total_len, sizeof(value) - total_len,
+                   "%s=%" PRIu64 "\n", GLUSTERD_STORE_KEY_BRICK_FSID,
+                   brickinfo->statfs_fsid);
+    if (ret < 0 || ret >= sizeof(value) - total_len) {
+        ret = -1;
+        goto err;
+    }
+
+    ret = gf_store_save_items(fd, value);
+err:
+    if (ret) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_FS_LABEL_UPDATE_FAIL,
+               "Failed to save "
+               "snap details of brick %s",
+               brickinfo->path);
+    }
+out:
+    return ret;
+}
+
+/*
+ * Write the persistent description of @brickinfo to @fd: the core brick
+ * attributes, then the snapshot details, then the volume group name when
+ * one is set.
+ *
+ * Returns 0 on success, -1 when the core attributes do not fit in the local
+ * buffer, or the failing writer's result otherwise.
+ */
+static int32_t
+glusterd_store_brickinfo_write(int fd, glusterd_brickinfo_t *brickinfo)
+{
+    int32_t ret = -1;
+    char items[5 * PATH_MAX];
+
+    GF_ASSERT(brickinfo);
+    GF_ASSERT(fd > 0);
+
+    /* The real-path key is written from brickinfo->path as well. */
+    ret = snprintf(items, sizeof(items),
+                   "%s=%s\n%s=%s\n%s=%s\n%s=%s\n%s=%d\n%s=%d\n%s=%d\n%s=%s\n",
+                   GLUSTERD_STORE_KEY_BRICK_UUID, uuid_utoa(brickinfo->uuid),
+                   GLUSTERD_STORE_KEY_BRICK_HOSTNAME, brickinfo->hostname,
+                   GLUSTERD_STORE_KEY_BRICK_PATH, brickinfo->path,
+                   GLUSTERD_STORE_KEY_BRICK_REAL_PATH, brickinfo->path,
+                   GLUSTERD_STORE_KEY_BRICK_PORT, brickinfo->port,
+                   GLUSTERD_STORE_KEY_BRICK_RDMA_PORT, brickinfo->rdma_port,
+                   GLUSTERD_STORE_KEY_BRICK_DECOMMISSIONED,
+                   brickinfo->decommissioned, GLUSTERD_STORE_KEY_BRICK_ID,
+                   brickinfo->brick_id);
+
+    if (ret < 0 || ret >= sizeof(items)) {
+        ret = -1;
+        goto out;
+    }
+
+    ret = gf_store_save_items(fd, items);
+    if (ret)
+        goto out;
+
+    ret = gd_store_brick_snap_details_write(fd, brickinfo);
+    if (ret)
+        goto out;
+
+    /* The volume group name is optional. */
+    if (brickinfo->vg[0]) {
+        ret = gf_store_save_value(fd, GLUSTERD_STORE_KEY_BRICK_VGNAME,
+                                  brickinfo->vg);
+    }
+out:
+    gf_msg_debug(THIS->name, 0, "Returning %d", ret);
+    return ret;
+}
+
+/*
+ * Write the snapd port of @volinfo to @fd under
+ * GLUSTERD_STORE_KEY_SNAPD_PORT. Returns the gf_store_save_value() result
+ * and logs an error when it is non-zero.
+ */
+int32_t
+glusterd_store_snapd_write(int fd, glusterd_volinfo_t *volinfo)
+{
+    xlator_t *this = NULL;
+    int32_t ret = 0;
+    char port_str[64] = {0};
+
+    GF_ASSERT(volinfo);
+    GF_ASSERT(fd > 0);
+
+    this = THIS;
+    GF_ASSERT(this);
+
+    snprintf(port_str, sizeof(port_str), "%d", volinfo->snapd.port);
+
+    ret = gf_store_save_value(fd, GLUSTERD_STORE_KEY_SNAPD_PORT, port_str);
+    if (ret != 0) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_SNAPD_PORT_STORE_FAIL,
+               "failed to store the snapd port of volume %s",
+               volinfo->volname);
+    }
+
+    gf_msg_debug(this->name, 0, "Returning %d", ret);
+    return ret;
+}
+
+/*
+ * Write @brickinfo into a temporary file obtained from its store handle.
+ * The temporary file is unlinked again if writing fails; it is not renamed
+ * into place here.
+ *
+ * Returns 0 on success, -1 if the temporary file cannot be created, or the
+ * glusterd_store_brickinfo_write() result otherwise.
+ */
+static int32_t
+glusterd_store_perform_brick_store(glusterd_brickinfo_t *brickinfo)
+{
+    int32_t ret = -1;
+    int tmp_fd = -1;
+
+    GF_ASSERT(brickinfo);
+
+    tmp_fd = gf_store_mkstemp(brickinfo->shandle);
+    if (tmp_fd > 0) {
+        ret = glusterd_store_brickinfo_write(tmp_fd, brickinfo);
+        if (ret)
+            gf_store_unlink_tmppath(brickinfo->shandle);
+    } else {
+        ret = -1;
+    }
+
+    gf_msg_debug(THIS->name, 0, "Returning %d", ret);
+    return ret;
+}
+
+/*
+ * Persist the snapd information of @volinfo: write it to a temporary file
+ * obtained from volinfo->snapd.handle, then rename that file into place.
+ * If writing or renaming fails, the temporary file is unlinked.
+ *
+ * Returns 0 on success, -1 if the temporary file cannot be created, or the
+ * failing step's result otherwise.
+ */
+int32_t
+glusterd_store_perform_snapd_store(glusterd_volinfo_t *volinfo)
+{
+    int fd = -1;
+    int32_t ret = -1;
+    xlator_t *this = NULL;
+
+    GF_ASSERT(volinfo);
+
+    this = THIS;
+    GF_ASSERT(this);
+
+    fd = gf_store_mkstemp(volinfo->snapd.handle);
+    if (fd <= 0) {
+        gf_msg(this->name, GF_LOG_ERROR, errno, GD_MSG_FILE_OP_FAILED,
+               "failed to create the "
+               "temporary file for the snapd store handle of volume "
+               "%s",
+               volinfo->volname);
+        goto out;
+    }
+
+    ret = glusterd_store_snapd_write(fd, volinfo);
+    if (ret) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_SNAPD_PORT_STORE_FAIL,
+               "failed to write snapd port "
+               "info to store handle (volume: %s)",
+               volinfo->volname);
+        goto out;
+    }
+
+    ret = gf_store_rename_tmppath(volinfo->snapd.handle);
+
+out:
+    /* Only clean up a temporary file that was actually created. */
+    if (ret && (fd > 0))
+        gf_store_unlink_tmppath(volinfo->snapd.handle);
+    gf_msg_debug(THIS->name, 0, "Returning %d", ret);
+    return ret;
+}
+
+/*
+ * Store one brick of @volinfo in three steps, stopping at the first failure:
+ *   1. record the brick's file name in the volume file open on @vol_fd,
+ *   2. make sure the brick has a store handle,
+ *   3. write the brick's own info file.
+ */
+static int32_t
+glusterd_store_brickinfo(glusterd_volinfo_t *volinfo,
+                         glusterd_brickinfo_t *brickinfo, int32_t brick_count,
+                         int vol_fd, int is_thin_arbiter)
+{
+    int32_t ret = -1;
+
+    GF_ASSERT(volinfo);
+    GF_ASSERT(brickinfo);
+
+    ret = glusterd_store_volinfo_brick_fname_write(
+        vol_fd, brickinfo, brick_count, is_thin_arbiter);
+
+    if (!ret)
+        ret = glusterd_store_create_brick_shandle_on_absence(volinfo,
+                                                             brickinfo);
+    if (!ret)
+        ret = glusterd_store_perform_brick_store(brickinfo);
+
+    gf_msg_debug(THIS->name, 0, "Returning with %d", ret);
+    return ret;
+}
+
+/*
+ * Store the snapd info of @volinfo: ensure the snapd store handle exists,
+ * then write the snapd file through it. On any failure the handle's
+ * temporary path is unlinked.
+ */
+int32_t
+glusterd_store_snapd_info(glusterd_volinfo_t *volinfo)
+{
+    xlator_t *this = NULL;
+    int32_t ret = -1;
+
+    GF_ASSERT(volinfo);
+
+    this = THIS;
+    GF_ASSERT(this);
+
+    ret = glusterd_store_create_snapd_shandle_on_absence(volinfo);
+    if (ret) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_STORE_HANDLE_CREATE_FAIL,
+               "failed to create store handle for snapd (volume: %s)",
+               volinfo->volname);
+    } else {
+        ret = glusterd_store_perform_snapd_store(volinfo);
+        if (ret) {
+            gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_SNAPD_INFO_STORE_FAIL,
+                   "failed to store snapd info of the volume %s",
+                   volinfo->volname);
+        }
+    }
+
+    if (ret)
+        gf_store_unlink_tmppath(volinfo->snapd.handle);
+
+    gf_msg_debug(this->name, 0, "Returning with %d", ret);
+    return ret;
+}
+
+/*
+ * Remove the info file of @brickinfo from
+ * "<delete_path>/<GLUSTERD_BRICK_INFO_DIR>/" and destroy the brick's store
+ * handle.
+ *
+ * The file name is "<hostname>:<brick path with '/' replaced by '-'>".
+ * A file that is already missing (ENOENT) is not treated as an error.
+ *
+ * Returns 0 on success and -1 if the unlink fails for another reason or
+ * memory cannot be allocated. The store handle is destroyed in every case.
+ */
+int32_t
+glusterd_store_delete_brick(glusterd_brickinfo_t *brickinfo, char *delete_path)
+{
+    int32_t ret = -1;
+    glusterd_conf_t *priv = NULL;
+    char brickpath[PATH_MAX] = {
+        0,
+    };
+    char *tmppath = NULL;
+    xlator_t *this = NULL;
+
+    this = THIS;
+    GF_ASSERT(this);
+    GF_ASSERT(brickinfo);
+
+    priv = this->private;
+    GF_ASSERT(priv);
+
+    /* Work on a copy so that brickinfo->path itself is left untouched. */
+    tmppath = gf_strdup(brickinfo->path);
+    if (!tmppath) {
+        gf_smsg(this->name, GF_LOG_ERROR, 0, GD_MSG_NO_MEMORY, NULL);
+        goto out;
+    }
+
+    glusterd_replace_slash_with_hyphen(tmppath);
+
+    snprintf(brickpath, sizeof(brickpath),
+             "%s/" GLUSTERD_BRICK_INFO_DIR "/%s:%s", delete_path,
+             brickinfo->hostname, tmppath);
+
+    GF_FREE(tmppath);
+
+    ret = sys_unlink(brickpath);
+
+    if ((ret < 0) && (errno != ENOENT)) {
+        gf_msg_debug(this->name, 0, "Unlink failed on %s", brickpath);
+        ret = -1;
+        goto out;
+    }
+    ret = 0;
+
+out:
+    if (brickinfo->shandle) {
+        gf_store_handle_destroy(brickinfo->shandle);
+        brickinfo->shandle = NULL;
+    }
+    gf_msg_debug(this->name, 0, "Returning with %d", ret);
+    return ret;
+}
+
+/*
+ * dict_foreach() callback that appends one "key=value\n" line to the buffer
+ * in @data (a glusterd_volinfo_data_store_t). When the line does not fit,
+ * the buffer is first written to the store handle's fd and emptied.
+ *
+ * When dict_data->key_check is 1, a key is stored only if
+ * is_key_glusterd_hooks_friendly() accepts it or
+ * glusterd_check_option_exists() returns 1; other keys are skipped. When
+ * key_check is 0 every key is stored.
+ *
+ * Returns 0 when the entry was buffered or skipped, and -1 when flushing
+ * fails or the entry cannot fit in the buffer at all.
+ *
+ * NOTE(review): the buffer capacity is taken to be VOLINFO_BUFFER_SIZE, as
+ * the flush check already assumes - confirm against the struct definition.
+ */
+static int
+_storeopts(dict_t *dict_value, char *key, data_t *value, void *data)
+{
+    int32_t ret = 0;
+    int32_t exists = 0;
+    int32_t option_len = 0;
+    int32_t remaining = 0;
+    gf_store_handle_t *shandle = NULL;
+    glusterd_volinfo_data_store_t *dict_data = NULL;
+    xlator_t *this = NULL;
+
+    this = THIS;
+    GF_ASSERT(this);
+
+    dict_data = (glusterd_volinfo_data_store_t *)data;
+    shandle = dict_data->shandle;
+
+    GF_ASSERT(shandle);
+    GF_ASSERT(shandle->fd > 0);
+    GF_ASSERT(key);
+    GF_ASSERT(value);
+    GF_ASSERT(value->data);
+
+    if (dict_data->key_check == 1) {
+        if (is_key_glusterd_hooks_friendly(key)) {
+            exists = 1;
+
+        } else {
+            exists = glusterd_check_option_exists(key, NULL);
+        }
+    }
+    if (exists == 1 || dict_data->key_check == 0) {
+        gf_msg_debug(this->name, 0,
+                     "Storing in buffer for volinfo:key= %s, "
+                     "val=%s",
+                     key, value->data);
+    } else {
+        gf_msg_debug(this->name, 0, "Discarding:key= %s, val=%s", key,
+                     value->data);
+        return 0;
+    }
+
+    /*
+     * Length of the formatted entry without its terminating NUL: the key,
+     * '=', the value and '\n'. It is measured with strlen() because the
+     * value is formatted with "%s", so this is exactly what snprintf()
+     * will produce regardless of what value->len records.
+     */
+    option_len = strlen(key) + strlen(value->data) + 2;
+
+    /* Flush what has been collected if the entry (plus NUL) won't fit. */
+    if ((dict_data->buffer_len > 0) &&
+        ((VOLINFO_BUFFER_SIZE - dict_data->buffer_len - 1) < option_len)) {
+        ret = gf_store_save_items(shandle->fd, dict_data->buffer);
+        if (ret) {
+            gf_smsg(this->name, GF_LOG_ERROR, 0, GD_MSG_FILE_OP_FAILED, NULL);
+            return -1;
+        }
+        dict_data->buffer_len = 0;
+        dict_data->buffer[0] = '\0';
+    }
+
+    /*
+     * Bound the write by the space really left in the buffer, so an entry
+     * larger than the whole buffer is rejected instead of overflowing it.
+     */
+    remaining = VOLINFO_BUFFER_SIZE - dict_data->buffer_len;
+    ret = snprintf(dict_data->buffer + dict_data->buffer_len, remaining,
+                   "%s=%s\n", key, value->data);
+    if (ret < 0 || ret >= remaining) {
+        /* Drop the partial entry so the buffer stays consistent. */
+        dict_data->buffer[dict_data->buffer_len] = '\0';
+        gf_smsg(this->name, GF_LOG_ERROR, EINVAL, GD_MSG_COPY_FAIL, NULL);
+        return -1;
+    }
+
+    dict_data->buffer_len += ret;
+
+    return 0;
+}
+
+/* Store the volume's snapshot details only if required.
+ *
+ * Nothing is written (and 0 is returned) while the cluster op-version is
+ * below GD_OP_VERSION_3_6_0. Otherwise the parent volume name, the snapshot
+ * the volume was restored from and the snapshot hard limit are written to
+ * @fd, and the volume's snapd info is stored afterwards.
+ */
+static int
+glusterd_volume_write_snap_details(int fd, glusterd_volinfo_t *volinfo)
+{
+    int ret = -1;
+    xlator_t *this = NULL;
+    glusterd_conf_t *conf = NULL;
+    char items[PATH_MAX] = {0};
+
+    this = THIS;
+    GF_ASSERT(this != NULL);
+    conf = this->private;
+    GF_VALIDATE_OR_GOTO(this->name, (conf != NULL), out);
+
+    GF_VALIDATE_OR_GOTO(this->name, (fd > 0), out);
+    GF_VALIDATE_OR_GOTO(this->name, (volinfo != NULL), out);
+
+    if (conf->op_version < GD_OP_VERSION_3_6_0) {
+        ret = 0;
+        goto out;
+    }
+
+    ret = snprintf(items, sizeof(items), "%s=%s\n%s=%s\n%s=%" PRIu64 "\n",
+                   GLUSTERD_STORE_KEY_PARENT_VOLNAME, volinfo->parent_volname,
+                   GLUSTERD_STORE_KEY_VOL_RESTORED_SNAP,
+                   uuid_utoa(volinfo->restored_from_snap),
+                   GLUSTERD_STORE_KEY_SNAP_MAX_HARD_LIMIT,
+                   volinfo->snap_max_hard_limit);
+
+    /* Each step runs only if the previous one succeeded. */
+    if (ret < 0 || ret >= sizeof(items))
+        ret = -1;
+    else
+        ret = gf_store_save_items(fd, items);
+
+    if (!ret)
+        ret = glusterd_store_snapd_info(volinfo);
+
+    if (ret) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_SNAPINFO_WRITE_FAIL,
+               "Failed to write snap details for volume %s",
+               volinfo->volname);
+    }
+out:
+    return ret;
+}
+
+/*
+ * Write the fixed attributes of @volinfo (type, brick/sub/stripe/replica
+ * counts, status, version, transport, volume id, credentials, op-versions,
+ * ...) to @fd as "key=value" lines, then write the volume's snapshot
+ * details.
+ *
+ * All lines are accumulated in one local buffer and written with a single
+ * gf_store_save_items() call. Several keys are emitted only when the cluster
+ * op-version is high enough and, for the arbiter counts, only when the count
+ * is non-zero.
+ *
+ * Returns 0 on success. Returns -1 if the private conf is missing or the
+ * lines do not fit in the buffer; otherwise the failing writer's result.
+ * Any non-zero result is logged.
+ */
+static int32_t
+glusterd_volume_exclude_options_write(int fd, glusterd_volinfo_t *volinfo)
+{
+    char *str = NULL;
+    char buf[PATH_MAX];
+    uint total_len = 0; /* bytes of buf[] already filled */
+    int32_t ret = -1;
+    xlator_t *this = THIS;
+    glusterd_conf_t *conf = NULL;
+
+    GF_ASSERT(this);
+    GF_ASSERT(fd > 0);
+    GF_ASSERT(volinfo);
+    conf = this->private;
+    GF_VALIDATE_OR_GOTO(this->name, (conf != NULL), out);
+
+    /* Every snprintf() below appends at buf + total_len; a negative result
+     * or one that does not fit in the remaining space aborts with -1. */
+    ret = snprintf(buf + total_len, sizeof(buf) - total_len,
+                   "%s=%d\n%s=%d\n%s=%d\n%s=%d\n%s=%d\n%s=%d\n",
+                   GLUSTERD_STORE_KEY_VOL_TYPE, volinfo->type,
+                   GLUSTERD_STORE_KEY_VOL_COUNT, volinfo->brick_count,
+                   GLUSTERD_STORE_KEY_VOL_STATUS, volinfo->status,
+                   GLUSTERD_STORE_KEY_VOL_SUB_COUNT, volinfo->sub_count,
+                   GLUSTERD_STORE_KEY_VOL_STRIPE_CNT, volinfo->stripe_count,
+                   GLUSTERD_STORE_KEY_VOL_REPLICA_CNT, volinfo->replica_count);
+    if (ret < 0 || ret >= sizeof(buf) - total_len) {
+        ret = -1;
+        goto out;
+    }
+    total_len += ret;
+
+    /* Arbiter count: op-version >= 3.7.6 and only when non-zero. */
+    if ((conf->op_version >= GD_OP_VERSION_3_7_6) && volinfo->arbiter_count) {
+        ret = snprintf(buf + total_len, sizeof(buf) - total_len, "%s=%d\n",
+                       GLUSTERD_STORE_KEY_VOL_ARBITER_CNT,
+                       volinfo->arbiter_count);
+        if (ret < 0 || ret >= sizeof(buf) - total_len) {
+            ret = -1;
+            goto out;
+        }
+        total_len += ret;
+    }
+
+    /* Disperse and redundancy counts: op-version >= 3.6.0. */
+    if (conf->op_version >= GD_OP_VERSION_3_6_0) {
+        ret = snprintf(
+            buf + total_len, sizeof(buf) - total_len, "%s=%d\n%s=%d\n",
+            GLUSTERD_STORE_KEY_VOL_DISPERSE_CNT, volinfo->disperse_count,
+            GLUSTERD_STORE_KEY_VOL_REDUNDANCY_CNT, volinfo->redundancy_count);
+        if (ret < 0 || ret >= sizeof(buf) - total_len) {
+            ret = -1;
+            goto out;
+        }
+        total_len += ret;
+    }
+
+    ret = snprintf(buf + total_len, sizeof(buf) - total_len,
+                   "%s=%d\n%s=%d\n%s=%s\n", GLUSTERD_STORE_KEY_VOL_VERSION,
+                   volinfo->version, GLUSTERD_STORE_KEY_VOL_TRANSPORT,
+                   volinfo->transport_type, GLUSTERD_STORE_KEY_VOL_ID,
+                   uuid_utoa(volinfo->volume_id));
+    if (ret < 0 || ret >= sizeof(buf) - total_len) {
+        ret = -1;
+        goto out;
+    }
+    total_len += ret;
+
+    /* Username and password are written only when the getters return one. */
+    str = glusterd_auth_get_username(volinfo);
+    if (str) {
+        ret = snprintf(buf + total_len, sizeof(buf) - total_len, "%s=%s\n",
+                       GLUSTERD_STORE_KEY_USERNAME, str);
+        if (ret < 0 || ret >= sizeof(buf) - total_len) {
+            ret = -1;
+            goto out;
+        }
+        total_len += ret;
+    }
+
+    str = glusterd_auth_get_password(volinfo);
+    if (str) {
+        ret = snprintf(buf + total_len, sizeof(buf) - total_len, "%s=%s\n",
+                       GLUSTERD_STORE_KEY_PASSWORD, str);
+        if (ret < 0 || ret >= sizeof(buf) - total_len) {
+            ret = -1;
+            goto out;
+        }
+        total_len += ret;
+    }
+
+    ret = snprintf(buf + total_len, sizeof(buf) - total_len, "%s=%d\n%s=%d\n",
+                   GLUSTERD_STORE_KEY_VOL_OP_VERSION, volinfo->op_version,
+                   GLUSTERD_STORE_KEY_VOL_CLIENT_OP_VERSION,
+                   volinfo->client_op_version);
+    if (ret < 0 || ret >= sizeof(buf) - total_len) {
+        ret = -1;
+        goto out;
+    }
+    total_len += ret;
+
+    /* Quota xattr version: op-version >= 3.7.6. */
+    if (conf->op_version >= GD_OP_VERSION_3_7_6) {
+        ret = snprintf(buf + total_len, sizeof(buf) - total_len, "%s=%d\n",
+                       GLUSTERD_STORE_KEY_VOL_QUOTA_VERSION,
+                       volinfo->quota_xattr_version);
+        if (ret < 0 || ret >= sizeof(buf) - total_len) {
+            ret = -1;
+            goto out;
+        }
+        total_len += ret;
+    }
+    /* From op-version 3.10.0 on, the GF_TIER_ENABLED key is always written
+     * with the constant value 0. */
+    if (conf->op_version >= GD_OP_VERSION_3_10_0) {
+        ret = snprintf(buf + total_len, sizeof(buf) - total_len, "%s=0\n",
+                       GF_TIER_ENABLED);
+        if (ret < 0 || ret >= sizeof(buf) - total_len) {
+            ret = -1;
+            goto out;
+        }
+        total_len += ret;
+    }
+
+    /* Thin-arbiter count: op-version >= 7.0 and only when non-zero. */
+    if ((conf->op_version >= GD_OP_VERSION_7_0) &&
+        volinfo->thin_arbiter_count) {
+        ret = snprintf(buf + total_len, sizeof(buf) - total_len, "%s=%d\n",
+                       GLUSTERD_STORE_KEY_VOL_THIN_ARBITER_CNT,
+                       volinfo->thin_arbiter_count);
+        if (ret < 0 || ret >= sizeof(buf) - total_len) {
+            ret = -1;
+            goto out;
+        }
+        total_len += ret;
+    }
+
+    /* One write for everything collected above. */
+    ret = gf_store_save_items(fd, buf);
+    if (ret)
+        goto out;
+
+    ret = glusterd_volume_write_snap_details(fd, volinfo);
+
+out:
+    if (ret)
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_VOL_VALS_WRITE_FAIL,
+               "Unable to write volume "
+               "values for %s",
+               volinfo->volname);
+    return ret;
+}
+
+/*
+ * Fill @voldirpath with the store directory of @volinfo, as produced by
+ * GLUSTERD_GET_VOLUME_DIR().
+ */
+static void
+glusterd_store_voldirpath_set(glusterd_volinfo_t *volinfo, char *voldirpath)
+{
+    glusterd_conf_t *priv;
+
+    GF_ASSERT(volinfo);
+
+    priv = THIS->private;
+    GF_ASSERT(priv);
+
+    GLUSTERD_GET_VOLUME_DIR(voldirpath, volinfo, priv);
+}
+
+/*
+ * Fill @piddirpath with the pid directory of @volinfo, as produced by
+ * GLUSTERD_GET_VOLUME_PID_DIR().
+ */
+static void
+glusterd_store_piddirpath_set(glusterd_volinfo_t *volinfo, char *piddirpath)
+{
+    glusterd_conf_t *priv;
+
+    GF_ASSERT(volinfo);
+
+    priv = THIS->private;
+    GF_ASSERT(priv);
+
+    GLUSTERD_GET_VOLUME_PID_DIR(piddirpath, volinfo, priv);
+}
+
+/*
+ * Create the store directory of @volinfo and then its pid directory.
+ * The pid directory is attempted only if the store directory succeeded.
+ * Returns the result of the last gf_store_mkdir() call made.
+ */
+static int32_t
+glusterd_store_create_volume_dirs(glusterd_volinfo_t *volinfo)
+{
+    char dir[PATH_MAX] = {0};
+    int32_t ret = -1;
+
+    GF_ASSERT(volinfo);
+
+    glusterd_store_voldirpath_set(volinfo, dir);
+    ret = gf_store_mkdir(dir);
+
+    if (!ret) {
+        /* The same buffer is reused for the second path. */
+        glusterd_store_piddirpath_set(volinfo, dir);
+        ret = gf_store_mkdir(dir);
+    }
+
+    gf_msg_debug(THIS->name, 0, "Returning with %d", ret);
+    return ret;
+}
+
+/*
+ * Create the store directory of @snap with mode 0755 via mkdir_p().
+ * Returns the mkdir_p() result and logs an error when it is non-zero.
+ */
+int32_t
+glusterd_store_create_snap_dir(glusterd_snap_t *snap)
+{
+    glusterd_conf_t *priv = NULL;
+    char snap_dir[PATH_MAX] = {0};
+    int32_t ret = -1;
+
+    priv = THIS->private;
+    GF_ASSERT(priv);
+    GF_ASSERT(snap);
+
+    GLUSTERD_GET_SNAP_DIR(snap_dir, snap, priv);
+
+    ret = mkdir_p(snap_dir, 0755, _gf_true);
+    if (ret != 0) {
+        gf_msg(THIS->name, GF_LOG_ERROR, errno, GD_MSG_CREATE_DIR_FAILED,
+               "Failed to create snaps dir %s", snap_dir);
+    }
+
+    return ret;
+}
+
+/*
+ * Write the contents of the volume info file for @volinfo to @fd: first the
+ * fixed attributes, then every entry of volinfo->dict that passes the key
+ * check in _storeopts(), then every entry of volinfo->gsync_slaves
+ * (unchecked).
+ *
+ * While the dictionaries are walked, the store handle's fd is pointed at
+ * @fd so that _storeopts() can flush its buffer; it is reset to 0 before
+ * returning, on success and on failure alike.
+ *
+ * Returns 0 on success and a non-zero value if allocation fails, if any
+ * entry cannot be buffered, or if a write fails.
+ */
+static int32_t
+glusterd_store_volinfo_write(int fd, glusterd_volinfo_t *volinfo)
+{
+    int32_t ret = -1;
+    gf_store_handle_t *shandle = NULL;
+    xlator_t *this = NULL;
+    glusterd_volinfo_data_store_t *dict_data = NULL;
+
+    GF_ASSERT(fd > 0);
+    GF_ASSERT(volinfo);
+    GF_ASSERT(volinfo->shandle);
+
+    this = THIS;
+    GF_ASSERT(this);
+
+    shandle = volinfo->shandle;
+
+    dict_data = GF_CALLOC(1, sizeof(glusterd_volinfo_data_store_t),
+                          gf_gld_mt_volinfo_dict_data_t);
+    if (dict_data == NULL) {
+        gf_smsg(this->name, GF_LOG_ERROR, 0, GD_MSG_NO_MEMORY, NULL);
+        return -1;
+    }
+
+    ret = glusterd_volume_exclude_options_write(fd, volinfo);
+    if (ret) {
+        goto out;
+    }
+
+    dict_data->shandle = shandle;
+    dict_data->key_check = 1;
+
+    shandle->fd = fd;
+
+    /* A negative result means _storeopts() failed on some entry; do not
+     * report success for a partially written file. A missing dictionary
+     * is simply skipped.
+     */
+    if (volinfo->dict &&
+        (dict_foreach(volinfo->dict, _storeopts, (void *)dict_data) < 0)) {
+        ret = -1;
+        goto reset_fd;
+    }
+
+    dict_data->key_check = 0;
+    if (volinfo->gsync_slaves &&
+        (dict_foreach(volinfo->gsync_slaves, _storeopts, (void *)dict_data) <
+         0)) {
+        ret = -1;
+        goto reset_fd;
+    }
+
+    /* Write whatever is still sitting in the buffer. */
+    if (dict_data->buffer_len > 0) {
+        ret = gf_store_save_items(fd, dict_data->buffer);
+        if (ret) {
+            gf_smsg(this->name, GF_LOG_ERROR, 0, GD_MSG_FILE_OP_FAILED, NULL);
+        }
+    }
+
+reset_fd:
+    /* Never leave the caller's fd behind in the handle. */
+    shandle->fd = 0;
+out:
+    GF_FREE(dict_data);
+    gf_msg_debug(this->name, 0, "Returning %d", ret);
+    return ret;
+}
+
+/*
+ * Write the info file contents of @snap (id, status, restored flag, optional
+ * description and timestamp) into a temporary file obtained from the snap's
+ * store handle.
+ *
+ * Returns 0 on success, -1 if the temporary file cannot be created or the
+ * lines do not fit in the local buffer, or the gf_store_save_items() result.
+ */
+static int32_t
+glusterd_store_snapinfo_write(glusterd_snap_t *snap)
+{
+    int32_t ret = -1;
+    int tmp_fd = 0;
+    char items[PATH_MAX];
+    uint used = 0; /* bytes of items[] already filled */
+
+    GF_ASSERT(snap);
+
+    tmp_fd = gf_store_mkstemp(snap->shandle);
+    if (tmp_fd <= 0)
+        goto out;
+
+    /* Mandatory header lines. */
+    ret = snprintf(items + used, sizeof(items) - used, "%s=%s\n%s=%d\n%s=%d\n",
+                   GLUSTERD_STORE_KEY_SNAP_ID, uuid_utoa(snap->snap_id),
+                   GLUSTERD_STORE_KEY_SNAP_STATUS, snap->snap_status,
+                   GLUSTERD_STORE_KEY_SNAP_RESTORED, snap->snap_restored);
+    if (ret < 0 || ret >= sizeof(items) - used) {
+        ret = -1;
+        goto out;
+    }
+    used += ret;
+
+    /* The description is optional. */
+    if (snap->description) {
+        ret = snprintf(items + used, sizeof(items) - used, "%s=%s\n",
+                       GLUSTERD_STORE_KEY_SNAP_DESC, snap->description);
+        if (ret < 0 || ret >= sizeof(items) - used) {
+            ret = -1;
+            goto out;
+        }
+        used += ret;
+    }
+
+    ret = snprintf(items + used, sizeof(items) - used, "%s=%ld\n",
+                   GLUSTERD_STORE_KEY_SNAP_TIMESTAMP, snap->time_stamp);
+    if (ret < 0 || ret >= sizeof(items) - used) {
+        ret = -1;
+        goto out;
+    }
+
+    ret = gf_store_save_items(tmp_fd, items);
+
+out:
+    gf_msg_debug(THIS->name, 0, "Returning %d", ret);
+    return ret;
+}
+
+/* Fill volfpath (capacity len, at most PATH_MAX) with the path of the
+ * volume's info file: "<voldir>/<GLUSTERD_VOLUME_INFO_FILE>", where <voldir>
+ * is whatever glusterd_store_voldirpath_set() produces for volinfo. The
+ * snprintf() result is not checked here. */
+static void
+glusterd_store_volfpath_set(glusterd_volinfo_t *volinfo, char *volfpath,
+                            size_t len)
+{
+    char vol_dir[PATH_MAX] = "";
+
+    GF_ASSERT(volinfo);
+    GF_ASSERT(volfpath);
+    GF_ASSERT(len <= PATH_MAX);
+
+    glusterd_store_voldirpath_set(volinfo, vol_dir);
+    snprintf(volfpath, len, "%s/%s", vol_dir, GLUSTERD_VOLUME_INFO_FILE);
+}
+
+/* Fill node_statepath (capacity len, at most PATH_MAX) with
+ * "<voldir>/<GLUSTERD_NODE_STATE_FILE>" for the given volume. The snprintf()
+ * result is not checked here. */
+static void
+glusterd_store_node_state_path_set(glusterd_volinfo_t *volinfo,
+                                   char *node_statepath, size_t len)
+{
+    char vol_dir[PATH_MAX] = "";
+
+    GF_ASSERT(volinfo);
+    GF_ASSERT(node_statepath);
+    GF_ASSERT(len <= PATH_MAX);
+
+    glusterd_store_voldirpath_set(volinfo, vol_dir);
+    snprintf(node_statepath, len, "%s/%s", vol_dir, GLUSTERD_NODE_STATE_FILE);
+}
+
+/* Fill quota_conf_path (capacity len, at most PATH_MAX) with
+ * "<voldir>/<GLUSTERD_VOLUME_QUOTA_CONFIG>" for the given volume. The
+ * snprintf() result is not checked here. */
+static void
+glusterd_store_quota_conf_path_set(glusterd_volinfo_t *volinfo,
+                                   char *quota_conf_path, size_t len)
+{
+    char vol_dir[PATH_MAX] = "";
+
+    GF_ASSERT(volinfo);
+    GF_ASSERT(quota_conf_path);
+    GF_ASSERT(len <= PATH_MAX);
+
+    glusterd_store_voldirpath_set(volinfo, vol_dir);
+    snprintf(quota_conf_path, len, "%s/%s", vol_dir,
+             GLUSTERD_VOLUME_QUOTA_CONFIG);
+}
+
+/* Fill missed_snaps_list (capacity len, at most PATH_MAX) with
+ * "<workdir>/snaps/<GLUSTERD_MISSED_SNAPS_LIST_FILE>", taking <workdir> from
+ * the current translator's private configuration. */
+static void
+glusterd_store_missed_snaps_list_path_set(char *missed_snaps_list, size_t len)
+{
+    glusterd_conf_t *conf = THIS->private;
+
+    GF_ASSERT(conf);
+    GF_ASSERT(missed_snaps_list);
+    GF_ASSERT(len <= PATH_MAX);
+
+    snprintf(missed_snaps_list, len,
+             "%s/snaps/" GLUSTERD_MISSED_SNAPS_LIST_FILE, conf->workdir);
+}
+
+/* Fill snap_fpath (capacity len, at most PATH_MAX) with
+ * "<workdir>/snaps/<snapname>/<GLUSTERD_SNAP_INFO_FILE>" for the given
+ * snapshot. The snprintf() result is not checked here. */
+static void
+glusterd_store_snapfpath_set(glusterd_snap_t *snap, char *snap_fpath,
+                             size_t len)
+{
+    glusterd_conf_t *conf = THIS->private;
+
+    GF_ASSERT(conf);
+    GF_ASSERT(snap);
+    GF_ASSERT(snap_fpath);
+    GF_ASSERT(len <= PATH_MAX);
+
+    snprintf(snap_fpath, len, "%s/snaps/%s/%s", conf->workdir, snap->snapname,
+             GLUSTERD_SNAP_INFO_FILE);
+}
+
+/* Compute the volume's info-file path and pass it, together with
+ * &volinfo->shandle, to gf_store_handle_create_on_absence(); that call's
+ * result is returned unchanged. */
+int32_t
+glusterd_store_create_vol_shandle_on_absence(glusterd_volinfo_t *volinfo)
+{
+    char info_path[PATH_MAX] = "";
+
+    GF_ASSERT(volinfo);
+
+    glusterd_store_volfpath_set(volinfo, info_path, sizeof(info_path));
+    return gf_store_handle_create_on_absence(&volinfo->shandle, info_path);
+}
+
+/* Compute the volume's node-state file path and pass it, together with
+ * &volinfo->node_state_shandle, to gf_store_handle_create_on_absence(); that
+ * call's result is returned unchanged. */
+int32_t
+glusterd_store_create_nodestate_sh_on_absence(glusterd_volinfo_t *volinfo)
+{
+    char state_path[PATH_MAX] = "";
+
+    GF_ASSERT(volinfo);
+
+    glusterd_store_node_state_path_set(volinfo, state_path,
+                                       sizeof(state_path));
+    return gf_store_handle_create_on_absence(&volinfo->node_state_shandle,
+                                             state_path);
+}
+
+/* Compute the volume's quota configuration path and pass it, together with
+ * &volinfo->quota_conf_shandle, to gf_store_handle_create_on_absence(); that
+ * call's result is returned unchanged. */
+int32_t
+glusterd_store_create_quota_conf_sh_on_absence(glusterd_volinfo_t *volinfo)
+{
+    char conf_path[PATH_MAX] = "";
+
+    GF_ASSERT(volinfo);
+
+    glusterd_store_quota_conf_path_set(volinfo, conf_path, sizeof(conf_path));
+    return gf_store_handle_create_on_absence(&volinfo->quota_conf_shandle,
+                                             conf_path);
+}
+
+/* Compute the missed-snaps list path and pass it, together with
+ * &priv->missed_snaps_list_shandle, to gf_store_handle_create_on_absence();
+ * that call's result is returned unchanged. */
+static int32_t
+glusterd_store_create_missed_snaps_list_shandle_on_absence()
+{
+    char list_path[PATH_MAX] = "";
+    xlator_t *this = THIS;
+    glusterd_conf_t *priv = NULL;
+
+    GF_ASSERT(this);
+    priv = this->private;
+    GF_ASSERT(priv);
+
+    glusterd_store_missed_snaps_list_path_set(list_path, sizeof(list_path));
+
+    return gf_store_handle_create_on_absence(&priv->missed_snaps_list_shandle,
+                                             list_path);
+}
+
+/* Compute the snapshot's info-file path and pass it, together with
+ * &snap->shandle, to gf_store_handle_create_on_absence(); that call's result
+ * is returned unchanged. */
+int32_t
+glusterd_store_create_snap_shandle_on_absence(glusterd_snap_t *snap)
+{
+    char info_path[PATH_MAX] = "";
+
+    GF_ASSERT(snap);
+
+    glusterd_store_snapfpath_set(snap, info_path, sizeof(info_path));
+    return gf_store_handle_create_on_absence(&snap->shandle, info_path);
+}
+
+/* Store every brick of the volume via glusterd_store_brickinfo().
+ *
+ * Regular bricks are passed a running index starting at 0 and a last
+ * argument of 0. When volinfo->thin_arbiter_count is exactly 1, the first
+ * entry of volinfo->ta_bricks is stored as well, with index 0 and a last
+ * argument of 1. Stops at the first non-zero result and returns it;
+ * returns 0 when every call succeeded. */
+static int32_t
+glusterd_store_brickinfos(glusterd_volinfo_t *volinfo, int vol_fd)
+{
+    int32_t ret = 0;
+    glusterd_brickinfo_t *brick = NULL;
+    glusterd_brickinfo_t *ta_brick = NULL;
+    int32_t brick_idx = 0;
+    int32_t ta_idx = 0;
+
+    GF_ASSERT(volinfo);
+
+    cds_list_for_each_entry(brick, &volinfo->bricks, brick_list)
+    {
+        /* The index only advances after the call has consumed it. */
+        ret = glusterd_store_brickinfo(volinfo, brick, brick_idx++, vol_fd, 0);
+        if (ret)
+            goto out;
+    }
+
+    if (volinfo->thin_arbiter_count == 1) {
+        ta_brick = list_first_entry(&volinfo->ta_bricks, glusterd_brickinfo_t,
+                                    brick_list);
+        ret = glusterd_store_brickinfo(volinfo, ta_brick, ta_idx, vol_fd, 1);
+    }
+
+out:
+    gf_msg_debug(THIS->name, 0, "Returning %d", ret);
+    return ret;
+}
+
+/* Write the volume's rebalance state to fd as "key=value" lines.
+ *
+ * Nothing is written (and 0 is returned) when the pending defrag command is
+ * GF_DEFRAG_CMD_STATUS. Otherwise the defrag command/status/op, the
+ * rebalance id and the rebalance counters are formatted into a PATH_MAX
+ * stack buffer and saved with gf_store_save_items(); if
+ * volinfo->rebal.dict is set, its entries are then passed through
+ * _storeopts() and any buffered output is saved as well.
+ *
+ * Returns 0 on success, -1 on formatting overflow or allocation failure,
+ * or the non-zero result of gf_store_save_items(). Every exit goes through
+ * the common "out" label so dict_data is released and the result logged. */
+int32_t
+glusterd_store_node_state_write(int fd, glusterd_volinfo_t *volinfo)
+{
+    int ret = -1;
+    char buf[PATH_MAX];
+    char uuid[UUID_SIZE + 1];
+    uint total_len = 0;
+    glusterd_volinfo_data_store_t *dict_data = NULL;
+    /* Zero-initialised so the callback never sees indeterminate fields;
+     * only fd is filled in below. */
+    gf_store_handle_t shandle = {
+        0,
+    };
+    xlator_t *this = NULL;
+
+    this = THIS;
+    GF_ASSERT(this);
+
+    GF_ASSERT(fd > 0);
+    GF_ASSERT(volinfo);
+
+    if (volinfo->rebal.defrag_cmd == GF_DEFRAG_CMD_STATUS) {
+        ret = 0;
+        goto out;
+    }
+
+    gf_uuid_unparse(volinfo->rebal.rebalance_id, uuid);
+    ret = snprintf(buf + total_len, sizeof(buf) - total_len,
+                   "%s=%d\n%s=%d\n%s=%d\n%s=%s\n",
+                   GLUSTERD_STORE_KEY_VOL_DEFRAG, volinfo->rebal.defrag_cmd,
+                   GLUSTERD_STORE_KEY_VOL_DEFRAG_STATUS,
+                   volinfo->rebal.defrag_status, GLUSTERD_STORE_KEY_DEFRAG_OP,
+                   volinfo->rebal.op, GF_REBALANCE_TID_KEY, uuid);
+    if (ret < 0 || ret >= sizeof(buf) - total_len) {
+        ret = -1;
+        goto out;
+    }
+    total_len += ret;
+
+    ret = snprintf(
+        buf + total_len, sizeof(buf) - total_len,
+        "%s=%" PRIu64 "\n%s=%" PRIu64 "\n%s=%" PRIu64 "\n%s=%" PRIu64
+        "\n%s=%" PRIu64 "\n%s=%lf\n",
+        GLUSTERD_STORE_KEY_VOL_DEFRAG_REB_FILES, volinfo->rebal.rebalance_files,
+        GLUSTERD_STORE_KEY_VOL_DEFRAG_SIZE, volinfo->rebal.rebalance_data,
+        GLUSTERD_STORE_KEY_VOL_DEFRAG_SCANNED, volinfo->rebal.lookedup_files,
+        GLUSTERD_STORE_KEY_VOL_DEFRAG_FAILURES,
+        volinfo->rebal.rebalance_failures,
+        GLUSTERD_STORE_KEY_VOL_DEFRAG_SKIPPED, volinfo->rebal.skipped_files,
+        GLUSTERD_STORE_KEY_VOL_DEFRAG_RUN_TIME, volinfo->rebal.rebalance_time);
+    if (ret < 0 || ret >= sizeof(buf) - total_len) {
+        ret = -1;
+        goto out;
+    }
+
+    ret = gf_store_save_items(fd, buf);
+    if (ret) {
+        goto out;
+    }
+
+    if (volinfo->rebal.dict) {
+        dict_data = GF_CALLOC(1, sizeof(glusterd_volinfo_data_store_t),
+                              gf_gld_mt_volinfo_dict_data_t);
+        if (dict_data == NULL) {
+            gf_smsg(this->name, GF_LOG_ERROR, 0, GD_MSG_NO_MEMORY, NULL);
+            ret = -1;
+            goto out;
+        }
+        dict_data->shandle = &shandle;
+        shandle.fd = fd;
+        dict_foreach(volinfo->rebal.dict, _storeopts, (void *)dict_data);
+        /* Save whatever the callback left buffered. */
+        if (dict_data->buffer_len > 0) {
+            ret = gf_store_save_items(fd, dict_data->buffer);
+            if (ret) {
+                gf_smsg(this->name, GF_LOG_ERROR, 0, GD_MSG_FILE_OP_FAILED,
+                        NULL);
+                goto out;
+            }
+        }
+    }
+out:
+    GF_FREE(dict_data);
+    gf_msg_debug(this->name, 0, "Returning %d", ret);
+    return ret;
+}
+
+/* Write the volume's node state through its store handle: obtain an fd
+ * with gf_store_mkstemp(), write the state with
+ * glusterd_store_node_state_write(), then call gf_store_rename_tmppath().
+ * On any failure after an fd was obtained, gf_store_unlink_tmppath() is
+ * called on the handle. Returns 0 on success, non-zero otherwise. */
+int32_t
+glusterd_store_perform_node_state_store(glusterd_volinfo_t *volinfo)
+{
+    int tmp_fd = -1;
+    int32_t ret = -1;
+
+    GF_ASSERT(volinfo);
+
+    tmp_fd = gf_store_mkstemp(volinfo->node_state_shandle);
+    if (tmp_fd <= 0) {
+        ret = -1;
+        goto out;
+    }
+
+    ret = glusterd_store_node_state_write(tmp_fd, volinfo);
+    if (!ret)
+        ret = gf_store_rename_tmppath(volinfo->node_state_shandle);
+
+out:
+    if (ret && (tmp_fd > 0))
+        gf_store_unlink_tmppath(volinfo->node_state_shandle);
+    gf_msg_debug(THIS->name, 0, "Returning %d", ret);
+    return ret;
+}
+
+/* Write the volume info and all brick infos through the volume's store
+ * handle: obtain an fd with gf_store_mkstemp(), then run
+ * glusterd_store_volinfo_write(), glusterd_store_create_brick_dir() and
+ * glusterd_store_brickinfos() in that order, stopping at the first failure.
+ * No rename is done here. On failure after an fd was obtained,
+ * gf_store_unlink_tmppath() is called on the handle.
+ * Returns 0 on success, non-zero otherwise. */
+static int32_t
+glusterd_store_perform_volume_store(glusterd_volinfo_t *volinfo)
+{
+    int tmp_fd = -1;
+    int32_t ret = -1;
+
+    GF_ASSERT(volinfo);
+
+    tmp_fd = gf_store_mkstemp(volinfo->shandle);
+    if (tmp_fd <= 0) {
+        ret = -1;
+        goto out;
+    }
+
+    ret = glusterd_store_volinfo_write(tmp_fd, volinfo);
+    if (ret)
+        goto out;
+
+    ret = glusterd_store_create_brick_dir(volinfo);
+    if (ret)
+        goto out;
+
+    ret = glusterd_store_brickinfos(volinfo, tmp_fd);
+
+out:
+    if (ret && (tmp_fd > 0))
+        gf_store_unlink_tmppath(volinfo->shandle);
+    gf_msg_debug(THIS->name, 0, "Returning %d", ret);
+    return ret;
+}
+
+/* Apply a version action to volinfo->version: increment it, decrement it,
+ * or leave it untouched for GLUSTERD_VOLINFO_VER_AC_NONE (and for any other
+ * value of ac). */
+void
+glusterd_perform_volinfo_version_action(glusterd_volinfo_t *volinfo,
+                                        glusterd_volinfo_ver_ac_t ac)
+{
+    GF_ASSERT(volinfo);
+
+    if (ac == GLUSTERD_VOLINFO_VER_AC_INCREMENT)
+        volinfo->version++;
+    else if (ac == GLUSTERD_VOLINFO_VER_AC_DECREMENT)
+        volinfo->version--;
+}
+
+/* Call gf_store_unlink_tmppath() on the store handle of every brick in
+ * volinfo->bricks.
+ *
+ * NOTE(review): entries of volinfo->ta_bricks are not visited here, although
+ * glusterd_store_brickinfos() also stores the thin-arbiter brick. Confirm
+ * whether that is intentional. */
+void
+glusterd_store_bricks_cleanup_tmp(glusterd_volinfo_t *volinfo)
+{
+    glusterd_brickinfo_t *brick = NULL;
+
+    GF_ASSERT(volinfo);
+
+    cds_list_for_each_entry(brick, &volinfo->bricks, brick_list)
+    {
+        gf_store_unlink_tmppath(brick->shandle);
+    }
+}
+
+/* Discard pending temporary store files of a volume: first those of its
+ * bricks, then those behind the volume, node-state and snapd handles. */
+void
+glusterd_store_volume_cleanup_tmp(glusterd_volinfo_t *volinfo)
+{
+    GF_ASSERT(volinfo);
+
+    /* Per-brick handles first. */
+    glusterd_store_bricks_cleanup_tmp(volinfo);
+
+    /* Then the volume-level handles. */
+    gf_store_unlink_tmppath(volinfo->shandle);
+    gf_store_unlink_tmppath(volinfo->node_state_shandle);
+    gf_store_unlink_tmppath(volinfo->snapd.handle);
+}
+
+/* Call gf_store_rename_tmppath() on the store handle of every brick in
+ * volinfo->bricks and, when volinfo->thin_arbiter_count is exactly 1, on
+ * the first entry of volinfo->ta_bricks.
+ *
+ * Returns the first non-zero rename result, 0 when every rename succeeded,
+ * and the initial -1 when no rename was attempted at all (no bricks and no
+ * thin-arbiter brick). */
+int32_t
+glusterd_store_brickinfos_atomic_update(glusterd_volinfo_t *volinfo)
+{
+    int ret = -1;
+    glusterd_brickinfo_t *brick = NULL;
+    glusterd_brickinfo_t *ta_brick = NULL;
+
+    GF_ASSERT(volinfo);
+
+    cds_list_for_each_entry(brick, &volinfo->bricks, brick_list)
+    {
+        ret = gf_store_rename_tmppath(brick->shandle);
+        if (ret)
+            return ret;
+    }
+
+    if (volinfo->thin_arbiter_count == 1) {
+        ta_brick = list_first_entry(&volinfo->ta_bricks, glusterd_brickinfo_t,
+                                    brick_list);
+        ret = gf_store_rename_tmppath(ta_brick->shandle);
+    }
+
+    return ret;
+}
+
+/* Call gf_store_rename_tmppath() on the volume's store handle, logging an
+ * error (with errno) when it fails. Returns the rename result. */
+int32_t
+glusterd_store_volinfo_atomic_update(glusterd_volinfo_t *volinfo)
+{
+    int ret = -1;
+
+    GF_ASSERT(volinfo);
+
+    ret = gf_store_rename_tmppath(volinfo->shandle);
+    if (ret)
+        gf_msg(THIS->name, GF_LOG_ERROR, errno, GD_MSG_FILE_OP_FAILED,
+               "Couldn't rename temporary file(s)");
+
+    return ret;
+}
+
+/* Run the brick-level atomic update and, only if it returned 0, the
+ * volume-level one. Returns the result of the last step executed. */
+int32_t
+glusterd_store_volume_atomic_update(glusterd_volinfo_t *volinfo)
+{
+    int ret = -1;
+
+    GF_ASSERT(volinfo);
+
+    ret = glusterd_store_brickinfos_atomic_update(volinfo);
+    if (!ret)
+        ret = glusterd_store_volinfo_atomic_update(volinfo);
+
+    return ret;
+}
+
+/* Call gf_store_rename_tmppath() on the snapshot's store handle, logging an
+ * error (with errno) when it fails. Returns the rename result. */
+int32_t
+glusterd_store_snap_atomic_update(glusterd_snap_t *snap)
+{
+    int ret = -1;
+
+    GF_ASSERT(snap);
+
+    ret = gf_store_rename_tmppath(snap->shandle);
+    if (!ret)
+        return 0;
+
+    gf_msg(THIS->name, GF_LOG_ERROR, errno, GD_MSG_FILE_OP_FAILED,
+           "Couldn't rename temporary file(s)");
+    return ret;
+}
+
+/* Persist a snapshot's metadata in four steps, stopping at the first one
+ * that fails: create the snap directory, make sure the store handle exists,
+ * write the snap info, and perform the atomic update. Each failure is
+ * logged. On failure, if snap->shandle is set, gf_store_unlink_tmppath() is
+ * called on it. Returns 0 on success, non-zero otherwise. */
+int32_t
+glusterd_store_snap(glusterd_snap_t *snap)
+{
+    int32_t ret = -1;
+
+    GF_ASSERT(snap);
+
+    ret = glusterd_store_create_snap_dir(snap);
+    if (ret) {
+        gf_msg(THIS->name, GF_LOG_ERROR, 0, GD_MSG_SNAPDIR_CREATE_FAIL,
+               "Failed to create snap dir");
+        goto cleanup;
+    }
+
+    ret = glusterd_store_create_snap_shandle_on_absence(snap);
+    if (ret) {
+        gf_msg(THIS->name, GF_LOG_ERROR, 0, GD_MSG_SNAPINFO_CREATE_FAIL,
+               "Failed to create snap info file");
+        goto cleanup;
+    }
+
+    ret = glusterd_store_snapinfo_write(snap);
+    if (ret) {
+        gf_msg(THIS->name, GF_LOG_ERROR, 0, GD_MSG_SNAPINFO_WRITE_FAIL,
+               "Failed to write snap info");
+        goto cleanup;
+    }
+
+    ret = glusterd_store_snap_atomic_update(snap);
+    if (ret)
+        gf_msg(THIS->name, GF_LOG_ERROR, 0, GD_MSG_SNAP_AUTOMIC_UPDATE_FAIL,
+               "Failed to do automic update");
+
+cleanup:
+    if (ret && snap->shandle)
+        gf_store_unlink_tmppath(snap->shandle);
+
+    gf_msg_trace(THIS->name, 0, "Returning %d", ret);
+    return ret;
+}
+
+/* Persist a volume's info, brick infos and node state.
+ *
+ * volinfo: volume to store.
+ * ac:      version action applied to volinfo->version before storing.
+ *
+ * The whole sequence runs with ctx->cleanup_lock and then
+ * volinfo->store_volinfo_lock held (released in reverse order). Returns 0
+ * on success; on any failure the first non-zero step result is returned
+ * and, after both locks are dropped, glusterd_store_volume_cleanup_tmp()
+ * is called. */
+int32_t
+glusterd_store_volinfo(glusterd_volinfo_t *volinfo,
+                       glusterd_volinfo_ver_ac_t ac)
+{
+    int32_t ret = -1;
+    glusterfs_ctx_t *ctx = NULL;
+    xlator_t *this = NULL;
+
+    this = THIS;
+    GF_ASSERT(this);
+    ctx = this->ctx;
+    GF_ASSERT(ctx);
+    GF_ASSERT(volinfo);
+
+    pthread_mutex_lock(&ctx->cleanup_lock);
+    pthread_mutex_lock(&volinfo->store_volinfo_lock);
+    {
+        /* The version is adjusted up front so the stored data carries the
+         * new value. */
+        glusterd_perform_volinfo_version_action(volinfo, ac);
+
+        ret = glusterd_store_create_volume_dirs(volinfo);
+        if (ret)
+            goto unlock;
+
+        ret = glusterd_store_create_vol_shandle_on_absence(volinfo);
+        if (ret)
+            goto unlock;
+
+        ret = glusterd_store_create_nodestate_sh_on_absence(volinfo);
+        if (ret)
+            goto unlock;
+
+        ret = glusterd_store_perform_volume_store(volinfo);
+        if (ret)
+            goto unlock;
+
+        ret = glusterd_store_volume_atomic_update(volinfo);
+        if (ret) {
+            /* NOTE(review): this is the only failure path that touches the
+             * version again, and it always decrements regardless of which
+             * action `ac` requested; failures in the earlier steps leave
+             * the version as adjusted above. Confirm this is intended. */
+            glusterd_perform_volinfo_version_action(
+                volinfo, GLUSTERD_VOLINFO_VER_AC_DECREMENT);
+            goto unlock;
+        }
+
+        ret = glusterd_store_perform_node_state_store(volinfo);
+        if (ret)
+            goto unlock;
+
+        /* checksum should be computed at the end */
+        ret = glusterd_compute_cksum(volinfo, _gf_false);
+        if (ret)
+            goto unlock;
+    }
+unlock:
+    pthread_mutex_unlock(&volinfo->store_volinfo_lock);
+    pthread_mutex_unlock(&ctx->cleanup_lock);
+
+    /* Temporary-file cleanup happens outside both locks. */
+    if (ret)
+        glusterd_store_volume_cleanup_tmp(volinfo);
+
+    gf_msg_debug(THIS->name, 0, "Returning %d", ret);
+
+    return ret;
+}
+
+/* Remove a volume's on-disk store directory.
+ *
+ * The volume directory is renamed to
+ * "<workdir>/<GLUSTERD_TRASH>/<volume-id>.deleted" (creating the trash
+ * directory if needed) and the trash directory is then removed with
+ * recursive_rmdir(). volinfo->shandle, if set, is destroyed and reset to
+ * NULL on every path.
+ *
+ * Returns -1 only when the rename itself failed. Every other failure
+ * (path truncation, mkdir failure, rmdir failure) still returns 0, because
+ * the return value is derived solely from rename_fail at the end. */
+int32_t
+glusterd_store_delete_volume(glusterd_volinfo_t *volinfo)
+{
+    char pathname[PATH_MAX] = {
+        0,
+    };
+    int32_t ret = 0;
+    glusterd_conf_t *priv = NULL;
+    char delete_path[PATH_MAX] = {
+        0,
+    };
+    char trashdir[PATH_MAX] = {
+        0,
+    };
+    xlator_t *this = NULL;
+    gf_boolean_t rename_fail = _gf_false;
+    int32_t len = 0;
+
+    this = THIS;
+    GF_ASSERT(this);
+
+    GF_ASSERT(volinfo);
+    priv = this->private;
+
+    GF_ASSERT(priv);
+
+    GLUSTERD_GET_VOLUME_DIR(pathname, volinfo, priv);
+
+    len = snprintf(delete_path, sizeof(delete_path),
+                   "%s/" GLUSTERD_TRASH "/%s.deleted", priv->workdir,
+                   uuid_utoa(volinfo->volume_id));
+    if ((len < 0) || (len >= sizeof(delete_path))) {
+        goto out;
+    }
+
+    len = snprintf(trashdir, sizeof(trashdir), "%s/" GLUSTERD_TRASH,
+                   priv->workdir);
+    if ((len < 0) || (len >= sizeof(trashdir))) {
+        goto out;
+    }
+
+    /* An already existing trash directory is fine. */
+    ret = sys_mkdir(trashdir, 0755);
+    if (ret && errno != EEXIST) {
+        gf_msg(this->name, GF_LOG_ERROR, errno, GD_MSG_CREATE_DIR_FAILED,
+               "Failed to create trash "
+               "directory");
+        goto out;
+    }
+
+    ret = sys_rename(pathname, delete_path);
+    if (ret) {
+        gf_msg(this->name, GF_LOG_ERROR, errno, GD_MSG_DIR_OP_FAILED,
+               "Failed to rename volume "
+               "directory for volume %s",
+               volinfo->volname);
+        rename_fail = _gf_true;
+        goto out;
+    }
+
+    /* Best effort: a failure here is only logged at debug level. */
+    ret = recursive_rmdir(trashdir);
+    if (ret) {
+        gf_msg_debug(this->name, 0, "Failed to rmdir: %s", trashdir);
+    }
+
+out:
+    if (volinfo->shandle) {
+        gf_store_handle_destroy(volinfo->shandle);
+        volinfo->shandle = NULL;
+    }
+    /* Whatever ret held is discarded; only a failed rename is reported. */
+    ret = (rename_fail == _gf_true) ? -1 : 0;
+
+    gf_msg_debug(this->name, 0, "Returning %d", ret);
+    return ret;
+}
+
+/*TODO: cleanup the duplicate code and implement a generic function for
+ * deleting snap/volume depending on the parameter flag */
+/* Remove a snapshot's on-disk store directory.
+ *
+ * The snap directory is renamed to
+ * "<workdir>/<GLUSTERD_TRASH>/snap-<snap-id>.deleted" (creating the trash
+ * directory if needed). Its direct entries are then removed one by one
+ * (sys_rmdir() for directories, sys_unlink() otherwise), followed by the
+ * renamed directory and the trash directory themselves. snap->shandle, if
+ * set, is destroyed and reset to NULL on every path.
+ *
+ * Returns -1 only when the rename itself failed; all other failures are
+ * logged at most and still yield 0, because the return value is derived
+ * solely from rename_fail at the end. */
+int32_t
+glusterd_store_delete_snap(glusterd_snap_t *snap)
+{
+    char pathname[PATH_MAX] = {
+        0,
+    };
+    int32_t ret = 0;
+    glusterd_conf_t *priv = NULL;
+    DIR *dir = NULL;
+    struct dirent *entry = NULL;
+    struct dirent scratch[2] = {
+        {
+            0,
+        },
+    };
+    char path[PATH_MAX] = {
+        0,
+    };
+    char delete_path[PATH_MAX] = {
+        0,
+    };
+    char trashdir[PATH_MAX] = {
+        0,
+    };
+    struct stat st = {
+        0,
+    };
+    xlator_t *this = NULL;
+    gf_boolean_t rename_fail = _gf_false;
+    int32_t len = 0;
+
+    this = THIS;
+    priv = this->private;
+    GF_ASSERT(priv);
+
+    GF_ASSERT(snap);
+    GLUSTERD_GET_SNAP_DIR(pathname, snap, priv);
+
+    len = snprintf(delete_path, sizeof(delete_path),
+                   "%s/" GLUSTERD_TRASH "/snap-%s.deleted", priv->workdir,
+                   uuid_utoa(snap->snap_id));
+    if ((len < 0) || (len >= sizeof(delete_path))) {
+        goto out;
+    }
+
+    len = snprintf(trashdir, sizeof(trashdir), "%s/" GLUSTERD_TRASH,
+                   priv->workdir);
+    if ((len < 0) || (len >= sizeof(trashdir))) {
+        goto out;
+    }
+
+    /* An already existing trash directory is fine. */
+    ret = sys_mkdir(trashdir, 0755);
+    if (ret && errno != EEXIST) {
+        gf_msg(this->name, GF_LOG_ERROR, errno, GD_MSG_CREATE_DIR_FAILED,
+               "Failed to create trash "
+               "directory");
+        goto out;
+    }
+
+    ret = sys_rename(pathname, delete_path);
+    if (ret) {
+        gf_msg(this->name, GF_LOG_ERROR, errno, GD_MSG_DIR_OP_FAILED,
+               "Failed to rename snap "
+               "directory %s to %s",
+               pathname, delete_path);
+        rename_fail = _gf_true;
+        goto out;
+    }
+
+    dir = sys_opendir(delete_path);
+    if (!dir) {
+        gf_msg_debug(this->name, 0, "Failed to open directory %s.",
+                     delete_path);
+        goto out;
+    }
+
+    /* Remove each direct entry; per-entry failures are logged at debug
+     * level and the loop moves on to the next entry. */
+    while ((entry = sys_readdir(dir, scratch))) {
+        if (gf_irrelevant_entry(entry))
+            continue;
+        len = snprintf(path, PATH_MAX, "%s/%s", delete_path, entry->d_name);
+        if ((len < 0) || (len >= PATH_MAX)) {
+            goto stat_failed;
+        }
+        ret = sys_stat(path, &st);
+        if (ret == -1) {
+            gf_msg_debug(this->name, 0,
+                         "Failed to stat "
+                         "entry %s",
+                         path);
+            goto stat_failed;
+        }
+
+        if (S_ISDIR(st.st_mode))
+            ret = sys_rmdir(path);
+        else
+            ret = sys_unlink(path);
+
+        if (ret) {
+            gf_msg_debug(this->name, 0,
+                         " Failed to remove "
+                         "%s",
+                         path);
+        }
+
+        gf_msg_debug(this->name, 0, "%s %s",
+                     ret ? "Failed to remove" : "Removed", entry->d_name);
+    /* Reached both on the skip paths above and by falling through after a
+     * normal iteration: clear the path buffer for the next entry. */
+    stat_failed:
+        memset(path, 0, sizeof(path));
+    }
+
+    ret = sys_closedir(dir);
+    if (ret) {
+        gf_msg_debug(this->name, 0, "Failed to close dir %s.", delete_path);
+    }
+
+    ret = sys_rmdir(delete_path);
+    if (ret) {
+        gf_msg_debug(this->name, 0, "Failed to rmdir: %s", delete_path);
+    }
+    ret = sys_rmdir(trashdir);
+    if (ret) {
+        gf_msg_debug(this->name, 0, "Failed to rmdir: %s", trashdir);
+    }
+
+out:
+    if (snap->shandle) {
+        gf_store_handle_destroy(snap->shandle);
+        snap->shandle = NULL;
+    }
+    /* Whatever ret held is discarded; only a failed rename is reported. */
+    ret = (rename_fail == _gf_true) ? -1 : 0;
+
+    gf_msg_debug(this->name, 0, "Returning %d", ret);
+    return ret;
+}
+
+/* Store glusterd's global info: this node's UUID and the current
+ * op-version, written as two "key=value" lines.
+ *
+ * The store handle for "<workdir>/<GLUSTERD_INFO_FILE>" is created on first
+ * use and cached in conf->handle. The file mode is set to 0644, the two
+ * lines are saved via the fd from gf_store_mkstemp(), and
+ * gf_store_rename_tmppath() completes the update.
+ *
+ * Returns 0 on success and non-zero on failure; every failure is logged
+ * with GD_MSG_GLUSTERD_GLOBAL_INFO_STORE_FAIL before returning. */
+int
+glusterd_store_global_info(xlator_t *this)
+{
+    int ret = -1;
+    glusterd_conf_t *conf = NULL;
+    char buf[PATH_MAX];
+    uint total_len = 0;
+    gf_store_handle_t *handle = NULL;
+    char *uuid_str = NULL;
+
+    conf = this->private;
+
+    /* Private copy of the UUID string, freed at "out". */
+    uuid_str = gf_strdup(uuid_utoa(MY_UUID));
+    if (!uuid_str)
+        goto out;
+
+    if (!conf->handle) {
+        /* buf is used as scratch space for the path here and reused for
+         * the key=value lines further down. */
+        ret = snprintf(buf, sizeof(buf), "%s/%s", conf->workdir,
+                       GLUSTERD_INFO_FILE);
+        if ((ret < 0) || (ret >= sizeof(buf))) {
+            ret = -1;
+            goto out;
+        }
+        ret = gf_store_handle_new(buf, &handle);
+        if (ret) {
+            gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_STORE_HANDLE_GET_FAIL,
+                   "Unable to get store handle");
+            goto out;
+        }
+
+        conf->handle = handle;
+    } else
+        handle = conf->handle;
+
+    /* These options need to be available for all users */
+    ret = sys_chmod(handle->path, 0644);
+    if (ret) {
+        gf_msg(this->name, GF_LOG_ERROR, errno, GD_MSG_FILE_OP_FAILED,
+               "chmod error for %s", GLUSTERD_INFO_FILE);
+        goto out;
+    }
+
+    handle->fd = gf_store_mkstemp(handle);
+    if (handle->fd < 0) {
+        ret = -1;
+        goto out;
+    }
+
+    ret = snprintf(buf, sizeof(buf), "%s=%s\n", GLUSTERD_STORE_UUID_KEY,
+                   uuid_str);
+    if (ret < 0 || ret >= sizeof(buf)) {
+        ret = -1;
+        goto out;
+    }
+    total_len += ret;
+
+    ret = snprintf(buf + total_len, sizeof(buf) - total_len, "%s=%d\n",
+                   GD_OP_VERSION_KEY, conf->op_version);
+    if (ret < 0 || ret >= sizeof(buf) - total_len) {
+        ret = -1;
+        goto out;
+    }
+
+    ret = gf_store_save_items(handle->fd, buf);
+    if (ret) {
+        gf_msg(this->name, GF_LOG_CRITICAL, 0, GD_MSG_OP_VERS_STORE_FAIL,
+               "Storing glusterd global-info failed ret = %d", ret);
+        goto out;
+    }
+
+    ret = gf_store_rename_tmppath(handle);
+out:
+    /* On failure, drop the temporary file. The handle itself stays cached
+     * in conf->handle and is not destroyed here. */
+    if (handle) {
+        if (ret && (handle->fd >= 0))
+            gf_store_unlink_tmppath(handle);
+    }
+
+    if (uuid_str)
+        GF_FREE(uuid_str);
+
+    if (ret)
+        gf_msg(this->name, GF_LOG_ERROR, 0,
+               GD_MSG_GLUSTERD_GLOBAL_INFO_STORE_FAIL,
+               "Failed to store glusterd global-info");
+
+    return ret;
+}
+
+/* Store GD_OP_VERSION_MAX under GD_MAX_OP_VERSION_KEY in
+ * "<workdir>/<GLUSTERD_UPGRADE_FILE>".
+ *
+ * A fresh store handle is created for the file, its mode is set to 0644,
+ * the value is saved via the fd from gf_store_mkstemp() and the update is
+ * completed with gf_store_rename_tmppath(). The handle is destroyed before
+ * returning. Returns 0 on success, non-zero on failure (which is logged). */
+int
+glusterd_store_max_op_version(xlator_t *this)
+{
+    int ret = -1;
+    glusterd_conf_t *conf = this->private;
+    char version_str[15] = "";
+    char upgrade_path[PATH_MAX] = "";
+    gf_store_handle_t *handle = NULL;
+    int32_t len = 0;
+
+    len = snprintf(upgrade_path, PATH_MAX, "%s/%s", conf->workdir,
+                   GLUSTERD_UPGRADE_FILE);
+    if ((len < 0) || (len >= PATH_MAX))
+        goto out;
+
+    ret = gf_store_handle_new(upgrade_path, &handle);
+    if (ret) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_STORE_HANDLE_GET_FAIL,
+               "Unable to get store handle");
+        goto out;
+    }
+
+    /* These options need to be available for all users */
+    ret = sys_chmod(handle->path, 0644);
+    if (ret) {
+        gf_msg(this->name, GF_LOG_ERROR, errno, GD_MSG_FILE_OP_FAILED,
+               "chmod error for %s", GLUSTERD_UPGRADE_FILE);
+        goto out;
+    }
+
+    handle->fd = gf_store_mkstemp(handle);
+    if (handle->fd < 0) {
+        ret = -1;
+        goto out;
+    }
+
+    snprintf(version_str, sizeof(version_str), "%d", GD_OP_VERSION_MAX);
+    ret = gf_store_save_value(handle->fd, GD_MAX_OP_VERSION_KEY, version_str);
+    if (ret) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_OP_VERS_STORE_FAIL,
+               "Storing op-version failed ret = %d", ret);
+        goto out;
+    }
+
+    ret = gf_store_rename_tmppath(handle);
+out:
+    /* Drop the temporary file on failure. */
+    if (handle && ret && (handle->fd >= 0))
+        gf_store_unlink_tmppath(handle);
+
+    if (ret)
+        gf_msg(this->name, GF_LOG_ERROR, 0,
+               GD_MSG_GLUSTERD_GLOBAL_INFO_STORE_FAIL,
+               "Failed to store max op-version");
+
+    if (handle)
+        gf_store_handle_destroy(handle);
+    return ret;
+}
+
+/* Read the stored maximum op-version from
+ * "<workdir>/<GLUSTERD_UPGRADE_FILE>".
+ *
+ * this:       translator whose private data supplies the work directory.
+ * op_version: written only on success.
+ *
+ * Returns 0 when a valid (positive) version was read and stored in
+ * *op_version. Returns non-zero when the path does not fit, the store
+ * handle or the key cannot be retrieved, or the stored value is not a
+ * valid version number; *op_version is left untouched in those cases. */
+int
+glusterd_retrieve_max_op_version(xlator_t *this, int *op_version)
+{
+    char *op_version_str = NULL;
+    glusterd_conf_t *priv = NULL;
+    int ret = -1;
+    int tmp_version = 0;
+    char *tmp = NULL;
+    char path[PATH_MAX] = {
+        0,
+    };
+    gf_store_handle_t *handle = NULL;
+    int32_t len = 0;
+
+    priv = this->private;
+
+    len = snprintf(path, PATH_MAX, "%s/%s", priv->workdir,
+                   GLUSTERD_UPGRADE_FILE);
+    if ((len < 0) || (len >= PATH_MAX)) {
+        goto out;
+    }
+    ret = gf_store_handle_retrieve(path, &handle);
+
+    if (ret) {
+        gf_msg_debug(this->name, 0,
+                     "Unable to get store "
+                     "handle!");
+        goto out;
+    }
+
+    ret = gf_store_retrieve_value(handle, GD_MAX_OP_VERSION_KEY,
+                                  &op_version_str);
+    if (ret) {
+        gf_msg_debug(this->name, 0, "No previous op_version present");
+        goto out;
+    }
+
+    /* At most one unparsed trailing character is tolerated (presumably a
+     * newline -- TODO confirm against the store format). */
+    tmp_version = strtol(op_version_str, &tmp, 10);
+    if ((tmp_version <= 0) || (tmp && strlen(tmp) > 1)) {
+        gf_msg(this->name, GF_LOG_WARNING, EINVAL, GD_MSG_UNSUPPORTED_VERSION,
+               "invalid version number");
+        /* ret is still 0 from the successful retrieve above; without this
+         * the caller would be told *op_version is valid although it was
+         * never written. */
+        ret = -1;
+        goto out;
+    }
+
+    *op_version = tmp_version;
+
+    ret = 0;
+out:
+    if (op_version_str)
+        GF_FREE(op_version_str);
+    if (handle)
+        gf_store_handle_destroy(handle);
+    return ret;
+}
+
+/* Read the stored op-version from "<workdir>/<GLUSTERD_INFO_FILE>".
+ *
+ * The store handle is retrieved on first use and cached in priv->handle;
+ * it is not destroyed here. On success *op_version receives the parsed
+ * value and 0 is returned. A non-zero value is returned when the path does
+ * not fit, or when the handle or the key cannot be retrieved.
+ *
+ * NOTE(review): when the stored value fails validation, the function jumps
+ * to "out" with ret still 0 from the successful
+ * gf_store_retrieve_value() call, so it returns 0 without writing
+ * *op_version. glusterd_restore_op_version() passes a zero-initialised
+ * variable and range-checks it afterwards, which catches this case;
+ * other callers should be checked before changing the return value. */
+int
+glusterd_retrieve_op_version(xlator_t *this, int *op_version)
+{
+    char *op_version_str = NULL;
+    glusterd_conf_t *priv = NULL;
+    int ret = -1;
+    int tmp_version = 0;
+    char *tmp = NULL;
+    char path[PATH_MAX] = {
+        0,
+    };
+    gf_store_handle_t *handle = NULL;
+    int32_t len = 0;
+
+    priv = this->private;
+
+    if (!priv->handle) {
+        len = snprintf(path, PATH_MAX, "%s/%s", priv->workdir,
+                       GLUSTERD_INFO_FILE);
+        if ((len < 0) || (len >= PATH_MAX)) {
+            goto out;
+        }
+        ret = gf_store_handle_retrieve(path, &handle);
+
+        if (ret) {
+            gf_msg_debug(this->name, 0,
+                         "Unable to get store "
+                         "handle!");
+            goto out;
+        }
+
+        priv->handle = handle;
+    }
+
+    ret = gf_store_retrieve_value(priv->handle, GD_OP_VERSION_KEY,
+                                  &op_version_str);
+    if (ret) {
+        gf_msg_debug(this->name, 0, "No previous op_version present");
+        goto out;
+    }
+
+    /* At most one unparsed trailing character is tolerated (presumably a
+     * newline -- TODO confirm against the store format). */
+    tmp_version = strtol(op_version_str, &tmp, 10);
+    if ((tmp_version <= 0) || (tmp && strlen(tmp) > 1)) {
+        gf_msg(this->name, GF_LOG_WARNING, EINVAL, GD_MSG_UNSUPPORTED_VERSION,
+               "invalid version number");
+        goto out;
+    }
+
+    *op_version = tmp_version;
+
+    ret = 0;
+out:
+    if (op_version_str)
+        GF_FREE(op_version_str);
+
+    return ret;
+}
+
+/* Initialise conf->op_version from the store.
+ *
+ * If an op-version can be retrieved it must lie within
+ * [GD_OP_VERSION_MIN, GD_OP_VERSION_MAX]; otherwise -1 is returned. If no
+ * op-version can be retrieved, one is chosen depending on whether a UUID is
+ * present in the store (see below) and 0 is returned. */
+int
+glusterd_restore_op_version(xlator_t *this)
+{
+    glusterd_conf_t *conf = this->private;
+    int stored_version = 0;
+
+    if (glusterd_retrieve_op_version(this, &stored_version) == 0) {
+        if ((stored_version < GD_OP_VERSION_MIN) ||
+            (stored_version > GD_OP_VERSION_MAX)) {
+            gf_msg(this->name, GF_LOG_ERROR, EINVAL, GD_MSG_UNSUPPORTED_VERSION,
+                   "wrong op-version (%d) retrieved", stored_version);
+            return -1;
+        }
+
+        conf->op_version = stored_version;
+        gf_msg("glusterd", GF_LOG_INFO, 0, GD_MSG_OP_VERS_INFO,
+               "retrieved op-version: %d", conf->op_version);
+        return 0;
+    }
+
+    /* op-version can be missing from the store file in 2 cases,
+     * 1. This is a new install of glusterfs
+     * 2. This is an upgrade of glusterfs from a version without op-version
+     *    to a version with op-version (eg. 3.3 -> 3.4)
+     *
+     * A new install is told apart from an upgrade by checking for the
+     * presence of this node's peer-id in the store file. If the peer-id is
+     * present, the installation is an upgrade; otherwise it is a new
+     * install.
+     *
+     * For case 1, set op-version to GD_OP_VERSION_MAX.
+     * For case 2, set op-version to GD_OP_VERSION_MIN.
+     */
+    if (glusterd_retrieve_uuid()) {
+        gf_msg(this->name, GF_LOG_INFO, 0, GD_MSG_OP_VERS_SET_INFO,
+               "Detected new install. Setting"
+               " op-version to maximum : %d",
+               GD_OP_VERSION_MAX);
+        conf->op_version = GD_OP_VERSION_MAX;
+    } else {
+        gf_msg(this->name, GF_LOG_INFO, 0, GD_MSG_OP_VERS_SET_INFO,
+               "Upgrade detected. Setting"
+               " op-version to minimum : %d",
+               GD_OP_VERSION_MIN);
+        conf->op_version = GD_OP_VERSION_MIN;
+    }
+
+    return 0;
+}
+
+/* Read this node's UUID from "<workdir>/<GLUSTERD_INFO_FILE>" into
+ * priv->uuid.
+ *
+ * The store handle is retrieved on first use and cached in priv->handle.
+ * The value lookup itself runs with priv->mutex held. Returns 0 when the
+ * UUID key was found (the string is then parsed into priv->uuid; the parse
+ * result is not checked) and non-zero when the path does not fit or the
+ * handle or key cannot be retrieved. */
+int32_t
+glusterd_retrieve_uuid()
+{
+    char *uuid_str = NULL;
+    int32_t ret = -1;
+    gf_store_handle_t *handle = NULL;
+    glusterd_conf_t *priv = NULL;
+    xlator_t *this = NULL;
+    char path[PATH_MAX] = {
+        0,
+    };
+    int32_t len = 0;
+
+    this = THIS;
+    priv = this->private;
+
+    if (!priv->handle) {
+        len = snprintf(path, PATH_MAX, "%s/%s", priv->workdir,
+                       GLUSTERD_INFO_FILE);
+        if ((len < 0) || (len >= PATH_MAX)) {
+            goto out;
+        }
+        ret = gf_store_handle_retrieve(path, &handle);
+
+        if (ret) {
+            /* The split literal needs the trailing space; without it the
+             * message read "Unable to get storehandle!". */
+            gf_msg_debug(this->name, 0,
+                         "Unable to get store "
+                         "handle!");
+            goto out;
+        }
+
+        priv->handle = handle;
+    }
+    pthread_mutex_lock(&priv->mutex);
+    {
+        ret = gf_store_retrieve_value(priv->handle, GLUSTERD_STORE_UUID_KEY,
+                                      &uuid_str);
+    }
+    pthread_mutex_unlock(&priv->mutex);
+    if (ret) {
+        gf_msg_debug(this->name, 0, "No previous uuid is present");
+        goto out;
+    }
+
+    gf_uuid_parse(uuid_str, priv->uuid);
+
+out:
+    GF_FREE(uuid_str);
+    gf_msg_debug(this->name, 0, "Returning %d", ret);
+    return ret;
+}
+
+/*
+ * Restore the snapshot daemon (snapd) details of @volinfo from its
+ * snapd info store file.
+ *
+ * Nothing is read (and 0 is returned) when the cluster op-version is below
+ * GD_OP_VERSION_3_6_0 or when "features.uss" is not enabled in
+ * volinfo->dict. Otherwise the store file
+ * "<volume dir>/<GLUSTERD_VOLUME_SNAPD_INFO_FILE>" is iterated and the
+ * value of GLUSTERD_STORE_KEY_SNAPD_PORT is stored in volinfo->snapd.port.
+ *
+ * Returns 0 on success and a non-zero value on failure.
+ */
+int
+glusterd_store_retrieve_snapd(glusterd_volinfo_t *volinfo)
+{
+    int ret = -1;
+    char *key = NULL;
+    char *value = NULL;
+    char volpath[PATH_MAX] = {
+        0,
+    };
+    char path[PATH_MAX] = {
+        0,
+    };
+    xlator_t *this = NULL;
+    glusterd_conf_t *conf = NULL;
+    gf_store_iter_t *iter = NULL;
+    gf_store_op_errno_t op_errno = GD_STORE_SUCCESS;
+    int32_t len = 0;
+
+    this = THIS;
+    GF_ASSERT(this);
+    conf = THIS->private;
+    GF_ASSERT(volinfo);
+
+    if (conf->op_version < GD_OP_VERSION_3_6_0) {
+        ret = 0;
+        goto out;
+    }
+
+    /*
+     * This is needed for upgrade situations. Say a volume is created with
+     * an older version of glusterfs and upgraded to a glusterfs version
+     * equal to or greater than GD_OP_VERSION_3_6_0. The older glusterd
+     * would not have created the snapd.info file related to the snapshot
+     * daemon for user serviceable snapshots. So as part of the upgrade,
+     * when the new glusterd starts, as part of restore (restoring the
+     * volume to be precise), it tries to read snapd related info from the
+     * snapd.info file. But since there was no such file till now, the
+     * restore operation fails. Thus, to prevent that from happening, check
+     * whether the user serviceable snapshots feature is enabled before
+     * restoring snapd. If it is disabled, then simply exit by returning
+     * success (without even checking for the snapd.info file).
+     */
+
+    if (!dict_get_str_boolean(volinfo->dict, "features.uss", _gf_false)) {
+        ret = 0;
+        goto out;
+    }
+
+    GLUSTERD_GET_VOLUME_DIR(volpath, volinfo, conf);
+
+    len = snprintf(path, sizeof(path), "%s/%s", volpath,
+                   GLUSTERD_VOLUME_SNAPD_INFO_FILE);
+    /* ret is still -1 here, so an overlong path is reported as failure. */
+    if ((len < 0) || (len >= sizeof(path))) {
+        goto out;
+    }
+
+    ret = gf_store_handle_retrieve(path, &volinfo->snapd.handle);
+    if (ret) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_HANDLE_NULL,
+               "volinfo handle is NULL");
+        goto out;
+    }
+
+    ret = gf_store_iter_new(volinfo->snapd.handle, &iter);
+    if (ret) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_STORE_ITER_GET_FAIL,
+               "Failed to get new store "
+               "iter");
+        goto out;
+    }
+
+    ret = gf_store_iter_get_next(iter, &key, &value, &op_errno);
+    if (ret) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_STORE_ITER_GET_FAIL,
+               "Failed to get next store "
+               "iter");
+        goto out;
+    }
+
+    while (!ret) {
+        if (!strncmp(key, GLUSTERD_STORE_KEY_SNAPD_PORT,
+                     SLEN(GLUSTERD_STORE_KEY_SNAPD_PORT))) {
+            volinfo->snapd.port = atoi(value);
+        }
+
+        /* Release the pair before asking for the next one, as the other
+         * store parsers in this file do; otherwise every entry leaks. */
+        GF_FREE(key);
+        GF_FREE(value);
+        key = NULL;
+        value = NULL;
+
+        ret = gf_store_iter_get_next(iter, &key, &value, &op_errno);
+    }
+
+    /* The loop ends with ret != 0; only a clean end-of-file is success. */
+    if (op_errno != GD_STORE_EOF)
+        goto out;
+
+    ret = 0;
+
+out:
+    /* Both are NULL unless an iteration step left them populated. */
+    GF_FREE(key);
+    GF_FREE(value);
+
+    if (gf_store_iter_destroy(&iter)) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_STORE_ITER_DESTROY_FAIL,
+               "Failed to destroy store iter");
+        ret = -1;
+    }
+
+    return ret;
+}
+
+/*
+ * Rebuild the brick lists of @volinfo from the on-disk store.
+ *
+ * For each of volinfo->brick_count bricks, the name of the brick's store
+ * file is looked up in the volume store under the key
+ * "<GLUSTERD_STORE_KEY_VOL_BRICK>-<n>", the file of that name in the brick
+ * directory is parsed key by key into a newly created brickinfo, and the
+ * brickinfo is appended to volinfo->bricks. When
+ * volinfo->thin_arbiter_count is 1, thin-arbiter brick files are parsed the
+ * same way and appended to volinfo->ta_bricks.
+ *
+ * Values that cannot be converted to a number are logged and otherwise
+ * ignored; a string value that does not fit its destination field aborts
+ * the restore.
+ *
+ * Returns 0 on success and a non-zero value on failure.
+ *
+ * NOTE(review): a brickinfo created for the iteration that fails is not
+ * released on the error path (it has not been linked into a list yet) -
+ * confirm whether the caller tears it down or whether this leaks.
+ */
+int32_t
+glusterd_store_retrieve_bricks(glusterd_volinfo_t *volinfo)
+{
+    int32_t ret = 0;
+    glusterd_brickinfo_t *brickinfo = NULL;
+    glusterd_brickinfo_t *ta_brickinfo = NULL;
+    gf_store_iter_t *iter = NULL;
+    char *key = NULL;
+    char *value = NULL;
+    char brickdir[PATH_MAX] = {
+        0,
+    };
+    char path[PATH_MAX] = {
+        0,
+    };
+    glusterd_conf_t *priv = NULL;
+    int32_t brick_count = 0;
+    int32_t ta_brick_count = 0;
+    char tmpkey[32] = {
+        0,
+    };
+    gf_store_iter_t *tmpiter = NULL;
+    char *tmpvalue = NULL;
+    char abspath[PATH_MAX] = {0};
+    struct pmap_registry *pmap = NULL;
+    xlator_t *this = NULL;
+    int brickid = 0;
+    /* ta_brick_id initialization with 2 since ta-brick id starts with
+     * volname-ta-2
+     */
+    int ta_brick_id = 2;
+    gf_store_op_errno_t op_errno = GD_STORE_SUCCESS;
+    int32_t len = 0;
+
+    GF_ASSERT(volinfo);
+    GF_ASSERT(volinfo->volname);
+
+    this = THIS;
+    priv = this->private;
+
+    GLUSTERD_GET_BRICK_DIR(brickdir, volinfo, priv);
+
+    ret = gf_store_iter_new(volinfo->shandle, &tmpiter);
+
+    if (ret)
+        goto out;
+
+    while (brick_count < volinfo->brick_count) {
+        ret = glusterd_brickinfo_new(&brickinfo);
+
+        if (ret)
+            goto out;
+        snprintf(tmpkey, sizeof(tmpkey), "%s-%d", GLUSTERD_STORE_KEY_VOL_BRICK,
+                 brick_count);
+        ret = gf_store_iter_get_matching(tmpiter, tmpkey, &tmpvalue);
+        if (ret) {
+            /* Without a match tmpvalue is not usable as a file name. */
+            gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_STORE_ITER_GET_FAIL,
+                   "Unable to find key %s in the volume store", tmpkey);
+            goto out;
+        }
+        len = snprintf(path, sizeof(path), "%s/%s", brickdir, tmpvalue);
+        GF_FREE(tmpvalue);
+        tmpvalue = NULL;
+        if ((len < 0) || (len >= sizeof(path))) {
+            ret = -1;
+            goto out;
+        }
+
+        ret = gf_store_handle_retrieve(path, &brickinfo->shandle);
+
+        if (ret)
+            goto out;
+
+        ret = gf_store_iter_new(brickinfo->shandle, &iter);
+
+        if (ret)
+            goto out;
+
+        ret = gf_store_iter_get_next(iter, &key, &value, &op_errno);
+        if (ret) {
+            gf_msg("glusterd", GF_LOG_ERROR, op_errno,
+                   GD_MSG_STORE_ITER_GET_FAIL,
+                   "Unable to iterate "
+                   "the store for brick: %s",
+                   path);
+            goto out;
+        }
+        /* ret is 0 at the top of every pass, so each branch that bails
+         * out must set it explicitly or the failure would be reported as
+         * success. */
+        while (!ret) {
+            if (!strncmp(key, GLUSTERD_STORE_KEY_BRICK_HOSTNAME,
+                         SLEN(GLUSTERD_STORE_KEY_BRICK_HOSTNAME))) {
+                if (snprintf(brickinfo->hostname, sizeof(brickinfo->hostname),
+                             "%s", value) >= sizeof(brickinfo->hostname)) {
+                    gf_msg("glusterd", GF_LOG_ERROR, op_errno,
+                           GD_MSG_PARSE_BRICKINFO_FAIL,
+                           "brick hostname truncated: %s", brickinfo->hostname);
+                    ret = -1;
+                    goto out;
+                }
+            } else if (!strncmp(key, GLUSTERD_STORE_KEY_BRICK_PATH,
+                                SLEN(GLUSTERD_STORE_KEY_BRICK_PATH))) {
+                if (snprintf(brickinfo->path, sizeof(brickinfo->path), "%s",
+                             value) >= sizeof(brickinfo->path)) {
+                    gf_msg("glusterd", GF_LOG_ERROR, op_errno,
+                           GD_MSG_PARSE_BRICKINFO_FAIL,
+                           "brick path truncated: %s", brickinfo->path);
+                    ret = -1;
+                    goto out;
+                }
+            } else if (!strncmp(key, GLUSTERD_STORE_KEY_BRICK_REAL_PATH,
+                                SLEN(GLUSTERD_STORE_KEY_BRICK_REAL_PATH))) {
+                if (snprintf(brickinfo->real_path, sizeof(brickinfo->real_path),
+                             "%s", value) >= sizeof(brickinfo->real_path)) {
+                    gf_msg("glusterd", GF_LOG_ERROR, op_errno,
+                           GD_MSG_PARSE_BRICKINFO_FAIL,
+                           "real_path truncated: %s", brickinfo->real_path);
+                    ret = -1;
+                    goto out;
+                }
+            } else if (!strncmp(key, GLUSTERD_STORE_KEY_BRICK_PORT,
+                                SLEN(GLUSTERD_STORE_KEY_BRICK_PORT))) {
+                ret = gf_string2int(value, &brickinfo->port);
+                if (ret == -1) {
+                    gf_msg(this->name, GF_LOG_ERROR, EINVAL,
+                           GD_MSG_INCOMPATIBLE_VALUE,
+                           "Failed to convert "
+                           "string to integer");
+                }
+
+                if (brickinfo->port < priv->base_port) {
+                    /* This is required to adhere to the
+                       IANA standards */
+                    brickinfo->port = 0;
+                } else {
+                    /* This is required to have proper ports
+                       assigned to bricks after restart */
+                    pmap = pmap_registry_get(THIS);
+                    if (pmap->last_alloc <= brickinfo->port)
+                        pmap->last_alloc = brickinfo->port + 1;
+                }
+            } else if (!strncmp(key, GLUSTERD_STORE_KEY_BRICK_RDMA_PORT,
+                                SLEN(GLUSTERD_STORE_KEY_BRICK_RDMA_PORT))) {
+                ret = gf_string2int(value, &brickinfo->rdma_port);
+                if (ret == -1) {
+                    gf_msg(this->name, GF_LOG_ERROR, EINVAL,
+                           GD_MSG_INCOMPATIBLE_VALUE,
+                           "Failed to convert "
+                           "string to integer");
+                }
+
+                if (brickinfo->rdma_port < priv->base_port) {
+                    /* This is required to adhere to the
+                       IANA standards */
+                    brickinfo->rdma_port = 0;
+                } else {
+                    /* This is required to have proper ports
+                       assigned to bricks after restart */
+                    pmap = pmap_registry_get(THIS);
+                    if (pmap->last_alloc <= brickinfo->rdma_port)
+                        pmap->last_alloc = brickinfo->rdma_port + 1;
+                }
+
+            } else if (!strncmp(
+                           key, GLUSTERD_STORE_KEY_BRICK_DECOMMISSIONED,
+                           SLEN(GLUSTERD_STORE_KEY_BRICK_DECOMMISSIONED))) {
+                ret = gf_string2int(value, &brickinfo->decommissioned);
+                if (ret == -1) {
+                    gf_msg(this->name, GF_LOG_ERROR, EINVAL,
+                           GD_MSG_INCOMPATIBLE_VALUE,
+                           "Failed to convert "
+                           "string to integer");
+                }
+
+            } else if (!strncmp(key, GLUSTERD_STORE_KEY_BRICK_DEVICE_PATH,
+                                SLEN(GLUSTERD_STORE_KEY_BRICK_DEVICE_PATH))) {
+                if (snprintf(brickinfo->device_path,
+                             sizeof(brickinfo->device_path), "%s",
+                             value) >= sizeof(brickinfo->device_path)) {
+                    gf_msg("glusterd", GF_LOG_ERROR, op_errno,
+                           GD_MSG_PARSE_BRICKINFO_FAIL,
+                           "device_path truncated: %s", brickinfo->device_path);
+                    ret = -1;
+                    goto out;
+                }
+            } else if (!strncmp(key, GLUSTERD_STORE_KEY_BRICK_MOUNT_DIR,
+                                SLEN(GLUSTERD_STORE_KEY_BRICK_MOUNT_DIR))) {
+                if (snprintf(brickinfo->mount_dir, sizeof(brickinfo->mount_dir),
+                             "%s", value) >= sizeof(brickinfo->mount_dir)) {
+                    gf_msg("glusterd", GF_LOG_ERROR, op_errno,
+                           GD_MSG_PARSE_BRICKINFO_FAIL,
+                           "mount_dir truncated: %s", brickinfo->mount_dir);
+                    ret = -1;
+                    goto out;
+                }
+            } else if (!strncmp(key, GLUSTERD_STORE_KEY_BRICK_SNAP_STATUS,
+                                SLEN(GLUSTERD_STORE_KEY_BRICK_SNAP_STATUS))) {
+                ret = gf_string2int(value, &brickinfo->snap_status);
+                if (ret == -1) {
+                    gf_msg(this->name, GF_LOG_ERROR, EINVAL,
+                           GD_MSG_INCOMPATIBLE_VALUE,
+                           "Failed to convert "
+                           "string to integer");
+                }
+
+            } else if (!strncmp(key, GLUSTERD_STORE_KEY_BRICK_FSTYPE,
+                                SLEN(GLUSTERD_STORE_KEY_BRICK_FSTYPE))) {
+                if (snprintf(brickinfo->fstype, sizeof(brickinfo->fstype), "%s",
+                             value) >= sizeof(brickinfo->fstype)) {
+                    gf_msg("glusterd", GF_LOG_ERROR, op_errno,
+                           GD_MSG_PARSE_BRICKINFO_FAIL, "fstype truncated: %s",
+                           brickinfo->fstype);
+                    ret = -1;
+                    goto out;
+                }
+            } else if (!strncmp(key, GLUSTERD_STORE_KEY_BRICK_MNTOPTS,
+                                SLEN(GLUSTERD_STORE_KEY_BRICK_MNTOPTS))) {
+                if (snprintf(brickinfo->mnt_opts, sizeof(brickinfo->mnt_opts),
+                             "%s", value) >= sizeof(brickinfo->mnt_opts)) {
+                    gf_msg("glusterd", GF_LOG_ERROR, op_errno,
+                           GD_MSG_PARSE_BRICKINFO_FAIL,
+                           "mnt_opts truncated: %s", brickinfo->mnt_opts);
+                    ret = -1;
+                    goto out;
+                }
+            } else if (!strncmp(key, GLUSTERD_STORE_KEY_BRICK_VGNAME,
+                                SLEN(GLUSTERD_STORE_KEY_BRICK_VGNAME))) {
+                if (snprintf(brickinfo->vg, sizeof(brickinfo->vg), "%s",
+                             value) >= sizeof(brickinfo->vg)) {
+                    gf_msg("glusterd", GF_LOG_ERROR, op_errno,
+                           GD_MSG_PARSE_BRICKINFO_FAIL,
+                           "brickinfo->vg truncated: %s", brickinfo->vg);
+                    ret = -1;
+                    goto out;
+                }
+            } else if (!strcmp(key, GLUSTERD_STORE_KEY_BRICK_ID)) {
+                if (snprintf(brickinfo->brick_id, sizeof(brickinfo->brick_id),
+                             "%s", value) >= sizeof(brickinfo->brick_id)) {
+                    gf_msg("glusterd", GF_LOG_ERROR, op_errno,
+                           GD_MSG_PARSE_BRICKINFO_FAIL,
+                           "brick_id truncated: %s", brickinfo->brick_id);
+                    ret = -1;
+                    goto out;
+                }
+            } else if (!strncmp(key, GLUSTERD_STORE_KEY_BRICK_FSID,
+                                SLEN(GLUSTERD_STORE_KEY_BRICK_FSID))) {
+                ret = gf_string2uint64(value, &brickinfo->statfs_fsid);
+                if (ret) {
+                    gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_INVALID_ENTRY,
+                           "%s "
+                           "is not a valid uint64_t value",
+                           value);
+                }
+
+            } else if (!strcmp(key, GLUSTERD_STORE_KEY_BRICK_UUID)) {
+                gf_uuid_parse(value, brickinfo->uuid);
+            } else {
+                gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_UNKNOWN_KEY,
+                       "Unknown key: %s", key);
+            }
+
+            GF_FREE(key);
+            GF_FREE(value);
+            key = NULL;
+            value = NULL;
+
+            /* Overwrites any conversion error recorded in ret above:
+             * bad numeric values are logged but do not stop the parse. */
+            ret = gf_store_iter_get_next(iter, &key, &value, &op_errno);
+        }
+
+        if (op_errno != GD_STORE_EOF) {
+            gf_msg(this->name, GF_LOG_ERROR, op_errno,
+                   GD_MSG_PARSE_BRICKINFO_FAIL,
+                   "Error parsing brickinfo: "
+                   "op_errno=%d",
+                   op_errno);
+            goto out;
+        }
+
+        /* Done with this brick's store file. Release the iterator now:
+         * the next pass creates a new one into the same variable and the
+         * cleanup at 'out' only ever sees the last one. */
+        if (gf_store_iter_destroy(&iter)) {
+            gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_STORE_ITER_DESTROY_FAIL,
+                   "Failed to destroy store iter");
+            ret = -1;
+            goto out;
+        }
+
+        if (brickinfo->brick_id[0] == '\0') {
+            /* This is an old volume upgraded to op_version 4 */
+            GLUSTERD_ASSIGN_BRICKID_TO_BRICKINFO(brickinfo, volinfo, brickid++);
+        }
+        /* Populate brickinfo->real_path for normal volumes, for
+         * snapshot or snapshot restored volume this would be done post
+         * creating the brick mounts
+         */
+        if (gf_uuid_is_null(brickinfo->uuid))
+            (void)glusterd_resolve_brick(brickinfo);
+        if (brickinfo->real_path[0] == '\0' && !volinfo->is_snap_volume &&
+            gf_uuid_is_null(volinfo->restored_from_snap)) {
+            /* By now if the brick is a local brick then it will be
+             * able to resolve which is the only thing we want now
+             * for checking  whether the brickinfo->uuid matches
+             * with MY_UUID for realpath check. Hence do not handle
+             * error
+             */
+            if (!gf_uuid_compare(brickinfo->uuid, MY_UUID)) {
+                if (!realpath(brickinfo->path, abspath)) {
+                    gf_msg(this->name, GF_LOG_CRITICAL, errno,
+                           GD_MSG_BRICKINFO_CREATE_FAIL,
+                           "realpath() failed for brick %s"
+                           ". The underlying file system "
+                           "may be in bad state",
+                           brickinfo->path);
+                    ret = -1;
+                    goto out;
+                }
+                /* The length check guarantees the strncpy below also
+                 * copies the terminating NUL. */
+                if (strlen(abspath) >= sizeof(brickinfo->real_path)) {
+                    ret = -1;
+                    goto out;
+                }
+                (void)strncpy(brickinfo->real_path, abspath,
+                              sizeof(brickinfo->real_path));
+            }
+        }
+
+        /* Handle upgrade case of shared_brick_count 'fsid' */
+        /* Ideally statfs_fsid should never be 0 if done right */
+        if (!gf_uuid_compare(brickinfo->uuid, MY_UUID) &&
+            brickinfo->statfs_fsid == 0) {
+            struct statvfs brickstat = {
+                0,
+            };
+            ret = sys_statvfs(brickinfo->path, &brickstat);
+            if (ret) {
+                gf_msg(this->name, GF_LOG_WARNING, errno,
+                       GD_MSG_BRICKINFO_CREATE_FAIL,
+                       "failed to get statfs() call on brick %s",
+                       brickinfo->path);
+                /* No need for treating it as an error, lets continue
+                   with just a message */
+            } else {
+                brickinfo->statfs_fsid = brickstat.f_fsid;
+            }
+        }
+
+        cds_list_add_tail(&brickinfo->brick_list, &volinfo->bricks);
+        brick_count++;
+    }
+
+    if (gf_store_iter_destroy(&tmpiter)) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_STORE_ITER_DESTROY_FAIL,
+               "Failed to destroy store iter");
+        ret = -1;
+        goto out;
+    }
+
+    /* Start over on the volume store for the thin-arbiter lookups. */
+    ret = gf_store_iter_new(volinfo->shandle, &tmpiter);
+
+    if (ret)
+        goto out;
+
+    if (volinfo->thin_arbiter_count == 1) {
+        snprintf(tmpkey, sizeof(tmpkey), "%s-%d",
+                 GLUSTERD_STORE_KEY_VOL_TA_BRICK, 0);
+        while (ta_brick_count < volinfo->subvol_count) {
+            ret = glusterd_brickinfo_new(&ta_brickinfo);
+            if (ret)
+                goto out;
+
+            ret = gf_store_iter_get_matching(tmpiter, tmpkey, &tmpvalue);
+            if (ret) {
+                /* Without a match tmpvalue is not usable as a file name. */
+                gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_STORE_ITER_GET_FAIL,
+                       "Unable to find key %s in the volume store", tmpkey);
+                goto out;
+            }
+
+            len = snprintf(path, sizeof(path), "%s/%s", brickdir, tmpvalue);
+            GF_FREE(tmpvalue);
+            tmpvalue = NULL;
+            if ((len < 0) || (len >= sizeof(path))) {
+                ret = -1;
+                goto out;
+            }
+
+            ret = gf_store_handle_retrieve(path, &ta_brickinfo->shandle);
+
+            if (ret)
+                goto out;
+
+            ret = gf_store_iter_new(ta_brickinfo->shandle, &iter);
+
+            if (ret)
+                goto out;
+
+            ret = gf_store_iter_get_next(iter, &key, &value, &op_errno);
+            if (ret) {
+                gf_msg("glusterd", GF_LOG_ERROR, op_errno,
+                       GD_MSG_STORE_ITER_GET_FAIL,
+                       "Unable to iterate "
+                       "the store for brick: %s",
+                       path);
+                goto out;
+            }
+
+            while (!ret) {
+                if (!strncmp(key, GLUSTERD_STORE_KEY_BRICK_HOSTNAME,
+                             SLEN(GLUSTERD_STORE_KEY_BRICK_HOSTNAME))) {
+                    if (snprintf(ta_brickinfo->hostname,
+                                 sizeof(ta_brickinfo->hostname), "%s",
+                                 value) >= sizeof(ta_brickinfo->hostname)) {
+                        gf_msg("glusterd", GF_LOG_ERROR, op_errno,
+                               GD_MSG_PARSE_BRICKINFO_FAIL,
+                               "brick hostname truncated: %s",
+                               ta_brickinfo->hostname);
+                        ret = -1;
+                        goto out;
+                    }
+                } else if (!strncmp(key, GLUSTERD_STORE_KEY_BRICK_PATH,
+                                    SLEN(GLUSTERD_STORE_KEY_BRICK_PATH))) {
+                    if (snprintf(ta_brickinfo->path, sizeof(ta_brickinfo->path),
+                                 "%s", value) >= sizeof(ta_brickinfo->path)) {
+                        gf_msg("glusterd", GF_LOG_ERROR, op_errno,
+                               GD_MSG_PARSE_BRICKINFO_FAIL,
+                               "brick path truncated: %s", ta_brickinfo->path);
+                        ret = -1;
+                        goto out;
+                    }
+                } else if (!strncmp(key, GLUSTERD_STORE_KEY_BRICK_REAL_PATH,
+                                    SLEN(GLUSTERD_STORE_KEY_BRICK_REAL_PATH))) {
+                    if (snprintf(ta_brickinfo->real_path,
+                                 sizeof(ta_brickinfo->real_path), "%s",
+                                 value) >= sizeof(ta_brickinfo->real_path)) {
+                        gf_msg("glusterd", GF_LOG_ERROR, op_errno,
+                               GD_MSG_PARSE_BRICKINFO_FAIL,
+                               "real_path truncated: %s",
+                               ta_brickinfo->real_path);
+                        ret = -1;
+                        goto out;
+                    }
+                } else if (!strncmp(key, GLUSTERD_STORE_KEY_BRICK_PORT,
+                                    SLEN(GLUSTERD_STORE_KEY_BRICK_PORT))) {
+                    ret = gf_string2int(value, &ta_brickinfo->port);
+                    if (ret == -1) {
+                        gf_msg(this->name, GF_LOG_ERROR, EINVAL,
+                               GD_MSG_INCOMPATIBLE_VALUE,
+                               "Failed to convert "
+                               "string to integer");
+                    }
+
+                    if (ta_brickinfo->port < priv->base_port) {
+                        /* This is required to adhere to the
+                        IANA standards */
+                        ta_brickinfo->port = 0;
+                    }
+                } else if (!strncmp(key, GLUSTERD_STORE_KEY_BRICK_RDMA_PORT,
+                                    SLEN(GLUSTERD_STORE_KEY_BRICK_RDMA_PORT))) {
+                    ret = gf_string2int(value, &ta_brickinfo->rdma_port);
+                    if (ret == -1) {
+                        gf_msg(this->name, GF_LOG_ERROR, EINVAL,
+                               GD_MSG_INCOMPATIBLE_VALUE,
+                               "Failed to convert "
+                               "string to integer");
+                    }
+
+                    if (ta_brickinfo->rdma_port < priv->base_port) {
+                        /* This is required to adhere to the
+                        IANA standards */
+                        ta_brickinfo->rdma_port = 0;
+                    }
+                } else if (!strncmp(
+                               key, GLUSTERD_STORE_KEY_BRICK_DECOMMISSIONED,
+                               SLEN(GLUSTERD_STORE_KEY_BRICK_DECOMMISSIONED))) {
+                    ret = gf_string2int(value, &ta_brickinfo->decommissioned);
+                    if (ret == -1) {
+                        gf_msg(this->name, GF_LOG_ERROR, EINVAL,
+                               GD_MSG_INCOMPATIBLE_VALUE,
+                               "Failed to convert "
+                               "string to integer");
+                    }
+
+                } else if (!strcmp(key, GLUSTERD_STORE_KEY_BRICK_ID)) {
+                    if (snprintf(ta_brickinfo->brick_id,
+                                 sizeof(ta_brickinfo->brick_id), "%s",
+                                 value) >= sizeof(ta_brickinfo->brick_id)) {
+                        gf_msg("glusterd", GF_LOG_ERROR, op_errno,
+                               GD_MSG_PARSE_BRICKINFO_FAIL,
+                               "brick_id truncated: %s",
+                               ta_brickinfo->brick_id);
+                        ret = -1;
+                        goto out;
+                    }
+                } else if (!strncmp(key, GLUSTERD_STORE_KEY_BRICK_FSID,
+                                    SLEN(GLUSTERD_STORE_KEY_BRICK_FSID))) {
+                    ret = gf_string2uint64(value, &ta_brickinfo->statfs_fsid);
+                    if (ret) {
+                        gf_msg(this->name, GF_LOG_ERROR, 0,
+                               GD_MSG_INVALID_ENTRY,
+                               "%s "
+                               "is not a valid uint64_t value",
+                               value);
+                    }
+                } else if (!strcmp(key, GLUSTERD_STORE_KEY_BRICK_UUID)) {
+                    /* The UUID belongs to the thin-arbiter brick being
+                     * built, not to the last regular brick. */
+                    gf_uuid_parse(value, ta_brickinfo->uuid);
+                } else if (!strncmp(
+                               key, GLUSTERD_STORE_KEY_BRICK_SNAP_STATUS,
+                               SLEN(GLUSTERD_STORE_KEY_BRICK_SNAP_STATUS))) {
+                    ret = gf_string2int(value, &ta_brickinfo->snap_status);
+                    if (ret == -1) {
+                        gf_msg(this->name, GF_LOG_ERROR, EINVAL,
+                               GD_MSG_INCOMPATIBLE_VALUE,
+                               "Failed to convert "
+                               "string to integer");
+                    }
+
+                } else {
+                    gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_UNKNOWN_KEY,
+                           "Unknown key: %s", key);
+                }
+
+                GF_FREE(key);
+                GF_FREE(value);
+                key = NULL;
+                value = NULL;
+                ret = gf_store_iter_get_next(iter, &key, &value, &op_errno);
+            }
+
+            /* Release this brick's iterator before the next pass
+             * replaces it. */
+            if (gf_store_iter_destroy(&iter)) {
+                gf_msg(this->name, GF_LOG_ERROR, 0,
+                       GD_MSG_STORE_ITER_DESTROY_FAIL,
+                       "Failed to destroy store iter");
+                ret = -1;
+                goto out;
+            }
+
+            GLUSTERD_ASSIGN_BRICKID_TO_TA_BRICKINFO(ta_brickinfo, volinfo,
+                                                    ta_brick_id);
+            ta_brick_id += 3;
+
+            cds_list_add_tail(&ta_brickinfo->brick_list, &volinfo->ta_bricks);
+            ta_brick_count++;
+        }
+    }
+
+    assign_brick_groups(volinfo);
+    ret = 0;
+
+out:
+    /* Non-NULL only when a parse loop was left early through 'goto out'. */
+    GF_FREE(key);
+    GF_FREE(value);
+
+    if (gf_store_iter_destroy(&tmpiter)) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_STORE_ITER_DESTROY_FAIL,
+               "Failed to destroy store iter");
+        ret = -1;
+    }
+
+    if (gf_store_iter_destroy(&iter)) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_STORE_ITER_DESTROY_FAIL,
+               "Failed to destroy store iter");
+        ret = -1;
+    }
+
+    gf_msg_debug(this->name, 0, "Returning with %d", ret);
+
+    return ret;
+}
+
+/*
+ * Restore the rebalance (defrag) state of @volinfo from its node state
+ * store file, "<volume dir>/<GLUSTERD_NODE_STATE_FILE>".
+ *
+ * Recognised keys are parsed into the matching volinfo->rebal fields. Every
+ * other key/value pair is collected in a dict which is published as
+ * volinfo->rebal.dict. On failure any dict held in volinfo->rebal.dict is
+ * released and the field is reset to NULL.
+ *
+ * Returns 0 on success and a non-zero value on failure.
+ */
+int32_t
+glusterd_store_retrieve_node_state(glusterd_volinfo_t *volinfo)
+{
+    int32_t ret = -1;
+    gf_store_iter_t *iter = NULL;
+    char *key = NULL;
+    char *value = NULL;
+    char *dup_value = NULL;
+    char volpath[PATH_MAX] = {
+        0,
+    };
+    glusterd_conf_t *priv = NULL;
+    char path[PATH_MAX] = {
+        0,
+    };
+    gf_store_op_errno_t op_errno = GD_STORE_SUCCESS;
+    dict_t *tmp_dict = NULL;
+    xlator_t *this = NULL;
+    int32_t len = 0;
+
+    this = THIS;
+    GF_ASSERT(this);
+    priv = this->private;
+    GF_ASSERT(priv);
+    GF_ASSERT(volinfo);
+
+    GLUSTERD_GET_VOLUME_DIR(volpath, volinfo, priv);
+    len = snprintf(path, sizeof(path), "%s/%s", volpath,
+                   GLUSTERD_NODE_STATE_FILE);
+    /* ret is still -1 here, so an overlong path is reported as failure. */
+    if ((len < 0) || (len >= PATH_MAX)) {
+        goto out;
+    }
+
+    ret = gf_store_handle_retrieve(path, &volinfo->node_state_shandle);
+    if (ret)
+        goto out;
+
+    ret = gf_store_iter_new(volinfo->node_state_shandle, &iter);
+
+    if (ret)
+        goto out;
+
+    ret = gf_store_iter_get_next(iter, &key, &value, &op_errno);
+
+    if (ret)
+        goto out;
+
+    while (ret == 0) {
+        if (!strncmp(key, GLUSTERD_STORE_KEY_VOL_DEFRAG,
+                     SLEN(GLUSTERD_STORE_KEY_VOL_DEFRAG))) {
+            volinfo->rebal.defrag_cmd = atoi(value);
+        } else if (!strncmp(key, GLUSTERD_STORE_KEY_VOL_DEFRAG_STATUS,
+                            SLEN(GLUSTERD_STORE_KEY_VOL_DEFRAG_STATUS))) {
+            volinfo->rebal.defrag_status = atoi(value);
+        } else if (!strncmp(key, GF_REBALANCE_TID_KEY,
+                            SLEN(GF_REBALANCE_TID_KEY))) {
+            gf_uuid_parse(value, volinfo->rebal.rebalance_id);
+        } else if (!strncmp(key, GLUSTERD_STORE_KEY_DEFRAG_OP,
+                            SLEN(GLUSTERD_STORE_KEY_DEFRAG_OP))) {
+            volinfo->rebal.op = atoi(value);
+        } else if (!strncmp(key, GLUSTERD_STORE_KEY_VOL_DEFRAG_REB_FILES,
+                            SLEN(GLUSTERD_STORE_KEY_VOL_DEFRAG_REB_FILES))) {
+            sscanf(value, "%" PRIu64, &volinfo->rebal.rebalance_files);
+        } else if (!strncmp(key, GLUSTERD_STORE_KEY_VOL_DEFRAG_SIZE,
+                            SLEN(GLUSTERD_STORE_KEY_VOL_DEFRAG_SIZE))) {
+            sscanf(value, "%" PRIu64, &volinfo->rebal.rebalance_data);
+        } else if (!strncmp(key, GLUSTERD_STORE_KEY_VOL_DEFRAG_SCANNED,
+                            SLEN(GLUSTERD_STORE_KEY_VOL_DEFRAG_SCANNED))) {
+            sscanf(value, "%" PRIu64, &volinfo->rebal.lookedup_files);
+        } else if (!strncmp(key, GLUSTERD_STORE_KEY_VOL_DEFRAG_FAILURES,
+                            SLEN(GLUSTERD_STORE_KEY_VOL_DEFRAG_FAILURES))) {
+            sscanf(value, "%" PRIu64, &volinfo->rebal.rebalance_failures);
+        } else if (!strncmp(key, GLUSTERD_STORE_KEY_VOL_DEFRAG_SKIPPED,
+                            SLEN(GLUSTERD_STORE_KEY_VOL_DEFRAG_SKIPPED))) {
+            sscanf(value, "%" PRIu64, &volinfo->rebal.skipped_files);
+        } else if (!strncmp(key, GLUSTERD_STORE_KEY_VOL_DEFRAG_RUN_TIME,
+                            SLEN(GLUSTERD_STORE_KEY_VOL_DEFRAG_RUN_TIME))) {
+            volinfo->rebal.rebalance_time = atoi(value);
+        } else {
+            /* Unrecognised keys are kept verbatim in a dict that is
+             * created lazily on the first such key. */
+            if (!tmp_dict) {
+                tmp_dict = dict_new();
+                if (!tmp_dict) {
+                    ret = -1;
+                    goto out;
+                }
+            }
+            /* 'value' is freed below, so the dict gets its own copy. */
+            dup_value = gf_strdup(value);
+            if (!dup_value) {
+                ret = -1;
+                gf_msg(this->name, GF_LOG_ERROR, ENOMEM, GD_MSG_NO_MEMORY,
+                       "Failed to strdup value string");
+                goto out;
+            }
+            ret = dict_set_str(tmp_dict, key, dup_value);
+            if (ret) {
+                gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_DICT_SET_FAILED,
+                       "Error setting data in rebal "
+                       "dict.");
+                goto out;
+            }
+            /* Stored in the dict now; must not be freed at 'out'. */
+            dup_value = NULL;
+        }
+
+        GF_FREE(key);
+        GF_FREE(value);
+        key = NULL;
+        value = NULL;
+
+        ret = gf_store_iter_get_next(iter, &key, &value, &op_errno);
+    }
+    if (tmp_dict) {
+        volinfo->rebal.dict = dict_ref(tmp_dict);
+    }
+
+    /* The loop ends with ret != 0; only a clean end-of-file is success. */
+    if (op_errno != GD_STORE_EOF) {
+        ret = -1;
+        goto out;
+    }
+
+    ret = 0;
+
+out:
+    /* Non-NULL only when the loop was left early through 'goto out'. */
+    GF_FREE(key);
+    GF_FREE(value);
+
+    if (gf_store_iter_destroy(&iter)) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_STORE_ITER_DESTROY_FAIL,
+               "Failed to destroy store iter");
+        ret = -1;
+    }
+
+    if (dup_value)
+        GF_FREE(dup_value);
+    if (ret) {
+        if (volinfo->rebal.dict) {
+            dict_unref(volinfo->rebal.dict);
+            /* Do not leave volinfo pointing at a dict whose reference
+             * was just dropped. */
+            volinfo->rebal.dict = NULL;
+        }
+    }
+    if (tmp_dict)
+        dict_unref(tmp_dict);
+
+    gf_msg_trace(this->name, 0, "Returning with %d", ret);
+
+    return ret;
+}
+
+/* Load the persisted settings of a volume into its in-memory volinfo.
+ *
+ * Reads <volume dir>/GLUSTERD_VOLUME_INFO_FILE through a store iterator (the
+ * handle is kept in volinfo->shandle). Known keys fill the matching volinfo
+ * field, keys containing "slave" are added to volinfo->gsync_slaves, and any
+ * other key that is hooks-friendly or a known option is added to
+ * volinfo->dict. Counts derived from the volume type are recomputed last.
+ *
+ * Returns 0 once the whole file has been consumed, non-zero on any failure.
+ */
+int
+glusterd_store_update_volinfo(glusterd_volinfo_t *volinfo)
+{
+    int ret = -1;
+    int exists = 0;
+    char *key = NULL;
+    char *value = NULL;
+    char volpath[PATH_MAX] = {
+        0,
+    };
+    char path[PATH_MAX] = {
+        0,
+    };
+    xlator_t *this = NULL;
+    glusterd_conf_t *conf = NULL;
+    gf_store_iter_t *iter = NULL;
+    gf_store_op_errno_t op_errno = GD_STORE_SUCCESS;
+    int32_t len = 0;
+
+    this = THIS;
+    GF_ASSERT(this);
+    conf = THIS->private;
+    GF_ASSERT(volinfo);
+
+    GLUSTERD_GET_VOLUME_DIR(volpath, volinfo, conf);
+
+    len = snprintf(path, sizeof(path), "%s/%s", volpath,
+                   GLUSTERD_VOLUME_INFO_FILE);
+    if ((len < 0) || (len >= sizeof(path))) {
+        goto out;
+    }
+
+    ret = gf_store_handle_retrieve(path, &volinfo->shandle);
+    if (ret) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_HANDLE_NULL,
+               "volinfo handle is NULL");
+        goto out;
+    }
+
+    ret = gf_store_iter_new(volinfo->shandle, &iter);
+    if (ret) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_STORE_ITER_GET_FAIL,
+               "Failed to get new store "
+               "iter");
+        goto out;
+    }
+
+    ret = gf_store_iter_get_next(iter, &key, &value, &op_errno);
+    if (ret) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_STORE_ITER_GET_FAIL,
+               "Failed to get next store "
+               "iter");
+        goto out;
+    }
+
+    /* Each iteration owns one key/value pair handed out by the iterator.
+     * Failures inside the loop must leave through free_pair so that the
+     * pair is released. */
+    while (!ret) {
+        gf_msg_debug(this->name, 0, "key = %s value = %s", key, value);
+        if (!strncmp(key, GLUSTERD_STORE_KEY_VOL_TYPE,
+                     SLEN(GLUSTERD_STORE_KEY_VOL_TYPE))) {
+            volinfo->type = atoi(value);
+        } else if (!strncmp(key, GLUSTERD_STORE_KEY_VOL_COUNT,
+                            SLEN(GLUSTERD_STORE_KEY_VOL_COUNT))) {
+            volinfo->brick_count = atoi(value);
+        } else if (!strncmp(key, GLUSTERD_STORE_KEY_VOL_STATUS,
+                            SLEN(GLUSTERD_STORE_KEY_VOL_STATUS))) {
+            volinfo->status = atoi(value);
+        } else if (!strncmp(key, GLUSTERD_STORE_KEY_VOL_VERSION,
+                            SLEN(GLUSTERD_STORE_KEY_VOL_VERSION))) {
+            volinfo->version = atoi(value);
+        } else if (!strncmp(key, GLUSTERD_STORE_KEY_VOL_PORT,
+                            SLEN(GLUSTERD_STORE_KEY_VOL_PORT))) {
+            volinfo->port = atoi(value);
+        } else if (!strncmp(key, GLUSTERD_STORE_KEY_VOL_SUB_COUNT,
+                            SLEN(GLUSTERD_STORE_KEY_VOL_SUB_COUNT))) {
+            volinfo->sub_count = atoi(value);
+        } else if (!strncmp(key, GLUSTERD_STORE_KEY_VOL_STRIPE_CNT,
+                            SLEN(GLUSTERD_STORE_KEY_VOL_STRIPE_CNT))) {
+            volinfo->stripe_count = atoi(value);
+        } else if (!strncmp(key, GLUSTERD_STORE_KEY_VOL_REPLICA_CNT,
+                            SLEN(GLUSTERD_STORE_KEY_VOL_REPLICA_CNT))) {
+            volinfo->replica_count = atoi(value);
+        } else if (!strcmp(key, GLUSTERD_STORE_KEY_VOL_ARBITER_CNT)) {
+            volinfo->arbiter_count = atoi(value);
+        } else if (!strcmp(key, GLUSTERD_STORE_KEY_VOL_THIN_ARBITER_CNT)) {
+            volinfo->thin_arbiter_count = atoi(value);
+        } else if (!strncmp(key, GLUSTERD_STORE_KEY_VOL_DISPERSE_CNT,
+                            SLEN(GLUSTERD_STORE_KEY_VOL_DISPERSE_CNT))) {
+            volinfo->disperse_count = atoi(value);
+        } else if (!strncmp(key, GLUSTERD_STORE_KEY_VOL_REDUNDANCY_CNT,
+                            SLEN(GLUSTERD_STORE_KEY_VOL_REDUNDANCY_CNT))) {
+            volinfo->redundancy_count = atoi(value);
+        } else if (!strncmp(key, GLUSTERD_STORE_KEY_VOL_TRANSPORT,
+                            SLEN(GLUSTERD_STORE_KEY_VOL_TRANSPORT))) {
+            volinfo->transport_type = atoi(value);
+        } else if (!strncmp(key, GLUSTERD_STORE_KEY_VOL_ID,
+                            SLEN(GLUSTERD_STORE_KEY_VOL_ID))) {
+            /* A bad uuid is only warned about; ret is overwritten by the
+             * next iterator call below. */
+            ret = gf_uuid_parse(value, volinfo->volume_id);
+            if (ret)
+                gf_msg(this->name, GF_LOG_WARNING, 0, GD_MSG_UUID_PARSE_FAIL,
+                       "failed to parse uuid");
+
+        } else if (!strncmp(key, GLUSTERD_STORE_KEY_USERNAME,
+                            SLEN(GLUSTERD_STORE_KEY_USERNAME))) {
+            glusterd_auth_set_username(volinfo, value);
+
+        } else if (!strncmp(key, GLUSTERD_STORE_KEY_PASSWORD,
+                            SLEN(GLUSTERD_STORE_KEY_PASSWORD))) {
+            glusterd_auth_set_password(volinfo, value);
+
+        } else if (strstr(key, "slave")) {
+            ret = dict_set_dynstr(volinfo->gsync_slaves, key, gf_strdup(value));
+            if (ret) {
+                gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_DICT_SET_FAILED,
+                       "Error in "
+                       "dict_set_str");
+                goto free_pair;
+            }
+            gf_msg_debug(this->name, 0,
+                         "Parsed as " GEOREP
+                         " "
+                         " slave:key=%s,value:%s",
+                         key, value);
+
+        } else if (!strncmp(key, GLUSTERD_STORE_KEY_VOL_OP_VERSION,
+                            SLEN(GLUSTERD_STORE_KEY_VOL_OP_VERSION))) {
+            volinfo->op_version = atoi(value);
+        } else if (!strncmp(key, GLUSTERD_STORE_KEY_VOL_CLIENT_OP_VERSION,
+                            SLEN(GLUSTERD_STORE_KEY_VOL_CLIENT_OP_VERSION))) {
+            volinfo->client_op_version = atoi(value);
+        } else if (!strncmp(key, GLUSTERD_STORE_KEY_SNAP_MAX_HARD_LIMIT,
+                            SLEN(GLUSTERD_STORE_KEY_SNAP_MAX_HARD_LIMIT))) {
+            volinfo->snap_max_hard_limit = (uint64_t)atoll(value);
+        } else if (!strncmp(key, GLUSTERD_STORE_KEY_VOL_RESTORED_SNAP,
+                            SLEN(GLUSTERD_STORE_KEY_VOL_RESTORED_SNAP))) {
+            ret = gf_uuid_parse(value, volinfo->restored_from_snap);
+            if (ret)
+                gf_msg(this->name, GF_LOG_WARNING, 0, GD_MSG_UUID_PARSE_FAIL,
+                       "failed to parse restored snap's uuid");
+        } else if (!strncmp(key, GLUSTERD_STORE_KEY_PARENT_VOLNAME,
+                            SLEN(GLUSTERD_STORE_KEY_PARENT_VOLNAME))) {
+            if (snprintf(volinfo->parent_volname,
+                         sizeof(volinfo->parent_volname), "%s",
+                         value) >= sizeof(volinfo->parent_volname)) {
+                gf_msg("glusterd", GF_LOG_ERROR, op_errno,
+                       GD_MSG_PARSE_BRICKINFO_FAIL,
+                       "parent_volname truncated: %s", volinfo->parent_volname);
+                /* ret is still 0 from the successful iterator call, so
+                 * the failure has to be recorded explicitly. */
+                ret = -1;
+                goto free_pair;
+            }
+        } else if (!strncmp(key, GLUSTERD_STORE_KEY_VOL_QUOTA_VERSION,
+                            SLEN(GLUSTERD_STORE_KEY_VOL_QUOTA_VERSION))) {
+            volinfo->quota_xattr_version = atoi(value);
+        } else {
+            if (is_key_glusterd_hooks_friendly(key)) {
+                exists = 1;
+
+            } else {
+                exists = glusterd_check_option_exists(key, NULL);
+            }
+
+            switch (exists) {
+                case -1:
+                    ret = -1;
+                    goto free_pair;
+
+                case 0:
+                    /*Ignore GLUSTERD_STORE_KEY_VOL_BRICK since
+                     glusterd_store_retrieve_bricks gets it later.
+                     also, ignore tier-enabled key as we deprecated
+                     tier xlator.
+                     Both exclusions must hold before warning, hence
+                     the logical AND.*/
+                    if (!strstr(key, GLUSTERD_STORE_KEY_VOL_BRICK) &&
+                        !strstr(key, GF_TIER_ENABLED))
+                        gf_msg(this->name, GF_LOG_WARNING, 0,
+                               GD_MSG_UNKNOWN_KEY, "Unknown key: %s", key);
+                    break;
+
+                case 1:
+                    /*The following strcmp check is to ensure that
+                     * glusterd does not restore the quota limits
+                     * into volinfo->dict post upgradation from 3.3
+                     * to 3.4 as the same limits will now be stored
+                     * in xattrs on the respective directories.
+                     */
+                    if (!strcmp(key, "features.limit-usage"))
+                        break;
+                    /* Use the dynstr setter, as the "slave" branch does,
+                     * so the duplicated value is handed over to the dict
+                     * rather than left without an owner.
+                     * NOTE(review): assumes dict_set_str does not take
+                     * ownership of its value - confirm against dict.h. */
+                    ret = dict_set_dynstr(volinfo->dict, key,
+                                          gf_strdup(value));
+                    if (ret) {
+                        gf_msg(this->name, GF_LOG_ERROR, 0,
+                               GD_MSG_DICT_SET_FAILED,
+                               "Error in "
+                               "dict_set_str");
+                        goto free_pair;
+                    }
+                    gf_msg_debug(this->name, 0,
+                                 "Parsed as Volume-"
+                                 "set:key=%s,value:%s",
+                                 key, value);
+                    break;
+            }
+        }
+
+        GF_FREE(key);
+        GF_FREE(value);
+        key = NULL;
+        value = NULL;
+
+        ret = gf_store_iter_get_next(iter, &key, &value, &op_errno);
+    }
+
+    /* backward compatibility */
+    {
+        switch (volinfo->type) {
+            case GF_CLUSTER_TYPE_NONE:
+                volinfo->stripe_count = 1;
+                volinfo->replica_count = 1;
+                break;
+
+            case GF_CLUSTER_TYPE_REPLICATE:
+                volinfo->stripe_count = 1;
+                volinfo->replica_count = volinfo->sub_count;
+                break;
+
+            case GF_CLUSTER_TYPE_DISPERSE:
+                GF_ASSERT(volinfo->disperse_count > 0);
+                GF_ASSERT(volinfo->redundancy_count > 0);
+                break;
+
+            case GF_CLUSTER_TYPE_STRIPE:
+            case GF_CLUSTER_TYPE_STRIPE_REPLICATE:
+                gf_msg(this->name, GF_LOG_CRITICAL, ENOTSUP,
+                       GD_MSG_VOLINFO_STORE_FAIL,
+                       "The volume type is no more supported. Please refer to "
+                       "glusterfs-6.0 release-notes for how to migrate from "
+                       "this volume type");
+                break;
+
+            default:
+                GF_ASSERT(0);
+                break;
+        }
+
+        volinfo->dist_leaf_count = glusterd_get_dist_leaf_count(volinfo);
+
+        volinfo->subvol_count = (volinfo->brick_count /
+                                 volinfo->dist_leaf_count);
+
+        /* Only calculate volume op-versions if they are not found */
+        if (!volinfo->op_version && !volinfo->client_op_version)
+            gd_update_volume_op_versions(volinfo);
+    }
+
+    /* The loop ended on an iterator error; anything but EOF is a failure
+     * and ret already carries the non-zero iterator status. */
+    if (op_errno != GD_STORE_EOF)
+        goto out;
+
+    ret = 0;
+    goto out;
+
+free_pair:
+    /* Reached only from inside the parse loop, where key and value still
+     * hold the pair handed out by the iterator. */
+    GF_FREE(key);
+    GF_FREE(value);
+
+out:
+    if (gf_store_iter_destroy(&iter)) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_STORE_ITER_DESTROY_FAIL,
+               "Failed to destroy store iter");
+        ret = -1;
+    }
+
+    return ret;
+}
+
+/* Build a volinfo for @volname from the on-disk store and register it.
+ *
+ * @snap is NULL for a regular volume, which is then inserted into
+ * priv->volumes. Otherwise the volume is marked as a snapshot volume and is
+ * attached to the volume named by its parent_volname.
+ *
+ * Returns the new volinfo, or NULL on failure (the partially built volinfo
+ * is unreferenced before returning).
+ */
+glusterd_volinfo_t *
+glusterd_store_retrieve_volume(char *volname, glusterd_snap_t *snap)
+{
+    int32_t ret = -1;
+    glusterd_volinfo_t *volinfo = NULL;
+    glusterd_volinfo_t *origin_volinfo = NULL;
+    glusterd_conf_t *priv = NULL;
+    xlator_t *this = NULL;
+
+    this = THIS;
+    GF_ASSERT(this);
+    priv = this->private;
+    GF_ASSERT(priv);
+    GF_ASSERT(volname);
+
+    ret = glusterd_volinfo_new(&volinfo);
+    if (ret)
+        goto out;
+
+    if (snprintf(volinfo->volname, NAME_MAX + 1, "%s", volname) >=
+        NAME_MAX + 1) {
+        /* ret is 0 here (glusterd_volinfo_new succeeded); without this
+         * the truncated, unregistered volinfo would be returned as if the
+         * retrieval had succeeded. */
+        ret = -1;
+        goto out;
+    }
+    volinfo->snapshot = snap;
+    if (snap)
+        volinfo->is_snap_volume = _gf_true;
+
+    ret = glusterd_store_update_volinfo(volinfo);
+    if (ret) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_VOLINFO_UPDATE_FAIL,
+               "Failed to update volinfo "
+               "for %s volume",
+               volname);
+        goto out;
+    }
+
+    ret = glusterd_store_retrieve_bricks(volinfo);
+    if (ret)
+        goto out;
+
+    ret = glusterd_store_retrieve_snapd(volinfo);
+    if (ret)
+        goto out;
+
+    /* Checksum of the volume info (second argument false), then the quota
+     * related state, then the quota checksum (second argument true). */
+    ret = glusterd_compute_cksum(volinfo, _gf_false);
+    if (ret)
+        goto out;
+
+    ret = glusterd_store_retrieve_quota_version(volinfo);
+    if (ret)
+        goto out;
+
+    ret = glusterd_store_create_quota_conf_sh_on_absence(volinfo);
+    if (ret)
+        goto out;
+
+    ret = glusterd_compute_cksum(volinfo, _gf_true);
+    if (ret)
+        goto out;
+
+    ret = glusterd_store_save_quota_version_and_cksum(volinfo);
+    if (ret)
+        goto out;
+
+    if (!snap) {
+        glusterd_list_add_order(&volinfo->vol_list, &priv->volumes,
+                                glusterd_compare_volume_name);
+
+    } else {
+        ret = glusterd_volinfo_find(volinfo->parent_volname, &origin_volinfo);
+        if (ret) {
+            gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_VOLINFO_GET_FAIL,
+                   "Parent volinfo "
+                   "not found for %s volume",
+                   volname);
+            goto out;
+        }
+        glusterd_list_add_snapvol(origin_volinfo, volinfo);
+    }
+
+out:
+    if (ret) {
+        if (volinfo)
+            glusterd_volinfo_unref(volinfo);
+        volinfo = NULL;
+    }
+
+    gf_msg_trace(this->name, 0, "Returning with %d", ret);
+
+    return volinfo;
+}
+
+/* Format "<workdir>/options" into @path, which holds at most @len bytes. */
+static void
+glusterd_store_set_options_path(glusterd_conf_t *conf, char *path, size_t len)
+{
+    const char *workdir = conf->workdir;
+
+    (void)snprintf(path, len, "%s/options", workdir);
+}
+
+/* Persist the options in @opts to "<workdir>/options".
+ *
+ * The entries are serialised through _storeopts into a temporary file that
+ * is renamed over the real path on success; on failure the temporary file
+ * is unlinked.
+ *
+ * Returns 0 on success, non-zero on failure.
+ */
+int32_t
+glusterd_store_options(xlator_t *this, dict_t *opts)
+{
+    gf_store_handle_t *shandle = NULL;
+    glusterd_conf_t *conf = NULL;
+    char path[PATH_MAX] = {0};
+    int fd = -1;
+    int32_t ret = -1;
+    glusterd_volinfo_data_store_t *dict_data = NULL;
+
+    conf = this->private;
+    glusterd_store_set_options_path(conf, path, sizeof(path));
+
+    ret = gf_store_handle_new(path, &shandle);
+    if (ret) {
+        goto out;
+    }
+
+    fd = gf_store_mkstemp(shandle);
+    if (fd <= 0) {
+        ret = -1;
+        goto out;
+    }
+
+    dict_data = GF_CALLOC(1, sizeof(glusterd_volinfo_data_store_t),
+                          gf_gld_mt_volinfo_dict_data_t);
+    if (dict_data == NULL) {
+        gf_smsg(this->name, GF_LOG_ERROR, 0, GD_MSG_NO_MEMORY, NULL);
+        /* Leave through the common exit so the temporary file and the
+         * store handle are released instead of leaked. */
+        ret = -1;
+        goto out;
+    }
+    dict_data->shandle = shandle;
+    shandle->fd = fd;
+    dict_foreach(opts, _storeopts, (void *)dict_data);
+    /* Write out whatever is still buffered after the walk. */
+    if (dict_data->buffer_len > 0) {
+        ret = gf_store_save_items(fd, dict_data->buffer);
+        if (ret) {
+            gf_smsg(this->name, GF_LOG_ERROR, 0, GD_MSG_FILE_OP_FAILED, NULL);
+            goto out;
+        }
+    }
+
+    ret = gf_store_rename_tmppath(shandle);
+out:
+    GF_FREE(dict_data);
+    /* shandle is still NULL when gf_store_handle_new() failed, so it must
+     * not be dereferenced unconditionally. */
+    if (shandle) {
+        shandle->fd = 0;
+        if ((ret < 0) && (fd > 0)) {
+            gf_store_unlink_tmppath(shandle);
+        }
+        gf_store_handle_destroy(shandle);
+    }
+    return ret;
+}
+
+/* Read "<workdir>/options" and copy every key/value pair into conf->opts.
+ *
+ * Returns 0 when the file was read up to EOF, non-zero otherwise.
+ */
+int32_t
+glusterd_store_retrieve_options(xlator_t *this)
+{
+    glusterd_conf_t *conf = this->private;
+    gf_store_handle_t *shandle = NULL;
+    gf_store_iter_t *iter = NULL;
+    gf_store_op_errno_t op_errno = 0;
+    char path[PATH_MAX] = {0};
+    char *key = NULL;
+    char *value = NULL;
+    int ret = -1;
+
+    glusterd_store_set_options_path(conf, path, sizeof(path));
+
+    ret = gf_store_handle_retrieve(path, &shandle);
+    if (ret)
+        goto out;
+
+    ret = gf_store_iter_new(shandle, &iter);
+    if (ret)
+        goto out;
+
+    for (ret = gf_store_iter_get_next(iter, &key, &value, &op_errno);
+         ret == 0;
+         ret = gf_store_iter_get_next(iter, &key, &value, &op_errno)) {
+        ret = dict_set_dynstr(conf->opts, key, value);
+
+        /* The key is released whether or not the insert worked; the value
+         * is released here only when the insert failed. */
+        GF_FREE(key);
+        if (ret) {
+            GF_FREE(value);
+            goto out;
+        }
+        key = NULL;
+        value = NULL;
+    }
+
+    /* Running into EOF is the only clean way out of the loop above. */
+    if (op_errno == GD_STORE_EOF)
+        ret = 0;
+out:
+    (void)gf_store_iter_destroy(&iter);
+    gf_store_handle_destroy(shandle);
+    return ret;
+}
+
+/* Restore every volume found in the store directory.
+ *
+ * Scans "<workdir>/snaps/<snapname>" when @snap is given, otherwise
+ * "<workdir>/<GLUSTERD_VOLUME_DIR_PREFIX>". Each sub-directory is restored
+ * with glusterd_store_retrieve_volume(); if its node state cannot be
+ * retrieved a fresh one is created and stored.
+ *
+ * Returns 0 on success, -1 when the directory cannot be opened or a volume
+ * cannot be restored.
+ */
+int32_t
+glusterd_store_retrieve_volumes(xlator_t *this, glusterd_snap_t *snap)
+{
+    int32_t ret = -1;
+    char path[PATH_MAX] = {
+        0,
+    };
+    glusterd_conf_t *priv = NULL;
+    DIR *dir = NULL;
+    struct dirent *entry = NULL;
+    struct dirent scratch[2] = {
+        {
+            0,
+        },
+    };
+    glusterd_volinfo_t *volinfo = NULL;
+    struct stat st = {
+        0,
+    };
+    char entry_path[PATH_MAX] = {
+        0,
+    };
+    int32_t len = 0;
+
+    GF_ASSERT(this);
+    priv = this->private;
+
+    GF_ASSERT(priv);
+
+    if (snap)
+        len = snprintf(path, PATH_MAX, "%s/snaps/%s", priv->workdir,
+                       snap->snapname);
+    else
+        len = snprintf(path, PATH_MAX, "%s/%s", priv->workdir,
+                       GLUSTERD_VOLUME_DIR_PREFIX);
+    if ((len < 0) || (len >= PATH_MAX)) {
+        goto out;
+    }
+
+    dir = sys_opendir(path);
+
+    if (!dir) {
+        gf_msg(this->name, GF_LOG_ERROR, errno, GD_MSG_DIR_OP_FAILED,
+               "Unable to open dir %s", path);
+        goto out;
+    }
+
+    while ((entry = sys_readdir(dir, scratch))) {
+        if (gf_irrelevant_entry(entry))
+            continue;
+        /* Inside a snap directory these two names are not volumes. */
+        if (snap && ((!strcmp(entry->d_name, "geo-replication")) ||
+                     (!strcmp(entry->d_name, "info"))))
+            continue;
+
+        len = snprintf(entry_path, PATH_MAX, "%s/%s", path, entry->d_name);
+        if ((len < 0) || (len >= PATH_MAX))
+            continue;
+
+        ret = sys_lstat(entry_path, &st);
+        if (ret == -1) {
+            /* Report the entry that failed, not the directory being
+             * scanned. */
+            gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_INVALID_ENTRY,
+                   "Failed to stat entry %s : %s", entry_path,
+                   strerror(errno));
+            continue;
+        }
+
+        if (!S_ISDIR(st.st_mode)) {
+            gf_msg_debug(this->name, 0, "%s is not a valid volume",
+                         entry->d_name);
+            continue;
+        }
+
+        volinfo = glusterd_store_retrieve_volume(entry->d_name, snap);
+        if (!volinfo) {
+            gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_VOL_RESTORE_FAIL,
+                   "Unable to restore "
+                   "volume: %s",
+                   entry->d_name);
+            ret = -1;
+            goto out;
+        }
+
+        ret = glusterd_store_retrieve_node_state(volinfo);
+        if (ret) {
+            /* Backward compatibility */
+            gf_msg(this->name, GF_LOG_INFO, 0, GD_MSG_NEW_NODE_STATE_CREATION,
+                   "Creating a new node_state "
+                   "for volume: %s.",
+                   entry->d_name);
+            glusterd_store_create_nodestate_sh_on_absence(volinfo);
+            glusterd_store_perform_node_state_store(volinfo);
+        }
+    }
+
+    /* Skipped entries and a recreated node state do not fail the scan. */
+    ret = 0;
+
+out:
+    if (dir)
+        sys_closedir(dir);
+    gf_msg_debug(this->name, 0, "Returning with %d", ret);
+
+    return ret;
+}
+
+/* Derive the brick mount path from a brick path.
+ *
+ * A copy of @brick_path is stored in *brick_mount_path and cut at the first
+ * '/' that follows the first occurrence of "brick"; when no such '/' exists
+ * the whole copy is kept. The caller owns the returned string.
+ *
+ * Returns 0 on success. On failure returns -1 and leaves *brick_mount_path
+ * set to NULL.
+ */
+int32_t
+glusterd_find_brick_mount_path(char *brick_path, char **brick_mount_path)
+{
+    xlator_t *this = THIS;
+    char *copy = NULL;
+    char *marker = NULL;
+    char *slash = NULL;
+    int32_t ret = -1;
+
+    GF_ASSERT(this);
+    GF_ASSERT(brick_path);
+    GF_ASSERT(brick_mount_path);
+
+    copy = gf_strdup(brick_path);
+    *brick_mount_path = copy;
+    if (copy == NULL)
+        goto out;
+
+    /* Locate the brick component that ends
+     * /var/run/gluster/snaps/<snap-uuid>/<brick_num>
+     */
+    marker = strstr(copy, "brick");
+    if (marker == NULL) {
+        /* Snapshot bricks must have brick num as part
+         * of the brickpath
+         */
+        gf_msg(this->name, GF_LOG_ERROR, EINVAL, GD_MSG_INVALID_ENTRY,
+               "Invalid brick path(%s)", brick_path);
+        goto out;
+    }
+
+    /* Terminate the copy right after that component, if anything follows */
+    slash = strchr(marker, '/');
+    if (slash != NULL)
+        *slash = '\0';
+
+    ret = 0;
+out:
+    if (ret && *brick_mount_path) {
+        GF_FREE(*brick_mount_path);
+        *brick_mount_path = NULL;
+    }
+    gf_msg_trace(this->name, 0, "Returning with %d", ret);
+    return ret;
+}
+
+/* Make sure the snapshot brick is mounted at @brick_mount_path.
+ *
+ * Nothing is done when a mount entry already exists for the path. Otherwise
+ * the device named by brickinfo->device_path is activated with
+ * "lvchange -ay" and then mounted through glusterd_mount_lvm_snapshot().
+ *
+ * Returns 0 on success, non-zero on failure.
+ */
+int32_t
+glusterd_mount_brick_paths(char *brick_mount_path,
+                           glusterd_brickinfo_t *brickinfo)
+{
+    xlator_t *this = THIS;
+    glusterd_conf_t *priv = NULL;
+    struct mntent *mounted = NULL;
+    struct mntent mnt_storage = {0};
+    char mnt_buf[PATH_MAX] = {0};
+    runner_t runner = {0};
+    int32_t ret = -1;
+
+    GF_ASSERT(this);
+    GF_ASSERT(brick_mount_path);
+    GF_ASSERT(brickinfo);
+
+    priv = this->private;
+    GF_ASSERT(priv);
+
+    /* An existing mount entry means there is nothing left to do */
+    mounted = glusterd_get_mnt_entry_info(brick_mount_path, mnt_buf,
+                                          sizeof(mnt_buf), &mnt_storage);
+    if (mounted != NULL) {
+        gf_msg(this->name, GF_LOG_INFO, 0, GD_MSG_ALREADY_MOUNTED,
+               "brick_mount_path (%s) already mounted.", brick_mount_path);
+        ret = 0;
+        goto out;
+    }
+
+    /* TODO RHEL 6.5 has the logical volumes inactive by default
+     * on reboot. Hence activating the logical vol. Check behaviour
+     * on other systems
+     */
+    runinit(&runner);
+    runner_add_args(&runner, "lvchange", "-ay", brickinfo->device_path, NULL);
+    ret = runner_run(&runner);
+    if (ret != 0) {
+        gf_msg(this->name, GF_LOG_ERROR, errno, GD_MSG_SNAP_ACTIVATE_FAIL,
+               "Failed to activate %s.", brickinfo->device_path);
+        goto out;
+    }
+    gf_msg_debug(this->name, 0, "Activating %s successful",
+                 brickinfo->device_path);
+
+    /* The device is active now, so the snapshot can be mounted */
+    ret = glusterd_mount_lvm_snapshot(brickinfo, brick_mount_path);
+    if (ret != 0)
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_SNAP_MOUNT_FAIL,
+               "Failed to mount lvm snapshot.");
+
+out:
+    gf_msg_trace(this->name, 0, "Returning with %d", ret);
+    return ret;
+}
+
+/* Re-establish the mounts of the local snapshot bricks of @volinfo.
+ *
+ * For every brick that belongs to this node, has no pending snapshot and has
+ * a device path: derive the mount path, create it when the brick path does
+ * not exist, mount the device there and fill in brickinfo->real_path if it
+ * is still empty. A failed mount is only logged.
+ *
+ * Returns 0 on success, non-zero on the first hard failure.
+ */
+int32_t
+glusterd_recreate_vol_brick_mounts(xlator_t *this, glusterd_volinfo_t *volinfo)
+{
+    glusterd_brickinfo_t *brickinfo = NULL;
+    char *brick_mount_path = NULL;
+    char abspath[PATH_MAX] = {0};
+    struct stat st_buf = {0};
+    int32_t ret = -1;
+
+    GF_ASSERT(this);
+    GF_ASSERT(volinfo);
+
+    cds_list_for_each_entry(brickinfo, &volinfo->bricks, brick_list)
+    {
+        /* Skip bricks of other nodes, bricks whose snapshot is
+         * pending and bricks that are not snapshotted at all.
+         */
+        if (gf_uuid_compare(brickinfo->uuid, MY_UUID))
+            continue;
+        if (brickinfo->snap_status == -1)
+            continue;
+        if (strlen(brickinfo->device_path) == 0)
+            continue;
+
+        ret = glusterd_find_brick_mount_path(brickinfo->path,
+                                             &brick_mount_path);
+        if (ret) {
+            gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_BRK_MNTPATH_GET_FAIL,
+                   "Failed to find brick_mount_path for %s", brickinfo->path);
+            goto out;
+        }
+
+        /* A missing brick path means the mount point has to be
+         * created; any other lstat failure is fatal.
+         */
+        ret = sys_lstat(brickinfo->path, &st_buf);
+        if (ret && (errno != ENOENT)) {
+            gf_msg(this->name, GF_LOG_ERROR, errno, GD_MSG_FILE_OP_FAILED,
+                   "Brick Path(%s) not valid. ", brickinfo->path);
+            goto out;
+        }
+        if (ret) {
+            ret = mkdir_p(brick_mount_path, 0755, _gf_true);
+            if (ret) {
+                gf_msg(this->name, GF_LOG_ERROR, errno,
+                       GD_MSG_CREATE_DIR_FAILED, "Failed to create %s. ",
+                       brick_mount_path);
+                goto out;
+            }
+        }
+
+        /* Mount the device unless it is mounted already; a failure
+         * here does not stop the walk.
+         */
+        ret = glusterd_mount_brick_paths(brick_mount_path, brickinfo);
+        if (ret)
+            gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_BRK_MNTPATH_MOUNT_FAIL,
+                   "Failed to mount brick_mount_path");
+
+        if (!gf_uuid_compare(brickinfo->uuid, MY_UUID) &&
+            (brickinfo->real_path[0] == '\0')) {
+            if (realpath(brickinfo->path, abspath) == NULL) {
+                gf_msg(this->name, GF_LOG_CRITICAL, errno,
+                       GD_MSG_BRICKINFO_CREATE_FAIL,
+                       "realpath() failed for brick %s"
+                       ". The underlying file system "
+                       "may be in bad state",
+                       brickinfo->path);
+                ret = -1;
+                goto out;
+            }
+            /* Refuse a resolved path that would not fit with its
+             * terminator.
+             */
+            if (strlen(abspath) >= sizeof(brickinfo->real_path)) {
+                ret = -1;
+                goto out;
+            }
+            (void)strncpy(brickinfo->real_path, abspath,
+                          sizeof(brickinfo->real_path));
+        }
+
+        if (brick_mount_path) {
+            GF_FREE(brick_mount_path);
+            brick_mount_path = NULL;
+        }
+    }
+
+    ret = 0;
+out:
+    /* Only an early exit can leave a mount path allocated */
+    if (ret && brick_mount_path)
+        GF_FREE(brick_mount_path);
+
+    gf_msg_trace(this->name, 0, "Returning with %d", ret);
+    return ret;
+}
+
+/* Resolve every brick of every volume that belongs to @snap.
+ *
+ * Stops at the first brick that glusterd_resolve_brick() rejects, after
+ * raising an event and logging the failure.
+ *
+ * Returns 0 when all bricks resolved, non-zero otherwise (including when
+ * @snap is NULL).
+ */
+int32_t
+glusterd_resolve_snap_bricks(xlator_t *this, glusterd_snap_t *snap)
+{
+    glusterd_brickinfo_t *brickinfo = NULL;
+    glusterd_volinfo_t *volinfo = NULL;
+    int32_t ret = -1;
+
+    GF_ASSERT(this);
+    GF_VALIDATE_OR_GOTO(this->name, snap, out);
+
+    cds_list_for_each_entry(volinfo, &snap->volumes, vol_list)
+    {
+        cds_list_for_each_entry(brickinfo, &volinfo->bricks, brick_list)
+        {
+            ret = glusterd_resolve_brick(brickinfo);
+            if (!ret)
+                continue;
+
+            gf_event(EVENT_BRICKPATH_RESOLVE_FAILED,
+                     "peer=%s;volume=%s;brick=%s", brickinfo->hostname,
+                     volinfo->volname, brickinfo->path);
+            gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_RESOLVE_BRICK_FAIL,
+                   "resolve brick failed in restore");
+            goto out;
+        }
+    }
+
+    ret = 0;
+
+out:
+    gf_msg_trace(this->name, 0, "Returning with %d", ret);
+
+    return ret;
+}
+
+/* Fill @snap from its on-disk info file.
+ *
+ * Reads <snap dir>/GLUSTERD_SNAP_INFO_FILE through a store iterator (the
+ * handle is kept in snap->shandle) and copies the id, restored flag, status,
+ * description and timestamp keys into @snap. Other keys are ignored.
+ *
+ * Returns 0 when the file was read up to EOF, non-zero otherwise.
+ */
+int
+glusterd_store_update_snap(glusterd_snap_t *snap)
+{
+    xlator_t *this = THIS;
+    glusterd_conf_t *conf = this->private;
+    gf_store_iter_t *iter = NULL;
+    gf_store_op_errno_t op_errno = GD_STORE_SUCCESS;
+    char snappath[PATH_MAX] = {0};
+    char path[PATH_MAX] = {0};
+    char *key = NULL;
+    char *value = NULL;
+    int32_t len = 0;
+    int ret = -1;
+
+    GF_ASSERT(snap);
+
+    GLUSTERD_GET_SNAP_DIR(snappath, snap, conf);
+
+    len = snprintf(path, sizeof(path), "%s/%s", snappath,
+                   GLUSTERD_SNAP_INFO_FILE);
+    if ((len < 0) || (len >= sizeof(path)))
+        goto out;
+
+    ret = gf_store_handle_retrieve(path, &snap->shandle);
+    if (ret) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_HANDLE_NULL,
+               "snap handle is NULL");
+        goto out;
+    }
+
+    ret = gf_store_iter_new(snap->shandle, &iter);
+    if (ret) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_STORE_ITER_GET_FAIL,
+               "Failed to get new store iter");
+        goto out;
+    }
+
+    ret = gf_store_iter_get_next(iter, &key, &value, &op_errno);
+    if (ret) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_STORE_ITER_GET_FAIL,
+               "Failed to get next store iter");
+        goto out;
+    }
+
+    /* The first pair is known to be valid here, so the body always runs at
+     * least once. */
+    do {
+        gf_msg_debug(this->name, 0, "key = %s value = %s", key, value);
+
+        if (!strncmp(key, GLUSTERD_STORE_KEY_SNAP_ID,
+                     SLEN(GLUSTERD_STORE_KEY_SNAP_ID))) {
+            /* A bad uuid is only warned about; ret is replaced by the
+             * next iterator call. */
+            ret = gf_uuid_parse(value, snap->snap_id);
+            if (ret)
+                gf_msg(this->name, GF_LOG_WARNING, 0, GD_MSG_UUID_PARSE_FAIL,
+                       "Failed to parse uuid");
+        } else if (!strncmp(key, GLUSTERD_STORE_KEY_SNAP_RESTORED,
+                            SLEN(GLUSTERD_STORE_KEY_SNAP_RESTORED))) {
+            snap->snap_restored = atoi(value);
+        } else if (!strncmp(key, GLUSTERD_STORE_KEY_SNAP_STATUS,
+                            SLEN(GLUSTERD_STORE_KEY_SNAP_STATUS))) {
+            snap->snap_status = atoi(value);
+        } else if (!strncmp(key, GLUSTERD_STORE_KEY_SNAP_DESC,
+                            SLEN(GLUSTERD_STORE_KEY_SNAP_DESC))) {
+            snap->description = gf_strdup(value);
+        } else if (!strncmp(key, GLUSTERD_STORE_KEY_SNAP_TIMESTAMP,
+                            SLEN(GLUSTERD_STORE_KEY_SNAP_TIMESTAMP))) {
+            snap->time_stamp = atoi(value);
+        }
+
+        GF_FREE(key);
+        GF_FREE(value);
+        key = NULL;
+        value = NULL;
+
+        ret = gf_store_iter_get_next(iter, &key, &value, &op_errno);
+    } while (!ret);
+
+    /* Only running into EOF counts as a clean end of the file */
+    if (op_errno == GD_STORE_EOF)
+        ret = 0;
+
+out:
+    if (gf_store_iter_destroy(&iter)) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_STORE_ITER_DESTROY_FAIL,
+               "Failed to destroy store iter");
+        ret = -1;
+    }
+
+    return ret;
+}
+
+/* Restore the snapshot named @snapname from the store.
+ *
+ * Creates a snap object, loads its info file and its volumes, then inserts
+ * it into priv->snapshots ordered by glusterd_compare_snap_time.
+ *
+ * Returns 0 on success, non-zero on failure.
+ */
+int32_t
+glusterd_store_retrieve_snap(char *snapname)
+{
+    xlator_t *this = THIS;
+    glusterd_conf_t *priv = this->private;
+    glusterd_snap_t *snap = NULL;
+    int32_t ret = -1;
+    int written = 0;
+
+    GF_ASSERT(priv);
+    GF_ASSERT(snapname);
+
+    snap = glusterd_new_snap_object();
+    if (snap == NULL) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_SNAP_OBJECT_STORE_FAIL,
+               "Failed to create  snap object");
+        goto out;
+    }
+
+    /* A name that does not fit into snap->snapname is rejected */
+    written = snprintf(snap->snapname, sizeof(snap->snapname), "%s", snapname);
+    if ((size_t)written >= sizeof(snap->snapname))
+        goto out;
+
+    ret = glusterd_store_update_snap(snap);
+    if (ret != 0) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_SNAPSHOT_UPDATE_FAIL,
+               "Failed to update snapshot for %s snap", snapname);
+        goto out;
+    }
+
+    ret = glusterd_store_retrieve_volumes(this, snap);
+    if (ret != 0) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_SNAP_VOL_RETRIEVE_FAIL,
+               "Failed to retrieve snap volumes for snap %s", snapname);
+        goto out;
+    }
+
+    /* TODO: list_add_order can do 'N-square' comparisons and
+       is not efficient. Find a better solution to store the snap
+       in order */
+    glusterd_list_add_order(&snap->snap_list, &priv->snapshots,
+                            glusterd_compare_snap_time);
+
+out:
+    gf_msg_trace(this->name, 0, "Returning with %d", ret);
+    return ret;
+}
+
+/* Read the missed_snap_list and update the in-memory structs.
+ *
+ * Every line is tokenized into a node-info key and a value of the form
+ * <snap_vol_id>:<brick_num>:<brick_path>:<snap_op>:<snap_status>, which is
+ * handed to glusterd_add_new_entry_to_list(). A missing file is not an
+ * error.
+ *
+ * Returns 0 on success (including "no list yet"), non-zero when the file
+ * cannot be opened or read, or contains a malformed entry.
+ */
+int32_t
+glusterd_store_retrieve_missed_snaps_list(xlator_t *this)
+{
+    char path[PATH_MAX] = "";
+    char *snap_vol_id = NULL;
+    char *missed_node_info = NULL;
+    char *brick_path = NULL;
+    char *brick_num_str = NULL;
+    char *snap_op_str = NULL;
+    char *snap_status_str = NULL;
+    char *value = NULL;
+    char *save_ptr = NULL;
+    FILE *fp = NULL;
+    int32_t brick_num = -1;
+    int32_t snap_op = -1;
+    int32_t snap_status = -1;
+    int32_t ret = -1;
+    glusterd_conf_t *priv = NULL;
+    gf_store_op_errno_t store_errno = GD_STORE_SUCCESS;
+
+    GF_ASSERT(this);
+    priv = this->private;
+    GF_ASSERT(priv);
+
+    /* Get the path of the missed_snap_list */
+    glusterd_store_missed_snaps_list_path_set(path, sizeof(path));
+
+    fp = fopen(path, "r");
+    if (!fp) {
+        /* If errno is ENOENT then there are no missed snaps yet */
+        if (errno != ENOENT) {
+            gf_msg(this->name, GF_LOG_ERROR, errno, GD_MSG_FILE_OP_FAILED,
+                   "Failed to open %s. ", path);
+        } else {
+            gf_msg(this->name, GF_LOG_INFO, 0, GD_MSG_MISSED_SNAP_LIST_EMPTY,
+                   "No missed snaps list.");
+            ret = 0;
+        }
+        goto out;
+    }
+
+    do {
+        ret = gf_store_read_and_tokenize(fp, &missed_node_info, &value,
+                                         &store_errno);
+        if (ret) {
+            if (store_errno == GD_STORE_EOF) {
+                gf_msg_debug(this->name, 0, "EOF for missed_snap_list");
+                ret = 0;
+                break;
+            }
+            gf_msg(this->name, GF_LOG_ERROR, store_errno,
+                   GD_MSG_MISSED_SNAP_GET_FAIL,
+                   "Failed to fetch data from "
+                   "missed_snaps_list.");
+            goto out;
+        }
+
+        /* Start every entry from a clean slate so that nothing from the
+         * previous line can leak into the validation below. */
+        snap_vol_id = NULL;
+        brick_path = NULL;
+        brick_num_str = NULL;
+        snap_op_str = NULL;
+        snap_status_str = NULL;
+        brick_num = -1;
+        snap_op = -1;
+        snap_status = -1;
+        save_ptr = NULL;
+
+        /* Fetch the brick_num, brick_path, snap_op and snap status */
+        if (value) {
+            snap_vol_id = strtok_r(value, ":", &save_ptr);
+            brick_num_str = strtok_r(NULL, ":", &save_ptr);
+            brick_path = strtok_r(NULL, ":", &save_ptr);
+            snap_op_str = strtok_r(NULL, ":", &save_ptr);
+            snap_status_str = strtok_r(NULL, ":", &save_ptr);
+        }
+
+        /* strtok_r() returns NULL once the fields run out; convert only
+         * when every numeric field is present so that a short line is
+         * rejected below instead of crashing in atoi(). */
+        if (brick_num_str && snap_op_str && snap_status_str) {
+            brick_num = atoi(brick_num_str);
+            snap_op = atoi(snap_op_str);
+            snap_status = atoi(snap_status_str);
+        }
+
+        if (!missed_node_info || !brick_path || !snap_vol_id || brick_num < 1 ||
+            snap_op < 1 || snap_status < 1) {
+            gf_msg(this->name, GF_LOG_ERROR, EINVAL,
+                   GD_MSG_INVALID_MISSED_SNAP_ENTRY,
+                   "Invalid missed_snap_entry");
+            ret = -1;
+            goto out;
+        }
+
+        ret = glusterd_add_new_entry_to_list(missed_node_info, snap_vol_id,
+                                             brick_num, brick_path, snap_op,
+                                             snap_status);
+        if (ret) {
+            gf_msg(this->name, GF_LOG_ERROR, 0,
+                   GD_MSG_MISSED_SNAP_LIST_STORE_FAIL,
+                   "Failed to store missed snaps_list");
+            goto out;
+        }
+
+    } while (store_errno == GD_STORE_SUCCESS);
+
+    ret = 0;
+out:
+    if (fp)
+        fclose(fp);
+
+    gf_msg_trace(this->name, 0, "Returning with %d", ret);
+    return ret;
+}
+
+/* Restore every snapshot found under <workdir>/snaps and then load the
+ * missed_snaps_list.
+ *
+ * A missing snaps directory is not an error (kept for backward
+ * compatibility). Returns 0 on success and non-zero on the first failure.
+ */
+int32_t
+glusterd_store_retrieve_snaps(xlator_t *this)
+{
+    char snaps_dir[PATH_MAX] = {0};
+    struct dirent scratch[2] = {{0}};
+    struct dirent *dent = NULL;
+    DIR *dirp = NULL;
+    glusterd_conf_t *conf = NULL;
+    int32_t n = 0;
+    int32_t ret = 0;
+
+    GF_ASSERT(this);
+    conf = this->private;
+    GF_ASSERT(conf);
+
+    n = snprintf(snaps_dir, PATH_MAX, "%s/snaps", conf->workdir);
+    if ((n < 0) || (n >= PATH_MAX)) {
+        ret = -1;
+        goto out;
+    }
+
+    dirp = sys_opendir(snaps_dir);
+    if (!dirp) {
+        /* No snaps directory at all: nothing to restore, ret stays 0 */
+        if (errno == ENOENT)
+            goto out;
+
+        ret = -1;
+        gf_msg(this->name, GF_LOG_ERROR, errno, GD_MSG_DIR_OP_FAILED,
+               "Unable to open dir %s", snaps_dir);
+        goto out;
+    }
+
+    for (dent = sys_readdir(dirp, scratch); dent;
+         dent = sys_readdir(dirp, scratch)) {
+        if (gf_irrelevant_entry(dent))
+            continue;
+
+        /* The missed snaps list lives in the same directory but is not a
+         * snapshot; it is handled after the loop. */
+        if (!strcmp(dent->d_name, GLUSTERD_MISSED_SNAPS_LIST_FILE))
+            continue;
+
+        ret = glusterd_store_retrieve_snap(dent->d_name);
+        if (!ret)
+            continue;
+
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_SNAP_RESTORE_FAIL,
+               "Unable to restore snapshot: %s", dent->d_name);
+        goto out;
+    }
+
+    /* Retrieve missed_snaps_list */
+    ret = glusterd_store_retrieve_missed_snaps_list(this);
+    if (ret)
+        gf_msg_debug(this->name, 0, "Failed to retrieve missed_snaps_list");
+
+out:
+    if (dirp)
+        sys_closedir(dirp);
+    gf_msg_debug(this->name, 0, "Returning with %d", ret);
+
+    return ret;
+}
+
+/* Writes all the contents of conf->missed_snap_list to fd.
+ *
+ * One key/value pair is saved per pending snap operation:
+ *     key   = <node_uuid>:<snap_uuid>
+ *     value = <snap_vol_id>:<brick_num>:<brick_path>:<op>:<status>
+ *
+ * Returns 0 when every entry was saved (or the list is empty), otherwise
+ * the non-zero result of the failing gf_store_save_value() call.
+ */
+int32_t
+glusterd_store_write_missed_snapinfo(int32_t fd)
+{
+    char entry_key[(UUID_SIZE * 2) + 2];
+    char entry_val[PATH_MAX];
+    glusterd_snap_op_t *op_entry = NULL;
+    glusterd_missed_snap_info *missed = NULL;
+    glusterd_conf_t *conf = NULL;
+    xlator_t *this = NULL;
+    int32_t ret = 0;
+
+    this = THIS;
+    GF_ASSERT(this);
+
+    conf = this->private;
+    GF_ASSERT(conf);
+
+    /* Write the missed_snap_entry */
+    cds_list_for_each_entry(missed, &conf->missed_snaps_list, missed_snaps)
+    {
+        cds_list_for_each_entry(op_entry, &missed->snap_ops, snap_ops_list)
+        {
+            snprintf(entry_key, sizeof(entry_key), "%s:%s", missed->node_uuid,
+                     missed->snap_uuid);
+            snprintf(entry_val, sizeof(entry_val), "%s:%d:%s:%d:%d",
+                     op_entry->snap_vol_id, op_entry->brick_num,
+                     op_entry->brick_path, op_entry->op, op_entry->status);
+
+            ret = gf_store_save_value(fd, entry_key, entry_val);
+            if (!ret)
+                continue;
+
+            gf_msg(this->name, GF_LOG_ERROR, 0,
+                   GD_MSG_MISSEDSNAP_INFO_SET_FAIL,
+                   "Failed to write missed snapinfo");
+            goto out;
+        }
+    }
+
+out:
+    gf_msg_trace(this->name, 0, "Returning %d", ret);
+    return ret;
+}
+
+/* Writes the in-memory conf->missed_snap_list to disk.
+ *
+ * The list is written to a temporary file obtained from the
+ * missed_snaps_list store handle and then renamed over the real file. If
+ * any step after creating the temporary file fails, the temporary file is
+ * unlinked and -1 is returned.
+ */
+int32_t
+glusterd_store_update_missed_snaps()
+{
+    xlator_t *this = NULL;
+    glusterd_conf_t *conf = NULL;
+    int32_t tmp_fd = -1;
+    int32_t ret = -1;
+
+    this = THIS;
+    GF_ASSERT(this);
+
+    conf = this->private;
+    GF_ASSERT(conf);
+
+    ret = glusterd_store_create_missed_snaps_list_shandle_on_absence();
+    if (ret) {
+        gf_msg(this->name, GF_LOG_ERROR, 0,
+               GD_MSG_MISSED_SNAP_LIST_STORE_HANDLE_GET_FAIL,
+               "Unable to obtain "
+               "missed_snaps_list store handle.");
+        goto out;
+    }
+
+    tmp_fd = gf_store_mkstemp(conf->missed_snaps_list_shandle);
+    if (tmp_fd <= 0) {
+        gf_msg(this->name, GF_LOG_ERROR, errno, GD_MSG_FILE_OP_FAILED,
+               "Failed to create tmp file");
+        ret = -1;
+        goto out;
+    }
+
+    ret = glusterd_store_write_missed_snapinfo(tmp_fd);
+    if (ret) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_MISSED_SNAP_CREATE_FAIL,
+               "Failed to write missed snaps to disk");
+        goto out;
+    }
+
+    ret = gf_store_rename_tmppath(conf->missed_snaps_list_shandle);
+    if (ret)
+        gf_msg(this->name, GF_LOG_ERROR, errno, GD_MSG_FILE_OP_FAILED,
+               "Failed to rename the tmp file");
+
+out:
+    /* Only clean up a temporary file that was actually created */
+    if (ret && (tmp_fd > 0)) {
+        if (gf_store_unlink_tmppath(conf->missed_snaps_list_shandle)) {
+            gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_TMP_FILE_UNLINK_FAIL,
+                   "Failed to unlink the tmp file");
+        }
+        ret = -1;
+    }
+
+    gf_msg_trace(this->name, 0, "Returning %d", ret);
+    return ret;
+}
+
+/* Remove the store file(s) of a peer from <workdir>/peers and release the
+ * peer's store handle.
+ *
+ * - NULL peerinfo: nothing to do, returns 0.
+ * - Peer without UUID: the file named after the hostname is unlinked; with
+ *   no hostname either, returns 0.
+ * - Peer with UUID: the hostname-named file is unlinked first. If that
+ *   succeeds the function returns; otherwise the UUID-named file is
+ *   unlinked.
+ *
+ * A missing (ENOENT) target of the final unlink is treated as success.
+ * The store handle is destroyed on every path where peerinfo is non-NULL.
+ */
+int32_t
+glusterd_store_delete_peerinfo(glusterd_peerinfo_t *peerinfo)
+{
+    char peers_dir[PATH_MAX] = {0};
+    char target[PATH_MAX] = {0};
+    char host_file[PATH_MAX] = {0};
+    xlator_t *this = NULL;
+    glusterd_conf_t *conf = NULL;
+    int32_t n = 0;
+    int32_t ret = -1;
+
+    if (!peerinfo) {
+        ret = 0;
+        goto out;
+    }
+
+    this = THIS;
+    conf = this->private;
+
+    n = snprintf(peers_dir, PATH_MAX, "%s/peers", conf->workdir);
+    if ((n < 0) || (n >= PATH_MAX))
+        goto out;
+
+    if (!gf_uuid_is_null(peerinfo->uuid)) {
+        n = snprintf(target, PATH_MAX, "%s/%s", peers_dir,
+                     uuid_utoa(peerinfo->uuid));
+        if ((n < 0) || (n >= PATH_MAX))
+            goto out;
+
+        n = snprintf(host_file, PATH_MAX, "%s/%s", peers_dir,
+                     peerinfo->hostname);
+        if ((n < 0) || (n >= PATH_MAX))
+            goto out;
+
+        /* A successfully removed hostname file ends the job here */
+        ret = sys_unlink(host_file);
+        if (ret == 0)
+            goto out;
+    } else if (!peerinfo->hostname) {
+        ret = 0;
+        goto out;
+    } else {
+        n = snprintf(target, PATH_MAX, "%s/%s", peers_dir,
+                     peerinfo->hostname);
+        if ((n < 0) || (n >= PATH_MAX))
+            goto out;
+    }
+
+    ret = sys_unlink(target);
+    if ((ret != 0) && (errno == ENOENT))
+        ret = 0;
+
+out:
+    if (peerinfo && peerinfo->shandle) {
+        gf_store_handle_destroy(peerinfo->shandle);
+        peerinfo->shandle = NULL;
+    }
+    gf_msg_debug((this ? this->name : "glusterd"), 0, "Returning with %d", ret);
+
+    return ret;
+}
+
+/* Fill path (a buffer of len bytes, asserted to be at least PATH_MAX) with
+ * the peers directory "<workdir>/peers". */
+void
+glusterd_store_peerinfo_dirpath_set(char *path, size_t len)
+{
+    glusterd_conf_t *conf = NULL;
+
+    GF_ASSERT(path);
+    GF_ASSERT(len >= PATH_MAX);
+
+    conf = THIS->private;
+    snprintf(path, len, "%s/peers", conf->workdir);
+}
+
+/* Create the peers directory through gf_store_mkdir() and return that
+ * call's result. */
+int32_t
+glusterd_store_create_peer_dir()
+{
+    char peers_dir[PATH_MAX];
+    int32_t rc = 0;
+
+    glusterd_store_peerinfo_dirpath_set(peers_dir, sizeof(peers_dir));
+    rc = gf_store_mkdir(peers_dir);
+
+    gf_msg_debug("glusterd", 0, "Returning with %d", rc);
+    return rc;
+}
+
+/* Build "<peers dir>/<uuid>" for the given peer into peerfpath, a buffer of
+ * len bytes (asserted to be at least PATH_MAX). */
+static void
+glusterd_store_uuid_peerpath_set(glusterd_peerinfo_t *peerinfo, char *peerfpath,
+                                 size_t len)
+{
+    char uuid_str[50] = {0};
+    char peers_dir[PATH_MAX];
+
+    GF_ASSERT(peerinfo);
+    GF_ASSERT(peerfpath);
+    GF_ASSERT(len >= PATH_MAX);
+
+    glusterd_store_peerinfo_dirpath_set(peers_dir, sizeof(peers_dir));
+    gf_uuid_unparse(peerinfo->uuid, uuid_str);
+    snprintf(peerfpath, len, "%s/%s", peers_dir, uuid_str);
+}
+
+/* Build "<peers dir>/<hostname>" for the given peer into peerfpath, a
+ * buffer of len bytes (asserted to be at least PATH_MAX). */
+static void
+glusterd_store_hostname_peerpath_set(glusterd_peerinfo_t *peerinfo,
+                                     char *peerfpath, size_t len)
+{
+    char peers_dir[PATH_MAX];
+
+    GF_ASSERT(peerinfo);
+    GF_ASSERT(peerfpath);
+    GF_ASSERT(len >= PATH_MAX);
+
+    glusterd_store_peerinfo_dirpath_set(peers_dir, sizeof(peers_dir));
+    snprintf(peerfpath, len, "%s/%s", peers_dir, peerinfo->hostname);
+}
+
+/* Create, if absent, the peer's store handle on the hostname-named peer
+ * file. Returns the result of gf_store_handle_create_on_absence(). */
+int32_t
+glusterd_store_peerinfo_hostname_shandle_create(glusterd_peerinfo_t *peerinfo)
+{
+    char host_file[PATH_MAX];
+
+    glusterd_store_hostname_peerpath_set(peerinfo, host_file,
+                                         sizeof(host_file));
+    return gf_store_handle_create_on_absence(&peerinfo->shandle, host_file);
+}
+
+/* Create, if absent, the peer's store handle on the UUID-named peer file.
+ * Returns the result of gf_store_handle_create_on_absence(). */
+int32_t
+glusterd_store_peerinfo_uuid_shandle_create(glusterd_peerinfo_t *peerinfo)
+{
+    char uuid_file[PATH_MAX];
+
+    glusterd_store_uuid_peerpath_set(peerinfo, uuid_file, sizeof(uuid_file));
+    return gf_store_handle_create_on_absence(&peerinfo->shandle, uuid_file);
+}
+
+/* If the hostname-named peer file exists, drop the peer's current store
+ * handle and unlink that file.
+ *
+ * Returns the sys_stat() result when the file cannot be stat'ed, otherwise
+ * the sys_unlink() result.
+ */
+int32_t
+glusterd_peerinfo_hostname_shandle_check_destroy(glusterd_peerinfo_t *peerinfo)
+{
+    struct stat st = {0};
+    char host_file[PATH_MAX];
+    int32_t rc = -1;
+
+    glusterd_store_hostname_peerpath_set(peerinfo, host_file,
+                                         sizeof(host_file));
+
+    rc = sys_stat(host_file, &st);
+    if (rc)
+        return rc;
+
+    if (peerinfo->shandle)
+        gf_store_handle_destroy(peerinfo->shandle);
+    peerinfo->shandle = NULL;
+
+    return sys_unlink(host_file);
+}
+
+/* Make sure the peer has a store handle on the right file.
+ *
+ * A peer without UUID is stored under its hostname. A peer with UUID gets
+ * any hostname-named file removed first (the result of that step is
+ * ignored) and is then stored under its UUID.
+ */
+int32_t
+glusterd_store_create_peer_shandle(glusterd_peerinfo_t *peerinfo)
+{
+    GF_ASSERT(peerinfo);
+
+    if (gf_uuid_is_null(peerinfo->uuid))
+        return glusterd_store_peerinfo_hostname_shandle_create(peerinfo);
+
+    /* Best effort: the outcome does not influence the return value */
+    (void)glusterd_peerinfo_hostname_shandle_check_destroy(peerinfo);
+
+    return glusterd_store_peerinfo_uuid_shandle_create(peerinfo);
+}
+
+/* Serialise a peer into one buffer and save it to fd.
+ *
+ * The buffer holds the UUID and state lines followed by one numbered
+ * hostname line per entry of peerinfo->hostnames (numbering starts at 1).
+ * Returns -1 if the text does not fit into PATH_MAX bytes, otherwise the
+ * result of gf_store_save_items().
+ */
+static int32_t
+glusterd_store_peer_write(int fd, glusterd_peerinfo_t *peerinfo)
+{
+    glusterd_peer_hostname_t *addr = NULL;
+    char buf[PATH_MAX];
+    uint used = 0;
+    int32_t idx = 1;
+    int32_t ret = 0;
+
+    ret = snprintf(buf + used, sizeof(buf) - used, "%s=%s\n%s=%d\n",
+                   GLUSTERD_STORE_KEY_PEER_UUID, uuid_utoa(peerinfo->uuid),
+                   GLUSTERD_STORE_KEY_PEER_STATE, peerinfo->state.state);
+    if (ret < 0 || ret >= sizeof(buf) - used) {
+        ret = -1;
+        goto out;
+    }
+    used += ret;
+
+    cds_list_for_each_entry(addr, &peerinfo->hostnames, hostname_list)
+    {
+        ret = snprintf(buf + used, sizeof(buf) - used,
+                       GLUSTERD_STORE_KEY_PEER_HOSTNAME "%d=%s\n", idx,
+                       addr->hostname);
+        if (ret < 0 || ret >= sizeof(buf) - used) {
+            ret = -1;
+            goto out;
+        }
+        used += ret;
+        idx++;
+    }
+
+    ret = gf_store_save_items(fd, buf);
+out:
+    gf_msg_debug("glusterd", 0, "Returning with %d", ret);
+    return ret;
+}
+
+/* Write the peer to a temporary file of its store handle and rename it into
+ * place. If writing or renaming fails after the temporary file was created,
+ * the temporary file is unlinked. */
+int32_t
+glusterd_store_perform_peer_store(glusterd_peerinfo_t *peerinfo)
+{
+    int32_t ret = -1;
+    int tmp_fd = -1;
+
+    GF_ASSERT(peerinfo);
+
+    tmp_fd = gf_store_mkstemp(peerinfo->shandle);
+    if (tmp_fd <= 0) {
+        ret = -1;
+        goto out;
+    }
+
+    ret = glusterd_store_peer_write(tmp_fd, peerinfo);
+    if (!ret)
+        ret = gf_store_rename_tmppath(peerinfo->shandle);
+
+out:
+    if (ret && (tmp_fd > 0))
+        gf_store_unlink_tmppath(peerinfo->shandle);
+    gf_msg_debug("glusterd", 0, "Returning %d", ret);
+    return ret;
+}
+
+/* Persist a peer: create the peers directory, set up the peer's store
+ * handle and write the peer file. Stops at the first step that fails and
+ * returns its result. */
+int32_t
+glusterd_store_peerinfo(glusterd_peerinfo_t *peerinfo)
+{
+    int32_t ret = -1;
+
+    GF_ASSERT(peerinfo);
+
+    ret = glusterd_store_create_peer_dir();
+    if (!ret)
+        ret = glusterd_store_create_peer_shandle(peerinfo);
+    if (!ret)
+        ret = glusterd_store_perform_peer_store(peerinfo);
+
+    gf_msg_debug("glusterd", 0, "Returning with %d", ret);
+    return ret;
+}
+
+/* Restore all peers stored under <workdir>/<GLUSTERD_PEER_DIR_PREFIX> and
+ * create an RPC connection for every peer on priv->peers.
+ *
+ * Only directory entries whose name parses as a UUID are considered. Each
+ * such file is read key by key into a fresh peerinfo; a file that cannot
+ * be read completely, carries no UUID or no hostname, or cannot be added
+ * as a friend is logged and skipped, and every resource acquired for it
+ * (iterator, pending key/value, store handle, peerinfo) is released. On
+ * success the store handle becomes owned by the peerinfo.
+ *
+ * Returns -1 if the peers directory cannot be opened, the result of a
+ * failing glusterd_friend_rpc_create() call, and 0 otherwise. Skipped
+ * files do not affect the return value.
+ */
+int32_t
+glusterd_store_retrieve_peers(xlator_t *this)
+{
+    int32_t ret = 0;
+    glusterd_conf_t *priv = NULL;
+    DIR *dir = NULL;
+    struct dirent *entry = NULL;
+    struct dirent scratch[2] = {
+        {
+            0,
+        },
+    };
+    char path[PATH_MAX] = {
+        0,
+    };
+    glusterd_peerinfo_t *peerinfo = NULL;
+    gf_store_handle_t *shandle = NULL;
+    char filepath[PATH_MAX] = {
+        0,
+    };
+    gf_store_iter_t *iter = NULL;
+    char *key = NULL;
+    char *value = NULL;
+    glusterd_peerctx_args_t args = {0};
+    gf_store_op_errno_t op_errno = GD_STORE_SUCCESS;
+    glusterd_peer_hostname_t *address = NULL;
+    uuid_t tmp_uuid;
+    gf_boolean_t is_ok;
+    int32_t len;
+
+    GF_ASSERT(this);
+    priv = this->private;
+
+    GF_ASSERT(priv);
+
+    len = snprintf(path, PATH_MAX, "%s/%s", priv->workdir,
+                   GLUSTERD_PEER_DIR_PREFIX);
+    if ((len < 0) || (len >= PATH_MAX)) {
+        ret = -1;
+        goto out;
+    }
+
+    dir = sys_opendir(path);
+
+    if (!dir) {
+        gf_msg(this->name, GF_LOG_ERROR, errno, GD_MSG_DIR_OP_FAILED,
+               "Unable to open dir %s", path);
+        ret = -1;
+        goto out;
+    }
+
+    while ((entry = sys_readdir(dir, scratch))) {
+        if (gf_irrelevant_entry(entry))
+            continue;
+        if (gf_uuid_parse(entry->d_name, tmp_uuid) != 0) {
+            gf_log(this->name, GF_LOG_WARNING, "skipping non-peer file %s",
+                   entry->d_name);
+            continue;
+        }
+        is_ok = _gf_false;
+        /* Forget the previous file's handle: it is either owned by the
+         * peer it was attached to or has already been destroyed below.
+         * Keeping the stale pointer would risk destroying it twice. */
+        shandle = NULL;
+        len = snprintf(filepath, PATH_MAX, "%s/%s", path, entry->d_name);
+        if ((len < 0) || (len >= PATH_MAX)) {
+            goto next;
+        }
+        ret = gf_store_handle_retrieve(filepath, &shandle);
+        if (ret)
+            goto next;
+
+        ret = gf_store_iter_new(shandle, &iter);
+        if (ret)
+            goto next;
+
+        ret = gf_store_iter_get_next(iter, &key, &value, &op_errno);
+        if (ret) {
+            goto next;
+        }
+
+        /* Create an empty peerinfo object before reading in the
+         * details
+         */
+        peerinfo = glusterd_peerinfo_new(GD_FRIEND_STATE_DEFAULT, NULL, NULL,
+                                         0);
+        if (peerinfo == NULL) {
+            ret = -1;
+            goto next;
+        }
+
+        while (!ret) {
+            if (!strncmp(GLUSTERD_STORE_KEY_PEER_UUID, key,
+                         SLEN(GLUSTERD_STORE_KEY_PEER_UUID))) {
+                if (value)
+                    gf_uuid_parse(value, peerinfo->uuid);
+            } else if (!strncmp(GLUSTERD_STORE_KEY_PEER_STATE, key,
+                                SLEN(GLUSTERD_STORE_KEY_PEER_STATE))) {
+                /* atoi(NULL) is undefined behaviour; a state key without a
+                 * value leaves the default state untouched. */
+                if (value)
+                    peerinfo->state.state = atoi(value);
+            } else if (!strncmp(GLUSTERD_STORE_KEY_PEER_HOSTNAME, key,
+                                SLEN(GLUSTERD_STORE_KEY_PEER_HOSTNAME))) {
+                ret = gd_add_address_to_peer(peerinfo, value);
+                if (ret) {
+                    gf_msg(this->name, GF_LOG_ERROR, 0,
+                           GD_MSG_ADD_ADDRESS_TO_PEER_FAIL,
+                           "Could not add address to peer");
+                }
+            } else {
+                gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_UNKNOWN_KEY,
+                       "Unknown key: %s", key);
+            }
+
+            GF_FREE(key);
+            GF_FREE(value);
+            key = NULL;
+            value = NULL;
+
+            ret = gf_store_iter_get_next(iter, &key, &value, &op_errno);
+        }
+        if (op_errno != GD_STORE_EOF) {
+            goto next;
+        }
+
+        if (gf_uuid_is_null(peerinfo->uuid)) {
+            gf_log("", GF_LOG_ERROR,
+                   "Null UUID while attempting to read peer from '%s'",
+                   filepath);
+            goto next;
+        }
+
+        /* Without at least one hostname entry, hostnames.next points back
+         * at the list head itself and must not be read as an entry. */
+        if (cds_list_empty(&peerinfo->hostnames)) {
+            gf_log(this->name, GF_LOG_ERROR,
+                   "No hostname while attempting to read peer from '%s'",
+                   filepath);
+            goto next;
+        }
+
+        /* Set first hostname from peerinfo->hostnames to
+         * peerinfo->hostname
+         */
+        address = cds_list_entry(peerinfo->hostnames.next,
+                                 glusterd_peer_hostname_t, hostname_list);
+        peerinfo->hostname = gf_strdup(address->hostname);
+        if (!peerinfo->hostname)
+            goto next;
+
+        ret = glusterd_friend_add_from_peerinfo(peerinfo, 1, NULL);
+        if (ret)
+            goto next;
+
+        peerinfo->shandle = shandle;
+        is_ok = _gf_true;
+
+    next:
+        (void)gf_store_iter_destroy(&iter);
+
+        /* key/value are still set here only when a failure path jumped
+         * past the frees of the parsing loop. */
+        GF_FREE(key);
+        GF_FREE(value);
+        key = NULL;
+        value = NULL;
+
+        if (!is_ok) {
+            gf_log(this->name, GF_LOG_WARNING,
+                   "skipping malformed peer file %s", entry->d_name);
+            if (peerinfo) {
+                glusterd_peerinfo_cleanup(peerinfo);
+            }
+            /* The handle was never attached to a peer, so it is ours to
+             * release. */
+            if (shandle) {
+                gf_store_handle_destroy(shandle);
+                shandle = NULL;
+            }
+        }
+        peerinfo = NULL;
+    }
+
+    /* A skipped file (or the EOF that ends every file) leaves a non-zero
+     * ret behind; it must not become the result of the whole restore when
+     * there is no peer to connect to below. */
+    ret = 0;
+
+    args.mode = GD_MODE_ON;
+
+    RCU_READ_LOCK;
+    cds_list_for_each_entry_rcu(peerinfo, &priv->peers, uuid_list)
+    {
+        ret = glusterd_friend_rpc_create(this, peerinfo, &args);
+        if (ret)
+            break;
+    }
+    RCU_READ_UNLOCK;
+    peerinfo = NULL;
+
+out:
+
+    if (dir)
+        sys_closedir(dir);
+    gf_msg_debug(this->name, 0, "Returning with %d", ret);
+
+    return ret;
+}
+
+/* Bricks of snap volumes are hosted at /var/run/gluster/snaps.
+ * A restored volume points to the bricks of the snap volume it was
+ * restored from, so after a node restart these paths have to be
+ * recreated and re-mounted.
+ *
+ * Handles restored regular volumes first, then the volumes of every
+ * snapshot that are not stopped. Returns 0 on success or the result of the
+ * first failing glusterd_recreate_vol_brick_mounts() call.
+ */
+int32_t
+glusterd_recreate_all_snap_brick_mounts(xlator_t *this)
+{
+    glusterd_snap_t *snap = NULL;
+    glusterd_volinfo_t *vol = NULL;
+    glusterd_conf_t *conf = NULL;
+    int32_t ret = 0;
+
+    GF_ASSERT(this);
+    conf = this->private;
+    GF_ASSERT(conf);
+
+    /* Recreate bricks of volumes restored from snaps */
+    cds_list_for_each_entry(vol, &conf->volumes, vol_list)
+    {
+        /* Volumes that were never restored from a snap need nothing */
+        if (gf_uuid_is_null(vol->restored_from_snap))
+            continue;
+
+        ret = glusterd_recreate_vol_brick_mounts(this, vol);
+        if (!ret)
+            continue;
+
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_BRK_MNT_RECREATE_FAIL,
+               "Failed to recreate brick mounts "
+               "for %s",
+               vol->volname);
+        goto out;
+    }
+
+    /* Recreate bricks of snapshot volumes
+     * We are not creating brick mounts for stopped snaps.
+     */
+    cds_list_for_each_entry(snap, &conf->snapshots, snap_list)
+    {
+        cds_list_for_each_entry(vol, &snap->volumes, vol_list)
+        {
+            if (vol->status == GLUSTERD_STATUS_STOPPED)
+                continue;
+
+            ret = glusterd_recreate_vol_brick_mounts(this, vol);
+            if (!ret)
+                continue;
+
+            gf_msg(this->name, GF_LOG_ERROR, 0,
+                   GD_MSG_BRK_MNT_RECREATE_FAIL,
+                   "Failed to recreate brick "
+                   "mounts for %s",
+                   snap->snapname);
+            goto out;
+        }
+    }
+
+out:
+    gf_msg_trace(this->name, 0, "Returning with %d", ret);
+    return ret;
+}
+
+/* When the snapshot command from cli is received, the on-disk and
+ * in-memory structures for the snapshot are created with the status
+ * marked as GD_SNAP_STATUS_INIT. Once the backend snapshot is taken, the
+ * status is changed to GD_SNAP_STATUS_IN_USE. If glusterd dies after
+ * taking the backend snapshot but before updating the status, then when
+ * glusterd comes up it should treat that snapshot as a failed snapshot
+ * and clean it up.
+ *
+ * A restore operation starts by setting the status to
+ * GD_SNAP_STATUS_RESTORED. If the server goes down before the status is
+ * changed back, the partially restored snapshot has to be reverted.
+ *
+ * Returns 0 on success, -1 if the scratch dict cannot be created, or the
+ * result of the first failing revert/remove call.
+ */
+int32_t
+glusterd_snap_cleanup(xlator_t *this)
+{
+    glusterd_snap_t *snap = NULL;
+    glusterd_snap_t *next_snap = NULL;
+    glusterd_conf_t *conf = NULL;
+    dict_t *scratch = NULL;
+    int32_t ret = 0;
+
+    GF_ASSERT(this);
+    conf = this->private;
+    GF_ASSERT(conf);
+
+    scratch = dict_new();
+    if (!scratch) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_DICT_CREATE_FAIL,
+               "Failed to create dict");
+        ret = -1;
+        goto out;
+    }
+
+    /* The _safe variant is used because the current snapshot may be
+     * handed to glusterd_snap_remove() while iterating. */
+    cds_list_for_each_entry_safe(snap, next_snap, &conf->snapshots, snap_list)
+    {
+        if (snap->snap_status == GD_SNAP_STATUS_RESTORED) {
+            ret = glusterd_snapshot_revert_restore_from_snap(snap);
+            if (ret) {
+                gf_msg(this->name, GF_LOG_WARNING, 0,
+                       GD_MSG_SNAP_RESTORE_REVERT_FAIL,
+                       "Failed to "
+                       "revert partially restored snapshot "
+                       "(%s)",
+                       snap->snapname);
+                goto out;
+            }
+            continue;
+        }
+
+        /* Snapshots in use are healthy and stay as they are */
+        if (snap->snap_status == GD_SNAP_STATUS_IN_USE)
+            continue;
+
+        ret = glusterd_snap_remove(scratch, snap, _gf_true, _gf_true,
+                                   _gf_false);
+        if (ret) {
+            gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_SNAP_REMOVE_FAIL,
+                   "Failed to remove the snapshot %s", snap->snapname);
+            goto out;
+        }
+    }
+out:
+    if (scratch)
+        dict_unref(scratch);
+
+    gf_msg_trace(this->name, 0, "Returning with %d", ret);
+    return ret;
+}
+
+/* Resolve every brick of every volume, then the bricks of every snapshot.
+ *
+ * The first brick that fails to resolve raises an
+ * EVENT_BRICKPATH_RESOLVE_FAILED event and aborts the walk. Returns 0 on
+ * success or the result of the failing resolve call.
+ */
+int32_t
+glusterd_resolve_all_bricks(xlator_t *this)
+{
+    glusterd_snap_t *snap = NULL;
+    glusterd_brickinfo_t *brick = NULL;
+    glusterd_volinfo_t *vol = NULL;
+    glusterd_conf_t *conf = NULL;
+    int32_t ret = 0;
+
+    GF_ASSERT(this);
+    conf = this->private;
+
+    GF_ASSERT(conf);
+
+    /* Resolve bricks of volumes */
+    cds_list_for_each_entry(vol, &conf->volumes, vol_list)
+    {
+        cds_list_for_each_entry(brick, &vol->bricks, brick_list)
+        {
+            ret = glusterd_resolve_brick(brick);
+            if (!ret)
+                continue;
+
+            gf_event(EVENT_BRICKPATH_RESOLVE_FAILED,
+                     "peer=%s;volume=%s;brick=%s", brick->hostname,
+                     vol->volname, brick->path);
+            gf_msg("glusterd", GF_LOG_ERROR, 0, GD_MSG_RESOLVE_BRICK_FAIL,
+                   "Failed to resolve brick %s with host %s of volume %s"
+                   " in restore",
+                   brick->path, brick->hostname, vol->volname);
+            goto out;
+        }
+    }
+
+    /* Resolve bricks of snapshot volumes */
+    cds_list_for_each_entry(snap, &conf->snapshots, snap_list)
+    {
+        ret = glusterd_resolve_snap_bricks(this, snap);
+        if (!ret)
+            continue;
+
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_SNAP_RESOLVE_BRICK_FAIL,
+               "resolving the snap bricks"
+               " failed for snap: %s",
+               snap->snapname);
+        goto out;
+    }
+
+out:
+    gf_msg_trace(this->name, 0, "Returning with %d", ret);
+    return ret;
+}
+
+/* Restore glusterd's state from the on-disk store.
+ *
+ * Steps, in order: options, volumes, peers, snapshots, brick resolution,
+ * snapshot cleanup, snap brick mounts. The first failing step ends the
+ * restore and its result is returned.
+ */
+int32_t
+glusterd_restore()
+{
+    xlator_t *this = NULL;
+    int32_t ret = -1;
+
+    this = THIS;
+
+    /* Only a negative result of the options step is treated as failure */
+    ret = glusterd_options_init(this);
+    if (ret < 0)
+        goto out;
+
+    ret = glusterd_store_retrieve_volumes(this, NULL);
+    if (!ret)
+        ret = glusterd_store_retrieve_peers(this);
+
+    /* While retrieving snapshots, if the snapshot status
+       is not GD_SNAP_STATUS_IN_USE, then the snapshot is
+       cleaned up. To do that, the snap volume has to be
+       stopped by stopping snapshot volume's bricks. And for
+       that the snapshot bricks should be resolved. But without
+       retrieving the peers, resolving bricks will fail. So
+       do retrieving of snapshots after retrieving peers.
+    */
+    if (!ret)
+        ret = glusterd_store_retrieve_snaps(this);
+    if (!ret)
+        ret = glusterd_resolve_all_bricks(this);
+    if (ret)
+        goto out;
+
+    ret = glusterd_snap_cleanup(this);
+    if (ret) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_SNAP_CLEANUP_FAIL,
+               "Failed to perform "
+               "a cleanup of the snapshots");
+        goto out;
+    }
+
+    ret = glusterd_recreate_all_snap_brick_mounts(this);
+    if (ret)
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_SNAP_BRK_MNT_RECREATE_FAIL,
+               "Failed to recreate "
+               "all snap brick mounts");
+
+out:
+    gf_msg_debug(this->name, 0, "Returning %d", ret);
+    return ret;
+}
+
+/* Load the "version" value from the volume's quota checksum file into
+ * volinfo->quota_conf_version.
+ *
+ * Returns -1 if the file path does not fit or no store handle can be
+ * created. A missing version key returns 0. A version that is not a valid
+ * unsigned 32-bit decimal number is logged at debug level and leaves
+ * quota_conf_version untouched; as before, the function still returns 0
+ * in that case.
+ */
+int
+glusterd_store_retrieve_quota_version(glusterd_volinfo_t *volinfo)
+{
+    int ret = -1;
+    unsigned long parsed = 0;
+    char cksum_path[PATH_MAX] = {
+        0,
+    };
+    char path[PATH_MAX] = {
+        0,
+    };
+    char *version_str = NULL;
+    char *tmp = NULL;
+    xlator_t *this = NULL;
+    glusterd_conf_t *conf = NULL;
+    gf_store_handle_t *handle = NULL;
+    int32_t len = 0;
+
+    this = THIS;
+    GF_ASSERT(this);
+    conf = this->private;
+    GF_ASSERT(conf);
+
+    GLUSTERD_GET_VOLUME_DIR(path, volinfo, conf);
+    len = snprintf(cksum_path, sizeof(cksum_path), "%s/%s", path,
+                   GLUSTERD_VOL_QUOTA_CKSUM_FILE);
+    if ((len < 0) || (len >= sizeof(cksum_path))) {
+        goto out;
+    }
+
+    ret = gf_store_handle_new(cksum_path, &handle);
+    if (ret) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_STORE_HANDLE_GET_FAIL,
+               "Unable to get store handle "
+               "for %s",
+               cksum_path);
+        goto out;
+    }
+
+    ret = gf_store_retrieve_value(handle, "version", &version_str);
+    if (ret) {
+        gf_msg_debug(this->name, 0, "Version absent");
+        ret = 0;
+        goto out;
+    }
+
+    /* strtoul() only sets errno on failure, so it has to be cleared first;
+     * otherwise a stale ERANGE/EINVAL from an earlier call would make a
+     * perfectly valid version look invalid. */
+    errno = 0;
+    parsed = strtoul(version_str, &tmp, 10);
+    if ((errno == ERANGE) || (errno == EINVAL) || (tmp == version_str) ||
+        (parsed > UINT32_MAX)) {
+        /* tmp == version_str: no digits were consumed at all.
+         * parsed > UINT32_MAX: would not fit quota_conf_version. */
+        gf_msg_debug(this->name, 0, "Invalid version number");
+        goto out;
+    }
+    volinfo->quota_conf_version = (uint32_t)parsed;
+    ret = 0;
+
+out:
+    if (version_str)
+        GF_FREE(version_str);
+    gf_store_handle_destroy(handle);
+    return ret;
+}
+
+/* Store the volume's quota conf checksum and version as
+ * "cksum=<n>\nversion=<n>\n" in the quota checksum file of the volume
+ * directory.
+ *
+ * The content goes to a temporary file of a freshly created store handle
+ * and is renamed into place. The temporary file is unlinked when the
+ * result is negative after it was created. The store handle is always
+ * destroyed before returning.
+ */
+int
+glusterd_store_save_quota_version_and_cksum(glusterd_volinfo_t *volinfo)
+{
+    char path[PATH_MAX] = {0};
+    char cksum_file[PATH_MAX + 32] = {0};
+    char content[64] = {0};
+    gf_store_handle_t *handle = NULL;
+    glusterd_conf_t *conf = NULL;
+    xlator_t *this = NULL;
+    int32_t n = 0;
+    int32_t ret = -1;
+    int tmp_fd = -1;
+
+    this = THIS;
+    conf = this->private;
+
+    GLUSTERD_GET_VOLUME_DIR(path, volinfo, conf);
+    n = snprintf(cksum_file, sizeof(cksum_file), "%s/%s", path,
+                 GLUSTERD_VOL_QUOTA_CKSUM_FILE);
+    if ((n < 0) || (n >= sizeof(cksum_file)))
+        goto out;
+
+    ret = gf_store_handle_new(cksum_file, &handle);
+    if (ret)
+        goto out;
+
+    tmp_fd = gf_store_mkstemp(handle);
+    if (tmp_fd <= 0) {
+        ret = -1;
+        goto out;
+    }
+
+    n = snprintf(content, sizeof(content), "cksum=%u\nversion=%u\n",
+                 volinfo->quota_conf_cksum, volinfo->quota_conf_version);
+    if (n < 0 || n >= sizeof(content)) {
+        ret = -1;
+        goto out;
+    }
+
+    ret = gf_store_save_items(tmp_fd, content);
+    if (ret) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_CKSUM_STORE_FAIL,
+               "Failed to store quota cksum and version");
+        goto out;
+    }
+
+    ret = gf_store_rename_tmppath(handle);
+
+out:
+    if ((ret < 0) && (tmp_fd > 0))
+        gf_store_unlink_tmppath(handle);
+    gf_store_handle_destroy(handle);
+    return ret;
+}
+
+/* Write the quota.conf header to fd.
+ *
+ * QUOTA_CONF_HEADER_1_1 is written when the cluster op-version is below
+ * GD_OP_VERSION_3_7_0, QUOTA_CONF_HEADER otherwise. Returns 0 when the
+ * whole header was written and -1 (after logging) otherwise.
+ */
+int32_t
+glusterd_quota_conf_write_header(int fd)
+{
+    const char *header = NULL;
+    glusterd_conf_t *conf = NULL;
+    xlator_t *this = NULL;
+    int header_len = 0;
+    int ret = -1;
+
+    this = THIS;
+    GF_VALIDATE_OR_GOTO("quota", this, out);
+
+    conf = this->private;
+    GF_VALIDATE_OR_GOTO(this->name, conf, out);
+
+    if (conf->op_version < GD_OP_VERSION_3_7_0) {
+        header = QUOTA_CONF_HEADER_1_1;
+        header_len = SLEN(QUOTA_CONF_HEADER_1_1);
+    } else {
+        header = QUOTA_CONF_HEADER;
+        header_len = SLEN(QUOTA_CONF_HEADER);
+    }
+
+    /* Anything but a complete write counts as failure */
+    ret = gf_nwrite(fd, header, header_len);
+    ret = (ret == header_len) ? 0 : -1;
+
+out:
+    if (ret < 0)
+        gf_msg_callingfn("quota", GF_LOG_ERROR, 0, GD_MSG_QUOTA_CONF_WRITE_FAIL,
+                         "failed to write "
+                         "header to a quota conf");
+
+    return ret;
+}
+
+/* Write one gfid record to a quota conf file.
+ *
+ * The record is the 16 bytes at buf, followed by the single type byte when
+ * the cluster op-version is at least GD_OP_VERSION_3_7_0. Returns 0 when
+ * everything was written and -1 (after logging) otherwise.
+ */
+int32_t
+glusterd_quota_conf_write_gfid(int fd, void *buf, char type)
+{
+    glusterd_conf_t *conf = NULL;
+    xlator_t *this = NULL;
+    int ret = -1;
+
+    this = THIS;
+    GF_VALIDATE_OR_GOTO("quota", this, out);
+
+    conf = this->private;
+    GF_VALIDATE_OR_GOTO(this->name, conf, out);
+
+    /* ret is still -1 here, so a short write can jump straight to out */
+    if (gf_nwrite(fd, buf, 16) != 16)
+        goto out;
+
+    if ((conf->op_version >= GD_OP_VERSION_3_7_0) &&
+        (gf_nwrite(fd, &type, 1) != 1))
+        goto out;
+
+    ret = 0;
+
+out:
+    if (ret < 0)
+        gf_msg_callingfn("quota", GF_LOG_ERROR, 0, GD_MSG_QUOTA_CONF_WRITE_FAIL,
+                         "failed to write "
+                         "gfid %s to a quota conf",
+                         uuid_utoa(buf));
+
+    return ret;
+}
diff --git a/xlators/mgmt/glusterd/src/glusterd-store.h b/xlators/mgmt/glusterd/src/glusterd-store.h
new file mode 100644
index 00000000000..83f4df0783e
--- /dev/null
+++ b/xlators/mgmt/glusterd/src/glusterd-store.h
@@ -0,0 +1,216 @@
+/*
+   Copyright (c) 2006-2012 Red Hat, Inc. <http://www.redhat.com>
+   This file is part of GlusterFS.
+
+   This file is licensed to you under your choice of the GNU Lesser
+   General Public License, version 3 or any later version (LGPLv3 or
+   later), or the GNU General Public License, version 2 (GPLv2), in all
+   cases as published by the Free Software Foundation.
+*/
+#ifndef _GLUSTERD_HA_H_
+#define _GLUSTERD_HA_H_
+
+#include <pthread.h>
+#include <glusterfs/compat-uuid.h>
+
+#include <glusterfs/glusterfs.h>
+#include <glusterfs/xlator.h>
+#include <glusterfs/run.h>
+#include <glusterfs/logging.h>
+#include <glusterfs/call-stub.h>
+#include <glusterfs/byte-order.h>
+#include "glusterd.h"
+#include "rpcsvc.h"
+
+/*
+ * Action selector taken as the "ac" argument of glusterd_store_volinfo() and
+ * glusterd_perform_volinfo_version_action().
+ * NOTE(review): judging by the enumerator names it chooses whether the
+ * volinfo version is left alone, incremented or decremented -- confirm
+ * against the implementation.
+ */
+typedef enum glusterd_store_ver_ac_ {
+    GLUSTERD_VOLINFO_VER_AC_NONE = 0,
+    GLUSTERD_VOLINFO_VER_AC_INCREMENT = 1,
+    GLUSTERD_VOLINFO_VER_AC_DECREMENT = 2,
+} glusterd_volinfo_ver_ac_t;
+
+/*
+ * UUID_SIZE: presumably the length of a UUID in textual form, without the
+ * terminating NUL -- TODO confirm against its users.
+ * VOLINFO_BUFFER_SIZE: size of the staging buffer embedded in
+ * struct glusterd_volinfo_data_store_.
+ */
+#define UUID_SIZE 36
+#define VOLINFO_BUFFER_SIZE 4093
+#define GLUSTERD_STORE_UUID_KEY "UUID"
+
+/*
+ * Key strings for per-volume settings (presumably the key half of the
+ * key-value pairs kept in glusterd's store files -- confirm against the
+ * store code).
+ * NOTE(review): GLUSTERD_STORE_KEY_VOL_STATUS and
+ * GLUSTERD_STORE_KEY_VOL_DEFRAG_STATUS both expand to "status".
+ */
+#define GLUSTERD_STORE_KEY_VOL_TYPE "type"
+#define GLUSTERD_STORE_KEY_VOL_COUNT "count"
+#define GLUSTERD_STORE_KEY_VOL_STATUS "status"
+#define GLUSTERD_STORE_KEY_VOL_PORT "port"
+#define GLUSTERD_STORE_KEY_VOL_SUB_COUNT "sub_count"
+#define GLUSTERD_STORE_KEY_VOL_STRIPE_CNT "stripe_count"
+#define GLUSTERD_STORE_KEY_VOL_REPLICA_CNT "replica_count"
+#define GLUSTERD_STORE_KEY_VOL_DISPERSE_CNT "disperse_count"
+#define GLUSTERD_STORE_KEY_VOL_REDUNDANCY_CNT "redundancy_count"
+#define GLUSTERD_STORE_KEY_VOL_ARBITER_CNT "arbiter_count"
+#define GLUSTERD_STORE_KEY_VOL_THIN_ARBITER_CNT "thin_arbiter_count"
+#define GLUSTERD_STORE_KEY_VOL_BRICK "brick"
+#define GLUSTERD_STORE_KEY_VOL_TA_BRICK "ta-brick"
+#define GLUSTERD_STORE_KEY_VOL_VERSION "version"
+#define GLUSTERD_STORE_KEY_VOL_TRANSPORT "transport-type"
+#define GLUSTERD_STORE_KEY_VOL_ID "volume-id"
+#define GLUSTERD_STORE_KEY_VOL_RESTORED_SNAP "restored_from_snap"
+#define GLUSTERD_STORE_KEY_RB_STATUS "rb_status"
+#define GLUSTERD_STORE_KEY_RB_SRC_BRICK "rb_src"
+#define GLUSTERD_STORE_KEY_RB_DST_BRICK "rb_dst"
+#define GLUSTERD_STORE_KEY_RB_DST_PORT "rb_port"
+#define GLUSTERD_STORE_KEY_VOL_DEFRAG "rebalance_status"
+#define GLUSTERD_STORE_KEY_VOL_DEFRAG_STATUS "status"
+#define GLUSTERD_STORE_KEY_DEFRAG_OP "rebalance_op"
+#define GLUSTERD_STORE_KEY_USERNAME "username"
+#define GLUSTERD_STORE_KEY_PASSWORD "password"
+#define GLUSTERD_STORE_KEY_PARENT_VOLNAME "parent_volname"
+#define GLUSTERD_STORE_KEY_VOL_OP_VERSION "op-version"
+#define GLUSTERD_STORE_KEY_VOL_CLIENT_OP_VERSION "client-op-version"
+#define GLUSTERD_STORE_KEY_VOL_QUOTA_VERSION "quota-version"
+
+/*
+ * Key strings for snapshot-related settings (going by the _SNAP_/_SNAPD_
+ * names -- confirm against the snapshot store code).
+ * NOTE(review): GLUSTERD_STORE_KEY_SNAP_STATUS expands to "status", the same
+ * string as GLUSTERD_STORE_KEY_VOL_STATUS.
+ */
+#define GLUSTERD_STORE_KEY_SNAP_NAME "name"
+#define GLUSTERD_STORE_KEY_SNAP_ID "snap-id"
+#define GLUSTERD_STORE_KEY_SNAP_DESC "desc"
+#define GLUSTERD_STORE_KEY_SNAP_TIMESTAMP "time-stamp"
+#define GLUSTERD_STORE_KEY_SNAP_STATUS "status"
+#define GLUSTERD_STORE_KEY_SNAP_RESTORED "snap-restored"
+#define GLUSTERD_STORE_KEY_SNAP_MAX_HARD_LIMIT "snap-max-hard-limit"
+#define GLUSTERD_STORE_KEY_SNAP_AUTO_DELETE "auto-delete"
+#define GLUSTERD_STORE_KEY_SNAP_MAX_SOFT_LIMIT "snap-max-soft-limit"
+#define GLUSTERD_STORE_KEY_SNAPD_PORT "snapd-port"
+#define GLUSTERD_STORE_KEY_SNAP_ACTIVATE "snap-activate-on-create"
+
+/*
+ * Key strings for per-brick settings (going by the _BRICK_ names -- confirm
+ * against the brick store code).
+ * NOTE(review): GLUSTERD_STORE_KEY_BRICK_HOSTNAME and
+ * GLUSTERD_STORE_KEY_BRICK_UUID expand to the same strings as the
+ * corresponding GLUSTERD_STORE_KEY_PEER_* keys ("hostname", "uuid").
+ */
+#define GLUSTERD_STORE_KEY_BRICK_HOSTNAME "hostname"
+#define GLUSTERD_STORE_KEY_BRICK_PATH "path"
+#define GLUSTERD_STORE_KEY_BRICK_REAL_PATH "real_path"
+#define GLUSTERD_STORE_KEY_BRICK_PORT "listen-port"
+#define GLUSTERD_STORE_KEY_BRICK_RDMA_PORT "rdma.listen-port"
+#define GLUSTERD_STORE_KEY_BRICK_DECOMMISSIONED "decommissioned"
+#define GLUSTERD_STORE_KEY_BRICK_VGNAME "vg"
+#define GLUSTERD_STORE_KEY_BRICK_DEVICE_PATH "device_path"
+#define GLUSTERD_STORE_KEY_BRICK_MOUNT_DIR "mount_dir"
+#define GLUSTERD_STORE_KEY_BRICK_SNAP_STATUS "snap-status"
+#define GLUSTERD_STORE_KEY_BRICK_FSTYPE "fs-type"
+#define GLUSTERD_STORE_KEY_BRICK_MNTOPTS "mnt-opts"
+#define GLUSTERD_STORE_KEY_BRICK_ID "brick-id"
+#define GLUSTERD_STORE_KEY_BRICK_FSID "brick-fsid"
+#define GLUSTERD_STORE_KEY_BRICK_UUID "uuid"
+
+/* Key strings for peer entries (going by the _PEER_ names -- confirm). */
+#define GLUSTERD_STORE_KEY_PEER_UUID "uuid"
+#define GLUSTERD_STORE_KEY_PEER_HOSTNAME "hostname"
+#define GLUSTERD_STORE_KEY_PEER_STATE "state"
+#define GLUSTERD_STORE_KEY_VOL_CAPS "caps" /* left just for backward compat */
+
+/* Key strings for rebalance ("defrag") counters, going by the names. */
+#define GLUSTERD_STORE_KEY_VOL_DEFRAG_REB_FILES "rebalanced-files"
+#define GLUSTERD_STORE_KEY_VOL_DEFRAG_SIZE "size"
+#define GLUSTERD_STORE_KEY_VOL_DEFRAG_SCANNED "scanned"
+#define GLUSTERD_STORE_KEY_VOL_DEFRAG_FAILURES "failures"
+#define GLUSTERD_STORE_KEY_VOL_DEFRAG_SKIPPED "skipped"
+#define GLUSTERD_STORE_KEY_VOL_DEFRAG_RUN_TIME "run-time"
+
+/* Key strings for migration counters, going by the names. */
+#define GLUSTERD_STORE_KEY_VOL_MIGRATED_FILES "migrated-files"
+#define GLUSTERD_STORE_KEY_VOL_MIGRATED_SIZE "migration-size"
+#define GLUSTERD_STORE_KEY_VOL_MIGRATIONS_SCANNED "migration-scanned"
+#define GLUSTERD_STORE_KEY_VOL_MIGRATIONS_FAILURES "migration-failures"
+#define GLUSTERD_STORE_KEY_VOL_MIGRATIONS_SKIPPED "migration-skipped"
+#define GLUSTERD_STORE_KEY_VOL_MIGRATION_RUN_TIME "migration-run-time"
+
+/* NOTE(review): looks like the key of a cluster-wide NFS-Ganesha option. */
+#define GLUSTERD_STORE_KEY_GANESHA_GLOBAL "nfs-ganesha"
+
+/*
+ * Staging area for writes to a store file: key-value pairs are collected in
+ * `buffer` before they are finally written to the file behind `shandle`.
+ *
+ * NOTE(review): buffer_len presumably tracks how many bytes of `buffer` are
+ * in use; it is an int16_t, which is wide enough for VOLINFO_BUFFER_SIZE
+ * (4093) -- keep that in mind if the buffer is ever enlarged.
+ */
+struct glusterd_volinfo_data_store_ {
+    gf_store_handle_t *shandle; /*Contains fd and path of the file */
+    int16_t buffer_len;
+    char key_check; /* flag to check if key is to be validated before write*/
+    char buffer[VOLINFO_BUFFER_SIZE];
+};
+typedef struct glusterd_volinfo_data_store_ glusterd_volinfo_data_store_t;
+
+int32_t
+glusterd_store_volinfo(glusterd_volinfo_t *volinfo,
+                       glusterd_volinfo_ver_ac_t ac);
+
+int32_t
+glusterd_store_delete_volume(glusterd_volinfo_t *volinfo);
+
+int32_t
+glusterd_store_delete_snap(glusterd_snap_t *snap);
+
+/*
+ * Takes no arguments.  Spelled "(void)" so that this is a real prototype:
+ * with empty parentheses a C compiler accepts any argument list silently.
+ */
+int32_t
+glusterd_retrieve_uuid(void);
+
+int32_t
+glusterd_store_peerinfo(glusterd_peerinfo_t *peerinfo);
+
+int32_t
+glusterd_store_delete_peerinfo(glusterd_peerinfo_t *peerinfo);
+
+int32_t
+glusterd_store_delete_brick(glusterd_brickinfo_t *brickinfo, char *delete_path);
+
+/*
+ * Takes no arguments.  Spelled "(void)" so that this is a real prototype:
+ * with empty parentheses a C compiler accepts any argument list silently.
+ */
+int32_t
+glusterd_restore(void);
+
+void
+glusterd_perform_volinfo_version_action(glusterd_volinfo_t *volinfo,
+                                        glusterd_volinfo_ver_ac_t ac);
+gf_boolean_t
+glusterd_store_is_valid_brickpath(char *volname, char *brick);
+
+int32_t
+glusterd_store_perform_node_state_store(glusterd_volinfo_t *volinfo);
+
+int
+glusterd_retrieve_op_version(xlator_t *this, int *op_version);
+
+int
+glusterd_retrieve_max_op_version(xlator_t *this, int *op_version);
+
+int
+glusterd_store_max_op_version(xlator_t *this);
+
+int
+glusterd_store_global_info(xlator_t *this);
+
+int32_t
+glusterd_store_retrieve_options(xlator_t *this);
+
+int32_t
+glusterd_store_retrieve_bricks(glusterd_volinfo_t *volinfo);
+
+int32_t
+glusterd_store_options(xlator_t *this, dict_t *opts);
+
+void
+glusterd_replace_slash_with_hyphen(char *str);
+
+int32_t
+glusterd_store_create_quota_conf_sh_on_absence(glusterd_volinfo_t *volinfo);
+
+int
+glusterd_store_retrieve_quota_version(glusterd_volinfo_t *volinfo);
+
+int
+glusterd_store_save_quota_version_and_cksum(glusterd_volinfo_t *volinfo);
+
+int32_t
+glusterd_store_snap(glusterd_snap_t *snap);
+
+/*
+ * Takes no arguments.  Spelled "(void)" so that this is a real prototype:
+ * with empty parentheses a C compiler accepts any argument list silently.
+ */
+int32_t
+glusterd_store_update_missed_snaps(void);
+
+glusterd_volinfo_t *
+glusterd_store_retrieve_volume(char *volname, glusterd_snap_t *snap);
+
+int
+glusterd_restore_op_version(xlator_t *this);
+
+int32_t
+glusterd_quota_conf_write_header(int fd);
+
+int32_t
+glusterd_quota_conf_write_gfid(int fd, void *buf, char type);
+
+int32_t
+glusterd_recreate_vol_brick_mounts(xlator_t *this, glusterd_volinfo_t *volinfo);
+
+#endif
diff --git a/xlators/mgmt/glusterd/src/glusterd-svc-helper.c b/xlators/mgmt/glusterd/src/glusterd-svc-helper.c
new file mode 100644
index 00000000000..ca845903c4f
--- /dev/null
+++ b/xlators/mgmt/glusterd/src/glusterd-svc-helper.c
@@ -0,0 +1,1047 @@
+/*
+   Copyright (c) 2014 Red Hat, Inc. <http://www.redhat.com>
+   This file is part of GlusterFS.
+
+   This file is licensed to you under your choice of the GNU Lesser
+   General Public License, version 3 or any later version (LGPLv3 or
+   later), or the GNU General Public License, version 2 (GPLv2), in all
+   cases as published by the Free Software Foundation.
+*/
+#include <signal.h>
+
+#include <glusterfs/globals.h>
+#include <glusterfs/run.h>
+#include "glusterd.h"
+#include <glusterfs/glusterfs.h>
+#include "glusterd-utils.h"
+#include "glusterd-svc-mgmt.h"
+#include "glusterd-shd-svc.h"
+#include "glusterd-quotad-svc.h"
+#ifdef BUILD_GNFS
+#include "glusterd-nfs-svc.h"
+#endif
+#include "glusterd-bitd-svc.h"
+#include "glusterd-shd-svc-helper.h"
+#include "glusterd-scrub-svc.h"
+#include "glusterd-svc-helper.h"
+#include <glusterfs/syscall.h>
+#include "glusterd-snapshot-utils.h"
+
+/*
+ * Reconfigure the daemons managed by glusterd one after another: nfs (only
+ * in BUILD_GNFS builds), self-heald (only when @volinfo is given), then
+ * quotad, bitd and the scrubber.  The last three are skipped when
+ * conf->op_version equals GD_OP_VERSION_MIN.
+ *
+ * The sequence stops at the first failure; the name of the failing daemon is
+ * published in an EVENT_SVC_RECONFIGURE_FAILED event and its error code is
+ * returned.  Returns 0 when nothing failed.
+ */
+int
+glusterd_svcs_reconfigure(glusterd_volinfo_t *volinfo)
+{
+    int ret = 0;
+    char *svc_name = NULL; /* daemon currently handled, for the event */
+    glusterd_conf_t *conf = NULL;
+    xlator_t *this = NULL;
+
+    this = THIS;
+    GF_ASSERT(this);
+
+    conf = this->private;
+    GF_ASSERT(conf);
+
+#ifdef BUILD_GNFS
+    svc_name = "nfs";
+    ret = glusterd_nfssvc_reconfigure();
+    if (ret)
+        goto out;
+#endif
+    /* ret is 0 here, so the check below only fires for a failed shd step */
+    svc_name = "self-heald";
+    if (volinfo)
+        ret = glusterd_shdsvc_reconfigure(volinfo);
+    if (ret)
+        goto out;
+
+    if (conf->op_version == GD_OP_VERSION_MIN)
+        goto out;
+
+    svc_name = "quotad";
+    ret = glusterd_quotadsvc_reconfigure();
+    if (!ret) {
+        svc_name = "bitd";
+        ret = glusterd_bitdsvc_reconfigure();
+    }
+    if (!ret) {
+        svc_name = "scrubber";
+        ret = glusterd_scrubsvc_reconfigure();
+    }
+out:
+    if (ret && svc_name)
+        gf_event(EVENT_SVC_RECONFIGURE_FAILED, "svc_name=%s", svc_name);
+    return ret;
+}
+
+/*
+ * Stop the daemons managed by glusterd in a fixed order: nfs (BUILD_GNFS
+ * builds only, with SIGKILL), then quotad, the self-heal daemon of @volinfo
+ * (skipped when @volinfo is NULL), bitd and the scrubber, all with SIGTERM.
+ *
+ * The first non-zero result ends the sequence and is returned; 0 means every
+ * stop call succeeded.
+ */
+int
+glusterd_svcs_stop(glusterd_volinfo_t *volinfo)
+{
+    xlator_t *this = THIS;
+    glusterd_conf_t *priv = NULL;
+    int ret = 0;
+
+    GF_ASSERT(this);
+
+    priv = this->private;
+    GF_ASSERT(priv);
+
+#ifdef BUILD_GNFS
+    ret = priv->nfs_svc.stop(&priv->nfs_svc, SIGKILL);
+#endif
+    /* Each later step runs only while everything before it succeeded */
+    if (!ret)
+        ret = priv->quotad_svc.stop(&priv->quotad_svc, SIGTERM);
+
+    if (!ret && volinfo)
+        ret = volinfo->shd.svc.stop(&volinfo->shd.svc, SIGTERM);
+
+    if (!ret)
+        ret = priv->bitd_svc.stop(&priv->bitd_svc, SIGTERM);
+
+    if (!ret)
+        ret = priv->scrub_svc.stop(&priv->scrub_svc, SIGTERM);
+
+    return ret;
+}
+
+/*
+ * Run the manager hook of every daemon with PROC_START_NO_WAIT: nfs (only
+ * in BUILD_GNFS builds), then quotad, bitd, the self-heal daemon of
+ * @volinfo (when given) and the scrubber.
+ *
+ * Nothing is done for snapshot volumes, and only nfs is handled when
+ * conf->op_version equals GD_OP_VERSION_MIN.  A manager returning -EINVAL is
+ * treated as success; any other non-zero result stops the sequence and is
+ * returned.
+ */
+int
+glusterd_svcs_manager(glusterd_volinfo_t *volinfo)
+{
+    int ret = 0;
+    xlator_t *this = THIS;
+    glusterd_conf_t *conf = NULL;
+
+    GF_ASSERT(this);
+
+    conf = this->private;
+    GF_ASSERT(conf);
+
+    if (volinfo && volinfo->is_snap_volume)
+        return 0;
+
+/* "#ifdef", like every other BUILD_GNFS guard in this file: "#if" breaks
+ * when the macro is defined without a value. */
+#ifdef BUILD_GNFS
+    ret = conf->nfs_svc.manager(&(conf->nfs_svc), NULL, PROC_START_NO_WAIT);
+    if (ret)
+        goto out;
+#endif
+    if (conf->op_version == GD_OP_VERSION_MIN)
+        goto out;
+
+    ret = conf->quotad_svc.manager(&(conf->quotad_svc), volinfo,
+                                   PROC_START_NO_WAIT);
+    if (ret == -EINVAL)
+        ret = 0;
+    if (ret)
+        goto out;
+
+    ret = conf->bitd_svc.manager(&(conf->bitd_svc), NULL, PROC_START_NO_WAIT);
+    if (ret == -EINVAL)
+        ret = 0;
+    if (ret)
+        goto out;
+
+    if (volinfo) {
+        ret = volinfo->shd.svc.manager(&(volinfo->shd.svc), volinfo,
+                                       PROC_START_NO_WAIT);
+        if (ret == -EINVAL)
+            ret = 0;
+        if (ret)
+            goto out;
+    }
+
+    ret = conf->scrub_svc.manager(&(conf->scrub_svc), NULL, PROC_START_NO_WAIT);
+    if (ret == -EINVAL)
+        ret = 0;
+out:
+    return ret;
+}
+
+/*
+ * Check whether the volfile currently in place for daemon @svc_name is
+ * identical to the one @builder would generate now.
+ *
+ * A fresh volfile is generated into a mkstemp() file under /tmp and compared
+ * with the existing one by glusterd_check_files_identical(), which stores
+ * its verdict in *@identical.  The temporary file is removed before
+ * returning.
+ *
+ * Returns 0 on success; a negative/non-zero value when the private data is
+ * missing, the temporary file cannot be created, or generation/comparison
+ * fails.
+ */
+int
+glusterd_svc_check_volfile_identical(char *svc_name,
+                                     glusterd_graph_builder_t builder,
+                                     gf_boolean_t *identical)
+{
+    char orgvol[PATH_MAX] = {
+        0,
+    };
+    char *tmpvol = NULL;
+    glusterd_conf_t *conf = NULL;
+    xlator_t *this = NULL;
+    int ret = -1;
+    int need_unlink = 0;
+    int tmp_fd = -1;
+
+    this = THIS;
+
+    GF_ASSERT(this);
+    GF_ASSERT(identical);
+    conf = this->private;
+    /* conf->workdir is read right below: fail cleanly rather than crash */
+    GF_VALIDATE_OR_GOTO(this->name, conf, out);
+
+    glusterd_svc_build_volfile_path(svc_name, conf->workdir, orgvol,
+                                    sizeof(orgvol));
+
+    ret = gf_asprintf(&tmpvol, "/tmp/g%s-XXXXXX", svc_name);
+    if (ret < 0) {
+        goto out;
+    }
+
+    /* coverity[SECURE_TEMP] mkstemp uses 0600 as the mode and is safe */
+    tmp_fd = mkstemp(tmpvol);
+    if (tmp_fd < 0) {
+        gf_msg(this->name, GF_LOG_WARNING, errno, GD_MSG_FILE_OP_FAILED,
+               "Unable to create temp file"
+               " %s:(%s)",
+               tmpvol, strerror(errno));
+        ret = -1;
+        goto out;
+    }
+
+    /* From here on the temporary file exists and must be removed */
+    need_unlink = 1;
+
+    ret = glusterd_create_global_volfile(builder, tmpvol, NULL);
+    if (ret)
+        goto out;
+
+    ret = glusterd_check_files_identical(orgvol, tmpvol, identical);
+out:
+    if (need_unlink)
+        sys_unlink(tmpvol);
+
+    if (tmpvol != NULL)
+        GF_FREE(tmpvol);
+
+    if (tmp_fd >= 0)
+        sys_close(tmp_fd);
+
+    return ret;
+}
+
+/*
+ * Check whether the volfile currently in place for daemon @svc_name has the
+ * same topology as the one @builder would generate now.
+ *
+ * A fresh volfile is generated into a mkstemp() file under /tmp and handed,
+ * together with the existing one, to glusterd_check_topology_identical(),
+ * which stores its verdict in *@identical.  The temporary file is removed
+ * before returning.
+ *
+ * Returns 0 on success; -1 for invalid arguments or when the temporary file
+ * cannot be created; otherwise the error of the failing step.
+ */
+int
+glusterd_svc_check_topology_identical(char *svc_name,
+                                      glusterd_graph_builder_t builder,
+                                      gf_boolean_t *identical)
+{
+    char orgvol[PATH_MAX] = {
+        0,
+    };
+    char *tmpvol = NULL;
+    glusterd_conf_t *conf = NULL;
+    xlator_t *this = THIS;
+    int ret = -1;
+    int tmpclean = 0;
+    int tmpfd = -1;
+
+    if ((!identical) || (!this) || (!this->private)) {
+        /* "this" may be the NULL operand here, so it must not be
+         * dereferenced for the log domain; errno holds nothing relevant. */
+        gf_smsg(this ? this->name : "glusterd", GF_LOG_ERROR, EINVAL,
+                GD_MSG_INVALID_ARGUMENT, NULL);
+        goto out;
+    }
+
+    /* Known to be non-NULL thanks to the guard above */
+    conf = this->private;
+
+    /* Fetch the original volfile */
+    glusterd_svc_build_volfile_path(svc_name, conf->workdir, orgvol,
+                                    sizeof(orgvol));
+
+    /* Create the temporary volfile */
+    ret = gf_asprintf(&tmpvol, "/tmp/g%s-XXXXXX", svc_name);
+    if (ret < 0) {
+        goto out;
+    }
+
+    /* coverity[SECURE_TEMP] mkstemp uses 0600 as the mode and is safe */
+    tmpfd = mkstemp(tmpvol);
+    if (tmpfd < 0) {
+        gf_msg(this->name, GF_LOG_WARNING, errno, GD_MSG_FILE_OP_FAILED,
+               "Unable to create temp file"
+               " %s:(%s)",
+               tmpvol, strerror(errno));
+        ret = -1;
+        goto out;
+    }
+
+    tmpclean = 1; /* SET the flag to unlink() tmpfile */
+
+    ret = glusterd_create_global_volfile(builder, tmpvol, NULL);
+    if (ret)
+        goto out;
+
+    /* Compare the topology of volfiles */
+    ret = glusterd_check_topology_identical(orgvol, tmpvol, identical);
+out:
+    if (tmpfd >= 0)
+        sys_close(tmpfd);
+    if (tmpclean)
+        sys_unlink(tmpvol);
+    if (tmpvol != NULL)
+        GF_FREE(tmpvol);
+    return ret;
+}
+
+/*
+ * Volume-level volfile comparison: regenerate the volfile of daemon
+ * @svc_name for @volinfo with @builder into a mkstemp() file under /tmp and
+ * compare it with the volfile currently in place.
+ *
+ * @mode_dict is handed to @builder as is.  On success 0 is returned and the
+ * verdict of glusterd_check_files_identical() is stored in *@identical;
+ * otherwise the result of the failing step is returned.  Once created, the
+ * temporary file is always removed again.
+ */
+int
+glusterd_volume_svc_check_volfile_identical(
+    char *svc_name, dict_t *mode_dict, glusterd_volinfo_t *volinfo,
+    glusterd_vol_graph_builder_t builder, gf_boolean_t *identical)
+{
+    xlator_t *this = THIS;
+    char *scratch = NULL; /* mkstemp() template, then the temp file's path */
+    char current[PATH_MAX] = {
+        0,
+    };
+    int scratch_fd = -1;
+    int ret = -1;
+
+    GF_VALIDATE_OR_GOTO("glusterd", this, out);
+    GF_VALIDATE_OR_GOTO(this->name, identical, out);
+
+    /* This builds volfile for volume level daemons */
+    glusterd_volume_svc_build_volfile_path(svc_name, volinfo, current,
+                                           sizeof(current));
+
+    ret = gf_asprintf(&scratch, "/tmp/g%s-XXXXXX", svc_name);
+    if (ret < 0)
+        goto out;
+
+    /* coverity[SECURE_TEMP] mkstemp uses 0600 as the mode and is safe */
+    scratch_fd = mkstemp(scratch);
+    if (scratch_fd < 0) {
+        gf_msg(this->name, GF_LOG_WARNING, errno, GD_MSG_FILE_OP_FAILED,
+               "Unable to create temp file"
+               " %s:(%s)",
+               scratch, strerror(errno));
+        ret = -1;
+        goto out;
+    }
+
+    ret = builder(volinfo, scratch, mode_dict);
+    if (!ret)
+        ret = glusterd_check_files_identical(current, scratch, identical);
+out:
+    /* A valid descriptor means mkstemp() created the file: remove it */
+    if (scratch_fd >= 0) {
+        sys_unlink(scratch);
+        sys_close(scratch_fd);
+    }
+
+    if (scratch)
+        GF_FREE(scratch);
+
+    return ret;
+}
+
+/*
+ * Volume-level topology comparison: regenerate the volfile of daemon
+ * @svc_name for @volinfo with @builder into a mkstemp() file under /tmp and
+ * let glusterd_check_topology_identical() compare it with the volfile
+ * currently in place; the verdict is stored in *@identical.
+ *
+ * @mode_dict is handed to @builder as is.  Returns 0 on success; -1 for
+ * invalid arguments or when the temporary file cannot be created; otherwise
+ * the error of the failing step.  The temporary file is removed before
+ * returning.
+ */
+int
+glusterd_volume_svc_check_topology_identical(
+    char *svc_name, dict_t *mode_dict, glusterd_volinfo_t *volinfo,
+    glusterd_vol_graph_builder_t builder, gf_boolean_t *identical)
+{
+    char orgvol[PATH_MAX] = {
+        0,
+    };
+    char *tmpvol = NULL;
+    xlator_t *this = THIS;
+    int ret = -1;
+    int tmpclean = 0;
+    int tmpfd = -1;
+
+    if ((!identical) || (!this) || (!this->private)) {
+        /* "this" may be the NULL operand here, so it must not be
+         * dereferenced for the log domain; errno holds nothing relevant. */
+        gf_smsg(this ? this->name : "glusterd", GF_LOG_ERROR, EINVAL,
+                GD_MSG_INVALID_ARGUMENT, NULL);
+        goto out;
+    }
+
+    /* This builds volfile for volume level daemons */
+    glusterd_volume_svc_build_volfile_path(svc_name, volinfo, orgvol,
+                                           sizeof(orgvol));
+    /* Create the temporary volfile */
+    ret = gf_asprintf(&tmpvol, "/tmp/g%s-XXXXXX", svc_name);
+    if (ret < 0) {
+        goto out;
+    }
+
+    /* coverity[SECURE_TEMP] mkstemp uses 0600 as the mode and is safe */
+    tmpfd = mkstemp(tmpvol);
+    if (tmpfd < 0) {
+        gf_msg(this->name, GF_LOG_WARNING, errno, GD_MSG_FILE_OP_FAILED,
+               "Unable to create temp file"
+               " %s:(%s)",
+               tmpvol, strerror(errno));
+        ret = -1;
+        goto out;
+    }
+
+    tmpclean = 1; /* SET the flag to unlink() tmpfile */
+
+    ret = builder(volinfo, tmpvol, mode_dict);
+    if (ret)
+        goto out;
+
+    /* Compare the topology of volfiles */
+    ret = glusterd_check_topology_identical(orgvol, tmpvol, identical);
+out:
+    if (tmpfd >= 0)
+        sys_close(tmpfd);
+    if (tmpclean)
+        sys_unlink(tmpvol);
+    if (tmpvol != NULL)
+        GF_FREE(tmpvol);
+    return ret;
+}
+
+/*
+ * Tell whether more services may be attached to the process described by
+ * @svc_proc.
+ *
+ * Returns _gf_true when the process is in GF_SVC_STARTING state, or when it
+ * is in GF_SVC_STARTED / GF_SVC_DISCONNECTED state and the pidfile of the
+ * first service on its list belongs to a running process.  Returns _gf_false
+ * for a NULL @svc_proc, for every other state, and when no service is on
+ * the list.
+ */
+gf_boolean_t
+glusterd_is_svcproc_attachable(glusterd_svc_proc_t *svc_proc)
+{
+    int pid = -1;
+    glusterd_svc_t *parent_svc = NULL;
+
+    if (!svc_proc)
+        return _gf_false;
+
+    if (svc_proc->status == GF_SVC_STARTING)
+        return _gf_true;
+
+    if (svc_proc->status != GF_SVC_STARTED &&
+        svc_proc->status != GF_SVC_DISCONNECTED)
+        return _gf_false;
+
+    /* cds_list_entry() never yields NULL: on an empty list it would hand
+     * back a pointer computed from the list head itself, so emptiness has
+     * to be tested explicitly before looking at the first entry. */
+    if (cds_list_empty(&svc_proc->svcs))
+        return _gf_false;
+
+    parent_svc = cds_list_entry(svc_proc->svcs.next, glusterd_svc_t, mux_svc);
+    if (gf_is_service_running(parent_svc->proc.pidfile, &pid))
+        return _gf_true;
+
+    return _gf_false;
+}
+
+/*
+ * Return the first process on conf->shd_procs that
+ * glusterd_is_svcproc_attachable() accepts, or NULL when there is none.
+ * Only GD_NODE_SHD is handled; any other @daemon value yields NULL.
+ */
+void *
+__gf_find_compatible_svc(gd_node_type daemon)
+{
+    glusterd_svc_proc_t *candidate = NULL;
+    struct cds_list_head *proc_list = NULL;
+    glusterd_conf_t *conf = THIS->private;
+
+    GF_VALIDATE_OR_GOTO("glusterd", conf, out);
+
+    /* Add support for other client daemons here */
+    if (daemon != GD_NODE_SHD)
+        goto out;
+
+    proc_list = &conf->shd_procs;
+
+    /*
+     * Logic to select one process goes here. Currently there is only one
+     * shd_proc. So selecting the first one;
+     */
+    cds_list_for_each_entry(candidate, proc_list, svc_proc_list)
+    {
+        if (glusterd_is_svcproc_attachable(candidate))
+            return (void *)candidate;
+    }
+out:
+    return NULL;
+}
+
+/*
+ * Allocate a zero-filled glusterd_svc_proc_t, initialise both of its list
+ * heads, install glusterd_muxsvc_common_rpc_notify as notify callback and
+ * mark it GF_SVC_STARTING.  Returns NULL when the allocation fails.
+ */
+glusterd_svc_proc_t *
+glusterd_svcprocess_new()
+{
+    glusterd_svc_proc_t *proc = GF_CALLOC(1, sizeof(*proc),
+                                          gf_gld_mt_glusterd_svc_proc_t);
+
+    if (proc) {
+        CDS_INIT_LIST_HEAD(&proc->svc_proc_list);
+        CDS_INIT_LIST_HEAD(&proc->svcs);
+        proc->notify = glusterd_muxsvc_common_rpc_notify;
+        proc->status = GF_SVC_STARTING;
+    }
+
+    return proc;
+}
+
+/*
+ * Initialise the self-heal daemon service @svc of @volinfo and bind it to a
+ * glusterd_svc_proc_t: one found through the pid in the pidfile, else an
+ * attachable one from conf->shd_procs, else a newly allocated one that is
+ * appended to conf->shd_procs.
+ *
+ * The work is done under conf->attach_lock.  When the service is already
+ * initialised and its process is running, nothing is changed.
+ *
+ * Returns 0 on success, -1 (or the error of glusterd_shdsvc_init()) on
+ * failure.
+ *
+ * NOTE(review): svc is validated twice below, and the "if (volinfo)" test
+ * further down cannot fail after the earlier validation.
+ */
+int
+glusterd_shd_svc_mux_init(glusterd_volinfo_t *volinfo, glusterd_svc_t *svc)
+{
+    int ret = -1;
+    glusterd_svc_proc_t *mux_proc = NULL;
+    glusterd_conn_t *mux_conn = NULL;
+    glusterd_conf_t *conf = NULL;
+    glusterd_svc_t *parent_svc = NULL;
+    int pid = -1;
+    gf_boolean_t stop_daemon = _gf_false;
+    char pidfile[PATH_MAX] = {
+        0,
+    };
+
+    GF_VALIDATE_OR_GOTO("glusterd", svc, out);
+    GF_VALIDATE_OR_GOTO("glusterd", volinfo, out);
+    conf = THIS->private;
+    GF_VALIDATE_OR_GOTO("glusterd", conf, out);
+    GF_VALIDATE_OR_GOTO("glusterd", svc, out);
+
+    pthread_mutex_lock(&conf->attach_lock);
+    {
+        if (svc->inited && !glusterd_proc_is_running(&(svc->proc))) {
+            /* This is the case when shd process was abnormally killed */
+            /* The lock is released around the cleanup call and taken again
+             * afterwards. NOTE(review): presumably because the cleanup
+             * acquires attach_lock itself -- confirm. */
+            pthread_mutex_unlock(&conf->attach_lock);
+            glusterd_shd_svcproc_cleanup(&volinfo->shd);
+            pthread_mutex_lock(&conf->attach_lock);
+        }
+
+        if (!svc->inited) {
+            glusterd_svc_build_shd_pidfile(volinfo, pidfile, sizeof(pidfile));
+            ret = snprintf(svc->proc.name, sizeof(svc->proc.name), "%s",
+                           "glustershd");
+            if (ret < 0)
+                goto unlock;
+
+            ret = snprintf(svc->proc.pidfile, sizeof(svc->proc.pidfile), "%s",
+                           pidfile);
+            if (ret < 0)
+                goto unlock;
+
+            if (gf_is_service_running(pidfile, &pid)) {
+                /* Just connect is required, but we don't know what happens
+                 * during the disconnect. So better to reattach.
+                 */
+                mux_proc = __gf_find_compatible_svc_from_pid(GD_NODE_SHD, pid);
+            }
+
+            if (!mux_proc) {
+                if (pid != -1 && sys_access(pidfile, R_OK) == 0) {
+                    /* stale pid file, stop and unlink it. This has to be
+                     * done outside the attach_lock.
+                     */
+                    stop_daemon = _gf_true;
+                }
+                mux_proc = __gf_find_compatible_svc(GD_NODE_SHD);
+            }
+            if (mux_proc) {
+                /* Take first entry from the process */
+                parent_svc = cds_list_entry(mux_proc->svcs.next, glusterd_svc_t,
+                                            mux_svc);
+                mux_conn = &parent_svc->conn;
+                if (volinfo)
+                    volinfo->shd.attached = _gf_true;
+            } else {
+                /* No usable process: create one; mux_conn stays NULL */
+                mux_proc = glusterd_svcprocess_new();
+                if (!mux_proc) {
+                    ret = -1;
+                    goto unlock;
+                }
+                cds_list_add_tail(&mux_proc->svc_proc_list, &conf->shd_procs);
+            }
+            /* Move svc onto the service list of the chosen process */
+            svc->svc_proc = mux_proc;
+            cds_list_del_init(&svc->mux_svc);
+            cds_list_add_tail(&svc->mux_svc, &mux_proc->svcs);
+            ret = glusterd_shdsvc_init(volinfo, mux_conn, mux_proc);
+            if (ret) {
+                /* Unlock by hand: "goto out" skips the unlock label */
+                pthread_mutex_unlock(&conf->attach_lock);
+                gf_msg(THIS->name, GF_LOG_ERROR, 0, GD_MSG_FAILED_INIT_SHDSVC,
+                       "Failed to init shd "
+                       "service");
+                goto out;
+            }
+            gf_msg_debug(THIS->name, 0, "shd service initialized");
+            svc->inited = _gf_true;
+        }
+        ret = 0;
+    }
+unlock:
+    pthread_mutex_unlock(&conf->attach_lock);
+out:
+    /* Deferred stale-pidfile handling, done after the lock was dropped */
+    if (stop_daemon) {
+        glusterd_proc_stop(&svc->proc, SIGTERM, PROC_STOP_FORCE);
+        glusterd_unlink_file(pidfile);
+    }
+    return ret;
+}
+
+/*
+ * Look on conf->shd_procs for the process that owns a service whose pidfile
+ * names the running process @pid and that
+ * glusterd_is_svcproc_attachable() accepts.
+ *
+ * Returns that glusterd_svc_proc_t, or NULL when there is no match, when the
+ * private data is missing, or when @daemon is not GD_NODE_SHD.
+ */
+void *
+__gf_find_compatible_svc_from_pid(gd_node_type daemon, pid_t pid)
+{
+    glusterd_conf_t *priv = THIS->private;
+    glusterd_svc_proc_t *proc = NULL;
+    glusterd_svc_t *member = NULL;
+    pid_t running_pid = -1;
+
+    /* Add support for other client daemons here */
+    if (!priv || daemon != GD_NODE_SHD)
+        return NULL;
+
+    /*TODO
+     * inefficient loop, but at the moment, there is only
+     * one shd.
+     */
+    cds_list_for_each_entry(proc, &priv->shd_procs, svc_proc_list)
+    {
+        cds_list_for_each_entry(member, &proc->svcs, mux_svc)
+        {
+            if (!gf_is_service_running(member->proc.pidfile, &running_pid))
+                continue;
+
+            if (running_pid == pid && glusterd_is_svcproc_attachable(proc))
+                return proc;
+        }
+    }
+    return NULL;
+}
+
+/*
+ * RPC callback that ignores the reply.  It decrements conf->blockers,
+ * broadcasts conf->cond_blockers when the counter reaches zero, and destroys
+ * the call stack of the frame handed in as @v_frame.  Always returns 0;
+ * nothing is done when the frame, its xlator or the private data is missing.
+ */
+static int32_t
+my_callback(struct rpc_req *req, struct iovec *iov, int count, void *v_frame)
+{
+    glusterd_conf_t *conf = NULL;
+    xlator_t *this = NULL;
+    call_frame_t *frame = NULL;
+
+    frame = (call_frame_t *)v_frame;
+    GF_VALIDATE_OR_GOTO("glusterd", frame, out);
+
+    this = frame->this;
+    GF_VALIDATE_OR_GOTO("glusterd", this, out);
+
+    conf = this->private;
+    GF_VALIDATE_OR_GOTO(this->name, conf, out);
+
+    /* Counter hit zero: wake up whoever waits on cond_blockers */
+    if (!GF_ATOMIC_DEC(conf->blockers))
+        synccond_broadcast(&conf->cond_blockers);
+
+    STACK_DESTROY(frame->root);
+out:
+    return 0;
+}
+
+/*
+ * RPC callback for a service attach request.
+ *
+ * frame->cookie carries the glusterd_svc_t the request was sent for and
+ * frame->local a heap-allocated int; both are taken over here and the int is
+ * freed.  The reply in @iov is decoded as a gf_getspec_rsp: op_ret == 0
+ * marks the service online, anything else is logged and, for "glustershd",
+ * followed by glusterd_shd_svcproc_cleanup().
+ *
+ * On the way out conf->blockers is decremented (broadcasting
+ * conf->cond_blockers at zero), the volinfo obtained for a glustershd
+ * service is unreferenced and the call stack is destroyed.
+ * NOTE(review): that unref presumably balances a reference taken when the
+ * request was sent -- confirm against the sender.
+ *
+ * Always returns 0.
+ */
+static int32_t
+glusterd_svc_attach_cbk(struct rpc_req *req, struct iovec *iov, int count,
+                        void *v_frame)
+{
+    call_frame_t *frame = v_frame;
+    glusterd_volinfo_t *volinfo = NULL;
+    glusterd_shdsvc_t *shd = NULL;
+    glusterd_svc_t *svc = NULL;
+    glusterd_conf_t *conf = NULL;
+    int *flag = NULL;
+    /* volinfo is only resolved for glustershd; keep the logs safe otherwise */
+    const char *volname = "(unknown)";
+    xlator_t *this = THIS;
+    int ret = -1;
+    gf_getspec_rsp rsp = {
+        0,
+    };
+
+    GF_VALIDATE_OR_GOTO("glusterd", this, out);
+    conf = this->private;
+    GF_VALIDATE_OR_GOTO("glusterd", conf, out);
+    GF_VALIDATE_OR_GOTO("glusterd", frame, out);
+
+    /* Read the frame only after it has been validated, and detach local and
+     * cookie right away so the frame never keeps a pointer to the flag that
+     * is freed at "out". */
+    svc = frame->cookie;
+    flag = (int *)frame->local;
+    frame->local = NULL;
+    frame->cookie = NULL;
+
+    GF_VALIDATE_OR_GOTO("glusterd", svc, out);
+
+    if (!strcmp(svc->name, "glustershd")) {
+        /* Get volinfo->shd from svc object */
+        shd = cds_list_entry(svc, glusterd_shdsvc_t, svc);
+        if (!shd) {
+            gf_msg("glusterd", GF_LOG_ERROR, 0, GD_MSG_SHD_OBJ_GET_FAIL,
+                   "Failed to get shd object "
+                   "from shd service");
+            goto out;
+        }
+
+        /* Get volinfo from shd */
+        volinfo = cds_list_entry(shd, glusterd_volinfo_t, shd);
+        if (!volinfo) {
+            gf_msg("glusterd", GF_LOG_ERROR, 0, GD_MSG_VOLINFO_GET_FAIL,
+                   "Failed to get volinfo from "
+                   "from shd");
+            goto out;
+        }
+        volname = volinfo->volname;
+    }
+
+    if (!iov) {
+        gf_msg(frame->this->name, GF_LOG_ERROR, 0, GD_MSG_REQ_DECODE_FAIL,
+               "iov is NULL");
+        ret = -1;
+        goto out;
+    }
+
+    ret = xdr_to_generic(*iov, &rsp, (xdrproc_t)xdr_gf_getspec_rsp);
+    if (ret < 0) {
+        gf_msg(frame->this->name, GF_LOG_ERROR, 0, GD_MSG_REQ_DECODE_FAIL,
+               "XDR decoding error");
+        ret = -1;
+        goto out;
+    }
+
+    if (rsp.op_ret == 0) {
+        svc->online = _gf_true;
+        gf_msg(this->name, GF_LOG_INFO, 0, GD_MSG_SVC_ATTACH_FAIL,
+               "svc %s of volume %s attached successfully to pid %d", svc->name,
+               volname, glusterd_proc_get_pid(&svc->proc));
+    } else {
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_SVC_ATTACH_FAIL,
+               "svc %s of volume %s failed to attach to pid %d", svc->name,
+               volname, glusterd_proc_get_pid(&svc->proc));
+        if (!strcmp(svc->name, "glustershd")) {
+            glusterd_shd_svcproc_cleanup(&volinfo->shd);
+        }
+    }
+out:
+    if (flag) {
+        GF_FREE(flag);
+    }
+
+    if (volinfo)
+        glusterd_volinfo_unref(volinfo);
+
+    /* conf and frame are NULL when their validation above failed */
+    if (conf && GF_ATOMIC_DEC(conf->blockers) == 0) {
+        synccond_broadcast(&conf->cond_blockers);
+    }
+    if (frame)
+        STACK_DESTROY(frame->root);
+    return 0;
+}
+
+/*
+ * Declared locally instead of through a header; the definition is not part
+ * of this file.  NOTE(review): the prototype has to be kept in sync with
+ * that definition by hand -- consider moving it into a shared header.
+ */
+extern size_t
+build_volfile_path(char *volume_id, char *path, size_t path_len,
+                   char *trusted_str, dict_t *dict);
+
+int
+__glusterd_send_svc_configure_req(glusterd_svc_t *svc, int flags,
+                                  struct rpc_clnt *rpc, char *volfile_id,
+                                  int op)
+{
+    int ret = -1;
+    struct iobuf *iobuf = NULL;
+    struct iobref *iobref = NULL;
+    struct iovec iov = {
+        0,
+    };
+    char path[PATH_MAX] = {
+        '\0',
+    };
+    struct stat stbuf = {
+        0,
+    };
+    int32_t spec_fd = -1;
+    size_t file_len = -1;
+    char *volfile_content = NULL;
+    ssize_t req_size = 0;
+    call_frame_t *frame = NULL;
+    gd1_mgmt_brick_op_req brick_req;
+    dict_t *dict = NULL;
+    void *req = &brick_req;
+    void *errlbl = &&err;
+    struct rpc_clnt_connection *conn;
+    xlator_t *this = THIS;
+    glusterd_conf_t *conf = THIS->private;
+    extern struct rpc_clnt_program gd_brick_prog;
+    fop_cbk_fn_t cbkfn = my_callback;
+
+    if (!rpc) {
+        gf_msg("glusterd", GF_LOG_ERROR, 0, GD_MSG_PARAM_NULL,
+               "called with null rpc");
+        return -1;
+    }
+
+    conn = &rpc->conn;
+    if (!conn->connected || conn->disconnected) {
+        gf_msg(this->name, GF_LOG_INFO, 0, GD_MSG_CONNECT_RETURNED,
+               "not connected yet");
+        return -1;
+    }
+
+    brick_req.op = op;
+    brick_req.name = volfile_id;
+    brick_req.input.input_val = NULL;
+    brick_req.input.input_len = 0;
+    brick_req.dict.dict_val = NULL;
+    brick_req.dict.dict_len = 0;
+
+    frame = create_frame(this, this->ctx->pool);
+    if (!frame) {
+        gf_smsg(this->name, GF_LOG_ERROR, errno, GD_MSG_FRAME_CREATE_FAIL,
+                NULL);
+        goto *errlbl;
+    }
+
+    if (op == GLUSTERD_SVC_ATTACH) {
+        dict = dict_new();
+        if (!dict) {
+            gf_smsg(this->name, GF_LOG_ERROR, errno, GD_MSG_DICT_CREATE_FAIL,
+                    NULL);
+            ret = -ENOMEM;
+            goto *errlbl;
+        }
+
+        (void)build_volfile_path(volfile_id, path, sizeof(path), NULL, dict);
+
+        ret = sys_stat(path, &stbuf);
+        if (ret < 0) {
+            gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_SVC_ATTACH_FAIL,
+                   "Unable to stat %s (%s)", path, strerror(errno));
+            ret = -EINVAL;
+            goto *errlbl;
+        }
+
+        file_len = stbuf.st_size;
+        volfile_content = GF_MALLOC(file_len + 1, gf_common_mt_char);
+        if (!volfile_content) {
+            gf_smsg(this->name, GF_LOG_ERROR, errno, GD_MSG_NO_MEMORY, NULL);
+            ret = -ENOMEM;
+            goto *errlbl;
+        }
+        spec_fd = open(path, O_RDONLY);
+        if (spec_fd < 0) {
+            gf_msg(THIS->name, GF_LOG_WARNING, 0, GD_MSG_SVC_ATTACH_FAIL,
+                   "failed to read volfile %s", path);
+            ret = -EIO;
+            goto *errlbl;
+        }
+        ret = sys_read(spec_fd, volfile_content, file_len);
+        if (ret == file_len) {
+            brick_req.input.input_val = volfile_content;
+            brick_req.input.input_len = file_len;
+        } else {
+            gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_SVC_ATTACH_FAIL,
+                   "read failed on path %s. File size=%" GF_PRI_SIZET
+                   "read size=%d",
+                   path, file_len, ret);
+            ret = -EIO;
+            goto *errlbl;
+        }
+        if (dict->count > 0) {
+            ret = dict_allocate_and_serialize(dict, &brick_req.dict.dict_val,
+                                              &brick_req.dict.dict_len);
+            if (ret) {
+                gf_smsg(this->name, GF_LOG_ERROR, errno,
+                        GD_MSG_DICT_ALLOC_AND_SERL_LENGTH_GET_FAIL, NULL);
+                goto *errlbl;
+            }
+        }
+
+        frame->cookie = svc;
+        frame->local = GF_CALLOC(1, sizeof(int), gf_gld_mt_int);
+        *((int *)frame->local) = flags;
+        cbkfn = glusterd_svc_attach_cbk;
+    }
+
+    req_size = xdr_sizeof((xdrproc_t)xdr_gd1_mgmt_brick_op_req, req);
+    iobuf = iobuf_get2(rpc->ctx->iobuf_pool, req_size);
+    if (!iobuf) {
+        goto *errlbl;
+    }
+    errlbl = &&maybe_free_iobuf;
+
+    iov.iov_base = iobuf->ptr;
+    iov.iov_len = iobuf_pagesize(iobuf);
+
+    iobref = iobref_new();
+    if (!iobref) {
+        goto *errlbl;
+    }
+    errlbl = &&free_iobref;
+
+    iobref_add(iobref, iobuf);
+    /*
+     * Drop our reference to the iobuf.  The iobref should already have
+     * one after iobref_add, so when we unref that we'll free the iobuf as
+     * well.  This allows us to pass just the iobref as frame->local.
+     */
+    iobuf_unref(iobuf);
+    /* Set the pointer to null so we don't free it on a later error. */
+    iobuf = NULL;
+
+    /* Create the xdr payload */
+    ret = xdr_serialize_generic(iov, req, (xdrproc_t)xdr_gd1_mgmt_brick_op_req);
+    if (ret == -1) {
+        goto *errlbl;
+    }
+    iov.iov_len = ret;
+
+    /* Send the msg */
+    GF_ATOMIC_INC(conf->blockers);
+    ret = rpc_clnt_submit(rpc, &gd_brick_prog, op, cbkfn, &iov, 1, NULL, 0,
+                          iobref, frame, NULL, 0, NULL, 0, NULL);
+    if (dict)
+        dict_unref(dict);
+    GF_FREE(volfile_content);
+    if (spec_fd >= 0)
+        sys_close(spec_fd);
+    return ret;
+
+free_iobref:
+    iobref_unref(iobref);
+maybe_free_iobuf:
+    if (iobuf) {
+        iobuf_unref(iobuf);
+    }
+err:
+    if (dict)
+        dict_unref(dict);
+    if (brick_req.dict.dict_val)
+        GF_FREE(brick_req.dict.dict_val);
+
+    GF_FREE(volfile_content);
+    if (spec_fd >= 0)
+        sys_close(spec_fd);
+    if (frame)
+        STACK_DESTROY(frame->root);
+    return -1;
+}
+
+/*
+ * Ask the already-running daemon process hosting @svc to attach the service
+ * identified by svc->proc.volfileid (GLUSTERD_SVC_ATTACH request).
+ *
+ * @svc:     service to attach; its conn.rpc is referenced for the duration
+ *           of the call
+ * @volinfo: volume the service belongs to; shd.attached is set on success
+ * @flags:   forwarded unchanged to __glusterd_send_svc_configure_req()
+ *
+ * The request is attempted up to 15 times.  After every unsuccessful attempt
+ * conf->big_lock is released for a one second synctask sleep and re-taken.
+ *
+ * Returns 0 when the request was submitted or when the volume no longer
+ * exists; -1 on a NULL argument or when all attempts failed.
+ */
+int
+glusterd_attach_svc(glusterd_svc_t *svc, glusterd_volinfo_t *volinfo, int flags)
+{
+    glusterd_conf_t *conf = THIS->private;
+    int ret = -1;
+    int tries;
+    rpc_clnt_t *rpc = NULL;
+
+    GF_VALIDATE_OR_GOTO("glusterd", conf, out);
+    GF_VALIDATE_OR_GOTO("glusterd", svc, out);
+    GF_VALIDATE_OR_GOTO("glusterd", volinfo, out);
+
+    gf_msg("glusterd", GF_LOG_INFO, 0, GD_MSG_ATTACH_INFO,
+           "adding svc %s (volume=%s) to existing "
+           "process with pid %d",
+           svc->name, volinfo->volname, glusterd_proc_get_pid(&svc->proc));
+
+    /* The rpc handle is sampled once here; if it is NULL the loop below only
+     * sleeps and retries without re-reading svc->conn.rpc. */
+    rpc = rpc_clnt_ref(svc->conn.rpc);
+    for (tries = 15; tries > 0; --tries) {
+        /* The volume for which we are attaching a shd svc may have become
+         * stale and be in the process of deletion.  The volinfo object was
+         * handed to us before that sequence could start, so we might be
+         * operating on a stale volume.  Re-check the existence of the volume
+         * after every sync task switch.
+         */
+        if (!glusterd_volume_exists(volinfo->volname)) {
+            gf_msg(THIS->name, GF_LOG_INFO, 0, GD_MSG_SVC_ATTACH_FAIL,
+                   "Volume %s "
+                   " is marked as stale, not attempting further shd svc attach "
+                   "attempts",
+                   volinfo->volname);
+            ret = 0;
+            goto out;
+        }
+        if (rpc) {
+            /* Attach requests are sent under conf->attach_lock. */
+            pthread_mutex_lock(&conf->attach_lock);
+            {
+                ret = __glusterd_send_svc_configure_req(
+                    svc, flags, rpc, svc->proc.volfileid, GLUSTERD_SVC_ATTACH);
+            }
+            pthread_mutex_unlock(&conf->attach_lock);
+            if (!ret) {
+                volinfo->shd.attached = _gf_true;
+                goto out;
+            }
+        }
+        /*
+         * It might not actually be safe to manipulate the lock
+         * like this, but if we don't then the connection can
+         * never actually complete and retries are useless.
+         * Unfortunately, all of the alternatives (e.g. doing
+         * all of this in a separate thread) are much more
+         * complicated and risky.
+         * TBD: see if there's a better way
+         */
+        synclock_unlock(&conf->big_lock);
+        synctask_sleep(1);
+        synclock_lock(&conf->big_lock);
+    }
+    /* Every attempt failed (or there was never an rpc handle to use). */
+    ret = -1;
+    gf_msg("glusterd", GF_LOG_WARNING, 0, GD_MSG_SVC_ATTACH_FAIL,
+           "attach failed for %s(volume=%s)", svc->name, volinfo->volname);
+out:
+    /* rpc is still NULL if validation failed before the reference was taken. */
+    if (rpc)
+        rpc_clnt_unref(rpc);
+    return ret;
+}
+
+/*
+ * Ask the daemon process hosting @svc to detach the service identified by
+ * svc->proc.volfileid (GLUSTERD_SVC_DETACH request).
+ *
+ * @svc:     service to detach; its conn.rpc is referenced for the duration
+ *           of the call
+ * @volinfo: volume the service belongs to (used for logging only)
+ * @sig:     unused; detach requests carry neither flags nor a signal
+ *
+ * The request is attempted up to 15 times.  After every unsuccessful attempt
+ * conf->big_lock is released for a one second synctask sleep and re-taken.
+ *
+ * Returns 0 when the request was submitted, -1 on a NULL argument or when
+ * all attempts failed.
+ */
+int
+glusterd_detach_svc(glusterd_svc_t *svc, glusterd_volinfo_t *volinfo, int sig)
+{
+    glusterd_conf_t *conf = THIS->private;
+    int ret = -1;
+    int tries;
+    rpc_clnt_t *rpc = NULL;
+
+    GF_VALIDATE_OR_GOTO(THIS->name, conf, out);
+    GF_VALIDATE_OR_GOTO(THIS->name, svc, out);
+    GF_VALIDATE_OR_GOTO(THIS->name, volinfo, out);
+
+    gf_msg(THIS->name, GF_LOG_INFO, 0, GD_MSG_DETACH_INFO,
+           "removing svc %s (volume=%s) from existing "
+           "process with pid %d",
+           svc->name, volinfo->volname, glusterd_proc_get_pid(&svc->proc));
+
+    rpc = rpc_clnt_ref(svc->conn.rpc);
+    for (tries = 15; tries > 0; --tries) {
+        if (rpc) {
+            /* For detach there are no flags, and sig is not used. */
+            pthread_mutex_lock(&conf->attach_lock);
+            {
+                /* Send through the handle we hold a reference on (the one
+                 * that was just NULL-checked), exactly as
+                 * glusterd_attach_svc() does, rather than re-reading
+                 * svc->conn.rpc which is not pinned by this function. */
+                ret = __glusterd_send_svc_configure_req(svc, 0, rpc,
+                                                        svc->proc.volfileid,
+                                                        GLUSTERD_SVC_DETACH);
+            }
+            pthread_mutex_unlock(&conf->attach_lock);
+            if (!ret) {
+                goto out;
+            }
+        }
+        /*
+         * It might not actually be safe to manipulate the lock
+         * like this, but if we don't then the connection can
+         * never actually complete and retries are useless.
+         * Unfortunately, all of the alternatives (e.g. doing
+         * all of this in a separate thread) are much more
+         * complicated and risky.
+         * TBD: see if there's a better way
+         */
+        synclock_unlock(&conf->big_lock);
+        synctask_sleep(1);
+        synclock_lock(&conf->big_lock);
+    }
+    ret = -1;
+    gf_msg("glusterd", GF_LOG_WARNING, 0, GD_MSG_SVC_DETACH_FAIL,
+           "detach failed for %s(volume=%s)", svc->name, volinfo->volname);
+out:
+    if (rpc)
+        rpc_clnt_unref(rpc);
+    return ret;
+}
diff --git a/xlators/mgmt/glusterd/src/glusterd-svc-helper.h b/xlators/mgmt/glusterd/src/glusterd-svc-helper.h
new file mode 100644
index 00000000000..12717dc58ac
--- /dev/null
+++ b/xlators/mgmt/glusterd/src/glusterd-svc-helper.h
@@ -0,0 +1,72 @@
+/*
+   Copyright (c) 2014 Red Hat, Inc. <http://www.redhat.com>
+   This file is part of GlusterFS.
+
+   This file is licensed to you under your choice of the GNU Lesser
+   General Public License, version 3 or any later version (LGPLv3 or
+   later), or the GNU General Public License, version 2 (GPLv2), in all
+   cases as published by the Free Software Foundation.
+*/
+
+#ifndef _GLUSTERD_SVC_HELPER_H_
+#define _GLUSTERD_SVC_HELPER_H_
+
+#include "glusterd.h"
+#include "glusterd-svc-mgmt.h"
+#include "glusterd-volgen.h"
+
+/*
+ * Helpers shared by the glusterd service (daemon) management code.
+ * NOTE(review): the glusterd_svcs_* and *_check_*_identical helpers are
+ * presumably implemented in glusterd-svc-helper.c -- confirm.
+ */
+int
+glusterd_svcs_reconfigure(glusterd_volinfo_t *volinfo);
+
+int
+glusterd_svcs_stop(glusterd_volinfo_t *vol);
+
+int
+glusterd_svcs_manager(glusterd_volinfo_t *volinfo);
+
+/* The *_identical helpers report their verdict through @identical. */
+int
+glusterd_svc_check_volfile_identical(char *svc_name,
+                                     glusterd_graph_builder_t builder,
+                                     gf_boolean_t *identical);
+int
+glusterd_svc_check_topology_identical(char *svc_name,
+                                      glusterd_graph_builder_t builder,
+                                      gf_boolean_t *identical);
+int
+glusterd_volume_svc_check_volfile_identical(char *svc_name, dict_t *mode_dict,
+                                            glusterd_volinfo_t *volinfo,
+                                            glusterd_vol_graph_builder_t,
+                                            gf_boolean_t *identical);
+int
+glusterd_volume_svc_check_topology_identical(char *svc_name, dict_t *mode_dict,
+                                             glusterd_volinfo_t *volinfo,
+                                             glusterd_vol_graph_builder_t,
+                                             gf_boolean_t *identical);
+/* Fill @volfile (which must be PATH_MAX bytes, @len == PATH_MAX) with the
+ * per-volume volfile path of service @server; defined in
+ * glusterd-svc-mgmt.c, where only "glustershd" is handled. */
+void
+glusterd_volume_svc_build_volfile_path(char *server, glusterd_volinfo_t *vol,
+                                       char *volfile, size_t len);
+void *
+__gf_find_compatible_svc(gd_node_type daemon);
+
+glusterd_svc_proc_t *
+glusterd_svcprocess_new();
+
+int
+glusterd_shd_svc_mux_init(glusterd_volinfo_t *volinfo, glusterd_svc_t *svc);
+
+void *
+__gf_find_compatible_svc_from_pid(gd_node_type daemon, pid_t pid);
+
+/* Send a GLUSTERD_SVC_ATTACH request for @svc to its running process,
+ * retrying up to 15 times; @flags is forwarded with the request. */
+int
+glusterd_attach_svc(glusterd_svc_t *svc, glusterd_volinfo_t *volinfo,
+                    int flags);
+
+/* Send a GLUSTERD_SVC_DETACH request for @svc to its running process,
+ * retrying up to 15 times; @sig is currently unused. */
+int
+glusterd_detach_svc(glusterd_svc_t *svc, glusterd_volinfo_t *volinfo, int sig);
+
+/* Build and submit one attach/detach brick-op request over @rpc.  @op is
+ * GLUSTERD_SVC_ATTACH or GLUSTERD_SVC_DETACH.  Callers in this code base
+ * invoke it with conf->attach_lock held. */
+int
+__glusterd_send_svc_configure_req(glusterd_svc_t *svc, int flag,
+                                  struct rpc_clnt *rpc, char *volfile_id,
+                                  int op);
+
+#endif
diff --git a/xlators/mgmt/glusterd/src/glusterd-svc-mgmt.c b/xlators/mgmt/glusterd/src/glusterd-svc-mgmt.c
new file mode 100644
index 00000000000..18b3fb13630
--- /dev/null
+++ b/xlators/mgmt/glusterd/src/glusterd-svc-mgmt.c
@@ -0,0 +1,536 @@
+/*
+   Copyright (c) 2014 Red Hat, Inc. <http://www.redhat.com>
+   This file is part of GlusterFS.
+
+   This file is licensed to you under your choice of the GNU Lesser
+   General Public License, version 3 or any later version (LGPLv3 or
+   later), or the GNU General Public License, version 2 (GPLv2), in all
+   cases as published by the Free Software Foundation.
+*/
+
+#include <glusterfs/globals.h>
+#include <glusterfs/run.h>
+#include "glusterd.h"
+#include <glusterfs/glusterfs.h>
+#include "glusterd-utils.h"
+#include "glusterd-svc-mgmt.h"
+#include "glusterd-proc-mgmt.h"
+#include "glusterd-conn-mgmt.h"
+#include "glusterd-messages.h"
+#include <glusterfs/syscall.h>
+#include "glusterd-shd-svc-helper.h"
+
+/*
+ * Create the run directory @rundir with mode 0755 through mkdir_p().
+ * A failure whose errno is not EEXIST is logged.  The mkdir_p() result is
+ * returned unchanged in every case.
+ */
+int
+glusterd_svc_create_rundir(char *rundir)
+{
+    int rc = mkdir_p(rundir, 0755, _gf_true);
+
+    /* Success, or "already exists": nothing to report. */
+    if (rc != -1 || errno == EEXIST)
+        return rc;
+
+    gf_msg(THIS->name, GF_LOG_ERROR, errno, GD_MSG_CREATE_DIR_FAILED,
+           "Unable to create rundir %s", rundir);
+    return rc;
+}
+
+/*
+ * Write "<logdir>/<server>.log" into @logfile, truncating to @len bytes
+ * (including the terminating NUL).
+ */
+void
+glusterd_svc_build_logfile_path(char *server, char *logdir, char *logfile,
+                                size_t len)
+{
+    (void)snprintf(logfile, len, "%s/%s.log", logdir, server);
+}
+
+/*
+ * Write the volfile id "gluster/<server>" into @volfileid, truncating to
+ * @len bytes (including the terminating NUL).
+ */
+void
+glusterd_svc_build_volfileid_path(char *server, char *volfileid, size_t len)
+{
+    (void)snprintf(volfileid, len, "gluster/%s", server);
+}
+
+/*
+ * Common initialisation of a service object.
+ *
+ * Copies @svc_name into svc->name, creates @rundir, initialises the
+ * connection management (unix socket path derived from @rundir and MY_UUID,
+ * frame timeout 600) and the process management (pidfile, volfile, logfile,
+ * volfile id and volfile server) of @svc.
+ *
+ * @notify may be NULL, in which case glusterd_svc_common_rpc_notify is used.
+ * The volfile server is the "transport.socket.bind-address" option when set,
+ * "localhost" otherwise.
+ *
+ * Returns 0 on success, non-zero on failure.
+ */
+static int
+glusterd_svc_init_common(glusterd_svc_t *svc, char *svc_name, char *workdir,
+                         char *rundir, char *logdir,
+                         glusterd_conn_notify_t notify)
+{
+    xlator_t *this = THIS;
+    glusterd_conf_t *priv = NULL;
+    char *volfileserver = NULL;
+    char volfileid[256] = {0};
+    char sockfpath[PATH_MAX] = {0};
+    char volfile[PATH_MAX] = {0};
+    char logfile[PATH_MAX] = {0};
+    char pidfile[PATH_MAX] = {0};
+    int ret = -1;
+
+    GF_ASSERT(this);
+    priv = this->private;
+    GF_ASSERT(priv);
+
+    ret = snprintf(svc->name, sizeof(svc->name), "%s", svc_name);
+    if (ret < 0)
+        goto out;
+
+    /* Best effort: the result of the rundir creation is not checked. */
+    glusterd_svc_create_rundir(rundir);
+
+    /* Connection management */
+    glusterd_conn_build_socket_filepath(rundir, MY_UUID, sockfpath,
+                                        sizeof(sockfpath));
+    ret = glusterd_conn_init(&svc->conn, sockfpath, 600,
+                             notify ? notify : glusterd_svc_common_rpc_notify);
+    if (ret)
+        goto out;
+
+    /* Process management: note the pidfile lives under priv->rundir. */
+    glusterd_svc_build_pidfile_path(svc_name, priv->rundir, pidfile,
+                                    sizeof(pidfile));
+    glusterd_svc_build_volfile_path(svc_name, workdir, volfile,
+                                    sizeof(volfile));
+    glusterd_svc_build_logfile_path(svc_name, logdir, logfile, sizeof(logfile));
+    glusterd_svc_build_volfileid_path(svc_name, volfileid, sizeof(volfileid));
+
+    if (dict_get_strn(this->options, "transport.socket.bind-address",
+                      SLEN("transport.socket.bind-address"),
+                      &volfileserver) != 0)
+        volfileserver = "localhost";
+
+    ret = glusterd_proc_init(&svc->proc, svc_name, pidfile, logdir, logfile,
+                             volfile, volfileid, volfileserver);
+
+out:
+    gf_msg_debug(this->name, 0, "Returning %d", ret);
+    return ret;
+}
+
+/*
+ * dict_foreach() callback: append the value of one @cmdline entry as an
+ * argument to the runner_t passed through @data.  Always returns 0.
+ */
+static int
+svc_add_args(dict_t *cmdline, char *arg, data_t *value, void *data)
+{
+    runner_add_arg((runner_t *)data, value->data);
+
+    return 0;
+}
+
+/*
+ * Initialise @svc as service @svc_name using the working, run and log
+ * directories of the glusterd configuration and the default RPC notify
+ * handler.  Returns the result of glusterd_svc_init_common().
+ */
+int
+glusterd_svc_init(glusterd_svc_t *svc, char *svc_name)
+{
+    xlator_t *this = THIS;
+    glusterd_conf_t *priv = NULL;
+    char rundir[PATH_MAX] = {0};
+
+    GF_ASSERT(this);
+    priv = this->private;
+    GF_ASSERT(priv);
+
+    glusterd_svc_build_rundir(svc_name, priv->rundir, rundir, sizeof(rundir));
+
+    /* NULL notify selects glusterd_svc_common_rpc_notify. */
+    return glusterd_svc_init_common(svc, svc_name, priv->workdir, rundir,
+                                    priv->logdir, NULL);
+}
+
+/*
+ * Start the glusterfs daemon process for @svc unless it is already running.
+ *
+ * @svc:     service whose proc/conn settings supply the command line
+ * @flags:   PROC_START_NO_WAIT uses runner_run_nowait(); any other value
+ *           releases priv->big_lock around a runner_run() call
+ * @cmdline: optional dict; each value is appended as an extra argument
+ *
+ * The whole operation is serialised by priv->attach_lock.
+ *
+ * Returns 0 if the process is already running or the runner call succeeded,
+ * non-zero otherwise (missing volfile, over-long valgrind log path, runner
+ * failure, NULL priv/svc).
+ */
+int
+glusterd_svc_start(glusterd_svc_t *svc, int flags, dict_t *cmdline)
+{
+    int ret = -1;
+    runner_t runner = {
+        0,
+    };
+    glusterd_conf_t *priv = NULL;
+    xlator_t *this = NULL;
+    char valgrind_logfile[PATH_MAX] = {0};
+    char *localtime_logging = NULL;
+    char *log_level = NULL;
+    char daemon_log_level[30] = {0};
+    char msg[1024] = {
+        0,
+    };
+    int32_t len = 0;
+
+    this = THIS;
+    GF_ASSERT(this);
+
+    priv = this->private;
+    GF_VALIDATE_OR_GOTO("glusterd", priv, out);
+    GF_VALIDATE_OR_GOTO("glusterd", svc, out);
+
+    pthread_mutex_lock(&priv->attach_lock);
+    {
+        if (glusterd_proc_is_running(&(svc->proc))) {
+            ret = 0;
+            goto unlock;
+        }
+
+        ret = sys_access(svc->proc.volfile, F_OK);
+        if (ret) {
+            gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_VOLFILE_NOT_FOUND,
+                   "Volfile %s is not present", svc->proc.volfile);
+            goto unlock;
+        }
+
+        runinit(&runner);
+
+        if (this->ctx->cmd_args.vgtool != _gf_none) {
+            len = snprintf(valgrind_logfile, sizeof(valgrind_logfile),
+                           "%s/valgrind-%s.log", svc->proc.logdir, svc->name);
+            if ((len < 0) || (len >= (int32_t)sizeof(valgrind_logfile))) {
+                /* The runner was already initialised above and will never
+                 * be run on this path: release it here so that its
+                 * argument vector is not leaked. */
+                runner_end(&runner);
+                ret = -1;
+                goto unlock;
+            }
+
+            if (this->ctx->cmd_args.vgtool == _gf_memcheck)
+                runner_add_args(&runner, "valgrind", "--leak-check=full",
+                                "--trace-children=yes", "--track-origins=yes",
+                                NULL);
+            else
+                runner_add_args(&runner, "valgrind", "--tool=drd", NULL);
+
+            runner_argprintf(&runner, "--log-file=%s", valgrind_logfile);
+        }
+
+        runner_add_args(&runner, SBIN_DIR "/glusterfs", "-s",
+                        svc->proc.volfileserver, "--volfile-id",
+                        svc->proc.volfileid, "-p", svc->proc.pidfile, "-l",
+                        svc->proc.logfile, "-S", svc->conn.sockpath, NULL);
+
+        if (dict_get_strn(priv->opts, GLUSTERD_LOCALTIME_LOGGING_KEY,
+                          SLEN(GLUSTERD_LOCALTIME_LOGGING_KEY),
+                          &localtime_logging) == 0) {
+            if (strcmp(localtime_logging, "enable") == 0)
+                runner_add_arg(&runner, "--localtime-logging");
+        }
+        if (dict_get_strn(priv->opts, GLUSTERD_DAEMON_LOG_LEVEL_KEY,
+                          SLEN(GLUSTERD_DAEMON_LOG_LEVEL_KEY),
+                          &log_level) == 0) {
+            /* Size taken from the buffer itself instead of a literal. */
+            snprintf(daemon_log_level, sizeof(daemon_log_level),
+                     "--log-level=%s", log_level);
+            runner_add_arg(&runner, daemon_log_level);
+        }
+
+        if (this->ctx->cmd_args.global_threading) {
+            runner_add_arg(&runner, "--global-threading");
+        }
+
+        if (cmdline)
+            dict_foreach(cmdline, svc_add_args, (void *)&runner);
+
+        snprintf(msg, sizeof(msg), "Starting %s service", svc->name);
+        runner_log(&runner, this->name, GF_LOG_DEBUG, msg);
+
+        if (flags == PROC_START_NO_WAIT) {
+            ret = runner_run_nowait(&runner);
+        } else {
+            /* big_lock is dropped while runner_run() executes; the
+             * attach_lock mutex stays held. */
+            synclock_unlock(&priv->big_lock);
+            {
+                ret = runner_run(&runner);
+            }
+            synclock_lock(&priv->big_lock);
+        }
+    }
+unlock:
+    pthread_mutex_unlock(&priv->attach_lock);
+out:
+    gf_msg_debug(this->name, 0, "Returning %d", ret);
+
+    return ret;
+}
+
+/*
+ * Stop the process of @svc with signal @sig (PROC_STOP_FORCE).  When the
+ * stop succeeds the connection is disconnected, the service is marked
+ * offline and its socket file is unlinked.
+ *
+ * Returns the result of glusterd_proc_stop().
+ */
+int
+glusterd_svc_stop(glusterd_svc_t *svc, int sig)
+{
+    int ret = glusterd_proc_stop(&svc->proc, sig, PROC_STOP_FORCE);
+
+    if (ret == 0) {
+        glusterd_conn_disconnect(&svc->conn);
+
+        svc->online = _gf_false;
+        (void)glusterd_unlink_file((char *)svc->conn.sockpath);
+
+        gf_msg(THIS->name, GF_LOG_INFO, 0, GD_MSG_SVC_STOP_SUCCESS,
+               "%s service is stopped", svc->name);
+    }
+
+    gf_msg_debug(THIS->name, 0, "Returning %d", ret);
+
+    return ret;
+}
+
+/*
+ * Write "<rundir of server>/<server>.pid" into @path, where the rundir is
+ * what glusterd_svc_build_rundir() derives from @workdir.  @path must be a
+ * PATH_MAX sized buffer (@len == PATH_MAX is asserted).
+ */
+void
+glusterd_svc_build_pidfile_path(char *server, char *workdir, char *path,
+                                size_t len)
+{
+    char rundir[PATH_MAX] = {0};
+
+    GF_ASSERT(len == PATH_MAX);
+
+    glusterd_svc_build_rundir(server, workdir, rundir, sizeof(rundir));
+
+    (void)snprintf(path, len, "%s/%s.pid", rundir, server);
+}
+
+/*
+ * Write the volfile path of service @server into @volfile:
+ * "<svcdir>/<server>.vol" for "quotad", "<svcdir>/<server>-server.vol" for
+ * every other service.  @volfile must be a PATH_MAX sized buffer
+ * (@len == PATH_MAX is asserted).
+ */
+void
+glusterd_svc_build_volfile_path(char *server, char *workdir, char *volfile,
+                                size_t len)
+{
+    char svcdir[PATH_MAX] = {0};
+
+    GF_ASSERT(len == PATH_MAX);
+
+    glusterd_svc_build_svcdir(server, workdir, svcdir, sizeof(svcdir));
+
+    if (strcmp(server, "quotad") != 0) {
+        (void)snprintf(volfile, len, "%s/%s-server.vol", svcdir, server);
+        return;
+    }
+
+    /* quotad uses a different volfile name */
+    (void)snprintf(volfile, len, "%s/%s.vol", svcdir, server);
+}
+
+/*
+ * Write the service directory "<workdir>/<server>" into @path.  @path must
+ * be a PATH_MAX sized buffer (@len == PATH_MAX is asserted).
+ */
+void
+glusterd_svc_build_svcdir(char *server, char *workdir, char *path, size_t len)
+{
+    GF_ASSERT(len == PATH_MAX);
+
+    (void)snprintf(path, len, "%s/%s", workdir, server);
+}
+
+/*
+ * Write the run directory of service @server into @path.  It is the same
+ * string glusterd_svc_build_svcdir() produces for (@server, @workdir).
+ * @path must be a PATH_MAX sized buffer (@len == PATH_MAX is asserted).
+ */
+void
+glusterd_svc_build_rundir(char *server, char *workdir, char *path, size_t len)
+{
+    char svcdir[PATH_MAX] = {0};
+
+    GF_ASSERT(len == PATH_MAX);
+
+    glusterd_svc_build_svcdir(server, workdir, svcdir, sizeof(svcdir));
+
+    (void)snprintf(path, len, "%s", svcdir);
+}
+
+/*
+ * Regenerate a volfile through @create_volfile and, if that succeeds, call
+ * glusterd_fetchspec_notify() for the current xlator.
+ *
+ * Returns the non-zero result of @create_volfile on failure, otherwise the
+ * result of glusterd_fetchspec_notify().
+ */
+int
+glusterd_svc_reconfigure(int (*create_volfile)())
+{
+    int rc = create_volfile();
+
+    if (rc != 0)
+        return rc;
+
+    return glusterd_fetchspec_notify(THIS);
+}
+
+/*
+ * Default RPC notification handler for a (non-multiplexed) service.
+ *
+ * @conn must be the conn member embedded in a glusterd_svc_t.  On
+ * RPC_CLNT_CONNECT the service is marked online and an EVENT_SVC_CONNECTED
+ * event is emitted; on RPC_CLNT_DISCONNECT an online service is marked
+ * offline, logged and an EVENT_SVC_DISCONNECTED event is emitted.  Other
+ * events are only traced.
+ *
+ * Returns -1 if the owning service cannot be derived, 0 otherwise.
+ */
+int
+glusterd_svc_common_rpc_notify(glusterd_conn_t *conn, rpc_clnt_event_t event)
+{
+    xlator_t *this = THIS;
+    glusterd_svc_t *svc = NULL;
+
+    GF_ASSERT(this);
+
+    /* Get the parent object, i.e. the svc, using the list_entry macro */
+    svc = cds_list_entry(conn, glusterd_svc_t, conn);
+    if (!svc) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_SVC_GET_FAIL,
+               "Failed to get the service");
+        return -1;
+    }
+
+    if (event == RPC_CLNT_CONNECT) {
+        gf_msg_debug(this->name, 0, "%s has connected with glusterd.",
+                     svc->name);
+        gf_event(EVENT_SVC_CONNECTED, "svc_name=%s", svc->name);
+        svc->online = _gf_true;
+    } else if (event == RPC_CLNT_DISCONNECT) {
+        /* Report the transition only once per online -> offline change. */
+        if (svc->online) {
+            gf_msg(this->name, GF_LOG_INFO, 0, GD_MSG_NODE_DISCONNECTED,
+                   "%s has disconnected from glusterd.", svc->name);
+            gf_event(EVENT_SVC_DISCONNECTED, "svc_name=%s", svc->name);
+            svc->online = _gf_false;
+        }
+    } else {
+        gf_msg_trace(this->name, 0, "got some other RPC event %d", event);
+    }
+
+    return 0;
+}
+
+/*
+ * Write the per-volume volfile path of service @server for volume @vol into
+ * @volfile.  Only "glustershd" is handled; for any other service @volfile is
+ * left untouched.  @len == PATH_MAX is asserted.
+ */
+void
+glusterd_volume_svc_build_volfile_path(char *server, glusterd_volinfo_t *vol,
+                                       char *volfile, size_t len)
+{
+    GF_ASSERT(len == PATH_MAX);
+
+    if (strcmp(server, "glustershd") != 0)
+        return;
+
+    glusterd_svc_build_shd_volfile_path(vol, volfile, len);
+}
+
+/*
+ * RPC notification handler for a multiplexed service process.
+ *
+ * @mux_proc: process object whose svcs list holds the attached services
+ * @event:    RPC client event
+ *
+ * RPC_CLNT_CONNECT marks every attached service online and sets the process
+ * status to GF_SVC_STARTED.  RPC_CLNT_DISCONNECT marks every attached
+ * service offline and, unless the status is already GF_SVC_DIED, sets it to
+ * GF_SVC_DIED when the process of the first attached service is no longer
+ * running and to GF_SVC_DISCONNECTED otherwise.  Other events are traced.
+ *
+ * Returns -1 when @mux_proc is NULL, 0 otherwise.
+ */
+int
+glusterd_muxsvc_common_rpc_notify(glusterd_svc_proc_t *mux_proc,
+                                  rpc_clnt_event_t event)
+{
+    int ret = 0;
+    glusterd_svc_t *svc = NULL;
+    glusterd_svc_t *tmp = NULL;
+    xlator_t *this = NULL;
+    gf_boolean_t need_logging = _gf_false;
+
+    this = THIS;
+    GF_ASSERT(this);
+
+    if (!mux_proc) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_SVC_GET_FAIL,
+               "Failed to get the svc proc data");
+        return -1;
+    }
+
+    /* Currently this function is used only for the shd svc.  If it gets
+     * used for another svc, change the glustershd references below; the
+     * svc name can be taken from any of the attached svcs.
+     */
+    switch (event) {
+        case RPC_CLNT_CONNECT:
+            gf_msg_debug(this->name, 0,
+                         "glustershd has connected with glusterd.");
+            gf_event(EVENT_SVC_CONNECTED, "svc_name=glustershd");
+            cds_list_for_each_entry_safe(svc, tmp, &mux_proc->svcs, mux_svc)
+            {
+                svc->online = _gf_true;
+            }
+            mux_proc->status = GF_SVC_STARTED;
+
+            break;
+
+        case RPC_CLNT_DISCONNECT:
+            cds_list_for_each_entry_safe(svc, tmp, &mux_proc->svcs, mux_svc)
+            {
+                if (svc->online) {
+                    /* Log once if at least one svc went offline. */
+                    need_logging = _gf_true;
+                    svc->online = _gf_false;
+                }
+            }
+            if (mux_proc->status != GF_SVC_DIED) {
+                /* cds_list_entry() on an empty list would turn the list
+                 * head itself into a bogus, non-NULL svc pointer, so only
+                 * pick the first entry when there is one.  Without any
+                 * attached svc the process state cannot be probed and the
+                 * status falls back to GF_SVC_DISCONNECTED. */
+                svc = NULL;
+                if (!cds_list_empty(&mux_proc->svcs))
+                    svc = cds_list_entry(mux_proc->svcs.next, glusterd_svc_t,
+                                         mux_svc);
+                if (svc && !glusterd_proc_is_running(&svc->proc)) {
+                    mux_proc->status = GF_SVC_DIED;
+                } else {
+                    mux_proc->status = GF_SVC_DISCONNECTED;
+                }
+            }
+
+            if (need_logging) {
+                gf_msg(this->name, GF_LOG_INFO, 0, GD_MSG_NODE_DISCONNECTED,
+                       "glustershd has disconnected from glusterd.");
+                gf_event(EVENT_SVC_DISCONNECTED, "svc_name=glustershd");
+            }
+            break;
+
+        default:
+            gf_msg_trace(this->name, 0, "got some other RPC event %d", event);
+            break;
+    }
+
+    return ret;
+}
+
+/*
+ * Initialise the connection @conn of a multiplexed service.
+ *
+ * Builds unix-socket transport options for @sockpath (with
+ * "transport.socket.ignore-enoent" set to 1), creates an rpc client named
+ * after the owning service, registers glusterd_muxsvc_conn_common_notify
+ * with @mux_proc as its data, and on success stores the socket path, the
+ * frame timeout and the rpc client in @conn and @notify in
+ * mux_proc->notify.
+ *
+ * @conn must be the conn member embedded in a glusterd_svc_t.
+ *
+ * Returns 0 on success, non-zero on failure; a created rpc client is
+ * unreferenced on failure.
+ */
+int
+glusterd_muxsvc_conn_init(glusterd_conn_t *conn, glusterd_svc_proc_t *mux_proc,
+                          char *sockpath, int frame_timeout,
+                          glusterd_muxsvc_conn_notify_t notify)
+{
+    xlator_t *this = THIS;
+    glusterd_svc_t *svc = NULL;
+    struct rpc_clnt *rpc = NULL;
+    dict_t *options = dict_new();
+    int ret = -1;
+
+    if (!options || !this)
+        goto out;
+
+    svc = cds_list_entry(conn, glusterd_svc_t, conn);
+    if (!svc) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_SVC_GET_FAIL,
+               "Failed to get the service");
+        goto out;
+    }
+
+    ret = rpc_transport_unix_options_build(options, sockpath, frame_timeout);
+    if (ret)
+        goto out;
+
+    ret = dict_set_int32n(options, "transport.socket.ignore-enoent",
+                          SLEN("transport.socket.ignore-enoent"), 1);
+    if (ret)
+        goto out;
+
+    /* @options is free'd by rpc_transport when destroyed */
+    rpc = rpc_clnt_new(options, this, (char *)svc->name, 16);
+    if (!rpc) {
+        ret = -1;
+        goto out;
+    }
+
+    ret = rpc_clnt_register_notify(rpc, glusterd_muxsvc_conn_common_notify,
+                                   mux_proc);
+    if (ret)
+        goto out;
+
+    ret = snprintf(conn->sockpath, sizeof(conn->sockpath), "%s", sockpath);
+    if (ret < 0)
+        goto out;
+    ret = 0;
+
+    /* Publish the fully initialised connection. */
+    conn->frame_timeout = frame_timeout;
+    conn->rpc = rpc;
+    mux_proc->notify = notify;
+
+out:
+    /* Drop our own reference on the options dict in every case. */
+    if (options)
+        dict_unref(options);
+    if (ret && rpc)
+        rpc_clnt_unref(rpc);
+
+    return ret;
+}
diff --git a/xlators/mgmt/glusterd/src/glusterd-svc-mgmt.h b/xlators/mgmt/glusterd/src/glusterd-svc-mgmt.h
new file mode 100644
index 00000000000..5daee993833
--- /dev/null
+++ b/xlators/mgmt/glusterd/src/glusterd-svc-mgmt.h
@@ -0,0 +1,112 @@
+/*
+   Copyright (c) 2014 Red Hat, Inc. <http://www.redhat.com>
+   This file is part of GlusterFS.
+
+   This file is licensed to you under your choice of the GNU Lesser
+   General Public License, version 3 or any later version (LGPLv3 or
+   later), or the GNU General Public License, version 2 (GPLv2), in all
+   cases as published by the Free Software Foundation.
+*/
+
+#ifndef _GLUSTERD_SVC_MGMT_H_
+#define _GLUSTERD_SVC_MGMT_H_
+
+#include "glusterd-proc-mgmt.h"
+#include "glusterd-conn-mgmt.h"
+#include "glusterd-rcu.h"
+
+struct glusterd_svc_;
+
+typedef struct glusterd_svc_ glusterd_svc_t;
+typedef struct glusterd_svc_proc_ glusterd_svc_proc_t;
+
+/* Per-service operation hooks stored in glusterd_svc_t. */
+typedef void (*glusterd_svc_build_t)(glusterd_svc_t *svc);
+
+typedef int (*glusterd_svc_manager_t)(glusterd_svc_t *svc, void *data,
+                                      int flags);
+typedef int (*glusterd_svc_start_t)(glusterd_svc_t *svc, int flags);
+typedef int (*glusterd_svc_stop_t)(glusterd_svc_t *svc, int sig);
+typedef int (*glusterd_svc_reconfigure_t)(void *data);
+
+/* RPC event handler for a multiplexed service process. */
+typedef int (*glusterd_muxsvc_conn_notify_t)(glusterd_svc_proc_t *mux_proc,
+                                             rpc_clnt_event_t event);
+
+/* State of a multiplexed service process.  The mux RPC notify handler sets
+ * STARTED on connect and DISCONNECTED or DIED on disconnect. */
+typedef enum gf_svc_status {
+    GF_SVC_STARTING,
+    GF_SVC_STARTED,
+    GF_SVC_STOPPING,
+    GF_SVC_DISCONNECTED,
+    GF_SVC_DIED,
+} gf_svc_status_t;
+
+/* One daemon process that can host several services. */
+struct glusterd_svc_proc_ {
+    /* NOTE(review): presumably links this process into a global list of
+     * service processes -- confirm against the users of svc_proc_list. */
+    struct cds_list_head svc_proc_list;
+    /* Services attached to this process, linked through
+     * glusterd_svc_t.mux_svc. */
+    struct cds_list_head svcs;
+    /* Event handler; set by glusterd_muxsvc_conn_init(). */
+    glusterd_muxsvc_conn_notify_t notify;
+    rpc_clnt_t *rpc;
+    void *data;
+    gf_svc_status_t status;
+};
+
+/* One managed service (daemon). */
+struct glusterd_svc_ {
+    /* Connection to the daemon.  The notify handlers recover the owning
+     * service from a pointer to this member (cds_list_entry). */
+    glusterd_conn_t conn;
+    glusterd_svc_manager_t manager;
+    glusterd_svc_start_t start;
+    glusterd_svc_stop_t stop;
+    glusterd_svc_reconfigure_t reconfigure;
+    glusterd_svc_proc_t *svc_proc;
+    /* Link in glusterd_svc_proc_t.svcs. */
+    struct cds_list_head mux_svc;
+    /* Process settings: pidfile, logfile, volfile, volfile id/server. */
+    glusterd_proc_t proc;
+    /* Service name, filled in at init time. */
+    char name[NAME_MAX];
+    /* Set on RPC connect, cleared on disconnect and on stop. */
+    gf_boolean_t online;
+    gf_boolean_t inited;
+};
+
+/* Create @rundir with mode 0755; failures other than EEXIST are logged. */
+int
+glusterd_svc_create_rundir(char *rundir);
+
+/* Initialise @svc as service @svc_name with the default notify handler. */
+int
+glusterd_svc_init(glusterd_svc_t *svc, char *svc_name);
+
+/* Start the daemon of @svc unless it is already running; values of the
+ * optional @cmdline dict are appended to the command line. */
+int
+glusterd_svc_start(glusterd_svc_t *svc, int flags, dict_t *cmdline);
+
+/* Stop the daemon of @svc with signal @sig and disconnect from it. */
+int
+glusterd_svc_stop(glusterd_svc_t *svc, int sig);
+
+/*
+ * Path builders.  All of them write a NUL terminated string of at most @len
+ * bytes; the pidfile, volfile, svcdir and rundir builders assert that
+ * @len == PATH_MAX.
+ */
+void
+glusterd_svc_build_pidfile_path(char *server, char *workdir, char *path,
+                                size_t len);
+
+void
+glusterd_svc_build_volfile_path(char *server, char *workdir, char *volfile,
+                                size_t len);
+
+void
+glusterd_svc_build_logfile_path(char *server, char *logdir, char *logfile,
+                                size_t len);
+
+void
+glusterd_svc_build_svcdir(char *server, char *workdir, char *path, size_t len);
+
+void
+glusterd_svc_build_rundir(char *server, char *workdir, char *path, size_t len);
+
+/* Run @create_volfile and, on success, glusterd_fetchspec_notify(). */
+int
+glusterd_svc_reconfigure(int (*create_volfile)());
+
+/* Default RPC event handler of a service; @conn must be embedded in a
+ * glusterd_svc_t. */
+int
+glusterd_svc_common_rpc_notify(glusterd_conn_t *conn, rpc_clnt_event_t event);
+
+/* RPC event handler of a multiplexed service process.  The parameter is the
+ * process object, not a connection, and is named as in the definition. */
+int
+glusterd_muxsvc_common_rpc_notify(glusterd_svc_proc_t *mux_proc,
+                                  rpc_clnt_event_t event);
+
+int
+glusterd_proc_get_pid(glusterd_proc_t *proc);
+
+/* Set up the unix-socket rpc client of @conn for a multiplexed service and
+ * store @notify in @mux_proc. */
+int
+glusterd_muxsvc_conn_init(glusterd_conn_t *conn, glusterd_svc_proc_t *mux_proc,
+                          char *sockpath, int frame_timeout,
+                          glusterd_muxsvc_conn_notify_t notify);
+#endif
diff --git a/xlators/mgmt/glusterd/src/glusterd-syncop.c b/xlators/mgmt/glusterd/src/glusterd-syncop.c
new file mode 100644
index 00000000000..b73d37ad08e
--- /dev/null
+++ b/xlators/mgmt/glusterd/src/glusterd-syncop.c
@@ -0,0 +1,2043 @@
+/*
+   Copyright (c) 2012-2012 Red Hat, Inc. <http://www.redhat.com>
+   This file is part of GlusterFS.
+
+   This file is licensed to you under your choice of the GNU Lesser
+   General Public License, version 3 or any later version (LGPLv3 or
+   later), or the GNU General Public License, version 2 (GPLv2), in all
+   cases as published by the Free Software Foundation.
+*/
+/* rpc related syncops */
+#include "rpc-clnt.h"
+#include "protocol-common.h"
+#include "xdr-generic.h"
+#include "glusterd1-xdr.h"
+#include "glusterd-syncop.h"
+#include "glusterd-mgmt.h"
+
+#include "glusterd.h"
+#include "glusterd-op-sm.h"
+#include "glusterd-utils.h"
+#include "glusterd-server-quorum.h"
+#include "glusterd-locks.h"
+#include "glusterd-snapshot-utils.h"
+#include "glusterd-messages.h"
+#include "glusterd-errno.h"
+
+extern glusterd_op_info_t opinfo;
+
+void
+gd_synctask_barrier_wait(struct syncargs *args, int count)
+{
+    /* Wait on args' barrier for 'count' wakes with the big lock released. */
+    glusterd_conf_t *priv = THIS->private;
+
+    synclock_unlock(&priv->big_lock); /* dropped only for the wait itself */
+    synctask_barrier_wait(args, count);
+    synclock_lock(&priv->big_lock);
+    syncbarrier_destroy(&args->barrier); /* barrier is torn down on return */
+}
+
+/* Record one peer's failure in args.  A non-zero op_ret stores op_ret and
+ * op_errno and appends "<Phase> failed on <peer>. <detail>" to args->errstr;
+ * op_ret == 0 is a no-op.  Each snprintf() is bounded by the full size of its
+ * buffer and its return value is never used as an index, because on
+ * truncation it reports the untruncated length. */
+static void
+gd_collate_errors(struct syncargs *args, int op_ret, int op_errno,
+                  char *op_errstr, int op_code, uuid_t peerid, u_char *uuid)
+{
+    char err_str[PATH_MAX] = "Please check log file for details.";
+    char op_err[PATH_MAX] = "";
+    const char *phase = NULL;
+    char *peer_str = NULL;
+    glusterd_peerinfo_t *peerinfo = NULL;
+
+    if (!op_ret)
+        return;
+
+    args->op_ret = op_ret;
+    args->op_errno = op_errno;
+
+    /* Duplicate the peer's name inside the RCU read-side section. */
+    RCU_READ_LOCK;
+    peerinfo = glusterd_peerinfo_find(peerid, NULL);
+    if (peerinfo)
+        peer_str = gf_strdup(peerinfo->hostname);
+    else
+        peer_str = gf_strdup(uuid_utoa(uuid));
+    RCU_READ_UNLOCK;
+
+    /* A non-empty message from the peer replaces the generic hint. */
+    if (op_errstr && strcmp(op_errstr, ""))
+        snprintf(err_str, sizeof(err_str), "Error: %s", op_errstr);
+
+    switch (op_code) {
+        case GLUSTERD_MGMT_CLUSTER_LOCK:
+            phase = "Locking";
+            break;
+        case GLUSTERD_MGMT_CLUSTER_UNLOCK:
+            phase = "Unlocking";
+            break;
+        case GLUSTERD_MGMT_STAGE_OP:
+            phase = "Staging";
+            break;
+        case GLUSTERD_MGMT_COMMIT_OP:
+            phase = "Commit";
+            break;
+        default:
+            break; /* unknown op_code: op_err stays empty */
+    }
+
+    /* A failed gf_strdup() must not reach "%s" as a NULL pointer. */
+    if (phase)
+        snprintf(op_err, sizeof(op_err), "%s failed on %s. %s", phase,
+                 peer_str ? peer_str : "(unknown peer)", err_str);
+
+    /* err_str is reused as scratch space for the accumulated message. */
+    if (args->errstr) {
+        snprintf(err_str, sizeof(err_str), "%s\n%s", args->errstr, op_err);
+        GF_FREE(args->errstr);
+        args->errstr = NULL;
+    } else {
+        snprintf(err_str, sizeof(err_str), "%s", op_err);
+    }
+
+    /* Only this peer's line is logged; args->errstr gets the full text. */
+    gf_msg("glusterd", GF_LOG_ERROR, 0, GD_MSG_MGMT_OP_FAIL, "%s", op_err);
+    args->errstr = gf_strdup(err_str);
+
+    GF_FREE(peer_str);
+}
+
+void
+gd_syncargs_init(struct syncargs *args, dict_t *op_ctx)
+{
+    pthread_mutex_init(&args->lock_dict, NULL); /* default mutex attributes */
+    args->dict = op_ctx;                        /* stored as-is, no dict_ref */
+}
+
+static void
+gd_stage_op_req_free(gd1_mgmt_stage_op_req *req)
+{
+    /* Release a stage-op request and its serialized buffer; NULL is a no-op. */
+    if (req) {
+        GF_FREE(req->buf.buf_val);
+        GF_FREE(req);
+    }
+}
+
+static void
+gd_commit_op_req_free(gd1_mgmt_commit_op_req *req)
+{
+    /* Release a commit-op request and its serialized buffer; NULL is a no-op. */
+    if (req) {
+        GF_FREE(req->buf.buf_val);
+        GF_FREE(req);
+    }
+}
+
+static void
+gd_brick_op_req_free(gd1_mgmt_brick_op_req *req)
+{
+    /* Release a brick-op request with both of its buffers; NULL is a no-op. */
+    if (req) {
+        if (req->dict.dict_val)
+            GF_FREE(req->dict.dict_val);
+        GF_FREE(req->input.input_val);
+        GF_FREE(req);
+    }
+}
+
+/* Serialize 'req' with 'xdrproc' into a fresh iobuf and submit it on 'rpc'
+ * as procedure 'procnum' of 'prog'.  'local' and 'cookie' are parked on a
+ * newly created frame for 'cbkfn' to pick up.  Returns the rpc_clnt_submit()
+ * result, or -1 when the request could not be built. */
+int
+gd_syncop_submit_request(struct rpc_clnt *rpc, void *req, void *local,
+                         void *cookie, rpc_clnt_prog_t *prog, int procnum,
+                         fop_cbk_fn_t cbkfn, xdrproc_t xdrproc)
+{
+    int ret = -1;
+    ssize_t payload_len = 0;
+    struct iovec payload = {0,};
+    struct iobuf *buf = NULL;
+    struct iobref *bufref = NULL;
+    call_frame_t *frame = NULL;
+
+    GF_ASSERT(rpc);
+    if (req == NULL)
+        goto out;
+
+    payload_len = xdr_sizeof(xdrproc, req);
+    buf = iobuf_get2(rpc->ctx->iobuf_pool, payload_len);
+    if (buf == NULL)
+        goto out;
+
+    bufref = iobref_new();
+    if (bufref == NULL)
+        goto out;
+
+    frame = create_frame(THIS, THIS->ctx->pool);
+    if (frame == NULL)
+        goto out;
+
+    iobref_add(bufref, buf);
+
+    /* Offer the whole page to the encoder, then keep the length it returns. */
+    payload.iov_base = buf->ptr;
+    payload.iov_len = iobuf_pagesize(buf);
+    ret = xdr_serialize_generic(payload, req, xdrproc);
+    if (ret == -1)
+        goto out;
+    payload.iov_len = ret;
+
+    frame->local = local;
+    frame->cookie = cookie;
+
+    /* Hand the single-vector message to the RPC layer. */
+    ret = rpc_clnt_submit(rpc, prog, procnum, cbkfn, &payload, 1, NULL, 0,
+                          bufref, frame, NULL, 0, NULL, 0, NULL);
+
+    /* TODO: do we need to start ping also? */
+
+out:
+    /* Both unref calls run even for pointers that were never set. */
+    iobref_unref(bufref);
+    iobuf_unref(buf);
+
+    /* On any non-zero ret the frame created above is destroyed here. */
+    if (ret && frame)
+        STACK_DESTROY(frame->root);
+    return ret;
+}
+
+/* Defined in glusterd-rpc-ops.c */
+extern struct rpc_clnt_program gd_mgmt_prog;
+extern struct rpc_clnt_program gd_brick_prog;
+extern struct rpc_clnt_program gd_mgmt_v3_prog;
+
+int
+glusterd_syncop_aggr_rsp_dict(glusterd_op_t op, dict_t *aggr, dict_t *rsp)
+{
+    int ret = 0;
+    xlator_t *this = NULL;
+    /* Dispatch on 'op' to the helper that folds 'rsp' into 'aggr'. */
+    this = THIS;
+    GF_ASSERT(this);
+
+    switch (op) {
+        case GD_OP_CREATE_VOLUME:
+        case GD_OP_ADD_BRICK:
+        case GD_OP_START_VOLUME:
+            ret = glusterd_aggr_brick_mount_dirs(aggr, rsp);
+            if (ret) {
+                gf_msg(this->name, GF_LOG_ERROR, 0,
+                       GD_MSG_BRICK_MOUNDIRS_AGGR_FAIL,
+                       "Failed to "
+                       "aggregate brick mount dirs");
+                goto out;
+            }
+            break;
+        /* Only the group above logs here; the others just propagate ret. */
+        case GD_OP_REPLACE_BRICK:
+        case GD_OP_RESET_BRICK:
+            ret = glusterd_rb_use_rsp_dict(aggr, rsp);
+            if (ret)
+                goto out;
+            break;
+
+        case GD_OP_SYNC_VOLUME:
+            ret = glusterd_sync_use_rsp_dict(aggr, rsp);
+            if (ret)
+                goto out;
+            break;
+
+        case GD_OP_GSYNC_CREATE: /* nothing is aggregated for this op */
+            break;
+
+        case GD_OP_GSYNC_SET:
+            ret = glusterd_gsync_use_rsp_dict(aggr, rsp, NULL);
+            if (ret)
+                goto out;
+            break;
+
+        case GD_OP_STATUS_VOLUME:
+            ret = glusterd_volume_status_copy_to_op_ctx_dict(aggr, rsp);
+            if (ret)
+                goto out;
+            break;
+
+        case GD_OP_HEAL_VOLUME:
+            ret = glusterd_volume_heal_use_rsp_dict(aggr, rsp);
+            if (ret)
+                goto out;
+
+            break;
+
+        case GD_OP_CLEARLOCKS_VOLUME:
+            ret = glusterd_use_rsp_dict(aggr, rsp);
+            if (ret)
+                goto out;
+            break;
+
+        case GD_OP_QUOTA:
+            ret = glusterd_volume_quota_copy_to_op_ctx_dict(aggr, rsp);
+            if (ret)
+                goto out;
+            break;
+
+        case GD_OP_SYS_EXEC:
+            ret = glusterd_sys_exec_output_rsp_dict(aggr, rsp);
+            if (ret)
+                goto out;
+            break;
+
+        case GD_OP_SNAP:
+            ret = glusterd_snap_use_rsp_dict(aggr, rsp);
+            if (ret)
+                goto out;
+            break;
+
+        case GD_OP_SCRUB_STATUS:
+            ret = glusterd_volume_bitrot_scrub_use_rsp_dict(aggr, rsp);
+            break;
+
+        case GD_OP_SCRUB_ONDEMAND: /* nothing is aggregated for this op */
+            break;
+
+        case GD_OP_MAX_OPVERSION:
+            ret = glusterd_max_opversion_use_rsp_dict(aggr, rsp);
+            break;
+
+        case GD_OP_PROFILE_VOLUME:
+            ret = glusterd_profile_volume_use_rsp_dict(aggr, rsp);
+            break;
+
+        case GD_OP_REBALANCE:
+        case GD_OP_DEFRAG_BRICK_VOLUME:
+            ret = glusterd_volume_rebalance_use_rsp_dict(aggr, rsp);
+            break;
+
+        default: /* any op not listed: nothing is aggregated, ret stays 0 */
+            break;
+    }
+out:
+    return ret; /* 0 if no helper ran, otherwise that helper's return value */
+}
+
+/* Reply handler for GLUSTERD_MGMT_V3_LOCK.  Decodes the response, reports the
+ * outcome through gd_mgmt_v3_collate_errors(), frees the peer-id cookie and
+ * wakes the barrier in args.  Always returns 0. */
+int32_t
+gd_syncop_mgmt_v3_lock_cbk_fn(struct rpc_req *req, struct iovec *iov, int count,
+                              void *myframe)
+{
+    xlator_t *this = THIS;
+    call_frame_t *frame = NULL;
+    struct syncargs *args = NULL;
+    uuid_t *peerid = NULL;
+    gd1_mgmt_v3_lock_rsp rsp = {{0},};
+    int op_ret = -1;
+    int op_errno = -1;
+    int ret = -1;
+
+    GF_ASSERT(this);
+    GF_ASSERT(req);
+    GF_ASSERT(myframe);
+
+    /* Take over what the submitter parked on the frame. */
+    frame = myframe;
+    args = frame->local;
+    peerid = frame->cookie;
+    frame->local = NULL;
+    frame->cookie = NULL;
+
+    if (req->rpc_status == -1) {
+        op_errno = ENOTCONN;
+        goto out;
+    }
+
+    GF_VALIDATE_OR_GOTO_WITH_ERROR(this->name, iov, out, op_errno, EINVAL);
+
+    ret = xdr_to_generic(*iov, &rsp, (xdrproc_t)xdr_gd1_mgmt_v3_lock_rsp);
+    if (ret < 0)
+        goto out;
+
+    gf_uuid_copy(args->uuid, rsp.uuid);
+    op_errno = rsp.op_errno;
+    op_ret = rsp.op_ret;
+
+out:
+    /* op_ret is still -1 here unless a reply was decoded. */
+    gd_mgmt_v3_collate_errors(args, op_ret, op_errno, NULL,
+                              GLUSTERD_MGMT_V3_LOCK, *peerid, rsp.uuid);
+    GF_FREE(peerid);
+
+    /* With rpc_status == -1 the caller runs STACK_DESTROY, so skip it. */
+    if (req->rpc_status != -1)
+        STACK_DESTROY(frame->root);
+    synctask_barrier_wake(args);
+    return 0;
+}
+
+int32_t
+gd_syncop_mgmt_v3_lock_cbk(struct rpc_req *req, struct iovec *iov, int count,
+                           void *myframe)
+{ /* adapter: runs gd_syncop_mgmt_v3_lock_cbk_fn via glusterd_big_locked_cbk */
+    return glusterd_big_locked_cbk(req, iov, count, myframe,
+                                   gd_syncop_mgmt_v3_lock_cbk_fn);
+}
+
+/* Send a GLUSTERD_MGMT_V3_LOCK request carrying the serialized op_ctx; the
+ * uuid copy made by GD_ALLOC_COPY_UUID is the cookie.  recv_uuid is unused. */
+int
+gd_syncop_mgmt_v3_lock(glusterd_op_t op, dict_t *op_ctx,
+                       glusterd_peerinfo_t *peerinfo, struct syncargs *args,
+                       uuid_t my_uuid, uuid_t recv_uuid, uuid_t txn_id)
+{
+    gd1_mgmt_v3_lock_req req = {{0},};
+    uuid_t *peerid = NULL;
+    int ret = -1;
+
+    GF_ASSERT(op_ctx);
+    GF_ASSERT(peerinfo);
+    GF_ASSERT(args);
+
+    req.op = op;
+    gf_uuid_copy(req.uuid, my_uuid);
+    gf_uuid_copy(req.txn_id, txn_id);
+
+    ret = dict_allocate_and_serialize(op_ctx, &req.dict.dict_val,
+                                      &req.dict.dict_len);
+    if (ret != 0) {
+        gf_smsg("glusterd", GF_LOG_ERROR, errno,
+                GD_MSG_DICT_ALLOC_AND_SERL_LENGTH_GET_FAIL, NULL);
+        goto out;
+    }
+
+    GD_ALLOC_COPY_UUID(peerid, peerinfo->uuid, ret);
+    if (ret != 0)
+        goto out;
+
+    ret = gd_syncop_submit_request(peerinfo->rpc, &req, args, peerid,
+                                   &gd_mgmt_v3_prog, GLUSTERD_MGMT_V3_LOCK,
+                                   gd_syncop_mgmt_v3_lock_cbk,
+                                   (xdrproc_t)xdr_gd1_mgmt_v3_lock_req);
+out:
+    GF_FREE(req.dict.dict_val);
+    gf_msg_debug("glusterd", 0, "Returning %d", ret);
+    return ret;
+}
+
+/* Reply handler for GLUSTERD_MGMT_V3_UNLOCK.  Decodes the response, reports
+ * the outcome through gd_mgmt_v3_collate_errors(), frees the peer-id cookie
+ * and wakes the barrier in args.  Always returns 0. */
+int32_t
+gd_syncop_mgmt_v3_unlock_cbk_fn(struct rpc_req *req, struct iovec *iov,
+                                int count, void *myframe)
+{
+    xlator_t *this = THIS;
+    call_frame_t *frame = NULL;
+    struct syncargs *args = NULL;
+    uuid_t *peerid = NULL;
+    gd1_mgmt_v3_unlock_rsp rsp = {{0},};
+    int op_ret = -1;
+    int op_errno = -1;
+    int ret = -1;
+
+    GF_ASSERT(this);
+    GF_ASSERT(req);
+    GF_ASSERT(myframe);
+
+    /* Take over what the submitter parked on the frame. */
+    frame = myframe;
+    args = frame->local;
+    peerid = frame->cookie;
+    frame->local = NULL;
+    frame->cookie = NULL;
+
+    if (req->rpc_status == -1) {
+        op_errno = ENOTCONN;
+        goto out;
+    }
+
+    GF_VALIDATE_OR_GOTO_WITH_ERROR(this->name, iov, out, op_errno, EINVAL);
+
+    ret = xdr_to_generic(*iov, &rsp, (xdrproc_t)xdr_gd1_mgmt_v3_unlock_rsp);
+    if (ret < 0)
+        goto out;
+
+    gf_uuid_copy(args->uuid, rsp.uuid);
+    op_errno = rsp.op_errno;
+    op_ret = rsp.op_ret;
+
+out:
+    /* op_ret is still -1 here unless a reply was decoded. */
+    gd_mgmt_v3_collate_errors(args, op_ret, op_errno, NULL,
+                              GLUSTERD_MGMT_V3_UNLOCK, *peerid, rsp.uuid);
+    GF_FREE(peerid);
+
+    /* With rpc_status == -1 the caller runs STACK_DESTROY, so skip it. */
+    if (req->rpc_status != -1)
+        STACK_DESTROY(frame->root);
+    synctask_barrier_wake(args);
+    return 0;
+}
+
+int32_t
+gd_syncop_mgmt_v3_unlock_cbk(struct rpc_req *req, struct iovec *iov, int count,
+                             void *myframe)
+{ /* adapter: runs gd_syncop_mgmt_v3_unlock_cbk_fn via glusterd_big_locked_cbk */
+    return glusterd_big_locked_cbk(req, iov, count, myframe,
+                                   gd_syncop_mgmt_v3_unlock_cbk_fn);
+}
+
+/* Send a GLUSTERD_MGMT_V3_UNLOCK request carrying the serialized op_ctx; the
+ * uuid copy made by GD_ALLOC_COPY_UUID is the cookie.  recv_uuid is unused. */
+int
+gd_syncop_mgmt_v3_unlock(dict_t *op_ctx, glusterd_peerinfo_t *peerinfo,
+                         struct syncargs *args, uuid_t my_uuid,
+                         uuid_t recv_uuid, uuid_t txn_id)
+{
+    gd1_mgmt_v3_unlock_req req = {{0},};
+    uuid_t *peerid = NULL;
+    int ret = -1;
+
+    GF_ASSERT(op_ctx);
+    GF_ASSERT(peerinfo);
+    GF_ASSERT(args);
+
+    gf_uuid_copy(req.uuid, my_uuid);
+    gf_uuid_copy(req.txn_id, txn_id);
+
+    ret = dict_allocate_and_serialize(op_ctx, &req.dict.dict_val,
+                                      &req.dict.dict_len);
+    if (ret != 0) {
+        gf_smsg("glusterd", GF_LOG_ERROR, errno,
+                GD_MSG_DICT_ALLOC_AND_SERL_LENGTH_GET_FAIL, NULL);
+        goto out;
+    }
+
+    GD_ALLOC_COPY_UUID(peerid, peerinfo->uuid, ret);
+    if (ret != 0)
+        goto out;
+
+    ret = gd_syncop_submit_request(peerinfo->rpc, &req, args, peerid,
+                                   &gd_mgmt_v3_prog, GLUSTERD_MGMT_V3_UNLOCK,
+                                   gd_syncop_mgmt_v3_unlock_cbk,
+                                   (xdrproc_t)xdr_gd1_mgmt_v3_unlock_req);
+out:
+    GF_FREE(req.dict.dict_val);
+    gf_msg_debug("glusterd", 0, "Returning %d", ret);
+    return ret;
+}
+
+/* Reply handler for GLUSTERD_MGMT_CLUSTER_LOCK.  On a decoded reply the peer
+ * named by the cookie is looked up and, when the reply's op_ret is 0, flagged
+ * as locked; the result is collated into args and the barrier is woken. */
+int32_t
+_gd_syncop_mgmt_lock_cbk(struct rpc_req *req, struct iovec *iov, int count,
+                         void *myframe)
+{
+    xlator_t *this = THIS;
+    call_frame_t *frame = NULL;
+    struct syncargs *args = NULL;
+    uuid_t *peerid = NULL;
+    glusterd_peerinfo_t *peerinfo = NULL;
+    gd1_mgmt_cluster_lock_rsp rsp = {{0},};
+    int op_ret = -1;
+    int op_errno = -1;
+    int ret = -1;
+
+    GF_ASSERT(this);
+
+    /* Take over what the submitter parked on the frame. */
+    frame = myframe;
+    args = frame->local;
+    peerid = frame->cookie;
+    frame->local = NULL;
+    frame->cookie = NULL;
+
+    if (req->rpc_status == -1) {
+        op_errno = ENOTCONN;
+        goto out;
+    }
+
+    GF_VALIDATE_OR_GOTO_WITH_ERROR(this->name, iov, out, op_errno, EINVAL);
+
+    ret = xdr_to_generic(*iov, &rsp, (xdrproc_t)xdr_gd1_mgmt_cluster_lock_rsp);
+    if (ret < 0)
+        goto out;
+
+    gf_uuid_copy(args->uuid, rsp.uuid);
+
+    RCU_READ_LOCK;
+    peerinfo = glusterd_peerinfo_find(*peerid, NULL);
+    if (!peerinfo) {
+        RCU_READ_UNLOCK;
+        rsp.op_ret = -1;
+        gf_msg(this->name, GF_LOG_ERROR, EINVAL, GD_MSG_PEER_NOT_FOUND,
+               "Could not find peer with "
+               "ID %s",
+               uuid_utoa(*peerid));
+    } else {
+        /* Set peer as locked, so we unlock only the locked peers */
+        if (rsp.op_ret == 0)
+            peerinfo->locked = _gf_true;
+        RCU_READ_UNLOCK;
+    }
+
+    op_errno = rsp.op_errno;
+    op_ret = rsp.op_ret;
+
+out:
+    gd_collate_errors(args, op_ret, op_errno, NULL, GLUSTERD_MGMT_CLUSTER_LOCK,
+                      *peerid, rsp.uuid);
+    GF_FREE(peerid);
+
+    /* With rpc_status == -1 the caller runs STACK_DESTROY, so skip it. */
+    if (req->rpc_status != -1)
+        STACK_DESTROY(frame->root);
+    synctask_barrier_wake(args);
+    return 0;
+}
+
+int32_t
+gd_syncop_mgmt_lock_cbk(struct rpc_req *req, struct iovec *iov, int count,
+                        void *myframe)
+{ /* adapter: runs _gd_syncop_mgmt_lock_cbk via glusterd_big_locked_cbk */
+    return glusterd_big_locked_cbk(req, iov, count, myframe,
+                                   _gd_syncop_mgmt_lock_cbk);
+}
+
+/* Send a GLUSTERD_MGMT_CLUSTER_LOCK request stamped with my_uuid; the uuid
+ * copy made by GD_ALLOC_COPY_UUID is the cookie.  recv_uuid is unused. */
+int
+gd_syncop_mgmt_lock(glusterd_peerinfo_t *peerinfo, struct syncargs *args,
+                    uuid_t my_uuid, uuid_t recv_uuid)
+{
+    gd1_mgmt_cluster_lock_req req = {{0},};
+    uuid_t *peerid = NULL;
+    int ret = -1;
+
+    GD_ALLOC_COPY_UUID(peerid, peerinfo->uuid, ret);
+    if (ret != 0)
+        goto out;
+
+    gf_uuid_copy(req.uuid, my_uuid);
+    ret = gd_syncop_submit_request(peerinfo->rpc, &req, args, peerid,
+                                   &gd_mgmt_prog, GLUSTERD_MGMT_CLUSTER_LOCK,
+                                   gd_syncop_mgmt_lock_cbk,
+                                   (xdrproc_t)xdr_gd1_mgmt_cluster_lock_req);
+out:
+    return ret;
+}
+
+/* Reply handler for GLUSTERD_MGMT_CLUSTER_UNLOCK.  On a decoded reply the
+ * peer named by the cookie is looked up and its locked flag is cleared; the
+ * result is collated into args and the barrier is woken. */
+int32_t
+_gd_syncop_mgmt_unlock_cbk(struct rpc_req *req, struct iovec *iov, int count,
+                           void *myframe)
+{
+    xlator_t *this = THIS;
+    call_frame_t *frame = NULL;
+    struct syncargs *args = NULL;
+    uuid_t *peerid = NULL;
+    glusterd_peerinfo_t *peerinfo = NULL;
+    gd1_mgmt_cluster_unlock_rsp rsp = {{0},};
+    int op_ret = -1;
+    int op_errno = -1;
+    int ret = -1;
+
+    GF_ASSERT(this);
+
+    /* Take over what the submitter parked on the frame. */
+    frame = myframe;
+    args = frame->local;
+    peerid = frame->cookie;
+    frame->local = NULL;
+    frame->cookie = NULL;
+
+    if (req->rpc_status == -1) {
+        op_errno = ENOTCONN;
+        goto out;
+    }
+
+    GF_VALIDATE_OR_GOTO_WITH_ERROR(this->name, iov, out, op_errno, EINVAL);
+
+    ret = xdr_to_generic(*iov, &rsp,
+                         (xdrproc_t)xdr_gd1_mgmt_cluster_unlock_rsp);
+    if (ret < 0)
+        goto out;
+
+    gf_uuid_copy(args->uuid, rsp.uuid);
+
+    RCU_READ_LOCK;
+    peerinfo = glusterd_peerinfo_find(*peerid, NULL);
+    if (!peerinfo) {
+        RCU_READ_UNLOCK;
+        rsp.op_ret = -1;
+        gf_msg(this->name, GF_LOG_ERROR, EINVAL, GD_MSG_PEER_NOT_FOUND,
+               "Could not find peer with "
+               "ID %s",
+               uuid_utoa(*peerid));
+    } else {
+        peerinfo->locked = _gf_false;
+        RCU_READ_UNLOCK;
+    }
+
+    op_errno = rsp.op_errno;
+    op_ret = rsp.op_ret;
+
+out:
+    gd_collate_errors(args, op_ret, op_errno, NULL,
+                      GLUSTERD_MGMT_CLUSTER_UNLOCK, *peerid, rsp.uuid);
+    GF_FREE(peerid);
+
+    /* With rpc_status == -1 the caller runs STACK_DESTROY, so skip it. */
+    if (req->rpc_status != -1)
+        STACK_DESTROY(frame->root);
+    synctask_barrier_wake(args);
+    return 0;
+}
+
+int32_t
+gd_syncop_mgmt_unlock_cbk(struct rpc_req *req, struct iovec *iov, int count,
+                          void *myframe)
+{ /* adapter: runs _gd_syncop_mgmt_unlock_cbk via glusterd_big_locked_cbk */
+    return glusterd_big_locked_cbk(req, iov, count, myframe,
+                                   _gd_syncop_mgmt_unlock_cbk);
+}
+
+/* Send a GLUSTERD_MGMT_CLUSTER_UNLOCK request stamped with my_uuid.  The
+ * request is encoded with xdr_gd1_mgmt_cluster_unlock_req, the routine that
+ * matches its type; recv_uuid is unused. */
+int
+gd_syncop_mgmt_unlock(glusterd_peerinfo_t *peerinfo, struct syncargs *args,
+                      uuid_t my_uuid, uuid_t recv_uuid)
+{
+    gd1_mgmt_cluster_unlock_req req = {{0},};
+    uuid_t *peerid = NULL;
+    int ret = -1;
+
+    gf_uuid_copy(req.uuid, my_uuid);
+    GD_ALLOC_COPY_UUID(peerid, peerinfo->uuid, ret);
+    if (ret)
+        goto out;
+    ret = gd_syncop_submit_request(peerinfo->rpc, &req, args, peerid,
+                                   &gd_mgmt_prog, GLUSTERD_MGMT_CLUSTER_UNLOCK,
+                                   gd_syncop_mgmt_unlock_cbk,
+                                   (xdrproc_t)xdr_gd1_mgmt_cluster_unlock_req);
+out:
+    return ret;
+}
+
+/* Reply handler for GLUSTERD_MGMT_STAGE_OP.  Decodes the response and its
+ * optional dict, aggregates that dict into args->dict for selected ops,
+ * collates the result and wakes the barrier.  XDR-decoded buffers use free(). */
+int32_t
+_gd_syncop_stage_op_cbk(struct rpc_req *req, struct iovec *iov, int count,
+                        void *myframe)
+{
+    xlator_t *this = THIS;
+    call_frame_t *frame = NULL;
+    struct syncargs *args = NULL;
+    uuid_t *peerid = NULL;
+    dict_t *rsp_dict = NULL;
+    gd1_mgmt_stage_op_rsp rsp = {{0},};
+    int op_ret = -1;
+    int op_errno = -1;
+    int ret = -1;
+
+    GF_ASSERT(this);
+
+    frame = myframe;
+    args = frame->local;
+    peerid = frame->cookie;
+    frame->local = NULL;
+    frame->cookie = NULL;
+
+    if (-1 == req->rpc_status) {
+        op_errno = ENOTCONN;
+        goto out;
+    }
+
+    GF_VALIDATE_OR_GOTO_WITH_ERROR(this->name, iov, out, op_errno, EINVAL);
+
+    ret = xdr_to_generic(*iov, &rsp, (xdrproc_t)xdr_gd1_mgmt_stage_op_rsp);
+    if (ret < 0)
+        goto out;
+
+    if (rsp.dict.dict_len) {
+        rsp_dict = dict_new();
+        if (!rsp_dict) {
+            free(rsp.dict.dict_val);
+            goto out;
+        }
+        ret = dict_unserialize(rsp.dict.dict_val, rsp.dict.dict_len, &rsp_dict);
+        if (ret < 0) {
+            free(rsp.dict.dict_val);
+            goto out;
+        }
+        /* From here on the dict releases the decoded buffer itself. */
+        rsp_dict->extra_stdfree = rsp.dict.dict_val;
+    }
+
+    RCU_READ_LOCK;
+    ret = (glusterd_peerinfo_find(rsp.uuid, NULL) == NULL);
+    RCU_READ_UNLOCK;
+    if (ret) {
+        ret = -1;
+        gf_msg(this->name, GF_LOG_CRITICAL, 0, GD_MSG_RESP_FROM_UNKNOWN_PEER,
+               "Staging response "
+               "for 'Volume %s' received from unknown "
+               "peer: %s",
+               gd_op_list[rsp.op], uuid_utoa(rsp.uuid));
+        goto out;
+    }
+
+    gf_uuid_copy(args->uuid, rsp.uuid);
+    if (rsp.op == GD_OP_REPLACE_BRICK || rsp.op == GD_OP_QUOTA ||
+        rsp.op == GD_OP_CREATE_VOLUME || rsp.op == GD_OP_ADD_BRICK ||
+        rsp.op == GD_OP_START_VOLUME) {
+        pthread_mutex_lock(&args->lock_dict);
+        {
+            ret = glusterd_syncop_aggr_rsp_dict(rsp.op, args->dict, rsp_dict);
+            if (ret)
+                gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_RESP_AGGR_FAIL, "%s",
+                       "Failed to aggregate response from "
+                       " node/brick");
+        }
+        pthread_mutex_unlock(&args->lock_dict);
+    }
+
+    op_ret = rsp.op_ret;
+    op_errno = rsp.op_errno;
+
+out:
+    gd_collate_errors(args, op_ret, op_errno, rsp.op_errstr,
+                      GLUSTERD_MGMT_STAGE_OP, *peerid, rsp.uuid);
+    free(rsp.op_errstr); /* XDR-decoded; gd_collate_errors only reads it */
+    if (rsp_dict)
+        dict_unref(rsp_dict);
+    GF_FREE(peerid);
+    /* With rpc_status == -1 the caller runs STACK_DESTROY, so skip it. */
+    if (req->rpc_status != -1)
+        STACK_DESTROY(frame->root);
+    synctask_barrier_wake(args);
+    return 0;
+}
+
+int32_t
+gd_syncop_stage_op_cbk(struct rpc_req *req, struct iovec *iov, int count,
+                       void *myframe)
+{ /* adapter: runs _gd_syncop_stage_op_cbk via glusterd_big_locked_cbk */
+    return glusterd_big_locked_cbk(req, iov, count, myframe,
+                                   _gd_syncop_stage_op_cbk);
+}
+
+/* Send a GLUSTERD_MGMT_STAGE_OP request carrying the serialized dict_out.
+ * The request is freed before returning; recv_uuid and op_ctx are unused. */
+int
+gd_syncop_mgmt_stage_op(glusterd_peerinfo_t *peerinfo, struct syncargs *args,
+                        uuid_t my_uuid, uuid_t recv_uuid, int op,
+                        dict_t *dict_out, dict_t *op_ctx)
+{
+    uuid_t *peerid = NULL;
+    gd1_mgmt_stage_op_req *req = NULL;
+    int ret = -1;
+
+    req = GF_CALLOC(1, sizeof(*req), gf_gld_mt_mop_stage_req_t);
+    if (req == NULL) {
+        gf_smsg("glusterd", GF_LOG_ERROR, errno, GD_MSG_NO_MEMORY, NULL);
+        goto out;
+    }
+    req->op = op;
+    gf_uuid_copy(req->uuid, my_uuid);
+
+    ret = dict_allocate_and_serialize(dict_out, &req->buf.buf_val,
+                                      &req->buf.buf_len);
+    if (ret != 0) {
+        gf_smsg("glusterd", GF_LOG_ERROR, errno,
+                GD_MSG_DICT_ALLOC_AND_SERL_LENGTH_GET_FAIL, NULL);
+        goto out;
+    }
+    GD_ALLOC_COPY_UUID(peerid, peerinfo->uuid, ret);
+    if (ret != 0)
+        goto out;
+
+    ret = gd_syncop_submit_request(
+        peerinfo->rpc, req, args, peerid, &gd_mgmt_prog, GLUSTERD_MGMT_STAGE_OP,
+        gd_syncop_stage_op_cbk, (xdrproc_t)xdr_gd1_mgmt_stage_op_req);
+out:
+    gd_stage_op_req_free(req);
+    return ret;
+}
+
+int32_t
+_gd_syncop_brick_op_cbk(struct rpc_req *req, struct iovec *iov, int count,
+                        void *myframe)
+{
+    struct syncargs *args = NULL;
+    gd1_mgmt_brick_op_rsp rsp = {
+        0,
+    };
+    int ret = -1;
+    call_frame_t *frame = NULL;
+    xlator_t *this = NULL;
+    /* Brick-op reply handler: fills op_ret/op_errno/errstr/dict in args. */
+    this = THIS;
+    GF_ASSERT(this);
+
+    frame = myframe;
+    args = frame->local;
+    frame->local = NULL;
+
+    /* initialize */
+    args->op_ret = -1;
+    args->op_errno = EINVAL;
+
+    if (-1 == req->rpc_status) {
+        args->op_errno = ENOTCONN;
+        goto out;
+    }
+
+    GF_VALIDATE_OR_GOTO_WITH_ERROR(this->name, iov, out, args->op_errno,
+                                   EINVAL);
+
+    ret = xdr_to_generic(*iov, &rsp, (xdrproc_t)xdr_gd1_mgmt_brick_op_rsp);
+    if (ret < 0)
+        goto out;
+
+    if (rsp.output.output_len) {
+        args->dict = dict_new();
+        if (!args->dict) {
+            gf_smsg(this->name, GF_LOG_ERROR, errno, GD_MSG_DICT_CREATE_FAIL,
+                    NULL);
+            ret = -1;
+            args->op_errno = ENOMEM;
+            goto out;
+        }
+
+        ret = dict_unserialize(rsp.output.output_val, rsp.output.output_len,
+                               &args->dict);
+        if (ret < 0) {
+            gf_smsg(this->name, GF_LOG_ERROR, errno,
+                    GD_MSG_DICT_UNSERIALIZE_FAIL, NULL);
+            goto out;
+        }
+    }
+    /* Reached only after a successful decode: publish the reply's status. */
+    args->op_ret = rsp.op_ret;
+    args->op_errno = rsp.op_errno;
+    args->errstr = gf_strdup(rsp.op_errstr);
+    /* NOTE(review): an empty op_errstr is not freed below -- intended? */
+out:
+    if ((rsp.op_errstr) && (strcmp(rsp.op_errstr, "") != 0))
+        free(rsp.op_errstr);
+    free(rsp.output.output_val);
+
+    /* req->rpc_status set to -1 means, STACK_DESTROY will be called from
+     * the caller function.
+     */
+    if (req->rpc_status != -1)
+        STACK_DESTROY(frame->root);
+    __wake(args);
+
+    return 0;
+}
+
+int32_t
+gd_syncop_brick_op_cbk(struct rpc_req *req, struct iovec *iov, int count,
+                       void *myframe)
+{ /* adapter: runs _gd_syncop_brick_op_cbk via glusterd_big_locked_cbk */
+    return glusterd_big_locked_cbk(req, iov, count, myframe,
+                                   _gd_syncop_brick_op_cbk);
+}
+
+int
+gd_syncop_mgmt_brick_op(struct rpc_clnt *rpc, glusterd_pending_node_t *pnode,
+                        int op, dict_t *dict_out, dict_t *op_ctx, char **errstr)
+{
+    struct syncargs args = {
+        0,
+    };
+    gd1_mgmt_brick_op_req *req = NULL;
+    int ret = 0;
+    xlator_t *this = NULL;
+    /* Builds a brick/node op payload, sends it, then post-processes args. */
+    this = THIS;
+    args.op_ret = -1;
+    args.op_errno = ENOTCONN;
+    /* Node-type payload for NFS/QUOTAD/SCRUB, and for SHD on volume status. */
+    if ((pnode->type == GD_NODE_NFS) || (pnode->type == GD_NODE_QUOTAD) ||
+        (pnode->type == GD_NODE_SCRUB) ||
+        ((pnode->type == GD_NODE_SHD) && (op == GD_OP_STATUS_VOLUME))) {
+        ret = glusterd_node_op_build_payload(op, &req, dict_out);
+
+    } else {
+        ret = glusterd_brick_op_build_payload(op, pnode->node, &req, dict_out);
+    }
+
+    if (ret)
+        goto out;
+    /* NOTE(review): GD_SYNCOP presumably returns once the cbk filled args. */
+    GD_SYNCOP(rpc, (&args), NULL, gd_syncop_brick_op_cbk, req, &gd_brick_prog,
+              req->op, xdr_gd1_mgmt_brick_op_req);
+    /* A non-empty error string goes to the caller; otherwise it is freed. */
+    if (args.errstr) {
+        if ((strlen(args.errstr) > 0) && errstr)
+            *errstr = args.errstr;
+        else
+            GF_FREE(args.errstr);
+    }
+
+    if (GD_OP_STATUS_VOLUME == op) {
+        ret = dict_set_int32(args.dict, "index", pnode->index);
+        if (ret) {
+            gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_DICT_SET_FAILED,
+                   "Error setting index on brick status"
+                   " rsp dict");
+            args.op_ret = -1;
+            goto out;
+        }
+    }
+
+    if (req->op == GLUSTERD_BRICK_TERMINATE) {
+        if (args.op_ret && (args.op_errno == ENOTCONN)) {
+            /*
+             * This is actually OK.  It happens when the target
+             * brick process exits and we saw the closed connection
+             * before we read the response.  If we didn't read the
+             * response quickly enough that's kind of our own
+             * fault, and the fact that the process exited means
+             * that our goal of terminating the brick was achieved.
+             */
+            args.op_ret = 0;
+        }
+    }
+
+    if (args.op_ret == 0)
+        glusterd_handle_node_rsp(dict_out, pnode->node, op, args.dict, op_ctx,
+                                 errstr, pnode->type);
+    /* Every exit path sets errno from args.op_errno and drops args.dict. */
+out:
+    errno = args.op_errno;
+    if (args.dict)
+        dict_unref(args.dict);
+    if (args.op_ret && errstr && (*errstr == NULL)) {
+        if (op == GD_OP_HEAL_VOLUME) {
+            gf_asprintf(errstr,
+                        "Glusterd Syncop Mgmt brick op '%s' failed."
+                        " Please check glustershd log file for details.",
+                        gd_op_list[op]);
+        } else {
+            gf_asprintf(errstr,
+                        "Glusterd Syncop Mgmt brick op '%s' failed."
+                        " Please check brick log file for details.",
+                        gd_op_list[op]);
+        }
+    }
+    gd_brick_op_req_free(req);
+    return args.op_ret; /* the op status from args, not the local 'ret' */
+}
+
+/* Reply handler for GLUSTERD_MGMT_COMMIT_OP.  Decodes the response and its
+ * optional dict, aggregates that dict into args->dict where applicable,
+ * collates the result and wakes the barrier.  XDR-decoded buffers use free(). */
+int32_t
+_gd_syncop_commit_op_cbk(struct rpc_req *req, struct iovec *iov, int count,
+                         void *myframe)
+{
+    xlator_t *this = THIS;
+    call_frame_t *frame = NULL;
+    struct syncargs *args = NULL;
+    uuid_t *peerid = NULL;
+    dict_t *rsp_dict = NULL;
+    gd1_mgmt_commit_op_rsp rsp = {{0},};
+    int type = GF_QUOTA_OPTION_TYPE_NONE;
+    int op_ret = -1;
+    int op_errno = -1;
+    int ret = -1;
+
+    GF_ASSERT(this);
+
+    frame = myframe;
+    args = frame->local;
+    peerid = frame->cookie;
+    frame->local = NULL;
+    frame->cookie = NULL;
+
+    if (-1 == req->rpc_status) {
+        op_errno = ENOTCONN;
+        goto out;
+    }
+
+    GF_VALIDATE_OR_GOTO_WITH_ERROR(this->name, iov, out, op_errno, EINVAL);
+
+    ret = xdr_to_generic(*iov, &rsp, (xdrproc_t)xdr_gd1_mgmt_commit_op_rsp);
+    if (ret < 0)
+        goto out;
+
+    if (rsp.dict.dict_len) {
+        rsp_dict = dict_new();
+        if (!rsp_dict) {
+            free(rsp.dict.dict_val);
+            goto out;
+        }
+        ret = dict_unserialize(rsp.dict.dict_val, rsp.dict.dict_len, &rsp_dict);
+        if (ret < 0) {
+            free(rsp.dict.dict_val);
+            goto out;
+        }
+        /* From here on the dict releases the decoded buffer itself. */
+        rsp_dict->extra_stdfree = rsp.dict.dict_val;
+    }
+
+    RCU_READ_LOCK;
+    ret = (glusterd_peerinfo_find(rsp.uuid, NULL) == NULL);
+    RCU_READ_UNLOCK;
+    if (ret) {
+        ret = -1;
+        gf_msg(this->name, GF_LOG_CRITICAL, 0, GD_MSG_RESP_FROM_UNKNOWN_PEER,
+               "Commit response "
+               "for 'Volume %s' received from unknown "
+               "peer: %s",
+               gd_op_list[rsp.op], uuid_utoa(rsp.uuid));
+        goto out;
+    }
+
+    gf_uuid_copy(args->uuid, rsp.uuid);
+    if (rsp.op == GD_OP_QUOTA) {
+        ret = dict_get_int32(args->dict, "type", &type);
+        if (ret) {
+            gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_DICT_GET_FAILED,
+                   "Failed to get "
+                   "opcode");
+            goto out;
+        }
+    }
+
+    if ((rsp.op != GD_OP_QUOTA) || (type == GF_QUOTA_OPTION_TYPE_LIST)) {
+        pthread_mutex_lock(&args->lock_dict);
+        {
+            ret = glusterd_syncop_aggr_rsp_dict(rsp.op, args->dict, rsp_dict);
+            if (ret)
+                gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_RESP_AGGR_FAIL, "%s",
+                       "Failed to aggregate response from "
+                       " node/brick");
+        }
+        pthread_mutex_unlock(&args->lock_dict);
+    }
+
+    op_ret = rsp.op_ret;
+    op_errno = rsp.op_errno;
+
+out:
+    gd_collate_errors(args, op_ret, op_errno, rsp.op_errstr,
+                      GLUSTERD_MGMT_COMMIT_OP, *peerid, rsp.uuid);
+    free(rsp.op_errstr); /* XDR-decoded; gd_collate_errors only reads it */
+    if (rsp_dict)
+        dict_unref(rsp_dict);
+    GF_FREE(peerid);
+    /* With rpc_status == -1 the caller runs STACK_DESTROY, so skip it. */
+    if (req->rpc_status != -1)
+        STACK_DESTROY(frame->root);
+    synctask_barrier_wake(args);
+
+    return 0;
+}
+
+/*
+ * RPC reply callback registered for commit-op requests. It does no work of
+ * its own: all four arguments are handed, unchanged, to
+ * glusterd_big_locked_cbk() together with _gd_syncop_commit_op_cbk, and the
+ * wrapper's result is returned as-is.
+ */
+int32_t
+gd_syncop_commit_op_cbk(struct rpc_req *req, struct iovec *iov, int count,
+                        void *myframe)
+{
+    int32_t cbk_ret = 0;
+
+    cbk_ret = glusterd_big_locked_cbk(req, iov, count, myframe,
+                                      _gd_syncop_commit_op_cbk);
+
+    return cbk_ret;
+}
+
+/*
+ * Build a commit-op request for one peer and submit it on peerinfo->rpc.
+ *
+ * The request carries my_uuid, the op code and the serialized form of
+ * dict_out. A heap copy of the peer's uuid is passed as the request cookie.
+ * recv_uuid and op_ctx are accepted but not used here.
+ *
+ * Returns -1 if the request or the uuid copy cannot be allocated, the
+ * serializer's error if dict_out cannot be serialized, and otherwise the
+ * result of gd_syncop_submit_request(). The request structure is always
+ * handed to gd_commit_op_req_free() before returning.
+ */
+int
+gd_syncop_mgmt_commit_op(glusterd_peerinfo_t *peerinfo, struct syncargs *args,
+                         uuid_t my_uuid, uuid_t recv_uuid, int op,
+                         dict_t *dict_out, dict_t *op_ctx)
+{
+    int ret = -1;
+    uuid_t *peer_cookie = NULL;
+    gd1_mgmt_commit_op_req *commit_req = NULL;
+
+    commit_req = GF_CALLOC(1, sizeof(*commit_req),
+                           gf_gld_mt_mop_commit_req_t);
+    if (commit_req == NULL) {
+        /* ret keeps its initial -1 */
+        gf_smsg("glusterd", GF_LOG_ERROR, errno, GD_MSG_NO_MEMORY, NULL);
+    } else {
+        gf_uuid_copy(commit_req->uuid, my_uuid);
+        commit_req->op = op;
+
+        ret = dict_allocate_and_serialize(dict_out, &commit_req->buf.buf_val,
+                                          &commit_req->buf.buf_len);
+        if (ret != 0) {
+            gf_smsg("glusterd", GF_LOG_ERROR, errno,
+                    GD_MSG_DICT_ALLOC_AND_SERL_LENGTH_GET_FAIL, NULL);
+        } else {
+            GD_ALLOC_COPY_UUID(peer_cookie, peerinfo->uuid, ret);
+            if (ret == 0) {
+                ret = gd_syncop_submit_request(
+                    peerinfo->rpc, commit_req, args, peer_cookie,
+                    &gd_mgmt_prog, GLUSTERD_MGMT_COMMIT_OP,
+                    gd_syncop_commit_op_cbk,
+                    (xdrproc_t)xdr_gd1_mgmt_commit_op_req);
+            }
+        }
+    }
+
+    gd_commit_op_req_free(commit_req);
+    return ret;
+}
+
+/*
+ * Lock phase of a synctask transaction: ask every eligible peer for either
+ * the cluster lock (cluster_lock true) or the mgmt_v3 lock, then wait on the
+ * barrier until each of them has answered.
+ *
+ * A peer is eligible when its generation is not newer than the
+ * transaction's, it is connected and, unless the op is GD_OP_SYNC_VOLUME, it
+ * is in the BEFRIENDED state.
+ *
+ * Returns -1 if THIS is not set, the synctask_barrier_init() error if the
+ * barrier cannot be set up, 0 if no peer was eligible, and otherwise the
+ * op_ret collected in the sync args. When that op_ret is non-zero,
+ * *op_errstr is pointed at a newly allocated message (NULL if the allocation
+ * failed).
+ */
+int
+gd_lock_op_phase(glusterd_conf_t *conf, glusterd_op_t op, dict_t *op_ctx,
+                 char **op_errstr, uuid_t txn_id,
+                 glusterd_op_info_t *txn_opinfo, gf_boolean_t cluster_lock)
+{
+    int ret = -1;
+    int peer_cnt = 0;
+    uuid_t peer_uuid = {0};
+    xlator_t *this = NULL;
+    glusterd_peerinfo_t *peerinfo = NULL;
+    struct syncargs args = {0};
+
+    this = THIS;
+    GF_VALIDATE_OR_GOTO("glusterd", this, out);
+
+    ret = synctask_barrier_init((&args));
+    if (ret)
+        goto out;
+
+    peer_cnt = 0;
+
+    RCU_READ_LOCK;
+    cds_list_for_each_entry_rcu(peerinfo, &conf->peers, uuid_list)
+    {
+        /* Only send requests to peers who were available before the
+         * transaction started
+         */
+        if (peerinfo->generation > txn_opinfo->txn_generation)
+            continue;
+
+        if (!peerinfo->connected)
+            continue;
+        if (op != GD_OP_SYNC_VOLUME &&
+            peerinfo->state.state != GD_FRIEND_STATE_BEFRIENDED)
+            continue;
+
+        if (cluster_lock) {
+            /* Reset lock status */
+            peerinfo->locked = _gf_false;
+            gd_syncop_mgmt_lock(peerinfo, &args, MY_UUID, peer_uuid);
+        } else
+            gd_syncop_mgmt_v3_lock(op, op_ctx, peerinfo, &args, MY_UUID,
+                                   peer_uuid, txn_id);
+        /* Counted even if the submit failed; the barrier wait below is
+         * given this count. */
+        peer_cnt++;
+    }
+    RCU_READ_UNLOCK;
+
+    if (0 == peer_cnt) {
+        ret = 0;
+        goto out;
+    }
+
+    gd_synctask_barrier_wait((&args), peer_cnt);
+
+    if (args.op_ret) {
+        if (args.errstr)
+            *op_errstr = gf_strdup(args.errstr);
+        else {
+            ret = gf_asprintf(op_errstr,
+                              "Another transaction "
+                              "could be in progress. Please try "
+                              "again after some time.");
+            if (ret == -1)
+                *op_errstr = NULL;
+
+            gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_PEER_LOCK_FAIL,
+                   "Failed to acquire lock");
+        }
+    }
+
+    ret = args.op_ret;
+
+    gf_msg_debug(this->name, 0,
+                 "Sent lock op req for 'Volume %s' "
+                 "to %d peers. Returning %d",
+                 gd_op_list[op], peer_cnt, ret);
+out:
+    /* The caller only ever receives a copy of args.errstr, so the original
+     * must be released here (as gd_commit_op_phase does) or it leaks.
+     * It is still NULL on the early-exit paths. */
+    GF_FREE(args.errstr);
+    args.errstr = NULL;
+
+    return ret;
+}
+
+/*
+ * Stage phase of a synctask transaction.
+ *
+ * Order of work:
+ *   1. validate server quorum;
+ *   2. run the stage validation locally;
+ *   3. for a fixed set of ops, merge the local response dict into the
+ *      aggregation dict (req_dict for create-volume / add-brick /
+ *      start-volume, op_ctx otherwise);
+ *   4. send the stage request to every eligible peer and wait for all
+ *      replies;
+ *   5. for GD_OP_QUOTA, if everything so far succeeded, validate and set the
+ *      gfid.
+ *
+ * Returns 0 on success or the first non-zero result met along the way (the
+ * collated peer op_ret for step 4). On failure *op_errstr may be set to a
+ * newly allocated message.
+ */
+int
+gd_stage_op_phase(glusterd_op_t op, dict_t *op_ctx, dict_t *req_dict,
+                  char **op_errstr, glusterd_op_info_t *txn_opinfo)
+{
+    int ret = -1;
+    int peer_cnt = 0;
+    dict_t *rsp_dict = NULL;
+    char *hostname = NULL;
+    xlator_t *this = NULL;
+    glusterd_conf_t *conf = NULL;
+    glusterd_peerinfo_t *peerinfo = NULL;
+    uuid_t tmp_uuid = {0};
+    char *errstr = NULL;
+    struct syncargs args = {0};
+    dict_t *aggr_dict = NULL;
+
+    this = THIS;
+    GF_ASSERT(this);
+    conf = this->private;
+    GF_ASSERT(conf);
+
+    rsp_dict = dict_new();
+    if (!rsp_dict) {
+        gf_smsg(this->name, GF_LOG_ERROR, errno, GD_MSG_DICT_CREATE_FAIL, NULL);
+        goto out;
+    }
+
+    if ((op == GD_OP_CREATE_VOLUME) || (op == GD_OP_ADD_BRICK) ||
+        (op == GD_OP_START_VOLUME))
+        aggr_dict = req_dict;
+    else
+        aggr_dict = op_ctx;
+
+    ret = glusterd_validate_quorum(this, op, req_dict, op_errstr);
+    if (ret) {
+        gf_msg(this->name, GF_LOG_CRITICAL, 0, GD_MSG_SERVER_QUORUM_NOT_MET,
+               "Server quorum not met. Rejecting operation.");
+        goto out;
+    }
+
+    ret = glusterd_op_stage_validate(op, req_dict, op_errstr, rsp_dict);
+    if (ret) {
+        /* The failure is local; stage_done reports it against this name */
+        hostname = "localhost";
+        goto stage_done;
+    }
+
+    if ((op == GD_OP_REPLACE_BRICK || op == GD_OP_QUOTA ||
+         op == GD_OP_CREATE_VOLUME || op == GD_OP_ADD_BRICK ||
+         op == GD_OP_START_VOLUME)) {
+        ret = glusterd_syncop_aggr_rsp_dict(op, aggr_dict, rsp_dict);
+        if (ret) {
+            gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_RESP_AGGR_FAIL, "%s",
+                   "Failed to aggregate response from node/brick");
+            goto out;
+        }
+    }
+    dict_unref(rsp_dict);
+    rsp_dict = NULL;
+
+stage_done:
+    if (ret) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_VALIDATE_FAILED,
+               LOGSTR_STAGE_FAIL, gd_op_list[op], hostname,
+               (*op_errstr) ? ":" : " ", (*op_errstr) ? *op_errstr : " ");
+        if (*op_errstr == NULL)
+            gf_asprintf(op_errstr, OPERRSTR_STAGE_FAIL, hostname);
+        goto out;
+    }
+
+    gd_syncargs_init(&args, aggr_dict);
+    ret = synctask_barrier_init((&args));
+    if (ret)
+        goto out;
+
+    peer_cnt = 0;
+
+    RCU_READ_LOCK;
+    cds_list_for_each_entry_rcu(peerinfo, &conf->peers, uuid_list)
+    {
+        /* Only send requests to peers who were available before the
+         * transaction started
+         */
+        if (peerinfo->generation > txn_opinfo->txn_generation)
+            continue;
+
+        if (!peerinfo->connected)
+            continue;
+        if (op != GD_OP_SYNC_VOLUME &&
+            peerinfo->state.state != GD_FRIEND_STATE_BEFRIENDED)
+            continue;
+
+        (void)gd_syncop_mgmt_stage_op(peerinfo, &args, MY_UUID, tmp_uuid, op,
+                                      req_dict, op_ctx);
+        peer_cnt++;
+    }
+    RCU_READ_UNLOCK;
+
+    if (0 == peer_cnt) {
+        ret = 0;
+        goto out;
+    }
+
+    gf_msg_debug(this->name, 0,
+                 "Sent stage op req for 'Volume %s' "
+                 "to %d peers",
+                 gd_op_list[op], peer_cnt);
+
+    gd_synctask_barrier_wait((&args), peer_cnt);
+
+    /* Prefer the collated peer error; fall back to one left in the dict */
+    if (args.errstr)
+        *op_errstr = gf_strdup(args.errstr);
+    else if (dict_get_str(aggr_dict, "errstr", &errstr) == 0)
+        *op_errstr = gf_strdup(errstr);
+
+    ret = args.op_ret;
+
+out:
+    if ((ret == 0) && (op == GD_OP_QUOTA)) {
+        ret = glusterd_validate_and_set_gfid(op_ctx, req_dict, op_errstr);
+        if (ret)
+            gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_GFID_VALIDATE_SET_FAIL,
+                   "Failed to validate and set gfid");
+    }
+
+    if (rsp_dict)
+        dict_unref(rsp_dict);
+
+    /* Only a copy of args.errstr was handed to the caller; release the
+     * original here (as gd_commit_op_phase does) so it does not leak.
+     * It is still NULL on the early-exit paths. */
+    GF_FREE(args.errstr);
+    args.errstr = NULL;
+
+    return ret;
+}
+
+/*
+ * Commit phase of a synctask transaction.
+ *
+ * Performs the commit locally first, merges the local response dict into
+ * op_ctx for the ops that need it, then sends the commit request to every
+ * eligible peer and waits for all replies.
+ *
+ * Returns 0 on success, otherwise the first non-zero result met (for the
+ * peer round this is the collated args.op_ret). On failure *op_errstr may be
+ * set to a newly allocated message. When the final result is 0,
+ * glusterd_op_modify_op_ctx() is applied to op_ctx before returning.
+ */
+int
+gd_commit_op_phase(glusterd_op_t op, dict_t *op_ctx, dict_t *req_dict,
+                   char **op_errstr, glusterd_op_info_t *txn_opinfo)
+{
+    dict_t *rsp_dict = NULL;
+    int peer_cnt = -1;
+    int ret = -1;
+    char *hostname = NULL;
+    glusterd_peerinfo_t *peerinfo = NULL;
+    xlator_t *this = NULL;
+    glusterd_conf_t *conf = NULL;
+    uuid_t tmp_uuid = {0};
+    char *errstr = NULL;
+    struct syncargs args = {0};
+    int type = GF_QUOTA_OPTION_TYPE_NONE;
+    uint32_t cmd = 0;
+    gf_boolean_t origin_glusterd = _gf_false;
+
+    this = THIS;
+    GF_ASSERT(this);
+    conf = this->private;
+    GF_ASSERT(conf);
+
+    rsp_dict = dict_new();
+    if (!rsp_dict) {
+        gf_smsg(this->name, GF_LOG_ERROR, errno, GD_MSG_DICT_CREATE_FAIL, NULL);
+        ret = -1;
+        goto out;
+    }
+
+    /* Commit on this node before involving any peer */
+    ret = glusterd_op_commit_perform(op, req_dict, op_errstr, rsp_dict);
+    if (ret) {
+        /* The failure is local; commit_done reports it against this name */
+        hostname = "localhost";
+        goto commit_done;
+    }
+
+    if (op == GD_OP_QUOTA) {
+        ret = dict_get_int32(op_ctx, "type", &type);
+        if (ret) {
+            gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_DICT_GET_FAILED,
+                   "Failed to get "
+                   "opcode");
+            goto out;
+        }
+    }
+
+    /* Aggregate the local response for quota list / list-objects, and for
+     * every op other than sync-volume and quota. */
+    if (((op == GD_OP_QUOTA) &&
+         ((type == GF_QUOTA_OPTION_TYPE_LIST) ||
+          (type == GF_QUOTA_OPTION_TYPE_LIST_OBJECTS))) ||
+        ((op != GD_OP_SYNC_VOLUME) && (op != GD_OP_QUOTA))) {
+        ret = glusterd_syncop_aggr_rsp_dict(op, op_ctx, rsp_dict);
+        if (ret) {
+            gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_RESP_AGGR_FAIL, "%s",
+                   "Failed to aggregate "
+                   "response from node/brick");
+            goto out;
+        }
+    }
+
+    /* The local response is not used past this point */
+    dict_unref(rsp_dict);
+    rsp_dict = NULL;
+
+commit_done:
+    if (ret) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_COMMIT_OP_FAIL,
+               LOGSTR_COMMIT_FAIL, gd_op_list[op], hostname,
+               (*op_errstr) ? ":" : " ", (*op_errstr) ? *op_errstr : " ");
+        if (*op_errstr == NULL)
+            gf_asprintf(op_errstr, OPERRSTR_COMMIT_FAIL, hostname);
+        goto out;
+    }
+
+    gd_syncargs_init(&args, op_ctx);
+    ret = synctask_barrier_init((&args));
+    if (ret)
+        goto out;
+
+    peer_cnt = 0;
+    origin_glusterd = is_origin_glusterd(req_dict);
+
+    if (op == GD_OP_STATUS_VOLUME) {
+        ret = dict_get_uint32(req_dict, "cmd", &cmd);
+        if (ret) {
+            gf_smsg(this->name, GF_LOG_ERROR, errno, GD_MSG_DICT_GET_FAILED,
+                    "Key=cmd", NULL);
+            goto out;
+        }
+
+        /* A status request carrying GF_CLI_STATUS_ALL is not forwarded to
+         * peers when this node is the originator: finish successfully. */
+        if (origin_glusterd) {
+            if ((cmd & GF_CLI_STATUS_ALL)) {
+                ret = 0;
+                goto out;
+            }
+        }
+    }
+
+    RCU_READ_LOCK;
+    cds_list_for_each_entry_rcu(peerinfo, &conf->peers, uuid_list)
+    {
+        /* Only send requests to peers who were available before the
+         * transaction started
+         */
+        if (peerinfo->generation > txn_opinfo->txn_generation)
+            continue;
+
+        if (!peerinfo->connected)
+            continue;
+        if (op != GD_OP_SYNC_VOLUME &&
+            peerinfo->state.state != GD_FRIEND_STATE_BEFRIENDED)
+            continue;
+
+        /* The submit result is deliberately ignored; the peer is counted
+         * either way and the barrier wait below is given that count. */
+        (void)gd_syncop_mgmt_commit_op(peerinfo, &args, MY_UUID, tmp_uuid, op,
+                                       req_dict, op_ctx);
+        peer_cnt++;
+    }
+    RCU_READ_UNLOCK;
+
+    if (0 == peer_cnt) {
+        ret = 0;
+        goto out;
+    }
+
+    gd_synctask_barrier_wait((&args), peer_cnt);
+    ret = args.op_ret;
+    /* Prefer the collated peer error; fall back to one left in op_ctx */
+    if (args.errstr)
+        *op_errstr = gf_strdup(args.errstr);
+    else if (dict_get_str(op_ctx, "errstr", &errstr) == 0)
+        *op_errstr = gf_strdup(errstr);
+
+    gf_msg_debug(this->name, 0,
+                 "Sent commit op req for 'Volume %s' "
+                 "to %d peers",
+                 gd_op_list[op], peer_cnt);
+out:
+    if (!ret)
+        glusterd_op_modify_op_ctx(op, op_ctx);
+
+    if (rsp_dict)
+        dict_unref(rsp_dict);
+
+    /* The caller got its own copy above; release the collated original */
+    GF_FREE(args.errstr);
+    args.errstr = NULL;
+
+    return ret;
+}
+
+/*
+ * Unlock phase of a synctask transaction.
+ *
+ * If a lock was acquired (is_acquired), ask the eligible peers to release
+ * it - the cluster lock for peers marked 'locked', or the mgmt_v3 lock when
+ * a volname or the "hold_global_locks" flag is present - wait for their
+ * replies, then clear the op and release the local lock.
+ *
+ * The outcome is reported through *op_ret, never through the return value:
+ * a non-zero *op_ret set by an earlier phase is left alone, otherwise it
+ * takes the unlock result. Pending quorum actions are processed before
+ * returning.
+ *
+ * Always returns 0. req and op_errstr are accepted but not used here.
+ */
+int
+gd_unlock_op_phase(glusterd_conf_t *conf, glusterd_op_t op, int *op_ret,
+                   rpcsvc_request_t *req, dict_t *op_ctx, char *op_errstr,
+                   char *volname, gf_boolean_t is_acquired, uuid_t txn_id,
+                   glusterd_op_info_t *txn_opinfo, gf_boolean_t cluster_lock)
+{
+    glusterd_peerinfo_t *peerinfo = NULL;
+    uuid_t tmp_uuid = {0};
+    int peer_cnt = 0;
+    int ret = -1;
+    xlator_t *this = NULL;
+    struct syncargs args = {0};
+    int32_t global = 0;
+    char *type = NULL;
+
+    this = THIS;
+    GF_ASSERT(this);
+
+    /* If the lock has not been held during this
+     * transaction, do not send unlock requests */
+    if (!is_acquired) {
+        ret = 0;
+        goto out;
+    }
+
+    ret = synctask_barrier_init((&args));
+    if (ret)
+        goto out;
+
+    peer_cnt = 0;
+
+    if (cluster_lock) {
+        RCU_READ_LOCK;
+        cds_list_for_each_entry_rcu(peerinfo, &conf->peers, uuid_list)
+        {
+            /* Only send requests to peers who were available before
+             * the transaction started
+             */
+            if (peerinfo->generation > txn_opinfo->txn_generation)
+                continue;
+
+            if (!peerinfo->connected)
+                continue;
+            if (op != GD_OP_SYNC_VOLUME &&
+                peerinfo->state.state != GD_FRIEND_STATE_BEFRIENDED)
+                continue;
+
+            /* Only unlock peers that were locked */
+            if (peerinfo->locked) {
+                gd_syncop_mgmt_unlock(peerinfo, &args, MY_UUID, tmp_uuid);
+                peer_cnt++;
+            }
+        }
+        RCU_READ_UNLOCK;
+    } else {
+        /* 'type' selects which local mgmt_v3 lock is released at 'out' */
+        ret = dict_get_int32(op_ctx, "hold_global_locks", &global);
+        if (!ret && global)
+            type = "global";
+        else
+            type = "vol";
+        if (volname || global) {
+            RCU_READ_LOCK;
+            cds_list_for_each_entry_rcu(peerinfo, &conf->peers, uuid_list)
+            {
+                /* Only send requests to peers who were
+                 * available before the transaction started
+                 */
+                if (peerinfo->generation > txn_opinfo->txn_generation)
+                    continue;
+
+                if (!peerinfo->connected)
+                    continue;
+                if (op != GD_OP_SYNC_VOLUME &&
+                    peerinfo->state.state != GD_FRIEND_STATE_BEFRIENDED)
+                    continue;
+
+                gd_syncop_mgmt_v3_unlock(op_ctx, peerinfo, &args, MY_UUID,
+                                         tmp_uuid, txn_id);
+                peer_cnt++;
+            }
+            RCU_READ_UNLOCK;
+        }
+    }
+
+    if (0 == peer_cnt) {
+        ret = 0;
+        goto out;
+    }
+
+    gd_synctask_barrier_wait((&args), peer_cnt);
+
+    ret = args.op_ret;
+
+    gf_msg_debug(this->name, 0,
+                 "Sent unlock op req for 'Volume %s' "
+                 "to %d peers. Returning %d",
+                 gd_op_list[op], peer_cnt, ret);
+    if (ret) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_PEER_UNLOCK_FAIL,
+               "Failed to unlock "
+               "on some peer(s)");
+    }
+
+out:
+    /* If unlock failed, and op_ret was previously set
+     * priority is given to the op_ret. If op_ret was
+     * not set, and unlock failed, then set op_ret */
+    if (!*op_ret)
+        *op_ret = ret;
+
+    if (is_acquired) {
+        /* Based on the op-version,
+         * we release the cluster or mgmt_v3 lock
+         * and clear the op */
+
+        glusterd_op_clear_op(op);
+        if (cluster_lock)
+            glusterd_unlock(MY_UUID);
+        else {
+            if (type) {
+                ret = glusterd_mgmt_v3_unlock(volname, MY_UUID, type);
+                if (ret)
+                    gf_msg(this->name, GF_LOG_ERROR, 0,
+                           GD_MSG_MGMTV3_UNLOCK_FAIL,
+                           "Unable to release lock for %s", volname);
+            }
+        }
+    }
+
+    /* Second chance: ret may now hold the local mgmt_v3 unlock result */
+    if (!*op_ret)
+        *op_ret = ret;
+
+    /*
+     * If there are any quorum events while the OP is in progress, process
+     * them.
+     */
+    if (conf->pending_quorum_action)
+        glusterd_do_quorum_action();
+
+    /* Any error text collected into args during the unlock round is not
+     * passed on to anyone; release it (as gd_commit_op_phase does) so it
+     * cannot leak. It is NULL when nothing was collected. */
+    GF_FREE(args.errstr);
+    args.errstr = NULL;
+
+    return 0;
+}
+
+/*
+ * Return the number of glusterd_pending_node_t entries linked on 'bricks'
+ * (0 for an empty list).
+ */
+int
+gd_get_brick_count(struct cds_list_head *bricks)
+{
+    int count = 0;
+    glusterd_pending_node_t *entry = NULL;
+
+    cds_list_for_each_entry(entry, bricks, list)
+    {
+        count += 1;
+    }
+
+    return count;
+}
+
+/*
+ * Brick-op phase of a synctask transaction: select the nodes the operation
+ * applies to and send the brick op to each of them in turn.
+ *
+ * For GD_OP_HEAL_VOLUME the response dict filled by the selection step is
+ * merged into op_ctx first. If a selected rebalance node has no rpc yet, an
+ * attempt is made to create one; if that fails the phase ends successfully
+ * after recording a default defrag response.
+ *
+ * Returns 0 on success and a non-zero value on the first failure. On a
+ * selection failure the text in *op_errstr (if any) is logged.
+ *
+ * Every node placed on the local 'selected' list is freed before returning,
+ * whichever way the function is left.
+ */
+int
+gd_brick_op_phase(glusterd_op_t op, dict_t *op_ctx, dict_t *req_dict,
+                  char **op_errstr)
+{
+    glusterd_pending_node_t *pending_node = NULL;
+    glusterd_pending_node_t *tmp = NULL;
+    struct cds_list_head selected = {
+        0,
+    };
+    xlator_t *this = NULL;
+    int brick_count = 0;
+    int ret = -1;
+    rpc_clnt_t *rpc = NULL;
+    dict_t *rsp_dict = NULL;
+    int32_t cmd = GF_OP_CMD_NONE;
+    glusterd_volinfo_t *volinfo = NULL;
+
+    this = THIS;
+
+    /* Must precede the first 'goto out': the cleanup there walks the list */
+    CDS_INIT_LIST_HEAD(&selected);
+
+    rsp_dict = dict_new();
+    if (!rsp_dict) {
+        gf_smsg(this->name, GF_LOG_ERROR, errno, GD_MSG_DICT_CREATE_FAIL, NULL);
+        ret = -1;
+        goto out;
+    }
+
+    ret = glusterd_op_bricks_select(op, req_dict, op_errstr, &selected,
+                                    rsp_dict);
+    if (ret) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_BRICK_OP_FAIL, "%s",
+               (*op_errstr) ? *op_errstr
+                            : "Brick op failed. Check "
+                              "glusterd log file for more details.");
+        goto out;
+    }
+
+    if (op == GD_OP_HEAL_VOLUME) {
+        ret = glusterd_syncop_aggr_rsp_dict(op, op_ctx, rsp_dict);
+        if (ret)
+            goto out;
+    }
+    dict_unref(rsp_dict);
+    rsp_dict = NULL;
+
+    brick_count = 0;
+    cds_list_for_each_entry_safe(pending_node, tmp, &selected, list)
+    {
+        rpc = glusterd_pending_node_get_rpc(pending_node);
+        /* In the case of rebalance if the rpc object is null, we try to
+         * create the rpc object. if the rebalance daemon is down, it returns
+         * -1. otherwise, rpc object will be created and referenced.
+         */
+        if (!rpc) {
+            if (pending_node->type == GD_NODE_REBALANCE && pending_node->node) {
+                volinfo = pending_node->node;
+                ret = glusterd_rebalance_rpc_create(volinfo);
+                if (ret) {
+                    ret = 0;
+                    glusterd_defrag_volume_node_rsp(req_dict, NULL, op_ctx);
+                    goto out;
+                } else {
+                    rpc = glusterd_defrag_rpc_get(volinfo->rebal.defrag);
+                }
+            } else {
+                ret = -1;
+                gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_RPC_FAILURE,
+                       "Brick Op failed "
+                       "due to rpc failure.");
+                goto out;
+            }
+        }
+
+        ret = gd_syncop_mgmt_brick_op(rpc, pending_node, op, req_dict, op_ctx,
+                                      op_errstr);
+        if (op == GD_OP_STATUS_VOLUME) {
+            /* for client-list its enough to quit the loop
+             * once we get the value from one brick
+             * */
+            /* NOTE(review): this overwrites the brick-op result above for
+             * status ops - confirm that ignoring it is intended. */
+            ret = dict_get_int32(req_dict, "cmd", &cmd);
+            if (!ret && (cmd & GF_CLI_STATUS_CLIENT_LIST)) {
+                if (dict_get(op_ctx, "client-count"))
+                    break;
+            }
+        }
+        if (ret)
+            goto out;
+
+        brick_count++;
+        glusterd_pending_node_put_rpc(pending_node);
+        /* Unlink before freeing so 'selected' never points at freed memory
+         * should a later iteration bail out to the cleanup below. */
+        cds_list_del(&pending_node->list);
+        GF_FREE(pending_node);
+    }
+
+    /* Normal end of loop, or the client-list 'break': no rpc to put back */
+    pending_node = NULL;
+    ret = 0;
+out:
+    if (pending_node && pending_node->node)
+        glusterd_pending_node_put_rpc(pending_node);
+
+    /* Free whatever is still linked on the local list: the node that
+     * triggered an early exit and all nodes that were never visited.
+     * Without this they are leaked when 'selected' goes out of scope. */
+    cds_list_for_each_entry_safe(pending_node, tmp, &selected, list)
+    {
+        cds_list_del(&pending_node->list);
+        GF_FREE(pending_node);
+    }
+
+    if (rsp_dict)
+        dict_unref(rsp_dict);
+    gf_msg_debug(this->name, 0, "Sent op req to %d bricks", brick_count);
+    return ret;
+}
+
+/*
+ * Drive one management operation through the synctask transaction:
+ *
+ *   opcode lookup -> txn id + opinfo -> originator uuid -> local lock
+ *   -> peer lock phase -> payload build -> stage -> brick op -> commit
+ *   -> unlock phase -> CLI response.
+ *
+ * The opcode is read from op_ctx under GD_SYNC_OPCODE_KEY. A cluster lock
+ * is used when the cluster op-version is below GD_OP_VERSION_3_6_0;
+ * otherwise a mgmt_v3 lock is taken on "globalname" if present, else on
+ * "volname" if present, else no lock is held at all.
+ *
+ * Nothing is returned: the outcome (op_ret, op_errno, op_errstr) is sent to
+ * the CLI through glusterd_op_send_cli_response(), on every path.
+ */
+void
+gd_sync_task_begin(dict_t *op_ctx, rpcsvc_request_t *req)
+{
+    int ret = -1;
+    int op_ret = -1;
+    dict_t *req_dict = NULL;
+    glusterd_conf_t *conf = NULL;
+    glusterd_op_t op = GD_OP_NONE;
+    int32_t tmp_op = 0;
+    char *op_errstr = NULL;
+    char *tmp = NULL;
+    char *global = NULL;
+    char *volname = NULL;
+    xlator_t *this = NULL;
+    gf_boolean_t is_acquired = _gf_false;
+    gf_boolean_t is_global = _gf_false;
+    uuid_t *txn_id = NULL;
+    glusterd_op_info_t txn_opinfo = {
+        {0},
+    };
+    uint32_t op_errno = 0;
+    gf_boolean_t cluster_lock = _gf_false;
+    uint32_t timeout = 0;
+
+    this = THIS;
+    GF_ASSERT(this);
+    conf = this->private;
+    GF_ASSERT(conf);
+
+    ret = dict_get_int32(op_ctx, GD_SYNC_OPCODE_KEY, &tmp_op);
+    if (ret) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_DICT_GET_FAILED,
+               "Failed to get volume "
+               "operation");
+        goto out;
+    }
+    op = tmp_op;
+
+    /* Generate a transaction-id for this operation and
+     * save it in the dict */
+    ret = glusterd_generate_txn_id(op_ctx, &txn_id);
+    if (ret) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_TRANS_IDGEN_FAIL,
+               "Failed to generate transaction id");
+        goto out;
+    }
+
+    /* Save opinfo for this transaction with the transaction id */
+    glusterd_txn_opinfo_init(&txn_opinfo, NULL, &op, NULL, NULL);
+    ret = glusterd_set_txn_opinfo(txn_id, &txn_opinfo);
+    /* A failure here is only logged; the transaction carries on */
+    if (ret)
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_TRANS_OPINFO_SET_FAIL,
+               "Unable to set transaction's opinfo");
+
+    gf_msg_debug(this->name, 0, "Transaction ID : %s", uuid_utoa(*txn_id));
+
+    /* Save the MY_UUID as the originator_uuid */
+    ret = glusterd_set_originator_uuid(op_ctx);
+    if (ret) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_UUID_SET_FAIL,
+               "Failed to set originator_uuid.");
+        goto out;
+    }
+
+    if (conf->op_version < GD_OP_VERSION_3_6_0)
+        cluster_lock = _gf_true;
+
+    /* Based on the op_version, acquire a cluster or mgmt_v3 lock */
+    if (cluster_lock) {
+        ret = glusterd_lock(MY_UUID);
+        if (ret) {
+            gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_GLUSTERD_LOCK_FAIL,
+                   "Unable to acquire lock");
+            gf_asprintf(&op_errstr,
+                        "Another transaction is in progress. "
+                        "Please try again after some time.");
+            goto out;
+        }
+    } else {
+        /* Cli will add timeout key to dict if the default timeout is
+         * other than 2 minutes. Here we use this value to check whether
+         * mgmt_v3_lock_timeout should be set to default value or we
+         * need to change the value according to timeout value
+         * i.e, timeout + 120 seconds. */
+        ret = dict_get_uint32(op_ctx, "timeout", &timeout);
+        if (!ret)
+            conf->mgmt_v3_lock_timeout = timeout + 120;
+
+        ret = dict_get_str(op_ctx, "globalname", &global);
+        if (!ret) {
+            is_global = _gf_true;
+            goto global;
+        }
+
+        /* If no volname is given as a part of the command, locks will
+         * not be held */
+        ret = dict_get_str(op_ctx, "volname", &tmp);
+        if (ret) {
+            gf_msg_debug("glusterd", 0,
+                         "Failed to get volume "
+                         "name");
+            goto local_locking_done;
+        } else {
+            /* Use a copy of volname, as cli response will be
+             * sent before the unlock, and the volname in the
+             * dict, might be removed */
+            volname = gf_strdup(tmp);
+            if (!volname) {
+                /* ret is still 0 from the dict_get_str() above; without
+                 * this the CLI would be told the operation succeeded
+                 * although nothing was done. */
+                gf_smsg(this->name, GF_LOG_ERROR, errno, GD_MSG_NO_MEMORY,
+                        NULL);
+                ret = -1;
+                goto out;
+            }
+        }
+
+        ret = glusterd_mgmt_v3_lock(volname, MY_UUID, &op_errno, "vol");
+        if (ret) {
+            gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_MGMTV3_LOCK_GET_FAIL,
+                   "Unable to acquire lock for %s", volname);
+            gf_asprintf(&op_errstr,
+                        "Another transaction is in progress "
+                        "for %s. Please try again after some time.",
+                        volname);
+            goto out;
+        }
+    }
+
+global:
+    if (is_global) {
+        ret = glusterd_mgmt_v3_lock(global, MY_UUID, &op_errno, "global");
+        if (ret) {
+            gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_MGMTV3_LOCK_GET_FAIL,
+                   "Unable to acquire lock for %s", global);
+            gf_asprintf(&op_errstr,
+                        "Another transaction is in progress "
+                        "for %s. Please try again after some time.",
+                        global);
+            is_global = _gf_false;
+            goto out;
+        }
+    }
+
+    /* Reached only with a cluster, volume or global lock actually held */
+    is_acquired = _gf_true;
+
+local_locking_done:
+
+    /* If no volname is given as a part of the command, locks will
+     * not be held */
+    if (volname || cluster_lock || is_global) {
+        ret = gd_lock_op_phase(conf, op, op_ctx, &op_errstr, *txn_id,
+                               &txn_opinfo, cluster_lock);
+        if (ret) {
+            gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_PEER_LOCK_FAIL,
+                   "Locking Peers Failed.");
+            goto out;
+        }
+    }
+
+    ret = glusterd_op_build_payload(&req_dict, &op_errstr, op_ctx);
+    if (ret) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_BRICK_OP_PAYLOAD_BUILD_FAIL,
+               LOGSTR_BUILD_PAYLOAD, gd_op_list[op]);
+        if (op_errstr == NULL)
+            gf_asprintf(&op_errstr, OPERRSTR_BUILD_PAYLOAD);
+        goto out;
+    }
+
+    ret = gd_stage_op_phase(op, op_ctx, req_dict, &op_errstr, &txn_opinfo);
+    if (ret)
+        goto out;
+
+    ret = gd_brick_op_phase(op, op_ctx, req_dict, &op_errstr);
+    if (ret)
+        goto out;
+
+    ret = gd_commit_op_phase(op, op_ctx, req_dict, &op_errstr, &txn_opinfo);
+    if (ret)
+        goto out;
+
+    ret = 0;
+out:
+    op_ret = ret;
+    if (txn_id) {
+        /* The unlock phase is given the name the lock was taken on */
+        if (global)
+            (void)gd_unlock_op_phase(conf, op, &op_ret, req, op_ctx, op_errstr,
+                                     global, is_acquired, *txn_id, &txn_opinfo,
+                                     cluster_lock);
+        else
+            (void)gd_unlock_op_phase(conf, op, &op_ret, req, op_ctx, op_errstr,
+                                     volname, is_acquired, *txn_id, &txn_opinfo,
+                                     cluster_lock);
+
+        /* Clearing the transaction opinfo */
+        ret = glusterd_clear_txn_opinfo(txn_id);
+        if (ret)
+            gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_TRANS_OPINFO_CLEAR_FAIL,
+                   "Unable to clear transaction's "
+                   "opinfo for transaction ID : %s",
+                   uuid_utoa(*txn_id));
+    }
+
+    /* Never report a failure with a zero errno */
+    if (op_ret && (op_errno == 0))
+        op_errno = EG_INTRNL;
+
+    glusterd_op_send_cli_response(op, op_ret, op_errno, req, op_ctx, op_errstr);
+
+    if (volname)
+        GF_FREE(volname);
+
+    if (req_dict)
+        dict_unref(req_dict);
+
+    if (op_errstr) {
+        GF_FREE(op_errstr);
+        op_errstr = NULL;
+    }
+
+    return;
+}
+
+/*
+ * Entry point for running 'op' as a synctask transaction: store the opcode
+ * in 'dict' under GD_SYNC_OPCODE_KEY and call gd_sync_task_begin().
+ *
+ * Returns the dict_set_int32() error if the opcode cannot be stored (the
+ * transaction is then not started), and 0 otherwise. The transaction's own
+ * outcome is not reflected in the return value.
+ */
+int32_t
+glusterd_op_begin_synctask(rpcsvc_request_t *req, glusterd_op_t op, void *dict)
+{
+    int set_ret = dict_set_int32(dict, GD_SYNC_OPCODE_KEY, op);
+
+    if (set_ret != 0) {
+        gf_msg(THIS->name, GF_LOG_ERROR, 0, GD_MSG_DICT_GET_FAILED,
+               "dict set failed for setting operations");
+        return set_ret;
+    }
+
+    gd_sync_task_begin(dict, req);
+
+    return 0;
+}
diff --git a/xlators/mgmt/glusterd/src/glusterd-syncop.h b/xlators/mgmt/glusterd/src/glusterd-syncop.h
new file mode 100644
index 00000000000..a265f2135c6
--- /dev/null
+++ b/xlators/mgmt/glusterd/src/glusterd-syncop.h
@@ -0,0 +1,93 @@
+/*
+   Copyright (c) 2012-2012 Red Hat, Inc. <http://www.redhat.com>
+   This file is part of GlusterFS.
+
+   This file is licensed to you under your choice of the GNU Lesser
+   General Public License, version 3 or any later version (LGPLv3 or
+   later), or the GNU General Public License, version 2 (GPLv2), in all
+   cases as published by the Free Software Foundation.
+*/
+#ifndef __RPC_SYNCOP_H
+#define __RPC_SYNCOP_H
+
+#include <glusterfs/syncop.h>
+#include "glusterd-sm.h"
+#include "glusterd.h"
+
+#define GD_SYNC_OPCODE_KEY "sync-mgmt-operation"
+
+/* gd_syncop_* */
+/*
+ * GD_SYNCOP: submit 'req' on 'rpc' from inside a synctask and, if the submit
+ * succeeded, yield the current task. 'stb' must evaluate to a pointer to
+ * the sync args; its 'task' member is set to the current synctask and, if
+ * the submit fails, its 'errstr' member receives an allocated message naming
+ * (prog)->progname. The glusterd big lock is released around the submit and
+ * the yield, and re-taken before the macro ends.
+ *
+ * 'stb' is parenthesised at every use so that an argument such as &args
+ * expands correctly, and the macro's own locals carry a _gd_ prefix so they
+ * cannot capture identifiers (ret, task, conf) used in the caller's
+ * argument expressions.
+ */
+#define GD_SYNCOP(rpc, stb, cookie, cbk, req, prog, procnum, xdrproc)          \
+    do {                                                                       \
+        int _gd_ret = 0;                                                       \
+        glusterd_conf_t *_gd_conf = THIS->private;                             \
+                                                                               \
+        (stb)->task = synctask_get();                                          \
+                                                                               \
+        /* Release the big lock so that the reply callback is able to          \
+         * take it while this task is waiting. */                              \
+        synclock_unlock(&_gd_conf->big_lock);                                  \
+        _gd_ret = gd_syncop_submit_request(rpc, req, stb, cookie, prog,        \
+                                           procnum, cbk, (xdrproc_t)xdrproc);  \
+        if (!_gd_ret)                                                          \
+            synctask_yield((stb)->task, NULL);                                 \
+        else                                                                   \
+            gf_asprintf(&(stb)->errstr,                                        \
+                        "%s failed. Check log file"                            \
+                        " for more details",                                   \
+                        (prog)->progname);                                     \
+        synclock_lock(&_gd_conf->big_lock);                                    \
+    } while (0)
+
+/*
+ * GD_ALLOC_COPY_UUID: allocate storage for one uuid with GF_MALLOC, store
+ * the pointer in 'dst_ptr' and copy 'uuid' into it. 'ret' is set to 0 on
+ * success and to -1 if the allocation failed (dst_ptr is then NULL).
+ */
+#define GD_ALLOC_COPY_UUID(dst_ptr, uuid, ret)                                 \
+    do {                                                                       \
+        dst_ptr = GF_MALLOC(sizeof(*dst_ptr), gf_common_mt_uuid_t);            \
+        if (!dst_ptr) {                                                        \
+            ret = -1;                                                          \
+        } else {                                                               \
+            gf_uuid_copy(*dst_ptr, uuid);                                      \
+            ret = 0;                                                           \
+        }                                                                      \
+    } while (0)
+
+int32_t
+gd_syncop_brick_op_cbk(struct rpc_req *req, struct iovec *iov, int count,
+                       void *myframe);
+
+int
+gd_syncop_submit_request(struct rpc_clnt *rpc, void *req, void *local,
+                         void *cookie, rpc_clnt_prog_t *prog, int procnum,
+                         fop_cbk_fn_t cbkfn, xdrproc_t xdrproc);
+int
+gd_syncop_mgmt_lock(glusterd_peerinfo_t *peerinfo, struct syncargs *arg,
+                    uuid_t my_uuid, uuid_t recv_uuid);
+
+int
+gd_syncop_mgmt_unlock(glusterd_peerinfo_t *peerinfo, struct syncargs *arg,
+                      uuid_t my_uuid, uuid_t recv_uuid);
+
+int
+gd_syncop_mgmt_stage_op(glusterd_peerinfo_t *peerinfo, struct syncargs *arg,
+                        uuid_t my_uuid, uuid_t recv_uuid, int op,
+                        dict_t *dict_out, dict_t *op_ctx);
+
+int
+gd_syncop_mgmt_commit_op(glusterd_peerinfo_t *peerinfo, struct syncargs *arg,
+                         uuid_t my_uuid, uuid_t recv_uuid, int op,
+                         dict_t *dict_out, dict_t *op_ctx);
+
+void
+gd_synctask_barrier_wait(struct syncargs *args, int count);
+
+int
+gd_brick_op_phase(glusterd_op_t op, dict_t *op_ctx, dict_t *req_dict,
+                  char **op_errstr);
+
+int
+glusterd_syncop_aggr_rsp_dict(glusterd_op_t op, dict_t *aggr, dict_t *rsp);
+
+void
+gd_syncargs_init(struct syncargs *args, dict_t *op_ctx);
+#endif /* __RPC_SYNCOP_H */
diff --git a/xlators/mgmt/glusterd/src/glusterd-tierd-svc-helper.c b/xlators/mgmt/glusterd/src/glusterd-tierd-svc-helper.c
new file mode 100644
index 00000000000..035795b3deb
--- /dev/null
+++ b/xlators/mgmt/glusterd/src/glusterd-tierd-svc-helper.c
@@ -0,0 +1,207 @@
+/*
+   Copyright (c) 2016 Red Hat, Inc. <http://www.redhat.com>
+   This file is part of GlusterFS.
+
+   This file is licensed to you under your choice of the GNU Lesser
+   General Public License, version 3 or any later version (LGPLv3 or
+   later), or the GNU General Public License, version 2 (GPLv2), in all
+   cases as published by the Free Software Foundation.
+*/
+
+#include "glusterd.h"
+#include "glusterd-utils.h"
+#include "glusterd-tierd-svc-helper.h"
+#include "glusterd-messages.h"
+#include <glusterfs/syscall.h>
+#include "glusterd-volgen.h"
+
+/* path = "<GLUSTERD_GET_TIER_DIR result>/run", bounded by path_len bytes. */
+void
+glusterd_svc_build_tierd_rundir(glusterd_volinfo_t *volinfo, char *path,
+                                int path_len)
+{
+    glusterd_conf_t *conf = THIS->private;
+    char tierdir[PATH_MAX] = "";
+
+    /* GLUSTERD_GET_TIER_DIR is defined elsewhere; it is handed tierdir. */
+    GLUSTERD_GET_TIER_DIR(tierdir, volinfo, conf);
+
+    snprintf(path, path_len, "%s/run", tierdir);
+}
+
+/*
+ * Derive the tierd socket path for volinfo: compose
+ * "<tierd rundir>/run-<uuid_utoa(MY_UUID)>" and hand it to
+ * glusterd_set_socket_filepath() together with path and path_len.
+ */
+void
+glusterd_svc_build_tierd_socket_filepath(glusterd_volinfo_t *volinfo,
+                                         char *path, int path_len)
+{
+    char run_path[PATH_MAX] = "";
+    char sock_src[PATH_MAX] = "";
+    int32_t written = 0;
+
+    glusterd_svc_build_tierd_rundir(volinfo, run_path, sizeof(run_path));
+    written = snprintf(sock_src, sizeof(sock_src), "%s/run-%s", run_path,
+                       uuid_utoa(MY_UUID));
+    if (!((written >= 0) && (written < sizeof(sock_src))))
+        sock_src[0] = 0; /* error or truncation: pass an empty string on */
+
+    glusterd_set_socket_filepath(sock_src, path, path_len);
+}
+
+/* Write "<tierd rundir>/<volname>-tierd.pid" into path (path_len bytes). */
+void
+glusterd_svc_build_tierd_pidfile(glusterd_volinfo_t *volinfo, char *path,
+                                 int path_len)
+{
+    char run_path[PATH_MAX] = "";
+
+    /* run_path starts out zero-filled before the helper writes into it. */
+    glusterd_svc_build_tierd_rundir(volinfo, run_path, sizeof(run_path));
+
+    snprintf(path, path_len, "%s/%s-tierd.pid", run_path, volinfo->volname);
+}
+
+/* path = "<GLUSTERD_GET_VOLUME_DIR result>/<volname>-tierd.vol" (path_len). */
+void
+glusterd_svc_build_tierd_volfile_path(glusterd_volinfo_t *volinfo, char *path,
+                                      int path_len)
+{
+    glusterd_conf_t *conf = THIS->private;
+    char voldir[PATH_MAX] = "";
+
+    /* GLUSTERD_GET_VOLUME_DIR is defined elsewhere; it is handed voldir. */
+    GLUSTERD_GET_VOLUME_DIR(voldir, volinfo, conf);
+
+    snprintf(path, path_len, "%s/%s-tierd.vol", voldir, volinfo->volname);
+}
+
+void
+glusterd_svc_build_tierd_logdir(char *logdir, char *volname, size_t len)
+{ /* Writes "<conf->logdir>/tier/<volname>" into logdir, bounded by len. */
+    glusterd_conf_t *conf = THIS->private; /* the only config handle here */
+    snprintf(logdir, len, "%s/tier/%s", conf->logdir, volname);
+}
+
+void
+glusterd_svc_build_tierd_logfile(char *logfile, char *logdir, size_t len)
+{ /* Writes "<logdir>/tierd.log" into logfile, bounded by len bytes. */
+    snprintf(logfile, len, "%s/tierd.log", logdir);
+}
+
+/* Generate a fresh volfile with build_rebalance_volfile() in a mkstemp()
+ * file and compare it with the stored tierd volfile; *identical is filled
+ * in by glusterd_check_files_identical().  Returns 0 on success. */
+int
+glusterd_svc_check_tier_volfile_identical(char *svc_name,
+                                          glusterd_volinfo_t *volinfo,
+                                          gf_boolean_t *identical)
+{
+    char orgvol[PATH_MAX] = "";
+    char *tmpvol = NULL;
+    xlator_t *this = NULL;
+    int ret = -1;
+    int need_unlink = 0;
+    int tmp_fd = -1;
+
+    this = THIS;
+    /* Never read THIS->name while THIS itself is what is being validated. */
+    GF_VALIDATE_OR_GOTO("glusterd", this, out);
+    GF_VALIDATE_OR_GOTO(this->name, identical, out);
+
+    glusterd_svc_build_tierd_volfile_path(volinfo, orgvol, sizeof(orgvol));
+
+    ret = gf_asprintf(&tmpvol, "/tmp/g%s-XXXXXX", svc_name);
+    if (ret < 0)
+        goto out;
+
+    /* coverity[SECURE_TEMP] mkstemp uses 0600 as the mode and is safe */
+    tmp_fd = mkstemp(tmpvol);
+    if (tmp_fd < 0) {
+        gf_msg(this->name, GF_LOG_WARNING, errno, GD_MSG_FILE_OP_FAILED,
+               "Unable to create temp file"
+               " %s:(%s)",
+               tmpvol, strerror(errno));
+        ret = -1;
+        goto out;
+    }
+
+    need_unlink = 1;
+    ret = build_rebalance_volfile(volinfo, tmpvol, NULL);
+    if (ret)
+        goto out;
+
+    ret = glusterd_check_files_identical(orgvol, tmpvol, identical);
+    if (ret)
+        goto out;
+
+out:
+    if (need_unlink)
+        sys_unlink(tmpvol);
+
+    if (tmpvol != NULL)
+        GF_FREE(tmpvol);
+
+    if (tmp_fd >= 0)
+        sys_close(tmp_fd);
+
+    return ret;
+}
+
+/* Topology-only counterpart of the volfile comparison: generate a fresh
+ * volfile in a mkstemp() file and let glusterd_check_topology_identical()
+ * compare it with the stored tierd volfile.  Returns 0 on success. */
+int
+glusterd_svc_check_tier_topology_identical(char *svc_name,
+                                           glusterd_volinfo_t *volinfo,
+                                           gf_boolean_t *identical)
+{
+    char curvol[PATH_MAX] = "";
+    char *newvol = NULL;
+    glusterd_conf_t *conf = NULL;
+    xlator_t *this = THIS;
+    int ret = -1;
+    int newvol_created = 0;
+    int newfd = -1;
+
+    if (!(identical && this && this->private))
+        goto out;
+
+    conf = this->private;
+    GF_VALIDATE_OR_GOTO(this->name, conf, out);
+
+    glusterd_svc_build_tierd_volfile_path(volinfo, curvol, sizeof(curvol));
+
+    ret = gf_asprintf(&newvol, "/tmp/g%s-XXXXXX", svc_name);
+    if (ret < 0)
+        goto out;
+
+    /* coverity[SECURE_TEMP] mkstemp uses 0600 as the mode and is safe */
+    newfd = mkstemp(newvol);
+    if (newfd < 0) {
+        gf_msg(this->name, GF_LOG_WARNING, errno, GD_MSG_FILE_OP_FAILED,
+               "Unable to create temp file"
+               " %s:(%s)",
+               newvol, strerror(errno));
+        ret = -1;
+        goto out;
+    }
+
+    newvol_created = 1; /* the temp file exists now and must be unlinked */
+    ret = build_rebalance_volfile(volinfo, newvol, NULL);
+    if (ret)
+        goto out;
+
+    /* Compare the topology of volfiles */
+    ret = glusterd_check_topology_identical(curvol, newvol, identical);
+out:
+    if (newfd >= 0)
+        sys_close(newfd);
+    if (newvol_created)
+        sys_unlink(newvol);
+    if (newvol)
+        GF_FREE(newvol);
+    return ret;
+}
diff --git a/xlators/mgmt/glusterd/src/glusterd-utils.c b/xlators/mgmt/glusterd/src/glusterd-utils.c
new file mode 100644
index 00000000000..90ef2cf4c9c
--- /dev/null
+++ b/xlators/mgmt/glusterd/src/glusterd-utils.c
@@ -0,0 +1,15046 @@
+/*
+   Copyright (c) 2006-2012 Red Hat, Inc. <http://www.redhat.com>
+   This file is part of GlusterFS.
+
+   This file is licensed to you under your choice of the GNU Lesser
+   General Public License, version 3 or any later version (LGPLv3 or
+   later), or the GNU General Public License, version 2 (GPLv2), in all
+   cases as published by the Free Software Foundation.
+*/
+#include <inttypes.h>
+
+#if defined(GF_LINUX_HOST_OS)
+#include <mntent.h>
+#else
+#include "mntent_compat.h"
+#endif
+#include <dlfcn.h>
+#if (HAVE_LIB_XML)
+#include <libxml/encoding.h>
+#include <libxml/xmlwriter.h>
+#endif
+
+#include <glusterfs/glusterfs.h>
+#include <glusterfs/compat.h>
+#include <glusterfs/dict.h>
+#include <glusterfs/xlator.h>
+#include <glusterfs/logging.h>
+#include "glusterd-messages.h"
+#include <glusterfs/timer.h>
+#include <glusterfs/defaults.h>
+#include <glusterfs/compat.h>
+#include <glusterfs/syncop.h>
+#include <glusterfs/run.h>
+#include <glusterfs/compat-errno.h>
+#include <glusterfs/statedump.h>
+#include <glusterfs/syscall.h>
+#include "glusterd-mem-types.h"
+#include "glusterd.h"
+#include "glusterd-op-sm.h"
+#include "glusterd-geo-rep.h"
+#include "glusterd-sm.h"
+#include "glusterd-utils.h"
+#include "glusterd-store.h"
+#include "glusterd-volgen.h"
+#include "glusterd-pmap.h"
+#include <glusterfs/glusterfs-acl.h>
+#include "glusterd-syncop.h"
+#include "glusterd-mgmt.h"
+#include "glusterd-locks.h"
+#include "glusterd-messages.h"
+#include "glusterd-volgen.h"
+#include "glusterd-snapshot-utils.h"
+#include "glusterd-svc-mgmt.h"
+#include "glusterd-svc-helper.h"
+#include "glusterd-shd-svc.h"
+#include "glusterd-quotad-svc.h"
+#include "glusterd-snapd-svc.h"
+#include "glusterd-bitd-svc.h"
+#include "glusterd-gfproxyd-svc.h"
+#include "glusterd-server-quorum.h"
+#include <glusterfs/quota-common-utils.h>
+#include <glusterfs/common-utils.h>
+#include "glusterd-shd-svc-helper.h"
+
+#include "xdr-generic.h"
+#include <sys/resource.h>
+#include <inttypes.h>
+#include <signal.h>
+#include <sys/types.h>
+#include <sys/ioctl.h>
+#include <sys/socket.h>
+#include <rpc/pmap_clnt.h>
+#include <unistd.h>
+#include <fnmatch.h>
+#include <sys/statvfs.h>
+#include <ifaddrs.h>
+
+#ifdef GF_SOLARIS_HOST_OS
+#include <sys/sockio.h>
+#endif
+
+#ifdef __FreeBSD__
+#include <sys/sysctl.h>
+#include <sys/param.h>
+#include <sys/queue.h>
+#include <libprocstat.h>
+#include <libutil.h>
+#endif
+
+#define NFS_PROGRAM 100003
+#define NFSV3_VERSION 3
+
+#define MOUNT_PROGRAM 100005
+#define MOUNTV3_VERSION 3
+#define MOUNTV1_VERSION 1
+
+#define NLM_PROGRAM 100021
+#define NLMV4_VERSION 4
+#define NLMV1_VERSION 1
+
+#ifdef BUILD_GNFS
+/* Set pidfile to "<priv->rundir>/nfs/nfs.pid", or "" if it does not fit. */
+#define GLUSTERD_GET_NFS_PIDFILE(pidfile, priv)                                \
+    do {                                                                       \
+        int32_t _nfs_pid_len = snprintf(pidfile, PATH_MAX, "%s/nfs/nfs.pid",   \
+                                        priv->rundir);                         \
+        if ((_nfs_pid_len < 0) || (_nfs_pid_len >= PATH_MAX)) {                \
+            pidfile[0] = 0;                                                    \
+        }                                                                      \
+    } while (0)
+#endif
+
+/* Set pidfile to "<priv->rundir>/quotad/quotad.pid", or "" if too long. */
+#define GLUSTERD_GET_QUOTAD_PIDFILE(pidfile, priv)                             \
+    do {                                                                       \
+        int32_t _quotad_pid_len = snprintf(                                    \
+            pidfile, PATH_MAX, "%s/quotad/quotad.pid", priv->rundir);          \
+        if ((_quotad_pid_len < 0) || (_quotad_pid_len >= PATH_MAX)) {          \
+            pidfile[0] = 0;                                                    \
+        }                                                                      \
+    } while (0)
+
+/*
+ * Report whether the GLUSTERD_BRICK_MULTIPLEX_KEY option in the glusterd
+ * opts dict is set to a true value.  A missing key, or a value that
+ * gf_string2boolean() rejects, yields _gf_false.
+ */
+gf_boolean_t
+is_brick_mx_enabled(void)
+{
+    glusterd_conf_t *conf = THIS->private;
+    gf_boolean_t mux_on = _gf_false;
+    char *opt = NULL;
+
+    if (dict_get_strn(conf->opts, GLUSTERD_BRICK_MULTIPLEX_KEY,
+                      SLEN(GLUSTERD_BRICK_MULTIPLEX_KEY), &opt))
+        return _gf_false;
+
+    if (gf_string2boolean(opt, &mux_on))
+        return _gf_false;
+
+    return mux_on;
+}
+
+/*
+ * Store in *mux_limit how many bricks one brick process may host: 1 when
+ * brick multiplexing is off, otherwise GLUSTERD_BRICKMUX_LIMIT_KEY from
+ * priv->opts (or the default value) parsed as an int.  *mux_limit is always
+ * written; it is 0 when this/priv fail validation.  Returns 0 on success.
+ */
+int
+get_mux_limit_per_process(int *mux_limit)
+{
+    xlator_t *this = THIS;
+    glusterd_conf_t *priv = NULL;
+    char *limit_str = NULL;
+    int limit = 0;
+    int ret = -1;
+
+    GF_VALIDATE_OR_GOTO("glusterd", this, out);
+
+    priv = this->private;
+    GF_VALIDATE_OR_GOTO(this->name, priv, out);
+
+    if (!is_brick_mx_enabled()) {
+        limit = 1;
+        ret = 0;
+        goto out;
+    }
+
+    if (dict_get_strn(priv->opts, GLUSTERD_BRICKMUX_LIMIT_KEY,
+                      SLEN(GLUSTERD_BRICKMUX_LIMIT_KEY), &limit_str))
+        limit_str = GLUSTERD_BRICKMUX_LIMIT_DFLT_VALUE;
+    ret = gf_string2int(limit_str, &limit);
+out:
+    *mux_limit = limit;
+
+    gf_msg_debug("glusterd", 0, "Mux limit set to %d bricks per process",
+                 *mux_limit);
+
+    return ret;
+}
+
+/*
+ * Store in *thread_limit the per-thread volume limit: 1 when brick
+ * multiplexing is off, otherwise GLUSTERD_VOL_CNT_PER_THRD from priv->opts
+ * (or its default value) parsed as an int.  *thread_limit is always
+ * written; it is 0 when this/priv fail validation.  Returns 0 on success.
+ */
+int
+get_gd_vol_thread_limit(int *thread_limit)
+{
+    xlator_t *this = THIS;
+    glusterd_conf_t *priv = NULL;
+    char *limit_str = NULL;
+    int limit = 0;
+    int ret = -1;
+
+    GF_VALIDATE_OR_GOTO("glusterd", this, out);
+
+    priv = this->private;
+    GF_VALIDATE_OR_GOTO(this->name, priv, out);
+
+    if (!is_brick_mx_enabled()) {
+        limit = 1;
+        ret = 0;
+        goto out;
+    }
+
+    if (dict_get_strn(priv->opts, GLUSTERD_VOL_CNT_PER_THRD,
+                      SLEN(GLUSTERD_VOL_CNT_PER_THRD), &limit_str))
+        limit_str = GLUSTERD_VOL_CNT_PER_THRD_DEFAULT_VALUE;
+    ret = gf_string2int(limit_str, &limit);
+out:
+    *thread_limit = limit;
+
+    gf_msg_debug("glusterd", 0,
+                 "Per Thread volume limit set to %d glusterd to populate dict "
+                 "data parallel",
+                 *thread_limit);
+
+    return ret;
+}
+
+extern struct volopt_map_entry glusterd_volopt_map[];
+extern glusterd_all_vol_opts valid_all_vol_opts[];
+
+static glusterd_lock_t lock;
+
+/* Call fn for each brick of volinfo; return its first non-zero result. */
+static int
+_brick_for_each(glusterd_volinfo_t *volinfo, dict_t *mod_dict, void *data,
+                int (*fn)(glusterd_volinfo_t *, glusterd_brickinfo_t *,
+                          dict_t *mod_dict, void *))
+{
+    xlator_t *this = THIS;
+    glusterd_brickinfo_t *brick = NULL;
+    int ret = 0;
+
+    cds_list_for_each_entry(brick, &volinfo->bricks, brick_list)
+    {
+        gf_msg_debug(this->name, 0, "Found a brick - %s:%s", brick->hostname,
+                     brick->path);
+        ret = fn(volinfo, brick, mod_dict, data);
+        if (ret)
+            break; /* the loop is the last thing before the return */
+    }
+    return ret;
+}
+
+/* For every brick of volinfo hosted on this node, count how many local
+ * bricks of the same volume (itself included) report the same statfs_fsid
+ * and store that in fs_share_count.  Every local brick is compared with
+ * every brick of the volume, so the cost is O(n^2) in the brick count. */
+static void
+gd_set_shared_brick_count(glusterd_volinfo_t *volinfo)
+{
+    glusterd_brickinfo_t *local = NULL;
+    glusterd_brickinfo_t *peer = NULL;
+
+    cds_list_for_each_entry(local, &volinfo->bricks, brick_list)
+    {
+        if (gf_uuid_compare(local->uuid, MY_UUID) != 0)
+            continue; /* uuid differs from MY_UUID: not a local brick */
+
+        local->fs_share_count = 0;
+        cds_list_for_each_entry(peer, &volinfo->bricks, brick_list)
+        {
+            if (peer->statfs_fsid != local->statfs_fsid)
+                continue;
+            if (gf_uuid_compare(peer->uuid, MY_UUID) == 0)
+                local->fs_share_count++;
+        }
+    }
+}
+
+/* Refresh the shared-brick counts of volinfo, then run fn on each brick. */
+int
+glusterd_volume_brick_for_each(glusterd_volinfo_t *volinfo, void *data,
+                               int (*fn)(glusterd_volinfo_t *,
+                                         glusterd_brickinfo_t *,
+                                         dict_t *mod_dict, void *))
+{
+    gd_set_shared_brick_count(volinfo);
+    return _brick_for_each(volinfo, NULL, data, fn); /* mod_dict is NULL */
+}
+
+int32_t
+glusterd_get_lock_owner(uuid_t *uuid)
+{ /* Copies the file-local lock.owner into *uuid; always returns 0. */
+    gf_uuid_copy(*uuid, lock.owner);
+    return 0;
+}
+
+static int32_t
+glusterd_set_lock_owner(uuid_t owner)
+{ /* Records owner in the file-local lock.owner; always returns 0. */
+    gf_uuid_copy(lock.owner, owner);
+    // TODO: set timestamp
+    return 0;
+}
+
+static int32_t
+glusterd_unset_lock_owner(uuid_t owner)
+{ /* Clears lock.owner; the owner argument is not used. Always returns 0. */
+    gf_uuid_clear(lock.owner);
+    // TODO: set timestamp
+    return 0;
+}
+
+/* Probe the FUSE device node (/dev/puffs on NetBSD, /dev/fuse elsewhere):
+ * _gf_true only if it opens read-write and then closes without error. */
+gf_boolean_t
+glusterd_is_fuse_available()
+{
+#ifdef __NetBSD__
+    const char *fuse_dev = "/dev/puffs";
+#else
+    const char *fuse_dev = "/dev/fuse";
+#endif
+    int fd = open(fuse_dev, O_RDWR);
+
+    if (fd < 0)
+        return _gf_false;
+
+    return sys_close(fd) ? _gf_false : _gf_true;
+}
+
+/* Take the cluster lock on behalf of uuid.  Returns -1 (after logging the
+ * current holder) if an owner is already recorded, otherwise 0. */
+int32_t
+glusterd_lock(uuid_t uuid)
+{
+    uuid_t owner;
+    char new_owner_str[50] = "";
+    char owner_str[50] = "";
+    int ret = -1;
+    xlator_t *this = THIS;
+
+    GF_ASSERT(this);
+    GF_ASSERT(uuid);
+
+    glusterd_get_lock_owner(&owner);
+
+    if (!gf_uuid_is_null(owner)) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_GLUSTERD_LOCK_FAIL,
+               "Unable to get lock"
+               " for uuid: %s, lock held by: %s",
+               uuid_utoa_r(uuid, new_owner_str), uuid_utoa_r(owner, owner_str));
+        goto out;
+    }
+
+    ret = glusterd_set_lock_owner(uuid);
+
+    if (ret == 0) {
+        gf_msg_debug(this->name, 0,
+                     "Cluster lock held by"
+                     " %s",
+                     uuid_utoa(uuid));
+    }
+
+out:
+    return ret;
+}
+
+/* Release the cluster lock held by uuid.  Fails (non-zero) when no owner is
+ * recorded or when the recorded owner differs from uuid; returns 0 once the
+ * owner has been cleared. */
+int32_t
+glusterd_unlock(uuid_t uuid)
+{
+    uuid_t owner;
+    char new_owner_str[50] = "";
+    char owner_str[50] = "";
+    int32_t ret = -1;
+    xlator_t *this = THIS;
+
+    GF_ASSERT(this);
+    GF_ASSERT(uuid);
+
+    glusterd_get_lock_owner(&owner);
+
+    if (gf_uuid_is_null(owner)) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_GLUSTERD_LOCK_FAIL,
+               "Cluster lock not held!");
+        goto out;
+    }
+
+    ret = gf_uuid_compare(uuid, owner);
+
+    if (ret != 0) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_GLUSTERD_LOCK_FAIL,
+               "Cluster lock held by %s ,"
+               "unlock req from %s!",
+               uuid_utoa_r(owner, owner_str), uuid_utoa_r(uuid, new_owner_str));
+        goto out;
+    }
+
+    ret = glusterd_unset_lock_owner(uuid);
+    if (ret != 0) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_GLUSTERD_UNLOCK_FAIL,
+               "Unable to clear cluster "
+               "lock");
+        goto out;
+    }
+
+    ret = 0;
+
+out:
+    return ret;
+}
+
+/* Copy MY_UUID into *uuid.  Always returns 0; priv is fetched only so that
+ * it can be asserted to be non-NULL. */
+int
+glusterd_get_uuid(uuid_t *uuid)
+{
+    glusterd_conf_t *priv = THIS->private;
+
+    GF_ASSERT(priv);
+
+    gf_uuid_copy(*uuid, MY_UUID);
+
+    return 0;
+}
+
+/* Serialize req (when given) with xdrproc into a fresh iobuf and submit it
+ * on rpc as procedure procnum of prog; cbkfn is passed on as the callback.
+ * Returns -1 only for failures before submission; once rpc_clnt_submit() has
+ * been called the result is always 0 (see the comment above "ret = 0"). */
+int
+glusterd_submit_request(struct rpc_clnt *rpc, void *req, call_frame_t *frame,
+                        rpc_clnt_prog_t *prog, int procnum,
+                        struct iobref *iobref, xlator_t *this,
+                        fop_cbk_fn_t cbkfn, xdrproc_t xdrproc)
+{
+    char new_iobref = 0;
+    int ret = -1;
+    int count = 0;
+    ssize_t req_size = 0;
+    struct iobuf *iobuf = NULL;
+    struct iovec iov = {0};
+
+    GF_ASSERT(rpc);
+    GF_ASSERT(this);
+
+    if (req) {
+        req_size = xdr_sizeof(xdrproc, req);
+        iobuf = iobuf_get2(this->ctx->iobuf_pool, req_size);
+        if (!iobuf)
+            goto out;
+
+        if (!iobref) {
+            iobref = iobref_new();
+            if (!iobref) {
+                gf_smsg("glusterd", GF_LOG_ERROR, ENOMEM, GD_MSG_NO_MEMORY,
+                        NULL);
+                goto out;
+            }
+
+            new_iobref = 1;
+        }
+
+        iobref_add(iobref, iobuf);
+
+        iov.iov_base = iobuf->ptr;
+        iov.iov_len = iobuf_pagesize(iobuf);
+
+        /* Create the xdr payload */
+        ret = xdr_serialize_generic(iov, req, xdrproc);
+        if (ret == -1)
+            goto out;
+        iov.iov_len = ret;
+        count = 1;
+    }
+
+    /* Send the msg */
+    rpc_clnt_submit(rpc, prog, procnum, cbkfn, &iov, count, NULL, 0, iobref,
+                    frame, NULL, 0, NULL, 0, NULL);
+
+    /* Unconditionally set ret to 0 here. This is to guard against a double
+     * STACK_DESTROY in case of a failure in rpc_clnt_submit AFTER the
+     * request is sent over the wire: once in the callback function of the
+     * request and once in the error codepath of some of the callers of
+     * glusterd_submit_request().
+     */
+    ret = 0;
+out:
+    if (new_iobref)
+        iobref_unref(iobref);
+
+    if (iobuf) /* still NULL when req is NULL or iobuf_get2() failed */
+        iobuf_unref(iobuf);
+
+    return ret;
+}
+
+/* Encode arg with xdrproc into a newly obtained iobuf described by *outmsg.
+ * Returns the iobuf, with outmsg->iov_len set to the encoded size, or NULL
+ * when no iobuf could be obtained or encoding failed. */
+struct iobuf *
+glusterd_serialize_reply(rpcsvc_request_t *req, void *arg, struct iovec *outmsg,
+                         xdrproc_t xdrproc)
+{
+    struct iobuf *iob = NULL;
+    ssize_t retlen = -1;
+    ssize_t rsp_size = 0;
+
+    /* First, get the io buffer into which the reply in arg will be
+     * serialized. */
+    rsp_size = xdr_sizeof(xdrproc, arg);
+    iob = iobuf_get2(req->svc->ctx->iobuf_pool, rsp_size);
+    if (!iob) {
+        gf_msg("glusterd", GF_LOG_ERROR, ENOMEM, GD_MSG_NO_MEMORY,
+               "Failed to get iobuf");
+        goto ret;
+    }
+
+    iobuf_to_iovec(iob, outmsg);
+    /* Use the given serializer to translate the given C structure in arg
+     * to XDR format which will be written into the buffer in outmsg. */
+    /* retlen is signed (ssize_t, not size_t) so that the -1 reported on an
+     * encoding failure can be detected. */
+    retlen = xdr_serialize_generic(*outmsg, arg, xdrproc);
+    if (retlen == -1) {
+        gf_msg("glusterd", GF_LOG_ERROR, 0, GD_MSG_ENCODE_FAIL,
+               "Failed to encode message");
+        goto ret;
+    }
+
+    outmsg->iov_len = retlen;
+ret:
+    if (iob && (retlen == -1)) { /* nothing to drop if allocation failed */
+        iobuf_unref(iob);
+        iob = NULL;
+    }
+
+    return iob;
+}
+
+/* Serialize arg with xdrproc and submit it, together with payload, as the
+ * reply to req; a temporary iobref is created when the caller passes none.
+ * Returns -1 if req is NULL, the iobref cannot be created or
+ * rpcsvc_submit_generic() reports -1; returns 0 otherwise. */
+int
+glusterd_submit_reply(rpcsvc_request_t *req, void *arg, struct iovec *payload,
+                      int payloadcount, struct iobref *iobref,
+                      xdrproc_t xdrproc)
+{
+    struct iobuf *iob = NULL;
+    int ret = -1;
+    struct iovec rsp = {0};
+    char new_iobref = 0;
+
+    if (!req) {
+        GF_ASSERT(req);
+        goto out;
+    }
+
+    if (!iobref) {
+        iobref = iobref_new();
+        if (!iobref) {
+            gf_msg("glusterd", GF_LOG_ERROR, ENOMEM, GD_MSG_NO_MEMORY,
+                   "out of memory");
+            goto out;
+        }
+
+        new_iobref = 1;
+    }
+
+    iob = glusterd_serialize_reply(req, arg, &rsp, xdrproc);
+    if (iob) {
+        iobref_add(iobref, iob);
+    } else {
+        gf_msg("glusterd", GF_LOG_ERROR, 0, GD_MSG_SERIALIZE_MSG_FAIL,
+               "Failed to serialize reply");
+    }
+
+    ret = rpcsvc_submit_generic(req, &rsp, 1, payload, payloadcount, iobref);
+
+    /* The message has been handed to the RPC layer, which is hoped to have
+     * taken its own reference on iob when queueing it; our local reference
+     * is dropped at "out" below.
+     */
+    if (ret == -1) {
+        gf_msg("glusterd", GF_LOG_ERROR, 0, GD_MSG_REPLY_SUBMIT_FAIL,
+               "Reply submission failed");
+        goto out;
+    }
+
+    ret = 0;
+out:
+    if (new_iobref)
+        iobref_unref(iobref);
+
+    if (iob)
+        iobuf_unref(iob);
+    return ret;
+}
+
+/* Drop one reference on volinfo while holding conf->volume_lock and the
+ * volinfo's own reflock.  When the count reaches zero the volinfo is
+ * deleted and NULL is returned; otherwise volinfo itself is returned. */
+glusterd_volinfo_t *
+glusterd_volinfo_unref(glusterd_volinfo_t *volinfo)
+{
+    glusterd_conf_t *conf = THIS->private;
+    int remaining = -1;
+
+    pthread_mutex_lock(&conf->volume_lock);
+    pthread_mutex_lock(&volinfo->reflock);
+    remaining = --volinfo->refcnt;
+    pthread_mutex_unlock(&volinfo->reflock);
+    pthread_mutex_unlock(&conf->volume_lock);
+
+    if (remaining != 0)
+        return volinfo;
+
+    /* Count hit zero: delete after both locks have been released. */
+    glusterd_volinfo_delete(volinfo);
+    return NULL;
+}
+
+/* Take one more reference on volinfo, incrementing refcnt under its
+ * reflock.  Returns volinfo so that calls can be chained. */
+glusterd_volinfo_t *
+glusterd_volinfo_ref(glusterd_volinfo_t *volinfo)
+{
+    pthread_mutex_lock(&volinfo->reflock);
+    volinfo->refcnt++;
+    pthread_mutex_unlock(&volinfo->reflock);
+
+    return volinfo;
+}
+
+/* Allocate a volinfo, initialise its lock, list heads, dicts, per-volume
+ * service objects and mutexes, and return it through *volinfo after taking
+ * a reference with glusterd_volinfo_ref().  Returns 0 on success, else -1. */
+int32_t
+glusterd_volinfo_new(glusterd_volinfo_t **volinfo)
+{
+    glusterd_volinfo_t *vol = NULL;
+    int32_t ret = -1;
+
+    GF_ASSERT(volinfo);
+
+    vol = GF_CALLOC(1, sizeof(*vol), gf_gld_mt_glusterd_volinfo_t);
+    if (vol == NULL)
+        goto out;
+
+    LOCK_INIT(&vol->lock);
+    CDS_INIT_LIST_HEAD(&vol->vol_list);
+    CDS_INIT_LIST_HEAD(&vol->snapvol_list);
+    CDS_INIT_LIST_HEAD(&vol->bricks);
+    CDS_INIT_LIST_HEAD(&vol->ta_bricks);
+    CDS_INIT_LIST_HEAD(&vol->snap_volumes);
+
+    vol->dict = dict_new();
+    if (vol->dict == NULL) {
+        gf_smsg("glusterd", GF_LOG_ERROR, errno, GD_MSG_DICT_CREATE_FAIL, NULL);
+        GF_FREE(vol);
+        goto out;
+    }
+
+    vol->gsync_slaves = dict_new();
+    if (vol->gsync_slaves == NULL) {
+        gf_smsg("glusterd", GF_LOG_ERROR, errno, GD_MSG_DICT_CREATE_FAIL, NULL);
+        dict_unref(vol->dict);
+        GF_FREE(vol);
+        goto out;
+    }
+
+    vol->gsync_active_slaves = dict_new();
+    if (vol->gsync_active_slaves == NULL) {
+        gf_smsg("glusterd", GF_LOG_ERROR, errno, GD_MSG_DICT_CREATE_FAIL, NULL);
+        dict_unref(vol->dict);
+        dict_unref(vol->gsync_slaves);
+        GF_FREE(vol);
+        goto out;
+    }
+
+    snprintf(vol->parent_volname, GD_VOLUME_NAME_MAX, "N/A");
+
+    vol->snap_max_hard_limit = GLUSTERD_SNAPS_MAX_HARD_LIMIT;
+
+    vol->xl = THIS;
+
+    glusterd_snapdsvc_build(&vol->snapd.svc);
+    glusterd_gfproxydsvc_build(&vol->gfproxyd.svc);
+    glusterd_shdsvc_build(&vol->shd.svc);
+
+    pthread_mutex_init(&vol->store_volinfo_lock, NULL);
+    pthread_mutex_init(&vol->reflock, NULL);
+
+    *volinfo = glusterd_volinfo_ref(vol);
+
+    ret = 0;
+
+out:
+    gf_msg_debug(THIS->name, 0, "Returning %d", ret);
+    return ret;
+}
+
+/* Create a new volinfo and copy volinfo's scalar settings and its dict,
+ * gsync_slaves and gsync_active_slaves dictionaries into it.
+ *
+ * @param volinfo       volinfo which will be duplicated
+ * @param dup_volinfo   set to the newly created volinfo on success
+ * @param set_userauth  if true, the auth username/password are set as well
+ *
+ * @return 0 on success else -1
+ */
+int32_t
+glusterd_volinfo_dup(glusterd_volinfo_t *volinfo,
+                     glusterd_volinfo_t **dup_volinfo,
+                     gf_boolean_t set_userauth)
+{
+    int32_t ret = -1;
+    xlator_t *this = NULL;
+    glusterd_volinfo_t *new_volinfo = NULL;
+
+    this = THIS;
+    GF_ASSERT(this);
+    GF_VALIDATE_OR_GOTO(this->name, volinfo, out);
+    GF_VALIDATE_OR_GOTO(this->name, dup_volinfo, out);
+
+    ret = glusterd_volinfo_new(&new_volinfo);
+    if (ret) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_VOLINFO_SET_FAIL,
+               "not able to create the "
+               "duplicate volinfo for the volume %s",
+               volinfo->volname);
+        goto out;
+    }
+    /* Only counts and settings are copied here; no brick list is built. */
+    new_volinfo->type = volinfo->type;
+    new_volinfo->replica_count = volinfo->replica_count;
+    new_volinfo->arbiter_count = volinfo->arbiter_count;
+    new_volinfo->stripe_count = volinfo->stripe_count;
+    new_volinfo->disperse_count = volinfo->disperse_count;
+    new_volinfo->redundancy_count = volinfo->redundancy_count;
+    new_volinfo->dist_leaf_count = volinfo->dist_leaf_count;
+    new_volinfo->sub_count = volinfo->sub_count;
+    new_volinfo->subvol_count = volinfo->subvol_count;
+    new_volinfo->transport_type = volinfo->transport_type;
+    new_volinfo->brick_count = volinfo->brick_count;
+    new_volinfo->quota_conf_version = volinfo->quota_conf_version;
+    new_volinfo->quota_xattr_version = volinfo->quota_xattr_version;
+    new_volinfo->snap_max_hard_limit = volinfo->snap_max_hard_limit;
+    new_volinfo->quota_conf_cksum = volinfo->quota_conf_cksum;
+
+    dict_copy(volinfo->dict, new_volinfo->dict);
+    dict_copy(volinfo->gsync_slaves, new_volinfo->gsync_slaves);
+    dict_copy(volinfo->gsync_active_slaves, new_volinfo->gsync_active_slaves);
+    gd_update_volume_op_versions(new_volinfo);
+
+    if (set_userauth) {
+        glusterd_auth_set_username(new_volinfo, volinfo->auth.username);
+        glusterd_auth_set_password(new_volinfo, volinfo->auth.password);
+    }
+
+    *dup_volinfo = new_volinfo;
+    ret = 0;
+out:
+    if (ret && (NULL != new_volinfo)) {
+        (void)glusterd_volinfo_delete(new_volinfo);
+    }
+    return ret;
+}
+
+/* Copy identity, path, port and status fields of brickinfo to dup_brickinfo.
+ *
+ * @param brickinfo     Source brickinfo
+ * @param dup_brickinfo Destination brickinfo, supplied by the caller
+ *
+ * @return 0 on success, non-zero on failure
+ */
+int32_t
+glusterd_brickinfo_dup(glusterd_brickinfo_t *brickinfo,
+                       glusterd_brickinfo_t *dup_brickinfo)
+{
+    int32_t ret = -1;
+    xlator_t *this = NULL;
+
+    this = THIS;
+    GF_ASSERT(this);
+
+    GF_VALIDATE_OR_GOTO(this->name, brickinfo, out);
+    GF_VALIDATE_OR_GOTO(this->name, dup_brickinfo, out);
+    /* NOTE(review): unbounded strcpy(); assumes both structs share sizes. */
+    strcpy(dup_brickinfo->hostname, brickinfo->hostname);
+    strcpy(dup_brickinfo->path, brickinfo->path);
+    strcpy(dup_brickinfo->real_path, brickinfo->real_path);
+    strcpy(dup_brickinfo->device_path, brickinfo->device_path);
+    strcpy(dup_brickinfo->fstype, brickinfo->fstype);
+    strcpy(dup_brickinfo->mnt_opts, brickinfo->mnt_opts);
+    ret = gf_canonicalize_path(dup_brickinfo->path);
+    if (ret) {
+        gf_msg(THIS->name, GF_LOG_ERROR, 0, GD_MSG_CANONICALIZE_FAIL,
+               "Failed to canonicalize "
+               "brick path");
+        goto out;
+    }
+    gf_uuid_copy(dup_brickinfo->uuid, brickinfo->uuid);
+
+    dup_brickinfo->port = brickinfo->port;
+    dup_brickinfo->rdma_port = brickinfo->rdma_port;
+    if (NULL != brickinfo->logfile) {
+        dup_brickinfo->logfile = gf_strdup(brickinfo->logfile);
+        if (NULL == dup_brickinfo->logfile) {
+            ret = -1;
+            goto out;
+        }
+    }
+    strcpy(dup_brickinfo->brick_id, brickinfo->brick_id);
+    strcpy(dup_brickinfo->mount_dir, brickinfo->mount_dir);
+    dup_brickinfo->status = brickinfo->status;
+    dup_brickinfo->snap_status = brickinfo->snap_status;
+out:
+    return ret;
+}
+
+/*
+ * gd_vol_is_geo_rep_active:
+ *      Tell whether the given volume has a running geo-rep session, i.e.
+ *      whether its gsync_active_slaves dict exists and is non-empty.
+ *
+ * Return Value:
+ *      _gf_true : at least one running geo-rep session.
+ *      _gf_false: no running geo-rep session.
+ */
+
+gf_boolean_t
+gd_vol_is_geo_rep_active(glusterd_volinfo_t *volinfo)
+{
+    GF_ASSERT(volinfo);
+
+    if (!volinfo->gsync_active_slaves)
+        return _gf_false;
+
+    if (volinfo->gsync_active_slaves->count > 0)
+        return _gf_true;
+    return _gf_false;
+}
+
+/* Free the username and password strings stored in volinfo->auth. */
+void
+glusterd_auth_cleanup(glusterd_volinfo_t *volinfo)
+{
+    GF_ASSERT(volinfo);
+    /* NOTE(review): nothing here resets the pointers unless GF_FREE does. */
+    GF_FREE(volinfo->auth.username);
+    GF_FREE(volinfo->auth.password);
+}
+
+/* Return the stored auth username pointer itself, not a copy. */
+char *
+glusterd_auth_get_username(glusterd_volinfo_t *volinfo)
+{
+    GF_ASSERT(volinfo);
+    return volinfo->auth.username;
+}
+
+/* Return the stored auth password pointer itself, not a copy. */
+char *
+glusterd_auth_get_password(glusterd_volinfo_t *volinfo)
+{
+    GF_ASSERT(volinfo);
+    return volinfo->auth.password;
+}
+
+int32_t
+glusterd_auth_set_username(glusterd_volinfo_t *volinfo, char *username)
+{
+    GF_ASSERT(volinfo);
+    GF_ASSERT(username);
+    /* Stores a gf_strdup() copy; the result is not checked, returns 0. */
+    volinfo->auth.username = gf_strdup(username);
+    return 0;
+}
+
+int32_t
+glusterd_auth_set_password(glusterd_volinfo_t *volinfo, char *password)
+{
+    GF_ASSERT(volinfo);
+    GF_ASSERT(password);
+    /* Stores a gf_strdup() copy; the result is not checked, returns 0. */
+    volinfo->auth.password = gf_strdup(password);
+    return 0;
+}
+
+/* Unlink brickinfo from its brick list, destroy its store handle and free
+ * its logfile string and the brickinfo itself.  Always returns 0.
+ * NOTE(review): restart_mutex is not destroyed here - confirm intended. */
+int32_t
+glusterd_brickinfo_delete(glusterd_brickinfo_t *brickinfo)
+{
+    GF_ASSERT(brickinfo);
+
+    cds_list_del_init(&brickinfo->brick_list);
+
+    (void)gf_store_handle_destroy(brickinfo->shandle);
+
+    /* The logfile string must go before the struct that points to it. */
+    GF_FREE(brickinfo->logfile);
+    GF_FREE(brickinfo);
+
+    return 0;
+}
+
+/* Delete every brickinfo of volinfo; stop at and return the first error. */
+int32_t
+glusterd_volume_brickinfos_delete(glusterd_volinfo_t *volinfo)
+{
+    glusterd_brickinfo_t *brick = NULL;
+    glusterd_brickinfo_t *next = NULL;
+    int32_t ret = 0;
+
+    GF_ASSERT(volinfo);
+
+    cds_list_for_each_entry_safe(brick, next, &volinfo->bricks, brick_list)
+    {
+        ret = glusterd_brickinfo_delete(brick);
+        if (ret)
+            break;
+    }
+
+    gf_msg_debug(THIS->name, 0, "Returning %d", ret);
+    return ret;
+}
+
+int
+glusterd_volinfo_remove(glusterd_volinfo_t *volinfo)
+{ /* Unlinks volinfo from vol_list and drops one reference; returns 0. */
+    cds_list_del_init(&volinfo->vol_list);
+    glusterd_volinfo_unref(volinfo);
+    return 0;
+}
+
+/* Tear down volinfo completely: unlink it from its lists, delete its bricks,
+ * release dicts, connections, store handles and locks, then free it.
+ * Returns 0, or the error from glusterd_volume_brickinfos_delete(). */
+int32_t
+glusterd_volinfo_delete(glusterd_volinfo_t *volinfo)
+{
+    int32_t ret = -1;
+
+    GF_ASSERT(volinfo);
+    cds_list_del_init(&volinfo->vol_list);
+    cds_list_del_init(&volinfo->snapvol_list);
+
+    ret = glusterd_volume_brickinfos_delete(volinfo);
+    if (ret)
+        goto out; /* volinfo is left allocated when brick deletion fails */
+    if (volinfo->dict)
+        dict_unref(volinfo->dict);
+    if (volinfo->gsync_slaves)
+        dict_unref(volinfo->gsync_slaves);
+    if (volinfo->gsync_active_slaves)
+        dict_unref(volinfo->gsync_active_slaves);
+    GF_FREE(volinfo->logdir);
+    if (volinfo->rebal.dict)
+        dict_unref(volinfo->rebal.dict);
+
+    /* Destroy the connection object for per volume svc daemons */
+    glusterd_conn_term(&volinfo->snapd.svc.conn);
+    glusterd_conn_term(&volinfo->gfproxyd.svc.conn);
+    gf_store_handle_destroy(volinfo->quota_conf_shandle);
+    gf_store_handle_destroy(volinfo->shandle);
+    gf_store_handle_destroy(volinfo->node_state_shandle);
+    gf_store_handle_destroy(volinfo->snapd.handle);
+
+    glusterd_auth_cleanup(volinfo);
+    glusterd_shd_svcproc_cleanup(&volinfo->shd);
+    pthread_mutex_destroy(&volinfo->store_volinfo_lock);
+    pthread_mutex_destroy(&volinfo->reflock);
+    LOCK_DESTROY(&volinfo->lock);
+
+    GF_FREE(volinfo);
+    ret = 0;
+out:
+    gf_msg_debug(THIS->name, 0, "Returning %d", ret);
+    return ret;
+}
+
+/* Allocate a zeroed brick-process object with empty 'bricks' and
+ * 'brick_proc_list' heads and hand it back through @brickprocess.
+ *
+ * Returns 0 on success. Returns -1 when the allocation fails or when
+ * GF_VALIDATE_OR_GOTO rejects @brickprocess and jumps to 'out'.
+ */
+int32_t
+glusterd_brickprocess_new(glusterd_brick_proc_t **brickprocess)
+{
+    glusterd_brick_proc_t *proc = NULL;
+    int32_t ret = -1;
+
+    GF_VALIDATE_OR_GOTO(THIS->name, brickprocess, out);
+
+    proc = GF_CALLOC(1, sizeof(*proc), gf_gld_mt_glusterd_brick_proc_t);
+    if (proc != NULL) {
+        CDS_INIT_LIST_HEAD(&proc->bricks);
+        CDS_INIT_LIST_HEAD(&proc->brick_proc_list);
+        proc->brick_count = 0;
+
+        *brickprocess = proc;
+        ret = 0;
+    }
+
+out:
+    gf_msg_debug(THIS->name, 0, "Returning %d", ret);
+    return ret;
+}
+
+/* Allocate a zeroed brickinfo, initialise its list heads and restart mutex,
+ * and hand it back through @brickinfo.
+ *
+ * Returns 0 on success and -1 when the allocation fails (in which case
+ * *@brickinfo is left untouched).
+ */
+int32_t
+glusterd_brickinfo_new(glusterd_brickinfo_t **brickinfo)
+{
+    glusterd_brickinfo_t *fresh = NULL;
+    int32_t ret = -1;
+
+    GF_ASSERT(brickinfo);
+
+    fresh = GF_CALLOC(1, sizeof(*fresh), gf_gld_mt_glusterd_brickinfo_t);
+    if (fresh != NULL) {
+        CDS_INIT_LIST_HEAD(&fresh->brick_list);
+        CDS_INIT_LIST_HEAD(&fresh->mux_bricks);
+        pthread_mutex_init(&fresh->restart_mutex, NULL);
+
+        *brickinfo = fresh;
+        ret = 0;
+    }
+
+    gf_msg_debug(THIS->name, 0, "Returning %d", ret);
+    return ret;
+}
+
+/* Compute the next free numeric brick id for @volinfo.
+ *
+ * Each brick's brick_id is expected to end in "-<number>"; the number after
+ * the last '-' is parsed and the result is one more than the largest value
+ * seen (0 when the volume has no bricks).
+ *
+ * Returns the next id, or a negative value if any brick_id has no '-' or
+ * its numeric suffix cannot be parsed.
+ */
+int
+glusterd_get_next_available_brickid(glusterd_volinfo_t *volinfo)
+{
+    glusterd_brickinfo_t *brickinfo = NULL;
+    char *token = NULL;
+    int brickid = 0;
+    int max_brickid = -1;
+    int ret = -1;
+
+    cds_list_for_each_entry(brickinfo, &volinfo->bricks, brick_list)
+    {
+        token = strrchr(brickinfo->brick_id, '-');
+        if (!token) {
+            /* strrchr() returns NULL when there is no '-'; stepping past
+             * that pointer would dereference an invalid address. */
+            gf_msg(THIS->name, GF_LOG_ERROR, 0, GD_MSG_BRICK_ID_GEN_FAILED,
+                   "Unable to generate brick ID: malformed brick id %s",
+                   brickinfo->brick_id);
+            return -1;
+        }
+
+        /* Parse the text that follows the last '-'. */
+        ret = gf_string2int32(token + 1, &brickid);
+        if (ret < 0) {
+            gf_msg(THIS->name, GF_LOG_ERROR, 0, GD_MSG_BRICK_ID_GEN_FAILED,
+                   "Unable to generate brick ID");
+            return ret;
+        }
+        if (brickid > max_brickid)
+            max_brickid = brickid;
+    }
+
+    return max_brickid + 1;
+}
+
+/* Make sure @brickinfo->uuid refers to a known node.
+ *
+ * Nothing is done when the uuid is this node's own (MY_UUID) or matches a
+ * known peer; otherwise the uuid is looked up again from the brick's
+ * hostname.
+ *
+ * Returns 0 when the uuid was already known, else the result of
+ * glusterd_hostname_to_uuid().
+ */
+int32_t
+glusterd_resolve_brick(glusterd_brickinfo_t *brickinfo)
+{
+    xlator_t *this = THIS;
+    int32_t ret = 0;
+
+    GF_ASSERT(this);
+    GF_ASSERT(brickinfo);
+
+    if (gf_uuid_compare(brickinfo->uuid, MY_UUID) != 0 &&
+        glusterd_peerinfo_find_by_uuid(brickinfo->uuid) == NULL) {
+        ret = glusterd_hostname_to_uuid(brickinfo->hostname,
+                                        brickinfo->uuid);
+    }
+
+    gf_msg_debug(this->name, 0, "Returning %d", ret);
+    return ret;
+}
+
+/* For a brick hosted on this node, write to @mount_dir the part of
+ * @brickpath that follows the path returned by glusterd_get_brick_root(),
+ * always with exactly one leading '/'.
+ *
+ * When @hostname resolves to another node, @mount_dir is left untouched and
+ * 0 is returned.
+ *
+ * Returns 0 on success, non-zero if the hostname cannot be resolved, the
+ * brick root cannot be determined, or @brickpath does not start with it.
+ */
+int32_t
+glusterd_get_brick_mount_dir(char *brickpath, char *hostname, char *mount_dir)
+{
+    xlator_t *this = THIS;
+    char *mnt_pt = NULL;
+    char *rel = NULL;
+    size_t mnt_len = 0;
+    int32_t ret = -1;
+    uuid_t brick_uuid = {
+        0,
+    };
+
+    GF_ASSERT(this);
+    GF_ASSERT(brickpath);
+    GF_ASSERT(hostname);
+    GF_ASSERT(mount_dir);
+
+    ret = glusterd_hostname_to_uuid(hostname, brick_uuid);
+    if (ret) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_HOSTNAME_TO_UUID_FAIL,
+               "Failed to convert hostname %s to uuid", hostname);
+        goto out;
+    }
+
+    /* Only bricks on this node are examined; ret is 0 at this point. */
+    if (gf_uuid_compare(brick_uuid, MY_UUID) != 0)
+        goto out;
+
+    ret = glusterd_get_brick_root(brickpath, &mnt_pt);
+    if (ret) {
+        gf_msg(this->name, GF_LOG_WARNING, 0, GD_MSG_BRICKPATH_ROOT_GET_FAIL,
+               "Could not get the root of the brick path %s", brickpath);
+        goto out;
+    }
+
+    mnt_len = strlen(mnt_pt);
+    if (strncmp(brickpath, mnt_pt, mnt_len) != 0) {
+        gf_msg(this->name, GF_LOG_WARNING, 0, GD_MSG_BRKPATH_MNTPNT_MISMATCH,
+               "brick: %s brick mount: %s", brickpath, mnt_pt);
+        ret = -1;
+        goto out;
+    }
+
+    /* Skip one separator so the output never starts with "//". */
+    rel = brickpath + mnt_len;
+    if (*rel == '/')
+        rel++;
+
+    snprintf(mount_dir, VALID_GLUSTERD_PATHMAX, "/%s", rel);
+
+out:
+    if (mnt_pt)
+        GF_FREE(mnt_pt);
+
+    gf_msg_trace(this->name, 0, "Returning %d", ret);
+    return ret;
+}
+
+/* Build a new brickinfo from a brick specification string.
+ *
+ * @brick is split into a host part and a path part with get_host_name()
+ * and get_path_name(); the path is canonicalized and both parts are copied
+ * into a freshly allocated brickinfo.
+ *
+ * When @construct_real_path is set, the hostname is also resolved to a
+ * uuid, and for a brick on this node real_path is filled in from
+ * realpath(3). A resolution failure stores a message in *@op_errstr if
+ * @op_errstr is non-NULL.
+ *
+ * Returns 0 and stores the new object in *@brickinfo on success. On failure
+ * a non-zero value is returned, *@brickinfo is left untouched and any
+ * partially built brickinfo is released.
+ */
+int32_t
+glusterd_brickinfo_new_from_brick(char *brick, glusterd_brickinfo_t **brickinfo,
+                                  gf_boolean_t construct_real_path,
+                                  char **op_errstr)
+{
+    char *hostname = NULL;
+    char *path = NULL;
+    char *tmp_host = NULL;
+    char *tmp_path = NULL;
+    int32_t ret = -1;
+    glusterd_brickinfo_t *new_brickinfo = NULL;
+    xlator_t *this = NULL;
+    char abspath[PATH_MAX] = "";
+
+    this = THIS;
+    GF_ASSERT(this);
+    GF_ASSERT(brick);
+    GF_ASSERT(brickinfo);
+
+    /* Each parser gets its own scratch copy of @brick. A failed copy is an
+     * error in its own right: continuing would leave hostname/path NULL. */
+    tmp_host = gf_strdup(brick);
+    if (!tmp_host || !get_host_name(tmp_host, &hostname))
+        goto out;
+    tmp_path = gf_strdup(brick);
+    if (!tmp_path || !get_path_name(tmp_path, &path))
+        goto out;
+
+    GF_ASSERT(hostname);
+    GF_ASSERT(path);
+
+    ret = glusterd_brickinfo_new(&new_brickinfo);
+    if (ret)
+        goto out;
+
+    ret = gf_canonicalize_path(path);
+    if (ret)
+        goto out;
+    ret = snprintf(new_brickinfo->hostname, sizeof(new_brickinfo->hostname),
+                   "%s", hostname);
+    if (ret < 0 || ret >= sizeof(new_brickinfo->hostname)) {
+        ret = -1;
+        goto out;
+    }
+    ret = snprintf(new_brickinfo->path, sizeof(new_brickinfo->path), "%s",
+                   path);
+    if (ret < 0 || ret >= sizeof(new_brickinfo->path)) {
+        ret = -1;
+        goto out;
+    }
+
+    if (construct_real_path) {
+        ret = glusterd_hostname_to_uuid(new_brickinfo->hostname,
+                                        new_brickinfo->uuid);
+        if (ret) {
+            gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_HOSTNAME_TO_UUID_FAIL,
+                   "Failed to convert hostname %s to uuid", hostname);
+            if (op_errstr)
+                gf_asprintf(op_errstr,
+                            "Host %s is not in "
+                            "\'Peer in Cluster\' state",
+                            new_brickinfo->hostname);
+            goto out;
+        }
+    }
+
+    if (construct_real_path && !gf_uuid_compare(new_brickinfo->uuid, MY_UUID) &&
+        new_brickinfo->real_path[0] == '\0') {
+        if (!realpath(new_brickinfo->path, abspath)) {
+            /* ENOENT indicates that brick path has not been created
+             * which is a valid scenario */
+            if (errno != ENOENT) {
+                gf_msg(this->name, GF_LOG_CRITICAL, errno,
+                       GD_MSG_BRICKINFO_CREATE_FAIL,
+                       "realpath"
+                       " () failed for brick %s. The "
+                       "underlying filesystem may be in bad "
+                       "state. Error - %s",
+                       new_brickinfo->path, strerror(errno));
+                ret = -1;
+                goto out;
+            }
+        }
+        if (strlen(abspath) >= sizeof(new_brickinfo->real_path)) {
+            ret = -1;
+            goto out;
+        }
+        (void)strncpy(new_brickinfo->real_path, abspath,
+                      sizeof(new_brickinfo->real_path));
+    }
+
+    /* Ownership moves to the caller; clear the local so the cleanup below
+     * does not release it. */
+    *brickinfo = new_brickinfo;
+    new_brickinfo = NULL;
+
+    ret = 0;
+out:
+    /* Still set here only on a failure path: release it instead of
+     * leaking it. */
+    if (new_brickinfo)
+        (void)glusterd_brickinfo_delete(new_brickinfo);
+
+    /* Both scratch copies are freed unconditionally. */
+    GF_FREE(tmp_host);
+    GF_FREE(tmp_path);
+
+    gf_msg_debug(this->name, 0, "Returning %d", ret);
+    return ret;
+}
+
+/* Tell whether one of the two paths is a path-wise prefix of the other.
+ *
+ * Returns _gf_true when the strings are identical, or when the shorter one
+ * matches the start of the longer one and the longer one continues with a
+ * '/'. If exactly one string is empty the result is _gf_false.
+ */
+static gf_boolean_t
+_is_prefix(char *str1, char *str2)
+{
+    GF_ASSERT(str1);
+    GF_ASSERT(str2);
+
+    int len1 = strlen(str1);
+    int len2 = strlen(str2);
+    int common = min(len1, len2);
+    char *longer = NULL;
+
+    /* An empty string is only "a prefix" of another empty string. */
+    if (common == 0 && len1 != len2)
+        return _gf_false;
+
+    /* The shared leading part must match byte for byte. */
+    if (strncmp(str1, str2, common) != 0)
+        return _gf_false;
+
+    if (len1 == len2)
+        return _gf_true;
+
+    /* "/a/b" is a prefix of "/a/b/c" but not of "/a/bc": the longer string
+     * has to continue with a path separator. */
+    longer = (len1 < len2) ? str2 : str1;
+    if (longer[common] != '/')
+        return _gf_false;
+
+    return _gf_true;
+}
+
+/* Report whether @path can be used for a new brick on the peer identified
+ * by @uuid.
+ *
+ * The path is resolved with realpath(3) (falling back to @path itself when
+ * it does not exist yet) and compared against the real_path of every brick
+ * of every known volume that belongs to the same peer. The path is
+ * unavailable if it equals, contains, or lies inside any such brick path.
+ *
+ * Returns _gf_true when the path is available, _gf_false otherwise
+ * (including when the path is too long or realpath fails for a reason
+ * other than ENOENT).
+ */
+gf_boolean_t
+glusterd_is_brickpath_available(uuid_t uuid, char *path)
+{
+    glusterd_brickinfo_t *brick = NULL;
+    glusterd_volinfo_t *vol = NULL;
+    glusterd_conf_t *conf = NULL;
+    char tmp_path[PATH_MAX] = "";
+
+    conf = THIS->private;
+
+    if (snprintf(tmp_path, PATH_MAX, "%s", path) >= PATH_MAX)
+        return _gf_false;
+
+    /* path may not yet exist */
+    if (!realpath(path, tmp_path)) {
+        if (errno != ENOENT) {
+            gf_msg(THIS->name, GF_LOG_CRITICAL, errno,
+                   GD_MSG_BRICKINFO_CREATE_FAIL,
+                   "realpath"
+                   " () failed for brick %s. The "
+                   "underlying filesystem may be in bad "
+                   "state. Error - %s",
+                   path, strerror(errno));
+            return _gf_false;
+        }
+        /* When realpath(3) fails, tmp_path is undefined. */
+        (void)snprintf(tmp_path, sizeof(tmp_path), "%s", path);
+    }
+
+    cds_list_for_each_entry(vol, &conf->volumes, vol_list)
+    {
+        cds_list_for_each_entry(brick, &vol->bricks, brick_list)
+        {
+            /* Only bricks on the same peer can clash with the new path. */
+            if (gf_uuid_compare(uuid, brick->uuid) == 0 &&
+                _is_prefix(brick->real_path, tmp_path)) {
+                gf_msg(THIS->name, GF_LOG_CRITICAL, 0,
+                       GD_MSG_BRICKINFO_CREATE_FAIL,
+                       "_is_prefix call failed for brick %s "
+                       "against brick %s",
+                       tmp_path, brick->real_path);
+                return _gf_false;
+            }
+        }
+    }
+
+    return _gf_true;
+}
+
+/* Create (if necessary) and validate the directory that backs a brick.
+ *
+ * Steps, in order:
+ *   1. mkdir brickinfo->path (an already existing path is accepted);
+ *   2. reject an existing path that is not a directory;
+ *   3. reject a path under GLUSTERD_DEFAULT_WORKDIR unless @volname starts
+ *      with GLUSTER_SHARED_STORAGE;
+ *   4. unless @is_force is set, reject a brick whose device differs from
+ *      its parent's (reported as a mount point) or whose parent shares a
+ *      device with "/" (reported as the root partition; tolerated when
+ *      @ignore_partition is set);
+ *   5. set the brick xattr via glusterd_check_and_set_brick_xattr();
+ *   6. create the ".glusterfs" sub-directory.
+ *
+ * On failure, a directory created in step 1 is removed again, and if
+ * *@op_errstr is still unset a copy of the locally built message is stored
+ * there.
+ *
+ * Returns 0 on success and non-zero on failure.
+ *
+ * NOTE(review): @op_errstr is dereferenced at 'out' without a NULL check,
+ * so callers appear to be required to pass a valid pointer - confirm.
+ */
+int
+glusterd_validate_and_create_brickpath(glusterd_brickinfo_t *brickinfo,
+                                       uuid_t volume_id, char *volname,
+                                       char **op_errstr, gf_boolean_t is_force,
+                                       gf_boolean_t ignore_partition)
+{
+    int ret = -1;
+    char parentdir[PATH_MAX] = "";
+    struct stat parent_st = {
+        0,
+    };
+    struct stat brick_st = {
+        0,
+    };
+    struct stat root_st = {
+        0,
+    };
+    char msg[2048] = "";
+    gf_boolean_t is_created = _gf_false;
+    char glusterfs_dir_path[PATH_MAX] = "";
+    int32_t len = 0;
+
+    ret = sys_mkdir(brickinfo->path, 0755);
+    if (ret) {
+        if (errno != EEXIST) {
+            len = snprintf(msg, sizeof(msg),
+                           "Failed to create "
+                           "brick directory for brick %s:%s. "
+                           "Reason : %s ",
+                           brickinfo->hostname, brickinfo->path,
+                           strerror(errno));
+            gf_smsg(
+                "glusterd", GF_LOG_ERROR, errno, GD_MSG_CREATE_BRICK_DIR_FAILED,
+                "Brick_hostname=%s, Brick_path=%s, Reason=%s",
+                brickinfo->hostname, brickinfo->path, strerror(errno), NULL);
+            goto out;
+        }
+    } else {
+        /* Remember that this call created the directory so it can be
+         * removed again if a later check fails. */
+        is_created = _gf_true;
+    }
+
+    ret = sys_lstat(brickinfo->path, &brick_st);
+    if (ret) {
+        len = snprintf(msg, sizeof(msg),
+                       "lstat failed on %s. "
+                       "Reason : %s",
+                       brickinfo->path, strerror(errno));
+        gf_smsg("glusterd", GF_LOG_ERROR, errno, GD_MSG_LSTAT_FAIL,
+                "Failed on Brick_path=%s, Reason=%s", brickinfo->path,
+                strerror(errno), NULL);
+        goto out;
+    }
+
+    /* A pre-existing path must be a directory. */
+    if ((!is_created) && (!S_ISDIR(brick_st.st_mode))) {
+        len = snprintf(msg, sizeof(msg),
+                       "The provided path %s "
+                       "which is already present, is not a directory",
+                       brickinfo->path);
+        gf_smsg("glusterd", GF_LOG_ERROR, errno, GD_MSG_DIR_OP_FAILED,
+                "Brick_path=%s", brickinfo->path, NULL);
+        ret = -1;
+        goto out;
+    }
+
+    len = snprintf(parentdir, sizeof(parentdir), "%s/..", brickinfo->path);
+    if ((len < 0) || (len >= sizeof(parentdir))) {
+        ret = -1;
+        goto out;
+    }
+
+    /* The device ids of "/" and of the parent directory feed the
+     * mount-point and root-partition checks below. */
+    ret = sys_lstat("/", &root_st);
+    if (ret) {
+        len = snprintf(msg, sizeof(msg),
+                       "lstat failed on /. "
+                       "Reason : %s",
+                       strerror(errno));
+        gf_smsg("glusterd", GF_LOG_ERROR, errno, GD_MSG_LSTAT_FAIL,
+                "Failed on /, Reason=%s", strerror(errno), NULL);
+        goto out;
+    }
+
+    ret = sys_lstat(parentdir, &parent_st);
+    if (ret) {
+        len = snprintf(msg, sizeof(msg),
+                       "lstat failed on %s. "
+                       "Reason : %s",
+                       parentdir, strerror(errno));
+        gf_smsg("glusterd", GF_LOG_ERROR, errno, GD_MSG_LSTAT_FAIL,
+                "Failed on parentdir=%s, Reason=%s", parentdir, strerror(errno),
+                NULL);
+        goto out;
+    }
+    /* Bricks may not live under glusterd's working directory, except for
+     * volumes whose name starts with GLUSTER_SHARED_STORAGE. */
+    if (strncmp(volname, GLUSTER_SHARED_STORAGE,
+                SLEN(GLUSTER_SHARED_STORAGE)) &&
+        sizeof(GLUSTERD_DEFAULT_WORKDIR) <= (strlen(brickinfo->path) + 1) &&
+        !strncmp(brickinfo->path, GLUSTERD_DEFAULT_WORKDIR,
+                 (sizeof(GLUSTERD_DEFAULT_WORKDIR) - 1))) {
+        len = snprintf(msg, sizeof(msg),
+                       "Brick isn't allowed to be "
+                       "created inside glusterd's working directory.");
+        gf_smsg("glusterd", GF_LOG_ERROR, errno, GD_MSG_BRICK_CREATION_FAIL,
+                NULL);
+        ret = -1;
+        goto out;
+    }
+
+    if (!is_force) {
+        /* A brick on a different device than its parent directory is
+         * itself a mount point. */
+        if (brick_st.st_dev != parent_st.st_dev) {
+            len = snprintf(msg, sizeof(msg),
+                           "The brick %s:%s "
+                           "is a mount point. Please create a "
+                           "sub-directory under the mount point "
+                           "and use that as the brick directory. "
+                           "Or use 'force' at the end of the "
+                           "command if you want to override this "
+                           "behavior.",
+                           brickinfo->hostname, brickinfo->path);
+            gf_smsg("glusterd", GF_LOG_ERROR, errno, GD_MSG_BRICK_CREATE_MNTPNT,
+                    "Use 'force' at the end of the command if you want to "
+                    "override this behavior, Brick_hostname=%s, Brick_path=%s",
+                    brickinfo->hostname, brickinfo->path, NULL);
+            ret = -1;
+            goto out;
+        } else if (parent_st.st_dev == root_st.st_dev) {
+            /* The parent shares a device with "/": root partition. */
+            len = snprintf(msg, sizeof(msg),
+                           "The brick %s:%s "
+                           "is being created in the root "
+                           "partition. It is recommended that "
+                           "you don't use the system's root "
+                           "partition for storage backend. Or "
+                           "use 'force' at the end of the "
+                           "command if you want to override this "
+                           "behavior.",
+                           brickinfo->hostname, brickinfo->path);
+            gf_smsg("glusterd", GF_LOG_ERROR, errno, GD_MSG_BRICK_CREATE_ROOT,
+                    "Use 'force' at the end of the command if you want to "
+                    "override this behavior, Brick_hostname=%s, Brick_path=%s",
+                    brickinfo->hostname, brickinfo->path, NULL);
+
+            /* If --wignore-partition flag is used, ignore warnings
+             * related to bricks being on root partition when 'force'
+             * is not used */
+            if ((len < 0) || (len >= sizeof(msg)) || !ignore_partition) {
+                ret = -1;
+                goto out;
+            }
+        }
+    }
+
+    ret = glusterd_check_and_set_brick_xattr(
+        brickinfo->hostname, brickinfo->path, volume_id, op_errstr, is_force);
+    if (ret)
+        goto out;
+
+    /* create .glusterfs directory */
+    len = snprintf(glusterfs_dir_path, sizeof(glusterfs_dir_path), "%s/%s",
+                   brickinfo->path, ".glusterfs");
+    if ((len < 0) || (len >= sizeof(glusterfs_dir_path))) {
+        ret = -1;
+        goto out;
+    }
+
+    ret = sys_mkdir(glusterfs_dir_path, 0600);
+    if (ret && (errno != EEXIST)) {
+        len = snprintf(msg, sizeof(msg),
+                       "Failed to create "
+                       ".glusterfs directory for brick %s:%s. "
+                       "Reason : %s ",
+                       brickinfo->hostname, brickinfo->path, strerror(errno));
+        gf_smsg("glusterd", GF_LOG_ERROR, errno,
+                GD_MSG_CREATE_GLUSTER_DIR_FAILED,
+                "Brick_hostname=%s, Brick_path=%s, Reason=%s",
+                brickinfo->hostname, brickinfo->path, strerror(errno), NULL);
+        goto out;
+    }
+
+    ret = 0;
+
+out:
+    /* 'len' holds the result of the most recent snprintf(); a negative
+     * value there is turned into a failure of the whole call. */
+    if (len < 0) {
+        ret = -1;
+    }
+    /* Undo the directory creation done by this call on failure. */
+    if (ret && is_created) {
+        (void)recursive_rmdir(brickinfo->path);
+    }
+    /* Do not overwrite an error string that is already set. */
+    if (ret && !*op_errstr && msg[0] != '\0')
+        *op_errstr = gf_strdup(msg);
+
+    return ret;
+}
+
+/* Find the brick of @volinfo that lives at @path on a given peer.
+ *
+ * The peer is identified by @uuid when it is non-NULL, otherwise by
+ * resolving @hostname. Bricks whose uuid is still null are resolved on the
+ * fly; a failure to do so aborts the search.
+ *
+ * Returns 0 when a match is found (stored in *@brickinfo if @brickinfo is
+ * non-NULL), non-zero otherwise.
+ */
+int32_t
+glusterd_volume_brickinfo_get(uuid_t uuid, char *hostname, char *path,
+                              glusterd_volinfo_t *volinfo,
+                              glusterd_brickinfo_t **brickinfo)
+{
+    xlator_t *this = THIS;
+    glusterd_brickinfo_t *candidate = NULL;
+    uuid_t peer_uuid = {0};
+    int32_t ret = -1;
+
+    if (!uuid) {
+        ret = glusterd_hostname_to_uuid(hostname, peer_uuid);
+        if (ret)
+            goto out;
+    } else {
+        gf_uuid_copy(peer_uuid, uuid);
+    }
+
+    /* Assume "not found" until a brick matches. */
+    ret = -1;
+    cds_list_for_each_entry(candidate, &volinfo->bricks, brick_list)
+    {
+        if (gf_uuid_is_null(candidate->uuid) &&
+            glusterd_resolve_brick(candidate) != 0)
+            goto out;
+
+        if (gf_uuid_compare(peer_uuid, candidate->uuid) != 0 ||
+            strcmp(candidate->path, path) != 0)
+            continue;
+
+        gf_msg_debug(this->name, 0, LOGSTR_FOUND_BRICK, candidate->hostname,
+                     candidate->path, volinfo->volname);
+        ret = 0;
+        if (brickinfo)
+            *brickinfo = candidate;
+        break;
+    }
+
+out:
+    gf_msg_debug(this->name, 0, "Returning %d", ret);
+    return ret;
+}
+
+/* Find the entry on @volinfo->ta_bricks whose path and hostname both equal
+ * @path and @hostname. @uuid is not consulted.
+ *
+ * Returns 0 when a match is found (stored in *@ta_brickinfo if that pointer
+ * is non-NULL), -1 otherwise.
+ */
+int32_t
+glusterd_volume_ta_brickinfo_get(uuid_t uuid, char *hostname, char *path,
+                                 glusterd_volinfo_t *volinfo,
+                                 glusterd_brickinfo_t **ta_brickinfo)
+{
+    xlator_t *this = THIS;
+    glusterd_brickinfo_t *candidate = NULL;
+    int32_t ret = -1;
+
+    cds_list_for_each_entry(candidate, &volinfo->ta_bricks, brick_list)
+    {
+        if (strcmp(candidate->path, path) != 0 ||
+            strcmp(candidate->hostname, hostname) != 0)
+            continue;
+
+        gf_msg_debug(this->name, 0, LOGSTR_FOUND_BRICK, candidate->hostname,
+                     candidate->path, volinfo->volname);
+        ret = 0;
+        if (ta_brickinfo)
+            *ta_brickinfo = candidate;
+        break;
+    }
+
+    gf_msg_debug(this->name, 0, "Returning %d", ret);
+    return ret;
+}
+
+/* Look up a brick of @volinfo from a brick specification string.
+ *
+ * @brick is parsed into a temporary brickinfo whose hostname and path are
+ * then used for the lookup; the temporary object is deleted afterwards.
+ *
+ * Returns 0 when the brick is found (see glusterd_volume_brickinfo_get for
+ * how *@brickinfo is set), non-zero when parsing or the lookup fails.
+ */
+int32_t
+glusterd_volume_brickinfo_get_by_brick(char *brick, glusterd_volinfo_t *volinfo,
+                                       glusterd_brickinfo_t **brickinfo,
+                                       gf_boolean_t construct_real_path)
+{
+    glusterd_brickinfo_t *parsed = NULL;
+    int32_t ret = -1;
+
+    GF_ASSERT(brick);
+    GF_ASSERT(volinfo);
+
+    ret = glusterd_brickinfo_new_from_brick(brick, &parsed,
+                                            construct_real_path, NULL);
+    if (ret == 0) {
+        ret = glusterd_volume_brickinfo_get(NULL, parsed->hostname,
+                                            parsed->path, volinfo, brickinfo);
+        (void)glusterd_brickinfo_delete(parsed);
+    }
+
+    gf_msg_debug("glusterd", 0, "Returning %d", ret);
+    return ret;
+}
+
+/* Return the 'decommissioned' flag of the brick of @volinfo identified by
+ * @hostname and @path, or _gf_false when no such brick is found.
+ */
+gf_boolean_t
+glusterd_is_brick_decommissioned(glusterd_volinfo_t *volinfo, char *hostname,
+                                 char *path)
+{
+    glusterd_brickinfo_t *brickinfo = NULL;
+
+    if (glusterd_volume_brickinfo_get(NULL, hostname, path, volinfo,
+                                      &brickinfo) != 0)
+        return _gf_false;
+
+    return brickinfo->decommissioned;
+}
+
+/* Find the volume whose volume_id equals @volume_id.
+ *
+ * Returns 0 and stores the volume in *@volinfo when found; returns -1 when
+ * @volume_id is NULL or no volume matches.
+ */
+int
+glusterd_volinfo_find_by_volume_id(uuid_t volume_id,
+                                   glusterd_volinfo_t **volinfo)
+{
+    xlator_t *this = NULL;
+    glusterd_conf_t *conf = NULL;
+    glusterd_volinfo_t *entry = NULL;
+
+    if (!volume_id) {
+        gf_smsg("glusterd", GF_LOG_ERROR, errno, GD_MSG_INVALID_ARGUMENT, NULL);
+        return -1;
+    }
+
+    this = THIS;
+    conf = this->private;
+
+    cds_list_for_each_entry(entry, &conf->volumes, vol_list)
+    {
+        if (gf_uuid_compare(volume_id, entry->volume_id) == 0) {
+            *volinfo = entry;
+            gf_msg_debug(this->name, 0, "Volume %s found", entry->volname);
+            return 0;
+        }
+    }
+
+    return -1;
+}
+
+/* Find the volume named @volname.
+ *
+ * Returns 0 and stores the volume in *@volinfo when found, -1 otherwise
+ * (*@volinfo is then left untouched).
+ */
+int32_t
+glusterd_volinfo_find(const char *volname, glusterd_volinfo_t **volinfo)
+{
+    xlator_t *this = THIS;
+    glusterd_conf_t *conf = NULL;
+    glusterd_volinfo_t *entry = NULL;
+    int32_t ret = -1;
+
+    GF_ASSERT(volname);
+    GF_ASSERT(this);
+
+    conf = this->private;
+    GF_ASSERT(conf);
+
+    cds_list_for_each_entry(entry, &conf->volumes, vol_list)
+    {
+        if (strcmp(entry->volname, volname) != 0)
+            continue;
+
+        gf_msg_debug(this->name, 0, "Volume %s found", volname);
+        ret = 0;
+        *volinfo = entry;
+        break;
+    }
+
+    gf_msg_debug(this->name, 0, "Returning %d", ret);
+    return ret;
+}
+
+/* Return _gf_true when a volume named @volname is on the volume list,
+ * _gf_false otherwise.
+ */
+gf_boolean_t
+glusterd_volume_exists(const char *volname)
+{
+    xlator_t *this = THIS;
+    glusterd_conf_t *conf = NULL;
+    glusterd_volinfo_t *entry = NULL;
+
+    GF_ASSERT(volname);
+    GF_ASSERT(this);
+
+    conf = this->private;
+    GF_ASSERT(conf);
+
+    cds_list_for_each_entry(entry, &conf->volumes, vol_list)
+    {
+        if (strcmp(entry->volname, volname) == 0) {
+            gf_msg_debug(this->name, 0, "Volume %s found", volname);
+            return _gf_true;
+        }
+    }
+
+    return _gf_false;
+}
+
+/* Stop @service, whose pid is obtained from @pidfile through
+ * gf_is_service_running().
+ *
+ * @sig is sent first. When @force_kill is set, the function sleeps for one
+ * second and sends SIGKILL if the service is still reported as running.
+ *
+ * Returns 0 when the service was already stopped or was stopped here, and
+ * non-zero when a kill(2) call failed for a reason other than ESRCH.
+ */
+int32_t
+glusterd_service_stop(const char *service, char *pidfile, int sig,
+                      gf_boolean_t force_kill)
+{
+    xlator_t *this = THIS;
+    pid_t pid = -1;
+    int32_t ret = -1;
+
+    GF_ASSERT(this);
+
+    if (!gf_is_service_running(pidfile, &pid)) {
+        gf_msg(this->name, GF_LOG_INFO, 0, GD_MSG_ALREADY_STOPPED,
+               "%s already stopped", service);
+        return 0;
+    }
+
+    gf_msg_debug(this->name, 0,
+                 "Stopping gluster %s running in pid: "
+                 "%d",
+                 service, pid);
+
+    ret = kill(pid, sig);
+    if (ret) {
+        if (errno == ESRCH) {
+            /* The process vanished between the check and the signal. */
+            gf_msg_debug(this->name, 0, "%s is already stopped", service);
+            return 0;
+        }
+        gf_msg(this->name, GF_LOG_ERROR, errno, GD_MSG_SVC_KILL_FAIL,
+               "Unable to kill %s "
+               "service, reason:%s",
+               service, strerror(errno));
+    }
+
+    /* Without force_kill the outcome of the first signal is final. */
+    if (!force_kill)
+        return ret;
+
+    sleep(1);
+    if (gf_is_service_running(pidfile, &pid)) {
+        ret = kill(pid, SIGKILL);
+        if (ret) {
+            gf_msg(this->name, GF_LOG_ERROR, errno, GD_MSG_PID_KILL_FAIL,
+                   "Unable to kill pid:%d, "
+                   "reason:%s",
+                   pid, strerror(errno));
+            return ret;
+        }
+    }
+
+    return 0;
+}
+
+/* Stop @service using the pid read directly from @pidfile.
+ *
+ * @sig is sent first. When @force_kill is set, the function sleeps for one
+ * second and sends SIGKILL if the process still exists.
+ *
+ * Return value:
+ *   - 0 when @pidfile cannot be opened (nothing to stop), when the process
+ *     is not running, or when it was stopped here;
+ *   - the fscanf() result (0 or EOF) when no pid could be parsed;
+ *   - -1 when the pid read from the file is not positive;
+ *   - non-zero when a kill(2) call failed for a reason other than ESRCH.
+ */
+int32_t
+glusterd_service_stop_nolock(const char *service, char *pidfile, int sig,
+                             gf_boolean_t force_kill)
+{
+    int32_t ret = -1;
+    pid_t pid = -1;
+    xlator_t *this = NULL;
+    FILE *file = NULL;
+
+    this = THIS;
+    GF_ASSERT(this);
+
+    file = fopen(pidfile, "r+");
+    if (!file) {
+        /* Without a pidfile there is no pid to signal. Carrying on with the
+         * initial pid of -1 would make kill(2) signal every process this
+         * daemon is allowed to signal. */
+        gf_msg_debug(this->name, 0, "Unable to open pidfile: %s (%s)", pidfile,
+                     strerror(errno));
+        ret = 0;
+        goto out;
+    }
+
+    ret = fscanf(file, "%d", &pid);
+    if (ret <= 0) {
+        gf_msg_debug(this->name, 0, "Unable to read pidfile: %s", pidfile);
+        goto out;
+    }
+
+    if (pid <= 0) {
+        /* kill(2) gives pid 0 and negative pids process-group / broadcast
+         * meaning; never pass such a value read from a file. */
+        gf_msg_debug(this->name, 0, "Invalid pid %d in pidfile: %s", pid,
+                     pidfile);
+        ret = -1;
+        goto out;
+    }
+
+    /* Signal 0 only probes for the existence of the process. */
+    if (kill(pid, 0) < 0) {
+        ret = 0;
+        gf_msg_debug(this->name, 0, "%s process not running: (%d) %s", service,
+                     pid, strerror(errno));
+        goto out;
+    }
+    gf_msg_debug(this->name, 0,
+                 "Stopping gluster %s service running with "
+                 "pid: %d",
+                 service, pid);
+
+    ret = kill(pid, sig);
+    if (ret) {
+        switch (errno) {
+            case ESRCH:
+                gf_msg_debug(this->name, 0, "%s is already stopped", service);
+                ret = 0;
+                goto out;
+            default:
+                gf_msg(this->name, GF_LOG_ERROR, errno, GD_MSG_SVC_KILL_FAIL,
+                       "Unable to kill %s "
+                       "service, reason:%s",
+                       service, strerror(errno));
+        }
+    }
+    if (!force_kill)
+        goto out;
+
+    sleep(1);
+    if (kill(pid, 0) == 0) {
+        ret = kill(pid, SIGKILL);
+        if (ret) {
+            /* Process is already dead, don't fail */
+            if (errno == ESRCH) {
+                gf_msg(this->name, GF_LOG_ERROR, errno, GD_MSG_PID_KILL_FAIL,
+                       "Unable to find pid:%d, "
+                       "must be dead already. Ignoring.",
+                       pid);
+            } else {
+                gf_msg(this->name, GF_LOG_ERROR, errno, GD_MSG_PID_KILL_FAIL,
+                       "Unable to kill pid:%d, "
+                       "reason:%s",
+                       pid, strerror(errno));
+                goto out;
+            }
+        }
+    }
+
+    ret = 0;
+
+out:
+    if (file)
+        fclose(file);
+
+    return ret;
+}
+/* Derive a socket path from @sock_filepath and write it to @sockpath
+ * (at most @len bytes): "<GLUSTERD_SOCK_DIR>/<digest>.socket", where
+ * <digest> is the string gf_xxh64_wrapper() produces for @sock_filepath.
+ */
+void
+glusterd_set_socket_filepath(char *sock_filepath, char *sockpath, size_t len)
+{
+    char digest[GF_XXH64_DIGEST_LENGTH * 2 + 1] = {
+        0,
+    };
+    size_t input_len = strlen(sock_filepath);
+
+    gf_xxh64_wrapper((unsigned char *)sock_filepath, input_len,
+                     GF_XXHSUM64_DEFAULT_SEED, digest);
+
+    snprintf(sockpath, len, "%s/%s.socket", GLUSTERD_SOCK_DIR, digest);
+}
+
+/* Compute the unix-socket path for a brick and write it to @sockpath.
+ *
+ * A per-brick key "<volume pid dir>/run/<hostname>-<export path>" is built
+ * and passed to glusterd_set_socket_filepath(), which derives the final
+ * socket name from it. If building the key fails, an empty key is used.
+ *
+ * @len is the size of @sockpath; it is asserted to be large enough for the
+ * expected socket file name.
+ */
+void
+glusterd_set_brick_socket_filepath(glusterd_volinfo_t *volinfo,
+                                   glusterd_brickinfo_t *brickinfo,
+                                   char *sockpath, size_t len)
+{
+    xlator_t *this = THIS;
+    glusterd_conf_t *priv = NULL;
+    char volume_dir[PATH_MAX] = "";
+    char export_path[PATH_MAX] = "";
+    char sock_filepath[PATH_MAX] = "";
+    int expected_file_len = 0;
+    int32_t slen = 0;
+
+    expected_file_len = SLEN(GLUSTERD_SOCK_DIR) + SLEN("/") +
+                        SHA256_DIGEST_LENGTH * 2 + SLEN(".socket") + 1;
+    GF_ASSERT(len >= expected_file_len);
+    GF_ASSERT(this);
+
+    priv = this->private;
+
+    GLUSTERD_GET_VOLUME_PID_DIR(volume_dir, volinfo, priv);
+    GLUSTERD_REMOVE_SLASH_FROM_PATH(brickinfo->path, export_path);
+
+    slen = snprintf(sock_filepath, PATH_MAX, "%s/run/%s-%s", volume_dir,
+                    brickinfo->hostname, export_path);
+    if (slen < 0)
+        sock_filepath[0] = 0;
+
+    glusterd_set_socket_filepath(sock_filepath, sockpath, len);
+}
+
+/* Create the rpc client for a brick over the unix socket @socketpath.
+ *
+ * connection happens only if it is not already connected,
+ * reconnections are taken care by rpc-layer
+ *
+ * Returns 0 when brickinfo->rpc already exists or was created here, and
+ * non-zero when any step of the setup fails.
+ */
+int32_t
+glusterd_brick_connect(glusterd_volinfo_t *volinfo,
+                       glusterd_brickinfo_t *brickinfo, char *socketpath)
+{
+    int ret = 0;
+    char volume_id_str[64] = "";
+    char *brickid = NULL;
+    dict_t *options = NULL;
+    struct rpc_clnt *rpc = NULL;
+
+    GF_ASSERT(volinfo);
+    GF_ASSERT(brickinfo);
+    GF_ASSERT(socketpath);
+
+    if (brickinfo->rpc == NULL) {
+        /* Setting frame-timeout to 10mins (600seconds).
+         * Unix domain sockets ensures that the connection is reliable.
+         * The default timeout of 30mins used for unreliable network
+         * connections is too long for unix domain socket connections.
+         */
+        options = dict_new();
+        if (!options) {
+            gf_smsg("glusterd", GF_LOG_ERROR, errno, GD_MSG_DICT_CREATE_FAIL,
+                    NULL);
+            /* 'ret' is still 0 here; without this the caller would be told
+             * the connect succeeded although no rpc was created. */
+            ret = -1;
+            goto out;
+        }
+
+        ret = rpc_transport_unix_options_build(options, socketpath, 600);
+        if (ret)
+            goto out;
+
+        /* The brick id handed to the rpc layer is
+         * "<volume-id>:<hostname>:<path>". */
+        uuid_utoa_r(volinfo->volume_id, volume_id_str);
+        ret = gf_asprintf(&brickid, "%s:%s:%s", volume_id_str,
+                          brickinfo->hostname, brickinfo->path);
+        if (ret < 0)
+            goto out;
+
+        ret = glusterd_rpc_create(&rpc, options, glusterd_brick_rpc_notify,
+                                  brickid, _gf_false);
+        if (ret) {
+            /* brickid is freed here only on failure. */
+            GF_FREE(brickid);
+            goto out;
+        }
+        brickinfo->rpc = rpc;
+    }
+out:
+    if (options)
+        dict_unref(options);
+    gf_msg_debug("glusterd", 0, "Returning %d", ret);
+    return ret;
+}
+
+/* Create the pid directory of @volinfo (as computed by
+ * GLUSTERD_GET_VOLUME_PID_DIR) with mode 0755 using mkdir_p().
+ *
+ * Returns the result of mkdir_p(); a failure is logged.
+ */
+static int
+_mk_rundir_p(glusterd_volinfo_t *volinfo)
+{
+    xlator_t *this = THIS;
+    glusterd_conf_t *priv = this->private;
+    char rundir[PATH_MAX] = "";
+    int ret = -1;
+
+    GLUSTERD_GET_VOLUME_PID_DIR(rundir, volinfo, priv);
+
+    ret = mkdir_p(rundir, 0755, _gf_true);
+    if (ret != 0) {
+        gf_msg(this->name, GF_LOG_ERROR, errno, GD_MSG_CREATE_DIR_FAILED,
+               "Failed to create rundir");
+    }
+
+    return ret;
+}
+
+/*
+ * Start the glusterfsd process for one brick and connect glusterd to it.
+ *
+ * If the brick's pidfile reports a running service, only the connection
+ * step is performed.  Otherwise any leftover socket file and RPC client
+ * are discarded, a port is assigned, the glusterfsd command line is
+ * assembled and the process is launched.
+ *
+ * @volinfo:   volume the brick belongs to
+ * @brickinfo: brick to start; its port, rdma_port, status, logfile, rpc
+ *             and brick_proc fields are updated here
+ * @wait:      when true the child is run with runner_run() (big_lock is
+ *             released around the call) and the start is retried with a
+ *             newly allocated port on EADDRINUSE; when false
+ *             runner_run_nowait() is used
+ *
+ * Returns 0 on success (including the "snapshot pending" no-op) and
+ * non-zero on failure, in which case brickinfo->status is reset to
+ * GF_BRICK_STOPPED.
+ */
+int32_t
+glusterd_volume_start_glusterfs(glusterd_volinfo_t *volinfo,
+                                glusterd_brickinfo_t *brickinfo,
+                                gf_boolean_t wait)
+{
+    int32_t ret = -1;
+    xlator_t *this = NULL;
+    glusterd_conf_t *priv = NULL;
+    char pidfile[PATH_MAX + 1] = "";
+    char volfile[PATH_MAX] = "";
+    runner_t runner = {
+        0,
+    };
+    char exp_path[PATH_MAX] = "";
+    char logfile[PATH_MAX] = "";
+    int port = 0;
+    int rdma_port = 0;
+    char *bind_address = NULL;
+    char *localtime_logging = NULL;
+    char socketpath[PATH_MAX] = "";
+    char glusterd_uuid[1024] = "";
+    char valgrind_logfile[PATH_MAX] = "";
+    char rdma_brick_path[PATH_MAX] = "";
+    struct rpc_clnt *rpc = NULL;
+    rpc_clnt_connection_t *conn = NULL;
+    int pid = -1;
+    int32_t len = 0;
+    glusterd_brick_proc_t *brick_proc = NULL;
+    char *inet_family = NULL;
+    char *global_threading = NULL;
+    bool threading = false;
+
+    GF_ASSERT(volinfo);
+    GF_ASSERT(brickinfo);
+
+    this = THIS;
+    GF_ASSERT(this);
+
+    priv = this->private;
+    GF_ASSERT(priv);
+
+    if (brickinfo->snap_status == -1) {
+        gf_msg(this->name, GF_LOG_INFO, 0, GD_MSG_SNAPSHOT_PENDING,
+               "Snapshot is pending on %s:%s. "
+               "Hence not starting the brick",
+               brickinfo->hostname, brickinfo->path);
+        ret = 0;
+        goto out;
+    }
+
+    /*
+     * Work out the socket path before the "already running" shortcut
+     * below: the connect step needs it in both cases, and leaving it
+     * empty would hand glusterd_brick_connect() an empty path.
+     */
+    glusterd_set_brick_socket_filepath(volinfo, brickinfo, socketpath,
+                                       sizeof(socketpath));
+
+    GLUSTERD_GET_BRICK_PIDFILE(pidfile, volinfo, brickinfo, priv);
+    if (gf_is_service_running(pidfile, &pid)) {
+        goto connect;
+    }
+
+    /*
+     * There are all sorts of races in the start/stop code that could leave
+     * a UNIX-domain socket or RPC-client object associated with a
+     * long-dead incarnation of this brick, while the new incarnation is
+     * listening on a new socket at the same path and wondering why we
+     * haven't shown up.  To avoid the whole mess and be on the safe side,
+     * we just blow away anything that might have been left over, and start
+     * over again.
+     */
+    (void)glusterd_unlink_file(socketpath);
+    rpc = brickinfo->rpc;
+    if (rpc) {
+        brickinfo->rpc = NULL;
+        conn = &rpc->conn;
+        pthread_mutex_lock(&conn->lock);
+        if (conn->reconnect) {
+            (void)gf_timer_call_cancel(rpc->ctx, conn->reconnect);
+            conn->reconnect = NULL;
+        }
+        pthread_mutex_unlock(&conn->lock);
+        rpc_clnt_unref(rpc);
+    }
+
+    port = pmap_assign_port(THIS, brickinfo->port, brickinfo->path);
+    if (!port) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_PORTS_EXHAUSTED,
+               "All the ports in the range are exhausted, can't start "
+               "brick %s for volume %s",
+               brickinfo->path, volinfo->volname);
+        ret = -1;
+        goto out;
+    }
+    /* Build the exp_path, before starting the glusterfsd even in
+       valgrind mode. Otherwise all the glusterfsd processes start
+       writing the valgrind log to the same file.
+    */
+    GLUSTERD_REMOVE_SLASH_FROM_PATH(brickinfo->path, exp_path);
+
+retry:
+    /* Re-entered with a new port when the previous one was in use. */
+    runinit(&runner);
+
+    if (this->ctx->cmd_args.vgtool != _gf_none) {
+        /* Run bricks with valgrind. */
+        if (volinfo->logdir) {
+            len = snprintf(valgrind_logfile, PATH_MAX, "%s/valgrind-%s-%s.log",
+                           volinfo->logdir, volinfo->volname, exp_path);
+        } else {
+            len = snprintf(valgrind_logfile, PATH_MAX,
+                           "%s/bricks/valgrind-%s-%s.log", priv->logdir,
+                           volinfo->volname, exp_path);
+        }
+        if ((len < 0) || (len >= PATH_MAX)) {
+            ret = -1;
+            goto out;
+        }
+
+        if (this->ctx->cmd_args.vgtool == _gf_memcheck)
+            runner_add_args(&runner, "valgrind", "--leak-check=full",
+                            "--trace-children=yes", "--track-origins=yes",
+                            NULL);
+        else
+            runner_add_args(&runner, "valgrind", "--tool=drd", NULL);
+
+        runner_argprintf(&runner, "--log-file=%s", valgrind_logfile);
+    }
+
+    /* Volfile id passed to glusterfsd via --volfile-id. */
+    if (volinfo->is_snap_volume) {
+        len = snprintf(volfile, PATH_MAX, "/%s/%s/%s/%s.%s.%s",
+                       GLUSTERD_VOL_SNAP_DIR_PREFIX,
+                       volinfo->snapshot->snapname, volinfo->volname,
+                       volinfo->volname, brickinfo->hostname, exp_path);
+    } else {
+        len = snprintf(volfile, PATH_MAX, "%s.%s.%s", volinfo->volname,
+                       brickinfo->hostname, exp_path);
+    }
+    if ((len < 0) || (len >= PATH_MAX)) {
+        ret = -1;
+        goto out;
+    }
+
+    if (volinfo->logdir) {
+        len = snprintf(logfile, PATH_MAX, "%s/%s.log", volinfo->logdir,
+                       exp_path);
+    } else {
+        len = snprintf(logfile, PATH_MAX, "%s/bricks/%s.log", priv->logdir,
+                       exp_path);
+    }
+    if ((len < 0) || (len >= PATH_MAX)) {
+        ret = -1;
+        goto out;
+    }
+
+    if (!brickinfo->logfile) {
+        brickinfo->logfile = gf_strdup(logfile);
+        if (!brickinfo->logfile) {
+            /* A NULL here would act as the terminator of the
+             * runner_add_args() list below and silently drop every
+             * argument after "-l". */
+            ret = -1;
+            goto out;
+        }
+    }
+
+    (void)snprintf(glusterd_uuid, 1024, "*-posix.glusterd-uuid=%s",
+                   uuid_utoa(MY_UUID));
+    runner_add_args(&runner, SBIN_DIR "/glusterfsd", "-s", brickinfo->hostname,
+                    "--volfile-id", volfile, "-p", pidfile, "-S", socketpath,
+                    "--brick-name", brickinfo->path, "-l", brickinfo->logfile,
+                    "--xlator-option", glusterd_uuid, "--process-name", "brick",
+                    NULL);
+
+    if (dict_get_strn(priv->opts, GLUSTERD_LOCALTIME_LOGGING_KEY,
+                      SLEN(GLUSTERD_LOCALTIME_LOGGING_KEY),
+                      &localtime_logging) == 0) {
+        if (strcmp(localtime_logging, "enable") == 0)
+            runner_add_arg(&runner, "--localtime-logging");
+    }
+
+    runner_add_arg(&runner, "--brick-port");
+    if (volinfo->transport_type != GF_TRANSPORT_BOTH_TCP_RDMA) {
+        runner_argprintf(&runner, "%d", port);
+    } else {
+        /* TCP+RDMA volumes get a second port, registered under
+         * "<brick path>.rdma". */
+        len = snprintf(rdma_brick_path, sizeof(rdma_brick_path), "%s.rdma",
+                       brickinfo->path);
+        if ((len < 0) || (len >= sizeof(rdma_brick_path))) {
+            ret = -1;
+            goto out;
+        }
+        rdma_port = pmap_assign_port(THIS, brickinfo->rdma_port,
+                                     rdma_brick_path);
+        if (!rdma_port) {
+            gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_PORTS_EXHAUSTED,
+                   "All rdma ports in the "
+                   "range are exhausted, can't start brick %s for "
+                   "volume %s",
+                   rdma_brick_path, volinfo->volname);
+            ret = -1;
+            goto out;
+        }
+        runner_argprintf(&runner, "%d,%d", port, rdma_port);
+        runner_add_arg(&runner, "--xlator-option");
+        runner_argprintf(&runner, "%s-server.transport.rdma.listen-port=%d",
+                         volinfo->volname, rdma_port);
+    }
+
+    if (dict_get_strn(volinfo->dict, VKEY_CONFIG_GLOBAL_THREADING,
+                      SLEN(VKEY_CONFIG_GLOBAL_THREADING),
+                      &global_threading) == 0) {
+        if ((gf_string2boolean(global_threading, &threading) == 0) &&
+            threading) {
+            runner_add_arg(&runner, "--global-threading");
+        }
+    }
+
+    runner_add_arg(&runner, "--xlator-option");
+    runner_argprintf(&runner, "%s-server.listen-port=%d", volinfo->volname,
+                     port);
+
+    if (dict_get_strn(this->options, "transport.socket.bind-address",
+                      SLEN("transport.socket.bind-address"),
+                      &bind_address) == 0) {
+        runner_add_arg(&runner, "--xlator-option");
+        runner_argprintf(&runner, "transport.socket.bind-address=%s",
+                         bind_address);
+    }
+
+    if (volinfo->transport_type == GF_TRANSPORT_RDMA)
+        runner_argprintf(&runner, "--volfile-server-transport=rdma");
+    else if (volinfo->transport_type == GF_TRANSPORT_BOTH_TCP_RDMA)
+        runner_argprintf(&runner, "--volfile-server-transport=socket,rdma");
+
+    ret = dict_get_str(this->options, "transport.address-family", &inet_family);
+    if (!ret) {
+        runner_add_arg(&runner, "--xlator-option");
+        runner_argprintf(&runner, "transport.address-family=%s", inet_family);
+    }
+
+    if (volinfo->memory_accounting)
+        runner_add_arg(&runner, "--mem-accounting");
+
+    if (is_brick_mx_enabled())
+        runner_add_arg(&runner, "--brick-mux");
+
+    runner_log(&runner, "", GF_LOG_DEBUG, "Starting GlusterFS");
+
+    /* Record the state optimistically; it is rolled back below if the
+     * launch fails. */
+    brickinfo->port = port;
+    brickinfo->rdma_port = rdma_port;
+    brickinfo->status = GF_BRICK_STARTING;
+    brickinfo->port_registered = _gf_false;
+
+    if (wait) {
+        /* big_lock is not held while runner_run() executes. */
+        synclock_unlock(&priv->big_lock);
+        errno = 0;
+        ret = runner_run(&runner);
+        if (errno != 0)
+            ret = errno;
+        synclock_lock(&priv->big_lock);
+
+        if (ret == EADDRINUSE) {
+            /* retry after getting a new port */
+            gf_msg(this->name, GF_LOG_WARNING, -ret,
+                   GD_MSG_SRC_BRICK_PORT_UNAVAIL,
+                   "Port %d is used by other process", port);
+
+            port = pmap_registry_alloc(this);
+            if (!port) {
+                gf_msg(this->name, GF_LOG_CRITICAL, 0, GD_MSG_NO_FREE_PORTS,
+                       "Couldn't allocate a port");
+                ret = -1;
+                goto out;
+            }
+            gf_msg(this->name, GF_LOG_NOTICE, 0, GD_MSG_RETRY_WITH_NEW_PORT,
+                   "Retrying to start brick %s with new port %d",
+                   brickinfo->path, port);
+            goto retry;
+        }
+    } else {
+        ret = runner_run_nowait(&runner);
+    }
+
+    if (ret) {
+        brickinfo->port = 0;
+        brickinfo->rdma_port = 0;
+        goto out;
+    }
+
+    ret = glusterd_brickprocess_new(&brick_proc);
+    if (ret) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_BRICKPROC_NEW_FAILED,
+               "Failed to create "
+               "new brick process instance");
+        goto out;
+    }
+
+    /* Register the new process and attach this brick to it. */
+    brick_proc->port = brickinfo->port;
+    cds_list_add_tail(&brick_proc->brick_proc_list, &priv->brick_procs);
+    cds_list_add_tail(&brickinfo->mux_bricks, &brick_proc->bricks);
+    brickinfo->brick_proc = brick_proc;
+    brick_proc->brick_count++;
+
+connect:
+    ret = glusterd_brick_connect(volinfo, brickinfo, socketpath);
+    if (ret) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_BRICK_DISCONNECTED,
+               "Failed to connect to brick %s:%s on %s", brickinfo->hostname,
+               brickinfo->path, socketpath);
+        goto out;
+    }
+
+out:
+    if (ret)
+        brickinfo->status = GF_BRICK_STOPPED;
+    return ret;
+}
+
+/*
+ * Remove the UNIX-domain socket file associated with a brick.
+ *
+ * The socket path is derived with glusterd_set_brick_socket_filepath()
+ * and handed to glusterd_unlink_file(), whose result is returned.
+ */
+int32_t
+glusterd_brick_unlink_socket_file(glusterd_volinfo_t *volinfo,
+                                  glusterd_brickinfo_t *brickinfo)
+{
+    char socketpath[PATH_MAX] = "";
+    xlator_t *this = NULL;
+
+    GF_ASSERT(volinfo);
+    GF_ASSERT(brickinfo);
+
+    this = THIS;
+    GF_ASSERT(this);
+
+    /* The volume directory used to be computed here as well, but its
+     * value was never consumed, so that dead work has been dropped. */
+    glusterd_set_brick_socket_filepath(volinfo, brickinfo, socketpath,
+                                       sizeof(socketpath));
+
+    return glusterd_unlink_file(socketpath);
+}
+
+/*
+ * Drop glusterd's RPC client handle for a brick.
+ *
+ * The handle is cleared from @brickinfo before the reference is released
+ * through glusterd_rpc_clnt_unref().  Returns -1 (after logging) when
+ * @brickinfo is NULL, 0 otherwise.
+ */
+int32_t
+glusterd_brick_disconnect(glusterd_brickinfo_t *brickinfo)
+{
+    glusterd_conf_t *conf = THIS->private;
+    rpc_clnt_t *old_rpc = NULL;
+
+    GF_ASSERT(brickinfo);
+
+    if (brickinfo == NULL) {
+        gf_msg_callingfn("glusterd", GF_LOG_WARNING, EINVAL,
+                         GD_MSG_BRICK_NOT_FOUND, "!brickinfo");
+        return -1;
+    }
+
+    /* Detach the handle from the brick first, then release it. */
+    old_rpc = brickinfo->rpc;
+    brickinfo->rpc = NULL;
+    if (old_rpc != NULL)
+        glusterd_rpc_clnt_unref(conf, old_rpc);
+
+    return 0;
+}
+
+/*
+ * Dictionary callback: report whether option @key is "unsafe".
+ *
+ * Certain options are safe because they're already being handled other
+ * ways, such as being copied down to the bricks (all auth options) or
+ * being made irrelevant (event-threads).  All others are suspect and
+ * must be checked in the next function.
+ *
+ * Returns _gf_false when @key matches one of the known-safe patterns,
+ * _gf_true otherwise.  @this, @value and @arg are not used.
+ */
+static gf_boolean_t
+unsafe_option(dict_t *this, char *key, data_t *value, void *arg)
+{
+    static const char *const safe_patterns[] = {
+        "*auth*",
+        "*event-threads",
+        "*diagnostics.brick-log*",
+        "*diagnostics.client-log*",
+        "user.*",
+    };
+    size_t idx;
+
+    for (idx = 0; idx < sizeof(safe_patterns) / sizeof(safe_patterns[0]);
+         idx++) {
+        if (fnmatch(safe_patterns[idx], key, 0) == 0)
+            return _gf_false;
+    }
+
+    return _gf_true;
+}
+
+/*
+ * Dictionary callback: compare option @key of @dict1 (value @value1)
+ * against the same key in @dict2.
+ *
+ * Returns 0 when both values agree over the shorter of the two lengths,
+ * -1 when the key is absent from @dict2 or the values differ.
+ */
+static int
+opts_mismatch(dict_t *dict1, char *key, data_t *value1, void *dict2)
+{
+    data_t *other = dict_get(dict2, key);
+    int32_t cmp_len;
+
+    /*
+     * If the option is only present on one, we can either look at the
+     * default or assume a mismatch.  Looking at the default is pretty
+     * hard, because that's part of a structure within each translator and
+     * there's no dlopen interface to get at it, so we assume a mismatch.
+     * If the user really wants them to match (and for their bricks to be
+     * multiplexed), they can always reset the option.
+     */
+    if (other == NULL) {
+        gf_log(THIS->name, GF_LOG_DEBUG, "missing option %s", key);
+        return -1;
+    }
+
+    cmp_len = MIN(value1->len, other->len);
+    if (strncmp(value1->data, other->data, cmp_len) == 0)
+        return 0;
+
+    gf_log(THIS->name, GF_LOG_DEBUG, "option mismatch, %s, %s != %s", key,
+           value1->data, other->data);
+    return -1;
+}
+
+/*
+ * Unlink @brick_proc from the lists it participates in and free it.
+ * Always returns 0.
+ */
+int
+glusterd_brickprocess_delete(glusterd_brick_proc_t *brick_proc)
+{
+    /* The two list members are independent of each other. */
+    cds_list_del_init(&brick_proc->bricks);
+    cds_list_del_init(&brick_proc->brick_proc_list);
+
+    GF_FREE(brick_proc);
+    return 0;
+}
+
+/*
+ * Detach @brickinfo from the brick process it is attached to.
+ *
+ * When the brick was the last one in the process, the process object is
+ * deleted and, if @last_brick is non-NULL, *last_brick is set to 1.
+ *
+ * Returns 0 on success.  Returns -1 on validation failure, or when the
+ * brick has no process attached although its status is GF_BRICK_STARTED.
+ */
+int
+glusterd_brick_process_remove_brick(glusterd_brickinfo_t *brickinfo,
+                                    int *last_brick)
+{
+    int ret = -1;
+    xlator_t *this = THIS;
+    glusterd_conf_t *priv = NULL;
+    glusterd_brick_proc_t *proc = NULL;
+
+    GF_VALIDATE_OR_GOTO("glusterd", this, out);
+
+    priv = this->private;
+    GF_VALIDATE_OR_GOTO(this->name, priv, out);
+    GF_VALIDATE_OR_GOTO(this->name, brickinfo, out);
+
+    proc = brickinfo->brick_proc;
+    if (proc == NULL) {
+        /* this function will be called from gluster_pmap_signout and
+         * glusterd_volume_stop_glusterfs. So it is possible to have
+         * brick_proc set as null.
+         */
+        ret = (brickinfo->status != GF_BRICK_STARTED) ? 0 : -1;
+        goto out;
+    }
+
+    GF_VALIDATE_OR_GOTO(this->name, (proc->brick_count > 0), out);
+
+    cds_list_del_init(&brickinfo->mux_bricks);
+
+    /* If all bricks have been removed, delete the brick process */
+    if (--proc->brick_count == 0) {
+        if (last_brick != NULL)
+            *last_brick = 1;
+        ret = glusterd_brickprocess_delete(proc);
+        if (ret != 0)
+            goto out;
+    }
+
+    brickinfo->brick_proc = NULL;
+    ret = 0;
+out:
+    return ret;
+}
+
+/*
+ * Attach @brickinfo to a brick process.
+ *
+ * With a @parent_brickinfo the brick joins the parent's process.  Without
+ * one, the process already registered for brickinfo->port is used, or a
+ * new process object is created and registered for that port.
+ *
+ * Returns 0 on success and -1 on failure (validation failure, allocation
+ * failure, or a parent brick that has no process attached).
+ */
+int
+glusterd_brick_process_add_brick(glusterd_brickinfo_t *brickinfo,
+                                 glusterd_brickinfo_t *parent_brickinfo)
+{
+    int ret = -1;
+    xlator_t *this = NULL;
+    glusterd_conf_t *priv = NULL;
+    glusterd_brick_proc_t *brick_proc = NULL;
+
+    this = THIS;
+    GF_VALIDATE_OR_GOTO("glusterd", this, out);
+
+    priv = this->private;
+    GF_VALIDATE_OR_GOTO(this->name, priv, out);
+    GF_VALIDATE_OR_GOTO(this->name, brickinfo, out);
+
+    if (!parent_brickinfo) {
+        ret = glusterd_brick_proc_for_port(brickinfo->port, &brick_proc);
+        if (ret) {
+            /* No process known for this port yet: create one. */
+            ret = glusterd_brickprocess_new(&brick_proc);
+            if (ret) {
+                gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_BRICKPROC_NEW_FAILED,
+                       "Failed to create "
+                       "new brick process instance");
+                goto out;
+            }
+
+            brick_proc->port = brickinfo->port;
+
+            cds_list_add_tail(&brick_proc->brick_proc_list, &priv->brick_procs);
+        }
+    } else {
+        brick_proc = parent_brickinfo->brick_proc;
+        /* A brick's brick_proc can be NULL (the remove path clears it),
+         * so refuse to attach instead of dereferencing it below.  ret is
+         * still -1 at this point. */
+        GF_VALIDATE_OR_GOTO(this->name, brick_proc, out);
+        ret = 0;
+    }
+
+    cds_list_add_tail(&brickinfo->mux_bricks, &brick_proc->bricks);
+    brickinfo->brick_proc = brick_proc;
+    brick_proc->brick_count++;
+out:
+    return ret;
+}
+
+/*
+ * Look up the brick process registered for @port in priv->brick_procs.
+ *
+ * Returns 0 and stores the match in *brickprocess when a process with
+ * that port exists; returns -1 otherwise (*brickprocess is left alone).
+ */
+int
+glusterd_brick_proc_for_port(int port, glusterd_brick_proc_t **brickprocess)
+{
+    int ret = -1;
+    xlator_t *this = THIS;
+    glusterd_conf_t *priv = NULL;
+    glusterd_brick_proc_t *candidate = NULL;
+
+    GF_VALIDATE_OR_GOTO("glusterd", this, out);
+
+    priv = this->private;
+    GF_VALIDATE_OR_GOTO(this->name, priv, out);
+
+    cds_list_for_each_entry(candidate, &priv->brick_procs, brick_proc_list)
+    {
+        if (candidate->port != port)
+            continue;
+
+        /* First match wins. */
+        *brickprocess = candidate;
+        ret = 0;
+        break;
+    }
+out:
+    return ret;
+}
+
+/*
+ * Stop the brick described by @brickinfo.
+ *
+ * The brick is first detached from its brick process.  If the volume is
+ * in GLUSTERD_STATUS_STARTED, the brick is then either detached from a
+ * multiplexed process (send_attach_req with GLUSTERD_BRICK_TERMINATE) or
+ * terminated via glusterd_brick_terminate(), and the RPC connection is
+ * dropped.  Finally the pidfile is unlinked and the brick state reset.
+ *
+ * @volinfo:   volume the brick belongs to
+ * @brickinfo: brick to stop
+ * @del_brick: when true the brick is also removed from the volume's brick
+ *             list and passed to glusterd_delete_brick()
+ *
+ * Returns 0 on success, non-zero on failure.
+ */
+int32_t
+glusterd_volume_stop_glusterfs(glusterd_volinfo_t *volinfo,
+                               glusterd_brickinfo_t *brickinfo,
+                               gf_boolean_t del_brick)
+{
+    xlator_t *this = NULL;
+    glusterd_conf_t *conf = NULL;
+    int ret = -1;
+    char *op_errstr = NULL;
+    char pidfile[PATH_MAX] = "";
+    int last_brick = -1;
+
+    GF_ASSERT(volinfo);
+    GF_ASSERT(brickinfo);
+
+    this = THIS;
+    GF_ASSERT(this);
+
+    conf = this->private;
+    GF_VALIDATE_OR_GOTO(this->name, conf, out);
+
+    ret = glusterd_brick_process_remove_brick(brickinfo, &last_brick);
+    if (ret) {
+        gf_msg_debug(this->name, 0,
+                     "Couldn't remove brick from"
+                     " brick process");
+        goto out;
+    }
+
+    if (del_brick)
+        cds_list_del_init(&brickinfo->brick_list);
+
+    if (GLUSTERD_STATUS_STARTED == volinfo->status) {
+        /*
+         * In a post-multiplexing world, even if we're not actually
+         * doing any multiplexing, just dropping the RPC connection
+         * isn't enough.  There might be many such connections during
+         * the brick daemon's lifetime, even if we only consider the
+         * management RPC port (because tests etc. might be manually
+         * attaching and detaching bricks).  Therefore, we have to send
+         * an actual signal instead.
+         */
+        if (is_brick_mx_enabled() && last_brick != 1) {
+            /* Other bricks still live in the process: detach only. */
+            ret = send_attach_req(this, brickinfo->rpc, brickinfo->path, NULL,
+                                  NULL, GLUSTERD_BRICK_TERMINATE);
+            if (ret && brickinfo->status == GF_BRICK_STARTED) {
+                gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_BRICK_STOP_FAIL,
+                       "Failed to send"
+                       " detach request for brick %s",
+                       brickinfo->path);
+                goto out;
+            }
+            gf_log(this->name, GF_LOG_INFO,
+                   "Detach request for "
+                   "brick %s:%s is sent successfully",
+                   brickinfo->hostname, brickinfo->path);
+
+        } else {
+            gf_msg_debug(this->name, 0,
+                         "About to stop glusterfsd"
+                         " for brick %s:%s",
+                         brickinfo->hostname, brickinfo->path);
+            ret = glusterd_brick_terminate(volinfo, brickinfo, NULL, 0,
+                                           &op_errstr);
+            /* The error string is never reported; release it before the
+             * failure check so the early exit below cannot leak it. */
+            if (op_errstr) {
+                GF_FREE(op_errstr);
+                op_errstr = NULL;
+            }
+            if (ret && brickinfo->status == GF_BRICK_STARTED) {
+                gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_BRICK_STOP_FAIL,
+                       "Failed to kill"
+                       " the brick %s",
+                       brickinfo->path);
+                goto out;
+            }
+
+            if (is_brick_mx_enabled()) {
+                /* In case of brick multiplexing we need to make
+                 * sure the port is cleaned up from here as the
+                 * RPC connection may not have been originated
+                 * for the same brick instance
+                 */
+                pmap_registry_remove(THIS, brickinfo->port, brickinfo->path,
+                                     GF_PMAP_PORT_BRICKSERVER, NULL, _gf_true);
+            }
+        }
+
+        (void)glusterd_brick_disconnect(brickinfo);
+        ret = 0;
+    }
+
+    GLUSTERD_GET_BRICK_PIDFILE(pidfile, volinfo, brickinfo, conf);
+    gf_msg_debug(this->name, 0, "Unlinking pidfile %s", pidfile);
+    (void)sys_unlink(pidfile);
+
+    brickinfo->status = GF_BRICK_STOPPED;
+    brickinfo->start_triggered = _gf_false;
+    brickinfo->brick_proc = NULL;
+    if (del_brick)
+        glusterd_delete_brick(volinfo, brickinfo);
+out:
+    return ret;
+}
+
+/*
+ * Release the first @n strings stored in @line, then the @line array
+ * itself.
+ */
+static void
+free_lines(char **line, size_t n)
+{
+    size_t idx = 0;
+
+    while (idx < n) {
+        GF_FREE(line[idx]);
+        idx++;
+    }
+
+    GF_FREE(line);
+}
+
+/*
+ * Read @filepath line by line into a freshly allocated, NULL-terminated
+ * array of strings.
+ *
+ * @filepath:   file to read
+ * @line_count: set to the number of lines read, on success only
+ *
+ * Returns the array on success; the caller owns the array and every
+ * string in it.  Returns NULL (after logging errno) if the file cannot
+ * be opened or memory runs out.  Lines longer than the internal buffer
+ * are split by fgets() into several entries.
+ */
+static char **
+glusterd_readin_file(const char *filepath, int *line_count)
+{
+    int ret = -1;
+    int n = 8;
+    int counter = 0;
+    char buffer[PATH_MAX + 256] = "";
+    char **lines = NULL;
+    FILE *fp = NULL;
+    void *p;
+
+    fp = fopen(filepath, "r");
+    if (!fp)
+        goto out;
+
+    lines = GF_CALLOC(1, n * sizeof(*lines), gf_gld_mt_charptr);
+    if (!lines)
+        goto out;
+
+    for (counter = 0; fgets(buffer, sizeof(buffer), fp); counter++) {
+        /* Keep one spare slot for the terminating NULL entry. */
+        if (counter == n - 1) {
+            p = GF_REALLOC(lines, 2 * n * sizeof(char *));
+            if (!p) {
+                /* Only lines[0..counter-1] have been filled in; the
+                 * remaining slots are not guaranteed to be initialised
+                 * and must not be freed. */
+                free_lines(lines, counter);
+                lines = NULL;
+                goto out;
+            }
+            lines = p;
+            n *= 2;
+        }
+
+        lines[counter] = gf_strdup(buffer);
+        if (!lines[counter]) {
+            /* A NULL entry would truncate the array for consumers that
+             * stop at NULL, and crash the string comparison in the sort. */
+            free_lines(lines, counter);
+            lines = NULL;
+            goto out;
+        }
+    }
+
+    lines[counter] = NULL;
+    /* Reduce allocation to minimal size.  */
+    p = GF_REALLOC(lines, (counter + 1) * sizeof(char *));
+    if (!p) {
+        /* coverity[TAINTED_SCALAR] */
+        free_lines(lines, counter);
+        lines = NULL;
+        goto out;
+    }
+    lines = p;
+
+    *line_count = counter;
+    ret = 0;
+
+out:
+    if (ret)
+        gf_msg(THIS->name, GF_LOG_ERROR, errno, GD_MSG_READIN_FILE_FAILED, "%s",
+               strerror(errno));
+    if (fp)
+        fclose(fp);
+
+    return lines;
+}
+
+/*
+ * qsort()-style comparator for an array of C strings: @a and @b each
+ * point at a `char *` element and are ordered with strcmp().
+ */
+int
+glusterd_compare_lines(const void *a, const void *b)
+{
+    const char *lhs = *(char *const *)a;
+    const char *rhs = *(char *const *)b;
+
+    return strcmp(lhs, rhs);
+}
+
+/*
+ * Read @src_filepath, sort its lines with glusterd_compare_lines() and
+ * write them, in sorted order, to @dest_fd.
+ *
+ * Returns 0 on success.  Returns -1 on invalid arguments or when the
+ * source cannot be read, and the negative sys_write() result when a
+ * write fails.  All memory obtained for the lines is released on every
+ * path.
+ */
+static int
+glusterd_sort_and_redirect(const char *src_filepath, int dest_fd)
+{
+    int ret = -1;
+    int line_count = 0;
+    int counter = 0;
+    char **lines = NULL;
+
+    if (!src_filepath || dest_fd < 0)
+        goto out;
+
+    lines = glusterd_readin_file(src_filepath, &line_count);
+    if (!lines)
+        goto out;
+
+    qsort(lines, line_count, sizeof(*lines), glusterd_compare_lines);
+
+    for (counter = 0; counter < line_count && lines[counter]; counter++) {
+        ret = sys_write(dest_fd, lines[counter], strlen(lines[counter]));
+        if (ret < 0)
+            goto out;
+    }
+
+    ret = 0;
+out:
+    /* Free every line here rather than inside the loop, so the lines not
+     * yet written are released too when a write fails. */
+    if (lines)
+        free_lines(lines, line_count);
+
+    return ret;
+}
+
+/*
+ * Compute a checksum for a volume's info or quota configuration file.
+ *
+ * @volinfo:       volume being processed (its name is used for the
+ *                 temporary file and in log messages)
+ * @cksum_path:    checksum file; opened with O_CREAT | O_TRUNC
+ * @filepath:      file whose contents are checksummed
+ * @is_quota_conf: selects the quota-conf flavour (no sorting step)
+ * @cs:            receives the resulting checksum on success
+ *
+ * For the info file, the lines are sorted into a temporary file, that
+ * file's checksum is written to @cksum_path as "info=<n>", and the value
+ * stored in *cs is the one get_checksum_for_file() returns for the
+ * @cksum_path descriptor.
+ *
+ * Returns 0 on success and non-zero on failure.  The temporary file and
+ * both descriptors are released on every path.
+ */
+static int
+glusterd_volume_compute_cksum(glusterd_volinfo_t *volinfo, char *cksum_path,
+                              char *filepath, gf_boolean_t is_quota_conf,
+                              uint32_t *cs)
+{
+    int32_t ret = -1;
+    uint32_t cksum = 0;
+    int fd = -1;
+    int sort_fd = -1;
+    char sort_filepath[PATH_MAX] = "";
+    char buf[32];
+    gf_boolean_t unlink_sortfile = _gf_false;
+    glusterd_conf_t *priv = THIS->private;
+    xlator_t *this = THIS;
+    mode_t orig_umask = 0;
+
+    GF_ASSERT(volinfo);
+    GF_ASSERT(priv);
+
+    fd = open(cksum_path, O_RDWR | O_APPEND | O_CREAT | O_TRUNC, 0600);
+    if (-1 == fd) {
+        gf_msg(this->name, GF_LOG_ERROR, errno, GD_MSG_FILE_OP_FAILED,
+               "Unable to open %s,"
+               " errno: %d",
+               cksum_path, errno);
+        ret = -1;
+        goto out;
+    }
+
+    if (!is_quota_conf) {
+        snprintf(sort_filepath, sizeof(sort_filepath), "/tmp/%s.XXXXXX",
+                 volinfo->volname);
+
+        /* Create the temporary file without group/other permissions. */
+        orig_umask = umask(S_IRWXG | S_IRWXO);
+        sort_fd = mkstemp(sort_filepath);
+        umask(orig_umask);
+        if (-1 == sort_fd) {
+            gf_msg(this->name, GF_LOG_ERROR, errno, GD_MSG_FILE_OP_FAILED,
+                   "Could not generate "
+                   "temp file, reason: %s for volume: %s",
+                   strerror(errno), volinfo->volname);
+            goto out;
+        } else {
+            unlink_sortfile = _gf_true;
+        }
+
+        /* sort the info file, result in sort_filepath */
+
+        ret = glusterd_sort_and_redirect(filepath, sort_fd);
+        if (ret) {
+            gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_FILE_OP_FAILED,
+                   "sorting info file "
+                   "failed");
+            goto out;
+        }
+
+        ret = sys_close(sort_fd);
+        /* Whatever close reported, the descriptor must not be closed a
+         * second time in the cleanup path. */
+        sort_fd = -1;
+        if (ret)
+            goto out;
+
+        ret = get_checksum_for_path(sort_filepath, &cksum, priv->op_version);
+        if (ret) {
+            gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_CKSUM_GET_FAIL,
+                   "unable to get "
+                   "checksum for path: %s",
+                   sort_filepath);
+            goto out;
+        }
+
+        ret = snprintf(buf, sizeof(buf), "info=%u\n", cksum);
+        ret = sys_write(fd, buf, ret);
+        if (ret <= 0) {
+            ret = -1;
+            goto out;
+        }
+    } else if (priv->op_version < GD_OP_VERSION_7_0) {
+        ret = get_checksum_for_path(filepath, &cksum, priv->op_version);
+        if (ret) {
+            gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_CKSUM_GET_FAIL,
+                   "unable to get "
+                   "checksum for path: %s",
+                   filepath);
+            goto out;
+        }
+    }
+
+    ret = get_checksum_for_file(fd, &cksum, priv->op_version);
+    if (ret) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_CKSUM_GET_FAIL,
+               "unable to get checksum for path: %s", filepath);
+        goto out;
+    }
+
+    *cs = cksum;
+
+out:
+    if (fd != -1)
+        sys_close(fd);
+    /* Still open only when we bailed out before the regular close. */
+    if (sort_fd != -1)
+        sys_close(sort_fd);
+    if (unlink_sortfile)
+        sys_unlink(sort_filepath);
+    gf_msg_debug(this->name, 0, "Returning with %d", ret);
+
+    return ret;
+}
+
+/*
+ * Compute and cache the checksum of a volume's info file or, when
+ * @is_quota_conf is set, of its quota configuration.
+ *
+ * The result is stored in volinfo->cksum or volinfo->quota_conf_cksum
+ * respectively.  Returns 0 on success, non-zero on failure (path
+ * truncation or a failure reported by glusterd_volume_compute_cksum()).
+ */
+int
+glusterd_compute_cksum(glusterd_volinfo_t *volinfo, gf_boolean_t is_quota_conf)
+{
+    int ret = -1;
+    uint32_t cs = 0;
+    char cksum_path[PATH_MAX] = "";
+    char path[PATH_MAX] = "";
+    char filepath[PATH_MAX] = "";
+    glusterd_conf_t *conf = NULL;
+    xlator_t *this = THIS;
+    const char *cksum_name = NULL;
+    const char *src_name = NULL;
+    int32_t len1 = 0;
+    int32_t len2 = 0;
+
+    GF_ASSERT(this);
+    conf = this->private;
+    GF_ASSERT(conf);
+
+    GLUSTERD_GET_VOLUME_DIR(path, volinfo, conf);
+
+    /* Pick the checksum file and the file it describes. */
+    if (is_quota_conf) {
+        cksum_name = GLUSTERD_VOL_QUOTA_CKSUM_FILE;
+        src_name = GLUSTERD_VOLUME_QUOTA_CONFIG;
+    } else {
+        cksum_name = GLUSTERD_CKSUM_FILE;
+        src_name = GLUSTERD_VOLUME_INFO_FILE;
+    }
+
+    len1 = snprintf(cksum_path, sizeof(cksum_path), "%s/%s", path, cksum_name);
+    len2 = snprintf(filepath, sizeof(filepath), "%s/%s", path, src_name);
+    if ((len1 < 0) || (len2 < 0) || (len1 >= sizeof(cksum_path)) ||
+        (len2 >= sizeof(filepath))) {
+        goto out;
+    }
+
+    ret = glusterd_volume_compute_cksum(volinfo, cksum_path, filepath,
+                                        is_quota_conf, &cs);
+    if (ret) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_CKSUM_COMPUTE_FAIL,
+               "Failed to compute checksum "
+               "for volume %s",
+               volinfo->volname);
+        goto out;
+    }
+
+    if (!is_quota_conf)
+        volinfo->cksum = cs;
+    else
+        volinfo->quota_conf_cksum = cs;
+
+    ret = 0;
+out:
+    return ret;
+}
+
+/*
+ * Dictionary callback: copy one option (@key / @value) into ctx->dict
+ * under the keys "<prefix>.<key_name><n>" and "<prefix>.<val_name><n>",
+ * where n is ctx->opt_count, then advance ctx->opt_count.
+ *
+ * Returns -1 when a generated key does not fit the local buffer,
+ * otherwise the result of storing the value.  A failure to store the
+ * option name is logged but does not affect the return value.
+ */
+static int
+_add_dict_to_prdict(dict_t *this, char *key, data_t *value, void *data)
+{
+    glusterd_dict_ctx_t *ctx = data;
+    char optkey[64]; /* optkey are usually quite small */
+    int keylen = -1;
+    int ret = -1;
+
+    /* First entry: the option's name. */
+    keylen = snprintf(optkey, sizeof(optkey), "%s.%s%d", ctx->prefix,
+                      ctx->key_name, ctx->opt_count);
+    if (keylen < 0 || keylen >= sizeof(optkey))
+        return -1;
+
+    if (dict_set_strn(ctx->dict, optkey, keylen, key) != 0)
+        gf_msg("glusterd", GF_LOG_ERROR, 0, GD_MSG_DICT_SET_FAILED,
+               "option add for %s%d %s", ctx->key_name, ctx->opt_count, key);
+
+    /* Second entry: the option's value. */
+    keylen = snprintf(optkey, sizeof(optkey), "%s.%s%d", ctx->prefix,
+                      ctx->val_name, ctx->opt_count);
+    if (keylen < 0 || keylen >= sizeof(optkey))
+        return -1;
+
+    ret = dict_set_strn(ctx->dict, optkey, keylen, value->data);
+    if (ret != 0)
+        gf_msg("glusterd", GF_LOG_ERROR, 0, GD_MSG_DICT_SET_FAILED,
+               "option add for %s%d %s", ctx->val_name, ctx->opt_count,
+               value->data);
+
+    ctx->opt_count++;
+    return ret;
+}
+
+/*
+ * Store the hostname and path of every brick of @volinfo in @dict under
+ * the keys "<i>-hostname" and "<i>-path", with i counting from 0 in list
+ * order.
+ *
+ * Returns 0 on success (also for a volume without bricks) or the first
+ * non-zero dict_set_strn() result.
+ */
+int32_t
+glusterd_add_bricks_hname_path_to_dict(dict_t *dict,
+                                       glusterd_volinfo_t *volinfo)
+{
+    glusterd_brickinfo_t *brickinfo = NULL;
+    char key[64] = "";
+    int keylen = 0;
+    int brick_idx = 0;
+    int ret = 0;
+
+    cds_list_for_each_entry(brickinfo, &volinfo->bricks, brick_list)
+    {
+        keylen = snprintf(key, sizeof(key), "%d-hostname", brick_idx);
+        ret = dict_set_strn(dict, key, keylen, brickinfo->hostname);
+        if (ret != 0) {
+            gf_smsg("glusterd", GF_LOG_ERROR, errno, GD_MSG_DICT_SET_FAILED,
+                    "Key=%s", key, NULL);
+            return ret;
+        }
+
+        keylen = snprintf(key, sizeof(key), "%d-path", brick_idx);
+        ret = dict_set_strn(dict, key, keylen, brickinfo->path);
+        if (ret != 0) {
+            gf_smsg("glusterd", GF_LOG_ERROR, errno, GD_MSG_DICT_SET_FAILED,
+                    "Key=%s", key, NULL);
+            return ret;
+        }
+
+        brick_idx++;
+    }
+
+    return ret;
+}
+
+/* Export one volume's metadata into a flat dictionary for peer exchange.
+ *
+ * Every key is namespaced as "<prefix><count>.<field>". The prefix
+ * represents the type of volume to be added: it will be "volume" for
+ * normal volumes, and snap# like snap1, snap2, for snapshot volumes.
+ *
+ * @volinfo: volume whose attributes, options, geo-rep slaves and bricks
+ *           are exported
+ * @dict:    destination dictionary
+ * @count:   index of the volume, appended to @prefix
+ * @prefix:  key namespace, see above
+ *
+ * Returns 0 on success and non-zero on failure. On failure the key that
+ * was being processed last is logged.
+ */
+int32_t
+glusterd_add_volume_to_dict(glusterd_volinfo_t *volinfo, dict_t *dict,
+                            int32_t count, char *prefix)
+{
+    int32_t ret = -1;
+    char pfx[32] = ""; /* prefix should be quite small */
+    char key[64] = "";
+    int keylen;
+    glusterd_brickinfo_t *brickinfo = NULL;
+    glusterd_brickinfo_t *ta_brickinfo = NULL;
+    int32_t i = 1;
+    char *volume_id_str = NULL;
+    char *str = NULL;
+    glusterd_dict_ctx_t ctx = {0};
+    char *rebalance_id_str = NULL;
+    xlator_t *this = NULL;
+
+    this = THIS;
+    GF_ASSERT(this);
+    GF_ASSERT(dict);
+    GF_ASSERT(volinfo);
+    GF_ASSERT(prefix);
+
+    /* Refuse a prefix that does not fit: a truncated namespace could make
+     * two different volumes share keys. */
+    ret = snprintf(pfx, sizeof(pfx), "%s%d", prefix, count);
+    if (ret < 0 || (size_t)ret >= sizeof(pfx)) {
+        ret = -1;
+        goto out;
+    }
+
+    /* Scalar volume attributes. */
+    keylen = snprintf(key, sizeof(key), "%s.name", pfx);
+    ret = dict_set_strn(dict, key, keylen, volinfo->volname);
+    if (ret)
+        goto out;
+
+    keylen = snprintf(key, sizeof(key), "%s.type", pfx);
+    ret = dict_set_int32n(dict, key, keylen, volinfo->type);
+    if (ret)
+        goto out;
+
+    keylen = snprintf(key, sizeof(key), "%s.brick_count", pfx);
+    ret = dict_set_int32n(dict, key, keylen, volinfo->brick_count);
+    if (ret)
+        goto out;
+
+    keylen = snprintf(key, sizeof(key), "%s.version", pfx);
+    ret = dict_set_int32n(dict, key, keylen, volinfo->version);
+    if (ret)
+        goto out;
+
+    keylen = snprintf(key, sizeof(key), "%s.status", pfx);
+    ret = dict_set_int32n(dict, key, keylen, volinfo->status);
+    if (ret)
+        goto out;
+
+    keylen = snprintf(key, sizeof(key), "%s.sub_count", pfx);
+    ret = dict_set_int32n(dict, key, keylen, volinfo->sub_count);
+    if (ret)
+        goto out;
+
+    keylen = snprintf(key, sizeof(key), "%s.subvol_count", pfx);
+    ret = dict_set_int32n(dict, key, keylen, volinfo->subvol_count);
+    if (ret)
+        goto out;
+
+    keylen = snprintf(key, sizeof(key), "%s.stripe_count", pfx);
+    ret = dict_set_int32n(dict, key, keylen, volinfo->stripe_count);
+    if (ret)
+        goto out;
+
+    keylen = snprintf(key, sizeof(key), "%s.replica_count", pfx);
+    ret = dict_set_int32n(dict, key, keylen, volinfo->replica_count);
+    if (ret)
+        goto out;
+
+    keylen = snprintf(key, sizeof(key), "%s.arbiter_count", pfx);
+    ret = dict_set_int32n(dict, key, keylen, volinfo->arbiter_count);
+    if (ret)
+        goto out;
+
+    keylen = snprintf(key, sizeof(key), "%s.thin_arbiter_count", pfx);
+    ret = dict_set_int32n(dict, key, keylen, volinfo->thin_arbiter_count);
+    if (ret)
+        goto out;
+
+    keylen = snprintf(key, sizeof(key), "%s.disperse_count", pfx);
+    ret = dict_set_int32n(dict, key, keylen, volinfo->disperse_count);
+    if (ret)
+        goto out;
+
+    keylen = snprintf(key, sizeof(key), "%s.redundancy_count", pfx);
+    ret = dict_set_int32n(dict, key, keylen, volinfo->redundancy_count);
+    if (ret)
+        goto out;
+
+    keylen = snprintf(key, sizeof(key), "%s.dist_count", pfx);
+    ret = dict_set_int32n(dict, key, keylen, volinfo->dist_leaf_count);
+    if (ret)
+        goto out;
+
+    /* The misspelt "ckusm" is the key name used on the wire;
+     * glusterd_compare_friend_volume() looks up the same spelling, so it
+     * must not be "corrected" on one side only. */
+    snprintf(key, sizeof(key), "%s.ckusm", pfx);
+    ret = dict_set_int64(dict, key, volinfo->cksum);
+    if (ret)
+        goto out;
+
+    snprintf(key, sizeof(key), "%s.transport_type", pfx);
+    ret = dict_set_uint32(dict, key, volinfo->transport_type);
+    if (ret)
+        goto out;
+
+    snprintf(key, sizeof(key), "%s.stage_deleted", pfx);
+    ret = dict_set_uint32(dict, key, (uint32_t)volinfo->stage_deleted);
+    if (ret)
+        goto out;
+
+    ret = gd_add_vol_snap_details_to_dict(dict, pfx, volinfo);
+    if (ret) {
+        gf_smsg(this->name, GF_LOG_ERROR, errno, GD_MSG_DICT_SET_FAILED,
+                "vol snap details", NULL);
+        goto out;
+    }
+
+    /* Volume id. The id is binary, so it is always rendered with
+     * uuid_utoa() before being handed to a "%s" conversion. */
+    volume_id_str = gf_strdup(uuid_utoa(volinfo->volume_id));
+    if (!volume_id_str) {
+        gf_smsg(this->name, GF_LOG_ERROR, errno, GD_MSG_STRDUP_FAILED,
+                "volume id=%s", uuid_utoa(volinfo->volume_id), NULL);
+        ret = -1;
+        goto out;
+    }
+    keylen = snprintf(key, sizeof(key), "%s.volume_id", pfx);
+    ret = dict_set_dynstrn(dict, key, keylen, volume_id_str);
+    if (ret)
+        goto out;
+    /* Stored in the dict now; must not be freed at 'out'. */
+    volume_id_str = NULL;
+
+    /* Credentials are optional and exported only when present. */
+    keylen = snprintf(key, sizeof(key), "%s.username", pfx);
+    str = glusterd_auth_get_username(volinfo);
+    if (str) {
+        ret = dict_set_dynstrn(dict, key, keylen, gf_strdup(str));
+        if (ret)
+            goto out;
+    }
+
+    keylen = snprintf(key, sizeof(key), "%s.password", pfx);
+    str = glusterd_auth_get_password(volinfo);
+    if (str) {
+        ret = dict_set_dynstrn(dict, key, keylen, gf_strdup(str));
+        if (ret)
+            goto out;
+    }
+
+    /* Rebalance state. */
+    keylen = snprintf(key, sizeof(key), "%s.rebalance", pfx);
+    ret = dict_set_int32n(dict, key, keylen, volinfo->rebal.defrag_cmd);
+    if (ret)
+        goto out;
+
+    rebalance_id_str = gf_strdup(uuid_utoa(volinfo->rebal.rebalance_id));
+    if (!rebalance_id_str) {
+        gf_smsg(this->name, GF_LOG_ERROR, errno, GD_MSG_STRDUP_FAILED,
+                "rebalance_id=%s", uuid_utoa(volinfo->rebal.rebalance_id),
+                NULL);
+        ret = -1;
+        goto out;
+    }
+    keylen = snprintf(key, sizeof(key), "%s.rebalance-id", pfx);
+    ret = dict_set_dynstrn(dict, key, keylen, rebalance_id_str);
+    if (ret)
+        goto out;
+    /* Stored in the dict now; must not be freed at 'out'. */
+    rebalance_id_str = NULL;
+
+    snprintf(key, sizeof(key), "%s.rebalance-op", pfx);
+    ret = dict_set_uint32(dict, key, volinfo->rebal.op);
+    if (ret)
+        goto out;
+
+    if (volinfo->rebal.dict) {
+        ctx.dict = dict;
+        ctx.prefix = pfx;
+        ctx.opt_count = 1;
+        ctx.key_name = "rebal-dict-key";
+        ctx.val_name = "rebal-dict-value";
+
+        dict_foreach(volinfo->rebal.dict, _add_dict_to_prdict, &ctx);
+        /* opt_count starts at 1, so one less is the number exported. */
+        ctx.opt_count--;
+        /* NOTE(review): unlike every other key in this function this one is
+         * built from the literal "volume" and count instead of pfx. Left
+         * untouched because the key is part of the exchange format; confirm
+         * against the importing side before changing it. */
+        keylen = snprintf(key, sizeof(key), "volume%d.rebal-dict-count", count);
+        ret = dict_set_int32n(dict, key, keylen, ctx.opt_count);
+        if (ret)
+            goto out;
+    }
+
+    /* Volume options, flattened as <pfx>.key<N> / <pfx>.value<N>. */
+    ctx.dict = dict;
+    ctx.prefix = pfx;
+    ctx.opt_count = 1;
+    ctx.key_name = "key";
+    ctx.val_name = "value";
+    GF_ASSERT(volinfo->dict);
+
+    dict_foreach(volinfo->dict, _add_dict_to_prdict, &ctx);
+    ctx.opt_count--;
+    keylen = snprintf(key, sizeof(key), "%s.opt-count", pfx);
+    ret = dict_set_int32n(dict, key, keylen, ctx.opt_count);
+    if (ret)
+        goto out;
+
+    /* Geo-replication slaves, flattened the same way. */
+    ctx.dict = dict;
+    ctx.prefix = pfx;
+    ctx.opt_count = 1;
+    ctx.key_name = "slave-num";
+    ctx.val_name = "slave-val";
+    GF_ASSERT(volinfo->gsync_slaves);
+
+    dict_foreach(volinfo->gsync_slaves, _add_dict_to_prdict, &ctx);
+    ctx.opt_count--;
+
+    keylen = snprintf(key, sizeof(key), "%s.gsync-count", pfx);
+    ret = dict_set_int32n(dict, key, keylen, ctx.opt_count);
+    if (ret)
+        goto out;
+
+    /* Bricks are numbered from 1. */
+    cds_list_for_each_entry(brickinfo, &volinfo->bricks, brick_list)
+    {
+        keylen = snprintf(key, sizeof(key), "%s.brick%d.hostname", pfx, i);
+        ret = dict_set_strn(dict, key, keylen, brickinfo->hostname);
+        if (ret)
+            goto out;
+
+        keylen = snprintf(key, sizeof(key), "%s.brick%d.path", pfx, i);
+        ret = dict_set_strn(dict, key, keylen, brickinfo->path);
+        if (ret)
+            goto out;
+
+        keylen = snprintf(key, sizeof(key), "%s.brick%d.decommissioned", pfx,
+                          i);
+        ret = dict_set_int32n(dict, key, keylen, brickinfo->decommissioned);
+        if (ret)
+            goto out;
+
+        keylen = snprintf(key, sizeof(key), "%s.brick%d.brick_id", pfx, i);
+        ret = dict_set_strn(dict, key, keylen, brickinfo->brick_id);
+        if (ret)
+            goto out;
+
+        snprintf(key, sizeof(key), "%s.brick%d.uuid", pfx, i);
+        ret = dict_set_dynstr_with_alloc(dict, key, uuid_utoa(brickinfo->uuid));
+        if (ret)
+            goto out;
+
+        snprintf(key, sizeof(key), "%s.brick%d", pfx, i);
+        ret = gd_add_brick_snap_details_to_dict(dict, key, brickinfo);
+        if (ret)
+            goto out;
+
+        i++;
+    }
+
+    /* Thin-arbiter bricks get their own 1-based numbering. */
+    i = 1;
+    if (volinfo->thin_arbiter_count == 1) {
+        cds_list_for_each_entry(ta_brickinfo, &volinfo->ta_bricks, brick_list)
+        {
+            keylen = snprintf(key, sizeof(key), "%s.ta-brick%d.hostname", pfx,
+                              i);
+            ret = dict_set_strn(dict, key, keylen, ta_brickinfo->hostname);
+            if (ret)
+                goto out;
+
+            keylen = snprintf(key, sizeof(key), "%s.ta-brick%d.path", pfx, i);
+            ret = dict_set_strn(dict, key, keylen, ta_brickinfo->path);
+            if (ret)
+                goto out;
+
+            keylen = snprintf(key, sizeof(key), "%s.ta-brick%d.decommissioned",
+                              pfx, i);
+            ret = dict_set_int32n(dict, key, keylen,
+                                  ta_brickinfo->decommissioned);
+            if (ret)
+                goto out;
+
+            keylen = snprintf(key, sizeof(key), "%s.ta-brick%d.brick_id", pfx,
+                              i);
+            ret = dict_set_strn(dict, key, keylen, ta_brickinfo->brick_id);
+            if (ret)
+                goto out;
+
+            snprintf(key, sizeof(key), "%s.ta-brick%d.uuid", pfx, i);
+            ret = dict_set_dynstr_with_alloc(dict, key,
+                                             uuid_utoa(ta_brickinfo->uuid));
+            if (ret)
+                goto out;
+
+            i++;
+        }
+    }
+
+    /* Add volume op-versions to dict. This prevents volume inconsistencies
+     * in the cluster
+     */
+    keylen = snprintf(key, sizeof(key), "%s.op-version", pfx);
+    ret = dict_set_int32n(dict, key, keylen, volinfo->op_version);
+    if (ret)
+        goto out;
+    keylen = snprintf(key, sizeof(key), "%s.client-op-version", pfx);
+    ret = dict_set_int32n(dict, key, keylen, volinfo->client_op_version);
+    if (ret)
+        goto out;
+
+    keylen = snprintf(key, sizeof(key), "%s.quota-xattr-version", pfx);
+    ret = dict_set_int32n(dict, key, keylen, volinfo->quota_xattr_version);
+out:
+    /* Both pointers are NULL unless handing them to the dict failed. */
+    GF_FREE(volume_id_str);
+    GF_FREE(rebalance_id_str);
+
+    /* key is still empty if the failure happened before any key was built. */
+    if (key[0] != '\0' && ret != 0)
+        gf_smsg(this->name, GF_LOG_ERROR, errno, GD_MSG_DICT_SET_FAILED,
+                "Key=%s", key, NULL);
+    gf_msg_debug(this->name, 0, "Returning with %d", ret);
+    return ret;
+}
+
+/* Export a volume's quota configuration into the dictionary 'load'.
+ *
+ * Keys are namespaced as "<prefix><vol_idx>.<field>". The prefix represents
+ * the type of volume to be added: it will be "volume" for normal volumes,
+ * and snap# like snap1, snap2, for snapshot volumes.
+ *
+ * Each gfid read from the quota configuration file is stored under
+ * "gfid<N>" together with its "gfid-type<N>", followed by "gfid-count",
+ * "quota-cksum" and "quota-version".
+ *
+ * Returns 0 on success and non-zero on failure.
+ */
+int
+glusterd_vol_add_quota_conf_to_dict(glusterd_volinfo_t *volinfo, dict_t *load,
+                                    int vol_idx, char *prefix)
+{
+    xlator_t *this = NULL;
+    char vol_pfx[32];
+    char key[64];
+    unsigned char gfid[16] = "";
+    char gfid_type = 0;
+    float conf_version = 0.0f;
+    int conf_fd = -1;
+    int gfid_total = 0;
+    int keylen = 0;
+    int ret = -1;
+
+    this = THIS;
+    GF_ASSERT(this);
+    GF_ASSERT(prefix);
+
+    ret = glusterd_store_create_quota_conf_sh_on_absence(volinfo);
+    if (ret)
+        goto out;
+
+    conf_fd = open(volinfo->quota_conf_shandle->path, O_RDONLY);
+    if (conf_fd == -1) {
+        ret = -1;
+        goto out;
+    }
+
+    ret = quota_conf_read_version(conf_fd, &conf_version);
+    if (ret)
+        goto out;
+
+    ret = snprintf(vol_pfx, sizeof(vol_pfx), "%s%d", prefix, vol_idx);
+    if (ret < 0 || ret >= sizeof(vol_pfx)) {
+        ret = -1;
+        goto out;
+    }
+
+    /* A positive return means a record was read, zero means end of the
+     * configuration, negative means the read failed. */
+    while ((ret = quota_conf_read_gfid(conf_fd, gfid, &gfid_type,
+                                       conf_version)) > 0) {
+        snprintf(key, sizeof(key) - 1, "%s.gfid%d", vol_pfx, gfid_total);
+        ret = dict_set_dynstr_with_alloc(load, key, uuid_utoa(gfid));
+        if (ret) {
+            gf_smsg(this->name, GF_LOG_ERROR, errno, GD_MSG_DICT_SET_FAILED,
+                    "Key=%s", key, NULL);
+            goto out;
+        }
+
+        snprintf(key, sizeof(key) - 1, "%s.gfid-type%d", vol_pfx, gfid_total);
+        ret = dict_set_int8(load, key, gfid_type);
+        if (ret) {
+            gf_smsg(this->name, GF_LOG_ERROR, errno, GD_MSG_DICT_SET_FAILED,
+                    "Key=%s", key, NULL);
+            goto out;
+        }
+
+        gfid_total++;
+    }
+    if (ret < 0) {
+        gf_msg(this->name, GF_LOG_CRITICAL, 0, GD_MSG_QUOTA_CONF_CORRUPT,
+               "Quota "
+               "configuration store may be corrupt.");
+        goto out;
+    }
+
+    keylen = snprintf(key, sizeof(key), "%s.gfid-count", vol_pfx);
+    ret = dict_set_int32n(load, key, keylen, gfid_total);
+    if (ret) {
+        gf_smsg(this->name, GF_LOG_ERROR, errno, GD_MSG_DICT_SET_FAILED,
+                "Key=%s", key, NULL);
+        goto out;
+    }
+
+    snprintf(key, sizeof(key), "%s.quota-cksum", vol_pfx);
+    ret = dict_set_uint32(load, key, volinfo->quota_conf_cksum);
+    if (ret) {
+        gf_smsg(this->name, GF_LOG_ERROR, errno, GD_MSG_DICT_SET_FAILED,
+                "Key=%s", key, NULL);
+        goto out;
+    }
+
+    snprintf(key, sizeof(key), "%s.quota-version", vol_pfx);
+    ret = dict_set_uint32(load, key, volinfo->quota_conf_version);
+    if (ret) {
+        gf_smsg(this->name, GF_LOG_ERROR, errno, GD_MSG_DICT_SET_FAILED,
+                "Key=%s", key, NULL);
+        goto out;
+    }
+
+    ret = 0;
+out:
+    /* The descriptor is only valid once open() has succeeded. */
+    if (conf_fd != -1)
+        sys_close(conf_fd);
+    return ret;
+}
+
+/* Thread entry point: export a contiguous slice of the volume list.
+ *
+ * 'data' is a glusterd_add_dict_args_t. Volumes whose 1-based position in
+ * priv->volumes lies in [start, end] are written to arg->voldict, followed
+ * by their quota configuration when the quota feature key is set on the
+ * volume. Processing stops at the first volume that fails.
+ *
+ * Before returning, the thread decrements priv->thread_count and frees
+ * 'data'. The return value is always NULL; a failure is not reported to
+ * the creator of the thread.
+ */
+void *
+glusterd_add_bulk_volumes_create_thread(void *data)
+{
+    glusterd_add_dict_args_t *arg = NULL;
+    glusterd_volinfo_t *volinfo = NULL;
+    glusterd_conf_t *priv = NULL;
+    xlator_t *this = NULL;
+    dict_t *dict = NULL;
+    int32_t ret = -1;
+    int32_t count = 0;
+    int start = 0;
+    int end = 0;
+
+    GF_ASSERT(data);
+
+    arg = data;
+    dict = arg->voldict;
+    start = arg->start;
+    end = arg->end;
+    this = arg->this;
+    /* A fresh thread has to set THIS itself. */
+    THIS = arg->this;
+    priv = this->private;
+    GF_ASSERT(priv);
+
+    cds_list_for_each_entry(volinfo, &priv->volumes, vol_list)
+    {
+        count++;
+
+        /* Volumes before the start index belong to another thread. */
+        if (count < start)
+            continue;
+
+        /* Nothing past the end index belongs to this thread either. */
+        if (count > end)
+            break;
+
+        ret = glusterd_add_volume_to_dict(volinfo, dict, count, "volume");
+        if (ret)
+            break;
+
+        if (dict_get_sizen(volinfo->dict, VKEY_FEATURES_QUOTA)) {
+            ret = glusterd_vol_add_quota_conf_to_dict(volinfo, dict, count,
+                                                      "volume");
+            if (ret)
+                break;
+        }
+    }
+
+    GF_ATOMIC_DEC(priv->thread_count);
+    free(arg);
+    return NULL;
+}
+
+/* Serialize several dictionaries into 'buf' as if they were a single one.
+ *
+ * Layout written: one header holding 'totcount', then for every pair of
+ * every non-NULL dictionary a key-length field, a value-length field, the
+ * NUL-terminated key and the raw value bytes.
+ *
+ * @dict_arr: array of 'count' dictionaries; NULL entries are skipped
+ * @totcount: total number of pairs, stored in the header
+ * @buf:      destination; the caller must have sized it for the whole
+ *            output, no bound is checked here
+ *
+ * Every non-NULL entry of dict_arr is unref'ed before returning, on
+ * success and on failure alike.
+ *
+ * Returns 0 on success, -1 if a dictionary is inconsistent.
+ */
+int
+glusterd_dict_searialize(dict_t *dict_arr[], int count, int totcount, char *buf)
+{
+    int i = 0;
+    int32_t keylen = 0;
+    /* Must be exactly as wide as the hton32() result: copying
+     * sizeof(netword) bytes from a wider integer would spill into the bytes
+     * that follow each length field (and past the end of the buffer for a
+     * short final pair), and on a big-endian host would put the zero half
+     * of the integer into the field itself. */
+    int32_t netword = 0;
+    data_pair_t *pair = NULL;
+    int dict_count = 0;
+    int ret = 0;
+
+    netword = hton32(totcount);
+    memcpy(buf, &netword, sizeof(netword));
+    buf += DICT_HDR_LEN;
+
+    for (i = 0; i < count; i++) {
+        if (dict_arr[i]) {
+            dict_count = dict_arr[i]->count;
+            pair = dict_arr[i]->members_list;
+            while (dict_count) {
+                /* The list ended before 'count' pairs were seen. */
+                if (!pair) {
+                    gf_msg("glusterd", GF_LOG_ERROR, 0,
+                           LG_MSG_PAIRS_LESS_THAN_COUNT,
+                           "less than count data pairs found!");
+                    ret = -1;
+                    goto out;
+                }
+
+                if (!pair->key) {
+                    gf_msg("glusterd", GF_LOG_ERROR, 0, LG_MSG_NULL_PTR,
+                           "pair->key is null!");
+                    ret = -1;
+                    goto out;
+                }
+
+                keylen = strlen(pair->key);
+                netword = hton32(keylen);
+                memcpy(buf, &netword, sizeof(netword));
+                buf += DICT_DATA_HDR_KEY_LEN;
+                if (!pair->value) {
+                    gf_msg("glusterd", GF_LOG_ERROR, 0, LG_MSG_NULL_PTR,
+                           "pair->value is null!");
+                    ret = -1;
+                    goto out;
+                }
+
+                netword = hton32(pair->value->len);
+                memcpy(buf, &netword, sizeof(netword));
+                buf += DICT_DATA_HDR_VAL_LEN;
+
+                /* Key is written with its terminating NUL. */
+                memcpy(buf, pair->key, keylen);
+                buf += keylen;
+                *buf++ = '\0';
+
+                if (pair->value->data) {
+                    memcpy(buf, pair->value->data, pair->value->len);
+                    buf += pair->value->len;
+                }
+
+                pair = pair->next;
+                dict_count--;
+            }
+        }
+    }
+
+out:
+    /* The references held through dict_arr are consumed here. */
+    for (i = 0; i < count; i++) {
+        if (dict_arr[i])
+            dict_unref(dict_arr[i]);
+    }
+    return ret;
+}
+
+/* Allocate a buffer and serialize an array of dictionaries into it as one
+ * merged dictionary.
+ *
+ * @dict_arr: array of 'count' dictionaries; NULL entries are ignored
+ * @buf:      receives the allocated buffer, or NULL if nothing was
+ *            allocated
+ * @length:   optional; receives the size of the buffer
+ *
+ * The references held through dict_arr are always released, whether the
+ * call succeeds or fails: glusterd_dict_searialize() does so on every path,
+ * and the early failure paths here do the same so that ownership does not
+ * depend on where the failure happened.
+ *
+ * Returns 0 on success and a negative value on failure.
+ */
+int
+glusterd_dict_arr_serialize(dict_t *dict_arr[], int count, char **buf,
+                            u_int *length)
+{
+    ssize_t len = 0;
+    ssize_t dict_len = 0;
+    int i = 0;
+    int totcount = 0;
+    int ndicts = 0;
+    int ret = 0;
+
+    *buf = NULL;
+
+    for (i = 0; i < count; i++) {
+        if (!dict_arr[i])
+            continue;
+
+        dict_len = dict_serialized_length_lk(dict_arr[i]);
+        if (dict_len < 0) {
+            /* Adding an error code to the total would corrupt the size. */
+            ret = (int)dict_len;
+            goto err;
+        }
+        len += dict_len;
+        totcount += dict_arr[i]->count;
+        ndicts++;
+    }
+
+    if (ndicts == 0) {
+        ret = -EINVAL;
+        goto err;
+    }
+
+    /* Each per-dictionary length includes its own header but the merged
+     * output carries a single one. Only dictionaries that actually
+     * contributed a length are counted, so NULL slots cannot shrink the
+     * buffer. */
+    len = len - ((ssize_t)(ndicts - 1) * DICT_HDR_LEN);
+    if (len <= 0) {
+        ret = -EINVAL;
+        goto err;
+    }
+
+    *buf = GF_MALLOC(len, gf_common_mt_char);
+    if (*buf == NULL) {
+        ret = -ENOMEM;
+        goto err;
+    }
+
+    if (length != NULL) {
+        *length = len;
+    }
+
+    /* Releases the dict references itself. */
+    return glusterd_dict_searialize(dict_arr, count, totcount, *buf);
+
+err:
+    for (i = 0; i < count; i++) {
+        if (dict_arr[i])
+            dict_unref(dict_arr[i]);
+    }
+    return ret;
+}
+
+/* Export every volume, plus the global options, for a peer handshake.
+ *
+ * Depending on the per-thread volume limit the volumes are either written
+ * straight into 'peer_data' by the calling thread, or split across
+ * detached worker threads that each fill a private dictionary. In the
+ * threaded case the private dictionaries and 'peer_data' are serialized
+ * together into a newly allocated '*buf' of '*length' bytes. In the
+ * single-threaded case 'buf' and 'length' are left untouched.
+ *
+ * Failures inside a worker thread are not reported back to this function.
+ *
+ * Returns 0 on success and non-zero on failure.
+ */
+int32_t
+glusterd_add_volumes_to_export_dict(dict_t *peer_data, char **buf,
+                                    u_int *length)
+{
+    int32_t ret = -1;
+    dict_t *dict_arr[128] = {
+        0,
+    };
+    /* One slot stays reserved for peer_data, appended before serializing. */
+    const int max_threads = (int)(sizeof(dict_arr) / sizeof(dict_arr[0])) - 1;
+    glusterd_conf_t *priv = NULL;
+    glusterd_volinfo_t *volinfo = NULL;
+    int32_t count = 0;
+    glusterd_dict_ctx_t ctx = {0};
+    xlator_t *this = NULL;
+    int totthread = 0;
+    int volcnt = 0;
+    int start = 1;
+    int endindex = 0;
+    int vol_per_thread_limit = 0;
+    glusterd_add_dict_args_t *arg = NULL;
+    pthread_t th_id = {
+        0,
+    };
+    int th_ret = 0;
+    int i = 0;
+    int started = 0;
+    int refs_handed_off = 0;
+    int arg_start = 0;
+    int arg_end = 0;
+
+    this = THIS;
+    GF_ASSERT(this);
+    priv = this->private;
+    GF_ASSERT(priv);
+
+    /* Count the total number of volumes */
+    cds_list_for_each_entry(volinfo, &priv->volumes, vol_list) volcnt++;
+
+    get_gd_vol_thread_limit(&vol_per_thread_limit);
+
+    if ((vol_per_thread_limit == 1) || (vol_per_thread_limit == 0) ||
+        (vol_per_thread_limit > 100)) {
+        totthread = 0;
+    } else {
+        totthread = volcnt / vol_per_thread_limit;
+        if (totthread) {
+            /* A partial last batch needs one more thread. */
+            endindex = volcnt % vol_per_thread_limit;
+            if (endindex)
+                totthread++;
+        }
+    }
+
+    /* dict_arr is fixed-size; rather than write past it, fall back to the
+     * single-threaded path, which has no such limit. */
+    if (totthread > max_threads) {
+        gf_log(this->name, GF_LOG_WARNING,
+               "%d threads needed to populate dict data exceeds the limit"
+               " of %d, populating in a single thread",
+               totthread, max_threads);
+        totthread = 0;
+    }
+
+    if (totthread == 0) {
+        cds_list_for_each_entry(volinfo, &priv->volumes, vol_list)
+        {
+            count++;
+            ret = glusterd_add_volume_to_dict(volinfo, peer_data, count,
+                                              "volume");
+            if (ret)
+                goto out;
+
+            if (!dict_get_sizen(volinfo->dict, VKEY_FEATURES_QUOTA))
+                continue;
+
+            ret = glusterd_vol_add_quota_conf_to_dict(volinfo, peer_data, count,
+                                                      "volume");
+            if (ret)
+                goto out;
+        }
+    } else {
+        for (i = 0; i < totthread; i++) {
+            arg = calloc(1, sizeof(*arg));
+            if (!arg) {
+                gf_log(this->name, GF_LOG_ERROR,
+                       "failed to allocate thread arguments");
+                ret = -1;
+                goto out;
+            }
+            dict_arr[i] = dict_new();
+            if (!dict_arr[i]) {
+                gf_log(this->name, GF_LOG_ERROR,
+                       "failed to allocate a dictionary");
+                free(arg);
+                ret = -1;
+                goto out;
+            }
+            arg->this = this;
+            arg->voldict = dict_arr[i];
+            arg->start = start;
+            if ((i + 1) != totthread) {
+                arg->end = ((i + 1) * vol_per_thread_limit);
+            } else {
+                arg->end = (((i + 1) * vol_per_thread_limit) + endindex);
+            }
+            /* The thread frees arg when it finishes, possibly before
+             * control returns here, so keep copies for the log below. */
+            arg_start = arg->start;
+            arg_end = arg->end;
+
+            /* Account for the thread before it can run and decrement. */
+            GF_ATOMIC_INC(priv->thread_count);
+            th_ret = gf_thread_create_detached(
+                &th_id, glusterd_add_bulk_volumes_create_thread, arg,
+                "bulkvoldict");
+            if (th_ret) {
+                GF_ATOMIC_DEC(priv->thread_count);
+                gf_log(this->name, GF_LOG_ERROR,
+                       "glusterd_add_bulk_volume %s"
+                       " thread creation failed",
+                       "bulkvoldict");
+                free(arg);
+                ret = -1;
+                goto out;
+            }
+            /* Owned by the thread from here on. */
+            arg = NULL;
+            started++;
+
+            start = start + vol_per_thread_limit;
+            gf_log(this->name, GF_LOG_INFO,
+                   "Create thread %d to populate dict data for volume"
+                   " start index is %d end index is %d",
+                   (i + 1), arg_start, arg_end);
+        }
+        while (GF_ATOMIC_GET(priv->thread_count)) {
+            sleep(1);
+        }
+
+        gf_log(this->name, GF_LOG_INFO,
+               "Finished dictionary population in all threads");
+    }
+
+    ret = dict_set_int32n(peer_data, "count", SLEN("count"), volcnt);
+    if (ret)
+        goto out;
+
+    ctx.dict = peer_data;
+    ctx.prefix = "global";
+    ctx.opt_count = 1;
+    ctx.key_name = "key";
+    ctx.val_name = "val";
+    dict_foreach(priv->opts, _add_dict_to_prdict, &ctx);
+    /* opt_count starts at 1, so one less is the number exported. */
+    ctx.opt_count--;
+    ret = dict_set_int32n(peer_data, "global-opt-count",
+                          SLEN("global-opt-count"), ctx.opt_count);
+    if (ret)
+        goto out;
+
+    if (totthread) {
+        gf_log(this->name, GF_LOG_INFO,
+               "Merged multiple dictionaries into a single one");
+        dict_arr[totthread++] = dict_ref(peer_data);
+        /* The serializer consumes the references held in dict_arr. */
+        refs_handed_off = 1;
+        ret = glusterd_dict_arr_serialize(dict_arr, totthread, buf, length);
+        gf_log(this->name, GF_LOG_INFO, "Serialize dictionary data returned %d",
+               ret);
+    }
+
+out:
+    if (!refs_handed_off) {
+        /* Workers write into dict_arr[]; let any that were started finish
+         * before their dictionaries are released. */
+        while (started && GF_ATOMIC_GET(priv->thread_count)) {
+            sleep(1);
+        }
+        for (i = 0; i <= max_threads; i++) {
+            if (dict_arr[i]) {
+                dict_unref(dict_arr[i]);
+                dict_arr[i] = NULL;
+            }
+        }
+    }
+
+    gf_msg_trace(this->name, 0, "Returning %d", ret);
+    return ret;
+}
+
+/* Compare the peer's copy of volume number 'count' with the local one.
+ *
+ * The outcome is stored in '*status':
+ *   - GLUSTERD_VOL_COMP_UPDATE_REQ when the volume is unknown locally and
+ *     the peer is not in the middle of deleting it, or when the peer has a
+ *     newer volume or quota configuration version;
+ *   - GLUSTERD_VOL_COMP_SCS when the local copy is newer or both match;
+ *   - GLUSTERD_VOL_COMP_RJT when versions match but checksums differ.
+ * '*status' is left as the caller set it if the comparison stops early on
+ * a lookup failure.
+ *
+ * "volume<count>.update" is set in peer_data to 1 when an update is
+ * required and to 0 otherwise. The return value is the result of storing
+ * that key, so earlier lookup failures are not reflected in it.
+ */
+static int32_t
+glusterd_compare_friend_volume(dict_t *peer_data, int32_t count,
+                               int32_t *status, char *hostname)
+{
+    int32_t ret = -1;
+    char key[64] = "";
+    char key_prefix[32];
+    int keylen;
+    glusterd_volinfo_t *volinfo = NULL;
+    char *volname = NULL;
+    uint32_t cksum = 0;
+    uint32_t quota_cksum = 0;
+    uint32_t quota_version = 0;
+    uint32_t stage_deleted = 0;
+    int32_t version = 0;
+    xlator_t *this = NULL;
+
+    GF_ASSERT(peer_data);
+    GF_ASSERT(status);
+
+    this = THIS;
+    GF_ASSERT(this);
+
+    snprintf(key_prefix, sizeof(key_prefix), "volume%d", count);
+    keylen = snprintf(key, sizeof(key), "%s.name", key_prefix);
+    ret = dict_get_strn(peer_data, key, keylen, &volname);
+    if (ret) {
+        gf_smsg(this->name, GF_LOG_ERROR, errno, GD_MSG_DICT_GET_FAILED,
+                "Key=%s", key, NULL);
+        goto out;
+    }
+
+    ret = glusterd_volinfo_find(volname, &volinfo);
+    if (ret) {
+        snprintf(key, sizeof(key), "%s.stage_deleted", key_prefix);
+        ret = dict_get_uint32(peer_data, key, &stage_deleted);
+        /* stage_deleted = 1 means the volume is still in the process of
+         * deleting a volume, so we shouldn't be trying to create a
+         * fresh volume here which would lead to a stale entry
+         */
+        if (!ret && stage_deleted == 0)
+            *status = GLUSTERD_VOL_COMP_UPDATE_REQ;
+        goto out;
+    }
+
+    keylen = snprintf(key, sizeof(key), "%s.version", key_prefix);
+    ret = dict_get_int32n(peer_data, key, keylen, &version);
+    if (ret) {
+        gf_smsg(this->name, GF_LOG_ERROR, errno, GD_MSG_DICT_GET_FAILED,
+                "Key=%s", key, NULL);
+        goto out;
+    }
+
+    if (version > volinfo->version) {
+        // Mismatch detected
+        gf_msg(this->name, GF_LOG_INFO, 0, GD_MSG_VOL_VERS_MISMATCH,
+               "Version of volume %s differ. local version = %d, "
+               "remote version = %d on peer %s",
+               volinfo->volname, volinfo->version, version, hostname);
+        GF_ATOMIC_INIT(volinfo->volpeerupdate, 1);
+        *status = GLUSTERD_VOL_COMP_UPDATE_REQ;
+        goto out;
+    } else if (version < volinfo->version) {
+        /* Local copy is newer; nothing to take from the peer. */
+        *status = GLUSTERD_VOL_COMP_SCS;
+        goto out;
+    }
+
+    // Now, versions are same, compare cksums.
+    //
+    snprintf(key, sizeof(key), "%s.ckusm", key_prefix);
+    ret = dict_get_uint32(peer_data, key, &cksum);
+    if (ret) {
+        gf_smsg(this->name, GF_LOG_ERROR, errno, GD_MSG_DICT_GET_FAILED,
+                "Key=%s", key, NULL);
+        goto out;
+    }
+
+    if (cksum != volinfo->cksum) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_CKSUM_VERS_MISMATCH,
+               "Version of Cksums %s differ. local cksum = %u, remote "
+               "cksum = %u on peer %s",
+               volinfo->volname, volinfo->cksum, cksum, hostname);
+        *status = GLUSTERD_VOL_COMP_RJT;
+        goto out;
+    }
+
+    /* Quota data is only compared when the quota key is set locally. */
+    if (!dict_get_sizen(volinfo->dict, VKEY_FEATURES_QUOTA))
+        goto skip_quota;
+
+    snprintf(key, sizeof(key), "%s.quota-version", key_prefix);
+    ret = dict_get_uint32(peer_data, key, &quota_version);
+    if (ret) {
+        /* A missing key is tolerated and only noted at debug level. */
+        gf_msg_debug(this->name, 0,
+                     "quota-version key absent for"
+                     " volume %s in peer %s's response",
+                     volinfo->volname, hostname);
+    } else {
+        if (quota_version > volinfo->quota_conf_version) {
+            // Mismatch detected
+            gf_msg(this->name, GF_LOG_INFO, 0,
+                   GD_MSG_QUOTA_CONFIG_VERS_MISMATCH,
+                   "Quota configuration versions of volume %s "
+                   "differ. local version = %d, remote version = "
+                   "%d on peer %s",
+                   volinfo->volname, volinfo->quota_conf_version, quota_version,
+                   hostname);
+            *status = GLUSTERD_VOL_COMP_UPDATE_REQ;
+            goto out;
+        } else if (quota_version < volinfo->quota_conf_version) {
+            *status = GLUSTERD_VOL_COMP_SCS;
+            goto out;
+        }
+    }
+
+    // Now, versions are same, compare cksums.
+    //
+    snprintf(key, sizeof(key), "%s.quota-cksum", key_prefix);
+    ret = dict_get_uint32(peer_data, key, &quota_cksum);
+    if (ret) {
+        gf_msg_debug(this->name, 0,
+                     "quota checksum absent for "
+                     "volume %s in peer %s's response",
+                     volinfo->volname, hostname);
+    } else {
+        if (quota_cksum != volinfo->quota_conf_cksum) {
+            gf_msg(this->name, GF_LOG_ERROR, 0,
+                   GD_MSG_QUOTA_CONFIG_CKSUM_MISMATCH,
+                   "Cksums of "
+                   "quota configuration of volume %s differ. local"
+                   " cksum = %u, remote  cksum = %u on peer %s",
+                   volinfo->volname, volinfo->quota_conf_cksum, quota_cksum,
+                   hostname);
+            *status = GLUSTERD_VOL_COMP_RJT;
+            goto out;
+        }
+    }
+
+skip_quota:
+    *status = GLUSTERD_VOL_COMP_SCS;
+
+out:
+    keylen = snprintf(key, sizeof(key), "%s.update", key_prefix);
+
+    if (*status == GLUSTERD_VOL_COMP_UPDATE_REQ) {
+        ret = dict_set_int32n(peer_data, key, keylen, 1);
+    } else {
+        ret = dict_set_int32n(peer_data, key, keylen, 0);
+    }
+    /* volinfo is still NULL when the comparison stopped before or at the
+     * local lookup; '*status' then holds whatever the caller passed in, so
+     * it cannot be trusted to imply a valid volinfo. */
+    if (*status == GLUSTERD_VOL_COMP_RJT && volinfo) {
+        gf_event(EVENT_COMPARE_FRIEND_VOLUME_FAILED, "volume=%s",
+                 volinfo->volname);
+    }
+    gf_msg_debug(this->name, 0, "Returning with ret: %d, status: %d", ret,
+                 *status);
+    return ret;
+}
+
+/* Rebuild a dictionary from its flattened form in peer_data.
+ *
+ * For i in 1..opt_count the option name is read from
+ * "<prefix>.<key_prefix><i>" and its value from
+ * "<prefix>.<value_prefix><i>"; a copy of the value is stored in dst_dict
+ * under the option name.
+ *
+ * Returns 0 on success (including opt_count <= 0) and non-zero on the
+ * first failure, which is also logged.
+ */
+static int32_t
+import_prdict_dict(dict_t *peer_data, dict_t *dst_dict, char *key_prefix,
+                   char *value_prefix, int opt_count, char *prefix)
+{
+    char key[512] = "";
+    int keylen;
+    int32_t ret = 0;
+    int i = 1;
+    char *opt_key = NULL;
+    char *opt_val = NULL;
+    char *dup_opt_val = NULL;
+    char msg[2048] = "";
+
+    while (i <= opt_count) {
+        keylen = snprintf(key, sizeof(key), "%s.%s%d", prefix, key_prefix, i);
+        ret = dict_get_strn(peer_data, key, keylen, &opt_key);
+        if (ret) {
+            snprintf(msg, sizeof(msg),
+                     "Volume dict key not "
+                     "specified");
+            goto out;
+        }
+
+        keylen = snprintf(key, sizeof(key), "%s.%s%d", prefix, value_prefix, i);
+        ret = dict_get_strn(peer_data, key, keylen, &opt_val);
+        if (ret) {
+            snprintf(msg, sizeof(msg),
+                     "Volume dict value not "
+                     "specified");
+            goto out;
+        }
+        /* dst_dict needs its own copy of the value. */
+        dup_opt_val = gf_strdup(opt_val);
+        if (!dup_opt_val) {
+            snprintf(msg, sizeof(msg), "Failed to duplicate value of %s",
+                     opt_key);
+            ret = -1;
+            goto out;
+        }
+        ret = dict_set_dynstr(dst_dict, opt_key, dup_opt_val);
+        if (ret) {
+            /* dup_opt_val was handed over to dict_set_dynstr() and may
+             * already have been released when it fails (its implementation
+             * is not visible here), so report the value through opt_val,
+             * which peer_data still holds and which has the same content. */
+            snprintf(msg, sizeof(msg),
+                     "Volume set %s %s "
+                     "unsuccessful",
+                     opt_key, opt_val);
+            goto out;
+        }
+        i++;
+    }
+
+out:
+    if (msg[0])
+        gf_msg("glusterd", GF_LOG_ERROR, 0, GD_MSG_IMPORT_PRDICT_DICT, "%s",
+               msg);
+    gf_msg_debug("glusterd", 0, "Returning with %d", ret);
+    return ret;
+}
+
+/* Restart bricks, geo-replication daemons, rebalance and the per-volume
+ * helper services.
+ *
+ * 'opaque' is not used. Only the result of the self-heal daemon restart is
+ * returned; the results of the snapd and gfproxyd restarts are discarded.
+ */
+int
+glusterd_spawn_daemons(void *opaque)
+{
+    glusterd_conf_t *priv = NULL;
+
+    priv = THIS->private;
+
+    /* glusterd_restart_bricks() will take the sync_lock. */
+    glusterd_restart_bricks(NULL);
+    glusterd_restart_gsyncds(priv);
+    glusterd_restart_rebalance(priv);
+
+    (void)glusterd_snapdsvc_restart();
+    (void)glusterd_gfproxydsvc_restart();
+
+    return glusterd_shdsvc_restart();
+}
+
+/* Import a volume's options and geo-replication slaves from peer_data.
+ *
+ * Both sets are stored flattened under "<prefix><count>": the number of
+ * entries is read from "opt-count" / "gsync-count" and the entries are
+ * then copied into volinfo->dict and volinfo->gsync_slaves respectively.
+ *
+ * Returns 0 on success and non-zero on the first failure, which is logged.
+ */
+static int32_t
+glusterd_import_friend_volume_opts(dict_t *peer_data, int count,
+                                   glusterd_volinfo_t *volinfo, char *prefix)
+{
+    char vol_ns[32];
+    char lookup[64];
+    char errstr[2048] = "";
+    int lookup_len;
+    int nentries = 0;
+    int32_t ret = -1;
+
+    GF_ASSERT(peer_data);
+    GF_ASSERT(volinfo);
+
+    snprintf(vol_ns, sizeof(vol_ns), "%s%d", prefix, count);
+
+    /* Volume options. */
+    lookup_len = snprintf(lookup, sizeof(lookup), "%s.opt-count", vol_ns);
+    ret = dict_get_int32n(peer_data, lookup, lookup_len, &nentries);
+    if (ret) {
+        snprintf(errstr, sizeof(errstr),
+                 "Volume option count not specified for %s", volinfo->volname);
+        goto out;
+    }
+
+    ret = import_prdict_dict(peer_data, volinfo->dict, "key", "value",
+                             nentries, vol_ns);
+    if (ret) {
+        snprintf(errstr, sizeof(errstr),
+                 "Unable to import options dict specified for %s",
+                 volinfo->volname);
+        goto out;
+    }
+
+    /* Geo-replication slaves. */
+    lookup_len = snprintf(lookup, sizeof(lookup), "%s.gsync-count", vol_ns);
+    ret = dict_get_int32n(peer_data, lookup, lookup_len, &nentries);
+    if (ret) {
+        snprintf(errstr, sizeof(errstr), "Gsync count not specified for %s",
+                 volinfo->volname);
+        goto out;
+    }
+
+    ret = import_prdict_dict(peer_data, volinfo->gsync_slaves, "slave-num",
+                             "slave-val", nentries, vol_ns);
+    if (ret) {
+        snprintf(errstr, sizeof(errstr),
+                 "Unable to import gsync sessions specified for %s",
+                 volinfo->volname);
+    }
+
+out:
+    if (errstr[0])
+        gf_msg("glusterd", GF_LOG_ERROR, 0, GD_MSG_VOL_OPTS_IMPORT_FAIL, "%s",
+               errstr);
+    gf_msg_debug("glusterd", 0, "Returning with %d", ret);
+    return ret;
+}
+
+/* Build a thin-arbiter brick object from the data a peer sent.
+ *
+ * The brick is read from the keys under
+ * "<prefix><vol_count>.ta-brick<brick_count>". "hostname", "path" and
+ * "uuid" are required; "brick_id" and "decommissioned" are optional.
+ *
+ * On success '*ta_brickinfo' points to the newly allocated brick and 0 is
+ * returned. On failure a non-zero value is returned and '*ta_brickinfo' is
+ * not modified.
+ */
+static int32_t
+glusterd_import_new_ta_brick(dict_t *peer_data, int32_t vol_count,
+                             int32_t brick_count,
+                             glusterd_brickinfo_t **ta_brickinfo, char *prefix)
+{
+    char key[128];
+    char key_prefix[64];
+    int keylen;
+    int ret = -1;
+    char *hostname = NULL;
+    char *path = NULL;
+    char *brick_id = NULL;
+    int decommissioned = 0;
+    glusterd_brickinfo_t *new_ta_brickinfo = NULL;
+    char msg[256] = "";
+    char *brick_uuid_str = NULL;
+
+    GF_ASSERT(peer_data);
+    GF_ASSERT(vol_count >= 0);
+    GF_ASSERT(ta_brickinfo);
+    GF_ASSERT(prefix);
+
+    ret = snprintf(key_prefix, sizeof(key_prefix), "%s%d.ta-brick%d", prefix,
+                   vol_count, brick_count);
+
+    if (ret < 0 || ret >= sizeof(key_prefix)) {
+        ret = -1;
+        snprintf(msg, sizeof(msg), "key_prefix too long");
+        goto out;
+    }
+
+    keylen = snprintf(key, sizeof(key), "%s.hostname", key_prefix);
+    ret = dict_get_strn(peer_data, key, keylen, &hostname);
+    if (ret) {
+        snprintf(msg, sizeof(msg), "%s missing in payload", key);
+        goto out;
+    }
+
+    keylen = snprintf(key, sizeof(key), "%s.path", key_prefix);
+    ret = dict_get_strn(peer_data, key, keylen, &path);
+    if (ret) {
+        snprintf(msg, sizeof(msg), "%s missing in payload", key);
+        goto out;
+    }
+
+    /* brick_id is optional: the result is ignored and brick_id simply
+     * stays NULL when the key is absent. */
+    keylen = snprintf(key, sizeof(key), "%s.brick_id", key_prefix);
+    ret = dict_get_strn(peer_data, key, keylen, &brick_id);
+
+    keylen = snprintf(key, sizeof(key), "%s.decommissioned", key_prefix);
+    ret = dict_get_int32n(peer_data, key, keylen, &decommissioned);
+    if (ret) {
+        /* For backward compatibility */
+        ret = 0;
+    }
+
+    ret = glusterd_brickinfo_new(&new_ta_brickinfo);
+    if (ret)
+        goto out;
+
+    /* NOTE(review): the failure paths below return without releasing
+     * new_ta_brickinfo; confirm the right destructor and free it there. */
+    ret = snprintf(new_ta_brickinfo->path, sizeof(new_ta_brickinfo->path), "%s",
+                   path);
+    if (ret < 0 || ret >= sizeof(new_ta_brickinfo->path)) {
+        ret = -1;
+        goto out;
+    }
+    ret = snprintf(new_ta_brickinfo->hostname,
+                   sizeof(new_ta_brickinfo->hostname), "%s", hostname);
+    if (ret < 0 || ret >= sizeof(new_ta_brickinfo->hostname)) {
+        ret = -1;
+        goto out;
+    }
+    new_ta_brickinfo->decommissioned = decommissioned;
+    if (brick_id)
+        (void)snprintf(new_ta_brickinfo->brick_id,
+                       sizeof(new_ta_brickinfo->brick_id), "%s", brick_id);
+    keylen = snprintf(key, sizeof(key), "%s.uuid", key_prefix);
+    ret = dict_get_strn(peer_data, key, keylen, &brick_uuid_str);
+    if (ret)
+        goto out;
+    gf_uuid_parse(brick_uuid_str, new_ta_brickinfo->uuid);
+
+    *ta_brickinfo = new_ta_brickinfo;
+
+out:
+    if (msg[0]) {
+        gf_msg("glusterd", GF_LOG_ERROR, 0, GD_MSG_BRICK_IMPORT_FAIL, "%s",
+               msg);
+        /* msg is only ever set before the brick object is allocated, so
+         * new_ta_brickinfo is NULL here. Report what was read from the
+         * payload instead; either value may be missing. */
+        gf_event(EVENT_IMPORT_BRICK_FAILED, "peer=%s;ta-brick=%s",
+                 hostname ? hostname : "unknown", path ? path : "unknown");
+    }
+    gf_msg_debug("glusterd", 0, "Returning with %d", ret);
+    return ret;
+}
+
+/* Create a brickinfo for one regular brick out of a peer's payload.
+ *
+ * prefix names the kind of volume being imported: "volume" for normal
+ * volumes, or snap# (snap1, snap2, ...) for snapshot volumes. The brick's
+ * keys live under "<prefix><vol_count>.brick<brick_count>".
+ *
+ * hostname, path and uuid must be present in the payload; brick_id and
+ * decommissioned may be missing. On success *brickinfo receives the new
+ * brickinfo and 0 is returned, otherwise a non-zero value is returned.
+ */
+static int32_t
+glusterd_import_new_brick(dict_t *peer_data, int32_t vol_count,
+                          int32_t brick_count, glusterd_brickinfo_t **brickinfo,
+                          char *prefix)
+{
+    glusterd_brickinfo_t *imported = NULL;
+    char *peer_host = NULL;
+    char *brick_path = NULL;
+    char *id_str = NULL;
+    char *uuid_str = NULL;
+    char errmsg[256] = "";
+    char base[64];
+    char lookup[128];
+    int lookup_len;
+    int removed = 0;
+    int rc = -1;
+
+    GF_ASSERT(peer_data);
+    GF_ASSERT(vol_count >= 0);
+    GF_ASSERT(brickinfo);
+    GF_ASSERT(prefix);
+
+    rc = snprintf(base, sizeof(base), "%s%d.brick%d", prefix, vol_count,
+                  brick_count);
+    if (rc < 0 || rc >= sizeof(base)) {
+        snprintf(errmsg, sizeof(errmsg), "key_prefix too long");
+        rc = -1;
+        goto done;
+    }
+
+    /* Mandatory string fields. */
+    lookup_len = snprintf(lookup, sizeof(lookup), "%s.hostname", base);
+    rc = dict_get_strn(peer_data, lookup, lookup_len, &peer_host);
+    if (rc != 0) {
+        snprintf(errmsg, sizeof(errmsg), "%s missing in payload", lookup);
+        goto done;
+    }
+
+    lookup_len = snprintf(lookup, sizeof(lookup), "%s.path", base);
+    rc = dict_get_strn(peer_data, lookup, lookup_len, &brick_path);
+    if (rc != 0) {
+        snprintf(errmsg, sizeof(errmsg), "%s missing in payload", lookup);
+        goto done;
+    }
+
+    /* Optional fields: a failed lookup just leaves the default in place. */
+    lookup_len = snprintf(lookup, sizeof(lookup), "%s.brick_id", base);
+    (void)dict_get_strn(peer_data, lookup, lookup_len, &id_str);
+
+    /* Missing on some peers; ignored for backward compatibility. */
+    lookup_len = snprintf(lookup, sizeof(lookup), "%s.decommissioned", base);
+    (void)dict_get_int32n(peer_data, lookup, lookup_len, &removed);
+
+    rc = glusterd_brickinfo_new(&imported);
+    if (rc != 0)
+        goto done;
+
+    /* A value that does not fit its destination is treated as an error. */
+    rc = snprintf(imported->path, sizeof(imported->path), "%s", brick_path);
+    if (rc < 0 || rc >= sizeof(imported->path)) {
+        rc = -1;
+        goto done;
+    }
+
+    rc = snprintf(imported->hostname, sizeof(imported->hostname), "%s",
+                  peer_host);
+    if (rc < 0 || rc >= sizeof(imported->hostname)) {
+        rc = -1;
+        goto done;
+    }
+
+    imported->decommissioned = removed;
+
+    if (id_str != NULL) {
+        (void)snprintf(imported->brick_id, sizeof(imported->brick_id), "%s",
+                       id_str);
+    }
+
+    rc = gd_import_new_brick_snap_details(peer_data, base, imported);
+    if (rc != 0)
+        goto done;
+
+    lookup_len = snprintf(lookup, sizeof(lookup), "%s.uuid", base);
+    rc = dict_get_strn(peer_data, lookup, lookup_len, &uuid_str);
+    if (rc != 0)
+        goto done;
+    gf_uuid_parse(uuid_str, imported->uuid);
+
+    *brickinfo = imported;
+
+done:
+    if (errmsg[0] != '\0') {
+        gf_msg("glusterd", GF_LOG_ERROR, 0, GD_MSG_BRICK_IMPORT_FAIL, "%s",
+               errmsg);
+        if (imported != NULL) {
+            gf_event(EVENT_IMPORT_BRICK_FAILED, "peer=%s;brick=%s",
+                     imported->hostname, imported->path);
+        }
+    }
+    gf_msg_debug("glusterd", 0, "Returning with %d", rc);
+    return rc;
+}
+
+/* Import every brick (and, if configured, every thin-arbiter brick) of
+ * new_volinfo from a peer's payload and append them to the volume's lists.
+ *
+ * prefix names the kind of volume being imported: "volume" for normal
+ * volumes, or snap# (snap1, snap2, ...) for snapshot volumes.
+ *
+ * Returns 0 when all bricks were imported, otherwise the error of the first
+ * brick that failed.
+ */
+static int32_t
+glusterd_import_bricks(dict_t *peer_data, int32_t vol_count,
+                       glusterd_volinfo_t *new_volinfo, char *prefix)
+{
+    glusterd_brickinfo_t *brick = NULL;
+    glusterd_brickinfo_t *ta_brick = NULL;
+    int next_brickid = 0;
+    int idx;
+    int rc = -1;
+
+    GF_ASSERT(peer_data);
+    GF_ASSERT(vol_count >= 0);
+    GF_ASSERT(new_volinfo);
+    GF_ASSERT(prefix);
+
+    /* Brick indices in the payload are 1-based. */
+    for (idx = 1; idx <= new_volinfo->brick_count; idx++) {
+        rc = glusterd_import_new_brick(peer_data, vol_count, idx, &brick,
+                                       prefix);
+        if (rc)
+            goto done;
+
+        if (brick->brick_id[0] == '\0') {
+            /* We were probed from a peer having op-version
+             * less than GD_OP_VER_PERSISTENT_AFR_XATTRS
+             */
+            GLUSTERD_ASSIGN_BRICKID_TO_BRICKINFO(brick, new_volinfo,
+                                                 next_brickid++);
+        }
+
+        cds_list_add_tail(&brick->brick_list, &new_volinfo->bricks);
+    }
+
+    if (new_volinfo->thin_arbiter_count != 1) {
+        rc = 0;
+        goto done;
+    }
+
+    /* One thin-arbiter brick entry is imported per subvolume. */
+    for (idx = 1; idx <= new_volinfo->subvol_count; idx++) {
+        rc = glusterd_import_new_ta_brick(peer_data, vol_count, idx, &ta_brick,
+                                          prefix);
+        if (rc)
+            goto done;
+
+        cds_list_add_tail(&ta_brick->brick_list, &new_volinfo->ta_bricks);
+    }
+
+    rc = 0;
+
+done:
+    gf_msg_debug("glusterd", 0, "Returning with %d", rc);
+    return rc;
+}
+
+/* Rebuild the quota.conf of new_volinfo from the payload sent by a peer.
+ *
+ * The prefix represents the type of volume to be added.
+ * It will be "volume" for normal volumes, and snap# like
+ * snap1, snap2, for snapshot volumes
+ *
+ * When quota is not enabled on the volume the local quota store is cleaned
+ * up and 0 is returned. Otherwise the gfid list found under
+ * "<prefix><vol_idx>.gfid<N>" is written to a temporary file which is then
+ * renamed over quota.conf, and the checksum/version are recomputed and
+ * stored.
+ *
+ * Returns 0 on success. On failure a non-zero value is returned; if the
+ * temporary file had been created it is unlinked and the quota.conf store
+ * handle of the volume is destroyed.
+ */
+int
+glusterd_import_quota_conf(dict_t *peer_data, int vol_idx,
+                           glusterd_volinfo_t *new_volinfo, char *prefix)
+{
+    int gfid_idx = 0;
+    int gfid_count = 0;
+    int ret = -1;
+    int fd = -1;
+    char key[128];
+    char key_prefix[64];
+    int keylen;
+    char *gfid_str = NULL;
+    uuid_t gfid = {
+        0,
+    };
+    xlator_t *this = NULL;
+    int8_t gfid_type = 0;
+
+    this = THIS;
+    GF_ASSERT(this);
+    GF_ASSERT(peer_data);
+    GF_ASSERT(prefix);
+
+    if (!glusterd_is_volume_quota_enabled(new_volinfo)) {
+        (void)glusterd_clean_up_quota_store(new_volinfo);
+        return 0;
+    }
+
+    ret = glusterd_store_create_quota_conf_sh_on_absence(new_volinfo);
+    if (ret)
+        goto out;
+
+    fd = gf_store_mkstemp(new_volinfo->quota_conf_shandle);
+    if (fd < 0) {
+        ret = -1;
+        goto out;
+    }
+
+    ret = snprintf(key_prefix, sizeof(key_prefix), "%s%d", prefix, vol_idx);
+    if (ret < 0 || ret >= sizeof(key_prefix)) {
+        ret = -1;
+        gf_msg_debug(this->name, 0, "Failed to set key_prefix for quota conf");
+        goto out;
+    }
+
+    /* Checksum and version are optional in the payload: a miss is only
+     * logged at debug level.
+     */
+    snprintf(key, sizeof(key), "%s.quota-cksum", key_prefix);
+    ret = dict_get_uint32(peer_data, key, &new_volinfo->quota_conf_cksum);
+    if (ret)
+        gf_msg_debug(this->name, 0, "Failed to get quota cksum");
+
+    snprintf(key, sizeof(key), "%s.quota-version", key_prefix);
+    ret = dict_get_uint32(peer_data, key, &new_volinfo->quota_conf_version);
+    if (ret)
+        gf_msg_debug(this->name, 0,
+                     "Failed to get quota "
+                     "version");
+
+    keylen = snprintf(key, sizeof(key), "%s.gfid-count", key_prefix);
+    ret = dict_get_int32n(peer_data, key, keylen, &gfid_count);
+    if (ret) {
+        gf_smsg(this->name, GF_LOG_ERROR, errno, GD_MSG_DICT_GET_FAILED,
+                "Key=%s", key, NULL);
+        goto out;
+    }
+
+    ret = glusterd_quota_conf_write_header(fd);
+    if (ret)
+        goto out;
+
+    for (gfid_idx = 0; gfid_idx < gfid_count; gfid_idx++) {
+        keylen = snprintf(key, sizeof(key) - 1, "%s.gfid%d", key_prefix,
+                          gfid_idx);
+        ret = dict_get_strn(peer_data, key, keylen, &gfid_str);
+        if (ret) {
+            gf_smsg(this->name, GF_LOG_ERROR, errno, GD_MSG_DICT_GET_FAILED,
+                    "Key=%s", key, NULL);
+            goto out;
+        }
+
+        /* Peers that do not send a type get the usage type by default. */
+        snprintf(key, sizeof(key) - 1, "%s.gfid-type%d", key_prefix, gfid_idx);
+        ret = dict_get_int8(peer_data, key, &gfid_type);
+        if (ret)
+            gfid_type = GF_QUOTA_CONF_TYPE_USAGE;
+
+        gf_uuid_parse(gfid_str, gfid);
+        ret = glusterd_quota_conf_write_gfid(fd, gfid, (char)gfid_type);
+        if (ret < 0) {
+            gf_msg(this->name, GF_LOG_CRITICAL, errno,
+                   GD_MSG_QUOTA_CONF_WRITE_FAIL,
+                   "Unable to write "
+                   "gfid %s into quota.conf for %s",
+                   gfid_str, new_volinfo->volname);
+            ret = -1;
+            goto out;
+        }
+    }
+
+    /* A failed rename means quota.conf was not replaced, so it must not be
+     * reported as success.
+     */
+    ret = gf_store_rename_tmppath(new_volinfo->quota_conf_shandle);
+    if (ret) {
+        gf_msg(this->name, GF_LOG_ERROR, errno, GD_MSG_QUOTA_CONF_WRITE_FAIL,
+               "Unable to rename temporary quota.conf for %s",
+               new_volinfo->volname);
+        goto out;
+    }
+
+    ret = 0;
+
+out:
+    if (!ret) {
+        ret = glusterd_compute_cksum(new_volinfo, _gf_true);
+        if (ret) {
+            gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_CKSUM_COMPUTE_FAIL,
+                   "Failed to compute checksum");
+            goto clear_quota_conf;
+        }
+
+        ret = glusterd_store_save_quota_version_and_cksum(new_volinfo);
+        if (ret)
+            gf_msg(this->name, GF_LOG_ERROR, 0,
+                   GD_MSG_QUOTA_CKSUM_VER_STORE_FAIL,
+                   "Failed to save quota version and checksum");
+    }
+
+clear_quota_conf:
+    /* fd == 0 is a valid descriptor; only a negative fd means that the
+     * temporary file was never created.
+     */
+    if (ret && (fd >= 0)) {
+        gf_store_unlink_tmppath(new_volinfo->quota_conf_shandle);
+        (void)gf_store_handle_destroy(new_volinfo->quota_conf_shandle);
+        new_volinfo->quota_conf_shandle = NULL;
+    }
+
+    return ret;
+}
+
+/* Import the rebalance dictionary of volume number count from a peer's
+ * payload into volinfo->rebal.dict.
+ *
+ * The number of entries is read from "volume<count>.rebal-dict-count".
+ * A missing count key is not an error (older peers do not send the dict):
+ * 0 is returned and volinfo->rebal.dict is left as it was.
+ *
+ * Returns 0 on success, non-zero on failure. On failure the dict created
+ * here is released and volinfo->rebal.dict is reset to NULL.
+ */
+int
+gd_import_friend_volume_rebal_dict(dict_t *dict, int count,
+                                   glusterd_volinfo_t *volinfo)
+{
+    int ret = -1;
+    int keylen = 0;
+    char key[64] = "";
+    int dict_count = 0;
+    char key_prefix[32];
+
+    GF_ASSERT(dict);
+    GF_ASSERT(volinfo);
+    xlator_t *this = THIS;
+    GF_ASSERT(this);
+
+    snprintf(key_prefix, sizeof(key_prefix), "volume%d", count);
+    keylen = snprintf(key, sizeof(key), "%s.rebal-dict-count", key_prefix);
+    ret = dict_get_int32n(dict, key, keylen, &dict_count);
+    if (ret) {
+        /* Older peers will not have this dict */
+        gf_smsg(this->name, GF_LOG_INFO, errno, GD_MSG_DICT_GET_FAILED,
+                "Key=%s", key, NULL);
+        ret = 0;
+        goto out;
+    }
+
+    volinfo->rebal.dict = dict_new();
+    if (!volinfo->rebal.dict) {
+        gf_smsg(this->name, GF_LOG_ERROR, errno, GD_MSG_DICT_CREATE_FAIL, NULL);
+        ret = -1;
+        goto out;
+    }
+
+    ret = import_prdict_dict(dict, volinfo->rebal.dict, "rebal-dict-key",
+                             "rebal-dict-value", dict_count, key_prefix);
+out:
+    if (ret && volinfo->rebal.dict) {
+        dict_unref(volinfo->rebal.dict);
+        /* Do not leave a dangling pointer behind: whoever tears the
+         * volinfo down later would otherwise unref the dict again.
+         */
+        volinfo->rebal.dict = NULL;
+    }
+    gf_msg_debug(this->name, 0, "Returning with %d", ret);
+    return ret;
+}
+
+/* Build a volinfo for volume number count out of the payload sent by a peer.
+ *
+ * The prefix represents the type of volume to be added.
+ * It will be "volume" for normal volumes, and snap# like
+ * snap1, snap2, for snapshot volumes
+ *
+ * All keys are looked up under "<prefix><count>.". Keys that older peers do
+ * not send are tolerated where noted below; every other missing key fails
+ * the import.
+ *
+ * Returns 0 and stores the new volinfo in *volinfo on success. When the
+ * payload marks the volume as stage_deleted the function returns without
+ * creating a volinfo and *volinfo is left untouched. On failure a non-zero
+ * value is returned and *volinfo is left untouched.
+ */
+int32_t
+glusterd_import_volinfo(dict_t *peer_data, int count,
+                        glusterd_volinfo_t **volinfo, char *prefix)
+{
+    int ret = -1;
+    char key[64] = "";
+    char key_prefix[32];
+    int keylen;
+    char *parent_volname = NULL;
+    char *volname = NULL;
+    glusterd_volinfo_t *new_volinfo = NULL;
+    char *volume_id_str = NULL;
+    char msg[2048] = "";
+    char *str = NULL;
+    char *rebalance_id_str = NULL;
+    int op_version = 0;
+    int client_op_version = 0;
+    uint32_t stage_deleted = 0;
+
+    GF_ASSERT(peer_data);
+    GF_ASSERT(volinfo);
+    GF_ASSERT(prefix);
+
+    ret = snprintf(key_prefix, sizeof(key_prefix), "%s%d", prefix, count);
+    if (ret < 0 || ret >= sizeof(key_prefix)) {
+        ret = -1;
+        snprintf(msg, sizeof(msg), "key_prefix too big");
+        goto out;
+    }
+
+    keylen = snprintf(key, sizeof(key), "%s.name", key_prefix);
+    ret = dict_get_strn(peer_data, key, keylen, &volname);
+    if (ret) {
+        snprintf(msg, sizeof(msg), "%s missing in payload", key);
+        goto out;
+    }
+
+    snprintf(key, sizeof(key), "%s.stage_deleted", key_prefix);
+    ret = dict_get_uint32(peer_data, key, &stage_deleted);
+    /* stage_deleted = 1 means the volume is still in the process of
+     * deleting a volume, so we shouldn't be trying to create a
+     * fresh volume here which would lead to a stale entry
+     */
+    if (stage_deleted) {
+        goto out;
+    }
+
+    ret = glusterd_volinfo_new(&new_volinfo);
+    if (ret)
+        goto out;
+    ret = snprintf(new_volinfo->volname, sizeof(new_volinfo->volname), "%s",
+                   volname);
+    if (ret < 0 || ret >= sizeof(new_volinfo->volname)) {
+        ret = -1;
+        goto out;
+    }
+    keylen = snprintf(key, sizeof(key), "%s.type", key_prefix);
+    ret = dict_get_int32n(peer_data, key, keylen, &new_volinfo->type);
+    if (ret) {
+        snprintf(msg, sizeof(msg), "%s missing in payload for %s", key,
+                 volname);
+        goto out;
+    }
+
+    /* parent_volname is optional; when present it must fit its own buffer. */
+    keylen = snprintf(key, sizeof(key), "%s.parent_volname", key_prefix);
+    ret = dict_get_strn(peer_data, key, keylen, &parent_volname);
+    if (!ret) {
+        ret = snprintf(new_volinfo->parent_volname,
+                       sizeof(new_volinfo->parent_volname), "%s",
+                       parent_volname);
+        if (ret < 0 || ret >= sizeof(new_volinfo->parent_volname)) {
+            ret = -1;
+            goto out;
+        }
+    }
+    keylen = snprintf(key, sizeof(key), "%s.brick_count", key_prefix);
+    ret = dict_get_int32n(peer_data, key, keylen, &new_volinfo->brick_count);
+    if (ret) {
+        snprintf(msg, sizeof(msg), "%s missing in payload for %s", key,
+                 volname);
+        goto out;
+    }
+
+    keylen = snprintf(key, sizeof(key), "%s.version", key_prefix);
+    ret = dict_get_int32n(peer_data, key, keylen, &new_volinfo->version);
+    if (ret) {
+        snprintf(msg, sizeof(msg), "%s missing in payload for %s", key,
+                 volname);
+        goto out;
+    }
+
+    keylen = snprintf(key, sizeof(key), "%s.status", key_prefix);
+    ret = dict_get_int32n(peer_data, key, keylen,
+                          (int32_t *)&new_volinfo->status);
+    if (ret) {
+        snprintf(msg, sizeof(msg), "%s missing in payload for %s", key,
+                 volname);
+        goto out;
+    }
+
+    keylen = snprintf(key, sizeof(key), "%s.sub_count", key_prefix);
+    ret = dict_get_int32n(peer_data, key, keylen, &new_volinfo->sub_count);
+    if (ret) {
+        snprintf(msg, sizeof(msg), "%s missing in payload for %s", key,
+                 volname);
+        goto out;
+    }
+
+    keylen = snprintf(key, sizeof(key), "%s.subvol_count", key_prefix);
+    ret = dict_get_int32n(peer_data, key, keylen, &new_volinfo->subvol_count);
+    if (ret) {
+        snprintf(msg, sizeof(msg), "%s missing in payload for %s", key,
+                 volname);
+        goto out;
+    }
+
+    /* not having a 'stripe_count' key is not a error
+       (as peer may be of old version) */
+    keylen = snprintf(key, sizeof(key), "%s.stripe_count", key_prefix);
+    ret = dict_get_int32n(peer_data, key, keylen, &new_volinfo->stripe_count);
+    if (ret)
+        gf_msg(THIS->name, GF_LOG_INFO, 0, GD_MSG_DICT_GET_FAILED,
+               "peer is possibly old version");
+
+    /* not having a 'replica_count' key is not a error
+       (as peer may be of old version) */
+    keylen = snprintf(key, sizeof(key), "%s.replica_count", key_prefix);
+    ret = dict_get_int32n(peer_data, key, keylen, &new_volinfo->replica_count);
+    if (ret)
+        gf_msg(THIS->name, GF_LOG_INFO, 0, GD_MSG_DICT_GET_FAILED,
+               "peer is possibly old version");
+
+    /* not having a 'arbiter_count' key is not a error
+       (as peer may be of old version) */
+    keylen = snprintf(key, sizeof(key), "%s.arbiter_count", key_prefix);
+    ret = dict_get_int32n(peer_data, key, keylen, &new_volinfo->arbiter_count);
+    if (ret)
+        gf_msg(THIS->name, GF_LOG_INFO, 0, GD_MSG_DICT_GET_FAILED,
+               "peer is possibly old version");
+
+    /* not having a 'thin_arbiter_count' key is not a error
+       (as peer may be of old version) */
+    keylen = snprintf(key, sizeof(key), "%s.thin_arbiter_count", key_prefix);
+    ret = dict_get_int32n(peer_data, key, keylen,
+                          &new_volinfo->thin_arbiter_count);
+    if (ret)
+        gf_msg(THIS->name, GF_LOG_INFO, 0, GD_MSG_DICT_GET_FAILED,
+               "peer is possibly old version");
+
+    /* not having a 'disperse_count' key is not a error
+       (as peer may be of old version) */
+    keylen = snprintf(key, sizeof(key), "%s.disperse_count", key_prefix);
+    ret = dict_get_int32n(peer_data, key, keylen, &new_volinfo->disperse_count);
+    if (ret)
+        gf_msg(THIS->name, GF_LOG_INFO, 0, GD_MSG_DICT_GET_FAILED,
+               "peer is possibly old version");
+
+    /* not having a 'redundancy_count' key is not a error
+       (as peer may be of old version) */
+    keylen = snprintf(key, sizeof(key), "%s.redundancy_count", key_prefix);
+    ret = dict_get_int32n(peer_data, key, keylen,
+                          &new_volinfo->redundancy_count);
+    if (ret)
+        gf_msg(THIS->name, GF_LOG_INFO, 0, GD_MSG_DICT_GET_FAILED,
+               "peer is possibly old version");
+
+    /* not having a 'dist_count' key is not a error
+       (as peer may be of old version) */
+    keylen = snprintf(key, sizeof(key), "%s.dist_count", key_prefix);
+    ret = dict_get_int32n(peer_data, key, keylen,
+                          &new_volinfo->dist_leaf_count);
+    if (ret)
+        gf_msg(THIS->name, GF_LOG_INFO, 0, GD_MSG_DICT_GET_FAILED,
+               "peer is possibly old version");
+
+    /* The subvol_count read from the payload above is replaced by a value
+     * derived from the brick count and the distribute leaf count.
+     */
+    new_volinfo->subvol_count = new_volinfo->brick_count /
+                                glusterd_get_dist_leaf_count(new_volinfo);
+    /* NOTE(review): the key is spelled "ckusm"; presumably this mirrors the
+     * key written by the sending side, so it is kept as is - confirm before
+     * renaming.
+     */
+    snprintf(key, sizeof(key), "%s.ckusm", key_prefix);
+    ret = dict_get_uint32(peer_data, key, &new_volinfo->cksum);
+    if (ret) {
+        snprintf(msg, sizeof(msg), "%s missing in payload for %s", key,
+                 volname);
+        goto out;
+    }
+
+    keylen = snprintf(key, sizeof(key), "%s.volume_id", key_prefix);
+    ret = dict_get_strn(peer_data, key, keylen, &volume_id_str);
+    if (ret) {
+        snprintf(msg, sizeof(msg), "%s missing in payload for %s", key,
+                 volname);
+        goto out;
+    }
+
+    gf_uuid_parse(volume_id_str, new_volinfo->volume_id);
+
+    /* username and password are optional, but setting them must succeed
+     * when they are present.
+     */
+    keylen = snprintf(key, sizeof(key), "%s.username", key_prefix);
+    ret = dict_get_strn(peer_data, key, keylen, &str);
+    if (!ret) {
+        ret = glusterd_auth_set_username(new_volinfo, str);
+        if (ret)
+            goto out;
+    }
+
+    keylen = snprintf(key, sizeof(key), "%s.password", key_prefix);
+    ret = dict_get_strn(peer_data, key, keylen, &str);
+    if (!ret) {
+        ret = glusterd_auth_set_password(new_volinfo, str);
+        if (ret)
+            goto out;
+    }
+
+    snprintf(key, sizeof(key), "%s.transport_type", key_prefix);
+    ret = dict_get_uint32(peer_data, key, &new_volinfo->transport_type);
+    if (ret) {
+        snprintf(msg, sizeof(msg), "%s missing in payload for %s", key,
+                 volname);
+        goto out;
+    }
+
+    snprintf(key, sizeof(key), "%s.rebalance", key_prefix);
+    ret = dict_get_uint32(peer_data, key, &new_volinfo->rebal.defrag_cmd);
+    if (ret) {
+        snprintf(msg, sizeof(msg), "%s missing in payload for %s", key,
+                 volname);
+        goto out;
+    }
+
+    keylen = snprintf(key, sizeof(key), "%s.rebalance-id", key_prefix);
+    ret = dict_get_strn(peer_data, key, keylen, &rebalance_id_str);
+    if (ret) {
+        /* This is not present in older glusterfs versions,
+         * so don't error out
+         */
+        ret = 0;
+    } else {
+        gf_uuid_parse(rebalance_id_str, new_volinfo->rebal.rebalance_id);
+    }
+
+    snprintf(key, sizeof(key), "%s.rebalance-op", key_prefix);
+    /* This is not present in older glusterfs versions,
+     * so don't error out
+     */
+    ret = dict_get_uint32(peer_data, key, (uint32_t *)&new_volinfo->rebal.op);
+
+    ret = gd_import_friend_volume_rebal_dict(peer_data, count, new_volinfo);
+    if (ret) {
+        snprintf(msg, sizeof(msg),
+                 "Failed to import rebalance dict "
+                 "for volume.");
+        goto out;
+    }
+
+    ret = gd_import_volume_snap_details(peer_data, new_volinfo, key_prefix,
+                                        volname);
+    if (ret) {
+        gf_msg("glusterd", GF_LOG_ERROR, 0, GD_MSG_SNAP_DETAILS_IMPORT_FAIL,
+               "Failed to import snapshot "
+               "details for volume %s",
+               volname);
+        goto out;
+    }
+
+    ret = glusterd_import_friend_volume_opts(peer_data, count, new_volinfo,
+                                             prefix);
+    if (ret)
+        goto out;
+
+    /* Import the volume's op-versions if available else set it to 1.
+     * Not having op-versions implies this information was obtained from a
+     * op-version 1 friend (gluster-3.3), ergo the cluster is at op-version
+     * 1 and all volumes are at op-versions 1.
+     *
+     * Either both the volume op-versions should be absent or both should be
+     * present. Only one being present is a failure
+     */
+    keylen = snprintf(key, sizeof(key), "%s.op-version", key_prefix);
+    ret = dict_get_int32n(peer_data, key, keylen, &op_version);
+    if (ret)
+        ret = 0;
+    keylen = snprintf(key, sizeof(key), "%s.client-op-version", key_prefix);
+    ret = dict_get_int32n(peer_data, key, keylen, &client_op_version);
+    if (ret)
+        ret = 0;
+
+    if (op_version && client_op_version) {
+        new_volinfo->op_version = op_version;
+        new_volinfo->client_op_version = client_op_version;
+    } else if (((op_version == 0) && (client_op_version != 0)) ||
+               ((op_version != 0) && (client_op_version == 0))) {
+        ret = -1;
+        gf_msg("glusterd", GF_LOG_ERROR, 0, GD_MSG_DICT_GET_FAILED,
+               "Only one volume op-version found");
+        goto out;
+    } else {
+        new_volinfo->op_version = 1;
+        new_volinfo->client_op_version = 1;
+    }
+
+    keylen = snprintf(key, sizeof(key), "%s.quota-xattr-version", key_prefix);
+    /*This is not present in older glusterfs versions, so ignore ret value*/
+    ret = dict_get_int32n(peer_data, key, keylen,
+                          &new_volinfo->quota_xattr_version);
+
+    ret = glusterd_import_bricks(peer_data, count, new_volinfo, prefix);
+    if (ret)
+        goto out;
+
+    *volinfo = new_volinfo;
+out:
+    /* NOTE(review): new_volinfo is not released on the failure paths taken
+     * after glusterd_volinfo_new() - confirm who owns it.
+     */
+    if (msg[0]) {
+        gf_msg("glusterd", GF_LOG_ERROR, 0, GD_MSG_VOLINFO_IMPORT_FAIL, "%s",
+               msg);
+        /* msg can be set before the volinfo has been created (bad prefix,
+         * missing name), in which case new_volinfo is still NULL.
+         */
+        if (new_volinfo)
+            gf_event(EVENT_IMPORT_VOLUME_FAILED, "volume=%s",
+                     new_volinfo->volname);
+    }
+    gf_msg_debug("glusterd", 0, "Returning with %d", ret);
+    return ret;
+}
+
+/* Disconnect every started brick of volinfo.
+ *
+ * With brick multiplexing enabled a brick is only disconnected when the
+ * brick process found for its port reports no attached bricks (or when no
+ * such process can be found); otherwise it is left connected.
+ *
+ * Returns 0 on success. Iteration stops at the first brick whose disconnect
+ * fails and that error is returned.
+ */
+int32_t
+glusterd_volume_disconnect_all_bricks(glusterd_volinfo_t *volinfo)
+{
+    int ret = 0;
+    glusterd_brickinfo_t *brickinfo = NULL;
+    glusterd_brick_proc_t *brick_proc = NULL;
+    int brick_count = 0;
+
+    GF_ASSERT(volinfo);
+
+    cds_list_for_each_entry(brickinfo, &volinfo->bricks, brick_list)
+    {
+        if (glusterd_is_brick_started(brickinfo)) {
+            /* If brick multiplexing is enabled then we can't
+             * blindly set brickinfo->rpc to NULL as it might impact
+             * the other attached bricks.
+             */
+            /* Start from a clean count for every brick so that a failed
+             * lookup does not reuse the value found for a previous brick.
+             */
+            brick_count = 0;
+            ret = glusterd_brick_proc_for_port(brickinfo->port, &brick_proc);
+            if (!ret) {
+                brick_count = brick_proc->brick_count;
+            }
+            if (!is_brick_mx_enabled() || brick_count == 0) {
+                ret = glusterd_brick_disconnect(brickinfo);
+                if (ret) {
+                    gf_msg("glusterd", GF_LOG_ERROR, 0,
+                           GD_MSD_BRICK_DISCONNECT_FAIL,
+                           "Failed to "
+                           "disconnect %s:%s",
+                           brickinfo->hostname, brickinfo->path);
+                    break;
+                }
+            }
+        }
+    }
+
+    return ret;
+}
+
+/* Carry per-brick runtime data over from old_volinfo to new_volinfo.
+ *
+ * For every brick (and, when thin_arbiter_count is 1, every thin-arbiter
+ * brick) of new_volinfo that also exists in old_volinfo, the port is copied
+ * and real_path is either copied from the old brick or, if the old brick has
+ * none, resolved with realpath() from the new brick's path.
+ *
+ * Returns 0 on success, -1 if realpath() fails or the resolved path does not
+ * fit into real_path. Bricks without a counterpart in old_volinfo are
+ * skipped.
+ */
+int32_t
+glusterd_volinfo_copy_brickinfo(glusterd_volinfo_t *old_volinfo,
+                                glusterd_volinfo_t *new_volinfo)
+{
+    glusterd_brickinfo_t *new_brickinfo = NULL;
+    glusterd_brickinfo_t *old_brickinfo = NULL;
+    glusterd_brickinfo_t *new_ta_brickinfo = NULL;
+    glusterd_brickinfo_t *old_ta_brickinfo = NULL;
+    glusterd_conf_t *priv = NULL;
+    int ret = 0;
+    xlator_t *this = NULL;
+    char abspath[PATH_MAX] = "";
+
+    GF_ASSERT(new_volinfo);
+    GF_ASSERT(old_volinfo);
+    this = THIS;
+    GF_ASSERT(this);
+    priv = this->private;
+    GF_ASSERT(priv);
+
+    cds_list_for_each_entry(new_brickinfo, &new_volinfo->bricks, brick_list)
+    {
+        ret = glusterd_volume_brickinfo_get(
+            new_brickinfo->uuid, new_brickinfo->hostname, new_brickinfo->path,
+            old_volinfo, &old_brickinfo);
+        if (ret == 0) {
+            new_brickinfo->port = old_brickinfo->port;
+
+            if (old_brickinfo->real_path[0] == '\0') {
+                if (!realpath(new_brickinfo->path, abspath)) {
+                    /* Here an ENOENT should also be a
+                     * failure as the brick is expected to
+                     * be in existence
+                     */
+                    gf_msg(this->name, GF_LOG_CRITICAL, errno,
+                           GD_MSG_BRICKINFO_CREATE_FAIL,
+                           "realpath () failed for brick "
+                           "%s. The underlying filesystem "
+                           "may be in bad state",
+                           new_brickinfo->path);
+                    ret = -1;
+                    goto out;
+                }
+                /* The length check guarantees that the strncpy() below
+                 * also copies the terminating NUL.
+                 */
+                if (strlen(abspath) >= sizeof(new_brickinfo->real_path)) {
+                    ret = -1;
+                    goto out;
+                }
+                (void)strncpy(new_brickinfo->real_path, abspath,
+                              sizeof(new_brickinfo->real_path));
+            } else {
+                (void)strncpy(new_brickinfo->real_path,
+                              old_brickinfo->real_path,
+                              sizeof(new_brickinfo->real_path));
+            }
+        }
+    }
+    if (new_volinfo->thin_arbiter_count == 1) {
+        cds_list_for_each_entry(new_ta_brickinfo, &new_volinfo->ta_bricks,
+                                brick_list)
+        {
+            ret = glusterd_volume_ta_brickinfo_get(
+                new_ta_brickinfo->uuid, new_ta_brickinfo->hostname,
+                new_ta_brickinfo->path, old_volinfo, &old_ta_brickinfo);
+            if (ret == 0) {
+                new_ta_brickinfo->port = old_ta_brickinfo->port;
+
+                if (old_ta_brickinfo->real_path[0] == '\0') {
+                    if (!realpath(new_ta_brickinfo->path, abspath)) {
+                        /* Here an ENOENT should also be a
+                         * failure as the brick is expected to
+                         * be in existence
+                         */
+                        /* Report the thin-arbiter brick that failed;
+                         * new_brickinfo is the spent iterator of the
+                         * loop above and no longer names a brick.
+                         */
+                        gf_msg(this->name, GF_LOG_CRITICAL, errno,
+                               GD_MSG_BRICKINFO_CREATE_FAIL,
+                               "realpath () failed for brick "
+                               "%s. The underlying filesystem "
+                               "may be in bad state",
+                               new_ta_brickinfo->path);
+                        ret = -1;
+                        goto out;
+                    }
+                    if (strlen(abspath) >=
+                        sizeof(new_ta_brickinfo->real_path)) {
+                        ret = -1;
+                        goto out;
+                    }
+                    (void)strncpy(new_ta_brickinfo->real_path, abspath,
+                                  sizeof(new_ta_brickinfo->real_path));
+                } else {
+                    (void)strncpy(new_ta_brickinfo->real_path,
+                                  old_ta_brickinfo->real_path,
+                                  sizeof(new_ta_brickinfo->real_path));
+                }
+            }
+        }
+    }
+    /* A brick that is missing from old_volinfo is not an error. */
+    ret = 0;
+
+out:
+    return ret;
+}
+
+/* Stop the bricks of old_volinfo that should not keep running once
+ * new_volinfo replaces it.
+ *
+ * Nothing is done unless old_volinfo is started. A failure to stop a brick
+ * is logged and otherwise ignored, so the function always returns 0.
+ */
+int32_t
+glusterd_volinfo_stop_stale_bricks(glusterd_volinfo_t *new_volinfo,
+                                   glusterd_volinfo_t *old_volinfo)
+{
+    glusterd_brickinfo_t *stale = NULL;
+    glusterd_brickinfo_t *match = NULL;
+    int rc = 0;
+
+    GF_ASSERT(new_volinfo);
+    GF_ASSERT(old_volinfo);
+
+    if (glusterd_is_volume_started(old_volinfo) != _gf_false) {
+        cds_list_for_each_entry(stale, &old_volinfo->bricks, brick_list)
+        {
+            rc = glusterd_volume_brickinfo_get(stale->uuid, stale->hostname,
+                                               stale->path, new_volinfo,
+                                               &match);
+            /* Keep the brick running only when it is part of the new
+             * volume, is not pending a snap and the old volume has no
+             * peer update flagged. In every other case (stale brick,
+             * pending snap, or brick multiplexing enabled) the brick
+             * process is stopped.
+             */
+            if (!rc && (match->snap_status != -1) &&
+                !GF_ATOMIC_GET(old_volinfo->volpeerupdate))
+                continue;
+
+            /*TODO: may need to switch to 'atomic' flavour of
+             * brick_stop, once we make peer rpc program also
+             * synctask enabled*/
+            rc = glusterd_brick_stop(old_volinfo, stale, _gf_false);
+            if (rc) {
+                gf_msg("glusterd", GF_LOG_ERROR, 0, GD_MSG_BRICK_STOP_FAIL,
+                       "Failed to stop"
+                       " brick %s:%s",
+                       stale->hostname, stale->path);
+            }
+        }
+        rc = 0;
+    }
+
+    gf_msg_debug("glusterd", 0, "Returning with %d", rc);
+    return rc;
+}
+
+/* Dispose of stale_volinfo, which is being superseded by valid_volinfo.
+ *
+ * The snapshot volumes hanging off stale_volinfo are re-linked onto
+ * valid_volinfo, the stale bricks are stopped/disconnected and deleted, the
+ * stale store handle is unlinked and destroyed, the snapd/shd service
+ * managers are run against the (now stopped) stale volume, and finally the
+ * stale volinfo is removed.
+ *
+ * Every step is best effort: failures are at most logged and the function
+ * always returns 0.
+ */
+int32_t
+glusterd_delete_stale_volume(glusterd_volinfo_t *stale_volinfo,
+                             glusterd_volinfo_t *valid_volinfo)
+{
+    int32_t ret = -1;
+    glusterd_volinfo_t *temp_volinfo = NULL;
+    glusterd_volinfo_t *voliter = NULL;
+    xlator_t *this = NULL;
+    glusterd_svc_t *svc = NULL;
+
+    GF_ASSERT(stale_volinfo);
+    GF_ASSERT(valid_volinfo);
+    this = THIS;
+    GF_ASSERT(this);
+
+    /* Move the snap_volumes entries from stale_volinfo to valid_volinfo,
+     * recounting them as they are appended.
+     */
+    valid_volinfo->snap_count = 0;
+    cds_list_for_each_entry_safe(voliter, temp_volinfo,
+                                 &stale_volinfo->snap_volumes, snapvol_list)
+    {
+        cds_list_add_tail(&voliter->snapvol_list, &valid_volinfo->snap_volumes);
+        valid_volinfo->snap_count++;
+    }
+
+    /* The stale volume was restored from a snapshot that the valid volume
+     * does not share: remove the LVM snapshot backing the stale volume.
+     * A failure here is only logged.
+     */
+    if ((!gf_uuid_is_null(stale_volinfo->restored_from_snap)) &&
+        (gf_uuid_compare(stale_volinfo->restored_from_snap,
+                         valid_volinfo->restored_from_snap))) {
+        ret = glusterd_lvm_snapshot_remove(NULL, stale_volinfo);
+        if (ret) {
+            gf_msg(this->name, GF_LOG_WARNING, 0, GD_MSG_SNAP_REMOVE_FAIL,
+                   "Failed to remove lvm snapshot for "
+                   "restored volume %s",
+                   stale_volinfo->volname);
+        }
+    }
+
+    /* If the stale volume is in started state: stop only the stale bricks
+     * when the new volume is started too, otherwise stop all bricks.
+     * We don't want brick_rpc_notify to access already deleted brickinfo,
+     * so disconnect all bricks from stale_volinfo (unconditionally), since
+     * they are being deleted subsequently.
+     */
+    if (glusterd_is_volume_started(stale_volinfo)) {
+        if (glusterd_is_volume_started(valid_volinfo)) {
+            (void)glusterd_volinfo_stop_stale_bricks(valid_volinfo,
+                                                     stale_volinfo);
+
+        } else {
+            (void)glusterd_stop_bricks(stale_volinfo);
+        }
+
+        (void)glusterd_volume_disconnect_all_bricks(stale_volinfo);
+    }
+    /* Delete all the bricks, the store handle and the vol files. They will
+     * be created again from valid_volinfo. A full volume store delete must
+     * not be performed because some of the bricks could still be running,
+     * keeping pid files under the run directory.
+     */
+    (void)glusterd_delete_all_bricks(stale_volinfo);
+    if (stale_volinfo->shandle) {
+        sys_unlink(stale_volinfo->shandle->path);
+        (void)gf_store_handle_destroy(stale_volinfo->shandle);
+        stale_volinfo->shandle = NULL;
+    }
+
+    /* Mark the volume as stopped before invoking the service managers, so
+     * that they see a stopped volume (the volume is being deleted).
+     */
+    stale_volinfo->status = GLUSTERD_STATUS_STOPPED;
+
+    /* snapd is only managed for regular (non-snapshot) volumes. */
+    if (!stale_volinfo->is_snap_volume) {
+        svc = &(stale_volinfo->snapd.svc);
+        (void)svc->manager(svc, stale_volinfo, PROC_START_NO_WAIT);
+    }
+    svc = &(stale_volinfo->shd.svc);
+    (void)svc->manager(svc, stale_volinfo, PROC_START_NO_WAIT);
+
+    (void)glusterd_volinfo_remove(stale_volinfo);
+
+    return 0;
+}
+
+/* Update the rebalance information of new_volinfo using the information
+ * from old_volinfo.
+ *
+ * Any rpc connection to the old rebalance process is disconnected first.
+ * If the old volume has a rebalance id that differs from the new one, the
+ * old rebalance process is stopped and nothing is copied. Otherwise the
+ * status and counters are copied from old_volinfo to new_volinfo.
+ *
+ * Returns 0 when the rebalance status was carried over, -1 when the tasks
+ * did not match and the old rebalance process was stopped instead.
+ */
+int
+gd_check_and_update_rebalance_info(glusterd_volinfo_t *old_volinfo,
+                                   glusterd_volinfo_t *new_volinfo)
+{
+    int ret = -1;
+    glusterd_rebalance_t *old = NULL;
+    glusterd_rebalance_t *new = NULL;
+
+    GF_ASSERT(old_volinfo);
+    GF_ASSERT(new_volinfo);
+
+    old = &(old_volinfo->rebal);
+    new = &(new_volinfo->rebal);
+
+    // Disconnect from rebalance process
+    if (glusterd_defrag_rpc_get(old->defrag)) {
+        rpc_transport_disconnect(old->defrag->rpc->conn.trans, _gf_false);
+        glusterd_defrag_rpc_put(old->defrag);
+    }
+
+    if (!gf_uuid_is_null(old->rebalance_id) &&
+        gf_uuid_compare(old->rebalance_id, new->rebalance_id)) {
+        (void)gd_stop_rebalance_process(old_volinfo);
+        goto out;
+    }
+
+    /* If the tasks match, copy the status and other information of the
+     * rebalance process from old_volinfo to new_volinfo
+     */
+    new->defrag_status = old->defrag_status;
+    new->rebalance_files = old->rebalance_files;
+    new->rebalance_data = old->rebalance_data;
+    new->lookedup_files = old->lookedup_files;
+    new->skipped_files = old->skipped_files;
+    new->rebalance_failures = old->rebalance_failures;
+    new->rebalance_time = old->rebalance_time;
+
+    /* glusterd_rebalance_t.{op, id, defrag_cmd} are copied during volume
+     * import; a new defrag object should come to life when rebalance is
+     * restarted.
+     */
+
+    /* Previously ret was never updated, so even a successful copy was
+     * reported as -1.
+     */
+    ret = 0;
+out:
+    return ret;
+}
+
+/* Import volume number 'count' from a peer's volume data (peer_data).
+ *
+ * Nothing is imported unless peer_data carries a non-zero
+ * "volume<count>.update" flag. Otherwise a new volinfo is built from
+ * peer_data. If a local volume with the same name exists and is older, its
+ * rebalance and brick details are carried over to the new volinfo and the
+ * old one is deleted as stale. The new volinfo is then stored, its volfiles
+ * are created, it is inserted into priv->volumes in name order and, when the
+ * volume is started, its bricks are started and the snapd (if enabled) and
+ * shd service managers are invoked. Finally the quota configuration is
+ * imported and a fetchspec notification is sent.
+ *
+ * Returns 0 on success and on the "nothing to import" paths, non-zero on
+ * failure.
+ */
+static int32_t
+glusterd_import_friend_volume(dict_t *peer_data, int count)
+{
+    int32_t ret = -1;
+    glusterd_conf_t *priv = NULL;
+    xlator_t *this = NULL;
+    glusterd_volinfo_t *old_volinfo = NULL;
+    glusterd_volinfo_t *new_volinfo = NULL;
+    glusterd_svc_t *svc = NULL;
+    int32_t update = 0;
+    char key[64] = "";
+
+    GF_ASSERT(peer_data);
+
+    this = THIS;
+    GF_ASSERT(this);
+    priv = this->private;
+    GF_ASSERT(priv);
+
+    /* snprintf's return value is reused as the key length for the lookup. */
+    ret = snprintf(key, sizeof(key), "volume%d.update", count);
+    ret = dict_get_int32n(peer_data, key, ret, &update);
+    if (ret) {
+        gf_smsg(this->name, GF_LOG_ERROR, errno, GD_MSG_DICT_GET_FAILED,
+                "Key=%s", key, NULL);
+        goto out;
+    }
+
+    if (!update) {
+        /* if update is 0 that means the volume is not imported */
+        gf_smsg(this->name, GF_LOG_INFO, 0, GD_MSG_VOLUME_NOT_IMPORTED, NULL);
+        goto out;
+    }
+
+    ret = glusterd_import_volinfo(peer_data, count, &new_volinfo, "volume");
+    if (ret)
+        goto out;
+
+    /* A successful import that produced no volinfo means there is nothing
+     * to do here (ret is 0).
+     */
+    if (!new_volinfo) {
+        gf_msg_debug(this->name, 0, "Not importing snap volume");
+        goto out;
+    }
+
+    ret = glusterd_volinfo_find(new_volinfo->volname, &old_volinfo);
+    if (0 == ret) {
+        if (new_volinfo->version <= old_volinfo->version) {
+            /* When this condition is true, it already means that
+             * the other synctask thread of import volume has
+             * already up to date volume, so just ignore this volume
+             * now
+             */
+            /* NOTE(review): new_volinfo is not released on this path, nor
+             * on the failure paths below that run before it is linked into
+             * priv->volumes - confirm whether it needs to be unref'd.
+             */
+            goto out;
+        }
+        /* Ref count the old_volinfo such that deleting it doesn't crash
+         * if its been already in use by other thread
+         */
+        glusterd_volinfo_ref(old_volinfo);
+        (void)gd_check_and_update_rebalance_info(old_volinfo, new_volinfo);
+
+        /* Copy brick ports & real_path from the old volinfo always.
+         * The old_volinfo will be cleaned up and this information
+         * could be lost
+         */
+        (void)glusterd_volinfo_copy_brickinfo(old_volinfo, new_volinfo);
+
+        (void)glusterd_delete_stale_volume(old_volinfo, new_volinfo);
+        glusterd_volinfo_unref(old_volinfo);
+    }
+
+    /* A failed glusterd_volinfo_find() is not an error: the volume is simply
+     * new to this node, and ret is overwritten by the store call below.
+     */
+    ret = glusterd_store_volinfo(new_volinfo, GLUSTERD_VOLINFO_VER_AC_NONE);
+    if (ret) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_VOLINFO_STORE_FAIL,
+               "Failed to store "
+               "volinfo for volume %s",
+               new_volinfo->volname);
+        goto out;
+    }
+
+    ret = glusterd_create_volfiles(new_volinfo);
+    if (ret)
+        goto out;
+
+    /* Keep priv->volumes sorted by volume name. */
+    glusterd_list_add_order(&new_volinfo->vol_list, &priv->volumes,
+                            glusterd_compare_volume_name);
+
+    /* Brick and service start-up is best effort: failures raise an event
+     * (for the service managers) but do not fail the import.
+     */
+    if (glusterd_is_volume_started(new_volinfo)) {
+        (void)glusterd_start_bricks(new_volinfo);
+        if (glusterd_is_snapd_enabled(new_volinfo)) {
+            svc = &(new_volinfo->snapd.svc);
+            if (svc->manager(svc, new_volinfo, PROC_START_NO_WAIT)) {
+                gf_event(EVENT_SVC_MANAGER_FAILED, "svc_name=%s", svc->name);
+            }
+        }
+        svc = &(new_volinfo->shd.svc);
+        if (svc->manager(svc, new_volinfo, PROC_START_NO_WAIT)) {
+            gf_event(EVENT_SVC_MANAGER_FAILED, "svc_name=%s", svc->name);
+        }
+    }
+
+    ret = glusterd_import_quota_conf(peer_data, count, new_volinfo, "volume");
+    if (ret) {
+        gf_event(EVENT_IMPORT_QUOTA_CONF_FAILED, "volume=%s",
+                 new_volinfo->volname);
+        goto out;
+    }
+
+    ret = glusterd_fetchspec_notify(this);
+out:
+    gf_msg_debug("glusterd", 0, "Returning with ret: %d", ret);
+    return ret;
+}
+
+/* Synctask body that imports every volume described by a serialized peer
+ * dictionary.
+ *
+ * opaque is a glusterd_friend_synctask_args_t carrying the serialized
+ * dictionary (dict_buf / dictlen). Both the buffer and the argument structure
+ * are freed here before returning, on success and on failure alike.
+ *
+ * Volumes 1..count are imported in order, stopping at the first failure;
+ * glusterd_svcs_manager() is run only if every volume was imported.
+ *
+ * Returns 0 on success, non-zero otherwise.
+ */
+int32_t
+glusterd_import_friend_volumes_synctask(void *opaque)
+{
+    int32_t ret = -1;
+    int32_t count = 0;
+    int i = 1;
+    xlator_t *this = NULL;
+    glusterd_conf_t *conf = NULL;
+    dict_t *peer_data = NULL;
+    glusterd_friend_synctask_args_t *arg = NULL;
+
+    this = THIS;
+    GF_ASSERT(this);
+
+    conf = this->private;
+    GF_ASSERT(conf);
+
+    arg = opaque;
+    if (!arg)
+        goto out;
+
+    peer_data = dict_new();
+    if (!peer_data) {
+        gf_smsg(this->name, GF_LOG_ERROR, errno, GD_MSG_DICT_CREATE_FAIL, NULL);
+        goto out;
+    }
+
+    /* Rebuild the peer's dictionary from the serialized buffer. */
+    ret = dict_unserialize(arg->dict_buf, arg->dictlen, &peer_data);
+    if (ret) {
+        gf_smsg(this->name, GF_LOG_ERROR, errno, GD_MSG_DICT_UNSERIALIZE_FAIL,
+                NULL);
+        errno = ENOMEM;
+        goto out;
+    }
+
+    ret = dict_get_int32n(peer_data, "count", SLEN("count"), &count);
+    if (ret) {
+        gf_smsg(this->name, GF_LOG_ERROR, errno, GD_MSG_DICT_GET_FAILED,
+                "Key=count", NULL);
+        goto out;
+    }
+
+    /* NOTE(review): big_lock is taken here and not released anywhere in this
+     * function - presumably the synctask completion path releases it;
+     * confirm against glusterd_launch_synctask().
+     */
+    synclock_lock(&conf->big_lock);
+
+    /* Importing a volume must not race with another thread that is
+     * restarting bricks as part of a glusterd restart (refer
+     * glusterd_restart_bricks ()). Wait until no restart is in progress,
+     * then claim the restart_bricks flag for the duration of the import.
+     */
+    while (conf->restart_bricks) {
+        synccond_wait(&conf->cond_restart_bricks, &conf->big_lock);
+    }
+    conf->restart_bricks = _gf_true;
+
+    while (i <= count) {
+        ret = glusterd_import_friend_volume(peer_data, i);
+        if (ret) {
+            break;
+        }
+        i++;
+    }
+    /* i > count only when the loop ran to completion without a failure. */
+    if (i > count) {
+        glusterd_svcs_manager(NULL);
+    }
+    /* Release the flag and wake up anyone waiting to restart bricks. */
+    conf->restart_bricks = _gf_false;
+    synccond_broadcast(&conf->cond_restart_bricks);
+out:
+    if (peer_data)
+        dict_unref(peer_data);
+    if (arg) {
+        if (arg->dict_buf)
+            GF_FREE(arg->dict_buf);
+        GF_FREE(arg);
+    }
+
+    gf_msg_debug("glusterd", 0, "Returning with %d", ret);
+    return ret;
+}
+
+/* Import, one after another, every volume listed in peer_data.
+ *
+ * The number of volumes is read from the "count" key; volumes are numbered
+ * from 1. The first failing import ends the walk and its status is returned.
+ */
+int32_t
+glusterd_import_friend_volumes(dict_t *peer_data)
+{
+    int32_t ret = -1;
+    int32_t count = 0;
+    int vol_idx = 0;
+
+    GF_ASSERT(peer_data);
+
+    ret = dict_get_int32n(peer_data, "count", SLEN("count"), &count);
+    if (ret) {
+        gf_smsg("glusterd", GF_LOG_ERROR, errno, GD_MSG_DICT_GET_FAILED,
+                "Key=count", NULL);
+        goto out;
+    }
+
+    for (vol_idx = 1; vol_idx <= count; vol_idx++) {
+        ret = glusterd_import_friend_volume(peer_data, vol_idx);
+        if (ret)
+            break;
+    }
+
+out:
+    gf_msg_debug("glusterd", 0, "Returning with %d", ret);
+    return ret;
+}
+
+/* Fetch the GLUSTERD_QUORUM_RATIO_KEY string from opts and convert it with
+ * gf_string2percent() into *quorum.
+ *
+ * Returns 0 on success; a missing key (which is logged) or a failed
+ * conversion yields the corresponding non-zero status.
+ */
+int
+glusterd_get_global_server_quorum_ratio(dict_t *opts, double *quorum)
+{
+    xlator_t *this = THIS;
+    char *ratio_str = NULL;
+    int ret = -1;
+
+    GF_ASSERT(this);
+
+    ret = dict_get_strn(opts, GLUSTERD_QUORUM_RATIO_KEY,
+                        SLEN(GLUSTERD_QUORUM_RATIO_KEY), &ratio_str);
+    if (ret != 0) {
+        gf_smsg(this->name, GF_LOG_ERROR, errno, GD_MSG_DICT_GET_FAILED,
+                "Key=%s", GLUSTERD_QUORUM_RATIO_KEY, NULL);
+        return ret;
+    }
+
+    /* The conversion status is the final status. */
+    return gf_string2percent(ratio_str, quorum);
+}
+
+/* Fetch the GLUSTERD_GLOBAL_OPT_VERSION string from opts and convert it with
+ * gf_string2uint() into *version.
+ *
+ * Returns 0 on success; a missing key (which is logged) or a failed
+ * conversion yields the corresponding non-zero status.
+ */
+int
+glusterd_get_global_opt_version(dict_t *opts, uint32_t *version)
+{
+    xlator_t *this = THIS;
+    char *stored = NULL;
+    int ret = -1;
+
+    GF_ASSERT(this);
+
+    ret = dict_get_strn(opts, GLUSTERD_GLOBAL_OPT_VERSION,
+                        SLEN(GLUSTERD_GLOBAL_OPT_VERSION), &stored);
+    if (ret != 0) {
+        gf_smsg(this->name, GF_LOG_ERROR, errno, GD_MSG_DICT_GET_FAILED,
+                "Key=%s", GLUSTERD_GLOBAL_OPT_VERSION, NULL);
+        return ret;
+    }
+
+    /* The conversion status is the final status. */
+    return gf_string2uint(stored, version);
+}
+
+/* Produce the string form of the next global option version.
+ *
+ * Reads the current version from opts, increments it and stores a newly
+ * allocated (gf_strdup) decimal string in *version_str.
+ *
+ * Returns 0 on success, non-zero if the current version cannot be read or
+ * the string cannot be allocated. On allocation failure *version_str is NULL.
+ */
+int
+glusterd_get_next_global_opt_version_str(dict_t *opts, char **version_str)
+{
+    int ret = -1;
+    char version_string[64] = "";
+    uint32_t version = 0;
+
+    ret = glusterd_get_global_opt_version(opts, &version);
+    if (ret)
+        goto out;
+    version++;
+    snprintf(version_string, sizeof(version_string), "%" PRIu32, version);
+    *version_str = gf_strdup(version_string);
+    /* ret is 0 at this point, so an allocation failure has to be flagged
+     * explicitly; otherwise callers would see success with a NULL string.
+     */
+    if (!*version_str)
+        ret = -1;
+out:
+    return ret;
+}
+
+/* Import the global options advertised by a peer in friend_data.
+ *
+ * The peer's options are unpacked into a fresh dictionary. If the peer's
+ * global option version is newer than the local one, the options are stored,
+ * conf->opts is replaced by them, and - when the server quorum ratio changed -
+ * a synctask is launched to restart bricks.
+ *
+ * Returns 0 on success, and also when the peer sent no "global-opt-count"
+ * (treated as an old-version peer); non-zero on failure.
+ */
+int32_t
+glusterd_import_global_opts(dict_t *friend_data)
+{
+    xlator_t *this = NULL;
+    glusterd_conf_t *conf = NULL;
+    int ret = -1;
+    dict_t *import_options = NULL;
+    int count = 0;
+    uint32_t local_version = 0;
+    uint32_t remote_version = 0;
+    double old_quorum = 0.0;
+    double new_quorum = 0.0;
+
+    this = THIS;
+    conf = this->private;
+
+    ret = dict_get_int32n(friend_data, "global-opt-count",
+                          SLEN("global-opt-count"), &count);
+    if (ret) {
+        // old version peer
+        gf_smsg(this->name, GF_LOG_INFO, errno, GD_MSG_DICT_GET_FAILED,
+                "Key=global-opt-count", NULL);
+        ret = 0;
+        goto out;
+    }
+
+    import_options = dict_new();
+    if (!import_options) {
+        gf_smsg(this->name, GF_LOG_ERROR, errno, GD_MSG_DICT_CREATE_FAIL, NULL);
+        /* ret still holds the 0 from the lookup above; without this the
+         * allocation failure would be reported as success.
+         */
+        ret = -1;
+        goto out;
+    }
+    ret = import_prdict_dict(friend_data, import_options, "key", "val", count,
+                             "global");
+    if (ret) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_GLOBAL_OPT_IMPORT_FAIL,
+               "Failed to import"
+               " global options");
+        goto out;
+    }
+
+    /* Results deliberately ignored: server-quorum-ratio might not yet be
+     * set, in which case the corresponding value simply stays 0.0.
+     */
+    (void)glusterd_get_global_server_quorum_ratio(conf->opts, &old_quorum);
+    (void)glusterd_get_global_server_quorum_ratio(import_options, &new_quorum);
+
+    ret = glusterd_get_global_opt_version(conf->opts, &local_version);
+    if (ret)
+        goto out;
+    ret = glusterd_get_global_opt_version(import_options, &remote_version);
+    if (ret)
+        goto out;
+
+    if (remote_version > local_version) {
+        ret = glusterd_store_options(this, import_options);
+        if (ret)
+            goto out;
+        /* Swap in the imported options; the extra reference taken here
+         * outlives the dict_unref() at 'out'.
+         */
+        dict_unref(conf->opts);
+        conf->opts = dict_ref(import_options);
+
+        /* If server quorum ratio has changed, restart bricks to
+         * recompute if quorum is met. If quorum is not met bricks are
+         * not started and those already running are stopped
+         */
+        if (old_quorum != new_quorum) {
+            glusterd_launch_synctask(glusterd_restart_bricks, NULL);
+        }
+    }
+
+    ret = 0;
+out:
+    if (import_options)
+        dict_unref(import_options);
+    return ret;
+}
+
+/* Compare the volume data received from peer 'hostname' with local state.
+ *
+ * Global options are imported first. Each of the peer's volumes (numbered
+ * 1..count) is then compared via glusterd_compare_friend_volume(), which
+ * reports its verdict through *status:
+ *   - GLUSTERD_VOL_COMP_RJT ends the comparison immediately (return 0);
+ *   - GLUSTERD_VOL_COMP_UPDATE_REQ on any volume schedules an import.
+ * When an import is required, peer_data is serialized and handed to
+ * glusterd_import_friend_volumes_synctask(), which takes ownership of the
+ * argument structure and its buffer.
+ *
+ * Returns 0 on success, non-zero on failure.
+ */
+int32_t
+glusterd_compare_friend_data(dict_t *peer_data, int32_t *status, char *hostname)
+{
+    int32_t ret = -1;
+    int32_t count = 0;
+    int i = 1;
+    gf_boolean_t update = _gf_false;
+    xlator_t *this = NULL;
+    glusterd_conf_t *priv = NULL;
+    glusterd_friend_synctask_args_t *arg = NULL;
+
+    this = THIS;
+    GF_ASSERT(this);
+    GF_ASSERT(peer_data);
+    GF_ASSERT(status);
+
+    priv = this->private;
+    GF_ASSERT(priv);
+    ret = glusterd_import_global_opts(peer_data);
+    if (ret) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_GLOBAL_OPT_IMPORT_FAIL,
+               "Importing global "
+               "options failed");
+        goto out;
+    }
+
+    ret = dict_get_int32n(peer_data, "count", SLEN("count"), &count);
+    if (ret) {
+        gf_smsg(this->name, GF_LOG_ERROR, errno, GD_MSG_DICT_GET_FAILED,
+                "Key=count", NULL);
+        goto out;
+    }
+
+    while (i <= count) {
+        ret = glusterd_compare_friend_volume(peer_data, i, status, hostname);
+        if (ret)
+            goto out;
+
+        if (GLUSTERD_VOL_COMP_RJT == *status) {
+            ret = 0;
+            goto out;
+        }
+        if (GLUSTERD_VOL_COMP_UPDATE_REQ == *status) {
+            update = _gf_true;
+        }
+        i++;
+    }
+
+    if (update) {
+        /* Launch the import friend volume as a separate synctask as it
+         * has to trigger start bricks where we may need to wait for the
+         * first brick to come up before attaching the subsequent bricks
+         * in case brick multiplexing is enabled
+         */
+        arg = GF_CALLOC(1, sizeof(*arg), gf_common_mt_char);
+        if (!arg) {
+            /* Without this check the serialize call below would
+             * dereference a NULL pointer on allocation failure.
+             */
+            gf_msg(this->name, GF_LOG_ERROR, ENOMEM, GD_MSG_NO_MEMORY,
+                   "Failed to allocate import friend volume arguments");
+            ret = -1;
+            goto out;
+        }
+        ret = dict_allocate_and_serialize(peer_data, &arg->dict_buf,
+                                          &arg->dictlen);
+        if (ret < 0) {
+            gf_log(this->name, GF_LOG_ERROR,
+                   "dict_serialize failed while handling "
+                   " import friend volume request");
+            goto out;
+        }
+
+        glusterd_launch_synctask(glusterd_import_friend_volumes_synctask, arg);
+    }
+
+out:
+    /* On failure the synctask was never launched, so arg is still ours. */
+    if (ret && arg) {
+        GF_FREE(arg);
+    }
+    gf_msg_debug(this->name, 0, "Returning with ret: %d, status: %d", ret,
+                 *status);
+    return ret;
+}
+
+/* Take a reference on defrag's rpc client while holding defrag->lock.
+ *
+ * Returns whatever rpc_clnt_ref() hands back, or NULL when defrag is NULL.
+ */
+struct rpc_clnt *
+glusterd_defrag_rpc_get(glusterd_defrag_info_t *defrag)
+{
+    struct rpc_clnt *held = NULL;
+
+    if (defrag) {
+        LOCK(&defrag->lock);
+        held = rpc_clnt_ref(defrag->rpc);
+        UNLOCK(&defrag->lock);
+    }
+
+    return held;
+}
+
+/* Drop a reference on defrag's rpc client while holding defrag->lock.
+ *
+ * defrag->rpc is overwritten with the value rpc_clnt_unref() returns, and
+ * that same value is returned. Returns NULL when defrag is NULL.
+ */
+struct rpc_clnt *
+glusterd_defrag_rpc_put(glusterd_defrag_info_t *defrag)
+{
+    struct rpc_clnt *remaining = NULL;
+
+    if (defrag) {
+        LOCK(&defrag->lock);
+        remaining = rpc_clnt_unref(defrag->rpc);
+        defrag->rpc = remaining;
+        UNLOCK(&defrag->lock);
+    }
+
+    return remaining;
+}
+
+/* Return the rpc client to use for a pending node, chosen by node type.
+ *
+ * Only the rebalance case goes through glusterd_defrag_rpc_get(); the other
+ * types return the stored pointer directly. Returns NULL when pending_node
+ * or its node pointer is NULL; an unknown type trips GF_ASSERT.
+ */
+struct rpc_clnt *
+glusterd_pending_node_get_rpc(glusterd_pending_node_t *pending_node)
+{
+    struct rpc_clnt *rpc = NULL;
+    glusterd_brickinfo_t *brick = NULL;
+    glusterd_volinfo_t *vol = NULL;
+    glusterd_svc_t *service = NULL;
+
+    GF_VALIDATE_OR_GOTO(THIS->name, pending_node, out);
+    GF_VALIDATE_OR_GOTO(THIS->name, pending_node->node, out);
+
+    switch (pending_node->type) {
+        case GD_NODE_BRICK:
+            brick = pending_node->node;
+            rpc = brick->rpc;
+            break;
+
+        case GD_NODE_SHD:
+        case GD_NODE_NFS:
+        case GD_NODE_QUOTAD:
+        case GD_NODE_SCRUB:
+            service = pending_node->node;
+            rpc = service->conn.rpc;
+            break;
+
+        case GD_NODE_REBALANCE:
+            vol = pending_node->node;
+            rpc = glusterd_defrag_rpc_get(vol->rebal.defrag);
+            break;
+
+        case GD_NODE_SNAPD:
+            vol = pending_node->node;
+            rpc = vol->snapd.svc.conn.rpc;
+            break;
+
+        default:
+            GF_ASSERT(0);
+            break;
+    }
+
+out:
+    return rpc;
+}
+
+/* Counterpart of glusterd_pending_node_get_rpc(): only rebalance nodes
+ * obtained their rpc through glusterd_defrag_rpc_get(), so only they need
+ * glusterd_defrag_rpc_put(). Every other node type is a no-op.
+ */
+void
+glusterd_pending_node_put_rpc(glusterd_pending_node_t *pending_node)
+{
+    glusterd_volinfo_t *vol = NULL;
+
+    if (pending_node->type == GD_NODE_REBALANCE) {
+        vol = pending_node->node;
+        glusterd_defrag_rpc_put(vol->rebal.defrag);
+    }
+}
+
+/* Unlink sockfpath, treating an already missing file as success.
+ *
+ * Returns 0 when the file was removed or did not exist; otherwise logs the
+ * error and returns the sys_unlink() status.
+ */
+int32_t
+glusterd_unlink_file(char *sockfpath)
+{
+    int ret = sys_unlink(sockfpath);
+
+    /* Nothing to report if the unlink worked or the file was never there. */
+    if (ret == 0 || errno == ENOENT)
+        return 0;
+
+    gf_msg(THIS->name, GF_LOG_ERROR, errno, GD_MSG_FILE_OP_FAILED,
+           "Failed to remove %s"
+           " error: %s",
+           sockfpath, strerror(errno));
+    return ret;
+}
+
+/* Unset the portmapper registrations of the NFS-related programs (MOUNT v3
+ * and v1, NFS v3, NLM v4 and v1, ACL v3), logging the outcome of each
+ * pmap_unset() call. A failed de-registration does not stop the others.
+ */
+void
+glusterd_nfs_pmap_deregister()
+{
+    /* Each check handles the failure case first; pmap_unset() returning
+     * non-zero is treated as success.
+     */
+    if (!pmap_unset(MOUNT_PROGRAM, MOUNTV3_VERSION))
+        gf_msg("glusterd", GF_LOG_ERROR, 0, GD_MSG_PMAP_UNSET_FAIL,
+               "De-register MOUNTV3 is unsuccessful");
+    else
+        gf_msg("glusterd", GF_LOG_INFO, 0, GD_MSG_DEREGISTER_SUCCESS,
+               "De-registered MOUNTV3 successfully");
+
+    if (!pmap_unset(MOUNT_PROGRAM, MOUNTV1_VERSION))
+        gf_msg("glusterd", GF_LOG_ERROR, 0, GD_MSG_PMAP_UNSET_FAIL,
+               "De-register MOUNTV1 is unsuccessful");
+    else
+        gf_msg("glusterd", GF_LOG_INFO, 0, GD_MSG_DEREGISTER_SUCCESS,
+               "De-registered MOUNTV1 successfully");
+
+    if (!pmap_unset(NFS_PROGRAM, NFSV3_VERSION))
+        gf_msg("glusterd", GF_LOG_ERROR, 0, GD_MSG_PMAP_UNSET_FAIL,
+               "De-register NFSV3 is unsuccessful");
+    else
+        gf_msg("glusterd", GF_LOG_INFO, 0, GD_MSG_DEREGISTER_SUCCESS,
+               "De-registered NFSV3 successfully");
+
+    if (!pmap_unset(NLM_PROGRAM, NLMV4_VERSION))
+        gf_msg("glusterd", GF_LOG_ERROR, 0, GD_MSG_PMAP_UNSET_FAIL,
+               "De-registration of NLM v4 failed");
+    else
+        gf_msg("glusterd", GF_LOG_INFO, 0, GD_MSG_DEREGISTER_SUCCESS,
+               "De-registered NLM v4 successfully");
+
+    if (!pmap_unset(NLM_PROGRAM, NLMV1_VERSION))
+        gf_msg("glusterd", GF_LOG_ERROR, 0, GD_MSG_PMAP_UNSET_FAIL,
+               "De-registration of NLM v1 failed");
+    else
+        gf_msg("glusterd", GF_LOG_INFO, 0, GD_MSG_DEREGISTER_SUCCESS,
+               "De-registered NLM v1 successfully");
+
+    if (!pmap_unset(ACL_PROGRAM, ACLV3_VERSION))
+        gf_msg("glusterd", GF_LOG_ERROR, 0, GD_MSG_PMAP_UNSET_FAIL,
+               "De-registration of ACL v3 failed");
+    else
+        gf_msg("glusterd", GF_LOG_INFO, 0, GD_MSG_DEREGISTER_SUCCESS,
+               "De-registered ACL v3 successfully");
+}
+
+/* Add the status of the node-level service named 'server' to dict, using the
+ * brick<count>.{hostname,path,port,pid,status} keys.
+ *
+ * Recognised services are quotad, bitd, scrub and (only when built with
+ * BUILD_GNFS) nfs. An empty or unrecognised server name adds nothing and
+ * returns 0. vol_opts is only consulted for "nfs.port" in BUILD_GNFS builds.
+ *
+ * Returns 0 on success, non-zero if a dictionary operation fails.
+ */
+int
+glusterd_add_node_to_dict(char *server, dict_t *dict, int count,
+                          dict_t *vol_opts)
+{
+    int ret = -1;
+    char pidfile[PATH_MAX] = "";
+    gf_boolean_t running = _gf_false;
+    int pid = -1;
+    int port = 0;
+    glusterd_svc_t *svc = NULL;
+    char key[64] = "";
+    int keylen;
+    xlator_t *this = NULL;
+    glusterd_conf_t *priv = NULL;
+
+    this = THIS;
+    GF_ASSERT(this);
+
+    priv = this->private;
+    GF_ASSERT(priv);
+
+    if (!strcmp(server, "")) {
+        ret = 0;
+        goto out;
+    }
+
+    glusterd_svc_build_pidfile_path(server, priv->rundir, pidfile,
+                                    sizeof(pidfile));
+
+    /* Map the service name onto its svc object; unknown names are ignored. */
+    if (strcmp(server, priv->quotad_svc.name) == 0)
+        svc = &(priv->quotad_svc);
+#ifdef BUILD_GNFS
+    else if (strcmp(server, priv->nfs_svc.name) == 0)
+        svc = &(priv->nfs_svc);
+#endif
+    else if (strcmp(server, priv->bitd_svc.name) == 0)
+        svc = &(priv->bitd_svc);
+    else if (strcmp(server, priv->scrub_svc.name) == 0)
+        svc = &(priv->scrub_svc);
+    else {
+        ret = 0;
+        goto out;
+    }
+
+    // Consider service to be running only when glusterd sees it Online
+    if (svc->online)
+        running = gf_is_service_running(pidfile, &pid);
+
+    /* For these node-level daemons the brick keys are populated as:
+     * brick<n>.hostname = display name of the daemon (e.g. "NFS Server")
+     * brick<n>.path = string form of MY_UUID
+     * brick<n>.port = 0, except for the NFS server
+     *
+     * This might be confusing, but cli displays the name of
+     * the brick as hostname+path, so this will make more sense
+     * when output.
+     */
+
+    keylen = snprintf(key, sizeof(key), "brick%d.hostname", count);
+    if (!strcmp(server, priv->quotad_svc.name))
+        ret = dict_set_nstrn(dict, key, keylen, "Quota Daemon",
+                             SLEN("Quota Daemon"));
+#ifdef BUILD_GNFS
+    else if (!strcmp(server, priv->nfs_svc.name))
+        ret = dict_set_nstrn(dict, key, keylen, "NFS Server",
+                             SLEN("NFS Server"));
+#endif
+    else if (!strcmp(server, priv->bitd_svc.name))
+        ret = dict_set_nstrn(dict, key, keylen, "Bitrot Daemon",
+                             SLEN("Bitrot Daemon"));
+    else if (!strcmp(server, priv->scrub_svc.name))
+        ret = dict_set_nstrn(dict, key, keylen, "Scrubber Daemon",
+                             SLEN("Scrubber Daemon"));
+    if (ret) {
+        gf_smsg(this->name, GF_LOG_ERROR, errno, GD_MSG_DICT_SET_FAILED,
+                "Key=%s", key, NULL);
+        goto out;
+    }
+
+    /* The dictionary is handed a freshly duplicated uuid string. */
+    keylen = snprintf(key, sizeof(key), "brick%d.path", count);
+    ret = dict_set_dynstrn(dict, key, keylen, gf_strdup(uuid_utoa(MY_UUID)));
+    if (ret) {
+        gf_smsg(this->name, GF_LOG_ERROR, errno, GD_MSG_DICT_SET_FAILED,
+                "Key=%s", key, NULL);
+        goto out;
+    }
+
+#ifdef BUILD_GNFS
+    /* Port is available only for the NFS server: the configured
+     * "nfs.port" if present in vol_opts, GF_NFS3_PORT otherwise.
+     * The other daemons keep the default of 0.
+     */
+    if (!strcmp(server, priv->nfs_svc.name)) {
+        if (dict_getn(vol_opts, "nfs.port", SLEN("nfs.port"))) {
+            ret = dict_get_int32n(vol_opts, "nfs.port", SLEN("nfs.port"),
+                                  &port);
+            if (ret) {
+                gf_smsg(this->name, GF_LOG_ERROR, errno, GD_MSG_DICT_GET_FAILED,
+                        "Key=nfs.port", NULL);
+                goto out;
+            }
+        } else
+            port = GF_NFS3_PORT;
+    }
+#endif
+    keylen = snprintf(key, sizeof(key), "brick%d.port", count);
+    ret = dict_set_int32n(dict, key, keylen, port);
+    if (ret) {
+        gf_smsg(this->name, GF_LOG_ERROR, errno, GD_MSG_DICT_SET_FAILED,
+                "Key=%s", key, NULL);
+        goto out;
+    }
+
+    /* pid stays -1 unless gf_is_service_running() filled it in above. */
+    keylen = snprintf(key, sizeof(key), "brick%d.pid", count);
+    ret = dict_set_int32n(dict, key, keylen, pid);
+    if (ret) {
+        gf_smsg(this->name, GF_LOG_ERROR, errno, GD_MSG_DICT_SET_FAILED,
+                "Key=%s", key, NULL);
+        goto out;
+    }
+
+    keylen = snprintf(key, sizeof(key), "brick%d.status", count);
+    ret = dict_set_int32n(dict, key, keylen, running);
+    if (ret) {
+        gf_smsg(this->name, GF_LOG_ERROR, errno, GD_MSG_DICT_SET_FAILED,
+                "Key=%s", key, NULL);
+        goto out;
+    }
+
+out:
+    gf_msg_debug(THIS->name, 0, "Returning %d", ret);
+    return ret;
+}
+
+/* Write the host name of the peer that sent req into remote_host.
+ *
+ * The name is derived from the transport's peer identifier. If
+ * gf_get_hostname_from_ip() succeeds and yields a name, that name is used
+ * instead. remote_host must have room for len bytes.
+ *
+ * Returns 0 on success. If no host name can be extracted, remote_host is
+ * zero-filled and -1 is returned.
+ */
+int
+glusterd_remote_hostname_get(rpcsvc_request_t *req, char *remote_host, int len)
+{
+    GF_ASSERT(req);
+    GF_ASSERT(remote_host);
+    GF_ASSERT(req->trans);
+
+    char *name = NULL;
+    char *hostname = NULL;
+    char *tmp_host = NULL;
+    char *canon = NULL;
+    int ret = 0;
+
+    /* Work on a private copy of the identifier.
+     * NOTE(review): hostname presumably ends up pointing into tmp_host's
+     * buffer (only tmp_host is ever freed) - confirm against get_host_name().
+     */
+    name = req->trans->peerinfo.identifier;
+    tmp_host = gf_strdup(name);
+    if (tmp_host)
+        get_host_name(tmp_host, &hostname);
+
+    GF_ASSERT(hostname);
+    if (!hostname) {
+        memset(remote_host, 0, len);
+        ret = -1;
+        goto out;
+    }
+
+    /* Prefer the name returned by the lookup. The copy is released and
+     * tmp_host re-pointed at canon so that the single GF_FREE() at 'out'
+     * releases whichever buffer is in use.
+     */
+    if ((gf_get_hostname_from_ip(hostname, &canon) == 0) && canon) {
+        GF_FREE(tmp_host);
+        tmp_host = hostname = canon;
+    }
+
+    (void)snprintf(remote_host, len, "%s", hostname);
+
+out:
+    GF_FREE(tmp_host);
+    return ret;
+}
+
+/* Report whether no volume known to this glusterd is in started state.
+ *
+ * Returns _gf_false as soon as one started volume is found, _gf_true
+ * otherwise (including when there are no volumes at all).
+ */
+gf_boolean_t
+glusterd_are_all_volumes_stopped()
+{
+    xlator_t *this = THIS;
+    glusterd_conf_t *priv = NULL;
+    glusterd_volinfo_t *vol = NULL;
+    gf_boolean_t all_stopped = _gf_true;
+
+    GF_ASSERT(this);
+    priv = this->private;
+    GF_ASSERT(priv);
+
+    cds_list_for_each_entry(vol, &priv->volumes, vol_list)
+    {
+        if (vol->status == GLUSTERD_STATUS_STARTED) {
+            all_stopped = _gf_false;
+            break;
+        }
+    }
+
+    return all_stopped;
+}
+
+/* Report whether no shd-compatible volume is in started state.
+ *
+ * Volumes for which glusterd_is_shd_compatible_volume() is false are
+ * ignored. Returns _gf_false as soon as a started, compatible volume is
+ * found, _gf_true otherwise.
+ */
+gf_boolean_t
+glusterd_all_shd_compatible_volumes_stopped()
+{
+    xlator_t *this = THIS;
+    glusterd_conf_t *priv = NULL;
+    glusterd_volinfo_t *vol = NULL;
+    gf_boolean_t all_stopped = _gf_true;
+
+    GF_ASSERT(this);
+    priv = this->private;
+    GF_ASSERT(priv);
+
+    cds_list_for_each_entry(vol, &priv->volumes, vol_list)
+    {
+        if (glusterd_is_shd_compatible_volume(vol) &&
+            vol->status == GLUSTERD_STATUS_STARTED) {
+            all_stopped = _gf_false;
+            break;
+        }
+    }
+
+    return all_stopped;
+}
+
+/* Report whether no quota-enabled volume is in started state.
+ *
+ * Volumes for which glusterd_is_volume_quota_enabled() is false are ignored.
+ * Returns _gf_false as soon as a started, quota-enabled volume is found,
+ * _gf_true otherwise.
+ */
+gf_boolean_t
+glusterd_all_volumes_with_quota_stopped()
+{
+    xlator_t *this = THIS;
+    glusterd_conf_t *priv = NULL;
+    glusterd_volinfo_t *vol = NULL;
+    gf_boolean_t all_stopped = _gf_true;
+
+    GF_ASSERT(this);
+    priv = this->private;
+    GF_ASSERT(priv);
+
+    cds_list_for_each_entry(vol, &priv->volumes, vol_list)
+    {
+        if (glusterd_is_volume_quota_enabled(vol) &&
+            vol->status == GLUSTERD_STATUS_STARTED) {
+            all_stopped = _gf_false;
+            break;
+        }
+    }
+
+    return all_stopped;
+}
+
+/* Report whether this glusterd knows about at least one volume.
+ *
+ * Returns _gf_true when priv->volumes is non-empty; _gf_false when it is
+ * empty or when the xlator or its private data is unavailable.
+ */
+gf_boolean_t
+glusterd_have_volumes()
+{
+    gf_boolean_t found = _gf_false;
+    xlator_t *this = THIS;
+    glusterd_conf_t *priv = NULL;
+
+    GF_VALIDATE_OR_GOTO("glusterd", (this != NULL), out);
+
+    priv = this->private;
+    GF_VALIDATE_OR_GOTO(this->name, (priv != NULL), out);
+
+    if (!cds_list_empty(&priv->volumes))
+        found = _gf_true;
+out:
+    return found;
+}
+
+/* Count the entries on priv->volumes and return that number. */
+int
+glusterd_volume_count_get(void)
+{
+    xlator_t *this = THIS;
+    glusterd_conf_t *priv = NULL;
+    glusterd_volinfo_t *vol = NULL;
+    int32_t nr_volumes = 0;
+
+    GF_ASSERT(this);
+
+    priv = this->private;
+
+    cds_list_for_each_entry(vol, &priv->volumes, vol_list)
+    {
+        nr_volumes++;
+    }
+
+    gf_msg_debug("glusterd", 0, "Returning %d", nr_volumes);
+    return nr_volumes;
+}
+
+/* Search every volume on priv->volumes for the brick identified by
+ * uuid/hostname/path, using glusterd_volume_brickinfo_get() per volume.
+ *
+ * The search stops at the first volume for which the lookup returns 0.
+ * Returns 0 in that case; otherwise the status of the last lookup, or -1
+ * when there are no volumes.
+ */
+int
+glusterd_brickinfo_get(uuid_t uuid, char *hostname, char *path,
+                       glusterd_brickinfo_t **brickinfo)
+{
+    xlator_t *this = THIS;
+    glusterd_conf_t *priv = NULL;
+    glusterd_volinfo_t *vol = NULL;
+    int ret = -1;
+
+    GF_ASSERT(path);
+    GF_ASSERT(this);
+
+    priv = this->private;
+
+    cds_list_for_each_entry(vol, &priv->volumes, vol_list)
+    {
+        ret = glusterd_volume_brickinfo_get(uuid, hostname, path, vol,
+                                            brickinfo);
+        if (ret == 0) {
+            /* Found */
+            break;
+        }
+    }
+
+    return ret;
+}
+
+/* Generic completion callback: decrement conf->blockers, broadcast on
+ * cond_blockers when it reaches zero, then destroy the call stack.
+ * req, iov and count are unused. Always returns 0.
+ */
+static int32_t
+my_callback(struct rpc_req *req, struct iovec *iov, int count, void *v_frame)
+{
+    call_frame_t *frame = v_frame;
+    glusterd_conf_t *conf = frame->this->private;
+
+    /* Zero outstanding blockers: wake up whoever waits on cond_blockers. */
+    if (!GF_ATOMIC_DEC(conf->blockers))
+        synccond_broadcast(&conf->cond_blockers);
+
+    STACK_DESTROY(frame->root);
+
+    return 0;
+}
+
+/* Completion callback for a brick attach request.
+ *
+ * frame->local carries the brick being attached (brickinfo) and
+ * frame->cookie the brick whose pidfile and rpc client it shares on success
+ * (other_brick). Both are cleared from the frame before any processing.
+ *
+ * On a successful response (rsp.op_ret == 0) other_brick's pidfile is copied
+ * to brickinfo's, and brickinfo is marked started with a reference on
+ * other_brick's rpc. On a failed response brickinfo is marked stopped, taken
+ * out of its brick process and the volinfo is stored again.
+ *
+ * Whatever happens, conf->blockers is decremented (broadcasting on
+ * cond_blockers when it reaches zero), the stack is destroyed and 0 is
+ * returned.
+ */
+static int32_t
+attach_brick_callback(struct rpc_req *req, struct iovec *iov, int count,
+                      void *v_frame)
+{
+    call_frame_t *frame = v_frame;
+    glusterd_conf_t *conf = frame->this->private;
+    glusterd_brickinfo_t *brickinfo = frame->local;
+    glusterd_brickinfo_t *other_brick = frame->cookie;
+    glusterd_volinfo_t *volinfo = NULL;
+    xlator_t *this = THIS;
+    int ret = -1;
+    char pidfile1[PATH_MAX] = "";
+    char pidfile2[PATH_MAX] = "";
+    gf_getspec_rsp rsp = {
+        0,
+    };
+    int last_brick = -1;
+
+    /* Detach the brick pointers from the frame before it is destroyed. */
+    frame->local = NULL;
+    frame->cookie = NULL;
+
+    if (!iov) {
+        gf_log(frame->this->name, GF_LOG_ERROR, "iov is NULL");
+        ret = -1;
+        goto out;
+    }
+
+    /* NOTE(review): anything xdr_to_generic() allocates inside rsp is not
+     * released in this function - confirm whether that is a leak.
+     */
+    ret = xdr_to_generic(*iov, &rsp, (xdrproc_t)xdr_gf_getspec_rsp);
+    if (ret < 0) {
+        gf_log(frame->this->name, GF_LOG_ERROR, "XDR decoding error");
+        ret = -1;
+        goto out;
+    }
+
+    /* pidfile1: pidfile of the brick that was attached to. */
+    ret = glusterd_get_volinfo_from_brick(other_brick->path, &volinfo);
+    if (ret) {
+        gf_msg(THIS->name, GF_LOG_ERROR, 0, GD_MSG_VOLINFO_GET_FAIL,
+               "Failed to get volinfo"
+               " from brick(%s) so  pidfile copying/unlink will fail",
+               other_brick->path);
+        goto out;
+    }
+    GLUSTERD_GET_BRICK_PIDFILE(pidfile1, volinfo, other_brick, conf);
+    volinfo = NULL;
+
+    /* pidfile2: pidfile of the brick being attached. volinfo stays set to
+     * that brick's volume for the failure handling below.
+     */
+    ret = glusterd_get_volinfo_from_brick(brickinfo->path, &volinfo);
+    if (ret) {
+        gf_msg(THIS->name, GF_LOG_ERROR, 0, GD_MSG_VOLINFO_GET_FAIL,
+               "Failed to get volinfo"
+               " from brick(%s) so  pidfile copying/unlink will fail",
+               brickinfo->path);
+        goto out;
+    }
+    GLUSTERD_GET_BRICK_PIDFILE(pidfile2, volinfo, brickinfo, conf);
+
+    if (rsp.op_ret == 0) {
+        brickinfo->port_registered = _gf_true;
+
+        /* PID file is copied once brick has attached
+           successfully
+        */
+        ret = glusterd_copy_file(pidfile1, pidfile2);
+        if (ret) {
+            gf_msg(this->name, GF_LOG_ERROR, ENOMEM, GD_MSG_NO_MEMORY,
+                   "Could not copy file %s to %s", pidfile1, pidfile2);
+            goto out;
+        }
+
+        /* The attached brick shares the rpc client of other_brick. */
+        brickinfo->status = GF_BRICK_STARTED;
+        brickinfo->rpc = rpc_clnt_ref(other_brick->rpc);
+        gf_log(THIS->name, GF_LOG_INFO, "brick %s is attached successfully",
+               brickinfo->path);
+    } else {
+        gf_log(THIS->name, GF_LOG_INFO,
+               "attach_brick failed pidfile"
+               " is %s for brick_path %s",
+               pidfile2, brickinfo->path);
+        /* Attach failed: reset the brick's state and persist the volinfo. */
+        brickinfo->port = 0;
+        brickinfo->status = GF_BRICK_STOPPED;
+        ret = glusterd_brick_process_remove_brick(brickinfo, &last_brick);
+        if (ret)
+            gf_msg_debug(this->name, 0,
+                         "Couldn't remove brick from"
+                         " brick process");
+        LOCK(&volinfo->lock);
+        ret = glusterd_store_volinfo(volinfo, GLUSTERD_VOLINFO_VER_AC_NONE);
+        UNLOCK(&volinfo->lock);
+        if (ret) {
+            gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_VOLINFO_SET_FAIL,
+                   "Failed to store volinfo of "
+                   "%s volume",
+                   volinfo->volname);
+            goto out;
+        }
+    }
+out:
+    /* ret is informational only; the callback always reports 0. */
+    if (GF_ATOMIC_DEC(conf->blockers) == 0) {
+        synccond_broadcast(&conf->cond_blockers);
+    }
+    STACK_DESTROY(frame->root);
+    return 0;
+}
+
+/*
+ * Build a gd1_mgmt_brick_op_req for @op that names @path and submit it on
+ * @rpc through gd_brick_prog.
+ *
+ * For GLUSTERD_BRICK_ATTACH the frame carries @brickinfo (frame->local) and
+ * @other_brick (frame->cookie) and the reply is handled by
+ * attach_brick_callback; every other op is completed by my_callback.
+ *
+ * conf->blockers is incremented just before the request is submitted.
+ *
+ * Returns -1 when @rpc is NULL or not connected, when a buffer or frame
+ * cannot be allocated, or when serialization fails; otherwise returns the
+ * result of rpc_clnt_submit().
+ */
+int
+send_attach_req(xlator_t *this, struct rpc_clnt *rpc, char *path,
+                glusterd_brickinfo_t *brickinfo,
+                glusterd_brickinfo_t *other_brick, int op)
+{
+    int ret = -1;
+    struct iobuf *iobuf = NULL;
+    struct iobref *iobref = NULL;
+    struct iovec iov = {
+        0,
+    };
+    ssize_t req_size = 0;
+    call_frame_t *frame = NULL;
+    gd1_mgmt_brick_op_req brick_req;
+    void *req = &brick_req;
+    struct rpc_clnt_connection *conn;
+    glusterd_conf_t *conf = this->private;
+    extern struct rpc_clnt_program gd_brick_prog;
+    fop_cbk_fn_t cbkfn = my_callback;
+
+    if (!rpc) {
+        gf_log(this->name, GF_LOG_ERROR, "called with null rpc");
+        return -1;
+    }
+
+    conn = &rpc->conn;
+    if (!conn->connected || conn->disconnected) {
+        gf_log(this->name, GF_LOG_INFO, "not connected yet");
+        return -1;
+    }
+
+    brick_req.op = op;
+    brick_req.name = path;
+    brick_req.input.input_val = NULL;
+    brick_req.input.input_len = 0;
+    brick_req.dict.dict_val = NULL;
+    brick_req.dict.dict_len = 0;
+
+    req_size = xdr_sizeof((xdrproc_t)xdr_gd1_mgmt_brick_op_req, req);
+    iobuf = iobuf_get2(rpc->ctx->iobuf_pool, req_size);
+    if (!iobuf) {
+        goto err;
+    }
+
+    iov.iov_base = iobuf->ptr;
+    iov.iov_len = iobuf_pagesize(iobuf);
+
+    iobref = iobref_new();
+    if (!iobref) {
+        gf_smsg(this->name, GF_LOG_ERROR, errno, GD_MSG_NO_MEMORY, NULL);
+        goto maybe_free_iobuf;
+    }
+
+    frame = create_frame(this, this->ctx->pool);
+    if (!frame) {
+        gf_smsg(this->name, GF_LOG_ERROR, errno, GD_MSG_FRAME_CREATE_FAIL,
+                NULL);
+        goto free_iobref;
+    }
+
+    iobref_add(iobref, iobuf);
+    /*
+     * Drop our reference to the iobuf.  The iobref should already have
+     * one after iobref_add, so when we unref that we'll free the iobuf as
+     * well.
+     */
+    iobuf_unref(iobuf);
+    /* Set the pointer to null so we don't free it on a later error. */
+    iobuf = NULL;
+
+    /* Create the xdr payload */
+    ret = xdr_serialize_generic(iov, req, (xdrproc_t)xdr_gd1_mgmt_brick_op_req);
+    if (ret == -1) {
+        /*
+         * The frame has not been handed to the RPC layer yet, so no
+         * callback will ever run to destroy it; release it here or it
+         * leaks.  frame->local and frame->cookie are still unset.
+         */
+        STACK_DESTROY(frame->root);
+        goto free_iobref;
+    }
+
+    iov.iov_len = ret;
+
+    if (op == GLUSTERD_BRICK_ATTACH) {
+        frame->local = brickinfo;
+        frame->cookie = other_brick;
+        cbkfn = attach_brick_callback;
+    }
+    /* Send the msg */
+    GF_ATOMIC_INC(conf->blockers);
+    ret = rpc_clnt_submit(rpc, &gd_brick_prog, op, cbkfn, &iov, 1, NULL, 0,
+                          iobref, frame, NULL, 0, NULL, 0, NULL);
+
+free_iobref:
+    iobref_unref(iobref);
+maybe_free_iobuf:
+    /* Only non-NULL when we failed before the iobref took ownership. */
+    if (iobuf) {
+        iobuf_unref(iobuf);
+    }
+err:
+    return ret;
+}
+
+/*
+ * Forward declaration only: the definition is not in this part of the file.
+ * attach_brick() calls it with a brick's volfile id and a PATH_MAX buffer,
+ * and passes the filled buffer on as the path of the attach request.
+ */
+extern size_t
+build_volfile_path(char *volume_id, char *path, size_t path_len,
+                   char *trusted_str, dict_t *dict);
+
+/*
+ * Ask the process that already serves other_brick to serve brickinfo too.
+ *
+ * The volfile id for brickinfo is assembled (snapshot volumes use a longer,
+ * prefixed form), turned into a volfile path, and sent as a
+ * GLUSTERD_BRICK_ATTACH request over other_brick's RPC connection.  The send
+ * is attempted up to 15 times; between attempts the big lock is released for
+ * one second.  After a successful send the brick is added to the port map
+ * under other_brick's port and to other_brick's brick process.
+ *
+ * Returns 0 on success and a non-zero value otherwise.  other_vol is unused.
+ */
+static int
+attach_brick(xlator_t *this, glusterd_brickinfo_t *brickinfo,
+             glusterd_brickinfo_t *other_brick, glusterd_volinfo_t *volinfo,
+             glusterd_volinfo_t *other_vol)
+{
+    glusterd_conf_t *conf = this->private;
+    char flat_path[PATH_MAX] = "";
+    char volfile_id[PATH_MAX] = "";
+    char volfile_path[PATH_MAX] = "";
+    rpc_clnt_t *peer_rpc = NULL;
+    int32_t id_len = 0;
+    int attempts_left = 0;
+    int ret = -1;
+
+    gf_log(this->name, GF_LOG_INFO, "add brick %s to existing process for %s",
+           brickinfo->path, other_brick->path);
+
+    GLUSTERD_REMOVE_SLASH_FROM_PATH(brickinfo->path, flat_path);
+
+    if (!volinfo->is_snap_volume) {
+        id_len = snprintf(volfile_id, sizeof(volfile_id), "%s.%s.%s",
+                          volinfo->volname, brickinfo->hostname, flat_path);
+    } else {
+        id_len = snprintf(volfile_id, sizeof(volfile_id),
+                          "/%s/%s/%s/%s.%s.%s", GLUSTERD_VOL_SNAP_DIR_PREFIX,
+                          volinfo->snapshot->snapname, volinfo->volname,
+                          volinfo->volname, brickinfo->hostname, flat_path);
+    }
+    /* Reject formatting errors and ids that did not fit the buffer. */
+    if (id_len < 0 || (size_t)id_len >= sizeof(volfile_id))
+        goto out;
+
+    (void)build_volfile_path(volfile_id, volfile_path, sizeof(volfile_path),
+                             NULL, NULL);
+
+    for (attempts_left = 15; attempts_left > 0; attempts_left--) {
+        peer_rpc = rpc_clnt_ref(other_brick->rpc);
+        if (peer_rpc != NULL) {
+            ret = send_attach_req(this, peer_rpc, volfile_path, brickinfo,
+                                  other_brick, GLUSTERD_BRICK_ATTACH);
+            rpc_clnt_unref(peer_rpc);
+            if (ret == 0)
+                goto attached;
+        }
+        /*
+         * Dropping the big lock here may not be strictly safe, but
+         * while we hold it the connection cannot finish coming up and
+         * every retry would fail the same way.  The alternatives (for
+         * instance a dedicated thread) are considerably more complex
+         * and risky.
+         * TBD: see if there's a better way
+         */
+        synclock_unlock(&conf->big_lock);
+        synctask_sleep(1);
+        synclock_lock(&conf->big_lock);
+    }
+    goto out;
+
+attached:
+    ret = pmap_registry_extend(this, other_brick->port, brickinfo->path);
+    if (ret != 0) {
+        gf_log(this->name, GF_LOG_ERROR, "adding brick to process failed");
+        goto out;
+    }
+
+    brickinfo->port = other_brick->port;
+    ret = glusterd_brick_process_add_brick(brickinfo, other_brick);
+    if (ret) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_BRICKPROC_ADD_BRICK_FAILED,
+               "Adding brick %s:%s to brick "
+               "process failed",
+               brickinfo->hostname, brickinfo->path);
+    }
+    return ret;
+
+out:
+    gf_log(this->name, GF_LOG_WARNING, "attach failed for %s", brickinfo->path);
+    return ret;
+}
+
+/*
+ * Look through srch_vol for a brick whose process brickinfo could be
+ * attached to.  (The name is abbreviated because it was getting too long.)
+ *
+ * conf      - glusterd configuration; supplies the big lock and is used
+ *             when building pidfile paths
+ * srch_vol  - volume whose bricks are examined
+ * comp_vol  - volume brickinfo belongs to when it differs from srch_vol, or
+ *             NULL to skip the volume-level compatibility checks
+ * brickinfo - the brick that is looking for a host process
+ *
+ * A candidate is skipped when it is brickinfo itself, when its uuid differs
+ * from brickinfo's, when it is neither STARTED nor STARTING, when no brick
+ * process is known for its port, or when that process already holds
+ * mux_limit bricks (a limit of 0 means "no limit").  For the remaining
+ * candidate the function waits (up to 15 rounds of 2 seconds, with the big
+ * lock released while sleeping) until the candidate has registered its port
+ * and its pidfile names a running service.
+ *
+ * Returns the first candidate that passes, or NULL.
+ */
+static glusterd_brickinfo_t *
+find_compat_brick_in_vol(glusterd_conf_t *conf,
+                         glusterd_volinfo_t *srch_vol, /* volume to search */
+                         glusterd_volinfo_t *comp_vol, /* volume to compare */
+                         glusterd_brickinfo_t *brickinfo)
+{
+    xlator_t *this = THIS;
+    glusterd_brickinfo_t *other_brick = NULL;
+    glusterd_brick_proc_t *brick_proc = NULL;
+    char pidfile2[PATH_MAX] = "";
+    int32_t pid2 = -1;
+    int16_t retries = 15;
+    int mux_limit = -1;
+    int ret = -1;
+    gf_boolean_t brick_status = _gf_false;
+    gf_boolean_t is_shared_storage = _gf_false;
+
+    /*
+     * If comp_vol is provided, we have to check *volume* compatibility
+     * before we can check *brick* compatibility.
+     */
+    if (comp_vol) {
+        /*
+         * We should not attach bricks of a normal volume to bricks
+         * of shared storage volume.
+         */
+        if (!strcmp(srch_vol->volname, GLUSTER_SHARED_STORAGE))
+            is_shared_storage = _gf_true;
+
+        if (!strcmp(comp_vol->volname, GLUSTER_SHARED_STORAGE)) {
+            if (!is_shared_storage)
+                return NULL;
+        } else if (is_shared_storage)
+            return NULL;
+
+        /*
+         * It's kind of a shame that we have to do this check in both
+         * directions, but an option might only exist on one of the two
+         * dictionaries and dict_foreach_match will only find that one.
+         */
+
+        gf_log(THIS->name, GF_LOG_DEBUG, "comparing options for %s and %s",
+               comp_vol->volname, srch_vol->volname);
+
+        if (dict_foreach_match(comp_vol->dict, unsafe_option, NULL,
+                               opts_mismatch, srch_vol->dict) < 0) {
+            gf_log(THIS->name, GF_LOG_DEBUG, "failure forward");
+            return NULL;
+        }
+
+        if (dict_foreach_match(srch_vol->dict, unsafe_option, NULL,
+                               opts_mismatch, comp_vol->dict) < 0) {
+            gf_log(THIS->name, GF_LOG_DEBUG, "failure backward");
+            return NULL;
+        }
+
+        gf_log(THIS->name, GF_LOG_DEBUG, "all options match");
+    }
+
+    ret = get_mux_limit_per_process(&mux_limit);
+    if (ret) {
+        gf_msg_debug(THIS->name, 0,
+                     "Retrieving brick mux "
+                     "limit failed. Returning NULL");
+        return NULL;
+    }
+
+    cds_list_for_each_entry(other_brick, &srch_vol->bricks, brick_list)
+    {
+        if (other_brick == brickinfo) {
+            continue;
+        }
+        if (gf_uuid_compare(brickinfo->uuid, other_brick->uuid)) {
+            continue;
+        }
+        if (other_brick->status != GF_BRICK_STARTED &&
+            other_brick->status != GF_BRICK_STARTING) {
+            continue;
+        }
+
+        ret = glusterd_brick_proc_for_port(other_brick->port, &brick_proc);
+        if (ret) {
+            gf_msg_debug(THIS->name, 0,
+                         "Couldn't get brick "
+                         "process corresponding to brick %s:%s",
+                         other_brick->hostname, other_brick->path);
+            continue;
+        }
+
+        if (mux_limit != 0) {
+            if (brick_proc->brick_count >= mux_limit)
+                continue;
+        } else {
+            /* This means that the "cluster.max-bricks-per-process"
+             * options hasn't yet been explicitly set. Continue
+             * as if there's no limit set
+             */
+            gf_msg(THIS->name, GF_LOG_WARNING, 0, GD_MSG_NO_MUX_LIMIT,
+                   "cluster.max-bricks-per-process options isn't "
+                   "set. Continuing with no limit set for "
+                   "brick multiplexing.");
+        }
+        /* The first brick process might take some time to finish its
+         * handshake with glusterd and prepare the graph. We can't
+         * afford to send attach_req for other bricks till that time.
+         * brick process sends PMAP_SIGNIN event after processing the
+         * volfile and hence it's safe to assume that if glusterd has
+         * received a pmap signin request for the same brick, we are
+         * good for subsequent attach requests.
+         */
+        retries = 15;
+        /*
+         * Forget the pidfile computed for a previous candidate.  Without
+         * this, a candidate that never registers its port would make the
+         * cleanup below unlink the *previous* candidate's pidfile again,
+         * possibly after that brick was restarted while we slept with
+         * the big lock released.
+         */
+        pidfile2[0] = '\0';
+        while (retries > 0) {
+            if (other_brick->port_registered) {
+                GLUSTERD_GET_BRICK_PIDFILE(pidfile2, srch_vol, other_brick,
+                                           conf);
+                if (sys_access(pidfile2, F_OK) == 0 &&
+                    gf_is_service_running(pidfile2, &pid2)) {
+                    gf_msg_debug(this->name, 0,
+                                 "brick %s is running as a pid %d ",
+                                 other_brick->path, pid2);
+                    brick_status = _gf_true;
+                    break;
+                }
+            }
+
+            /* Sleep without the big lock so other tasks can progress. */
+            synclock_unlock(&conf->big_lock);
+            gf_msg_debug(this->name, 0,
+                         "brick %s is still"
+                         " starting, waiting for 2 seconds ",
+                         other_brick->path);
+            synctask_sleep(2);
+            synclock_lock(&conf->big_lock);
+            retries--;
+        }
+
+        if (!brick_status) {
+            gf_log(this->name, GF_LOG_INFO,
+                   "brick has not come up so cleaning up dead brick %s:%s",
+                   other_brick->hostname, other_brick->path);
+            other_brick->status = GF_BRICK_STOPPED;
+            /* Only set when this candidate's pidfile path was built. */
+            if (pidfile2[0])
+                sys_unlink(pidfile2);
+            continue;
+        }
+        return other_brick;
+    }
+
+    return NULL;
+}
+
+/*
+ * Find a running brick that brickinfo can be multiplexed with.
+ *
+ * Returns NULL immediately when brick multiplexing is disabled.  Otherwise
+ * brickinfo's own volume is searched first and then, depending on whether
+ * that volume is a snapshot volume, either every other regular volume or
+ * every other snapshot volume.  On success the matching brick is returned
+ * and *other_vol_p is set to the volume that owns it; on failure NULL is
+ * returned and *other_vol_p is left untouched.
+ */
+static glusterd_brickinfo_t *
+find_compatible_brick(glusterd_conf_t *conf, glusterd_volinfo_t *volinfo,
+                      glusterd_brickinfo_t *brickinfo,
+                      glusterd_volinfo_t **other_vol_p)
+{
+    glusterd_brickinfo_t *match = NULL;
+    glusterd_volinfo_t *candidate_vol = NULL;
+    glusterd_snap_t *snap = NULL;
+
+    /* Nothing to look for unless multiplexing is switched on. */
+    if (!is_brick_mx_enabled())
+        return NULL;
+
+    /* Bricks of the same volume are the first choice. */
+    match = find_compat_brick_in_vol(conf, volinfo, NULL, brickinfo);
+    if (match) {
+        candidate_vol = volinfo;
+        goto found;
+    }
+
+    /*
+     * Regular and snapshot volumes are never mixed.  Changes to a
+     * volume's transport options are not propagated to its snapshots,
+     * so such a change could break compatibility between the two, and
+     * there is no way to "evict" a brick from the process it already
+     * lives in.  Keeping them apart from the start avoids the problem.
+     * Snapshot bricks may still share a process with one another, even
+     * across volumes, because the only thing likely to differ is their
+     * auth options and those play no part in compatibility.
+     *
+     * That same immutability guarantees that snapshot bricks which were
+     * compatible once stay compatible.  Two non-snapshot volumes have no
+     * such guarantee: a later option change on one may require the
+     * bricks to be separated, which is not yet done.
+     *
+     * TBD: address the option-change issue for non-snapshot bricks
+     */
+    if (volinfo->is_snap_volume) {
+        cds_list_for_each_entry(snap, &conf->snapshots, snap_list)
+        {
+            cds_list_for_each_entry(candidate_vol, &snap->volumes, vol_list)
+            {
+                if (candidate_vol == volinfo)
+                    continue;
+                match = find_compat_brick_in_vol(conf, candidate_vol, volinfo,
+                                                 brickinfo);
+                if (match)
+                    goto found;
+            }
+        }
+    } else {
+        cds_list_for_each_entry(candidate_vol, &conf->volumes, vol_list)
+        {
+            if (candidate_vol == volinfo)
+                continue;
+            match = find_compat_brick_in_vol(conf, candidate_vol, volinfo,
+                                             brickinfo);
+            if (match)
+                goto found;
+        }
+    }
+
+    return NULL;
+
+found:
+    *other_vol_p = candidate_vol;
+    return match;
+}
+
+/*
+ * Fill sockpath with the unix socket path of the brick process running as
+ * pid.
+ *
+ * The process command line is read (sysctl on FreeBSD, /proc/<pid>/cmdline
+ * elsewhere) and flattened into a single space-separated string.  The call
+ * succeeds only if that string contains "glusterfs", a "-S " option and a
+ * later "--brick-name"; the socket path is the text from the first '/'
+ * after "-S " up to "--brick-name", with spaces removed.
+ *
+ * pid      - process id to inspect
+ * sockpath - output buffer; NUL-terminated on success
+ * len      - size of sockpath in bytes
+ *
+ * Returns 0 on success.  Returns -1 if the process cannot be inspected, the
+ * command line does not have the expected shape, or the path (including its
+ * terminating NUL) does not fit in len bytes.
+ */
+
+int
+glusterd_get_sock_from_brick_pid(int pid, char *sockpath, size_t len)
+{
+    char buf[1024] = "";
+    char cmdline[2048] = "";
+    xlator_t *this = NULL;
+    size_t i = 0, j = 0;
+    char *ptr = NULL;
+    char *brptr = NULL;
+    char tmpsockpath[PATH_MAX] = "";
+    size_t blen = 0;
+    int ret = -1;
+
+    this = THIS;
+    GF_ASSERT(this);
+
+#ifdef __FreeBSD__
+    blen = sizeof(buf);
+    int mib[4];
+
+    mib[0] = CTL_KERN;
+    mib[1] = KERN_PROC;
+    mib[2] = KERN_PROC_ARGS;
+    mib[3] = pid;
+
+    if (sys_sysctl(mib, 4, buf, &blen, NULL, blen) != 0) {
+        gf_log(this->name, GF_LOG_ERROR, "brick process %d is not running",
+               pid);
+        return ret;
+    }
+#else
+    char fname[128] = "";
+    int fd = -1;
+    ssize_t nread = -1;
+
+    snprintf(fname, sizeof(fname), "/proc/%d/cmdline", pid);
+
+    if (sys_access(fname, R_OK) != 0) {
+        gf_log(this->name, GF_LOG_ERROR, "brick process %d is not running",
+               pid);
+        return ret;
+    }
+
+    fd = open(fname, O_RDONLY);
+    if (fd == -1) {
+        gf_log(this->name, GF_LOG_ERROR, "open failed %s to open a file %s",
+               strerror(errno), fname);
+        return ret;
+    }
+
+    nread = sys_read(fd, buf, sizeof(buf));
+    if (nread < 0) {
+        /* A negative count must never be used as the buffer length. */
+        gf_log(this->name, GF_LOG_ERROR, "read failed %s for file %s",
+               strerror(errno), fname);
+        sys_close(fd);
+        return ret;
+    }
+    sys_close(fd);
+    blen = (size_t)nread;
+#endif
+
+    /*
+     * Convert cmdline to a single string.  Each input byte produces at
+     * most two output bytes, so stop while there is still room for two
+     * bytes plus the terminating NUL.
+     */
+    for (i = 0, j = 0; i < blen && j < sizeof(cmdline) - 2; i++) {
+        if (buf[i] == '\0')
+            cmdline[j++] = ' ';
+        else if (buf[i] < 32 || buf[i] > 126) /* remove control char */
+            continue;
+        else if (buf[i] == '"' || buf[i] == '\\') {
+            cmdline[j++] = '\\';
+            cmdline[j++] = buf[i];
+        } else {
+            cmdline[j++] = buf[i];
+        }
+    }
+    cmdline[j] = '\0';
+
+    if (!strstr(cmdline, "glusterfs"))
+        return ret;
+
+    ptr = strstr(cmdline, "-S ");
+    if (!ptr)
+        return ret;
+    ptr = strchr(ptr, '/');
+    if (!ptr)
+        return ret;
+    brptr = strstr(ptr, "--brick-name");
+    if (!brptr)
+        return ret;
+
+    /* Copy the path, dropping spaces, without overrunning tmpsockpath. */
+    i = 0;
+    while (ptr < brptr && i < sizeof(tmpsockpath) - 1) {
+        if (*ptr != ' ')
+            tmpsockpath[i++] = *ptr;
+        ptr++;
+    }
+
+    if (tmpsockpath[0]) {
+        /* Need room for i characters plus the terminating NUL. */
+        if (i >= len)
+            return ret;
+        strncpy(sockpath, tmpsockpath, i);
+        sockpath[i] = '\0';
+        ret = 0;
+    }
+
+    return ret;
+}
+
+/*
+ * Check whether process brick_pid has brickpath open.
+ *
+ * On FreeBSD the process' open files are enumerated through libprocstat;
+ * elsewhere every entry of /proc/<pid>/fd is resolved with readlinkat().
+ * In both cases the resolved path is compared with brickpath using strcmp().
+ *
+ * Returns a gf_strdup()'d copy of the matching path, which the caller must
+ * release, or NULL when brickpath is NULL, the process cannot be inspected,
+ * or no open file matches.
+ */
+char *
+search_brick_path_from_proc(pid_t brick_pid, char *brickpath)
+{
+    char *brick_path = NULL;
+#ifdef __FreeBSD__
+    /* All start as NULL so the cleanup at "out" is safe from any jump. */
+    struct filestat *fst = NULL;
+    struct procstat *ps = NULL;
+    struct kinfo_proc *kp = NULL;
+    struct filestat_list *head = NULL;
+
+    if (!brickpath)
+        goto out;
+
+    ps = procstat_open_sysctl();
+    if (ps == NULL)
+        goto out;
+
+    kp = kinfo_getproc(brick_pid);
+    if (kp == NULL)
+        goto out;
+
+    head = procstat_getfiles(ps, (void *)kp, 0);
+    if (head == NULL)
+        goto out;
+
+    STAILQ_FOREACH(fst, head, next)
+    {
+        if (fst->fs_fd < 0)
+            continue;
+
+        if (!strcmp(fst->fs_path, brickpath)) {
+            brick_path = gf_strdup(fst->fs_path);
+            break;
+        }
+    }
+
+out:
+    if (head != NULL)
+        procstat_freefiles(ps, head);
+    if (kp != NULL)
+        free(kp);
+    if (ps != NULL)
+        procstat_close(ps);
+#else
+    struct dirent *dp = NULL;
+    DIR *dirp = NULL;
+    int dir_len = 0;
+    ssize_t link_len = 0;
+    int fd = -1;
+    char path[PATH_MAX] = "";
+    struct dirent scratch[2] = {
+        {
+            0,
+        },
+    };
+
+    if (!brickpath)
+        goto out;
+
+    dir_len = snprintf(path, sizeof(path), "/proc/%d/fd/", brick_pid);
+    if (dir_len < 0 || (size_t)dir_len >= (sizeof(path) - 2))
+        goto out;
+
+    dirp = sys_opendir(path);
+    if (!dirp)
+        goto out;
+
+    fd = dirfd(dirp);
+    if (fd < 0)
+        goto out;
+
+    while ((dp = sys_readdir(dirp, scratch))) {
+        if (!strcmp(dp->d_name, ".") || !strcmp(dp->d_name, ".."))
+            continue;
+
+        /*
+         * Skip non-numeric entries.  strtol() also yields 0 for the
+         * entry named "0", so descriptor 0 is never examined.
+         */
+        if (!strtol(dp->d_name, (char **)NULL, 10))
+            continue;
+
+        /*
+         * readlinkat() returns -1 on error; keep the result signed so
+         * that a failure is skipped instead of being used as an index.
+         */
+        link_len = readlinkat(fd, dp->d_name, path, sizeof(path) - 1);
+        if (link_len <= 1)
+            continue;
+
+        /* readlinkat() does not NUL-terminate the result. */
+        path[link_len] = '\0';
+        if (!strcmp(path, brickpath)) {
+            brick_path = gf_strdup(path);
+            break;
+        }
+    }
+out:
+    if (dirp)
+        sys_closedir(dirp);
+#endif
+    return brick_path;
+}
+
+/*
+ * Start brickinfo of volinfo, or connect to it if it is already running.
+ *
+ * volinfo      - volume the brick belongs to
+ * brickinfo    - brick to start
+ * wait         - passed through to glusterd_volume_start_glusterfs()
+ * only_connect - when true, only connect to an already-running brick and
+ *                never spawn or attach one
+ *
+ * Outline:
+ *   1. Resolve the brick if its uuid is unset; bricks whose uuid is not
+ *      MY_UUID are ignored (returns 0).
+ *   2. Do nothing if a start is already in flight for this brick.
+ *   3. Verify that the brick root carries the volume's volume-id xattr.
+ *   4. If the pidfile names a running service, connect to it (after extra
+ *      sanity checks when multiplexing is enabled) and return.
+ *   5. Otherwise try to attach to a compatible running brick process, and
+ *      fall back to spawning a fresh brick process.
+ *
+ * Returns 0 on success or when there is nothing to do, non-zero on failure.
+ * On failure brickinfo->start_triggered is cleared so a later call may
+ * retry.
+ */
+int
+glusterd_brick_start(glusterd_volinfo_t *volinfo,
+                     glusterd_brickinfo_t *brickinfo, gf_boolean_t wait,
+                     gf_boolean_t only_connect)
+{
+    int ret = -1;
+    xlator_t *this = NULL;
+    glusterd_brickinfo_t *other_brick;
+    glusterd_conf_t *conf = NULL;
+    int32_t pid = -1;
+    char pidfile[PATH_MAX] = "";
+    char socketpath[PATH_MAX] = "";
+    char *brickpath = NULL;
+    glusterd_volinfo_t *other_vol;
+    gf_boolean_t is_service_running = _gf_false;
+    uuid_t volid = {
+        0,
+    };
+    ssize_t size = -1;
+
+    this = THIS;
+    GF_ASSERT(this);
+    conf = this->private;
+
+    if ((!brickinfo) || (!volinfo)) {
+        gf_smsg(this->name, GF_LOG_ERROR, errno, GD_MSG_INVALID_ARGUMENT, NULL);
+        goto out;
+    }
+
+    if (gf_uuid_is_null(brickinfo->uuid)) {
+        ret = glusterd_resolve_brick(brickinfo);
+        if (ret) {
+            gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_RESOLVE_BRICK_FAIL,
+                   FMTSTR_RESOLVE_BRICK, brickinfo->hostname, brickinfo->path);
+            gf_event(EVENT_BRICKPATH_RESOLVE_FAILED,
+                     "peer=%s;volume=%s;brick=%s", brickinfo->hostname,
+                     volinfo->volname, brickinfo->path);
+            goto out;
+        }
+    }
+
+    /* Bricks whose uuid is not ours are not started from here. */
+    if (gf_uuid_compare(brickinfo->uuid, MY_UUID)) {
+        ret = 0;
+        goto out;
+    }
+
+    /* If a trigger to start the brick is already initiated then no need for
+     * a reattempt as it's an overkill. With glusterd_brick_start ()
+     * function being used in multiple places, when glusterd restarts we see
+     * three different triggers for an attempt to start the brick process
+     * due to the quorum handling code in glusterd_friend_sm.
+     */
+    if (brickinfo->status == GF_BRICK_STARTING || brickinfo->start_triggered ||
+        GF_ATOMIC_GET(volinfo->volpeerupdate)) {
+        gf_msg_debug(this->name, 0,
+                     "brick %s is already in starting "
+                     "phase",
+                     brickinfo->path);
+        ret = 0;
+        goto out;
+    }
+    if (!only_connect)
+        brickinfo->start_triggered = _gf_true;
+
+    GLUSTERD_GET_BRICK_PIDFILE(pidfile, volinfo, brickinfo, conf);
+
+    /* Compare volume-id xattr is helpful to ensure the existence of a
+       brick_root path before the start/attach a brick
+    */
+    size = sys_lgetxattr(brickinfo->path, GF_XATTR_VOL_ID_KEY, volid, 16);
+    if (size != 16) {
+        gf_log(this->name, GF_LOG_ERROR,
+               "Missing %s extended attribute on brick root (%s),"
+               " brick is deemed not to be a part of the volume (%s) ",
+               GF_XATTR_VOL_ID_KEY, brickinfo->path, volinfo->volname);
+        goto out;
+    }
+
+    /*
+     * Compare the raw ids rather than two uuid_utoa() strings.
+     * NOTE(review): uuid_utoa() is believed to format into a single
+     * scratch buffer; if so, two calls in one expression return the same
+     * pointer and a string comparison could never report a mismatch.
+     * Confirm against libglusterfs.  Comparing the binary ids is correct
+     * either way.
+     */
+    if (gf_uuid_compare(volinfo->volume_id, volid)) {
+        gf_log(this->name, GF_LOG_ERROR,
+               "Mismatching %s extended attribute on brick root (%s),"
+               " brick is deemed not to be a part of the volume (%s)",
+               GF_XATTR_VOL_ID_KEY, brickinfo->path, volinfo->volname);
+        goto out;
+    }
+    is_service_running = gf_is_service_running(pidfile, &pid);
+    if (is_service_running) {
+        if (is_brick_mx_enabled()) {
+            /*
+             * With multiplexing, a live pid is not enough: the process
+             * must actually have this brick path open and must expose
+             * a socket path on its command line.
+             */
+            brickpath = search_brick_path_from_proc(pid, brickinfo->path);
+            if (!brickpath) {
+                if (only_connect)
+                    return 0;
+                gf_log(this->name, GF_LOG_INFO,
+                       "Either pid %d is not running or brick"
+                       " path %s is not consumed so cleanup pidfile",
+                       pid, brickinfo->path);
+                /* brick isn't running,so unlink stale pidfile
+                 * if any.
+                 */
+                if (sys_access(pidfile, R_OK) == 0) {
+                    sys_unlink(pidfile);
+                }
+                goto run;
+            }
+            GF_FREE(brickpath);
+            ret = glusterd_get_sock_from_brick_pid(pid, socketpath,
+                                                   sizeof(socketpath));
+            if (ret) {
+                if (only_connect)
+                    return 0;
+                gf_log(this->name, GF_LOG_INFO,
+                       "Either pid %d is not running or does "
+                       "not match with any running brick "
+                       "processes",
+                       pid);
+                /* Fetch unix socket is failed so unlink pidfile */
+                if (sys_access(pidfile, R_OK) == 0) {
+                    sys_unlink(pidfile);
+                }
+                goto run;
+            }
+        }
+        if (brickinfo->status != GF_BRICK_STARTING &&
+            brickinfo->status != GF_BRICK_STARTED) {
+            gf_log(this->name, GF_LOG_INFO,
+                   "discovered already-running brick %s", brickinfo->path);
+            (void)pmap_registry_bind(this, brickinfo->port, brickinfo->path,
+                                     GF_PMAP_PORT_BRICKSERVER, NULL);
+            brickinfo->port_registered = _gf_true;
+            /*
+             * This will unfortunately result in a separate RPC
+             * connection per brick, even though they're all in
+             * the same process.  It works, but it would be nicer
+             * if we could find a pre-existing connection to that
+             * same port (on another brick) and re-use that.
+             * TBD: re-use RPC connection across bricks
+             */
+            if (!is_brick_mx_enabled()) {
+                glusterd_set_brick_socket_filepath(
+                    volinfo, brickinfo, socketpath, sizeof(socketpath));
+            }
+            gf_log(this->name, GF_LOG_DEBUG,
+                   "Using %s as sockfile for brick %s of volume %s ",
+                   socketpath, brickinfo->path, volinfo->volname);
+
+            (void)glusterd_brick_connect(volinfo, brickinfo, socketpath);
+
+            ret = glusterd_brick_process_add_brick(brickinfo, NULL);
+            if (ret) {
+                gf_msg(this->name, GF_LOG_ERROR, 0,
+                       GD_MSG_BRICKPROC_ADD_BRICK_FAILED,
+                       "Adding brick %s:%s to brick process "
+                       "failed.",
+                       brickinfo->hostname, brickinfo->path);
+                goto out;
+            }
+            /* We need to set the status back to STARTING so that
+             * while the other (re)start brick requests come in for
+             * other bricks, this brick can be considered as
+             * compatible.
+             */
+            brickinfo->status = GF_BRICK_STARTING;
+        }
+        return 0;
+    }
+    if (only_connect)
+        return 0;
+
+run:
+    ret = _mk_rundir_p(volinfo);
+    if (ret)
+        goto out;
+
+    /* other_vol is only written when a compatible brick is returned. */
+    other_brick = find_compatible_brick(conf, volinfo, brickinfo, &other_vol);
+    if (other_brick) {
+        /* mark the brick to starting as send_attach_req might take few
+         * iterations to successfully attach the brick and we don't want
+         * to get into a state where another needless trigger to start
+         * the brick is processed
+         */
+        brickinfo->status = GF_BRICK_STARTING;
+        ret = attach_brick(this, brickinfo, other_brick, volinfo, other_vol);
+        if (ret == 0) {
+            goto out;
+        }
+        /* Attach_brick is failed so unlink pidfile */
+        if (sys_access(pidfile, R_OK) == 0) {
+            sys_unlink(pidfile);
+        }
+    }
+
+    /*
+     * This hack is necessary because our brick-process management is a
+     * total nightmare.  We expect a brick process's socket and pid files
+     * to be ready *immediately* after we start it.  Ditto for it calling
+     * back to bind its port.  Unfortunately, none of that is realistic.
+     * Any process takes non-zero time to start up.  This has *always* been
+     * racy and unsafe; it just became more visible with multiplexing.
+     *
+     * The right fix would be to do all of this setup *in the parent*,
+     * which would include (among other things) getting the PID back from
+     * the "runner" code.  That's all prohibitively difficult and risky.
+     * To work around the more immediate problems, we create a stub pidfile
+     * here to let gf_is_service_running know that we expect the process to
+     * be there shortly, and then it gets filled in with a real PID when
+     * the process does finish starting up.
+     *
+     * TBD: pray for GlusterD 2 to be ready soon.
+     */
+    gf_log(this->name, GF_LOG_INFO,
+           "starting a fresh brick process for "
+           "brick %s",
+           brickinfo->path);
+    ret = glusterd_volume_start_glusterfs(volinfo, brickinfo, wait);
+    if (ret) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_BRICK_DISCONNECTED,
+               "Unable to start brick %s:%s", brickinfo->hostname,
+               brickinfo->path);
+        gf_event(EVENT_BRICK_START_FAILED, "peer=%s;volume=%s;brick=%s",
+                 brickinfo->hostname, volinfo->volname, brickinfo->path);
+        goto out;
+    }
+
+out:
+    /* Allow a later trigger to retry after any failure. */
+    if (ret && brickinfo) {
+        brickinfo->start_triggered = _gf_false;
+    }
+    gf_msg_debug(this->name, 0, "returning %d ", ret);
+    return ret;
+}
+
+/*
+ * Start the bricks of every started volume and snapshot volume for which
+ * server quorum permits it, and run the services manager if at least one
+ * volume qualified.
+ *
+ * @opaque: not referenced by this function.
+ *
+ * Returns 0 on success, otherwise the non-zero result of
+ * glusterd_get_quorum_cluster_counts() or glusterd_store_volinfo().
+ *
+ * NOTE(review): if 'this' or its private conf fails validation, control
+ * jumps to return_block with ret still at its initializer (0), assuming
+ * GF_VALIDATE_OR_GOTO does not modify ret -- confirm.
+ * NOTE(review): conf->big_lock is taken here but never released inside this
+ * function; presumably the caller or a completion callback unlocks it --
+ * confirm.
+ */
+int
+glusterd_restart_bricks(void *opaque)
+{
+    int ret = 0;
+    glusterd_volinfo_t *volinfo = NULL;
+    glusterd_brickinfo_t *brickinfo = NULL;
+    glusterd_snap_t *snap = NULL;
+    gf_boolean_t start_svcs = _gf_false;
+    xlator_t *this = NULL;
+    glusterd_conf_t *conf = NULL;
+    int active_count = 0;
+    int quorum_count = 0;
+    gf_boolean_t node_quorum = _gf_false;
+
+    this = THIS;
+    GF_VALIDATE_OR_GOTO("glusterd", this, return_block);
+
+    conf = this->private;
+    GF_VALIDATE_OR_GOTO(this->name, conf, return_block);
+
+    synclock_lock(&conf->big_lock);
+
+    /* We need to ensure that restarting the bricks during glusterd restart
+     * shouldn't race with the import volume thread (refer
+     * glusterd_compare_friend_data ())
+     */
+    while (conf->restart_bricks) {
+        synccond_wait(&conf->cond_restart_bricks, &conf->big_lock);
+    }
+    conf->restart_bricks = _gf_true;
+
+    /* Register as a blocker; the matching decrement under 'out' wakes
+     * cond_blockers waiters once the counter drops to zero.
+     */
+    GF_ATOMIC_INC(conf->blockers);
+    ret = glusterd_get_quorum_cluster_counts(this, &active_count,
+                                             &quorum_count);
+    if (ret)
+        goto out;
+
+    /* Node-level quorum is computed once and reused for every volume. */
+    if (does_quorum_meet(active_count, quorum_count))
+        node_quorum = _gf_true;
+
+    /* Pass 1: regular volumes. */
+    cds_list_for_each_entry(volinfo, &conf->volumes, vol_list)
+    {
+        if (volinfo->status != GLUSTERD_STATUS_STARTED) {
+            continue;
+        }
+        gf_msg_debug(this->name, 0, "starting the volume %s", volinfo->volname);
+
+        /* Check the quorum, if quorum is not met, don't start the
+           bricks. Stop bricks in case they are running.
+        */
+        ret = check_quorum_for_brick_start(volinfo, node_quorum);
+        if (ret == 0) {
+            gf_msg(this->name, GF_LOG_INFO, 0, GD_MSG_SERVER_QUORUM_NOT_MET,
+                   "Skipping brick "
+                   "restart for volume %s as quorum is not met",
+                   volinfo->volname);
+            (void)glusterd_stop_bricks(volinfo);
+            continue;
+        } else if (ret == 2 && conf->restart_done == _gf_true) {
+            /* If glusterd has been restarted and quorum is not
+             * applicable then do not restart the bricks as this
+             * might start bricks brought down purposely, say for
+             * maintenance
+             */
+            continue;
+        } else {
+            start_svcs = _gf_true;
+            cds_list_for_each_entry(brickinfo, &volinfo->bricks, brick_list)
+            {
+                /* Only bricks whose start has not been triggered yet are
+                 * started here; the start runs under the brick's
+                 * restart_mutex.
+                 */
+                if (!brickinfo->start_triggered) {
+                    pthread_mutex_lock(&brickinfo->restart_mutex);
+                    {
+                        glusterd_brick_start(volinfo, brickinfo, _gf_false,
+                                             _gf_false);
+                    }
+                    pthread_mutex_unlock(&brickinfo->restart_mutex);
+                }
+            }
+            /* Persist the volinfo; a store failure aborts the whole run. */
+            ret = glusterd_store_volinfo(volinfo, GLUSTERD_VOLINFO_VER_AC_NONE);
+            if (ret) {
+                gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_VOLINFO_STORE_FAIL,
+                       "Failed to "
+                       "write volinfo for volume %s",
+                       volinfo->volname);
+                goto out;
+            }
+        }
+    }
+
+    /* Pass 2: volumes that belong to snapshots. Unlike pass 1, bricks are
+     * not stopped when quorum is not met and there is no special case for
+     * a check_quorum_for_brick_start() result of 2.
+     */
+    cds_list_for_each_entry(snap, &conf->snapshots, snap_list)
+    {
+        cds_list_for_each_entry(volinfo, &snap->volumes, vol_list)
+        {
+            if (volinfo->status != GLUSTERD_STATUS_STARTED)
+                continue;
+            /* Check the quorum, if quorum is not met, don't start
+             * the bricks
+             */
+            ret = check_quorum_for_brick_start(volinfo, node_quorum);
+            if (ret == 0) {
+                gf_msg(this->name, GF_LOG_INFO, 0, GD_MSG_SERVER_QUORUM_NOT_MET,
+                       "Skipping"
+                       " brick restart for volume %s as "
+                       "quorum is not met",
+                       volinfo->volname);
+                continue;
+            }
+            start_svcs = _gf_true;
+            gf_msg_debug(this->name, 0,
+                         "starting the snap "
+                         "volume %s",
+                         volinfo->volname);
+            cds_list_for_each_entry(brickinfo, &volinfo->bricks, brick_list)
+            {
+                if (!brickinfo->start_triggered) {
+                    pthread_mutex_lock(&brickinfo->restart_mutex);
+                    {
+                        /* coverity[SLEEP] */
+                        glusterd_brick_start(volinfo, brickinfo, _gf_false,
+                                             _gf_false);
+                    }
+                    pthread_mutex_unlock(&brickinfo->restart_mutex);
+                }
+            }
+            ret = glusterd_store_volinfo(volinfo, GLUSTERD_VOLINFO_VER_AC_NONE);
+            if (ret) {
+                gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_VOLINFO_STORE_FAIL,
+                       "Failed to "
+                       "write volinfo for volume %s",
+                       volinfo->volname);
+                goto out;
+            }
+        }
+    }
+    /* Services are only (re)started when at least one volume qualified. */
+    if (start_svcs == _gf_true) {
+        glusterd_svcs_manager(NULL);
+    }
+
+    ret = 0;
+
+out:
+    /* Reached on success and on failure alike: record that a restart pass
+     * has happened, clear the in-progress flag and wake up anyone waiting
+     * on either condition.
+     */
+    conf->restart_done = _gf_true;
+    conf->restart_bricks = _gf_false;
+    if (GF_ATOMIC_DEC(conf->blockers) == 0) {
+        synccond_broadcast(&conf->cond_blockers);
+    }
+    synccond_broadcast(&conf->cond_restart_bricks);
+
+return_block:
+    return ret;
+}
+
+/*
+ * dict_foreach() callback used by glusterd_volume_restart_gsyncds(): decide
+ * from the last recorded status whether the geo-replication session
+ * described by one gsync_slaves entry should be started again, and start it.
+ *
+ * @this:  dict being walked (not used).
+ * @key:   key of the current entry (not used).
+ * @value: entry data; the slave specification is taken to be everything
+ *         after the first ':' and entries without a ':' are skipped.
+ * @data:  glusterd_volinfo_t of the master volume.
+ *
+ * Returns 0 when the entry is skipped for lack of a ':' or when gsyncd
+ * validation / pid-file lookup declines the restart, -1 when the slave
+ * details cannot be fetched or the config path does not fit its buffer, and
+ * otherwise the result of the last helper that ran.
+ * NOTE(review): some "not restarting" exits leave ret at the positive value
+ * returned by glusterd_gsync_read_frm_status(); presumably dict_foreach()
+ * only treats negative values as failure -- confirm.
+ */
+int
+_local_gsyncd_start(dict_t *this, char *key, data_t *value, void *data)
+{
+    char *path_list = NULL;
+    char *slave = NULL;
+    char *slave_url = NULL;
+    char *slave_vol = NULL;
+    char *slave_host = NULL;
+    char *statefile = NULL;
+    char buf[1024] = "faulty";
+    int ret = 0;
+    int op_ret = 0;
+    int ret_status = 0;
+    char uuid_str[64] = "";
+    glusterd_volinfo_t *volinfo = NULL;
+    char confpath[PATH_MAX] = "";
+    char *op_errstr = NULL;
+    glusterd_conf_t *priv = NULL;
+    gf_boolean_t is_template_in_use = _gf_false;
+    gf_boolean_t is_paused = _gf_false;
+    char key1[1024] = "";
+    xlator_t *this1 = NULL;
+
+    this1 = THIS;
+    GF_ASSERT(this1);
+    priv = this1->private;
+    GF_ASSERT(priv);
+    GF_ASSERT(data);
+
+    volinfo = data;
+    slave = strchr(value->data, ':');
+    if (slave)
+        slave++;
+    else
+        return 0;
+
+    /* Keeps (at most 63 bytes of) the whole entry value, not only the part
+     * before the ':'.
+     */
+    (void)snprintf(uuid_str, sizeof(uuid_str), "%s", (char *)value->data);
+
+    /* Getting Local Brickpaths. The result is deliberately not checked (the
+     * original code overwrote it immediately); path_list simply stays NULL
+     * if nothing was produced.
+     */
+    (void)glusterd_get_local_brickpaths(volinfo, &path_list);
+
+    /*Generating the conf file path needed by gsyncd */
+    ret = glusterd_get_slave_info(slave, &slave_url, &slave_host, &slave_vol,
+                                  &op_errstr);
+    if (ret) {
+        gf_msg(this1->name, GF_LOG_ERROR, 0, GD_MSG_SLAVEINFO_FETCH_ERROR,
+               "Unable to fetch slave details.");
+        ret = -1;
+        goto out;
+    }
+
+    /* snprintf() NUL-terminates by itself and returns the length the full
+     * string would have had, so its result must never be used as an index
+     * into confpath: on truncation (or error) that index lies outside the
+     * buffer. Treat both cases as a failure instead.
+     */
+    ret = snprintf(confpath, sizeof(confpath),
+                   "%s/" GEOREP "/%s_%s_%s/gsyncd.conf", priv->workdir,
+                   volinfo->volname, slave_host, slave_vol);
+    if ((ret < 0) || (ret >= (int)sizeof(confpath))) {
+        gf_log(this1->name, GF_LOG_ERROR,
+               "gsyncd.conf path for volume %s does not fit in the buffer",
+               volinfo->volname);
+        ret = -1;
+        goto out;
+    }
+
+    /* Fetching the last status of the node */
+    ret = glusterd_get_statefile_name(volinfo, slave, confpath, &statefile,
+                                      &is_template_in_use);
+    if (ret) {
+        if (!strstr(slave, "::"))
+            gf_msg(this1->name, GF_LOG_INFO, 0, GD_MSG_SLAVE_URL_INVALID,
+                   "%s is not a valid slave url.", slave);
+        else
+            gf_msg(this1->name, GF_LOG_INFO, 0,
+                   GD_MSG_GET_STATEFILE_NAME_FAILED,
+                   "Unable to get"
+                   " statefile's name");
+        goto out;
+    }
+
+    /* If state-file entry is missing from the config file,
+     * do not start gsyncd on restart */
+    if (is_template_in_use) {
+        gf_msg(this1->name, GF_LOG_INFO, 0, GD_MSG_NO_STATEFILE_ENTRY,
+               "state-file entry is missing in config file."
+               "Not Restarting");
+        goto out;
+    }
+
+    is_template_in_use = _gf_false;
+
+    ret = gsync_status(volinfo->volname, slave, confpath, &ret_status,
+                       &is_template_in_use);
+    if (ret == -1) {
+        gf_msg(this1->name, GF_LOG_INFO, 0, GD_MSG_GSYNC_VALIDATION_FAIL,
+               GEOREP " start option validation failed ");
+        ret = 0;
+        goto out;
+    }
+
+    if (is_template_in_use == _gf_true) {
+        gf_msg(this1->name, GF_LOG_INFO, 0, GD_MSG_PIDFILE_NOT_FOUND,
+               "pid-file entry is missing in config file."
+               "Not Restarting");
+        ret = 0;
+        goto out;
+    }
+
+    ret = glusterd_gsync_read_frm_status(statefile, buf, sizeof(buf));
+    if (ret <= 0) {
+        gf_msg(this1->name, GF_LOG_ERROR, 0, GD_MSG_STAT_FILE_READ_FAILED,
+               "Unable to read the status");
+        goto out;
+    }
+
+    /* Form key1 which is "<user@><slave_host>::<slavevol>" */
+    snprintf(key1, sizeof(key1), "%s::%s", slave_url, slave_vol);
+
+    /* Looks for the last status, to find if the session was running
+     * when the node went down. If the session was just created or
+     * stopped, do not restart the geo-rep session */
+    if ((!strcmp(buf, "Created")) || (!strcmp(buf, "Stopped"))) {
+        gf_msg(this1->name, GF_LOG_INFO, 0, GD_MSG_GEO_REP_START_FAILED,
+               "Geo-Rep Session was not started between "
+               "%s and %s::%s. Not Restarting",
+               volinfo->volname, slave_url, slave_vol);
+        goto out;
+    } else if (strstr(buf, "Paused")) {
+        is_paused = _gf_true;
+    } else if ((!strcmp(buf, "Config Corrupted"))) {
+        gf_msg(this1->name, GF_LOG_INFO, 0, GD_MSG_RECOVERING_CORRUPT_CONF,
+               "Recovering from a corrupted config. "
+               "Not Restarting. Use start (force) to "
+               "start the session between %s and %s::%s.",
+               volinfo->volname, slave_url, slave_vol);
+        goto out;
+    }
+
+    if (is_paused) {
+        /* Paused sessions are started with the last argument set to true
+         * and are not recorded in gsync_active_slaves; the result of the
+         * start is not inspected.
+         */
+        glusterd_start_gsync(volinfo, slave, path_list, confpath, uuid_str,
+                             NULL, _gf_true);
+    } else {
+        /* Add slave to the dict indicating geo-rep session is running*/
+        ret = dict_set_dynstr_with_alloc(volinfo->gsync_active_slaves, key1,
+                                         "running");
+        if (ret) {
+            gf_msg(this1->name, GF_LOG_ERROR, 0, GD_MSG_DICT_SET_FAILED,
+                   "Unable to set key:%s"
+                   " value:running in the dict",
+                   key1);
+            goto out;
+        }
+        ret = glusterd_start_gsync(volinfo, slave, path_list, confpath,
+                                   uuid_str, NULL, _gf_false);
+        /* Roll the bookkeeping back if the session could not be started. */
+        if (ret)
+            dict_del(volinfo->gsync_active_slaves, key1);
+    }
+
+out:
+    if (statefile)
+        GF_FREE(statefile);
+    if (slave_url)
+        GF_FREE(slave_url);
+
+    /* A config that fell back to the template is flagged on disk so that a
+     * later run sees "Config Corrupted" as the last status.
+     */
+    if (is_template_in_use) {
+        op_ret = glusterd_create_status_file(
+            volinfo->volname, slave, slave_host, slave_vol, "Config Corrupted");
+        if (op_ret) {
+            gf_msg(this1->name, GF_LOG_ERROR, 0,
+                   GD_MSG_STATUSFILE_CREATE_FAILED,
+                   "Unable to create status file"
+                   ". Error : %s",
+                   strerror(errno));
+            ret = op_ret;
+        }
+    }
+    /* NOTE(review): slave_host is never released here; if
+     * glusterd_get_slave_info() hands back an allocated string this leaks
+     * it -- confirm ownership before adding a GF_FREE().
+     */
+    if (slave_vol)
+        GF_FREE(slave_vol);
+    GF_FREE(path_list);
+    GF_FREE(op_errstr);
+
+    return ret;
+}
+
+/*
+ * Run _local_gsyncd_start() over every entry of volinfo->gsync_slaves.
+ * The outcome of the walk is not propagated: the function always returns 0.
+ */
+int
+glusterd_volume_restart_gsyncds(glusterd_volinfo_t *volinfo)
+{
+    GF_ASSERT(volinfo);
+
+    /* Best effort: per-session failures are handled inside the callback. */
+    (void)dict_foreach(volinfo->gsync_slaves, _local_gsyncd_start, volinfo);
+
+    return 0;
+}
+
+/*
+ * Call glusterd_volume_restart_gsyncds() for every volume on conf->volumes.
+ * Always returns 0.
+ */
+int
+glusterd_restart_gsyncds(glusterd_conf_t *conf)
+{
+    glusterd_volinfo_t *vol = NULL;
+
+    cds_list_for_each_entry(vol, &conf->volumes, vol_list)
+    {
+        (void)glusterd_volume_restart_gsyncds(vol);
+    }
+
+    return 0;
+}
+
+/*
+ * Multiply the replica and stripe counts, treating a count of 0 as 1.
+ */
+int
+glusterd_calc_dist_leaf_count(int rcount, int scount)
+{
+    int replicas = rcount;
+    int stripes = scount;
+
+    if (replicas == 0)
+        replicas = 1;
+    if (stripes == 0)
+        stripes = 1;
+
+    return replicas * stripes;
+}
+
+/*
+ * Return volinfo->disperse_count for disperse volumes; for every other
+ * volume type return the product of the replica and stripe counts as
+ * computed by glusterd_calc_dist_leaf_count().
+ */
+int
+glusterd_get_dist_leaf_count(glusterd_volinfo_t *volinfo)
+{
+    if (volinfo->type != GF_CLUSTER_TYPE_DISPERSE)
+        return glusterd_calc_dist_leaf_count(volinfo->replica_count,
+                                             volinfo->stripe_count);
+
+    return volinfo->disperse_count;
+}
+
+/*
+ * Look up a brick owned by this node (uuid equal to MY_UUID) by its path.
+ *
+ * Regular volumes are searched first and must match both @brickname and
+ * @port; if nothing is found there, snapshot volumes are searched and only
+ * the path is compared.
+ *
+ * @this:      glusterd xlator; its private data supplies the volume lists.
+ * @brickname: brick path to look for.
+ * @port:      port the brick must have (regular volumes only).
+ * @brickinfo: receives the matching brick on success.
+ *
+ * Returns 0 when a brick was found, -1 otherwise.
+ */
+int
+glusterd_get_brickinfo(xlator_t *this, const char *brickname, int port,
+                       glusterd_brickinfo_t **brickinfo)
+{
+    glusterd_conf_t *priv = NULL;
+    glusterd_volinfo_t *vol = NULL;
+    glusterd_brickinfo_t *brick = NULL;
+    glusterd_snap_t *snapshot = NULL;
+
+    GF_ASSERT(brickname);
+    GF_ASSERT(this);
+
+    priv = this->private;
+
+    cds_list_for_each_entry(vol, &priv->volumes, vol_list)
+    {
+        cds_list_for_each_entry(brick, &vol->bricks, brick_list)
+        {
+            /* Bricks hosted by other peers are never a match. */
+            if (gf_uuid_compare(brick->uuid, MY_UUID) != 0)
+                continue;
+            if ((strcmp(brick->path, brickname) == 0) &&
+                (brick->port == port)) {
+                *brickinfo = brick;
+                return 0;
+            }
+        }
+    }
+
+    /* In case normal volume is not found, check for snapshot volumes */
+    cds_list_for_each_entry(snapshot, &priv->snapshots, snap_list)
+    {
+        cds_list_for_each_entry(vol, &snapshot->volumes, vol_list)
+        {
+            cds_list_for_each_entry(brick, &vol->bricks, brick_list)
+            {
+                if (gf_uuid_compare(brick->uuid, MY_UUID) != 0)
+                    continue;
+                if (strcmp(brick->path, brickname) == 0) {
+                    *brickinfo = brick;
+                    return 0;
+                }
+            }
+        }
+    }
+
+    return -1;
+}
+
+/*
+ * Return the brick at zero-based position @pos in volinfo->bricks, or NULL
+ * when the list holds @pos or fewer bricks.
+ */
+glusterd_brickinfo_t *
+glusterd_get_brickinfo_by_position(glusterd_volinfo_t *volinfo, uint32_t pos)
+{
+    glusterd_brickinfo_t *brick = NULL;
+    uint32_t idx = 0;
+
+    cds_list_for_each_entry(brick, &volinfo->bricks, brick_list)
+    {
+        if (idx == pos)
+            return brick;
+        idx++;
+    }
+
+    return NULL;
+}
+
+/*
+ * Store @status in brickinfo->status and emit a debug message saying
+ * whether the brick is now considered started or stopped (any status other
+ * than GF_BRICK_STARTED is reported as stopped).
+ */
+void
+glusterd_set_brick_status(glusterd_brickinfo_t *brickinfo,
+                          gf_brick_status_t status)
+{
+    GF_ASSERT(brickinfo);
+
+    brickinfo->status = status;
+
+    if (status != GF_BRICK_STARTED) {
+        gf_msg_debug("glusterd", 0,
+                     "Setting brick %s:%s status "
+                     "to stopped",
+                     brickinfo->hostname, brickinfo->path);
+        return;
+    }
+
+    gf_msg_debug("glusterd", 0,
+                 "Setting brick %s:%s status "
+                 "to started",
+                 brickinfo->hostname, brickinfo->path);
+}
+
+/*
+ * Tell whether the recorded status of @brickinfo is GF_BRICK_STARTED.
+ */
+gf_boolean_t
+glusterd_is_brick_started(glusterd_brickinfo_t *brickinfo)
+{
+    GF_ASSERT(brickinfo);
+
+    if (brickinfo->status == GF_BRICK_STARTED)
+        return _gf_true;
+
+    return _gf_false;
+}
+
+/*
+ * Check whether @brickinfo is owned by the peer identified by @uuid.
+ *
+ * @volinfo:   volume the brick belongs to (only asserted non-NULL).
+ * @brickinfo: brick to test; its uuid is resolved first when still null.
+ * @uuid:      pointer to the uuid_t to compare against.
+ *
+ * Returns 0 when the uuids match, -1 when they differ or when the brick
+ * could not be resolved.
+ */
+int
+glusterd_friend_brick_belongs(glusterd_volinfo_t *volinfo,
+                              glusterd_brickinfo_t *brickinfo, void *uuid)
+{
+    uuid_t *owner = uuid;
+
+    GF_ASSERT(volinfo);
+    GF_ASSERT(brickinfo);
+    GF_ASSERT(uuid);
+
+    if (gf_uuid_is_null(brickinfo->uuid) &&
+        (glusterd_resolve_brick(brickinfo) != 0)) {
+        /* An unresolvable brick is treated as a programming error. */
+        GF_ASSERT(0);
+        return -1;
+    }
+
+    return gf_uuid_compare(brickinfo->uuid, *owner) ? -1 : 0;
+}
+
+/*
+ * Find the topmost directory that is still on the same device as @path.
+ *
+ * The path is shortened one component at a time; the walk stops at the
+ * first parent whose st_dev differs from that of @path, and the component
+ * just removed is put back. When the walk reaches the leading '/', the
+ * result becomes "/" if the root directory is on the same device.
+ *
+ * @path:        brick path to examine.
+ * @mount_point: receives a newly allocated string on success; the caller
+ *               releases it with GF_FREE().
+ *
+ * Returns 0 on success, -1 on invalid argument, allocation failure or when
+ * a stat call fails.
+ */
+int
+glusterd_get_brick_root(char *path, char **mount_point)
+{
+    char *sep = NULL;
+    char *root = NULL;
+    struct stat brick_st = {0};
+    struct stat probe_st = {0};
+    xlator_t *this = THIS;
+    GF_ASSERT(this);
+
+    if (path == NULL) {
+        gf_smsg(this->name, GF_LOG_ERROR, errno, GD_MSG_INVALID_ARGUMENT, NULL);
+        goto err;
+    }
+
+    root = gf_strdup(path);
+    if (root == NULL) {
+        gf_smsg(this->name, GF_LOG_ERROR, errno, GD_MSG_INVALID_ARGUMENT, NULL);
+        goto err;
+    }
+
+    if (sys_stat(root, &brick_st) != 0)
+        goto err;
+
+    for (;;) {
+        sep = strrchr(root, '/');
+        /* No separator left, or only the leading one: stop walking up. */
+        if ((sep == NULL) || (sep == root))
+            break;
+
+        /* Cut off the last component and look at the parent. */
+        *sep = '\0';
+        if (sys_stat(root, &probe_st) != 0) {
+            gf_smsg(this->name, GF_LOG_ERROR, errno, GD_MSG_FILE_OP_FAILED,
+                    "Error in stat=%s", strerror(errno), NULL);
+            goto err;
+        }
+
+        if (probe_st.st_dev != brick_st.st_dev) {
+            /* Crossed onto another device: restore the component. */
+            *sep = '/';
+            break;
+        }
+    }
+
+    if (sep == root) {
+        /* Only a top-level component is left; decide between it and "/". */
+        if (sys_stat("/", &probe_st) != 0) {
+            gf_smsg(this->name, GF_LOG_ERROR, errno, GD_MSG_FILE_OP_FAILED,
+                    "Error in stat=%s", strerror(errno), NULL);
+            goto err;
+        }
+        if (probe_st.st_dev == brick_st.st_dev)
+            strcpy(root, "/");
+    }
+
+    *mount_point = root;
+    return 0;
+
+err:
+    GF_FREE(root);
+    return -1;
+}
+
+/*
+ * Extract the number that follows @pattern in @stream.
+ *
+ * @stream:  one line of tool output; it is modified in place (a NUL is
+ *           written right after the run of digits).
+ * @pattern: text that precedes the value.
+ *
+ * Returns a pointer into @stream at the position nwstrtail() reports after
+ * @pattern, terminated after its leading digits (so it may be an empty
+ * string when no digit follows), or NULL when @pattern does not occur or
+ * nwstrtail() returns NULL.
+ */
+static char *
+glusterd_parse_inode_size(char *stream, char *pattern)
+{
+    char *needle = NULL;
+    char *trail = NULL;
+
+    needle = strstr(stream, pattern);
+    if (!needle)
+        goto out;
+
+    needle = nwstrtail(needle, pattern);
+    if (!needle)
+        goto out;
+
+    /* isdigit() is only defined for values representable as unsigned char
+     * (or EOF); a plain char may be negative, hence the cast.
+     */
+    for (trail = needle; isdigit((unsigned char)*trail); trail++)
+        ;
+    *trail = '\0';
+
+out:
+    return needle;
+}
+
+/*
+ * Table consulted by glusterd_add_inode_size_to_dict() to learn how to
+ * query the inode size of a brick's file system. Entries whose tool fields
+ * are NULL are reported there as using dynamic inode sizes ("N/A"). The
+ * table ends with an all-NULL sentinel.
+ */
+static struct fs_info {
+    char *fs_type_name;    /* compared against the brick's fs_name */
+    char *fs_tool_name;    /* program to run; NULL means no tool is run */
+    char *fs_tool_arg;     /* optional argument placed before the device */
+    char *fs_tool_pattern; /* text preceding the inode size in the output */
+    char *fs_tool_pkg;     /* package name mentioned in error logs */
+} glusterd_fs[] = {{"xfs", "xfs_info", NULL, "isize=", "xfsprogs"},
+                   {"ext3", "tune2fs", "-l", "Inode size:", "e2fsprogs"},
+                   {"ext4", "tune2fs", "-l", "Inode size:", "e2fsprogs"},
+                   {"btrfs", NULL, NULL, NULL, NULL},
+                   {"zfs", NULL, NULL, NULL, NULL},
+                   {NULL, NULL, NULL, NULL, NULL}};
+
+/*
+ * Determine the inode size of the file system backing brick number @count
+ * and store it in @dict under "brick<count>.inode_size".
+ *
+ * The device ("brick<count>.device") and file system name
+ * ("brick<count>.fs_name") are read from @dict. Results are remembered per
+ * device in a function-static dict so the external tool is run at most once
+ * for each device. File systems listed in glusterd_fs without a tool are
+ * reported as "N/A".
+ *
+ * Returns 0 on success and non-zero on failure.
+ * NOTE(review): when no tool can be determined for the file system the
+ * function logs an error but leaves ret at 0 and sets no inode_size key;
+ * presumably this keeps unsupported file systems from failing the whole
+ * request -- confirm.
+ */
+static int
+glusterd_add_inode_size_to_dict(dict_t *dict, int count)
+{
+    int ret = -1;
+    char key[64];
+    char buffer[4096] = "";
+    char *device = NULL;
+    char *fs_name = NULL;
+    char *cur_word = NULL;
+    char *trail = NULL;
+    runner_t runner = {
+        0,
+    };
+    struct fs_info *fs = NULL;
+    static dict_t *cached_fs = NULL;
+    xlator_t *this = THIS;
+    GF_ASSERT(this);
+
+    ret = snprintf(key, sizeof(key), "brick%d.device", count);
+    ret = dict_get_strn(dict, key, ret, &device);
+    if (ret) {
+        gf_smsg(this->name, GF_LOG_ERROR, errno, GD_MSG_DICT_GET_FAILED,
+                "Key=%s", key, NULL);
+        goto out;
+    }
+
+    /* Serve from the per-device cache when possible; create the cache on
+     * first use.
+     */
+    if (cached_fs) {
+        if (dict_get_str(cached_fs, device, &cur_word) == 0) {
+            goto cached;
+        }
+    } else {
+        cached_fs = dict_new();
+    }
+
+    ret = snprintf(key, sizeof(key), "brick%d.fs_name", count);
+    ret = dict_get_strn(dict, key, ret, &fs_name);
+    if (ret) {
+        gf_smsg(this->name, GF_LOG_ERROR, errno, GD_MSG_DICT_GET_FAILED,
+                "Key=%s", key, NULL);
+        goto out;
+    }
+
+    runinit(&runner);
+    runner_redir(&runner, STDOUT_FILENO, RUN_PIPE);
+
+    /* Pick the table entry for this file system; if the loop runs to
+     * completion 'fs' is left on the all-NULL sentinel.
+     */
+    for (fs = glusterd_fs; fs->fs_type_name; fs++) {
+        if (strcmp(fs_name, fs->fs_type_name) == 0) {
+            if (!fs->fs_tool_name) {
+                /* dynamic inodes */
+                gf_smsg(this->name, GF_LOG_INFO, 0, GD_MSG_INODE_SIZE_GET_FAIL,
+                        "The brick on device uses dynamic inode sizes",
+                        "Device=%s (%s)", device, fs_name, NULL);
+                cur_word = "N/A";
+                goto cached;
+            }
+            runner_add_arg(&runner, fs->fs_tool_name);
+            break;
+        }
+    }
+
+    if (runner.argv[0]) {
+        if (fs->fs_tool_arg)
+            runner_add_arg(&runner, fs->fs_tool_arg);
+        runner_add_arg(&runner, device);
+    } else {
+        /* 'fs' may be the sentinel entry here, whose members are all NULL;
+         * never hand a NULL pointer to a "%s" conversion.
+         */
+        gf_smsg(this->name, GF_LOG_ERROR, 0, GD_MSG_INODE_SIZE_GET_FAIL,
+                "Could not find tool to get inode size for device", "Tool=%s",
+                fs->fs_tool_name ? fs->fs_tool_name : "(unknown)",
+                "Device=%s (%s)", device, fs_name, "Missing package=%s ?",
+                fs->fs_tool_pkg ? fs->fs_tool_pkg : "(unknown)", NULL);
+        goto out;
+    }
+
+    ret = runner_start(&runner);
+    if (ret) {
+        gf_smsg(this->name, GF_LOG_ERROR, errno, GD_MSG_CMD_EXEC_FAIL,
+                "Failed to execute \"%s\"", fs->fs_tool_name, NULL);
+        /*
+         * Runner_start might return an error after the child has
+         * been forked, e.g. if the program isn't there.  In that
+         * case, we still need to call runner_end to reap the
+         * child and free resources.  Fortunately, that seems to
+         * be harmless for other kinds of failures.
+         */
+        (void)runner_end(&runner);
+        goto out;
+    }
+
+    /* Scan the tool's output line by line until the pattern is found.
+     * cur_word then points into 'buffer'.
+     */
+    for (;;) {
+        if (fgets(buffer, sizeof(buffer),
+                  runner_chio(&runner, STDOUT_FILENO)) == NULL)
+            break;
+        trail = strrchr(buffer, '\n');
+        if (trail)
+            *trail = '\0';
+
+        cur_word = glusterd_parse_inode_size(buffer, fs->fs_tool_pattern);
+
+        if (cur_word)
+            break;
+    }
+
+    ret = runner_end(&runner);
+    if (ret) {
+        gf_smsg(this->name, GF_LOG_ERROR, errno, GD_MSG_CMD_EXEC_FAIL,
+                "Tool exited with non-zero exit status", "Tool=%s",
+                fs->fs_tool_name, NULL);
+
+        goto out;
+    }
+    if (!cur_word) {
+        ret = -1;
+        gf_smsg(this->name, GF_LOG_ERROR, 0, GD_MSG_INODE_SIZE_GET_FAIL,
+                "Using Tool=%s", fs->fs_tool_name, NULL);
+        goto out;
+    }
+
+    if (dict_set_dynstr_with_alloc(cached_fs, device, cur_word)) {
+        /* not fatal if not entered into the cache */
+        gf_msg_debug(this->name, 0, "failed to cache fs inode size for %s",
+                     device);
+    }
+
+cached:
+    snprintf(key, sizeof(key), "brick%d.inode_size", count);
+
+    ret = dict_set_dynstr_with_alloc(dict, key, cur_word);
+
+out:
+    if (ret)
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_INODE_SIZE_GET_FAIL, NULL);
+    return ret;
+}
+
+/*
+ * Find the entry of the mounted-file-systems table (_PATH_MOUNTED) whose
+ * mount directory equals the resolved form of @mnt_pt, ignoring entries of
+ * type "rootfs".
+ *
+ * @mnt_pt:    mount point to look up; resolved with realpath() first.
+ * @buff:      scratch buffer handed to getmntent_r().
+ * @buflen:    size of @buff.
+ * @entry_ptr: caller-provided storage for the returned entry.
+ *
+ * Returns the matching entry, or NULL when the table cannot be opened,
+ * realpath() fails or no entry matches.
+ */
+struct mntent *
+glusterd_get_mnt_entry_info(char *mnt_pt, char *buff, int buflen,
+                            struct mntent *entry_ptr)
+{
+    struct mntent *found = NULL;
+    FILE *mounts = NULL;
+    char resolved[PATH_MAX] = "";
+
+    GF_ASSERT(mnt_pt);
+    GF_ASSERT(buff);
+    GF_ASSERT(entry_ptr);
+
+    mounts = setmntent(_PATH_MOUNTED, "r");
+    if (mounts == NULL)
+        return NULL;
+
+    if (realpath(mnt_pt, resolved) == NULL) {
+        gf_msg(THIS->name, GF_LOG_ERROR, 0, GD_MSG_MNTENTRY_GET_FAIL,
+               "realpath () failed for path %s", mnt_pt);
+        goto out;
+    }
+
+    /* 'found' is NULL once the table is exhausted without a match. */
+    while ((found = getmntent_r(mounts, entry_ptr, buff, buflen)) != NULL) {
+        if ((strcmp(found->mnt_dir, resolved) == 0) &&
+            (strcmp(found->mnt_type, "rootfs") != 0))
+            break;
+    }
+
+out:
+    endmntent(mounts);
+    return found;
+}
+
+/*
+ * Add the mount details of a brick to @dict: the device
+ * ("brick<count>.device"), the file system type ("brick<count>.fs_name")
+ * and the mount options ("brick<count>.mnt_options").
+ *
+ * Returns 0 on success; non-zero when the brick root or its mount entry
+ * cannot be determined or a dict insertion fails.
+ */
+static int
+glusterd_add_brick_mount_details(glusterd_brickinfo_t *brickinfo, dict_t *dict,
+                                 int count)
+{
+    int ret = -1;
+    char key[64] = "";
+    char mnt_buf[PATH_MAX] = "";
+    char base_key[32] = "";
+    char *brick_root = NULL;
+    struct mntent mnt_storage = {0};
+    struct mntent *mnt = NULL;
+    xlator_t *this = THIS;
+    GF_ASSERT(this);
+
+    snprintf(base_key, sizeof(base_key), "brick%d", count);
+
+    ret = glusterd_get_brick_root(brickinfo->path, &brick_root);
+    if (ret != 0) {
+        gf_smsg(this->name, GF_LOG_ERROR, errno, GD_MSG_BRICKPATH_ROOT_GET_FAIL,
+                NULL);
+        goto out;
+    }
+
+    mnt = glusterd_get_mnt_entry_info(brick_root, mnt_buf, sizeof(mnt_buf),
+                                      &mnt_storage);
+    if (mnt == NULL) {
+        gf_smsg(this->name, GF_LOG_ERROR, errno, GD_MSG_GET_MNT_ENTRY_INFO_FAIL,
+                NULL);
+        ret = -1;
+        goto out;
+    }
+
+    /* device backing the mount */
+    snprintf(key, sizeof(key), "%s.device", base_key);
+    ret = dict_set_dynstr_with_alloc(dict, key, mnt->mnt_fsname);
+    if (ret != 0) {
+        gf_smsg(this->name, GF_LOG_ERROR, errno, GD_MSG_DICT_SET_FAILED,
+                "Key=%s", key, NULL);
+        goto out;
+    }
+
+    /* file system type */
+    snprintf(key, sizeof(key), "%s.fs_name", base_key);
+    ret = dict_set_dynstr_with_alloc(dict, key, mnt->mnt_type);
+    if (ret != 0) {
+        gf_smsg(this->name, GF_LOG_ERROR, errno, GD_MSG_DICT_SET_FAILED,
+                "Key=%s", key, NULL);
+        goto out;
+    }
+
+    /* mount options; a failure here is returned without a log entry */
+    snprintf(key, sizeof(key), "%s.mnt_options", base_key);
+    ret = dict_set_dynstr_with_alloc(dict, key, mnt->mnt_opts);
+
+out:
+    if (brick_root != NULL)
+        GF_FREE(brick_root);
+
+    return ret;
+}
+
+/*
+ * Return the device (mnt_fsname of the mount entry) on which @brick_path
+ * resides.
+ *
+ * The result is a copy made with gf_strdup(); the caller releases it with
+ * GF_FREE(). NULL is returned when the brick root or its mount entry cannot
+ * be found, or when the copy fails.
+ */
+char *
+glusterd_get_brick_mount_device(char *brick_path)
+{
+    char *brick_root = NULL;
+    char *dev = NULL;
+    char mnt_buf[PATH_MAX] = "";
+    struct mntent *mnt = NULL;
+    struct mntent mnt_storage = {
+        0,
+    };
+    xlator_t *this = NULL;
+
+    this = THIS;
+    GF_ASSERT(this);
+    GF_ASSERT(brick_path);
+
+    if (glusterd_get_brick_root(brick_path, &brick_root) != 0) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_BRICKPATH_ROOT_GET_FAIL,
+               "Failed to get mount point "
+               "for %s brick",
+               brick_path);
+        goto out;
+    }
+
+    mnt = glusterd_get_mnt_entry_info(brick_root, mnt_buf, sizeof(mnt_buf),
+                                      &mnt_storage);
+    if (mnt == NULL) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_MNTENTRY_GET_FAIL,
+               "Failed to get mnt entry "
+               "for %s mount path",
+               brick_root);
+        goto out;
+    }
+
+    /* mnt points into mnt_buf/mnt_storage, so hand back a private copy */
+    dev = gf_strdup(mnt->mnt_fsname);
+
+out:
+    if (brick_root != NULL)
+        GF_FREE(brick_root);
+
+    return dev;
+}
+
+/*
+ * Add capacity and file system details of a brick to @dict under keys
+ * prefixed with "brick<count>": block_size, free, total, total_inodes and
+ * free_inodes (the inode keys only when non-zero), followed by the mount
+ * details and the inode size.
+ *
+ * @volinfo:   volume the brick belongs to (only asserted non-NULL here).
+ * @brickinfo: brick whose path is examined with statvfs.
+ * @dict:      dictionary receiving the values.
+ * @count:     index used to build the key prefix.
+ *
+ * Returns 0 on success, non-zero on the first failing step.
+ */
+int
+glusterd_add_brick_detail_to_dict(glusterd_volinfo_t *volinfo,
+                                  glusterd_brickinfo_t *brickinfo, dict_t *dict,
+                                  int count)
+{
+    int ret = -1;
+    uint64_t memtotal = 0;
+    uint64_t memfree = 0;
+    uint64_t inodes_total = 0;
+    uint64_t inodes_free = 0;
+    uint64_t block_size = 0;
+    uint64_t frag_size = 0;
+    char key[64];
+    char base_key[32];
+    struct statvfs brickstat = {0};
+    xlator_t *this = NULL;
+
+    this = THIS;
+    GF_ASSERT(this);
+    GF_ASSERT(volinfo);
+    GF_ASSERT(brickinfo);
+    GF_ASSERT(dict);
+
+    snprintf(base_key, sizeof(base_key), "brick%d", count);
+
+    ret = sys_statvfs(brickinfo->path, &brickstat);
+    if (ret) {
+        gf_msg(this->name, GF_LOG_ERROR, errno, GD_MSG_FILE_OP_FAILED,
+               "statfs error: %s ", strerror(errno));
+        goto out;
+    }
+
+    /* file system block size */
+    block_size = brickstat.f_bsize;
+    snprintf(key, sizeof(key), "%s.block_size", base_key);
+    ret = dict_set_uint64(dict, key, block_size);
+    if (ret) {
+        gf_smsg(this->name, GF_LOG_ERROR, errno, GD_MSG_DICT_SET_FAILED,
+                "Key=%s", key, NULL);
+        goto out;
+    }
+
+    /* struct statvfs counts f_blocks and f_bfree in units of f_frsize (the
+     * fundamental block size), not f_bsize; the two can differ. Fall back
+     * to f_bsize only if the file system reports no fragment size. The
+     * products are computed in 64 bits to avoid overflow where the member
+     * types are narrower.
+     */
+    frag_size = brickstat.f_frsize ? brickstat.f_frsize : brickstat.f_bsize;
+
+    /* free space in brick */
+    memfree = (uint64_t)brickstat.f_bfree * frag_size;
+    snprintf(key, sizeof(key), "%s.free", base_key);
+    ret = dict_set_uint64(dict, key, memfree);
+    if (ret) {
+        gf_smsg(this->name, GF_LOG_ERROR, errno, GD_MSG_DICT_SET_FAILED,
+                "Key=%s", key, NULL);
+        goto out;
+    }
+
+    /* total space of brick */
+    memtotal = (uint64_t)brickstat.f_blocks * frag_size;
+    snprintf(key, sizeof(key), "%s.total", base_key);
+    ret = dict_set_uint64(dict, key, memtotal);
+    if (ret) {
+        gf_smsg(this->name, GF_LOG_ERROR, errno, GD_MSG_DICT_SET_FAILED,
+                "Key=%s", key, NULL);
+        goto out;
+    }
+
+    /* inodes: total and free counts only for ext2/3/4 and xfs */
+    inodes_total = brickstat.f_files;
+    if (inodes_total) {
+        snprintf(key, sizeof(key), "%s.total_inodes", base_key);
+        ret = dict_set_uint64(dict, key, inodes_total);
+        if (ret) {
+            gf_smsg(this->name, GF_LOG_ERROR, errno, GD_MSG_DICT_SET_FAILED,
+                    "Key=%s", key, NULL);
+            goto out;
+        }
+    }
+
+    inodes_free = brickstat.f_ffree;
+    if (inodes_free) {
+        snprintf(key, sizeof(key), "%s.free_inodes", base_key);
+        ret = dict_set_uint64(dict, key, inodes_free);
+        if (ret) {
+            gf_smsg(this->name, GF_LOG_ERROR, errno, GD_MSG_DICT_SET_FAILED,
+                    "Key=%s", key, NULL);
+            goto out;
+        }
+    }
+
+    /* device, fs type and mount options; these feed the inode size lookup
+     * below, which reads the device and fs_name keys back from the dict.
+     */
+    ret = glusterd_add_brick_mount_details(brickinfo, dict, count);
+    if (ret) {
+        gf_smsg(this->name, GF_LOG_ERROR, errno, GD_MSG_ADD_BRICK_MNT_INFO_FAIL,
+                NULL);
+        goto out;
+    }
+
+    ret = glusterd_add_inode_size_to_dict(dict, count);
+out:
+    if (ret)
+        gf_msg_debug(this->name, 0,
+                     "Error adding brick"
+                     " detail to dict: %s",
+                     strerror(errno));
+    return ret;
+}
+
+/*
+ * Add the basic status of a brick to @dict under keys prefixed with
+ * "brick<count>": hostname, path, peerid, port, rdma_port, pid and status.
+ *
+ * Port reporting depends on the volume transport: for RDMA the brick port
+ * is stored as rdma_port and port is 0; for TCP+RDMA both the brick port
+ * and its rdma_port are stored; otherwise rdma_port is 0.
+ *
+ * A brick is reported online only when it is marked started, its pidfile
+ * names a running service, its port is registered and, with brick
+ * multiplexing enabled, search_brick_path_from_proc() finds the brick path
+ * for that pid.
+ *
+ * Returns 0 on success, non-zero when a dict insertion fails.
+ */
+int32_t
+glusterd_add_brick_to_dict(glusterd_volinfo_t *volinfo,
+                           glusterd_brickinfo_t *brickinfo, dict_t *dict,
+                           int32_t count)
+{
+    int ret = -1;
+    int keylen;
+    int32_t pid = -1;
+    int32_t tcp_port = 0;
+    int32_t rdma_port = 0;
+    char key[64];
+    char base_key[32];
+    char pidfile[PATH_MAX] = "";
+    char *consumed_path = NULL;
+    gf_boolean_t brick_online = _gf_false;
+    xlator_t *this = NULL;
+    glusterd_conf_t *priv = NULL;
+
+    GF_ASSERT(volinfo);
+    GF_ASSERT(brickinfo);
+    GF_ASSERT(dict);
+
+    this = THIS;
+    GF_ASSERT(this);
+
+    priv = this->private;
+
+    snprintf(base_key, sizeof(base_key), "brick%d", count);
+
+    keylen = snprintf(key, sizeof(key), "%s.hostname", base_key);
+    ret = dict_set_strn(dict, key, keylen, brickinfo->hostname);
+    if (ret != 0)
+        goto out;
+
+    keylen = snprintf(key, sizeof(key), "%s.path", base_key);
+    ret = dict_set_strn(dict, key, keylen, brickinfo->path);
+    if (ret != 0)
+        goto out;
+
+    /* add peer uuid */
+    snprintf(key, sizeof(key), "%s.peerid", base_key);
+    ret = dict_set_dynstr_with_alloc(dict, key, uuid_utoa(brickinfo->uuid));
+    if (ret != 0)
+        goto out;
+
+    /* Work out which port goes under which key for this transport. */
+    if (volinfo->transport_type == GF_TRANSPORT_RDMA) {
+        tcp_port = 0;
+        rdma_port = brickinfo->port;
+    } else if (volinfo->transport_type == GF_TRANSPORT_BOTH_TCP_RDMA) {
+        tcp_port = brickinfo->port;
+        rdma_port = brickinfo->rdma_port;
+    } else {
+        tcp_port = brickinfo->port;
+        rdma_port = 0;
+    }
+
+    keylen = snprintf(key, sizeof(key), "%s.port", base_key);
+    ret = dict_set_int32n(dict, key, keylen, tcp_port);
+    if (ret != 0)
+        goto out;
+
+    keylen = snprintf(key, sizeof(key), "%s.rdma_port", base_key);
+    ret = dict_set_int32n(dict, key, keylen, rdma_port);
+    if (ret != 0)
+        goto out;
+
+    GLUSTERD_GET_BRICK_PIDFILE(pidfile, volinfo, brickinfo, priv);
+
+    if (glusterd_is_brick_started(brickinfo)) {
+        if (!gf_is_service_running(pidfile, &pid) ||
+            !brickinfo->port_registered) {
+            /* Not running, or not registered yet: report no pid. */
+            pid = -1;
+        } else if (!is_brick_mx_enabled()) {
+            brick_online = _gf_true;
+        } else {
+            consumed_path = search_brick_path_from_proc(pid, brickinfo->path);
+            if (consumed_path) {
+                brick_online = _gf_true;
+                GF_FREE(consumed_path);
+            } else {
+                gf_log(this->name, GF_LOG_INFO,
+                       "brick path %s is not consumed", brickinfo->path);
+                brick_online = _gf_false;
+            }
+        }
+    }
+
+    keylen = snprintf(key, sizeof(key), "%s.pid", base_key);
+    ret = dict_set_int32n(dict, key, keylen, pid);
+    if (ret != 0)
+        goto out;
+
+    keylen = snprintf(key, sizeof(key), "%s.status", base_key);
+    ret = dict_set_int32n(dict, key, keylen, brick_online);
+
+out:
+    /* 'key' still names the entry whose insertion failed. */
+    if (ret != 0) {
+        gf_smsg(this->name, GF_LOG_ERROR, errno, GD_MSG_DICT_SET_FAILED,
+                "Key=%s", key, NULL);
+        gf_msg_debug(this->name, 0, "Returning %d", ret);
+    }
+
+    return ret;
+}
+
+/*
+ * Store the name of every volume in @dict as "vol0", "vol1", ... and the
+ * number of volumes as "vol_count".
+ *
+ * Returns 0 on success, non-zero when a dict insertion fails.
+ */
+int32_t
+glusterd_get_all_volnames(dict_t *dict)
+{
+    int ret = -1;
+    int keylen;
+    int32_t nvols = 0;
+    char key[64] = "";
+    glusterd_volinfo_t *vol = NULL;
+    glusterd_conf_t *priv = NULL;
+
+    priv = THIS->private;
+    GF_ASSERT(priv);
+
+    cds_list_for_each_entry(vol, &priv->volumes, vol_list)
+    {
+        keylen = snprintf(key, sizeof(key), "vol%d", nvols);
+        ret = dict_set_strn(dict, key, keylen, vol->volname);
+        if (ret != 0)
+            goto out;
+
+        nvols++;
+    }
+
+    ret = dict_set_int32n(dict, "vol_count", SLEN("vol_count"), nvols);
+
+out:
+    if (ret != 0)
+        gf_msg(THIS->name, GF_LOG_ERROR, 0, GD_MSG_DICT_SET_FAILED,
+               "failed to get all "
+               "volume names for status");
+    return ret;
+}
+
+/*
+ * Apply @func to every brick of every volume and check that each call
+ * returns @status.
+ *
+ * @func:   predicate invoked as func(volinfo, brickinfo, ctx).
+ * @status: value every invocation is expected to return.
+ * @ctx:    opaque argument passed through to @func.
+ *
+ * Returns 0 when all invocations returned @status (including when there
+ * are no bricks at all), -1 at the first one that did not.
+ */
+int
+glusterd_all_volume_cond_check(glusterd_condition_func func, int status,
+                               void *ctx)
+{
+    glusterd_conf_t *priv = NULL;
+    glusterd_volinfo_t *vol = NULL;
+    glusterd_brickinfo_t *brick = NULL;
+    int ret = 0;
+
+    priv = THIS->private;
+
+    cds_list_for_each_entry(vol, &priv->volumes, vol_list)
+    {
+        cds_list_for_each_entry(brick, &vol->bricks, brick_list)
+        {
+            if (func(vol, brick, ctx) != status) {
+                ret = -1;
+                goto out;
+            }
+        }
+    }
+
+out:
+    gf_msg_debug("glusterd", 0, "returning %d", ret);
+    return ret;
+}
+
+/*
+ * Stop the brick described by @brickinfo in @volinfo.
+ *
+ * A brick whose uuid is still null is resolved first. When the brick's uuid
+ * differs from MY_UUID nothing is stopped; the brick is only deleted if
+ * @del_brick is set, and 0 is returned. Otherwise the request is handed to
+ * glusterd_volume_stop_glusterfs().
+ *
+ * Returns 0 on success, -1 on NULL arguments, or the non-zero result of the
+ * failing helper.
+ */
+int
+glusterd_brick_stop(glusterd_volinfo_t *volinfo,
+                    glusterd_brickinfo_t *brickinfo, gf_boolean_t del_brick)
+{
+    xlator_t *this = THIS;
+    glusterd_conf_t *conf = NULL;
+    int ret = -1;
+
+    GF_ASSERT(this);
+    conf = this->private;
+    GF_ASSERT(conf);
+
+    if (!volinfo || !brickinfo) {
+        gf_smsg(this->name, GF_LOG_ERROR, errno, GD_MSG_INVALID_ARGUMENT, NULL);
+        goto out;
+    }
+
+    if (gf_uuid_is_null(brickinfo->uuid)) {
+        ret = glusterd_resolve_brick(brickinfo);
+        if (ret != 0) {
+            gf_event(EVENT_BRICKPATH_RESOLVE_FAILED,
+                     "peer=%s;volume=%s;brick=%s", brickinfo->hostname,
+                     volinfo->volname, brickinfo->path);
+            gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_RESOLVE_BRICK_FAIL,
+                   FMTSTR_RESOLVE_BRICK, brickinfo->hostname, brickinfo->path);
+            goto out;
+        }
+    }
+
+    if (gf_uuid_compare(brickinfo->uuid, MY_UUID) != 0) {
+        /* uuid is not ours: there is no brick process to stop here */
+        if (del_brick)
+            glusterd_delete_brick(volinfo, brickinfo);
+        ret = 0;
+        goto out;
+    }
+
+    ret = glusterd_volume_stop_glusterfs(volinfo, brickinfo, del_brick);
+    if (ret != 0)
+        gf_msg(this->name, GF_LOG_CRITICAL, 0, GD_MSG_BRICK_STOP_FAIL,
+               "Unable to stop brick: %s:%s", brickinfo->hostname,
+               brickinfo->path);
+
+out:
+    gf_msg_debug(this->name, 0, "returning %d ", ret);
+    return ret;
+}
+
+/* Return 1 when @volinfo has a rebalance defrag object attached, else 0. */
+int
+glusterd_is_defrag_on(glusterd_volinfo_t *volinfo)
+{
+    if (volinfo->rebal.defrag == NULL)
+        return 0;
+
+    return 1;
+}
+
+/*
+ * Validate a brick that is about to be used by operation @op.
+ *
+ * @brick      brick specification string; used to build a temporary
+ *             brickinfo when @brickinfo is NULL, and quoted in error text.
+ * @brickinfo  optional already-built brickinfo; when NULL a temporary one is
+ *             created here and released before returning.
+ * @op_errstr  caller-provided buffer of @len bytes that receives a
+ *             human-readable failure reason; it is logged on exit whenever
+ *             it is non-empty.
+ * @op         optional operation name; only "GF_RESET_OP_COMMIT" and
+ *             "GF_RESET_OP_COMMIT_FORCE" change the result (see below).
+ *
+ * Returns 0 when the brick is acceptable, 1 when a local brick path is not
+ * available and @op is one of the two reset-commit operations, -1 for the
+ * other validation failures, or the non-zero result of a failing helper.
+ */
+int
+glusterd_new_brick_validate(char *brick, glusterd_brickinfo_t *brickinfo,
+                            char *op_errstr, size_t len, char *op)
+{
+    glusterd_brickinfo_t *newbrickinfo = NULL;
+    int ret = -1;
+    gf_boolean_t is_allocated = _gf_false;
+    glusterd_peerinfo_t *peerinfo = NULL;
+    glusterd_conf_t *priv = NULL;
+    xlator_t *this = NULL;
+
+    this = THIS;
+    GF_ASSERT(this);
+    priv = this->private;
+    GF_ASSERT(priv);
+
+    GF_ASSERT(brick);
+    GF_ASSERT(op_errstr);
+
+    if (!brickinfo) {
+        ret = glusterd_brickinfo_new_from_brick(brick, &newbrickinfo, _gf_true,
+                                                NULL);
+        if (ret)
+            goto out;
+        /* remember that the temporary brickinfo must be freed at 'out' */
+        is_allocated = _gf_true;
+    } else {
+        newbrickinfo = brickinfo;
+    }
+
+    ret = glusterd_resolve_brick(newbrickinfo);
+    if (ret) {
+        snprintf(op_errstr, len,
+                 "Host %s is not in \'Peer "
+                 "in Cluster\' state",
+                 newbrickinfo->hostname);
+        goto out;
+    }
+
+    if (!gf_uuid_compare(MY_UUID, newbrickinfo->uuid)) {
+        /* brick is local */
+        if (!glusterd_is_brickpath_available(newbrickinfo->uuid,
+                                             newbrickinfo->path)) {
+            snprintf(op_errstr, len,
+                     "Brick: %s not available."
+                     " Brick may be containing or be contained "
+                     "by an existing brick.",
+                     brick);
+            /* reset-brick commit gets a distinct positive return code */
+            if (op && (!strcmp(op, "GF_RESET_OP_COMMIT") ||
+                       !strcmp(op, "GF_RESET_OP_COMMIT_FORCE")))
+                ret = 1;
+            else
+                ret = -1;
+            goto out;
+        }
+
+    } else {
+        /* brick belongs to another uuid: that peer must be known,
+         * connected and in the befriended state */
+        peerinfo = glusterd_peerinfo_find_by_uuid(newbrickinfo->uuid);
+        if (peerinfo == NULL) {
+            ret = -1;
+            snprintf(op_errstr, len, "Failed to find host %s",
+                     newbrickinfo->hostname);
+            goto out;
+        }
+
+        if ((!peerinfo->connected)) {
+            snprintf(op_errstr, len, "Host %s not connected",
+                     newbrickinfo->hostname);
+            ret = -1;
+            goto out;
+        }
+
+        if (peerinfo->state.state != GD_FRIEND_STATE_BEFRIENDED) {
+            snprintf(op_errstr, len,
+                     "Host %s is not in \'Peer "
+                     "in Cluster\' state",
+                     newbrickinfo->hostname);
+            ret = -1;
+            goto out;
+        }
+    }
+
+    ret = 0;
+out:
+    if (is_allocated)
+        glusterd_brickinfo_delete(newbrickinfo);
+    /* NOTE(review): this also logs text the caller left in op_errstr before
+     * the call, even on success - confirm callers pass an empty buffer. */
+    if (op_errstr[0] != '\0')
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_BRICK_VALIDATE_FAIL, "%s",
+               op_errstr);
+    gf_msg_debug(this->name, 0, "returning %d ", ret);
+    return ret;
+}
+
+/*
+ * Check that @src and @dst are the bricks recorded in the replace-brick
+ * state of @volinfo (compared by hostname and path).
+ *
+ * Returns 0 when both match, -1 when the recorded state is incomplete or
+ * either brick differs.
+ */
+int
+glusterd_rb_check_bricks(glusterd_volinfo_t *volinfo, glusterd_brickinfo_t *src,
+                         glusterd_brickinfo_t *dst)
+{
+    glusterd_replace_brick_t *state = NULL;
+
+    GF_ASSERT(volinfo);
+
+    state = &volinfo->rep_brick;
+
+    if (!(state->src_brick && state->dst_brick)) {
+        gf_smsg("glusterd", GF_LOG_ERROR, errno, GD_MSG_INVALID_ARGUMENT, NULL);
+        return -1;
+    }
+
+    /* source brick is checked first; dst is not examined if it differs */
+    if (strcmp(state->src_brick->hostname, src->hostname) != 0 ||
+        strcmp(state->src_brick->path, src->path) != 0) {
+        gf_msg("glusterd", GF_LOG_ERROR, 0, GD_MSG_RB_SRC_BRICKS_MISMATCH,
+               "Replace brick src bricks differ");
+        return -1;
+    }
+
+    if (strcmp(state->dst_brick->hostname, dst->hostname) != 0 ||
+        strcmp(state->dst_brick->path, dst->path) != 0) {
+        gf_msg("glusterd", GF_LOG_ERROR, 0, GD_MSG_RB_DST_BRICKS_MISMATCH,
+               "Replace brick dst bricks differ");
+        return -1;
+    }
+
+    return 0;
+}
+
+/*
+ * Report through @present whether extended attribute @xattr can be read
+ * from @path. The path needs to be absolute; the 16-byte read buffer means
+ * this works only on uuid-valued attributes (gfid, volume-id).
+ *
+ * Returns 0 when the answer could be determined: *present is set to true
+ * if sys_lgetxattr() succeeded, or to false if it failed with ENODATA,
+ * ENOATTR or ENOTSUP. For any other errno the negative result of
+ * sys_lgetxattr() is returned and *present is left untouched. Returns -1
+ * when an argument is NULL.
+ */
+static int
+glusterd_is_uuid_present(char *path, char *xattr, gf_boolean_t *present)
+{
+    GF_ASSERT(path);
+    GF_ASSERT(xattr);
+    GF_ASSERT(present);
+
+    int ret = -1;
+    uuid_t uid = {
+        0,
+    };
+
+    if (!path || !xattr || !present)
+        goto out;
+
+    /* only the existence of the attribute matters; its value is unused */
+    ret = sys_lgetxattr(path, xattr, &uid, 16);
+
+    if (ret >= 0) {
+        *present = _gf_true;
+        ret = 0;
+        goto out;
+    }
+
+    switch (errno) {
+#if defined(ENODATA)
+        case ENODATA: /* FALLTHROUGH */
+#endif
+#if defined(ENOATTR) && (ENOATTR != ENODATA)
+        case ENOATTR: /* FALLTHROUGH */
+#endif
+        case ENOTSUP:
+            /* these errnos are treated as "attribute absent", not failure */
+            *present = _gf_false;
+            ret = 0;
+            break;
+        default:
+            /* unexpected failure: keep the negative ret for the caller */
+            break;
+    }
+out:
+    return ret;
+}
+
+/*
+ * Determine whether @path, or any directory above it up to "/", already
+ * carries one of the gfid / volume-id extended attributes.
+ *
+ * @path       directory to check; needs to be absolute. For a relative path
+ *             the walk stops once dirname() yields "." and the directories
+ *             above that point are not examined.
+ * @in_use     set to the result whenever 0 is returned.
+ * @op_errstr  optional; receives a gf_strdup()ed message (freed by the
+ *             caller) when the path is in use or when the check failed.
+ *
+ * Returns 0 when the check completed, non-zero when the arguments are
+ * invalid, the path does not fit in PATH_MAX, or an attribute lookup failed.
+ */
+static int
+glusterd_is_path_in_use(char *path, gf_boolean_t *in_use, char **op_errstr)
+{
+    int i = 0;
+    int ret = -1;
+    gf_boolean_t used = _gf_false;
+    char dir[PATH_MAX] = "";
+    char *curdir = NULL;
+    char msg[2048] = "";
+    char *keys[3] = {GFID_XATTR_KEY, GF_XATTR_VOL_ID_KEY, NULL};
+
+    GF_ASSERT(path);
+    GF_ASSERT(in_use);
+    if (!path || !in_use) {
+        /* Report the real cause instead of a bogus xattr failure. */
+        errno = EINVAL;
+        snprintf(msg, sizeof(msg), "Invalid arguments for brick path check");
+        goto out;
+    }
+
+    if (snprintf(dir, PATH_MAX, "%s", path) >= PATH_MAX) {
+        errno = ENAMETOOLONG;
+        snprintf(msg, sizeof(msg), "Brick path is too long, reason: %s",
+                 strerror(errno));
+        goto out;
+    }
+
+    curdir = dir;
+    do {
+        for (i = 0; !used && keys[i]; i++) {
+            ret = glusterd_is_uuid_present(curdir, keys[i], &used);
+            if (ret)
+                goto out;
+        }
+
+        if (used)
+            break;
+
+        /* dirname() may modify 'dir' in place; 'path' itself is untouched */
+        curdir = dirname(curdir);
+        if (!strcmp(curdir, "."))
+            /* relative path: no further parents to walk; fall through so
+             * that *in_use is still written before returning 0 */
+            break;
+
+    } while (strcmp(curdir, "/"));
+
+    /* the loop exits before "/" itself has been examined */
+    if (!strcmp(curdir, "/")) {
+        for (i = 0; !used && keys[i]; i++) {
+            ret = glusterd_is_uuid_present(curdir, keys[i], &used);
+            if (ret)
+                goto out;
+        }
+    }
+
+    ret = 0;
+    *in_use = used;
+out:
+    if (ret && msg[0] == '\0') {
+        /* only attribute lookups reach here; keys[i] is the one that failed */
+        snprintf(msg, sizeof(msg),
+                 "Failed to get extended "
+                 "attribute %s, reason: %s",
+                 keys[i], strerror(errno));
+    }
+
+    /* use the locally computed flag, not the caller's (possibly stale or
+     * uninitialised) *in_use */
+    if (used) {
+        if (path && curdir && !strcmp(path, curdir)) {
+            snprintf(msg, sizeof(msg),
+                     "%s is already part of a "
+                     "volume",
+                     path);
+        } else {
+            snprintf(msg, sizeof(msg),
+                     "parent directory %s is "
+                     "already part of a volume",
+                     curdir);
+        }
+    }
+
+    if (strlen(msg)) {
+        gf_msg(THIS->name, GF_LOG_ERROR, errno, GD_MSG_PATH_ALREADY_PART_OF_VOL,
+               "%s", msg);
+        if (op_errstr)
+            *op_errstr = gf_strdup(msg);
+    }
+
+    return ret;
+}
+
+/*
+ * Verify that brick directory @path on @host supports extended attributes
+ * and stamp it with the volume id @uuid.
+ *
+ * A throw-away "trusted.glusterfs.test" attribute is set and removed first.
+ * The path (and its parents) are then checked for existing gfid/volume-id
+ * attributes; an in-use path is rejected unless @is_force is set. Without
+ * @is_force the volume-id is written with XATTR_CREATE.
+ *
+ * @op_errstr receives a gf_strdup()ed message (freed by the caller) when
+ * the call fails; it also receives the "already part of a volume" notice
+ * when an in-use path is accepted because of @is_force.
+ *
+ * Returns 0 on success and non-zero on failure.
+ */
+int
+glusterd_check_and_set_brick_xattr(char *host, char *path, uuid_t uuid,
+                                   char **op_errstr, gf_boolean_t is_force)
+{
+    int ret = -1;
+    char msg[2048] = "";
+    char *in_use_msg = NULL;
+    gf_boolean_t in_use = _gf_false;
+    int flags = 0;
+    xlator_t *this = THIS;
+    GF_ASSERT(this);
+
+    /* Check for xattr support in backend fs */
+    ret = sys_lsetxattr(path, "trusted.glusterfs.test", "working", 8, 0);
+    if (ret == -1) {
+        snprintf(msg, sizeof(msg),
+                 "Glusterfs is not"
+                 " supported on brick: %s:%s.\nSetting"
+                 " extended attributes failed, reason:"
+                 " %s.",
+                 host, path, strerror(errno));
+        gf_smsg(this->name, GF_LOG_ERROR, errno, GD_MSG_SET_XATTR_BRICK_FAIL,
+                "Host=%s, Path=%s", host, path, NULL);
+        goto out;
+
+    } else {
+        ret = sys_lremovexattr(path, "trusted.glusterfs.test");
+        if (ret) {
+            snprintf(msg, sizeof(msg),
+                     "Removing test extended"
+                     " attribute failed, reason: %s",
+                     strerror(errno));
+            gf_smsg(this->name, GF_LOG_ERROR, errno, GD_MSG_REMOVE_XATTR_FAIL,
+                    NULL);
+            goto out;
+        }
+    }
+
+    /* Collect the helper's message locally so that it can be released if a
+     * later failure replaces it, instead of being overwritten and leaked. */
+    ret = glusterd_is_path_in_use(path, &in_use, &in_use_msg);
+    if (ret)
+        goto out;
+
+    if (in_use && !is_force) {
+        ret = -1;
+        goto out;
+    }
+
+    if (!is_force)
+        flags = XATTR_CREATE;
+
+    ret = sys_lsetxattr(path, GF_XATTR_VOL_ID_KEY, uuid, 16, flags);
+    if (ret == -1) {
+        snprintf(msg, sizeof(msg),
+                 "Failed to set extended "
+                 "attributes %s, reason: %s",
+                 GF_XATTR_VOL_ID_KEY, strerror(errno));
+        gf_smsg(this->name, GF_LOG_ERROR, errno, GD_MSG_SET_XATTR_FAIL,
+                "Attributes=%s", GF_XATTR_VOL_ID_KEY, NULL);
+        goto out;
+    }
+
+    ret = 0;
+out:
+    if (strlen(msg)) {
+        /* a message built here takes precedence over the helper's */
+        GF_FREE(in_use_msg);
+        *op_errstr = gf_strdup(msg);
+    } else if (in_use_msg) {
+        /* hand ownership of the helper's message to the caller */
+        *op_errstr = in_use_msg;
+    }
+
+    return ret;
+}
+
+/*
+ * Serialise transition number @i of @log into @dict using the keys
+ * "log<count>-old-state", "log<count>-event", "log<count>-new-state" and
+ * "log<count>-time".
+ *
+ * Returns 0 on success, or the non-zero result of the first dict call that
+ * fails (the offending key is logged).
+ */
+static int
+glusterd_sm_tr_log_transition_add_to_dict(dict_t *dict,
+                                          glusterd_sm_tr_log_t *log, int i,
+                                          int count)
+{
+    glusterd_sm_transition_t *tr = NULL;
+    char key[64] = "";
+    char when[GF_TIMESTR_SIZE] = "";
+    char *name = NULL;
+    int keylen = 0;
+    int ret = -1;
+
+    GF_ASSERT(dict);
+    GF_ASSERT(log);
+
+    tr = &log->transitions[i];
+
+    keylen = snprintf(key, sizeof(key), "log%d-old-state", count);
+    name = log->state_name_get(tr->old_state);
+    ret = dict_set_strn(dict, key, keylen, name);
+    if (ret != 0)
+        goto out;
+
+    keylen = snprintf(key, sizeof(key), "log%d-event", count);
+    name = log->event_name_get(tr->event);
+    ret = dict_set_strn(dict, key, keylen, name);
+    if (ret != 0)
+        goto out;
+
+    keylen = snprintf(key, sizeof(key), "log%d-new-state", count);
+    name = log->state_name_get(tr->new_state);
+    ret = dict_set_strn(dict, key, keylen, name);
+    if (ret != 0)
+        goto out;
+
+    /* the timestamp is formatted into a stack buffer, so the dict gets
+     * its own copy */
+    snprintf(key, sizeof(key), "log%d-time", count);
+    gf_time_fmt(when, sizeof(when), tr->time, gf_timefmt_FT);
+    ret = dict_set_dynstr_with_alloc(dict, key, when);
+
+out:
+    if (ret != 0 && key[0] != '\0')
+        gf_smsg("glusterd", GF_LOG_ERROR, errno, GD_MSG_DICT_SET_FAILED,
+                "Key=%s", key, NULL);
+    gf_msg_debug("glusterd", 0, "returning %d", ret);
+    return ret;
+}
+
+/*
+ * Dump every transition held in @circular_log into @dict, oldest first,
+ * and store the number of entries under "count".
+ *
+ * Returns 0 on success (including an empty log, for which nothing is
+ * written), otherwise the non-zero result of the failing step.
+ */
+int
+glusterd_sm_tr_log_add_to_dict(dict_t *dict, glusterd_sm_tr_log_t *circular_log)
+{
+    glusterd_sm_tr_log_t *log = circular_log;
+    int first = 0;
+    int slot = 0;
+    int nth = 0;
+    int ret = -1;
+
+    GF_ASSERT(dict);
+    GF_ASSERT(circular_log);
+
+    if (!log->count)
+        return 0;
+
+    /* once the ring is full, the entry after 'current' is the oldest */
+    if (log->count == log->size)
+        first = log->current + 1;
+
+    for (nth = 0; nth < log->count; nth++) {
+        slot = (first + nth) % log->count;
+        ret = glusterd_sm_tr_log_transition_add_to_dict(dict, log, slot, nth);
+        if (ret != 0)
+            goto out;
+    }
+
+    ret = dict_set_int32n(dict, "count", SLEN("count"), log->count);
+
+out:
+    gf_msg_debug("glusterd", 0, "returning %d", ret);
+    return ret;
+}
+
+/*
+ * Prepare @log to hold up to @size transitions.
+ *
+ * @state_name_get / @event_name_get are stored in the log and used later to
+ * turn state and event numbers into strings.
+ *
+ * Returns 0 on success, -1 on invalid arguments or allocation failure (in
+ * which case @log is not modified).
+ */
+int
+glusterd_sm_tr_log_init(glusterd_sm_tr_log_t *log, char *(*state_name_get)(int),
+                        char *(*event_name_get)(int), size_t size)
+{
+    glusterd_sm_transition_t *ring = NULL;
+    int ret = -1;
+
+    GF_ASSERT(size > 0);
+    GF_ASSERT(log && state_name_get && event_name_get);
+
+    if (size == 0 || !log || !state_name_get || !event_name_get)
+        goto out;
+
+    ring = GF_CALLOC(size, sizeof(*ring), gf_gld_mt_sm_tr_log_t);
+    if (ring == NULL)
+        goto out;
+
+    log->state_name_get = state_name_get;
+    log->event_name_get = event_name_get;
+    log->size = size;
+    log->transitions = ring;
+    ret = 0;
+
+out:
+    gf_msg_debug("glusterd", 0, "returning %d", ret);
+    return ret;
+}
+
+/*
+ * Release the transition storage owned by @log (a NULL @log is ignored).
+ *
+ * The log is left in an empty state: the freed pointer is cleared and the
+ * bookkeeping fields are reset, so a later
+ * glusterd_sm_tr_log_transition_add() or glusterd_sm_tr_log_add_to_dict()
+ * on the same log bails out instead of touching freed memory, and calling
+ * this function twice is harmless.
+ */
+void
+glusterd_sm_tr_log_delete(glusterd_sm_tr_log_t *log)
+{
+    if (!log)
+        return;
+
+    GF_FREE(log->transitions);
+    log->transitions = NULL;
+    log->count = 0;
+    log->current = 0;
+    log->size = 0;
+}
+
+/*
+ * Record a state-machine transition (@old_state -> @new_state caused by
+ * @event) in the ring buffer @log, stamping it with the current time.
+ * Once the ring is full the oldest entry is overwritten.
+ *
+ * Returns 0 on success, -1 when @log is NULL or has no storage.
+ */
+int
+glusterd_sm_tr_log_transition_add(glusterd_sm_tr_log_t *log, int old_state,
+                                  int new_state, int event)
+{
+    xlator_t *this = THIS;
+    glusterd_sm_transition_t *slot = NULL;
+    int pos = 0;
+    int ret = -1;
+
+    GF_ASSERT(this);
+    GF_ASSERT(log);
+
+    if (!log || !log->transitions)
+        goto out;
+
+    /* the very first entry goes to slot 0; afterwards advance and wrap */
+    pos = log->count ? (log->current + 1) % log->size : 0;
+
+    slot = &log->transitions[pos];
+    slot->old_state = old_state;
+    slot->new_state = new_state;
+    slot->event = event;
+    slot->time = gf_time();
+
+    log->current = pos;
+    if (log->count < log->size)
+        log->count++;
+
+    ret = 0;
+    gf_msg_debug(this->name, 0,
+                 "Transitioning from '%s' to '%s' due to event '%s'",
+                 log->state_name_get(old_state), log->state_name_get(new_state),
+                 log->event_name_get(event));
+out:
+    gf_msg_debug(this->name, 0, "returning %d", ret);
+    return ret;
+}
+
+/*
+ * Unlink and free the first pending node in @list whose ->node equals
+ * @elem. Always returns 0, whether or not a matching node was found.
+ */
+int
+glusterd_remove_pending_entry(struct cds_list_head *list, void *elem)
+{
+    glusterd_pending_node_t *cur = NULL;
+    glusterd_pending_node_t *next = NULL;
+    int ret = 0;
+
+    cds_list_for_each_entry_safe(cur, next, list, list)
+    {
+        if (cur->node != elem)
+            continue;
+
+        cds_list_del_init(&cur->list);
+        GF_FREE(cur);
+        goto out;
+    }
+out:
+    gf_msg_debug(THIS->name, 0, "returning %d", ret);
+    return ret;
+}
+
+/* Unlink and free every pending node on @list. Always returns 0. */
+int
+glusterd_clear_pending_nodes(struct cds_list_head *list)
+{
+    glusterd_pending_node_t *cur = NULL;
+    glusterd_pending_node_t *next = NULL;
+
+    /* the _safe iterator is required because 'cur' is freed in the body */
+    cds_list_for_each_entry_safe(cur, next, list, list)
+    {
+        cds_list_del_init(&cur->list);
+        GF_FREE(cur);
+    }
+
+    return 0;
+}
+
+/*
+ * Delete @volinfo from the store and, only if that succeeded, remove it
+ * via glusterd_volinfo_remove().
+ *
+ * Returns the result of glusterd_store_delete_volume().
+ */
+int32_t
+glusterd_delete_volume(glusterd_volinfo_t *volinfo)
+{
+    int ret = -1;
+
+    GF_ASSERT(volinfo);
+
+    ret = glusterd_store_delete_volume(volinfo);
+    if (ret == 0)
+        glusterd_volinfo_remove(volinfo);
+
+    gf_msg_debug(THIS->name, 0, "returning %d", ret);
+    return ret;
+}
+
+/*
+ * Remove @brickinfo from @volinfo: delete its volfile and its store entry,
+ * free the brickinfo and decrement the volume's brick count.
+ *
+ * The results of the individual steps are not checked; always returns 0.
+ * @brickinfo must not be used after this call.
+ */
+int32_t
+glusterd_delete_brick(glusterd_volinfo_t *volinfo,
+                      glusterd_brickinfo_t *brickinfo)
+{
+    glusterd_conf_t *conf = THIS->private;
+    char voldir[PATH_MAX] = "";
+
+    GF_ASSERT(volinfo);
+    GF_ASSERT(brickinfo);
+
+    GLUSTERD_GET_VOLUME_DIR(voldir, volinfo, conf);
+
+    glusterd_delete_volfile(volinfo, brickinfo);
+    glusterd_store_delete_brick(brickinfo, voldir);
+    glusterd_brickinfo_delete(brickinfo);
+    volinfo->brick_count--;
+
+    return 0;
+}
+
+/*
+ * Delete every brick of @volinfo through glusterd_delete_brick().
+ *
+ * Returns the result of the last deletion, or 0 when there were no bricks.
+ */
+int32_t
+glusterd_delete_all_bricks(glusterd_volinfo_t *volinfo)
+{
+    glusterd_brickinfo_t *brick = NULL;
+    glusterd_brickinfo_t *next = NULL;
+    int rc = 0;
+
+    GF_ASSERT(volinfo);
+
+    /* bricks are freed while walking, hence the _safe iterator */
+    cds_list_for_each_entry_safe(brick, next, &volinfo->bricks, brick_list)
+    {
+        rc = glusterd_delete_brick(volinfo, brick);
+    }
+
+    return rc;
+}
+
+/*
+ * Build a single string of "--path=<brick path> " arguments, one for each
+ * brick of @volinfo whose uuid equals MY_UUID.
+ *
+ * @pathlist  set to the newly allocated string only when at least one such
+ *            brick exists; it is left untouched otherwise.
+ *            NOTE(review): the caller presumably has to GF_FREE() it -
+ *            confirm against callers.
+ *
+ * Returns the number of matching bricks (0 if none), -1 on allocation
+ * failure. NULL arguments are logged but also yield 0, because ret is
+ * still 0 at that point.
+ */
+int
+glusterd_get_local_brickpaths(glusterd_volinfo_t *volinfo, char **pathlist)
+{
+    char **path_tokens = NULL;
+    char *tmp_path_list = NULL;
+    char path[PATH_MAX] = "";
+    int32_t count = 0;
+    int32_t pathlen = 0;
+    int32_t total_len = 0;
+    int32_t ret = 0;
+    int i = 0;
+    glusterd_brickinfo_t *brickinfo = NULL;
+
+    if ((!volinfo) || (!pathlist)) {
+        gf_smsg("glusterd", GF_LOG_ERROR, errno, GD_MSG_INVALID_ARGUMENT, NULL);
+        goto out;
+    }
+
+    /* one slot per brick of the volume: enough for all matching bricks */
+    path_tokens = GF_CALLOC(sizeof(char *), volinfo->brick_count,
+                            gf_gld_mt_charptr);
+    if (!path_tokens) {
+        gf_msg("glusterd", GF_LOG_ERROR, ENOMEM, GD_MSG_NO_MEMORY,
+               "Could not allocate memory.");
+        ret = -1;
+        goto out;
+    }
+
+    cds_list_for_each_entry(brickinfo, &volinfo->bricks, brick_list)
+    {
+        /* skip bricks whose uuid differs from ours */
+        if (gf_uuid_compare(brickinfo->uuid, MY_UUID))
+            continue;
+
+        pathlen = snprintf(path, sizeof(path), "--path=%s ", brickinfo->path);
+        if (pathlen < sizeof(path))
+            path[pathlen] = '\0';
+        else
+            path[sizeof(path) - 1] = '\0';
+        path_tokens[count] = gf_strdup(path);
+        if (!path_tokens[count]) {
+            gf_msg("glusterd", GF_LOG_ERROR, ENOMEM, GD_MSG_NO_MEMORY,
+                   "Could not allocate memory.");
+            ret = -1;
+            goto out;
+        }
+        count++;
+        /* pathlen is the untruncated length, so total_len can only
+         * over-estimate the space the tokens need */
+        total_len += pathlen;
+    }
+
+    tmp_path_list = GF_CALLOC(sizeof(char), total_len + 1, gf_gld_mt_char);
+    if (!tmp_path_list) {
+        gf_msg("glusterd", GF_LOG_ERROR, ENOMEM, GD_MSG_NO_MEMORY,
+               "Could not allocate memory.");
+        ret = -1;
+        goto out;
+    }
+
+    /* the buffer is zero-filled, so it starts out as an empty string */
+    for (i = 0; i < count; i++)
+        strcat(tmp_path_list, path_tokens[i]);
+
+    if (count)
+        *pathlist = tmp_path_list;
+
+    ret = count;
+out:
+    /* the per-brick tokens are temporary in every case */
+    if (path_tokens) {
+        for (i = 0; i < count; i++) {
+            GF_FREE(path_tokens[i]);
+        }
+    }
+
+    GF_FREE(path_tokens);
+    path_tokens = NULL;
+
+    /* with no matching brick the joined buffer was not handed out: free it */
+    if (ret == 0) {
+        gf_msg_debug("glusterd", 0, "No Local Bricks Present.");
+        GF_FREE(tmp_path_list);
+        tmp_path_list = NULL;
+    }
+
+    gf_msg_debug("glusterd", 0, "Returning %d", ret);
+    return ret;
+}
+
+/*
+ * Start the geo-replication (gsyncd) monitor for the session between
+ * @master_vol and @slave.
+ *
+ * @path_list          "--path=..." arguments of the local bricks; when NULL
+ *                     nothing is started and 0 is returned.
+ * @conf_path          gsyncd configuration file passed with "-c".
+ * @glusterd_uuid_str  not used by this function.
+ * @op_errstr          receives an allocated error message on failure.
+ * @is_pause           adds "--pause-on-start" to the monitor command line.
+ *
+ * Two gsyncd invocations are made: one that sets the "session-owner" config
+ * value to the master volume id, and one that launches the monitor. The big
+ * lock is released around each runner_run() call and re-taken afterwards.
+ *
+ * Returns 0 on success, -1 on failure; when gsync_status() reports status 0
+ * its return value is passed through unchanged.
+ */
+int
+glusterd_start_gsync(glusterd_volinfo_t *master_vol, char *slave,
+                     char *path_list, char *conf_path, char *glusterd_uuid_str,
+                     char **op_errstr, gf_boolean_t is_pause)
+{
+    int32_t ret = 0;
+    int32_t status = 0;
+    char uuid_str[64] = "";
+    runner_t runner = {
+        0,
+    };
+    xlator_t *this = NULL;
+    glusterd_conf_t *priv = NULL;
+    int errcode = 0;
+    gf_boolean_t is_template_in_use = _gf_false;
+
+    this = THIS;
+    GF_ASSERT(this);
+    priv = this->private;
+    GF_ASSERT(priv);
+
+    uuid_utoa_r(MY_UUID, uuid_str);
+
+    if (!path_list) {
+        ret = 0;
+        gf_msg_debug("glusterd", 0,
+                     "No Bricks in this node."
+                     " Not starting gsyncd.");
+        goto out;
+    }
+
+    /* NOTE(review): status == 0 presumably means gsyncd is already running
+     * for this session - confirm against gsync_status(). */
+    ret = gsync_status(master_vol->volname, slave, conf_path, &status,
+                       &is_template_in_use);
+    if (status == 0)
+        goto out;
+
+    if (is_template_in_use == _gf_true) {
+        gf_asprintf(op_errstr,
+                    GEOREP
+                    " start failed for %s %s : "
+                    "pid-file entry missing in config file",
+                    master_vol->volname, slave);
+        ret = -1;
+        goto out;
+    }
+
+    /* uuid_str is reused: from here on it holds the master volume id */
+    uuid_utoa_r(master_vol->volume_id, uuid_str);
+    runinit(&runner);
+    runner_add_args(&runner, GSYNCD_PREFIX "/gsyncd", path_list, "-c", NULL);
+    runner_argprintf(&runner, "%s", conf_path);
+    runner_argprintf(&runner, ":%s", master_vol->volname);
+    runner_add_args(&runner, slave, "--config-set", "session-owner", NULL);
+    runner_argprintf(&runner, "--value=%s", uuid_str);
+    /* the big lock is not held while the external command runs */
+    synclock_unlock(&priv->big_lock);
+    ret = runner_run(&runner);
+    synclock_lock(&priv->big_lock);
+    if (ret == -1) {
+        /* flag the failure so that 'out' fills in a generic message */
+        errcode = -1;
+        goto out;
+    }
+
+    runinit(&runner);
+    runner_add_args(&runner, GSYNCD_PREFIX "/gsyncd", path_list, "--monitor",
+                    "-c", NULL);
+    runner_argprintf(&runner, "%s", conf_path);
+    runner_argprintf(&runner, "--iprefix=%s", DATADIR);
+    runner_argprintf(&runner, ":%s", master_vol->volname);
+    runner_argprintf(&runner, "--glusterd-uuid=%s", uuid_utoa(priv->uuid));
+    runner_add_arg(&runner, slave);
+    if (is_pause)
+        runner_add_arg(&runner, "--pause-on-start");
+    synclock_unlock(&priv->big_lock);
+    ret = runner_run(&runner);
+    synclock_lock(&priv->big_lock);
+    if (ret == -1) {
+        gf_asprintf(op_errstr, GEOREP " start failed for %s %s",
+                    master_vol->volname, slave);
+        goto out;
+    }
+
+    ret = 0;
+
+out:
+    /* NOTE(review): op_errstr is NULL-checked only here; the gf_asprintf()
+     * calls above pass it unchecked. */
+    if ((ret != 0) && errcode == -1) {
+        if (op_errstr)
+            *op_errstr = gf_strdup(
+                "internal error, cannot start "
+                "the " GEOREP " session");
+    }
+
+    gf_msg_debug("glusterd", 0, "Returning %d", ret);
+    return ret;
+}
+
+/*
+ * Regenerate the brick, trusted-client and client volfiles of every volume
+ * in @conf. A failure is logged and the remaining volfiles are still
+ * attempted.
+ *
+ * Returns 0 when everything was regenerated, otherwise the result of the
+ * last generation step that failed.
+ */
+int32_t
+glusterd_recreate_volfiles(glusterd_conf_t *conf)
+{
+    glusterd_volinfo_t *vol = NULL;
+    int rc = 0;
+    int last_err = 0;
+
+    GF_ASSERT(conf);
+
+    cds_list_for_each_entry(vol, &conf->volumes, vol_list)
+    {
+        rc = generate_brick_volfiles(vol);
+        if (rc != 0) {
+            gf_msg("glusterd", GF_LOG_ERROR, 0, GD_MSG_VOLFILE_CREATE_FAIL,
+                   "Failed to regenerate brick volfiles for %s",
+                   vol->volname);
+            last_err = rc;
+        }
+
+        rc = generate_client_volfiles(vol, GF_CLIENT_TRUSTED);
+        if (rc != 0) {
+            gf_msg("glusterd", GF_LOG_ERROR, 0, GD_MSG_VOLFILE_CREATE_FAIL,
+                   "Failed to regenerate trusted client volfiles for %s",
+                   vol->volname);
+            last_err = rc;
+        }
+
+        rc = generate_client_volfiles(vol, GF_CLIENT_OTHER);
+        if (rc != 0) {
+            gf_msg("glusterd", GF_LOG_ERROR, 0, GD_MSG_VOLFILE_CREATE_FAIL,
+                   "Failed to regenerate client volfiles for %s",
+                   vol->volname);
+            last_err = rc;
+        }
+    }
+
+    return last_err;
+}
+
+/*
+ * Act on the upgrade / downgrade start-up flags.
+ *
+ * Setting both flags is an error (-1). With @upgrade set, all volfiles are
+ * regenerated. Whenever exactly one of the flags is set and no error
+ * occurred, the process sends itself SIGTERM. @options is not used.
+ *
+ * Returns 0 on success, -1 on conflicting flags, or the result of
+ * glusterd_recreate_volfiles().
+ */
+int32_t
+glusterd_handle_upgrade_downgrade(dict_t *options, glusterd_conf_t *conf,
+                                  gf_boolean_t upgrade, gf_boolean_t downgrade)
+{
+    int ret = 0;
+
+    if (upgrade && downgrade) {
+        gf_msg("glusterd", GF_LOG_ERROR, 0, GD_MSG_WRONG_OPTS_SETTING,
+               "Both upgrade and downgrade options are set. Only one should "
+               "be on");
+        return -1;
+    }
+
+    if (upgrade == _gf_true)
+        ret = glusterd_recreate_volfiles(conf);
+
+    /* an upgrade/downgrade run ends once its work succeeded */
+    if ((upgrade || downgrade) && (ret == 0))
+        kill(getpid(), SIGTERM);
+
+    return ret;
+}
+
+/* Return 1 when cluster @type is GF_CLUSTER_TYPE_REPLICATE, otherwise 0. */
+static inline int
+glusterd_is_replica_volume(int type)
+{
+    return (type == GF_CLUSTER_TYPE_REPLICATE) ? 1 : 0;
+}
+/* Tell whether the type of @volinfo is the replicate cluster type. */
+gf_boolean_t
+glusterd_is_volume_replicate(glusterd_volinfo_t *volinfo)
+{
+    int vol_type = volinfo->type;
+
+    return glusterd_is_replica_volume(vol_type);
+}
+
+/*
+ * Return _gf_true for the cluster types accepted here (replicate and
+ * disperse), _gf_false for every other @type.
+ */
+gf_boolean_t
+glusterd_is_shd_compatible_type(int type)
+{
+    if (type == GF_CLUSTER_TYPE_REPLICATE || type == GF_CLUSTER_TYPE_DISPERSE)
+        return _gf_true;
+
+    return _gf_false;
+}
+
+/* Apply glusterd_is_shd_compatible_type() to the type of @volinfo. */
+gf_boolean_t
+glusterd_is_shd_compatible_volume(glusterd_volinfo_t *volinfo)
+{
+    int vol_type = volinfo->type;
+
+    return glusterd_is_shd_compatible_type(vol_type);
+}
+
+/*
+ * Write the statedump options file @dumpoptions_path from the
+ * space-separated list @options.
+ *
+ * Each token is written as "<token>=yes" on its own line, except the first
+ * occurrence of the nfs service name, which is skipped. If that name shows
+ * up a second time the file is removed again and 0 is returned. Nothing is
+ * written when @option_cnt is 0 or when the only option is "nfs ".
+ *
+ * Returns 0 on success, -1 when @options cannot be duplicated or the file
+ * cannot be opened.
+ */
+int
+glusterd_set_dump_options(char *dumpoptions_path, char *options, int option_cnt)
+{
+    int ret = 0;
+    char *dup_options = NULL;
+    char *option = NULL;
+    char *tmpptr = NULL;
+    FILE *fp = NULL;
+    int nfs_cnt = 0;
+    xlator_t *this = NULL;
+    glusterd_conf_t *priv = NULL;
+
+    this = THIS;
+    GF_ASSERT(this);
+
+    priv = this->private;
+    GF_ASSERT(priv);
+
+    if (0 == option_cnt || (option_cnt == 1 && (!strcmp(options, "nfs ")))) {
+        ret = 0;
+        goto out;
+    }
+
+    /* Duplicate before creating the file: strtok_r() needs a writable copy,
+     * and an allocation failure must neither be reported as success nor
+     * leave an empty options file behind. */
+    dup_options = gf_strdup(options);
+    if (!dup_options) {
+        gf_smsg(this->name, GF_LOG_ERROR, errno, GD_MSG_STRDUP_FAILED,
+                "options=%s", options, NULL);
+        ret = -1;
+        goto out;
+    }
+
+    fp = fopen(dumpoptions_path, "w");
+    if (!fp) {
+        ret = -1;
+        goto out;
+    }
+
+    gf_msg("glusterd", GF_LOG_INFO, 0, GD_MSG_STATEDUMP_OPTS_RCVD,
+           "Received following statedump options: %s", dup_options);
+    option = strtok_r(dup_options, " ", &tmpptr);
+    while (option) {
+        if (!strcmp(option, priv->nfs_svc.name)) {
+            if (nfs_cnt > 0) {
+                /* service name repeated: discard the options file */
+                sys_unlink(dumpoptions_path);
+                ret = 0;
+                goto out;
+            }
+            nfs_cnt++;
+            option = strtok_r(NULL, " ", &tmpptr);
+            continue;
+        }
+        fprintf(fp, "%s=yes\n", option);
+        option = strtok_r(NULL, " ", &tmpptr);
+    }
+
+out:
+    if (fp)
+        fclose(fp);
+    GF_FREE(dup_options);
+    return ret;
+}
+
+/*
+ * Send signal @sig to the process of brick @brickinfo of @volinfo.
+ *
+ * The pid is read from the brick's pidfile. For SIGUSR1 the statedump
+ * options file "glusterdump.<pid>.options" is written from @options /
+ * @option_cnt before the signal is sent and removed again one second
+ * later. Bricks whose uuid differs from MY_UUID are skipped with a 0
+ * return. @op_errstr is not used.
+ *
+ * Returns 0 on success and -1 on failure.
+ * NOTE(review): when the pidfile contains pid 0 the function returns the
+ * positive fscanf() result rather than 0 or -1 - confirm whether callers
+ * expect that.
+ */
+static int
+glusterd_brick_signal(glusterd_volinfo_t *volinfo,
+                      glusterd_brickinfo_t *brickinfo, char *options,
+                      int option_cnt, char **op_errstr, int sig)
+{
+    int ret = -1;
+    xlator_t *this = NULL;
+    glusterd_conf_t *conf = NULL;
+    char pidfile_path[PATH_MAX] = "";
+    char dumpoptions_path[PATH_MAX] = "";
+    FILE *pidfile = NULL;
+    pid_t pid = -1;
+
+    this = THIS;
+    GF_ASSERT(this);
+    conf = this->private;
+    GF_ASSERT(conf);
+
+    if (gf_uuid_is_null(brickinfo->uuid)) {
+        ret = glusterd_resolve_brick(brickinfo);
+        if (ret) {
+            gf_msg("glusterd", GF_LOG_ERROR, 0, GD_MSG_RESOLVE_BRICK_FAIL,
+                   "Cannot resolve brick %s:%s", brickinfo->hostname,
+                   brickinfo->path);
+            goto out;
+        }
+    }
+
+    /* brick uuid is not ours: nothing to signal from here */
+    if (gf_uuid_compare(brickinfo->uuid, MY_UUID)) {
+        ret = 0;
+        goto out;
+    }
+
+    GLUSTERD_GET_BRICK_PIDFILE(pidfile_path, volinfo, brickinfo, conf);
+
+    /* TBD: use gf_is_service_running instead of almost-identical code? */
+    pidfile = fopen(pidfile_path, "r");
+    if (!pidfile) {
+        gf_msg("glusterd", GF_LOG_ERROR, errno, GD_MSG_FILE_OP_FAILED,
+               "Unable to open pidfile: %s", pidfile_path);
+        ret = -1;
+        goto out;
+    }
+
+    ret = fscanf(pidfile, "%d", &pid);
+    if (ret <= 0) {
+        gf_msg("glusterd", GF_LOG_ERROR, errno, GD_MSG_FILE_OP_FAILED,
+               "Unable to get pid of brick process");
+        ret = -1;
+        goto out;
+    }
+
+    /* kill(0, sig) would signal our own process group */
+    if (pid == 0) {
+        gf_msg("glusterd", GF_LOG_WARNING, 0, GD_MSG_NO_SIG_TO_PID_ZERO,
+               "refusing to send signal %d to pid zero", sig);
+        goto out;
+    }
+
+    if (sig == SIGUSR1) {
+        snprintf(dumpoptions_path, sizeof(dumpoptions_path),
+                 DEFAULT_VAR_RUN_DIRECTORY "/glusterdump.%d.options", pid);
+        ret = glusterd_set_dump_options(dumpoptions_path, options, option_cnt);
+        if (ret < 0) {
+            gf_msg("glusterd", GF_LOG_ERROR, 0, GD_MSG_BRK_STATEDUMP_FAIL,
+                   "error while parsing the statedump "
+                   "options");
+            ret = -1;
+            goto out;
+        }
+    }
+
+    gf_msg("glusterd", GF_LOG_INFO, 0, GD_MSG_STATEDUMP_INFO,
+           "sending signal %d to brick with pid %d", sig, pid);
+
+    /* the result of kill() is not checked */
+    kill(pid, sig);
+
+    /* NOTE(review): the pause presumably gives the brick time to read the
+     * options file before it is removed - confirm. For signals other than
+     * SIGUSR1 dumpoptions_path is still empty at this point. */
+    sleep(1);
+    sys_unlink(dumpoptions_path);
+    ret = 0;
+out:
+    if (pidfile)
+        fclose(pidfile);
+    return ret;
+}
+
+/*
+ * Request a statedump from the brick @brickinfo of @volinfo by sending it
+ * SIGUSR1 through glusterd_brick_signal(); the result is passed through.
+ */
+int
+glusterd_brick_statedump(glusterd_volinfo_t *volinfo,
+                         glusterd_brickinfo_t *brickinfo, char *options,
+                         int option_cnt, char **op_errstr)
+{
+    int ret = glusterd_brick_signal(volinfo, brickinfo, options, option_cnt,
+                                    op_errstr, SIGUSR1);
+
+    return ret;
+}
+
+/*
+ * Ask the brick @brickinfo of @volinfo to terminate by sending it SIGTERM
+ * through glusterd_brick_signal(); the result is passed through.
+ */
+int
+glusterd_brick_terminate(glusterd_volinfo_t *volinfo,
+                         glusterd_brickinfo_t *brickinfo, char *options,
+                         int option_cnt, char **op_errstr)
+{
+    int ret = glusterd_brick_signal(volinfo, brickinfo, options, option_cnt,
+                                    op_errstr, SIGTERM);
+
+    return ret;
+}
+
+#ifdef BUILD_GNFS
+/*
+ * Trigger a statedump of the gluster NFS server.
+ *
+ * @options must start with the nfs service name; the remaining tokens are
+ * written to "glusterdump.<pid>.options" by glusterd_set_dump_options().
+ * The pid is read from the NFS pidfile, SIGUSR1 is sent, and the options
+ * file is removed one second later.
+ *
+ * Returns 0 on success and -1 on failure. *op_errstr is set (allocated)
+ * only when the options do not start with the nfs key.
+ */
+int
+glusterd_nfs_statedump(char *options, int option_cnt, char **op_errstr)
+{
+    int ret = -1;
+    xlator_t *this = NULL;
+    glusterd_conf_t *conf = NULL;
+    char pidfile_path[PATH_MAX] = "";
+    FILE *pidfile = NULL;
+    pid_t pid = -1;
+    char dumpoptions_path[PATH_MAX] = "";
+    char *option = NULL;
+    char *tmpptr = NULL;
+    char *dup_options = NULL;
+    char msg[256] = "";
+
+    this = THIS;
+    GF_ASSERT(this);
+    conf = this->private;
+    GF_ASSERT(conf);
+
+    dup_options = gf_strdup(options);
+
+    if (!dup_options) {
+        goto out;
+    }
+    option = strtok_r(dup_options, " ", &tmpptr);
+    /* strtok_r() returns NULL for an empty or all-blank option string */
+    if (!option || strcmp(option, conf->nfs_svc.name)) {
+        snprintf(msg, sizeof(msg),
+                 "for nfs statedump, options should"
+                 " be after the key nfs");
+        gf_smsg("glusterd", GF_LOG_ERROR, errno, GD_MSG_INVALID_ENTRY,
+                "Options misplaced", NULL);
+        *op_errstr = gf_strdup(msg);
+        ret = -1;
+        goto out;
+    }
+
+    GLUSTERD_GET_NFS_PIDFILE(pidfile_path, conf);
+
+    pidfile = fopen(pidfile_path, "r");
+    if (!pidfile) {
+        gf_msg("glusterd", GF_LOG_ERROR, errno, GD_MSG_FILE_OP_FAILED,
+               "Unable to open pidfile: %s", pidfile_path);
+        ret = -1;
+        goto out;
+    }
+
+    ret = fscanf(pidfile, "%d", &pid);
+    if (ret <= 0) {
+        gf_msg("glusterd", GF_LOG_ERROR, errno, GD_MSG_FILE_OP_FAILED,
+               "Unable to get pid of brick process");
+        ret = -1;
+        goto out;
+    }
+
+    /* kill() with pid 0 or a negative pid targets process groups (possibly
+     * including glusterd itself), never just the NFS server */
+    if (pid <= 0) {
+        gf_msg("glusterd", GF_LOG_WARNING, 0, GD_MSG_NO_SIG_TO_PID_ZERO,
+               "refusing to send signal %d to invalid pid %d", SIGUSR1, pid);
+        ret = -1;
+        goto out;
+    }
+
+    snprintf(dumpoptions_path, sizeof(dumpoptions_path),
+             DEFAULT_VAR_RUN_DIRECTORY "/glusterdump.%d.options", pid);
+    ret = glusterd_set_dump_options(dumpoptions_path, options, option_cnt);
+    if (ret < 0) {
+        gf_msg("glusterd", GF_LOG_ERROR, 0, GD_MSG_BRK_STATEDUMP_FAIL,
+               "error while parsing the statedump "
+               "options");
+        ret = -1;
+        goto out;
+    }
+
+    gf_msg("glusterd", GF_LOG_INFO, 0, GD_MSG_STATEDUMP_INFO,
+           "Performing statedump on nfs server with "
+           "pid %d",
+           pid);
+
+    kill(pid, SIGUSR1);
+
+    sleep(1);
+    /* coverity[TAINTED_STRING] */
+    sys_unlink(dumpoptions_path);
+    ret = 0;
+out:
+    if (pidfile)
+        fclose(pidfile);
+    GF_FREE(dup_options);
+    return ret;
+}
+#endif
+
+/*
+ * Request a statedump from a gluster client process.
+ *
+ * @volname:    volume name, forwarded to the submit request.
+ * @options:    space separated string; the tokens consumed here are
+ *              "client", then the target ip address, then the pid.
+ * @option_cnt: not used by this function.
+ * @op_errstr:  on a malformed @options string, set to a gf_strdup()'d
+ *              description of the problem.
+ *
+ * Returns -1 when @options cannot be duplicated or is malformed, otherwise
+ * the return value of glusterd_client_statedump_submit_req().
+ */
+int
+glusterd_client_statedump(char *volname, char *options, int option_cnt,
+                          char **op_errstr)
+{
+    int ret = -1;
+    char *dup_options = NULL;
+    char *option = NULL;
+    char *tmpptr = NULL;
+    char msg[256] = "";
+    char *target_ip = NULL;
+    char *pid = NULL;
+
+    /* strtok_r() modifies its input, so tokenize a private copy */
+    dup_options = gf_strdup(options);
+    if (!dup_options) {
+        gf_smsg("glusterd", GF_LOG_ERROR, errno, GD_MSG_STRDUP_FAILED,
+                "options=%s", options, NULL);
+        /* ret is still -1: an allocation failure must not look like success */
+        goto out;
+    }
+    option = strtok_r(dup_options, " ", &tmpptr);
+    /* strtok_r() returns NULL for an empty or all-blank string; never hand
+     * that to strcmp() */
+    if (!option || strcmp(option, "client")) {
+        snprintf(msg, sizeof(msg),
+                 "for gluster client statedump, options "
+                 "should be after the key 'client'");
+        gf_smsg("glusterd", GF_LOG_ERROR, errno, GD_MSG_INVALID_ENTRY,
+                "Options misplaced", NULL);
+        *op_errstr = gf_strdup(msg);
+        ret = -1;
+        goto out;
+    }
+    target_ip = strtok_r(NULL, " ", &tmpptr);
+    if (target_ip == NULL) {
+        snprintf(msg, sizeof(msg), "ip address not specified");
+        gf_smsg("glusterd", GF_LOG_ERROR, errno, GD_MSG_INVALID_ENTRY, msg,
+                NULL);
+        *op_errstr = gf_strdup(msg);
+        ret = -1;
+        goto out;
+    }
+
+    pid = strtok_r(NULL, " ", &tmpptr);
+    if (pid == NULL) {
+        snprintf(msg, sizeof(msg), "pid not specified");
+        gf_smsg("glusterd", GF_LOG_ERROR, errno, GD_MSG_INVALID_ENTRY, msg,
+                NULL);
+        *op_errstr = gf_strdup(msg);
+        ret = -1;
+        goto out;
+    }
+
+    ret = glusterd_client_statedump_submit_req(volname, target_ip, pid);
+out:
+    GF_FREE(dup_options);
+    return ret;
+}
+
+/*
+ * Trigger a statedump of the quota daemon.
+ *
+ * The first token of @options must equal conf->quotad_svc.name. The daemon's
+ * pid is read from its pidfile, the dump options are written to
+ * DEFAULT_VAR_RUN_DIRECTORY "/glusterdump.<pid>.options" through
+ * glusterd_set_dump_options(), SIGUSR1 is sent to the daemon, and after one
+ * second the options file is unlinked again.
+ *
+ * @options:    space separated statedump options.
+ * @option_cnt: forwarded to glusterd_set_dump_options().
+ * @op_errstr:  set to a gf_strdup()'d message when the key is misplaced.
+ *
+ * Returns 0 on success, -1 on failure.
+ */
+int
+glusterd_quotad_statedump(char *options, int option_cnt, char **op_errstr)
+{
+    int ret = -1;
+    xlator_t *this = NULL;
+    glusterd_conf_t *conf = NULL;
+    char pidfile_path[PATH_MAX] = "";
+    FILE *pidfile = NULL;
+    pid_t pid = -1;
+    char dumpoptions_path[PATH_MAX] = "";
+    char *option = NULL;
+    char *tmpptr = NULL;
+    char *dup_options = NULL;
+    char msg[256] = "";
+
+    this = THIS;
+    GF_ASSERT(this);
+    conf = this->private;
+    GF_ASSERT(conf);
+
+    /* strtok_r() modifies its input, so tokenize a private copy */
+    dup_options = gf_strdup(options);
+    if (!dup_options) {
+        goto out;
+    }
+    option = strtok_r(dup_options, " ", &tmpptr);
+    /* strtok_r() returns NULL for an empty or all-blank string; never hand
+     * that to strcmp() */
+    if (!option || strcmp(option, conf->quotad_svc.name)) {
+        snprintf(msg, sizeof(msg),
+                 "for quotad statedump, options "
+                 "should be after the key 'quotad'");
+        gf_smsg(this->name, GF_LOG_ERROR, errno, GD_MSG_INVALID_ENTRY,
+                "Options misplaced", NULL);
+        *op_errstr = gf_strdup(msg);
+        ret = -1;
+        goto out;
+    }
+
+    GLUSTERD_GET_QUOTAD_PIDFILE(pidfile_path, conf);
+
+    pidfile = fopen(pidfile_path, "r");
+    if (!pidfile) {
+        gf_msg(this->name, GF_LOG_ERROR, errno, GD_MSG_FILE_OP_FAILED,
+               "Unable to open pidfile: %s", pidfile_path);
+        ret = -1;
+        goto out;
+    }
+
+    ret = fscanf(pidfile, "%d", &pid);
+    /* A pid <= 0 must be rejected: kill() with 0 or a negative pid targets
+     * a process group or every process instead of a single daemon. */
+    if (ret <= 0 || pid <= 0) {
+        gf_msg(this->name, GF_LOG_ERROR, errno, GD_MSG_FILE_OP_FAILED,
+               "Unable to get pid of quotad "
+               "process");
+        ret = -1;
+        goto out;
+    }
+
+    snprintf(dumpoptions_path, sizeof(dumpoptions_path),
+             DEFAULT_VAR_RUN_DIRECTORY "/glusterdump.%d.options", pid);
+    ret = glusterd_set_dump_options(dumpoptions_path, options, option_cnt);
+    if (ret < 0) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_BRK_STATEDUMP_FAIL,
+               "error while parsing "
+               "statedump options");
+        ret = -1;
+        goto out;
+    }
+
+    gf_msg(this->name, GF_LOG_INFO, 0, GD_MSG_STATEDUMP_INFO,
+           "Performing statedump on quotad with "
+           "pid %d",
+           pid);
+
+    kill(pid, SIGUSR1);
+
+    /* presumably gives the daemon time to read the options file before it
+     * is removed -- TODO confirm */
+    sleep(1);
+
+    /* coverity[TAINTED_STRING] */
+    sys_unlink(dumpoptions_path);
+    ret = 0;
+out:
+    if (pidfile)
+        fclose(pidfile);
+    GF_FREE(dup_options);
+    return ret;
+}
+
+/* Reports how many of the given volume's bricks belong to the given peer.
+ * Returns,
+ *   2 - every brick of the volume belongs to the peer
+ *   1 - at least one, but not every, brick belongs to the peer
+ *   0 - no brick of the volume belongs to the peer
+ */
+int
+glusterd_friend_contains_vol_bricks(glusterd_volinfo_t *volinfo,
+                                    uuid_t friend_uuid)
+{
+    glusterd_brickinfo_t *brick = NULL;
+    int matched = 0;
+    int result = 0;
+
+    GF_ASSERT(volinfo);
+
+    /* Count the bricks whose owner uuid equals the peer's uuid */
+    cds_list_for_each_entry(brick, &volinfo->bricks, brick_list)
+    {
+        if (gf_uuid_compare(brick->uuid, friend_uuid) == 0)
+            matched++;
+    }
+
+    if (matched == 0)
+        result = 0;
+    else if (matched == volinfo->brick_count)
+        result = 2;
+    else
+        result = 1;
+
+    gf_msg_debug(THIS->name, 0, "Returning %d", result);
+    return result;
+}
+
+/* Checks if the given peer owns any brick of the volumes in the given
+ * snapshot.
+ * Returns,
+ *   1 - if the peer owns at least 1 brick
+ *   0 - if the peer owns no bricks
+ *  -1 - if snapinfo is NULL
+ */
+int
+glusterd_friend_contains_snap_bricks(glusterd_snap_t *snapinfo,
+                                     uuid_t friend_uuid)
+{
+    glusterd_volinfo_t *snap_vol = NULL;
+    glusterd_brickinfo_t *brick = NULL;
+    int matched = 0;
+    int ret = -1;
+
+    GF_VALIDATE_OR_GOTO("glusterd", snapinfo, out);
+
+    /* Walk every brick of every volume held by the snapshot */
+    cds_list_for_each_entry(snap_vol, &snapinfo->volumes, vol_list)
+    {
+        cds_list_for_each_entry(brick, &snap_vol->bricks, brick_list)
+        {
+            if (gf_uuid_compare(brick->uuid, friend_uuid) == 0)
+                matched++;
+        }
+    }
+
+    ret = (matched > 0) ? 1 : 0;
+
+out:
+    gf_msg_debug(THIS->name, 0, "Returning %d", ret);
+    return ret;
+}
+
+/* Remove volumes that became stale after a peer was detached: a volume whose
+ * bricks all belong to the detached peer (uuid) is deleted. For volumes with
+ * no brick on this node, the snapd service (non-snapshot volumes only) and
+ * the shd service (shd compatible volumes only) are stopped first. Finally
+ * all daemon services are reconfigured; a reconfigure failure is only logged.
+ *
+ * Returns 0 on success, or the error from glusterd_delete_volume().
+ */
+int
+glusterd_friend_remove_cleanup_vols(uuid_t uuid)
+{
+    glusterd_conf_t *priv = THIS->private;
+    glusterd_volinfo_t *vol = NULL;
+    glusterd_volinfo_t *next_vol = NULL;
+    glusterd_svc_t *daemon = NULL;
+    int ret = -1;
+
+    GF_ASSERT(priv);
+
+    /* The _safe iterator is required because entries may be deleted */
+    cds_list_for_each_entry_safe(vol, next_vol, &priv->volumes, vol_list)
+    {
+        if (glusterd_friend_contains_vol_bricks(vol, MY_UUID) == 0) {
+            /* Stop snapd daemon service if snapd daemon is running */
+            if (!vol->is_snap_volume) {
+                daemon = &(vol->snapd.svc);
+                ret = daemon->stop(daemon, SIGTERM);
+                if (ret != 0) {
+                    gf_msg(THIS->name, GF_LOG_ERROR, 0, GD_MSG_SVC_STOP_FAIL,
+                           "Failed to stop snapd daemon service");
+                }
+            }
+
+            if (glusterd_is_shd_compatible_volume(vol)) {
+                /*
+                 * A stop request goes out for every volume, so it is
+                 * fine to send stop for mux shd as well.
+                 */
+                daemon = &(vol->shd.svc);
+                ret = daemon->stop(daemon, SIGTERM);
+                if (ret != 0) {
+                    gf_msg(THIS->name, GF_LOG_ERROR, 0, GD_MSG_SVC_STOP_FAIL,
+                           "Failed to stop shd daemon service");
+                }
+            }
+        }
+
+        /* Only volumes living entirely on the detached peer are stale */
+        if (glusterd_friend_contains_vol_bricks(vol, uuid) != 2)
+            continue;
+
+        gf_msg(THIS->name, GF_LOG_INFO, 0, GD_MSG_STALE_VOL_DELETE_INFO,
+               "Deleting stale volume %s", vol->volname);
+        ret = glusterd_delete_volume(vol);
+        if (ret != 0) {
+            gf_msg(THIS->name, GF_LOG_ERROR, 0, GD_MSG_STALE_VOL_REMOVE_FAIL,
+                   "Error deleting stale volume");
+            goto out;
+        }
+    }
+
+    /* Reconfigure all daemon services upon peer detach */
+    if (glusterd_svcs_reconfigure(NULL) != 0) {
+        gf_msg(THIS->name, GF_LOG_ERROR, 0, GD_MSG_SVC_STOP_FAIL,
+               "Failed to reconfigure all daemon services.");
+    }
+    ret = 0;
+out:
+    gf_msg_debug(THIS->name, 0, "Returning %d", ret);
+    return ret;
+}
+
+/* Write "<volume dir>/<volname>-bitd.vol" into filepath, which must provide
+ * room for PATH_MAX bytes.
+ * Returns 0 on success, -1 if snprintf() failed or the path was truncated.
+ */
+int
+glusterd_get_bitd_filepath(char *filepath, glusterd_volinfo_t *volinfo)
+{
+    char voldir[PATH_MAX] = "";
+    glusterd_conf_t *priv = THIS->private;
+    int32_t written = 0;
+
+    GLUSTERD_GET_VOLUME_DIR(voldir, volinfo, priv);
+
+    written = snprintf(filepath, PATH_MAX, "%s/%s-bitd.vol", voldir,
+                       volinfo->volname);
+    if (written < 0 || written >= PATH_MAX)
+        return -1;
+
+    return 0;
+}
+
+/* Write the client volfile path "<volume dir>/<volname>.<tcp|rdma>-fuse.vol"
+ * into filepath, which must provide room for PATH_MAX bytes.
+ * Returns 0 on success, -1 for any transport type other than
+ * GF_TRANSPORT_TCP / GF_TRANSPORT_RDMA or when the path does not fit.
+ */
+int
+glusterd_get_client_filepath(char *filepath, glusterd_volinfo_t *volinfo,
+                             gf_transport_type type)
+{
+    char voldir[PATH_MAX] = "";
+    glusterd_conf_t *priv = THIS->private;
+    const char *transport = NULL;
+    int32_t written = 0;
+
+    GLUSTERD_GET_VOLUME_DIR(voldir, volinfo, priv);
+
+    if (type == GF_TRANSPORT_TCP)
+        transport = "tcp";
+    else if (type == GF_TRANSPORT_RDMA)
+        transport = "rdma";
+    else
+        return -1; /* unsupported transport: filepath is left untouched */
+
+    written = snprintf(filepath, PATH_MAX, "%s/%s.%s-fuse.vol", voldir,
+                       volinfo->volname, transport);
+    if (written < 0 || written >= PATH_MAX)
+        return -1;
+
+    return 0;
+}
+
+/* Write the trusted client volfile path
+ * "<volume dir>/trusted-<volname>.<tcp|rdma>-fuse.vol" into filepath, which
+ * must provide room for PATH_MAX bytes.
+ * Returns 0 on success, -1 for any transport type other than
+ * GF_TRANSPORT_TCP / GF_TRANSPORT_RDMA or when the path does not fit.
+ */
+int
+glusterd_get_trusted_client_filepath(char *filepath,
+                                     glusterd_volinfo_t *volinfo,
+                                     gf_transport_type type)
+{
+    char voldir[PATH_MAX] = "";
+    glusterd_conf_t *priv = THIS->private;
+    const char *transport = NULL;
+    int32_t written = 0;
+
+    GLUSTERD_GET_VOLUME_DIR(voldir, volinfo, priv);
+
+    if (type == GF_TRANSPORT_TCP)
+        transport = "tcp";
+    else if (type == GF_TRANSPORT_RDMA)
+        transport = "rdma";
+    else
+        return -1; /* unsupported transport: filepath is left untouched */
+
+    written = snprintf(filepath, PATH_MAX, "%s/trusted-%s.%s-fuse.vol", voldir,
+                       volinfo->volname, transport);
+    if (written < 0 || written >= PATH_MAX)
+        return -1;
+
+    return 0;
+}
+
+/* Write a client volfile path under /tmp into filepath, which must provide
+ * room for PATH_MAX bytes:
+ *   GF_TRANSPORT_TCP, GF_TRANSPORT_BOTH_TCP_RDMA -> /tmp/<volname>.tcp-fuse.vol
+ *   GF_TRANSPORT_RDMA                            -> /tmp/<volname>.rdma-fuse.vol
+ *
+ * Returns 0 on success, -1 for any other transport type or when the path
+ * does not fit (the same truncation check the other *_filepath helpers do).
+ *
+ * NOTE(review): the resulting name under /tmp is predictable; confirm that
+ * callers create the file safely.
+ */
+int
+glusterd_get_dummy_client_filepath(char *filepath, glusterd_volinfo_t *volinfo,
+                                   gf_transport_type type)
+{
+    int ret = 0;
+    int32_t len = 0;
+
+    switch (type) {
+        case GF_TRANSPORT_TCP:
+        case GF_TRANSPORT_BOTH_TCP_RDMA:
+            len = snprintf(filepath, PATH_MAX, "/tmp/%s.tcp-fuse.vol",
+                           volinfo->volname);
+            break;
+
+        case GF_TRANSPORT_RDMA:
+            len = snprintf(filepath, PATH_MAX, "/tmp/%s.rdma-fuse.vol",
+                           volinfo->volname);
+            break;
+        default:
+            ret = -1;
+            break;
+    }
+    /* Report an output error or a truncated path instead of returning it */
+    if ((len < 0) || (len >= PATH_MAX)) {
+        ret = -1;
+    }
+
+    return ret;
+}
+
+/*
+ * Resume rebalance handling for a volume according to its stored
+ * volinfo->rebal.defrag_status.
+ *
+ * @volinfo:   volume whose rebalance state is examined.
+ * @op_errstr: buffer handed to glusterd_handle_defrag_start().
+ * @len:       size of @op_errstr.
+ * @cmd:       rebalance command handed to glusterd_handle_defrag_start().
+ * @cbk:       callback handed to the defrag init/start helpers.
+ *
+ * Returns the result of the helper that was invoked. The initial -1 is
+ * returned unchanged when this->private is NULL, when the status is
+ * COMPLETE/STOPPED/FAILED (nothing is started) and for an unknown status.
+ */
+int
+glusterd_volume_defrag_restart(glusterd_volinfo_t *volinfo, char *op_errstr,
+                               size_t len, int cmd, defrag_cbk_fn_t cbk)
+{
+    xlator_t *this = NULL;
+    glusterd_conf_t *priv = NULL;
+    char pidfile[PATH_MAX] = "";
+    int ret = -1;
+    pid_t pid = 0;
+
+    this = THIS;
+    GF_ASSERT(this);
+
+    priv = this->private;
+    if (!priv)
+        return ret;
+
+    /* Don't start the rebalance process if the status is already
+     * completed, stopped or failed. If the status is started, check if
+     * there is an existing process already and connect to it. If not, then
+     * start the rebalance process
+     */
+
+    switch (volinfo->rebal.defrag_status) {
+        case GF_DEFRAG_STATUS_COMPLETE:
+        case GF_DEFRAG_STATUS_STOPPED:
+        case GF_DEFRAG_STATUS_FAILED:
+            break;
+        case GF_DEFRAG_STATUS_STARTED:
+            GLUSTERD_GET_DEFRAG_PID_FILE(pidfile, volinfo, priv);
+            if (gf_is_service_running(pidfile, &pid)) {
+                /* A rebalance process already exists: re-attach to it */
+                ret = glusterd_rebalance_defrag_init(volinfo, cbk);
+                if (ret) {
+                    gf_msg(this->name, GF_LOG_ERROR, 0,
+                           GD_MSG_REBALANCE_START_FAIL,
+                           "Failed to initialize  defrag."
+                           "Not starting rebalance process for "
+                           "%s.",
+                           volinfo->volname);
+                    gf_event(EVENT_REBALANCE_START_FAILED, "volume=%s",
+                             volinfo->volname);
+                    goto out;
+                }
+                ret = glusterd_rebalance_rpc_create(volinfo);
+                break;
+            }
+            /* Intentional fall through: status says "started" but no
+             * process is running, so start the rebalance process. */
+        case GF_DEFRAG_STATUS_NOT_STARTED:
+            ret = glusterd_handle_defrag_start(volinfo, op_errstr, len, cmd,
+                                               cbk, volinfo->rebal.op);
+            if (ret) {
+                volinfo->rebal.defrag_status = GF_DEFRAG_STATUS_FAILED;
+                gf_event(EVENT_REBALANCE_START_FAILED, "volume=%s",
+                         volinfo->volname);
+            }
+            break;
+        default:
+            gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_REBALANCE_START_FAIL,
+                   "Unknown defrag status (%d)."
+                   "Not starting rebalance process for %s.",
+                   volinfo->rebal.defrag_status, volinfo->volname);
+            break;
+    }
+out:
+    return ret;
+}
+
+/*
+ * Record the rebalance command, status and op in volinfo->rebal.
+ *
+ * Nothing else happens when the stored rebalance id is null. Otherwise a
+ * task id is generated into dict when is_origin_glusterd(dict) holds, and
+ * the GF_REBALANCE_TID_KEY value of dict is parsed into the rebalance id.
+ */
+void
+glusterd_defrag_info_set(glusterd_volinfo_t *volinfo, dict_t *dict, int cmd,
+                         int status, int op)
+{
+    xlator_t *this = THIS;
+    glusterd_rebalance_t *rebal = &volinfo->rebal;
+    char *tid_str = NULL;
+
+    rebal->defrag_cmd = cmd;
+    rebal->defrag_status = status;
+    rebal->op = op;
+
+    if (gf_uuid_is_null(rebal->rebalance_id))
+        return;
+
+    if (is_origin_glusterd(dict) &&
+        glusterd_generate_and_set_task_id(dict, GF_REBALANCE_TID_KEY,
+                                          SLEN(GF_REBALANCE_TID_KEY))) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_TASKID_GEN_FAIL,
+               "Failed to generate task-id");
+        gf_msg_debug(this->name, 0, "Rebalance start validate failed");
+        return;
+    }
+
+    if (dict_get_strn(dict, GF_REBALANCE_TID_KEY, SLEN(GF_REBALANCE_TID_KEY),
+                      &tid_str)) {
+        /* A missing id is only worth a warning, not a failure */
+        gf_msg(this->name, GF_LOG_WARNING, 0, GD_MSG_REBALANCE_ID_MISSING,
+               "Missing rebalance-id");
+        return;
+    }
+
+    gf_uuid_parse(tid_str, rebal->rebalance_id);
+}
+
+/*
+ * Restart rebalance handling for one volume.
+ *
+ * Returns 0 when this node should not start rebalance (status is reset to
+ * NOT_STARTED), -1 when no defrag command is stored (status is set to
+ * FAILED), otherwise the result of glusterd_volume_defrag_restart().
+ */
+int
+glusterd_restart_rebalance_for_volume(glusterd_volinfo_t *volinfo)
+{
+    char errbuf[PATH_MAX] = "";
+    defrag_cbk_fn_t cbk = NULL;
+    int ret = -1;
+
+    if (!gd_should_i_start_rebalance(volinfo)) {
+        /* Store the rebalance-id and rebalance command even if
+         * the peer isn't starting a rebalance process. On peers
+         * where a rebalance process is started,
+         * glusterd_handle_defrag_start performs the storing.
+         *
+         * Storing this is needed for having 'volume status'
+         * work correctly.
+         */
+        volinfo->rebal.defrag_status = GF_DEFRAG_STATUS_NOT_STARTED;
+        return 0;
+    }
+
+    if (volinfo->rebal.defrag_cmd == 0) {
+        volinfo->rebal.defrag_status = GF_DEFRAG_STATUS_FAILED;
+        return -1;
+    }
+
+    /* Only a remove-brick rebalance gets a migration callback */
+    if (volinfo->rebal.op == GD_OP_REMOVE_BRICK)
+        cbk = glusterd_remove_brick_migrate_cbk;
+
+    ret = glusterd_volume_defrag_restart(volinfo, errbuf, PATH_MAX,
+                                         volinfo->rebal.defrag_cmd, cbk);
+
+    /* If remove brick is started then ensure that on a glusterd
+     * restart decommission_is_in_progress is set to avoid remove
+     * brick commit to happen when rebalance is not completed.
+     */
+    if (ret == 0 && volinfo->rebal.op == GD_OP_REMOVE_BRICK &&
+        volinfo->rebal.defrag_status == GF_DEFRAG_STATUS_STARTED) {
+        volinfo->decommission_in_progress = 1;
+    }
+
+    return ret;
+}
+/* Run glusterd_restart_rebalance_for_volume() on every volume in
+ * conf->volumes. Per-volume results are ignored; always returns 0.
+ */
+int
+glusterd_restart_rebalance(glusterd_conf_t *conf)
+{
+    glusterd_volinfo_t *vol = NULL;
+
+    cds_list_for_each_entry(vol, &conf->volumes, vol_list)
+    {
+        glusterd_restart_rebalance_for_volume(vol);
+    }
+
+    return 0;
+}
+
+/* Zero the rebalance counters kept in volinfo->rebal: files, data,
+ * looked-up files, failures, skipped files and elapsed time.
+ */
+void
+glusterd_volinfo_reset_defrag_stats(glusterd_volinfo_t *volinfo)
+{
+    glusterd_rebalance_t *stats = NULL;
+
+    GF_ASSERT(volinfo);
+    stats = &volinfo->rebal;
+
+    stats->rebalance_files = 0;
+    stats->rebalance_data = 0;
+    stats->lookedup_files = 0;
+    stats->rebalance_failures = 0;
+    stats->skipped_files = 0;
+    stats->rebalance_time = 0;
+}
+
+/* Tell whether brickinfo belongs to this node, i.e. its uuid equals MY_UUID.
+ * A brick with a null uuid is resolved first; if that fails the brick is
+ * reported as not local. The this and volinfo arguments are not used.
+ */
+gf_boolean_t
+glusterd_is_local_brick(xlator_t *this, glusterd_volinfo_t *volinfo,
+                        glusterd_brickinfo_t *brickinfo)
+{
+    if (gf_uuid_is_null(brickinfo->uuid) &&
+        glusterd_resolve_brick(brickinfo) != 0) {
+        return _gf_false;
+    }
+
+    return !gf_uuid_compare(brickinfo->uuid, MY_UUID);
+}
+/*
+ * Check that the "vol-id" stored in op_dict matches volinfo->volume_id.
+ *
+ * Returns 0 when the ids match, the error of dict_get_strn() or
+ * gf_uuid_parse() when the id cannot be fetched or parsed, and -1 when the
+ * ids differ.
+ */
+int
+glusterd_validate_volume_id(dict_t *op_dict, glusterd_volinfo_t *volinfo)
+{
+    xlator_t *this = THIS;
+    char *id_str = NULL;
+    uuid_t parsed_id = {
+        0,
+    };
+    int ret = -1;
+
+    GF_ASSERT(this);
+
+    ret = dict_get_strn(op_dict, "vol-id", SLEN("vol-id"), &id_str);
+    if (ret != 0) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_DICT_GET_FAILED,
+               "Failed to get volume id for volume %s", volinfo->volname);
+        return ret;
+    }
+
+    ret = gf_uuid_parse(id_str, parsed_id);
+    if (ret != 0) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_UUID_PARSE_FAIL,
+               "Failed to parse volume id for volume %s", volinfo->volname);
+        return ret;
+    }
+
+    if (gf_uuid_compare(parsed_id, volinfo->volume_id) != 0) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_VOL_ID_MISMATCH,
+               "Volume ids of volume %s - %s"
+               " and %s - are different. Possibly a split brain among "
+               "peers.",
+               volinfo->volname, id_str, uuid_utoa(volinfo->volume_id));
+        return -1;
+    }
+
+    return ret;
+}
+
+/*
+ * Copy rebalance statistics from rsp_dict into volinfo->rebal.
+ *
+ * A counter is only overwritten when the fetched value is non-zero;
+ * time_left is overwritten whenever its key was read successfully. The
+ * "promoted" and "demoted" keys are looked up but their values are not
+ * stored. cmd is not used.
+ *
+ * Returns the result of the "run-time" lookup.
+ */
+int
+glusterd_defrag_volume_status_update(glusterd_volinfo_t *volinfo,
+                                     dict_t *rsp_dict, int32_t cmd)
+{
+    xlator_t *this = THIS;
+    glusterd_rebalance_t *rebal = &volinfo->rebal;
+    gf_defrag_status_t status = GF_DEFRAG_STATUS_NOT_STARTED;
+    uint64_t files = 0;
+    uint64_t size = 0;
+    uint64_t lookup = 0;
+    uint64_t failures = 0;
+    uint64_t skipped = 0;
+    uint64_t promoted = 0;
+    uint64_t demoted = 0;
+    uint64_t time_left = 0;
+    double run_time = 0;
+    int ret = 0;
+
+    if (dict_get_uint64(rsp_dict, "files", &files))
+        gf_msg_trace(this->name, 0, "failed to get file count");
+    if (files)
+        rebal->rebalance_files = files;
+
+    if (dict_get_uint64(rsp_dict, "size", &size))
+        gf_msg_trace(this->name, 0, "failed to get size of xfer");
+    if (size)
+        rebal->rebalance_data = size;
+
+    if (dict_get_uint64(rsp_dict, "lookups", &lookup))
+        gf_msg_trace(this->name, 0, "failed to get lookedup file count");
+    if (lookup)
+        rebal->lookedup_files = lookup;
+
+    if (dict_get_int32n(rsp_dict, "status", SLEN("status"),
+                        (int32_t *)&status))
+        gf_msg_trace(this->name, 0, "failed to get status");
+    if (status)
+        rebal->defrag_status = status;
+
+    if (dict_get_uint64(rsp_dict, "failures", &failures))
+        gf_msg_trace(this->name, 0, "failed to get failure count");
+    if (failures)
+        rebal->rebalance_failures = failures;
+
+    if (dict_get_uint64(rsp_dict, "skipped", &skipped))
+        gf_msg_trace(this->name, 0, "failed to get skipped count");
+    if (skipped)
+        rebal->skipped_files = skipped;
+
+    /* promoted/demoted are fetched (and traced on failure) but not stored */
+    if (dict_get_uint64(rsp_dict, "promoted", &promoted))
+        gf_msg_trace(this->name, 0, "failed to get promoted count");
+
+    if (dict_get_uint64(rsp_dict, "demoted", &demoted))
+        gf_msg_trace(this->name, 0, "failed to get demoted count");
+
+    /* The outcome of this lookup is the function's return value */
+    ret = dict_get_double(rsp_dict, "run-time", &run_time);
+    if (ret)
+        gf_msg_trace(this->name, 0, "failed to get run-time");
+    if (run_time)
+        rebal->rebalance_time = run_time;
+
+    if (dict_get_uint64(rsp_dict, "time-left", &time_left))
+        gf_msg_trace(this->name, 0, "failed to get time left");
+    else
+        rebal->time_left = time_left;
+
+    return ret;
+}
+
+/*
+ * Build a graph from each of the two volfiles and compare their topology.
+ *
+ * @filename1, @filename2: volfiles to compare.
+ * @identical: receives the result of is_graph_topology_equal().
+ *
+ * Returns 0 when the comparison was carried out, -1 on a missing xlator,
+ * invalid argument, fopen() failure or graph construction failure.
+ */
+int
+glusterd_check_topology_identical(const char *filename1, const char *filename2,
+                                  gf_boolean_t *identical)
+{
+    xlator_t *this = THIS;
+    FILE *volfile_a = NULL;
+    FILE *volfile_b = NULL;
+    glusterfs_graph_t *graph_a = NULL;
+    glusterfs_graph_t *graph_b = NULL;
+    int ret = -1; /* pessimistic default */
+
+    /* Without an xlator there is not even a name to log with */
+    if (this == NULL)
+        return -1;
+
+    /* Reject missing arguments */
+    GF_VALIDATE_OR_GOTO(this->name, filename1, out);
+    GF_VALIDATE_OR_GOTO(this->name, filename2, out);
+    GF_VALIDATE_OR_GOTO(this->name, identical, out);
+
+    /* Open the first volfile */
+    volfile_a = fopen(filename1, "r");
+    if (!volfile_a) {
+        gf_msg(this->name, GF_LOG_ERROR, errno, GD_MSG_FILE_OP_FAILED,
+               "fopen() on file: %s failed (%s)", filename1, strerror(errno));
+        goto out;
+    }
+
+    /* Open the second volfile */
+    volfile_b = fopen(filename2, "r");
+    if (!volfile_b) {
+        gf_msg(this->name, GF_LOG_ERROR, errno, GD_MSG_FILE_OP_FAILED,
+               "fopen() on file: %s failed (%s)", filename2, strerror(errno));
+        goto out;
+    }
+
+    /* Turn each volfile into a graph */
+    graph_a = glusterfs_graph_construct(volfile_a);
+    if (!graph_a)
+        goto out;
+
+    graph_b = glusterfs_graph_construct(volfile_b);
+    if (!graph_b)
+        goto out;
+
+    /* Both graphs exist: compare them */
+    *identical = is_graph_topology_equal(graph_a, graph_b);
+    ret = 0;
+out:
+    if (volfile_a)
+        fclose(volfile_a);
+    if (volfile_b)
+        fclose(volfile_b);
+    if (graph_a)
+        glusterfs_graph_destroy(graph_a);
+    if (graph_b)
+        glusterfs_graph_destroy(graph_b);
+
+    gf_msg_debug(this->name, 0, "Returning with %d", ret);
+    return ret;
+}
+
+/*
+ * Decide whether two files have the same content.
+ *
+ * The files are first compared by size; only when the sizes match are their
+ * checksums (get_checksum_for_path()) computed and compared.
+ *
+ * @filename1, @filename2: files to compare.
+ * @identical: set to _gf_false when sizes or checksums differ, _gf_true when
+ *             the checksums match. Left untouched on failure.
+ *
+ * Returns 0 when a verdict was stored in @identical, non-zero when the
+ * xlator or its private data is missing, or stat/checksum failed.
+ */
+int
+glusterd_check_files_identical(char *filename1, char *filename2,
+                               gf_boolean_t *identical)
+{
+    int ret = -1;
+    struct stat buf1 = {
+        0,
+    };
+    struct stat buf2 = {
+        0,
+    };
+    uint32_t cksum1 = 0;
+    uint32_t cksum2 = 0;
+    xlator_t *this = NULL;
+    glusterd_conf_t *priv = NULL;
+
+    GF_ASSERT(filename1);
+    GF_ASSERT(filename2);
+    GF_ASSERT(identical);
+
+    this = THIS;
+    GF_VALIDATE_OR_GOTO("glusterd", this, out);
+    priv = this->private;
+    GF_VALIDATE_OR_GOTO(this->name, priv, out);
+
+    ret = sys_stat(filename1, &buf1);
+
+    if (ret) {
+        gf_msg(this->name, GF_LOG_ERROR, errno, GD_MSG_FILE_OP_FAILED,
+               "stat on file: %s failed "
+               "(%s)",
+               filename1, strerror(errno));
+        goto out;
+    }
+
+    ret = sys_stat(filename2, &buf2);
+
+    if (ret) {
+        gf_msg(this->name, GF_LOG_ERROR, errno, GD_MSG_FILE_OP_FAILED,
+               "stat on file: %s failed "
+               "(%s)",
+               filename2, strerror(errno));
+        goto out;
+    }
+
+    /* Different sizes settle the question without reading the files;
+     * ret is 0 here from the successful stat */
+    if (buf1.st_size != buf2.st_size) {
+        *identical = _gf_false;
+        goto out;
+    }
+
+    ret = get_checksum_for_path(filename1, &cksum1, priv->op_version);
+    if (ret)
+        goto out;
+
+    ret = get_checksum_for_path(filename2, &cksum2, priv->op_version);
+    if (ret)
+        goto out;
+
+    if (cksum1 != cksum2)
+        *identical = _gf_false;
+    else
+        *identical = _gf_true;
+
+out:
+    /* 'this' is NULL when the first validation above jumped here, so it
+     * must not be dereferenced unconditionally */
+    gf_msg_debug(this ? this->name : "glusterd", 0, "Returning with %d", ret);
+    return ret;
+}
+
+/*
+ * Produce the volume-set help text requested through dict.
+ *
+ * When dict is NULL the op context is used instead; if there is none, 0 is
+ * returned. The "help" key selects plain output, "help-xml" selects xml
+ * output (an error when built without libxml). With neither key present -1
+ * is returned.
+ *
+ * @op_errstr: when non-NULL, receives a gf_strdup()'d message on failure.
+ */
+int
+glusterd_volset_help(dict_t *dict, char **op_errstr)
+{
+    gf_boolean_t want_xml = _gf_false;
+    int ret = -1;
+#if (!HAVE_LIB_XML)
+    xlator_t *this = THIS;
+#endif
+
+    if (dict == NULL) {
+        dict = glusterd_op_get_ctx();
+        if (dict == NULL) {
+            ret = 0;
+            goto out;
+        }
+    }
+
+    if (dict_getn(dict, "help", SLEN("help"))) {
+        want_xml = _gf_false;
+    } else if (dict_getn(dict, "help-xml", SLEN("help-xml"))) {
+        want_xml = _gf_true;
+#if (HAVE_LIB_XML)
+        ret = 0;
+#else
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_MODULE_NOT_INSTALLED,
+               "libxml not present in the system");
+        if (op_errstr != NULL)
+            *op_errstr = gf_strdup(
+                "Error: xml libraries not present to produce xml-output");
+        goto out;
+#endif
+    } else {
+        /* Neither help flavour was requested */
+        goto out;
+    }
+
+    ret = glusterd_get_volopt_content(dict, want_xml);
+    if (ret != 0 && op_errstr != NULL)
+        *op_errstr = gf_strdup("Failed to get volume options help");
+
+out:
+    gf_msg_debug("glusterd", 0, "Returning %d", ret);
+    return ret;
+}
+
+/*
+ * Log the outcome of a CLI command and send the reply to the CLI.
+ *
+ * The command string is taken from the "cmd-str" key of dict; when present
+ * it is written to the command log as SUCCESS or FAILED (with arg->op_errstr
+ * if any). The reply is then submitted and dict, when non-NULL, is unref'd.
+ *
+ * Returns the result of the "cmd-str" lookup.
+ */
+int
+glusterd_to_cli(rpcsvc_request_t *req, gf_cli_rsp *arg, struct iovec *payload,
+                int payloadcount, struct iobref *iobref, xdrproc_t xdrproc,
+                dict_t *dict)
+{
+    xlator_t *this = THIS;
+    char *cmd = NULL;
+    char *errstr = NULL;
+    const char *separator = " ";
+    const char *detail = " ";
+    int failed = 0;
+    int ret = -1;
+
+    GF_ASSERT(this);
+
+    failed = arg->op_ret;
+    errstr = arg->op_errstr;
+
+    ret = dict_get_strn(dict, "cmd-str", SLEN("cmd-str"), &cmd);
+    if (ret != 0) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_DICT_GET_FAILED,
+               "Failed to get command string");
+    }
+
+    if (cmd != NULL) {
+        if (!failed) {
+            gf_cmd_log("", "%s : SUCCESS", cmd);
+        } else {
+            /* Only show ": <error>" when an error string is available */
+            if (errstr != NULL) {
+                separator = ":";
+                detail = errstr;
+            }
+            gf_cmd_log("", "%s : FAILED %s %s", cmd, separator, detail);
+        }
+    }
+
+    glusterd_submit_reply(req, arg, payload, payloadcount, iobref, xdrproc);
+
+    if (dict != NULL)
+        dict_unref(dict);
+
+    return ret;
+}
+
+/* Copy the "gsync-status" string from src into dst.
+ * A missing key in src is logged and treated as success.
+ * Returns 0 on success, or the error of dict_set_dynstr_with_alloc().
+ */
+static int32_t
+glusterd_append_gsync_status(dict_t *dst, dict_t *src)
+{
+    char *status_msg = NULL;
+    int ret = 0;
+
+    if (dict_get_strn(src, "gsync-status", SLEN("gsync-status"),
+                      &status_msg) != 0) {
+        gf_smsg("glusterd", GF_LOG_ERROR, errno, GD_MSG_DICT_GET_FAILED,
+                "Key=gsync-status", NULL);
+        ret = 0;
+    } else {
+        ret = dict_set_dynstr_with_alloc(dst, "gsync-status", status_msg);
+        if (ret != 0) {
+            gf_msg("glusterd", GF_LOG_WARNING, 0, GD_MSG_DICT_SET_FAILED,
+                   "Unable to set the stopmessage in the ctx dictionary");
+        }
+    }
+
+    gf_msg_debug("glusterd", 0, "Returning %d", ret);
+    return ret;
+}
+
+/*
+ * Append the geo-replication status entries of src to those of dst.
+ *
+ * src holds "gsync-count" entries under the keys "status_value<i>"; each one
+ * is copied into freshly allocated memory and stored in dst under
+ * "status_value<i + dst's gsync-count>". Finally dst's "gsync-count" is set
+ * to the sum of both counts.
+ *
+ * Returns 0 on success, and also when src is NULL or carries no entries;
+ * non-zero on lookup, allocation or store failure.
+ */
+int32_t
+glusterd_append_status_dicts(dict_t *dst, dict_t *src)
+{
+    char sts_val_name[PATH_MAX] = "";
+    int dst_count = 0;
+    int src_count = 0;
+    int i = 0;
+    int ret = 0;
+    gf_gsync_status_t *sts_val = NULL;
+    gf_gsync_status_t *dst_sts_val = NULL;
+
+    GF_ASSERT(dst);
+
+    if (src == NULL)
+        goto out;
+
+    /* A dst without a count simply starts from index 0 */
+    ret = dict_get_int32n(dst, "gsync-count", SLEN("gsync-count"), &dst_count);
+    if (ret)
+        dst_count = 0;
+
+    ret = dict_get_int32n(src, "gsync-count", SLEN("gsync-count"), &src_count);
+    if (ret || !src_count) {
+        gf_msg_debug("glusterd", 0, "Source brick empty");
+        ret = 0;
+        goto out;
+    }
+
+    for (i = 0; i < src_count; i++) {
+        snprintf(sts_val_name, sizeof(sts_val_name), "status_value%d", i);
+
+        ret = dict_get_bin(src, sts_val_name, (void **)&sts_val);
+        if (ret)
+            goto out;
+
+        dst_sts_val = GF_MALLOC(sizeof(gf_gsync_status_t),
+                                gf_common_mt_gsync_status_t);
+        if (!dst_sts_val) {
+            gf_msg("glusterd", GF_LOG_ERROR, ENOMEM, GD_MSG_NO_MEMORY,
+                   "Out Of Memory");
+            /* ret is 0 from the successful lookup above; an allocation
+             * failure must not be reported as success */
+            ret = -1;
+            goto out;
+        }
+
+        memcpy(dst_sts_val, sts_val, sizeof(gf_gsync_status_t));
+
+        /* Re-index the entry behind the ones dst already holds */
+        snprintf(sts_val_name, sizeof(sts_val_name), "status_value%d",
+                 i + dst_count);
+
+        ret = dict_set_bin(dst, sts_val_name, dst_sts_val,
+                           sizeof(gf_gsync_status_t));
+        if (ret) {
+            /* the copy was not stored in dst, so release it here */
+            GF_FREE(dst_sts_val);
+            goto out;
+        }
+    }
+
+    ret = dict_set_int32n(dst, "gsync-count", SLEN("gsync-count"),
+                          dst_count + src_count);
+
+out:
+    gf_msg_debug("glusterd", 0, "Returning %d", ret);
+    return ret;
+}
+
+/*
+ * Copy every "brick<N>.mount_dir" entry (N = 1 .. brick_count) found in
+ * rsp_dict into aggr. Entries missing from rsp_dict are skipped.
+ *
+ * Returns 0 on success or when rsp_dict has no "brick_count"; otherwise the
+ * error of dict_set_dynstr_with_alloc().
+ */
+int32_t
+glusterd_aggr_brick_mount_dirs(dict_t *aggr, dict_t *rsp_dict)
+{
+    xlator_t *this = THIS;
+    char key[64] = "";
+    char *mount_dir = NULL;
+    int keylen = 0;
+    int32_t total = -1;
+    int32_t idx = -1;
+    int32_t ret = -1;
+
+    GF_ASSERT(this);
+    GF_ASSERT(aggr);
+    GF_ASSERT(rsp_dict);
+
+    if (dict_get_int32n(rsp_dict, "brick_count", SLEN("brick_count"),
+                        &total) != 0) {
+        gf_msg_debug(this->name, 0, "No brick_count present");
+        ret = 0;
+        goto out;
+    }
+
+    for (idx = 1; idx <= total; idx++) {
+        mount_dir = NULL;
+        keylen = snprintf(key, sizeof(key), "brick%d.mount_dir", idx);
+
+        if (dict_get_strn(rsp_dict, key, keylen, &mount_dir) != 0) {
+            /* Not an error: the info will come from a different node */
+            gf_msg_debug(this->name, 0, "%s not present", key);
+            continue;
+        }
+
+        ret = dict_set_dynstr_with_alloc(aggr, key, mount_dir);
+        if (ret != 0) {
+            gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_DICT_SET_FAILED,
+                   "Failed to set %s", key);
+            goto out;
+        }
+    }
+
+    ret = 0;
+out:
+    gf_msg_trace(this->name, 0, "Returning %d ", ret);
+    return ret;
+}
+
+/*
+ * Merge a geo-replication response into the aggregate dictionary.
+ *
+ * @aggr:      destination dict; when NULL the op context is used instead.
+ * @rsp_dict:  response dict; when non-NULL its status entries,
+ *             "gsync-status" and "conf_path" are copied over.
+ * @op_errstr: when non-NULL and non-empty, stored under "errstr".
+ *
+ * Returns 0 on success, non-zero on failure, including the case where
+ * neither @aggr nor an op context is available.
+ */
+int32_t
+glusterd_gsync_use_rsp_dict(dict_t *aggr, dict_t *rsp_dict, char *op_errstr)
+{
+    dict_t *ctx = NULL;
+    int ret = 0;
+    char *conf_path = NULL;
+
+    if (aggr) {
+        ctx = aggr;
+
+    } else {
+        ctx = glusterd_op_get_ctx();
+        if (!ctx) {
+            gf_msg("glusterd", GF_LOG_ERROR, 0, GD_MSG_OPCTX_GET_FAIL,
+                   "Operation Context is not present");
+            GF_ASSERT(0);
+            /* Without a destination dict there is nothing to merge
+             * into; do not carry on with a NULL ctx */
+            ret = -1;
+            goto out;
+        }
+    }
+
+    if (rsp_dict) {
+        ret = glusterd_append_status_dicts(ctx, rsp_dict);
+        if (ret)
+            goto out;
+
+        ret = glusterd_append_gsync_status(ctx, rsp_dict);
+        if (ret)
+            goto out;
+
+        /* conf_path is optional: a failed lookup is not an error */
+        ret = dict_get_strn(rsp_dict, "conf_path", SLEN("conf_path"),
+                            &conf_path);
+        if (!ret && conf_path) {
+            ret = dict_set_dynstr_with_alloc(ctx, "conf_path", conf_path);
+            if (ret) {
+                gf_msg("glusterd", GF_LOG_ERROR, 0, GD_MSG_DICT_SET_FAILED,
+                       "Unable to store conf path.");
+                goto out;
+            }
+        }
+    }
+    if ((op_errstr) && (strcmp("", op_errstr))) {
+        ret = dict_set_dynstr_with_alloc(ctx, "errstr", op_errstr);
+        if (ret)
+            goto out;
+    }
+
+    ret = 0;
+out:
+    gf_msg_debug("glusterd", 0, "Returning %d ", ret);
+    return ret;
+}
+
+/*
+ * Merge a replace-brick response dictionary into the operation context.
+ *
+ * aggr     - destination dictionary; when NULL the dictionary returned by
+ *            glusterd_op_get_ctx() is used instead.
+ * rsp_dict - optional response.  "src-brick-port" and "dst-brick-port" are
+ *            read from it (both optional) and the brick mount directories
+ *            are aggregated via glusterd_aggr_brick_mount_dirs().
+ *
+ * Non-zero ports are then stored in the destination under the same keys.
+ * Returns 0 on success, non-zero on failure.
+ */
+int32_t
+glusterd_rb_use_rsp_dict(dict_t *aggr, dict_t *rsp_dict)
+{
+    int32_t src_port = 0;
+    int32_t dst_port = 0;
+    int ret = 0;
+    dict_t *ctx = NULL;
+    xlator_t *this = NULL;
+
+    this = THIS;
+    GF_ASSERT(this);
+
+    if (aggr) {
+        ctx = aggr;
+
+    } else {
+        ctx = glusterd_op_get_ctx();
+        if (!ctx) {
+            gf_msg("glusterd", GF_LOG_ERROR, 0, GD_MSG_OPCTX_GET_FAIL,
+                   "Operation Context is not present");
+            GF_ASSERT(0);
+            /* Do not fall through with a NULL destination dictionary when
+             * the assertion does not abort the process.
+             */
+            ret = -1;
+            goto out;
+        }
+    }
+
+    if (rsp_dict) {
+        /* Both ports are optional: a failed lookup leaves the port at 0 */
+        ret = dict_get_int32n(rsp_dict, "src-brick-port",
+                              SLEN("src-brick-port"), &src_port);
+        if (ret == 0) {
+            gf_msg_debug("glusterd", 0, "src-brick-port=%d found", src_port);
+        }
+
+        ret = dict_get_int32n(rsp_dict, "dst-brick-port",
+                              SLEN("dst-brick-port"), &dst_port);
+        if (ret == 0) {
+            gf_msg_debug("glusterd", 0, "dst-brick-port=%d found", dst_port);
+        }
+
+        ret = glusterd_aggr_brick_mount_dirs(ctx, rsp_dict);
+        if (ret) {
+            gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_BRICK_MOUNDIRS_AGGR_FAIL,
+                   "Failed to "
+                   "aggregate brick mount dirs");
+            goto out;
+        }
+    }
+
+    if (src_port) {
+        ret = dict_set_int32n(ctx, "src-brick-port", SLEN("src-brick-port"),
+                              src_port);
+        if (ret) {
+            gf_msg_debug("glusterd", 0, "Could not set src-brick");
+            goto out;
+        }
+    }
+
+    if (dst_port) {
+        ret = dict_set_int32n(ctx, "dst-brick-port", SLEN("dst-brick-port"),
+                              dst_port);
+        if (ret) {
+            gf_msg_debug("glusterd", 0, "Could not set dst-brick");
+            goto out;
+        }
+    }
+
+out:
+    return ret;
+}
+
+/*
+ * Import the friend volumes described by rsp_dict.
+ *
+ * aggr is not used.  When rsp_dict is NULL an error is logged and 0 is
+ * returned; otherwise the result of glusterd_import_friend_volumes() is
+ * returned.
+ */
+int32_t
+glusterd_sync_use_rsp_dict(dict_t *aggr, dict_t *rsp_dict)
+{
+    xlator_t *this = NULL;
+
+    GF_ASSERT(rsp_dict);
+    this = THIS;
+    GF_ASSERT(this);
+
+    if (rsp_dict)
+        return glusterd_import_friend_volumes(rsp_dict);
+
+    gf_smsg(this->name, GF_LOG_ERROR, errno, GD_MSG_DICT_CREATE_FAIL, NULL);
+    return 0;
+}
+
+/*
+ * dict_foreach() callback used while aggregating profile responses.
+ *
+ * Every key except "count" is parsed as "<index><suffix>"; a copy of the
+ * value is stored in rsp_ctx->dict under "<rsp_ctx->count + index><suffix>".
+ *
+ * this  - dictionary being iterated (unused).
+ * data  - glusterd_pr_brick_rsp_conv_t carrying the destination dictionary
+ *         and the index offset.
+ *
+ * Always returns 0; a pair that cannot be stored is logged and skipped.
+ */
+static int
+_profile_volume_add_friend_rsp(dict_t *this, char *key, data_t *value,
+                               void *data)
+{
+    char new_key[264] = "";
+    int new_key_len;
+    glusterd_pr_brick_rsp_conv_t *rsp_ctx = NULL;
+    data_t *new_value = NULL;
+    int brick_count = 0;
+    char brick_key[256] = "";
+
+    if (strcmp(key, "count") == 0)
+        return 0;
+    /* The field width keeps an over-long key from overrunning brick_key */
+    sscanf(key, "%d%255s", &brick_count, brick_key);
+    rsp_ctx = data;
+    new_value = data_copy(value);
+    GF_ASSERT(new_value);
+    if (!new_value)
+        goto err;
+
+    new_key_len = snprintf(new_key, sizeof(new_key), "%d%s",
+                           rsp_ctx->count + brick_count, brick_key);
+    /* A truncated key must not be stored with a length beyond the buffer */
+    if (new_key_len < 0 || (size_t)new_key_len >= sizeof(new_key))
+        goto err;
+
+    if (dict_setn(rsp_ctx->dict, new_key, new_key_len, new_value))
+        goto err;
+
+    return 0;
+
+err:
+    /* Release the copy on failure, as glusterd_volume_status_add_peer_rsp
+     * does for its own copy.
+     */
+    if (new_value)
+        data_unref(new_value);
+    gf_msg("glusterd", GF_LOG_ERROR, 0, GD_MSG_DICT_SET_FAILED,
+           "Unable to set key: %s in dict", key);
+    return 0;
+}
+
+/*
+ * Aggregate a profile response (rsp_dict) into aggr.
+ *
+ * The pairs of rsp_dict are re-keyed by _profile_volume_add_friend_rsp()
+ * using aggr's current "count" as offset, after which aggr's "count" is
+ * increased by the "count" found in rsp_dict.
+ *
+ * Returns 0 when rsp_dict has no "count", -1 when aggr is NULL, otherwise
+ * the result of storing the new "count".
+ */
+int
+glusterd_profile_volume_use_rsp_dict(dict_t *aggr, dict_t *rsp_dict)
+{
+    glusterd_pr_brick_rsp_conv_t conv = {0};
+    int32_t rsp_bricks = 0;
+    int32_t aggr_bricks = 0;
+    xlator_t *this = NULL;
+
+    GF_ASSERT(rsp_dict);
+    this = THIS;
+    GF_ASSERT(this);
+
+    if (dict_get_int32n(rsp_dict, "count", SLEN("count"), &rsp_bricks)) {
+        gf_smsg(this->name, GF_LOG_ERROR, errno, GD_MSG_DICT_GET_FAILED,
+                "Key=count", NULL);
+        /* no bricks in the rsp */
+        return 0;
+    }
+
+    if (!aggr) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_OPCTX_GET_FAIL,
+               "Operation Context is not present");
+        return -1;
+    }
+
+    /* A missing "count" in aggr simply leaves the offset at 0 */
+    (void)dict_get_int32n(aggr, "count", SLEN("count"), &aggr_bricks);
+
+    conv.count = aggr_bricks;
+    conv.dict = aggr;
+    dict_foreach(rsp_dict, _profile_volume_add_friend_rsp, &conv);
+
+    return dict_set_int32n(aggr, "count", SLEN("count"),
+                           aggr_bricks + rsp_bricks);
+}
+
+/*
+ * dict_foreach() callback that copies one pair of a peer's volume status
+ * response into the aggregate dictionary.
+ *
+ * Keys "count", "cmd", "brick-index-max", "other-count" and every key
+ * starting with "task" are skipped.  For the remaining keys the index is
+ * parsed from "brick<index>.<suffix>"; when it is greater than
+ * rsp_ctx->brick_index_max the pair is stored as
+ * "brick<index + rsp_ctx->other_count>.<suffix>", otherwise under the
+ * original key.
+ *
+ * this - dictionary being iterated (unused).
+ * data - glusterd_status_rsp_conv_t describing the destination.
+ *
+ * Always returns 0; a pair that cannot be stored is logged and skipped.
+ */
+static int
+glusterd_volume_status_add_peer_rsp(dict_t *this, char *key, data_t *value,
+                                    void *data)
+{
+    glusterd_status_rsp_conv_t *rsp_ctx = NULL;
+    data_t *new_value = NULL;
+    char brick_key[1024] = "";
+    char new_key[1024] = "";
+    int32_t index = 0;
+    int32_t ret = -1;
+    int32_t len = 0;
+
+    /* Skip the following keys, they are already present in the ctx_dict */
+    /* Also, skip all the task related pairs. They will be added to the
+     * ctx_dict later
+     */
+    if (!strcmp(key, "count") || !strcmp(key, "cmd") ||
+        !strcmp(key, "brick-index-max") || !strcmp(key, "other-count") ||
+        !strncmp(key, "task", 4))
+        return 0;
+
+    rsp_ctx = data;
+    new_value = data_copy(value);
+    GF_ASSERT(new_value);
+    if (!new_value)
+        goto out;
+
+    /* The field width keeps an over-long key from overrunning brick_key */
+    sscanf(key, "brick%d.%1023s", &index, brick_key);
+
+    if (index > rsp_ctx->brick_index_max) {
+        len = snprintf(new_key, sizeof(new_key), "brick%d.%s",
+                       index + rsp_ctx->other_count, brick_key);
+    } else {
+        len = snprintf(new_key, sizeof(new_key), "%s", key);
+    }
+    /* Reject a key that snprintf() could not format or had to truncate */
+    if (len < 0 || (size_t)len >= sizeof(new_key))
+        goto out;
+
+    ret = dict_setn(rsp_ctx->dict, new_key, len, new_value);
+out:
+    if (ret) {
+        if (new_value)
+            data_unref(new_value);
+        gf_msg("glusterd", GF_LOG_ERROR, 0, GD_MSG_DICT_SET_FAILED,
+               "Unable to set key: %s in dict", key);
+    }
+
+    return 0;
+}
+
+/*
+ * dict_foreach() callback that copies every pair whose key starts with
+ * "task" into the dictionary passed as data; other keys are ignored.
+ *
+ * this - dictionary being iterated (unused).
+ * data - destination dictionary.
+ *
+ * Returns 0 for ignored keys and on success, -1 when the value cannot be
+ * copied, otherwise the non-zero dict_set() result.
+ */
+static int
+glusterd_volume_status_copy_tasks_to_ctx_dict(dict_t *this, char *key,
+                                              data_t *value, void *data)
+{
+    int ret = 0;
+    dict_t *ctx_dict = NULL;
+    data_t *new_value = NULL;
+
+    if (strncmp(key, "task", 4))
+        return 0;
+
+    ctx_dict = data;
+    GF_ASSERT(ctx_dict);
+
+    new_value = data_copy(value);
+    GF_ASSERT(new_value);
+    if (!new_value)
+        return -1;
+
+    ret = dict_set(ctx_dict, key, new_value);
+    /* Release the copy on failure, as glusterd_volume_status_add_peer_rsp
+     * does for its own copy.
+     */
+    if (ret)
+        data_unref(new_value);
+
+    return ret;
+}
+
+/*
+ * Aggregate the task statuses of a peer response into ctx_dict.
+ *
+ * When ctx_dict has no "tasks" entry yet, every "task*" pair of rsp_dict
+ * is copied into it.  Otherwise the task counts of both dictionaries must
+ * match, and for every remote task (except "Replace brick" tasks) the
+ * local task with the same id gets the remote status whenever the remote
+ * status has the same or a higher precedence than the local one.
+ *
+ * Returns 0 on success and non-zero on failure (missing keys, mismatching
+ * task counts, a remote task without a local counterpart, or a failed
+ * dictionary update).
+ */
+int
+glusterd_volume_status_aggregate_tasks_status(dict_t *ctx_dict,
+                                              dict_t *rsp_dict)
+{
+    /* Rebalance has 5 states,
+     * NOT_STARTED, STARTED, STOPPED, COMPLETE, FAILED
+     * The precedence used to determine the aggregate status
+     * is as below,
+     * STARTED > FAILED > STOPPED > COMPLETE > NOT_STARTED
+     * A smaller rank means a higher precedence.  Statuses that are not
+     * listed get rank 0 from the implicit zero initialisation.
+     */
+    /* TODO: Move this to a common place utilities that both
+     * CLI and glusterd need.
+     * Till then if the below algorithm is changed, change
+     * it in cli_xml_output_vol_rebalance_status in
+     * cli-xml-output.c
+     */
+    static const int rank[] = {[GF_DEFRAG_STATUS_STARTED] = 1,
+                               [GF_DEFRAG_STATUS_FAILED] = 2,
+                               [GF_DEFRAG_STATUS_STOPPED] = 3,
+                               [GF_DEFRAG_STATUS_COMPLETE] = 4,
+                               [GF_DEFRAG_STATUS_NOT_STARTED] = 5};
+    const int rank_count = (int)(sizeof(rank) / sizeof(rank[0]));
+    int ret = -1;
+    xlator_t *this = NULL;
+    int local_count = 0;
+    int remote_count = 0;
+    int i = 0;
+    int j = 0;
+    char key[128] = "";
+    int keylen;
+    char *task_type = NULL;
+    int local_status = 0;
+    int remote_status = 0;
+    char *local_task_id = NULL;
+    char *remote_task_id = NULL;
+
+    GF_ASSERT(ctx_dict);
+    GF_ASSERT(rsp_dict);
+
+    this = THIS;
+    GF_ASSERT(this);
+
+    ret = dict_get_int32n(rsp_dict, "tasks", SLEN("tasks"), &remote_count);
+    if (ret) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_DICT_GET_FAILED,
+               "Failed to get remote task count");
+        goto out;
+    }
+    /* Local count will not be present when this is called for the first
+     * time with the origins rsp_dict
+     */
+    ret = dict_get_int32n(ctx_dict, "tasks", SLEN("tasks"), &local_count);
+    if (ret) {
+        ret = dict_foreach(
+            rsp_dict, glusterd_volume_status_copy_tasks_to_ctx_dict, ctx_dict);
+        if (ret)
+            gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_DICT_SET_FAILED,
+                   "Failed to copy tasks "
+                   "to ctx_dict.");
+        goto out;
+    }
+
+    if (local_count != remote_count) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_TASKS_COUNT_MISMATCH,
+               "Local tasks count (%d) and "
+               "remote tasks count (%d) do not match. Not aggregating "
+               "tasks status.",
+               local_count, remote_count);
+        ret = -1;
+        goto out;
+    }
+
+    /* Update the tasks statuses. For every remote tasks, search for the
+     * local task, and update the local task status based on the remote
+     * status.
+     */
+    for (i = 0; i < remote_count; i++) {
+        keylen = snprintf(key, sizeof(key), "task%d.type", i);
+        ret = dict_get_strn(rsp_dict, key, keylen, &task_type);
+        if (ret) {
+            gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_DICT_GET_FAILED,
+                   "Failed to get task type from rsp dict");
+            goto out;
+        }
+
+        /* Skip replace-brick status as it is going to be the same on
+         * all peers. rb_status is set by the replace brick commit
+         * function on all peers based on the replace brick command.
+         * We return the value of rb_status as the status for a
+         * replace-brick task in a 'volume status' command.
+         */
+        if (!strcmp(task_type, "Replace brick"))
+            continue;
+
+        keylen = snprintf(key, sizeof(key), "task%d.status", i);
+        ret = dict_get_int32n(rsp_dict, key, keylen, &remote_status);
+        if (ret) {
+            gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_DICT_GET_FAILED,
+                   "Failed to get task status from rsp dict");
+            goto out;
+        }
+        keylen = snprintf(key, sizeof(key), "task%d.id", i);
+        ret = dict_get_strn(rsp_dict, key, keylen, &remote_task_id);
+        if (ret) {
+            gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_DICT_GET_FAILED,
+                   "Failed to get task id from rsp dict");
+            goto out;
+        }
+        for (j = 0; j < local_count; j++) {
+            keylen = snprintf(key, sizeof(key), "task%d.id", j);
+            ret = dict_get_strn(ctx_dict, key, keylen, &local_task_id);
+            if (ret) {
+                gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_DICT_GET_FAILED,
+                       "Failed to get local task-id");
+                goto out;
+            }
+
+            if (strncmp(remote_task_id, local_task_id,
+                        strlen(remote_task_id))) {
+                /* Quit if a matching local task is not found */
+                if (j == (local_count - 1)) {
+                    gf_msg(this->name, GF_LOG_ERROR, 0,
+                           GD_MSG_TASKS_COUNT_MISMATCH,
+                           "Could not find matching local "
+                           "task for task %s",
+                           remote_task_id);
+                    /* ret is 0 from the lookup above; report the failure */
+                    ret = -1;
+                    goto out;
+                }
+                continue;
+            }
+
+            /* key/keylen keep naming the local status entry until the
+             * update below.
+             */
+            keylen = snprintf(key, sizeof(key), "task%d.status", j);
+            ret = dict_get_int32n(ctx_dict, key, keylen, &local_status);
+            if (ret) {
+                gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_DICT_GET_FAILED,
+                       "Failed to get local task status");
+                goto out;
+            }
+
+            ret = 0;
+            /* Both statuses come from dictionaries; never index the rank
+             * table with a value outside of it.
+             */
+            if (remote_status < 0 || remote_status >= rank_count ||
+                local_status < 0 || local_status >= rank_count) {
+                gf_msg_debug(this->name, 0,
+                             "Unexpected task status (local %d, remote %d), "
+                             "keeping the local status",
+                             local_status, remote_status);
+                break;
+            }
+
+            if (rank[remote_status] <= rank[local_status])
+                ret = dict_set_int32n(ctx_dict, key, keylen, remote_status);
+            if (ret) {
+                gf_msg(this->name, GF_LOG_ERROR, 0,
+                       GD_MSG_TASK_STATUS_UPDATE_FAIL,
+                       "Failed to "
+                       "update task status");
+                goto out;
+            }
+            break;
+        }
+    }
+
+out:
+    return ret;
+}
+
+/*
+ * Tell whether a status command carries task information: true only when
+ * the bits selected by GF_CLI_STATUS_MASK equal GF_CLI_STATUS_NONE and the
+ * GF_CLI_STATUS_VOL bit is set in cmd.
+ */
+gf_boolean_t
+glusterd_status_has_tasks(int cmd)
+{
+    if ((cmd & GF_CLI_STATUS_MASK) != GF_CLI_STATUS_NONE)
+        return _gf_false;
+
+    return (cmd & GF_CLI_STATUS_VOL) ? _gf_true : _gf_false;
+}
+
+/*
+ * Merge a peer's volume status response (rsp_dict) into the operation
+ * context dictionary.
+ *
+ * aggr     - destination dictionary; when NULL the dictionary returned by
+ *            glusterd_op_get_ctx() is used.
+ * rsp_dict - response to merge.
+ *
+ * Steps, driven by the "cmd" value stored in the destination:
+ *  - for GF_CLI_STATUS_ALL on the originator, "vol_count" and the
+ *    "vol<N>" names are copied;
+ *  - unless GF_CLI_STATUS_TASKS is set, the per-brick pairs are merged
+ *    through glusterd_volume_status_add_peer_rsp() and "count",
+ *    "other-count", "brick-index-max", "hot_brick_count" and "type" are
+ *    updated;
+ *  - task statuses are aggregated when the command carries tasks.
+ *
+ * Returns 0 on success, non-zero on failure.
+ */
+int
+glusterd_volume_status_copy_to_op_ctx_dict(dict_t *aggr, dict_t *rsp_dict)
+{
+    int ret = 0;
+    glusterd_status_rsp_conv_t rsp_ctx = {0};
+    int32_t cmd = GF_CLI_STATUS_NONE;
+    int32_t node_count = 0;
+    int32_t other_count = 0;
+    int32_t brick_index_max = -1;
+    int32_t hot_brick_count = -1;
+    int32_t type = -1;
+    int32_t rsp_node_count = 0;
+    int32_t rsp_other_count = 0;
+    int vol_count = -1;
+    int i = 0;
+    dict_t *ctx_dict = NULL;
+    char key[64] = "";
+    int keylen;
+    char *volname = NULL;
+    glusterd_volinfo_t *volinfo = NULL;
+
+    GF_ASSERT(rsp_dict);
+    xlator_t *this = THIS;
+    GF_ASSERT(this);
+
+    if (aggr) {
+        ctx_dict = aggr;
+
+    } else {
+        ctx_dict = glusterd_op_get_ctx(GD_OP_STATUS_VOLUME);
+    }
+
+    ret = dict_get_int32n(ctx_dict, "cmd", SLEN("cmd"), &cmd);
+    if (ret) {
+        gf_smsg(this->name, GF_LOG_ERROR, 0, GD_MSG_DICT_GET_FAILED, "Key=cmd",
+                NULL);
+        goto out;
+    }
+
+    if (cmd & GF_CLI_STATUS_ALL && is_origin_glusterd(ctx_dict)) {
+        ret = dict_get_int32n(rsp_dict, "vol_count", SLEN("vol_count"),
+                              &vol_count);
+        if (ret == 0) {
+            ret = dict_set_int32n(ctx_dict, "vol_count", SLEN("vol_count"),
+                                  vol_count);
+            if (ret) {
+                gf_smsg(this->name, GF_LOG_ERROR, 0, GD_MSG_DICT_SET_FAILED,
+                        "Key=vol_count", NULL);
+                goto out;
+            }
+
+            for (i = 0; i < vol_count; i++) {
+                keylen = snprintf(key, sizeof(key), "vol%d", i);
+                ret = dict_get_strn(rsp_dict, key, keylen, &volname);
+                if (ret) {
+                    gf_smsg(this->name, GF_LOG_ERROR, 0, GD_MSG_DICT_GET_FAILED,
+                            "Key=%s", key, NULL);
+                    goto out;
+                }
+
+                /* Store a private copy, like the other aggregation
+                 * helpers do, so that ctx_dict does not keep pointing at
+                 * a string obtained from rsp_dict.
+                 */
+                ret = dict_set_dynstr_with_alloc(ctx_dict, key, volname);
+                if (ret) {
+                    gf_smsg(this->name, GF_LOG_ERROR, 0, GD_MSG_DICT_SET_FAILED,
+                            "Key=%s", key, NULL);
+                    goto out;
+                }
+            }
+        } else {
+            /* Ignore the error as still the aggregation applies in
+             * case its a task sub command */
+            ret = 0;
+        }
+    }
+
+    if ((cmd & GF_CLI_STATUS_TASKS) != 0)
+        goto aggregate_tasks;
+
+    ret = dict_get_int32n(rsp_dict, "count", SLEN("count"), &rsp_node_count);
+    if (ret) {
+        gf_smsg(this->name, GF_LOG_INFO, 0, GD_MSG_DICT_GET_FAILED, "Key=count",
+                NULL);
+        ret = 0;  // no bricks in the rsp
+        goto out;
+    }
+
+    ret = dict_get_int32n(rsp_dict, "other-count", SLEN("other-count"),
+                          &rsp_other_count);
+    if (ret) {
+        gf_smsg(this->name, GF_LOG_ERROR, 0, GD_MSG_DICT_GET_FAILED,
+                "Key=other-count", NULL);
+        goto out;
+    }
+
+    /* Both counters are optional in ctx_dict; a failed lookup leaves them
+     * at 0, so the results are deliberately not checked.
+     */
+    ret = dict_get_int32n(ctx_dict, "count", SLEN("count"), &node_count);
+    ret = dict_get_int32n(ctx_dict, "other-count", SLEN("other-count"),
+                          &other_count);
+    if (!dict_getn(ctx_dict, "brick-index-max", SLEN("brick-index-max"))) {
+        /* First response: take "brick-index-max" over from rsp_dict */
+        ret = dict_get_int32n(rsp_dict, "brick-index-max",
+                              SLEN("brick-index-max"), &brick_index_max);
+        if (ret) {
+            gf_smsg(this->name, GF_LOG_ERROR, 0, GD_MSG_DICT_GET_FAILED,
+                    "Key=brick-index-max", NULL);
+            goto out;
+        }
+        ret = dict_set_int32n(ctx_dict, "brick-index-max",
+                              SLEN("brick-index-max"), brick_index_max);
+        if (ret) {
+            gf_smsg(this->name, GF_LOG_ERROR, 0, GD_MSG_DICT_SET_FAILED,
+                    "Key=brick-index-max", NULL);
+            goto out;
+        }
+
+    } else {
+        ret = dict_get_int32n(ctx_dict, "brick-index-max",
+                              SLEN("brick-index-max"), &brick_index_max);
+        if (ret) {
+            gf_smsg(this->name, GF_LOG_ERROR, 0, GD_MSG_DICT_GET_FAILED,
+                    "Key=brick-index-max", NULL);
+            goto out;
+        }
+    }
+
+    rsp_ctx.count = node_count;
+    rsp_ctx.brick_index_max = brick_index_max;
+    rsp_ctx.other_count = other_count;
+    rsp_ctx.dict = ctx_dict;
+
+    dict_foreach(rsp_dict, glusterd_volume_status_add_peer_rsp, &rsp_ctx);
+
+    ret = dict_set_int32n(ctx_dict, "count", SLEN("count"),
+                          node_count + rsp_node_count);
+    if (ret) {
+        gf_smsg(this->name, GF_LOG_ERROR, 0, GD_MSG_DICT_SET_FAILED,
+                "Key=count", NULL);
+        goto out;
+    }
+
+    ret = dict_set_int32n(ctx_dict, "other-count", SLEN("other-count"),
+                          (other_count + rsp_other_count));
+    if (ret) {
+        gf_smsg(this->name, GF_LOG_ERROR, 0, GD_MSG_DICT_SET_FAILED,
+                "Key=other-count", NULL);
+        goto out;
+    }
+
+    ret = dict_get_strn(ctx_dict, "volname", SLEN("volname"), &volname);
+    if (ret) {
+        /* This is a failed lookup, so report it as a get failure */
+        gf_smsg(this->name, GF_LOG_ERROR, 0, GD_MSG_DICT_GET_FAILED,
+                "Key=volname", NULL);
+        goto out;
+    }
+
+    /* Only the existence of the volume is checked here */
+    ret = glusterd_volinfo_find(volname, &volinfo);
+    if (ret) {
+        gf_smsg(this->name, GF_LOG_ERROR, 0, GD_MSG_VOLINFO_GET_FAIL,
+                "Volume=%s", volname, NULL);
+        goto out;
+    }
+
+    /* hot_brick_count and type are never changed from -1 in this function */
+    ret = dict_set_int32n(ctx_dict, "hot_brick_count", SLEN("hot_brick_count"),
+                          hot_brick_count);
+    if (ret) {
+        gf_smsg(this->name, GF_LOG_ERROR, errno, GD_MSG_DICT_SET_FAILED,
+                "Key=hot_brick_count", NULL);
+        goto out;
+    }
+
+    ret = dict_set_int32n(ctx_dict, "type", SLEN("type"), type);
+    if (ret) {
+        gf_smsg(this->name, GF_LOG_ERROR, errno, GD_MSG_DICT_SET_FAILED,
+                "Key=type", NULL);
+        goto out;
+    }
+
+aggregate_tasks:
+    /* Tasks are only present for a normal status command for a volume or
+     * for an explicit tasks status command for a volume
+     */
+    if (!(cmd & GF_CLI_STATUS_ALL) &&
+        (((cmd & GF_CLI_STATUS_TASKS) != 0) || glusterd_status_has_tasks(cmd)))
+        ret = glusterd_volume_status_aggregate_tasks_status(ctx_dict, rsp_dict);
+
+out:
+    return ret;
+}
+
+/*
+ * Fold the "max-opversion" of src into dst.
+ *
+ * dst ends up holding the smaller of the two values; when dst has no
+ * value yet, the value of src is taken as is.
+ *
+ * Returns 0 on success, non-zero when either dictionary is NULL, src has
+ * no "max-opversion", or dst cannot be updated.
+ */
+int
+glusterd_max_opversion_use_rsp_dict(dict_t *dst, dict_t *src)
+{
+    xlator_t *this = THIS;
+    int ret = -1;
+    int dst_version = -1;
+    int src_version = -1;
+
+    GF_VALIDATE_OR_GOTO(this->name, dst, out);
+    GF_VALIDATE_OR_GOTO(this->name, src, out);
+
+    /* A missing value in dst is logged but tolerated: dst_version stays -1 */
+    if (dict_get_int32n(dst, "max-opversion", SLEN("max-opversion"),
+                        &dst_version)) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_DICT_GET_FAILED,
+               "Maximum supported op-version not set in destination "
+               "dictionary");
+    }
+
+    ret = dict_get_int32n(src, "max-opversion", SLEN("max-opversion"),
+                          &src_version);
+    if (ret) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_DICT_GET_FAILED,
+               "Failed to get maximum supported op-version from source");
+        goto out;
+    }
+
+    if (dst_version == -1 || src_version < dst_version)
+        dst_version = src_version;
+
+    ret = dict_set_int32n(dst, "max-opversion", SLEN("max-opversion"),
+                          dst_version);
+    if (ret)
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_DICT_SET_FAILED,
+               "Failed to set max op-version");
+
+out:
+    return ret;
+}
+
+/*
+ * Aggregate a scrub status response (rsp_dict) into aggr.
+ *
+ * The per-node entries of rsp_dict are keyed with the suffix
+ * "-<src_count>", where src_count is rsp_dict's "count"; they are stored
+ * in aggr with the suffix "-<src_count + dst_count>", where dst_count is
+ * aggr's "count" before the merge.  aggr's "count" is set to the sum.
+ * The volume-wide entries ("bitrot_log_file", "scrub_log_file",
+ * "features.scrub-freq", "features.scrub-throttle", "features.scrub") are
+ * copied under their original keys.
+ *
+ * Failures to copy a per-node entry are only logged; failures to copy a
+ * volume-wide entry end the function.
+ *
+ * Returns 0 on success (also when rsp_dict has no "count"), non-zero when
+ * the volume cannot be determined or a volume-wide entry cannot be stored.
+ */
+int
+glusterd_volume_bitrot_scrub_use_rsp_dict(dict_t *aggr, dict_t *rsp_dict)
+{
+    int ret = -1;
+    int j = 0;
+    uint64_t value = 0;
+    char key[64] = "";
+    int keylen;
+    char *last_scrub_time = NULL;
+    char *volname = NULL;
+    char *node_uuid = NULL;
+    char *bitd_log = NULL;
+    char *scrub_log = NULL;
+    char *scrub_freq = NULL;
+    char *scrub_state = NULL;
+    char *scrub_impact = NULL;
+    char *bad_gfid_str = NULL;
+    xlator_t *this = NULL;
+    glusterd_conf_t *priv = NULL;
+    glusterd_volinfo_t *volinfo = NULL;
+    int src_count = 0;
+    int dst_count = 0;
+    int8_t scrub_running = 0;
+
+    this = THIS;
+    GF_ASSERT(this);
+
+    priv = this->private;
+    GF_ASSERT(priv);
+
+    ret = dict_get_strn(aggr, "volname", SLEN("volname"), &volname);
+    if (ret) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_DICT_GET_FAILED,
+               "Unable to get volume name");
+        goto out;
+    }
+
+    /* Only the existence of the volume is checked here */
+    ret = glusterd_volinfo_find(volname, &volinfo);
+    if (ret) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_VOL_NOT_FOUND,
+               "Unable to find volinfo for volume: %s", volname);
+        goto out;
+    }
+
+    /* "count" is optional in aggr; a failed lookup leaves dst_count at 0 */
+    ret = dict_get_int32n(aggr, "count", SLEN("count"), &dst_count);
+
+    ret = dict_get_int32n(rsp_dict, "count", SLEN("count"), &src_count);
+    if (ret) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_DICT_GET_FAILED,
+               "failed to get count value");
+        ret = 0;
+        goto out;
+    }
+
+    ret = dict_set_int32n(aggr, "count", SLEN("count"), src_count + dst_count);
+    if (ret)
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_DICT_SET_FAILED,
+               "Failed to set count in dictonary");
+
+    keylen = snprintf(key, sizeof(key), "node-uuid-%d", src_count);
+    ret = dict_get_strn(rsp_dict, key, keylen, &node_uuid);
+    if (!ret) {
+        snprintf(key, sizeof(key), "node-uuid-%d", src_count + dst_count);
+        /* Let the dict helper duplicate the string, as is done for the
+         * other strings below, instead of handing over an unchecked
+         * gf_strdup() result.
+         */
+        ret = dict_set_dynstr_with_alloc(aggr, key, node_uuid);
+        if (ret) {
+            gf_msg_debug(this->name, 0, "failed to set node-uuid");
+        }
+    }
+
+    snprintf(key, sizeof(key), "scrub-running-%d", src_count);
+    ret = dict_get_int8(rsp_dict, key, &scrub_running);
+    if (!ret) {
+        snprintf(key, sizeof(key), "scrub-running-%d", src_count + dst_count);
+        ret = dict_set_int8(aggr, key, scrub_running);
+        if (ret) {
+            gf_msg_debug(this->name, 0,
+                         "Failed to set "
+                         "scrub-running value");
+        }
+    }
+
+    snprintf(key, sizeof(key), "scrubbed-files-%d", src_count);
+    ret = dict_get_uint64(rsp_dict, key, &value);
+    if (!ret) {
+        snprintf(key, sizeof(key), "scrubbed-files-%d", src_count + dst_count);
+        ret = dict_set_uint64(aggr, key, value);
+        if (ret) {
+            gf_msg_debug(this->name, 0,
+                         "Failed to set "
+                         "scrubbed-file value");
+        }
+    }
+
+    snprintf(key, sizeof(key), "unsigned-files-%d", src_count);
+    ret = dict_get_uint64(rsp_dict, key, &value);
+    if (!ret) {
+        snprintf(key, sizeof(key), "unsigned-files-%d", src_count + dst_count);
+        ret = dict_set_uint64(aggr, key, value);
+        if (ret) {
+            gf_msg_debug(this->name, 0,
+                         "Failed to set "
+                         "unsigned-file value");
+        }
+    }
+
+    keylen = snprintf(key, sizeof(key), "last-scrub-time-%d", src_count);
+    ret = dict_get_strn(rsp_dict, key, keylen, &last_scrub_time);
+    if (!ret) {
+        snprintf(key, sizeof(key), "last-scrub-time-%d",
+                 src_count + dst_count);
+        /* Same reasoning as for node-uuid above */
+        ret = dict_set_dynstr_with_alloc(aggr, key, last_scrub_time);
+        if (ret) {
+            gf_msg_debug(this->name, 0,
+                         "Failed to set "
+                         "last scrub time value");
+        }
+    }
+
+    snprintf(key, sizeof(key), "scrub-duration-%d", src_count);
+    ret = dict_get_uint64(rsp_dict, key, &value);
+    if (!ret) {
+        snprintf(key, sizeof(key), "scrub-duration-%d", src_count + dst_count);
+        ret = dict_set_uint64(aggr, key, value);
+        if (ret) {
+            gf_msg_debug(this->name, 0,
+                         "Failed to set "
+                         "scrubbed-duration value");
+        }
+    }
+
+    snprintf(key, sizeof(key), "error-count-%d", src_count);
+    ret = dict_get_uint64(rsp_dict, key, &value);
+    if (!ret) {
+        snprintf(key, sizeof(key), "error-count-%d", src_count + dst_count);
+        ret = dict_set_uint64(aggr, key, value);
+        if (ret) {
+            gf_msg_debug(this->name, 0,
+                         "Failed to set error "
+                         "count value");
+        }
+
+        /* Storing all the bad files in the dictionary; value still holds
+         * the error count read above.
+         */
+        for (j = 0; j < value; j++) {
+            keylen = snprintf(key, sizeof(key), "quarantine-%d-%d", j,
+                              src_count);
+            ret = dict_get_strn(rsp_dict, key, keylen, &bad_gfid_str);
+            if (!ret) {
+                snprintf(key, sizeof(key), "quarantine-%d-%d", j,
+                         src_count + dst_count);
+                ret = dict_set_dynstr_with_alloc(aggr, key, bad_gfid_str);
+                if (ret) {
+                    gf_msg_debug(this->name, 0,
+                                 "Failed to set "
+                                 "bad file gfid");
+                }
+            }
+        }
+    }
+
+    ret = dict_get_strn(rsp_dict, "bitrot_log_file", SLEN("bitrot_log_file"),
+                        &bitd_log);
+    if (!ret) {
+        ret = dict_set_dynstr_with_alloc(aggr, "bitrot_log_file", bitd_log);
+        if (ret) {
+            gf_msg_debug(this->name, 0,
+                         "Failed to set "
+                         "bitrot log file location");
+            goto out;
+        }
+    }
+
+    ret = dict_get_strn(rsp_dict, "scrub_log_file", SLEN("scrub_log_file"),
+                        &scrub_log);
+    if (!ret) {
+        ret = dict_set_dynstr_with_alloc(aggr, "scrub_log_file", scrub_log);
+        if (ret) {
+            gf_msg_debug(this->name, 0,
+                         "Failed to set "
+                         "scrubber log file location");
+            goto out;
+        }
+    }
+
+    ret = dict_get_strn(rsp_dict, "features.scrub-freq",
+                        SLEN("features.scrub-freq"), &scrub_freq);
+    if (!ret) {
+        ret = dict_set_dynstr_with_alloc(aggr, "features.scrub-freq",
+                                         scrub_freq);
+        if (ret) {
+            gf_msg_debug(this->name, 0,
+                         "Failed to set "
+                         "scrub-frequency value to dictionary");
+            goto out;
+        }
+    }
+
+    ret = dict_get_strn(rsp_dict, "features.scrub-throttle",
+                        SLEN("features.scrub-throttle"), &scrub_impact);
+    if (!ret) {
+        ret = dict_set_dynstr_with_alloc(aggr, "features.scrub-throttle",
+                                         scrub_impact);
+        if (ret) {
+            gf_msg_debug(this->name, 0,
+                         "Failed to set "
+                         "scrub-throttle value to dictionary");
+            goto out;
+        }
+    }
+
+    ret = dict_get_strn(rsp_dict, "features.scrub", SLEN("features.scrub"),
+                        &scrub_state);
+    if (!ret) {
+        ret = dict_set_dynstr_with_alloc(aggr, "features.scrub", scrub_state);
+        if (ret) {
+            gf_msg_debug(this->name, 0,
+                         "Failed to set "
+                         "scrub state value to dictionary");
+            goto out;
+        }
+    }
+
+    ret = 0;
+out:
+    return ret;
+}
+
+int
+glusterd_bitrot_volume_node_rsp(dict_t *aggr, dict_t *rsp_dict)
+{
+    int ret = -1;
+    uint64_t value = 0;
+    char key[64] = "";
+    int keylen;
+    char buf[1024] = "";
+    int32_t i = 0;
+    int32_t j = 0;
+    char *last_scrub_time = NULL;
+    char *scrub_time = NULL;
+    char *volname = NULL;
+    char *scrub_freq = NULL;
+    char *scrub_state = NULL;
+    char *scrub_impact = NULL;
+    char *bad_gfid_str = NULL;
+    xlator_t *this = NULL;
+    glusterd_conf_t *priv = NULL;
+    glusterd_volinfo_t *volinfo = NULL;
+    int8_t scrub_running = 0;
+
+    this = THIS;
+    GF_ASSERT(this);
+
+    priv = this->private;
+    GF_ASSERT(priv);
+
+    ret = dict_set_strn(aggr, "bitrot_log_file", SLEN("bitrot_log_file"),
+                        priv->bitd_svc.proc.logfile);
+    if (ret) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_DICT_SET_FAILED,
+               "Failed to set bitrot log file location");
+        goto out;
+    }
+
+    ret = dict_set_strn(aggr, "scrub_log_file", SLEN("scrub_log_file"),
+                        priv->scrub_svc.proc.logfile);
+    if (ret) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_DICT_SET_FAILED,
+               "Failed to set scrubber log file location");
+        goto out;
+    }
+
+    ret = dict_get_strn(aggr, "volname", SLEN("volname"), &volname);
+    if (ret) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_DICT_GET_FAILED,
+               "Unable to get volume name");
+        goto out;
+    }
+
+    ret = glusterd_volinfo_find(volname, &volinfo);
+    if (ret) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_VOL_NOT_FOUND,
+               "Unable to find volinfo for volume: %s", volname);
+        goto out;
+    }
+
+    ret = dict_get_int32n(aggr, "count", SLEN("count"), &i);
+    i++;
+
+    ret = dict_set_int32n(aggr, "count", SLEN("count"), i);
+    if (ret)
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_DICT_SET_FAILED,
+               "Failed to set count");
+
+    snprintf(buf, sizeof(buf), "%s", uuid_utoa(MY_UUID));
+
+    snprintf(key, sizeof(key), "node-uuid-%d", i);
+    ret = dict_set_dynstr_with_alloc(aggr, key, buf);
+    if (ret)
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_DICT_SET_FAILED,
+               "failed to set node-uuid");
+
+    ret = dict_get_strn(volinfo->dict, "features.scrub-freq",
+                        SLEN("features.scrub-freq"), &scrub_freq);
+    if (!ret) {
+        ret = dict_set_strn(aggr, "features.scrub-freq",
+                            SLEN("features.scrub-freq"), scrub_freq);
+        if (ret) {
+            gf_msg_debug(this->name, 0,
+                         "Failed to set "
+                         "scrub-frequency value to dictionary");
+        }
+    } else {
+        /* By Default scrub-frequency is bi-weekly. So when user
+         * enable bitrot then scrub-frequency value will not be
+         * present in volinfo->dict. Setting by-default value of
+         * scrub-frequency explicitly for presenting it to scrub
+         * status.
+         */
+        ret = dict_set_dynstr_with_alloc(aggr, "features.scrub-freq",
+                                         "biweekly");
+        if (ret) {
+            gf_msg_debug(this->name, 0,
+                         "Failed to set "
+                         "scrub-frequency value to dictionary");
+        }
+    }
+
+    ret = dict_get_strn(volinfo->dict, "features.scrub-throttle",
+                        SLEN("features.scrub-throttle"), &scrub_impact);
+    if (!ret) {
+        ret = dict_set_strn(aggr, "features.scrub-throttle",
+                            SLEN("features.scrub-throttle"), scrub_impact);
+        if (ret) {
+            gf_msg_debug(this->name, 0,
+                         "Failed to set "
+                         "scrub-throttle value to dictionary");
+        }
+    } else {
+        /* By Default scrub-throttle is lazy. So when user
+         * enable bitrot then scrub-throttle value will not be
+         * present in volinfo->dict. Setting by-default value of
+         * scrub-throttle explicitly for presenting it to
+         * scrub status.
+         */
+        ret = dict_set_dynstr_with_alloc(aggr, "features.scrub-throttle",
+                                         "lazy");
+        if (ret) {
+            gf_msg_debug(this->name, 0,
+                         "Failed to set "
+                         "scrub-throttle value to dictionary");
+        }
+    }
+
+    ret = dict_get_strn(volinfo->dict, "features.scrub", SLEN("features.scrub"),
+                        &scrub_state);
+    if (!ret) {
+        ret = dict_set_strn(aggr, "features.scrub", SLEN("features.scrub"),
+                            scrub_state);
+        if (ret) {
+            gf_msg_debug(this->name, 0,
+                         "Failed to set "
+                         "scrub state value to dictionary");
+        }
+    }
+
+    ret = dict_get_int8(rsp_dict, "scrub-running", &scrub_running);
+    if (!ret) {
+        snprintf(key, sizeof(key), "scrub-running-%d", i);
+        ret = dict_set_uint64(aggr, key, scrub_running);
+        if (ret) {
+            gf_msg_debug(this->name, 0,
+                         "Failed to set "
+                         "scrub-running value");
+        }
+    }
+
+    ret = dict_get_uint64(rsp_dict, "scrubbed-files", &value);
+    if (!ret) {
+        snprintf(key, sizeof(key), "scrubbed-files-%d", i);
+        ret = dict_set_uint64(aggr, key, value);
+        if (ret) {
+            gf_msg_debug(this->name, 0,
+                         "Failed to set "
+                         "scrubbed-file value");
+        }
+    }
+
+    ret = dict_get_uint64(rsp_dict, "unsigned-files", &value);
+    if (!ret) {
+        snprintf(key, sizeof(key), "unsigned-files-%d", i);
+        ret = dict_set_uint64(aggr, key, value);
+        if (ret) {
+            gf_msg_debug(this->name, 0,
+                         "Failed to set "
+                         "unsigned-file value");
+        }
+    }
+
+    ret = dict_get_strn(rsp_dict, "last-scrub-time", SLEN("last-scrub-time"),
+                        &last_scrub_time);
+    if (!ret) {
+        keylen = snprintf(key, sizeof(key), "last-scrub-time-%d", i);
+
+        scrub_time = gf_strdup(last_scrub_time);
+        ret = dict_set_dynstrn(aggr, key, keylen, scrub_time);
+        if (ret) {
+            gf_msg_debug(this->name, 0,
+                         "Failed to set "
+                         "last scrub time value");
+        }
+    }
+
+    ret = dict_get_uint64(rsp_dict, "scrub-duration", &value);
+    if (!ret) {
+        snprintf(key, sizeof(key), "scrub-duration-%d", i);
+        ret = dict_set_uint64(aggr, key, value);
+        if (ret) {
+            gf_msg_debug(this->name, 0,
+                         "Failed to set "
+                         "scrubbed-duration value");
+        }
+    }
+
+    ret = dict_get_uint64(rsp_dict, "total-count", &value);
+    if (!ret) {
+        snprintf(key, sizeof(key), "error-count-%d", i);
+        ret = dict_set_uint64(aggr, key, value);
+        if (ret) {
+            gf_msg_debug(this->name, 0,
+                         "Failed to set error "
+                         "count value");
+        }
+
+        /* Storing all the bad files in the dictionary */
+        for (j = 0; j < value; j++) {
+            keylen = snprintf(key, sizeof(key), "quarantine-%d", j);
+            ret = dict_get_strn(rsp_dict, key, keylen, &bad_gfid_str);
+            if (!ret) {
+                snprintf(key, sizeof(key), "quarantine-%d-%d", j, i);
+                ret = dict_set_dynstr_with_alloc(aggr, key, bad_gfid_str);
+                if (ret) {
+                    gf_msg_debug(this->name, 0,
+                                 "Failed to"
+                                 "bad file gfid ");
+                }
+            }
+        }
+    }
+
+    ret = 0;
+out:
+    return ret;
+}
+
+/* Copy the uint64 counter stored in src under "<name>-<src_idx>" into dst
+ * under "<name>-<dst_idx>". A missing source key is skipped silently; a
+ * failed store is only reported at debug level using fail_msg.
+ */
+static void
+gd_rebalance_copy_uint64(dict_t *dst, dict_t *src, const char *name,
+                         int32_t src_idx, int32_t dst_idx,
+                         const char *fail_msg)
+{
+    char key[64] = "";
+    uint64_t value = 0;
+
+    snprintf(key, sizeof(key), "%s-%d", name, src_idx);
+    if (dict_get_uint64(src, key, &value))
+        return;
+
+    snprintf(key, sizeof(key), "%s-%d", name, dst_idx);
+    if (dict_set_uint64(dst, key, value))
+        gf_msg_debug(THIS->name, 0, "%s", fail_msg);
+}
+
+/* Merge one node's rebalance status (rsp_dict) into the aggregate dict.
+ *
+ * The response carries its values under keys suffixed with the response's
+ * own "count" value. They are re-keyed in aggr with a suffix derived from
+ * the position of the responding node's uuid in conf->peers, and aggr's
+ * "count" is raised to the largest such suffix seen so far.
+ *
+ * @aggr:     aggregate dictionary; must contain "volname".
+ * @rsp_dict: response dictionary from a single node.
+ *
+ * Returns 0 when aggr is NULL (only an error is logged) and after the
+ * merge, where individual copy failures are logged but not propagated.
+ * Returns the failing call's non-zero code when "volname" is missing or
+ * the volume cannot be found.
+ */
+int
+glusterd_volume_rebalance_use_rsp_dict(dict_t *aggr, dict_t *rsp_dict)
+{
+    char key[64] = "";
+    int keylen;
+    char *node_uuid = NULL;
+    char *node_uuid_str = NULL;
+    char *volname = NULL;
+    dict_t *ctx_dict = NULL;
+    double elapsed_time = 0;
+    glusterd_conf_t *conf = NULL;
+    glusterd_peerinfo_t *peerinfo = NULL;
+    glusterd_volinfo_t *volinfo = NULL;
+    int ret = 0;
+    int32_t index = 0;
+    int32_t count = 0;
+    int32_t current_index = 1;
+    int32_t value32 = 0;
+    char *peer_uuid_str = NULL;
+    xlator_t *this = NULL;
+
+    GF_ASSERT(rsp_dict);
+    this = THIS;
+    GF_ASSERT(this);
+    conf = this->private;
+
+    /* Below op-version 6.0 the peer numbering starts at 2 instead of 1. */
+    if (conf->op_version < GD_OP_VERSION_6_0)
+        current_index = 2;
+    if (aggr) {
+        ctx_dict = aggr;
+
+    } else {
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_OPCTX_GET_FAIL,
+               "Operation Context is not present");
+        goto out;
+    }
+
+    ret = dict_get_strn(ctx_dict, "volname", SLEN("volname"), &volname);
+    if (ret) {
+        gf_msg("glusterd", GF_LOG_ERROR, 0, GD_MSG_DICT_GET_FAILED,
+               "Unable to get volume name");
+        goto out;
+    }
+
+    /* Only used to verify that the volume exists. */
+    ret = glusterd_volinfo_find(volname, &volinfo);
+
+    if (ret)
+        goto out;
+
+    ret = dict_get_int32n(rsp_dict, "count", SLEN("count"), &index);
+    if (ret)
+        gf_msg("glusterd", GF_LOG_ERROR, 0, GD_MSG_DICT_GET_FAILED,
+               "failed to get index from rsp dict");
+
+    keylen = snprintf(key, sizeof(key), "node-uuid-%d", index);
+    ret = dict_get_strn(rsp_dict, key, keylen, &node_uuid);
+    if (!ret) {
+        /* Finding the index of the node-uuid in the peer-list. The string
+         * owned by rsp_dict is compared directly, so a failed allocation
+         * can no longer hand a NULL pointer to strcmp(). */
+        RCU_READ_LOCK;
+        cds_list_for_each_entry_rcu(peerinfo, &conf->peers, uuid_list)
+        {
+            peer_uuid_str = gd_peer_uuid_str(peerinfo);
+            if (strcmp(peer_uuid_str, node_uuid) == 0)
+                break;
+
+            current_index++;
+        }
+        RCU_READ_UNLOCK;
+
+        /* Setting the largest index value as the total count. */
+        ret = dict_get_int32n(ctx_dict, "count", SLEN("count"), &count);
+        if (count < current_index) {
+            ret = dict_set_int32n(ctx_dict, "count", SLEN("count"),
+                                  current_index);
+            if (ret)
+                gf_msg("glusterd", GF_LOG_ERROR, 0, GD_MSG_DICT_SET_FAILED,
+                       "Failed to set count");
+        }
+
+        /* Setting the same index for the node, as is in the peerlist.*/
+        keylen = snprintf(key, sizeof(key), "node-uuid-%d", current_index);
+        node_uuid_str = gf_strdup(node_uuid);
+        if (!node_uuid_str ||
+            dict_set_dynstrn(ctx_dict, key, keylen, node_uuid_str)) {
+            gf_msg_debug(THIS->name, 0, "failed to set node-uuid");
+        }
+    }
+
+    gd_rebalance_copy_uint64(ctx_dict, rsp_dict, "files", index, current_index,
+                             "failed to set the file count");
+    gd_rebalance_copy_uint64(ctx_dict, rsp_dict, "size", index, current_index,
+                             "failed to set the size of migration");
+    gd_rebalance_copy_uint64(ctx_dict, rsp_dict, "lookups", index,
+                             current_index,
+                             "failed to set looked up file count");
+
+    /* status is an int32, so it cannot go through the uint64 helper */
+    keylen = snprintf(key, sizeof(key), "status-%d", index);
+    ret = dict_get_int32n(rsp_dict, key, keylen, &value32);
+    if (!ret) {
+        keylen = snprintf(key, sizeof(key), "status-%d", current_index);
+        ret = dict_set_int32n(ctx_dict, key, keylen, value32);
+        if (ret) {
+            gf_msg_debug(THIS->name, 0, "failed to set status");
+        }
+    }
+
+    gd_rebalance_copy_uint64(ctx_dict, rsp_dict, "failures", index,
+                             current_index, "failed to set failure count");
+    gd_rebalance_copy_uint64(ctx_dict, rsp_dict, "skipped", index,
+                             current_index, "failed to set skipped count");
+
+    /* run-time is stored as a double */
+    snprintf(key, sizeof(key), "run-time-%d", index);
+    ret = dict_get_double(rsp_dict, key, &elapsed_time);
+    if (!ret) {
+        snprintf(key, sizeof(key), "run-time-%d", current_index);
+        ret = dict_set_double(ctx_dict, key, elapsed_time);
+        if (ret) {
+            gf_msg_debug(THIS->name, 0, "failed to set run-time");
+        }
+    }
+
+    gd_rebalance_copy_uint64(ctx_dict, rsp_dict, "time-left", index,
+                             current_index, "failed to set time-left");
+    gd_rebalance_copy_uint64(ctx_dict, rsp_dict, "demoted", index,
+                             current_index, "failed to set demoted count");
+    gd_rebalance_copy_uint64(ctx_dict, rsp_dict, "promoted", index,
+                             current_index, "failed to set promoted count");
+
+    /* Per-key failures above are best effort and never fail the merge. */
+    ret = 0;
+
+out:
+    return ret;
+}
+
+/* Append the "output_<n>" strings of src to dst.
+ *
+ * src entries 1.."output_count" are duplicated and stored in dst right after
+ * dst's existing entries, then dst's "output_count" is updated to the sum.
+ * Returns 0 when either dict is NULL (an error is logged) or when src has no
+ * "output_count"; otherwise the result of the last dict operation, or -1 if
+ * a key could not be formatted.
+ */
+int
+glusterd_sys_exec_output_rsp_dict(dict_t *dst, dict_t *src)
+{
+    char name[64] = "";
+    char *text = NULL;
+    int ret = 0;
+    int idx = 1;
+    int len;
+    int src_total = 0;
+    int dst_total = 0;
+
+    if (!dst || !src) {
+        gf_msg("glusterd", GF_LOG_ERROR, 0, GD_MSG_DICT_EMPTY,
+               "Source or Destination dict is empty.");
+        goto out;
+    }
+
+    /* A missing count in dst simply means nothing was collected yet. */
+    (void)dict_get_int32n(dst, "output_count", SLEN("output_count"),
+                          &dst_total);
+
+    ret = dict_get_int32n(src, "output_count", SLEN("output_count"),
+                          &src_total);
+    if (ret) {
+        gf_msg_debug("glusterd", 0, "No output from source");
+        ret = 0;
+        goto out;
+    }
+
+    while (idx <= src_total) {
+        /* key of the entry to read from src */
+        len = snprintf(name, sizeof(name), "output_%d", idx);
+        if (!(len > 0 && len < sizeof(name))) {
+            ret = -1;
+            goto out;
+        }
+        ret = dict_get_strn(src, name, len, &text);
+        if (ret) {
+            gf_msg("glusterd", GF_LOG_ERROR, 0, GD_MSG_DICT_GET_FAILED,
+                   "Unable to fetch %s", name);
+            goto out;
+        }
+
+        /* key of the slot to fill in dst, shifted past existing entries */
+        len = snprintf(name, sizeof(name), "output_%d", idx + dst_total);
+        if (!(len > 0 && len < sizeof(name))) {
+            ret = -1;
+            goto out;
+        }
+
+        ret = dict_set_dynstrn(dst, name, len, gf_strdup(text));
+        if (ret) {
+            gf_msg("glusterd", GF_LOG_ERROR, 0, GD_MSG_DICT_SET_FAILED,
+                   "Unable to set %s", name);
+            goto out;
+        }
+
+        idx++;
+    }
+
+    ret = dict_set_int32n(dst, "output_count", SLEN("output_count"),
+                          dst_total + src_total);
+out:
+    gf_msg_debug("glusterd", 0, "Returning %d", ret);
+    return ret;
+}
+
+/* Copy every entry of rsp_dict into aggr.
+ *
+ * Both arguments are asserted; a NULL aggr is tolerated and turns the call
+ * into a no-op. Always returns 0.
+ */
+int
+glusterd_use_rsp_dict(dict_t *aggr, dict_t *rsp_dict)
+{
+    GF_ASSERT(aggr);
+    GF_ASSERT(rsp_dict);
+
+    if (aggr)
+        dict_copy(rsp_dict, aggr);
+
+    return 0;
+}
+
+/* Copy a heal response (rsp_dict) into the operation context.
+ *
+ * The transaction id is read from aggr and its op-info looked up; the
+ * operation recorded there is asserted to be GD_OP_HEAL_VOLUME. The entries
+ * of rsp_dict are then copied into aggr, or into the transaction's op_ctx
+ * when aggr is NULL.
+ *
+ * Returns 0 on success (including when no target dict is available), or the
+ * non-zero result of the failing lookup.
+ */
+int
+glusterd_volume_heal_use_rsp_dict(dict_t *aggr, dict_t *rsp_dict)
+{
+    glusterd_op_info_t txn_op_info = {
+        {0},
+    };
+    glusterd_op_t op = GD_OP_NONE;
+    uuid_t *txn_id = NULL;
+    dict_t *target = NULL;
+    int ret = 0;
+
+    GF_ASSERT(rsp_dict);
+
+    ret = dict_get_bin(aggr, "transaction_id", (void **)&txn_id);
+    if (ret)
+        return ret;
+    gf_msg_debug(THIS->name, 0, "transaction ID = %s", uuid_utoa(*txn_id));
+
+    ret = glusterd_get_txn_opinfo(txn_id, &txn_op_info);
+    if (ret) {
+        gf_msg_callingfn(THIS->name, GF_LOG_ERROR, 0,
+                         GD_MSG_TRANS_OPINFO_GET_FAIL,
+                         "Unable to get transaction opinfo "
+                         "for transaction ID : %s",
+                         uuid_utoa(*txn_id));
+        return ret;
+    }
+
+    op = txn_op_info.op;
+    GF_ASSERT(GD_OP_HEAL_VOLUME == op);
+
+    /* Prefer the caller's aggregate; fall back to the transaction context. */
+    target = aggr ? aggr : txn_op_info.op_ctx;
+    if (target)
+        dict_copy(rsp_dict, target);
+
+    return ret;
+}
+
+/* dict_foreach() callback: store a copy of @value in the context's dict
+ * under "<count>-<key>".
+ *
+ * @this:  dictionary being iterated (unused).
+ * @key:   key of the current entry.
+ * @value: value of the current entry.
+ * @data:  glusterd_pr_brick_rsp_conv_t supplying the target dict and count.
+ *
+ * Always returns 0 so that iteration continues.
+ */
+int
+_profile_volume_add_brick_rsp(dict_t *this, char *key, data_t *value,
+                              void *data)
+{
+    char new_key[256] = "";
+    int keylen;
+    glusterd_pr_brick_rsp_conv_t *rsp_ctx = NULL;
+    data_t *new_value = NULL;
+
+    rsp_ctx = data;
+
+    /* Build the key first: on truncation snprintf() returns the length the
+     * full string would have had, which must not be handed to dict_setn()
+     * as the length of new_key. Nothing has been allocated yet at this
+     * point, so the entry can simply be skipped. */
+    keylen = snprintf(new_key, sizeof(new_key), "%d-%s", rsp_ctx->count, key);
+    if (keylen <= 0 || keylen >= sizeof(new_key)) {
+        gf_msg_debug(THIS->name, 0, "Skipping over-long key %s", key);
+        return 0;
+    }
+
+    new_value = data_copy(value);
+    GF_ASSERT(new_value);
+    dict_setn(rsp_ctx->dict, new_key, keylen, new_value);
+    return 0;
+}
+
+/* Merge a quota response (rsp_dict) into the request/op-ctx dict.
+ *
+ * For every quota operation other than limit-usage, limit-objects, remove
+ * and remove-objects the response is copied wholesale. For those four, the
+ * "gfid<n>" strings of rsp_dict are appended after the gfids already present
+ * in dict and dict's "count" is updated to the combined total.
+ *
+ * Returns 0 on success, -1 on allocation failure, or the non-zero result of
+ * the failing dict operation.
+ */
+int
+glusterd_volume_quota_copy_to_op_ctx_dict(dict_t *dict, dict_t *rsp_dict)
+{
+    xlator_t *this = THIS;
+    char gfid_key[64] = "";
+    char *gfid = NULL;
+    char *gfid_copy = NULL;
+    int gfid_keylen;
+    int ret = -1;
+    int idx = 0;
+    int have = 0;
+    int incoming = 0;
+    int type = GF_QUOTA_OPTION_TYPE_NONE;
+
+    GF_ASSERT(this);
+
+    ret = dict_get_int32n(dict, "type", SLEN("type"), &type);
+    if (ret) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_DICT_GET_FAILED,
+               "Failed to get quota opcode");
+        goto out;
+    }
+
+    switch (type) {
+        case GF_QUOTA_OPTION_TYPE_LIMIT_USAGE:
+        case GF_QUOTA_OPTION_TYPE_LIMIT_OBJECTS:
+        case GF_QUOTA_OPTION_TYPE_REMOVE:
+        case GF_QUOTA_OPTION_TYPE_REMOVE_OBJECTS:
+            /* these carry gfid lists that must be appended, see below */
+            break;
+        default:
+            dict_copy(rsp_dict, dict);
+            ret = 0;
+            goto out;
+    }
+
+    ret = dict_get_int32n(rsp_dict, "count", SLEN("count"), &incoming);
+    if (ret) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_DICT_GET_FAILED,
+               "Failed to get the count of gfids from the rsp dict");
+        goto out;
+    }
+
+    ret = dict_get_int32n(dict, "count", SLEN("count"), &have);
+    if (ret) {
+        /* The key "count" is absent in op_ctx when this function is
+         * called after self-staging on the originator. This must not
+         * be treated as error.
+         */
+        gf_msg_debug(this->name, 0,
+                     "Failed to get count of gfids from req dict. This could "
+                     "be because count is not yet copied from rsp_dict into "
+                     "op_ctx");
+    }
+
+    while (idx < incoming) {
+        gfid_keylen = snprintf(gfid_key, sizeof(gfid_key), "gfid%d", idx);
+        ret = dict_get_strn(rsp_dict, gfid_key, gfid_keylen, &gfid);
+        if (ret) {
+            gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_DICT_GET_FAILED,
+                   "Failed to get gfid from rsp dict");
+            goto out;
+        }
+
+        gfid_copy = gf_strdup(gfid);
+        if (gfid_copy == NULL) {
+            ret = -1;
+            goto out;
+        }
+
+        /* destination slot comes after the gfids already in dict */
+        gfid_keylen = snprintf(gfid_key, sizeof(gfid_key), "gfid%d",
+                               idx + have);
+        ret = dict_set_dynstrn(dict, gfid_key, gfid_keylen, gfid_copy);
+        if (ret) {
+            gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_DICT_SET_FAILED,
+                   "Failed to set gfid from rsp dict into req dict");
+            GF_FREE(gfid_copy);
+            goto out;
+        }
+
+        idx++;
+    }
+
+    ret = dict_set_int32n(dict, "count", SLEN("count"), incoming + have);
+    if (ret) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_DICT_SET_FAILED,
+               "Failed to set aggregated count in req dict");
+        goto out;
+    }
+
+out:
+    return ret;
+}
+
+/* Fold one brick's (or the NFS server's) profile response into op_ctx.
+ *
+ * The next slot number is op_ctx's "count" + 1 (or 1 when "count" is not
+ * set). The node is recorded as "<slot>-brick" - "host:path" for a brick,
+ * this node's uuid for NFS, an empty string for any other type - and every
+ * entry of rsp_dict is copied into op_ctx as "<slot>-<key>". Finally
+ * op_ctx's "count" is set to the slot number.
+ *
+ * @pending_entry: glusterd_brickinfo_t when @type is GD_NODE_BRICK.
+ * @op_errstr:     asserted non-NULL but not written here.
+ *
+ * Returns the result of storing the updated "count".
+ */
+int
+glusterd_profile_volume_brick_rsp(void *pending_entry, dict_t *rsp_dict,
+                                  dict_t *op_ctx, char **op_errstr,
+                                  gd_node_type type)
+{
+    int ret = 0;
+    glusterd_pr_brick_rsp_conv_t rsp_ctx = {0};
+    int32_t count = 0;
+    char brick[PATH_MAX + 1024] = "";
+    char key[64] = "";
+    int keylen;
+    char *full_brick = NULL;
+    glusterd_brickinfo_t *brickinfo = NULL;
+    xlator_t *this = NULL;
+    glusterd_conf_t *priv = NULL;
+
+    GF_ASSERT(rsp_dict);
+    GF_ASSERT(op_ctx);
+    GF_ASSERT(op_errstr);
+    GF_ASSERT(pending_entry);
+
+    this = THIS;
+    GF_ASSERT(this);
+    priv = this->private;
+    GF_ASSERT(priv);
+
+    ret = dict_get_int32n(op_ctx, "count", SLEN("count"), &count);
+    if (ret) {
+        count = 1;
+    } else {
+        count++;
+    }
+    if (type == GD_NODE_BRICK) {
+        brickinfo = pending_entry;
+        snprintf(brick, sizeof(brick), "%s:%s", brickinfo->hostname,
+                 brickinfo->path);
+    } else if (type == GD_NODE_NFS) {
+        snprintf(brick, sizeof(brick), "%s", uuid_utoa(MY_UUID));
+    }
+    full_brick = gf_strdup(brick);
+    GF_ASSERT(full_brick);
+    keylen = snprintf(key, sizeof(key), "%d-brick", count);
+    ret = dict_set_dynstrn(op_ctx, key, keylen, full_brick);
+    if (ret) {
+        /* Previously this result was silently overwritten below. Keep the
+         * best-effort behaviour, but leave a trace that the brick name is
+         * missing from the aggregated output. */
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_DICT_SET_FAILED,
+               "Failed to set %s", key);
+    }
+
+    rsp_ctx.count = count;
+    rsp_ctx.dict = op_ctx;
+    dict_foreach(rsp_dict, _profile_volume_add_brick_rsp, &rsp_ctx);
+    ret = dict_set_int32n(op_ctx, "count", SLEN("count"), count);
+    return ret;
+}
+
+// input-key: <replica-id>-<child-id>-*
+// output-key: <brick-id>-*
+/* dict_foreach() callback: re-key one self-heal response entry.
+ *
+ * The two leading '-'-separated numbers of @key are parsed and combined
+ * into brick_id = replica-id * replica_count + child-id; a copy of @value
+ * is stored in the context's dict under "<brick-id><rest-of-key>".
+ * Keys that do not match the expected shape are ignored. "-status" entries
+ * are only forwarded for bricks that are local to this node.
+ *
+ * @data: glusterd_heal_rsp_conv_t with the target dict, volinfo and xlator.
+ *
+ * Always returns 0 so that iteration continues.
+ */
+int
+_heal_volume_add_shd_rsp(dict_t *this, char *key, data_t *value, void *data)
+{
+    char new_key[256] = "";
+    char int_str[16] = "";
+    data_t *new_value = NULL;
+    char *rxl_end = NULL;
+    char *rxl_child_end = NULL;
+    glusterd_volinfo_t *volinfo = NULL;
+    int rxl_id = 0;
+    int rxl_child_id = 0;
+    int brick_id = 0;
+    size_t int_len = 0;
+    int keylen = 0;
+    int ret = 0;
+    glusterd_heal_rsp_conv_t *rsp_ctx = NULL;
+    glusterd_brickinfo_t *brickinfo = NULL;
+
+    rsp_ctx = data;
+    rxl_end = strchr(key, '-');
+    if (!rxl_end)
+        goto out;
+
+    rxl_child_end = strchr(rxl_end + 1, '-');
+    if (!rxl_child_end)
+        goto out;
+
+    /* First number: everything before the first '-'. The length comes from
+     * the key itself, so it must be bounded before copying into int_str. */
+    int_len = rxl_end - key;
+    if (int_len >= sizeof(int_str))
+        goto out;
+    (void)memcpy(int_str, key, int_len);
+    int_str[int_len] = '\0';
+
+    ret = gf_string2int(int_str, &rxl_id);
+    if (ret)
+        goto out;
+
+    /* Second number: between the first and the second '-'. */
+    int_len = rxl_child_end - (rxl_end + 1);
+    if (int_len >= sizeof(int_str))
+        goto out;
+    (void)memcpy(int_str, rxl_end + 1, int_len);
+    int_str[int_len] = '\0';
+
+    ret = gf_string2int(int_str, &rxl_child_id);
+    if (ret)
+        goto out;
+
+    volinfo = rsp_ctx->volinfo;
+    brick_id = rxl_id * volinfo->replica_count + rxl_child_id;
+
+    if (!strcmp(rxl_child_end, "-status")) {
+        brickinfo = glusterd_get_brickinfo_by_position(volinfo, brick_id);
+        if (!brickinfo)
+            goto out;
+        if (!glusterd_is_local_brick(rsp_ctx->this, volinfo, brickinfo))
+            goto out;
+    }
+
+    /* Format the key before copying the value so that a truncated key can
+     * be dropped without leaving an orphaned copy behind. */
+    keylen = snprintf(new_key, sizeof(new_key), "%d%s", brick_id,
+                      rxl_child_end);
+    if (keylen <= 0 || keylen >= sizeof(new_key))
+        goto out;
+
+    new_value = data_copy(value);
+    dict_setn(rsp_ctx->dict, new_key, keylen, new_value);
+
+out:
+    return 0;
+}
+
+/* dict_foreach() callback: re-key one self-heal *statistics* entry.
+ *
+ * Expected key shape, as parsed below: <prefix>-<replica-id>-<child-id>-*.
+ * brick_id = replica-id * replica_count + child-id is computed, and for
+ * bricks local to this node a copy of @value is stored in the context's
+ * dict under "<prefix>-<brick-id><rest-of-key>". Keys that do not match
+ * the shape, or refer to unknown/non-local bricks, are ignored.
+ *
+ * @data: glusterd_heal_rsp_conv_t with the target dict, volinfo and xlator.
+ *
+ * Always returns 0 so that iteration continues.
+ */
+int
+_heal_volume_add_shd_rsp_of_statistics(dict_t *this, char *key, data_t *value,
+                                       void *data)
+{
+    char new_key[256] = "";
+    char int_str[16] = "";
+    char key_begin_string[128] = "";
+    data_t *new_value = NULL;
+    char *rxl_end = NULL;
+    char *rxl_child_end = NULL;
+    glusterd_volinfo_t *volinfo = NULL;
+    char *key_begin_str = NULL;
+    int rxl_id = 0;
+    int rxl_child_id = 0;
+    int brick_id = 0;
+    size_t int_len = 0;
+    int keylen = 0;
+    int ret = 0;
+    glusterd_heal_rsp_conv_t *rsp_ctx = NULL;
+    glusterd_brickinfo_t *brickinfo = NULL;
+
+    rsp_ctx = data;
+    key_begin_str = strchr(key, '-');
+    if (!key_begin_str)
+        goto out;
+
+    rxl_end = strchr(key_begin_str + 1, '-');
+    if (!rxl_end)
+        goto out;
+
+    rxl_child_end = strchr(rxl_end + 1, '-');
+    if (!rxl_child_end)
+        goto out;
+
+    /* Prefix: everything before the first '-'. All lengths below come from
+     * the key itself and are bounded before copying into fixed buffers. */
+    int_len = key_begin_str - key;
+    if (int_len >= sizeof(key_begin_string))
+        goto out;
+    (void)memcpy(key_begin_string, key, int_len);
+    key_begin_string[int_len] = '\0';
+
+    /* Replica id: between the first and the second '-'. */
+    int_len = rxl_end - (key_begin_str + 1);
+    if (int_len >= sizeof(int_str))
+        goto out;
+    (void)memcpy(int_str, key_begin_str + 1, int_len);
+    int_str[int_len] = '\0';
+    ret = gf_string2int(int_str, &rxl_id);
+    if (ret)
+        goto out;
+
+    /* Child id: between the second and the third '-'. */
+    int_len = rxl_child_end - (rxl_end + 1);
+    if (int_len >= sizeof(int_str))
+        goto out;
+    (void)memcpy(int_str, rxl_end + 1, int_len);
+    int_str[int_len] = '\0';
+    ret = gf_string2int(int_str, &rxl_child_id);
+    if (ret)
+        goto out;
+
+    volinfo = rsp_ctx->volinfo;
+    brick_id = rxl_id * volinfo->replica_count + rxl_child_id;
+
+    brickinfo = glusterd_get_brickinfo_by_position(volinfo, brick_id);
+    if (!brickinfo)
+        goto out;
+    if (!glusterd_is_local_brick(rsp_ctx->this, volinfo, brickinfo))
+        goto out;
+
+    /* Format the key before copying the value so that a truncated key can
+     * be dropped without leaving an orphaned copy behind. */
+    keylen = snprintf(new_key, sizeof(new_key), "%s-%d%s", key_begin_string,
+                      brick_id, rxl_child_end);
+    if (keylen <= 0 || keylen >= sizeof(new_key))
+        goto out;
+
+    new_value = data_copy(value);
+    dict_setn(rsp_ctx->dict, new_key, keylen, new_value);
+
+out:
+    return 0;
+}
+
+/* Translate a self-heal daemon response into op_ctx.
+ *
+ * "volname" and "heal-op" are read from req_dict and the volume is looked
+ * up; every entry of rsp_dict is then passed through the statistics
+ * callback when heal-op is GF_SHD_OP_STATISTICS, or the generic heal
+ * callback otherwise, which store re-keyed copies in op_ctx.
+ *
+ * @op_errstr: asserted non-NULL but not written here.
+ *
+ * Returns 0 on success or the non-zero result of the failing lookup.
+ */
+int
+glusterd_heal_volume_brick_rsp(dict_t *req_dict, dict_t *rsp_dict,
+                               dict_t *op_ctx, char **op_errstr)
+{
+    glusterd_heal_rsp_conv_t conv = {0};
+    glusterd_volinfo_t *vol = NULL;
+    char *name = NULL;
+    int op = -1;
+    int ret = 0;
+
+    GF_ASSERT(rsp_dict);
+    GF_ASSERT(op_ctx);
+    GF_ASSERT(op_errstr);
+
+    ret = dict_get_strn(req_dict, "volname", SLEN("volname"), &name);
+    if (ret) {
+        gf_msg("glusterd", GF_LOG_ERROR, 0, GD_MSG_DICT_GET_FAILED,
+               "Unable to get volume name");
+        return ret;
+    }
+
+    ret = dict_get_int32n(req_dict, "heal-op", SLEN("heal-op"), &op);
+    if (ret) {
+        gf_msg("glusterd", GF_LOG_ERROR, 0, GD_MSG_DICT_GET_FAILED,
+               "Unable to get heal_op");
+        return ret;
+    }
+
+    ret = glusterd_volinfo_find(name, &vol);
+    if (ret)
+        return ret;
+
+    conv.dict = op_ctx;
+    conv.volinfo = vol;
+    conv.this = THIS;
+
+    /* statistics keys carry an extra leading component, hence the
+     * dedicated callback */
+    dict_foreach(rsp_dict,
+                 (op == GF_SHD_OP_STATISTICS)
+                     ? _heal_volume_add_shd_rsp_of_statistics
+                     : _heal_volume_add_shd_rsp,
+                 &conv);
+
+    return ret;
+}
+
+/* dict_foreach() callback: store a copy of @value in the context's dict
+ * under "brick<count>.<key>".
+ *
+ * @this:  dictionary being iterated (unused).
+ * @key:   key of the current entry.
+ * @value: value of the current entry.
+ * @data:  glusterd_pr_brick_rsp_conv_t supplying the target dict and count.
+ *
+ * Always returns 0 so that iteration continues.
+ */
+int
+_status_volume_add_brick_rsp(dict_t *this, char *key, data_t *value, void *data)
+{
+    char new_key[256] = "";
+    int keylen;
+    data_t *new_value = 0;
+    glusterd_pr_brick_rsp_conv_t *rsp_ctx = NULL;
+
+    rsp_ctx = data;
+
+    /* Build the key first: on truncation snprintf() returns the length the
+     * full string would have had, which must not be handed to dict_setn()
+     * as the length of new_key. Nothing has been allocated yet at this
+     * point, so the entry can simply be skipped. */
+    keylen = snprintf(new_key, sizeof(new_key), "brick%d.%s", rsp_ctx->count,
+                      key);
+    if (keylen <= 0 || keylen >= sizeof(new_key)) {
+        gf_msg_debug(THIS->name, 0, "Skipping over-long key %s", key);
+        return 0;
+    }
+
+    new_value = data_copy(value);
+    dict_setn(rsp_ctx->dict, new_key, keylen, new_value);
+
+    return 0;
+}
+
+/* Fold one node's status response into op_ctx.
+ *
+ * The response's "index" entry selects the "brick<index>." prefix under
+ * which every remaining entry of rsp_dict is copied into op_ctx; "index"
+ * itself is removed from rsp_dict first. op_ctx's "count" is set to 0 when
+ * it was absent, otherwise to its previous value plus one.
+ *
+ * @op_errstr: asserted non-NULL but not written here.
+ *
+ * Returns the result of storing "count", or the lookup error when
+ * rsp_dict has no "index".
+ */
+int
+glusterd_status_volume_brick_rsp(dict_t *rsp_dict, dict_t *op_ctx,
+                                 char **op_errstr)
+{
+    glusterd_pr_brick_rsp_conv_t conv = {0};
+    int32_t total = 0;
+    int node_index = 0;
+    int ret = 0;
+
+    GF_ASSERT(rsp_dict);
+    GF_ASSERT(op_ctx);
+    GF_ASSERT(op_errstr);
+
+    ret = dict_get_int32n(op_ctx, "count", SLEN("count"), &total);
+    total = ret ? 0 : total + 1;
+
+    ret = dict_get_int32n(rsp_dict, "index", SLEN("index"), &node_index);
+    if (ret) {
+        gf_msg(THIS->name, GF_LOG_ERROR, 0, GD_MSG_DICT_GET_FAILED,
+               "Couldn't get node index");
+        return ret;
+    }
+
+    /* drop "index" so the callback does not copy it along with the data */
+    dict_deln(rsp_dict, "index", SLEN("index"));
+
+    conv.dict = op_ctx;
+    conv.count = node_index;
+    dict_foreach(rsp_dict, _status_volume_add_brick_rsp, &conv);
+
+    return dict_set_int32n(op_ctx, "count", SLEN("count"), total);
+}
+
+/* Copy the client list of a status response into op_ctx and record how many
+ * clients of each known process type the response contains.
+ *
+ * "clientcount" from rsp_dict is stored in op_ctx as "client-count". Each
+ * "client<i>.name" is added to op_ctx under the same key and classified:
+ * names starting with "fuse" or "gfapi", or equal to "rebalance",
+ * "glustershd", "quotad" or "snapd". For every type seen at least once,
+ * "<type>-count" in op_ctx is set to the number seen in this response.
+ *
+ * NOTE(review): the value already stored under "<type>-count" is read but
+ * never added in, so an existing total is overwritten rather than
+ * accumulated - confirm whether that is intended.
+ *
+ * @op_errstr: asserted non-NULL but not written here.
+ *
+ * Returns the result of the last dict operation performed; a missing client
+ * name or a failed count store ends processing early.
+ */
+int
+glusterd_status_volume_client_list(dict_t *rsp_dict, dict_t *op_ctx,
+                                   char **op_errstr)
+{
+    /* One row per recognised process type, in matching order. cmplen is the
+     * number of bytes handed to strncmp(): the bare prefix length for
+     * prefix matches, the length including the terminating NUL for exact
+     * matches. */
+    struct {
+        char *name;
+        size_t cmplen;
+        char *key;
+        int keylen;
+        int32_t seen;
+    } kinds[] = {
+        {"fuse", 4, "fuse-count", SLEN("fuse-count"), 0},
+        {"gfapi", 5, "gfapi-count", SLEN("gfapi-count"), 0},
+        {"rebalance", sizeof("rebalance"), "rebalance-count",
+         SLEN("rebalance-count"), 0},
+        {"glustershd", sizeof("glustershd"), "glustershd-count",
+         SLEN("glustershd-count"), 0},
+        {"quotad", sizeof("quotad"), "quotad-count", SLEN("quotad-count"), 0},
+        {"snapd", sizeof("snapd"), "snapd-count", SLEN("snapd-count"), 0},
+    };
+    const int nkinds = sizeof(kinds) / sizeof(kinds[0]);
+    int ret = 0;
+    char *process = 0;
+    int32_t count = 0;
+    int32_t client_count = 0;
+    int i = 0;
+    int k = 0;
+    char key[64] = "";
+
+    GF_ASSERT(rsp_dict);
+    GF_ASSERT(op_ctx);
+    GF_ASSERT(op_errstr);
+
+    ret = dict_get_int32n(rsp_dict, "clientcount", SLEN("clientcount"),
+                          &client_count);
+    if (ret) {
+        gf_msg(THIS->name, GF_LOG_INFO, 0, GD_MSG_DICT_GET_FAILED,
+               "Couldn't get client count");
+    }
+    ret = dict_set_int32n(op_ctx, "client-count", SLEN("client-count"),
+                          client_count);
+    if (ret) {
+        gf_msg(THIS->name, GF_LOG_ERROR, 0, GD_MSG_DICT_SET_FAILED,
+               "Couldn't set client count");
+        goto out;
+    }
+
+    for (i = 0; i < client_count; i++) {
+        count = 0;
+        ret = snprintf(key, sizeof(key), "client%d.name", i);
+        ret = dict_get_strn(rsp_dict, key, ret, &process);
+        if (ret) {
+            gf_msg(THIS->name, GF_LOG_INFO, 0, GD_MSG_DICT_GET_FAILED,
+                   "Couldn't get client name");
+            goto out;
+        }
+        ret = dict_add_dynstr_with_alloc(op_ctx, key, process);
+        if (ret) {
+            gf_msg(THIS->name, GF_LOG_INFO, 0, GD_MSG_DICT_SET_FAILED,
+                   "Couldn't set client name");
+        }
+
+        /* first matching row wins; unknown names are not counted */
+        for (k = 0; k < nkinds; k++) {
+            if (strncmp(process, kinds[k].name, kinds[k].cmplen))
+                continue;
+
+            ret = dict_get_int32n(op_ctx, kinds[k].key, kinds[k].keylen,
+                                  &count);
+            if (ret) {
+                gf_msg(THIS->name, GF_LOG_INFO, 0, GD_MSG_DICT_GET_FAILED,
+                       "Couldn't get %s", kinds[k].key);
+            }
+            kinds[k].seen++;
+            break;
+        }
+    }
+
+    for (k = 0; k < nkinds; k++) {
+        if (!kinds[k].seen)
+            continue;
+
+        ret = dict_set_int32n(op_ctx, kinds[k].key, kinds[k].keylen,
+                              kinds[k].seen);
+        if (ret) {
+            gf_msg(THIS->name, GF_LOG_ERROR, 0, GD_MSG_DICT_SET_FAILED,
+                   "Couldn't set %s", kinds[k].key);
+            goto out;
+        }
+    }
+
+out:
+    return ret;
+}
+
+/* Record the rebalance statistics held in @index into @op_ctx, suffixing every
+ * key with the node index @i ("files-<i>", "size-<i>", "lookups-<i>",
+ * "status-<i>", "failures-<i>", "skipped-<i>", "run-time-<i>").
+ *
+ * Every value is attempted even when an earlier one fails; each failure is
+ * logged. The value returned is the result of the final ("run-time-<i>") set
+ * only.
+ */
+int
+glusterd_rebalance_rsp(dict_t *op_ctx, glusterd_rebalance_t *index, int32_t i)
+{
+    char name[64] = "";
+    int namelen;
+    int rc = 0;
+
+    snprintf(name, sizeof(name), "files-%d", i);
+    rc = dict_set_uint64(op_ctx, name, index->rebalance_files);
+    if (rc) {
+        gf_msg(THIS->name, GF_LOG_ERROR, 0, GD_MSG_DICT_SET_FAILED,
+               "failed to set file count");
+    }
+
+    snprintf(name, sizeof(name), "size-%d", i);
+    rc = dict_set_uint64(op_ctx, name, index->rebalance_data);
+    if (rc) {
+        gf_msg(THIS->name, GF_LOG_ERROR, 0, GD_MSG_DICT_SET_FAILED,
+               "failed to set size of xfer");
+    }
+
+    snprintf(name, sizeof(name), "lookups-%d", i);
+    rc = dict_set_uint64(op_ctx, name, index->lookedup_files);
+    if (rc) {
+        gf_msg(THIS->name, GF_LOG_ERROR, 0, GD_MSG_DICT_SET_FAILED,
+               "failed to set lookedup file count");
+    }
+
+    /* The status is the only 32-bit value; it uses the length-aware setter. */
+    namelen = snprintf(name, sizeof(name), "status-%d", i);
+    rc = dict_set_int32n(op_ctx, name, namelen, index->defrag_status);
+    if (rc) {
+        gf_msg(THIS->name, GF_LOG_ERROR, 0, GD_MSG_DICT_SET_FAILED,
+               "failed to set status");
+    }
+
+    snprintf(name, sizeof(name), "failures-%d", i);
+    rc = dict_set_uint64(op_ctx, name, index->rebalance_failures);
+    if (rc) {
+        gf_msg(THIS->name, GF_LOG_ERROR, 0, GD_MSG_DICT_SET_FAILED,
+               "failed to set failure count");
+    }
+
+    snprintf(name, sizeof(name), "skipped-%d", i);
+    rc = dict_set_uint64(op_ctx, name, index->skipped_files);
+    if (rc) {
+        gf_msg(THIS->name, GF_LOG_ERROR, 0, GD_MSG_DICT_SET_FAILED,
+               "failed to set skipped count");
+    }
+
+    snprintf(name, sizeof(name), "run-time-%d", i);
+    rc = dict_set_double(op_ctx, name, index->rebalance_time);
+    if (rc) {
+        gf_msg(THIS->name, GF_LOG_ERROR, 0, GD_MSG_DICT_SET_FAILED,
+               "failed to set run-time");
+    }
+
+    return rc;
+}
+
+/* Fold one node's rebalance (defrag) response into the aggregate @op_ctx.
+ *
+ * @req_dict supplies "volname" and "rebalance-command". When @rsp_dict is
+ * non-NULL the volume's rebalance status is first refreshed from it. The
+ * "count" entry of @op_ctx is then incremented and this node's uuid, rebalance
+ * counters and "time-left-<count>" are stored under that new index.
+ *
+ * Returns 0 on success and non-zero if the volume name, the volume itself or
+ * the command cannot be obtained, or if memory for the node uuid string cannot
+ * be allocated. Failures of the individual dict sets are logged; apart from
+ * that, the return value reflects only the final "time-left" set.
+ */
+int
+glusterd_defrag_volume_node_rsp(dict_t *req_dict, dict_t *rsp_dict,
+                                dict_t *op_ctx)
+{
+    int ret = 0;
+    char *volname = NULL;
+    glusterd_volinfo_t *volinfo = NULL;
+    char key[64] = "";
+    int keylen;
+    int32_t i = 0;
+    char *node_str = NULL;
+    int32_t cmd = 0;
+
+    GF_ASSERT(req_dict);
+
+    ret = dict_get_strn(req_dict, "volname", SLEN("volname"), &volname);
+    if (ret) {
+        gf_msg("glusterd", GF_LOG_ERROR, 0, GD_MSG_DICT_GET_FAILED,
+               "Unable to get volume name");
+        goto out;
+    }
+
+    /* volinfo is dereferenced below, so a failed lookup must stop here
+     * instead of being silently overwritten by the next dict_get. */
+    ret = glusterd_volinfo_find(volname, &volinfo);
+    if (ret || !volinfo) {
+        gf_msg(THIS->name, GF_LOG_ERROR, 0, GD_MSG_VOL_NOT_FOUND,
+               "Unable to find volume %s", volname);
+        ret = -1;
+        goto out;
+    }
+
+    ret = dict_get_int32n(req_dict, "rebalance-command",
+                          SLEN("rebalance-command"), &cmd);
+    if (ret) {
+        gf_msg(THIS->name, GF_LOG_ERROR, errno, GD_MSG_DICT_GET_FAILED,
+               "Unable to get the cmd");
+        goto out;
+    }
+
+    if (rsp_dict) {
+        ret = glusterd_defrag_volume_status_update(volinfo, rsp_dict, cmd);
+    }
+
+    if (!op_ctx) {
+        /* NOTE(review): op_ctx is NULL here, so nothing visible in this
+         * function receives the copy - confirm this call is intended. */
+        dict_copy(rsp_dict, op_ctx);
+        goto out;
+    }
+
+    /* A missing "count" leaves i at 0, so the first node gets index 1. */
+    ret = dict_get_int32n(op_ctx, "count", SLEN("count"), &i);
+    i++;
+
+    ret = dict_set_int32n(op_ctx, "count", SLEN("count"), i);
+    if (ret)
+        gf_msg(THIS->name, GF_LOG_ERROR, 0, GD_MSG_DICT_SET_FAILED,
+               "Failed to set count");
+
+    node_str = gf_strdup(uuid_utoa(MY_UUID));
+    if (!node_str) {
+        ret = -1;
+        goto out;
+    }
+
+    keylen = snprintf(key, sizeof(key), "node-uuid-%d", i);
+    ret = dict_set_dynstrn(op_ctx, key, keylen, node_str);
+    if (ret) {
+        gf_msg(THIS->name, GF_LOG_ERROR, 0, GD_MSG_DICT_SET_FAILED,
+               "failed to set node-uuid");
+        /* The dict did not take the string; release it as the other
+         * dict_set_dynstrn callers in this file do. */
+        GF_FREE(node_str);
+        node_str = NULL;
+    }
+
+    glusterd_rebalance_rsp(op_ctx, &volinfo->rebal, i);
+
+    snprintf(key, sizeof(key), "time-left-%d", i);
+    ret = dict_set_uint64(op_ctx, key, volinfo->rebal.time_left);
+    if (ret)
+        gf_msg(THIS->name, GF_LOG_ERROR, errno, GD_MSG_DICT_SET_FAILED,
+               "failed to set time left");
+
+out:
+    return ret;
+}
+/* Dispatch a node's response to the aggregation routine matching @op.
+ *
+ * Operations without a dedicated handler leave the result at 0. The result of
+ * the rebalance (defrag) handler is intentionally not propagated here.
+ */
+int32_t
+glusterd_handle_node_rsp(dict_t *req_dict, void *pending_entry,
+                         glusterd_op_t op, dict_t *rsp_dict, dict_t *op_ctx,
+                         char **op_errstr, gd_node_type type)
+{
+    int32_t status_cmd = GF_OP_CMD_NONE;
+    int ret = 0;
+
+    GF_ASSERT(op_errstr);
+
+    switch (op) {
+        case GD_OP_PROFILE_VOLUME:
+            ret = glusterd_profile_volume_brick_rsp(pending_entry, rsp_dict,
+                                                    op_ctx, op_errstr, type);
+            break;
+
+        case GD_OP_STATUS_VOLUME:
+            ret = dict_get_int32n(req_dict, "cmd", SLEN("cmd"), &status_cmd);
+            /* Fall back to the brick response whenever "cmd" is missing or
+             * does not request the client list. */
+            if (ret || !(status_cmd & GF_CLI_STATUS_CLIENT_LIST)) {
+                ret = glusterd_status_volume_brick_rsp(rsp_dict, op_ctx,
+                                                       op_errstr);
+            } else {
+                ret = glusterd_status_volume_client_list(rsp_dict, op_ctx,
+                                                         op_errstr);
+            }
+            break;
+
+        case GD_OP_DEFRAG_BRICK_VOLUME:
+            glusterd_defrag_volume_node_rsp(req_dict, rsp_dict, op_ctx);
+            break;
+
+        case GD_OP_HEAL_VOLUME:
+            ret = glusterd_heal_volume_brick_rsp(req_dict, rsp_dict, op_ctx,
+                                                 op_errstr);
+            break;
+
+        case GD_OP_SCRUB_STATUS:
+            ret = glusterd_bitrot_volume_node_rsp(op_ctx, rsp_dict);
+            break;
+
+        default:
+            break;
+    }
+
+    gf_msg_debug("glusterd", 0, "Returning %d", ret);
+    return ret;
+}
+
+/* Store a heap copy of this node's uuid in @dict under "originator_uuid".
+ *
+ * Returns 0 on success, -1 if the copy cannot be allocated, or the error from
+ * dict_set_bin() (in which case the copy is released again).
+ */
+int32_t
+glusterd_set_originator_uuid(dict_t *dict)
+{
+    uuid_t *uuid_copy = NULL;
+    int rc = -1;
+
+    GF_ASSERT(dict);
+
+    uuid_copy = GF_MALLOC(sizeof(uuid_t), gf_common_mt_uuid_t);
+    if (uuid_copy == NULL)
+        return -1;
+
+    gf_uuid_copy(*uuid_copy, MY_UUID);
+
+    rc = dict_set_bin(dict, "originator_uuid", uuid_copy, sizeof(uuid_t));
+    if (rc != 0) {
+        gf_msg("glusterd", GF_LOG_ERROR, 0, GD_MSG_DICT_SET_FAILED,
+               "Failed to set originator_uuid.");
+        GF_FREE(uuid_copy);
+    }
+
+    return rc;
+}
+
+/* Tell whether this glusterd originated the operation described by @dict.
+ *
+ * Should be used only when an operation is in progress, as that is the only
+ * time a lock_owner is set.
+ */
+gf_boolean_t
+is_origin_glusterd(dict_t *dict)
+{
+    uuid_t *origin = NULL;
+    uuid_t owner = {
+        0,
+    };
+    int rc = 0;
+
+    GF_ASSERT(dict);
+
+    rc = dict_get_bin(dict, "originator_uuid", (void **)&origin);
+    if (rc == 0)
+        return !gf_uuid_compare(MY_UUID, *origin);
+
+    /* No originator_uuid in the dict: the command came from a glusterd
+     * running an older version, so compare against the lock owner instead. */
+    rc = glusterd_get_lock_owner(&owner);
+    if (rc != 0)
+        return _gf_false;
+
+    return !gf_uuid_compare(MY_UUID, owner);
+}
+
+/* Generate a fresh uuid and store its string form in @dict under @key
+ * (@keylen is the length of @key).
+ *
+ * Returns 0 on success, -1 if the string cannot be allocated, or the error
+ * from dict_set_dynstrn() (the string is freed in that case).
+ */
+int
+glusterd_generate_and_set_task_id(dict_t *dict, char *key, const int keylen)
+{
+    xlator_t *this = NULL;
+    char *id_str = NULL;
+    uuid_t new_id = {
+        0,
+    };
+    int rc = -1;
+
+    GF_ASSERT(dict);
+
+    this = THIS;
+    GF_ASSERT(this);
+
+    gf_uuid_generate(new_id);
+    id_str = gf_strdup(uuid_utoa(new_id));
+    if (id_str == NULL)
+        return -1;
+
+    rc = dict_set_dynstrn(dict, key, keylen, id_str);
+    if (rc != 0) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_DICT_SET_FAILED,
+               "Failed to set %s in dict", key);
+        GF_FREE(id_str);
+        return rc;
+    }
+
+    gf_msg(this->name, GF_LOG_INFO, 0, GD_MSG_TASK_ID_INFO,
+           "Generated task-id %s for key %s", id_str, key);
+    return rc;
+}
+
+/* Store the textual form of @uuid in @dict under @key (@keylen is the length
+ * of @key).
+ *
+ * Returns 0 on success, -1 if the string copy cannot be allocated, or the
+ * error from dict_set_dynstrn(). The copy is freed when the set fails.
+ */
+int
+glusterd_copy_uuid_to_dict(uuid_t uuid, dict_t *dict, char *key,
+                           const int keylen)
+{
+    int ret = -1;
+    char tmp_str[40] = "";
+    char *task_id_str = NULL;
+
+    GF_ASSERT(dict);
+    GF_ASSERT(key);
+
+    gf_uuid_unparse(uuid, tmp_str);
+    task_id_str = gf_strdup(tmp_str);
+    if (!task_id_str)
+        return -1;
+
+    ret = dict_set_dynstrn(dict, key, keylen, task_id_str);
+    if (ret) {
+        GF_FREE(task_id_str);
+        gf_msg(THIS->name, GF_LOG_ERROR, 0, GD_MSG_DICT_SET_FAILED,
+               "Error setting uuid in dict with key %s", key);
+    }
+
+    /* Propagate the dict failure; returning 0 unconditionally made callers
+     * believe the uuid had been stored when it had not. */
+    return ret;
+}
+
+/* dict_foreach() callback: raise the op-versions stored in the volinfo passed
+ * as @data so that they cover the option @key.
+ *
+ * Xlator and boolean options only count when their value parses as an enabled
+ * boolean. Always returns 0.
+ */
+static int
+_update_volume_op_versions(dict_t *this, char *key, data_t *value, void *data)
+{
+    struct volopt_map_entry *entry = NULL;
+    glusterd_volinfo_t *volinfo = NULL;
+    gf_boolean_t is_on = _gf_true;
+    int required = 0;
+    int rc = -1;
+
+    GF_ASSERT(data);
+    volinfo = data;
+
+    entry = gd_get_vmep(key);
+    required = glusterd_get_op_version_from_vmep(entry);
+
+    if (gd_is_xlator_option(entry) || gd_is_boolean_option(entry)) {
+        rc = gf_string2boolean(value->data, &is_on);
+        /* An unparsable or disabled value does not raise any version. */
+        if (rc || !is_on)
+            return 0;
+    }
+
+    if (volinfo->op_version < required)
+        volinfo->op_version = required;
+
+    if (gd_is_client_option(entry) && (volinfo->client_op_version < required))
+        volinfo->client_op_version = required;
+
+    return 0;
+}
+
+/* Recompute @volinfo's op_version and client_op_version from scratch: start
+ * at 1, raise them for every option in the volume dict, then apply the
+ * open-behind and disperse special cases.
+ */
+void
+gd_update_volume_op_versions(glusterd_volinfo_t *volinfo)
+{
+    glusterd_conf_t *conf = NULL;
+
+    GF_ASSERT(volinfo);
+
+    conf = THIS->private;
+    GF_ASSERT(conf);
+
+    /* Reset op-versions to minimum */
+    volinfo->client_op_version = 1;
+    volinfo->op_version = 1;
+
+    dict_foreach(volinfo->dict, _update_volume_op_versions, volinfo);
+
+    /* Special case for open-behind
+     * If cluster op-version >= 2 and open-behind hasn't been explicitly
+     * disabled, volume op-versions must be updated to account for it
+     */
+
+    /* TODO: Remove once we have a general way to update automatically
+     * enabled features
+     */
+    if ((conf->op_version >= 2) &&
+        dict_get_str_boolean(volinfo->dict, "performance.open-behind",
+                             _gf_true)) {
+        if (volinfo->op_version < 2)
+            volinfo->op_version = 2;
+        if (volinfo->client_op_version < 2)
+            volinfo->client_op_version = 2;
+    }
+
+    /* Disperse volumes need at least the 3.6.0 op-version on both sides. */
+    if (volinfo->type == GF_CLUSTER_TYPE_DISPERSE) {
+        if (volinfo->op_version < GD_OP_VERSION_3_6_0)
+            volinfo->op_version = GD_OP_VERSION_3_6_0;
+        if (volinfo->client_op_version < GD_OP_VERSION_3_6_0)
+            volinfo->client_op_version = GD_OP_VERSION_3_6_0;
+    }
+}
+
+/* Check that the cluster op-version is at least @min_op_version.
+ *
+ * Returns 0 when it is. Otherwise an explanation is written to @msg (at most
+ * @msglen bytes), logged as an error, and -1 is returned.
+ */
+int
+op_version_check(xlator_t *this, int min_op_version, char *msg, int msglen)
+{
+    glusterd_conf_t *conf = NULL;
+
+    GF_ASSERT(this);
+    GF_ASSERT(msg);
+
+    conf = this->private;
+    if (conf->op_version >= min_op_version)
+        return 0;
+
+    snprintf(msg, msglen,
+             "One or more nodes do not support "
+             "the required op-version. Cluster op-version must "
+             "at least be %d.",
+             min_op_version);
+    gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_UNSUPPORTED_VERSION, "%s",
+           msg);
+    return -1;
+}
+
+/* A task is committed/completed once the task-id for it is cleared.
+ *
+ * Returns false only while a remove-brick operation still has a non-null
+ * rebalance id; true in every other case.
+ */
+gf_boolean_t
+gd_is_remove_brick_committed(glusterd_volinfo_t *volinfo)
+{
+    GF_ASSERT(volinfo);
+
+    if (GD_OP_REMOVE_BRICK != volinfo->rebal.op)
+        return _gf_true;
+
+    if (gf_uuid_is_null(volinfo->rebal.rebalance_id))
+        return _gf_true;
+
+    return _gf_false;
+}
+
+/* Tell whether @op is a volume-status operation whose "cmd" value in @dict
+ * has the GF_CLI_STATUS_TASKS bit set. A missing "cmd" is logged and treated
+ * as "no".
+ */
+gf_boolean_t
+glusterd_is_status_tasks_op(glusterd_op_t op, dict_t *dict)
+{
+    uint32_t status_cmd = GF_CLI_STATUS_NONE;
+
+    if (op != GD_OP_STATUS_VOLUME)
+        return _gf_false;
+
+    if (dict_get_uint32(dict, "cmd", &status_cmd)) {
+        gf_msg(THIS->name, GF_LOG_ERROR, 0, GD_MSG_DICT_GET_FAILED,
+               "Failed to get opcode");
+        return _gf_false;
+    }
+
+    return (status_cmd & GF_CLI_STATUS_TASKS) ? _gf_true : _gf_false;
+}
+
+/* Decide whether this peer has to start the rebalance process for @volinfo.
+ *
+ * A rebalance process is only needed on peers that host an involved brick:
+ *
+ *  - plain rebalance: any brick of the volume that belongs to this peer;
+ *  - rebalance driven by remove-brick: one of the bricks being removed (as
+ *    listed under "brick1".."brick<count>" in the rebalance dict) belongs to
+ *    this peer.
+ *
+ * Any lookup failure while walking the remove-brick list yields false.
+ */
+gf_boolean_t
+gd_should_i_start_rebalance(glusterd_volinfo_t *volinfo)
+{
+    glusterd_brickinfo_t *brick = NULL;
+    char *brickname = NULL;
+    char key[64] = "";
+    int keylen;
+    int nbricks = 0;
+    int idx = 0;
+
+    if (volinfo->rebal.op == GD_OP_REBALANCE) {
+        cds_list_for_each_entry(brick, &volinfo->bricks, brick_list)
+        {
+            if (!gf_uuid_compare(MY_UUID, brick->uuid))
+                return _gf_true;
+        }
+        return _gf_false;
+    }
+
+    if (volinfo->rebal.op != GD_OP_REMOVE_BRICK)
+        return _gf_false;
+
+    if (dict_get_int32n(volinfo->rebal.dict, "count", SLEN("count"),
+                        &nbricks))
+        return _gf_false;
+
+    for (idx = 1; idx <= nbricks; idx++) {
+        keylen = snprintf(key, sizeof(key), "brick%d", idx);
+        if (dict_get_strn(volinfo->rebal.dict, key, keylen, &brickname))
+            return _gf_false;
+
+        if (glusterd_volume_brickinfo_get_by_brick(brickname, volinfo, &brick,
+                                                   _gf_false))
+            return _gf_false;
+
+        if (!gf_uuid_compare(MY_UUID, brick->uuid))
+            return _gf_true;
+    }
+
+    return _gf_false;
+}
+
+/* Return the value of the VKEY_FEATURES_QUOTA boolean option of @volinfo, as
+ * reported by glusterd_volinfo_get_boolean(). */
+int
+glusterd_is_volume_quota_enabled(glusterd_volinfo_t *volinfo)
+{
+    return (glusterd_volinfo_get_boolean(volinfo, VKEY_FEATURES_QUOTA));
+}
+
+/* Return the value of the VKEY_FEATURES_INODE_QUOTA boolean option of
+ * @volinfo, as reported by glusterd_volinfo_get_boolean(). */
+int
+glusterd_is_volume_inode_quota_enabled(glusterd_volinfo_t *volinfo)
+{
+    return (glusterd_volinfo_get_boolean(volinfo, VKEY_FEATURES_INODE_QUOTA));
+}
+
+/* Return the value of the VKEY_FEATURES_BITROT boolean option of @volinfo, as
+ * reported by glusterd_volinfo_get_boolean(). */
+int
+glusterd_is_bitrot_enabled(glusterd_volinfo_t *volinfo)
+{
+    return glusterd_volinfo_get_boolean(volinfo, VKEY_FEATURES_BITROT);
+}
+
+/* For quota limit/remove operations, verify that all gfids collected in
+ * @op_ctx ("gfid0".."gfid<count-1>") are identical and store that gfid in
+ * @req_dict under "gfid".
+ *
+ * Other quota op codes (read from "type" in @op_ctx) return 0 immediately.
+ *
+ * Returns 0 on success, -ENOENT when "count" is 0, and a non-zero value on
+ * dict lookup/set failures, allocation failure or a gfid mismatch. For the
+ * ENOENT and mismatch cases a message is allocated into @op_errstr.
+ */
+int
+glusterd_validate_and_set_gfid(dict_t *op_ctx, dict_t *req_dict,
+                               char **op_errstr)
+{
+    int ret = -1;
+    int count = 0;
+    int i = 0;
+    int op_code = GF_QUOTA_OPTION_TYPE_NONE;
+    uuid_t uuid1 = {0};
+    uuid_t uuid2 = {
+        0,
+    };
+    char *path = NULL;
+    char key[64] = "";
+    int keylen;
+    char *uuid1_str = NULL;
+    char *uuid1_str_dup = NULL;
+    char *uuid2_str = NULL;
+    xlator_t *this = NULL;
+
+    this = THIS;
+    GF_ASSERT(this);
+
+    ret = dict_get_int32n(op_ctx, "type", SLEN("type"), &op_code);
+    if (ret) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_DICT_GET_FAILED,
+               "Failed to get quota opcode");
+        goto out;
+    }
+
+    /* Only the limit and remove variants carry a gfid to validate. */
+    if ((op_code != GF_QUOTA_OPTION_TYPE_LIMIT_USAGE) &&
+        (op_code != GF_QUOTA_OPTION_TYPE_LIMIT_OBJECTS) &&
+        (op_code != GF_QUOTA_OPTION_TYPE_REMOVE) &&
+        (op_code != GF_QUOTA_OPTION_TYPE_REMOVE_OBJECTS)) {
+        ret = 0;
+        goto out;
+    }
+
+    ret = dict_get_strn(op_ctx, "path", SLEN("path"), &path);
+    if (ret) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_DICT_GET_FAILED,
+               "Failed to get path");
+        goto out;
+    }
+
+    ret = dict_get_int32n(op_ctx, "count", SLEN("count"), &count);
+    if (ret) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_DICT_GET_FAILED,
+               "Failed to get count");
+        goto out;
+    }
+
+    /* If count is 0, fail the command with ENOENT.
+     *
+     * If count is 1, treat gfid0 as the gfid on which the operation
+     * is to be performed and resume the command.
+     *
+     * if count > 1, get the 0th gfid from the op_ctx and,
+     * compare it with the remaining 'count -1' gfids.
+     * If they are found to be the same, set gfid0 in the op_ctx and
+     * resume the operation, else error out.
+     */
+
+    if (count == 0) {
+        gf_asprintf(op_errstr,
+                    "Failed to get trusted.gfid attribute "
+                    "on path %s. Reason : %s",
+                    path, strerror(ENOENT));
+        ret = -ENOENT;
+        goto out;
+    }
+
+    keylen = snprintf(key, sizeof(key), "gfid%d", 0);
+
+    ret = dict_get_strn(op_ctx, key, keylen, &uuid1_str);
+    if (ret) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_DICT_GET_FAILED,
+               "Failed to get key '%s'", key);
+        goto out;
+    }
+
+    /* NOTE(review): the result of gf_uuid_parse() is not checked here or in
+     * the loop below. */
+    gf_uuid_parse(uuid1_str, uuid1);
+
+    for (i = 1; i < count; i++) {
+        keylen = snprintf(key, sizeof(key), "gfid%d", i);
+
+        ret = dict_get_strn(op_ctx, key, keylen, &uuid2_str);
+        if (ret) {
+            gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_DICT_GET_FAILED,
+                   "Failed to get key "
+                   "'%s'",
+                   key);
+            goto out;
+        }
+
+        gf_uuid_parse(uuid2_str, uuid2);
+
+        if (gf_uuid_compare(uuid1, uuid2)) {
+            gf_asprintf(op_errstr,
+                        "gfid mismatch between %s and "
+                        "%s for path %s",
+                        uuid1_str, uuid2_str, path);
+            ret = -1;
+            goto out;
+        }
+    }
+
+    /* i == count after a complete pass for any count >= 1; a negative count
+     * leaves i at 1 and lands in the error branch. */
+    if (i == count) {
+        uuid1_str_dup = gf_strdup(uuid1_str);
+        if (!uuid1_str_dup) {
+            ret = -1;
+            goto out;
+        }
+
+        ret = dict_set_dynstrn(req_dict, "gfid", SLEN("gfid"), uuid1_str_dup);
+        if (ret) {
+            gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_DICT_SET_FAILED,
+                   "Failed to set gfid");
+            GF_FREE(uuid1_str_dup);
+            goto out;
+        }
+    } else {
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_DICT_ITER_FAIL,
+               "Failed to iterate through %d"
+               " entries in the req dict",
+               count);
+        ret = -1;
+        goto out;
+    }
+
+    ret = 0;
+out:
+    return ret;
+}
+
+/* Remove the on-disk quota configuration and checksum files of @volinfo and
+ * reset its in-memory quota store state (handle destroyed, version set to 0).
+ *
+ * If a path does not fit its buffer it is emptied before being handed to
+ * sys_unlink(). Results of the unlink calls are not checked.
+ */
+void
+glusterd_clean_up_quota_store(glusterd_volinfo_t *volinfo)
+{
+    xlator_t *this = NULL;
+    glusterd_conf_t *conf = NULL;
+    char voldir[PATH_MAX] = "";
+    char conf_file[PATH_MAX] = "";
+    char cksum_file[PATH_MAX] = "";
+    int32_t written = 0;
+
+    this = THIS;
+    GF_ASSERT(this);
+    conf = this->private;
+    GF_ASSERT(conf);
+
+    GLUSTERD_GET_VOLUME_DIR(voldir, volinfo, conf);
+
+    written = snprintf(conf_file, sizeof(conf_file), "%s/%s", voldir,
+                       GLUSTERD_VOLUME_QUOTA_CONFIG);
+    if ((written < 0) || (written >= sizeof(conf_file)))
+        conf_file[0] = 0;
+
+    written = snprintf(cksum_file, sizeof(cksum_file), "%s/%s", voldir,
+                       GLUSTERD_VOL_QUOTA_CKSUM_FILE);
+    if ((written < 0) || (written >= sizeof(cksum_file)))
+        cksum_file[0] = 0;
+
+    sys_unlink(conf_file);
+    sys_unlink(cksum_file);
+
+    gf_store_handle_destroy(volinfo->quota_conf_shandle);
+    volinfo->quota_conf_version = 0;
+    volinfo->quota_conf_shandle = NULL;
+}
+
+/* Lazily unmount the auxiliary quota-limit mount of volume @volname.
+ *
+ * Returns the result of gf_umount_lazy(), except that a failure with errno
+ * EBADF is reported as success because it means the mount is already gone.
+ */
+int
+glusterd_remove_auxiliary_mount(char *volname)
+{
+    int ret = -1;
+    int saved_errno = 0;
+    char mountdir[PATH_MAX] = "";
+    xlator_t *this = NULL;
+
+    this = THIS;
+    GF_ASSERT(this);
+
+    GLUSTERD_GET_QUOTA_LIMIT_MOUNT_PATH(mountdir, volname, "/");
+    ret = gf_umount_lazy(this->name, mountdir, 1);
+    if (ret) {
+        /* Capture errno immediately: the logging call below may itself
+         * change it, which would make the EBADF check unreliable. */
+        saved_errno = errno;
+
+        gf_msg(this->name, GF_LOG_ERROR, saved_errno,
+               GD_MSG_LAZY_UMOUNT_FAIL,
+               "umount on %s failed, "
+               "reason : %s",
+               mountdir, strerror(saved_errno));
+
+        /* Hide EBADF as it means the mount is already gone */
+        if (saved_errno == EBADF)
+            ret = 0;
+    }
+
+    return ret;
+}
+
+/* Stop the rebalance process of the given volume by sending SIGTERM to the
+ * process recorded in the volume's defrag pid file.
+ *
+ * Returns the result of glusterd_service_stop().
+ */
+int
+gd_stop_rebalance_process(glusterd_volinfo_t *volinfo)
+{
+    char pidfile[PATH_MAX] = "";
+    glusterd_conf_t *conf = NULL;
+    xlator_t *this = NULL;
+
+    GF_ASSERT(volinfo);
+
+    this = THIS;
+    GF_ASSERT(this);
+
+    conf = this->private;
+    GF_ASSERT(conf);
+
+    GLUSTERD_GET_DEFRAG_PID_FILE(pidfile, volinfo, conf);
+
+    return glusterd_service_stop("rebalance", pidfile, SIGTERM, _gf_true);
+}
+
+/* Drop one reference on @rpc with glusterd's big lock temporarily released.
+ *
+ * conf->big_lock is unlocked before rpc_clnt_reconnect_cleanup() and
+ * rpc_clnt_unref() run and is taken again afterwards, so the caller is
+ * presumably expected to hold it on entry (NOTE(review): confirm at the call
+ * sites).
+ *
+ * Returns whatever rpc_clnt_unref() returns.
+ */
+rpc_clnt_t *
+glusterd_rpc_clnt_unref(glusterd_conf_t *conf, rpc_clnt_t *rpc)
+{
+    rpc_clnt_t *ret = NULL;
+
+    GF_ASSERT(conf);
+    GF_ASSERT(rpc);
+    synclock_unlock(&conf->big_lock);
+    /* The cleanup result is deliberately ignored. */
+    (void)rpc_clnt_reconnect_cleanup(&rpc->conn);
+    ret = rpc_clnt_unref(rpc);
+    synclock_lock(&conf->big_lock);
+
+    return ret;
+}
+
+/* List comparator: order two vol_list nodes by the volname of the volinfo
+ * that embeds them. Returns the strcmp() result of the two names.
+ */
+int32_t
+glusterd_compare_volume_name(struct cds_list_head *list1,
+                             struct cds_list_head *list2)
+{
+    glusterd_volinfo_t *lhs = cds_list_entry(list1, glusterd_volinfo_t,
+                                             vol_list);
+    glusterd_volinfo_t *rhs = cds_list_entry(list2, glusterd_volinfo_t,
+                                             vol_list);
+
+    return strcmp(lhs->volname, rhs->volname);
+}
+
+/* Completion callback passed to synctask_new() by glusterd_launch_synctask():
+ * releases glusterd's big lock and passes @ret through unchanged. @frame and
+ * @opaque are unused.
+ */
+static int
+gd_default_synctask_cbk(int ret, call_frame_t *frame, void *opaque)
+{
+    glusterd_conf_t *priv = THIS->private;
+    synclock_unlock(&priv->big_lock);
+    return ret;
+}
+
+/* Run @fn(@opaque) as a new synctask in this xlator's sync environment, with
+ * gd_default_synctask_cbk() as the completion callback. A failure to create
+ * the task is logged as critical; nothing is returned to the caller.
+ */
+void
+glusterd_launch_synctask(synctask_fn_t fn, void *opaque)
+{
+    xlator_t *this = THIS;
+
+    /* synclock_lock must be called from within synctask, @fn must call it
+     * before it starts with its work*/
+    if (synctask_new(this->ctx->env, fn, gd_default_synctask_cbk, NULL,
+                     opaque) == 0)
+        return;
+
+    gf_msg(this->name, GF_LOG_CRITICAL, 0, GD_MSG_SPAWN_SVCS_FAIL,
+           "Failed to spawn bricks"
+           " and other volume related services");
+}
+
+/*
+ * glusterd_enable_default_options enable certain options by default on the
+ * given volume based on the cluster op-version. This is called only during
+ * volume create or during volume reset
+ *
+ * @volinfo - volume on which to enable the default options
+ * @option  - option to be set to default. If NULL, all possible options will be
+ *            set to default
+ *
+ * Returns 0 on success and -1 on failure. If @option is given, but doesn't
+ * match any of the options that could be set, it is a success.
+ */
+/*
+ * TODO: Make this able to parse the volume-set table to set options
+ * Currently, the check and set for any option which wants to make use of this
+ * 'framework' needs to be done here manually. This would mean more work for the
+ * developer. This little extra work can be avoided if we make it possible to
+ * parse the volume-set table to get the options which could be set and their
+ * default values
+ */
+int
+glusterd_enable_default_options(glusterd_volinfo_t *volinfo, char *option)
+{
+    int ret = 0;
+    xlator_t *this = NULL;
+    glusterd_conf_t *conf = NULL;
+    /* Default for "transport.address-family", selected at build time. */
+#ifdef IPV6_DEFAULT
+    char *addr_family = "inet6";
+#else
+    char *addr_family = "inet";
+#endif
+
+    this = THIS;
+    GF_ASSERT(this);
+
+    GF_VALIDATE_OR_GOTO(this->name, volinfo, out);
+
+    conf = this->private;
+    GF_ASSERT(conf);
+
+#ifdef GD_OP_VERSION_3_8_0
+    if (conf->op_version >= GD_OP_VERSION_3_8_0) {
+        /* nfs.disable needs to be enabled for new volumes with
+         * >= gluster version 3.7 (for now) 3.8 later
+         */
+        if (!option || !strcmp(NFS_DISABLE_MAP_KEY, option)) {
+            ret = dict_set_dynstr_with_alloc(volinfo->dict, NFS_DISABLE_MAP_KEY,
+                                             "on");
+            if (ret) {
+                gf_msg(this->name, GF_LOG_ERROR, errno, GD_MSG_DICT_SET_FAILED,
+                       "Failed to set option '" NFS_DISABLE_MAP_KEY
+                       "' on volume "
+                       "%s",
+                       volinfo->volname);
+                goto out;
+            }
+        }
+    }
+#endif
+
+    if (conf->op_version >= GD_OP_VERSION_3_7_0) {
+        /* Set needed volume options in volinfo->dict
+         * For ex.,
+         *
+         * if (!option || !strcmp("someoption", option)) {
+         *      ret = dict_set_str(volinfo->dict, "someoption", "on");
+         *      ...
+         * }
+         * */
+
+        /* Option 'features.quota-deem-statfs' should not be turned off
+         * with 'gluster volume reset <VOLNAME>', since quota features
+         * can be reset only with 'gluster volume quota <VOLNAME>
+         * disable'.
+         */
+
+        if (!option || !strcmp("features.quota-deem-statfs", option)) {
+            if (glusterd_is_volume_quota_enabled(volinfo)) {
+                ret = dict_set_dynstr_with_alloc(
+                    volinfo->dict, "features.quota-deem-statfs", "on");
+                if (ret) {
+                    gf_msg(this->name, GF_LOG_ERROR, errno,
+                           GD_MSG_DICT_SET_FAILED,
+                           "Failed to set option "
+                           "'features.quota-deem-statfs' "
+                           "on volume %s",
+                           volinfo->volname);
+                    goto out;
+                }
+            }
+        }
+    }
+
+    if (conf->op_version >= GD_OP_VERSION_3_9_0) {
+        /* The address family is only defaulted for TCP-only volumes. */
+        if (!option || !strcmp("transport.address-family", option)) {
+            if (volinfo->transport_type == GF_TRANSPORT_TCP) {
+                ret = dict_set_dynstr_with_alloc(
+                    volinfo->dict, "transport.address-family", addr_family);
+                if (ret) {
+                    gf_msg(this->name, GF_LOG_ERROR, errno,
+                           GD_MSG_DICT_SET_FAILED,
+                           "failed to set transport."
+                           "address-family on %s",
+                           volinfo->volname);
+                    goto out;
+                }
+            }
+        }
+    }
+
+    /* NOTE(review): unlike the blocks above, this one does not look at
+     * @option, so 'storage.fips-mode-rchecksum' is set to "on" on every call
+     * once the cluster op-version allows it - confirm this is intended. */
+    if (conf->op_version >= GD_OP_VERSION_7_0) {
+        ret = dict_set_dynstr_with_alloc(volinfo->dict,
+                                         "storage.fips-mode-rchecksum", "on");
+        if (ret) {
+            gf_msg(this->name, GF_LOG_ERROR, errno, GD_MSG_DICT_SET_FAILED,
+                   "Failed to set option 'storage.fips-mode-rchecksum' "
+                   "on volume %s",
+                   volinfo->volname);
+            goto out;
+        }
+    }
+out:
+    return ret;
+}
+
+/* Write the path of the trusted gfproxy fuse client volfile of @volinfo into
+ * @path (at most @path_len bytes). The file name depends on the transport:
+ * TCP and TCP+RDMA volumes use the "tcp" variant, RDMA-only volumes the
+ * "rdma" one. For any other transport type @path is left untouched.
+ */
+void
+glusterd_get_gfproxy_client_volfile(glusterd_volinfo_t *volinfo, char *path,
+                                    int path_len)
+{
+    glusterd_conf_t *priv = THIS->private;
+    char workdir[PATH_MAX] = "";
+
+    GLUSTERD_GET_VOLUME_DIR(workdir, volinfo, priv);
+
+    if ((volinfo->transport_type == GF_TRANSPORT_TCP) ||
+        (volinfo->transport_type == GF_TRANSPORT_BOTH_TCP_RDMA)) {
+        snprintf(path, path_len, "%s/trusted-%s.tcp-gfproxy-fuse.vol", workdir,
+                 volinfo->volname);
+    } else if (volinfo->transport_type == GF_TRANSPORT_RDMA) {
+        snprintf(path, path_len, "%s/trusted-%s.rdma-gfproxy-fuse.vol",
+                 workdir, volinfo->volname);
+    }
+}
+
+/* Write the path of the rebalance volfile of @volinfo,
+ * "<volume dir>/<volname>-rebalance.vol", into @path (at most @path_len
+ * bytes). The volume directory comes from GLUSTERD_GET_VOLUME_DIR().
+ */
+void
+glusterd_get_rebalance_volfile(glusterd_volinfo_t *volinfo, char *path,
+                               int path_len)
+{
+    char workdir[PATH_MAX] = "";
+    glusterd_conf_t *priv = THIS->private;
+
+    GLUSTERD_GET_VOLUME_DIR(workdir, volinfo, priv);
+
+    snprintf(path, path_len, "%s/%s-rebalance.vol", workdir, volinfo->volname);
+}
+
+/* This function will update the backend file-system
+ * type and the mount options in origin and snap brickinfo.
+ * This will be later used to perform file-system specific operation
+ * during LVM snapshot.
+ *
+ * The file-system type must fit brickinfo->fstype, otherwise the call fails;
+ * the mount options are copied with truncation.
+ *
+ * @param brick_path       brickpath for which fstype to be found
+ * @param brickinfo        brickinfo of snap/origin volume
+ * @return 0 on success and -1 on failure
+ */
+int
+glusterd_update_mntopts(char *brick_path, glusterd_brickinfo_t *brickinfo)
+{
+    int32_t ret = -1;
+    char *mnt_pt = NULL;
+    char buff[PATH_MAX] = "";
+    struct mntent *entry = NULL;
+    struct mntent save_entry = {
+        0,
+    };
+    xlator_t *this = NULL;
+
+    this = THIS;
+    GF_ASSERT(this);
+    GF_ASSERT(brick_path);
+    GF_ASSERT(brickinfo);
+
+    ret = glusterd_get_brick_root(brick_path, &mnt_pt);
+    if (ret) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_BRICKPATH_ROOT_GET_FAIL,
+               "getting the root "
+               "of the brick (%s) failed ",
+               brick_path);
+        goto out;
+    }
+
+    entry = glusterd_get_mnt_entry_info(mnt_pt, buff, sizeof(buff),
+                                        &save_entry);
+    if (!entry) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_MNTENTRY_GET_FAIL,
+               "getting the mount entry for "
+               "the brick (%s) failed",
+               brick_path);
+        ret = -1;
+        goto out;
+    }
+
+    /* A truncated file-system type would be misleading, so treat it as an
+     * error rather than storing a partial name. */
+    if (snprintf(brickinfo->fstype, sizeof(brickinfo->fstype), "%s",
+                 entry->mnt_type) >= sizeof(brickinfo->fstype)) {
+        ret = -1;
+        goto out;
+    }
+
+    /* One bounded copy is enough; the options were previously written twice
+     * (snprintf followed by this gf_strncpy) with the same result. */
+    gf_strncpy(brickinfo->mnt_opts, entry->mnt_opts,
+               sizeof(brickinfo->mnt_opts));
+
+    ret = 0;
+out:
+    if (mnt_pt)
+        GF_FREE(mnt_pt);
+    return ret;
+}
+
+/* Look up the default value of the volume option described by @vme and return
+ * a newly allocated copy of it in @def_val (the literal "(null)" when the
+ * option has no default).
+ *
+ * Returns 0 on success, -1 when the xlator option key cannot be derived from
+ * @vme or the copy cannot be allocated, and -2 when the xlator's option table
+ * cannot be loaded or does not contain the key.
+ */
+int
+glusterd_get_value_for_vme_entry(struct volopt_map_entry *vme, char **def_val)
+{
+    int ret = -1;
+    char *key = NULL;
+    xlator_t *this = NULL;
+    char *descr = NULL;
+    char *local_def_val = NULL;
+    void *dl_handle = NULL;
+    volume_opt_list_t vol_opt_handle = {
+        {0},
+    };
+
+    this = THIS;
+    GF_ASSERT(this);
+
+    CDS_INIT_LIST_HEAD(&vol_opt_handle.list);
+
+    if (_get_xlator_opt_key_from_vme(vme, &key)) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_GET_KEY_FAILED,
+               "Failed to get %s key from "
+               "volume option entry",
+               vme->key);
+        goto out;
+    }
+
+    ret = xlator_volopt_dynload(vme->voltype, &dl_handle, &vol_opt_handle);
+    if (ret) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_XLATOR_VOLOPT_DYNLOAD_ERROR,
+               "xlator_volopt_dynload error "
+               "(%d)",
+               ret);
+        ret = -2;
+        goto cont;
+    }
+
+    ret = xlator_option_info_list(&vol_opt_handle, key, &local_def_val, &descr);
+    if (ret) {
+        /*Swallow Error if option not found*/
+        gf_msg(this->name, GF_LOG_WARNING, 0, GD_MSG_GET_KEY_FAILED,
+               "Failed to get option for %s "
+               "key",
+               key);
+        ret = -2;
+        goto cont;
+    }
+    if (!local_def_val)
+        local_def_val = "(null)";
+
+    /* Copy before the xlator is unloaded below; a failed allocation must not
+     * be reported as success with a NULL *def_val. */
+    *def_val = gf_strdup(local_def_val);
+    if (!*def_val)
+        ret = -1;
+
+cont:
+    if (dl_handle) {
+        dlclose(dl_handle);
+        dl_handle = NULL;
+        vol_opt_handle.given_opt = NULL;
+    }
+    if (key) {
+        _free_xlator_opt_key(key);
+        key = NULL;
+    }
+
+out:
+    gf_msg_debug(this->name, 0, "Returning %d", ret);
+    return ret;
+}
+
+/*
+ * Publish the cluster maximum op-version in @ctx as the pair
+ * "key<count>" = GLUSTERD_MAX_OP_VERSION_KEY and
+ * "value<count>" = the string found under "max-opversion" in @ctx.
+ *
+ * Returns 0 on success, or the non-zero status of the dict operation that
+ * failed.
+ */
+int
+glusterd_get_global_max_op_version(rpcsvc_request_t *req, dict_t *ctx,
+                                   int count)
+{
+    char pair_key[50] = "";
+    char *max_opversion = NULL;
+    int pair_keylen;
+    int ret = -1;
+
+    /* NOTE(review): the status of this call is overwritten by the lookup
+     * below without being inspected; presumably the mgmt_v3 phases are what
+     * populate "max-opversion" in ctx - confirm. */
+    ret = glusterd_mgmt_v3_initiate_all_phases(req, GD_OP_MAX_OPVERSION, ctx);
+
+    ret = dict_get_strn(ctx, "max-opversion", SLEN("max-opversion"),
+                        &max_opversion);
+    if (ret) {
+        gf_msg(THIS->name, GF_LOG_ERROR, 0, GD_MSG_DICT_GET_FAILED,
+               "Failed to get max-opversion value from"
+               " dictionary");
+        return ret;
+    }
+
+    /* key<count> -> option name */
+    pair_keylen = sprintf(pair_key, "key%d", count);
+    ret = dict_set_nstrn(ctx, pair_key, pair_keylen,
+                         GLUSTERD_MAX_OP_VERSION_KEY,
+                         SLEN(GLUSTERD_MAX_OP_VERSION_KEY));
+    if (ret) {
+        gf_msg(THIS->name, GF_LOG_ERROR, 0, GD_MSG_DICT_SET_FAILED,
+               "Failed to set %s in "
+               "dictionary",
+               GLUSTERD_MAX_OP_VERSION_KEY);
+        return ret;
+    }
+
+    /* value<count> -> option value */
+    sprintf(pair_key, "value%d", count);
+    ret = dict_set_dynstr_with_alloc(ctx, pair_key, max_opversion);
+    if (ret) {
+        gf_msg(THIS->name, GF_LOG_ERROR, 0, GD_MSG_DICT_SET_FAILED,
+               "Failed to set %s for key %s in dictionary", max_opversion,
+               GLUSTERD_MAX_OP_VERSION_KEY);
+    }
+
+    return ret;
+}
+
+/*
+ * Fill @ctx with the current values of the cluster-wide ("all volumes")
+ * options.
+ *
+ * The option to report is read from the "key" entry of @ctx.  The name
+ * "all" (case-insensitive) selects every entry of valid_all_vol_opts; any
+ * other name is checked with glusterd_check_option_exists(), which may
+ * hand back a corrected spelling in key_fixed.
+ *
+ * For each selected option a "key<N>"/"value<N>" pair is stored in @ctx and
+ * the number of pairs is stored under "count".  A value comes from
+ * priv->opts when present there; otherwise it is priv->op_version for
+ * GLUSTERD_GLOBAL_OP_VERSION_KEY, or "<dflt_val> (DEFAULT)" for any other
+ * option.  GLUSTERD_MAX_OP_VERSION_KEY is delegated to
+ * glusterd_get_global_max_op_version().
+ *
+ * Returns 0 on success and non-zero on failure; on failure of a single-key
+ * request *op_errstr may be set to an allocated message.
+ */
+int
+glusterd_get_global_options_for_all_vols(rpcsvc_request_t *req, dict_t *ctx,
+                                         char **op_errstr)
+{
+    int ret = -1;
+    int count = 0;
+    gf_boolean_t all_opts = _gf_false;
+    gf_boolean_t key_found = _gf_false;
+    glusterd_conf_t *priv = NULL;
+    xlator_t *this = NULL;
+    char *key = NULL;
+    char *key_fixed = NULL;
+    char dict_key[50] = "";
+    char *def_val = NULL;
+    char err_str[PATH_MAX] = "";
+    char *allvolopt = NULL;
+    int32_t i = 0;
+    gf_boolean_t exists = _gf_false;
+    gf_boolean_t need_free = _gf_false;
+
+    this = THIS;
+    GF_VALIDATE_OR_GOTO(THIS->name, this, out);
+
+    priv = this->private;
+    GF_VALIDATE_OR_GOTO(this->name, priv, out);
+
+    GF_VALIDATE_OR_GOTO(this->name, ctx, out);
+
+    ret = dict_get_strn(ctx, "key", SLEN("key"), &key);
+    if (ret) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_DICT_GET_FAILED,
+               "Failed to get option key from dictionary");
+        goto out;
+    }
+
+    if (strcasecmp(key, "all") == 0)
+        all_opts = _gf_true;
+    else {
+        exists = glusterd_check_option_exists(key, &key_fixed);
+        if (!exists) {
+            snprintf(err_str, sizeof(err_str),
+                     "Option "
+                     "with name: %s does not exist",
+                     key);
+            gf_msg(this->name, GF_LOG_ERROR, EINVAL, GD_MSG_UNKNOWN_KEY, "%s",
+                   err_str);
+            /* NOTE(review): this replaces the "does not exist" text in
+             * err_str, so only the suggestion reaches *op_errstr. */
+            if (key_fixed)
+                snprintf(err_str, sizeof(err_str), "Did you mean %s?",
+                         key_fixed);
+            ret = -1;
+            goto out;
+        }
+        /* Continue with the corrected spelling; key_fixed is freed at out. */
+        if (key_fixed)
+            key = key_fixed;
+    }
+    /* NOTE(review): macro defined outside this view; judging by its
+     * arguments it may set ret / *op_errstr and jump to out - confirm. */
+    /* coverity[CONSTANT_EXPRESSION_RESULT] */
+    ALL_VOLUME_OPTION_CHECK("all", _gf_true, key, ret, op_errstr, out);
+
+    for (i = 0; valid_all_vol_opts[i].option; i++) {
+        allvolopt = valid_all_vol_opts[i].option;
+
+        if (!all_opts && strcmp(key, allvolopt) != 0)
+            continue;
+
+        /* Found global option */
+        if (strcmp(allvolopt, GLUSTERD_MAX_OP_VERSION_KEY) == 0) {
+            count++;
+            ret = glusterd_get_global_max_op_version(req, ctx, count);
+            if (ret)
+                goto out;
+            else
+                continue;
+        }
+
+        /* The status is not inspected; the def_val check below decides
+         * whether an explicit value was found. */
+        ret = dict_get_str(priv->opts, allvolopt, &def_val);
+
+        /* If global option isn't set explicitly */
+
+        if (!def_val) {
+            /* def_val becomes heap-owned here; need_free records that so
+             * it is released after being copied into ctx. */
+            if (!strcmp(allvolopt, GLUSTERD_GLOBAL_OP_VERSION_KEY)) {
+                gf_asprintf(&def_val, "%d", priv->op_version);
+                need_free = _gf_true;
+            } else {
+                gf_asprintf(&def_val, "%s (DEFAULT)",
+                            valid_all_vol_opts[i].dflt_val);
+                need_free = _gf_true;
+            }
+        }
+
+        count++;
+        /* ret briefly holds the key length returned by sprintf(). */
+        ret = sprintf(dict_key, "key%d", count);
+        ret = dict_set_strn(ctx, dict_key, ret, allvolopt);
+        if (ret) {
+            gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_DICT_SET_FAILED,
+                   "Failed to set %s in dictionary", allvolopt);
+            goto out;
+        }
+
+        sprintf(dict_key, "value%d", count);
+        ret = dict_set_dynstr_with_alloc(ctx, dict_key, def_val);
+        if (ret) {
+            gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_DICT_SET_FAILED,
+                   "Failed to set %s for key %s in dictionary", def_val,
+                   allvolopt);
+            goto out;
+        }
+
+        if (need_free) {
+            GF_FREE(def_val);
+            need_free = _gf_false;
+        }
+        def_val = NULL;
+        allvolopt = NULL;
+
+        /* A single-key request is complete after its first match. */
+        if (!all_opts)
+            break;
+    }
+
+    ret = dict_set_int32n(ctx, "count", SLEN("count"), count);
+    if (ret) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_DICT_SET_FAILED,
+               "Failed to set count in dictionary");
+    }
+
+out:
+    /* NOTE(review): key_found is never set to true in this function, so
+     * the !key_found term below always holds. */
+    if (ret && !all_opts && !key_found) {
+        if (err_str[0] == 0)
+            snprintf(err_str, sizeof(err_str), "option %s does not exist", key);
+        if (*op_errstr == NULL)
+            *op_errstr = gf_strdup(err_str);
+    }
+
+    /* Release a generated value that an error path left behind. */
+    if (ret && need_free) {
+        GF_FREE(def_val);
+    }
+    GF_FREE(key_fixed);
+    gf_msg_debug(THIS->name, 0, "Returning %d", ret);
+
+    return ret;
+}
+
+/*
+ * Return the value implied for @key by the volume's layout, or NULL when
+ * none applies.
+ *
+ * Only volumes for which glusterd_is_volume_replicate() holds yield a value:
+ *   "performance.client-io-threads" -> "off"
+ *   "cluster.quorum-type"           -> "auto" (odd replica_count only)
+ *
+ * The returned pointer is a string literal and must not be freed.
+ */
+char *
+glusterd_get_option_value(glusterd_volinfo_t *volinfo, char *key)
+{
+    if (!glusterd_is_volume_replicate(volinfo))
+        return NULL;
+
+    if (strcmp(key, "performance.client-io-threads") == 0)
+        return "off";
+
+    if ((strcmp(key, "cluster.quorum-type") == 0) &&
+        ((volinfo->replica_count % 2) != 0))
+        return "auto";
+
+    return NULL;
+}
+
+/*
+ * Fill @ctx with the effective value of one volume option, or of every
+ * option in glusterd_volopt_map when @all_opts is true.
+ *
+ * For each selected map entry the value is resolved in this order:
+ *   1. priv->opts (global options)
+ *   2. volinfo->dict (per-volume settings)
+ *   3. glusterd_get_option_value(), only when the volume dict lookup
+ *      returned -ENOENT
+ *   4. vme->value (static default from the option map)
+ *   5. glusterd_get_value_for_vme_entry() (translator default); such values
+ *      are stored with a " (DEFAULT)" suffix
+ *
+ * Each result is stored as a "key<N>"/"value<N>" pair and the number of
+ * pairs is stored under "count".
+ *
+ * @input_key is the option name matched against the map; @orig_key is only
+ * used in the error text placed in *op_errstr when a single requested
+ * option is not found.
+ *
+ * Returns 0 on success and non-zero on failure.
+ */
+int
+glusterd_get_default_val_for_volopt(dict_t *ctx, gf_boolean_t all_opts,
+                                    char *input_key, char *orig_key,
+                                    glusterd_volinfo_t *volinfo,
+                                    char **op_errstr)
+{
+    struct volopt_map_entry *vme = NULL;
+    int ret = -1;
+    int count = 0;
+    xlator_t *this = NULL;
+    char *def_val = NULL;
+    char *def_val_str = NULL;
+    char dict_key[50] = "";
+    int keylen;
+    gf_boolean_t key_found = _gf_false;
+    /* True only while def_val holds the heap copy returned by
+     * glusterd_get_value_for_vme_entry(); every other source is borrowed. */
+    gf_boolean_t get_value_vme = _gf_false;
+    glusterd_conf_t *priv = NULL;
+    dict_t *vol_dict = NULL;
+
+    this = THIS;
+    GF_ASSERT(this);
+
+    priv = this->private;
+    GF_VALIDATE_OR_GOTO(this->name, priv, out);
+
+    vol_dict = volinfo->dict;
+    GF_VALIDATE_OR_GOTO(this->name, vol_dict, out);
+
+    /* Check whether key is passed for a single option */
+    if (!all_opts && !input_key) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_KEY_NULL, "Key is NULL");
+        goto out;
+    }
+
+    for (vme = &glusterd_volopt_map[0]; vme->key; vme++) {
+        if (!all_opts && strcmp(vme->key, input_key))
+            continue;
+        key_found = _gf_true;
+        get_value_vme = _gf_false;
+        /* First look for the key in the priv->opts for global option
+         * and then into vol_dict, if its not present then look for
+         * translator default value */
+        keylen = strlen(vme->key);
+        ret = dict_get_strn(priv->opts, vme->key, keylen, &def_val);
+        if (!def_val) {
+            ret = dict_get_strn(vol_dict, vme->key, keylen, &def_val);
+            if (ret == -ENOENT)
+                def_val = glusterd_get_option_value(volinfo, vme->key);
+            if (!def_val) {
+                if (vme->value) {
+                    def_val = vme->value;
+                } else {
+                    ret = glusterd_get_value_for_vme_entry(vme, &def_val);
+                    get_value_vme = _gf_true;
+                    if (!all_opts && ret)
+                        goto out;
+                    else if (ret == -2)
+                        continue;
+                }
+            }
+        }
+        count++;
+        keylen = sprintf(dict_key, "key%d", count);
+        ret = dict_set_strn(ctx, dict_key, keylen, vme->key);
+        if (ret) {
+            gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_DICT_SET_FAILED,
+                   "Failed to "
+                   "set %s in dictionary",
+                   vme->key);
+            goto out;
+        }
+        sprintf(dict_key, "value%d", count);
+        if (get_value_vme) {  // the value was never changed  - DEFAULT is used
+            gf_asprintf(&def_val_str, "%s (DEFAULT)", def_val);
+            ret = dict_set_dynstr_with_alloc(ctx, dict_key, def_val_str);
+            GF_FREE(def_val_str);
+            def_val_str = NULL;
+        } else
+            ret = dict_set_dynstr_with_alloc(ctx, dict_key, def_val);
+        if (ret) {
+            gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_DICT_SET_FAILED,
+                   "Failed to "
+                   "set %s for key %s in dictionary",
+                   def_val, vme->key);
+            goto out;
+        }
+        if (get_value_vme)
+            GF_FREE(def_val);
+
+        def_val = NULL;
+        if (!all_opts)
+            break;
+    }
+    if (!all_opts && !key_found)
+        goto out;
+
+    ret = dict_set_int32n(ctx, "count", SLEN("count"), count);
+    if (ret) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_DICT_SET_FAILED,
+               "Failed to set count "
+               "in dictionary");
+    }
+
+out:
+    if (ret && !all_opts && !key_found) {
+        char err_str[PATH_MAX];
+        snprintf(err_str, sizeof(err_str), "option %s does not exist",
+                 orig_key);
+        *op_errstr = gf_strdup(err_str);
+    }
+    /* An error path can arrive here with def_val still pointing at memory
+     * owned by a dict, at the static option map, or at a string literal.
+     * Free it only when it is the heap copy tracked by get_value_vme. */
+    if (get_value_vme && def_val)
+        GF_FREE(def_val);
+    gf_msg_debug(this->name, 0, "Returning %d", ret);
+    return ret;
+}
+
+/*
+ * Build the "volume set help" text and store it in @ctx under "help-str".
+ *
+ * Every entry of glusterd_volopt_map whose type is neither NO_DOC nor
+ * GLOBAL_NO_DOC is described.  The description and default value come from
+ * the map entry itself when it carries a description, otherwise from the
+ * option table of the xlator named by vme->voltype; entries whose xlator
+ * cannot be loaded or does not list the option are skipped.
+ *
+ * With @xml_out false the text is accumulated in a buffer that starts at
+ * 64 KiB and grows on demand.  With @xml_out true (and libxml support
+ * compiled in) the text is produced through the XML writer helpers.
+ *
+ * Returns a negative value on failure; on success ownership of the
+ * generated string passes to @ctx.
+ */
+int
+glusterd_get_volopt_content(dict_t *ctx, gf_boolean_t xml_out)
+{
+    void *dl_handle = NULL;
+    volume_opt_list_t vol_opt_handle = {
+        {0},
+    };
+    char *key = NULL;
+    struct volopt_map_entry *vme = NULL;
+    int ret = -1;
+    char *def_val = NULL;
+    char *descr = NULL;
+    char *output = NULL;
+    size_t size = 0;
+    size_t used = 0;
+#if (HAVE_LIB_XML)
+    xmlTextWriterPtr writer = NULL;
+    xmlBufferPtr buf = NULL;
+
+    if (xml_out) {
+        ret = init_sethelp_xml_doc(&writer, &buf);
+        if (ret) /*logging done in init_xml_lib*/
+            goto out;
+    }
+#endif
+
+    if (!xml_out) {
+        size = 65536;
+        output = GF_MALLOC(size, gf_common_mt_char);
+        if (output == NULL) {
+            ret = -1;
+            goto out;
+        }
+    }
+
+    CDS_INIT_LIST_HEAD(&vol_opt_handle.list);
+
+    for (vme = &glusterd_volopt_map[0]; vme->key; vme++) {
+        if ((vme->type == NO_DOC) || (vme->type == GLOBAL_NO_DOC))
+            continue;
+
+        if (vme->description) {
+            descr = vme->description;
+            def_val = vme->value;
+        } else {
+            if (_get_xlator_opt_key_from_vme(vme, &key)) {
+                gf_msg_debug("glusterd", 0,
+                             "Failed to "
+                             "get %s key from volume option entry",
+                             vme->key);
+                /* ret still holds the status of the previous iteration
+                 * (usually 0); make sure this is reported as a failure. */
+                ret = -1;
+                goto out; /*Some error while getting key*/
+            }
+
+            ret = xlator_volopt_dynload(vme->voltype, &dl_handle,
+                                        &vol_opt_handle);
+
+            if (ret) {
+                gf_msg_debug("glusterd", 0, "xlator_volopt_dynload error(%d)",
+                             ret);
+                ret = 0;
+                goto cont;
+            }
+
+            ret = xlator_option_info_list(&vol_opt_handle, key, &def_val,
+                                          &descr);
+            if (ret) { /*Swallow Error i.e if option not found*/
+                gf_msg_debug("glusterd", 0, "Failed to get option for %s key",
+                             key);
+                ret = 0;
+                goto cont;
+            }
+        }
+
+        if (xml_out) {
+#if (HAVE_LIB_XML)
+            if (xml_add_volset_element(writer, vme->key, def_val, descr)) {
+                ret = -1;
+                goto cont;
+            }
+#else
+            gf_msg("glusterd", GF_LOG_ERROR, 0, GD_MSG_MODULE_NOT_INSTALLED,
+                   "Libxml not present");
+#endif
+        } else {
+            void *tmp;
+            int len;
+
+            /* Format into the space left; when the text does not fit,
+             * grow the buffer and format again. */
+            do {
+                len = snprintf(output + used, size - used,
+                               "Option: %s\nDefault Value: %s\n"
+                               "Description: %s\n\n",
+                               vme->key, def_val, descr);
+                if (len < 0) {
+                    ret = -1;
+                    goto cont;
+                }
+                if (used + len < size) {
+                    used += len;
+                    break;
+                }
+
+                /* Grow by len rounded to a multiple of 64 KiB. */
+                size += (len + 65536) & ~65535;
+                tmp = GF_REALLOC(output, size);
+                if (tmp == NULL) {
+                    ret = -1;
+                    goto cont;
+                }
+                output = tmp;
+            } while (1);
+        }
+    cont:
+        /* Per-entry cleanup, reached from success and skip paths alike. */
+        if (dl_handle) {
+            dlclose(dl_handle);
+            dl_handle = NULL;
+            vol_opt_handle.given_opt = NULL;
+        }
+        if (key) {
+            _free_xlator_opt_key(key);
+            key = NULL;
+        }
+        if (ret)
+            goto out;
+    }
+
+#if (HAVE_LIB_XML)
+    if ((xml_out) && (ret = end_sethelp_xml_doc(writer)))
+        goto out;
+#else
+    if (xml_out)
+        gf_msg("glusterd", GF_LOG_ERROR, 0, GD_MSG_MODULE_NOT_INSTALLED,
+               "Libxml not present");
+#endif
+
+    if (xml_out) {
+#if (HAVE_LIB_XML)
+        output = gf_strdup((char *)buf->content);
+        if (NULL == output) {
+            ret = -1;
+            goto out;
+        }
+#else
+        gf_msg("glusterd", GF_LOG_ERROR, 0, GD_MSG_MODULE_NOT_INSTALLED,
+               "Libxml not present");
+#endif
+    }
+
+    ret = dict_set_dynstrn(ctx, "help-str", SLEN("help-str"), output);
+    if (ret >= 0) {
+        /* The dict owns the buffer now; keep the free below from
+         * releasing it. */
+        output = NULL;
+    }
+out:
+    GF_FREE(output);
+    gf_msg_debug("glusterd", 0, "Returning %d", ret);
+    return ret;
+}
+
+/*
+ * Check that every client transport attached to @volname supports
+ * @op_version.
+ *
+ * priv->xprt_list is walked under priv->xprt_lock.  The first transport
+ * whose peerinfo.volname equals @volname and whose
+ * [min_op_version, max_op_version] range excludes @op_version is logged
+ * and, when @op_errstr is non-NULL, described in a newly allocated message.
+ *
+ * Returns 0 when all matching clients support @op_version, -1 otherwise.
+ */
+int
+glusterd_check_client_op_version_support(char *volname, uint32_t op_version,
+                                         char **op_errstr)
+{
+    int ret = 0;
+    xlator_t *this = NULL;
+    glusterd_conf_t *priv = NULL;
+    rpc_transport_t *xprt = NULL;
+
+    this = THIS;
+    GF_ASSERT(this);
+    priv = this->private;
+    GF_ASSERT(priv);
+
+    pthread_mutex_lock(&priv->xprt_lock);
+    list_for_each_entry(xprt, &priv->xprt_list, list)
+    {
+        if (strcmp(volname, xprt->peerinfo.volname) != 0)
+            continue;
+
+        if ((op_version <= xprt->peerinfo.max_op_version) &&
+            (op_version >= xprt->peerinfo.min_op_version))
+            continue;
+
+        /* Report while xprt_lock is still held: xprt belongs to the
+         * locked list and must not be dereferenced after the unlock. */
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_UNSUPPORTED_VERSION,
+               "Client %s is running with min_op_version as %d and "
+               "max_op_version as %d and don't support the required "
+               "op-version %d",
+               xprt->peerinfo.identifier, xprt->peerinfo.min_op_version,
+               xprt->peerinfo.max_op_version, op_version);
+        if (op_errstr)
+            (void)gf_asprintf(op_errstr,
+                              "One of the client %s is "
+                              "running with op-version %d and "
+                              "doesn't support the required "
+                              "op-version %d. This client needs to"
+                              " be upgraded or disconnected "
+                              "before running this command again",
+                              xprt->peerinfo.identifier,
+                              xprt->peerinfo.max_op_version, op_version);
+
+        ret = -1;
+        break;
+    }
+    pthread_mutex_unlock(&priv->xprt_lock);
+
+    return ret;
+}
+
+/* Report whether the peer list in glusterd's private config is non-empty. */
+gf_boolean_t
+glusterd_have_peers()
+{
+    xlator_t *this = THIS;
+    glusterd_conf_t *conf = NULL;
+
+    GF_ASSERT(this);
+    conf = this->private;
+    GF_ASSERT(conf);
+
+    if (cds_list_empty(&conf->peers))
+        return _gf_false;
+
+    return _gf_true;
+}
+
+/* Report whether @volinfo is in the GLUSTERD_STATUS_STARTED state. */
+gf_boolean_t
+glusterd_is_volume_started(glusterd_volinfo_t *volinfo)
+{
+    GF_ASSERT(volinfo);
+
+    if (volinfo->status != GLUSTERD_STATUS_STARTED)
+        return _gf_false;
+
+    return _gf_true;
+}
+
+/*
+ * Point *voltype_str at the vol_type_str entry describing @volinfo's type.
+ *
+ * The table index is computed by get_vol_type() from the volume's type,
+ * dist_leaf_count and brick_count.  The stored pointer refers to the table
+ * itself, not to a copy.
+ *
+ * Returns 0 on success, -1 when @volinfo is NULL.
+ */
+int
+glusterd_volume_get_type_str(glusterd_volinfo_t *volinfo, char **voltype_str)
+{
+    GF_VALIDATE_OR_GOTO(THIS->name, volinfo, invalid);
+
+    *voltype_str = vol_type_str[get_vol_type(
+        volinfo->type, volinfo->dist_leaf_count, volinfo->brick_count)];
+    return 0;
+
+invalid:
+    return -1;
+}
+
+/*
+ * Write the display name of @volinfo's status ("Created", "Started" or
+ * "Stopped") into @status_str.
+ *
+ * The caller supplies a buffer large enough for the longest name.
+ * Returns 0 on success; -1 when an argument is NULL or the status is not
+ * one of the three known values, in which case @status_str is untouched.
+ */
+int
+glusterd_volume_get_status_str(glusterd_volinfo_t *volinfo, char *status_str)
+{
+    const char *label = NULL;
+
+    GF_VALIDATE_OR_GOTO(THIS->name, volinfo, fail);
+    GF_VALIDATE_OR_GOTO(THIS->name, status_str, fail);
+
+    switch (volinfo->status) {
+        case GLUSTERD_STATUS_NONE:
+            label = "Created";
+            break;
+        case GLUSTERD_STATUS_STARTED:
+            label = "Started";
+            break;
+        case GLUSTERD_STATUS_STOPPED:
+            label = "Stopped";
+            break;
+        default:
+            goto fail;
+    }
+
+    sprintf(status_str, "%s", label);
+    return 0;
+
+fail:
+    return -1;
+}
+
+/*
+ * Write the display name of @brickinfo's status into @status_str:
+ * "Stopped", "Started", "Starting", "Stopping", or "None" for any other
+ * value.  Nothing is written when either argument is NULL.
+ *
+ * The caller supplies a buffer large enough for the longest name.
+ */
+void
+glusterd_brick_get_status_str(glusterd_brickinfo_t *brickinfo, char *status_str)
+{
+    const char *label = "None";
+
+    GF_VALIDATE_OR_GOTO(THIS->name, brickinfo, done);
+    GF_VALIDATE_OR_GOTO(THIS->name, status_str, done);
+
+    switch (brickinfo->status) {
+        case GF_BRICK_STOPPED:
+            label = "Stopped";
+            break;
+        case GF_BRICK_STARTED:
+            label = "Started";
+            break;
+        case GF_BRICK_STARTING:
+            label = "Starting";
+            break;
+        case GF_BRICK_STOPPING:
+            label = "Stopping";
+            break;
+        default:
+            /* keep the "None" fallback */
+            break;
+    }
+
+    sprintf(status_str, "%s", label);
+
+done:
+    return;
+}
+
+/*
+ * Write the name of @volinfo's transport type ("tcp", "rdma" or
+ * "tcp_rdma_both") into @transport_type_str.
+ *
+ * The caller supplies a buffer large enough for the longest name.
+ * Returns 0 on success; -1 when an argument is NULL or the transport type
+ * is not one of the three known values, in which case the buffer is
+ * untouched.
+ */
+int
+glusterd_volume_get_transport_type_str(glusterd_volinfo_t *volinfo,
+                                       char *transport_type_str)
+{
+    const char *label = NULL;
+
+    GF_VALIDATE_OR_GOTO(THIS->name, volinfo, fail);
+    GF_VALIDATE_OR_GOTO(THIS->name, transport_type_str, fail);
+
+    switch (volinfo->transport_type) {
+        case GF_TRANSPORT_TCP:
+            label = "tcp";
+            break;
+        case GF_TRANSPORT_RDMA:
+            label = "rdma";
+            break;
+        case GF_TRANSPORT_BOTH_TCP_RDMA:
+            label = "tcp_rdma_both";
+            break;
+        default:
+            goto fail;
+    }
+
+    sprintf(transport_type_str, "%s", label);
+    return 0;
+
+fail:
+    return -1;
+}
+
+/*
+ * Write the name of @volinfo's quorum status ("not_applicable", "meets" or
+ * "does_not_meet") into @quorum_status_str.
+ *
+ * The caller supplies a buffer large enough for the longest name.
+ * Returns 0 on success; -1 when an argument is NULL or the quorum status
+ * is not one of the three known values, in which case the buffer is
+ * untouched.
+ */
+int
+glusterd_volume_get_quorum_status_str(glusterd_volinfo_t *volinfo,
+                                      char *quorum_status_str)
+{
+    const char *label = NULL;
+
+    GF_VALIDATE_OR_GOTO(THIS->name, volinfo, fail);
+    GF_VALIDATE_OR_GOTO(THIS->name, quorum_status_str, fail);
+
+    switch (volinfo->quorum_status) {
+        case NOT_APPLICABLE_QUORUM:
+            label = "not_applicable";
+            break;
+        case MEETS_QUORUM:
+            label = "meets";
+            break;
+        case DOESNT_MEET_QUORUM:
+            label = "does_not_meet";
+            break;
+        default:
+            goto fail;
+    }
+
+    sprintf(quorum_status_str, "%s", label);
+    return 0;
+
+fail:
+    return -1;
+}
+
+/*
+ * Write the name of @volinfo's rebalance (defrag) status into
+ * @rebal_status_str.
+ *
+ * The caller supplies a buffer large enough for the longest name.
+ * Returns 0 on success; -1 when an argument is NULL or the defrag status
+ * is not one of the values handled below, in which case the buffer is
+ * untouched.
+ */
+int
+glusterd_volume_get_rebalance_status_str(glusterd_volinfo_t *volinfo,
+                                         char *rebal_status_str)
+{
+    const char *label = NULL;
+
+    GF_VALIDATE_OR_GOTO(THIS->name, volinfo, fail);
+    GF_VALIDATE_OR_GOTO(THIS->name, rebal_status_str, fail);
+
+    switch (volinfo->rebal.defrag_status) {
+        case GF_DEFRAG_STATUS_NOT_STARTED:
+            label = "not_started";
+            break;
+        case GF_DEFRAG_STATUS_STARTED:
+            label = "started";
+            break;
+        case GF_DEFRAG_STATUS_STOPPED:
+            label = "stopped";
+            break;
+        case GF_DEFRAG_STATUS_COMPLETE:
+            label = "completed";
+            break;
+        case GF_DEFRAG_STATUS_FAILED:
+            label = "failed";
+            break;
+        case GF_DEFRAG_STATUS_LAYOUT_FIX_STARTED:
+            label = "layout_fix_started";
+            break;
+        case GF_DEFRAG_STATUS_LAYOUT_FIX_STOPPED:
+            label = "layout_fix_stopped";
+            break;
+        case GF_DEFRAG_STATUS_LAYOUT_FIX_COMPLETE:
+            label = "layout_fix_complete";
+            break;
+        case GF_DEFRAG_STATUS_LAYOUT_FIX_FAILED:
+            label = "layout_fix_failed";
+            break;
+        default:
+            goto fail;
+    }
+
+    sprintf(rebal_status_str, "%s", label);
+    return 0;
+
+fail:
+    return -1;
+}
+
+/* Insert the element @new into the list @head in sorted position.
+   The order is defined by the compare function supplied by the caller.
+   The list is walked from the front and the walk stops at the first
+   existing element for which compare(new, element) <= 0; @new is then
+   handed to cds_list_add_rcu() together with the node preceding that
+   stopping point (the last node when the walk reaches the end).
+   For ascending order, compare should return:
+    0: if both arguments are equal
+   >0: if the first argument is greater than the second argument
+   <0: if the first argument is less than the second argument */
+void
+glusterd_list_add_order(struct cds_list_head *new, struct cds_list_head *head,
+                        int (*compare)(struct cds_list_head *,
+                                       struct cds_list_head *))
+{
+    struct cds_list_head *pos = NULL;
+
+    cds_list_for_each_rcu(pos, head)
+    {
+        if (compare(new, pos) <= 0)
+            break;
+    }
+
+    cds_list_add_rcu(new, rcu_dereference(pos->prev));
+}
+
+/*
+ * Store in *count one plus the number of peers that are both connected and
+ * in the GD_FRIEND_STATE_BEFRIENDED state.
+ *
+ * The count starts at 1 before any peer is examined (presumably to account
+ * for the local node - confirm with callers).  The peer list is walked
+ * inside an RCU read-side critical section.
+ *
+ * Returns 0 on success, -1 when THIS, its private data or @count is NULL.
+ */
+int32_t
+glusterd_count_connected_peers(int32_t *count)
+{
+    xlator_t *this = THIS;
+    glusterd_conf_t *conf = NULL;
+    glusterd_peerinfo_t *peer = NULL;
+    int32_t ret = -1;
+
+    GF_VALIDATE_OR_GOTO("glusterd", this, out);
+    conf = this->private;
+    GF_VALIDATE_OR_GOTO(this->name, conf, out);
+    GF_VALIDATE_OR_GOTO(this->name, count, out);
+
+    *count = 1;
+
+    RCU_READ_LOCK;
+    cds_list_for_each_entry_rcu(peer, &conf->peers, uuid_list)
+    {
+        /* Only connected friends are counted. */
+        if (!peer->connected)
+            continue;
+        if (peer->state.state != GD_FRIEND_STATE_BEFRIENDED)
+            continue;
+
+        (*count)++;
+    }
+    RCU_READ_UNLOCK;
+
+    ret = 0;
+out:
+    return ret;
+}
+
+/*
+ * Map a cluster type to the name of its self-heal-daemon volume option.
+ *
+ * Returns a string literal for GF_CLUSTER_TYPE_REPLICATE and
+ * GF_CLUSTER_TYPE_DISPERSE, and NULL for every other type.
+ */
+char *
+gd_get_shd_key(int type)
+{
+    if (type == GF_CLUSTER_TYPE_REPLICATE)
+        return "cluster.self-heal-daemon";
+
+    if (type == GF_CLUSTER_TYPE_DISPERSE)
+        return "cluster.disperse-self-heal-daemon";
+
+    return NULL;
+}
+
+/*
+ * Mark a brick of a replicate volume for the given brick operation.
+ *
+ * Steps, in order:
+ *   1. Set the GF_AFR_DIRTY xattr on brickinfo->path.
+ *   2. Create a temporary mount point with mkdtemp().
+ *   3. Build a glusterfs client command line for that mount point: for
+ *      GD_OP_REPLACE_BRICK using a volfile server (the
+ *      "transport.socket.bind-address" option, or "localhost"), for
+ *      GD_OP_ADD_BRICK using the volfile path from
+ *      glusterd_get_dummy_client_filepath().
+ *   4. Release priv->big_lock, run the mount command, set the
+ *      GF_AFR_REPLACE_BRICK / GF_AFR_ADD_BRICK xattr on the mount point to
+ *      the brick id, lazily unmount, then re-take big_lock.
+ *
+ * Returns 0 on success and non-zero on failure.
+ */
+int
+glusterd_handle_replicate_brick_ops(glusterd_volinfo_t *volinfo,
+                                    glusterd_brickinfo_t *brickinfo,
+                                    glusterd_op_t op)
+{
+    int32_t ret = -1;
+    char tmpmount[] = "/tmp/mntXXXXXX";
+    char logfile[PATH_MAX] = "";
+    int dirty[3] = {
+        0,
+    };
+    runner_t runner = {0};
+    glusterd_conf_t *priv = NULL;
+    char *pid = NULL;
+    char vpath[PATH_MAX] = "";
+    char *volfileserver = NULL;
+
+    xlator_t *this = THIS;
+    GF_ASSERT(this);
+    priv = this->private;
+    GF_VALIDATE_OR_GOTO(this->name, priv, out);
+
+    /* Only the third counter is set, in network byte order. */
+    dirty[2] = hton32(1);
+
+    ret = sys_lsetxattr(brickinfo->path, GF_AFR_DIRTY, dirty, sizeof(dirty), 0);
+    if (ret == -1) {
+        gf_smsg(this->name, GF_LOG_ERROR, errno, GD_MSG_SET_XATTR_FAIL,
+                "Attribute=%s", GF_AFR_DIRTY, "Reason=%s", strerror(errno),
+                NULL);
+        goto out;
+    }
+
+    /* NOTE(review): the failure paths below do not remove this directory;
+     * the success path hands it to gf_umount_lazy(). */
+    if (mkdtemp(tmpmount) == NULL) {
+        gf_smsg(this->name, GF_LOG_ERROR, errno, GD_MSG_CREATE_DIR_FAILED,
+                NULL);
+        ret = -1;
+        goto out;
+    }
+
+    ret = gf_asprintf(&pid, "%d", GF_CLIENT_PID_ADD_REPLICA_MOUNT);
+    if (ret < 0)
+        goto out;
+
+    switch (op) {
+        case GD_OP_REPLACE_BRICK:
+            if (dict_get_strn(this->options, "transport.socket.bind-address",
+                              SLEN("transport.socket.bind-address"),
+                              &volfileserver) != 0)
+                volfileserver = "localhost";
+
+            snprintf(logfile, sizeof(logfile), "%s/%s-replace-brick-mount.log",
+                     priv->logdir, volinfo->volname);
+            if (!*logfile) {
+                ret = -1;
+                goto out;
+            }
+            runinit(&runner);
+            runner_add_args(&runner, SBIN_DIR "/glusterfs", "-s", volfileserver,
+                            "--volfile-id", volinfo->volname, "--client-pid",
+                            pid, "-l", logfile, tmpmount, NULL);
+            break;
+
+        case GD_OP_ADD_BRICK:
+            snprintf(logfile, sizeof(logfile), "%s/%s-add-brick-mount.log",
+                     priv->logdir, volinfo->volname);
+            if (!*logfile) {
+                ret = -1;
+                goto out;
+            }
+            ret = glusterd_get_dummy_client_filepath(vpath, volinfo,
+                                                     volinfo->transport_type);
+            if (ret) {
+                gf_log("", GF_LOG_ERROR,
+                       "Failed to get "
+                       "volfile path");
+                goto out;
+            }
+            runinit(&runner);
+            runner_add_args(&runner, SBIN_DIR "/glusterfs", "--volfile", vpath,
+                            "--client-pid", pid, "-l", logfile, tmpmount, NULL);
+            break;
+        default:
+            /* NOTE(review): for any other op the runner is still the
+             * zero-initialised struct when runner_run() is called below. */
+            break;
+    }
+    /* big_lock is dropped while the external mount command runs and is
+     * re-taken at the lock label. */
+    synclock_unlock(&priv->big_lock);
+    ret = runner_run(&runner);
+
+    if (ret) {
+        gf_log(this->name, GF_LOG_ERROR,
+               "mount command"
+               " failed.");
+        goto lock;
+    }
+    ret = sys_lsetxattr(
+        tmpmount,
+        (op == GD_OP_REPLACE_BRICK) ? GF_AFR_REPLACE_BRICK : GF_AFR_ADD_BRICK,
+        brickinfo->brick_id, sizeof(brickinfo->brick_id), 0);
+    if (ret == -1)
+        gf_smsg(this->name, GF_LOG_ERROR, errno, GD_MSG_SET_XATTR_FAIL,
+                "Attribute=%s, Reason=%s",
+                (op == GD_OP_REPLACE_BRICK) ? GF_AFR_REPLACE_BRICK
+                                            : GF_AFR_ADD_BRICK,
+                strerror(errno), NULL);
+    gf_umount_lazy(this->name, tmpmount, 1);
+lock:
+    synclock_lock(&priv->big_lock);
+out:
+    if (pid)
+        GF_FREE(pid);
+    /* Print the actual status, like the other helpers in this file. */
+    gf_msg_debug(this->name, 0, "Returning %d", ret);
+    return ret;
+}
+
+/*
+ * Number the bricks of @volinfo by group: walking volinfo->bricks in list
+ * order, each run of replica_count consecutive bricks receives the same
+ * group number, starting from 0.
+ */
+void
+assign_brick_groups(glusterd_volinfo_t *volinfo)
+{
+    glusterd_brickinfo_t *brick = NULL;
+    uint16_t current_group = 0;
+    int filled = 0;
+
+    list_for_each_entry(brick, &volinfo->bricks, brick_list)
+    {
+        brick->group = current_group;
+        filled++;
+
+        /* Current group is full: start the next one. */
+        if (filled >= volinfo->replica_count) {
+            filled = 0;
+            current_group++;
+        }
+    }
+}
+
+/*
+ * Starting at @brickinfo, follow volinfo->bricks forward while the next
+ * brick carries the same group number, and return the last brick reached.
+ * Returns @brickinfo itself when list_next() yields NULL or a brick of a
+ * different group right away.
+ */
+glusterd_brickinfo_t *
+get_last_brick_of_brick_group(glusterd_volinfo_t *volinfo,
+                              glusterd_brickinfo_t *brickinfo)
+{
+    glusterd_brickinfo_t *last = brickinfo;
+    glusterd_brickinfo_t *candidate = NULL;
+
+    for (candidate = list_next(last, &volinfo->bricks, glusterd_brickinfo_t,
+                               brick_list);
+         candidate && (candidate->group == brickinfo->group);
+         candidate = list_next(last, &volinfo->bricks, glusterd_brickinfo_t,
+                               brick_list)) {
+        last = candidate;
+    }
+
+    return last;
+}
+
+/*
+ * Store the volume's replace-brick destination brick
+ * (volinfo->rep_brick.dst_brick) in *brickinfo.
+ *
+ * Returns 0 on success, -1 when either argument is NULL.
+ */
+int
+glusterd_get_rb_dst_brickinfo(glusterd_volinfo_t *volinfo,
+                              glusterd_brickinfo_t **brickinfo)
+{
+    if ((volinfo == NULL) || (brickinfo == NULL))
+        return -1;
+
+    *brickinfo = volinfo->rep_brick.dst_brick;
+    return 0;
+}
+
+/*
+ * Synchronise the destination brick's port with the "dst-brick-port" entry
+ * of the request/response dictionaries.
+ *
+ * When @req_dict carries "dst-brick-port", dst_brickinfo->port is updated
+ * from it.  When the brick's hostname is a local address, the brick's port
+ * is additionally written to @rsp_dict (if non-NULL) and written back to
+ * @req_dict (only if the entry was found there).
+ *
+ * Returns 0 on success, or the status of the dict_set_int32n() call that
+ * failed.
+ */
+int
+rb_update_dstbrick_port(glusterd_brickinfo_t *dst_brickinfo, dict_t *rsp_dict,
+                        dict_t *req_dict)
+{
+    int requested_port = 0;
+    int lookup_ret = 0;
+    int ret = 0;
+
+    lookup_ret = dict_get_int32n(req_dict, "dst-brick-port",
+                                 SLEN("dst-brick-port"), &requested_port);
+    if (lookup_ret == 0)
+        dst_brickinfo->port = requested_port;
+
+    /* Nothing more to do for a brick hosted elsewhere. */
+    if (!gf_is_local_addr(dst_brickinfo->hostname))
+        return ret;
+
+    gf_msg("glusterd", GF_LOG_INFO, 0, GD_MSG_BRK_PORT_NO_ADD_INDO,
+           "adding dst-brick port no %d", requested_port);
+
+    if (rsp_dict) {
+        ret = dict_set_int32n(rsp_dict, "dst-brick-port",
+                              SLEN("dst-brick-port"), dst_brickinfo->port);
+        if (ret) {
+            gf_msg_debug("glusterd", 0,
+                         "Could not set dst-brick port no in rsp dict");
+            return ret;
+        }
+    }
+
+    if (req_dict && (lookup_ret == 0)) {
+        ret = dict_set_int32n(req_dict, "dst-brick-port",
+                              SLEN("dst-brick-port"), dst_brickinfo->port);
+        if (ret)
+            gf_msg_debug("glusterd", 0, "Could not set dst-brick port no");
+    }
+
+    return ret;
+}
+
+/*
+ * Validate the preconditions shared by the replace-brick and reset-brick
+ * operations and hand back the objects the later stages work on.
+ *
+ * Reads "operation", "volname" and "src-brick" from @dict and fills in
+ * @op, @gd_op, @volname, @volinfo, @src_brick and @src_brickinfo.
+ *
+ * The request is rejected when:
+ *   - the operation string is not one gd_cli_to_gd_op() recognises,
+ *   - the volume does not exist or is not in the started state,
+ *   - the geo-replication check fails or reports an active session,
+ *   - a rebalance (defrag) is in progress on the volume,
+ *   - fuse is reported unavailable,
+ *   - the source brick is not part of the volume.
+ *
+ * When the source brick's hostname is a local address, its port (when
+ * non-zero and @rsp_dict is given) is stored in @rsp_dict under
+ * "src-brick-port" and @pidfile is handed to GLUSTERD_GET_BRICK_PIDFILE.
+ *
+ * Returns 0 on success, non-zero on failure.  On most failure paths
+ * *op_errstr is set to a gf_strdup()'ed message.
+ */
+int
+glusterd_brick_op_prerequisites(dict_t *dict, char **op, glusterd_op_t *gd_op,
+                                char **volname, glusterd_volinfo_t **volinfo,
+                                char **src_brick,
+                                glusterd_brickinfo_t **src_brickinfo,
+                                char *pidfile, char **op_errstr,
+                                dict_t *rsp_dict)
+{
+    int ret = 0;
+    char msg[2048] = "";
+    gsync_status_param_t param = {
+        0,
+    };
+    xlator_t *this = NULL;
+    glusterd_conf_t *priv = NULL;
+    glusterd_volinfo_t *v = NULL;
+    glusterd_brickinfo_t *b = NULL;
+
+    this = THIS;
+    GF_ASSERT(this);
+
+    priv = this->private;
+    GF_ASSERT(priv);
+
+    ret = dict_get_strn(dict, "operation", SLEN("operation"), op);
+    if (ret) {
+        gf_msg_debug(this->name, 0, "dict get on operation type failed");
+        goto out;
+    }
+
+    *gd_op = gd_cli_to_gd_op(*op);
+    if ((*gd_op != GD_OP_REPLACE_BRICK) && (*gd_op != GD_OP_RESET_BRICK)) {
+        /* gd_cli_to_gd_op() signals an unknown operation by returning -1.
+         * Test for the two valid results instead of "< 0" (the signedness
+         * of an enum is up to the compiler) and make sure the caller sees
+         * a failure: ret is still 0 from the dict lookup above. */
+        snprintf(msg, sizeof(msg), "Unsupported brick operation: %s", *op);
+        gf_msg_debug(this->name, 0, "%s", msg);
+        *op_errstr = gf_strdup(msg);
+        ret = -1;
+        goto out;
+    }
+
+    ret = dict_get_strn(dict, "volname", SLEN("volname"), volname);
+
+    if (ret) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_DICT_GET_FAILED,
+               "Unable to get volume name");
+        goto out;
+    }
+
+    ret = glusterd_volinfo_find(*volname, volinfo);
+    if (ret) {
+        snprintf(msg, sizeof(msg), "volume: %s does not exist", *volname);
+        *op_errstr = gf_strdup(msg);
+        goto out;
+    }
+
+    if (GLUSTERD_STATUS_STARTED != (*volinfo)->status) {
+        ret = -1;
+        snprintf(msg, sizeof(msg), "volume: %s is not started", *volname);
+        *op_errstr = gf_strdup(msg);
+        goto out;
+    }
+
+    /* If geo-rep is configured, for this volume, it should be stopped. */
+    param.volinfo = *volinfo;
+    ret = glusterd_check_geo_rep_running(&param, op_errstr);
+    if (ret || param.is_active) {
+        ret = -1;
+        goto out;
+    }
+
+    if (glusterd_is_defrag_on(*volinfo)) {
+        snprintf(msg, sizeof(msg),
+                 "Volume name %s rebalance is in "
+                 "progress. Please retry after completion",
+                 *volname);
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_OIP_RETRY_LATER, "%s", msg);
+        *op_errstr = gf_strdup(msg);
+        ret = -1;
+        goto out;
+    }
+
+    if (dict) {
+        if (!glusterd_is_fuse_available()) {
+            gf_msg(this->name, GF_LOG_ERROR, 0,
+                   (*gd_op == GD_OP_REPLACE_BRICK)
+                       ? GD_MSG_RB_CMD_FAIL
+                       : GD_MSG_RESET_BRICK_CMD_FAIL,
+                   "Unable to open /dev/"
+                   "fuse (%s), %s command failed",
+                   strerror(errno), gd_rb_op_to_str(*op));
+            snprintf(msg, sizeof(msg),
+                     "Fuse unavailable\n "
+                     "%s failed",
+                     gd_rb_op_to_str(*op));
+            *op_errstr = gf_strdup(msg);
+            ret = -1;
+            goto out;
+        }
+    }
+
+    ret = dict_get_strn(dict, "src-brick", SLEN("src-brick"), src_brick);
+
+    if (ret) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_DICT_GET_FAILED,
+               "Unable to get src brick");
+        goto out;
+    }
+
+    gf_msg_debug(this->name, 0, "src brick=%s", *src_brick);
+
+    ret = glusterd_volume_brickinfo_get_by_brick(*src_brick, *volinfo,
+                                                 src_brickinfo, _gf_false);
+    if (ret) {
+        snprintf(msg, sizeof(msg),
+                 "brick: %s does not exist in "
+                 "volume: %s",
+                 *src_brick, *volname);
+        gf_smsg(this->name, GF_LOG_ERROR, 0, GD_MSG_BRICK_NOT_FOUND,
+                "Brick=%s, Volume=%s", *src_brick, *volname, NULL);
+        *op_errstr = gf_strdup(msg);
+        goto out;
+    }
+
+    if (gf_is_local_addr((*src_brickinfo)->hostname)) {
+        gf_msg_debug(this->name, 0, "I AM THE SOURCE HOST");
+        if ((*src_brickinfo)->port && rsp_dict) {
+            /* Failing to publish the port is only logged, not fatal. */
+            ret = dict_set_int32n(rsp_dict, "src-brick-port",
+                                  SLEN("src-brick-port"),
+                                  (*src_brickinfo)->port);
+            if (ret) {
+                gf_msg_debug(this->name, 0, "Could not set src-brick-port=%d",
+                             (*src_brickinfo)->port);
+            }
+        }
+
+        v = *volinfo;
+        b = *src_brickinfo;
+        GLUSTERD_GET_BRICK_PIDFILE(pidfile, v, b, priv);
+    }
+
+    ret = 0;
+out:
+    return ret;
+}
+
+/*
+ * Fetch and validate the destination brick of a replace/reset-brick
+ * request.
+ *
+ * Reads "dst-brick" from @dict into *dst_brick, rejects it when
+ * glusterd_store_is_valid_brickpath() or glusterd_is_valid_volfpath()
+ * refuses it, and splits a private copy (*dup_dstbrick) at the last ':'
+ * so that *host points at the host part.  Finally a brickinfo object is
+ * created for the brick via glusterd_brickinfo_new_from_brick().
+ *
+ * *dup_dstbrick is allocated with gf_strdup() and is not released here,
+ * not even on failure; *host points into that buffer.
+ *
+ * Returns 0 on success and non-zero on failure.
+ */
+int
+glusterd_get_dst_brick_info(char **dst_brick, char *volname, char **op_errstr,
+                            glusterd_brickinfo_t **dst_brickinfo, char **host,
+                            dict_t *dict, char **dup_dstbrick)
+{
+    char *path = NULL;
+    char *c = NULL;
+    char msg[2048] = "";
+    xlator_t *this = NULL;
+    glusterd_conf_t *priv = NULL;
+    int ret = 0;
+
+    this = THIS;
+    GF_ASSERT(this);
+
+    priv = this->private;
+    GF_ASSERT(priv);
+
+    ret = dict_get_strn(dict, "dst-brick", SLEN("dst-brick"), dst_brick);
+
+    if (ret) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_DICT_GET_FAILED,
+               "Unable to get dest brick.");
+        goto out;
+    }
+
+    gf_msg_debug(this->name, 0, "dst brick=%s", *dst_brick);
+
+    if (!glusterd_store_is_valid_brickpath(volname, *dst_brick) ||
+        !glusterd_is_valid_volfpath(volname, *dst_brick)) {
+        snprintf(msg, sizeof(msg),
+                 "brick path %s is too "
+                 "long.",
+                 *dst_brick);
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_BRKPATH_TOO_LONG, "%s", msg);
+        *op_errstr = gf_strdup(msg);
+
+        ret = -1;
+        goto out;
+    }
+
+    *dup_dstbrick = gf_strdup(*dst_brick);
+    if (!*dup_dstbrick) {
+        ret = -1;
+        goto out;
+    }
+
+    /*
+     * IPv4 address contains '.' and ipv6 addresses contains ':'
+     * So finding the last occurrence of ':' to
+     * mark the start of brick path
+     */
+    c = strrchr(*dup_dstbrick, ':');
+    if (c != NULL) {
+        c[0] = '\0';
+        *host = *dup_dstbrick;
+        /* The path starts right after the separator.  (The previous
+         * "path = c++" stored the address of the NUL byte itself.) */
+        path = c + 1;
+    }
+
+    /* path is only set when a ':' was found, in which case *host is set
+     * too.  Both parts must be non-empty for <HOSTNAME>:<export-dir>. */
+    if ((path == NULL) || ((*host)[0] == '\0') || (path[0] == '\0')) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_BAD_FORMAT,
+               "dst brick %s is not of "
+               "form <HOSTNAME>:<export-dir>",
+               *dst_brick);
+        ret = -1;
+        goto out;
+    }
+
+    ret = glusterd_brickinfo_new_from_brick(*dst_brick, dst_brickinfo, _gf_true,
+                                            NULL);
+out:
+    return ret;
+}
+
+/* Tell whether @volinfo contains a brick whose uuid equals MY_UUID and
+ * whose path is exactly @brick. */
+static gf_boolean_t
+gd_volinfo_has_local_brick_path(glusterd_volinfo_t *volinfo, char *brick)
+{
+    glusterd_brickinfo_t *cur = NULL;
+
+    cds_list_for_each_entry(cur, &volinfo->bricks, brick_list)
+    {
+        if (!gf_uuid_compare(cur->uuid, MY_UUID) && !strcmp(cur->path, brick))
+            return _gf_true;
+    }
+
+    return _gf_false;
+}
+
+/*
+ * Find the volume that owns the local brick with path @brick.
+ *
+ * Regular volumes (conf->volumes) are searched first, then the volumes of
+ * every snapshot (conf->snapshots).  Only bricks whose uuid matches
+ * MY_UUID are considered.
+ *
+ * On a match *volinfo is set and 0 is returned; otherwise -1 is returned
+ * and *volinfo is left untouched.
+ */
+int
+glusterd_get_volinfo_from_brick(char *brick, glusterd_volinfo_t **volinfo)
+{
+    int ret = -1;
+    xlator_t *this = THIS;
+    glusterd_conf_t *conf = NULL;
+    glusterd_volinfo_t *vol = NULL;
+    glusterd_snap_t *snap = NULL;
+
+    GF_VALIDATE_OR_GOTO("glusterd", this, out);
+    conf = this->private;
+    GF_VALIDATE_OR_GOTO(this->name, conf, out);
+
+    /* Pass 1: regular volumes. */
+    cds_list_for_each_entry(vol, &conf->volumes, vol_list)
+    {
+        if (gd_volinfo_has_local_brick_path(vol, brick)) {
+            *volinfo = vol;
+            ret = 0;
+            goto out;
+        }
+    }
+
+    /* Pass 2: volumes that belong to snapshots. */
+    cds_list_for_each_entry(snap, &conf->snapshots, snap_list)
+    {
+        cds_list_for_each_entry(vol, &snap->volumes, vol_list)
+        {
+            if (gd_volinfo_has_local_brick_path(vol, brick)) {
+                *volinfo = vol;
+                ret = 0;
+                goto out;
+            }
+        }
+    }
+
+out:
+    return ret;
+}
+
+/*
+ * Map the CLI operation string of a brick operation to its glusterd op:
+ * the three GF_RESET_OP_* strings map to GD_OP_RESET_BRICK and
+ * "GF_REPLACE_OP_COMMIT_FORCE" maps to GD_OP_REPLACE_BRICK.  Any other
+ * string yields -1.
+ */
+glusterd_op_t
+gd_cli_to_gd_op(char *cli_op)
+{
+    static const char *const reset_ops[] = {
+        "GF_RESET_OP_START",
+        "GF_RESET_OP_COMMIT",
+        "GF_RESET_OP_COMMIT_FORCE",
+    };
+    size_t idx = 0;
+
+    for (idx = 0; idx < sizeof(reset_ops) / sizeof(reset_ops[0]); idx++) {
+        if (strcmp(cli_op, reset_ops[idx]) == 0)
+            return GD_OP_RESET_BRICK;
+    }
+
+    if (strcmp(cli_op, "GF_REPLACE_OP_COMMIT_FORCE") == 0)
+        return GD_OP_REPLACE_BRICK;
+
+    return -1;
+}
+
+/*
+ * Translate the CLI operation string of a reset-brick / replace-brick
+ * request into the human readable command name used in messages.
+ * Returns NULL for an operation string that is not in the table.
+ */
+char *
+gd_rb_op_to_str(char *op)
+{
+    static const struct {
+        const char *cli_op;
+        char *text;
+    } op_names[] = {
+        {"GF_RESET_OP_START", "reset-brick start"},
+        {"GF_RESET_OP_COMMIT", "reset-brick commit"},
+        {"GF_RESET_OP_COMMIT_FORCE", "reset-brick commit force"},
+        {"GF_REPLACE_OP_COMMIT_FORCE", "replace-brick commit force"},
+    };
+    size_t idx = 0;
+
+    for (idx = 0; idx < sizeof(op_names) / sizeof(op_names[0]); idx++) {
+        if (strcmp(op, op_names[idx].cli_op) == 0)
+            return op_names[idx].text;
+    }
+
+    return NULL;
+}
+
+/*
+ * Report whether profiling is enabled on @volinfo: both the
+ * VKEY_DIAG_CNT_FOP_HITS and the VKEY_DIAG_LAT_MEASUREMENT options must
+ * evaluate to true.  An option for which glusterd_volinfo_get_boolean()
+ * returns -1 is treated as off.
+ */
+gf_boolean_t
+glusterd_is_profile_on(glusterd_volinfo_t *volinfo)
+{
+    int value = -1;
+    gf_boolean_t fop_hits_enabled = _gf_false;
+    gf_boolean_t latency_enabled = _gf_false;
+
+    GF_ASSERT(volinfo);
+
+    value = glusterd_volinfo_get_boolean(volinfo, VKEY_DIAG_CNT_FOP_HITS);
+    if (value != -1)
+        fop_hits_enabled = value;
+
+    value = glusterd_volinfo_get_boolean(volinfo, VKEY_DIAG_LAT_MEASUREMENT);
+    if (value != -1)
+        latency_enabled = value;
+
+    /* Either switch being off means profiling is off. */
+    if ((latency_enabled != _gf_true) || (fop_hits_enabled != _gf_true))
+        return _gf_false;
+
+    return _gf_true;
+}
+
+/*
+ * Add a status entry for the self-heal daemon of @volinfo to @dict, using
+ * the "brick<count>." key prefix:
+ *
+ *   brick<count>.hostname  "Self-heal Daemon"
+ *   brick<count>.path      string form of MY_UUID
+ *   brick<count>.port      0 (see the comment in the body)
+ *   brick<count>.pid       pid reported by gf_is_service_running() for the
+ *                          shd pidfile, or -1 when it is not running
+ *   brick<count>.status    the value returned by gf_is_service_running()
+ *
+ * Returns 0 on success, non-zero when validation or any dict update fails.
+ */
+int32_t
+glusterd_add_shd_to_dict(glusterd_volinfo_t *volinfo, dict_t *dict,
+                         int32_t count)
+{
+    int ret = -1;
+    int32_t pid = -1;
+    int32_t brick_online = -1;
+    char key[64] = {0};
+    int keylen;
+    char *pidfile = NULL;
+    xlator_t *this = NULL;
+    char *uuid_str = NULL;
+
+    this = THIS;
+    /* Use a literal log domain: the name argument is only evaluated when
+     * 'this' is NULL, which is exactly when THIS->name cannot be read. */
+    GF_VALIDATE_OR_GOTO("glusterd", this, out);
+
+    GF_VALIDATE_OR_GOTO(this->name, volinfo, out);
+    GF_VALIDATE_OR_GOTO(this->name, dict, out);
+
+    keylen = snprintf(key, sizeof(key), "brick%d.hostname", count);
+    ret = dict_set_nstrn(dict, key, keylen, "Self-heal Daemon",
+                         SLEN("Self-heal Daemon"));
+    if (ret) {
+        gf_smsg(this->name, GF_LOG_ERROR, 0, GD_MSG_DICT_SET_FAILED, "Key=%s",
+                key, NULL);
+        goto out;
+    }
+
+    keylen = snprintf(key, sizeof(key), "brick%d.path", count);
+    uuid_str = gf_strdup(uuid_utoa(MY_UUID));
+    if (!uuid_str) {
+        ret = -1;
+        goto out;
+    }
+    ret = dict_set_dynstrn(dict, key, keylen, uuid_str);
+    if (ret) {
+        gf_smsg(this->name, GF_LOG_ERROR, 0, GD_MSG_DICT_SET_FAILED, "Key=%s",
+                key, NULL);
+        goto out;
+    }
+    /* The dict owns the string now; do not free it at 'out'. */
+    uuid_str = NULL;
+
+    /* shd doesn't have a port. but the cli needs a port key with
+     * a zero value to parse.
+     * */
+
+    keylen = snprintf(key, sizeof(key), "brick%d.port", count);
+    ret = dict_set_int32n(dict, key, keylen, 0);
+    if (ret) {
+        gf_smsg(this->name, GF_LOG_ERROR, 0, GD_MSG_DICT_SET_FAILED, "Key=%s",
+                key, NULL);
+        goto out;
+    }
+
+    pidfile = volinfo->shd.svc.proc.pidfile;
+
+    brick_online = gf_is_service_running(pidfile, &pid);
+
+    /* If shd is not running, then don't print the pid */
+    if (!brick_online)
+        pid = -1;
+    keylen = snprintf(key, sizeof(key), "brick%d.pid", count);
+    ret = dict_set_int32n(dict, key, keylen, pid);
+    if (ret) {
+        gf_smsg(this->name, GF_LOG_ERROR, 0, GD_MSG_DICT_SET_FAILED, "Key=%s",
+                key, NULL);
+        goto out;
+    }
+
+    keylen = snprintf(key, sizeof(key), "brick%d.status", count);
+    ret = dict_set_int32n(dict, key, keylen, brick_online);
+
+out:
+    if (uuid_str)
+        GF_FREE(uuid_str);
+    if (ret)
+        gf_msg(this ? this->name : "glusterd", GF_LOG_ERROR, 0,
+               GD_MSG_DICT_SET_FAILED,
+               "Returning %d. adding values to dict failed", ret);
+
+    return ret;
+}
+
+/*
+ * Compare two addrinfo chains by numeric host address.
+ *
+ * Every entry of @first is converted with getnameinfo(NI_NUMERICHOST) and
+ * compared, as a string, against every entry of @next.
+ *
+ * Returns GF_AI_COMPARE_MATCH as soon as one pair is equal,
+ * GF_AI_COMPARE_ERROR when getnameinfo() fails, and
+ * GF_AI_COMPARE_NO_MATCH when no pair matches (including when either
+ * chain is empty).
+ */
+static gf_ai_compare_t
+glusterd_compare_addrinfo(struct addrinfo *first, struct addrinfo *next)
+{
+    struct addrinfo *lhs = first;
+    struct addrinfo *rhs = NULL;
+    char lhs_ip[NI_MAXHOST] = "";
+    char rhs_ip[NI_MAXHOST] = "";
+
+    while (lhs != NULL) {
+        if (getnameinfo(lhs->ai_addr, lhs->ai_addrlen, lhs_ip, NI_MAXHOST,
+                        NULL, 0, NI_NUMERICHOST) != 0)
+            return GF_AI_COMPARE_ERROR;
+
+        rhs = next;
+        while (rhs != NULL) {
+            if (getnameinfo(rhs->ai_addr, rhs->ai_addrlen, rhs_ip,
+                            NI_MAXHOST, NULL, 0, NI_NUMERICHOST) != 0)
+                return GF_AI_COMPARE_ERROR;
+
+            if (strcmp(lhs_ip, rhs_ip) == 0)
+                return GF_AI_COMPARE_MATCH;
+
+            rhs = rhs->ai_next;
+        }
+
+        lhs = lhs->ai_next;
+    }
+
+    return GF_AI_COMPARE_NO_MATCH;
+}
+
+/* Check for non optimal brick order for Replicate/Disperse :
+ * Checks if bricks belonging to a replicate or disperse
+ * volume are present on the same server
+ *
+ * *volname, *brick_list and *brick_count are taken from @dict ("volname",
+ * "bricks", "count") when the caller passes them as NULL / 0.  The host
+ * part of every brick ("<host>:<path>", split at the last ':') is
+ * resolved with getaddrinfo(); within each group of @sub_count
+ * consecutive bricks every brick is then compared with the bricks that
+ * follow it in the group.
+ *
+ * Return value:
+ *    0  brick order is fine, or the check could not be carried out
+ *       because a host name did not resolve or memory ran out (those
+ *       cases are logged and deliberately not treated as fatal);
+ *   -1  two bricks of a group share an address, or the brick list could
+ *       not be parsed; err_str then holds a message for the user;
+ *   other non-zero values are dict lookup failures (err_str untouched).
+ *
+ * err_str is written with a bound of 2048 bytes, so the caller's buffer
+ * has to be at least that large.
+ */
+int32_t
+glusterd_check_brick_order(dict_t *dict, char *err_str, int32_t type,
+                           char **volname, char **brick_list,
+                           int32_t *brick_count, int32_t sub_count)
+{
+    int ret = -1;
+    int i = 0;
+    int j = 0;
+    int k = 0;
+    xlator_t *this = NULL;
+    addrinfo_list_t *ai_list = NULL;
+    addrinfo_list_t *ai_list_tmp1 = NULL;
+    addrinfo_list_t *ai_list_tmp2 = NULL;
+    char *brick = NULL;
+    char *brick_list_dup = NULL;
+    char *brick_list_ptr = NULL;
+    char *tmpptr = NULL;
+    struct addrinfo *ai_info = NULL;
+    char brick_addr[128] = {
+        0,
+    };
+    size_t addrlen = 0;
+
+    /* static: no need to re-initialise 4KB of stack on every call */
+    static const char failed_string[2048] =
+        "Failed to perform brick order "
+        "check. Use 'force' at the end of the command"
+        " if you want to override this behavior. ";
+    static const char found_string[2048] =
+        "Multiple bricks of a %s "
+        "volume are present on the same server. This "
+        "setup is not optimal. Bricks should be on "
+        "different nodes to have best fault tolerant "
+        "configuration. Use 'force' at the end of the "
+        "command if you want to override this "
+        "behavior. ";
+
+    this = THIS;
+
+    GF_ASSERT(this);
+
+    /* ai_list is only the list head; its info member stays NULL. */
+    ai_list = MALLOC(sizeof(addrinfo_list_t));
+    if (ai_list == NULL) {
+        /* Same policy as the per-brick allocation below: log and skip
+         * the check instead of failing the operation. */
+        ret = 0;
+        gf_msg(this->name, GF_LOG_ERROR, ENOMEM, GD_MSG_NO_MEMORY,
+               "failed to allocate "
+               "memory");
+        goto out;
+    }
+    ai_list->info = NULL;
+    CDS_INIT_LIST_HEAD(&ai_list->list);
+
+    if (!(*volname)) {
+        ret = dict_get_strn(dict, "volname", SLEN("volname"), volname);
+        if (ret) {
+            gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_DICT_GET_FAILED,
+                   "Unable to get volume name");
+            goto out;
+        }
+    }
+
+    if (!(*brick_list)) {
+        ret = dict_get_strn(dict, "bricks", SLEN("bricks"), brick_list);
+        if (ret) {
+            gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_DICT_GET_FAILED,
+                   "Bricks check : Could not "
+                   "retrieve bricks list");
+            goto out;
+        }
+    }
+
+    if (!(*brick_count)) {
+        ret = dict_get_int32n(dict, "count", SLEN("count"), brick_count);
+        if (ret) {
+            gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_DICT_GET_FAILED,
+                   "Bricks check : Could not "
+                   "retrieve brick count");
+            goto out;
+        }
+    }
+
+    /* strtok_r() modifies its input, so work on a private copy.
+     * brick_list_ptr keeps the start of the copy for GF_FREE(). */
+    brick_list_dup = brick_list_ptr = gf_strdup(*brick_list);
+    if (brick_list_ptr == NULL) {
+        ret = 0;
+        gf_msg(this->name, GF_LOG_ERROR, ENOMEM, GD_MSG_NO_MEMORY,
+               "failed to allocate "
+               "memory");
+        goto out;
+    }
+
+    /* Resolve hostnames and get addrinfo */
+    while (i < *brick_count) {
+        ++i;
+        brick = strtok_r(brick_list_dup, " \n", &tmpptr);
+        /* tmpptr is reused below, so carry the scan position in
+         * brick_list_dup and restart the tokenizer from there. */
+        brick_list_dup = tmpptr;
+        if (brick == NULL)
+            goto check_failed;
+        tmpptr = strrchr(brick, ':');
+        if (tmpptr == NULL)
+            goto check_failed;
+        addrlen = strlen(brick) - strlen(tmpptr);
+        /* The host part must fit brick_addr including its terminator. */
+        if (addrlen >= sizeof(brick_addr))
+            goto check_failed;
+        strncpy(brick_addr, brick, addrlen);
+        brick_addr[addrlen] = '\0';
+        ret = getaddrinfo(brick_addr, NULL, NULL, &ai_info);
+        if (ret != 0) {
+            ret = 0;
+            gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_HOSTNAME_RESOLVE_FAIL,
+                   "unable to resolve host name for addr %s", brick_addr);
+            goto out;
+        }
+        ai_list_tmp1 = MALLOC(sizeof(addrinfo_list_t));
+        if (ai_list_tmp1 == NULL) {
+            ret = 0;
+            gf_msg(this->name, GF_LOG_ERROR, ENOMEM, GD_MSG_NO_MEMORY,
+                   "failed to allocate "
+                   "memory");
+            freeaddrinfo(ai_info);
+            goto out;
+        }
+        ai_list_tmp1->info = ai_info;
+        cds_list_add_tail(&ai_list_tmp1->list, &ai_list->list);
+        ai_list_tmp1 = NULL;
+    }
+
+    i = 0;
+    ai_list_tmp1 = cds_list_entry(ai_list->list.next, addrinfo_list_t, list);
+
+    if (*brick_count < sub_count) {
+        sub_count = *brick_count;
+    }
+
+    /* The grouping below computes i % sub_count. */
+    if ((sub_count == 0) && (*brick_count > 0))
+        goto check_failed;
+
+    /* Check for bad brick order */
+    while (i < *brick_count) {
+        ++i;
+        ai_info = ai_list_tmp1->info;
+        ai_list_tmp1 = cds_list_entry(ai_list_tmp1->list.next, addrinfo_list_t,
+                                      list);
+        if (0 == i % sub_count) {
+            /* last brick of its group: nothing left to compare with */
+            j = 0;
+            continue;
+        }
+        ai_list_tmp2 = ai_list_tmp1;
+        k = j;
+        while (k < sub_count - 1) {
+            ++k;
+            ret = glusterd_compare_addrinfo(ai_info, ai_list_tmp2->info);
+            if (GF_AI_COMPARE_ERROR == ret)
+                goto check_failed;
+            if (GF_AI_COMPARE_MATCH == ret)
+                goto found_bad_brick_order;
+            ai_list_tmp2 = cds_list_entry(ai_list_tmp2->list.next,
+                                          addrinfo_list_t, list);
+        }
+        ++j;
+    }
+    gf_msg_debug(this->name, 0, "Brick order okay");
+    ret = 0;
+    goto out;
+
+check_failed:
+    gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_BAD_BRKORDER_CHECK_FAIL,
+           "Failed bad brick order check");
+    snprintf(err_str, sizeof(failed_string), "%s", failed_string);
+    ret = -1;
+    goto out;
+
+found_bad_brick_order:
+    gf_msg(this->name, GF_LOG_INFO, 0, GD_MSG_BAD_BRKORDER,
+           "Bad brick order found");
+    if (type == GF_CLUSTER_TYPE_DISPERSE) {
+        snprintf(err_str, sizeof(found_string), found_string, "disperse");
+    } else {
+        snprintf(err_str, sizeof(found_string), found_string, "replicate");
+    }
+
+    ret = -1;
+out:
+    ai_list_tmp2 = NULL;
+    GF_FREE(brick_list_ptr);
+    if (ai_list != NULL) {
+        /* Free each node one step behind the iterator so the loop can
+         * still read the ->next pointer of the current node. */
+        cds_list_for_each_entry(ai_list_tmp1, &ai_list->list, list)
+        {
+            if (ai_list_tmp1->info)
+                freeaddrinfo(ai_list_tmp1->info);
+            free(ai_list_tmp2);
+            ai_list_tmp2 = ai_list_tmp1;
+        }
+        free(ai_list);
+        free(ai_list_tmp2);
+    }
+    gf_msg_debug("glusterd", 0, "Returning %d", ret);
+    return ret;
+}
+
+/*
+ * Tell whether @peer_hostname is one of the comma separated entries of
+ * @auth_allow_list.
+ *
+ * Entries are compared as whole strings, so "node1" is not considered
+ * present just because "node10" is listed.  An empty @peer_hostname is
+ * reported as found, as it was with the former substring search.
+ */
+static gf_boolean_t
+search_peer_in_auth_list(char *peer_hostname, char *auth_allow_list)
+{
+    const char *entry = auth_allow_list;
+    const char *sep = NULL;
+    size_t peer_len = strlen(peer_hostname);
+    size_t entry_len = 0;
+
+    if (peer_len == 0)
+        return _gf_true;
+
+    while (entry != NULL) {
+        sep = strchr(entry, ',');
+        entry_len = (sep != NULL) ? (size_t)(sep - entry) : strlen(entry);
+
+        if ((entry_len == peer_len) &&
+            (strncmp(entry, peer_hostname, peer_len) == 0)) {
+            return _gf_true;
+        }
+
+        /* continue after the separator, or stop at the last entry */
+        entry = (sep != NULL) ? sep + 1 : NULL;
+    }
+
+    return _gf_false;
+}
+
+/* glusterd_add_peers_to_auth_list() adds peers into auth.allow list
+ * if auth.allow list is not empty. This is called for add-brick and
+ * replica brick operations to avoid failing the temporary mount. New
+ * volfiles will be generated and clients are notified reg new volfiles.
+ *
+ * When the list had to be extended, the previous value is kept in the
+ * volume dict under "old.auth.allow" so that it can be restored later.
+ * Failures are logged; nothing is reported to the caller.
+ */
+void
+glusterd_add_peers_to_auth_list(char *volname)
+{
+    int ret = 0;
+    glusterd_volinfo_t *volinfo = NULL;
+    glusterd_peerinfo_t *peerinfo = NULL;
+    xlator_t *this = NULL;
+    glusterd_conf_t *conf = NULL;
+    size_t len = 0;
+    size_t used = 0;
+    size_t host_len = 0;
+    char *auth_allow_list = NULL;
+    char *new_auth_allow_list = NULL;
+
+    this = THIS;
+    GF_ASSERT(this);
+    conf = this->private;
+    GF_ASSERT(conf);
+
+    GF_VALIDATE_OR_GOTO(this->name, volname, out);
+
+    ret = glusterd_volinfo_find(volname, &volinfo);
+    if (ret) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_VOL_NOT_FOUND,
+               "Unable to find volume: %s", volname);
+        goto out;
+    }
+
+    ret = dict_get_str_sizen(volinfo->dict, "auth.allow", &auth_allow_list);
+    if (ret) {
+        gf_msg(this->name, GF_LOG_INFO, errno, GD_MSG_DICT_GET_FAILED,
+               "auth allow list is not set");
+        goto out;
+    }
+
+    /* Worst case size: the current list, plus ",<hostname>" for every
+     * peer, plus the terminating NUL. */
+    cds_list_for_each_entry_rcu(peerinfo, &conf->peers, uuid_list)
+    {
+        len += strlen(peerinfo->hostname) + 1;
+    }
+    len += strlen(auth_allow_list) + 1;
+
+    new_auth_allow_list = GF_CALLOC(1, len, gf_common_mt_char);
+    if (!new_auth_allow_list) {
+        gf_msg(this->name, GF_LOG_ERROR, ENOMEM, GD_MSG_NO_MEMORY,
+               "failed to allocate "
+               "memory");
+        goto out;
+    }
+
+    /* The buffer is zero filled, so it stays NUL terminated as long as
+     * 'used' never reaches len - 1. */
+    used = strlen(auth_allow_list);
+    memcpy(new_auth_allow_list, auth_allow_list, used);
+
+    cds_list_for_each_entry_rcu(peerinfo, &conf->peers, uuid_list)
+    {
+        if (search_peer_in_auth_list(peerinfo->hostname, new_auth_allow_list))
+            continue;
+
+        gf_log(this->name, GF_LOG_DEBUG,
+               "peer %s not found in auth.allow list", peerinfo->hostname);
+
+        host_len = strlen(peerinfo->hostname);
+        if (used + 1 + host_len >= len) {
+            /* Only possible if the peer list changed after the buffer
+             * was sized; never write past the allocation. */
+            gf_msg_debug(this->name, 0,
+                         "auth.allow buffer too small for peer %s",
+                         peerinfo->hostname);
+            break;
+        }
+        new_auth_allow_list[used++] = ',';
+        memcpy(new_auth_allow_list + used, peerinfo->hostname, host_len);
+        used += host_len;
+    }
+
+    if (strcmp(new_auth_allow_list, auth_allow_list) != 0) {
+        /* In case, new_auth_allow_list is not same as auth_allow_list,
+         * we need to update the volinfo->dict with new_auth_allow_list.
+         * for reverting the changes in post commit, we keep the copy of
+         * auth_allow_list as old.auth.allow in volinfo->dict.
+         *
+         * The dict has to hold its own copies of both strings:
+         * new_auth_allow_list is freed at 'out', and auth_allow_list
+         * points at the value of the "auth.allow" entry deleted below.
+         * NOTE(review): dict_set_dynstr_with_alloc() is expected to
+         * duplicate the string - confirm against the dict API.
+         * The old value is saved first, while its pointer is still valid.
+         */
+        ret = dict_set_dynstr_with_alloc(volinfo->dict, "old.auth.allow",
+                                         auth_allow_list);
+        if (ret) {
+            gf_msg(this->name, GF_LOG_ERROR, errno, GD_MSG_DICT_SET_FAILED,
+                   "Unable to set old auth.allow list");
+            goto out;
+        }
+
+        dict_del_sizen(volinfo->dict, "auth.allow");
+        auth_allow_list = NULL; /* must not be used past this point */
+        ret = dict_set_dynstr_with_alloc(volinfo->dict, "auth.allow",
+                                         new_auth_allow_list);
+        if (ret) {
+            gf_msg(this->name, GF_LOG_ERROR, errno, GD_MSG_DICT_SET_FAILED,
+                   "Unable to set new auth.allow list");
+            goto out;
+        }
+
+        ret = glusterd_create_volfiles_and_notify_services(volinfo);
+        if (ret) {
+            gf_msg(this->name, GF_LOG_WARNING, 0, GD_MSG_VOLFILE_CREATE_FAIL,
+                   "failed to create volfiles");
+            goto out;
+        }
+    }
+out:
+    GF_FREE(new_auth_allow_list);
+    return;
+}
+
+/*
+ * Restore the auth.allow list that was saved under "old.auth.allow" in
+ * the dict of volume @volname.
+ *
+ * When no saved list exists there is nothing to do and 0 is returned.
+ * Otherwise "auth.allow" is replaced by the saved value, the saved entry
+ * is removed, the volfiles are regenerated and the volinfo is stored.
+ *
+ * Returns 0 on success and non-zero on failure.
+ */
+int
+glusterd_replace_old_auth_allow_list(char *volname)
+{
+    int ret = 0;
+    glusterd_volinfo_t *volinfo = NULL;
+    xlator_t *this = NULL;
+    char *old_auth_allow_list = NULL;
+
+    this = THIS;
+    GF_ASSERT(this);
+
+    GF_VALIDATE_OR_GOTO(this->name, volname, out);
+
+    ret = glusterd_volinfo_find(volname, &volinfo);
+    if (ret) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_VOL_NOT_FOUND,
+               "Unable to find volume: %s", volname);
+        goto out;
+    }
+
+    ret = dict_get_str_sizen(volinfo->dict, "old.auth.allow",
+                             &old_auth_allow_list);
+    if (ret) {
+        gf_msg(this->name, GF_LOG_INFO, errno, GD_MSG_DICT_GET_FAILED,
+               "old auth allow list is not set, no need to replace the list");
+        ret = 0;
+        goto out;
+    }
+
+    dict_del_sizen(volinfo->dict, "auth.allow");
+    /* old_auth_allow_list points at the value of the "old.auth.allow"
+     * entry, which is deleted right below.  "auth.allow" therefore needs
+     * its own copy of the string rather than the borrowed pointer.
+     * NOTE(review): dict_set_dynstr_with_alloc() is expected to duplicate
+     * the string - confirm against the dict API. */
+    ret = dict_set_dynstr_with_alloc(volinfo->dict, "auth.allow",
+                                     old_auth_allow_list);
+    if (ret) {
+        gf_msg(this->name, GF_LOG_ERROR, errno, GD_MSG_DICT_SET_FAILED,
+               "Unable to replace auth.allow list");
+        goto out;
+    }
+
+    dict_del_sizen(volinfo->dict, "old.auth.allow");
+    old_auth_allow_list = NULL; /* dangling after the delete above */
+
+    ret = glusterd_create_volfiles_and_notify_services(volinfo);
+    if (ret) {
+        gf_msg(this->name, GF_LOG_WARNING, 0, GD_MSG_VOLFILE_CREATE_FAIL,
+               "failed to create volfiles");
+        goto out;
+    }
+    ret = glusterd_store_volinfo(volinfo, GLUSTERD_VOLINFO_VER_AC_INCREMENT);
+    if (ret) {
+        gf_msg(this->name, GF_LOG_WARNING, 0, GD_MSG_VOLINFO_STORE_FAIL,
+               "failed to store volinfo");
+        goto out;
+    }
+out:
+    return ret;
+}
diff --git a/xlators/mgmt/glusterd/src/glusterd-utils.h b/xlators/mgmt/glusterd/src/glusterd-utils.h
new file mode 100644
index 00000000000..bf6ac295e26
--- /dev/null
+++ b/xlators/mgmt/glusterd/src/glusterd-utils.h
@@ -0,0 +1,865 @@
+/*
+   Copyright (c) 2006-2012 Red Hat, Inc. <http://www.redhat.com>
+   This file is part of GlusterFS.
+
+   This file is licensed to you under your choice of the GNU Lesser
+   General Public License, version 3 or any later version (LGPLv3 or
+   later), or the GNU General Public License, version 2 (GPLv2), in all
+   cases as published by the Free Software Foundation.
+*/
+#ifndef _GLUSTERD_UTILS_H
+#define _GLUSTERD_UTILS_H
+
+#include <pthread.h>
+#include <glusterfs/compat-uuid.h>
+
+#include <glusterfs/glusterfs.h>
+#include <glusterfs/xlator.h>
+#include <glusterfs/logging.h>
+#include <glusterfs/call-stub.h>
+#include <glusterfs/byte-order.h>
+#include "glusterd.h"
+#include "rpc-clnt.h"
+#include "protocol-common.h"
+
+#include "glusterfs3-xdr.h"
+#include "glusterd-peer-utils.h"
+
+#define GLUSTERD_SOCK_DIR "/var/run/gluster"
+/* Write "<volname>-client-<brickid>" into brickinfo->brick_id.
+ * Arguments are parenthesised so that expressions can be passed safely.
+ * NOTE(review): sprintf() is unbounded; this assumes brick_id is large
+ * enough for the volume name plus suffix - confirm against the struct. */
+#define GLUSTERD_ASSIGN_BRICKID_TO_BRICKINFO(brickinfo, volinfo, brickid)      \
+    do {                                                                       \
+        sprintf((brickinfo)->brick_id, "%s-client-%d", (volinfo)->volname,     \
+                (brickid));                                                    \
+    } while (0)
+
+/* Write "<volname>-ta-<brickid>" into ta_brickinfo->brick_id.
+ * Arguments are parenthesised so that expressions can be passed safely.
+ * NOTE(review): sprintf() is unbounded; this assumes brick_id is large
+ * enough for the volume name plus suffix - confirm against the struct. */
+#define GLUSTERD_ASSIGN_BRICKID_TO_TA_BRICKINFO(ta_brickinfo, volinfo,         \
+                                                brickid)                       \
+    do {                                                                       \
+        sprintf((ta_brickinfo)->brick_id, "%s-ta-%d", (volinfo)->volname,      \
+                (brickid));                                                    \
+    } while (0)
+
+/*
+ * Check that option 'key' may be used with 'volname'.
+ *
+ * An option listed in valid_all_vol_opts (or the key "all") is only
+ * accepted when volname is "all"; every other option is only accepted for
+ * a single volume.  When get_opt is false (a set operation), the key
+ * "all" and GLUSTERD_MAX_OP_VERSION_KEY are refused outright.
+ *
+ * On refusal 'ret' is set to -1, '*op_errstr' to a gf_strdup()'ed message
+ * and control jumps to 'label'.  All three error paths use 'label'.
+ */
+#define ALL_VOLUME_OPTION_CHECK(volname, get_opt, key, ret, op_errstr, label)  \
+    do {                                                                       \
+        gf_boolean_t _all = !strcmp("all", (volname));                         \
+        gf_boolean_t _key_all = !strcmp((key), "all");                         \
+        gf_boolean_t _is_valid_opt = _gf_false;                                \
+        int32_t _i = 0;                                                        \
+                                                                               \
+        if (!(get_opt) &&                                                      \
+            (_key_all || !strcmp((key), GLUSTERD_MAX_OP_VERSION_KEY))) {       \
+            (ret) = -1;                                                        \
+            *(op_errstr) = gf_strdup("Not a valid option to set");             \
+            goto label;                                                        \
+        }                                                                      \
+        if (_key_all) {                                                        \
+            _is_valid_opt = _gf_true;                                          \
+        } else {                                                               \
+            for (_i = 0; valid_all_vol_opts[_i].option; _i++) {                \
+                if (!strcmp((key), valid_all_vol_opts[_i].option)) {           \
+                    _is_valid_opt = _gf_true;                                  \
+                    break;                                                     \
+                }                                                              \
+            }                                                                  \
+        }                                                                      \
+        if (_all && !_is_valid_opt) {                                          \
+            (ret) = -1;                                                        \
+            *(op_errstr) = gf_strdup("Not a valid option for all volumes");    \
+            goto label;                                                        \
+        } else if (!_all && _is_valid_opt) {                                   \
+            (ret) = -1;                                                        \
+            *(op_errstr) = gf_strdup("Not a valid option for single volume");  \
+            goto label;                                                        \
+        }                                                                      \
+    } while (0)
+
+struct glusterd_lock_ {
+    uuid_t owner;     /* presumably the UUID of the lock holder - confirm */
+    time_t timestamp; /* presumably when the lock was taken - confirm */
+};
+
+typedef struct glusterd_dict_ctx_ {
+    dict_t *dict;   /* presumably the dict being filled or read - confirm */
+    int opt_count;  /* presumably a running count of options - confirm */
+    char *key_name; /* NOTE(review): looks like a key-name template - confirm */
+    char *val_name; /* NOTE(review): looks like a value-name template - confirm */
+    char *prefix;   /* NOTE(review): looks like a key prefix - confirm */
+} glusterd_dict_ctx_t;
+
+gf_boolean_t
+is_brick_mx_enabled(void);
+
+int
+glusterd_compare_lines(const void *a, const void *b);
+
+typedef int (*glusterd_condition_func)(glusterd_volinfo_t *volinfo,
+                                       glusterd_brickinfo_t *brickinfo,
+                                       void *ctx);
+typedef struct glusterd_lock_ glusterd_lock_t;
+
+int32_t
+glusterd_get_lock_owner(uuid_t *cur_owner);
+
+int32_t
+glusterd_lock(uuid_t new_owner);
+
+int32_t
+glusterd_unlock(uuid_t owner);
+
+int32_t
+glusterd_get_uuid(uuid_t *uuid);
+
+char *
+gd_get_shd_key(int type);
+
+int
+glusterd_submit_reply(rpcsvc_request_t *req, void *arg, struct iovec *payload,
+                      int payloadcount, struct iobref *iobref,
+                      xdrproc_t xdrproc);
+
+int
+glusterd_to_cli(rpcsvc_request_t *req, gf_cli_rsp *arg, struct iovec *payload,
+                int payloadcount, struct iobref *iobref, xdrproc_t xdrproc,
+                dict_t *dict);
+
+int
+glusterd_submit_request(struct rpc_clnt *rpc, void *req, call_frame_t *frame,
+                        rpc_clnt_prog_t *prog, int procnum,
+                        struct iobref *iobref, xlator_t *this,
+                        fop_cbk_fn_t cbkfn, xdrproc_t xdrproc);
+int32_t
+glusterd_volinfo_new(glusterd_volinfo_t **volinfo);
+
+int32_t
+glusterd_volinfo_dup(glusterd_volinfo_t *volinfo,
+                     glusterd_volinfo_t **dup_volinfo,
+                     gf_boolean_t set_userauth);
+
+char *
+glusterd_auth_get_username(glusterd_volinfo_t *volinfo);
+
+char *
+glusterd_auth_get_password(glusterd_volinfo_t *volinfo);
+
+int32_t
+glusterd_auth_set_username(glusterd_volinfo_t *volinfo, char *username);
+
+int32_t
+glusterd_auth_set_password(glusterd_volinfo_t *volinfo, char *password);
+
+void
+glusterd_auth_cleanup(glusterd_volinfo_t *volinfo);
+
+int32_t
+glusterd_brickprocess_new(glusterd_brick_proc_t **brickprocess);
+
+int32_t
+glusterd_brickinfo_new(glusterd_brickinfo_t **brickinfo);
+
+int32_t
+glusterd_brickinfo_new_from_brick(char *brick, glusterd_brickinfo_t **brickinfo,
+                                  gf_boolean_t construct_real_path,
+                                  char **op_errstr);
+
+int32_t
+glusterd_volinfo_find(const char *volname, glusterd_volinfo_t **volinfo);
+
+gf_boolean_t
+glusterd_volume_exists(const char *volname);
+
+int
+glusterd_volinfo_find_by_volume_id(uuid_t volume_id,
+                                   glusterd_volinfo_t **volinfo);
+
+int32_t
+glusterd_service_stop(const char *service, char *pidfile, int sig,
+                      gf_boolean_t force_kill);
+
+int32_t
+glusterd_service_stop_nolock(const char *service, char *pidfile, int sig,
+                             gf_boolean_t force_kill);
+
+int
+glusterd_get_next_available_brickid(glusterd_volinfo_t *volinfo);
+
+int32_t
+glusterd_resolve_brick(glusterd_brickinfo_t *brickinfo);
+
+int
+glusterd_brick_process_add_brick(glusterd_brickinfo_t *brickinfo,
+                                 glusterd_brickinfo_t *parent_brickinfo);
+
+int
+glusterd_brick_process_remove_brick(glusterd_brickinfo_t *brickinfo,
+                                    int *last_brick);
+
+int
+glusterd_brick_proc_for_port(int port, glusterd_brick_proc_t **brickprocess);
+
+int32_t
+glusterd_volume_start_glusterfs(glusterd_volinfo_t *volinfo,
+                                glusterd_brickinfo_t *brickinfo,
+                                gf_boolean_t wait);
+
+int32_t
+glusterd_volume_stop_glusterfs(glusterd_volinfo_t *volinfo,
+                               glusterd_brickinfo_t *brickinfo,
+                               gf_boolean_t del_brick);
+
+int
+send_attach_req(xlator_t *this, struct rpc_clnt *rpc, char *path,
+                glusterd_brickinfo_t *brick, glusterd_brickinfo_t *other_brick,
+                int op);
+
+glusterd_volinfo_t *
+glusterd_volinfo_ref(glusterd_volinfo_t *volinfo);
+
+glusterd_volinfo_t *
+glusterd_volinfo_unref(glusterd_volinfo_t *volinfo);
+
+int32_t
+glusterd_volinfo_delete(glusterd_volinfo_t *volinfo);
+
+int32_t
+glusterd_brickinfo_delete(glusterd_brickinfo_t *brickinfo);
+
+gf_boolean_t
+glusterd_is_cli_op_req(int32_t op);
+
+int32_t
+glusterd_volume_brickinfo_get_by_brick(char *brick, glusterd_volinfo_t *volinfo,
+                                       glusterd_brickinfo_t **brickinfo,
+                                       gf_boolean_t construct_real_path);
+
+int32_t
+glusterd_add_volumes_to_export_dict(dict_t *peer_data, char **buf,
+                                    u_int *length);
+
+int32_t
+glusterd_compare_friend_data(dict_t *peer_data, int32_t *status,
+                             char *hostname);
+
+int
+glusterd_compute_cksum(glusterd_volinfo_t *volinfo, gf_boolean_t is_quota_conf);
+
+void
+glusterd_set_socket_filepath(char *sock_filepath, char *sockpath, size_t len);
+
+struct rpc_clnt *
+glusterd_pending_node_get_rpc(glusterd_pending_node_t *pending_node);
+
+void
+glusterd_pending_node_put_rpc(glusterd_pending_node_t *pending_node);
+
+int
+glusterd_remote_hostname_get(rpcsvc_request_t *req, char *remote_host, int len);
+
+int32_t
+glusterd_import_friend_volumes_synctask(void *opaque);
+
+int32_t
+glusterd_import_friend_volumes(dict_t *peer_data);
+void
+glusterd_set_volume_status(glusterd_volinfo_t *volinfo,
+                           glusterd_volume_status status);
+
+int32_t
+glusterd_volume_count_get(void);
+int32_t
+glusterd_add_volume_to_dict(glusterd_volinfo_t *volinfo, dict_t *dict,
+                            int32_t count, char *prefix);
+int
+glusterd_get_brickinfo(xlator_t *this, const char *brickname, int port,
+                       glusterd_brickinfo_t **brickinfo);
+
+void
+glusterd_set_brick_status(glusterd_brickinfo_t *brickinfo,
+                          gf_brick_status_t status);
+
+gf_boolean_t
+glusterd_is_brick_started(glusterd_brickinfo_t *brickinfo);
+
+int
+glusterd_friend_brick_belongs(glusterd_volinfo_t *volinfo,
+                              glusterd_brickinfo_t *brickinfo, void *uuid);
+int
+glusterd_all_volume_cond_check(glusterd_condition_func func, int status,
+                               void *ctx);
+int
+glusterd_brick_start(glusterd_volinfo_t *volinfo,
+                     glusterd_brickinfo_t *brickinfo, gf_boolean_t wait,
+                     gf_boolean_t only_connect);
+int
+glusterd_brick_stop(glusterd_volinfo_t *volinfo,
+                    glusterd_brickinfo_t *brickinfo, gf_boolean_t del_brick);
+
+int
+glusterd_is_defrag_on(glusterd_volinfo_t *volinfo);
+
+int32_t
+glusterd_volinfo_bricks_delete(glusterd_volinfo_t *volinfo);
+
+int
+glusterd_new_brick_validate(char *brick, glusterd_brickinfo_t *brickinfo,
+                            char *op_errstr, size_t len, char *op);
+int32_t
+glusterd_volume_brickinfos_delete(glusterd_volinfo_t *volinfo);
+
+int32_t
+glusterd_volume_brickinfo_get(uuid_t uuid, char *hostname, char *path,
+                              glusterd_volinfo_t *volinfo,
+                              glusterd_brickinfo_t **brickinfo);
+
+int
+glusterd_brickinfo_get(uuid_t uuid, char *hostname, char *path,
+                       glusterd_brickinfo_t **brickinfo);
+
+int
+glusterd_rb_check_bricks(glusterd_volinfo_t *volinfo,
+                         glusterd_brickinfo_t *src_brick,
+                         glusterd_brickinfo_t *dst_brick);
+
+int
+glusterd_check_and_set_brick_xattr(char *host, char *path, uuid_t uuid,
+                                   char **op_errstr, gf_boolean_t is_force);
+
+int
+glusterd_validate_and_create_brickpath(glusterd_brickinfo_t *brickinfo,
+                                       uuid_t volume_id, char *volname,
+                                       char **op_errstr, gf_boolean_t is_force,
+                                       gf_boolean_t ignore_partition);
+int
+glusterd_sm_tr_log_transition_add(glusterd_sm_tr_log_t *log, int old_state,
+                                  int new_state, int event);
+int
+glusterd_sm_tr_log_init(glusterd_sm_tr_log_t *log, char *(*state_name_get)(int),
+                        char *(*event_name_get)(int), size_t size);
+void
+glusterd_sm_tr_log_delete(glusterd_sm_tr_log_t *log);
+
+int
+glusterd_sm_tr_log_add_to_dict(dict_t *dict,
+                               glusterd_sm_tr_log_t *circular_log);
+int
+glusterd_remove_pending_entry(struct cds_list_head *list, void *elem);
+int
+glusterd_clear_pending_nodes(struct cds_list_head *list);
+int32_t
+glusterd_brick_connect(glusterd_volinfo_t *volinfo,
+                       glusterd_brickinfo_t *brickinfo, char *socketpath);
+int32_t
+glusterd_brick_disconnect(glusterd_brickinfo_t *brickinfo);
+int32_t
+glusterd_delete_volume(glusterd_volinfo_t *volinfo);
+int32_t
+glusterd_delete_brick(glusterd_volinfo_t *volinfo,
+                      glusterd_brickinfo_t *brickinfo);
+
+int32_t
+glusterd_delete_all_bricks(glusterd_volinfo_t *volinfo);
+
+int
+glusterd_spawn_daemons(void *opaque);
+
+int
+glusterd_restart_gsyncds(glusterd_conf_t *conf);
+
+int
+glusterd_start_gsync(glusterd_volinfo_t *master_vol, char *slave,
+                     char *path_list, char *conf_path, char *glusterd_uuid_str,
+                     char **op_errstr, gf_boolean_t is_pause);
+int
+glusterd_get_local_brickpaths(glusterd_volinfo_t *volinfo, char **pathlist);
+
+int32_t
+glusterd_recreate_bricks(glusterd_conf_t *conf);
+int32_t
+glusterd_handle_upgrade_downgrade(dict_t *options, glusterd_conf_t *conf,
+                                  gf_boolean_t upgrade, gf_boolean_t downgrade);
+
+int
+glusterd_add_brick_detail_to_dict(glusterd_volinfo_t *volinfo,
+                                  glusterd_brickinfo_t *brickinfo, dict_t *dict,
+                                  int32_t count);
+
+int32_t
+glusterd_add_brick_to_dict(glusterd_volinfo_t *volinfo,
+                           glusterd_brickinfo_t *brickinfo, dict_t *dict,
+                           int32_t count);
+
+int32_t
+glusterd_get_all_volnames(dict_t *dict);
+
+gf_boolean_t
+glusterd_is_fuse_available(void);
+
+int
+glusterd_brick_statedump(glusterd_volinfo_t *volinfo,
+                         glusterd_brickinfo_t *brickinfo, char *options,
+                         int option_cnt, char **op_errstr);
+
+int
+glusterd_brick_terminate(glusterd_volinfo_t *volinfo,
+                         glusterd_brickinfo_t *brickinfo, char *options,
+                         int option_cnt, char **op_errstr);
+
+#ifdef BUILD_GNFS
+int
+glusterd_nfs_statedump(char *options, int option_cnt, char **op_errstr);
+#endif
+
+int
+glusterd_client_statedump(char *volname, char *options, int option_cnt,
+                          char **op_errstr);
+
+int
+glusterd_quotad_statedump(char *options, int option_cnt, char **op_errstr);
+
+gf_boolean_t
+glusterd_is_volume_replicate(glusterd_volinfo_t *volinfo);
+
+gf_boolean_t
+glusterd_is_brick_decommissioned(glusterd_volinfo_t *volinfo, char *hostname,
+                                 char *path);
+int
+glusterd_friend_contains_vol_bricks(glusterd_volinfo_t *volinfo,
+                                    uuid_t friend_uuid);
+
+int
+glusterd_friend_contains_snap_bricks(glusterd_snap_t *snapinfo,
+                                     uuid_t friend_uuid);
+int
+glusterd_friend_remove_cleanup_vols(uuid_t uuid);
+
+int
+glusterd_get_client_filepath(char *filepath, glusterd_volinfo_t *volinfo,
+                             gf_transport_type type);
+int
+glusterd_get_trusted_client_filepath(char *filepath,
+                                     glusterd_volinfo_t *volinfo,
+                                     gf_transport_type type);
+int
+glusterd_restart_rebalance(glusterd_conf_t *conf);
+
+int
+glusterd_restart_rebalance_for_volume(glusterd_volinfo_t *volinfo);
+
+void
+glusterd_defrag_info_set(glusterd_volinfo_t *volinfo, dict_t *dict, int cmd,
+                         int status, int op);
+
+int32_t
+glusterd_add_bricks_hname_path_to_dict(dict_t *dict,
+                                       glusterd_volinfo_t *volinfo);
+
+int
+glusterd_add_node_to_dict(char *server, dict_t *dict, int count,
+                          dict_t *vol_opts);
+
+int
+glusterd_calc_dist_leaf_count(int rcount, int scount);
+
+int
+glusterd_get_dist_leaf_count(glusterd_volinfo_t *volinfo);
+
+glusterd_brickinfo_t *
+glusterd_get_brickinfo_by_position(glusterd_volinfo_t *volinfo, uint32_t pos);
+
+gf_boolean_t
+glusterd_is_local_brick(xlator_t *this, glusterd_volinfo_t *volinfo,
+                        glusterd_brickinfo_t *brickinfo);
+int
+glusterd_validate_volume_id(dict_t *op_dict, glusterd_volinfo_t *volinfo);
+
+int
+glusterd_defrag_volume_status_update(glusterd_volinfo_t *volinfo,
+                                     dict_t *rsp_dict, int32_t cmd);
+
+int
+glusterd_check_files_identical(char *filename1, char *filename2,
+                               gf_boolean_t *identical);
+
+int
+glusterd_check_topology_identical(const char *filename1, const char *filename2,
+                                  gf_boolean_t *identical);
+
+void
+glusterd_volinfo_reset_defrag_stats(glusterd_volinfo_t *volinfo);
+int
+glusterd_volset_help(dict_t *dict, char **op_errstr);
+
+int32_t
+glusterd_sync_use_rsp_dict(dict_t *aggr, dict_t *rsp_dict);
+int32_t
+glusterd_gsync_use_rsp_dict(dict_t *aggr, dict_t *rsp_dict, char *op_errstr);
+int32_t
+glusterd_rb_use_rsp_dict(dict_t *aggr, dict_t *rsp_dict);
+int
+glusterd_profile_volume_use_rsp_dict(dict_t *aggr, dict_t *rsp_dict);
+int
+glusterd_volume_status_copy_to_op_ctx_dict(dict_t *aggr, dict_t *rsp_dict);
+int
+glusterd_volume_rebalance_use_rsp_dict(dict_t *aggr, dict_t *rsp_dict);
+int
+glusterd_volume_heal_use_rsp_dict(dict_t *aggr, dict_t *rsp_dict);
+int
+glusterd_use_rsp_dict(dict_t *aggr, dict_t *rsp_dict);
+int
+glusterd_sys_exec_output_rsp_dict(dict_t *aggr, dict_t *rsp_dict);
+int32_t
+glusterd_handle_node_rsp(dict_t *req_ctx, void *pending_entry, glusterd_op_t op,
+                         dict_t *rsp_dict, dict_t *op_ctx, char **op_errstr,
+                         gd_node_type type);
+int
+glusterd_max_opversion_use_rsp_dict(dict_t *dst, dict_t *src);
+
+int
+glusterd_volume_bitrot_scrub_use_rsp_dict(dict_t *aggr, dict_t *rsp_dict);
+
+int
+glusterd_volume_heal_use_rsp_dict(dict_t *aggr, dict_t *rsp_dict);
+
+int32_t
+glusterd_check_if_quota_trans_enabled(glusterd_volinfo_t *volinfo);
+
+int
+glusterd_volume_quota_copy_to_op_ctx_dict(dict_t *aggr, dict_t *rsp);
+int
+_profile_volume_add_brick_rsp(dict_t *this, char *key, data_t *value,
+                              void *data);
+int
+glusterd_profile_volume_brick_rsp(void *pending_entry, dict_t *rsp_dict,
+                                  dict_t *op_ctx, char **op_errstr,
+                                  gd_node_type type);
+
+int32_t
+glusterd_set_originator_uuid(dict_t *dict);
+
+/* Should be used only when an operation is in progress, as that is the only
+ * time a lock_owner is set
+ */
+gf_boolean_t
+is_origin_glusterd(dict_t *dict);
+
+int
+glusterd_get_next_global_opt_version_str(dict_t *opts, char **version_str);
+
+int
+glusterd_generate_and_set_task_id(dict_t *dict, char *key, const int keylen);
+
+int
+glusterd_validate_and_set_gfid(dict_t *op_ctx, dict_t *req_dict,
+                               char **op_errstr);
+
+int
+glusterd_copy_uuid_to_dict(uuid_t uuid, dict_t *dict, char *key,
+                           const int keylen);
+
+gf_boolean_t
+glusterd_is_same_address(char *name1, char *name2);
+
+void
+gd_update_volume_op_versions(glusterd_volinfo_t *volinfo);
+
+int
+op_version_check(xlator_t *this, int min_op_version, char *msg, int msglen);
+
+gf_boolean_t
+gd_is_remove_brick_committed(glusterd_volinfo_t *volinfo);
+
+int
+glusterd_remove_brick_validate_bricks(gf1_op_commands cmd, int32_t brick_count,
+                                      dict_t *dict, glusterd_volinfo_t *volinfo,
+                                      char **errstr, gf_cli_defrag_type);
+int
+glusterd_get_slave_details_confpath(glusterd_volinfo_t *volinfo, dict_t *dict,
+                                    char **slave_url, char **slave_host,
+                                    char **slave_vol, char **conf_path,
+                                    char **op_errstr);
+
+int
+glusterd_get_slave_info(char *slave, char **slave_url, char **hostname,
+                        char **slave_vol, char **op_errstr);
+
+int
+glusterd_get_statefile_name(glusterd_volinfo_t *volinfo, char *slave,
+                            char *conf_path, char **statefile,
+                            gf_boolean_t *is_template_in_use);
+
+int
+glusterd_gsync_read_frm_status(char *path, char *buf, size_t blen);
+
+int
+glusterd_create_status_file(char *master, char *slave, char *slave_url,
+                            char *slave_vol, char *status);
+
+int
+glusterd_check_restart_gsync_session(glusterd_volinfo_t *volinfo, char *slave,
+                                     dict_t *resp_dict, char *path_list,
+                                     char *conf_path, gf_boolean_t is_force);
+
+int
+glusterd_check_gsync_running_local(char *master, char *slave, char *conf_path,
+                                   gf_boolean_t *is_run);
+
+gf_boolean_t
+glusterd_is_status_tasks_op(glusterd_op_t op, dict_t *dict);
+
+gf_boolean_t
+gd_should_i_start_rebalance(glusterd_volinfo_t *volinfo);
+
+int
+glusterd_is_volume_quota_enabled(glusterd_volinfo_t *volinfo);
+
+int
+glusterd_is_volume_inode_quota_enabled(glusterd_volinfo_t *volinfo);
+
+int
+glusterd_is_bitrot_enabled(glusterd_volinfo_t *volinfo);
+
+gf_boolean_t
+glusterd_all_volumes_with_quota_stopped(void);
+
+void
+glusterd_clean_up_quota_store(glusterd_volinfo_t *volinfo);
+
+int
+glusterd_remove_auxiliary_mount(char *volname);
+
+gf_boolean_t
+glusterd_status_has_tasks(int cmd);
+
+int
+gd_stop_rebalance_process(glusterd_volinfo_t *volinfo);
+
+rpc_clnt_t *
+glusterd_rpc_clnt_unref(glusterd_conf_t *conf, rpc_clnt_t *rpc);
+
+int32_t
+glusterd_compare_volume_name(struct cds_list_head *, struct cds_list_head *);
+
+char *
+glusterd_get_brick_mount_device(char *brick_path);
+
+struct mntent *
+glusterd_get_mnt_entry_info(char *mnt_pt, char *buff, int buflen,
+                            struct mntent *entry_ptr);
+
+int
+glusterd_get_brick_root(char *path, char **mount_point);
+
+int32_t
+glusterd_lvm_snapshot_remove(dict_t *rsp_dict, glusterd_volinfo_t *snap_vol);
+
+gf_boolean_t
+gd_vol_is_geo_rep_active(glusterd_volinfo_t *volinfo);
+
+int32_t
+glusterd_get_brick_mount_dir(char *brickpath, char *hostname, char *mount_dir);
+
+int32_t
+glusterd_aggr_brick_mount_dirs(dict_t *aggr, dict_t *rsp_dict);
+
+int32_t
+glusterd_take_lvm_snapshot(glusterd_brickinfo_t *brickinfo,
+                           char *origin_brick_path);
+
+void
+glusterd_launch_synctask(synctask_fn_t fn, void *opaque);
+
+int
+glusterd_enable_default_options(glusterd_volinfo_t *volinfo, char *option);
+
+int
+glusterd_unlink_file(char *sock_file_path);
+
+int32_t
+glusterd_find_brick_mount_path(char *brick_path, char **brick_mount_path);
+
+/*
+ * Function to retrieve list of snap volnames and their uuids
+ */
+int
+glusterd_snapshot_get_volnames_uuids(dict_t *dict, char *volname,
+                                     gf_getsnap_name_uuid_rsp *snap_info_rsp);
+
+int
+glusterd_update_mntopts(char *brick_path, glusterd_brickinfo_t *brickinfo);
+
+int
+glusterd_update_fs_label(glusterd_brickinfo_t *brickinfo);
+
+int
+glusterd_get_volopt_content(dict_t *dict, gf_boolean_t xml_out);
+
+int
+glusterd_get_global_max_op_version(rpcsvc_request_t *req, dict_t *ctx,
+                                   int count);
+
+int
+glusterd_get_global_options_for_all_vols(rpcsvc_request_t *req, dict_t *dict,
+                                         char **op_errstr);
+
+int
+glusterd_get_default_val_for_volopt(dict_t *dict, gf_boolean_t all_opts,
+                                    char *key, char *orig_key,
+                                    glusterd_volinfo_t *volinfo,
+                                    char **err_str);
+
+int
+glusterd_check_client_op_version_support(char *volname, uint32_t op_version,
+                                         char **op_errstr);
+
+gf_boolean_t
+glusterd_have_peers(void);
+
+gf_boolean_t
+glusterd_have_volumes(void);
+
+void
+glusterd_get_rebalance_volfile(glusterd_volinfo_t *volinfo, char *path,
+                               int path_len);
+
+void
+glusterd_get_gfproxy_client_volfile(glusterd_volinfo_t *volinfo, char *path,
+                                    int path_len);
+
+int32_t
+glusterd_brickinfo_dup(glusterd_brickinfo_t *brickinfo,
+                       glusterd_brickinfo_t *dup_brickinfo);
+
+int
+glusterd_vol_add_quota_conf_to_dict(glusterd_volinfo_t *volinfo, dict_t *load,
+                                    int vol_idx, char *prefix);
+
+int32_t
+glusterd_import_volinfo(dict_t *peer_data, int count,
+                        glusterd_volinfo_t **volinfo, char *prefix);
+
+int
+glusterd_import_quota_conf(dict_t *peer_data, int vol_idx,
+                           glusterd_volinfo_t *new_volinfo, char *prefix);
+
+gf_boolean_t
+glusterd_is_shd_compatible_volume(glusterd_volinfo_t *volinfo);
+
+gf_boolean_t
+glusterd_is_shd_compatible_type(int type);
+
+gf_boolean_t
+glusterd_are_all_volumes_stopped(void);
+
+gf_boolean_t
+glusterd_all_shd_compatible_volumes_stopped(void);
+
+void
+glusterd_nfs_pmap_deregister(void);
+
+gf_boolean_t
+glusterd_is_volume_started(glusterd_volinfo_t *volinfo);
+
+int
+glusterd_volume_get_type_str(glusterd_volinfo_t *volinfo, char **vol_type_str);
+
+int
+glusterd_volume_get_status_str(glusterd_volinfo_t *volinfo, char *status_str);
+
+void
+glusterd_brick_get_status_str(glusterd_brickinfo_t *brickinfo,
+                              char *status_str);
+
+int
+glusterd_volume_get_transport_type_str(glusterd_volinfo_t *volinfo,
+                                       char *transport_type_str);
+
+int
+glusterd_volume_get_quorum_status_str(glusterd_volinfo_t *volinfo,
+                                      char *quorum_status_str);
+
+int
+glusterd_volume_get_rebalance_status_str(glusterd_volinfo_t *volinfo,
+                                         char *rebal_status_str);
+
+void
+glusterd_list_add_order(struct cds_list_head *new, struct cds_list_head *head,
+                        int (*compare)(struct cds_list_head *,
+                                       struct cds_list_head *));
+
+struct rpc_clnt *
+glusterd_defrag_rpc_get(glusterd_defrag_info_t *defrag);
+
+struct rpc_clnt *
+glusterd_defrag_rpc_put(glusterd_defrag_info_t *defrag);
+
+int32_t
+glusterd_count_connected_peers(int32_t *count);
+
+int
+glusterd_volume_brick_for_each(glusterd_volinfo_t *volinfo, void *data,
+                               int (*fn)(glusterd_volinfo_t *,
+                                         glusterd_brickinfo_t *,
+                                         dict_t *mod_dict, void *));
+
+int
+glusterd_get_dummy_client_filepath(char *filepath, glusterd_volinfo_t *volinfo,
+                                   gf_transport_type type);
+
+int
+glusterd_handle_replicate_brick_ops(glusterd_volinfo_t *volinfo,
+                                    glusterd_brickinfo_t *brickinfo,
+                                    glusterd_op_t op);
+void
+assign_brick_groups(glusterd_volinfo_t *volinfo);
+
+glusterd_brickinfo_t *
+get_last_brick_of_brick_group(glusterd_volinfo_t *volinfo,
+                              glusterd_brickinfo_t *brickinfo);
+int
+glusterd_get_rb_dst_brickinfo(glusterd_volinfo_t *volinfo,
+                              glusterd_brickinfo_t **brickinfo);
+int
+rb_update_dstbrick_port(glusterd_brickinfo_t *dst_brickinfo, dict_t *rsp_dict,
+                        dict_t *req_dict);
+int
+glusterd_op_perform_replace_brick(glusterd_volinfo_t *volinfo, char *old_brick,
+                                  char *new_brick, dict_t *dict);
+int32_t
+glusterd_brick_unlink_socket_file(glusterd_volinfo_t *volinfo,
+                                  glusterd_brickinfo_t *brickinfo);
+char *
+gd_rb_op_to_str(char *op);
+
+glusterd_op_t
+gd_cli_to_gd_op(char *cli_op);
+
+int
+glusterd_get_dst_brick_info(char **dst_brick, char *volname, char **op_errstr,
+                            glusterd_brickinfo_t **dst_brickinfo, char **host,
+                            dict_t *dict, char **dup_dstbrick);
+
+int
+glusterd_brick_op_prerequisites(dict_t *dict, char **op, glusterd_op_t *gd_op,
+                                char **volname, glusterd_volinfo_t **volinfo,
+                                char **src_brick,
+                                glusterd_brickinfo_t **src_brickinfo,
+                                char *pidfile, char **op_errstr,
+                                dict_t *rsp_dict);
+
+int
+glusterd_get_volinfo_from_brick(char *brick, glusterd_volinfo_t **volinfo);
+
+gf_boolean_t
+glusterd_is_profile_on(glusterd_volinfo_t *volinfo);
+
+char *
+search_brick_path_from_proc(pid_t brick_pid, char *brickpath);
+
+int32_t
+glusterd_add_shd_to_dict(glusterd_volinfo_t *volinfo, dict_t *dict,
+                         int32_t count);
+int32_t
+glusterd_check_brick_order(dict_t *dict, char *err_str, int32_t type,
+                           char **volname, char **bricks, int32_t *brick_count,
+                           int32_t sub_count);
+
+#endif
diff --git a/xlators/mgmt/glusterd/src/glusterd-volgen.c b/xlators/mgmt/glusterd/src/glusterd-volgen.c
new file mode 100644
index 00000000000..8d6fb5e0fac
--- /dev/null
+++ b/xlators/mgmt/glusterd/src/glusterd-volgen.c
@@ -0,0 +1,6754 @@
+/*
+   Copyright (c) 2010-2012 Red Hat, Inc. <http://www.redhat.com>
+   This file is part of GlusterFS.
+
+   This file is licensed to you under your choice of the GNU Lesser
+   General Public License, version 3 or any later version (LGPLv3 or
+   later), or the GNU General Public License, version 2 (GPLv2), in all
+   cases as published by the Free Software Foundation.
+*/
+
+#include <fnmatch.h>
+#include <sys/wait.h>
+#include <dlfcn.h>
+#include <utime.h>
+
+#include <glusterfs/xlator.h>
+#include "glusterd.h"
+#include <glusterfs/defaults.h>
+#include <glusterfs/syscall.h>
+#include <glusterfs/logging.h>
+#include <glusterfs/dict.h>
+#include <glusterfs/graph-utils.h>
+#include <glusterfs/common-utils.h>
+#include "glusterd-store.h"
+#include "glusterd-hooks.h"
+#include <glusterfs/trie.h>
+#include "glusterd-mem-types.h"
+#include "cli1-xdr.h"
+#include "glusterd-volgen.h"
+#include "glusterd-geo-rep.h"
+#include "glusterd-utils.h"
+#include "glusterd-messages.h"
+#include <glusterfs/run.h>
+#include <glusterfs/options.h>
+#include "glusterd-snapshot-utils.h"
+#include "glusterd-svc-mgmt.h"
+#include "glusterd-svc-helper.h"
+#include "glusterd-snapd-svc-helper.h"
+#include "glusterd-shd-svc-helper.h"
+#include "glusterd-gfproxyd-svc-helper.h"
+
+struct gd_validate_reconf_opts {
+    dict_t *options;  /* NOTE(review): presumably the options being validated */
+    char **op_errstr; /* NOTE(review): presumably an error-string out slot */
+};
+
+extern struct volopt_map_entry glusterd_volopt_map[];
+/* Copy CLI_OPT from set_dict into XL's "transport.socket." XLATOR_OPT. */
+#define RPC_SET_OPT(XL, CLI_OPT, XLATOR_OPT, ERROR_CMD)                        \
+    do {                                                                       \
+        char *_value = NULL;                                                   \
+        /* set_dict comes from the caller; ERROR_CMD runs if set fails */      \
+        if (dict_get_str_sizen(set_dict, CLI_OPT, &_value) == 0) {             \
+            if (xlator_set_fixed_option(XL, "transport.socket." XLATOR_OPT,    \
+                                        _value) != 0) {                        \
+                gf_msg("glusterd", GF_LOG_WARNING, errno,                      \
+                       GD_MSG_XLATOR_SET_OPT_FAIL,                             \
+                       "failed to set " XLATOR_OPT);                           \
+                ERROR_CMD;                                                     \
+            }                                                                  \
+        }                                                                      \
+    } while (0 /* CONSTCOND */)
+
+static int
+volgen_graph_build_clients(volgen_graph_t *graph, glusterd_volinfo_t *volinfo,
+                           dict_t *set_dict, void *param);
+
+static int
+build_client_graph(volgen_graph_t *graph, glusterd_volinfo_t *volinfo,
+                   dict_t *mod_dict);
+
+/*********************************************
+ *
+ * xlator generation / graph manipulation API
+ *
+ *********************************************/
+
+/* Put a gf_strdup() copy of str into *graph->errstr, if that slot exists. */
+static void
+set_graph_errstr(volgen_graph_t *graph, const char *str)
+{
+    /* graph->errstr may be NULL, in which case nothing is stored */
+    if (graph->errstr != NULL)
+        *graph->errstr = gf_strdup(str);
+}
+
+/* Create an xlator of 'type' whose name comes from the printf-style format
+ * and arg; on failure log it, free what was allocated and return NULL. */
+static xlator_t *
+xlator_instantiate_va(const char *type, const char *format, va_list arg)
+{
+    xlator_t *this = THIS;
+    xlator_t *xl = NULL;
+    char *name = NULL;
+
+    GF_ASSERT(this);
+
+    if (gf_vasprintf(&name, format, arg) < 0) {
+        name = NULL;
+        goto fail;
+    }
+
+    xl = GF_CALLOC(1, sizeof(*xl), gf_common_mt_xlator_t);
+    if (xl == NULL) {
+        gf_smsg(this->name, GF_LOG_ERROR, errno, GD_MSG_NO_MEMORY, NULL);
+        goto fail;
+    }
+
+    if (xlator_set_type_virtual(xl, type) != 0) {
+        gf_smsg(this->name, GF_LOG_ERROR, errno, GD_MSG_XLATOR_SET_OPT_FAIL,
+                NULL);
+        goto fail;
+    }
+
+    xl->options = dict_new();
+    if (xl->options == NULL) {
+        gf_smsg(this->name, GF_LOG_ERROR, errno, GD_MSG_DICT_CREATE_FAIL, NULL);
+        goto fail;
+    }
+
+    /* name is handed to the xlator only after every fallible step */
+    xl->name = name;
+    CDS_INIT_LIST_HEAD(&xl->volume_options);
+    xl->ctx = THIS->ctx;
+    return xl;
+
+fail:
+    gf_smsg(this->name, GF_LOG_ERROR, 0, GD_MSG_XLATOR_CREATE_FAIL, "Type=%s",
+            type, NULL);
+    GF_FREE(name);
+    if (xl != NULL)
+        xlator_destroy(xl);
+    return NULL;
+}
+
+/* Link cxl to pxl with glusterfs_xlator_link(); a -1 result is logged. */
+static int
+volgen_xlator_link(xlator_t *pxl, xlator_t *cxl)
+{
+    const int rc = glusterfs_xlator_link(pxl, cxl);
+
+    if (rc != -1)
+        return rc;
+
+    gf_msg("glusterd", GF_LOG_ERROR, ENOMEM, GD_MSG_NO_MEMORY,
+           "Out of memory, cannot link xlators %s <- %s", pxl->name,
+           cxl->name);
+    return -1;
+}
+
+/* Link xl (as pxl) to the graph's current first xlator, when there is one.
+ * Returns -1 after logging if that link fails, and 0 in every other case.
+ */
+static int
+volgen_graph_link(volgen_graph_t *graph, xlator_t *xl)
+{
+    /* no need to care about graph->top here */
+    if (!graph->graph.first)
+        return 0;
+
+    if (volgen_xlator_link(xl, graph->graph.first) != -1)
+        return 0;
+
+    gf_msg("glusterd", GF_LOG_ERROR, 0, GD_MSG_GRAPH_ENTRY_ADD_FAIL,
+           "failed to add graph entry %s", xl->name);
+    return -1;
+}
+
+/* Create an xlator of 'type' with a printf-style name, link it to the graph's
+ * current first xlator and make it the new first; NULL on any failure. */
+static xlator_t *
+volgen_graph_add_as(volgen_graph_t *graph, const char *type, const char *format,
+                    ...)
+{
+    xlator_t *added = NULL;
+    va_list ap;
+
+    va_start(ap, format);
+    added = xlator_instantiate_va(type, format, ap);
+    va_end(ap);
+    if (added == NULL)
+        return NULL;
+
+    if (volgen_graph_link(graph, added) == 0) {
+        glusterfs_graph_set_first(&graph->graph, added);
+        return added;
+    }
+    /* linking failed, so the unattached xlator is destroyed here */
+    xlator_destroy(added);
+    return NULL;
+}
+
+/* Create an xlator of 'type' with a printf-style name and make it the graph's
+ * first xlator without linking it to anything; NULL if creation fails. */
+static xlator_t *
+volgen_graph_add_nolink(volgen_graph_t *graph, const char *type,
+                        const char *format, ...)
+{
+    xlator_t *added = NULL;
+    va_list ap;
+
+    va_start(ap, format);
+    added = xlator_instantiate_va(type, format, ap);
+    va_end(ap);
+
+    if (added != NULL)
+        glusterfs_graph_set_first(&graph->graph, added);
+
+    return added;
+}
+
+/* Add an xlator named "<volname>-<part of type after its last '/'>". */
+static xlator_t *
+volgen_graph_add(volgen_graph_t *graph, char *type, char *volname)
+{
+    char *shorttype = strrchr(type, '/');
+
+    /* type is expected to contain a '/' that is not its final character */
+    GF_ASSERT(shorttype);
+    ++shorttype;
+    GF_ASSERT(*shorttype);
+    return volgen_graph_add_as(graph, type, "%s-%s", volname, shorttype);
+}
+
+#define xlator_set_fixed_option(xl, key, value)                                \
+    xlator_set_option(xl, key, SLEN(key), value)
+
+/* XXX Seems there is no such generic routine?
+ * Maybe should put to xlator.c ??
+ * Stores a gf_strdup() copy of value under key in xl->options; returns -1 if
+ * the copy cannot be made, otherwise whatever dict_set_dynstrn() returns.
+ */
+static int
+xlator_set_option(xlator_t *xl, char *key, const int keylen, char *value)
+{
+    char *copy = gf_strdup(value);
+
+    if (copy != NULL)
+        return dict_set_dynstrn(xl->options, key, keylen, copy);
+
+    gf_msg("glusterd", GF_LOG_ERROR, errno, GD_MSG_NO_MEMORY,
+           "failed to set xlator opt: %s[%s] = %s", xl->name, key, value);
+    return -1;
+}
+
+#define xlator_get_fixed_option(xl, key, value)                                \
+    xlator_get_option(xl, key, SLEN(key), value)
+/* Look up key in xl->options with dict_get_strn() and return its result. */
+static int
+xlator_get_option(xlator_t *xl, char *key, const int keylen, char **value)
+{
+    GF_ASSERT(xl);
+    return dict_get_strn(xl->options, key, keylen, value);
+}
+/* Return graph->graph.first cast to xlator_t *. */
+static xlator_t *
+first_of(volgen_graph_t *graph)
+{
+    return (xlator_t *)graph->graph.first;
+}
+
+/**************************
+ *
+ * Trie glue
+ *
+ *************************/
+/* Pass component number lvl of each glusterd_volopt_map key to optcbk. */
+static int
+volopt_selector(int lvl, char **patt, void *param,
+                int (*optcbk)(char *word, void *param))
+{
+    struct volopt_map_entry *vme = NULL;
+    char *w = NULL;
+    int i = 0;
+    int len = 0;
+    int ret = 0;
+    char *dot = NULL;
+    /* returns -1 if a copy fails or optcbk returns non-zero, else 0 */
+    for (vme = glusterd_volopt_map; vme->key; vme++) {
+        w = vme->key;
+        /* skip lvl components; a non-NULL patt[i] must match component i */
+        for (i = 0; i < lvl; i++) {
+            if (patt[i]) {
+                /* NOTE(review): strtail presumably strips prefix patt[i] */
+                w = strtail(w, patt[i]);
+                GF_ASSERT(!w || *w);
+                if (!w || *w != '.')
+                    goto next;
+            } else {
+                w = strchr(w, '.');
+                GF_ASSERT(w);
+            }
+            w++;
+        }
+        /* hand over only the text up to the next '.', via a temporary copy */
+        dot = strchr(w, '.');
+        if (dot) {
+            len = dot - w;
+            w = gf_strdup(w);
+            if (!w)
+                return -1;
+            w[len] = '\0';
+        }
+        ret = optcbk(w, param);
+        if (dot)
+            GF_FREE(w);
+        if (ret)
+            return -1;
+    next:
+        continue; /* reached directly when the key does not match patt */
+    }
+
+    return 0;
+}
+/* volopt_selector() callback: adds word to the trie passed as param. */
+static int
+volopt_trie_cbk(char *word, void *param)
+{
+    return trie_add((trie_t *)param, word);
+}
+
+/*
+ * Turn the nodes found in nodevec into a hint stored in *outputhint: NULL
+ * when nodes[0] is unset, the first word alone when there is no second node,
+ * otherwise "<word1> or <inputhint><word2>". Returns 0 on success.
+ */
+static int
+process_nodevec(struct trienodevec *nodevec, char **outputhint, char *inputhint)
+{
+    trienode_t **nodes = nodevec->nodes;
+    char *first = NULL;
+    char *second = NULL;
+    int ret = 0;
+
+    if (!nodes[0]) {
+        *outputhint = NULL;
+        return 0;
+    }
+
+#if 0
+        /* Limit as in git */
+        if (trienode_get_dist (nodes[0]) >= 6) {
+                *outputhint = NULL;
+                return 0;
+        }
+#endif
+
+    if (trienode_get_word(nodes[0], &first))
+        return -1;
+
+    if (nodevec->cnt < 2 || !nodes[1]) {
+        /* the single word itself becomes the hint; it is not freed here */
+        *outputhint = first;
+        return 0;
+    }
+
+    if (trienode_get_word(nodes[1], &second)) {
+        GF_FREE(first);
+        return -1;
+    }
+
+    ret = gf_asprintf(outputhint, "%s or %s%s", first,
+                      inputhint ? inputhint : "", second);
+    GF_FREE(first);
+    GF_FREE(second);
+    return (ret > 0) ? 0 : ret;
+}
+
+/*
+ * Fill a fresh trie with the words volopt_selector() picks for lvl/patt, ask
+ * trie_measure_vec() for up to 'hints' nodes for word and format them with
+ * process_nodevec(). *outputhint is not written when no node is found.
+ */
+static int
+volopt_trie_section(int lvl, char **patt, char *word, char **outputhint,
+                    char *inputhint, int hints)
+{
+    trienode_t *nodes[] = {NULL, NULL};
+    struct trienodevec nodevec = {nodes, 2};
+    int ret = -1;
+    trie_t *trie = trie_new();
+
+    if (!trie)
+        return -1;
+
+    if (volopt_selector(lvl, patt, trie, &volopt_trie_cbk))
+        goto out;
+
+    GF_ASSERT(hints <= 2);
+    nodevec.cnt = hints;
+    ret = trie_measure_vec(trie, word, &nodevec);
+    if (ret == 0 && nodevec.nodes[0] != NULL)
+        ret = process_nodevec(&nodevec, outputhint, inputhint);
+out:
+    trie_destroy(trie);
+    return ret;
+}
+
+/* Look for a known option name to suggest for key. Returns 0 with *hint set
+ * to the suggestion string, or to NULL when there is none; when key contains
+ * a '.', a non-zero return also leaves *hint NULL. */
+static int
+volopt_trie(char *key, char **hint)
+{
+    char *patt[] = {NULL};
+    char *joined = NULL;
+    char *domain = NULL;
+    char *dot = strchr(key, '.');
+    int ret = 0;
+
+    *hint = NULL;
+
+    /* bare word: match it against the second component of every key */
+    if (dot == NULL)
+        return volopt_trie_section(1, patt, key, hint, NULL, 2);
+
+    /* first resolve the domain part (text before the first '.') */
+    domain = gf_strdup(key);
+    if (domain == NULL)
+        return -1;
+    domain[dot - key] = '\0';
+
+    ret = volopt_trie_section(0, NULL, domain, patt, NULL, 1);
+    GF_FREE(domain);
+    if (ret != 0) {
+        patt[0] = NULL;
+        goto out;
+    }
+    if (patt[0] == NULL)
+        goto out;
+
+    /* then look for the remainder among the keys of that domain */
+    ret = volopt_trie_section(1, patt, dot + 1, hint, "...", 2);
+    if (ret != 0 || *hint == NULL)
+        goto out;
+
+    ret = gf_asprintf(&joined, "%s.%s", patt[0], *hint);
+    GF_FREE(*hint);
+    if (ret >= 0) {
+        ret = 0;
+        *hint = joined;
+    }
+
+out:
+    GF_FREE(patt[0]);
+    if (ret != 0)
+        *hint = NULL;
+
+    return ret;
+}
+
+/**************************
+ *
+ * Volume generation engine
+ *
+ **************************/
+
+/* Callback invoked once per option: gets the graph, the map entry that
+ * describes the option, and an opaque caller-supplied pointer.  A non-zero
+ * return stops further option processing. */
+typedef int (*volgen_opthandler_t)(volgen_graph_t *graph,
+                                   struct volopt_map_entry *vme, void *param);
+
+/* Context handed to process_option() while the option map is walked. */
+struct opthandler_data {
+    volgen_graph_t *graph;       /* forwarded to handler */
+    volgen_opthandler_t handler; /* callback run for each option */
+    struct volopt_map_entry *vme; /* map entry currently being processed */
+    gf_boolean_t found;          /* set once process_option() ran for vme */
+    gf_boolean_t data_t_fake;    /* value is really a char *, not a data_t */
+    int rv;                      /* last handler result; non-zero is sticky */
+    char *volname;               /* not referenced in the code visible here */
+    void *param;                 /* forwarded to handler */
+};
+
+/* Run the handler stored in @param (a struct opthandler_data) for one
+ * key/value pair.
+ *
+ * A temporary map entry is assembled from @key and the template entry in the
+ * context.  When the template has no option name, the text after the last
+ * '.' of @key (or the whole key if there is no '.') is used instead.  When
+ * data_t_fake is set, @value is really a plain char * and is used directly.
+ *
+ * Does nothing if an earlier handler call already failed.
+ */
+static void
+process_option(char *key, data_t *value, void *param)
+{
+    struct opthandler_data *ctx = param;
+    struct volopt_map_entry entry = {
+        0,
+    };
+    char *last_dot = NULL;
+
+    if (ctx->rv != 0)
+        return;
+
+    ctx->found = _gf_true;
+
+    entry.key = key;
+    entry.voltype = ctx->vme->voltype;
+    entry.op_version = ctx->vme->op_version;
+    entry.option = ctx->vme->option;
+    if (entry.option == NULL) {
+        last_dot = strrchr(key, '.');
+        entry.option = last_dot ? last_dot + 1 : key;
+    }
+
+    entry.value = ctx->data_t_fake ? (char *)value : value->data;
+
+    ctx->rv = ctx->handler(ctx->graph, &entry, ctx->param);
+}
+
+/* Walk glusterd_volopt_map and call @handler for every option.
+ *
+ * For each map entry the value stored in @dict is used when present;
+ * otherwise the entry's built-in default (if any) is used.  The entry
+ * "performance.client-io-threads" is skipped entirely when @dict has a true
+ * "skip-CLIOT" flag.
+ *
+ * Returns 0 after a full walk, or the first non-zero handler result.
+ */
+static int
+volgen_graph_set_options_generic(volgen_graph_t *graph, dict_t *dict,
+                                 void *param, volgen_opthandler_t handler)
+{
+    struct volopt_map_entry *entry = NULL;
+    struct opthandler_data ctx = {
+        0,
+    };
+    data_t *stored = NULL;
+    int keylen = 0;
+
+    ctx.graph = graph;
+    ctx.handler = handler;
+    ctx.param = param;
+
+    for (entry = glusterd_volopt_map; entry->key; entry++) {
+        keylen = strlen(entry->key);
+        if (keylen == SLEN("performance.client-io-threads") &&
+            strcmp(entry->key, "performance.client-io-threads") == 0 &&
+            dict_get_str_boolean(dict, "skip-CLIOT", _gf_false) == _gf_true)
+            continue;
+
+        ctx.vme = entry;
+        ctx.found = _gf_false;
+        ctx.data_t_fake = _gf_false;
+
+        stored = dict_getn(dict, entry->key, keylen);
+        if (stored != NULL)
+            process_option(entry->key, stored, &ctx);
+        if (ctx.rv != 0)
+            return ctx.rv;
+
+        /* Nothing more to do if the dict supplied a value or the entry
+         * has no default. */
+        if (ctx.found || entry->value == NULL)
+            continue;
+
+        /* The default is a plain string; flag it so process_option()
+         * does not treat the pointer as a real data_t. */
+        ctx.data_t_fake = _gf_true;
+        process_option(entry->key, (data_t *)entry->value, &ctx);
+        if (ctx.rv != 0)
+            return ctx.rv;
+    }
+
+    return 0;
+}
+
+/* Set vme->option = vme->value on every xlator in @graph whose type equals
+ * vme->voltype.
+ *
+ * The pseudo option "ta-remote-port" is special: it is applied as
+ * "remote-port", and only to xlators whose name contains "-ta-".
+ *
+ * Returns 0 on success or the first non-zero xlator_set_option() result.
+ * @param is unused.
+ */
+static int
+no_filter_option_handler(volgen_graph_t *graph, struct volopt_map_entry *vme,
+                         void *param)
+{
+    xlator_t *trav;
+    int ret = 0;
+
+    for (trav = first_of(graph); trav; trav = trav->next) {
+        if (strcmp(trav->type, vme->voltype) != 0)
+            continue;
+        if (strcmp(vme->option, "ta-remote-port") == 0) {
+            if (strstr(trav->name, "-ta-") != NULL) {
+                /* The length must describe the key actually passed
+                 * ("remote-port"), not the longer pseudo option name. */
+                ret = xlator_set_option(trav, "remote-port",
+                                        strlen("remote-port"), vme->value);
+                if (ret)
+                    break;
+            }
+            continue;
+        }
+        ret = xlator_set_option(trav, vme->option, strlen(vme->option),
+                                vme->value);
+        if (ret)
+            break;
+    }
+    return ret;
+}
+
+/* Apply an option to the graph unless its option name starts with '!'.
+ * Such names are ignored here and this returns 0; everything else is
+ * delegated to no_filter_option_handler().
+ */
+static int
+basic_option_handler(volgen_graph_t *graph, struct volopt_map_entry *vme,
+                     void *param)
+{
+    if (vme->option[0] == '!')
+        return 0;
+
+    return no_filter_option_handler(graph, vme, param);
+}
+
+/* Apply every option from @dict (or its default) to @graph using
+ * basic_option_handler with no extra parameter. */
+static int
+volgen_graph_set_options(volgen_graph_t *graph, dict_t *dict)
+{
+    volgen_opthandler_t handler = &basic_option_handler;
+
+    return volgen_graph_set_options_generic(graph, dict, NULL, handler);
+}
+
+/* Option handler used for lookups: @param is a map entry whose key names
+ * the wanted option.  When @vme carries that key, its value is copied into
+ * the wanted entry.  Always returns 0; @graph is unused.
+ */
+static int
+optget_option_handler(volgen_graph_t *graph, struct volopt_map_entry *vme,
+                      void *param)
+{
+    struct volopt_map_entry *wanted = param;
+
+    if (strcmp(vme->key, wanted->key) != 0)
+        return 0;
+
+    wanted->value = vme->value;
+    return 0;
+}
+
+/* Look up @key taking option defaults into account as well.
+ *
+ * The whole option map is walked with optget_option_handler; *value ends up
+ * pointing at the stored or default value, or NULL if the key was not seen.
+ * Returns 0 on success, -1 (after logging) if the walk fails.
+ */
+static int
+volgen_dict_get(dict_t *dict, char *key, char **value)
+{
+    struct volopt_map_entry probe = {
+        0,
+    };
+
+    probe.key = key;
+
+    if (volgen_graph_set_options_generic(NULL, dict, &probe,
+                                         &optget_option_handler) != 0) {
+        gf_msg("glusterd", GF_LOG_ERROR, ENOMEM, GD_MSG_NO_MEMORY,
+               "Out of memory");
+        return -1;
+    }
+
+    *value = probe.value;
+    return 0;
+}
+
+/* Expand a short option name (no domain prefix) to its full map key.
+ *
+ * @key is compared with the part after the first '.' of every key in
+ * glusterd_volopt_map.  If exactly one distinct map key matches, a copy of
+ * it is returned in *completion (the caller frees it).  If nothing matches
+ * or the match is ambiguous, *completion is NULL.
+ *
+ * Returns 0 on success (including "no completion"), -1 if the copy could
+ * not be allocated.
+ */
+static int
+option_complete(char *key, char **completion)
+{
+    struct volopt_map_entry *vme = NULL;
+    char *dot = NULL;
+
+    *completion = NULL;
+    for (vme = glusterd_volopt_map; vme->key; vme++) {
+        /* A map key without a '.' has no short form; skip it instead of
+         * computing NULL + 1. */
+        dot = strchr(vme->key, '.');
+        if (!dot || strcmp(dot + 1, key) != 0)
+            continue;
+
+        if (*completion && strcmp(*completion, vme->key) != 0) {
+            /* cancel on non-unique match */
+            *completion = NULL;
+
+            return 0;
+        } else
+            *completion = vme->key;
+    }
+
+    if (*completion) {
+        /* For the sake of a unified API the completion is always handed
+         * out as a string the caller has to free.
+         */
+        *completion = gf_strdup(*completion);
+        return *completion ? 0 : -1;
+    }
+
+    return 0;
+}
+
+/* Fetch option @key for a volume from volinfo->dict, falling back to the
+ * option's default; see volgen_dict_get() for the result contract. */
+int
+glusterd_volinfo_get(glusterd_volinfo_t *volinfo, char *key, char **value)
+{
+    dict_t *options = volinfo->dict;
+
+    return volgen_dict_get(options, key, value);
+}
+
+/* Read option @key of a volume as a boolean.
+ *
+ * Returns the boolean value (false when the option has no value at all),
+ * or -1 if the lookup fails or the stored string is not a valid boolean.
+ */
+int
+glusterd_volinfo_get_boolean(glusterd_volinfo_t *volinfo, char *key)
+{
+    char *val = NULL;
+    gf_boolean_t enabled = _gf_false;
+
+    if (glusterd_volinfo_get(volinfo, key, &val) != 0)
+        return -1;
+
+    /* No value and no default: report "disabled". */
+    if (val == NULL)
+        return enabled;
+
+    if (gf_string2boolean(val, &enabled) != 0) {
+        gf_msg("glusterd", GF_LOG_ERROR, EINVAL, GD_MSG_INVALID_ENTRY,
+               "value for %s option is not valid", key);
+
+        return -1;
+    }
+
+    return enabled;
+}
+
+/* Tell whether the map entry whose key equals @key has any of the bits in
+ * @flags set.  Returns _gf_false when no entry matches.
+ *
+ * COMPLETE_OPTION is a macro defined elsewhere in the file; it is invoked
+ * with key, completion and ret before the map is searched.
+ */
+gf_boolean_t
+glusterd_check_voloption_flags(char *key, int32_t flags)
+{
+    char *completion = NULL;
+    struct volopt_map_entry *vmep = NULL;
+    int ret = 0;
+
+    COMPLETE_OPTION(key, completion, ret);
+    for (vmep = glusterd_volopt_map; vmep->key; vmep++) {
+        if (strcmp(vmep->key, key) != 0)
+            continue;
+
+        return (vmep->flags & flags) ? _gf_true : _gf_false;
+    }
+
+    return _gf_false;
+}
+
+/* Tell whether the map entry whose key equals @key is a global option,
+ * i.e. its type is GLOBAL_DOC or GLOBAL_NO_DOC.  Returns _gf_false when no
+ * entry matches.
+ *
+ * COMPLETE_OPTION is a macro defined elsewhere in the file; it is invoked
+ * with key, completion and ret before the map is searched.
+ */
+gf_boolean_t
+glusterd_check_globaloption(char *key)
+{
+    char *completion = NULL;
+    struct volopt_map_entry *vmep = NULL;
+    int ret = 0;
+
+    COMPLETE_OPTION(key, completion, ret);
+    for (vmep = glusterd_volopt_map; vmep->key; vmep++) {
+        if (strcmp(vmep->key, key) != 0)
+            continue;
+
+        if (vmep->type == GLOBAL_DOC || vmep->type == GLOBAL_NO_DOC)
+            return _gf_true;
+        return _gf_false;
+    }
+
+    return _gf_false;
+}
+
+/* Tell whether the map entry whose key equals @key is a per-volume option,
+ * i.e. its type is DOC or NO_DOC.  Returns _gf_false when no entry matches.
+ *
+ * COMPLETE_OPTION is a macro defined elsewhere in the file; it is invoked
+ * with key, completion and ret before the map is searched.
+ */
+gf_boolean_t
+glusterd_check_localoption(char *key)
+{
+    char *completion = NULL;
+    struct volopt_map_entry *vmep = NULL;
+    int ret = 0;
+
+    COMPLETE_OPTION(key, completion, ret);
+    for (vmep = glusterd_volopt_map; vmep->key; vmep++) {
+        if (strcmp(vmep->key, key) != 0)
+            continue;
+
+        if (vmep->type == DOC || vmep->type == NO_DOC)
+            return _gf_true;
+        return _gf_false;
+    }
+
+    return _gf_false;
+}
+
+/* Check whether @key names a known volume option.
+ *
+ * Returns 1 if the option exists, 0 if it does not, and a non-zero error
+ * value (-1 for allocation failure, otherwise whatever volopt_trie()
+ * reports) on failure.
+ *
+ * @completion may be NULL.  When it is given:
+ *  - for a key without '.', a unique full key is returned through it and
+ *    the option counts as existing;
+ *  - when the option does not exist, volopt_trie() is asked for a spelling
+ *    hint which is returned through it (possibly NULL).
+ */
+int
+glusterd_check_option_exists(char *key, char **completion)
+{
+    struct volopt_map_entry *vmep = NULL;
+    int ret = 0;
+    xlator_t *this = THIS;
+
+    if (strchr(key, '.') == NULL) {
+        /* Short names can only be resolved through completion. */
+        if (completion == NULL)
+            return 0;
+
+        if (option_complete(key, completion) != 0) {
+            gf_msg(this->name, GF_LOG_ERROR, ENOMEM, GD_MSG_NO_MEMORY,
+                   "Out of memory");
+            return -1;
+        }
+
+        if (*completion != NULL)
+            return 1;
+    } else {
+        for (vmep = glusterd_volopt_map; vmep->key; vmep++) {
+            if (strcmp(vmep->key, key) == 0)
+                return 1;
+        }
+
+        if (completion == NULL)
+            return 0;
+    }
+
+    /* Unknown option and the caller wants a hint. */
+    ret = volopt_trie(key, completion);
+    if (ret) {
+        gf_msg(this->name, GF_LOG_ERROR, errno, GD_MSG_ERROR_ENCOUNTERED,
+               "Some error occurred during keyword hinting");
+    }
+
+    return ret;
+}
+
+/* Run the validation callback registered for option @key, if any.
+ *
+ * @key may be the full map key or just the part after its first '.'.  Only
+ * the first matching map entry that has a validate_fn is used.  A
+ * non-global option cannot be validated without a @volinfo.
+ *
+ * Returns 0 if validation passed or no validator applies, -1 on invalid
+ * arguments or a missing volinfo, otherwise the validator's result (which
+ * may also set *op_errstr).
+ */
+int
+glusterd_volopt_validate(glusterd_volinfo_t *volinfo, dict_t *dict, char *key,
+                         char *value, char **op_errstr)
+{
+    struct volopt_map_entry *vme = NULL;
+    const char *dot = NULL;
+    int ret = 0;
+    xlator_t *this = THIS;
+
+    if (!dict || !key || !value) {
+        /* Never hand a NULL pointer to a %s conversion. */
+        gf_msg_callingfn(this->name, GF_LOG_WARNING, EINVAL,
+                         GD_MSG_INVALID_ENTRY,
+                         "Invalid "
+                         "Arguments (dict=%p, key=%s, value=%s)",
+                         dict, key ? key : "(null)",
+                         value ? value : "(null)");
+        return -1;
+    }
+
+    for (vme = &glusterd_volopt_map[0]; vme->key; vme++) {
+        if (!vme->validate_fn)
+            continue;
+
+        /* Match the full key, or the short form after the first '.'
+         * when the map key has one. */
+        dot = strchr(vme->key, '.');
+        if (strcmp(key, vme->key) != 0 &&
+            (!dot || strcmp(key, dot + 1) != 0))
+            continue;
+
+        if ((vme->type != GLOBAL_DOC && vme->type != GLOBAL_NO_DOC) &&
+            !volinfo) {
+            gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_INVALID_ENTRY,
+                   "%s is not"
+                   " a global option",
+                   vme->key);
+            ret = -1;
+            goto out;
+        }
+
+        ret = vme->validate_fn(volinfo, dict, key, value, op_errstr);
+        break;
+    }
+out:
+    return ret;
+}
+
+/* Return a newly allocated transport name for @ttype: "rdma" for RDMA,
+ * "tcp" for TCP and for the combined TCP+RDMA type.  An unknown type is
+ * logged and yields NULL.  The caller owns the returned string.
+ */
+char *
+glusterd_get_trans_type_rb(gf_transport_type ttype)
+{
+    char *trans_type = NULL;
+
+    if (ttype == GF_TRANSPORT_RDMA) {
+        gf_asprintf(&trans_type, "rdma");
+    } else if (ttype == GF_TRANSPORT_TCP ||
+               ttype == GF_TRANSPORT_BOTH_TCP_RDMA) {
+        gf_asprintf(&trans_type, "tcp");
+    } else {
+        gf_msg(THIS->name, GF_LOG_ERROR, EINVAL, GD_MSG_INVALID_ENTRY,
+               "Unknown "
+               "transport type");
+    }
+
+    return trans_type;
+}
+
+/* Link the first @child_count xlators of the list starting at @children to
+ * @parent.  The list is first followed forward to its @child_count-th
+ * element and the links are then made walking backwards via ->prev.
+ *
+ * Returns 0 on success, -1 if @child_count is 0, or the failing
+ * volgen_xlator_link() result.
+ */
+static int
+_xl_link_children(xlator_t *parent, xlator_t *children, size_t child_count)
+{
+    xlator_t *node = NULL;
+    size_t idx = 0;
+    int ret = -1;
+    xlator_t *this = THIS;
+    GF_ASSERT(this);
+
+    if (child_count == 0)
+        return -1;
+
+    /* Step forward to the last child that has to be linked. */
+    node = children;
+    for (idx = 1; idx < child_count; idx++)
+        node = node->next;
+
+    for (idx = 0; idx < child_count; idx++, node = node->prev) {
+        ret = volgen_xlator_link(parent, node);
+        gf_msg_debug(this->name, 0, "%s:%s", parent->name, node->name);
+        if (ret) {
+            gf_smsg(this->name, GF_LOG_ERROR, errno, GD_MSG_XLATOR_LINK_FAIL,
+                    NULL);
+            return ret;
+        }
+    }
+
+    return 0;
+}
+
+/* Merge @sgraph into @dgraph: the first @child_count xlators of @sgraph
+ * become children of @dgraph's first xlator, @sgraph's xlator list is
+ * appended to the end of @dgraph's list, and the xlator count is updated.
+ *
+ * Returns 0 on success or the _xl_link_children() error; on error the
+ * lists are left unjoined.
+ */
+static int
+volgen_graph_merge_sub(volgen_graph_t *dgraph, volgen_graph_t *sgraph,
+                       size_t child_count)
+{
+    xlator_t *tail = NULL;
+    int ret = 0;
+
+    GF_ASSERT(dgraph->graph.first);
+
+    ret = _xl_link_children(first_of(dgraph), first_of(sgraph), child_count);
+    if (ret)
+        return ret;
+
+    /* Find the current end of the destination list. */
+    tail = first_of(dgraph);
+    while (tail->next)
+        tail = tail->next;
+
+    tail->next = first_of(sgraph);
+    tail->next->prev = tail;
+    dgraph->graph.xl_count += sgraph->graph.xl_count;
+
+    return ret;
+}
+
+/* Run every executable regular file found in FILTERDIR with @orig_volfile
+ * as its single argument.
+ *
+ * Entries "." and "..", anything that cannot be stat'ed, non-regular files
+ * and files without execute permission are skipped.  Symlinks are followed.
+ * A filter that fails is logged and the remaining filters still run.
+ * Nothing happens if FILTERDIR cannot be opened.
+ */
+static void
+volgen_apply_filters(char *orig_volfile)
+{
+    DIR *filterdir = NULL;
+    struct dirent *entry = NULL;
+    struct dirent scratch[2] = {
+        {
+            0,
+        },
+    };
+    struct stat statbuf = {
+        0,
+    };
+    char filterpath[PATH_MAX] = {
+        0,
+    };
+
+    filterdir = sys_opendir(FILTERDIR);
+
+    if (!filterdir)
+        return;
+
+    for (;;) {
+        /* errno is cleared so that end-of-directory can be told apart
+         * from a read failure afterwards. */
+        errno = 0;
+
+        entry = sys_readdir(filterdir, scratch);
+
+        if (!entry || errno != 0) {
+            /* A NULL entry with errno still 0 is the normal end of the
+             * directory; only a real failure is worth an error log. */
+            if (errno != 0)
+                gf_smsg("glusterd", GF_LOG_ERROR, errno, GD_MSG_READ_ERROR,
+                        NULL);
+            break;
+        }
+
+        if (strcmp(entry->d_name, ".") == 0 || strcmp(entry->d_name, "..") == 0)
+            continue;
+        /*
+         * d_type isn't guaranteed to be present/valid on all systems,
+         * so do an explicit stat instead.
+         */
+        (void)snprintf(filterpath, sizeof(filterpath), "%s/%s", FILTERDIR,
+                       entry->d_name);
+
+        /* Deliberately use stat instead of lstat to allow symlinks. */
+        if (sys_stat(filterpath, &statbuf) == -1)
+            continue;
+
+        if (!S_ISREG(statbuf.st_mode))
+            continue;
+        /*
+         * We could check the mode in statbuf directly, or just skip
+         * this entirely and check for EPERM after exec fails, but this
+         * is cleaner.
+         */
+        if (sys_access(filterpath, X_OK) != 0)
+            continue;
+
+        if (runcmd(filterpath, orig_volfile, NULL)) {
+            gf_msg("glusterd", GF_LOG_ERROR, 0, GD_MSG_FILTER_RUN_FAILED,
+                   "failed to run filter %s", entry->d_name);
+        }
+    }
+
+    (void)sys_closedir(filterdir);
+}
+
+/* Write @graph to @filename.
+ *
+ * The graph is first printed into "<filename>.tmp" (created with owner
+ * read/write permission only) and then renamed over @filename, after which
+ * the volfile filters are run on the final file.
+ *
+ * Returns 0 on success, -1 (after logging) on any failure.
+ */
+static int
+volgen_write_volfile(volgen_graph_t *graph, char *filename)
+{
+    char *tmppath = NULL;
+    FILE *out = NULL;
+    int fd = 0;
+    int close_rc = 0;
+    xlator_t *this = THIS;
+
+    if (gf_asprintf(&tmppath, "%s.tmp", filename) == -1) {
+        tmppath = NULL;
+        goto fail;
+    }
+
+    /* Create the temporary file up front so it gets restrictive
+     * permissions, then reopen it as a stdio stream. */
+    fd = sys_creat(tmppath, S_IRUSR | S_IWUSR);
+    if (fd < 0) {
+        gf_msg(this->name, GF_LOG_ERROR, errno, GD_MSG_FILE_OP_FAILED,
+               "file creation failed");
+        goto fail;
+    }
+    sys_close(fd);
+
+    out = fopen(tmppath, "w");
+    if (out == NULL)
+        goto fail;
+
+    if (glusterfs_graph_print_file(out, &graph->graph) == -1)
+        goto fail;
+
+    /* Whatever fclose() reports, the stream must not be closed a second
+     * time in the failure path, so forget it right away. */
+    close_rc = fclose(out);
+    out = NULL;
+    if (close_rc != 0) {
+        gf_msg(THIS->name, GF_LOG_ERROR, errno, GD_MSG_FILE_OP_FAILED,
+               "fclose on the file %s "
+               "failed",
+               tmppath);
+        goto fail;
+    }
+
+    if (sys_rename(tmppath, filename) == -1)
+        goto fail;
+
+    GF_FREE(tmppath);
+
+    volgen_apply_filters(filename);
+
+    return 0;
+
+fail:
+
+    GF_FREE(tmppath);
+    if (out != NULL)
+        fclose(out);
+
+    gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_VOLFILE_CREATE_FAIL,
+           "failed to create volfile %s", filename);
+
+    return -1;
+}
+
+/* Destroy every xlator in @graph's list.  The successor of each xlator is
+ * read before that xlator is destroyed. */
+static void
+volgen_graph_free(volgen_graph_t *graph)
+{
+    xlator_t *cur = first_of(graph);
+    xlator_t *following = NULL;
+
+    while (cur != NULL) {
+        following = cur->next;
+        xlator_destroy(cur);
+        cur = following;
+    }
+}
+
+/* Build a graph with @builder and then apply the volume options to it.
+ *
+ * When @mod_dict is given, the builder and the option pass see a private
+ * copy of volinfo->dict with @mod_dict copied on top of it; otherwise they
+ * work on volinfo->dict directly.
+ *
+ * Returns -1 if the private dict cannot be created, otherwise the result
+ * of @builder or, if that succeeded, of volgen_graph_set_options().
+ */
+static int
+build_graph_generic(volgen_graph_t *graph, glusterd_volinfo_t *volinfo,
+                    dict_t *mod_dict, void *param,
+                    int (*builder)(volgen_graph_t *graph,
+                                   glusterd_volinfo_t *volinfo,
+                                   dict_t *set_dict, void *param))
+{
+    dict_t *effective = volinfo->dict;
+    int ret = 0;
+
+    if (mod_dict) {
+        effective = dict_copy_with_ref(volinfo->dict, NULL);
+        if (!effective)
+            return -1;
+        /* XXX dict_copy swallows errors */
+        dict_copy(mod_dict, effective);
+    }
+
+    ret = builder(graph, volinfo, effective, param);
+    if (ret == 0)
+        ret = volgen_graph_set_options(graph, effective);
+
+    /* Only the private copy carries a reference of our own. */
+    if (mod_dict)
+        dict_unref(effective);
+
+    return ret;
+}
+
+/* Map a transport name to its enum value: "rdma" and "tcp,rdma" map to
+ * their own types, every other string (including "tcp") maps to TCP. */
+static gf_transport_type
+transport_str_to_type(char *tt)
+{
+    if (strcmp("rdma", tt) == 0)
+        return GF_TRANSPORT_RDMA;
+
+    if (strcmp("tcp,rdma", tt) == 0)
+        return GF_TRANSPORT_BOTH_TCP_RDMA;
+
+    return GF_TRANSPORT_TCP;
+}
+
+/* Copy the name of transport @type ("rdma", "tcp" or "tcp,rdma") into @tt.
+ * @tt is left untouched for any other value.  The caller must provide a
+ * buffer large enough for the longest name plus the terminating NUL.
+ */
+static void
+transport_type_to_str(gf_transport_type type, char *tt)
+{
+    const char *name = NULL;
+
+    switch (type) {
+        case GF_TRANSPORT_RDMA:
+            name = "rdma";
+            break;
+        case GF_TRANSPORT_TCP:
+            name = "tcp";
+            break;
+        case GF_TRANSPORT_BOTH_TCP_RDMA:
+            name = "tcp,rdma";
+            break;
+    }
+
+    if (name != NULL)
+        strcpy(tt, name);
+}
+
+/* Write the name of the volume's configured transport type into @tt. */
+static void
+get_vol_transport_type(glusterd_volinfo_t *volinfo, char *tt)
+{
+    gf_transport_type configured = volinfo->transport_type;
+
+    transport_type_to_str(configured, tt);
+}
+
+#ifdef BUILD_GNFS
+/* Write the transport name to use for NFS into @tt.  A volume configured
+ * for both tcp and rdma uses "tcp" (and this is logged); any other volume
+ * uses the transport type recorded in volinfo.
+ */
+static void
+get_vol_nfs_transport_type(glusterd_volinfo_t *volinfo, char *tt)
+{
+    if (volinfo->transport_type != GF_TRANSPORT_BOTH_TCP_RDMA) {
+        transport_type_to_str(volinfo->transport_type, tt);
+        return;
+    }
+
+    strcpy(tt, "tcp");
+    gf_msg("glusterd", GF_LOG_INFO, 0, GD_MSG_DEFAULT_OPT_INFO,
+           "The default transport type for tcp,rdma volume "
+           "is tcp if option is not defined by the user ");
+}
+#endif
+
+/* Fill @transt with the transport type to use.
+ *
+ * For a client (@is_nfs false) the "client-transport-type" entry of
+ * @set_dict wins; without it the volume's own transport type is used.
+ * For NFS (@is_nfs true, only when built with BUILD_GNFS) the
+ * "nfs.transport-type" entry wins, with the NFS default as fallback.
+ * Without BUILD_GNFS the NFS case leaves @transt untouched.
+ */
+static void
+get_transport_type(glusterd_volinfo_t *volinfo, dict_t *set_dict, char *transt,
+                   gf_boolean_t is_nfs)
+{
+    char *configured = NULL;
+    int ret = -1;
+
+    if (is_nfs) {
+#ifdef BUILD_GNFS
+        ret = dict_get_str_sizen(set_dict, "nfs.transport-type", &configured);
+        if (ret)
+            get_vol_nfs_transport_type(volinfo, transt);
+#endif
+    } else {
+        ret = dict_get_str_sizen(set_dict, "client-transport-type",
+                                 &configured);
+        if (ret)
+            get_vol_transport_type(volinfo, transt);
+    }
+
+    /* ret == 0 means the dict supplied an explicit value. */
+    if (ret == 0)
+        strcpy(transt, configured);
+}
+
+/* Handle map entries whose option is "!server-auth".
+ *
+ * The part of vme->key after its first '.' (e.g. "allow" or "reject") is
+ * combined with the "auth-path" option of the graph's first xlator into
+ * "auth.addr.<auth-path>.<suffix>", and that option is set to vme->value
+ * on the same xlator.
+ *
+ * Returns 0 for unrelated options and on success, -1 on any failure.
+ * @param is unused.
+ */
+static int
+server_auth_option_handler(volgen_graph_t *graph, struct volopt_map_entry *vme,
+                           void *param)
+{
+    xlator_t *xl = NULL;
+    char *optname = NULL;
+    char *suffix = NULL;
+    char *auth_path = NULL;
+    int len = 0;
+    int ret = 0;
+
+    if (strcmp(vme->option, "!server-auth") != 0)
+        return 0;
+
+    xl = first_of(graph);
+
+    /* from 'auth.allow' -> 'allow', and 'auth.reject' -> 'reject' */
+    suffix = strchr(vme->key, '.') + 1;
+
+    if (xlator_get_fixed_option(xl, "auth-path", &auth_path)) {
+        gf_msg("glusterd", GF_LOG_ERROR, 0, GD_MSG_DEFAULT_OPT_INFO,
+               "Failed to get auth-path from server graph");
+        return -1;
+    }
+
+    len = gf_asprintf(&optname, "auth.addr.%s.%s", auth_path, suffix);
+    if (len == -1)
+        return -1;
+
+    ret = xlator_set_option(xl, optname, len, vme->value);
+    GF_FREE(optname);
+
+    return ret ? -1 : 0;
+}
+
+/* Translate "!client-log-level" / "!brick-log-level" into a plain
+ * "log-level" option.  Entries with another option name, or whose key does
+ * not contain the role string passed in @param, are ignored (returns 0).
+ */
+static int
+loglevel_option_handler(volgen_graph_t *graph, struct volopt_map_entry *vme,
+                        void *param)
+{
+    char *role = param;
+    struct volopt_map_entry mapped = {
+        0,
+    };
+
+    if (strcmp(vme->option, "!client-log-level") != 0 &&
+        strcmp(vme->option, "!brick-log-level") != 0)
+        return 0;
+
+    if (strstr(vme->key, role) == NULL)
+        return 0;
+
+    /* Work on a copy so the shared map entry is not modified. */
+    mapped = *vme;
+    mapped.option = "log-level";
+
+    return basic_option_handler(graph, &mapped, NULL);
+}
+
+/* Translate "!client-threads" / "!brick-threads" into a plain "threads"
+ * option.  Entries with another option name, or whose key does not contain
+ * the role string passed in @param, are ignored (returns 0).
+ */
+static int
+threads_option_handler(volgen_graph_t *graph, struct volopt_map_entry *vme,
+                       void *param)
+{
+    char *role = param;
+    struct volopt_map_entry mapped = {
+        0,
+    };
+
+    if (strcmp(vme->option, "!client-threads") != 0 &&
+        strcmp(vme->option, "!brick-threads") != 0)
+        return 0;
+
+    if (strstr(vme->key, role) == NULL)
+        return 0;
+
+    /* Work on a copy so the shared map entry is not modified. */
+    mapped = *vme;
+    mapped.option = "threads";
+
+    return basic_option_handler(graph, &mapped, NULL);
+}
+
+/* Refuse to turn the changelog off while geo-replication is configured.
+ *
+ * Only acts on entries whose option is "changelog" and whose new value
+ * parses as false.  If the volume currently has VKEY_CHANGELOG enabled and
+ * glusterd_check_geo_rep_configured() reports an active configuration, an
+ * error string is stored on @graph and -1 is returned.
+ *
+ * Returns 0 when the change is acceptable or the entry is unrelated, and
+ * non-zero when the value cannot be parsed, the current state cannot be
+ * read, or the change is refused.
+ */
+static int
+server_check_changelog_off(volgen_graph_t *graph, struct volopt_map_entry *vme,
+                           glusterd_volinfo_t *volinfo)
+{
+    gf_boolean_t enabled = _gf_false;
+    int ret = 0;
+
+    GF_ASSERT(volinfo);
+    GF_ASSERT(vme);
+
+    if (strcmp(vme->option, "changelog") != 0)
+        return 0;
+
+    /* Turning the option on never needs a check. */
+    ret = gf_string2boolean(vme->value, &enabled);
+    if (ret || enabled)
+        goto out;
+
+    ret = glusterd_volinfo_get_boolean(volinfo, VKEY_CHANGELOG);
+    if (ret < 0) {
+        gf_msg("glusterd", GF_LOG_WARNING, 0, GD_MSG_CHANGELOG_GET_FAIL,
+               "failed to get the changelog status");
+        ret = -1;
+        goto out;
+    }
+
+    if (ret) {
+        enabled = _gf_false;
+        glusterd_check_geo_rep_configured(volinfo, &enabled);
+
+        if (enabled) {
+            /* The adjacent literals used to join as "activefor". */
+            gf_msg("glusterd", GF_LOG_WARNING, 0, GD_MSG_XLATOR_SET_OPT_FAIL,
+                   GEOREP
+                   " sessions active "
+                   "for the volume %s, cannot disable changelog ",
+                   volinfo->volname);
+            set_graph_errstr(graph, VKEY_CHANGELOG
+                             " cannot be disabled "
+                             "while " GEOREP " sessions exist");
+            ret = -1;
+            goto out;
+        }
+    }
+
+    ret = 0;
+out:
+    gf_msg_debug("glusterd", 0, "Returning %d", ret);
+    return ret;
+}
+
+/* Refuse to turn the xtime marker off while geo-replication is configured.
+ *
+ * Only acts on entries whose option is "!xtime" and whose new value parses
+ * as false.  If the volume currently has VKEY_MARKER_XTIME enabled and
+ * glusterd_check_geo_rep_configured() reports an active configuration, an
+ * error string is stored on @graph and -1 is returned.
+ *
+ * Returns 0 when the change is acceptable or the entry is unrelated, and
+ * non-zero when the value cannot be parsed, the current state cannot be
+ * read, or the change is refused.
+ */
+static int
+server_check_marker_off(volgen_graph_t *graph, struct volopt_map_entry *vme,
+                        glusterd_volinfo_t *volinfo)
+{
+    gf_boolean_t enabled = _gf_false;
+    int ret = 0;
+
+    GF_ASSERT(volinfo);
+    GF_ASSERT(vme);
+
+    if (strcmp(vme->option, "!xtime") != 0)
+        return 0;
+
+    /* Turning the option on never needs a check. */
+    ret = gf_string2boolean(vme->value, &enabled);
+    if (ret || enabled)
+        goto out;
+
+    ret = glusterd_volinfo_get_boolean(volinfo, VKEY_MARKER_XTIME);
+    if (ret < 0) {
+        gf_msg("glusterd", GF_LOG_WARNING, 0, GD_MSG_MARKER_STATUS_GET_FAIL,
+               "failed to get the marker status");
+        ret = -1;
+        goto out;
+    }
+
+    if (ret) {
+        enabled = _gf_false;
+        glusterd_check_geo_rep_configured(volinfo, &enabled);
+
+        if (enabled) {
+            /* The adjacent literals used to join as "activefor". */
+            gf_msg("glusterd", GF_LOG_WARNING, 0, GD_MSG_MARKER_DISABLE_FAIL,
+                   GEOREP
+                   " sessions active "
+                   "for the volume %s, cannot disable marker ",
+                   volinfo->volname);
+            set_graph_errstr(graph, VKEY_MARKER_XTIME
+                             " cannot be disabled "
+                             "while " GEOREP " sessions exist");
+            ret = -1;
+            goto out;
+        }
+    }
+
+    ret = 0;
+out:
+    gf_msg_debug("glusterd", 0, "Returning %d", ret);
+    return ret;
+}
+
+/* Translate "!sys-log-level" into a plain "sys-log-level" option.  Entries
+ * with another option name, or whose key does not contain the role string
+ * passed in @param, are ignored (returns 0).
+ */
+static int
+sys_loglevel_option_handler(volgen_graph_t *graph, struct volopt_map_entry *vme,
+                            void *param)
+{
+    char *role = (char *)param;
+    struct volopt_map_entry mapped = {
+        0,
+    };
+
+    if (strcmp(vme->option, "!sys-log-level") != 0)
+        return 0;
+
+    if (strstr(vme->key, role) == NULL)
+        return 0;
+
+    /* Work on a copy so the shared map entry is not modified. */
+    mapped = *vme;
+    mapped.option = "sys-log-level";
+
+    return basic_option_handler(graph, &mapped, NULL);
+}
+
+/* Translate "!logger" into a plain "logger" option.  Entries with another
+ * option name, or whose key does not contain the role string passed in
+ * @param, are ignored (returns 0).
+ */
+static int
+logger_option_handler(volgen_graph_t *graph, struct volopt_map_entry *vme,
+                      void *param)
+{
+    char *role = (char *)param;
+    struct volopt_map_entry mapped = {
+        0,
+    };
+
+    if (strcmp(vme->option, "!logger") != 0)
+        return 0;
+
+    if (strstr(vme->key, role) == NULL)
+        return 0;
+
+    /* Work on a copy so the shared map entry is not modified. */
+    mapped = *vme;
+    mapped.option = "logger";
+
+    return basic_option_handler(graph, &mapped, NULL);
+}
+
+/* Translate "!log-format" into a plain "log-format" option.  Entries with
+ * another option name, or whose key does not contain the role string
+ * passed in @param, are ignored (returns 0).
+ */
+static int
+log_format_option_handler(volgen_graph_t *graph, struct volopt_map_entry *vme,
+                          void *param)
+{
+    char *role = (char *)param;
+    struct volopt_map_entry mapped = {
+        0,
+    };
+
+    if (strcmp(vme->option, "!log-format") != 0)
+        return 0;
+
+    if (strstr(vme->key, role) == NULL)
+        return 0;
+
+    /* Work on a copy so the shared map entry is not modified. */
+    mapped = *vme;
+    mapped.option = "log-format";
+
+    return basic_option_handler(graph, &mapped, NULL);
+}
+
+/* Translate "!cluster.localtime-logging" into the option named by
+ * GLUSTERD_LOCALTIME_LOGGING_KEY.  Entries with another option name, or
+ * whose key does not contain the role string passed in @param, are ignored
+ * (returns 0).
+ */
+static int
+log_localtime_logging_option_handler(volgen_graph_t *graph,
+                                     struct volopt_map_entry *vme, void *param)
+{
+    char *role = (char *)param;
+    struct volopt_map_entry mapped = {
+        0,
+    };
+
+    if (strcmp(vme->option, "!cluster.localtime-logging") != 0)
+        return 0;
+
+    if (strstr(vme->key, role) == NULL)
+        return 0;
+
+    /* Work on a copy so the shared map entry is not modified. */
+    mapped = *vme;
+    mapped.option = GLUSTERD_LOCALTIME_LOGGING_KEY;
+
+    return basic_option_handler(graph, &mapped, NULL);
+}
+
+/* Translate "!log-buf-size" into a plain "log-buf-size" option.  Entries
+ * with another option name, or whose key does not contain the role string
+ * passed in @param, are ignored (returns 0).
+ */
+static int
+log_buf_size_option_handler(volgen_graph_t *graph, struct volopt_map_entry *vme,
+                            void *param)
+{
+    char *role = (char *)param;
+    struct volopt_map_entry mapped = {
+        0,
+    };
+
+    if (strcmp(vme->option, "!log-buf-size") != 0)
+        return 0;
+
+    if (strstr(vme->key, role) == NULL)
+        return 0;
+
+    /* Work on a copy so the shared map entry is not modified. */
+    mapped = *vme;
+    mapped.option = "log-buf-size";
+
+    return basic_option_handler(graph, &mapped, NULL);
+}
+
+/* Translate "!log-flush-timeout" into a plain "log-flush-timeout" option.
+ * Entries with another option name, or whose key does not contain the role
+ * string passed in @param, are ignored (returns 0).
+ */
+static int
+log_flush_timeout_option_handler(volgen_graph_t *graph,
+                                 struct volopt_map_entry *vme, void *param)
+{
+    char *role = (char *)param;
+    struct volopt_map_entry mapped = {
+        0,
+    };
+
+    if (strcmp(vme->option, "!log-flush-timeout") != 0)
+        return 0;
+
+    if (strstr(vme->key, role) == NULL)
+        return 0;
+
+    /* Work on a copy so the shared map entry is not modified. */
+    mapped = *vme;
+    mapped.option = "log-flush-timeout";
+
+    return basic_option_handler(graph, &mapped, NULL);
+}
+
+/* Set "log-level" on every xlator of @graph whose type matches the glob
+ * "*<xlator>", where <xlator> and the level come from the "xlator" and
+ * "loglevel" entries of @dict.
+ *
+ * Returns the dict lookup error if either entry is missing, the first
+ * failing xlator_set_fixed_option() result, or 0 otherwise (also when no
+ * xlator matched).
+ */
+static int
+volgen_graph_set_xl_options(volgen_graph_t *graph, dict_t *dict)
+{
+    int32_t ret = -1;
+    char *xl_name = NULL;
+    char *level = NULL;
+    char pattern[1024] = {
+        0,
+    }; /* for posix* -> *posix* */
+    xlator_t *node = NULL;
+    xlator_t *this = THIS;
+    GF_ASSERT(this);
+
+    ret = dict_get_str_sizen(dict, "xlator", &xl_name);
+    if (ret) {
+        gf_smsg(this->name, GF_LOG_ERROR, errno, GD_MSG_DICT_GET_FAILED,
+                "Key=xlator", NULL);
+        return ret;
+    }
+
+    ret = dict_get_str_sizen(dict, "loglevel", &level);
+    if (ret) {
+        gf_smsg(this->name, GF_LOG_ERROR, errno, GD_MSG_DICT_GET_FAILED,
+                "Key=loglevel", NULL);
+        return ret;
+    }
+
+    snprintf(pattern, sizeof(pattern), "*%s", xl_name);
+
+    for (node = first_of(graph); node; node = node->next) {
+        if (fnmatch(pattern, node->type, FNM_NOESCAPE) != 0)
+            continue;
+
+        gf_msg_debug("glusterd", 0, "Setting log level for xlator: %s",
+                     node->type);
+        ret = xlator_set_fixed_option(node, "log-level", level);
+        if (ret)
+            break;
+    }
+
+    return ret;
+}
+
+/* Option handler for brick (server) graphs.  @param is the volinfo.
+ *
+ * Runs the server specific handlers one after another for @vme (auth,
+ * marker and changelog checks, then the logging and thread options for the
+ * "brick" role) and stops at the first one that returns non-zero.
+ */
+static int
+server_spec_option_handler(volgen_graph_t *graph, struct volopt_map_entry *vme,
+                           void *param)
+{
+    glusterd_volinfo_t *volinfo = param;
+    int ret = 0;
+
+    ret = server_auth_option_handler(graph, vme, NULL);
+    if (ret)
+        return ret;
+
+    ret = server_check_marker_off(graph, vme, volinfo);
+    if (ret)
+        return ret;
+
+    ret = server_check_changelog_off(graph, vme, volinfo);
+    if (ret)
+        return ret;
+
+    ret = loglevel_option_handler(graph, vme, "brick");
+    if (ret)
+        return ret;
+
+    ret = sys_loglevel_option_handler(graph, vme, "brick");
+    if (ret)
+        return ret;
+
+    ret = logger_option_handler(graph, vme, "brick");
+    if (ret)
+        return ret;
+
+    ret = log_format_option_handler(graph, vme, "brick");
+    if (ret)
+        return ret;
+
+    ret = log_buf_size_option_handler(graph, vme, "brick");
+    if (ret)
+        return ret;
+
+    ret = log_flush_timeout_option_handler(graph, vme, "brick");
+    if (ret)
+        return ret;
+
+    ret = log_localtime_logging_option_handler(graph, vme, "brick");
+    if (ret)
+        return ret;
+
+    return threads_option_handler(graph, vme, "brick");
+}
+
+/* Option handler that applies the server auth option and then the
+ * per-xlator log level taken from the dict passed in @param (which must
+ * not be NULL).  Returns the first non-zero result, or 0.
+ */
+static int
+server_spec_extended_option_handler(volgen_graph_t *graph,
+                                    struct volopt_map_entry *vme, void *param)
+{
+    dict_t *dict = NULL;
+    int ret = 0;
+
+    GF_ASSERT(param);
+    dict = (dict_t *)param;
+
+    ret = server_auth_option_handler(graph, vme, NULL);
+    if (ret)
+        return ret;
+
+    return volgen_graph_set_xl_options(graph, dict);
+}
+
+/* Forward declaration; the definition is not part of this section. */
+static void
+get_vol_tstamp_file(char *filename, glusterd_volinfo_t *volinfo);
+
+/* Build the graph for a gfproxy server: a client graph with a
+ * "protocol/server" xlator on top.
+ *
+ * While the client part is built, @set_dict temporarily carries the
+ * "trusted-client" and "gfproxy-server" markers; both are removed again
+ * afterwards.  The server xlator gets the volume's transport type, the
+ * login credentials (when available) and the auth-path
+ * "gfproxyd-<volname>".
+ *
+ * Returns 0 on success and non-zero on failure.  @param is unused.
+ */
+static int
+gfproxy_server_graph_builder(volgen_graph_t *graph, glusterd_volinfo_t *volinfo,
+                             dict_t *set_dict, void *param)
+{
+    xlator_t *xl = NULL;
+    char transt[16] = {
+        0,
+    };
+    char key[1024] = {
+        0,
+    };
+    int keylen;
+    int ret = 0;
+    char *username = NULL;
+    char *password = NULL;
+
+    xlator_t *this = THIS;
+    GF_ASSERT(this);
+    /* We are a trusted client */
+    ret = dict_set_uint32(set_dict, "trusted-client", GF_CLIENT_TRUSTED);
+    if (ret != 0) {
+        gf_smsg(this->name, GF_LOG_ERROR, errno, GD_MSG_DICT_SET_FAILED,
+                "Key=trusted-client", NULL);
+        goto out;
+    }
+
+    ret = dict_set_int32_sizen(set_dict, "gfproxy-server", 1);
+    if (ret != 0) {
+        gf_smsg(this->name, GF_LOG_ERROR, errno, GD_MSG_DICT_SET_FAILED,
+                "Key=gfproxy-server", NULL);
+        goto out;
+    }
+
+    /* Build the client section of the graph first.
+     * NOTE(review): the result of build_client_graph() is not checked
+     * here; confirm whether a failure should abort the build. */
+    build_client_graph(graph, volinfo, set_dict);
+
+    /* Clear this setting so that future users of set_dict do not end up
+     * thinking they are a gfproxy server */
+    dict_del_sizen(set_dict, "gfproxy-server");
+    dict_del_sizen(set_dict, "trusted-client");
+
+    /* Then add the server to it */
+    get_vol_transport_type(volinfo, transt);
+    xl = volgen_graph_add(graph, "protocol/server", volinfo->volname);
+    if (!xl) {
+        /* ret is still 0 from the dict calls above; do not report
+         * success when the server xlator could not be added. */
+        ret = -1;
+        goto out;
+    }
+
+    ret = xlator_set_fixed_option(xl, "transport-type", transt);
+    if (ret != 0)
+        goto out;
+
+    /* Set username and password */
+    username = glusterd_auth_get_username(volinfo);
+    password = glusterd_auth_get_password(volinfo);
+    if (username) {
+        keylen = snprintf(key, sizeof(key), "auth.login.gfproxyd-%s.allow",
+                          volinfo->volname);
+        ret = xlator_set_option(xl, key, keylen, username);
+        if (ret) {
+            ret = -1;
+            goto out;
+        }
+    }
+
+    /* The password key embeds the username, so it can only be built when
+     * a username exists. */
+    if (username && password) {
+        keylen = snprintf(key, sizeof(key), "auth.login.%s.password", username);
+        ret = xlator_set_option(xl, key, keylen, password);
+        if (ret != 0)
+            goto out;
+    }
+
+    snprintf(key, sizeof(key), "gfproxyd-%s", volinfo->volname);
+    ret = xlator_set_fixed_option(xl, "auth-path", key);
+
+out:
+    return ret;
+}
+
+/* Add the storage/posix translator for one brick and configure it.
+ *
+ * Options set on the new translator: "directory" (brick path), "volume-id",
+ * "update-link-count-parent" (only when quota, trash or the explicit volume
+ * option requests it), "fips-mode-rchecksum" (only when priv->op_version is
+ * at least GD_OP_VERSION_7_0) and "shared-brick-count".
+ *
+ * Returns 0 on success and a non-zero value on any failure. */
+static int
+brick_graph_add_posix(volgen_graph_t *graph, glusterd_volinfo_t *volinfo,
+                      dict_t *set_dict, glusterd_brickinfo_t *brickinfo)
+{
+    xlator_t *this = THIS;
+    glusterd_conf_t *priv = NULL;
+    xlator_t *posix_xl = NULL;
+    char *optval = NULL;
+    gf_boolean_t want_quota = _gf_true;
+    gf_boolean_t want_trash = _gf_false;
+    gf_boolean_t want_pgfid = _gf_false;
+    char share_count[10] = {0};
+    int ret = -1;
+
+    if (!graph || !volinfo || !set_dict || !brickinfo) {
+        gf_smsg(this->name, GF_LOG_ERROR, errno, GD_MSG_INVALID_ARGUMENT, NULL);
+        goto out;
+    }
+
+    priv = this->private;
+    GF_VALIDATE_OR_GOTO("glusterd", priv, out);
+
+    /* Each lookup overrides the corresponding default only when a value
+     * string comes back; an unparsable string aborts the build. */
+    ret = glusterd_volinfo_get(volinfo, VKEY_FEATURES_QUOTA, &optval);
+    if (optval != NULL) {
+        ret = gf_string2boolean(optval, &want_quota);
+        if (ret != 0)
+            goto out;
+    }
+
+    ret = glusterd_volinfo_get(volinfo, VKEY_FEATURES_TRASH, &optval);
+    if (optval != NULL) {
+        ret = gf_string2boolean(optval, &want_trash);
+        if (ret != 0)
+            goto out;
+    }
+
+    ret = glusterd_volinfo_get(volinfo, "update-link-count-parent", &optval);
+    if (optval != NULL) {
+        ret = gf_string2boolean(optval, &want_pgfid);
+        if (ret != 0)
+            goto out;
+    }
+
+    posix_xl = volgen_graph_add(graph, "storage/posix", volinfo->volname);
+    if (posix_xl == NULL) {
+        ret = -1;
+        goto out;
+    }
+
+    ret = xlator_set_fixed_option(posix_xl, "directory", brickinfo->path);
+    if (ret != 0)
+        goto out;
+
+    ret = xlator_set_fixed_option(posix_xl, "volume-id",
+                                  uuid_utoa(volinfo->volume_id));
+    if (ret != 0)
+        goto out;
+
+    if (want_quota || want_pgfid || want_trash) {
+        ret = xlator_set_fixed_option(posix_xl, "update-link-count-parent",
+                                      "on");
+        if (ret != 0)
+            goto out;
+    }
+
+    if (priv->op_version >= GD_OP_VERSION_7_0) {
+        ret = xlator_set_fixed_option(posix_xl, "fips-mode-rchecksum", "on");
+        if (ret != 0)
+            goto out;
+    }
+
+    snprintf(share_count, sizeof(share_count), "%d",
+             brickinfo->fs_share_count);
+    ret = xlator_set_fixed_option(posix_xl, "shared-brick-count", share_count);
+
+out:
+    return ret;
+}
+
+/* Add the features/selinux translator to the brick graph.
+ * Returns 0 on success, -1 on invalid arguments or if the add fails. */
+static int
+brick_graph_add_selinux(volgen_graph_t *graph, glusterd_volinfo_t *volinfo,
+                        dict_t *set_dict, glusterd_brickinfo_t *brickinfo)
+{
+    xlator_t *this = THIS;
+    GF_ASSERT(this);
+
+    if (!graph || !volinfo) {
+        gf_smsg(this->name, GF_LOG_ERROR, errno, GD_MSG_INVALID_ARGUMENT, NULL);
+        return -1;
+    }
+
+    if (volgen_graph_add(graph, "features/selinux", volinfo->volname) == NULL)
+        return -1;
+
+    return 0;
+}
+
+/* Add the features/trash translator and set its fixed options:
+ * "trash-dir" = ".trashcan", "brick-path" = the brick's path and
+ * "trash-internal-op" = "off".
+ * Returns 0 on success, non-zero on failure. */
+static int
+brick_graph_add_trash(volgen_graph_t *graph, glusterd_volinfo_t *volinfo,
+                      dict_t *set_dict, glusterd_brickinfo_t *brickinfo)
+{
+    xlator_t *trash_xl = NULL;
+    int ret = -1;
+
+    trash_xl = volgen_graph_add(graph, "features/trash", volinfo->volname);
+    if (trash_xl == NULL)
+        return ret;
+
+    ret = xlator_set_fixed_option(trash_xl, "trash-dir", ".trashcan");
+    if (ret != 0)
+        return ret;
+
+    ret = xlator_set_fixed_option(trash_xl, "brick-path", brickinfo->path);
+    if (ret != 0)
+        return ret;
+
+    return xlator_set_fixed_option(trash_xl, "trash-internal-op", "off");
+}
+
+/* Add the features/arbiter translator, but only for volumes whose
+ * arbiter_count is 1 and only on the last brick of the brick group.
+ * Returns 0 when nothing needs to be added or on success, -1 if the
+ * translator could not be added. */
+static int
+brick_graph_add_arbiter(volgen_graph_t *graph, glusterd_volinfo_t *volinfo,
+                        dict_t *set_dict, glusterd_brickinfo_t *brickinfo)
+{
+    if (volinfo->arbiter_count != 1)
+        return 0;
+
+    /* Add arbiter only if it is the last (i.e. 3rd) brick. */
+    if (get_last_brick_of_brick_group(volinfo, brickinfo) != brickinfo)
+        return 0;
+
+    if (volgen_graph_add(graph, "features/arbiter", volinfo->volname) == NULL)
+        return -1;
+
+    return 0;
+}
+
+/* Add the features/bitrot-stub translator for a brick.
+ *
+ * Sets "export" to the brick path and "bitrot" to the volume's
+ * VKEY_FEATURES_BITROT value.
+ *
+ * Returns 0 on success and a non-zero value on failure. */
+static int
+brick_graph_add_bitrot_stub(volgen_graph_t *graph, glusterd_volinfo_t *volinfo,
+                            dict_t *set_dict, glusterd_brickinfo_t *brickinfo)
+{
+    xlator_t *xl = NULL;
+    int ret = -1;
+    char *value = NULL;
+    xlator_t *this = THIS;
+
+    if (!graph || !volinfo || !set_dict || !brickinfo) {
+        gf_smsg(this->name, GF_LOG_ERROR, errno, GD_MSG_INVALID_ARGUMENT, NULL);
+        goto out;
+    }
+
+    xl = volgen_graph_add(graph, "features/bitrot-stub", volinfo->volname);
+    if (!xl)
+        goto out;
+
+    ret = xlator_set_fixed_option(xl, "export", brickinfo->path);
+    if (ret) {
+        gf_log(this->name, GF_LOG_WARNING,
+               "failed to set the export "
+               "option in bit-rot-stub");
+        goto out;
+    }
+
+    /* As elsewhere in this file, judge the lookup by whether a value came
+     * back; never hand a NULL string on as the option value. */
+    (void)glusterd_volinfo_get(volinfo, VKEY_FEATURES_BITROT, &value);
+    if (!value) {
+        gf_log(this->name, GF_LOG_WARNING,
+               "failed to get bitrot "
+               "enable option for bit-rot-stub");
+        ret = -1;
+        goto out;
+    }
+
+    ret = xlator_set_fixed_option(xl, "bitrot", value);
+    if (ret)
+        gf_log(this->name, GF_LOG_WARNING,
+               "failed to set bitrot "
+               "enable option in bit-rot-stub");
+
+out:
+    return ret;
+}
+
+/* Add the features/changelog translator for a brick.
+ *
+ * Sets "changelog-brick" to the brick path, "changelog-dir" to
+ * "<brick path>/.glusterfs/changelogs" and "changelog-notification" to
+ * "on" or "off" depending on glusterd_is_bitrot_enabled().
+ *
+ * Returns 0 on success and a non-zero value on failure. */
+static int
+brick_graph_add_changelog(volgen_graph_t *graph, glusterd_volinfo_t *volinfo,
+                          dict_t *set_dict, glusterd_brickinfo_t *brickinfo)
+{
+    char changelog_dir[PATH_MAX] = {0};
+    xlator_t *changelog_xl = NULL;
+    int32_t len = 0;
+    int bitrot = 0;
+    int ret = -1;
+    xlator_t *this = THIS;
+    GF_ASSERT(this);
+
+    if (!graph || !volinfo || !set_dict || !brickinfo) {
+        gf_smsg(this->name, GF_LOG_ERROR, errno, GD_MSG_INVALID_ARGUMENT, NULL);
+        return -1;
+    }
+
+    changelog_xl = volgen_graph_add(graph, "features/changelog",
+                                    volinfo->volname);
+    if (changelog_xl == NULL)
+        return -1;
+
+    ret = xlator_set_fixed_option(changelog_xl, "changelog-brick",
+                                  brickinfo->path);
+    if (ret != 0)
+        return ret;
+
+    /* Reject both encoding errors and truncated paths. */
+    len = snprintf(changelog_dir, sizeof(changelog_dir), "%s/%s",
+                   brickinfo->path, ".glusterfs/changelogs");
+    if ((len < 0) || (len >= sizeof(changelog_dir))) {
+        gf_smsg(this->name, GF_LOG_ERROR, errno, GD_MSG_COPY_FAIL, NULL);
+        return -1;
+    }
+
+    ret = xlator_set_fixed_option(changelog_xl, "changelog-dir", changelog_dir);
+    if (ret != 0)
+        return ret;
+
+    bitrot = glusterd_is_bitrot_enabled(volinfo);
+    if (bitrot == -1)
+        return -1;
+
+    if (bitrot)
+        return xlator_set_fixed_option(changelog_xl, "changelog-notification",
+                                       "on");
+
+    return xlator_set_fixed_option(changelog_xl, "changelog-notification",
+                                   "off");
+}
+
+/* Add the features/access-control translator unless "features.acl" is
+ * explicitly disabled in set_dict (the lookup defaults to enabled).
+ * Returns 0 on success or when the translator is skipped, -1 on failure. */
+static int
+brick_graph_add_acl(volgen_graph_t *graph, glusterd_volinfo_t *volinfo,
+                    dict_t *set_dict, glusterd_brickinfo_t *brickinfo)
+{
+    int acl_enabled = 0;
+    xlator_t *this = THIS;
+    GF_ASSERT(this);
+
+    if (!graph || !volinfo || !set_dict) {
+        gf_smsg(this->name, GF_LOG_ERROR, errno, GD_MSG_INVALID_ARGUMENT, NULL);
+        return -1;
+    }
+
+    acl_enabled = dict_get_str_boolean(set_dict, "features.acl", 1);
+    if (acl_enabled == 0) {
+        /* Option is disabled: do not create this volume at all. */
+        return 0;
+    }
+
+    if (acl_enabled < 0) {
+        /* Not treated as an error: the option is not critical and only
+         * exists as a debugging aid, so carry on and add the xlator. */
+        gf_log(THIS->name, GF_LOG_INFO,
+               "failed to get 'features.acl' flag from dict");
+    }
+
+    if (volgen_graph_add(graph, "features/access-control",
+                         volinfo->volname) == NULL)
+        return -1;
+
+    return 0;
+}
+
+/* Add the features/locks translator to the brick graph.
+ * Returns 0 on success, -1 on invalid arguments or if the add fails. */
+static int
+brick_graph_add_locks(volgen_graph_t *graph, glusterd_volinfo_t *volinfo,
+                      dict_t *set_dict, glusterd_brickinfo_t *brickinfo)
+{
+    xlator_t *this = THIS;
+    GF_ASSERT(this);
+
+    if (!graph || !volinfo || !set_dict) {
+        gf_smsg(this->name, GF_LOG_ERROR, errno, GD_MSG_INVALID_ARGUMENT, NULL);
+        return -1;
+    }
+
+    if (volgen_graph_add(graph, "features/locks", volinfo->volname) == NULL)
+        return -1;
+
+    return 0;
+}
+
+/* Add the performance/io-threads translator to the brick graph.
+ * Returns 0 on success, -1 on invalid arguments or if the add fails. */
+static int
+brick_graph_add_iot(volgen_graph_t *graph, glusterd_volinfo_t *volinfo,
+                    dict_t *set_dict, glusterd_brickinfo_t *brickinfo)
+{
+    xlator_t *this = THIS;
+    GF_ASSERT(this);
+
+    if (!graph || !volinfo || !set_dict) {
+        gf_smsg(this->name, GF_LOG_ERROR, errno, GD_MSG_INVALID_ARGUMENT, NULL);
+        return -1;
+    }
+
+    if (volgen_graph_add(graph, "performance/io-threads",
+                         volinfo->volname) == NULL)
+        return -1;
+
+    return 0;
+}
+
+/* Add the features/barrier translator to the brick graph.
+ * Returns 0 on success, -1 on invalid arguments or if the add fails. */
+static int
+brick_graph_add_barrier(volgen_graph_t *graph, glusterd_volinfo_t *volinfo,
+                        dict_t *set_dict, glusterd_brickinfo_t *brickinfo)
+{
+    xlator_t *this = THIS;
+
+    if (!graph || !volinfo) {
+        gf_smsg(this->name, GF_LOG_ERROR, errno, GD_MSG_INVALID_ARGUMENT, NULL);
+        return -1;
+    }
+
+    if (volgen_graph_add(graph, "features/barrier", volinfo->volname) == NULL)
+        return -1;
+
+    return 0;
+}
+
+/* Add the features/sdfs translator when "features.sdfs" does not evaluate
+ * to 0 in set_dict, and force its "pass-through" option to "false".
+ * Returns 0 on success or when the translator is skipped, non-zero on
+ * failure. */
+static int
+brick_graph_add_sdfs(volgen_graph_t *graph, glusterd_volinfo_t *volinfo,
+                     dict_t *set_dict, glusterd_brickinfo_t *brickinfo)
+{
+    xlator_t *sdfs_xl = NULL;
+    xlator_t *this = THIS;
+    GF_ASSERT(this);
+
+    if (!graph || !volinfo) {
+        gf_smsg(this->name, GF_LOG_ERROR, errno, GD_MSG_INVALID_ARGUMENT, NULL);
+        return -1;
+    }
+
+    /* Update the graph only if the option is enabled. */
+    if (dict_get_str_boolean(set_dict, "features.sdfs", 0) == 0)
+        return 0;
+
+    sdfs_xl = volgen_graph_add(graph, "features/sdfs", volinfo->volname);
+    if (sdfs_xl == NULL)
+        return -1;
+
+    /* Without this explicit setting the translator marks itself
+     * 'pass-through' by default. */
+    return xlator_set_fixed_option(sdfs_xl, "pass-through", "false");
+}
+
+/* Add the features/namespace translator when "features.tag-namespaces" is
+ * enabled in set_dict (the lookup defaults to disabled).
+ *
+ * Returns 0 on success or when the translator is not requested, -1 on
+ * invalid arguments, on a failed option lookup, or when the translator
+ * cannot be added. */
+static int
+brick_graph_add_namespace(volgen_graph_t *graph, glusterd_volinfo_t *volinfo,
+                          dict_t *set_dict, glusterd_brickinfo_t *brickinfo)
+{
+    xlator_t *xl = NULL;
+    int ret = -1;
+    xlator_t *this = THIS;
+    GF_ASSERT(this);
+
+    if (!graph || !volinfo || !set_dict) {
+        gf_smsg(this->name, GF_LOG_ERROR, errno, GD_MSG_INVALID_ARGUMENT, NULL);
+        goto out;
+    }
+
+    ret = dict_get_str_boolean(set_dict, "features.tag-namespaces", 0);
+    if (ret == -1)
+        goto out;
+
+    if (ret) {
+        xl = volgen_graph_add(graph, "features/namespace", volinfo->volname);
+        if (!xl) {
+            /* Report a proper error code rather than leaking the boolean
+             * option value out as the return status. */
+            ret = -1;
+            goto out;
+        }
+    }
+
+    ret = 0;
+out:
+    return ret;
+}
+
+/* Create an unlinked protocol/client translator named
+ * "<volname>-client-<index>" that points at the given peer brick
+ * (remote-host = peer->hostname, remote-subvolume = peer->path).
+ * Returns the new translator, or NULL if creation or any option set fails. */
+xlator_t *
+add_one_peer(volgen_graph_t *graph, glusterd_brickinfo_t *peer, char *volname,
+             uint16_t index)
+{
+    xlator_t *client_xl = volgen_graph_add_nolink(
+        graph, "protocol/client", "%s-client-%u", volname, index);
+
+    if (client_xl == NULL)
+        return NULL;
+
+    /* TBD: figure out where to get the proper transport list */
+    if (xlator_set_fixed_option(client_xl, "transport-type", "socket") != 0)
+        return NULL;
+
+    if (xlator_set_fixed_option(client_xl, "remote-host", peer->hostname) != 0)
+        return NULL;
+
+    if (xlator_set_fixed_option(client_xl, "remote-subvolume", peer->path) != 0)
+        return NULL;
+
+    /* TBD: deal with RDMA, SSL */
+    return client_xl;
+}
+
+/* Add the features/index translator for a brick.
+ *
+ * "index-base" is set to "<brick path>/.glusterfs/indices".  Disperse
+ * volumes additionally watch "trusted.ec.dirty"; replicate and plain
+ * (GF_CLUSTER_TYPE_NONE) volumes watch "trusted.afr.dirty" plus the
+ * "trusted.afr.<volname>-" pending xattr prefix.
+ *
+ * Returns 0 on success and a non-zero value on failure. */
+static int
+brick_graph_add_index(volgen_graph_t *graph, glusterd_volinfo_t *volinfo,
+                      dict_t *set_dict, glusterd_brickinfo_t *brickinfo)
+{
+    char indices_dir[PATH_MAX] = {0};
+    char *afr_pending = NULL;
+    xlator_t *index_xl = NULL;
+    int32_t len = 0;
+    int ret = -1;
+    xlator_t *this = THIS;
+    GF_ASSERT(this);
+
+    if (!graph || !volinfo || !brickinfo || !set_dict) {
+        gf_smsg(this->name, GF_LOG_ERROR, errno, GD_MSG_INVALID_ARGUMENT, NULL);
+        goto out;
+    }
+
+    index_xl = volgen_graph_add(graph, "features/index", volinfo->volname);
+    if (index_xl == NULL)
+        goto out;
+
+    /* Reject both encoding errors and truncated paths. */
+    len = snprintf(indices_dir, sizeof(indices_dir), "%s/%s", brickinfo->path,
+                   ".glusterfs/indices");
+    if ((len < 0) || (len >= sizeof(indices_dir))) {
+        gf_smsg(this->name, GF_LOG_ERROR, errno, GD_MSG_COPY_FAIL, NULL);
+        goto out;
+    }
+
+    ret = xlator_set_fixed_option(index_xl, "index-base", indices_dir);
+    if (ret != 0)
+        goto out;
+
+    if (volinfo->type == GF_CLUSTER_TYPE_DISPERSE) {
+        ret = xlator_set_fixed_option(index_xl, "xattrop64-watchlist",
+                                      "trusted.ec.dirty");
+        if (ret != 0)
+            goto out;
+    }
+
+    if (volinfo->type == GF_CLUSTER_TYPE_REPLICATE ||
+        volinfo->type == GF_CLUSTER_TYPE_NONE) {
+        ret = xlator_set_fixed_option(index_xl, "xattrop-dirty-watchlist",
+                                      "trusted.afr.dirty");
+        if (ret != 0)
+            goto out;
+
+        ret = gf_asprintf(&afr_pending, "trusted.afr.%s-", volinfo->volname);
+        if (ret < 0)
+            goto out;
+
+        ret = xlator_set_fixed_option(index_xl, "xattrop-pending-watchlist",
+                                      afr_pending);
+    }
+
+out:
+    /* Released on every path; still NULL if it was never allocated. */
+    GF_FREE(afr_pending);
+    return ret;
+}
+
+/* Add the features/marker translator and set its "volume-uuid",
+ * "timestamp-file" and "quota-version" options from volinfo.
+ * Returns 0 on success and a non-zero value on failure. */
+static int
+brick_graph_add_marker(volgen_graph_t *graph, glusterd_volinfo_t *volinfo,
+                       dict_t *set_dict, glusterd_brickinfo_t *brickinfo)
+{
+    char tstamp_path[PATH_MAX] = {0};
+    char uuid_str[64] = {0};
+    char quota_ver[32] = {0};
+    xlator_t *marker_xl = NULL;
+    int ret = -1;
+    xlator_t *this = THIS;
+    GF_ASSERT(this);
+
+    if (!graph || !volinfo || !set_dict) {
+        gf_smsg(this->name, GF_LOG_ERROR, errno, GD_MSG_INVALID_ARGUMENT, NULL);
+        return ret;
+    }
+
+    marker_xl = volgen_graph_add(graph, "features/marker", volinfo->volname);
+    if (marker_xl == NULL)
+        return ret;
+
+    gf_uuid_unparse(volinfo->volume_id, uuid_str);
+    ret = xlator_set_fixed_option(marker_xl, "volume-uuid", uuid_str);
+    if (ret != 0)
+        return ret;
+
+    get_vol_tstamp_file(tstamp_path, volinfo);
+    ret = xlator_set_fixed_option(marker_xl, "timestamp-file", tstamp_path);
+    if (ret != 0)
+        return ret;
+
+    snprintf(quota_ver, sizeof(quota_ver), "%d", volinfo->quota_xattr_version);
+    return xlator_set_fixed_option(marker_xl, "quota-version", quota_ver);
+}
+
+/* Add the features/quota translator.
+ *
+ * "volume-uuid" is set to the volume *name*, and "server-quota" is set to
+ * the VKEY_FEATURES_QUOTA value when the lookup yields one.
+ *
+ * Returns 0 on success and a non-zero value on failure; when no quota value
+ * is found, the result of glusterd_volinfo_get() is returned as-is. */
+static int
+brick_graph_add_quota(volgen_graph_t *graph, glusterd_volinfo_t *volinfo,
+                      dict_t *set_dict, glusterd_brickinfo_t *brickinfo)
+{
+    xlator_t *quota_xl = NULL;
+    char *quota_val = NULL;
+    int ret = -1;
+    xlator_t *this = THIS;
+    GF_ASSERT(this);
+
+    if (!graph || !volinfo || !set_dict) {
+        gf_smsg(this->name, GF_LOG_ERROR, errno, GD_MSG_INVALID_ARGUMENT, NULL);
+        return ret;
+    }
+
+    quota_xl = volgen_graph_add(graph, "features/quota", volinfo->volname);
+    if (quota_xl == NULL)
+        return ret;
+
+    ret = xlator_set_fixed_option(quota_xl, "volume-uuid", volinfo->volname);
+    if (ret != 0)
+        return ret;
+
+    ret = glusterd_volinfo_get(volinfo, VKEY_FEATURES_QUOTA, &quota_val);
+    if (quota_val != NULL)
+        ret = xlator_set_fixed_option(quota_xl, "server-quota", quota_val);
+
+    return ret;
+}
+
+/* Add the features/read-only translator with "read-only" fixed to "off".
+ *
+ * Building fails when "features.read-only" is enabled together with either
+ * "features.worm" or "features.worm-file-level" in set_dict, since the two
+ * modes cannot be combined.
+ *
+ * Returns 0 on success, -1 on failure. */
+static int
+brick_graph_add_ro(volgen_graph_t *graph, glusterd_volinfo_t *volinfo,
+                   dict_t *set_dict, glusterd_brickinfo_t *brickinfo)
+{
+    int ret = -1;
+    xlator_t *xl = NULL;
+    xlator_t *this = THIS;
+    GF_ASSERT(this);
+
+    if (!graph || !volinfo || !set_dict) {
+        gf_smsg(this->name, GF_LOG_ERROR, errno, GD_MSG_INVALID_ARGUMENT, NULL);
+        goto out;
+    }
+
+    if (dict_get_str_boolean(set_dict, "features.read-only", 0) &&
+        (dict_get_str_boolean(set_dict, "features.worm", 0) ||
+         dict_get_str_boolean(set_dict, "features.worm-file-level", 0))) {
+        /* This is a configuration conflict, not a failed dict lookup, so
+         * log it the same way brick_graph_add_worm does: no errno and the
+         * incompatible-value message id. */
+        gf_msg(THIS->name, GF_LOG_ERROR, 0, GD_MSG_INCOMPATIBLE_VALUE,
+               "read-only and worm cannot be set together");
+        ret = -1;
+        goto out;
+    }
+
+    xl = volgen_graph_add(graph, "features/read-only", volinfo->volname);
+    if (!xl) {
+        ret = -1;
+        goto out;
+    }
+
+    ret = xlator_set_fixed_option(xl, "read-only", "off");
+    if (ret) {
+        ret = -1;
+        goto out;
+    }
+
+    ret = 0;
+
+out:
+    return ret;
+}
+
+/* Add the features/worm translator.
+ *
+ * Building fails when "features.read-only" is enabled together with either
+ * "features.worm" or "features.worm-file-level" in set_dict.
+ *
+ * Returns 0 on success, -1 on failure. */
+static int
+brick_graph_add_worm(volgen_graph_t *graph, glusterd_volinfo_t *volinfo,
+                     dict_t *set_dict, glusterd_brickinfo_t *brickinfo)
+{
+    xlator_t *this = THIS;
+    GF_ASSERT(this);
+
+    if (!graph || !volinfo || !set_dict) {
+        gf_smsg(this->name, GF_LOG_ERROR, errno, GD_MSG_INVALID_ARGUMENT, NULL);
+        return -1;
+    }
+
+    /* read-only and any flavour of worm are mutually exclusive. */
+    if (dict_get_str_boolean(set_dict, "features.read-only", 0) &&
+        (dict_get_str_boolean(set_dict, "features.worm", 0) ||
+         dict_get_str_boolean(set_dict, "features.worm-file-level", 0))) {
+        gf_msg(THIS->name, GF_LOG_ERROR, 0, GD_MSG_INCOMPATIBLE_VALUE,
+               "read-only and worm cannot be set together");
+        return -1;
+    }
+
+    if (volgen_graph_add(graph, "features/worm", volinfo->volname) == NULL)
+        return -1;
+
+    return 0;
+}
+
+/* Add the features/cdc translator in "server" mode when
+ * "network.compression" is enabled in set_dict (defaults to disabled).
+ * Returns 0 on success or when compression is off, non-zero on failure. */
+static int
+brick_graph_add_cdc(volgen_graph_t *graph, glusterd_volinfo_t *volinfo,
+                    dict_t *set_dict, glusterd_brickinfo_t *brickinfo)
+{
+    xlator_t *cdc_xl = NULL;
+    int compress = 0;
+    xlator_t *this = THIS;
+    GF_ASSERT(this);
+
+    if (!graph || !volinfo || !set_dict) {
+        gf_smsg(this->name, GF_LOG_ERROR, errno, GD_MSG_INVALID_ARGUMENT, NULL);
+        return -1;
+    }
+
+    /* Check for the compress volume option, and add the translator to the
+     * graph on the server side only when it is set. */
+    compress = dict_get_str_boolean(set_dict, "network.compression", 0);
+    if (compress == -1)
+        return -1;
+    if (compress == 0)
+        return 0;
+
+    cdc_xl = volgen_graph_add(graph, "features/cdc", volinfo->volname);
+    if (cdc_xl == NULL)
+        return -1;
+
+    return xlator_set_fixed_option(cdc_xl, "mode", "server");
+}
+
+/* Add a debug/io-stats translator named after the brick path.
+ *
+ * "unique-id" is set to the brick path; "volume-id" is also set when
+ * priv->op_version is at least GD_OP_VERSION_7_1.
+ *
+ * NOTE(review): volinfo is dereferenced in the op-version branch but is not
+ * part of the argument check - confirm callers never pass NULL.
+ *
+ * Returns 0 on success and a non-zero value on failure. */
+static int
+brick_graph_add_io_stats(volgen_graph_t *graph, glusterd_volinfo_t *volinfo,
+                         dict_t *set_dict, glusterd_brickinfo_t *brickinfo)
+{
+    xlator_t *iostats_xl = NULL;
+    int ret = -1;
+    xlator_t *this = THIS;
+    glusterd_conf_t *priv = this->private;
+
+    if (!graph || !set_dict || !brickinfo) {
+        gf_smsg(this->name, GF_LOG_ERROR, errno, GD_MSG_INVALID_ARGUMENT, NULL);
+        return ret;
+    }
+
+    iostats_xl = volgen_graph_add_as(graph, "debug/io-stats", brickinfo->path);
+    if (iostats_xl == NULL)
+        return ret;
+
+    ret = xlator_set_fixed_option(iostats_xl, "unique-id", brickinfo->path);
+    if (ret != 0)
+        return ret;
+
+    if (priv->op_version < GD_OP_VERSION_7_1)
+        return 0;
+
+    ret = xlator_set_fixed_option(iostats_xl, "volume-id",
+                                  uuid_utoa(volinfo->volume_id));
+    if (ret != 0)
+        return ret;
+
+    return 0;
+}
+
+/* Add the features/upcall translator to the brick graph.
+ * Returns 0 on success, -1 on invalid arguments or if the add fails. */
+static int
+brick_graph_add_upcall(volgen_graph_t *graph, glusterd_volinfo_t *volinfo,
+                       dict_t *set_dict, glusterd_brickinfo_t *brickinfo)
+{
+    xlator_t *this = THIS;
+    GF_ASSERT(this);
+
+    if (!graph || !volinfo || !set_dict) {
+        gf_smsg(this->name, GF_LOG_ERROR, errno, GD_MSG_INVALID_ARGUMENT, NULL);
+        return -1;
+    }
+
+    if (volgen_graph_add(graph, "features/upcall", volinfo->volname) == NULL) {
+        gf_msg("glusterd", GF_LOG_WARNING, 0, GD_MSG_GRAPH_FEATURE_ADD_FAIL,
+               "failed to add features/upcall to graph");
+        return -1;
+    }
+
+    return 0;
+}
+
+/* Add the features/leases translator to the brick graph.
+ * Returns 0 on success, -1 on invalid arguments or if the add fails. */
+static int
+brick_graph_add_leases(volgen_graph_t *graph, glusterd_volinfo_t *volinfo,
+                       dict_t *set_dict, glusterd_brickinfo_t *brickinfo)
+{
+    xlator_t *this = THIS;
+    GF_ASSERT(this);
+
+    if (!graph || !volinfo || !set_dict) {
+        gf_smsg(this->name, GF_LOG_ERROR, errno, GD_MSG_INVALID_ARGUMENT, NULL);
+        return -1;
+    }
+
+    if (volgen_graph_add(graph, "features/leases", volinfo->volname) == NULL) {
+        gf_msg("glusterd", GF_LOG_WARNING, 0, GD_MSG_GRAPH_FEATURE_ADD_FAIL,
+               "failed to add features/leases to graph");
+        return -1;
+    }
+
+    return 0;
+}
+
+/* Add the protocol/server translator for a brick and configure transport,
+ * SSL and login-based authentication options on it.
+ *
+ * Options set here: "transport-type", optionally
+ * "transport.socket.bind-address", the SSL options handled by RPC_SET_OPT,
+ * optionally "transport.address-family", the per-brick
+ * "auth.login.<brick path>.allow" / "auth.login.<username>.password" pair,
+ * "auth-path", optionally "strict-auth-accept" and
+ * "auth.login.<brick path>.ssl-allow".
+ *
+ * Returns 0 on success, -1 (or another non-zero value) on failure. */
+static int
+brick_graph_add_server(volgen_graph_t *graph, glusterd_volinfo_t *volinfo,
+                       dict_t *set_dict, glusterd_brickinfo_t *brickinfo)
+{
+    int ret = -1;
+    xlator_t *xl = NULL;
+    char transt[16] = {
+        0,
+    };
+    char *username = NULL;
+    char *password = NULL;
+    char key[1024] = {0};
+    char *ssl_user = NULL;
+    char *volname = NULL;
+    char *address_family_data = NULL;
+    int32_t len = 0;
+    xlator_t *this = THIS;
+    GF_ASSERT(this);
+
+    if (!graph || !volinfo || !set_dict || !brickinfo) {
+        gf_smsg(this->name, GF_LOG_ERROR, errno, GD_MSG_INVALID_ARGUMENT, NULL);
+        goto out;
+    }
+
+    get_vol_transport_type(volinfo, transt);
+
+    username = glusterd_auth_get_username(volinfo);
+    password = glusterd_auth_get_password(volinfo);
+
+    xl = volgen_graph_add(graph, "protocol/server", volinfo->volname);
+    if (!xl)
+        goto out;
+
+    ret = xlator_set_fixed_option(xl, "transport-type", transt);
+    if (ret)
+        goto out;
+
+    /*In the case of running multiple glusterds on a single machine,
+     * we should ensure that bricks don't listen on all IPs on that
+     * machine and break the IP based separation being brought about.*/
+    if (dict_get_sizen(THIS->options, "transport.socket.bind-address")) {
+        ret = xlator_set_fixed_option(xl, "transport.socket.bind-address",
+                                      brickinfo->hostname);
+        if (ret)
+            return -1;
+    }
+
+    /* Each RPC_SET_OPT is given "return -1" as its failure action, so any
+     * of these lines may leave the function directly. */
+    RPC_SET_OPT(xl, SSL_OWN_CERT_OPT, "ssl-own-cert", return -1);
+    RPC_SET_OPT(xl, SSL_PRIVATE_KEY_OPT, "ssl-private-key", return -1);
+    RPC_SET_OPT(xl, SSL_CA_LIST_OPT, "ssl-ca-list", return -1);
+    RPC_SET_OPT(xl, SSL_CRL_PATH_OPT, "ssl-crl-path", return -1);
+    RPC_SET_OPT(xl, SSL_CERT_DEPTH_OPT, "ssl-cert-depth", return -1);
+    RPC_SET_OPT(xl, SSL_CIPHER_LIST_OPT, "ssl-cipher-list", return -1);
+    RPC_SET_OPT(xl, SSL_DH_PARAM_OPT, "ssl-dh-param", return -1);
+    RPC_SET_OPT(xl, SSL_EC_CURVE_OPT, "ssl-ec-curve", return -1);
+
+    /* Propagate the volume's address family only when it is configured. */
+    if (dict_get_str_sizen(volinfo->dict, "transport.address-family",
+                           &address_family_data) == 0) {
+        ret = xlator_set_fixed_option(xl, "transport.address-family",
+                                      address_family_data);
+        if (ret) {
+            gf_log("glusterd", GF_LOG_WARNING,
+                   "failed to set transport.address-family");
+            return -1;
+        }
+    }
+
+    /* Allow the volume's username to log in to this brick path. */
+    if (username) {
+        len = snprintf(key, sizeof(key), "auth.login.%s.allow",
+                       brickinfo->path);
+        if ((len < 0) || (len >= sizeof(key))) {
+            return -1;
+        }
+
+        ret = xlator_set_option(xl, key, len, username);
+        if (ret)
+            return -1;
+    }
+
+    /* NOTE(review): the key is built from username, which is not checked
+     * here - presumably username and password are always set together;
+     * confirm against glusterd_auth_get_username/password. */
+    if (password) {
+        len = snprintf(key, sizeof(key), "auth.login.%s.password", username);
+        if ((len < 0) || (len >= sizeof(key))) {
+            return -1;
+        }
+        ret = xlator_set_option(xl, key, len, password);
+        if (ret)
+            return -1;
+    }
+
+    ret = xlator_set_fixed_option(xl, "auth-path", brickinfo->path);
+    if (ret)
+        return -1;
+
+    /* Snapshot volumes are matched by the name of their parent volume. */
+    volname = volinfo->is_snap_volume ? volinfo->parent_volname
+                                      : volinfo->volname;
+
+    /* Only the shared-storage volume gets strict auth acceptance. */
+    if (volname && !strcmp(volname, GLUSTER_SHARED_STORAGE)) {
+        ret = xlator_set_fixed_option(xl, "strict-auth-accept", "true");
+        if (ret)
+            return -1;
+    }
+
+    /* Optional "auth.ssl-allow" volume setting, keyed by brick path. */
+    if (dict_get_str_sizen(volinfo->dict, "auth.ssl-allow", &ssl_user) == 0) {
+        len = snprintf(key, sizeof(key), "auth.login.%s.ssl-allow",
+                       brickinfo->path);
+        if ((len < 0) || (len >= sizeof(key))) {
+            return -1;
+        }
+
+        ret = xlator_set_option(xl, key, len, ssl_user);
+        if (ret)
+            return -1;
+    }
+
+out:
+    return ret;
+}
+
+/* When the volume dict has "enable-pump" set to a non-zero value, add a
+ * "<volname>-replace-brick" protocol/client translator and a
+ * "<volname>-pump" cluster/pump translator, and link the pump to both the
+ * current top of the graph and the new client.
+ *
+ * A missing "enable-pump" key is treated as "pump disabled".
+ *
+ * Returns 0 on success or when pump is disabled, -1 on failure. */
+static int
+brick_graph_add_pump(volgen_graph_t *graph, glusterd_volinfo_t *volinfo,
+                     dict_t *set_dict, glusterd_brickinfo_t *brickinfo)
+{
+    int ret = -1;
+    int pump = 0;
+    xlator_t *xl = NULL;
+    xlator_t *txl = NULL;
+    xlator_t *rbxl = NULL;
+    char *username = NULL;
+    char *password = NULL;
+    char *ptranst = NULL;
+    char *address_family_data = NULL;
+    xlator_t *this = THIS;
+    GF_ASSERT(this);
+
+    if (!graph || !volinfo || !set_dict) {
+        gf_smsg(this->name, GF_LOG_ERROR, errno, GD_MSG_INVALID_ARGUMENT, NULL);
+        goto out;
+    }
+
+    ret = dict_get_int32(volinfo->dict, "enable-pump", &pump);
+    if (ret == -ENOENT) {
+        gf_smsg(this->name, GF_LOG_ERROR, errno, GD_MSG_DICT_GET_FAILED,
+                "Key=enable-pump", NULL);
+        ret = pump = 0;
+    }
+    if (ret)
+        return -1;
+
+    username = glusterd_auth_get_username(volinfo);
+    password = glusterd_auth_get_password(volinfo);
+
+    if (pump) {
+        /* Remember the current top of the graph before adding xlators. */
+        txl = first_of(graph);
+
+        rbxl = volgen_graph_add_nolink(graph, "protocol/client",
+                                       "%s-replace-brick", volinfo->volname);
+        if (!rbxl)
+            return -1;
+
+        RPC_SET_OPT(rbxl, SSL_OWN_CERT_OPT, "ssl-own-cert", return -1);
+        RPC_SET_OPT(rbxl, SSL_PRIVATE_KEY_OPT, "ssl-private-key", return -1);
+        RPC_SET_OPT(rbxl, SSL_CA_LIST_OPT, "ssl-ca-list", return -1);
+        RPC_SET_OPT(rbxl, SSL_CRL_PATH_OPT, "ssl-crl-path", return -1);
+        RPC_SET_OPT(rbxl, SSL_CERT_DEPTH_OPT, "ssl-cert-depth", return -1);
+        RPC_SET_OPT(rbxl, SSL_CIPHER_LIST_OPT, "ssl-cipher-list", return -1);
+        RPC_SET_OPT(rbxl, SSL_DH_PARAM_OPT, "ssl-dh-param", return -1);
+        RPC_SET_OPT(rbxl, SSL_EC_CURVE_OPT, "ssl-ec-curve", return -1);
+
+        if (username) {
+            ret = xlator_set_fixed_option(rbxl, "username", username);
+            if (ret)
+                return -1;
+        }
+
+        if (password) {
+            ret = xlator_set_fixed_option(rbxl, "password", password);
+            if (ret)
+                return -1;
+        }
+
+        /* The transport string is heap-allocated and must be released with
+         * GF_FREE.  Fetch it right where it is consumed so that none of the
+         * early returns above can leak it. */
+        ptranst = glusterd_get_trans_type_rb(volinfo->transport_type);
+        if (NULL == ptranst)
+            return -1;
+
+        ret = xlator_set_fixed_option(rbxl, "transport-type", ptranst);
+        GF_FREE(ptranst);
+        ptranst = NULL;
+        if (ret)
+            return -1;
+
+        if (dict_get_str_sizen(volinfo->dict, "transport.address-family",
+                               &address_family_data) == 0) {
+            ret = xlator_set_fixed_option(rbxl, "transport.address-family",
+                                          address_family_data);
+            if (ret) {
+                gf_log("glusterd", GF_LOG_WARNING,
+                       "failed to set transport.address-family");
+                return -1;
+            }
+        }
+
+        xl = volgen_graph_add_nolink(graph, "cluster/pump", "%s-pump",
+                                     volinfo->volname);
+        if (!xl)
+            return -1;
+        ret = volgen_xlator_link(xl, txl);
+        if (ret)
+            return -1;
+        ret = volgen_xlator_link(xl, rbxl);
+        if (ret)
+            return -1;
+    }
+
+out:
+    return ret;
+}
+
+/* The order of xlator definition here determines
+ * the topology of the brick graph.
+ *
+ * Each entry pairs a builder function with a debug key (or NULL).
+ * server_graph_builder() walks this table from the last entry to the first,
+ * so storage/posix is added first and protocol/server last.  The debug key
+ * is what check_and_add_debug_xl() compares against the values of the
+ * debug.trace / debug.error-gen / debug.delay-gen options; entries with a
+ * NULL key never get a debug translator attached. */
+static volgen_brick_xlator_t server_graph_table[] = {
+    {brick_graph_add_server, NULL},
+    /* NOTE(review): the key below is the string literal "NULL", not a NULL
+     * pointer - confirm whether that is intentional. */
+    {brick_graph_add_io_stats, "NULL"},
+    {brick_graph_add_sdfs, "sdfs"},
+    {brick_graph_add_namespace, "namespace"},
+    {brick_graph_add_cdc, NULL},
+    {brick_graph_add_quota, "quota"},
+    {brick_graph_add_index, "index"},
+    {brick_graph_add_barrier, NULL},
+    {brick_graph_add_marker, "marker"},
+    {brick_graph_add_selinux, "selinux"},
+    {brick_graph_add_iot, "io-threads"},
+    {brick_graph_add_upcall, "upcall"},
+    {brick_graph_add_leases, "leases"},
+    {brick_graph_add_pump, NULL},
+    {brick_graph_add_ro, NULL},
+    {brick_graph_add_worm, NULL},
+    {brick_graph_add_locks, "locks"},
+    {brick_graph_add_acl, "acl"},
+    {brick_graph_add_bitrot_stub, "bitrot-stub"},
+    {brick_graph_add_changelog, "changelog"},
+    {brick_graph_add_trash, "trash"},
+    {brick_graph_add_arbiter, "arbiter"},
+    {brick_graph_add_posix, "posix"},
+};
+
+/* Report whether the given name is the debug key of a server-side (brick)
+ * translator listed in server_graph_table.
+ *
+ * Returns GF_XLATOR_SERVER when some table entry's debug key is equal to
+ * xlator, and GF_XLATOR_NONE otherwise.  Entries without a debug key are
+ * skipped. */
+static glusterd_server_xlator_t
+get_server_xlator(char *xlator)
+{
+    int i = 0;
+    int size = sizeof(server_graph_table) / sizeof(server_graph_table[0]);
+
+    for (i = 0; i < size; i++) {
+        if (!server_graph_table[i].dbg_key)
+            continue;
+        /* strcmp() returns 0 on equality; a match, not a mismatch, is what
+         * identifies a server translator. */
+        if (strcmp(xlator, server_graph_table[i].dbg_key) == 0)
+            return GF_XLATOR_SERVER;
+    }
+
+    return GF_XLATOR_NONE;
+}
+
+/* Map a translator name to its client-side category: "client" yields
+ * GF_CLNT_XLATOR_FUSE, anything else GF_CLNT_XLATOR_NONE. */
+static glusterd_client_xlator_t
+get_client_xlator(char *xlator)
+{
+    if (strcmp(xlator, "client") != 0)
+        return GF_CLNT_XLATOR_NONE;
+
+    return GF_CLNT_XLATOR_FUSE;
+}
+
+/* Option handler that adds a debug translator (vme->voltype) to the graph
+ * for "!debug" options.
+ *
+ * param is the volume name used when adding the translator.  Nothing is
+ * added when the option is not a "!debug" one, when a debug.trace /
+ * debug.error-gen / debug.delay-gen value names neither a server nor a
+ * client translator, when the value parses as boolean false, or when the
+ * value is "off".
+ *
+ * Returns 0 on success or when nothing is added, -1 if the add fails. */
+static int
+debugxl_option_handler(volgen_graph_t *graph, struct volopt_map_entry *vme,
+                       void *param)
+{
+    char *volname = param;
+    gf_boolean_t enabled = _gf_false;
+    int targets_xlator = 0;
+
+    if (strcmp(vme->option, "!debug") != 0)
+        return 0;
+
+    targets_xlator = (strcmp(vme->key, "debug.trace") == 0 ||
+                      strcmp(vme->key, "debug.error-gen") == 0 ||
+                      strcmp(vme->key, "debug.delay-gen") == 0);
+    if (targets_xlator && get_server_xlator(vme->value) == GF_XLATOR_NONE &&
+        get_client_xlator(vme->value) == GF_CLNT_XLATOR_NONE)
+        return 0;
+
+    /* A value that parses as a boolean must be true to proceed; a value
+     * that is not a boolean at all goes straight to the add step. */
+    if (gf_string2boolean(vme->value, &enabled) != -1 && !enabled)
+        return 0;
+
+    if (strcmp(vme->value, "off") == 0)
+        return 0;
+
+    return volgen_graph_add(graph, vme->voltype, volname) ? 0 : -1;
+}
+
+/* For each of the debug.* keys present in set_dict whose value equals
+ * xlname, run debugxl_option_handler over the option map for this graph.
+ *
+ * A NULL xlname, a missing key or a non-matching value is not an error.
+ * Returns 0 on success, or the non-zero result of
+ * volgen_graph_set_options_generic() on the first failure.
+ */
+int
+check_and_add_debug_xl(volgen_graph_t *graph, dict_t *set_dict, char *volname,
+                       char *xlname)
+{
+    static char *debug_keys[] = {"debug.trace", "debug.error-gen",
+                                 "debug.delay-gen", NULL};
+    char **keyp = NULL;
+    char *target = NULL;
+    int ret = 0;
+
+    if (!xlname)
+        return 0;
+
+    for (keyp = debug_keys; *keyp; keyp++) {
+        if (dict_get_str(set_dict, *keyp, &target) != 0)
+            continue;
+        if (strcmp(xlname, target) != 0)
+            continue;
+
+        ret = volgen_graph_set_options_generic(graph, set_dict, volname,
+                                               &debugxl_option_handler);
+        if (ret)
+            return ret;
+    }
+
+    return 0;
+}
+
+/* Builder callback for the server (brick) graph.
+ *
+ * Runs every builder in server_graph_table from the last entry down to the
+ * first, adding a debug xlator after each entry when requested through
+ * set_dict, then applies the volume options. When set_dict carries both
+ * "xlator" and "loglevel", the extended option handler is used with
+ * set_dict as its parameter; otherwise the regular handler gets volinfo.
+ *
+ * Returns 0 on success and a non-zero value on the first failure.
+ */
+static int
+server_graph_builder(volgen_graph_t *graph, glusterd_volinfo_t *volinfo,
+                     dict_t *set_dict, void *param)
+{
+    int ret = 0;
+    int idx = 0;
+    char *xl_name = NULL;
+    char *log_lvl = NULL;
+
+    idx = sizeof(server_graph_table) / sizeof(server_graph_table[0]) - 1;
+
+    for (; idx >= 0; idx--) {
+        ret = server_graph_table[idx].builder(graph, volinfo, set_dict, param);
+        if (ret) {
+            gf_msg("glusterd", GF_LOG_ERROR, 0, GD_MSG_BUILD_GRAPH_FAILED,
+                   "Builing graph "
+                   "failed for server graph table entry: %d",
+                   idx);
+            return ret;
+        }
+
+        ret = check_and_add_debug_xl(graph, set_dict, volinfo->volname,
+                                     server_graph_table[idx].dbg_key);
+        if (ret)
+            return ret;
+    }
+
+    /* got a cli log level request */
+    if (dict_get_str_sizen(set_dict, "xlator", &xl_name) == 0) {
+        ret = dict_get_str_sizen(set_dict, "loglevel", &log_lvl);
+        if (ret) {
+            gf_msg("glusterd", GF_LOG_ERROR, errno, GD_MSG_DICT_GET_FAILED,
+                   "could not get both"
+                   " translator name and loglevel for log level request");
+            return ret;
+        }
+    }
+
+    if (xl_name && log_lvl)
+        return volgen_graph_set_options_generic(
+            graph, set_dict, (void *)set_dict,
+            &server_spec_extended_option_handler);
+
+    return volgen_graph_set_options_generic(graph, set_dict, volinfo,
+                                            &server_spec_option_handler);
+}
+
+/* Builds a graph for the server role, with option overrides in mod_dict.
+ *
+ * Thin wrapper: hands graph, volinfo, mod_dict and brickinfo to
+ * build_graph_generic() together with server_graph_builder as the builder
+ * callback, and returns that call's result unchanged.
+ */
+static int
+build_server_graph(volgen_graph_t *graph, glusterd_volinfo_t *volinfo,
+                   dict_t *mod_dict, glusterd_brickinfo_t *brickinfo)
+{
+    return build_graph_generic(graph, volinfo, mod_dict, brickinfo,
+                               &server_graph_builder);
+}
+
+/* volopt handler for "!perf" options: decides whether the performance
+ * xlator of type vme->voltype is added to the graph.
+ *
+ * param is used as the glusterd_volinfo_t of the volume.
+ *
+ * Returns 0 when the entry is skipped or the xlator was added. Returns -1
+ * when param, THIS or its private data is NULL, when vme->value is not a
+ * valid boolean, or when volgen_graph_add() fails.
+ */
+static int
+perfxl_option_handler(volgen_graph_t *graph, struct volopt_map_entry *vme,
+                      void *param)
+{
+    gf_boolean_t enabled = _gf_false;
+    glusterd_volinfo_t *volinfo = NULL;
+    xlator_t *this = NULL;
+    glusterd_conf_t *priv = NULL;
+
+    GF_VALIDATE_OR_GOTO("glusterd", param, out);
+    volinfo = param;
+    this = THIS;
+    GF_VALIDATE_OR_GOTO("glusterd", this, out);
+    priv = this->private;
+    GF_VALIDATE_OR_GOTO("glusterd", priv, out);
+
+    /* Only entries flagged as "!perf" are handled here. */
+    if (strcmp(vme->option, "!perf") != 0)
+        return 0;
+
+    if (gf_string2boolean(vme->value, &enabled) == -1)
+        return -1;
+    if (!enabled)
+        return 0;
+
+    /* Check op-version before adding the 'open-behind' xlator in the graph
+     */
+    if (!strcmp(vme->key, "performance.open-behind") &&
+        (vme->op_version > volinfo->client_op_version))
+        return 0;
+
+    if (priv->op_version < GD_OP_VERSION_3_12_2) {
+        /* For replicate volumes do not load io-threads as it affects
+         * performance
+         */
+        if (!strcmp(vme->key, "performance.client-io-threads") &&
+            (GF_CLUSTER_TYPE_REPLICATE == volinfo->type))
+            return 0;
+    }
+
+    /* if VKEY_READDIR_AHEAD is enabled and parallel readdir is
+     * not enabled then load readdir-ahead here else it will be
+     * loaded as a child of dht */
+    if (!strcmp(vme->key, VKEY_READDIR_AHEAD) &&
+        glusterd_volinfo_get_boolean(volinfo, VKEY_PARALLEL_READDIR))
+        return 0;
+
+    /* Success returns from inside the if; a failed add falls through to
+     * the shared error exit below. */
+    if (volgen_graph_add(graph, vme->voltype, volinfo->volname))
+        return 0;
+out:
+    return -1;
+}
+
+/* "!perf" handler variant for gfproxy servers.
+ *
+ * Forwards every entry to perfxl_option_handler() except those whose key
+ * contains "write-behind". The forwarded handler's result is not
+ * propagated: this function always returns 0.
+ */
+static int
+gfproxy_server_perfxl_option_handler(volgen_graph_t *graph,
+                                     struct volopt_map_entry *vme, void *param)
+{
+    GF_ASSERT(param);
+
+    /* write-behind is *not* allowed for gfproxy-servers */
+    if (!strstr(vme->key, "write-behind"))
+        perfxl_option_handler(graph, vme, param);
+
+    return 0;
+}
+
+/* "!perf" handler variant for gfproxy clients.
+ *
+ * Only entries whose key contains "write-behind" are forwarded to
+ * perfxl_option_handler(); everything else is ignored. The forwarded
+ * handler's result is not propagated: this function always returns 0.
+ */
+static int
+gfproxy_client_perfxl_option_handler(volgen_graph_t *graph,
+                                     struct volopt_map_entry *vme, void *param)
+{
+    GF_ASSERT(param);
+
+    /* write-behind is the only allowed "perf" for gfproxy-clients */
+    if (strstr(vme->key, "write-behind"))
+        perfxl_option_handler(graph, vme, param);
+
+    return 0;
+}
+
+#ifdef BUILD_GNFS
+/* volopt handler for "!nfsperf" options.
+ *
+ * param is used as the volume name. When the option value is a true
+ * boolean, an xlator of type vme->voltype is added to the graph.
+ *
+ * Returns 0 when the entry is skipped, disabled or added successfully;
+ * -1 when the value is not a boolean or volgen_graph_add() fails.
+ */
+static int
+nfsperfxl_option_handler(volgen_graph_t *graph, struct volopt_map_entry *vme,
+                         void *param)
+{
+    gf_boolean_t is_on = _gf_false;
+    char *volname = param;
+
+    if (strcmp(vme->option, "!nfsperf"))
+        return 0;
+
+    if (gf_string2boolean(vme->value, &is_on) == -1)
+        return -1;
+
+    if (is_on && !volgen_graph_add(graph, vme->voltype, volname))
+        return -1;
+
+    return 0;
+}
+#endif
+
+#if (HAVE_LIB_XML)
+/* Close the currently open element and then the document on writer.
+ *
+ * Returns 0 when both libxml2 calls succeed, -1 as soon as one of them
+ * reports a negative result (the second call is skipped if the first
+ * fails). The result is also emitted as a debug log line.
+ */
+int
+end_sethelp_xml_doc(xmlTextWriterPtr writer)
+{
+    int ret = 0;
+
+    if (xmlTextWriterEndElement(writer) < 0) {
+        gf_msg("glusterd", GF_LOG_ERROR, 0, GD_MSG_XML_TEXT_WRITE_FAIL,
+               "Could not end an "
+               "xmlElement");
+        ret = -1;
+    } else if (xmlTextWriterEndDocument(writer) < 0) {
+        gf_msg("glusterd", GF_LOG_ERROR, 0, GD_MSG_XML_TEXT_WRITE_FAIL,
+               "Could not end an "
+               "xmlDocument");
+        ret = -1;
+    }
+
+    gf_msg_debug("glusterd", 0, "Returning %d", ret);
+    return ret;
+}
+
+/* Create an in-memory XML document for "volume set help" output.
+ *
+ * On success *buf holds a new 8192-byte xmlBuffer, *writer a text writer
+ * bound to it, the document has been started ("1.0", "UTF-8", standalone
+ * "yes") and the root element "options" is open.
+ *
+ * Returns 0 on success and -1 on failure (NULL arguments, allocation
+ * failure or a libxml2 write error).
+ *
+ * NOTE(review): on failure after the buffer/writer were created they are
+ * left in *buf / *writer; presumably the caller releases them - confirm.
+ */
+int
+init_sethelp_xml_doc(xmlTextWriterPtr *writer, xmlBufferPtr *buf)
+{
+    int ret = -1;
+
+    if (!writer || !buf)
+        goto out;
+
+    *buf = xmlBufferCreateSize(8192);
+    /* Test the created buffer, not the out-parameter: 'buf' itself was
+     * already validated above and can never be NULL here. */
+    if (*buf == NULL) {
+        gf_msg("glusterd", GF_LOG_ERROR, ENOMEM, GD_MSG_NO_MEMORY,
+               "Error creating the xml "
+               "buffer");
+        ret = -1;
+        goto out;
+    }
+
+    xmlBufferSetAllocationScheme(*buf, XML_BUFFER_ALLOC_DOUBLEIT);
+
+    *writer = xmlNewTextWriterMemory(*buf, 0);
+    /* Same here: the writer object is what may be NULL. */
+    if (*writer == NULL) {
+        gf_msg("glusterd", GF_LOG_ERROR, ENOMEM, GD_MSG_NO_MEMORY,
+               " Error creating the xml "
+               "writer");
+        ret = -1;
+        goto out;
+    }
+
+    ret = xmlTextWriterStartDocument(*writer, "1.0", "UTF-8", "yes");
+    if (ret < 0) {
+        gf_msg("glusterd", GF_LOG_ERROR, 0, GD_MSG_XML_DOC_START_FAIL,
+               "Error While starting the "
+               "xmlDoc");
+        /* Normalise like the other failure paths. */
+        ret = -1;
+        goto out;
+    }
+
+    ret = xmlTextWriterStartElement(*writer, (xmlChar *)"options");
+    if (ret < 0) {
+        gf_msg("glusterd", GF_LOG_ERROR, 0, GD_MSG_XML_ELE_CREATE_FAIL,
+               "Could not create an "
+               "xmlElement");
+        ret = -1;
+        goto out;
+    }
+
+    ret = 0;
+
+out:
+    gf_msg_debug("glusterd", 0, "Returning %d", ret);
+    return ret;
+}
+
+/* Write one <option> element describing a volume-set option.
+ *
+ * The element contains, in this order, the children "defaultValue"
+ * (def_val), "description" (dscrpt) and "name" (name), and is closed
+ * before returning.
+ *
+ * Returns 0 on success, -1 as soon as any libxml2 call reports a negative
+ * result; the remaining calls are then skipped. The result is also
+ * emitted as a debug log line.
+ */
+int
+xml_add_volset_element(xmlTextWriterPtr writer, const char *name,
+                       const char *def_val, const char *dscrpt)
+{
+    int ret = -1;
+
+    GF_ASSERT(name);
+
+    if (xmlTextWriterStartElement(writer, (xmlChar *)"option") < 0) {
+        gf_msg("glusterd", GF_LOG_ERROR, 0, GD_MSG_XML_ELE_CREATE_FAIL,
+               "Could not create an "
+               "xmlElemetnt");
+    } else if (xmlTextWriterWriteFormatElement(
+                   writer, (xmlChar *)"defaultValue", "%s", def_val) < 0) {
+        gf_msg("glusterd", GF_LOG_ERROR, 0, GD_MSG_XML_ELE_CREATE_FAIL,
+               "Could not create an "
+               "xmlElemetnt");
+    } else if (xmlTextWriterWriteFormatElement(
+                   writer, (xmlChar *)"description", "%s", dscrpt) < 0) {
+        gf_msg("glusterd", GF_LOG_ERROR, 0, GD_MSG_XML_ELE_CREATE_FAIL,
+               "Could not create an "
+               "xmlElemetnt");
+    } else if (xmlTextWriterWriteFormatElement(writer, (xmlChar *)"name",
+                                               "%s", name) < 0) {
+        gf_msg("glusterd", GF_LOG_ERROR, 0, GD_MSG_XML_ELE_CREATE_FAIL,
+               "Could not create an "
+               "xmlElemetnt");
+    } else if (xmlTextWriterEndElement(writer) < 0) {
+        gf_msg("glusterd", GF_LOG_ERROR, 0, GD_MSG_XML_ELE_CREATE_FAIL,
+               "Could not end an "
+               "xmlElemetnt");
+    } else {
+        ret = 0;
+    }
+
+    gf_msg_debug("glusterd", 0, "Returning %d", ret);
+    return ret;
+}
+
+#endif
+
+/* Derive the xlator option key for a volopt map entry.
+ *
+ * - For the auth allow/reject map keys (and, with BUILD_GNFS, the nfs
+ *   disable map key) *key is a freshly gf_strdup()'ed copy of the
+ *   corresponding *_OPT_KEY.
+ * - Otherwise *key points into vme->option (skipping a leading '!'), or,
+ *   when vme->option is NULL, just past the first '.' of vme->key. In
+ *   these cases no memory is allocated.
+ *
+ * Returns 0 on success. Returns -1 when the derived key is empty, when
+ * vme->key has no '.', or when duplicating the key fails.
+ */
+int
+_get_xlator_opt_key_from_vme(struct volopt_map_entry *vme, char **key)
+{
+    int ret = 0;
+    gf_boolean_t duplicated = _gf_false;
+
+    GF_ASSERT(vme);
+    GF_ASSERT(key);
+
+    if (!strcmp(vme->key, AUTH_ALLOW_MAP_KEY)) {
+        *key = gf_strdup(AUTH_ALLOW_OPT_KEY);
+        duplicated = _gf_true;
+    } else if (!strcmp(vme->key, AUTH_REJECT_MAP_KEY)) {
+        *key = gf_strdup(AUTH_REJECT_OPT_KEY);
+        duplicated = _gf_true;
+    }
+#ifdef BUILD_GNFS
+    else if (!strcmp(vme->key, NFS_DISABLE_MAP_KEY)) {
+        *key = gf_strdup(NFS_DISABLE_OPT_KEY);
+        duplicated = _gf_true;
+    }
+#endif
+    else if (vme->option) {
+        if (vme->option[0] == '!') {
+            *key = vme->option + 1;
+            /* "!" alone leaves nothing to use as a key */
+            if ((*key)[0] == '\0')
+                ret = -1;
+        } else {
+            *key = vme->option;
+        }
+    } else {
+        *key = strchr(vme->key, '.');
+        if (*key) {
+            (*key)++;
+            /* vme->key ended right after the '.' */
+            if ((*key)[0] == '\0')
+                ret = -1;
+        } else {
+            ret = -1;
+        }
+    }
+
+    /* A failed duplication used to be reported as success with a NULL
+     * key; surface it as an error instead. */
+    if (duplicated && !*key) {
+        gf_msg("glusterd", GF_LOG_ERROR, ENOMEM, GD_MSG_NO_MEMORY,
+               "Failed to duplicate the option key for %s", vme->key);
+        return -1;
+    }
+
+    if (ret)
+        gf_msg("glusterd", GF_LOG_ERROR, EINVAL, GD_MSG_INVALID_ENTRY,
+               "Wrong entry found in  "
+               "glusterd_volopt_map entry %s",
+               vme->key);
+    else
+        gf_msg_debug("glusterd", 0, "Returning %d", ret);
+
+    return ret;
+}
+
+void
+_free_xlator_opt_key(char *key)
+{
+    GF_ASSERT(key);
+
+    if (!strcmp(key, AUTH_ALLOW_OPT_KEY) || !strcmp(key, AUTH_REJECT_OPT_KEY) ||
+        !strcmp(key, NFS_DISABLE_OPT_KEY))
+        GF_FREE(key);
+
+    return;
+}
+
+/* Add an unlinked "protocol/client" xlator named xl_id to the graph and
+ * configure it.
+ *
+ * Fixed options set on the new xlator:
+ *   - "ping-timeout" = "42"
+ *   - "remote-host" / "remote-port", only when hostname / port are non-NULL
+ *   - "remote-subvolume" = subvol, "transport-type" = transt
+ *   - "transport.address-family", when present in volinfo->dict
+ *   - "username" / "password", when set_dict marks the client as trusted
+ *     (GF_CLIENT_TRUSTED or GF_CLIENT_TRUSTED_PROXY) and volinfo provides
+ *     them
+ *   - "transport.socket.ssl-enabled" = "true", when "client.ssl" in
+ *     set_dict parses as a true boolean
+ *   - the SSL options handled through RPC_SET_OPT
+ *
+ * Returns the new xlator, or NULL when creating it or setting any option
+ * fails.
+ */
+static xlator_t *
+volgen_graph_build_client(volgen_graph_t *graph, glusterd_volinfo_t *volinfo,
+                          char *hostname, char *port, char *subvol, char *xl_id,
+                          char *transt, dict_t *set_dict)
+{
+    xlator_t *xl = NULL;
+    int ret = -2;
+    uint32_t client_type = GF_CLIENT_OTHER;
+    char *str = NULL;
+    char *ssl_str = NULL;
+    gf_boolean_t ssl_bool = _gf_false;
+    char *address_family_data = NULL;
+
+    GF_ASSERT(graph);
+    GF_ASSERT(subvol);
+    GF_ASSERT(xl_id);
+    GF_ASSERT(transt);
+
+    xl = volgen_graph_add_nolink(graph, "protocol/client", "%s", xl_id);
+    if (!xl)
+        goto err;
+
+    ret = xlator_set_fixed_option(xl, "ping-timeout", "42");
+    if (ret)
+        goto err;
+
+    /* hostname and port are optional; callers in this file pass NULL */
+    if (hostname) {
+        ret = xlator_set_fixed_option(xl, "remote-host", hostname);
+        if (ret)
+            goto err;
+    }
+
+    if (port) {
+        ret = xlator_set_fixed_option(xl, "remote-port", port);
+        if (ret)
+            goto err;
+    }
+
+    ret = xlator_set_fixed_option(xl, "remote-subvolume", subvol);
+    if (ret)
+        goto err;
+
+    ret = xlator_set_fixed_option(xl, "transport-type", transt);
+    if (ret)
+        goto err;
+
+    if (dict_get_str_sizen(volinfo->dict, "transport.address-family",
+                           &address_family_data) == 0) {
+        ret = xlator_set_fixed_option(xl, "transport.address-family",
+                                      address_family_data);
+        if (ret) {
+            gf_log("glusterd", GF_LOG_WARNING,
+                   "failed to set transport.address-family");
+            goto err;
+        }
+    }
+
+    /* A missing "trusted-client" key simply leaves the client untrusted */
+    ret = dict_get_uint32(set_dict, "trusted-client", &client_type);
+
+    if (!ret && (client_type == GF_CLIENT_TRUSTED ||
+                 client_type == GF_CLIENT_TRUSTED_PROXY)) {
+        str = NULL;
+        str = glusterd_auth_get_username(volinfo);
+        if (str) {
+            ret = xlator_set_fixed_option(xl, "username", str);
+            if (ret)
+                goto err;
+        }
+
+        str = glusterd_auth_get_password(volinfo);
+        if (str) {
+            ret = xlator_set_fixed_option(xl, "password", str);
+            if (ret)
+                goto err;
+        }
+    }
+
+    /* An unparsable "client.ssl" value is silently treated as disabled */
+    if (dict_get_str_sizen(set_dict, "client.ssl", &ssl_str) == 0) {
+        if (gf_string2boolean(ssl_str, &ssl_bool) == 0) {
+            if (ssl_bool) {
+                ret = xlator_set_fixed_option(
+                    xl, "transport.socket.ssl-enabled", "true");
+                if (ret) {
+                    goto err;
+                }
+            }
+        }
+    }
+
+    /* NOTE(review): RPC_SET_OPT is defined outside this view; presumably
+     * it copies the named option from set_dict onto xl and runs the last
+     * argument on failure - confirm against the macro definition. */
+    RPC_SET_OPT(xl, SSL_OWN_CERT_OPT, "ssl-own-cert", goto err);
+    RPC_SET_OPT(xl, SSL_PRIVATE_KEY_OPT, "ssl-private-key", goto err);
+    RPC_SET_OPT(xl, SSL_CA_LIST_OPT, "ssl-ca-list", goto err);
+    RPC_SET_OPT(xl, SSL_CRL_PATH_OPT, "ssl-crl-path", goto err);
+    RPC_SET_OPT(xl, SSL_CERT_DEPTH_OPT, "ssl-cert-depth", goto err);
+    RPC_SET_OPT(xl, SSL_CIPHER_LIST_OPT, "ssl-cipher-list", goto err);
+    RPC_SET_OPT(xl, SSL_DH_PARAM_OPT, "ssl-dh-param", goto err);
+    RPC_SET_OPT(xl, SSL_EC_CURVE_OPT, "ssl-ec-curve", goto err);
+
+    return xl;
+err:
+    return NULL;
+}
+
+/* Add one protocol/client xlator per brick of the volume, plus one per
+ * thin-arbiter brick when the volume has thin arbiters.
+ *
+ * Bricks are added in volinfo->bricks order. With thin arbiters, the
+ * thin-arbiter client of a subvolume is added after that subvolume's
+ * replica_count brick clients; the client for the last subvolume is added
+ * after the main loop.
+ *
+ * Returns 0 on success and -1 when the brick counts are inconsistent or a
+ * client xlator cannot be built.
+ */
+static int
+volgen_graph_build_clients(volgen_graph_t *graph, glusterd_volinfo_t *volinfo,
+                           dict_t *set_dict, void *param)
+{
+    int i = 0;
+    int ret = -1;
+    char transt[16] = {
+        0,
+    };
+    glusterd_brickinfo_t *brick = NULL;
+    glusterd_brickinfo_t *ta_brick = NULL;
+    xlator_t *xl = NULL;
+    int subvol_index = 0;
+    int thin_arbiter_index = 0;
+
+    if (volinfo->brick_count == 0) {
+        gf_msg("glusterd", GF_LOG_ERROR, 0, GD_MSG_VOLUME_INCONSISTENCY,
+               "volume inconsistency: brick count is 0");
+        goto out;
+    }
+
+    /* brick_count must be a whole multiple of dist_leaf_count */
+    if ((volinfo->dist_leaf_count < volinfo->brick_count) &&
+        ((volinfo->brick_count % volinfo->dist_leaf_count) != 0)) {
+        gf_msg("glusterd", GF_LOG_ERROR, 0, GD_MSG_VOLUME_INCONSISTENCY,
+               "volume inconsistency: "
+               "total number of bricks (%d) is not divisible with "
+               "number of bricks per cluster (%d) in a multi-cluster "
+               "setup",
+               volinfo->brick_count, volinfo->dist_leaf_count);
+        goto out;
+    }
+
+    get_transport_type(volinfo, set_dict, transt, _gf_false);
+
+    /* A "tcp,rdma" volume gets tcp clients here */
+    if (!strcmp(transt, "tcp,rdma"))
+        strcpy(transt, "tcp");
+
+    i = 0;
+    cds_list_for_each_entry(brick, &volinfo->bricks, brick_list)
+    {
+        /* insert ta client xlator entry.
+         * eg - If subvol count is > 1, then after every two client xlator
+         * entries there should be a ta client xlator entry in the volfile. ta
+         * client xlator indexes are - 2, 5, 8 etc depending on the index of
+         * subvol.
+         */
+        if (volinfo->thin_arbiter_count &&
+            (i + 1) % (volinfo->replica_count + 1) == 0) {
+            /* Pick the subvol_index-th entry of ta_bricks; the scan does
+             * not stop after the match. */
+            thin_arbiter_index = 0;
+            cds_list_for_each_entry(ta_brick, &volinfo->ta_bricks, brick_list)
+            {
+                if (thin_arbiter_index == subvol_index) {
+                    xl = volgen_graph_build_client(
+                        graph, volinfo, ta_brick->hostname, NULL,
+                        ta_brick->path, ta_brick->brick_id, transt, set_dict);
+                    if (!xl) {
+                        ret = -1;
+                        goto out;
+                    }
+                }
+                thin_arbiter_index++;
+            }
+            subvol_index++;
+        }
+        xl = volgen_graph_build_client(graph, volinfo, brick->hostname, NULL,
+                                       brick->path, brick->brick_id, transt,
+                                       set_dict);
+        if (!xl) {
+            ret = -1;
+            goto out;
+        }
+
+        i++;
+    }
+
+    /* Add ta client xlator entry for last subvol
+     * Above loop will miss out on making the ta client
+     * xlator entry for the last subvolume in the volfile
+     */
+    if (volinfo->thin_arbiter_count) {
+        thin_arbiter_index = 0;
+        cds_list_for_each_entry(ta_brick, &volinfo->ta_bricks, brick_list)
+        {
+            if (thin_arbiter_index == subvol_index) {
+                xl = volgen_graph_build_client(
+                    graph, volinfo, ta_brick->hostname, NULL, ta_brick->path,
+                    ta_brick->brick_id, transt, set_dict);
+                if (!xl) {
+                    ret = -1;
+                    goto out;
+                }
+            }
+
+            thin_arbiter_index++;
+        }
+    }
+
+    /* i counts the regular bricks only; thin-arbiter clients are extra */
+    if (i != volinfo->brick_count) {
+        gf_msg("glusterd", GF_LOG_ERROR, 0, GD_MSG_VOLUME_INCONSISTENCY,
+               "volume inconsistency: actual number of bricks (%d) "
+               "differs from brick count (%d)",
+               i, volinfo->brick_count);
+
+        ret = -1;
+        goto out;
+    }
+    ret = 0;
+out:
+    return ret;
+}
+
+/* Group existing xlators under newly created parent xlators.
+ *
+ * Starting at trav and walking backwards through ->prev, child_count
+ * xlators are linked as children of new xl_type xlators. A new parent is
+ * created for every sub_count children; its name is built from
+ * xl_namefmt with the volume name and a running index that starts at
+ * start_count.
+ *
+ * Returns the number of parents created. On failure returns -1 (no
+ * children requested, or a parent could not be created) or the non-zero
+ * result of the failing option-set / link call.
+ */
+static int
+volgen_link_bricks(volgen_graph_t *graph, glusterd_volinfo_t *volinfo,
+                   char *xl_type, char *xl_namefmt, size_t child_count,
+                   size_t sub_count, size_t start_count, xlator_t *trav)
+{
+    int i = 0;
+    int j = start_count;
+    xlator_t *xl = NULL;
+    char *volname = NULL;
+    int ret = -1;
+
+    if (child_count == 0)
+        goto out;
+    volname = volinfo->volname;
+
+    for (;; trav = trav->prev) {
+        /* Time to open a new parent for the next sub_count children */
+        if ((i % sub_count) == 0) {
+            xl = volgen_graph_add_nolink(graph, xl_type, xl_namefmt, volname,
+                                         j);
+            j++;
+        }
+
+        if (!xl) {
+            ret = -1;
+            goto out;
+        }
+
+        /* readdir-ahead parents are switched on explicitly; the option is
+         * (re)set on every iteration, not only when the parent is new */
+        if (strncmp(xl_type, "performance/readdir-ahead",
+                    SLEN("performance/readdir-ahead")) == 0) {
+            ret = xlator_set_fixed_option(xl, "performance.readdir-ahead",
+                                          "on");
+            if (ret)
+                goto out;
+        }
+
+        ret = volgen_xlator_link(xl, trav);
+        if (ret)
+            goto out;
+
+        i++;
+        if (i == child_count)
+            break;
+    }
+
+    ret = j - start_count;
+out:
+    return ret;
+}
+
+/* Link the first child_count xlators of the graph list under new parents,
+ * numbering the parents from start_count.
+ *
+ * The walk starts at first_of(graph), advances child_count - 1 steps via
+ * ->next, and hands that xlator to volgen_link_bricks(), which then walks
+ * back via ->prev.
+ *
+ * Returns -1 when child_count is 0, otherwise the result of
+ * volgen_link_bricks().
+ */
+static int
+volgen_link_bricks_from_list_tail_start(volgen_graph_t *graph,
+                                        glusterd_volinfo_t *volinfo,
+                                        char *xl_type, char *xl_namefmt,
+                                        size_t child_count, size_t sub_count,
+                                        size_t start_count)
+{
+    xlator_t *tail = NULL;
+    size_t steps = child_count;
+
+    if (steps == 0)
+        return -1;
+
+    tail = first_of(graph);
+    while (--steps)
+        tail = tail->next;
+
+    return volgen_link_bricks(graph, volinfo, xl_type, xl_namefmt, child_count,
+                              sub_count, start_count, tail);
+}
+
+/* Same as volgen_link_bricks_from_list_tail_start() with the parent
+ * numbering starting at 0.
+ *
+ * Returns -1 when child_count is 0, otherwise the result of
+ * volgen_link_bricks().
+ */
+static int
+volgen_link_bricks_from_list_tail(volgen_graph_t *graph,
+                                  glusterd_volinfo_t *volinfo, char *xl_type,
+                                  char *xl_namefmt, size_t child_count,
+                                  size_t sub_count)
+{
+    return volgen_link_bricks_from_list_tail_start(
+        graph, volinfo, xl_type, xl_namefmt, child_count, sub_count, 0);
+}
+
+/**
+ * This is the build graph function for user-serviceable snapshots.
+ * Generates snapview-client.
+ *
+ * Adds a protocol/client xlator named "<volname>-snapd-client" (remote
+ * subvolume "snapd-<volname>", names taken from volinfo->volname) and a
+ * features/snapview-client xlator named "<volname>-snapview-client" (from
+ * the volname argument). The xlator that was at the top of the graph on
+ * entry and the new protocol/client are then linked, in that order, as
+ * children of graph->graph.first.
+ *
+ * Returns 0 on success, -1 when an xlator cannot be created, or the
+ * non-zero result of a failed volgen_xlator_link().
+ */
+static int
+volgen_graph_build_snapview_client(volgen_graph_t *graph,
+                                   glusterd_volinfo_t *volinfo, char *volname,
+                                   dict_t *set_dict)
+{
+    int ret = 0;
+    xlator_t *prev_top = NULL;
+    xlator_t *prot_clnt = NULL;
+    xlator_t *svc = NULL;
+    char transt[16] = {
+        0,
+    };
+    char *svc_args[] = {"features/snapview-client", "%s-snapview-client"};
+    char subvol[1024] = {
+        0,
+    };
+    char xl_id[1024] = {
+        0,
+    };
+
+    /* Remember the current top before anything new is added */
+    prev_top = (xlator_t *)(graph->graph.first);
+
+    snprintf(subvol, sizeof(subvol), "snapd-%s", volinfo->volname);
+    snprintf(xl_id, sizeof(xl_id), "%s-snapd-client", volinfo->volname);
+
+    get_transport_type(volinfo, set_dict, transt, _gf_false);
+
+    prot_clnt = volgen_graph_build_client(graph, volinfo, NULL, NULL, subvol,
+                                          xl_id, transt, set_dict);
+    if (!prot_clnt) {
+        ret = -1;
+        goto out;
+    }
+
+    svc = volgen_graph_add_nolink(graph, svc_args[0], svc_args[1], volname);
+    if (!svc) {
+        ret = -1;
+        goto out;
+    }
+
+    /**
+     * Ordering the below two translators (prev_top & prot_clnt) is important
+     * as snapview client implementation is built on the policy that
+     * normal volume path goes to FIRST_CHILD and snap world operations
+     * goes to SECOND_CHILD
+     *
+     * NOTE(review): graph->graph.first is presumably the snapview-client
+     * xlator (svc) that was just added - confirm.
+     **/
+    ret = volgen_xlator_link(graph->graph.first, prev_top);
+    if (ret) {
+        gf_msg(THIS->name, GF_LOG_ERROR, 0, GD_MSG_XLATOR_LINK_FAIL,
+               "failed to link the "
+               "snapview-client to distribute");
+        goto out;
+    }
+
+    ret = volgen_xlator_link(graph->graph.first, prot_clnt);
+    if (ret) {
+        gf_msg(THIS->name, GF_LOG_ERROR, 0, GD_MSG_XLATOR_LINK_FAIL,
+               "failed to link the "
+               "snapview-client to snapview-server");
+        goto out;
+    }
+
+out:
+    return ret;
+}
+
+/* Tell whether the brick behind a protocol/client xlator is decommissioned.
+ *
+ * Reads the xlator's "remote-host" and "remote-subvolume" fixed options
+ * and passes them to glusterd_is_brick_decommissioned().
+ *
+ * Returns that function's result, or _gf_false when either option cannot
+ * be read (an error is logged in that case).
+ */
+gf_boolean_t
+_xl_is_client_decommissioned(xlator_t *xl, glusterd_volinfo_t *volinfo)
+{
+    int ret = 0;
+    gf_boolean_t decommissioned = _gf_false;
+    char *hostname = NULL;
+    char *path = NULL;
+
+    GF_ASSERT(!strcmp(xl->type, "protocol/client"));
+    ret = xlator_get_fixed_option(xl, "remote-host", &hostname);
+    if (ret) {
+        GF_ASSERT(0);
+        gf_msg("glusterd", GF_LOG_ERROR, 0, GD_MSG_REMOTE_HOST_GET_FAIL,
+               "Failed to get remote-host "
+               "from client %s",
+               xl->name);
+        goto out;
+    }
+    ret = xlator_get_fixed_option(xl, "remote-subvolume", &path);
+    if (ret) {
+        GF_ASSERT(0);
+        /* Name the option that actually failed; the message used to say
+         * "remote-host" here as well. */
+        gf_msg("glusterd", GF_LOG_ERROR, 0, GD_MSG_REMOTE_HOST_GET_FAIL,
+               "Failed to get remote-subvolume "
+               "from client %s",
+               xl->name);
+        goto out;
+    }
+
+    decommissioned = glusterd_is_brick_decommissioned(volinfo, hostname, path);
+out:
+    return decommissioned;
+}
+
+/* Recursively check whether any protocol/client xlator at or below xl
+ * refers to a decommissioned brick.
+ *
+ * A NULL xl yields _gf_false. A protocol/client xlator is checked
+ * directly; for any other xlator the children are searched depth-first
+ * and the search stops at the first hit.
+ */
+gf_boolean_t
+_xl_has_decommissioned_clients(xlator_t *xl, glusterd_volinfo_t *volinfo)
+{
+    xlator_list_t *child = NULL;
+    gf_boolean_t found = _gf_false;
+
+    if (!xl)
+        return _gf_false;
+
+    if (strcmp(xl->type, "protocol/client") == 0)
+        return _xl_is_client_decommissioned(xl, volinfo);
+
+    /* this can go into 2 depths if the volume type
+       is stripe-replicate */
+    for (child = xl->children; child; child = child->next) {
+        found = _xl_has_decommissioned_clients(child->xlator, volinfo);
+        if (found)
+            return found;
+    }
+
+    return found;
+}
+
+/* Build a comma-separated list of the names of dht's children that
+ * contain a decommissioned client.
+ *
+ * *children is set to NULL when no child qualifies; otherwise it points
+ * to a zero-initialised 16 KB buffer holding the list.
+ *
+ * Returns 0 on success and -1 when the buffer cannot be allocated or the
+ * list would not fit into it. On failure *children may still hold the
+ * allocated buffer.
+ */
+static int
+_graph_get_decommissioned_children(xlator_t *dht, glusterd_volinfo_t *volinfo,
+                                   char **children)
+{
+    int ret = -1;
+    xlator_list_t *xl_child = NULL;
+    xlator_t *cxl = NULL;
+    gf_boolean_t comma = _gf_false;
+    const size_t capacity = 16 * GF_UNIT_KB;
+    size_t used = 0;
+    size_t name_len = 0;
+
+    *children = NULL;
+    for (xl_child = dht->children; xl_child; xl_child = xl_child->next) {
+        cxl = xl_child->xlator;
+        if (!_xl_has_decommissioned_clients(cxl, volinfo))
+            continue;
+
+        if (!*children) {
+            *children = GF_CALLOC(capacity, 1, gf_common_mt_char);
+            if (!*children)
+                goto out;
+        }
+
+        /* Separator (if any) + name + terminating NUL must fit; the
+         * former unbounded strcat() could write past the buffer. */
+        name_len = strlen(cxl->name);
+        if (used + (comma ? 1 : 0) + name_len + 1 > capacity) {
+            gf_log("glusterd", GF_LOG_ERROR,
+                   "list of decommissioned children does not fit in "
+                   "%zu bytes",
+                   capacity);
+            goto out;
+        }
+
+        if (comma)
+            (*children)[used++] = ',';
+        /* The buffer is zero-filled, so the list stays NUL-terminated */
+        memcpy(*children + used, cxl->name, name_len);
+        used += name_len;
+        comma = _gf_true;
+    }
+    ret = 0;
+out:
+    return ret;
+}
+
+/* Put a performance/readdir-ahead xlator on top of each of the first
+ * child_count xlators of the graph list.
+ *
+ * Nothing is added (and 0 is returned) for quotad and snapd graphs, or
+ * when VKEY_PARALLEL_READDIR is not enabled on the volume. Otherwise the
+ * result of volgen_link_bricks_from_list_tail() is returned.
+ */
+static int
+volgen_graph_build_readdir_ahead(volgen_graph_t *graph,
+                                 glusterd_volinfo_t *volinfo,
+                                 size_t child_count)
+{
+    if (graph->type == GF_QUOTAD || graph->type == GF_SNAPD)
+        return 0;
+
+    if (!glusterd_volinfo_get_boolean(volinfo, VKEY_PARALLEL_READDIR))
+        return 0;
+
+    return volgen_link_bricks_from_list_tail(
+        graph, volinfo, "performance/readdir-ahead", "%s-readdir-ahead-%d",
+        child_count, 1);
+}
+
+/* Put a single distribute-type xlator on top of the first child_count
+ * xlators of the graph list.
+ *
+ * The xlator type is "cluster/distribute" unless "cluster.nufa" or
+ * "cluster.switch" is set in volinfo->dict (both together is an error).
+ * The xlator is named "<volname>" for quotad and "<volname>-dht"
+ * otherwise. Children that contain decommissioned clients are recorded in
+ * the "decommissioned-bricks" option.
+ *
+ * Returns 0 on success; -1 or another non-zero value on failure.
+ */
+static int
+volgen_graph_build_dht_cluster(volgen_graph_t *graph,
+                               glusterd_volinfo_t *volinfo, size_t child_count,
+                               gf_boolean_t is_quotad)
+{
+    int32_t clusters = 0;
+    int ret = -1;
+    char *decommissioned_children = NULL;
+    xlator_t *dht = NULL;
+    char *voltype = "cluster/distribute";
+    char *name_fmt = NULL;
+
+    /* NUFA and Switch section */
+    if (dict_get_str_boolean(volinfo->dict, "cluster.nufa", 0) &&
+        dict_get_str_boolean(volinfo->dict, "cluster.switch", 0)) {
+        gf_msg(THIS->name, GF_LOG_ERROR, errno, GD_MSG_DICT_GET_FAILED,
+               "nufa and switch cannot be set together");
+        ret = -1;
+        goto out;
+    }
+
+    /* Check for NUFA volume option, and change the voltype */
+    if (dict_get_str_boolean(volinfo->dict, "cluster.nufa", 0))
+        voltype = "cluster/nufa";
+
+    /* Check for switch volume option, and change the voltype */
+    if (dict_get_str_boolean(volinfo->dict, "cluster.switch", 0))
+        voltype = "cluster/switch";
+
+    if (is_quotad)
+        name_fmt = "%s";
+    else
+        name_fmt = "%s-dht";
+
+    /* sub_count == child_count: all children go under one parent */
+    clusters = volgen_link_bricks_from_list_tail(
+        graph, volinfo, voltype, name_fmt, child_count, child_count);
+    if (clusters < 0)
+        goto out;
+
+    /* The xlator just created is now the first one of the graph */
+    dht = first_of(graph);
+    ret = _graph_get_decommissioned_children(dht, volinfo,
+                                             &decommissioned_children);
+    if (ret)
+        goto out;
+    if (decommissioned_children) {
+        ret = xlator_set_fixed_option(dht, "decommissioned-bricks",
+                                      decommissioned_children);
+        if (ret)
+            goto out;
+    }
+    ret = 0;
+out:
+    /* The list buffer is owned here on success and on failure alike */
+    GF_FREE(decommissioned_children);
+    return ret;
+}
+
+/* Group the brick clients into cluster/disperse xlators
+ * ("<volname>-disperse-<n>", disperse_count children each) and set the
+ * "redundancy" option on every one of them.
+ *
+ * Returns the number of disperse xlators created, a negative value when
+ * linking fails, or -1 when the option cannot be set.
+ */
+static int
+volgen_graph_build_ec_clusters(volgen_graph_t *graph,
+                               glusterd_volinfo_t *volinfo)
+{
+    char *ec_type = "cluster/disperse";
+    char *ec_namefmt = "%s-disperse-%d";
+    char redundancy[32] = {0};
+    xlator_t *ec_xl = NULL;
+    int nclusters = 0;
+    int remaining = 0;
+
+    nclusters = volgen_link_bricks_from_list_tail_start(
+        graph, volinfo, ec_type, ec_namefmt, volinfo->brick_count,
+        volinfo->disperse_count, 0);
+    if (nclusters < 0)
+        return nclusters;
+
+    sprintf(redundancy, "%d", volinfo->redundancy_count);
+
+    /* The new disperse xlators are the first nclusters entries of the
+     * graph list. */
+    ec_xl = first_of(graph);
+    for (remaining = nclusters; remaining > 0; remaining--) {
+        if (xlator_set_fixed_option(ec_xl, "redundancy", redundancy))
+            return -1;
+
+        ec_xl = ec_xl->next;
+    }
+
+    return nclusters;
+}
+
+/* Set the "afr-pending-xattr" option on each of the first `clusters`
+ * xlators of the graph list.
+ *
+ * For every group of replica_count bricks (in volinfo->bricks order) the
+ * option value is the comma-separated list of their brick ids. When the
+ * volume has exactly one thin arbiter per subvolume, the id of the
+ * matching thin-arbiter brick is appended; from GD_OP_VERSION_7_3 on it
+ * is suffixed with ".<volume-uuid>".
+ *
+ * Nothing is done (and 0 is returned) when the cluster op-version is
+ * below GD_OP_VERSION_3_9_0.
+ *
+ * Returns 0 on success. Returns -1 when THIS or its private data is NULL
+ * or an allocation fails, and the non-zero result of
+ * xlator_set_fixed_option() when setting the option fails. (The
+ * validation and allocation failures used to be reported as success.)
+ */
+static int
+set_afr_pending_xattrs_option(volgen_graph_t *graph,
+                              glusterd_volinfo_t *volinfo, int clusters)
+{
+    xlator_t *xlator = NULL;
+    xlator_t **afr_xlators_list = NULL;
+    xlator_t *this = NULL;
+    glusterd_conf_t *conf = NULL;
+    glusterd_brickinfo_t *brick = NULL;
+    glusterd_brickinfo_t *ta_brick = NULL;
+    char *ptr = NULL;
+    int i = 0;
+    int index = -1;
+    int ret = -1;
+    char *afr_xattrs_list = NULL;
+    int list_size = -1;
+    int ta_brick_index = 0;
+    int subvol_index = 0;
+
+    this = THIS;
+    GF_VALIDATE_OR_GOTO("glusterd", this, out);
+    conf = this->private;
+    GF_VALIDATE_OR_GOTO(this->name, conf, out);
+
+    if (conf->op_version < GD_OP_VERSION_3_9_0)
+        return 0;
+
+    /* (brick_id x rep.count) + (rep.count-1 commas) + NULL*/
+    list_size = (1024 * volinfo->replica_count) + (volinfo->replica_count - 1) +
+                1;
+    afr_xattrs_list = GF_CALLOC(1, list_size, gf_common_mt_char);
+    if (!afr_xattrs_list)
+        goto out;
+
+    ptr = afr_xattrs_list;
+    afr_xlators_list = GF_CALLOC(clusters, sizeof(xlator_t *),
+                                 gf_common_mt_xlator_t);
+    if (!afr_xlators_list)
+        goto out;
+
+    xlator = first_of(graph);
+
+    /* Store the xlators in the reverse of their graph-list order, so that
+     * array index 0 holds the one furthest down the list. */
+    for (i = 0, index = clusters - 1; i < clusters; i++) {
+        afr_xlators_list[index--] = xlator;
+        xlator = xlator->next;
+    }
+
+    /* i now counts bricks within the current replica group (1-based);
+     * index selects the xlator that receives the finished list. */
+    i = 1;
+    index = 0;
+
+    cds_list_for_each_entry(brick, &volinfo->bricks, brick_list)
+    {
+        if (index == clusters)
+            break;
+        /* ptr points at the (zeroed) position where this id starts */
+        strncat(ptr, brick->brick_id, strlen(brick->brick_id));
+        if (i == volinfo->replica_count) {
+            /* add ta client xlator in afr-pending-xattrs before making entries
+             * for client xlators in volfile.
+             * ta client xlator indexes are - 2, 5, 8 depending on the index of
+             * subvol. e.g- For first subvol ta client xlator id is volname-ta-2
+             * For pending-xattr, ta name would be
+             * 'volname-ta-2.{{volume-uuid}}' from GD_OP_VERSION_7_3.
+             */
+            ta_brick_index = 0;
+            if (volinfo->thin_arbiter_count == 1) {
+                ptr[strlen(brick->brick_id)] = ',';
+                cds_list_for_each_entry(ta_brick, &volinfo->ta_bricks,
+                                        brick_list)
+                {
+                    if (ta_brick_index == subvol_index) {
+                        break;
+                    }
+                    ta_brick_index++;
+                }
+                if (conf->op_version < GD_OP_VERSION_7_3) {
+                    strncat(ptr, ta_brick->brick_id,
+                            strlen(ta_brick->brick_id));
+                } else {
+                    char ta_volname[PATH_MAX] = "";
+                    int len = snprintf(ta_volname, PATH_MAX, "%s.%s",
+                                       ta_brick->brick_id,
+                                       uuid_utoa(volinfo->volume_id));
+                    strncat(ptr, ta_volname, len);
+                }
+            }
+
+            ret = xlator_set_fixed_option(afr_xlators_list[index++],
+                                          "afr-pending-xattr", afr_xattrs_list);
+            if (ret)
+                goto out;
+            /* Start a fresh list for the next replica group */
+            memset(afr_xattrs_list, 0, list_size);
+            ptr = afr_xattrs_list;
+            i = 1;
+            subvol_index++;
+            continue;
+        }
+        /* Not the last brick of the group: separate and move on */
+        ptr[strlen(brick->brick_id)] = ',';
+        ptr += strlen(brick->brick_id) + 1;
+        i++;
+    }
+
+    /* Every option that had to be set was set successfully */
+    ret = 0;
+
+out:
+    GF_FREE(afr_xattrs_list);
+    GF_FREE(afr_xlators_list);
+    return ret;
+}
+
+/* Set the "volume-id" option (the volume's uuid as text) on each of the
+ * first `clusters` xlators of the graph list.
+ *
+ * Nothing is done (and 0 is returned) when the cluster op-version is
+ * below GD_OP_VERSION_9_0.
+ *
+ * Returns the result of the last xlator_set_fixed_option() call (0 on
+ * success, stopping at the first failure), or -1 when THIS or its private
+ * data is NULL or when `clusters` is not positive.
+ */
+static int
+set_volfile_id_option(volgen_graph_t *graph, glusterd_volinfo_t *volinfo,
+                      int clusters)
+{
+    xlator_t *this = NULL;
+    glusterd_conf_t *conf = NULL;
+    xlator_t *cur = NULL;
+    int remaining = clusters;
+    int ret = -1;
+
+    this = THIS;
+    GF_VALIDATE_OR_GOTO("glusterd", this, out);
+    conf = this->private;
+    GF_VALIDATE_OR_GOTO(this->name, conf, out);
+
+    if (conf->op_version < GD_OP_VERSION_9_0)
+        return 0;
+
+    cur = first_of(graph);
+    while (remaining-- > 0) {
+        ret = xlator_set_fixed_option(cur, "volume-id",
+                                      uuid_utoa(volinfo->volume_id));
+        if (ret)
+            break;
+
+        cur = cur->next;
+    }
+
+out:
+    return ret;
+}
+
+/*
+ * Link the bricks of a replicate volume under "cluster/replicate" xlators
+ * and set the per-cluster AFR options on them.
+ *
+ * After linking, the pending-xattr and volume-id options are set through
+ * set_afr_pending_xattrs_option() and set_volfile_id_option(). Then, walking
+ * the graph from first_of(graph):
+ *   - "arbiter-count" is set on each cluster when volinfo->arbiter_count is
+ *     non-zero;
+ *   - "thin-arbiter" is set to "<hostname>:<path>" of the i-th entry of
+ *     volinfo->ta_bricks when volinfo->thin_arbiter_count is 1.
+ *
+ * Returns the value obtained from volgen_link_bricks_from_list_tail() (the
+ * cluster count), or -1 on any failure.
+ */
+static int
+volgen_graph_build_afr_clusters(volgen_graph_t *graph,
+                                glusterd_volinfo_t *volinfo)
+{
+    int i = 0;
+    int ret = 0;
+    int clusters = 0;
+    char *replicate_type = "cluster/replicate";
+    char *replicate_name = "%s-replicate-%d";
+    xlator_t *afr = NULL;
+    char option[32] = {0};
+    glusterd_brickinfo_t *ta_brick = NULL;
+    glusterd_brickinfo_t *ta_match = NULL;
+    int ta_brick_index = 0;
+    int ta_replica_offset = 0;
+    int ta_brick_offset = 0;
+    char ta_option[4096] = {
+        0,
+    };
+
+    /* In thin-arbiter case brick count and replica count remain same
+     * but due to additional entries of ta client xlators in the volfile,
+     * GD1 is manipulated to include these client xlators while linking them to
+     * afr/cluster entry in the volfile.
+     */
+    if (volinfo->thin_arbiter_count == 1) {
+        ta_replica_offset = 1;
+        ta_brick_offset = volinfo->subvol_count;
+    }
+
+    clusters = volgen_link_bricks_from_list_tail(
+        graph, volinfo, replicate_type, replicate_name,
+        volinfo->brick_count + ta_brick_offset,
+        volinfo->replica_count + ta_replica_offset);
+
+    if (clusters < 0)
+        goto out;
+
+    ret = set_afr_pending_xattrs_option(graph, volinfo, clusters);
+    if (ret) {
+        clusters = -1;
+        goto out;
+    }
+
+    ret = set_volfile_id_option(graph, volinfo, clusters);
+    if (ret) {
+        clusters = -1;
+        goto out;
+    }
+
+    if (!volinfo->arbiter_count && !volinfo->thin_arbiter_count)
+        goto out;
+
+    afr = first_of(graph);
+
+    if (volinfo->arbiter_count) {
+        /* Bounded formatting instead of sprintf(). */
+        snprintf(option, sizeof(option), "%d", volinfo->arbiter_count);
+        for (i = 0; i < clusters; i++) {
+            ret = xlator_set_fixed_option(afr, "arbiter-count", option);
+            if (ret) {
+                clusters = -1;
+                goto out;
+            }
+
+            afr = afr->next;
+        }
+    }
+
+    if (volinfo->thin_arbiter_count == 1) {
+        for (i = 0; i < clusters; i++) {
+            /* Locate the i-th thin-arbiter brick. When the list runs out
+             * before index i, the iteration cursor no longer refers to a
+             * real brick, so the match is tracked separately and only that
+             * pointer is dereferenced.
+             */
+            ta_match = NULL;
+            ta_brick_index = 0;
+            cds_list_for_each_entry(ta_brick, &volinfo->ta_bricks, brick_list)
+            {
+                if (ta_brick_index == i) {
+                    ta_match = ta_brick;
+                    break;
+                }
+                ta_brick_index++;
+            }
+            if (!ta_match) {
+                gf_msg("glusterd", GF_LOG_ERROR, 0,
+                       GD_MSG_VOLUME_INCONSISTENCY,
+                       "volume inconsistency: "
+                       "no thin-arbiter brick for replica set %d",
+                       i);
+                clusters = -1;
+                goto out;
+            }
+            snprintf(ta_option, sizeof(ta_option), "%s:%s", ta_match->hostname,
+                     ta_match->path);
+            ret = xlator_set_fixed_option(afr, "thin-arbiter", ta_option);
+            if (ret) {
+                clusters = -1;
+                goto out;
+            }
+            afr = afr->next;
+        }
+    }
+out:
+    return clusters;
+}
+
+/*
+ * Build the clustering layers of a volume graph: the replicate or disperse
+ * clusters (skipped when dist_leaf_count is 1), then the readdir-ahead
+ * layer, then the distribute cluster on top.
+ *
+ * Returns 0 on success. Returns -1 when dist_leaf_count is 0, the volume
+ * type is not recognised, a cluster builder reports a negative count, or
+ * brick_count / dist_leaf_count is 0. Otherwise the result of
+ * volgen_graph_build_dht_cluster() is returned as is.
+ */
+static int
+volume_volgen_graph_build_clusters(volgen_graph_t *graph,
+                                   glusterd_volinfo_t *volinfo,
+                                   gf_boolean_t is_quotad)
+{
+    int nr_clusters = 0;
+    int nr_dht_children = 0;
+
+    if (volinfo->dist_leaf_count == 0)
+        return -1;
+
+    /* A leaf count of 1 means plain distribute: no afr/ec layer needed. */
+    if (volinfo->dist_leaf_count != 1) {
+        /* All other cases, it will have one or the other cluster type */
+        if (volinfo->type == GF_CLUSTER_TYPE_REPLICATE) {
+            nr_clusters = volgen_graph_build_afr_clusters(graph, volinfo);
+        } else if (volinfo->type == GF_CLUSTER_TYPE_DISPERSE) {
+            nr_clusters = volgen_graph_build_ec_clusters(graph, volinfo);
+        } else {
+            gf_msg("glusterd", GF_LOG_ERROR, 0, GD_MSG_VOLUME_INCONSISTENCY,
+                   "volume inconsistency: "
+                   "unrecognized clustering type");
+            return -1;
+        }
+
+        if (nr_clusters < 0)
+            return -1;
+    }
+
+    nr_dht_children = volinfo->brick_count / volinfo->dist_leaf_count;
+    if (nr_dht_children == 0)
+        return -1;
+
+    nr_clusters = volgen_graph_build_readdir_ahead(graph, volinfo,
+                                                   nr_dht_children);
+    if (nr_clusters < 0)
+        return -1;
+
+    return volgen_graph_build_dht_cluster(graph, volinfo, nr_dht_children,
+                                          is_quotad);
+}
+
+/*
+ * Rescale the readdir-ahead cache limit and request size in set_dict for a
+ * volume with more than one distribute subvolume.
+ *
+ * Nothing is changed (and 0 is returned) when dist_leaf_count is 0, when
+ * brick_count / dist_leaf_count is <= 1, when the graph type is GF_QUOTAD or
+ * GF_SNAPD, or when either VKEY_PARALLEL_READDIR or VKEY_READDIR_AHEAD is
+ * off.
+ *
+ * Otherwise the cache limit is divided by the distribute count. If the
+ * result is smaller than the request size, the request size is lowered to
+ * it, with both clamped to 4KB as a floor. The new values are written back
+ * into set_dict as "<bytes>B".
+ *
+ * Returns 0 on success, a negative value on lookup, parse or dict-set
+ * failure, and -1 when either configured size is 0.
+ */
+static int
+client_graph_set_rda_options(volgen_graph_t *graph, glusterd_volinfo_t *volinfo,
+                             dict_t *set_dict)
+{
+    char *rda_cache_s = NULL;
+    int32_t ret = 0;
+    uint64_t rda_cache_size = 0;
+    char *rda_req_s = NULL;
+    uint64_t rda_req_size = 0;
+    uint64_t new_cache_size = 0;
+    char new_cache_size_str[50] = {
+        0,
+    };
+    char new_req_size_str[50] = {
+        0,
+    };
+    int dist_count = 0;
+
+    /* Guard the division below; with no leaf count there is nothing to
+     * distribute over, hence nothing to rescale.
+     */
+    if (!volinfo->dist_leaf_count)
+        goto out;
+
+    dist_count = volinfo->brick_count / volinfo->dist_leaf_count;
+    if (dist_count <= 1)
+        goto out;
+
+    if (graph->type == GF_QUOTAD || graph->type == GF_SNAPD ||
+        !glusterd_volinfo_get_boolean(volinfo, VKEY_PARALLEL_READDIR) ||
+        !glusterd_volinfo_get_boolean(volinfo, VKEY_READDIR_AHEAD))
+        goto out;
+
+    /* glusterd_volinfo_get() will get the default value if nothing set
+     * explicitly. Hence it is important to check set_dict before checking
+     * glusterd_volinfo_get, so that we consider key value of the in
+     * progress volume set option.
+     */
+    ret = dict_get_str_sizen(set_dict, VKEY_RDA_CACHE_LIMIT, &rda_cache_s);
+    if (ret < 0) {
+        ret = glusterd_volinfo_get(volinfo, VKEY_RDA_CACHE_LIMIT, &rda_cache_s);
+        if (ret < 0)
+            goto out;
+    }
+    ret = gf_string2bytesize_uint64(rda_cache_s, &rda_cache_size);
+    if (ret < 0) {
+        set_graph_errstr(
+            graph, "invalid number format in option " VKEY_RDA_CACHE_LIMIT);
+        goto out;
+    }
+
+    ret = dict_get_str_sizen(set_dict, VKEY_RDA_REQUEST_SIZE, &rda_req_s);
+    if (ret < 0) {
+        ret = glusterd_volinfo_get(volinfo, VKEY_RDA_REQUEST_SIZE, &rda_req_s);
+        if (ret < 0)
+            goto out;
+    }
+    ret = gf_string2bytesize_uint64(rda_req_s, &rda_req_size);
+    if (ret < 0) {
+        set_graph_errstr(
+            graph, "invalid number format in option " VKEY_RDA_REQUEST_SIZE);
+        goto out;
+    }
+
+    if (rda_cache_size == 0 || rda_req_size == 0) {
+        set_graph_errstr(graph, "Value cannot be 0");
+        ret = -1;
+        goto out;
+    }
+
+    new_cache_size = rda_cache_size / dist_count;
+    if (new_cache_size < rda_req_size) {
+        /* The per-subvolume cache must hold at least one request. */
+        if (new_cache_size < 4 * 1024)
+            new_cache_size = rda_req_size = 4 * 1024;
+        else
+            rda_req_size = new_cache_size;
+
+        /* Both sizes are uint64_t, so the unsigned conversion is the
+         * matching one; the signed form misprints values above INT64_MAX.
+         */
+        snprintf(new_req_size_str, sizeof(new_req_size_str), "%" PRIu64 "%s",
+                 rda_req_size, "B");
+        ret = dict_set_dynstr_with_alloc(set_dict, VKEY_RDA_REQUEST_SIZE,
+                                         new_req_size_str);
+        if (ret < 0)
+            goto out;
+    }
+
+    snprintf(new_cache_size_str, sizeof(new_cache_size_str), "%" PRIu64 "%s",
+             new_cache_size, "B");
+    ret = dict_set_dynstr_with_alloc(set_dict, VKEY_RDA_CACHE_LIMIT,
+                                     new_cache_size_str);
+    if (ret < 0)
+        goto out;
+
+out:
+    return ret;
+}
+
+/*
+ * Select and apply the performance-translator option handler for a client
+ * graph.
+ *
+ * Precedence:
+ *   1. "gfproxy-client" set   -> gfproxy_client_perfxl_option_handler
+ *   2. "gfproxy-server" set   -> gfproxy_server_perfxl_option_handler
+ *   3. otherwise the readdir-ahead sizes are adjusted first, then
+ *      (BUILD_GNFS only) "nfs-volume-file" present -> nfsperfxl_option_handler
+ *      with the volume name as parameter, else perfxl_option_handler.
+ *
+ * Returns the result of the selected volgen_graph_set_options_generic()
+ * call, or the negative result of client_graph_set_rda_options().
+ */
+static int
+client_graph_set_perf_options(volgen_graph_t *graph,
+                              glusterd_volinfo_t *volinfo, dict_t *set_dict)
+{
+    int rda_ret = 0;
+
+    /* gfproxy-client gets its own set of performance translators */
+    if (dict_get_str_boolean(set_dict, "gfproxy-client", 0) == 1)
+        return volgen_graph_set_options_generic(
+            graph, set_dict, volinfo, &gfproxy_client_perfxl_option_handler);
+
+    /* ... and so does gfproxy-server */
+    if (dict_get_str_boolean(set_dict, "gfproxy-server", 0) == 1)
+        return volgen_graph_set_options_generic(
+            graph, set_dict, volinfo, &gfproxy_server_perfxl_option_handler);
+
+    rda_ret = client_graph_set_rda_options(graph, volinfo, set_dict);
+    if (rda_ret < 0)
+        return rda_ret;
+
+#ifdef BUILD_GNFS
+    /* NFS volfiles go through the NFS-specific perf handler so that NFS
+     * doesn't get performance translators by default for a volume.
+     */
+    if (dict_get_sizen(set_dict, "nfs-volume-file"))
+        return volgen_graph_set_options_generic(graph, set_dict,
+                                                volinfo->volname,
+                                                &nfsperfxl_option_handler);
+#endif
+
+    return volgen_graph_set_options_generic(graph, set_dict, volinfo,
+                                            &perfxl_option_handler);
+}
+
+/*
+ * Apply the logging- and thread-related option handlers to the graph.
+ *
+ * Each handler is run through volgen_graph_set_options_generic() with the
+ * literal "client" as the handler parameter. A failing handler only produces
+ * a warning (mentioning `identifier` where the message has a %s); the
+ * remaining handlers still run.
+ *
+ * Always returns 0, so callers cannot observe a handler failure through the
+ * return value.
+ */
+static int
+graph_set_generic_options(xlator_t *this, volgen_graph_t *graph,
+                          dict_t *set_dict, char *identifier)
+{
+    int ret = 0;
+
+    /* log level */
+    ret = volgen_graph_set_options_generic(graph, set_dict, "client",
+                                           &loglevel_option_handler);
+
+    if (ret)
+        gf_msg(this->name, GF_LOG_WARNING, 0, GD_MSG_GRAPH_SET_OPT_FAIL,
+               "changing %s log level"
+               " failed",
+               identifier);
+
+    /* syslog level */
+    ret = volgen_graph_set_options_generic(graph, set_dict, "client",
+                                           &sys_loglevel_option_handler);
+    if (ret)
+        gf_msg(this->name, GF_LOG_WARNING, 0, GD_MSG_GRAPH_SET_OPT_FAIL,
+               "changing %s syslog "
+               "level failed",
+               identifier);
+
+    /* logger */
+    ret = volgen_graph_set_options_generic(graph, set_dict, "client",
+                                           &logger_option_handler);
+
+    if (ret)
+        gf_msg(this->name, GF_LOG_WARNING, 0, GD_MSG_GRAPH_SET_OPT_FAIL,
+               "changing %s logger"
+               " failed",
+               identifier);
+
+    /* log format */
+    ret = volgen_graph_set_options_generic(graph, set_dict, "client",
+                                           &log_format_option_handler);
+    if (ret)
+        gf_msg(this->name, GF_LOG_WARNING, 0, GD_MSG_GRAPH_SET_OPT_FAIL,
+               "changing %s log format"
+               " failed",
+               identifier);
+
+    /* log buffer size */
+    ret = volgen_graph_set_options_generic(graph, set_dict, "client",
+                                           &log_buf_size_option_handler);
+    if (ret)
+        gf_msg(this->name, GF_LOG_WARNING, 0, GD_MSG_GRAPH_SET_OPT_FAIL,
+               "Failed to change "
+               "log-buf-size option");
+
+    /* log flush timeout */
+    ret = volgen_graph_set_options_generic(graph, set_dict, "client",
+                                           &log_flush_timeout_option_handler);
+    if (ret)
+        gf_msg(this->name, GF_LOG_WARNING, 0, GD_MSG_GRAPH_SET_OPT_FAIL,
+               "Failed to change "
+               "log-flush-timeout option");
+
+    /* local-time logging */
+    ret = volgen_graph_set_options_generic(
+        graph, set_dict, "client", &log_localtime_logging_option_handler);
+    if (ret)
+        gf_msg(this->name, GF_LOG_WARNING, 0, GD_MSG_GRAPH_SET_OPT_FAIL,
+               "Failed to change "
+               "log-localtime-logging option");
+
+    /* threads */
+    ret = volgen_graph_set_options_generic(graph, set_dict, "client",
+                                           &threads_option_handler);
+
+    if (ret)
+        gf_msg(this->name, GF_LOG_WARNING, 0, GD_MSG_GRAPH_SET_OPT_FAIL,
+               "changing %s threads failed", identifier);
+
+    /* Failures above were only logged; the result is success regardless. */
+    return 0;
+}
+
+/*
+ * Graph-builder callback that assembles a client volfile graph.
+ *
+ * In order, it:
+ *   - builds the protocol clients and cluster layers, or, when
+ *     "gfproxy-client" is set, a single client xlator pointing at
+ *     "gfproxyd-<volname>";
+ *   - adds the optional feature xlators (cloudsync, shard, utime, read-only
+ *     for snapshot volumes, cdc, quiesce for gfproxy clients, quota on the
+ *     minimum op-version);
+ *   - reconciles "performance.read-after-open" with "server.root-squash";
+ *   - derives "client.send-gids" from "server.manage-gids";
+ *   - applies the performance options, the snapview client (USS) and the
+ *     debug xlators;
+ *   - tops the graph with "debug/io-stats" and applies the generic
+ *     logging options.
+ *
+ * graph    - graph being built
+ * volinfo  - volume the graph is for
+ * set_dict - effective option dictionary; some keys are modified here
+ * param    - passed through to volgen_graph_build_clients()
+ *
+ * Returns 0 on success and non-zero (-1 on the paths set explicitly here)
+ * on failure.
+ */
+static int
+client_graph_builder(volgen_graph_t *graph, glusterd_volinfo_t *volinfo,
+                     dict_t *set_dict, void *param)
+{
+    int ret = 0;
+    xlator_t *xl = NULL;
+    char *volname = NULL;
+    glusterd_conf_t *conf = THIS->private;
+    char *tmp = NULL;
+    gf_boolean_t var = _gf_false;
+    gf_boolean_t ob = _gf_false;
+    int uss_enabled = -1;
+    xlator_t *this = THIS;
+    char *subvol = NULL;
+    size_t namelen = 0;
+    char *xl_id = NULL;
+    gf_boolean_t gfproxy_clnt = _gf_false;
+
+    GF_ASSERT(this);
+    GF_ASSERT(conf);
+
+    ret = dict_get_str_boolean(set_dict, "gfproxy-client", 0);
+    if (ret == -1)
+        goto out;
+
+    volname = volinfo->volname;
+    if (ret == 0) {
+        ret = volgen_graph_build_clients(graph, volinfo, set_dict, param);
+        if (ret)
+            goto out;
+
+        ret = volume_volgen_graph_build_clusters(graph, volinfo, _gf_false);
+        if (ret == -1)
+            goto out;
+    } else {
+        gfproxy_clnt = _gf_true;
+        namelen = strlen(volinfo->volname) + SLEN("gfproxyd-") + 1;
+        subvol = alloca(namelen);
+        snprintf(subvol, namelen, "gfproxyd-%s", volinfo->volname);
+
+        namelen = strlen(volinfo->volname) + SLEN("-gfproxy-client") + 1;
+        xl_id = alloca(namelen);
+        snprintf(xl_id, namelen, "%s-gfproxy-client", volinfo->volname);
+        /* NOTE(review): the result of volgen_graph_build_client() is not
+         * checked here - confirm whether a failure should abort the build.
+         */
+        volgen_graph_build_client(graph, volinfo, NULL, NULL, subvol, xl_id,
+                                  "tcp", set_dict);
+    }
+
+    ret = dict_get_str_boolean(set_dict, "features.cloudsync", _gf_false);
+    if (ret == -1)
+        goto out;
+
+    if (ret) {
+        xl = volgen_graph_add(graph, "features/cloudsync", volname);
+        if (!xl) {
+            ret = -1;
+            goto out;
+        }
+    }
+
+    ret = dict_get_str_boolean(set_dict, "features.shard", _gf_false);
+    if (ret == -1)
+        goto out;
+
+    if (ret) {
+        xl = volgen_graph_add(graph, "features/shard", volname);
+        if (!xl) {
+            ret = -1;
+            goto out;
+        }
+    }
+    /* a. ret will be -1 if features.ctime is not set in the volinfo->dict which
+     * means ctime should be loaded into the graph.
+     * b. ret will be 1 if features.ctime is explicitly turned on through
+     * volume set and in that case ctime should be loaded into the graph.
+     * c. ret will be 0 if features.ctime is explicitly turned off and in that
+     * case ctime shouldn't be loaded into the graph.
+     */
+    ret = dict_get_str_boolean(set_dict, "features.ctime", -1);
+    if (conf->op_version >= GD_OP_VERSION_5_0 && ret) {
+        xl = volgen_graph_add(graph, "features/utime", volname);
+        if (!xl) {
+            ret = -1;
+            goto out;
+        }
+    }
+
+    /* As of now snapshot volume is read-only. Read-only xlator is loaded
+     * in client graph so that AFR & DHT healing can be done in server.
+     */
+    if (volinfo->is_snap_volume) {
+        xl = volgen_graph_add(graph, "features/read-only", volname);
+        if (!xl) {
+            gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_GRAPH_FEATURE_ADD_FAIL,
+                   "Failed to add "
+                   "read-only feature to the graph of %s "
+                   "snapshot with %s origin volume",
+                   volname, volinfo->parent_volname);
+            ret = -1;
+            goto out;
+        }
+        ret = xlator_set_fixed_option(xl, "read-only", "on");
+        if (ret)
+            goto out;
+    }
+
+    /* Check for compress volume option, and add it to the graph on client side
+     */
+    ret = dict_get_str_boolean(set_dict, "network.compression", 0);
+    if (ret == -1)
+        goto out;
+    if (ret) {
+        xl = volgen_graph_add(graph, "features/cdc", volname);
+        if (!xl) {
+            ret = -1;
+            goto out;
+        }
+        ret = xlator_set_fixed_option(xl, "mode", "client");
+        if (ret)
+            goto out;
+    }
+
+    /* gfproxy needs the quiesce translator */
+    if (gfproxy_clnt) {
+        xl = volgen_graph_add(graph, "features/quiesce", volname);
+        if (!xl) {
+            ret = -1;
+            goto out;
+        }
+    }
+
+    if (conf->op_version == GD_OP_VERSION_MIN) {
+        ret = glusterd_volinfo_get_boolean(volinfo, VKEY_FEATURES_QUOTA);
+        if (ret == -1)
+            goto out;
+        if (ret) {
+            xl = volgen_graph_add(graph, "features/quota", volname);
+            if (!xl) {
+                ret = -1;
+                goto out;
+            }
+        }
+    }
+
+    /* Do not allow changing read-after-open option if root-squash is
+       enabled.
+    */
+    ret = dict_get_str_sizen(set_dict, "performance.read-after-open", &tmp);
+    if (!ret) {
+        ret = dict_get_str_sizen(volinfo->dict, "server.root-squash", &tmp);
+        if (!ret) {
+            ob = _gf_false;
+            ret = gf_string2boolean(tmp, &ob);
+            if (!ret && ob) {
+                gf_msg(this->name, GF_LOG_WARNING, 0,
+                       GD_MSG_ROOT_SQUASH_ENABLED,
+                       "root-squash is enabled. Please turn it"
+                       " off to change read-after-open "
+                       "option");
+                ret = -1;
+                goto out;
+            }
+        }
+    }
+
+    /* open behind causes problems when root-squash is enabled
+       (by allowing reads to happen even though the squashed user
+       does not have permissions to do so) as it fakes open to be
+       successful and later sends reads on anonymous fds. So when
+       root-squash is enabled, open-behind's option to read after
+       open is done is also enabled.
+    */
+    ret = dict_get_str_sizen(set_dict, "server.root-squash", &tmp);
+    if (!ret) {
+        ret = gf_string2boolean(tmp, &var);
+        if (ret)
+            goto out;
+
+        if (var) {
+            ret = dict_get_str_sizen(volinfo->dict,
+                                     "performance.read-after-open", &tmp);
+            if (!ret) {
+                ret = gf_string2boolean(tmp, &ob);
+                /* go ahead with turning read-after-open on
+                   even if string2boolean conversion fails,
+                   OR if read-after-open option is turned off
+                */
+                if (ret || !ob)
+                    ret = dict_set_sizen_str_sizen(
+                        set_dict, "performance.read-after-open", "yes");
+            } else {
+                ret = dict_set_sizen_str_sizen(
+                    set_dict, "performance.read-after-open", "yes");
+            }
+        } else {
+            /* When root-squash has to be turned off, open-behind's
+               read-after-open option should be reset to what was
+               there before root-squash was turned on. If the option
+               cannot be found in volinfo's dict, it means that
+               option was not set before turning on root-squash.
+            */
+            ob = _gf_false;
+            ret = dict_get_str_sizen(volinfo->dict,
+                                     "performance.read-after-open", &tmp);
+            if (!ret) {
+                ret = gf_string2boolean(tmp, &ob);
+
+                if (!ret && ob) {
+                    ret = dict_set_sizen_str_sizen(
+                        set_dict, "performance.read-after-open", "yes");
+                }
+            }
+            /* consider operation is failure only if read-after-open
+               option is enabled and could not set into set_dict
+            */
+            if (!ob)
+                ret = 0;
+        }
+        if (ret) {
+            gf_msg(this->name, GF_LOG_WARNING, 0, GD_MSG_ROOT_SQUASH_FAILED,
+                   "setting "
+                   "open behind option as part of root "
+                   "squash failed");
+            goto out;
+        }
+    }
+
+    /* A failure to set client.send-gids is only logged; ret is overwritten
+     * by the next step.
+     */
+    ret = dict_get_str_boolean(set_dict, "server.manage-gids", _gf_false);
+    if (ret != -1) {
+        ret = dict_set_str_sizen(set_dict, "client.send-gids",
+                                 ret ? "false" : "true");
+        if (ret)
+            gf_msg(THIS->name, GF_LOG_WARNING, errno, GD_MSG_DICT_SET_FAILED,
+                   "changing client"
+                   " protocol option failed");
+    }
+
+    ret = client_graph_set_perf_options(graph, volinfo, set_dict);
+    if (ret)
+        goto out;
+
+    uss_enabled = dict_get_str_boolean(set_dict, "features.uss", _gf_false);
+    if (uss_enabled == -1) {
+        /* ret is 0 at this point; without an explicit error the function
+         * would report success for a graph that has no io-stats top.
+         */
+        ret = -1;
+        goto out;
+    }
+    if (uss_enabled && !volinfo->is_snap_volume) {
+        ret = volgen_graph_build_snapview_client(graph, volinfo, volname,
+                                                 set_dict);
+        if (ret == -1)
+            goto out;
+    }
+
+    /* add debug translators depending on the options */
+    ret = check_and_add_debug_xl(graph, set_dict, volname, "client");
+    if (ret) {
+        ret = -1;
+        goto out;
+    }
+
+    /* if the client is part of 'gfproxyd' server, then we need to keep the
+       volume name as 'gfproxyd-<volname>', for better portmapper options */
+    subvol = volname;
+    ret = dict_get_str_boolean(set_dict, "gfproxy-server", 0);
+    if (ret > 0) {
+        namelen = strlen(volinfo->volname) + SLEN("gfproxyd-") + 1;
+        subvol = alloca(namelen);
+        snprintf(subvol, namelen, "gfproxyd-%s", volname);
+    }
+
+    ret = -1;
+    xl = volgen_graph_add_as(graph, "debug/io-stats", subvol);
+    if (!xl) {
+        goto out;
+    }
+
+    ret = graph_set_generic_options(this, graph, set_dict, "client");
+out:
+    return ret;
+}
+
+/*
+ * Builds a graph for the client role, with option overrides in mod_dict.
+ *
+ * Delegates to build_graph_generic() with client_graph_builder as the
+ * builder callback and NULL as the builder parameter, and returns its
+ * result unchanged.
+ */
+static int
+build_client_graph(volgen_graph_t *graph, glusterd_volinfo_t *volinfo,
+                   dict_t *mod_dict)
+{
+    return build_graph_generic(graph, volinfo, mod_dict, NULL,
+                               &client_graph_builder);
+}
+
+/*
+ * NULL-terminated list of '!'-prefixed option names that
+ * shd_option_handler() forwards (with the '!' stripped) instead of skipping.
+ */
+char *gd_shd_options[] = {"!self-heal-daemon", "!heal-timeout", NULL};
+
+/*
+ * Look up `option` in the NULL-terminated string array `options`.
+ *
+ * Returns the matching array element (not a copy), or NULL when no element
+ * compares equal.
+ */
+char *
+gd_get_matching_option(char **options, char *option)
+{
+    char **cursor = NULL;
+
+    for (cursor = options; *cursor != NULL; cursor++) {
+        if (strcmp(*cursor, option) == 0)
+            break;
+    }
+
+    /* Either the match, or the terminating NULL entry. */
+    return *cursor;
+}
+
+/*
+ * Option handler for the bitrot daemon graph: copies the "expiry-time" and
+ * "signer-threads" options onto the first xlator of the graph.
+ *
+ * Any other option is ignored. Returns 0 on success or when the option is
+ * not handled here, -1 when setting the option fails.
+ */
+static int
+bitrot_option_handler(volgen_graph_t *graph, struct volopt_map_entry *vme,
+                      void *param)
+{
+    xlator_t *top = first_of(graph);
+
+    if (strcmp(vme->option, "expiry-time") == 0) {
+        if (xlator_set_fixed_option(top, "expiry-time", vme->value) != 0)
+            return -1;
+    } else if (strcmp(vme->option, "signer-threads") == 0) {
+        if (xlator_set_fixed_option(top, "signer-threads", vme->value) != 0)
+            return -1;
+    }
+
+    return 0;
+}
+
+/*
+ * Option handler for the scrubber graph, applied to the first xlator of the
+ * graph:
+ *   "scrub-throttle"  -> "scrub-throttle"
+ *   "scrub-frequency" -> "scrub-freq"
+ *   "scrubber"        -> "scrub-state", only when the value is "pause"
+ *
+ * Any other option is ignored. Returns 0 on success or when nothing was
+ * set, -1 when setting the option fails.
+ */
+static int
+scrubber_option_handler(volgen_graph_t *graph, struct volopt_map_entry *vme,
+                        void *param)
+{
+    xlator_t *top = first_of(graph);
+
+    if (strcmp(vme->option, "scrub-throttle") == 0) {
+        if (xlator_set_fixed_option(top, "scrub-throttle", vme->value) != 0)
+            return -1;
+    } else if (strcmp(vme->option, "scrub-frequency") == 0) {
+        if (xlator_set_fixed_option(top, "scrub-freq", vme->value) != 0)
+            return -1;
+    } else if (strcmp(vme->option, "scrubber") == 0 &&
+               strcmp(vme->value, "pause") == 0) {
+        if (xlator_set_fixed_option(top, "scrub-state", vme->value) != 0)
+            return -1;
+    }
+
+    return 0;
+}
+
+/*
+ * Option handler for the self-heal daemon graph.
+ *
+ * Options whose name starts with '!' are skipped unless they are listed in
+ * gd_shd_options; listed ones are forwarded with the leading '!' removed.
+ * All other options are forwarded unchanged. Forwarding is done through
+ * no_filter_option_handler() on a copy of the map entry, and its result is
+ * returned; skipped options yield 0.
+ */
+static int
+shd_option_handler(volgen_graph_t *graph, struct volopt_map_entry *vme,
+                   void *param)
+{
+    struct volopt_map_entry forwarded = {0};
+    char *listed = gd_get_matching_option(gd_shd_options, vme->option);
+
+    if (!listed && vme->option[0] == '!')
+        return 0;
+
+    forwarded = *vme;
+    if (listed != NULL)
+        forwarded.option = listed + 1; /* drop the '!' prefix */
+
+    return no_filter_option_handler(graph, &forwarded, param);
+}
+
+#ifdef BUILD_GNFS
+/*
+ * Option handler for the NFS server graph (`param` is the volinfo).
+ *
+ * Per-volume options are stored under '!'-prefixed wildcard names; the
+ * matching entry of the table below gives the printf pattern that turns the
+ * volume name into the concrete key set on the first xlator of the graph.
+ * "!nfs3.*.export-dir" is handled separately because its value is
+ * canonicalized first. Plain (non-'!') options of voltype "nfs/server" are
+ * set under their own name.
+ *
+ * Returns 0 on success, when the option is not handled, and also when the
+ * arguments are invalid (which is only logged); -1 on allocation,
+ * canonicalization or option-set failure.
+ */
+static int
+nfs_option_handler(volgen_graph_t *graph, struct volopt_map_entry *vme,
+                   void *param)
+{
+    static struct nfs_opt nfs_opts[] = {
+        /* {pattern, printf_pattern} */
+        {"!rpc-auth.addr.*.allow", "rpc-auth.addr.%s.allow"},
+        {"!rpc-auth.addr.*.reject", "rpc-auth.addr.%s.reject"},
+        {"!rpc-auth.auth-unix.*", "rpc-auth.auth-unix.%s"},
+        {"!rpc-auth.auth-null.*", "rpc-auth.auth-null.%s"},
+        {"!nfs3.*.trusted-sync", "nfs3.%s.trusted-sync"},
+        {"!nfs3.*.trusted-write", "nfs3.%s.trusted-write"},
+        {"!nfs3.*.volume-access", "nfs3.%s.volume-access"},
+        {"!rpc-auth.ports.*.insecure", "rpc-auth.ports.%s.insecure"},
+        {"!nfs-disable", "nfs.%s.disable"},
+        {NULL, NULL}};
+    glusterd_volinfo_t *volinfo = param;
+    struct nfs_opt *entry = nfs_opts;
+    xlator_t *nfs_xl = NULL;
+    char *key = NULL;
+    int keylen;
+    int ret = 0;
+
+    if (!volinfo || (volinfo->volname[0] == '\0') || !vme ||
+        !(vme->option)) {
+        gf_smsg("glusterd", GF_LOG_ERROR, errno, GD_MSG_INVALID_ARGUMENT, NULL);
+        return 0;
+    }
+
+    nfs_xl = first_of(graph);
+
+    /* Table-driven per-volume options. */
+    while (entry->pattern) {
+        if (strcmp(vme->option, entry->pattern) == 0) {
+            keylen = gf_asprintf(&key, entry->printf_pattern,
+                                 volinfo->volname);
+            if (keylen == -1)
+                return -1;
+
+            ret = xlator_set_option(nfs_xl, key, keylen, vme->value);
+            GF_FREE(key);
+
+            return ret ? -1 : 0;
+        }
+        entry++;
+    }
+
+    if (strcmp(vme->option, "!nfs3.*.export-dir") == 0) {
+        keylen = gf_asprintf(&key, "nfs3.%s.export-dir", volinfo->volname);
+        if (keylen == -1)
+            return -1;
+
+        /* The value is canonicalized before it is stored. */
+        ret = gf_canonicalize_path(vme->value);
+        if (!ret)
+            ret = xlator_set_option(nfs_xl, key, keylen, vme->value);
+        GF_FREE(key);
+
+        if (ret)
+            return -1;
+    } else if ((vme->option[0] != '!') &&
+               (strcmp(vme->voltype, "nfs/server") == 0)) {
+        ret = xlator_set_option(nfs_xl, vme->option, strlen(vme->option),
+                                vme->value);
+        if (ret)
+            return -1;
+    }
+
+    return 0;
+}
+
+#endif
+/*
+ * Map a cluster type to the name of the volume option that enables its
+ * self-heal daemon. Returns NULL for types other than replicate and
+ * disperse.
+ */
+char *
+volgen_get_shd_key(int type)
+{
+    switch (type) {
+        case GF_CLUSTER_TYPE_REPLICATE:
+            return "cluster.self-heal-daemon";
+        case GF_CLUSTER_TYPE_DISPERSE:
+            return "cluster.disperse-self-heal-daemon";
+        default:
+            return NULL;
+    }
+}
+
+/*
+ * Set the self-heal-daemon option matching the cluster `type` to "enable"
+ * in set_dict. Types other than replicate and disperse are left alone.
+ *
+ * Returns the result of the dict set call, or 0 when nothing was set.
+ */
+static int
+volgen_set_shd_key_enable(dict_t *set_dict, const int type)
+{
+    if (type == GF_CLUSTER_TYPE_REPLICATE)
+        return dict_set_sizen_str_sizen(set_dict, "cluster.self-heal-daemon",
+                                        "enable");
+
+    if (type == GF_CLUSTER_TYPE_DISPERSE)
+        return dict_set_sizen_str_sizen(
+            set_dict, "cluster.disperse-self-heal-daemon", "enable");
+
+    return 0;
+}
+
+/*
+ * Tell whether an xlator type ("cluster/replicate" or "cluster/disperse")
+ * is one the self-heal daemon works with.
+ */
+static gf_boolean_t
+volgen_is_shd_compatible_xl(char *xl_type)
+{
+    char *shd_xls[] = {"cluster/replicate", "cluster/disperse", NULL};
+
+    return (gf_get_index_by_elem(shd_xls, xl_type) == -1) ? _gf_false
+                                                          : _gf_true;
+}
+
+/*
+ * Set "iam-self-heal-daemon" to "yes" on every shd-compatible xlator in the
+ * graph, walking from first_of(graph) through the ->next links.
+ *
+ * Stops at the first failure and returns its non-zero result; returns 0
+ * when every compatible xlator was updated (or none was found).
+ */
+static int
+volgen_graph_set_iam_shd(volgen_graph_t *graph)
+{
+    xlator_t *xl = first_of(graph);
+    int ret = 0;
+
+    while (xl) {
+        if (volgen_is_shd_compatible_xl(xl->type)) {
+            ret = xlator_set_fixed_option(xl, "iam-self-heal-daemon", "yes");
+            if (ret)
+                return ret;
+        }
+        xl = xl->next;
+    }
+
+    return 0;
+}
+
+/*
+ * Fill set_dict with the options used to build a self-heal daemon graph.
+ *
+ * The order matters because later writes can overwrite earlier keys:
+ *   1. the shd enable key for the volume type,
+ *   2. "trusted-client" = GF_CLIENT_TRUSTED,
+ *   3. everything from volinfo->dict,
+ *   4. everything from mod_dict, when given.
+ *
+ * Returns 0 on success, or the non-zero result of the failing dict set.
+ * The results of the dict_copy() calls are not checked.
+ */
+static int
+prepare_shd_volume_options(glusterd_volinfo_t *volinfo, dict_t *mod_dict,
+                           dict_t *set_dict)
+{
+    int ret = volgen_set_shd_key_enable(set_dict, volinfo->type);
+
+    if (ret)
+        return ret;
+
+    ret = dict_set_uint32(set_dict, "trusted-client", GF_CLIENT_TRUSTED);
+    if (ret) {
+        gf_smsg("glusterd", GF_LOG_ERROR, errno, GD_MSG_DICT_SET_FAILED,
+                "Key=trusted-client", NULL);
+        return ret;
+    }
+
+    dict_copy(volinfo->dict, set_dict);
+    if (mod_dict != NULL)
+        dict_copy(mod_dict, set_dict);
+
+    return 0;
+}
+
+/*
+ * Build the replicate or disperse clusters for the volume, depending on its
+ * type. Returns the builder's result, or -1 for any other volume type.
+ */
+static int
+build_afr_ec_clusters(volgen_graph_t *graph, glusterd_volinfo_t *volinfo)
+{
+    if (volinfo->type == GF_CLUSTER_TYPE_REPLICATE)
+        return volgen_graph_build_afr_clusters(graph, volinfo);
+
+    if (volinfo->type == GF_CLUSTER_TYPE_DISPERSE)
+        return volgen_graph_build_ec_clusters(graph, volinfo);
+
+    return -1;
+}
+
+/*
+ * Build the client xlators and, on top of them, the replicate/disperse
+ * clusters for a self-heal daemon graph.
+ *
+ * Returns the result of build_afr_ec_clusters(), or -1 when building the
+ * clients fails.
+ */
+static int
+build_shd_clusters(volgen_graph_t *graph, glusterd_volinfo_t *volinfo,
+                   dict_t *set_dict)
+{
+    if (volgen_graph_build_clients(graph, volinfo, set_dict, NULL) != 0)
+        return -1;
+
+    return build_afr_ec_clusters(graph, volinfo);
+}
+
+/*
+ * Report whether the self-heal daemon is enabled for the volume according
+ * to `dict`.
+ *
+ * For replicate and disperse volumes the type-specific shd key is read from
+ * `dict`, defaulting to true when the key is absent. For every other type,
+ * and when volinfo is NULL, the answer is false.
+ */
+gf_boolean_t
+gd_is_self_heal_enabled(glusterd_volinfo_t *volinfo, dict_t *dict)
+{
+    gf_boolean_t enabled = _gf_false;
+    char *key = NULL;
+
+    GF_VALIDATE_OR_GOTO("glusterd", volinfo, out);
+
+    if (volinfo->type == GF_CLUSTER_TYPE_REPLICATE ||
+        volinfo->type == GF_CLUSTER_TYPE_DISPERSE) {
+        key = volgen_get_shd_key(volinfo->type);
+        enabled = dict_get_str_boolean(dict, key, _gf_true);
+    }
+
+out:
+    return enabled;
+}
+
+/*
+ * Generate the rebalance daemon volfile for a volume and write it to
+ * `filepath`.
+ *
+ * No volfile is produced (and 0 is returned) when brick_count is not larger
+ * than dist_leaf_count. Otherwise the options of volinfo->dict, overridden
+ * by mod_dict when given, are used to build clients, cluster layers and a
+ * "debug/io-stats" top xlator, after which the basic options are applied
+ * and the graph is written out.
+ *
+ * Returns 0 on success, -1 or another non-zero value on failure.
+ */
+int
+build_rebalance_volfile(glusterd_volinfo_t *volinfo, char *filepath,
+                        dict_t *mod_dict)
+{
+    volgen_graph_t graph = {
+        0,
+    };
+    xlator_t *xl = NULL;
+    int ret = -1;
+    xlator_t *this = NULL;
+    dict_t *set_dict = NULL;
+
+    this = THIS;
+
+    graph.type = GF_REBALANCED;
+
+    if (volinfo->brick_count <= volinfo->dist_leaf_count) {
+        /*
+         * Volume is not a distribute volume or
+         * contains only 1 brick, no need to create
+         * the volfiles.
+         */
+        return 0;
+    }
+
+    set_dict = dict_copy_with_ref(volinfo->dict, NULL);
+    if (!set_dict)
+        return -1;
+
+    if (mod_dict) {
+        dict_copy(mod_dict, set_dict);
+        /* XXX dict_copy swallows errors */
+    }
+
+    /* Rebalance is always a trusted client*/
+    ret = dict_set_uint32(set_dict, "trusted-client", GF_CLIENT_TRUSTED);
+    if (ret) {
+        /* Nothing has been added to the graph yet, but set_dict already
+         * holds a reference that must be dropped before bailing out.
+         */
+        dict_unref(set_dict);
+        return -1;
+    }
+
+    ret = volgen_graph_build_clients(&graph, volinfo, set_dict, NULL);
+    if (ret)
+        goto out;
+
+    ret = volume_volgen_graph_build_clusters(&graph, volinfo, _gf_false);
+    if (ret)
+        goto out;
+
+    xl = volgen_graph_add_as(&graph, "debug/io-stats", volinfo->volname);
+    if (!xl) {
+        ret = -1;
+        goto out;
+    }
+
+    ret = graph_set_generic_options(this, &graph, set_dict, "rebalance-daemon");
+    if (ret)
+        goto out;
+
+    ret = volgen_graph_set_options_generic(&graph, set_dict, volinfo,
+                                           basic_option_handler);
+
+    if (!ret)
+        ret = volgen_write_volfile(&graph, filepath);
+
+out:
+    volgen_graph_free(&graph);
+
+    dict_unref(set_dict);
+
+    return ret;
+}
+
+/*
+ * Assemble the self-heal-daemon sub-graph for one volume and merge it into
+ * @graph. A volume that is not shd-compatible is skipped and reported as
+ * success.
+ *
+ * @set_dict is handed to prepare_shd_volume_options() and is then used as
+ * the option source for the rest of the build. @graph_check is accepted but
+ * not consulted in this function.
+ *
+ * Returns 0 on success or when there is nothing to do, non-zero otherwise.
+ */
+static int
+build_shd_volume_graph(xlator_t *this, volgen_graph_t *graph,
+                       glusterd_volinfo_t *volinfo, dict_t *mod_dict,
+                       dict_t *set_dict, gf_boolean_t graph_check)
+{
+    volgen_graph_t shd_graph = {0};
+    int nclusters = -1;
+    int ret = 0;
+
+    if (!glusterd_is_shd_compatible_volume(volinfo))
+        return 0;
+
+    ret = prepare_shd_volume_options(volinfo, mod_dict, set_dict);
+    if (ret)
+        return ret;
+
+    /* A negative cluster count signals a build failure. */
+    nclusters = build_shd_clusters(&shd_graph, volinfo, set_dict);
+    if (nclusters < 0)
+        return -1;
+
+    ret = volgen_graph_set_options_generic(&shd_graph, set_dict, volinfo,
+                                           shd_option_handler);
+    if (ret)
+        return ret;
+
+    ret = volgen_graph_set_iam_shd(&shd_graph);
+    if (ret)
+        return ret;
+
+    ret = volgen_graph_merge_sub(graph, &shd_graph, nclusters);
+    if (ret)
+        return ret;
+
+    return graph_set_generic_options(this, graph, set_dict, "self-heal daemon");
+}
+
+/*
+ * Build the self-heal-daemon graph for @volinfo into @graph.
+ *
+ * A "debug/io-stats" xlator named after the volume is added first, then the
+ * per-volume shd graph is built beneath it using a scratch option dict that
+ * lives only for the duration of this call.
+ *
+ * Returns 0 on success, -ENOMEM if the scratch dict cannot be created, and
+ * another non-zero value on any other failure.
+ */
+int
+build_shd_graph(glusterd_volinfo_t *volinfo, volgen_graph_t *graph,
+                dict_t *mod_dict)
+{
+    xlator_t *this = THIS;
+    gf_boolean_t graph_check = _gf_false;
+    dict_t *shd_opts = NULL;
+    int ret = 0;
+
+    shd_opts = dict_new();
+    if (!shd_opts) {
+        gf_smsg(this->name, GF_LOG_ERROR, errno, GD_MSG_DICT_CREATE_FAIL, NULL);
+        return -ENOMEM;
+    }
+
+    if (mod_dict)
+        graph_check = dict_get_str_boolean(mod_dict, "graph-check", 0);
+
+    if (volgen_graph_add_as(graph, "debug/io-stats", volinfo->volname))
+        ret = build_shd_volume_graph(this, graph, volinfo, mod_dict, shd_opts,
+                                     graph_check);
+    else
+        ret = -1;
+
+    dict_unref(shd_opts);
+    return ret;
+}
+
+#ifdef BUILD_GNFS
+
+/*
+ * Set the fixed option "iam-nfs-daemon" to "yes" on every
+ * "cluster/replicate" xlator in @graph.
+ *
+ * Returns 0 if every matching xlator accepted the option (or none matched),
+ * otherwise the first non-zero result from xlator_set_fixed_option().
+ */
+static int
+volgen_graph_set_iam_nfsd(const volgen_graph_t *graph)
+{
+    xlator_t *xl = NULL;
+    int ret = 0;
+
+    for (xl = first_of((volgen_graph_t *)graph); xl; xl = xl->next) {
+        if (strcmp(xl->type, "cluster/replicate") == 0) {
+            ret = xlator_set_fixed_option(xl, "iam-nfs-daemon", "yes");
+            if (ret)
+                return ret;
+        }
+    }
+
+    return ret;
+}
+
+/*
+ * Build the graph for the gluster NFS server role.
+ *
+ * An "nfs/server" xlator is added to @graph. Then, for every started volume
+ * that does not have NFS disabled, a client sub-graph is built and merged
+ * into @graph. A second pass applies the NFS-specific options of every
+ * volume.
+ *
+ * @mod_dict: optional option overrides; when NULL, each volume's own dict
+ *            is used as the option source instead.
+ *
+ * Returns 0 on success and non-zero on failure.
+ */
+int
+build_nfs_graph(volgen_graph_t *graph, dict_t *mod_dict)
+{
+    volgen_graph_t cgraph = {
+        0,
+    };
+    glusterd_volinfo_t *voliter = NULL;
+    xlator_t *this = NULL;
+    glusterd_conf_t *priv = NULL;
+    dict_t *set_dict = NULL;
+    xlator_t *nfsxl = NULL;
+    char *skey = NULL;
+    int ret = 0;
+    char nfs_xprt[16] = {
+        0,
+    };
+    char *volname = NULL;
+    data_t *data = NULL;
+
+    this = THIS;
+    GF_ASSERT(this);
+    priv = this->private;
+    GF_ASSERT(priv);
+
+    /* Scratch dict; it is reset at the end of every per-volume iteration. */
+    set_dict = dict_new();
+    if (!set_dict) {
+        gf_msg("glusterd", GF_LOG_ERROR, ENOMEM, GD_MSG_NO_MEMORY,
+               "Out of memory");
+        return -1;
+    }
+
+    nfsxl = volgen_graph_add_as(graph, "nfs/server", "nfs-server");
+    if (!nfsxl) {
+        ret = -1;
+        goto out;
+    }
+    ret = xlator_set_fixed_option(nfsxl, "nfs.dynamic-volumes", "on");
+    if (ret)
+        goto out;
+
+    ret = xlator_set_fixed_option(nfsxl, "nfs.nlm", "on");
+    if (ret)
+        goto out;
+
+    ret = xlator_set_fixed_option(nfsxl, "nfs.drc", "off");
+    if (ret)
+        goto out;
+
+    /* First pass: one client sub-graph per exported volume. */
+    cds_list_for_each_entry(voliter, &priv->volumes, vol_list)
+    {
+        if (voliter->status != GLUSTERD_STATUS_STARTED)
+            continue;
+
+        if (dict_get_str_boolean(voliter->dict, NFS_DISABLE_MAP_KEY, 0))
+            continue;
+
+        /* ret carries the key length from gf_asprintf() into the setter. */
+        ret = gf_asprintf(&skey, "rpc-auth.addr.%s.allow", voliter->volname);
+        if (ret == -1) {
+            gf_msg("glusterd", GF_LOG_ERROR, ENOMEM, GD_MSG_NO_MEMORY,
+                   "Out of memory");
+            goto out;
+        }
+        ret = xlator_set_option(nfsxl, skey, ret, "*");
+        GF_FREE(skey);
+        if (ret)
+            goto out;
+
+        ret = gf_asprintf(&skey, "nfs3.%s.volume-id", voliter->volname);
+        if (ret == -1) {
+            /* NOTE(review): errno argument is 0 here but ENOMEM above. */
+            gf_msg("glusterd", GF_LOG_ERROR, 0, GD_MSG_NO_MEMORY,
+                   "Out of memory");
+            goto out;
+        }
+        ret = xlator_set_option(nfsxl, skey, ret,
+                                uuid_utoa(voliter->volume_id));
+        GF_FREE(skey);
+        if (ret)
+            goto out;
+
+        /* If both RDMA and TCP are the transport_type, use TCP for the
+         * NFS client protocols, because a tcp,rdma volume can be created
+         * on servers which do not have RDMA-capable hardware.
+         * The transport type specified here is the client transport type
+         * used for communication between gluster-nfs and the brick
+         * processes.
+         * The user can specify the client transport for a tcp,rdma volume
+         * using nfs.transport-type; if it is not set by the user, the
+         * default is tcp.
+         */
+        memset(&cgraph, 0, sizeof(cgraph));
+        if (mod_dict)
+            get_transport_type(voliter, mod_dict, nfs_xprt, _gf_true);
+        else
+            get_transport_type(voliter, voliter->dict, nfs_xprt, _gf_true);
+
+        ret = dict_set_sizen_str_sizen(set_dict, "performance.stat-prefetch",
+                                       "off");
+        if (ret) {
+            gf_smsg("glusterd", GF_LOG_ERROR, errno, GD_MSG_DICT_SET_FAILED,
+                    "Key=performance.stat-prefetch", NULL);
+            goto out;
+        }
+
+        ret = dict_set_sizen_str_sizen(set_dict,
+                                       "performance.client-io-threads", "off");
+        if (ret) {
+            gf_smsg("glusterd", GF_LOG_ERROR, errno, GD_MSG_DICT_SET_FAILED,
+                    "Key=performance.client-io-threads", NULL);
+            goto out;
+        }
+
+        ret = dict_set_str_sizen(set_dict, "client-transport-type", nfs_xprt);
+        if (ret) {
+            gf_smsg("glusterd", GF_LOG_ERROR, errno, GD_MSG_DICT_SET_FAILED,
+                    "Key=client-transport-type", NULL);
+            goto out;
+        }
+
+        ret = dict_set_uint32(set_dict, "trusted-client", GF_CLIENT_TRUSTED);
+        if (ret) {
+            gf_smsg("glusterd", GF_LOG_ERROR, errno, GD_MSG_DICT_SET_FAILED,
+                    "Key=trusted-client", NULL);
+            goto out;
+        }
+
+        ret = dict_set_sizen_str_sizen(set_dict, "nfs-volume-file", "yes");
+        if (ret) {
+            gf_smsg("glusterd", GF_LOG_ERROR, errno, GD_MSG_DICT_SET_FAILED,
+                    "Key=nfs-volume-file", NULL);
+            goto out;
+        }
+
+        /*
+         * Before building the client graph, apply the overrides only to
+         * the volume named by "volume-name" in mod_dict.
+         */
+        if (mod_dict && (data = dict_get_sizen(mod_dict, "volume-name"))) {
+            volname = data->data;
+            if (strcmp(volname, voliter->volname) == 0)
+                dict_copy(mod_dict, set_dict);
+        }
+
+        ret = build_client_graph(&cgraph, voliter, set_dict);
+        if (ret)
+            goto out;
+
+        if (mod_dict) {
+            dict_copy(mod_dict, set_dict);
+            ret = volgen_graph_set_options_generic(&cgraph, set_dict, voliter,
+                                                   basic_option_handler);
+        } else {
+            ret = volgen_graph_set_options_generic(
+                &cgraph, voliter->dict, voliter, basic_option_handler);
+        }
+
+        if (ret)
+            goto out;
+
+        ret = volgen_graph_set_iam_nfsd(&cgraph);
+        if (ret)
+            goto out;
+
+        ret = volgen_graph_merge_sub(graph, &cgraph, 1);
+        if (ret)
+            goto out;
+        /* Start the next volume from an empty scratch dict. */
+        ret = dict_reset(set_dict);
+        if (ret)
+            goto out;
+    }
+
+    /*
+     * Second pass: apply the NFS option handler for every volume. A failure
+     * here is only logged as a warning and does not stop the loop; ret keeps
+     * the result of the last volume processed.
+     */
+    cds_list_for_each_entry(voliter, &priv->volumes, vol_list)
+    {
+        if (mod_dict) {
+            ret = volgen_graph_set_options_generic(graph, mod_dict, voliter,
+                                                   nfs_option_handler);
+        } else {
+            ret = volgen_graph_set_options_generic(graph, voliter->dict,
+                                                   voliter, nfs_option_handler);
+        }
+
+        if (ret)
+            gf_msg("glusterd", GF_LOG_WARNING, 0, GD_MSG_GRAPH_SET_OPT_FAIL,
+                   "Could not set "
+                   "vol-options for the volume %s",
+                   voliter->volname);
+    }
+
+out:
+    gf_msg_debug("glusterd", 0, "Returning %d", ret);
+    dict_unref(set_dict);
+
+    return ret;
+}
+#endif
+/****************************
+ *
+ * Volume generation interface
+ *
+ ****************************/
+
+/*
+ * Compose the path of a brick volfile into @filename, which must provide
+ * room for PATH_MAX bytes.
+ *
+ * The name has the form "<voldir>/<volname>[.<prefix>].<host>.<brick>.vol";
+ * the ".<prefix>" component is present only when @prefix is non-NULL.
+ * If the result does not fit, @filename is set to the empty string.
+ */
+static void
+get_brick_filepath(char *filename, glusterd_volinfo_t *volinfo,
+                   glusterd_brickinfo_t *brickinfo, char *prefix)
+{
+    char path[PATH_MAX] = {0};
+    char brick[PATH_MAX] = {0};
+    glusterd_conf_t *priv = THIS->private;
+    int32_t written = 0;
+
+    GLUSTERD_REMOVE_SLASH_FROM_PATH(brickinfo->path, brick);
+    GLUSTERD_GET_VOLUME_DIR(path, volinfo, priv);
+
+    if (!prefix)
+        written = snprintf(filename, PATH_MAX, "%s/%s.%s.%s.vol", path,
+                           volinfo->volname, brickinfo->hostname, brick);
+    else
+        written = snprintf(filename, PATH_MAX, "%s/%s.%s.%s.%s.vol", path,
+                           volinfo->volname, prefix, brickinfo->hostname,
+                           brick);
+
+    if ((written >= 0) && (written < PATH_MAX))
+        return;
+
+    /* Output error or truncation: hand back an empty path. */
+    filename[0] = 0;
+}
+
+/*
+ * Check whether the brick volfile path derived from @volname and @brick
+ * stays within the path-length limits.
+ *
+ * A temporary brickinfo and volinfo are created only to compute the path
+ * and are released before returning.
+ *
+ * Returns non-zero when the full path is shorter than PATH_MAX and its last
+ * component (from the final '/') is shorter than _POSIX_PATH_MAX; returns 0
+ * otherwise, including when the temporary objects cannot be created.
+ */
+gf_boolean_t
+glusterd_is_valid_volfpath(char *volname, char *brick)
+{
+    char volfpath[PATH_MAX] = {
+        0,
+    };
+    char *last_slash = NULL;
+    glusterd_brickinfo_t *brickinfo = NULL;
+    glusterd_volinfo_t *volinfo = NULL;
+    int32_t ret = 0;
+    xlator_t *this = NULL;
+
+    this = THIS;
+    GF_ASSERT(this);
+
+    ret = glusterd_brickinfo_new_from_brick(brick, &brickinfo, _gf_false, NULL);
+    if (ret) {
+        gf_msg(this->name, GF_LOG_WARNING, 0, GD_MSG_BRICKINFO_CREATE_FAIL,
+               "Failed to create brickinfo"
+               " for brick %s",
+               brick);
+        ret = 0;
+        goto out;
+    }
+    ret = glusterd_volinfo_new(&volinfo);
+    if (ret) {
+        gf_msg(this->name, GF_LOG_WARNING, 0, GD_MSG_VOLINFO_STORE_FAIL,
+               "Failed to create volinfo");
+        ret = 0;
+        goto out;
+    }
+    (void)snprintf(volinfo->volname, sizeof(volinfo->volname), "%s", volname);
+    get_brick_filepath(volfpath, volinfo, brickinfo, NULL);
+
+    /*
+     * get_brick_filepath() leaves volfpath empty when the composed path
+     * does not fit in PATH_MAX. An empty string has no '/', so strrchr()
+     * returns NULL; treat that as an invalid path instead of passing NULL
+     * to strlen().
+     */
+    last_slash = strrchr(volfpath, '/');
+    ret = (last_slash && (strlen(volfpath) < PATH_MAX) &&
+           (strlen(last_slash) < _POSIX_PATH_MAX));
+
+out:
+    if (brickinfo)
+        glusterd_brickinfo_delete(brickinfo);
+    if (volinfo)
+        glusterd_volinfo_unref(volinfo);
+    return ret;
+}
+
+/*
+ * Build the gfproxyd server graph for @volinfo and write it to @filename.
+ *
+ * Returns 0 on success, otherwise the non-zero result of the failing step.
+ */
+int
+glusterd_build_gfproxyd_volfile(glusterd_volinfo_t *volinfo, char *filename)
+{
+    volgen_graph_t gfproxyd_graph = {0};
+    int ret = -1;
+
+    ret = build_graph_generic(&gfproxyd_graph, volinfo, NULL, NULL,
+                              &gfproxy_server_graph_builder);
+    if (!ret)
+        ret = volgen_write_volfile(&gfproxyd_graph, filename);
+
+    /* The graph is released whether or not it was written out. */
+    volgen_graph_free(&gfproxyd_graph);
+    return ret;
+}
+
+/*
+ * Work out the gfproxyd volfile path for @volinfo and generate the volfile
+ * at that location.
+ *
+ * Returns the result of glusterd_build_gfproxyd_volfile().
+ */
+int
+glusterd_generate_gfproxyd_volfile(glusterd_volinfo_t *volinfo)
+{
+    char volfile_path[PATH_MAX] = {0};
+
+    GF_ASSERT(volinfo);
+
+    glusterd_svc_build_gfproxyd_volfile_path(volinfo, volfile_path,
+                                             PATH_MAX - 1);
+
+    return glusterd_build_gfproxyd_volfile(volinfo, volfile_path);
+}
+
+/*
+ * Generate the server volfile for a single brick of @volinfo.
+ *
+ * Used as the per-brick callback passed to
+ * glusterd_volume_brick_for_each(); @data is not used.
+ *
+ * Returns 0 on success, non-zero if building or writing the graph fails.
+ */
+static int
+glusterd_generate_brick_volfile(glusterd_volinfo_t *volinfo,
+                                glusterd_brickinfo_t *brickinfo,
+                                dict_t *mod_dict, void *data)
+{
+    char volfile[PATH_MAX] = {0};
+    volgen_graph_t brick_graph = {0};
+    int ret = -1;
+
+    GF_ASSERT(volinfo);
+    GF_ASSERT(brickinfo);
+
+    get_brick_filepath(volfile, volinfo, brickinfo, NULL);
+
+    ret = build_server_graph(&brick_graph, volinfo, mod_dict, brickinfo);
+    if (ret == 0)
+        ret = volgen_write_volfile(&brick_graph, volfile);
+
+    volgen_graph_free(&brick_graph);
+    return ret;
+}
+
+/*
+ * Build the quota daemon graph.
+ *
+ * A "features/quotad" xlator is added to @graph. Then, for every started
+ * volume that has quota enabled, a client and cluster sub-graph is built
+ * and merged into @graph.
+ *
+ * @mod_dict: optional option overrides copied on top of each volume's dict.
+ *
+ * Returns 0 on success, -ENOMEM if the scratch dict cannot be created, and
+ * another non-zero value on any other failure.
+ */
+int
+build_quotad_graph(volgen_graph_t *graph, dict_t *mod_dict)
+{
+    volgen_graph_t cgraph = {0};
+    glusterd_volinfo_t *voliter = NULL;
+    xlator_t *this = NULL;
+    glusterd_conf_t *priv = NULL;
+    dict_t *set_dict = NULL;
+    int ret = 0;
+    xlator_t *quotad_xl = NULL;
+    char *skey = NULL;
+
+    this = THIS;
+    GF_ASSERT(this);
+
+    priv = this->private;
+    GF_ASSERT(priv);
+
+    graph->type = GF_QUOTAD;
+
+    /* Scratch dict; it is reset at the end of every per-volume iteration. */
+    set_dict = dict_new();
+    if (!set_dict) {
+        ret = -ENOMEM;
+        goto out;
+    }
+
+    quotad_xl = volgen_graph_add_as(graph, "features/quotad", "quotad");
+    if (!quotad_xl) {
+        ret = -1;
+        goto out;
+    }
+
+    cds_list_for_each_entry(voliter, &priv->volumes, vol_list)
+    {
+        if (voliter->status != GLUSTERD_STATUS_STARTED)
+            continue;
+
+        if (1 != glusterd_is_volume_quota_enabled(voliter))
+            continue;
+
+        ret = dict_set_uint32(set_dict, "trusted-client", GF_CLIENT_TRUSTED);
+        if (ret) {
+            gf_smsg(this->name, GF_LOG_ERROR, errno, GD_MSG_DICT_SET_FAILED,
+                    "Key=trusted-client", NULL);
+            goto out;
+        }
+
+        /* Volume options first, then the caller's overrides on top. */
+        dict_copy(voliter->dict, set_dict);
+        if (mod_dict)
+            dict_copy(mod_dict, set_dict);
+
+        /* ret carries the key length from gf_asprintf() into the setter. */
+        ret = gf_asprintf(&skey, "%s.volume-id", voliter->volname);
+        if (ret == -1) {
+            gf_msg("glusterd", GF_LOG_ERROR, ENOMEM, GD_MSG_NO_MEMORY,
+                   "Out of memory");
+            goto out;
+        }
+        ret = xlator_set_option(quotad_xl, skey, ret, voliter->volname);
+        GF_FREE(skey);
+        if (ret)
+            goto out;
+
+        /* Fresh sub-graph for this volume. */
+        memset(&cgraph, 0, sizeof(cgraph));
+        ret = volgen_graph_build_clients(&cgraph, voliter, set_dict, NULL);
+        if (ret)
+            goto out;
+
+        ret = volume_volgen_graph_build_clusters(&cgraph, voliter, _gf_true);
+        if (ret) {
+            ret = -1;
+            goto out;
+        }
+
+        if (mod_dict) {
+            dict_copy(mod_dict, set_dict);
+            ret = volgen_graph_set_options_generic(&cgraph, set_dict, voliter,
+                                                   basic_option_handler);
+        } else {
+            ret = volgen_graph_set_options_generic(
+                &cgraph, voliter->dict, voliter, basic_option_handler);
+        }
+        if (ret)
+            goto out;
+
+        ret = volgen_graph_merge_sub(graph, &cgraph, 1);
+        if (ret)
+            goto out;
+
+        /* Start the next volume from an empty scratch dict. */
+        ret = dict_reset(set_dict);
+        if (ret)
+            goto out;
+    }
+
+out:
+    if (set_dict)
+        dict_unref(set_dict);
+    return ret;
+}
+
+/*
+ * Write the path of the volume's "marker.tstamp" file into @filename: the
+ * volume directory followed by "/marker.tstamp". The suffix is appended
+ * with a bound computed against PATH_MAX.
+ */
+static void
+get_vol_tstamp_file(char *filename, glusterd_volinfo_t *volinfo)
+{
+    glusterd_conf_t *priv = THIS->private;
+    size_t room = 0;
+
+    GLUSTERD_GET_VOLUME_DIR(filename, volinfo, priv);
+
+    /* Bytes still available, keeping one for the terminating NUL. */
+    room = PATH_MAX - strlen(filename) - 1;
+    strncat(filename, "/marker.tstamp", room);
+}
+
+/*
+ * Write the path of the parent volume's "marker.tstamp" file into
+ * @filename, i.e. "<workdir>/vols/<parent_volname>/marker.tstamp".
+ * If the path does not fit in PATH_MAX, @filename is set to the empty
+ * string.
+ */
+static void
+get_parent_vol_tstamp_file(char *filename, glusterd_volinfo_t *volinfo)
+{
+    xlator_t *this = THIS;
+    glusterd_conf_t *priv = NULL;
+    int32_t written = 0;
+
+    GF_ASSERT(this);
+    priv = this->private;
+    GF_ASSERT(priv);
+
+    written = snprintf(filename, PATH_MAX, "%s/vols/%s/marker.tstamp",
+                       priv->workdir, volinfo->parent_volname);
+    if ((written >= 0) && (written < PATH_MAX))
+        return;
+
+    /* Output error or truncation: hand back an empty path. */
+    filename[0] = 0;
+}
+
+/*
+ * Generate the server volfiles for every brick of @volinfo and keep the
+ * volume's "marker.tstamp" file in step with the VKEY_MARKER_XTIME setting:
+ * the file is created when the setting is on and removed when it is off.
+ *
+ * Returns 0 on success, -1 if the setting cannot be read or the timestamp
+ * file cannot be created or removed, and otherwise the non-zero result of
+ * the failing step.
+ */
+int
+generate_brick_volfiles(glusterd_volinfo_t *volinfo)
+{
+    char tstamp_file[PATH_MAX] = {
+        0,
+    };
+    char parent_tstamp_file[PATH_MAX] = {
+        0,
+    };
+    int ret = -1;
+    xlator_t *this = NULL;
+
+    this = THIS;
+    GF_ASSERT(this);
+
+    /* -1 is an error; otherwise ret is used below as the on/off flag. */
+    ret = glusterd_volinfo_get_boolean(volinfo, VKEY_MARKER_XTIME);
+    if (ret == -1)
+        return -1;
+
+    assign_brick_groups(volinfo);
+    get_vol_tstamp_file(tstamp_file, volinfo);
+
+    if (ret) {
+        /* O_EXCL: an already existing timestamp file is left untouched. */
+        ret = open(tstamp_file, O_WRONLY | O_CREAT | O_EXCL, 0600);
+        if (ret == -1 && errno == EEXIST) {
+            gf_msg_debug(this->name, 0, "timestamp file exist");
+            /* -2 marks "already exists" so both checks below are skipped. */
+            ret = -2;
+        }
+        if (ret == -1) {
+            gf_msg(this->name, GF_LOG_ERROR, errno, GD_MSG_FILE_OP_FAILED,
+                   "failed to create "
+                   "%s",
+                   tstamp_file);
+            return -1;
+        }
+        if (ret >= 0) {
+            /* ret is the descriptor of the newly created file. */
+            sys_close(ret);
+            /* If snap_volume, retain timestamp for marker.tstamp
+             * from parent. Geo-replication depends on mtime of
+             * 'marker.tstamp' to decide the volume-mark, i.e.,
+             * geo-rep start time just after session is created.
+             */
+            if (volinfo->is_snap_volume) {
+                get_parent_vol_tstamp_file(parent_tstamp_file, volinfo);
+                ret = gf_set_timestamp(parent_tstamp_file, tstamp_file);
+                if (ret) {
+                    gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_TSTAMP_SET_FAIL,
+                           "Unable to set atime and mtime"
+                           " of %s as of %s",
+                           tstamp_file, parent_tstamp_file);
+                    goto out;
+                }
+            }
+        }
+    } else {
+        /* Setting is off: remove the file; a missing file is not an error. */
+        ret = sys_unlink(tstamp_file);
+        if (ret == -1 && errno == ENOENT)
+            ret = 0;
+        if (ret == -1) {
+            gf_msg(this->name, GF_LOG_ERROR, errno, GD_MSG_FILE_OP_FAILED,
+                   "failed to unlink "
+                   "%s",
+                   tstamp_file);
+            return -1;
+        }
+    }
+
+    ret = glusterd_volume_brick_for_each(volinfo, NULL,
+                                         glusterd_generate_brick_volfile);
+    if (ret)
+        goto out;
+
+    ret = 0;
+
+out:
+    gf_msg_debug(this->name, 0, "Returning %d", ret);
+    return ret;
+}
+
+/*
+ * Build the client graph for @volinfo using the options in @dict and write
+ * it to @filepath.
+ *
+ * Returns 0 on success, otherwise the non-zero result of the failing step.
+ */
+static int
+generate_single_transport_client_volfile(glusterd_volinfo_t *volinfo,
+                                         char *filepath, dict_t *dict)
+{
+    volgen_graph_t client_graph = {0};
+    int ret = -1;
+
+    ret = build_client_graph(&client_graph, volinfo, dict);
+    if (ret == 0)
+        ret = volgen_write_volfile(&client_graph, filepath);
+
+    /* The graph is released whether or not it was written out. */
+    volgen_graph_free(&client_graph);
+    return ret;
+}
+
+/*
+ * Write one client volfile per brick of @volinfo. Each volfile holds a
+ * graph with a single client xlator for that brick, built with the "tcp"
+ * transport.
+ *
+ * The option dict used for every client marks it as trusted and carries
+ * "client.ssl" = "on" when the volume has that option set to a true value.
+ *
+ * Returns 0 on success and non-zero on failure.
+ */
+int
+glusterd_generate_client_per_brick_volfile(glusterd_volinfo_t *volinfo)
+{
+    char filepath[PATH_MAX] = {
+        0,
+    };
+    glusterd_brickinfo_t *brick = NULL;
+    volgen_graph_t graph = {
+        0,
+    };
+    dict_t *dict = NULL;
+    xlator_t *xl = NULL;
+    int ret = -1;
+    char *ssl_str = NULL;
+    gf_boolean_t ssl_bool = _gf_false;
+    xlator_t *this = THIS;
+    GF_ASSERT(this);
+
+    dict = dict_new();
+    if (!dict) {
+        gf_smsg(this->name, GF_LOG_ERROR, errno, GD_MSG_DICT_CREATE_FAIL, NULL);
+        goto out;
+    }
+
+    ret = dict_set_uint32(dict, "trusted-client", GF_CLIENT_TRUSTED);
+    if (ret) {
+        gf_smsg(this->name, GF_LOG_ERROR, errno, GD_MSG_DICT_SET_FAILED,
+                "Key=trusted-client", NULL);
+        goto free_dict;
+    }
+
+    /* An unparsable "client.ssl" value is treated as an error. */
+    if (dict_get_str_sizen(volinfo->dict, "client.ssl", &ssl_str) == 0) {
+        if (gf_string2boolean(ssl_str, &ssl_bool) == 0) {
+            if (ssl_bool) {
+                if (dict_set_dynstr_with_alloc(dict, "client.ssl", "on") != 0) {
+                    ret = -1;
+                    goto free_dict;
+                }
+            }
+        } else {
+            ret = -1;
+            goto free_dict;
+        }
+    }
+
+    cds_list_for_each_entry(brick, &volinfo->bricks, brick_list)
+    {
+        xl = volgen_graph_build_client(&graph, volinfo, brick->hostname, NULL,
+                                       brick->path, brick->brick_id, "tcp",
+                                       dict);
+        if (!xl) {
+            ret = -1;
+            goto out;
+        }
+
+        get_brick_filepath(filepath, volinfo, brick, "client");
+        ret = volgen_write_volfile(&graph, filepath);
+        if (ret < 0)
+            goto out;
+
+        /* Release and re-zero the graph so the next brick starts clean. */
+        volgen_graph_free(&graph);
+        memset(&graph, 0, sizeof(graph));
+    }
+
+    ret = 0;
+out:
+    /* On success the graph was already released inside the loop. */
+    if (ret)
+        volgen_graph_free(&graph);
+
+free_dict:
+
+    if (dict)
+        dict_unref(dict);
+
+    return ret;
+}
+
+/*
+ * Fill @types with the transport name(s) that correspond to @type:
+ * one entry for TCP or RDMA, two entries ("tcp", then "rdma") when both
+ * are configured. Entries that are not written are left as the caller
+ * initialised them; any other @type value leaves @types untouched.
+ */
+static void
+enumerate_transport_reqs(gf_transport_type type, char **types)
+{
+    if (type == GF_TRANSPORT_TCP) {
+        types[0] = "tcp";
+    } else if (type == GF_TRANSPORT_RDMA) {
+        types[0] = "rdma";
+    } else if (type == GF_TRANSPORT_BOTH_TCP_RDMA) {
+        types[0] = "tcp";
+        types[1] = "rdma";
+    }
+}
+
+/*
+ * Generate the "dummy" client volfile of @volinfo for each transport the
+ * volume is configured with. The clients are built with "trusted-client"
+ * set to GF_CLIENT_OTHER.
+ *
+ * Returns 0 on success and non-zero on failure. ret starts at -1 and is
+ * only overwritten inside the loop, so -1 is also returned when no
+ * transport name was enumerated.
+ */
+int
+generate_dummy_client_volfiles(glusterd_volinfo_t *volinfo)
+{
+    int i = 0;
+    int ret = -1;
+    char filepath[PATH_MAX] = {
+        0,
+    };
+    /* NULL-terminated list; at most two transport names are filled in. */
+    char *types[] = {NULL, NULL, NULL};
+    dict_t *dict = NULL;
+    xlator_t *this = NULL;
+    gf_transport_type type = GF_TRANSPORT_TCP;
+
+    this = THIS;
+
+    enumerate_transport_reqs(volinfo->transport_type, types);
+    dict = dict_new();
+    if (!dict) {
+        gf_smsg(this->name, GF_LOG_ERROR, errno, GD_MSG_DICT_CREATE_FAIL, NULL);
+        goto out;
+    }
+    /* One volfile per enumerated transport, reusing the same dict. */
+    for (i = 0; types[i]; i++) {
+        ret = dict_set_str(dict, "client-transport-type", types[i]);
+        if (ret) {
+            gf_smsg(this->name, GF_LOG_ERROR, errno, GD_MSG_DICT_SET_FAILED,
+                    "Key=client-transport-type", NULL);
+            goto out;
+        }
+        type = transport_str_to_type(types[i]);
+
+        ret = dict_set_uint32(dict, "trusted-client", GF_CLIENT_OTHER);
+        if (ret) {
+            gf_smsg(this->name, GF_LOG_ERROR, errno, GD_MSG_DICT_SET_FAILED,
+                    "Key=trusted-client", NULL);
+            goto out;
+        }
+
+        ret = glusterd_get_dummy_client_filepath(filepath, volinfo, type);
+        if (ret) {
+            gf_msg(this->name, GF_LOG_ERROR, EINVAL, GD_MSG_INVALID_ENTRY,
+                   "Received invalid transport-type.");
+            goto out;
+        }
+
+        ret = generate_single_transport_client_volfile(volinfo, filepath, dict);
+        if (ret)
+            goto out;
+    }
+
+out:
+    if (dict)
+        dict_unref(dict);
+
+    gf_msg_trace("glusterd", 0, "Returning %d", ret);
+    return ret;
+}
+
+/*
+ * Generate the client volfiles of @volinfo for the given @client_type, one
+ * per transport the volume is configured with, followed by the volfile for
+ * the rebalance process.
+ *
+ * For the shared-storage volume (or a snapshot of it) nothing is generated
+ * unless @client_type is GF_CLIENT_TRUSTED; that case is reported as
+ * success.
+ *
+ * Returns 0 on success and non-zero on failure.
+ */
+int
+generate_client_volfiles(glusterd_volinfo_t *volinfo,
+                         glusterd_client_type_t client_type)
+{
+    int i = 0;
+    int ret = -1;
+    char filepath[PATH_MAX] = {
+        0,
+    };
+    char *volname = NULL;
+    /* NULL-terminated list; at most two transport names are filled in. */
+    char *types[] = {NULL, NULL, NULL};
+    dict_t *dict = NULL;
+    xlator_t *this = NULL;
+    gf_transport_type type = GF_TRANSPORT_TCP;
+
+    this = THIS;
+
+    /* A snapshot volume is identified by the name of its parent. */
+    volname = volinfo->is_snap_volume ? volinfo->parent_volname
+                                      : volinfo->volname;
+
+    if (volname && !strcmp(volname, GLUSTER_SHARED_STORAGE) &&
+        client_type != GF_CLIENT_TRUSTED) {
+        /*
+         * shared storage volume cannot be mounted from non trusted
+         * nodes. So we are not creating volfiles for non-trusted
+         * clients for shared volumes as well as snapshot of shared
+         * volumes.
+         */
+
+        ret = 0;
+        /* Adjacent literals are concatenated: keep the separating space. */
+        gf_msg_debug("glusterd", 0,
+                     "Skipping the non-trusted volfile "
+                     "creation for shared storage volume. Volume %s",
+                     volname);
+        goto out;
+    }
+
+    enumerate_transport_reqs(volinfo->transport_type, types);
+    dict = dict_new();
+    if (!dict) {
+        gf_smsg(this->name, GF_LOG_ERROR, errno, GD_MSG_DICT_CREATE_FAIL, NULL);
+        goto out;
+    }
+    for (i = 0; types[i]; i++) {
+        ret = dict_set_str(dict, "client-transport-type", types[i]);
+        if (ret) {
+            gf_smsg(this->name, GF_LOG_ERROR, errno, GD_MSG_DICT_SET_FAILED,
+                    "Key=client-transport-type", NULL);
+            goto out;
+        }
+        type = transport_str_to_type(types[i]);
+
+        ret = dict_set_uint32(dict, "trusted-client", client_type);
+        if (ret) {
+            gf_smsg(this->name, GF_LOG_ERROR, errno, GD_MSG_DICT_SET_FAILED,
+                    "Key=trusted-client", NULL);
+            goto out;
+        }
+
+        /* Pick the volfile path that matches the kind of client. */
+        if (client_type == GF_CLIENT_TRUSTED) {
+            ret = glusterd_get_trusted_client_filepath(filepath, volinfo, type);
+        } else if (client_type == GF_CLIENT_TRUSTED_PROXY) {
+            glusterd_get_gfproxy_client_volfile(volinfo, filepath, PATH_MAX);
+            ret = dict_set_int32_sizen(dict, "gfproxy-client", 1);
+        } else {
+            ret = glusterd_get_client_filepath(filepath, volinfo, type);
+        }
+        if (ret) {
+            gf_msg(this->name, GF_LOG_ERROR, EINVAL, GD_MSG_INVALID_ENTRY,
+                   "Received invalid transport-type");
+            goto out;
+        }
+
+        ret = generate_single_transport_client_volfile(volinfo, filepath, dict);
+        if (ret)
+            goto out;
+    }
+
+    /* Generate volfile for rebalance process */
+    glusterd_get_rebalance_volfile(volinfo, filepath, PATH_MAX);
+    ret = build_rebalance_volfile(volinfo, filepath, dict);
+
+    if (ret) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_VOLFILE_CREATE_FAIL,
+               "Failed to create rebalance volfile for %s", volinfo->volname);
+        goto out;
+    }
+
+out:
+    if (dict)
+        dict_unref(dict);
+
+    gf_msg_trace("glusterd", 0, "Returning %d", ret);
+    return ret;
+}
+
+/*
+ * Populate @graph with the snapd (snapview-server) service graph for
+ * @volinfo: features/snapview-server, performance/io-threads,
+ * debug/io-stats and protocol/server, added in that order.
+ *
+ * The protocol/server xlator is set up with the "tcp" transport, SSL when
+ * "server.ssl" is true in the volume options, and login credentials
+ * obtained from the volume.
+ *
+ * Returns 0 on success and non-zero on failure.
+ *
+ * NOTE(review): set_dict is created with dict_copy(..., NULL) and no release
+ * call is visible on any return path of this function - confirm whether it
+ * is leaked.
+ */
+int
+glusterd_snapdsvc_generate_volfile(volgen_graph_t *graph,
+                                   glusterd_volinfo_t *volinfo)
+{
+    xlator_t *xl = NULL;
+    char *username = NULL;
+    char *passwd = NULL;
+    int ret = 0;
+    char key[PATH_MAX] = {
+        0,
+    };
+    dict_t *set_dict = NULL;
+    char *loglevel = NULL;
+    char *xlator = NULL;
+    char *ssl_str = NULL;
+    gf_boolean_t ssl_bool = _gf_false;
+
+    set_dict = dict_copy(volinfo->dict, NULL);
+    if (!set_dict)
+        return -1;
+
+    /* "xlator" and "loglevel" must be given together. */
+    ret = dict_get_str_sizen(set_dict, "xlator", &xlator);
+    if (!ret) {
+        ret = dict_get_str_sizen(set_dict, "loglevel", &loglevel);
+        if (ret) {
+            gf_msg("glusterd", GF_LOG_ERROR, errno, GD_MSG_DICT_GET_FAILED,
+                   "could not get both"
+                   " translator name and loglevel for log level "
+                   "request");
+            return -1;
+        }
+    }
+
+    xl = volgen_graph_add(graph, "features/snapview-server", volinfo->volname);
+    if (!xl)
+        return -1;
+
+    ret = xlator_set_fixed_option(xl, "volname", volinfo->volname);
+    if (ret)
+        return -1;
+
+    xl = volgen_graph_add(graph, "performance/io-threads", volinfo->volname);
+    if (!xl)
+        return -1;
+
+    snprintf(key, sizeof(key), "snapd-%s", volinfo->volname);
+    xl = volgen_graph_add_as(graph, "debug/io-stats", key);
+    if (!xl)
+        return -1;
+
+    /* From here on xl is the protocol/server xlator. */
+    xl = volgen_graph_add(graph, "protocol/server", volinfo->volname);
+    if (!xl)
+        return -1;
+
+    ret = xlator_set_fixed_option(xl, "transport-type", "tcp");
+    if (ret)
+        return -1;
+
+    /* An unparsable "server.ssl" value is silently ignored here. */
+    if (dict_get_str_sizen(set_dict, "server.ssl", &ssl_str) == 0) {
+        if (gf_string2boolean(ssl_str, &ssl_bool) == 0) {
+            if (ssl_bool) {
+                ret = xlator_set_fixed_option(
+                    xl, "transport.socket.ssl-enabled", "true");
+                if (ret) {
+                    return -1;
+                }
+            }
+        }
+    }
+
+    /*
+     * RPC_SET_OPT is a macro defined elsewhere; presumably it copies each
+     * SSL option onto xl when it is set, running the last argument on
+     * failure - TODO confirm against its definition.
+     */
+    RPC_SET_OPT(xl, SSL_OWN_CERT_OPT, "ssl-own-cert", return -1);
+    RPC_SET_OPT(xl, SSL_PRIVATE_KEY_OPT, "ssl-private-key", return -1);
+    RPC_SET_OPT(xl, SSL_CA_LIST_OPT, "ssl-ca-list", return -1);
+    RPC_SET_OPT(xl, SSL_CRL_PATH_OPT, "ssl-crl-path", return -1);
+    RPC_SET_OPT(xl, SSL_CERT_DEPTH_OPT, "ssl-cert-depth", return -1);
+    RPC_SET_OPT(xl, SSL_CIPHER_LIST_OPT, "ssl-cipher-list", return -1);
+    RPC_SET_OPT(xl, SSL_DH_PARAM_OPT, "ssl-dh-param", return -1);
+    RPC_SET_OPT(xl, SSL_EC_CURVE_OPT, "ssl-ec-curve", return -1);
+
+    username = glusterd_auth_get_username(volinfo);
+    passwd = glusterd_auth_get_password(volinfo);
+
+    /* The snprintf() result is passed on as the length of the key. */
+    ret = snprintf(key, sizeof(key), "auth.login.snapd-%s.allow",
+                   volinfo->volname);
+    ret = xlator_set_option(xl, key, ret, username);
+    if (ret)
+        return -1;
+
+    ret = snprintf(key, sizeof(key), "auth.login.%s.password", username);
+    ret = xlator_set_option(xl, key, ret, passwd);
+    if (ret)
+        return -1;
+
+    snprintf(key, sizeof(key), "snapd-%s", volinfo->volname);
+    ret = xlator_set_fixed_option(xl, "auth-path", key);
+    if (ret)
+        return -1;
+
+    /*
+     * With a per-xlator log level request the extended handler is used and
+     * receives set_dict as its parameter; otherwise the plain handler
+     * receives volinfo.
+     */
+    ret = volgen_graph_set_options_generic(
+        graph, set_dict, (xlator && loglevel) ? (void *)set_dict : volinfo,
+        (xlator && loglevel) ? &server_spec_extended_option_handler
+                             : &server_spec_option_handler);
+
+    return ret;
+}
+
+/*
+ * Prepare @set_dict for building a bitrot/scrubber graph: mark the client
+ * as trusted, then copy in the volume's options followed by the optional
+ * overrides from @mod_dict.
+ *
+ * Returns 0 on success, or the non-zero result of dict_set_uint32() if the
+ * trusted-client key cannot be set.
+ */
+static int
+prepare_bitrot_scrub_volume_options(glusterd_volinfo_t *volinfo,
+                                    dict_t *mod_dict, dict_t *set_dict)
+{
+    xlator_t *this = THIS;
+    int ret = 0;
+
+    GF_ASSERT(this);
+
+    ret = dict_set_uint32(set_dict, "trusted-client", GF_CLIENT_TRUSTED);
+    if (ret) {
+        gf_smsg(this->name, GF_LOG_ERROR, errno, GD_MSG_DICT_SET_FAILED,
+                "Key=trusted-client", NULL);
+        return ret;
+    }
+
+    /* Volume options first, then the caller's overrides on top. */
+    dict_copy(volinfo->dict, set_dict);
+    if (mod_dict)
+        dict_copy(mod_dict, set_dict);
+
+    return 0;
+}
+
+/*
+ * Link the client xlators at the tail of @graph under a
+ * "features/bit-rot" xlator and record @numbricks on the first xlator of
+ * the graph as its "brick-count" option.
+ *
+ * @set_dict is accepted but not used here.
+ *
+ * Returns the value reported by volgen_link_bricks_from_list_tail() on
+ * success and a negative value on failure.
+ */
+static int
+build_bitd_clusters(volgen_graph_t *graph, glusterd_volinfo_t *volinfo,
+                    dict_t *set_dict, int brick_count, unsigned int numbricks)
+{
+    int ret = -1;
+    int clusters = 0;
+    xlator_t *xl = NULL;
+    char *brick_hint = NULL;
+    char *bitrot_args[] = {"features/bit-rot", "%s-bit-rot-%d"};
+
+    ret = volgen_link_bricks_from_list_tail(graph, volinfo, bitrot_args[0],
+                                            bitrot_args[1], brick_count,
+                                            brick_count);
+    /* Do not go on configuring a graph whose linking step has failed. */
+    if (ret < 0)
+        goto out;
+    clusters = ret;
+
+    xl = first_of(graph);
+
+    ret = gf_asprintf(&brick_hint, "%d", numbricks);
+    if (ret < 0)
+        goto out;
+
+    ret = xlator_set_fixed_option(xl, "brick-count", brick_hint);
+    if (ret)
+        goto out;
+
+    ret = clusters;
+
+out:
+    GF_FREE(brick_hint);
+    return ret;
+}
+
+/*
+ * Build the bitrot daemon sub-graph for one volume and merge it into
+ * @graph. Only bricks local to this node get a client xlator; when the
+ * volume has no local brick nothing is merged and 0 is returned.
+ *
+ * @mod_dict:  optional option overrides
+ * @numbricks: value recorded as the "brick-count" hint by
+ *             build_bitd_clusters()
+ *
+ * Returns 0 on success and non-zero on failure.
+ */
+static int
+build_bitd_volume_graph(volgen_graph_t *graph, glusterd_volinfo_t *volinfo,
+                        dict_t *mod_dict, unsigned int numbricks)
+{
+    volgen_graph_t cgraph = {0};
+    xlator_t *this = NULL;
+    xlator_t *xl = NULL;
+    dict_t *set_dict = NULL;
+    glusterd_conf_t *priv = NULL;
+    int ret = 0;
+    int clusters = -1;
+    glusterd_brickinfo_t *brickinfo = NULL;
+    int brick_count = 0;
+    char transt[16] = {
+        0,
+    };
+
+    this = THIS;
+    GF_ASSERT(this);
+
+    priv = this->private;
+    GF_ASSERT(priv);
+
+    set_dict = dict_new();
+    if (!set_dict) {
+        gf_smsg(this->name, GF_LOG_ERROR, errno, GD_MSG_DICT_CREATE_FAIL, NULL);
+        ret = -1;
+        goto out;
+    }
+
+    ret = prepare_bitrot_scrub_volume_options(volinfo, mod_dict, set_dict);
+    if (ret)
+        goto out;
+
+    /* A "tcp,rdma" volume is served over plain tcp here. */
+    get_transport_type(volinfo, set_dict, transt, _gf_false);
+    if (!strncmp(transt, "tcp,rdma", SLEN("tcp,rdma")))
+        (void)snprintf(transt, sizeof(transt), "%s", "tcp");
+
+    cds_list_for_each_entry(brickinfo, &volinfo->bricks, brick_list)
+    {
+        if (!glusterd_is_local_brick(this, volinfo, brickinfo))
+            continue;
+
+        xl = volgen_graph_build_client(&cgraph, volinfo, brickinfo->hostname,
+                                       NULL, brickinfo->path,
+                                       brickinfo->brick_id, transt, set_dict);
+        if (!xl) {
+            ret = -1;
+            goto out;
+        }
+        brick_count++;
+    }
+
+    /* No local brick: nothing to build for this volume. */
+    if (brick_count == 0) {
+        ret = 0;
+        goto out;
+    }
+
+    clusters = build_bitd_clusters(&cgraph, volinfo, set_dict, brick_count,
+                                   numbricks);
+    if (clusters < 0) {
+        ret = -1;
+        goto out;
+    }
+
+    ret = volgen_graph_set_options_generic(&cgraph, set_dict, volinfo,
+                                           bitrot_option_handler);
+    if (ret)
+        goto out;
+
+    ret = volgen_graph_merge_sub(graph, &cgraph, clusters);
+    if (ret)
+        goto out;
+
+    ret = graph_set_generic_options(this, graph, set_dict, "Bitrot");
+
+out:
+    if (set_dict)
+        dict_unref(set_dict);
+
+    return ret;
+}
+
+/*
+ * Build the graph for the bit-rot daemon.
+ *
+ * Adds a "debug/io-stats" xlator named "bitd", counts the local bricks of
+ * every started volume that has bitrot enabled, and then appends the
+ * per-volume sub-graph of each such volume via build_bitd_volume_graph().
+ *
+ * @graph:    graph to populate
+ * @mod_dict: forwarded to build_bitd_volume_graph()
+ *
+ * Returns 0 on success and non-zero as soon as any step fails.
+ */
+int
+build_bitd_graph(volgen_graph_t *graph, dict_t *mod_dict)
+{
+    glusterd_volinfo_t *voliter = NULL;
+    xlator_t *this = NULL;
+    glusterd_conf_t *priv = NULL;
+    int ret = 0;
+    xlator_t *iostxl = NULL;
+    glusterd_brickinfo_t *brickinfo = NULL;
+    unsigned int numbricks = 0;
+
+    this = THIS;
+    GF_ASSERT(this);
+    priv = this->private;
+    GF_ASSERT(priv);
+
+    iostxl = volgen_graph_add_as(graph, "debug/io-stats", "bitd");
+    if (!iostxl) {
+        ret = -1;
+        goto out;
+    }
+
+    /* TODO: do away with this extra loop _if possible_ */
+    cds_list_for_each_entry(voliter, &priv->volumes, vol_list)
+    {
+        if (voliter->status != GLUSTERD_STATUS_STARTED)
+            continue;
+        if (!glusterd_is_bitrot_enabled(voliter))
+            continue;
+
+        cds_list_for_each_entry(brickinfo, &voliter->bricks, brick_list)
+        {
+            if (!glusterd_is_local_brick(this, voliter, brickinfo))
+                continue;
+            numbricks++;
+        }
+    }
+
+    cds_list_for_each_entry(voliter, &priv->volumes, vol_list)
+    {
+        if (voliter->status != GLUSTERD_STATUS_STARTED)
+            continue;
+
+        if (!glusterd_is_bitrot_enabled(voliter))
+            continue;
+
+        ret = build_bitd_volume_graph(graph, voliter, mod_dict, numbricks);
+        /* Stop at the first failure; otherwise a later volume's success
+         * would overwrite ret and hide the error. */
+        if (ret)
+            goto out;
+    }
+out:
+    return ret;
+}
+
+/*
+ * Link the client xlators already present in @graph under
+ * "features/bit-rot" xlators (via volgen_link_bricks_from_list_tail()) and
+ * set the "scrubber" option to "true" on the xlator returned by
+ * first_of(@graph).
+ *
+ * @graph:       graph holding the per-brick client xlators
+ * @volinfo:     volume the graph is generated for
+ * @set_dict:    not used by this function
+ * @brick_count: number of clients to link; also passed as the sub-count
+ *
+ * Returns the cluster count reported by
+ * volgen_link_bricks_from_list_tail() on success, or the error value of the
+ * step that failed.
+ */
+static int
+build_scrub_clusters(volgen_graph_t *graph, glusterd_volinfo_t *volinfo,
+                     dict_t *set_dict, int brick_count)
+{
+    int ret = -1;
+    int clusters = 0;
+    xlator_t *xl = NULL;
+    char *scrub_args[] = {"features/bit-rot", "%s-bit-rot-%d"};
+
+    clusters = volgen_link_bricks_from_list_tail(
+        graph, volinfo, scrub_args[0], scrub_args[1], brick_count, brick_count);
+    if (clusters < 0) {
+        /* Linking failed: bail out before setting options on whatever
+         * xlator happens to be first in the graph. */
+        ret = clusters;
+        goto out;
+    }
+
+    xl = first_of(graph);
+
+    ret = xlator_set_fixed_option(xl, "scrubber", "true");
+    if (ret)
+        goto out;
+
+    ret = clusters;
+
+out:
+    return ret;
+}
+
+/*
+ * Build the scrubber sub-graph for one volume and merge it into @graph.
+ *
+ * A client xlator is created in a scratch graph (cgraph) for every brick of
+ * @volinfo that glusterd_is_local_brick() reports as local, the clients are
+ * linked through build_scrub_clusters(), options are applied, and the
+ * scratch graph is merged into @graph.
+ *
+ * @graph:    destination graph being assembled
+ * @volinfo:  volume to generate the sub-graph for
+ * @mod_dict: passed to prepare_bitrot_scrub_volume_options()
+ *
+ * Returns 0 on success (including when the volume has no local brick, in
+ * which case nothing is merged) and non-zero on failure.
+ */
+static int
+build_scrub_volume_graph(volgen_graph_t *graph, glusterd_volinfo_t *volinfo,
+                         dict_t *mod_dict)
+{
+    volgen_graph_t cgraph = {0};
+    dict_t *set_dict = NULL;
+    xlator_t *this = NULL;
+    xlator_t *xl = NULL;
+    glusterd_conf_t *priv = NULL;
+    int ret = 0;
+    int clusters = -1;
+    int brick_count = 0;
+    char transt[16] = {
+        0,
+    };
+    glusterd_brickinfo_t *brickinfo = NULL;
+
+    this = THIS;
+    GF_ASSERT(this);
+
+    priv = this->private;
+    GF_ASSERT(priv);
+
+    set_dict = dict_new();
+    if (!set_dict) {
+        /* Log the allocation failure, as build_bitd_volume_graph() does. */
+        gf_smsg(this->name, GF_LOG_ERROR, errno, GD_MSG_DICT_CREATE_FAIL, NULL);
+        ret = -1;
+        goto out;
+    }
+
+    ret = prepare_bitrot_scrub_volume_options(volinfo, mod_dict, set_dict);
+    if (ret)
+        goto out;
+
+    /* A transport string starting with "tcp,rdma" is narrowed to "tcp". */
+    get_transport_type(volinfo, set_dict, transt, _gf_false);
+    if (!strncmp(transt, "tcp,rdma", SLEN("tcp,rdma")))
+        (void)snprintf(transt, sizeof(transt), "%s", "tcp");
+
+    /* One client xlator per local brick; remote bricks are skipped. */
+    cds_list_for_each_entry(brickinfo, &volinfo->bricks, brick_list)
+    {
+        if (!glusterd_is_local_brick(this, volinfo, brickinfo))
+            continue;
+
+        xl = volgen_graph_build_client(&cgraph, volinfo, brickinfo->hostname,
+                                       NULL, brickinfo->path,
+                                       brickinfo->brick_id, transt, set_dict);
+        if (!xl) {
+            ret = -1;
+            goto out;
+        }
+        brick_count++;
+    }
+
+    /* No local brick: this volume contributes nothing to the graph. */
+    if (brick_count == 0) {
+        ret = 0;
+        goto out;
+    }
+
+    clusters = build_scrub_clusters(&cgraph, volinfo, set_dict, brick_count);
+    if (clusters < 0) {
+        ret = -1;
+        goto out;
+    }
+
+    ret = volgen_graph_set_options_generic(&cgraph, set_dict, volinfo,
+                                           scrubber_option_handler);
+    if (ret)
+        goto out;
+
+    ret = volgen_graph_merge_sub(graph, &cgraph, clusters);
+    if (ret)
+        goto out;
+
+    ret = graph_set_generic_options(this, graph, set_dict, "Scrubber");
+out:
+    if (set_dict)
+        dict_unref(set_dict);
+
+    return ret;
+}
+
+/*
+ * Build the graph for the scrubber daemon.
+ *
+ * Adds a "debug/io-stats" xlator named "scrub" and then appends the
+ * per-volume sub-graph of every started volume that has bitrot enabled via
+ * build_scrub_volume_graph().
+ *
+ * @graph:    graph to populate
+ * @mod_dict: forwarded to build_scrub_volume_graph()
+ *
+ * Returns 0 on success and non-zero as soon as any step fails.
+ */
+int
+build_scrub_graph(volgen_graph_t *graph, dict_t *mod_dict)
+{
+    glusterd_volinfo_t *voliter = NULL;
+    xlator_t *this = NULL;
+    glusterd_conf_t *priv = NULL;
+    int ret = 0;
+    xlator_t *iostxl = NULL;
+
+    this = THIS;
+    GF_ASSERT(this);
+    priv = this->private;
+    GF_ASSERT(priv);
+
+    iostxl = volgen_graph_add_as(graph, "debug/io-stats", "scrub");
+    if (!iostxl) {
+        ret = -1;
+        goto out;
+    }
+
+    cds_list_for_each_entry(voliter, &priv->volumes, vol_list)
+    {
+        if (voliter->status != GLUSTERD_STATUS_STARTED)
+            continue;
+
+        if (!glusterd_is_bitrot_enabled(voliter))
+            continue;
+
+        ret = build_scrub_volume_graph(graph, voliter, mod_dict);
+        /* Stop at the first failure; otherwise a later volume's success
+         * would overwrite ret and hide the error. */
+        if (ret)
+            goto out;
+    }
+out:
+    return ret;
+}
+
+/*
+ * Generate the snapd graph for @volinfo and write it to the path produced
+ * by glusterd_svc_build_snapd_volfile().
+ *
+ * Returns 0 on success, otherwise the error from graph generation or from
+ * writing the volfile.
+ */
+int
+glusterd_snapdsvc_create_volfile(glusterd_volinfo_t *volinfo)
+{
+    char volfile_path[PATH_MAX] = {
+        0,
+    };
+    volgen_graph_t snapd_graph = {
+        0,
+    };
+    int rc = -1;
+
+    snapd_graph.type = GF_SNAPD;
+    glusterd_svc_build_snapd_volfile(volinfo, volfile_path, PATH_MAX);
+
+    rc = glusterd_snapdsvc_generate_volfile(&snapd_graph, volinfo);
+    if (rc == 0)
+        rc = volgen_write_volfile(&snapd_graph, volfile_path);
+
+    /* The graph is released whether or not generation succeeded. */
+    volgen_graph_free(&snapd_graph);
+
+    return rc;
+}
+
+/*
+ * Regenerate the volfile of @brickinfo and the trusted client volfiles of
+ * @volinfo, then call glusterd_fetchspec_notify().
+ *
+ * Each step runs only if the previous one returned 0; the first non-zero
+ * result is returned.
+ */
+int
+glusterd_create_rb_volfiles(glusterd_volinfo_t *volinfo,
+                            glusterd_brickinfo_t *brickinfo)
+{
+    int rc = glusterd_generate_brick_volfile(volinfo, brickinfo, NULL, NULL);
+
+    if (rc)
+        return rc;
+
+    rc = generate_client_volfiles(volinfo, GF_CLIENT_TRUSTED);
+    if (rc)
+        return rc;
+
+    return glusterd_fetchspec_notify(THIS);
+}
+
+/*
+ * Generate every volfile belonging to @volinfo: brick volfiles, the trusted,
+ * gfproxy and regular client volfiles, the gfproxyd volfile and the shd
+ * volfile.
+ *
+ * Failures of the brick, trusted-client and gfproxy-client steps abort
+ * immediately and are returned. The later steps only log on failure and
+ * keep going, so the value finally returned is the result of
+ * glusterd_shdsvc_create_volfile().
+ */
+int
+glusterd_create_volfiles(glusterd_volinfo_t *volinfo)
+{
+    int ret = -1;
+    xlator_t *this = NULL;
+
+    this = THIS;
+
+    ret = generate_brick_volfiles(volinfo);
+    if (ret) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_VOLFILE_CREATE_FAIL,
+               "Could not generate volfiles for bricks");
+        goto out;
+    }
+
+    ret = generate_client_volfiles(volinfo, GF_CLIENT_TRUSTED);
+    if (ret) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_VOLFILE_CREATE_FAIL,
+               "Could not generate trusted client volfiles");
+        goto out;
+    }
+
+    ret = generate_client_volfiles(volinfo, GF_CLIENT_TRUSTED_PROXY);
+    if (ret) {
+        gf_log(this->name, GF_LOG_ERROR,
+               "Could not generate gfproxy client volfiles");
+        goto out;
+    }
+
+    /* NOTE(review): from here on a failure is logged but ret is overwritten
+     * by the next step, so these errors never reach the caller -- confirm
+     * this best-effort behaviour is intended. */
+    ret = generate_client_volfiles(volinfo, GF_CLIENT_OTHER);
+    if (ret)
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_VOLFILE_CREATE_FAIL,
+               "Could not generate client volfiles");
+
+    ret = glusterd_generate_gfproxyd_volfile(volinfo);
+    if (ret)
+        gf_log(this->name, GF_LOG_ERROR, "Could not generate gfproxy volfiles");
+
+    ret = glusterd_shdsvc_create_volfile(volinfo);
+    if (ret)
+        gf_log(this->name, GF_LOG_ERROR, "Could not generate shd volfiles");
+
+    /* NOTE(review): the early "goto out" paths above skip this deletion --
+     * confirm the key cannot be left behind on failure. */
+    dict_del_sizen(volinfo->dict, "skip-CLIOT");
+
+out:
+    return ret;
+}
+
+/*
+ * Regenerate all volfiles of @volinfo and, only if that succeeded, call
+ * glusterd_fetchspec_notify().
+ *
+ * Returns the first non-zero result, or 0 when both steps succeed.
+ */
+int
+glusterd_create_volfiles_and_notify_services(glusterd_volinfo_t *volinfo)
+{
+    xlator_t *this = THIS;
+    int rc = glusterd_create_volfiles(volinfo);
+
+    if (rc != 0)
+        return rc;
+
+    return glusterd_fetchspec_notify(this);
+}
+
+/*
+ * Build a graph with @builder and write it to @filepath.
+ *
+ * @builder:  callback that fills a zero-initialised graph
+ * @filepath: destination volfile path
+ * @mod_dict: passed through to @builder
+ *
+ * Returns 0 on success, otherwise the error from @builder or from writing
+ * the volfile. The graph is freed in either case.
+ */
+int
+glusterd_create_global_volfile(glusterd_graph_builder_t builder, char *filepath,
+                               dict_t *mod_dict)
+{
+    volgen_graph_t scratch = {
+        0,
+    };
+    int rc = builder(&scratch, mod_dict);
+
+    if (rc == 0)
+        rc = volgen_write_volfile(&scratch, filepath);
+
+    volgen_graph_free(&scratch);
+
+    return rc;
+}
+
+/*
+ * Remove the volfile of @brickinfo; the path comes from
+ * get_brick_filepath().
+ *
+ * Returns the result of sys_unlink(); a failure is logged together with
+ * errno and the file name.
+ */
+int
+glusterd_delete_volfile(glusterd_volinfo_t *volinfo,
+                        glusterd_brickinfo_t *brickinfo)
+{
+    char volfile_path[PATH_MAX] = {
+        0,
+    };
+    int rc = 0;
+
+    GF_ASSERT(volinfo);
+    GF_ASSERT(brickinfo);
+
+    get_brick_filepath(volfile_path, volinfo, brickinfo, NULL);
+
+    rc = sys_unlink(volfile_path);
+    if (rc != 0) {
+        gf_msg("glusterd", GF_LOG_ERROR, errno, GD_MSG_FILE_OP_FAILED,
+               "failed to delete file: %s", volfile_path);
+    }
+
+    return rc;
+}
+
+/*
+ * Validate the options in @val_dict against the self-heal daemon graph of
+ * @volinfo.
+ *
+ * Volumes for which glusterd_is_shd_compatible_volume() is false are
+ * accepted without any check. Otherwise "graph-check" is set in @val_dict
+ * while the shd graph is built and validated. The key is removed again
+ * before returning, on every path.
+ *
+ * Returns 0 when validation passes (or is skipped), non-zero otherwise.
+ */
+int
+validate_shdopts(glusterd_volinfo_t *volinfo, dict_t *val_dict,
+                 char **op_errstr)
+{
+    volgen_graph_t shd_graph = {
+        0,
+    };
+    int rc = -1;
+
+    shd_graph.errstr = op_errstr;
+
+    if (glusterd_is_shd_compatible_volume(volinfo)) {
+        rc = dict_set_int32_sizen(val_dict, "graph-check", 1);
+        if (rc) {
+            gf_smsg("glusterd", GF_LOG_ERROR, errno, GD_MSG_DICT_SET_FAILED,
+                    "Key=graph-check", NULL);
+        } else {
+            rc = build_shd_graph(volinfo, &shd_graph, val_dict);
+            if (rc == 0)
+                rc = graph_reconf_validateopt(&shd_graph.graph, op_errstr);
+
+            volgen_graph_free(&shd_graph);
+
+            gf_msg_debug("glusterd", 0, "Returning %d", rc);
+        }
+    } else {
+        /* Nothing to validate for this volume type. */
+        rc = 0;
+    }
+
+    dict_del_sizen(val_dict, "graph-check");
+    return rc;
+}
+
+#ifdef BUILD_GNFS
+/*
+ * Validate the options in @val_dict against the gNFS graph.
+ *
+ * If "nfs.transport-type" is present in @val_dict it is only accepted for
+ * volumes whose transport type is GF_TRANSPORT_BOTH_TCP_RDMA, and its value
+ * must be "tcp" or "rdma". On rejection *op_errstr receives a gf_strdup()ed
+ * message. "volume-name" is set in @val_dict while the graph is built and
+ * is removed again before returning.
+ *
+ * Returns 0 when validation passes, non-zero otherwise.
+ */
+static int
+validate_nfsopts(glusterd_volinfo_t *volinfo, dict_t *val_dict,
+                 char **op_errstr)
+{
+    volgen_graph_t graph = {
+        0,
+    };
+    int ret = -1;
+    char transport_type[16] = {
+        0,
+    };
+    char *tt = NULL;
+    char err_str[128] = {
+        0,
+    };
+    xlator_t *this = THIS;
+
+    GF_ASSERT(this);
+
+    graph.errstr = op_errstr;
+
+    /* NOTE(review): transport_type is filled here but never read again in
+     * this function -- confirm whether the call is still needed. */
+    get_vol_transport_type(volinfo, transport_type);
+    ret = dict_get_str_sizen(val_dict, "nfs.transport-type", &tt);
+    if (!ret) {
+        /* The key is present: check that it may be changed at all ... */
+        if (volinfo->transport_type != GF_TRANSPORT_BOTH_TCP_RDMA) {
+            snprintf(err_str, sizeof(err_str),
+                     "Changing nfs "
+                     "transport type is allowed only for volumes "
+                     "of transport type tcp,rdma");
+            gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_OP_UNSUPPORTED, "%s",
+                   err_str);
+            *op_errstr = gf_strdup(err_str);
+            ret = -1;
+            goto out;
+        }
+        /* ... and that the requested value is one of the two allowed. */
+        if (strcmp(tt, "tcp") && strcmp(tt, "rdma")) {
+            snprintf(err_str, sizeof(err_str),
+                     "wrong transport "
+                     "type %s",
+                     tt);
+            gf_smsg(this->name, GF_LOG_ERROR, 0, GD_MSG_INCOMPATIBLE_VALUE,
+                    "Type=%s", tt, NULL);
+            *op_errstr = gf_strdup(err_str);
+            ret = -1;
+            goto out;
+        }
+    }
+
+    ret = dict_set_str_sizen(val_dict, "volume-name", volinfo->volname);
+    if (ret) {
+        gf_msg(this->name, GF_LOG_ERROR, errno, GD_MSG_DICT_SET_FAILED,
+               "Failed to set volume name");
+        goto out;
+    }
+
+    ret = build_nfs_graph(&graph, val_dict);
+    if (!ret)
+        ret = graph_reconf_validateopt(&graph.graph, op_errstr);
+
+    volgen_graph_free(&graph);
+
+out:
+    /* Remove the temporary key so the caller's dict is left unchanged. */
+    if (dict_get_sizen(val_dict, "volume-name"))
+        dict_del_sizen(val_dict, "volume-name");
+    gf_msg_debug(this->name, 0, "Returning %d", ret);
+    return ret;
+}
+#endif
+
+/*
+ * Validate the options in @val_dict against the client graph of @volinfo.
+ *
+ * Builds a throw-away client graph and, if that succeeds, runs
+ * graph_reconf_validateopt() on it; errors are reported through
+ * @op_errstr.
+ *
+ * Returns 0 when validation passes, non-zero otherwise.
+ */
+int
+validate_clientopts(glusterd_volinfo_t *volinfo, dict_t *val_dict,
+                    char **op_errstr)
+{
+    volgen_graph_t client_graph = {
+        0,
+    };
+    int rc = -1;
+
+    GF_ASSERT(volinfo);
+
+    client_graph.errstr = op_errstr;
+
+    rc = build_client_graph(&client_graph, volinfo, val_dict);
+    if (rc == 0)
+        rc = graph_reconf_validateopt(&client_graph.graph, op_errstr);
+
+    volgen_graph_free(&client_graph);
+
+    gf_msg_debug("glusterd", 0, "Returning %d", rc);
+    return rc;
+}
+
+/*
+ * Validate options against the server (brick) graph of one brick.
+ *
+ * @volinfo:   volume the brick belongs to
+ * @brickinfo: brick whose graph is built
+ * @mod_dict:  optional options, copied first into the working dict
+ * @reconf:    struct gd_validate_reconf_opts carrying the options to
+ *             validate and the error-string slot
+ *
+ * Both dicts are copied into a fresh dict (the validated options second),
+ * which is then used to build and validate the graph.
+ *
+ * Returns 0 when validation passes, non-zero otherwise.
+ */
+int
+validate_brickopts(glusterd_volinfo_t *volinfo, glusterd_brickinfo_t *brickinfo,
+                   dict_t *mod_dict, void *reconf)
+{
+    struct gd_validate_reconf_opts *opts = reconf;
+    dict_t *val_dict = opts->options;
+    char **op_errstr = opts->op_errstr;
+    volgen_graph_t brick_graph = {
+        0,
+    };
+    dict_t *merged = NULL;
+    int rc = -1;
+
+    GF_ASSERT(volinfo);
+
+    brick_graph.errstr = op_errstr;
+
+    merged = dict_new();
+    if (!merged) {
+        gf_smsg("glusterd", GF_LOG_ERROR, errno, GD_MSG_DICT_CREATE_FAIL, NULL);
+        rc = -1;
+    } else {
+        if (mod_dict)
+            dict_copy(mod_dict, merged);
+
+        if (val_dict)
+            dict_copy(val_dict, merged);
+
+        rc = build_server_graph(&brick_graph, volinfo, merged, brickinfo);
+        if (rc == 0)
+            rc = graph_reconf_validateopt(&brick_graph.graph, op_errstr);
+
+        volgen_graph_free(&brick_graph);
+        dict_unref(merged);
+    }
+
+    gf_msg_debug("glusterd", 0, "Returning %d", rc);
+    return rc;
+}
+
+/*
+ * Run validate_brickopts() through glusterd_volume_brick_for_each() for
+ * @volinfo, handing it @val_dict and @op_errstr packed into a
+ * gd_validate_reconf_opts.
+ *
+ * Returns whatever glusterd_volume_brick_for_each() returns.
+ */
+int
+glusterd_validate_brickreconf(glusterd_volinfo_t *volinfo, dict_t *val_dict,
+                              char **op_errstr)
+{
+    struct gd_validate_reconf_opts reconf_args = {0};
+
+    reconf_args.op_errstr = op_errstr;
+    reconf_args.options = val_dict;
+
+    return glusterd_volume_brick_for_each(volinfo, &reconf_args,
+                                          validate_brickopts);
+}
+
+/*
+ * dict_foreach() callback: set the int behind @ret_val to 1 when @key is
+ * not accepted by glusterd_check_globaloption().
+ *
+ * Once the flag is set, later keys are no longer checked. Always returns 0.
+ */
+static int
+_check_globalopt(dict_t *this, char *key, data_t *value, void *ret_val)
+{
+    int *not_global = ret_val;
+
+    if (*not_global == 0 && !glusterd_check_globaloption(key))
+        *not_global = 1;
+
+    return 0;
+}
+
+/*
+ * Validate a set of global options.
+ *
+ * Every key in @val_dict must pass glusterd_check_globaloption(); otherwise
+ * *op_errstr is set and -1 is returned. The options are then validated in
+ * turn against the brick graphs, the client graph, the gNFS graph (only
+ * when built with BUILD_GNFS) and the self-heal daemon graph, stopping at
+ * the first failure.
+ *
+ * Returns 0 when everything validates, non-zero otherwise.
+ */
+int
+glusterd_validate_globalopts(glusterd_volinfo_t *volinfo, dict_t *val_dict,
+                             char **op_errstr)
+{
+    int rc = 0;
+
+    /* _check_globalopt() raises rc to 1 on the first non-global key. */
+    dict_foreach(val_dict, _check_globalopt, &rc);
+    if (rc != 0) {
+        *op_errstr = gf_strdup("option specified is not a global option");
+        return -1;
+    }
+
+    rc = glusterd_validate_brickreconf(volinfo, val_dict, op_errstr);
+    if (rc != 0) {
+        gf_msg_debug("glusterd", 0, "Could not Validate  bricks");
+        goto done;
+    }
+
+    rc = validate_clientopts(volinfo, val_dict, op_errstr);
+    if (rc != 0) {
+        gf_msg_debug("glusterd", 0, "Could not Validate client");
+        goto done;
+    }
+
+#ifdef BUILD_GNFS
+    rc = validate_nfsopts(volinfo, val_dict, op_errstr);
+    if (rc != 0) {
+        gf_msg_debug("glusterd", 0, "Could not Validate nfs");
+        goto done;
+    }
+#endif
+
+    rc = validate_shdopts(volinfo, val_dict, op_errstr);
+    if (rc != 0)
+        gf_msg_debug("glusterd", 0, "Could not Validate self-heald");
+
+done:
+    gf_msg_debug("glusterd", 0, "Returning %d", rc);
+    return rc;
+}
+
+/*
+ * dict_foreach() callback: set the int behind @ret_val to 1 when @key is
+ * not accepted by glusterd_check_localoption().
+ *
+ * Once the flag is set, later keys are no longer checked. Always returns 0.
+ */
+static int
+_check_localopt(dict_t *this, char *key, data_t *value, void *ret_val)
+{
+    int *not_local = ret_val;
+
+    if (*not_local == 0 && !glusterd_check_localoption(key))
+        *not_local = 1;
+
+    return 0;
+}
+
+/*
+ * Validate a set of local (per-volume) options.
+ *
+ * Every key in @val_dict must pass glusterd_check_localoption(); otherwise
+ * *op_errstr is set and -1 is returned. The options are then validated in
+ * turn against the brick graphs, the client graph, the gNFS graph (only
+ * when built with BUILD_GNFS) and the self-heal daemon graph, stopping at
+ * the first failure.
+ *
+ * Returns 0 when everything validates, non-zero otherwise.
+ */
+int
+glusterd_validate_reconfopts(glusterd_volinfo_t *volinfo, dict_t *val_dict,
+                             char **op_errstr)
+{
+    int rc = 0;
+
+    /* _check_localopt() raises rc to 1 on the first non-local key. */
+    dict_foreach(val_dict, _check_localopt, &rc);
+    if (rc != 0) {
+        *op_errstr = gf_strdup("option specified is not a local option");
+        return -1;
+    }
+
+    rc = glusterd_validate_brickreconf(volinfo, val_dict, op_errstr);
+    if (rc != 0) {
+        gf_msg_debug("glusterd", 0, "Could not Validate  bricks");
+        goto done;
+    }
+
+    rc = validate_clientopts(volinfo, val_dict, op_errstr);
+    if (rc != 0) {
+        gf_msg_debug("glusterd", 0, "Could not Validate client");
+        goto done;
+    }
+
+#ifdef BUILD_GNFS
+    rc = validate_nfsopts(volinfo, val_dict, op_errstr);
+    if (rc != 0) {
+        gf_msg_debug("glusterd", 0, "Could not Validate nfs");
+        goto done;
+    }
+#endif
+
+    rc = validate_shdopts(volinfo, val_dict, op_errstr);
+    if (rc != 0)
+        gf_msg_debug("glusterd", 0, "Could not Validate self-heald");
+
+done:
+    gf_msg_debug("glusterd", 0, "Returning %d", rc);
+    return rc;
+}
+
+/*
+ * Look up the entry of glusterd_volopt_map whose key matches @key.
+ *
+ * A @key without a '.' is first expanded with option_complete() and the
+ * expanded name is used for the lookup. Previously the expansion was
+ * computed and freed again by COMPLETE_OPTION without being used, so such
+ * keys were compared verbatim against the table.
+ *
+ * Returns the matching entry, or NULL when @key is NULL, cannot be
+ * expanded, or matches no entry.
+ */
+struct volopt_map_entry *
+gd_get_vmep(const char *key)
+{
+    char *completion = NULL;
+    const char *lookup = key;
+    struct volopt_map_entry *vmep = NULL;
+    struct volopt_map_entry *found = NULL;
+    int ret = 0;
+
+    if (!key)
+        return NULL;
+
+    /* Open-coded instead of COMPLETE_OPTION: the macro hides a
+     * "return _gf_false" and frees the completion before it can be used. */
+    if (!strchr(key, '.')) {
+        ret = option_complete((char *)key, &completion);
+        if (ret) {
+            gf_msg("", GF_LOG_ERROR, ENOMEM, GD_MSG_NO_MEMORY,
+                   "Out of memory");
+            return NULL;
+        }
+
+        if (!completion) {
+            gf_msg("", GF_LOG_ERROR, 0, GD_MSG_INVALID_ENTRY,
+                   "option %s does not exist", key);
+            return NULL;
+        }
+
+        lookup = completion;
+    }
+
+    for (vmep = glusterd_volopt_map; vmep->key; vmep++) {
+        if (strcmp(vmep->key, lookup) == 0) {
+            found = vmep;
+            break;
+        }
+    }
+
+    /* completion is still NULL when @key already contained a '.'. */
+    if (completion)
+        GF_FREE(completion);
+
+    return found;
+}
+
+/* Return the op_version recorded in @vmep, or 0 when @vmep is NULL. */
+uint32_t
+glusterd_get_op_version_from_vmep(struct volopt_map_entry *vmep)
+{
+    return vmep ? vmep->op_version : 0;
+}
+
+/*
+ * Return _gf_true when @vmep is non-NULL and has VOLOPT_FLAG_CLIENT_OPT
+ * set in its flags, _gf_false otherwise.
+ */
+gf_boolean_t
+gd_is_client_option(struct volopt_map_entry *vmep)
+{
+    if (!vmep)
+        return _gf_false;
+
+    return (vmep->flags & VOLOPT_FLAG_CLIENT_OPT) ? _gf_true : _gf_false;
+}
+
+/*
+ * Return _gf_true when @vmep is non-NULL and has VOLOPT_FLAG_XLATOR_OPT
+ * set in its flags, _gf_false otherwise.
+ */
+gf_boolean_t
+gd_is_xlator_option(struct volopt_map_entry *vmep)
+{
+    if (!vmep)
+        return _gf_false;
+
+    return (vmep->flags & VOLOPT_FLAG_XLATOR_OPT) ? _gf_true : _gf_false;
+}
+
+/*
+ * Determine the declared type of the xlator option behind @vmep.
+ *
+ * The option table of vmep->voltype is loaded with
+ * xlator_volopt_dynload(), the xlator-side option key is derived with
+ * _get_xlator_opt_key_from_vme(), and the option is looked up in the
+ * loaded list.
+ *
+ * Returns the option's type, or GF_OPTION_TYPE_MAX when @vmep is NULL, any
+ * step fails, or the option is not found.
+ */
+static volume_option_type_t
+_gd_get_option_type(struct volopt_map_entry *vmep)
+{
+    volume_option_type_t found_type = GF_OPTION_TYPE_MAX;
+    volume_opt_list_t opt_list = {
+        {0},
+    };
+    volume_option_t *match = NULL;
+    void *handle = NULL;
+    char *opt_key = NULL;
+
+    if (!vmep)
+        return found_type;
+
+    CDS_INIT_LIST_HEAD(&opt_list.list);
+
+    /* The key is only derived when the dynamic load succeeded. */
+    if (xlator_volopt_dynload(vmep->voltype, &handle, &opt_list) == 0 &&
+        _get_xlator_opt_key_from_vme(vmep, &opt_key) == 0) {
+        match = xlator_volume_option_get_list(&opt_list, opt_key);
+        _free_xlator_opt_key(opt_key);
+
+        if (match)
+            found_type = match->type;
+    }
+
+    if (handle)
+        dlclose(handle);
+
+    return found_type;
+}
+
+/*
+ * Return _gf_true when _gd_get_option_type() reports GF_OPTION_TYPE_BOOL
+ * for @vmep, _gf_false otherwise.
+ */
+gf_boolean_t
+gd_is_boolean_option(struct volopt_map_entry *vmep)
+{
+    return (_gd_get_option_type(vmep) == GF_OPTION_TYPE_BOOL) ? _gf_true
+                                                              : _gf_false;
+}
+
+/*
+ * Build the self-heal daemon graph of @volinfo and write it to @filename.
+ *
+ * @mode_dict is passed through to build_shd_graph().
+ *
+ * Returns 0 on success, otherwise the error from building the graph or
+ * from writing the volfile. The graph is freed in either case.
+ */
+int
+glusterd_shdsvc_generate_volfile(glusterd_volinfo_t *volinfo, char *filename,
+                                 dict_t *mode_dict)
+{
+    volgen_graph_t shd_graph = {
+        0,
+    };
+    int rc = -1;
+
+    shd_graph.type = GF_SHD;
+
+    rc = build_shd_graph(volinfo, &shd_graph, mode_dict);
+    if (rc == 0)
+        rc = volgen_write_volfile(&shd_graph, filename);
+
+    volgen_graph_free(&shd_graph);
+
+    return rc;
+}
diff --git a/xlators/mgmt/glusterd/src/glusterd-volgen.h b/xlators/mgmt/glusterd/src/glusterd-volgen.h
new file mode 100644
index 00000000000..cd4d0c7d0cc
--- /dev/null
+++ b/xlators/mgmt/glusterd/src/glusterd-volgen.h
@@ -0,0 +1,338 @@
+/*
+   Copyright (c) 2006-2012 Red Hat, Inc. <http://www.redhat.com>
+   This file is part of GlusterFS.
+
+   This file is licensed to you under your choice of the GNU Lesser
+   General Public License, version 3 or any later version (LGPLv3 or
+   later), or the GNU General Public License, version 2 (GPLv2), in all
+   cases as published by the Free Software Foundation.
+*/
+#ifndef _GLUSTERD_VOLGEN_H_
+#define _GLUSTERD_VOLGEN_H_
+
+#if (HAVE_LIB_XML)
+#include <libxml/encoding.h>
+#include <libxml/xmlwriter.h>
+#endif
+
+#include "glusterd.h"
+#include "glusterd-messages.h"
+
+/* volopt map key name definitions */
+
+#define VKEY_DIAG_CNT_FOP_HITS "diagnostics.count-fop-hits"
+#define VKEY_DIAG_LAT_MEASUREMENT "diagnostics.latency-measurement"
+#define VKEY_FEATURES_LIMIT_USAGE "features.limit-usage"
+#define VKEY_FEATURES_SOFT_LIMIT "features.soft-limit"
+#define VKEY_MARKER_XTIME GEOREP ".indexing"
+#define VKEY_MARKER_XTIME_FORCE GEOREP ".ignore-pid-check"
+#define VKEY_CHANGELOG "changelog.changelog"
+#define VKEY_FEATURES_QUOTA "features.quota"
+#define VKEY_FEATURES_INODE_QUOTA "features.inode-quota"
+#define VKEY_FEATURES_TRASH "features.trash"
+#define VKEY_FEATURES_BITROT "features.bitrot"
+#define VKEY_FEATURES_SCRUB "features.scrub"
+#define VKEY_FEATURES_SELINUX "features.selinux"
+#define VKEY_PARALLEL_READDIR "performance.parallel-readdir"
+#define VKEY_READDIR_AHEAD "performance.readdir-ahead"
+#define VKEY_RDA_CACHE_LIMIT "performance.rda-cache-limit"
+#define VKEY_RDA_REQUEST_SIZE "performance.rda-request-size"
+#define VKEY_CONFIG_GFPROXY "config.gfproxyd"
+#define VKEY_CONFIG_GLOBAL_THREADING "config.global-threading"
+#define VKEY_CONFIG_CLIENT_THREADS "config.client-threads"
+#define VKEY_CONFIG_BRICK_THREADS "config.brick-threads"
+
+#define AUTH_ALLOW_MAP_KEY "auth.allow"
+#define AUTH_REJECT_MAP_KEY "auth.reject"
+#define NFS_DISABLE_MAP_KEY "nfs.disable"
+#define AUTH_ALLOW_OPT_KEY "auth.addr.*.allow"
+#define AUTH_REJECT_OPT_KEY "auth.addr.*.reject"
+#define NFS_DISABLE_OPT_KEY "nfs.*.disable"
+
+#define SSL_OWN_CERT_OPT "ssl.own-cert"
+#define SSL_PRIVATE_KEY_OPT "ssl.private-key"
+#define SSL_CA_LIST_OPT "ssl.ca-list"
+#define SSL_CRL_PATH_OPT "ssl.crl-path"
+#define SSL_CERT_DEPTH_OPT "ssl.certificate-depth"
+#define SSL_CIPHER_LIST_OPT "ssl.cipher-list"
+#define SSL_DH_PARAM_OPT "ssl.dh-param"
+#define SSL_EC_CURVE_OPT "ssl.ec-curve"
+
+/* Selects which client volfile generate_client_volfiles() produces. */
+typedef enum {
+    GF_CLIENT_TRUSTED,
+    GF_CLIENT_OTHER,
+    GF_CLIENT_TRUSTED_PROXY,
+} glusterd_client_type_t;
+
+/* It indicates the type of volfile that the graph is built for.
+ * Values start at 1, so the type field of a zero-initialised
+ * volgen_graph_t matches none of them. */
+typedef enum {
+    GF_REBALANCED = 1,
+    GF_QUOTAD,
+    GF_SNAPD,
+    GF_SHD,
+} glusterd_graph_type_t;
+
+/* A graph under construction plus the bookkeeping volgen needs for it. */
+struct volgen_graph {
+    /* where error text is reported; callers point it at their op_errstr */
+    char **errstr;
+    /* the underlying glusterfs graph */
+    glusterfs_graph_t graph;
+    /* kind of volfile this graph is built for; left 0 by most callers */
+    glusterd_graph_type_t type;
+};
+typedef struct volgen_graph volgen_graph_t;
+
+typedef int (*glusterd_graph_builder_t)(volgen_graph_t *graph,
+                                        dict_t *mod_dict);
+typedef int (*glusterd_vol_graph_builder_t)(glusterd_volinfo_t *,
+                                            char *filename, dict_t *mod_dict);
+
+/*
+ * COMPLETE_OPTION(key, completion, ret)
+ *
+ * Checks that a key without a '.' can be expanded by option_complete().
+ *
+ * WARNING: on failure this macro executes "return _gf_false;" in the
+ * CALLING function, so it may only be used where returning 0 is an
+ * acceptable error value. The expansion stored in @completion is freed
+ * before the macro ends; @key itself is left untouched. The definition
+ * ends in "while (0);", i.e. it supplies its own semicolon.
+ */
+#define COMPLETE_OPTION(key, completion, ret)                                  \
+    do {                                                                       \
+        if (!strchr(key, '.')) {                                               \
+            ret = option_complete(key, &completion);                           \
+            if (ret) {                                                         \
+                gf_msg("", GF_LOG_ERROR, ENOMEM, GD_MSG_NO_MEMORY,             \
+                       "Out of memory");                                       \
+                return _gf_false;                                              \
+            }                                                                  \
+                                                                               \
+            if (!completion) {                                                 \
+                gf_msg("", GF_LOG_ERROR, 0, GD_MSG_INVALID_ENTRY,              \
+                       "option %s does not "                                   \
+                       "exist",                                                \
+                       key);                                                   \
+                return _gf_false;                                              \
+            }                                                                  \
+        }                                                                      \
+                                                                               \
+        if (completion)                                                        \
+            GF_FREE(completion);                                               \
+    } while (0);
+
+/* Bit flags stored in volopt_map_entry.flags; combine with bitwise OR and
+ * test with bitwise AND. */
+typedef enum gd_volopt_flags_ {
+    VOLOPT_FLAG_NONE,
+    VOLOPT_FLAG_FORCE = 0x01,       /* option needs force to be reset */
+    VOLOPT_FLAG_XLATOR_OPT = 0x02,  /* option enables/disables xlators */
+    VOLOPT_FLAG_CLIENT_OPT = 0x04,  /* option affects clients */
+    VOLOPT_FLAG_NEVER_RESET = 0x08, /* option which should not be reset */
+} gd_volopt_flags_t;
+
+/* Identifiers for xlators of the server graph; GF_XLATOR_NONE is the last
+ * enumerator.
+ * NOTE(review): presumably these name the positions where debug xlators
+ * can be inserted in the brick graph -- confirm against the users. */
+typedef enum {
+    GF_XLATOR_POSIX = 0,
+    GF_XLATOR_ACL,
+    GF_XLATOR_LOCKS,
+    GF_XLATOR_LEASES,
+    GF_XLATOR_UPCALL,
+    GF_XLATOR_IOT,
+    GF_XLATOR_INDEX,
+    GF_XLATOR_MARKER,
+    GF_XLATOR_IO_STATS,
+    GF_XLATOR_BD,
+    GF_XLATOR_SERVER,
+    GF_XLATOR_NONE,
+} glusterd_server_xlator_t;
+
+/* As of now, debug xlators can be loaded only below fuse in the client
+ * graph via the cli. More enumerators can be added here once the cli
+ * allows debug xlators to be added anywhere in the client graph.
+ */
+typedef enum {
+    GF_CLNT_XLATOR_FUSE = 0,
+    GF_CLNT_XLATOR_NONE,
+} glusterd_client_xlator_t;
+
+/* Value of volopt_map_entry.type.
+ * NOTE(review): presumably says whether an option is documented and
+ * whether it is global -- confirm against the users of this enum. */
+typedef enum { DOC, NO_DOC, GLOBAL_DOC, GLOBAL_NO_DOC } option_type_t;
+
+typedef int (*vme_option_validation)(glusterd_volinfo_t *volinfo, dict_t *dict,
+                                     char *key, char *value, char **op_errstr);
+
+/* One row of the volume-option table (glusterd_volopt_map). */
+struct volopt_map_entry {
+    /* option name as matched by gd_get_vmep(); a NULL key ends the table */
+    char *key;
+    /* xlator type whose option table is loaded to find the option's type */
+    char *voltype;
+    char *option;
+    char *value;
+    option_type_t type;
+    /* bitwise OR of gd_volopt_flags_t values */
+    uint32_t flags;
+    /* returned by glusterd_get_op_version_from_vmep() */
+    uint32_t op_version;
+    char *description;
+    vme_option_validation validate_fn;
+    /* The field below is disabled. Whether an option affects clients is
+     * now tested through VOLOPT_FLAG_CLIENT_OPT in flags (see
+     * gd_is_client_option()).
+     */
+    // gf_boolean_t client_option;
+};
+
+typedef int (*brick_xlator_builder)(volgen_graph_t *graph,
+                                    glusterd_volinfo_t *volinfo,
+                                    dict_t *set_dict,
+                                    glusterd_brickinfo_t *brickinfo);
+
+/* Pairs a brick-graph xlator builder with its debug key. */
+struct volgen_brick_xlator {
+    /* function that builds a xlator */
+    brick_xlator_builder builder;
+    /* debug key for a xlator; used to add debug translators such as
+     * trace, error-gen and delay-gen before this xlator */
+    char *dbg_key;
+};
+
+/* NOTE(review): presumably maps an NFS option name pattern to the printf
+ * format used to build the per-volume option key -- confirm against the
+ * users of this struct. */
+struct nfs_opt {
+    const char *pattern;
+    const char *printf_pattern;
+};
+
+typedef struct volgen_brick_xlator volgen_brick_xlator_t;
+
+int
+glusterd_snapdsvc_create_volfile(glusterd_volinfo_t *volinfo);
+
+int
+glusterd_snapdsvc_generate_volfile(volgen_graph_t *graph,
+                                   glusterd_volinfo_t *volinfo);
+
+int
+glusterd_create_global_volfile(glusterd_graph_builder_t builder, char *filepath,
+                               dict_t *mod_dict);
+
+int
+glusterd_create_rb_volfiles(glusterd_volinfo_t *volinfo,
+                            glusterd_brickinfo_t *brickinfo);
+
+int
+glusterd_create_volfiles(glusterd_volinfo_t *volinfo);
+
+int
+glusterd_create_volfiles_and_notify_services(glusterd_volinfo_t *volinfo);
+
+int
+glusterd_generate_client_per_brick_volfile(glusterd_volinfo_t *volinfo);
+
+void
+glusterd_get_nfs_filepath(char *filename);
+
+void
+glusterd_get_shd_filepath(char *filename);
+
+int
+build_shd_graph(glusterd_volinfo_t *volinfo, volgen_graph_t *graph,
+                dict_t *mod_dict);
+
+#ifdef BUILD_GNFS
+int
+build_nfs_graph(volgen_graph_t *graph, dict_t *mod_dict);
+#endif
+int
+build_quotad_graph(volgen_graph_t *graph, dict_t *mod_dict);
+
+int
+build_rebalance_volfile(glusterd_volinfo_t *volinfo, char *filepath,
+                        dict_t *mod_dict);
+
+int
+build_bitd_graph(volgen_graph_t *graph, dict_t *mod_dict);
+
+int
+build_scrub_graph(volgen_graph_t *graph, dict_t *mod_dict);
+
+int
+glusterd_delete_volfile(glusterd_volinfo_t *volinfo,
+                        glusterd_brickinfo_t *brickinfo);
+int
+glusterd_delete_snap_volfile(glusterd_volinfo_t *volinfo,
+                             glusterd_volinfo_t *snap_volinfo,
+                             glusterd_brickinfo_t *brickinfo);
+
+int
+glusterd_volinfo_get(glusterd_volinfo_t *volinfo, char *key, char **value);
+
+int
+glusterd_volinfo_get_boolean(glusterd_volinfo_t *volinfo, char *key);
+
+int
+glusterd_validate_globalopts(glusterd_volinfo_t *volinfo, dict_t *val_dict,
+                             char **op_errstr);
+
+int
+glusterd_validate_localopts(dict_t *val_dict, char **op_errstr);
+
+gf_boolean_t
+glusterd_check_globaloption(char *key);
+
+gf_boolean_t
+glusterd_check_voloption_flags(char *key, int32_t flags);
+
+gf_boolean_t
+glusterd_is_valid_volfpath(char *volname, char *brick);
+
+int
+generate_brick_volfiles(glusterd_volinfo_t *volinfo);
+
+int
+generate_snap_brick_volfiles(glusterd_volinfo_t *volinfo,
+                             glusterd_volinfo_t *snap_volinfo);
+int
+generate_client_volfiles(glusterd_volinfo_t *volinfo,
+                         glusterd_client_type_t client_type);
+int
+generate_snap_client_volfiles(glusterd_volinfo_t *actual_volinfo,
+                              glusterd_volinfo_t *snap_volinfo,
+                              glusterd_client_type_t client_type,
+                              gf_boolean_t vol_restore);
+
+int
+_get_xlator_opt_key_from_vme(struct volopt_map_entry *vme, char **key);
+
+void
+_free_xlator_opt_key(char *key);
+
+#if (HAVE_LIB_XML)
+int
+init_sethelp_xml_doc(xmlTextWriterPtr *writer, xmlBufferPtr *buf);
+
+int
+xml_add_volset_element(xmlTextWriterPtr writer, const char *name,
+                       const char *def_val, const char *dscrpt);
+int
+end_sethelp_xml_doc(xmlTextWriterPtr writer);
+#endif /* HAVE_LIB_XML */
+
+char *
+glusterd_get_trans_type_rb(gf_transport_type ttype);
+
+struct volopt_map_entry *
+gd_get_vmep(const char *key);
+
+uint32_t
+glusterd_get_op_version_from_vmep(struct volopt_map_entry *vmep);
+
+gf_boolean_t
+gd_is_client_option(struct volopt_map_entry *vmep);
+
+gf_boolean_t
+gd_is_xlator_option(struct volopt_map_entry *vmep);
+
+gf_boolean_t
+gd_is_boolean_option(struct volopt_map_entry *vmep);
+
+char *
+volgen_get_shd_key(int type);
+
+int
+glusterd_volopt_validate(glusterd_volinfo_t *volinfo, dict_t *dict, char *key,
+                         char *value, char **op_errstr);
+gf_boolean_t
+gd_is_self_heal_enabled(glusterd_volinfo_t *volinfo, dict_t *dict);
+
+int
+generate_dummy_client_volfiles(glusterd_volinfo_t *volinfo);
+
+int
+glusterd_generate_gfproxyd_volfile(glusterd_volinfo_t *volinfo);
+
+int
+glusterd_build_gfproxyd_volfile(glusterd_volinfo_t *volinfo, char *filename);
+
+int
+glusterd_shdsvc_generate_volfile(glusterd_volinfo_t *volinfo, char *filename,
+                                 dict_t *mode_dict);
+
+#endif
diff --git a/xlators/mgmt/glusterd/src/glusterd-volume-ops.c b/xlators/mgmt/glusterd/src/glusterd-volume-ops.c
new file mode 100644
index 00000000000..814ab14fb27
--- /dev/null
+++ b/xlators/mgmt/glusterd/src/glusterd-volume-ops.c
@@ -0,0 +1,3033 @@
+/*
+   Copyright (c) 2011-2012 Red Hat, Inc. <http://www.redhat.com>
+   This file is part of GlusterFS.
+
+   This file is licensed to you under your choice of the GNU Lesser
+   General Public License, version 3 or any later version (LGPLv3 or
+   later), or the GNU General Public License, version 2 (GPLv2), in all
+   cases as published by the Free Software Foundation.
+*/
+#include <glusterfs/common-utils.h>
+#include <glusterfs/syscall.h>
+#include "cli1-xdr.h"
+#include "xdr-generic.h"
+#include "glusterd.h"
+#include "glusterd-op-sm.h"
+#include "glusterd-geo-rep.h"
+#include "glusterd-store.h"
+#include "glusterd-utils.h"
+#include "glusterd-volgen.h"
+#include "glusterd-messages.h"
+#include <glusterfs/run.h>
+#include "glusterd-snapshot-utils.h"
+#include "glusterd-svc-mgmt.h"
+#include "glusterd-svc-helper.h"
+#include "glusterd-shd-svc.h"
+#include "glusterd-snapd-svc.h"
+#include "glusterd-mgmt.h"
+#include "glusterd-server-quorum.h"
+
+#include <stdint.h>
+#include <sys/socket.h>
+#include <netdb.h>
+#include <sys/types.h>
+#include <netinet/in.h>
+#include <stdlib.h>
+
+/* Alias: forwards its three arguments unchanged to
+ * glusterd_op_stop_volume_args_get(). */
+#define glusterd_op_start_volume_args_get(dict, volname, flags)                \
+    glusterd_op_stop_volume_args_get(dict, volname, flags)
+
+/*
+ * Worker for the CLI "volume create" request.
+ *
+ * Decodes the request, unserializes its dictionary, verifies that the keys
+ * volname, count, type, transport, bricks and force are present, rejects a
+ * volume name that glusterd_volinfo_find() already knows, fills in
+ * transport.address-family, volume-id and the internal username/password,
+ * and finally starts the GD_OP_CREATE_VOLUME synctask.
+ *
+ * @req: the RPC request received from the CLI.
+ *
+ * Always returns 0: whenever a step fails (including a failure to start the
+ * synctask) the error response is sent to the CLI from the 'out' path and
+ * ret is reset so that no second response is generated.
+ */
+int
+__glusterd_handle_create_volume(rpcsvc_request_t *req)
+{
+    int32_t ret = -1;
+    gf_cli_req cli_req = {{
+        0,
+    }};
+    dict_t *dict = NULL;
+    char *bricks = NULL;
+    char *volname = NULL;
+    int brick_count = 0;
+    int thin_arbiter_count = 0;
+    void *cli_rsp = NULL;
+    char err_str[2048] = {
+        0,
+    };
+    gf_cli_rsp rsp = {
+        0,
+    };
+    xlator_t *this = NULL;
+    glusterd_conf_t *conf = NULL;
+    char *free_ptr = NULL;
+    char *trans_type = NULL;
+    char *address_family_str = NULL;
+    uuid_t volume_id = {
+        0,
+    };
+    uuid_t tmp_uuid = {0};
+    int32_t type = 0;
+    char *username = NULL;
+    char *password = NULL;
+    /* Default address family used for tcp volumes when glusterd itself has
+     * no transport.address-family option configured. */
+#ifdef IPV6_DEFAULT
+    char *addr_family = "inet6";
+#else
+    char *addr_family = "inet";
+#endif
+    glusterd_volinfo_t *volinfo = NULL;
+
+    GF_ASSERT(req);
+
+    this = THIS;
+    GF_ASSERT(this);
+
+    conf = this->private;
+    GF_VALIDATE_OR_GOTO(this->name, conf, out);
+
+    ret = xdr_to_generic(req->msg[0], &cli_req, (xdrproc_t)xdr_gf_cli_req);
+    if (ret < 0) {
+        req->rpc_err = GARBAGE_ARGS;
+        snprintf(err_str, sizeof(err_str),
+                 "Failed to decode request "
+                 "received from cli");
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_REQ_DECODE_FAIL, "%s",
+               err_str);
+        goto out;
+    }
+
+    gf_msg_debug(this->name, 0, "Received create volume req");
+
+    if (cli_req.dict.dict_len) {
+        /* Unserialize the dictionary */
+        dict = dict_new();
+
+        ret = dict_unserialize(cli_req.dict.dict_val, cli_req.dict.dict_len,
+                               &dict);
+        if (ret < 0) {
+            gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_DICT_UNSERIALIZE_FAIL,
+                   "failed to "
+                   "unserialize req-buffer to dictionary");
+            snprintf(err_str, sizeof(err_str),
+                     "Unable to decode "
+                     "the command");
+            /* The buffer is only handed to the dictionary (extra_stdfree)
+             * on success, so on this path it must be released here or it
+             * is leaked. */
+            free(cli_req.dict.dict_val);
+            cli_req.dict.dict_val = NULL;
+            goto out;
+        } else {
+            /* From here on the dictionary owns the request buffer. */
+            dict->extra_stdfree = cli_req.dict.dict_val;
+        }
+    }
+
+    ret = dict_get_strn(dict, "volname", SLEN("volname"), &volname);
+
+    if (ret) {
+        snprintf(err_str, sizeof(err_str),
+                 "Unable to get volume "
+                 "name");
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_DICT_GET_FAILED, "%s",
+               err_str);
+        goto out;
+    }
+
+    /* A successful lookup (ret == 0) means the name is already taken. */
+    ret = glusterd_volinfo_find(volname, &volinfo);
+    if (!ret) {
+        ret = -1;
+        snprintf(err_str, sizeof(err_str), "Volume %s already exists", volname);
+        gf_msg(this->name, GF_LOG_ERROR, EEXIST, GD_MSG_VOL_ALREADY_EXIST, "%s",
+               err_str);
+        goto out;
+    }
+
+    ret = dict_get_int32n(dict, "count", SLEN("count"), &brick_count);
+    if (ret) {
+        snprintf(err_str, sizeof(err_str),
+                 "Unable to get brick count"
+                 " for volume %s",
+                 volname);
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_DICT_GET_FAILED, "%s",
+               err_str);
+        goto out;
+    }
+
+    ret = dict_get_int32n(dict, "type", SLEN("type"), &type);
+    if (ret) {
+        snprintf(err_str, sizeof(err_str),
+                 "Unable to get type of "
+                 "volume %s",
+                 volname);
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_DICT_GET_FAILED, "%s",
+               err_str);
+        goto out;
+    }
+
+    ret = dict_get_strn(dict, "transport", SLEN("transport"), &trans_type);
+    if (ret) {
+        snprintf(err_str, sizeof(err_str),
+                 "Unable to get "
+                 "transport-type of volume %s",
+                 volname);
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_DICT_GET_FAILED, "%s",
+               err_str);
+        goto out;
+    }
+
+    /* Prefer the address family configured on glusterd itself; otherwise
+     * fall back to the compile-time default for tcp volumes. */
+    ret = dict_get_strn(this->options, "transport.address-family",
+                        SLEN("transport.address-family"), &address_family_str);
+
+    if (!ret) {
+        ret = dict_set_dynstr_with_alloc(dict, "transport.address-family",
+                                         address_family_str);
+        if (ret) {
+            gf_log(this->name, GF_LOG_ERROR,
+                   "failed to set transport.address-family");
+            goto out;
+        }
+    } else if (!strcmp(trans_type, "tcp")) {
+        /* Setting default as inet for trans_type tcp if the op-version
+         * is >= 3.8.0
+         */
+        if (conf->op_version >= GD_OP_VERSION_3_8_0) {
+            ret = dict_set_dynstr_with_alloc(dict, "transport.address-family",
+                                             addr_family);
+            if (ret) {
+                gf_log(this->name, GF_LOG_ERROR,
+                       "failed to set "
+                       "transport.address-family "
+                       "to %s",
+                       addr_family);
+                goto out;
+            }
+        }
+    }
+    ret = dict_get_strn(dict, "bricks", SLEN("bricks"), &bricks);
+    if (ret) {
+        snprintf(err_str, sizeof(err_str),
+                 "Unable to get bricks for "
+                 "volume %s",
+                 volname);
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_DICT_GET_FAILED, "%s",
+               err_str);
+        goto out;
+    }
+
+    /* The result of this lookup is intentionally not checked: the key is
+     * treated as optional and thin_arbiter_count starts out as 0. */
+    ret = dict_get_int32n(dict, "thin-arbiter-count",
+                          SLEN("thin-arbiter-count"), &thin_arbiter_count);
+    if (thin_arbiter_count && conf->op_version < GD_OP_VERSION_7_0) {
+        snprintf(err_str, sizeof(err_str),
+                 "Cannot execute command. "
+                 "The cluster is operating at version %d. "
+                 "Thin-arbiter volume creation is unavailable in "
+                 "this version",
+                 conf->op_version);
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_GLUSTERD_OP_FAILED, "%s",
+               err_str);
+        ret = -1;
+        goto out;
+    }
+
+    if (!dict_getn(dict, "force", SLEN("force"))) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_DICT_GET_FAILED,
+               "Failed to get 'force' flag");
+        /* ret still holds the thin-arbiter-count lookup result, which is 0
+         * when that key is present. Force a failure so the error response
+         * in the 'out' path is actually sent instead of silently dropping
+         * the request. */
+        ret = -1;
+        goto out;
+    }
+
+    gf_uuid_generate(volume_id);
+    free_ptr = gf_strdup(uuid_utoa(volume_id));
+    ret = dict_set_dynstrn(dict, "volume-id", SLEN("volume-id"), free_ptr);
+    if (ret) {
+        snprintf(err_str, sizeof(err_str),
+                 "Unable to set volume "
+                 "id of volume %s",
+                 volname);
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_DICT_SET_FAILED, "%s",
+               err_str);
+        goto out;
+    }
+    /* Stored in the dictionary now; must not be freed in the 'out' path. */
+    free_ptr = NULL;
+
+    /* generate internal username and password */
+
+    gf_uuid_generate(tmp_uuid);
+    username = gf_strdup(uuid_utoa(tmp_uuid));
+    ret = dict_set_dynstrn(dict, "internal-username", SLEN("internal-username"),
+                           username);
+    if (ret) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_DICT_SET_FAILED,
+               "Failed to set username for "
+               "volume %s",
+               volname);
+        goto out;
+    }
+
+    gf_uuid_generate(tmp_uuid);
+    password = gf_strdup(uuid_utoa(tmp_uuid));
+    ret = dict_set_dynstrn(dict, "internal-password", SLEN("internal-password"),
+                           password);
+    if (ret) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_DICT_SET_FAILED,
+               "Failed to set password for "
+               "volume %s",
+               volname);
+        goto out;
+    }
+
+    ret = glusterd_op_begin_synctask(req, GD_OP_CREATE_VOLUME, dict);
+
+out:
+    if (ret) {
+        rsp.op_ret = -1;
+        rsp.op_errno = 0;
+        /* Fall back to a generic message when no step recorded one. */
+        if (err_str[0] == '\0')
+            snprintf(err_str, sizeof(err_str), "Operation failed");
+        rsp.op_errstr = err_str;
+        cli_rsp = &rsp;
+        glusterd_to_cli(req, cli_rsp, NULL, 0, NULL, (xdrproc_t)xdr_gf_cli_rsp,
+                        dict);
+        ret = 0;  // Client response sent, prevent second response
+    }
+
+    GF_FREE(free_ptr);
+
+    return ret;
+}
+
+/*
+ * RPC entry point for the CLI create-volume request: passes the request and
+ * the worker __glusterd_handle_create_volume() to
+ * glusterd_big_locked_handler() and returns that call's result.
+ */
+int
+glusterd_handle_create_volume(rpcsvc_request_t *req)
+{
+    int rc = glusterd_big_locked_handler(req, __glusterd_handle_create_volume);
+
+    return rc;
+}
+
+/*
+ * Worker for the CLI "volume start" request.
+ *
+ * Decodes the request, unserializes its dictionary, reads the volume name
+ * and then starts GD_OP_START_VOLUME: through the synctask framework when
+ * the cluster op-version is <= GD_OP_VERSION_3_7_6, otherwise through
+ * glusterd_mgmt_v3_initiate_all_phases().
+ *
+ * @req: the RPC request received from the CLI.
+ *
+ * Returns 0 when the operation was started; on failure, returns the result
+ * of glusterd_op_send_cli_response(), which delivers the error to the CLI.
+ */
+int
+__glusterd_handle_cli_start_volume(rpcsvc_request_t *req)
+{
+    gf_cli_req cli_req = {{
+        0,
+    }};
+    char err_msg[2048] = {
+        0,
+    };
+    glusterd_op_t cli_op = GD_OP_START_VOLUME;
+    xlator_t *this = THIS;
+    glusterd_conf_t *conf = NULL;
+    dict_t *dict = NULL;
+    char *volname = NULL;
+    int32_t ret = -1;
+
+    GF_ASSERT(this);
+    GF_ASSERT(req);
+
+    conf = this->private;
+    GF_ASSERT(conf);
+
+    ret = xdr_to_generic(req->msg[0], &cli_req, (xdrproc_t)xdr_gf_cli_req);
+    if (ret < 0) {
+        snprintf(err_msg, sizeof(err_msg),
+                 "Failed to decode message received from cli");
+        req->rpc_err = GARBAGE_ARGS;
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_REQ_DECODE_FAIL, "%s",
+               err_msg);
+        goto out;
+    }
+
+    if (cli_req.dict.dict_len != 0) {
+        /* Rebuild the request dictionary from the wire buffer. */
+        dict = dict_new();
+
+        ret = dict_unserialize(cli_req.dict.dict_val, cli_req.dict.dict_len,
+                               &dict);
+        if (ret < 0) {
+            gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_DICT_UNSERIALIZE_FAIL,
+                   "failed to unserialize req-buffer to dictionary");
+            snprintf(err_msg, sizeof(err_msg), "Unable to decode the command");
+            goto out;
+        }
+    }
+
+    ret = dict_get_strn(dict, "volname", SLEN("volname"), &volname);
+    if (ret != 0) {
+        snprintf(err_msg, sizeof(err_msg), "Unable to get volume name");
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_DICT_GET_FAILED, "%s",
+               err_msg);
+        goto out;
+    }
+
+    gf_msg_debug(this->name, 0, "Received start vol req for volume %s",
+                 volname);
+
+    if (conf->op_version > GD_OP_VERSION_3_7_6) {
+        ret = glusterd_mgmt_v3_initiate_all_phases(req, cli_op, dict);
+    } else {
+        /* Older clusters still use the syncop framework. */
+        gf_msg_debug(this->name, 0,
+                     "The cluster is operating at version less than or equal "
+                     "to %d. Volume start falling back to syncop framework.",
+                     GD_OP_VERSION_3_7_6);
+        ret = glusterd_op_begin_synctask(req, cli_op, dict);
+    }
+
+out:
+    /* The request buffer was malloc'ed by the XDR decoder. */
+    free(cli_req.dict.dict_val);
+
+    if (ret != 0) {
+        if (err_msg[0] == '\0')
+            snprintf(err_msg, sizeof(err_msg), "Operation failed");
+        ret = glusterd_op_send_cli_response(cli_op, ret, 0, req, dict,
+                                            err_msg);
+    }
+
+    return ret;
+}
+
+/*
+ * RPC entry point for the CLI start-volume request: passes the request and
+ * the worker __glusterd_handle_cli_start_volume() to
+ * glusterd_big_locked_handler() and returns that call's result.
+ */
+int
+glusterd_handle_cli_start_volume(rpcsvc_request_t *req)
+{
+    int rc = glusterd_big_locked_handler(req,
+                                         __glusterd_handle_cli_start_volume);
+
+    return rc;
+}
+
+/*
+ * Worker for the CLI "volume stop" request.
+ *
+ * Decodes the request, unserializes its dictionary, reads the volume name
+ * and then starts GD_OP_STOP_VOLUME: through the synctask framework when the
+ * cluster op-version is below GD_OP_VERSION_4_1_0, otherwise through
+ * glusterd_mgmt_v3_initiate_all_phases().
+ *
+ * @req: the RPC request received from the CLI.
+ *
+ * Returns 0 when the operation was started; on failure, returns the result
+ * of glusterd_op_send_cli_response(), which delivers the error to the CLI.
+ */
+int
+__glusterd_handle_cli_stop_volume(rpcsvc_request_t *req)
+{
+    int32_t ret = -1;
+    gf_cli_req cli_req = {{
+        0,
+    }};
+    char *dup_volname = NULL;
+    dict_t *dict = NULL;
+    glusterd_op_t cli_op = GD_OP_STOP_VOLUME;
+    xlator_t *this = NULL;
+    char err_str[64] = {
+        0,
+    };
+    glusterd_conf_t *conf = NULL;
+
+    this = THIS;
+    GF_ASSERT(this);
+    GF_ASSERT(req);
+    conf = this->private;
+    GF_ASSERT(conf);
+
+    ret = xdr_to_generic(req->msg[0], &cli_req, (xdrproc_t)xdr_gf_cli_req);
+    if (ret < 0) {
+        snprintf(err_str, sizeof(err_str),
+                 "Failed to decode message "
+                 "received from cli");
+        req->rpc_err = GARBAGE_ARGS;
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_REQ_DECODE_FAIL, "%s",
+               err_str);
+        goto out;
+    }
+    if (cli_req.dict.dict_len) {
+        /* Unserialize the dictionary */
+        dict = dict_new();
+
+        ret = dict_unserialize(cli_req.dict.dict_val, cli_req.dict.dict_len,
+                               &dict);
+        if (ret < 0) {
+            gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_DICT_UNSERIALIZE_FAIL,
+                   "failed to "
+                   "unserialize req-buffer to dictionary");
+            snprintf(err_str, sizeof(err_str),
+                     "Unable to decode "
+                     "the command");
+            goto out;
+        }
+    }
+
+    ret = dict_get_strn(dict, "volname", SLEN("volname"), &dup_volname);
+
+    if (ret) {
+        snprintf(err_str, sizeof(err_str),
+                 "Failed to get volume "
+                 "name");
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_DICT_GET_FAILED, "%s",
+               err_str);
+        goto out;
+    }
+
+    gf_msg_debug(this->name, 0,
+                 "Received stop vol req "
+                 "for volume %s",
+                 dup_volname);
+
+    if (conf->op_version < GD_OP_VERSION_4_1_0) {
+        /* This is the stop handler; the message previously said
+         * "Volume start", copied from the start handler. */
+        gf_msg_debug(this->name, 0,
+                     "The cluster is operating at "
+                     "version less than %d. Volume stop "
+                     "falling back to syncop framework.",
+                     GD_OP_VERSION_4_1_0);
+        ret = glusterd_op_begin_synctask(req, GD_OP_STOP_VOLUME, dict);
+    } else {
+        ret = glusterd_mgmt_v3_initiate_all_phases(req, GD_OP_STOP_VOLUME,
+                                                   dict);
+    }
+
+out:
+    free(cli_req.dict.dict_val);  // its malloced by xdr
+
+    if (ret) {
+        /* Fall back to a generic message when no step recorded one. */
+        if (err_str[0] == '\0')
+            snprintf(err_str, sizeof(err_str), "Operation failed");
+        ret = glusterd_op_send_cli_response(cli_op, ret, 0, req, dict, err_str);
+    }
+
+    return ret;
+}
+
+/*
+ * RPC entry point for the CLI stop-volume request: passes the request and
+ * the worker __glusterd_handle_cli_stop_volume() to
+ * glusterd_big_locked_handler() and returns that call's result.
+ */
+int
+glusterd_handle_cli_stop_volume(rpcsvc_request_t *req)
+{
+    int rc = glusterd_big_locked_handler(req,
+                                         __glusterd_handle_cli_stop_volume);
+
+    return rc;
+}
+
+/*
+ * Worker for the CLI "volume delete" request.
+ *
+ * Decodes the request, unserializes its dictionary, reads the volume name
+ * and starts the GD_OP_DELETE_VOLUME synctask.
+ *
+ * @req: the RPC request received from the CLI.
+ *
+ * Returns 0 when the synctask was started; on failure, returns the result of
+ * glusterd_op_send_cli_response(), which delivers the error to the CLI.
+ */
+int
+__glusterd_handle_cli_delete_volume(rpcsvc_request_t *req)
+{
+    int32_t ret = -1;
+    gf_cli_req cli_req = {
+        {
+            0,
+        },
+    };
+    glusterd_op_t cli_op = GD_OP_DELETE_VOLUME;
+    dict_t *dict = NULL;
+    char *volname = NULL;
+    char err_str[64] = {
+        0,
+    };
+    xlator_t *this = NULL;
+
+    this = THIS;
+    GF_ASSERT(this);
+
+    GF_ASSERT(req);
+
+    ret = xdr_to_generic(req->msg[0], &cli_req, (xdrproc_t)xdr_gf_cli_req);
+    if (ret < 0) {
+        snprintf(err_str, sizeof(err_str),
+                 "Failed to decode request "
+                 "received from cli");
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_REQ_DECODE_FAIL, "%s",
+               err_str);
+        req->rpc_err = GARBAGE_ARGS;
+        goto out;
+    }
+
+    if (cli_req.dict.dict_len) {
+        /* Unserialize the dictionary */
+        dict = dict_new();
+
+        ret = dict_unserialize(cli_req.dict.dict_val, cli_req.dict.dict_len,
+                               &dict);
+        if (ret < 0) {
+            gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_DICT_UNSERIALIZE_FAIL,
+                   "failed to "
+                   "unserialize req-buffer to dictionary");
+            snprintf(err_str, sizeof(err_str),
+                     "Unable to decode "
+                     "the command");
+            goto out;
+        }
+    }
+
+    ret = dict_get_strn(dict, "volname", SLEN("volname"), &volname);
+    if (ret) {
+        snprintf(err_str, sizeof(err_str),
+                 "Failed to get volume "
+                 "name");
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_DICT_GET_FAILED, "%s",
+               err_str);
+        req->rpc_err = GARBAGE_ARGS;
+        goto out;
+    }
+
+    /* The two literals are concatenated; the trailing space keeps the
+     * message from reading "reqfor volume". */
+    gf_msg_debug(this->name, 0,
+                 "Received delete vol req "
+                 "for volume %s",
+                 volname);
+
+    ret = glusterd_op_begin_synctask(req, GD_OP_DELETE_VOLUME, dict);
+
+out:
+    free(cli_req.dict.dict_val);  // its malloced by xdr
+
+    if (ret) {
+        /* Fall back to a generic message when no step recorded one. */
+        if (err_str[0] == '\0')
+            snprintf(err_str, sizeof(err_str), "Operation failed");
+        ret = glusterd_op_send_cli_response(cli_op, ret, 0, req, dict, err_str);
+    }
+
+    return ret;
+}
+/*
+ * RPC entry point for the CLI delete-volume request: passes the request and
+ * the worker __glusterd_handle_cli_delete_volume() to
+ * glusterd_big_locked_handler() and returns that call's result.
+ */
+int
+glusterd_handle_cli_delete_volume(rpcsvc_request_t *req)
+{
+    int rc = glusterd_big_locked_handler(req,
+                                         __glusterd_handle_cli_delete_volume);
+
+    return rc;
+}
+/*
+ * Handle the heal sub-commands that only toggle a volume option.
+ *
+ * Reads "heal-op" from @dict. For the enable/disable and granular-entry-heal
+ * enable/disable ops it stores the option as key1/value1 with count 1 in
+ * @dict and starts a GD_OP_SET_VOLUME synctask.
+ *
+ * @req:     the CLI request the synctask is started for.
+ * @dict:    request dictionary; updated in place.
+ * @volinfo: volume the heal command targets.
+ *
+ * Returns -EINVAL when heal-op is a valid op but not one of the four handled
+ * here, -1 when heal-op is missing/invalid, when granular entry heal is
+ * requested on a disperse volume, or when no shd key exists for the volume
+ * type; otherwise the failing dict setter's result or the result of
+ * glusterd_op_begin_synctask().
+ */
+static int
+glusterd_handle_heal_options_enable_disable(rpcsvc_request_t *req, dict_t *dict,
+                                            glusterd_volinfo_t *volinfo)
+{
+    xlator_t *this = THIS;
+    gf_xl_afr_op_t heal_op = GF_SHD_OP_INVALID;
+    char *opt_key = NULL;
+    char *opt_value = NULL;
+    int granular = 0;
+    int ret = 0;
+
+    GF_ASSERT(this);
+
+    ret = dict_get_int32n(dict, "heal-op", SLEN("heal-op"),
+                          (int32_t *)&heal_op);
+    if (ret || (heal_op == GF_SHD_OP_INVALID)) {
+        gf_smsg(this->name, GF_LOG_ERROR, errno, GD_MSG_DICT_GET_FAILED,
+                "Key=heal-op", NULL);
+        ret = -1;
+        goto out;
+    }
+
+    /* Classify the op: which value to set and whether it concerns the
+     * granular-entry-heal option. Anything else is not handled here. */
+    switch (heal_op) {
+        case GF_SHD_OP_HEAL_ENABLE:
+            opt_value = "enable";
+            break;
+        case GF_SHD_OP_HEAL_DISABLE:
+            opt_value = "disable";
+            break;
+        case GF_SHD_OP_GRANULAR_ENTRY_HEAL_ENABLE:
+            opt_value = "enable";
+            granular = 1;
+            break;
+        case GF_SHD_OP_GRANULAR_ENTRY_HEAL_DISABLE:
+            opt_value = "disable";
+            granular = 1;
+            break;
+        default:
+            ret = -EINVAL;
+            goto out;
+    }
+
+    /* Granular entry heal is rejected for disperse volumes. */
+    if (granular && (volinfo->type == GF_CLUSTER_TYPE_DISPERSE)) {
+        ret = -1;
+        goto out;
+    }
+
+    if (granular) {
+        opt_key = "cluster.granular-entry-heal";
+        ret = dict_set_int8(dict, "is-special-key", 1);
+        if (ret) {
+            gf_smsg(this->name, GF_LOG_ERROR, errno, GD_MSG_DICT_SET_FAILED,
+                    "Key=is-special-key", NULL);
+            goto out;
+        }
+    } else {
+        opt_key = volgen_get_shd_key(volinfo->type);
+        if (opt_key == NULL) {
+            ret = -1;
+            goto out;
+        }
+    }
+
+    ret = dict_set_strn(dict, "key1", SLEN("key1"), opt_key);
+    if (ret) {
+        gf_smsg(this->name, GF_LOG_ERROR, errno, GD_MSG_DICT_SET_FAILED,
+                "Key=key1", NULL);
+        goto out;
+    }
+
+    ret = dict_set_strn(dict, "value1", SLEN("value1"), opt_value);
+    if (ret) {
+        gf_smsg(this->name, GF_LOG_ERROR, errno, GD_MSG_DICT_SET_FAILED,
+                "Key=value1", NULL);
+        goto out;
+    }
+
+    ret = dict_set_int32n(dict, "count", SLEN("count"), 1);
+    if (ret) {
+        gf_smsg(this->name, GF_LOG_ERROR, errno, GD_MSG_DICT_SET_FAILED,
+                "Key=count", NULL);
+        goto out;
+    }
+
+    ret = glusterd_op_begin_synctask(req, GD_OP_SET_VOLUME, dict);
+
+out:
+    return ret;
+}
+
+/*
+ * Worker for the CLI "volume heal" request.
+ *
+ * Decodes the request, unserializes its dictionary, looks up the volume and
+ * first offers the request to glusterd_handle_heal_options_enable_disable().
+ * Only when that helper reports -EINVAL (not an enable/disable op) does this
+ * function add the brick host/path entries and the brick count to the
+ * dictionary and start the GD_OP_HEAL_VOLUME synctask.
+ *
+ * @req: the RPC request received from the CLI.
+ *
+ * Returns 0 when a synctask was started; on failure, returns the result of
+ * glusterd_op_send_cli_response(), which delivers the error to the CLI.
+ */
+int
+__glusterd_handle_cli_heal_volume(rpcsvc_request_t *req)
+{
+    int32_t ret = -1;
+    gf_cli_req cli_req = {{
+        0,
+    }};
+    dict_t *dict = NULL;
+    glusterd_op_t cli_op = GD_OP_HEAL_VOLUME;
+    char *volname = NULL;
+    glusterd_volinfo_t *volinfo = NULL;
+    xlator_t *this = NULL;
+    char op_errstr[2048] = {
+        0,
+    };
+
+    this = THIS;
+    GF_ASSERT(this);
+
+    GF_ASSERT(req);
+
+    ret = xdr_to_generic(req->msg[0], &cli_req, (xdrproc_t)xdr_gf_cli_req);
+    if (ret < 0) {
+        // failed to decode msg;
+        req->rpc_err = GARBAGE_ARGS;
+        gf_smsg(this->name, GF_LOG_ERROR, errno, GD_MSG_GARBAGE_ARGS, NULL);
+        goto out;
+    }
+
+    if (cli_req.dict.dict_len) {
+        /* Unserialize the dictionary */
+        dict = dict_new();
+
+        ret = dict_unserialize(cli_req.dict.dict_val, cli_req.dict.dict_len,
+                               &dict);
+        if (ret < 0) {
+            gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_DICT_UNSERIALIZE_FAIL,
+                   "failed to "
+                   "unserialize req-buffer to dictionary");
+            snprintf(op_errstr, sizeof(op_errstr),
+                     "Unable to decode the command");
+            /* The buffer is only handed to the dictionary (extra_stdfree)
+             * on success, so on this path it must be released here or it
+             * is leaked. */
+            free(cli_req.dict.dict_val);
+            cli_req.dict.dict_val = NULL;
+            goto out;
+        } else {
+            /* From here on the dictionary owns the request buffer. */
+            dict->extra_stdfree = cli_req.dict.dict_val;
+        }
+    }
+
+    ret = dict_get_strn(dict, "volname", SLEN("volname"), &volname);
+    if (ret) {
+        snprintf(op_errstr, sizeof(op_errstr),
+                 "Unable to find "
+                 "volume name");
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_DICT_GET_FAILED, "%s",
+               op_errstr);
+        goto out;
+    }
+
+    gf_msg(this->name, GF_LOG_INFO, 0, GD_MSG_HEAL_VOL_REQ_RCVD,
+           "Received heal vol req "
+           "for volume %s",
+           volname);
+
+    ret = glusterd_volinfo_find(volname, &volinfo);
+    if (ret) {
+        snprintf(op_errstr, sizeof(op_errstr), "Volume %s does not exist",
+                 volname);
+        goto out;
+    }
+
+    ret = glusterd_handle_heal_options_enable_disable(req, dict, volinfo);
+    if (ret == -EINVAL) {
+        /* Not an enable/disable op: continue with the regular heal. */
+        ret = 0;
+    } else {
+        /*
+         * If the return value is -ve but not -EINVAL then the command
+         * failed. If the return value is 0 then the synctask for the
+         * op has begun, so in both cases just 'goto out'. If there was
+         * a failure it will respond with an error, otherwise the
+         * synctask will take the responsibility of sending the
+         * response.
+         */
+        goto out;
+    }
+
+    ret = glusterd_add_bricks_hname_path_to_dict(dict, volinfo);
+    if (ret)
+        goto out;
+
+    ret = dict_set_int32n(dict, "count", SLEN("count"), volinfo->brick_count);
+    if (ret) {
+        gf_smsg(this->name, GF_LOG_ERROR, errno, GD_MSG_DICT_SET_FAILED,
+                "Key=count", NULL);
+        goto out;
+    }
+
+    ret = glusterd_op_begin_synctask(req, GD_OP_HEAL_VOLUME, dict);
+
+out:
+    if (ret) {
+        /* Fall back to a generic message when no step recorded one. */
+        if (op_errstr[0] == '\0')
+            snprintf(op_errstr, sizeof(op_errstr), "operation failed");
+        gf_msg((this ? this->name : "glusterd"), GF_LOG_ERROR, 0,
+               GD_MSG_GLUSTERD_OP_FAILED, "%s", op_errstr);
+        ret = glusterd_op_send_cli_response(cli_op, ret, 0, req, dict,
+                                            op_errstr);
+    }
+
+    return ret;
+}
+
+/*
+ * RPC entry point for the CLI heal-volume request: passes the request and
+ * the worker __glusterd_handle_cli_heal_volume() to
+ * glusterd_big_locked_handler() and returns that call's result.
+ */
+int
+glusterd_handle_cli_heal_volume(rpcsvc_request_t *req)
+{
+    int rc = glusterd_big_locked_handler(req,
+                                         __glusterd_handle_cli_heal_volume);
+
+    return rc;
+}
+
+/*
+ * Worker for the CLI "volume statedump" request.
+ *
+ * Decodes the request, unserializes its dictionary, requires the keys
+ * volname, options and option_cnt, refuses a quotad statedump while the
+ * cluster op-version equals GD_OP_VERSION_MIN, and then starts the
+ * GD_OP_STATEDUMP_VOLUME synctask.
+ *
+ * @req: the RPC request received from the CLI.
+ *
+ * Returns 0 when the synctask was started; on failure, returns the result of
+ * glusterd_op_send_cli_response(), which delivers the error to the CLI.
+ */
+int
+__glusterd_handle_cli_statedump_volume(rpcsvc_request_t *req)
+{
+    gf_cli_req cli_req = {{
+        0,
+    }};
+    char err_msg[128] = {
+        0,
+    };
+    glusterd_op_t cli_op = GD_OP_STATEDUMP_VOLUME;
+    xlator_t *this = THIS;
+    glusterd_conf_t *priv = NULL;
+    dict_t *dict = NULL;
+    char *volname = NULL;
+    char *options = NULL;
+    int32_t option_cnt = 0;
+    int32_t ret = -1;
+
+    GF_ASSERT(this);
+    priv = this->private;
+    GF_ASSERT(priv);
+
+    GF_ASSERT(req);
+
+    ret = xdr_to_generic(req->msg[0], &cli_req, (xdrproc_t)xdr_gf_cli_req);
+    if (ret < 0) {
+        req->rpc_err = GARBAGE_ARGS;
+        gf_smsg(this->name, GF_LOG_ERROR, errno, GD_MSG_GARBAGE_ARGS, NULL);
+        goto out;
+    }
+
+    if (cli_req.dict.dict_len != 0) {
+        /* Rebuild the request dictionary from the wire buffer. */
+        dict = dict_new();
+
+        ret = dict_unserialize(cli_req.dict.dict_val, cli_req.dict.dict_len,
+                               &dict);
+        if (ret < 0) {
+            gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_DICT_UNSERIALIZE_FAIL,
+                   "failed to unserialize req-buffer to dictionary");
+            snprintf(err_msg, sizeof(err_msg), "Unable to decode the command");
+            goto out;
+        }
+    }
+
+    ret = dict_get_strn(dict, "volname", SLEN("volname"), &volname);
+    if (ret != 0) {
+        snprintf(err_msg, sizeof(err_msg), "Unable to get the volume name");
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_DICT_GET_FAILED, "%s",
+               err_msg);
+        goto out;
+    }
+
+    ret = dict_get_strn(dict, "options", SLEN("options"), &options);
+    if (ret != 0) {
+        snprintf(err_msg, sizeof(err_msg), "Unable to get options");
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_DICT_GET_FAILED, "%s",
+               err_msg);
+        goto out;
+    }
+
+    /* option_cnt is only checked for presence here. */
+    ret = dict_get_int32n(dict, "option_cnt", SLEN("option_cnt"), &option_cnt);
+    if (ret != 0) {
+        snprintf(err_msg, sizeof(err_msg), "Unable to get option count");
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_DICT_GET_FAILED, "%s",
+               err_msg);
+        goto out;
+    }
+
+    if (priv->op_version == GD_OP_VERSION_MIN && strstr(options, "quotad")) {
+        snprintf(err_msg, sizeof(err_msg),
+                 "The cluster is operating at op-version 1. Taking quotad's "
+                 "statedump is disallowed in this state");
+        ret = -1;
+        goto out;
+    }
+
+    gf_msg(this->name, GF_LOG_INFO, 0, GD_MSG_STATEDUMP_VOL_REQ_RCVD,
+           "Received statedump request for volume %s with options %s", volname,
+           options);
+
+    ret = glusterd_op_begin_synctask(req, cli_op, dict);
+
+out:
+    if (ret != 0) {
+        if (err_msg[0] == '\0')
+            snprintf(err_msg, sizeof(err_msg), "Operation failed");
+        ret = glusterd_op_send_cli_response(cli_op, ret, 0, req, dict,
+                                            err_msg);
+    }
+    /* Released only after the response has been sent. */
+    free(cli_req.dict.dict_val);
+
+    return ret;
+}
+
+/*
+ * RPC entry point for the CLI statedump-volume request: passes the request
+ * and the worker __glusterd_handle_cli_statedump_volume() to
+ * glusterd_big_locked_handler() and returns that call's result.
+ */
+int
+glusterd_handle_cli_statedump_volume(rpcsvc_request_t *req)
+{
+    int rc = glusterd_big_locked_handler(
+        req, __glusterd_handle_cli_statedump_volume);
+
+    return rc;
+}
+
+/* op-sm */
+int
+glusterd_op_stage_create_volume(dict_t *dict, char **op_errstr,
+                                dict_t *rsp_dict)
+{
+    int ret = 0;
+    char *volname = NULL;
+    char *bricks = NULL;
+    char *brick_list = NULL;
+    char *free_ptr = NULL;
+    char key[64] = "";
+    glusterd_brickinfo_t *brick_info = NULL;
+    int32_t brick_count = 0;
+    int32_t local_brick_count = 0;
+    int32_t i = 0;
+    int32_t type = 0;
+    int32_t replica_count = 0;
+    int32_t disperse_count = 0;
+    char *brick = NULL;
+    char *tmpptr = NULL;
+    xlator_t *this = NULL;
+    glusterd_conf_t *priv = NULL;
+    char msg[2048] = {0};
+    uuid_t volume_uuid;
+    char *volume_uuid_str;
+    gf_boolean_t is_force = _gf_false;
+    glusterd_volinfo_t *volinfo = NULL;
+
+    this = THIS;
+    GF_ASSERT(this);
+    priv = this->private;
+    GF_ASSERT(priv);
+    GF_ASSERT(rsp_dict);
+
+    ret = dict_get_strn(dict, "volname", SLEN("volname"), &volname);
+    if (ret) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_DICT_GET_FAILED,
+               "Unable to get volume name");
+        goto out;
+    }
+
+    ret = glusterd_volinfo_find(volname, &volinfo);
+    if (!ret) {
+        snprintf(msg, sizeof(msg), "Volume %s already exists", volname);
+        ret = -1;
+        goto out;
+    }
+
+    ret = dict_get_int32n(dict, "count", SLEN("count"), &brick_count);
+    if (ret) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_DICT_GET_FAILED,
+               "Unable to get brick count "
+               "for volume %s",
+               volname);
+        goto out;
+    }
+
+    ret = dict_get_strn(dict, "volume-id", SLEN("volume-id"), &volume_uuid_str);
+    if (ret) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_DICT_GET_FAILED,
+               "Unable to get volume id of "
+               "volume %s",
+               volname);
+        goto out;
+    }
+
+    ret = gf_uuid_parse(volume_uuid_str, volume_uuid);
+    if (ret) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_UUID_PARSE_FAIL,
+               "Unable to parse volume id of"
+               " volume %s",
+               volname);
+        goto out;
+    }
+
+    ret = dict_get_strn(dict, "bricks", SLEN("bricks"), &bricks);
+    if (ret) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_DICT_GET_FAILED,
+               "Unable to get bricks for "
+               "volume %s",
+               volname);
+        goto out;
+    }
+
+    is_force = dict_get_str_boolean(dict, "force", _gf_false);
+
+    if (bricks) {
+        brick_list = gf_strdup(bricks);
+        if (!brick_list) {
+            ret = -1;
+            goto out;
+        } else {
+            free_ptr = brick_list;
+        }
+    }
+
+    /*Check brick order if the volume type is replicate or disperse. If
+     * force at the end of command not given then check brick order.
+     */
+    if (is_origin_glusterd(dict)) {
+        ret = dict_get_int32n(dict, "type", SLEN("type"), &type);
+        if (ret) {
+            snprintf(msg, sizeof(msg),
+                     "Unable to get type of "
+                     "volume %s",
+                     volname);
+            gf_msg(this->name, GF_LOG_WARNING, 0, GD_MSG_DICT_GET_FAILED, "%s",
+                   msg);
+            goto out;
+        }
+
+        if (!is_force) {
+            if (type == GF_CLUSTER_TYPE_REPLICATE) {
+                ret = dict_get_int32n(dict, "replica-count",
+                                      SLEN("replica-count"), &replica_count);
+                if (ret) {
+                    gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_DICT_GET_FAILED,
+                           "Bricks check : Could"
+                           " not retrieve replica count");
+                    goto out;
+                }
+                gf_msg_debug(this->name, 0,
+                             "Replicate cluster type "
+                             "found. Checking brick order.");
+                ret = glusterd_check_brick_order(dict, msg, type, &volname,
+                                                 &bricks, &brick_count,
+                                                 replica_count);
+            } else if (type == GF_CLUSTER_TYPE_DISPERSE) {
+                ret = dict_get_int32n(dict, "disperse-count",
+                                      SLEN("disperse-count"), &disperse_count);
+                if (ret) {
+                    gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_DICT_GET_FAILED,
+                           "Bricks check : Could"
+                           " not retrieve disperse count");
+                    goto out;
+                }
+                gf_msg_debug(this->name, 0,
+                             "Disperse cluster type"
+                             " found. Checking brick order.");
+                ret = glusterd_check_brick_order(dict, msg, type, &volname,
+                                                 &bricks, &brick_count,
+                                                 disperse_count);
+            }
+            if (ret) {
+                gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_BAD_BRKORDER,
+                       "Not creating the volume because of "
+                       "bad brick order. %s",
+                       msg);
+                *op_errstr = gf_strdup(msg);
+                goto out;
+            }
+        }
+    }
+
+    while (i < brick_count) {
+        i++;
+        brick = strtok_r(brick_list, " \n", &tmpptr);
+        brick_list = tmpptr;
+
+        if (!glusterd_store_is_valid_brickpath(volname, brick)) {
+            snprintf(msg, sizeof(msg),
+                     "brick path %s is too "
+                     "long.",
+                     brick);
+            ret = -1;
+            goto out;
+        }
+
+        if (!glusterd_is_valid_volfpath(volname, brick)) {
+            snprintf(msg, sizeof(msg),
+                     "Volume file path for "
+                     "volume %s and brick path %s is too long.",
+                     volname, brick);
+            ret = -1;
+            goto out;
+        }
+
+        ret = glusterd_brickinfo_new_from_brick(brick, &brick_info, _gf_true,
+                                                op_errstr);
+        if (ret)
+            goto out;
+
+        ret = glusterd_new_brick_validate(brick, brick_info, msg, sizeof(msg),
+                                          NULL);
+        if (ret)
+            goto out;
+
+        ret = glusterd_resolve_brick(brick_info);
+        if (ret) {
+            gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_RESOLVE_BRICK_FAIL,
+                   FMTSTR_RESOLVE_BRICK, brick_info->hostname,
+                   brick_info->path);
+            goto out;
+        }
+
+        if (!gf_uuid_compare(brick_info->uuid, MY_UUID)) {
+            ret = glusterd_validate_and_create_brickpath(
+                brick_info, volume_uuid, volname, op_errstr, is_force,
+                _gf_false);
+            if (ret)
+                goto out;
+
+            /* A bricks mount dir is required only by snapshots which were
+             * introduced in gluster-3.6.0
+             */
+            if (priv->op_version >= GD_OP_VERSION_3_6_0) {
+                ret = glusterd_get_brick_mount_dir(brick_info->path,
+                                                   brick_info->hostname,
+                                                   brick_info->mount_dir);
+                if (ret) {
+                    gf_msg(this->name, GF_LOG_ERROR, 0,
+                           GD_MSG_BRICK_MOUNTDIR_GET_FAIL,
+                           "Failed to get brick mount_dir");
+                    goto out;
+                }
+
+                snprintf(key, sizeof(key), "brick%d.mount_dir", i);
+                ret = dict_set_dynstr_with_alloc(rsp_dict, key,
+                                                 brick_info->mount_dir);
+                if (ret) {
+                    gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_DICT_SET_FAILED,
+                           "Failed to set %s", key);
+                    goto out;
+                }
+            }
+            local_brick_count = i;
+
+            brick_list = tmpptr;
+        }
+        glusterd_brickinfo_delete(brick_info);
+        brick_info = NULL;
+    }
+
+    ret = dict_set_int32n(rsp_dict, "brick_count", SLEN("brick_count"),
+                          local_brick_count);
+    if (ret) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_DICT_SET_FAILED,
+               "Failed to set local_brick_count");
+        goto out;
+    }
+out:
+    GF_FREE(free_ptr);
+    if (brick_info)
+        glusterd_brickinfo_delete(brick_info);
+
+    if (msg[0] != '\0') {
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_OP_STAGE_CREATE_VOL_FAIL,
+               "%s", msg);
+        *op_errstr = gf_strdup(msg);
+    }
+    gf_msg_debug(this->name, 0, "Returning %d", ret);
+
+    return ret;
+}
+
+/* Pull the arguments of a volume-stop request out of @dict.
+ *
+ * "volname" is looked up with dict_get_strn() into *volname and "flags" with
+ * dict_get_int32n() into *flags, in that order.
+ *
+ * Returns 0 when both lookups succeed, -1 when any argument is NULL, and
+ * otherwise the non-zero result of the lookup that failed.
+ */
+int
+glusterd_op_stop_volume_args_get(dict_t *dict, char **volname, int *flags)
+{
+    xlator_t *this = THIS;
+    int rc = -1;
+
+    GF_ASSERT(this);
+
+    /* Every argument is mandatory; all three cases log the same message. */
+    if (!dict || !volname || !flags) {
+        gf_smsg(this->name, GF_LOG_ERROR, 0, GD_MSG_INVALID_ARGUMENT, NULL);
+        return rc;
+    }
+
+    rc = dict_get_strn(dict, "volname", SLEN("volname"), volname);
+    if (rc) {
+        gf_smsg(this->name, GF_LOG_ERROR, 0, GD_MSG_DICT_GET_FAILED,
+                "Key=volname", NULL);
+        return rc;
+    }
+
+    rc = dict_get_int32n(dict, "flags", SLEN("flags"), flags);
+    if (rc) {
+        gf_smsg(this->name, GF_LOG_ERROR, 0, GD_MSG_DICT_GET_FAILED,
+                "Key=flags", NULL);
+    }
+
+    return rc;
+}
+
+/* Fetch the arguments of a volume-statedump request from @dict.
+ *
+ * Looks up "volname", "options" and "option_cnt" (in that order) and stores
+ * the results through @volname, @options and @option_cnt.
+ *
+ * Returns 0 when all three lookups succeed, -1 when any argument is NULL, and
+ * otherwise the non-zero result of the first lookup that failed.
+ */
+int
+glusterd_op_statedump_volume_args_get(dict_t *dict, char **volname,
+                                      char **options, int *option_cnt)
+{
+    int rc = -1;
+
+    if (!(dict && volname && options && option_cnt)) {
+        gf_smsg("glusterd", GF_LOG_ERROR, errno, GD_MSG_INVALID_ARGUMENT, NULL);
+        return rc;
+    }
+
+    rc = dict_get_strn(dict, "volname", SLEN("volname"), volname);
+    if (rc) {
+        gf_smsg("glusterd", GF_LOG_ERROR, 0, GD_MSG_DICT_GET_FAILED,
+                "Key=volname", NULL);
+        return rc;
+    }
+
+    rc = dict_get_strn(dict, "options", SLEN("options"), options);
+    if (rc) {
+        gf_smsg("glusterd", GF_LOG_ERROR, 0, GD_MSG_DICT_GET_FAILED,
+                "Key=options", NULL);
+        return rc;
+    }
+
+    rc = dict_get_int32n(dict, "option_cnt", SLEN("option_cnt"), option_cnt);
+    if (rc) {
+        gf_smsg("glusterd", GF_LOG_ERROR, 0, GD_MSG_DICT_GET_FAILED,
+                "Key=option_cnt", NULL);
+    }
+
+    return rc;
+}
+
+/* Stage (validation) step of the volume-start operation.
+ *
+ * Reads the volume name and flags from @dict, finds the volume, and checks
+ * that it may be started:
+ *   - server quorum is validated when the cluster op-version is above 3.7.5;
+ *   - the volume id carried in @dict is validated against the volinfo;
+ *   - without GF_CLI_FLAG_OP_FORCE an already started volume is rejected;
+ *   - every brick is resolved, and for each brick that is not skipped the
+ *     brick directory and its volume-id extended attribute are checked.
+ *
+ * For bricks whose mount_dir is empty (op-version >= 3.6.0) the mount dir is
+ * fetched and stored in @rsp_dict under "brick<N>.mount_dir", N being the
+ * 1-based position of the brick in the volume's brick list. "brick_count" in
+ * @rsp_dict is set to the position of the last brick recorded that way
+ * (0 when none was recorded).
+ *
+ * Returns 0 on success and non-zero on failure. When the failure filled the
+ * local message buffer, the text is logged and duplicated into *op_errstr.
+ */
+int
+glusterd_op_stage_start_volume(dict_t *dict, char **op_errstr, dict_t *rsp_dict)
+{
+    int ret = 0;
+    char *volname = NULL;
+    char key[64] = "";
+    int flags = 0;
+    int32_t brick_count = 0;
+    int32_t local_brick_count = 0;
+    glusterd_volinfo_t *volinfo = NULL;
+    glusterd_brickinfo_t *brickinfo = NULL;
+    char msg[2048] = {
+        0,
+    };
+    glusterd_conf_t *priv = NULL;
+    xlator_t *this = NULL;
+    uuid_t volume_id = {
+        0,
+    };
+    char volid[50] = {
+        0,
+    };
+    char xattr_volid[50] = {
+        0,
+    };
+    int32_t len = 0;
+
+    this = THIS;
+    GF_ASSERT(this);
+    priv = this->private;
+    GF_ASSERT(priv);
+    GF_ASSERT(rsp_dict);
+
+    ret = glusterd_op_start_volume_args_get(dict, &volname, &flags);
+    if (ret)
+        goto out;
+
+    ret = glusterd_volinfo_find(volname, &volinfo);
+    if (ret) {
+        snprintf(msg, sizeof(msg), FMTSTR_CHECK_VOL_EXISTS, volname);
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_VOLINFO_GET_FAIL,
+               FMTSTR_CHECK_VOL_EXISTS, volname);
+        goto out;
+    }
+
+    /* This is an incremental approach to have all the volinfo objects ref
+     * count. The first attempt is made in volume start transaction to
+     * ensure it doesn't race with import volume where stale volume is
+     * deleted. There are multiple instances of GlusterD crashing in
+     * bug-948686.t because of this. Once this approach is foolproof, all
+     * other volinfo objects will be refcounted.
+     */
+    glusterd_volinfo_ref(volinfo);
+
+    if (priv->op_version > GD_OP_VERSION_3_7_5) {
+        ret = glusterd_validate_quorum(this, GD_OP_START_VOLUME, dict,
+                                       op_errstr);
+        if (ret) {
+            gf_msg(this->name, GF_LOG_CRITICAL, 0, GD_MSG_SERVER_QUORUM_NOT_MET,
+                   "Server quorum not met. Rejecting operation.");
+            goto out;
+        }
+    }
+
+    ret = glusterd_validate_volume_id(dict, volinfo);
+    if (ret)
+        goto out;
+
+    /* Starting an already started volume is only allowed with force. */
+    if (!(flags & GF_CLI_FLAG_OP_FORCE)) {
+        if (glusterd_is_volume_started(volinfo)) {
+            snprintf(msg, sizeof(msg),
+                     "Volume %s already "
+                     "started",
+                     volname);
+            ret = -1;
+            goto out;
+        }
+    }
+
+    cds_list_for_each_entry(brickinfo, &volinfo->bricks, brick_list)
+    {
+        /* brick_count is the 1-based position of the current brick. */
+        brick_count++;
+        ret = glusterd_resolve_brick(brickinfo);
+        if (ret) {
+            gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_RESOLVE_BRICK_FAIL,
+                   FMTSTR_RESOLVE_BRICK, brickinfo->hostname, brickinfo->path);
+            goto out;
+        }
+
+        /* Skip bricks whose uuid does not compare equal to MY_UUID
+         * (presumably bricks hosted by other peers - confirm) and bricks
+         * whose snap_status is -1.
+         */
+        if ((gf_uuid_compare(brickinfo->uuid, MY_UUID)) ||
+            (brickinfo->snap_status == -1))
+            continue;
+
+        /* With force a failed brick-directory check skips the brick instead
+         * of failing the stage.
+         */
+        ret = gf_lstat_dir(brickinfo->path, NULL);
+        if (ret && (flags & GF_CLI_FLAG_OP_FORCE)) {
+            continue;
+        } else if (ret) {
+            len = snprintf(msg, sizeof(msg),
+                           "Failed to find "
+                           "brick directory %s for volume %s. "
+                           "Reason : %s",
+                           brickinfo->path, volname, strerror(errno));
+            if (len < 0) {
+                strcpy(msg, "<error>");
+            }
+            goto out;
+        }
+        /* Read the volume-id xattr of the brick directory into volume_id. */
+        ret = sys_lgetxattr(brickinfo->path, GF_XATTR_VOL_ID_KEY, volume_id,
+                            16);
+        if (ret < 0 && (!(flags & GF_CLI_FLAG_OP_FORCE))) {
+            len = snprintf(msg, sizeof(msg),
+                           "Failed to get "
+                           "extended attribute %s for brick dir "
+                           "%s. Reason : %s",
+                           GF_XATTR_VOL_ID_KEY, brickinfo->path,
+                           strerror(errno));
+            if (len < 0) {
+                strcpy(msg, "<error>");
+            }
+            ret = -1;
+            goto out;
+        } else if (ret < 0) {
+            /* Force was given: set the xattr from the volinfo's volume id.
+             * On success move on to the next brick; the id comparison and
+             * the mount-dir handling below are not run for this brick.
+             */
+            ret = sys_lsetxattr(brickinfo->path, GF_XATTR_VOL_ID_KEY,
+                                volinfo->volume_id, 16, XATTR_CREATE);
+            if (ret == -1) {
+                len = snprintf(msg, sizeof(msg),
+                               "Failed to "
+                               "set extended attribute %s on "
+                               "%s. Reason: %s",
+                               GF_XATTR_VOL_ID_KEY, brickinfo->path,
+                               strerror(errno));
+                if (len < 0) {
+                    strcpy(msg, "<error>");
+                }
+                goto out;
+            } else {
+                continue;
+            }
+        }
+        /* The id stored on the brick must match the volume's id. */
+        if (gf_uuid_compare(volinfo->volume_id, volume_id)) {
+            len = snprintf(msg, sizeof(msg),
+                           "Volume id "
+                           "mismatch for brick %s:%s. Expected "
+                           "volume id %s, volume id %s found",
+                           brickinfo->hostname, brickinfo->path,
+                           uuid_utoa_r(volinfo->volume_id, volid),
+                           uuid_utoa_r(volume_id, xattr_volid));
+            if (len < 0) {
+                strcpy(msg, "<error>");
+            }
+            ret = -1;
+            goto out;
+        }
+
+        /* A bricks mount dir is required only by snapshots which were
+         * introduced in gluster-3.6.0
+         */
+        if (priv->op_version >= GD_OP_VERSION_3_6_0) {
+            /* Only bricks that do not have a mount_dir yet are looked up
+             * and reported through rsp_dict.
+             */
+            if (strlen(brickinfo->mount_dir) < 1) {
+                ret = glusterd_get_brick_mount_dir(
+                    brickinfo->path, brickinfo->hostname, brickinfo->mount_dir);
+                if (ret) {
+                    gf_msg(this->name, GF_LOG_ERROR, 0,
+                           GD_MSG_BRICK_MOUNTDIR_GET_FAIL,
+                           "Failed to get brick mount_dir");
+                    goto out;
+                }
+
+                snprintf(key, sizeof(key), "brick%d.mount_dir", brick_count);
+                ret = dict_set_dynstr_with_alloc(rsp_dict, key,
+                                                 brickinfo->mount_dir);
+                if (ret) {
+                    gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_DICT_SET_FAILED,
+                           "Failed to set %s", key);
+                    goto out;
+                }
+                /* Remember the position of the last brick recorded. */
+                local_brick_count = brick_count;
+            }
+        }
+    }
+
+    /* This assignment also overwrites any non-zero ret left behind by a
+     * skipped brick or by a successful sys_lgetxattr() call.
+     */
+    ret = dict_set_int32n(rsp_dict, "brick_count", SLEN("brick_count"),
+                          local_brick_count);
+    if (ret) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_DICT_SET_FAILED,
+               "Failed to set local_brick_count");
+        goto out;
+    }
+
+    ret = 0;
+out:
+    /* volinfo is still NULL when the lookup failed, so nothing to unref. */
+    if (volinfo)
+        glusterd_volinfo_unref(volinfo);
+
+    if (ret && (msg[0] != '\0')) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_OP_STAGE_START_VOL_FAIL,
+               "%s", msg);
+        *op_errstr = gf_strdup(msg);
+    }
+    return ret;
+}
+
+/* Stage (validation) step of the volume-stop operation.
+ *
+ * Reads the volume name and flags from @dict, finds the volume and validates
+ * the volume id carried in @dict. When GF_CLI_FLAG_OP_FORCE is set no further
+ * check is made. Otherwise the stage fails when the volume is not started,
+ * when the geo-replication check fails or reports an active session, or when
+ * a rebalance (defrag) is in progress. A failure to unexport the volume via
+ * NFS-Ganesha is only logged as a warning and does not fail the stage.
+ *
+ * Returns 0 when the volume may be stopped, non-zero otherwise. Whenever the
+ * local message buffer was filled, its text is duplicated into *op_errstr.
+ */
+int
+glusterd_op_stage_stop_volume(dict_t *dict, char **op_errstr)
+{
+    int ret = -1;
+    char *volname = NULL;
+    int flags = 0;
+    glusterd_volinfo_t *volinfo = NULL;
+    char msg[2048] = {0};
+    xlator_t *this = NULL;
+    gsync_status_param_t param = {
+        0,
+    };
+
+    this = THIS;
+    GF_ASSERT(this);
+
+    ret = glusterd_op_stop_volume_args_get(dict, &volname, &flags);
+    if (ret) {
+        /* volname is still NULL when the "volname" lookup itself failed.
+         * Passing a NULL pointer to a %s conversion is undefined behaviour,
+         * so substitute a printable placeholder.
+         */
+        const char *printable = volname ? volname : "(null)";
+
+        snprintf(msg, sizeof(msg), "Failed to get details of volume %s",
+                 printable);
+        gf_smsg(this->name, GF_LOG_ERROR, 0, GD_MSG_VOL_STOP_ARGS_GET_FAILED,
+                "Volume name=%s", printable, NULL);
+        goto out;
+    }
+
+    ret = glusterd_volinfo_find(volname, &volinfo);
+    if (ret) {
+        snprintf(msg, sizeof(msg), FMTSTR_CHECK_VOL_EXISTS, volname);
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_VOLINFO_GET_FAIL, "%s", msg);
+        goto out;
+    }
+
+    ret = glusterd_validate_volume_id(dict, volinfo);
+    if (ret)
+        goto out;
+
+    /* If 'force' flag is given, no check is required */
+    if (flags & GF_CLI_FLAG_OP_FORCE)
+        goto out;
+
+    if (_gf_false == glusterd_is_volume_started(volinfo)) {
+        snprintf(msg, sizeof(msg),
+                 "Volume %s "
+                 "is not in the started state",
+                 volname);
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_VOL_NOT_STARTED, "%s", msg);
+        ret = -1;
+        goto out;
+    }
+
+    /* If geo-rep is configured, for this volume, it should be stopped. */
+    param.volinfo = volinfo;
+    ret = glusterd_check_geo_rep_running(&param, op_errstr);
+    if (ret || param.is_active) {
+        ret = -1;
+        goto out;
+    }
+
+    /* A non-zero result here triggers the unexport; a failed unexport is
+     * deliberately downgraded to a warning.
+     * NOTE(review): ganesha_manage_export() receives op_errstr; if it fills
+     * it on failure that string is kept although ret is reset to 0 - confirm.
+     */
+    ret = glusterd_check_ganesha_export(volinfo);
+    if (ret) {
+        ret = ganesha_manage_export(dict, "off", _gf_false, op_errstr);
+        if (ret) {
+            gf_msg(this->name, GF_LOG_WARNING, 0,
+                   GD_MSG_NFS_GNS_UNEXPRT_VOL_FAIL,
+                   "Could not "
+                   "unexport volume via NFS-Ganesha");
+            ret = 0;
+        }
+    }
+
+    if (glusterd_is_defrag_on(volinfo)) {
+        snprintf(msg, sizeof(msg),
+                 "rebalance session is "
+                 "in progress for the volume '%s'",
+                 volname);
+        gf_msg(this->name, GF_LOG_WARNING, 0, GD_MSG_OIP, "%s", msg);
+        ret = -1;
+        goto out;
+    }
+
+out:
+    if (msg[0] != 0)
+        *op_errstr = gf_strdup(msg);
+    gf_msg_debug(this->name, 0, "Returning %d", ret);
+
+    return ret;
+}
+
+/* Stage (validation) step of the volume-delete operation.
+ *
+ * Reads "volname" from @dict, finds the volume and validates the volume id
+ * carried in @dict. Deletion is refused when the volume is started, when it
+ * still has snapshots, or when glusterd_are_all_peers_up() reports false.
+ * When every check passes the volinfo's stage_deleted flag is set to true.
+ *
+ * Returns 0 on success and non-zero on failure. Whenever the local message
+ * buffer was filled, its text is logged and duplicated into *op_errstr.
+ */
+int
+glusterd_op_stage_delete_volume(dict_t *dict, char **op_errstr)
+{
+    int ret = 0;
+    char *volname = NULL;
+    glusterd_volinfo_t *volinfo = NULL;
+    char msg[2048] = {0};
+    xlator_t *this = NULL;
+
+    this = THIS;
+    GF_ASSERT(this);
+
+    ret = dict_get_strn(dict, "volname", SLEN("volname"), &volname);
+    if (ret) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_DICT_GET_FAILED,
+               "Unable to get volume name");
+        goto out;
+    }
+
+    ret = glusterd_volinfo_find(volname, &volinfo);
+    if (ret) {
+        snprintf(msg, sizeof(msg), FMTSTR_CHECK_VOL_EXISTS, volname);
+        goto out;
+    }
+
+    ret = glusterd_validate_volume_id(dict, volinfo);
+    if (ret)
+        goto out;
+
+    if (glusterd_is_volume_started(volinfo)) {
+        /* The two sentences need a separating space in the joined literal. */
+        snprintf(msg, sizeof(msg),
+                 "Volume %s has been started. "
+                 "Volume needs to be stopped before deletion.",
+                 volname);
+        ret = -1;
+        goto out;
+    }
+
+    /* Either a non-zero snapshot counter or a non-empty snapshot volume
+     * list blocks the deletion.
+     */
+    if (volinfo->snap_count > 0 || !cds_list_empty(&volinfo->snap_volumes)) {
+        snprintf(msg, sizeof(msg),
+                 "Cannot delete Volume %s, "
+                 "as it has %" PRIu64
+                 " snapshots. "
+                 "To delete the volume, "
+                 "first delete all the snapshots under it.",
+                 volname, volinfo->snap_count);
+        ret = -1;
+        goto out;
+    }
+
+    if (!glusterd_are_all_peers_up()) {
+        ret = -1;
+        snprintf(msg, sizeof(msg), "Some of the peers are down");
+        goto out;
+    }
+    volinfo->stage_deleted = _gf_true;
+    gf_log(this->name, GF_LOG_INFO,
+           "Setting stage deleted flag to true for "
+           "volume %s",
+           volinfo->volname);
+    ret = 0;
+
+out:
+    /* msg is only ever filled on a failure path above. */
+    if (msg[0] != '\0') {
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_OP_STAGE_DELETE_VOL_FAIL,
+               "%s", msg);
+        *op_errstr = gf_strdup(msg);
+    }
+    gf_msg_debug(this->name, 0, "Returning %d", ret);
+
+    return ret;
+}
+
+/* Validate the "heal-op" carried in @dict for @volinfo.
+ *
+ * - Ops that must go through volume-set or glfsheal are rejected as invalid.
+ * - Index/full heal needs a self-heal-daemon compatible volume.
+ * - The summary/statistics ops need a replicate volume.
+ * - Both of the groups above also need the volume's shd service to be online.
+ * - The healed-files / heal-failed-files ops are reported as unsupported.
+ *
+ * Returns 0 when the op is acceptable. On rejection returns -1, stores a
+ * duplicated reason in *op_errstr and logs it as a warning.
+ */
+static int
+glusterd_handle_heal_cmd(xlator_t *this, glusterd_volinfo_t *volinfo,
+                         dict_t *dict, char **op_errstr)
+{
+    gf_xl_afr_op_t heal_op = GF_SHD_OP_INVALID;
+    glusterd_svc_t *shd_svc = &(volinfo->shd.svc);
+    const char *reason = NULL; /* non-NULL once the op has been rejected */
+    char *offline_msg =
+        "Self-heal daemon is not running. "
+        "Check self-heal daemon log file.";
+    char msg[2408] = {
+        0,
+    };
+    int ret = 0;
+
+    if (dict_get_int32n(dict, "heal-op", SLEN("heal-op"),
+                        (int32_t *)&heal_op)) {
+        reason = "Heal operation not specified";
+        goto report;
+    }
+
+    switch (heal_op) {
+        /* Handled by volume-set or by the glfsheal command, never here. */
+        case GF_SHD_OP_INVALID:
+        case GF_SHD_OP_HEAL_ENABLE:
+        case GF_SHD_OP_HEAL_DISABLE:
+        case GF_SHD_OP_GRANULAR_ENTRY_HEAL_ENABLE:
+        case GF_SHD_OP_GRANULAR_ENTRY_HEAL_DISABLE:
+        case GF_SHD_OP_HEAL_SUMMARY:
+        case GF_SHD_OP_SBRAIN_HEAL_FROM_BIGGER_FILE:
+        case GF_SHD_OP_SBRAIN_HEAL_FROM_LATEST_MTIME:
+        case GF_SHD_OP_SBRAIN_HEAL_FROM_BRICK:
+            reason = "Invalid heal-op";
+            break;
+
+        case GF_SHD_OP_HEAL_INDEX:
+        case GF_SHD_OP_HEAL_FULL:
+            if (!glusterd_is_shd_compatible_volume(volinfo)) {
+                snprintf(msg, sizeof(msg),
+                         "Volume %s is not of type "
+                         "replicate or disperse",
+                         volinfo->volname);
+                reason = msg;
+            } else if (!shd_svc->online) {
+                reason = offline_msg;
+            }
+            break;
+
+        case GF_SHD_OP_INDEX_SUMMARY:
+        case GF_SHD_OP_SPLIT_BRAIN_FILES:
+        case GF_SHD_OP_STATISTICS:
+        case GF_SHD_OP_STATISTICS_HEAL_COUNT:
+        case GF_SHD_OP_STATISTICS_HEAL_COUNT_PER_REPLICA:
+            if (!glusterd_is_volume_replicate(volinfo)) {
+                snprintf(msg, sizeof(msg),
+                         "This command is supported "
+                         "for only volume of replicated "
+                         "type. Volume %s is not of type "
+                         "replicate",
+                         volinfo->volname);
+                reason = msg;
+            } else if (!shd_svc->online) {
+                reason = offline_msg;
+            }
+            break;
+
+        case GF_SHD_OP_HEALED_FILES:
+        case GF_SHD_OP_HEAL_FAILED_FILES:
+            snprintf(msg, sizeof(msg),
+                     "Command not supported. "
+                     "Please use \"gluster volume heal %s info\" "
+                     "and logs to find the heal information.",
+                     volinfo->volname);
+            reason = msg;
+            break;
+    }
+
+report:
+    /* Single exit for every rejection: hand the reason out and log it. */
+    if (reason) {
+        ret = -1;
+        *op_errstr = gf_strdup(reason);
+        gf_msg(this->name, GF_LOG_WARNING, 0, GD_MSG_HANDLE_HEAL_CMD_FAIL, "%s",
+               *op_errstr);
+    }
+    return ret;
+}
+
+/* Stage (validation) step of the volume-heal operation.
+ *
+ * Reads "volname" from @dict, finds the volume, validates the volume id
+ * carried in @dict, and requires the volume to be started and self-heal to
+ * be enabled on it before delegating the heal-op specific checks to
+ * glusterd_handle_heal_cmd().
+ *
+ * Returns 0 on success and non-zero on failure; several failure paths store
+ * a duplicated message in *op_errstr. A volume without an option dictionary
+ * is logged as an error but still reported as success (0).
+ */
+int
+glusterd_op_stage_heal_volume(dict_t *dict, char **op_errstr)
+{
+    xlator_t *this = THIS;
+    glusterd_volinfo_t *volinfo = NULL;
+    char *volname = NULL;
+    char msg[2048];
+    int ret = 0;
+
+    GF_ASSERT(this);
+
+    if (!this->private) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_PRIV_NULL, "priv is NULL");
+        ret = -1;
+        goto out;
+    }
+
+    ret = dict_get_strn(dict, "volname", SLEN("volname"), &volname);
+    if (ret) {
+        gf_msg("glusterd", GF_LOG_ERROR, 0, GD_MSG_DICT_GET_FAILED,
+               "Unable to get volume name");
+        goto out;
+    }
+
+    if (glusterd_volinfo_find(volname, &volinfo)) {
+        snprintf(msg, sizeof(msg), "Volume %s does not exist", volname);
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_VOL_NOT_FOUND, "%s", msg);
+        *op_errstr = gf_strdup(msg);
+        ret = -1;
+        goto out;
+    }
+
+    ret = glusterd_validate_volume_id(dict, volinfo);
+    if (ret)
+        goto out;
+
+    if (!glusterd_is_volume_started(volinfo)) {
+        snprintf(msg, sizeof(msg), "Volume %s is not started.", volname);
+        gf_smsg(this->name, GF_LOG_WARNING, 0, GD_MSG_VOL_NOT_STARTED,
+                "Volume=%s", volname, NULL);
+        *op_errstr = gf_strdup(msg);
+        ret = -1;
+        goto out;
+    }
+
+    /* No option dictionary: logged, but the stage still returns 0. */
+    if (!volinfo->dict) {
+        gf_smsg(this->name, GF_LOG_ERROR, errno, GD_MSG_DICT_GET_FAILED, NULL);
+        ret = 0;
+        goto out;
+    }
+
+    if (!gd_is_self_heal_enabled(volinfo, volinfo->dict)) {
+        snprintf(msg, sizeof(msg),
+                 "Self-heal-daemon is "
+                 "disabled. Heal will not be triggered on volume %s",
+                 volname);
+        gf_msg(this->name, GF_LOG_WARNING, 0, GD_MSG_SELF_HEALD_DISABLED, "%s",
+               msg);
+        *op_errstr = gf_strdup(msg);
+        ret = -1;
+        goto out;
+    }
+
+    /* The heal-op specific verdict is the result of the whole stage. */
+    ret = glusterd_handle_heal_cmd(this, volinfo, dict, op_errstr);
+
+out:
+    gf_msg_debug("glusterd", 0, "Returning %d", ret);
+
+    return ret;
+}
+
+/* Stage (validation) step of the volume-statedump operation.
+ *
+ * Fetches volname/options/option_cnt from @dict, finds the volume, validates
+ * the volume id carried in @dict and requires the volume to be started. When
+ * the options string mentions "quotad" the request is additionally refused
+ * if the cluster op-version equals GD_OP_VERSION_MIN or quota is not enabled
+ * on the volume.
+ *
+ * Returns 0 on success and non-zero on failure. On failure, a non-empty
+ * local message is duplicated into *op_errstr.
+ */
+int
+glusterd_op_stage_statedump_volume(dict_t *dict, char **op_errstr)
+{
+    xlator_t *this = THIS;
+    glusterd_conf_t *priv = NULL;
+    glusterd_volinfo_t *volinfo = NULL;
+    char *volname = NULL;
+    char *options = NULL;
+    int option_cnt = 0;
+    gf_boolean_t wants_quotad = _gf_false;
+    char msg[2408] = {
+        0,
+    };
+    int ret = -1;
+
+    GF_ASSERT(this);
+    priv = this->private;
+    GF_ASSERT(priv);
+
+    ret = glusterd_op_statedump_volume_args_get(dict, &volname, &options,
+                                                &option_cnt);
+    if (ret)
+        goto out;
+
+    ret = glusterd_volinfo_find(volname, &volinfo);
+    if (ret) {
+        snprintf(msg, sizeof(msg), FMTSTR_CHECK_VOL_EXISTS, volname);
+        gf_smsg(this->name, GF_LOG_ERROR, errno, GD_MSG_VOLINFO_GET_FAIL,
+                "Volume=%s", volname, NULL);
+        goto out;
+    }
+
+    ret = glusterd_validate_volume_id(dict, volinfo);
+    if (ret)
+        goto out;
+
+    if (!glusterd_is_volume_started(volinfo)) {
+        snprintf(msg, sizeof(msg),
+                 "Volume %s is not in the started"
+                 " state",
+                 volname);
+        ret = -1;
+        goto out;
+    }
+
+    /* Both remaining checks only apply to a quotad statedump. */
+    wants_quotad = (strstr(options, "quotad") != NULL) ? _gf_true : _gf_false;
+
+    if (wants_quotad && priv->op_version == GD_OP_VERSION_MIN) {
+        snprintf(msg, sizeof(msg),
+                 "The cluster is operating "
+                 "at op-version 1. Taking quotad's statedump is "
+                 "disallowed in this state");
+        ret = -1;
+    } else if (wants_quotad && !glusterd_is_volume_quota_enabled(volinfo)) {
+        snprintf(msg, sizeof(msg),
+                 "Quota is not enabled on "
+                 "volume %s",
+                 volname);
+        ret = -1;
+    }
+
+out:
+    if (ret && msg[0] != '\0')
+        *op_errstr = gf_strdup(msg);
+    gf_msg_debug(this->name, 0, "Returning %d", ret);
+    return ret;
+}
+
+/* Stage (validation) step of the volume clear-locks operation.
+ *
+ * Requires "volname", "path", "kind" and "type" to be present in @dict, the
+ * volume to exist, the volume id carried in @dict to validate, and the
+ * volume to be started.
+ *
+ * Returns 0 when every check passes and non-zero otherwise. Except for a
+ * volume-id validation failure, each failure stores a duplicated message in
+ * *op_errstr.
+ */
+int
+glusterd_op_stage_clearlocks_volume(dict_t *dict, char **op_errstr)
+{
+    int ret = -1;
+    char *volname = NULL;
+    char *path = NULL;
+    char *type = NULL;
+    char *kind = NULL;
+    glusterd_volinfo_t *volinfo = NULL;
+    char msg[2048] = {
+        0,
+    };
+
+    ret = dict_get_strn(dict, "volname", SLEN("volname"), &volname);
+    if (ret) {
+        snprintf(msg, sizeof(msg), "Failed to get volume name");
+        gf_msg(THIS->name, GF_LOG_ERROR, 0, GD_MSG_DICT_GET_FAILED, "%s", msg);
+        *op_errstr = gf_strdup(msg);
+        goto out;
+    }
+
+    /* path, kind and type are fetched only to verify that they are present;
+     * their values are not used by this stage.
+     */
+    ret = dict_get_strn(dict, "path", SLEN("path"), &path);
+    if (ret) {
+        snprintf(msg, sizeof(msg), "Failed to get path");
+        gf_msg(THIS->name, GF_LOG_ERROR, 0, GD_MSG_DICT_GET_FAILED, "%s", msg);
+        *op_errstr = gf_strdup(msg);
+        goto out;
+    }
+
+    ret = dict_get_strn(dict, "kind", SLEN("kind"), &kind);
+    if (ret) {
+        snprintf(msg, sizeof(msg), "Failed to get kind");
+        gf_msg("glusterd", GF_LOG_ERROR, 0, GD_MSG_DICT_GET_FAILED, "%s", msg);
+        *op_errstr = gf_strdup(msg);
+        goto out;
+    }
+
+    ret = dict_get_strn(dict, "type", SLEN("type"), &type);
+    if (ret) {
+        snprintf(msg, sizeof(msg), "Failed to get type");
+        gf_msg("glusterd", GF_LOG_ERROR, 0, GD_MSG_DICT_GET_FAILED, "%s", msg);
+        *op_errstr = gf_strdup(msg);
+        goto out;
+    }
+
+    ret = glusterd_volinfo_find(volname, &volinfo);
+    if (ret) {
+        snprintf(msg, sizeof(msg), "Volume %s does not exist", volname);
+        gf_msg("glusterd", GF_LOG_ERROR, 0, GD_MSG_VOL_NOT_FOUND, "%s", msg);
+        *op_errstr = gf_strdup(msg);
+        goto out;
+    }
+
+    ret = glusterd_validate_volume_id(dict, volinfo);
+    if (ret)
+        goto out;
+
+    if (!glusterd_is_volume_started(volinfo)) {
+        snprintf(msg, sizeof(msg), "Volume %s is not started", volname);
+        gf_msg("glusterd", GF_LOG_ERROR, 0, GD_MSG_VOL_NOT_STARTED, "%s", msg);
+        *op_errstr = gf_strdup(msg);
+        /* ret is 0 here (volume-id validation succeeded); without this the
+         * stage would report success for a stopped volume.
+         */
+        ret = -1;
+        goto out;
+    }
+
+    ret = 0;
+out:
+    gf_msg_debug("glusterd", 0, "Returning %d", ret);
+    return ret;
+}
+
+int
+glusterd_op_create_volume(dict_t *dict, char **op_errstr)
+{
+    int ret = 0;
+    char *volname = NULL;
+    glusterd_conf_t *priv = NULL;
+    glusterd_volinfo_t *volinfo = NULL;
+    gf_boolean_t vol_added = _gf_false;
+    glusterd_brickinfo_t *brickinfo = NULL;
+    glusterd_brickinfo_t *ta_brickinfo = NULL;
+    xlator_t *this = NULL;
+    char *brick = NULL;
+    char *ta_brick = NULL;
+    int32_t count = 0;
+    int32_t i = 1;
+    char *bricks = NULL;
+    char *ta_bricks = NULL;
+    char *brick_list = NULL;
+    char *ta_brick_list = NULL;
+    char *free_ptr = NULL;
+    char *ta_free_ptr = NULL;
+    char *saveptr = NULL;
+    char *ta_saveptr = NULL;
+    char *trans_type = NULL;
+    char *str = NULL;
+    char *username = NULL;
+    char *password = NULL;
+    int brickid = 0;
+    char msg[1024] __attribute__((unused)) = {
+        0,
+    };
+    char *brick_mount_dir = NULL;
+    char key[64] = "";
+    char *address_family_str = NULL;
+    struct statvfs brickstat = {
+        0,
+    };
+
+    this = THIS;
+    GF_ASSERT(this);
+
+    priv = this->private;
+    GF_ASSERT(priv);
+
+    ret = glusterd_volinfo_new(&volinfo);
+
+    if (ret) {
+        gf_msg(this->name, GF_LOG_ERROR, ENOMEM, GD_MSG_NO_MEMORY,
+               "Unable to allocate memory for volinfo");
+        goto out;
+    }
+
+    ret = dict_get_strn(dict, "volname", SLEN("volname"), &volname);
+
+    if (ret) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_DICT_GET_FAILED,
+               "Unable to get volume name");
+        goto out;
+    }
+
+    if (snprintf(volinfo->volname, sizeof(volinfo->volname), "%s", volname) >=
+        sizeof(volinfo->volname)) {
+        ret = -1;
+        goto out;
+    }
+
+    GF_ASSERT(volinfo->volname);
+
+    ret = dict_get_int32n(dict, "type", SLEN("type"), &volinfo->type);
+    if (ret) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_DICT_GET_FAILED,
+               "Unable to get type of volume"
+               " %s",
+               volname);
+        goto out;
+    }
+
+    ret = dict_get_int32n(dict, "count", SLEN("count"), &volinfo->brick_count);
+    if (ret) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_DICT_GET_FAILED,
+               "Unable to get brick count of"
+               " volume %s",
+               volname);
+        goto out;
+    }
+
+    ret = dict_get_int32n(dict, "port", SLEN("port"), &volinfo->port);
+    if (ret) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_DICT_GET_FAILED,
+               "Unable to get port");
+        goto out;
+    }
+
+    ret = dict_get_strn(dict, "bricks", SLEN("bricks"), &bricks);
+    if (ret) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_DICT_GET_FAILED,
+               "Unable to get bricks for "
+               "volume %s",
+               volname);
+        goto out;
+    }
+
+    /* replica-count 1 means, no replication, file is in one brick only */
+    volinfo->replica_count = 1;
+    /* stripe-count 1 means, no striping, file is present as a whole */
+    volinfo->stripe_count = 1;
+
+    if (GF_CLUSTER_TYPE_REPLICATE == volinfo->type) {
+        /* performance.client-io-threads is turned on to default,
+         * however this has adverse effects on replicate volumes due to
+         * replication design issues, till that get addressed
+         * performance.client-io-threads option is turned off for all
+         * replicate volumes
+         */
+        if (priv->op_version >= GD_OP_VERSION_3_12_2) {
+            ret = dict_set_nstrn(volinfo->dict, "performance.client-io-threads",
+                                 SLEN("performance.client-io-threads"), "off",
+                                 SLEN("off"));
+            if (ret) {
+                gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_DICT_SET_FAILED,
+                       "Failed to set "
+                       "performance.client-io-threads to off");
+                goto out;
+            }
+        }
+        ret = dict_get_int32n(dict, "replica-count", SLEN("replica-count"),
+                              &volinfo->replica_count);
+        if (ret) {
+            gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_DICT_GET_FAILED,
+                   "Failed to get "
+                   "replica count for volume %s",
+                   volname);
+            goto out;
+        }
+
+        /* coverity[unused_value] arbiter count is optional */
+        ret = dict_get_int32n(dict, "arbiter-count", SLEN("arbiter-count"),
+                              &volinfo->arbiter_count);
+        ret = dict_get_int32n(dict, "thin-arbiter-count",
+                              SLEN("thin-arbiter-count"),
+                              &volinfo->thin_arbiter_count);
+        if (volinfo->thin_arbiter_count) {
+            ret = dict_get_strn(dict, "ta-brick", SLEN("ta-brick"), &ta_bricks);
+            if (ret) {
+                gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_DICT_GET_FAILED,
+                       "Unable to get thin arbiter brick for "
+                       "volume %s",
+                       volname);
+                goto out;
+            }
+        }
+
+    } else if (GF_CLUSTER_TYPE_DISPERSE == volinfo->type) {
+        ret = dict_get_int32n(dict, "disperse-count", SLEN("disperse-count"),
+                              &volinfo->disperse_count);
+        if (ret) {
+            gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_DICT_GET_FAILED,
+                   "Failed to get "
+                   "disperse count for volume %s",
+                   volname);
+            goto out;
+        }
+        ret = dict_get_int32n(dict, "redundancy-count",
+                              SLEN("redundancy-count"),
+                              &volinfo->redundancy_count);
+        if (ret) {
+            gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_DICT_GET_FAILED,
+                   "Failed to get "
+                   "redundancy count for volume %s",
+                   volname);
+            goto out;
+        }
+        if (priv->op_version < GD_OP_VERSION_3_6_0) {
+            gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_UNSUPPORTED_VERSION,
+                   "Disperse volume "
+                   "needs op-version 3.6.0 or higher");
+            ret = -1;
+            goto out;
+        }
+    }
+
+    /* dist-leaf-count is the count of brick nodes for a given
+       subvolume of distribute */
+    volinfo->dist_leaf_count = glusterd_get_dist_leaf_count(volinfo);
+
+    /* subvol_count is the count of number of subvolumes present
+       for a given distribute volume */
+    volinfo->subvol_count = (volinfo->brick_count / volinfo->dist_leaf_count);
+
+    /* Keep sub-count same as earlier, for the sake of backward
+       compatibility */
+    if (volinfo->dist_leaf_count > 1)
+        volinfo->sub_count = volinfo->dist_leaf_count;
+
+    ret = dict_get_strn(dict, "transport", SLEN("transport"), &trans_type);
+    if (ret) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_DICT_GET_FAILED,
+               "Unable to get transport type of volume %s", volname);
+        goto out;
+    }
+
+    ret = dict_get_strn(dict, "volume-id", SLEN("volume-id"), &str);
+    if (ret) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_DICT_GET_FAILED,
+               "Unable to get volume-id of volume %s", volname);
+        goto out;
+    }
+    ret = gf_uuid_parse(str, volinfo->volume_id);
+    if (ret) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_UUID_PARSE_FAIL,
+               "unable to parse uuid %s of volume %s", str, volname);
+        goto out;
+    }
+
+    ret = dict_get_strn(dict, "internal-username", SLEN("internal-username"),
+                        &username);
+    if (ret) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_DICT_GET_FAILED,
+               "unable to get internal username of volume %s", volname);
+        goto out;
+    }
+    glusterd_auth_set_username(volinfo, username);
+
+    ret = dict_get_strn(dict, "internal-password", SLEN("internal-password"),
+                        &password);
+    if (ret) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_DICT_GET_FAILED,
+               "unable to get internal password of volume %s", volname);
+        goto out;
+    }
+    glusterd_auth_set_password(volinfo, password);
+
+    if (strcasecmp(trans_type, "rdma") == 0) {
+        volinfo->transport_type = GF_TRANSPORT_RDMA;
+    } else if (strcasecmp(trans_type, "tcp") == 0) {
+        volinfo->transport_type = GF_TRANSPORT_TCP;
+    } else {
+        volinfo->transport_type = GF_TRANSPORT_BOTH_TCP_RDMA;
+    }
+
+    if (ta_bricks) {
+        ta_brick_list = gf_strdup(ta_bricks);
+        ta_free_ptr = ta_brick_list;
+    }
+
+    if (volinfo->thin_arbiter_count) {
+        ta_brick = strtok_r(ta_brick_list + 1, " \n", &ta_saveptr);
+
+        count = 1;
+        brickid = volinfo->replica_count;
+        /* assign brickid to ta_bricks
+         * Following loop runs for number of subvols times. Although
+         * there is only one ta-brick for a volume but the volume fuse volfile
+         * requires an entry of ta-brick for each subvolume. Also, the ta-brick
+         * id needs to be adjusted according to the subvol count.
+         * For eg- For first subvolume ta-brick id is volname-ta-2, for second
+         * subvol ta-brick id is volname-ta-5.
+         */
+        while (count <= volinfo->subvol_count) {
+            ret = glusterd_brickinfo_new_from_brick(ta_brick, &ta_brickinfo,
+                                                    _gf_false, op_errstr);
+            if (ret)
+                goto out;
+
+            GLUSTERD_ASSIGN_BRICKID_TO_TA_BRICKINFO(ta_brickinfo, volinfo,
+                                                    brickid);
+            cds_list_add_tail(&ta_brickinfo->brick_list, &volinfo->ta_bricks);
+            count++;
+            brickid += volinfo->replica_count + 1;
+        }
+    }
+
+    if (bricks) {
+        brick_list = gf_strdup(bricks);
+        free_ptr = brick_list;
+    }
+
+    count = volinfo->brick_count;
+
+    if (count)
+        brick = strtok_r(brick_list + 1, " \n", &saveptr);
+
+    brickid = glusterd_get_next_available_brickid(volinfo);
+    if (brickid < 0)
+        goto out;
+    while (i <= count) {
+        ret = glusterd_brickinfo_new_from_brick(brick, &brickinfo, _gf_true,
+                                                op_errstr);
+        if (ret)
+            goto out;
+        if (volinfo->thin_arbiter_count == 1 &&
+            (brickid + 1) % (volinfo->replica_count + 1) == 0) {
+            brickid = brickid + 1;
+        }
+        GLUSTERD_ASSIGN_BRICKID_TO_BRICKINFO(brickinfo, volinfo, brickid++);
+
+        ret = glusterd_resolve_brick(brickinfo);
+        if (ret) {
+            gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_RESOLVE_BRICK_FAIL,
+                   FMTSTR_RESOLVE_BRICK, brickinfo->hostname, brickinfo->path);
+            goto out;
+        }
+
+        /* A bricks mount dir is required only by snapshots which were
+         * introduced in gluster-3.6.0
+         */
+        if (priv->op_version >= GD_OP_VERSION_3_6_0) {
+            brick_mount_dir = NULL;
+            ret = snprintf(key, sizeof(key), "brick%d.mount_dir", i);
+            ret = dict_get_strn(dict, key, ret, &brick_mount_dir);
+            if (ret) {
+                gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_DICT_GET_FAILED,
+                       "%s not present", key);
+                goto out;
+            }
+            snprintf(brickinfo->mount_dir, sizeof(brickinfo->mount_dir), "%s",
+                     brick_mount_dir);
+        }
+
+        if (!gf_uuid_compare(brickinfo->uuid, MY_UUID)) {
+            ret = sys_statvfs(brickinfo->path, &brickstat);
+            if (ret) {
+                gf_log("brick-op", GF_LOG_ERROR,
+                       "Failed to fetch disk"
+                       " utilization from the brick (%s:%s). Please "
+                       "check health of the brick. Error code was %s",
+                       brickinfo->hostname, brickinfo->path, strerror(errno));
+                goto out;
+            }
+            brickinfo->statfs_fsid = brickstat.f_fsid;
+        }
+
+        cds_list_add_tail(&brickinfo->brick_list, &volinfo->bricks);
+        brick = strtok_r(NULL, " \n", &saveptr);
+        i++;
+    }
+
+    ret = glusterd_enable_default_options(volinfo, NULL);
+    if (ret) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_FAIL_DEFAULT_OPT_SET,
+               "Failed to set default "
+               "options on create for volume %s",
+               volinfo->volname);
+        goto out;
+    }
+
+    ret = dict_get_strn(dict, "transport.address-family",
+                        SLEN("transport.address-family"), &address_family_str);
+
+    if (!ret) {
+        ret = dict_set_dynstr_with_alloc(
+            volinfo->dict, "transport.address-family", address_family_str);
+        if (ret) {
+            gf_log(this->name, GF_LOG_ERROR,
+                   "Failed to set transport.address-family for %s",
+                   volinfo->volname);
+            goto out;
+        }
+    }
+
+    gd_update_volume_op_versions(volinfo);
+
+    ret = glusterd_store_volinfo(volinfo, GLUSTERD_VOLINFO_VER_AC_INCREMENT);
+    if (ret) {
+        glusterd_store_delete_volume(volinfo);
+        *op_errstr = gf_strdup(
+            "Failed to store the "
+            "Volume information");
+        goto out;
+    }
+
+    ret = glusterd_create_volfiles_and_notify_services(volinfo);
+    if (ret) {
+        *op_errstr = gf_strdup("Failed to create volume files");
+        goto out;
+    }
+
+    volinfo->rebal.defrag_status = 0;
+    glusterd_list_add_order(&volinfo->vol_list, &priv->volumes,
+                            glusterd_compare_volume_name);
+    vol_added = _gf_true;
+
+out:
+    GF_FREE(free_ptr);
+    GF_FREE(ta_free_ptr);
+    if (!vol_added && volinfo)
+        glusterd_volinfo_unref(volinfo);
+    return ret;
+}
+
+/* Start all bricks of a volume, mark the volume as started and persist the
+ * volinfo.
+ *
+ * @volinfo: volume whose bricks are started
+ * @flags:   CLI flags; GF_CLI_FLAG_OP_FORCE keeps going past brick failures
+ * @wait:    forwarded unchanged to glusterd_brick_start()
+ *
+ * Returns 0 on success. Without force, the first brick start failure is
+ * returned; otherwise the result of glusterd_store_volinfo() is returned.
+ */
+int
+glusterd_start_volume(glusterd_volinfo_t *volinfo, int flags, gf_boolean_t wait)
+{
+    int ret = 0;
+    int force = 0;
+    glusterd_brickinfo_t *brickinfo = NULL;
+    xlator_t *this = THIS;
+    glusterd_volinfo_ver_ac_t verincrement = GLUSTERD_VOLINFO_VER_AC_NONE;
+
+    GF_ASSERT(this);
+    GF_ASSERT(volinfo);
+
+    force = flags & GF_CLI_FLAG_OP_FORCE;
+
+    cds_list_for_each_entry(brickinfo, &volinfo->bricks, brick_list)
+    {
+        /* With 'force', reset start_triggered first: if the brick was
+         * brought down through the gf_attach utility, the flag would not
+         * have been updated to _gf_false.
+         */
+        if (force)
+            brickinfo->start_triggered = _gf_false;
+
+        ret = glusterd_brick_start(volinfo, brickinfo, wait, _gf_false);
+
+        /* With 'force', try every brick regardless of individual results */
+        if (ret && !force)
+            goto out;
+    }
+
+    /* Bump the volinfo version only when the status actually changes.
+     * 'force' can be used to restart dead bricks of a volume that is
+     * already in GLUSTERD_STATUS_STARTED, and that must not increment
+     * the version.
+     */
+    verincrement = (volinfo->status == GLUSTERD_STATUS_STARTED)
+                       ? GLUSTERD_VOLINFO_VER_AC_NONE
+                       : GLUSTERD_VOLINFO_VER_AC_INCREMENT;
+
+    glusterd_set_volume_status(volinfo, GLUSTERD_STATUS_STARTED);
+
+    /* Store the volinfo inside the critical section, because
+     * attach_brick_callback can also call store_volinfo for the same
+     * volume.
+     */
+    /* coverity[ORDER_REVERSAL] */
+    LOCK(&volinfo->lock);
+    ret = glusterd_store_volinfo(volinfo, verincrement);
+    UNLOCK(&volinfo->lock);
+    if (ret)
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_VOLINFO_SET_FAIL,
+               "Failed to store volinfo of "
+               "%s volume",
+               volinfo->volname);
+
+out:
+    gf_msg_trace(this->name, 0, "returning %d ", ret);
+    return ret;
+}
+
+/* Commit handler for "volume start".
+ *
+ * Looks up the volume named in @dict, fills in missing brick mount
+ * directories for local bricks, disables gluster-nfs on the volume when the
+ * global NFS-Ganesha option is enabled, starts the bricks and finally runs
+ * the per-volume service managers.
+ *
+ * @dict:      operation dictionary (volume name, flags, brickN.mount_dir)
+ * @op_errstr: not written by this function
+ *
+ * Returns 0 on success, non-zero on failure.
+ */
+int
+glusterd_op_start_volume(dict_t *dict, char **op_errstr)
+{
+    int ret = 0;
+    int32_t brick_count = 0;
+    char *brick_mount_dir = NULL;
+    char key[64] = "";
+    char *volname = NULL;
+    int flags = 0;
+    glusterd_volinfo_t *volinfo = NULL;
+    glusterd_brickinfo_t *brickinfo = NULL;
+    xlator_t *this = NULL;
+    glusterd_conf_t *conf = NULL;
+    glusterd_svc_t *svc = NULL;
+    char *str = NULL;
+    gf_boolean_t option = _gf_false;
+
+    this = THIS;
+    GF_ASSERT(this);
+    conf = this->private;
+    GF_ASSERT(conf);
+
+    ret = glusterd_op_start_volume_args_get(dict, &volname, &flags);
+    if (ret)
+        goto out;
+
+    ret = glusterd_volinfo_find(volname, &volinfo);
+    if (ret) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_VOL_NOT_FOUND,
+               FMTSTR_CHECK_VOL_EXISTS, volname);
+        goto out;
+    }
+
+    /* This is an incremental approach to have all the volinfo objects ref
+     * counted. The first attempt is made in the volume start transaction to
+     * ensure it doesn't race with import volume where a stale volume is
+     * deleted. There are multiple instances of GlusterD crashing in
+     * bug-948686.t because of this. Once this approach is foolproof, all
+     * other volinfo objects will be refcounted.
+     */
+    glusterd_volinfo_ref(volinfo);
+
+    /* A brick's mount dir is required only by snapshots, which were
+     * introduced in gluster-3.6.0
+     */
+    if (conf->op_version >= GD_OP_VERSION_3_6_0) {
+        cds_list_for_each_entry(brickinfo, &volinfo->bricks, brick_list)
+        {
+            /* brick_count is the 1-based brick index used in dict keys */
+            brick_count++;
+            /* Don't check bricks that are not owned by you
+             */
+            if (gf_uuid_compare(brickinfo->uuid, MY_UUID))
+                continue;
+            if (strlen(brickinfo->mount_dir) < 1) {
+                brick_mount_dir = NULL;
+                /* snprintf's return value doubles as the key length */
+                ret = snprintf(key, sizeof(key), "brick%d.mount_dir",
+                               brick_count);
+                ret = dict_get_strn(dict, key, ret, &brick_mount_dir);
+                if (ret) {
+                    gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_DICT_GET_FAILED,
+                           "%s not present", key);
+                    goto out;
+                }
+                /* Reject a mount dir that would be truncated */
+                if (snprintf(brickinfo->mount_dir, sizeof(brickinfo->mount_dir),
+                             "%s",
+                             brick_mount_dir) >= sizeof(brickinfo->mount_dir)) {
+                    ret = -1;
+                    goto out;
+                }
+            }
+        }
+    }
+
+    ret = dict_get_str(conf->opts, GLUSTERD_STORE_KEY_GANESHA_GLOBAL, &str);
+    if (ret != 0) {
+        /* A missing global option is not an error */
+        gf_msg(this->name, GF_LOG_INFO, 0, GD_MSG_DICT_GET_FAILED,
+               "Global dict not present.");
+        ret = 0;
+
+    } else {
+        /* On a parse failure 'option' keeps its _gf_false initial value;
+         * ret is reassigned before it is next examined.
+         */
+        ret = gf_string2boolean(str, &option);
+        /* Check if the feature is enabled and set nfs-disable to true */
+        if (option) {
+            gf_msg_debug(this->name, 0, "NFS-Ganesha is enabled");
+            /* Gluster-nfs should not start when NFS-Ganesha is enabled */
+            ret = dict_set_str(volinfo->dict, NFS_DISABLE_MAP_KEY, "on");
+            if (ret) {
+                gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_DICT_SET_FAILED,
+                       "Failed to set nfs.disable for "
+                       "volume %s",
+                       volname);
+                goto out;
+            }
+        }
+    }
+
+    ret = glusterd_start_volume(volinfo, flags, _gf_true);
+    if (ret)
+        goto out;
+
+    if (!volinfo->is_snap_volume) {
+        svc = &(volinfo->snapd.svc);
+        ret = svc->manager(svc, volinfo, PROC_START_NO_WAIT);
+        if (ret)
+            goto out;
+    }
+
+    /* A gfproxyd manager failure does not fail the operation (its result
+     * is replaced by glusterd_svcs_manager() below), but leave a trace of
+     * it instead of dropping it silently.
+     */
+    svc = &(volinfo->gfproxyd.svc);
+    ret = svc->manager(svc, volinfo, PROC_START_NO_WAIT);
+    if (ret)
+        gf_msg_debug(this->name, 0,
+                     "gfproxyd service manager failed for volume %s", volname);
+
+    ret = glusterd_svcs_manager(volinfo);
+
+out:
+    /* Drop the reference taken right after the lookup */
+    if (volinfo)
+        glusterd_volinfo_unref(volinfo);
+
+    gf_msg_trace(this->name, 0, "returning %d ", ret);
+    return ret;
+}
+
+/* Stop every brick of @volinfo, mark the volume as stopped, persist the
+ * volinfo and run the service managers.
+ *
+ * Returns 0 on success; -1 when @volinfo is NULL; otherwise the first
+ * non-zero result of the failing step.
+ */
+int
+glusterd_stop_volume(glusterd_volinfo_t *volinfo)
+{
+    int ret = -1;
+    glusterd_brickinfo_t *brickinfo = NULL;
+    glusterd_svc_t *svc = NULL;
+    xlator_t *this = THIS;
+
+    GF_ASSERT(this);
+
+    GF_VALIDATE_OR_GOTO(this->name, volinfo, out);
+
+    /* Abort on the first brick that cannot be stopped */
+    cds_list_for_each_entry(brickinfo, &volinfo->bricks, brick_list)
+    {
+        ret = glusterd_brick_stop(volinfo, brickinfo, _gf_false);
+        if (!ret)
+            continue;
+
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_BRICK_STOP_FAIL,
+               "Failed to stop "
+               "brick (%s)",
+               brickinfo->path);
+        goto out;
+    }
+
+    glusterd_set_volume_status(volinfo, GLUSTERD_STATUS_STOPPED);
+
+    ret = glusterd_store_volinfo(volinfo, GLUSTERD_VOLINFO_VER_AC_INCREMENT);
+    if (ret) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_VOLINFO_SET_FAIL,
+               "Failed to store volinfo of "
+               "%s volume",
+               volinfo->volname);
+        goto out;
+    }
+
+    /* Snapshot volumes skip the snapd manager */
+    if (!volinfo->is_snap_volume) {
+        svc = &(volinfo->snapd.svc);
+        ret = svc->manager(svc, volinfo, PROC_START_NO_WAIT);
+        if (ret)
+            goto out;
+    }
+
+    ret = glusterd_svcs_manager(volinfo);
+    if (ret)
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_VOL_GRAPH_CHANGE_NOTIFY_FAIL,
+               "Failed to notify graph "
+               "change for %s volume",
+               volinfo->volname);
+
+out:
+    return ret;
+}
+
+/* Commit handler for "volume stop": resolve the volume named in @dict and
+ * stop it through glusterd_stop_volume().
+ *
+ * Returns 0 on success, non-zero on failure.
+ */
+int
+glusterd_op_stop_volume(dict_t *dict)
+{
+    xlator_t *this = THIS;
+    glusterd_volinfo_t *volinfo = NULL;
+    char *volname = NULL;
+    int flags = 0;
+    int ret = 0;
+
+    GF_ASSERT(this);
+
+    ret = glusterd_op_stop_volume_args_get(dict, &volname, &flags);
+    if (ret)
+        goto out;
+
+    ret = glusterd_volinfo_find(volname, &volinfo);
+    if (ret) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_VOL_NOT_FOUND,
+               FMTSTR_CHECK_VOL_EXISTS, volname);
+        goto out;
+    }
+
+    ret = glusterd_stop_volume(volinfo);
+    if (ret)
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_VOL_STOP_FAILED,
+               "Failed to stop %s volume", volname);
+
+out:
+    gf_msg_trace(this->name, 0, "returning %d ", ret);
+    return ret;
+}
+
+/* Commit handler for "volume delete": resolve the volume named in @dict,
+ * turn off its ganesha export where applicable and delete the volume.
+ *
+ * Returns 0 on success, non-zero on failure. A failure to remove the
+ * ganesha export configuration is only logged as a warning.
+ */
+int
+glusterd_op_delete_volume(dict_t *dict)
+{
+    xlator_t *this = THIS;
+    glusterd_volinfo_t *volinfo = NULL;
+    char *volname = NULL;
+    int ret = 0;
+
+    GF_ASSERT(this);
+
+    ret = dict_get_strn(dict, "volname", SLEN("volname"), &volname);
+    if (ret) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_DICT_GET_FAILED,
+               "Unable to get volume name");
+        goto out;
+    }
+
+    ret = glusterd_volinfo_find(volname, &volinfo);
+    if (ret) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_VOL_NOT_FOUND,
+               FMTSTR_CHECK_VOL_EXISTS, volname);
+        goto out;
+    }
+
+    /* The export config is handled only when both checks hold */
+    if (glusterd_check_ganesha_export(volinfo) && is_origin_glusterd(dict)) {
+        if (manage_export_config(volname, "off", NULL))
+            gf_msg(this->name, GF_LOG_WARNING, 0, 0,
+                   "Could not delete ganesha export conf file "
+                   "for %s",
+                   volname);
+    }
+
+    ret = glusterd_delete_volume(volinfo);
+
+out:
+    gf_msg_debug(this->name, 0, "returning %d", ret);
+    return ret;
+}
+
+/* Commit handler for "volume heal". Nothing is done here: the necessary
+ * subtasks of heal are completed in the brick op. Always returns 0.
+ */
+int
+glusterd_op_heal_volume(dict_t *dict, char **op_errstr)
+{
+    return 0;
+}
+
+/* Commit handler for "volume statedump".
+ *
+ * The target is chosen by substring match on the options string: "quotad",
+ * "nfs" (only when built with BUILD_GNFS), "client", and otherwise every
+ * brick of the volume.
+ *
+ * Returns the result of the selected statedump helper. For bricks this is
+ * the result of the last brick attempted, because individual brick
+ * failures do not stop the loop.
+ */
+int
+glusterd_op_statedump_volume(dict_t *dict, char **op_errstr)
+{
+    glusterd_brickinfo_t *brickinfo = NULL;
+    glusterd_volinfo_t *volinfo = NULL;
+    char *volname = NULL;
+    char *options = NULL;
+    int option_cnt = 0;
+    int ret = 0;
+
+    ret = glusterd_op_statedump_volume_args_get(dict, &volname, &options,
+                                                &option_cnt);
+    if (ret)
+        goto out;
+
+    ret = glusterd_volinfo_find(volname, &volinfo);
+    if (ret)
+        goto out;
+
+    gf_msg_debug("glusterd", 0, "Performing statedump on volume %s", volname);
+
+    /* Every branch falls through to 'out' with its own result in ret */
+    if (strstr(options, "quotad")) {
+        ret = glusterd_quotad_statedump(options, option_cnt, op_errstr);
+#ifdef BUILD_GNFS
+    } else if (strstr(options, "nfs") != NULL) {
+        ret = glusterd_nfs_statedump(options, option_cnt, op_errstr);
+#endif
+    } else if (strstr(options, "client")) {
+        ret = glusterd_client_statedump(volname, options, option_cnt,
+                                        op_errstr);
+    } else {
+        cds_list_for_each_entry(brickinfo, &volinfo->bricks, brick_list)
+        {
+            ret = glusterd_brick_statedump(volinfo, brickinfo, options,
+                                           option_cnt, op_errstr);
+            if (!ret)
+                continue;
+
+            /* Keep going: take the statedump of the other bricks even
+             * if this one failed.
+             */
+            gf_msg(THIS->name, GF_LOG_WARNING, 0, GD_MSG_BRK_STATEDUMP_FAIL,
+                   "could not "
+                   "take the statedump of the brick %s:%s."
+                   " Proceeding to other bricks",
+                   brickinfo->hostname, brickinfo->path);
+        }
+    }
+
+out:
+    return ret;
+}
+
+/* Issue the clear-locks command as an lgetxattr() on "<mntpt>/<path>".
+ *
+ * @volinfo: unused
+ * @cmd:     xattr name encoding the clear-locks command
+ * @path:    path of the target, relative to the mount point
+ * @result:  output buffer; must be at least PATH_MAX bytes. On success it
+ *           holds the NUL-terminated xattr value.
+ * @errstr:  buffer receiving a human readable reason on failure
+ * @err_len: size of @errstr
+ * @mntpt:   mount point of the maintenance client
+ *
+ * Returns 0 on success, a negative value on failure.
+ */
+int
+glusterd_clearlocks_send_cmd(glusterd_volinfo_t *volinfo, char *cmd, char *path,
+                             char *result, char *errstr, int err_len,
+                             char *mntpt)
+{
+    int ret = -1;
+    int32_t len = 0;
+    char abspath[PATH_MAX] = {
+        0,
+    };
+
+    /* Refuse a truncated path: querying it could hit a different file */
+    len = snprintf(abspath, sizeof(abspath), "%s/%s", mntpt, path);
+    if ((len < 0) || (len >= sizeof(abspath))) {
+        snprintf(errstr, err_len,
+                 "clear-locks getxattr command "
+                 "failed. Reason: %s",
+                 strerror(ENAMETOOLONG));
+        gf_msg_debug(THIS->name, 0, "%s", errstr);
+        ret = -1;
+        goto out;
+    }
+
+    /* Leave room for a terminator: the value is later used as a C string
+     * and lgetxattr() does not NUL-terminate what it stores.
+     */
+    ret = sys_lgetxattr(abspath, cmd, result, PATH_MAX - 1);
+    if (ret < 0) {
+        snprintf(errstr, err_len,
+                 "clear-locks getxattr command "
+                 "failed. Reason: %s",
+                 strerror(errno));
+        gf_msg_debug(THIS->name, 0, "%s", errstr);
+        goto out;
+    }
+    result[ret] = '\0';
+
+    ret = 0;
+out:
+    return ret;
+}
+
+/* Remove the clear-locks mount directory @mntpt (@volinfo is unused).
+ * Returns the result of sys_rmdir(); a failure is logged at debug level.
+ */
+int
+glusterd_clearlocks_rmdir_mount(glusterd_volinfo_t *volinfo, char *mntpt)
+{
+    int rc = sys_rmdir(mntpt);
+
+    if (rc != 0)
+        gf_msg_debug(THIS->name, 0, "rmdir failed");
+
+    return rc;
+}
+
+/* Force-unmount the clear-locks maintenance client at @mntpt (@volinfo is
+ * unused). The big lock is released while the umount command runs.
+ *
+ * umount failures are ignored. Using stat we could have avoided attempting
+ * to unmount a non-existent filesystem, but a failure of stat() on the
+ * mount can be due to network failures.
+ */
+void
+glusterd_clearlocks_unmount(glusterd_volinfo_t *volinfo, char *mntpt)
+{
+    glusterd_conf_t *priv = THIS->private;
+    runner_t runner = {
+        0,
+    };
+    int rc = 0;
+
+    runinit(&runner);
+    runner_add_args(&runner, _PATH_UMOUNT, "-f", NULL);
+    runner_argprintf(&runner, "%s", mntpt);
+
+    synclock_unlock(&priv->big_lock);
+    rc = runner_run(&runner);
+    synclock_lock(&priv->big_lock);
+
+    if (rc != 0)
+        gf_msg_debug("glusterd", 0, "umount failed on maintenance client");
+}
+
+/* Create a temporary directory "/tmp/<volname>.XXXXXX" to be used as the
+ * mount point of the clear-locks maintenance client.
+ *
+ * @volinfo: volume whose name prefixes the directory name
+ * @mntpt:   on success, set to a newly allocated copy of the directory
+ *           path (the caller frees it); left untouched on failure
+ *
+ * Returns 0 on success, -1 on failure.
+ */
+int
+glusterd_clearlocks_create_mount(glusterd_volinfo_t *volinfo, char **mntpt)
+{
+    int ret = -1;
+    char template[PATH_MAX] = {
+        0,
+    };
+    char *tmpl = NULL;
+    char *dup = NULL;
+
+    snprintf(template, sizeof(template), "/tmp/%s.XXXXXX", volinfo->volname);
+    tmpl = mkdtemp(template);
+    if (!tmpl) {
+        gf_msg_debug(THIS->name, 0,
+                     "Couldn't create temporary "
+                     "mount directory. Reason %s",
+                     strerror(errno));
+        goto out;
+    }
+
+    dup = gf_strdup(tmpl);
+    if (!dup) {
+        /* Without a copy of the path the caller could neither mount on
+         * nor remove the directory, so do not leave it behind.
+         */
+        sys_rmdir(tmpl);
+        goto out;
+    }
+
+    *mntpt = dup;
+    ret = 0;
+out:
+    return ret;
+}
+
+/* Start a glusterfs maintenance client for @volinfo mounted on @mntpt.
+ *
+ * @volinfo: volume to mount
+ * @xl_opts: NULL-terminated list of extra --xlator-option values; at most
+ *           brick_count entries are consumed
+ * @mntpt:   directory to mount on
+ *
+ * The client uses the trusted client volfile and is started with the three
+ * replicate self-heal options switched off. The big lock is released while
+ * the command runs. Returns the result of runner_run().
+ */
+int
+glusterd_clearlocks_mount(glusterd_volinfo_t *volinfo, char **xl_opts,
+                          char *mntpt)
+{
+    char self_heal_opts[3][1024] = {"*replicate*.data-self-heal=off",
+                                    "*replicate*.metadata-self-heal=off",
+                                    "*replicate*.entry-self-heal=off"};
+    char client_volfpath[PATH_MAX] = {
+        0,
+    };
+    runner_t runner = {
+        0,
+    };
+    glusterd_conf_t *priv = THIS->private;
+    int ret = -1;
+    int i = 0;
+
+    runinit(&runner);
+    glusterd_get_trusted_client_filepath(client_volfpath, volinfo,
+                                         volinfo->transport_type);
+
+    /* glusterfs -f <volfile> -l <logfile> [--mem-accounting] ... */
+    runner_add_args(&runner, SBIN_DIR "/glusterfs", "-f", NULL);
+    runner_argprintf(&runner, "%s", client_volfpath);
+    runner_add_arg(&runner, "-l");
+    runner_argprintf(&runner, "%s/%s-clearlocks-mnt.log", priv->logdir,
+                     volinfo->volname);
+    if (volinfo->memory_accounting)
+        runner_add_arg(&runner, "--mem-accounting");
+
+    /* Caller supplied options; the list ends at the first NULL entry */
+    for (i = 0; i < volinfo->brick_count; i++) {
+        if (!xl_opts[i])
+            break;
+        runner_add_arg(&runner, "--xlator-option");
+        runner_argprintf(&runner, "%s", xl_opts[i]);
+    }
+
+    for (i = 0; i < 3; i++)
+        runner_add_args(&runner, "--xlator-option", self_heal_opts[i], NULL);
+
+    /* The mount point is the last argument */
+    runner_argprintf(&runner, "%s", mntpt);
+
+    synclock_unlock(&priv->big_lock);
+    ret = runner_run(&runner);
+    synclock_lock(&priv->big_lock);
+
+    if (ret)
+        gf_msg_debug(THIS->name, 0, "Could not start glusterfs");
+    else
+        gf_msg_debug(THIS->name, 0, "Started glusterfs successfully");
+
+    return ret;
+}
+
+/* Build one "<volname>-client-<N>.remote-port=<port>" option string for
+ * every brick of @volinfo that belongs to this node.
+ *
+ * @volinfo: volume whose bricks are inspected
+ * @xl_opts: output array, filled from index 0 with newly allocated
+ *           strings; must not be NULL
+ *
+ * <N> is the position of the brick in the volume's brick list (counting
+ * all bricks, local or not). Returns 0 on success, -1 on failure; strings
+ * stored before a failure are left in @xl_opts.
+ */
+int
+glusterd_clearlocks_get_local_client_ports(glusterd_volinfo_t *volinfo,
+                                           char **xl_opts)
+{
+    glusterd_brickinfo_t *brickinfo = NULL;
+    char brickname[PATH_MAX] = {
+        0,
+    };
+    int32_t len = 0;
+    int brick_idx = -1;
+    int slot = 0;
+    int port = 0;
+    int ret = -1;
+
+    GF_ASSERT(xl_opts);
+    if (!xl_opts) {
+        gf_msg_debug(THIS->name, 0,
+                     "Should pass non-NULL "
+                     "xl_opts");
+        goto out;
+    }
+
+    cds_list_for_each_entry(brickinfo, &volinfo->bricks, brick_list)
+    {
+        /* Count every brick, but only handle the ones owned by this node */
+        brick_idx++;
+        if (gf_uuid_compare(brickinfo->uuid, MY_UUID))
+            continue;
+
+        /* RDMA-only volumes look up the brick path with ".rdma" appended */
+        if (volinfo->transport_type == GF_TRANSPORT_RDMA) {
+            len = snprintf(brickname, sizeof(brickname), "%s.rdma",
+                           brickinfo->path);
+        } else {
+            len = snprintf(brickname, sizeof(brickname), "%s", brickinfo->path);
+        }
+        if ((len < 0) || (len >= sizeof(brickname))) {
+            ret = -1;
+            goto out;
+        }
+
+        port = pmap_registry_search(THIS, brickname, GF_PMAP_PORT_BRICKSERVER,
+                                    _gf_false);
+        if (!port) {
+            ret = -1;
+            gf_msg_debug(THIS->name, 0,
+                         "Couldn't get port "
+                         " for brick %s:%s",
+                         brickinfo->hostname, brickinfo->path);
+            goto out;
+        }
+
+        ret = gf_asprintf(&xl_opts[slot], "%s-client-%d.remote-port=%d",
+                          volinfo->volname, brick_idx, port);
+        if (ret == -1) {
+            xl_opts[slot] = NULL;
+            goto out;
+        }
+        slot++;
+    }
+
+    ret = 0;
+out:
+    return ret;
+}
+
+/* Commit handler for "volume clear-locks".
+ *
+ * Builds the clear-locks xattr command from the "kind", "type" and optional
+ * "opts" values in @dict, mounts a temporary maintenance client of the
+ * volume, issues the command on "path" through lgetxattr and stores the
+ * returned summary in @rsp_dict under "lk-summary". The temporary mount and
+ * its directory are cleaned up afterwards.
+ *
+ * @dict:      operation dictionary (volname, path, kind, type, opts)
+ * @op_errstr: on failure, set to a newly allocated copy of the error text
+ * @rsp_dict:  response dictionary receiving "lk-summary"
+ *
+ * Returns 0 on success, non-zero on failure.
+ */
+int
+glusterd_op_clearlocks_volume(dict_t *dict, char **op_errstr, dict_t *rsp_dict)
+{
+    int32_t ret = -1;
+    int i = 0;
+    char *volname = NULL;
+    char *path = NULL;
+    char *kind = NULL;
+    char *type = NULL;
+    char *opts = NULL;
+    char *cmd_str = NULL;
+    char *free_ptr = NULL;
+    char msg[PATH_MAX] = {
+        0,
+    };
+    char result[PATH_MAX] = {
+        0,
+    };
+    char *mntpt = NULL;
+    char **xl_opts = NULL;
+    glusterd_volinfo_t *volinfo = NULL;
+    xlator_t *this = THIS;
+    GF_ASSERT(this);
+
+    ret = dict_get_strn(dict, "volname", SLEN("volname"), &volname);
+    if (ret) {
+        gf_smsg(this->name, GF_LOG_ERROR, 0, GD_MSG_DICT_GET_FAILED,
+                "Key=volname", NULL);
+        goto out;
+    }
+    gf_msg_debug("glusterd", 0, "Performing clearlocks on volume %s", volname);
+
+    ret = dict_get_strn(dict, "path", SLEN("path"), &path);
+    if (ret) {
+        gf_smsg(this->name, GF_LOG_ERROR, 0, GD_MSG_DICT_GET_FAILED, "Key=path",
+                NULL);
+        goto out;
+    }
+
+    ret = dict_get_strn(dict, "kind", SLEN("kind"), &kind);
+    if (ret) {
+        gf_smsg(this->name, GF_LOG_ERROR, 0, GD_MSG_DICT_GET_FAILED, "Key=kind",
+                NULL);
+        goto out;
+    }
+
+    ret = dict_get_strn(dict, "type", SLEN("type"), &type);
+    if (ret) {
+        gf_smsg(this->name, GF_LOG_ERROR, 0, GD_MSG_DICT_GET_FAILED, "Key=type",
+                NULL);
+        goto out;
+    }
+
+    /* "opts" is optional */
+    ret = dict_get_strn(dict, "opts", SLEN("opts"), &opts);
+    if (ret)
+        ret = 0;
+
+    gf_smsg(this->name, GF_LOG_INFO, 0, GD_MSG_CLRCLK_VOL_REQ_RCVD,
+            "Volume=%s, Kind=%s, Type=%s, Options=%s", volname, kind, type,
+            opts, NULL);
+
+    if (opts)
+        ret = gf_asprintf(&cmd_str, GF_XATTR_CLRLK_CMD ".t%s.k%s.%s", type,
+                          kind, opts);
+    else
+        ret = gf_asprintf(&cmd_str, GF_XATTR_CLRLK_CMD ".t%s.k%s", type, kind);
+    if (ret == -1)
+        goto out;
+
+    ret = glusterd_volinfo_find(volname, &volinfo);
+    if (ret) {
+        snprintf(msg, sizeof(msg), "Volume %s doesn't exist.", volname);
+        gf_smsg(this->name, GF_LOG_ERROR, 0, GD_MSG_VOL_NOT_FOUND, "Volume=%s",
+                volname, NULL);
+        goto out;
+    }
+
+    /* One extra slot keeps the list NULL-terminated */
+    xl_opts = GF_CALLOC(volinfo->brick_count + 1, sizeof(char *),
+                        gf_gld_mt_charptr);
+    if (!xl_opts) {
+        /* ret is 0 here from the successful lookup; report the failure */
+        ret = -1;
+        snprintf(msg, sizeof(msg),
+                 "Failed to allocate memory for "
+                 "clear-locks options");
+        gf_smsg(this->name, GF_LOG_ERROR, errno, GD_MSG_NO_MEMORY, NULL);
+        goto out;
+    }
+
+    ret = glusterd_clearlocks_get_local_client_ports(volinfo, xl_opts);
+    if (ret) {
+        snprintf(msg, sizeof(msg),
+                 "Couldn't get port numbers of "
+                 "local bricks");
+        gf_smsg(this->name, GF_LOG_ERROR, 0, GD_MSG_BRK_PORT_NUM_GET_FAIL,
+                NULL);
+        goto out;
+    }
+
+    ret = glusterd_clearlocks_create_mount(volinfo, &mntpt);
+    if (ret) {
+        snprintf(msg, sizeof(msg),
+                 "Creating mount directory "
+                 "for clear-locks failed.");
+        gf_smsg(this->name, GF_LOG_ERROR, 0,
+                GD_MSG_CLRLOCKS_MOUNTDIR_CREATE_FAIL, NULL);
+        goto out;
+    }
+
+    ret = glusterd_clearlocks_mount(volinfo, xl_opts, mntpt);
+    if (ret) {
+        snprintf(msg, sizeof(msg),
+                 "Failed to mount clear-locks "
+                 "maintenance client.");
+        gf_smsg(this->name, GF_LOG_ERROR, 0, GD_MSG_CLRLOCKS_CLNT_MOUNT_FAIL,
+                NULL);
+        /* The mount directory already exists; go through the regular
+         * cleanup so it is not left behind (umount failures are ignored).
+         */
+        goto umount;
+    }
+
+    ret = glusterd_clearlocks_send_cmd(volinfo, cmd_str, path, result, msg,
+                                       sizeof(msg), mntpt);
+    if (ret) {
+        gf_smsg(this->name, GF_LOG_ERROR, 0, GD_MSG_CLRCLK_SND_CMD_FAIL, NULL);
+        goto umount;
+    }
+
+    free_ptr = gf_strdup(result);
+    if (!free_ptr || dict_set_dynstrn(rsp_dict, "lk-summary",
+                                      SLEN("lk-summary"), free_ptr)) {
+        GF_FREE(free_ptr);
+        snprintf(msg, sizeof(msg),
+                 "Failed to set clear-locks "
+                 "result");
+        gf_smsg(this->name, GF_LOG_ERROR, 0, GD_MSG_DICT_SET_FAILED,
+                "Key=lk-summary", NULL);
+        /* Without the summary the response is incomplete: fail the op so
+         * that msg reaches the caller through op_errstr.
+         */
+        ret = -1;
+    }
+
+umount:
+    glusterd_clearlocks_unmount(volinfo, mntpt);
+
+    if (glusterd_clearlocks_rmdir_mount(volinfo, mntpt))
+        gf_smsg(this->name, GF_LOG_WARNING, 0, GD_MSG_CLRLOCKS_CLNT_UMOUNT_FAIL,
+                NULL);
+
+out:
+    if (ret)
+        *op_errstr = gf_strdup(msg);
+
+    /* xl_opts is only allocated after volinfo has been resolved */
+    if (xl_opts) {
+        for (i = 0; i < volinfo->brick_count && xl_opts[i]; i++)
+            GF_FREE(xl_opts[i]);
+        GF_FREE(xl_opts);
+    }
+
+    GF_FREE(cmd_str);
+
+    GF_FREE(mntpt);
+
+    return ret;
+}
diff --git a/xlators/mgmt/glusterd/src/glusterd-volume-set.c b/xlators/mgmt/glusterd/src/glusterd-volume-set.c
new file mode 100644
index 00000000000..398b4d76f52
--- /dev/null
+++ b/xlators/mgmt/glusterd/src/glusterd-volume-set.c
@@ -0,0 +1,3146 @@
+/*
+Copyright (c) 2013 Red Hat, Inc. <http://www.redhat.com>
+This file is part of GlusterFS.
+
+This file is licensed to you under your choice of the GNU Lesser
+General Public License, version 3 or any later version (LGPLv3 or
+later), or the GNU General Public License, version 2 (GPLv2), in all
+cases as published by the Free Software Foundation.
+*/
+
+#include <glusterfs/syscall.h>
+#include "glusterd-volgen.h"
+#include "glusterd-utils.h"
+
+/* Ensure cache-min-file-size never ends up above cache-max-file-size. */
+static int
+validate_cache_max_min_size(glusterd_volinfo_t *volinfo, dict_t *dict,
+                            char *key, char *value, char **op_errstr)
+{
+    xlator_t *this = THIS;
+    glusterd_conf_t *priv = NULL;
+    char *max_str = NULL;
+    char *min_str = NULL;
+    uint64_t max_bytes = 0;
+    uint64_t min_bytes = 0;
+    char errstr[2048] = "";
+    int ret = 0;
+
+    GF_ASSERT(this);
+    priv = this->private;
+    GF_ASSERT(priv);
+
+    if ((!strcmp(key, "performance.cache-min-file-size")) ||
+        (!strcmp(key, "cache-min-file-size"))) {
+        glusterd_volinfo_get(volinfo, "performance.cache-max-file-size",
+                             &max_str);
+        if (max_str) { /* only compare when a maximum was found */
+            min_str = value;
+            gf_string2bytesize_uint64(max_str, &max_bytes);
+            gf_string2bytesize_uint64(min_str, &min_bytes);
+        }
+    } else if ((!strcmp(key, "performance.cache-max-file-size")) ||
+               (!strcmp(key, "cache-max-file-size"))) {
+        glusterd_volinfo_get(volinfo, "performance.cache-min-file-size",
+                             &min_str);
+        if (min_str) { /* only compare when a minimum was found */
+            max_str = value;
+            gf_string2bytesize_uint64(min_str, &min_bytes);
+            gf_string2bytesize_uint64(max_str, &max_bytes);
+        }
+    }
+
+    /* Both sizes are still 0 when no comparison was set up above. */
+    if (min_bytes <= max_bytes)
+        goto out;
+    snprintf(errstr, sizeof(errstr),
+             "cache-min-file-size (%s) is greater than "
+             "cache-max-file-size (%s)",
+             min_str, max_str);
+    gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_CACHE_MINMAX_SIZE_INVALID, "%s",
+           errstr);
+    *op_errstr = gf_strdup(errstr);
+    ret = -1;
+
+out:
+    gf_msg_debug(this->name, 0, "Returning %d", ret);
+
+    return ret;
+}
+
+/* Validate a rebalance throttle value: one of the named modes
+ * lazy|normal|aggressive (matched case-insensitively), or an integer thread
+ * count between 1 and the number of processors currently online. */
+static int
+validate_defrag_throttle_option(glusterd_volinfo_t *volinfo, dict_t *dict,
+                                char *key, char *value, char **op_errstr)
+{
+    xlator_t *this = THIS;
+    long int ncores = 0;
+    int threads = 0;
+    int ret = 0;
+    char errstr[2048] = "";
+
+    GF_ASSERT(this);
+
+    ncores = sysconf(_SC_NPROCESSORS_ONLN);
+
+    /* Named modes pass as they are; a numeric value has to parse as an int
+     * and fall inside 1..ncores. Everything else is rejected. */
+    if (!strcasecmp(value, "lazy") || !strcasecmp(value, "normal") ||
+        !strcasecmp(value, "aggressive")) {
+        ret = 0;
+    } else if ((gf_string2int(value, &threads) == 0)) {
+        if ((threads <= 0) || (threads > ncores)) {
+            ret = -1;
+            snprintf(errstr, sizeof(errstr),
+                     "%s should be within"
+                     " range of 0 and maximum number of cores "
+                     "available (cores available - %ld)",
+                     key, ncores);
+
+            gf_msg(this->name, GF_LOG_ERROR, EINVAL, GD_MSG_INVALID_ENTRY, "%s",
+                   errstr);
+
+            *op_errstr = gf_strdup(errstr);
+        }
+    } else {
+        /* neither a known mode nor something that parses as an integer */
+        ret = -1;
+        snprintf(errstr, sizeof(errstr),
+                 "%s should be "
+                 "{lazy|normal|aggressive} or a number up to number of"
+                 " cores available (cores available - %ld)",
+                 key, ncores);
+        gf_msg(this->name, GF_LOG_ERROR, EINVAL, GD_MSG_INVALID_ENTRY, "%s",
+               errstr);
+        *op_errstr = gf_strdup(errstr);
+    }
+
+    gf_msg_debug(this->name, 0, "Returning %d", ret);
+
+    return ret;
+}
+
+/* Gate quota-dependent options: the value is accepted only while
+ * VKEY_FEATURES_QUOTA is enabled on the volume. A lookup failure (-1) is
+ * logged and returned as-is, without setting op_errstr. */
+static int
+validate_quota(glusterd_volinfo_t *volinfo, dict_t *dict, char *key,
+               char *value, char **op_errstr)
+{
+    xlator_t *this = THIS;
+    glusterd_conf_t *priv = NULL;
+    char errstr[2048] = "";
+    int ret = 0;
+
+    GF_ASSERT(this);
+    priv = this->private;
+    GF_ASSERT(priv);
+
+    ret = glusterd_volinfo_get_boolean(volinfo, VKEY_FEATURES_QUOTA);
+    if (ret == -1) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_QUOTA_GET_STAT_FAIL,
+               "failed to get the quota status");
+    } else if (ret == _gf_false) {
+        /* quota is off: refuse the option and tell the user why */
+        snprintf(errstr, sizeof(errstr), "Cannot set %s. Enable quota first.",
+                 key);
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_QUOTA_DISABLED, "%s",
+               errstr);
+        *op_errstr = gf_strdup(errstr);
+        ret = -1;
+    } else {
+        /* quota is on: normalise the boolean result to success */
+        ret = 0;
+    }
+
+    gf_msg_debug(this->name, 0, "Returning %d", ret);
+
+    return ret;
+}
+
+/* Check that value parses as a boolean; otherwise log the problem and
+ * hand a copy of the message back through op_errstr. */
+static int
+validate_uss(glusterd_volinfo_t *volinfo, dict_t *dict, char *key, char *value,
+             char **op_errstr)
+{
+    xlator_t *this = THIS;
+    gf_boolean_t parsed = _gf_false;
+    char errstr[2048] = "";
+    int ret = 0;
+
+    GF_ASSERT(this);
+
+    ret = gf_string2boolean(value, &parsed);
+    if (ret) {
+        snprintf(errstr, sizeof(errstr),
+                 "%s is not a valid boolean "
+                 "value. %s expects a valid boolean value.",
+                 value, key);
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_INVALID_ENTRY, "%s", errstr);
+        *op_errstr = gf_strdup(errstr);
+    }
+
+    gf_msg_debug(this->name, 0, "Returning %d", ret);
+
+    return ret;
+}
+
+/* Validate a directory-name value: 2..NAME_MAX characters, a leading '.',
+ * then only alphanumerics, '_' or '-'. Failures are reported in op_errstr. */
+static int
+validate_uss_dir(glusterd_volinfo_t *volinfo, dict_t *dict, char *key,
+                 char *value, char **op_errstr)
+{
+    char errstr[2048] = "";
+    int ret = -1;
+    int i = 0;
+    xlator_t *this = THIS;
+
+    GF_ASSERT(this);
+
+    i = strlen(value);
+    if (i > NAME_MAX) {
+        snprintf(errstr, sizeof(errstr), "value of %s exceedes %d characters",
+                 key, NAME_MAX);
+        goto out;
+    } else if (i < 2) {
+        snprintf(errstr, sizeof(errstr),
+                 "value of %s too short, "
+                 "expects at least two characters",
+                 key);
+        goto out;
+    }
+
+    if (value[0] != '.') {
+        snprintf(errstr, sizeof(errstr),
+                 "%s expects value starting "
+                 "with '.' ",
+                 key);
+        goto out;
+    }
+
+    for (i = 1; value[i]; i++) {
+        /* isalnum() is only defined for EOF and unsigned char values, so a
+         * plain (possibly negative) char has to be converted first. */
+        if (isalnum((unsigned char)value[i]) || value[i] == '_' ||
+            value[i] == '-')
+            continue;
+
+        snprintf(errstr, sizeof(errstr),
+                 "%s expects value to contain only '0-9a-z-_'", key);
+        goto out;
+    }
+
+    ret = 0;
+out:
+    if (ret) {
+        gf_msg(this->name, GF_LOG_ERROR, EINVAL, GD_MSG_INVALID_ENTRY, "%s",
+               errstr);
+        *op_errstr = gf_strdup(errstr);
+    }
+
+    gf_msg_debug(this->name, 0, "Returning %d", ret);
+
+    return ret;
+}
+
+/* Validate an option whose value must parse as an int and not be negative. */
+static int
+validate_server_options(glusterd_volinfo_t *volinfo, dict_t *dict, char *key,
+                        char *value, char **op_errstr)
+{
+    char errstr[2048] = "";
+    xlator_t *this = THIS;
+    int ret = -1;
+    int origin_val = 0;
+
+    GF_ASSERT(this);
+
+    if (volinfo->status == GLUSTERD_STATUS_STARTED) { /* informational only */
+        gf_msg(this->name, GF_LOG_INFO, 0, GD_MSG_VOL_SET_VALIDATION_INFO,
+               "Please note that "
+               "volume %s is started. This option will only get "
+               "effected after a brick restart.",
+               volinfo->volname);
+    }
+
+    ret = gf_string2int(value, &origin_val);
+    if (ret) {
+        snprintf(errstr, sizeof(errstr),
+                 "%s is not a compatible "
+                 "value. %s expects an integer value.",
+                 value, key);
+        ret = -1;
+        goto out;
+    }
+
+    if (origin_val < 0) {
+        snprintf(errstr, sizeof(errstr),
+                 "%s is not a "
+                 "compatible value. %s expects a positive "
+                 "integer value.",
+                 value, key);
+        ret = -1;
+        goto out;
+    }
+
+    ret = 0;
+out:
+    if (ret) {
+        gf_msg(this->name, GF_LOG_ERROR, EINVAL, GD_MSG_INCOMPATIBLE_VALUE,
+               "%s", errstr);
+        *op_errstr = gf_strdup(errstr);
+    }
+
+    return ret;
+}
+
+/* Accept the option only on volumes of type GF_CLUSTER_TYPE_DISPERSE. */
+static int
+validate_disperse(glusterd_volinfo_t *volinfo, dict_t *dict, char *key,
+                  char *value, char **op_errstr)
+{
+    xlator_t *this = THIS;
+    char errstr[2048] = "";
+    int ret = -1;
+
+    GF_VALIDATE_OR_GOTO("glusterd", this, out);
+
+    if (volinfo->type == GF_CLUSTER_TYPE_DISPERSE) {
+        ret = 0;
+        goto out;
+    }
+
+    snprintf(errstr, sizeof(errstr),
+             "Cannot set %s for a non-disperse volume.", key);
+    gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_VOL_NOT_DISPERSE, "%s",
+           errstr);
+    *op_errstr = gf_strdup(errstr);
+
+out:
+    gf_msg_debug(ret == 0 ? THIS->name : "glusterd", 0, "Returning %d", ret);
+
+    return ret;
+}
+
+/* Refuse the option on volumes whose replica_count is 1, i.e. volumes
+ * that are not replicated; the reason is logged and copied to op_errstr.
+ */
+static int
+validate_replica(glusterd_volinfo_t *volinfo, dict_t *dict, char *key,
+                 char *value, char **op_errstr)
+{
+    xlator_t *this = THIS;
+    char errstr[2048] = "";
+    int ret = 0;
+
+    GF_ASSERT(this);
+
+    if (volinfo->replica_count == 1) {
+        snprintf(errstr, sizeof(errstr),
+                 "Cannot set %s for a non-replicate volume.", key);
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_VOL_NOT_REPLICA, "%s",
+               errstr);
+        *op_errstr = gf_strdup(errstr);
+        ret = -1;
+    }
+
+    gf_msg_debug(this->name, 0, "Returning %d", ret);
+
+    return ret;
+}
+
+/* Validate a quorum count: it must parse as an integer and lie within
+ * [1, volinfo->replica_count]. The error text is allocated into op_errstr
+ * and also written to the log. */
+static int
+validate_quorum_count(glusterd_volinfo_t *volinfo, dict_t *dict, char *key,
+                      char *value, char **op_errstr)
+{
+    xlator_t *this = THIS;
+    int count = 0;
+    int ret = 0;
+
+    GF_ASSERT(this);
+
+    ret = gf_string2int(value, &count);
+    if (ret) {
+        /* keep the parser's own non-zero code as the return value */
+        gf_asprintf(op_errstr,
+                    "%s is not an integer. %s expects a "
+                    "valid integer value.",
+                    value, key);
+    } else if (count < 1 || count > volinfo->replica_count) {
+        gf_asprintf(op_errstr, "%d in %s %d is out of range [1 - %d]", count,
+                    key, count, volinfo->replica_count);
+        ret = -1;
+    }
+
+    /* every rejection above is logged once, here */
+    if (ret) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_INVALID_ENTRY, "%s",
+               *op_errstr);
+    }
+    gf_msg_debug(this->name, 0, "Returning %d", ret);
+
+    return ret;
+}
+
+/* Reject a subvols-per-directory value larger than the volume's
+ * subvol_count. The value is converted with atoi(), so text that is not
+ * a number is treated as 0 and passes this check. */
+static int
+validate_subvols_per_directory(glusterd_volinfo_t *volinfo, dict_t *dict,
+                               char *key, char *value, char **op_errstr)
+{
+    xlator_t *this = THIS;
+    glusterd_conf_t *priv = NULL;
+    char errstr[2048] = "";
+    int requested = 0;
+    int ret = 0;
+
+    GF_ASSERT(this);
+    priv = this->private;
+    GF_ASSERT(priv);
+
+    requested = atoi(value);
+
+    /* The requested spread must not be larger than the number of
+       subvolumes the volume has. */
+    if (requested > volinfo->subvol_count) {
+        snprintf(errstr, sizeof(errstr),
+                 "subvols-per-directory(%d) is greater "
+                 "than the number of subvolumes(%d).",
+                 requested, volinfo->subvol_count);
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_SUBVOLUMES_EXCEED, "%s.",
+               errstr);
+        *op_errstr = gf_strdup(errstr);
+        ret = -1;
+    }
+
+    gf_msg_debug(this->name, 0, "Returning %d", ret);
+
+    return ret;
+}
+
+/* Allow the option only when glusterd_is_volume_replicate() accepts the
+ * volume; otherwise report the mismatch through op_errstr. */
+static int
+validate_replica_heal_enable_disable(glusterd_volinfo_t *volinfo, dict_t *dict,
+                                     char *key, char *value, char **op_errstr)
+{
+    if (glusterd_is_volume_replicate(volinfo))
+        return 0;
+
+    gf_asprintf(op_errstr, "Volume %s is not of replicate type",
+                volinfo->volname);
+
+    return -1;
+}
+
+/* Accept only the exact strings "off", "file", "forced" or "optimal". */
+static int
+validate_mandatory_locking(glusterd_volinfo_t *volinfo, dict_t *dict, char *key,
+                           char *value, char **op_errstr)
+{
+    xlator_t *this = THIS;
+    char errstr[2048] = "";
+    int ret = 0;
+
+    GF_ASSERT(this);
+
+    if (!strcmp(value, "off") || !strcmp(value, "file") ||
+        !strcmp(value, "forced") || !strcmp(value, "optimal"))
+        goto out;
+
+    snprintf(errstr, sizeof(errstr),
+             "Invalid option value '%s':"
+             " Available options are 'off', 'file', "
+             "'forced' or 'optimal'",
+             value);
+    gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_INVALID_ENTRY, "%s", errstr);
+    *op_errstr = gf_strdup(errstr);
+    ret = -1;
+out:
+    gf_msg_debug(this->name, 0, "Returning %d", ret);
+
+    return ret;
+}
+
+/* Allow the option only on GF_CLUSTER_TYPE_DISPERSE volumes; otherwise
+ * report the mismatch through op_errstr. */
+static int
+validate_disperse_heal_enable_disable(glusterd_volinfo_t *volinfo, dict_t *dict,
+                                      char *key, char *value, char **op_errstr)
+{
+    if (volinfo->type == GF_CLUSTER_TYPE_DISPERSE)
+        return 0;
+
+    gf_asprintf(op_errstr, "Volume %s is not of disperse type",
+                volinfo->volname);
+
+    return -1;
+}
+
+/* Allow lock-migration only on pure distribute volumes, with a boolean. */
+static int
+validate_lock_migration_option(glusterd_volinfo_t *volinfo, dict_t *dict,
+                               char *key, char *value, char **op_errstr)
+{
+    char errstr[2048] = "";
+    int ret = 0;
+    xlator_t *this = THIS;
+    gf_boolean_t b = _gf_false;
+
+    GF_ASSERT(this);
+
+    if (volinfo->replica_count > 1 || volinfo->disperse_count) {
+        snprintf(errstr, sizeof(errstr),
+                 "Lock migration is "
+                 "a experimental feature. Currently works with"
+                 " pure distribute volume only");
+        ret = -1;
+
+        gf_msg(this->name, GF_LOG_ERROR, EINVAL, GD_MSG_INVALID_ENTRY, "%s",
+               errstr);
+
+        *op_errstr = gf_strdup(errstr);
+        goto out;
+    }
+
+    ret = gf_string2boolean(value, &b);
+    if (ret) {
+        snprintf(errstr, sizeof(errstr),
+                 "Invalid value"
+                 " for volume set command. Use on/off only.");
+        ret = -1;
+
+        gf_msg(this->name, GF_LOG_ERROR, EINVAL, GD_MSG_INVALID_ENTRY, "%s",
+               errstr);
+
+        *op_errstr = gf_strdup(errstr);
+        goto out;
+    }
+
+out:
+    /* Log the result on the failure paths too, not only on success. */
+    gf_msg_debug(this->name, 0, "Returning %d", ret);
+
+    return ret;
+}
+
+/* Value must be an unsigned int other than 1, with brick-mux enabled. */
+static int
+validate_mux_limit(glusterd_volinfo_t *volinfo, dict_t *dict, char *key,
+                   char *value, char **op_errstr)
+{
+    xlator_t *this = THIS;
+    uint val = 0;
+    int ret = -1;
+
+    GF_VALIDATE_OR_GOTO("glusterd", this, out);
+
+    if (!is_brick_mx_enabled()) {
+        gf_asprintf(op_errstr,
+                    "Brick-multiplexing is not enabled. "
+                    "Please enable brick multiplexing before trying "
+                    "to set this option.");
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_WRONG_OPTS_SETTING, "%s",
+               *op_errstr);
+        goto out;
+    }
+
+    ret = gf_string2uint(value, &val);
+    if (ret) {
+        gf_asprintf(op_errstr,
+                    "%s is not a valid count. "
+                    "%s expects an unsigned integer.",
+                    value, key);
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_INVALID_ENTRY, "%s",
+               *op_errstr);
+        goto out; /* val was not parsed; skip the value check below */
+    }
+
+    if (val == 1) {
+        gf_asprintf(op_errstr,
+                    "Brick-multiplexing is enabled. "
+                    "Please set this option to a value other than 1 "
+                    "to make use of the brick-multiplexing feature.");
+        ret = -1;
+    }
+out:
+    gf_msg_debug("glusterd", 0, "Returning %d", ret);
+
+    return ret;
+}
+
+/* Value must be an unsigned int within [5, 200], with brick-mux enabled. */
+static int
+validate_volume_per_thread_limit(glusterd_volinfo_t *volinfo, dict_t *dict,
+                                 char *key, char *value, char **op_errstr)
+{
+    xlator_t *this = THIS;
+    uint val = 0;
+    int ret = -1;
+
+    GF_VALIDATE_OR_GOTO("glusterd", this, out);
+
+    if (!is_brick_mx_enabled()) {
+        gf_asprintf(op_errstr,
+                    "Brick-multiplexing is not enabled. "
+                    "Please enable brick multiplexing before trying "
+                    "to set this option.");
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_WRONG_OPTS_SETTING, "%s",
+               *op_errstr);
+        goto out;
+    }
+
+    ret = gf_string2uint(value, &val);
+    if (ret) {
+        gf_asprintf(op_errstr,
+                    "%s is not a valid count. "
+                    "%s expects an unsigned integer.",
+                    value, key);
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_INVALID_ENTRY, "%s",
+               *op_errstr);
+        goto out; /* else the range check overwrites (and leaks) it */
+    }
+
+    if ((val < 5) || (val > 200)) {
+        gf_asprintf(
+            op_errstr,
+            "Please set this option to a value between 5 and 200 to "
+            "optimize processing large numbers of volumes in parallel.");
+        ret = -1;
+    }
+out:
+    gf_msg_debug("glusterd", 0, "Returning %d", ret);
+
+    return ret;
+}
+
+/* Succeed only when value is accepted by gf_string2boolean(). */
+static int
+validate_boolean(glusterd_volinfo_t *volinfo, dict_t *dict, char *key,
+                 char *value, char **op_errstr)
+{
+    xlator_t *this = THIS;
+    gf_boolean_t parsed = _gf_false;
+    int ret = -1;
+
+    GF_VALIDATE_OR_GOTO("glusterd", this, out);
+    ret = gf_string2boolean(value, &parsed);
+    if (!ret)
+        goto out;
+    gf_asprintf(op_errstr,
+                "%s is not a valid boolean value. %s "
+                "expects a valid boolean value.",
+                value, key);
+    gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_INVALID_ENTRY, "%s",
+           *op_errstr);
+out:
+    gf_msg_debug("glusterd", 0, "Returning %d", ret);
+
+    return ret;
+}
+
+/* Validate a disperse quorum count: an integer, on a disperse volume,
+ * within [disperse_count - redundancy_count, disperse_count]. Each
+ * rejection stores its reason in op_errstr; nothing is logged here. */
+static int
+validate_disperse_quorum_count(glusterd_volinfo_t *volinfo, dict_t *dict,
+                               char *key, char *value, char **op_errstr)
+{
+    int quorum = 0;
+    int min_quorum = 0;
+    int ret = -1;
+
+    ret = gf_string2int(value, &quorum);
+    if (ret) {
+        gf_asprintf(op_errstr,
+                    "%s is not an integer. %s expects a "
+                    "valid integer value.",
+                    value, key);
+        return ret;
+    }
+
+    if (volinfo->type != GF_CLUSTER_TYPE_DISPERSE) {
+        gf_asprintf(op_errstr, "Cannot set %s for a non-disperse volume.", key);
+        return -1;
+    }
+
+    /* The smallest acceptable quorum is disperse_count - redundancy_count. */
+    min_quorum = volinfo->disperse_count - volinfo->redundancy_count;
+    if (quorum < min_quorum || quorum > volinfo->disperse_count) {
+        gf_asprintf(op_errstr, "%d for %s is out of range [%d - %d]", quorum,
+                    key, min_quorum, volinfo->disperse_count);
+        return -1;
+    }
+
+    return 0;
+}
+
+/* The value must be a valid boolean, and the option is refused while
+ * glusterd_is_defrag_on() reports true for the volume. */
+static int
+validate_parallel_readdir(glusterd_volinfo_t *volinfo, dict_t *dict, char *key,
+                          char *value, char **op_errstr)
+{
+    int ret = validate_boolean(volinfo, dict, key, value, op_errstr);
+
+    if (!ret) {
+        ret = glusterd_is_defrag_on(volinfo);
+        if (ret) {
+            gf_asprintf(op_errstr,
+                        "%s option should be set "
+                        "after rebalance is complete",
+                        key);
+            gf_msg("glusterd", GF_LOG_ERROR, 0, GD_MSG_INVALID_ENTRY, "%s",
+                   *op_errstr);
+        }
+    }
+
+    gf_msg_debug("glusterd", 0, "Returning %d", ret);
+
+    return ret;
+}
+
+/* Validate rda-cache-limit. Sizes up to 1GB (and values that fail to
+ * parse) return the parser's result directly; anything larger is accepted
+ * only if glusterd_check_client_op_version_support() confirms that the
+ * clients support GD_OP_VERSION_3_11_0. */
+static int
+validate_rda_cache_limit(glusterd_volinfo_t *volinfo, dict_t *dict, char *key,
+                         char *value, char **op_errstr)
+{
+    uint64_t limit = 0;
+    int ret = 0;
+
+    ret = gf_string2bytesize_uint64(value, &limit);
+    if ((ret >= 0) && (limit > (1 * GF_UNIT_GB))) {
+        /* With release 3.11 the max value of rda_cache_limit is changed
+         * from 1GB to INFINITY. Clients older than 3.11 stop working when
+         * rda-cache-limit is set above 1GB, so such a value is accepted
+         * only if every client is 3.11 or greater.
+         */
+        ret = glusterd_check_client_op_version_support(
+            volinfo->volname, GD_OP_VERSION_3_11_0, op_errstr);
+    }
+
+    gf_msg_debug("glusterd", 0, "Returning %d", ret);
+
+    return ret;
+}
+
+/* Succeed only when value is accepted by gf_string2uint64(). */
+static int
+validate_worm_period(glusterd_volinfo_t *volinfo, dict_t *dict, char *key,
+                     char *value, char **op_errstr)
+{
+    xlator_t *this = THIS;
+    uint64_t parsed = -1;
+    int ret = -1;
+
+    GF_VALIDATE_OR_GOTO("glusterd", this, out);
+    ret = gf_string2uint64(value, &parsed);
+    if (!ret)
+        goto out;
+    gf_asprintf(op_errstr,
+                "%s is not a valid uint64_t value."
+                " %s expects a valid uint64_t value.",
+                value, key);
+    gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_INVALID_ENTRY, "%s",
+           *op_errstr);
+out:
+    gf_msg_debug("glusterd", 0, "Returning %d", ret);
+
+    return ret;
+}
+
+/* Accept only the exact strings "relax" or "enterprise". */
+static int
+validate_reten_mode(glusterd_volinfo_t *volinfo, dict_t *dict, char *key,
+                    char *value, char **op_errstr)
+{
+    xlator_t *this = THIS;
+    int ret = -1;
+
+    GF_VALIDATE_OR_GOTO("glusterd", this, out);
+    if (!strcmp(value, "relax") || !strcmp(value, "enterprise")) {
+        ret = 0;
+        goto out;
+    }
+
+    gf_asprintf(op_errstr,
+                "The value of retention mode should be "
+                "either relax or enterprise. But the value"
+                " of %s is %s",
+                key, value);
+    gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_INVALID_ENTRY, "%s",
+           *op_errstr);
+out:
+    gf_msg_debug("glusterd", 0, "Returning %d", ret);
+
+    return ret;
+}
+/* Non-zero only if sys_stat() succeeds and reports path as a directory. */
+static int
+is_directory(const char *path)
+{
+    struct stat st;
+
+    return (sys_stat(path, &st) != 0) ? 0 : S_ISDIR(st.st_mode);
+}
+/* Require value to name an existing directory (see is_directory()). */
+static int
+validate_statedump_path(glusterd_volinfo_t *volinfo, dict_t *dict, char *key,
+                        char *value, char **op_errstr)
+{
+    xlator_t *this = THIS;
+
+    GF_ASSERT(this);
+
+    if (is_directory(value))
+        return 0;
+
+    gf_asprintf(op_errstr, "Failed: %s is not a directory", value);
+    gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_INVALID_ENTRY, "%s",
+           *op_errstr);
+
+    return -1;
+}
+
+/* dispatch table for VOLUME SET
+ * -----------------------------
+ *
+ * Format of entries:
+ *
+ * First field is the <key>, for the purpose of looking it up
+ * in volume dictionary. Each <key> is of the format "<domain>.<specifier>".
+ *
+ * Second field is <voltype>.
+ *
+ * Third field is <option>; if it's unset, it's assumed to be
+ * the same as <specifier>.
+ *
+ * Fourth field is <value>. In this context it is used to specify
+ * a default. That is, even if the volume dict doesn't have a value,
+ * we proceed as if the default value were set for it.
+ *
+ * Fifth field is <doctype>, which decides if the option is public and available
+ * in "set help" or not. "NO_DOC" entries are not part of the public interface
+ * and are subject to change at any time. This also decides if an option is
+ * global (applies to all volumes) or normal (applies to only specified volume).
+ *
+ * Sixth field is <flags>.
+ *
+ * Seventh field is <op-version>.
+ *
+ * Eighth field is the description of the option: if NULL, it is fetched
+ * from the translator code's xlator_options table.
+ *
+ * Ninth field is the validation function: if NULL, the xlator's
+ * option-specific validation is tried; otherwise it is run in glusterd itself.
+ *
+ * There are two types of entries: basic and special.
+ *
+ * - Basic entries are the ones where the <option> does _not_ start with
+ *   the bang! character ('!').
+ *
+ *   In their case, <option> is understood as an option for an xlator of
+ *   type <voltype>. Their effect is to copy over the volinfo->dict[<key>]
+ *   value to all graph nodes of type <voltype> (if such a value is set).
+ *
+ *   You are free to add entries of this type, they will become functional
+ *   just by being present in the table.
+ *
+ * - Special entries where the <option> starts with the bang!.
+ *
+ *   They are not applied to all graphs during generation, and you cannot
+ *   extend them in a trivial way which could be just picked up. Better
+ *   not touch them unless you know what you are doing.
+ *
+ *
+ * Another kind of grouping for options, according to visibility:
+ *
+ * - Exported: one which is used in the code. These are characterized by
+ *   using a macro as <key> (of the format VKEY_..., defined in
+ *   glusterd-volgen.h).
+ *
+ * - Non-exported: the rest; these have string literal <keys>.
+ *
+ * Adhering to this policy, option name changes shall be one-liners.
+ *
+ */
+
+struct volopt_map_entry glusterd_volopt_map[] = {
+    /* DHT xlator options */
+    {.key = "cluster.lookup-unhashed",
+     .voltype = "cluster/distribute",
+     .op_version = 1,
+     .flags = VOLOPT_FLAG_CLIENT_OPT},
+    {.key = "cluster.lookup-optimize",
+     .voltype = "cluster/distribute",
+     .op_version = GD_OP_VERSION_3_7_2,
+     .flags = VOLOPT_FLAG_CLIENT_OPT},
+    {.key = "cluster.min-free-disk",
+     .voltype = "cluster/distribute",
+     .op_version = 1,
+     .flags = VOLOPT_FLAG_CLIENT_OPT},
+    {.key = "cluster.min-free-inodes",
+     .voltype = "cluster/distribute",
+     .op_version = 1,
+     .flags = VOLOPT_FLAG_CLIENT_OPT},
+    {.key = "cluster.rebalance-stats",
+     .voltype = "cluster/distribute",
+     .op_version = 2,
+     .flags = VOLOPT_FLAG_CLIENT_OPT},
+    {.key = "cluster.subvols-per-directory",
+     .voltype = "cluster/distribute",
+     .option = "directory-layout-spread",
+     .op_version = 2,
+     .validate_fn = validate_subvols_per_directory,
+     .flags = VOLOPT_FLAG_CLIENT_OPT},
+    {.key = "cluster.readdir-optimize",
+     .voltype = "cluster/distribute",
+     .op_version = 1,
+     .flags = VOLOPT_FLAG_CLIENT_OPT},
+    {.key = "cluster.rsync-hash-regex",
+     .voltype = "cluster/distribute",
+     .type = NO_DOC,
+     .op_version = 3,
+     .flags = VOLOPT_FLAG_CLIENT_OPT},
+    {.key = "cluster.extra-hash-regex",
+     .voltype = "cluster/distribute",
+     .type = NO_DOC,
+     .op_version = 3,
+     .flags = VOLOPT_FLAG_CLIENT_OPT},
+    {.key = "cluster.dht-xattr-name",
+     .voltype = "cluster/distribute",
+     .option = "xattr-name",
+     .type = NO_DOC,
+     .op_version = 3,
+     .flags = VOLOPT_FLAG_CLIENT_OPT},
+    {
+        .key = "cluster.randomize-hash-range-by-gfid",
+        .voltype = "cluster/distribute",
+        .option = "randomize-hash-range-by-gfid",
+        .type = NO_DOC,
+        .op_version = GD_OP_VERSION_3_6_0,
+        .flags = VOLOPT_FLAG_CLIENT_OPT,
+    },
+    {
+        .key = "cluster.rebal-throttle",
+        .voltype = "cluster/distribute",
+        .option = "rebal-throttle",
+        .op_version = GD_OP_VERSION_3_7_0,
+        .validate_fn = validate_defrag_throttle_option,
+        .flags = VOLOPT_FLAG_CLIENT_OPT,
+    },
+
+    {
+        .key = "cluster.lock-migration",
+        .voltype = "cluster/distribute",
+        .option = "lock-migration",
+        .value = "off",
+        .op_version = GD_OP_VERSION_3_8_0,
+        .validate_fn = validate_lock_migration_option,
+        .flags = VOLOPT_FLAG_CLIENT_OPT,
+    },
+
+    {
+        .key = "cluster.force-migration",
+        .voltype = "cluster/distribute",
+        .option = "force-migration",
+        .value = "off",
+        .op_version = GD_OP_VERSION_4_0_0,
+        .flags = VOLOPT_FLAG_CLIENT_OPT,
+    },
+
+    /* NUFA xlator options (Distribute special case) */
+    {.key = "cluster.nufa",
+     .voltype = "cluster/distribute",
+     .option = "!nufa",
+     .type = NO_DOC,
+     .op_version = 2,
+     .flags = VOLOPT_FLAG_CLIENT_OPT},
+    {.key = "cluster.local-volume-name",
+     .voltype = "cluster/nufa",
+     .option = "local-volume-name",
+     .type = NO_DOC,
+     .op_version = 3,
+     .flags = VOLOPT_FLAG_CLIENT_OPT},
+    {
+        .key = "cluster.weighted-rebalance",
+        .voltype = "cluster/distribute",
+        .op_version = GD_OP_VERSION_3_6_0,
+    },
+
+    /* Switch xlator options (Distribute special case) */
+    {.key = "cluster.switch",
+     .voltype = "cluster/distribute",
+     .option = "!switch",
+     .type = NO_DOC,
+     .op_version = 3,
+     .flags = VOLOPT_FLAG_CLIENT_OPT},
+    {.key = "cluster.switch-pattern",
+     .voltype = "cluster/switch",
+     .option = "pattern.switch.case",
+     .type = NO_DOC,
+     .op_version = 3,
+     .flags = VOLOPT_FLAG_CLIENT_OPT},
+
+    /* AFR xlator options */
+    {.key = "cluster.entry-change-log",
+     .voltype = "cluster/replicate",
+     .op_version = 1,
+     .flags = VOLOPT_FLAG_CLIENT_OPT},
+    {.key = "cluster.read-subvolume",
+     .voltype = "cluster/replicate",
+     .op_version = 1,
+     .flags = VOLOPT_FLAG_CLIENT_OPT},
+    {.key = "cluster.read-subvolume-index",
+     .voltype = "cluster/replicate",
+     .op_version = 2,
+     .flags = VOLOPT_FLAG_CLIENT_OPT},
+    {.key = "cluster.read-hash-mode",
+     .voltype = "cluster/replicate",
+     .op_version = 2,
+     .flags = VOLOPT_FLAG_CLIENT_OPT},
+    {.key = "cluster.background-self-heal-count",
+     .voltype = "cluster/replicate",
+     .op_version = 1,
+     .flags = VOLOPT_FLAG_CLIENT_OPT},
+    {.key = "cluster.metadata-self-heal",
+     .voltype = "cluster/replicate",
+     .op_version = 1,
+     .validate_fn = validate_replica,
+     .flags = VOLOPT_FLAG_CLIENT_OPT},
+    {.key = "cluster.data-self-heal",
+     .voltype = "cluster/replicate",
+     .op_version = 1,
+     .validate_fn = validate_replica,
+     .flags = VOLOPT_FLAG_CLIENT_OPT},
+    {.key = "cluster.entry-self-heal",
+     .voltype = "cluster/replicate",
+     .op_version = 1,
+     .validate_fn = validate_replica,
+     .flags = VOLOPT_FLAG_CLIENT_OPT},
+    {.key = "cluster.self-heal-daemon",
+     .voltype = "cluster/replicate",
+     .option = "!self-heal-daemon",
+     .op_version = 1,
+     .validate_fn = validate_replica_heal_enable_disable},
+    {.key = "cluster.heal-timeout",
+     .voltype = "cluster/replicate",
+     .option = "!heal-timeout",
+     .op_version = 2,
+     .flags = VOLOPT_FLAG_CLIENT_OPT},
+    {.key = "cluster.strict-readdir",
+     .voltype = "cluster/replicate",
+     .type = NO_DOC,
+     .op_version = 1,
+     .flags = VOLOPT_FLAG_CLIENT_OPT},
+    {.key = "cluster.self-heal-window-size",
+     .voltype = "cluster/replicate",
+     .option = "data-self-heal-window-size",
+     .op_version = 1,
+     .flags = VOLOPT_FLAG_CLIENT_OPT},
+    {.key = "cluster.data-change-log",
+     .voltype = "cluster/replicate",
+     .op_version = 1,
+     .flags = VOLOPT_FLAG_CLIENT_OPT},
+    {.key = "cluster.metadata-change-log",
+     .voltype = "cluster/replicate",
+     .op_version = 1,
+     .flags = VOLOPT_FLAG_CLIENT_OPT},
+    {.key = "cluster.data-self-heal-algorithm",
+     .voltype = "cluster/replicate",
+     .option = "data-self-heal-algorithm",
+     .op_version = 1,
+     .flags = VOLOPT_FLAG_CLIENT_OPT},
+    {.key = "cluster.eager-lock",
+     .voltype = "cluster/replicate",
+     .op_version = 1,
+     .flags = VOLOPT_FLAG_CLIENT_OPT},
+    {.key = "disperse.eager-lock",
+     .voltype = "cluster/disperse",
+     .op_version = GD_OP_VERSION_3_7_10,
+     .flags = VOLOPT_FLAG_CLIENT_OPT},
+    {.key = "disperse.other-eager-lock",
+     .voltype = "cluster/disperse",
+     .op_version = GD_OP_VERSION_3_13_0,
+     .flags = VOLOPT_FLAG_CLIENT_OPT},
+    {.key = "disperse.eager-lock-timeout",
+     .voltype = "cluster/disperse",
+     .op_version = GD_OP_VERSION_4_0_0,
+     .flags = VOLOPT_FLAG_CLIENT_OPT},
+    {.key = "disperse.other-eager-lock-timeout",
+     .voltype = "cluster/disperse",
+     .op_version = GD_OP_VERSION_4_0_0,
+     .flags = VOLOPT_FLAG_CLIENT_OPT},
+    {.key = "cluster.quorum-type",
+     .voltype = "cluster/replicate",
+     .option = "quorum-type",
+     .op_version = 1,
+     .flags = VOLOPT_FLAG_CLIENT_OPT},
+    {.key = "cluster.quorum-count",
+     .voltype = "cluster/replicate",
+     .option = "quorum-count",
+     .op_version = 1,
+     .validate_fn = validate_quorum_count,
+     .flags = VOLOPT_FLAG_CLIENT_OPT},
+    {.key = "cluster.choose-local",
+     .voltype = "cluster/replicate",
+     .op_version = 2,
+     .flags = VOLOPT_FLAG_CLIENT_OPT},
+    {.key = "cluster.self-heal-readdir-size",
+     .voltype = "cluster/replicate",
+     .op_version = 2,
+     .flags = VOLOPT_FLAG_CLIENT_OPT},
+    {.key = "cluster.post-op-delay-secs",
+     .voltype = "cluster/replicate",
+     .type = NO_DOC,
+     .op_version = 2,
+     .flags = VOLOPT_FLAG_CLIENT_OPT},
+    {.key = "cluster.readdir-failover",
+     .voltype = "cluster/replicate",
+     .op_version = 2,
+     .flags = VOLOPT_FLAG_CLIENT_OPT},
+    {.key = "cluster.ensure-durability",
+     .voltype = "cluster/replicate",
+     .op_version = 3,
+     .flags = VOLOPT_FLAG_CLIENT_OPT},
+    {.key = "cluster.consistent-metadata",
+     .voltype = "cluster/replicate",
+     .type = DOC,
+     .op_version = GD_OP_VERSION_3_7_0,
+     .flags = VOLOPT_FLAG_CLIENT_OPT},
+    {.key = "cluster.heal-wait-queue-length",
+     .voltype = "cluster/replicate",
+     .type = DOC,
+     .op_version = GD_OP_VERSION_3_7_10,
+     .flags = VOLOPT_FLAG_CLIENT_OPT},
+    {.key = "cluster.favorite-child-policy",
+     .voltype = "cluster/replicate",
+     .type = DOC,
+     .op_version = GD_OP_VERSION_3_7_12,
+     .flags = VOLOPT_FLAG_CLIENT_OPT},
+    {.key = "cluster.full-lock",
+     .voltype = "cluster/replicate",
+     .type = NO_DOC,
+     .op_version = GD_OP_VERSION_3_13_2,
+     .flags = VOLOPT_FLAG_CLIENT_OPT},
+    {.key = "cluster.optimistic-change-log",
+     .voltype = "cluster/replicate",
+     .type = NO_DOC,
+     .op_version = GD_OP_VERSION_7_2,
+     .flags = VOLOPT_FLAG_CLIENT_OPT},
+
+    /* IO-stats xlator options */
+    {.key = VKEY_DIAG_LAT_MEASUREMENT,
+     .voltype = "debug/io-stats",
+     .option = "latency-measurement",
+     .value = "off",
+     .op_version = 1},
+    {.key = "diagnostics.dump-fd-stats",
+     .voltype = "debug/io-stats",
+     .op_version = 1},
+    {.key = VKEY_DIAG_CNT_FOP_HITS,
+     .voltype = "debug/io-stats",
+     .option = "count-fop-hits",
+     .value = "off",
+     .type = NO_DOC,
+     .op_version = 1},
+    {.key = "diagnostics.brick-log-level",
+     .voltype = "debug/io-stats",
+     .value = "INFO",
+     .option = "!brick-log-level",
+     .op_version = 1},
+    {.key = "diagnostics.client-log-level",
+     .voltype = "debug/io-stats",
+     .value = "INFO",
+     .option = "!client-log-level",
+     .op_version = 1,
+     .flags = VOLOPT_FLAG_CLIENT_OPT},
+    {.key = "diagnostics.brick-sys-log-level",
+     .voltype = "debug/io-stats",
+     .option = "!sys-log-level",
+     .op_version = 1},
+    {.key = "diagnostics.client-sys-log-level",
+     .voltype = "debug/io-stats",
+     .option = "!sys-log-level",
+     .op_version = 1,
+     .flags = VOLOPT_FLAG_CLIENT_OPT},
+    {
+        .key = "diagnostics.brick-logger",
+        .voltype = "debug/io-stats",
+        .option = "!logger",
+        .op_version = GD_OP_VERSION_3_6_0,
+    },
+    {.key = "diagnostics.client-logger",
+     .voltype = "debug/io-stats",
+     .option = "!logger",
+     .op_version = GD_OP_VERSION_3_6_0,
+     .flags = VOLOPT_FLAG_CLIENT_OPT},
+    {
+        .key = "diagnostics.brick-log-format",
+        .voltype = "debug/io-stats",
+        .option = "!log-format",
+        .op_version = GD_OP_VERSION_3_6_0,
+    },
+    {.key = "diagnostics.client-log-format",
+     .voltype = "debug/io-stats",
+     .option = "!log-format",
+     .op_version = GD_OP_VERSION_3_6_0,
+     .flags = VOLOPT_FLAG_CLIENT_OPT},
+    {
+        .key = "diagnostics.brick-log-buf-size",
+        .voltype = "debug/io-stats",
+        .option = "!log-buf-size",
+        .op_version = GD_OP_VERSION_3_6_0,
+    },
+    {.key = "diagnostics.client-log-buf-size",
+     .voltype = "debug/io-stats",
+     .option = "!log-buf-size",
+     .op_version = GD_OP_VERSION_3_6_0,
+     .flags = VOLOPT_FLAG_CLIENT_OPT},
+    {
+        .key = "diagnostics.brick-log-flush-timeout",
+        .voltype = "debug/io-stats",
+        .option = "!log-flush-timeout",
+        .op_version = GD_OP_VERSION_3_6_0,
+    },
+    {.key = "diagnostics.client-log-flush-timeout",
+     .voltype = "debug/io-stats",
+     .option = "!log-flush-timeout",
+     .op_version = GD_OP_VERSION_3_6_0,
+     .flags = VOLOPT_FLAG_CLIENT_OPT},
+    {.key = "diagnostics.stats-dump-interval",
+     .voltype = "debug/io-stats",
+     .option = "ios-dump-interval",
+     .op_version = 1},
+    {.key = "diagnostics.fop-sample-interval",
+     .voltype = "debug/io-stats",
+     .option = "ios-sample-interval",
+     .op_version = 1},
+    {
+        .key = "diagnostics.stats-dump-format",
+        .voltype = "debug/io-stats",
+        .option = "ios-dump-format",
+        .op_version = GD_OP_VERSION_3_12_0,
+    },
+    {.key = "diagnostics.fop-sample-buf-size",
+     .voltype = "debug/io-stats",
+     .option = "ios-sample-buf-size",
+     .op_version = 1},
+    {.key = "diagnostics.stats-dnscache-ttl-sec",
+     .voltype = "debug/io-stats",
+     .option = "ios-dnscache-ttl-sec",
+     .op_version = 1},
+
+    /* IO-cache xlator options */
+    {.key = "performance.cache-max-file-size",
+     .voltype = "performance/io-cache",
+     .option = "max-file-size",
+     .op_version = 1,
+     .validate_fn = validate_cache_max_min_size,
+     .flags = VOLOPT_FLAG_CLIENT_OPT},
+    {.key = "performance.cache-min-file-size",
+     .voltype = "performance/io-cache",
+     .option = "min-file-size",
+     .op_version = 1,
+     .validate_fn = validate_cache_max_min_size,
+     .flags = VOLOPT_FLAG_CLIENT_OPT},
+    {.key = "performance.cache-refresh-timeout",
+     .voltype = "performance/io-cache",
+     .option = "cache-timeout",
+     .op_version = 1,
+     .flags = VOLOPT_FLAG_CLIENT_OPT},
+    {.key = "performance.cache-priority",
+     .voltype = "performance/io-cache",
+     .option = "priority",
+     .op_version = 1,
+     .flags = VOLOPT_FLAG_CLIENT_OPT},
+    {.key = "performance.io-cache-size",
+     .voltype = "performance/io-cache",
+     .option = "cache-size",
+     .op_version = GD_OP_VERSION_8_0,
+     .flags = VOLOPT_FLAG_CLIENT_OPT},
+    {
+        .key = "performance.cache-size",
+        .voltype = "performance/io-cache",
+        .op_version = 1,
+        .flags = VOLOPT_FLAG_CLIENT_OPT,
+        .description = "Deprecated option. Use performance.io-cache-size "
+                       "to adjust the cache size of the io-cache translator, "
+                       "and use performance.quick-read-cache-size to adjust "
+                       "the cache size of the quick-read translator.",
+    },
+
+    /* IO-threads xlator options */
+    {.key = "performance.io-thread-count",
+     .voltype = "performance/io-threads",
+     .option = "thread-count",
+     .op_version = 1},
+    {.key = "performance.high-prio-threads",
+     .voltype = "performance/io-threads",
+     .op_version = 1},
+    {.key = "performance.normal-prio-threads",
+     .voltype = "performance/io-threads",
+     .op_version = 1},
+    {.key = "performance.low-prio-threads",
+     .voltype = "performance/io-threads",
+     .op_version = 1},
+    {.key = "performance.least-prio-threads",
+     .voltype = "performance/io-threads",
+     .op_version = 1},
+    {.key = "performance.enable-least-priority",
+     .voltype = "performance/io-threads",
+     .op_version = 1},
+    {.key = "performance.iot-watchdog-secs",
+     .voltype = "performance/io-threads",
+     .option = "watchdog-secs",
+     .op_version = GD_OP_VERSION_4_1_0},
+    {.key = "performance.iot-cleanup-disconnected-reqs",
+     .voltype = "performance/io-threads",
+     .option = "cleanup-disconnected-reqs",
+     .op_version = GD_OP_VERSION_4_1_0},
+    {.key = "performance.iot-pass-through",
+     .voltype = "performance/io-threads",
+     .option = "pass-through",
+     .op_version = GD_OP_VERSION_4_1_0},
+
+    /* Other perf xlators' options */
+    {.key = "performance.io-cache-pass-through",
+     .voltype = "performance/io-cache",
+     .option = "pass-through",
+     .op_version = GD_OP_VERSION_4_1_0},
+    {.key = "performance.quick-read-cache-size",
+     .voltype = "performance/quick-read",
+     .option = "cache-size",
+     .op_version = GD_OP_VERSION_8_0,
+     .flags = VOLOPT_FLAG_CLIENT_OPT},
+    {.key = "performance.cache-size",
+     .voltype = "performance/quick-read",
+     .type = NO_DOC,
+     .op_version = 1,
+     .flags = VOLOPT_FLAG_CLIENT_OPT},
+    {.key = "performance.quick-read-cache-timeout",
+     .voltype = "performance/quick-read",
+     .option = "cache-timeout",
+     .op_version = GD_OP_VERSION_8_0,
+     .flags = VOLOPT_FLAG_CLIENT_OPT},
+    {.key = "performance.qr-cache-timeout",
+     .voltype = "performance/quick-read",
+     .option = "cache-timeout",
+     .op_version = 1,
+     .flags = VOLOPT_FLAG_CLIENT_OPT,
+     .description =
+         "Deprecated option. Use performance.quick-read-cache-timeout "
+         "instead."},
+    {.key = "performance.quick-read-cache-invalidation",
+     .voltype = "performance/quick-read",
+     .option = "quick-read-cache-invalidation",
+     .op_version = GD_OP_VERSION_4_0_0,
+     .flags = VOLOPT_FLAG_CLIENT_OPT},
+    {.key = "performance.ctime-invalidation",
+     .voltype = "performance/quick-read",
+     .option = "ctime-invalidation",
+     .op_version = GD_OP_VERSION_5_0,
+     .flags = VOLOPT_FLAG_CLIENT_OPT},
+    {.key = "performance.flush-behind",
+     .voltype = "performance/write-behind",
+     .option = "flush-behind",
+     .op_version = 1,
+     .flags = VOLOPT_FLAG_CLIENT_OPT},
+    {.key = "performance.nfs.flush-behind",
+     .voltype = "performance/write-behind",
+     .option = "flush-behind",
+     .op_version = 1,
+     .flags = VOLOPT_FLAG_CLIENT_OPT},
+    {.key = "performance.write-behind-window-size",
+     .voltype = "performance/write-behind",
+     .option = "cache-size",
+     .op_version = 1,
+     .flags = VOLOPT_FLAG_CLIENT_OPT},
+    {
+        .key = "performance.resync-failed-syncs-after-fsync",
+        .voltype = "performance/write-behind",
+        .option = "resync-failed-syncs-after-fsync",
+        .op_version = GD_OP_VERSION_3_7_7,
+        .flags = VOLOPT_FLAG_CLIENT_OPT,
+        .description = "If sync of \"cached-writes issued before fsync\" "
+                       "(to backend) fails, this option configures whether "
+                       "to retry syncing them after fsync or forget them. "
+                       "If set to on, cached-writes are retried "
+                       "till a \"flush\" fop (or a successful sync) on sync "
+                       "failures. "
+                       "fsync itself is failed irrespective of the value of "
+                       "this option. ",
+    },
+    {.key = "performance.nfs.write-behind-window-size",
+     .voltype = "performance/write-behind",
+     .option = "cache-size",
+     .op_version = 1,
+     .flags = VOLOPT_FLAG_CLIENT_OPT},
+    {.key = "performance.strict-o-direct",
+     .voltype = "performance/write-behind",
+     .option = "strict-O_DIRECT",
+     .op_version = 2,
+     .flags = VOLOPT_FLAG_CLIENT_OPT},
+    {.key = "performance.nfs.strict-o-direct",
+     .voltype = "performance/write-behind",
+     .option = "strict-O_DIRECT",
+     .op_version = 2,
+     .flags = VOLOPT_FLAG_CLIENT_OPT},
+    {.key = "performance.strict-write-ordering",
+     .voltype = "performance/write-behind",
+     .option = "strict-write-ordering",
+     .op_version = 2,
+     .flags = VOLOPT_FLAG_CLIENT_OPT},
+    {.key = "performance.nfs.strict-write-ordering",
+     .voltype = "performance/write-behind",
+     .option = "strict-write-ordering",
+     .op_version = 2,
+     .flags = VOLOPT_FLAG_CLIENT_OPT},
+    {.key = "performance.write-behind-trickling-writes",
+     .voltype = "performance/write-behind",
+     .option = "trickling-writes",
+     .op_version = GD_OP_VERSION_3_13_1,
+     .flags = VOLOPT_FLAG_CLIENT_OPT},
+    {.key = "performance.aggregate-size",
+     .voltype = "performance/write-behind",
+     .option = "aggregate-size",
+     .op_version = GD_OP_VERSION_4_1_0,
+     .flags = OPT_FLAG_CLIENT_OPT},
+    {.key = "performance.nfs.write-behind-trickling-writes",
+     .voltype = "performance/write-behind",
+     .option = "trickling-writes",
+     .op_version = GD_OP_VERSION_3_13_1,
+     .flags = VOLOPT_FLAG_CLIENT_OPT},
+    {.key = "performance.lazy-open",
+     .voltype = "performance/open-behind",
+     .option = "lazy-open",
+     .op_version = 3,
+     .flags = VOLOPT_FLAG_CLIENT_OPT},
+    {.key = "performance.read-after-open",
+     .voltype = "performance/open-behind",
+     .option = "read-after-open",
+     .op_version = 3,
+     .flags = VOLOPT_FLAG_CLIENT_OPT},
+    {
+        .key = "performance.open-behind-pass-through",
+        .voltype = "performance/open-behind",
+        .option = "pass-through",
+        .op_version = GD_OP_VERSION_4_1_0,
+    },
+    {.key = "performance.read-ahead-page-count",
+     .voltype = "performance/read-ahead",
+     .option = "page-count",
+     .op_version = 1,
+     .flags = VOLOPT_FLAG_CLIENT_OPT},
+    {
+        .key = "performance.read-ahead-pass-through",
+        .voltype = "performance/read-ahead",
+        .option = "pass-through",
+        .op_version = GD_OP_VERSION_4_1_0,
+    },
+    {
+        .key = "performance.readdir-ahead-pass-through",
+        .voltype = "performance/readdir-ahead",
+        .option = "pass-through",
+        .op_version = GD_OP_VERSION_4_1_0,
+    },
+    {.key = "performance.md-cache-pass-through",
+     .voltype = "performance/md-cache",
+     .option = "pass-through",
+     .op_version = GD_OP_VERSION_4_1_0},
+    {.key = "performance.md-cache-timeout",
+     .voltype = "performance/md-cache",
+     .option = "md-cache-timeout",
+     .op_version = 2,
+     .flags = VOLOPT_FLAG_CLIENT_OPT},
+    {.key = "performance.cache-swift-metadata",
+     .voltype = "performance/md-cache",
+     .option = "cache-swift-metadata",
+     .op_version = GD_OP_VERSION_3_7_10,
+     .description = "Cache swift metadata (user.swift.metadata xattr)",
+     .flags = VOLOPT_FLAG_CLIENT_OPT},
+    {.key = "performance.cache-samba-metadata",
+     .voltype = "performance/md-cache",
+     .option = "cache-samba-metadata",
+     .op_version = GD_OP_VERSION_3_9_0,
+     .description = "Cache samba metadata (user.DOSATTRIB, security.NTACL"
+                    " xattr)",
+     .flags = VOLOPT_FLAG_CLIENT_OPT},
+    {.key = "performance.cache-capability-xattrs",
+     .voltype = "performance/md-cache",
+     .option = "cache-capability-xattrs",
+     .op_version = GD_OP_VERSION_3_10_0,
+     .description = "Cache xattrs required for capability based security",
+     .flags = VOLOPT_FLAG_CLIENT_OPT},
+    {.key = "performance.cache-ima-xattrs",
+     .voltype = "performance/md-cache",
+     .option = "cache-ima-xattrs",
+     .op_version = GD_OP_VERSION_3_10_0,
+     .description = "Cache xattrs required for IMA "
+                    "(Integrity Measurement Architecture)",
+     .flags = VOLOPT_FLAG_CLIENT_OPT},
+    {.key = "performance.md-cache-statfs",
+     .voltype = "performance/md-cache",
+     .option = "md-cache-statfs",
+     .op_version = GD_OP_VERSION_4_0_0,
+     .flags = VOLOPT_FLAG_CLIENT_OPT},
+    {.key = "performance.xattr-cache-list",
+     .voltype = "performance/md-cache",
+     .option = "xattr-cache-list",
+     .op_version = GD_OP_VERSION_4_0_0,
+     .flags = VOLOPT_FLAG_CLIENT_OPT,
+     .description = "A comma separated list of xattrs that shall be "
+                    "cached by md-cache. The only wildcard allowed is '*'"},
+    {.key = "performance.nl-cache-pass-through",
+     .voltype = "performance/nl-cache",
+     .option = "pass-through",
+     .op_version = GD_OP_VERSION_4_1_0},
+
+    /* Client xlator options */
+    {.key = "network.frame-timeout",
+     .voltype = "protocol/client",
+     .op_version = 1,
+     .flags = VOLOPT_FLAG_CLIENT_OPT},
+    {.key = "network.ping-timeout",
+     .voltype = "protocol/client",
+     .op_version = 1,
+     .flags = VOLOPT_FLAG_CLIENT_OPT},
+    {.key = "network.tcp-window-size",
+     .voltype = "protocol/client",
+     .op_version = 1,
+     .flags = VOLOPT_FLAG_CLIENT_OPT},
+    {.key = "client.ssl",
+     .voltype = "protocol/client",
+     .option = "transport.socket.ssl-enabled",
+     .value = "off",
+     .op_version = 2,
+     .description = "enable/disable client.ssl flag in the "
+                    "volume.",
+     .flags = VOLOPT_FLAG_CLIENT_OPT},
+    {.key = "network.remote-dio",
+     .voltype = "protocol/client",
+     .option = "filter-O_DIRECT",
+     .op_version = 2,
+     .flags = VOLOPT_FLAG_CLIENT_OPT},
+    {
+        .key = "client.own-thread",
+        .voltype = "protocol/client",
+        .option = "transport.socket.own-thread",
+        .type = NO_DOC,
+        .op_version = GD_OP_VERSION_3_7_0,
+    },
+    {
+        .key = "client.event-threads",
+        .voltype = "protocol/client",
+        .op_version = GD_OP_VERSION_3_7_0,
+    },
+    {.key = "client.tcp-user-timeout",
+     .voltype = "protocol/client",
+     .option = "transport.tcp-user-timeout",
+     .op_version = GD_OP_VERSION_3_10_2,
+     .value = "0", /* 0 - implies "use system default" */
+     .flags = VOLOPT_FLAG_CLIENT_OPT},
+    {.key = "client.keepalive-time",
+     .voltype = "protocol/client",
+     .option = "transport.socket.keepalive-time",
+     .op_version = GD_OP_VERSION_3_10_2,
+     .value = "20",
+     .flags = VOLOPT_FLAG_CLIENT_OPT},
+    {.key = "client.keepalive-interval",
+     .voltype = "protocol/client",
+     .option = "transport.socket.keepalive-interval",
+     .op_version = GD_OP_VERSION_3_10_2,
+     .value = "2",
+     .flags = VOLOPT_FLAG_CLIENT_OPT},
+    {.key = "client.keepalive-count",
+     .voltype = "protocol/client",
+     .option = "transport.socket.keepalive-count",
+     .op_version = GD_OP_VERSION_3_10_2,
+     .value = "9",
+     .flags = VOLOPT_FLAG_CLIENT_OPT},
+    {.key = "client.strict-locks",
+     .voltype = "protocol/client",
+     .option = "strict-locks",
+     .value = "off",
+     .op_version = GD_OP_VERSION_8_0,
+     .validate_fn = validate_boolean,
+     .type = GLOBAL_DOC,
+     .description = "When set, doesn't reopen saved fds after reconnect "
+                    "if POSIX locks are held on them. Hence subsequent "
+                    "operations on these fds will fail. This is "
+                    "necessary for stricter lock complaince as bricks "
+                    "cleanup any granted locks when a client "
+                    "disconnects."},
+
+    /* Although the following option is named ta-remote-port, it will be
+     * added as remote-port in the client volfile, for ta-bricks only.
+     */
+    {.key = "client.ta-brick-port",
+     .voltype = "protocol/client",
+     .option = "ta-remote-port",
+     .op_version = GD_OP_VERSION_7_0},
+
+    /* Server xlator options */
+    {.key = "network.tcp-window-size",
+     .voltype = "protocol/server",
+     .type = NO_DOC,
+     .op_version = 1},
+    {.key = "network.inode-lru-limit",
+     .voltype = "protocol/server",
+     .op_version = 1},
+    {.key = AUTH_ALLOW_MAP_KEY,
+     .voltype = "protocol/server",
+     .option = "!server-auth",
+     .value = "*",
+     .op_version = 1},
+    {.key = AUTH_REJECT_MAP_KEY,
+     .voltype = "protocol/server",
+     .option = "!server-auth",
+     .op_version = 1},
+    {.key = "transport.keepalive",
+     .voltype = "protocol/server",
+     .option = "transport.socket.keepalive",
+     .type = NO_DOC,
+     .value = "1",
+     .op_version = 1},
+    {.key = "server.allow-insecure",
+     .voltype = "protocol/server",
+     .option = "rpc-auth-allow-insecure",
+     .type = DOC,
+     .op_version = 1},
+    {.key = "server.root-squash",
+     .voltype = "protocol/server",
+     .option = "root-squash",
+     .op_version = 2},
+    {.key = "server.all-squash",
+     .voltype = "protocol/server",
+     .option = "all-squash",
+     .op_version = GD_OP_VERSION_6_0},
+    {.key = "server.anonuid",
+     .voltype = "protocol/server",
+     .option = "anonuid",
+     .op_version = 3},
+    {.key = "server.anongid",
+     .voltype = "protocol/server",
+     .option = "anongid",
+     .op_version = 3},
+    {.key = "server.statedump-path",
+     .voltype = "protocol/server",
+     .option = "statedump-path",
+     .op_version = 1,
+     .validate_fn = validate_statedump_path},
+    {.key = "server.outstanding-rpc-limit",
+     .voltype = "protocol/server",
+     .option = "rpc.outstanding-rpc-limit",
+     .type = GLOBAL_DOC,
+     .op_version = 3},
+    {.key = "server.ssl",
+     .voltype = "protocol/server",
+     .value = "off",
+     .option = "transport.socket.ssl-enabled",
+     .description = "enable/disable server.ssl flag in the "
+                    "volume.",
+     .op_version = 2},
+    {
+        .key = "auth.ssl-allow",
+        .voltype = "protocol/server",
+        .option = "!ssl-allow",
+        .value = "*",
+        .type = DOC,
+        .description = "Allow a comma separated list of common names (CN) of "
+                       "the clients that are allowed to access the server."
+                       "By default, all TLS authenticated clients are "
+                       "allowed to access the server.",
+        .op_version = GD_OP_VERSION_3_6_0,
+    },
+    {
+        .key = "server.manage-gids",
+        .voltype = "protocol/server",
+        .op_version = GD_OP_VERSION_3_6_0,
+    },
+    {
+        .key = "server.dynamic-auth",
+        .voltype = "protocol/server",
+        .op_version = GD_OP_VERSION_3_7_5,
+    },
+    {
+        .key = "client.send-gids",
+        .voltype = "protocol/client",
+        .type = NO_DOC,
+        .op_version = GD_OP_VERSION_3_6_0,
+    },
+    {
+        .key = "server.gid-timeout",
+        .voltype = "protocol/server",
+        .op_version = GD_OP_VERSION_3_6_0,
+    },
+    {
+        .key = "server.own-thread",
+        .voltype = "protocol/server",
+        .option = "transport.socket.own-thread",
+        .type = NO_DOC,
+        .op_version = GD_OP_VERSION_3_7_0,
+    },
+    {
+        .key = "server.event-threads",
+        .voltype = "protocol/server",
+        .op_version = GD_OP_VERSION_3_7_0,
+    },
+    {
+        .key = "server.tcp-user-timeout",
+        .voltype = "protocol/server",
+        .option = "transport.tcp-user-timeout",
+        .op_version = GD_OP_VERSION_3_10_2,
+    },
+    {
+        .key = "server.keepalive-time",
+        .voltype = "protocol/server",
+        .option = "transport.socket.keepalive-time",
+        .op_version = GD_OP_VERSION_3_10_2,
+        .value = "20",
+    },
+    {
+        .key = "server.keepalive-interval",
+        .voltype = "protocol/server",
+        .option = "transport.socket.keepalive-interval",
+        .op_version = GD_OP_VERSION_3_10_2,
+        .value = "2",
+    },
+    {
+        .key = "server.keepalive-count",
+        .voltype = "protocol/server",
+        .option = "transport.socket.keepalive-count",
+        .op_version = GD_OP_VERSION_3_10_2,
+        .value = "9",
+    },
+    {
+        .key = "transport.listen-backlog",
+        .voltype = "protocol/server",
+        .option = "transport.listen-backlog",
+        .op_version = GD_OP_VERSION_3_11_1,
+        .validate_fn = validate_server_options,
+        .description = "This option uses the value of backlog argument that "
+                       "defines the maximum length to which the queue of "
+                       "pending connections for socket fd may grow.",
+        .value = "1024",
+    },
+
+    /* Generic transport options */
+    {
+        .key = SSL_OWN_CERT_OPT,
+        .voltype = "rpc-transport/socket",
+        .option = "!ssl-own-cert",
+        .op_version = GD_OP_VERSION_3_7_4,
+    },
+    {
+        .key = SSL_PRIVATE_KEY_OPT,
+        .voltype = "rpc-transport/socket",
+        .option = "!ssl-private-key",
+        .op_version = GD_OP_VERSION_3_7_4,
+    },
+    {
+        .key = SSL_CA_LIST_OPT,
+        .voltype = "rpc-transport/socket",
+        .option = "!ssl-ca-list",
+        .op_version = GD_OP_VERSION_3_7_4,
+    },
+    {
+        .key = SSL_CRL_PATH_OPT,
+        .voltype = "rpc-transport/socket",
+        .option = "!ssl-crl-path",
+        .op_version = GD_OP_VERSION_3_7_4,
+    },
+    {
+        .key = SSL_CERT_DEPTH_OPT,
+        .voltype = "rpc-transport/socket",
+        .option = "!ssl-cert-depth",
+        .op_version = GD_OP_VERSION_3_6_0,
+    },
+    {
+        .key = SSL_CIPHER_LIST_OPT,
+        .voltype = "rpc-transport/socket",
+        .option = "!ssl-cipher-list",
+        .op_version = GD_OP_VERSION_3_6_0,
+    },
+    {
+        .key = SSL_DH_PARAM_OPT,
+        .voltype = "rpc-transport/socket",
+        .option = "!ssl-dh-param",
+        .op_version = GD_OP_VERSION_3_7_4,
+    },
+    {
+        .key = SSL_EC_CURVE_OPT,
+        .voltype = "rpc-transport/socket",
+        .option = "!ssl-ec-curve",
+        .op_version = GD_OP_VERSION_3_7_4,
+    },
+    {
+        .key = "transport.address-family",
+        .voltype = "protocol/server",
+        .option = "!address-family",
+        .op_version = GD_OP_VERSION_3_7_4,
+        .type = NO_DOC,
+    },
+
+    /* Performance xlators enable/disable options */
+    {.key = "performance.write-behind",
+     .voltype = "performance/write-behind",
+     .option = "!perf",
+     .value = "on",
+     .op_version = 1,
+     .description = "enable/disable write-behind translator in the "
+                    "volume.",
+     .flags = VOLOPT_FLAG_CLIENT_OPT | VOLOPT_FLAG_XLATOR_OPT},
+    {.key = "performance.read-ahead",
+     .voltype = "performance/read-ahead",
+     .option = "!perf",
+     .value = "off",
+     .op_version = 1,
+     .description = "enable/disable read-ahead translator in the volume.",
+     .flags = VOLOPT_FLAG_CLIENT_OPT | VOLOPT_FLAG_XLATOR_OPT},
+    {.key = "performance.readdir-ahead",
+     .voltype = "performance/readdir-ahead",
+     .option = "!perf",
+     .value = "off",
+     .op_version = 3,
+     .description = "enable/disable readdir-ahead translator in the volume.",
+     .flags = VOLOPT_FLAG_CLIENT_OPT | VOLOPT_FLAG_XLATOR_OPT},
+    {.key = "performance.io-cache",
+     .voltype = "performance/io-cache",
+     .option = "!perf",
+     .value = "off",
+     .op_version = 1,
+     .description = "enable/disable io-cache translator in the volume.",
+     .flags = VOLOPT_FLAG_CLIENT_OPT},
+    {.key = "performance.open-behind",
+     .voltype = "performance/open-behind",
+     .option = "!perf",
+     .value = "on",
+     .op_version = 2,
+     .description = "enable/disable open-behind translator in the volume.",
+     .flags = VOLOPT_FLAG_CLIENT_OPT | VOLOPT_FLAG_XLATOR_OPT
+
+    },
+    {.key = "performance.quick-read",
+     .voltype = "performance/quick-read",
+     .option = "!perf",
+     .value = "on",
+     .op_version = 1,
+     .description = "enable/disable quick-read translator in the volume.",
+     .flags = VOLOPT_FLAG_CLIENT_OPT | VOLOPT_FLAG_XLATOR_OPT},
+    {.key = "performance.nl-cache",
+     .voltype = "performance/nl-cache",
+     .option = "!perf",
+     .value = "off",
+     .op_version = GD_OP_VERSION_3_11_0,
+     .description = "enable/disable negative entry caching translator in "
+                    "the volume. Enabling this option improves performance"
+                    " of 'create file/directory' workload",
+     .flags = VOLOPT_FLAG_CLIENT_OPT | VOLOPT_FLAG_XLATOR_OPT},
+    {.key = "performance.stat-prefetch",
+     .voltype = "performance/md-cache",
+     .option = "!perf",
+     .value = "on",
+     .op_version = 1,
+     .description = "enable/disable meta-data caching translator in the "
+                    "volume.",
+     .flags = VOLOPT_FLAG_CLIENT_OPT | VOLOPT_FLAG_XLATOR_OPT},
+    {.key = "performance.client-io-threads",
+     .voltype = "performance/io-threads",
+     .option = "!perf",
+     .value = "on",
+     .op_version = 1,
+     .description = "enable/disable io-threads translator in the client "
+                    "graph of volume.",
+     .flags = VOLOPT_FLAG_CLIENT_OPT | VOLOPT_FLAG_XLATOR_OPT},
+    {.key = "performance.nfs.write-behind",
+     .voltype = "performance/write-behind",
+     .option = "!nfsperf",
+     .value = "on",
+     .op_version = 1,
+     .description = "enable/disable write-behind translator in the volume",
+     .flags = VOLOPT_FLAG_XLATOR_OPT},
+    {.key = "performance.nfs.read-ahead",
+     .voltype = "performance/read-ahead",
+     .option = "!nfsperf",
+     .value = "off",
+     .type = NO_DOC,
+     .op_version = 1,
+     .flags = VOLOPT_FLAG_XLATOR_OPT},
+    {.key = "performance.nfs.io-cache",
+     .voltype = "performance/io-cache",
+     .option = "!nfsperf",
+     .value = "off",
+     .type = NO_DOC,
+     .op_version = 1,
+     .flags = VOLOPT_FLAG_XLATOR_OPT},
+    {.key = "performance.nfs.quick-read",
+     .voltype = "performance/quick-read",
+     .option = "!nfsperf",
+     .value = "off",
+     .type = NO_DOC,
+     .op_version = 1,
+     .flags = VOLOPT_FLAG_XLATOR_OPT},
+    {.key = "performance.nfs.stat-prefetch",
+     .voltype = "performance/md-cache",
+     .option = "!nfsperf",
+     .value = "off",
+     .type = NO_DOC,
+     .op_version = 1,
+     .flags = VOLOPT_FLAG_XLATOR_OPT},
+    {.key = "performance.nfs.io-threads",
+     .voltype = "performance/io-threads",
+     .option = "!nfsperf",
+     .value = "off",
+     .type = NO_DOC,
+     .op_version = 1,
+     .flags = VOLOPT_FLAG_XLATOR_OPT},
+    {.key = "performance.force-readdirp",
+     .voltype = "performance/md-cache",
+     .option = "force-readdirp",
+     .op_version = 2,
+     .flags = VOLOPT_FLAG_CLIENT_OPT},
+    {.key = "performance.cache-invalidation",
+     .voltype = "performance/md-cache",
+     .option = "cache-invalidation",
+     .op_version = GD_OP_VERSION_3_9_0,
+     .flags = VOLOPT_FLAG_CLIENT_OPT},
+
+    {.key = "performance.global-cache-invalidation",
+     .voltype = "performance/md-cache",
+     .option = "global-cache-invalidation",
+     .op_version = GD_OP_VERSION_6_0,
+     .flags = VOLOPT_FLAG_CLIENT_OPT},
+
+    /* Feature translators */
+    {.key = "features.uss",
+     .voltype = "features/snapview-server",
+     .op_version = GD_OP_VERSION_3_6_0,
+     .value = "off",
+     .flags = VOLOPT_FLAG_CLIENT_OPT | VOLOPT_FLAG_XLATOR_OPT,
+     .validate_fn = validate_uss,
+     .description = "enable/disable User Serviceable Snapshots on the "
+                    "volume."},
+
+    {.key = "features.snapshot-directory",
+     .voltype = "features/snapview-client",
+     .op_version = GD_OP_VERSION_3_6_0,
+     .value = ".snaps",
+     .flags = VOLOPT_FLAG_CLIENT_OPT | VOLOPT_FLAG_XLATOR_OPT,
+     .validate_fn = validate_uss_dir,
+     .description = "Entry point directory for entering snapshot world. "
+                    "Value can have only [0-9a-z-_] and starts with "
+                    "dot (.) and cannot exceed 255 character"},
+
+    {.key = "features.show-snapshot-directory",
+     .voltype = "features/snapview-client",
+     .op_version = GD_OP_VERSION_3_6_0,
+     .value = "off",
+     .flags = VOLOPT_FLAG_CLIENT_OPT | VOLOPT_FLAG_XLATOR_OPT,
+     .description = "show entry point in readdir output of "
+                    "snapdir-entry-path which is set by samba"},
+
+    {.key = "features.tag-namespaces",
+     .voltype = "features/namespace",
+     .op_version = GD_OP_VERSION_4_1_0,
+     .option = "tag-namespaces",
+     .value = "off",
+     .flags = OPT_FLAG_CLIENT_OPT,
+     .description = "This option enables this translator's functionality "
+                    "that tags every fop with a namespace hash for later "
+                    "throttling, stats collection, logging, etc."},
+
+#ifdef HAVE_LIB_Z
+    /* Compressor-decompressor xlator options
+     * defaults used from xlators/features/compress/src/cdc.h
+     */
+    {.key = "network.compression",
+     .voltype = "features/cdc",
+     .option = "!feat",
+     .value = "off",
+     .op_version = 3,
+     .description = "enable/disable network compression translator",
+     .flags = VOLOPT_FLAG_XLATOR_OPT},
+    {.key = "network.compression.window-size",
+     .voltype = "features/cdc",
+     .option = "window-size",
+     .op_version = 3},
+    {.key = "network.compression.mem-level",
+     .voltype = "features/cdc",
+     .option = "mem-level",
+     .op_version = 3},
+    {.key = "network.compression.min-size",
+     .voltype = "features/cdc",
+     .option = "min-size",
+     .op_version = 3},
+    {.key = "network.compression.compression-level",
+     .voltype = "features/cdc",
+     .option = "compression-level",
+     .op_version = 3},
+    {.key = "network.compression.debug",
+     .voltype = "features/cdc",
+     .option = "debug",
+     .type = NO_DOC,
+     .op_version = 3},
+#endif
+
+    /* Quota xlator options */
+    {
+        .key = VKEY_FEATURES_LIMIT_USAGE,
+        .voltype = "features/quota",
+        .option = "limit-set",
+        .type = NO_DOC,
+        .op_version = 1,
+    },
+    {
+        .key = "features.default-soft-limit",
+        .voltype = "features/quota",
+        .option = "default-soft-limit",
+        .type = NO_DOC,
+        .op_version = 3,
+    },
+    {
+        .key = "features.soft-timeout",
+        .voltype = "features/quota",
+        .option = "soft-timeout",
+        .type = NO_DOC,
+        .op_version = 3,
+    },
+    {
+        .key = "features.hard-timeout",
+        .voltype = "features/quota",
+        .option = "hard-timeout",
+        .type = NO_DOC,
+        .op_version = 3,
+    },
+    {
+        .key = "features.alert-time",
+        .voltype = "features/quota",
+        .option = "alert-time",
+        .type = NO_DOC,
+        .op_version = 3,
+    },
+    {
+        .key = "features.quota-deem-statfs",
+        .voltype = "features/quota",
+        .option = "deem-statfs",
+        .value = "off",
+        .type = DOC,
+        .op_version = 2,
+        .validate_fn = validate_quota,
+    },
+
+    /* Marker xlator options */
+    {.key = VKEY_MARKER_XTIME,
+     .voltype = "features/marker",
+     .option = "xtime",
+     .value = "off",
+     .type = NO_DOC,
+     .flags = VOLOPT_FLAG_FORCE,
+     .op_version = 1},
+    {.key = VKEY_MARKER_XTIME,
+     .voltype = "features/marker",
+     .option = "!xtime",
+     .value = "off",
+     .type = NO_DOC,
+     .flags = VOLOPT_FLAG_FORCE,
+     .op_version = 1},
+    {.key = VKEY_MARKER_XTIME_FORCE,
+     .voltype = "features/marker",
+     .option = "gsync-force-xtime",
+     .value = "off",
+     .type = NO_DOC,
+     .flags = VOLOPT_FLAG_FORCE,
+     .op_version = 2},
+    {.key = VKEY_MARKER_XTIME_FORCE,
+     .voltype = "features/marker",
+     .option = "!gsync-force-xtime",
+     .value = "off",
+     .type = NO_DOC,
+     .flags = VOLOPT_FLAG_FORCE,
+     .op_version = 2},
+    {.key = VKEY_FEATURES_QUOTA,
+     .voltype = "features/marker",
+     .option = "quota",
+     .value = "off",
+     .type = NO_DOC,
+     .flags = VOLOPT_FLAG_NEVER_RESET,
+     .op_version = 1},
+    {.key = VKEY_FEATURES_INODE_QUOTA,
+     .voltype = "features/marker",
+     .option = "inode-quota",
+     .value = "off",
+     .type = NO_DOC,
+     .flags = VOLOPT_FLAG_NEVER_RESET,
+     .op_version = 1},
+    {.key = VKEY_FEATURES_BITROT,
+     .voltype = "features/bit-rot",
+     .option = "bitrot",
+     .value = "disable",
+     .type = NO_DOC,
+     .flags = VOLOPT_FLAG_FORCE,
+     .op_version = GD_OP_VERSION_3_7_0},
+
+    /* Debug xlators options */
+    {.key = "debug.trace",
+     .voltype = "debug/trace",
+     .option = "!debug",
+     .value = "off",
+     .type = NO_DOC,
+     .op_version = 1,
+     .flags = VOLOPT_FLAG_XLATOR_OPT},
+    {.key = "debug.log-history",
+     .voltype = "debug/trace",
+     .option = "log-history",
+     .type = NO_DOC,
+     .op_version = 2},
+    {.key = "debug.log-file",
+     .voltype = "debug/trace",
+     .option = "log-file",
+     .type = NO_DOC,
+     .op_version = 2},
+    {.key = "debug.exclude-ops",
+     .voltype = "debug/trace",
+     .option = "exclude-ops",
+     .type = NO_DOC,
+     .op_version = 2},
+    {.key = "debug.include-ops",
+     .voltype = "debug/trace",
+     .option = "include-ops",
+     .type = NO_DOC,
+     .op_version = 2},
+    {.key = "debug.error-gen",
+     .voltype = "debug/error-gen",
+     .option = "!debug",
+     .value = "off",
+     .type = NO_DOC,
+     .op_version = 1,
+     .flags = VOLOPT_FLAG_XLATOR_OPT},
+    {.key = "debug.error-failure",
+     .voltype = "debug/error-gen",
+     .option = "failure",
+     .type = NO_DOC,
+     .op_version = 3},
+    {.key = "debug.error-number",
+     .voltype = "debug/error-gen",
+     .option = "error-no",
+     .type = NO_DOC,
+     .op_version = 3},
+    {.key = "debug.random-failure",
+     .voltype = "debug/error-gen",
+     .option = "random-failure",
+     .type = NO_DOC,
+     .op_version = 3},
+    {.key = "debug.error-fops",
+     .voltype = "debug/error-gen",
+     .option = "enable",
+     .type = NO_DOC,
+     .op_version = 3},
+
+    /* NFS xlator options */
+    {.key = "nfs.enable-ino32",
+     .voltype = "nfs/server",
+     .option = "nfs.enable-ino32",
+     .type = GLOBAL_DOC,
+     .op_version = 1},
+    {.key = "nfs.mem-factor",
+     .voltype = "nfs/server",
+     .option = "nfs.mem-factor",
+     .type = GLOBAL_DOC,
+     .op_version = 1},
+    {.key = "nfs.export-dirs",
+     .voltype = "nfs/server",
+     .option = "nfs3.export-dirs",
+     .type = GLOBAL_DOC,
+     .op_version = 1},
+    {.key = "nfs.export-volumes",
+     .voltype = "nfs/server",
+     .option = "nfs3.export-volumes",
+     .type = GLOBAL_DOC,
+     .op_version = 1},
+    {.key = "nfs.addr-namelookup",
+     .voltype = "nfs/server",
+     .option = "rpc-auth.addr.namelookup",
+     .type = GLOBAL_DOC,
+     .op_version = 1},
+    {.key = "nfs.dynamic-volumes",
+     .voltype = "nfs/server",
+     .option = "nfs.dynamic-volumes",
+     .type = GLOBAL_DOC,
+     .op_version = 1},
+    {.key = "nfs.register-with-portmap",
+     .voltype = "nfs/server",
+     .option = "rpc.register-with-portmap",
+     .type = GLOBAL_DOC,
+     .op_version = 1},
+    {.key = "nfs.outstanding-rpc-limit",
+     .voltype = "nfs/server",
+     .option = "rpc.outstanding-rpc-limit",
+     .type = GLOBAL_DOC,
+     .op_version = 3},
+    {.key = "nfs.port",
+     .voltype = "nfs/server",
+     .option = "nfs.port",
+     .type = GLOBAL_DOC,
+     .op_version = 1},
+    {.key = "nfs.rpc-auth-unix",
+     .voltype = "nfs/server",
+     .option = "!rpc-auth.auth-unix.*",
+     .op_version = 1},
+    {.key = "nfs.rpc-auth-null",
+     .voltype = "nfs/server",
+     .option = "!rpc-auth.auth-null.*",
+     .op_version = 1},
+    {.key = "nfs.rpc-auth-allow",
+     .voltype = "nfs/server",
+     .option = "!rpc-auth.addr.*.allow",
+     .op_version = 1},
+    {.key = "nfs.rpc-auth-reject",
+     .voltype = "nfs/server",
+     .option = "!rpc-auth.addr.*.reject",
+     .op_version = 1},
+    {.key = "nfs.ports-insecure",
+     .voltype = "nfs/server",
+     .option = "!rpc-auth.ports.*.insecure",
+     .op_version = 1},
+    {.key = "nfs.transport-type",
+     .voltype = "nfs/server",
+     .option = "!nfs.transport-type",
+     .op_version = 1,
+     .description = "Specifies the nfs transport type. Valid "
+                    "transport types are 'tcp' and 'rdma'."},
+    {.key = "nfs.trusted-sync",
+     .voltype = "nfs/server",
+     .option = "!nfs3.*.trusted-sync",
+     .op_version = 1},
+    {.key = "nfs.trusted-write",
+     .voltype = "nfs/server",
+     .option = "!nfs3.*.trusted-write",
+     .op_version = 1},
+    {.key = "nfs.volume-access",
+     .voltype = "nfs/server",
+     .option = "!nfs3.*.volume-access",
+     .op_version = 1},
+    {.key = "nfs.export-dir",
+     .voltype = "nfs/server",
+     .option = "!nfs3.*.export-dir",
+     .op_version = 1},
+    {.key = NFS_DISABLE_MAP_KEY,
+     .voltype = "nfs/server",
+     .option = "!nfs-disable",
+     .value = SITE_H_NFS_DISABLE,
+     .op_version = 1},
+    {.key = "nfs.nlm",
+     .voltype = "nfs/server",
+     .option = "nfs.nlm",
+     .type = GLOBAL_DOC,
+     .op_version = 1},
+    {.key = "nfs.acl",
+     .voltype = "nfs/server",
+     .option = "nfs.acl",
+     .type = GLOBAL_DOC,
+     .op_version = 3},
+    {.key = "nfs.mount-udp",
+     .voltype = "nfs/server",
+     .option = "nfs.mount-udp",
+     .type = GLOBAL_DOC,
+     .op_version = 1},
+    {.key = "nfs.mount-rmtab",
+     .voltype = "nfs/server",
+     .option = "nfs.mount-rmtab",
+     .type = GLOBAL_DOC,
+     .op_version = 1},
+    {
+        .key = "nfs.rpc-statd",
+        .voltype = "nfs/server",
+        .option = "nfs.rpc-statd",
+        .type = NO_DOC,
+        .op_version = GD_OP_VERSION_3_6_0,
+    },
+    {
+        .key = "nfs.log-level",
+        .voltype = "nfs/server",
+        .option = "nfs.log-level",
+        .type = NO_DOC,
+        .op_version = GD_OP_VERSION_3_6_0,
+    },
+    {.key = "nfs.server-aux-gids",
+     .voltype = "nfs/server",
+     .option = "nfs.server-aux-gids",
+     .type = NO_DOC,
+     .op_version = 2},
+    {.key = "nfs.drc",
+     .voltype = "nfs/server",
+     .option = "nfs.drc",
+     .type = GLOBAL_DOC,
+     .op_version = 3},
+    {.key = "nfs.drc-size",
+     .voltype = "nfs/server",
+     .option = "nfs.drc-size",
+     .type = GLOBAL_DOC,
+     .op_version = 3},
+    {.key = "nfs.read-size",
+     .voltype = "nfs/server",
+     .option = "nfs3.read-size",
+     .type = GLOBAL_DOC,
+     .op_version = 3},
+    {.key = "nfs.write-size",
+     .voltype = "nfs/server",
+     .option = "nfs3.write-size",
+     .type = GLOBAL_DOC,
+     .op_version = 3},
+    {.key = "nfs.readdir-size",
+     .voltype = "nfs/server",
+     .option = "nfs3.readdir-size",
+     .type = GLOBAL_DOC,
+     .op_version = 3},
+    {.key = "nfs.rdirplus",
+     .voltype = "nfs/server",
+     .option = "nfs.rdirplus",
+     .type = GLOBAL_DOC,
+     .op_version = GD_OP_VERSION_3_7_12,
+     .description = "When this option is set to off NFS falls back to "
+                    "standard readdir instead of readdirp"},
+    {
+        .key = "nfs.event-threads",
+        .voltype = "nfs/server",
+        .option = "nfs.event-threads",
+        .type = NO_DOC,
+        .op_version = GD_OP_VERSION_4_0_0,
+    },
+
+    /* Cli options for Export authentication on nfs mount */
+    {.key = "nfs.exports-auth-enable",
+     .voltype = "nfs/server",
+     .option = "nfs.exports-auth-enable",
+     .type = GLOBAL_DOC,
+     .op_version = GD_OP_VERSION_3_7_0},
+    {.key = "nfs.auth-refresh-interval-sec",
+     .voltype = "nfs/server",
+     .option = "nfs.auth-refresh-interval-sec",
+     .type = GLOBAL_DOC,
+     .op_version = GD_OP_VERSION_3_7_0},
+    {.key = "nfs.auth-cache-ttl-sec",
+     .voltype = "nfs/server",
+     .option = "nfs.auth-cache-ttl-sec",
+     .type = GLOBAL_DOC,
+     .op_version = GD_OP_VERSION_3_7_0},
+
+    /* Other options which don't fit any place above */
+    {.key = "features.read-only",
+     .voltype = "features/read-only",
+     .option = "read-only",
+     .op_version = 1,
+     .flags = VOLOPT_FLAG_CLIENT_OPT | VOLOPT_FLAG_XLATOR_OPT},
+    {.key = "features.worm",
+     .voltype = "features/worm",
+     .option = "worm",
+     .value = "off",
+     .validate_fn = validate_boolean,
+     .op_version = 2,
+     .flags = VOLOPT_FLAG_CLIENT_OPT | VOLOPT_FLAG_XLATOR_OPT},
+    {.key = "features.worm-file-level",
+     .voltype = "features/worm",
+     .option = "worm-file-level",
+     .value = "off",
+     .validate_fn = validate_boolean,
+     .op_version = GD_OP_VERSION_3_8_0,
+     .flags = VOLOPT_FLAG_CLIENT_OPT | VOLOPT_FLAG_XLATOR_OPT},
+    {.key = "features.worm-files-deletable",
+     .voltype = "features/worm",
+     .option = "worm-files-deletable",
+     .value = "on",
+     .validate_fn = validate_boolean,
+     .op_version = GD_OP_VERSION_3_13_0,
+     .flags = VOLOPT_FLAG_CLIENT_OPT | VOLOPT_FLAG_XLATOR_OPT},
+    {
+        .key = "features.default-retention-period",
+        .voltype = "features/worm",
+        .option = "default-retention-period",
+        .validate_fn = validate_worm_period,
+        .op_version = GD_OP_VERSION_3_8_0,
+    },
+    {
+        .key = "features.retention-mode",
+        .voltype = "features/worm",
+        .option = "retention-mode",
+        .validate_fn = validate_reten_mode,
+        .op_version = GD_OP_VERSION_3_8_0,
+    },
+    {
+        .key = "features.auto-commit-period",
+        .voltype = "features/worm",
+        .option = "auto-commit-period",
+        .validate_fn = validate_worm_period,
+        .op_version = GD_OP_VERSION_3_8_0,
+    },
+    {.key = "storage.linux-aio", .voltype = "storage/posix", .op_version = 1},
+    {.key = "storage.batch-fsync-mode",
+     .voltype = "storage/posix",
+     .op_version = 3},
+    {.key = "storage.batch-fsync-delay-usec",
+     .voltype = "storage/posix",
+     .op_version = 3},
+    {
+        .key = "storage.xattr-user-namespace-mode",
+        .voltype = "storage/posix",
+        .op_version = GD_OP_VERSION_3_6_0,
+    },
+    {.key = "storage.owner-uid",
+     .voltype = "storage/posix",
+     .option = "brick-uid",
+     .op_version = 1},
+    {.key = "storage.owner-gid",
+     .voltype = "storage/posix",
+     .option = "brick-gid",
+     .op_version = 1},
+    {.key = "storage.node-uuid-pathinfo",
+     .voltype = "storage/posix",
+     .op_version = 3},
+    {.key = "storage.health-check-interval",
+     .voltype = "storage/posix",
+     .op_version = 3},
+    {
+        .option = "update-link-count-parent",
+        .key = "storage.build-pgfid",
+        .voltype = "storage/posix",
+        .op_version = GD_OP_VERSION_3_6_0,
+    },
+    {
+        .option = "gfid2path",
+        .key = "storage.gfid2path",
+        .type = NO_DOC,
+        .voltype = "storage/posix",
+        .op_version = GD_OP_VERSION_3_12_0,
+    },
+    {
+        .option = "gfid2path-separator",
+        .key = "storage.gfid2path-separator",
+        .voltype = "storage/posix",
+        .op_version = GD_OP_VERSION_3_12_0,
+    },
+    {
+        .key = "storage.reserve",
+        .voltype = "storage/posix",
+        .op_version = GD_OP_VERSION_3_13_0,
+    },
+    {
+        .option = "health-check-timeout",
+        .key = "storage.health-check-timeout",
+        .type = NO_DOC,
+        .voltype = "storage/posix",
+        .op_version = GD_OP_VERSION_4_0_0,
+    },
+    {
+        .option = "fips-mode-rchecksum",
+        .key = "storage.fips-mode-rchecksum",
+        .type = NO_DOC,
+        .voltype = "storage/posix",
+        .op_version = GD_OP_VERSION_4_0_0,
+    },
+    {
+        .option = "force-create-mode",
+        .key = "storage.force-create-mode",
+        .voltype = "storage/posix",
+        .op_version = GD_OP_VERSION_4_0_0,
+    },
+    {
+        .option = "force-directory-mode",
+        .key = "storage.force-directory-mode",
+        .voltype = "storage/posix",
+        .op_version = GD_OP_VERSION_4_0_0,
+    },
+    {
+        .option = "create-mask",
+        .key = "storage.create-mask",
+        .voltype = "storage/posix",
+        .op_version = GD_OP_VERSION_4_0_0,
+    },
+    {
+        .option = "create-directory-mask",
+        .key = "storage.create-directory-mask",
+        .voltype = "storage/posix",
+        .op_version = GD_OP_VERSION_4_0_0,
+    },
+    {
+        .option = "max-hardlinks",
+        .key = "storage.max-hardlinks",
+        .voltype = "storage/posix",
+        .op_version = GD_OP_VERSION_4_0_0,
+    },
+    {
+        .option = "ctime",
+        .key = "features.ctime",
+        .voltype = "storage/posix",
+        .op_version = GD_OP_VERSION_4_1_0,
+    },
+    {.key = "config.memory-accounting",
+     .voltype = "mgmt/glusterd",
+     .option = "!config",
+     .op_version = 2,
+     .flags = VOLOPT_FLAG_CLIENT_OPT},
+    {.key = "config.transport",
+     .voltype = "mgmt/glusterd",
+     .option = "!config",
+     .op_version = 2},
+    {.key = VKEY_CONFIG_GFPROXY,
+     .voltype = "configuration",
+     .option = "gfproxyd",
+     .value = "off",
+     .type = DOC,
+     .op_version = GD_OP_VERSION_3_13_0,
+     .description = "If this option is enabled, the proxy client daemon "
+                    "called gfproxyd will be started on all the trusted "
+                    "storage pool nodes"},
+    {.key = GLUSTERD_QUORUM_TYPE_KEY,
+     .voltype = "mgmt/glusterd",
+     .value = "off",
+     .op_version = 2},
+    {.key = GLUSTERD_QUORUM_RATIO_KEY,
+     .voltype = "mgmt/glusterd",
+     .value = "51",
+     .op_version = 2},
+    /* changelog translator - global tunables */
+    {.key = "changelog.changelog",
+     .voltype = "features/changelog",
+     .type = NO_DOC,
+     .op_version = 3},
+    {.key = "changelog.changelog-dir",
+     .voltype = "features/changelog",
+     .type = NO_DOC,
+     .op_version = 3},
+    {.key = "changelog.encoding",
+     .voltype = "features/changelog",
+     .type = NO_DOC,
+     .op_version = 3},
+    {.key = "changelog.rollover-time",
+     .voltype = "features/changelog",
+     .type = NO_DOC,
+     .op_version = 3},
+    {.key = "changelog.fsync-interval",
+     .voltype = "features/changelog",
+     .type = NO_DOC,
+     .op_version = 3},
+    {
+        .key = "changelog.changelog-barrier-timeout",
+        .voltype = "features/changelog",
+        .value = BARRIER_TIMEOUT,
+        .op_version = GD_OP_VERSION_3_6_0,
+    },
+    {.key = "changelog.capture-del-path",
+     .voltype = "features/changelog",
+     .type = NO_DOC,
+     .op_version = 3},
+    {
+        .key = "features.barrier",
+        .voltype = "features/barrier",
+        .value = "disable",
+        .type = NO_DOC,
+        .op_version = GD_OP_VERSION_3_7_0,
+    },
+    {
+        .key = "features.barrier-timeout",
+        .voltype = "features/barrier",
+        .value = BARRIER_TIMEOUT,
+        .op_version = GD_OP_VERSION_3_6_0,
+    },
+    {
+        .key = GLUSTERD_GLOBAL_OP_VERSION_KEY,
+        .voltype = "mgmt/glusterd",
+        .op_version = GD_OP_VERSION_3_6_0,
+    },
+    {
+        .key = GLUSTERD_MAX_OP_VERSION_KEY,
+        .voltype = "mgmt/glusterd",
+        .op_version = GD_OP_VERSION_3_10_0,
+    },
+    /* Trash translator options */
+    {
+        .key = "features.trash",
+        .voltype = "features/trash",
+        .op_version = GD_OP_VERSION_3_7_0,
+    },
+    {
+        .key = "features.trash-dir",
+        .voltype = "features/trash",
+        .op_version = GD_OP_VERSION_3_7_0,
+    },
+    {
+        .key = "features.trash-eliminate-path",
+        .voltype = "features/trash",
+        .op_version = GD_OP_VERSION_3_7_0,
+    },
+    {
+        .key = "features.trash-max-filesize",
+        .voltype = "features/trash",
+        .op_version = GD_OP_VERSION_3_7_0,
+    },
+    {
+        .key = "features.trash-internal-op",
+        .voltype = "features/trash",
+        .op_version = GD_OP_VERSION_3_7_0,
+    },
+    {.key = GLUSTERD_SHARED_STORAGE_KEY,
+     .voltype = "mgmt/glusterd",
+     .value = "disable",
+     .type = GLOBAL_DOC,
+     .op_version = GD_OP_VERSION_3_7_1,
+     .description = "Create and mount the shared storage volume"
+                    "(gluster_shared_storage) at "
+                    "/var/run/gluster/shared_storage on enabling this "
+                    "option. Unmount and delete the shared storage volume "
+                    " on disabling this option."},
+    {
+        .key = "locks.trace",
+        .voltype = "features/locks",
+        .op_version = GD_OP_VERSION_3_7_0,
+    },
+    {
+        .key = "locks.mandatory-locking",
+        .voltype = "features/locks",
+        .op_version = GD_OP_VERSION_3_8_0,
+        .validate_fn = validate_mandatory_locking,
+    },
+    {.key = "cluster.disperse-self-heal-daemon",
+     .voltype = "cluster/disperse",
+     .type = NO_DOC,
+     .option = "self-heal-daemon",
+     .op_version = GD_OP_VERSION_3_7_0,
+     .validate_fn = validate_disperse_heal_enable_disable},
+    {.key = "cluster.quorum-reads",
+     .voltype = "cluster/replicate",
+     .op_version = GD_OP_VERSION_3_7_0,
+     .flags = VOLOPT_FLAG_CLIENT_OPT},
+    {.key = "client.bind-insecure",
+     .voltype = "protocol/client",
+     .option = "client-bind-insecure",
+     .type = NO_DOC,
+     .op_version = GD_OP_VERSION_3_7_0,
+     .flags = VOLOPT_FLAG_CLIENT_OPT},
+    {.key = "features.timeout",
+     .voltype = "features/quiesce",
+     .option = "timeout",
+     .op_version = GD_OP_VERSION_4_0_0,
+     .flags = VOLOPT_FLAG_CLIENT_OPT,
+     .description = "Specifies the number of seconds the "
+                    "quiesce translator will wait "
+                    "for a CHILD_UP event before "
+                    "force-unwinding the frames it has "
+                    "currently stored for retry."},
+    {.key = "features.failover-hosts",
+     .voltype = "features/quiesce",
+     .option = "failover-hosts",
+     .op_version = GD_OP_VERSION_4_0_0,
+     .flags = VOLOPT_FLAG_CLIENT_OPT,
+     .description = "It is a comma separated list of hostname/IP "
+                    "addresses. It Specifies the list of hosts where "
+                    "the gfproxy daemons are running, to which the "
+                    "the thin clients can failover to."},
+    {.key = "features.shard",
+     .voltype = "features/shard",
+     .value = "off",
+     .option = "!shard",
+     .op_version = GD_OP_VERSION_3_7_0,
+     .description = "enable/disable sharding translator on the volume.",
+     .flags = VOLOPT_FLAG_CLIENT_OPT | VOLOPT_FLAG_XLATOR_OPT},
+    {.key = "features.shard-block-size",
+     .voltype = "features/shard",
+     .op_version = GD_OP_VERSION_3_7_0,
+     .flags = VOLOPT_FLAG_CLIENT_OPT},
+    {
+        .key = "features.shard-lru-limit",
+        .voltype = "features/shard",
+        .op_version = GD_OP_VERSION_5_0,
+        .flags = VOLOPT_FLAG_CLIENT_OPT,
+        .type = NO_DOC,
+    },
+    {.key = "features.shard-deletion-rate",
+     .voltype = "features/shard",
+     .op_version = GD_OP_VERSION_5_0,
+     .flags = VOLOPT_FLAG_CLIENT_OPT},
+    {
+        .key = "features.scrub-throttle",
+        .voltype = "features/bit-rot",
+        .value = "lazy",
+        .option = "scrub-throttle",
+        .op_version = GD_OP_VERSION_3_7_0,
+        .type = NO_DOC,
+    },
+    {
+        .key = "features.scrub-freq",
+        .voltype = "features/bit-rot",
+        .value = "biweekly",
+        .option = "scrub-frequency",
+        .op_version = GD_OP_VERSION_3_7_0,
+        .type = NO_DOC,
+    },
+    {
+        .key = "features.scrub",
+        .voltype = "features/bit-rot",
+        .option = "scrubber",
+        .op_version = GD_OP_VERSION_3_7_0,
+        .flags = VOLOPT_FLAG_FORCE,
+        .type = NO_DOC,
+    },
+    {
+        .key = "features.expiry-time",
+        .voltype = "features/bit-rot",
+        .value = SIGNING_TIMEOUT,
+        .option = "expiry-time",
+        .op_version = GD_OP_VERSION_3_7_0,
+        .type = NO_DOC,
+    },
+    {
+        .key = "features.signer-threads",
+        .voltype = "features/bit-rot",
+        .value = BR_WORKERS,
+        .option = "signer-threads",
+        .op_version = GD_OP_VERSION_8_0,
+        .type = NO_DOC,
+    },
+    /* Upcall translator options */
+    /* Upcall translator options */
+    {
+        .key = "features.cache-invalidation",
+        .voltype = "features/upcall",
+        .value = "off",
+        .op_version = GD_OP_VERSION_3_7_0,
+    },
+    {
+        .key = "features.cache-invalidation-timeout",
+        .voltype = "features/upcall",
+        .op_version = GD_OP_VERSION_3_7_0,
+    },
+    {
+        .key = "ganesha.enable",
+        .voltype = "mgmt/ganesha",
+        .value = "off",
+        .option = "ganesha.enable",
+        .op_version = GD_OP_VERSION_7_0,
+    },
+    /* Lease translator options */
+    {
+        .key = "features.leases",
+        .voltype = "features/leases",
+        .value = "off",
+        .op_version = GD_OP_VERSION_3_8_0,
+    },
+    {
+        .key = "features.lease-lock-recall-timeout",
+        .voltype = "features/leases",
+        .op_version = GD_OP_VERSION_3_8_0,
+    },
+    {.key = "disperse.background-heals",
+     .voltype = "cluster/disperse",
+     .op_version = GD_OP_VERSION_3_7_3,
+     .flags = VOLOPT_FLAG_CLIENT_OPT},
+    {.key = "disperse.heal-wait-qlength",
+     .voltype = "cluster/disperse",
+     .op_version = GD_OP_VERSION_3_7_3,
+     .flags = VOLOPT_FLAG_CLIENT_OPT},
+    {
+        .key = "cluster.heal-timeout",
+        .voltype = "cluster/disperse",
+        .option = "!heal-timeout",
+        .op_version = GD_OP_VERSION_3_7_3,
+        .type = NO_DOC,
+    },
+    {.key = "dht.force-readdirp",
+     .voltype = "cluster/distribute",
+     .option = "use-readdirp",
+     .op_version = GD_OP_VERSION_3_7_5,
+     .flags = VOLOPT_FLAG_CLIENT_OPT},
+    {.key = "disperse.read-policy",
+     .voltype = "cluster/disperse",
+     .op_version = GD_OP_VERSION_3_7_6,
+     .flags = VOLOPT_FLAG_CLIENT_OPT},
+    {.key = "cluster.shd-max-threads",
+     .voltype = "cluster/replicate",
+     .op_version = GD_OP_VERSION_3_7_12,
+     .flags = VOLOPT_FLAG_CLIENT_OPT,
+     .validate_fn = validate_replica},
+    {.key = "cluster.shd-wait-qlength",
+     .voltype = "cluster/replicate",
+     .op_version = GD_OP_VERSION_3_7_12,
+     .flags = VOLOPT_FLAG_CLIENT_OPT},
+    {.key = "cluster.locking-scheme",
+     .voltype = "cluster/replicate",
+     .type = DOC,
+     .op_version = GD_OP_VERSION_3_7_12,
+     .flags = VOLOPT_FLAG_CLIENT_OPT},
+    {.key = "cluster.granular-entry-heal",
+     .voltype = "cluster/replicate",
+     .type = DOC,
+     .op_version = GD_OP_VERSION_3_8_0,
+     .flags = VOLOPT_FLAG_CLIENT_OPT},
+    {
+        .option = "revocation-secs",
+        .key = "features.locks-revocation-secs",
+        .voltype = "features/locks",
+        .op_version = GD_OP_VERSION_3_9_0,
+    },
+    {
+        .option = "revocation-clear-all",
+        .key = "features.locks-revocation-clear-all",
+        .voltype = "features/locks",
+        .op_version = GD_OP_VERSION_3_9_0,
+    },
+    {
+        .option = "revocation-max-blocked",
+        .key = "features.locks-revocation-max-blocked",
+        .voltype = "features/locks",
+        .op_version = GD_OP_VERSION_3_9_0,
+    },
+    {
+        .option = "monkey-unlocking",
+        .key = "features.locks-monkey-unlocking",
+        .voltype = "features/locks",
+        .op_version = GD_OP_VERSION_3_9_0,
+        .type = NO_DOC,
+    },
+    {
+        .option = "notify-contention",
+        .key = "features.locks-notify-contention",
+        .voltype = "features/locks",
+        .op_version = GD_OP_VERSION_4_0_0,
+    },
+    {
+        .option = "notify-contention-delay",
+        .key = "features.locks-notify-contention-delay",
+        .voltype = "features/locks",
+        .op_version = GD_OP_VERSION_4_0_0,
+    },
+    {.key = "disperse.shd-max-threads",
+     .voltype = "cluster/disperse",
+     .op_version = GD_OP_VERSION_3_9_0,
+     .flags = VOLOPT_FLAG_CLIENT_OPT,
+     .validate_fn = validate_disperse},
+    {.key = "disperse.shd-wait-qlength",
+     .voltype = "cluster/disperse",
+     .op_version = GD_OP_VERSION_3_9_0,
+     .flags = VOLOPT_FLAG_CLIENT_OPT},
+    {.key = "disperse.cpu-extensions",
+     .voltype = "cluster/disperse",
+     .op_version = GD_OP_VERSION_3_9_0,
+     .flags = VOLOPT_FLAG_CLIENT_OPT},
+    {.key = "disperse.self-heal-window-size",
+     .voltype = "cluster/disperse",
+     .op_version = GD_OP_VERSION_3_11_0,
+     .flags = VOLOPT_FLAG_CLIENT_OPT},
+    {.key = "cluster.use-compound-fops",
+     .voltype = "cluster/replicate",
+     .value = "off",
+     .type = DOC,
+     .op_version = GD_OP_VERSION_3_8_4,
+     .flags = VOLOPT_FLAG_CLIENT_OPT},
+    {.key = "performance.parallel-readdir",
+     .voltype = "performance/readdir-ahead",
+     .option = "parallel-readdir",
+     .value = "off",
+     .type = DOC,
+     .op_version = GD_OP_VERSION_3_10_0,
+     .validate_fn = validate_parallel_readdir,
+     .description = "If this option is enabled, the readdir operation "
+                    "is performed in parallel on all the bricks, thus "
+                    "improving the performance of readdir. Note that "
+                    "the performance improvement is higher in large "
+                    "clusters"},
+    {
+        .key = "performance.rda-request-size",
+        .voltype = "performance/readdir-ahead",
+        .option = "rda-request-size",
+        .value = "131072",
+        .flags = VOLOPT_FLAG_CLIENT_OPT,
+        .type = DOC,
+        .op_version = GD_OP_VERSION_3_9_1,
+    },
+    {
+        .key = "performance.rda-low-wmark",
+        .voltype = "performance/readdir-ahead",
+        .option = "rda-low-wmark",
+        .type = NO_DOC,
+        .flags = VOLOPT_FLAG_CLIENT_OPT,
+        .op_version = GD_OP_VERSION_3_9_1,
+    },
+    {
+        .key = "performance.rda-high-wmark",
+        .voltype = "performance/readdir-ahead",
+        .type = NO_DOC,
+        .flags = VOLOPT_FLAG_CLIENT_OPT,
+        .op_version = GD_OP_VERSION_3_9_1,
+    },
+    {.key = "performance.rda-cache-limit",
+     .voltype = "performance/readdir-ahead",
+     .value = "10MB",
+     .type = DOC,
+     .flags = VOLOPT_FLAG_CLIENT_OPT,
+     .op_version = GD_OP_VERSION_3_9_1,
+     .validate_fn = validate_rda_cache_limit},
+    {
+        .key = "performance.nl-cache-positive-entry",
+        .voltype = "performance/nl-cache",
+        .type = DOC,
+        .flags = VOLOPT_FLAG_CLIENT_OPT,
+        .op_version = GD_OP_VERSION_3_11_0,
+        .description = "enable/disable storing of entries that were lookedup"
+                       " and found to be present in the volume, thus lookup"
+                       " on non existent file is served from the cache",
+    },
+    {
+        .key = "performance.nl-cache-limit",
+        .voltype = "performance/nl-cache",
+        .value = "10MB",
+        .flags = VOLOPT_FLAG_CLIENT_OPT,
+        .op_version = GD_OP_VERSION_3_11_0,
+    },
+    {
+        .key = "performance.nl-cache-timeout",
+        .voltype = "performance/nl-cache",
+        .flags = VOLOPT_FLAG_CLIENT_OPT,
+        .op_version = GD_OP_VERSION_3_11_0,
+    },
+
+    /* Brick multiplexing options */
+    {.key = GLUSTERD_BRICK_MULTIPLEX_KEY,
+     .voltype = "mgmt/glusterd",
+     .value = "disable",
+     .op_version = GD_OP_VERSION_3_10_0,
+     .validate_fn = validate_boolean,
+     .type = GLOBAL_DOC,
+     .description = "This global option can be used to enable/disable "
+                    "brick multiplexing. Brick multiplexing ensures that "
+                    "compatible brick instances can share one single "
+                    "brick process."},
+    {.key = GLUSTERD_VOL_CNT_PER_THRD,
+     .voltype = "mgmt/glusterd",
+     .value = GLUSTERD_VOL_CNT_PER_THRD_DEFAULT_VALUE,
+     .op_version = GD_OP_VERSION_7_0,
+     .validate_fn = validate_volume_per_thread_limit,
+     .type = GLOBAL_NO_DOC,
+     .description =
+         "This option can be used to limit the number of volumes "
+         "handled per thread to populate peer data.The option accepts "
+         "values in the range of 5 to 200"},
+    {.key = GLUSTERD_BRICKMUX_LIMIT_KEY,
+     .voltype = "mgmt/glusterd",
+     .value = GLUSTERD_BRICKMUX_LIMIT_DFLT_VALUE,
+     .op_version = GD_OP_VERSION_3_12_0,
+     .validate_fn = validate_mux_limit,
+     .type = GLOBAL_DOC,
+     .description = "This option can be used to limit the number of brick "
+                    "instances per brick process when brick-multiplexing "
+                    "is enabled. If not explicitly set, this tunable is "
+                    "set to 0 which denotes that brick-multiplexing can "
+                    "happen without any limit on the number of bricks per "
+                    "process. Also this option can't be set when the "
+                    "brick-multiplexing feature is disabled."},
+    {.key = "disperse.optimistic-change-log",
+     .voltype = "cluster/disperse",
+     .type = NO_DOC,
+     .op_version = GD_OP_VERSION_3_10_1,
+     .flags = VOLOPT_FLAG_CLIENT_OPT},
+    {.key = "disperse.stripe-cache",
+     .voltype = "cluster/disperse",
+     .type = NO_DOC,
+     .op_version = GD_OP_VERSION_4_0_0,
+     .flags = VOLOPT_FLAG_CLIENT_OPT},
+
+    /* Halo replication options */
+    {.key = "cluster.halo-enabled",
+     .voltype = "cluster/replicate",
+     .op_version = GD_OP_VERSION_3_11_0,
+     .flags = VOLOPT_FLAG_CLIENT_OPT},
+    {.key = "cluster.halo-shd-max-latency",
+     .voltype = "cluster/replicate",
+     .op_version = GD_OP_VERSION_3_11_0,
+     .flags = VOLOPT_FLAG_CLIENT_OPT},
+    {.key = "cluster.halo-nfsd-max-latency",
+     .voltype = "cluster/replicate",
+     .op_version = GD_OP_VERSION_3_11_0,
+     .flags = VOLOPT_FLAG_CLIENT_OPT},
+    {.key = "cluster.halo-max-latency",
+     .voltype = "cluster/replicate",
+     .op_version = GD_OP_VERSION_3_11_0,
+     .flags = VOLOPT_FLAG_CLIENT_OPT},
+    {.key = "cluster.halo-max-replicas",
+     .voltype = "cluster/replicate",
+     .op_version = GD_OP_VERSION_3_11_0,
+     .flags = VOLOPT_FLAG_CLIENT_OPT},
+    {.key = "cluster.halo-min-replicas",
+     .voltype = "cluster/replicate",
+     .op_version = GD_OP_VERSION_3_11_0,
+     .flags = VOLOPT_FLAG_CLIENT_OPT},
+    {.key = VKEY_FEATURES_SELINUX,
+     .voltype = "features/selinux",
+     .type = NO_DOC,
+     .value = "on",
+     .op_version = GD_OP_VERSION_3_11_0,
+     .description = "Convert security.selinux xattrs to "
+                    "trusted.gluster.selinux on the bricks. Recommended "
+                    "to have enabled when clients and/or bricks support "
+                    "SELinux."},
+    {.key = GLUSTERD_LOCALTIME_LOGGING_KEY,
+     .voltype = "mgmt/glusterd",
+     .type = GLOBAL_DOC,
+     .op_version = GD_OP_VERSION_3_12_0,
+     .validate_fn = validate_boolean},
+    {.key = GLUSTERD_DAEMON_LOG_LEVEL_KEY,
+     .voltype = "mgmt/glusterd",
+     .type = GLOBAL_NO_DOC,
+     .value = "INFO",
+     .op_version = GD_OP_VERSION_5_0},
+    {.key = "debug.delay-gen",
+     .voltype = "debug/delay-gen",
+     .option = "!debug",
+     .value = "off",
+     .type = NO_DOC,
+     .op_version = GD_OP_VERSION_3_13_0,
+     .flags = VOLOPT_FLAG_XLATOR_OPT},
+    {
+        .key = "delay-gen.delay-percentage",
+        .voltype = "debug/delay-gen",
+        .type = NO_DOC,
+        .op_version = GD_OP_VERSION_3_13_0,
+    },
+    {
+        .key = "delay-gen.delay-duration",
+        .voltype = "debug/delay-gen",
+        .type = NO_DOC,
+        .op_version = GD_OP_VERSION_3_13_0,
+    },
+    {
+        .key = "delay-gen.enable",
+        .voltype = "debug/delay-gen",
+        .type = NO_DOC,
+        .op_version = GD_OP_VERSION_3_13_0,
+    },
+    {.key = "disperse.parallel-writes",
+     .voltype = "cluster/disperse",
+     .type = NO_DOC,
+     .op_version = GD_OP_VERSION_3_13_0,
+     .flags = VOLOPT_FLAG_CLIENT_OPT},
+    {.key = "disperse.quorum-count",
+     .voltype = "cluster/disperse",
+     .type = NO_DOC,
+     .op_version = GD_OP_VERSION_8_0,
+     .validate_fn = validate_disperse_quorum_count,
+     .description = "This option can be used to define how many successes on"
+                    "the bricks constitute a success to the application. This"
+                    " count should be in the range"
+                    "[disperse-data-count,  disperse-count] (inclusive)",
+     .flags = VOLOPT_FLAG_CLIENT_OPT},
+    {
+        .key = "features.sdfs",
+        .voltype = "features/sdfs",
+        .value = "off",
+        .option = "!features",
+        .op_version = GD_OP_VERSION_4_0_0,
+        .description = "enable/disable dentry serialization xlator in volume",
+        .type = NO_DOC,
+    },
+    {.key = "features.cloudsync",
+     .voltype = "features/cloudsync",
+     .value = "off",
+     .op_version = GD_OP_VERSION_4_1_0,
+     .flags = VOLOPT_FLAG_CLIENT_OPT},
+    {.key = "features.ctime",
+     .voltype = "features/utime",
+     .validate_fn = validate_boolean,
+     .value = "on",
+     .option = "!utime",
+     .op_version = GD_OP_VERSION_4_1_0,
+     .description = "enable/disable utime translator on the volume.",
+     .flags = VOLOPT_FLAG_CLIENT_OPT | VOLOPT_FLAG_XLATOR_OPT},
+    {.key = "ctime.noatime",
+     .voltype = "features/utime",
+     .validate_fn = validate_boolean,
+     .value = "on",
+     .option = "noatime",
+     .op_version = GD_OP_VERSION_5_0,
+     .description = "enable/disable noatime option with ctime enabled.",
+     .flags = VOLOPT_FLAG_CLIENT_OPT | VOLOPT_FLAG_XLATOR_OPT},
+    {.key = "features.cloudsync-storetype",
+     .voltype = "features/cloudsync",
+     .op_version = GD_OP_VERSION_5_0,
+     .flags = VOLOPT_FLAG_CLIENT_OPT},
+    {.key = "features.s3plugin-seckey",
+     .voltype = "features/cloudsync",
+     .op_version = GD_OP_VERSION_5_0,
+     .flags = VOLOPT_FLAG_CLIENT_OPT},
+    {.key = "features.s3plugin-keyid",
+     .voltype = "features/cloudsync",
+     .op_version = GD_OP_VERSION_5_0,
+     .flags = VOLOPT_FLAG_CLIENT_OPT},
+    {.key = "features.s3plugin-bucketid",
+     .voltype = "features/cloudsync",
+     .op_version = GD_OP_VERSION_5_0,
+     .flags = VOLOPT_FLAG_CLIENT_OPT},
+    {.key = "features.s3plugin-hostname",
+     .voltype = "features/cloudsync",
+     .op_version = GD_OP_VERSION_5_0,
+     .flags = VOLOPT_FLAG_CLIENT_OPT},
+    {.key = "features.enforce-mandatory-lock",
+     .voltype = "features/locks",
+     .value = "off",
+     .type = NO_DOC,
+     .op_version = GD_OP_VERSION_6_0,
+     .validate_fn = validate_boolean,
+     .description = "option to enforce mandatory lock on a file",
+     .flags = VOLOPT_FLAG_XLATOR_OPT},
+    {.key = VKEY_CONFIG_GLOBAL_THREADING,
+     .voltype = "debug/io-stats",
+     .option = "global-threading",
+     .value = "off",
+     .op_version = GD_OP_VERSION_6_0},
+    {.key = VKEY_CONFIG_CLIENT_THREADS,
+     .voltype = "debug/io-stats",
+     .option = "!client-threads",
+     .value = "16",
+     .op_version = GD_OP_VERSION_6_0},
+    {.key = VKEY_CONFIG_BRICK_THREADS,
+     .voltype = "debug/io-stats",
+     .option = "!brick-threads",
+     .value = "16",
+     .op_version = GD_OP_VERSION_6_0},
+    {.key = "features.cloudsync-remote-read",
+     .voltype = "features/cloudsync",
+     .value = "off",
+     .op_version = GD_OP_VERSION_7_0,
+     .flags = VOLOPT_FLAG_CLIENT_OPT},
+    {.key = "features.cloudsync-store-id",
+     .voltype = "features/cloudsync",
+     .op_version = GD_OP_VERSION_7_0,
+     .flags = VOLOPT_FLAG_CLIENT_OPT},
+    {.key = "features.cloudsync-product-id",
+     .voltype = "features/cloudsync",
+     .op_version = GD_OP_VERSION_7_0,
+     .flags = VOLOPT_FLAG_CLIENT_OPT},
+    {
+        .key = "features.acl",
+        .voltype = "features/access-control",
+        .value = "enable",
+        .option = "!features",
+        .op_version = GD_OP_VERSION_8_0,
+        .description = "(WARNING: for debug purpose only) enable/disable "
+                       "access-control xlator in volume",
+        .type = NO_DOC,
+    },
+
+    {.key = "cluster.use-anonymous-inode",
+     .voltype = "cluster/replicate",
+     .op_version = GD_OP_VERSION_9_0,
+     .value = "yes",
+     .flags = VOLOPT_FLAG_CLIENT_OPT},
+    {.key = NULL}};
diff --git a/xlators/mgmt/glusterd/src/glusterd.c b/xlators/mgmt/glusterd/src/glusterd.c
new file mode 100644
index 00000000000..7a86c2997b1
--- /dev/null
+++ b/xlators/mgmt/glusterd/src/glusterd.c
@@ -0,0 +1,2305 @@
+/*
+   Copyright (c) 2006-2013 Red Hat, Inc. <http://www.redhat.com>
+   This file is part of GlusterFS.
+
+   This file is licensed to you under your choice of the GNU Lesser
+   General Public License, version 3 or any later version (LGPLv3 or
+   later), or the GNU General Public License, version 2 (GPLv2), in all
+   cases as published by the Free Software Foundation.
+*/
+
+#include <time.h>
+#include <grp.h>
+#include <sys/uio.h>
+#include <sys/resource.h>
+
+#include <libgen.h>
+#include <glusterfs/compat-uuid.h>
+
+#include "glusterd.h"
+#include "rpcsvc.h"
+#include "fnmatch.h"
+#include <glusterfs/xlator.h>
+#include <glusterfs/call-stub.h>
+#include <glusterfs/defaults.h>
+#include <glusterfs/list.h>
+#include <glusterfs/dict.h>
+#include <glusterfs/options.h>
+#include <glusterfs/compat.h>
+#include <glusterfs/compat-errno.h>
+#include <glusterfs/syscall.h>
+#include "glusterd-statedump.h"
+#include "glusterd-sm.h"
+#include "glusterd-op-sm.h"
+#include "glusterd-store.h"
+#include "glusterd-hooks.h"
+#include "glusterd-utils.h"
+#include "glusterd-locks.h"
+#include "glusterd-svc-mgmt.h"
+#include "glusterd-shd-svc.h"
+#ifdef BUILD_GNFS
+#include "glusterd-nfs-svc.h"
+#endif
+#include "glusterd-bitd-svc.h"
+#include "glusterd-scrub-svc.h"
+#include "glusterd-quotad-svc.h"
+#include "glusterd-snapd-svc.h"
+#include "glusterd-messages.h"
+#include <glusterfs/common-utils.h>
+#include "glusterd-geo-rep.h"
+#include <glusterfs/run.h>
+#include "rpc-clnt-ping.h"
+#include "rpc-common-xdr.h"
+
+#include <glusterfs/syncop.h>
+
+#include "glusterd-mountbroker.h"
+
+extern struct rpcsvc_program gluster_handshake_prog;
+extern struct rpcsvc_program gluster_cli_getspec_prog;
+extern struct rpcsvc_program gluster_pmap_prog;
+extern glusterd_op_info_t opinfo;
+extern struct rpcsvc_program gd_svc_mgmt_prog;
+extern struct rpcsvc_program gd_svc_mgmt_v3_prog;
+extern struct rpcsvc_program gd_svc_peer_prog;
+extern struct rpcsvc_program gd_svc_cli_prog;
+extern struct rpcsvc_program gd_svc_cli_trusted_progs;
+extern struct rpc_clnt_program gd_brick_prog;
+extern struct rpcsvc_program glusterd_mgmt_hndsk_prog;
+
+extern char snap_mount_dir[VALID_GLUSTERD_PATHMAX];
+
+/* Callback program descriptor. The notify/statedump helpers later in this
+ * file pass it to rpcsvc_request_submit() / rpcsvc_callback_submit() when
+ * sending GF_CBK_* procedures over an accepted transport. */
+rpcsvc_cbk_program_t glusterd_cbk_prog = {
+    .progname = "Gluster Callback",
+    .prognum = GLUSTER_CBK_PROGRAM,
+    .progver = GLUSTER_CBK_VERSION,
+};
+
+/* Table of RPC service programs. Going by the name these are the programs
+ * meant for the inet listener; the code that registers them is not in this
+ * part of the file -- NOTE(review): confirm against the registration site.
+ * The position of gd_svc_cli_trusted_progs matters (see inline comment). */
+struct rpcsvc_program *gd_inet_programs[] = {
+    &gd_svc_peer_prog,       &gd_svc_cli_trusted_progs, /* Must be index 1 for
+                                                           secure_mgmt! */
+    &gd_svc_mgmt_prog,       &gd_svc_mgmt_v3_prog,      &gluster_pmap_prog,
+    &gluster_handshake_prog, &glusterd_mgmt_hndsk_prog,
+};
+/* Number of entries in gd_inet_programs, computed at compile time. */
+int gd_inet_programs_count = (sizeof(gd_inet_programs) /
+                              sizeof(gd_inet_programs[0]));
+
+/* Table of RPC service programs. Going by the name these are the programs
+ * meant for the unix-domain-socket listener -- NOTE(review): confirm against
+ * the registration site, which is outside this part of the file. */
+struct rpcsvc_program *gd_uds_programs[] = {
+    &gd_svc_cli_prog,
+    &gluster_cli_getspec_prog,
+};
+/* Number of entries in gd_uds_programs, computed at compile time. */
+int gd_uds_programs_count = (sizeof(gd_uds_programs) /
+                             sizeof(gd_uds_programs[0]));
+
+/* Display names for glusterd operations, indexed by GD_OP_* value.
+ *
+ * The array has GD_OP_MAX + 1 slots so that GD_OP_MAX itself is a valid
+ * index. Both GD_OP_NONE and GD_OP_MAX map to "Invalid op". Any op value
+ * without a designated initializer below is left NULL, so callers must be
+ * prepared for a NULL entry. Several distinct ops intentionally share the
+ * text "Rebalance". */
+const char *gd_op_list[GD_OP_MAX + 1] = {
+    [GD_OP_NONE] = "Invalid op",
+    [GD_OP_CREATE_VOLUME] = "Create",
+    [GD_OP_START_BRICK] = "Start Brick",
+    [GD_OP_STOP_BRICK] = "Stop Brick",
+    [GD_OP_DELETE_VOLUME] = "Delete",
+    [GD_OP_START_VOLUME] = "Start",
+    [GD_OP_STOP_VOLUME] = "Stop",
+    [GD_OP_DEFRAG_VOLUME] = "Rebalance",
+    [GD_OP_ADD_BRICK] = "Add brick",
+    [GD_OP_DETACH_TIER] = "Detach tier",
+    [GD_OP_TIER_MIGRATE] = "Tier migration",
+    [GD_OP_REMOVE_BRICK] = "Remove brick",
+    [GD_OP_REPLACE_BRICK] = "Replace brick",
+    [GD_OP_SET_VOLUME] = "Set",
+    [GD_OP_RESET_VOLUME] = "Reset",
+    [GD_OP_SYNC_VOLUME] = "Sync",
+    [GD_OP_LOG_ROTATE] = "Log rotate",
+    [GD_OP_GSYNC_SET] = "Geo-replication",
+    [GD_OP_PROFILE_VOLUME] = "Profile",
+    [GD_OP_QUOTA] = "Quota",
+    [GD_OP_STATUS_VOLUME] = "Status",
+    [GD_OP_REBALANCE] = "Rebalance",
+    [GD_OP_HEAL_VOLUME] = "Heal",
+    [GD_OP_STATEDUMP_VOLUME] = "Statedump",
+    [GD_OP_LIST_VOLUME] = "Lists",
+    [GD_OP_CLEARLOCKS_VOLUME] = "Clear locks",
+    [GD_OP_DEFRAG_BRICK_VOLUME] = "Rebalance",
+    [GD_OP_COPY_FILE] = "Copy File",
+    [GD_OP_SYS_EXEC] = "Execute system commands",
+    [GD_OP_GSYNC_CREATE] = "Geo-replication Create",
+    [GD_OP_SNAP] = "Snapshot",
+    [GD_OP_RESET_BRICK] = "Reset Brick",
+    [GD_OP_MAX_OPVERSION] = "Maximum supported op-version",
+    [GD_OP_MAX] = "Invalid op"};
+
+/* Reset the global opinfo to the "no operation" state (GD_OP_NONE).
+ *
+ * Returns 0. The assignment cannot fail, so reporting -1 (as the previous
+ * version did by never updating its result variable) wrongly signalled an
+ * error to any caller that inspects the return value. */
+static int
+glusterd_opinfo_init()
+{
+    opinfo.op = GD_OP_NONE;
+
+    return 0;
+}
+
+/* Establish this node's UUID: try glusterd_retrieve_uuid() first and, if
+ * that does not succeed, fall back to glusterd_uuid_generate_save().
+ *
+ * Returns 0 on success, otherwise the non-zero result of
+ * glusterd_uuid_generate_save(). */
+int
+glusterd_uuid_init()
+{
+    xlator_t *this = THIS;
+    glusterd_conf_t *priv = NULL;
+    int ret = -1;
+
+    GF_ASSERT(this);
+    priv = this->private;
+
+    if (glusterd_retrieve_uuid() == 0) {
+        gf_msg(this->name, GF_LOG_INFO, 0, GD_MSG_RETRIEVED_UUID,
+               "retrieved UUID: %s", uuid_utoa(priv->uuid));
+        return 0;
+    }
+
+    /* Retrieval failed: create a new UUID and store it. */
+    ret = glusterd_uuid_generate_save();
+    if (ret != 0) {
+        gf_msg("glusterd", GF_LOG_ERROR, 0, GD_MSG_UUID_GEN_STORE_FAIL,
+               "Unable to generate and save new UUID");
+    }
+
+    return ret;
+}
+
+/* Generate a fresh UUID into the glusterd private config (priv->uuid) and
+ * then call glusterd_store_global_info() to save it.
+ *
+ * Returns whatever glusterd_store_global_info() returns; a non-zero result
+ * is logged as a store failure. */
+int
+glusterd_uuid_generate_save()
+{
+    xlator_t *this = THIS;
+    glusterd_conf_t *priv = NULL;
+    int ret = -1;
+
+    GF_ASSERT(this);
+    priv = this->private;
+    GF_ASSERT(priv);
+
+    gf_uuid_generate(priv->uuid);
+    gf_msg(this->name, GF_LOG_INFO, 0, GD_MSG_GENERATED_UUID,
+           "generated UUID: %s", uuid_utoa(priv->uuid));
+
+    ret = glusterd_store_global_info(this);
+    if (ret != 0) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_UUID_STORE_FAIL,
+               "Unable to store the generated uuid %s", uuid_utoa(priv->uuid));
+    }
+
+    return ret;
+}
+
+/* Create priv->opts and populate it.
+ *
+ * First tries glusterd_store_retrieve_options(); if that succeeds nothing
+ * more is done. Otherwise the dict is seeded with
+ * GLUSTERD_GLOBAL_OPT_VERSION = "0" and written out through
+ * glusterd_store_options().
+ *
+ * this: the glusterd xlator; this->private must already be set.
+ *
+ * Returns 0 on success and a non-zero value when the dict cannot be
+ * created, the initial version cannot be set, or the options cannot be
+ * stored. (Previously the first two failures fell through to a hard-coded
+ * "return 0" and were reported as success.) */
+int
+glusterd_options_init(xlator_t *this)
+{
+    int ret = -1;
+    glusterd_conf_t *priv = NULL;
+    char *initial_version = "0";
+
+    priv = this->private;
+
+    priv->opts = dict_new();
+    if (!priv->opts) {
+        gf_smsg(this->name, GF_LOG_ERROR, errno, GD_MSG_DICT_CREATE_FAIL, NULL);
+        /* ret is still -1 here */
+        goto out;
+    }
+
+    ret = glusterd_store_retrieve_options(this);
+    if (ret == 0) {
+        /* options already present in the store */
+        goto out;
+    }
+
+    ret = dict_set_str(priv->opts, GLUSTERD_GLOBAL_OPT_VERSION,
+                       initial_version);
+    if (ret)
+        goto out;
+
+    ret = glusterd_store_options(this, priv->opts);
+    if (ret) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_VERS_STORE_FAIL,
+               "Unable to store version");
+    }
+out:
+    /* Propagate the real status instead of masking failures with 0. */
+    return ret;
+}
+
+/* Ask matching connected clients to take a statedump.
+ *
+ * Walks conf->xprt_list with conf->xprt_lock held. For every transport
+ * whose peer volname equals @volname (any transport when @volname is "all")
+ * and whose peer address, taken as the part of peerinfo.identifier before
+ * the first ':', is the same address as @target_ip, a GF_CBK_STATEDUMP
+ * request carrying the parsed pid is submitted.
+ *
+ * volname:   volume name, or "all"
+ * target_ip: address of the client host
+ * pid:       client process id as a decimal string
+ *
+ * Returns -1 if any argument is NULL, 0 otherwise. The result of the
+ * individual rpcsvc_request_submit() calls is not inspected. */
+int
+glusterd_client_statedump_submit_req(char *volname, char *target_ip, char *pid)
+{
+    gf_statedump statedump_req = {
+        0,
+    };
+    glusterd_conf_t *conf = NULL;
+    int ret = 0;
+    rpc_transport_t *trans = NULL;
+    char *ip_addr = NULL;
+    char *saveptr = NULL;
+    xlator_t *this = NULL;
+    char tmp[UNIX_PATH_MAX] = {
+        0,
+    };
+
+    this = THIS;
+    GF_ASSERT(this);
+    conf = this->private;
+    GF_ASSERT(conf);
+
+    /* volname is handed to strncmp() below, so it is just as mandatory as
+     * the other two arguments. Report EINVAL explicitly; errno holds no
+     * meaningful value at this point. */
+    if (volname == NULL || target_ip == NULL || pid == NULL) {
+        gf_smsg(this->name, GF_LOG_ERROR, EINVAL, GD_MSG_INVALID_ARGUMENT,
+                NULL);
+        ret = -1;
+        goto out;
+    }
+
+    statedump_req.pid = strtol(pid, NULL, 10);
+
+    gf_msg_debug(this->name, 0,
+                 "Performing statedump on volume %s "
+                 "client with pid:%d host:%s",
+                 volname, statedump_req.pid, target_ip);
+
+    pthread_mutex_lock(&conf->xprt_lock);
+    {
+        list_for_each_entry(trans, &conf->xprt_list, list)
+        {
+            /* check if this connection matches "all" or the
+             * volname */
+            if (strncmp(volname, "all", NAME_MAX) &&
+                strncmp(trans->peerinfo.volname, volname, NAME_MAX)) {
+                /* no match, try next trans */
+                continue;
+            }
+
+            /* Tokenising modifies its input, so work on a bounded scratch
+             * copy; strtok_r() keeps its state in saveptr rather than in
+             * hidden process-wide storage. */
+            snprintf(tmp, sizeof(tmp), "%s", trans->peerinfo.identifier);
+            ip_addr = strtok_r(tmp, ":", &saveptr);
+            if (ip_addr == NULL) {
+                /* empty identifier: no address to compare */
+                continue;
+            }
+
+            if (gf_is_same_address(ip_addr, target_ip)) {
+                /* Every gluster client would have
+                 * connected to glusterd(volfile server). This
+                 * connection is used to send the statedump
+                 * request rpc to the application.
+                 */
+                gf_msg_trace(this->name, 0,
+                             "Submitting "
+                             "statedump rpc request for %s",
+                             trans->peerinfo.identifier);
+                rpcsvc_request_submit(conf->rpc, trans, &glusterd_cbk_prog,
+                                      GF_CBK_STATEDUMP, &statedump_req,
+                                      this->ctx, (xdrproc_t)xdr_gf_statedump);
+            }
+        }
+    }
+    pthread_mutex_unlock(&conf->xprt_lock);
+out:
+    return ret;
+}
+
+/* Submit a GF_CBK_FETCHSPEC callback on every transport currently linked
+ * into the xprt_list of the glusterd private config.
+ *
+ * The list is traversed with xprt_lock held. Always returns 0; the outcome
+ * of each rpcsvc_callback_submit() call is not examined. */
+int
+glusterd_fetchspec_notify(xlator_t *this)
+{
+    glusterd_conf_t *conf = this->private;
+    rpc_transport_t *xprt = NULL;
+
+    pthread_mutex_lock(&conf->xprt_lock);
+    list_for_each_entry(xprt, &conf->xprt_list, list)
+    {
+        rpcsvc_callback_submit(conf->rpc, xprt, &glusterd_cbk_prog,
+                               GF_CBK_FETCHSPEC, NULL, 0, NULL);
+    }
+    pthread_mutex_unlock(&conf->xprt_lock);
+
+    return 0;
+}
+
+/* Submit a GF_CBK_GET_SNAPS callback on every transport currently linked
+ * into the xprt_list of the glusterd private config.
+ *
+ * The list is traversed with xprt_lock held. Always returns 0; the outcome
+ * of each rpcsvc_callback_submit() call is not examined. */
+int
+glusterd_fetchsnap_notify(xlator_t *this)
+{
+    glusterd_conf_t *conf = this->private;
+    rpc_transport_t *xprt = NULL;
+
+    /*
+     * TODO: The handshake protocol does not yet identify which rpc client
+     * sits behind a transport, so glusterd cannot single out one process
+     * (here: the snap daemon) among all registered glusterfs processes.
+     * The notification is therefore broadcast to every transport; only
+     * processes that registered an rpc client for this callback react to
+     * it. Once clients can be identified, restrict the loop below to the
+     * transports that actually registered.
+     */
+    pthread_mutex_lock(&conf->xprt_lock);
+    list_for_each_entry(xprt, &conf->xprt_list, list)
+    {
+        rpcsvc_callback_submit(conf->rpc, xprt, &glusterd_cbk_prog,
+                               GF_CBK_GET_SNAPS, NULL, 0, NULL);
+    }
+    pthread_mutex_unlock(&conf->xprt_lock);
+
+    return 0;
+}
+
+/* Set up memory accounting for this xlator with gf_gld_mt_end + 1 types.
+ *
+ * Returns -1 when @this is NULL, otherwise the result of
+ * xlator_mem_acct_init(); a non-zero result is logged. */
+int32_t
+mem_acct_init(xlator_t *this)
+{
+    int status = -1;
+
+    if (this != NULL) {
+        status = xlator_mem_acct_init(this, gf_gld_mt_end + 1);
+        if (status != 0) {
+            gf_msg(this->name, GF_LOG_ERROR, ENOMEM, GD_MSG_NO_MEMORY,
+                   "Memory accounting init"
+                   " failed");
+        }
+    }
+
+    return status;
+}
+
+/* rpcsvc event callback that keeps priv->xprt_list in step with the set of
+ * accepted transports.
+ *
+ * rpc:   unused here
+ * xl:    the glusterd xlator (as void *)
+ * event: RPCSVC_EVENT_ACCEPT links the transport into the list;
+ *        RPCSVC_EVENT_DISCONNECT unlinks it and calls
+ *        pmap_registry_remove() for it; other events are ignored
+ * data:  the rpc_transport_t the event refers to
+ *
+ * Always returns 0. */
+int
+glusterd_rpcsvc_notify(rpcsvc_t *rpc, void *xl, rpcsvc_event_t event,
+                       void *data)
+{
+    xlator_t *this = xl;
+    rpc_transport_t *xprt = data;
+    glusterd_conf_t *priv = NULL;
+
+    if (this == NULL || xprt == NULL) {
+        gf_msg("glusterd", GF_LOG_WARNING, 0, GD_MSG_NO_INIT,
+               "Calling rpc_notify without initializing");
+        return 0;
+    }
+
+    priv = this->private;
+
+    if (event == RPCSVC_EVENT_ACCEPT) {
+        pthread_mutex_lock(&priv->xprt_lock);
+        list_add_tail(&xprt->list, &priv->xprt_list);
+        pthread_mutex_unlock(&priv->xprt_lock);
+    } else if (event == RPCSVC_EVENT_DISCONNECT) {
+        /* A DISCONNECT can arrive for a transport that never produced an
+         * ACCEPT, e.g. when the server expects encrypted connections but
+         * the client tries to connect unencrypted. Such a transport was
+         * never linked into the list, so there is nothing to undo.
+         */
+        if (!list_empty(&xprt->list)) {
+            pthread_mutex_lock(&priv->xprt_lock);
+            list_del(&xprt->list);
+            pthread_mutex_unlock(&priv->xprt_lock);
+            pmap_registry_remove(this, 0, NULL, GF_PMAP_PORT_ANY, xprt,
+                                 _gf_false);
+        }
+    }
+
+    return 0;
+}
+
+/* Register @prog with @svc through rpcsvc_program_register() (third
+ * argument _gf_false) and emit a debug message naming the program when that
+ * fails.
+ *
+ * Returns the result of rpcsvc_program_register(). */
+static int32_t
+glusterd_program_register(xlator_t *this, rpcsvc_t *svc, rpcsvc_program_t *prog)
+{
+    int32_t status = rpcsvc_program_register(svc, prog, _gf_false);
+
+    if (status != 0) {
+        gf_msg_debug(this->name, 0,
+                     "cannot register program (name: %s, prognum:%d, "
+                     "progver:%d)",
+                     prog->progname, prog->prognum, prog->progver);
+    }
+
+    return status;
+}
+
+/* Make sure @options carries a "transport.listen-backlog" entry.
+ *
+ * When the key cannot be read it is set to GLUSTERFS_SOCKET_LISTEN_BACKLOG.
+ * The effective value is reported at debug level.
+ *
+ * Returns 0 on success, or the non-zero result of dict_set_uint32() if the
+ * default could not be stored. */
+int
+glusterd_rpcsvc_options_build(dict_t *options)
+{
+    xlator_t *this = THIS;
+    uint32_t backlog = 0;
+    int ret = 0;
+
+    GF_ASSERT(this);
+
+    if (dict_get_uint32(options, "transport.listen-backlog", &backlog) != 0) {
+        /* Key absent or unreadable: fall back to the compiled-in default. */
+        backlog = GLUSTERFS_SOCKET_LISTEN_BACKLOG;
+        ret = dict_set_uint32(options, "transport.listen-backlog", backlog);
+        if (ret != 0) {
+            gf_smsg(this->name, GF_LOG_ERROR, errno, GD_MSG_DICT_SET_FAILED,
+                    "Key=transport.listen-backlog", NULL);
+            return ret;
+        }
+    }
+
+    gf_msg_debug("glusterd", 0, "listen-backlog value: %d", backlog);
+
+    return ret;
+}
+
+#if SYNCDAEMON_COMPILE
+/* Probe for a working gsyncd by running GSYNCD_PREFIX "/gsyncd --version"
+ * and checking that the first line of its stdout contains "gsyncd".
+ *
+ * valid_state (out):
+ *    0  - the command could not be started because it does not exist
+ *         (errno == ENOENT)
+ *   -1  - the command could not be started for another reason, produced no
+ *         output, or produced output without "gsyncd"
+ *   left untouched on success
+ *
+ * Returns 0 on success, -1 on any of the failures above. */
+static int
+glusterd_check_gsync_present(int *valid_state)
+{
+    char line[PATH_MAX] = {
+        0,
+    };
+    runner_t runner = {
+        0,
+    };
+    int ret = 0;
+
+    runinit(&runner);
+    runner_add_args(&runner, GSYNCD_PREFIX "/gsyncd", "--version", NULL);
+    runner_redir(&runner, STDOUT_FILENO, RUN_PIPE);
+
+    ret = runner_start(&runner);
+    if (ret == -1) {
+        if (errno == ENOENT) {
+            gf_msg("glusterd", GF_LOG_INFO, errno, GD_MSG_MODULE_NOT_INSTALLED,
+                   GEOREP " module not installed in the system");
+            *valid_state = 0;
+        } else {
+            gf_msg("glusterd", GF_LOG_ERROR, errno, GD_MSG_MODULE_NOT_WORKING,
+                   GEOREP " module not working as desired");
+            *valid_state = -1;
+        }
+    } else if (fgets(line, sizeof(line),
+                     runner_chio(&runner, STDOUT_FILENO)) == NULL ||
+               strstr(line, "gsyncd") == NULL) {
+        /* No output at all, or output that does not look like gsyncd's
+         * version banner: both are treated as a broken installation. */
+        ret = -1;
+        gf_msg("glusterd", GF_LOG_ERROR, 0, GD_MSG_MODULE_NOT_WORKING,
+               GEOREP
+               " module not "
+               "working as desired");
+        *valid_state = -1;
+    } else {
+        ret = 0;
+    }
+
+    runner_end(&runner);
+
+    gf_msg_debug("glusterd", 0, "Returning %d", ret);
+    return ret;
+}
+
+/* Give group @gid write access to directory @path.
+ *
+ * Changes the group owner of @path to @gid (owner uid is passed as -1) and
+ * then adds group-write, group-execute and the sticky bit to the permission
+ * bits the directory already has.
+ *
+ * Returns the result of the last syscall attempted; -1 means stat, chown or
+ * chmod failed and a critical message has been logged. */
+static int
+group_write_allow(char *path, gid_t gid)
+{
+    struct stat st = {
+        0,
+    };
+    int ret = sys_stat(path, &st);
+
+    if (ret != -1) {
+        GF_ASSERT(S_ISDIR(st.st_mode));
+        ret = sys_chown(path, -1, gid);
+    }
+
+    if (ret != -1) {
+        /* Strip the file-type bits, keep the existing mode bits. */
+        ret = sys_chmod(path,
+                        (st.st_mode & ~S_IFMT) | S_IWGRP | S_IXGRP | S_ISVTX);
+    }
+
+    if (ret == -1)
+        gf_msg("glusterd", GF_LOG_CRITICAL, errno,
+               GD_MSG_WRITE_ACCESS_GRANT_FAIL,
+               "failed to set up write access to %s for group %d (%s)", path,
+               gid, strerror(errno));
+
+    return ret;
+}
+
+/* Create the geo-replication working directory and log directories.
+ *
+ * Directories created (mode 0755, via mkdir_p):
+ *   <conf->workdir>/GEOREP               (path is also written to georepdir)
+ *   <conf->logdir>/GEOREP
+ *   <conf->logdir>/GEOREP-slaves
+ *   <conf->logdir>/GEOREP-slaves/mbr
+ *
+ * If the xlator option GEOREP "-log-group" names an existing group, the
+ * three log directories are additionally made group-writable through
+ * group_write_allow().
+ *
+ * georepdir: output buffer; up to PATH_MAX bytes are written, so the caller
+ *            must provide at least that much space.
+ * conf:      glusterd private config supplying workdir and logdir.
+ *
+ * Returns 0 on success and -1 (or the failing helper's result) on the first
+ * path-length or mkdir failure. A log-group problem (unknown group, or
+ * group_write_allow() failing) does not stop directory creation but is
+ * returned at the end through gr_ret. */
+static int
+glusterd_crt_georep_folders(char *georepdir, glusterd_conf_t *conf)
+{
+    char *greplg_s = NULL;
+    struct group *gr = NULL;
+    int ret = 0;
+    int gr_ret = 0;
+    int32_t len = 0;
+    char logdir[PATH_MAX] = {0};
+
+    GF_ASSERT(georepdir);
+    GF_ASSERT(conf);
+
+    /* --- working directory: <workdir>/GEOREP --- */
+    if (strlen(conf->workdir) + 2 > PATH_MAX - SLEN(GEOREP)) {
+        ret = -1;
+        gf_msg("glusterd", GF_LOG_CRITICAL, 0, GD_MSG_DIRPATH_TOO_LONG,
+               "directory path %s/" GEOREP " is longer than PATH_MAX",
+               conf->workdir);
+        goto out;
+    }
+
+    len = snprintf(georepdir, PATH_MAX, "%s/" GEOREP, conf->workdir);
+    if ((len < 0) || (len >= PATH_MAX)) {
+        gf_smsg("glusterd", GF_LOG_ERROR, errno, GD_MSG_COPY_FAIL, NULL);
+        ret = -1;
+        goto out;
+    }
+    ret = mkdir_p(georepdir, 0755, _gf_true);
+    if (-1 == ret) {
+        gf_msg("glusterd", GF_LOG_CRITICAL, errno, GD_MSG_CREATE_DIR_FAILED,
+               "Unable to create " GEOREP " directory %s", georepdir);
+        goto out;
+    }
+
+    /* --- optional log group lookup --- */
+    ret = dict_get_str(THIS->options, GEOREP "-log-group", &greplg_s);
+    if (ret) {
+        /* Option not set: logged, but deliberately not treated as fatal. */
+        gf_smsg("glusterd", GF_LOG_ERROR, errno, GD_MSG_DICT_GET_FAILED,
+                "Key=log-group", NULL);
+        ret = 0;
+    } else {
+        gr = getgrnam(greplg_s);
+        if (!gr) {
+            /* Remember the failure; gr stays NULL so no group_write_allow()
+             * call is made below, and gr_ret is returned at the end. */
+            gf_msg("glusterd", GF_LOG_CRITICAL, 0, GD_MSG_LOGGROUP_INVALID,
+                   "group " GEOREP "-log-group %s does not exist", greplg_s);
+            gr_ret = -1;
+        }
+    }
+    /* --- log directory: <logdir>/GEOREP --- */
+    if ((strlen(conf->logdir) + 2 + SLEN(GEOREP)) >= PATH_MAX) {
+        ret = -1;
+        gf_msg("glusterd", GF_LOG_CRITICAL, 0, GD_MSG_DIRPATH_TOO_LONG,
+               "directory path %s/" GEOREP " is longer than PATH_MAX",
+               conf->logdir);
+        goto out;
+    }
+    len = snprintf(logdir, PATH_MAX, "%s/" GEOREP, conf->logdir);
+    if ((len < 0) || (len >= PATH_MAX)) {
+        /* NOTE(review): unlike the sibling checks, this failure is not
+         * logged. */
+        ret = -1;
+        goto out;
+    }
+    ret = mkdir_p(logdir, 0755, _gf_true);
+    if (-1 == ret) {
+        gf_msg("glusterd", GF_LOG_CRITICAL, errno, GD_MSG_CREATE_DIR_FAILED,
+               "Unable to create " GEOREP " log directory");
+        goto out;
+    }
+    if (gr) {
+        gr_ret = group_write_allow(logdir, gr->gr_gid);
+    }
+
+    /* --- slave log directory: <logdir>/GEOREP-slaves --- */
+    if ((strlen(conf->logdir) + 2 + SLEN(GEOREP "-slaves")) >= PATH_MAX) {
+        ret = -1;
+        gf_msg("glusterd", GF_LOG_CRITICAL, 0, GD_MSG_DIRPATH_TOO_LONG,
+               "directory path  %s/" GEOREP
+               "-slaves"
+               " is longer than PATH_MAX",
+               conf->logdir);
+        goto out;
+    }
+    len = snprintf(logdir, PATH_MAX, "%s/" GEOREP "-slaves", conf->logdir);
+    if ((len < 0) || (len >= PATH_MAX)) {
+        gf_smsg("glusterd", GF_LOG_ERROR, errno, GD_MSG_COPY_FAIL, NULL);
+        ret = -1;
+        goto out;
+    }
+    ret = mkdir_p(logdir, 0755, _gf_true);
+    if (-1 == ret) {
+        gf_msg("glusterd", GF_LOG_CRITICAL, errno, GD_MSG_CREATE_DIR_FAILED,
+               "Unable to create " GEOREP " slave log directory");
+        goto out;
+    }
+    /* Only keep granting access while the previous grant succeeded. */
+    if (gr && !gr_ret) {
+        gr_ret = group_write_allow(logdir, gr->gr_gid);
+    }
+
+    /* MountBroker log file directory */
+    if ((strlen(conf->logdir) + 2 + SLEN(GEOREP "-slaves/mbr")) >= PATH_MAX) {
+        ret = -1;
+        gf_msg("glusterd", GF_LOG_CRITICAL, 0, GD_MSG_DIRPATH_TOO_LONG,
+               "directory path  %s/" GEOREP
+               "-slaves/mbr"
+               " is longer than PATH_MAX",
+               conf->logdir);
+        goto out;
+    }
+
+    len = snprintf(logdir, PATH_MAX, "%s/" GEOREP "-slaves/mbr", conf->logdir);
+    if ((len < 0) || (len >= PATH_MAX)) {
+        gf_smsg("glusterd", GF_LOG_ERROR, errno, GD_MSG_COPY_FAIL, NULL);
+        ret = -1;
+        goto out;
+    }
+
+    ret = mkdir_p(logdir, 0755, _gf_true);
+    if (-1 == ret) {
+        gf_msg("glusterd", GF_LOG_CRITICAL, errno, GD_MSG_CREATE_DIR_FAILED,
+               "Unable to create " GEOREP " mountbroker slave log directory");
+        goto out;
+    }
+    if (gr && !gr_ret) {
+        gr_ret = group_write_allow(logdir, gr->gr_gid);
+    }
+    /* All directories exist; surface any log-group failure to the caller. */
+    if (gr_ret)
+        ret = gr_ret;
+out:
+    gf_msg_debug("glusterd", 0, "Returning %d", ret);
+    return ret;
+}
+
+/* Initialise @runner with the common prefix of a gsyncd "--config-set-rx"
+ * invocation:
+ *
+ *   GSYNCD_PREFIX/gsyncd -c <conf->workdir>/GSYNC_CONF_TEMPLATE --config-set-rx
+ *
+ * The caller appends the option-specific arguments and runs the command. */
+static void
+runinit_gsyncd_setrx(runner_t *runner, glusterd_conf_t *conf)
+{
+    runinit(runner);
+    runner_add_args(runner, GSYNCD_PREFIX "/gsyncd", "-c", NULL);
+    /* path of the config template inside glusterd's working directory */
+    runner_argprintf(runner, "%s/" GSYNC_CONF_TEMPLATE, conf->workdir);
+    runner_add_arg(runner, "--config-set-rx");
+}
+
+static int
+configure_syncdaemon(glusterd_conf_t *conf)
+#define RUN_GSYNCD_CMD                                                         \
+    do {                                                                       \
+        ret = runner_run_reuse(&runner);                                       \
+        if (ret == -1) {                                                       \
+            runner_log(&runner, "glusterd", GF_LOG_ERROR, "command failed");   \
+            runner_end(&runner);                                               \
+            goto out;                                                          \
+        }                                                                      \
+        runner_end(&runner);                                                   \
+    } while (0)
+{
+    int ret = 0;
+    runner_t runner = {
+        0,
+    };
+    char georepdir[PATH_MAX] = {
+        0,
+    };
+    int valid_state = 0;
+
+    ret = setenv("_GLUSTERD_CALLED_", "1", 1);
+    if (ret < 0) {
+        ret = 0;
+        goto out;
+    }
+    valid_state = -1;
+    ret = glusterd_check_gsync_present(&valid_state);
+    if (-1 == ret) {
+        ret = valid_state;
+        goto out;
+    }
+
+    glusterd_crt_georep_folders(georepdir, conf);
+    if (ret) {
+        ret = 0;
+        goto out;
+    }
+
+    /************
+     * master pre-configuration
+     ************/
+
+    /* remote-gsyncd */
+    runinit_gsyncd_setrx(&runner, conf);
+    runner_add_args(&runner, "remote-gsyncd", GSYNCD_PREFIX "/gsyncd", ".", ".",
+                    NULL);
+    RUN_GSYNCD_CMD;
+
+    runinit_gsyncd_setrx(&runner, conf);
+    runner_add_args(&runner, "remote-gsyncd", "/nonexistent/gsyncd", ".",
+                    "^ssh:", NULL);
+    RUN_GSYNCD_CMD;
+
+    /* gluster-command-dir */
+    runinit_gsyncd_setrx(&runner, conf);
+    runner_add_args(&runner, "gluster-command-dir", SBIN_DIR "/", ".", ".",
+                    NULL);
+    RUN_GSYNCD_CMD;
+
+    /* gluster-params */
+    runinit_gsyncd_setrx(&runner, conf);
+    runner_add_args(&runner, "gluster-params", "aux-gfid-mount acl", ".", ".",
+                    NULL);
+    RUN_GSYNCD_CMD;
+
+    /* ssh-command */
+    runinit_gsyncd_setrx(&runner, conf);
+    runner_add_arg(&runner, "ssh-command");
+    runner_argprintf(&runner,
+                     "ssh -oPasswordAuthentication=no "
+                     "-oStrictHostKeyChecking=no "
+                     "-i %s/secret.pem",
+                     georepdir);
+    runner_add_args(&runner, ".", ".", NULL);
+    RUN_GSYNCD_CMD;
+
+    /* ssh-command tar */
+    runinit_gsyncd_setrx(&runner, conf);
+    runner_add_arg(&runner, "ssh-command-tar");
+    runner_argprintf(&runner,
+                     "ssh -oPasswordAuthentication=no "
+                     "-oStrictHostKeyChecking=no "
+                     "-i %s/tar_ssh.pem",
+                     georepdir);
+    runner_add_args(&runner, ".", ".", NULL);
+    RUN_GSYNCD_CMD;
+
+    /* pid-file */
+    runinit_gsyncd_setrx(&runner, conf);
+    runner_add_arg(&runner, "pid-file");
+    runner_argprintf(&runner,
+                     "%s/${mastervol}_${remotehost}_${slavevol}/monitor.pid",
+                     georepdir);
+    runner_add_args(&runner, ".", ".", NULL);
+    RUN_GSYNCD_CMD;
+
+    /* geo-rep working dir */
+    runinit_gsyncd_setrx(&runner, conf);
+    runner_add_arg(&runner, "georep-session-working-dir");
+    runner_argprintf(&runner, "%s/${mastervol}_${remotehost}_${slavevol}/",
+                     georepdir);
+    runner_add_args(&runner, ".", ".", NULL);
+    RUN_GSYNCD_CMD;
+
+    /* state-file */
+    runinit_gsyncd_setrx(&runner, conf);
+    runner_add_arg(&runner, "state-file");
+    runner_argprintf(&runner,
+                     "%s/${mastervol}_${remotehost}_${slavevol}/monitor.status",
+                     georepdir);
+    runner_add_args(&runner, ".", ".", NULL);
+    RUN_GSYNCD_CMD;
+
+    /* state-detail-file */
+    runinit_gsyncd_setrx(&runner, conf);
+    runner_add_arg(&runner, "state-detail-file");
+    runner_argprintf(
+        &runner,
+        "%s/${mastervol}_${remotehost}_${slavevol}/${eSlave}-detail.status",
+        georepdir);
+    runner_add_args(&runner, ".", ".", NULL);
+    RUN_GSYNCD_CMD;
+
+    /* state-detail-file */
+    runinit_gsyncd_setrx(&runner, conf);
+    runner_add_arg(&runner, "state-detail-file");
+    runner_argprintf(
+        &runner,
+        "%s/${mastervol}_${remotehost}_${slavevol}/${eSlave}-detail.status",
+        georepdir);
+    runner_add_args(&runner, ".", ".", NULL);
+    RUN_GSYNCD_CMD;
+
+    /* state-socket */
+    runinit_gsyncd_setrx(&runner, conf);
+    runner_add_arg(&runner, "state-socket-unencoded");
+    runner_argprintf(&runner, "%s/${mastervol}/${eSlave}.socket", georepdir);
+    runner_add_args(&runner, ".", ".", NULL);
+    RUN_GSYNCD_CMD;
+
+    /* socketdir */
+    runinit_gsyncd_setrx(&runner, conf);
+    runner_add_args(&runner, "socketdir", GLUSTERD_SOCK_DIR, ".", ".", NULL);
+    RUN_GSYNCD_CMD;
+
+    /* log-file */
+    runinit_gsyncd_setrx(&runner, conf);
+    runner_add_arg(&runner, "log-file");
+    runner_argprintf(&runner, "%s/" GEOREP "/${mastervol}/${eSlave}.log",
+                     conf->logdir);
+    runner_add_args(&runner, ".", ".", NULL);
+    RUN_GSYNCD_CMD;
+
+    /* gluster-log-file */
+    runinit_gsyncd_setrx(&runner, conf);
+    runner_add_arg(&runner, "gluster-log-file");
+    runner_argprintf(
+        &runner, "%s/" GEOREP "/${mastervol}/${eSlave}${local_id}.gluster.log",
+        conf->logdir);
+    runner_add_args(&runner, ".", ".", NULL);
+    RUN_GSYNCD_CMD;
+
+    /* ignore-deletes */
+    runinit_gsyncd_setrx(&runner, conf);
+    runner_add_args(&runner, "ignore-deletes", "true", ".", ".", NULL);
+    RUN_GSYNCD_CMD;
+
+    /* special-sync-mode */
+    runinit_gsyncd_setrx(&runner, conf);
+    runner_add_args(&runner, "special-sync-mode", "partial", ".", ".", NULL);
+    RUN_GSYNCD_CMD;
+
+    /* change-detector == changelog */
+    runinit_gsyncd_setrx(&runner, conf);
+    runner_add_args(&runner, "change-detector", "changelog", ".", ".", NULL);
+    RUN_GSYNCD_CMD;
+
+    runinit_gsyncd_setrx(&runner, conf);
+    runner_add_arg(&runner, "working-dir");
+    runner_argprintf(&runner, "%s/${mastervol}/${eSlave}",
+                     DEFAULT_VAR_RUN_DIRECTORY);
+    runner_add_args(&runner, ".", ".", NULL);
+    RUN_GSYNCD_CMD;
+
+    /************
+     * slave pre-configuration
+     ************/
+
+    /* slave-gluster-command-dir */
+    runinit_gsyncd_setrx(&runner, conf);
+    runner_add_args(&runner, "slave-gluster-command-dir", SBIN_DIR "/", ".",
+                    NULL);
+    RUN_GSYNCD_CMD;
+
+    /* gluster-params */
+    runinit_gsyncd_setrx(&runner, conf);
+    runner_add_args(&runner, "gluster-params", "aux-gfid-mount acl", ".", NULL);
+    RUN_GSYNCD_CMD;
+
+    /* log-file */
+    runinit_gsyncd_setrx(&runner, conf);
+    runner_add_arg(&runner, "log-file");
+    runner_argprintf(
+        &runner,
+        "%s/" GEOREP
+        "-slaves/${session_owner}:${local_node}${local_id}.${slavevol}.log",
+        conf->logdir);
+    runner_add_args(&runner, ".", ".", NULL);
+    RUN_GSYNCD_CMD;
+
+    /* MountBroker log-file */
+    runinit_gsyncd_setrx(&runner, conf);
+    runner_add_arg(&runner, "log-file-mbr");
+    runner_argprintf(
+        &runner,
+        "%s/" GEOREP
+        "-slaves/mbr/${session_owner}:${local_node}${local_id}.${slavevol}.log",
+        conf->logdir);
+    runner_add_args(&runner, ".", ".", NULL);
+    RUN_GSYNCD_CMD;
+
+    /* gluster-log-file */
+    runinit_gsyncd_setrx(&runner, conf);
+    runner_add_arg(&runner, "gluster-log-file");
+    runner_argprintf(
+        &runner,
+        "%s/" GEOREP
+        "-slaves/"
+        "${session_owner}:${local_node}${local_id}.${slavevol}.gluster.log",
+        conf->logdir);
+    runner_add_args(&runner, ".", ".", NULL);
+    RUN_GSYNCD_CMD;
+
+out:
+    return ret ? -1 : 0;
+}
+#undef RUN_GSYNCD_CMD
+#else  /* SYNCDAEMON_COMPILE */
+/*
+ * Stub used when the sync daemon is not compiled in: there is nothing to
+ * configure, so it always reports success.
+ */
+static int
+configure_syncdaemon(glusterd_conf_t *conf)
+{
+    (void)conf; /* intentionally unused in this build variant */
+
+    return 0;
+}
+#endif /* !SYNCDAEMON_COMPILE */
+
+/*
+ * check_prepare_mountbroker_root - vet the mountbroker root directory and
+ * make sure the MB_HIVE sub-directory exists inside it.
+ *
+ * Checks performed:
+ *   - @mountbroker_root can be opened and is a directory owned by uid 0
+ *     that is not group/other writable;
+ *   - every ancestor (walked through "..") is owned by uid 0 and is not
+ *     group/other writable unless the sticky bit is set;
+ *   - MB_HIVE exists below the root (it is created if missing) and is a
+ *     directory with mode exactly 0711.
+ * Missing group/other execute bits only produce a warning.
+ *
+ * Returns 0 on success, -1 on any failure. All descriptors opened here are
+ * closed before returning.
+ */
+static int
+check_prepare_mountbroker_root(char *mountbroker_root)
+{
+    int dfd0 = -1; /* stays on mountbroker_root for the whole function */
+    int dfd = -1;  /* directory currently being inspected */
+    int dfd2 = -1; /* parent of dfd */
+    struct stat st = {
+        0,
+    };
+    struct stat st2 = {
+        0,
+    };
+    int ret = 0;
+
+    ret = open(mountbroker_root, O_RDONLY);
+    if (ret != -1) {
+        dfd = ret;
+        ret = sys_fstat(dfd, &st);
+    }
+    if (ret == -1 || !S_ISDIR(st.st_mode)) {
+        gf_msg("glusterd", GF_LOG_ERROR, errno, GD_MSG_DIR_OP_FAILED,
+               "cannot access mountbroker-root directory %s", mountbroker_root);
+        ret = -1;
+        goto out;
+    }
+    if (st.st_uid != 0 || (st.st_mode & (S_IWGRP | S_IWOTH))) {
+        gf_msg("glusterd", GF_LOG_ERROR, 0, GD_MSG_DIR_PERM_LIBERAL,
+               "permissions on mountbroker-root directory %s are "
+               "too liberal",
+               mountbroker_root);
+        ret = -1;
+        goto out;
+    }
+    if (!(st.st_mode & (S_IXGRP | S_IXOTH))) {
+        gf_msg("glusterd", GF_LOG_WARNING, 0, GD_MSG_DIR_PERM_STRICT,
+               "permissions on mountbroker-root directory %s are "
+               "probably too strict",
+               mountbroker_root);
+    }
+
+    /* dfd is consumed by the ancestor walk below; keep a second handle on
+     * the root itself for the MB_HIVE operations at the end. */
+    dfd0 = dup(dfd);
+    if (dfd0 == -1) {
+        gf_msg("glusterd", GF_LOG_ERROR, errno, GD_MSG_DIR_OP_FAILED,
+               "cannot duplicate descriptor of mountbroker-root "
+               "directory %s",
+               mountbroker_root);
+        ret = -1;
+        goto out;
+    }
+
+    for (;;) {
+        ret = sys_openat(dfd, "..", O_RDONLY, 0);
+        if (ret != -1) {
+            dfd2 = ret;
+            ret = sys_fstat(dfd2, &st2);
+        }
+        if (ret == -1) {
+            gf_msg("glusterd", GF_LOG_ERROR, errno, GD_MSG_DIR_OP_FAILED,
+                   "error while checking mountbroker-root ancestors "
+                   "%d (%s)",
+                   errno, strerror(errno));
+            goto out;
+        }
+
+        /* ".." resolving to the same file means the top was reached. A file
+         * is identified by device and inode together; comparing the inode
+         * alone could end the walk early at a mount boundary. */
+        if (st2.st_dev == st.st_dev && st2.st_ino == st.st_ino)
+            break; /* arrived to root */
+
+        if (st2.st_uid != 0 ||
+            ((st2.st_mode & (S_IWGRP | S_IWOTH)) && !(st2.st_mode & S_ISVTX))) {
+            gf_msg("glusterd", GF_LOG_ERROR, 0, GD_MSG_DIR_PERM_LIBERAL,
+                   "permissions on ancestors of mountbroker-root "
+                   "directory are too liberal");
+            ret = -1;
+            goto out;
+        }
+        /* Check the ancestor that was just opened (st2), not its child. */
+        if (!(st2.st_mode & (S_IXGRP | S_IXOTH))) {
+            gf_msg("glusterd", GF_LOG_WARNING, 0, GD_MSG_DIR_PERM_STRICT,
+                   "permissions on ancestors of mountbroker-root "
+                   "directory are probably too strict");
+        }
+
+        /* Step one level up. */
+        sys_close(dfd);
+        dfd = dfd2;
+        st = st2;
+    }
+
+    ret = sys_mkdirat(dfd0, MB_HIVE, 0711);
+    if (ret == -1 && errno == EEXIST)
+        ret = 0;
+    if (ret != -1)
+        ret = sys_fstatat(dfd0, MB_HIVE, &st, AT_SYMLINK_NOFOLLOW);
+    /* The hive must be a real directory (not a symlink) with mode 0711. */
+    if (ret == -1 || st.st_mode != (S_IFDIR | 0711)) {
+        gf_msg("glusterd", GF_LOG_ERROR, errno, GD_MSG_CREATE_DIR_FAILED,
+               "failed to set up mountbroker-root directory %s",
+               mountbroker_root);
+        ret = -1;
+        goto out;
+    }
+
+    ret = 0;
+
+out:
+    if (dfd0 != -1)
+        sys_close(dfd0);
+    if (dfd != -1)
+        sys_close(dfd);
+    /* dfd2 aliases dfd right after a step up; avoid a double close. */
+    if (dfd2 != -1 && dfd != dfd2)
+        sys_close(dfd2);
+
+    return ret;
+}
+
+/*
+ * _install_mount_spec - dictionary iteration callback that turns one
+ * "mountbroker.<label>" or "mountbroker-" GEOREP ".<label>" option into a
+ * gf_mount_spec_t and appends it to priv->mount_specs.
+ *
+ * @opts:  dictionary being iterated (unused here)
+ * @key:   option name; keys without one of the two prefixes are ignored
+ * @value: option value; value->data is the spec description. For geo-rep
+ *         keys it has the form "<volname>[:<user>]", the user defaulting to
+ *         the label.
+ * @data:  unused
+ *
+ * Returns 0 when the key is not a mountbroker key or the spec was installed,
+ * -1 on failure (the partially built spec is released).
+ */
+static int
+_install_mount_spec(dict_t *opts, char *key, data_t *value, void *data)
+{
+    xlator_t *this = THIS;
+    glusterd_conf_t *priv = NULL;
+    char *label = NULL;
+    gf_boolean_t georep = _gf_false;
+    char *pdesc = value->data;
+    char *volname = NULL;
+    int rv = 0;
+    gf_mount_spec_t *mspec = NULL;
+    char *user = NULL;
+
+    /* Assert before touching this->private, not after. */
+    GF_ASSERT(this);
+    priv = this->private;
+
+    label = strtail(key, "mountbroker.");
+
+    /* check for presence of geo-rep label */
+    if (!label) {
+        label = strtail(key, "mountbroker-" GEOREP ".");
+        if (label)
+            georep = _gf_true;
+    }
+
+    if (!label)
+        return 0;
+
+    mspec = GF_CALLOC(1, sizeof(*mspec), gf_gld_mt_mount_spec);
+    if (!mspec) {
+        gf_smsg(this->name, GF_LOG_ERROR, errno, GD_MSG_NO_MEMORY, NULL);
+        goto err;
+    }
+    /* NOTE(review): label presumably points into @key, so the spec relies
+     * on the key string outliving it - confirm against strtail(). */
+    mspec->label = label;
+
+    if (georep) {
+        /* Work on a private copy: the ':' separator is overwritten below. */
+        volname = gf_strdup(pdesc);
+        if (!volname)
+            goto err;
+        user = strchr(volname, ':');
+        if (user) {
+            *user = '\0';
+            user++;
+        } else
+            user = label;
+
+        rv = make_georep_mountspec(mspec, volname, user, priv->logdir);
+
+        GF_FREE(volname);
+        if (rv != 0)
+            goto err;
+    } else if (parse_mount_pattern_desc(mspec, pdesc) != 0)
+        goto err;
+
+    cds_list_add_tail(&mspec->speclist, &priv->mount_specs);
+
+    return 0;
+err:
+
+    gf_msg("glusterd", GF_LOG_ERROR, 0, GD_MSG_MOUNT_SPEC_INSTALL_FAIL,
+           "adding %smount spec failed: label: %s desc: %s",
+           georep ? GEOREP " " : "", label, pdesc ? pdesc : "");
+
+    if (mspec) {
+        if (mspec->patterns) {
+            GF_FREE(mspec->patterns->components);
+            GF_FREE(mspec->patterns);
+        }
+        GF_FREE(mspec);
+    }
+
+    return -1;
+}
+
+/*
+ * The glusterd unix domain socket listener only listens for cli.
+ *
+ * Builds unix-socket transport options for the configured
+ * "glusterd-sockfile" (DEFAULT_GLUSTERD_SOCKFILE when the option is absent),
+ * creates an rpcsvc instance on it, registers the notify callback and every
+ * program in gd_uds_programs[].
+ *
+ * Returns the new rpcsvc_t on success, NULL on failure.
+ */
+rpcsvc_t *
+glusterd_init_uds_listener(xlator_t *this)
+{
+    int ret = -1;
+    dict_t *options = NULL;
+    rpcsvc_t *rpc = NULL;
+    data_t *sock_data = NULL;
+    char sockfile[UNIX_PATH_MAX] = {0};
+    int i = 0;
+
+    GF_ASSERT(this);
+
+    options = dict_new();
+    if (!options) {
+        gf_smsg(this->name, GF_LOG_ERROR, errno, GD_MSG_DICT_CREATE_FAIL, NULL);
+        goto out;
+    }
+
+    sock_data = dict_get(this->options, "glusterd-sockfile");
+    (void)snprintf(sockfile, sizeof(sockfile), "%s",
+                   sock_data ? sock_data->data : DEFAULT_GLUSTERD_SOCKFILE);
+
+    ret = rpcsvc_transport_unix_options_build(options, sockfile);
+    if (ret)
+        goto out;
+
+    rpc = rpcsvc_init(this, this->ctx, options, 8);
+    if (rpc == NULL) {
+        ret = -1;
+        goto out;
+    }
+
+    ret = rpcsvc_register_notify(rpc, glusterd_rpcsvc_notify, this);
+    if (ret) {
+        gf_msg_debug(this->name, 0, "Failed to register notify function");
+        goto out;
+    }
+
+    ret = rpcsvc_create_listeners(rpc, options, this->name);
+    if (ret != 1) {
+        gf_msg_debug(this->name, 0, "Failed to create listener");
+        /* ret is a listener count here (init() treats values < 1 as
+         * failure). A count of 0 must not reach "out" looking like
+         * success, so force an error code. */
+        ret = -1;
+        goto out;
+    }
+    ret = 0;
+
+    for (i = 0; i < gd_uds_programs_count; i++) {
+        ret = glusterd_program_register(this, rpc, gd_uds_programs[i]);
+        if (ret) {
+            /* Roll back the programs registered so far. */
+            i--;
+            for (; i >= 0; i--)
+                rpcsvc_program_unregister(rpc, gd_uds_programs[i]);
+
+            goto out;
+        }
+    }
+
+out:
+    if (options)
+        dict_unref(options);
+
+    if (ret) {
+        gf_msg(this->name, GF_LOG_ERROR, 0,
+               GD_MSG_GLUSTERD_SOCK_LISTENER_START_FAIL,
+               "Failed to start glusterd "
+               "unix domain socket listener.");
+        if (rpc) {
+            /* NOTE(review): only the rpcsvc_t itself is freed; any
+             * listeners/notify state attached to it may need a proper
+             * rpcsvc teardown - confirm. */
+            GF_FREE(rpc);
+            rpc = NULL;
+        }
+    }
+    return rpc;
+}
+
+/*
+ * glusterd_stop_uds_listener - tear down the unix domain socket listener.
+ *
+ * Unregisters every program in gd_uds_programs[] (the same set that
+ * glusterd_init_uds_listener() registers) from conf->uds_rpc, destroys its
+ * listeners, drops the notify callback and unlinks the socket file.
+ *
+ * Nothing is done when this->private or its uds_rpc is NULL.
+ */
+void
+glusterd_stop_uds_listener(xlator_t *this)
+{
+    glusterd_conf_t *conf = NULL;
+    rpcsvc_listener_t *listener = NULL;
+    rpcsvc_listener_t *next = NULL;
+    data_t *sock_data = NULL;
+    char sockfile[UNIX_PATH_MAX] = {0};
+    int i = 0;
+
+    GF_ASSERT(this);
+    conf = this->private;
+    GF_VALIDATE_OR_GOTO(this->name, conf, out);
+    GF_VALIDATE_OR_GOTO(this->name, conf->uds_rpc, out);
+
+    /* Mirror the registration loop instead of naming programs by hand, so
+     * the two can never drift apart. */
+    for (i = 0; i < gd_uds_programs_count; i++) {
+        (void)rpcsvc_program_unregister(conf->uds_rpc, gd_uds_programs[i]);
+    }
+
+    list_for_each_entry_safe(listener, next, &conf->uds_rpc->listeners, list)
+    {
+        rpcsvc_listener_destroy(listener);
+    }
+
+    (void)rpcsvc_unregister_notify(conf->uds_rpc, glusterd_rpcsvc_notify, this);
+
+    /* Same path computation as in glusterd_init_uds_listener(). */
+    sock_data = dict_get(this->options, "glusterd-sockfile");
+    (void)snprintf(sockfile, sizeof(sockfile), "%s",
+                   sock_data ? sock_data->data : DEFAULT_GLUSTERD_SOCKFILE);
+    sys_unlink(sockfile);
+
+out:
+    return;
+}
+
+/*
+ * glusterd_stop_listener - undo the listener setup on conf->rpc: unregister
+ * every program in gd_inet_programs[], destroy all listeners and remove the
+ * notify callback. Returns silently when @this or its private data is NULL.
+ */
+void
+glusterd_stop_listener(xlator_t *this)
+{
+    glusterd_conf_t *conf = NULL;
+    rpcsvc_listener_t *cur = NULL;
+    rpcsvc_listener_t *tmp = NULL;
+    int idx = 0;
+
+    GF_VALIDATE_OR_GOTO("glusterd", this, out);
+    conf = this->private;
+    GF_VALIDATE_OR_GOTO(this->name, conf, out);
+
+    gf_msg_debug(this->name, 0, "%s function called ", __func__);
+
+    /* Programs first ... */
+    while (idx < gd_inet_programs_count) {
+        rpcsvc_program_unregister(conf->rpc, gd_inet_programs[idx]);
+        idx++;
+    }
+
+    /* ... then the listeners; the _safe iterator is required because each
+     * entry is destroyed while walking the list. */
+    list_for_each_entry_safe(cur, tmp, &conf->rpc->listeners, list)
+    {
+        rpcsvc_listener_destroy(cur);
+    }
+
+    (void)rpcsvc_unregister_notify(conf->rpc, glusterd_rpcsvc_notify, this);
+
+out:
+    return;
+}
+
+/*
+ * glusterd_find_correct_var_run_dir - pick the run directory to use.
+ *
+ * GLUSTERD_VAR_RUN_DIR is often a symbolic link to GLUSTERD_RUN_DIR; using
+ * the link would make the mount point recorded in the mtab differ from the
+ * one glusterd keeps. So: when GLUSTERD_VAR_RUN_DIR is a symlink, copy
+ * GLUSTERD_RUN_DIR into @var_run_dir, otherwise copy GLUSTERD_VAR_RUN_DIR.
+ *
+ * @var_run_dir must be large enough for either string (it is filled with
+ * strcpy()).
+ *
+ * Returns 0 on success, non-zero if an argument is NULL or lstat fails.
+ */
+static int
+glusterd_find_correct_var_run_dir(xlator_t *this, char *var_run_dir)
+{
+    int ret = -1;
+    struct stat lst = {
+        0,
+    };
+    const char *chosen = NULL;
+
+    GF_VALIDATE_OR_GOTO("glusterd", this, out);
+    GF_VALIDATE_OR_GOTO(this->name, var_run_dir, out);
+
+    /* lstat, not stat: the link itself is what must be examined. */
+    ret = sys_lstat(GLUSTERD_VAR_RUN_DIR, &lst);
+    if (ret != 0) {
+        gf_msg(this->name, GF_LOG_ERROR, errno, GD_MSG_FILE_OP_FAILED,
+               "stat fails on %s, exiting. (errno = %d)", GLUSTERD_VAR_RUN_DIR,
+               errno);
+        goto out;
+    }
+
+    chosen = S_ISLNK(lst.st_mode) ? GLUSTERD_RUN_DIR : GLUSTERD_VAR_RUN_DIR;
+    strcpy(var_run_dir, chosen);
+
+    ret = 0;
+out:
+    return ret;
+}
+
+/*
+ * glusterd_init_var_run_dirs - make sure the directory
+ * "<var_run_dir><dir_to_be_created>" exists.
+ *
+ * The two strings are concatenated as-is (no separator is inserted). If the
+ * resulting path does not exist it is created with mkdir_p() using mode
+ * 0755; if it exists it must be a directory.
+ *
+ * Returns 0 on success, -1 when an argument is NULL, the combined path does
+ * not fit in PATH_MAX, stat fails for a reason other than ENOENT, the path
+ * exists but is not a directory, or the directory cannot be created.
+ */
+static int
+glusterd_init_var_run_dirs(xlator_t *this, char *var_run_dir,
+                           char *dir_to_be_created)
+{
+    int ret = -1;
+    int32_t len = 0;
+    struct stat buf = {
+        0,
+    };
+    char abs_path[PATH_MAX] = {
+        0,
+    };
+
+    GF_VALIDATE_OR_GOTO("glusterd", this, out);
+    GF_VALIDATE_OR_GOTO(this->name, var_run_dir, out);
+    GF_VALIDATE_OR_GOTO(this->name, dir_to_be_created, out);
+
+    len = snprintf(abs_path, sizeof(abs_path), "%s%s", var_run_dir,
+                   dir_to_be_created);
+    /* Refuse to operate on a silently truncated path. */
+    if ((len < 0) || (len >= sizeof(abs_path))) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_DIR_OP_FAILED,
+               "Path %s%s is too long", var_run_dir, dir_to_be_created);
+        ret = -1;
+        goto out;
+    }
+
+    ret = sys_stat(abs_path, &buf);
+    if ((ret != 0) && (ENOENT != errno)) {
+        gf_msg(this->name, GF_LOG_ERROR, errno, GD_MSG_FILE_OP_FAILED,
+               "stat fails on %s, exiting. (errno = %d)", abs_path, errno);
+        ret = -1;
+        goto out;
+    }
+
+    if ((!ret) && (!S_ISDIR(buf.st_mode))) {
+        gf_msg(this->name, GF_LOG_CRITICAL, ENOENT, GD_MSG_DIR_NOT_FOUND,
+               "Provided snap path %s is not a directory,"
+               "exiting",
+               abs_path);
+        ret = -1;
+        goto out;
+    }
+
+    /* errno is still the one set by sys_stat(): nothing else has been
+     * called on this path since. */
+    if ((-1 == ret) && (ENOENT == errno)) {
+        /* Create missing dirs */
+        ret = mkdir_p(abs_path, 0755, _gf_true);
+
+        if (-1 == ret) {
+            gf_msg(this->name, GF_LOG_CRITICAL, errno, GD_MSG_CREATE_DIR_FAILED,
+                   "Unable to create directory %s"
+                   " ,errno = %d",
+                   abs_path, errno);
+            goto out;
+        }
+    }
+
+out:
+    return ret;
+}
+
+/*
+ * is_upgrade - read the optional "upgrade" option from @options.
+ *
+ * When the key is present its value is parsed as a boolean into @upgrade.
+ * When the key is absent @upgrade is not written and 0 is returned.
+ *
+ * Returns 0 on success, -1 if the value is not a valid boolean.
+ */
+static int
+is_upgrade(dict_t *options, gf_boolean_t *upgrade)
+{
+    char *value = NULL;
+
+    /* Option not supplied: nothing to do. */
+    if (dict_get_str(options, "upgrade", &value) != 0)
+        return 0;
+
+    if (gf_string2boolean(value, upgrade) == 0)
+        return 0;
+
+    gf_msg("glusterd", GF_LOG_ERROR, 0, GD_MSG_STR_TO_BOOL_FAIL,
+           "upgrade option "
+           "%s is not a valid boolean type",
+           value);
+    return -1;
+}
+
+/*
+ * is_downgrade - read the optional "downgrade" option from @options.
+ *
+ * When the key is present its value is parsed as a boolean into @downgrade.
+ * When the key is absent @downgrade is not written and 0 is returned.
+ *
+ * Returns 0 on success, -1 if the value is not a valid boolean.
+ */
+static int
+is_downgrade(dict_t *options, gf_boolean_t *downgrade)
+{
+    char *value = NULL;
+
+    /* Option not supplied: nothing to do. */
+    if (dict_get_str(options, "downgrade", &value) != 0)
+        return 0;
+
+    if (gf_string2boolean(value, downgrade) == 0)
+        return 0;
+
+    gf_msg("glusterd", GF_LOG_ERROR, 0, GD_MSG_STR_TO_BOOL_FAIL,
+           "downgrade option "
+           "%s is not a valid boolean type",
+           value);
+    return -1;
+}
+
+/*
+ * init - called during glusterd initialization
+ *
+ * @this:
+ *
+ */
+int
+init(xlator_t *this)
+{
+    int32_t ret = -1;
+    rpcsvc_t *rpc = NULL;
+    rpcsvc_t *uds_rpc = NULL;
+    glusterd_conf_t *conf = NULL;
+    data_t *dir_data = NULL;
+    struct stat buf = {
+        0,
+    };
+    char storedir[PATH_MAX] = {
+        0,
+    };
+    char workdir[PATH_MAX] = {
+        0,
+    };
+    char rundir[PATH_MAX] = {
+        0,
+    };
+    char logdir[VALID_GLUSTERD_PATHMAX] = {
+        0,
+    };
+    char cmd_log_filename[PATH_MAX] = {
+        0,
+    };
+    char *mountbroker_root = NULL;
+    int i = 0;
+    int total_transport = 0;
+    gf_valgrind_tool vgtool;
+    char *valgrind_str = NULL;
+    char *transport_type = NULL;
+    char var_run_dir[PATH_MAX] = {
+        0,
+    };
+    int32_t workers = 0;
+    gf_boolean_t upgrade = _gf_false;
+    gf_boolean_t downgrade = _gf_false;
+    char *localtime_logging = NULL;
+    int32_t len = 0;
+    int op_version = 0;
+
+#if defined(RUN_WITH_MEMCHECK)
+    vgtool = _gf_memcheck;
+#elif defined(RUN_WITH_DRD)
+    vgtool = _gf_drd;
+#else
+    vgtool = _gf_none;
+#endif
+
+#ifndef GF_DARWIN_HOST_OS
+    {
+        struct rlimit lim;
+        lim.rlim_cur = 65536;
+        lim.rlim_max = 65536;
+
+        if (setrlimit(RLIMIT_NOFILE, &lim) == -1) {
+            gf_smsg(this->name, GF_LOG_ERROR, errno, GD_MSG_SET_XATTR_FAIL,
+                    "Failed to set 'ulimit -n 65536'", NULL);
+        } else {
+            gf_msg(this->name, GF_LOG_INFO, 0, GD_MSG_FILE_DESC_LIMIT_SET,
+                   "Maximum allowed open file descriptors "
+                   "set to 65536");
+        }
+    }
+#endif
+
+    dir_data = dict_get(this->options, "run-directory");
+
+    if (!dir_data) {
+        /* Use default working dir */
+        len = snprintf(rundir, PATH_MAX, "%s", DEFAULT_VAR_RUN_DIRECTORY);
+    } else {
+        len = snprintf(rundir, PATH_MAX, "%s", dir_data->data);
+    }
+    if (len < 0 || len >= PATH_MAX)
+        exit(2);
+
+    dir_data = dict_get(this->options, "cluster-test-mode");
+    if (!dir_data) {
+        /* Use default working dir */
+        len = snprintf(logdir, VALID_GLUSTERD_PATHMAX, "%s",
+                       DEFAULT_LOG_FILE_DIRECTORY);
+    } else {
+        len = snprintf(logdir, VALID_GLUSTERD_PATHMAX, "%s", dir_data->data);
+        gf_msg(this->name, GF_LOG_INFO, 0, GD_MSG_CLUSTER_RC_ENABLE,
+               "cluster-test-mode is enabled logdir is %s", dir_data->data);
+    }
+    if (len < 0 || len >= PATH_MAX)
+        exit(2);
+
+    ret = mkdir_p(logdir, 0777, _gf_true);
+    if ((ret == -1) && (EEXIST != errno)) {
+        gf_msg(THIS->name, GF_LOG_ERROR, errno, GD_MSG_CREATE_DIR_FAILED,
+               "Unable to create log dir %s", logdir);
+        exit(1);
+    }
+
+    dir_data = dict_get(this->options, "working-directory");
+
+    if (!dir_data) {
+        // Use default working dir
+        len = snprintf(workdir, PATH_MAX, "%s", GLUSTERD_DEFAULT_WORKDIR);
+    } else {
+        len = snprintf(workdir, PATH_MAX, "%s", dir_data->data);
+    }
+    if (len < 0 || len >= PATH_MAX)
+        exit(2);
+
+    ret = sys_stat(workdir, &buf);
+    if ((ret != 0) && (ENOENT != errno)) {
+        gf_msg(this->name, GF_LOG_ERROR, errno, GD_MSG_DIR_OP_FAILED,
+               "stat fails on %s, exiting. (errno = %d)", workdir, errno);
+        exit(1);
+    }
+
+    if ((!ret) && (!S_ISDIR(buf.st_mode))) {
+        gf_msg(this->name, GF_LOG_CRITICAL, ENOENT, GD_MSG_DIR_NOT_FOUND,
+               "Provided working area %s is not a directory,"
+               "exiting",
+               workdir);
+        exit(1);
+    }
+
+    if ((-1 == ret) && (ENOENT == errno)) {
+        ret = mkdir_p(workdir, 0755, _gf_true);
+
+        if (-1 == ret) {
+            gf_msg(this->name, GF_LOG_CRITICAL, errno, GD_MSG_CREATE_DIR_FAILED,
+                   "Unable to create directory %s"
+                   " ,errno = %d",
+                   workdir, errno);
+            exit(1);
+        }
+    }
+
+    setenv("GLUSTERD_WORKDIR", workdir, 1);
+    gf_msg(this->name, GF_LOG_INFO, 0, GD_MSG_CURR_WORK_DIR_INFO,
+           "Using %s as working directory", workdir);
+
+    setenv("DEFAULT_VAR_RUN_DIRECTORY", rundir, 1);
+    gf_msg(this->name, GF_LOG_INFO, 0, GD_MSG_CURR_WORK_DIR_INFO,
+           "Using %s as pid file working "
+           "directory",
+           rundir);
+
+    ret = glusterd_find_correct_var_run_dir(this, var_run_dir);
+    if (ret) {
+        gf_msg(this->name, GF_LOG_CRITICAL, 0, GD_MSG_VAR_RUN_DIR_FIND_FAIL,
+               "Unable to find "
+               "the correct var run dir");
+        exit(1);
+    }
+
+    ret = glusterd_init_var_run_dirs(this, var_run_dir,
+                                     GLUSTERD_DEFAULT_SNAPS_BRICK_DIR);
+
+    if (ret) {
+        gf_msg(this->name, GF_LOG_CRITICAL, 0, GD_MSG_CREATE_DIR_FAILED,
+               "Unable to create "
+               "snap backend folder");
+        exit(1);
+    }
+
+    len = snprintf(snap_mount_dir, sizeof(snap_mount_dir), "%s%s", var_run_dir,
+                   GLUSTERD_DEFAULT_SNAPS_BRICK_DIR);
+    if ((len < 0) || (len >= sizeof(snap_mount_dir))) {
+        gf_msg(this->name, GF_LOG_CRITICAL, 0, GD_MSG_DIR_OP_FAILED,
+               "Snap mount dir too long");
+        exit(1);
+    }
+
+    ret = mkdir_p(GLUSTER_SHARED_STORAGE_BRICK_DIR, 0755, _gf_true);
+    if (ret) {
+        gf_msg(this->name, GF_LOG_CRITICAL, 0, GD_MSG_DIR_OP_FAILED,
+               "Unable to create "
+               "shared storage brick");
+        exit(1);
+    }
+
+    ret = glusterd_init_var_run_dirs(this, rundir, GLUSTERD_BITD_RUN_DIR);
+    if (ret) {
+        gf_msg(this->name, GF_LOG_CRITICAL, 0, GD_MSG_CREATE_DIR_FAILED,
+               "Unable to create "
+               "bitd running directory");
+        exit(1);
+    }
+
+    ret = glusterd_init_var_run_dirs(this, rundir, GLUSTERD_SCRUB_RUN_DIR);
+    if (ret) {
+        gf_msg(this->name, GF_LOG_CRITICAL, 0, GD_MSG_CREATE_DIR_FAILED,
+               "Unable to create "
+               "scrub running directory");
+        exit(1);
+    }
+
+#ifdef BUILD_GNFS
+    ret = glusterd_init_var_run_dirs(this, rundir, GLUSTERD_NFS_RUN_DIR);
+    if (ret) {
+        gf_msg(this->name, GF_LOG_CRITICAL, 0, GD_MSG_CREATE_DIR_FAILED,
+               "Unable to create "
+               "nfs running directory");
+        exit(1);
+    }
+#endif
+
+    ret = glusterd_init_var_run_dirs(this, rundir, GLUSTERD_QUOTAD_RUN_DIR);
+    if (ret) {
+        gf_msg(this->name, GF_LOG_CRITICAL, 0, GD_MSG_CREATE_DIR_FAILED,
+               "Unable to create "
+               "quota running directory");
+        exit(1);
+    }
+
+    snprintf(cmd_log_filename, PATH_MAX, "%s/cmd_history.log", logdir);
+    ret = gf_cmd_log_init(cmd_log_filename);
+
+    if (ret == -1) {
+        gf_msg("this->name", GF_LOG_CRITICAL, errno, GD_MSG_FILE_OP_FAILED,
+               "Unable to create cmd log file %s", cmd_log_filename);
+        exit(1);
+    }
+
+    len = snprintf(storedir, sizeof(storedir), "%s/vols", workdir);
+    if ((len < 0) || (len >= sizeof(storedir))) {
+        exit(1);
+    }
+
+    ret = sys_mkdir(storedir, 0755);
+
+    if ((-1 == ret) && (errno != EEXIST)) {
+        gf_msg(this->name, GF_LOG_CRITICAL, errno, GD_MSG_CREATE_DIR_FAILED,
+               "Unable to create volume directory %s"
+               " ,errno = %d",
+               storedir, errno);
+        exit(1);
+    }
+
+    /*keeping individual volume pid file information in /var/run/gluster* */
+    len = snprintf(storedir, sizeof(storedir), "%s/vols", rundir);
+    if ((len < 0) || (len >= sizeof(storedir))) {
+        exit(1);
+    }
+
+    ret = sys_mkdir(storedir, 0755);
+
+    if ((-1 == ret) && (errno != EEXIST)) {
+        gf_msg(this->name, GF_LOG_CRITICAL, errno, GD_MSG_CREATE_DIR_FAILED,
+               "Unable to create volume directory %s"
+               " ,errno = %d",
+               storedir, errno);
+        exit(1);
+    }
+
+    len = snprintf(storedir, sizeof(storedir), "%s/snaps", workdir);
+    if ((len < 0) || (len >= sizeof(storedir))) {
+        exit(1);
+    }
+
+    ret = sys_mkdir(storedir, 0755);
+
+    if ((-1 == ret) && (errno != EEXIST)) {
+        gf_msg(this->name, GF_LOG_CRITICAL, errno, GD_MSG_CREATE_DIR_FAILED,
+               "Unable to create snaps directory %s"
+               " ,errno = %d",
+               storedir, errno);
+        exit(1);
+    }
+
+    len = snprintf(storedir, sizeof(storedir), "%s/peers", workdir);
+    if ((len < 0) || (len >= sizeof(storedir))) {
+        exit(1);
+    }
+
+    ret = sys_mkdir(storedir, 0755);
+
+    if ((-1 == ret) && (errno != EEXIST)) {
+        gf_msg(this->name, GF_LOG_CRITICAL, errno, GD_MSG_CREATE_DIR_FAILED,
+               "Unable to create peers directory %s"
+               " ,errno = %d",
+               storedir, errno);
+        exit(1);
+    }
+
+    len = snprintf(storedir, sizeof(storedir), "%s/bricks", logdir);
+    if ((len < 0) || (len >= sizeof(storedir))) {
+        exit(1);
+    }
+
+    ret = sys_mkdir(storedir, 0755);
+    if ((-1 == ret) && (errno != EEXIST)) {
+        gf_msg(this->name, GF_LOG_CRITICAL, errno, GD_MSG_CREATE_DIR_FAILED,
+               "Unable to create logs directory %s"
+               " ,errno = %d",
+               storedir, errno);
+        exit(1);
+    }
+
+#ifdef BUILD_GNFS
+    len = snprintf(storedir, sizeof(storedir), "%s/nfs", workdir);
+    if ((len < 0) || (len >= sizeof(storedir))) {
+        exit(1);
+    }
+    ret = sys_mkdir(storedir, 0755);
+    if ((-1 == ret) && (errno != EEXIST)) {
+        gf_msg(this->name, GF_LOG_CRITICAL, errno, GD_MSG_CREATE_DIR_FAILED,
+               "Unable to create nfs directory %s"
+               " ,errno = %d",
+               storedir, errno);
+        exit(1);
+    }
+#endif
+    len = snprintf(storedir, sizeof(storedir), "%s/bitd", workdir);
+    if ((len < 0) || (len >= sizeof(storedir))) {
+        exit(1);
+    }
+    ret = sys_mkdir(storedir, 0755);
+    if ((-1 == ret) && (errno != EEXIST)) {
+        gf_msg(this->name, GF_LOG_CRITICAL, errno, GD_MSG_CREATE_DIR_FAILED,
+               "Unable to create bitrot directory %s", storedir);
+        exit(1);
+    }
+
+    len = snprintf(storedir, sizeof(storedir), "%s/scrub", workdir);
+    if ((len < 0) || (len >= sizeof(storedir))) {
+        exit(1);
+    }
+    ret = sys_mkdir(storedir, 0755);
+    if ((-1 == ret) && (errno != EEXIST)) {
+        gf_msg(this->name, GF_LOG_CRITICAL, errno, GD_MSG_CREATE_DIR_FAILED,
+               "Unable to create scrub directory %s", storedir);
+        exit(1);
+    }
+
+    len = snprintf(storedir, sizeof(storedir), "%s/glustershd", workdir);
+    if ((len < 0) || (len >= sizeof(storedir))) {
+        exit(1);
+    }
+    ret = sys_mkdir(storedir, 0755);
+    if ((-1 == ret) && (errno != EEXIST)) {
+        gf_msg(this->name, GF_LOG_CRITICAL, errno, GD_MSG_CREATE_DIR_FAILED,
+               "Unable to create glustershd directory %s"
+               " ,errno = %d",
+               storedir, errno);
+        exit(1);
+    }
+
+    len = snprintf(storedir, sizeof(storedir), "%s/quotad", workdir);
+    if ((len < 0) || (len >= sizeof(storedir))) {
+        exit(1);
+    }
+    ret = sys_mkdir(storedir, 0755);
+    if ((-1 == ret) && (errno != EEXIST)) {
+        gf_msg(this->name, GF_LOG_CRITICAL, errno, GD_MSG_CREATE_DIR_FAILED,
+               "Unable to create quotad directory %s"
+               " ,errno = %d",
+               storedir, errno);
+        exit(1);
+    }
+
+    len = snprintf(storedir, sizeof(storedir), "%s/groups", workdir);
+    if ((len < 0) || (len >= sizeof(storedir))) {
+        exit(1);
+    }
+    ret = sys_mkdir(storedir, 0755);
+    if ((-1 == ret) && (errno != EEXIST)) {
+        gf_msg(this->name, GF_LOG_CRITICAL, errno, GD_MSG_CREATE_DIR_FAILED,
+               "Unable to create glustershd directory %s"
+               " ,errno = %d",
+               storedir, errno);
+        exit(1);
+    }
+
+    ret = glusterd_rpcsvc_options_build(this->options);
+    if (ret)
+        goto out;
+    rpc = rpcsvc_init(this, this->ctx, this->options, 64);
+    if (rpc == NULL) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_RPC_INIT_FAIL,
+               "failed to init rpc");
+        goto out;
+    }
+
+    ret = rpcsvc_register_notify(rpc, glusterd_rpcsvc_notify, this);
+    if (ret) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_RPCSVC_REG_NOTIFY_RETURNED,
+               "rpcsvc_register_notify returned %d", ret);
+        goto out;
+    }
+
+    /* Enable encryption for the TCP listener if management encryption is
+     * enabled
+     */
+    if (this->ctx->secure_mgmt) {
+        ret = dict_set_str(this->options, "transport.socket.ssl-enabled", "on");
+        if (ret != 0) {
+            gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_DICT_SET_FAILED,
+                   "failed to set ssl-enabled in dict");
+            goto out;
+        }
+        /*
+         * This is the only place where we want secure_srvr to reflect
+         * the management-plane setting.
+         */
+        this->ctx->secure_srvr = MGMT_SSL_ALWAYS;
+    }
+
+    /*
+     * only one (at most a pair - rdma and socket) listener for
+     * glusterd1_mop_prog, gluster_pmap_prog and gluster_handshake_prog.
+     */
+
+    ret = dict_get_str(this->options, "transport-type", &transport_type);
+    if (ret) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_DICT_GET_FAILED,
+               "Failed to get transport type");
+        ret = -1;
+        goto out;
+    }
+
+    total_transport = rpc_transport_count(transport_type);
+    if (total_transport <= 0) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_RPC_TRANSPORT_COUNT_GET_FAIL,
+               "failed to get total number of available tranpsorts");
+        ret = -1;
+        goto out;
+    }
+
+    ret = rpcsvc_create_listeners(rpc, this->options, this->name);
+    if (ret < 1) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_RPC_LISTENER_CREATE_FAIL,
+               "creation of listener failed");
+        ret = -1;
+        goto out;
+    } else if (ret < total_transport) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_RPC_LISTENER_CREATE_FAIL,
+               "creation of %d listeners failed, continuing with "
+               "succeeded transport",
+               (total_transport - ret));
+    }
+
+    for (i = 0; i < gd_inet_programs_count; i++) {
+        ret = glusterd_program_register(this, rpc, gd_inet_programs[i]);
+        if (ret) {
+            i--;
+            for (; i >= 0; i--)
+                rpcsvc_program_unregister(rpc, gd_inet_programs[i]);
+
+            goto out;
+        }
+    }
+
+    /*
+     * Start a unix domain socket listener just for cli commands. This
+     * should prevent ports from being wasted by being in TIME_WAIT when
+     * cli commands are done continuously.
+     */
+    uds_rpc = glusterd_init_uds_listener(this);
+    if (uds_rpc == NULL) {
+        ret = -1;
+        goto out;
+    }
+
+    conf = GF_CALLOC(1, sizeof(glusterd_conf_t), gf_gld_mt_glusterd_conf_t);
+    GF_VALIDATE_OR_GOTO(this->name, conf, out);
+
+    CDS_INIT_LIST_HEAD(&conf->peers);
+    CDS_INIT_LIST_HEAD(&conf->volumes);
+    CDS_INIT_LIST_HEAD(&conf->snapshots);
+    CDS_INIT_LIST_HEAD(&conf->missed_snaps_list);
+    CDS_INIT_LIST_HEAD(&conf->brick_procs);
+    CDS_INIT_LIST_HEAD(&conf->shd_procs);
+    pthread_mutex_init(&conf->attach_lock, NULL);
+    pthread_mutex_init(&conf->volume_lock, NULL);
+
+    pthread_mutex_init(&conf->mutex, NULL);
+    conf->rpc = rpc;
+    conf->uds_rpc = uds_rpc;
+    conf->gfs_mgmt = &gd_brick_prog;
+    conf->restart_shd = _gf_false;
+    this->private = conf;
+    /* conf->workdir and conf->rundir are smaller than PATH_MAX; gcc's
+     * snprintf checking will throw an error here if sprintf is used.
+     * Dueling gcc-8 and coverity, now coverity isn't smart enough to
+     * detect that these strncpy calls are safe. And for extra fun,
+     * the annotations don't do anything. */
+    if (strlen(workdir) >= sizeof(conf->workdir)) {
+        ret = -1;
+        goto out;
+    }
+    /* coverity[BUFFER_SIZE_WARNING] */
+    (void)strncpy(conf->workdir, workdir, sizeof(conf->workdir));
+    /* separate tests because combined tests confuses gcc */
+    if (strlen(rundir) >= sizeof(conf->rundir)) {
+        ret = -1;
+        goto out;
+    }
+    /* coverity[BUFFER_SIZE_WARNING] */
+    (void)strncpy(conf->rundir, rundir, sizeof(conf->rundir));
+
+    /* coverity[BUFFER_SIZE_WARNING] */
+    (void)strncpy(conf->logdir, logdir, sizeof(conf->logdir));
+
+    synclock_init(&conf->big_lock, SYNC_LOCK_RECURSIVE);
+    synccond_init(&conf->cond_restart_bricks);
+    synccond_init(&conf->cond_restart_shd);
+    synccond_init(&conf->cond_blockers);
+    pthread_mutex_init(&conf->xprt_lock, NULL);
+    INIT_LIST_HEAD(&conf->xprt_list);
+    pthread_mutex_init(&conf->import_volumes, NULL);
+
+    glusterd_friend_sm_init();
+    glusterd_op_sm_init();
+    glusterd_opinfo_init();
+    ret = glusterd_sm_tr_log_init(
+        &conf->op_sm_log, glusterd_op_sm_state_name_get,
+        glusterd_op_sm_event_name_get, GLUSTERD_TR_LOG_SIZE);
+    if (ret)
+        goto out;
+
+    conf->base_port = GF_IANA_PRIV_PORTS_START;
+    if (dict_get_uint32(this->options, "base-port", &conf->base_port) == 0) {
+        gf_msg(this->name, GF_LOG_INFO, 0, GD_MSG_DICT_SET_FAILED,
+               "base-port override: %d", conf->base_port);
+    }
+    conf->max_port = GF_PORT_MAX;
+    if (dict_get_uint32(this->options, "max-port", &conf->max_port) == 0) {
+        gf_msg(this->name, GF_LOG_INFO, 0, GD_MSG_DICT_SET_FAILED,
+               "max-port override: %d", conf->max_port);
+    }
+
+    conf->mgmt_v3_lock_timeout = GF_LOCK_TIMER;
+    if (dict_get_uint32(this->options, "lock-timer",
+                        &conf->mgmt_v3_lock_timeout) == 0) {
+        gf_msg(this->name, GF_LOG_INFO, 0, GD_MSG_DICT_SET_FAILED,
+               "lock-timer override: %d", conf->mgmt_v3_lock_timeout);
+    }
+
+    /* Set option to run bricks on valgrind if enabled in glusterd.vol */
+    this->ctx->cmd_args.vgtool = vgtool;
+    ret = dict_get_str(this->options, "run-with-valgrind", &valgrind_str);
+    if (ret < 0) {
+        gf_msg_debug(this->name, 0, "cannot get run-with-valgrind value");
+    }
+    if (valgrind_str) {
+        gf_boolean_t vg = _gf_false;
+
+        if (!strcmp(valgrind_str, "memcheck"))
+            this->ctx->cmd_args.vgtool = _gf_memcheck;
+        else if (!strcmp(valgrind_str, "drd"))
+            this->ctx->cmd_args.vgtool = _gf_drd;
+        else if (!gf_string2boolean(valgrind_str, &vg))
+            this->ctx->cmd_args.vgtool = (vg ? _gf_memcheck : _gf_none);
+        else
+            gf_msg(this->name, GF_LOG_WARNING, EINVAL, GD_MSG_INVALID_ENTRY,
+                   "run-with-valgrind is neither boolean"
+                   " nor one of 'memcheck' or 'drd'");
+    }
+
+    /* Store ping-timeout in conf */
+    ret = dict_get_int32(this->options, "ping-timeout", &conf->ping_timeout);
+    /* Not failing here since ping-timeout can be optional as well */
+
+    glusterd_mgmt_v3_lock_init();
+    glusterd_mgmt_v3_lock_timer_init();
+    glusterd_txn_opinfo_dict_init();
+
+#ifdef BUILD_GNFS
+    glusterd_nfssvc_build(&conf->nfs_svc);
+#endif
+    glusterd_quotadsvc_build(&conf->quotad_svc);
+    glusterd_bitdsvc_build(&conf->bitd_svc);
+    glusterd_scrubsvc_build(&conf->scrub_svc);
+
+    /* Make install copies few of the hook-scripts by creating hooks
+     * directory. Hence purposefully not doing the check for the presence of
+     * hooks directory. Doing so avoids creation of complete hooks directory
+     * tree.
+     */
+    ret = glusterd_hooks_create_hooks_directory(conf->workdir);
+    if (-1 == ret) {
+        gf_msg(this->name, GF_LOG_CRITICAL, errno, GD_MSG_DIR_OP_FAILED,
+               "Unable to create hooks directory ");
+        exit(1);
+    }
+
+    CDS_INIT_LIST_HEAD(&conf->mount_specs);
+
+    ret = dict_foreach(this->options, _install_mount_spec, NULL);
+    if (ret)
+        goto out;
+    ret = dict_get_str(this->options, "mountbroker-root", &mountbroker_root);
+    if (ret)
+        ret = 0;
+    else
+        ret = check_prepare_mountbroker_root(mountbroker_root);
+    if (ret)
+        goto out;
+
+    ret = is_upgrade(this->options, &upgrade);
+    if (ret)
+        goto out;
+
+    ret = is_downgrade(this->options, &downgrade);
+    if (ret)
+        goto out;
+
+    if (!upgrade && !downgrade) {
+        ret = configure_syncdaemon(conf);
+        if (ret)
+            goto out;
+    }
+
+    /* Restoring op-version needs to be done before initializing the
+     * services as glusterd_svc_init_common () invokes
+     * glusterd_conn_build_socket_filepath () which uses MY_UUID macro.
+     * MY_UUID generates a new uuid if its not been generated and writes it
+     * in the info file, Since the op-version is not read yet
+     * the default value i.e. 0 will be written for op-version and restore
+     * will fail. This is why restoring op-version needs to happen before
+     * service initialization
+     * */
+    ret = glusterd_restore_op_version(this);
+    if (ret) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_OP_VERS_RESTORE_FAIL,
+               "Failed to restore op_version");
+        goto out;
+    }
+
+    ret = glusterd_restore();
+    if (ret < 0)
+        goto out;
+
+    if (dict_get_str(conf->opts, GLUSTERD_LOCALTIME_LOGGING_KEY,
+                     &localtime_logging) == 0) {
+        int already_enabled = gf_log_get_localtime();
+
+        if (strcmp(localtime_logging, "enable") == 0) {
+            gf_log_set_localtime(1);
+            if (!already_enabled)
+                gf_msg(this->name, GF_LOG_INFO, 0,
+                       GD_MSG_LOCALTIME_LOGGING_ENABLE,
+                       "localtime logging enable");
+        } else if (strcmp(localtime_logging, "disable") == 0) {
+            gf_log_set_localtime(0);
+            if (already_enabled)
+                gf_msg(this->name, GF_LOG_INFO, 0,
+                       GD_MSG_LOCALTIME_LOGGING_DISABLE,
+                       "localtime logging disable");
+        }
+    }
+
+    GF_ATOMIC_INIT(conf->blockers, 0);
+    ret = glusterd_handle_upgrade_downgrade(this->options, conf, upgrade,
+                                            downgrade);
+    if (ret)
+        goto out;
+
+    ret = glusterd_retrieve_max_op_version(this, &op_version);
+    /* The first condition means the file is not present, i.e. either this
+     * code is running for the first time or someone has deleted the file from
+     * the backend. The second condition means the max op_version differs. In
+     * both cases the volfiles should be regenerated.
+     */
+    if (op_version == 0 || op_version != GD_OP_VERSION_MAX) {
+        gf_log(this->name, GF_LOG_INFO,
+               "Regenerating volfiles due to a max op-version mismatch or "
+               "glusterd.upgrade file not being present, op_version retrieved:"
+               "%d, max op_version: %d",
+               op_version, GD_OP_VERSION_MAX);
+        glusterd_recreate_volfiles(conf);
+        ret = glusterd_store_max_op_version(this);
+        if (ret)
+            gf_log(this->name, GF_LOG_ERROR, "Failed to store max op-version");
+    }
+
+    /* If the peer count is less than 2 then this would be the best time to
+     * spawn process/bricks that may need (re)starting since last time
+     * (this) glusterd was up. */
+    if (glusterd_get_peers_count() < 2)
+        glusterd_launch_synctask(glusterd_spawn_daemons, NULL);
+
+    ret = glusterd_hooks_spawn_worker(this);
+    if (ret)
+        goto out;
+
+    GF_OPTION_INIT("event-threads", workers, int32, out);
+    if (workers > 0 && workers != conf->workers) {
+        conf->workers = workers;
+        ret = gf_event_reconfigure_threads(this->ctx->event_pool, workers);
+        if (ret)
+            goto out;
+    }
+
+    ret = 0;
+out:
+    if (ret < 0) {
+        if (this->private != NULL) {
+            GF_FREE(this->private);
+            this->private = NULL;
+        }
+    }
+
+    return ret;
+}
+
+/*
+ * fini - teardown hook for glusterd (registered as xlator_api.fini).
+ *        Stops the unix-domain-socket and TCP/IP RPC listeners only.
+ *
+ * @this: glusterd xlator; nothing is done if it or this->private is NULL.
+ *        The private conf is deliberately not freed (see the #if 0 block).
+ */
+void
+fini(xlator_t *this)
+{
+    if (!this || !this->private)
+        goto out;
+
+    glusterd_stop_uds_listener(this); /*stop unix socket rpc*/
+    glusterd_stop_listener(this);     /*stop tcp/ip socket rpc*/
+
+#if 0
+       /* Running threads might still be using these resources. They would
+        * have to be cancelled/stopped before the memory is deallocated, but
+        * we have no control over them to call pthread_cancel().
+        * So freeing the memory is handed over to the kernel.
+        */
+        /*TODO: cancel/stop the running threads*/
+
+        GF_FREE (conf->uds_rpc);
+        GF_FREE (conf->rpc);
+        FREE (conf->pmap);
+        if (conf->handle)
+                gf_store_handle_destroy (conf->handle);
+        glusterd_sm_tr_log_delete (&conf->op_sm_log);
+        glusterd_mgmt_v3_lock_fini ();
+        glusterd_mgmt_v3_lock_timer_fini ();
+        glusterd_txn_opinfo_dict_fini ();
+        GF_FREE (conf);
+
+        this->private = NULL;
+#endif
+out:
+    return;
+}
+
+/*
+ * notify - event notification hook for glusterd
+ * @this:  glusterd xlator
+ * @event: GF_EVENT_* code; POLLIN, POLLERR and CLEANUP are ignored here
+ * @data:  event payload, forwarded untouched to default_notify()
+ * Returns 0 always; the result of default_notify() is not propagated.
+ */
+int
+notify(xlator_t *this, int32_t event, void *data, ...)
+{
+    int ret = 0;
+
+    switch (event) {
+        case GF_EVENT_POLLIN:
+            break;
+
+        case GF_EVENT_POLLERR:
+            break;
+
+        case GF_EVENT_CLEANUP:
+            break;
+
+        default:
+            default_notify(this, event, data);
+            break;
+    }
+
+    return ret;
+}
+
+struct xlator_fops fops; /* no initializer; exported via xlator_api.fops */
+
+struct xlator_cbks cbks; /* no initializer; exported via xlator_api.cbks */
+
+struct xlator_dumpops dumpops = { /* exported via xlator_api.dumpops */
+    .priv = glusterd_dump_priv,
+};
+
+struct volume_options options[] = { /* exported via xlator_api.options */
+    {
+        .key = {"working-directory"},
+        .type = GF_OPTION_TYPE_PATH,
+    },
+    {
+        .key = {"transport-type"},
+        .type = GF_OPTION_TYPE_ANY,
+    },
+    {
+        .key = {"transport.*"},
+        .type = GF_OPTION_TYPE_ANY,
+    },
+    {
+        .key = {"rpc-auth.*"},
+        .type = GF_OPTION_TYPE_ANY,
+    },
+    {
+        .key = {"rpc-auth-allow-insecure"},
+        .type = GF_OPTION_TYPE_BOOL,
+    },
+    {
+        .key = {"upgrade"},
+        .type = GF_OPTION_TYPE_BOOL,
+    },
+    {
+        .key = {"downgrade"},
+        .type = GF_OPTION_TYPE_BOOL,
+    },
+    {
+        .key = {"bind-insecure"},
+        .type = GF_OPTION_TYPE_BOOL,
+    },
+    {
+        .key = {"mountbroker-root"},
+        .type = GF_OPTION_TYPE_PATH,
+    },
+    {
+        .key = {"mountbroker.*"},
+        .type = GF_OPTION_TYPE_ANY,
+    },
+    {
+        .key = {"mountbroker-" GEOREP ".*"},
+        .type = GF_OPTION_TYPE_ANY,
+    },
+    {
+        .key = {GEOREP "-log-group"},
+        .type = GF_OPTION_TYPE_ANY,
+    },
+    {
+        .key = {"run-with-valgrind"},
+        .type = GF_OPTION_TYPE_BOOL, /* NOTE(review): init() also compares this value to "memcheck"/"drd" - confirm BOOL typing permits them */
+    },
+    {.key = {"server-quorum-type"},
+     .type = GF_OPTION_TYPE_STR,
+     .value = {"none", "server"},
+     .default_value = "none",
+     .description = "It can be set to none or server. When set to server, "
+                    "this option enables the specified volume to "
+                    "participate in the server-side quorum. "
+                    "This feature is on the server-side i.e. in glusterd. "
+                    "Whenever the glusterd on a machine observes that "
+                    "the quorum is not met, it brings down the bricks to "
+                    "prevent data split-brains. When the network "
+                    "connections are brought back up and the quorum is "
+                    "restored the bricks in   "
+                    "the volume are brought back up."},
+    {.key = {"server-quorum-ratio"},
+     .type = GF_OPTION_TYPE_PERCENT,
+     .description = "Sets the quorum percentage for the trusted "
+                    "storage pool."},
+    {.key = {"glusterd-sockfile"},
+     .type = GF_OPTION_TYPE_PATH,
+     .description = "The socket file on which glusterd should listen for "
+                    "cli requests. Default is " DEFAULT_GLUSTERD_SOCKFILE "."},
+    {.key = {"base-port"}, /* read in init() into conf->base_port */
+     .type = GF_OPTION_TYPE_INT,
+     .description = "Sets the base port for portmap query"},
+    {.key = {"max-port"}, /* read in init() into conf->max_port */
+     .type = GF_OPTION_TYPE_INT,
+     .max = GF_PORT_MAX,
+     .description = "Sets the max port for portmap query"},
+    {.key = {"mgmt-v3-lock-timeout"}, /* NOTE(review): init() reads the key "lock-timer", not this one - confirm */
+     .type = GF_OPTION_TYPE_INT,
+     .max = 600,
+     .description = "Sets the mgmt-v3-lock-timeout for transactions."
+                    "Specifes the default timeout value after which "
+                    "lock acquired while performing transaction will "
+                    "be released."},
+    {.key = {"snap-brick-path"},
+     .type = GF_OPTION_TYPE_STR,
+     .description =
+         "directory where the bricks for the snapshots will be created"},
+    {
+        .key = {"ping-timeout"}, /* read in init() into conf->ping_timeout */
+        .type = GF_OPTION_TYPE_TIME,
+        .min = 0,
+        .max = 300,
+        .default_value = TOSTRING(RPC_DEFAULT_PING_TIMEOUT),
+    },
+    {.key = {"event-threads"}, /* read in init() via GF_OPTION_INIT */
+     .type = GF_OPTION_TYPE_INT,
+     .min = 1,
+     .max = 32,
+     .default_value = "2",
+     .description = "Specifies the number of event threads to execute "
+                    "in parallel. Larger values would help process"
+                    " responses faster, depending on available processing"
+                    " power. Range 1-32 threads."},
+    {.key = {NULL}}, /* terminating entry */
+};
+
+xlator_api_t xlator_api = { /* glusterd entry points and tables defined in this file */
+    .init = init,
+    .fini = fini,
+    .mem_acct_init = mem_acct_init,
+    .op_version = {1}, /* Present from the initial version */
+    .dumpops = &dumpops,
+    .fops = &fops,
+    .cbks = &cbks,
+    .options = options, /* option table, terminated by a NULL key */
+    .identifier = "glusterd",
+    .category = GF_MAINTAINED,
+};
diff --git a/xlators/mgmt/glusterd/src/glusterd.h b/xlators/mgmt/glusterd/src/glusterd.h
new file mode 100644
index 00000000000..cc4f98ecf47
--- /dev/null
+++ b/xlators/mgmt/glusterd/src/glusterd.h
@@ -0,0 +1,1375 @@
+/*
+   Copyright (c) 2006-2013 Red Hat, Inc. <http://www.redhat.com>
+   This file is part of GlusterFS.
+
+   This file is licensed to you under your choice of the GNU Lesser
+   General Public License, version 3 or any later version (LGPLv3 or
+   later), or the GNU General Public License, version 2 (GPLv2), in all
+   cases as published by the Free Software Foundation.
+*/
+#ifndef _GLUSTERD_H_
+#define _GLUSTERD_H_
+
+#include <sys/types.h>
+#include <dirent.h>
+#include <pthread.h>
+#include <libgen.h>
+
+#include <glusterfs/compat-uuid.h>
+
+#include "rpc-clnt.h"
+#include <glusterfs/glusterfs.h>
+#include <glusterfs/xlator.h>
+#include <glusterfs/logging.h>
+#include <glusterfs/call-stub.h>
+#include <glusterfs/byte-order.h>
+#include "glusterd-mem-types.h"
+#include "rpcsvc.h"
+#include "glusterd-sm.h"
+#include "glusterd-snapd-svc.h"
+#include "glusterd-shd-svc.h"
+#include "glusterd-bitd-svc.h"
+#include "glusterd1-xdr.h"
+#include "protocol-common.h"
+#include "glusterd-pmap.h"
+#include "cli1-xdr.h"
+#include <glusterfs/syncop.h>
+#include <glusterfs/store.h>
+#include "glusterd-rcu.h"
+#include <glusterfs/events.h>
+#include "glusterd-gfproxyd-svc.h"
+
+#include "gd-common-utils.h"
+
+#define GLUSTERD_TR_LOG_SIZE 50 /* size passed to glusterd_sm_tr_log_init() for conf->op_sm_log */
+#define GLUSTERD_QUORUM_TYPE_KEY "cluster.server-quorum-type"
+#define GLUSTERD_QUORUM_RATIO_KEY "cluster.server-quorum-ratio"
+#define GLUSTERD_GLOBAL_OPT_VERSION "global-option-version"
+#define GLUSTERD_GLOBAL_OP_VERSION_KEY "cluster.op-version"
+#define GLUSTERD_MAX_OP_VERSION_KEY "cluster.max-op-version"
+#define GLUSTERD_COMMON_PEM_PUB_FILE "/geo-replication/common_secret.pem.pub"
+#define GEO_CONF_MAX_OPT_VALS 6
+#define GLUSTERD_CREATE_HOOK_SCRIPT                                            \
+    "/hooks/1/gsync-create/post/"                                              \
+    "S56glusterd-geo-rep-create-post.sh"
+#define GLUSTERD_SHRD_STRG_HOOK_SCRIPT                                         \
+    "/hooks/1/set/post/"                                                       \
+    "S32gluster_enable_shared_storage.sh"
+#define GLUSTER_SHARED_STORAGE "gluster_shared_storage"
+#define GLUSTERD_SHARED_STORAGE_KEY "cluster.enable-shared-storage"
+#define GLUSTERD_BRICK_MULTIPLEX_KEY "cluster.brick-multiplex"
+#define GLUSTERD_VOL_CNT_PER_THRD "glusterd.vol_count_per_thread"
+#define GLUSTERD_BRICKMUX_LIMIT_KEY "cluster.max-bricks-per-process"
+#define GLUSTERD_BRICKMUX_LIMIT_DFLT_VALUE "250"
+#define GLUSTERD_VOL_CNT_PER_THRD_DEFAULT_VALUE "100"
+#define GLUSTERD_LOCALTIME_LOGGING_KEY "cluster.localtime-logging" /* looked up in conf->opts by init(); "enable"/"disable" toggles localtime logging */
+#define GLUSTERD_DAEMON_LOG_LEVEL_KEY "cluster.daemon-log-level"
+
+#define GANESHA_HA_CONF CONFDIR "/ganesha-ha.conf"
+#define GANESHA_EXPORT_DIRECTORY CONFDIR "/exports"
+
+#define GLUSTERD_SNAPS_MAX_HARD_LIMIT 256
+#define GLUSTERD_SNAPS_DEF_SOFT_LIMIT_PERCENT 90
+#define GLUSTERD_SNAPS_MAX_SOFT_LIMIT_PERCENT 100
+#define GLUSTERD_SERVER_QUORUM "server"
+#define STATUS_STRLEN 128
+
+#define FMTSTR_CHECK_VOL_EXISTS "Volume %s does not exist"
+#define FMTSTR_RESOLVE_BRICK "Could not find peer on which brick %s:%s resides"
+
+#define LOGSTR_FOUND_BRICK "Found brick %s:%s in volume %s"
+#define LOGSTR_BUILD_PAYLOAD "Failed to build payload for operation 'Volume %s'"
+#define LOGSTR_STAGE_FAIL "Staging of operation 'Volume %s' failed on %s %s %s"
+#define LOGSTR_COMMIT_FAIL "Commit of operation 'Volume %s' failed on %s %s %s"
+
+#define OPERRSTR_BUILD_PAYLOAD                                                 \
+    "Failed to build payload. Please check the log "                           \
+    "file for more details."
+#define OPERRSTR_STAGE_FAIL                                                    \
+    "Staging failed on %s. Please check the log file "                         \
+    "for more details."
+#define OPERRSTR_COMMIT_FAIL                                                   \
+    "Commit failed on %s. Please check the log file "                          \
+    "for more details."
+struct glusterd_volinfo_;
+typedef struct glusterd_volinfo_ glusterd_volinfo_t;
+
+struct glusterd_snap_;
+typedef struct glusterd_snap_ glusterd_snap_t;
+
+/* When adding an operation for a new feature, append its enumerator at the
+ * end of the list, immediately before GD_OP_MAX.
+ */
+typedef enum glusterd_op_ {
+    GD_OP_NONE = 0,
+    GD_OP_CREATE_VOLUME,
+    GD_OP_START_BRICK,
+    GD_OP_STOP_BRICK,
+    GD_OP_DELETE_VOLUME,
+    GD_OP_START_VOLUME,
+    GD_OP_STOP_VOLUME,
+    GD_OP_DEFRAG_VOLUME,
+    GD_OP_ADD_BRICK,
+    GD_OP_REMOVE_BRICK,
+    GD_OP_REPLACE_BRICK,
+    GD_OP_SET_VOLUME,
+    GD_OP_RESET_VOLUME,
+    GD_OP_SYNC_VOLUME,
+    GD_OP_LOG_ROTATE,
+    GD_OP_GSYNC_SET,
+    GD_OP_PROFILE_VOLUME,
+    GD_OP_QUOTA,
+    GD_OP_STATUS_VOLUME,
+    GD_OP_REBALANCE,
+    GD_OP_HEAL_VOLUME,
+    GD_OP_STATEDUMP_VOLUME,
+    GD_OP_LIST_VOLUME,
+    GD_OP_CLEARLOCKS_VOLUME,
+    GD_OP_DEFRAG_BRICK_VOLUME,
+    GD_OP_COPY_FILE,
+    GD_OP_SYS_EXEC,
+    GD_OP_GSYNC_CREATE,
+    GD_OP_SNAP,
+    GD_OP_BARRIER,
+    GD_OP_GANESHA, /* obsolete */
+    GD_OP_BITROT,
+    GD_OP_DETACH_TIER,
+    GD_OP_TIER_MIGRATE,
+    GD_OP_SCRUB_STATUS,
+    GD_OP_SCRUB_ONDEMAND,
+    GD_OP_RESET_BRICK,
+    GD_OP_MAX_OPVERSION,
+    GD_OP_TIER_START_STOP,
+    GD_OP_TIER_STATUS,
+    GD_OP_DETACH_TIER_STATUS,
+    GD_OP_DETACH_NOT_STARTED,
+    GD_OP_REMOVE_TIER_BRICK,
+    GD_OP_ADD_TIER_BRICK,
+    GD_OP_MAX, /* end marker; new enumerators go above this line */
+} glusterd_op_t;
+
+extern const char *gd_op_list[];
+
+struct glusterd_volgen { /* wraps a single dict pointer; users are not in this part of the file */
+    dict_t *dict;
+};
+
+/* Sizing every path buffer in glusterd at PATH_MAX invites buffer
+   overflows, because further path components get appended to the
+   brick path, workdir, etc. Capping the base paths at a smaller
+   value leaves headroom, so all functionality can keep working.
+   For example, a snapname of maximum length is appended to the
+   brick path and the result is stored under workdir; if each of
+   those could already be PATH_MAX long, the combined path would
+   have little chance of fitting. */
+#define VALID_GLUSTERD_PATHMAX (PATH_MAX - (256 + 64))
+
+typedef struct { /* glusterd private state; init() allocates it and stores it in this->private */
+    struct _volfile_ctx *volfile;
+    pthread_mutex_t mutex;
+    struct cds_list_head peers;
+    uuid_t uuid;
+    rpcsvc_t *rpc; /* service returned by rpcsvc_init() in init() */
+    glusterd_svc_t nfs_svc; /* built in init() only when BUILD_GNFS is defined */
+    glusterd_svc_t bitd_svc;
+    glusterd_svc_t scrub_svc;
+    glusterd_svc_t quotad_svc;
+    struct pmap_registry *pmap;
+    struct cds_list_head volumes;
+    struct cds_list_head snapshots;   /*List of snap volumes */
+    struct cds_list_head brick_procs; /* List of brick processes */
+    struct cds_list_head shd_procs;   /* List of shd processes */
+    pthread_mutex_t xprt_lock;
+    struct list_head xprt_list;
+    pthread_mutex_t import_volumes;
+    gf_store_handle_t *handle;
+    gf_timer_t *timer;
+    glusterd_sm_tr_log_t op_sm_log; /* set up by glusterd_sm_tr_log_init() in init() */
+    struct rpc_clnt_program *gfs_mgmt; /* init() points this at gd_brick_prog */
+    dict_t *mgmt_v3_lock;        /* Dict for saving
+                                  * mgmt_v3 locks */
+    dict_t *glusterd_txn_opinfo; /* Dict for saving
+                                  * transaction opinfos */
+    uuid_t global_txn_id;        /* To be used in
+                                  * heterogeneous
+                                  * cluster with no
+                                  * transaction ids */
+
+    dict_t *mgmt_v3_lock_timer;
+    struct cds_list_head mount_specs;
+    pthread_t brick_thread;
+    void *hooks_priv;
+
+    xlator_t *xl; /* Should be set to 'THIS' before creating thread */
+    /* needed for a proper handshake */
+    int op_version; /* Starts with 1 for 3.3.0 */
+    gf_boolean_t pending_quorum_action;
+    gf_boolean_t verify_volfile_checksum;
+    gf_boolean_t trace;
+    gf_boolean_t restart_done;
+    dict_t *opts;
+    synclock_t big_lock; /* initialised as SYNC_LOCK_RECURSIVE in init() */
+    synccond_t cond_restart_bricks;
+    synccond_t cond_restart_shd;
+    synccond_t cond_blockers;
+    rpcsvc_t *uds_rpc; /* RPCSVC for the unix domain socket */
+    uint32_t base_port; /* GF_IANA_PRIV_PORTS_START unless "base-port" is set */
+    uint32_t max_port; /* GF_PORT_MAX unless "max-port" is set */
+    char *snap_bricks_directory;
+    gf_store_handle_t *missed_snaps_list_shandle;
+    struct cds_list_head missed_snaps_list;
+    int ping_timeout; /* read from the "ping-timeout" option in init() */
+    uint32_t generation;
+    int32_t workers; /* event thread count taken from "event-threads" in init() */
+    uint32_t mgmt_v3_lock_timeout; /* GF_LOCK_TIMER unless "lock-timer" is set */
+    gf_atomic_t blockers; /* initialised to 0 in init() */
+    pthread_mutex_t attach_lock; /* Lock can be per process or a common one */
+    pthread_mutex_t volume_lock; /* We release the big_lock from a lot of
+                                    places, which might lead to modification
+                                    of the volinfo list.
+                                 */
+    gf_atomic_t thread_count;
+    gf_boolean_t restart_bricks;
+    gf_boolean_t restart_shd; /* This flag prevents running two shd managers
+                                 simultaneously
+                              */
+    char workdir[VALID_GLUSTERD_PATHMAX]; /* copied in init() after a length check */
+    char rundir[VALID_GLUSTERD_PATHMAX]; /* copied in init() after a length check */
+    char logdir[VALID_GLUSTERD_PATHMAX]; /* NOTE(review): init() strncpy()s this with no preceding length check, unlike workdir/rundir */
+} glusterd_conf_t;
+
+typedef struct glusterd_add_dict_args { /* argument bundle; its consumer is not in this part of the file */
+    xlator_t *this;
+    dict_t *voldict;
+    int start; /* presumably start/end delimit a range of items to process - TODO confirm */
+    int end;
+} glusterd_add_dict_args_t;
+
+typedef struct glusterd_friend_synctask_args { /* argument bundle; its consumer is not in this part of the file */
+    char *dict_buf; /* presumably a serialized dict - TODO confirm */
+    u_int dictlen; /* presumably the length of dict_buf - TODO confirm */
+} glusterd_friend_synctask_args_t;
+
+typedef enum gf_brick_status { /* type of glusterd_brickinfo.status */
+    GF_BRICK_STOPPED,
+    GF_BRICK_STARTED,
+    GF_BRICK_STOPPING,
+    GF_BRICK_STARTING
+} gf_brick_status_t;
+
+typedef struct glusterd_brickinfo glusterd_brickinfo_t;
+
+struct glusterd_brick_proc { /* one brick process; glusterd_brickinfo.brick_proc points at this type */
+    int port;
+    uint32_t brick_count;
+    struct cds_list_head brick_proc_list; /* presumably the link into glusterd_conf_t.brick_procs - TODO confirm */
+    struct cds_list_head bricks; /* presumably the head that glusterd_brickinfo.mux_bricks links into - TODO confirm */
+};
+
+typedef struct glusterd_brick_proc glusterd_brick_proc_t;
+
+struct glusterd_brickinfo { /* per-brick record; typedef'd as glusterd_brickinfo_t */
+    struct cds_list_head brick_list;
+    uuid_t uuid;
+    int port;
+    int rdma_port;
+    char *logfile;
+    gf_store_handle_t *shandle;
+    struct rpc_clnt *rpc;
+    int decommissioned;
+    gf_brick_status_t status;
+    int32_t snap_status;
+    /*
+     * The group identifies which bricks are part of the same replica
+     * set during brick-volfile generation, so that JBR volfiles can
+     * "cross-connect" the bricks to one another. It is also used by
+     * AFR to load the arbiter xlator in the appropriate brick in case of
+     * a replica 3 volume with arbiter enabled.
+     */
+    uint16_t group;
+    gf_boolean_t port_registered;
+    gf_boolean_t start_triggered;
+
+    /* The fields below are used for handling the case of multiple bricks
+       sharing the backend filesystem */
+    uint64_t statfs_fsid;
+    pthread_mutex_t restart_mutex;
+    glusterd_brick_proc_t *brick_proc; /* Information regarding mux bricks */
+    struct cds_list_head mux_bricks; /* List to store the bricks in brick_proc*/
+    uint32_t fs_share_count;
+    char hostname[NAME_MAX];
+    char path[VALID_GLUSTERD_PATHMAX];
+    char real_path[VALID_GLUSTERD_PATHMAX];
+    char device_path[VALID_GLUSTERD_PATHMAX];
+    char mount_dir[VALID_GLUSTERD_PATHMAX];
+    char brick_id[1024];   /* Client xlator name, AFR changelog name */
+    char fstype[NAME_MAX]; /* Brick file-system type */
+    char mnt_opts[1024];   /* Brick mount options */
+    char vg[PATH_MAX];     /* FIXME: Use max size for length of vg */
+};
+
+struct glusterd_gfproxyd_info {
+    char *logfile;
+    short port; /* NOTE(review): other port fields in this header are int; a 16-bit signed short cannot hold ports above 32767 - confirm intended */
+};
+
+struct gf_defrag_brickinfo_ { /* element type of glusterd_defrag_info_.bricks */
+    char *name;
+    int files;
+    int size;
+};
+
+typedef int (*defrag_cbk_fn_t)(glusterd_volinfo_t *volinfo,
+                               gf_defrag_status_t status); /* callback type held in glusterd_defrag_info_.cbk_fn */
+
+struct glusterd_defrag_info_ { /* pointed to by glusterd_rebalance_.defrag */
+    uint64_t total_files;
+    uint64_t total_data;
+    uint64_t num_files_lookedup;
+    uint64_t total_failures;
+    gf_lock_t lock;
+    int cmd;
+    uint32_t connected;
+    pthread_t th;
+    struct rpc_clnt *rpc;
+    struct gf_defrag_brickinfo_ *bricks; /* volinfo->brick_count */
+    defrag_cbk_fn_t cbk_fn;
+    gf_defrag_status_t defrag_status;
+    char mount[1024];
+};
+
+typedef struct glusterd_defrag_info_ glusterd_defrag_info_t;
+
+typedef enum gf_transport_type_ { /* type of glusterd_volinfo_.transport_type */
+    GF_TRANSPORT_TCP,  // DEFAULT
+    GF_TRANSPORT_RDMA,
+    GF_TRANSPORT_BOTH_TCP_RDMA,
+} gf_transport_type;
+
+typedef enum gf_rb_status_ { /* presumably replace-brick status, going by the RB prefix - TODO confirm */
+    GF_RB_STATUS_NONE,
+    GF_RB_STATUS_STARTED,
+    GF_RB_STATUS_PAUSED,
+} gf_rb_status_t;
+
+struct _auth { /* username/password pair; embedded in glusterd_volinfo_ as 'auth' */
+    char *username;
+    char *password;
+};
+
+typedef struct _auth auth_t;
+
+/* Capabilities of xlator (single-bit flags; bit 0x00000010 is not assigned) */
+#define CAPS_BD 0x00000001
+#define CAPS_THIN 0x00000002
+#define CAPS_OFFLOAD_COPY 0x00000004
+#define CAPS_OFFLOAD_SNAPSHOT 0x00000008
+#define CAPS_OFFLOAD_ZERO 0x00000020
+
+struct glusterd_bitrot_scrub_ { /* embedded in glusterd_volinfo_ as 'bitrot_scrub' */
+    char *scrub_state;
+    char *scrub_impact;
+    char *scrub_freq;
+    uint64_t scrubbed_files;
+    uint64_t unsigned_files;
+    uint64_t last_scrub_time;
+    uint64_t scrub_duration;
+    uint64_t error_count;
+};
+
+typedef struct glusterd_bitrot_scrub_ glusterd_bitrot_scrub_t;
+
+struct glusterd_rebalance_ { /* embedded in glusterd_volinfo_ as 'rebal' */
+    uint64_t rebalance_files;
+    uint64_t rebalance_data;
+    uint64_t lookedup_files;
+    uint64_t skipped_files;
+    uint64_t rebalance_failures;
+    glusterd_defrag_info_t *defrag;
+    gf_cli_defrag_type defrag_cmd;
+    gf_defrag_status_t defrag_status;
+    uuid_t rebalance_id;
+    double rebalance_time;
+    uint64_t time_left;
+    dict_t *dict; /* Dict to store misc information,
+                   * such as the list of bricks being removed */
+    glusterd_op_t op;
+    uint32_t commit_hash;
+};
+
+typedef struct glusterd_rebalance_ glusterd_rebalance_t;
+
+struct glusterd_replace_brick_ { /* source/destination brick pair; embedded in glusterd_volinfo_ as 'rep_brick' */
+    glusterd_brickinfo_t *src_brick;
+    glusterd_brickinfo_t *dst_brick;
+};
+
+typedef struct glusterd_replace_brick_ glusterd_replace_brick_t;
+
+typedef enum gd_quorum_status_ { /* type of glusterd_volinfo_.quorum_status */
+    NOT_APPLICABLE_QUORUM,  // Does not follow quorum
+    MEETS_QUORUM,           // Follows quorum and meets it.
+    DOESNT_MEET_QUORUM,     // Follows quorum and does not meet it.
+} gd_quorum_status_t;
+
+struct glusterd_volinfo_ { /* per-volume record; typedef'd as glusterd_volinfo_t */
+    gf_lock_t lock;
+    glusterd_snap_t *snapshot;
+    uuid_t restored_from_snap;
+    int type;
+    int brick_count;
+    uint64_t snap_count;
+    uint64_t snap_max_hard_limit;
+    struct cds_list_head vol_list;
+    /* vol_list: in case of a snap volume
+       i.e (is_snap_volume == TRUE) this
+       is linked to glusterd_snap_t->volumes.
+       In case of a non-snap volume, this is
+       linked to glusterd_conf_t->volumes */
+    struct cds_list_head snapvol_list;
+    /* snapvol_list: this is a current pointer for
+       glusterd_volinfo_t->snap_volumes */
+    struct cds_list_head bricks;
+    struct cds_list_head ta_bricks;
+    struct cds_list_head snap_volumes;
+    /* TODO : Need to remove this, as this
+     * is already part of snapshot object.
+     */
+    glusterd_volume_status status;
+    int sub_count; /* backward compatibility */
+    int stripe_count;
+    int replica_count;
+    int arbiter_count;
+    int thin_arbiter_count;
+    int disperse_count;
+    int redundancy_count;
+    int subvol_count;    /* Number of subvolumes in a
+                          distribute volume */
+    int dist_leaf_count; /* Number of bricks in one
+                          distribute subvolume */
+    int port;
+    gf_store_handle_t *shandle;
+    gf_store_handle_t *node_state_shandle;
+    gf_store_handle_t *quota_conf_shandle;
+
+    /* Defrag/rebalance related */
+    glusterd_rebalance_t rebal;
+
+    /* Replace brick status */
+    glusterd_replace_brick_t rep_brick;
+
+    /* Bitrot scrub status */
+    glusterd_bitrot_scrub_t bitrot_scrub;
+
+    int version;
+    uint32_t quota_conf_version;
+    uint32_t cksum;
+    uint32_t quota_conf_cksum;
+
+    dict_t *dict;
+
+    uuid_t volume_id;
+    auth_t auth;
+    char *logdir;
+
+    dict_t *gsync_slaves;
+    dict_t *gsync_active_slaves;
+
+    xlator_t *xl;
+    int decommission_in_progress;
+
+    int op_version;
+    int client_op_version;
+    int32_t quota_xattr_version;
+    pthread_mutex_t reflock;
+    int refcnt;
+    gd_quorum_status_t quorum_status;
+
+    glusterd_snapdsvc_t snapd;
+    glusterd_shdsvc_t shd;
+    glusterd_gfproxydsvc_t gfproxyd;
+    pthread_mutex_t store_volinfo_lock; /* acquire this lock when
+                                         * updating the volinfo
+                                         */
+    gf_transport_type transport_type;
+    gf_boolean_t is_snap_volume;
+    gf_boolean_t memory_accounting;
+    gf_boolean_t stage_deleted; /* volume has passed staging
+                                 * for delete operation
+                                 */
+    char parent_volname[GD_VOLUME_NAME_MAX];
+    /* parent_volname: in case of a snap volume
+       i.e (is_snap_volume == TRUE) this
+       field will contain the name of
+       the volume which is snapped. In
+       case of a non-snap volume, this
+       field will be initialized as N/A */
+    char volname[NAME_MAX + 1];
+    /* NAME_MAX + 1 will be equal to
+     * GD_VOLUME_NAME_MAX + 5 (and also to
+     * GD_VOLUME_NAME_MAX_TIER). An extra 5
+     * bytes are added to GD_VOLUME_NAME_MAX
+     * because, as part of the tiering
+     * volfile generation code, we are
+     * temporarily appending either "-hot"
+     * or "-cold" */
+    gf_atomic_t volpeerupdate;
+    /* Flag recording whether the volume has received
+       updates from a peer
+    */
+};
+
+typedef enum gd_snap_status_ { /* type of glusterd_snap_.snap_status */
+    GD_SNAP_STATUS_NONE, /* first enumerator, so its value is 0 */
+    GD_SNAP_STATUS_INIT,
+    GD_SNAP_STATUS_IN_USE,
+    GD_SNAP_STATUS_DECOMMISSION,
+    GD_SNAP_STATUS_UNDER_RESTORE,
+    GD_SNAP_STATUS_RESTORED,
+} gd_snap_status_t;
+
+struct glusterd_snap_ { /* one snapshot; see glusterd_volinfo_.snapshot */
+    gf_lock_t lock;
+    struct cds_list_head volumes; /* snap volumes link in via their vol_list */
+    struct cds_list_head snap_list;
+    char *description;
+    uuid_t snap_id;
+    time_t time_stamp;
+    gf_store_handle_t *shandle;
+    gd_snap_status_t snap_status;
+    gf_boolean_t snap_restored;
+    char snapname[GLUSTERD_MAX_SNAP_NAME]; /* used in snaps/<snapname>/ paths */
+};
+
+typedef struct glusterd_snap_op_ { /* list node, chained by snap_ops_list */
+    char *snap_vol_id;
+    char *brick_path;
+    struct cds_list_head snap_ops_list;
+    int32_t brick_num;
+    int32_t op;
+    int32_t status; /* presumably a missed_snap_stat value; confirm */
+} glusterd_snap_op_t;
+
+typedef struct glusterd_missed_snap_ { /* list node, chained by missed_snaps */
+    char *node_uuid;
+    char *snap_uuid;
+    struct cds_list_head missed_snaps;
+    struct cds_list_head snap_ops; /* presumably glusterd_snap_op_t entries */
+} glusterd_missed_snap_info; /* note: this typedef name has no _t suffix */
+
+typedef enum gd_node_type_ { /* type of glusterd_pending_node_.type */
+    GD_NODE_NONE, /* first enumerator, so its value is 0 */
+    GD_NODE_BRICK,
+    GD_NODE_SHD,
+    GD_NODE_REBALANCE,
+    GD_NODE_NFS,
+    GD_NODE_QUOTAD,
+    GD_NODE_SNAPD,
+    GD_NODE_BITD,
+    GD_NODE_SCRUB,
+    GD_NODE_TIERD
+} gd_node_type;
+
+typedef enum missed_snap_stat { /* enumerators take the values 0, 1, 2 */
+    GD_MISSED_SNAP_NONE,
+    GD_MISSED_SNAP_PENDING,
+    GD_MISSED_SNAP_DONE,
+} missed_snap_stat;
+
+typedef struct glusterd_pending_node_ { /* list node, chained by list */
+    struct cds_list_head list;
+    void *node; /* untyped; presumably interpreted according to type */
+    gd_node_type type;
+    int32_t index;
+} glusterd_pending_node_t;
+
+struct gsync_config_opt_vals_ {
+    char *op_name;
+    char *values[GEO_CONF_MAX_OPT_VALS]; /* fixed-capacity array of strings */
+    int no_of_pos_vals; /* presumably number of entries used in values[] */
+    gf_boolean_t case_sensitive;
+};
+
+enum glusterd_op_ret { /* single enumerator with an explicit value */
+    GLUSTERD_CONNECTION_AWAITED = 100,
+};
+
+enum glusterd_vol_comp_status_ {
+    GLUSTERD_VOL_COMP_NONE = 0,
+    GLUSTERD_VOL_COMP_SCS = 1,
+    GLUSTERD_VOL_COMP_UPDATE_REQ, /* implicitly 2 */
+    GLUSTERD_VOL_COMP_RJT, /* implicitly 3 */
+};
+
+typedef struct addrinfo_list { /* list node carrying one struct addrinfo * */
+    struct cds_list_head list;
+    struct addrinfo *info;
+} addrinfo_list_t;
+
+typedef enum { /* anonymous enum: three explicitly valued result codes */
+    GF_AI_COMPARE_NO_MATCH = 0,
+    GF_AI_COMPARE_MATCH = 1,
+    GF_AI_COMPARE_ERROR = 2
+} gf_ai_compare_t;
+
+#define GLUSTERD_DEFAULT_PORT GF_DEFAULT_BASE_PORT
+#define GLUSTERD_INFO_FILE "glusterd.info"
+#define GLUSTERD_UPGRADE_FILE                                                  \
+    "glusterd.upgrade" /* zero byte file to detect a need for regenerating     \
+                          volfiles in container mode */
+#define GLUSTERD_VOLUME_QUOTA_CONFIG "quota.conf"
+#define GLUSTERD_VOLUME_DIR_PREFIX "vols"
+#define GLUSTERD_PEER_DIR_PREFIX "peers"
+#define GLUSTERD_VOLUME_INFO_FILE "info" /* same text as the snap info file */
+#define GLUSTERD_VOLUME_SNAPD_INFO_FILE "snapd.info"
+#define GLUSTERD_SNAP_INFO_FILE "info" /* same text as the volume info file */
+#define GLUSTERD_VOLUME_RBSTATE_FILE "rbstate"
+#define GLUSTERD_BRICK_INFO_DIR "bricks"
+#define GLUSTERD_CKSUM_FILE "cksum"
+#define GLUSTERD_VOL_QUOTA_CKSUM_FILE "quota.cksum"
+#define GLUSTERD_TRASH "trash"
+#define GLUSTERD_NODE_STATE_FILE "node_state.info"
+#define GLUSTERD_MISSED_SNAPS_LIST_FILE "missed_snaps_list"
+#define GLUSTERD_VOL_SNAP_DIR_PREFIX "snaps"
+
+#define GLUSTERD_DEFAULT_SNAPS_BRICK_DIR "/gluster/snaps"
+#define GLUSTERD_BITD_RUN_DIR "/bitd" /* these start with '/', unlike above */
+#define GLUSTERD_SCRUB_RUN_DIR "/scrub"
+#define GLUSTERD_NFS_RUN_DIR "/nfs"
+#define GLUSTERD_QUOTAD_RUN_DIR "/quotad"
+#define GLUSTER_SHARED_STORAGE_BRICK_DIR GLUSTERD_DEFAULT_WORKDIR "/ss_brick"
+#define GLUSTERD_VAR_RUN_DIR "/var/run"
+#define GLUSTERD_RUN_DIR "/run"
+
+/* definitions related to replace brick */
+#define RB_CLIENT_MOUNTPOINT "rb_mount"
+#define RB_CLIENTVOL_FILENAME "rb_client.vol"
+#define RB_DSTBRICK_PIDFILE "rb_dst_brick.pid"
+#define RB_DSTBRICKVOL_FILENAME "rb_dst_brick.vol"
+#define RB_PUMP_DEF_ARG "default"
+
+#define GLUSTERD_UUID_LEN 50 /* NOTE(review): unit/terminator not shown here */
+
+typedef ssize_t (*gd_serialize_t)(struct iovec outmsg, void *args);
+
+#define GLUSTERD_GET_VOLUME_DIR(path, volinfo, priv)                           \
+    do { /* path[PATH_MAX] <- this volume's dir under priv->workdir */         \
+        int32_t _vol_dir_len;                                                  \
+        if (volinfo->is_snap_volume) { /* snapshot volume layout */            \
+            _vol_dir_len = snprintf(                                           \
+                path, PATH_MAX, "%s/snaps/%s/%s", priv->workdir,               \
+                volinfo->snapshot->snapname, volinfo->volname);                \
+        } else {                                                               \
+            _vol_dir_len = snprintf(path, PATH_MAX, "%s/vols/%s",              \
+                                    priv->workdir, volinfo->volname);          \
+        }                                                                      \
+        if ((_vol_dir_len < 0) || (_vol_dir_len >= PATH_MAX)) {                \
+            path[0] = 0; /* error or truncation: empty string */               \
+        }                                                                      \
+    } while (0)
+
+#define GLUSTERD_GET_DEFRAG_DIR(path, volinfo, priv)                           \
+    do { /* path[PATH_MAX] <- "<volume dir>/rebalance", or "" on failure */    \
+        char vol_path[PATH_MAX];                                               \
+        int32_t _defrag_dir_len;                                               \
+        GLUSTERD_GET_VOLUME_DIR(vol_path, volinfo, priv);                      \
+        _defrag_dir_len = snprintf(path, PATH_MAX, "%s/rebalance", vol_path);  \
+        if (!vol_path[0] || /* volume dir failed: do not emit "/rebalance" */  \
+            (_defrag_dir_len < 0) || (_defrag_dir_len >= PATH_MAX)) {          \
+            path[0] = 0;                                                       \
+        }                                                                      \
+    } while (0)
+
+#define GLUSTERD_GET_DEFRAG_PID_FILE(path, volinfo, priv)                      \
+    do { /* path[PATH_MAX] <- "<defrag dir>/<my uuid>.pid", "" on failure */   \
+        char defrag_path[PATH_MAX];                                            \
+        int32_t _defrag_pidfile_len;                                           \
+        GLUSTERD_GET_DEFRAG_DIR(defrag_path, volinfo, priv);                   \
+        _defrag_pidfile_len = snprintf(path, PATH_MAX, "%s/%s.pid",            \
+                                       defrag_path, uuid_utoa(MY_UUID));       \
+        if (!defrag_path[0] || (_defrag_pidfile_len < 0) ||                    \
+            (_defrag_pidfile_len >= PATH_MAX))                                 \
+            path[0] = 0; /* also when the defrag dir itself failed */          \
+    } while (0)
+
+#define GLUSTERD_GET_SHD_RUNDIR(path, volinfo, priv)                           \
+    do { /* path[PATH_MAX] <- "<rundir>/shd/<volname>", "" on failure */       \
+        int32_t _shd_dir_len;                                                  \
+        _shd_dir_len = snprintf(path, PATH_MAX, "%s/shd/%s", priv->rundir,     \
+                                volinfo->volname);                             \
+        if ((_shd_dir_len < 0) || (_shd_dir_len >= PATH_MAX)) {                \
+            path[0] = 0; /* error or truncation: empty string */               \
+        }                                                                      \
+    } while (0)
+
+#define GLUSTERD_GET_VOLUME_PID_DIR(path, volinfo, priv)                       \
+    do { /* path[PATH_MAX] <- this volume's dir under priv->rundir */          \
+        int32_t _vol_pid_len;                                                  \
+        if (volinfo->is_snap_volume) { /* snapshot volume layout */            \
+            _vol_pid_len = snprintf(path, PATH_MAX, "%s/snaps/%s/%s",          \
+                                    priv->rundir, volinfo->snapshot->snapname, \
+                                    volinfo->volname);                         \
+        } else {                                                               \
+            _vol_pid_len = snprintf(path, PATH_MAX, "%s/vols/%s",              \
+                                    priv->rundir, volinfo->volname);           \
+        }                                                                      \
+        if ((_vol_pid_len < 0) || (_vol_pid_len >= PATH_MAX)) {                \
+            path[0] = 0; /* error or truncation: empty string */               \
+        }                                                                      \
+    } while (0)
+
+#define GLUSTERD_GET_SNAP_GEO_REP_DIR(path, snap, priv)                        \
+    do { /* path[PATH_MAX] <- <workdir>/snaps/<snapname>/GEOREP value */       \
+        int32_t _snap_geo_len;                                                 \
+        _snap_geo_len = snprintf(path, PATH_MAX, "%s/snaps/%s/%s",             \
+                                 priv->workdir, snap->snapname, GEOREP);       \
+        if ((_snap_geo_len < 0) || (_snap_geo_len >= PATH_MAX)) {              \
+            path[0] = 0; /* error or truncation: empty string */               \
+        }                                                                      \
+    } while (0)
+
+#define GLUSTERD_GET_QUOTA_LIMIT_MOUNT_PATH(abspath, volname, path)            \
+    do { /* sizeof(abspath): abspath must be an array, not a pointer */        \
+        snprintf(abspath, sizeof(abspath) - 1,                                 \
+                 DEFAULT_VAR_RUN_DIRECTORY "/%s_quota_limit%s", volname,       \
+                 path); /* snprintf result is not checked */                   \
+    } while (0)
+
+#define GLUSTERD_REMOVE_SLASH_FROM_PATH(path, string)                          \
+    do { /* string <- path minus its first char, inner '/' become '-' */       \
+        size_t _i = 0, _len = strlen(path); /* length computed only once */    \
+        for (_i = 1; _i < _len; _i++) {                                        \
+            (string)[_i - 1] = (path)[_i];                                     \
+            if ((string)[_i - 1] == '/' && (_i != _len - 1))                   \
+                (string)[_i - 1] = '-'; /* a trailing '/' is left as is */     \
+        }                                                                      \
+    } while (0) /* no NUL is written: 'string' must be zero-filled */
+
+#define GLUSTERD_GET_BRICK_PIDFILE(pidfile, volinfo, brickinfo, priv)          \
+    do { /* pidfile[PATH_MAX] <- "<pid dir>/<host>-<flattened path>.pid" */    \
+        char exp_path[PATH_MAX] = { /* zero-fill supplies the NUL */           \
+            0,                                                                 \
+        };                                                                     \
+        char volpath[PATH_MAX] = {                                             \
+            0,                                                                 \
+        };                                                                     \
+        int32_t _brick_pid_len = 0;                                            \
+        GLUSTERD_GET_VOLUME_PID_DIR(volpath, volinfo, priv);                   \
+        GLUSTERD_REMOVE_SLASH_FROM_PATH(brickinfo->path, exp_path);            \
+        _brick_pid_len = snprintf(pidfile, PATH_MAX, "%s/%s-%s.pid", volpath,  \
+                                  brickinfo->hostname, exp_path);              \
+        if (!volpath[0] || (_brick_pid_len < 0) ||                             \
+            (_brick_pid_len >= PATH_MAX))                                      \
+            pidfile[0] = 0; /* "" on any failure, incl. the pid dir */         \
+    } while (0)
+
+#define RCU_READ_LOCK                                                          \
+    pthread_mutex_lock(&(THIS->ctx)->cleanup_lock);                            \
+    {                                                                          \
+        rcu_read_lock();                                                       \
+    }                                                                          \
+    pthread_mutex_unlock(&(THIS->ctx)->cleanup_lock);
+
+#define RCU_READ_UNLOCK                                                        \
+    pthread_mutex_lock(&(THIS->ctx)->cleanup_lock);                            \
+    {                                                                          \
+        rcu_read_unlock();                                                     \
+    }                                                                          \
+    pthread_mutex_unlock(&(THIS->ctx)->cleanup_lock);
+
+#define GLUSTERD_DUMP_PEERS(head, member, xpeers)                              \
+    do { /* dump each peer on list 'head', walked under RCU_READ_LOCK */       \
+        glusterd_peerinfo_t *_peerinfo = NULL;                                 \
+        int _index = 1; /* '_' prefix avoids capturing caller names */         \
+        char *_key = NULL;                                                     \
+                                                                               \
+        _key = (xpeers) ? "glusterd.xaction_peer" : "glusterd.peer";           \
+                                                                               \
+        RCU_READ_LOCK;                                                         \
+        cds_list_for_each_entry_rcu(_peerinfo, head, member)                   \
+        {                                                                      \
+            glusterd_dump_peer(_peerinfo, _key, _index, xpeers);               \
+            if (!(xpeers)) /* rpc stats are skipped when xpeers is set */      \
+                glusterd_dump_peer_rpcstat(_peerinfo, _key, _index);           \
+            _index++;                                                          \
+        }                                                                      \
+        RCU_READ_UNLOCK;                                                       \
+                                                                               \
+    } while (0)
+
+int
+glusterd_uuid_init();
+
+int
+glusterd_uuid_generate_save();
+
+#define MY_UUID (__glusterd_uuid())
+
+static inline unsigned char *
+__glusterd_uuid(void) /* backs MY_UUID; (void) makes this a real prototype */
+{
+    glusterd_conf_t *priv = THIS->private;
+
+    if (gf_uuid_is_null(priv->uuid)) /* uuid is still null */
+        glusterd_uuid_init(); /* NOTE(review): return value is ignored */
+    return &priv->uuid[0]; /* pointer into priv, not a copy */
+}
+
+int
+glusterd_big_locked_notify(struct rpc_clnt *rpc, void *mydata,
+                           rpc_clnt_event_t event, void *data,
+                           rpc_clnt_notify_t notify_fn);
+
+int
+glusterd_big_locked_cbk(struct rpc_req *req, struct iovec *iov, int count,
+                        void *myframe, fop_cbk_fn_t fn);
+
+int
+glusterd_big_locked_handler(rpcsvc_request_t *req, rpcsvc_actor actor_fn);
+
+int32_t
+glusterd_brick_from_brickinfo(glusterd_brickinfo_t *brickinfo,
+                              char **new_brick);
+int
+glusterd_probe_begin(rpcsvc_request_t *req, const char *hoststr, int port,
+                     dict_t *dict, int *op_errno);
+
+int
+glusterd_xfer_friend_add_resp(rpcsvc_request_t *req, char *myhostname,
+                              char *remote_hostname, int port, int32_t op_ret,
+                              int32_t op_errno);
+
+int
+glusterd_friend_add(const char *hoststr, int port,
+                    glusterd_friend_sm_state_t state, uuid_t *uuid,
+                    glusterd_peerinfo_t **friend, gf_boolean_t restore,
+                    glusterd_peerctx_args_t *args);
+
+int
+glusterd_friend_add_from_peerinfo(glusterd_peerinfo_t *friend,
+                                  gf_boolean_t restore,
+                                  glusterd_peerctx_args_t *args);
+int
+glusterd_friend_rpc_create(xlator_t *this, glusterd_peerinfo_t *peerinfo,
+                           glusterd_peerctx_args_t *args);
+int
+glusterd_friend_remove(uuid_t uuid, char *hostname);
+
+int
+glusterd_op_lock_send_resp(rpcsvc_request_t *req, int32_t status);
+
+int
+glusterd_op_unlock_send_resp(rpcsvc_request_t *req, int32_t status);
+
+int
+glusterd_op_mgmt_v3_lock_send_resp(rpcsvc_request_t *req, uuid_t *txn_id,
+                                   int32_t status);
+
+int
+glusterd_op_mgmt_v3_unlock_send_resp(rpcsvc_request_t *req, uuid_t *txn_id,
+                                     int32_t status);
+
+int
+glusterd_op_stage_send_resp(rpcsvc_request_t *req, int32_t op, int32_t status,
+                            char *op_errstr, dict_t *rsp_dict);
+
+int /* NOTE(review): "commmit" has three m's; cf. glusterd_op_commit_send_resp */
+glusterd_op_commmit_send_resp(rpcsvc_request_t *req, int32_t op,
+                              int32_t status);
+
+int32_t
+glusterd_create_volume(rpcsvc_request_t *req, dict_t *dict);
+
+int
+glusterd_handle_incoming_friend_req(rpcsvc_request_t *req);
+
+int
+glusterd_handle_probe_query(rpcsvc_request_t *req);
+
+int
+glusterd_handle_cluster_lock(rpcsvc_request_t *req);
+
+int
+glusterd_handle_cluster_unlock(rpcsvc_request_t *req);
+
+int
+glusterd_handle_stage_op(rpcsvc_request_t *req);
+
+int
+glusterd_handle_commit_op(rpcsvc_request_t *req);
+
+int
+glusterd_handle_cli_probe(rpcsvc_request_t *req);
+
+int
+glusterd_handle_create_volume(rpcsvc_request_t *req);
+
+int
+glusterd_handle_defrag_volume(rpcsvc_request_t *req);
+
+int
+glusterd_handle_defrag_volume_v2(rpcsvc_request_t *req);
+
+int
+glusterd_xfer_cli_probe_resp(rpcsvc_request_t *req, int32_t op_ret,
+                             int32_t op_errno, char *op_errstr, char *hostname,
+                             int port, dict_t *dict);
+
+int
+glusterd_op_commit_send_resp(rpcsvc_request_t *req, int32_t op, int32_t status,
+                             char *op_errstr, dict_t *rsp_dict);
+
+int
+glusterd_xfer_friend_remove_resp(rpcsvc_request_t *req, char *hostname,
+                                 int port);
+
+int
+glusterd_deprobe_begin(rpcsvc_request_t *req, const char *hoststr, int port,
+                       uuid_t uuid, dict_t *dict, int *op_errno);
+
+int
+glusterd_handle_cli_deprobe(rpcsvc_request_t *req);
+
+int
+glusterd_handle_incoming_unfriend_req(rpcsvc_request_t *req);
+
+int32_t
+glusterd_list_friends(rpcsvc_request_t *req, dict_t *dict, int32_t flags);
+
+int
+glusterd_handle_cli_list_friends(rpcsvc_request_t *req);
+
+int
+glusterd_handle_cli_start_volume(rpcsvc_request_t *req);
+
+int
+glusterd_handle_friend_update(rpcsvc_request_t *req);
+
+int
+glusterd_handle_cli_stop_volume(rpcsvc_request_t *req);
+
+int
+glusterd_handle_cli_delete_volume(rpcsvc_request_t *req);
+
+int
+glusterd_handle_cli_get_volume(rpcsvc_request_t *req);
+
+int32_t
+glusterd_get_volumes(rpcsvc_request_t *req, dict_t *dict, int32_t flags);
+
+int
+glusterd_handle_add_brick(rpcsvc_request_t *req);
+
+int
+glusterd_handle_tier(rpcsvc_request_t *req);
+
+int
+glusterd_handle_attach_tier(rpcsvc_request_t *req);
+
+int
+glusterd_handle_detach_tier(rpcsvc_request_t *req);
+
+int
+glusterd_handle_add_tier_brick(rpcsvc_request_t *req);
+
+int
+glusterd_handle_replace_brick(rpcsvc_request_t *req);
+
+int
+glusterd_handle_remove_brick(rpcsvc_request_t *req);
+
+int
+glusterd_handle_log_rotate(rpcsvc_request_t *req);
+
+int
+glusterd_handle_sync_volume(rpcsvc_request_t *req);
+
+int
+glusterd_defrag_start_validate(glusterd_volinfo_t *volinfo, char *op_errstr,
+                               size_t len, glusterd_op_t op);
+
+int
+glusterd_rebalance_cmd_validate(int cmd, char *volname,
+                                glusterd_volinfo_t **volinfo, char *op_errstr,
+                                size_t len);
+
+int32_t
+glusterd_log_filename(rpcsvc_request_t *req, dict_t *dict);
+
+int32_t
+glusterd_log_rotate(rpcsvc_request_t *req, dict_t *dict);
+
+int32_t
+glusterd_remove_brick(rpcsvc_request_t *req, dict_t *dict);
+
+int32_t
+glusterd_set_volume(rpcsvc_request_t *req, dict_t *dict);
+
+int32_t
+glusterd_reset_volume(rpcsvc_request_t *req, dict_t *dict);
+
+int32_t
+glusterd_gsync_set(rpcsvc_request_t *req, dict_t *dict);
+
+int32_t
+glusterd_quota(rpcsvc_request_t *req, dict_t *dict);
+
+int
+glusterd_handle_set_volume(rpcsvc_request_t *req);
+
+int
+glusterd_handle_reset_volume(rpcsvc_request_t *req);
+
+int
+glusterd_handle_copy_file(rpcsvc_request_t *req);
+
+int
+glusterd_handle_sys_exec(rpcsvc_request_t *req);
+
+int
+glusterd_handle_gsync_set(rpcsvc_request_t *req);
+
+int
+glusterd_handle_quota(rpcsvc_request_t *req);
+
+int
+glusterd_handle_bitrot(rpcsvc_request_t *req);
+
+int
+glusterd_handle_fsm_log(rpcsvc_request_t *req);
+
+int
+glusterd_handle_reset_brick(rpcsvc_request_t *req);
+
+int
+glusterd_xfer_cli_deprobe_resp(rpcsvc_request_t *req, int32_t op_ret,
+                               int32_t op_errno, char *op_errstr,
+                               char *hostname, dict_t *dict);
+
+int
+glusterd_client_statedump_submit_req(char *volname, char *target_ip, char *pid);
+
+int
+glusterd_fetchspec_notify(xlator_t *this);
+
+int
+glusterd_fetchsnap_notify(xlator_t *this);
+
+int
+glusterd_add_volume_detail_to_dict(glusterd_volinfo_t *volinfo, dict_t *volumes,
+                                   int count);
+
+int
+glusterd_restart_bricks(void *opaque);
+
+int32_t
+glusterd_volume_txn(rpcsvc_request_t *req, char *volname, int flags,
+                    glusterd_op_t op);
+
+int
+glusterd_peer_dump_version(xlator_t *this, struct rpc_clnt *rpc,
+                           glusterd_peerctx_t *peerctx);
+
+int
+glusterd_validate_reconfopts(glusterd_volinfo_t *volinfo, dict_t *val_dict,
+                             char **op_errstr);
+int
+glusterd_handle_cli_profile_volume(rpcsvc_request_t *req);
+
+int
+glusterd_handle_getwd(rpcsvc_request_t *req);
+
+int32_t /* NOTE(review): repeats the glusterd_set_volume declaration above */
+glusterd_set_volume(rpcsvc_request_t *req, dict_t *dict);
+int
+glusterd_peer_rpc_notify(struct rpc_clnt *rpc, void *mydata,
+                         rpc_clnt_event_t event, void *data);
+int
+glusterd_brick_rpc_notify(struct rpc_clnt *rpc, void *mydata,
+                          rpc_clnt_event_t event, void *data);
+
+int
+glusterd_rpc_create(struct rpc_clnt **rpc, dict_t *options,
+                    rpc_clnt_notify_t notify_fn, void *notify_data,
+                    gf_boolean_t force);
+
+/* handler functions */
+int32_t
+glusterd_op_begin(rpcsvc_request_t *req, glusterd_op_t op, void *ctx,
+                  char *err_str, size_t size);
+
+/* Other declarations were removed here; they are declared elsewhere in this file. */
+
+int
+glusterd_handle_cli_statedump_volume(rpcsvc_request_t *req);
+int
+glusterd_handle_cli_clearlocks_volume(rpcsvc_request_t *req);
+
+int
+glusterd_handle_defrag_start(glusterd_volinfo_t *volinfo, char *op_errstr,
+                             size_t len, int cmd, defrag_cbk_fn_t cbk,
+                             glusterd_op_t op);
+int
+glusterd_rebalance_rpc_create(glusterd_volinfo_t *volinfo);
+
+int
+glusterd_rebalance_defrag_init(glusterd_volinfo_t *volinfo,
+                               defrag_cbk_fn_t cbk);
+
+int
+glusterd_handle_cli_heal_volume(rpcsvc_request_t *req);
+
+int
+glusterd_handle_cli_list_volume(rpcsvc_request_t *req);
+
+int
+glusterd_handle_snapshot(rpcsvc_request_t *req);
+
+/* op-sm functions */
+int
+glusterd_op_stage_heal_volume(dict_t *dict, char **op_errstr);
+int
+glusterd_op_heal_volume(dict_t *dict, char **op_errstr);
+int
+glusterd_op_stage_gsync_set(dict_t *dict, char **op_errstr);
+int
+glusterd_op_gsync_set(dict_t *dict, char **op_errstr, dict_t *rsp_dict);
+int
+glusterd_op_stage_copy_file(dict_t *dict, char **op_errstr);
+int
+glusterd_op_copy_file(dict_t *dict, char **op_errstr);
+int
+glusterd_op_stage_sys_exec(dict_t *dict, char **op_errstr);
+int
+glusterd_op_sys_exec(dict_t *dict, char **op_errstr, dict_t *rsp_dict);
+int
+glusterd_op_stage_gsync_create(dict_t *dict, char **op_errstr);
+int
+glusterd_op_gsync_create(dict_t *dict, char **op_errstr, dict_t *rsp_dict);
+int
+glusterd_op_quota(dict_t *dict, char **op_errstr, dict_t *rsp_dict);
+
+int
+glusterd_op_bitrot(dict_t *dict, char **op_errstr, dict_t *rsp_dict);
+
+int
+glusterd_op_stage_quota(dict_t *dict, char **op_errstr, dict_t *rsp_dict);
+
+int
+glusterd_op_stage_bitrot(dict_t *dict, char **op_errstr, dict_t *rsp_dict);
+
+int
+glusterd_op_stage_replace_brick(dict_t *dict, char **op_errstr,
+                                dict_t *rsp_dict);
+int
+glusterd_op_replace_brick(dict_t *dict, dict_t *rsp_dict);
+int
+glusterd_op_log_rotate(dict_t *dict);
+int
+glusterd_op_stage_log_rotate(dict_t *dict, char **op_errstr);
+int
+glusterd_op_stage_create_volume(dict_t *dict, char **op_errstr,
+                                dict_t *rsp_dict);
+int
+glusterd_op_stage_start_volume(dict_t *dict, char **op_errstr,
+                               dict_t *rsp_dict);
+int
+glusterd_op_stage_stop_volume(dict_t *dict, char **op_errstr);
+int
+glusterd_op_stage_delete_volume(dict_t *dict, char **op_errstr);
+int
+glusterd_op_create_volume(dict_t *dict, char **op_errstr);
+int
+glusterd_op_start_volume(dict_t *dict, char **op_errstr);
+int
+glusterd_op_stop_volume(dict_t *dict);
+int
+glusterd_op_delete_volume(dict_t *dict);
+int
+glusterd_handle_ganesha_op(dict_t *dict, char **op_errstr, char *key,
+                           char *value);
+int
+glusterd_check_ganesha_cmd(char *key, char *value, char **errstr, dict_t *dict);
+int
+glusterd_op_stage_set_ganesha(dict_t *dict, char **op_errstr);
+int
+glusterd_op_set_ganesha(dict_t *dict, char **errstr);
+int
+ganesha_manage_export(dict_t *dict, char *value,
+                      gf_boolean_t update_cache_invalidation, char **op_errstr);
+int
+gd_ganesha_send_dbus(char *volname, char *value);
+gf_boolean_t
+glusterd_is_ganesha_cluster();
+gf_boolean_t
+glusterd_check_ganesha_export(glusterd_volinfo_t *volinfo);
+int
+stop_ganesha(char **op_errstr);
+int
+tear_down_cluster(gf_boolean_t run_teardown);
+int
+manage_export_config(char *volname, char *value, char **op_errstr);
+
+int
+glusterd_op_add_brick(dict_t *dict, char **op_errstr);
+int
+glusterd_op_remove_brick(dict_t *dict, char **op_errstr);
+int
+glusterd_op_stage_add_brick(dict_t *dict, char **op_errstr, dict_t *rsp_dict);
+int
+glusterd_op_stage_remove_brick(dict_t *dict, char **op_errstr);
+
+int
+glusterd_set_rebalance_id_for_remove_brick(dict_t *req_dict, dict_t *rsp_dict);
+
+int
+glusterd_set_rebalance_id_in_rsp_dict(dict_t *req_dict, dict_t *rsp_dict);
+
+int
+glusterd_mgmt_v3_op_stage_rebalance(dict_t *dict, char **op_errstr);
+
+int
+glusterd_mgmt_v3_op_rebalance(dict_t *dict, char **op_errstr, dict_t *rsp_dict);
+
+int
+glusterd_op_stage_rebalance(dict_t *dict, char **op_errstr);
+int
+glusterd_op_rebalance(dict_t *dict, char **op_errstr, dict_t *rsp_dict);
+
+int
+glusterd_op_stage_statedump_volume(dict_t *dict, char **op_errstr);
+int
+glusterd_op_statedump_volume(dict_t *dict, char **op_errstr);
+
+int
+glusterd_op_stage_clearlocks_volume(dict_t *dict, char **op_errstr);
+int
+glusterd_op_clearlocks_volume(dict_t *dict, char **op_errstr, dict_t *rsp_dict);
+
+int
+glusterd_op_stage_barrier(dict_t *dict, char **op_errstr);
+int
+glusterd_op_barrier(dict_t *dict, char **op_errstr);
+
+/* misc */
+int
+glusterd_op_perform_remove_brick(glusterd_volinfo_t *volinfo, char *brick,
+                                 int force, int *need_migrate);
+int
+glusterd_op_stop_volume_args_get(dict_t *dict, char **volname, int *flags);
+int
+glusterd_op_statedump_volume_args_get(dict_t *dict, char **volname,
+                                      char **options, int *option_cnt);
+
+int
+glusterd_op_gsync_args_get(dict_t *dict, char **op_errstr, char **master,
+                           char **slave, char **host_uuid);
+
+int
+glusterd_op_get_max_opversion(char **op_errstr, dict_t *rsp_dict);
+
+int
+glusterd_start_volume(glusterd_volinfo_t *volinfo, int flags,
+                      gf_boolean_t wait);
+
+int
+glusterd_stop_volume(glusterd_volinfo_t *volinfo);
+
+/* Synctask part */
+int32_t
+glusterd_op_begin_synctask(rpcsvc_request_t *req, glusterd_op_t op, void *dict);
+int32_t
+glusterd_defrag_event_notify_handle(dict_t *dict);
+
+int32_t
+glusterd_txn_opinfo_dict_init();
+
+void
+glusterd_txn_opinfo_dict_fini();
+
+void
+glusterd_txn_opinfo_init();
+
+/* snapshot */
+glusterd_snap_t *
+glusterd_new_snap_object();
+
+int32_t
+glusterd_list_add_snapvol(glusterd_volinfo_t *origin_vol,
+                          glusterd_volinfo_t *snap_vol);
+
+glusterd_snap_t *
+glusterd_remove_snap_by_id(uuid_t snap_id);
+
+glusterd_snap_t *
+glusterd_remove_snap_by_name(char *snap_name);
+
+glusterd_snap_t *
+glusterd_find_snap_by_name(char *snap_name);
+
+glusterd_snap_t *
+glusterd_find_snap_by_id(uuid_t snap_id);
+
+int
+glusterd_snapshot_prevalidate(dict_t *dict, char **op_errstr, dict_t *rsp_dict,
+                              uint32_t *op_errno);
+int
+glusterd_snapshot_brickop(dict_t *dict, char **op_errstr, dict_t *rsp_dict);
+int
+glusterd_snapshot(dict_t *dict, char **op_errstr, uint32_t *op_errno,
+                  dict_t *rsp_dict);
+int
+glusterd_snapshot_postvalidate(dict_t *dict, int32_t op_ret, char **op_errstr,
+                               dict_t *rsp_dict);
+char *
+glusterd_build_snap_device_path(char *device, char *snapname,
+                                int32_t brick_count);
+
+int32_t
+glusterd_snap_remove(dict_t *rsp_dict, glusterd_snap_t *snap,
+                     gf_boolean_t remove_lvm, gf_boolean_t force,
+                     gf_boolean_t is_clone);
+int32_t
+glusterd_snapshot_cleanup(dict_t *dict, char **op_errstr, dict_t *rsp_dict);
+
+int32_t
+glusterd_add_missed_snaps_to_list(dict_t *dict, int32_t missed_snap_count);
+
+int32_t
+glusterd_add_new_entry_to_list(char *missed_info, char *snap_vol_id,
+                               int32_t brick_num, char *brick_path,
+                               int32_t snap_op, int32_t snap_status);
+
+int
+glusterd_snapshot_revert_restore_from_snap(glusterd_snap_t *snap);
+
+int
+glusterd_add_brick_status_to_dict(dict_t *dict, glusterd_volinfo_t *volinfo,
+                                  glusterd_brickinfo_t *brickinfo,
+                                  char *key_prefix);
+
+int32_t
+glusterd_handle_snap_limit(dict_t *dict, dict_t *rsp_dict);
+
+gf_boolean_t
+glusterd_should_i_stop_bitd();
+
+int
+glusterd_remove_brick_migrate_cbk(glusterd_volinfo_t *volinfo,
+                                  gf_defrag_status_t status);
+int
+__glusterd_handle_reset_brick(rpcsvc_request_t *req);
+
+int
+glusterd_options_init(xlator_t *this);
+
+int32_t
+glusterd_recreate_volfiles(glusterd_conf_t *conf);
+
+void
+glusterd_add_peers_to_auth_list(char *volname);
+
+int
+glusterd_replace_old_auth_allow_list(char *volname);
+
+#endif
diff --git a/xlators/mount/fuse/src/Makefile.am b/xlators/mount/fuse/src/Makefile.am
index 9d8d45e4f02..7018cad37f6 100644
--- a/xlators/mount/fuse/src/Makefile.am
+++ b/xlators/mount/fuse/src/Makefile.am
@@ -1,14 +1,39 @@
+noinst_HEADERS_linux = $(CONTRIBDIR)/fuse-include/fuse_kernel.h\
+	$(CONTRIBDIR)/fuse-include/mount_util.h\
+	$(CONTRIBDIR)/fuse-lib/mount-gluster-compat.h
+noinst_HEADERS_darwin = $(CONTRIBDIR)/fuse-include/fuse_kernel_macfuse.h\
+	$(CONTRIBDIR)/macfuse/fuse_param.h\
+	$(CONTRIBDIR)/macfuse/fuse_ioctl.h
+noinst_HEADERS_common = $(CONTRIBDIR)/fuse-include/fuse-mount.h\
+	$(CONTRIBDIR)/fuse-include/fuse-misc.h fuse-mem-types.h \
+	fuse-bridge.h
 
-noinst_HEADERS = fuse-extra.h
+if GF_DARWIN_HOST_OS
+    noinst_HEADERS = $(noinst_HEADERS_common) $(noinst_HEADERS_darwin)
+else
+    noinst_HEADERS = $(noinst_HEADERS_common) $(noinst_HEADERS_linux)
+endif
 
 xlator_LTLIBRARIES = fuse.la
 xlatordir = $(libdir)/glusterfs/$(PACKAGE_VERSION)/xlator/mount
-fuse_la_SOURCES = fuse-bridge.c fuse-extra.c
-fuse_la_LDFLAGS = -module -avoidversion -shared -nostartfiles $(GF_FUSE_LDADD) 
 
-AM_CFLAGS = -fPIC -D_FILE_OFFSET_BITS=64 -D_GNU_SOURCE -D$(GF_HOST_OS) -Wall \
-	-I$(top_srcdir)/libglusterfs/src $(GF_CFLAGS) -DFUSE_USE_VERSION=26
+if GF_DARWIN_HOST_OS
+    mount_source=$(CONTRIBDIR)/macfuse/mount_darwin.c
+else
+    mount_source=$(CONTRIBDIR)/fuse-lib/mount.c $(CONTRIBDIR)/fuse-lib/mount-common.c
+endif
 
+fuse_la_SOURCES = fuse-helpers.c fuse-resolve.c fuse-bridge.c \
+	$(CONTRIBDIR)/fuse-lib/misc.c $(mount_source)
 
-CLEANFILES = 
+fuse_la_LDFLAGS = -module $(GF_XLATOR_DEFAULT_LDFLAGS)
+fuse_la_LIBADD = $(top_builddir)/libglusterfs/src/libglusterfs.la $(GF_LDADD) @GF_FUSE_LDADD@
 
+AM_CPPFLAGS = $(GF_CPPFLAGS) -I$(top_srcdir)/libglusterfs/src \
+	-I$(top_srcdir)/rpc/xdr/src -I$(top_builddir)/rpc/xdr/src \
+	-I$(CONTRIBDIR)/fuse-include \
+	-I$(CONTRIBDIR)/fuse-lib $(GF_FUSE_CFLAGS)
+
+AM_CFLAGS = -Wall $(GF_CFLAGS)
+
+CLEANFILES =
diff --git a/xlators/mount/fuse/src/fuse-bridge.c b/xlators/mount/fuse/src/fuse-bridge.c
index 6cdba9ea6d1..0e22fe411ee 100644
--- a/xlators/mount/fuse/src/fuse-bridge.c
+++ b/xlators/mount/fuse/src/fuse-bridge.c
@@ -1,2954 +1,7248 @@
 /*
-   Copyright (c) 2006-2009 Z RESEARCH, Inc. <http://www.zresearch.com>
-   This file is part of GlusterFS.
-
-   GlusterFS is free software; you can redistribute it and/or modify
-   it under the terms of the GNU General Public License as published
-   by the Free Software Foundation; either version 3 of the License,
-   or (at your option) any later version.
-
-   GlusterFS is distributed in the hope that it will be useful, but
-   WITHOUT ANY WARRANTY; without even the implied warranty of
-   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
-   General Public License for more details.
-
-   You should have received a copy of the GNU General Public License
-   along with this program.  If not, see
-   <http://www.gnu.org/licenses/>.
+  Copyright (c) 2006-2012 Red Hat, Inc. <http://www.redhat.com>
+  This file is part of GlusterFS.
+
+  This file is licensed to you under your choice of the GNU Lesser
+  General Public License, version 3 or any later version (LGPLv3 or
+  later), or the GNU General Public License, version 2 (GPLv2), in all
+  cases as published by the Free Software Foundation.
 */
 
+#include <config.h>
+
+#include <sys/wait.h>
+#include "fuse-bridge.h"
+#include <glusterfs/glusterfs.h>
+#include <glusterfs/byte-order.h>
+#include <glusterfs/compat-errno.h>
+#include <glusterfs/glusterfs-acl.h>
+#include <glusterfs/syscall.h>
+#include <glusterfs/timespec.h>
+#include <glusterfs/async.h>
+
+#ifdef __NetBSD__
+#undef open /* in perfuse.h, pulled from mount-gluster-compat.h */
+#endif
+typedef struct _fuse_async { /* Bundles an iobuf, a FUSE request header, a message pointer and a gf_async_t handle. */
+    struct iobuf *iobuf; /* presumably the buffer that backs finh/msg -- TODO confirm */
+    fuse_in_header_t *finh; /* FUSE request header */
+    void *msg; /* opaque message pointer; presumably the payload that follows finh -- TODO confirm */
+    gf_async_t async; /* embedded async handle (type from <glusterfs/async.h>) */
+} fuse_async_t;
+
+static int gf_fuse_xattr_enotsup_log;
+
+void
+fini(xlator_t *this_xl);
+
+static int32_t
+fuse_invalidate_inode(xlator_t *this, uint64_t fuse_ino);
+
 /*
- * TODO:
- * Need to free_state() when fuse_reply_err() + return.
- * Check loc->path for "" after fuse_loc_fill in all fops
- * (now being done in getattr, lookup) or better - make 
- * fuse_loc_fill() and inode_path() return success/failure.
+ * Send an invalidate notification up to fuse to purge the file from local
+ * page cache.
  */
 
-#include <stdint.h>
-#include <signal.h>
-#include <pthread.h>
-
-#ifndef _CONFIG_H
-#define _CONFIG_H
-#include "config.h"
-#endif /* _CONFIG_H */
-
-#include "glusterfs.h"
-#include "logging.h"
-#include "xlator.h"
-#include "glusterfs.h"
-#include "defaults.h"
-#include "common-utils.h"
-
-#include <fuse/fuse_lowlevel.h>
-
-#include "fuse-extra.h"
-#include "list.h"
-#include "dict.h"
-
-#include "compat.h"
-#include "compat-errno.h"
-
-/* TODO: when supporting posix acl, remove this definition */
-#define DISABLE_POSIX_ACL
-
-#define ZR_MOUNTPOINT_OPT       "mountpoint"
-#define ZR_DIRECT_IO_OPT        "direct-io-mode"
-#define ZR_STRICT_VOLFILE_CHECK "strict-volfile-check"
-
-#define BIG_FUSE_CHANNEL_SIZE 1048576
-
-struct fuse_private {
-        int                  fd;
-        struct fuse         *fuse;
-        struct fuse_session *se;
-        struct fuse_chan    *ch;
-        char                *volfile;
-        size_t               volfile_size;
-        char                *mount_point;
-        data_t              *buf;
-        pthread_t            fuse_thread;
-        char                 fuse_thread_started;
-        uint32_t             direct_io_mode;
-        double               entry_timeout;
-        double               attribute_timeout;
-        pthread_cond_t       first_call_cond;
-        pthread_mutex_t      first_call_mutex;
-        char                 first_call;
-        gf_boolean_t         strict_volfile_check;
-};
-typedef struct fuse_private fuse_private_t;
-
-#define _FI_TO_FD(fi) ((fd_t *)((long)fi->fh))
-
-#define FI_TO_FD(fi) ((_FI_TO_FD (fi))?(fd_ref (_FI_TO_FD(fi))):((fd_t *) 0))
-
-#define FUSE_FOP(state, ret, op_num, fop, args ...)                     \
-        do {                                                            \
-                call_frame_t *frame = get_call_frame_for_req (state, 1); \
-                xlator_t *xl = frame->this->children ?                  \
-                        frame->this->children->xlator : NULL;           \
-                dict_t *refs = frame->root->req_refs;                   \
-                frame->root->state = state;                             \
-                frame->root->op   = op_num;				\
-                STACK_WIND (frame, ret, xl, xl->fops->fop, args);       \
-                dict_unref (refs);                                      \
-        } while (0)
-
-
-typedef struct {
-        void          *pool;
-        xlator_t      *this;
-        inode_table_t *itable;
-        loc_t          loc;
-        loc_t          loc2;
-        fuse_req_t     req;
-        int32_t        flags;
-        off_t          off;
-        size_t         size;
-        unsigned long  nlookup;
-        fd_t          *fd;
-        dict_t        *dict;
-        char          *name;
-        char           is_revalidate;
-} fuse_state_t;
-
-int fuse_chan_receive (struct fuse_chan *ch,
-                       char *buf,
-                       int32_t size);
+static int32_t
+fuse_invalidate(xlator_t *this, inode_t *inode)
+{
+    fuse_private_t *priv = this->private;
+    uint64_t nodeid;
+
+    /*
+     * NOTE: We only invalidate at the moment if fopen_keep_cache is
+     * enabled because otherwise this is a departure from default
+     * behavior. Specifically, the performance/write-behind xlator
+     * causes unconditional invalidations on write requests.
+     */
+    if (!priv->fopen_keep_cache)
+        return 0;
 
+    nodeid = inode_to_fuse_nodeid(inode);
+    gf_log(this->name, GF_LOG_DEBUG, "Invalidate inode id %" GF_PRI_INODE ".",
+           nodeid);
+    fuse_log_eh(this, "Sending invalidate inode id: %" GF_PRI_INODE " gfid: %s",
+                nodeid, uuid_utoa(inode->gfid));
+    fuse_invalidate_inode(this, nodeid);
 
-static void
-free_state (fuse_state_t *state)
+    return 0;
+}
+
+static int32_t
+fuse_forget_cbk(xlator_t *this, inode_t *inode)
 {
-        loc_wipe (&state->loc);
+    // Nothing to free in inode ctx, hence return.
+    return 0;
+}
 
-        loc_wipe (&state->loc2);
+fuse_fd_ctx_t *
+__fuse_fd_ctx_check_n_create(xlator_t *this, fd_t *fd)
+{
+    uint64_t val = 0;
+    int32_t ret = 0;
+    fuse_fd_ctx_t *fd_ctx = NULL;
 
-        if (state->dict) {
-                dict_unref (state->dict);
-                state->dict = (void *)0xaaaaeeee;
-        }
-        if (state->name) {
-                FREE (state->name);
-                state->name = NULL;
+    ret = __fd_ctx_get(fd, this, &val);
+
+    fd_ctx = (fuse_fd_ctx_t *)(unsigned long)val;
+
+    if (fd_ctx == NULL) {
+        fd_ctx = GF_CALLOC(1, sizeof(*fd_ctx), gf_fuse_mt_fd_ctx_t);
+        if (!fd_ctx) {
+            goto out;
         }
-        if (state->fd) {
-                fd_unref (state->fd);
-                state->fd = (void *)0xfdfdfdfd;
+        ret = __fd_ctx_set(fd, this, (uint64_t)(unsigned long)fd_ctx);
+        if (ret < 0) {
+            gf_log("glusterfs-fuse", GF_LOG_DEBUG, "fd-ctx-set failed");
+            GF_FREE(fd_ctx);
+            fd_ctx = NULL;
         }
-#ifdef DEBUG
-        memset (state, 0x90, sizeof (*state));
-#endif
-        FREE (state);
-        state = NULL;
+    }
+out:
+    return fd_ctx;
 }
 
-
-fuse_state_t *
-state_from_req (fuse_req_t req)
+fuse_fd_ctx_t *
+fuse_fd_ctx_check_n_create(xlator_t *this, fd_t *fd)
 {
-        fuse_state_t *state;
-        xlator_t *this = NULL;
+    fuse_fd_ctx_t *fd_ctx = NULL;
 
-        this = fuse_req_userdata (req);
+    if ((fd == NULL) || (this == NULL)) {
+        goto out;
+    }
 
-        state = (void *)calloc (1, sizeof (*state));
-        ERR_ABORT (state);
-        state->pool = this->ctx->pool;
-        state->itable = this->itable;
-        state->req = req;
-        state->this = this;
+    LOCK(&fd->lock);
+    {
+        fd_ctx = __fuse_fd_ctx_check_n_create(this, fd);
+    }
+    UNLOCK(&fd->lock);
 
-        return state;
+out:
+    return fd_ctx;
 }
 
-static pid_t
-get_pid_from_req (fuse_req_t req)
+static void
+fuse_fd_ctx_destroy(xlator_t *this, fd_t *fd)
 {
-        const struct fuse_ctx *ctx = NULL;
-        ctx = fuse_req_ctx(req);
-        return ctx->pid;
+    fd_t *activefd = NULL;
+    uint64_t val = 0;
+    int ret = 0;
+    fuse_fd_ctx_t *fdctx = NULL;
+
+    ret = fd_ctx_del(fd, this, &val);
+    if (!ret) {
+        fdctx = (fuse_fd_ctx_t *)(unsigned long)val;
+        if (fdctx) {
+            activefd = fdctx->activefd;
+            if (activefd) {
+                fd_unref(activefd);
+            }
+
+            GF_FREE(fdctx);
+        }
+    }
 }
 
-static call_frame_t *
-get_call_frame_for_req (fuse_state_t *state, char d)
+fuse_fd_ctx_t *
+fuse_fd_ctx_get(xlator_t *this, fd_t *fd)
 {
-        call_pool_t *pool = state->pool;
-        fuse_req_t req = state->req;
-        const struct fuse_ctx *ctx = NULL;
-	call_frame_t *frame = NULL;
-        xlator_t *this = NULL;
-        fuse_private_t *priv = NULL;
-
-
-	if (req) {
-		this = fuse_req_userdata (req);
-	} else {
-		this = state->this;
-	}
-	priv = this->private;
-
-	frame = create_frame (this, pool);
-
-        if (req) {
-                ctx = fuse_req_ctx(req);
-
-                frame->root->uid    = ctx->uid;
-                frame->root->gid    = ctx->gid;
-                frame->root->pid    = ctx->pid;
-                frame->root->unique = req_callid (req);
-        }
-
-        if (d) {
-                frame->root->req_refs = dict_ref (get_new_dict ());
-                dict_set (frame->root->req_refs, NULL, priv->buf);
-        }
-
-        frame->root->type = GF_OP_TYPE_FOP_REQUEST;
-
-        return frame;
-}
-
-
-GF_MUST_CHECK static int32_t
-fuse_loc_fill (loc_t *loc,
-               fuse_state_t *state,
-               ino_t ino,
-               ino_t par,
-               const char *name)
-{
-        inode_t *inode = NULL, *parent = NULL;
-	int32_t ret = -1;
-	char *path = NULL;
-
-        /* resistance against multiple invocation of loc_fill not to get
-           reference leaks via inode_search() */
-
-        inode = loc->inode;
-	
-        if (!inode) {
-                if (ino)
-                        inode = inode_search (state->itable, ino, NULL);
-                if (par && name)
-                        inode = inode_search (state->itable, par, name);
-
-                loc->inode = inode;
-                if (inode)
-                        loc->ino = inode->ino;
-        }
-
-        parent = loc->parent;
-        if (!parent) {
-                if (inode)
-                        parent = inode_parent (inode, par, name);
-                else
-                        parent = inode_search (state->itable, par, NULL);
-                loc->parent = parent;
-        }
-  
-        if (name && parent) {
-		ret = inode_path (parent, name, &path);
-		if (ret <= 0) {
-			gf_log ("glusterfs-fuse", GF_LOG_ERROR,
-				"inode_path failed for %"PRId64"/%s",
-				parent->ino, name);
-			goto fail;
-		} else {
-			loc->path = path;
-		}
-        } else 	if (inode) {
-		ret = inode_path (inode, NULL, &path);
-		if (ret <= 0) {
-			gf_log ("glusterfs-fuse", GF_LOG_ERROR,
-				"inode_path failed for %"PRId64,
-				inode->ino);
-			goto fail;
-		} else {
-			loc->path = path;
-		}
-	}
-	if (loc->path) {
-		loc->name = strrchr (loc->path, '/');
-		if (loc->name)
-			loc->name++;
-		else loc->name = "";
-	}
-	
-	if ((ino != 1) &&
-	    (parent == NULL)) {
-		gf_log ("fuse-bridge", GF_LOG_ERROR,
-			"failed to search parent for %"PRId64"/%s (%"PRId64")",
-			(ino_t)par, name, (ino_t)ino);
-		ret = -1;
-		goto fail;
-	}
-	ret = 0;
-fail:
-	return ret;
+    fuse_fd_ctx_t *fdctx = NULL;
+    uint64_t value = 0;
+    int ret = 0;
+
+    ret = fd_ctx_get(fd, this, &value);
+    if (ret < 0) {
+        goto out;
+    }
+
+    fdctx = (fuse_fd_ctx_t *)(unsigned long)value;
+
+out:
+    return fdctx;
 }
 
+struct fusedump_timespec { /* Timestamp record placed in iovs[2] by fusedump_setup_meta(). */
+    uint32_t len; /* set to sizeof(struct fusedump_timespec) by fusedump_setup_meta() */
+    uint64_t sec; /* tv_sec as stored by fusedump_gettime() */
+    uint32_t nsec; /* tv_nsec as stored by fusedump_gettime() */
+} __attribute__((packed)); /* packed: written out as-is, so no padding between fields */
+
+struct fusedump_signature { /* Signature record placed in iovs[3] by fusedump_setup_meta(). */
+    uint32_t len; /* set to sizeof(struct fusedump_signature) by fusedump_setup_meta() */
+    char sig[8]; /* filled with 'G','L','U','S','T','E','R',0xF5 by fusedump_setup_meta(); not NUL-terminated */
+} __attribute__((packed));
+
+static void
+fusedump_gettime(struct fusedump_timespec *fts)
+{ /* Stamp fts->sec and fts->nsec from timespec_now_realtime(); fts->len is not touched here. */
+    struct timespec ts = {
+        0,
+    };
+
+    timespec_now_realtime(&ts);
+
+    fts->sec = ts.tv_sec;
+    fts->nsec = ts.tv_nsec;
+}
+
+static void
+fusedump_setup_meta(struct iovec *iovs, char *dir,
+                    uint32_t *fusedump_item_count,
+                    struct fusedump_timespec *fts,
+                    struct fusedump_signature *fsig)
+{ /* Fill *fts and *fsig, then point iovs[0..3] at the four header parts; iovs needs at least 4 slots. */
+    char glustersig[8] = {'G', 'L', 'U', 'S', 'T', 'E', 'R', 0xF5}; /* copied verbatim into fsig->sig below */
+
+    *fusedump_item_count = 3; /* NOTE(review): what the 3 counts is not visible here -- confirm against the dump format */
+    fts->len = sizeof(*fts); /* each len field holds the size of its own struct */
+    fusedump_gettime(fts);
+    fsig->len = sizeof(*fsig);
+    memcpy(fsig->sig, glustersig, 8);
+
+    iovs[0] = (struct iovec){dir, sizeof(*dir)}; /* a single byte at *dir; check_and_dump_fuse_W() passes 'W' */
+    iovs[1] = (struct iovec){fusedump_item_count, sizeof(*fusedump_item_count)};
+    iovs[2] = (struct iovec){fts, fts->len};
+    iovs[3] = (struct iovec){fsig, fsig->len};
+}
 
 static int
-need_fresh_lookup (int32_t op_ret, int32_t op_errno, 
-		   loc_t *loc, struct stat *buf)
-{
-        if (op_ret == -1) {
-		gf_log ("fuse-bridge",
-			(op_errno == ENOENT)? GF_LOG_DEBUG: GF_LOG_WARNING,
-			"revalidate of %s failed (%s)",
-			loc->path, strerror (op_errno));
-                return 1;
-	}
-
-        if (loc->inode->ino != buf->st_ino) {
-		gf_log ("fuse-bridge", GF_LOG_WARNING,
-			"inode num of %s changed %"PRId64" -> %"PRId64,
-			loc->path, loc->inode->ino, buf->st_ino);
-                return 1;
-        }
-
-	if ((loc->inode->st_mode & S_IFMT) ^ (buf->st_mode & S_IFMT)) {
-		gf_log ("fuse-bridge", GF_LOG_WARNING,
-			"inode mode of %s changed 0%o -> 0%o",
-			loc->path, loc->inode->st_mode, buf->st_mode);
-		return 1;
-	}
+check_and_dump_fuse_W(fuse_private_t *priv, struct iovec *iov_out, int count,
+                      ssize_t res, errnomask_t errnomask)
+{
+    char w = 'W';
+    struct iovec diov[4] = {
+        {
+            0,
+        },
+    };
+    uint32_t fusedump_item_count = 3;
+    struct fusedump_timespec fts = {
+        0,
+    };
+    struct fusedump_signature fsig = {
+        0,
+    };
+    struct fuse_out_header *fouh = NULL;
+
+    if (res == -1) {
+        const char *errdesc = NULL;
+        gf_loglevel_t loglevel = GF_LOG_ERROR;
+        gf_boolean_t errno_degraded = _gf_false;
+        gf_boolean_t errno_promoted = _gf_false;
+
+#define ACCOUNT_ERRNO(eno)                                                     \
+    do {                                                                       \
+        if (errno_degraded) {                                                  \
+            pthread_mutex_lock(&priv->fusedev_errno_cnt_mutex);                \
+            {                                                                  \
+                if (!++priv->fusedev_errno_cnt[FUSEDEV_##eno])                 \
+                    errno_promoted = _gf_true;                                 \
+            }                                                                  \
+            pthread_mutex_unlock(&priv->fusedev_errno_cnt_mutex);              \
+        }                                                                      \
+    } while (0)
+
+        /* If caller masked the errno, then it
+         * does not indicate an error at the application
+         * level, so we degrade the log severity to DEBUG.
+         */
+        if (errnomask && errno < ERRNOMASK_MAX &&
+            GET_ERRNO_MASK(errnomask, errno)) {
+            loglevel = GF_LOG_DEBUG;
+            errno_degraded = _gf_true;
+        }
+
+        switch (errno) {
+            /* The listed errnos are FUSE status indicators,
+             * not legit values according to POSIX (see write(3p)),
+             * so resolving them according to the standard
+             * POSIX interpretation would be misleading.
+             */
+            case ENOENT:
+                errdesc = "ENOENT";
+                ACCOUNT_ERRNO(ENOENT);
+                break;
+            case ENOTDIR:
+                errdesc = "ENOTDIR";
+                ACCOUNT_ERRNO(ENOTDIR);
+                break;
+            case ENODEV:
+                errdesc = "ENODEV";
+                ACCOUNT_ERRNO(ENODEV);
+                break;
+            case EPERM:
+                errdesc = "EPERM";
+                ACCOUNT_ERRNO(EPERM);
+                break;
+            case ENOMEM:
+                errdesc = "ENOMEM";
+                ACCOUNT_ERRNO(ENOMEM);
+                break;
+            case ENOTCONN:
+                errdesc = "ENOTCONN";
+                ACCOUNT_ERRNO(ENOTCONN);
+                break;
+            case ECONNREFUSED:
+                errdesc = "ECONNREFUSED";
+                ACCOUNT_ERRNO(ECONNREFUSED);
+                break;
+            case EOVERFLOW:
+                errdesc = "EOVERFLOW";
+                ACCOUNT_ERRNO(EOVERFLOW);
+                break;
+            case EBUSY:
+                errdesc = "EBUSY";
+                ACCOUNT_ERRNO(EBUSY);
+                break;
+            case ENOTEMPTY:
+                errdesc = "ENOTEMPTY";
+                ACCOUNT_ERRNO(ENOTEMPTY);
+                break;
+            default:
+                errdesc = strerror(errno);
+        }
 
+        gf_log_callingfn("glusterfs-fuse", loglevel,
+                         "writing to fuse device failed: %s", errdesc);
+        if (errno_promoted)
+            gf_log("glusterfs-fuse", GF_LOG_WARNING,
+                   "writing to fuse device yielded %s %d times", errdesc,
+                   UINT8_MAX + 1);
+        return errno;
+
+#undef ACCOUNT_ERRNO
+    }
+
+    fouh = iov_out[0].iov_base;
+    if (res != fouh->len) {
+        gf_log("glusterfs-fuse", GF_LOG_ERROR,
+               "inconsistent write to fuse device: "
+               "written %zd, expectd %d",
+               res, fouh->len);
+        return EINVAL;
+    }
+
+    if (priv->fuse_dump_fd == -1)
         return 0;
-}
 
+    fusedump_setup_meta(diov, &w, &fusedump_item_count, &fts, &fsig);
+
+    pthread_mutex_lock(&priv->fuse_dump_mutex);
+    res = sys_writev(priv->fuse_dump_fd, diov, sizeof(diov) / sizeof(diov[0]));
+    if (res != -1)
+        res = sys_writev(priv->fuse_dump_fd, iov_out, count);
+    pthread_mutex_unlock(&priv->fuse_dump_mutex);
 
+    if (res == -1)
+        gf_log("glusterfs-fuse", GF_LOG_ERROR,
+               "failed to dump fuse message (W): %s", strerror(errno));
+
+    /*
+     * Return value reflects check on write to /dev/fuse,
+     * so ignore issues with dumping.
+     */
+
+    return 0;
+}
+
+/*
+ * iov_out should contain a fuse_out_header at zeroth position.
+ * The error value of this header is sent to kernel.
+ */
 static int
-fuse_lookup_cbk (call_frame_t *frame,
-                 void *cookie,
-                 xlator_t *this,
-                 int32_t op_ret,
-                 int32_t op_errno,
-                 inode_t *inode,
-                 struct stat *stat,
-                 dict_t *dict);
+send_fuse_iov(xlator_t *this, fuse_in_header_t *finh, struct iovec *iov_out,
+              int count)
+{
+    fuse_private_t *priv = NULL;
+    struct fuse_out_header *fouh = NULL;
+    int res, i;
+
+    if (!this || !finh || !iov_out) {
+        gf_log("send_fuse_iov", GF_LOG_ERROR, "Invalid arguments");
+        return EINVAL;
+    }
+    priv = this->private;
+
+    fouh = iov_out[0].iov_base;
+    iov_out[0].iov_len = sizeof(*fouh);
+    fouh->len = 0;
+    for (i = 0; i < count; i++)
+        fouh->len += iov_out[i].iov_len;
+    fouh->unique = finh->unique;
+
+    res = sys_writev(priv->fd, iov_out, count);
+    gf_log("glusterfs-fuse", GF_LOG_TRACE, "writev() result %d/%d %s", res,
+           fouh->len, res == -1 ? strerror(errno) : "");
+
+    return check_and_dump_fuse_W(priv, iov_out, count, res, NULL);
+}
 
 static int
-fuse_entry_cbk (call_frame_t *frame,
-                void *cookie,
-                xlator_t *this,
-                int32_t op_ret,
-                int32_t op_errno,
-                inode_t *inode,
-                struct stat *buf)
+send_fuse_data(xlator_t *this, fuse_in_header_t *finh, void *data, size_t size)
+{
+    struct fuse_out_header fouh = {
+        0,
+    };
+    struct iovec iov_out[2];
+    int ret = 0;
+
+    fouh.error = 0;
+    iov_out[0].iov_base = &fouh;
+    iov_out[1].iov_base = data;
+    iov_out[1].iov_len = size;
+
+    ret = send_fuse_iov(this, finh, iov_out, 2);
+    if (ret != 0)
+        gf_log("glusterfs-fuse", GF_LOG_ERROR,
+               "send_fuse_iov() "
+               "failed: %s",
+               strerror(ret));
+
+    return ret;
+}
+
+#define send_fuse_obj(this, finh, obj)                                         \
+    send_fuse_data(this, finh, obj, sizeof(*(obj))) /* obj must be a pointer; the pointed-to object is sent whole */
+
+static int32_t
+fuse_invalidate_entry(xlator_t *this, uint64_t fuse_ino)
 {
-        fuse_state_t *state;
-        fuse_req_t req;
-        struct fuse_entry_param e = {0, };
-        fuse_private_t *priv = this->private;
+#if (FUSE_KERNEL_MINOR_VERSION >= 11 && defined(HAVE_FUSE_NOTIFICATIONS))
+    struct fuse_out_header *fouh = NULL;
+    struct fuse_notify_inval_entry_out *fnieo = NULL;
+    fuse_private_t *priv = NULL;
+    dentry_t *dentry = NULL;
+    dentry_t *tmp = NULL;
+    inode_t *inode = NULL;
+    size_t nlen = 0;
+    fuse_invalidate_node_t *node = NULL;
+    char gfid_str[UUID_CANONICAL_FORM_LEN + 1];
+
+    priv = this->private;
+    if (!priv->reverse_fuse_thread_started)
+        return -1;
+
+    if (priv->invalidate_limit &&
+        (priv->invalidate_count >= priv->invalidate_limit)) {
+        return -1;
+    }
 
-        state = frame->root->state;
-        req = state->req;
+    inode = (inode_t *)(unsigned long)fuse_ino;
+    if (inode == NULL)
+        return -1;
 
-        if (!op_ret && state->loc.ino == 1) {
-                buf->st_ino = 1;
+    list_for_each_entry_safe(dentry, tmp, &inode->dentry_list, inode_list)
+    {
+        node = GF_CALLOC(1, sizeof(*node), gf_fuse_mt_invalidate_node_t);
+        if (node == NULL)
+            return -1;
+
+        INIT_LIST_HEAD(&node->next);
+
+        fouh = (struct fuse_out_header *)node->inval_buf;
+        fnieo = (struct fuse_notify_inval_entry_out *)(fouh + 1);
+
+        fouh->unique = 0;
+        fouh->error = FUSE_NOTIFY_INVAL_ENTRY;
+
+        if (ENOENT < ERRNOMASK_MAX)
+            MASK_ERRNO(node->errnomask, ENOENT);
+        if (ENOTDIR < ERRNOMASK_MAX)
+            MASK_ERRNO(node->errnomask, ENOTDIR);
+        if (EBUSY < ERRNOMASK_MAX)
+            MASK_ERRNO(node->errnomask, EBUSY);
+        if (ENOTEMPTY < ERRNOMASK_MAX)
+            MASK_ERRNO(node->errnomask, ENOTEMPTY);
+
+        if (dentry->name) {
+            nlen = strlen(dentry->name);
+            fouh->len = sizeof(*fouh) + sizeof(*fnieo) + nlen + 1;
+            fnieo->parent = inode_to_fuse_nodeid(dentry->parent);
+
+            fnieo->namelen = nlen;
+            strcpy((node->inval_buf + sizeof(*fouh) + sizeof(*fnieo)),
+                   dentry->name);
         }
 
-        if (state->is_revalidate == 1
-	    && need_fresh_lookup (op_ret, op_errno, &state->loc, buf)) {
-                inode_unref (state->loc.inode);
-                state->loc.inode = inode_new (state->itable);
-                state->is_revalidate = 2;
+        gf_log("glusterfs-fuse", GF_LOG_TRACE,
+               "INVALIDATE entry: %" PRIu64 "/%s (gfid:%s)", fnieo->parent,
+               dentry->name, uuid_utoa(inode->gfid));
 
-                STACK_WIND (frame, fuse_lookup_cbk,
-                            FIRST_CHILD (this),
-			    FIRST_CHILD (this)->fops->lookup,
-                            &state->loc, state->dict);
+        if (dentry->parent) {
+            fuse_log_eh(this, "Invalidated entry %s (parent: %s) gfid:%s",
+                        dentry->name, uuid_utoa(dentry->parent->gfid),
+                        uuid_utoa_r(inode->gfid, gfid_str));
+        } else {
+            fuse_log_eh(this,
+                        "Invalidated entry %s(nodeid: %" PRIu64 ") gfid:%s",
+                        dentry->name, fnieo->parent, uuid_utoa(inode->gfid));
+        }
 
-                return 0;
+        pthread_mutex_lock(&priv->invalidate_mutex);
+        {
+            list_add_tail(&node->next, &priv->invalidate_list);
+            priv->invalidate_count++;
+            pthread_cond_signal(&priv->invalidate_cond);
         }
+        pthread_mutex_unlock(&priv->invalidate_mutex);
+    }
 
-        if (op_ret == 0) {
-                gf_log ("glusterfs-fuse", GF_LOG_DEBUG,
-                        "%"PRId64": %s() %s => %"PRId64" (%"PRId64")",
-			frame->root->unique, gf_fop_list[frame->root->op],
-			state->loc.path, buf->st_ino, state->loc.ino);
+#endif
+    return 0;
+}
 
-		inode_link (inode, state->loc.parent, state->loc.name, buf);
+/*
+ * Send an inval inode notification to fuse. This causes an invalidation of the
+ * entire page cache mapping on the inode.
+ */
+static int32_t
+fuse_invalidate_inode(xlator_t *this, uint64_t fuse_ino)
+{
+#if (FUSE_KERNEL_MINOR_VERSION >= 11 && defined(HAVE_FUSE_NOTIFICATIONS))
+    struct fuse_out_header *fouh = NULL;
+    struct fuse_notify_inval_inode_out *fniio = NULL;
+    fuse_private_t *priv = NULL;
+    fuse_invalidate_node_t *node = NULL;
+    inode_t *inode = NULL;
 
-		inode_lookup (inode);
+    priv = this->private;
 
-                /* TODO: make these timeouts configurable (via meta?) */
-                e.ino = inode->ino;
+    if (!priv->reverse_fuse_thread_started)
+        return -1;
+
+    if (priv->invalidate_limit &&
+        (priv->invalidate_count >= priv->invalidate_limit)) {
+        return -1;
+    }
+
+    inode = (inode_t *)(unsigned long)fuse_ino;
+    if (inode == NULL)
+        return -1;
+
+    node = GF_CALLOC(1, sizeof(*node), gf_fuse_mt_invalidate_node_t);
+    if (node == NULL)
+        return -1;
+
+    INIT_LIST_HEAD(&node->next);
+
+    fouh = (struct fuse_out_header *)node->inval_buf;
+    fniio = (struct fuse_notify_inval_inode_out *)(fouh + 1);
+
+    fouh->unique = 0;
+    fouh->error = FUSE_NOTIFY_INVAL_INODE;
+    fouh->len = sizeof(struct fuse_out_header) +
+                sizeof(struct fuse_notify_inval_inode_out);
+
+    /* inval the entire mapping until we learn how to be more granular */
+    fniio->ino = fuse_ino;
+    fniio->off = 0;
+    fniio->len = -1;
+
+    if (ENOENT < ERRNOMASK_MAX)
+        MASK_ERRNO(node->errnomask, ENOENT);
+
+    fuse_log_eh(this, "Invalidated inode %" PRIu64 " (gfid: %s)", fuse_ino,
+                uuid_utoa(inode->gfid));
+    gf_log("glusterfs-fuse", GF_LOG_TRACE,
+           "INVALIDATE inode: %" PRIu64 "(gfid:%s)", fuse_ino,
+           uuid_utoa(inode->gfid));
+
+    pthread_mutex_lock(&priv->invalidate_mutex);
+    {
+        list_add_tail(&node->next, &priv->invalidate_list);
+        priv->invalidate_count++;
+        pthread_cond_signal(&priv->invalidate_cond);
+    }
+    pthread_mutex_unlock(&priv->invalidate_mutex);
 
-#ifdef GF_DARWIN_HOST_OS
-                e.generation = 0;
 #else
-                e.generation = buf->st_ctime;
+    gf_log("glusterfs-fuse", GF_LOG_WARNING,
+           "fuse_invalidate_inode not implemented on this system");
 #endif
+    return 0;
+}
 
-                e.entry_timeout = priv->entry_timeout;
-                e.attr_timeout  = priv->attribute_timeout;
-                e.attr = *buf;
-                e.attr.st_blksize = BIG_FUSE_CHANNEL_SIZE; 
-  
-		if (!e.ino || !buf->st_ino) {
-			gf_log ("glusterfs-fuse", GF_LOG_ERROR,
-				"%"PRId64": %s() %s returning inode 0",
-				frame->root->unique,
-				gf_fop_list[frame->root->op], state->loc.path);
-		}
-
-                if (state->loc.parent)
-                        fuse_reply_entry (req, &e);
-                else
-                        fuse_reply_attr (req, buf, priv->attribute_timeout);
-        } else {
-                gf_log ("glusterfs-fuse",
-			(op_errno == ENOENT ? GF_LOG_DEBUG : GF_LOG_ERROR),
-                        "%"PRId64": %s() %s => -1 (%s)", frame->root->unique,
-                        gf_fop_list[frame->root->op], state->loc.path,
-			strerror (op_errno));
-                fuse_reply_err (req, op_errno);
-        }
+#if FUSE_KERNEL_MINOR_VERSION >= 11
+/* Need this function for the signature (inode_t *, instead of uint64_t) */
+static int32_t
+fuse_inode_invalidate_fn(xlator_t *this, inode_t *inode)
+{ /* Forwards to fuse_invalidate_entry() (entry, not inode, invalidation) with the inode pointer cast to uint64_t. */
+    int32_t ret = 0;
+    ret = fuse_invalidate_entry(this, (uint64_t)(uintptr_t)inode);
+    return ret;
+}
+#endif
 
-        free_state (state);
-        STACK_DESTROY (frame->root);
-        return 0;
+static fuse_timed_message_t *
+fuse_timed_message_new(void)
+{ /* Allocate a fuse_timed_message_t with no body, an initialised list head and a zeroed errnomask; NULL on allocation failure. */
+    fuse_timed_message_t *dmsg = NULL;
+
+    dmsg = GF_MALLOC(sizeof(*dmsg), gf_fuse_mt_timed_message_t); /* NOTE(review): fields not assigned below keep whatever GF_MALLOC left -- confirm none is read before being set */
+    if (!dmsg) {
+        return NULL;
+    }
+
+    /* must be NULL when unset: fuse_timed_message_free() hands it to GF_FREE() */
+    dmsg->fuse_message_body = NULL;
+    INIT_LIST_HEAD(&dmsg->next);
+    memset(dmsg->errnomask, 0, sizeof(dmsg->errnomask));
+
+    return dmsg;
 }
 
+static void
+fuse_timed_message_free(fuse_timed_message_t *dmsg)
+{ /* Release a timed message together with its body; dmsg itself must not be NULL because it is dereferenced. */
+    GF_FREE(dmsg->fuse_message_body);
+    GF_FREE(dmsg);
+}
 
-static int
-fuse_lookup_cbk (call_frame_t *frame,
-                 void *cookie,
-                 xlator_t *this,
-                 int32_t op_ret,
-                 int32_t op_errno,
-                 inode_t *inode,
-                 struct stat *stat,
-                 dict_t *dict)
-{
-        fuse_entry_cbk (frame, cookie, this, op_ret, op_errno, inode, stat);
-        return 0;
+static void
+send_fuse_timed(xlator_t *this, fuse_timed_message_t *dmsg)
+{ /* Append dmsg to priv->timed_list and signal priv->timed_cond, both while holding priv->timed_mutex. */
+    fuse_private_t *priv = NULL;
+
+    priv = this->private;
+
+    if (!priv->timed_response_fuse_thread_started) {
+        return; /* NOTE(review): dmsg is neither queued nor freed on this path -- confirm the caller does not leak it */
+    }
+
+    pthread_mutex_lock(&priv->timed_mutex);
+    {
+        list_add_tail(&dmsg->next, &priv->timed_list);
+        pthread_cond_signal(&priv->timed_cond); /* presumably wakes the timed-response thread -- TODO confirm */
+    }
+    pthread_mutex_unlock(&priv->timed_mutex);
 }
 
+/*
+ * Allocate an interrupt record for the request headed by finh: the header
+ * is copied in, handler becomes the interrupt callback, and the record
+ * starts unhit, in INTERRUPT_NONE, with no data. NULL on alloc failure.
+ */
+fuse_interrupt_record_t *
+fuse_interrupt_record_new(fuse_in_header_t *finh,
+                          fuse_interrupt_handler_t handler)
+{
+    fuse_interrupt_record_t *rec = GF_MALLOC(sizeof(*rec),
+                                             gf_fuse_mt_interrupt_record_t);
+    if (rec == NULL)
+        return NULL;
+    memcpy(&rec->fuse_in_header, finh, sizeof(*finh));
+    rec->interrupt_handler = handler;
+    rec->data = NULL;
+    rec->hit = _gf_false;
+    rec->interrupt_state = INTERRUPT_NONE;
+    pthread_cond_init(&rec->handler_cond, NULL);
+    pthread_mutex_init(&rec->handler_mutex, NULL);
+    INIT_LIST_HEAD(&rec->next);
+    return rec;
+}
 
 static void
-fuse_lookup (fuse_req_t req,
-             fuse_ino_t par,
-             const char *name)
-{
-        fuse_state_t *state;
-	int32_t ret = -1;
-	
-        state = state_from_req (req);
-
-        ret = fuse_loc_fill (&state->loc, state, 0, par, name);
-
-	if (ret < 0) {
-		gf_log ("glusterfs-fuse", GF_LOG_ERROR, 
-			"%"PRId64": LOOKUP %"PRId64"/%s (fuse_loc_fill() failed)", 
-			req_callid (req), (ino_t)par, name);
-		free_state (state);
-		fuse_reply_err (req, EINVAL);
-		return;
-	}
-
-        if (!state->loc.inode) {
-                gf_log ("glusterfs-fuse", GF_LOG_DEBUG,
-                        "%"PRId64": LOOKUP %s", req_callid (req),
-                        state->loc.path);
-
-                state->loc.inode = inode_new (state->itable);
-                /* to differntiate in entry_cbk what kind of call it is */
-                state->is_revalidate = -1;
-        } else {
-                gf_log ("glusterfs-fuse", GF_LOG_DEBUG,
-                        "%"PRId64": LOOKUP %s(%"PRId64")", req_callid (req),
-                        state->loc.path, state->loc.inode->ino);
-                state->is_revalidate = 1;
+fuse_interrupt_record_free(fuse_interrupt_record_t *fir, void **datap)
+{
+    /*
+     * If caller wishes, we give back the private data to let them deal with it
+     * however they want; otherwise we take care of freeing it.
+     */
+    if (datap) {
+        *datap = fir->data;
+    } else {
+        GF_FREE(fir->data);
+    }
+
+    GF_FREE(fir);
+}
+
+/* Append fir to the tail of the private interrupt list, with
+ * interrupt_mutex held around the list update. */
+void
+fuse_interrupt_record_insert(xlator_t *this, fuse_interrupt_record_t *fir)
+{
+    fuse_private_t *fuse_priv = this->private;
+
+    /* the list is modified only inside the critical section */
+    pthread_mutex_lock(&fuse_priv->interrupt_mutex);
+    list_add_tail(&fir->next, &fuse_priv->interrupt_list);
+    pthread_mutex_unlock(&fuse_priv->interrupt_mutex);
+}
+
+static fuse_interrupt_record_t *
+fuse_interrupt_record_fetch(xlator_t *this, uint64_t unique, gf_boolean_t reap)
+{
+    fuse_interrupt_record_t *fir = NULL;
+    gf_boolean_t found = _gf_false;
+    fuse_private_t *priv = NULL;
+
+    priv = this->private;
+    pthread_mutex_lock(&priv->interrupt_mutex);
+    {
+        list_for_each_entry(fir, &priv->interrupt_list, next)
+        {
+            if (fir->fuse_in_header.unique == unique) {
+                /*
+                 * If we are to reap, we do it regardless of the
+                 * hit flag; otherwise we take the record only
+                 * if it hasn't been flagged hit yet.
+                 */
+                if (reap || !fir->hit) {
+                    found = _gf_true;
+                }
+                /*
+                 * If we are not reaping (coming from handler
+                 * context), we set the hit flag.
+                 */
+                if (!reap) {
+                    fir->hit = _gf_true;
+                }
+                break;
+            }
+        }
+        if (found && reap) {
+            list_del(&fir->next);
         }
-	
-	state->dict = dict_new();
+    }
+    pthread_mutex_unlock(&priv->interrupt_mutex);
 
-        FUSE_FOP (state, fuse_lookup_cbk, GF_FOP_LOOKUP,
-                  lookup, &state->loc, state->dict);
+    if (found) {
+        return fir;
+    }
+    return NULL;
+}
+
+static fuse_interrupt_record_t *
+fuse_interrupt_record_get(xlator_t *this, uint64_t unique)
+{ /* no reap: flags the record hit; NULL if none, or if hit already */
+    return fuse_interrupt_record_fetch(this, unique, _gf_false);
 }
 
+static fuse_interrupt_record_t *
+fuse_interrupt_record_reap(xlator_t *this, uint64_t unique)
+{ /* reap: unlinks the matching record, hit or not; NULL if none */
+    return fuse_interrupt_record_fetch(this, unique, _gf_true);
+}
 
 static void
-fuse_forget (fuse_req_t req,
-             fuse_ino_t ino,
-             unsigned long nlookup)
+fuse_interrupt(xlator_t *this, fuse_in_header_t *finh, void *msg,
+               struct iobuf *iobuf)
 {
-        inode_t *fuse_inode;
-        fuse_state_t *state;
+    struct fuse_interrupt_in *fii = msg;
+    fuse_interrupt_record_t *fir = NULL;
+
+    gf_log("glusterfs-fuse", GF_LOG_TRACE,
+           "unique %" PRIu64 " INTERRUPT for %" PRIu64, finh->unique,
+           fii->unique);
+
+    fir = fuse_interrupt_record_get(this, fii->unique);
+    if (fir) {
+        gf_log("glusterfs-fuse", GF_LOG_DEBUG,
+               "unique %" PRIu64 " INTERRUPT for %" PRIu64
+               ": handler triggered",
+               finh->unique, fii->unique);
+
+        fir->interrupt_handler(this, fir);
+    } else {
+        fuse_timed_message_t *dmsg = NULL;
 
-        if (ino == 1) {
-                fuse_reply_none (req);
-                return;
+        /*
+         * No record found for this interrupt request.
+         *
+         * It's either because the handler for the interrupted message
+         * does not want to handle interrupts, or because this interrupt
+         * message beat the interrupted one, which hasn't yet added a
+         * record to the interrupt queue. In either case we reply with
+         * error EAGAIN after some (0.01 sec) delay. That will have this
+         * interrupt request resent, unless the interrupted message
+         * has already been answered.
+         *
+         * So effectively we are looping in between kernel and
+         * userspace, which will be exited either when the interrupted
+         * message handler has added an interrupt record, or has
+         * replied to kernel. See
+         *
+         * https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/
+         * linux.git/tree/Documentation/filesystems/fuse.txt?h=v4.18#n148
+         */
+
+        gf_log("glusterfs-fuse", GF_LOG_DEBUG,
+               "unique %" PRIu64 " INTERRUPT for %" PRIu64 ": no handler found",
+               finh->unique, fii->unique);
+
+        dmsg = fuse_timed_message_new();
+        if (!dmsg) {
+            gf_log("glusterfs-fuse", GF_LOG_ERROR,
+                   "unique %" PRIu64 " INTERRUPT for %" PRIu64
+                   ":"
+                   " failed to allocate timed message",
+                   finh->unique, fii->unique);
+
+            goto out;
         }
 
-        state = state_from_req (req);
-        fuse_inode = inode_search (state->itable, ino, NULL);
-	if (fuse_inode) {
-		gf_log ("glusterfs-fuse", GF_LOG_DEBUG, 
-			"got forget on inode (%lu)", ino);
-		inode_forget (fuse_inode, nlookup);
-		inode_unref (fuse_inode);
-	} else {
-		gf_log ("glusterfs-fuse", GF_LOG_ERROR, 
-			"got forget, but inode (%lu) not found", ino);
-	}
+        dmsg->fuse_out_header.unique = finh->unique;
+        dmsg->fuse_out_header.len = sizeof(dmsg->fuse_out_header);
+        dmsg->fuse_out_header.error = -EAGAIN;
+        if (ENOENT < ERRNOMASK_MAX)
+            MASK_ERRNO(dmsg->errnomask, ENOENT);
+        timespec_now(&dmsg->scheduled_ts);
+        timespec_adjust_delta(&dmsg->scheduled_ts,
+                              (struct timespec){0, 10000000});
 
-        free_state (state);
-        fuse_reply_none (req);
+        send_fuse_timed(this, dmsg);
+    }
+
+out:
+    GF_FREE(finh);
 }
 
+/*
+ * To be called in fop cbk context (if the fop engages with interrupt
+ * handling). Returns true iff interrupt context already sent the reply.
+ */
+gf_boolean_t
+fuse_interrupt_finish_fop(call_frame_t *frame, xlator_t *this,
+                          gf_boolean_t sync, void **datap)
+{
+    fuse_interrupt_record_t *fir = NULL;
+    fuse_state_t *state = frame->root->state;
+    fuse_in_header_t *finh = state->finh;
+    gf_boolean_t hit = _gf_false;
+    gf_boolean_t handled = _gf_false;
+    fuse_interrupt_state_t intstat_orig = INTERRUPT_NONE;
+
+    fir = fuse_interrupt_record_reap(this, finh->unique);
+    if (!fir) {
+        /*
+         * No interrupt record was inserted (however, caller would usually know
+         * about that and there is no point then in calling this function).
+         */
+        return _gf_false;
+    }
+
+    /*
+     * The interrupt handler (if it finds the record) modifies fir->hit;
+     * however, that could have occurred only before
+     * fuse_interrupt_record_reap(), so a lock-free access is safe here.
+     */
+    hit = fir->hit;
+    if (hit) {
+        pthread_mutex_lock(&fir->handler_mutex);
+        {
+            intstat_orig = fir->interrupt_state;
+            if (fir->interrupt_state == INTERRUPT_NONE) {
+                if (sync) {
+                    fir->interrupt_state = INTERRUPT_WAITING_HANDLER;
+                    while (fir->interrupt_state != INTERRUPT_SQUELCHED) {
+                        pthread_cond_wait(&fir->handler_cond,
+                                          &fir->handler_mutex);
+                    }
+                } else
+                    fir->interrupt_state = INTERRUPT_SQUELCHED;
+            }
+        }
+        pthread_mutex_unlock(&fir->handler_mutex);
+    }
+
+    GF_ASSERT(intstat_orig == INTERRUPT_NONE ||
+              intstat_orig == INTERRUPT_HANDLED ||
+              intstat_orig == INTERRUPT_SQUELCHED);
+    gf_log("glusterfs-fuse", GF_LOG_DEBUG, "intstat_orig=%d", intstat_orig);
+
+    /*
+     * From this point on fir may only be referenced under the conditions
+     * that imply we are to free it (otherwise the interrupt handler might
+     * have already freed it).
+     */
+
+    if (/* there was no interrupt */
+        !hit ||
+        /* lost the race against interrupt handler */
+        intstat_orig != INTERRUPT_NONE ||
+        /* we took the cleanup upon ourselves */
+        sync) {
+        /* cleaning up */
+        fuse_interrupt_record_free(fir, datap);
+    } else if (datap) {
+        *datap = NULL;
+    }
+
+    handled = (intstat_orig == INTERRUPT_HANDLED);
+    if (handled) {
+        /*
+         * Fuse request was answered already from interrupt context, we can do
+         * away with the stack.
+         */
+        free_fuse_state(state);
+        STACK_DESTROY(frame->root);
+    }
+
+    /*
+     * True means the request was answered already; caller must not reply.
+     */
+    return handled;
+}
 
-static int
-fuse_attr_cbk (call_frame_t *frame,
-               void *cookie,
-               xlator_t *this,
-               int32_t op_ret,
-               int32_t op_errno,
-               struct stat *buf)
-{
-        fuse_state_t *state;
-        fuse_req_t req;
-        fuse_private_t *priv = this->private;
-
-        state = frame->root->state;
-        req = state->req;
-
-        if (op_ret == 0) {
-                gf_log ("glusterfs-fuse",
-			(buf->st_ino ? GF_LOG_DEBUG : GF_LOG_ERROR),
-                        "%"PRId64": %s() %s => %"PRId64, frame->root->unique, 
-                        gf_fop_list[frame->root->op],
-			state->loc.path ? state->loc.path : "ERR",
-                        buf->st_ino);
-
-                /* TODO: make these timeouts configurable via meta */
-                /* TODO: what if the inode number has changed by now */ 
-                buf->st_blksize = BIG_FUSE_CHANNEL_SIZE;
-
-                fuse_reply_attr (req, buf, priv->attribute_timeout);
-        } else {
-                gf_log ("glusterfs-fuse", GF_LOG_ERROR,
-                        "%"PRId64": %s() %s => -1 (%s)", frame->root->unique, 
-                        gf_fop_list[frame->root->op],
-			state->loc.path ? state->loc.path : "ERR", 
-                        strerror (op_errno));
+/*
+ * To be called in interrupt handler context to settle the record state.
+ */
+void
+fuse_interrupt_finish_interrupt(xlator_t *this, fuse_interrupt_record_t *fir,
+                                fuse_interrupt_state_t intstat,
+                                gf_boolean_t sync, void **datap)
+{
+    fuse_in_header_t finh = {
+        0,
+    };
+    fuse_interrupt_state_t intstat_orig = INTERRUPT_NONE;
+
+    GF_ASSERT(intstat == INTERRUPT_HANDLED || intstat == INTERRUPT_SQUELCHED);
+
+    pthread_mutex_lock(&fir->handler_mutex);
+    {
+        intstat_orig = fir->interrupt_state;
+        switch (intstat_orig) {
+            case INTERRUPT_NONE:
+                fir->interrupt_state = intstat;
+                break;
+            case INTERRUPT_WAITING_HANDLER:
+                fir->interrupt_state = INTERRUPT_SQUELCHED;
+                pthread_cond_signal(&fir->handler_cond); /* wake fop cbk */
+                break;
+            default:
+                break;
+        }
+        finh = fir->fuse_in_header; /* copy: fir may be gone after unlock */
+    }
+    pthread_mutex_unlock(&fir->handler_mutex);
+
+    GF_ASSERT(intstat_orig == INTERRUPT_NONE ||
+              (sync && intstat_orig == INTERRUPT_WAITING_HANDLER) ||
+              (!sync && intstat_orig == INTERRUPT_SQUELCHED));
+    gf_log("glusterfs-fuse", GF_LOG_DEBUG, "intstat_orig=%d", intstat_orig);
+
+    /*
+     * From here on fir may only be referenced under conditions that imply we
+     * are to free it (otherwise the fop handler might have already freed it).
+     */
+
+    if (/* we won the race, response is up to us */
+        intstat_orig == INTERRUPT_NONE &&
+        /* interrupt handling was successful, let the kernel know */
+        intstat == INTERRUPT_HANDLED) {
+        send_fuse_err(this, &finh, EINTR);
+    }
+
+    if (/* lost the race ... */
+        intstat_orig != INTERRUPT_NONE &&
+        /*
+         * ... and there is no contract with fop handler that it does the
+         * cleanup ...
+         */
+        !sync) {
+        /* ... so we do! */
+        fuse_interrupt_record_free(fir, datap);
+    } else if (datap) {
+        *datap = NULL;
+    }
+}
 
-                fuse_reply_err (req, op_errno);
+int
+send_fuse_err(xlator_t *this, fuse_in_header_t *finh, int error)
+{
+    struct fuse_out_header fouh = {
+        0,
+    };
+    struct iovec iov_out;
+    inode_t *inode = NULL;
+
+    fouh.error = -error;
+    iov_out.iov_base = &fouh;
+
+    inode = fuse_ino_to_inode(finh->nodeid, this);
+
+    // filter out ENOENT
+    if (error != ENOENT) {
+        if (inode) {
+            fuse_log_eh(this,
+                        "Sending %s for operation %d on "
+                        "inode %s",
+                        strerror(error), finh->opcode, uuid_utoa(inode->gfid));
+        } else {
+            fuse_log_eh(this,
+                        "Sending %s for operation %d on "
+                        "inode %" GF_PRI_INODE,
+                        strerror(error), finh->opcode, finh->nodeid);
         }
-        
-        free_state (state);
-        STACK_DESTROY (frame->root);
-        return 0;
+    }
+
+    if (inode)
+        inode_unref(inode);
+
+    return send_fuse_iov(this, finh, &iov_out, 1);
 }
 
+static int
+fuse_entry_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+               int32_t op_ret, int32_t op_errno, inode_t *inode,
+               struct iatt *buf, dict_t *xdata)
+{
+    fuse_state_t *state = NULL;
+    fuse_in_header_t *finh = NULL;
+    struct fuse_entry_out feo = {
+        0,
+    };
+    fuse_private_t *priv = NULL;
+    inode_t *linked_inode = NULL;
+    uint64_t ctx_value = LOOKUP_NOT_NEEDED;
+
+    priv = this->private;
+    state = frame->root->state;
+    finh = state->finh;
+
+    if (op_ret == 0) {
+        if (__is_root_gfid(state->loc.inode->gfid))
+            buf->ia_ino = 1;
+        if (gf_uuid_is_null(buf->ia_gfid)) {
+            /* With a NULL gfid inode linking is
+               not possible. Let's not pretend this
+               call was a "success".
+            */
+            gf_log("glusterfs-fuse", GF_LOG_WARNING,
+                   "Received NULL gfid for %s. Forcing EIO", state->loc.path);
+            op_ret = -1;
+            op_errno = EIO;
+        }
+    }
+
+    /* log into the event-history after the null uuid check is done, since
+     * the op_ret and op_errno are being changed if the gfid is NULL.
+     */
+    fuse_log_eh(
+        this,
+        "op_ret: %d op_errno: %d "
+        "%" PRIu64 ": %s() %s => %s",
+        op_ret, op_errno, frame->root->unique, gf_fop_list[frame->root->op],
+        state->loc.path,
+        (op_ret == 0) ? uuid_utoa(buf->ia_gfid) : uuid_utoa(state->loc.gfid));
+
+    if (op_ret == 0) {
+        gf_log("glusterfs-fuse", GF_LOG_TRACE,
+               "%" PRIu64 ": %s() %s => %" PRIu64, frame->root->unique,
+               gf_fop_list[frame->root->op], state->loc.path, buf->ia_ino);
+
+        buf->ia_blksize = this->ctx->page_size;
+        gf_fuse_stat2attr(buf, &feo.attr, priv->enable_ino32);
+
+        if (!buf->ia_ino) {
+            gf_log("glusterfs-fuse", GF_LOG_WARNING,
+                   "%" PRIu64 ": %s() %s returning inode 0",
+                   frame->root->unique, gf_fop_list[frame->root->op],
+                   state->loc.path);
+        }
 
-static void
-fuse_getattr (fuse_req_t req,
-              fuse_ino_t ino,
-              struct fuse_file_info *fi)
-{
-        fuse_state_t *state;
-        fd_t         *fd = NULL;
-	int32_t       ret = -1;
-
-        state = state_from_req (req);
-
-        if (ino == 1) {
-                ret = fuse_loc_fill (&state->loc, state, ino, 0, NULL);
-		if (ret < 0) {
-			gf_log ("glusterfs-fuse", GF_LOG_ERROR, 
-				"%"PRId64": GETATTR %"PRId64" (fuse_loc_fill() failed)",
-				req_callid(req), (ino_t)ino);
-			fuse_reply_err (req, EINVAL);
-			free_state (state);
-			return;
-		}
-
-                if (state->loc.inode)
-                        state->is_revalidate = 1;
-                else
-                        state->is_revalidate = -1;
-
-		state->dict = dict_new();
-
-                FUSE_FOP (state, fuse_lookup_cbk, GF_FOP_LOOKUP,
-                          lookup, &state->loc, state->dict);
-                return;
-        }
-
-        ret = fuse_loc_fill (&state->loc, state, ino, 0, NULL);
-
-        if (!state->loc.inode) {
-                gf_log ("glusterfs-fuse", GF_LOG_ERROR,
-                        "%"PRId64": GETATTR %"PRId64" (%s) (fuse_loc_fill() returned NULL inode)", 
-                        req_callid (req), (int64_t)ino, state->loc.path);
-                fuse_reply_err (req, EINVAL);
-                return;
-        }
-        
-        fd = fd_lookup (state->loc.inode, get_pid_from_req (req));
-        state->fd = fd;
-        if (!fd || S_ISDIR (state->loc.inode->st_mode)) {
-		/* this is the @ret of fuse_loc_fill, checked here
-		   to permit fstat() to happen even when fuse_loc_fill fails
-		*/
-		if (ret < 0) {
-			gf_log ("glusterfs-fuse", GF_LOG_ERROR, 
-				"%"PRId64": GETATTR %"PRId64" (fuse_loc_fill() failed)",
-				req_callid(req), (ino_t)ino);
-			fuse_reply_err (req, EINVAL);
-			free_state (state);
-			return;
-		}
-
-                gf_log ("glusterfs-fuse", GF_LOG_DEBUG,
-                        "%"PRId64": GETATTR %"PRId64" (%s)",
-                        req_callid (req), (int64_t)ino, state->loc.path);
-
-    
-                FUSE_FOP (state, fuse_attr_cbk, GF_FOP_STAT,
-                          stat, &state->loc);
-        } else {
+        linked_inode = inode_link(inode, state->loc.parent, state->loc.name,
+                                  buf);
+
+        if (linked_inode == inode) {
+            inode_ctx_set(linked_inode, this, &ctx_value);
+        }
+
+        inode_lookup(linked_inode);
+
+        feo.nodeid = inode_to_fuse_nodeid(linked_inode);
 
-                gf_log ("glusterfs-fuse", GF_LOG_DEBUG,
-                        "%"PRId64": FGETATTR %"PRId64" (%s/%p)",
-                        req_callid (req), (int64_t)ino, state->loc.path, fd);
+        inode_unref(linked_inode);
 
-                FUSE_FOP (state,fuse_attr_cbk, GF_FOP_FSTAT,
-                          fstat, fd);
+        feo.entry_valid = calc_timeout_sec(priv->entry_timeout);
+        feo.entry_valid_nsec = calc_timeout_nsec(priv->entry_timeout);
+        feo.attr_valid = calc_timeout_sec(priv->attribute_timeout);
+        feo.attr_valid_nsec = calc_timeout_nsec(priv->attribute_timeout);
+
+#if FUSE_KERNEL_MINOR_VERSION >= 9
+        priv->proto_minor >= 9
+            ? send_fuse_obj(this, finh, &feo)
+            : send_fuse_data(this, finh, &feo, FUSE_COMPAT_ENTRY_OUT_SIZE);
+#else
+        send_fuse_obj(this, finh, &feo);
+#endif
+    } else {
+        gf_log("glusterfs-fuse",
+               (op_errno == ENOENT ? GF_LOG_TRACE : GF_LOG_WARNING),
+               "%" PRIu64 ": %s() %s => -1 (%s)", frame->root->unique,
+               gf_fop_list[frame->root->op], state->loc.path,
+               strerror(op_errno));
+
+        if ((op_errno == ENOENT) && (priv->negative_timeout != 0)) {
+            feo.entry_valid = calc_timeout_sec(priv->negative_timeout);
+            feo.entry_valid_nsec = calc_timeout_nsec(priv->negative_timeout);
+            send_fuse_obj(this, finh, &feo);
+        } else {
+            send_fuse_err(this, state->finh, op_errno);
         }
-}
+    }
 
+    free_fuse_state(state);
+    STACK_DESTROY(frame->root);
+    return 0;
+}
 
 static int
-fuse_fd_cbk (call_frame_t *frame,
-             void *cookie,
-             xlator_t *this,
-             int32_t op_ret,
-             int32_t op_errno,
-             fd_t *fd)
-{
-        fuse_state_t *state;
-        fuse_req_t req;
-        fuse_private_t *priv = this->private;
-
-        state = frame->root->state;
-        req = state->req;
-
-        if (op_ret >= 0) {
-                struct fuse_file_info fi = {0, };
-                
-                fi.fh = (unsigned long) fd;
-                fi.flags = state->flags;
-
-                if (!S_ISDIR (fd->inode->st_mode)) {
-                        if ((fi.flags & 3) && priv->direct_io_mode)
-                                fi.direct_io = 1;
-                }
+fuse_newentry_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+                  int32_t op_ret, int32_t op_errno, inode_t *inode,
+                  struct iatt *buf, struct iatt *preparent,
+                  struct iatt *postparent, dict_t *xdata)
+{
+    /* facilitate retry of link from VFS */
+    if (op_errno == ENOENT)
+        op_errno = ESTALE;
 
-                gf_log ("glusterfs-fuse", GF_LOG_DEBUG,
-                        "%"PRId64": %s() %s => %p", frame->root->unique,
-			gf_fop_list[frame->root->op], state->loc.path, fd);
+    fuse_entry_cbk(frame, cookie, this, op_ret, op_errno, inode, buf, xdata);
+    return 0;
+}
 
-		fd_ref (fd);
-                if (fuse_reply_open (req, &fi) == -ENOENT) {
-                        gf_log ("glusterfs-fuse", GF_LOG_WARNING,
-				"open() got EINTR");
-			fd_unref (fd);
-				goto out;
-                }
-		
-		fd_bind (fd);
-        } else {
-                gf_log ("glusterfs-fuse", GF_LOG_ERROR,
-                        "%"PRId64": %s() %s => -1 (%s)", frame->root->unique,
-                        gf_fop_list[frame->root->op], state->loc.path,
-			strerror (op_errno));
+static int
+fuse_lookup_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+                int32_t op_ret, int32_t op_errno, inode_t *inode,
+                struct iatt *stat, dict_t *dict, struct iatt *postparent)
+{
+    fuse_state_t *state = NULL;
+    call_frame_t *prev = NULL;
+    inode_table_t *itable = NULL;
 
-                fuse_reply_err (req, op_errno);
-        }
-out:
-        free_state (state);
-        STACK_DESTROY (frame->root);
+    state = frame->root->state;
+    prev = cookie;
+
+    if (op_ret == -1 && state->is_revalidate == 1) {
+        itable = state->itable;
+        /*
+         * A stale mapping might exist for a dentry/inode that has been
+         * removed from another client.
+         */
+        if (op_errno == ENOENT)
+            inode_unlink(state->loc.inode, state->loc.parent, state->loc.name);
+        inode_unref(state->loc.inode);
+        state->loc.inode = inode_new(itable);
+        state->is_revalidate = 2;
+        if (gf_uuid_is_null(state->gfid))
+            gf_uuid_generate(state->gfid);
+        fuse_gfid_set(state);
+
+        STACK_WIND(frame, fuse_lookup_cbk, prev->this, prev->this->fops->lookup,
+                   &state->loc, state->xdata);
         return 0;
+    }
+
+    fuse_entry_cbk(frame, cookie, this, op_ret, op_errno, inode, stat, dict);
+    return 0;
 }
 
+/* Resume a fop: a failed fd resolution is answered with its errno right
+ * here and the state is freed; otherwise state->resume_fn takes over. */
+void
+fuse_fop_resume(fuse_state_t *state)
+{
+    gf_boolean_t fd_failed = _gf_false;
+
+    fd_failed = (state->resolve.fd && state->resolve.op_ret < 0);
+    if (!fd_failed) {
+        state->resume_fn(state);
+        return;
+    }
+
+    /* nothing to resume: report the error and drop the state */
+    send_fuse_err(state->this, state->finh, state->resolve.op_errno);
+    free_fuse_state(state);
+}
 
+/* Resume step of LOOKUP: fail if neither parent nor inode got resolved,
+ * else issue the lookup fop (flagged as revalidate if the inode is known). */
+void
+fuse_lookup_resume(fuse_state_t *state)
+{
+    if (state->loc.parent == NULL && state->loc.inode == NULL) {
+        gf_log("fuse", GF_LOG_ERROR, "failed to resolve path %s",
+               state->loc.path);
+        send_fuse_err(state->this, state->finh, state->resolve.op_errno);
+        free_fuse_state(state);
+        return;
+    }
+
+    /* Parent resolved but the entry did not (a missing gfid, maybe?):
+     * clear the ENODATA failure and go on with a regular lookup. */
+    if (state->resolve.op_errno == ENODATA && state->resolve.op_ret == -1)
+        state->resolve.op_ret = 0;
+
+    if (state->loc.inode == NULL) {
+        gf_log("glusterfs-fuse", GF_LOG_TRACE, "%" PRIu64 ": LOOKUP %s",
+               state->finh->unique, state->loc.path);
+        state->loc.inode = inode_new(state->loc.parent->table);
+        if (gf_uuid_is_null(state->gfid))
+            gf_uuid_generate(state->gfid);
+        fuse_gfid_set(state);
+    } else {
+        gf_log("glusterfs-fuse", GF_LOG_TRACE, "%" PRIu64 ": LOOKUP %s(%s)",
+               state->finh->unique, state->loc.path,
+               uuid_utoa(state->loc.inode->gfid));
+        state->is_revalidate = 1;
+    }
+
+    FUSE_FOP(state, fuse_lookup_cbk, GF_FOP_LOOKUP, lookup, &state->loc,
+             state->xdata);
+}
 
 static void
-do_chmod (fuse_req_t req,
-          fuse_ino_t ino,
-          struct stat *attr,
-          struct fuse_file_info *fi)
+fuse_lookup(xlator_t *this, fuse_in_header_t *finh, void *msg,
+            struct iobuf *iobuf)
 {
-        fuse_state_t *state = state_from_req (req);
-        fd_t *fd = NULL;
-	int32_t ret = -1;
+    char *name = msg;
+    fuse_state_t *state = NULL;
 
-        if (fi) {
-                fd = FI_TO_FD (fi);
-                state->fd = fd;
-        }
+    GET_STATE(this, finh, state);
 
-        if (fd) {
-                gf_log ("glusterfs-fuse", GF_LOG_DEBUG,
-                        "%"PRId64": FCHMOD %p", req_callid (req), fd);
+    (void)fuse_resolve_entry_init(state, &state->resolve, finh->nodeid, name);
 
-                FUSE_FOP (state, fuse_attr_cbk, GF_FOP_FCHMOD,
-                          fchmod, fd, attr->st_mode);
-        } else {
-                ret = fuse_loc_fill (&state->loc, state, ino, 0, NULL);
-
-                if ((state->loc.inode == NULL) ||
-		    (ret < 0)) {
-                        gf_log ("glusterfs-fuse", GF_LOG_ERROR,
-                                "%"PRId64": CHMOD %"PRId64" (%s) (fuse_loc_fill() failed)", 
-                                req_callid (req), (int64_t)ino,
-				state->loc.path);
-                        fuse_reply_err (req, EINVAL);
-			free_state (state);
-                        return;
-                }
+    fuse_resolve_and_resume(state, fuse_lookup_resume);
 
+    return;
+}
 
-                gf_log ("glusterfs-fuse", GF_LOG_DEBUG,
-                        "%"PRId64": CHMOD %s", req_callid (req),
-                        state->loc.path);
+static void
+do_forget(xlator_t *this, uint64_t unique, uint64_t nodeid, uint64_t nlookup)
+{
+    inode_t *fuse_inode = fuse_ino_to_inode(nodeid, this);
 
-                FUSE_FOP (state, fuse_attr_cbk, GF_FOP_CHMOD,
-                          chmod, &state->loc, attr->st_mode);
-        }
+    gf_log("fuse", GF_LOG_TRACE,
+           "%" PRIu64 ": FORGET %" PRIu64 "/%" PRIu64 " gfid: (%s)", unique,
+           nodeid, nlookup, uuid_utoa(fuse_inode->gfid));
+
+    fuse_log_eh(this, "%" PRIu64 ": FORGET %" PRIu64 "/%" PRIu64 " gfid: (%s)",
+                unique, nodeid, nlookup, uuid_utoa(fuse_inode->gfid));
+
+    inode_forget_with_unref(fuse_inode, nlookup);
 }
 
+/* FORGET handler: passes the node id and nlookup count on to do_forget()
+ * (node id 1 is skipped), then frees finh. */
+static void
+fuse_forget(xlator_t *this, fuse_in_header_t *finh, void *msg,
+            struct iobuf *iobuf)
+{
+    struct fuse_forget_in *forget_in = msg;
+    uint64_t nodeid = finh->nodeid;
+
+    /* node id 1 is never handed to do_forget() */
+    if (nodeid != 1)
+        do_forget(this, finh->unique, nodeid, forget_in->nlookup);
+
+    /* finh is released here on every path */
+    GF_FREE(finh);
+}
 
+#if FUSE_KERNEL_MINOR_VERSION >= 16
 static void
-do_chown (fuse_req_t req,
-          fuse_ino_t ino,
-          struct stat *attr,
-          int valid,
-          struct fuse_file_info *fi)
+fuse_batch_forget(xlator_t *this, fuse_in_header_t *finh, void *msg,
+                  struct iobuf *iobuf)
 {
-        fuse_state_t *state;
-        fd_t *fd = NULL;
-	int32_t ret = -1;
-        uid_t uid = (valid & FUSE_SET_ATTR_UID) ? attr->st_uid : (uid_t) -1;
-        gid_t gid = (valid & FUSE_SET_ATTR_GID) ? attr->st_gid : (gid_t) -1;
+    struct fuse_batch_forget_in *fbfi = msg;
+    struct fuse_forget_one *ffo = (struct fuse_forget_one *)(fbfi + 1);
+    int i;
+
+    gf_log("glusterfs-fuse", GF_LOG_TRACE,
+           "%" PRIu64 ": BATCH_FORGET %" PRIu64 "/%" PRIu32, finh->unique,
+           finh->nodeid, fbfi->count);
+
+    for (i = 0; i < fbfi->count; i++) {
+        if (ffo[i].nodeid == 1)
+            continue;
+        do_forget(this, finh->unique, ffo[i].nodeid, ffo[i].nlookup);
+    }
+    GF_FREE(finh);
+}
+#endif
 
-        state = state_from_req (req);
+static int
+fuse_truncate_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+                  int32_t op_ret, int32_t op_errno, struct iatt *prebuf,
+                  struct iatt *postbuf, dict_t *xdata)
+{ /* (f)truncate callback: reply with the post-op attributes or an errno, then release state and frame */
+    fuse_state_t *state;
+    fuse_in_header_t *finh;
+    fuse_private_t *priv = NULL;
+    struct fuse_attr_out fao;
 
-        if (fi) {
-                fd = FI_TO_FD (fi);
-                state->fd = fd;
-        }
+    priv = this->private;
+    state = frame->root->state;
+    finh = state->finh;
 
-        if (fd) {
-                gf_log ("glusterfs-fuse", GF_LOG_DEBUG,
-                        "%"PRId64": FCHOWN %p", req_callid (req), fd);
+    fuse_log_eh_fop(this, state, frame, op_ret, op_errno);
 
-                FUSE_FOP (state, fuse_attr_cbk, GF_FOP_FCHOWN,
-                          fchown, fd, uid, gid);
-        } else {
-                ret = fuse_loc_fill (&state->loc, state, ino, 0, NULL);
-                if ((state->loc.inode == NULL) ||
-		    (ret < 0)) {
-                        gf_log ("glusterfs-fuse", GF_LOG_ERROR,
-                                "%"PRId64": CHOWN %"PRId64" (%s) (fuse_loc_fill() failed)", 
-                                req_callid (req), (int64_t)ino,
-				state->loc.path);
-                        fuse_reply_err (req, EINVAL);
-			free_state (state);
-                        return;
-                }
+    if (op_ret == 0) {
+        gf_log("glusterfs-fuse", GF_LOG_TRACE,
+               "%" PRIu64 ": %s() %s => %" PRIu64, frame->root->unique,
+               gf_fop_list[frame->root->op],
+               state->loc.path ? state->loc.path : "ERR", prebuf->ia_ino);
 
-                gf_log ("glusterfs-fuse", GF_LOG_DEBUG,
-                        "%"PRId64": CHOWN %s", req_callid (req),
-                        state->loc.path);
+        postbuf->ia_blksize = this->ctx->page_size; /* block size reported to the kernel is the context's page size */
+        gf_fuse_stat2attr(postbuf, &fao.attr, priv->enable_ino32);
 
-                FUSE_FOP (state, fuse_attr_cbk, GF_FOP_CHOWN,
-                          chown, &state->loc, uid, gid);
-        }
+        fao.attr_valid = calc_timeout_sec(priv->attribute_timeout);
+        fao.attr_valid_nsec = calc_timeout_nsec(priv->attribute_timeout);
+
+#if FUSE_KERNEL_MINOR_VERSION >= 9
+        priv->proto_minor >= 9 /* negotiated minor < 9: send only FUSE_COMPAT_ATTR_OUT_SIZE bytes of fao */
+            ? send_fuse_obj(this, finh, &fao)
+            : send_fuse_data(this, finh, &fao, FUSE_COMPAT_ATTR_OUT_SIZE);
+#else
+        send_fuse_obj(this, finh, &fao);
+#endif
+    } else {
+        gf_log("glusterfs-fuse", GF_LOG_WARNING,
+               "%" PRIu64 ": %s() %s => -1 (%s)", frame->root->unique,
+               gf_fop_list[frame->root->op],
+               state->loc.path ? state->loc.path : "ERR", strerror(op_errno));
+
+        /* facilitate retry from VFS: without an fd, ENOENT is reported as ESTALE */
+        if ((state->fd == NULL) && (op_errno == ENOENT))
+            op_errno = ESTALE;
+
+        send_fuse_err(this, finh, op_errno);
+    }
+
+    free_fuse_state(state);
+    STACK_DESTROY(frame->root);
+
+    return 0;
 }
 
+static int /* forward declaration: fuse_attr_cbk() below winds a root lookup to this callback */
+fuse_root_lookup_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+                     int32_t op_ret, int32_t op_errno, inode_t *inode,
+                     struct iatt *stat, dict_t *dict, struct iatt *postparent);
 
-static void 
-do_truncate (fuse_req_t req,
-             fuse_ino_t ino,
-             struct stat *attr,
-             struct fuse_file_info *fi)
+static int /* stat/fstat (and root lookup) callback: send a fuse_attr_out or an errno to the kernel */
+fuse_attr_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret,
+              int32_t op_errno, struct iatt *buf, dict_t *xdata)
 {
-        fuse_state_t *state;
-        fd_t *fd = NULL;
-	int32_t ret = -1;
+    int32_t ret = 0;
+    fuse_state_t *state;
+    fuse_in_header_t *finh;
+    fuse_private_t *priv = NULL;
+    struct fuse_attr_out fao;
+
+    priv = this->private;
+    state = frame->root->state;
+    finh = state->finh;
+
+    fuse_log_eh(this,
+                "op_ret: %d, op_errno: %d, %" PRIu64
+                ": %s() %s => "
+                "gfid: %s",
+                op_ret, op_errno, frame->root->unique,
+                gf_fop_list[frame->root->op], state->loc.path,
+                state->loc.inode ? uuid_utoa(state->loc.inode->gfid) : "");
+    if (op_ret == 0) {
+        gf_log("glusterfs-fuse", GF_LOG_TRACE,
+               "%" PRIu64 ": %s() %s => %" PRIu64, frame->root->unique,
+               gf_fop_list[frame->root->op],
+               state->loc.path ? state->loc.path : "ERR", buf->ia_ino);
+
+        buf->ia_blksize = this->ctx->page_size; /* block size reported to the kernel is the context's page size */
+        gf_fuse_stat2attr(buf, &fao.attr, priv->enable_ino32);
+
+        fao.attr_valid = calc_timeout_sec(priv->attribute_timeout);
+        fao.attr_valid_nsec = calc_timeout_nsec(priv->attribute_timeout);
+
+#if FUSE_KERNEL_MINOR_VERSION >= 9
+        priv->proto_minor >= 9 /* negotiated minor < 9: send only FUSE_COMPAT_ATTR_OUT_SIZE bytes of fao */
+            ? send_fuse_obj(this, finh, &fao)
+            : send_fuse_data(this, finh, &fao, FUSE_COMPAT_ATTR_OUT_SIZE);
+#else
+        send_fuse_obj(this, finh, &fao);
+#endif
+    } else {
+        /* This is moved here from fuse_getattr(). It makes sense as
+           in few cases, like the self-heal processes, some
+           translators expect a lookup() to come on root inode
+           (inode number 1). This will make sure we don't fail in any
+           case, but the positive path will get better performance,
+           by following common path for all the cases */
+        if ((finh->nodeid == 1) && (state->gfid[15] != 1)) {
+            /* The 'state->gfid[15]' check is added to prevent the
+               infinite recursions */
+            state->gfid[15] = 1;
+
+            ret = fuse_loc_fill(&state->loc, state, finh->nodeid, 0, NULL);
+            if (ret < 0) {
+                gf_log("glusterfs-fuse", GF_LOG_WARNING,
+                       "%" PRIu64 ": loc_fill() on / failed", finh->unique);
+                send_fuse_err(this, finh, ENOENT);
+                free_fuse_state(state);
+                return 0; /* NOTE(review): skips STACK_DESTROY(frame->root), unlike the common exit below - confirm */
+            }
 
-        state = state_from_req (req);
-        
-        if (fi) {
-                fd = FI_TO_FD (fi);
-                state->fd = fd;
+            fuse_gfid_set(state);
+
+            FUSE_FOP(state, fuse_root_lookup_cbk, GF_FOP_LOOKUP, lookup,
+                     &state->loc, state->xdata);
+
+            return 0; /* NOTE(review): this frame is not destroyed here either; FUSE_FOP's frame handling is not in view - confirm */
         }
-        if (fd) {
-                gf_log ("glusterfs-fuse", GF_LOG_DEBUG,
-                        "%"PRId64": FTRUNCATE %p/%"PRId64, req_callid (req),
-                        fd, attr->st_size);
 
-                FUSE_FOP (state, fuse_attr_cbk, GF_FOP_FTRUNCATE,
-                          ftruncate, fd, attr->st_size);
-        } else {
-                ret = fuse_loc_fill (&state->loc, state, ino, 0, NULL);
-                if ((state->loc.inode == NULL) || 
-		    (ret < 0)) {
-                        gf_log ("glusterfs-fuse", GF_LOG_ERROR,
-                                "%"PRId64": TRUNCATE %s/%"PRId64" (fuse_loc_fill() failed)", 
-                                req_callid (req), state->loc.path,
-				attr->st_size);
-                        fuse_reply_err (req, EINVAL);
-			free_state (state);
-                        return;
-                }
+        /* facilitate retry from VFS: without an fd, ENOENT is reported as ESTALE */
+        if ((state->fd == NULL) && (op_errno == ENOENT))
+            op_errno = ESTALE;
+
+        gf_log("glusterfs-fuse", GF_LOG_WARNING,
+               "%" PRIu64
+               ": %s() "
+               "%s => -1 (%s)",
+               frame->root->unique, gf_fop_list[frame->root->op],
+               state->loc.path ? state->loc.path : "ERR", strerror(op_errno));
+
+        send_fuse_err(this, finh, op_errno);
+    }
+
+    free_fuse_state(state);
+    STACK_DESTROY(frame->root);
+
+    return 0;
+}
+
+static int
+fuse_root_lookup_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+                     int32_t op_ret, int32_t op_errno, inode_t *inode,
+                     struct iatt *stat, dict_t *dict, struct iatt *postparent)
+{ /* adapter: drops inode and postparent and hands the lookup result to fuse_attr_cbk() */
+    fuse_attr_cbk(frame, cookie, this, op_ret, op_errno, stat, dict);
 
-                gf_log ("glusterfs-fuse", GF_LOG_DEBUG,
-                        "%"PRId64": TRUNCATE %s/%"PRId64"(%lu)",
-			req_callid (req),
-                        state->loc.path, attr->st_size, ino);
+    return 0; /* fuse_attr_cbk()'s own return value is not propagated */
+}
+
+void
+fuse_getattr_resume(fuse_state_t *state)
+{ /* resume step after resolution: wind STAT on the loc or FSTAT on an fd, or fail the request */
+    if (!state->loc.inode && !(state->fd && state->fd->inode)) { /* neither a resolved inode nor an fd with an inode */
+        gf_log("glusterfs-fuse", GF_LOG_ERROR,
+               "%" PRIu64 ": GETATTR %" PRIu64 " (%s) resolution failed",
+               state->finh->unique, state->finh->nodeid,
+               uuid_utoa(state->resolve.gfid));
+
+        /* facilitate retry from VFS: without an fd, ENOENT is reported as ESTALE */
+        if ((state->fd == NULL) && (state->resolve.op_errno == ENOENT))
+            state->resolve.op_errno = ESTALE;
+
+        send_fuse_err(state->this, state->finh, state->resolve.op_errno);
+        free_fuse_state(state);
+        return;
+    }
+
+    if (state->fd == NULL && !IA_ISDIR(state->loc.inode->ia_type)) { /* non-directory without an fd: look for an existing one */
+        state->fd = fd_lookup(state->loc.inode, state->finh->pid); /* first try with the requesting pid */
+
+        if (state->fd == NULL)
+            state->fd = fd_lookup(state->loc.inode, 0); /* then retry with pid 0 */
+    }
+
+    if (!state->fd) {
+        gf_log("glusterfs-fuse", GF_LOG_TRACE,
+               "%" PRIu64 ": GETATTR %" PRIu64 " (%s)", state->finh->unique,
+               state->finh->nodeid, state->loc.path);
+
+        FUSE_FOP(state, fuse_attr_cbk, GF_FOP_STAT, stat, &state->loc,
+                 state->xdata);
+    } else {
+        gf_log("glusterfs-fuse", GF_LOG_TRACE,
+               "%" PRIu64 ": FGETATTR %" PRIu64 " (%s/%p)", state->finh->unique,
+               state->finh->nodeid, state->loc.path, state->fd);
+
+        FUSE_FOP(state, fuse_attr_cbk, GF_FOP_FSTAT, fstat, state->fd,
+                 state->xdata);
+    }
+}
 
-                FUSE_FOP (state, fuse_attr_cbk, GF_FOP_TRUNCATE,
-                          truncate, &state->loc, attr->st_size);
+static void
+fuse_getattr(xlator_t *this, fuse_in_header_t *finh, void *msg,
+             struct iobuf *iobuf)
+{ /* GETATTR handler: nodeid 1 is served through a lookup; anything else is resolved, then resumed */
+#if FUSE_KERNEL_MINOR_VERSION >= 9
+    struct fuse_getattr_in *fgi = msg;
+    fuse_private_t *priv = NULL;
+#endif
+    fuse_state_t *state;
+    int ret = -1;
+
+    GET_STATE(this, finh, state);
+#if FUSE_KERNEL_MINOR_VERSION >= 9
+    priv = this->private;
+    if (priv->proto_minor >= 9 && fgi->getattr_flags & FUSE_GETATTR_FH)
+        state->fd = fd_ref((fd_t *)(uintptr_t)fgi->fh); /* the file handle is an fd_t pointer (see foo.fh in fuse_fd_cbk); take a ref */
+#endif
+    if (finh->nodeid == 1) {
+        state->gfid[15] = 1; /* presumably yields the root gfid (all zero except the last byte) - TODO confirm */
+
+        ret = fuse_loc_fill(&state->loc, state, finh->nodeid, 0, NULL);
+        if (ret < 0) {
+            gf_log("glusterfs-fuse", GF_LOG_WARNING,
+                   "%" PRIu64 ": GETATTR on / (fuse_loc_fill() failed)",
+                   finh->unique);
+            send_fuse_err(this, finh, ESTALE);
+            free_fuse_state(state);
+            return;
         }
 
+        fuse_gfid_set(state);
+
+        FUSE_FOP(state, fuse_root_lookup_cbk, GF_FOP_LOOKUP, lookup,
+                 &state->loc, state->xdata);
         return;
+    }
+
+    if (state->fd)
+        fuse_resolve_fd_init(state, &state->resolve, state->fd);
+    else
+        fuse_resolve_inode_init(state, &state->resolve, state->finh->nodeid);
+
+    fuse_resolve_and_resume(state, fuse_getattr_resume); /* continues in fuse_getattr_resume() */
 }
 
+static int32_t
+fuse_fd_inherit_directio(xlator_t *this, fd_t *fd, struct fuse_open_out *foo)
+{ /* align FOPEN_DIRECT_IO in foo with an fd found via fd_lookup(inode, 0) and record it in fd's ctx; 0 or -errno */
+    int32_t ret = 0;
+    fuse_fd_ctx_t *fdctx = NULL, *tmp_fdctx = NULL;
+    fd_t *tmp_fd = NULL;
+
+    GF_VALIDATE_OR_GOTO_WITH_ERROR("glusterfs-fuse", this, out, ret, -EINVAL);
+    GF_VALIDATE_OR_GOTO_WITH_ERROR("glusterfs-fuse", fd, out, ret, -EINVAL);
+    GF_VALIDATE_OR_GOTO_WITH_ERROR("glusterfs-fuse", foo, out, ret, -EINVAL);
+
+    fdctx = fuse_fd_ctx_get(this, fd);
+    if (!fdctx) {
+        ret = -ENOMEM; /* no fuse context could be obtained for fd */
+        goto out;
+    }
+
+    tmp_fd = fd_lookup(fd->inode, 0);
+    if (tmp_fd) {
+        tmp_fdctx = fuse_fd_ctx_get(this, tmp_fd);
+        if (tmp_fdctx) {
+            foo->open_flags &= ~FOPEN_DIRECT_IO; /* drop our own bit ... */
+            foo->open_flags |= (tmp_fdctx->open_flags & FOPEN_DIRECT_IO); /* ... and take the one recorded on the looked-up fd */
+        }
+    }
+
+    fdctx->open_flags |= (foo->open_flags & FOPEN_DIRECT_IO); /* remember the final choice on this fd's context */
+
+    if (tmp_fd != NULL) {
+        fd_unref(tmp_fd); /* release the fd returned by fd_lookup() */
+    }
+
+    ret = 0;
+out:
+    return ret;
+}
 
-static void 
-do_utimes (fuse_req_t req,
-           fuse_ino_t ino,
-           struct stat *attr)
+gf_boolean_t
+direct_io_mode(dict_t *xdata)
 {
-        fuse_state_t *state;
+    if (xdata && dict_get(xdata, "direct-io-mode")) /* true iff xdata carries the key; its value is not examined */
+        return _gf_true;
+    return _gf_false;
+}
 
-        struct timespec tv[2];
-	int32_t ret = -1;
+static int
+fuse_fd_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret,
+            int32_t op_errno, fd_t *fd, dict_t *xdata)
+{ /* open(dir) callback: hand the fd to the kernel as a file handle with open flags, or report the error */
+    fuse_state_t *state = NULL;
+    fuse_in_header_t *finh = NULL;
+    fuse_private_t *priv = NULL;
+    int32_t ret = 0;
+    struct fuse_open_out foo = {
+        0,
+    };
+
+    priv = this->private;
+    state = frame->root->state;
+    finh = state->finh;
+
+    fuse_log_eh_fop(this, state, frame, op_ret, op_errno);
+
+    if (op_ret >= 0) {
+        foo.fh = (uintptr_t)fd; /* the fd_t pointer itself is the kernel-visible file handle */
+        foo.open_flags = 0;
+
+        if (!IA_ISDIR(fd->inode->ia_type)) { /* open flags are only computed for non-directories */
+            if (((priv->direct_io_mode == 2) &&
+                 ((state->flags & O_ACCMODE) != O_RDONLY)) ||
+                (priv->direct_io_mode == 1) || (direct_io_mode(xdata)))
+                foo.open_flags |= FOPEN_DIRECT_IO;
+#ifdef GF_DARWIN_HOST_OS
+            /* In Linux: by default, buffer cache
+             * is purged upon open, setting
+             * FOPEN_KEEP_CACHE implies no-purge
+             *
+             * In MacFUSE: by default, buffer cache
+             * is left intact upon open, setting
+             * FOPEN_PURGE_UBC implies purge
+             *
+             * [[Interesting...]]
+             */
+            if (!priv->fopen_keep_cache)
+                foo.open_flags |= FOPEN_PURGE_UBC;
+#else
+            /*
+             * If fopen-keep-cache is enabled, we set the associated
+             * flag here such that files are not invalidated on open.
+             * File invalidations occur either in fuse or explicitly
+             * when the cache is set invalid on the inode.
+             */
+            if (priv->fopen_keep_cache)
+                foo.open_flags |= FOPEN_KEEP_CACHE;
+#endif
+        }
 
-	tv[0].tv_sec = attr->st_atime;
-	tv[0].tv_nsec = ST_ATIM_NSEC(attr);
-        tv[1].tv_sec = attr->st_mtime;
-        tv[1].tv_nsec = ST_ATIM_NSEC(attr);
+        gf_log("glusterfs-fuse", GF_LOG_TRACE, "%" PRIu64 ": %s() %s => %p",
+               frame->root->unique, gf_fop_list[frame->root->op],
+               state->loc.path, fd);
+
+        ret = fuse_fd_inherit_directio(this, fd, &foo);
+        if (ret < 0) {
+            op_errno = -ret; /* helper returns a negative errno */
+            gf_log("glusterfs-fuse", GF_LOG_WARNING,
+                   "cannot inherit direct-io values for fd "
+                   "(ptr:%p inode-gfid:%s) from fds already "
+                   "opened",
+                   fd, uuid_utoa(fd->inode->gfid));
+            goto err;
+        }
 
-        state = state_from_req (req);
-        ret = fuse_loc_fill (&state->loc, state, ino, 0, NULL);
-        if ((state->loc.inode == NULL) ||
-	    (ret < 0)) {
-                gf_log ("glusterfs-fuse", GF_LOG_ERROR,
-                        "%"PRId64": UTIMENS %s (fuse_loc_fill() failed)", 
-                        req_callid (req), state->loc.path);
-                fuse_reply_err (req, EINVAL);
-		free_state (state);
-                return;
+        if (send_fuse_obj(this, finh, &foo) == ENOENT) { /* NOTE(review): ENOENT from the reply is logged as EINTR - presumably an interrupted request; confirm */
+            gf_log("glusterfs-fuse", GF_LOG_DEBUG, "open(%s) got EINTR",
+                   state->loc.path);
+            gf_fd_put(priv->fdtable, state->fd_no);
+            goto out; /* skip fd_bind(): the reply did not go through */
         }
 
-        gf_log ("glusterfs-fuse", GF_LOG_DEBUG,
-                "%"PRId64": UTIMENS (%lu)%s", req_callid (req),
-                ino, state->loc.path);
+        fd_bind(fd);
+    } else {
+    err: /* also entered via goto when direct-io inheritance fails */
+        /* OPEN(DIR) being an operation on inode should never fail with
+         * ENOENT. If gfid is not present, the appropriate error is
+         * ESTALE.
+         */
+        if (op_errno == ENOENT)
+            op_errno = ESTALE;
 
-        FUSE_FOP (state, fuse_attr_cbk, GF_FOP_UTIMENS,
-                  utimens, &state->loc, tv);
-}
+        gf_log("glusterfs-fuse", GF_LOG_WARNING,
+               "%" PRIu64 ": %s() %s => -1 (%s)", frame->root->unique,
+               gf_fop_list[frame->root->op], state->loc.path,
+               strerror(op_errno));
 
+        send_fuse_err(this, finh, op_errno);
+        gf_fd_put(priv->fdtable, state->fd_no);
+    }
+out:
+    free_fuse_state(state);
+    STACK_DESTROY(frame->root);
+    return 0;
+}
 
 static void
-fuse_setattr (fuse_req_t req,
-              fuse_ino_t ino,
-              struct stat *attr,
-              int valid,
-              struct fuse_file_info *fi)
+fuse_do_truncate(fuse_state_t *state)
+{ /* wind FTRUNCATE when state has an fd, else TRUNCATE on the loc; state->off is the new size */
+    if (state->fd) {
+        FUSE_FOP(state, fuse_truncate_cbk, GF_FOP_FTRUNCATE, ftruncate,
+                 state->fd, state->off, state->xdata);
+    } else {
+        FUSE_FOP(state, fuse_truncate_cbk, GF_FOP_TRUNCATE, truncate,
+                 &state->loc, state->off, state->xdata);
+    }
+
+    return;
+}
+
+static int /* (f)setattr callback: reply with the post-op attributes, or chain a truncate when one is still pending */
+fuse_setattr_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+                 int32_t op_ret, int32_t op_errno, struct iatt *statpre,
+                 struct iatt *statpost, dict_t *xdata)
 {
+    fuse_state_t *state;
+    fuse_in_header_t *finh;
+    fuse_private_t *priv = NULL;
+    struct fuse_attr_out fao;
+
+    int op_done = 0; /* set once a reply was sent here; only then is state freed below */
+
+    priv = this->private;
+    state = frame->root->state;
+    finh = state->finh;
+
+    fuse_log_eh(this,
+                "op_ret: %d, op_errno: %d, %" PRIu64
+                ", %s() %s => "
+                "gfid: %s",
+                op_ret, op_errno, frame->root->unique,
+                gf_fop_list[frame->root->op], state->loc.path,
+                state->loc.inode ? uuid_utoa(state->loc.inode->gfid) : "");
+
+    if (op_ret == 0) {
+        gf_log("glusterfs-fuse", GF_LOG_TRACE,
+               "%" PRIu64 ": %s() %s => %" PRIu64, frame->root->unique,
+               gf_fop_list[frame->root->op],
+               state->loc.path ? state->loc.path : "ERR", statpost->ia_ino);
+
+        statpost->ia_blksize = this->ctx->page_size; /* block size reported to the kernel is the context's page size */
+        gf_fuse_stat2attr(statpost, &fao.attr, priv->enable_ino32);
+
+        fao.attr_valid = calc_timeout_sec(priv->attribute_timeout);
+        fao.attr_valid_nsec = calc_timeout_nsec(priv->attribute_timeout);
+
+        if (state->truncate_needed) { /* size change requested too: no reply yet, fuse_truncate_cbk() answers */
+            fuse_do_truncate(state);
+        } else {
+#if FUSE_KERNEL_MINOR_VERSION >= 9
+            priv->proto_minor >= 9 /* negotiated minor < 9: send only FUSE_COMPAT_ATTR_OUT_SIZE bytes of fao */
+                ? send_fuse_obj(this, finh, &fao)
+                : send_fuse_data(this, finh, &fao, FUSE_COMPAT_ATTR_OUT_SIZE);
+#else
+            send_fuse_obj(this, finh, &fao);
+#endif
+            op_done = 1;
+        }
+    } else {
+        gf_log("glusterfs-fuse", GF_LOG_WARNING,
+               "%" PRIu64 ": %s() %s => -1 (%s)", frame->root->unique,
+               gf_fop_list[frame->root->op],
+               state->loc.path ? state->loc.path : "ERR", strerror(op_errno));
+
+        /* facilitate retry from VFS: without an fd, ENOENT is reported as ESTALE */
+        if ((state->fd == NULL) && (op_errno == ENOENT))
+            op_errno = ESTALE;
+
+        send_fuse_err(this, finh, op_errno);
+        op_done = 1;
+    }
+
+    if (op_done) {
+        free_fuse_state(state);
+    }
 
-        if (valid & FUSE_SET_ATTR_MODE)
-                do_chmod (req, ino, attr, fi);
-        else if (valid & (FUSE_SET_ATTR_UID | FUSE_SET_ATTR_GID))
-                do_chown (req, ino, attr, valid, fi);
-        else if (valid & FUSE_SET_ATTR_SIZE)
-                do_truncate (req, ino, attr, fi);
-        else if (valid & (FUSE_SET_ATTR_ATIME | FUSE_SET_ATTR_MTIME))
-                do_utimes (req, ino, attr);
-        else 
-                fuse_getattr (req, ino, fi);
+    STACK_DESTROY(frame->root); /* this callback's frame is destroyed on every path, even when a truncate was chained */
+
+    return 0;
 }
 
+static int32_t
+fattr_to_gf_set_attr(int32_t valid)
+{ /* translate the FUSE FATTR_* bits in 'valid' into GF_SET_ATTR_* / GF_ATTR_*_NOW bits; other bits are dropped */
+    int32_t gf_valid = 0;
 
-static int gf_fuse_xattr_enotsup_log;
+    if (valid & FATTR_MODE)
+        gf_valid |= GF_SET_ATTR_MODE;
 
-static int
-fuse_err_cbk (call_frame_t *frame,
-              void *cookie,
-              xlator_t *this,
-              int32_t op_ret,
-              int32_t op_errno)
-{
-        fuse_state_t *state = frame->root->state;
-        fuse_req_t req = state->req;
-
-        if (op_ret == 0) {
-                gf_log ("glusterfs-fuse", GF_LOG_DEBUG,
-                        "%"PRId64": %s() %s => 0", frame->root->unique,
-			gf_fop_list[frame->root->op], 
-                        state->loc.path ? state->loc.path : "ERR");
-
-                fuse_reply_err (req, 0);
+    if (valid & FATTR_UID)
+        gf_valid |= GF_SET_ATTR_UID;
+
+    if (valid & FATTR_GID)
+        gf_valid |= GF_SET_ATTR_GID;
+
+    if (valid & FATTR_ATIME)
+        gf_valid |= GF_SET_ATTR_ATIME;
+
+    if (valid & FATTR_MTIME)
+        gf_valid |= GF_SET_ATTR_MTIME;
+
+#if FUSE_KERNEL_MINOR_VERSION >= 23
+    if (valid & FATTR_CTIME) /* only compiled when the FUSE headers are at minor version 23 or later */
+        gf_valid |= GF_SET_ATTR_CTIME;
+#endif
+
+#if FUSE_KERNEL_MINOR_VERSION >= 9
+    if (valid & FATTR_ATIME_NOW) /* the *_NOW bits are only compiled at minor version 9 or later */
+        gf_valid |= GF_ATTR_ATIME_NOW;
+
+    if (valid & FATTR_MTIME_NOW)
+        gf_valid |= GF_ATTR_MTIME_NOW;
+#endif
+
+    if (valid & FATTR_SIZE)
+        gf_valid |= GF_SET_ATTR_SIZE;
+
+    return gf_valid;
+}
+
+#define FATTR_MASK                                                             \
+    (FATTR_SIZE | FATTR_UID | FATTR_GID | FATTR_ATIME | FATTR_MTIME |          \
+     FATTR_MODE) /* bits compared against FATTR_SIZE to tell a size-only request from a general setattr */
+
+void
+fuse_setattr_resume(fuse_state_t *state)
+{ /* resume step after resolution: wind (f)setattr, or only a truncate when size is the sole masked attribute */
+    if (!state->fd && !state->loc.inode) {
+        gf_log("glusterfs-fuse", GF_LOG_ERROR,
+               "%" PRIu64 ": SETATTR %" PRIu64 " (%s) resolution failed",
+               state->finh->unique, state->finh->nodeid,
+               uuid_utoa(state->resolve.gfid));
+
+        /* facilitate retry from VFS: without an fd, ENOENT is reported as ESTALE */
+        if ((state->fd == NULL) && (state->resolve.op_errno == ENOENT))
+            state->resolve.op_errno = ESTALE;
+
+        send_fuse_err(state->this, state->finh, state->resolve.op_errno);
+        free_fuse_state(state);
+        return;
+    }
+
+    gf_log("glusterfs-fuse", GF_LOG_TRACE,
+           "%" PRIu64 ": SETATTR (%" PRIu64 ")%s", state->finh->unique,
+           state->finh->nodeid, state->loc.path);
+
+#ifdef GF_TEST_FFOP
+    /* this is for calls like 'fchmod()' */
+    if (!state->fd)
+        state->fd = fd_lookup(state->loc.inode, state->finh->pid);
+#endif /* GF_TEST_FFOP */
+
+    if ((state->valid & (FATTR_MASK)) != FATTR_SIZE) { /* anything other than a bare size change goes through (f)setattr */
+        if (state->fd &&
+            !((state->valid & FATTR_ATIME) || (state->valid & FATTR_MTIME)
+#if FUSE_KERNEL_MINOR_VERSION >= 23
+              || (state->valid & FATTR_CTIME)
+#endif
+                  )) {
+            /*
+                there is no "futimes" call, so don't send
+                fsetattr if ATIME, MTIME (or CTIME) is set
+             */
+
+            FUSE_FOP(state, fuse_setattr_cbk, GF_FOP_FSETATTR, fsetattr,
+                     state->fd, &state->attr,
+                     fattr_to_gf_set_attr(state->valid), state->xdata);
         } else {
-                if (frame->root->op == GF_FOP_SETXATTR) {
-			op_ret = gf_compat_setxattr (state->dict);
-			if (op_ret == 0)
-				op_errno = 0;
-			if (op_errno == ENOTSUP) {
-				gf_fuse_xattr_enotsup_log++;
-				if (!(gf_fuse_xattr_enotsup_log % GF_UNIVERSAL_ANSWER))
-					gf_log ("glusterfs-fuse", GF_LOG_CRITICAL,
-						"[ ERROR ] Extended attribute not supported by the backend storage");
-			}
-                } else {
-			if ((frame->root->op == GF_FOP_REMOVEXATTR)
-			    && (op_errno == ENOATTR)) {
-				goto nolog;
-			}
-                        gf_log ("glusterfs-fuse", GF_LOG_ERROR,
-                                "%"PRId64": %s() %s => -1 (%s)",
-				frame->root->unique,
-				gf_fop_list[frame->root->op],
-				state->loc.path ? state->loc.path : "ERR",
-                                strerror (op_errno));
-                }
-	nolog:
+            FUSE_FOP(state, fuse_setattr_cbk, GF_FOP_SETATTR, setattr,
+                     &state->loc, &state->attr,
+                     fattr_to_gf_set_attr(state->valid), state->xdata);
+        }
+    } else {
+        fuse_do_truncate(state); /* size-only request: skip setattr entirely */
+    }
+}
 
-                fuse_reply_err (req, op_errno);
+static void
+fuse_setattr(xlator_t *this, fuse_in_header_t *finh, void *msg,
+             struct iobuf *iobuf)
+{ /* SETATTR handler: capture the requested attributes in state, then resolve and continue in fuse_setattr_resume() */
+    struct fuse_setattr_in *fsi = msg;
+
+#if FUSE_KERNEL_MINOR_VERSION >= 9
+    fuse_private_t *priv = NULL;
+#endif
+    fuse_state_t *state = NULL;
+
+    GET_STATE(this, finh, state);
+
+    if (fsi->valid & FATTR_FH && !(fsi->valid & (FATTR_ATIME | FATTR_MTIME))) {
+        /* We need no loc if kernel sent us an fd and
+         * we are not fiddling with times */
+        state->fd = FH_TO_FD(fsi->fh);
+        fuse_resolve_fd_init(state, &state->resolve, state->fd); /* NOTE(review): FATTR_CTIME is not excluded here, yet fuse_setattr_resume() sends CTIME requests via the loc - confirm */
+    } else {
+        fuse_resolve_inode_init(state, &state->resolve, finh->nodeid);
+    }
+
+    /*
+     * This is just stub code demonstrating how to retrieve
+     * lock_owner in setattr, according to the FUSE proto.
+     * We do not make use of it ATM. Its purpose is supporting
+     * mandatory locking, but getting that right is further
+     * down the road. Cf.
+     *
+     * http://thread.gmane.org/gmane.comp.file-systems.fuse.devel/
+     * 4962/focus=4982
+     *
+     * http://git.kernel.org/?p=linux/kernel/git/torvalds/
+     * linux-2.6.git;a=commit;h=v2.6.23-5896-gf333211
+     */
+#if FUSE_KERNEL_MINOR_VERSION >= 9
+    priv = this->private;
+    if (priv->proto_minor >= 9 && fsi->valid & FATTR_LOCKOWNER)
+        state->lk_owner = fsi->lock_owner;
+#endif
+
+    state->valid = fsi->valid;
+
+    if ((fsi->valid & (FATTR_MASK)) != FATTR_SIZE) {
+        if (fsi->valid & FATTR_SIZE) {
+            state->off = fsi->size;
+            state->truncate_needed = _gf_true; /* size changes along with other attributes: fuse_setattr_cbk() chains the truncate */
         }
 
-        free_state (state);
-        STACK_DESTROY (frame->root);
+        state->attr.ia_size = fsi->size; /* all fields are copied; state->valid records which ones were requested */
+        state->attr.ia_atime = fsi->atime;
+        state->attr.ia_mtime = fsi->mtime;
+#if FUSE_KERNEL_MINOR_VERSION >= 23
+        state->attr.ia_ctime = fsi->ctime;
+#endif
+        state->attr.ia_atime_nsec = fsi->atimensec;
+        state->attr.ia_mtime_nsec = fsi->mtimensec;
+#if FUSE_KERNEL_MINOR_VERSION >= 23
+        state->attr.ia_ctime_nsec = fsi->ctimensec;
+#endif
 
-        return 0;
+        state->attr.ia_prot = ia_prot_from_st_mode(fsi->mode);
+        state->attr.ia_uid = fsi->uid;
+        state->attr.ia_gid = fsi->gid;
+    } else {
+        state->off = fsi->size; /* size-only request: fuse_setattr_resume() goes straight to truncate */
+    }
+
+    fuse_resolve_and_resume(state, fuse_setattr_resume);
 }
 
+static int
+fuse_removexattr_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+                     int32_t op_ret, int32_t op_errno, dict_t *xdata)
+{ /* removexattr callback: reply 0 or the errno; an ENODATA failure is logged at DEBUG instead of WARNING */
+    fuse_state_t *state = NULL;
+    fuse_in_header_t *finh = NULL;
+
+    GF_ASSERT(frame);
+    GF_ASSERT(frame->root);
+
+    state = frame->root->state;
+    finh = state->finh;
+
+    fuse_log_eh_fop(this, state, frame, op_ret, op_errno);
 
+    if (op_ret == 0) {
+        gf_log("glusterfs-fuse", GF_LOG_TRACE, "%" PRIu64 ": %s() %s => 0",
+               frame->root->unique, gf_fop_list[frame->root->op],
+               state->loc.path ? state->loc.path : "ERR");
+
+        send_fuse_err(this, finh, 0);
+    } else {
+        gf_log("glusterfs-fuse",
+               (ENODATA == op_errno) ? GF_LOG_DEBUG : GF_LOG_WARNING,
+               "%" PRIu64 ": %s() of %s on %s => -1 (%s)", frame->root->unique,
+               gf_fop_list[frame->root->op], state->name ? state->name : "",
+               state->loc.path ? state->loc.path : "ERR", strerror(op_errno));
+
+        /* facilitate retry from VFS: without an fd, ENOENT is reported as ESTALE */
+        if ((state->fd == NULL) && (op_errno == ENOENT))
+            op_errno = ESTALE;
+
+        send_fuse_err(this, finh, op_errno);
+    }
+
+    free_fuse_state(state);
+    STACK_DESTROY(frame->root);
+
+    return 0;
+}
 
 static int
-fuse_unlink_cbk (call_frame_t *frame,
-                 void *cookie,
-                 xlator_t *this,
-                 int32_t op_ret,
-                 int32_t op_errno)
+fuse_err_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret,
+             int32_t op_errno, dict_t *xdata) /* generic status-only callback: replies 0 on success, else the errno */
 {
-        fuse_state_t *state = frame->root->state;
-        fuse_req_t req = state->req;
+    fuse_state_t *state = frame->root->state;
+    fuse_in_header_t *finh = state->finh;
+
+    fuse_log_eh_fop(this, state, frame, op_ret, op_errno);
+
+    if (op_ret == 0) {
+        gf_log("glusterfs-fuse", GF_LOG_TRACE, "%" PRIu64 ": %s() %s => 0",
+               frame->root->unique, gf_fop_list[frame->root->op],
+               state->loc.path ? state->loc.path : "ERR");
+
+        send_fuse_err(this, finh, 0);
+    } else {
+        if (GF_IGNORE_IF_GSYNCD_SAFE_ERROR(frame, op_errno)) { /* macro gates the warning; its definition is not in view here */
+            gf_log("glusterfs-fuse", GF_LOG_WARNING,
+                   "%" PRIu64 ": %s() %s => -1 (%s)", frame->root->unique,
+                   gf_fop_list[frame->root->op],
+                   state->loc.path ? state->loc.path : "ERR",
+                   strerror(op_errno));
+        }
 
-        if (op_ret == 0)
-                inode_unlink (state->loc.inode, state->loc.parent,
-			      state->loc.name);
+        /* facilitate retry from VFS: without an fd, ENOENT is reported as ESTALE */
+        if ((state->fd == NULL) && (op_errno == ENOENT))
+            op_errno = ESTALE;
 
-        if (op_ret == 0) {
-                gf_log ("glusterfs-fuse", GF_LOG_DEBUG,
-                        "%"PRId64": %s() %s => 0", frame->root->unique,
-                        gf_fop_list[frame->root->op], state->loc.path);
+        send_fuse_err(this, finh, op_errno);
+    }
 
-                fuse_reply_err (req, 0);
-        } else {
-                gf_log ("glusterfs-fuse", 
-                        (op_errno != ENOTEMPTY ? GF_LOG_ERROR : GF_LOG_DEBUG),
-                        "%"PRId64": %s() %s => -1 (%s)", frame->root->unique,
-                        gf_fop_list[frame->root->op], state->loc.path,
-			strerror (op_errno));
+    free_fuse_state(state);
+    STACK_DESTROY(frame->root);
 
-                fuse_reply_err (req, op_errno);
-        }
+    return 0;
+}
 
-        free_state (state);
-        STACK_DESTROY (frame->root);
+static int
+fuse_flush_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+               int32_t op_ret, int32_t op_errno, dict_t *xdata)
+{ /* flush callback: optionally consult the interrupt machinery, then reply via fuse_err_cbk() */
+    fuse_private_t *priv = this->private;
 
-        return 0;
+    if (priv->flush_handle_interrupt) {
+        if (fuse_interrupt_finish_fop(frame, this, _gf_false, NULL)) { /* non-zero return: nothing more is done here */
+            return 0;
+        }
+    }
+
+    return fuse_err_cbk(frame, cookie, this, op_ret, op_errno, xdata);
 }
 
+static int
+fuse_fsync_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+               int32_t op_ret, int32_t op_errno, struct iatt *prebuf,
+               struct iatt *postbuf, dict_t *xdata)
+{ /* fsync callback: prebuf/postbuf are ignored; only the status is relayed through fuse_err_cbk() */
+    return fuse_err_cbk(frame, cookie, this, op_ret, op_errno, xdata);
+}
 
-static void
-fuse_access (fuse_req_t req,
-             fuse_ino_t ino,
-             int mask)
+static int
+fuse_setxattr_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+                  int32_t op_ret, int32_t op_errno, dict_t *xdata)
 {
-        fuse_state_t *state;
-	int32_t ret = -1;
+    if (op_ret == -1 && op_errno == ENOTSUP) /* xattrs unsupported by the backend: critical log, rate-limited by the macro's counter */
+        GF_LOG_OCCASIONALLY(gf_fuse_xattr_enotsup_log, "glusterfs-fuse",
+                            GF_LOG_CRITICAL,
+                            "extended attribute not supported "
+                            "by the backend storage");
 
-        state = state_from_req (req);
+    return fuse_err_cbk(frame, cookie, this, op_ret, op_errno, xdata); /* the reply itself is the generic status reply */
+}
 
-        ret = fuse_loc_fill (&state->loc, state, ino, 0, NULL);
-        if ((state->loc.inode == NULL) || 
-	    (ret < 0)) {
-                gf_log ("glusterfs-fuse", GF_LOG_ERROR,
-                        "%"PRId64": ACCESS %"PRId64" (%s) (fuse_loc_fill() failed)", 
-                        req_callid (req), (int64_t)ino, state->loc.path);
-                fuse_reply_err (req, EINVAL);
-		free_state (state);
-                return;
+static int
+fuse_unlink_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+                int32_t op_ret, int32_t op_errno, struct iatt *preparent,
+                struct iatt *postparent, dict_t *xdata)
+{ /* unlink callback (the ENOTEMPTY case suggests rmdir uses it too - TODO confirm): update inode table, then reply */
+    fuse_state_t *state = NULL;
+    fuse_in_header_t *finh = NULL;
+
+    state = frame->root->state;
+    finh = state->finh;
+
+    fuse_log_eh(this,
+                "op_ret: %d, op_errno: %d, %" PRIu64
+                ": %s() %s => "
+                "gfid: %s",
+                op_ret, op_errno, frame->root->unique,
+                gf_fop_list[frame->root->op], state->loc.path,
+                state->loc.inode ? uuid_utoa(state->loc.inode->gfid) : "");
+
+    if (op_ret == 0) {
+        inode_unlink(state->loc.inode, state->loc.parent, state->loc.name); /* drop loc.name under loc.parent before replying */
+        gf_log("glusterfs-fuse", GF_LOG_TRACE, "%" PRIu64 ": %s() %s => 0",
+               frame->root->unique, gf_fop_list[frame->root->op],
+               state->loc.path);
+
+        send_fuse_err(this, finh, 0);
+    } else {
+        if (GF_IGNORE_IF_GSYNCD_SAFE_ERROR(frame, op_errno)) {
+            gf_log("glusterfs-fuse",
+                   op_errno == ENOTEMPTY ? GF_LOG_DEBUG : GF_LOG_WARNING,
+                   "%" PRIu64 ": %s() %s => -1 (%s)", frame->root->unique,
+                   gf_fop_list[frame->root->op], state->loc.path,
+                   strerror(op_errno));
         }
+        send_fuse_err(this, finh, op_errno); /* NOTE(review): no ENOENT->ESTALE mapping here, unlike sibling callbacks - presumably intentional */
+    }
 
-        gf_log ("glusterfs-fuse", GF_LOG_DEBUG,
-                "%"PRId64" ACCESS %s/%lu mask=%d", req_callid (req),
-                state->loc.path, ino, mask);
+    free_fuse_state(state);
+    STACK_DESTROY(frame->root);
 
-        FUSE_FOP (state, fuse_err_cbk,
-                  GF_FOP_ACCESS, access,
-		  &state->loc, mask);
+    return 0;
+}
 
+void
+fuse_access_resume(fuse_state_t *state)
+{ /* ACCESS resume step; fuse_access() hands it to fuse_resolve_and_resume() */
+    if (!state->loc.inode) {
+        gf_log("glusterfs-fuse", GF_LOG_ERROR,
+               "%" PRIu64 ": ACCESS %" PRIu64 " (%s) resolution failed",
+               state->finh->unique, state->finh->nodeid,
+               uuid_utoa(state->resolve.gfid));
+
+        /* facilitate retry from VFS */
+        if (state->resolve.op_errno == ENOENT)
+            state->resolve.op_errno = ESTALE;
+        /* fail the request with the (possibly rewritten) resolver errno */
+        send_fuse_err(state->this, state->finh, state->resolve.op_errno);
+        free_fuse_state(state);
         return;
-}
+    }
 
+    gf_log("glusterfs-fuse", GF_LOG_TRACE,
+           "%" PRIu64 " ACCESS %s/%" PRIu64 " mask=%d", state->finh->unique,
+           state->loc.path, state->finh->nodeid, state->mask);
 
+    FUSE_FOP(state, fuse_err_cbk, GF_FOP_ACCESS, access, &state->loc,
+             state->mask, state->xdata);
+}
 
-static int
-fuse_readlink_cbk (call_frame_t *frame,
-                   void *cookie,
-                   xlator_t *this,
-                   int32_t op_ret,
-                   int32_t op_errno,
-                   const char *linkname)
+static void
+fuse_access(xlator_t *this, fuse_in_header_t *finh, void *msg,
+            struct iobuf *iobuf)
 {
-        fuse_state_t *state = frame->root->state;
-        fuse_req_t req = state->req;
+    struct fuse_access_in *fai = msg;
+    fuse_state_t *state = NULL;
 
-        if (op_ret > 0) {
-                ((char *)linkname)[op_ret] = '\0';
+    GET_STATE(this, finh, state);
 
-                gf_log ("glusterfs-fuse", GF_LOG_DEBUG,
-                        "%"PRId64": %s => %s", frame->root->unique,
-                        state->loc.path, linkname);
+    fuse_resolve_inode_init(state, &state->resolve, finh->nodeid);
 
-                fuse_reply_readlink(req, linkname);
-        } else {
-                gf_log ("glusterfs-fuse", GF_LOG_ERROR,
-                        "%"PRId64": %s => -1 (%s)", frame->root->unique,
-                        state->loc.path, strerror(op_errno));
+    state->mask = fai->mask;
 
-                fuse_reply_err(req, op_errno);
-        }
+    fuse_resolve_and_resume(state, fuse_access_resume);
 
-        free_state (state);
-        STACK_DESTROY (frame->root);
+    return;
+}
 
-        return 0;
+/* READLINK callback: log the result, answer the request with the link
+ * target (op_ret bytes) or an errno, then free the state and the stack. */
+static int
+fuse_readlink_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+                  int32_t op_ret, int32_t op_errno, const char *linkname,
+                  struct iatt *buf, dict_t *xdata)
+{
+    fuse_state_t *state = frame->root->state;
+    fuse_in_header_t *finh = state->finh;
+
+    /* loc.gfid is a raw uuid, not a C string: feed "%s" the path instead, and
+     * never pass a NULL linkname (it is only relied on when op_ret > 0). */
+    fuse_log_eh(this,
+                "op_ret: %d, op_errno: %d %" PRIu64 ": %s() => %s"
+                " linkname: %s, gfid: %s",
+                op_ret, op_errno, frame->root->unique,
+                gf_fop_list[frame->root->op], state->loc.path,
+                linkname ? linkname : "", uuid_utoa(state->loc.gfid));
+
+    if (op_ret > 0) {
+        gf_log("glusterfs-fuse", GF_LOG_TRACE,
+               "%" PRIu64 ": %s => %s (size:%d)", frame->root->unique,
+               state->loc.path, linkname, op_ret);
+        send_fuse_data(this, finh, (void *)linkname, op_ret);
+    } else {
+        /* facilitate retry from VFS */
+        if (op_errno == ENOENT)
+            op_errno = ESTALE;
+
+        gf_log("glusterfs-fuse", GF_LOG_WARNING, "%" PRIu64 ": %s => -1 (%s)",
+               frame->root->unique, state->loc.path, strerror(op_errno));
+
+        send_fuse_err(this, finh, op_errno);
+    }
+
+    free_fuse_state(state);
+    STACK_DESTROY(frame->root);
+
+    return 0;
 }
 
+void
+fuse_readlink_resume(fuse_state_t *state)
+{
+    if (!state->loc.inode) {
+        gf_log("glusterfs-fuse", GF_LOG_ERROR,
+               "READLINK %" PRIu64 " (%s) resolution failed",
+               state->finh->unique, uuid_utoa(state->resolve.gfid));
 
-static void
-fuse_readlink (fuse_req_t req,
-               fuse_ino_t ino)
-{
-        fuse_state_t *state;
-	int32_t ret = -1;
-
-        state = state_from_req (req);
-        ret = fuse_loc_fill (&state->loc, state, ino, 0, NULL);
-        if ((state->loc.inode == NULL) ||
-	    (ret < 0)) {
-                gf_log ("glusterfs-fuse", GF_LOG_ERROR,
-                        "%"PRId64" READLINK %s/%"PRId64" (fuse_loc_fill() returned NULL inode)", 
-                        req_callid (req), state->loc.path,
-			state->loc.inode->ino);
-                fuse_reply_err (req, EINVAL);
-		free_state (state);
-                return;
-        }
-  
-        gf_log ("glusterfs-fuse", GF_LOG_DEBUG,
-                "%"PRId64" READLINK %s/%"PRId64, req_callid (req),
-                state->loc.path, state->loc.inode->ino);
-
-        FUSE_FOP (state, fuse_readlink_cbk, GF_FOP_READLINK,
-		  readlink, &state->loc, 4096);
+        /* facilitate retry from VFS */
+        if (state->resolve.op_errno == ENOENT)
+            state->resolve.op_errno = ESTALE;
 
+        send_fuse_err(state->this, state->finh, state->resolve.op_errno);
+        free_fuse_state(state);
         return;
-}
+    }
 
+    gf_log("glusterfs-fuse", GF_LOG_TRACE, "%" PRIu64 " READLINK %s/%s",
+           state->finh->unique, state->loc.path,
+           uuid_utoa(state->loc.inode->gfid));
+
+    FUSE_FOP(state, fuse_readlink_cbk, GF_FOP_READLINK, readlink, &state->loc,
+             4096, state->xdata);
+}
 
 static void
-fuse_mknod (fuse_req_t req,
-            fuse_ino_t par,
-            const char *name,
-            mode_t mode,
-            dev_t rdev)
-{
-        fuse_state_t *state;
-	int32_t ret = -1;
-
-        state = state_from_req (req);
-        ret = fuse_loc_fill (&state->loc, state, 0, par, name);
-	if (ret < 0) {
-		gf_log ("glusterfs-fuse", GF_LOG_ERROR,
-                        "%"PRId64" MKNOD %s (fuse_loc_fill() failed)", 
-                        req_callid (req), state->loc.path);
-                fuse_reply_err (req, EINVAL);
-		free_state (state);
-                return;
-	}
-
-        state->loc.inode = inode_new (state->itable);
-
-        gf_log ("glusterfs-fuse", GF_LOG_DEBUG,
-                "%"PRId64": MKNOD %s", req_callid (req),
-                state->loc.path);
-
-        FUSE_FOP (state, fuse_entry_cbk, GF_FOP_MKNOD,
-		  mknod, &state->loc, mode, rdev);
+fuse_readlink(xlator_t *this, fuse_in_header_t *finh, void *msg,
+              struct iobuf *iobuf)
+{
+    fuse_state_t *state = NULL;
 
-        return;
+    GET_STATE(this, finh, state);
+
+    fuse_resolve_inode_init(state, &state->resolve, finh->nodeid);
+
+    fuse_resolve_and_resume(state, fuse_readlink_resume);
+
+    return;
 }
 
+void
+fuse_mknod_resume(fuse_state_t *state)
+{ /* MKNOD resume step; fuse_mknod() hands it to fuse_resolve_and_resume() */
+    if (!state->loc.parent) {
+        gf_log("glusterfs-fuse", GF_LOG_ERROR,
+               "MKNOD %" PRIu64 "/%s (%s/%s) resolution failed",
+               state->finh->nodeid, state->resolve.bname,
+               uuid_utoa(state->resolve.gfid), state->resolve.bname);
+
+        /* facilitate retry from VFS */
+        if (state->resolve.op_errno == ENOENT)
+            state->resolve.op_errno = ESTALE;
+
+        send_fuse_err(state->this, state->finh, state->resolve.op_errno);
+        free_fuse_state(state);
+        return;
+    }
+    /* parent resolved: clear a leftover ENOENT (presumably for the new name) */
+    if (state->resolve.op_errno == ENOENT) {
+        state->resolve.op_ret = 0;
+        state->resolve.op_errno = 0;
+    }
+    /* drop any inode already attached to loc; a fresh one is created below */
+    if (state->loc.inode) {
+        gf_log(state->this->name, GF_LOG_DEBUG, "inode already present");
+        inode_unref(state->loc.inode);
+        state->loc.inode = NULL;
+    }
+    /* the new inode is allocated from the parent's inode table */
+    state->loc.inode = inode_new(state->loc.parent->table);
+
+    gf_log("glusterfs-fuse", GF_LOG_TRACE, "%" PRIu64 ": MKNOD %s",
+           state->finh->unique, state->loc.path);
+
+    FUSE_FOP(state, fuse_newentry_cbk, GF_FOP_MKNOD, mknod, &state->loc,
+             state->mode, state->rdev, state->umask, state->xdata);
+}
 
-static void 
-fuse_mkdir (fuse_req_t req,
-            fuse_ino_t par,
-            const char *name,
-            mode_t mode)
+static void
+fuse_mknod(xlator_t *this, fuse_in_header_t *finh, void *msg,
+           struct iobuf *iobuf)
 {
-        fuse_state_t *state;
-	int32_t ret = -1;
+    struct fuse_mknod_in *fmi = msg;
+    char *name = (char *)(fmi + 1);
 
-        state = state_from_req (req);
-        ret = fuse_loc_fill (&state->loc, state, 0, par, name);
-	if (ret < 0) {
-		gf_log ("glusterfs-fuse", GF_LOG_ERROR,
-                        "%"PRId64" MKDIR %s (fuse_loc_fill() failed)", 
-                        req_callid (req), state->loc.path);
-                fuse_reply_err (req, EINVAL);
-		free_state (state);
-                return;
-	}
+    fuse_state_t *state = NULL;
+#if FUSE_KERNEL_MINOR_VERSION >= 12
+    fuse_private_t *priv = NULL;
 
-        state->loc.inode = inode_new (state->itable);
+    priv = this->private;
+    if (priv->proto_minor < 12)
+        name = (char *)msg + FUSE_COMPAT_MKNOD_IN_SIZE;
+#endif
 
-        gf_log ("glusterfs-fuse", GF_LOG_DEBUG,
-                "%"PRId64": MKDIR %s", req_callid (req),
-                state->loc.path);
+    GET_STATE(this, finh, state);
 
-        FUSE_FOP (state, fuse_entry_cbk, GF_FOP_MKDIR,
-		  mkdir, &state->loc, mode);
+    gf_uuid_generate(state->gfid);
 
-        return;
+    fuse_resolve_entry_init(state, &state->resolve, finh->nodeid, name);
+
+    state->mode = fmi->mode;
+    state->rdev = fmi->rdev;
+
+#if FUSE_KERNEL_MINOR_VERSION >= 12
+    priv = this->private;
+    FUSE_ENTRY_CREATE(this, priv, finh, state, fmi, "MKNOD");
+#endif
+
+    fuse_resolve_and_resume(state, fuse_mknod_resume);
+
+    return;
 }
 
+void
+fuse_mkdir_resume(fuse_state_t *state)
+{
+    if (!state->loc.parent) {
+        gf_log("glusterfs-fuse", GF_LOG_ERROR,
+               "MKDIR %" PRIu64 " (%s/%s) resolution failed",
+               state->finh->nodeid, uuid_utoa(state->resolve.gfid),
+               state->resolve.bname);
+
+        /* facilitate retry from VFS */
+        if (state->resolve.op_errno == ENOENT)
+            state->resolve.op_errno = ESTALE;
+
+        send_fuse_err(state->this, state->finh, state->resolve.op_errno);
+        free_fuse_state(state);
+        return;
+    }
+
+    if (state->resolve.op_errno == ENOENT) {
+        state->resolve.op_ret = 0;
+        state->resolve.op_errno = 0;
+    }
+
+    if (state->loc.inode) {
+        gf_log(state->this->name, GF_LOG_DEBUG, "inode already present");
+        inode_unref(state->loc.inode);
+        state->loc.inode = NULL;
+    }
+
+    state->loc.inode = inode_new(state->loc.parent->table);
 
-static void 
-fuse_unlink (fuse_req_t req,
-             fuse_ino_t par,
-             const char *name)
+    gf_log("glusterfs-fuse", GF_LOG_TRACE, "%" PRIu64 ": MKDIR %s",
+           state->finh->unique, state->loc.path);
+
+    FUSE_FOP(state, fuse_newentry_cbk, GF_FOP_MKDIR, mkdir, &state->loc,
+             state->mode, state->umask, state->xdata);
+}
+
+static void
+fuse_mkdir(xlator_t *this, fuse_in_header_t *finh, void *msg,
+           struct iobuf *iobuf)
 {
-        fuse_state_t *state;
-	int32_t ret = -1;
+    struct fuse_mkdir_in *fmi = msg;
+    char *name = (char *)(fmi + 1);
+#if FUSE_KERNEL_MINOR_VERSION >= 12
+    fuse_private_t *priv = NULL;
+#endif
 
-        state = state_from_req (req);
+    fuse_state_t *state;
 
-        ret = fuse_loc_fill (&state->loc, state, 0, par, name);
+    GET_STATE(this, finh, state);
 
-        if ((state->loc.inode == NULL) ||
-	    (ret < 0)) {
-                gf_log ("glusterfs-fuse", GF_LOG_ERROR,
-                        "%"PRId64": UNLINK %s (fuse_loc_fill() returned NULL inode)",
-                        req_callid (req), state->loc.path);
-                fuse_reply_err (req, EINVAL);
-		free_state (state);
-                return;
-        }
+    gf_uuid_generate(state->gfid);
 
-        gf_log ("glusterfs-fuse", GF_LOG_DEBUG,
-                "%"PRId64": UNLINK %s", req_callid (req),
-                state->loc.path);
+    fuse_resolve_entry_init(state, &state->resolve, finh->nodeid, name);
 
-        FUSE_FOP (state, fuse_unlink_cbk, GF_FOP_UNLINK,
-		  unlink, &state->loc);
+    state->mode = fmi->mode;
 
-        return;
+#if FUSE_KERNEL_MINOR_VERSION >= 12
+    priv = this->private;
+    FUSE_ENTRY_CREATE(this, priv, finh, state, fmi, "MKDIR");
+#endif
+
+    fuse_resolve_and_resume(state, fuse_mkdir_resume);
+
+    return;
 }
 
+void
+fuse_unlink_resume(fuse_state_t *state)
+{ /* UNLINK resume step; fuse_unlink() hands it to fuse_resolve_and_resume() */
+    if (!state->loc.parent || !state->loc.inode) {
+        gf_log("glusterfs-fuse", GF_LOG_ERROR,
+               "UNLINK %" PRIu64 " (%s/%s) resolution failed",
+               state->finh->nodeid, uuid_utoa(state->resolve.gfid),
+               state->resolve.bname);
+        send_fuse_err(state->this, state->finh, state->resolve.op_errno);
+        free_fuse_state(state);
+        return;
+    }
+    /* both the parent and the entry's own inode are present from here on */
+    gf_log("glusterfs-fuse", GF_LOG_TRACE, "%" PRIu64 ": UNLINK %s",
+           state->finh->unique, state->loc.path);
+    /* fuse_unlink_cbk is the callback handed to FUSE_FOP for this request */
+    FUSE_FOP(state, fuse_unlink_cbk, GF_FOP_UNLINK, unlink, &state->loc, 0,
+             state->xdata);
+}
 
-static void 
-fuse_rmdir (fuse_req_t req,
-            fuse_ino_t par,
-            const char *name)
+static void
+fuse_unlink(xlator_t *this, fuse_in_header_t *finh, void *msg,
+            struct iobuf *iobuf)
 {
-        fuse_state_t *state;
-	int32_t ret = -1;
+    char *name = msg;
+    fuse_state_t *state = NULL;
 
-        state = state_from_req (req);
-        ret = fuse_loc_fill (&state->loc, state, 0, par, name);
-        if ((state->loc.inode == NULL) ||
-	    (ret < 0)) {
-                gf_log ("glusterfs-fuse", GF_LOG_DEBUG,
-                        "%"PRId64": RMDIR %s (fuse_loc_fill() failed)",
-                        req_callid (req), state->loc.path);
-                fuse_reply_err (req, EINVAL);
-		free_state (state);
-                return;
-        }
+    GET_STATE(this, finh, state);
 
-        gf_log ("glusterfs-fuse", GF_LOG_DEBUG,
-                "%"PRId64": RMDIR %s", req_callid (req),
-                state->loc.path);
+    fuse_resolve_entry_init(state, &state->resolve, finh->nodeid, name);
 
-        FUSE_FOP (state, fuse_unlink_cbk, GF_FOP_RMDIR,
-		  rmdir, &state->loc);
+    fuse_resolve_and_resume(state, fuse_unlink_resume);
 
-        return;
+    return;
 }
 
+void
+fuse_rmdir_resume(fuse_state_t *state)
+{ /* RMDIR resume step; fuse_rmdir() hands it to fuse_resolve_and_resume() */
+    if (!state->loc.parent || !state->loc.inode) {
+        gf_log("glusterfs-fuse", GF_LOG_ERROR,
+               "RMDIR %" PRIu64 " (%s/%s) resolution failed",
+               state->finh->nodeid, uuid_utoa(state->resolve.gfid),
+               state->resolve.bname);
+        send_fuse_err(state->this, state->finh, state->resolve.op_errno);
+        free_fuse_state(state);
+        return;
+    }
+    /* both the parent and the directory's own inode are present from here on */
+    gf_log("glusterfs-fuse", GF_LOG_TRACE, "%" PRIu64 ": RMDIR %s",
+           state->finh->unique, state->loc.path);
+    /* RMDIR passes fuse_unlink_cbk, the same callback UNLINK uses, to FUSE_FOP */
+    FUSE_FOP(state, fuse_unlink_cbk, GF_FOP_RMDIR, rmdir, &state->loc, 0,
+             state->xdata);
+}
 
 static void
-fuse_symlink (fuse_req_t req,
-              const char *linkname,
-              fuse_ino_t par,
-              const char *name)
-{
-        fuse_state_t *state;
-	int32_t ret = -1;
-
-        state = state_from_req (req);
-        ret = fuse_loc_fill (&state->loc, state, 0, par, name);
-	if (ret < 0) {
-		gf_log ("glusterfs-fuse", GF_LOG_ERROR,
-                        "%"PRId64" SYMLINK %s -> %s (fuse_loc_fill() failed)", 
-                        req_callid (req), state->loc.path, linkname);
-                fuse_reply_err (req, EINVAL);
-		free_state (state);
-                return;
-	}
-
-        state->loc.inode = inode_new (state->itable);
-
-        gf_log ("glusterfs-fuse", GF_LOG_DEBUG,
-                "%"PRId64": SYMLINK %s -> %s", req_callid (req),
-                state->loc.path, linkname);
-
-        FUSE_FOP (state, fuse_entry_cbk, GF_FOP_SYMLINK,
-                  symlink, linkname, &state->loc);
+fuse_rmdir(xlator_t *this, fuse_in_header_t *finh, void *msg,
+           struct iobuf *iobuf)
+{
+    char *name = msg;
+    fuse_state_t *state = NULL;
 
-        return;
+    GET_STATE(this, finh, state);
+
+    fuse_resolve_entry_init(state, &state->resolve, finh->nodeid, name);
+
+    fuse_resolve_and_resume(state, fuse_rmdir_resume);
+
+    return;
 }
 
+void
+fuse_symlink_resume(fuse_state_t *state)
+{
+    if (!state->loc.parent) {
+        gf_log("glusterfs-fuse", GF_LOG_ERROR,
+               "SYMLINK %" PRIu64 " (%s/%s) -> %s resolution failed",
+               state->finh->nodeid, uuid_utoa(state->resolve.gfid),
+               state->resolve.bname, state->name);
+
+        /* facilitate retry from VFS */
+        if (state->resolve.op_errno == ENOENT)
+            state->resolve.op_errno = ESTALE;
+
+        send_fuse_err(state->this, state->finh, state->resolve.op_errno);
+        free_fuse_state(state);
+        return;
+    }
+
+    if (state->resolve.op_errno == ENOENT) {
+        state->resolve.op_ret = 0;
+        state->resolve.op_errno = 0;
+    }
+
+    if (state->loc.inode) {
+        gf_log(state->this->name, GF_LOG_DEBUG, "inode already present");
+        inode_unref(state->loc.inode);
+        state->loc.inode = NULL;
+    }
 
-int 
-fuse_rename_cbk (call_frame_t *frame,
-                 void *cookie,
-                 xlator_t *this,
-                 int32_t op_ret,
-                 int32_t op_errno,
-                 struct stat *buf)
+    state->loc.inode = inode_new(state->loc.parent->table);
+
+    gf_log("glusterfs-fuse", GF_LOG_TRACE, "%" PRIu64 ": SYMLINK %s -> %s",
+           state->finh->unique, state->loc.path, state->name);
+
+    FUSE_FOP(state, fuse_newentry_cbk, GF_FOP_SYMLINK, symlink, state->name,
+             &state->loc, state->umask, state->xdata);
+}
+
+static void
+fuse_symlink(xlator_t *this, fuse_in_header_t *finh, void *msg,
+             struct iobuf *iobuf)
 {
-        fuse_state_t *state = frame->root->state;
-        fuse_req_t req = state->req;
+    char *name = msg;
+    char *linkname = name + strlen(name) + 1;
+    fuse_state_t *state = NULL;
 
-        if (op_ret == 0) {
-                gf_log ("glusterfs-fuse", GF_LOG_DEBUG,
-                        "%"PRId64": %s -> %s => 0 (buf->st_ino=%"PRId64" , loc->ino=%"PRId64")", 
-			frame->root->unique, state->loc.path, state->loc2.path,
-			buf->st_ino, state->loc.ino);
+    GET_STATE(this, finh, state);
 
-                {
-                        /* ugly ugly - to stay blind to situation where
-                           rename happens on a new inode
-                        */
-                        buf->st_ino = state->loc.ino;
-			buf->st_mode = state->loc.inode->st_mode;
-                }
-                inode_rename (state->itable,
-                              state->loc.parent, state->loc.name,
-                              state->loc2.parent, state->loc2.name,
-                              state->loc.inode, buf);
+    gf_uuid_generate(state->gfid);
 
-                fuse_reply_err (req, 0);
-        } else {
-                gf_log ("glusterfs-fuse",
-			(op_errno == ENOTEMPTY ? GF_LOG_DEBUG : GF_LOG_ERROR),
-                        "%"PRId64": %s -> %s => -1 (%s)", frame->root->unique,
-                        state->loc.path, state->loc2.path,
-			strerror (op_errno));
-                fuse_reply_err (req, op_errno);
-        }
+    fuse_resolve_entry_init(state, &state->resolve, finh->nodeid, name);
 
-        free_state (state);
-        STACK_DESTROY (frame->root);
-        return 0;
+    state->name = gf_strdup(linkname);
+
+    fuse_resolve_and_resume(state, fuse_symlink_resume);
+
+    return;
 }
 
+int
+fuse_rename_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+                int32_t op_ret, int32_t op_errno, struct iatt *buf,
+                struct iatt *preoldparent, struct iatt *postoldparent,
+                struct iatt *prenewparent, struct iatt *postnewparent,
+                dict_t *xdata)
+{
+    fuse_state_t *state = NULL;
+    fuse_in_header_t *finh = NULL;
+    char loc_uuid_str[64] = {0}, loc2_uuid_str[64] = {0};
+
+    state = frame->root->state;
+    finh = state->finh;
+
+    fuse_log_eh(
+        this,
+        "op_ret: %d, op_errno: %d, %" PRIu64
+        ": %s() "
+        "path: %s parent: %s ==> path: %s parent: %s"
+        "gfid: %s",
+        op_ret, op_errno, frame->root->unique, gf_fop_list[frame->root->op],
+        state->loc.path,
+        (state->loc.parent ? uuid_utoa_r(state->loc.parent->gfid, loc_uuid_str)
+                           : ""),
+        state->loc2.path,
+        (state->loc2.parent
+             ? uuid_utoa_r(state->loc2.parent->gfid, loc2_uuid_str)
+             : ""),
+        state->loc.inode ? uuid_utoa(state->loc.inode->gfid) : "");
+
+    /* need to check for loc->parent to keep clang-scan happy.
+       It gets dereferenced below, and is checked for NULL above. */
+    if ((op_ret == 0) && (state->loc.parent) && (state->loc.inode)) {
+        gf_log("glusterfs-fuse", GF_LOG_TRACE,
+               "%" PRIu64 ": %s -> %s => 0 (buf->ia_ino=%" PRIu64 ")",
+               frame->root->unique, state->loc.path, state->loc2.path,
+               buf->ia_ino);
 
-static void
-fuse_rename (fuse_req_t req,
-             fuse_ino_t oldpar,
-             const char *oldname,
-             fuse_ino_t newpar,
-             const char *newname)
-{
-        fuse_state_t *state;
-	int32_t ret = -1;
-
-        state = state_from_req (req);
-
-        ret = fuse_loc_fill (&state->loc, state, 0, oldpar, oldname);
-        if ((state->loc.inode == NULL) ||
-	    (ret < 0)) {
-                gf_log ("glusterfs-fuse", GF_LOG_ERROR,
-                        "for %s %"PRId64": RENAME `%s' -> `%s' (fuse_loc_fill() failed)",
-                        state->loc.path, req_callid (req), state->loc.path,
-                        state->loc2.path);
-    
-                fuse_reply_err (req, EINVAL);
-		free_state (state);
-                return;
-        }
-
-        ret = fuse_loc_fill (&state->loc2, state, 0, newpar, newname);
-	if (ret < 0) {
-		gf_log ("glusterfs-fuse", GF_LOG_ERROR,
-                        "for %s %"PRId64": RENAME `%s' -> `%s' (fuse_loc_fill() failed)",
-                        state->loc.path, req_callid (req), state->loc.path,
-                        state->loc2.path);
-    
-                fuse_reply_err (req, EINVAL);
-		free_state (state);
-                return;
-       	}
-
-        gf_log ("glusterfs-fuse", GF_LOG_DEBUG,
-                "%"PRId64": RENAME `%s (%"PRId64")' -> `%s (%"PRId64")'",
-                req_callid (req), state->loc.path, state->loc.ino,
-                state->loc2.path, state->loc2.ino);
-
-        FUSE_FOP (state, fuse_rename_cbk, GF_FOP_RENAME,
-                  rename, &state->loc, &state->loc2);
+        {
+            /* ugly ugly - to stay blind to situation where
+               rename happens on a new inode
+            */
+            buf->ia_type = state->loc.inode->ia_type;
+        }
+        buf->ia_blksize = this->ctx->page_size;
+
+        inode_rename(state->loc.parent->table, state->loc.parent,
+                     state->loc.name, state->loc2.parent, state->loc2.name,
+                     state->loc.inode, buf);
+
+        send_fuse_err(this, finh, 0);
+    } else {
+        gf_log("glusterfs-fuse", GF_LOG_WARNING,
+               "%" PRIu64 ": %s -> %s => -1 (%s)", frame->root->unique,
+               state->loc.path, state->loc2.path, strerror(op_errno));
+        send_fuse_err(this, finh, op_errno);
+    }
+
+    free_fuse_state(state);
+    STACK_DESTROY(frame->root);
+    return 0;
+}
 
+/* RENAME resume step; fuse_rename() hands it to fuse_resolve_and_resume(). */
+void
+fuse_rename_resume(fuse_state_t *state)
+{
+    char loc_uuid[64] = {0}, loc2_uuid[64] = {0};
+
+    if (!state->loc.parent || !state->loc.inode) {
+        gf_log("glusterfs-fuse", GF_LOG_ERROR,
+               "RENAME %" PRIu64 " %s/%s -> %s/%s src resolution failed",
+               state->finh->unique, uuid_utoa_r(state->resolve.gfid, loc_uuid),
+               state->resolve.bname,
+               uuid_utoa_r(state->resolve2.gfid, loc2_uuid),
+               state->resolve2.bname);
+
+        /* facilitate retry from VFS */
+        if ((!state->loc.inode) && (state->resolve.op_errno == ENOENT))
+            state->resolve.op_errno = ESTALE;
+
+        send_fuse_err(state->this, state->finh, state->resolve.op_errno);
+        free_fuse_state(state);
         return;
-}
+    }
+
+    if (!state->loc2.parent) {
+        gf_log("glusterfs-fuse", GF_LOG_ERROR,
+               "RENAME %" PRIu64 " %s/%s -> %s/%s dst resolution failed",
+               state->finh->unique, uuid_utoa_r(state->resolve.gfid, loc_uuid),
+               state->resolve.bname,
+               uuid_utoa_r(state->resolve2.gfid, loc2_uuid),
+               state->resolve2.bname);
+
+        send_fuse_err(state->this, state->finh, ESTALE);
+        free_fuse_state(state);
+        return;
+    }
+
+    state->resolve.op_ret = 0;
+    state->resolve2.op_ret = 0;
 
+    /* fill the gfid strings: they are otherwise only set on the error paths */
+    uuid_utoa_r(state->loc.inode->gfid, loc_uuid);
+    if (state->loc2.inode)
+        uuid_utoa_r(state->loc2.inode->gfid, loc2_uuid);
+    gf_log("glusterfs-fuse", GF_LOG_TRACE,
+           "%" PRIu64 ": RENAME `%s (%s)' -> `%s (%s)'", state->finh->unique,
+           state->loc.path, loc_uuid, state->loc2.path, loc2_uuid);
+
+    FUSE_FOP(state, fuse_rename_cbk, GF_FOP_RENAME, rename, &state->loc,
+             &state->loc2, state->xdata);
+}
 
 static void
-fuse_link (fuse_req_t req,
-           fuse_ino_t ino,
-           fuse_ino_t par,
-           const char *name)
+fuse_rename(xlator_t *this, fuse_in_header_t *finh, void *msg,
+            struct iobuf *iobuf)
 {
-        fuse_state_t *state;
-	int32_t ret = -1;
+    struct fuse_rename_in *fri = msg;
+    char *oldname = (char *)(fri + 1);
+    char *newname = oldname + strlen(oldname) + 1;
+    fuse_state_t *state = NULL;
 
-        state = state_from_req (req);
+    GET_STATE(this, finh, state);
 
-        ret = fuse_loc_fill (&state->loc, state, 0, par, name);
-        ret = fuse_loc_fill (&state->loc2, state, ino, 0, NULL);
+    fuse_resolve_entry_init(state, &state->resolve, finh->nodeid, oldname);
 
-        if ((state->loc2.inode == NULL) ||
-	    (ret < 0)) {
-                gf_log ("glusterfs-fuse", GF_LOG_ERROR,
-                        "fuse_loc_fill() failed for %s %"PRId64": LINK %s %s", 
-                        state->loc2.path, req_callid (req), 
-                        state->loc2.path, state->loc.path);
-                fuse_reply_err (req, EINVAL);
-		free_state (state);
-                return;
-        }
+    fuse_resolve_entry_init(state, &state->resolve2, fri->newdir, newname);
 
-        state->loc.inode = inode_ref (state->loc2.inode);
-        gf_log ("glusterfs-fuse", GF_LOG_DEBUG,
-                "%"PRId64": LINK() %s (%"PRId64") -> %s (%"PRId64")",
-		req_callid (req), state->loc2.path, state->loc2.ino,
-		state->loc.path, state->loc.ino);
+    fuse_resolve_and_resume(state, fuse_rename_resume);
+
+    return;
+}
+
+void
+fuse_link_resume(fuse_state_t *state)
+{
+    if (!state->loc2.inode || !state->loc.parent) {
+        gf_log("glusterfs-fuse", GF_LOG_WARNING,
+               "fuse_loc_fill() failed %" PRIu64 ": LINK %s %s",
+               state->finh->unique, state->loc2.path, state->loc.path);
 
-        FUSE_FOP (state, fuse_entry_cbk, GF_FOP_LINK,
-                  link, &state->loc2, &state->loc);
+        /* facilitate retry from VFS */
+        if (!state->loc2.inode && (state->resolve.op_errno == ENOENT))
+            state->resolve.op_errno = ESTALE;
 
+        send_fuse_err(state->this, state->finh, state->resolve.op_errno);
+        free_fuse_state(state);
         return;
+    }
+
+    state->resolve.op_ret = 0;
+    state->resolve2.op_ret = 0;
+
+    if (state->loc.inode) {
+        inode_unref(state->loc.inode);
+        state->loc.inode = NULL;
+    }
+    state->loc.inode = inode_ref(state->loc2.inode);
+
+    gf_log("glusterfs-fuse", GF_LOG_TRACE, "%" PRIu64 ": LINK() %s -> %s",
+           state->finh->unique, state->loc2.path, state->loc.path);
+
+    FUSE_FOP(state, fuse_newentry_cbk, GF_FOP_LINK, link, &state->loc2,
+             &state->loc, state->xdata);
 }
 
+static void
+fuse_link(xlator_t *this, fuse_in_header_t *finh, void *msg,
+          struct iobuf *iobuf)
+{ /* LINK request handler: set up both resolutions, then resolve and resume */
+    struct fuse_link_in *fli = msg;
+    char *name = (char *)(fli + 1); /* name starts right after the struct */
+    fuse_state_t *state = NULL;
+
+    GET_STATE(this, finh, state);
+    /* resolve2: the inode identified by fli->oldnodeid */
+    fuse_resolve_inode_init(state, &state->resolve2, fli->oldnodeid);
+    /* resolve: the entry `name' under the request's nodeid */
+    fuse_resolve_entry_init(state, &state->resolve, finh->nodeid, name);
+    /* fuse_link_resume() is the continuation passed to the resolver */
+    fuse_resolve_and_resume(state, fuse_link_resume);
+
+    return;
+}
 
 static int
-fuse_create_cbk (call_frame_t *frame,
-                 void *cookie,
-                 xlator_t *this,
-                 int32_t op_ret,
-                 int32_t op_errno,
-                 fd_t *fd,
-                 inode_t *inode,
-                 struct stat *buf)
+fuse_create_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+                int32_t op_ret, int32_t op_errno, fd_t *fd, inode_t *inode,
+                struct iatt *buf, struct iatt *preparent,
+                struct iatt *postparent, dict_t *xdata)
 {
-        fuse_state_t *state = frame->root->state;
-        fuse_req_t req = state->req;
-        fuse_private_t *priv = this->private;
+    fuse_state_t *state = NULL;
+    fuse_in_header_t *finh = NULL;
+    fuse_private_t *priv = NULL;
+    struct fuse_out_header fouh = {
+        0,
+    };
+    struct fuse_entry_out feo = {
+        0,
+    };
+    struct fuse_open_out foo = {
+        0,
+    };
+    struct iovec iov_out[3];
+    inode_t *linked_inode = NULL;
+    uint64_t ctx_value = LOOKUP_NOT_NEEDED;
+
+    state = frame->root->state;
+    priv = this->private;
+    finh = state->finh;
+    foo.open_flags = 0;
+
+    fuse_log_eh_fop(this, state, frame, op_ret, op_errno);
+
+    if (op_ret >= 0) {
+        foo.fh = (uintptr_t)fd;
+
+        if (((priv->direct_io_mode == 2) &&
+             ((state->flags & O_ACCMODE) != O_RDONLY)) ||
+            (priv->direct_io_mode == 1) || direct_io_mode(xdata))
+            foo.open_flags |= FOPEN_DIRECT_IO;
+
+        gf_log("glusterfs-fuse", GF_LOG_TRACE,
+               "%" PRIu64 ": %s() %s => %p (ino=%" PRIu64 ")",
+               frame->root->unique, gf_fop_list[frame->root->op],
+               state->loc.path, fd, buf->ia_ino);
+
+        buf->ia_blksize = this->ctx->page_size;
+        gf_fuse_stat2attr(buf, &feo.attr, priv->enable_ino32);
+
+        linked_inode = inode_link(inode, state->loc.parent, state->loc.name,
+                                  buf);
+
+        if (linked_inode != inode) {
+            /*
+               VERY racy code (if used anywhere else)
+               -- don't do this without understanding
+            */
+            inode_unref(fd->inode);
+            fd->inode = inode_ref(linked_inode);
+        } else {
+            inode_ctx_set(linked_inode, this, &ctx_value);
+        }
+
+        inode_lookup(linked_inode);
 
-        struct fuse_file_info fi = {0, };
-        struct fuse_entry_param e = {0, };
+        inode_unref(linked_inode);
 
-        fi.flags = state->flags;
-        if (op_ret >= 0) {
-                fi.fh = (unsigned long) fd;
+        feo.nodeid = inode_to_fuse_nodeid(linked_inode);
 
-                if ((fi.flags & 3) && priv->direct_io_mode)
-                        fi.direct_io = 1;
+        feo.entry_valid = calc_timeout_sec(priv->entry_timeout);
+        feo.entry_valid_nsec = calc_timeout_nsec(priv->entry_timeout);
+        feo.attr_valid = calc_timeout_sec(priv->attribute_timeout);
+        feo.attr_valid_nsec = calc_timeout_nsec(priv->attribute_timeout);
 
-                gf_log ("glusterfs-fuse", GF_LOG_DEBUG,
-                        "%"PRId64": %s() %s => %p (ino=%"PRId64")",
-			frame->root->unique, gf_fop_list[frame->root->op],
-			state->loc.path, fd, buf->st_ino);
+        fouh.error = 0;
+        iov_out[0].iov_base = &fouh;
+        iov_out[1].iov_base = &feo;
+#if FUSE_KERNEL_MINOR_VERSION >= 9
+        iov_out[1].iov_len = priv->proto_minor >= 9
+                                 ? sizeof(feo)
+                                 : FUSE_COMPAT_ENTRY_OUT_SIZE;
+#else
+        iov_out[1].iov_len = sizeof(feo);
+#endif
+        iov_out[2].iov_base = &foo;
+        iov_out[2].iov_len = sizeof(foo);
+
+        if (send_fuse_iov(this, finh, iov_out, 3) == ENOENT) {
+            gf_log("glusterfs-fuse", GF_LOG_DEBUG, "create(%s) got EINTR",
+                   state->loc.path);
+            inode_forget(inode, 1);
+            gf_fd_put(priv->fdtable, state->fd_no);
+            goto out;
+        }
 
-                e.ino = buf->st_ino;
+        fd_bind(fd);
+    } else {
+        /* facilitate retry from VFS */
+        if (op_errno == ENOENT)
+            op_errno = ESTALE;
 
-#ifdef GF_DARWIN_HOST_OS
-                e.generation = 0;
+        gf_log("glusterfs-fuse", GF_LOG_WARNING, "%" PRIu64 ": %s => -1 (%s)",
+               finh->unique, state->loc.path, strerror(op_errno));
+
+        send_fuse_err(this, finh, op_errno);
+        gf_fd_put(priv->fdtable, state->fd_no);
+    }
+out:
+    free_fuse_state(state);
+    STACK_DESTROY(frame->root);
+
+    return 0;
+}
+
+void
+fuse_create_resume(fuse_state_t *state)
+{
+    fd_t *fd = NULL;
+    fuse_private_t *priv = NULL;
+    fuse_fd_ctx_t *fdctx = NULL;
+
+    if (!state->loc.parent) {
+        gf_log("glusterfs-fuse", GF_LOG_WARNING,
+               "%" PRIu64 " CREATE %s/%s resolution failed",
+               state->finh->unique, uuid_utoa(state->resolve.gfid),
+               state->resolve.bname);
+
+        /* facilitate retry from VFS */
+        if (state->resolve.op_errno == ENOENT)
+            state->resolve.op_errno = ESTALE;
+
+        send_fuse_err(state->this, state->finh, state->resolve.op_errno);
+        free_fuse_state(state);
+        return;
+    }
+
+    if (state->resolve.op_errno == ENOENT) {
+        state->resolve.op_ret = 0;
+        state->resolve.op_errno = 0;
+    }
+
+    if (state->loc.inode) {
+        gf_log(state->this->name, GF_LOG_DEBUG, "inode already present");
+        inode_unref(state->loc.inode);
+    }
+
+    state->loc.inode = inode_new(state->loc.parent->table);
+
+    fd = fd_create(state->loc.inode, state->finh->pid);
+    if (fd == NULL) {
+        gf_log("glusterfs-fuse", GF_LOG_WARNING,
+               "%" PRIu64 " CREATE cannot create a new fd",
+               state->finh->unique);
+        send_fuse_err(state->this, state->finh, ENOMEM);
+        free_fuse_state(state);
+        return;
+    }
+
+    fdctx = fuse_fd_ctx_check_n_create(state->this, fd);
+    if (fdctx == NULL) {
+        gf_log("glusterfs-fuse", GF_LOG_WARNING,
+               "%" PRIu64 " CREATE creation of fdctx failed",
+               state->finh->unique);
+        fd_unref(fd);
+        send_fuse_err(state->this, state->finh, ENOMEM);
+        free_fuse_state(state);
+        return;
+    }
+
+    priv = state->this->private;
+
+    state->fd_no = gf_fd_unused_get(priv->fdtable, fd);
+
+    state->fd = fd_ref(fd);
+    fd->flags = state->flags;
+
+    gf_log("glusterfs-fuse", GF_LOG_TRACE, "%" PRIu64 ": CREATE %s",
+           state->finh->unique, state->loc.path);
+
+    FUSE_FOP(state, fuse_create_cbk, GF_FOP_CREATE, create, &state->loc,
+             state->flags, state->mode, state->umask, fd, state->xdata);
+}
+
+static void
+fuse_create(xlator_t *this, fuse_in_header_t *finh, void *msg,
+            struct iobuf *iobuf)
+{
+#if FUSE_KERNEL_MINOR_VERSION >= 12
+    struct fuse_create_in *fci = msg;
+    fuse_private_t *priv = NULL;
 #else
-                e.generation = buf->st_ctime;
+    struct fuse_open_in *fci = msg;
 #endif
+    char *name = (char *)(fci + 1);
 
-                e.entry_timeout = priv->entry_timeout;
-                e.attr_timeout = priv->attribute_timeout;
-                e.attr = *buf;
-                e.attr.st_blksize = BIG_FUSE_CHANNEL_SIZE;
+    fuse_state_t *state = NULL;
 
-                fi.keep_cache = 0;
+#if FUSE_KERNEL_MINOR_VERSION >= 12
+    priv = this->private;
+    if (priv->proto_minor < 12)
+        name = (char *)((struct fuse_open_in *)msg + 1);
+#endif
 
-		inode_link (inode, state->loc.parent,
-			    state->loc.name, buf);
-		
-		inode_lookup (inode);
+    GET_STATE(this, finh, state);
 
-		fd_ref (fd);
-                if (fuse_reply_create (req, &e, &fi) == -ENOENT) {
-                        gf_log ("glusterfs-fuse", GF_LOG_WARNING,
-				"create() got EINTR");
-			inode_forget (inode, 1);
-			fd_unref (fd);
-			goto out;
-                } 
+    gf_uuid_generate(state->gfid);
 
-		fd_bind (fd);
-        } else {
-                gf_log ("glusterfs-fuse", GF_LOG_ERROR,
-                        "%"PRId64": %s => -1 (%s)", req_callid (req),
-                        state->loc.path, strerror (op_errno));
-                fuse_reply_err (req, op_errno);
-        }
-out:
-        free_state (state);
-        STACK_DESTROY (frame->root);
+    fuse_resolve_entry_init(state, &state->resolve, finh->nodeid, name);
 
-        return 0;
+    state->mode = fci->mode;
+    state->flags = fci->flags;
+
+#if FUSE_KERNEL_MINOR_VERSION >= 12
+    priv = this->private;
+    FUSE_ENTRY_CREATE(this, priv, finh, state, fci, "CREATE");
+#endif
+    fuse_resolve_and_resume(state, fuse_create_resume);
+
+    return;
 }
 
+void
+fuse_open_resume(fuse_state_t *state)
+{
+    fd_t *fd = NULL;
+    fuse_private_t *priv = NULL;
+    fuse_fd_ctx_t *fdctx = NULL;
 
-static void
-fuse_create (fuse_req_t req,
-             fuse_ino_t par,
-             const char *name,
-             mode_t mode,
-             struct fuse_file_info *fi)
-{
-        fuse_state_t *state;
-        fd_t *fd;
-	int32_t ret = -1;
-
-        state = state_from_req (req);
-        state->flags = fi->flags;
-
-        ret = fuse_loc_fill (&state->loc, state, 0, par, name);
-	if (ret < 0) {
-		gf_log ("glusterfs-fuse", GF_LOG_ERROR,
-                        "%"PRId64" CREATE %s (fuse_loc_fill() failed)", 
-                        req_callid (req), state->loc.path);
-                fuse_reply_err (req, EINVAL);
-		free_state (state);
-                return;
-	}
-
-        state->loc.inode = inode_new (state->itable);
-
-        fd = fd_create (state->loc.inode, get_pid_from_req (req));
-        state->fd = fd;
-	fd->flags = state->flags;
-
-        gf_log ("glusterfs-fuse", GF_LOG_DEBUG,
-                "%"PRId64": CREATE %s", req_callid (req),
-                state->loc.path);
-
-        FUSE_FOP (state, fuse_create_cbk, GF_FOP_CREATE,
-                  create, &state->loc, state->flags, mode, fd);
+    if (!state->loc.inode) {
+        gf_log("glusterfs-fuse", GF_LOG_ERROR,
+               "%" PRIu64 ": OPEN %s resolution failed", state->finh->unique,
+               uuid_utoa(state->resolve.gfid));
+
+        /* facilitate retry from VFS */
+        if (state->resolve.op_errno == ENOENT)
+            state->resolve.op_errno = ESTALE;
 
+        send_fuse_err(state->this, state->finh, state->resolve.op_errno);
+        free_fuse_state(state);
         return;
+    }
+
+    fd = fd_create(state->loc.inode, state->finh->pid);
+    if (!fd) {
+        gf_log("fuse", GF_LOG_ERROR, "fd is NULL");
+        send_fuse_err(state->this, state->finh, ENOENT);
+        free_fuse_state(state);
+        return;
+    }
+
+    fdctx = fuse_fd_ctx_check_n_create(state->this, fd);
+    if (fdctx == NULL) {
+        gf_log("glusterfs-fuse", GF_LOG_WARNING,
+               "%" PRIu64 ": OPEN creation of fdctx failed",
+               state->finh->unique);
+        fd_unref(fd);
+        send_fuse_err(state->this, state->finh, ENOMEM);
+        free_fuse_state(state);
+        return;
+    }
+
+    priv = state->this->private;
+
+    state->fd_no = gf_fd_unused_get(priv->fdtable, fd);
+    state->fd = fd_ref(fd);
+    fd->flags = state->flags;
+
+    gf_log("glusterfs-fuse", GF_LOG_TRACE, "%" PRIu64 ": OPEN %s",
+           state->finh->unique, state->loc.path);
+
+    FUSE_FOP(state, fuse_fd_cbk, GF_FOP_OPEN, open, &state->loc, state->flags,
+             fd, state->xdata);
+}
+
+static void
+fuse_open(xlator_t *this, fuse_in_header_t *finh, void *msg,
+          struct iobuf *iobuf)
+{
+    struct fuse_open_in *foi = msg;
+    fuse_state_t *state = NULL;
+
+    GET_STATE(this, finh, state);
+
+    fuse_resolve_inode_init(state, &state->resolve, finh->nodeid);
+
+    state->flags = foi->flags;
+
+    fuse_resolve_and_resume(state, fuse_open_resume);
+
+    return;
 }
 
+static int
+fuse_readv_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+               int32_t op_ret, int32_t op_errno, struct iovec *vector,
+               int32_t count, struct iatt *stbuf, struct iobref *iobref,
+               dict_t *xdata)
+{
+    fuse_state_t *state = NULL;
+    fuse_in_header_t *finh = NULL;
+    struct fuse_out_header fouh = {
+        0,
+    };
+    struct iovec *iov_out = NULL;
+
+    state = frame->root->state;
+    finh = state->finh;
+
+    fuse_log_eh_fop(this, state, frame, op_ret, op_errno);
+
+    if (op_ret >= 0) {
+        gf_log("glusterfs-fuse", GF_LOG_TRACE,
+               "%" PRIu64 ": READ => %d/%" GF_PRI_SIZET ",%" PRId64 "/%" PRIu64,
+               frame->root->unique, op_ret, state->size, state->off,
+               stbuf->ia_size);
+
+        iov_out = GF_CALLOC(count + 1, sizeof(*iov_out), gf_fuse_mt_iovec);
+        if (iov_out) {
+            fouh.error = 0;
+            iov_out[0].iov_base = &fouh;
+            memcpy(iov_out + 1, vector, count * sizeof(*iov_out));
+            send_fuse_iov(this, finh, iov_out, count + 1);
+            GF_FREE(iov_out);
+        } else
+            send_fuse_err(this, finh, ENOMEM);
+    } else {
+        gf_log("glusterfs-fuse", GF_LOG_WARNING,
+               "%" PRIu64 ": READ => %d gfid=%s fd=%p (%s)",
+               frame->root->unique, op_ret,
+               (state->fd && state->fd->inode)
+                   ? uuid_utoa(state->fd->inode->gfid)
+                   : "nil",
+               state->fd, strerror(op_errno));
+
+        send_fuse_err(this, finh, op_errno);
+    }
+
+    free_fuse_state(state);
+    STACK_DESTROY(frame->root);
+
+    return 0;
+}
+
+void
+fuse_readv_resume(fuse_state_t *state)
+{
+    gf_log("glusterfs-fuse", GF_LOG_TRACE,
+           "%" PRIu64 ": READ (%p, size=%zu, offset=%" PRIu64 ")",
+           state->finh->unique, state->fd, state->size, state->off);
+
+    FUSE_FOP(state, fuse_readv_cbk, GF_FOP_READ, readv, state->fd, state->size,
+             state->off, state->io_flags, state->xdata);
+}
 
 static void
-fuse_open (fuse_req_t req,
-           fuse_ino_t ino,
-           struct fuse_file_info *fi)
+fuse_readv(xlator_t *this, fuse_in_header_t *finh, void *msg,
+           struct iobuf *iobuf)
 {
-        fuse_state_t *state;
-        fd_t *fd;
-	int32_t ret = -1;
+    struct fuse_read_in *fri = msg;
+
+#if FUSE_KERNEL_MINOR_VERSION >= 9
+    fuse_private_t *priv = NULL;
+#endif
+    fuse_state_t *state = NULL;
+    fd_t *fd = NULL;
 
-        state = state_from_req (req);
-        state->flags = fi->flags;
+    GET_STATE(this, finh, state);
 
-        ret = fuse_loc_fill (&state->loc, state, ino, 0, NULL);
-        if ((state->loc.inode == NULL) ||
-	    (ret < 0)) {
-                gf_log ("glusterfs-fuse", GF_LOG_ERROR,
-                        "%"PRId64": OPEN %s (fuse_loc_fill() failed)",
-                        req_callid (req), state->loc.path);
-  
-                fuse_reply_err (req, EINVAL);
-		free_state (state);
-                return;
-        }
+    fd = FH_TO_FD(fri->fh);
+    state->fd = fd;
+
+    fuse_resolve_fd_init(state, &state->resolve, fd);
+
+    /* See comment by similar code in fuse_setattr */
+#if FUSE_KERNEL_MINOR_VERSION >= 9
+    priv = this->private;
+    if (priv->proto_minor >= 9 && fri->read_flags & FUSE_READ_LOCKOWNER)
+        state->lk_owner = fri->lock_owner;
+#endif
 
+    state->size = fri->size;
+    state->off = fri->offset;
+    /* let's ignore 'fri->read_flags', and just consider 'fri->flags' */
+#if FUSE_KERNEL_MINOR_VERSION >= 9
+    state->io_flags = fri->flags;
+#endif
+    fuse_resolve_and_resume(state, fuse_readv_resume);
+}
 
-        fd = fd_create (state->loc.inode, get_pid_from_req (req));
-        state->fd = fd;
-	fd->flags = fi->flags;
+static int
+fuse_writev_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+                int32_t op_ret, int32_t op_errno, struct iatt *stbuf,
+                struct iatt *postbuf, dict_t *xdata)
+{
+    fuse_state_t *state = NULL;
+    fuse_in_header_t *finh = NULL;
+    struct fuse_write_out fwo = {
+        0,
+    };
+
+    state = frame->root->state;
+    finh = state->finh;
+
+    fuse_log_eh_fop(this, state, frame, op_ret, op_errno);
+
+    if (op_ret >= 0) {
+        gf_log("glusterfs-fuse", GF_LOG_TRACE,
+               "%" PRIu64 ": WRITE => %d/%" GF_PRI_SIZET ",%" PRId64
+               "/%" PRIu64,
+               frame->root->unique, op_ret, state->size, state->off,
+               stbuf->ia_size);
+
+        fwo.size = op_ret;
+        send_fuse_obj(this, finh, &fwo);
+    } else {
+        gf_log(
+            "glusterfs-fuse", GF_LOG_WARNING,
+            "%" PRIu64 ": WRITE => -1 gfid=%s fd=%p (%s)", frame->root->unique,
+            (state->fd && state->fd->inode) ? uuid_utoa(state->fd->inode->gfid)
+                                            : "nil",
+            state->fd, strerror(op_errno));
+
+        send_fuse_err(this, finh, op_errno);
+    }
+
+    free_fuse_state(state);
+    STACK_DESTROY(frame->root);
+
+    return 0;
+}
 
-        gf_log ("glusterfs-fuse", GF_LOG_DEBUG,
-                "%"PRId64": OPEN %s", req_callid (req),
-                state->loc.path);
+void
+fuse_write_resume(fuse_state_t *state)
+{
+    struct iobref *iobref = NULL;
 
-        FUSE_FOP (state, fuse_fd_cbk, GF_FOP_OPEN,
-                  open, &state->loc, fi->flags, fd);
+    iobref = iobref_new();
+    if (!iobref) {
+        gf_log("glusterfs-fuse", GF_LOG_ERROR,
+               "%" PRIu64 ": WRITE iobref allocation failed",
+               state->finh->unique);
+        send_fuse_err(state->this, state->finh, ENOMEM);
 
+        free_fuse_state(state);
         return;
+    }
+
+    iobref_add(iobref, state->iobuf);
+
+    gf_log("glusterfs-fuse", GF_LOG_TRACE,
+           "%" PRIu64 ": WRITE (%p, size=%" GF_PRI_SIZET ", offset=%" PRId64
+           ")",
+           state->finh->unique, state->fd, state->size, state->off);
+
+    FUSE_FOP(state, fuse_writev_cbk, GF_FOP_WRITE, writev, state->fd,
+             &state->vector, 1, state->off, state->io_flags, iobref,
+             state->xdata);
+
+    iobref_unref(iobref);
 }
 
+static void
+fuse_write(xlator_t *this, fuse_in_header_t *finh, void *msg,
+           struct iobuf *iobuf)
+{
+    /* WRITE is special, metadata is attached to in_header,
+     * and msg is the payload as-is.
+     */
+    struct fuse_write_in *fwi = (struct fuse_write_in *)(finh + 1);
+
+    fuse_state_t *state = NULL;
+    fd_t *fd = NULL;
+#if FUSE_KERNEL_MINOR_VERSION >= 9
+    fuse_private_t *priv = NULL;
+    priv = this->private;
+#endif
+
+    GET_STATE(this, finh, state);
+    fd = FH_TO_FD(fwi->fh);
+    state->fd = fd;
+    state->size = fwi->size;
+    state->off = fwi->offset;
+
+    /* let's ignore 'fwi->write_flags', and just consider 'fwi->flags' */
+#if FUSE_KERNEL_MINOR_VERSION >= 9
+    state->io_flags = fwi->flags;
+#else
+    state->io_flags = fwi->write_flags;
+#endif
+    /* TODO: may need to handle below flag
+       (fwi->write_flags & FUSE_WRITE_CACHE);
+    */
+
+    fuse_resolve_fd_init(state, &state->resolve, fd);
+
+    /* See comment by similar code in fuse_setattr */
+#if FUSE_KERNEL_MINOR_VERSION >= 9
+    priv = this->private;
+    if (priv->proto_minor >= 9 && fwi->write_flags & FUSE_WRITE_LOCKOWNER)
+        state->lk_owner = fwi->lock_owner;
+#endif
+
+    state->vector.iov_base = msg;
+    state->vector.iov_len = fwi->size;
+    state->iobuf = iobuf;
+
+    fuse_resolve_and_resume(state, fuse_write_resume);
+
+    return;
+}
 
+#if FUSE_KERNEL_MINOR_VERSION >= 28
 static int
-fuse_readv_cbk (call_frame_t *frame,
-                void *cookie,
-                xlator_t *this,
-                int32_t op_ret,
-                int32_t op_errno,
-                struct iovec *vector,
-                int32_t count,
-                struct stat *stbuf)
-{
-        fuse_state_t *state = frame->root->state;
-        fuse_req_t req = state->req;
-
-        if (op_ret >= 0) {
-                gf_log ("glusterfs-fuse", GF_LOG_DEBUG,
-                        "%"PRId64": READ => %d/%"GF_PRI_SIZET",%"PRId64"/%"PRId64,
-			frame->root->unique,
-                        op_ret, state->size, state->off, stbuf->st_size);
-
-                fuse_reply_vec (req, vector, count);
-        } else {
-                gf_log ("glusterfs-fuse", GF_LOG_ERROR,
-                        "%"PRId64": READ => %d (%s)", frame->root->unique, 
-                        op_ret, strerror (op_errno));
+fuse_copy_file_range_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+                         int32_t op_ret, int32_t op_errno, struct iatt *stbuf,
+                         struct iatt *prebuf_dst, struct iatt *postbuf_dst,
+                         dict_t *xdata)
+{
+    fuse_state_t *state = NULL;
+    fuse_in_header_t *finh = NULL;
+    /*
+     * Fuse kernel module uses fuse_write_out itself as the
+     * output collector. In fact, fuse_kernel.h in the upstream
+     * kernel just defines the input structure fuse_copy_file_range_in
+     * for the fop. So, just use the fuse_write_out to send the
+     * response back to the kernel.
+     */
+    struct fuse_write_out fcfro = {
+        0,
+    };
+
+    char src_gfid[GF_UUID_BUF_SIZE] = {0};
+    char dst_gfid[GF_UUID_BUF_SIZE] = {0};
+
+    state = frame->root->state;
+    finh = state->finh;
+
+    fuse_log_eh_fop(this, state, frame, op_ret, op_errno);
+
+    if (op_ret >= 0) {
+        gf_log("glusterfs-fuse", GF_LOG_TRACE,
+               "%" PRIu64 ": WRITE => %d/%" GF_PRI_SIZET ",%" PRIu64
+               " , %" PRIu64 " ,%" PRIu64 ",%" PRIu64,
+               frame->root->unique, op_ret, state->size, state->off_in,
+               state->off_out, stbuf->ia_size, postbuf_dst->ia_size);
+
+        fcfro.size = op_ret;
+        send_fuse_obj(this, finh, &fcfro);
+    } else {
+        if (state->fd && state->fd->inode)
+            uuid_utoa_r(state->fd->inode->gfid, src_gfid);
+        else
+            snprintf(src_gfid, sizeof(src_gfid), "nil");
+
+        if (state->fd_dst && state->fd_dst->inode)
+            uuid_utoa_r(state->fd_dst->inode->gfid, dst_gfid);
+        else
+            snprintf(dst_gfid, sizeof(dst_gfid), "nil");
+
+        gf_log("glusterfs-fuse", GF_LOG_WARNING,
+               "%" PRIu64
+               ": COPY_FILE_RANGE => -1 gfid_in=%s fd_in=%p "
+               "gfid_out=%s fd_out=%p (%s)",
+               frame->root->unique, src_gfid, state->fd, dst_gfid,
+               state->fd_dst, strerror(op_errno));
+
+        send_fuse_err(this, finh, op_errno);
+    }
+
+    free_fuse_state(state);
+    STACK_DESTROY(frame->root);
+
+    return 0;
+}
 
-                fuse_reply_err (req, op_errno);
-        }
-        
-        free_state (state);
-        STACK_DESTROY (frame->root);
+void
+fuse_copy_file_range_resume(fuse_state_t *state)
+{
+    char fd_uuid_str[64] = {0}, fd_dst_uuid_str[64] = {0};
+
+    gf_log("glusterfs-fuse", GF_LOG_TRACE,
+           "%" PRIu64
+           ": COPY_FILE_RANGE "
+           "(input fd: %p (gfid: %s), "
+           "output fd: %p (gfid: %s) size=%zu, "
+           "offset_in=%" PRIu64 ", offset_out=%" PRIu64 ")",
+           state->finh->unique, state->fd,
+           uuid_utoa_r(state->fd->inode->gfid, fd_uuid_str), state->fd_dst,
+           uuid_utoa_r(state->fd_dst->inode->gfid, fd_dst_uuid_str),
+           state->size, state->off_in, state->off_out);
+
+    FUSE_FOP(state, fuse_copy_file_range_cbk, GF_FOP_COPY_FILE_RANGE,
+             copy_file_range, state->fd, state->off_in, state->fd_dst,
+             state->off_out, state->size, state->io_flags, state->xdata);
+}
 
-        return 0;
+static void
+fuse_copy_file_range(xlator_t *this, fuse_in_header_t *finh, void *msg,
+                     struct iobuf *iobuf)
+{
+    struct fuse_copy_file_range_in *fcfri = msg;
+    fuse_state_t *state = NULL;
+    fd_t *fd_in = NULL;
+    fd_t *fd_out = NULL;
+
+    GET_STATE(this, finh, state);
+
+    fd_in = FH_TO_FD(fcfri->fh_in);
+    fd_out = FH_TO_FD(fcfri->fh_out);
+    state->fd = fd_in;
+    state->fd_dst = fd_out;
+
+    fuse_resolve_fd_init(state, &state->resolve, fd_in);
+    fuse_resolve_fd_init(state, &state->resolve2, fd_out);
+
+    state->size = fcfri->len;
+    state->off_in = fcfri->off_in;
+    state->off_out = fcfri->off_out;
+    state->io_flags = fcfri->flags;
+
+    fuse_resolve_and_resume(state, fuse_copy_file_range_resume);
+}
+#endif /* FUSE_KERNEL_MINOR_VERSION >= 28 */
+
+#if FUSE_KERNEL_MINOR_VERSION >= 24 && HAVE_SEEK_HOLE
+static int
+fuse_lseek_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+               int32_t op_ret, int32_t op_errno, off_t offset, dict_t *xdata)
+{
+    fuse_state_t *state = frame->root->state;
+    fuse_in_header_t *finh = state->finh;
+    struct fuse_lseek_out flo = {
+        0,
+    };
+
+    fuse_log_eh_fop(this, state, frame, op_ret, op_errno);
+
+    if (op_ret >= 0) {
+        flo.offset = offset;
+        send_fuse_obj(this, finh, &flo);
+    } else {
+        send_fuse_err(this, finh, op_errno);
+    }
+
+    free_fuse_state(state);
+    STACK_DESTROY(frame->root);
+
+    return 0;
+}
+
+static void
+fuse_lseek_resume(fuse_state_t *state)
+{
+    FUSE_FOP(state, fuse_lseek_cbk, GF_FOP_SEEK, seek, state->fd, state->off,
+             state->whence, state->xdata);
 }
 
 static void
-fuse_readv (fuse_req_t req,
-            fuse_ino_t ino,
-            size_t size,
-            off_t off,
-            struct fuse_file_info *fi)
+fuse_lseek(xlator_t *this, fuse_in_header_t *finh, void *msg,
+           struct iobuf *iobuf)
 {
-        fuse_state_t *state;
-        fd_t *fd = NULL;
-        state = state_from_req (req);
-        state->size = size;
-        state->off = off;
-        
-        fd = FI_TO_FD (fi);
-        state->fd = fd;
+    struct fuse_lseek_in *ffi = msg;
+    fuse_state_t *state = NULL;
+
+    GET_STATE(this, finh, state);
+    state->fd = FH_TO_FD(ffi->fh);
+    state->off = ffi->offset;
+
+    switch (ffi->whence) {
+        case SEEK_DATA:
+            state->whence = GF_SEEK_DATA;
+            break;
+        case SEEK_HOLE:
+            state->whence = GF_SEEK_HOLE;
+            break;
+        default:
+            /* fuse should handle other whence internally */
+            send_fuse_err(this, finh, EINVAL);
+            free_fuse_state(state);
+            return;
+    }
+
+    fuse_resolve_fd_init(state, &state->resolve, state->fd);
+    fuse_resolve_and_resume(state, fuse_lseek_resume);
+}
+#endif /* FUSE_KERNEL_MINOR_VERSION >= 24 && HAVE_SEEK_HOLE */
 
-        gf_log ("glusterfs-fuse", GF_LOG_DEBUG,
-                "%"PRId64": READ (%p, size=%"GF_PRI_SIZET", offset=%"PRId64")",
-                req_callid (req), fd, size, off);
+void
+fuse_flush_resume(fuse_state_t *state)
+{
+    FUSE_FOP(state, fuse_flush_cbk, GF_FOP_FLUSH, flush, state->fd,
+             state->xdata);
+}
 
-        FUSE_FOP (state, fuse_readv_cbk, GF_FOP_READ,
-                  readv, fd, size, off);
+static void
+fuse_flush_interrupt_handler(xlator_t *this, fuse_interrupt_record_t *fir)
+{
+    gf_log("glusterfs-fuse", GF_LOG_DEBUG,
+           "FLUSH unique %" PRIu64 ": interrupt handler triggered",
+           fir->fuse_in_header.unique);
 
+    fuse_interrupt_finish_interrupt(this, fir, INTERRUPT_HANDLED, _gf_false,
+                                    NULL);
 }
 
+static void
+fuse_flush(xlator_t *this, fuse_in_header_t *finh, void *msg,
+           struct iobuf *iobuf)
+{
+    struct fuse_flush_in *ffi = msg;
+    fuse_private_t *priv = NULL;
+
+    fuse_state_t *state = NULL;
+    fd_t *fd = NULL;
 
-static int
-fuse_writev_cbk (call_frame_t *frame,
-                 void *cookie,
-                 xlator_t *this,
-                 int32_t op_ret,
-                 int32_t op_errno,
-                 struct stat *stbuf)
-{
-        fuse_state_t *state = frame->root->state;
-        fuse_req_t req = state->req;
-
-        if (op_ret >= 0) {
-                gf_log ("glusterfs-fuse", GF_LOG_DEBUG,
-                        "%"PRId64": WRITE => %d/%"GF_PRI_SIZET",%"PRId64"/%"PRId64,
-			frame->root->unique,
-                        op_ret, state->size, state->off, stbuf->st_size);
-
-                fuse_reply_write (req, op_ret);
-        } else {
-                gf_log ("glusterfs-fuse", GF_LOG_ERROR,
-                        "%"PRId64": WRITE => -1 (%s)", frame->root->unique, 
-                        strerror(op_errno));
+    GET_STATE(this, finh, state);
+    fd = FH_TO_FD(ffi->fh);
+    state->fd = fd;
+
+    priv = this->private;
+    if (priv->flush_handle_interrupt) {
+        fuse_interrupt_record_t *fir = NULL;
 
-                fuse_reply_err (req, op_errno);
+        fir = fuse_interrupt_record_new(finh, fuse_flush_interrupt_handler);
+        if (!fir) {
+            send_fuse_err(this, finh, ENOMEM);
+
+            gf_log("glusterfs-fuse", GF_LOG_ERROR,
+                   "FLUSH unique %" PRIu64
+                   ":"
+                   " interrupt record allocation failed",
+                   finh->unique);
+            free_fuse_state(state);
+
+            return;
         }
-        
-        free_state (state);
-        STACK_DESTROY (frame->root);
+        fuse_interrupt_record_insert(this, fir);
+    }
 
-        return 0;
+    fuse_resolve_fd_init(state, &state->resolve, fd);
+
+    state->lk_owner = ffi->lock_owner;
+
+    gf_log("glusterfs-fuse", GF_LOG_TRACE, "%" PRIu64 ": FLUSH %p",
+           finh->unique, fd);
+
+    fuse_resolve_and_resume(state, fuse_flush_resume);
+
+    return;
 }
 
+int
+fuse_internal_release(xlator_t *this, fd_t *fd)
+{
+    /* It is important that we clean up our context here to avoid a leak
+       in case an error occurs and we get cleaned up by
+       call_unwind_error->...->args_wipe instead of the normal path.
+    */
+    fuse_fd_ctx_destroy(this, fd);
+
+    return 0;
+}
 
 static void
-fuse_write (fuse_req_t req,
-            fuse_ino_t ino,
-            const char *buf,
-            size_t size,
-            off_t off,
-            struct fuse_file_info *fi)
-{
-        fuse_state_t *state;
-        struct iovec vector;
-	fd_t *fd = NULL;
-
-        state = state_from_req (req);
-        state->size = size;
-        state->off = off;
-	fd = FI_TO_FD (fi);
-	state->fd = fd;
-        vector.iov_base = (void *)buf;
-        vector.iov_len = size;
-
-        gf_log ("glusterfs-fuse", GF_LOG_DEBUG,
-                "%"PRId64": WRITE (%p, size=%"GF_PRI_SIZET", offset=%"PRId64")",
-                req_callid (req), fd, size, off);
-
-        FUSE_FOP (state, fuse_writev_cbk, GF_FOP_WRITE,
-                  writev, fd, &vector, 1, off);
-        return;
+fuse_release(xlator_t *this, fuse_in_header_t *finh, void *msg,
+             struct iobuf *iobuf)
+{
+    struct fuse_release_in *fri = msg;
+    fd_t *fd = NULL;
+    fuse_state_t *state = NULL;
+    fuse_private_t *priv = NULL;
+
+    GET_STATE(this, finh, state);
+    fd = FH_TO_FD(fri->fh);
+    if (!fd)
+        goto out;
+
+    state->fd = fd;
+
+    priv = this->private;
+
+    fuse_log_eh(this, "RELEASE(): finh->unique: %" PRIu64 ":, fd: %p, gfid: %s",
+                finh->unique, fd, uuid_utoa(fd->inode->gfid));
+
+    gf_log("glusterfs-fuse", GF_LOG_TRACE,
+           "finh->unique: %" PRIu64 ": RELEASE %p", finh->unique, state->fd);
+
+    fd_close(state->fd);
+
+    fuse_fd_ctx_destroy(this, state->fd);
+    fd_unref(fd);
+
+    gf_fdptr_put(priv->fdtable, fd);
+
+    state->fd = NULL;
+
+out:
+    send_fuse_err(this, finh, 0);
+
+    free_fuse_state(state);
+    return;
 }
 
+void
+fuse_fsync_resume(fuse_state_t *state)
+{
+    gf_log("glusterfs-fuse", GF_LOG_TRACE, "%" PRIu64 ": FSYNC %p",
+           state->finh->unique, state->fd);
+
+    /* fsync_flags: 1 means "datasync" (no defines for this) */
+    FUSE_FOP(state, fuse_fsync_cbk, GF_FOP_FSYNC, fsync, state->fd,
+             (state->flags & 1), state->xdata);
+}
 
 static void
-fuse_flush (fuse_req_t req,
-            fuse_ino_t ino,
-            struct fuse_file_info *fi)
+fuse_fsync(xlator_t *this, fuse_in_header_t *finh, void *msg,
+           struct iobuf *iobuf)
 {
-        fuse_state_t *state;
-	fd_t *fd = NULL;
+    struct fuse_fsync_in *fsi = msg;
 
-        state = state_from_req (req);
-	fd = FI_TO_FD (fi);
-	state->fd = fd;
+    fuse_state_t *state = NULL;
+    fd_t *fd = NULL;
 
-        gf_log ("glusterfs-fuse", GF_LOG_DEBUG,
-                "%"PRId64": FLUSH %p", req_callid (req), fd);
+    GET_STATE(this, finh, state);
+    fd = FH_TO_FD(fsi->fh);
+    state->fd = fd;
 
-        FUSE_FOP (state, fuse_err_cbk, GF_FOP_FLUSH,
-                  flush, fd);
+    fuse_resolve_fd_init(state, &state->resolve, fd);
 
-        return;
+    state->flags = fsi->fsync_flags;
+    fuse_resolve_and_resume(state, fuse_fsync_resume);
+    return;
 }
 
-
-static void 
-fuse_release (fuse_req_t req,
-	      fuse_ino_t ino,
-	      struct fuse_file_info *fi)
+void
+fuse_opendir_resume(fuse_state_t *state)
 {
-        fuse_state_t *state;
+    fd_t *fd = NULL;
+    fuse_private_t *priv = NULL;
+    fuse_fd_ctx_t *fdctx = NULL;
+
+    priv = state->this->private;
 
-        state = state_from_req (req);
-        state->fd = FI_TO_FD (fi);
+    if (!state->loc.inode) {
+        gf_log("glusterfs-fuse", GF_LOG_WARNING,
+               "%" PRIu64 ": OPENDIR (%s) resolution failed",
+               state->finh->unique, uuid_utoa(state->resolve.gfid));
 
-        gf_log ("glusterfs-fuse", GF_LOG_DEBUG,
-                "%"PRId64": RELEASE %p", req_callid (req), state->fd);
+        /* facilitate retry from VFS */
+        if (state->resolve.op_errno == ENOENT)
+            state->resolve.op_errno = ESTALE;
 
-        fd_unref (state->fd);
-        
-        fuse_reply_err (req, 0);
-        
-        free_state (state);
+        send_fuse_err(state->this, state->finh, state->resolve.op_errno);
+        free_fuse_state(state);
+        return;
+    }
+
+    fd = fd_create(state->loc.inode, state->finh->pid);
+    if (fd == NULL) {
+        gf_log("glusterfs-fuse", GF_LOG_WARNING,
+               "%" PRIu64 ": OPENDIR fd creation failed", state->finh->unique);
+        send_fuse_err(state->this, state->finh, ENOMEM);
+        free_fuse_state(state);
         return;
+    }
+
+    fdctx = fuse_fd_ctx_check_n_create(state->this, fd);
+    if (fdctx == NULL) {
+        gf_log("glusterfs-fuse", GF_LOG_WARNING,
+               "%" PRIu64 ": OPENDIR creation of fdctx failed",
+               state->finh->unique);
+        fd_unref(fd);
+        send_fuse_err(state->this, state->finh, ENOMEM);
+        free_fuse_state(state);
+        return;
+    }
+
+    state->fd = fd_ref(fd);
+    state->fd_no = gf_fd_unused_get(priv->fdtable, fd);
+
+    gf_log("glusterfs-fuse", GF_LOG_TRACE, "%" PRIu64 ": OPENDIR %s",
+           state->finh->unique, state->loc.path);
+
+    FUSE_FOP(state, fuse_fd_cbk, GF_FOP_OPENDIR, opendir, &state->loc, fd,
+             state->xdata);
 }
 
+/*
+ * Handler for the FUSE OPENDIR request.
+ *
+ * Obtains the request state through GET_STATE(), initialises inode
+ * resolution for finh->nodeid and continues in fuse_opendir_resume() through
+ * fuse_resolve_and_resume().  Neither msg nor iobuf is read here.
+ */
+static void
+fuse_opendir(xlator_t *this, fuse_in_header_t *finh, void *msg,
+             struct iobuf *iobuf)
+{
+    /*
+    struct fuse_open_in *foi = msg;
+     */
+
+    fuse_state_t *state = NULL;
 
-static void 
-fuse_fsync (fuse_req_t req,
-            fuse_ino_t ino,
-            int datasync,
-            struct fuse_file_info *fi)
+    GET_STATE(this, finh, state);
+
+    fuse_resolve_inode_init(state, &state->resolve, finh->nodeid);
+
+    fuse_resolve_and_resume(state, fuse_opendir_resume);
+}
+
+/*
+ * Translate the file type stored in buf->ia_type into a dirent DT_*
+ * constant.
+ *
+ * The IA_IS*() checks are tried in the order link, directory, fifo, socket,
+ * character device, block device, regular file; DT_UNKNOWN is returned when
+ * none of them matches.  buf is dereferenced unconditionally.
+ */
+unsigned char
+d_type_from_stat(struct iatt *buf)
 {
-        fuse_state_t *state;
-        fd_t *fd = NULL;
+    unsigned char d_type;
 
-        state = state_from_req (req);
-        fd = FI_TO_FD (fi);
-        state->fd = fd;
+    if (IA_ISLNK(buf->ia_type)) {
+        d_type = DT_LNK;
 
-        gf_log ("glusterfs-fuse", GF_LOG_DEBUG,
-                "%"PRId64": FSYNC %p", req_callid (req), fd);
+    } else if (IA_ISDIR(buf->ia_type)) {
+        d_type = DT_DIR;
 
-        FUSE_FOP (state, fuse_err_cbk, GF_FOP_FSYNC,
-                  fsync, fd, datasync);
+    } else if (IA_ISFIFO(buf->ia_type)) {
+        d_type = DT_FIFO;
 
-        return;
+    } else if (IA_ISSOCK(buf->ia_type)) {
+        d_type = DT_SOCK;
+
+    } else if (IA_ISCHR(buf->ia_type)) {
+        d_type = DT_CHR;
+
+    } else if (IA_ISBLK(buf->ia_type)) {
+        d_type = DT_BLK;
+
+    } else if (IA_ISREG(buf->ia_type)) {
+        d_type = DT_REG;
+
+    } else {
+        /* no IA_IS*() check matched */
+        d_type = DT_UNKNOWN;
+    }
+
+    return d_type;
+}
+
+/*
+ * Callback for the READDIR fop wound by fuse_readdir_resume().
+ *
+ * A failed fop (op_ret < 0) is answered with op_errno.  Otherwise the
+ * leading entries of 'entries' whose aligned fuse_dirent records fit into
+ * state->size bytes are packed into one buffer and sent; an empty reply is
+ * sent when nothing fits (or the list is empty).  The request state, the
+ * call stack and the reply buffer are released on every path.
+ *
+ * Always returns 0.
+ */
+static int
+fuse_readdir_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+                 int32_t op_ret, int32_t op_errno, gf_dirent_t *entries,
+                 dict_t *xdata)
+{
+    fuse_state_t *state = frame->root->state;
+    fuse_in_header_t *finh = state->finh;
+    fuse_private_t *priv = state->this->private;
+    gf_dirent_t *entry = NULL;
+    struct fuse_dirent *slot = NULL;
+    char *reply = NULL;
+    size_t reply_len = 0;
+    size_t filled = 0;
+
+    fuse_log_eh_fop(this, state, frame, op_ret, op_errno);
+
+    if (op_ret < 0) {
+        gf_log("glusterfs-fuse", GF_LOG_WARNING,
+               "%" PRIu64 ": READDIR => -1 (%s)", frame->root->unique,
+               strerror(op_errno));
+
+        send_fuse_err(this, finh, op_errno);
+        goto out;
+    }
+
+    gf_log("glusterfs-fuse", GF_LOG_TRACE,
+           "%" PRIu64 ": READDIR => %d/%" GF_PRI_SIZET ",%" PRId64,
+           frame->root->unique, op_ret, state->size, state->off);
+
+    /* first pass: size of the longest prefix of the list that still
+     * fits in the reply the kernel asked for */
+    list_for_each_entry(entry, &entries->list, list)
+    {
+        size_t need = FUSE_DIRENT_ALIGN(FUSE_NAME_OFFSET +
+                                        strlen(entry->d_name));
+
+        if (reply_len + need > state->size)
+            break;
+
+        reply_len += need;
+    }
+
+    if (reply_len == 0) {
+        send_fuse_data(this, finh, 0, 0);
+        goto out;
+    }
+
+    reply = GF_CALLOC(1, reply_len, gf_fuse_mt_char);
+    if (reply == NULL) {
+        gf_log("glusterfs-fuse", GF_LOG_DEBUG,
+               "%" PRIu64 ": READDIR => -1 (%s)", frame->root->unique,
+               strerror(ENOMEM));
+        send_fuse_err(this, finh, ENOMEM);
+        goto out;
+    }
+
+    /* second pass: fill the records, stopping once the size computed
+     * above has been reached */
+    list_for_each_entry(entry, &entries->list, list)
+    {
+        slot = (struct fuse_dirent *)(reply + filled);
+        gf_fuse_fill_dirent(entry, slot, priv->enable_ino32);
+        filled += FUSE_DIRENT_SIZE(slot);
+
+        if (filled == reply_len)
+            break;
+    }
+
+    send_fuse_data(this, finh, reply, filled);
+
+    /* TODO: */
+    /* gf_link_inodes_from_dirent (this, state->fd->inode, entries); */
+
+out:
+    free_fuse_state(state);
+    STACK_DESTROY(frame->root);
+    GF_FREE(reply);
+    return 0;
 }
 
+/*
+ * Resume step of READDIR, handed to fuse_resolve_and_resume() by
+ * fuse_readdir().
+ *
+ * Logs the request at TRACE level and invokes FUSE_FOP for GF_FOP_READDIR on
+ * state->fd with the size and offset stored in the state; fuse_readdir_cbk()
+ * is registered as the callback.
+ */
+void
+fuse_readdir_resume(fuse_state_t *state)
+{
+    gf_log("glusterfs-fuse", GF_LOG_TRACE,
+           "%" PRIu64 ": READDIR (%p, size=%" GF_PRI_SIZET ", offset=%" PRId64
+           ")",
+           state->finh->unique, state->fd, state->size, state->off);
+
+    FUSE_FOP(state, fuse_readdir_cbk, GF_FOP_READDIR, readdir, state->fd,
+             state->size, state->off, state->xdata);
+}
 
+/*
+ * Handler for the FUSE READDIR request.
+ *
+ * msg is read as a struct fuse_read_in: its size and offset are stored in
+ * the request state and its file handle is converted with FH_TO_FD().  The
+ * request then goes through fd resolution and continues in
+ * fuse_readdir_resume().  iobuf is not used.
+ */
 static void
-fuse_opendir (fuse_req_t req,
-              fuse_ino_t ino,
-              struct fuse_file_info *fi)
+fuse_readdir(xlator_t *this, fuse_in_header_t *finh, void *msg,
+             struct iobuf *iobuf)
 {
-        fuse_state_t *state;
-        fd_t *fd;
-	int32_t ret = -1;
+    struct fuse_read_in *fri = msg;
 
-        state = state_from_req (req);
-        ret = fuse_loc_fill (&state->loc, state, ino, 0, NULL);
-        if ((state->loc.inode == NULL) ||
-	    (ret < 0)) {
-                gf_log ("glusterfs-fuse", GF_LOG_ERROR,
-                        "%"PRId64": OPENDIR %s (fuse_loc_fill() failed)",
-                        req_callid (req), state->loc.path);
-  
-                fuse_reply_err (req, EINVAL);
-		free_state (state);
-                return;
-        }
+    fuse_state_t *state = NULL;
+    fd_t *fd = NULL;
 
-        fd = fd_create (state->loc.inode, get_pid_from_req (req));
-        state->fd = fd;
+    GET_STATE(this, finh, state);
+    state->size = fri->size;
+    state->off = fri->offset;
+    fd = FH_TO_FD(fri->fh);
+    state->fd = fd;
 
-        gf_log ("glusterfs-fuse", GF_LOG_DEBUG,
-                "%"PRId64": OPENDIR %s", req_callid (req),
-                state->loc.path);
+    fuse_resolve_fd_init(state, &state->resolve, fd);
 
-        FUSE_FOP (state, fuse_fd_cbk, GF_FOP_OPENDIR,
-                  opendir, &state->loc, fd);
+    fuse_resolve_and_resume(state, fuse_readdir_resume);
 }
 
+#if FUSE_KERNEL_MINOR_VERSION >= 20
+/*
+ * Callback for the READDIRP fop wound by fuse_readdirp_resume().
+ *
+ * A failed fop (op_ret < 0) is answered with op_errno.  Otherwise the
+ * leading entries of 'entries' whose aligned fuse_direntplus records fit
+ * into state->size bytes are packed into one zero-filled buffer and sent.
+ * For every packed entry that carries an inode, the inode is linked under
+ * state->fd->inode, its attributes are converted into the record and the
+ * entry/attribute validity timeouts are filled in; entries without an inode
+ * (or whose linking fails) keep a zeroed entry_out.
+ *
+ * The request state, the call stack and the buffer are released on every
+ * path.  Always returns 0.
+ */
 static int
-fuse_readdir_cbk (call_frame_t *frame,
-                  void *cookie,
-                  xlator_t *this,
-                  int32_t op_ret,
-                  int32_t op_errno,
-                  gf_dirent_t *entries)
-{
-        fuse_state_t *state = frame->root->state;
-        fuse_req_t    req = state->req;
-	int           size = 0;
-	int           entry_size = 0;
-	char         *buf = NULL;
-	gf_dirent_t  *entry = NULL;
-	struct stat   stbuf = {0, };
-
-        if (op_ret < 0) {
-                gf_log ("glusterfs-fuse", GF_LOG_ERROR,
-                        "%"PRId64": READDIR => -1 (%s)", frame->root->unique, 
-                        strerror (op_errno));
-
-                fuse_reply_err (req, op_errno);
-		goto out;
-	}
-
-	gf_log ("glusterfs-fuse", GF_LOG_DEBUG,
-		"%"PRId64": READDIR => %d/%"GF_PRI_SIZET",%"PRId64,
-		frame->root->unique, op_ret, state->size, state->off);
-
-	list_for_each_entry (entry, &entries->list, list) {
-		size += fuse_dirent_size (strlen (entry->d_name));
-	}
-
-	buf = CALLOC (1, size);
-	if (!buf) {
-		gf_log ("glusterfs-fuse", GF_LOG_ERROR,
-			"%"PRId64": READDIR => -1 (%s)", frame->root->unique,
-			strerror (ENOMEM));
-		fuse_reply_err (req, -ENOMEM);
-		goto out;
-	}
-
-	size = 0;
-	list_for_each_entry (entry, &entries->list, list) {
-		stbuf.st_ino = entry->d_ino;
-		entry_size = fuse_dirent_size (strlen (entry->d_name));
-		fuse_add_direntry (req, buf + size, entry_size,
-				   entry->d_name, &stbuf,
-				   entry->d_off);
-		size += entry_size;
-	}
-
-	fuse_reply_buf (req, (void *)buf, size);
+fuse_readdirp_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+                  int32_t op_ret, int32_t op_errno, gf_dirent_t *entries,
+                  dict_t *xdata)
+{
+    fuse_state_t *state = NULL;
+    fuse_in_header_t *finh = NULL;
+    size_t max_size = 0;
+    size_t size = 0;
+    char *buf = NULL;
+    gf_dirent_t *entry = NULL;
+    struct fuse_direntplus *fde = NULL;
+    struct fuse_entry_out *feo = NULL;
+    fuse_private_t *priv = NULL;
+
+    state = frame->root->state;
+    finh = state->finh;
+    priv = this->private;
+
+    if (op_ret < 0) {
+        gf_log("glusterfs-fuse", GF_LOG_WARNING,
+               "%" PRIu64 ": READDIRP => -1 (%s)", frame->root->unique,
+               strerror(op_errno));
+
+        send_fuse_err(this, finh, op_errno);
+        goto out;
+    }
+
+    gf_log("glusterfs-fuse", GF_LOG_TRACE,
+           "%" PRIu64 ": READDIRP => %d/%" GF_PRI_SIZET ",%" PRId64,
+           frame->root->unique, op_ret, state->size, state->off);
+
+    /* first pass: total size of the leading entries that fit in
+     * state->size bytes */
+    list_for_each_entry(entry, &entries->list, list)
+    {
+        size_t fdes = FUSE_DIRENT_ALIGN(FUSE_NAME_OFFSET_DIRENTPLUS +
+                                        strlen(entry->d_name));
+        max_size += fdes;
+
+        if (max_size > state->size) {
+            /* we received too many entries to fit in the reply */
+            max_size -= fdes;
+            break;
+        }
+    }
+
+    if (max_size == 0) {
+        send_fuse_data(this, finh, 0, 0);
+        goto out;
+    }
+
+    buf = GF_CALLOC(1, max_size, gf_fuse_mt_char);
+    if (!buf) {
+        gf_log("glusterfs-fuse", GF_LOG_DEBUG,
+               "%" PRIu64 ": READDIRP => -1 (%s)", frame->root->unique,
+               strerror(ENOMEM));
+        send_fuse_err(this, finh, ENOMEM);
+        goto out;
+    }
+
+    /* second pass: fill one fuse_direntplus per entry until max_size
+     * bytes have been produced */
+    size = 0;
+    list_for_each_entry(entry, &entries->list, list)
+    {
+        inode_t *linked_inode;
+
+        fde = (struct fuse_direntplus *)(buf + size);
+        feo = &fde->entry_out;
+
+        if (priv->enable_ino32)
+            fde->dirent.ino = GF_FUSE_SQUASH_INO(entry->d_ino);
+        else
+            fde->dirent.ino = entry->d_ino;
+
+        fde->dirent.off = entry->d_off;
+        fde->dirent.type = entry->d_type;
+        fde->dirent.namelen = strlen(entry->d_name);
+        (void)memcpy(fde->dirent.name, entry->d_name, fde->dirent.namelen);
+        size += FUSE_DIRENTPLUS_SIZE(fde);
+
+        /* no inode: only the dirent part is filled, entry_out stays
+         * zeroed from GF_CALLOC */
+        if (!entry->inode)
+            goto next_entry;
+
+        entry->d_stat.ia_blksize = this->ctx->page_size;
+        gf_fuse_stat2attr(&entry->d_stat, &feo->attr, priv->enable_ino32);
+
+        linked_inode = inode_link(entry->inode, state->fd->inode, entry->d_name,
+                                  &entry->d_stat);
+        if (!linked_inode)
+            goto next_entry;
+
+        /* NOTE(review): a different inode came back from inode_link(), so
+         * the stat is zeroed; this makes the ia_ctime test below select a
+         * zero attribute timeout.  feo->attr was already filled from the
+         * stat above - confirm that is intended. */
+        if (entry->inode != linked_inode) {
+            memset(&entry->d_stat, 0, sizeof(entry->d_stat));
+        }
 
+        feo->nodeid = inode_to_fuse_nodeid(linked_inode);
+
+        /* "." and ".." are skipped for inode_lookup() */
+        if (!((strcmp(entry->d_name, ".") == 0) ||
+              (strcmp(entry->d_name, "..") == 0))) {
+            inode_lookup(linked_inode);
+        }
+
+        inode_unref(linked_inode);
+
+        feo->entry_valid = calc_timeout_sec(priv->entry_timeout);
+        feo->entry_valid_nsec = calc_timeout_nsec(priv->entry_timeout);
+
+        if (entry->d_stat.ia_ctime) {
+            feo->attr_valid = calc_timeout_sec(priv->attribute_timeout);
+            feo->attr_valid_nsec = calc_timeout_nsec(priv->attribute_timeout);
+        } else {
+            feo->attr_valid = feo->attr_valid_nsec = 0;
+        }
+
+    next_entry:
+        if (size == max_size)
+            break;
+    }
+
+    send_fuse_data(this, finh, buf, size);
 out:
-        free_state (state);
-        STACK_DESTROY (frame->root);
-	if (buf)
-		FREE (buf);
-        return 0;
+    free_fuse_state(state);
+    STACK_DESTROY(frame->root);
+    GF_FREE(buf);
+    return 0;
+}
 
+/*
+ * Resume step of READDIRP, handed to fuse_resolve_and_resume() by
+ * fuse_readdirp().
+ *
+ * Logs the request at TRACE level and invokes FUSE_FOP for GF_FOP_READDIRP
+ * on state->fd with the size and offset stored in the state;
+ * fuse_readdirp_cbk() is registered as the callback.
+ */
+void
+fuse_readdirp_resume(fuse_state_t *state)
+{
+    gf_log("glusterfs-fuse", GF_LOG_TRACE,
+           "%" PRIu64 ": READDIRP (%p, size=%" GF_PRI_SIZET ", offset=%" PRId64
+           ")",
+           state->finh->unique, state->fd, state->size, state->off);
+
+    FUSE_FOP(state, fuse_readdirp_cbk, GF_FOP_READDIRP, readdirp, state->fd,
+             state->size, state->off, state->xdata);
 }
 
+/*
+ * Handler for the FUSE READDIRPLUS request (compiled only when
+ * FUSE_KERNEL_MINOR_VERSION >= 20).
+ *
+ * msg is read as a struct fuse_read_in: its size and offset are stored in
+ * the request state and its file handle is converted with FH_TO_FD().  The
+ * request then goes through fd resolution and continues in
+ * fuse_readdirp_resume().  iobuf is not used.
+ */
 static void
-fuse_readdir (fuse_req_t req,
-              fuse_ino_t ino,
-              size_t size,
-              off_t off,
-              struct fuse_file_info *fi)
+fuse_readdirp(xlator_t *this, fuse_in_header_t *finh, void *msg,
+              struct iobuf *iobuf)
 {
-        fuse_state_t *state;
-	fd_t *fd = NULL;
+    struct fuse_read_in *fri = msg;
+
+    fuse_state_t *state = NULL;
+    fd_t *fd = NULL;
 
-        state = state_from_req (req);
-        state->size = size;
-        state->off = off;
-	fd = FI_TO_FD (fi);
-	state->fd = fd;
+    GET_STATE(this, finh, state);
+    state->size = fri->size;
+    state->off = fri->offset;
+    fd = FH_TO_FD(fri->fh);
+    state->fd = fd;
 
-        gf_log ("glusterfs-fuse", GF_LOG_DEBUG,
-                "%"PRId64": READDIR (%p, size=%"GF_PRI_SIZET", offset=%"PRId64")",
-                req_callid (req), fd, size, off);
+    fuse_resolve_fd_init(state, &state->resolve, fd);
+
+    fuse_resolve_and_resume(state, fuse_readdirp_resume);
+}
+#endif
+
+#if FUSE_KERNEL_MINOR_VERSION >= 19
+#ifdef FALLOC_FL_KEEP_SIZE
+/*
+ * Callback for the fops wound by fuse_fallocate_resume().
+ *
+ * prebuf and postbuf are not used; the outcome is reported by delegating to
+ * fuse_err_cbk(), whose return value is passed back unchanged.
+ */
+static int
+fuse_fallocate_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+                   int32_t op_ret, int32_t op_errno, struct iatt *prebuf,
+                   struct iatt *postbuf, dict_t *xdata)
+{
+    int ret = fuse_err_cbk(frame, cookie, this, op_ret, op_errno, xdata);
+
+    return ret;
+}
 
-        FUSE_FOP (state, fuse_readdir_cbk, GF_FOP_READDIR,
-                  readdir, fd, size, off);
+/*
+ * Resume step of FALLOCATE, handed to fuse_resolve_and_resume() by
+ * fuse_fallocate().
+ *
+ * Logs the request at TRACE level, then winds either the discard fop (when
+ * FALLOC_FL_PUNCH_HOLE is set in state->flags) or the fallocate fop, with
+ * fuse_fallocate_cbk() as the callback in both cases.
+ */
+static void
+fuse_fallocate_resume(fuse_state_t *state)
+{
+    gf_log("glusterfs-fuse", GF_LOG_TRACE,
+           "%" PRIu64 ": FALLOCATE (%p, flags=%d, size=%zu, offset=%" PRId64
+           ")",
+           state->finh->unique, state->fd, state->flags, state->size,
+           state->off);
+
+    if (!(state->flags & FALLOC_FL_PUNCH_HOLE)) {
+        /* only the KEEP_SIZE bit of the flags is forwarded */
+        FUSE_FOP(state, fuse_fallocate_cbk, GF_FOP_FALLOCATE, fallocate,
+                 state->fd, (state->flags & FALLOC_FL_KEEP_SIZE), state->off,
+                 state->size, state->xdata);
+    } else {
+        /* hole punching goes out as a discard fop */
+        FUSE_FOP(state, fuse_fallocate_cbk, GF_FOP_DISCARD, discard, state->fd,
+                 state->off, state->size, state->xdata);
+    }
 }
 
+/*
+ * Handler for the FUSE FALLOCATE request (compiled only when
+ * FUSE_KERNEL_MINOR_VERSION >= 19 and FALLOC_FL_KEEP_SIZE is defined).
+ *
+ * msg is read as a struct fuse_fallocate_in: offset, length and mode are
+ * stored in the request state as off, size and flags, and the file handle is
+ * converted with FH_TO_FD().  The request then goes through fd resolution
+ * and continues in fuse_fallocate_resume().  iobuf is not used.
+ */
+static void
+fuse_fallocate(xlator_t *this, fuse_in_header_t *finh, void *msg,
+               struct iobuf *iobuf)
+{
+    struct fuse_fallocate_in *ffi = msg;
+    fuse_state_t *state = NULL;
 
-static void 
-fuse_releasedir (fuse_req_t req,
-		 fuse_ino_t ino,
-		 struct fuse_file_info *fi)
+    GET_STATE(this, finh, state);
+    state->off = ffi->offset;
+    state->size = ffi->length;
+    state->flags = ffi->mode;
+    state->fd = FH_TO_FD(ffi->fh);
+
+    fuse_resolve_fd_init(state, &state->resolve, state->fd);
+    fuse_resolve_and_resume(state, fuse_fallocate_resume);
+}
+#endif /* FALLOC_FL_KEEP_SIZE */
+#endif /* FUSE minor version >= 19 */
+
+/*
+ * Handler for the FUSE RELEASEDIR request.
+ *
+ * msg is read as a struct fuse_release_in and its file handle is converted
+ * with FH_TO_FD().  When that yields an fd, the fd's fuse context is
+ * destroyed, fd_unref() is called on it and it is handed to gf_fdptr_put()
+ * for priv->fdtable.  The request is always answered with error code 0 and
+ * the request state is freed, whether or not an fd was found.  iobuf is not
+ * used.
+ */
+static void
+fuse_releasedir(xlator_t *this, fuse_in_header_t *finh, void *msg,
+                struct iobuf *iobuf)
 {
-        fuse_state_t *state;
+    struct fuse_release_in *fri = msg;
+    fuse_state_t *state = NULL;
+    fuse_private_t *priv = NULL;
 
-        state = state_from_req (req);
-        state->fd = FI_TO_FD (fi);
+    GET_STATE(this, finh, state);
+    state->fd = FH_TO_FD(fri->fh);
+    if (!state->fd)
+        goto out;
 
-        gf_log ("glusterfs-fuse", GF_LOG_DEBUG,
-                "%"PRId64": RELEASEDIR %p", req_callid (req), state->fd);
-	
-	fd_unref (state->fd);
+    priv = this->private;
 
-	fuse_reply_err (req, 0);
-	
-	free_state (state);
+    fuse_log_eh(this,
+                "RELEASEDIR (): finh->unique: %" PRIu64 ": fd: %p, gfid: %s",
+                finh->unique, state->fd, uuid_utoa(state->fd->inode->gfid));
 
-        return;
+    gf_log("glusterfs-fuse", GF_LOG_TRACE,
+           "finh->unique: %" PRIu64 ": RELEASEDIR %p", finh->unique, state->fd);
+
+    fuse_fd_ctx_destroy(this, state->fd);
+    /* NOTE(review): state->fd is still passed to gf_fdptr_put() after this
+     * fd_unref(); presumably another reference keeps it alive until then -
+     * confirm against FH_TO_FD() and the fd table. */
+    fd_unref(state->fd);
+
+    gf_fdptr_put(priv->fdtable, state->fd);
+
+    /* cleared before free_fuse_state() runs on this state below */
+    state->fd = NULL;
+
+out:
+    send_fuse_err(this, finh, 0);
+
+    free_fuse_state(state);
+
+    return;
 }
 
+/*
+ * Resume step of FSYNCDIR, handed to fuse_resolve_and_resume() by
+ * fuse_fsyncdir().
+ *
+ * Invokes FUSE_FOP for GF_FOP_FSYNCDIR on state->fd with fuse_err_cbk() as
+ * the callback; only the lowest bit of state->flags is passed on.
+ */
+void
+fuse_fsyncdir_resume(fuse_state_t *state)
+{
+    FUSE_FOP(state, fuse_err_cbk, GF_FOP_FSYNCDIR, fsyncdir, state->fd,
+             (state->flags & 1), state->xdata);
+}
 
-static void 
-fuse_fsyncdir (fuse_req_t req,
-               fuse_ino_t ino,
-               int datasync,
-               struct fuse_file_info *fi)
+/*
+ * Handler for the FUSE FSYNCDIR request.
+ *
+ * msg is read as a struct fuse_fsync_in: its file handle is converted with
+ * FH_TO_FD() and its fsync_flags are stored in state->flags.  The request
+ * then goes through fd resolution and continues in fuse_fsyncdir_resume().
+ * iobuf is not used.
+ */
+static void
+fuse_fsyncdir(xlator_t *this, fuse_in_header_t *finh, void *msg,
+              struct iobuf *iobuf)
+{
+    struct fuse_fsync_in *fsi = msg;
+
+    fuse_state_t *state = NULL;
+    fd_t *fd = NULL;
+
+    /* Obtain the state before touching the file handle, as the other
+     * fd-based handlers (readdir, readdirp, fallocate, releasedir) do, so
+     * that nothing obtained from FH_TO_FD() is left behind if GET_STATE()
+     * does not continue.
+     * NOTE(review): both are macros defined outside this function -
+     * confirm GET_STATE() can return early and FH_TO_FD() takes a ref. */
+    GET_STATE(this, finh, state);
+
+    fd = FH_TO_FD(fsi->fh);
+    state->fd = fd;
+
+    fuse_resolve_fd_init(state, &state->resolve, fd);
+
+    state->flags = fsi->fsync_flags;
+    fuse_resolve_and_resume(state, fuse_fsyncdir_resume);
+
+    return;
+}
+
+/*
+ * Callback for the STATFS fop wound by fuse_statfs_resume().
+ *
+ * On success (op_ret == 0) the statvfs fields are copied into a
+ * fuse_statfs_out and sent: as a whole object when priv->proto_minor >= 4,
+ * otherwise truncated to FUSE_COMPAT_STATFS_SIZE bytes.  On failure the
+ * errno is sent, with ENOENT replaced by ESTALE.  The request state and the
+ * call stack are released in both cases.
+ *
+ * Always returns 0.
+ */
+static int
+fuse_statfs_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+                int32_t op_ret, int32_t op_errno, struct statvfs *buf,
+                dict_t *xdata)
 {
-        fuse_state_t *state;
-	fd_t *fd = NULL;
-	
-	fd = FI_TO_FD (fi);
+    fuse_state_t *state = NULL;
+    fuse_in_header_t *finh = NULL;
+    fuse_private_t *priv = NULL;
+    struct fuse_statfs_out fso = {
+        {
+            0,
+        },
+    };
+
+    state = frame->root->state;
+    priv = this->private;
+    finh = state->finh;
+
+    fuse_log_eh(this, "op_ret: %d, op_errno: %d, %" PRIu64 ": %s()", op_ret,
+                op_errno, frame->root->unique, gf_fop_list[frame->root->op]);
+
+    if (op_ret == 0) {
+        fso.st.bsize = buf->f_bsize;
+        fso.st.frsize = buf->f_frsize;
+        fso.st.blocks = buf->f_blocks;
+        fso.st.bfree = buf->f_bfree;
+        fso.st.bavail = buf->f_bavail;
+        fso.st.files = buf->f_files;
+        fso.st.ffree = buf->f_ffree;
+        fso.st.namelen = buf->f_namemax;
+
+        /* protocol minor < 4 gets the shorter compat-sized reply */
+        priv->proto_minor >= 4
+            ? send_fuse_obj(this, finh, &fso)
+            : send_fuse_data(this, finh, &fso, FUSE_COMPAT_STATFS_SIZE);
+    } else {
+        /* facilitate retry from VFS */
+        if (op_errno == ENOENT)
+            op_errno = ESTALE;
+
+        gf_log("glusterfs-fuse", GF_LOG_WARNING, "%" PRIu64 ": ERR => -1 (%s)",
+               frame->root->unique, strerror(op_errno));
+
+        send_fuse_err(this, finh, op_errno);
+    }
+
+    free_fuse_state(state);
+    STACK_DESTROY(frame->root);
+
+    return 0;
+}
 
-        state = state_from_req (req);
-	state->fd = fd;
+/*
+ * Resume step of STATFS, handed to fuse_resolve_and_resume() by
+ * fuse_statfs().
+ *
+ * When resolution left state->loc.inode unset, the resolve errno is sent to
+ * the kernel (ENOENT replaced by ESTALE) and the state is freed.  Otherwise
+ * FUSE_FOP is invoked for GF_FOP_STATFS on state->loc with fuse_statfs_cbk()
+ * as the callback.
+ */
+void
+fuse_statfs_resume(fuse_state_t *state)
+{
+    if (!state->loc.inode) {
+        gf_log("glusterfs-fuse", GF_LOG_WARNING,
+               "%" PRIu64 ": STATFS (%s) resolution fail", state->finh->unique,
+               uuid_utoa(state->resolve.gfid));
 
-        FUSE_FOP (state, fuse_err_cbk, GF_FOP_FSYNCDIR,
-                  fsyncdir, fd, datasync);
+        /* facilitate retry from VFS */
+        if (state->resolve.op_errno == ENOENT)
+            state->resolve.op_errno = ESTALE;
 
+        send_fuse_err(state->this, state->finh, state->resolve.op_errno);
+        free_fuse_state(state);
         return;
+    }
+
+    gf_log("glusterfs-fuse", GF_LOG_TRACE, "%" PRIu64 ": STATFS",
+           state->finh->unique);
+
+    FUSE_FOP(state, fuse_statfs_cbk, GF_FOP_STATFS, statfs, &state->loc,
+             state->xdata);
 }
 
+/*
+ * Handler for the FUSE STATFS request.
+ *
+ * Obtains the request state through GET_STATE(), initialises inode
+ * resolution for finh->nodeid and continues in fuse_statfs_resume() through
+ * fuse_resolve_and_resume().  Neither msg nor iobuf is read here.
+ */
+static void
+fuse_statfs(xlator_t *this, fuse_in_header_t *finh, void *msg,
+            struct iobuf *iobuf)
+{
+    fuse_state_t *state = NULL;
 
-static int
-fuse_statfs_cbk (call_frame_t *frame,
-                 void *cookie,
-                 xlator_t *this,
-                 int32_t op_ret,
-                 int32_t op_errno,
-                 struct statvfs *buf)
+    GET_STATE(this, finh, state);
+
+    fuse_resolve_inode_init(state, &state->resolve, finh->nodeid);
+
+    fuse_resolve_and_resume(state, fuse_statfs_resume);
+}
+
+/*
+ * Resume step of SETXATTR, handed to fuse_resolve_and_resume() by
+ * fuse_setxattr().
+ *
+ * When resolution left state->loc.inode unset, the resolve errno is sent to
+ * the kernel (ENOENT replaced by ESTALE) and the state is freed.  Otherwise
+ * the xattr dictionary in state->xattr is wound as fsetxattr when the state
+ * carries an fd, or as setxattr on state->loc when it does not;
+ * fuse_setxattr_cbk() is the callback in both cases.
+ */
+void
+fuse_setxattr_resume(fuse_state_t *state)
+{
+    if (state->loc.inode == NULL) {
+        gf_log("glusterfs-fuse", GF_LOG_WARNING,
+               "%" PRIu64 ": SETXATTR %s/%" PRIu64
+               " (%s) "
+               "resolution failed",
+               state->finh->unique, uuid_utoa(state->resolve.gfid),
+               state->finh->nodeid, state->name);
+
+        /* facilitate retry from VFS */
+        if (state->resolve.op_errno == ENOENT)
+            state->resolve.op_errno = ESTALE;
+
+        send_fuse_err(state->this, state->finh, state->resolve.op_errno);
+        free_fuse_state(state);
+        return;
+    }
+
+#ifdef GF_TEST_FFOP
+    state->fd = fd_lookup(state->loc.inode, state->finh->pid);
+#endif /* GF_TEST_FFOP */
+
+    if (!state->fd) {
+        /* path based variant */
+        gf_log("glusterfs-fuse", GF_LOG_TRACE,
+               "%" PRIu64 ": SETXATTR %s/%" PRIu64 " (%s)", state->finh->unique,
+               state->loc.path, state->finh->nodeid, state->name);
+
+        FUSE_FOP(state, fuse_setxattr_cbk, GF_FOP_SETXATTR, setxattr,
+                 &state->loc, state->xattr, state->flags, state->xdata);
+        return;
+    }
+
+    /* fd based variant */
+    gf_log("glusterfs-fuse", GF_LOG_TRACE,
+           "%" PRIu64 ": SETXATTR %p/%" PRIu64 " (%s)", state->finh->unique,
+           state->fd, state->finh->nodeid, state->name);
+
+    FUSE_FOP(state, fuse_setxattr_cbk, GF_FOP_FSETXATTR, fsetxattr, state->fd,
+             state->xattr, state->flags, state->xdata);
+}
+
+/*
+ * Handler for the FUSE SETXATTR request.
+ *
+ * msg is read as a struct fuse_setxattr_in immediately followed by the
+ * attribute name and then the value (fsi->size bytes).  A number of names
+ * are answered right here without winding a fop: names accepted by
+ * fuse_ignore_xattr_set(), POSIX ACL keys while priv->acl is off, names
+ * rejected by fuse_check_selinux_cap_xattr(), log-level commands,
+ * "inode-invalidate", and the gfid / volume-id keys.  For everything else
+ * the (possibly namespace-flipped) key and a NUL-terminated copy of the
+ * value are stored in state->xattr and the request continues in
+ * fuse_setxattr_resume().
+ *
+ * The 'done' label replies with op_errno (0 unless set) and frees the
+ * request state.  iobuf is not used.
+ */
+static void
+fuse_setxattr(xlator_t *this, fuse_in_header_t *finh, void *msg,
+              struct iobuf *iobuf)
 {
-        fuse_state_t *state = frame->root->state;
-        fuse_req_t req = state->req;
+    struct fuse_setxattr_in *fsi = msg;
+    /* NOTE(review): assumes the name following the header is
+     * NUL-terminated inside msg - confirm the caller guarantees it */
+    char *name = (char *)(fsi + 1);
+    char *value = name + strlen(name) + 1;
+    struct fuse_private *priv = NULL;
+
+    fuse_state_t *state = NULL;
+    char *dict_value = NULL;
+    int32_t ret = -1;
+    int32_t op_errno = 0;
+    char *newkey = NULL;
+
+    priv = this->private;
+
+    GET_STATE(this, finh, state);
+
+#ifdef GF_DARWIN_HOST_OS
+    if (fsi->position) {
+        gf_log("glusterfs-fuse", GF_LOG_WARNING,
+               "%" PRIu64 ": SETXATTR %s/%" PRIu64
+               " (%s):"
+               "refusing positioned setxattr",
+               finh->unique, state->loc.path, finh->nodeid, name);
+        op_errno = EINVAL;
+        goto done;
+    }
+#endif
+
+    /* op_errno is still 0 here, so this request is answered as a success */
+    if (fuse_ignore_xattr_set(priv, name)) {
+        goto done;
+    }
+
+    if (!priv->acl) {
+        if ((strcmp(name, POSIX_ACL_ACCESS_XATTR) == 0) ||
+            (strcmp(name, POSIX_ACL_DEFAULT_XATTR) == 0)) {
+            op_errno = EOPNOTSUPP;
+            goto done;
+        }
+    }
+
+    ret = fuse_check_selinux_cap_xattr(priv, name);
+    if (ret) {
+        op_errno = EOPNOTSUPP;
+        goto done;
+    }
+
+    /* Check if the command is for changing the log
+       level of process or specific xlator */
+    ret = is_gf_log_command(this, name, value, fsi->size);
+    if (ret >= 0) {
+        /* a non-negative result is used directly as the reply errno */
+        op_errno = ret;
+        goto done;
+    }
+
+    if (!strcmp("inode-invalidate", name)) {
+        gf_log("fuse", GF_LOG_TRACE, "got request to invalidate %" PRIu64,
+               finh->nodeid);
+#if FUSE_KERNEL_MINOR_VERSION >= 11
+        ret = fuse_invalidate_entry(this, finh->nodeid);
+        if (ret)
+            op_errno = EBUSY;
+#endif
+        goto done;
+    }
 
+    /* the gfid and volume-id keys may not be set through the mount */
+    if (!strcmp(GFID_XATTR_KEY, name) || !strcmp(GF_XATTR_VOL_ID_KEY, name)) {
+        op_errno = EPERM;
+        goto done;
+    }
+
+    state->size = fsi->size;
+
+    fuse_resolve_inode_init(state, &state->resolve, finh->nodeid);
+
+    state->xattr = dict_new();
+    if (!state->xattr) {
+        gf_log("glusterfs-fuse", GF_LOG_ERROR,
+               "%" PRIu64 ": SETXATTR dict allocation failed", finh->unique);
+        op_errno = ENOMEM;
+        goto done;
+    }
+
+    ret = fuse_flip_xattr_ns(priv, name, &newkey);
+    if (ret) {
+        op_errno = ENOMEM;
+        goto done;
+    }
+
+    if (fsi->size > 0) {
         /*
-          Filesystems (like ZFS on solaris) reports
-          different ->f_frsize and ->f_bsize. Old coreutils
-          df tools use statfs() and do not see ->f_frsize.
-          the ->f_blocks, ->f_bavail and ->f_bfree are
-          w.r.t ->f_frsize and not ->f_bsize which makes the
-          df tools report wrong values.
-
-          Scale the block counts to match ->f_bsize.
-        */
-        /* TODO: with old coreutils, f_bsize is taken from stat()'s st_blksize
-         * so the df with old coreutils this wont work :(
+         * Many translators expect setxattr values to be strings, but
+         * neither dict_get_str nor data_to_str do any checking or
+         * fixups to make sure that's the case.  To avoid nasty
+         * surprises, allocate an extra byte and add a NUL here.
          */
+        dict_value = GF_MALLOC(fsi->size + 1, gf_common_mt_char);
+        if (dict_value == NULL) {
+            gf_log("glusterfs-fuse", GF_LOG_ERROR,
+                   "%" PRIu64 ": SETXATTR value allocation failed",
+                   finh->unique);
+            op_errno = ENOMEM;
+            GF_FREE(newkey);
+            goto done;
+        }
+        memcpy(dict_value, value, fsi->size);
+        dict_value[fsi->size] = '\0';
+    }
+    /* the stored length is fsi->size, i.e. without the extra NUL; for a
+     * zero-sized value dict_value is still NULL here */
+    ret = dict_set_dynptr(state->xattr, newkey, dict_value, fsi->size);
+    if (ret < 0) {
+        op_errno = -ret;
+        GF_FREE(dict_value);
+        GF_FREE(newkey);
+        goto done;
+    }
+
+    state->flags = fsi->flags;
+    /* newkey is kept in the state from here on and no longer freed here */
+    state->name = newkey;
+
+    fuse_resolve_and_resume(state, fuse_setxattr_resume);
+
+    return;
+
+done:
+    send_fuse_err(this, finh, op_errno);
+    free_fuse_state(state);
+}
+
+/*
+ * Send the reply for a getxattr/listxattr style request.
+ *
+ * this, finh - xlator and request header the reply belongs to
+ * value      - attribute data to send (only read when expected != 0)
+ * size       - length of the attribute data
+ * expected   - buffer size announced by the kernel; 0 means the kernel only
+ *              asks for the required size
+ *
+ * Replies with E2BIG when size exceeds GLUSTERFS_XATTR_LEN_MAX, with ERANGE
+ * when the data does not fit into 'expected' bytes, with the data itself
+ * when it fits, and with a fuse_getxattr_out carrying 'size' when
+ * expected == 0.
+ */
+static void
+send_fuse_xattr(xlator_t *this, fuse_in_header_t *finh, const char *value,
+                size_t size, size_t expected)
+{
+    /* zero-initialised: the whole object is handed to send_fuse_obj()
+     * below while only .size is assigned, so no member may be left
+     * indeterminate */
+    struct fuse_getxattr_out fgxo = {
+        0,
+    };
+
+    /* linux kernel limits the size of xattr value to 64k */
+    if (size > GLUSTERFS_XATTR_LEN_MAX)
+        send_fuse_err(this, finh, E2BIG);
+    else if (expected) {
+        /* if callback for getxattr and asks for value */
+        if (size > expected)
+            /* reply would be bigger than
+             * what was asked by kernel */
+            send_fuse_err(this, finh, ERANGE);
+        else
+            send_fuse_data(this, finh, (void *)value, size);
+    } else {
+        /* size query only: report how many bytes would be needed */
+        fgxo.size = size;
+        send_fuse_obj(this, finh, &fgxo);
+    }
+}
+
+/* Filter out xattrs that need not be visible on the mount point.
+ * This is _specifically_ for geo-rep as of now, to prevent rsync from
+ * complaining when it tries to setxattr() for selinux xattrs.
+ *
+ * Returns 1 when the key must be hidden, i.e. when the client pid of the
+ * current xlator's private data (THIS->private) is GF_CLIENT_PID_GSYNCD and
+ * the key matches the pattern "*.selinux*"; returns 0 otherwise.
+ */
+static int
+fuse_filter_xattr(char *key)
+{
+    int need_filter = 0;
+    struct fuse_private *priv = THIS->private;
+
+    if ((priv->client_pid == GF_CLIENT_PID_GSYNCD) &&
+        fnmatch("*.selinux*", key, FNM_PERIOD) == 0)
+        need_filter = 1;
 
-        if (op_ret == 0) {
-#ifndef GF_DARWIN_HOST_OS
-                /* MacFUSE doesn't respect anyof these tweaks */
-                buf->f_blocks *= buf->f_frsize;
-                buf->f_blocks /= BIG_FUSE_CHANNEL_SIZE;
+    return need_filter;
+}
 
-                buf->f_bavail *= buf->f_frsize;
-                buf->f_bavail /= BIG_FUSE_CHANNEL_SIZE;
+/*
+ * Callback shared by the getxattr and listxattr style fops.
+ *
+ * On success with state->name set (getxattr) the value stored under that
+ * key in 'dict' is sent through send_fuse_xattr(), or ENODATA when the key
+ * is absent.  On success without a name (listxattr) the keys of 'dict' that
+ * pass fuse_filter_xattr() are joined into one buffer and sent.  On failure
+ * the errno is logged and sent to the kernel; ENOENT becomes ESTALE when the
+ * request was not fd based.
+ *
+ * Every path sends exactly one reply before the request state and the call
+ * stack are released.  Always returns 0.
+ */
+static int
+fuse_xattr_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+               int32_t op_ret, int32_t op_errno, dict_t *dict, dict_t *xdata)
+{
+    char *value = "";
+    fuse_state_t *state = NULL;
+    fuse_in_header_t *finh = NULL;
+    data_t *value_data = NULL;
+    int ret = -1;
+    int32_t len = 0;
+    int32_t len_next = 0;
 
-                buf->f_bfree *= buf->f_frsize;
-                buf->f_bfree /= BIG_FUSE_CHANNEL_SIZE;
+    state = frame->root->state;
+    finh = state->finh;
 
-                buf->f_frsize = buf->f_bsize = BIG_FUSE_CHANNEL_SIZE;
-#endif /* GF_DARWIN_HOST_OS */
-                fuse_reply_statfs (req, buf);
+    fuse_log_eh_fop(this, state, frame, op_ret, op_errno);
 
+    if (op_ret >= 0) {
+        gf_log("glusterfs-fuse", GF_LOG_TRACE, "%" PRIu64 ": %s() %s => %d",
+               frame->root->unique, gf_fop_list[frame->root->op],
+               state->loc.path, op_ret);
+
+        /* if successful */
+        if (state->name) {
+            /* if callback for getxattr */
+            value_data = dict_get(dict, state->name);
+            if (value_data) {
+                ret = value_data->len; /* Don't return the value for '\0' */
+                value = value_data->data;
+
+                send_fuse_xattr(this, finh, value, ret, state->size);
+                /* if(ret >...)...else if...else */
+            } else {
+                send_fuse_err(this, finh, ENODATA);
+            } /* if(value_data)...else */
         } else {
-                gf_log ("glusterfs-fuse", GF_LOG_ERROR,
-                        "%"PRId64": ERR => -1 (%s)", frame->root->unique, 
-                        strerror(op_errno));
-                fuse_reply_err (req, op_errno);
+            /* if callback for listxattr */
+            /* we need to invoke fuse_filter_xattr() twice. Once
+             * while counting size and then while filling buffer
+             */
+            len = dict_keys_join(NULL, 0, dict, fuse_filter_xattr);
+            if (len < 0) {
+                /* the request must still be answered, otherwise the
+                 * caller is left waiting; EIO is a generic choice as the
+                 * meaning of a negative length is not visible here */
+                send_fuse_err(this, finh, EIO);
+                goto out;
+            }
+
+            value = alloca(len + 1);
+            if (!value) {
+                send_fuse_err(this, finh, ENOMEM);
+                goto out;
+            }
+
+            len_next = dict_keys_join(value, len, dict, fuse_filter_xattr);
+            if (len_next != len)
+                gf_log(THIS->name, GF_LOG_ERROR, "sizes not equal %d != %d",
+                       len, len_next);
+
+            send_fuse_xattr(this, finh, value, len, state->size);
+        } /* if(state->name)...else */
+    } else {
+        /* facilitate retry from VFS */
+        if ((state->fd == NULL) && (op_errno == ENOENT)) {
+            op_errno = ESTALE;
         }
 
-        free_state (state);
-        STACK_DESTROY (frame->root);
+        /* if failure - no need to check if listxattr or getxattr */
+        if (op_errno != ENODATA && op_errno != ENOATTR) {
+            if (op_errno == ENOTSUP) {
+                GF_LOG_OCCASIONALLY(gf_fuse_xattr_enotsup_log, "glusterfs-fuse",
+                                    GF_LOG_ERROR,
+                                    "extended attribute not "
+                                    "supported by the backend "
+                                    "storage");
+            } else {
+                gf_log("glusterfs-fuse", GF_LOG_WARNING,
+                       "%" PRIu64 ": %s(%s) %s => -1 (%s)", frame->root->unique,
+                       gf_fop_list[frame->root->op], state->name,
+                       state->loc.path, strerror(op_errno));
+            }
+        } else {
+            /* a missing attribute is only logged at DEBUG level */
+            gf_log("glusterfs-fuse", GF_LOG_DEBUG,
+                   "%" PRIu64 ": %s(%s) %s => -1 (%s)", frame->root->unique,
+                   gf_fop_list[frame->root->op], state->name, state->loc.path,
+                   strerror(op_errno));
+        } /* if(op_errno!= ENODATA)...else */
+
+        send_fuse_err(this, finh, op_errno);
+    } /* if(op_ret>=0)...else */
 
-        return 0;
+out:
+    free_fuse_state(state);
+    STACK_DESTROY(frame->root);
+
+    return 0;
 }
 
+/*
+ * Resume step of GETXATTR, handed to fuse_resolve_and_resume().
+ *
+ * When resolution left state->loc.inode unset, the resolve errno is sent to
+ * the kernel (ENOENT replaced by ESTALE) and the state is freed.  The two
+ * virtual gfid keys are answered locally from state->loc.inode->gfid: in
+ * binary form for VIRTUAL_GFID_XATTR_KEY and in canonical text form for
+ * VIRTUAL_GFID_XATTR_KEY_STR.  Every other name is wound as fgetxattr when
+ * the state carries an fd, or as getxattr on state->loc when it does not,
+ * with fuse_xattr_cbk() as the callback.
+ */
+void
+fuse_getxattr_resume(fuse_state_t *state)
+{
+    char *gfid_buf = NULL;
+
+    if (state->loc.inode == NULL) {
+        gf_log("glusterfs-fuse", GF_LOG_WARNING,
+               "%" PRIu64 ": GETXATTR %s/%" PRIu64
+               " (%s) "
+               "resolution failed",
+               state->finh->unique, uuid_utoa(state->resolve.gfid),
+               state->finh->nodeid, state->name);
+
+        /* facilitate retry from VFS */
+        if (state->resolve.op_errno == ENOENT)
+            state->resolve.op_errno = ESTALE;
+
+        send_fuse_err(state->this, state->finh, state->resolve.op_errno);
+        free_fuse_state(state);
+        return;
+    }
+
+#ifdef GF_TEST_FFOP
+    state->fd = fd_lookup(state->loc.inode, state->finh->pid);
+#endif /* GF_TEST_FFOP */
+
+    if (state->name && (strcmp(state->name, VIRTUAL_GFID_XATTR_KEY) == 0)) {
+        /* send glusterfs gfid in binary form */
+        gfid_buf = GF_MALLOC(16 + 1, gf_common_mt_char);
+        if (gfid_buf == NULL) {
+            send_fuse_err(state->this, state->finh, ENOMEM);
+        } else {
+            memcpy(gfid_buf, state->loc.inode->gfid, 16);
+            gfid_buf[16] = '\0';
+
+            send_fuse_xattr(THIS, state->finh, gfid_buf, 16, state->size);
+            GF_FREE(gfid_buf);
+        }
+
+        free_fuse_state(state);
+        return;
+    }
+
+    if (state->name && (strcmp(state->name, VIRTUAL_GFID_XATTR_KEY_STR) == 0)) {
+        /* transform binary gfid to canonical form */
+        gfid_buf = GF_CALLOC(UUID_CANONICAL_FORM_LEN + 1, sizeof(char),
+                             gf_common_mt_char);
+        if (gfid_buf == NULL) {
+            send_fuse_err(state->this, state->finh, ENOMEM);
+        } else {
+            uuid_utoa_r(state->loc.inode->gfid, gfid_buf);
+
+            send_fuse_xattr(THIS, state->finh, gfid_buf,
+                            UUID_CANONICAL_FORM_LEN, state->size);
+            GF_FREE(gfid_buf);
+        }
+
+        free_fuse_state(state);
+        return;
+    }
+
+    if (!state->fd) {
+        /* path based variant */
+        gf_log("glusterfs-fuse", GF_LOG_TRACE,
+               "%" PRIu64 ": GETXATTR %s/%" PRIu64 " (%s)", state->finh->unique,
+               state->loc.path, state->finh->nodeid, state->name);
+
+        FUSE_FOP(state, fuse_xattr_cbk, GF_FOP_GETXATTR, getxattr, &state->loc,
+                 state->name, state->xdata);
+        return;
+    }
+
+    /* fd based variant */
+    gf_log("glusterfs-fuse", GF_LOG_TRACE,
+           "%" PRIu64 ": GETXATTR %p/%" PRIu64 " (%s)", state->finh->unique,
+           state->fd, state->finh->nodeid, state->name);
+
+    FUSE_FOP(state, fuse_xattr_cbk, GF_FOP_FGETXATTR, fgetxattr, state->fd,
+             state->name, state->xdata);
+}
 
 static void
-fuse_statfs (fuse_req_t req,
-             fuse_ino_t ino)
+fuse_getxattr(xlator_t *this, fuse_in_header_t *finh, void *msg,
+              struct iobuf *iobuf)
 {
-        fuse_state_t *state;
-	int32_t ret = -1;
+    struct fuse_getxattr_in *fgxi = msg;
+    char *name = (char *)(fgxi + 1);
+    fuse_state_t *state = NULL;
+    struct fuse_private *priv = NULL;
+    int rv = 0;
+    int op_errno = EINVAL;
+    char *newkey = NULL;
+    int ret = 0;
+
+    priv = this->private;
+    GET_STATE(this, finh, state);
+
+#ifdef GF_DARWIN_HOST_OS
+    if (fgxi->position) {
+        /* position can be used only for
+         * resource fork queries which we
+         * don't support anyway... so handling
+         * it separately is just sort of a
+         * matter of aesthetics, not strictly
+         * necessary.
+         */
+
+        gf_log("glusterfs-fuse", GF_LOG_WARNING,
+               "%" PRIu64 ": GETXATTR %s/%" PRIu64
+               " (%s):"
+               "refusing positioned getxattr",
+               finh->unique, state->loc.path, finh->nodeid, name);
+        op_errno = EINVAL;
+        goto err;
+    }
+#endif
 
-        state = state_from_req (req);
-        ret = fuse_loc_fill (&state->loc, state, 1, 0, NULL);
-        if ((state->loc.inode == NULL) ||
-	    (ret < 0)) {
-                gf_log ("glusterfs-fuse", GF_LOG_ERROR,
-                        "%"PRId64": STATFS (fuse_loc_fill() fail)",
-			req_callid (req));
-    
-                fuse_reply_err (req, EINVAL);
-		free_state (state);
-                return;
+    if (!priv->acl) {
+        if ((strcmp(name, POSIX_ACL_ACCESS_XATTR) == 0) ||
+            (strcmp(name, POSIX_ACL_DEFAULT_XATTR) == 0)) {
+            op_errno = ENOTSUP;
+            goto err;
         }
+    }
+
+    ret = fuse_check_selinux_cap_xattr(priv, name);
+    if (ret) {
+        op_errno = ENODATA;
+        goto err;
+    }
+
+    fuse_resolve_inode_init(state, &state->resolve, finh->nodeid);
+
+    rv = fuse_flip_xattr_ns(priv, name, &newkey);
+    if (rv) {
+        op_errno = ENOMEM;
+        goto err;
+    }
+
+    state->size = fgxi->size;
+    state->name = newkey;
 
-        gf_log ("glusterfs-fuse", GF_LOG_DEBUG,
-                "%"PRId64": STATFS", req_callid (req));
+    fuse_resolve_and_resume(state, fuse_getxattr_resume);
 
-        FUSE_FOP (state, fuse_statfs_cbk, GF_FOP_STATFS,
-                  statfs, &state->loc);
+    return;
+err:
+    send_fuse_err(this, finh, op_errno);
+    free_fuse_state(state);
+    return;
 }
 
+/*
+ * Continuation of LISTXATTR, run after inode resolution.
+ *
+ * With no resolved inode, the resolve errno goes back to the kernel (ENOENT
+ * turned into ESTALE) and the state is released.  Otherwise the request is
+ * wound as fgetxattr when state->fd is set, else as getxattr on state->loc;
+ * in both cases the name argument is NULL and fuse_xattr_cbk is passed as
+ * the callback.
+ */
+void
+fuse_listxattr_resume(fuse_state_t *state)
+{
+    if (!state->loc.inode) {
+        /* keep a space before "resolution failed" so it does not run into
+         * the nodeid in the log line */
+        gf_log("glusterfs-fuse", GF_LOG_WARNING,
+               "%" PRIu64 ": LISTXATTR %s/%" PRIu64 " resolution failed",
+               state->finh->unique, uuid_utoa(state->resolve.gfid),
+               state->finh->nodeid);
+
+        /* facilitate retry from VFS */
+        if (state->resolve.op_errno == ENOENT)
+            state->resolve.op_errno = ESTALE;
+
+        send_fuse_err(state->this, state->finh, state->resolve.op_errno);
+        free_fuse_state(state);
+        return;
+    }
+
+#ifdef GF_TEST_FFOP
+    state->fd = fd_lookup(state->loc.inode, state->finh->pid);
+#endif /* GF_TEST_FFOP */
+
+    if (state->fd) {
+        gf_log("glusterfs-fuse", GF_LOG_TRACE,
+               "%" PRIu64 ": LISTXATTR %p/%" PRIu64, state->finh->unique,
+               state->fd, state->finh->nodeid);
+
+        FUSE_FOP(state, fuse_xattr_cbk, GF_FOP_FGETXATTR, fgetxattr, state->fd,
+                 NULL, state->xdata);
+    } else {
+        gf_log("glusterfs-fuse", GF_LOG_TRACE,
+               "%" PRIu64 ": LISTXATTR %s/%" PRIu64, state->finh->unique,
+               state->loc.path, state->finh->nodeid);
+
+        FUSE_FOP(state, fuse_xattr_cbk, GF_FOP_GETXATTR, getxattr, &state->loc,
+                 NULL, state->xdata);
+    }
+}
 
 static void
-fuse_setxattr (fuse_req_t req,
-               fuse_ino_t ino,
-               const char *name,
-               const char *value,
-               size_t size,
-               int flags)
-{
-        fuse_state_t *state;
-	char *dict_value = NULL;
-	int32_t ret = -1;
-
-#ifdef DISABLE_POSIX_ACL
-	if (!strncmp (name, "system.", 7)) {
-		fuse_reply_err (req, EOPNOTSUPP);
-		return;
-	}
-#endif
+fuse_listxattr(xlator_t *this, fuse_in_header_t *finh, void *msg,
+               struct iobuf *iobuf)
+{
+    struct fuse_getxattr_in *fgxi = msg;
+    fuse_state_t *state = NULL;
 
-        state = state_from_req (req);
-        state->size = size;
-        ret = fuse_loc_fill (&state->loc, state, ino, 0, NULL);
-        if ((state->loc.inode == NULL) ||
-	    (ret < 0)) {
-                gf_log ("glusterfs-fuse", GF_LOG_ERROR,
-                        "%"PRId64": SETXATTR %s/%"PRId64" (%s) (fuse_loc_fill() failed)", 
-                        req_callid (req),
-                        state->loc.path, (int64_t)ino, name);
+    GET_STATE(this, finh, state);
 
-                fuse_reply_err (req, EINVAL);
-		free_state (state);
-                return;
-        }
+    fuse_resolve_inode_init(state, &state->resolve, finh->nodeid);
 
-        state->dict = get_new_dict ();
+    state->size = fgxi->size;
 
-	dict_value = memdup (value, size);
-        dict_set (state->dict, (char *)name,
-                  data_from_dynptr ((void *)dict_value, size));
-        dict_ref (state->dict);
+    fuse_resolve_and_resume(state, fuse_listxattr_resume);
 
-        gf_log ("glusterfs-fuse", GF_LOG_DEBUG,
-                "%"PRId64": SETXATTR %s/%"PRId64" (%s)", req_callid (req),
-                state->loc.path, (int64_t)ino, name);
+    return;
+}
 
-        FUSE_FOP (state, fuse_err_cbk, GF_FOP_SETXATTR,
-                  setxattr, &state->loc, state->dict, flags);
+/*
+ * Continuation of REMOVEXATTR, run after inode resolution.
+ *
+ * With no resolved inode, the resolve errno goes back to the kernel (ENOENT
+ * turned into ESTALE) and the state is released.  Otherwise state->name is
+ * removed through fremovexattr when state->fd is set, else through
+ * removexattr on state->loc, with fuse_removexattr_cbk passed as the
+ * callback.
+ */
+void
+fuse_removexattr_resume(fuse_state_t *state)
+{
+    if (!state->loc.inode) {
+        gf_log("glusterfs-fuse", GF_LOG_DEBUG,
+               "%" PRIu64 ": REMOVEXATTR %s/%" PRIu64
+               " (%s) "
+               "resolution failed",
+               state->finh->unique, uuid_utoa(state->resolve.gfid),
+               state->finh->nodeid, state->name);
+
+        /* report ESTALE rather than ENOENT to facilitate retry from VFS */
+        if (state->resolve.op_errno == ENOENT)
+            state->resolve.op_errno = ESTALE;
+
+        send_fuse_err(state->this, state->finh, state->resolve.op_errno);
+        free_fuse_state(state);
+        return;
+    }
+
+#ifdef GF_TEST_FFOP
+    state->fd = fd_lookup(state->loc.inode, state->finh->pid);
+#endif /* GF_TEST_FFOP */
+
+    if (!state->fd) {
+        /* no fd available: path-based removexattr */
+        gf_log("glusterfs-fuse", GF_LOG_TRACE,
+               "%" PRIu64 ": REMOVEXATTR %s/%" PRIu64 " (%s)",
+               state->finh->unique, state->loc.path, state->finh->nodeid,
+               state->name);
+
+        FUSE_FOP(state, fuse_removexattr_cbk, GF_FOP_REMOVEXATTR, removexattr,
+                 &state->loc, state->name, state->xdata);
+        return;
+    }
+
+    gf_log("glusterfs-fuse", GF_LOG_TRACE,
+           "%" PRIu64 ": REMOVEXATTR %p/%" PRIu64 " (%s)",
+           state->finh->unique, state->fd, state->finh->nodeid,
+           state->name);
+
+    FUSE_FOP(state, fuse_removexattr_cbk, GF_FOP_FREMOVEXATTR, fremovexattr,
+             state->fd, state->name, state->xdata);
+}
+
+/*
+ * Entry point for the REMOVEXATTR request.
+ *
+ * @msg is used directly as the attribute name.  Removal of GFID_XATTR_KEY
+ * and GF_XATTR_VOL_ID_KEY is refused with EPERM.  For any other name the
+ * key is passed through fuse_flip_xattr_ns() and the request continues in
+ * fuse_removexattr_resume() once the inode (finh->nodeid) is resolved.
+ */
+static void
+fuse_removexattr(xlator_t *this, fuse_in_header_t *finh, void *msg,
+                 struct iobuf *iobuf)
+{
+    char *name = msg;
+
+    fuse_state_t *state = NULL;
+    fuse_private_t *priv = NULL;
+    int32_t ret = -1;
+    char *newkey = NULL;
+
+    if (!strcmp(GFID_XATTR_KEY, name) || !strcmp(GF_XATTR_VOL_ID_KEY, name)) {
+        send_fuse_err(this, finh, EPERM);
+        /* no fuse_state_t has been created yet on this path; finh is
+         * released here directly */
+        GF_FREE(finh);
+        return;
+    }
+
+    priv = this->private;
+
+    GET_STATE(this, finh, state);
 
+    fuse_resolve_inode_init(state, &state->resolve, finh->nodeid);
+
+    /* any failure of the key conversion is reported as ENOMEM */
+    ret = fuse_flip_xattr_ns(priv, name, &newkey);
+    if (ret) {
+        send_fuse_err(this, finh, ENOMEM);
+        free_fuse_state(state);
         return;
+    }
+
+    /* the resume handler and its log messages use the converted key */
+    state->name = newkey;
+
+    fuse_resolve_and_resume(state, fuse_removexattr_resume);
+    return;
 }
 
+/*
+ * Number of ENOSYS results seen by the GETLK/SETLK callbacks; the "not
+ * supported" message is logged only when this count is a multiple of
+ * GF_UNIVERSAL_ANSWER.
+ */
+static int gf_fuse_lk_enosys_log;
 
 static int
-fuse_xattr_cbk (call_frame_t *frame,
-                void *cookie,
-                xlator_t *this,
-                int32_t op_ret,
-                int32_t op_errno,
-                dict_t *dict)
-{
-	int need_to_free_dict = 0;
-        int32_t ret = op_ret;
-        char *value = "";
-        fuse_state_t *state = frame->root->state;
-        fuse_req_t req = state->req;
+fuse_getlk_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+               int32_t op_ret, int32_t op_errno, struct gf_flock *lock,
+               dict_t *xdata)
+{
+    fuse_state_t *state = NULL;
 
-#ifdef GF_DARWIN_HOST_OS
-	/* This is needed in MacFuse, where MacOSX Finder needs some specific 
-	 * keys to be supported from FS
-	 */
-	int32_t dummy_ret = 0;
-	if (state->name) {
-		if (!dict) {
-			dict = get_new_dict ();
-			need_to_free_dict = 1;
-		}
-		dummy_ret = gf_compat_getxattr (state->name, dict);
-		if (dummy_ret != -1)
-			ret = dummy_ret;
-	} else {
-		if (!dict) {
-			dict = get_new_dict ();
-			need_to_free_dict = 1;
-		}
-		dummy_ret = gf_compat_listxattr (ret, dict, state->size);
-		if (dummy_ret != -1)
-			ret = dummy_ret;
-	}
-#endif /* DARWIN */
-	
-        if (ret >= 0) {
-                gf_log ("glusterfs-fuse", GF_LOG_DEBUG,
-                        "%"PRId64": %s() %s => %d", frame->root->unique,
-                        gf_fop_list[frame->root->op], state->loc.path, op_ret);
-
-                /* if successful */
-                if (state->name) {
-                        /* if callback for getxattr */
-                        data_t *value_data = dict_get (dict, state->name);
-                        if (value_data) {
-                                ret = value_data->len; /* Don't return the value for '\0' */
-                                value = value_data->data;
-        
-                                /* linux kernel limits the size of xattr value to 64k */
-                                if (ret > GLUSTERFS_XATTR_LEN_MAX) {
-                                        fuse_reply_err (req, E2BIG);
-                                } else if (state->size) {
-                                        /* if callback for getxattr and asks for value */
-                                        fuse_reply_buf (req, value, ret);
-                                } else {
-                                        /* if callback for getxattr and asks for value length only */
-                                        fuse_reply_xattr (req, ret);
-                                } /* if(ret >...)...else if...else */
-                        }  else if (!strcmp (state->name, "user.glusterfs-booster-volfile")) {
-				fuse_private_t *priv = this->private;
- 
-				if (!priv->volfile) {
-					int32_t fd = -1, ret = -1;
-					struct stat st;
-					char *file = NULL;
-					
-					memset (&st, 0, sizeof (st));
-					fd = fileno (this->ctx->specfp);
-					ret = fstat (fd, &st);
-					if (ret != 0) {
-						gf_log (this->name,
-							GF_LOG_ERROR,
-							"fstat on fd (%d) failed (%s)", fd, strerror (errno));
-						fuse_reply_err (req, ENODATA);
-					}
-					
-					priv->volfile_size = st.st_size;
-					file = priv->volfile = CALLOC (1, priv->volfile_size);
-					ret = lseek (fd, 0, SEEK_SET);
-					while ((ret = read (fd, file, GF_UNIT_KB)) > 0) {
-						file += ret;
-					}
-				}
-				
-				if (priv->volfile_size > GLUSTERFS_XATTR_LEN_MAX) {
-					fuse_reply_err (req, E2BIG);
-				} else if (state->size) {
-					/* if callback for getxattr and asks for value */
-					fuse_reply_buf (req, priv->volfile, priv->volfile_size);
-				} else {
-					/* if callback for getxattr and asks for value length only */
-					fuse_reply_xattr (req, priv->volfile_size);
-				} /* if(ret >...)...else if...else */
-			} else if (!strcmp (state->name, "user.glusterfs-booster-path")) {
-				if (state->size) {
-					fuse_reply_buf (req, state->loc.path, strlen (state->loc.path) + 1);
-				} else {
-					fuse_reply_xattr (req, strlen (state->loc.path) + 1);
-				}
-			} else {
-				fuse_reply_err (req, ENODATA);
-			} /* if(value_data)...else */
-		} else {
-			/* if callback for listxattr */
-                        int32_t len = 0;
-                        data_pair_t *trav = dict->members_list;
-                        while (trav) {
-                                len += strlen (trav->key) + 1;
-                                trav = trav->next;
-                        } /* while(trav) */
-                        value = alloca (len + 1);
-                        ERR_ABORT (value);
-                        len = 0;
-                        trav = dict->members_list;
-                        while (trav) {
-                                strcpy (value + len, trav->key);
-                                value[len + strlen(trav->key)] = '\0';
-                                len += strlen (trav->key) + 1;
-                                trav = trav->next;
-                        } /* while(trav) */
-                        if (state->size) {
-                                /* if callback for listxattr and asks for list of keys */
-                                fuse_reply_buf (req, value, len);
-                        } else {
-                                /* if callback for listxattr and asks for length of keys only */
-                                fuse_reply_xattr (req, len);
-                        } /* if(state->size)...else */
-		} /* if(state->name)...else */
+    state = frame->root->state;
+    struct fuse_lk_out flo = {
+        {
+            0,
+        },
+    };
+
+    fuse_log_eh_fop(this, state, frame, op_ret, op_errno);
+
+    if (op_ret == 0) {
+        gf_log("glusterfs-fuse", GF_LOG_TRACE, "%" PRIu64 ": ERR => 0",
+               frame->root->unique);
+        flo.lk.type = lock->l_type;
+        flo.lk.pid = lock->l_pid;
+        if (lock->l_type == F_UNLCK)
+            flo.lk.start = flo.lk.end = 0;
+        else {
+            flo.lk.start = lock->l_start;
+            flo.lk.end = lock->l_len ? (lock->l_start + lock->l_len - 1)
+                                     : OFFSET_MAX;
+        }
+        send_fuse_obj(this, state->finh, &flo);
+    } else {
+        if (op_errno == ENOSYS) {
+            gf_fuse_lk_enosys_log++;
+            if (!(gf_fuse_lk_enosys_log % GF_UNIVERSAL_ANSWER)) {
+                gf_log("glusterfs-fuse", GF_LOG_ERROR,
+                       "GETLK not supported. loading "
+                       "'features/posix-locks' on server side "
+                       "will add GETLK support.");
+            }
         } else {
-                /* if failure - no need to check if listxattr or getxattr */
-                if (op_errno != ENODATA) {
-                        if (op_errno == ENOTSUP) 
-                        {
-                                gf_fuse_xattr_enotsup_log++;
-                                if (!(gf_fuse_xattr_enotsup_log % GF_UNIVERSAL_ANSWER))
-                                        gf_log ("glusterfs-fuse", GF_LOG_ERROR,
-                                                "[ ERROR ] Extended attribute not supported by the backend storage");
-                        } 
-                        else 
-                        {
-                                gf_log ("glusterfs-fuse", GF_LOG_ERROR,
-                                        "%"PRId64": %s() %s => -1 (%s)",
-					frame->root->unique,
-                                        gf_fop_list[frame->root->op],
-					state->loc.path, strerror(op_errno));
-                        }
-                } else {
-                        gf_log ("glusterfs-fuse", GF_LOG_DEBUG,
-                                "%"PRId64": %s() %s => -1 (%s)",
-				frame->root->unique,
-                                gf_fop_list[frame->root->op], state->loc.path,
-				strerror(op_errno));
-                } /* if(op_errno!= ENODATA)...else */
+            gf_log("glusterfs-fuse", GF_LOG_WARNING,
+                   "%" PRIu64 ": ERR => -1 (%s)", frame->root->unique,
+                   strerror(op_errno));
+        }
+        send_fuse_err(this, state->finh, op_errno);
+    }
 
-                fuse_reply_err (req, op_errno);
-        } /* if(op_ret>=0)...else */
+    free_fuse_state(state);
+    STACK_DESTROY(frame->root);
 
-	if (need_to_free_dict)
-		dict_unref (dict);
+    return 0;
+}
 
-        free_state (state);
-        STACK_DESTROY (frame->root);
+void
+fuse_getlk_resume(fuse_state_t *state)
+{
+    gf_log("glusterfs-fuse", GF_LOG_TRACE, "%" PRIu64 ": GETLK %p",
+           state->finh->unique, state->fd);
 
-        return 0;
+    FUSE_FOP(state, fuse_getlk_cbk, GF_FOP_LK, lk, state->fd, F_GETLK,
+             &state->lk_lock, state->xdata);
 }
 
-
 static void
-fuse_getxattr (fuse_req_t req,
-               fuse_ino_t ino,
-               const char *name,
-               size_t size)
-{
-        fuse_state_t *state;
-	int32_t ret = -1;
-
-#ifdef DISABLE_POSIX_ACL
-	if (!strncmp (name, "system.", 7)) {
-		fuse_reply_err (req, ENODATA);
-		return;
-	}
-#endif
+fuse_getlk(xlator_t *this, fuse_in_header_t *finh, void *msg,
+           struct iobuf *iobuf)
+{
+    struct fuse_lk_in *fli = msg;
+
+    fuse_state_t *state = NULL;
+    fd_t *fd = NULL;
+
+    fd = FH_TO_FD(fli->fh);
+    GET_STATE(this, finh, state);
+    state->fd = fd;
 
-        state = state_from_req (req);
-        state->size = size;
-        state->name = strdup (name);
+    fuse_resolve_fd_init(state, &state->resolve, fd);
 
-        ret = fuse_loc_fill (&state->loc, state, ino, 0, NULL);
-        if ((state->loc.inode == NULL) ||
-	    (ret < 0)) {
-                gf_log ("glusterfs-fuse", GF_LOG_ERROR,
-                        "%"PRId64": GETXATTR %s/%"PRId64" (%s) (fuse_loc_fill() failed)", 
-                        req_callid (req), state->loc.path, (int64_t)ino, name);
+    convert_fuse_file_lock(&fli->lk, &state->lk_lock, fli->owner);
 
-                fuse_reply_err (req, EINVAL);
-		free_state (state);
-                return;
+    state->lk_owner = fli->owner;
+
+    fuse_resolve_and_resume(state, fuse_getlk_resume);
+
+    return;
+}
+
+/*
+ * Callback for the lk fop wound by fuse_setlk_resume (SETLK / SETLKW).
+ *
+ * The interrupt record of the request is settled first through
+ * fuse_interrupt_finish_fop(); whatever that call hands back through its
+ * out-parameter (presumably the state clone attached to the record in
+ * fuse_setlk_resume - confirm against fuse_interrupt_finish_fop) is
+ * disposed of here.  If the call returns non-zero the callback stops
+ * without replying.  Otherwise the outcome is sent to the kernel: on
+ * success the lock is merged into the fd's lock list and 0 is returned to
+ * the kernel; on failure op_errno is returned, with ENOSYS and EAGAIN
+ * getting their own log messages.  Finally the request state and the call
+ * stack are released.  Always returns 0.
+ */
+static int
+fuse_setlk_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+               int32_t op_ret, int32_t op_errno, struct gf_flock *lock,
+               dict_t *xdata)
+{
+    uint32_t op = 0;
+    fuse_state_t *state = NULL;
+    int ret = 0;
+
+    ret = fuse_interrupt_finish_fop(frame, this, _gf_true, (void **)&state);
+    /*
+     * The out-parameter starts out NULL and nothing here guarantees it is
+     * filled in; fuse_setlk_interrupt_handler guards the equivalent
+     * hand-back the same way, so check before dereferencing.
+     */
+    if (state) {
+        GF_FREE(state->name);
+        dict_unref(state->xdata);
+        GF_FREE(state);
+    }
+    if (ret) {
+        return 0;
+    }
+
+    /* from here on work with the state of the original request */
+    state = frame->root->state;
+    op = state->finh->opcode;
+
+    fuse_log_eh_fop(this, state, frame, op_ret, op_errno);
+
+    if (op_ret == 0) {
+        gf_log("glusterfs-fuse", GF_LOG_TRACE, "%" PRIu64 ": ERR => 0",
+               frame->root->unique);
+        fd_lk_insert_and_merge(state->fd,
+                               (op == FUSE_SETLK) ? F_SETLK : F_SETLKW,
+                               &state->lk_lock);
+
+        send_fuse_err(this, state->finh, 0);
+    } else {
+        if (op_errno == ENOSYS) {
+            /* rate-limited: logged once per GF_UNIVERSAL_ANSWER hits */
+            gf_fuse_lk_enosys_log++;
+            if (!(gf_fuse_lk_enosys_log % GF_UNIVERSAL_ANSWER)) {
+                gf_log("glusterfs-fuse", GF_LOG_ERROR,
+                       "SETLK not supported. loading "
+                       "'features/posix-locks' on server side "
+                       "will add SETLK support.");
+            }
+        } else if (op_errno == EAGAIN) {
+            gf_log("glusterfs-fuse", GF_LOG_DEBUG,
+                   "Returning EAGAIN Flock: "
+                   "start=%llu, len=%llu, pid=%llu, lk-owner=%s",
+                   (unsigned long long)state->lk_lock.l_start,
+                   (unsigned long long)state->lk_lock.l_len,
+                   (unsigned long long)state->lk_lock.l_pid,
+                   lkowner_utoa(&frame->root->lk_owner));
+        } else {
+            gf_log("glusterfs-fuse", GF_LOG_WARNING,
+                   "%" PRIu64 ": ERR => -1 (%s)", frame->root->unique,
+                   strerror(op_errno));
         }
 
-        gf_log ("glusterfs-fuse", GF_LOG_DEBUG,
-                "%"PRId64": GETXATTR %s/%"PRId64" (%s)", req_callid (req),
-                state->loc.path, (int64_t)ino, name);
+        send_fuse_err(this, state->finh, op_errno);
+    }
 
-        FUSE_FOP (state, fuse_xattr_cbk, GF_FOP_GETXATTR,
-                  getxattr, &state->loc, name);
+    free_fuse_state(state);
+    STACK_DESTROY(frame->root);
 
-        return;
+    return 0;
 }
 
+/*
+ * Callback of the fgetxattr wound by fuse_setlk_interrupt_handler.
+ *
+ * The interrupt record travels in @cookie.  A non-negative op_ret marks the
+ * interrupt as handled, anything else as squelched; the record is then
+ * finished and the call stack destroyed.  Always returns 0.
+ */
+static int
+fuse_setlk_interrupt_handler_cbk(call_frame_t *frame, void *cookie,
+                                 xlator_t *this, int32_t op_ret,
+                                 int32_t op_errno, dict_t *dict, dict_t *xdata)
+{
+    fuse_interrupt_record_t *record = cookie;
+    fuse_interrupt_state_t outcome = INTERRUPT_SQUELCHED;
+
+    if (op_ret >= 0)
+        outcome = INTERRUPT_HANDLED;
+
+    fuse_interrupt_finish_interrupt(this, record, outcome, _gf_true, NULL);
+    STACK_DESTROY(frame->root);
+
+    return 0;
+}
 
 static void
-fuse_listxattr (fuse_req_t req,
-                fuse_ino_t ino,
-                size_t size)
+fuse_setlk_interrupt_handler(xlator_t *this, fuse_interrupt_record_t *fir)
 {
-        fuse_state_t *state;
-	int32_t ret = -1;
-
-        state = state_from_req (req);
-        state->size = size;
-        ret = fuse_loc_fill (&state->loc, state, ino, 0, NULL);
-        if ((state->loc.inode == NULL) ||
-	    (ret < 0)) {
-                gf_log ("glusterfs-fuse", GF_LOG_ERROR,
-                        "%"PRId64": LISTXATTR %s/%"PRId64" (fuse_loc_fill() failed)", 
-                        req_callid (req), state->loc.path, (int64_t)ino);
+    fuse_state_t *state = NULL;
+    call_frame_t *frame = NULL;
+    char *xattr_name = NULL;
+    int ret = 0;
+
+    gf_log("glusterfs-fuse", GF_LOG_DEBUG,
+           "SETLK%s unique %" PRIu64 ": interrupt handler triggered",
+           fir->fuse_in_header.opcode == FUSE_SETLK ? "" : "W",
+           fir->fuse_in_header.unique);
+
+    state = fir->data;
+
+    ret = gf_asprintf(
+        &xattr_name, GF_XATTR_CLRLK_CMD ".tposix.kblocked.%hd,%jd-%jd",
+        state->lk_lock.l_whence, state->lk_lock.l_start, state->lk_lock.l_len);
+    if (ret == -1) {
+        xattr_name = NULL;
+        goto err;
+    }
+
+    frame = get_call_frame_for_req(state);
+    if (!frame) {
+        goto err;
+    }
+    frame->root->state = state;
+    frame->root->op = GF_FOP_GETXATTR;
+    frame->op = GF_FOP_GETXATTR;
+    state->name = xattr_name;
+
+    STACK_WIND_COOKIE(frame, fuse_setlk_interrupt_handler_cbk, fir,
+                      state->active_subvol,
+                      state->active_subvol->fops->fgetxattr, state->fd,
+                      xattr_name, state->xdata);
+
+    return;
+
+err:
+    GF_FREE(xattr_name);
+    fuse_interrupt_finish_interrupt(this, fir, INTERRUPT_SQUELCHED, _gf_false,
+                                    (void **)&state);
+    if (state) {
+        dict_unref(state->xdata);
+        GF_FREE(state);
+    }
+}
 
-                fuse_reply_err (req, EINVAL);
-		free_state (state);
-                return;
+/*
+ * Resume handler for SETLK/SETLKW, run after fd resolution.
+ *
+ * Before the lk fop is wound, an interrupt record with
+ * fuse_setlk_interrupt_handler as its handler is created and inserted.  The
+ * record carries a shallow copy of the request state that gets its own
+ * fresh xdata dict and a NULL name.  If any of these allocations fails the
+ * request is answered with ENOMEM and the state is released.
+ */
+void
+fuse_setlk_resume(fuse_state_t *state)
+{
+    fuse_interrupt_record_t *fir = NULL;
+    fuse_state_t *state_clone = NULL;
+
+    fir = fuse_interrupt_record_new(state->finh, fuse_setlk_interrupt_handler);
+    state_clone = gf_memdup(state, sizeof(*state));
+    if (state_clone) {
+        /* replaces the xdata pointer copied from the original state */
+        state_clone->xdata = dict_new();
+    }
+
+    if (!fir || !state_clone || !state_clone->xdata) {
+        if (fir) {
+            GF_FREE(fir);
         }
+        if (state_clone) {
+            /*
+             * dict_new() can have succeeded while the record allocation
+             * failed; drop that reference before freeing the clone so the
+             * dict is not leaked.
+             */
+            if (state_clone->xdata) {
+                dict_unref(state_clone->xdata);
+            }
+            GF_FREE(state_clone);
+        }
+        send_fuse_err(state->this, state->finh, ENOMEM);
 
-        gf_log ("glusterfs-fuse", GF_LOG_DEBUG,
-                "%"PRId64": LISTXATTR %s/%"PRId64, req_callid (req),
-                state->loc.path, (int64_t)ino);
-
-        FUSE_FOP (state, fuse_xattr_cbk, GF_FOP_GETXATTR,
-                  getxattr, &state->loc, NULL);
+        gf_log("glusterfs-fuse", GF_LOG_ERROR,
+               "SETLK%s unique %" PRIu64
+               ":"
+               " interrupt record allocation failed",
+               state->finh->opcode == FUSE_SETLK ? "" : "W",
+               state->finh->unique);
+        free_fuse_state(state);
 
         return;
+    }
+    /* the clone must not share the original's name pointer */
+    state_clone->name = NULL;
+    fir->data = state_clone;
+    fuse_interrupt_record_insert(state->this, fir);
+
+    gf_log("glusterfs-fuse", GF_LOG_TRACE, "%" PRIu64 ": SETLK%s %p",
+           state->finh->unique, state->finh->opcode == FUSE_SETLK ? "" : "W",
+           state->fd);
+
+    FUSE_FOP(state, fuse_setlk_cbk, GF_FOP_LK, lk, state->fd,
+             state->finh->opcode == FUSE_SETLK ? F_SETLK : F_SETLKW,
+             &state->lk_lock, state->xdata);
 }
 
-
 static void
-fuse_removexattr (fuse_req_t req,
-                  fuse_ino_t ino,
-                  const char *name)
+fuse_setlk(xlator_t *this, fuse_in_header_t *finh, void *msg,
+           struct iobuf *iobuf)
+{
+    struct fuse_lk_in *fli = msg;
+
+    fuse_state_t *state = NULL;
+    fd_t *fd = NULL;
+
+    fd = FH_TO_FD(fli->fh);
+    GET_STATE(this, finh, state);
+    state->finh = finh;
+    state->fd = fd;
+
+    fuse_resolve_fd_init(state, &state->resolve, fd);
+
+    convert_fuse_file_lock(&fli->lk, &state->lk_lock, fli->owner);
+
+    state->lk_owner = fli->owner;
+
+    fuse_resolve_and_resume(state, fuse_setlk_resume);
 
+    return;
+}
+
+#if FUSE_KERNEL_MINOR_VERSION >= 11 && defined(HAVE_FUSE_NOTIFICATIONS)
+static void *
+notify_kernel_loop(void *data)
 {
-        fuse_state_t *state;
-	int32_t ret = -1;
+    uint32_t len = 0;
+    ssize_t rv = 0;
+    xlator_t *this = NULL;
+    fuse_private_t *priv = NULL;
+    fuse_invalidate_node_t *node = NULL;
+    fuse_invalidate_node_t *tmp = NULL;
+    struct fuse_out_header *pfoh = NULL;
+    struct iovec iov_out = {
+        0,
+    };
+
+    this = data;
+    priv = this->private;
+
+    for (;;) {
+        pthread_mutex_lock(&priv->invalidate_mutex);
+        {
+            while (list_empty(&priv->invalidate_list))
+                pthread_cond_wait(&priv->invalidate_cond,
+                                  &priv->invalidate_mutex);
 
-        state = state_from_req (req);
-        ret = fuse_loc_fill (&state->loc, state, ino, 0, NULL);
-        if ((state->loc.inode == NULL) ||
-	    (ret < 0)) {
-                gf_log ("glusterfs-fuse", GF_LOG_ERROR,
-                        "%"PRId64": REMOVEXATTR %s/%"PRId64" (%s) (fuse_loc_fill() failed)",
-                        req_callid (req), state->loc.path, (int64_t)ino, name);
+            node = list_entry(priv->invalidate_list.next,
+                              fuse_invalidate_node_t, next);
 
-                fuse_reply_err (req, EINVAL);
-		free_state (state);
-                return;
+            list_del_init(&node->next);
+            priv->invalidate_count--;
         }
+        pthread_mutex_unlock(&priv->invalidate_mutex);
+
+        pfoh = (struct fuse_out_header *)node->inval_buf;
+        memcpy(&len, &pfoh->len, sizeof(len));
+        /*
+         * a simple
+         *         len = pfoh->len;
+         * works on x86, but takes a multiple insn cycle hit
+         * when pfoh->len is not correctly aligned, possibly
+         * even stalling the insn pipeline.
+         * Other architectures will not be so forgiving. If
+         * we're lucky the memcpy will be inlined by the
+         * compiler, and might be as fast or faster without
+         * the risk of stalling the insn pipeline.
+         */
 
-        gf_log ("glusterfs-fuse", GF_LOG_DEBUG,
-                "%"PRId64": REMOVEXATTR %s/%"PRId64" (%s)", req_callid (req),
-                state->loc.path, (int64_t)ino, name);
+        iov_out.iov_base = node->inval_buf;
+        iov_out.iov_len = len;
+        rv = sys_writev(priv->fd, &iov_out, 1);
+        check_and_dump_fuse_W(priv, &iov_out, 1, rv, node->errnomask);
 
-        FUSE_FOP (state, fuse_err_cbk, GF_FOP_REMOVEXATTR,
-                  removexattr, &state->loc, name);
+        GF_FREE(node);
 
-        return;
+        if (rv == -1 && errno == EBADF)
+            break;
+
+        if (rv != len && !(rv == -1 && errno == ENOENT)) {
+            gf_log("glusterfs-fuse", GF_LOG_INFO, "len: %u, rv: %zd, errno: %d",
+                   len, rv, errno);
+        }
+    }
+
+    gf_log("glusterfs-fuse", GF_LOG_ERROR, "kernel notifier loop terminated");
+
+    pthread_mutex_lock(&priv->invalidate_mutex);
+    {
+        priv->reverse_fuse_thread_started = _gf_false;
+        list_for_each_entry_safe(node, tmp, &priv->invalidate_list, next)
+        {
+            list_del_init(&node->next);
+            GF_FREE(node);
+        }
+        priv->invalidate_count = 0;
+    }
+    pthread_mutex_unlock(&priv->invalidate_mutex);
+
+    return NULL;
 }
+#endif
 
+/*
+ * Thread body that delivers queued, time-scheduled FUSE replies.
+ *
+ * data is the fuse xlator (xlator_t *). The loop blocks on priv->timed_cond
+ * until priv->timed_list is non-empty, takes the entry with the smallest
+ * scheduled_ts off the list, sleeps until that time if it has not been
+ * reached yet, and writes the message (header + body) to priv->fd.
+ * The loop ends when the write fails with EBADF; the messages still queued
+ * at that point are freed and timed_response_fuse_thread_started is cleared.
+ *
+ * Always returns NULL.
+ */
+static void *
+timed_response_loop(void *data)
+{
+    ssize_t rv = 0;
+    size_t len = 0;
+    xlator_t *this = NULL;
+    fuse_private_t *priv = NULL;
+    fuse_timed_message_t *dmsg = NULL;
+    fuse_timed_message_t *tmp = NULL;
+    struct timespec now = {
+        0,
+    };
+    struct timespec delta = {
+        0,
+    };
+    struct iovec iovs[2] = {
+        {
+            0,
+        },
+    };
 
-static int gf_fuse_lk_enosys_log;
+    this = data;
+    priv = this->private;
 
-static int
-fuse_getlk_cbk (call_frame_t *frame,
-                void *cookie,
-                xlator_t *this,
-                int32_t op_ret,
-                int32_t op_errno,
-                struct flock *lock)
-{
-        fuse_state_t *state = frame->root->state;
-
-        if (op_ret == 0) {
-                gf_log ("glusterfs-fuse", GF_LOG_DEBUG,
-                        "%"PRId64": ERR => 0", frame->root->unique);
-                fuse_reply_lock (state->req, lock);
-        } else {
-                if (op_errno == ENOSYS) {
-                        gf_fuse_lk_enosys_log++;
-                        if (!(gf_fuse_lk_enosys_log % GF_UNIVERSAL_ANSWER)) {
-				gf_log ("glusterfs-fuse", GF_LOG_ERROR,
-					"[ ERROR ] loading 'features/posix-locks' on server side may help your application");
-                        }
-                } else {
-                        gf_log ("glusterfs-fuse", GF_LOG_ERROR,
-                                "%"PRId64": ERR => -1 (%s)",
-				frame->root->unique, strerror (op_errno));
+    for (;;) {
+        pthread_mutex_lock(&priv->timed_mutex);
+        {
+            /* block until the list holds at least one message */
+            while (list_empty(&priv->timed_list)) {
+                pthread_cond_wait(&priv->timed_cond, &priv->timed_mutex);
+            }
+
+            /* scan the whole list for the entry with the earliest
+             * scheduled_ts, starting with the head as the candidate */
+            dmsg = list_entry(priv->timed_list.next, fuse_timed_message_t,
+                              next);
+            list_for_each_entry(tmp, &priv->timed_list, next)
+            {
+                if (timespec_cmp(&tmp->scheduled_ts, &dmsg->scheduled_ts) < 0) {
+                    dmsg = tmp;
                 }
-                fuse_reply_err (state->req, op_errno);
+            }
+
+            /* detach the chosen message while still holding the mutex */
+            list_del_init(&dmsg->next);
         }
-        
-        free_state (state);
-        STACK_DESTROY (frame->root);
+        pthread_mutex_unlock(&priv->timed_mutex);
 
-        return 0;
+        /* wait out the remaining time if the message is not due yet;
+         * presumably timespec_sub() stores (scheduled_ts - now) in delta
+         * -- TODO confirm the argument order against its definition */
+        timespec_now(&now);
+        if (timespec_cmp(&now, &dmsg->scheduled_ts) < 0) {
+            timespec_sub(&now, &dmsg->scheduled_ts, &delta);
+            nanosleep(&delta, NULL);
+        }
+
+        gf_log("glusterfs-fuse", GF_LOG_TRACE,
+               "sending timed "
+               "message of unique %" PRIu64,
+               dmsg->fuse_out_header.unique);
+
+        /* fuse_out_header.len is used as the total length: the body iovec
+         * gets whatever remains after the header */
+        len = dmsg->fuse_out_header.len;
+        iovs[0] = (struct iovec){&dmsg->fuse_out_header,
+                                 sizeof(struct fuse_out_header)};
+        iovs[1] = (struct iovec){dmsg->fuse_message_body,
+                                 len - sizeof(struct fuse_out_header)};
+        rv = sys_writev(priv->fd, iovs, 2);
+        check_and_dump_fuse_W(priv, iovs, 2, rv, dmsg->errnomask);
+
+        fuse_timed_message_free(dmsg);
+
+        /* NOTE(review): errno is inspected after check_and_dump_fuse_W()
+         * and fuse_timed_message_free() have run -- confirm that neither
+         * of them can overwrite the errno left by sys_writev() */
+        if (rv == -1 && errno == EBADF) {
+            break;
+        }
+
+        /* a short or failed write is only logged; ENOENT is ignored */
+        if (rv != len && !(rv == -1 && errno == ENOENT)) {
+            gf_log("glusterfs-fuse", GF_LOG_INFO,
+                   "len: %zu, rv: %zd, errno: %d", len, rv, errno);
+        }
+    }
+
+    gf_log("glusterfs-fuse", GF_LOG_ERROR, "timed response loop terminated");
+
+    /* the loop is over: drop every message that is still queued */
+    pthread_mutex_lock(&priv->timed_mutex);
+    {
+        priv->timed_response_fuse_thread_started = _gf_false;
+        list_for_each_entry_safe(dmsg, tmp, &priv->timed_list, next)
+        {
+            list_del_init(&dmsg->next);
+            fuse_timed_message_free(dmsg);
+        }
+    }
+    pthread_mutex_unlock(&priv->timed_mutex);
+
+    return NULL;
 }
 
+/*
+ * Handle the FUSE INIT request: check the protocol version, negotiate the
+ * feature flags, start the helper threads and send the fuse_init_out reply.
+ *
+ * this:  the fuse xlator; this->private holds the fuse_private_t state
+ * finh:  request header; freed here on every path
+ * msg:   request payload, interpreted as struct fuse_init_in
+ * iobuf: not referenced by this handler
+ *
+ * On any failure (repeated INIT, unsupported major version, thread creation
+ * failure, failure to send the reply) priv->fd is closed.
+ */
+static void
+fuse_init(xlator_t *this, fuse_in_header_t *finh, void *msg,
+          struct iobuf *iobuf)
+{
+    struct fuse_init_in *fini = msg;
+    struct fuse_init_out fino = {
+        0,
+    };
+    fuse_private_t *priv = NULL;
+    size_t size = 0;
+    int ret = 0;
+#if FUSE_KERNEL_MINOR_VERSION >= 9
+    pthread_t messenger;
+#endif
+    /* NOTE(review): delayer is only referenced inside the
+     * FUSE_KERNEL_MINOR_VERSION >= 9 section below */
+    pthread_t delayer;
+
+    priv = this->private;
+
+    /* INIT is accepted only once per connection */
+    if (priv->init_recvd) {
+        gf_log("glusterfs-fuse", GF_LOG_ERROR, "got INIT after first message");
+
+        sys_close(priv->fd);
+        goto out;
+    }
+
+    priv->init_recvd = 1;
+
+    if (fini->major != FUSE_KERNEL_VERSION) {
+        gf_log("glusterfs-fuse", GF_LOG_ERROR,
+               "unsupported FUSE protocol version %d.%d", fini->major,
+               fini->minor);
+
+        sys_close(priv->fd);
+        goto out;
+    }
+    priv->proto_minor = fini->minor;
+
+    /* baseline reply; 1 << 17 is 128 KiB */
+    fino.major = FUSE_KERNEL_VERSION;
+    fino.minor = FUSE_KERNEL_MINOR_VERSION;
+    fino.max_readahead = 1 << 17;
+    fino.max_write = 1 << 17;
+    fino.flags = FUSE_ASYNC_READ | FUSE_POSIX_LOCKS;
+    /* each optional feature below needs both compile-time support
+     * (FUSE_KERNEL_MINOR_VERSION) and a new enough kernel (fini->minor) */
+#if FUSE_KERNEL_MINOR_VERSION >= 17
+    if (fini->minor >= 17)
+        fino.flags |= FUSE_FLOCK_LOCKS;
+#endif
+#if FUSE_KERNEL_MINOR_VERSION >= 12
+    if (fini->minor >= 12) {
+        /* let fuse leave the umask processing to us, so that it does not
+         * break extended POSIX ACL defaults on server */
+        fino.flags |= FUSE_DONT_MASK;
+    }
+#endif
+#if FUSE_KERNEL_MINOR_VERSION >= 9
+    if (fini->minor >= 6 /* fuse_init_in has flags */ &&
+        fini->flags & FUSE_BIG_WRITES) {
+        /* no need for direct I/O mode by default if big writes are supported */
+        if (priv->direct_io_mode == 2)
+            priv->direct_io_mode = 0;
+        fino.flags |= FUSE_BIG_WRITES;
+    }
+
+    /* Start the thread processing timed responses */
+    ret = gf_thread_create(&delayer, NULL, timed_response_loop, this,
+                           "fusedlyd");
+    if (ret != 0) {
+        /* NOTE(review): the message uses errno, not ret -- assumes
+         * gf_thread_create() leaves the failure reason in errno; confirm */
+        gf_log("glusterfs-fuse", GF_LOG_ERROR,
+               "failed to start timed response thread (%s)", strerror(errno));
+
+        sys_close(priv->fd);
+        goto out;
+    }
+    priv->timed_response_fuse_thread_started = _gf_true;
+
+    /* Used for 'reverse invalidation of inode' */
+#ifdef HAVE_FUSE_NOTIFICATIONS
+    if (fini->minor >= 12) {
+        ret = gf_thread_create(&messenger, NULL, notify_kernel_loop, this,
+                               "fusenoti");
+        if (ret != 0) {
+            gf_log("glusterfs-fuse", GF_LOG_ERROR,
+                   "failed to start messenger daemon (%s)", strerror(errno));
+
+            sys_close(priv->fd);
+            goto out;
+        }
+        priv->reverse_fuse_thread_started = _gf_true;
+    } else
+#endif
+    /* with HAVE_FUSE_NOTIFICATIONS this block is the else-branch of the
+     * minor >= 12 check above; without it the block runs unconditionally */
+    {
+        /*
+         * FUSE minor < 12 does not implement invalidate notifications.
+         * This mechanism is required for fopen-keep-cache to operate
+         * correctly. Disable and warn the user.
+         */
+        if (priv->fopen_keep_cache) {
+            gf_log("glusterfs-fuse", GF_LOG_WARNING,
+                   "FUSE version "
+                   "%d.%d does not support inval notifications. "
+                   "fopen-keep-cache disabled.",
+                   fini->major, fini->minor);
+            priv->fopen_keep_cache = 0;
+        }
+    }
+
+    if (fini->minor >= 13) {
+        fino.max_background = priv->background_qlen;
+        fino.congestion_threshold = priv->congestion_threshold;
+    }
+    if (fini->minor < 9)
+        *priv->msg0_len_p = sizeof(*finh) + FUSE_COMPAT_WRITE_IN_SIZE;
+
+    /* readdirplus is enabled only if configured and offered by the kernel */
+    if (priv->use_readdirp) {
+        if (fini->flags & FUSE_DO_READDIRPLUS)
+            fino.flags |= FUSE_DO_READDIRPLUS;
+    }
+#endif
+    if (priv->fopen_keep_cache == 2) {
+        /* If user did not explicitly set --fopen-keep-cache[=off],
+           then check if kernel support FUSE_AUTO_INVAL_DATA and ...
+        */
+
+        priv->fopen_keep_cache = 1;
+
+#if FUSE_KERNEL_MINOR_VERSION >= 20
+        if (fini->flags & FUSE_AUTO_INVAL_DATA) {
+            /* ... enable fopen_keep_cache mode if supported.
+             */
+            gf_log("glusterfs-fuse", GF_LOG_DEBUG,
+                   "Detected "
+                   "support for FUSE_AUTO_INVAL_DATA. Enabling "
+                   "fopen_keep_cache automatically.");
+
+            if (priv->fuse_auto_inval)
+                fino.flags |= FUSE_AUTO_INVAL_DATA;
+        } else
+#endif
+        {
+            if (priv->fuse_auto_inval) {
+                gf_log("glusterfs-fuse", GF_LOG_DEBUG,
+                       "No support for FUSE_AUTO_INVAL_DATA. Disabling "
+                       "fopen_keep_cache.");
+                /* ... else disable. */
+                priv->fopen_keep_cache = 0;
+            }
+        }
+    } else if (priv->fopen_keep_cache == 1) {
+        /* If user explicitly set --fopen-keep-cache[=on],
+           then enable FUSE_AUTO_INVAL_DATA if possible.
+        */
+#if FUSE_KERNEL_MINOR_VERSION >= 20
+        if (priv->fuse_auto_inval && (fini->flags & FUSE_AUTO_INVAL_DATA)) {
+            gf_log("glusterfs-fuse", GF_LOG_DEBUG,
+                   "fopen_keep_cache "
+                   "is explicitly set. Enabling FUSE_AUTO_INVAL_DATA");
+            fino.flags |= FUSE_AUTO_INVAL_DATA;
+        } else
+#endif
+        {
+            gf_log("glusterfs-fuse", GF_LOG_WARNING,
+                   "fopen_keep_cache "
+                   "is explicitly set. Support for "
+                   "FUSE_AUTO_INVAL_DATA is missing");
+        }
+    }
+
+#if FUSE_KERNEL_MINOR_VERSION >= 22
+    if (fini->flags & FUSE_ASYNC_DIO)
+        fino.flags |= FUSE_ASYNC_DIO;
+#endif
+
+    size = sizeof(fino);
+#if FUSE_KERNEL_MINOR_VERSION >= 23
+    /* FUSE 7.23 and newer added attributes to the fuse_init_out struct */
+    if (fini->minor < 23) {
+        /* reduce the size, chop off unused attributes from &fino */
+        size = FUSE_COMPAT_22_INIT_OUT_SIZE;
+    }
+
+    /* Writeback cache support */
+    if (fini->minor >= 23) {
+        if (priv->kernel_writeback_cache)
+            fino.flags |= FUSE_WRITEBACK_CACHE;
+        fino.time_gran = priv->attr_times_granularity;
+    }
+#endif
+
+    /* the return value of send_fuse_data() is used as an errno value */
+    ret = send_fuse_data(this, finh, &fino, size);
+    if (ret == 0)
+        gf_log("glusterfs-fuse", GF_LOG_INFO,
+               "FUSE inited with protocol versions:"
+               " glusterfs %d.%d kernel %d.%d",
+               FUSE_KERNEL_VERSION, FUSE_KERNEL_MINOR_VERSION, fini->major,
+               fini->minor);
+    else {
+        gf_log("glusterfs-fuse", GF_LOG_ERROR, "FUSE init failed (%s)",
+               strerror(ret));
+
+        sys_close(priv->fd);
+    }
+
+out:
+    GF_FREE(finh);
+}
 
+/*
+ * Handler for requests that are not implemented: reply with ENOSYS and
+ * free the request header. msg and iobuf are not referenced.
+ */
 static void
-fuse_getlk (fuse_req_t req,
-            fuse_ino_t ino,
-            struct fuse_file_info *fi,
-            struct flock *lock)
+fuse_enosys(xlator_t *this, fuse_in_header_t *finh, void *msg,
+            struct iobuf *iobuf)
 {
-        fuse_state_t *state;
-	fd_t *fd = NULL;
-	
-	fd = FI_TO_FD (fi);
-        state = state_from_req (req);
-        state->req = req;
-	state->fd = fd;
+    send_fuse_err(this, finh, ENOSYS);
 
-        gf_log ("glusterfs-fuse", GF_LOG_DEBUG,
-                "%"PRId64": GETLK %p", req_callid (req), fd);
+    /* the handler owns finh and releases it once the reply is sent */
+    GF_FREE(finh);
+}
 
-        FUSE_FOP (state, fuse_getlk_cbk, GF_FOP_LK,
-                  lk, fd, F_GETLK, lock);
+/*
+ * Handler for the DESTROY request: acknowledge it with error code 0 and
+ * free the request header. msg and iobuf are not referenced.
+ */
+static void
+fuse_destroy(xlator_t *this, fuse_in_header_t *finh, void *msg,
+             struct iobuf *iobuf)
+{
+    send_fuse_err(this, finh, 0);
 
-        return;
+    /* the handler owns finh and releases it once the reply is sent */
+    GF_FREE(finh);
 }
 
+/*
+ * Issue a synchronous lookup of "/" on the currently active subvolume.
+ *
+ * The lookup is sent for the inode that fuse_ino_to_inode(1, this) returns
+ * and carries the gfid 00..01 under the "gfid-req" key.
+ *
+ * Returns 0 on success; a non-zero value if setting "gfid-req" failed or a
+ * negative value if the lookup failed.
+ */
+int
+fuse_first_lookup(xlator_t *this)
+{
+    fuse_private_t *priv = NULL;
+    loc_t loc = {
+        0,
+    };
+    xlator_t *xl = NULL;
+    dict_t *dict = NULL;
+    static uuid_t gfid = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1};
+    int ret = -1;
+    struct iatt iatt = {
+        0,
+    };
+
+    priv = this->private;
+
+    loc.path = "/";
+    loc.name = "";
+    /* NOTE(review): the returned inode is dereferenced without a NULL
+     * check -- presumably inode 1 always resolves; confirm */
+    loc.inode = fuse_ino_to_inode(1, this);
+    gf_uuid_copy(loc.gfid, loc.inode->gfid);
+    loc.parent = NULL;
+
+    /* NOTE(review): the result of dict_new() is not checked here */
+    dict = dict_new();
+
+    xl = priv->active_subvol;
+
+    ret = dict_set_gfuuid(dict, "gfid-req", gfid, true);
+    if (ret) {
+        gf_log(xl->name, GF_LOG_ERROR, "failed to set 'gfid-req'");
+        goto out;
+    }
+
+    ret = syncop_lookup(xl, &loc, &iatt, NULL, dict, NULL);
+    /* presumably DECODE_SYNCOP_ERR() moves the error code into errno,
+     * which the log message below relies on -- TODO confirm */
+    DECODE_SYNCOP_ERR(ret);
+    if (ret < 0) {
+        gf_log(this->name, GF_LOG_ERROR, "first lookup on root failed (%s)",
+               strerror(errno));
+        /* NOTE: Treat it as an error case. */
+        /* goto out; */ /* commented for preventing coverity warning */
+    }
+    /* Remove comment of above goto statement if you are adding any
+       more code here, before 'out' label */
 
-static int
-fuse_setlk_cbk (call_frame_t *frame,
-                void *cookie,
-                xlator_t *this,
-                int32_t op_ret,
-                int32_t op_errno,
-                struct flock *lock)
-{
-        fuse_state_t *state = frame->root->state;
-
-        if (op_ret == 0) {
-                gf_log ("glusterfs-fuse", GF_LOG_DEBUG,
-                        "%"PRId64": ERR => 0", frame->root->unique);
-                fuse_reply_err (state->req, 0);
-        } else {
-                if (op_errno == ENOSYS) {
-                        gf_fuse_lk_enosys_log++;
-                        if (!(gf_fuse_lk_enosys_log % GF_UNIVERSAL_ANSWER)) {
-				gf_log ("glusterfs-fuse", GF_LOG_ERROR,
-                                        "[ ERROR ] loading 'features/posix-locks' on server side may help your application");
-                        }
-                } else  {
-                        gf_log ("glusterfs-fuse",
-				(op_errno == EAGAIN) ? GF_LOG_DEBUG : GF_LOG_ERROR,
-                                "%"PRId64": ERR => -1 (%s)",
-				frame->root->unique, strerror (op_errno));
-                }
+out:
+    /* release the request dict and the inode reference taken above */
+    dict_unref(dict);
+    inode_unref(loc.inode);
+
+    return ret;
+}
 
-                fuse_reply_err (state->req, op_errno);
+/*
+ * Look up an inode on xl by gfid only (no parent, no name).
+ *
+ * this: the fuse xlator, used as the key for the inode context
+ * xl:   subvolume to send the lookup to
+ * gfid: gfid to look up; copied into loc->gfid
+ * loc:  in/out; if loc->inode is NULL a new inode from xl->itable is
+ *       attached. On success loc->inode is replaced by the inode returned
+ *       by inode_link().
+ *
+ * Returns 0 on success, -EINVAL for NULL loc/xl, -ENOMEM on allocation
+ * failure, or the negative value returned by syncop_lookup().
+ */
+int
+fuse_nameless_lookup(xlator_t *this, xlator_t *xl, uuid_t gfid, loc_t *loc)
+{
+    int ret = -1;
+    dict_t *xattr_req = NULL;
+    struct iatt iatt = {
+        0,
+    };
+    inode_t *linked_inode = NULL;
+    uint64_t ctx_value = LOOKUP_NOT_NEEDED;
+
+    if ((loc == NULL) || (xl == NULL)) {
+        ret = -EINVAL;
+        goto out;
+    }
+
+    if (loc->inode == NULL) {
+        loc->inode = inode_new(xl->itable);
+        if (loc->inode == NULL) {
+            ret = -ENOMEM;
+            goto out;
         }
-        
-        free_state (state);
-        STACK_DESTROY (frame->root);
+    }
+
+    gf_uuid_copy(loc->gfid, gfid);
+
+    xattr_req = dict_new();
+    if (xattr_req == NULL) {
+        ret = -ENOMEM;
+        goto out;
+    }
+
+    ret = syncop_lookup(xl, loc, &iatt, NULL, xattr_req, NULL);
+    if (ret < 0)
+        goto out;
 
+    /* mark the inode LOOKUP_NOT_NEEDED only when inode_link() handed back
+     * the very inode that was passed in */
+    linked_inode = inode_link(loc->inode, NULL, NULL, &iatt);
+    if (linked_inode == loc->inode)
+        inode_ctx_set(linked_inode, this, &ctx_value);
+
+    /* swap the reference held by loc for the linked inode */
+    inode_unref(loc->inode);
+    loc->inode = linked_inode;
+
+    ret = 0;
+out:
+    if (xattr_req != NULL) {
+        dict_unref(xattr_req);
+    }
+
+    return ret;
+}
+
+/*
+ * Open a counterpart of basefd on new_subvol and make it basefd's active fd.
+ *
+ * The inode is resolved in new_subvol's inode table by gfid (falling back
+ * to a nameless lookup), a fresh fd sharing oldfd's lock context is created
+ * on it and opened with basefd's flags minus O_CREAT/O_EXCL/O_TRUNC (or
+ * opendir'ed for directories). On success the new fd replaces
+ * basefd_ctx->activefd and the previous active fd, if any, is unref'ed.
+ *
+ * this:       the fuse xlator
+ * basefd:     fd handed out to the kernel; its fuse ctx tracks the active fd
+ * oldfd:      fd currently in use on old_subvol; donor of the lock context
+ * old_subvol: subvolume being migrated away from (used for logging only)
+ * new_subvol: subvolume to open the new fd on
+ *
+ * Returns 0 on success and a negative value on failure. On failure the
+ * partially set up new fd is released.
+ */
+int
+fuse_migrate_fd_open(xlator_t *this, fd_t *basefd, fd_t *oldfd,
+                     xlator_t *old_subvol, xlator_t *new_subvol)
+{
+    loc_t loc = {
+        0,
+    };
+    fd_t *newfd = NULL, *old_activefd = NULL;
+    fuse_fd_ctx_t *basefd_ctx = NULL;
+    fuse_fd_ctx_t *newfd_ctx = NULL;
+    int ret = 0, flags = 0;
+
+    ret = inode_path(basefd->inode, NULL, (char **)&loc.path);
+    if (ret < 0) {
+        gf_log("glusterfs-fuse", GF_LOG_WARNING,
+               "cannot construct path of gfid (%s) failed"
+               "(old-subvolume:%s-%d new-subvolume:%s-%d)",
+               uuid_utoa(basefd->inode->gfid), old_subvol->name,
+               old_subvol->graph->id, new_subvol->name, new_subvol->graph->id);
+        goto out;
+    }
+
+    gf_uuid_copy(loc.gfid, basefd->inode->gfid);
+
+    loc.inode = inode_find(new_subvol->itable, basefd->inode->gfid);
+
+    if (loc.inode == NULL) {
+        ret = fuse_nameless_lookup(this, new_subvol, basefd->inode->gfid, &loc);
+        if (ret < 0) {
+            gf_log("glusterfs-fuse", GF_LOG_WARNING,
+                   "name-less lookup of gfid (%s) failed (%s)"
+                   "(old-subvolume:%s-%d new-subvolume:%s-%d)",
+                   uuid_utoa(basefd->inode->gfid), strerror(-ret),
+                   old_subvol->name, old_subvol->graph->id, new_subvol->name,
+                   new_subvol->graph->id);
+            ret = -1;
+            goto out;
+        }
+    }
+
+    /* ret is non-negative here (result of the path/lookup calls above);
+     * reset it so that the validation bail-outs below report a failure
+     * instead of a stale success value */
+    ret = -1;
+
+    basefd_ctx = fuse_fd_ctx_get(this, basefd);
+    GF_VALIDATE_OR_GOTO("glusterfs-fuse", basefd_ctx, out);
+
+    newfd = fd_create(loc.inode, basefd->pid);
+    if (newfd == NULL) {
+        gf_log("glusterfs-fuse", GF_LOG_WARNING,
+               "cannot create new fd, hence not migrating basefd "
+               "(ptr:%p inode-gfid:%s) "
+               "(old-subvolume:%s-%d new-subvolume:%s-%d)",
+               basefd, uuid_utoa(loc.inode->gfid), old_subvol->name,
+               old_subvol->graph->id, new_subvol->name, new_subvol->graph->id);
+        ret = -1;
+        goto out;
+    }
+
+    /* the new fd shares oldfd's lock context instead of its own */
+    newfd->flags = basefd->flags;
+    if (newfd->lk_ctx)
+        fd_lk_ctx_unref(newfd->lk_ctx);
+
+    newfd->lk_ctx = fd_lk_ctx_ref(oldfd->lk_ctx);
+
+    newfd_ctx = fuse_fd_ctx_check_n_create(this, newfd);
+    GF_VALIDATE_OR_GOTO("glusterfs-fuse", newfd_ctx, out);
+
+    if (IA_ISDIR(basefd->inode->ia_type)) {
+        ret = syncop_opendir(new_subvol, &loc, newfd, NULL, NULL);
+    } else {
+        /* creation-time flags must not be replayed on the re-open */
+        flags = basefd->flags & ~(O_CREAT | O_EXCL | O_TRUNC);
+        ret = syncop_open(new_subvol, &loc, flags, newfd, NULL, NULL);
+    }
+
+    if (ret < 0) {
+        gf_log("glusterfs-fuse", GF_LOG_WARNING,
+               "open on basefd (ptr:%p inode-gfid:%s) failed (%s)"
+               "(old-subvolume:%s-%d new-subvolume:%s-%d)",
+               basefd, uuid_utoa(basefd->inode->gfid), strerror(-ret),
+               old_subvol->name, old_subvol->graph->id, new_subvol->name,
+               new_subvol->graph->id);
+        ret = -1;
+        goto out;
+    }
+
+    fd_bind(newfd);
+
+    /* publish the new fd; the reference from fd_create() is handed over
+     * to basefd_ctx->activefd */
+    LOCK(&basefd->lock);
+    {
+        if (basefd_ctx->activefd != NULL) {
+            old_activefd = basefd_ctx->activefd;
+        }
+
+        basefd_ctx->activefd = newfd;
+    }
+    UNLOCK(&basefd->lock);
+
+    if (old_activefd != NULL) {
+        fd_unref(old_activefd);
+    }
+
+    gf_log("glusterfs-fuse", GF_LOG_INFO,
+           "migrated basefd (%p) to newfd (%p) (inode-gfid:%s)"
+           "(old-subvolume:%s-%d new-subvolume:%s-%d)",
+           basefd, newfd, uuid_utoa(basefd->inode->gfid), old_subvol->name,
+           old_subvol->graph->id, new_subvol->name, new_subvol->graph->id);
+
+    ret = 0;
+
+out:
+    /* on failure nobody took over the reference from fd_create() */
+    if ((ret < 0) && (newfd != NULL)) {
+        fd_unref(newfd);
+    }
+
+    loc_wipe(&loc);
+
+    return ret;
+}
+
+/*
+ * Carry the lock state recorded for oldfd on old_subvol over to the fd that
+ * is currently active for basefd.
+ *
+ * The lock information is read from old_subvol with an fgetxattr on
+ * GF_XATTR_LOCKINFO_KEY and written to new_subvol with an fsetxattr on the
+ * active fd. Nothing is done when oldfd has no (or an empty) lock context.
+ *
+ * Returns 0 when there was nothing to migrate, a non-negative value when
+ * the fsetxattr succeeded and -1 on failure.
+ */
+int
+fuse_migrate_locks(xlator_t *this, fd_t *basefd, fd_t *oldfd,
+                   xlator_t *old_subvol, xlator_t *new_subvol)
+{
+    fuse_fd_ctx_t *basefd_ctx = NULL;
+    fd_t *activefd = NULL;
+    dict_t *lockinfo = NULL;
+    void *lockinfo_val = NULL;
+    int ret = -1;
+
+    /* no lock context, or an empty one: nothing to carry over */
+    if (!oldfd->lk_ctx || fd_lk_ctx_empty(oldfd->lk_ctx))
         return 0;
+
+    basefd_ctx = fuse_fd_ctx_get(this, basefd);
+    GF_VALIDATE_OR_GOTO("glusterfs-fuse", basefd_ctx, done);
+
+    /* pin the fd that is active right now for the rest of the function */
+    LOCK(&basefd->lock);
+    {
+        activefd = fd_ref(basefd_ctx->activefd);
+    }
+    UNLOCK(&basefd->lock);
+
+    ret = syncop_fgetxattr(old_subvol, oldfd, &lockinfo, GF_XATTR_LOCKINFO_KEY,
+                           NULL, NULL);
+    if (ret < 0) {
+        gf_log(this->name, GF_LOG_WARNING,
+               "getting lockinfo failed while migrating locks"
+               "(oldfd:%p newfd:%p inode-gfid:%s)"
+               "(old-subvol:%s-%d new-subvol:%s-%d)",
+               oldfd, activefd, uuid_utoa(activefd->inode->gfid),
+               old_subvol->name, old_subvol->graph->id, new_subvol->name,
+               new_subvol->graph->id);
+        ret = -1;
+        goto done;
+    }
+
+    /* only the fetched pointer matters; the return code is overwritten */
+    ret = dict_get_ptr(lockinfo, GF_XATTR_LOCKINFO_KEY, &lockinfo_val);
+    if (!lockinfo_val) {
+        ret = 0;
+        gf_log(this->name, GF_LOG_INFO,
+               "No lockinfo present on any of the bricks "
+               "(oldfd: %p newfd:%p inode-gfid:%s) "
+               "(old-subvol:%s-%d new-subvol:%s-%d)",
+               oldfd, activefd, uuid_utoa(activefd->inode->gfid),
+               old_subvol->name, old_subvol->graph->id, new_subvol->name,
+               new_subvol->graph->id);
+
+        goto done;
+    }
+
+    ret = syncop_fsetxattr(new_subvol, activefd, lockinfo, 0, NULL, NULL);
+    if (ret < 0) {
+        gf_log(this->name, GF_LOG_WARNING,
+               "migrating locks failed (oldfd:%p newfd:%p "
+               "inode-gfid:%s) (old-subvol:%s-%d new-subvol:%s-%d)",
+               oldfd, activefd, uuid_utoa(activefd->inode->gfid),
+               old_subvol->name, old_subvol->graph->id, new_subvol->name,
+               new_subvol->graph->id);
+        ret = -1;
+    }
+
+done:
+    if (activefd)
+        fd_unref(activefd);
+
+    if (lockinfo)
+        dict_unref(lockinfo);
+
+    return ret;
 }
 
+/*
+ * Migrate one fd from old_subvol to new_subvol after a graph switch.
+ *
+ * The fd in use so far (basefd's active fd, or basefd itself) is flushed
+ * with fsync/fsyncdir if it still lives on old_subvol, a counterpart is
+ * opened on new_subvol via fuse_migrate_fd_open() and the locks are moved
+ * with fuse_migrate_locks(). Migration is deferred (0 is returned) while a
+ * create on the fd is still in progress, i.e. its inode has a null gfid.
+ * A failing fsync is only logged; it does not stop the migration.
+ *
+ * this:       the fuse xlator
+ * basefd:     fd known to the kernel whose backing fd is migrated
+ * old_subvol: subvolume of the previous graph
+ * new_subvol: subvolume of the new graph
+ *
+ * Returns a negative value if the open or the lock migration failed,
+ * otherwise the non-negative result of the last step.
+ */
+int
+fuse_migrate_fd(xlator_t *this, fd_t *basefd, xlator_t *old_subvol,
+                xlator_t *new_subvol)
+{
+    int ret = -1;
+    char create_in_progress = 0;
+    fuse_fd_ctx_t *basefd_ctx = NULL;
+    fd_t *oldfd = NULL;
+    dict_t *xdata = NULL;
+
+    basefd_ctx = fuse_fd_ctx_get(this, basefd);
+    GF_VALIDATE_OR_GOTO("glusterfs-fuse", basefd_ctx, out);
+
+    /* pin the fd that is currently serving basefd */
+    LOCK(&basefd->lock);
+    {
+        oldfd = basefd_ctx->activefd ? basefd_ctx->activefd : basefd;
+        fd_ref(oldfd);
+    }
+    UNLOCK(&basefd->lock);
+
+    LOCK(&oldfd->inode->lock);
+    {
+        if (gf_uuid_is_null(oldfd->inode->gfid)) {
+            create_in_progress = 1;
+        } else {
+            create_in_progress = 0;
+        }
+    }
+    UNLOCK(&oldfd->inode->lock);
+
+    if (create_in_progress) {
+        gf_log("glusterfs-fuse", GF_LOG_INFO,
+               "create call on fd (%p) is in progress "
+               "(basefd-ptr:%p basefd-inode.gfid:%s), "
+               "hence deferring migration till application does an "
+               "fd based operation on this fd"
+               "(old-subvolume:%s-%d, new-subvolume:%s-%d)",
+               oldfd, basefd, uuid_utoa(basefd->inode->gfid), old_subvol->name,
+               old_subvol->graph->id, new_subvol->name, new_subvol->graph->id);
+
+        ret = 0;
+        goto out;
+    }
+
+    if (oldfd->inode->table->xl == old_subvol) {
+        if (IA_ISDIR(oldfd->inode->ia_type)) {
+            ret = syncop_fsyncdir(old_subvol, oldfd, 0, NULL, NULL);
+        } else {
+            /* failing to build xdata is only logged; the fsync below is
+             * still issued with whatever xdata holds */
+            xdata = dict_new();
+            if (!xdata || dict_set_int8(xdata, "last-fsync", 1)) {
+                gf_log("glusterfs-fuse", GF_LOG_WARNING,
+                       "last-fsync set failed (%s) on fd (%p)"
+                       "(basefd:%p basefd-inode.gfid:%s) "
+                       "(old-subvolume:%s-%d new-subvolume:%s-%d)",
+                       strerror(ENOMEM), oldfd, basefd,
+                       uuid_utoa(basefd->inode->gfid), old_subvol->name,
+                       old_subvol->graph->id, new_subvol->name,
+                       new_subvol->graph->id);
+            }
+
+            ret = syncop_fsync(old_subvol, oldfd, 0, NULL, NULL, xdata, NULL);
+        }
 
-static void
-fuse_setlk (fuse_req_t req,
-            fuse_ino_t ino,
-            struct fuse_file_info *fi,
-            struct flock *lock,
-            int sleep)
-{
-        fuse_state_t *state;
-        fd_t *fd = NULL;
-        
-        fd = FI_TO_FD (fi);
-        state = state_from_req (req);
-        state->req = req;
-        state->fd = fd;
-
-        gf_log ("glusterfs-fuse", GF_LOG_DEBUG,
-                "%"PRId64": SETLK %p (sleep=%d)", req_callid (req), fd,
-                sleep);
-
-        FUSE_FOP (state, fuse_setlk_cbk, GF_FOP_LK,
-                  lk, fd, (sleep ? F_SETLKW : F_SETLK), lock);
+        if (ret < 0) {
+            gf_log("glusterfs-fuse", GF_LOG_WARNING,
+                   "syncop_fsync(dir) failed (%s) on fd (%p)"
+                   "(basefd:%p basefd-inode.gfid:%s) "
+                   "(old-subvolume:%s-%d new-subvolume:%s-%d)",
+                   strerror(-ret), oldfd, basefd,
+                   uuid_utoa(basefd->inode->gfid), old_subvol->name,
+                   old_subvol->graph->id, new_subvol->name,
+                   new_subvol->graph->id);
+        }
+    } else {
+        /* gfid is a binary uuid_t, not a C string: format it with
+         * uuid_utoa() before handing it to %s */
+        gf_log("glusterfs-fuse", GF_LOG_WARNING,
+               "basefd (ptr:%p inode-gfid:%s) was not "
+               "migrated during previous graph switch"
+               "(old-subvolume:%s-%d new-subvolume: %s-%d)",
+               basefd, uuid_utoa(basefd->inode->gfid), old_subvol->name,
+               old_subvol->graph->id, new_subvol->name, new_subvol->graph->id);
+    }
+
+    ret = fuse_migrate_fd_open(this, basefd, oldfd, old_subvol, new_subvol);
+    if (ret < 0) {
+        gf_log(this->name, GF_LOG_WARNING,
+               "open corresponding to "
+               "basefd (ptr:%p inode-gfid:%s) in new graph failed "
+               "(old-subvolume:%s-%d new-subvolume:%s-%d)",
+               basefd, uuid_utoa(basefd->inode->gfid), old_subvol->name,
+               old_subvol->graph->id, new_subvol->name, new_subvol->graph->id);
+        goto out;
+    }
+
+    ret = fuse_migrate_locks(this, basefd, oldfd, old_subvol, new_subvol);
+    if (ret < 0) {
+        gf_log(this->name, GF_LOG_WARNING,
+               "migrating locks from old-subvolume (%s-%d) to "
+               "new-subvolume (%s-%d) failed (inode-gfid:%s oldfd:%p "
+               "basefd:%p)",
+               old_subvol->name, old_subvol->graph->id, new_subvol->name,
+               new_subvol->graph->id, uuid_utoa(basefd->inode->gfid), oldfd,
+               basefd);
+    }
+out:
+    if (ret < 0) {
+        gf_log(this->name, GF_LOG_WARNING,
+               "migration of basefd "
+               "(ptr:%p inode-gfid:%s) failed"
+               "(old-subvolume:%s-%d new-subvolume:%s-%d)",
+               basefd, oldfd ? uuid_utoa(oldfd->inode->gfid) : NULL,
+               old_subvol->name, old_subvol->graph->id, new_subvol->name,
+               new_subvol->graph->id);
+    }
+
+    /* oldfd is still NULL when the ctx validation at the top bailed out */
+    if (oldfd)
+        fd_unref(oldfd);
+
+    if (xdata)
+        dict_unref(xdata);
+
+    return ret;
+}
 
-        return;
+/*
+ * Migrate every fd in the fuse fd table from old_subvol to new_subvol.
+ *
+ * Works on the array returned by gf_fd_fdtable_copy_all_fds(): each fd is
+ * run through fuse_migrate_fd() and the outcome is recorded in its fuse
+ * ctx (migration_failed). Afterwards every fd of the array is unref'ed and
+ * the array is freed.
+ *
+ * Always returns 0; per-fd failures are only visible through
+ * migration_failed.
+ */
+int
+fuse_handle_opened_fds(xlator_t *this, xlator_t *old_subvol,
+                       xlator_t *new_subvol)
+{
+    fuse_private_t *priv = this->private;
+    fdentry_t *entries = NULL;
+    fuse_fd_ctx_t *ctx = NULL;
+    fd_t *cur = NULL;
+    uint32_t nr_fds = 0;
+    int32_t status = 0;
+    int idx = 0;
+
+    entries = gf_fd_fdtable_copy_all_fds(priv->fdtable, &nr_fds);
+    if (entries == NULL)
+        return 0;
+
+    /* first pass: migrate and record the result per fd */
+    for (idx = 0; idx < nr_fds; idx++) {
+        cur = entries[idx].fd;
+        if (cur == NULL)
+            continue;
+
+        status = fuse_migrate_fd(this, cur, old_subvol, new_subvol);
+
+        ctx = fuse_fd_ctx_get(this, cur);
+        if (ctx == NULL)
+            continue;
+
+        LOCK(&cur->lock);
+        {
+            ctx->migration_failed = (status < 0) ? 1 : 0;
+        }
+        UNLOCK(&cur->lock);
+    }
+
+    /* second pass: drop one reference on every fd of the copied array */
+    for (idx = 0; idx < nr_fds; idx++) {
+        cur = entries[idx].fd;
+        if (cur != NULL)
+            fd_unref(cur);
+    }
+
+    GF_FREE(entries);
+
+    return 0;
 }
 
+/*
+ * Placeholder for handling blocked locks during a graph switch: none of
+ * the arguments is used and 0 is returned unconditionally.
+ */
+static int
+fuse_handle_blocked_locks(xlator_t *this, xlator_t *old_subvol,
+                          xlator_t *new_subvol)
+{
+    return 0;
+}
 
-static void 
-fuse_init (void *data, struct fuse_conn_info *conn)
+/*
+ * Task function passed to synctask_new() by fuse_handle_graph_switch().
+ *
+ * data is a fuse_graph_switch_args_t *; a NULL pointer makes the task a
+ * no-op. The results of the two handlers are ignored and 0 is always
+ * returned.
+ */
+static int
+fuse_graph_switch_task(void *data)
 {
-	return;
+    fuse_graph_switch_args_t *args = NULL;
+
+    args = data;
+    if (args == NULL) {
+        goto out;
+    }
+
+    /* don't change the order of handling open fds and blocked locks, since
+     * the act of opening files also reacquires granted locks in new graph.
+     */
+    fuse_handle_opened_fds(args->this, args->old_subvol, args->new_subvol);
+
+    fuse_handle_blocked_locks(args->this, args->old_subvol, args->new_subvol);
+
+out:
+    return 0;
 }
 
-static void
-fuse_destroy (void *data)
-{
-
-}
-
-static struct fuse_lowlevel_ops fuse_ops = {
-        .init         = fuse_init,
-        .destroy      = fuse_destroy,
-        .lookup       = fuse_lookup,
-        .forget       = fuse_forget,
-        .getattr      = fuse_getattr,
-        .setattr      = fuse_setattr,
-        .opendir      = fuse_opendir,
-        .readdir      = fuse_readdir,
-        .releasedir   = fuse_releasedir,
-        .access       = fuse_access,
-        .readlink     = fuse_readlink,
-        .mknod        = fuse_mknod,
-        .mkdir        = fuse_mkdir,
-        .unlink       = fuse_unlink,
-        .rmdir        = fuse_rmdir,
-        .symlink      = fuse_symlink,
-        .rename       = fuse_rename,
-        .link         = fuse_link,
-        .create       = fuse_create,
-        .open         = fuse_open,
-        .read         = fuse_readv,
-        .write        = fuse_write,
-        .flush        = fuse_flush,
-        .release      = fuse_release,
-        .fsync        = fuse_fsync,
-        .fsyncdir     = fuse_fsyncdir,
-        .statfs       = fuse_statfs,
-        .setxattr     = fuse_setxattr,
-        .getxattr     = fuse_getxattr,
-        .listxattr    = fuse_listxattr,
-        .removexattr  = fuse_removexattr,
-        .getlk        = fuse_getlk,
-        .setlk        = fuse_setlk
-};
+/*
+ * Allocate a zero-initialised fuse_graph_switch_args_t.
+ *
+ * Returns the new object, or NULL if the allocation failed. The object is
+ * released with fuse_graph_switch_args_destroy().
+ */
+fuse_graph_switch_args_t *
+fuse_graph_switch_args_alloc(void)
+{
+    fuse_graph_switch_args_t *args = NULL;
 
-int
-fuse_root_lookup_cbk (call_frame_t *frame,
-		      void *cookie,
-		      xlator_t *this,
-		      int32_t op_ret,
-		      int32_t op_errno,
-		      inode_t *inode,
-		      struct stat *buf,
-		      dict_t *xattr)
-{
-        fuse_private_t *priv = this->private;
-
-	if (op_ret == 0) {
-		gf_log (this->name, GF_LOG_DEBUG, 
-			"first lookup on root succeeded.");
-		inode_lookup (inode);
-	} else {
-		gf_log (this->name, GF_LOG_DEBUG, 
-			"first lookup on root failed.");
-	}
-	STACK_DESTROY (frame->root);
-	pthread_mutex_lock (&priv->first_call_mutex);
-	{
-		priv->first_call = 0;
-		pthread_cond_broadcast (&priv->first_call_cond);
-	}
-	pthread_mutex_unlock (&priv->first_call_mutex);
-	return 0;
+    args = GF_CALLOC(1, sizeof(*args), gf_fuse_mt_graph_switch_args_t);
+    /* both outcomes fall through to the same return of args */
+    if (args == NULL) {
+        goto out;
+    }
+
+out:
+    return args;
+}
+
+/*
+ * Free an argument block obtained from fuse_graph_switch_args_alloc().
+ * A NULL pointer is accepted and ignored.
+ */
+void
+fuse_graph_switch_args_destroy(fuse_graph_switch_args_t *args)
+{
+    if (args != NULL) {
+        GF_FREE(args);
+    }
 }
 
+/*
+ * Run fuse_graph_switch_task() through synctask_new() to migrate the state
+ * kept on old_subvol over to new_subvol.
+ *
+ * A call frame and an argument block are created for the task and both are
+ * released again before returning, whatever the outcome.
+ * NOTE(review): since they are released right after synctask_new() returns,
+ * this presumably relies on synctask_new() completing the task when no
+ * callback is passed -- confirm against its documentation.
+ *
+ * Returns 0 on success and -1 if the frame, the argument block or the
+ * synctask could not be set up.
+ */
+int
+fuse_handle_graph_switch(xlator_t *this, xlator_t *old_subvol,
+                         xlator_t *new_subvol)
+{
+    fuse_graph_switch_args_t *switch_args = NULL;
+    call_frame_t *task_frame = NULL;
+    int32_t status = -1;
+
+    task_frame = create_frame(this, this->ctx->pool);
+    if (!task_frame)
+        goto cleanup;
+
+    switch_args = fuse_graph_switch_args_alloc();
+    if (!switch_args)
+        goto cleanup;
+
+    switch_args->this = this;
+    switch_args->old_subvol = old_subvol;
+    switch_args->new_subvol = new_subvol;
+
+    status = synctask_new(this->ctx->env, fuse_graph_switch_task, NULL,
+                          task_frame, switch_args);
+    if (status == -1) {
+        gf_log(this->name, GF_LOG_WARNING,
+               "starting sync-task to "
+               "handle graph switch failed");
+    } else {
+        /* any other result is reported as success */
+        status = 0;
+    }
+
+cleanup:
+    if (switch_args) {
+        fuse_graph_switch_args_destroy(switch_args);
+    }
+
+    if (task_frame) {
+        STACK_DESTROY(task_frame->root);
+    }
+
+    return status;
+}
 
+/*
+ * Switch to the pending graph (priv->next_graph), if there is one.
+ *
+ * Under sync_mutex the top xlator of the pending graph becomes
+ * priv->active_subvol and the function waits on sync_cond until
+ * priv->event_recvd is set. Afterwards the first lookup is issued on the
+ * new graph and, when an old subvolume existed, open fds are migrated, the
+ * old subvolume is marked as switched, waiters on migrate_cond are woken
+ * up and GF_EVENT_PARENT_DOWN is sent to the old subvolume if it has no
+ * winds left.
+ *
+ * Always returns 0.
+ */
 int
-fuse_root_lookup (xlator_t *this)
+fuse_graph_sync(xlator_t *this)
 {
-	fuse_private_t *priv = NULL;
-	loc_t loc;
-	call_frame_t *frame = NULL;
-	xlator_t *xl = NULL;
-	dict_t *dict = NULL;
+    fuse_private_t *priv = NULL;
+    int need_first_lookup = 0;
+    int ret = 0;
+    int new_graph_id = 0;
+    xlator_t *old_subvol = NULL, *new_subvol = NULL;
+    uint64_t winds_on_old_subvol = 0;
+
+    priv = this->private;
+
+    pthread_mutex_lock(&priv->sync_mutex);
+    {
+        /* no graph is pending: nothing to do */
+        if (!priv->next_graph)
+            goto unlock;
+
+        old_subvol = priv->active_subvol;
+        new_subvol = priv->active_subvol = priv->next_graph->top;
+        new_graph_id = priv->next_graph->id;
+        priv->next_graph = NULL;
+        need_first_lookup = 1;
+        /* fd migration is only needed when a graph was active before */
+        if (old_subvol) {
+            priv->handle_graph_switch = _gf_true;
+        }
 
-	priv = this->private;
+        /* NOTE(review): the message below says "timedwait" although
+         * pthread_cond_wait() is called, and it prints errno although
+         * the failure is reported through ret */
+        while (!priv->event_recvd) {
+            ret = pthread_cond_wait(&priv->sync_cond, &priv->sync_mutex);
+            if (ret != 0) {
+                gf_log(this->name, GF_LOG_DEBUG,
+                       "timedwait returned non zero value "
+                       "ret: %d errno: %d",
+                       ret, errno);
+                break;
+            }
+        }
+    }
+unlock:
+    pthread_mutex_unlock(&priv->sync_mutex);
+
+    if (need_first_lookup) {
+        gf_log("fuse", GF_LOG_INFO, "switched to graph %d", new_graph_id);
+        fuse_first_lookup(this);
+    }
+
+    if ((old_subvol != NULL) && (new_subvol != NULL)) {
+        fuse_handle_graph_switch(this, old_subvol, new_subvol);
+
+        pthread_mutex_lock(&priv->sync_mutex);
+        {
+            /* sample the wind count under the same lock that marks the
+             * old subvolume as switched */
+            old_subvol->switched = 1;
+            winds_on_old_subvol = old_subvol->winds;
+            priv->handle_graph_switch = _gf_false;
+            pthread_cond_broadcast(&priv->migrate_cond);
+        }
+        pthread_mutex_unlock(&priv->sync_mutex);
 
-	pthread_cond_init (&priv->first_call_cond, NULL);
-	pthread_mutex_init (&priv->first_call_mutex, NULL);
+        if (winds_on_old_subvol == 0) {
+            xlator_notify(old_subvol, GF_EVENT_PARENT_DOWN, old_subvol, NULL);
+        }
+    }
+
+    return 0;
+}
 
-	loc.path = "/";
-	loc.name = "";
-	loc.ino = 1;
-	loc.inode = inode_search (this->itable, 1, NULL);
-	loc.parent = NULL;
+int
+fuse_get_mount_status(xlator_t *this)
+{
+    int kid_status = -1;
+    fuse_private_t *priv = this->private;
+
+    if (sys_read(priv->status_pipe[0], &kid_status, sizeof(kid_status)) < 0) {
+        gf_log(this->name, GF_LOG_ERROR, "could not get mount status");
+        kid_status = -1;
+    }
+    gf_log(this->name, GF_LOG_DEBUG, "mount status is %d", kid_status);
+
+    sys_close(priv->status_pipe[0]);
+    sys_close(priv->status_pipe[1]);
+    return kid_status;
+}
 
-	dict = dict_new();
-	frame = create_frame (this, this->ctx->pool);
-	frame->root->type = GF_OP_TYPE_FOP_REQUEST;
-	xl = this->children->xlator;
+static void
+fuse_dispatch(xlator_t *xl, gf_async_t *async)
+{
+    fuse_async_t *fasync;
+    fuse_private_t *priv;
+    fuse_in_header_t *finh;
+    struct iobuf *iobuf;
 
-	STACK_WIND (frame, fuse_root_lookup_cbk, xl, xl->fops->lookup, 
-		    &loc, dict);
-	dict_unref (dict);
+    priv = xl->private;
+    fasync = caa_container_of(async, fuse_async_t, async);
+    finh = fasync->finh;
+    iobuf = fasync->iobuf;
 
-	pthread_mutex_lock (&priv->first_call_mutex);
-	{
-		while (priv->first_call) {
-			pthread_cond_wait (&priv->first_call_cond, 
-					   &priv->first_call_mutex);
-		}
-	}
-	pthread_mutex_unlock (&priv->first_call_mutex);
+    priv->fuse_ops[finh->opcode](xl, finh, fasync->msg, iobuf);
 
-	return 0;
+    iobuf_unref(iobuf);
 }
 
+/* We need 512 bytes of extra buffer space for the BATCH_FORGET fop. Tests
+ * showed that this reduces the number of 'REALLOC()' calls in the loop */
+#define FUSE_EXTRA_ALLOC 512
 
 static void *
-fuse_thread_proc (void *data)
-{
-	char *mount_point = NULL;
-        xlator_t *this = data;
-        fuse_private_t *priv = this->private;
-        int32_t res = 0;
-        data_t *buf = priv->buf;
-        int32_t ref = 0;
-        size_t chan_size = fuse_chan_bufsize (priv->ch);
-        char *recvbuf = CALLOC (1, chan_size);
-        ERR_ABORT (recvbuf);
-
-        while (!fuse_session_exited (priv->se)) {
-
-                res = fuse_chan_receive (priv->ch,
-                                         recvbuf,
-                                         chan_size);
-
-		if (priv->first_call) {
-			fuse_root_lookup (this);
-		}
-
-                if (res == -1) {
-                        if (errno != EINTR) {
-                                gf_log ("glusterfs-fuse", GF_LOG_ERROR,
-                                        "fuse_chan_receive() returned -1 (%d)", errno);
-                        }
-                        if (errno == ENODEV)
-                                break;
-                        continue;
+fuse_thread_proc(void *data)
+{
+    char *mount_point = NULL;
+    xlator_t *this = NULL;
+    fuse_private_t *priv = NULL;
+    ssize_t res = 0;
+    struct iobuf *iobuf = NULL;
+    fuse_in_header_t *finh = NULL;
+    struct iovec iov_in[2] = {
+        {
+            0,
+        },
+    };
+
+    void *msg = NULL;
+    size_t msg0_size = sizeof(*finh) + sizeof(struct fuse_write_in);
+    fuse_async_t *fasync;
+    struct pollfd pfd[2] = {{
+        0,
+    }};
+    uint32_t psize;
+
+    this = data;
+    priv = this->private;
+
+    THIS = this;
+
+    psize = ((struct iobuf_pool *)this->ctx->iobuf_pool)->default_page_size;
+    priv->msg0_len_p = &msg0_size;
+
+    for (;;) {
+        /* THIS has to be reset here */
+        THIS = this;
+
+        pthread_mutex_lock(&priv->sync_mutex);
+        {
+            if (!priv->mount_finished) {
+                memset(pfd, 0, sizeof(pfd));
+                pfd[0].fd = priv->status_pipe[0];
+                pfd[0].events = POLLIN | POLLHUP | POLLERR;
+                pfd[1].fd = priv->fd;
+                pfd[1].events = POLLIN | POLLHUP | POLLERR;
+                if (poll(pfd, 2, -1) < 0) {
+                    gf_log(this->name, GF_LOG_ERROR, "poll error %s",
+                           strerror(errno));
+                    pthread_mutex_unlock(&priv->sync_mutex);
+                    break;
                 }
+                if (pfd[0].revents & POLLIN) {
+                    if (fuse_get_mount_status(this) != 0) {
+                        pthread_mutex_unlock(&priv->sync_mutex);
+                        break;
+                    }
+                    priv->mount_finished = _gf_true;
+                } else if (pfd[0].revents) {
+                    gf_log(this->name, GF_LOG_ERROR,
+                           "mount pipe closed without status");
+                    pthread_mutex_unlock(&priv->sync_mutex);
+                    break;
+                }
+                if (!pfd[1].revents) {
+                    pthread_mutex_unlock(&priv->sync_mutex);
+                    continue;
+                }
+            }
+        }
+        pthread_mutex_unlock(&priv->sync_mutex);
 
-                buf = priv->buf;
-
-                if (res && res != -1) {
-                        if (buf->len < (res)) {
-                                if (buf->data) {
-                                        FREE (buf->data);
-                                        buf->data = NULL;
-                                }
-                                buf->data = CALLOC (1, res);
-                                ERR_ABORT (buf->data);
-                                buf->len = res;
-                        }
-                        memcpy (buf->data, recvbuf, res); // evil evil
+        /*
+         * We don't want to block on readv while we're still waiting
+         * for mount status.  That means we only want to get here if
+         * mount_finished is true (meaning that our wait completed
+         * already) or if we already called poll(2) on priv->fd to
+         * make sure it's ready.
+         */
+
+        if (priv->init_recvd)
+            fuse_graph_sync(this);
+
+        /* TODO: This place should always get the maximum supported buffer
+           size from 'fuse', which is 128KB as of today. If we bring in
+           support for larger block sizes, then this should be changed
+           too */
+        iobuf = iobuf_get(this->ctx->iobuf_pool);
+
+        /* Add an extra 512 bytes to the first iov so that it can
+         * accommodate "ordinary" non-write requests. It's not
+         * guaranteed to be big enough, as SETXATTR and namespace
+         * operations with very long names may grow beyond it,
+         * but it's good enough in most cases (and we can handle
+         * the rest via realloc). */
+        iov_in[0].iov_base = GF_MALLOC(
+            sizeof(fuse_async_t) + msg0_size + FUSE_EXTRA_ALLOC,
+            gf_fuse_mt_iov_base);
+
+        if (!iobuf || !iov_in[0].iov_base) {
+            gf_log(this->name, GF_LOG_ERROR, "Out of memory");
+            if (iobuf)
+                iobuf_unref(iobuf);
+            GF_FREE(iov_in[0].iov_base);
+            sleep(10);
+            continue;
+        }
+
+        iov_in[1].iov_base = iobuf->ptr;
+
+        iov_in[0].iov_len = msg0_size;
+        iov_in[1].iov_len = psize;
 
-                        fuse_session_process (priv->se,
-                                              buf->data,
-                                              res,
-                                              priv->ch);
+        res = sys_readv(priv->fd, iov_in, 2);
+
+        if (res == -1) {
+            if (errno == ENODEV || errno == EBADF) {
+                gf_log("glusterfs-fuse", GF_LOG_DEBUG,
+                       "terminating upon getting %s when "
+                       "reading /dev/fuse",
+                       errno == ENODEV ? "ENODEV" : "EBADF");
+                fuse_log_eh(this,
+                            "glusterfs-fuse: terminating"
+                            " upon getting %s when "
+                            "reading /dev/fuse",
+                            errno == ENODEV ? "ENODEV" : "EBADF");
+                break;
+            }
+            if (errno != EINTR) {
+                gf_log("glusterfs-fuse", GF_LOG_WARNING,
+                       "read from /dev/fuse returned -1 (%s)", strerror(errno));
+                fuse_log_eh(this,
+                            "glusterfs-fuse: read from "
+                            "/dev/fuse returned -1 (%s)",
+                            strerror(errno));
+                if (errno == EPERM) {
+                    /*
+                     * sleep a while to avoid busy looping
+                     * on EPERM condition
+                     */
+                    nanosleep(
+                        &(struct timespec){0,
+                                           priv->fuse_dev_eperm_ratelimit_ns},
+                        NULL);
                 }
+            }
+
+            goto cont_err;
+        }
+        if (res < sizeof(*finh)) {
+            gf_log("glusterfs-fuse", GF_LOG_WARNING, "short read on /dev/fuse");
+            fuse_log_eh(this,
+                        "glusterfs-fuse: short read on "
+                        "/dev/fuse");
+            break;
+        }
 
-                LOCK (&buf->lock);
-                ref = buf->refcount;
-                UNLOCK (&buf->lock);
-                if (1) {
-                        data_unref (buf);
+        finh = (fuse_in_header_t *)iov_in[0].iov_base;
 
-                        priv->buf = data_ref (data_from_dynptr (NULL, 0));
+        if (res != finh->len
+#ifdef GF_DARWIN_HOST_OS
+            /* work around fuse4bsd/MacFUSE msg size miscalculation bug,
+             * that is, payload size is not taken into account for
+             * buffered writes
+             */
+            && !(finh->opcode == FUSE_WRITE &&
+                 finh->len == sizeof(*finh) + sizeof(struct fuse_write_in) &&
+                 res == finh->len + ((struct fuse_write_in *)(finh + 1))->size)
+#endif
+        ) {
+            gf_log("glusterfs-fuse", GF_LOG_WARNING,
+                   "inconsistent read on /dev/fuse");
+            fuse_log_eh(this,
+                        "glusterfs-fuse: inconsistent read "
+                        "on /dev/fuse");
+            break;
+        }
+
+        /*
+         * This can be moved around a bit, but it's important to do it
+         * *after* the readv.  Otherwise, a graph switch could occur
+         * while we're in readv and we'll process the next request on
+         * the old graph before we come to the part of the loop above
+         * readv and check again.  That would be wrong.
+         */
+        if (priv->init_recvd)
+            fuse_graph_sync(this);
+
+        if (finh->opcode == FUSE_WRITE)
+            msg = iov_in[1].iov_base;
+        else {
+            if (res > msg0_size + FUSE_EXTRA_ALLOC) {
+                void *b = GF_REALLOC(iov_in[0].iov_base,
+                                     sizeof(fuse_async_t) + res);
+                if (b) {
+                    iov_in[0].iov_base = b;
+                    finh = (fuse_in_header_t *)iov_in[0].iov_base;
+                } else {
+                    gf_log("glusterfs-fuse", GF_LOG_ERROR, "Out of memory");
+                    send_fuse_err(this, finh, ENOMEM);
+
+                    goto cont_err;
                 }
+            }
+
+            if (res > iov_in[0].iov_len) {
+                memcpy(iov_in[0].iov_base + iov_in[0].iov_len,
+                       iov_in[1].iov_base, res - iov_in[0].iov_len);
+                iov_in[0].iov_len = res;
+            }
+
+            msg = finh + 1;
         }
-	if (dict_get (this->options, ZR_MOUNTPOINT_OPT))
-		mount_point = data_to_str (dict_get (this->options, 
-						     ZR_MOUNTPOINT_OPT));
-	if (mount_point) {
-		gf_log (this->name, GF_LOG_WARNING, 
-			"unmounting %s", mount_point);
-		dict_del (this->options, ZR_MOUNTPOINT_OPT);
-	}
-	fuse_session_remove_chan (priv->ch);
-	fuse_session_destroy (priv->se);
-	//  fuse_unmount (priv->mount_point, priv->ch);
-	
-	raise (SIGTERM);
-	
-        return NULL;
+        if (priv->uid_map_root && finh->uid == priv->uid_map_root)
+            finh->uid = 0;
+
+        if (finh->opcode >= FUSE_OP_HIGH) {
+            /* turn down MacFUSE specific messages */
+            fuse_enosys(this, finh, msg, NULL);
+            iobuf_unref(iobuf);
+        } else {
+            fasync = iov_in[0].iov_base + iov_in[0].iov_len;
+            fasync->finh = finh;
+            fasync->msg = msg;
+            fasync->iobuf = iobuf;
+            gf_async(&fasync->async, this, fuse_dispatch);
+        }
+
+        continue;
+
+    cont_err:
+        iobuf_unref(iobuf);
+        GF_FREE(iov_in[0].iov_base);
+        iov_in[0].iov_base = NULL;
+    }
+
+    if (iov_in[0].iov_base)
+        GF_FREE(iov_in[0].iov_base);
+
+    /*
+     * We could be in all sorts of states with respect to iobuf and iov_in
+     * by the time we get here, and it's just not worth untangling them if
+     * we're about to kill ourselves anyway.
+     */
+
+    if (dict_get(this->options, ZR_MOUNTPOINT_OPT))
+        mount_point = data_to_str(dict_get(this->options, ZR_MOUNTPOINT_OPT));
+    if (mount_point) {
+        gf_log(this->name, GF_LOG_INFO, "initiating unmount of %s",
+               mount_point);
+    }
+
+    /* Kill the whole process, not just this thread. */
+    kill(getpid(), SIGTERM);
+    return NULL;
 }
 
+int32_t
+fuse_itable_dump(xlator_t *this)
+{
+    fuse_private_t *priv = NULL;
+
+    if (!this)
+        return -1;
+
+    priv = this->private;
+
+    if (priv && priv->active_subvol) {
+        gf_proc_dump_add_section("xlator.mount.fuse.itable");
+        inode_table_dump(priv->active_subvol->itable,
+                         "xlator.mount.fuse.itable");
+    }
+    return 0;
+}
 
 int32_t
-notify (xlator_t *this, int32_t event,
-        void *data, ...)
+fuse_priv_dump(xlator_t *this)
 {
-        int32_t         ret     = 0;
-        fuse_private_t *private = NULL;
+    fuse_private_t *private = NULL;
 
-        private = this->private;
-  
-        switch (event)
-        {
-        case GF_EVENT_CHILD_UP:
+    if (!this)
+        return -1;
 
-#ifndef GF_DARWIN_HOST_OS
-		/* 
-		 * This is because macfuse sends statfs() once the fuse thread
-                 * gets activated, and by that time if the client is not 
-		 * connected, it give 'Device not configured' error. Hence, 
-		 * create thread only when client sends CHILD_UP (ie, client 
-		 * is connected).
-		 */
-
-                /* TODO: somehow, try to get the mountpoint active as soon as 
-		 * init() is complete, so that the hang effect when the 
-		 * server is not not started is removed. 
-		 */
-		
-		/* This code causes problem with 'automount' too */
-		/* case GF_EVENT_CHILD_CONNECTING: */
-#endif /* DARWIN */
+   private
+    = this->private;
 
-        {
+    if (!private)
+        return -1;
+
+    gf_proc_dump_add_section("xlator.mount.fuse.priv");
+
+    gf_proc_dump_write("fd", "%d", private->fd);
+    gf_proc_dump_write("proto_minor", "%u", private->proto_minor);
+    gf_proc_dump_write("volfile", "%s",
+                       private->volfile ? private->volfile : "None");
+    gf_proc_dump_write("volfile_size", "%" GF_PRI_SIZET, private->volfile_size);
+    gf_proc_dump_write("mount_point", "%s", private->mount_point);
+    gf_proc_dump_write("fuse_thread_started", "%d",
+                       (int)private->fuse_thread_started);
+    gf_proc_dump_write("direct_io_mode", "%d", private->direct_io_mode);
+    gf_proc_dump_write("entry_timeout", "%lf", private->entry_timeout);
+    gf_proc_dump_write("attribute_timeout", "%lf", private->attribute_timeout);
+    gf_proc_dump_write("init_recvd", "%d", (int)private->init_recvd);
+    gf_proc_dump_write("strict_volfile_check", "%d",
+                       (int)private->strict_volfile_check);
+    gf_proc_dump_write("timed_response_thread_started", "%d",
+                       (int)private->timed_response_fuse_thread_started);
+    gf_proc_dump_write("reverse_thread_started", "%d",
+                       (int)private->reverse_fuse_thread_started);
+    gf_proc_dump_write("invalidate_limit", "%u", private->invalidate_limit);
+    gf_proc_dump_write("invalidate_queue_length", "%" PRIu64,
+                       private->invalidate_count);
+    gf_proc_dump_write("use_readdirp", "%d", private->use_readdirp);
+
+    return 0;
+}
+
+int
+fuse_history_dump(xlator_t *this)
+{
+    int ret = -1;
+    char key_prefix[GF_DUMP_MAX_BUF_LEN] = {
+        0,
+    };
+    fuse_private_t *priv = this->private;
+
+    GF_VALIDATE_OR_GOTO("fuse", this, out);
+
+    if (!priv->event_history)
+        goto out;
+
+    GF_VALIDATE_OR_GOTO(this->name, this->history, out);
+
+    gf_proc_dump_build_key(key_prefix, "xlator.mount.fuse", "history");
+    gf_proc_dump_add_section("%s", key_prefix);
+    eh_dump(this->history, NULL, dump_history_fuse);
+
+    ret = 0;
+out:
+    return ret;
+}
+
+int
+dump_history_fuse(circular_buffer_t *cb, void *data)
+{
+    char timestr[GF_TIMESTR_SIZE] = {
+        0,
+    };
+
+    gf_time_fmt_tv(timestr, sizeof timestr, &cb->tv, gf_timefmt_F_HMS);
+
+    gf_proc_dump_write("TIME", "%s", timestr);
+
+    gf_proc_dump_write("message", "%s\n", (char *)cb->data);
+
+    return 0;
+}
+
+int
+fuse_graph_setup(xlator_t *this, glusterfs_graph_t *graph)
+{
+    inode_table_t *itable = NULL;
+    int ret = 0, winds = 0;
+    fuse_private_t *priv = NULL;
+    glusterfs_graph_t *prev_graph = NULL;
+
+    priv = this->private;
 
-                if (!private->fuse_thread_started)
+    pthread_mutex_lock(&priv->sync_mutex);
+    {
+        /* 1. handle the case of more than one CHILD_UP on same graph.
+         * 2. make sure graph is newer than current active_subvol.
+         */
+        if ((priv->active_subvol == graph->top) || graph->used ||
+            ((priv->active_subvol) &&
+             (priv->active_subvol->graph->id > graph->id))) {
+            goto unlock;
+        }
+
+#if FUSE_KERNEL_MINOR_VERSION >= 11
+        itable = inode_table_with_invalidator(priv->lru_limit, graph->top,
+                                              fuse_inode_invalidate_fn, this);
+#else
+        itable = inode_table_new(0, graph->top);
+#endif
+        if (!itable) {
+            ret = -1;
+            goto unlock;
+        }
+
+        ((xlator_t *)graph->top)->itable = itable;
+
+        prev_graph = priv->next_graph;
+
+        if ((prev_graph != NULL) && (prev_graph->id > graph->id)) {
+            /* there was a race and an old graph was initialised
+             * before new one.
+             */
+            prev_graph = graph;
+        } else {
+            priv->next_graph = graph;
+            priv->event_recvd = 0;
+        }
+
+        if (prev_graph != NULL)
+            winds = ((xlator_t *)prev_graph->top)->winds;
+
+        /* set 'used' only after the next_graph update above, so it stays in
+         * the critical section and the error path bails out before it */
+        graph->used = 1;
+    }
+    pthread_mutex_unlock(&priv->sync_mutex);
+
+    if ((prev_graph != NULL) && (winds == 0)) {
+        xlator_notify(prev_graph->top, GF_EVENT_PARENT_DOWN, prev_graph->top,
+                      NULL);
+    }
+
+    return ret;
+unlock:
+    pthread_mutex_unlock(&priv->sync_mutex);
+
+    return ret;
+}
+
+int
+notify(xlator_t *this, int32_t event, void *data, ...)
+{
+    int i = 0;
+    int32_t ret = 0;
+    fuse_private_t *private = NULL;
+    gf_boolean_t start_thread = _gf_false;
+    glusterfs_graph_t *graph = NULL;
+    struct pollfd pfd = {0};
+
+   private
+    = this->private;
+
+    graph = data;
+
+    gf_log("fuse", GF_LOG_DEBUG, "got event %d on graph %d", event,
+           ((graph) ? graph->id : 0));
+
+    switch (event) {
+        case GF_EVENT_GRAPH_NEW:
+            break;
+
+        case GF_EVENT_CHILD_UP:
+        case GF_EVENT_CHILD_DOWN:
+        case GF_EVENT_CHILD_CONNECTING: {
+            if (graph) {
+                ret = fuse_graph_setup(this, graph);
+                if (ret)
+                    gf_log(this->name, GF_LOG_WARNING,
+                           "failed to setup the graph");
+            }
+
+            if ((event == GF_EVENT_CHILD_UP) ||
+                (event == GF_EVENT_CHILD_DOWN)) {
+                pthread_mutex_lock(&private->sync_mutex);
                 {
-                        private->fuse_thread_started = 1;
+                   private
+                    ->event_recvd = 1;
+                    pthread_cond_broadcast(&private->sync_cond);
+                }
+                pthread_mutex_unlock(&private->sync_mutex);
+            }
+
+            pthread_mutex_lock(&private->sync_mutex);
+            {
+                if (!private->fuse_thread_started) {
+                   private
+                    ->fuse_thread_started = 1;
+                    start_thread = _gf_true;
+                }
+            }
+            pthread_mutex_unlock(&private->sync_mutex);
+
+            if (start_thread) {
+               private
+                ->fuse_thread = GF_CALLOC(private->reader_thread_count,
+                                          sizeof(pthread_t),
+                                          gf_fuse_mt_pthread_t);
+                for (i = 0; i < private->reader_thread_count; i++) {
+                    ret = gf_thread_create(&private->fuse_thread[i], NULL,
+                                           fuse_thread_proc, this, "fuseproc");
+                    if (ret != 0) {
+                        gf_log(this->name, GF_LOG_DEBUG,
+                               "pthread_create() failed (%s)", strerror(errno));
+                        break;
+                    }
+                }
+            }
 
-                        ret = pthread_create (&private->fuse_thread, NULL,
-                                              fuse_thread_proc, this);
+            break;
+        }
 
-                        if (ret != 0)
-                                gf_log ("glusterfs-fuse", GF_LOG_ERROR,
-                                        "pthread_create() failed (%s)", strerror (errno));
-                        assert (ret == 0);
+        case GF_EVENT_AUTH_FAILED: {
+            /* Authentication failure is an error and glusterfs should stop */
+            gf_log(this->name, GF_LOG_ERROR,
+                   "Server authenication failed. Shutting down.");
+            pthread_mutex_lock(&private->sync_mutex);
+            {
+                /*Wait for mount to finish*/
+                if (!private->mount_finished) {
+                    pfd.fd = private->status_pipe[0];
+                    pfd.events = POLLIN | POLLHUP | POLLERR;
+                    if (poll(&pfd, 1, -1) < 0) {
+                        gf_log(this->name, GF_LOG_ERROR, "poll error %s",
+                               strerror(errno));
+                        goto auth_fail_unlock;
+                    }
+                    if (pfd.revents & POLLIN) {
+                        if (fuse_get_mount_status(this) != 0) {
+                            goto auth_fail_unlock;
+                        }
+                       private
+                        ->mount_finished = _gf_true;
+                    } else if (pfd.revents) {
+                        gf_log(this->name, GF_LOG_ERROR,
+                               "mount pipe closed without status");
+                        goto auth_fail_unlock;
+                    }
                 }
-                break;
+            }
+        auth_fail_unlock:
+            pthread_mutex_unlock(&private->sync_mutex);
+            fini(this);
+            break;
         }
-        case GF_EVENT_PARENT_UP:
+
+        default:
+            break;
+    }
+
+    return ret;
+}
+
+int32_t
+mem_acct_init(xlator_t *this)
+{
+    int ret = -1;
+
+    if (!this)
+        return ret;
+
+    ret = xlator_mem_acct_init(this, gf_fuse_mt_end + 1);
+
+    if (ret != 0) {
+        gf_log(this->name, GF_LOG_ERROR,
+               "Memory accounting init"
+               "failed");
+        return ret;
+    }
+
+    return ret;
+}
+
+static fuse_handler_t *fuse_std_ops[FUSE_OP_HIGH] = {
+    [FUSE_LOOKUP] = fuse_lookup,
+    [FUSE_FORGET] = fuse_forget,
+    [FUSE_GETATTR] = fuse_getattr,
+    [FUSE_SETATTR] = fuse_setattr,
+    [FUSE_READLINK] = fuse_readlink,
+    [FUSE_SYMLINK] = fuse_symlink,
+    [FUSE_MKNOD] = fuse_mknod,
+    [FUSE_MKDIR] = fuse_mkdir,
+    [FUSE_UNLINK] = fuse_unlink,
+    [FUSE_RMDIR] = fuse_rmdir,
+    [FUSE_RENAME] = fuse_rename,
+    [FUSE_LINK] = fuse_link,
+    [FUSE_OPEN] = fuse_open,
+    [FUSE_READ] = fuse_readv,
+    [FUSE_WRITE] = fuse_write,
+    [FUSE_STATFS] = fuse_statfs,
+    [FUSE_RELEASE] = fuse_release,
+    [FUSE_FSYNC] = fuse_fsync,
+    [FUSE_SETXATTR] = fuse_setxattr,
+    [FUSE_GETXATTR] = fuse_getxattr,
+    [FUSE_LISTXATTR] = fuse_listxattr,
+    [FUSE_REMOVEXATTR] = fuse_removexattr,
+    [FUSE_FLUSH] = fuse_flush,
+    [FUSE_INIT] = fuse_init,
+    [FUSE_OPENDIR] = fuse_opendir,
+    [FUSE_READDIR] = fuse_readdir,
+    [FUSE_RELEASEDIR] = fuse_releasedir,
+    [FUSE_FSYNCDIR] = fuse_fsyncdir,
+    [FUSE_GETLK] = fuse_getlk,
+    [FUSE_SETLK] = fuse_setlk,
+    [FUSE_SETLKW] = fuse_setlk,
+    [FUSE_ACCESS] = fuse_access,
+    [FUSE_CREATE] = fuse_create,
+    [FUSE_INTERRUPT] = fuse_interrupt,
+    /* [FUSE_BMAP] */
+    [FUSE_DESTROY] = fuse_destroy,
+/* [FUSE_IOCTL] */
+/* [FUSE_POLL] */
+/* [FUSE_NOTIFY_REPLY] */
+
+#if FUSE_KERNEL_MINOR_VERSION >= 16
+    [FUSE_BATCH_FORGET] = fuse_batch_forget,
+#endif
+
+#if FUSE_KERNEL_MINOR_VERSION >= 19
+#ifdef FALLOC_FL_KEEP_SIZE
+    [FUSE_FALLOCATE] = fuse_fallocate,
+#endif /* FALLOC_FL_KEEP_SIZE */
+#endif
+
+#if FUSE_KERNEL_MINOR_VERSION >= 21
+    [FUSE_READDIRPLUS] = fuse_readdirp,
+#endif
+
+#if FUSE_KERNEL_MINOR_VERSION >= 24 && HAVE_SEEK_HOLE
+    [FUSE_LSEEK] = fuse_lseek,
+#endif
+
+#if FUSE_KERNEL_MINOR_VERSION >= 28
+    [FUSE_COPY_FILE_RANGE] = fuse_copy_file_range,
+#endif
+};
+
+static fuse_handler_t *fuse_dump_ops[FUSE_OP_HIGH];
+
+static void
+fuse_dumper(xlator_t *this, fuse_in_header_t *finh, void *msg,
+            struct iobuf *iobuf)
+{
+    fuse_private_t *priv = NULL;
+    struct iovec diov[6] = {
         {
-                default_notify (this, GF_EVENT_PARENT_UP, data);
-                break;
+            0,
+        },
+    };
+    char r = 'R';
+    uint32_t fusedump_item_count = 3;
+    struct fusedump_timespec fts = {
+        0,
+    };
+    struct fusedump_signature fsig = {
+        0,
+    };
+
+    int ret = 0;
+
+    priv = this->private;
+
+    fusedump_setup_meta(diov, &r, &fusedump_item_count, &fts, &fsig);
+    diov[4] = (struct iovec){finh, sizeof(*finh)};
+    if (finh->opcode == FUSE_WRITE) {
+        /* WRITE has special data alignment, see comment in
+           fuse_write(). */
+        diov[4].iov_len += sizeof(struct fuse_write_in);
+    }
+    diov[5] = (struct iovec){msg, finh->len - diov[4].iov_len};
+
+    pthread_mutex_lock(&priv->fuse_dump_mutex);
+    ret = sys_writev(priv->fuse_dump_fd, diov, sizeof(diov) / sizeof(diov[0]));
+    pthread_mutex_unlock(&priv->fuse_dump_mutex);
+    if (ret == -1)
+        gf_log("glusterfs-fuse", GF_LOG_ERROR,
+               "failed to dump fuse message (R): %s", strerror(errno));
+
+    priv->fuse_ops0[finh->opcode](this, finh, msg, NULL);
+}
+
+int
+init(xlator_t *this_xl)
+{
+    int ret = 0;
+    dict_t *options = NULL;
+    char *value_string = NULL;
+    cmd_args_t *cmd_args = NULL;
+    char *fsname = NULL;
+    fuse_private_t *priv = NULL;
+    struct stat stbuf = {
+        0,
+    };
+    int i = 0;
+    int xl_name_allocated = 0;
+    int fsname_allocated = 0;
+    glusterfs_ctx_t *ctx = NULL;
+    gf_boolean_t sync_to_mount = _gf_false;
+    gf_boolean_t fopen_keep_cache = _gf_false;
+    char *mnt_args = NULL;
+    eh_t *event = NULL;
+
+    if (this_xl == NULL)
+        return -1;
+
+    if (this_xl->options == NULL)
+        return -1;
+
+    ctx = this_xl->ctx;
+    if (!ctx)
+        return -1;
+
+    options = this_xl->options;
+
+    if (this_xl->name == NULL) {
+        this_xl->name = gf_strdup("fuse");
+        if (!this_xl->name) {
+            gf_log("glusterfs-fuse", GF_LOG_ERROR, "Out of memory");
+
+            goto cleanup_exit;
         }
-        case GF_EVENT_VOLFILE_MODIFIED:
-        {
-                gf_log ("fuse", GF_LOG_CRITICAL, 
-                        "remote volume file changed, try re-mounting");
-                if (private->strict_volfile_check) {
-                        //fuse_session_remove_chan (private->ch);
-                        //fuse_session_destroy (private->se);
-                        //fuse_unmount (private->mount_point, private->ch);
-                        /* TODO: Above code if works, will be a cleaner way, 
-                           but for now, lets just achieve what we want */
-                        raise (SIGTERM);
-                }
-                break;
+        xl_name_allocated = 1;
+    }
+
+    priv = GF_CALLOC(1, sizeof(*priv), gf_fuse_mt_fuse_private_t);
+    if (!priv) {
+        gf_log("glusterfs-fuse", GF_LOG_ERROR, "Out of memory");
+
+        goto cleanup_exit;
+    }
+    this_xl->private = (void *)priv;
+    priv->mount_point = NULL;
+    priv->fd = -1;
+
+    INIT_LIST_HEAD(&priv->invalidate_list);
+    pthread_cond_init(&priv->invalidate_cond, NULL);
+    pthread_mutex_init(&priv->invalidate_mutex, NULL);
+
+    INIT_LIST_HEAD(&priv->timed_list);
+    pthread_cond_init(&priv->timed_cond, NULL);
+    pthread_mutex_init(&priv->timed_mutex, NULL);
+
+    INIT_LIST_HEAD(&priv->interrupt_list);
+    pthread_mutex_init(&priv->interrupt_mutex, NULL);
+
+    pthread_mutex_init(&priv->fusedev_errno_cnt_mutex, NULL);
+
+    /* get options from option dictionary */
+    ret = dict_get_str(options, ZR_MOUNTPOINT_OPT, &value_string);
+    if (ret == -1 || value_string == NULL) {
+        gf_log("fuse", GF_LOG_ERROR,
+               "Mandatory option 'mountpoint' is not specified.");
+        goto cleanup_exit;
+    }
+
+    if (sys_stat(value_string, &stbuf) != 0) {
+        if (errno == ENOENT) {
+            gf_log(this_xl->name, GF_LOG_ERROR, "%s %s does not exist",
+                   ZR_MOUNTPOINT_OPT, value_string);
+        } else if (errno == ENOTCONN) {
+            gf_log(this_xl->name, GF_LOG_ERROR,
+                   "Mountpoint %s seems to have a stale "
+                   "mount, run 'umount %s' and try again.",
+                   value_string, value_string);
+        } else {
+            gf_log(this_xl->name, GF_LOG_DEBUG, "%s %s : stat returned %s",
+                   ZR_MOUNTPOINT_OPT, value_string, strerror(errno));
         }
-        default:
+        goto cleanup_exit;
+    }
+
+    if (S_ISDIR(stbuf.st_mode) == 0) {
+        gf_log(this_xl->name, GF_LOG_ERROR, "%s %s is not a directory",
+               ZR_MOUNTPOINT_OPT, value_string);
+        goto cleanup_exit;
+    }
+    priv->mount_point = gf_strdup(value_string);
+    if (!priv->mount_point) {
+        gf_log("glusterfs-fuse", GF_LOG_ERROR, "Out of memory");
+
+        goto cleanup_exit;
+    }
+
+    GF_OPTION_INIT(ZR_ATTR_TIMEOUT_OPT, priv->attribute_timeout, double,
+                   cleanup_exit);
+
+    GF_OPTION_INIT("reader-thread-count", priv->reader_thread_count, uint32,
+                   cleanup_exit);
+
+    GF_OPTION_INIT("auto-invalidation", priv->fuse_auto_inval, bool,
+                   cleanup_exit);
+    GF_OPTION_INIT(ZR_ENTRY_TIMEOUT_OPT, priv->entry_timeout, double,
+                   cleanup_exit);
+
+    GF_OPTION_INIT(ZR_NEGATIVE_TIMEOUT_OPT, priv->negative_timeout, double,
+                   cleanup_exit);
+
+    GF_OPTION_INIT("client-pid", priv->client_pid, int32, cleanup_exit);
+    /* have to check & register the presence of client-pid manually */
+    priv->client_pid_set = !!dict_get(this_xl->options, "client-pid");
+
+    GF_OPTION_INIT("uid-map-root", priv->uid_map_root, uint32, cleanup_exit);
+
+    priv->direct_io_mode = 2;
+    ret = dict_get_str(options, ZR_DIRECT_IO_OPT, &value_string);
+    if (ret == 0) {
+        gf_boolean_t direct_io_mode_bool;
+        ret = gf_string2boolean(value_string, &direct_io_mode_bool);
+        if (ret == 0) {
+            priv->direct_io_mode = direct_io_mode_bool ? 1 : 0;
+        }
+    }
+
+    GF_OPTION_INIT(ZR_STRICT_VOLFILE_CHECK, priv->strict_volfile_check, bool,
+                   cleanup_exit);
+
+    GF_OPTION_INIT("acl", priv->acl, bool, cleanup_exit);
+
+    if (priv->uid_map_root)
+        priv->acl = 1;
+
+    GF_OPTION_INIT("selinux", priv->selinux, bool, cleanup_exit);
+
+    GF_OPTION_INIT("capability", priv->capability, bool, cleanup_exit);
+
+    GF_OPTION_INIT("read-only", priv->read_only, bool, cleanup_exit);
+
+    GF_OPTION_INIT("enable-ino32", priv->enable_ino32, bool, cleanup_exit);
+
+    GF_OPTION_INIT("use-readdirp", priv->use_readdirp, bool, cleanup_exit);
+
+    priv->fuse_dump_fd = -1;
+    ret = dict_get_str(options, "dump-fuse", &value_string);
+    if (ret == 0) {
+        ret = sys_unlink(value_string);
+        if (ret == -1 && errno != ENOENT) {
+            gf_log("glusterfs-fuse", GF_LOG_ERROR,
+                   "failed to remove old fuse dump file %s: %s", value_string,
+                   strerror(errno));
+
+            goto cleanup_exit;
+        }
+        ret = open(value_string, O_RDWR | O_CREAT | O_EXCL, S_IRUSR | S_IWUSR);
+        if (ret == -1) {
+            gf_log("glusterfs-fuse", GF_LOG_ERROR,
+                   "failed to open fuse dump file %s: %s", value_string,
+                   strerror(errno));
+
+            goto cleanup_exit;
+        }
+        priv->fuse_dump_fd = ret;
+    }
+
+    sync_to_mount = _gf_false;
+    ret = dict_get_str(options, "sync-to-mount", &value_string);
+    if (ret == 0) {
+        ret = gf_string2boolean(value_string, &sync_to_mount);
+        GF_ASSERT(ret == 0);
+    }
+
+    priv->fopen_keep_cache = 2;
+    if (dict_get(options, "fopen-keep-cache")) {
+        GF_OPTION_INIT("fopen-keep-cache", fopen_keep_cache, bool,
+                       cleanup_exit);
+        priv->fopen_keep_cache = fopen_keep_cache;
+    }
+
+    GF_OPTION_INIT("gid-timeout", priv->gid_cache_timeout, int32, cleanup_exit);
+
+    GF_OPTION_INIT("fuse-mountopts", priv->fuse_mountopts, str, cleanup_exit);
+
+    if (gid_cache_init(&priv->gid_cache, priv->gid_cache_timeout) < 0) {
+        gf_log("glusterfs-fuse", GF_LOG_ERROR,
+               "Failed to initialize "
+               "group cache.");
+        goto cleanup_exit;
+    }
+
+    GF_OPTION_INIT("resolve-gids", priv->resolve_gids, bool, cleanup_exit);
+
+    /* default values seemed to work fine during testing */
+    GF_OPTION_INIT("background-qlen", priv->background_qlen, int32,
+                   cleanup_exit);
+    GF_OPTION_INIT("congestion-threshold", priv->congestion_threshold, int32,
+                   cleanup_exit);
+
+    GF_OPTION_INIT("no-root-squash", priv->no_root_squash, bool, cleanup_exit);
+    /* change the client_pid to no-root-squash pid only if the
+       client is neither defrag process nor gsyncd process.
+    */
+    if (!priv->client_pid_set) {
+        if (priv->no_root_squash == _gf_true) {
+            priv->client_pid_set = _gf_true;
+            priv->client_pid = GF_CLIENT_PID_NO_ROOT_SQUASH;
+        }
+    }
+
+    GF_OPTION_INIT("lru-limit", priv->lru_limit, uint32, cleanup_exit);
+
+    GF_OPTION_INIT("invalidate-limit", priv->invalidate_limit, uint32,
+                   cleanup_exit);
+
+    GF_OPTION_INIT("event-history", priv->event_history, bool, cleanup_exit);
+
+    GF_OPTION_INIT("thin-client", priv->thin_client, bool, cleanup_exit);
+
+    /* Writeback cache support */
+    GF_OPTION_INIT("kernel-writeback-cache", priv->kernel_writeback_cache, bool,
+                   cleanup_exit);
+    GF_OPTION_INIT("attr-times-granularity", priv->attr_times_granularity,
+                   int32, cleanup_exit);
+
+    GF_OPTION_INIT("flush-handle-interrupt", priv->flush_handle_interrupt, bool,
+                   cleanup_exit);
+
+    GF_OPTION_INIT("fuse-dev-eperm-ratelimit-ns",
+                   priv->fuse_dev_eperm_ratelimit_ns, uint32, cleanup_exit);
+
+    /* user has set only background-qlen, not congestion-threshold,
+       use the fuse kernel driver formula to set congestion. ie, 75% */
+    if (dict_get(this_xl->options, "background-qlen") &&
+        !dict_get(this_xl->options, "congestion-threshold")) {
+        priv->congestion_threshold = (priv->background_qlen * 3) / 4;
+        gf_log(this_xl->name, GF_LOG_INFO,
+               "setting congestion control as 75%% of "
+               "background-queue length (ie, (.75 * %d) = %d",
+               priv->background_qlen, priv->congestion_threshold);
+    }
+
+    /* congestion should not be higher than background queue length */
+    if (priv->congestion_threshold > priv->background_qlen) {
+        gf_log(this_xl->name, GF_LOG_INFO,
+               "setting congestion control same as "
+               "background-queue length (%d)",
+               priv->background_qlen);
+        priv->congestion_threshold = priv->background_qlen;
+    }
+
+    cmd_args = &this_xl->ctx->cmd_args;
+    fsname = cmd_args->volfile;
+    if (!fsname && cmd_args->volfile_server) {
+        if (cmd_args->volfile_id) {
+            int dir_len = 0;
+            if (cmd_args->subdir_mount)
+                dir_len = strlen(cmd_args->subdir_mount) + 1;
+            fsname = GF_MALLOC(strlen(cmd_args->volfile_server) + 1 +
+                                   strlen(cmd_args->volfile_id) + 1 + dir_len,
+                               gf_fuse_mt_fuse_private_t);
+            if (!fsname) {
+                gf_log("glusterfs-fuse", GF_LOG_ERROR, "Out of memory");
+                goto cleanup_exit;
+            }
+            fsname_allocated = 1;
+            strcpy(fsname, cmd_args->volfile_server);
+            strcat(fsname, ":");
+            strcat(fsname, cmd_args->volfile_id);
+            if (dir_len)
+                strcat(fsname, cmd_args->subdir_mount);
+        } else
+            fsname = cmd_args->volfile_server;
+    }
+    if (!fsname)
+        fsname = "glusterfs";
+
+    priv->fdtable = gf_fd_fdtable_alloc();
+    if (priv->fdtable == NULL) {
+        gf_log("glusterfs-fuse", GF_LOG_ERROR, "Out of memory");
+        goto cleanup_exit;
+    }
+
+    gf_asprintf(&mnt_args, "%s%s%s%sallow_other,max_read=131072",
+                priv->acl ? "" : "default_permissions,",
+                priv->read_only ? "ro," : "",
+                priv->fuse_mountopts ? priv->fuse_mountopts : "",
+                priv->fuse_mountopts ? "," : "");
+    if (!mnt_args)
+        goto cleanup_exit;
+
+    {
+        char *mnt_tok = NULL;
+        token_iter_t tit = {
+            0,
+        };
+        gf_boolean_t iter_end = _gf_false;
+
+        for (mnt_tok = token_iter_init(mnt_args, ',', &tit);;) {
+            iter_end = next_token(&mnt_tok, &tit);
+
+            if (strcmp(mnt_tok, "auto_unmount") == 0) {
+                priv->auto_unmount = _gf_true;
+                drop_token(mnt_tok, &tit);
+            }
+
+            if (iter_end)
                 break;
         }
-        return 0;
+    }
+
+    if (pipe(priv->status_pipe) < 0) {
+        gf_log(this_xl->name, GF_LOG_ERROR,
+               "could not create pipe to separate mount process");
+        goto cleanup_exit;
+    }
+
+    priv->fd = gf_fuse_mount(priv->mount_point, fsname, mnt_args,
+                             sync_to_mount ? &ctx->mnt_pid : NULL,
+                             priv->status_pipe[1]);
+    if (priv->fd == -1)
+        goto cleanup_exit;
+    if (priv->auto_unmount) {
+        ret = gf_fuse_unmount_daemon(priv->mount_point, priv->fd);
+        if (ret == -1)
+            goto cleanup_exit;
+    }
+
+    if (priv->event_history) {
+        event = eh_new(FUSE_EVENT_HISTORY_SIZE, _gf_false, NULL);
+        if (!event) {
+            gf_log(this_xl->name, GF_LOG_ERROR,
+                   "could not create a new event history");
+            goto cleanup_exit;
+        }
+
+        this_xl->history = event;
+    }
+
+    pthread_mutex_init(&priv->fuse_dump_mutex, NULL);
+    pthread_cond_init(&priv->sync_cond, NULL);
+    pthread_cond_init(&priv->migrate_cond, NULL);
+    pthread_mutex_init(&priv->sync_mutex, NULL);
+    priv->event_recvd = 0;
+
+    for (i = 0; i < FUSE_OP_HIGH; i++) {
+        if (!fuse_std_ops[i])
+            fuse_std_ops[i] = fuse_enosys;
+        if (!fuse_dump_ops[i])
+            fuse_dump_ops[i] = fuse_dumper;
+    }
+    priv->fuse_ops = fuse_std_ops;
+    if (priv->fuse_dump_fd != -1) {
+        priv->fuse_ops0 = priv->fuse_ops;
+        priv->fuse_ops = fuse_dump_ops;
+    }
+
+    if (fsname_allocated)
+        GF_FREE(fsname);
+    GF_FREE(mnt_args);
+    return 0;
+
+cleanup_exit:
+    if (xl_name_allocated)
+        GF_FREE(this_xl->name);
+    if (fsname_allocated)
+        GF_FREE(fsname);
+    if (priv) {
+        GF_FREE(priv->mount_point);
+        if (priv->fd != -1)
+            sys_close(priv->fd);
+        if (priv->fuse_dump_fd != -1)
+            sys_close(priv->fuse_dump_fd);
+        GF_FREE(priv);
+    }
+    GF_FREE(mnt_args);
+    return -1;
 }
 
-int 
-init (xlator_t *this_xl)
+void
+fini(xlator_t *this_xl)
 {
-	int ret = 0;
-	dict_t *options = NULL;
-	char *value_string = NULL;
-	fuse_private_t *priv = NULL;
-	struct stat stbuf = {0,};
+    fuse_private_t *priv = NULL;
+    char *mount_point = NULL;
 
-#ifdef GF_DARWIN_HOST_OS
-        int fuse_argc = 9;
-	char *fuse_argv[] = {"glusterfs",
-			     "-o", "allow_other",
-			     "-o", "default_permissions",
-			     "-o", "fsname=glusterfs",
-			     "-o", "local",
-			     NULL};
-
-#elif GF_LINUX_HOST_OS /* ! DARWIN_OS */
-        int fuse_argc = 19;
-	
-	char *fuse_argv[] = {"glusterfs",
-			     "-o", "nonempty",
-			     "-o", "max_readahead=1048576",
-			     "-o", "max_read=1048576",
-			     "-o", "max_write=1048576",
-			     "-o", "allow_other",
-			     "-o", "default_permissions",
-			     "-o", "fsname=glusterfs",
-			     "-o", "dev",
-			     "-o", "suid",
-			     NULL};
-
-#else /* BSD || SOLARIS */
-	/* BSD fuse doesn't support '-o dev', '-o nonempty' option */
-        int fuse_argc = 15;
-	
-	char *fuse_argv[] = {"glusterfs",
-			     "-o", "max_readahead=1048576",
-			     "-o", "max_read=1048576",
-			     "-o", "max_write=1048576",
-			     "-o", "allow_other",
-			     "-o", "default_permissions",
-			     "-o", "fsname=glusterfs",
-			     "-o", "suid",
-			     NULL};
-
-#endif /* ! DARWIN_OS || ! LINUX */
-        struct fuse_args args = FUSE_ARGS_INIT (fuse_argc, fuse_argv);
-	
-	if (this_xl == NULL)
-		return -1;
-	
-	if (this_xl->options == NULL)
-		return -1;
-	
-	options = this_xl->options;
-	
-	if (this_xl->name == NULL)
-		this_xl->name = strdup ("fuse");
-	
-        priv = CALLOC (1, sizeof (*priv));
-        ERR_ABORT (priv);
-        this_xl->private = (void *) priv;
+    if (this_xl == NULL)
+        return;
 
+    if ((priv = this_xl->private) == NULL)
+        return;
 
-#ifdef GF_DARWIN_HOST_OS
-	if (dict_get (options, "macfuse-local")) {
-		/* This way, GlusterFS will be detected as 'servers' instead
-		 *  of 'devices'. This method is useful if you want to do 
-		 * 'umount <mount_point>' over network,  instead of 'eject'ing
-		 * it from desktop. Works better for servers 
-		 */
-		/* Make the '-o local' in argv as NULL, so that its not 
-		   in effect */
-		fuse_argv[--args.argc] = NULL;
-		fuse_argv[--args.argc] = NULL;
-	}
-#endif /* ! DARWIN */
-
-	/* get options from option dictionary */
-	ret = dict_get_str (options, ZR_MOUNTPOINT_OPT, &value_string);
-	if (value_string == NULL) {
-                gf_log ("fuse", GF_LOG_ERROR, 
-			"mandatory option mountpoint is not specified");
-		return -1;
-	}
-
-	if (stat (value_string, &stbuf) != 0) {
-		if (errno == ENOENT) {
-			gf_log (this_xl->name, GF_LOG_ERROR ,
-				"%s %s does not exist",
-				ZR_MOUNTPOINT_OPT, value_string);
-		} else if (errno == ENOTCONN) {
-			gf_log (this_xl->name, GF_LOG_ERROR ,
-				"mountpoint %s seems to have a stale "
-				"mount, run 'umount %s' and try again",
-				value_string, value_string);
-		} else {
-			gf_log (this_xl->name, GF_LOG_ERROR ,
-				"%s %s : stat returned %s",
-				ZR_MOUNTPOINT_OPT,
-				value_string, strerror (errno));
-		}
-		return -1;
-	}
-	
-	if (S_ISDIR (stbuf.st_mode) == 0) {
-		gf_log (this_xl->name, GF_LOG_ERROR ,
-			"%s %s is not a directory",
-			ZR_MOUNTPOINT_OPT, value_string);
-		return -1;
-	}
-	priv->mount_point = strdup (value_string);
-	
-	
-	ret = dict_get_double (options, "attribute-timeout", 
-			       &priv->attribute_timeout);
-	if (!priv->attribute_timeout)
-		priv->attribute_timeout = 1.0; /* default */
-	
-	ret = dict_get_double (options, "entry-timeout", 
-			       &priv->entry_timeout);
-	if (!priv->entry_timeout)
-		priv->entry_timeout = 1.0; /* default */
-	
-
-	priv->direct_io_mode = 1;
-	ret = dict_get_str (options, ZR_DIRECT_IO_OPT, &value_string);
-	if (value_string) {
-		ret = gf_string2boolean (value_string, &priv->direct_io_mode);
-	}
-
-        priv->strict_volfile_check = 0;
-	ret = dict_get_str (options, ZR_STRICT_VOLFILE_CHECK, &value_string);
-	if (value_string) {
-		ret = gf_string2boolean (value_string, 
-                                         &priv->strict_volfile_check);
-	}
-
-        priv->ch = fuse_mount (priv->mount_point, &args);
-        if (priv->ch == NULL) {
-                if (errno == ENOTCONN) {
-                        gf_log ("glusterfs-fuse", GF_LOG_ERROR,
-                                "A stale mount present on %s. "
-				"run 'umount %s' and try again",
-                                priv->mount_point, 
-				priv->mount_point);
-                } else {
-                        if (errno == ENOENT) {
-                                gf_log ("glusterfs-fuse", GF_LOG_ERROR, 
-                                        "unable to mount on %s. run "
-					"'modprobe fuse' and try again", 
-                                        priv->mount_point);
-                        } else {
-                                gf_log ("glusterfs-fuse", GF_LOG_ERROR, 
-                                        "fuse_mount() failed with error %s "
-					"on mount point %s", 
-                                        strerror (errno), 
-					priv->mount_point);
-                        }
-                }
-                
-                goto cleanup_exit;
+    pthread_mutex_lock(&priv->sync_mutex);
+    {
+        if (!(priv->fini_invoked)) {
+            priv->fini_invoked = _gf_true;
+        } else {
+            pthread_mutex_unlock(&priv->sync_mutex);
+            return;
         }
-        
-        priv->se = fuse_lowlevel_new (&args, &fuse_ops, 
-				      sizeof (fuse_ops), this_xl);
-        if (priv->se == NULL) {
-                gf_log ("glusterfs-fuse", GF_LOG_ERROR,
-                        "fuse_lowlevel_new() failed with error %s on "
-			"mount point %s", 
-                        strerror (errno), priv->mount_point);
-                goto umount_exit;
-        }
-        
-        ret = fuse_set_signal_handlers (priv->se);
-        if (ret == -1) {
-                gf_log ("glusterfs-fuse", GF_LOG_ERROR, 
-                        "fuse_set_signal_handlers() failed on mount point %s", 
-                        priv->mount_point);
-                goto umount_exit;
-        }
-        
-        fuse_opt_free_args (&args);
-        
-        fuse_session_add_chan (priv->se, priv->ch);
-        
-        priv->fd = fuse_chan_fd (priv->ch);
-        priv->buf = data_ref (data_from_dynptr (NULL, 0));
-
-        this_xl->ctx->top = this_xl;
-
-	priv->first_call = 1;
-        this_xl->itable = inode_table_new (0, this_xl);
-        return 0;
-        
-umount_exit: 
-        fuse_unmount (priv->mount_point, priv->ch);
-cleanup_exit:
-        fuse_opt_free_args (&args);
-        FREE (priv->mount_point);
-        FREE (priv);
-        return -1;
+    }
+    pthread_mutex_unlock(&priv->sync_mutex);
+
+    if (dict_get(this_xl->options, ZR_MOUNTPOINT_OPT))
+        mount_point = data_to_str(
+            dict_get(this_xl->options, ZR_MOUNTPOINT_OPT));
+    if (mount_point != NULL) {
+        if (!priv->auto_unmount) {
+            gf_log(this_xl->name, GF_LOG_INFO, "Unmounting '%s'.", mount_point);
+            gf_fuse_unmount(mount_point, priv->fd);
+        }
+
+        gf_log(this_xl->name, GF_LOG_INFO, "Closing fuse connection to '%s'.",
+               mount_point);
+
+        sys_close(priv->fuse_dump_fd);
+        dict_del(this_xl->options, ZR_MOUNTPOINT_OPT);
+    }
+    /* Process should terminate once fuse xlator is finished.
+     * Required for AUTH_FAILED event.
+     */
+    kill(getpid(), SIGTERM);
 }
 
+struct xlator_fops fops;
 
-void
-fini (xlator_t *this_xl)
-{
-        fuse_private_t *priv = NULL;
-	char *mount_point = NULL;
-	
-	if (this_xl == NULL)
-		return;
-	
-	if ((priv = this_xl->private) == NULL)
-		return;
-	
-	if (dict_get (this_xl->options, ZR_MOUNTPOINT_OPT))
-		mount_point = data_to_str (dict_get (this_xl->options, 
-						     ZR_MOUNTPOINT_OPT));
-	if (mount_point != NULL) {
-		gf_log (this_xl->name, GF_LOG_WARNING, 
-			"unmounting '%s'", mount_point);
-		
-		dict_del (this_xl->options, ZR_MOUNTPOINT_OPT);
-		fuse_session_exit (priv->se);
-		fuse_unmount (mount_point, priv->ch);
-	}
-}
-
-struct xlator_fops fops = {
-};
+struct xlator_cbks cbks = {.invalidate = fuse_invalidate,
+                           .forget = fuse_forget_cbk,
+                           .release = fuse_internal_release};
 
-struct xlator_cbks cbks = {
+struct xlator_dumpops dumpops = {
+    .priv = fuse_priv_dump,
+    .inode = fuse_itable_dump,
+    .history = fuse_history_dump,
 };
 
-struct xlator_mops mops = {
+struct volume_options options[] = {
+    {.key = {"direct-io-mode"}, .type = GF_OPTION_TYPE_BOOL},
+    {.key = {ZR_MOUNTPOINT_OPT, "mount-point"}, .type = GF_OPTION_TYPE_PATH},
+    {.key = {ZR_DUMP_FUSE, "fuse-dumpfile"}, .type = GF_OPTION_TYPE_PATH},
+    {.key = {ZR_ATTR_TIMEOUT_OPT},
+     .type = GF_OPTION_TYPE_DOUBLE,
+     .default_value = "1.0"},
+    {.key = {ZR_ENTRY_TIMEOUT_OPT},
+     .type = GF_OPTION_TYPE_DOUBLE,
+     .default_value = "1.0"},
+    {.key = {ZR_NEGATIVE_TIMEOUT_OPT},
+     .type = GF_OPTION_TYPE_DOUBLE,
+     .default_value = "0.0"},
+    {.key = {ZR_STRICT_VOLFILE_CHECK},
+     .type = GF_OPTION_TYPE_BOOL,
+     .default_value = "false"},
+    {.key = {"client-pid"}, .type = GF_OPTION_TYPE_INT},
+    {.key = {"uid-map-root"}, .type = GF_OPTION_TYPE_INT},
+    {.key = {"sync-to-mount"}, .type = GF_OPTION_TYPE_BOOL},
+    {.key = {"read-only"}, .type = GF_OPTION_TYPE_BOOL},
+    {.key = {"fopen-keep-cache"},
+     .type = GF_OPTION_TYPE_BOOL,
+     .default_value = "false"},
+    {.key = {"gid-timeout"},
+     .type = GF_OPTION_TYPE_INT,
+     .default_value = "300"},
+    {.key = {"resolve-gids"},
+     .type = GF_OPTION_TYPE_BOOL,
+     .default_value = "false"},
+    {.key = {"acl"}, .type = GF_OPTION_TYPE_BOOL, .default_value = "false"},
+    {.key = {"selinux"}, .type = GF_OPTION_TYPE_BOOL, .default_value = "false"},
+    {.key = {"enable-ino32"},
+     .type = GF_OPTION_TYPE_BOOL,
+     .default_value = "false"},
+    {
+        .key = {"background-qlen"},
+        .type = GF_OPTION_TYPE_INT,
+        .default_value = "64",
+        .min = 16,
+        .max = (64 * GF_UNIT_KB),
+    },
+    {
+        .key = {"congestion-threshold"},
+        .type = GF_OPTION_TYPE_INT,
+        .default_value = "48",
+        .min = 12,
+        .max = (64 * GF_UNIT_KB),
+    },
+    {.key = {"fuse-mountopts"}, .type = GF_OPTION_TYPE_STR},
+    {.key = {"use-readdirp"},
+     .type = GF_OPTION_TYPE_BOOL,
+     .default_value = "yes"},
+    {
+        .key = {"no-root-squash"},
+        .type = GF_OPTION_TYPE_BOOL,
+        .default_value = "false",
+        .description =
+            "This is the mount option for disabling the "
+            "root squash for the client irrespective of whether the "
+            "root-squash "
+            "option for the volume is set or not. But this option is honoured "
+            "only for the trusted clients. For non trusted clients this value "
+            "does not have any affect and the volume option for root-squash is "
+            "honoured.",
+    },
+    {.key = {"capability"},
+     .type = GF_OPTION_TYPE_BOOL,
+     .default_value = "false"},
+    {
+        .key = {"event-history"},
+        .type = GF_OPTION_TYPE_BOOL,
+        .default_value = "false",
+        .description = "This option can be used to enable or disable fuse "
+                       "event history.",
+    },
+    {
+        .key = {"thin-client"},
+        .type = GF_OPTION_TYPE_BOOL,
+        .default_value = "false",
+        .description = "Enables thin mount and connects via gfproxyd daemon.",
+    },
+    {
+        .key = {"reader-thread-count"},
+        .type = GF_OPTION_TYPE_INT,
+        .default_value = "1",
+        .min = 1,
+        .max = 64,
+        .description = "Sets fuse reader thread count.",
+    },
+    {
+        .key = {"kernel-writeback-cache"},
+        .type = GF_OPTION_TYPE_BOOL,
+        .default_value = "false",
+        .description = "Enables fuse in-kernel writeback cache.",
+    },
+    {
+        .key = {"attr-times-granularity"},
+        .type = GF_OPTION_TYPE_INT,
+        .default_value = "0",
+        .min = 0,
+        .max = 1000000000,
+        .description = "Supported granularity of file attribute times.",
+    },
+    {
+        .key = {"flush-handle-interrupt"},
+        .type = GF_OPTION_TYPE_BOOL,
+        .default_value = "false",
+        .description =
+            "Handle iterrupts in FLUSH handler (for testing purposes).",
+    },
+    {
+        .key = {"lru-limit"},
+        .type = GF_OPTION_TYPE_INT,
+        .default_value = "65536",
+        .min = 0,
+        .description = "makes glusterfs invalidate kernel inodes after "
+                       "reaching this limit (0 means 'unlimited')",
+    },
+    {
+        .key = {"invalidate-limit"},
+        .type = GF_OPTION_TYPE_INT,
+        .default_value = "0",
+        .min = 0,
+        .description = "suspend invalidations as of 'lru-limit' if the number "
+                       "of outstanding invalidations reaches this limit "
+                       "(0 means 'unlimited')",
+    },
+    {
+        .key = {"auto-invalidation"},
+        .type = GF_OPTION_TYPE_BOOL,
+        .default_value = "true",
+        .description = "controls whether fuse-kernel can auto-invalidate "
+                       "attribute, dentry and page-cache. Disable this only "
+                       "if same files/directories are not accessed across "
+                       "two different mounts concurrently",
+    },
+    {
+        .key = {"fuse-dev-eperm-ratelimit-ns"},
+        .type = GF_OPTION_TYPE_INT,
+        .default_value = "10000000", /* 0.01 sec */
+        .min = 0,
+        .max = 1000000000,
+        .description = "Rate limit reading from fuse device upon EPERM "
+                       "failure.",
+    },
+    {.key = {NULL}},
 };
 
-struct volume_options options[] = {
-	{ .key  = {"direct-io-mode"}, 
-	  .type = GF_OPTION_TYPE_BOOL 
-	},
-	{ .key  = {"macfuse-local"}, 
-	  .type = GF_OPTION_TYPE_BOOL 
-	},
-	{ .key  = {"mountpoint", "mount-point"}, 
-	  .type = GF_OPTION_TYPE_PATH 
-	},
-	{ .key  = {"attribute-timeout"}, 
-	  .type = GF_OPTION_TYPE_DOUBLE
-	},
-	{ .key  = {"entry-timeout"}, 
-	  .type = GF_OPTION_TYPE_DOUBLE
-	},
-	{ .key  = {"strict-volfile-check"}, 
-	  .type = GF_OPTION_TYPE_BOOL
-	},
-	{ .key = {NULL} },
+xlator_api_t xlator_api = {
+    .init = init,
+    .fini = fini,
+    .notify = notify,
+    .dumpops = &dumpops,
+    .mem_acct_init = mem_acct_init,
+    .op_version = {1}, /* Present from the initial version */
+    .fops = &fops,
+    .cbks = &cbks,
+    .options = options,
+    .identifier = "fuse",
+    .category = GF_MAINTAINED,
 };
diff --git a/xlators/mount/fuse/src/fuse-bridge.h b/xlators/mount/fuse/src/fuse-bridge.h
new file mode 100644
index 00000000000..4cb94c23cad
--- /dev/null
+++ b/xlators/mount/fuse/src/fuse-bridge.h
@@ -0,0 +1,544 @@
+/*
+   Copyright (c) 2006-2012 Red Hat, Inc. <http://www.redhat.com>
+   This file is part of GlusterFS.
+
+   This file is licensed to you under your choice of the GNU Lesser
+   General Public License, version 3 or any later version (LGPLv3 or
+   later), or the GNU General Public License, version 2 (GPLv2), in all
+   cases as published by the Free Software Foundation.
+*/
+#ifndef _GF_FUSE_BRIDGE_H_
+#define _GF_FUSE_BRIDGE_H_
+
+#include <stdint.h>
+#include <signal.h>
+#include <pthread.h>
+#include <stddef.h>
+#include <dirent.h>
+#include <sys/mount.h>
+#include <sys/time.h>
+#include <fnmatch.h>
+
+#include <glusterfs/glusterfs.h>
+#include <glusterfs/logging.h>
+#include <glusterfs/xlator.h>
+#include <glusterfs/defaults.h>
+#include <glusterfs/common-utils.h>
+#include <glusterfs/statedump.h>
+
+#ifdef GF_DARWIN_HOST_OS
+#include "fuse_kernel_macfuse.h"
+#else
+#include "fuse_kernel.h"
+#endif
+#include "fuse-misc.h"
+#include "fuse-mount.h"
+#include "fuse-mem-types.h"
+
+#include <glusterfs/list.h>
+#include <glusterfs/dict.h>
+#include <glusterfs/syncop.h>
+#include <glusterfs/gidcache.h>
+
+#if defined(GF_LINUX_HOST_OS) || defined(__FreeBSD__) || defined(__NetBSD__)
+
+/*
+ * TODO:
+ * So, with the addition of copy_file_range support, it might
+ * require a bump up of fuse kernel minor version (like it was
+ * done when support for lseek fop was added). But, as of now,
+ * the copy_file_range support has just landed in upstream
+ * kernel fuse module. So, until there is a release of that
+ * fuse as part of a kernel, the FUSE_KERNEL_MINOR_VERSION
+ * from fuse_kernel.h in the contrib might not be changed.
+ * If so, then the highest op available should be based on
+ * the current minor version (which is 24). So, selectively
+ * determine. When, the minor version is changed to 28 in
+ * fuse_kernel.h from contrib (because in upstream linux
+ * kernel source tree, the kernel minor version which
+ * contains support for copy_file_range is 28), then remove
+ * the reference to FUSE_LSEEK below and just determine
+ * FUSE_OP_HIGH based on copy_file_range.
+ */
+#if FUSE_KERNEL_MINOR_VERSION >= 28
+#define FUSE_OP_HIGH (FUSE_COPY_FILE_RANGE + 1)
+#else
+#define FUSE_OP_HIGH (FUSE_LSEEK + 1)
+#endif
+
+#endif
+#ifdef GF_DARWIN_HOST_OS
+#define FUSE_OP_HIGH (FUSE_DESTROY + 1)
+#endif
+#define GLUSTERFS_XATTR_LEN_MAX 65536
+
+#define MAX_FUSE_PROC_DELAY 1
+
+typedef struct fuse_in_header fuse_in_header_t;
+typedef void(fuse_handler_t)(xlator_t *this, fuse_in_header_t *finh, void *msg,
+                             struct iobuf *iobuf);
+
+enum fusedev_errno {
+    FUSEDEV_ENOENT,
+    FUSEDEV_ENOTDIR,
+    FUSEDEV_ENODEV,
+    FUSEDEV_EPERM,
+    FUSEDEV_ENOMEM,
+    FUSEDEV_ENOTCONN,
+    FUSEDEV_ECONNREFUSED,
+    FUSEDEV_EOVERFLOW,
+    FUSEDEV_EBUSY,
+    FUSEDEV_ENOTEMPTY,
+    FUSEDEV_EMAXPLUS
+};
+
+struct fuse_private {
+    int fd;
+    uint32_t proto_minor;
+    char *volfile;
+    size_t volfile_size;
+    char *mount_point;
+    struct iobuf *iobuf;
+
+    pthread_t *fuse_thread;
+    uint32_t reader_thread_count;
+    char fuse_thread_started;
+
+    uint32_t direct_io_mode;
+    size_t *msg0_len_p;
+
+    double entry_timeout;
+    double negative_timeout;
+    double attribute_timeout;
+
+    pthread_cond_t sync_cond;
+    pthread_mutex_t sync_mutex;
+    char event_recvd;
+
+    char init_recvd;
+
+    gf_boolean_t strict_volfile_check;
+
+    fuse_handler_t **fuse_ops;
+    fuse_handler_t **fuse_ops0;
+    pthread_mutex_t fuse_dump_mutex;
+    int fuse_dump_fd;
+
+    glusterfs_graph_t *next_graph;
+    xlator_t *active_subvol;
+
+    pid_t client_pid;
+    gf_boolean_t client_pid_set;
+    unsigned uid_map_root;
+    gf_boolean_t acl;
+    gf_boolean_t selinux;
+    gf_boolean_t read_only;
+    int32_t fopen_keep_cache;
+    int32_t gid_cache_timeout;
+    gf_boolean_t enable_ino32;
+    /* This is the mount option for disabling the root-squash for the
+       mount irrespective of whether the root-squash option for the
+       volume is set or not. But this option is honoured only for
+       the trusted clients. For non trusted clients this value does
+       not have any effect and the volume option for root-squash is
+       honoured.
+    */
+    gf_boolean_t no_root_squash;
+    fdtable_t *fdtable;
+    gid_cache_t gid_cache;
+    char *fuse_mountopts;
+
+    /* For fuse-reverse-validation */
+    struct list_head invalidate_list;
+    pthread_cond_t invalidate_cond;
+    pthread_mutex_t invalidate_mutex;
+    gf_boolean_t reverse_fuse_thread_started;
+    uint64_t invalidate_count;
+    /* For communicating with separate mount thread. */
+    int status_pipe[2];
+
+    /* for fuse queue length and congestion threshold */
+    int background_qlen;
+    int congestion_threshold;
+
+    /* for using fuse-kernel readdirp*/
+    gf_boolean_t use_readdirp;
+
+    /* fini started, helps prevent multiple epoll worker threads
+     * firing up the fini routine */
+    gf_boolean_t fini_invoked;
+
+    /* resolve gid with getgrouplist() instead of /proc/%d/status */
+    gf_boolean_t resolve_gids;
+
+    /* Enable or disable capability support */
+    gf_boolean_t capability;
+
+    /* Enable or disable event history */
+    gf_boolean_t event_history;
+
+    /* whether to run the unmount daemon */
+    gf_boolean_t auto_unmount;
+
+    /* Load the thin volfile, and connect to gfproxyd*/
+    gf_boolean_t thin_client;
+    gf_boolean_t mount_finished;
+    gf_boolean_t handle_graph_switch;
+    pthread_cond_t migrate_cond;
+
+    /* Writeback cache support */
+    gf_boolean_t kernel_writeback_cache;
+    int attr_times_granularity;
+
+    /* Delayed fuse response */
+    struct list_head timed_list;
+    pthread_cond_t timed_cond;
+    pthread_mutex_t timed_mutex;
+    gf_boolean_t timed_response_fuse_thread_started;
+
+    /* Interrupt subscription */
+    struct list_head interrupt_list;
+    pthread_mutex_t interrupt_mutex;
+
+    gf_boolean_t flush_handle_interrupt;
+    gf_boolean_t fuse_auto_inval;
+
+    /* LRU Limit, if not set, default is 64k for now */
+    uint32_t lru_limit;
+    uint32_t invalidate_limit;
+    uint32_t fuse_dev_eperm_ratelimit_ns;
+
+    /* counters for fusedev errnos */
+    uint8_t fusedev_errno_cnt[FUSEDEV_EMAXPLUS];
+    pthread_mutex_t fusedev_errno_cnt_mutex;
+};
+typedef struct fuse_private fuse_private_t;
+
+typedef uint64_t errnomask_t[2];
+#define MASK_ERRNO(mask, n) ((mask)[(n) >> 6] |= ((uint64_t)1 << ((n)&63)))
+#define GET_ERRNO_MASK(mask, n) ((mask)[(n) >> 6] & ((uint64_t)1 << ((n)&63)))
+#define ERRNOMASK_MAX (64 * (sizeof(errnomask_t) / sizeof(uint64_t)))
+
+#define INVAL_BUF_SIZE                                                         \
+    (sizeof(struct fuse_out_header) +                                          \
+     max(sizeof(struct fuse_notify_inval_inode_out),                           \
+         sizeof(struct fuse_notify_inval_entry_out) + NAME_MAX + 1))
+
+struct fuse_invalidate_node {
+    errnomask_t errnomask;
+    struct list_head next;
+    char inval_buf[INVAL_BUF_SIZE];
+};
+typedef struct fuse_invalidate_node fuse_invalidate_node_t;
+
+struct fuse_timed_message {
+    struct fuse_out_header fuse_out_header;
+    void *fuse_message_body;
+    struct timespec scheduled_ts;
+    errnomask_t errnomask;
+    struct list_head next;
+};
+typedef struct fuse_timed_message fuse_timed_message_t;
+
+enum fuse_interrupt_state {
+    INTERRUPT_NONE,
+    INTERRUPT_SQUELCHED,
+    INTERRUPT_HANDLED,
+    INTERRUPT_WAITING_HANDLER,
+};
+typedef enum fuse_interrupt_state fuse_interrupt_state_t;
+struct fuse_interrupt_record;
+typedef struct fuse_interrupt_record fuse_interrupt_record_t;
+typedef void (*fuse_interrupt_handler_t)(xlator_t *this,
+                                         fuse_interrupt_record_t *);
+struct fuse_interrupt_record {
+    fuse_in_header_t fuse_in_header;
+    void *data;
+    gf_boolean_t hit;
+    fuse_interrupt_state_t interrupt_state;
+    fuse_interrupt_handler_t interrupt_handler;
+    pthread_cond_t handler_cond;
+    pthread_mutex_t handler_mutex;
+    struct list_head next;
+};
+
+struct fuse_graph_switch_args {
+    xlator_t *this;
+    xlator_t *old_subvol;
+    xlator_t *new_subvol;
+};
+typedef struct fuse_graph_switch_args fuse_graph_switch_args_t;
+
+#define FUSE_EVENT_HISTORY_SIZE 1024
+
+#define _FH_TO_FD(fh) ((fd_t *)(uintptr_t)(fh))
+
+#define FH_TO_FD(fh) ((_FH_TO_FD(fh)) ? (fd_ref(_FH_TO_FD(fh))) : ((fd_t *)0))
+
+/* Use the same logic as the Linux NFS-client */
+#define GF_FUSE_SQUASH_INO(ino) (((uint32_t)ino) ^ (ino >> 32))
+
+#define FUSE_FOP(state, ret, op_num, fop, args...)                             \
+    do {                                                                       \
+        xlator_t *xl = NULL;                                                   \
+        call_frame_t *frame = NULL;                                            \
+                                                                               \
+        xl = state->active_subvol;                                             \
+        if (!xl) {                                                             \
+            gf_log_callingfn(state->this->name, GF_LOG_ERROR,                  \
+                             "No active subvolume");                           \
+            send_fuse_err(state->this, state->finh, ENOENT);                   \
+            free_fuse_state(state);                                            \
+            break;                                                             \
+        }                                                                      \
+                                                                               \
+        frame = get_call_frame_for_req(state);                                 \
+        if (!frame) {                                                          \
+            /* This is not completely clean, as some                           \
+             * earlier allocations might remain unfreed                        \
+             * if we return at this point, but still                           \
+             * better than trying to go on with a NULL                         \
+             * frame ...                                                       \
+             */                                                                \
+            send_fuse_err(state->this, state->finh, ENOMEM);                   \
+            free_fuse_state(state);                                            \
+            /* ideally, need to 'return', but let the */                       \
+            /* calling function take care of it */                             \
+            break;                                                             \
+        }                                                                      \
+                                                                               \
+        frame->root->state = state;                                            \
+        frame->root->op = op_num;                                              \
+        frame->op = op_num;                                                    \
+                                                                               \
+        if (state->this->history)                                              \
+            gf_log_eh("%" PRIu64                                               \
+                      ", %s, path: (%s), gfid: "                               \
+                      "(%s)",                                                  \
+                      frame->root->unique, gf_fop_list[frame->root->op],       \
+                      state->loc.path,                                         \
+                      (state->fd == NULL)                                      \
+                          ? uuid_utoa(state->loc.gfid)                         \
+                          : uuid_utoa(state->fd->inode->gfid));                \
+        STACK_WIND(frame, ret, xl, xl->fops->fop, args);                       \
+    } while (0)
+
+/* Pick a log level for an errno: DEBUG for ENOENT/ESTALE, ERROR otherwise. */
+#define GF_SELECT_LOG_LEVEL(_errno)                                            \
+    (((_errno == ENOENT) || (_errno == ESTALE)) ? GF_LOG_DEBUG : GF_LOG_ERROR)
+
+#define GET_STATE(this, finh, state)                                           \
+    do {                                                                       \
+        state = get_fuse_state(this, finh);                                    \
+        if (!state) {                                                          \
+            gf_log("glusterfs-fuse", GF_LOG_ERROR,                             \
+                   "FUSE message unique %" PRIu64                              \
+                   " opcode %d:"                                               \
+                   " state allocation failed",                                 \
+                   finh->unique, finh->opcode);                                \
+                                                                               \
+            send_fuse_err(this, finh, ENOMEM);                                 \
+            GF_FREE(finh);                                                     \
+                                                                               \
+            return;                                                            \
+        }                                                                      \
+    } while (0)
+
+#define FUSE_ENTRY_CREATE(this, priv, finh, state, fci, op)                    \
+    do {                                                                       \
+        if (priv->proto_minor >= 12)                                           \
+            state->mode &= ~fci->umask;                                        \
+        if (priv->proto_minor >= 12 && priv->acl) {                            \
+            state->xdata = dict_new();                                         \
+            if (!state->xdata) {                                               \
+                gf_log("glusterfs-fuse", GF_LOG_WARNING,                       \
+                       "%s failed to allocate "                                \
+                       "a param dictionary",                                   \
+                       op);                                                    \
+                send_fuse_err(this, finh, ENOMEM);                             \
+                free_fuse_state(state);                                        \
+                return;                                                        \
+            }                                                                  \
+            state->umask = fci->umask;                                         \
+        }                                                                      \
+    } while (0)
+
+#define fuse_log_eh_fop(this, state, frame, op_ret, op_errno)                  \
+    do {                                                                       \
+        fuse_private_t *priv = this->private;                                  \
+        if (this->history && priv->event_history) {                            \
+            if (state->fd)                                                     \
+                gf_log_eh(                                                     \
+                    "op_ret: %d, op_errno: %d, "                               \
+                    "%" PRIu64 ", %s () => %p, gfid: %s",                      \
+                    op_ret, op_errno, frame->root->unique,                     \
+                    gf_fop_list[frame->root->op], state->fd,                   \
+                    uuid_utoa(state->fd->inode->gfid));                        \
+            else                                                               \
+                gf_log_eh(                                                     \
+                    "op_ret: %d, op_errno: %d, "                               \
+                    "%" PRIu64 ", %s () => %s, gfid: %s",                      \
+                    op_ret, op_errno, frame->root->unique,                     \
+                    gf_fop_list[frame->root->op], state->loc.path,             \
+                    uuid_utoa(state->loc.gfid));                               \
+        }                                                                      \
+    } while (0)
+
+#define fuse_log_eh(this, args...)                                             \
+    do {                                                                       \
+        fuse_private_t *priv = this->private;                                  \
+        if (this->history && priv->event_history)                              \
+            gf_log_eh(args);                                                   \
+    } while (0)
+
+/* Return the subvolume recorded as active in the fuse xlator's private data */
+static inline xlator_t *
+fuse_active_subvol(xlator_t *fuse)
+{
+    /* fuse->private is read as this translator's fuse_private_t */
+    fuse_private_t *fuse_priv = fuse->private;
+
+    return fuse_priv->active_subvol;
+}
+
+typedef enum {
+    RESOLVE_MUST = 1,
+    RESOLVE_NOT,
+    RESOLVE_MAY,
+    RESOLVE_DONTCARE,
+    RESOLVE_EXACT
+} fuse_resolve_type_t;
+
+typedef struct {
+    fuse_resolve_type_t type;
+    fd_t *fd;
+    char *path;
+    char *bname;
+    u_char gfid[16];
+    inode_t *hint;
+    u_char pargfid[16];
+    inode_t *parhint;
+    int op_ret;
+    int op_errno;
+    loc_t resolve_loc;
+} fuse_resolve_t;
+
+typedef struct {
+    void *pool;
+    xlator_t *this;
+    xlator_t *active_subvol;
+    inode_table_t *itable;
+    loc_t loc;
+    loc_t loc2;
+    fuse_in_header_t *finh;
+    int32_t flags;
+
+    off_t off;
+    /*
+     * The man page of copy_file_range declares the offset arguments
+     * as loff_t *. The fuse state stores the offset values rather
+     * than pointers, because the kernel sends the values those
+     * pointers refer to, not the pointers themselves.
+     * The type loff_t is Linux-specific and is actually a typedef of
+     * off64_t, hence off64_t is used here.
+     */
+    off64_t off_in;  /* for copy_file_range source fd */
+    off64_t off_out; /* for copy_file_range destination fd */
+    size_t size;
+    unsigned long nlookup;
+    fd_t *fd;
+    fd_t *fd_dst; /* for copy_file_range destination */
+    dict_t *xattr;
+    dict_t *xdata;
+    char *name;
+    char is_revalidate;
+    gf_boolean_t truncate_needed;
+    gf_lock_t lock;
+    uint64_t lk_owner;
+
+    /* used within resolve_and_resume */
+    /* */
+    fuse_resolve_t resolve;
+    fuse_resolve_t resolve2;
+
+    loc_t *loc_now;
+    fuse_resolve_t *resolve_now;
+
+    void *resume_fn;
+
+    int valid;
+    int mask;
+    dev_t rdev;
+    mode_t mode;
+    mode_t umask;
+    struct iatt attr;
+    struct gf_flock lk_lock;
+    struct iovec vector;
+
+    uuid_t gfid;
+    uint32_t io_flags;
+    int32_t fd_no;
+
+    gf_seek_what_t whence;
+    struct iobuf *iobuf;
+} fuse_state_t;
+
+typedef struct {
+    uint32_t open_flags;
+    char migration_failed;
+    fd_t *activefd;
+} fuse_fd_ctx_t;
+
+typedef void (*fuse_resume_fn_t)(fuse_state_t *state);
+
+GF_MUST_CHECK int32_t
+fuse_loc_fill(loc_t *loc, fuse_state_t *state, ino_t ino, ino_t par,
+              const char *name);
+call_frame_t *
+get_call_frame_for_req(fuse_state_t *state);
+fuse_state_t *
+get_fuse_state(xlator_t *this, fuse_in_header_t *finh);
+void
+free_fuse_state(fuse_state_t *state);
+void
+gf_fuse_stat2attr(struct iatt *st, struct fuse_attr *fa,
+                  gf_boolean_t enable_ino32);
+void
+gf_fuse_fill_dirent(gf_dirent_t *entry, struct fuse_dirent *fde,
+                    gf_boolean_t enable_ino32);
+uint64_t
+inode_to_fuse_nodeid(inode_t *inode);
+xlator_t *
+fuse_active_subvol(xlator_t *fuse);
+inode_t *
+fuse_ino_to_inode(uint64_t ino, xlator_t *fuse);
+int
+send_fuse_err(xlator_t *this, fuse_in_header_t *finh, int error);
+int
+fuse_gfid_set(fuse_state_t *state);
+int
+fuse_flip_xattr_ns(struct fuse_private *priv, char *okey, char **nkey);
+fuse_fd_ctx_t *
+__fuse_fd_ctx_check_n_create(xlator_t *this, fd_t *fd);
+fuse_fd_ctx_t *
+fuse_fd_ctx_check_n_create(xlator_t *this, fd_t *fd);
+
+int
+fuse_resolve_and_resume(fuse_state_t *state, fuse_resume_fn_t fn);
+int
+fuse_resolve_inode_init(fuse_state_t *state, fuse_resolve_t *resolve,
+                        ino_t ino);
+int
+fuse_resolve_entry_init(fuse_state_t *state, fuse_resolve_t *resolve, ino_t par,
+                        char *name);
+int
+fuse_resolve_fd_init(fuse_state_t *state, fuse_resolve_t *resolve, fd_t *fd);
+int
+fuse_ignore_xattr_set(fuse_private_t *priv, char *key);
+void
+fuse_fop_resume(fuse_state_t *state);
+int
+dump_history_fuse(circular_buffer_t *cb, void *data);
+int
+fuse_check_selinux_cap_xattr(fuse_private_t *priv, char *name);
+#endif /* _GF_FUSE_BRIDGE_H_ */
diff --git a/xlators/mount/fuse/src/fuse-extra.c b/xlators/mount/fuse/src/fuse-extra.c
deleted file mode 100644
index 95bd0f3ad1c..00000000000
--- a/xlators/mount/fuse/src/fuse-extra.c
+++ /dev/null
@@ -1,137 +0,0 @@
-/*
-   Copyright (c) 2006-2009 Z RESEARCH, Inc. <http://www.zresearch.com>
-   This file is part of GlusterFS.
-
-   GlusterFS is free software; you can redistribute it and/or modify
-   it under the terms of the GNU General Public License as published
-   by the Free Software Foundation; either version 3 of the License,
-   or (at your option) any later version.
-
-   GlusterFS is distributed in the hope that it will be useful, but
-   WITHOUT ANY WARRANTY; without even the implied warranty of
-   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
-   General Public License for more details.
-
-   You should have received a copy of the GNU General Public License
-   along with this program.  If not, see
-   <http://www.gnu.org/licenses/>.
-*/
-
-#ifndef _CONFIG_H
-#define _CONFIG_H
-#include "config.h"
-#endif /* _CONFIG_H */
-
-#include "fuse-extra.h"
-#include "common-utils.h"
-#include <stdio.h>
-#include <pthread.h>
-#include <stdlib.h>
-#include <string.h>
-#include "common-utils.h"
-
-struct fuse_req;
-struct fuse_ll;
-
-struct fuse_req {
-    struct fuse_ll *f;
-    uint64_t unique;
-    int ctr;
-    pthread_mutex_t lock;
-    struct fuse_ctx ctx;
-    struct fuse_chan *ch;
-    int interrupted;
-    union {
-        struct {
-            uint64_t unique;
-        } i;
-        struct {
-            fuse_interrupt_func_t func;
-            void *data;
-        } ni;
-    } u;
-    struct fuse_req *next;
-    struct fuse_req *prev;
-};
-
-struct fuse_ll {
-    int debug;
-    int allow_root;
-    struct fuse_lowlevel_ops op;
-    int got_init;
-    void *userdata;
-    uid_t owner;
-    struct fuse_conn_info conn;
-    struct fuse_req list;
-    struct fuse_req interrupts;
-    pthread_mutex_t lock;
-    int got_destroy;
-};
-
-struct fuse_out_header {
-  uint32_t   len;
-  int32_t    error;
-  uint64_t   unique;
-};
-
-uint64_t req_callid (fuse_req_t req)
-{
-  return req->unique;
-}
-
-static void destroy_req(fuse_req_t req)
-{
-    pthread_mutex_destroy (&req->lock);
-    FREE (req);
-}
-
-static void list_del_req(struct fuse_req *req)
-{
-    struct fuse_req *prev = req->prev;
-    struct fuse_req *next = req->next;
-    prev->next = next;
-    next->prev = prev;
-}
-
-static void
-free_req (fuse_req_t req)
-{
-  int ctr;
-  struct fuse_ll *f = req->f;
-  
-  pthread_mutex_lock(&req->lock);
-  req->u.ni.func = NULL;
-  req->u.ni.data = NULL;
-  pthread_mutex_unlock(&req->lock);
-
-  pthread_mutex_lock(&f->lock);
-  list_del_req(req);
-  ctr = --req->ctr;
-  pthread_mutex_unlock(&f->lock);
-  if (!ctr)
-    destroy_req(req);
-}
-
-int32_t
-fuse_reply_vec (fuse_req_t req,
-		struct iovec *vector,
-		int32_t count)
-{
-  int32_t error = 0;
-  struct fuse_out_header out;
-  struct iovec *iov;
-  int res;
-
-  iov = alloca ((count + 1) * sizeof (*vector));
-  out.unique = req->unique;
-  out.error = error;
-  iov[0].iov_base = &out;
-  iov[0].iov_len = sizeof(struct fuse_out_header);
-  memcpy (&iov[1], vector, count * sizeof (*vector));
-  count++;
-  out.len = iov_length(iov, count);
-  res = fuse_chan_send(req->ch, iov, count);
-  free_req(req);
-
-  return res;
-}
diff --git a/xlators/mount/fuse/src/fuse-extra.h b/xlators/mount/fuse/src/fuse-extra.h
deleted file mode 100644
index 5688e34c76d..00000000000
--- a/xlators/mount/fuse/src/fuse-extra.h
+++ /dev/null
@@ -1,42 +0,0 @@
-/*
-   Copyright (c) 2007-2009 Z RESEARCH, Inc. <http://www.zresearch.com>
-   This file is part of GlusterFS.
-
-   GlusterFS is free software; you can redistribute it and/or modify
-   it under the terms of the GNU General Public License as published
-   by the Free Software Foundation; either version 3 of the License,
-   or (at your option) any later version.
-
-   GlusterFS is distributed in the hope that it will be useful, but
-   WITHOUT ANY WARRANTY; without even the implied warranty of
-   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
-   General Public License for more details.
-
-   You should have received a copy of the GNU General Public License
-   along with this program.  If not, see
-   <http://www.gnu.org/licenses/>.
-*/
-
-#ifndef _FUSE_EXTRA_H
-#define _FUSE_EXTRA_H
-
-#ifndef _CONFIG_H
-#define _CONFIG_H
-#include "config.h"
-#endif /* _CONFIG_H */
-
-#include <stdlib.h>
-#include <fuse/fuse_lowlevel.h>
-
-#define GLUSTERFS_XATTR_LEN_MAX  65536
-
-uint64_t req_callid (fuse_req_t req);
-
-size_t fuse_dirent_size (size_t dname_len);
-
-int32_t
-fuse_reply_vec (fuse_req_t req,
-		struct iovec *vector,
-		int32_t count);
-
-#endif /* _FUSE_EXTRA_H */
diff --git a/xlators/mount/fuse/src/fuse-helpers.c b/xlators/mount/fuse/src/fuse-helpers.c
new file mode 100644
index 00000000000..a2b0ad11fe4
--- /dev/null
+++ b/xlators/mount/fuse/src/fuse-helpers.c
@@ -0,0 +1,688 @@
+/*
+   Copyright (c) 2010-2012 Red Hat, Inc. <http://www.redhat.com>
+   This file is part of GlusterFS.
+
+   This file is licensed to you under your choice of the GNU Lesser
+   General Public License, version 3 or any later version (LGPLv3 or
+   later), or the GNU General Public License, version 2 (GPLv2), in all
+   cases as published by the Free Software Foundation.
+*/
+#ifdef __NetBSD__
+#define _KMEMUSER
+#endif
+
+#if defined(GF_SOLARIS_HOST_OS)
+#include <sys/procfs.h>
+#elif defined(__FreeBSD__)
+#include <sys/types.h>
+#include <libutil.h>
+#elif defined(CTL_KERN)
+#include <sys/sysctl.h>
+#endif
+#include <pwd.h>
+
+#include "fuse-bridge.h"
+
+/* Release everything a fuse_resolve_t owns: the path and bname strings, the
+ * fd reference, resolve_loc, and the hint/parhint inode references. */
+static void
+fuse_resolve_wipe(fuse_resolve_t *resolve)
+{
+    GF_FREE((void *)resolve->path);
+    GF_FREE((void *)resolve->bname);
+
+    if (resolve->fd != NULL)
+        fd_unref(resolve->fd);
+    loc_wipe(&resolve->resolve_loc);
+
+    /* inode hints are unreffed and cleared; the fd pointer is left as is */
+    if (resolve->hint != NULL) {
+        inode_unref(resolve->hint);
+        resolve->hint = NULL;
+    }
+    if (resolve->parhint != NULL) {
+        inode_unref(resolve->parhint);
+        resolve->parhint = NULL;
+    }
+}
+
+void
+free_fuse_state(fuse_state_t *state)
+{
+    xlator_t *this = NULL;
+    fuse_private_t *priv = NULL;
+    uint64_t winds = 0;
+    char switched = 0;
+
+    this = state->this;
+
+    priv = this->private;
+
+    loc_wipe(&state->loc);
+
+    loc_wipe(&state->loc2);
+
+    if (state->xdata) {
+        dict_unref(state->xdata);
+        state->xdata = (void *)0xaaaaeeee;
+    }
+    if (state->xattr)
+        dict_unref(state->xattr);
+
+    if (state->name) {
+        GF_FREE(state->name);
+        state->name = NULL;
+    }
+    if (state->fd) {
+        fd_unref(state->fd);
+        state->fd = (void *)0xfdfdfdfd;
+    }
+    if (state->finh) {
+        GF_FREE(state->finh);
+        state->finh = NULL;
+    }
+
+    fuse_resolve_wipe(&state->resolve);
+    fuse_resolve_wipe(&state->resolve2);
+
+    pthread_mutex_lock(&priv->sync_mutex);
+    {
+        winds = --state->active_subvol->winds;
+        switched = state->active_subvol->switched;
+    }
+    pthread_mutex_unlock(&priv->sync_mutex);
+
+    if ((winds == 0) && (switched)) {
+        xlator_notify(state->active_subvol, GF_EVENT_PARENT_DOWN,
+                      state->active_subvol, NULL);
+    }
+
+#ifdef DEBUG
+    memset(state, 0x90, sizeof(*state));
+#endif
+    GF_FREE(state);
+    state = NULL;
+}
+
+fuse_state_t *
+get_fuse_state(xlator_t *this, fuse_in_header_t *finh)
+{
+    fuse_state_t *state = NULL;
+    xlator_t *active_subvol = NULL;
+    fuse_private_t *priv = NULL;
+
+    state = (void *)GF_CALLOC(1, sizeof(*state), gf_fuse_mt_fuse_state_t);
+    if (!state)
+        return NULL;
+
+    state->this = THIS;
+    priv = this->private;
+
+    pthread_mutex_lock(&priv->sync_mutex);
+    {
+        while (priv->handle_graph_switch)
+            pthread_cond_wait(&priv->migrate_cond, &priv->sync_mutex);
+        active_subvol = fuse_active_subvol(state->this);
+        active_subvol->winds++;
+    }
+    pthread_mutex_unlock(&priv->sync_mutex);
+
+    state->active_subvol = active_subvol;
+    state->itable = active_subvol->itable;
+
+    state->pool = this->ctx->pool;
+    state->finh = finh;
+    state->this = this;
+
+    LOCK_INIT(&state->lock);
+
+    return state;
+}
+
+void
+frame_fill_groups(call_frame_t *frame)
+{
+#if defined(GF_LINUX_HOST_OS)
+    xlator_t *this = frame->this;
+    fuse_private_t *priv = this->private;
+    char filename[32];
+    char line[4096];
+    char *ptr = NULL;
+    long int id = 0;
+    char *saveptr = NULL;
+    char *endptr = NULL;
+    int ret = 0;
+    int ngroups = 0;
+    gid_t *mygroups = NULL;
+
+    if (priv->resolve_gids) {
+        struct passwd pwent;
+        char mystrs[1024];
+        struct passwd *result;
+
+        if (getpwuid_r(frame->root->uid, &pwent, mystrs, sizeof(mystrs),
+                       &result) != 0) {
+            gf_log(this->name, GF_LOG_ERROR,
+                   "getpwuid_r(%u) "
+                   "failed",
+                   frame->root->uid);
+            return;
+        }
+        if (result == 0) {
+            gf_log(this->name, GF_LOG_ERROR,
+                   "getpwuid_r(%u): "
+                   "no matching record",
+                   frame->root->uid);
+            return;
+        }
+
+        ngroups = gf_getgrouplist(result->pw_name, frame->root->gid, &mygroups);
+        if (ngroups == -1) {
+            gf_log(this->name, GF_LOG_ERROR,
+                   "could not map %s to "
+                   "group list (ngroups %d, max %d)",
+                   result->pw_name, ngroups, GF_MAX_AUX_GROUPS);
+            return;
+        }
+
+        call_stack_set_groups(frame->root, ngroups, &mygroups);
+    } else {
+        FILE *fp = NULL;
+
+        ret = snprintf(filename, sizeof filename, "/proc/%d/status",
+                       frame->root->pid);
+        if (ret >= sizeof filename) {
+            gf_log(this->name, GF_LOG_ERROR, "procfs path exceeds buffer size");
+            goto out;
+        }
+
+        fp = fopen(filename, "r");
+        if (!fp) {
+            gf_log(this->name, GF_LOG_ERROR, "failed to open %s: %s", filename,
+                   strerror(errno));
+            goto out;
+        }
+
+        for (;;) {
+            gf_boolean_t found_groups = _gf_false;
+            int idx = 0;
+
+            if (call_stack_alloc_groups(frame->root, ngroups) != 0) {
+                gf_log(this->name, GF_LOG_ERROR,
+                       "failed to allocate gid buffer");
+                goto out;
+            }
+
+            while ((ptr = fgets(line, sizeof line, fp))) {
+                if (strncmp(ptr, "Groups:", 7) == 0) {
+                    found_groups = _gf_true;
+                    break;
+                }
+            }
+            if (!found_groups) {
+                gf_log(this->name, GF_LOG_ERROR, "cannot find gid list in %s",
+                       filename);
+                break;
+            }
+            ptr = line + 8;
+
+            for (ptr = strtok_r(ptr, " \t\r\n", &saveptr); ptr;
+                 ptr = strtok_r(NULL, " \t\r\n", &saveptr)) {
+                errno = 0;
+                id = strtol(ptr, &endptr, 0);
+                if (errno == ERANGE || !endptr || *endptr) {
+                    gf_log(this->name, GF_LOG_ERROR, "failed to parse %s",
+                           filename);
+                    break;
+                }
+                if (idx < call_stack_groups_capacity(frame->root))
+                    frame->root->groups[idx] = id;
+                idx++;
+                if (idx == GF_MAX_AUX_GROUPS)
+                    break;
+            }
+            if (idx > call_stack_groups_capacity(frame->root)) {
+                ngroups = idx;
+                rewind(fp);
+            } else {
+                frame->root->ngrps = idx;
+                break;
+            }
+        }
+    out:
+        if (fp)
+            fclose(fp);
+    }
+#elif defined(GF_SOLARIS_HOST_OS)
+    char filename[32];
+    char scratch[128];
+    prcred_t *prcred = (prcred_t *)scratch;
+    FILE *fp = NULL;
+    int ret = 0;
+    int ngrps;
+
+    ret = snprintf(filename, sizeof filename, "/proc/%d/cred",
+                   frame->root->pid);
+
+    if (ret < sizeof filename) {
+        fp = fopen(filename, "r");
+        if (fp != NULL) {
+            if (fgets(scratch, sizeof scratch, fp) != NULL) {
+                ngrps = MIN(prcred->pr_ngroups, GF_MAX_AUX_GROUPS);
+                if (call_stack_alloc_groups(frame->root, ngrps) != 0) {
+                    fclose(fp);
+                    return;
+                }
+            }
+            fclose(fp);
+        }
+    }
+#elif defined(CTL_KERN) /* DARWIN and *BSD */
+    /*
+       N.B. CTL_KERN is an enum on Linux. (Meaning, if it's not
+       obvious, that it's not subject to preprocessor directives
+       like '#if defined'.)
+       Unlike Linux, on Mac OS and the BSDs it is a #define. We
+       could test to see that KERN_PROC is defined, but, barring any
+       evidence to the contrary, I think that's overkill.
+       We might also test that GF_DARWIN_HOST_OS is defined, but why
+       limit this to just Mac OS? It's equally valid for the BSDs
+       and we do have people building on NetBSD and FreeBSD.
+    */
+    int name[] = {CTL_KERN, KERN_PROC, KERN_PROC_PID, frame->root->pid};
+    size_t namelen = sizeof name / sizeof name[0];
+    struct kinfo_proc kp;
+    size_t kplen = sizeof(kp);
+    int i, ngroups;
+
+    if (sysctl(name, namelen, &kp, &kplen, NULL, 0) != 0)
+        return;
+    ngroups = MIN(kp.kp_eproc.e_ucred.cr_ngroups, NGROUPS_MAX);
+    if (call_stack_alloc_groups(frame->root, ngroups) != 0)
+        return;
+    for (i = 0; i < ngroups; i++)
+        frame->root->groups[i] = kp.kp_eproc.e_ucred.cr_groups[i];
+    frame->root->ngrps = ngroups;
+#else
+    frame->root->ngrps = 0;
+#endif /* GF_LINUX_HOST_OS */
+}
+
+/*
+ * Get the groups for the PID associated with this frame. If enabled,
+ * use the gid cache to reduce group list collection.
+ */
+static void
+get_groups(fuse_private_t *priv, call_frame_t *frame)
+{
+    int i;
+    const gid_list_t *gl;
+    gid_list_t agl;
+
+    if (!priv || !priv->gid_cache_timeout) {
+        frame_fill_groups(frame);
+        return;
+    }
+
+    if (-1 == priv->gid_cache_timeout) {
+        frame->root->ngrps = 0;
+        return;
+    }
+
+    gl = gid_cache_lookup(&priv->gid_cache, frame->root->pid, frame->root->uid,
+                          frame->root->gid);
+    if (gl) {
+        if (call_stack_alloc_groups(frame->root, gl->gl_count) != 0) {
+            gid_cache_release(&priv->gid_cache, gl);
+            return;
+        }
+        frame->root->ngrps = gl->gl_count;
+        for (i = 0; i < gl->gl_count; i++)
+            frame->root->groups[i] = gl->gl_list[i];
+        gid_cache_release(&priv->gid_cache, gl);
+        return;
+    }
+
+    frame_fill_groups(frame);
+
+    agl.gl_id = frame->root->pid;
+    agl.gl_uid = frame->root->uid;
+    agl.gl_gid = frame->root->gid;
+    agl.gl_count = frame->root->ngrps;
+    agl.gl_list = GF_CALLOC(frame->root->ngrps, sizeof(gid_t),
+                            gf_fuse_mt_gids_t);
+    if (!agl.gl_list)
+        return;
+
+    for (i = 0; i < frame->root->ngrps; i++)
+        agl.gl_list[i] = frame->root->groups[i];
+
+    if (gid_cache_add(&priv->gid_cache, &agl) != 1)
+        GF_FREE(agl.gl_list);
+}
+
+/*
+ * Create a call frame for the request held in @state, seeding the call
+ * stack with the requester's uid/gid/pid, lk_owner and groups.
+ * Returns NULL when create_frame() fails.
+ */
+call_frame_t *
+get_call_frame_for_req(fuse_state_t *state)
+{
+    xlator_t *this = state->this;
+    fuse_private_t *priv = this->private;
+    fuse_in_header_t *req_hdr = state->finh;
+    call_frame_t *new_frame = create_frame(this, state->pool);
+
+    if (new_frame == NULL)
+        return NULL;
+
+    /* identity comes from the FUSE request header when one is attached */
+    if (req_hdr != NULL) {
+        new_frame->root->uid = req_hdr->uid;
+        new_frame->root->gid = req_hdr->gid;
+        new_frame->root->pid = req_hdr->pid;
+        set_lk_owner_from_uint64(&new_frame->root->lk_owner, state->lk_owner);
+    }
+
+    get_groups(priv, new_frame);
+
+    /* done after get_groups(), which looks groups up by the original pid */
+    if (priv != NULL && priv->client_pid_set)
+        new_frame->root->pid = priv->client_pid;
+
+    new_frame->root->type = GF_OP_TYPE_FOP;
+
+    return new_frame;
+}
+
+/* Map a FUSE nodeid to an inode: 1 yields the active subvolume's root inode
+ * (NULL if none is active); any other value is used as the inode's address. */
+inode_t *
+fuse_ino_to_inode(uint64_t ino, xlator_t *fuse)
+{
+    inode_t *inode = NULL;
+    xlator_t *subvol = NULL;
+
+    if (ino != 1) {
+        /* only this path takes a reference; the root is returned as is */
+        inode = (inode_t *)(unsigned long)ino;
+        inode_ref(inode);
+        return inode;
+    }
+    subvol = fuse_active_subvol(fuse);
+    return subvol ? subvol->itable->root : NULL;
+}
+
+/* Map an inode to a FUSE nodeid: 0 for NULL, 1 for root, else its address */
+uint64_t
+inode_to_fuse_nodeid(inode_t *inode)
+{
+    if (inode == NULL)
+        return 0;
+
+    /* the root gfid always maps to nodeid 1 */
+    return __is_root_gfid(inode->gfid) ? 1 : (unsigned long)inode;
+}
+
+/*
+ * Fill @loc (parent, inode, gfids, path, name) from FUSE nodeids.
+ *
+ * With a non-NULL @name the entry is addressed as (@par, @name): the parent
+ * comes from @par, the inode from inode_grep() on the parent, and the path
+ * from inode_path(parent, name).  With a NULL @name the entry is addressed by
+ * @ino alone: the inode comes from @ino, the parent from inode_parent(), and
+ * the path from inode_path(inode, NULL).
+ *
+ * loc->parent and loc->inode are reused when already set, so calling this
+ * more than once on the same loc does not look them up again.
+ *
+ * Returns 0 on success, the non-positive inode_path() result when the path
+ * cannot be built, or -1 when @ino is not the root nodeid and no parent was
+ * found.
+ */
+GF_MUST_CHECK int32_t
+fuse_loc_fill(loc_t *loc, fuse_state_t *state, ino_t ino, ino_t par,
+              const char *name)
+{
+    inode_t *inode = NULL;
+    inode_t *parent = NULL;
+    int32_t ret = -1;
+    char *path = NULL;
+    uuid_t null_gfid = {
+        0,
+    };
+
+    /* resistance against multiple invocation of loc_fill not to get
+       reference leaks via inode_search() */
+
+    if (name) {
+        parent = loc->parent;
+        if (!parent) {
+            parent = fuse_ino_to_inode(par, state->this);
+            loc->parent = parent;
+            if (parent)
+                gf_uuid_copy(loc->pargfid, parent->gfid);
+        }
+
+        inode = loc->inode;
+        if (!inode && parent) {
+            inode = inode_grep(parent->table, parent, name);
+            loc->inode = inode;
+        }
+
+        ret = inode_path(parent, name, &path);
+        if (ret <= 0) {
+            gf_log("glusterfs-fuse", GF_LOG_DEBUG,
+                   "inode_path failed for %s/%s",
+                   (parent) ? uuid_utoa(parent->gfid) : "0", name);
+            goto fail;
+        }
+        loc->path = path;
+    } else {
+        inode = loc->inode;
+        if (!inode) {
+            inode = fuse_ino_to_inode(ino, state->this);
+            loc->inode = inode;
+            if (inode)
+                gf_uuid_copy(loc->gfid, inode->gfid);
+        }
+
+        parent = loc->parent;
+        if (!parent) {
+            parent = inode_parent(inode, null_gfid, NULL);
+            loc->parent = parent;
+            if (parent)
+                gf_uuid_copy(loc->pargfid, parent->gfid);
+        }
+
+        ret = inode_path(inode, NULL, &path);
+        if (ret <= 0) {
+            gf_log("glusterfs-fuse", GF_LOG_DEBUG, "inode_path failed for %s",
+                   (inode) ? uuid_utoa(inode->gfid) : "0");
+            goto fail;
+        }
+        loc->path = path;
+    }
+
+    /* loc->name points at the last path component inside loc->path */
+    if (loc->path) {
+        loc->name = strrchr(loc->path, '/');
+        if (loc->name)
+            loc->name++;
+        else
+            loc->name = "";
+    }
+
+    if ((ino != 1) && (parent == NULL)) {
+        /* name is NULL on the by-nodeid path; never hand NULL to %s */
+        gf_log("fuse-bridge", GF_LOG_DEBUG,
+               "failed to search parent for %" PRId64 "/%s (%" PRId64 ")",
+               (ino_t)par, (name) ? name : "(null)", (ino_t)ino);
+        ret = -1;
+        goto fail;
+    }
+    ret = 0;
+fail:
+    /* this should not happen as inode_path returns -1 when buf is NULL
+       for sure */
+    if (path && !loc->path)
+        GF_FREE(path);
+    return ret;
+}
+
+/* courtesy of folly */
+/*
+ * Copy the fields of an iatt into the fuse_attr structure sent to the kernel.
+ * When enable_ino32 is set the inode number goes through GF_FUSE_SQUASH_INO()
+ * first; otherwise it is copied unchanged.
+ */
+void
+gf_fuse_stat2attr(struct iatt *st, struct fuse_attr *fa,
+                  gf_boolean_t enable_ino32)
+{
+    fa->ino = enable_ino32 ? GF_FUSE_SQUASH_INO(st->ia_ino) : st->ia_ino;
+
+    /* identity, ownership and type */
+    fa->mode = st_mode_from_ia(st->ia_prot, st->ia_type);
+    fa->nlink = st->ia_nlink;
+    fa->uid = st->ia_uid;
+    fa->gid = st->ia_gid;
+    fa->rdev = makedev(ia_major(st->ia_rdev), ia_minor(st->ia_rdev));
+
+    /* size information */
+    fa->size = st->ia_size;
+    fa->blocks = st->ia_blocks;
+#if FUSE_KERNEL_MINOR_VERSION >= 9
+    fa->blksize = st->ia_blksize;
+#endif
+
+    /* timestamps: seconds and nanoseconds */
+    fa->atime = st->ia_atime;
+    fa->atimensec = st->ia_atime_nsec;
+    fa->mtime = st->ia_mtime;
+    fa->mtimensec = st->ia_mtime_nsec;
+    fa->ctime = st->ia_ctime;
+    fa->ctimensec = st->ia_ctime_nsec;
+#ifdef GF_DARWIN_HOST_OS
+    fa->crtime = (uint64_t)-1;
+    fa->crtimensec = (uint32_t)-1;
+    fa->flags = 0;
+#endif
+}
+
+/*
+ * Fill a fuse_dirent from a gf_dirent_t.  The name is copied without a
+ * terminating NUL; its length is recorded in fde->namelen.  The inode number
+ * is squashed with GF_FUSE_SQUASH_INO() when enable_ino32 is set.
+ */
+void
+gf_fuse_fill_dirent(gf_dirent_t *entry, struct fuse_dirent *fde,
+                    gf_boolean_t enable_ino32)
+{
+    fde->type = entry->d_type;
+    fde->off = entry->d_off;
+    fde->ino = enable_ino32 ? GF_FUSE_SQUASH_INO(entry->d_ino) : entry->d_ino;
+
+    fde->namelen = strlen(entry->d_name);
+    (void)memcpy(fde->name, entry->d_name, fde->namelen);
+}
+
+/*
+ * Build a copy of @okey with its namespace prefix replaced by @nns.
+ *
+ * Everything in @okey from the first '.' onwards is appended to @nns; the
+ * result is allocated with GF_MALLOC and stored in *nkey.
+ *
+ * Returns 0 on success, -1 if @okey contains no '.' or the allocation fails
+ * (*nkey is left untouched in both failure cases).
+ */
+static int
+fuse_do_flip_xattr_ns(char *okey, const char *nns, char **nkey)
+{
+    char *suffix = NULL;
+    char *key = NULL;
+    size_t key_len = 0;
+
+    suffix = strchr(okey, '.');
+    GF_ASSERT(suffix);
+    /* NOTE(review): GF_ASSERT may not abort in every build configuration,
+       so do not rely on it to keep a NULL away from strlen()/strcat() */
+    if (!suffix)
+        return -1;
+
+    key_len = strlen(nns) + strlen(suffix);
+    key = GF_MALLOC(key_len + 1, gf_common_mt_char);
+    if (!key)
+        return -1;
+
+    strcpy(key, nns);
+    strcat(key, suffix);
+
+    *nkey = key;
+
+    return 0;
+}
+
+/*
+ * Store a gf_strdup() copy of @okey in *nkey.
+ * Returns 0 on success, -1 when the duplication yields NULL.
+ */
+static int
+fuse_xattr_alloc_default(char *okey, char **nkey)
+{
+    char *copy = gf_strdup(okey);
+
+    *nkey = copy;
+
+    return (copy != NULL) ? 0 : -1;
+}
+
+#define PRIV_XA_NS "trusted"
+#define UNPRIV_XA_NS "system"
+
+/*
+ * Produce in *nkey the xattr key to use for @okey.
+ *
+ * For a mount whose client pid is GF_CLIENT_PID_GSYNCD, keys in the
+ * UNPRIV_XA_NS namespace that name volume-mark or xtime attributes are
+ * rewritten into PRIV_XA_NS.  Every other key is duplicated unchanged.
+ *
+ * Returns the result of the helper that allocated *nkey (0 or -1).
+ */
+int
+fuse_flip_xattr_ns(fuse_private_t *priv, char *okey, char **nkey)
+{
+    gf_boolean_t flip = _gf_false;
+
+    if (priv->client_pid == GF_CLIENT_PID_GSYNCD) {
+        /* valid xattr(s): *xtime, volume-mark* */
+        gf_log("glusterfs-fuse", GF_LOG_DEBUG,
+               "PID: %d, checking xattr(s): "
+               "volume-mark*, *xtime",
+               priv->client_pid);
+        if (!strcmp(okey, UNPRIV_XA_NS ".glusterfs.volume-mark") ||
+            !fnmatch(UNPRIV_XA_NS ".glusterfs.volume-mark.*", okey,
+                     FNM_PERIOD) ||
+            !fnmatch(UNPRIV_XA_NS ".glusterfs.*.xtime", okey, FNM_PERIOD))
+            flip = _gf_true;
+    }
+
+    /* if we cannot match, continue with what we got */
+    if (!flip)
+        return fuse_xattr_alloc_default(okey, nkey);
+
+    gf_log("glusterfs-fuse", GF_LOG_DEBUG,
+           "flipping %s to " PRIV_XA_NS " equivalent", okey);
+
+    return fuse_do_flip_xattr_ns(okey, PRIV_XA_NS, nkey);
+}
+
+/*
+ * Decide whether a setxattr on @key is to be refused for this mount.
+ *
+ * Keys matching "user.*" are always allowed, and so is every key when the
+ * client pid is not GF_CLIENT_PID_GSYNCD.  Otherwise the key is allowed only
+ * if it matches one of the patterns listed below.
+ *
+ * Returns 0 to allow, -1 to disallow; the verdict is logged at DEBUG level.
+ */
+int
+fuse_ignore_xattr_set(fuse_private_t *priv, char *key)
+{
+    /* trusted NS check: patterns a gsyncd client may set */
+    static const char *const gsyncd_allowed[] = {
+        "*.glusterfs.*.xtime",
+        "*.glusterfs.volume-mark",
+        "*.glusterfs.volume-mark.*",
+        "system.posix_acl_access",
+        "glusterfs.gfid.newfile",
+        "*.glusterfs.shard.block-size",
+        "*.glusterfs.shard.file-size",
+    };
+    size_t idx = 0;
+    int ret = 0;
+
+    /* don't mess with user namespace */
+    if ((fnmatch("user.*", key, FNM_PERIOD) != 0) &&
+        (priv->client_pid == GF_CLIENT_PID_GSYNCD)) {
+        ret = -1;
+        for (idx = 0; idx < sizeof(gsyncd_allowed) / sizeof(gsyncd_allowed[0]);
+             idx++) {
+            if (fnmatch(gsyncd_allowed[idx], key, FNM_PERIOD) == 0) {
+                ret = 0;
+                break;
+            }
+        }
+    }
+
+    gf_log("glusterfs-fuse", GF_LOG_DEBUG,
+           "%s setxattr: key [%s], "
+           " client pid [%d]",
+           (ret ? "disallowing" : "allowing"), key, priv->client_pid);
+
+    return ret;
+}
+
+/*
+ * Validate access to the "security.selinux" / "security.capability" xattrs.
+ *
+ * Any other name needs no validation and yields 0.  "security.selinux" yields
+ * 0 only when priv->selinux is set; "security.capability" yields 0 when
+ * either priv->capability or priv->selinux is set.  Otherwise -1.
+ */
+int
+fuse_check_selinux_cap_xattr(fuse_private_t *priv, char *name)
+{
+    if (strcmp(name, "security.selinux") == 0)
+        return (priv->selinux) ? 0 : -1;
+
+    if (strcmp(name, "security.capability") == 0)
+        return ((priv->capability) || (priv->selinux)) ? 0 : -1;
+
+    /* if xattr name is not of interest, no validations needed */
+    return 0;
+}
diff --git a/xlators/mount/fuse/src/fuse-mem-types.h b/xlators/mount/fuse/src/fuse-mem-types.h
new file mode 100644
index 00000000000..4fd8e58c523
--- /dev/null
+++ b/xlators/mount/fuse/src/fuse-mem-types.h
@@ -0,0 +1,31 @@
+/*
+   Copyright (c) 2008-2012 Red Hat, Inc. <http://www.redhat.com>
+   This file is part of GlusterFS.
+
+   This file is licensed to you under your choice of the GNU Lesser
+   General Public License, version 3 or any later version (LGPLv3 or
+   later), or the GNU General Public License, version 2 (GPLv2), in all
+   cases as published by the Free Software Foundation.
+*/
+
+#ifndef __FUSE_MEM_TYPES_H__
+#define __FUSE_MEM_TYPES_H__
+
+#include <glusterfs/mem-types.h>
+
+/*
+ * Memory type identifiers private to the fuse translator.  Numbering starts
+ * right after gf_common_mt_end so the values do not collide with the common
+ * types; gf_fuse_mt_end marks the end of the fuse range.
+ * NOTE(review): presumably these are the type tags passed to the
+ * GF_MALLOC-family allocators for accounting -- confirm against callers.
+ */
+enum gf_fuse_mem_types_ {
+    gf_fuse_mt_iovec = gf_common_mt_end + 1,
+    gf_fuse_mt_fuse_private_t,
+    gf_fuse_mt_char,
+    gf_fuse_mt_iov_base,
+    gf_fuse_mt_fuse_state_t,
+    gf_fuse_mt_fd_ctx_t,
+    gf_fuse_mt_graph_switch_args_t,
+    gf_fuse_mt_gids_t,
+    gf_fuse_mt_invalidate_node_t,
+    gf_fuse_mt_pthread_t,
+    gf_fuse_mt_timed_message_t,
+    gf_fuse_mt_interrupt_record_t,
+    gf_fuse_mt_end /* one past the last fuse memory type */
+};
+#endif
diff --git a/xlators/mount/fuse/src/fuse-resolve.c b/xlators/mount/fuse/src/fuse-resolve.c
new file mode 100644
index 00000000000..6206fd47325
--- /dev/null
+++ b/xlators/mount/fuse/src/fuse-resolve.c
@@ -0,0 +1,683 @@
+/*
+   Copyright (c) 2010-2012 Red Hat, Inc. <http://www.redhat.com>
+   This file is part of GlusterFS.
+
+   This file is licensed to you under your choice of the GNU Lesser
+   General Public License, version 3 or any later version (LGPLv3 or
+   later), or the GNU General Public License, version 2 (GPLv2), in all
+   cases as published by the Free Software Foundation.
+*/
+#include "fuse-bridge.h"
+
+static int
+fuse_resolve_all(fuse_state_t *state);
+
+int
+fuse_resolve_continue(fuse_state_t *state);
+int
+fuse_resolve_entry_simple(fuse_state_t *state);
+int
+fuse_resolve_inode_simple(fuse_state_t *state);
+int
+fuse_migrate_fd(xlator_t *this, fd_t *fd, xlator_t *old_subvol,
+                xlator_t *new_subvol);
+
+fuse_fd_ctx_t *
+fuse_fd_ctx_get(xlator_t *this, fd_t *fd);
+
+/*
+ * Run loc_touchup() on the loc currently being resolved, using the basename
+ * recorded in the matching resolve structure.  Always returns 0.
+ */
+static int
+fuse_resolve_loc_touchup(fuse_state_t *state)
+{
+    loc_touchup(state->loc_now, state->resolve_now->bname);
+
+    return 0;
+}
+
+/*
+ * Callback of the lookup issued by fuse_resolve_entry().
+ *
+ * On failure (op_ret == -1) the error is logged (DEBUG for ENOENT, WARNING
+ * otherwise) and recorded in resolve->op_ret / resolve->op_errno.  On success
+ * the looked-up inode is linked under the parent/name held in resolve_loc and
+ * the linked inode is stored in state->loc_now->inode.  In both cases
+ * resolve_loc is wiped and resolution continues.  Always returns 0.
+ */
+int
+fuse_resolve_entry_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+                       int op_ret, int op_errno, inode_t *inode,
+                       struct iatt *buf, dict_t *xattr, struct iatt *postparent)
+{
+    fuse_state_t *state = NULL;
+    fuse_resolve_t *resolve = NULL;
+    inode_t *link_inode = NULL;
+    loc_t *resolve_loc = NULL;
+    uint64_t ctx_value = LOOKUP_NOT_NEEDED;
+
+    /* fetch everything needed from the frame before it is destroyed */
+    state = frame->root->state;
+    resolve = state->resolve_now;
+    resolve_loc = &resolve->resolve_loc;
+
+    STACK_DESTROY(frame->root);
+
+    if (op_ret == -1) {
+        gf_log(this->name, (op_errno == ENOENT) ? GF_LOG_DEBUG : GF_LOG_WARNING,
+               "%s/%s: failed to resolve (%s)", uuid_utoa(resolve_loc->pargfid),
+               resolve_loc->name, strerror(op_errno));
+        resolve->op_ret = -1;
+        resolve->op_errno = op_errno;
+        goto out;
+    }
+
+    link_inode = inode_link(inode, resolve_loc->parent, resolve_loc->name, buf);
+    /* mark LOOKUP_NOT_NEEDED only when inode_link() handed back the very
+       inode that was passed in */
+    if (link_inode == inode)
+        inode_ctx_set(link_inode, this, &ctx_value);
+    state->loc_now->inode = link_inode;
+
+out:
+    loc_wipe(resolve_loc);
+
+    fuse_resolve_continue(state);
+    return 0;
+}
+
+/*
+ * Resolve the entry (parent, basename) of the loc currently being resolved by
+ * issuing a GF_FOP_LOOKUP through FUSE_FOP; the reply is handled by
+ * fuse_resolve_entry_cbk().  Always returns 0.
+ */
+int
+fuse_resolve_entry(fuse_state_t *state)
+{
+    fuse_resolve_t *resolve = NULL;
+    loc_t *resolve_loc = NULL;
+
+    resolve = state->resolve_now;
+    resolve_loc = &resolve->resolve_loc;
+
+    /* the lookup loc gets its own reference on the already resolved parent */
+    resolve_loc->parent = inode_ref(state->loc_now->parent);
+    gf_uuid_copy(resolve_loc->pargfid, state->loc_now->pargfid);
+    resolve_loc->name = resolve->bname;
+
+    /* reuse an inode already known under parhint/bname, else start with a
+       fresh one */
+    resolve_loc->inode = inode_grep(state->itable, resolve->parhint,
+                                    resolve->bname);
+    if (!resolve_loc->inode) {
+        resolve_loc->inode = inode_new(state->itable);
+    }
+    /* the return value of inode_path() is not checked here */
+    inode_path(resolve_loc->parent, resolve_loc->name,
+               (char **)&resolve_loc->path);
+
+    FUSE_FOP(state, fuse_resolve_entry_cbk, GF_FOP_LOOKUP, lookup, resolve_loc,
+             NULL);
+
+    return 0;
+}
+
+/*
+ * Callback of the gfid-based lookup issued by fuse_resolve_gfid().
+ *
+ * On failure the error is recorded in resolve->op_ret / op_errno and
+ * resolution continues.  On success the inode is linked (without parent or
+ * name).  If resolve->gfid is set, the linked inode is the one being resolved
+ * and goes into loc_now->inode.  Otherwise it is the parent: it goes into
+ * loc_now->parent, and the entry is then taken from the inode table when it
+ * needs no lookup, or resolved through fuse_resolve_entry().
+ * Always returns 0.
+ */
+int
+fuse_resolve_gfid_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+                      int op_ret, int op_errno, inode_t *inode,
+                      struct iatt *buf, dict_t *xattr, struct iatt *postparent)
+{
+    fuse_state_t *state = NULL;
+    fuse_resolve_t *resolve = NULL;
+    inode_t *link_inode = NULL;
+    loc_t *loc_now = NULL;
+    inode_t *tmp_inode = NULL;
+    uint64_t ctx_value = LOOKUP_NOT_NEEDED;
+
+    /* fetch everything needed from the frame before it is destroyed */
+    state = frame->root->state;
+    resolve = state->resolve_now;
+    loc_now = state->loc_now;
+
+    STACK_DESTROY(frame->root);
+
+    if (op_ret == -1) {
+        gf_log(this->name, (op_errno == ENOENT) ? GF_LOG_DEBUG : GF_LOG_WARNING,
+               "%s: failed to resolve (%s)",
+               uuid_utoa(resolve->resolve_loc.gfid), strerror(op_errno));
+        loc_wipe(&resolve->resolve_loc);
+
+        /* resolve->op_ret can have 3 values: 0, -1, -2.
+         * 0 : resolution was successful.
+         * -1: parent inode could not be resolved.
+         * -2: entry (inode corresponding to path) could not be resolved
+         */
+
+        if (gf_uuid_is_null(resolve->gfid)) {
+            resolve->op_ret = -1;
+        } else {
+            resolve->op_ret = -2;
+        }
+
+        resolve->op_errno = op_errno;
+        goto out;
+    }
+
+    link_inode = inode_link(inode, NULL, NULL, buf);
+    /* mark LOOKUP_NOT_NEEDED only when inode_link() handed back the very
+       inode that was passed in */
+    if (link_inode == inode)
+        inode_ctx_set(link_inode, this, &ctx_value);
+
+    loc_wipe(&resolve->resolve_loc);
+
+    if (!link_inode)
+        goto out;
+
+    /* a non-null resolve->gfid means the inode itself was being resolved */
+    if (!gf_uuid_is_null(resolve->gfid)) {
+        loc_now->inode = link_inode;
+        goto out;
+    }
+
+    /* otherwise the lookup was for the parent of the entry */
+    loc_now->parent = link_inode;
+    gf_uuid_copy(loc_now->pargfid, link_inode->gfid);
+
+    tmp_inode = inode_grep(state->itable, link_inode, resolve->bname);
+    if (tmp_inode && (!inode_needs_lookup(tmp_inode, THIS))) {
+        loc_now->inode = tmp_inode;
+        goto out;
+    }
+
+    /* entry unknown or still needing a lookup: drop the grep result and
+       resolve the entry by name */
+    inode_unref(tmp_inode);
+    fuse_resolve_entry(state);
+
+    return 0;
+out:
+    fuse_resolve_continue(state);
+    return 0;
+}
+
+/*
+ * Resolve an inode by gfid through a GF_FOP_LOOKUP; the reply is handled by
+ * fuse_resolve_gfid_cbk().
+ *
+ * The gfid looked up is resolve->pargfid when that is set (parent
+ * resolution), otherwise resolve->gfid.  A failure to build the path is only
+ * logged; the lookup is issued regardless.  Always returns 0.
+ */
+int
+fuse_resolve_gfid(fuse_state_t *state)
+{
+    fuse_resolve_t *resolve = NULL;
+    loc_t *resolve_loc = NULL;
+    int ret = 0;
+
+    resolve = state->resolve_now;
+    resolve_loc = &resolve->resolve_loc;
+
+    if (!gf_uuid_is_null(resolve->pargfid)) {
+        gf_uuid_copy(resolve_loc->gfid, resolve->pargfid);
+    } else if (!gf_uuid_is_null(resolve->gfid)) {
+        gf_uuid_copy(resolve_loc->gfid, resolve->gfid);
+    }
+
+    /* inode may already exist in case we are looking up an inode which was
+       linked through readdirplus */
+    resolve_loc->inode = inode_find(state->itable, resolve_loc->gfid);
+    if (!resolve_loc->inode)
+        resolve_loc->inode = inode_new(state->itable);
+    ret = loc_path(resolve_loc, NULL);
+
+    if (ret <= 0) {
+        /* report the gfid actually being looked up; resolve->gfid is not
+           the one in use when the parent is being resolved */
+        gf_log(THIS->name, GF_LOG_WARNING,
+               "failed to get the path for inode %s",
+               uuid_utoa(resolve_loc->gfid));
+    }
+
+    FUSE_FOP(state, fuse_resolve_gfid_cbk, GF_FOP_LOOKUP, lookup, resolve_loc,
+             NULL);
+
+    return 0;
+}
+
+/*
+ * Try to resolve parent and entry of the current loc from the inode table
+ * alone, without any network lookup.
+ *
+ * If the hinted parent belongs to the current inode table it is used
+ * directly; otherwise the parent is searched by gfid in the current table.
+ * References obtained here are either stored in the loc or released before
+ * returning.
+ *
+ * Return value:
+ * 0 - resolved parent and entry (as necessary)
+ * -1 - resolved parent but not entry (though necessary)
+ * 1 - resolved neither parent nor entry
+ */
+
+int
+fuse_resolve_parent_simple(fuse_state_t *state)
+{
+    fuse_resolve_t *resolve = NULL;
+    loc_t *loc = NULL;
+    inode_t *parent = NULL;
+    inode_t *inode = NULL;
+    xlator_t *this = NULL;
+
+    resolve = state->resolve_now;
+    loc = state->loc_now;
+    this = state->this;
+
+    loc->name = resolve->bname;
+
+    parent = resolve->parhint;
+    if (parent->table == state->itable) {
+        if (inode_needs_lookup(parent, THIS))
+            return 1;
+
+        /* no graph switches since */
+        loc->parent = inode_ref(parent);
+        gf_uuid_copy(loc->pargfid, parent->gfid);
+        loc->inode = inode_grep(state->itable, parent, loc->name);
+
+        /* nodeid for root is 1 and we blindly take the latest graph's
+         * table->root as the parhint and because of this there is
+         * ambiguity whether the entry should have existed or not, and
+         * we took the conservative approach of assuming entry should
+         * have been there even though it need not have (bug #804592).
+         */
+
+        if (loc->inode && inode_needs_lookup(loc->inode, THIS)) {
+            inode_unref(loc->inode);
+            loc->inode = NULL;
+            return -1;
+        }
+
+        if ((loc->inode == NULL) && __is_root_gfid(parent->gfid)) {
+            /* non decisive result - entry missing */
+            return -1;
+        }
+
+        /* decisive result - resolution success */
+        return 0;
+    }
+
+    parent = inode_find(state->itable, resolve->pargfid);
+    if (!parent) {
+        /* non decisive result - parent missing */
+        return 1;
+    }
+    if (inode_needs_lookup(parent, THIS)) {
+        inode_unref(parent);
+        return 1;
+    }
+
+    loc->parent = parent;
+    gf_uuid_copy(loc->pargfid, resolve->pargfid);
+
+    inode = inode_grep(state->itable, parent, loc->name);
+    if (inode && !inode_needs_lookup(inode, this)) {
+        loc->inode = inode;
+        /* decisive result - resolution success */
+        return 0;
+    }
+
+    /* the entry exists but still needs a lookup: it is not stored in the
+       loc, so release the reference inode_grep() returned (the branch above
+       does the same for loc->inode) */
+    if (inode)
+        inode_unref(inode);
+
+    /* non decisive result - entry missing */
+    return -1;
+}
+
+/*
+ * Resolve parent and entry of the current loc.  The table-only attempt
+ * decides the next step: 0 continues resolution, a positive result resolves
+ * the parent by gfid, a negative result resolves the entry by name.
+ * Always returns 0.
+ */
+int
+fuse_resolve_parent(fuse_state_t *state)
+{
+    int outcome = fuse_resolve_parent_simple(state);
+
+    if (outcome == 0)
+        fuse_resolve_continue(state);
+    else if (outcome > 0)
+        fuse_resolve_gfid(state);
+    else
+        fuse_resolve_entry(state);
+
+    return 0;
+}
+
+/*
+ * Try to resolve the inode of the current loc from the inode table alone.
+ *
+ * The hinted inode is used (with a new reference) when it belongs to the
+ * current inode table; otherwise the inode is searched by gfid.  Returns 0
+ * with loc->inode set when an inode needing no lookup was found, 1 otherwise.
+ */
+int
+fuse_resolve_inode_simple(fuse_state_t *state)
+{
+    fuse_resolve_t *resolve = state->resolve_now;
+    inode_t *candidate = resolve->hint;
+
+    if (candidate->table != state->itable)
+        candidate = inode_find(state->itable, resolve->gfid);
+    else
+        inode_ref(candidate);
+
+    if (!candidate)
+        return 1;
+
+    if (inode_needs_lookup(candidate, THIS)) {
+        /* inode was linked through readdirplus */
+        inode_unref(candidate);
+        return 1;
+    }
+
+    state->loc_now->inode = candidate;
+    return 0;
+}
+
+/*
+ * Resolve the inode of the current loc: continue right away when the
+ * table-only attempt succeeded (or returned a non-positive value), otherwise
+ * fall back to a gfid lookup.  Always returns 0.
+ */
+int
+fuse_resolve_inode(fuse_state_t *state)
+{
+    if (fuse_resolve_inode_simple(state) > 0)
+        fuse_resolve_gfid(state);
+    else
+        fuse_resolve_continue(state);
+
+    return 0;
+}
+
+/*
+ * Task body that migrates state->fd to state->active_subvol.
+ *
+ * @data is the fuse_state_t of the request.  The outcome of
+ * fuse_migrate_fd() is recorded in the fd context's migration_failed flag
+ * (under the base fd's lock).  Returns -1 when @data is NULL or the fd has no
+ * fuse context, 0 otherwise -- including when the migration itself failed.
+ */
+int
+fuse_migrate_fd_task(void *data)
+{
+    fuse_state_t *state = data;
+    fuse_fd_ctx_t *ctx = NULL;
+    fd_t *basefd = NULL;
+    fd_t *oldfd = NULL;
+    int migrate_ret = -1;
+
+    if (state == NULL)
+        return -1;
+
+    basefd = state->fd;
+
+    ctx = fuse_fd_ctx_get(state->this, basefd);
+    if (!ctx)
+        return -1;
+
+    /* pick the fd currently in use and hold a reference on it */
+    LOCK(&basefd->lock);
+    {
+        oldfd = ctx->activefd ? ctx->activefd : basefd;
+        fd_ref(oldfd);
+    }
+    UNLOCK(&basefd->lock);
+
+    migrate_ret = fuse_migrate_fd(state->this, basefd,
+                                  oldfd->inode->table->xl,
+                                  state->active_subvol);
+
+    LOCK(&basefd->lock);
+    {
+        ctx->migration_failed = (migrate_ret < 0) ? 1 : 0;
+    }
+    UNLOCK(&basefd->lock);
+
+    fd_unref(oldfd);
+
+    return 0;
+}
+
+/*
+ * Report whether a previous migration of @fd failed: 1 when the fd has a
+ * fuse context whose migration_failed flag is set, 0 otherwise.
+ */
+static int
+fuse_migrate_fd_error(xlator_t *this, fd_t *fd)
+{
+    fuse_fd_ctx_t *ctx = fuse_fd_ctx_get(this, fd);
+
+    if (ctx == NULL)
+        return 0;
+
+    return ctx->migration_failed ? 1 : 0;
+}
+
+/*
+ * Set activefd to the fd currently in use for basefd and take a reference on
+ * it: basefd_ctx->activefd when that is set, basefd itself otherwise.
+ *
+ * The macro reads a variable named basefd_ctx from the caller's scope.  An
+ * active fd other than basefd is referenced while basefd's lock is held;
+ * basefd itself is referenced after the lock is released.
+ *
+ * The expansion is a single statement without a trailing semicolon, so the
+ * caller writes FUSE_FD_GET_ACTIVE_FD(a, b); like a function call.
+ */
+#define FUSE_FD_GET_ACTIVE_FD(activefd, basefd)                                \
+    do {                                                                       \
+        LOCK(&(basefd)->lock);                                                 \
+        {                                                                      \
+            (activefd) = basefd_ctx->activefd ? basefd_ctx->activefd           \
+                                              : (basefd);                      \
+            if ((activefd) != (basefd)) {                                      \
+                fd_ref(activefd);                                              \
+            }                                                                  \
+        }                                                                      \
+        UNLOCK(&(basefd)->lock);                                               \
+                                                                               \
+        if ((activefd) == (basefd)) {                                          \
+            fd_ref(activefd);                                                  \
+        }                                                                      \
+    } while (0)
+
+/*
+ * Resolve the fd of the current resolve structure.
+ *
+ * If the fd in use does not belong to state->active_subvol, a migration task
+ * (fuse_migrate_fd_task) is started and its outcome checked.  A failed or
+ * incomplete migration is reported as op_ret -1 / op_errno EBADF; a base fd
+ * without fuse context is reported as EINVAL.  When the fd in use differs
+ * from the base fd, state->fd is switched to it.  Resolution always
+ * continues afterwards and the function always returns 0.
+ */
+static int
+fuse_resolve_fd(fuse_state_t *state)
+{
+    fuse_resolve_t *resolve = NULL;
+    fd_t *basefd = NULL, *activefd = NULL;
+    xlator_t *active_subvol = NULL, *this = NULL;
+    int ret = 0;
+    char fd_migration_error = 0;
+    fuse_fd_ctx_t *basefd_ctx = NULL;
+
+    resolve = state->resolve_now;
+
+    this = state->this;
+
+    basefd = resolve->fd;
+    basefd_ctx = fuse_fd_ctx_get(this, basefd);
+    if (basefd_ctx == NULL) {
+        gf_log(state->this->name, GF_LOG_WARNING,
+               "fdctx is NULL for basefd (ptr:%p inode-gfid:%s), "
+               "resolver erroring out with errno EINVAL",
+               basefd, uuid_utoa(basefd->inode->gfid));
+        resolve->op_ret = -1;
+        resolve->op_errno = EINVAL;
+        goto resolve_continue;
+    }
+
+    /* takes a reference on activefd; released at resolve_continue */
+    FUSE_FD_GET_ACTIVE_FD(activefd, basefd);
+
+    /* subvolume the fd in use currently belongs to */
+    active_subvol = activefd->inode->table->xl;
+
+    fd_migration_error = fuse_migrate_fd_error(state->this, basefd);
+    if (fd_migration_error) {
+        /* an earlier migration attempt already failed */
+        resolve->op_ret = -1;
+        resolve->op_errno = EBADF;
+    } else if (state->active_subvol != active_subvol) {
+        /* fd lives on a different subvolume than the request's: migrate */
+        ret = synctask_new(state->this->ctx->env, fuse_migrate_fd_task, NULL,
+                           NULL, state);
+
+        fd_migration_error = fuse_migrate_fd_error(state->this, basefd);
+        fd_unref(activefd);
+
+        /* re-read the fd in use, it may have changed during migration */
+        FUSE_FD_GET_ACTIVE_FD(activefd, basefd);
+        active_subvol = activefd->inode->table->xl;
+
+        if ((ret == -1) || fd_migration_error ||
+            (state->active_subvol != active_subvol)) {
+            if (ret == -1) {
+                gf_log(state->this->name, GF_LOG_WARNING,
+                       "starting sync-task to migrate "
+                       "basefd (ptr:%p inode-gfid:%s) failed "
+                       "(old-subvolume:%s-%d "
+                       "new-subvolume:%s-%d)",
+                       basefd, uuid_utoa(basefd->inode->gfid),
+                       active_subvol->name, active_subvol->graph->id,
+                       state->active_subvol->name,
+                       state->active_subvol->graph->id);
+            } else {
+                gf_log(state->this->name, GF_LOG_WARNING,
+                       "fd migration of basefd "
+                       "(ptr:%p inode-gfid:%s) failed "
+                       "(old-subvolume:%s-%d "
+                       "new-subvolume:%s-%d)",
+                       basefd, uuid_utoa(basefd->inode->gfid),
+                       active_subvol->name, active_subvol->graph->id,
+                       state->active_subvol->name,
+                       state->active_subvol->graph->id);
+            }
+
+            resolve->op_ret = -1;
+            resolve->op_errno = EBADF;
+        } else {
+            gf_log(state->this->name, GF_LOG_DEBUG,
+                   "basefd (ptr:%p inode-gfid:%s) migrated "
+                   "successfully in resolver "
+                   "(old-subvolume:%s-%d new-subvolume:%s-%d)",
+                   basefd, uuid_utoa(basefd->inode->gfid), active_subvol->name,
+                   active_subvol->graph->id, state->active_subvol->name,
+                   state->active_subvol->graph->id);
+        }
+    }
+
+    if ((resolve->op_ret == -1) && (resolve->op_errno == EBADF)) {
+        gf_log("fuse-resolve", GF_LOG_WARNING,
+               "migration of basefd (ptr:%p inode-gfid:%s) "
+               "did not complete, failing fop with EBADF "
+               "(old-subvolume:%s-%d new-subvolume:%s-%d)",
+               basefd, uuid_utoa(basefd->inode->gfid), active_subvol->name,
+               active_subvol->graph->id, state->active_subvol->name,
+               state->active_subvol->graph->id);
+    }
+
+    /* hand the fd in use to the request: state->fd gets its own reference
+       and a reference on basefd is dropped in exchange */
+    if (activefd != basefd) {
+        state->fd = fd_ref(activefd);
+        fd_unref(basefd);
+    }
+
+    /* state->active_subvol = active_subvol; */
+
+resolve_continue:
+    /* drop the reference taken by FUSE_FD_GET_ACTIVE_FD, if any */
+    if (activefd != NULL) {
+        fd_unref(activefd);
+    }
+
+    fuse_resolve_continue(state);
+
+    return 0;
+}
+
+/*
+ * Put state->gfid into state->xdata under the key "gfid-req", creating the
+ * dictionary when needed.
+ *
+ * Returns 0 when state->gfid is null (nothing to do), -1 when the dictionary
+ * cannot be created, otherwise the result of dict_set_gfuuid().
+ */
+int
+fuse_gfid_set(fuse_state_t *state)
+{
+    if (gf_uuid_is_null(state->gfid))
+        return 0;
+
+    if (!state->xdata) {
+        state->xdata = dict_new();
+        if (!state->xdata)
+            return -1;
+    }
+
+    return dict_set_gfuuid(state->xdata, "gfid-req", state->gfid, true);
+}
+
+/*
+ * Prepare @resolve for resolving the entry (@par, @name): records the parent
+ * inode as hint, its gfid as pargfid and a copy of @name as bname.
+ *
+ * fuse_ino_to_inode() can return NULL (e.g. for the root nodeid when there is
+ * no active subvolume).  In that case nothing is recorded, the failure is
+ * stored in resolve->op_ret / op_errno (EINVAL) and -1 is returned.
+ * Returns 0 otherwise.
+ * NOTE(review): the result of gf_strdup() is not checked here.
+ */
+int
+fuse_resolve_entry_init(fuse_state_t *state, fuse_resolve_t *resolve, ino_t par,
+                        char *name)
+{
+    inode_t *parent = NULL;
+
+    parent = fuse_ino_to_inode(par, state->this);
+    if (!parent) {
+        gf_log("fuse-resolve", GF_LOG_WARNING,
+               "no inode found for parent nodeid %" PRIu64, (uint64_t)par);
+        resolve->op_ret = -1;
+        resolve->op_errno = EINVAL;
+        return -1;
+    }
+
+    gf_uuid_copy(resolve->pargfid, parent->gfid);
+    resolve->parhint = parent;
+    resolve->bname = gf_strdup(name);
+
+    return 0;
+}
+
+/*
+ * Prepare @resolve for resolving the inode behind nodeid @ino: records the
+ * inode as hint and its gfid as the gfid to resolve.
+ *
+ * fuse_ino_to_inode() can return NULL (e.g. for the root nodeid when there is
+ * no active subvolume).  In that case nothing is recorded, the failure is
+ * stored in resolve->op_ret / op_errno (EINVAL) and -1 is returned.
+ * Returns 0 otherwise.
+ */
+int
+fuse_resolve_inode_init(fuse_state_t *state, fuse_resolve_t *resolve, ino_t ino)
+{
+    inode_t *inode = NULL;
+
+    inode = fuse_ino_to_inode(ino, state->this);
+    if (!inode) {
+        gf_log("fuse-resolve", GF_LOG_WARNING,
+               "no inode found for nodeid %" PRIu64, (uint64_t)ino);
+        resolve->op_ret = -1;
+        resolve->op_errno = EINVAL;
+        return -1;
+    }
+
+    gf_uuid_copy(resolve->gfid, inode->gfid);
+    resolve->hint = inode;
+
+    return 0;
+}
+
+/*
+ * Prepare @resolve for resolving @fd: stores the value returned by fd_ref()
+ * for @fd in resolve->fd.  Always returns 0.
+ */
+int
+fuse_resolve_fd_init(fuse_state_t *state, fuse_resolve_t *resolve, fd_t *fd)
+{
+    fd_t *held = fd_ref(fd);
+
+    resolve->fd = held;
+
+    return 0;
+}
+
+/*
+ * Dispatch resolution of the current resolve structure according to what it
+ * describes: an fd, a parent gfid (entry), or a gfid (inode).  When it
+ * describes none of them there is nothing to resolve and the state machine
+ * simply moves on.  Always returns 0.
+ */
+static int
+fuse_resolve(fuse_state_t *state)
+{
+    fuse_resolve_t *current = state->resolve_now;
+
+    if (current->fd) {
+        fuse_resolve_fd(state);
+        return 0;
+    }
+
+    if (!gf_uuid_is_null(current->pargfid)) {
+        fuse_resolve_parent(state);
+        return 0;
+    }
+
+    if (!gf_uuid_is_null(current->gfid)) {
+        fuse_resolve_inode(state);
+        return 0;
+    }
+
+    fuse_resolve_all(state);
+    return 0;
+}
+
+/* Final step of resolution: resume the fop.  Always returns 0. */
+static int
+fuse_resolve_done(fuse_state_t *state)
+{
+    (void)fuse_fop_resume(state);
+
+    return 0;
+}
+
+/*
+ * This function is called multiple times, once per resolving one location/fd.
+ * state->resolve_now is used to decide which location/fd is to be resolved now
+ */
+static int
+fuse_resolve_all(fuse_state_t *state)
+{
+    fuse_resolve_t *current = state->resolve_now;
+
+    /* both locations handled: hand over to the fop */
+    if (current == &state->resolve2) {
+        fuse_resolve_done(state);
+        return 0;
+    }
+
+    if (current == NULL) {
+        /* first invocation: start with the primary location */
+        state->resolve_now = &state->resolve;
+        state->loc_now = &state->loc;
+    } else if (current == &state->resolve) {
+        /* primary location done: go on with the secondary one */
+        state->resolve_now = &state->resolve2;
+        state->loc_now = &state->loc2;
+    } else {
+        gf_log("fuse-resolve", GF_LOG_ERROR,
+               "Invalid pointer for state->resolve_now");
+        return 0;
+    }
+
+    fuse_resolve(state);
+
+    return 0;
+}
+
+/*
+ * Finish the location just resolved (loc touch-up) and advance the resolve
+ * state machine to the next step.  Always returns 0.
+ */
+int
+fuse_resolve_continue(fuse_state_t *state)
+{
+    (void)fuse_resolve_loc_touchup(state);
+    (void)fuse_resolve_all(state);
+
+    return 0;
+}
+
+/*
+ * Entry point of the resolver: records @fn as the function to resume with,
+ * then starts the resolve state machine.  The result of fuse_gfid_set() is
+ * not checked.  Always returns 0.
+ */
+int
+fuse_resolve_and_resume(fuse_state_t *state, fuse_resume_fn_t fn)
+{
+    (void)fuse_gfid_set(state);
+
+    state->resume_fn = fn;
+
+    (void)fuse_resolve_all(state);
+
+    return 0;
+}
diff --git a/xlators/mount/fuse/utils/Makefile.am b/xlators/mount/fuse/utils/Makefile.am
index 1217c30dafa..fdad27ad103 100644
--- a/xlators/mount/fuse/utils/Makefile.am
+++ b/xlators/mount/fuse/utils/Makefile.am
@@ -1,10 +1,9 @@
-utildir = $(destdir)/sbin
+utildir = @mountutildir@
 
-if GF_DARWIN_HOST_OS
-util_SCRIPTS = mount_glusterfs
-else
+if GF_LINUX_HOST_OS
 util_SCRIPTS = mount.glusterfs
+else
+util_SCRIPTS = mount_glusterfs
 endif
 
-CLEANFILES = 
-
+CLEANFILES =
diff --git a/xlators/mount/fuse/utils/mount.glusterfs.in b/xlators/mount/fuse/utils/mount.glusterfs.in
index 58da509f17b..ac4d94cb743 100755
--- a/xlators/mount/fuse/utils/mount.glusterfs.in
+++ b/xlators/mount/fuse/utils/mount.glusterfs.in
@@ -1,22 +1,19 @@
 #!/bin/sh
-# (C) 2006, 2007, 2008 Z RESEARCH Inc. <http://www.zresearch.com>
-# 
-# This program is free software; you can redistribute it and/or
-# modify it under the terms of the GNU General Public License as
-# published by the Free Software Foundation; either version 2 of
-# the License, or (at your option) any later version.
-#   
-# This program is distributed in the hope that it will be useful,
-# but WITHOUT ANY WARRANTY; without even the implied warranty of
-# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
-# GNU General Public License for more details.
-#   
-# You should have received a copy of the GNU General Public
-# License along with this program; if not, write to the Free
-# Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
-# Boston, MA 02110-1301 USA
-
+#
+# Copyright (c) 2013 Red Hat, Inc. <http://www.redhat.com>
+# Copyright (c) 2015 ungleich GmbH <http://www.ungleich.ch>
+#
+# This file is part of GlusterFS.
+#
+# This file is licensed to you under your choice of the GNU Lesser
+# General Public License, version 3 or any later version (LGPLv3 or
+# later), or the GNU General Public License, version 2 (GPLv2), in all
+# cases as published by the Free Software Foundation.
 
+# Print all arguments as one message on standard error.
+warn ()
+{
+    echo "$@" 1>&2
+}
 
 _init ()
 {
@@ -25,134 +22,815 @@ _init ()
     LOG_CRITICAL=CRITICAL;
     LOG_ERROR=ERROR;
     LOG_WARNING=WARNING;
+    LOG_INFO=INFO
     LOG_DEBUG=DEBUG;
+    LOG_TRACE=TRACE;
 
-    # set default log level to ERROR
-    log_level=$LOG_WARNING;
-}
+    HOST_NAME_MAX=64;
 
-start_glusterfs ()
-{
     prefix="@prefix@";
     exec_prefix=@exec_prefix@;
     cmd_line=$(echo "@sbindir@/glusterfs");
-    
+
+    # check whether getfattr exists
+    export PATH
+    getfattr=$(command -v getfattr 2>/dev/null)
+    if [ $? -ne 0 ]; then
+        warn "WARNING: getfattr not found, certain checks will be skipped.."
+    fi
+
+    mounttab=/proc/mounts
+    uname_s=`uname -s`
+    case ${uname_s} in
+        NetBSD)
+            getinode="stat -f %i"
+            getdev="stat -f %d"
+            lgetinode="${getinode} -L"
+            lgetdev="${getdev} -L"
+            ;;
+        Linux)
+            getinode="stat -c %i"
+            getdev="stat -c %d"
+            lgetinode="${getinode} -L"
+            lgetdev="${getdev} -L"
+            ;;
+    esac
+
+    UPDATEDBCONF=/etc/updatedb.conf
+}
+
+is_valid_hostname ()
+{
+    local server="$1"
+    # Fail if $1 exceeds HOST_NAME_MAX; ${#..} avoids counting echo's newline.
+    length=${#server}
+    if [ ${length} -gt ${HOST_NAME_MAX} ]; then
+        return 1
+    fi
+}
+
+parse_backup_volfile_servers ()
+{
+    local server_list=$1
+    local servers=""
+    local new_servers=""
+
+    servers=$(echo ${server_list} | sed 's/\:/ /g')
+    for server in ${servers}; do
+        is_valid_hostname ${server}
+        if [ $? -eq 1 ]; then
+            continue
+        fi
+        new_servers=$(echo "${new_servers} ${server}")
+    done
+
+    echo ${new_servers}
+}
+
+parse_volfile_servers ()
+{
+    local server_list=$1
+    local servers=""
+    local new_servers=""
+
+    servers=$(echo ${server_list} | sed 's/,/ /g')
+    for server in ${servers}; do
+        is_valid_hostname ${server}
+        if [ $? -eq 1 ]; then
+            continue
+        fi
+        new_servers=$(echo "${new_servers} ${server}")
+    done
+
+    echo ${new_servers}
+}
+
+start_glusterfs ()
+{
     if [ -n "$log_level_str" ]; then
-	case "$log_level_str" in
-	    "ERROR")
-		log_level=$LOG_ERROR;
-		;;
-	    "DEBUG")
-		log_level=$LOG_DEBUG;
-		;;
-	    "CRITICAL")
-		log_level=$LOG_CRITICAL;
-		;;
-	    "WARNING")
-		log_level=$LOG_WARNING;
-		;;
-	    "NONE")
-		log_level=$LOG_NONE;
-		;;
-	    *)
-		echo "invalid log level $log_level_str, using ERROR";
-		log_level=$LOG_ERROR;
-		;;
-	esac
-    fi
-    cmd_line=$(echo "$cmd_line --log-level=$log_level");
-    
-    if [ -n "$log_file" ]; then
-	cmd_line=$(echo "$cmd_line --log-file=$log_file");
+        case "$( echo $log_level_str | awk '{print toupper($0)}')" in
+            "ERROR")
+                log_level=$LOG_ERROR;
+                ;;
+            "INFO")
+                log_level=$LOG_INFO;
+                ;;
+            "DEBUG")
+                log_level=$LOG_DEBUG;
+                ;;
+            "CRITICAL")
+                log_level=$LOG_CRITICAL;
+                ;;
+            "WARNING")
+                log_level=$LOG_WARNING;
+                ;;
+            "TRACE")
+                log_level=$LOG_TRACE;
+                ;;
+            "NONE")
+                log_level=$LOG_NONE;
+                ;;
+            *)
+                warn "invalid log level $log_level_str, using INFO";
+                log_level=$LOG_INFO;
+                ;;
+        esac
+    fi
+
+    # options without values start here
+    if [ -n "$read_only" ]; then
+        cmd_line=$(echo "$cmd_line --read-only");
+    fi
+
+    if [ -n "$acl" ]; then
+        cmd_line=$(echo "$cmd_line --acl");
+    fi
+
+    if [ -n "$selinux" ]; then
+         cmd_line=$(echo "$cmd_line --selinux");
+    fi
+
+    if [ -n "$enable_ino32" ]; then
+        cmd_line=$(echo "$cmd_line --enable-ino32");
+    fi
+
+    if [ -n "$worm" ]; then
+        cmd_line=$(echo "$cmd_line --worm");
+    fi
+    if [ -n "$volfile_max_fetch_attempts" ]; then
+       cmd_line=$(echo "$cmd_line --volfile-max-fetch-attempts=$volfile_max_fetch_attempts")
     fi
 
     if [ -n "$volfile_check" ]; then
-	cmd_line=$(echo "$cmd_line --volfile-check");
+        cmd_line=$(echo "$cmd_line --volfile-check");
+    fi
+
+    if [ -n "$mem_accounting" ]; then
+        cmd_line=$(echo "$cmd_line --mem-accounting");
+    fi
+
+    if [ -n "$aux_gfid_mount" ]; then
+        cmd_line=$(echo "$cmd_line --aux-gfid-mount");
+    fi
+
+    if [ -n "$resolve_gids" ]; then
+        cmd_line=$(echo "$cmd_line --resolve-gids");
+    fi
+
+    if [ -n "$no_root_squash" ]; then
+        cmd_line=$(echo "$cmd_line --no-root-squash");
+    fi
+
+    if [ -n "$thin_client" ]; then
+        cmd_line=$(echo "$cmd_line --thin-client");
+    fi
+
+    if [ -n "$global_threading" ]; then
+        cmd_line=$(echo "$cmd_line --global-threading");
+    fi
+
+#options with optional values start here
+    if [ -n "$fopen_keep_cache" ]; then
+        cmd_line=$(echo "$cmd_line --fopen-keep-cache=$fopen_keep_cache");
+    fi
+
+#options with mandatory values start here
+    if [ -n "$halo_max_latency" ]; then
+      cmd_line=$(echo "$cmd_line --xlator-option \
+ *replicate*.halo-max-latency=$halo_max_latency");
+    fi
+
+    if [ -n "$halo_max_replicas" ]; then
+      cmd_line=$(echo "$cmd_line --xlator-option \
+ *replicate*.halo-max-replicas=$halo_max_replicas");
+    fi
+
+    if [ -n "$halo_min_replicas" ]; then
+      cmd_line=$(echo "$cmd_line --xlator-option \
+ *replicate*.halo-min-replicas=$halo_min_replicas");
+    fi
+
+    if [ -n "$log_level" ]; then
+        cmd_line=$(echo "$cmd_line --log-level=$log_level");
+    fi
+
+    if [ -n "$log_file" ]; then
+        cmd_line=$(echo "$cmd_line --log-file=$log_file");
     fi
 
     if [ -n "$direct_io_mode" ]; then
-	cmd_line=$(echo "$cmd_line --direct-io-mode=$direct_io_mode");
+        cmd_line=$(echo "$cmd_line --direct-io-mode=$direct_io_mode");
     fi
-    
-    if [ -z "$volfile_loc" ]; then
-	if [ -n "$transport" ]; then 
-	    cmd_line=$(echo "$cmd_line \
---volfile-server=$server_ip \
---volfile-server-port=$server_port \
---volfile-server-transport=$transport");
-	else
-	    cmd_line=$(echo "$cmd_line \
---volfile-server=$server_ip \
---volfile-server-port=$server_port");
-	fi
-    else
-	cmd_line=$(echo "$cmd_line --volfile=$volfile_loc");
+
+    if [ -n "$use_readdirp" ]; then
+        cmd_line=$(echo "$cmd_line --use-readdirp=$use_readdirp");
+    fi
+
+    if [ -n "$event_history" ]; then
+        cmd_line=$(echo "$cmd_line --event-history=$event_history");
+    fi
+
+    if [ -n "$reader_thread_count" ]; then
+        cmd_line=$(echo "$cmd_line --reader-thread-count=$reader_thread_count");
+    fi
+
+    if [ -n "$fuse_auto_invalidation" ]; then
+        cmd_line=$(echo "$cmd_line --auto-invalidation=$fuse_auto_invalidation");
     fi
 
     if [ -n "$volume_name" ]; then
         cmd_line=$(echo "$cmd_line --volume-name=$volume_name");
     fi
-    
-    if [ -n "$volume_id" ]; then
-        cmd_line=$(echo "$cmd_line --volfile-id=$volume_id");
+
+    if [ -n "$attribute_timeout" ]; then
+        cmd_line=$(echo "$cmd_line --attribute-timeout=$attribute_timeout");
+    fi
+
+    if [ -n "$entry_timeout" ]; then
+        cmd_line=$(echo "$cmd_line --entry-timeout=$entry_timeout");
+    fi
+
+    if [ -n "$negative_timeout" ]; then
+        cmd_line=$(echo "$cmd_line --negative-timeout=$negative_timeout");
+    fi
+
+    if [ -n "$gid_timeout" ]; then
+        cmd_line=$(echo "$cmd_line --gid-timeout=$gid_timeout");
+    fi
+
+    if [ -n "$lru_limit" ]; then
+        cmd_line=$(echo "$cmd_line --lru-limit=$lru_limit");
+    fi
+
+    if [ -n "$invalidate_limit" ]; then
+        cmd_line=$(echo "$cmd_line --invalidate-limit=$invalidate_limit");
+    fi
+
+    if [ -n "$bg_qlen" ]; then
+        cmd_line=$(echo "$cmd_line --background-qlen=$bg_qlen");
+    fi
+
+    if [ -n "$cong_threshold" ]; then
+        cmd_line=$(echo "$cmd_line --congestion-threshold=$cong_threshold");
+    fi
+
+    if [ -n "$oom_score_adj" ]; then
+        cmd_line=$(echo "$cmd_line --oom-score-adj=$oom_score_adj");
+    fi
+
+    if [ -n "$fuse_mountopts" ]; then
+        cmd_line=$(echo "$cmd_line --fuse-mountopts=$fuse_mountopts");
+    fi
+
+    if [ -n "$xlator_option" ]; then
+        cmd_line=$(echo "$cmd_line --xlator-option=$xlator_option");
+    fi
+
+    if [ -n "$kernel_writeback_cache" ]; then
+        cmd_line=$(echo "$cmd_line --kernel-writeback-cache=$kernel_writeback_cache");
+    fi
+
+    if [ -n "$attr_times_granularity" ]; then
+        cmd_line=$(echo "$cmd_line --attr-times-granularity=$attr_times_granularity");
+    fi
+
+    if [ -n "$dump_fuse" ]; then
+        cmd_line=$(echo "$cmd_line --dump-fuse=$dump_fuse");
+    fi
+
+    if [ -n "$fuse_flush_handle_interrupt" ]; then
+        cmd_line=$(echo "$cmd_line --fuse-flush-handle-interrupt=$fuse_flush_handle_interrupt");
+    fi
+
+    if [ -n "$process_name" ]; then
+        cmd_line=$(echo "$cmd_line --process-name fuse.$process_name");
+    else
+        cmd_line=$(echo "$cmd_line --process-name fuse");
+    fi
+
+    # if transport type is specified, we have to append it to
+    # volume name, so that it fetches the right client vol file
+
+    if [ -z "$volfile_loc" ]; then
+        if  [ -n "$server_ip" ]; then
+
+            servers=$(parse_volfile_servers ${server_ip});
+            if [ -n "$servers" ]; then
+                for i in $(echo ${servers}); do
+                    cmd_line=$(echo "$cmd_line --volfile-server=$i");
+                done
+            else
+                warn "ERROR: No valid servers found on command line.. exiting"
+                print_usage
+                exit 1
+            fi
+
+            if [ -n "$backupvolfile_server" ]; then
+                if [ -z "$backup_volfile_servers" ]; then
+                    is_valid_hostname ${backupvolfile_server};
+                    if [ $? -eq 1 ]; then
+                        warn "ERROR: Invalid backup server specified.. exiting"
+                        exit 1
+                    fi
+                    cmd_line=$(echo "$cmd_line --volfile-server=$backupvolfile_server");
+                fi
+            fi
+
+            if [ -n "$backup_volfile_servers" ]; then
+                backup_servers=$(parse_backup_volfile_servers ${backup_volfile_servers})
+                for i in $(echo ${backup_servers}); do
+                    cmd_line=$(echo "$cmd_line --volfile-server=$i");
+                done
+            fi
+
+            if [ -n "$server_port" ]; then
+                cmd_line=$(echo "$cmd_line --volfile-server-port=$server_port");
+            fi
+
+            if [ -n "$volume_id" ]; then
+                if [ -n "$transport" ]; then
+                    volume_id="$volume_id.$transport";
+                    cmd_line=$(echo "$cmd_line --volfile-server-transport=$transport");
+                fi
+                cmd_line=$(echo "$cmd_line --volfile-id=$volume_id");
+            fi
+        fi
+    else
+        cmd_line=$(echo "$cmd_line --volfile=$volfile_loc");
+    fi
+
+    if [ -n "$fuse_mountopts" ]; then
+        cmd_line=$(echo "$cmd_line --fuse-mountopts=$fuse_mountopts");
+    fi
+
+    if [ -n "$subdir_mount" ]; then
+        cmd_line=$(echo "$cmd_line --subdir-mount=/$subdir_mount");
+    fi
+
+    if [ -n "$fuse_dev_eperm_ratelimit_ns" ]; then
+        cmd_line=$(echo "$cmd_line --fuse-dev-eperm-ratelimit-ns=$fuse_dev_eperm_ratelimit_ns");
     fi
 
     cmd_line=$(echo "$cmd_line $mount_point");
-    exec $cmd_line;
+    $cmd_line;
+    if [ $? -ne 0 ]; then
+        # If this is true, then glusterfs process returned error without
+        # getting daemonized. We have made sure the logs are posted to
+        # 'stderr', so no need to point them to logfile.
+        warn "Mounting glusterfs on $mount_point failed."
+        exit 1;
+    fi
+
+
+    inode=$( ${getinode} $mount_point 2>/dev/null);
+    # this is required if the stat returns error
+    if [ $? -ne 0 ]; then
+        # At this time, glusterfs got daemonized, and then later exited.
+        # These failures are only logged in log file.
+        warn "Mount failed. Check the log file ${log_file} for more details."
+        umount $mount_point > /dev/null 2>&1;
+        exit 1;
+    fi
 }
 
+print_usage ()
+{
+cat << EOF
+Usage: $0 <server>:<volume/subdir> <mountpoint> -o<options>
+Options:
+man 8 $(basename $0)
+To display the version number of the mount helper: $0 -V
+EOF
+}
 
-main ()
+# Refuse to mount on "/", on a brick, or on a parent directory of a local brick
+check_recursive_mount ()
 {
-    options=$(echo "$@" | sed -n 's/.*\-o[ ]*\([^ ]*\).*/\1/p');
-    new_log_level=$(echo "$options" | sed -n 's/.*log-level=\([^,]*\).*/\1/p');
-    
-    [ -n "$new_log_level" ] && {
-	log_level_str="$new_log_level";
-    }
-    log_file=$(echo "$options" | sed -n 's/.*log-file=\([^,]*\).*/\1/p');
+    if [ "$1" = "/" ]; then
+        warn "Cannot mount over root";
+        exit 2;
+    fi
 
-    transport=$(echo "$options" | sed -n 's/.*transport=\([^,]*\).*/\1/p');
+    # GFID check first: a directory carrying trusted.gfid is itself a brick
+    # remove trailing / from mount point
+    mnt_dir=${1%/};
 
-    direct_io_mode=$(echo "$options" | sed -n 's/.*direct-io-mode=\([^,]*\).*/\1/p');
+    if [ -n "${getfattr}" ]; then
+        ${getfattr} -n trusted.gfid "$mnt_dir" 2>/dev/null | grep -iq "trusted.gfid=";
+        if [ $? -eq 0 ]; then
+            warn "ERROR: $mnt_dir is in use as a brick of a gluster volume";
+            exit 2;
+        fi
+    fi
 
-    volume_name=$(echo "$options" | sed -n 's/.*volume-name=\([^,]*\).*/\1/p');
+    # check if the mount point is a brick's parent directory
+    GLUSTERD_WORKDIR="@GLUSTERD_WORKDIR@";
 
-    volume_id=$(echo "$options" | sed -n 's/.*volume-id=\([^,]*\).*/\1/p');
+    ls -L "${GLUSTERD_WORKDIR}"/vols/*/bricks/* > /dev/null 2>&1;
+    if [ $? -ne 0 ]; then
+        return;
+    fi
 
-    volfile_check=$(echo "$options" | sed -n 's/.*volfile-check=\([^,]*\).*/\1/p');
+    brick_path=`grep ^path "$GLUSTERD_WORKDIR"/vols/*/bricks/* 2>/dev/null | cut -d "=" -f 2`;
+    root_inode=`${lgetinode} /`;
+    root_dev=`${lgetdev} /`;
+    mnt_inode=`${lgetinode} "$mnt_dir"`;
+    mnt_dev=`${lgetdev} "$mnt_dir"`;
+    for brick in $brick_path; do
+        # unquoted list: one brick path per iteration; skip paths not present locally
+        ls "$brick" > /dev/null 2>&1;
+        if [ $? -ne 0 ]; then
+            continue;
+        fi
+
+        if [ -n "${getfattr}" ]; then
+            ${getfattr} -n trusted.gfid "$brick" 2>/dev/null | grep -iq "trusted.gfid=";
+            if [ $? -eq 0 ]; then
+                # brick is local: walk up via ".." until the root directory is reached
+                while [ 1 ]; do
+                    tmp_brick="$brick";
+                    brick="$brick"/..;
+                    brick_dev=`${lgetdev} "$brick"`;
+                    brick_inode=`${lgetinode} "$brick"`;
+                    if [ "$mnt_inode" -eq "$brick_inode" \
+                        -a "$mnt_dev" -eq "$brick_dev" ]; then
+                        warn "ERROR: ${mnt_dir} is a parent of the brick ${tmp_brick}";
+                        exit 2;
+                    fi
+                    [ "$root_inode" -ne "$brick_inode" \
+                        -o "$root_dev" -ne "$brick_dev" ] || break;
+                done;
+            else
+                continue;
+            fi
+        else
+            continue;
+        fi
+    done;
+}
+
+with_options()
+{
+    local key="$1"
+    local value="$2"
+
+    # Handle key=value options: record each value in its shell variable.
+    case "$key" in
+        "log-level")
+            log_level_str=$value
+            ;;
+        "log-file")
+            log_file=$value
+            ;;
+        "transport")
+            transport=$value
+            ;;
+        "direct-io-mode")
+            direct_io_mode=$value
+            ;;
+        "volume-name")
+            volume_name=$value
+            ;;
+        "volume-id")
+            volume_id=$value
+            ;;
+        "subdir-mount")
+            subdir_mount=$value
+            ;;
+        "volfile-check")
+            volfile_check=$value
+            ;;
+        "server-port")
+            server_port=$value
+            ;;
+        "attribute-timeout")
+            attribute_timeout=$value
+            ;;
+        "entry-timeout")
+            entry_timeout=$value
+            ;;
+        "negative-timeout")
+            negative_timeout=$value
+            ;;
+        "gid-timeout")
+            gid_timeout=$value
+            ;;
+        "lru-limit")
+            lru_limit=$value
+            ;;
+        "invalidate-limit")
+            invalidate_limit=$value
+            ;;
+        "background-qlen")
+            bg_qlen=$value
+            ;;
+        "backup-volfile-servers")
+            backup_volfile_servers=$value
+            ;;
+        "backupvolfile-server")
+            backupvolfile_server=$value
+            ;;
+        "fetch-attempts")
+            volfile_max_fetch_attempts=$value
+            ;;
+        "congestion-threshold")
+            cong_threshold=$value
+            ;;
+        "oom-score-adj")
+            oom_score_adj=$value
+            ;;
+        "xlator-option")
+            xlator_option=$value
+            ;;
+        "fuse-mountopts")
+            fuse_mountopts=$value
+            ;;
+        "use-readdirp")
+            use_readdirp=$value
+            ;;
+        "event-history")
+            event_history=$value
+            ;;
+        "reader-thread-count")
+            reader_thread_count=$value
+            ;;
+        "auto-invalidation")
+            fuse_auto_invalidation=$value
+            ;;
+        "no-root-squash")
+            if [ "$value" = "yes" ] ||
+                [ "$value" = "on" ] ||
+                [ "$value" = "enable" ] ||
+                [ "$value" = "true" ] ; then
+                no_root_squash=1;
+            fi ;;
+        "root-squash")
+            if [ "$value" = "no" ] ||
+                [ "$value" = "off" ] ||
+                [ "$value" = "disable" ] ||
+                [ "$value" = "false" ] ; then
+                no_root_squash=1;
+            fi ;;
+        "kernel-writeback-cache")
+            kernel_writeback_cache=$value
+            ;;
+        "attr-times-granularity")
+            attr_times_granularity=$value
+            ;;
+        "dump-fuse")
+            dump_fuse=$value
+            ;;
+        "fuse-flush-handle-interrupt")
+            fuse_flush_handle_interrupt=$value
+            ;;
+        "fuse-dev-eperm-ratelimit-ns")
+            fuse_dev_eperm_ratelimit_ns=$value
+            ;;
+        "context"|"fscontext"|"defcontext"|"rootcontext")
+            # standard SElinux mount options to pass to the kernel
+            [ -z "$fuse_mountopts" ] || fuse_mountopts="$fuse_mountopts,"
+            fuse_mountopts="${fuse_mountopts}$key=\"$value\""
+            ;;
+        "halo-max-latency")
+            halo_max_latency=$value
+            ;;
+        "halo-max-replicas")
+            halo_max_replicas=$value
+            ;;
+        "halo-min-replicas")
+            halo_min_replicas=$value
+            ;;
+        "process-name")
+            process_name=$value
+            ;;
+        # Optional value; stored without a leading "=" (start_glusterfs adds it)
+        "fopen-keep-cache")
+            fopen_keep_cache="$value"
+            ;;
+        x-*)
+            # comments or userspace application-specific options, drop them
+            ;;
+        *)
+            warn "Invalid option: $key"
+            exit 1
+            ;;
+    esac
+}
+
+without_options()
+{
+    local option=$1
+    # Handle options without values.
+    case "$option" in
+        "ro")
+            read_only=1
+            ;;
+        "acl")
+            acl=1
+            ;;
+        "selinux")
+            selinux=1
+            ;;
+        "worm")
+            worm=1
+            ;;
+        "enable-ino32")
+            enable_ino32=1
+            ;;
+        "mem-accounting")
+            mem_accounting=1
+            ;;
+        "aux-gfid-mount")
+            if [ ${uname_s} = "Linux" ]; then
+                aux_gfid_mount=1
+            fi
+            ;;
+        "thin-client")
+            thin_client=1
+            ;;
+        "resolve-gids")
+            resolve_gids=1
+            ;;
+         # "mount -t glusterfs" sends this, but it's useless.
+        "rw")
+            ;;
+        "global-threading")
+            global_threading=1
+            ;;
+         # TODO: not sure how to handle this yet
+        "async"|"sync"|"dirsync"|\
+        "mand"|"nomand"|\
+        "silent"|"loud"|\
+        "iversion"|"noiversion"|\
+        "nofail")
+            warn "mount option '${option}' is not handled (yet?)"
+            ;;
+         # standard mount options to pass to the kernel
+        "atime"|"noatime"|"diratime"|"nodiratime"|\
+        "relatime"|"norelatime"|\
+        "strictatime"|"nostrictatime"|"lazyatime"|"nolazyatime"|\
+        "dev"|"nodev"|"exec"|"noexec"|"suid"|"nosuid"|"auto_unmount")
+            [ -z "$fuse_mountopts" ] || fuse_mountopts="$fuse_mountopts,"
+            fuse_mountopts="${fuse_mountopts}${option}"
+            ;;
+         # these ones are interpreted during system initialization
+        "auto"|"noauto")
+            ;;
+        "_netdev")
+            ;;
+        # Values that are optional
+        "fopen-keep-cache")
+            fopen_keep_cache="true"
+            ;;
+        x-*)
+            # comments or userspace application-specific options, drop them
+            ;;
+        *)
+            warn "Invalid option $option";
+            exit 1
+            ;;
+    esac
+}
+
+parse_options()
+{
+    local optarg=${1}
+    for pair in $(echo ${optarg}|sed 's/,/ /g'); do
+        key=$(echo "$pair" | cut -f1 -d'=');
+        value=$(echo "$pair" | cut -f2- -d'=');
+        if [ "$key" = "$value" ]; then
+            without_options $pair;
+        else
+            with_options $key $value;
+        fi
+    done
+}
+
+update_updatedb()
+{
+    # Append fuse.glusterfs to PRUNEFS variable in updatedb.conf(5).
+    # updatedb(8) should not index files under GlusterFS, indexing
+    # GlusterFS is not necessary and should be avoided.
+    # The config is replaced only if sed succeeded (never by a truncated copy).
+    test -f $UPDATEDBCONF && {
+        if ! grep -q 'glusterfs' $UPDATEDBCONF; then
+            sed 's/\(PRUNEFS.*\)"/\1 fuse.glusterfs"/' $UPDATEDBCONF \
+                > ${UPDATEDBCONF}.bak &&
+            mv -f ${UPDATEDBCONF}.bak $UPDATEDBCONF
+        fi
+    }
+}
+
+main ()
+{
+    if [ "x${uname_s}" = "xLinux" -a $# -ge 2 ] ; then
+        volfile_loc=$1
+        mount_point=$2
+
+        ## `mount` specifies options as a last argument
+        shift 2;
+    fi
+    while getopts "Vo:hns" opt; do
+        case "${opt}" in
+            o)
+                parse_options ${OPTARG};
+		shift 2;
+                ;;
+            n)
+                ;;
+            s)
+                # accept+ignore sloppy mount, passed by autofs
+                ;;
+            V)
+                ${cmd_line} -V;
+                exit 0;
+                ;;
+            h)
+                print_usage;
+                exit 0;
+                ;;
+            ?)
+                print_usage;
+                exit 0;
+                ;;
+        esac
+    done
+
+    if [ "x${uname_s}" = "xNetBSD" ] ; then
+        volfile_loc=$1
+        mount_point=$2
+    fi
 
-    volfile_loc="$1";
-    
     [ -r "$volfile_loc" ] || {
-	server_ip=$(echo "$volfile_loc" | sed -n 's/\([^\:]*\).*/\1/p');
-	server_port=$(echo "$volfile_loc" | sed -n 's/.*:\([^ ]*\).*/\1/p');
-	[ -n "$server_port" ] || {
-	    server_port="6996";
-	}
+        # '%' included to support ipv6 link local addresses
+        server_ip=$(echo "$volfile_loc" | sed -n 's/\([a-zA-Z0-9:%,.\-]*\):.*/\1/p');
+        volume_str=$(echo "$volfile_loc" | sed -n 's/.*:\([^ ]*\).*/\1/p');
+        [ -n "$volume_str" ] && {
+            volume_id=$volume_str
+            volume_str_temp=$volume_str
+            first_char=$(echo "$volume_str" | cut -c 1)
+            [ ${first_char} = '/' ] && {
+                volume_str_temp=$(echo "$volume_str" | cut -c 2-)
+            }
+            volume_id_temp=$(echo "$volume_str_temp" | cut -f1 -d '/');
+            [ $(echo $volume_str_temp | grep -c "/") -eq 1 ] &&
+                    [ "$volume_id_temp" != "snaps" ]  && {
+                volume_id=$volume_id_temp;
+                [ ${first_char} = '/' ] && volume_id=/$volume_id;
+                subdir_mount=$(echo "$volume_str_temp" | cut -f2- -d '/');
+            }
+        }
+        volfile_loc="";
+        [ -z "$volume_id" -o -z "$server_ip" ] && {
+            cat <<EOF >&2
+ERROR: Server name/volume name unspecified cannot proceed further..
+Please specify correct format
+Usage:
+man 8 $0
+EOF
+            exit 1;
+        }
+    }
+
+    grep_ret=$(echo ${mount_point} | grep '^\-o');
+    [ "x" != "x${grep_ret}" ] && {
+        cat <<EOF >&2
+ERROR: -o options cannot be specified in either first two arguments..
+Please specify correct style
+Usage:
+man 8 $0
+EOF
+        exit 1;
+    }
 
-	volfile_loc="";
+    # The mount point must have been given and must be an existing directory;
+    # anything else is reported as an error below.
+    [ -z "$mount_point" -o ! -d "$mount_point" ] && {
+        cat <<EOF >&2
+ERROR: Mount point does not exist
+Please specify a mount point
+Usage:
+man 8 $0
+EOF
+        exit 1;
     }
-    new_fs_options=$(echo "$options" | sed -e 's/[,]*log-file=[^,]*//' \
-	                                   -e 's/[,]*log-level=[^,]*//' \
-	                                   -e 's/[,]*volume-name=[^,]*//' \
-	                                   -e 's/[,]*direct-io-mode=[^,]*//' \
-	                                   -e 's/[,]*volfile-check=[^,]*//' \
-	                                   -e 's/[,]*transport=[^,]*//' \
-	                                   -e 's/[,]*volume-id=[^,]*//');
-    # following line is product of love towards sed
-    # $2=$(echo "$@" | sed -n 's/[^ ]* \([^ ]*\).*/\1/p');
-    
-    mount_point="$2";
 
     # Simple check to avoid multiple identical mounts
-    if grep -q "glusterfs $mount_point fuse" /etc/mtab; then
-        echo "$0: according to mtab, GlusterFS is already mounted on $mount_point"
-        exit 1
+    if grep -q "[[:space:]+]${mount_point}[[:space:]+]fuse.glusterfs" $mounttab; then
+        warn "$0: according to mtab, GlusterFS is already mounted on" \
+             "$mount_point"
+        exit 32;
     fi
-    
-    fs_options=$(echo "$fs_options,$new_fs_options");
-    
+
+    #Snapshot volumes are mounted read only
+    case $volume_id in
+        /snaps/* ) read_only=1
+    esac
+
+    check_recursive_mount "$mount_point";
+
+    update_updatedb;
+
     start_glusterfs;
 }
 
diff --git a/xlators/mount/fuse/utils/mount_glusterfs.in b/xlators/mount/fuse/utils/mount_glusterfs.in
index b064e1eadf4..3a5feb606d7 100755
--- a/xlators/mount/fuse/utils/mount_glusterfs.in
+++ b/xlators/mount/fuse/utils/mount_glusterfs.in
@@ -1,190 +1,554 @@
 #!/bin/sh
-# (C) 2008 Z RESEARCH Inc. <http://www.zresearch.com>
-# 
+# (C) 2014 Red Hat Inc. <http://www.redhat.com>
+# (C) 2015 ungleich GmbH <http://www.ungleich.ch>
+#
 # This program is free software; you can redistribute it and/or
 # modify it under the terms of the GNU General Public License as
 # published by the Free Software Foundation; either version 2 of
 # the License, or (at your option) any later version.
-#   
+#
 # This program is distributed in the hope that it will be useful,
 # but WITHOUT ANY WARRANTY; without even the implied warranty of
 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 # GNU General Public License for more details.
-#   
+#
 # You should have received a copy of the GNU General Public
 # License along with this program; if not, write to the Free
 # Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
 # Boston, MA 02110-1301 USA
 
 
+warn ()
+{
+   echo "$@" >&2
+}
 
 _init ()
 {
+
     # log level definitions
     LOG_NONE=NONE;
     LOG_CRITICAL=CRITICAL;
     LOG_ERROR=ERROR;
     LOG_WARNING=WARNING;
+    LOG_INFO=INFO
     LOG_DEBUG=DEBUG;
+    LOG_TRACE=TRACE;
 
-    # set default log level to ERROR
-    log_level=$LOG_WARNING;
-}
+    HOST_NAME_MAX=64;
 
-start_glusterfs ()
-{
     prefix="@prefix@";
     exec_prefix=@exec_prefix@;
     cmd_line=$(echo "@sbindir@/glusterfs");
-    
+
+    alias lsL='ls -L'
+    uname_s=`uname -s`
+    case ${uname_s} in
+        Darwin)
+            getinode="stat -f %i"
+            getdev="stat -f %d"
+            ;;
+    esac
+}
+
+is_valid_hostname ()
+{
+    local server=$1
+    # ${#server} counts the name only; "echo | wc -c" also counted the newline
+    length=${#server}
+    if [ ${length} -gt ${HOST_NAME_MAX} ]; then
+        return 1
+    fi
+}
+
+parse_backup_volfile_servers ()
+{
+    local server_list=$1
+    local candidates=""
+    local accepted=""
+
+    # backup servers arrive as a colon-separated list
+    candidates=$(echo ${server_list} | sed 's/\:/ /g')
+    for server in ${candidates}; do
+        is_valid_hostname ${server}
+        if [ $? -ne 1 ]; then
+            accepted=$(echo "${accepted} ${server}")
+        fi
+    done
+
+    echo ${accepted}
+}
+
+parse_volfile_servers ()
+{
+    local server_list=$1
+    local candidates=""
+    local accepted=""
+
+    # volfile servers arrive as a comma-separated list
+    candidates=$(echo ${server_list} | sed 's/,/ /g')
+    for server in ${candidates}; do
+        is_valid_hostname ${server}
+        if [ $? -ne 1 ]; then
+            accepted=$(echo "${accepted} ${server}")
+        fi
+    done
+
+    echo ${accepted}
+}
+
+start_glusterfs ()
+{
     if [ -n "$log_level_str" ]; then
-	case "$log_level_str" in
-	    "ERROR")
-		log_level=$LOG_ERROR;
-		;;
-	    "DEBUG")
-		log_level=$LOG_DEBUG;
-		;;
-	    "CRITICAL")
-		log_level=$LOG_CRITICAL;
-		;;
-	    "WARNING")
-		log_level=$LOG_WARNING;
-		;;
-	    "NONE")
-		log_level=$LOG_NONE;
-		;;
-	    *)
-		echo "invalid log level $log_level_str, using ERROR";
-		log_level=$LOG_WARNING;
-		;;
-	esac
-    fi
-    cmd_line=$(echo "$cmd_line --log-level=$log_level");
-    
-    if [ -n "$log_file" ]; then
-	cmd_line=$(echo "$cmd_line --log-file=$log_file");
+        case "$( echo $log_level_str | awk '{print toupper($0)}')" in
+            "ERROR")
+                log_level=$LOG_ERROR;
+                ;;
+            "INFO")
+                log_level=$LOG_INFO;
+                ;;
+            "DEBUG")
+                log_level=$LOG_DEBUG;
+                ;;
+            "CRITICAL")
+                log_level=$LOG_CRITICAL;
+                ;;
+            "WARNING")
+                log_level=$LOG_WARNING;
+                ;;
+            "TRACE")
+                log_level=$LOG_TRACE;
+                ;;
+            "NONE")
+                log_level=$LOG_NONE;
+                ;;
+            *)
+                warn "invalid log level $log_level_str, using INFO";
+                log_level=$LOG_INFO;
+                ;;
+        esac
+    fi
+
+    # options without values start here
+    if [ -n "$read_only" ]; then
+        cmd_line=$(echo "$cmd_line --read-only");
+    fi
+
+    if [ -n "$acl" ]; then
+        cmd_line=$(echo "$cmd_line --acl");
+    fi
+
+    if [ -n "$selinux" ]; then
+         cmd_line=$(echo "$cmd_line --selinux");
+    fi
+
+    if [ -n "$enable_ino32" ]; then
+        cmd_line=$(echo "$cmd_line --enable-ino32");
+    fi
+
+    if [ -n "$worm" ]; then
+        cmd_line=$(echo "$cmd_line --worm");
+    fi
+    if [ -n "$volfile_max_fetch_attempts" ]; then
+       cmd_line=$(echo "$cmd_line --volfile-max-fetch-attempts=$volfile_max_fetch_attempts")
+    fi
+
+    if [ -n "$fopen_keep_cache" ]; then
+        cmd_line=$(echo "$cmd_line --fopen-keep-cache");
     fi
 
     if [ -n "$volfile_check" ]; then
-	cmd_line=$(echo "$cmd_line --volfile-check");
+        cmd_line=$(echo "$cmd_line --volfile-check");
+    fi
+
+    if [ -n "$mem_accounting" ]; then
+        cmd_line=$(echo "$cmd_line --mem-accounting");
+    fi
+
+    if [ -n "$aux_gfid_mount" ]; then
+        cmd_line=$(echo "$cmd_line --aux-gfid-mount");
+    fi
+
+    if [ -n "$no_root_squash" ]; then
+        cmd_line=$(echo "$cmd_line --no-root-squash");
+    fi
+
+    if [ -n "$capability" ]; then
+         cmd_line=$(echo "$cmd_line --capability");
+    fi
+
+#options with values start here
+    if [ -n "$log_level" ]; then
+        cmd_line=$(echo "$cmd_line --log-level=$log_level");
+    fi
+
+    if [ -n "$log_file" ]; then
+        cmd_line=$(echo "$cmd_line --log-file=$log_file");
     fi
 
     if [ -n "$direct_io_mode" ]; then
-	cmd_line=$(echo "$cmd_line --disable-direct-io-mode");
+        cmd_line=$(echo "$cmd_line --direct-io-mode=$direct_io_mode");
     fi
-    
-    if [ -z "$volfile_loc" ]; then
-	if [ -n "$transport" ]; then 
-	    cmd_line=$(echo "$cmd_line \
---volfile-server=$server_ip \
---volfile-server-port=$server_port \
---volfile-server-transport=$transport");
-	else
-	    cmd_line=$(echo "$cmd_line \
---volfile-server=$server_ip \
---volfile-server-port=$server_port");
-	fi
-    else
-	cmd_line=$(echo "$cmd_line --volfile=$volfile_loc");
+
+    if [ -n "$mac_compat" ]; then
+        cmd_line=$(echo "$cmd_line --mac-compat=$mac_compat");
+    fi
+
+    if [ -n "$use_readdirp" ]; then
+        cmd_line=$(echo "$cmd_line --use-readdirp=$use_readdirp");
     fi
 
     if [ -n "$volume_name" ]; then
         cmd_line=$(echo "$cmd_line --volume-name=$volume_name");
     fi
-    
-    if [ -n "$volume_id" ]; then
-        cmd_line=$(echo "$cmd_line --volfile-id=$volume_id");
+
+    if [ -n "$attribute_timeout" ]; then
+        cmd_line=$(echo "$cmd_line --attribute-timeout=$attribute_timeout");
+    fi
+
+    if [ -n "$entry_timeout" ]; then
+        cmd_line=$(echo "$cmd_line --entry-timeout=$entry_timeout");
+    fi
+
+    if [ -n "$negative_timeout" ]; then
+        cmd_line=$(echo "$cmd_line --negative-timeout=$negative_timeout");
+    fi
+
+    if [ -n "$gid_timeout" ]; then
+        cmd_line=$(echo "$cmd_line --gid-timeout=$gid_timeout");
+    fi
+
+    if [ -n "$bg_qlen" ]; then
+        cmd_line=$(echo "$cmd_line --background-qlen=$bg_qlen");
+    fi
+
+    if [ -n "$cong_threshold" ]; then
+        cmd_line=$(echo "$cmd_line --congestion-threshold=$cong_threshold");
+    fi
+
+    if [ -n "$fuse_mountopts" ]; then
+        cmd_line=$(echo "$cmd_line --fuse-mountopts=$fuse_mountopts");
+    fi
+
+    if [ -n "$xlator_option" ]; then
+        cmd_line=$(echo "$cmd_line --xlator-option=$xlator_option");
+    fi
+
+    if [ -n "$process_name" ]; then
+        cmd_line=$(echo "$cmd_line --process-name fuse.$process_name");
+    else
+        cmd_line=$(echo "$cmd_line --process-name fuse");
+    fi
+
+    if [ -z "$volfile_loc" ]; then
+        if  [ -n "$server_ip" ]; then
+
+            servers=$(parse_volfile_servers ${server_ip});
+            if [ -n "$servers" ]; then
+                for i in $(echo ${servers}); do
+                    cmd_line=$(echo "$cmd_line --volfile-server=$i");
+                done
+            else
+                warn "ERROR: No valid servers found on command line.. exiting"
+                print_usage
+                exit 1
+            fi
+
+            if [ -n "$backupvolfile_server" ]; then
+                if [ -z "$backup_volfile_servers" ]; then
+                    is_valid_hostname ${backupvolfile_server};
+                    if [ $? -eq 1 ]; then
+                        warn "ERROR: Invalid backup server specified.. exiting"
+                        exit 1
+                    fi
+                    cmd_line=$(echo "$cmd_line --volfile-server=$backupvolfile_server");
+                fi
+            fi
+
+            if [ -n "$backup_volfile_servers" ]; then
+                backup_servers=$(parse_backup_volfile_servers ${backup_volfile_servers})
+                for i in $(echo ${backup_servers}); do
+                    cmd_line=$(echo "$cmd_line --volfile-server=$i");
+                done
+            fi
+
+            if [ -n "$server_port" ]; then
+                cmd_line=$(echo "$cmd_line --volfile-server-port=$server_port");
+            fi
+
+            if [ -n "$transport" ]; then
+                cmd_line=$(echo "$cmd_line --volfile-server-transport=$transport");
+            fi
+
+            if [ -n "$volume_id" ]; then
+                cmd_line=$(echo "$cmd_line --volfile-id=$volume_id");
+            fi
+        fi
+    else
+        cmd_line=$(echo "$cmd_line --volfile=$volfile_loc");
+    fi
+
+    if [ -n "$fuse_mountopts" ]; then
+        cmd_line=$(echo "$cmd_line --fuse-mountopts=$fuse_mountopts");
     fi
 
     cmd_line=$(echo "$cmd_line $mount_point");
-    exec $cmd_line;
+    $cmd_line;
+
+    if [ $? -ne 0 ]; then
+        exit 1;
+    fi
+}
+
+print_usage ()
+{
+cat << EOF >&2
+Usage: $0 <volumeserver>:<volumeid/volumeport> -o<options> <mountpoint>
+Options:
+man 8 $0
+To display the version number of the mount helper: $0 -V
+EOF
+}
+
+with_options()
+{
+    local key="$1"
+    local value="$2"
+
+    # Handle "key=value" options; $value is quoted in tests as it may be empty.
+    case "$key" in
+        "log-level")
+            log_level_str=$value
+            ;;
+        "log-file")
+            log_file=$value
+            ;;
+        "transport")
+            transport=$value
+            ;;
+        "direct-io-mode")
+            direct_io_mode=$value
+            ;;
+        "mac-compat")
+            mac_compat=$value
+            ;;
+        "volume-name")
+            volume_name=$value
+            ;;
+        "volume-id")
+            volume_id=$value
+            ;;
+        "volfile-check")
+            volfile_check=$value
+            ;;
+        "server-port")
+            server_port=$value
+            ;;
+        "attribute-timeout")
+            attribute_timeout=$value
+            ;;
+        "entry-timeout")
+            entry_timeout=$value
+            ;;
+        "negative-timeout")
+            negative_timeout=$value
+            ;;
+        "gid-timeout")
+            gid_timeout=$value
+            ;;
+        "background-qlen")
+            bg_qlen=$value
+            ;;
+        "backup-volfile-servers")
+            backup_volfile_servers=$value
+            ;;
+        "backupvolfile-server")
+            backupvolfile_server=$value
+            ;;
+        "fetch-attempts")
+            volfile_max_fetch_attempts=$value
+            ;;
+        "congestion-threshold")
+            cong_threshold=$value
+            ;;
+        "xlator-option")
+            xlator_option=$value
+            ;;
+        "fuse-mountopts")
+            fuse_mountopts=$value
+            ;;
+        "use-readdirp")
+            use_readdirp=$value
+            ;;
+        "no-root-squash")
+            if [ "$value" = "yes" ] ||
+                [ "$value" = "on" ] ||
+                [ "$value" = "enable" ] ||
+                [ "$value" = "true" ] ; then
+                no_root_squash=1;
+            fi ;;
+        "root-squash")
+            if [ "$value" = "no" ] ||
+                [ "$value" = "off" ] ||
+                [ "$value" = "disable" ] ||
+                [ "$value" = "false" ] ; then
+                no_root_squash=1;
+            fi ;;
+        "process-name")
+            process_name=$value
+            ;;
+        *)
+            warn "Invalid option: $key"
+            exit 1
+            ;;
+    esac
+}
+
+without_options()
+{
+    local option="$1"
+    # Handle flag-style options that carry no value.
+    case "$option" in
+        "ro")
+            read_only=1
+            ;;
+        "acl")
+            acl=1
+            ;;
+        "selinux")
+            selinux=1
+            ;;
+        "worm")
+            worm=1
+            ;;
+        "fopen-keep-cache")
+            fopen_keep_cache=1
+            ;;
+        "enable-ino32")
+            enable_ino32=1
+            ;;
+        "mem-accounting")
+            mem_accounting=1
+            ;;
+        "aux-gfid-mount")
+            if [ "${uname_s}" = "Linux" ]; then
+                aux_gfid_mount=1
+            fi
+            ;;
+         # "mount -t glusterfs" sends this, but it's useless.
+        "rw")
+            ;;
+         # these ones are interpreted during system initialization
+        "noauto")
+            ;;
+        "_netdev")
+            ;;
+        "capability")
+            capability=1
+            ;;
+        *)
+            warn "Invalid option $option";
+            exit 1
+            ;;
+    esac
+}
 
+parse_options()
+{
+    local optarg="${1}"
+    for pair in $(echo "$optarg" | sed 's/,/ /g'); do
+        key=$(echo "$pair" | cut -f1 -d'=');
+        value=$(echo "$pair" | cut -f2- -d'=');
+        # An '=' in the pair, not key != value, marks an option with a value
+        case "$pair" in
+            *=*) with_options "$key" "$value" ;;
+            *)   without_options "$pair" ;;
+        esac
+    done
+}
 
 main ()
 {
-    
-    new_log_level=""
-    log_file=""
-    transport=""
-    direct_io_mode=""
-    volume_name=""
-    new_fs_options=""
-    volfile_check=""
-
-    while getopts o: opt; do
-	case "$opt" in
-	    o) 
-		options=$(echo $OPTARG | sed -n 's/.*\-o[ ]*\([^ ]*\).*/\1/p');
-		[ -z $new_log_level ] && {
-		    new_log_level=$(echo "$options" | sed -n 's/.*log-level=\([^,]*\).*/\1/p');
-		}
-		
-		[ -z $log_file ] && {
-		    log_file=$(echo "$options" | sed -n 's/.*log-file=\([^,]*\).*/\1/p');
-		}
-		
-		[ -z $transport ] && {
-		    transport=$(echo "$options" | sed -n 's/.*transport=\([^,]*\).*/\1/p');
-		}
-		
-		[ -z $direct_io_mode ] && {
-		    direct_io_mode=$(echo "$options" | sed -n 's/.*direct-io-mode=\([^,]*\).*/\1/p');
-		}
-		
-		[ -z $volfile_check ] && {
-		    volfile_check=$(echo "$options" | sed -n 's/.*volfile-check=\([^,]*\).*/\1/p');
-		}
-		
-		[ -z $volume_name ] && {
-		    volume_name=$(echo "$options" | sed -n 's/.*volume-name=\([^,]*\).*/\1/p');
-		}
-
-		[ -z $volume_id ] && {
-		    volume_id=$(echo "$options" | sed -n 's/.*volume-id=\([^,]*\).*/\1/p');
-		}
-
-		this_option=$(echo "$options" | sed -e 's/[,]*log-file=[^,]*//' \
-		    -e 's/[,]*log-level=[^,]*//' \
-		    -e 's/[,]*volume-name=[^,]*//' \
-		    -e 's/[,]*volfile-check=[^,]*//' \
-		    -e 's/[,]*direct-io-mode=[^,]*//' \
-		    -e 's/[,]*transport=[^,]*//' \
-		    -e 's/[,]*volume-id=[^,]*//');
-		new_fs_options="$new_fs_options $this_option";		
-		;;
-	esac
+#if !defined(__FreeBSD__)
+    ## `mount` on OSX specifies options as first argument
+    echo $1|grep -q -- "-o"
+    if [ $? -eq 0 ];  then
+        volfile_loc=$3
+        mount_point=$4
+    else
+        volfile_loc=$1
+        mount_point=$2
+    fi
+#endif /* __FreeBSD__ */
+    while getopts "Vo:h" opt; do
+        case "${opt}" in
+            o)
+                parse_options ${OPTARG};
+                ;;
+            V)
+                ${cmd_line} -V;
+                exit 0;
+                ;;
+            h)
+                print_usage;
+                exit 0;
+                ;;
+            ?)
+                print_usage;
+                exit 0;
+                ;;
+        esac
     done
 
-    [ -n "$new_log_level" ] && {
-	log_level_str="$new_log_level";
-    }
+#ifdef __FreeBSD__
+    shift $((OPTIND - 1))
+    volfile_loc="$1"
+    mount_point="$2"
+#endif /* __FreeBSD__ */
 
-    # TODO: use getopt. This is very much darwin specific
-    volfile_loc="$1";
-    while [ "$volfile_loc" == "-o" ] ; do
-	shift ;
-	shift ;
-	volfile_loc="$1";
-    done
-    
     [ -r "$volfile_loc" ] || {
-	server_ip=$(echo "$volfile_loc" | sed -n 's/\([^\:]*\).*/\1/p');
-	server_port=$(echo "$volfile_loc" | sed -n 's/.*:\([^ ]*\).*/\1/p');
-	[ -n "$server_port" ] || {
-	    server_port="6996";
-	}
+        # '%' included to support ipv6 link local addresses
+        server_ip=$(echo "$volfile_loc" | sed -n 's/\([a-zA-Z0-9:%.\-]*\):.*/\1/p');
+        volume_str=$(echo "$volfile_loc" | sed -n 's/.*:\([^ ]*\).*/\1/p');
+        [ -n "$volume_str" ] && {
+            volume_id="$volume_str";
+        }
+        volfile_loc="";
+    }
+
+    [ -z "$volume_id" -o -z "$server_ip" ] && {
+        cat <<EOF >&2
+ERROR: Server name/volume name unspecified cannot proceed further..
+Please specify correct format
+Usage:
+man 8 $0
+EOF
+        exit 1;
+    }
+
+    grep_ret=$(echo ${mount_point} | grep '^\-o');
+    [ "x" != "x${grep_ret}" ] && {
+        cat <<EOF >&2
+ERROR: -o options cannot be specified in either first two arguments..
+Please specify correct style
+Usage:
+man 8 $0
+EOF
+        exit 1;
+    }
 
-	volfile_loc="";
+    # mount_point is taken as-is from the arguments, so check here that it
+    # is non-empty and refers to an existing directory
+    [ -z "$mount_point" -o ! -d "$mount_point" ] && {
+        cat <<EOF >&2
+ERROR: Mount point does not exist
+Please specify a mount point
+Usage:
+man 8 $0
+EOF
+        exit 1;
     }
-    # following line is product of love towards sed
-    # $2=$(echo "$@" | sed -n 's/[^ ]* \([^ ]*\).*/\1/p');
-    
-    mount_point="$2";
 
-    fs_options=$(echo "$fs_options,$new_fs_options");
-    
     start_glusterfs;
 }
 
diff --git a/xlators/nfs/Makefile.am b/xlators/nfs/Makefile.am
new file mode 100644
index 00000000000..8771032f6c6
--- /dev/null
+++ b/xlators/nfs/Makefile.am
@@ -0,0 +1,3 @@
+SUBDIRS = server
+
+CLEANFILES =
diff --git a/xlators/nfs/server/Makefile.am b/xlators/nfs/server/Makefile.am
new file mode 100644
index 00000000000..a985f42a877
--- /dev/null
+++ b/xlators/nfs/server/Makefile.am
@@ -0,0 +1,3 @@
+SUBDIRS = src
+
+CLEANFILES =
diff --git a/xlators/nfs/server/src/Makefile.am b/xlators/nfs/server/src/Makefile.am
new file mode 100644
index 00000000000..01071a79a21
--- /dev/null
+++ b/xlators/nfs/server/src/Makefile.am
@@ -0,0 +1,38 @@
+if WITH_SERVER
+xlator_LTLIBRARIES = server.la
+endif
+
+xlatordir = $(libdir)/glusterfs/$(PACKAGE_VERSION)/xlator/nfs
+nfsrpclibdir = $(top_srcdir)/rpc/rpc-lib/src
+server_la_LDFLAGS = -module \
+        -export-symbols $(top_srcdir)/xlators/nfs/server/src/nfsserver.sym \
+        $(GF_XLATOR_LDFLAGS)
+
+server_la_SOURCES = nfs.c nfs-common.c nfs-fops.c nfs-inodes.c \
+	nfs-generics.c mount3.c nfs3-fh.c nfs3.c nfs3-helpers.c nlm4.c \
+	nlmcbk_svc.c mount3udp_svc.c acl3.c netgroups.c exports.c \
+	mount3-auth.c auth-cache.c
+
+server_la_LIBADD = $(top_builddir)/libglusterfs/src/libglusterfs.la \
+	$(top_builddir)/api/src/libgfapi.la \
+	$(top_builddir)/rpc/rpc-lib/src/libgfrpc.la \
+	$(top_builddir)/rpc/xdr/src/libgfxdr.la
+
+noinst_HEADERS = nfs.h nfs-common.h nfs-fops.h nfs-inodes.h nfs-generics.h \
+	mount3.h nfs3-fh.h nfs3.h nfs3-helpers.h nfs-mem-types.h nlm4.h \
+	acl3.h netgroups.h exports.h mount3-auth.h auth-cache.h nfs-messages.h
+
+AM_CPPFLAGS = $(GF_CPPFLAGS) \
+	-DLIBDIR=\"$(libdir)/glusterfs/$(PACKAGE_VERSION)/auth\" \
+	-I$(top_srcdir)/libglusterfs/src -I$(top_srcdir)/api/src \
+	-I$(top_srcdir)/rpc/xdr/src/ -I$(top_builddir)/rpc/xdr/src/ \
+	-I$(nfsrpclibdir) -I$(CONTRIBDIR)/rbtree \
+	-DDATADIR=\"$(localstatedir)\"
+
+AM_CFLAGS = -Wall $(GF_CFLAGS)
+
+AM_LDFLAGS = -L$(xlatordir)
+
+CLEANFILES =
+
+EXTRA_DIST = nfsserver.sym
diff --git a/xlators/nfs/server/src/acl3.c b/xlators/nfs/server/src/acl3.c
new file mode 100644
index 00000000000..7e3bbf16086
--- /dev/null
+++ b/xlators/nfs/server/src/acl3.c
@@ -0,0 +1,933 @@
+/*
+ * Copyright (c) 2012-2013 Red Hat, Inc. <http://www.redhat.com>
+ * This file is part of GlusterFS.
+ *
+ * This file is licensed to you under your choice of the GNU Lesser
+ * General Public License, version 3 or any later version (LGPLv3 or
+ * later), or the GNU General Public License, version 2 (GPLv2), in all
+ * cases as published by the Free Software Foundation.
+ */
+
+#include <glusterfs/defaults.h>
+#include "rpcsvc.h"
+#include <glusterfs/dict.h>
+#include <glusterfs/xlator.h>
+#include "nfs.h"
+#include <glusterfs/mem-pool.h>
+#include <glusterfs/logging.h>
+#include "nfs-fops.h"
+#include "nfs3.h"
+#include "nfs-mem-types.h"
+#include "nfs3-helpers.h"
+#include "nfs3-fh.h"
+#include "nfs-generics.h"
+#include "acl3.h"
+#include <glusterfs/byte-order.h>
+#include <glusterfs/compat-errno.h>
+#include "nfs-messages.h"
+
+static int
+acl3_nfs_acl_to_xattr(aclentry *ace, void *xattrbuf, int aclcount, int defacl);
+
+static int
+acl3_nfs_acl_from_xattr(aclentry *ace, void *xattrbuf, int bufsize, int defacl);
+
+typedef ssize_t (*acl3_serializer)(struct iovec outmsg, void *args);
+
+extern void
+nfs3_call_state_wipe(nfs3_call_state_t *cs);
+
+extern nfs3_call_state_t *
+nfs3_call_state_init(struct nfs3_state *s, rpcsvc_request_t *req, xlator_t *v);
+
+extern int
+nfs3_fh_validate(struct nfs3_fh *fh);
+
+extern void
+nfs3_stat_to_fattr3(struct iatt *buf, fattr3 *fa);
+
+#define acl3_validate_nfs3_state(request, state, status, label, retval)        \
+    do {                                                                       \
+        state = rpcsvc_request_program_private(request);                       \
+        if (!state) {                                                          \
+            gf_msg(GF_ACL, GF_LOG_ERROR, errno, NFS_MSG_STATE_MISSING,         \
+                   "NFSv3 state "                                              \
+                   "missing from RPC request");                                \
+            rpcsvc_request_seterr(request, SYSTEM_ERR);                        \
+            status = NFS3ERR_SERVERFAULT;                                      \
+            goto label;                                                        \
+        }                                                                      \
+    } while (0);
+
+#define acl3_validate_gluster_fh(handle, status, errlabel)                     \
+    do {                                                                       \
+        if (!nfs3_fh_validate(handle)) {                                       \
+            gf_msg(GF_ACL, GF_LOG_ERROR, EINVAL, NFS_MSG_BAD_HANDLE,           \
+                   "Bad Handle");                                              \
+            status = NFS3ERR_BADHANDLE;                                        \
+            goto errlabel;                                                     \
+        }                                                                      \
+    } while (0)
+
+extern xlator_t *
+nfs3_fh_to_xlator(struct nfs3_state *nfs3, struct nfs3_fh *fh);
+
+#define acl3_map_fh_to_volume(nfs3state, handle, req, volume, status, label)   \
+    do {                                                                       \
+        char exportid[256], gfid[256];                                         \
+        rpc_transport_t *trans = NULL;                                         \
+        volume = nfs3_fh_to_xlator((nfs3state), handle);                       \
+        if (!volume) {                                                         \
+            gf_uuid_unparse(handle->exportid, exportid);                       \
+            gf_uuid_unparse(handle->gfid, gfid);                               \
+            trans = rpcsvc_request_transport(req);                             \
+            gf_msg(GF_ACL, GF_LOG_ERROR, 0, NFS_MSG_FH_TO_VOL_FAIL,            \
+                   "Failed to map "                                            \
+                   "FH to vol: client=%s, exportid=%s, gfid=%s",               \
+                   trans->peerinfo.identifier, exportid, gfid);                \
+            gf_msg(GF_ACL, GF_LOG_ERROR, ESTALE, NFS_MSG_VOLUME_ERROR,         \
+                   "Stale nfs client %s must be trying to "                    \
+                   "connect to a deleted volume, please "                      \
+                   "unmount it.",                                              \
+                   trans->peerinfo.identifier);                                \
+            status = NFS3ERR_STALE;                                            \
+            goto label;                                                        \
+        } else {                                                               \
+            gf_msg_trace(GF_ACL, 0, "FH to Volume: %s", volume->name);         \
+            rpcsvc_request_set_private(req, volume);                           \
+        }                                                                      \
+    } while (0);
+
+#define acl3_volume_started_check(nfs3state, vlm, rtval, erlbl)                \
+    do {                                                                       \
+        if ((!nfs_subvolume_started(nfs_state(nfs3state->nfsx), vlm))) {       \
+            gf_msg(GF_ACL, GF_LOG_ERROR, 0, NFS_MSG_VOL_DISABLE,               \
+                   "Volume is disabled: %s", vlm->name);                       \
+            rtval = RPCSVC_ACTOR_IGNORE;                                       \
+            goto erlbl;                                                        \
+        }                                                                      \
+    } while (0)
+
+#define acl3_check_fh_resolve_status(cst, nfstat, erlabl)                      \
+    do {                                                                       \
+        xlator_t *xlatorp = NULL;                                              \
+        char buf[256], gfid[GF_UUID_BUF_SIZE];                                 \
+        rpc_transport_t *trans = NULL;                                         \
+        if ((cst)->resolve_ret < 0) {                                          \
+            trans = rpcsvc_request_transport(cst->req);                        \
+            xlatorp = nfs3_fh_to_xlator(cst->nfs3state, &cst->resolvefh);      \
+            gf_uuid_unparse(cst->resolvefh.gfid, gfid);                        \
+            snprintf(buf, sizeof(buf), "(%s) %s : %s",                         \
+                     trans->peerinfo.identifier,                               \
+                     xlatorp ? xlatorp->name : "ERR", gfid);                   \
+            gf_msg(GF_ACL, GF_LOG_ERROR, cst->resolve_errno,                   \
+                   NFS_MSG_RESOLVE_FH_FAIL,                                    \
+                   "Unable to resolve "                                        \
+                   "FH: %s",                                                   \
+                   buf);                                                       \
+            nfstat = nfs3_errno_to_nfsstat3(cst->resolve_errno);               \
+            goto erlabl;                                                       \
+        }                                                                      \
+    } while (0)
+
+#define acl3_handle_call_state_init(nfs3state, calls, rq, v, opstat, errlabel) \
+    do {                                                                       \
+        calls = nfs3_call_state_init((nfs3state), (rq), v);                    \
+        if (!calls) {                                                          \
+            gf_msg(GF_ACL, GF_LOG_ERROR, 0, NFS_MSG_INIT_CALL_STAT_FAIL,       \
+                   "Failed to "                                                \
+                   "init call state");                                         \
+            opstat = NFS3ERR_SERVERFAULT;                                      \
+            rpcsvc_request_seterr((rq), SYSTEM_ERR);                           \
+            goto errlabel;                                                     \
+        }                                                                      \
+    } while (0)
+
+int
+acl3svc_submit_reply(rpcsvc_request_t *req, void *arg, acl3_serializer sfunc)
+{
+    struct iovec outmsg = {
+        0,
+    };
+    struct iobuf *iob = NULL;
+    struct nfs3_state *nfs3 = NULL;
+    int ret = -1;
+    ssize_t msglen = 0;
+    struct iobref *iobref = NULL;
+
+    if (!req)
+        return -1;
+
+    nfs3 = (struct nfs3_state *)rpcsvc_request_program_private(req);
+    if (!nfs3) {
+        gf_msg(GF_ACL, GF_LOG_ERROR, EINVAL, NFS_MSG_MNT_STATE_NOT_FOUND,
+               "mount state not found");
+        goto ret;
+    }
+
+    /* First, get the io buffer into which the reply in arg will
+     * be serialized.
+     */
+    iob = iobuf_get(nfs3->iobpool);
+    if (!iob) {
+        gf_msg(GF_ACL, GF_LOG_ERROR, ENOMEM, NFS_MSG_NO_MEMORY,
+               "Failed to get iobuf");
+        goto ret;
+    }
+
+    iobuf_to_iovec(iob, &outmsg);
+    /* Use the given serializer to translate the given C structure in arg
+     * to XDR format which will be written into the buffer in outmsg.
+     */
+    msglen = sfunc(outmsg, arg);
+    if (msglen < 0) {
+        gf_msg(GF_ACL, GF_LOG_ERROR, errno, NFS_MSG_ENCODE_MSG_FAIL,
+               "Failed to encode message");
+        goto ret;
+    }
+    outmsg.iov_len = msglen;
+
+    iobref = iobref_new();
+    if (iobref == NULL) {
+        gf_msg(GF_ACL, GF_LOG_ERROR, ENOMEM, NFS_MSG_NO_MEMORY,
+               "Failed to get iobref");
+        goto ret;
+    }
+
+    ret = iobref_add(iobref, iob);
+    if (ret) {
+        gf_msg(GF_ACL, GF_LOG_ERROR, ENOMEM, NFS_MSG_NO_MEMORY,
+               "Failed to add iob to iobref");
+        goto ret;
+    }
+
+    /* Then, submit the message for transmission. */
+    ret = rpcsvc_submit_message(req, &outmsg, 1, NULL, 0, iobref);
+    if (ret == -1) {
+        gf_msg(GF_ACL, GF_LOG_ERROR, errno, NFS_MSG_REP_SUBMIT_FAIL,
+               "Reply submission failed");
+        goto ret;
+    }
+
+    ret = 0;
+ret:
+    if (iob)
+        iobuf_unref(iob);
+    if (iobref)
+        iobref_unref(iobref);
+
+    return ret;
+}
+
+int
+acl3svc_null(rpcsvc_request_t *req)
+{
+    struct iovec dummyvec = {
+        0,
+    };
+
+    if (!req) {
+        gf_msg(GF_ACL, GF_LOG_ERROR, EINVAL, NFS_MSG_INVALID_ENTRY,
+               "Got NULL request!");
+        return 0;
+    }
+    rpcsvc_submit_generic(req, &dummyvec, 1, NULL, 0, NULL);
+    return 0;
+}
+
+int
+acl3_getacl_reply(rpcsvc_request_t *req, getaclreply *reply)
+{
+    acl3svc_submit_reply(req, (void *)reply,
+                         (acl3_serializer)xdr_serialize_getaclreply);
+    return 0;
+}
+
+int
+acl3_setacl_reply(rpcsvc_request_t *req, setaclreply *reply)
+{
+    acl3svc_submit_reply(req, (void *)reply,
+                         (acl3_serializer)xdr_serialize_setaclreply);
+    return 0;
+}
+
+/* acl3_getacl_cbk: fetch and decode the ACL in the POSIX_ACL_ACCESS_XATTR
+ *
+ * The POSIX_ACL_ACCESS_XATTR can be set on files and directories.
+ */
+int
+acl3_getacl_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+                int32_t op_ret, int32_t op_errno, dict_t *dict, dict_t *xdata)
+{
+    nfsstat3 stat = NFS3ERR_SERVERFAULT;
+    nfs3_call_state_t *cs = NULL;
+    data_t *data = NULL;
+    getaclreply *getaclreply = NULL;
+    int aclcount = 0;
+    int defacl = 1; /* DEFAULT ACL */
+
+    if (!frame->local) {
+        gf_msg(GF_ACL, GF_LOG_ERROR, EINVAL, NFS_MSG_INVALID_ENTRY,
+               "Invalid argument, frame->local NULL");
+        return -EINVAL;
+    }
+    cs = frame->local;
+    getaclreply = &cs->args.getaclreply;
+    if ((op_ret < 0) && (op_errno != ENODATA && op_errno != ENOATTR)) {
+        stat = nfs3_cbk_errno_status(op_ret, op_errno);
+        goto err;
+    } else if (!dict) {
+        /* no ACL has been set */
+        stat = NFS3_OK;
+        goto err;
+    }
+
+    getaclreply->aclentry.aclentry_val = cs->aclentry;
+
+    /* getfacl: NFS USER ACL */
+    data = dict_get(dict, POSIX_ACL_ACCESS_XATTR);
+    if (data && data->data) {
+        aclcount = acl3_nfs_acl_from_xattr(cs->aclentry, data->data, data->len,
+                                           !defacl);
+        if (aclcount < 0) {
+            gf_msg(GF_ACL, GF_LOG_ERROR, -aclcount, NFS_MSG_GET_USER_ACL_FAIL,
+                   "Failed to get USER ACL");
+            stat = nfs3_errno_to_nfsstat3(-aclcount);
+            goto err;
+        }
+        getaclreply->aclcount = aclcount;
+        getaclreply->aclentry.aclentry_len = aclcount;
+    }
+
+    acl3_getacl_reply(cs->req, getaclreply);
+    nfs3_call_state_wipe(cs);
+    return 0;
+
+err:
+    if (getaclreply)
+        getaclreply->status = stat;
+    acl3_getacl_reply(cs->req, getaclreply);
+    nfs3_call_state_wipe(cs);
+    return 0;
+}
+
+/* acl3_default_getacl_cbk: fetch and decode the ACL set in the
+ * POSIX_ACL_DEFAULT_XATTR xattr.
+ *
+ * The POSIX_ACL_DEFAULT_XATTR xattr is only set on directories, not on files.
+ *
+ * When done with POSIX_ACL_DEFAULT_XATTR, we also need to get and decode the
+ * ACL that can be set in POSIX_ACL_ACCESS_XATTR; that request is issued from
+ * here with acl3_getacl_cbk as its callback, which then sends the reply.
+ *
+ * On failure the reply is sent and the call state is wiped here.
+ *
+ * Returns -EINVAL when frame->local is NULL, 0 otherwise.
+ */
+int
+acl3_default_getacl_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+                        int32_t op_ret, int32_t op_errno, dict_t *dict,
+                        dict_t *xdata)
+{
+    nfsstat3 stat = NFS3ERR_SERVERFAULT;
+    nfs3_call_state_t *cs = NULL;
+    data_t *data = NULL;
+    getaclreply *getaclreply = NULL;
+    int aclcount = 0;
+    int defacl = 1; /* DEFAULT ACL */
+    nfs_user_t nfu = {
+        0,
+    };
+    int ret = -1;
+
+    if (!frame->local) {
+        gf_msg(GF_ACL, GF_LOG_ERROR, EINVAL, NFS_MSG_INVALID_ENTRY,
+               "Invalid argument, frame->local NULL");
+        return -EINVAL;
+    }
+    cs = frame->local;
+    getaclreply = &cs->args.getaclreply;
+    if ((op_ret < 0) && (op_errno != ENODATA && op_errno != ENOATTR)) {
+        stat = nfs3_cbk_errno_status(op_ret, op_errno);
+        goto err;
+    } else if (!dict) {
+        /* no ACL has been set */
+        stat = NFS3_OK;
+        goto err;
+    }
+
+    getaclreply->daclentry.daclentry_val = cs->daclentry;
+
+    /* getfacl: NFS DEFAULT ACL */
+    data = dict_get(dict, POSIX_ACL_DEFAULT_XATTR);
+    if (data && data->data) {
+        aclcount = acl3_nfs_acl_from_xattr(cs->daclentry, data->data, data->len,
+                                           defacl);
+        if (aclcount < 0) {
+            /* aclcount is a negative errno here; log the positive value,
+             * as the other gf_msg() calls in this file do. */
+            gf_msg(GF_ACL, GF_LOG_ERROR, -aclcount, NFS_MSG_GET_DEF_ACL_FAIL,
+                   "Failed to get DEFAULT ACL");
+            stat = nfs3_errno_to_nfsstat3(-aclcount);
+            goto err;
+        }
+
+        getaclreply->daclcount = aclcount;
+        getaclreply->daclentry.daclentry_len = aclcount;
+    }
+
+    /* Chain to the access ACL; acl3_getacl_cbk completes the call. */
+    getaclreply->attr_follows = TRUE;
+    nfs_request_user_init(&nfu, cs->req);
+    ret = nfs_getxattr(cs->nfsx, cs->vol, &nfu, &cs->resolvedloc,
+                       POSIX_ACL_ACCESS_XATTR, NULL, acl3_getacl_cbk, cs);
+    if (ret < 0) {
+        stat = nfs3_errno_to_nfsstat3(-ret);
+        goto err;
+    }
+
+    return 0;
+
+err:
+    /* getaclreply points into cs and is never NULL at this point */
+    getaclreply->status = stat;
+    acl3_getacl_reply(cs->req, getaclreply);
+    nfs3_call_state_wipe(cs);
+    return 0;
+}
+
+/* acl3_stat_cbk: stat callback of GETACL.
+ *
+ * Fills the file attributes of the reply from @buf, then requests the ACL
+ * xattr: POSIX_ACL_DEFAULT_XATTR first for directories (handled by
+ * acl3_default_getacl_cbk), POSIX_ACL_ACCESS_XATTR directly for anything
+ * else (handled by acl3_getacl_cbk).
+ *
+ * On failure the reply is sent and the call state is wiped here.
+ *
+ * Returns -EINVAL when frame->local is NULL (same convention as the other
+ * GETACL callbacks), 0 otherwise.
+ */
+int
+acl3_stat_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret,
+              int32_t op_errno, struct iatt *buf, dict_t *xdata)
+{
+    nfsstat3 stat = NFS3ERR_SERVERFAULT;
+    nfs3_call_state_t *cs = NULL;
+    getaclreply *getaclreply = NULL;
+    int ret = -1;
+    nfs_user_t nfu = {
+        0,
+    };
+    uint64_t deviceid = 0;
+
+    if (!frame->local) {
+        gf_msg(GF_ACL, GF_LOG_ERROR, EINVAL, NFS_MSG_INVALID_ENTRY,
+               "Invalid argument, frame->local NULL");
+        return -EINVAL;
+    }
+
+    cs = frame->local;
+    getaclreply = &cs->args.getaclreply;
+
+    if (op_ret == -1) {
+        stat = nfs3_cbk_errno_status(op_ret, op_errno);
+        goto err;
+    }
+
+    /* Fill the attrs before xattrs */
+    getaclreply->attr_follows = TRUE;
+    deviceid = nfs3_request_xlator_deviceid(cs->req);
+    nfs3_map_deviceid_to_statdev(buf, deviceid);
+    nfs3_stat_to_fattr3(buf, &(getaclreply->attr));
+
+    nfs_request_user_init(&nfu, cs->req);
+    if (buf->ia_type == IA_IFDIR) {
+        ret = nfs_getxattr(cs->nfsx, cs->vol, &nfu, &cs->resolvedloc,
+                           POSIX_ACL_DEFAULT_XATTR, NULL,
+                           acl3_default_getacl_cbk, cs);
+    } else {
+        ret = nfs_getxattr(cs->nfsx, cs->vol, &nfu, &cs->resolvedloc,
+                           POSIX_ACL_ACCESS_XATTR, NULL, acl3_getacl_cbk, cs);
+    }
+
+    if (ret < 0) {
+        stat = nfs3_errno_to_nfsstat3(-ret);
+        goto err;
+    }
+
+    return 0;
+err:
+    getaclreply->status = stat;
+    acl3_getacl_reply(cs->req, getaclreply);
+    nfs3_call_state_wipe(cs);
+    return 0;
+}
+
+/* acl3_getacl_resume: continuation of GETACL, passed to
+ * nfs3_fh_resolve_and_resume() by acl3svc_getacl().
+ *
+ * @carg: the nfs3_call_state_t of the call.
+ *
+ * Issues a stat on the resolved location with acl3_stat_cbk as callback.
+ * When ret is negative at the acl3err label, the reply is sent and the call
+ * state is wiped here.
+ *
+ * Returns -1 when @carg is NULL, otherwise the value left in ret (the
+ * nfs_stat() result on the normal path).
+ */
+int
+acl3_getacl_resume(void *carg)
+{
+    int ret = -1;
+    nfs3_call_state_t *cs = NULL;
+    nfsstat3 stat = NFS3ERR_SERVERFAULT;
+    nfs_user_t nfu = {
+        0,
+    };
+
+    if (!carg)
+        return ret;
+
+    cs = (nfs3_call_state_t *)carg;
+    /* May jump to acl3err; ret is still -1 in that case. */
+    acl3_check_fh_resolve_status(cs, stat, acl3err);
+    nfs_request_user_init(&nfu, cs->req);
+
+    ret = nfs_stat(cs->nfsx, cs->vol, &nfu, &cs->resolvedloc, acl3_stat_cbk,
+                   cs);
+    /* From here on stat holds a positive errno, not an nfsstat3 value. */
+    stat = -ret;
+acl3err:
+    if (ret < 0) {
+        /* NOTE(review): stat is treated as an errno below. If the macro
+         * above stores an nfsstat3 in stat before jumping here, it would be
+         * converted a second time -- confirm against the macro definition. */
+        gf_msg(GF_ACL, GF_LOG_ERROR, stat, NFS_MSG_OPEN_FAIL,
+               "unable to open_and_resume");
+        cs->args.getaclreply.status = nfs3_errno_to_nfsstat3(stat);
+        acl3_getacl_reply(cs->req, &cs->args.getaclreply);
+        nfs3_call_state_wipe(cs);
+    }
+
+    return ret;
+}
+
+/* acl3svc_getacl: actor for the ACL3 GETACL procedure.
+ *
+ * Decodes the arguments, validates the requested mask and the file handle,
+ * sets up the call state and hands over to acl3_getacl_resume() once the
+ * file handle is resolved.
+ *
+ * When ret is negative at the acl3err label, a reply carrying @stat is sent
+ * from here and 0 is returned. Paths that jump to rpcerr return ret without
+ * sending an ACL reply.
+ */
+int
+acl3svc_getacl(rpcsvc_request_t *req)
+{
+    xlator_t *vol = NULL;
+    struct nfs_state *nfs = NULL;
+    nfs3_state_t *nfs3 = NULL;
+    nfs3_call_state_t *cs = NULL;
+    int ret = RPCSVC_ACTOR_ERROR;
+    nfsstat3 stat = NFS3ERR_SERVERFAULT;
+    struct nfs3_fh fh;
+    struct nfs3_fh *fhp = NULL;
+    getaclargs getaclargs;
+    getaclreply getaclreply;
+
+    if (!req)
+        return ret;
+
+    acl3_validate_nfs3_state(req, nfs3, stat, rpcerr, ret);
+    nfs = nfs_state(nfs3->nfsx);
+    memset(&getaclargs, 0, sizeof(getaclargs));
+    memset(&getaclreply, 0, sizeof(getaclreply));
+    /* Zero the handle before decoding into it, as acl3svc_setacl() does, so
+     * that bytes the decoder does not write are not stack garbage. */
+    memset(&fh, 0, sizeof(fh));
+    getaclargs.fh.n_bytes = (char *)&fh;
+    if (xdr_to_getaclargs(req->msg[0], &getaclargs) <= 0) {
+        gf_msg(GF_ACL, GF_LOG_ERROR, errno, NFS_MSG_ARGS_DECODE_ERROR,
+               "Error decoding args");
+        rpcsvc_request_seterr(req, GARBAGE_ARGS);
+        goto rpcerr;
+    }
+
+    /* Validate ACL mask */
+    if (getaclargs.mask & ~(NFS_ACL | NFS_ACLCNT | NFS_DFACL | NFS_DFACLCNT)) {
+        stat = NFS3ERR_INVAL;
+        goto acl3err;
+    }
+
+    fhp = &fh;
+    acl3_validate_gluster_fh(fhp, stat, acl3err);
+    acl3_map_fh_to_volume(nfs->nfs3state, fhp, req, vol, stat, acl3err);
+    acl3_volume_started_check(nfs3, vol, ret, rpcerr);
+    acl3_handle_call_state_init(nfs->nfs3state, cs, req, vol, stat, acl3err);
+
+    cs->vol = vol;
+    cs->args.getaclreply.mask = getaclargs.mask;
+
+    ret = nfs3_fh_resolve_and_resume(cs, fhp, NULL, acl3_getacl_resume);
+    stat = nfs3_errno_to_nfsstat3(-ret);
+
+acl3err:
+    if (ret < 0) {
+        gf_msg(GF_ACL, GF_LOG_ERROR, -ret, NFS_MSG_RESOLVE_ERROR,
+               "unable to resolve and resume");
+        getaclreply.status = stat;
+        acl3_getacl_reply(req, &getaclreply);
+        nfs3_call_state_wipe(cs);
+        return 0;
+    }
+
+rpcerr:
+    return ret;
+}
+
+/* acl3_setacl_cbk: setxattr callback of SETACL.
+ *
+ * Stores the failure status in the reply when @op_ret is negative, then
+ * sends the reply and wipes the call state.
+ *
+ * Returns -EINVAL when frame->local is NULL (same guard as the GETACL
+ * callbacks), 0 otherwise.
+ */
+int
+acl3_setacl_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+                int32_t op_ret, int32_t op_errno, dict_t *xdata)
+{
+    nfs3_call_state_t *cs = NULL;
+
+    if (!frame->local) {
+        gf_msg(GF_ACL, GF_LOG_ERROR, EINVAL, NFS_MSG_INVALID_ENTRY,
+               "Invalid argument, frame->local NULL");
+        return -EINVAL;
+    }
+
+    cs = frame->local;
+    if (op_ret < 0) {
+        nfsstat3 status = nfs3_cbk_errno_status(op_ret, op_errno);
+        cs->args.setaclreply.status = status;
+    }
+
+    acl3_setacl_reply(cs->req, &cs->args.setaclreply);
+
+    nfs3_call_state_wipe(cs);
+
+    return 0;
+}
+
+/* acl3_setacl_resume: continuation of SETACL, passed to
+ * nfs3_fh_resolve_and_resume() by acl3svc_setacl().
+ *
+ * @carg: the nfs3_call_state_t of the call.
+ *
+ * Builds a dict with the access and/or default ACL xattrs that were prepared
+ * in the call state (only those with a non-zero entry count) and issues the
+ * setxattr with acl3_setacl_cbk as callback.
+ *
+ * When ret is negative at the acl3err label, the reply is sent and the call
+ * state is wiped here.
+ *
+ * Returns -1 when @carg is NULL, otherwise the value left in ret.
+ */
+int
+acl3_setacl_resume(void *carg)
+{
+    int ret = -1;
+    nfs3_call_state_t *cs = NULL;
+    nfsstat3 stat = NFS3ERR_SERVERFAULT;
+    nfs_user_t nfu = {
+        0,
+    };
+    dict_t *xattr = NULL;
+
+    if (!carg)
+        return ret;
+    cs = (nfs3_call_state_t *)carg;
+    /* May jump to acl3err; ret is still -1 in that case. */
+    acl3_check_fh_resolve_status(cs, stat, acl3err);
+    nfs_request_user_init(&nfu, cs->req);
+    xattr = dict_new();
+    if (xattr == NULL) {
+        gf_msg(GF_ACL, GF_LOG_ERROR, ENOMEM, NFS_MSG_GFID_DICT_CREATE_FAIL,
+               "dict allocation failed");
+        /* report the allocation failure instead of the initial -1 */
+        ret = -ENOMEM;
+        goto acl3err;
+    }
+
+    if (cs->aclcount) {
+        ret = dict_set_static_bin(xattr, POSIX_ACL_ACCESS_XATTR, cs->aclxattr,
+                                  posix_acl_xattr_size(cs->aclcount));
+        if (ret < 0) {
+            gf_msg(GF_ACL, GF_LOG_ERROR, -ret, NFS_MSG_DICT_SET_FAILED,
+                   "failed to add access ACL to dict");
+            goto unref;
+        }
+    }
+    if (cs->daclcount) {
+        ret = dict_set_static_bin(xattr, POSIX_ACL_DEFAULT_XATTR, cs->daclxattr,
+                                  posix_acl_xattr_size(cs->daclcount));
+        if (ret < 0) {
+            gf_msg(GF_ACL, GF_LOG_ERROR, -ret, NFS_MSG_DICT_SET_FAILED,
+                   "failed to add default ACL to dict");
+            goto unref;
+        }
+    }
+
+    ret = nfs_setxattr(cs->nfsx, cs->vol, &nfu, &cs->resolvedloc, xattr, 0,
+                       NULL, acl3_setacl_cbk, cs);
+unref:
+    dict_unref(xattr);
+
+acl3err:
+    if (ret < 0) {
+        /* NOTE(review): whatever the resolve-status macro stored in stat is
+         * overwritten here with -ret -- confirm this is intended. */
+        stat = -ret;
+        gf_msg(GF_ACL, GF_LOG_ERROR, stat, NFS_MSG_OPEN_FAIL,
+               "unable to open_and_resume");
+        cs->args.setaclreply.status = nfs3_errno_to_nfsstat3(stat);
+        acl3_setacl_reply(cs->req, &cs->args.setaclreply);
+        nfs3_call_state_wipe(cs);
+    }
+
+    return ret;
+}
+
+/* acl3svc_setacl: actor for the ACL3 SETACL procedure.
+ *
+ * Decodes the arguments into two temporary arrays of NFS_ACL_MAX_ENTRIES
+ * entries (access and default ACL), validates the mask and the file handle,
+ * converts both ACLs into the xattr buffers of the call state and hands over
+ * to acl3_setacl_resume() once the file handle is resolved.
+ *
+ * The temporary arrays are released on every path past their allocation.
+ *
+ * When ret is negative at the acl3err label, a reply carrying @stat is sent
+ * from here and 0 is returned. Paths that jump to rpcerr return ret without
+ * sending an ACL reply.
+ */
+int
+acl3svc_setacl(rpcsvc_request_t *req)
+{
+    xlator_t *vol = NULL;
+    struct nfs_state *nfs = NULL;
+    nfs3_state_t *nfs3 = NULL;
+    nfs3_call_state_t *cs = NULL;
+    int ret = RPCSVC_ACTOR_ERROR;
+    nfsstat3 stat = NFS3ERR_SERVERFAULT;
+    struct nfs3_fh fh;
+    struct nfs3_fh *fhp = NULL;
+    setaclargs setaclargs;
+    setaclreply setaclreply;
+    aclentry *daclentry = NULL;
+    aclentry *aclentry = NULL;
+    int aclerrno = 0;
+    int defacl = 1;
+
+    if (!req)
+        return ret;
+    /* Decode targets for the access and default ACL entries. */
+    aclentry = GF_CALLOC(NFS_ACL_MAX_ENTRIES, sizeof(*aclentry), gf_nfs_mt_arr);
+    if (!aclentry) {
+        goto rpcerr;
+    }
+    daclentry = GF_CALLOC(NFS_ACL_MAX_ENTRIES, sizeof(*daclentry),
+                          gf_nfs_mt_arr);
+    if (!daclentry) {
+        goto rpcerr;
+    }
+
+    acl3_validate_nfs3_state(req, nfs3, stat, rpcerr, ret);
+    nfs = nfs_state(nfs3->nfsx);
+    memset(&setaclargs, 0, sizeof(setaclargs));
+    memset(&setaclreply, 0, sizeof(setaclreply));
+    memset(&fh, 0, sizeof(fh));
+    /* Point the decoder at the caller-owned buffers. */
+    setaclargs.fh.n_bytes = (char *)&fh;
+    setaclargs.aclentry.aclentry_val = aclentry;
+    setaclargs.daclentry.daclentry_val = daclentry;
+    if (xdr_to_setaclargs(req->msg[0], &setaclargs) <= 0) {
+        gf_msg(GF_ACL, GF_LOG_ERROR, errno, NFS_MSG_ARGS_DECODE_ERROR,
+               "Error decoding args");
+        rpcsvc_request_seterr(req, GARBAGE_ARGS);
+        goto rpcerr;
+    }
+
+    /* Validate ACL mask */
+    if (setaclargs.mask & ~(NFS_ACL | NFS_ACLCNT | NFS_DFACL | NFS_DFACLCNT)) {
+        stat = NFS3ERR_INVAL;
+        goto acl3err;
+    }
+
+    fhp = &fh;
+    acl3_validate_gluster_fh(fhp, stat, acl3err);
+    acl3_map_fh_to_volume(nfs->nfs3state, fhp, req, vol, stat, acl3err);
+    acl3_volume_started_check(nfs3, vol, ret, rpcerr);
+    acl3_handle_call_state_init(nfs->nfs3state, cs, req, vol, stat, acl3err);
+
+    cs->vol = vol;
+    /* Counts come from the request; acl3_nfs_acl_to_xattr() range-checks
+     * them before anything is written. */
+    cs->aclcount = setaclargs.aclcount;
+    cs->daclcount = setaclargs.daclcount;
+
+    /* setfacl: NFS USER ACL */
+    aclerrno = acl3_nfs_acl_to_xattr(aclentry, cs->aclxattr, cs->aclcount,
+                                     !defacl);
+    if (aclerrno < 0) {
+        gf_msg(GF_ACL, GF_LOG_ERROR, -aclerrno, NFS_MSG_SET_USER_ACL_FAIL,
+               "Failed to set USER ACL");
+        stat = nfs3_errno_to_nfsstat3(-aclerrno);
+        goto acl3err;
+    }
+
+    /* setfacl: NFS DEFAULT ACL */
+    aclerrno = acl3_nfs_acl_to_xattr(daclentry, cs->daclxattr, cs->daclcount,
+                                     defacl);
+    if (aclerrno < 0) {
+        gf_msg(GF_ACL, GF_LOG_ERROR, -aclerrno, NFS_MSG_SET_DEF_ACL_FAIL,
+               "Failed to set DEFAULT ACL");
+        stat = nfs3_errno_to_nfsstat3(-aclerrno);
+        goto acl3err;
+    }
+
+    ret = nfs3_fh_resolve_and_resume(cs, fhp, NULL, acl3_setacl_resume);
+    stat = nfs3_errno_to_nfsstat3(-ret);
+
+acl3err:
+    /* ret is still RPCSVC_ACTOR_ERROR on the validation failures above. */
+    if (ret < 0) {
+        gf_msg(GF_ACL, GF_LOG_ERROR, -ret, NFS_MSG_RESOLVE_ERROR,
+               "unable to resolve and resume");
+        setaclreply.status = stat;
+        acl3_setacl_reply(req, &setaclreply);
+        nfs3_call_state_wipe(cs);
+        GF_FREE(aclentry);
+        GF_FREE(daclentry);
+        return 0;
+    }
+
+rpcerr:
+    /* Reached on success (fall-through) and on the early failures. */
+    if (ret < 0)
+        nfs3_call_state_wipe(cs);
+    if (aclentry)
+        GF_FREE(aclentry);
+    if (daclentry)
+        GF_FREE(daclentry);
+    return ret;
+}
+
+/* Procedure table of the ACL3 program: one entry per procedure, pairing the
+ * procedure name and number (ACL3_NULL, ACL3_GETACL, ACL3_SETACL) with its
+ * actor function defined in this file.
+ */
+static rpcsvc_actor_t acl3svc_actors[ACL3_PROC_COUNT] = {
+    {"NULL", acl3svc_null, NULL, ACL3_NULL, DRC_NA, 0},
+    {"GETACL", acl3svc_getacl, NULL, ACL3_GETACL, DRC_NA, 0},
+    {"SETACL", acl3svc_setacl, NULL, ACL3_SETACL, DRC_NA, 0},
+};
+
+/* RPC program description returned by acl3svc_init(). Its .private member
+ * is filled in by acl3svc_init() with the NFSv3 state.
+ */
+static rpcsvc_program_t acl3prog = {
+    .progname = "ACL3",
+    .prognum = ACL_PROGRAM,
+    .progver = ACLV3_VERSION,
+    .progport = GF_NFS3_PORT,
+    .actors = acl3svc_actors,
+    .numactors = ACL3_PROC_COUNT,
+    .min_auth = AUTH_NULL,
+};
+
+/* acl3svc_init: set up the ACL3 RPC program.
+ *
+ * @nfsx: the NFS xlator; its private data must already carry the NFSv3
+ *        state.
+ *
+ * Creates the "ACL" listeners on GF_ACL3_PORT using a temporary options
+ * dict. Initialization happens only once; later calls return the same
+ * program without creating listeners again.
+ *
+ * Returns the ACL3 program on success, NULL on failure.
+ */
+rpcsvc_program_t *
+acl3svc_init(xlator_t *nfsx)
+{
+    struct nfs3_state *ns = NULL;
+    struct nfs_state *nfs = NULL;
+    dict_t *options = NULL;
+    int ret = -1;
+    static gf_boolean_t acl3_inited = _gf_false;
+
+    /* Already inited */
+    if (acl3_inited)
+        return &acl3prog;
+
+    nfs = (struct nfs_state *)nfsx->private;
+
+    ns = nfs->nfs3state;
+    if (!ns) {
+        gf_msg(GF_ACL, GF_LOG_ERROR, EINVAL, NFS_MSG_ACL_INIT_FAIL,
+               "ACL3 init failed");
+        goto err;
+    }
+    acl3prog.private = ns;
+
+    options = dict_new();
+    if (options == NULL) {
+        gf_msg(GF_ACL, GF_LOG_ERROR, ENOMEM, NFS_MSG_GFID_DICT_CREATE_FAIL,
+               "dict allocation failed");
+        goto err;
+    }
+
+    ret = dict_set_str(options, "transport.socket.listen-port", GF_ACL3_PORT);
+    if (ret == -1) {
+        /* log this failure like every other dict_set_str() failure below */
+        gf_msg(GF_ACL, GF_LOG_ERROR, errno, NFS_MSG_DICT_SET_FAILED,
+               "dict_set_str error");
+        goto err;
+    }
+    ret = dict_set_str(options, "transport-type", "socket");
+    if (ret == -1) {
+        gf_msg(GF_ACL, GF_LOG_ERROR, errno, NFS_MSG_DICT_SET_FAILED,
+               "dict_set_str error");
+        goto err;
+    }
+
+    if (nfs->allow_insecure) {
+        ret = dict_set_str(options, "rpc-auth-allow-insecure", "on");
+        if (ret == -1) {
+            gf_msg(GF_ACL, GF_LOG_ERROR, errno, NFS_MSG_DICT_SET_FAILED,
+                   "dict_set_str error");
+            goto err;
+        }
+        ret = dict_set_str(options, "rpc-auth.ports.insecure", "on");
+        if (ret == -1) {
+            gf_msg(GF_ACL, GF_LOG_ERROR, errno, NFS_MSG_DICT_SET_FAILED,
+                   "dict_set_str error");
+            goto err;
+        }
+    }
+
+    ret = dict_set_str(options, "transport.address-family", "inet");
+    if (ret == -1) {
+        gf_msg(GF_ACL, GF_LOG_ERROR, errno, NFS_MSG_DICT_SET_FAILED,
+               "dict_set_str error");
+        goto err;
+    }
+
+    ret = rpcsvc_create_listeners(nfs->rpcsvc, options, "ACL");
+    if (ret == -1) {
+        gf_msg(GF_ACL, GF_LOG_ERROR, errno, NFS_MSG_LISTENERS_CREATE_FAIL,
+               "Unable to create listeners");
+        goto err;
+    }
+
+    /* options is known to be non-NULL on this path */
+    dict_unref(options);
+
+    acl3_inited = _gf_true;
+    return &acl3prog;
+err:
+    if (options)
+        dict_unref(options);
+    return NULL;
+}
+
+/* acl3_nfs_acl_to_xattr: convert NFS ACL entries into the POSIX ACL xattr
+ * layout (one header followed by @aclcount entries) inside @xattrbuf.
+ *
+ * For a default ACL the NFS_ACL_DEFAULT bit is cleared from every tag, as
+ * the backend file system does not understand it.
+ *
+ * Returns 0 on success (also for aclcount == 0, where nothing is written)
+ * and -EINVAL for NULL buffers, an out-of-range count, an unknown tag, or
+ * permission bits outside S_IRWXO. For MASK entries extra permission bits
+ * are dropped instead of rejected.
+ */
+static int
+acl3_nfs_acl_to_xattr(aclentry *ace,  /* ACL entries to be read */
+                      void *xattrbuf, /* XATTR buf to be populated */
+                      int aclcount,   /* No of ACLs to be read */
+                      int defacl)     /* 1 if DEFAULT ACL */
+{
+    int i = 0;
+    posix_acl_xattr_header *hdr = NULL;
+    posix_acl_xattr_entry *entries = NULL;
+    posix_acl_xattr_entry *dst = NULL;
+    aclentry *src = NULL;
+
+    if (!ace || !xattrbuf)
+        return -EINVAL;
+
+    /* ACL count is ZERO, nothing to do */
+    if (aclcount == 0)
+        return 0;
+
+    if (aclcount < 0 || aclcount > NFS_ACL_MAX_ENTRIES)
+        return -EINVAL;
+
+    hdr = (posix_acl_xattr_header *)xattrbuf;
+    entries = (posix_acl_xattr_entry *)(hdr + 1);
+
+    hdr->version = POSIX_ACL_XATTR_VERSION;
+
+    for (i = 0; i < aclcount; i++) {
+        src = &ace[i];
+        dst = &entries[i];
+
+        dst->tag = src->type;
+        if (defacl)
+            dst->tag &= ~NFS_ACL_DEFAULT;
+        dst->perm = src->perm;
+
+        switch (dst->tag) {
+            case POSIX_ACL_MASK:
+                /* Solaris sometimes sets additional bits in
+                 * the mask.
+                 */
+                dst->perm &= S_IRWXO;
+                dst->id = POSIX_ACL_UNDEFINED_ID;
+                break;
+            case POSIX_ACL_USER:
+            case POSIX_ACL_GROUP:
+                if (dst->perm & ~S_IRWXO)
+                    return -EINVAL;
+                dst->id = src->uid;
+                break;
+            case POSIX_ACL_USER_OBJ:
+            case POSIX_ACL_GROUP_OBJ:
+            case POSIX_ACL_OTHER:
+                if (dst->perm & ~S_IRWXO)
+                    return -EINVAL;
+                dst->id = POSIX_ACL_UNDEFINED_ID;
+                break;
+            default:
+                return -EINVAL;
+        }
+    }
+
+    /* SUCCESS */
+    return 0;
+}
+
+/* acl3_nfs_acl_from_xattr: convert a POSIX ACL xattr buffer into NFS ACL
+ * entries.
+ *
+ * The number of entries is derived from @bufsize. For a default ACL the
+ * NFS_ACL_DEFAULT bit is set on every entry type.
+ *
+ * Returns the number of entries on success, -EINVAL for NULL buffers, a
+ * count that is negative or above NFS_ACL_MAX_ENTRIES, or an unknown tag,
+ * and -ENOSYS for an unsupported xattr version.
+ */
+static int
+acl3_nfs_acl_from_xattr(aclentry *ace,  /* ACL entries to be filled */
+                        void *xattrbuf, /* XATTR buf to be read */
+                        int bufsize,    /* Size of XATTR buffer */
+                        int defacl)     /*  1 if DEFAULT ACL */
+{
+    int i = 0;
+    ssize_t nentries = 0;
+    posix_acl_xattr_header *hdr = NULL;
+    posix_acl_xattr_entry *entries = NULL;
+    posix_acl_xattr_entry *src = NULL;
+    aclentry *dst = NULL;
+
+    if (!xattrbuf || !ace)
+        return -EINVAL;
+
+    nentries = posix_acl_xattr_count(bufsize);
+    if (nentries < 0 || nentries > NFS_ACL_MAX_ENTRIES)
+        return -EINVAL;
+
+    hdr = (posix_acl_xattr_header *)xattrbuf;
+    entries = (posix_acl_xattr_entry *)(hdr + 1);
+
+    /* Check for supported POSIX ACL xattr version */
+    if (hdr->version != POSIX_ACL_XATTR_VERSION)
+        return -ENOSYS;
+
+    for (i = 0; i < (int)nentries; i++) {
+        src = &entries[i];
+        dst = &ace[i];
+
+        dst->type = src->tag;
+        if (defacl) {
+            /*
+             * SET the NFS_ACL_DEFAULT flag for default
+             * ACL which was masked OFF during setfacl().
+             */
+            dst->type |= NFS_ACL_DEFAULT;
+        }
+        dst->perm = (src->perm & S_IRWXO);
+
+        switch (src->tag) {
+            case POSIX_ACL_USER_OBJ:
+            case POSIX_ACL_GROUP_OBJ:
+            case POSIX_ACL_MASK:
+            case POSIX_ACL_OTHER:
+                dst->uid = POSIX_ACL_UNDEFINED_ID;
+                break;
+            case POSIX_ACL_USER:
+            case POSIX_ACL_GROUP:
+                dst->uid = src->id;
+                break;
+            default:
+                return -EINVAL;
+        }
+    }
+
+    /* SUCCESS: ACL count */
+    return nentries;
+}
diff --git a/xlators/nfs/server/src/acl3.h b/xlators/nfs/server/src/acl3.h
new file mode 100644
index 00000000000..762fbb04a0f
--- /dev/null
+++ b/xlators/nfs/server/src/acl3.h
@@ -0,0 +1,42 @@
+/*
+ * Copyright (c) 2012 Red Hat, Inc. <http://www.redhat.com>
+ * This file is part of GlusterFS.
+ *
+ * This file is licensed to you under your choice of the GNU Lesser
+ * General Public License, version 3 or any later version (LGPLv3 or
+ * later), or the GNU General Public License, version 2 (GPLv2), in all
+ * cases as published by the Free Software Foundation.
+ */
+
+#ifndef _ACL3_H
+#define _ACL3_H
+
+#include <glusterfs/glusterfs-acl.h>
+
+/* Procedure numbers of the ACL3 program; ACL3_PROC_COUNT sizes the actor
+ * table. */
+#define ACL3_NULL 0
+#define ACL3_GETACL 1
+#define ACL3_SETACL 2
+#define ACL3_PROC_COUNT 3
+
+/* Listen port of the ACL3 service, kept as a string because it is stored
+ * as a dict option value. */
+#define GF_ACL3_PORT "38469"
+/* Log domain used by the ACL3 code. */
+#define GF_ACL GF_NFS "-ACL"
+
+/* Flags for the getacl/setacl mask; requests with any other bit set are
+ * rejected with NFS3ERR_INVAL. */
+#define NFS_ACL 0x0001
+#define NFS_ACLCNT 0x0002
+#define NFS_DFACL 0x0004
+#define NFS_DFACLCNT 0x0008
+
+/*
+ * NFSv3, identifies the default ACL by NFS_ACL_DEFAULT. Gluster
+ * NFS needs to mask it OFF before sending it up to POSIX layer
+ * or File system layer.
+ */
+#define NFS_ACL_DEFAULT 0x1000
+
+/* Upper bound on the number of entries in one ACL; also the size of the
+ * arrays SETACL decodes into. */
+#define NFS_ACL_MAX_ENTRIES 1024
+
+/* Set up the ACL3 RPC program; returns NULL on failure. */
+rpcsvc_program_t *
+acl3svc_init(xlator_t *nfsx);
+
+#endif
diff --git a/xlators/nfs/server/src/auth-cache.c b/xlators/nfs/server/src/auth-cache.c
new file mode 100644
index 00000000000..ffbf5b6cad6
--- /dev/null
+++ b/xlators/nfs/server/src/auth-cache.c
@@ -0,0 +1,496 @@
+/*
+   Copyright 2014-present Facebook. All Rights Reserved
+   This file is part of GlusterFS.
+
+   Author :
+   Shreyas Siravara <shreyas.siravara@gmail.com>
+
+   This file is licensed to you under your choice of the GNU Lesser
+   General Public License, version 3 or any later version (LGPLv3 or
+   later), or the GNU General Public License, version 2 (GPLv2), in all
+   cases as published by the Free Software Foundation.
+*/
+
+#include <glusterfs/refcount.h>
+#include "auth-cache.h"
+#include "nfs3.h"
+#include "exports.h"
+#include "nfs-messages.h"
+
+/* Result codes of a cache lookup. ENTRY_FOUND is 0, so callers may also
+ * test the result against 0. */
+enum auth_cache_lookup_results {
+    ENTRY_FOUND = 0,      /* a live entry exists for the key */
+    ENTRY_NOT_FOUND = -1, /* no entry stored under the key */
+    ENTRY_EXPIRED = -2,   /* entry existed but was older than the TTL */
+};
+
+/* One cached authorization, stored in the cache dict wrapped in a data_t. */
+struct auth_cache_entry {
+    GF_REF_DECL;  /* refcounting */
+    data_t *data; /* data_unref() on refcount == 0 */
+
+    time_t timestamp;         /* creation time, compared against the TTL */
+    struct export_item *item; /* export item that authorized the access */
+};
+
+/* Build the cache key for a filehandle/host pair.
+ *
+ * The key has the form "<exportid>:<mountid>:<host>", with both ids in their
+ * unparsed UUID form. The caller owns the returned string and must release
+ * it with GF_FREE(). Returns NULL if the allocation fails.
+ */
+static char *
+make_hashkey(struct nfs3_fh *fh, const char *host)
+{
+    char export_str[256] = {
+        0,
+    };
+    char mount_str[256] = {
+        0,
+    };
+    char *key = NULL;
+    size_t keylen = 0;
+
+    gf_uuid_unparse(fh->exportid, export_str);
+    gf_uuid_unparse(fh->mountid, mount_str);
+
+    /* two ':' separators plus the terminating NUL */
+    keylen = strlen(export_str) + strlen(host) + strlen(mount_str) + 3;
+
+    key = GF_MALLOC(keylen, gf_common_mt_char);
+    if (key)
+        snprintf(key, keylen, "%s:%s:%s", export_str, mount_str, host);
+
+    return key;
+}
+
+/**
+ * auth_cache_init -- Allocate an auth cache with an empty dict
+ *
+ * @ttl_sec : Lifetime of the cache entries, in seconds
+ *
+ * @return : the new auth cache, or NULL if the cache or its dict could not
+ *           be allocated.
+ */
+struct auth_cache *
+auth_cache_init(time_t ttl_sec)
+{
+    struct auth_cache *cache = NULL;
+
+    cache = GF_CALLOC(1, sizeof(*cache), gf_nfs_mt_auth_cache);
+    GF_VALIDATE_OR_GOTO("auth-cache", cache, out);
+
+    cache->cache_dict = dict_new();
+    if (cache->cache_dict) {
+        LOCK_INIT(&cache->lock);
+        cache->ttl_sec = ttl_sec;
+    } else {
+        /* no dict, no cache */
+        GF_FREE(cache);
+        cache = NULL;
+    }
+out:
+    return cache;
+}
+
+/* auth_cache_entry_free -- called by refcounting subsystem on refcount == 0
+ *
+ * Releases the reference the entry holds on its data_t (if it has one) and
+ * frees the entry itself. An entry that never made it into the cache dict
+ * has no data_t; it is freed as well instead of being leaked.
+ *
+ * @to_free: auth_cache_entry that has refcount == 0 and needs to get free'd
+ */
+void
+auth_cache_entry_free(void *to_free)
+{
+    struct auth_cache_entry *entry = to_free;
+    data_t *entry_data = NULL;
+
+    GF_VALIDATE_OR_GOTO(GF_NFS, entry, out);
+
+    entry_data = entry->data;
+    if (entry_data) {
+        /* set data_t->data to NULL, otherwise data_unref() tries to free
+         * it */
+        entry_data->data = NULL;
+        data_unref(entry_data);
+    }
+
+    GF_FREE(entry);
+out:
+    return;
+}
+
+/**
+ * auth_cache_entry_init -- Initialize an auth cache entry
+ *
+ * The entry is zero-initialized and its refcounting is set up with
+ * auth_cache_entry_free() as release function.
+ *
+ * @return: Pointer to an allocated auth cache entry, NULL if allocation
+ *          failed.
+ */
+static struct auth_cache_entry *
+auth_cache_entry_init(void)
+{
+    struct auth_cache_entry *entry = NULL;
+
+    entry = GF_CALLOC(1, sizeof(*entry), gf_nfs_mt_auth_cache_entry);
+    if (!entry)
+        gf_msg(GF_NFS, GF_LOG_WARNING, ENOMEM, NFS_MSG_NO_MEMORY,
+               "failed to allocate entry");
+    else
+        GF_REF_INIT(entry, auth_cache_entry_free);
+
+    return entry;
+}
+
+/**
+ * auth_cache_add -- Add an auth_cache_entry to the cache->dict
+ *
+ * @cache: cache to add to
+ * @hashkey: key to store the entry under
+ * @entry: entry to store
+ *
+ * Takes its own reference on @entry for the dict; that reference is dropped
+ * again if wrapping the entry or storing it fails. The dict is modified
+ * under cache->lock.
+ *
+ * @return: 0 on success, non-zero otherwise.
+ */
+static int
+auth_cache_add(struct auth_cache *cache, char *hashkey,
+               struct auth_cache_entry *entry)
+{
+    int ret = -1;
+    data_t *entry_data = NULL;
+    int hashkey_len;
+    GF_VALIDATE_OR_GOTO(GF_NFS, cache, out);
+    GF_VALIDATE_OR_GOTO(GF_NFS, cache->cache_dict, out);
+
+    /* FIXME: entry is passed as parameter, this can never fail? */
+    entry = GF_REF_GET(entry);
+    if (!entry) {
+        /* entry does not have any references */
+        ret = -1;
+        goto out;
+    }
+
+    /* Wrap the entry so that it can be stored as a dict value. */
+    entry_data = bin_to_data(entry, sizeof(*entry));
+    if (!entry_data) {
+        ret = -1;
+        GF_REF_PUT(entry);
+        goto out;
+    }
+
+    /* we'll take an extra ref on the data_t, it gets unref'd when the
+     * auth_cache_entry is released */
+    entry->data = data_ref(entry_data);
+
+    /* NOTE(review): hashkey is not validated here; the visible caller
+     * passes a non-NULL key. */
+    hashkey_len = strlen(hashkey);
+    LOCK(&cache->lock);
+    {
+        ret = dict_setn(cache->cache_dict, hashkey, hashkey_len, entry_data);
+    }
+    UNLOCK(&cache->lock);
+
+    if (ret) {
+        /* adding to dict failed */
+        GF_REF_PUT(entry);
+    }
+out:
+    return ret;
+}
+
+/**
+ * _auth_cache_expired -- Tell whether an auth_cache_entry outlived the TTL
+ *
+ * The caller is expected to hold auth_cache->lock.
+ *
+ * @return: non-zero when the age of the entry exceeds cache->ttl_sec,
+ *          0 otherwise.
+ */
+static int
+_auth_cache_expired(struct auth_cache *cache, struct auth_cache_entry *entry)
+{
+    if ((gf_time() - entry->timestamp) > cache->ttl_sec)
+        return 1;
+
+    return 0;
+}
+
+/**
+ * auth_cache_get -- Get the @hashkey entry from the cache->cache_dict
+ *
+ * @cache: The auth_cache that should contain the @entry.
+ * @hashkey: The key associated with the auth_cache_entry.
+ * @entry: The found auth_cache_entry, unmodified if not found/expired.
+ *
+ * Using the cache->cache_dict requires locking, this function takes care of
+ * that. When the entry is found, but has expired, it will be removed from the
+ * cache_dict.
+ *
+ * On ENTRY_FOUND the returned entry carries a reference taken here; the
+ * caller has to drop it with GF_REF_PUT().
+ *
+ * @return: 0 when found, ENTRY_NOT_FOUND or ENTRY_EXPIRED otherwise.
+ */
+static enum auth_cache_lookup_results
+auth_cache_get(struct auth_cache *cache, char *hashkey,
+               struct auth_cache_entry **entry)
+{
+    enum auth_cache_lookup_results ret = ENTRY_NOT_FOUND;
+    data_t *entry_data = NULL;
+    struct auth_cache_entry *lookup_res = NULL;
+
+    GF_VALIDATE_OR_GOTO(GF_NFS, cache, out);
+    GF_VALIDATE_OR_GOTO(GF_NFS, cache->cache_dict, out);
+    GF_VALIDATE_OR_GOTO(GF_NFS, hashkey, out);
+
+    LOCK(&cache->lock);
+    {
+        entry_data = dict_get(cache->cache_dict, hashkey);
+        if (!entry_data)
+            goto unlock;
+
+        /* FIXME: this is dangerous use of entry_data */
+        lookup_res = GF_REF_GET((struct auth_cache_entry *)entry_data->data);
+        if (lookup_res == NULL) {
+            /* entry has been free'd */
+            ret = ENTRY_EXPIRED;
+            goto unlock;
+        }
+
+        if (_auth_cache_expired(cache, lookup_res)) {
+            ret = ENTRY_EXPIRED;
+            /* drop the export_item reference held by the entry */
+            GF_REF_PUT(lookup_res->item);
+            lookup_res->item = NULL;
+
+            /* free entry and remove from the cache */
+            /* NOTE(review): the entry is freed directly rather than through
+             * its refcount, although a reference was just taken above --
+             * confirm no other holder can still use it. */
+            GF_FREE(lookup_res);
+            /* detach the freed entry from the data_t before the dict drops
+             * it */
+            entry_data->data = NULL;
+            dict_del(cache->cache_dict, hashkey);
+
+            goto unlock;
+        }
+
+        *entry = lookup_res;
+        ret = ENTRY_FOUND;
+    }
+unlock:
+    UNLOCK(&cache->lock);
+
+out:
+    return ret;
+}
+
+/**
+ * auth_cache_lookup -- Lookup an item from the cache
+ *
+ * @cache: cache to lookup from
+ * @fh   : FH to use in lookup
+ * @host_addr: Address to use in lookup
+ * @timestamp: Set to the timestamp of the entry when lookup succeeds
+ * @can_write: Is the host authorized to write to the filehandle?
+ *
+ * If the current time - entry time of the cache entry > ttl_sec,
+ * we remove the element from the dict and return ENTRY_EXPIRED.
+ *
+ * @timestamp and @can_write are only written when the entry is found.
+ *
+ * @return: ENTRY_EXPIRED if entry expired
+ *          ENTRY_NOT_FOUND if entry not found in dict, or if one of the
+ *          arguments is NULL
+ *          -ENOMEM if the hashkey could not be allocated
+ *          0 if found
+ */
+enum auth_cache_lookup_results
+auth_cache_lookup(struct auth_cache *cache, struct nfs3_fh *fh,
+                  const char *host_addr, time_t *timestamp,
+                  gf_boolean_t *can_write)
+{
+    char *hashkey = NULL;
+    struct auth_cache_entry *lookup_res = NULL;
+    enum auth_cache_lookup_results ret = ENTRY_NOT_FOUND;
+
+    GF_VALIDATE_OR_GOTO(GF_NFS, cache, out);
+    GF_VALIDATE_OR_GOTO(GF_NFS, fh, out);
+    GF_VALIDATE_OR_GOTO(GF_NFS, host_addr, out);
+    GF_VALIDATE_OR_GOTO(GF_NFS, timestamp, out);
+    GF_VALIDATE_OR_GOTO(GF_NFS, can_write, out);
+
+    hashkey = make_hashkey(fh, host_addr);
+    if (!hashkey) {
+        /* not one of the enum values; callers only test for ENTRY_FOUND */
+        ret = -ENOMEM;
+        goto out;
+    }
+
+    ret = auth_cache_get(cache, hashkey, &lookup_res);
+    switch (ret) {
+        case ENTRY_FOUND:
+            *timestamp = lookup_res->timestamp;
+            *can_write = lookup_res->item->opts->rw;
+            /* release the reference taken by auth_cache_get() */
+            GF_REF_PUT(lookup_res);
+            break;
+
+        case ENTRY_NOT_FOUND:
+            gf_msg_debug(GF_NFS, 0, "could not find entry for %s", host_addr);
+            break;
+
+        case ENTRY_EXPIRED:
+            gf_msg_debug(GF_NFS, 0, "entry for host %s has expired", host_addr);
+            break;
+    }
+
+out:
+    GF_FREE(hashkey);
+
+    return ret;
+}
+
+/* auth_cache_entry_purge -- drop one reference of a cached entry
+ *
+ * dict_foreach() callback used by auth_cache_purge(). The auth_cache_entry
+ * stored in @v loses one reference; when that was the last one,
+ * auth_cache_entry_free() runs and calls data_unref() on the associated
+ * data_t. A data_t without an entry is skipped.
+ *
+ * @d: dict that gets purged by auth_cache_purge()
+ * @k: hashkey of the current entry
+ * @v: data_t of the current entry
+ *
+ * Always returns 0.
+ */
+int
+auth_cache_entry_purge(dict_t *d, char *k, data_t *v, void *_unused)
+{
+    struct auth_cache_entry *cached = (struct auth_cache_entry *)v->data;
+
+    if (!cached)
+        return 0;
+
+    GF_REF_PUT(cached);
+
+    return 0;
+}
+
+/**
+ * auth_cache_purge -- Purge the dict in the cache and create a new empty one.
+ *
+ * The dict is swapped under cache->lock; the old dict is walked and released
+ * outside of the lock. Nothing is changed when @cache is NULL or the new
+ * dict cannot be allocated.
+ *
+ * @cache: Cache to purge
+ *
+ */
+void
+auth_cache_purge(struct auth_cache *cache)
+{
+    dict_t *new_cache_dict = NULL;
+    dict_t *old_cache_dict = NULL;
+
+    /* check the argument before allocating, so that a NULL cache does not
+     * leak a freshly created dict */
+    if (!cache)
+        goto out;
+
+    new_cache_dict = dict_new();
+    if (!new_cache_dict)
+        goto out;
+
+    LOCK(&cache->lock);
+    {
+        old_cache_dict = cache->cache_dict;
+        cache->cache_dict = new_cache_dict;
+    }
+    UNLOCK(&cache->lock);
+
+    /* walk all entries and refcount-- with GF_REF_PUT() */
+    dict_foreach(old_cache_dict, auth_cache_entry_purge, NULL);
+    dict_unref(old_cache_dict);
+out:
+    return;
+}
+
+/**
+ * is_nfs_fh_cached -- Checks if an NFS FH is cached for the given host
+ *
+ * @cache: The fh cache
+ * @fh: The fh to use in lookup
+ * @host_addr: Address to use in lookup
+ *
+ *
+ * @return: TRUE if cached, FALSE otherwise (also when @fh is NULL)
+ *
+ */
+gf_boolean_t
+is_nfs_fh_cached(struct auth_cache *cache, struct nfs3_fh *fh,
+                 const char *host_addr)
+{
+    int lookup_ret = 0;
+    time_t entry_time = 0;
+    gf_boolean_t can_write = _gf_false;
+
+    if (!fh)
+        return _gf_false;
+
+    /* timestamp and write permission are fetched but not needed here */
+    lookup_ret = auth_cache_lookup(cache, fh, host_addr, &entry_time,
+                                   &can_write);
+    if (lookup_ret == ENTRY_FOUND)
+        return _gf_true;
+
+    return _gf_false;
+}
+
+/**
+ * is_nfs_fh_cached_and_writeable -- Checks if an NFS FH is cached for the given
+ *                                   host and the cached entry allows writing
+ * @cache: The fh cache
+ * @fh: The fh to use in lookup
+ * @host_addr: Address to use in lookup
+ *
+ *
+ * @return: TRUE if cached & writable, FALSE otherwise (also when @fh is
+ *          NULL)
+ *
+ */
+gf_boolean_t
+is_nfs_fh_cached_and_writeable(struct auth_cache *cache, struct nfs3_fh *fh,
+                               const char *host_addr)
+{
+    int lookup_ret = 0;
+    time_t entry_time = 0;
+    gf_boolean_t can_write = _gf_false;
+
+    if (!fh)
+        return _gf_false;
+
+    lookup_ret = auth_cache_lookup(cache, fh, host_addr, &entry_time,
+                                   &can_write);
+    if (lookup_ret != ENTRY_FOUND)
+        return _gf_false;
+
+    return can_write ? _gf_true : _gf_false;
+}
+
+/**
+ * cache_nfs_fh -- Places the nfs file handle in the underlying dict as we are
+ *                 using as our cache. The key is
+ *                 "exportid:mountid:host_addr", the value is an entry struct
+ *                 holding the creation timestamp and a reference to the
+ *                 export item that was authorized for the operation.
+ *
+ * @cache: The cache to place fh's in
+ * @fh   : The fh to cache
+ * @host_addr: The address of the host
+ * @export_item: The export item that was authorized
+ *
+ * @return: 0 when the fh is cached (newly or already), -EINVAL for NULL
+ *          arguments, -ENOMEM on allocation failure, or the non-zero result
+ *          of auth_cache_add().
+ */
+int
+cache_nfs_fh(struct auth_cache *cache, struct nfs3_fh *fh,
+             const char *host_addr, struct export_item *export_item)
+{
+    int ret = -EINVAL;
+    char *hashkey = NULL;
+    time_t timestamp = 0;
+    gf_boolean_t can_write = _gf_false;
+    struct auth_cache_entry *entry = NULL;
+
+    GF_VALIDATE_OR_GOTO(GF_NFS, host_addr, out);
+    GF_VALIDATE_OR_GOTO(GF_NFS, cache, out);
+    GF_VALIDATE_OR_GOTO(GF_NFS, fh, out);
+
+    /* If we could already find it in the cache, just return */
+    ret = auth_cache_lookup(cache, fh, host_addr, &timestamp, &can_write);
+    if (ret == 0) {
+        gf_msg_trace(GF_NFS, 0,
+                     "found cached auth/fh for host "
+                     "%s",
+                     host_addr);
+        goto out;
+    }
+
+    hashkey = make_hashkey(fh, host_addr);
+    if (!hashkey) {
+        ret = -ENOMEM;
+        goto out;
+    }
+
+    entry = auth_cache_entry_init();
+    if (!entry) {
+        ret = -ENOMEM;
+        goto out;
+    }
+
+    /* The entry is freshly zero-allocated, so it holds no previous item. */
+    entry->timestamp = gf_time();
+    entry->item = GF_REF_GET(export_item);
+
+    ret = auth_cache_add(cache, hashkey, entry);
+    if (ret && entry->item) {
+        /* The entry never reached the cache, and releasing the entry does
+         * not release its item: give back the reference taken above. */
+        GF_REF_PUT(entry->item);
+        entry->item = NULL;
+    }
+    /* drop the creation reference; the cache holds its own on success */
+    GF_REF_PUT(entry);
+    if (ret)
+        goto out;
+
+    gf_msg_trace(GF_NFS, 0, "Caching file-handle (%s)", host_addr);
+    ret = 0;
+
+out:
+    GF_FREE(hashkey);
+
+    return ret;
+}
diff --git a/xlators/nfs/server/src/auth-cache.h b/xlators/nfs/server/src/auth-cache.h
new file mode 100644
index 00000000000..4c9d09207f8
--- /dev/null
+++ b/xlators/nfs/server/src/auth-cache.h
@@ -0,0 +1,52 @@
+/*
+  Copyright 2014-present Facebook. All Rights Reserved
+
+  This file is part of GlusterFS.
+
+   Author :
+   Shreyas Siravara <shreyas.siravara@gmail.com>
+
+  This file is licensed to you under your choice of the GNU Lesser
+  General Public License, version 3 or any later version (LGPLv3 or
+  later), or the GNU General Public License, version 2 (GPLv2), in all
+  cases as published by the Free Software Foundation.
+*/
+
+#ifndef _AUTH_CACHE_H_
+#define _AUTH_CACHE_H_
+
+#include "nfs-mem-types.h"
+#include "exports.h"
+#include <glusterfs/dict.h>
+#include "nfs3.h"
+
+struct auth_cache {
+    gf_lock_t lock;     /* locking for the dict (and entries) */
+    dict_t *cache_dict; /* Dict holding fh -> authcache_entry; cache_nfs_fh()
+                         * builds the keys it adds from the file handle and
+                         * the host address (make_hashkey) */
+    time_t ttl_sec;     /* TTL of the auth cache in seconds; supplied to
+                         * auth_cache_init() */
+};
+
+/* Initializes the cache */
+struct auth_cache *
+auth_cache_init(time_t ttl_sec);
+
+/* Inserts FH into cache */
+int
+cache_nfs_fh(struct auth_cache *cache, struct nfs3_fh *fh,
+             const char *host_addr, struct export_item *export_item);
+
+/* Checks if the filehandle is cached and writable */
+gf_boolean_t
+is_nfs_fh_cached_and_writeable(struct auth_cache *cache, struct nfs3_fh *fh,
+                               const char *host_addr);
+
+/* Checks if the filehandle is cached */
+gf_boolean_t
+is_nfs_fh_cached(struct auth_cache *cache, struct nfs3_fh *fh,
+                 const char *host_addr);
+
+/* Purge the cache */
+void
+auth_cache_purge(struct auth_cache *cache);
+
+#endif /* _AUTH_CACHE_H_ */
diff --git a/xlators/nfs/server/src/exports.c b/xlators/nfs/server/src/exports.c
new file mode 100644
index 00000000000..d7e39934851
--- /dev/null
+++ b/xlators/nfs/server/src/exports.c
@@ -0,0 +1,1484 @@
+/*
+  Copyright 2014-present Facebook. All Rights Reserved
+
+  This file is part of GlusterFS.
+
+   Author :
+   Shreyas Siravara <shreyas.siravara@gmail.com>
+
+  This file is licensed to you under your choice of the GNU Lesser
+  General Public License, version 3 or any later version (LGPLv3 or
+  later), or the GNU General Public License, version 2 (GPLv2), in all
+  cases as published by the Free Software Foundation.
+*/
+
+#include "exports.h"
+#include <glusterfs/hashfn.h>
+#include <glusterfs/parse-utils.h>
+#include "nfs-messages.h"
+
+static void
+_exp_dict_destroy(dict_t *ng_dict); /* puts every export item held in a dict */
+static void
+_export_options_print(const struct export_options *opts); /* prints to stdout */
+static void
+_export_options_deinit(struct export_options *opts); /* frees opts + strings */
+static void
+_export_dir_deinit(struct export_dir *dir); /* frees dir, its name and dicts */
+
+static struct parser *netgroup_parser; /* from NETGROUP_REGEX_PATTERN */
+static struct parser *hostname_parser; /* from HOSTNAME_REGEX_PATTERN */
+static struct parser *options_parser;  /* from OPTIONS_REGEX_PATTERN */
+
+/**
+ * _exp_init_parsers -- Create the three file-scope parsers (netgroup,
+ *                      hostname and options) from their regex patterns.
+ *
+ * Initialization stops at the first parser that cannot be created; parsers
+ * created before the failure are left assigned.
+ *
+ * @return: success: 0
+ *          failure: -1
+ */
+static int
+_exp_init_parsers()
+{
+    netgroup_parser = parser_init(NETGROUP_REGEX_PATTERN);
+    if (netgroup_parser == NULL)
+        return -1;
+
+    hostname_parser = parser_init(HOSTNAME_REGEX_PATTERN);
+    if (hostname_parser == NULL)
+        return -1;
+
+    options_parser = parser_init(OPTIONS_REGEX_PATTERN);
+    if (options_parser == NULL)
+        return -1;
+
+    return 0;
+}
+
+/**
+ * _exp_deinit_parsers -- Hand each file-scope parser to parser_deinit(),
+ *                        in the order netgroup, hostname, options.
+ */
+static void
+_exp_deinit_parsers()
+{
+    struct parser *parsers[] = {netgroup_parser, hostname_parser,
+                                options_parser};
+    size_t idx = 0;
+
+    for (idx = 0; idx < sizeof(parsers) / sizeof(parsers[0]); idx++)
+        parser_deinit(parsers[idx]);
+}
+
+/**
+ * _exports_file_init -- Initialize an exports file structure.
+ *
+ * Allocates the structure together with its two dicts, exports_dict and
+ * exports_map. If either dict cannot be allocated, whatever was allocated
+ * is released again and NULL is returned.
+ *
+ * @return  : success: Pointer to an allocated exports file struct
+ *            failure: NULL
+ *
+ * Not for external use.
+ */
+struct exports_file *
+_exports_file_init()
+{
+    struct exports_file *file = NULL;
+
+    file = GF_CALLOC(1, sizeof(*file), gf_common_mt_nfs_exports);
+    if (!file) {
+        gf_msg(GF_EXP, GF_LOG_CRITICAL, ENOMEM, NFS_MSG_NO_MEMORY,
+               "Failed to allocate file struct!");
+        goto out;
+    }
+
+    file->exports_dict = dict_new();
+    file->exports_map = dict_new();
+    if (!file->exports_dict || !file->exports_map) {
+        gf_msg(GF_EXP, GF_LOG_CRITICAL, ENOMEM, NFS_MSG_NO_MEMORY,
+               "Failed to allocate dict!");
+        goto free_and_out;
+    }
+
+    goto out;
+
+free_and_out:
+    /* Only one of the two allocations may have failed, so release
+     * whichever dict does exist; otherwise the surviving dict leaks.
+     */
+    if (file->exports_dict)
+        dict_unref(file->exports_dict);
+    if (file->exports_map)
+        dict_unref(file->exports_map);
+
+    GF_FREE(file);
+    file = NULL;
+out:
+    return file;
+}
+
+/**
+ * _exp_file_dict_destroy -- dict_foreach() callback that tears down one
+ *                           entry of an exports file dict.
+ *
+ * If the value still carries an export directory, the directory is freed
+ * and the value's data pointer is cleared. The key is then deleted from
+ * the dict.
+ *
+ * @dict : Dict to free elements from
+ * @key  : Key in the dict we are on
+ * @val  : Value associated with that key
+ * @tmp  : Not used
+ *
+ * @return : always 0
+ *
+ * Not for external use.
+ */
+static int
+_exp_file_dict_destroy(dict_t *dict, char *key, data_t *val, void *tmp)
+{
+    GF_VALIDATE_OR_GOTO(GF_EXP, dict, done);
+
+    if (!val)
+        goto done;
+
+    if (val->data) {
+        _export_dir_deinit((struct export_dir *)val->data);
+        val->data = NULL;
+    }
+    dict_del(dict, key);
+
+done:
+    return 0;
+}
+
+/**
+ * exp_file_deinit -- Free memory used by an exports file
+ *
+ * Walks exports_dict and then exports_map (when present) with
+ * _exp_file_dict_destroy, drops a reference on each, and finally frees
+ * the filename and the structure itself. A NULL argument is a no-op.
+ *
+ * @expfile : Pointer to the exports file to free
+ *
+ * Externally usable.
+ */
+void
+exp_file_deinit(struct exports_file *expfile)
+{
+    dict_t *tables[2] = {NULL, NULL};
+    int idx = 0;
+
+    if (expfile == NULL)
+        return;
+
+    tables[0] = expfile->exports_dict;
+    tables[1] = expfile->exports_map;
+
+    for (idx = 0; idx < 2; idx++) {
+        if (tables[idx] == NULL)
+            continue;
+
+        dict_foreach(tables[idx], _exp_file_dict_destroy, NULL);
+        dict_unref(tables[idx]);
+    }
+
+    GF_FREE(expfile->filename);
+    GF_FREE(expfile);
+}
+
+/**
+ * _export_dir_init -- Allocate a zero-filled export directory structure.
+ *
+ * An allocation failure is logged at critical level.
+ *
+ * @return  : success: Pointer to an allocated exports directory struct
+ *            failure: NULL
+ *
+ * Not for external use.
+ */
+static struct export_dir *
+_export_dir_init()
+{
+    struct export_dir *new_dir = NULL;
+
+    new_dir = GF_CALLOC(1, sizeof(*new_dir), gf_common_mt_nfs_exports);
+    if (new_dir == NULL) {
+        gf_msg(GF_EXP, GF_LOG_CRITICAL, ENOMEM, NFS_MSG_NO_MEMORY,
+               "Failed to allocate export dir structure!");
+    }
+
+    return new_dir;
+}
+
+/**
+ * _export_dir_deinit -- Free memory used by an export dir
+ *
+ * Frees the directory name, releases every export item held in the
+ * netgroups and hosts dicts, drops the reference on each dict and then
+ * frees the structure itself.
+ *
+ * @dir : Pointer to the export directory to free
+ *
+ * Not for external use.
+ */
+static void
+_export_dir_deinit(struct export_dir *dir)
+{
+    GF_VALIDATE_OR_GOTO(GF_EXP, dir, out);
+    GF_FREE(dir->dir_name);
+
+    /* Either dict may be absent (the print walker in this file checks
+     * both for NULL before use), so only unref the dicts that exist
+     * instead of handing NULL to dict_unref().
+     */
+    if (dir->netgroups) {
+        _exp_dict_destroy(dir->netgroups);
+        dict_unref(dir->netgroups);
+    }
+
+    if (dir->hosts) {
+        _exp_dict_destroy(dir->hosts);
+        dict_unref(dir->hosts);
+    }
+
+    GF_FREE(dir);
+
+out:
+    return;
+}
+
+/**
+ * _export_item_print -- Write an export item to stdout: its name,
+ *                       immediately followed by its options.
+ *
+ * @item : Pointer to the item struct to print out.
+ *
+ * Not for external use.
+ */
+static void
+_export_item_print(const struct export_item *item)
+{
+    GF_VALIDATE_OR_GOTO(GF_EXP, item, done);
+
+    printf("%s", item->name);
+    _export_options_print(item->opts);
+
+done:
+    return;
+}
+
+/**
+ * _export_item_deinit -- Free an export item together with its options
+ *                        and its name. A NULL item is ignored.
+ *
+ * Registered as the release function of the item's refcount by
+ * _export_item_init().
+ *
+ * @item : Pointer to the export item to free
+ *
+ * Not for external use.
+ */
+static void
+_export_item_deinit(struct export_item *item)
+{
+    if (item != NULL) {
+        _export_options_deinit(item->opts);
+        GF_FREE(item->name);
+        GF_FREE(item);
+    }
+}
+
+/**
+ * _export_item_init -- Allocate a zero-filled export item and initialize
+ *                      its refcount with _export_item_deinit as the
+ *                      release function.
+ *
+ * @return  : success: Pointer to an allocated exports item struct
+ *            failure: NULL (the failure is logged)
+ *
+ * Not for external use.
+ */
+static struct export_item *
+_export_item_init()
+{
+    struct export_item *new_item = NULL;
+
+    new_item = GF_CALLOC(1, sizeof(*new_item), gf_common_mt_nfs_exports);
+    if (new_item == NULL) {
+        gf_msg(GF_EXP, GF_LOG_CRITICAL, ENOMEM, NFS_MSG_NO_MEMORY,
+               "Failed to allocate export item!");
+        return NULL;
+    }
+
+    GF_REF_INIT(new_item, _export_item_deinit);
+
+    return new_item;
+}
+
+/**
+ * _export_options_init -- Allocate a zero-filled export options struct.
+ *
+ * An allocation failure is logged at critical level.
+ *
+ * @return  : success: Pointer to an allocated options struct
+ *            failure: NULL
+ *
+ * Not for external use.
+ */
+static struct export_options *
+_export_options_init()
+{
+    struct export_options *new_opts = NULL;
+
+    new_opts = GF_CALLOC(1, sizeof(*new_opts), gf_common_mt_nfs_exports);
+    if (new_opts == NULL) {
+        gf_msg(GF_EXP, GF_LOG_CRITICAL, ENOMEM, NFS_MSG_NO_MEMORY,
+               "Failed to allocate options structure!");
+    }
+
+    return new_opts;
+}
+
+/**
+ * _export_options_deinit -- Free an options struct along with its
+ *                           anon_uid and sec_type strings. A NULL
+ *                           argument is ignored.
+ *
+ * @opts : Pointer to the options struct to free
+ *
+ * Not for external use.
+ */
+static void
+_export_options_deinit(struct export_options *opts)
+{
+    if (opts != NULL) {
+        GF_FREE(opts->anon_uid);
+        GF_FREE(opts->sec_type);
+        GF_FREE(opts);
+    }
+}
+
+/**
+ * _export_options_print -- Write the options to stdout in the form
+ *                          "(rw,nosuid,root,anonuid=<v>,sec=<v>,) ".
+ *
+ * Exactly one of "rw," / "ro," is always emitted; every other element is
+ * emitted only when the corresponding option is set.
+ *
+ * @opts : Pointer to the options struct to print out.
+ *
+ * Not for external use.
+ */
+static void
+_export_options_print(const struct export_options *opts)
+{
+    GF_VALIDATE_OR_GOTO(GF_EXP, opts, done);
+
+    printf("(");
+    printf("%s", opts->rw ? "rw," : "ro,");
+
+    if (opts->nosuid) {
+        printf("nosuid,");
+    }
+
+    if (opts->root) {
+        printf("root,");
+    }
+
+    if (opts->anon_uid) {
+        printf("anonuid=%s,", opts->anon_uid);
+    }
+
+    if (opts->sec_type) {
+        printf("sec=%s,", opts->sec_type);
+    }
+
+    printf(") ");
+
+done:
+    return;
+}
+
+/**
+ * __exp_dict_free_walk -- dict_foreach() callback that drops one reference
+ *                         on the export item stored in @val, clears the
+ *                         value's data pointer and deletes @key from @dict.
+ *
+ * @dict : Dict to free elements from
+ * @key  : Key in the dict we are on
+ * @val  : Value associated with that key
+ * @tmp  : Not used
+ *
+ * @return : always 0
+ *
+ * Not for external use.
+ */
+static int
+__exp_dict_free_walk(dict_t *dict, char *key, data_t *val, void *tmp)
+{
+    if (!val)
+        return 0;
+
+    GF_REF_PUT((struct export_item *)val->data);
+    val->data = NULL;
+    dict_del(dict, key);
+
+    return 0;
+}
+
+/**
+ * _exp_dict_destroy -- Run __exp_dict_free_walk over every entry of the
+ *                      dict. The dict itself is not unref'd here. A NULL
+ *                      dict is ignored.
+ *
+ * @ng_dict : Dict whose items should be released
+ *
+ * Not for external use.
+ */
+static void
+_exp_dict_destroy(dict_t *ng_dict)
+{
+    if (ng_dict != NULL)
+        dict_foreach(ng_dict, __exp_dict_free_walk, NULL);
+}
+
+/**
+ * exp_file_dir_from_uuid -- Using a uuid as the key, retrieve an exports
+ *                           directory from the file.
+ *
+ * The uuid is converted to its string form and looked up in the file's
+ * exports_map.
+ *
+ * @file: File to retrieve data from
+ * @export_uuid: UUID of the export (mountid in the NFS xlator)
+ *
+ * @return : success: Pointer to an export dir struct
+ *           failure: NULL (also when @file is NULL)
+ */
+struct export_dir *
+exp_file_dir_from_uuid(const struct exports_file *file,
+                       const uuid_t export_uuid)
+{
+    char export_uuid_str[512] = {
+        0,
+    };
+    data_t *dirdata = NULL;
+    struct export_dir *dir = NULL;
+
+    /* Externally callable: reject a NULL file instead of dereferencing it,
+     * as exp_file_print() does.
+     */
+    GF_VALIDATE_OR_GOTO(GF_EXP, file, out);
+
+    gf_uuid_unparse(export_uuid, export_uuid_str);
+
+    dirdata = dict_get(file->exports_map, export_uuid_str);
+    if (dirdata)
+        dir = (struct export_dir *)dirdata->data;
+
+out:
+    return dir;
+}
+
+/**
+ * _exp_file_insert -- Insert the exports directory into the file structure
+ *                     using the directory name as the dict key. Also hashes
+ *                     the dirname (without its leading slashes), stores the
+ *                     hash in a uuid type, converts the uuid type to a
+ *                     string and uses that as the key to the exports map.
+ *                     The exports map maps an export "uuid" to an export
+ *                     directory struct.
+ *
+ * The same data_t wrapper is stored in both dicts. Nothing is inserted when
+ * the wrapper cannot be allocated.
+ *
+ * @file : Exports file struct to insert into
+ * @dir  : Export directory to insert
+ *
+ * Not for external use.
+ */
+static void
+_exp_file_insert(struct exports_file *file, struct export_dir *dir)
+{
+    data_t *dirdata = NULL;
+    uint32_t hashedval = 0;
+    uuid_t export_uuid = {
+        0,
+    };
+    char export_uuid_str[512] = {
+        0,
+    };
+    char *relname = NULL;
+
+    GF_VALIDATE_OR_GOTO(GF_EXP, file, out);
+    GF_VALIDATE_OR_GOTO(GF_EXP, dir, out);
+
+    dirdata = bin_to_data(dir, sizeof(*dir));
+    if (!dirdata) {
+        gf_msg(GF_EXP, GF_LOG_CRITICAL, ENOMEM, NFS_MSG_NO_MEMORY,
+               "Failed to allocate dict data!");
+        goto out;
+    }
+    dict_set(file->exports_dict, dir->dir_name, dirdata);
+
+    /* The hash is computed over the name without leading slashes. The name
+     * is only read, so step over the slashes in place rather than working
+     * on a stack copy.
+     */
+    relname = dir->dir_name;
+    while (*relname == '/')
+        relname++;
+
+    hashedval = SuperFastHash(relname, strlen(relname));
+    memset(export_uuid, 0, sizeof(export_uuid));
+    memcpy(export_uuid, &hashedval, sizeof(hashedval));
+    gf_uuid_unparse(export_uuid, export_uuid_str);
+
+    dict_set(file->exports_map, export_uuid_str, dirdata);
+out:
+    return;
+}
+
+/**
+ * __exp_item_print_walk -- dict_foreach() callback that prints the export
+ *                          item stored in @val.
+ *
+ * @dict : the dict to walk (not used)
+ * @key  : the key in the dict we are currently on (not used)
+ * @val  : the value in the dict associated with the key
+ * @tmp  : Additional parameter data (not used)
+ *
+ * @return : always 0
+ *
+ * Not for external use.
+ */
+static int
+__exp_item_print_walk(dict_t *dict, char *key, data_t *val, void *tmp)
+{
+    if (!val)
+        return 0;
+
+    _export_item_print((struct export_item *)val->data);
+
+    return 0;
+}
+
+/**
+ * __exp_file_print_walk -- dict_foreach() callback that prints one export
+ *                          directory on a single line: the key, then every
+ *                          netgroup item, then every host item.
+ *
+ * @dict : the dict to walk (not used)
+ * @key  : the key in the dict we are currently on
+ * @val  : the value in the dict associated with the key
+ * @tmp  : Additional parameter data (not used)
+ *
+ * @return : always 0
+ *
+ * Not for external use.
+ */
+static int
+__exp_file_print_walk(dict_t *dict, char *key, data_t *val, void *tmp)
+{
+    struct export_dir *expdir = NULL;
+
+    if (!val)
+        return 0;
+
+    expdir = (struct export_dir *)val->data;
+
+    printf("%s ", key);
+
+    if (expdir->netgroups) {
+        dict_foreach(expdir->netgroups, __exp_item_print_walk, NULL);
+    }
+
+    if (expdir->hosts) {
+        dict_foreach(expdir->hosts, __exp_item_print_walk, NULL);
+    }
+
+    printf("\n");
+
+    return 0;
+}
+
+/**
+ * exp_file_print -- Print every directory stored in the file's
+ *                   exports_dict to stdout, one per line.
+ *
+ * @file : Exports file to print
+ */
+void
+exp_file_print(const struct exports_file *file)
+{
+    GF_VALIDATE_OR_GOTO(GF_EXP, file, done);
+
+    dict_foreach(file->exports_dict, __exp_file_print_walk, NULL);
+
+done:
+    return;
+}
+
+/*
+ * Point @val at the character that follows @equals. When nothing follows
+ * (the next character is the terminating NUL), set @ret to 1 and jump to
+ * @errlabel. Note that 1 is numerically GF_EXP_PARSE_ITEM_NOT_FOUND in the
+ * enum below.
+ */
+#define __exp_line_get_opt_val(val, equals, ret, errlabel)                     \
+    do {                                                                       \
+        (val) = (equals) + 1;                                                  \
+        if (!(*(val))) {                                                       \
+            (ret) = 1;                                                         \
+            goto errlabel;                                                     \
+        }                                                                      \
+    } while (0)
+
+/* Non-negative status codes returned by the export line parsing helpers in
+ * this file; negative errno values are returned alongside them for bad
+ * arguments and allocation failures.
+ */
+enum gf_exp_parse_status {
+    GF_EXP_PARSE_SUCCESS = 0,
+    GF_EXP_PARSE_ITEM_NOT_FOUND = 1,
+    GF_EXP_PARSE_ITEM_FAILURE = 2,
+    GF_EXP_PARSE_ITEM_NOT_IN_MOUNT_STATE = 3,
+    GF_EXP_PARSE_LINE_IGNORING = 4,
+};
+
+/**
+ * __exp_line_opt_key_value_parse -- Parse the key-value options in the options
+ *                                   string.
+ *
+ * Given a string like (sec=sys,anonuid=0,rw), to parse, this function
+ * will get called once with 'sec=sys' and again with 'anonuid=0'.
+ * It will check for the '=', make sure there is data to be read
+ * after the '=' and copy the data into the options struct. Only the keys
+ * "anonuid" and "sec" are recognized. If a key is given more than once,
+ * the value stored earlier is freed and replaced.
+ *
+ * @option    : An option string like sec=sys or anonuid=0. It is modified
+ *              temporarily while the key is compared and restored before
+ *              returning.
+ * @opts      : Pointer to an struct export_options that holds all the export
+ *              options.
+ *
+ * @return: success: GF_EXP_PARSE_SUCCESS
+ *          failure: GF_EXP_PARSE_ITEM_FAILURE when there is no '=' or the
+ *                   key is not recognized,
+ *                   1 (set by __exp_line_get_opt_val) when nothing follows
+ *                   the '=',
+ *                   -EINVAL on bad args, -ENOMEM on allocation errors.
+ *
+ * Not for external use.
+ */
+static int
+__exp_line_opt_key_value_parse(char *option, struct export_options *opts)
+{
+    char *equals = NULL;
+    char *right = NULL;
+    char *value = NULL;
+    char **slot = NULL;
+    int ret = -EINVAL;
+
+    GF_VALIDATE_OR_GOTO(GF_EXP, option, out);
+    GF_VALIDATE_OR_GOTO(GF_EXP, opts, out);
+
+    equals = strchr(option, '=');
+    if (!equals) {
+        ret = GF_EXP_PARSE_ITEM_FAILURE;
+        goto out;
+    }
+
+    /* Terminate the string at the '=' just long enough to compare the key
+     * on its left side, then put the '=' back.
+     */
+    *equals = 0;
+    if (strcmp(option, "anonuid") == 0)
+        slot = &opts->anon_uid;
+    else if (strcmp(option, "sec") == 0)
+        slot = &opts->sec_type;
+    *equals = '=';
+
+    if (!slot) {
+        ret = GF_EXP_PARSE_ITEM_FAILURE;
+        goto out;
+    }
+
+    /* Get the value for this option */
+    __exp_line_get_opt_val(right, equals, ret, out);
+    value = gf_strdup(right);
+    GF_CHECK_ALLOC(value, ret, out);
+
+    /* Release a value stored by an earlier occurrence of the same key so
+     * that repeating an option does not leak the previous string.
+     */
+    GF_FREE(*slot);
+    *slot = value;
+
+    ret = GF_EXP_PARSE_SUCCESS;
+out:
+    return ret;
+}
+
+/**
+ * __exp_line_opt_parse -- Parse the options part of an
+ *                          exports or netgroups string.
+ *
+ * Every match produced by the options parser is handled in turn: the
+ * boolean options "root", "ro", "rw" and "nosuid" set the matching flag,
+ * anything containing an '=' is handed to __exp_line_opt_key_value_parse,
+ * and everything else is logged as a warning and skipped.
+ *
+ * @opt_str     : The option string to parse
+ * @exp_opts    : Double pointer to the options we are going
+ *                to allocate and setup. Only written on success.
+ *
+ *
+ * @return: success: GF_EXP_PARSE_SUCCESS
+ *          failure: GF_EXP_PARSE_ITEM_FAILURE when the parser produced no
+ *                   match at all, -ENOMEM on allocation errors, or the
+ *                   negative value returned by parser_set_string or by
+ *                   __exp_line_opt_key_value_parse.
+ *
+ * Not for external use.
+ */
+static int
+__exp_line_opt_parse(const char *opt_str, struct export_options **exp_opts)
+{
+    struct export_options *opts = NULL;
+    char *strmatch = NULL;
+    int ret = -EINVAL;
+    char *equals = NULL;
+
+    ret = parser_set_string(options_parser, opt_str);
+    if (ret < 0)
+        goto out;
+
+    while ((strmatch = parser_get_next_match(options_parser))) {
+        if (!opts) {
+            /* If the options have not been allocated,
+             * allocate it.
+             */
+            opts = _export_options_init();
+            if (!opts) {
+                ret = -ENOMEM;
+                parser_unset_string(options_parser);
+                GF_FREE(strmatch);
+                goto out;
+            }
+        }
+
+        /* First, check for all the boolean options. Second, check for
+         * an '=', and check the available options there. The string
+         * parsing here gets slightly messy, but the concept itself
+         * is pretty simple.
+         */
+        equals = strchr(strmatch, '=');
+        if (strcmp(strmatch, "root") == 0)
+            opts->root = _gf_true;
+        else if (strcmp(strmatch, "ro") == 0)
+            opts->rw = _gf_false;
+        else if (strcmp(strmatch, "rw") == 0)
+            opts->rw = _gf_true;
+        else if (strcmp(strmatch, "nosuid") == 0)
+            opts->nosuid = _gf_true;
+        else if (equals) {
+            ret = __exp_line_opt_key_value_parse(strmatch, opts);
+            if (ret < 0) {
+                /* Only negative returns (bad arguments or a failed
+                 * allocation) end up here and get bubbled up to the
+                 * caller.
+                 * NOTE(review): the positive parse-status codes the
+                 * helper returns for an unrecognized key or an empty
+                 * value do not satisfy this check, so such options are
+                 * skipped without any message -- confirm this is
+                 * intended.
+                 */
+                GF_FREE(strmatch);
+                parser_unset_string(options_parser);
+                _export_options_deinit(opts);
+                goto out;
+            }
+        } else {
+            /* Cannot change to gf_msg.
+             * gf_msg not giving output to STDOUT
+             * Bug id : BZ1215017
+             */
+            gf_log(GF_EXP, GF_LOG_WARNING,
+                   "Could not find any valid options for "
+                   "string: %s",
+                   strmatch);
+        }
+        GF_FREE(strmatch);
+    }
+
+    if (!opts) {
+        /* If opts is not allocated
+         * that means no matches were found
+         * which is a parse error. Not marking
+         * it as "not found" because it is a parse
+         * error to not have options.
+         */
+        ret = GF_EXP_PARSE_ITEM_FAILURE;
+        parser_unset_string(options_parser);
+        goto out;
+    }
+
+    *exp_opts = opts;
+    parser_unset_string(options_parser);
+    ret = GF_EXP_PARSE_SUCCESS;
+out:
+    return ret;
+}
+
+/**
+ * __exp_line_ng_host_str_parse -- Parse the netgroup or host string
+ *
+ *      e.g. @mygroup(<options>), parsing @mygroup and (<options>)
+ *      or   myhost001.dom(<options>), parsing myhost001.dom and (<options>)
+ *
+ * @str       : The netgroup/host string to parse. It is terminated at the
+ *              '(' while the name is copied; the '(' is restored before
+ *              the options are parsed.
+ * @exp_item  : Double pointer to a struct export_item. Only written on
+ *              success.
+ *
+ * @return: success: GF_EXP_PARSE_SUCCESS
+ *          failure: GF_EXP_PARSE_ITEM_FAILURE on parse failure (no '(' or a
+ *                   name longer than FQDN_MAX_LEN),
+ *                   -EINVAL on bad args, -ENOMEM on allocation errors, or
+ *                   whatever non-zero value __exp_line_opt_parse returned.
+ *
+ * Not for external use.
+ */
+static int
+__exp_line_ng_host_str_parse(char *str, struct export_item **exp_item)
+{
+    struct export_item *item = NULL;
+    int ret = -EINVAL;
+    char *parens = NULL;
+    char *optstr = NULL;
+    struct export_options *exp_opts = NULL;
+    char *item_name = NULL;
+
+    GF_VALIDATE_OR_GOTO(GF_EXP, str, out);
+    GF_VALIDATE_OR_GOTO(GF_EXP, exp_item, out);
+
+    /* A netgroup/host string looks like this:
+     * @test(sec=sys,rw,anonuid=0) or host(sec=sys,rw,anonuid=0)
+     * We want to extract the name, '@test' or 'host'.
+     * Again, we could set up a regex and use it here,
+     * but it's simpler to find the '(' and copy until
+     * there.
+     */
+    parens = strchr(str, '(');
+    if (!parens) {
+        /* Parse error if there are no parens. */
+        ret = GF_EXP_PARSE_ITEM_FAILURE;
+        goto out;
+    }
+
+    *parens = '\0'; /* Temporarily terminate it so we can do a copy */
+
+    /* The length is measured before leading whitespace is stripped. On
+     * this exit (and on a failed name copy below) the '(' is not put back.
+     */
+    if (strlen(str) > FQDN_MAX_LEN) {
+        ret = GF_EXP_PARSE_ITEM_FAILURE;
+        goto out;
+    }
+
+    /* Strip leading whitespaces */
+    while (*str == ' ' || *str == '\t')
+        str++;
+
+    item_name = gf_strdup(str);
+    GF_CHECK_ALLOC(item_name, ret, out);
+
+    gf_msg_trace(GF_EXP, 0, "found hostname/netgroup: %s", item_name);
+
+    /* Initialize an export item for this; from here on the item holds the
+     * copied name.
+     */
+    item = _export_item_init();
+    GF_CHECK_ALLOC(item, ret, free_and_out);
+    item->name = item_name;
+
+    *parens = '('; /* Restore the string */
+
+    /* Options start at the parentheses */
+    optstr = parens;
+
+    ret = __exp_line_opt_parse(optstr, &exp_opts);
+    if (ret != 0) {
+        /* Bubble up the error to the caller. Putting the item releases
+         * it through _export_item_deinit, which also frees the name.
+         */
+        GF_REF_PUT(item);
+        goto out;
+    }
+
+    item->opts = exp_opts;
+
+    *exp_item = item;
+
+    ret = GF_EXP_PARSE_SUCCESS;
+    goto out;
+
+free_and_out:
+    GF_FREE(item_name);
+out:
+    return ret;
+}
+
+/**
+ * __exp_line_ng_parse -- Extract the netgroups in the line
+ *                        and call helper functions to parse
+ *                        the string.
+ *
+ * The call chain goes like this:
+ *
+ * 1) __exp_line_ng_parse ("/test  @test(sec=sys,rw,anonuid=0)")
+ * 2) __exp_line_ng_host_str_parse ("@test(sec=sys,rw,anonuid=0)");
+ * 3) __exp_line_opt_parse("(sec=sys,rw,anonuid=0)");
+ *
+ *
+ * @line    : The line to parse
+ * @ng_dict : Double pointer to the dict we want to
+ *            insert netgroups into. Set to NULL unless parsing succeeds.
+ *
+ * Allocates the dict, extracts netgroup strings from the line,
+ * parses them into a struct export_item structure and inserts
+ * them in the dict. If any netgroup fails to parse, the items collected
+ * so far and the dict itself are released.
+ *
+ * @return: success: GF_EXP_PARSE_SUCCESS
+ *          failure: GF_EXP_PARSE_ITEM_FAILURE on parse failure,
+ *                   GF_EXP_PARSE_ITEM_NOT_FOUND if the netgroup was not found
+ *                   -EINVAL on bad args, -ENOMEM on allocation errors.
+ *
+ * Not for external use.
+ */
+static int
+__exp_line_ng_parse(const char *line, dict_t **ng_dict)
+{
+    dict_t *netgroups = NULL;
+    char *strmatch = NULL;
+    int ret = -EINVAL;
+    struct export_item *exp_ng = NULL;
+    data_t *ngdata = NULL;
+
+    GF_VALIDATE_OR_GOTO(GF_EXP, line, out);
+    GF_VALIDATE_OR_GOTO(GF_EXP, ng_dict, out);
+
+    *ng_dict = NULL; /* Will be set if parse is successful */
+
+    /* Initialize a parser with the line to parse
+     * and the regex used to parse it.
+     */
+    ret = parser_set_string(netgroup_parser, line);
+    if (ret < 0) {
+        goto out;
+    }
+
+    gf_msg_trace(GF_EXP, 0, "parsing line: %s", line);
+
+    while ((strmatch = parser_get_next_match(netgroup_parser))) {
+        if (!netgroups) {
+            /* Allocate a new dict to store the netgroups. */
+            netgroups = dict_new();
+            if (!netgroups) {
+                ret = -ENOMEM;
+                goto free_and_out;
+            }
+        }
+
+        gf_msg_trace(GF_EXP, 0, "parsing netgroup: %s", strmatch);
+
+        ret = __exp_line_ng_host_str_parse(strmatch, &exp_ng);
+
+        if (ret != 0) {
+            /* Parsing or other critical errors.
+             * caller will handle return value.
+             * Release the items gathered so far and the dict itself;
+             * the dict is never handed to the caller on this path, so
+             * it would leak otherwise.
+             */
+            _exp_dict_destroy(netgroups);
+            dict_unref(netgroups);
+            netgroups = NULL;
+            goto free_and_out;
+        }
+
+        ngdata = bin_to_data(exp_ng, sizeof(*exp_ng));
+        dict_set(netgroups, exp_ng->name, ngdata);
+
+        /* Free this matched string and continue parsing. */
+        GF_FREE(strmatch);
+    }
+
+    /* If the netgroups dict was not allocated, then we know that
+     * no matches were found.
+     */
+    if (!netgroups) {
+        ret = GF_EXP_PARSE_ITEM_NOT_FOUND;
+        parser_unset_string(netgroup_parser);
+        goto out;
+    }
+
+    ret = GF_EXP_PARSE_SUCCESS;
+    *ng_dict = netgroups;
+
+free_and_out:
+    /* strmatch is NULL here after a complete loop and still holds the
+     * current match when an error jumped out of the loop.
+     */
+    parser_unset_string(netgroup_parser);
+    GF_FREE(strmatch);
+out:
+    return ret;
+}
+
+/**
+ * __exp_line_host_parse -- Extract the hosts in the line
+ *                          and call helper functions to parse
+ *                          the string.
+ *
+ * The call chain goes like this:
+ *
+ * 1) __exp_line_host_parse ("/test  hostip(sec=sys,rw,anonuid=0)")
+ * 2) __exp_line_ng_host_str_parse ("hostip(sec=sys,rw,anonuid=0)");
+ * 3) __exp_line_opt_parse("(sec=sys,rw,anonuid=0)");
+ *
+ *
+ * @line      : The line to parse
+ * @host_dict : Double pointer to the dict we want to
+ *              insert hosts into. Set to NULL unless parsing succeeds.
+ *
+ * Allocates the dict, extracts host strings from the line,
+ * parses them into a struct export_item structure and inserts
+ * them in the dict. If any host fails to parse, the items collected so
+ * far and the dict itself are released.
+ *
+ * @return: success: GF_EXP_PARSE_SUCCESS
+ *          failure: GF_EXP_PARSE_ITEM_FAILURE on parse failure,
+ *                   GF_EXP_PARSE_ITEM_NOT_FOUND if the host was not found,
+ *                   -EINVAL on bad args, -ENOMEM on allocation errors.
+ *
+ * Not for external use.
+ */
+static int
+__exp_line_host_parse(const char *line, dict_t **host_dict)
+{
+    dict_t *hosts = NULL;
+    char *strmatch = NULL;
+    int ret = -EINVAL;
+    struct export_item *exp_host = NULL;
+    data_t *hostdata = NULL;
+
+    GF_VALIDATE_OR_GOTO(GF_EXP, line, out);
+    GF_VALIDATE_OR_GOTO(GF_EXP, host_dict, out);
+
+    *host_dict = NULL;
+
+    /* Initialize a parser with the line to parse and the regex used to
+     * parse it.
+     */
+    ret = parser_set_string(hostname_parser, line);
+    if (ret < 0) {
+        goto out;
+    }
+
+    gf_msg_trace(GF_EXP, 0, "parsing line: %s", line);
+
+    while ((strmatch = parser_get_next_match(hostname_parser))) {
+        if (!hosts) {
+            /* Allocate a new dict to store the hosts. */
+            hosts = dict_new();
+            GF_CHECK_ALLOC(hosts, ret, free_and_out);
+        }
+
+        gf_msg_trace(GF_EXP, 0, "parsing hostname: %s", strmatch);
+
+        ret = __exp_line_ng_host_str_parse(strmatch, &exp_host);
+
+        if (ret != 0) {
+            /* Parsing or other critical error, free allocated
+             * memory and exit. The caller will handle the errors.
+             * The dict is never handed to the caller on this path, so
+             * drop it along with its items; it would leak otherwise.
+             */
+            _exp_dict_destroy(hosts);
+            dict_unref(hosts);
+            hosts = NULL;
+            goto free_and_out;
+        }
+
+        /* Insert export item structure into the hosts dict. */
+        hostdata = bin_to_data(exp_host, sizeof(*exp_host));
+        dict_set(hosts, exp_host->name, hostdata);
+
+        /* Free this matched string and continue parsing.*/
+        GF_FREE(strmatch);
+    }
+
+    /* If the hosts dict was not allocated, then we know that
+     * no matches were found.
+     */
+    if (!hosts) {
+        ret = GF_EXP_PARSE_ITEM_NOT_FOUND;
+        parser_unset_string(hostname_parser);
+        goto out;
+    }
+
+    ret = GF_EXP_PARSE_SUCCESS;
+    *host_dict = hosts;
+
+free_and_out:
+    /* strmatch is NULL here after a complete loop and still holds the
+     * current match when an error jumped out of the loop.
+     */
+    parser_unset_string(hostname_parser);
+    GF_FREE(strmatch);
+out:
+    return ret;
+}
+
+/**
+ * __exp_line_dir_parse -- Extract directory name from a line in the exports
+ *                         file.
+ *
+ * @line    : The line to parse
+ * @dirname : Double pointer to the string we need to hold the directory name.
+ *            It is only written on success, in which case it points to a
+ *            valid memory region that is allocated by this function.
+ * @ms      : If this is set then we cross check the directory name with the
+ *            exports in this mount state and reject it if there is no
+ *            match. Pass NULL to skip the check.
+ *
+ * @return : success: GF_EXP_PARSE_SUCCESS
+ *           failure: -EINVAL on bad arguments or when the name is longer
+ *           than DIR_MAX_LEN, -ENOMEM on allocation failures,
+ *           GF_EXP_PARSE_ITEM_NOT_IN_MOUNT_STATE if we failed to match
+ *           with gluster's mountstate.
+ *
+ * The caller is responsible for freeing memory allocated by this function
+ *
+ * Not for external use.
+ */
+static int
+__exp_line_dir_parse(const char *line, char **dirname, struct mount3_state *ms)
+{
+    char *dir = NULL;
+    char *delim = NULL;
+    int ret = -EINVAL;
+    char *linedup = NULL;
+    struct mnt3_export *mnt3export = NULL;
+    size_t dirlen = 0;
+
+    GF_VALIDATE_OR_GOTO(GF_EXP, line, out);
+    GF_VALIDATE_OR_GOTO(GF_EXP, dirname, out);
+
+    /* Duplicate the line because we don't
+     * want to modify the original string.
+     * strdupa () places the copy on the stack, so it is not freed here.
+     */
+    linedup = strdupa(line);
+
+    /* The directory is everything up to the first space/tab; strcspn ()
+     * finds that delimiter and the copy is terminated there. We only
+     * need this first field of the line.
+     * It is worth noting that dirnames always have to be
+     * validated against gluster's vol files so if they don't
+     * match it will be rejected.
+     */
+    dir = linedup;
+    delim = linedup + strcspn(linedup, " \t");
+    *delim = 0;
+
+    if (ms) {
+        /* Match the directory name with an existing
+         * export in the mount state.
+         */
+        mnt3export = mnt3_mntpath_to_export(ms, dir, _gf_true);
+        if (!mnt3export) {
+            gf_msg_debug(GF_EXP, 0,
+                         "%s not in mount state, "
+                         "ignoring!",
+                         dir);
+            ret = GF_EXP_PARSE_ITEM_NOT_IN_MOUNT_STATE;
+            goto out;
+        }
+    }
+
+    /* Directory names are limited to DIR_MAX_LEN bytes,
+     * check that the argument provided adheres to
+     * that restriction.
+     */
+    if (strlen(dir) > DIR_MAX_LEN) {
+        ret = -EINVAL;
+        goto out;
+    }
+
+    /* Copy the result of the split onto the heap */
+    dir = gf_strdup(dir);
+    GF_CHECK_ALLOC(dir, ret, out);
+
+    /* Ensure that a trailing slash is stripped before storing the name
+     * (at most one is removed).
+     */
+    dirlen = strlen(dir);
+    if (dirlen > 0 && dir[dirlen - 1] == '/')
+        dir[dirlen - 1] = '\0';
+
+    /* Set the argument to point to the allocated string */
+    *dirname = dir;
+    ret = GF_EXP_PARSE_SUCCESS;
+out:
+    return ret;
+}
+
+/**
+ * _exp_line_parse -- Parse a line in an exports file into a structure
+ *                    that holds all the parts of the line. An exports
+ *                    structure has a dict of netgroups and a dict of hosts.
+ *
+ * An export line looks something like this /test  @test(sec=sys,rw,anonuid=0)
+ * or /test  @test(sec=sys,rw,anonuid=0) hostA(sec=sys,rw,anonuid=0), etc.
+ *
+ * The line is split into three separate pieces:
+ * 1) The directory (the leading token, see __exp_line_dir_parse ())
+ * 2) The netgroup if it exists (exports.h -- NETGROUP_REGEX_PATTERN)
+ * 3) The host if it exists (exports.h -- HOSTNAME_REGEX_PATTERN)
+ *
+ * In this case, the netgroup would be @test(sec=sys,rw,anonuid=0)
+ * and the host would be hostA(sec=sys,rw,anonuid=0).
+ *
+ * @line        : The line to parse
+ * @dir         : Double pointer to the struct we need to parse the line into.
+ *                It is set to NULL on entry and only points to a valid
+ *                memory region, allocated by this function, once a
+ *                directory structure has been built successfully.
+ * @parse_full  : Meant to tell us whether we should parse all the lines
+ *                in the file, even if they are not present in gluster's
+ *                config. NOTE: this function does not read it at present;
+ *                the filtering is driven by @ms alone: when @ms is
+ *                non-NULL, __exp_line_dir_parse () matches the directory
+ *                against the mount state. It is important to note that
+ *                if gluster exports a volume named '/test', '/test' and all
+ *                of its subdirectories that may be in the exports file
+ *                are valid exports.
+ *  @ms         : The mount state that holds the list of volumes that gluster
+ *                currently exports.
+ *
+ * @return : GF_EXP_PARSE_SUCCESS on success, -EINVAL on bad arguments,
+ *           -ENOMEM on memory allocation errors,
+ *           GF_EXP_PARSE_LINE_IGNORING if we ignored the line,
+ *           GF_EXP_PARSE_ITEM_FAILURE if there was an error parsing.
+ *           Whenever *dir is NULL on return, nothing was produced.
+ *
+ * The caller is responsible for freeing memory allocated by this function
+ * The caller should free this memory using the _export_dir_deinit () function.
+ *
+ * Not for external use.
+ */
+static int
+_exp_line_parse(const char *line, struct export_dir **dir,
+                gf_boolean_t parse_full, struct mount3_state *ms)
+{
+    struct export_dir *expdir = NULL;
+    char *dirstr = NULL;
+    dict_t *netgroups = NULL;
+    dict_t *hosts = NULL;
+    int ret = -EINVAL;
+    gf_boolean_t netgroups_failed = _gf_false;
+
+    GF_VALIDATE_OR_GOTO(GF_EXP, line, out);
+    GF_VALIDATE_OR_GOTO(GF_EXP, dir, out);
+
+    /* Start from NULL so no exit path hands back a stale pointer */
+    *dir = NULL;
+
+    if (*line == '#' || *line == ' ' || *line == '\t' || *line == '\0' ||
+        *line == '\n') {
+        ret = GF_EXP_PARSE_LINE_IGNORING;
+        goto out;
+    }
+
+    expdir = _export_dir_init();
+    if (!expdir) {
+        ret = -ENOMEM;
+        goto out;
+    }
+
+    /* Get the directory string from the line */
+    ret = __exp_line_dir_parse(line, &dirstr, ms);
+    if (ret < 0) {
+        gf_msg(GF_EXP, GF_LOG_ERROR, 0, NFS_MSG_PARSE_DIR_FAIL,
+               "Parsing directory failed: %s", strerror(-ret));
+        /* Without a directory there is nothing else worth
+         * extracting from the line, so simply fail.
+         */
+        goto free_and_out;
+    }
+
+    /* Set the dir str */
+    expdir->dir_name = dirstr;
+
+    /* Parse the netgroup part of the string */
+    ret = __exp_line_ng_parse(line, &netgroups);
+    if (ret < 0) {
+        gf_msg(GF_EXP, GF_LOG_ERROR, -ret, NFS_MSG_PARSE_FAIL,
+               "Critical error: %s", strerror(-ret));
+        /* Return values less than 0
+         * indicate critical failures (null parameters,
+         * failure to allocate memory, etc).
+         */
+        goto free_and_out;
+    }
+    if (ret != 0) {
+        if (ret == GF_EXP_PARSE_ITEM_FAILURE)
+            /* Cannot change to gf_msg.
+             * gf_msg not giving output to STDOUT
+             * Bug id : BZ1215017
+             */
+            gf_log(GF_EXP, GF_LOG_WARNING, "Error parsing netgroups for: %s",
+                   line);
+        /* Even though parsing failed for the netgroups we should let
+         * host parsing proceed.
+         */
+        netgroups_failed = _gf_true;
+    }
+
+    /* Parse the host part of the string */
+    ret = __exp_line_host_parse(line, &hosts);
+    if (ret < 0) {
+        gf_msg(GF_EXP, GF_LOG_ERROR, -ret, NFS_MSG_PARSE_FAIL,
+               "Critical error: %s", strerror(-ret));
+        goto free_and_out;
+    }
+    if (ret != 0) {
+        if (ret == GF_EXP_PARSE_ITEM_FAILURE)
+            gf_msg(GF_EXP, GF_LOG_WARNING, 0, NFS_MSG_PARSE_FAIL,
+                   "Error parsing hosts for: %s", line);
+        /* If netgroups parsing failed AND host parsing failed,
+         * there's something really wrong with this line, so we
+         * fail out. ret is still the positive host result
+         * here, so callers must also check *dir for NULL.
+         */
+        if (netgroups_failed)
+            goto free_and_out;
+    }
+
+    expdir->hosts = hosts;
+    expdir->netgroups = netgroups;
+    *dir = expdir;
+    goto out;
+
+free_and_out:
+    _export_dir_deinit(expdir);
+out:
+    return ret;
+}
+
+struct export_item *
+exp_dir_get_netgroup(const struct export_dir *expdir, const char *netgroup)
+{
+    data_t *entry = NULL;
+    struct export_item *item = NULL;
+
+    GF_VALIDATE_OR_GOTO(GF_EXP, expdir, out);
+    GF_VALIDATE_OR_GOTO(GF_EXP, netgroup, out);
+
+    /* No netgroups dict on this directory: nothing can match */
+    if (expdir->netgroups == NULL)
+        goto out;
+
+    entry = dict_get(expdir->netgroups, (char *)netgroup);
+    if (entry != NULL) {
+        item = (struct export_item *)entry->data;
+    } else {
+        gf_msg_debug(GF_EXP, 0, "%s not found for %s", netgroup,
+                     expdir->dir_name);
+    }
+out:
+    return item;
+}
+/**
+ * exp_dir_get_host -- Look up the export item for a host within an
+ *                     exports directory. An exact match on the host string
+ *                     is tried first, then the wildcard entry "*".
+ *
+ * @expdir: Export directory to lookup from
+ * @host  : Host string to lookup
+ *
+ * @return: success: Pointer to an export item structure
+ *          failure: NULL
+ */
+struct export_item *
+exp_dir_get_host(const struct export_dir *expdir, const char *host)
+{
+    data_t *entry = NULL;
+    struct export_item *item = NULL;
+
+    GF_VALIDATE_OR_GOTO(GF_EXP, expdir, out);
+    GF_VALIDATE_OR_GOTO(GF_EXP, host, out);
+
+    if (expdir->hosts == NULL)
+        goto out;
+
+    /* Exact match on the host string first */
+    entry = dict_get(expdir->hosts, (char *)host);
+    if (entry == NULL) {
+        gf_msg_debug(GF_EXP, 0, "%s not found for %s", host, expdir->dir_name);
+
+        /* Fall back to a wildcard ("*") entry, if one was exported */
+        entry = dict_get(expdir->hosts, "*");
+    }
+
+    /* Either lookup may have produced the entry */
+    if (entry != NULL)
+        item = (struct export_item *)entry->data;
+out:
+    return item;
+}
+
+/**
+ * exp_file_get_dir -- Look up an export directory by name in the exports
+ *                     dict of the file structure. A name given without a
+ *                     leading slash is looked up with one prepended.
+ * @file : Exports file structure to lookup from
+ * @dir  : Directory name to lookup
+ *
+ * @return : success: Pointer to an export directory structure
+ *           failure: NULL
+ */
+struct export_dir *
+exp_file_get_dir(const struct exports_file *file, const char *dir)
+{
+    size_t dirlen = 0;
+    char *key = NULL;
+    data_t *entry = NULL;
+    struct export_dir *found = NULL;
+
+    GF_VALIDATE_OR_GOTO(GF_EXP, file, out);
+    GF_VALIDATE_OR_GOTO(GF_EXP, dir, out);
+
+    dirlen = strlen(dir);
+    if (dirlen == 0)
+        goto out;
+
+    if (*dir == '/') {
+        /* Already has the leading slash; use it as-is */
+        key = (char *)dir;
+    } else {
+        /* Prepend the missing slash: room for '/' + name + NUL */
+        key = alloca(dirlen + 2);
+        snprintf(key, dirlen + 2, "/%s", dir);
+    }
+
+    entry = dict_get(file->exports_dict, key);
+    if (entry != NULL) {
+        found = (struct export_dir *)entry->data;
+    } else {
+        gf_msg_debug(GF_EXP, 0, "%s not found in %s", key, file->filename);
+    }
+out:
+    return found;
+}
+
+/**
+ * exp_file_parse -- Parse an exports file into a structure
+ *                   that can be looked up through simple
+ *                   function calls.
+ *
+ * @filepath: Path to the exports file
+ * @expfile : Out parameter that receives the parsed file structure
+ * @ms      : Current mount state (useful to match with gluster vol files)
+ *
+ * @return  : success: 0
+ *            failure: -EINVAL on bad arguments, -errno if the file cannot
+ *                     be opened, -ENOMEM on allocation failures.
+ *
+ * The caller is responsible for freeing memory allocated by this function.
+ * The caller should free this memory using the exp_file_deinit () function.
+ * Calling GF_FREE ( ) on the pointer will NOT free all the allocated memory.
+ *
+ * Externally usable.
+ */
+int
+exp_file_parse(const char *filepath, struct exports_file **expfile,
+               struct mount3_state *ms)
+{
+    FILE *fp = NULL;
+    struct exports_file *file = NULL;
+    size_t len = 0;
+    int ret = -EINVAL;
+    unsigned long line_number = 0;
+    char *line = NULL;
+    struct export_dir *expdir = NULL;
+
+    /* Sets whether we should parse the entire file or just that which
+     * is present in the mount state */
+    gf_boolean_t parse_complete_file = _gf_false;
+
+    GF_VALIDATE_OR_GOTO(GF_EXP, expfile, parse_done);
+    GF_VALIDATE_OR_GOTO(GF_EXP, filepath, parse_done);
+
+    if (!ms) {
+        /* Without a mount state there is nothing to compare
+         * against, so the whole file gets parsed.
+         */
+        parse_complete_file = _gf_true;
+    }
+
+    fp = fopen(filepath, "r");
+    if (!fp) {
+        ret = -errno;
+        goto parse_done;
+    }
+
+    ret = _exp_init_parsers();
+    if (ret < 0)
+        goto parse_done;
+
+    /* Process the file line by line, with each line being parsed into
+     * a struct export_dir. Lines that fail to parse are logged and
+     * skipped; only allocation failures abort the whole parse.
+     */
+    while (getline(&line, &len, fp) != -1) {
+        /* Tracked so that we can log which line numbers were wrong */
+        line_number++;
+
+        /* Cut the line at its newline, if any. strcspn () is used
+         * rather than strtok () because strtok () keeps hidden static
+         * state and is not safe to use from several threads.
+         */
+        line[strcspn(line, "\n")] = '\0';
+
+        /* Parse the line into a struct export_dir. Given a line like:
+         * "/vol @test(sec=sys,rw,anonuid=0) 10.35.11.31(sec=sys,rw)"
+         * _exp_line_parse () allocates an export dir and sets its name
+         * to '/vol' via __exp_line_dir_parse (). It then parses the
+         * netgroup part ('@test(sec=sys,rw,anonuid=0)') and the host
+         * part ('10.35.11.31(sec=sys,rw)') into the dicts that hang
+         * off the export directory structure.
+         * expdir is cleared first so that a failed parse can never
+         * leave us holding the previous line's directory, which has
+         * already been handed to 'file'.
+         */
+        expdir = NULL;
+        ret = _exp_line_parse(line, &expdir, parse_complete_file, ms);
+        if (ret == -ENOMEM) {
+            /* Allocation failures are fatal for the whole parse:
+             * free what was built so far and exit.
+             */
+            goto free_and_done;
+        }
+
+        if (ret < 0) {
+            gf_msg(GF_EXP, GF_LOG_ERROR, -ret, NFS_MSG_PARSE_FAIL,
+                   "Failed to parse line #%lu", line_number);
+            continue; /* Skip entering this line and continue */
+        }
+
+        if (ret == GF_EXP_PARSE_LINE_IGNORING) {
+            /* This just means the line was empty or started with a
+             * '#' or a ' ' and we are ignoring it.
+             */
+            gf_msg_debug(GF_EXP, 0,
+                         "Ignoring line #%lu because it started "
+                         "with a %c",
+                         line_number, *line);
+            continue;
+        }
+
+        if (!file) {
+            file = _exports_file_init();
+            GF_CHECK_ALLOC_AND_LOG(GF_EXP, file, ret,
+                                   "Allocation error while "
+                                   "allocating file struct",
+                                   free_and_done);
+
+            file->filename = gf_strdup(filepath);
+            GF_CHECK_ALLOC_AND_LOG(GF_EXP, file->filename, ret,
+                                   "Allocation error while "
+                                   "duping filepath",
+                                   free_and_done);
+        }
+
+        /* Store the export directory in the file structure. A
+         * non-negative result can still come without a directory
+         * (see _exp_line_parse ()); there is nothing to insert then.
+         */
+        if (expdir)
+            _exp_file_insert(file, expdir);
+    }
+
+    /* NOTE(review): ret still holds the result of the last line
+     * processed, so a skipped or ignored final line passes its code
+     * on to the caller even though the file as a whole was parsed.
+     */
+    *expfile = file;
+    goto parse_done;
+
+free_and_done:
+    if (file)
+        exp_file_deinit(file);
+    _export_dir_deinit(expdir);
+
+parse_done:
+    /* line got allocated through getline(), don't use GF_FREE() for it */
+    free(line);
+    if (fp)
+        fclose(fp);
+    _exp_deinit_parsers();
+    return ret;
+}
diff --git a/xlators/nfs/server/src/exports.h b/xlators/nfs/server/src/exports.h
new file mode 100644
index 00000000000..ad35ad8c3a0
--- /dev/null
+++ b/xlators/nfs/server/src/exports.h
@@ -0,0 +1,93 @@
+/*
+   Copyright 2014-present Facebook. All Rights Reserved
+
+   This file is part of GlusterFS.
+
+   Author :
+   Shreyas Siravara <shreyas.siravara@gmail.com>
+
+   This file is licensed to you under your choice of the GNU Lesser
+   General Public License, version 3 or any later version (LGPLv3 or
+   later), or the GNU General Public License, version 2 (GPLv2), in all
+   cases as published by the Free Software Foundation.
+*/
+
+#ifndef _EXPORTS_H_
+#define _EXPORTS_H_
+
+#include "nfs-mem-types.h"
+#include <glusterfs/dict.h>
+#include "nfs.h"
+
+#define GF_EXP GF_NFS "-exports"
+
+#define NETGROUP_REGEX_PATTERN "(@([a-zA-Z0-9\\(=, .])+)())"
+#define HOSTNAME_REGEX_PATTERN "[[:space:]]([a-zA-Z0-9.\\(=,*/:)-]+)"
+#define OPTIONS_REGEX_PATTERN "([a-zA-Z0-9=\\.]+)"
+
+#define NETGROUP_MAX_LEN 128
+#define FQDN_MAX_LEN 256
+
+#define SEC_OPTION_MAX 10
+#define UID_MAX_LEN 6
+
+#define DIR_MAX_LEN 1024
+
+/* The following 2 definitions are in mount3.h
+ * but we don't want to include it because mount3.h
+ * depends on structs in this file so we get a cross
+ * dependency.
+ */
+struct mount3_state;
+
+extern struct mnt3_export *
+mnt3_mntpath_to_export(struct mount3_state *ms, const char *dirpath,
+                       gf_boolean_t export_parsing_match);
+
+struct export_options {
+    gf_boolean_t rw;     /* Read-write option */
+    gf_boolean_t nosuid; /* nosuid option */
+    gf_boolean_t root;   /* root option */
+    char *anon_uid;      /* anonuid option */
+    char *sec_type;      /* X, for sec=X */
+};
+
+struct export_item {
+    char *name;                  /* Name of the export item */
+    struct export_options *opts; /* NFS Options */
+    GF_REF_DECL;
+};
+
+struct export_dir {
+    char *dir_name;    /* Directory name, as parsed from the export line */
+    dict_t *netgroups; /* "@netgroup" -> struct export_item */
+    dict_t *hosts;     /* Host, subnet or "*" -> struct export_item */
+};
+
+struct exports_file {
+    char *filename;       /* Path the file was parsed from */
+    dict_t *exports_dict; /* Directory name -> struct export_dir */
+    dict_t *exports_map;  /* Map of SuperFastHash(<export>) -> expdir */
+};
+
+void
+exp_file_deinit(struct exports_file *expfile);
+
+int
+exp_file_parse(const char *filepath, struct exports_file **expfile,
+               struct mount3_state *ms);
+
+struct export_dir *
+exp_file_get_dir(const struct exports_file *file, const char *dir);
+
+struct export_item *
+exp_dir_get_host(const struct export_dir *expdir, const char *host);
+
+struct export_item *
+exp_dir_get_netgroup(const struct export_dir *expdir, const char *netgroup);
+
+struct export_dir *
+exp_file_dir_from_uuid(const struct exports_file *file,
+                       const uuid_t export_uuid);
+
+#endif /* _EXPORTS_H_ */
diff --git a/xlators/nfs/server/src/mount3-auth.c b/xlators/nfs/server/src/mount3-auth.c
new file mode 100644
index 00000000000..10e57c84cdb
--- /dev/null
+++ b/xlators/nfs/server/src/mount3-auth.c
@@ -0,0 +1,642 @@
+/*
+   Copyright 2014-present Facebook. All Rights Reserved
+
+   This file is part of GlusterFS.
+
+   Author :
+   Shreyas Siravara <shreyas.siravara@gmail.com>
+
+   This file is licensed to you under your choice of the GNU Lesser
+   General Public License, version 3 or any later version (LGPLv3 or
+   later), or the GNU General Public License, version 2 (GPLv2), in all
+   cases as published by the Free Software Foundation.
+*/
+
+/* This file contains code for handling mount authentication.
+ * The primary structure here is 'mnt3_auth_params' which contains
+ * 3 important fields: 1) Pointer to a netgroups file struct, 2) Pointer to an
+ * exports file struct. 3) Pointer to a mount state struct.
+ *
+ * - The auth parameter struct belongs to a mount state so the mount state
+ *   pointer represents the mount state that this auth parameter struct belongs
+ *   to.
+ *
+ * - Currently, the only supported mount auth parameters are an exports file
+ *   and a netgroups file. The two pointers in the struct represent the files
+ *   we are to authenticate against.
+ *
+ * - To initialize a struct, make a call to mnt3_auth_params_init () with a mnt
+ *   state as a parameter.
+ *
+ * - To set an exports file authentication parameter, call
+ *   mnt3_auth_set_exports_auth () with an exports file as a parameter.
+ *
+ * - Same goes for the netgroups file parameter, except use the netgroups file
+ *   as the parameter.
+ */
+
+#include "mount3-auth.h"
+#include "exports.h"
+#include "netgroups.h"
+#include <glusterfs/mem-pool.h>
+#include "nfs-messages.h"
+
+/**
+ * mnt3_auth_params_init -- Allocate a mount3 authorization parameter
+ *                          struct for a mount state. No exports or
+ *                          netgroups file is attached to it yet.
+ *
+ * @ms: Mount state the parameters belong to; stored in the struct.
+ *
+ * @return: success: Pointer to the allocated struct
+ *          failure: NULL
+ *
+ * For external use.
+ */
+struct mnt3_auth_params *
+mnt3_auth_params_init(struct mount3_state *ms)
+{
+    struct mnt3_auth_params *auth_params = NULL;
+
+    auth_params = GF_MALLOC(sizeof(*auth_params), gf_nfs_mt_mnt3_auth_params);
+    GF_VALIDATE_OR_GOTO(GF_MNT_AUTH, auth_params, out);
+
+    auth_params->ms = ms;
+    auth_params->expfile = NULL;
+    auth_params->ngfile = NULL;
+out:
+    return auth_params;
+}
+
+/**
+ * mnt3_auth_params_deinit -- Detach the struct from its mount state (if it
+ *                            has one) and free it together with its files.
+ * @auth_params: Pointer to the struct we want to free; NULL is a no-op
+ *
+ * For external use.
+ */
+void
+mnt3_auth_params_deinit(struct mnt3_auth_params *auth_params)
+{
+    if (!auth_params)
+        return;
+
+    /* Atomically set the auth params in the mount state to NULL
+     * so subsequent fops will be denied while the auth params
+     * are being cleaned up. mnt3_auth_params_init () accepts a
+     * NULL mount state, so there may be nothing to detach from.
+     */
+    if (auth_params->ms)
+        (void)__sync_lock_test_and_set(&auth_params->ms->auth_params, NULL);
+
+    ng_file_deinit(auth_params->ngfile);
+    exp_file_deinit(auth_params->expfile);
+    auth_params->ms = NULL;
+    GF_FREE(auth_params);
+}
+
+/**
+ * mnt3_auth_set_exports_auth -- Set the exports auth file
+ *
+ * @auth_params : Pointer to the auth params struct
+ * @filename    : File name to load from disk and parse
+ *
+ * @return  : success: 0
+ *            failure: -EINVAL on bad arguments, else exp_file_parse ()'s code
+ *
+ * For external use.
+ */
+int
+mnt3_auth_set_exports_auth(struct mnt3_auth_params *auth_params,
+                           const char *filename)
+{
+    struct exports_file *expfile = NULL;
+    struct exports_file *oldfile = NULL;
+    int ret = -EINVAL;
+
+    /* Validate args */
+    GF_VALIDATE_OR_GOTO(GF_MNT_AUTH, auth_params, out);
+    GF_VALIDATE_OR_GOTO(GF_MNT_AUTH, filename, out);
+
+    /* Parse the exports file against the mount state of these params */
+    ret = exp_file_parse(filename, &expfile, auth_params->ms);
+    if (ret < 0) {
+        gf_msg(GF_MNT_AUTH, GF_LOG_ERROR, 0, NFS_MSG_LOAD_PARSE_ERROR,
+               "Failed to load & parse file"
+               " %s, see logs for more information",
+               filename);
+        goto out;
+    }
+
+    /* Atomically swap in the new file, then release the one it replaced */
+    oldfile = __sync_lock_test_and_set(&auth_params->expfile, expfile);
+    exp_file_deinit(oldfile);
+    ret = 0;
+out:
+    return ret;
+}
+
+/**
+ * mnt3_auth_set_netgroups_auth -- Set netgroups auth file
+ *
+ * @auth_params : Pointer to the auth params struct.
+ * @filename    : File name to load from disk and parse
+ *
+ * @return  : success: 0
+ *            failure: -EINVAL on bad arguments, -1 if the file fails to load
+ *
+ * For external use.
+ */
+int
+mnt3_auth_set_netgroups_auth(struct mnt3_auth_params *auth_params,
+                             const char *filename)
+{
+    struct netgroups_file *ngfile = NULL;
+    struct netgroups_file *oldfile = NULL;
+    int ret = -EINVAL;
+
+    /* Validate args */
+    GF_VALIDATE_OR_GOTO(GF_MNT_AUTH, auth_params, out);
+    GF_VALIDATE_OR_GOTO(GF_MNT_AUTH, filename, out);
+
+    ngfile = ng_file_parse(filename);
+    if (!ngfile) {
+        gf_msg(GF_MNT_AUTH, GF_LOG_ERROR, 0, NFS_MSG_LOAD_PARSE_ERROR,
+               "Failed to load file %s, see logs for more "
+               "information",
+               filename);
+        ret = -1;
+        goto out;
+    }
+
+    /* Atomically swap in the new file, then release the one it replaced */
+    oldfile = __sync_lock_test_and_set(&auth_params->ngfile, ngfile);
+    ng_file_deinit(oldfile);
+    ret = 0;
+out:
+    return ret;
+}
+
+/* Struct used to pass parameters to
+ * _mnt3_auth_subnet_match () which
+ * checks if an IP matches a subnet
+ */
+struct _mnt3_subnet_match_s {
+    char *ip;                  /* IP address to match */
+    struct export_item **host; /* Host structure to set */
+};
+
+/**
+ * _mnt3_auth_subnet_match -- dict_foreach () callback that checks whether
+ *                            an IP address lies in the subnet named by the
+ *                            key currently being visited.
+ *
+ * Keys without a '/' are skipped. Once a matching key has been found the
+ * result is stored through the 'host' pointer of the parameter struct and
+ * every later invocation returns immediately.
+ *
+ * @dict: The dict to walk
+ * @key : The key we are on
+ * @val : The value we are on
+ * @tmp : Pointer to a struct _mnt3_subnet_match_s holding the IP to match
+ *        and the location that receives the matching export item
+ *
+ * @return: always 0
+ */
+static int
+_mnt3_auth_subnet_match(dict_t *dict, char *key, data_t *val, void *tmp)
+{
+    struct _mnt3_subnet_match_s *params = tmp;
+
+    /* Nothing to do without complete search parameters */
+    if (params == NULL || params->host == NULL || params->ip == NULL)
+        return 0;
+
+    /* A previously visited key already produced the host */
+    if (*params->host != NULL)
+        return 0;
+
+    /* Only keys in CIDR notation (containing a '/') are networks */
+    if (strchr(key, '/') == NULL)
+        return 0;
+
+    /* Skip over any leading spaces in the key */
+    while (*key == ' ')
+        key++;
+
+    /* The IP lies inside this network: report the export item that
+     * the dict stores for it.
+     */
+    if (gf_is_ip_in_net(key, params->ip))
+        *params->host = (struct export_item *)val->data;
+
+    return 0;
+}
+
+/**
+ * _mnt3_auth_check_host_in_export -- Find the export item that applies to
+ *                                    a host for one export directory.
+ *
+ * Step 1: pick the export directory
+ * ---------------------------------
+ * FH is non-null: the FH carries a mountid identifying the export that
+ * was mounted by the client; exp_file_dir_from_uuid () maps that mountid
+ * to an export directory of the exports file.
+ *
+ * FH is null: the directory name is looked up in the exports file with
+ * exp_file_get_dir ().
+ *
+ * Step 2: find the host in that directory
+ * ---------------------------------------
+ * exp_dir_get_host () is asked first. If it finds nothing, every entry
+ * of the directory's hosts dict is handed to _mnt3_auth_subnet_match ()
+ * to see whether it names a subnet that contains the host; the first
+ * matching entry visited is used.
+ *
+ * @file: Exports file to lookup in
+ * @dir : Directory to do the lookup (only used when @fh is NULL)
+ * @host: Host to lookup in the directory
+ * @fh  : Filehandle whose mountid selects the export directory; when
+ *        NULL, @dir is used instead
+ *
+ * @return: success: Pointer to the matching export item
+ *          failure: NULL
+ *
+ * Not for external use.
+ */
+static struct export_item *
+_mnt3_auth_check_host_in_export(const struct exports_file *file,
+                                const char *dir, const char *host,
+                                struct nfs3_fh *fh)
+{
+    struct export_dir *expdir = NULL;
+    struct export_item *found = NULL;
+    struct _mnt3_subnet_match_s subnet_params = {
+        0,
+    };
+
+    /* Validate args */
+    GF_VALIDATE_OR_GOTO(GF_MNT_AUTH, file, out);
+    GF_VALIDATE_OR_GOTO(GF_MNT_AUTH, host, out);
+
+    /* With a filehandle the export is identified by its mountid; all
+     * file operations that need authentication come through here.
+     * Without one, the directory name is looked up in the exports file.
+     */
+    if (fh)
+        expdir = exp_file_dir_from_uuid(file, fh->mountid);
+    else
+        expdir = exp_file_get_dir(file, dir);
+
+    if (!expdir)
+        goto out;
+
+    /* Try the host as given first */
+    found = exp_dir_get_host(expdir, host);
+    if (found || !expdir->hosts)
+        goto out;
+
+    /* The host was not found directly, so walk the hosts of the
+     * export directory and see if any of them are actually networks
+     * (e.g. 10.5.153.0/24). If they are, match the incoming address
+     * against them.
+     */
+    subnet_params.ip = (char *)host;
+    subnet_params.host = &found;
+    dict_foreach(expdir->hosts, _mnt3_auth_subnet_match, &subnet_params);
+out:
+    return found;
+}
+
+/* This struct represents all the parameters necessary to search through a
+ * netgroups file to find a host.
+ */
+struct ng_auth_search {
+    const char *search_for;            /* strings to search for */
+    gf_boolean_t found;                /* mark true once found */
+    const struct netgroups_file *file; /* netgroups file to search */
+    const char *expdir;                /* name of the export directory */
+    struct export_item *expitem; /* pointer to the export */
+    const struct exports_file *expfile; /* exports file expdir is looked up in */
+    gf_boolean_t _is_host_dict;         /* searching a host dict? */
+    struct netgroup_entry *found_entry; /* the entry we found! */
+};
+
+/**
+ * __netgroup_dict_search -- dict_foreach () callback that searches one
+ *                           netgroup entry, and recursively the host and
+ *                           netgroup dicts nested in it, for a host.
+ *
+ * @dict: The dict we are walking
+ * @key : The key we are on
+ * @val : The value associated with that key
+ * @data: Additional parameters. We pass a pointer to struct ng_auth_search
+ *
+ * Always returns 0; results go through @data. Not for external use.
+ */
+static int
+__netgroup_dict_search(dict_t *dict, char *key, data_t *val, void *data)
+{
+    struct ng_auth_search *ngsa = NULL;
+    struct netgroup_entry *ngentry = NULL;
+    data_t *hdata = NULL;
+
+    /* 'ngsa' is the search params */
+    ngsa = (struct ng_auth_search *)data;
+    ngentry = (struct netgroup_entry *)val->data;
+
+    if (ngsa->_is_host_dict) {
+        /* If we are on a host dict, we can simply hash the search key
+         * against the host dict and see if we find anything.
+         */
+        hdata = dict_get(dict, (char *)ngsa->search_for);
+        if (hdata) {
+            /* If it was found, log the key being visited, mark the
+             * search params as found and return.
+             */
+            gf_msg_debug(GF_MNT_AUTH, errno,
+                         "key %s was hashed "
+                         "and found",
+                         key);
+            ngsa->found = _gf_true;
+            ngsa->found_entry = (struct netgroup_entry *)hdata->data;
+            goto out;
+        }
+    }
+
+    /* If the key is what we are searching for, mark the item as
+     * found and return.
+     */
+    if (strcmp(key, ngsa->search_for) == 0) {
+        ngsa->found = _gf_true;
+        ngsa->found_entry = ngentry;
+        goto out;
+    }
+
+    /* If we have a netgroup hosts dict, search it with this same
+     * function. The flag lives in ngsa and is not restored afterwards.
+     */
+    if (ngentry->netgroup_hosts) {
+        ngsa->_is_host_dict = _gf_true;
+        dict_foreach(ngentry->netgroup_hosts, __netgroup_dict_search, ngsa);
+    }
+
+    /* If that search was successful, just return */
+    if (ngsa->found)
+        goto out;
+
+    /* If we have a netgroup dict, then search the dict using this same
+     * function.
+     */
+    if (ngentry->netgroup_ngs) {
+        ngsa->_is_host_dict = _gf_false;
+        dict_foreach(ngentry->netgroup_ngs, __netgroup_dict_search, ngsa);
+    }
+out:
+    return 0;
+}
+
+/**
+ * __export_dir_lookup_netgroup -- Function to search an exports directory
+ *                                 for a host name.
+ *
+ * Called by dict_foreach () once per netgroup of an export directory. The
+ * netgroup named by the key is looked up in the netgroups file and its
+ * (nested) hosts and netgroups are searched with __netgroup_dict_search ()
+ * until the host we are looking for is found. Once a netgroup has matched,
+ * later keys are skipped so that 'expitem' and 'found_entry' keep
+ * describing the netgroup that actually matched.
+ *
+ * @dict: The dict we are walking
+ * @key : The key we are on
+ * @val : The value associated with that key
+ * @data: Additional parameters. We pass a pointer to struct ng_auth_search
+ *
+ * Not for external use.
+ */
+static int
+__export_dir_lookup_netgroup(dict_t *dict, char *key, data_t *val, void *data)
+{
+    struct ng_auth_search *ngsa = NULL;    /* Search params */
+    struct netgroups_file *nfile = NULL;   /* Netgroups file to search */
+    struct netgroup_entry *ngentry = NULL; /* Entry in the netgroups file */
+    struct export_dir *tmpdir = NULL;
+
+    ngsa = (struct ng_auth_search *)data;
+    nfile = (struct netgroups_file *)ngsa->file;
+
+    /* An earlier netgroup already matched: leave its results alone */
+    if (ngsa->found)
+        goto out;
+
+    GF_ASSERT((*key == '@'));
+
+    /* Skip the leading '@' for the netgroups file; key itself is kept */
+    ngentry = ng_file_get_netgroup(nfile, (key + 1));
+    if (!ngentry) {
+        gf_msg_debug(GF_MNT_AUTH, 0, "%s not found in %s", key,
+                     nfile->filename);
+        goto out;
+    }
+
+    tmpdir = exp_file_get_dir(ngsa->expfile, ngsa->expdir);
+    if (!tmpdir)
+        goto out;
+
+    ngsa->expitem = exp_dir_get_netgroup(tmpdir, key);
+    if (!ngsa->expitem)
+        goto out;
+
+    /* Run through the host dict */
+    if (ngentry->netgroup_hosts) {
+        ngsa->_is_host_dict = _gf_true;
+        dict_foreach(ngentry->netgroup_hosts, __netgroup_dict_search, ngsa);
+    }
+
+    /* If the above search was successful, just return */
+    if (ngsa->found)
+        goto out;
+
+    /* Run through the netgroups dict */
+    if (ngentry->netgroup_ngs) {
+        ngsa->_is_host_dict = _gf_false;
+        dict_foreach(ngentry->netgroup_ngs, __netgroup_dict_search, ngsa);
+    }
+out:
+    return 0;
+}
+
+/**
+ * _mnt3_auth_setup_search_params -- Fill in an ng_auth_search struct for a
+ *     fresh search. Nothing is written if params, host or nfile is NULL.
+ * @params : Search params to setup
+ * @host   : The host to search for
+ * @dir    : Name of the export directory being checked
+ * @nfile  : The netgroups file to search in
+ * @expfile: The exports file that @dir is looked up in
+ */
+void
+_mnt3_auth_setup_search_params(struct ng_auth_search *params, const char *host,
+                               const char *dir,
+                               const struct netgroups_file *nfile,
+                               const struct exports_file *expfile)
+{
+    GF_VALIDATE_OR_GOTO(GF_MNT_AUTH, params, out);
+    GF_VALIDATE_OR_GOTO(GF_MNT_AUTH, host, out);
+    GF_VALIDATE_OR_GOTO(GF_MNT_AUTH, nfile, out);
+
+    /* Search inputs first, then the state that starts out empty */
+    params->search_for = host;
+    params->expdir = dir;
+    params->file = nfile;
+    params->expfile = expfile;
+    params->_is_host_dict = _gf_false;
+    params->found = _gf_false;
+    params->found_entry = NULL;
+    params->expitem = NULL;
+out:
+    return;
+}
+
+/**
+ * _mnt3_auth_check_host_in_netgroup -- Check whether a host belongs to one
+ *                                      of the netgroups authorized for an
+ *                                      exports directory.
+ * @auth_params: Holds the netgroups file and exports file to search
+ * @fh  : Filehandle of an fop; when set, its mountid selects the directory
+ * @host: The host we are searching for
+ * @dir : The exports directory name, used for the lookup when @fh is NULL
+ * @item: Out-param, receives the export item recorded by the search
+ *
+ * Search procedure:
+ *
+ * - Lookup directory string against exports file structure,
+ *   get an exports directory structure.
+ * - Walk the export file structure's netgroup dict. This dict
+ *   holds each netgroup that is authorized to mount that directory.
+ * - We then have to walk the netgroup structure, which is a set of
+ *   nested dicts until we find the host we are looking for.
+ *
+ * @return: success: Pointer to the netgroup entry found
+ *          failure: NULL
+ *
+ * Not for external use.
+ */
+static struct netgroup_entry *
+_mnt3_auth_check_host_in_netgroup(const struct mnt3_auth_params *auth_params,
+                                  struct nfs3_fh *fh, const char *host,
+                                  const char *dir, struct export_item **item)
+{
+    struct export_dir *expdir = NULL;
+    struct ng_auth_search ngsa = {
+        0,
+    };
+    struct netgroup_entry *found_entry = NULL;
+    struct exports_file *efile = NULL;
+    struct netgroups_file *nfile = NULL;
+
+    /* Validate args; auth_params is checked before it is dereferenced */
+    GF_VALIDATE_OR_GOTO(GF_MNT_AUTH, auth_params, out);
+    efile = auth_params->expfile;
+    nfile = auth_params->ngfile;
+    GF_VALIDATE_OR_GOTO(GF_MNT_AUTH, nfile, out);
+    GF_VALIDATE_OR_GOTO(GF_MNT_AUTH, efile, out);
+    GF_VALIDATE_OR_GOTO(GF_MNT_AUTH, host, out);
+    GF_VALIDATE_OR_GOTO(GF_MNT_AUTH, item, out);
+
+    /* Get the exports directory: by mountid for fops, by name otherwise */
+    if (fh)
+        expdir = exp_file_dir_from_uuid(efile, fh->mountid);
+    else
+        expdir = exp_file_get_dir(efile, dir);
+    if (!expdir)
+        goto out;
+
+    /* Setup search struct */
+    _mnt3_auth_setup_search_params(&ngsa, host, expdir->dir_name, nfile, efile);
+    /* Do the search */
+    dict_foreach(expdir->netgroups, __export_dir_lookup_netgroup, &ngsa);
+    found_entry = ngsa.found_entry;
+    *item = ngsa.expitem;
+out:
+    return found_entry;
+}
+
+/**
+ * check_rw_access -- Tell whether an export item grants read-write access.
+ *
+ * @item : The export item to check (may be NULL)
+ *
+ * @return 0 when the item's options have rw set; -EROFS otherwise,
+ *         including for a NULL item or an item without options
+ */
+int
+check_rw_access(struct export_item *item)
+{
+    struct export_options *options = NULL;
+
+    if (item == NULL)
+        return -EROFS;
+
+    /* An item without options attached is treated as read-only */
+    options = item->opts;
+    if (options == NULL)
+        return -EROFS;
+
+    /* Writable only when the rw option is set */
+    if (!options->rw)
+        return -EROFS;
+
+    return 0;
+}
+
+/**
+ * mnt3_auth_host -- Check if a host is authorized for a directory
+ *
+ * @auth_params : Auth parameters to authenticate against
+ * @host: Host requesting the directory
+ * @fh  : The filehandle passed from an fop to authenticate
+ * @dir : Directory that the host requests
+ * @is_write_op: Whether the request needs read-write access
+ * @save_item  : Optional out-param receiving the export item that was found
+ *
+ * 'fh' is null on mount requests and 'dir' is null on fops
+ *
+ * Procedure:
+ *
+ * - Check if the host is in the exports directory.
+ * - If not, check if the host is in the netgroups file for the
+ *   netgroups authorized for the exports.
+ *
+ * @return: 0 if authorized
+ *          -EACCES for completely unauthorized fop
+ *          -EROFS  for unauthorized write operations (rm, mkdir, write)
+ */
+int
+mnt3_auth_host(const struct mnt3_auth_params *auth_params, const char *host,
+               struct nfs3_fh *fh, const char *dir, gf_boolean_t is_write_op,
+               struct export_item **save_item)
+{
+    int auth_status_code = -EACCES;
+    struct export_item *item = NULL;
+
+    GF_VALIDATE_OR_GOTO(GF_MNT_AUTH, auth_params, out);
+    GF_VALIDATE_OR_GOTO(GF_MNT_AUTH, host, out);
+
+    /* Find the host in the exports file */
+    item = _mnt3_auth_check_host_in_export(auth_params->expfile, dir, host, fh);
+
+    /* Otherwise look in the netgroups authorized for the exports directory */
+    if (!item &&
+        !_mnt3_auth_check_host_in_netgroup(auth_params, fh, host, dir, &item))
+        goto out;
+
+    /* Host is known: reads always pass, writes need an rw export item */
+    auth_status_code = (is_write_op) ? check_rw_access(item) : 0;
+
+out:
+    if (save_item)
+        *save_item = item;
+
+    return auth_status_code;
+}
diff --git a/xlators/nfs/server/src/mount3-auth.h b/xlators/nfs/server/src/mount3-auth.h
new file mode 100644
index 00000000000..e50af5847d3
--- /dev/null
+++ b/xlators/nfs/server/src/mount3-auth.h
@@ -0,0 +1,59 @@
+/*
+   Copyright 2014-present Facebook. All Rights Reserved
+
+   This file is part of GlusterFS.
+
+   Author :
+   Shreyas Siravara <shreyas.siravara@gmail.com>
+
+   This file is licensed to you under your choice of the GNU Lesser
+   General Public License, version 3 or any later version (LGPLv3 or
+   later), or the GNU General Public License, version 2 (GPLv2), in all
+   cases as published by the Free Software Foundation.
+*/
+
+#ifndef _MOUNT3_AUTH
+#define _MOUNT3_AUTH
+
+#include "nfs-mem-types.h"
+#include "netgroups.h"
+#include "exports.h"
+#include "mount3.h"
+#include "nfs.h"
+
+#define GF_MNT_AUTH GF_NFS "-mount3-auth"
+
+struct mnt3_auth_params {
+    struct netgroups_file *ngfile; /* The netgroups file to auth against */
+    struct exports_file *expfile;  /* The exports file to auth against */
+    struct mount3_state *ms;       /* The mount state that owns this */
+};
+
+/* Initialize auth params struct */
+struct mnt3_auth_params *
+mnt3_auth_params_init(struct mount3_state *ms);
+
+/* Set the netgroups file to use in the auth */
+int
+mnt3_auth_set_netgroups_auth(struct mnt3_auth_params *aps,
+                             const char *filename);
+
+/* Set the exports file to use in the auth */
+int
+mnt3_auth_set_exports_auth(struct mnt3_auth_params *aps, const char *filename);
+
+/* Check if a host is authorized to perform a mount / nfs-fop */
+int
+mnt3_auth_host(const struct mnt3_auth_params *aps, const char *host,
+               struct nfs3_fh *fh, const char *dir, gf_boolean_t is_write_op,
+               struct export_item **save_item);
+
+/* Free resources used by the auth params struct */
+void
+mnt3_auth_params_deinit(struct mnt3_auth_params *aps);
+
+int
+mnt3_auth_fop_options_verify(const struct mnt3_auth_params *auth_params,
+                             const char *host, const char *dir);
+
+#endif /* _MOUNT3_AUTH */
diff --git a/xlators/nfs/server/src/mount3.c b/xlators/nfs/server/src/mount3.c
new file mode 100644
index 00000000000..a34d9104c17
--- /dev/null
+++ b/xlators/nfs/server/src/mount3.c
@@ -0,0 +1,4267 @@
+/*
+  Copyright (c) 2010-2011 Gluster, Inc. <http://www.gluster.com>
+  This file is part of GlusterFS.
+
+  This file is licensed to you under your choice of the GNU Lesser
+  General Public License, version 3 or any later version (LGPLv3 or
+  later), or the GNU General Public License, version 2 (GPLv2), in all
+  cases as published by the Free Software Foundation.
+*/
+
+#include "rpcsvc.h"
+#include <glusterfs/dict.h>
+#include <glusterfs/xlator.h>
+#include "mount3.h"
+#include "xdr-nfs3.h"
+#include "msg-nfs3.h"
+#include <glusterfs/iobuf.h>
+#include "nfs-common.h"
+#include "nfs3-fh.h"
+#include "nfs-fops.h"
+#include "nfs-inodes.h"
+#include "nfs-generics.h"
+#include <glusterfs/locking.h>
+#include <glusterfs/iatt.h>
+#include "nfs-mem-types.h"
+#include "nfs.h"
+#include <glusterfs/common-utils.h>
+#include <glusterfs/store.h>
+#include "glfs-internal.h"
+#include "glfs.h"
+#include "mount3-auth.h"
+#include <glusterfs/hashfn.h>
+#include "nfs-messages.h"
+
+#include <errno.h>
+#include <sys/socket.h>
+#include <sys/uio.h>
+
+/* This macro frees the entire linked list of host_auth_spec structures
+ * hanging off (exp)->hostspec and resets that pointer to NULL.
+ */
+#define FREE_HOSTSPEC(exp)                                                     \
+    do {                                                                       \
+        struct host_auth_spec *host = (exp)->hostspec;                         \
+        while (NULL != host) {                                                 \
+            struct host_auth_spec *temp = host;                                \
+            host = host->next;                                                 \
+            if (NULL != temp->host_addr) {                                     \
+                GF_FREE(temp->host_addr);                                      \
+            }                                                                  \
+            GF_FREE(temp);                                                     \
+        }                                                                      \
+        (exp)->hostspec = NULL;                                                \
+    } while (0)
+
+/* Paths of the exports and netgroups files, under the glusterd workdir */
+const char *exports_file_path = GLUSTERD_DEFAULT_WORKDIR "/nfs/exports";
+const char *netgroups_file_path = GLUSTERD_DEFAULT_WORKDIR "/nfs/netgroups";
+
+typedef ssize_t (*mnt3_serializer)(struct iovec outmsg, void *args);
+
+extern void *
+mount3udp_thread(void *argv);
+
+static void
+mnt3_export_free(struct mnt3_export *exp)
+{
+    if (exp != NULL) {
+        /* The hostspec list is only released for directory exports */
+        if (MNT3_EXPTYPE_DIR == exp->exptype)
+            FREE_HOSTSPEC(exp);
+        GF_FREE(exp->fullpath);
+        GF_FREE(exp->expname);
+        GF_FREE(exp);
+    }
+}
+
+/* Generic reply function for MOUNTv3 specific replies: serializes arg with
+ * sfunc and submits the result. Returns 0 on success, non-zero otherwise. */
+int
+mnt3svc_submit_reply(rpcsvc_request_t *req, void *arg, mnt3_serializer sfunc)
+{
+    struct iovec outmsg = {
+        0,
+    };
+    struct iobuf *iob = NULL;
+    struct mount3_state *ms = NULL;
+    int ret = -1;
+    ssize_t msglen = 0;
+    struct iobref *iobref = NULL;
+
+    if (!req)
+        return -1;
+
+    ms = (struct mount3_state *)rpcsvc_request_program_private(req);
+    if (!ms) {
+        gf_msg(GF_MNT, GF_LOG_ERROR, EINVAL, NFS_MSG_MNT_STATE_NOT_FOUND,
+               "mount state not found");
+        goto ret;
+    }
+
+    /* First, get the io buffer into which the reply in arg is serialized. */
+    /* TODO: use 'xdrproc_t' instead of 'sfunc' to get the xdr-size */
+    iob = iobuf_get(ms->iobpool);
+    if (!iob) {
+        gf_msg(GF_MNT, GF_LOG_ERROR, ENOMEM, NFS_MSG_NO_MEMORY,
+               "Failed to get iobuf");
+        goto ret;
+    }
+
+    iobuf_to_iovec(iob, &outmsg);
+    /* Use the given serializer to translate the given C structure in arg
+     * to XDR format which will be written into the buffer in outmsg.
+     */
+    msglen = sfunc(outmsg, arg);
+    if (msglen < 0) {
+        gf_msg(GF_MNT, GF_LOG_ERROR, 0, NFS_MSG_ENCODE_MSG_FAIL,
+               "Failed to encode message");
+        goto ret;
+    }
+    outmsg.iov_len = msglen;
+
+    iobref = iobref_new();
+    if (iobref == NULL) {
+        gf_msg(GF_MNT, GF_LOG_ERROR, ENOMEM, NFS_MSG_NO_MEMORY,
+               "Failed to get iobref");
+        goto ret;
+    }
+
+    ret = iobref_add(iobref, iob);
+    if (ret) {
+        gf_msg(GF_MNT, GF_LOG_ERROR, ENOMEM, NFS_MSG_NO_MEMORY,
+               "Failed to add iob to iobref");
+        goto ret;
+    }
+
+    /* Then, submit the message for transmission. */
+    ret = rpcsvc_submit_message(req, &outmsg, 1, NULL, 0, iobref);
+    if (ret == -1) {
+        gf_msg(GF_MNT, GF_LOG_ERROR, errno, NFS_MSG_REP_SUBMIT_FAIL,
+               "Reply submission failed");
+        goto ret;
+    }
+
+    ret = 0;
+ret:
+    /* Reached on success and from every goto above: drop our references */
+    if (NULL != iob)
+        iobuf_unref(iob);
+    if (NULL != iobref)
+        iobref_unref(iobref);
+
+    return ret;
+}
+/**
+ * __mountdict_insert -- Index a mount entry in the mount state's mountdict
+ *                       under the key "<name>:<hostname>".
+ * @ms: The mount state holding the entries
+ * @me: The mount entry to insert; its hashkey field is written here
+ *
+ * Not for external use.
+ */
+void
+__mountdict_insert(struct mount3_state *ms, struct mountentry *me)
+{
+    char *keyname = NULL;
+    data_t *entry_data = NULL;
+
+    GF_VALIDATE_OR_GOTO(GF_MNT, ms, out);
+    GF_VALIDATE_OR_GOTO(GF_MNT, me, out);
+
+    /* Export names can either be just volumes or paths inside that
+     * volume. When the entry carries a full path, that path is the
+     * name used in the key; otherwise the plain export name is. */
+    if (me->has_full_path) {
+        keyname = me->fullpath;
+    } else {
+        keyname = me->exname;
+    }
+
+    /* We don't want names with leading slashes in the key */
+    while (*keyname == '/') {
+        keyname++;
+    }
+
+    /* Key format: <name>:<hostname> */
+    snprintf(me->hashkey, sizeof(me->hashkey), "%s:%s", keyname, me->hostname);
+
+    /* Store the entry in the dict under that key */
+    entry_data = bin_to_data(me, sizeof(*me));
+    dict_set(ms->mountdict, me->hashkey, entry_data);
+
+    gf_msg_trace(GF_MNT, 0, "Inserted into mountdict: %s", me->hashkey);
+out:
+    return;
+}
+
+/**
+ * __mountdict_remove -- Remove mount entry @me from the mountdict of @ms by
+ *                       its hashkey. NULL arguments are rejected. Internal.
+ */
+void
+__mountdict_remove(struct mount3_state *ms, struct mountentry *me)
+{
+    GF_VALIDATE_OR_GOTO(GF_MNT, ms, out);
+    GF_VALIDATE_OR_GOTO(GF_MNT, me, out);
+    dict_del(ms->mountdict, me->hashkey);
+out:
+    return;
+}
+
+/* Generic error reply function, just pass the err status and it will do
+ * the rest, including transmission. Returns -1 for a NULL req, else 0.
+ */
+int
+mnt3svc_mnt_error_reply(rpcsvc_request_t *req, int mntstat)
+{
+    /* Zeroed so the serializer never sees indeterminate union members */
+    mountres3 res = {0};
+
+    if (!req)
+        return -1;
+
+    res.fhs_status = mntstat;
+    mnt3svc_submit_reply(req, (void *)&res,
+                         (mnt3_serializer)xdr_serialize_mountres3);
+    return 0;
+}
+
+/* Translate an errno value into the MOUNTv3 status reported to clients.
+ * Used, for example, on the op_errno of a failed lookup before the mount
+ * reply is sent.
+ *
+ * @errnum: errno value to translate; 0 means success
+ *
+ * Returns the matching mountstat3. Values without a dedicated MOUNTv3
+ * status, ENOMEM included, are reported as MNT3ERR_SERVERFAULT.
+ */
+mountstat3
+mnt3svc_errno_to_mnterr(int32_t errnum)
+{
+    switch (errnum) {
+        case 0:
+            return MNT3_OK;
+
+        /* errno values with a direct MOUNTv3 counterpart */
+        case ENOENT:
+            return MNT3ERR_NOENT;
+        case EPERM:
+            return MNT3ERR_PERM;
+        case EIO:
+            return MNT3ERR_IO;
+        case EACCES:
+            return MNT3ERR_ACCES;
+        case ENOTDIR:
+            return MNT3ERR_NOTDIR;
+        case EINVAL:
+            return MNT3ERR_INVAL;
+        case ENOSYS:
+            return MNT3ERR_NOTSUPP;
+        case ENOMEM:
+            /* no dedicated status: falls into the generic case */
+        default:
+            break;
+    }
+
+    /* Generic answer for everything that is not mapped above */
+    return MNT3ERR_SERVERFAULT;
+}
+
+mountres3
+mnt3svc_set_mountres3(mountstat3 stat, struct nfs3_fh *fh, int *authflavor,
+                      u_int aflen)
+{
+    uint32_t fhlen = (fh != NULL) ? nfs3_fh_compute_size() : 0;
+    mountres3 reply = {
+        0,
+    };
+
+    reply.fhs_status = stat;
+
+    /* File handle: the length is 0 when no handle was supplied */
+    reply.mountres3_u.mountinfo.fhandle.fhandle3_val = (char *)fh;
+    reply.mountres3_u.mountinfo.fhandle.fhandle3_len = fhlen;
+
+    /* Auth flavors are stored exactly as passed in */
+    reply.mountres3_u.mountinfo.auth_flavors.auth_flavors_len = aflen;
+    reply.mountres3_u.mountinfo.auth_flavors.auth_flavors_val = authflavor;
+
+    return reply;
+}
+
+/* Read the rmtab from the store_handle into the mountlist. With append set,
+ * entries are merged and duplicates skipped; otherwise the list is emptied
+ * first. Reading stops at the first missing key, so the return value is
+ * non-zero even after a complete read. Requires a locked store_handle.
+ */
+static int
+__mount_read_rmtab(gf_store_handle_t *sh, struct list_head *mountlist,
+                   gf_boolean_t append)
+{
+    int ret = 0;
+    unsigned int idx = 0;
+    struct mountentry *me = NULL, *tmp = NULL;
+    /* me->hostname is a char[MNTPATHLEN] */
+    char key[MNTPATHLEN + 11];
+
+    GF_ASSERT(sh && mountlist);
+
+    if (!gf_store_locked_local(sh)) {
+        gf_msg(GF_MNT, GF_LOG_WARNING, 0, NFS_MSG_READ_LOCKED,
+               "Not reading unlocked %s", sh->path);
+        return -1;
+    }
+
+    if (!append) {
+        list_for_each_entry_safe(me, tmp, mountlist, mlist)
+        {
+            list_del(&me->mlist);
+            GF_FREE(me);
+        }
+        me = NULL;
+    }
+
+    for (;;) {
+        char *value = NULL;
+
+        /* 'me' is the entry completed by the previous iteration, if any */
+        if (me && append) {
+            /* do not add duplicates */
+            list_for_each_entry(tmp, mountlist, mlist)
+            {
+                if (!strcmp(tmp->hostname, me->hostname) &&
+                    !strcmp(tmp->exname, me->exname)) {
+                    GF_FREE(me);
+                    goto dont_add;
+                }
+            }
+            list_add_tail(&me->mlist, mountlist);
+        } else if (me) {
+            list_add_tail(&me->mlist, mountlist);
+        }
+    dont_add:
+        me = GF_CALLOC(1, sizeof(*me), gf_nfs_mt_mountentry);
+        if (!me) {
+            gf_msg(GF_MNT, GF_LOG_ERROR, ENOMEM, NFS_MSG_NO_MEMORY,
+                   "Out of memory");
+            ret = -1;
+            goto out;
+        }
+
+        INIT_LIST_HEAD(&me->mlist);
+
+        /* Bound by the real buffer size; idx is unsigned, hence %u */
+        snprintf(key, sizeof(key), "hostname-%u", idx);
+        ret = gf_store_retrieve_value(sh, key, &value);
+        if (ret)
+            break;
+        snprintf(me->hostname, MNTPATHLEN, "%s", value);
+        GF_FREE(value);
+
+        snprintf(key, sizeof(key), "mountpoint-%u", idx);
+        ret = gf_store_retrieve_value(sh, key, &value);
+        if (ret)
+            break;
+        snprintf(me->exname, MNTPATHLEN, "%s", value);
+        GF_FREE(value);
+
+        idx++;
+        gf_msg_trace(GF_MNT, 0, "Read entries %s:%s", me->hostname, me->exname);
+    }
+    gf_msg_debug(GF_MNT, 0, "Read %u entries from '%s'", idx, sh->path);
+    GF_FREE(me); /* the entry that could not be completed */
+out:
+    return ret;
+}
+
+/* Overwrite the contents of the rmtab with the in-memory client list.
+ * Fail gracefully if the store_handle is not locked.
+ */
+static void
+__mount_rewrite_rmtab(struct mount3_state *ms, gf_store_handle_t *sh)
+{
+    struct mountentry *me = NULL;
+    char key[32]; /* fits "mountpoint-" plus any unsigned int index */
+    int fd, ret;
+    unsigned int idx = 0;
+
+    if (!gf_store_locked_local(sh)) {
+        gf_msg(GF_MNT, GF_LOG_WARNING, 0, NFS_MSG_MODIFY_LOCKED,
+               "Not modifying unlocked %s", sh->path);
+        return;
+    }
+
+    fd = gf_store_mkstemp(sh);
+    if (fd == -1) {
+        gf_msg(GF_MNT, GF_LOG_ERROR, EINVAL, NFS_MSG_INVALID_ENTRY,
+               "Failed to open %s", sh->path);
+        return;
+    }
+
+    list_for_each_entry(me, &ms->mountlist, mlist)
+    {
+        snprintf(key, sizeof(key), "hostname-%u", idx);
+        ret = gf_store_save_value(fd, key, me->hostname);
+        if (ret)
+            goto fail;
+
+        snprintf(key, sizeof(key), "mountpoint-%u", idx);
+        ret = gf_store_save_value(fd, key, me->exname);
+        if (ret)
+            goto fail;
+
+        idx++;
+    }
+
+    gf_msg_debug(GF_MNT, 0, "Updated rmtab with %u entries", idx);
+
+    if (gf_store_rename_tmppath(sh))
+        gf_msg(GF_MNT, GF_LOG_ERROR, errno, NFS_MSG_RWTAB_OVERWRITE_FAIL,
+               "Failed to overwrite rwtab %s", sh->path);
+
+    return;
+
+fail:
+    gf_msg(GF_MNT, GF_LOG_ERROR, errno, NFS_MSG_UPDATE_FAIL,
+           "Failed to update %s", sh->path);
+    gf_store_unlink_tmppath(sh);
+}
+
+/* Create a store handle for the rmtab file named by rmtab.
+ * Returns _gf_true when gf_store_handle_new() succeeded for it, _gf_false
+ * when no rmtab is configured or the handle could not be created. */
+static gf_boolean_t
+mount_open_rmtab(const char *rmtab, gf_store_handle_t **sh)
+{
+    /* updating the rmtab is disabled, use in-memory only */
+    if (rmtab == NULL || *rmtab == '\0')
+        return _gf_false;
+
+    if (gf_store_handle_new(rmtab, sh) != 0) {
+        gf_log(GF_MNT, GF_LOG_WARNING, "Failed to open '%s'", rmtab);
+        return _gf_false;
+    }
+
+    return _gf_true;
+}
+
+/* Read the rmtab into a clean ms->mountlist.
+ *
+ * The in-memory list is left untouched when no rmtab can be opened or when
+ * locking it fails.
+ */
+static void
+mount_read_rmtab(struct mount3_state *ms)
+{
+    struct nfs_state *nfs_priv = (struct nfs_state *)ms->nfsx->private;
+    gf_store_handle_t *handle = NULL;
+
+    /* No usable rmtab: keep working with the in-memory list only */
+    if (!mount_open_rmtab(nfs_priv->rmtab, &handle))
+        return;
+
+    if (gf_store_lock(handle) == 0) {
+        /* append is off, so the list is emptied before it is refilled */
+        __mount_read_rmtab(handle, &ms->mountlist, _gf_false);
+        gf_store_unlock(handle);
+    } else {
+        gf_msg(GF_MNT, GF_LOG_WARNING, 0, NFS_MSG_LOCK_FAIL,
+               "Failed to lock '%s'", nfs_priv->rmtab);
+    }
+
+    /* The handle is only needed for the duration of the read */
+    gf_store_handle_destroy(handle);
+}
+
+/* Write the ms->mountlist to the rmtab.
+ *
+ * The rmtab could be empty, or it can exist and have been updated by a
+ * different storage server without our knowing.
+ *
+ * 0. if opening the nfs->rmtab fails and there is no new_rmtab, return
+ * 1. takes the store_handle lock on the current rmtab
+ *    - blocks if another storage server rewrites the rmtab at the same time
+ * 2. [if new_rmtab] takes the store_handle lock on the new rmtab
+ * 3. reads/merges the entries from the current rmtab
+ * 4. [if new_rmtab] reads/merges the entries from the new rmtab
+ * 5. [if new_rmtab] writes the new rmtab
+ * 6. [if not new_rmtab] writes the current rmtab
+ * 7. [if new_rmtab] replaces nfs->rmtab to point to the new location
+ * 8. [if new_rmtab] releases the store_handle lock of the new rmtab
+ * 9. releases the store_handle lock of the old rmtab
+ */
+void
+mount_rewrite_rmtab(struct mount3_state *ms, char *new_rmtab)
+{
+    gf_store_handle_t *sh = NULL, *nsh = NULL;
+    struct nfs_state *nfs = NULL;
+    int ret;
+    char *rmtab = NULL;
+    gf_boolean_t got_old_rmtab = _gf_false;
+
+    nfs = (struct nfs_state *)ms->nfsx->private;
+
+    got_old_rmtab = mount_open_rmtab(nfs->rmtab, &sh);
+    if (!got_old_rmtab && !new_rmtab)
+        return;
+
+    if (got_old_rmtab && gf_store_lock(sh)) {
+        gf_msg(GF_MNT, GF_LOG_WARNING, 0, NFS_MSG_REWRITE_ERROR,
+               "Not rewriting '%s'", nfs->rmtab);
+        goto free_sh;
+    }
+
+    if (new_rmtab) {
+        ret = gf_store_handle_new(new_rmtab, &nsh);
+        if (ret) {
+            gf_msg(GF_MNT, GF_LOG_WARNING, 0, NFS_MSG_OPEN_FAIL,
+                   "Failed to open '%s'", new_rmtab);
+            goto unlock_sh;
+        }
+
+        if (gf_store_lock(nsh)) {
+            gf_msg(GF_MNT, GF_LOG_WARNING, 0, NFS_MSG_REWRITE_ERROR,
+                   "Not rewriting '%s'", new_rmtab);
+            goto free_nsh;
+        }
+    }
+
+    /* always read the currently used rmtab */
+    if (got_old_rmtab)
+        __mount_read_rmtab(sh, &ms->mountlist, _gf_true);
+
+    if (new_rmtab) {
+        /* read the new rmtab and write changes to the new location */
+        __mount_read_rmtab(nsh, &ms->mountlist, _gf_true);
+        __mount_rewrite_rmtab(ms, nsh);
+
+        /* replace the nfs->rmtab reference to the new rmtab */
+        rmtab = gf_strdup(new_rmtab);
+        if (rmtab == NULL) {
+            gf_msg(GF_MNT, GF_LOG_ERROR, errno, NFS_MSG_NO_MEMORY,
+                   "Out of memory, keeping %s as rmtab", nfs->rmtab);
+        } else {
+            GF_FREE(nfs->rmtab);
+            nfs->rmtab = rmtab;
+        }
+
+        gf_store_unlock(nsh);
+    } else {
+        /* rewrite the current (unchanged location) rmtab */
+        __mount_rewrite_rmtab(ms, sh);
+    }
+
+free_nsh:
+    if (new_rmtab)
+        gf_store_handle_destroy(nsh);
+unlock_sh:
+    if (got_old_rmtab)
+        gf_store_unlock(sh);
+free_sh:
+    if (got_old_rmtab)
+        gf_store_handle_destroy(sh);
+}
+
+/* Add a new NFS-client to the ms->mountlist and update the rmtab if we can.
+ *
+ * An NFS-client is only removed from the ms->mountlist when it sends an
+ * unmount request. A client that crashed, rebooted or lost its network may
+ * never unmount cleanly and would then be added a second time on its next
+ * mount; such duplicates (same hostname and export name) are not added.
+ *
+ * It is fully acceptable that the ms->mountlist is not 100% correct, this is a
+ * common issue for all(?) NFS-servers.
+ * Returns -1 on invalid arguments, allocation or peername failure.
+ */
+int
+mnt3svc_update_mountlist(struct mount3_state *ms, rpcsvc_request_t *req,
+                         const char *expname, const char *fullpath)
+{
+    struct mountentry *me = NULL;
+    struct mountentry *cur = NULL;
+    int ret = -1;
+    char *colon = NULL;
+    struct nfs_state *nfs = NULL;
+    gf_store_handle_t *sh = NULL;
+    gf_boolean_t update_rmtab = _gf_false;
+
+    if ((!ms) || (!req) || (!expname))
+        return -1;
+
+    me = (struct mountentry *)GF_CALLOC(1, sizeof(*me), gf_nfs_mt_mountentry);
+    if (!me)
+        return -1;
+
+    nfs = (struct nfs_state *)ms->nfsx->private;
+
+    update_rmtab = mount_open_rmtab(nfs->rmtab, &sh);
+
+    snprintf(me->exname, MNTPATHLEN, "%s", expname);
+    /* Sometimes we don't care about the full path so a NULL value for
+     * fullpath is valid. A path that does not fit is silently left out. */
+    if (fullpath) {
+        if (strlen(fullpath) < MNTPATHLEN) {
+            strcpy(me->fullpath, fullpath);
+            me->has_full_path = _gf_true;
+        }
+    }
+
+    INIT_LIST_HEAD(&me->mlist);
+    /* Must get the IP or hostname of the client so we can map it into the
+     * mount entry. Anything from the last ':' on is cut off below. */
+    ret = rpcsvc_transport_peername(req->trans, me->hostname, MNTPATHLEN);
+    if (ret == -1)
+        goto free_err;
+
+    colon = strrchr(me->hostname, ':');
+    if (colon) {
+        *colon = '\0';
+    }
+    LOCK(&ms->mountlock);
+    {
+        /* in case locking fails, we just don't write the rmtab */
+        if (update_rmtab && gf_store_lock(sh)) {
+            gf_msg(GF_MNT, GF_LOG_WARNING, 0, NFS_MSG_LOCK_FAIL,
+                   "Failed to lock '%s', changes will not be "
+                   "written",
+                   nfs->rmtab);
+        } else if (update_rmtab) {
+            /* NOTE(review): append is off, so every current list entry is
+             * freed here; confirm mountdict holds no pointers to them */
+            __mount_read_rmtab(sh, &ms->mountlist, _gf_false);
+        }
+
+        /* do not add duplicates */
+        list_for_each_entry(cur, &ms->mountlist, mlist)
+        {
+            if (!strcmp(cur->hostname, me->hostname) &&
+                !strcmp(cur->exname, me->exname)) {
+                GF_FREE(me);
+                goto dont_add;
+            }
+        }
+        list_add_tail(&me->mlist, &ms->mountlist);
+        __mountdict_insert(ms, me);
+
+        /* only write the rmtab in case it was locked */
+        if (update_rmtab && gf_store_locked_local(sh))
+            __mount_rewrite_rmtab(ms, sh);
+    }
+dont_add:
+    if (update_rmtab && gf_store_locked_local(sh))
+        gf_store_unlock(sh);
+
+    UNLOCK(&ms->mountlock);
+
+free_err:
+    if (update_rmtab)
+        gf_store_handle_destroy(sh);
+
+    if (ret == -1)
+        GF_FREE(me);
+
+    return ret;
+}
+
+/* Copy the volume id of the export backed by mntxl into volumeid.
+ * Returns 0 when such an export is on the export list, -1 otherwise. */
+int
+__mnt3_get_volume_id(struct mount3_state *ms, xlator_t *mntxl, uuid_t volumeid)
+{
+    struct mnt3_export *cur_exp = NULL;
+    int found = -1;
+
+    if (ms == NULL || mntxl == NULL)
+        return -1;
+
+    LOCK(&ms->mountlock);
+    list_for_each_entry(cur_exp, &ms->exportlist, explist)
+    {
+        if (cur_exp->vol != mntxl)
+            continue;
+        gf_uuid_copy(volumeid, cur_exp->volumeid);
+        found = 0;
+        break;
+    }
+    UNLOCK(&ms->mountlock);
+    return found;
+}
+
+/* Derive a mount id from an export path: the path, without its leading
+ * slashes, is hashed and the 32-bit hash is copied into the first bytes of
+ * the cleared mountid. Returns 0 on success, -1 otherwise. */
+int
+__mnt3_build_mountid_from_path(const char *path, uuid_t mountid)
+{
+    uint32_t path_hash = 0;
+
+    if (path == NULL)
+        return -1;
+
+    /* Leading slashes do not take part in the hash */
+    while (*path == '/')
+        path++;
+
+    gf_uuid_clear(mountid);
+
+    path_hash = SuperFastHash(path, strlen(path));
+    if (path_hash == 1) {
+        gf_msg(GF_MNT, GF_LOG_WARNING, 0, NFS_MSG_HASH_PATH_FAIL,
+               "failed to hash path: %s", path);
+        return -1;
+    }
+
+    memcpy(mountid, &path_hash, sizeof(path_hash));
+    return 0;
+}
+
+/* Derive a mount id from the xlator's name: the name is hashed and the
+ * 32-bit hash is copied into the first bytes of the cleared mountid.
+ * Returns 0 on success, -1 when the hash value is rejected. */
+int
+__mnt3_get_mount_id(xlator_t *mntxl, uuid_t mountid)
+{
+    uint32_t name_hash = 0;
+
+    /* first clear the mountid */
+    gf_uuid_clear(mountid);
+
+    name_hash = SuperFastHash(mntxl->name, strlen(mntxl->name));
+    if (name_hash == 1) {
+        gf_msg(GF_MNT, GF_LOG_WARNING, 0, NFS_MSG_HASH_XLATOR_FAIL,
+               "failed to hash xlator name: %s", mntxl->name);
+        return -1;
+    }
+
+    memcpy(mountid, &name_hash, sizeof(name_hash));
+    return 0;
+}
+
+/* Lookup callback of a volume mount: builds the root fh and sends the reply */
+int32_t
+mnt3svc_lookup_mount_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+                         int32_t op_ret, int32_t op_errno, inode_t *inode,
+                         struct iatt *buf, dict_t *xattr,
+                         struct iatt *postparent)
+{
+    mountres3 res = {
+        0,
+    };
+    rpcsvc_request_t *req = NULL;
+    struct nfs3_fh fh = {
+        {0},
+    };
+    struct mount3_state *ms = NULL;
+    mountstat3 status = 0;
+    int autharr[10];
+    int autharrlen = 0;
+    rpcsvc_t *svc = NULL;
+    xlator_t *mntxl = NULL;
+    uuid_t volumeid = {
+        0,
+    };
+    char *path = NULL;
+    uuid_t mountid = {
+        1,
+    };
+    char fhstr[1536];
+    int alloclen = 0;
+
+    req = (rpcsvc_request_t *)frame->local;
+    if (!req)
+        return -1;
+
+    mntxl = (xlator_t *)cookie;
+    ms = (struct mount3_state *)rpcsvc_request_program_private(req);
+    if (!ms) {
+        gf_msg(GF_MNT, GF_LOG_ERROR, EINVAL, NFS_MSG_MNT_STATE_NOT_FOUND,
+               "mount state not found");
+        op_ret = -1;
+        op_errno = EINVAL;
+    }
+    if (op_ret == -1) {
+        gf_msg(GF_NFS, GF_LOG_ERROR, op_errno, NFS_MSG_LOOKUP_MNT_ERROR,
+               "error=%s", strerror(op_errno));
+        status = mnt3svc_errno_to_mnterr(op_errno);
+    }
+    if (status != MNT3_OK)
+        goto xmit_res;
+
+    alloclen = strlen(mntxl->name) + 2;
+    path = GF_MALLOC(alloclen, gf_nfs_mt_char);
+    if (!path) {
+        gf_msg(GF_MNT, GF_LOG_ERROR, ENOMEM, NFS_MSG_NO_MEMORY,
+               "Memory allocation failed.");
+        /* Never answer MNT3_OK together with the all-zero fh */
+        status = MNT3ERR_SERVERFAULT;
+        goto xmit_res;
+    }
+
+    snprintf(path, alloclen, "/%s", mntxl->name);
+    mnt3svc_update_mountlist(ms, req, path, NULL);
+    GF_FREE(path);
+    if (gf_nfs_dvm_off(nfs_state(ms->nfsx))) {
+        fh = nfs3_fh_build_indexed_root_fh(ms->nfsx->children, mntxl);
+        goto xmit_res;
+    }
+
+    __mnt3_get_mount_id(mntxl, mountid);
+    __mnt3_get_volume_id(ms, mntxl, volumeid);
+    fh = nfs3_fh_build_uuid_root_fh(volumeid, mountid);
+
+xmit_res:
+    nfs3_fh_to_str(&fh, fhstr, sizeof(fhstr));
+    gf_msg_debug(GF_MNT, 0, "MNT reply: fh %s, status: %d", fhstr, status);
+    if (op_ret == 0) {
+        svc = rpcsvc_request_service(req);
+        autharrlen = rpcsvc_auth_array(svc, mntxl->name, autharr, 10);
+    }
+
+    res = mnt3svc_set_mountres3(status, &fh, autharr, autharrlen);
+    mnt3svc_submit_reply(req, (void *)&res,
+                         (mnt3_serializer)xdr_serialize_mountres3);
+    return 0;
+}
+
+/* Check whether the directory path of a mount request names an export.
+ * A trailing slash on dirpath is ignored. With export_parsing_match set,
+ * only the first path component of dirpath is compared.
+ * Returns 1 on a match, 0 otherwise (also for NULL arguments). */
+int
+mnt3_match_dirpath_export(const char *expname, const char *dirpath,
+                          gf_boolean_t export_parsing_match)
+{
+    size_t dlen;
+    char *fullpath = NULL;
+    char *second_slash = NULL;
+    char *dirdup = NULL;
+
+    if ((!expname) || (!dirpath))
+        return 0;
+
+    dirdup = strdupa(dirpath);
+
+    /* Some clients send a dirpath for mount that includes the slash at the
+     * end. String compare for searching the export will fail because our
+     * exports list does not include that slash. Remove the slash to
+     * compare.
+     */
+    dlen = strlen(dirdup);
+    if (dlen && dirdup[dlen - 1] == '/')
+        dirdup[dlen - 1] = '\0';
+
+    /* Here we try to match fullpaths with export names */
+    fullpath = dirdup;
+
+    if (export_parsing_match) {
+        if (dirdup[0] == '/')
+            fullpath = dirdup + 1;
+
+        second_slash = strchr(fullpath, '/');
+        if (second_slash)
+            *second_slash = '\0';
+    }
+
+    /* The export name normally begins with a slash; skip it when the
+     * fullpath has none so both are compared without it. Only a slash that
+     * is really there is skipped, so an empty or slash-less export name is
+     * never advanced past its first character or terminator. */
+    if (fullpath[0] != '/' && expname[0] == '/')
+        expname++;
+
+    return (strcmp(expname, fullpath) == 0) ? 1 : 0;
+}
+
+/* Send the lookup that services a mount request for @exportinode on @xl.
+ * The reply to the client is produced by mnt3svc_lookup_mount_cbk.
+ *
+ * Returns -EFAULT on a NULL argument, the negative value reported by
+ * nfs_inode_loc_fill() when the loc cannot be built, otherwise whatever
+ * nfs_lookup() returns.
+ */
+int
+mnt3svc_mount_inode(rpcsvc_request_t *req, struct mount3_state *ms,
+                    xlator_t *xl, inode_t *exportinode)
+{
+    loc_t exportloc = {0};
+    nfs_user_t nfu = {0};
+    int ret = -EFAULT;
+
+    if (!req || !xl || !ms || !exportinode)
+        return ret;
+
+    ret = nfs_inode_loc_fill(exportinode, &exportloc, NFS_RESOLVE_EXIST);
+    if (ret < 0) {
+        gf_msg(GF_MNT, GF_LOG_ERROR, ret, NFS_MSG_INODE_LOC_FILL_ERROR,
+               "Loc fill failed for export inode"
+               ": gfid %s, volume: %s",
+               uuid_utoa(exportinode->gfid), xl->name);
+        return ret;
+    }
+
+    /* A lookup on the export inode is all that is needed: the stat it
+     * returns is used to build the root fh handed to the client.
+     */
+    nfs_request_user_init(&nfu, req);
+    ret = nfs_lookup(ms->nfsx, xl, &nfu, &exportloc, mnt3svc_lookup_mount_cbk,
+                     (void *)req);
+
+    nfs_loc_wipe(&exportloc);
+    return ret;
+}
+
+/* Mount of a whole volume: find the root inode of the exported volume and
+ * send a lookup on it. The mount reply, including the file handle, is sent
+ * from the lookup callback.
+ *
+ * Returns -EFAULT on a NULL argument, -ENOENT when the root inode is not in
+ * the inode table, otherwise the result of mnt3svc_mount_inode().
+ */
+int
+mnt3svc_volume_mount(rpcsvc_request_t *req, struct mount3_state *ms,
+                     struct mnt3_export *exp)
+{
+    static uuid_t rootgfid = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1};
+    inode_t *rootinode = NULL;
+    int ret = -EFAULT;
+
+    if (!req || !exp || !ms)
+        return ret;
+
+    rootinode = inode_find(exp->vol->itable, rootgfid);
+    if (rootinode == NULL) {
+        gf_msg(GF_MNT, GF_LOG_ERROR, ENOENT, NFS_MSG_GET_ROOT_INODE_FAIL,
+               "Failed to get root inode");
+        return -ENOENT;
+    }
+
+    ret = mnt3svc_mount_inode(req, ms, exp->vol, rootinode);
+    /* Release the inode obtained from inode_find(). */
+    inode_unref(rootinode);
+
+    return ret;
+}
+
+/* Split an export path of the form [/]<volume>[/<subdir>...].
+ *
+ * The first component of a directory export is the volume name, while every
+ * lookup needed to build the directory's file handle has to start from the
+ * root of that volume, so the volume name is stripped here.
+ *
+ * @dirpath: path to split, must not be NULL.
+ * @volname: optional; when both @volname and *@volname are non-NULL the
+ *           volume name is copied into *@volname and NUL terminated. No
+ *           size is passed in, so the caller must provide a buffer large
+ *           enough for the first component.
+ *
+ * Returns a pointer to the first '/' after the volume name inside @dirpath,
+ * or a pointer to a static "/" when there is no sub-directory part.
+ */
+char *
+mnt3_get_volume_subdir(char *dirpath, char **volname)
+{
+    static char *root = "/";
+    char *slash = NULL;
+    int namelen = 0;
+
+    /* all callers are expected to pass a valid *dirpath */
+    GF_ASSERT(dirpath);
+
+    /* From here on dirpath points at the first char of the volume name. */
+    if (*dirpath == '/')
+        dirpath++;
+
+    slash = index(dirpath, (int)'/');
+    if (slash)
+        namelen = slash - dirpath;
+    else
+        namelen = strlen(dirpath);
+
+    if (volname && *volname) {
+        strncpy(*volname, dirpath, namelen);
+        (*volname)[namelen] = '\0';
+    }
+
+    return slash ? slash : root;
+}
+
+/* Release a resolve state: wipe the loc it carries, then free the state
+ * itself. A NULL @mres is ignored.
+ */
+void
+mnt3_resolve_state_wipe(mnt3_resolve_t *mres)
+{
+    if (mres != NULL) {
+        nfs_loc_wipe(&mres->resolveloc);
+        GF_FREE(mres);
+    }
+}
+
+/* Sets up the component argument to contain the next component in the path and
+ * sets up path as an absolute path starting from the next component.
+ *
+ * @path:      in/out, absolute path still to be resolved. On return it holds
+ *             what follows the extracted component, or "" when that was the
+ *             last one.
+ * @plen:      size of the @path buffer.
+ * @component: scratch buffer that receives a copy of @path; the returned
+ *             pointer points into it.
+ * @clen:      size of the @component buffer.
+ *
+ * Returns a pointer to the NUL terminated component name (without its leading
+ * slash), or NULL when an argument is invalid or @path contains no '/'.
+ */
+static char *
+setup_next_component(char *path, size_t plen, char *component, size_t clen)
+{
+    char *comp = NULL;
+    char *nextcomp = NULL;
+
+    if ((!path) || (!component) || (plen == 0) || (clen == 0))
+        return NULL;
+
+    /* strncpy() leaves the destination unterminated when the source fills
+     * the whole buffer, so terminate explicitly before scanning it.
+     */
+    strncpy(component, path, clen);
+    component[clen - 1] = '\0';
+
+    comp = index(component, (int)'/');
+    if (!comp)
+        return NULL;
+
+    /* Skip the slash that introduces the component. */
+    comp++;
+    nextcomp = index(comp, (int)'/');
+    if (nextcomp) {
+        /* nextcomp lives in the component buffer, so copying it back into
+         * path never overlaps.
+         */
+        strncpy(path, nextcomp, plen);
+        path[plen - 1] = '\0';
+        *nextcomp = '\0';
+    } else {
+        path[0] = '\0';
+    }
+
+    return comp;
+}
+
+int32_t
+mnt3_resolve_subdir_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+                        int32_t op_ret, int32_t op_errno, inode_t *inode,
+                        struct iatt *buf, dict_t *xattr,
+                        struct iatt *postparent);
+
+int32_t
+mnt3_readlink_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+                  int32_t op_ret, int32_t op_errno, const char *path,
+                  struct iatt *buf, dict_t *xdata);
+
+/* There are multiple components in the directory export path and each one
+ * needs to be looked up one after the other.
+ *
+ * This resolves the next component left in mres->remainingdir, using the
+ * inode currently held in mres->resolveloc as its parent, and sends either a
+ * readlink (when the entry is a symlink) or a lookup for it. The request
+ * continues in mnt3_readlink_cbk or mnt3_resolve_subdir_cbk.
+ *
+ * Returns -EFAULT when @mres is NULL, when no further component can be
+ * extracted or when the loc cannot be built; otherwise the return value of
+ * nfs_readlink()/nfs_lookup().
+ */
+int
+__mnt3_resolve_export_subdir_comp(mnt3_resolve_t *mres)
+{
+    char dupsubdir[MNTPATHLEN];
+    char *nextcomp = NULL;
+    int ret = -EFAULT;
+    nfs_user_t nfu = {
+        0,
+    };
+    uuid_t gfid = {
+        0,
+    };
+
+    if (!mres)
+        return ret;
+
+    /* nextcomp points into dupsubdir; remainingdir is advanced in place. */
+    nextcomp = setup_next_component(mres->remainingdir,
+                                    sizeof(mres->remainingdir), dupsubdir,
+                                    sizeof(dupsubdir));
+    if (!nextcomp)
+        goto err;
+
+    /* Wipe the contents of the previous component */
+    /* The gfid has to be saved first: it identifies the parent of the next
+     * component and the wipe below releases the loc that holds it.
+     */
+    gf_uuid_copy(gfid, mres->resolveloc.inode->gfid);
+    nfs_loc_wipe(&mres->resolveloc);
+    ret = nfs_entry_loc_fill(mres->mstate->nfsx, mres->exp->vol->itable, gfid,
+                             nextcomp, &mres->resolveloc, NFS_RESOLVE_CREATE,
+                             NULL);
+    /* NOTE(review): -2 is tolerated here; presumably nfs_entry_loc_fill()
+     * uses it for "parent found, entry not yet known" -- confirm against its
+     * definition.
+     */
+    if ((ret < 0) && (ret != -2)) {
+        gf_msg(GF_MNT, GF_LOG_ERROR, EFAULT, NFS_MSG_RESOLVE_INODE_FAIL,
+               "Failed to resolve and "
+               "create inode: parent gfid %s, entry %s",
+               uuid_utoa(gfid), nextcomp);
+        ret = -EFAULT;
+        goto err;
+    }
+
+    nfs_request_user_init(&nfu, mres->req);
+    if (IA_ISLNK(mres->resolveloc.inode->ia_type)) {
+        /* A symlink is not looked up: its target is read and the path is
+         * parsed again from mnt3_readlink_cbk.
+         */
+        ret = nfs_readlink(mres->mstate->nfsx, mres->exp->vol, &nfu,
+                           &mres->resolveloc, mnt3_readlink_cbk, mres);
+        gf_msg_debug(GF_MNT, 0,
+                     "Symlink found , need to resolve"
+                     " into directory handle");
+        goto err;
+    }
+    ret = nfs_lookup(mres->mstate->nfsx, mres->exp->vol, &nfu,
+                     &mres->resolveloc, mnt3_resolve_subdir_cbk, mres);
+
+err:
+    return ret;
+}
+
+int
+__mnt3_resolve_subdir(mnt3_resolve_t *mres);
+
+/*
+ * Per the AFR2 comments, this function performs the "fresh" lookup
+ * by deleting the inode from cache and calling __mnt3_resolve_subdir
+ * again.
+ *
+ * Resolution restarts from the volume root, so the path to walk again is the
+ * path of the component that just failed (mres->resolveloc.path) followed by
+ * the components that were still pending in mres->remainingdir.
+ *
+ * Returns -ENAMETOOLONG when that path does not fit in mres->remainingdir,
+ * otherwise the return value of __mnt3_resolve_subdir(). No mount reply is
+ * sent from here on failure.
+ */
+int
+__mnt3_fresh_lookup(mnt3_resolve_t *mres)
+{
+    char restart[sizeof(mres->remainingdir)];
+    int len = 0;
+
+    inode_unlink(mres->resolveloc.inode, mres->resolveloc.parent,
+                 mres->resolveloc.name);
+
+    /* Build the restart path in a scratch buffer: remainingdir is both an
+     * input and the destination. snprintf() bounds and terminates the
+     * result, which a strncpy() limited to strlen(source) does not do.
+     */
+    len = snprintf(restart, sizeof(restart), "%s%s", mres->resolveloc.path,
+                   mres->remainingdir);
+
+    /* resolveloc.path was consumed above, the loc can be released now. */
+    nfs_loc_wipe(&mres->resolveloc);
+
+    if ((len < 0) || ((size_t)len >= sizeof(restart)))
+        return -ENAMETOOLONG;
+
+    memcpy(mres->remainingdir, restart, (size_t)len + 1);
+    return __mnt3_resolve_subdir(mres);
+}
+
+/* Lookup callback for one component of a directory export.
+ *
+ * On ESTALE the component is dropped from the inode cache and the whole
+ * resolution is restarted. On any other lookup failure an error reply is
+ * sent. On success the file handle of the component is built from the parent
+ * handle; when no component is left the mount is authenticated, recorded in
+ * the mount list and answered, otherwise the next component is resolved.
+ *
+ * A reply is sent, and @mres released, whenever op_ret is -1 at the "err"
+ * label; every path that must answer the client therefore sets op_ret to -1.
+ *
+ * @cookie is used as the xlator whose name selects the auth flavours placed
+ * in the reply.
+ *
+ * Returns 0, or the result of __mnt3_fresh_lookup() on the ESTALE path.
+ */
+int32_t
+mnt3_resolve_subdir_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+                        int32_t op_ret, int32_t op_errno, inode_t *inode,
+                        struct iatt *buf, dict_t *xattr,
+                        struct iatt *postparent)
+{
+    mnt3_resolve_t *mres = NULL;
+    mountstat3 mntstat = MNT3ERR_SERVERFAULT;
+    struct nfs3_fh fh = {
+        {0},
+    };
+    int autharr[10];
+    int autharrlen = 0;
+    rpcsvc_t *svc = NULL;
+    mountres3 res = {
+        0,
+    };
+    xlator_t *mntxl = NULL;
+    char *path = NULL;
+    struct mount3_state *ms = NULL;
+    int authcode = 0;
+    char *authorized_host = NULL;
+    char *authorized_path = NULL;
+    inode_t *linked_inode = NULL;
+
+    mres = frame->local;
+    ms = mres->mstate;
+    mntxl = (xlator_t *)cookie;
+    if (op_ret == -1 && op_errno == ESTALE) {
+        /* Nuke inode from cache and try the LOOKUP
+         * request again. */
+        return __mnt3_fresh_lookup(mres);
+    } else if (op_ret == -1) {
+        gf_msg(GF_NFS, GF_LOG_ERROR, op_errno, NFS_MSG_RESOLVE_SUBDIR_FAIL,
+               "path=%s (%s)", mres->resolveloc.path, strerror(op_errno));
+        mntstat = mnt3svc_errno_to_mnterr(op_errno);
+        goto err;
+    }
+
+    /* NOTE(review): the inode returned by inode_link() is never released in
+     * this function -- confirm whether a reference is held and leaked.
+     */
+    linked_inode = inode_link(mres->resolveloc.inode, mres->resolveloc.parent,
+                              mres->resolveloc.name, buf);
+
+    if (linked_inode)
+        nfs_fix_generation(this, linked_inode);
+
+    nfs3_fh_build_child_fh(&mres->parentfh, buf, &fh);
+    if (strlen(mres->remainingdir) == 0) {
+        /* Last component: this branch always answers the client. */
+        int alloclen = 0;
+        int resolveloc_path_len = strlen(mres->resolveloc.path);
+
+        op_ret = -1;
+        mntstat = MNT3_OK;
+
+        /* Construct the full path */
+        /* NOTE(review): exp is shared between requests and any previous
+         * exp->fullpath is overwritten here without being freed -- confirm
+         * who owns that string.
+         */
+        alloclen = strlen(mres->exp->expname) + resolveloc_path_len + 1;
+        mres->exp->fullpath = GF_MALLOC(alloclen, gf_nfs_mt_char);
+        if (!mres->exp->fullpath) {
+            gf_msg(GF_MNT, GF_LOG_ERROR, ENOMEM, NFS_MSG_NO_MEMORY,
+                   "Memory allocation failed.");
+            /* Do not report success for a mount that was never set up. */
+            mntstat = MNT3ERR_SERVERFAULT;
+            goto err;
+        }
+        snprintf(mres->exp->fullpath, alloclen, "%s%s", mres->exp->expname,
+                 mres->resolveloc.path);
+
+        /* Check if this path is authorized to be mounted */
+        authcode = mnt3_authenticate_request(
+            ms, mres->req, NULL, NULL, mres->exp->fullpath, &authorized_path,
+            &authorized_host, FALSE);
+        if (authcode != 0) {
+            mntstat = MNT3ERR_ACCES;
+            gf_msg_debug(GF_MNT, 0, "Client mount not allowed");
+            op_ret = -1;
+            goto err;
+        }
+
+        alloclen = strlen(mres->exp->vol->name) + resolveloc_path_len + 2;
+        path = GF_MALLOC(alloclen, gf_nfs_mt_char);
+        if (!path) {
+            gf_msg(GF_MNT, GF_LOG_ERROR, ENOMEM, NFS_MSG_NO_MEMORY,
+                   "Memory allocation failed");
+            /* The mount id and mount list entry were not created, so the
+             * client must not receive MNT3_OK.
+             */
+            mntstat = MNT3ERR_SERVERFAULT;
+            goto err;
+        }
+        /* Build mountid from the authorized path and stick it in the
+         * filehandle that will get passed back to the client
+         */
+        __mnt3_build_mountid_from_path(authorized_path, fh.mountid);
+
+        snprintf(path, alloclen, "/%s%s", mres->exp->vol->name,
+                 mres->resolveloc.path);
+
+        mnt3svc_update_mountlist(mres->mstate, mres->req, path,
+                                 mres->exp->fullpath);
+        GF_FREE(path);
+    } else {
+        /* More components to go: the handle just built becomes the parent
+         * handle of the next one.
+         */
+        mres->parentfh = fh;
+        op_ret = __mnt3_resolve_export_subdir_comp(mres);
+        if (op_ret < 0) {
+            mntstat = mnt3svc_errno_to_mnterr(-op_ret);
+            /* The callee returns negative errno values such as -EFAULT;
+             * normalise to -1 so that the error reply below is sent and
+             * mres is released instead of leaving the client waiting.
+             */
+            op_ret = -1;
+        }
+    }
+err:
+    if (op_ret == -1) {
+        gf_msg_debug(GF_MNT, 0, "Mount reply status: %d", mntstat);
+        svc = rpcsvc_request_service(mres->req);
+        autharrlen = rpcsvc_auth_array(svc, mntxl->name, autharr, 10);
+
+        res = mnt3svc_set_mountres3(mntstat, &fh, autharr, autharrlen);
+        mnt3svc_submit_reply(mres->req, (void *)&res,
+                             (mnt3_serializer)xdr_serialize_mountres3);
+        mnt3_resolve_state_wipe(mres);
+    }
+
+    GF_FREE(authorized_path);
+    GF_FREE(authorized_host);
+
+    return 0;
+}
+
+/* Readlink callback: resolves a symbolic link found in the export path into
+ * a directory path and restarts the parsing of the mount path from the
+ * beginning.
+ *
+ * Note : The path stored in the symlink must be relative to the symlink,
+ *        because that is the only form which is consistent throughout the
+ *        file system; a target starting with '/' is rejected.
+ *        If the symlink resolves into another symlink, the same process
+ *        is repeated.
+ *        Symbolic links that point outside the file system are not
+ *        considered here.
+ *
+ * TODO : 1.) This function cannot handle symlinks pointing to a path which
+ *            goes out of the filesystem and comes back into it again.
+ *            For example, consider vol is the exported volume. It contains
+ *            dir,
+ *            symlink1 which points to ../vol/dir,
+ *            symlink2 which points to ../mnt/../vol/dir,
+ *            symlink1 and symlink2 are not handled right now.
+ *
+ *        2.) The udp mount routine is much simpler than the tcp routine and
+ *            resolves symlinks directly. Maybe it is better to change this
+ *            routine to work like the udp one.
+ *
+ * Returns the result of mnt3_parse_dir_exports() for the resolved path, or a
+ * negative errno value after an error reply was sent through the "mnterr"
+ * path.
+ */
+int32_t
+mnt3_readlink_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+                  int32_t op_ret, int32_t op_errno, const char *path,
+                  struct iatt *buf, dict_t *xdata)
+{
+    mnt3_resolve_t *mres = NULL;
+    int ret = -EFAULT;
+    char *real_loc = NULL;
+    size_t path_len = 0;
+    size_t parent_path_len = 0;
+    char *parent_path = NULL;
+    char *absolute_path = NULL;
+    char *relative_path = NULL;
+    int mntstat = 0;
+
+    GF_ASSERT(frame);
+
+    mres = frame->local;
+    /* Failed readlink, missing target or absolute target: reply with the
+     * error derived from the initial ret (-EFAULT).
+     */
+    if (!mres || !path || (path[0] == '/') || (op_ret < 0))
+        goto mnterr;
+
+    /* Finding current location of symlink */
+    /* i.e. resolveloc.path with the trailing entry name cut off. */
+    parent_path_len = strlen(mres->resolveloc.path) -
+                      strlen(mres->resolveloc.name);
+    parent_path = gf_strndup(mres->resolveloc.path, parent_path_len);
+    if (!parent_path) {
+        ret = -ENOMEM;
+        goto mnterr;
+    }
+
+    relative_path = gf_strdup(path);
+    if (!relative_path) {
+        ret = -ENOMEM;
+        goto mnterr;
+    }
+    /* Resolving into absolute path */
+    ret = gf_build_absolute_path(parent_path, relative_path, &absolute_path);
+    if (ret < 0) {
+        gf_msg(GF_MNT, GF_LOG_ERROR, ret, NFS_MSG_RESOLVE_SYMLINK_ERROR,
+               "Cannot resolve symlink, path is out of boundary "
+               "from current location %s and with relative path "
+               "%s pointed by symlink",
+               parent_path, relative_path);
+
+        goto mnterr;
+    }
+
+    /* Building the actual mount path to be mounted */
+    /* Sized for <volume name><absolute path><remaining dir> plus the NUL. */
+    path_len = strlen(mres->exp->vol->name) + strlen(absolute_path) +
+               strlen(mres->remainingdir) + 1;
+    real_loc = GF_MALLOC(path_len, gf_nfs_mt_char);
+    if (!real_loc) {
+        ret = -ENOMEM;
+        goto mnterr;
+    }
+    snprintf(real_loc, path_len, "%s%s", mres->exp->vol->name, absolute_path);
+    gf_path_strip_trailing_slashes(real_loc);
+
+    /* There may entries after symlink in the mount path,
+     * we should include remaining entries too */
+    if (strlen(mres->remainingdir) > 0)
+        strcat(real_loc, mres->remainingdir);
+
+    gf_msg_debug(GF_MNT, 0,
+                 "Resolved path is : %s%s "
+                 "and actual mount path is %s",
+                 absolute_path, mres->remainingdir, real_loc);
+
+    /* After the resolving the symlink , parsing should be done
+     * for the populated mount path
+     */
+    ret = mnt3_parse_dir_exports(mres->req, mres->mstate, real_loc, _gf_true);
+
+    /* NOTE(review): a failure here is only logged; no error reply is sent
+     * from this function and mres itself is not released on this path --
+     * confirm whether the client is answered elsewhere.
+     */
+    if (ret) {
+        gf_msg(GF_MNT, GF_LOG_ERROR, 0, NFS_MSG_RESOLVE_ERROR,
+               "Resolved into an unknown path %s%s "
+               "from the current location of symlink %s",
+               absolute_path, mres->remainingdir, parent_path);
+    }
+
+    GF_FREE(real_loc);
+    GF_FREE(absolute_path);
+    GF_FREE(parent_path);
+    GF_FREE(relative_path);
+
+    return ret;
+
+mnterr:
+    /* real_loc is always NULL when this label is reached, so only the three
+     * other strings need to be released.
+     */
+    if (mres) {
+        mntstat = mnt3svc_errno_to_mnterr(-ret);
+        mnt3svc_mnt_error_reply(mres->req, mntstat);
+    } else
+        gf_msg(GF_MNT, GF_LOG_CRITICAL, EINVAL, NFS_MSG_INVALID_ENTRY,
+               "mres == NULL, this should *never* happen");
+    if (absolute_path)
+        GF_FREE(absolute_path);
+    if (parent_path)
+        GF_FREE(parent_path);
+    if (relative_path)
+        GF_FREE(relative_path);
+    return ret;
+}
+
+/* We will always have to perform a hard lookup on all the components of a
+ * directory export for a mount request because in the mount reply we need the
+ * file handle of the directory. Our file handle creation code is designed with
+ * the assumption that to build a child file/dir fh, we'll always have the
+ * parent dir's fh available so that we may copy the hash array of the previous
+ * dir levels.
+ *
+ * Since we do not store the file handles anywhere, for every mount request we
+ * must resolve the file handles of every component so that the parent dir file
+ * of the exported directory can be built.
+ *
+ * This function starts that walk: it extracts the first component of
+ * mres->remainingdir, resolves it against the root gfid of the volume and
+ * sends a readlink (for a symlink) or a lookup for it.
+ *
+ * Returns -EFAULT when @mres is NULL, when no component can be extracted or
+ * when the loc cannot be built; otherwise the return value of
+ * nfs_readlink()/nfs_lookup().
+ */
+int
+__mnt3_resolve_subdir(mnt3_resolve_t *mres)
+{
+    char dupsubdir[MNTPATHLEN];
+    char *firstcomp = NULL;
+    int ret = -EFAULT;
+    nfs_user_t nfu = {
+        0,
+    };
+    static uuid_t rootgfid = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1};
+
+    if (!mres)
+        return ret;
+
+    /* firstcomp points into dupsubdir; remainingdir is advanced in place. */
+    firstcomp = setup_next_component(mres->remainingdir,
+                                     sizeof(mres->remainingdir), dupsubdir,
+                                     sizeof(dupsubdir));
+    if (!firstcomp)
+        goto err;
+
+    ret = nfs_entry_loc_fill(mres->mstate->nfsx, mres->exp->vol->itable,
+                             rootgfid, firstcomp, &mres->resolveloc,
+                             NFS_RESOLVE_CREATE, NULL);
+    /* NOTE(review): -2 is tolerated here; presumably nfs_entry_loc_fill()
+     * uses it for "parent found, entry not yet known" -- confirm against its
+     * definition.
+     */
+    if ((ret < 0) && (ret != -2)) {
+        gf_msg(GF_MNT, GF_LOG_ERROR, EFAULT, NFS_MSG_RESOLVE_INODE_FAIL,
+               "Failed to resolve and "
+               "create inode for volume root: %s",
+               mres->exp->vol->name);
+        ret = -EFAULT;
+        goto err;
+    }
+
+    nfs_request_user_init(&nfu, mres->req);
+    if (IA_ISLNK(mres->resolveloc.inode->ia_type)) {
+        /* A symlink is not looked up: its target is read and the path is
+         * parsed again from mnt3_readlink_cbk.
+         */
+        ret = nfs_readlink(mres->mstate->nfsx, mres->exp->vol, &nfu,
+                           &mres->resolveloc, mnt3_readlink_cbk, mres);
+        gf_msg_debug(GF_MNT, 0,
+                     "Symlink found , need to resolve "
+                     "into directory handle");
+        goto err;
+    }
+    ret = nfs_lookup(mres->mstate->nfsx, mres->exp->vol, &nfu,
+                     &mres->resolveloc, mnt3_resolve_subdir_cbk, mres);
+
+err:
+    return ret;
+}
+
+/* Walk an addrinfo list and report whether any AF_INET entry matches @saddr
+ * under @mask, as decided by mask_match(). Entries of another family are
+ * skipped.
+ *
+ * Returns _gf_true on the first match, _gf_false when the list is exhausted.
+ */
+static gf_boolean_t
+mnt3_match_subnet_v4(struct addrinfo *ai, uint32_t saddr, uint32_t mask)
+{
+    struct sockaddr_in *candidate = NULL;
+
+    while (ai != NULL) {
+        candidate = (struct sockaddr_in *)ai->ai_addr;
+
+        if ((candidate->sin_family == AF_INET) &&
+            mask_match(saddr, candidate->sin_addr.s_addr, mask))
+            return _gf_true;
+
+        ai = ai->ai_next;
+    }
+
+    return _gf_false;
+}
+
+/**
+ * Decide whether a client may mount a directory export by comparing the
+ * client's IPv4 address with every host/subnet entry of the export's
+ * hostspec list. Each entry is resolved with getaddrinfo() and all of the
+ * returned addresses are tried.
+ *
+ * @param client_addr - This structure contains client's IP address.
+ * @param export - mnt3_export structure. Contains allowed IP list/range.
+ *
+ * @return 0 - on Success and -EACCES on failure.
+ *
+ * TODO: Support IPv6 subnetwork
+ */
+int
+mnt3_verify_auth(struct sockaddr_in *client_addr, struct mnt3_export *export)
+{
+    struct addrinfo hint = {
+        .ai_family = AF_INET,
+        .ai_protocol = (int)IPPROTO_TCP,
+        .ai_flags = AI_CANONNAME,
+    };
+    struct addrinfo *resolved = NULL;
+    struct host_auth_spec *spec = NULL;
+    gf_boolean_t finished = _gf_false;
+    int gai_ret = 0;
+    int verdict = -EACCES;
+
+    /* Sanity check */
+    if (!client_addr || !export || !export->hostspec) {
+        gf_msg(GF_MNT, GF_LOG_ERROR, EINVAL, NFS_MSG_INVALID_ENTRY,
+               "Invalid argument");
+        return verdict;
+    }
+
+    /*
+     * Currently IPv4 subnetwork is supported i.e. AF_INET.
+     * TODO: IPv6 subnetwork i.e. AF_INET6.
+     */
+    if (client_addr->sin_family != AF_INET) {
+        gf_msg(GF_MNT, GF_LOG_ERROR, EAFNOSUPPORT, NFS_MSG_UNSUPPORTED_VERSION,
+               "Only IPv4 is supported for subdir-auth");
+        return verdict;
+    }
+
+    /* Try every allowed host until one matches the client. */
+    for (spec = export->hostspec; spec != NULL; spec = spec->next) {
+        GF_ASSERT(spec->host_addr);
+
+        gai_ret = getaddrinfo(spec->host_addr, NULL, &hint, &resolved);
+        if (gai_ret != 0) {
+            /*
+             * Resolution failed for this entry; keep searching the rest
+             * of the hostspec list.
+             */
+            gf_msg_debug(GF_MNT, 0, "getaddrinfo: %s\n",
+                         gai_strerror(gai_ret));
+        } else if (resolved->ai_addr == NULL) {
+            gf_msg(GF_MNT, GF_LOG_ERROR, EINVAL, NFS_MSG_INVALID_ENTRY,
+                   "Invalid structure");
+            finished = _gf_true;
+        } else if (mnt3_match_subnet_v4(resolved, client_addr->sin_addr.s_addr,
+                                        spec->netmask)) {
+            /* The network addresses of both IPv4 sockets match. */
+            verdict = 0;
+            finished = _gf_true;
+        }
+
+        /* Release what getaddrinfo() allocated for this entry. */
+        if (resolved != NULL) {
+            freeaddrinfo(resolved);
+            resolved = NULL;
+        }
+
+        if (finished)
+            break;
+    }
+
+    return verdict;
+}
+
+/* Start the resolution of a sub-directory mount.
+ *
+ * @req:        mount request being served.
+ * @ms:         mount state.
+ * @exp:        export the request was matched against.
+ * @subdir:     path below the volume root that has to be resolved.
+ * @send_reply: when false only the permission check is done and nothing is
+ *              resolved or replied (WebNFS permissions checking).
+ *
+ * When @exp carries a hostspec the client address is verified against it
+ * first. After that a resolve state is allocated and the component by
+ * component lookup is started; the mount reply is sent from the lookup
+ * callbacks.
+ *
+ * Returns 0 on success, -EFAULT on a NULL argument, the result of
+ * mnt3_verify_auth() when the client is rejected, -ENOMEM when the resolve
+ * state cannot be allocated, or the negative result of
+ * __mnt3_resolve_subdir().
+ */
+int
+mnt3_resolve_subdir(rpcsvc_request_t *req, struct mount3_state *ms,
+                    struct mnt3_export *exp, char *subdir,
+                    gf_boolean_t send_reply)
+{
+    mnt3_resolve_t *mres = NULL;
+    int ret = -EFAULT;
+    struct nfs3_fh pfh = GF_NFS3FH_STATIC_INITIALIZER;
+    struct sockaddr_in *sin = NULL;
+
+    if ((!req) || (!ms) || (!exp) || (!subdir))
+        return ret;
+
+    /* The peer address is handed to mnt3_verify_auth() as an IPv4 address;
+     * that function rejects any other address family.
+     */
+    sin = (struct sockaddr_in *)(&(req->trans->peerinfo.sockaddr));
+
+    /* Need to check AUTH */
+    if (NULL != exp->hostspec) {
+        ret = mnt3_verify_auth(sin, exp);
+        if (0 != ret) {
+            gf_msg(GF_MNT, GF_LOG_ERROR, EACCES, NFS_MSG_AUTH_VERIFY_FAILED,
+                   "AUTH verification failed");
+            return ret;
+        }
+    }
+
+    /* no reply is needed (WebNFS permissions checking), just return */
+    if (!send_reply)
+        return 0; /* no error, mnt3_verify_auth() allowed it */
+
+    mres = GF_CALLOC(1, sizeof(mnt3_resolve_t), gf_nfs_mt_mnt3_resolve);
+    if (!mres) {
+        gf_msg(GF_MNT, GF_LOG_ERROR, ENOMEM, NFS_MSG_NO_MEMORY,
+               "Memory allocation failed");
+        /* ret may be 0 here after a successful auth check; without an
+         * explicit error the caller would treat the mount as started.
+         */
+        ret = -ENOMEM;
+        goto err;
+    }
+
+    mres->exp = exp;
+    mres->mstate = ms;
+    mres->req = req;
+
+    snprintf(mres->remainingdir, MNTPATHLEN, "%s", subdir);
+    gf_path_strip_trailing_slashes(mres->remainingdir);
+
+    /* The root file handle of the volume is the parent handle of the first
+     * component.
+     */
+    if (gf_nfs_dvm_off(nfs_state(ms->nfsx)))
+        pfh = nfs3_fh_build_indexed_root_fh(mres->mstate->nfsx->children,
+                                            mres->exp->vol);
+    else
+        pfh = nfs3_fh_build_uuid_root_fh(exp->volumeid, exp->mountid);
+
+    mres->parentfh = pfh;
+    ret = __mnt3_resolve_subdir(mres);
+    if (ret < 0) {
+        gf_msg(GF_MNT, GF_LOG_ERROR, ret, NFS_MSG_RESOLVE_SUBDIR_FAIL,
+               "Failed to resolve export dir: %s", mres->exp->expname);
+        /* resolveloc may already have been filled before the failure, so
+         * wipe it together with the state instead of a bare GF_FREE().
+         */
+        mnt3_resolve_state_wipe(mres);
+    }
+
+err:
+    return ret;
+}
+
+/* Resolve the directory named by a directory export: the volume name is
+ * stripped from exp->expname and the remaining path is resolved with a reply
+ * to the client.
+ *
+ * Returns -EFAULT on a NULL argument, otherwise the result of
+ * mnt3_resolve_subdir().
+ */
+int
+mnt3_resolve_export_subdir(rpcsvc_request_t *req, struct mount3_state *ms,
+                           struct mnt3_export *exp)
+{
+    char *subdir = NULL;
+    int ret = -EFAULT;
+
+    if (!req || !ms || !exp)
+        return ret;
+
+    subdir = mnt3_get_volume_subdir(exp->expname, NULL);
+
+    ret = mnt3_resolve_subdir(req, ms, exp, subdir, _gf_true);
+    if (ret < 0)
+        gf_msg(GF_MNT, GF_LOG_ERROR, ret, NFS_MSG_RESOLVE_SUBDIR_FAIL,
+               "Failed to resolve export dir: %s", exp->expname);
+
+    return ret;
+}
+
+/* Dispatch a mount request according to the type of the matched export:
+ * whole volume or directory inside a volume.
+ *
+ * Returns -EFAULT on a NULL argument or an export type that is neither of
+ * the two, otherwise the result of the selected handler.
+ */
+int
+mnt3svc_mount(rpcsvc_request_t *req, struct mount3_state *ms,
+              struct mnt3_export *exp)
+{
+    if (!req || !ms || !exp)
+        return -EFAULT;
+
+    if (exp->exptype == MNT3_EXPTYPE_VOLUME)
+        return mnt3svc_volume_mount(req, ms, exp);
+
+    if (exp->exptype == MNT3_EXPTYPE_DIR)
+        return mnt3_resolve_export_subdir(req, ms, exp);
+
+    return -EFAULT;
+}
+
+/* Find the export whose name matches @dirpath.
+ *
+ * The parameter 'export_parsing_match' indicates whether this function
+ * is being called by an exports parser or whether it is being called
+ * during mount. The behavior is different since we don't have to resolve
+ * the path when doing the parse. The flag is handed unchanged to
+ * mnt3_match_dirpath_export().
+ *
+ * The export list is walked with ms->mountlock held.
+ *
+ * Returns the first matching export, or NULL when an argument is NULL or
+ * nothing matches.
+ */
+struct mnt3_export *
+mnt3_mntpath_to_export(struct mount3_state *ms, const char *dirpath,
+                       gf_boolean_t export_parsing_match)
+{
+    struct mnt3_export *cur = NULL;
+    struct mnt3_export *match = NULL;
+
+    if (!ms || !dirpath)
+        return NULL;
+
+    LOCK(&ms->mountlock);
+    list_for_each_entry(cur, &ms->exportlist, explist)
+    {
+        if (!mnt3_match_dirpath_export(cur->expname, dirpath,
+                                       export_parsing_match))
+            continue;
+
+        match = cur;
+        gf_msg_debug(GF_MNT, 0,
+                     "Found export volume: "
+                     "%s",
+                     cur->vol->name);
+        break;
+    }
+
+    if (match == NULL)
+        gf_msg_debug(GF_MNT, 0, "Export not found");
+    UNLOCK(&ms->mountlock);
+
+    return match;
+}
+
+/* Apply the address based and the privileged port based checks of the rpc
+ * service to a client that wants to mount @expvol.
+ *
+ * Returns RPCSVC_AUTH_ACCEPT when both checks pass, RPCSVC_AUTH_REJECT when
+ * an argument is NULL or either check rejects the client.
+ */
+static int
+mnt3_check_client_net_check(rpcsvc_t *svc, char *expvol, char *ipaddr,
+                            uint16_t port)
+{
+    if (!svc || !expvol || !ipaddr)
+        return RPCSVC_AUTH_REJECT;
+
+    if (rpcsvc_auth_check(svc, expvol, ipaddr) == RPCSVC_AUTH_REJECT) {
+        gf_msg(GF_MNT, GF_LOG_INFO, 0, NFS_MSG_PEER_NOT_ALLOWED,
+               "Peer %s  not allowed", ipaddr);
+        return RPCSVC_AUTH_REJECT;
+    }
+
+    if (rpcsvc_transport_privport_check(svc, expvol, port) ==
+        RPCSVC_AUTH_REJECT) {
+        gf_msg(GF_MNT, GF_LOG_INFO, errno, NFS_MSG_PEER_NOT_ALLOWED,
+               "Peer %s rejected. Unprivileged "
+               "port %d not allowed",
+               ipaddr, port);
+        return RPCSVC_AUTH_REJECT;
+    }
+
+    return RPCSVC_AUTH_ACCEPT;
+}
+
+/* Network level access check for a mount request received over the rpcsvc
+ * transport: the peer address and port are taken from the transport and
+ * handed to mnt3_check_client_net_check().
+ *
+ * Returns RPCSVC_AUTH_REJECT on a NULL argument, when the service or the
+ * transport of the request is missing or when the peer address cannot be
+ * obtained; otherwise the result of mnt3_check_client_net_check().
+ */
+int
+mnt3_check_client_net_tcp(rpcsvc_request_t *req, char *volname)
+{
+    union gf_sock_union sock_union;
+    char peer[RPCSVC_PEER_STRLEN] = {0};
+    char *ipaddr = NULL;
+    rpcsvc_t *svc = NULL;
+    rpc_transport_t *trans = NULL;
+    socklen_t socksize = sizeof(struct sockaddr_in);
+    int ret = RPCSVC_AUTH_REJECT;
+
+    if (!req || !volname)
+        return RPCSVC_AUTH_REJECT;
+
+    svc = rpcsvc_request_service(req);
+    trans = rpcsvc_request_transport(req);
+    if (!svc || !trans)
+        return RPCSVC_AUTH_REJECT;
+
+    ret = rpcsvc_transport_peeraddr(trans, peer, RPCSVC_PEER_STRLEN,
+                                    &sock_union.storage, socksize);
+    if (ret != 0) {
+        gf_msg(GF_MNT, GF_LOG_WARNING, ENOENT, NFS_MSG_GET_PEER_ADDR_FAIL,
+               "Failed to get peer "
+               "addr: %s",
+               gai_strerror(ret));
+        return RPCSVC_AUTH_REJECT;
+    }
+
+    /* peer[] is filled in IP:PORT format, strip the port; fall back to the
+     * whole string when that fails.
+     */
+    if (!get_host_name((char *)peer, &ipaddr))
+        ipaddr = peer;
+
+    return mnt3_check_client_net_check(svc, volname, ipaddr,
+                                       ntohs(sock_union.sin.sin_port));
+}
+
+/* Network level access check for a mount request received through the
+ * SunRPC/TIRPC UDP service: the caller address is taken from the transport
+ * of @req, the rpc service from the private state of @nfsx, and both are
+ * handed to mnt3_check_client_net_check().
+ *
+ * Returns RPCSVC_AUTH_REJECT on a NULL argument or when the caller address
+ * is not available; otherwise the result of mnt3_check_client_net_check().
+ */
+static int
+mnt3_check_client_net_udp(struct svc_req *req, char *volname, xlator_t *nfsx)
+{
+    char ipaddr[INET_ADDRSTRLEN + 1] = {0};
+    struct sockaddr_in *caller = NULL;
+    struct nfs_state *nfs = NULL;
+    rpcsvc_t *svc = NULL;
+
+    if (!req || !volname || !nfsx)
+        return RPCSVC_AUTH_REJECT;
+
+#if !defined(_TIRPC_SVC_H)
+    caller = svc_getcaller(req->rq_xprt);
+#else
+    /* TIRPC's svc_getcaller() returns a pointer to a sockaddr_in6, even
+     * though it might actually be an IPv4 address. It ought return a
+     * struct sockaddr and make the caller upcast it to the proper
+     * address family. Sigh.
+     */
+    caller = (struct sockaddr_in *)svc_getcaller(req->rq_xprt);
+#endif
+    if (caller == NULL)
+        return RPCSVC_AUTH_REJECT;
+
+    /* And let's make sure that it's actually an IPv4 address. */
+    GF_ASSERT(caller->sin_family == AF_INET);
+
+    (void)inet_ntop(AF_INET, &caller->sin_addr, ipaddr, INET_ADDRSTRLEN);
+
+    /* Without private nfs state svc stays NULL and the check rejects. */
+    nfs = (struct nfs_state *)nfsx->private;
+    if (nfs != NULL)
+        svc = nfs->rpcsvc;
+
+    return mnt3_check_client_net_check(svc, volname, ipaddr,
+                                       ntohs(caller->sin_port));
+}
+
+/* Match a mount path against the exports and start resolving its
+ * sub-directory.
+ *
+ * The path is first matched as a whole; when that fails, the export of the
+ * volume named by its first component is used as a fallback. The volume
+ * must be started and the client must pass the network checks before the
+ * sub-directory is resolved.
+ *
+ * @send_reply is handed to mnt3_resolve_subdir().
+ *
+ * Returns -1 when @ms or @path is NULL, -ENOENT when no export matches, the
+ * nfs state is missing or the volume is not started, -EACCES when the client
+ * is rejected, otherwise the result of mnt3_resolve_subdir().
+ */
+int
+mnt3_parse_dir_exports(rpcsvc_request_t *req, struct mount3_state *ms,
+                       char *path, gf_boolean_t send_reply)
+{
+    char volname[1024] = {0};
+    char *volname_ptr = volname;
+    char *subdir = NULL;
+    struct mnt3_export *exp = NULL;
+    struct nfs_state *nfs = NULL;
+    int ret = -ENOENT;
+
+    if (!ms || !path)
+        return -1;
+
+    /* Fills volname with the first component and returns what follows. */
+    subdir = mnt3_get_volume_subdir(path, &volname_ptr);
+
+    /* first try to match the full export/subdir */
+    exp = mnt3_mntpath_to_export(ms, path, _gf_false);
+    if (exp == NULL) {
+        gf_msg_trace(GF_MNT, 0,
+                     "Could not find exact matching export "
+                     "for path=%s",
+                     path);
+        /* if no exact match is found, look for a fallback */
+        exp = mnt3_mntpath_to_export(ms, volname, _gf_true);
+        if (exp == NULL) {
+            gf_msg_trace(GF_MNT, 0,
+                         "Could not find export for "
+                         "volume %s",
+                         volname);
+            return ret;
+        }
+    }
+    gf_msg_trace(GF_MNT, 0,
+                 "volume %s and export %s will be used for "
+                 "path %s",
+                 exp->vol->name, exp->expname, path);
+
+    nfs = (struct nfs_state *)ms->nfsx->private;
+    if (nfs == NULL)
+        return ret;
+
+    if (!nfs_subvolume_started(nfs, exp->vol)) {
+        gf_msg_debug(GF_MNT, 0, "Volume %s not started", exp->vol->name);
+        return ret;
+    }
+
+    if (mnt3_check_client_net_tcp(req, exp->vol->name) == RPCSVC_AUTH_REJECT) {
+        gf_msg_debug(GF_MNT, 0, "Client mount not allowed");
+        return -EACCES;
+    }
+
+    ret = mnt3_resolve_subdir(req, ms, exp, subdir, send_reply);
+    if (ret < 0)
+        gf_msg(GF_MNT, GF_LOG_ERROR, ret, NFS_MSG_RESOLVE_SUBDIR_FAIL,
+               "Failed to resolve export dir: %s", subdir);
+
+    return ret;
+}
+
+/* Find the export that serves @path for the given request.
+ *
+ * An export matching the whole path is stored in *@e and 0 is returned.
+ * Otherwise, when directory exports are enabled, the path is handed to
+ * mnt3_parse_dir_exports() with a reply requested, and its result is
+ * returned; *@e is left untouched in that case.
+ *
+ * Returns -1 on a NULL argument or when directory exports are disabled,
+ * -EFAULT when the request carries no mount state (the request is then
+ * marked with SYSTEM_ERR).
+ */
+int
+mnt3_find_export(rpcsvc_request_t *req, char *path, struct mnt3_export **e)
+{
+    struct mount3_state *ms = NULL;
+    struct mnt3_export *exp = NULL;
+
+    if (!req || !path || !e)
+        return -1;
+
+    ms = (struct mount3_state *)rpcsvc_request_program_private(req);
+    if (ms == NULL) {
+        gf_msg(GF_MNT, GF_LOG_ERROR, EINVAL, NFS_MSG_MNT_STATE_NOT_FOUND,
+               "Mount state not present");
+        rpcsvc_request_seterr(req, SYSTEM_ERR);
+        return -EFAULT;
+    }
+
+    gf_msg_debug(GF_MNT, 0, "dirpath: %s", path);
+    exp = mnt3_mntpath_to_export(ms, path, _gf_false);
+    if (exp != NULL) {
+        *e = exp;
+        return 0;
+    }
+
+    if (!gf_mnt3_export_dirs(ms))
+        return -1;
+
+    return mnt3_parse_dir_exports(req, ms, path, _gf_true);
+}
+
+/**
+ * _mnt3_get_peer_addr -- Return a heap copy of the peer address of an RPC
+ *                        request. A peer address is host:port.
+ *
+ * @req: The rpc svc request object whose transport is queried
+ *
+ * @return: success: Newly allocated string holding the peer address
+ *          failure: NULL (NULL request, address lookup or allocation failed)
+ */
+char *
+_mnt3_get_peer_addr(const rpcsvc_request_t *req)
+{
+    char addrbuf[RPCSVC_PEER_STRLEN] = {
+        0,
+    };
+    struct sockaddr_storage ss = {
+        0,
+    };
+    rpc_transport_t *xprt = NULL;
+    char *result = NULL;
+
+    GF_VALIDATE_OR_GOTO(GF_NFS, req, out);
+
+    xprt = rpcsvc_request_transport(req);
+    if (rpcsvc_transport_peeraddr(xprt, addrbuf, RPCSVC_PEER_STRLEN, &ss,
+                                  sizeof(ss)) == 0)
+        result = gf_strdup(addrbuf);
+
+out:
+    return result;
+}
+
+/**
+ * _mnt3_get_host_from_peer -- Take a peer address and get an allocated
+ *                             hostname. The hostname is the string on the
+ *                             left side of the LAST colon in the address.
+ *
+ * @peer_addr: The peer address (host:port) to get a hostname from
+ *
+ * @return: success: Allocated string containing the hostname
+ *          failure: NULL (NULL input, no colon present, host part not
+ *                   shorter than RPCSVC_PEER_STRLEN, or allocation failure)
+ *
+ */
+char *
+_mnt3_get_host_from_peer(const char *peer_addr)
+{
+    char *part = NULL;
+    size_t host_len = 0;
+    char *colon = NULL;
+
+    /* strrchr() on a NULL pointer is undefined; reject it up front. */
+    GF_VALIDATE_OR_GOTO(GF_MNT, peer_addr, out);
+
+    colon = strrchr(peer_addr, ':');
+    if (!colon) {
+        gf_msg(GF_MNT, GF_LOG_ERROR, 0, NFS_MSG_BAD_PEER, "Bad peer %s",
+               peer_addr);
+        goto out;
+    }
+
+    /* Everything before the last colon is treated as the host. */
+    host_len = colon - peer_addr;
+    if (host_len < RPCSVC_PEER_STRLEN)
+        part = gf_strndup(peer_addr, host_len);
+    else
+        gf_msg(GF_MNT, GF_LOG_ERROR, 0, NFS_MSG_PEER_TOO_LONG,
+               "Peer too long %s", peer_addr);
+out:
+    return part;
+}
+
+/**
+ * mnt3_check_cached_fh -- Check if FH is cached for a host.
+ *
+ * Write operations are answered by is_nfs_fh_cached_and_writeable(),
+ * everything else by is_nfs_fh_cached(); both look in ms->authcache.
+ *
+ * @return: the result of the selected cache lookup
+ */
+int
+mnt3_check_cached_fh(struct mount3_state *ms, struct nfs3_fh *fh,
+                     const char *host_addr, gf_boolean_t is_write_op)
+{
+    if (is_write_op)
+        return is_nfs_fh_cached_and_writeable(ms->authcache, fh, host_addr);
+
+    return is_nfs_fh_cached(ms->authcache, fh, host_addr);
+}
+
+/**
+ * _mnt3_authenticate_req -- Given an RPC request and a path OR a filehandle
+ *                           check if the host is authorized to make the
+ *                           request. Uses exports/netgroups auth model to
+ *                           do this check.
+ *
+ * Order of checks: a file handle already cached for the peer IP succeeds
+ * immediately; otherwise the peer IP is checked with mnt3_auth_host() and,
+ * only if that is rejected, the reverse-DNS name of the peer is checked.
+ *
+ * @ms  : The mount state
+ * @req : The RPC request
+ * @fh  : The NFS FH to authenticate (set when authenticating an FOP)
+ * @path: The path to authenticate (set when authenticating a mount req).
+ *        A single trailing '/' is stripped from a private copy before use.
+ * @authorized_export: Allocate and fill this value when an export is authorized
+ *                     (only written when @fh is NULL and the check succeeded)
+ * @authorized_host: Allocate and fill this value when a host is authorized
+ *                   (same conditions as @authorized_export)
+ * @is_write_op: Is this a write op that we are authenticating?
+ *
+ * If either output pointer is NULL neither is written; in that case an
+ * authorized @fh is added to ms->authcache for the peer IP instead.
+ *
+ * @return: 0 if authorized
+ *          -EACCES for completely unauthorized fop (also returned when @ms
+ *                  or @req is NULL or the peer address cannot be obtained)
+ *          -EROFS  for unauthorized write operations (rm, mkdir, write)
+ */
+int
+_mnt3_authenticate_req(struct mount3_state *ms, rpcsvc_request_t *req,
+                       struct nfs3_fh *fh, const char *path,
+                       char **authorized_export, char **authorized_host,
+                       gf_boolean_t is_write_op)
+{
+    char *peer_addr = NULL;
+    char *host_addr_ip = NULL;
+    char *host_addr_fqdn = NULL;
+    int auth_status_code = -EACCES;
+    char *pathdup = NULL;
+    size_t dlen = 0;
+    char *auth_host = NULL;
+    gf_boolean_t fh_cached = _gf_false;
+    struct export_item *expitem = NULL;
+
+    GF_VALIDATE_OR_GOTO(GF_MNT, ms, out);
+    GF_VALIDATE_OR_GOTO(GF_MNT, req, out);
+
+    peer_addr = _mnt3_get_peer_addr(req);
+
+    if (!peer_addr)
+        goto free_and_out;
+
+    host_addr_ip = _mnt3_get_host_from_peer(peer_addr);
+
+    if (!host_addr_ip)
+        goto free_and_out;
+
+    if (path) {
+        /* Need to strip out trailing '/'. The copy lives on the stack
+         * (strdupa), so it must not be freed and does not outlive this call.
+         */
+        pathdup = strdupa(path);
+        dlen = strlen(pathdup);
+        if (dlen > 0 && pathdup[dlen - 1] == '/')
+            pathdup[dlen - 1] = '\0';
+    }
+
+    /* Check if the filehandle is cached; a hit authorizes the request
+     * without consulting the exports/netgroups rules again.
+     */
+    fh_cached = mnt3_check_cached_fh(ms, fh, host_addr_ip, is_write_op);
+    if (fh_cached) {
+        gf_msg_trace(GF_MNT, 0, "Found cached FH for %s", host_addr_ip);
+        auth_status_code = 0;
+        goto free_and_out;
+    }
+
+    /* Check if the IP is authorized */
+    auth_status_code = mnt3_auth_host(ms->auth_params, host_addr_ip, fh,
+                                      pathdup, is_write_op, &expitem);
+
+    gf_msg_debug(GF_MNT, 0, "access from IP %s is %s", host_addr_ip,
+                 auth_status_code ? "denied" : "allowed");
+
+    if (auth_status_code != 0) {
+        /* If not, check if the FQDN is authorized.
+         * NOTE(review): the lookup result is passed on unchecked; confirm
+         * mnt3_auth_host() copes with a NULL host when the lookup fails.
+         */
+        host_addr_fqdn = gf_rev_dns_lookup(host_addr_ip);
+        auth_status_code = mnt3_auth_host(ms->auth_params, host_addr_fqdn, fh,
+                                          pathdup, is_write_op, &expitem);
+
+        gf_msg_debug(GF_MNT, 0, "access from FQDN %s is %s", host_addr_fqdn,
+                     auth_status_code ? "denied" : "allowed");
+
+        if (auth_status_code == 0)
+            auth_host = host_addr_fqdn;
+    } else
+        auth_host = host_addr_ip;
+
+    /* Skip the lines that set authorized export &
+     * host if they are null. This is the path taken when the caller does
+     * not want the outputs filled in.
+     */
+    if (!authorized_export || !authorized_host) {
+        /* Cache the file handle if it was authorized; the cache entry is
+         * always keyed on the peer IP, even when the FQDN matched.
+         */
+        if (fh && auth_status_code == 0)
+            cache_nfs_fh(ms->authcache, fh, host_addr_ip, expitem);
+
+        goto free_and_out;
+    }
+
+    if (!fh && auth_status_code == 0) {
+        *authorized_export = gf_strdup(pathdup);
+        if (!*authorized_export)
+            gf_msg(GF_MNT, GF_LOG_CRITICAL, ENOMEM, NFS_MSG_NO_MEMORY,
+                   "Allocation error when copying "
+                   "authorized path");
+
+        *authorized_host = gf_strdup(auth_host);
+        if (!*authorized_host)
+            gf_msg(GF_MNT, GF_LOG_CRITICAL, ENOMEM, NFS_MSG_NO_MEMORY,
+                   "Allocation error when copying "
+                   "authorized host");
+    }
+
+free_and_out:
+    /* Free allocated strings after doing the auth; any of them may still
+     * be NULL here depending on how far the function got.
+     */
+    GF_FREE(peer_addr);
+    GF_FREE(host_addr_fqdn);
+    GF_FREE(host_addr_ip);
+out:
+    return auth_status_code;
+}
+
+/**
+ * mnt3_authenticate_request -- Given an RPC request and a path, check if the
+ *                              host is authorized to make the request. This
+ *                              function calls _mnt3_authenticate_req ()
+ *                              in a loop for the parent of each path while
+ *                              the authentication check for that path is
+ *                              failing.
+ *
+ * E.g. If the requested path is /patchy/L1, and /patchy is authorized, but
+ * /patchy/L1 is not, it follows this code path :
+ *
+ * _mnt3_authenticate_req ("/patchy/L1") -> F
+ * _mnt3_authenticate_req ("/patchy");   -> T
+ * return T;
+ *
+ * When ms->nfs->exports_auth is off every request is allowed and only
+ * @authorized_path (not @authorized_host) is populated.
+ *
+ * @ms  : The mount state
+ * @req : The RPC request
+ * @fh  : The NFS FH to authenticate; when set, exactly one check is made
+ *        and no parent paths are tried
+ * @volname: Not used by this function
+ * @path: The requested path
+ * @authorized_path: This gets allocated and populated with the authorized path
+ * @authorized_host: This gets allocated and populated with the authorized host
+ * @is_write_op: Is this a write op that we are authenticating?
+ * @return: 0 if authorized
+ *          -EACCES for completely unauthorized fop
+ *          -EROFS  for unauthorized write operations (rm, mkdir, write)
+ */
+int
+mnt3_authenticate_request(struct mount3_state *ms, rpcsvc_request_t *req,
+                          struct nfs3_fh *fh, const char *volname,
+                          const char *path, char **authorized_path,
+                          char **authorized_host, gf_boolean_t is_write_op)
+{
+    int auth_status_code = -EACCES;
+    char *parent_path = NULL;
+    char *parent_prev = NULL; /* heap copy backing parent_old, if any */
+    const char *parent_old = NULL;
+
+    GF_VALIDATE_OR_GOTO(GF_MNT, ms, out);
+    GF_VALIDATE_OR_GOTO(GF_MNT, req, out);
+
+    /* If this option is not set, just allow it through */
+    if (!ms->nfs->exports_auth) {
+        /* This function is called in a variety of use-cases (mount
+         * + each fop) so path/authorized_path are not always present.
+         * For the cases which it _is_ present we need to populate the
+         * authorized_path. */
+        if (path && authorized_path)
+            *authorized_path = gf_strdup(path);
+
+        auth_status_code = 0;
+        goto out;
+    }
+
+    /* First check if the path is allowed */
+    auth_status_code = _mnt3_authenticate_req(
+        ms, req, fh, path, authorized_path, authorized_host, is_write_op);
+
+    /* If the filehandle is set, just exit since we have to make only
+     * one call to the function above
+     */
+    if (fh)
+        goto out;
+
+    parent_old = path;
+    while (auth_status_code != 0) {
+        /* Get the path's parent */
+        parent_path = gf_resolve_path_parent(parent_old);
+
+        /* The previous parent was only needed as input to the lookup
+         * above. It is kept on the heap and released here rather than
+         * copied with strdupa(), so a deeply nested client-supplied path
+         * cannot pile up one stack allocation per component.
+         */
+        GF_FREE(parent_prev);
+        parent_prev = NULL;
+
+        if (!parent_path) /* Nothing left in the path to resolve */
+            goto out;
+
+        /* Authenticate it */
+        auth_status_code = _mnt3_authenticate_req(ms, req, fh, parent_path,
+                                                  authorized_path,
+                                                  authorized_host, is_write_op);
+
+        /* Keep the parent alive until the next iteration has used it */
+        parent_prev = parent_path;
+        parent_old = parent_path;
+    }
+
+out:
+    /* Releases the last parent when the loop ended on success; a no-op
+     * (NULL) on every other path into this label.
+     */
+    GF_FREE(parent_prev);
+    return auth_status_code;
+}
+
+/**
+ * mnt3svc_mnt -- RPC actor for the MOUNT3 MNT procedure.
+ *
+ * Decodes the requested mount path, locates the matching export, runs the
+ * volume-started, rpc-auth and exports/netgroups checks and finally hands
+ * the request over to mnt3svc_mount().
+ *
+ * @req: The RPC request carrying the encoded mount path
+ *
+ * @return: -1 when @req is NULL, the arguments cannot be decoded or the
+ *             mount state is missing
+ *           0 when a mount error reply was sent, or when
+ *             mnt3_find_export() already took care of a subdir mount
+ *          otherwise the non-negative result of mnt3svc_mount()
+ */
+int
+mnt3svc_mnt(rpcsvc_request_t *req)
+{
+    char path[MNTPATHLEN];
+    struct iovec pvec = {
+        0,
+    };
+    struct mount3_state *ms = NULL;
+    struct nfs_state *nfs = NULL;
+    struct mnt3_export *exp = NULL;
+    mountstat3 mntstat = MNT3ERR_SERVERFAULT;
+    int ret = -1;
+
+    if (req == NULL)
+        return -1;
+
+    pvec.iov_base = path;
+    pvec.iov_len = MNTPATHLEN;
+    if (xdr_to_mountpath(pvec, req->msg[0]) == -1) {
+        gf_msg(GF_MNT, GF_LOG_ERROR, 0, NFS_MSG_ARGS_DECODE_ERROR,
+               "Failed to decode args");
+        rpcsvc_request_seterr(req, GARBAGE_ARGS);
+        return -1;
+    }
+
+    ms = (struct mount3_state *)rpcsvc_request_program_private(req);
+    if (ms == NULL) {
+        gf_msg(GF_MNT, GF_LOG_ERROR, EINVAL, NFS_MSG_MNT_STATE_NOT_FOUND,
+               "Mount state not present");
+        rpcsvc_request_seterr(req, SYSTEM_ERR);
+        return -1;
+    }
+
+    nfs = (struct nfs_state *)ms->nfsx->private;
+    gf_msg_debug(GF_MNT, 0, "dirpath: %s", path);
+
+    ret = mnt3_find_export(req, path, &exp);
+    if (ret < 0) {
+        mntstat = mnt3svc_errno_to_mnterr(-ret);
+        goto mnterr;
+    }
+
+    if (exp == NULL) {
+        /*
+         * SPECIAL CASE: exp stays NULL when "path" is a subdir in the
+         * call to mnt3_find_export().
+         *
+         * For a subdir mount everything is already done:
+         * nfs_subvolume_started() and mnt3_check_client_net_tcp()
+         * validation happen in mnt3_parse_dir_exports(), which is
+         * invoked through mnt3_find_export().
+         *
+         * TODO: All mounts should happen through mnt3svc_mount().
+         *       It needs more clean up.
+         */
+        return 0;
+    }
+
+    if (!nfs_subvolume_started(nfs, exp->vol)) {
+        gf_msg_debug(GF_MNT, 0, "Volume %s not started", exp->vol->name);
+        mntstat = MNT3ERR_NOENT;
+        goto mnterr;
+    }
+
+    /* First authentication check: nfs.rpc-auth-allow/reject. */
+    if (mnt3_check_client_net_tcp(req, exp->vol->name) == RPCSVC_AUTH_REJECT) {
+        mntstat = MNT3ERR_ACCES;
+        gf_msg_debug(GF_MNT, 0, "Client mount not allowed");
+        goto mnterr;
+    }
+
+    /* Second authentication check: exports/netgroups. */
+    if (mnt3_authenticate_request(ms, req, NULL, NULL, path, NULL, NULL,
+                                  _gf_false) != 0) {
+        mntstat = MNT3ERR_ACCES;
+        gf_msg_debug(GF_MNT, 0, "Client mount not allowed");
+        goto mnterr;
+    }
+
+    ret = mnt3svc_mount(req, ms, exp);
+    if (ret >= 0)
+        return ret;
+
+    mntstat = mnt3svc_errno_to_mnterr(-ret);
+
+mnterr:
+    /* Every failure past argument decoding is reported to the client as a
+     * MOUNT3 status, so the actor itself reports success to the RPC layer.
+     */
+    mnt3svc_mnt_error_reply(req, mntstat);
+    return 0;
+}
+
+/**
+ * mnt3svc_null -- RPC actor for the MOUNT3 NULL procedure.
+ *
+ * Submits an empty reply for the request. A NULL request is only logged.
+ *
+ * @return: always 0
+ */
+int
+mnt3svc_null(rpcsvc_request_t *req)
+{
+    struct iovec emptyvec = {
+        0,
+    };
+
+    if (req == NULL) {
+        gf_msg(GF_MNT, GF_LOG_ERROR, EINVAL, NFS_MSG_INVALID_ENTRY,
+               "Got NULL request!");
+    } else {
+        rpcsvc_submit_generic(req, &emptyvec, 1, NULL, 0, NULL);
+    }
+
+    return 0;
+}
+
+/**
+ * __build_mountlist -- Build a mountlist from the in-memory mount entries.
+ *
+ * The mount list is first refreshed from the rmtab, then the export name
+ * and host name of every entry in ms->mountlist are copied into a newly
+ * allocated chain of mountbody nodes. mnt3svc_build_mountlist() calls this
+ * with ms->mountlock held.
+ *
+ * @ms   : The mount state
+ * @count: Set to the number of entries copied. On an allocation failure it
+ *         holds the number of entries completed before the failure.
+ *
+ * @return: head of the new list, to be released with xdr_free_mountlist();
+ *          NULL when an argument is NULL, the mount list is empty or an
+ *          allocation failed
+ */
+mountlist
+__build_mountlist(struct mount3_state *ms, int *count)
+{
+    struct mountbody *mlist = NULL;
+    struct mountbody *prev = NULL;
+    struct mountbody *first = NULL;
+    size_t namelen = 0;
+    int ret = -1;
+    struct mountentry *me = NULL;
+
+    if ((!ms) || (!count))
+        return NULL;
+
+    /* read rmtab, other peers might have updated it */
+    mount_read_rmtab(ms);
+
+    *count = 0;
+    gf_msg_debug(GF_MNT, 0, "Building mount list:");
+    list_for_each_entry(me, &ms->mountlist, mlist)
+    {
+        namelen = strlen(me->exname);
+        mlist = GF_CALLOC(1, sizeof(*mlist), gf_nfs_mt_mountbody);
+        if (!mlist) {
+            gf_msg(GF_MNT, GF_LOG_ERROR, ENOMEM, NFS_MSG_NO_MEMORY,
+                   "Memory allocation failed");
+            goto free_list;
+        }
+
+        /* Link the node into the chain right away. If one of the string
+         * allocations below fails, the error path frees the chain starting
+         * at 'first'; a node that is not linked yet would be leaked.
+         */
+        if (prev)
+            prev->ml_next = mlist;
+        else
+            first = mlist;
+        prev = mlist;
+
+        mlist->ml_directory = GF_MALLOC(namelen + 2, gf_nfs_mt_char);
+        if (!mlist->ml_directory) {
+            gf_msg(GF_MNT, GF_LOG_ERROR, ENOMEM, NFS_MSG_NO_MEMORY,
+                   "Memory allocation failed");
+            goto free_list;
+        }
+
+        strcpy(mlist->ml_directory, me->exname);
+
+        namelen = strlen(me->hostname);
+        mlist->ml_hostname = GF_MALLOC(namelen + 2, gf_nfs_mt_char);
+        if (!mlist->ml_hostname) {
+            gf_msg(GF_MNT, GF_LOG_ERROR, ENOMEM, NFS_MSG_NO_MEMORY,
+                   "Memory allocation failed");
+            goto free_list;
+        }
+
+        strcpy(mlist->ml_hostname, me->hostname);
+
+        gf_msg_debug(GF_MNT, 0, "mount entry: dir: %s, host: %s",
+                     mlist->ml_directory, mlist->ml_hostname);
+
+        (*count)++;
+    }
+
+    ret = 0;
+
+free_list:
+    if (ret == -1) {
+        /* Releases every node built so far, including a partial one. */
+        xdr_free_mountlist(first);
+        first = NULL;
+    }
+
+    return first;
+}
+
+/**
+ * mnt3svc_build_mountlist -- Locked wrapper around __build_mountlist().
+ *
+ * Holds ms->mountlock while the list is being built.
+ *
+ * @return: whatever __build_mountlist() returns
+ */
+mountlist
+mnt3svc_build_mountlist(struct mount3_state *ms, int *count)
+{
+    struct mountbody *head = NULL;
+
+    LOCK(&ms->mountlock);
+    head = __build_mountlist(ms, count);
+    UNLOCK(&ms->mountlock);
+
+    return head;
+}
+
+/**
+ * mnt3svc_dump -- RPC actor for the MOUNT3 DUMP procedure.
+ *
+ * Replies with the current mount list. When the list is empty a bare
+ * mountstat3 of 0 is serialized instead of a list.
+ *
+ * @return: 0 once a reply was submitted
+ *          -1 when @req is NULL, the mount state is missing, or the list
+ *             could not be built although entries had been counted
+ */
+int
+mnt3svc_dump(rpcsvc_request_t *req)
+{
+    struct mount3_state *ms = NULL;
+    mnt3_serializer sfunc = NULL;
+    mountstat3 mstat = 0;
+    mountlist mlist;
+    void *arg = NULL;
+    int nentries = -1;
+
+    if (req == NULL)
+        return -1;
+
+    ms = (struct mount3_state *)rpcsvc_request_program_private(req);
+    if (ms == NULL) {
+        rpcsvc_request_seterr(req, SYSTEM_ERR);
+        return -1;
+    }
+
+    mlist = mnt3svc_build_mountlist(ms, &nentries);
+    if (mlist) {
+        arg = &mlist;
+        sfunc = (mnt3_serializer)xdr_serialize_mountlist;
+    } else if (nentries == 0) {
+        /* Nothing is mounted: answer with a plain status. */
+        arg = &mstat;
+        sfunc = (mnt3_serializer)xdr_serialize_mountstat3;
+    } else {
+        /* No list although the count is non-zero: building it failed. */
+        rpcsvc_request_seterr(req, SYSTEM_ERR);
+        return -1;
+    }
+
+    mnt3svc_submit_reply(req, arg, sfunc);
+
+    xdr_free_mountlist(mlist);
+    return 0;
+}
+
+/**
+ * mnt3svc_umount -- Drop one (export, host) pair from the mount list.
+ *
+ * When an rmtab could be opened it is locked, re-read before the search and
+ * rewritten after a successful removal.
+ *
+ * @ms      : The mount state
+ * @dirpath : Export name to match against the mount entries
+ * @hostname: Host name to match against the mount entries
+ *
+ * @return: 0 when the entry was removed or the mount list is empty
+ *          -1 on NULL arguments or when no entry matches
+ *          the non-zero result of gf_store_lock() when locking fails
+ */
+int
+mnt3svc_umount(struct mount3_state *ms, char *dirpath, char *hostname)
+{
+    struct mountentry *me = NULL;
+    struct mountentry *victim = NULL;
+    struct nfs_state *nfs = NULL;
+    gf_store_handle_t *sh = NULL;
+    gf_boolean_t update_rmtab = _gf_false;
+    int ret = -1;
+
+    if (ms == NULL || dirpath == NULL || hostname == NULL)
+        return -1;
+
+    nfs = (struct nfs_state *)ms->nfsx->private;
+
+    update_rmtab = mount_open_rmtab(nfs->rmtab, &sh);
+    if (update_rmtab) {
+        ret = gf_store_lock(sh);
+        if (ret != 0)
+            goto out_free;
+    }
+
+    LOCK(&ms->mountlock);
+    {
+        if (update_rmtab)
+            __mount_read_rmtab(sh, &ms->mountlist, _gf_false);
+
+        if (list_empty(&ms->mountlist)) {
+            ret = 0;
+            goto out_unlock;
+        }
+
+        /* The iterator is not reliable once the loop has run to its end,
+         * so the match is remembered in a separate pointer.
+         */
+        list_for_each_entry(me, &ms->mountlist, mlist)
+        {
+            if (strcmp(me->exname, dirpath) != 0)
+                continue;
+            if (strcmp(me->hostname, hostname) != 0)
+                continue;
+
+            victim = me;
+            break;
+        }
+
+        if (victim == NULL) { /* Not found in list. */
+            ret = -1;
+            gf_msg_trace(GF_MNT, 0, "Export not found");
+            goto out_unlock;
+        }
+        ret = 0;
+
+        gf_msg_debug(GF_MNT, 0, "Unmounting: dir %s, host: %s",
+                     victim->exname, victim->hostname);
+
+        /* NOTE(review): __mnt3svc_umountall() also removes each entry from
+         * the mount dict before freeing it; this path does not. Confirm
+         * that this difference is intended.
+         */
+        list_del(&victim->mlist);
+        GF_FREE(victim);
+
+        if (update_rmtab)
+            __mount_rewrite_rmtab(ms, sh);
+    }
+out_unlock:
+    UNLOCK(&ms->mountlock);
+
+    if (update_rmtab)
+        gf_store_unlock(sh);
+
+out_free:
+    if (update_rmtab)
+        gf_store_handle_destroy(sh);
+
+    return ret;
+}
+
+/**
+ * mnt3svc_umnt -- RPC actor for the MOUNT3 UMNT procedure.
+ *
+ * Decodes the directory path, determines the peer name of the caller
+ * (without the part after the last colon) and removes the matching entry
+ * via mnt3svc_umount(). A missing entry is reported to the client as
+ * MNT3ERR_NOENT.
+ *
+ * @return: -1 when @req is NULL, decoding fails or the mount state is
+ *             missing
+ *          the non-zero result of rpcsvc_transport_peername() on failure
+ *          otherwise the result of mnt3svc_umount(), with -1 mapped to 0
+ */
+int
+mnt3svc_umnt(rpcsvc_request_t *req)
+{
+    char dirpath[MNTPATHLEN];
+    char hostname[MNTPATHLEN];
+    struct iovec pvec = {
+        0,
+    };
+    struct mount3_state *ms = NULL;
+    mountstat3 mstat = MNT3_OK;
+    char *port_sep = NULL;
+    int ret = -1;
+
+    if (req == NULL)
+        return -1;
+
+    /* Remove the mount point from the exports list. */
+    pvec.iov_base = dirpath;
+    pvec.iov_len = MNTPATHLEN;
+    if (xdr_to_mountpath(pvec, req->msg[0]) == -1) {
+        gf_msg(GF_MNT, GF_LOG_ERROR, 0, NFS_MSG_ARGS_DECODE_ERROR,
+               "Failed decode args");
+        rpcsvc_request_seterr(req, GARBAGE_ARGS);
+        return -1;
+    }
+
+    ms = (struct mount3_state *)rpcsvc_request_program_private(req);
+    if (ms == NULL) {
+        gf_msg(GF_MNT, GF_LOG_ERROR, EINVAL, NFS_MSG_MNT_STATE_NOT_FOUND,
+               "Mount state not present");
+        rpcsvc_request_seterr(req, SYSTEM_ERR);
+        return -1;
+    }
+
+    ret = rpcsvc_transport_peername(req->trans, hostname, MNTPATHLEN);
+    if (ret != 0) {
+        gf_msg(GF_MNT, GF_LOG_ERROR, ENOENT, NFS_MSG_GET_REMOTE_NAME_FAIL,
+               "Failed to get remote name: %s", gai_strerror(ret));
+        return ret;
+    }
+
+    /* Keep only what precedes the last colon of the peer name. */
+    port_sep = strrchr(hostname, ':');
+    if (port_sep != NULL)
+        *port_sep = '\0';
+
+    gf_path_strip_trailing_slashes(dirpath);
+    gf_msg_debug(GF_MNT, 0, "dirpath: %s, hostname: %s", dirpath, hostname);
+
+    ret = mnt3svc_umount(ms, dirpath, hostname);
+    if (ret == -1) {
+        mstat = MNT3ERR_NOENT;
+        ret = 0;
+    }
+
+    /* FIXME: also take care of the corner case where the
+     * client was resolvable at mount but not at the umount - vice-versa.
+     */
+    mnt3svc_submit_reply(req, &mstat,
+                         (mnt3_serializer)xdr_serialize_mountstat3);
+
+    return ret;
+}
+
+/**
+ * __mnt3svc_umountall -- Remove and free every entry of the mount list.
+ *
+ * Each entry is taken off ms->mountlist, removed from the mount dict and
+ * freed. mnt3svc_umountall() calls this with ms->mountlock held.
+ *
+ * @return: 0 on success (including an empty list), -1 when @ms is NULL
+ */
+int
+__mnt3svc_umountall(struct mount3_state *ms)
+{
+    struct mountentry *entry = NULL;
+    struct mountentry *next = NULL;
+
+    if (ms == NULL)
+        return -1;
+
+    if (!list_empty(&ms->mountlist)) {
+        list_for_each_entry_safe(entry, next, &ms->mountlist, mlist)
+        {
+            list_del(&entry->mlist);       /* Remove from the mount list */
+            __mountdict_remove(ms, entry); /* Remove from the mount dict */
+            GF_FREE(entry);
+        }
+    }
+
+    return 0;
+}
+
+/**
+ * mnt3svc_umountall -- Locked wrapper around __mnt3svc_umountall().
+ *
+ * @return: -1 when @ms is NULL, otherwise the result of
+ *          __mnt3svc_umountall() run under ms->mountlock
+ */
+int
+mnt3svc_umountall(struct mount3_state *ms)
+{
+    int ret = -1;
+
+    if (ms != NULL) {
+        LOCK(&ms->mountlock);
+        ret = __mnt3svc_umountall(ms);
+        UNLOCK(&ms->mountlock);
+    }
+
+    return ret;
+}
+
+/**
+ * mnt3svc_umntall -- RPC actor for the MOUNT3 UMNTALL procedure.
+ *
+ * Clears the whole mount list and replies with MNT3_OK; the result of
+ * mnt3svc_umountall() itself is not examined.
+ *
+ * @return: RPCSVC_ACTOR_SUCCESS once the reply was submitted
+ *          RPCSVC_ACTOR_ERROR when @req is NULL or the mount state is
+ *          missing
+ */
+int
+mnt3svc_umntall(rpcsvc_request_t *req)
+{
+    struct mount3_state *ms = NULL;
+    mountstat3 mstat = MNT3_OK;
+
+    if (req == NULL)
+        return RPCSVC_ACTOR_ERROR;
+
+    ms = (struct mount3_state *)rpcsvc_request_program_private(req);
+    if (ms == NULL) {
+        gf_msg(GF_MNT, GF_LOG_ERROR, EINVAL, NFS_MSG_MNT_STATE_NOT_FOUND,
+               "Mount state not present");
+        rpcsvc_request_seterr(req, SYSTEM_ERR);
+        return RPCSVC_ACTOR_ERROR;
+    }
+
+    mnt3svc_umountall(ms);
+    mnt3svc_submit_reply(req, &mstat,
+                         (mnt3_serializer)xdr_serialize_mountstat3);
+
+    return RPCSVC_ACTOR_SUCCESS;
+}
+
+/**
+ * mnt3_xlchildren_to_exports -- Build the exports list sent in reply to a
+ *                               MOUNT3 EXPORT request.
+ *
+ * One exportnode is created for every entry of ms->exportlist whose volume
+ * has been started. Its groups are the comma-separated addresses returned
+ * by rpcsvc_volume_allowed() for the volume, or a single "No Access" group
+ * when nothing is returned. ms->mountlock is held while the list is built.
+ *
+ * @svc: The RPC service whose options are consulted for allowed clients
+ * @ms : The mount state
+ *
+ * @return: head of the new list, to be released with
+ *          xdr_free_exports_list(); NULL when an argument or the nfs state
+ *          is NULL, no started volume is exported or an allocation failed
+ */
+exports
+mnt3_xlchildren_to_exports(rpcsvc_t *svc, struct mount3_state *ms)
+{
+    struct exportnode *elist = NULL;
+    struct exportnode *prev = NULL;
+    struct exportnode *first = NULL;
+    size_t namelen = 0;
+    int ret = -1;
+    char *addrstr = NULL;
+    struct mnt3_export *ent = NULL;
+    struct nfs_state *nfs = NULL;
+
+    if ((!ms) || (!svc))
+        return NULL;
+
+    nfs = (struct nfs_state *)ms->nfsx->private;
+    if (!nfs)
+        return NULL;
+
+    LOCK(&ms->mountlock);
+    list_for_each_entry(ent, &ms->exportlist, explist)
+    {
+        /* If volume is not started yet, do not list it for tools like
+         * showmount.
+         */
+        if (!nfs_subvolume_started(nfs, ent->vol))
+            continue;
+
+        elist = GF_CALLOC(1, sizeof(*elist), gf_nfs_mt_exportnode);
+        if (!elist) {
+            gf_msg(GF_MNT, GF_LOG_ERROR, ENOMEM, NFS_MSG_NO_MEMORY,
+                   "Memory allocation failed");
+            goto free_list;
+        }
+
+        /* Link the node into the chain right away. Every failure below
+         * jumps to free_list, which frees the chain starting at 'first';
+         * a node that is not linked yet would be leaked.
+         */
+        if (prev)
+            prev->ex_next = elist;
+        else
+            first = elist;
+        prev = elist;
+
+        namelen = strlen(ent->expname);
+        elist->ex_dir = GF_MALLOC(namelen + 2, gf_nfs_mt_char);
+        if (!elist->ex_dir) {
+            gf_msg(GF_MNT, GF_LOG_ERROR, ENOMEM, NFS_MSG_NO_MEMORY,
+                   "Memory allocation failed");
+            goto free_list;
+        }
+        strcpy(elist->ex_dir, ent->expname);
+
+        addrstr = rpcsvc_volume_allowed(svc->options, ent->vol->name);
+        if (addrstr) {
+            /* create a groupnode per allowed client */
+            char *pos = NULL;
+            char *addr = NULL;
+            char *addrs = NULL;
+            struct groupnode *group = NULL;
+            struct groupnode *prev_group = NULL;
+
+            /* strtok_r() modifies the string, dup it */
+            addrs = gf_strdup(addrstr);
+            if (!addrs)
+                goto free_list;
+
+            while (1) {
+                /* only pass addrs on the 1st call */
+                addr = strtok_r(group ? NULL : addrs, ",", &pos);
+                if (addr == NULL)
+                    /* no more clients */
+                    break;
+
+                group = GF_CALLOC(1, sizeof(struct groupnode),
+                                  gf_nfs_mt_groupnode);
+                if (!group) {
+                    gf_msg(GF_MNT, GF_LOG_ERROR, ENOMEM, NFS_MSG_NO_MEMORY,
+                           "Memory "
+                           "allocation failed");
+                    GF_FREE(addrs);
+                    goto free_list;
+                }
+
+                group->gr_name = gf_strdup(addr);
+                if (!group->gr_name) {
+                    gf_msg(GF_MNT, GF_LOG_ERROR, ENOMEM, NFS_MSG_NO_MEMORY,
+                           "Memory "
+                           "allocation failed");
+                    /* not chained yet, so it has to be freed by hand */
+                    GF_FREE(group);
+                    GF_FREE(addrs);
+                    goto free_list;
+                }
+
+                /* chain the groups together */
+                if (!elist->ex_groups)
+                    elist->ex_groups = group;
+                else if (prev_group && !prev_group->gr_next)
+                    prev_group->gr_next = group;
+                prev_group = group;
+            }
+
+            GF_FREE(addrs);
+        } else {
+            elist->ex_groups = GF_CALLOC(1, sizeof(struct groupnode),
+                                         gf_nfs_mt_groupnode);
+            if (!elist->ex_groups) {
+                gf_msg(GF_MNT, GF_LOG_ERROR, ENOMEM, NFS_MSG_NO_MEMORY,
+                       "Memory allocation "
+                       "failed");
+                goto free_list;
+            }
+
+            addrstr = gf_strdup("No Access");
+            if (!addrstr)
+                goto free_list;
+
+            elist->ex_groups->gr_name = addrstr;
+        }
+    }
+
+    ret = 0;
+
+free_list:
+    UNLOCK(&ms->mountlock);
+    if (ret == -1) {
+        /* Releases every node built so far, including a partial one. */
+        xdr_free_exports_list(first);
+        first = NULL;
+    }
+
+    return first;
+}
+
+/**
+ * mnt3svc_export -- RPC actor for the MOUNT3 EXPORT procedure.
+ *
+ * Builds the exports list and sends it to the client.
+ *
+ * @return: 0 once the reply was submitted
+ *          -1 when @req is NULL or the mount state is missing
+ */
+int
+mnt3svc_export(rpcsvc_request_t *req)
+{
+    struct mount3_state *ms = NULL;
+    exports elist = NULL;
+
+    if (req == NULL)
+        return -1;
+
+    ms = (struct mount3_state *)rpcsvc_request_program_private(req);
+    if (ms == NULL) {
+        gf_msg(GF_MNT, GF_LOG_ERROR, EINVAL, NFS_MSG_MNT_STATE_NOT_FOUND,
+               "mount state not found");
+        rpcsvc_request_seterr(req, SYSTEM_ERR);
+        return -1;
+    }
+
+    /* Using the children translator names, build the export list.
+     *
+     * An empty (NULL) list is deliberately not treated as an error: the
+     * list can be empty when no subvolumes have come up, and returning an
+     * error would only confuse the user.
+     */
+    elist = mnt3_xlchildren_to_exports(rpcsvc_request_service(req), ms);
+
+    /* Note how the serializer is passed to the generic reply function. */
+    mnt3svc_submit_reply(req, &elist, (mnt3_serializer)xdr_serialize_exports);
+
+    xdr_free_exports_list(elist);
+    return 0;
+}
+
+/*
+ * __mnt3udp_get_mstate() fetches the mount3_state hanging off the nfs
+ * xlator's private nfs_state.
+ * Linkage: Static
+ * Usage: Used only for UDP MOUNT codepath
+ * Returns NULL when the xlator or its private data is NULL.
+ */
+static struct mount3_state *
+__mnt3udp_get_mstate(xlator_t *nfsx)
+{
+    struct nfs_state *nfs = NULL;
+
+    if (!nfsx)
+        return NULL;
+
+    nfs = (struct nfs_state *)nfsx->private;
+    if (!nfs)
+        return NULL;
+
+    return (struct mount3_state *)nfs->mstate;
+}
+
+extern int
+glfs_resolve_at(struct glfs *, xlator_t *, inode_t *, const char *, loc_t *,
+                struct iatt *, int, int);
+
+extern struct glfs *
+glfs_new_from_ctx(glusterfs_ctx_t *);
+
+extern void
+glfs_free_from_ctx(struct glfs *);
+
+/*
+ * __mnt3udp_get_export_subdir_inode() resolves a sub-directory of an export
+ * to a referenced inode for the UDP MOUNT codepath.
+ *
+ * When the export carries a hostspec, the caller's address is first checked
+ * with mnt3_verify_auth(). On success expname receives
+ * "/<volume name><resolved path>" (at most PATH_MAX bytes are written).
+ *
+ * Returns the referenced inode, or NULL on invalid arguments, failed
+ * authentication (errno is set to EACCES), or failed resolution.
+ */
+static inode_t *
+__mnt3udp_get_export_subdir_inode(struct svc_req *req, char *subdir,
+                                  char *expname, /* OUT */
+                                  struct mnt3_export *exp)
+{
+    glfs_t *fs = NULL;
+    inode_t *inode = NULL;
+    int ret = -1;
+    struct iatt buf = {
+        0,
+    };
+    loc_t loc = {
+        0,
+    };
+
+    if (req == NULL || subdir == NULL || expname == NULL || exp == NULL)
+        return NULL;
+
+    /* AUTH check for subdir i.e. nfs.export-dir */
+    if (exp->hostspec) {
+        struct sockaddr_in *sin = NULL;
+
+#if !defined(_TIRPC_SVC_H)
+        sin = svc_getcaller(req->rq_xprt);
+#else
+        sin = (struct sockaddr_in *)svc_getcaller(req->rq_xprt);
+        /* TIRPC's svc_getcaller() returns a pointer to a
+         * sockaddr_in6, even though it might actually be an
+         * IPv4 address. It ought return a struct sockaddr and
+         * make the caller upcast it to the proper address family.
+         */
+#endif
+        /* And let's make sure that it's actually an IPv4 address. */
+        GF_ASSERT(sin->sin_family == AF_INET);
+
+        if (mnt3_verify_auth(sin, exp)) {
+            gf_msg(GF_MNT, GF_LOG_ERROR, EACCES, NFS_MSG_AUTH_VERIFY_FAILED,
+                   "AUTH(nfs.export-dir) verification failed");
+            errno = EACCES;
+            return NULL;
+        }
+    }
+
+    /*
+     * IMP: glfs_t fs object is not used by glfs_resolve_at (). The main
+     * purpose is to not change the ABI of glfs_resolve_at () and not to
+     * pass a NULL object.
+     *
+     * TODO: Instead of linking against libgfapi.so, just for one API
+     * i.e. glfs_resolve_at(), It would be cleaner if PATH name to
+     * inode resolution code can be moved to libglusterfs.so or so.
+     * refer bugzilla for more details :
+     * https://bugzilla.redhat.com/show_bug.cgi?id=1161573
+     */
+    fs = glfs_new_from_ctx(exp->vol->ctx);
+    if (fs == NULL)
+        return NULL;
+
+    ret = glfs_resolve_at(fs, exp->vol, NULL, subdir, &loc, &buf,
+                          1 /* Follow link */, 0 /* Hard lookup */);
+
+    glfs_free_from_ctx(fs);
+
+    if (ret == 0) {
+        /* Take our own reference before the loc is wiped below. */
+        inode = inode_ref(loc.inode);
+        snprintf(expname, PATH_MAX, "/%s%s", exp->vol->name, loc.path);
+    }
+
+    loc_wipe(&loc);
+
+    return inode;
+}
+
+/*
+ * __mnt3udp_get_export_volume_inode() looks up the inode for a whole-volume
+ * mount on the UDP MOUNT codepath.
+ *
+ * The lookup path is volpath from its first '/' onwards, or "/" when volpath
+ * contains no '/'. expname receives "/<volume name>" (at most PATH_MAX bytes
+ * are written), whether or not the inode lookup succeeds.
+ *
+ * Returns the result of inode_from_path(), or NULL on invalid arguments.
+ */
+static inode_t *
+__mnt3udp_get_export_volume_inode(struct svc_req *req, char *volpath,
+                                  char *expname, /* OUT */
+                                  struct mnt3_export *exp)
+{
+    inode_t *inode = NULL;
+    char *slash = NULL;
+
+    if (req == NULL || volpath == NULL || expname == NULL || exp == NULL)
+        return NULL;
+
+    slash = strchr(volpath, '/');
+
+    inode = inode_from_path(exp->vol->itable, slash ? slash : "/");
+    snprintf(expname, PATH_MAX, "/%s", exp->vol->name);
+
+    return inode;
+}
+
+/*
+ * nfs3_rootfh() is used for NFS MOUNT over UDP i.e. mountudpproc3_mnt_3_svc().
+ * Especially in mount3udp_thread() THREAD. Gluster NFS starts this thread
+ * when nfs.mount-udp is ENABLED (set to TRUE/ON).
+ *
+ * It maps the requested mount path to an export, runs the volume-started
+ * and rpc-auth checks, resolves the inode being mounted and builds a file
+ * handle for it.
+ *
+ * @req    : The SunRPC service request of the UDP MOUNT call
+ * @nfsx   : The nfs xlator
+ * @path   : The mount path requested by the client
+ * @expname: OUT buffer filled with the export name by the inode helpers
+ *
+ * Returns a file handle allocated with GF_CALLOC() on success (presumably
+ * to be freed by the caller -- confirm against the callers), or NULL on
+ * failure with errno set to one of EFAULT, EACCES, ENOENT or ENOMEM.
+ */
+struct nfs3_fh *
+nfs3_rootfh(struct svc_req *req, xlator_t *nfsx, char *path,
+            char *expname /* OUT */)
+{
+    struct nfs3_fh *fh = NULL;
+    inode_t *inode = NULL;
+    struct mnt3_export *exp = NULL;
+    struct mount3_state *ms = NULL;
+    struct nfs_state *nfs = NULL;
+    int mnt3type = MNT3_EXPTYPE_DIR;
+    int ret = RPCSVC_AUTH_REJECT;
+
+    if ((!req) || (!nfsx) || (!path) || (!expname)) {
+        errno = EFAULT;
+        return NULL;
+    }
+
+    /*
+     * 1. First check if the MOUNT is for whole volume.
+     *      i.e. __mnt3udp_get_export_volume_inode ()
+     * 2. If NOT, then TRY for SUBDIR MOUNT.
+     *      i.e. __mnt3udp_get_export_subdir_inode ()
+     * 3. If a subdir is exported using nfs.export-dir,
+     *      then the mount type would be MNT3_EXPTYPE_DIR,
+     *      so make sure to find the proper path to be
+     *      resolved using mnt3_get_volume_subdir()
+     * 4. Make sure subdir export is allowed.
+     */
+    ms = __mnt3udp_get_mstate(nfsx);
+    if (!ms) {
+        errno = EFAULT;
+        return NULL;
+    }
+
+    exp = mnt3_mntpath_to_export(ms, path, _gf_false);
+    if (exp != NULL)
+        mnt3type = exp->exptype;
+
+    if (mnt3type == MNT3_EXPTYPE_DIR) {
+        char volname[MNTPATHLEN] = {
+            0,
+        };
+        char *volptr = volname;
+
+        /* Subdir export (nfs3.export-dirs) check */
+        if (!gf_mnt3_export_dirs(ms)) {
+            errno = EACCES;
+            return NULL;
+        }
+
+        path = mnt3_get_volume_subdir(path, &volptr);
+        if (exp == NULL)
+            exp = mnt3_mntpath_to_export(ms, volname, _gf_false);
+    }
+
+    if (exp == NULL) {
+        errno = ENOENT;
+        return NULL;
+    }
+
+    nfs = (struct nfs_state *)nfsx->private;
+    if (!nfs_subvolume_started(nfs, exp->vol)) {
+        errno = ENOENT;
+        return NULL;
+    }
+
+    /* AUTH check: respect nfs.rpc-auth-allow/reject */
+    ret = mnt3_check_client_net_udp(req, exp->vol->name, nfsx);
+    if (ret == RPCSVC_AUTH_REJECT) {
+        errno = EACCES;
+        return NULL;
+    }
+
+    switch (mnt3type) {
+        case MNT3_EXPTYPE_VOLUME:
+            inode = __mnt3udp_get_export_volume_inode(req, path, expname, exp);
+            break;
+
+        case MNT3_EXPTYPE_DIR:
+            inode = __mnt3udp_get_export_subdir_inode(req, path, expname, exp);
+            break;
+
+        default:
+            /* Never reachable */
+            gf_msg(GF_MNT, GF_LOG_ERROR, EFAULT, NFS_MSG_UNKNOWN_MNT_TYPE,
+                   "Unknown MOUNT3 type");
+            errno = EFAULT;
+            goto err;
+    }
+
+    if (inode == NULL) {
+        /* Don't over-write errno.
+         * NOTE(review): errno is not cleared before the lookups above, so
+         * a stale value from an earlier call may be reported here.
+         */
+        if (!errno)
+            errno = ENOENT;
+        goto err;
+    }
+
+    /* Build the FH from the inode */
+    fh = GF_CALLOC(1, sizeof(*fh), gf_nfs_mt_nfs3_fh);
+    if (fh == NULL) {
+        errno = ENOMEM;
+        goto err;
+    }
+
+    (void)nfs3_build_fh(inode, exp->volumeid, fh);
+
+err:
+    if (inode)
+        inode_unref(inode);
+
+    return fh;
+}
+
+/*
+ * Record a UDP mount of `export` by `host` in the mount list of the mount
+ * state belonging to nfsx, then rewrite the rmtab.
+ *
+ * Returns 0 on success, -1 on a missing argument, missing mount state or
+ * allocation failure.
+ */
+int
+mount3udp_add_mountlist(xlator_t *nfsx, char *host, char *export)
+{
+    struct mount3_state *mstate = NULL;
+    struct mountentry *entry = NULL;
+
+    if (!nfsx || !host || !export)
+        return -1;
+
+    mstate = __mnt3udp_get_mstate(nfsx);
+    if (mstate == NULL)
+        return -1;
+
+    entry = GF_CALLOC(1, sizeof(*entry), gf_nfs_mt_mountentry);
+    if (entry == NULL)
+        return -1;
+
+    INIT_LIST_HEAD(&entry->mlist);
+    snprintf(entry->hostname, MNTPATHLEN, "%s", host);
+    snprintf(entry->exname, MNTPATHLEN, "%s", export);
+
+    /* The list insert and the rmtab rewrite happen under mountlock. */
+    LOCK(&mstate->mountlock);
+    {
+        list_add_tail(&entry->mlist, &mstate->mountlist);
+        mount_rewrite_rmtab(mstate, NULL);
+    }
+    UNLOCK(&mstate->mountlock);
+
+    return 0;
+}
+
+/*
+ * Remove the UDP mount of `export` by `hostname` via mnt3svc_umount().
+ *
+ * Returns 0 once the unmount has been requested (the result of
+ * mnt3svc_umount() is not inspected), -1 on a missing argument or
+ * missing mount state.
+ */
+int
+mount3udp_delete_mountlist(xlator_t *nfsx, char *hostname, char *export)
+{
+    struct mount3_state *mstate = NULL;
+
+    if (!nfsx || !hostname || !export)
+        return -1;
+
+    mstate = __mnt3udp_get_mstate(nfsx);
+    if (mstate == NULL)
+        return -1;
+
+    mnt3svc_umount(mstate, export, hostname);
+
+    return 0;
+}
+
+/**
+ * This function will parse the hostip (IP address, IP range, or hostname)
+ * and fill the host_auth_spec structure.
+ *
+ * @param hostspec - struct host_auth_spec
+ * @param hostip   - IP address, IP range (CIDR format) or hostname
+ *
+ * @return 0 - on success and -1 on failure
+ *
+ * On failure hostspec->host_addr may already have been allocated; it is
+ * left in place for the caller to release together with the hostspec.
+ *
+ * NB: This does not support IPv6 currently.
+ */
+int
+mnt3_export_fill_hostspec(struct host_auth_spec *hostspec, const char *hostip)
+{
+    char *ipdupstr = NULL;
+    char *savptr = NULL;
+    char *endptr = NULL;
+    char *ip = NULL;
+    char *token = NULL;
+    int ret = -1;
+    long prefixlen = IPv4_ADDR_SIZE; /* default */
+    uint32_t shiftbits = 0;
+    size_t length = 0;
+
+    /* Create copy of the string so that the source won't change
+     */
+    ipdupstr = gf_strdup(hostip);
+    if (NULL == ipdupstr) {
+        gf_msg(GF_MNT, GF_LOG_ERROR, ENOMEM, NFS_MSG_NO_MEMORY,
+               "Memory allocation failed");
+        goto err;
+    }
+
+    ip = strtok_r(ipdupstr, "/", &savptr);
+    if (NULL == ip) {
+        /* hostip was empty or consisted only of '/' characters:
+         * strtok_r() yields no token and strlen(NULL) would crash. */
+        gf_msg(GF_MNT, GF_LOG_ERROR, EINVAL, NFS_MSG_INVALID_ENTRY,
+               "Invalid hostname or IPv4 address: %s", hostip);
+        goto err;
+    }
+
+    /* Validate the Hostname or IPv4 address
+     * TODO: IPv6 support for subdir auth.
+     */
+    length = strlen(ip);
+    if ((!valid_ipv4_address(ip, (int)length, _gf_false)) &&
+        (!valid_host_name(ip, (int)length))) {
+        gf_msg(GF_MNT, GF_LOG_ERROR, EINVAL, NFS_MSG_INVALID_ENTRY,
+               "Invalid hostname or IPv4 address: %s", ip);
+        goto err;
+    }
+
+    hostspec->host_addr = gf_strdup(ip);
+    if (NULL == hostspec->host_addr) {
+        gf_msg(GF_MNT, GF_LOG_ERROR, ENOMEM, NFS_MSG_NO_MEMORY,
+               "Memory allocation failed");
+        goto err;
+    }
+
+    /**
+     * User provided CIDR address (xx.xx.xx.xx/n format) is split
+     * into HOST (IP addr or hostname) and network prefix(n) from
+     * which netmask would be calculated. This CIDR address may
+     * denote a single, distinct interface address or the beginning
+     * address of an entire network.
+     *
+     * e.g. the IPv4 block 192.168.100.0/24 represents the 256
+     * IPv4 addresses from 192.168.100.0 to 192.168.100.255.
+     * Therefore to check if an IP matches 192.168.100.0/24
+     * we should mask the IP with FFFFFF00 and compare it with
+     * host address part of CIDR.
+     *
+     * Refer: mask_match() in common-utils.c.
+     */
+    token = strtok_r(NULL, "/", &savptr);
+    if (token != NULL) {
+        /* strtol() only sets errno on failure, so clear it first;
+         * otherwise a stale value rejects a perfectly valid prefix. */
+        errno = 0;
+        prefixlen = strtol(token, &endptr, 10);
+        if ((errno != 0) || (*endptr != '\0') || (prefixlen < 0) ||
+            (prefixlen > IPv4_ADDR_SIZE)) {
+            gf_msg(THIS->name, GF_LOG_WARNING, EINVAL, NFS_MSG_INVALID_ENTRY,
+                   "Invalid IPv4 subnetwork mask");
+            goto err;
+        }
+    }
+
+    /*
+     * 1. Calculate the network mask address.
+     * 2. Convert it into Big-Endian format.
+     * 3. Store it in hostspec netmask.
+     *
+     * A prefix length of 0 matches every address, i.e. an all-zero mask.
+     * It is handled separately because it would otherwise shift a 32-bit
+     * value by its full width, which is undefined behaviour in C.
+     */
+    if (prefixlen == 0) {
+        hostspec->netmask = 0;
+    } else {
+        shiftbits = IPv4_ADDR_SIZE - prefixlen;
+        hostspec->netmask = htonl((uint32_t)~0 << shiftbits);
+    }
+
+    ret = 0; /* SUCCESS */
+err:
+    if (NULL != ipdupstr) {
+        GF_FREE(ipdupstr);
+    }
+    return ret;
+}
+
+/**
+ * This function will parse the AUTH parameter passed along with
+ * "export-dir" option. If AUTH parameter is present then it will be
+ * stripped from exportpath and stored in mnt3_export (exp) structure.
+ *
+ * @param exp - mnt3_export structure. Holds information needed for mount.
+ * @param exportpath - Value of "export-dir" key. Holds both export path
+ *                     and AUTH parameter for the path.
+ *                     exportpath format: <abspath>[(hostdesc[|hostspec|...])]
+ *                     The string is modified in place: it is cut at the
+ *                     opening '(' so that only <abspath> remains.
+ *
+ * @return This function will return 0 on success and -1 on failure.
+ */
+int
+mnt3_export_parse_auth_param(struct mnt3_export *exp, char *exportpath)
+{
+    char *token = NULL;
+    char *savPtr = NULL;
+    char *hostip = NULL;
+    struct host_auth_spec *host = NULL;
+    int ret = 0;
+
+    if (exportpath == NULL) {
+        gf_msg(GF_MNT, GF_LOG_ERROR, EINVAL, NFS_MSG_PARSE_HOSTSPEC_FAIL,
+               "Export path is NULL");
+        return -1;
+    }
+
+    /* Using exportpath directly in strtok_r because we want
+     * to strip off AUTH parameter from exportpath. */
+    token = strtok_r(exportpath, "(", &savPtr);
+
+    /* Get the next token, which will be the AUTH parameter. */
+    token = strtok_r(NULL, ")", &savPtr);
+
+    if (NULL == token) {
+        /* If AUTH is not present then we should return success. */
+        return 0;
+    }
+
+    /* Free any previously allocated hostspec list. hostspec is a linked
+     * list whose nodes are freed by FREE_HOSTSPEC() on the error path
+     * below; a bare GF_FREE() here would release only the head node. */
+    if (NULL != exp->hostspec) {
+        FREE_HOSTSPEC(exp);
+        exp->hostspec = NULL;
+    }
+
+    exp->hostspec = GF_CALLOC(1, sizeof(*(exp->hostspec)), gf_nfs_mt_auth_spec);
+    if (NULL == exp->hostspec) {
+        gf_msg(GF_MNT, GF_LOG_ERROR, ENOMEM, NFS_MSG_NO_MEMORY,
+               "Memory allocation failed");
+        return -1;
+    }
+
+    /* AUTH parameter can have multiple entries. For each entry
+     * a host_auth_spec structure is created. */
+    host = exp->hostspec;
+
+    /* NOTE(review): if the AUTH parameter holds only '|' separators,
+     * hostip is NULL here and the list keeps one zero-filled entry. */
+    hostip = strtok_r(token, "|", &savPtr);
+
+    /* Parse all AUTH parameters separated by '|' */
+    while (NULL != hostip) {
+        ret = mnt3_export_fill_hostspec(host, hostip);
+        if (0 != ret) {
+            gf_msg(GF_MNT, GF_LOG_WARNING, 0, NFS_MSG_PARSE_HOSTSPEC_FAIL,
+                   "Failed to parse hostspec: %s", hostip);
+            goto err;
+        }
+
+        /* Only allocate the next node once we know another entry exists. */
+        hostip = strtok_r(NULL, "|", &savPtr);
+        if (NULL == hostip) {
+            break;
+        }
+
+        host->next = GF_CALLOC(1, sizeof(*(host)), gf_nfs_mt_auth_spec);
+        if (NULL == host->next) {
+            gf_msg(GF_MNT, GF_LOG_ERROR, ENOMEM, NFS_MSG_NO_MEMORY,
+                   "Memory allocation failed");
+            goto err;
+        }
+        host = host->next;
+    }
+
+    /* In case of success return from here */
+    return 0;
+err:
+    /* In case of failure free up hostspec structure.  */
+    FREE_HOSTSPEC(exp);
+
+    return -1;
+}
+
+/**
+ * Allocate and initialise one export entry for the volume xl.
+ *
+ * exportpath will also have AUTH options (ip address, subnet address or
+ * hostname) mentioned.
+ * exportpath format: <abspath>[(hostdesc[|hostspec|...])]
+ *
+ * A NULL exportpath creates a whole-volume export named "/<volname>";
+ * otherwise a directory export named "/<volname><abspath>" is created and
+ * exportpath is modified in place by mnt3_export_parse_auth_param().
+ *
+ * @return the new export, or NULL on bad arguments or any failure.
+ */
+struct mnt3_export *
+mnt3_init_export_ent(struct mount3_state *ms, xlator_t *xl, char *exportpath,
+                     uuid_t volumeid)
+{
+    struct mnt3_export *exp = NULL;
+    int alloclen = 0;
+    int ret = -1;
+
+    if ((!ms) || (!xl))
+        return NULL;
+
+    exp = GF_CALLOC(1, sizeof(*exp), gf_nfs_mt_mnt3_export);
+    if (!exp) {
+        gf_msg(GF_MNT, GF_LOG_ERROR, ENOMEM, NFS_MSG_NO_MEMORY,
+               "Memory allocation failed");
+        return NULL;
+    }
+
+    if (NULL != exportpath) {
+        /* If exportpath is not NULL then we should check if AUTH
+         * parameter is present or not. If AUTH parameter is present
+         * then it will be stripped and stored in mnt3_export (exp)
+         * structure.
+         */
+        if (0 != mnt3_export_parse_auth_param(exp, exportpath)) {
+            gf_msg(GF_MNT, GF_LOG_ERROR, 0, NFS_MSG_PARSE_AUTH_PARAM_FAIL,
+                   "Failed to parse auth param");
+            goto err;
+        }
+    }
+
+    INIT_LIST_HEAD(&exp->explist);
+    /* Room for the leading '/', the volume name, the (already stripped)
+     * export path and the terminating NUL. */
+    if (exportpath)
+        alloclen = strlen(xl->name) + 2 + strlen(exportpath);
+    else
+        alloclen = strlen(xl->name) + 2;
+
+    exp->expname = GF_MALLOC(alloclen, gf_nfs_mt_char);
+    if (!exp->expname) {
+        gf_msg(GF_MNT, GF_LOG_ERROR, ENOMEM, NFS_MSG_NO_MEMORY,
+               "Memory allocation failed");
+        goto err;
+    }
+
+    if (exportpath) {
+        gf_msg_trace(GF_MNT, 0, "Initing dir export: %s:%s", xl->name,
+                     exportpath);
+        exp->exptype = MNT3_EXPTYPE_DIR;
+        ret = snprintf(exp->expname, alloclen, "/%s%s", xl->name, exportpath);
+    } else {
+        gf_msg_trace(GF_MNT, 0, "Initing volume export: %s", xl->name);
+        exp->exptype = MNT3_EXPTYPE_VOLUME;
+        ret = snprintf(exp->expname, alloclen, "/%s", xl->name);
+    }
+    if (ret < 0) {
+        gf_msg(xl->name, GF_LOG_ERROR, ret, NFS_MSG_SET_EXP_FAIL,
+               "Failed to set the export name");
+        goto err;
+    }
+    /* Just copy without discrimination, we'll determine whether to
+     * actually use it when a mount request comes in and a file handle
+     * needs to be built.
+     */
+    gf_uuid_copy(exp->volumeid, volumeid);
+    exp->vol = xl;
+
+    /* On success we should return from here. */
+    return exp;
+err:
+    /* On failure free exp and its members. */
+    if (NULL != exp) {
+        mnt3_export_free(exp);
+        exp = NULL;
+    }
+
+    return exp;
+}
+
+/*
+ * Create one directory export per comma-separated entry of optstr and
+ * append each to ms->exportlist.
+ *
+ * Returns 0 when every entry was added, -1 on a missing argument or as
+ * soon as one entry fails to initialise (entries added before the failure
+ * stay on the list).
+ */
+int
+__mnt3_init_volume_direxports(struct mount3_state *ms, xlator_t *xlator,
+                              char *optstr, uuid_t volumeid)
+{
+    struct mnt3_export *entry = NULL;
+    char *state = NULL;
+    char *copy = NULL;
+    char *item = NULL;
+
+    if (!ms || !xlator || !optstr)
+        return -1;
+
+    /* Tokenise a stack copy so the option string itself stays intact. */
+    copy = strdupa(optstr);
+
+    for (item = strtok_r(copy, ",", &state); item != NULL;
+         item = strtok_r(NULL, ",", &state)) {
+        entry = mnt3_init_export_ent(ms, xlator, item, volumeid);
+        if (entry == NULL) {
+            gf_msg(GF_MNT, GF_LOG_ERROR, 0, NFS_MSG_INIT_DIR_EXP_FAIL,
+                   "Failed to init dir export: %s", item);
+            return -1;
+        }
+
+        list_add_tail(&entry->explist, &ms->exportlist);
+    }
+
+    return 0;
+}
+
+/*
+ * Set up the exports of one volume (xlator) from the options dict.
+ *
+ * Unless gf_nfs_dvm_off() reports true, the volume id is read from
+ * "nfs3.<volname>.volume-id" (a missing key is an error). Directory
+ * exports listed in "nfs3.<volname>.export-dir" are then created, and a
+ * whole-volume export is added when ms->export_volumes is set.
+ *
+ * Returns 0 on success, -1 on bad arguments or any failure.
+ */
+int
+__mnt3_init_volume(struct mount3_state *ms, dict_t *opts, xlator_t *xlator)
+{
+    struct mnt3_export *newexp = NULL;
+    int ret = -1;
+    char searchstr[1024];
+    char *optstr = NULL;
+    uuid_t volumeid = {
+        0,
+    };
+
+    if ((!ms) || (!xlator) || (!opts))
+        return -1;
+
+    gf_uuid_clear(volumeid);
+    /* With DVM off no volume id is needed; volumeid stays cleared. */
+    if (gf_nfs_dvm_off(nfs_state(ms->nfsx)))
+        goto no_dvm;
+
+    ret = snprintf(searchstr, 1024, "nfs3.%s.volume-id", xlator->name);
+    if (ret < 0) {
+        gf_msg(GF_MNT, GF_LOG_ERROR, ret, NFS_MSG_SNPRINTF_FAIL,
+               "snprintf failed");
+        ret = -1;
+        goto err;
+    }
+
+    if (dict_get(opts, searchstr)) {
+        ret = dict_get_str(opts, searchstr, &optstr);
+        if (ret < 0) {
+            gf_msg(GF_MNT, GF_LOG_ERROR, ret, NFS_MSG_DICT_GET_FAILED,
+                   "Failed to read "
+                   "option: %s",
+                   searchstr);
+            ret = -1;
+            goto err;
+        }
+    } else {
+        gf_msg(GF_MNT, GF_LOG_ERROR, 0, NFS_MSG_VOLID_MISSING,
+               "DVM is on but volume-id not "
+               "given for volume: %s",
+               xlator->name);
+        ret = -1;
+        goto err;
+    }
+
+    if (optstr) {
+        ret = gf_uuid_parse(optstr, volumeid);
+        if (ret < 0) {
+            gf_msg(GF_MNT, GF_LOG_ERROR, ret, NFS_MSG_PARSE_VOL_UUID_FAIL,
+                   "Failed to parse "
+                   "volume UUID");
+            ret = -1;
+            goto err;
+        }
+    }
+
+no_dvm:
+    /* searchstr and optstr are reused for the export-dir option. */
+    ret = snprintf(searchstr, 1024, "nfs3.%s.export-dir", xlator->name);
+    if (ret < 0) {
+        gf_msg(GF_MNT, GF_LOG_ERROR, ret, NFS_MSG_SNPRINTF_FAIL,
+               "snprintf failed");
+        ret = -1;
+        goto err;
+    }
+
+    if (dict_get(opts, searchstr)) {
+        ret = dict_get_str(opts, searchstr, &optstr);
+        if (ret < 0) {
+            gf_msg(GF_MNT, GF_LOG_ERROR, ret, NFS_MSG_DICT_GET_FAILED,
+                   "Failed to read "
+                   "option: %s",
+                   searchstr);
+            ret = -1;
+            goto err;
+        }
+
+        ret = __mnt3_init_volume_direxports(ms, xlator, optstr, volumeid);
+        if (ret == -1) {
+            gf_msg(GF_MNT, GF_LOG_ERROR, 0, NFS_MSG_DIR_EXP_SETUP_FAIL,
+                   "Dir export "
+                   "setup failed for volume: %s",
+                   xlator->name);
+            goto err;
+        }
+    }
+
+    /* Whole-volume export, only when volume exports are enabled. */
+    if (ms->export_volumes) {
+        newexp = mnt3_init_export_ent(ms, xlator, NULL, volumeid);
+        if (!newexp) {
+            ret = -1;
+            goto err;
+        }
+
+        list_add_tail(&newexp->explist, &ms->exportlist);
+    }
+    ret = 0;
+
+err:
+    return ret;
+}
+
+/*
+ * Read the "nfs3.export-volumes" option and set ms->export_volumes.
+ * Volume exports stay enabled when the option is absent or unreadable.
+ *
+ * Returns 0 when the option is absent, -1 on bad arguments or a dict read
+ * failure, otherwise the result of gf_string2boolean().
+ */
+int
+__mnt3_init_volume_export(struct mount3_state *ms, dict_t *opts)
+{
+    /* On by default. */
+    gf_boolean_t enabled = _gf_true;
+    char *value = NULL;
+    int ret = -1;
+
+    if (!ms || !opts)
+        return -1;
+
+    if (!dict_get(opts, "nfs3.export-volumes")) {
+        ret = 0;
+    } else {
+        ret = dict_get_str(opts, "nfs3.export-volumes", &value);
+        if (ret < 0) {
+            gf_msg(GF_MNT, GF_LOG_ERROR, ret, NFS_MSG_DICT_GET_FAILED,
+                   "Failed to read option: nfs3.export-volumes");
+            ret = -1;
+        } else {
+            ret = gf_string2boolean(value, &enabled);
+            if (ret < 0)
+                gf_msg(GF_MNT, GF_LOG_ERROR, ret, NFS_MSG_STR2BOOL_FAIL,
+                       "Failed to convert string to boolean");
+        }
+    }
+
+    /* The flag is applied on every path, including the failure ones. */
+    if (enabled == _gf_false) {
+        gf_msg_trace(GF_MNT, 0, "Volume exports disabled");
+        ms->export_volumes = 0;
+    } else {
+        gf_msg_trace(GF_MNT, 0, "Volume exports enabled");
+        ms->export_volumes = 1;
+    }
+
+    return ret;
+}
+
+/*
+ * Read the "nfs3.export-dirs" option and set ms->export_dirs.
+ * Directory exports stay enabled when the option is absent or unreadable.
+ *
+ * Returns 0 when the option is absent, -1 on bad arguments or a dict read
+ * failure, otherwise the result of gf_string2boolean().
+ */
+int
+__mnt3_init_dir_export(struct mount3_state *ms, dict_t *opts)
+{
+    /* On by default. */
+    gf_boolean_t enabled = _gf_true;
+    char *value = NULL;
+    int ret = -1;
+
+    if (!ms || !opts)
+        return -1;
+
+    if (!dict_get(opts, "nfs3.export-dirs")) {
+        ret = 0;
+    } else {
+        ret = dict_get_str(opts, "nfs3.export-dirs", &value);
+        if (ret < 0) {
+            gf_msg(GF_MNT, GF_LOG_ERROR, ret, NFS_MSG_DICT_GET_FAILED,
+                   "Failed to read option: nfs3.export-dirs");
+            ret = -1;
+        } else {
+            ret = gf_string2boolean(value, &enabled);
+            if (ret < 0)
+                gf_msg(GF_MNT, GF_LOG_ERROR, ret, NFS_MSG_STR2BOOL_FAIL,
+                       "Failed to convert string to boolean");
+        }
+    }
+
+    /* The flag is applied on every path, including the failure ones. */
+    if (enabled == _gf_false) {
+        gf_msg_trace(GF_MNT, 0, "Dir exports disabled");
+        ms->export_dirs = 0;
+    } else {
+        gf_msg_trace(GF_MNT, 0, "Dir exports enabled");
+        ms->export_dirs = 1;
+    }
+
+    return ret;
+}
+
+/*
+ * Apply the export-related options to the mount state: first the global
+ * volume/dir export switches, then the per-volume setup for every child
+ * of the NFS translator.
+ *
+ * Returns 0 on success, -1 on bad arguments, or the negative result of
+ * the first volume whose initialisation fails.
+ */
+int
+mnt3_init_options(struct mount3_state *ms, dict_t *options)
+{
+    xlator_list_t *child = NULL;
+    int ret = -1;
+
+    if (!ms || !options)
+        return -1;
+
+    /* The results of the two global switches are not checked. */
+    __mnt3_init_volume_export(ms, options);
+    __mnt3_init_dir_export(ms, options);
+
+    for (child = ms->nfsx->children; child != NULL; child = child->next) {
+        gf_msg_trace(GF_MNT, 0, "Initing options for: %s",
+                     child->xlator->name);
+        ret = __mnt3_init_volume(ms, options, child->xlator);
+        if (ret < 0) {
+            gf_msg(GF_MNT, GF_LOG_ERROR, ret, NFS_MSG_VOL_INIT_FAIL,
+                   "Volume init failed");
+            return ret;
+        }
+    }
+
+    return 0;
+}
+
+/*
+ * Allocate the MOUNT3 state for the NFS translator nfsx and populate its
+ * export list from the translator options.
+ *
+ * Returns the new state, or NULL on a NULL nfsx, allocation failure or
+ * option initialisation failure. On failure nothing is left allocated.
+ */
+struct mount3_state *
+mnt3_init_state(xlator_t *nfsx)
+{
+    struct mount3_state *ms = NULL;
+    struct mnt3_export *exp = NULL;
+    struct mnt3_export *tmp = NULL;
+    int ret = -1;
+
+    if (!nfsx)
+        return NULL;
+
+    ms = GF_CALLOC(1, sizeof(*ms), gf_nfs_mt_mount3_state);
+    if (!ms) {
+        gf_msg(GF_MNT, GF_LOG_ERROR, ENOMEM, NFS_MSG_NO_MEMORY,
+               "Memory allocation failed");
+        return NULL;
+    }
+
+    ms->iobpool = nfsx->ctx->iobuf_pool;
+    ms->nfsx = nfsx;
+    INIT_LIST_HEAD(&ms->exportlist);
+    ret = mnt3_init_options(ms, nfsx->options);
+    if (ret < 0) {
+        gf_msg(GF_MNT, GF_LOG_ERROR, ret, NFS_MSG_OPT_INIT_FAIL,
+               "Options init failed");
+        /* Exports queued before the failure are still on the list;
+         * release them and the state itself instead of leaking both. */
+        list_for_each_entry_safe(exp, tmp, &ms->exportlist, explist)
+        {
+            list_del(&exp->explist);
+            mnt3_export_free(exp);
+        }
+        GF_FREE(ms);
+        return NULL;
+    }
+
+    INIT_LIST_HEAD(&ms->mountlist);
+    LOCK_INIT(&ms->mountlock);
+
+    return ms;
+}
+
+/*
+ * Create the mount state shared by MOUNT1 and MOUNT3 and store it in the
+ * NFS translator's private state.
+ *
+ * Returns 0 on success, -1 when nfsx or its private state is missing or
+ * the mount state could not be created.
+ */
+int
+mount_init_state(xlator_t *nfsx)
+{
+    int ret = -1;
+    struct nfs_state *nfs = NULL;
+
+    if (!nfsx)
+        goto out;
+
+    nfs = (struct nfs_state *)nfs_state(nfsx);
+    /* Same guard as mnt3svc_init()/mnt3svc_deinit(): without the NFS
+     * private state there is nowhere to store the mount state. */
+    if (!nfs)
+        goto out;
+
+    /*Maintaining global state for MOUNT1 and MOUNT3*/
+    nfs->mstate = mnt3_init_state(nfsx);
+    if (!nfs->mstate) {
+        gf_msg(GF_NFS, GF_LOG_ERROR, ENOMEM, NFS_MSG_NO_MEMORY,
+               "Failed to allocate mount state");
+        goto out;
+    }
+    ret = 0;
+out:
+    return ret;
+}
+
+/* Procedure table for the MOUNT3 program, one entry per MOUNT3_* procedure:
+ * name, handler, a NULL slot, procedure number and DRC_NA.
+ * The first entry omits the trailing initializer that the others set to 0;
+ * C zero-initializes it, so all entries end up equivalent in that field.
+ */
+static rpcsvc_actor_t mnt3svc_actors[MOUNT3_PROC_COUNT] = {
+    {
+        "NULL",
+        mnt3svc_null,
+        NULL,
+        MOUNT3_NULL,
+        DRC_NA,
+    },
+    {"MNT", mnt3svc_mnt, NULL, MOUNT3_MNT, DRC_NA, 0},
+    {"DUMP", mnt3svc_dump, NULL, MOUNT3_DUMP, DRC_NA, 0},
+    {"UMNT", mnt3svc_umnt, NULL, MOUNT3_UMNT, DRC_NA, 0},
+    {"UMNTALL", mnt3svc_umntall, NULL, MOUNT3_UMNTALL, DRC_NA, 0},
+    {"EXPORT", mnt3svc_export, NULL, MOUNT3_EXPORT, DRC_NA, 0}};
+
+/* Static init parts are assigned here, dynamic ones are done in
+ * mnt3svc_init and mnt3_init_state (mnt3svc_init sets .private to the
+ * mount state).
+ * MOUNT3 is made a synctask so that the blocking DNS calls during rpc auth
+ * get offloaded to syncenv, keeping the main/poll thread unblocked.
+ */
+static rpcsvc_program_t mnt3prog = {
+    .progname = "MOUNT3",
+    .prognum = MOUNT_PROGRAM,
+    .progver = MOUNT_V3,
+    .progport = GF_MOUNTV3_PORT,
+    .actors = mnt3svc_actors,
+    .numactors = MOUNT3_PROC_COUNT,
+    .min_auth = AUTH_NULL,
+    .synctask = _gf_true,
+};
+
+/**
+ * __mnt3_mounted_exports_walk -- Walk through the mounted export directories
+ *                                and unmount the directories that are no
+ *                                longer authorized to be mounted.
+ * @dict: The dict to walk
+ * @key : The key we are on; parsed as "<path>:<host ip>"
+ * @val : The value associated with that key (unused)
+ * @tmp : Additional params (pointer to an auth params struct passed here)
+ *
+ * @return: Always 0, so the dict walk carries on with the next key.
+ */
+int
+__mnt3_mounted_exports_walk(dict_t *dict, char *key, data_t *val, void *tmp)
+{
+    char *path = NULL;
+    char *host_addr_ip = NULL;
+    char *host_addr_fqdn = NULL;
+    char *keydup = NULL;
+    char *colon = NULL;
+    struct mnt3_auth_params *auth_params = NULL;
+    int ret = 0;
+    int auth_status_code = 0;
+
+    gf_msg_trace(GF_MNT, 0, "Checking if key %s is authorized.", key);
+
+    auth_params = (struct mnt3_auth_params *)tmp;
+
+    /* Since we haven't obtained a lock around the mount dict
+     * here, we want to duplicate the key and then process it.
+     * Otherwise we would potentially have a race condition
+     * by modifying the key in the dict when other threads
+     * are accessing it.
+     */
+    keydup = strdupa(key);
+
+    colon = strchr(keydup, ':');
+    if (!colon)
+        return 0;
+
+    *colon = '\0';
+
+    /* Room for the leading '/', the path and the terminating NUL. */
+    path = alloca(strlen(keydup) + 2);
+    snprintf(path, strlen(keydup) + 2, "/%s", keydup);
+
+    /* Host is one character after ':' */
+    host_addr_ip = colon + 1;
+
+    /* Check if the IP is authorized */
+    auth_status_code = mnt3_auth_host(auth_params, host_addr_ip, NULL, path,
+                                      FALSE, NULL);
+    if (auth_status_code == 0) {
+        goto out;
+    }
+
+    ret = gf_get_hostname_from_ip(host_addr_ip, &host_addr_fqdn);
+    if (ret != 0) {
+        gf_msg(GF_MNT, GF_LOG_DEBUG, 0, NFS_MSG_AUTH_ERROR,
+               "Authorization failed for IP [%s], but name "
+               "resolution also failed!",
+               host_addr_ip);
+        goto unmount;
+    }
+
+    /* If not, check if the FQDN is authorized */
+    gf_msg(GF_MNT, GF_LOG_DEBUG, 0, NFS_MSG_AUTH_ERROR,
+           "Authorization failed for IP [%s], attempting to"
+           " auth hostname [%s]...",
+           host_addr_ip, host_addr_fqdn);
+
+    auth_status_code = mnt3_auth_host(auth_params, host_addr_fqdn, NULL, path,
+                                      FALSE, NULL);
+    if (auth_status_code == 0) {
+        gf_msg(GF_MNT, GF_LOG_DEBUG, 0, NFS_MSG_AUTH_ERROR,
+               "Authorization succeeded for "
+               "Client [IP=%s, Hostname=%s].",
+               host_addr_ip, host_addr_fqdn);
+        goto out;
+    }
+
+unmount:
+    /* host_addr_fqdn is still NULL when name resolution failed above;
+     * passing NULL for a %s conversion is undefined behaviour, so log a
+     * placeholder instead. */
+    gf_msg(GF_MNT, GF_LOG_ERROR, 0, NFS_MSG_AUTH_ERROR,
+           "Client [IP=%s, Hostname=%s] not authorized for this mount. "
+           "Unmounting!",
+           host_addr_ip, host_addr_fqdn ? host_addr_fqdn : "<unresolved>");
+    mnt3svc_umount(auth_params->ms, path, host_addr_ip);
+out:
+    GF_FREE(host_addr_fqdn);
+    return 0;
+}
+
+/**
+ * _mnt3_invalidate_old_mounts -- Runs __mnt3_mounted_exports_walk over every
+ *                                entry of the mount dict; the walk checks
+ *                                whether each host is still authorized and
+ *                                unmounts those that are not.
+ *
+ * @ms: The mountstate for this service that holds all the information we need
+ *      (the mount dict to walk and the auth params handed to the walker)
+ *
+ */
+void
+_mnt3_invalidate_old_mounts(struct mount3_state *ms)
+{
+    gf_msg_debug(GF_MNT, 0, "Invalidating old mounts ...");
+    dict_foreach(ms->mountdict, __mnt3_mounted_exports_walk, ms->auth_params);
+}
+
+/**
+ * _mnt3_has_file_changed -- Tells whether a file's mtime differs from the
+ *                           one recorded by the caller.
+ *
+ * @path: The path of the file on disk
+ * @oldmtime: IN/OUT, the previously seen mtime; updated to the current
+ *            mtime whenever a change is reported
+ *
+ * @return: _gf_true  if the mtime differs from *oldmtime
+ *          _gf_false otherwise, including when an argument is NULL or the
+ *                    mtime cannot be read
+ *
+ * Uses get_file_mtime () in common-utils.c
+ */
+gf_boolean_t
+_mnt3_has_file_changed(const char *path, time_t *oldmtime)
+{
+    time_t current = {0};
+    gf_boolean_t differs = _gf_false;
+
+    GF_VALIDATE_OR_GOTO(GF_MNT, path, done);
+    GF_VALIDATE_OR_GOTO(GF_MNT, oldmtime, done);
+
+    if (get_file_mtime(path, &current) < 0)
+        goto done;
+
+    if (current == *oldmtime)
+        goto done;
+
+    /* Remember the new timestamp so the next call compares against it. */
+    *oldmtime = current;
+    differs = _gf_true;
+done:
+    return differs;
+}
+
+/**
+ * _mnt_auth_param_refresh_thread - Started from mnt3svc_init ().
+ *                                  Reloads exports/netgroups
+ *                                  files from disk and sets the auth params
+ *                                  structure in the mount state to reflect
+ *                                  any changes from disk.
+ * @argv: The mount state (struct mount3_state *) to refresh
+ * @return: Always returns NULL
+ *
+ * The loop ends once mstate->stop_refresh is set (see mnt3svc_deinit ()).
+ */
+void *
+_mnt3_auth_param_refresh_thread(void *argv)
+{
+    struct mount3_state *mstate = (struct mount3_state *)argv;
+    char *exp_file_path = NULL;
+    char *ng_file_path = NULL;
+    size_t nbytes = 0;
+    time_t exp_time = 0;
+    time_t ng_time = 0;
+    gf_boolean_t any_file_changed = _gf_false;
+    int ret = 0;
+
+    /* Work on stack copies of the two configured file paths. */
+    nbytes = strlen(exports_file_path) + 1;
+    exp_file_path = alloca(nbytes);
+    snprintf(exp_file_path, nbytes, "%s", exports_file_path);
+
+    nbytes = strlen(netgroups_file_path) + 1;
+    ng_file_path = alloca(nbytes);
+    snprintf(ng_file_path, nbytes, "%s", netgroups_file_path);
+
+    /* Set the initial timestamps to avoid reloading right after
+     * mnt3svc_init () spawns this thread */
+    get_file_mtime(exp_file_path, &exp_time);
+    get_file_mtime(ng_file_path, &ng_time);
+
+    while (_gf_true) {
+        if (mstate->stop_refresh)
+            break;
+        any_file_changed = _gf_false;
+
+        /* Sleep before checking the file again */
+        sleep(mstate->nfs->auth_refresh_time_secs);
+
+        /* A stop may have been requested while we slept. Bail out
+         * before reloading files or unmounting clients, so shutdown
+         * is not delayed by (or raced with) one more refresh pass. */
+        if (mstate->stop_refresh)
+            break;
+
+        if (_mnt3_has_file_changed(exp_file_path, &exp_time)) {
+            gf_msg(GF_MNT, GF_LOG_INFO, 0, NFS_MSG_UPDATING_EXP,
+                   "File %s changed, updating exports,", exp_file_path);
+
+            ret = mnt3_auth_set_exports_auth(mstate->auth_params,
+                                             exp_file_path);
+            if (ret)
+                gf_msg(GF_MNT, GF_LOG_ERROR, 0, NFS_MSG_SET_EXP_AUTH_PARAM_FAIL,
+                       "Failed to set export auth params.");
+            else
+                any_file_changed = _gf_true;
+        }
+
+        if (_mnt3_has_file_changed(ng_file_path, &ng_time)) {
+            gf_msg(GF_MNT, GF_LOG_INFO, 0, NFS_MSG_UPDATING_NET_GRP,
+                   "File %s changed,"
+                   "updating netgroups",
+                   ng_file_path);
+
+            ret = mnt3_auth_set_netgroups_auth(mstate->auth_params,
+                                               ng_file_path);
+            if (ret)
+                gf_msg(GF_MNT, GF_LOG_ERROR, 0, NFS_MSG_SET_NET_GRP_FAIL,
+                       "Failed to set netgroup auth params.");
+            else
+                any_file_changed = _gf_true;
+        }
+
+        /* If no files changed, go back to sleep */
+        if (!any_file_changed)
+            continue;
+
+        gf_msg(GF_MNT, GF_LOG_INFO, 0, NFS_MSG_PURGING_AUTH_CACHE,
+               "Purging auth cache.");
+        auth_cache_purge(mstate->authcache);
+
+        /* Walk through mounts that are no longer authorized
+         * and unmount them on the server side. This will
+         * cause subsequent file ops to fail with access denied.
+         */
+        _mnt3_invalidate_old_mounts(mstate);
+    }
+
+    return NULL;
+}
+
+/**
+ * _mnt3_init_auth_params -- Initialize authentication parameters by allocating
+ *                           the struct and setting the exports & netgroups
+ *                           files as parameters.
+ *
+ * @mstate : The mount state we are going to set the auth parameters in it.
+ *
+ * @return : success: 0 for success
+ *           failure: -EINVAL for bad args, -ENOMEM for allocation errors, < 0
+ *                    for other errors (parsing the files, etc.) These are
+ *                    bubbled up from the functions we call to set the params.
+ */
+int
+_mnt3_init_auth_params(struct mount3_state *mstate)
+{
+    int ret = -EINVAL;
+    char *exp_file_path = NULL;
+    char *ng_file_path = NULL;
+    size_t nbytes = 0;
+
+    GF_VALIDATE_OR_GOTO(GF_MNT, mstate, out);
+
+    mstate->auth_params = mnt3_auth_params_init(mstate);
+    if (!mstate->auth_params) {
+        gf_msg(GF_MNT, GF_LOG_ERROR, ENOMEM, NFS_MSG_NO_MEMORY,
+               "Failed to init mount auth params.");
+        ret = -ENOMEM;
+        goto out;
+    }
+
+    /* Hand the setters a stack copy of each configured file path. */
+    nbytes = strlen(exports_file_path) + 1;
+    exp_file_path = alloca(nbytes);
+    snprintf(exp_file_path, nbytes, "%s", exports_file_path);
+
+    ret = mnt3_auth_set_exports_auth(mstate->auth_params, exp_file_path);
+    if (ret < 0) {
+        gf_msg(GF_MNT, GF_LOG_ERROR, ret, NFS_MSG_SET_EXP_AUTH_PARAM_FAIL,
+               "Failed to set export auth params.");
+        goto out;
+    }
+
+    nbytes = strlen(netgroups_file_path) + 1;
+    ng_file_path = alloca(nbytes);
+    snprintf(ng_file_path, nbytes, "%s", netgroups_file_path);
+
+    ret = mnt3_auth_set_netgroups_auth(mstate->auth_params, ng_file_path);
+    if (ret < 0) {
+        /* Use the netgroup message id, as the refresh thread does for
+         * the same failure, rather than the exports one. */
+        gf_msg(GF_MNT, GF_LOG_ERROR, ret, NFS_MSG_SET_NET_GRP_FAIL,
+               "Failed to set netgroup auth params.");
+        goto out;
+    }
+
+    ret = 0;
+out:
+    return ret;
+}
+
+/**
+ * mnt3svc_deinit -- Function called by the nfs translator to cleanup all state
+ *
+ * @nfsx : The NFS translator used to perform the cleanup
+ *         This structure holds all the pointers to memory that we need to free
+ *         as well as the threads that have been started.
+ *
+ * Does nothing when nfsx, its private state or the mount state is missing.
+ */
+void
+mnt3svc_deinit(xlator_t *nfsx)
+{
+    struct mount3_state *mstate = NULL;
+    struct nfs_state *nfs = NULL;
+
+    if (!nfsx || !nfsx->private)
+        return;
+
+    nfs = (struct nfs_state *)nfsx->private;
+    mstate = (struct mount3_state *)nfs->mstate;
+    /* The mount state is absent if its initialisation failed; there is
+     * nothing to tear down then, and dereferencing it would crash. */
+    if (!mstate)
+        return;
+
+    if (nfs->refresh_auth) {
+        /* Mark as true and wait for thread to exit */
+        mstate->stop_refresh = _gf_true;
+        pthread_join(mstate->auth_refresh_thread, NULL);
+    }
+
+    if (nfs->exports_auth)
+        mnt3_auth_params_deinit(mstate->auth_params);
+
+    /* Unmount everything and clear mountdict */
+    LOCK(&mstate->mountlock);
+    {
+        __mnt3svc_umountall(mstate);
+        dict_unref(mstate->mountdict);
+        /* Do not leave a dangling pointer to the released dict. */
+        mstate->mountdict = NULL;
+    }
+    UNLOCK(&mstate->mountlock);
+}
+
+rpcsvc_program_t *
+mnt3svc_init(xlator_t *nfsx)
+{
+    struct mount3_state *mstate = NULL;
+    struct nfs_state *nfs = NULL;
+    dict_t *options = NULL;
+    char *portstr = NULL;
+    int ret = -1;
+    pthread_t udp_thread;
+
+    /* Initialise the Mount v3 service for @nfsx: mount dict, optional exports
+     * authentication (params, cache, refresh thread), the TCP listener on
+     * GF_MOUNTV3_PORT and, if enabled, the UDP thread. Returns &mnt3prog on
+     * success and NULL on failure. */
+    if (!nfsx || !nfsx->private)
+        return NULL;
+
+    nfs = (struct nfs_state *)nfsx->private;
+
+    gf_msg_debug(GF_MNT, 0, "Initing Mount v3 state");
+    mstate = (struct mount3_state *)nfs->mstate;
+    if (!mstate) {
+        gf_msg(GF_MNT, GF_LOG_ERROR, 0, NFS_MSG_MNT_STATE_INIT_FAIL,
+               "Mount v3 state init failed");
+        goto err;
+    }
+
+    mstate->nfs = nfs;
+
+    mstate->mountdict = dict_new();
+    if (!mstate->mountdict) {
+        gf_msg(GF_MNT, GF_LOG_ERROR, ENOMEM, NFS_MSG_NO_MEMORY,
+               "Failed to setup mount dict. Allocation error.");
+        goto err;
+    }
+
+    if (nfs->exports_auth) {
+        ret = _mnt3_init_auth_params(mstate);
+        if (ret < 0)
+            goto err;
+
+        mstate->authcache = auth_cache_init(nfs->auth_cache_ttl_sec);
+        if (!mstate->authcache) {
+            ret = -ENOMEM;
+            goto err;
+        }
+
+        mstate->stop_refresh = _gf_false; /* Allow thread to run */
+        ret = gf_thread_create(&mstate->auth_refresh_thread, NULL,
+                               _mnt3_auth_param_refresh_thread, mstate,
+                               "nfsauth");
+        if (ret) {
+            gf_msg_debug(GF_MNT, 0, "Thread creation failed");
+        }
+        /* A failed thread start is only logged; initialisation continues. */
+    } else
+        gf_msg(GF_MNT, GF_LOG_INFO, 0, NFS_MSG_EXP_AUTH_DISABLED,
+               "Exports auth has been disabled!");
+
+    mnt3prog.private = mstate;
+    options = dict_new();
+    if (options == NULL) {
+        gf_msg(GF_NFS, GF_LOG_ERROR, ENOMEM, NFS_MSG_GFID_DICT_CREATE_FAIL,
+               "dict allocation failed");
+        goto err;
+    }
+
+    ret = gf_asprintf(&portstr, "%d", GF_MOUNTV3_PORT);
+    if (ret == -1)
+        goto err;
+
+    ret = dict_set_dynstr(options, "transport.socket.listen-port", portstr);
+    if (ret < 0)
+        goto err;
+
+    ret = dict_set_str(options, "transport-type", "socket");
+    if (ret < 0) {
+        gf_msg(GF_NFS, GF_LOG_ERROR, errno, NFS_MSG_DICT_SET_FAILED,
+               "dict_set_str error");
+        goto err;
+    }
+
+    if (nfs->allow_insecure) {
+        ret = dict_set_str(options, "rpc-auth-allow-insecure", "on");
+        if (ret >= 0)
+            ret = dict_set_str(options, "rpc-auth.ports.insecure", "on");
+        if (ret < 0) {
+            gf_msg(GF_NFS, GF_LOG_ERROR, errno, NFS_MSG_DICT_SET_FAILED,
+                   "dict_set_str error");
+            goto err;
+        }
+    }
+
+    ret = rpcsvc_create_listeners(nfs->rpcsvc, options, nfsx->name);
+    if (ret == -1) {
+        gf_msg(GF_NFS, GF_LOG_ERROR, errno, NFS_MSG_LISTENERS_CREATE_FAIL,
+               "Unable to create listeners");
+        goto err;
+    }
+
+    if (nfs->mount_udp) {
+        ret = gf_thread_create(&udp_thread, NULL, mount3udp_thread, nfsx,
+                               "nfsudp");
+        if (ret) {
+            gf_msg_debug(GF_MNT, 0, "Thread creation failed");
+        }
+    }
+    if (options)
+        dict_unref(options);
+
+    return &mnt3prog;
+err:
+    if (options)
+        dict_unref(options);
+    return NULL;
+}
+
+static rpcsvc_actor_t mnt1svc_actors[MOUNT1_PROC_COUNT] = {
+    {"NULL", mnt3svc_null, NULL, MOUNT1_NULL, DRC_NA, 0}, /* v3 handler */
+    {"MNT", NULL, NULL, MOUNT1_MNT, DRC_NA, 0}, /* no handler registered */
+    {"DUMP", mnt3svc_dump, NULL, MOUNT1_DUMP, DRC_NA, 0}, /* v3 handler */
+    {"UMNT", mnt3svc_umnt, NULL, MOUNT1_UMNT, DRC_NA, 0}, /* v3 handler */
+    {"UMNTALL", NULL, NULL, MOUNT1_UMNTALL, DRC_NA, 0}, /* no handler */
+    {"EXPORT", mnt3svc_export, NULL, MOUNT1_EXPORT, DRC_NA, 0}};
+
+static rpcsvc_program_t mnt1prog = {
+    .progname = "MOUNT1",
+    .prognum = MOUNT_PROGRAM,
+    .progver = MOUNT_V1,
+    .progport = GF_MOUNTV1_PORT,
+    .actors = mnt1svc_actors,
+    .numactors = MOUNT1_PROC_COUNT,
+    .min_auth = AUTH_NULL,
+    .synctask = _gf_true,
+};
+
+rpcsvc_program_t *
+mnt1svc_init(xlator_t *nfsx)
+{
+    struct mount3_state *mstate = NULL;
+    struct nfs_state *nfs = NULL;
+    rpcsvc_program_t *prog = NULL;
+    dict_t *options = NULL;
+    char *portstr = NULL;
+    int ret = -1;
+
+    /* Register the Mount v1 program for @nfsx on a TCP listener bound to
+     * GF_MOUNTV1_PORT. Returns &mnt1prog on success and NULL on failure; the
+     * temporary options dict is released on every exit path. */
+    if (!nfsx || !nfsx->private)
+        return NULL;
+
+    nfs = (struct nfs_state *)nfsx->private;
+    gf_msg_debug(GF_MNT, 0, "Initing Mount v1 state");
+    mstate = (struct mount3_state *)nfs->mstate;
+    if (!mstate) {
+        gf_msg(GF_MNT, GF_LOG_ERROR, EINVAL, NFS_MSG_MNT_STATE_INIT_FAIL,
+               "Mount v3 state init failed");
+        goto err;
+    }
+
+    mnt1prog.private = mstate;
+
+    options = dict_new();
+    if (options == NULL) {
+        gf_msg(GF_NFS, GF_LOG_ERROR, ENOMEM, NFS_MSG_GFID_DICT_CREATE_FAIL,
+               "dict allocation failed");
+        goto err;
+    }
+
+    ret = gf_asprintf(&portstr, "%d", GF_MOUNTV1_PORT);
+    if (ret == -1)
+        goto err;
+
+    ret = dict_set_dynstr(options, "transport.socket.listen-port", portstr);
+    if (ret < 0)
+        goto err;
+    ret = dict_set_str(options, "transport-type", "socket");
+    if (ret < 0) {
+        gf_msg(GF_NFS, GF_LOG_ERROR, errno, NFS_MSG_DICT_SET_FAILED,
+               "dict_set_str error");
+        goto err;
+    }
+
+    if (nfs->allow_insecure) {
+        ret = dict_set_str(options, "rpc-auth-allow-insecure", "on");
+        if (ret >= 0)
+            ret = dict_set_str(options, "rpc-auth.ports.insecure", "on");
+        if (ret < 0) {
+            gf_msg(GF_NFS, GF_LOG_ERROR, errno, NFS_MSG_DICT_SET_FAILED,
+                   "dict_set_str error");
+            goto err;
+        }
+    }
+
+#ifdef IPV6_DEFAULT
+    ret = dict_set_str(options, "transport.address-family", "inet6");
+    if (ret < 0) {
+        gf_log(GF_NFS, GF_LOG_ERROR,
+               "dict_set_str error when trying to enable ipv6");
+        goto err;
+    }
+#endif
+
+    ret = rpcsvc_create_listeners(nfs->rpcsvc, options, nfsx->name);
+    if (ret == -1) {
+        gf_msg(GF_NFS, GF_LOG_ERROR, errno, NFS_MSG_LISTENERS_CREATE_FAIL,
+               "Unable to create listeners");
+        goto err;
+    }
+
+    prog = &mnt1prog;
+err:
+    if (options)
+        dict_unref(options);
+    return prog;
+}
+
+int
+mount_reconfigure_state(xlator_t *nfsx, dict_t *options)
+{
+    struct mnt3_export *cur = NULL;
+    struct mnt3_export *next = NULL;
+    struct mount3_state *mstate = NULL;
+    struct nfs_state *nfs_priv = NULL;
+    int status = -1;
+
+    /* Both the xlator and the new option set are mandatory. */
+    if (nfsx == NULL || options == NULL)
+        return -1;
+
+    nfs_priv = (struct nfs_state *)nfs_state(nfsx);
+    if (nfs_priv == NULL)
+        return -1;
+
+    mstate = nfs_priv->mstate;
+    if (mstate == NULL)
+        return -1;
+
+    /*
+     * Drop every export currently on the list, then let
+     * mnt3_init_options() build a fresh list from @options.
+     * Both steps run with mountlock held to avoid races.
+     */
+    LOCK(&mstate->mountlock);
+    list_for_each_entry_safe(cur, next, &mstate->exportlist, explist)
+    {
+        list_del(&cur->explist);
+        mnt3_export_free(cur);
+    }
+    status = mnt3_init_options(mstate, options);
+    UNLOCK(&mstate->mountlock);
+
+    if (status >= 0)
+        return 0;
+
+    gf_msg(GF_MNT, GF_LOG_ERROR, status, NFS_MSG_RECONF_FAIL,
+           "Options reconfigure failed");
+    return -1;
+}
diff --git a/xlators/nfs/server/src/mount3.h b/xlators/nfs/server/src/mount3.h
new file mode 100644
index 00000000000..b185df835ae
--- /dev/null
+++ b/xlators/nfs/server/src/mount3.h
@@ -0,0 +1,188 @@
+/*
+  Copyright (c) 2010-2011 Gluster, Inc. <http://www.gluster.com>
+  This file is part of GlusterFS.
+
+  This file is licensed to you under your choice of the GNU Lesser
+  General Public License, version 3 or any later version (LGPLv3 or
+  later), or the GNU General Public License, version 2 (GPLv2), in all
+  cases as published by the Free Software Foundation.
+*/
+
+#ifndef _MOUNT3_H_
+#define _MOUNT3_H_
+
+#include "rpcsvc.h"
+#include <glusterfs/dict.h>
+#include <glusterfs/xlator.h>
+#include <glusterfs/iobuf.h>
+#include "nfs.h"
+#include <glusterfs/list.h>
+#include "xdr-nfs3.h"
+#include <glusterfs/locking.h>
+#include "nfs3-fh.h"
+#include <glusterfs/compat-uuid.h>
+#include "exports.h"
+#include "mount3-auth.h"
+#include "auth-cache.h"
+
+/* Registered with portmap */
+#define GF_MOUNTV3_PORT 38465
+#define GF_MOUNTV3_IOB (2 * GF_UNIT_KB)
+#define GF_MOUNTV3_IOBPOOL (GF_MOUNTV3_IOB * 50)
+
+#define GF_MOUNTV1_PORT 38466
+#define GF_MNT GF_NFS "-mount"
+
+extern rpcsvc_program_t *
+mnt3svc_init(xlator_t *nfsx);
+
+extern rpcsvc_program_t *
+mnt1svc_init(xlator_t *nfsx);
+
+extern void
+mnt3svc_deinit(xlator_t *nfsx);
+
+extern int
+mount_init_state(xlator_t *nfsx);
+
+extern int
+mount_reconfigure_state(xlator_t *nfsx, dict_t *options);
+
+void
+mount_rewrite_rmtab(struct mount3_state *ms, char *new_rmtab);
+
+struct mnt3_export *
+mnt3_mntpath_to_export(struct mount3_state *ms, const char *dirpath,
+                       gf_boolean_t export_parsing_match);
+
+extern int
+mnt3svc_update_mountlist(struct mount3_state *ms, rpcsvc_request_t *req,
+                         const char *expname, const char *fullpath);
+
+int
+mnt3_authenticate_request(struct mount3_state *ms, rpcsvc_request_t *req,
+                          struct nfs3_fh *fh, const char *volname,
+                          const char *path, char **authorized_path,
+                          char **authorized_host, gf_boolean_t is_write_op);
+
+/* Data structure used to store the list of mount points currently
+ * in use by NFS clients.
+ */
+struct mountentry {
+    /* Links to mount3_state->mountlist.  */
+    struct list_head mlist;
+
+    /* The export name */
+    char exname[MNTPATHLEN];
+    char hostname[MNTPATHLEN]; /* presumably the client host - TODO confirm */
+    char fullpath[MNTPATHLEN];
+
+    gf_boolean_t has_full_path; /* presumably: fullpath is set - TODO confirm */
+
+    /* Since this is stored in a dict, we want to be able
+     * to easily get back the key we used to store
+     * the struct in our dict. Sized for two MNTPATHLEN strings + 2 bytes.
+     */
+    char hashkey[MNTPATHLEN * 2 + 2];
+};
+
+#define MNT3_EXPTYPE_VOLUME 1
+#define MNT3_EXPTYPE_DIR 2
+
+/* Structure to hold export-dir AUTH parameter */
+struct host_auth_spec {
+    char *host_addr;             /* Allowed IP or host name */
+    uint32_t netmask;            /* Network mask (Big-Endian) */
+    struct host_auth_spec *next; /* Pointer to next AUTH struct */
+};
+
+struct mnt3_export {
+    struct list_head explist;
+
+    /* The string that may contain either the volume name if the full volume
+     * is exported or the subdirectory in the volume.
+     */
+    char *expname;
+    /*
+     * IP address, hostname or subnets who are allowed to connect to expname
+     * subvolume or subdirectory
+     */
+    struct host_auth_spec *hostspec;
+    xlator_t *vol;
+    int exptype;
+
+    /* This holds the full path that the client requested including
+     * the volume name AND the subdirectory in the volume.
+     */
+    char *fullpath;
+
+    /* Extracted from nfs volume options if nfs.dynamicvolumes is on.
+     */
+    uuid_t volumeid;
+    uuid_t mountid;
+};
+
+struct mount3_state {
+    xlator_t *nfsx; /* presumably the owning NFS xlator - TODO confirm */
+
+    /* The NFS state that this belongs to */
+    struct nfs_state *nfs;
+
+    /* The buffers for all network IO are taken from this pool. */
+    struct iobuf_pool *iobpool;
+
+    /* List of exports, can be volumes or directories in those volumes. */
+    struct list_head exportlist;
+
+    /* List of current mount points over all the exports from this
+     * server.
+     */
+    struct list_head mountlist;
+
+    /* Dict of current mount points over all the exports from this
+     * server. Mirrors the mountlist above, but can be used for
+     * faster lookup in the event that there are several mounts.
+     * Currently, each NFSOP is validated against this dict: each
+     * op is checked to see if the host that operates on the path
+     * does in fact have an entry in the mount dict.
+     */
+    dict_t *mountdict;
+
+    /* Used to protect the mountlist & the mount dict */
+    gf_lock_t mountlock; /* also held while the export list is rebuilt */
+
+    /* Used to insert additional authentication parameters */
+    struct mnt3_auth_params *auth_params;
+
+    /* Set to 0 if exporting full volumes is disabled. On by default. */
+    gf_boolean_t export_volumes;
+    gf_boolean_t export_dirs; /* read through gf_mnt3_export_dirs () */
+
+    pthread_t auth_refresh_thread; /* started by mnt3svc_init () */
+    gf_boolean_t stop_refresh;     /* set by mnt3svc_deinit () before join */
+
+    struct auth_cache *authcache; /* from auth_cache_init () in mnt3svc_init */
+};
+
+#define gf_mnt3_export_dirs(mst) ((mst)->export_dirs)
+
+struct mount3_resolve_state {
+    struct mnt3_export *exp;
+    struct mount3_state *mstate;
+    rpcsvc_request_t *req;
+
+    char remainingdir[MNTPATHLEN];
+    loc_t resolveloc;
+    struct nfs3_fh parentfh;
+};
+
+typedef struct mount3_resolve_state mnt3_resolve_t;
+
+int
+mnt3_parse_dir_exports(rpcsvc_request_t *req, struct mount3_state *ms,
+                       char *subdir, gf_boolean_t send_reply);
+
+char *
+mnt3_get_volume_subdir(char *path, char **volname);
+
+#endif
diff --git a/xlators/nfs/server/src/mount3udp_svc.c b/xlators/nfs/server/src/mount3udp_svc.c
new file mode 100644
index 00000000000..1a2b0f85453
--- /dev/null
+++ b/xlators/nfs/server/src/mount3udp_svc.c
@@ -0,0 +1,238 @@
+/*
+  Copyright (c) 2012 Gluster, Inc. <http://www.gluster.com>
+  This file is part of GlusterFS.
+
+  This file is licensed to you under your choice of the GNU Lesser
+  General Public License, version 3 or any later version (LGPLv3 or
+  later), or the GNU General Public License, version 2 (GPLv2), in all
+  cases as published by the Free Software Foundation.
+*/
+
+#include "xdr-nfs3.h"
+#include <glusterfs/logging.h>
+#include <glusterfs/mem-pool.h>
+#include "nfs-mem-types.h"
+#include "nfs-messages.h"
+#include "mount3.h"
+#include <stdio.h>
+#include <stdlib.h>
+#include <rpc/pmap_clnt.h>
+#include <string.h>
+#include <memory.h>
+#include <sys/socket.h>
+#include <netinet/in.h>
+
+extern struct nfs3_fh *
+nfs3_rootfh(struct svc_req *req, xlator_t *nfsx, char *dp, char *expname);
+
+extern mountres3
+mnt3svc_set_mountres3(mountstat3 stat, struct nfs3_fh *fh, int *authflavor,
+                      u_int aflen);
+extern int
+mount3udp_add_mountlist(xlator_t *nfsx, char *host, char *expname);
+
+extern int
+mount3udp_delete_mountlist(xlator_t *nfsx, char *host, char *expname);
+
+extern mountstat3
+mnt3svc_errno_to_mnterr(int32_t errnum);
+
+/* only this thread will use this, no locking needed */
+char mnthost[INET_ADDRSTRLEN + 1];
+
+#define MNT3UDP_AUTH_LEN 1 /* Only AUTH_UNIX for now */
+
+mountres3 *
+mountudpproc3_mnt_3_svc(dirpath **dpp, struct svc_req *req)
+{
+    struct mountres3 *res = NULL;
+    int *autharr = NULL;
+    struct nfs3_fh *fh = NULL;
+    char *mpath = NULL;
+    xlator_t *nfsx = THIS;
+    char expname[PATH_MAX] = {
+        0,
+    };
+    mountstat3 stat = MNT3ERR_SERVERFAULT;
+    /* UDP MNT handler: get the root fh for *dpp and build the reply. */
+    errno = 0; /* RESET errno */
+    /* Skip every leading '/' of the requested path. */
+    mpath = (char *)*dpp;
+    while (*mpath == '/')
+        mpath++;
+
+    res = GF_CALLOC(1, sizeof(*res), gf_nfs_mt_mountres3);
+    if (res == NULL) {
+        gf_msg(GF_MNT, GF_LOG_ERROR, ENOMEM, NFS_MSG_NO_MEMORY,
+               "Unable to allocate memory");
+        goto err;
+    }
+    autharr = GF_CALLOC(MNT3UDP_AUTH_LEN, sizeof(int), gf_nfs_mt_int);
+    if (autharr == NULL) {
+        gf_msg(GF_MNT, GF_LOG_ERROR, ENOMEM, NFS_MSG_NO_MEMORY,
+               "Unable to allocate memory");
+        goto err;
+    }
+
+    autharr[0] = AUTH_UNIX;
+    /* presumably nfs3_rootfh () fills expname on success - TODO confirm */
+    fh = nfs3_rootfh(req, nfsx, mpath, (char *)expname);
+
+    /* FAILURE: No FH */
+    if (fh == NULL) {
+        gf_msg(GF_MNT, GF_LOG_ERROR, errno, NFS_MSG_GET_FH_FAIL,
+               "Unable to get fh for %s", mpath);
+        if (errno) /* otherwise stat stays MNT3ERR_SERVERFAULT */
+            stat = mnt3svc_errno_to_mnterr(errno);
+        *res = mnt3svc_set_mountres3(stat, NULL /* fh */, autharr,
+                                     MNT3UDP_AUTH_LEN);
+        return res;
+    }
+
+    /* SUCCESS */
+    stat = MNT3_OK;
+    *res = mnt3svc_set_mountres3(stat, fh, autharr, MNT3UDP_AUTH_LEN);
+    (void)mount3udp_add_mountlist(nfsx, mnthost, (char *)expname);
+    return res; /* mountudp_program_3 () frees the reply after sending it */
+
+err:
+    GF_FREE(fh); /* fh is still NULL on every path that jumps here */
+    GF_FREE(res);
+    GF_FREE(autharr);
+    return NULL;
+}
+
+mountstat3 *
+mountudpproc3_umnt_3_svc(dirpath **dp, struct svc_req *req)
+{
+    xlator_t *this_nfs = THIS;
+    char *path = (char *)*dp;
+    mountstat3 *reply = GF_MALLOC(sizeof(mountstat3), gf_nfs_mt_mountstat3);
+
+    /* UDP UMNT: always replies MNT3_OK; list removal is best effort. */
+    if (reply != NULL) {
+        *reply = MNT3_OK;
+        (void)mount3udp_delete_mountlist(this_nfs, mnthost, path);
+        return reply;
+    }
+    gf_msg(GF_MNT, GF_LOG_ERROR, ENOMEM, NFS_MSG_NO_MEMORY,
+           "Unable to allocate memory");
+    return NULL;
+}
+
+static void
+mountudp_program_3(struct svc_req *rqstp, register SVCXPRT *transp)
+{
+    union {
+        dirpath mountudpproc3_mnt_3_arg;
+    } argument;
+    char *result = NULL;
+    xdrproc_t _xdr_argument = NULL, _xdr_result = NULL;
+    char *(*local)(char *, struct svc_req *) = NULL;
+    mountres3 *res = NULL;
+    struct sockaddr_in *sin = NULL;
+    /* Dispatcher for MOUNT v3 over UDP: handles NULLPROC, MNT and UMNT. */
+#if !defined(_TIRPC_SVC_H)
+    sin = svc_getcaller(transp);
+#else
+    sin = (struct sockaddr_in *)svc_getcaller(transp);
+    /* TIRPC's svc_getcaller() returns a pointer to a sockaddr_in6, even
+     * though it might actually be an IPv4 address. It ought to return a
+     * struct sockaddr and make the caller upcast it to the proper
+     * address family. Sigh.
+     */
+#endif
+    /* And let's make sure that it's actually an IPv4 address. */
+    GF_ASSERT(sin->sin_family == AF_INET);
+    /* Record the caller's address in mnthost for the MNT/UMNT handlers. */
+    inet_ntop(AF_INET, &sin->sin_addr, mnthost, INET_ADDRSTRLEN + 1);
+
+    switch (rqstp->rq_proc) {
+        case NULLPROC:
+            (void)svc_sendreply(transp, (xdrproc_t)xdr_void, (char *)NULL);
+            return;
+
+        case MOUNT3_MNT:
+            _xdr_argument = (xdrproc_t)xdr_dirpath;
+            _xdr_result = (xdrproc_t)xdr_mountres3;
+            local = (char *(*)(char *,
+                               struct svc_req *))mountudpproc3_mnt_3_svc;
+            break;
+
+        case MOUNT3_UMNT:
+            _xdr_argument = (xdrproc_t)xdr_dirpath;
+            _xdr_result = (xdrproc_t)xdr_mountstat3;
+            local = (char *(*)(char *,
+                               struct svc_req *))mountudpproc3_umnt_3_svc;
+            break;
+
+        default: /* every other procedure number is rejected */
+            svcerr_noproc(transp);
+            return;
+    }
+    memset((char *)&argument, 0, sizeof(argument));
+    if (!svc_getargs(transp, (xdrproc_t)_xdr_argument, (caddr_t)&argument)) {
+        svcerr_decode(transp);
+        return;
+    }
+    result = (*local)((char *)&argument, rqstp);
+    if (result == NULL) { /* handler could not allocate its reply */
+        gf_msg_debug(GF_MNT, 0, "PROC returned error");
+        svcerr_systemerr(transp);
+    }
+    if (result != NULL &&
+        !svc_sendreply(transp, (xdrproc_t)_xdr_result, result)) {
+        gf_msg(GF_MNT, GF_LOG_ERROR, 0, NFS_MSG_SVC_ERROR,
+               "svc_sendreply returned error");
+        svcerr_systemerr(transp);
+    }
+    if (!svc_freeargs(transp, (xdrproc_t)_xdr_argument, (caddr_t)&argument)) {
+        gf_msg(GF_MNT, GF_LOG_ERROR, 0, NFS_MSG_ARG_FREE_FAIL,
+               "Unable to free arguments");
+    }
+    if (result == NULL)
+        return;
+    /* free the result */
+    switch (rqstp->rq_proc) {
+        case MOUNT3_MNT: /* reply struct plus the fh and flavor arrays */
+            res = (mountres3 *)result;
+            GF_FREE(res->mountres3_u.mountinfo.fhandle.fhandle3_val);
+            GF_FREE(res->mountres3_u.mountinfo.auth_flavors.auth_flavors_val);
+            GF_FREE(res);
+            break;
+
+        case MOUNT3_UMNT: /* a single heap-allocated mountstat3 */
+            GF_FREE(result);
+            break;
+    }
+    return;
+}
+
+void *
+mount3udp_thread(void *argv)
+{
+    xlator_t *nfsx = argv;
+    register SVCXPRT *transp = NULL;
+
+    /* Thread entry: serve MOUNT v3 over UDP for @argv; always returns NULL. */
+    GF_ASSERT(nfsx);
+    THIS = nfsx;
+
+    transp = svcudp_create(RPC_ANYSOCK);
+    if (transp == NULL) {
+        gf_msg(GF_MNT, GF_LOG_ERROR, 0, NFS_MSG_SVC_ERROR,
+               "svcudp_create error");
+        return NULL;
+    }
+    if (!svc_register(transp, MOUNT_PROGRAM, MOUNT_V3, mountudp_program_3,
+                      IPPROTO_UDP)) {
+        gf_msg(GF_MNT, GF_LOG_ERROR, 0, NFS_MSG_SVC_ERROR,
+               "svc_register error");
+        svc_destroy(transp); /* release the transport and its socket */
+        return NULL;
+    }
+    svc_run();
+    gf_msg(GF_MNT, GF_LOG_ERROR, 0, NFS_MSG_SVC_RUN_RETURNED,
+           "svc_run returned");
+    return NULL;
+}
diff --git a/xlators/nfs/server/src/netgroups.c b/xlators/nfs/server/src/netgroups.c
new file mode 100644
index 00000000000..f68a7f14a26
--- /dev/null
+++ b/xlators/nfs/server/src/netgroups.c
@@ -0,0 +1,1161 @@
+/*
+   Copyright 2014-present Facebook. All Rights Reserved
+
+   This file is part of GlusterFS.
+
+   Author :
+   Shreyas Siravara <shreyas.siravara@gmail.com>
+
+   This file is licensed to you under your choice of the GNU Lesser
+   General Public License, version 3 or any later version (LGPLv3 or
+   later), or the GNU General Public License, version 2 (GPLv2),in all
+   cases as published by the Free Software Foundation.
+*/
+
+#include "netgroups.h"
+#include <glusterfs/parse-utils.h>
+#include "nfs-messages.h"
+
+static void
+_nge_print(const struct netgroup_entry *nge);
+static void
+_netgroup_entry_deinit(struct netgroup_entry *ptr);
+static void
+_netgroup_host_deinit(struct netgroup_host *host);
+
+static dict_t *__deleted_entries;
+static struct parser *ng_file_parser;
+static struct parser *ng_host_parser;
+
+/**
+ * _ng_init_parsers -- Initialize the parsers used in this file
+ *
+ * @return: success: 0 (on success both parsers are initialized)
+ *          failure: -1 (a parser that was already built is released)
+ */
+static int
+_ng_init_parsers()
+{
+    int ret = -1;
+
+    /* parser_init () should only fail on allocation errors or when a
+     * regex in netgroups.h no longer compiles. */
+    ng_file_parser = parser_init(NG_FILE_PARSE_REGEX);
+    if (!ng_file_parser)
+        goto out;
+
+    ng_host_parser = parser_init(NG_HOST_PARSE_REGEX);
+    if (!ng_host_parser) {
+        parser_deinit(ng_file_parser);
+        ng_file_parser = NULL;
+        goto out;
+    }
+
+    ret = 0;
+out:
+    return ret;
+}
+
+/* _ng_deinit_parsers - Free the parsers used in this file and reset the
+ * globals so a stale pointer is never handed to parser_deinit () again. */
+static void
+_ng_deinit_parsers()
+{
+    parser_deinit(ng_file_parser);
+    parser_deinit(ng_host_parser);
+    ng_file_parser = ng_host_parser = NULL;
+}
+
+/**
+ * _netgroups_file_init - create an empty netgroup file struct
+ *
+ * @return: success: pointer to a freshly allocated struct whose filename
+ *                   and ng_file_dict members are both NULL
+ *          failure: NULL
+ * Not for external use.
+ */
+static struct netgroups_file *
+_netgroups_file_init()
+{
+    struct netgroups_file *ngf = NULL;
+
+    ngf = GF_MALLOC(sizeof(*ngf), gf_common_mt_nfs_netgroups);
+    if (ngf) {
+        ngf->filename = NULL;
+        ngf->ng_file_dict = NULL;
+    }
+
+    return ngf;
+}
+
+/**
+ * __ngf_free_walk - walk the netgroup file dict and free each element
+ *
+ * This is passed as a function pointer to dict_foreach ()
+ *
+ * @dict: the dict we are walking
+ * @key : the key we are processing in the dict
+ * @val : the corresponding value in the dict (a struct netgroup_entry)
+ * @tmp : Pointer to additional data that may be passed in (not used)
+ *
+ * @return : Always 0
+ *
+ * Not for external use.
+ */
+static int
+__ngf_free_walk(dict_t *dict, char *key, data_t *val, void *tmp)
+{
+    struct netgroup_entry *nge = NULL;
+
+    if (val) {
+        nge = (struct netgroup_entry *)val->data;
+        _netgroup_entry_deinit(nge);
+        val->data = NULL; /* the entry is gone; drop the stale pointer */
+        dict_del(dict, key); /* Remove the key from this dict */
+    }
+    return 0;
+}
+
+/**
+ * __deleted_entries_free_walk - remove each key from the temporary dict
+ *
+ * This is passed as a function pointer to dict_foreach ()
+ *
+ * @dict: the dict we are walking
+ * @key : the key we are processing in the dict
+ * @val : the corresponding value in the dict (not used)
+ * @tmp : Pointer to additional data that may be passed in (not used)
+ *
+ * @return : Always 0
+ *
+ * Not for external use.
+ */
+static int
+__deleted_entries_free_walk(dict_t *dict, char *key, data_t *val, void *tmp)
+{
+    dict_del(dict, key);
+    return 0;
+}
+
+/**
+ * ng_file_deinit - Free the netgroup file struct and any memory
+ * that is allocated for its members.
+ *
+ * @ngfile : Pointer to the netgroup file structure that needs to be freed
+ * @return : Nothing
+ *
+ * External facing function.
+ *
+ * Should be called by the caller of ng_file_parse () in order to free
+ * the memory allocated when parsing the file.
+ */
+void
+ng_file_deinit(struct netgroups_file *ngfile)
+{
+    if (!ngfile) {
+        return;
+    }
+    /* File-scope scratch dict of entry names that were already freed. */
+    __deleted_entries = dict_new();
+    GF_VALIDATE_OR_GOTO(GF_NG, __deleted_entries, out);
+    /* NOTE(review): presumably nothing is freed if dict_new () failed. */
+    GF_FREE(ngfile->filename);
+    dict_foreach(ngfile->ng_file_dict, __ngf_free_walk, NULL);
+    dict_unref(ngfile->ng_file_dict);
+    GF_FREE(ngfile);
+
+    /* Clean up temporary dict we used to store "freed" names */
+    dict_foreach(__deleted_entries, __deleted_entries_free_walk, NULL);
+    dict_unref(__deleted_entries);
+    __deleted_entries = NULL;
+out:
+    return;
+}
+
+/**
+ * _netgroup_entry_init - Allocate a zero-filled netgroup entry struct.
+ * One netgroup entry struct stands for one line of a netgroups file.
+ *
+ * @return : success: Pointer to the new netgroup entry struct
+ *         : failure: NULL
+ *
+ * Not for external use.
+ *
+ */
+static struct netgroup_entry *
+_netgroup_entry_init()
+{
+    return GF_CALLOC(1, sizeof(struct netgroup_entry),
+                     gf_common_mt_nfs_netgroups);
+}
+
+/**
+ * __ngh_free_walk - walk the netgroup host dict and free the host
+ * structure associated with the key.
+ *
+ * This is passed as a function pointer to dict_foreach ()
+ *
+ * @dict: the dict we are walking
+ * @key : the key we are processing in the dict
+ * @val : the corresponding value in the dict (a struct netgroup_host)
+ * @tmp : Pointer to additional data that may be passed in (not used)
+ *
+ * @return : Always 0
+ *
+ * Not for external use.
+ */
+static int
+__ngh_free_walk(dict_t *dict, char *key, data_t *val, void *tmp)
+{
+    struct netgroup_host *ngh = NULL;
+
+    if (val) {
+        ngh = (struct netgroup_host *)val->data;
+        _netgroup_host_deinit(ngh);
+        val->data = NULL; /* the host is gone; drop the stale pointer */
+        dict_del(dict, key);
+    }
+    return 0;
+}
+
+/**
+ * __nge_free_walk - walk the netgroup entry dict and free the netgroup entry
+ * structure associated with the key, unless its name is already recorded
+ * in __deleted_entries. The key is removed from the dict either way.
+ * This is passed as a function pointer to dict_foreach ()
+ *
+ * @dict: the dict we are walking
+ * @key : the key we are processing in the dict
+ * @val : the corresponding value in the dict (a struct netgroup_entry)
+ * @tmp : Pointer to additional data that may be passed in (not used)
+ *
+ * @return : Always 0
+ *
+ * Not for external use.
+ */
+static int
+__nge_free_walk(dict_t *dict, char *key, data_t *val, void *tmp)
+{
+    struct netgroup_entry *nge = NULL;
+
+    GF_VALIDATE_OR_GOTO(GF_NG, dict, out);
+
+    if (val) {
+        nge = (struct netgroup_entry *)val->data;
+        if (!dict_get(__deleted_entries, key)) { /* not freed yet */
+            _netgroup_entry_deinit(nge);
+            val->data = NULL;
+        }
+        dict_del(dict, key);
+    }
+
+out:
+    return 0;
+}
+
+/**
+ * _netgroup_entry_deinit - Free memory pointed to by the parameter
+ *                          and any memory allocated for members
+ *                          in the struct. This function walks the
+ *                          netgroups and hosts dicts if they
+ *                          are allocated and frees them. Nested
+ *                          entries are freed recursively.
+ * @ngentry: Pointer to a netgroup entry struct that needs to be freed
+ *           (NULL is accepted and ignored)
+ * @return : Nothing
+ *
+ * Not for external use.
+ */
+static void
+_netgroup_entry_deinit(struct netgroup_entry *ngentry)
+{
+    dict_t *ng_dict = NULL;
+    dict_t *host_dict = NULL;
+    char *name = NULL;
+    data_t *dint = NULL;
+
+    if (!ngentry)
+        return;
+
+    ng_dict = ngentry->netgroup_ngs;
+    host_dict = ngentry->netgroup_hosts;
+
+    if (ng_dict) {
+        /* Free the dict of netgroup entries */
+        dict_foreach(ng_dict, __nge_free_walk, NULL);
+        dict_unref(ng_dict);
+        ngentry->netgroup_ngs = NULL;
+    }
+
+    if (host_dict) {
+        /* Free the dict of host entries */
+        dict_foreach(host_dict, __ngh_free_walk, NULL);
+        dict_unref(host_dict);
+        ngentry->netgroup_hosts = NULL;
+    }
+
+    if (ngentry->netgroup_name) {
+        /* Keep track of the netgroup names we've deallocated
+         * We need to do this because of the nature of this data
+         * structure. This data structure may hold multiple
+         * pointers to an already freed object, but these are
+         * uniquely identifiable by the name. We keep track
+         * of these names so when we encounter a key who has
+         * an association to an already freed object, we don't
+         * free it twice. (__nge_free_walk () does that check.)
+         */
+        name = strdupa(ngentry->netgroup_name); /* stack copy of the name */
+        /* NOTE(review): neither call below has its result checked. */
+        dint = int_to_data(1);
+        dict_set(__deleted_entries, name, dint);
+
+        GF_FREE(ngentry->netgroup_name);
+        ngentry->netgroup_name = NULL;
+    }
+
+    GF_FREE(ngentry);
+}
+
+/**
+ * _netgroup_host_init - Allocate a zero-filled netgroup host structure.
+ * One netgroup host struct stands for one item on a line of a netgroups
+ * file, written as : (hostname,user,domain)
+ *
+ * @return : success: Pointer to the new netgroup host struct
+ *         : failure: NULL
+ *
+ * Not for external use.
+ *
+ */
+static struct netgroup_host *
+_netgroup_host_init()
+{
+    return GF_CALLOC(1, sizeof(struct netgroup_host),
+                     gf_common_mt_nfs_netgroups);
+}
+
+/**
+ * _netgroup_host_deinit - Free memory pointed to by the parameter
+ * and any memory allocated for members in the struct.
+ *
+ * @host   : Pointer to a netgroup host struct that needs to be freed
+ *
+ * @return : Nothing
+ *
+ * Not for external use.
+ */
+static void
+_netgroup_host_deinit(struct netgroup_host *host)
+{
+    /* Validate args */
+    GF_VALIDATE_OR_GOTO(GF_NG, host, err);
+
+    GF_FREE(host->hostname);
+    host->hostname = NULL;
+
+    GF_FREE(host->user);
+    host->user = NULL;
+
+    GF_FREE(host->domain);
+    host->domain = NULL;
+
+    GF_FREE(host); /* the three strings above belonged to this struct */
+err:
+    return;
+}
+
+/**
+ * _nge_dict_get - Fetch the netgroup entry stored under a netgroup name.
+ *
+ * @dict   : Dict to search. Its values are assumed to wrap
+ *           struct netgroup_entry objects; behaviour is undefined
+ *           for any other payload.
+ *
+ * @ngname : Netgroup name used as the lookup key.
+ *
+ * @return : success: Pointer to the matching netgroup entry
+ *           failure: NULL (bad argument or no such key in the dict)
+ *
+ * Not for external use.
+ */
+static struct netgroup_entry *
+_nge_dict_get(dict_t *dict, const char *ngname)
+{
+    struct netgroup_entry *found = NULL;
+    data_t *value = NULL;
+
+    /* Validate args */
+    GF_VALIDATE_OR_GOTO(GF_NG, dict, out);
+    GF_VALIDATE_OR_GOTO(GF_NG, ngname, out);
+
+    value = dict_get(dict, (char *)ngname);
+    if (value != NULL)
+        found = (struct netgroup_entry *)value->data;
+out:
+    return found;
+}
+
+/**
+ * _nge_dict_insert - Insert a netgroup entry into the dict using
+ *                    the netgroup name as the key.
+ *
+ * @dict   : The dict we are inserting into.
+ *
+ * @nge    : The data to insert into the dict (keyed by nge->netgroup_name).
+ *
+ * @return : nothing (a failed insert is not reported to the caller)
+ *
+ * Not for external use.
+ */
+static void
+_nge_dict_insert(dict_t *dict, struct netgroup_entry *nge)
+{
+    data_t *ngdata = NULL;
+
+    GF_VALIDATE_OR_GOTO(GF_NG, dict, err);
+    GF_VALIDATE_OR_GOTO(GF_NG, nge, err);
+
+    ngdata = bin_to_data(nge, sizeof(*nge));
+    dict_set(dict, nge->netgroup_name, ngdata); /* result is not checked */
+err:
+    return;
+}
+
+/**
+ * ngh_dict_get - Fetch the netgroup host stored under a hostname.
+ *
+ * @dict     : Dict to search. Its values are assumed to wrap
+ *             struct netgroup_host objects; behaviour is undefined
+ *             for any other payload.
+ *
+ * @hostname : Hostname used as the lookup key.
+ *
+ * @return   : success: Pointer to the matching netgroup host
+ *             failure: NULL (bad argument or no such key in the dict)
+ *
+ * Externally usable: unlike the helpers around it, this function is
+ * not declared static.
+ *
+ */
+struct netgroup_host *
+ngh_dict_get(dict_t *dict, const char *hostname)
+{
+    struct netgroup_host *match = NULL;
+    data_t *value = NULL;
+
+    GF_VALIDATE_OR_GOTO(GF_NG, dict, done);
+    GF_VALIDATE_OR_GOTO(GF_NG, hostname, done);
+
+    value = dict_get(dict, (char *)hostname);
+    if (value)
+        match = (struct netgroup_host *)value->data;
+
+done:
+    return match;
+}
+
+/**
+ * _ngh_dict_insert - Insert a netgroup host entry into the dict using
+ *                    the host's hostname as the key.
+ *
+ * @dict   : The dict we are inserting into.
+ *
+ * @ngh    : The netgroup host to insert into the dict.
+ *
+ * @return : nothing; NULL arguments make this a no-op
+ *
+ * Not for external use.
+ */
+static void
+_ngh_dict_insert(dict_t *dict, struct netgroup_host *ngh)
+{
+    /* Nothing can be inserted without both a dict and a host */
+    GF_VALIDATE_OR_GOTO(GF_NG, dict, out);
+    GF_VALIDATE_OR_GOTO(GF_NG, ngh, out);
+
+    /* Wrap the struct itself as the dict value and file it under its
+     * hostname. The outcome of dict_set () is not checked here.
+     */
+    dict_set(dict, ngh->hostname, bin_to_data(ngh, sizeof(*ngh)));
+out:
+    return;
+}
+
+/**
+ * _ngh_print - Prints the netgroup host in the
+ *              format '(hostname,user,domain)'
+ *
+ * @ngh    : The netgroup host to print out; unset fields print as ""
+ *
+ * @return : nothing
+ *
+ * Not for external use.
+ */
+static void
+_ngh_print(const struct netgroup_host *ngh)
+{
+    /* Validate args */
+    GF_VALIDATE_OR_GOTO(GF_NG, ngh, err);
+    /* Passing NULL to %s is undefined, so guard all three fields */
+    printf("(%s,%s,%s)", ngh->hostname ? ngh->hostname : "",
+           ngh->user ? ngh->user : "", ngh->domain ? ngh->domain : "");
+err:
+    return;
+}
+
+/**
+ * __nge_print_walk - dict_foreach () callback that prints the value of
+ *                    one key via _nge_print (). The values are expected
+ *                    to be of type 'struct netgroup_entry'.
+ *
+ * This is passed as a function pointer to dict_foreach ()
+ *
+ * @dict: the dict we are walking (not used)
+ * @key : the key we are processing in the dict (not used)
+ * @val : the corresponding value in the dict
+ * @tmp : Pointer to additional data that may be passed in (not used)
+ *
+ * @return : always 0
+ *
+ * Not for external use.
+ */
+static int
+__nge_print_walk(dict_t *dict, char *key, data_t *val, void *tmp)
+{
+    if (!val)
+        return 0;
+    _nge_print((struct netgroup_entry *)val->data);
+    return 0;
+}
+
+/**
+ * __ngh_print_walk - dict_foreach () callback that prints the value of
+ *                    one key via _ngh_print (). The values are expected
+ *                    to be of type 'struct netgroup_host'.
+ *
+ * This is passed as a function pointer to dict_foreach (),
+ * which is called from _nge_print ().
+ *
+ * @dict: the dict we are walking (not used)
+ * @key : the key we are processing in the dict (not used)
+ * @val : the corresponding value in the dict
+ * @tmp : Pointer to additional data that may be passed in (not used)
+ *
+ * @return : always 0
+ *
+ * Not for external use.
+ */
+static int
+__ngh_print_walk(dict_t *dict, char *key, data_t *val, void *tmp)
+{
+    if (!val)
+        return 0;
+    _ngh_print((struct netgroup_host *)val->data);
+    return 0;
+}
+
+/**
+ * _nge_print - Prints the netgroup entry in the
+ *              format '<netgroup name> <following entries>'
+ *
+ * @nge    : The netgroup entry to print out
+ *
+ * @return : nothing
+ *
+ * Not for external use.
+ */
+static void
+_nge_print(const struct netgroup_entry *nge)
+{
+    GF_VALIDATE_OR_GOTO(GF_NG, nge, out);
+
+    printf("%s ", nge->netgroup_name);
+    /* Child netgroups first; the walker calls back into _nge_print () */
+    if (nge->netgroup_ngs != NULL)
+        dict_foreach(nge->netgroup_ngs, __nge_print_walk, NULL);
+
+    /* ... then the hosts that belong to this netgroup directly */
+    if (nge->netgroup_hosts != NULL)
+        dict_foreach(nge->netgroup_hosts, __ngh_print_walk, NULL);
+out:
+    return;
+}
+
+/**
+ * __ngf_print_walk - dict_foreach () callback for the netgroups file dict.
+ *                    Prints one netgroup entry with _nge_print () and
+ *                    ends the output line.
+ *
+ * This is passed as a function pointer to dict_foreach (),
+ * which is called from ng_file_print ().
+ *
+ * @dict: the dict we are walking (not used)
+ * @key : the key we are processing in the dict (not used)
+ * @val : the corresponding value in the dict
+ * @tmp : Pointer to additional data that may be passed in (not used)
+ *
+ * @return : always 0
+ *
+ * Not for external use.
+ */
+static int
+__ngf_print_walk(dict_t *dict, char *key, data_t *val, void *tmp)
+{
+    /* Entries without a value have nothing to print */
+    if (!val)
+        return 0;
+
+    /* One netgroup entry per output line */
+    _nge_print((struct netgroup_entry *)val->data);
+    printf("\n");
+    return 0;
+}
+
+/**
+ * ng_file_print - Prints the netgroup file, one netgroup per line, in the
+ *              format '<netgroup name> <following entries>'.
+ *              The netgroup file is a dict of netgroup entries, each of
+ *              which combines other 'sub' netgroup entries and host
+ *              entries; the corresponding print functions handle those.
+ *
+ * @ngfile : The netgroup file to print out. NULL is rejected.
+ *
+ * @return : nothing
+ *
+ * External facing function.
+ */
+void
+ng_file_print(const struct netgroups_file *ngfile)
+{
+    GF_VALIDATE_OR_GOTO(GF_NG, ngfile, out); /* don't dereference NULL */
+    dict_foreach(ngfile->ng_file_dict, __ngf_print_walk, NULL);
+out:
+    return;
+}
+
+/**
+ * ng_file_get_netgroup - Look up a netgroup entry from the netgroups file
+ *                        based on the netgroup name and return a pointer
+ *                        to the netgroup entry.
+ *
+ * @ngfile   : The netgroup file to lookup from.
+ * @netgroup : The netgroup name used to lookup from the netgroup file.
+ *
+ * @return : success: Pointer to the netgroup entry
+ *           failure: NULL (NULL argument or no such netgroup in the file)
+ *
+ * External facing function.
+ *
+ * Can be called on any valid 'struct netgroups_file *' with a valid 'char *'.
+ */
+struct netgroup_entry *
+ng_file_get_netgroup(const struct netgroups_file *ngfile, const char *netgroup)
+{
+    struct netgroup_entry *entry = NULL;
+    data_t *value = NULL;
+
+    GF_VALIDATE_OR_GOTO(GF_NG, ngfile, out);
+    GF_VALIDATE_OR_GOTO(GF_NG, netgroup, out);
+
+    value = dict_get(ngfile->ng_file_dict, (char *)netgroup);
+    /* A hit carries the netgroup entry as the payload of the dict value */
+    if (value)
+        entry = (struct netgroup_entry *)value->data;
+
+out:
+    return entry;
+}
+
+/**
+ * __check_host_entry_str - Check if the host string which should be
+ *                          in the format '(host,user,domain)' is
+ *                          valid to be parsed. Currently checks
+ *                          that it has exactly two commas and no
+ *                          spaces, but more checks can be added.
+ *                          A NULL string is not valid.
+ *
+ * @host_str : String to check
+ * @return   : success: TRUE if valid
+ *             failure: FALSE if not
+ *
+ * Not for external use.
+ */
+static gf_boolean_t
+__check_host_entry_str(const char *host_str)
+{
+    unsigned int comma_count = 0;
+    const char *cur = NULL;
+    /* Start out invalid so that a NULL string is never reported valid */
+    gf_boolean_t str_valid = _gf_false;
+
+    GF_VALIDATE_OR_GOTO(GF_NG, host_str, out);
+
+    /* Walk up to the terminator; no strlen () call per iteration */
+    for (cur = host_str; *cur != '\0'; cur++) {
+        if (*cur == ',')
+            comma_count++;
+
+        /* Spaces are not allowed in this string. e.g, (a,b,c) is valid
+         * but (a, b,c) is not.
+         */
+        if (*cur == ' ')
+            goto out;
+    }
+
+    str_valid = (comma_count == 2);
+out:
+    return str_valid;
+}
+
+/**
+ * _parse_ng_host - Parse the netgroup host string into a netgroup host struct.
+ *                  The netgroup host string is structured as follows:
+ *                  (host,user,domain), without any spaces
+ * @ng_str   : String to parse
+ * @ngh      : Out parameter; receives the new netgroup host on success
+ * @return   : success: 0 if the parsing succeeded
+ *             failure: -EINVAL for bad args, -ENOMEM for allocation errors,
+ *                      1 for parsing errors.
+ *
+ * Not for external use.
+ */
+static int
+_parse_ng_host(char *ng_str, struct netgroup_host **ngh)
+{
+    struct netgroup_host *ng_host = NULL;
+    unsigned int parts = 0;
+    char *match = NULL;
+    int ret = -EINVAL;
+
+    GF_VALIDATE_OR_GOTO(GF_NG, ng_str, out);
+    GF_VALIDATE_OR_GOTO(GF_NG, ngh, out);
+
+    if (!__check_host_entry_str(ng_str)) {
+        ret = 1; /* Parse failed */
+        goto out;
+    }
+
+    ret = parser_set_string(ng_host_parser, ng_str);
+    if (ret < 0)
+        goto out;
+
+    gf_msg_trace(GF_NG, 0, "parsing host string: %s", ng_str);
+
+    ng_host = _netgroup_host_init();
+    GF_CHECK_ALLOC(ng_host, ret, free_and_out); /* Sets ret to -ENOMEM on
+                                                 * failure.
+                                                 */
+    while ((match = parser_get_next_match(ng_host_parser)) != NULL) {
+        gf_msg_trace(GF_NG, 0, "found match: %s (parts=%d)", match, parts);
+        /* NOTE(review): parts is unsigned but is logged with %d above */
+        switch (parts) {
+            case 0: /* the struct keeps the match pointer itself */
+                ng_host->hostname = match;
+                break;
+            case 1:
+                ng_host->user = match;
+                break;
+            case 2:
+                ng_host->domain = match;
+                break;
+            default: /* not reached: the loop breaks once parts > 2 */
+                GF_FREE(match);
+                break;
+        };
+
+        /* We only allow three parts in the host string;
+         * The format for the string is (a,b,c)
+         */
+        parts++;
+        if (parts > 2)
+            break;
+    }
+
+    /* Hand the host to the caller, even if fewer than 3 parts matched */
+    *ngh = ng_host;
+    ret = 0;
+
+free_and_out:
+    parser_unset_string(ng_host_parser);
+out:
+    return ret;
+}
+
+/**
+ * _ng_handle_host_part - Parse the host string that looks like this :
+ *                        '(dev1763.prn2.facebook.com,,)' into a host
+ *                        struct and insert it into the parent netgroup's
+ *                        host dict.
+ * @match : The host string
+ * @ngp   : The parent netgroup
+ *
+ * @return: success: 0 if parsing succeeded
+ *          failure: -EINVAL for bad args or a parent without a name,
+ *                   -ENOMEM if the host dict cannot be allocated, other
+ *                   errors bubbled up from _parse_ng_host.
+ *
+ * Not for external use.
+ */
+static int
+_ng_handle_host_part(char *match, struct netgroup_entry *ngp)
+{
+    struct netgroup_host *ngh = NULL;
+    int ret = -EINVAL;
+
+    GF_VALIDATE_OR_GOTO(GF_NG, match, out);
+    GF_VALIDATE_OR_GOTO(GF_NG, ngp, out);
+
+    if (!ngp->netgroup_name) {
+        gf_msg(GF_NG, GF_LOG_WARNING, EINVAL, NFS_MSG_INVALID_ENTRY,
+               "Invalid: Line starts with hostname!");
+        goto out;
+    }
+
+    /* Make the dict for the parent entry's netgroup hosts first, so that
+     * an allocation failure here cannot leak an already parsed host.
+     */
+    if (!ngp->netgroup_hosts) {
+        ngp->netgroup_hosts = dict_new();
+        GF_CHECK_ALLOC(ngp->netgroup_hosts, ret, out);
+    }
+
+    /* Parse the host string and get a struct for it */
+    ret = _parse_ng_host(match, &ngh);
+    if (ret < 0) {
+        gf_msg(GF_NG, GF_LOG_CRITICAL, -ret, NFS_MSG_PARSE_FAIL,
+               "Critical error : %s", strerror(-ret));
+        goto out;
+    }
+    if (ret != 0) {
+        /* Cannot change to gf_msg
+         * gf_msg not giving output to STDOUT
+         * Bug id : BZ1215017
+         */
+        gf_log(GF_NG, GF_LOG_WARNING, "Parse error for: %s", match);
+        goto out;
+    }
+    /* Insert this entry into the parent netgroup dict */
+    _ngh_dict_insert(ngp->netgroup_hosts, ngh);
+
+out:
+    return ret;
+}
+
+/**
+ * _ng_setup_netgroup_entry - Allocate a new netgroup entry named after the
+ *                            given string and insert it into the netgroups
+ *                            file dict. No check is made here for an
+ *                            already existing entry of that name. The
+ *                            string is used as the name without a copy.
+ *
+ * @match    : The netgroup string
+ * @file     : The netgroup file we insert the entry into
+ * @ng_entry : Double pointer to the netgroup entry we want to allocate and set.
+ *
+ * @return: success: 0 if the entry was set up
+ *          failure: -EINVAL for bad args, -ENOMEM if the entry
+ *                   cannot be allocated.
+ *
+ *
+ * Not for external use.
+ */
+static int
+_ng_setup_netgroup_entry(char *match, struct netgroups_file *file,
+                         struct netgroup_entry **ng_entry)
+{
+    struct netgroup_entry *fresh = NULL;
+    int ret = -EINVAL;
+
+    GF_VALIDATE_OR_GOTO(GF_NG, match, out);
+    GF_VALIDATE_OR_GOTO(GF_NG, file, out);
+    GF_VALIDATE_OR_GOTO(GF_NG, ng_entry, out);
+
+    fresh = _netgroup_entry_init();
+    GF_CHECK_ALLOC(fresh, ret, out);
+
+    /* The entry keeps 'match' itself as its name; no copy is made */
+    fresh->netgroup_name = match;
+
+    /* Register the entry in the file dict and hand it to the caller */
+    _nge_dict_insert(file->ng_file_dict, fresh);
+    *ng_entry = fresh;
+
+    ret = 0;
+out:
+    return ret;
+}
+
+/**
+ * _parse_ng_line - Parse a line in the netgroups file into a netgroup entry
+ *                  struct. The netgroup line is structured as follows:
+ *                  'netgroupx netgroupy (hosta,usera,domaina)...' OR
+ *                  'netgroupx netgroupy netgroupz...'  OR
+ *                  'netgroupx (hosta,usera,domaina) (hostb,userb,domainb)'
+ *                  This function parses this into a netgroup entry
+ *                  which will hold either a dict of netgroups and/or
+ *                  a dict of hosts that make up this netgroup.
+ *
+ * In general terms, the data structure to represent a netgroups file
+ * is a set of nested dictionaries. Each line in the netgroups file
+ * is compiled into a struct netgroup_entry structure that holds a dict
+ * of netgroups and a dict of hostnames. The first string in the netgroups
+ * line is the parent netgroup entry and the rest of the items in the line
+ * are the children of that parent netgroup entry. (Hence variables ngp
+ * and nge).
+ *
+ * A sample netgroup file may look like this:
+ *
+ * async async.ash3 async.ash4
+ * async.ash3 async.04.ash3
+ * async04.ash3 (async001.ash3.facebook.com,,) (async002.ash3.facebook.com,,)
+ *
+ * _parse_ng_line will get called on each line, so on the first call to this
+ * function, our data structure looks like this:
+ *
+ * dict [
+ *       'async'   --> dict [
+ *                              'async.ash3'
+ *                              'async.ash4'
+ *                          ]
+ *      ]
+ *
+ * On the second call to the function with the second line, our data structure
+ * looks like this:
+ *
+ * dict [
+ *       'async' --> dict [
+ *                              'async.ash3' -> dict [ 'async.04.ash3' ]
+ *                              'async.ash4'      ^
+ *                        ]                       |
+ *                                                |
+ *      'async.ash3' ------------------------------
+ *      ]
+ *
+ * And so on.
+ *
+ * The obvious answer to storing this file in a data structure may be a tree
+ * but lookups from a tree are expensive and since we may be looking up stuff
+ * in this file in the I/O path, we can't afford expensive lookups.
+ *
+ * @ng_str   : String to parse
+ * @file     : Netgroup file to put the parsed line into
+ * @ng_entry : Double pointer to struct that we are going to allocate and fill
+ *
+ * The string gets parsed into a structure pointed to by
+ * the parameter 'ng_entry'
+ *
+ * @return   : success: 0 if parsing succeeded
+ *             failure: a negative error code, or 1 for a parsing error
+ *
+ * Not for external use.
+ */
+static int
+_parse_ng_line(char *ng_str, struct netgroups_file *file,
+               struct netgroup_entry **ng_entry)
+{
+    struct netgroup_entry *ngp = NULL; /* Parent netgroup entry */
+    struct netgroup_entry *nge = NULL; /* Generic netgroup entry */
+    char *match = NULL;
+    int ret = -EINVAL;
+    unsigned int num_entries = 0;
+
+    /* Validate arguments */
+    GF_VALIDATE_OR_GOTO(GF_NG, ng_str, out);
+    GF_VALIDATE_OR_GOTO(GF_NG, file, out);
+    GF_VALIDATE_OR_GOTO(GF_NG, ng_entry, out);
+
+    if (*ng_str == ' ' || *ng_str == '\0' || *ng_str == '\n') {
+        ret = 0;
+        goto out;
+    }
+
+    ret = parser_set_string(ng_file_parser, ng_str);
+    if (ret < 0)
+        goto out;
+
+    /* This is the first name in the line, and should be the
+     * parent netgroup entry.
+     */
+    match = parser_get_next_match(ng_file_parser);
+    if (!match) {
+        ret = 1;
+        gf_msg(GF_NG, GF_LOG_WARNING, 0, NFS_MSG_FIND_FIRST_MATCH_FAIL,
+               "Unable to find "
+               "first match.");
+        gf_msg(GF_NG, GF_LOG_WARNING, 0, NFS_MSG_PARSE_FAIL,
+               "Error parsing str: %s", ng_str);
+        goto out;
+    }
+
+    /* Lookup to see if the match already exists,
+     * if not, set the parent.
+     */
+    ngp = _nge_dict_get(file->ng_file_dict, match);
+    if (!ngp) {
+        ret = _ng_setup_netgroup_entry(match, file, &ngp);
+        if (ret < 0) {
+            /* No entry took ownership of the name, so release it
+             * before bubbling the error up to the caller.
+             */
+            GF_FREE(match);
+            goto out;
+        }
+    } else
+        GF_FREE(match);
+
+    if (!ngp->netgroup_ngs) {
+        /* If a netgroup dict has not been allocated
+         * for this parent, allocate it.
+         */
+        ngp->netgroup_ngs = dict_new();
+        GF_CHECK_ALLOC(ngp->netgroup_ngs, ret, out);
+        /* No need to free anything here since ngp is already
+         * a part of the file. When the file gets
+         * deallocated, we will free ngp.
+         */
+    }
+
+    while ((match = parser_get_next_match(ng_file_parser)) != NULL) {
+        num_entries++;
+        /* This means that we hit a host entry in the line */
+        if (*match == '(') {
+            ret = _ng_handle_host_part(match, ngp);
+            GF_FREE(match);
+            if (ret != 0) {
+                /* If parsing the host fails, bubble the error
+                 * code up to the caller.
+                 */
+                goto out;
+            }
+        } else {
+            nge = _nge_dict_get(file->ng_file_dict, match);
+            if (!nge) {
+                ret = _ng_setup_netgroup_entry(match, file, &nge);
+                if (ret < 0) {
+                    /* No entry took ownership of the
+                     * name, so release it before bubbling
+                     * the error up to the caller.
+                     */
+                    GF_FREE(match);
+                    goto out;
+                }
+            } else
+                GF_FREE(match);
+
+            /* Insert the netgroup into the parent's dict */
+            _nge_dict_insert(ngp->netgroup_ngs, nge);
+        }
+    }
+
+    /* If there are no entries on the RHS, log an error, but continue */
+    if (!num_entries) {
+        /* Cannot change to gf_msg
+         * gf_msg not giving output to STDOUT
+         * Bug id : BZ1215017
+         */
+        gf_log(GF_NG, GF_LOG_WARNING,
+               "No netgroups were specified except for the parent.");
+    }
+
+    *ng_entry = ngp;
+    ret = 0;
+
+out:
+    parser_unset_string(ng_file_parser);
+    return ret;
+}
+
+/**
+ * ng_file_parse - Parse a netgroups file into a netgroups file struct.
+ *                 This is the external facing function that must be called
+ *                 to parse a netgroups file. This function returns a netgroup
+ *                 file struct that is allocated and must be freed using
+ *                 ng_file_deinit.
+ *
+ * @filepath : Path to the netgroups file we need to parse
+ *
+ * @return   : success: Pointer to a netgroup file struct if parsing succeeded
+ *             failure: NULL if not
+ *
+ * Externally facing function
+ */
+struct netgroups_file *
+ng_file_parse(const char *filepath)
+{
+    FILE *fp = NULL;
+    size_t len = 0;
+    ssize_t nread = 0; /* getline () returns ssize_t, -1 when it is done */
+    char *line = NULL;
+    struct netgroups_file *file = NULL;
+    struct netgroup_entry *nge = NULL;
+    int ret = 0;
+
+    GF_VALIDATE_OR_GOTO(GF_NG, filepath, err);
+
+    fp = fopen(filepath, "r");
+    if (!fp)
+        goto err;
+
+    file = _netgroups_file_init();
+    if (!file)
+        goto err;
+
+    file->ng_file_dict = dict_new();
+    if (!file->ng_file_dict) {
+        gf_msg(GF_NG, GF_LOG_CRITICAL, ENOMEM, NFS_MSG_NO_MEMORY,
+               "Failed to allocate netgroup file dict");
+        goto err;
+    }
+
+    file->filename = gf_strdup(filepath);
+    if (!file->filename) {
+        gf_msg(GF_NG, GF_LOG_CRITICAL, errno, NFS_MSG_FILE_OP_FAILED,
+               "Failed to duplicate filename");
+        goto err;
+    }
+
+    ret = _ng_init_parsers();
+    if (ret < 0)
+        goto err;
+
+    /* Read the file line-by-line and parse it */
+    while ((nread = getline(&line, &len, fp)) != -1) {
+        if (*line == '#') /* Lines starting with # are comments */
+            continue;
+
+        /* Only an allocation failure aborts the whole parse; any
+         * other bad line is skipped below. */
+        ret = _parse_ng_line(line, file, &nge);
+        if (ret == -ENOMEM) {
+            gf_msg(GF_NG, GF_LOG_CRITICAL, ENOMEM, NFS_MSG_NO_MEMORY,
+                   "Allocation error "
+                   "while parsing line!");
+            goto err;
+        }
+        if (ret != 0) {
+            gf_msg_debug(GF_NG, 0, "Failed to parse line %s", line);
+            continue;
+        }
+    }
+
+    /* line got allocated through getline(), don't use GF_FREE() for it */
+    free(line);
+
+    if (fp)
+        fclose(fp);
+
+    _ng_deinit_parsers();
+
+    return file;
+
+err:
+    if (line)
+        free(line);
+
+    if (file)
+        ng_file_deinit(file);
+
+    _ng_deinit_parsers();
+
+    if (fp)
+        fclose(fp);
+    return NULL;
+}
diff --git a/xlators/nfs/server/src/netgroups.h b/xlators/nfs/server/src/netgroups.h
new file mode 100644
index 00000000000..9c715f75d3c
--- /dev/null
+++ b/xlators/nfs/server/src/netgroups.h
@@ -0,0 +1,53 @@
+/*
+   Copyright 2014-present Facebook. All Rights Reserved
+
+   This file is part of GlusterFS.
+
+   Author :
+   Shreyas Siravara <shreyas.siravara@gmail.com>
+
+   This file is licensed to you under your choice of the GNU Lesser
+   General Public License, version 3 or any later version (LGPLv3 or
+   later), or the GNU General Public License, version 2 (GPLv2), in all
+   cases as published by the Free Software Foundation.
+*/
+
+#ifndef _NETGROUPS_H
+#define _NETGROUPS_H
+
+#include "nfs-mem-types.h"
+#include <glusterfs/dict.h>
+#include "nfs.h"
+
+#define GF_NG GF_NFS "-netgroup"
+/* Each pattern captures one maximal run of the characters it lists */
+#define NG_FILE_PARSE_REGEX "([a-zA-Z0-9.(,)-]+)"
+#define NG_HOST_PARSE_REGEX "([a-zA-Z0-9.-]+)"
+
+struct netgroup_host {
+    char *hostname; /* Host field of a '(host,user,domain)' entry */
+    char *user;     /* User field in the entry */
+    char *domain;   /* Domain field in the entry */
+};
+
+struct netgroup_entry {
+    char *netgroup_name;    /* Name of the netgroup (also its dict key) */
+    dict_t *netgroup_ngs;   /* Dict of netgroups in this netgroup */
+    dict_t *netgroup_hosts; /* Dict of hosts in this netgroup, by hostname */
+};
+
+struct netgroups_file {
+    char *filename;       /* Filename on disk */
+    dict_t *ng_file_dict; /* Dict of netgroup entries, keyed by name */
+};
+
+struct netgroups_file *
+ng_file_parse(const char *filepath);
+
+struct netgroup_entry *
+ng_file_get_netgroup(const struct netgroups_file *ngfile, const char *netgroup);
+
+void
+ng_file_deinit(struct netgroups_file *ngfile);
+
+#endif /* _NETGROUPS_H */
diff --git a/xlators/nfs/server/src/nfs-common.c b/xlators/nfs/server/src/nfs-common.c
new file mode 100644
index 00000000000..b8f6b6f318e
--- /dev/null
+++ b/xlators/nfs/server/src/nfs-common.c
@@ -0,0 +1,450 @@
+/*
+  Copyright (c) 2010-2011 Gluster, Inc. <http://www.gluster.com>
+  This file is part of GlusterFS.
+
+  This file is licensed to you under your choice of the GNU Lesser
+  General Public License, version 3 or any later version (LGPLv3 or
+  later), or the GNU General Public License, version 2 (GPLv2), in all
+  cases as published by the Free Software Foundation.
+*/
+
+#include "rpcsvc.h"
+#include <glusterfs/dict.h>
+#include <glusterfs/xlator.h>
+#include "xdr-nfs3.h"
+#include "msg-nfs3.h"
+#include <glusterfs/iobuf.h>
+#include "nfs-common.h"
+#include "nfs-fops.h"
+#include "nfs-mem-types.h"
+#include "rpcsvc.h"
+#include <glusterfs/iatt.h>
+#include "nfs-messages.h"
+
+#include <libgen.h>
+
+/* Return the xlator held by the xlid'th (0-based) node of the list cl, or
+ * NULL when the list has fewer than xlid + 1 nodes.
+ */
+xlator_t *
+nfs_xlid_to_xlator(xlator_list_t *cl, uint8_t xlid)
+{
+    unsigned int skip = xlid;
+
+    /* Count down instead of up: the previous uint8_t up-counter could
+     * never exceed xlid == 255, so that id was never resolved.
+     */
+    while (cl && skip > 0) {
+        cl = cl->next;
+        skip--;
+    }
+    if (!cl)
+        return NULL;
+    return cl->xlator;
+}
+
+xlator_t *
+nfs_path_to_xlator(xlator_list_t *cl, char *path)
+{
+    return NULL; /* stub: both arguments are ignored, no lookup is done */
+}
+
+/* Map xl to its 0-based position in the list cl. Returns 0 when either
+ * argument is NULL, and the list length when xl is not in the list.
+ */
+uint16_t
+nfs_xlator_to_xlid(xlator_list_t *cl, xlator_t *xl)
+{
+    uint16_t pos = 0;
+
+    if (cl == NULL || xl == NULL)
+        return 0;
+
+    /* Stop on the first node that holds xl */
+    for (; cl != NULL && cl->xlator != xl; cl = cl->next)
+        pos++;
+
+    return pos;
+}
+
+/* Find the xlator in the list cl whose name equals the first component
+ * of path; one leading '/' is skipped before the comparison. Returns the
+ * matching xlator, or NULL on bad arguments, allocation failure or when
+ * no name matches.
+ */
+xlator_t *
+nfs_mntpath_to_xlator(xlator_list_t *cl, char *path)
+{
+    char *pathcopy = NULL; /* scratch copy of path, freed before return */
+    char *volname = NULL;  /* volume name inside pathcopy */
+    char *slash = NULL;
+    xlator_t *targetxl = NULL;
+
+    if ((!cl) || (!path))
+        return NULL;
+
+    gf_msg_trace(GF_NFS, 0, "Subvolume search: %s", path);
+
+    pathcopy = gf_strdup(path);
+    if (!pathcopy)
+        return NULL;
+
+    volname = pathcopy;
+    if (*volname == '/')
+        volname++;
+
+    /* Cut the copy at the next separator to isolate the volume name */
+    slash = strchr(volname, '/');
+    if (slash)
+        *slash = '\0';
+
+    for (; cl; cl = cl->next) {
+        gf_msg_trace(GF_NFS, 0, "Volname: %s and cl->xlator->name: %s", volname,
+                     cl->xlator->name);
+
+        if (strcmp(volname, cl->xlator->name) == 0) {
+            targetxl = cl->xlator;
+            break;
+        }
+    }
+
+    GF_FREE(pathcopy);
+
+    return targetxl;
+}
+
+void
+nfs_loc_wipe(loc_t *loc)
+{
+    loc_wipe(loc); /* plain pass-through to loc_wipe () */
+}
+
+/* Copy src into dst by delegating to loc_copy (); its return value is
+ * handed back to the caller unchanged.
+ */
+int
+nfs_loc_copy(loc_t *dst, loc_t *src)
+{
+    /* No NFS-specific handling is layered on top of the generic copy */
+    return loc_copy(dst, src);
+}
+
+/* Fill loc from inode, parent and path, each of which may be NULL: takes a
+ * ref on the inodes and duplicates path. Returns 0 on success, or -EFAULT
+ * when loc is NULL or path cannot be duplicated (loc is then wiped).
+ */
+int
+nfs_loc_fill(loc_t *loc, inode_t *inode, inode_t *parent, char *path)
+{
+    if (!loc)
+        return -EFAULT;
+
+    if (inode) {
+        loc->inode = inode_ref(inode);
+        if (!gf_uuid_is_null(inode->gfid))
+            gf_uuid_copy(loc->gfid, inode->gfid);
+    }
+
+    if (parent)
+        loc->parent = inode_ref(parent);
+    if (!path)
+        return 0;
+
+    loc->path = gf_strdup(path);
+    if (!loc->path) {
+        gf_msg(GF_NFS, GF_LOG_ERROR, ENOMEM, NFS_MSG_NO_MEMORY,
+               "strdup failed");
+        nfs_loc_wipe(loc);
+        return -EFAULT;
+    }
+
+    /* name points just past the last '/' of the copied path, if any */
+    loc->name = strrchr(loc->path, '/');
+    if (loc->name)
+        loc->name++;
+
+    return 0;
+}
+
+int
+nfs_inode_loc_fill(inode_t *inode, loc_t *loc, int how)
+{
+    char *resolvedpath = NULL;
+    inode_t *parent = NULL;
+    int ret = -EFAULT;
+    /* Fill loc for inode. 'how' is not used; -EFAULT on a NULL argument. */
+    if ((!inode) || (!loc))
+        return ret;
+
+    /* If gfid is not null, then the inode is already linked to
+     * the inode table, and not a newly created one. For newly
+     * created inode, inode_path returns null gfid as the path.
+     */
+    if (!gf_uuid_is_null(inode->gfid)) {
+        ret = inode_path(inode, NULL, &resolvedpath);
+        if (ret < 0) {
+            gf_msg(GF_NFS, GF_LOG_ERROR, 0, NFS_MSG_PATH_RESOLVE_FAIL,
+                   "path resolution "
+                   "failed %s",
+                   resolvedpath);
+            goto err;
+        }
+    }
+    /* No resolved path: fall back to a "<gfid:...>" path from loc->gfid */
+    if (resolvedpath == NULL) {
+        char tmp_path[GFID_STR_PFX_LEN + 1] = {
+            0,
+        };
+        snprintf(tmp_path, sizeof(tmp_path), "<gfid:%s>", uuid_utoa(loc->gfid));
+        resolvedpath = gf_strdup(tmp_path); /* NOTE(review): unchecked */
+    } else {
+        parent = inode_parent(inode, loc->pargfid, NULL);
+    }
+
+    ret = nfs_loc_fill(loc, inode, parent, resolvedpath);
+    if (ret < 0) {
+        gf_msg(GF_NFS, GF_LOG_ERROR, -ret, NFS_MSG_LOC_FILL_RESOLVE_FAIL,
+               "loc fill resolution failed %s", resolvedpath);
+        goto err;
+    }
+
+    ret = 0;
+err:
+    if (parent)
+        inode_unref(parent);
+    /* nfs_loc_fill () duplicates the path for loc, so free ours here */
+    GF_FREE(resolvedpath);
+
+    return ret;
+}
+
+/* Fill loc for the inode with the given gfid. If the inode is not in itable
+ * a new one is made when how == NFS_RESOLVE_CREATE, else -ENOENT is returned.
+ */
+int
+nfs_gfid_loc_fill(inode_table_t *itable, uuid_t gfid, loc_t *loc, int how)
+{
+    inode_t *inode = NULL;
+    int ret = -EFAULT;
+
+    if (!loc)
+        return ret;
+
+    inode = inode_find(itable, gfid);
+    if (inode) {
+        gf_msg_trace(GF_NFS, 0, "Inode was found in the itable.");
+    } else {
+        gf_msg_trace(GF_NFS, 0,
+                     "Inode not found in itable, will "
+                     "try to create one.");
+        if (how != NFS_RESOLVE_CREATE) {
+            gf_msg(GF_NFS, GF_LOG_ERROR, ENOENT, NFS_MSG_INODE_NOT_FOUND,
+                   "Inode not found in "
+                   "itable and no creation was requested.");
+            ret = -ENOENT;
+            goto out;
+        }
+        gf_msg_trace(GF_NFS, 0, "Inode needs to be created.");
+        inode = inode_new(itable);
+        if (!inode) {
+            gf_msg(GF_NFS, GF_LOG_ERROR, ENOMEM, NFS_MSG_NO_MEMORY,
+                   "Failed to "
+                   "allocate memory");
+            ret = -ENOMEM;
+            goto out;
+        }
+    }
+
+    gf_uuid_copy(loc->gfid, gfid);
+
+    ret = nfs_inode_loc_fill(inode, loc, how);
+    if (ret < 0) {
+        gf_msg(GF_NFS, GF_LOG_ERROR, -ret, NFS_MSG_INODE_LOC_FILL_ERROR,
+               "Inode loc filling failed.: %s", strerror(-ret));
+    }
+out:
+    /* Release the inode that was looked up or created above */
+    if (inode)
+        inode_unref(inode);
+    return ret;
+}
+
+int
+nfs_root_loc_fill(inode_table_t *itable, loc_t *loc)
+{
+    static uuid_t rootgfid = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1};
+    /* NFS_RESOLVE_EXIST: fail rather than create if it is not in itable */
+    return nfs_gfid_loc_fill(itable, rootgfid, loc, NFS_RESOLVE_EXIST);
+}
+
+/* Build loc for the child 'entry' of 'parent' with entryinode as its inode.
+ * Returns -EFAULT on a NULL argument, a negative inode_path () result, or
+ * otherwise whatever nfs_loc_fill () returns. */
+int
+nfs_parent_inode_loc_fill(inode_t *parent, inode_t *entryinode, char *entry,
+                          loc_t *loc)
+{
+    char *fullpath = NULL;
+    int status = 0;
+
+    if (parent == NULL || entry == NULL || loc == NULL || entryinode == NULL)
+        return -EFAULT;
+    status = inode_path(parent, entry, &fullpath);
+    if (status < 0) {
+        gf_msg(GF_NFS, GF_LOG_ERROR, -status, NFS_MSG_PATH_RESOLVE_FAIL,
+               "path resolution failed %s", fullpath);
+        return status;
+    }
+    status = nfs_loc_fill(loc, entryinode, parent, fullpath);
+    GF_FREE(fullpath);
+    return status;
+}
+
+/* Returns -1 if the parent is not available and -2 if the entry is not
+ * available. When returning -2 with how == NFS_RESOLVE_CREATE, loc is still
+ * filled in so that it can be used to perform a lookup fop for the entry, and
+ * *freshlookup (if given) is set when inode_new () was called for that.
+ * On other errors, including NULL itable/entry/loc, returns -3. 0 on success.
+ */
+int
+nfs_entry_loc_fill(xlator_t *this, inode_table_t *itable, uuid_t pargfid,
+                   char *entry, loc_t *loc, int how, gf_boolean_t *freshlookup)
+{
+    inode_t *parent = NULL;
+    inode_t *entryinode = NULL;
+    int ret = -3;
+    char *resolvedpath = NULL;
+    int pret = -3;
+
+    if ((!itable) || (!entry) || (!loc))
+        return ret;
+
+    parent = inode_find(itable, pargfid);
+    /* ret is staged before each check so a bare 'goto err' reports it */
+    ret = -1;
+    /* Will need hard resolution now */
+    if (!parent || inode_ctx_get(parent, this, NULL))
+        goto err;
+
+    gf_uuid_copy(loc->pargfid, pargfid);
+
+    ret = -2;
+    entryinode = inode_grep(itable, parent, entry);
+    if (!entryinode || inode_ctx_get(entryinode, this, NULL)) {
+        if (how == NFS_RESOLVE_CREATE) {
+            /* Even though we'll create the inode and the loc for
+             * a missing inode, we still need to return -2 so
+             * that the caller can use the filled loc to call
+             * lookup.
+             */
+            if (!entryinode) {
+                entryinode = inode_new(itable);
+                if (freshlookup)
+                    *freshlookup = _gf_true;
+            }
+            /* Cannot change ret because that must
+             * continue to have -2.
+             */
+            pret = nfs_parent_inode_loc_fill(parent, entryinode, entry, loc);
+            /* Only if parent loc fill fails, should we notify error
+             * through ret, otherwise, we still need to force a
+             * lookup by returning -2.
+             */
+            if (pret < 0)
+                ret = -3;
+        }
+        goto err;
+    }
+
+    ret = inode_path(parent, entry, &resolvedpath);
+    if (ret < 0) {
+        gf_msg(GF_NFS, GF_LOG_ERROR, -ret, NFS_MSG_PATH_RESOLVE_FAIL,
+               "path resolution failed %s", resolvedpath);
+        ret = -3;
+        goto err;
+    }
+
+    ret = nfs_loc_fill(loc, entryinode, parent, resolvedpath);
+    if (ret < 0) {
+        gf_msg(GF_NFS, GF_LOG_ERROR, 0, NFS_MSG_INODE_LOC_FILL_ERROR,
+               "loc_fill failed %s", resolvedpath);
+        ret = -3;
+    }
+
+err:
+    if (parent)
+        inode_unref(parent);
+
+    if (entryinode)
+        inode_unref(entryinode);
+
+    GF_FREE(resolvedpath);
+
+    return ret;
+}
+
+/* Fold a gfid into a 32-bit hash; the root gfid always maps to 0x1.
+ *
+ * NOTE(review): only the upper 32 bits of each 64-bit word of the gfid
+ * contribute to the result. An earlier form of this function also XORed
+ * in (word << 32) stored into a uint32_t, which is always 0, so the value
+ * computed here is the same. Confirm whether the lower halves were meant
+ * to be mixed in before changing the hash.
+ */
+uint32_t
+nfs_hash_gfid(uuid_t gfid)
+{
+    uint64_t words[2] = {0, 0};
+    uint32_t upper_first = 0;
+    uint32_t upper_second = 0;
+
+    if (__is_root_gfid(gfid))
+        return 0x1;
+
+    /* Load the 16 gfid bytes as two 64-bit words in host byte order */
+    memcpy(&words[0], &gfid[0], 8);
+    memcpy(&words[1], &gfid[8], 8);
+
+    /* Keep bits 32..63 of each word; the conversion to uint32_t is
+     * exact because the shifted values fit in 32 bits.
+     */
+    upper_first = (uint32_t)(words[0] >> 32);
+    upper_second = (uint32_t)(words[1] >> 32);
+
+    /* XOR is commutative, so the order of the two halves is irrelevant */
+    return upper_first ^ upper_second;
+}
+
+/* Stamp the NFS ctx of inode with the current generation of this xlator,
+ * allocating and attaching a new ctx if the inode has none yet.
+ */
+void
+nfs_fix_generation(xlator_t *this, inode_t *inode)
+{
+    uint64_t raw_ctx = 0;
+    struct nfs_inode_ctx *ictx = NULL;
+    struct nfs_state *priv = NULL;
+
+    if (!inode)
+        return;
+    priv = this->private;
+
+    if (inode_ctx_get(inode, this, &raw_ctx) == 0) {
+        ictx = (struct nfs_inode_ctx *)(uintptr_t)raw_ctx;
+        ictx->generation = priv->generation;
+        return;
+    }
+    ictx = GF_CALLOC(1, sizeof(struct nfs_inode_ctx), gf_nfs_mt_inode_ctx);
+    if (!ictx) {
+        gf_msg(this->name, GF_LOG_ERROR, ENOMEM, NFS_MSG_NO_MEMORY,
+               "could not allocate nfs inode ctx");
+        return;
+    }
+    INIT_LIST_HEAD(&ictx->shares);
+    ictx->generation = priv->generation;
+    if (inode_ctx_put(inode, this, (uint64_t)(uintptr_t)ictx)) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, NFS_MSG_INODE_CTX_STORE_FAIL,
+               "could not store nfs inode ctx");
+        GF_FREE(ictx); /* never attached to the inode: free it here */
+    }
+}
diff --git a/xlators/nfs/server/src/nfs-common.h b/xlators/nfs/server/src/nfs-common.h
new file mode 100644
index 00000000000..bd80d8b3be5
--- /dev/null
+++ b/xlators/nfs/server/src/nfs-common.h
@@ -0,0 +1,73 @@
+/*
+  Copyright (c) 2010-2011 Gluster, Inc. <http://www.gluster.com>
+  This file is part of GlusterFS.
+
+  This file is licensed to you under your choice of the GNU Lesser
+  General Public License, version 3 or any later version (LGPLv3 or
+  later), or the GNU General Public License, version 2 (GPLv2), in all
+  cases as published by the Free Software Foundation.
+*/
+
+#ifndef _NFS_COMMON_H_
+#define _NFS_COMMON_H_
+
+#include <unistd.h>
+
+#include <glusterfs/xlator.h>
+#include "rpcsvc.h"
+#include <glusterfs/iatt.h>
+#include <glusterfs/compat-uuid.h>
+
+// NFS_PATH_MAX is hard-coded to 4096 as a workaround for bug 2476:
+// the nfs server crashes when the path received is longer than PATH_MAX.
+#define NFS_PATH_MAX 4096
+#define NFS_NAME_MAX NAME_MAX
+
+// 0600: read/write permission for the owner only.
+#define NFS_DEFAULT_CREATE_MODE 0600
+
+extern xlator_t *
+nfs_xlid_to_xlator(xlator_list_t *cl, uint8_t xlid);
+
+// NOTE(review): returns uint16_t while nfs_xlid_to_xlator() takes a uint8_t
+// xlid -- confirm the intended width against the definitions.
+extern uint16_t
+nfs_xlator_to_xlid(xlator_list_t *cl, xlator_t *xl);
+
+extern xlator_t *
+nfs_path_to_xlator(xlator_list_t *cl, char *path);
+
+extern xlator_t *
+nfs_mntpath_to_xlator(xlator_list_t *cl, char *path);
+
+extern void
+nfs_loc_wipe(loc_t *loc);
+
+extern int
+nfs_loc_copy(loc_t *dst, loc_t *src);
+
+extern int
+nfs_loc_fill(loc_t *loc, inode_t *inode, inode_t *parent, char *path);
+
+// Presumably the values accepted by the `how` parameter of the *_loc_fill()
+// helpers declared below -- confirm against their definitions.
+#define NFS_RESOLVE_EXIST 1
+#define NFS_RESOLVE_CREATE 2
+
+extern int
+nfs_inode_loc_fill(inode_t *inode, loc_t *loc, int how);
+
+extern int
+nfs_ino_loc_fill(inode_table_t *itable, uuid_t gfid, loc_t *l);
+
+extern int
+nfs_entry_loc_fill(xlator_t *this, inode_table_t *itable, uuid_t pargfid,
+                   char *entry, loc_t *loc, int how, gf_boolean_t *freshlookup);
+
+extern int
+nfs_root_loc_fill(inode_table_t *itable, loc_t *loc);
+
+// Folds a gfid into a 32-bit hash; the root gfid hashes to 1.
+extern uint32_t
+nfs_hash_gfid(uuid_t gfid);
+
+extern int
+nfs_gfid_loc_fill(inode_table_t *itable, uuid_t gfid, loc_t *loc, int how);
+
+// Sets the generation in the inode's NFS ctx to the server's current
+// generation, creating and storing the ctx when the inode has none.
+void
+nfs_fix_generation(xlator_t *this, inode_t *inode);
+#endif
diff --git a/xlators/nfs/server/src/nfs-fops.c b/xlators/nfs/server/src/nfs-fops.c
new file mode 100644
index 00000000000..4d8540c2c3e
--- /dev/null
+++ b/xlators/nfs/server/src/nfs-fops.c
@@ -0,0 +1,1632 @@
+/*
+  Copyright (c) 2010-2011 Gluster, Inc. <http://www.gluster.com>
+  This file is part of GlusterFS.
+
+  This file is licensed to you under your choice of the GNU Lesser
+  General Public License, version 3 or any later version (LGPLv3 or
+  later), or the GNU General Public License, version 2 (GPLv2), in all
+  cases as published by the Free Software Foundation.
+*/
+
+#include <pwd.h>
+
+#include <glusterfs/dict.h>
+#include <glusterfs/xlator.h>
+#include <glusterfs/iobuf.h>
+#include <glusterfs/call-stub.h>
+#include "nfs.h"
+#include "nfs-fops.h"
+#include "nfs-common.h"
+#include "nfs3-helpers.h"
+#include "nfs-mem-types.h"
+#include "nfs-messages.h"
+#include <libgen.h>
+#include <semaphore.h>
+
+/* Counter handed to GF_LOG_OCCASIONALLY for the "too many groups" warning. */
+static int gf_auth_max_groups_nfs_log = 0;
+
+/**
+ * nfs_fix_groups - rewrite the auxiliary groups of a call stack from the
+ * server-side group database.
+ *
+ * @this: NFS xlator; its private nfs_state supplies server_aux_gids and the
+ *        gid cache.
+ * @root: call stack whose uid is resolved and whose groups/ngrps are
+ *        rewritten.
+ *
+ * Does nothing unless priv->server_aux_gids is set. A cached group list for
+ * root->uid is used when one exists; otherwise the uid is resolved with
+ * getpwuid_r() and gf_getgrouplist() and the full result is offered to the
+ * cache. At most max_groups entries are copied into root->groups and
+ * root->ngrps is set to the number of entries actually copied. If the uid
+ * cannot be resolved, root is left untouched.
+ *
+ * NOTE(review): nothing here compares max_groups with the capacity of
+ * root->groups -- confirm the caller's allocation is large enough.
+ */
+void
+nfs_fix_groups(xlator_t *this, call_stack_t *root)
+{
+    struct passwd mypw;
+    char mystrs[1024];
+    struct passwd *result;
+    gid_t *mygroups;
+    int ngroups;
+    int i;
+    int max_groups;
+    struct nfs_state *priv = this->private;
+    const gid_list_t *agl;
+    gid_list_t gl;
+
+    if (!priv->server_aux_gids) {
+        return;
+    }
+
+    /* RPC enforces the GF_AUTH_GLUSTERFS_MAX_GROUPS limit */
+    max_groups = GF_AUTH_GLUSTERFS_MAX_GROUPS(root->lk_owner.len,
+                                              AUTH_GLUSTERFS_v2);
+
+    agl = gid_cache_lookup(&priv->gid_cache, root->uid, 0, 0);
+    if (agl) {
+        if (agl->gl_count > max_groups) {
+            GF_LOG_OCCASIONALLY(gf_auth_max_groups_nfs_log, this->name,
+                                GF_LOG_WARNING,
+                                "too many groups, reducing %d -> %d",
+                                agl->gl_count, max_groups);
+        }
+
+        /* Strictly below max_groups, matching the uncached path below. */
+        for (ngroups = 0; ngroups < agl->gl_count && ngroups < max_groups;
+             ngroups++) {
+            root->groups[ngroups] = agl->gl_list[ngroups];
+        }
+        root->ngrps = ngroups;
+        gid_cache_release(&priv->gid_cache, agl);
+        return;
+    }
+
+    /* No cached list found. */
+    if (getpwuid_r(root->uid, &mypw, mystrs, sizeof(mystrs), &result) != 0) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, NFS_MSG_GETPWUID_FAIL,
+               "getpwuid_r(%u) failed", root->uid);
+        return;
+    }
+
+    if (!result) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, NFS_MSG_GETPWUID_FAIL,
+               "getpwuid_r(%u) found nothing", root->uid);
+        return;
+    }
+
+    gf_msg_trace(this->name, 0, "mapped %u => %s", root->uid, result->pw_name);
+
+    ngroups = gf_getgrouplist(result->pw_name, root->gid, &mygroups);
+    if (ngroups == -1) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, NFS_MSG_MAP_GRP_LIST_FAIL,
+               "could not map %s to group list", result->pw_name);
+        return;
+    }
+
+    /* RPC enforces the GF_AUTH_GLUSTERFS_MAX_GROUPS limit */
+    if (ngroups > max_groups) {
+        GF_LOG_OCCASIONALLY(
+            gf_auth_max_groups_nfs_log, this->name, GF_LOG_WARNING,
+            "too many groups, reducing %d -> %d", ngroups, max_groups);
+    }
+
+    /* Copy data to the frame. */
+    for (i = 0; i < ngroups && i < max_groups; ++i) {
+        gf_msg_trace(this->name, 0, "%s is in group %u", result->pw_name,
+                     mygroups[i]);
+        root->groups[i] = mygroups[i];
+    }
+    /* Advertise only what was copied, not the untruncated count. */
+    root->ngrps = i;
+
+    /* Add the (complete) group data to the cache. */
+    gl.gl_list = mygroups;
+    gl.gl_id = root->uid;
+    gl.gl_uid = 0;
+    gl.gl_gid = 0;
+    gl.gl_count = ngroups;
+    /* Any result other than 1 leaves the list with us, so free it. */
+    if (gid_cache_add(&priv->gid_cache, &gl) != 1)
+        GF_FREE(mygroups);
+}
+
+/**
+ * nfs_fop_local_init - obtain a zero-filled fop-local from the fop mempool.
+ *
+ * @nfsx: NFS xlator whose mempool is used.
+ *
+ * Returns the new fop-local, or NULL when nfsx is NULL or the pool yields
+ * nothing (the latter is logged).
+ */
+struct nfs_fop_local *
+nfs_fop_local_init(xlator_t *nfsx)
+{
+    struct nfs_fop_local *fresh = NULL;
+
+    if (nfsx == NULL)
+        return NULL;
+
+    fresh = mem_get(nfs_fop_mempool(nfsx));
+    if (fresh != NULL) {
+        memset(fresh, 0, sizeof(*fresh));
+        return fresh;
+    }
+
+    gf_msg_nomem(GF_NFS, GF_LOG_ERROR, 4096);
+    return NULL;
+}
+
+/**
+ * nfs_fop_local_wipe - drop the references held by a fop-local and return it
+ * to its pool.
+ *
+ * @nfsx: NFS xlator; only checked for NULL.
+ * @l:    fop-local to release.
+ *
+ * A NULL nfsx or l makes this a no-op.
+ */
+void
+nfs_fop_local_wipe(xlator_t *nfsx, struct nfs_fop_local *l)
+{
+    if (nfsx == NULL || l == NULL)
+        return;
+
+    /* Release every reference the fop-local may have picked up. */
+    if (l->iobref != NULL)
+        iobref_unref(l->iobref);
+
+    if (l->parent != NULL)
+        inode_unref(l->parent);
+
+    if (l->inode != NULL)
+        inode_unref(l->inode);
+
+    if (l->newparent != NULL)
+        inode_unref(l->newparent);
+
+    if (l->dictgfid != NULL)
+        dict_unref(l->dictgfid);
+
+    mem_put(l);
+}
+
+/* Release the fop-local (if any), detach it from the frame and destroy the
+ * frame's call stack. nfl may be NULL: the error paths of the nfs_fop_*
+ * functions can get here before a fop-local was ever set up, so it must not
+ * be dereferenced unconditionally.
+ */
+#define nfs_stack_destroy(nfl, fram)                                           \
+    do {                                                                       \
+        if (nfl)                                                               \
+            nfs_fop_local_wipe((nfl)->nfsx, nfl);                              \
+        (fram)->local = NULL;                                                  \
+        STACK_DESTROY((fram)->root);                                           \
+    } while (0)
+
+/* ctr serialises access to cval, the counter served by nfs_frame_getctr(). */
+pthread_mutex_t ctr = PTHREAD_MUTEX_INITIALIZER;
+unsigned int cval = 1;
+
+/**
+ * nfs_frame_getctr - hand out the next value of the global counter.
+ *
+ * The counter is read and advanced while holding the ctr mutex. A counter
+ * value of 0 is never handed out: it is bumped to 1 first.
+ */
+int
+nfs_frame_getctr()
+{
+    uint64_t next = 0;
+
+    pthread_mutex_lock(&ctr);
+    {
+        /* Skip over 0, e.g. after the counter wrapped around. */
+        if (!cval)
+            cval = 1;
+        next = cval++;
+    }
+    pthread_mutex_unlock(&ctr);
+
+    return next;
+}
+
+/**
+ * nfs_create_frame - create a call frame carrying an NFS user's credentials.
+ *
+ * @xl:  xlator the frame is created on.
+ * @nfu: NFS user (uid, gids, lock owner, identifier) to copy into the frame.
+ *
+ * The primary gid becomes root->gid and the remaining gids become the
+ * auxiliary groups. nfs_fix_groups() is then given the chance to replace the
+ * auxiliary groups.
+ *
+ * Returns the new frame, or NULL when an argument is NULL, nfu carries more
+ * than NFS_NGROUPS groups, or frame/group allocation fails.
+ */
+call_frame_t *
+nfs_create_frame(xlator_t *xl, nfs_user_t *nfu)
+{
+    call_frame_t *frame = NULL;
+    int x = 0;
+    int y = 0;
+
+    if ((!xl) || (!nfu) || (nfu->ngrps > NFS_NGROUPS))
+        return NULL;
+
+    frame = create_frame(xl, (call_pool_t *)xl->ctx->pool);
+    if (!frame)
+        goto err;
+    if (call_stack_alloc_groups(frame->root, nfu->ngrps) != 0) {
+        STACK_DESTROY(frame->root);
+        frame = NULL;
+        goto err;
+    }
+
+    frame->root->pid = NFS_PID;
+    frame->root->uid = nfu->uid;
+    frame->root->gid = nfu->gids[NFS_PRIMGID_IDX];
+    memcpy(&frame->root->identifier, &nfu->identifier, UNIX_PATH_MAX);
+    frame->root->lk_owner = nfu->lk_owner;
+
+    /*
+     * Only a user with more than the primary gid has auxiliary groups to
+     * copy. Testing "> 1" rather than "!= 1" keeps an empty gid list from
+     * producing an auxiliary group count of -1.
+     */
+    if (nfu->ngrps > 1) {
+        frame->root->ngrps = nfu->ngrps - 1;
+
+        gf_msg_trace(GF_NFS, 0, "uid: %d, gid %d, gids: %d", frame->root->uid,
+                     frame->root->gid, frame->root->ngrps);
+        /* x walks nfu->gids past the first entry, y fills root->groups. */
+        for (y = 0, x = 1; y < frame->root->ngrps; x++, y++) {
+            gf_msg_trace(GF_NFS, 0, "gid: %d", nfu->gids[x]);
+            frame->root->groups[y] = nfu->gids[x];
+        }
+    }
+
+    /*
+     * It's tempting to do this *instead* of using nfu above, but we need
+     * to have those values in case nfs_fix_groups doesn't do anything.
+     */
+    nfs_fix_groups(xl, frame->root);
+
+err:
+    return frame;
+}
+
+/* Create a frame for nfuser on xla and store it in fram. If creation fails,
+ * retval is set to -ENOMEM, the failure is logged and control jumps to
+ * errlabel.
+ */
+#define nfs_fop_handle_frame_create(fram, xla, nfuser, retval, errlabel)       \
+    do {                                                                       \
+        fram = nfs_create_frame(xla, (nfuser));                                \
+        if (!fram) {                                                           \
+            retval = (-ENOMEM);                                                \
+            gf_msg(GF_NFS, GF_LOG_ERROR, ENOMEM, NFS_MSG_NO_MEMORY,            \
+                   "Frame creation failed");                                   \
+            goto errlabel;                                                     \
+        }                                                                      \
+    } while (0)
+
+/* Look at the inode and the parent inode of a loc and record in the
+ * fop-local whether either of them is the root: rootinode is set when the
+ * loc's own inode has the root gfid, otherwise rootparentinode is set when
+ * its parent has. The callback uses these flags to decide which stat bufs
+ * get their ino funged to 1.
+ */
+#define nfs_fop_save_root_ino(locl, loc)                                       \
+    do {                                                                       \
+        if (((loc)->inode) && __is_root_gfid((loc)->inode->gfid))              \
+            (locl)->rootinode = 1;                                             \
+        else if (((loc)->parent) && __is_root_gfid((loc)->parent->gfid))       \
+            (locl)->rootparentinode = 1;                                       \
+    } while (0)
+
+/* Do the same for an fd: set rootinode in the fop-local when the fd's inode
+ * has the root gfid. Unlike the loc variant, fdesc->inode is dereferenced
+ * without a NULL check.
+ */
+#define nfs_fop_save_root_fd_ino(locl, fdesc)                                  \
+    do {                                                                       \
+        if (__is_root_gfid((fdesc)->inode->gfid))                              \
+            (locl)->rootinode = 1;                                             \
+    } while (0)
+
+/* Use the state saved by nfs_fop_save_root_ino / nfs_fop_save_root_fd_ino to
+ * funge the ino in the appropriate structures.
+ *
+ * Nothing is touched when fopret is -1. If rootinode is set, ia_ino is forced
+ * to 1 and ia_dev to 0 in preattr and postattr; otherwise, if rootparentinode
+ * is set, the same is done to prepar and postpar. NULL attribute pointers are
+ * skipped.
+ */
+#define nfs_fop_restore_root_ino(locl, fopret, preattr, postattr, prepar,      \
+                                 postpar)                                      \
+    do {                                                                       \
+        if (fopret == -1)                                                      \
+            break;                                                             \
+        if ((locl)->rootinode) {                                               \
+            if ((preattr)) {                                                   \
+                ((struct iatt *)(preattr))->ia_ino = 1;                        \
+                ((struct iatt *)(preattr))->ia_dev = 0;                        \
+            }                                                                  \
+            if ((postattr)) {                                                  \
+                ((struct iatt *)(postattr))->ia_ino = 1;                       \
+                ((struct iatt *)(postattr))->ia_dev = 0;                       \
+            }                                                                  \
+        } else if ((locl)->rootparentinode) {                                  \
+            if ((prepar)) {                                                    \
+                ((struct iatt *)(prepar))->ia_ino = 1;                         \
+                ((struct iatt *)(prepar))->ia_dev = 0;                         \
+            }                                                                  \
+            if ((postpar)) {                                                   \
+                ((struct iatt *)(postpar))->ia_ino = 1;                        \
+                ((struct iatt *)(postpar))->ia_dev = 0;                        \
+            }                                                                  \
+        }                                                                      \
+    } while (0)
+
+/* If the new loc's inode, or failing that its parent, is the root, record
+ * that in the fop-local (newrootinode / newrootparentinode) so the ino in the
+ * corresponding attrs can be funged when they arrive in the callback.
+ */
+#define nfs_fop_newloc_save_root_ino(locl, newloc)                             \
+    do {                                                                       \
+        if (((newloc)->inode) && __is_root_gfid((newloc)->inode->gfid))        \
+            (locl)->newrootinode = 1;                                          \
+        else if (((newloc)->parent) && __is_root_gfid((newloc)->parent->gfid)) \
+            (locl)->newrootparentinode = 1;                                    \
+    } while (0)
+
+/* Counterpart of nfs_fop_newloc_save_root_ino: unless fopret is -1, force
+ * ia_ino to 1 in preattr/postattr when newrootinode is set, or else in
+ * prepar/postpar when newrootparentinode is set. NULL pointers are skipped.
+ *
+ * NOTE(review): unlike nfs_fop_restore_root_ino this leaves ia_dev alone --
+ * confirm whether that difference is intended.
+ */
+#define nfs_fop_newloc_restore_root_ino(locl, fopret, preattr, postattr,       \
+                                        prepar, postpar)                       \
+    do {                                                                       \
+        if (fopret == -1)                                                      \
+            break;                                                             \
+                                                                               \
+        if ((locl)->newrootinode) {                                            \
+            if ((preattr))                                                     \
+                ((struct iatt *)(preattr))->ia_ino = 1;                        \
+            if ((postattr))                                                    \
+                ((struct iatt *)(postattr))->ia_ino = 1;                       \
+        } else if ((locl)->newrootparentinode) {                               \
+            if ((prepar))                                                      \
+                ((struct iatt *)(prepar))->ia_ino = 1;                         \
+            if ((postpar))                                                     \
+                ((struct iatt *)(postpar))->ia_ino = 1;                        \
+        }                                                                      \
+    } while (0)
+
+/**
+ * nfs_gfid_dict - build a dict carrying a "gfid-req" entry for an inode.
+ *
+ * @inode: inode the request is for; dereferenced without a NULL check.
+ *
+ * The entry holds the root gfid when the inode is the root, and a freshly
+ * generated gfid otherwise.
+ *
+ * Returns the new dict, or NULL on allocation or dict failure.
+ */
+dict_t *
+nfs_gfid_dict(inode_t *inode)
+{
+    static uuid_t rootgfid = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1};
+    uuid_t freshgfid = {
+        0,
+    };
+    const unsigned char *chosen = NULL;
+    unsigned char *gfidbuf = NULL;
+    dict_t *req = NULL;
+
+    gfidbuf = GF_MALLOC(sizeof(uuid_t), gf_common_mt_char);
+    if (!gfidbuf)
+        return (NULL);
+
+    gf_uuid_generate(freshgfid);
+
+    /* The root keeps its well-known gfid; everything else gets a new one. */
+    chosen = (gf_uuid_compare(inode->gfid, rootgfid) == 0) ? rootgfid
+                                                           : freshgfid;
+    memcpy(gfidbuf, chosen, sizeof(uuid_t));
+
+    req = dict_new();
+    if (!req) {
+        gf_msg(GF_NFS, GF_LOG_ERROR, errno, NFS_MSG_GFID_DICT_CREATE_FAIL,
+               "Failed to create gfid dict");
+        goto fail;
+    }
+
+    if (dict_set_gfuuid(req, "gfid-req", gfidbuf, false) < 0)
+        goto fail;
+
+    return req;
+
+fail:
+    /* The buffer was not stored in a dict, so it is freed here. */
+    GF_FREE(gfidbuf);
+    if (req)
+        dict_unref(req);
+    return (NULL);
+}
+
+/* When nflcl is non-NULL, attach a gfid request dict for inode to it
+ * (nflcl->dictgfid). If the dict cannot be built, retval is set to -EFAULT
+ * and control jumps to erlbl.
+ */
+#define nfs_fop_gfid_setup(nflcl, inode, retval, erlbl)                        \
+    do {                                                                       \
+        if (nflcl) {                                                           \
+            (nflcl)->dictgfid = nfs_gfid_dict(inode);                          \
+                                                                               \
+            if (!((nflcl)->dictgfid)) {                                        \
+                retval = -EFAULT;                                              \
+                goto erlbl;                                                    \
+            }                                                                  \
+        }                                                                      \
+    } while (0)
+
+/* Fops Layer Explained
+ * The fops layer has three types of functions. They can all be identified by
+ * their names. Two of the patterns are described here:
+ *
+ * nfs_fop_<fopname>
+ * This is the lowest level function that knows nothing about states and
+ * callbacks. At most this is required to create a frame and call the
+ * fop. The idea here is to provide a more convenient way to call fops than
+ * using STACK_WINDs directly. If this type of interface is used, the caller's
+ * callback is responsible for doing the relevant GlusterFS state
+ * maintenance operations on the data returned in the callbacks.
+ *
+ * nfs_<fopname>
+ * Unlike the nfs_fop_<fopname> variety, this is the stateful type of fop, in
+ * that it silently performs all the relevant GlusterFS state maintenance
+ * operations on the data returned to the callbacks, leaving the caller's
+ * callback to just use the data returned for whatever it needs to do with
+ * that data. For example, nfs_lookup will take care of looking up the inodes,
+ * revalidating them if needed and linking new inodes into the table, while
+ * the caller's callback, e.g. the NFSv3 LOOKUP callback, can just use
+ * the stat bufs returned to create a file handle, map the file handle into
+ * the fh cache and finally encode the fh and the stat bufs into an NFS reply.
+ *
+ */
+
+/**
+ * nfs_fop_lookup_cbk - lookup callback of the fops layer.
+ *
+ * On success the inode's NFS generation is fixed up. The root ino is then
+ * restored in buf/postparent as flagged in the fop-local, the result is
+ * relayed to the program callback (if any), and the fop-local and call stack
+ * are destroyed. Always returns 0.
+ */
+int32_t
+nfs_fop_lookup_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+                   int32_t op_ret, int32_t op_errno, inode_t *inode,
+                   struct iatt *buf, dict_t *xattr, struct iatt *postparent)
+{
+    fop_lookup_cbk_t usercbk;
+    struct nfs_fop_local *fopl = NULL;
+
+    if (op_ret == 0)
+        nfs_fix_generation(this, inode);
+
+    nfl_to_prog_data(fopl, usercbk, frame);
+    nfs_fop_restore_root_ino(fopl, op_ret, buf, NULL, NULL, postparent);
+
+    if (usercbk != NULL)
+        usercbk(frame, cookie, this, op_ret, op_errno, inode, buf, xattr,
+                postparent);
+
+    nfs_stack_destroy(fopl, frame);
+    return 0;
+}
+
+/**
+ * nfs_fop_lookup - wind a lookup on loc to xl on behalf of nfu.
+ *
+ * @nfsx:  NFS xlator the frame and fop-local are created on.
+ * @xl:    xlator the lookup is wound to.
+ * @nfu:   NFS user whose credentials go into the frame.
+ * @loc:   location to look up.
+ * @cbk:   program callback, invoked from nfs_fop_lookup_cbk.
+ * @local: program-private data handed to nfs_fop_handle_local_init.
+ *
+ * A gfid request dict built from loc->inode is passed along as xdata.
+ *
+ * Returns 0 once the fop has been wound, -EFAULT on a NULL argument or when
+ * the gfid dict cannot be built, -ENOMEM when the frame cannot be created.
+ */
+int
+nfs_fop_lookup(xlator_t *nfsx, xlator_t *xl, nfs_user_t *nfu, loc_t *loc,
+               fop_lookup_cbk_t cbk, void *local)
+{
+    call_frame_t *frame = NULL;
+    int ret = -EFAULT;
+    struct nfs_fop_local *nfl = NULL;
+
+    /* nfsx is checked too, so a NULL one is not reported as -ENOMEM. */
+    if ((!nfsx) || (!xl) || (!loc) || (!nfu))
+        return ret;
+
+    gf_msg_trace(GF_NFS, 0, "Lookup: %s", loc->path);
+    nfs_fop_handle_frame_create(frame, nfsx, nfu, ret, err);
+    nfs_fop_handle_local_init(frame, nfsx, nfl, cbk, local, ret, err);
+    nfs_fop_save_root_ino(nfl, loc);
+    nfs_fop_gfid_setup(nfl, loc->inode, ret, err);
+
+    STACK_WIND_COOKIE(frame, nfs_fop_lookup_cbk, xl, xl, xl->fops->lookup, loc,
+                      nfl->dictgfid);
+
+    ret = 0;
+err:
+    /* On success the callback destroys the stack; only clean up on error. */
+    if (ret < 0) {
+        if (frame)
+            nfs_stack_destroy(nfl, frame);
+    }
+
+    return ret;
+}
+
+/**
+ * nfs_fop_access_cbk - access callback of the fops layer.
+ *
+ * Relays the result to the program callback (if any), then destroys the
+ * fop-local and call stack. Always returns 0.
+ */
+int32_t
+nfs_fop_access_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+                   int32_t op_ret, int32_t op_errno, dict_t *xdata)
+{
+    fop_access_cbk_t usercbk = NULL;
+    struct nfs_fop_local *fopl = NULL;
+
+    nfl_to_prog_data(fopl, usercbk, frame);
+
+    if (usercbk != NULL)
+        usercbk(frame, cookie, this, op_ret, op_errno, xdata);
+
+    nfs_stack_destroy(fopl, frame);
+    return 0;
+}
+
+/**
+ * nfs_fop_access - wind an access check on loc to xl on behalf of nfu.
+ *
+ * @nfsx:       NFS xlator the frame and fop-local are created on.
+ * @xl:         xlator the fop is wound to.
+ * @nfu:        NFS user whose credentials go into the frame.
+ * @loc:        location to check.
+ * @accesstest: access request, converted with nfs3_request_to_accessbits().
+ * @cbk:        program callback, invoked from nfs_fop_access_cbk.
+ * @local:      program-private data handed to nfs_fop_handle_local_init.
+ *
+ * Returns 0 once the fop has been wound, -EFAULT on a NULL argument, -ENOMEM
+ * when the frame cannot be created.
+ */
+int
+nfs_fop_access(xlator_t *nfsx, xlator_t *xl, nfs_user_t *nfu, loc_t *loc,
+               int32_t accesstest, fop_access_cbk_t cbk, void *local)
+{
+    call_frame_t *frame = NULL;
+    int ret = -EFAULT;
+    struct nfs_fop_local *nfl = NULL;
+    uint32_t accessbits = 0;
+
+    /* nfsx is checked too, so a NULL one is not reported as -ENOMEM. */
+    if ((!nfsx) || (!xl) || (!loc) || (!nfu))
+        return ret;
+
+    gf_msg_trace(GF_NFS, 0, "Access: %s", loc->path);
+    nfs_fop_handle_frame_create(frame, nfsx, nfu, ret, err);
+    nfs_fop_handle_local_init(frame, nfsx, nfl, cbk, local, ret, err);
+    nfs_fop_save_root_ino(nfl, loc);
+
+    accessbits = nfs3_request_to_accessbits(accesstest);
+    STACK_WIND_COOKIE(frame, nfs_fop_access_cbk, xl, xl, xl->fops->access, loc,
+                      accessbits, NULL);
+    ret = 0;
+err:
+    /* On success the callback destroys the stack; only clean up on error. */
+    if (ret < 0) {
+        if (frame)
+            nfs_stack_destroy(nfl, frame);
+    }
+
+    return ret;
+}
+
+/**
+ * nfs_fop_stat_cbk - stat callback of the fops layer.
+ *
+ * Restores the root ino in buf when flagged in the fop-local, relays the
+ * result to the program callback (if any), then destroys the fop-local and
+ * call stack. Always returns 0.
+ */
+int32_t
+nfs_fop_stat_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+                 int32_t op_ret, int32_t op_errno, struct iatt *buf,
+                 dict_t *xdata)
+{
+    fop_stat_cbk_t usercbk = NULL;
+    struct nfs_fop_local *fopl = NULL;
+
+    nfl_to_prog_data(fopl, usercbk, frame);
+    nfs_fop_restore_root_ino(fopl, op_ret, buf, NULL, NULL, NULL);
+
+    if (usercbk != NULL)
+        usercbk(frame, cookie, this, op_ret, op_errno, buf, xdata);
+
+    nfs_stack_destroy(fopl, frame);
+    return 0;
+}
+
+/**
+ * nfs_fop_stat - wind a stat on loc to xl on behalf of nfu.
+ *
+ * @nfsx:  NFS xlator the frame and fop-local are created on.
+ * @xl:    xlator the fop is wound to.
+ * @nfu:   NFS user whose credentials go into the frame.
+ * @loc:   location to stat.
+ * @cbk:   program callback, invoked from nfs_fop_stat_cbk.
+ * @local: program-private data handed to nfs_fop_handle_local_init.
+ *
+ * Returns 0 once the fop has been wound, -EFAULT on a NULL argument, -ENOMEM
+ * when the frame cannot be created.
+ */
+int
+nfs_fop_stat(xlator_t *nfsx, xlator_t *xl, nfs_user_t *nfu, loc_t *loc,
+             fop_stat_cbk_t cbk, void *local)
+{
+    call_frame_t *frame = NULL;
+    int ret = -EFAULT;
+    struct nfs_fop_local *nfl = NULL;
+
+    /* nfsx is checked too, so a NULL one is not reported as -ENOMEM. */
+    if ((!nfsx) || (!xl) || (!loc) || (!nfu))
+        return ret;
+
+    gf_msg_trace(GF_NFS, 0, "Stat: %s", loc->path);
+    nfs_fop_handle_frame_create(frame, nfsx, nfu, ret, err);
+    nfs_fop_handle_local_init(frame, nfsx, nfl, cbk, local, ret, err);
+    nfs_fop_save_root_ino(nfl, loc);
+
+    STACK_WIND_COOKIE(frame, nfs_fop_stat_cbk, xl, xl, xl->fops->stat, loc,
+                      NULL);
+    ret = 0;
+err:
+    /* On success the callback destroys the stack; only clean up on error. */
+    if (ret < 0) {
+        if (frame)
+            nfs_stack_destroy(nfl, frame);
+    }
+
+    return ret;
+}
+
+/**
+ * nfs_fop_fstat_cbk - fstat callback of the fops layer.
+ *
+ * Restores the root ino in buf when flagged in the fop-local, relays the
+ * result to the program callback (if any), then destroys the fop-local and
+ * call stack. Always returns 0.
+ */
+int32_t
+nfs_fop_fstat_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+                  int32_t op_ret, int32_t op_errno, struct iatt *buf,
+                  dict_t *xdata)
+{
+    fop_fstat_cbk_t usercbk = NULL;
+    struct nfs_fop_local *fopl = NULL;
+
+    nfl_to_prog_data(fopl, usercbk, frame);
+    nfs_fop_restore_root_ino(fopl, op_ret, buf, NULL, NULL, NULL);
+
+    if (usercbk != NULL)
+        usercbk(frame, cookie, this, op_ret, op_errno, buf, xdata);
+
+    nfs_stack_destroy(fopl, frame);
+    return 0;
+}
+
+/**
+ * nfs_fop_fstat - wind an fstat on fd to xl on behalf of nfu.
+ *
+ * cbk is invoked from nfs_fop_fstat_cbk; local is handed to
+ * nfs_fop_handle_local_init together with it.
+ *
+ * Returns 0 once the fop has been wound, -EFAULT on a NULL argument, -ENOMEM
+ * when the frame cannot be created.
+ */
+int
+nfs_fop_fstat(xlator_t *nfsx, xlator_t *xl, nfs_user_t *nfu, fd_t *fd,
+              fop_fstat_cbk_t cbk, void *local)
+{
+    struct nfs_fop_local *fopl = NULL;
+    call_frame_t *frame = NULL;
+    int ret = -EFAULT;
+
+    if (!nfsx || !xl || !fd || !nfu)
+        return ret;
+
+    gf_msg_trace(GF_NFS, 0, "FStat");
+    nfs_fop_handle_frame_create(frame, nfsx, nfu, ret, err);
+    nfs_fop_handle_local_init(frame, nfsx, fopl, cbk, local, ret, err);
+    nfs_fop_save_root_fd_ino(fopl, fd);
+
+    STACK_WIND_COOKIE(frame, nfs_fop_fstat_cbk, xl, xl, xl->fops->fstat, fd,
+                      NULL);
+    ret = 0;
+
+err:
+    /* On success the callback destroys the stack; only clean up on error. */
+    if ((ret < 0) && frame)
+        nfs_stack_destroy(fopl, frame);
+
+    return ret;
+}
+
+/**
+ * nfs_fop_opendir_cbk - opendir callback of the fops layer.
+ *
+ * Relays the result to the program callback (if any), then destroys the
+ * fop-local and call stack. Always returns 0.
+ */
+int32_t
+nfs_fop_opendir_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+                    int32_t op_ret, int32_t op_errno, fd_t *fd, dict_t *xdata)
+{
+    fop_opendir_cbk_t usercbk = NULL;
+    struct nfs_fop_local *fopl = NULL;
+
+    nfl_to_prog_data(fopl, usercbk, frame);
+
+    if (usercbk != NULL)
+        usercbk(frame, cookie, this, op_ret, op_errno, fd, xdata);
+
+    nfs_stack_destroy(fopl, frame);
+    return 0;
+}
+
+/**
+ * nfs_fop_opendir - wind an opendir of pathloc into dirfd to xl on behalf of
+ * nfu.
+ *
+ * cbk is invoked from nfs_fop_opendir_cbk; local is handed to
+ * nfs_fop_handle_local_init together with it.
+ *
+ * Returns 0 once the fop has been wound, -EFAULT on a NULL argument, -ENOMEM
+ * when the frame cannot be created.
+ */
+int
+nfs_fop_opendir(xlator_t *nfsx, xlator_t *xl, nfs_user_t *nfu, loc_t *pathloc,
+                fd_t *dirfd, fop_opendir_cbk_t cbk, void *local)
+{
+    struct nfs_fop_local *fopl = NULL;
+    call_frame_t *frame = NULL;
+    int ret = -EFAULT;
+
+    if (!nfsx || !xl || !pathloc || !dirfd || !nfu)
+        return ret;
+
+    gf_msg_trace(GF_NFS, 0, "Opendir: %s", pathloc->path);
+    nfs_fop_handle_frame_create(frame, nfsx, nfu, ret, err);
+    nfs_fop_handle_local_init(frame, nfsx, fopl, cbk, local, ret, err);
+
+    STACK_WIND_COOKIE(frame, nfs_fop_opendir_cbk, xl, xl, xl->fops->opendir,
+                      pathloc, dirfd, NULL);
+    ret = 0;
+
+err:
+    /* On success the callback destroys the stack; only clean up on error. */
+    if ((ret < 0) && frame)
+        nfs_stack_destroy(fopl, frame);
+
+    return ret;
+}
+
+/**
+ * nfs_fop_flush_cbk - flush callback of the fops layer.
+ *
+ * Relays the result to the program callback (if any), then destroys the
+ * fop-local and call stack. Always returns 0.
+ */
+int
+nfs_fop_flush_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+                  int32_t op_ret, int32_t op_errno, dict_t *xdata)
+{
+    fop_flush_cbk_t usercbk = NULL;
+    struct nfs_fop_local *fopl = NULL;
+
+    nfl_to_prog_data(fopl, usercbk, frame);
+
+    if (usercbk != NULL)
+        usercbk(frame, cookie, this, op_ret, op_errno, xdata);
+
+    nfs_stack_destroy(fopl, frame);
+    return 0;
+}
+
+/**
+ * nfs_fop_flush - wind a flush on fd to xl on behalf of nfu.
+ *
+ * cbk is invoked from nfs_fop_flush_cbk; local is handed to
+ * nfs_fop_handle_local_init together with it.
+ *
+ * Returns 0 once the fop has been wound, -EFAULT on a NULL argument, -ENOMEM
+ * when the frame cannot be created.
+ */
+int
+nfs_fop_flush(xlator_t *nfsx, xlator_t *xl, nfs_user_t *nfu, fd_t *fd,
+              fop_flush_cbk_t cbk, void *local)
+{
+    struct nfs_fop_local *fopl = NULL;
+    call_frame_t *frame = NULL;
+    int ret = -EFAULT;
+
+    if (!nfsx || !xl || !fd || !nfu)
+        return ret;
+
+    nfs_fop_handle_frame_create(frame, nfsx, nfu, ret, err);
+    nfs_fop_handle_local_init(frame, nfsx, fopl, cbk, local, ret, err);
+
+    STACK_WIND_COOKIE(frame, nfs_fop_flush_cbk, xl, xl, xl->fops->flush, fd,
+                      NULL);
+    ret = 0;
+
+err:
+    /* On success the callback destroys the stack; only clean up on error. */
+    if ((ret < 0) && frame)
+        nfs_stack_destroy(fopl, frame);
+
+    return ret;
+}
+
+/**
+ * nfs_fop_readdirp_cbk - readdirp callback of the fops layer.
+ *
+ * Relays the entries to the program callback (if any), then destroys the
+ * fop-local and call stack. Always returns 0.
+ */
+int32_t
+nfs_fop_readdirp_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+                     int32_t op_ret, int32_t op_errno, gf_dirent_t *entries,
+                     dict_t *xdata)
+{
+    fop_readdirp_cbk_t usercbk = NULL;
+    struct nfs_fop_local *fopl = NULL;
+
+    nfl_to_prog_data(fopl, usercbk, frame);
+
+    if (usercbk != NULL)
+        usercbk(frame, cookie, this, op_ret, op_errno, entries, xdata);
+
+    nfs_stack_destroy(fopl, frame);
+    return 0;
+}
+
+/**
+ * nfs_fop_readdirp - wind a readdirp on dirfd to xl on behalf of nfu.
+ *
+ * bufsize and offset are passed through to the fop. cbk is invoked from
+ * nfs_fop_readdirp_cbk; local is handed to nfs_fop_handle_local_init together
+ * with it.
+ *
+ * Returns 0 once the fop has been wound, -EFAULT on a NULL argument, -ENOMEM
+ * when the frame cannot be created.
+ */
+int
+nfs_fop_readdirp(xlator_t *nfsx, xlator_t *xl, nfs_user_t *nfu, fd_t *dirfd,
+                 size_t bufsize, off_t offset, fop_readdirp_cbk_t cbk,
+                 void *local)
+{
+    struct nfs_fop_local *fopl = NULL;
+    call_frame_t *frame = NULL;
+    int ret = -EFAULT;
+
+    if (!nfsx || !xl || !dirfd || !nfu)
+        return ret;
+
+    gf_msg_trace(GF_NFS, 0, "readdir");
+    nfs_fop_handle_frame_create(frame, nfsx, nfu, ret, err);
+    nfs_fop_handle_local_init(frame, nfsx, fopl, cbk, local, ret, err);
+
+    /* No xdata is sent with the request. */
+    STACK_WIND_COOKIE(frame, nfs_fop_readdirp_cbk, xl, xl, xl->fops->readdirp,
+                      dirfd, bufsize, offset, NULL);
+    ret = 0;
+
+err:
+    /* On success the callback destroys the stack; only clean up on error. */
+    if ((ret < 0) && frame)
+        nfs_stack_destroy(fopl, frame);
+
+    return ret;
+}
+
+/**
+ * nfs_fop_statfs_cbk - statfs callback of the fops layer.
+ *
+ * Relays the result to the program callback (if any), then destroys the
+ * fop-local and call stack. Always returns 0.
+ */
+int32_t
+nfs_fop_statfs_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+                   int32_t op_ret, int32_t op_errno, struct statvfs *buf,
+                   dict_t *xdata)
+{
+    fop_statfs_cbk_t usercbk = NULL;
+    struct nfs_fop_local *fopl = NULL;
+
+    nfl_to_prog_data(fopl, usercbk, frame);
+
+    if (usercbk != NULL)
+        usercbk(frame, cookie, this, op_ret, op_errno, buf, xdata);
+
+    nfs_stack_destroy(fopl, frame);
+    return 0;
+}
+
+/**
+ * nfs_fop_statfs - wind a statfs on pathloc to xl on behalf of nfu.
+ *
+ * cbk is invoked from nfs_fop_statfs_cbk; local is handed to
+ * nfs_fop_handle_local_init together with it.
+ *
+ * Returns 0 once the fop has been wound, -EFAULT on a NULL argument, -ENOMEM
+ * when the frame cannot be created.
+ */
+int
+nfs_fop_statfs(xlator_t *nfsx, xlator_t *xl, nfs_user_t *nfu, loc_t *pathloc,
+               fop_statfs_cbk_t cbk, void *local)
+{
+    struct nfs_fop_local *fopl = NULL;
+    call_frame_t *frame = NULL;
+    int ret = -EFAULT;
+
+    if (!nfsx || !xl || !pathloc || !nfu)
+        return ret;
+
+    gf_msg_trace(GF_NFS, 0, "Statfs: %s", pathloc->path);
+    nfs_fop_handle_frame_create(frame, nfsx, nfu, ret, err);
+    nfs_fop_handle_local_init(frame, nfsx, fopl, cbk, local, ret, err);
+
+    STACK_WIND_COOKIE(frame, nfs_fop_statfs_cbk, xl, xl, xl->fops->statfs,
+                      pathloc, NULL);
+    ret = 0;
+
+err:
+    /* On success the callback destroys the stack; only clean up on error. */
+    if ((ret < 0) && frame)
+        nfs_stack_destroy(fopl, frame);
+
+    return ret;
+}
+
+/**
+ * nfs_fop_create_cbk - create callback of the fops layer.
+ *
+ * On success the new inode's NFS generation is fixed up. The root ino is then
+ * restored in buf/preparent/postparent as flagged in the fop-local, the
+ * result is relayed to the program callback (if any), and the fop-local and
+ * call stack are destroyed. Always returns 0.
+ *
+ * NOTE(review): the program callback is given NULL instead of xdata, unlike
+ * the mkdir and symlink callbacks -- confirm whether that is intended.
+ */
+int32_t
+nfs_fop_create_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+                   int32_t op_ret, int32_t op_errno, fd_t *fd, inode_t *inode,
+                   struct iatt *buf, struct iatt *preparent,
+                   struct iatt *postparent, dict_t *xdata)
+{
+    fop_create_cbk_t usercbk = NULL;
+    struct nfs_fop_local *fopl = NULL;
+
+    if (op_ret == 0)
+        nfs_fix_generation(this, inode);
+
+    nfl_to_prog_data(fopl, usercbk, frame);
+    nfs_fop_restore_root_ino(fopl, op_ret, buf, NULL, preparent, postparent);
+
+    if (usercbk != NULL)
+        usercbk(frame, cookie, this, op_ret, op_errno, fd, inode, buf,
+                preparent, postparent, NULL);
+
+    nfs_stack_destroy(fopl, frame);
+    return 0;
+}
+
+/**
+ * nfs_fop_create - wind a create of pathloc to xl on behalf of nfu.
+ *
+ * flags, mode and fd are passed through to the fop; a gfid request dict built
+ * from pathloc->inode is sent as xdata. cbk is invoked from
+ * nfs_fop_create_cbk; local is handed to nfs_fop_handle_local_init together
+ * with it.
+ *
+ * Returns 0 once the fop has been wound, -EFAULT on a NULL argument or when
+ * the gfid dict cannot be built, -ENOMEM when the frame cannot be created.
+ */
+int
+nfs_fop_create(xlator_t *nfsx, xlator_t *xl, nfs_user_t *nfu, loc_t *pathloc,
+               int flags, mode_t mode, fd_t *fd, fop_create_cbk_t cbk,
+               void *local)
+{
+    struct nfs_fop_local *fopl = NULL;
+    call_frame_t *frame = NULL;
+    int ret = -EFAULT;
+
+    if (!nfsx || !xl || !pathloc || !nfu)
+        return ret;
+
+    gf_msg_trace(GF_NFS, 0, "Create: %s", pathloc->path);
+    nfs_fop_handle_frame_create(frame, nfsx, nfu, ret, err);
+    nfs_fop_handle_local_init(frame, nfsx, fopl, cbk, local, ret, err);
+    nfs_fop_save_root_ino(fopl, pathloc);
+    nfs_fop_gfid_setup(fopl, pathloc->inode, ret, err);
+
+    STACK_WIND_COOKIE(frame, nfs_fop_create_cbk, xl, xl, xl->fops->create,
+                      pathloc, flags, mode, 0, fd, fopl->dictgfid);
+    ret = 0;
+
+err:
+    /* On success the callback destroys the stack; only clean up on error. */
+    if ((ret < 0) && frame)
+        nfs_stack_destroy(fopl, frame);
+
+    return ret;
+}
+
+/**
+ * nfs_fop_setattr_cbk - setattr callback of the fops layer.
+ *
+ * Restores the root ino in pre/post when flagged in the fop-local, relays the
+ * result to the program callback (if any), then destroys the fop-local and
+ * call stack. Always returns 0.
+ */
+int32_t
+nfs_fop_setattr_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+                    int32_t op_ret, int32_t op_errno, struct iatt *pre,
+                    struct iatt *post, dict_t *xdata)
+{
+    fop_setattr_cbk_t usercbk = NULL;
+    struct nfs_fop_local *fopl = NULL;
+
+    nfl_to_prog_data(fopl, usercbk, frame);
+    nfs_fop_restore_root_ino(fopl, op_ret, pre, post, NULL, NULL);
+
+    if (usercbk != NULL)
+        usercbk(frame, cookie, this, op_ret, op_errno, pre, post, xdata);
+
+    nfs_stack_destroy(fopl, frame);
+    return 0;
+}
+
+/**
+ * nfs_fop_setattr - wind a setattr on pathloc to xl on behalf of nfu.
+ *
+ * buf and valid are passed through to the fop. cbk is invoked from
+ * nfs_fop_setattr_cbk; local is handed to nfs_fop_handle_local_init together
+ * with it.
+ *
+ * Returns 0 once the fop has been wound, -EFAULT on a NULL argument, -ENOMEM
+ * when the frame cannot be created.
+ */
+int
+nfs_fop_setattr(xlator_t *nfsx, xlator_t *xl, nfs_user_t *nfu, loc_t *pathloc,
+                struct iatt *buf, int32_t valid, fop_setattr_cbk_t cbk,
+                void *local)
+{
+    struct nfs_fop_local *fopl = NULL;
+    call_frame_t *frame = NULL;
+    int ret = -EFAULT;
+
+    if (!nfsx || !xl || !pathloc || !nfu)
+        return ret;
+
+    gf_msg_trace(GF_NFS, 0, "Setattr: %s", pathloc->path);
+    nfs_fop_handle_frame_create(frame, nfsx, nfu, ret, err);
+    nfs_fop_handle_local_init(frame, nfsx, fopl, cbk, local, ret, err);
+    nfs_fop_save_root_ino(fopl, pathloc);
+
+    STACK_WIND_COOKIE(frame, nfs_fop_setattr_cbk, xl, xl, xl->fops->setattr,
+                      pathloc, buf, valid, NULL);
+    ret = 0;
+
+err:
+    /* On success the callback destroys the stack; only clean up on error. */
+    if ((ret < 0) && frame)
+        nfs_stack_destroy(fopl, frame);
+
+    return ret;
+}
+
+/**
+ * nfs_fop_mkdir_cbk - mkdir callback of the fops layer.
+ *
+ * On success the new inode's NFS generation is fixed up. The root ino is then
+ * restored in buf/preparent/postparent as flagged in the fop-local, the
+ * result is relayed to the program callback (if any), and the fop-local and
+ * call stack are destroyed. Always returns 0.
+ */
+int32_t
+nfs_fop_mkdir_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+                  int32_t op_ret, int32_t op_errno, inode_t *inode,
+                  struct iatt *buf, struct iatt *preparent,
+                  struct iatt *postparent, dict_t *xdata)
+{
+    fop_mkdir_cbk_t usercbk = NULL;
+    struct nfs_fop_local *fopl = NULL;
+
+    if (op_ret == 0)
+        nfs_fix_generation(this, inode);
+
+    nfl_to_prog_data(fopl, usercbk, frame);
+    nfs_fop_restore_root_ino(fopl, op_ret, buf, NULL, preparent, postparent);
+
+    if (usercbk != NULL)
+        usercbk(frame, cookie, this, op_ret, op_errno, inode, buf, preparent,
+                postparent, xdata);
+
+    nfs_stack_destroy(fopl, frame);
+    return 0;
+}
+
+/**
+ * nfs_fop_mkdir - wind a mkdir of pathloc to xl on behalf of nfu.
+ *
+ * mode is passed through to the fop; a gfid request dict built from
+ * pathloc->inode is sent as xdata. cbk is invoked from nfs_fop_mkdir_cbk;
+ * local is handed to nfs_fop_handle_local_init together with it.
+ *
+ * Returns 0 once the fop has been wound, -EFAULT on a NULL argument or when
+ * the gfid dict cannot be built, -ENOMEM when the frame cannot be created.
+ */
+int
+nfs_fop_mkdir(xlator_t *nfsx, xlator_t *xl, nfs_user_t *nfu, loc_t *pathloc,
+              mode_t mode, fop_mkdir_cbk_t cbk, void *local)
+{
+    struct nfs_fop_local *fopl = NULL;
+    call_frame_t *frame = NULL;
+    int ret = -EFAULT;
+
+    if (!nfsx || !xl || !pathloc || !nfu)
+        return ret;
+
+    gf_msg_trace(GF_NFS, 0, "Mkdir: %s", pathloc->path);
+    nfs_fop_handle_frame_create(frame, nfsx, nfu, ret, err);
+    nfs_fop_handle_local_init(frame, nfsx, fopl, cbk, local, ret, err);
+    nfs_fop_save_root_ino(fopl, pathloc);
+    nfs_fop_gfid_setup(fopl, pathloc->inode, ret, err);
+
+    STACK_WIND_COOKIE(frame, nfs_fop_mkdir_cbk, xl, xl, xl->fops->mkdir,
+                      pathloc, mode, 0, fopl->dictgfid);
+    ret = 0;
+
+err:
+    /* On success the callback destroys the stack; only clean up on error. */
+    if ((ret < 0) && frame)
+        nfs_stack_destroy(fopl, frame);
+
+    return ret;
+}
+
+/* Fops-layer completion callback for symlink (wound by nfs_fop_symlink).
+ *
+ * On success (op_ret == 0) the new @inode is passed to nfs_fix_generation().
+ * The caller's local and callback are recovered with nfl_to_prog_data(),
+ * nfs_fop_restore_root_ino() is applied to @buf and the parent iatts, and the
+ * caller's callback (if any) is invoked with the fop results. The fops-layer
+ * local and the frame are released through nfs_stack_destroy().
+ * Always returns 0.
+ */
+int32_t
+nfs_fop_symlink_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+                    int32_t op_ret, int32_t op_errno, inode_t *inode,
+                    struct iatt *buf, struct iatt *preparent,
+                    struct iatt *postparent, dict_t *xdata)
+{
+    struct nfs_fop_local *nfl = NULL;
+    fop_symlink_cbk_t progcbk = NULL;
+
+    if (op_ret == 0) {
+        nfs_fix_generation(this, inode);
+    }
+
+    nfl_to_prog_data(nfl, progcbk, frame);
+    nfs_fop_restore_root_ino(nfl, op_ret, buf, NULL, preparent, postparent);
+    if (progcbk)
+        progcbk(frame, cookie, this, op_ret, op_errno, inode, buf, preparent,
+                postparent, xdata);
+    nfs_stack_destroy(nfl, frame);
+    return 0;
+}
+
+/* Wind a symlink fop creating @pathloc with link contents @target on
+ * subvolume @xl on behalf of @nfu.
+ *
+ * The umask argument of the fop is passed as 0 and nfl->dictgfid, prepared by
+ * nfs_fop_gfid_setup() from @pathloc->inode, is sent as xdata. @cbk and
+ * @local are handed back to the caller by nfs_fop_symlink_cbk.
+ *
+ * Returns 0 once the fop has been wound, -EFAULT if @nfsx, @xl, @pathloc,
+ * @target or @nfu is NULL, or the negative value set by a failing setup
+ * macro.
+ */
+int
+nfs_fop_symlink(xlator_t *nfsx, xlator_t *xl, nfs_user_t *nfu, char *target,
+                loc_t *pathloc, fop_symlink_cbk_t cbk, void *local)
+{
+    call_frame_t *frame = NULL;
+    int ret = -EFAULT;
+    struct nfs_fop_local *nfl = NULL;
+
+    if ((!nfsx) || (!xl) || (!pathloc) || (!target) || (!nfu))
+        return ret;
+
+    gf_msg_trace(GF_NFS, 0, "Symlink: %s", pathloc->path);
+    /* Each setup macro taking (ret, err) jumps to err on failure. */
+    nfs_fop_handle_frame_create(frame, nfsx, nfu, ret, err);
+    nfs_fop_handle_local_init(frame, nfsx, nfl, cbk, local, ret, err);
+    nfs_fop_save_root_ino(nfl, pathloc);
+    nfs_fop_gfid_setup(nfl, pathloc->inode, ret, err);
+
+    STACK_WIND_COOKIE(frame, nfs_fop_symlink_cbk, xl, xl, xl->fops->symlink,
+                      target, pathloc, 0, nfl->dictgfid);
+    ret = 0;
+err:
+    /* ret < 0 here means setup failed before the wind; drop the frame. */
+    if (ret < 0) {
+        if (frame)
+            nfs_stack_destroy(nfl, frame);
+    }
+
+    return ret;
+}
+
+/* Fops-layer completion callback for readlink (wound by nfs_fop_readlink).
+ *
+ * Recovers the caller's local and callback with nfl_to_prog_data(), applies
+ * nfs_fop_restore_root_ino() to @buf, invokes the caller's callback (if any)
+ * with the fop results and releases the fops-layer local and the frame
+ * through nfs_stack_destroy(). Always returns 0.
+ */
+int32_t
+nfs_fop_readlink_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+                     int32_t op_ret, int32_t op_errno, const char *path,
+                     struct iatt *buf, dict_t *xdata)
+{
+    struct nfs_fop_local *nfl = NULL;
+    fop_readlink_cbk_t progcbk = NULL;
+
+    nfl_to_prog_data(nfl, progcbk, frame);
+    nfs_fop_restore_root_ino(nfl, op_ret, buf, NULL, NULL, NULL);
+    if (progcbk)
+        progcbk(frame, cookie, this, op_ret, op_errno, path, buf, xdata);
+    nfs_stack_destroy(nfl, frame);
+    return 0;
+}
+
+/* Wind a readlink for @pathloc to subvolume @xl on behalf of @nfu.
+ *
+ * @size is forwarded to the readlink fop unchanged; no xdata is sent. @cbk
+ * and @local are handed back to the caller by nfs_fop_readlink_cbk.
+ *
+ * Returns 0 once the fop has been wound, -EFAULT if @nfsx, @xl, @pathloc or
+ * @nfu is NULL, or the negative value set by a failing setup macro.
+ */
+int
+nfs_fop_readlink(xlator_t *nfsx, xlator_t *xl, nfs_user_t *nfu, loc_t *pathloc,
+                 size_t size, fop_readlink_cbk_t cbk, void *local)
+{
+    call_frame_t *frame = NULL;
+    int ret = -EFAULT;
+    struct nfs_fop_local *nfl = NULL;
+
+    if ((!nfsx) || (!xl) || (!pathloc) || (!nfu))
+        return ret;
+
+    gf_msg_trace(GF_NFS, 0, "Readlink: %s", pathloc->path);
+    /* Both setup macros set ret and jump to err on failure. */
+    nfs_fop_handle_frame_create(frame, nfsx, nfu, ret, err);
+    nfs_fop_handle_local_init(frame, nfsx, nfl, cbk, local, ret, err);
+    nfs_fop_save_root_ino(nfl, pathloc);
+
+    STACK_WIND_COOKIE(frame, nfs_fop_readlink_cbk, xl, xl, xl->fops->readlink,
+                      pathloc, size, NULL);
+    ret = 0;
+err:
+    /* ret < 0 here means setup failed before the wind; drop the frame. */
+    if (ret < 0) {
+        if (frame)
+            nfs_stack_destroy(nfl, frame);
+    }
+
+    return ret;
+}
+
+/* Fops-layer completion callback for mknod (wound by nfs_fop_mknod).
+ *
+ * On success (op_ret == 0) the new @inode is passed to nfs_fix_generation().
+ * The caller's local and callback are recovered with nfl_to_prog_data(),
+ * nfs_fop_restore_root_ino() is applied to @buf and the parent iatts, and the
+ * caller's callback (if any) is invoked with the fop results. The fops-layer
+ * local and the frame are released through nfs_stack_destroy().
+ * Always returns 0.
+ */
+int32_t
+nfs_fop_mknod_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+                  int32_t op_ret, int32_t op_errno, inode_t *inode,
+                  struct iatt *buf, struct iatt *preparent,
+                  struct iatt *postparent, dict_t *xdata)
+{
+    struct nfs_fop_local *nfl = NULL;
+    fop_mknod_cbk_t progcbk = NULL;
+
+    if (op_ret == 0) {
+        nfs_fix_generation(this, inode);
+    }
+
+    nfl_to_prog_data(nfl, progcbk, frame);
+    nfs_fop_restore_root_ino(nfl, op_ret, buf, NULL, preparent, postparent);
+    if (progcbk)
+        progcbk(frame, cookie, this, op_ret, op_errno, inode, buf, preparent,
+                postparent, xdata);
+    nfs_stack_destroy(nfl, frame);
+    return 0;
+}
+
+/* Wind a mknod of @pathloc with @mode and device number @dev to subvolume
+ * @xl on behalf of @nfu.
+ *
+ * The umask argument of the fop is passed as 0 and nfl->dictgfid, prepared by
+ * nfs_fop_gfid_setup() from @pathloc->inode, is sent as xdata. @cbk and
+ * @local are handed back to the caller by nfs_fop_mknod_cbk.
+ *
+ * Returns 0 once the fop has been wound, -EFAULT if @nfsx, @xl, @pathloc or
+ * @nfu is NULL, or the negative value set by a failing setup macro.
+ */
+int
+nfs_fop_mknod(xlator_t *nfsx, xlator_t *xl, nfs_user_t *nfu, loc_t *pathloc,
+              mode_t mode, dev_t dev, fop_mknod_cbk_t cbk, void *local)
+{
+    call_frame_t *frame = NULL;
+    int ret = -EFAULT;
+    struct nfs_fop_local *nfl = NULL;
+
+    if ((!nfsx) || (!xl) || (!pathloc) || (!nfu))
+        return ret;
+
+    gf_msg_trace(GF_NFS, 0, "Mknod: %s", pathloc->path);
+    /* Each setup macro taking (ret, err) jumps to err on failure. */
+    nfs_fop_handle_frame_create(frame, nfsx, nfu, ret, err);
+    nfs_fop_handle_local_init(frame, nfsx, nfl, cbk, local, ret, err);
+    nfs_fop_save_root_ino(nfl, pathloc);
+    nfs_fop_gfid_setup(nfl, pathloc->inode, ret, err);
+
+    STACK_WIND_COOKIE(frame, nfs_fop_mknod_cbk, xl, xl, xl->fops->mknod,
+                      pathloc, mode, dev, 0, nfl->dictgfid);
+    ret = 0;
+err:
+    /* ret < 0 here means setup failed before the wind; drop the frame. */
+    if (ret < 0) {
+        if (frame)
+            nfs_stack_destroy(nfl, frame);
+    }
+
+    return ret;
+}
+
+/* Fops-layer completion callback for rmdir (wound by nfs_fop_rmdir).
+ *
+ * Recovers the caller's local and callback with nfl_to_prog_data(), applies
+ * nfs_fop_restore_root_ino() to the parent iatts and invokes the caller's
+ * callback (if any) with the fop results, including the @xdata received
+ * from the subvolume, as the other callbacks in this file do. The fops-layer
+ * local and the frame are released through nfs_stack_destroy().
+ * Always returns 0.
+ */
+int32_t
+nfs_fop_rmdir_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+                  int32_t op_ret, int32_t op_errno, struct iatt *preparent,
+                  struct iatt *postparent, dict_t *xdata)
+{
+    /* nfl_to_prog_data() assigns nfl from frame->local itself. */
+    struct nfs_fop_local *nfl = NULL;
+    fop_rmdir_cbk_t progcbk = NULL;
+
+    nfl_to_prog_data(nfl, progcbk, frame);
+    nfs_fop_restore_root_ino(nfl, op_ret, NULL, NULL, preparent, postparent);
+    if (progcbk)
+        progcbk(frame, cookie, this, op_ret, op_errno, preparent, postparent,
+                xdata);
+    nfs_stack_destroy(nfl, frame);
+    return 0;
+}
+
+/* Wind an rmdir for @pathloc to subvolume @xl on behalf of @nfu.
+ *
+ * The flags argument of the fop is passed as 0 and no xdata is sent. @cbk and
+ * @local are handed back to the caller by nfs_fop_rmdir_cbk.
+ *
+ * Returns 0 once the fop has been wound, -EFAULT if @nfsx, @xl, @pathloc or
+ * @nfu is NULL, or the negative value set by a failing setup macro.
+ */
+int
+nfs_fop_rmdir(xlator_t *nfsx, xlator_t *xl, nfs_user_t *nfu, loc_t *pathloc,
+              fop_rmdir_cbk_t cbk, void *local)
+{
+    call_frame_t *frame = NULL;
+    int ret = -EFAULT;
+    struct nfs_fop_local *nfl = NULL;
+
+    if ((!nfsx) || (!xl) || (!pathloc) || (!nfu))
+        return ret;
+
+    gf_msg_trace(GF_NFS, 0, "Rmdir: %s", pathloc->path);
+    /* Both setup macros set ret and jump to err on failure. */
+    nfs_fop_handle_frame_create(frame, nfsx, nfu, ret, err);
+    nfs_fop_handle_local_init(frame, nfsx, nfl, cbk, local, ret, err);
+    nfs_fop_save_root_ino(nfl, pathloc);
+
+    STACK_WIND_COOKIE(frame, nfs_fop_rmdir_cbk, xl, xl, xl->fops->rmdir,
+                      pathloc, 0, NULL);
+    ret = 0;
+err:
+    /* ret < 0 here means setup failed before the wind; drop the frame. */
+    if (ret < 0) {
+        if (frame)
+            nfs_stack_destroy(nfl, frame);
+    }
+
+    return ret;
+}
+
+/* Fops-layer completion callback for unlink (wound by nfs_fop_unlink).
+ *
+ * Recovers the caller's local and callback with nfl_to_prog_data(), applies
+ * nfs_fop_restore_root_ino() to the parent iatts, invokes the caller's
+ * callback (if any) with the fop results and releases the fops-layer local
+ * and the frame through nfs_stack_destroy(). Always returns 0.
+ */
+int32_t
+nfs_fop_unlink_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+                   int32_t op_ret, int32_t op_errno, struct iatt *preparent,
+                   struct iatt *postparent, dict_t *xdata)
+{
+    /* nfl_to_prog_data() assigns nfl from frame->local itself, so it is
+     * initialised to NULL here like in the other callbacks. */
+    struct nfs_fop_local *nfl = NULL;
+    fop_unlink_cbk_t progcbk = NULL;
+
+    nfl_to_prog_data(nfl, progcbk, frame);
+    nfs_fop_restore_root_ino(nfl, op_ret, NULL, NULL, preparent, postparent);
+    if (progcbk)
+        progcbk(frame, cookie, this, op_ret, op_errno, preparent, postparent,
+                xdata);
+    nfs_stack_destroy(nfl, frame);
+    return 0;
+}
+
+/* Wind an unlink for @pathloc to subvolume @xl on behalf of @nfu.
+ *
+ * The flags argument of the fop is passed as 0 and no xdata is sent. @cbk and
+ * @local are handed back to the caller by nfs_fop_unlink_cbk.
+ *
+ * Returns 0 once the fop has been wound, -EFAULT if @nfsx, @xl, @pathloc or
+ * @nfu is NULL, or the negative value set by a failing setup macro.
+ */
+int
+nfs_fop_unlink(xlator_t *nfsx, xlator_t *xl, nfs_user_t *nfu, loc_t *pathloc,
+               fop_unlink_cbk_t cbk, void *local)
+{
+    call_frame_t *frame = NULL;
+    int ret = -EFAULT;
+    struct nfs_fop_local *nfl = NULL;
+
+    if ((!nfsx) || (!xl) || (!pathloc) || (!nfu))
+        return ret;
+
+    gf_msg_trace(GF_NFS, 0, "Unlink: %s", pathloc->path);
+    /* Both setup macros set ret and jump to err on failure. */
+    nfs_fop_handle_frame_create(frame, nfsx, nfu, ret, err);
+    nfs_fop_handle_local_init(frame, nfsx, nfl, cbk, local, ret, err);
+    nfs_fop_save_root_ino(nfl, pathloc);
+
+    STACK_WIND_COOKIE(frame, nfs_fop_unlink_cbk, xl, xl, xl->fops->unlink,
+                      pathloc, 0, NULL);
+    ret = 0;
+err:
+    /* ret < 0 here means setup failed before the wind; drop the frame. */
+    if (ret < 0) {
+        if (frame)
+            nfs_stack_destroy(nfl, frame);
+    }
+
+    return ret;
+}
+
+/* Fops-layer completion callback for link (wound by nfs_fop_link).
+ *
+ * On success (op_ret == 0) @inode is passed to nfs_fix_generation(). The
+ * caller's local and callback are recovered with nfl_to_prog_data(),
+ * nfs_fop_restore_root_ino() is applied to @buf and the parent iatts, and the
+ * caller's callback (if any) is invoked with the fop results. The fops-layer
+ * local and the frame are released through nfs_stack_destroy().
+ * Always returns 0.
+ */
+int32_t
+nfs_fop_link_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+                 int32_t op_ret, int32_t op_errno, inode_t *inode,
+                 struct iatt *buf, struct iatt *preparent,
+                 struct iatt *postparent, dict_t *xdata)
+{
+    struct nfs_fop_local *nfl = NULL;
+    fop_link_cbk_t progcbk = NULL;
+
+    if (op_ret == 0) {
+        nfs_fix_generation(this, inode);
+    }
+
+    nfl_to_prog_data(nfl, progcbk, frame);
+    nfs_fop_restore_root_ino(nfl, op_ret, buf, NULL, preparent, postparent);
+    if (progcbk)
+        progcbk(frame, cookie, this, op_ret, op_errno, inode, buf, preparent,
+                postparent, xdata);
+
+    nfs_stack_destroy(nfl, frame);
+    return 0;
+}
+
+/* Wind a link fop creating @newloc as a hard link to @oldloc on subvolume
+ * @xl on behalf of @nfu.
+ *
+ * The root-inode bookkeeping (nfs_fop_save_root_ino) is done for @newloc.
+ * No xdata is sent. @cbk and @local are handed back to the caller by
+ * nfs_fop_link_cbk.
+ *
+ * Returns 0 once the fop has been wound, -EFAULT if @nfsx, @xl, @oldloc,
+ * @newloc or @nfu is NULL, or the negative value set by a failing setup
+ * macro.
+ */
+int
+nfs_fop_link(xlator_t *nfsx, xlator_t *xl, nfs_user_t *nfu, loc_t *oldloc,
+             loc_t *newloc, fop_link_cbk_t cbk, void *local)
+{
+    call_frame_t *frame = NULL;
+    int ret = -EFAULT;
+    struct nfs_fop_local *nfl = NULL;
+
+    if ((!nfsx) || (!xl) || (!oldloc) || (!newloc) || (!nfu))
+        return ret;
+
+    gf_msg_trace(GF_NFS, 0, "Link: %s -> %s", newloc->path, oldloc->path);
+    /* Both setup macros set ret and jump to err on failure. */
+    nfs_fop_handle_frame_create(frame, nfsx, nfu, ret, err);
+    nfs_fop_handle_local_init(frame, nfsx, nfl, cbk, local, ret, err);
+    nfs_fop_save_root_ino(nfl, newloc);
+
+    STACK_WIND_COOKIE(frame, nfs_fop_link_cbk, xl, xl, xl->fops->link, oldloc,
+                      newloc, NULL);
+    ret = 0;
+err:
+    /* ret < 0 here means setup failed before the wind; drop the frame. */
+    if (ret < 0) {
+        if (frame)
+            nfs_stack_destroy(nfl, frame);
+    }
+
+    return ret;
+}
+
+/* Fops-layer completion callback for rename (wound by nfs_fop_rename).
+ *
+ * Recovers the caller's local and callback with nfl_to_prog_data(), then
+ * applies the root-inode fix-up separately to the old parent iatts
+ * (nfs_fop_restore_root_ino) and to @buf plus the new parent iatts
+ * (nfs_fop_newloc_restore_root_ino). The caller's callback (if any) is then
+ * invoked with the fop results, and the fops-layer local and the frame are
+ * released through nfs_stack_destroy(). Always returns 0.
+ */
+int32_t
+nfs_fop_rename_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+                   int32_t op_ret, int32_t op_errno, struct iatt *buf,
+                   struct iatt *preoldparent, struct iatt *postoldparent,
+                   struct iatt *prenewparent, struct iatt *postnewparent,
+                   dict_t *xdata)
+{
+    struct nfs_fop_local *nfl = NULL;
+    fop_rename_cbk_t progcbk = NULL;
+
+    nfl_to_prog_data(nfl, progcbk, frame);
+    /* The preattr arg needs to be NULL instead of @buf because it is
+     * possible that the new parent is not root whereas the source dir
+     * could have been. That is handled in the next macro.
+     */
+    nfs_fop_restore_root_ino(nfl, op_ret, NULL, NULL, preoldparent,
+                             postoldparent);
+    nfs_fop_newloc_restore_root_ino(nfl, op_ret, buf, NULL, prenewparent,
+                                    postnewparent);
+    if (progcbk)
+        progcbk(frame, cookie, this, op_ret, op_errno, buf, preoldparent,
+                postoldparent, prenewparent, postnewparent, xdata);
+    nfs_stack_destroy(nfl, frame);
+    return 0;
+}
+
+/* Wind a rename of @oldloc to @newloc on subvolume @xl on behalf of @nfu.
+ *
+ * Root-inode bookkeeping is recorded for both locations
+ * (nfs_fop_save_root_ino for @oldloc, nfs_fop_newloc_save_root_ino for
+ * @newloc) so that nfs_fop_rename_cbk can fix up both sets of iatts. No xdata
+ * is sent. @cbk and @local are handed back to the caller by
+ * nfs_fop_rename_cbk.
+ *
+ * Returns 0 once the fop has been wound, -EFAULT if @nfsx, @xl, @oldloc,
+ * @newloc or @nfu is NULL, or the negative value set by a failing setup
+ * macro.
+ */
+int
+nfs_fop_rename(xlator_t *nfsx, xlator_t *xl, nfs_user_t *nfu, loc_t *oldloc,
+               loc_t *newloc, fop_rename_cbk_t cbk, void *local)
+{
+    call_frame_t *frame = NULL;
+    int ret = -EFAULT;
+    struct nfs_fop_local *nfl = NULL;
+
+    if ((!nfsx) || (!xl) || (!oldloc) || (!newloc) || (!nfu))
+        return ret;
+
+    gf_msg_trace(GF_NFS, 0, "Rename: %s -> %s", oldloc->path, newloc->path);
+    /* Both setup macros set ret and jump to err on failure. */
+    nfs_fop_handle_frame_create(frame, nfsx, nfu, ret, err);
+    nfs_fop_handle_local_init(frame, nfsx, nfl, cbk, local, ret, err);
+    nfs_fop_save_root_ino(nfl, oldloc);
+    nfs_fop_newloc_save_root_ino(nfl, newloc);
+
+    STACK_WIND_COOKIE(frame, nfs_fop_rename_cbk, xl, xl, xl->fops->rename,
+                      oldloc, newloc, NULL);
+    ret = 0;
+err:
+    /* ret < 0 here means setup failed before the wind; drop the frame. */
+    if (ret < 0) {
+        if (frame)
+            nfs_stack_destroy(nfl, frame);
+    }
+
+    return ret;
+}
+
+/* Fops-layer completion callback for open (wound by nfs_fop_open).
+ *
+ * Recovers the caller's local and callback with nfl_to_prog_data(), invokes
+ * the caller's callback (if any) with the fop results and releases the
+ * fops-layer local and the frame through nfs_stack_destroy(). No iatt fix-up
+ * is needed because open returns none. Always returns 0.
+ */
+int32_t
+nfs_fop_open_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+                 int32_t op_ret, int32_t op_errno, fd_t *fd, dict_t *xdata)
+{
+    struct nfs_fop_local *nfl = NULL;
+    fop_open_cbk_t progcbk = NULL;
+
+    nfl_to_prog_data(nfl, progcbk, frame);
+    if (progcbk)
+        progcbk(frame, cookie, this, op_ret, op_errno, fd, xdata);
+    nfs_stack_destroy(nfl, frame);
+
+    return 0;
+}
+
+/* Wind an open of @loc with @flags on @fd to subvolume @xl on behalf of @nfu.
+ *
+ * No xdata is sent. @cbk and @local are handed back to the caller by
+ * nfs_fop_open_cbk.
+ *
+ * Returns 0 once the fop has been wound, -EFAULT if @nfsx, @xl, @loc, @fd or
+ * @nfu is NULL, or the negative value set by a failing setup macro.
+ */
+int
+nfs_fop_open(xlator_t *nfsx, xlator_t *xl, nfs_user_t *nfu, loc_t *loc,
+             int32_t flags, fd_t *fd, fop_open_cbk_t cbk, void *local)
+{
+    call_frame_t *frame = NULL;
+    int ret = -EFAULT;
+    struct nfs_fop_local *nfl = NULL;
+
+    if ((!nfsx) || (!xl) || (!loc) || (!fd) || (!nfu))
+        return ret;
+
+    gf_msg_trace(GF_NFS, 0, "Open: %s", loc->path);
+    /* Both setup macros set ret and jump to err on failure. */
+    nfs_fop_handle_frame_create(frame, nfsx, nfu, ret, err);
+    nfs_fop_handle_local_init(frame, nfsx, nfl, cbk, local, ret, err);
+
+    STACK_WIND_COOKIE(frame, nfs_fop_open_cbk, xl, xl, xl->fops->open, loc,
+                      flags, fd, NULL);
+    ret = 0;
+err:
+    /* ret < 0 here means setup failed before the wind; drop the frame. */
+    if (ret < 0) {
+        if (frame)
+            nfs_stack_destroy(nfl, frame);
+    }
+
+    return ret;
+}
+
+/* Fops-layer completion callback for writev (wound by nfs_fop_write).
+ *
+ * Recovers the caller's local and callback with nfl_to_prog_data(), applies
+ * nfs_fop_restore_root_ino() to @prebuf and @postbuf, invokes the caller's
+ * callback (if any) with the fop results and releases the fops-layer local
+ * and the frame through nfs_stack_destroy(). Always returns 0.
+ */
+int32_t
+nfs_fop_writev_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+                   int32_t op_ret, int32_t op_errno, struct iatt *prebuf,
+                   struct iatt *postbuf, dict_t *xdata)
+{
+    struct nfs_fop_local *nfl = NULL;
+    fop_writev_cbk_t progcbk = NULL;
+
+    nfl_to_prog_data(nfl, progcbk, frame);
+    nfs_fop_restore_root_ino(nfl, op_ret, prebuf, postbuf, NULL, NULL);
+    if (progcbk)
+        progcbk(frame, cookie, this, op_ret, op_errno, prebuf, postbuf, xdata);
+
+    nfs_stack_destroy(nfl, frame);
+
+    return 0;
+}
+
+/* Wind a writev of @count iovecs in @vector at @offset on @fd to subvolume
+ * @xl on behalf of @nfu.
+ *
+ * @srciobref is forwarded to the fop as the iobref backing @vector. @local
+ * is interpreted as an nfs3_call_state_t: its writetype selects the open
+ * flags sent with the write (UNSTABLE: none, DATA_SYNC: O_DSYNC,
+ * FILE_SYNC: O_SYNC). @cbk and @local are handed back to the caller by
+ * nfs_fop_writev_cbk.
+ *
+ * Returns 0 once the fop has been wound, -EFAULT if @nfsx, @xl, @fd,
+ * @vector, @nfu, @srciobref or @local is NULL, or the negative value set by
+ * a failing setup macro.
+ */
+int
+nfs_fop_write(xlator_t *nfsx, xlator_t *xl, nfs_user_t *nfu, fd_t *fd,
+              struct iobref *srciobref, struct iovec *vector, int32_t count,
+              off_t offset, fop_writev_cbk_t cbk, void *local)
+{
+    call_frame_t *frame = NULL;
+    int ret = -EFAULT;
+    struct nfs_fop_local *nfl = NULL;
+    int flags = 0;
+    nfs3_call_state_t *cs = local;
+
+    /* cs->writetype is read below, so @local is mandatory here and is
+     * validated up front, before any frame has been created. */
+    if ((!nfsx) || (!xl) || (!fd) || (!vector) || (!nfu) || (!srciobref) ||
+        (!cs))
+        return ret;
+
+    /* Both setup macros set ret and jump to err on failure. */
+    nfs_fop_handle_frame_create(frame, nfsx, nfu, ret, err);
+    nfs_fop_handle_local_init(frame, nfsx, nfl, cbk, local, ret, err);
+    nfs_fop_save_root_fd_ino(nfl, fd);
+
+    /* Map the requested write stability onto open(2)-style sync flags. */
+    switch (cs->writetype) {
+        case UNSTABLE:
+            break;
+        case DATA_SYNC:
+            flags |= O_DSYNC;
+            break;
+        case FILE_SYNC:
+            flags |= O_SYNC;
+            break;
+    }
+
+    STACK_WIND_COOKIE(frame, nfs_fop_writev_cbk, xl, xl, xl->fops->writev, fd,
+                      vector, count, offset, flags, srciobref, NULL);
+    ret = 0;
+err:
+    /* ret < 0 here means setup failed before the wind; drop the frame. */
+    if (ret < 0) {
+        if (frame)
+            nfs_stack_destroy(nfl, frame);
+    }
+
+    return ret;
+}
+
+/* Fops-layer completion callback for fsync (wound by nfs_fop_fsync).
+ *
+ * Recovers the caller's local and callback with nfl_to_prog_data(), applies
+ * nfs_fop_restore_root_ino() to @prebuf and @postbuf, invokes the caller's
+ * callback (if any) with the fop results and releases the fops-layer local
+ * and the frame through nfs_stack_destroy(). Always returns 0.
+ */
+int32_t
+nfs_fop_fsync_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+                  int32_t op_ret, int32_t op_errno, struct iatt *prebuf,
+                  struct iatt *postbuf, dict_t *xdata)
+{
+    struct nfs_fop_local *nfl = NULL;
+    fop_fsync_cbk_t progcbk = NULL;
+
+    nfl_to_prog_data(nfl, progcbk, frame);
+    nfs_fop_restore_root_ino(nfl, op_ret, prebuf, postbuf, NULL, NULL);
+    if (progcbk)
+        progcbk(frame, cookie, this, op_ret, op_errno, prebuf, postbuf, xdata);
+
+    nfs_stack_destroy(nfl, frame);
+    return 0;
+}
+
+/* Wind an fsync on @fd to subvolume @xl on behalf of @nfu.
+ *
+ * @datasync is forwarded to the fsync fop unchanged; no xdata is sent. @cbk
+ * and @local are handed back to the caller by nfs_fop_fsync_cbk.
+ *
+ * Returns 0 once the fop has been wound, -EFAULT if @nfsx, @xl, @fd or @nfu
+ * is NULL, or the negative value set by a failing setup macro.
+ */
+int
+nfs_fop_fsync(xlator_t *nfsx, xlator_t *xl, nfs_user_t *nfu, fd_t *fd,
+              int32_t datasync, fop_fsync_cbk_t cbk, void *local)
+{
+    call_frame_t *frame = NULL;
+    int ret = -EFAULT;
+    struct nfs_fop_local *nfl = NULL;
+
+    /* @nfu is handed to frame creation below, so reject NULL here just as
+     * the other fop wrappers in this file do. */
+    if ((!nfsx) || (!xl) || (!fd) || (!nfu))
+        return ret;
+
+    /* Both setup macros set ret and jump to err on failure. */
+    nfs_fop_handle_frame_create(frame, nfsx, nfu, ret, err);
+    nfs_fop_handle_local_init(frame, nfsx, nfl, cbk, local, ret, err);
+    nfs_fop_save_root_fd_ino(nfl, fd);
+
+    STACK_WIND_COOKIE(frame, nfs_fop_fsync_cbk, xl, xl, xl->fops->fsync, fd,
+                      datasync, NULL);
+    ret = 0;
+err:
+    /* ret < 0 here means setup failed before the wind; drop the frame. */
+    if (ret < 0) {
+        if (frame)
+            nfs_stack_destroy(nfl, frame);
+    }
+
+    return ret;
+}
+
+/* Fops-layer completion callback for readv (wound by nfs_fop_read).
+ *
+ * Recovers the caller's local and callback with nfl_to_prog_data(), applies
+ * nfs_fop_restore_root_ino() to @stbuf, invokes the caller's callback (if
+ * any) with the fop results and releases the fops-layer local and the frame
+ * through nfs_stack_destroy(). Always returns 0.
+ */
+int32_t
+nfs_fop_readv_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+                  int32_t op_ret, int32_t op_errno, struct iovec *vector,
+                  int32_t count, struct iatt *stbuf, struct iobref *iobref,
+                  dict_t *xdata)
+{
+    struct nfs_fop_local *nfl = NULL;
+    fop_readv_cbk_t progcbk = NULL;
+
+    nfl_to_prog_data(nfl, progcbk, frame);
+    nfs_fop_restore_root_ino(nfl, op_ret, stbuf, NULL, NULL, NULL);
+    if (progcbk)
+        progcbk(frame, cookie, this, op_ret, op_errno, vector, count, stbuf,
+                iobref, xdata);
+
+    nfs_stack_destroy(nfl, frame);
+    return 0;
+}
+
+/* Wind a readv of @size bytes at @offset on @fd to subvolume @xl on behalf
+ * of @nfu.
+ *
+ * The flags argument of the fop is passed as 0 and no xdata is sent. @cbk and
+ * @local are handed back to the caller by nfs_fop_readv_cbk.
+ *
+ * Returns 0 once the fop has been wound, -EFAULT if @nfsx, @xl, @fd or @nfu
+ * is NULL, or the negative value set by a failing setup macro.
+ */
+int
+nfs_fop_read(xlator_t *nfsx, xlator_t *xl, nfs_user_t *nfu, fd_t *fd,
+             size_t size, off_t offset, fop_readv_cbk_t cbk, void *local)
+{
+    call_frame_t *frame = NULL;
+    int ret = -EFAULT;
+    struct nfs_fop_local *nfl = NULL;
+
+    /* @nfsx is used for frame creation and for allocating the fops-layer
+     * local, so it is validated like in the other fop wrappers. */
+    if ((!nfsx) || (!xl) || (!fd) || (!nfu))
+        return ret;
+
+    /* Both setup macros set ret and jump to err on failure. */
+    nfs_fop_handle_frame_create(frame, nfsx, nfu, ret, err);
+    nfs_fop_handle_local_init(frame, nfsx, nfl, cbk, local, ret, err);
+    nfs_fop_save_root_fd_ino(nfl, fd);
+
+    STACK_WIND_COOKIE(frame, nfs_fop_readv_cbk, xl, xl, xl->fops->readv, fd,
+                      size, offset, 0, NULL);
+    ret = 0;
+err:
+    /* ret < 0 here means setup failed before the wind; drop the frame. */
+    if (ret < 0) {
+        if (frame)
+            nfs_stack_destroy(nfl, frame);
+    }
+
+    return ret;
+}
+
+/* Fops-layer completion callback for lk (wound by nfs_fop_lk).
+ *
+ * After recovering the caller's local and callback with nfl_to_prog_data(),
+ * a successful lock operation (op_ret == 0) is recorded on the fd with
+ * fd_lk_insert_and_merge(), using the fd, cmd and flock that nfs_fop_lk saved
+ * in the fops-layer local. The fd reference taken by nfs_fop_lk is then
+ * dropped, the caller's callback (if any) is invoked with the fop results,
+ * and the fops-layer local and the frame are released through
+ * nfs_stack_destroy(). Always returns 0.
+ */
+int32_t
+nfs_fop_lk_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+               int32_t op_ret, int32_t op_errno, struct gf_flock *flock,
+               dict_t *xdata)
+{
+    struct nfs_fop_local *nfl = NULL;
+    fop_lk_cbk_t progcbk = NULL;
+
+    nfl_to_prog_data(nfl, progcbk, frame);
+
+    /* Uses the flock saved at request time, not the @flock returned here. */
+    if (!op_ret)
+        fd_lk_insert_and_merge(nfl->fd, nfl->cmd, &nfl->flock);
+
+    fd_unref(nfl->fd);
+
+    if (progcbk)
+        progcbk(frame, cookie, this, op_ret, op_errno, flock, xdata);
+
+    nfs_stack_destroy(nfl, frame);
+    return 0;
+}
+
+/* Wind an lk fop with command @cmd and lock description @flock on @fd to
+ * subvolume @xl on behalf of @nfu.
+ *
+ * @cmd, a copy of *@flock and a new reference on @fd are saved in the
+ * fops-layer local so that nfs_fop_lk_cbk can record a granted lock on the fd
+ * and drop the reference. No xdata is sent. @cbk and @local are handed back
+ * to the caller by nfs_fop_lk_cbk.
+ *
+ * Returns 0 once the fop has been wound, -EFAULT if @nfsx, @xl, @fd, @flock
+ * or @nfu is NULL, or the negative value set by a failing setup macro.
+ */
+int
+nfs_fop_lk(xlator_t *nfsx, xlator_t *xl, nfs_user_t *nfu, fd_t *fd, int cmd,
+           struct gf_flock *flock, fop_lk_cbk_t cbk, void *local)
+{
+    call_frame_t *frame = NULL;
+    int ret = -EFAULT;
+    struct nfs_fop_local *nfl = NULL;
+
+    /* @nfsx is used for frame/local setup and *@flock is copied below, so
+     * both are validated together with the other mandatory arguments. */
+    if ((!nfsx) || (!xl) || (!fd) || (!flock) || (!nfu))
+        return ret;
+
+    /* Both setup macros set ret and jump to err on failure. */
+    nfs_fop_handle_frame_create(frame, nfsx, nfu, ret, err);
+    nfs_fop_handle_local_init(frame, nfsx, nfl, cbk, local, ret, err);
+
+    /* Saved for nfs_fop_lk_cbk; the fd reference is dropped there. */
+    nfl->cmd = cmd;
+    nfl->fd = fd_ref(fd);
+    nfl->flock = *flock;
+
+    STACK_WIND_COOKIE(frame, nfs_fop_lk_cbk, xl, xl, xl->fops->lk, fd, cmd,
+                      flock, NULL);
+    ret = 0;
+err:
+    /* ret < 0 here means setup failed before the wind; drop the frame. */
+    if (ret < 0) {
+        if (frame)
+            nfs_stack_destroy(nfl, frame);
+    }
+
+    return ret;
+}
+
+/* Fops-layer completion callback for getxattr (wound by nfs_fop_getxattr).
+ *
+ * Recovers the caller's local and callback with nfl_to_prog_data(), invokes
+ * the caller's callback (if any) with the fop results and releases the
+ * fops-layer local and the frame through nfs_stack_destroy().
+ * Always returns 0.
+ */
+int32_t
+nfs_fop_getxattr_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+                     int32_t op_ret, int32_t op_errno, dict_t *dict,
+                     dict_t *xdata)
+{
+    struct nfs_fop_local *nfl = NULL;
+    fop_getxattr_cbk_t progcbk = NULL;
+
+    nfl_to_prog_data(nfl, progcbk, frame);
+
+    if (progcbk)
+        progcbk(frame, cookie, this, op_ret, op_errno, dict, xdata);
+
+    nfs_stack_destroy(nfl, frame);
+    return 0;
+}
+
+/* Wind a getxattr for attribute @name of @loc to subvolume @xl on behalf of
+ * @nfu.
+ *
+ * @xdata (may be NULL) is forwarded to the fop, mirroring nfs_fop_setxattr.
+ * @cbk and @local are handed back to the caller by nfs_fop_getxattr_cbk.
+ *
+ * Returns 0 once the fop has been wound, -EFAULT if @nfsx, @xl, @loc or @nfu
+ * is NULL, or the negative value set by a failing setup macro.
+ */
+int
+nfs_fop_getxattr(xlator_t *nfsx, xlator_t *xl, nfs_user_t *nfu, loc_t *loc,
+                 char *name, dict_t *xdata, fop_getxattr_cbk_t cbk, void *local)
+{
+    call_frame_t *frame = NULL;
+    int ret = -EFAULT;
+    struct nfs_fop_local *nfl = NULL;
+
+    /* @nfsx is used for frame creation and for allocating the fops-layer
+     * local, so it is validated like in the other fop wrappers. */
+    if ((!nfsx) || (!xl) || (!loc) || (!nfu))
+        return ret;
+
+    /* Both setup macros set ret and jump to err on failure. */
+    nfs_fop_handle_frame_create(frame, nfsx, nfu, ret, err);
+    nfs_fop_handle_local_init(frame, nfsx, nfl, cbk, local, ret, err);
+
+    /* Pass the caller's xdata through instead of silently dropping it. */
+    STACK_WIND_COOKIE(frame, nfs_fop_getxattr_cbk, xl, xl, xl->fops->getxattr,
+                      loc, name, xdata);
+    ret = 0;
+err:
+    /* ret < 0 here means setup failed before the wind; drop the frame. */
+    if (ret < 0) {
+        if (frame)
+            nfs_stack_destroy(nfl, frame);
+    }
+
+    return ret;
+}
+
+/* Fops-layer completion callback for setxattr (wound by nfs_fop_setxattr).
+ *
+ * Recovers the caller's local and callback with nfl_to_prog_data(), invokes
+ * the caller's callback (if any) with the fop results and releases the
+ * fops-layer local and the frame through nfs_stack_destroy().
+ * Always returns 0.
+ */
+int32_t
+nfs_fop_setxattr_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+                     int32_t op_ret, int32_t op_errno, dict_t *xdata)
+{
+    struct nfs_fop_local *nfl = NULL;
+    fop_setxattr_cbk_t progcbk = NULL;
+
+    nfl_to_prog_data(nfl, progcbk, frame);
+
+    if (progcbk)
+        progcbk(frame, cookie, this, op_ret, op_errno, xdata);
+
+    nfs_stack_destroy(nfl, frame);
+    return 0;
+}
+
+/* Wind a setxattr of the attributes in @dict on @loc to subvolume @xl on
+ * behalf of @nfu.
+ *
+ * @flags and @xdata are forwarded to the fop unchanged. @cbk and @local are
+ * handed back to the caller by nfs_fop_setxattr_cbk.
+ *
+ * Returns 0 once the fop has been wound, -EFAULT if @nfsx, @xl, @loc or @nfu
+ * is NULL, or the negative value set by a failing setup macro.
+ */
+int
+nfs_fop_setxattr(xlator_t *nfsx, xlator_t *xl, nfs_user_t *nfu, loc_t *loc,
+                 dict_t *dict, int32_t flags, dict_t *xdata,
+                 fop_setxattr_cbk_t cbk, void *local)
+{
+    call_frame_t *frame = NULL;
+    int ret = -EFAULT;
+    struct nfs_fop_local *nfl = NULL;
+
+    /* @nfsx is used for frame creation and for allocating the fops-layer
+     * local, so it is validated like in the other fop wrappers. */
+    if ((!nfsx) || (!xl) || (!loc) || (!nfu))
+        return ret;
+
+    /* Both setup macros set ret and jump to err on failure. */
+    nfs_fop_handle_frame_create(frame, nfsx, nfu, ret, err);
+    nfs_fop_handle_local_init(frame, nfsx, nfl, cbk, local, ret, err);
+
+    STACK_WIND_COOKIE(frame, nfs_fop_setxattr_cbk, xl, xl, xl->fops->setxattr,
+                      loc, dict, flags, xdata);
+    ret = 0;
+err:
+    /* ret < 0 here means setup failed before the wind; drop the frame. */
+    if (ret < 0) {
+        if (frame)
+            nfs_stack_destroy(nfl, frame);
+    }
+
+    return ret;
+}
+
+/* Fops-layer completion callback for truncate (wound by nfs_fop_truncate).
+ *
+ * Recovers the caller's local and callback with nfl_to_prog_data(), applies
+ * nfs_fop_restore_root_ino() to @prebuf and @postbuf, invokes the caller's
+ * callback (if any) with the fop results and releases the fops-layer local
+ * and the frame through nfs_stack_destroy(). Always returns 0.
+ */
+int32_t
+nfs_fop_truncate_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+                     int32_t op_ret, int32_t op_errno, struct iatt *prebuf,
+                     struct iatt *postbuf, dict_t *xdata)
+{
+    struct nfs_fop_local *nfl = NULL;
+    fop_truncate_cbk_t progcbk = NULL;
+
+    nfl_to_prog_data(nfl, progcbk, frame);
+    nfs_fop_restore_root_ino(nfl, op_ret, prebuf, postbuf, NULL, NULL);
+    if (progcbk)
+        progcbk(frame, cookie, this, op_ret, op_errno, prebuf, postbuf, xdata);
+
+    nfs_stack_destroy(nfl, frame);
+    return 0;
+}
+
+/* Wind a truncate of @loc to length @offset to subvolume @xl on behalf of
+ * @nfu.
+ *
+ * No xdata is sent. @cbk and @local are handed back to the caller by
+ * nfs_fop_truncate_cbk.
+ *
+ * Returns 0 once the fop has been wound, -EFAULT if @nfsx, @xl, @loc or @nfu
+ * is NULL, or the negative value set by a failing setup macro.
+ */
+int
+nfs_fop_truncate(xlator_t *nfsx, xlator_t *xl, nfs_user_t *nfu, loc_t *loc,
+                 off_t offset, fop_truncate_cbk_t cbk, void *local)
+{
+    call_frame_t *frame = NULL;
+    int ret = -EFAULT;
+    struct nfs_fop_local *nfl = NULL;
+
+    if ((!nfsx) || (!xl) || (!loc) || (!nfu))
+        return ret;
+
+    /* Both setup macros set ret and jump to err on failure. */
+    nfs_fop_handle_frame_create(frame, nfsx, nfu, ret, err);
+    nfs_fop_handle_local_init(frame, nfsx, nfl, cbk, local, ret, err);
+    nfs_fop_save_root_ino(nfl, loc);
+
+    STACK_WIND_COOKIE(frame, nfs_fop_truncate_cbk, xl, xl, xl->fops->truncate,
+                      loc, offset, NULL);
+
+    ret = 0;
+err:
+    /* ret < 0 here means setup failed before the wind; drop the frame. */
+    if (ret < 0) {
+        if (frame)
+            nfs_stack_destroy(nfl, frame);
+    }
+
+    return ret;
+}
diff --git a/xlators/nfs/server/src/nfs-fops.h b/xlators/nfs/server/src/nfs-fops.h
new file mode 100644
index 00000000000..005cb788a45
--- /dev/null
+++ b/xlators/nfs/server/src/nfs-fops.h
@@ -0,0 +1,242 @@
+/*
+  Copyright (c) 2010-2011 Gluster, Inc. <http://www.gluster.com>
+  This file is part of GlusterFS.
+
+  This file is licensed to you under your choice of the GNU Lesser
+  General Public License, version 3 or any later version (LGPLv3 or
+  later), or the GNU General Public License, version 2 (GPLv2), in all
+  cases as published by the Free Software Foundation.
+*/
+
+#ifndef _NFS_FOPS_H_
+#define _NFS_FOPS_H_
+
+#include <glusterfs/dict.h>
+#include <glusterfs/xlator.h>
+#include <glusterfs/iobuf.h>
+#include <glusterfs/call-stub.h>
+#include "nfs.h"
+#include "nfs-common.h"
+#include "nfs-messages.h"
+#include <semaphore.h>
+
+/* This structure is used to communicate state between a fop and its callback.
+ * The problem is, when we're calling a fop in the nfs op handler, the callback
+ * is the NFS protocol's callback and we have to perform all GlusterFS
+ * inode, inode table, fd_ts and fd table operations in the NFS callback. That
+ * approach soon gets extremely complicated and confusing because we then have
+ * to try and separate in our heads which source lines in those callbacks are
+ * required for serving the NFS op and which ones are needed for satisfying
+ * GlusterFS requirements. This structure allows us to avoid performing
+ * GlusterFS state maintenance operations in the NFS callbacks by performing
+ * them inside the fops layer itself. Now, before we call the callback
+ * registered by the NFS operation, a hidden fops-layer specific callback is
+ * called which performs the state maintenance and then calls the NFS
+ * callback.
+ *
+ * These are allocated from a mem-pool stored in the nfs xlator's state,
+ * i.e. struct nfs_state.
+ * That pool is initialised in nfs_init_subvolumes in nfs.c.
+ */
+struct nfs_fop_local {
+    /* The local sent along by the user of the fop. */
+    void *proglocal;
+
+    /* The address of the callback supplied by the user. After our
+     * callback is executed this one is called.
+     * The exact cast destination of this pointer will depend on the
+     * fop that is being called.
+     */
+    void *progcbk;
+
+    /* Used only for write requests. */
+    struct iobref *iobref;
+
+    /* Inode pointers; not touched by the code in nfs-fops.h itself.
+     * NOTE(review): presumably filled in by the nfs-inodes layer - confirm.
+     */
+    inode_t *parent;
+    inode_t *newparent;
+    inode_t *inode;
+
+    /* Set to 1 by nfs-inodes layer, which uses this to decide whether to
+     * link the newly allocated inode into the itable, in case the fop was
+     * successful.
+     */
+    int newinode;
+
+    /* Used by nfs-fops layer in order to determine whether to funge the
+     * ino in a dir's stbuf. This funging of root ino is needed to ensure
+     * that the root ino remains 1 even when the NFS server has been
+     * restarted. Note that in distribute, a fresh lookup and a revalidate
+     * on the root inode returns two different inode numbers and this we
+     * need to handle by ourself.
+     */
+    int rootinode;
+
+    /* This member is used to determine whether the new parent of a file
+     * being renamed is the root directory. If yes, the ino is funged.
+     */
+    int newrootinode;
+    int newrootparentinode;
+
+    /* Determines whether to funge the ino in the post and pre parent
+     * stbufs for a file/dir where the parent directory could be the root
+     * dir. Needed here because of the same reason as above.
+     */
+    int rootparentinode;
+
+    /* Name buffers of at most NFS_NAME_MAX characters plus terminator.
+     * NOTE(review): presumably the entry names of the old/new location -
+     * confirm against the nfs-inodes layer.
+     */
+    char path[NFS_NAME_MAX + 1];
+    char newpath[NFS_NAME_MAX + 1];
+
+    /* The nfs xlator this local belongs to; set by prog_data_to_nfl(). */
+    xlator_t *nfsx;
+
+    /* Dict sent as xdata by the mkdir, symlink and mknod fops. */
+    dict_t *dictgfid;
+
+    /* Lock request state saved by nfs_fop_lk for nfs_fop_lk_cbk: a
+     * referenced fd, the lk command and a copy of the requested flock.
+     */
+    fd_t *fd;
+    int cmd;
+    struct gf_flock flock;
+};
+
+/* Allocate a struct nfs_fop_local for the nfs xlator @xl; prog_data_to_nfl()
+ * treats a NULL return as failure.
+ */
+extern struct nfs_fop_local *
+nfs_fop_local_init(xlator_t *xl);
+
+/* Counterpart of nfs_fop_local_init() for releasing @l.
+ * NOTE(review): exact cleanup semantics are defined in nfs-fops.c - confirm.
+ */
+extern void
+nfs_fop_local_wipe(xlator_t *xl, struct nfs_fop_local *l);
+
+/* Private state of the nfs xlator and the fop-local mem-pool stored in it. */
+#define nfs_state(nfsxl) (nfsxl)->private
+#define nfs_fop_mempool(nfxl) (((struct nfs_state *)nfs_state(nfxl))->foppool)
+
+/* Allocate a fops-layer local for nfs xlator @nf into @nflocal and, if that
+ * succeeds, store the caller's local @plocal and callback @pcbk in it and
+ * attach it to frame @fram (when @fram is non-NULL). @nflocal is left NULL
+ * on allocation failure.
+ */
+#define prog_data_to_nfl(nf, nflocal, fram, pcbk, plocal)                      \
+    do {                                                                       \
+        nflocal = nfs_fop_local_init(nf);                                      \
+        if (nflocal) {                                                         \
+            nflocal->proglocal = plocal;                                       \
+            nflocal->progcbk = *VOID(&pcbk);                                   \
+            nflocal->nfsx = nf;                                                \
+            if (fram)                                                          \
+                ((call_frame_t *)fram)->local = nflocal;                       \
+        }                                                                      \
+    } while (0)
+
+/* Inverse of prog_data_to_nfl(): fetch the fops-layer local from @fram into
+ * @nflocal, put the caller's local back into @fram->local and load the
+ * caller's callback into @pcbk. @fram->local must be non-NULL.
+ */
+#define nfl_to_prog_data(nflocal, pcbk, fram)                                  \
+    do {                                                                       \
+        nflocal = fram->local;                                                 \
+        fram->local = nflocal->proglocal;                                      \
+        pcbk = nflocal->progcbk;                                               \
+    } while (0)
+
+/* prog_data_to_nfl() with error handling: on allocation failure log an
+ * error, set @retval to -ENOMEM and jump to label @lab.
+ */
+#define nfs_fop_handle_local_init(fram, nfx, nfloc, cbck, prgloc, retval, lab) \
+    do {                                                                       \
+        prog_data_to_nfl(nfx, nfloc, fram, cbck, prgloc);                      \
+        if (!nfloc) {                                                          \
+            gf_msg(GF_NFS, GF_LOG_ERROR, ENOMEM, NFS_MSG_NO_MEMORY,            \
+                   "Failed to init local");                                    \
+            retval = -ENOMEM;                                                  \
+            goto lab;                                                          \
+        }                                                                      \
+    } while (0)
+
+/* Fop wrappers implemented in nfs-fops.c.
+ *
+ * Common parameters: @nfsx is the nfs xlator, @xl the subvolume the fop is
+ * wound to, @nfu the user the request is performed as, @cbk the caller's
+ * completion callback and @local the caller's opaque state handed back
+ * through the frame when @cbk runs. The wrappers visible in nfs-fops.c
+ * return 0 once the fop has been wound and a negative errno value (e.g.
+ * -EFAULT for a NULL mandatory argument) when it could not be wound.
+ */
+extern int
+nfs_fop_fstat(xlator_t *nfsx, xlator_t *xl, nfs_user_t *nfu, fd_t *fd,
+              fop_stat_cbk_t cbk, void *local);
+
+extern int
+nfs_fop_readdirp(xlator_t *nfsx, xlator_t *xl, nfs_user_t *nfu, fd_t *dirfd,
+                 size_t bufsize, off_t offset, fop_readdir_cbk_t cbk,
+                 void *local);
+extern int
+nfs_fop_lookup(xlator_t *nfsx, xlator_t *xl, nfs_user_t *nfu, loc_t *loc,
+               fop_lookup_cbk_t cbk, void *local);
+
+extern int
+nfs_fop_create(xlator_t *nfsx, xlator_t *xl, nfs_user_t *nfu, loc_t *pathloc,
+               int flags, mode_t mode, fd_t *fd, fop_create_cbk_t cbk,
+               void *local);
+extern int
+nfs_fop_flush(xlator_t *nfsx, xlator_t *xl, nfs_user_t *nfu, fd_t *fd,
+              fop_flush_cbk_t cbk, void *local);
+
+extern int
+nfs_fop_mkdir(xlator_t *nfsx, xlator_t *xl, nfs_user_t *nfu, loc_t *pathloc,
+              mode_t mode, fop_mkdir_cbk_t cbk, void *local);
+
+extern int
+nfs_fop_truncate(xlator_t *nfsx, xlator_t *xl, nfs_user_t *nfu, loc_t *loc,
+                 off_t offset, fop_truncate_cbk_t cbk, void *local);
+
+extern int
+nfs_fop_read(xlator_t *nfsx, xlator_t *xl, nfs_user_t *nfu, fd_t *fd,
+             size_t size, off_t offset, fop_readv_cbk_t cbk, void *local);
+
+extern int
+nfs_fop_fsync(xlator_t *nfsx, xlator_t *xl, nfs_user_t *nfu, fd_t *fd,
+              int32_t datasync, fop_fsync_cbk_t cbk, void *local);
+
+extern int
+nfs_fop_write(xlator_t *nfsx, xlator_t *xl, nfs_user_t *nfu, fd_t *fd,
+              struct iobref *srciobref, struct iovec *vector, int32_t count,
+              off_t offset, fop_writev_cbk_t cbk, void *local);
+
+extern int
+nfs_fop_open(xlator_t *nfsx, xlator_t *xl, nfs_user_t *nfu, loc_t *loc,
+             int32_t flags, fd_t *fd, fop_open_cbk_t cbk, void *local);
+
+extern int
+nfs_fop_rename(xlator_t *nfsx, xlator_t *xl, nfs_user_t *nfu, loc_t *oldloc,
+               loc_t *newloc, fop_rename_cbk_t cbk, void *local);
+
+extern int
+nfs_fop_link(xlator_t *nfsx, xlator_t *xl, nfs_user_t *nfu, loc_t *oldloc,
+             loc_t *newloc, fop_link_cbk_t cbk, void *local);
+
+extern int
+nfs_fop_unlink(xlator_t *nfsx, xlator_t *xl, nfs_user_t *nfu, loc_t *pathloc,
+               fop_unlink_cbk_t cbk, void *local);
+
+extern int
+nfs_fop_rmdir(xlator_t *nfsx, xlator_t *xl, nfs_user_t *nfu, loc_t *pathloc,
+              fop_rmdir_cbk_t cbk, void *local);
+
+extern int
+nfs_fop_mknod(xlator_t *nfsx, xlator_t *xl, nfs_user_t *nfu, loc_t *pathloc,
+              mode_t mode, dev_t dev, fop_mknod_cbk_t cbk, void *local);
+
+extern int
+nfs_fop_readlink(xlator_t *nfsx, xlator_t *xl, nfs_user_t *nfu, loc_t *pathloc,
+                 size_t size, fop_readlink_cbk_t cbk, void *local);
+
+extern int
+nfs_fop_symlink(xlator_t *nfsx, xlator_t *xl, nfs_user_t *nfu, char *target,
+                loc_t *pathloc, fop_symlink_cbk_t cbk, void *local);
+
+extern int
+nfs_fop_setattr(xlator_t *nfsx, xlator_t *xl, nfs_user_t *nfu, loc_t *pathloc,
+                struct iatt *buf, int32_t valid, fop_setattr_cbk_t cbk,
+                void *local);
+
+extern int
+nfs_fop_statfs(xlator_t *nfsx, xlator_t *xl, nfs_user_t *nfu, loc_t *pathloc,
+               fop_statfs_cbk_t cbk, void *local);
+
+extern int
+nfs_fop_opendir(xlator_t *nfsx, xlator_t *xl, nfs_user_t *nfu, loc_t *pathloc,
+                fd_t *dirfd, fop_opendir_cbk_t cbk, void *local);
+
+extern int
+nfs_fop_stat(xlator_t *nfsx, xlator_t *xl, nfs_user_t *nfu, loc_t *loc,
+             fop_stat_cbk_t cbk, void *local);
+
+extern int
+nfs_fop_access(xlator_t *nfsx, xlator_t *xl, nfs_user_t *nfu, loc_t *loc,
+               int32_t accesstest, fop_access_cbk_t cbk, void *local);
+
+extern int
+nfs_fop_lk(xlator_t *nfsx, xlator_t *xl, nfs_user_t *nfu, fd_t *fd, int cmd,
+           struct gf_flock *flock, fop_lk_cbk_t cbk, void *local);
+
+extern int
+nfs_fop_getxattr(xlator_t *nfsx, xlator_t *xl, nfs_user_t *nfu, loc_t *loc,
+                 char *name, dict_t *xdata, fop_getxattr_cbk_t cbk,
+                 void *local);
+
+extern int
+nfs_fop_setxattr(xlator_t *nfsx, xlator_t *xl, nfs_user_t *nfu, loc_t *loc,
+                 dict_t *dict, int32_t flags, dict_t *xdata,
+                 fop_setxattr_cbk_t cbk, void *local);
+
+#endif
diff --git a/xlators/nfs/server/src/nfs-generics.c b/xlators/nfs/server/src/nfs-generics.c
new file mode 100644
index 00000000000..009991877cb
--- /dev/null
+++ b/xlators/nfs/server/src/nfs-generics.c
@@ -0,0 +1,312 @@
+/*
+  Copyright (c) 2010-2011 Gluster, Inc. <http://www.gluster.com>
+  This file is part of GlusterFS.
+
+  This file is licensed to you under your choice of the GNU Lesser
+  General Public License, version 3 or any later version (LGPLv3 or
+  later), or the GNU General Public License, version 2 (GPLv2), in all
+  cases as published by the Free Software Foundation.
+*/
+
+#include "string.h"
+
+#include "nfs.h"
+#include "nfs-fops.h"
+#include "nfs-inodes.h"
+#include "nfs-generics.h"
+#include <glusterfs/xlator.h>
+
+/* Fetch the attributes of an open fd by way of nfs_fop_fstat(). */
+int
+nfs_fstat(xlator_t *nfsx, xlator_t *xl, nfs_user_t *nfu, fd_t *fd,
+          fop_stat_cbk_t cbk, void *local)
+{
+    /* Any missing xlator, fd or user is reported as -EFAULT. */
+    if (!nfsx || !xl || !fd || !nfu)
+        return -EFAULT;
+
+    /* Otherwise the caller gets whatever the fop layer returns. */
+    return nfs_fop_fstat(nfsx, xl, nfu, fd, cbk, local);
+}
+
+/*
+ * Run an access check on pathloc by forwarding accesstest and the caller's
+ * callback to nfs_fop_access(); -EFAULT if a required pointer is NULL.
+ */
+int
+nfs_access(xlator_t *nfsx, xlator_t *xl, nfs_user_t *nfu, loc_t *pathloc,
+           int32_t accesstest, fop_access_cbk_t cbk, void *local)
+{
+    if (!(nfsx && xl && pathloc && nfu))
+        return -EFAULT;
+
+    return nfs_fop_access(nfsx, xl, nfu, pathloc, accesstest, cbk, local);
+}
+
+/*
+ * Stat the object named by pathloc through nfs_fop_stat(). A NULL nfsx, xl,
+ * pathloc or nfu short-circuits with -EFAULT before anything is issued.
+ */
+int
+nfs_stat(xlator_t *nfsx, xlator_t *xl, nfs_user_t *nfu, loc_t *pathloc,
+         fop_stat_cbk_t cbk, void *local)
+{
+    if (!nfsx || !xl || !pathloc || !nfu)
+        return -EFAULT;
+
+    return nfs_fop_stat(nfsx, xl, nfu, pathloc, cbk, local);
+}
+
+/* Read entries from dirfd via nfs_fop_readdirp(); -EFAULT on NULL args. */
+int
+nfs_readdirp(xlator_t *nfsx, xlator_t *xl, nfs_user_t *nfu, fd_t *dirfd,
+             size_t bufsize, off_t offset, fop_readdir_cbk_t cbk, void *local)
+{
+    if (!(nfsx && xl && dirfd && nfu))
+        return -EFAULT;
+    return nfs_fop_readdirp(nfsx, xl, nfu, dirfd, bufsize, offset, cbk, local);
+}
+
+/* Look up pathloc on xl, handing cbk and local on to nfs_fop_lookup(). */
+int
+nfs_lookup(xlator_t *nfsx, xlator_t *xl, nfs_user_t *nfu, loc_t *pathloc,
+           fop_lookup_cbk_t cbk, void *local)
+{
+    /* Nothing is issued unless every required pointer is present. */
+    if (!nfsx || !xl || !pathloc || !nfu)
+        return -EFAULT;
+
+    /* The fop layer's return value is passed straight back. */
+    return nfs_fop_lookup(nfsx, xl, nfu, pathloc, cbk, local);
+}
+
+/* Create pathloc with the given flags and mode via nfs_inode_create(). */
+int
+nfs_create(xlator_t *nfsx, xlator_t *xl, nfs_user_t *nfu, loc_t *pathloc,
+           int flags, mode_t mode, fop_create_cbk_t cbk, void *local)
+{
+    /* -EFAULT flags a NULL nfsx, xl, pathloc or nfu. */
+    if (!(nfsx && xl && pathloc && nfu))
+        return -EFAULT;
+
+    /* Goes through the inode layer rather than straight to nfs_fop_*. */
+    return nfs_inode_create(nfsx, xl, nfu, pathloc, flags, mode, cbk, local);
+}
+
+int
+nfs_flush(xlator_t *nfsx, xlator_t *xl, nfs_user_t *nfu, fd_t *fd,
+          fop_flush_cbk_t cbk, void *local)
+{ /* Pass-through to nfs_fop_flush(); nothing is validated here. */
+    return nfs_fop_flush(nfsx, xl, nfu, fd, cbk, local);
+}
+
+/* Make directory pathloc with mode by way of nfs_inode_mkdir(). */
+int
+nfs_mkdir(xlator_t *nfsx, xlator_t *xl, nfs_user_t *nfu, loc_t *pathloc,
+          mode_t mode, fop_mkdir_cbk_t cbk, void *local)
+{
+    /* A NULL nfsx, xl, pathloc or nfu means -EFAULT and no fop. */
+    if (!nfsx || !xl || !pathloc || !nfu)
+        return -EFAULT;
+
+    /* Result of the inode layer is returned unchanged. */
+    return nfs_inode_mkdir(nfsx, xl, nfu, pathloc, mode, cbk, local);
+}
+
+/* Truncate pathloc at offset by way of nfs_fop_truncate(). */
+int
+nfs_truncate(xlator_t *nfsx, xlator_t *xl, nfs_user_t *nfu, loc_t *pathloc,
+             off_t offset, fop_truncate_cbk_t cbk, void *local)
+{
+    /* nfsx is validated too, matching every other checked wrapper in this
+     * file; a NULL in any of the four pointers yields -EFAULT. */
+    if ((!nfsx) || (!xl) || (!pathloc) || (!nfu))
+        return -EFAULT;
+
+    return nfs_fop_truncate(nfsx, xl, nfu, pathloc, offset, cbk, local);
+}
+
+int
+nfs_read(xlator_t *nfsx, xlator_t *xl, nfs_user_t *nfu, fd_t *fd, size_t size,
+         off_t offset, fop_readv_cbk_t cbk, void *local)
+{ /* Pass-through to nfs_fop_read(); nothing is validated here. */
+    return nfs_fop_read(nfsx, xl, nfu, fd, size, offset, cbk, local);
+}
+
+int
+nfs_lk(xlator_t *nfsx, xlator_t *xl, nfs_user_t *nfu, fd_t *fd, int cmd,
+       struct gf_flock *flock, fop_lk_cbk_t cbk, void *local)
+{ /* Pass-through to nfs_fop_lk(); cmd and flock are forwarded as given. */
+    return nfs_fop_lk(nfsx, xl, nfu, fd, cmd, flock, cbk, local);
+}
+
+int
+nfs_getxattr(xlator_t *nfsx, xlator_t *xl, nfs_user_t *nfu, loc_t *loc,
+             char *name, dict_t *xdata, fop_getxattr_cbk_t cbk, void *local)
+{ /* Pass-through to nfs_fop_getxattr(); nothing is validated here. */
+    return nfs_fop_getxattr(nfsx, xl, nfu, loc, name, xdata, cbk, local);
+}
+
+int
+nfs_setxattr(xlator_t *nfsx, xlator_t *xl, nfs_user_t *nfu, loc_t *loc,
+             dict_t *dict, int32_t flags, dict_t *xdata, fop_setxattr_cbk_t cbk,
+             void *local)
+{ /* Pass-through to nfs_fop_setxattr(); nothing is validated here. */
+    return nfs_fop_setxattr(nfsx, xl, nfu, loc, dict, flags, xdata, cbk, local);
+}
+
+int
+nfs_fsync(xlator_t *nfsx, xlator_t *xl, nfs_user_t *nfu, fd_t *fd,
+          int32_t datasync, fop_fsync_cbk_t cbk, void *local)
+{ /* Pass-through to nfs_fop_fsync(); datasync is forwarded as given. */
+    return nfs_fop_fsync(nfsx, xl, nfu, fd, datasync, cbk, local);
+}
+
+int
+nfs_write(xlator_t *nfsx, xlator_t *xl, nfs_user_t *nfu, fd_t *fd,
+          struct iobref *srciobref, struct iovec *vector, int32_t count,
+          off_t offset, fop_writev_cbk_t cbk, void *local)
+{ /* Pass-through to nfs_fop_write(); nothing is validated here. */
+    return nfs_fop_write(nfsx, xl, nfu, fd, srciobref, vector, count, offset,
+                         cbk, local);
+}
+
+/* Open pathloc with flags; fd creation is left to nfs_inode_open(). */
+int
+nfs_open(xlator_t *nfsx, xlator_t *xl, nfs_user_t *nfu, loc_t *pathloc,
+         int32_t flags, fop_open_cbk_t cbk, void *local)
+{
+    /* Refuse to go further without all four required pointers. */
+    if (!(nfsx && xl && pathloc && nfu))
+        return -EFAULT;
+
+    /* The inode layer's status is the caller's status. */
+    return nfs_inode_open(nfsx, xl, nfu, pathloc, flags, cbk, local);
+}
+
+/* Rename oldloc to newloc by way of nfs_inode_rename(). */
+int
+nfs_rename(xlator_t *nfsx, xlator_t *xl, nfs_user_t *nfu, loc_t *oldloc,
+           loc_t *newloc, fop_rename_cbk_t cbk, void *local)
+{
+    /* Both locations, both xlators and the user must be non-NULL. */
+    if (!nfsx || !xl || !oldloc || !newloc || !nfu)
+        return -EFAULT;
+
+    /* Hand over unchanged; the return value comes from the inode layer. */
+    return nfs_inode_rename(nfsx, xl, nfu, oldloc, newloc, cbk, local);
+}
+
+/* Create a hard link from oldloc at newloc by way of nfs_inode_link(). */
+int
+nfs_link(xlator_t *nfsx, xlator_t *xl, nfs_user_t *nfu, loc_t *oldloc,
+         loc_t *newloc, fop_link_cbk_t cbk, void *local)
+{
+    /* -EFAULT unless nfsx, xl, oldloc, newloc and nfu are all set. */
+    if (!(nfsx && xl && oldloc && newloc && nfu))
+        return -EFAULT;
+
+    /* Status is whatever nfs_inode_link() reports. */
+    return nfs_inode_link(nfsx, xl, nfu, oldloc, newloc, cbk, local);
+}
+
+/* Remove pathloc by way of nfs_inode_unlink(). */
+int
+nfs_unlink(xlator_t *nfsx, xlator_t *xl, nfs_user_t *nfu, loc_t *pathloc,
+           fop_unlink_cbk_t cbk, void *local)
+{
+    /* nfsx joins the check so this wrapper validates the same four
+     * pointers as its siblings; any NULL yields -EFAULT. */
+    if ((!nfsx) || (!xl) || (!pathloc) || (!nfu))
+        return -EFAULT;
+
+    return nfs_inode_unlink(nfsx, xl, nfu, pathloc, cbk, local);
+}
+
+/* Remove the directory at path by way of nfs_inode_rmdir(). */
+int
+nfs_rmdir(xlator_t *nfsx, xlator_t *xl, nfs_user_t *nfu, loc_t *path,
+          fop_rmdir_cbk_t cbk, void *local)
+{
+    /* Missing nfsx, xl, path or nfu: report -EFAULT, issue nothing. */
+    if (!nfsx || !xl || !path || !nfu)
+        return -EFAULT;
+
+    /* Return value is taken verbatim from the inode layer. */
+    return nfs_inode_rmdir(nfsx, xl, nfu, path, cbk, local);
+}
+
+/* Create a node at pathloc, passing mode and dev to nfs_inode_mknod(). */
+int
+nfs_mknod(xlator_t *nfsx, xlator_t *xl, nfs_user_t *nfu, loc_t *pathloc,
+          mode_t mode, dev_t dev, fop_mknod_cbk_t cbk, void *local)
+{
+    /* All of nfsx, xl, pathloc and nfu are mandatory. */
+    if (!(nfsx && xl && pathloc && nfu))
+        return -EFAULT;
+
+    /* Nothing else happens here; the inode layer does the work. */
+    return nfs_inode_mknod(nfsx, xl, nfu, pathloc, mode, dev, cbk, local);
+}
+
+/* Read the symlink at linkloc by way of nfs_fop_readlink(). */
+int
+nfs_readlink(xlator_t *nfsx, xlator_t *xl, nfs_user_t *nfu, loc_t *linkloc,
+             fop_readlink_cbk_t cbk, void *local)
+{
+    /* -EFAULT when nfsx, xl, linkloc or nfu is NULL. */
+    if (!nfsx || !xl || !linkloc || !nfu)
+        return -EFAULT;
+
+    /* NFS_PATH_MAX is always supplied as the size argument. */
+    return nfs_fop_readlink(nfsx, xl, nfu, linkloc, NFS_PATH_MAX, cbk, local);
+}
+
+/* Create symlink linkloc pointing at target via nfs_inode_symlink(). */
+int
+nfs_symlink(xlator_t *nfsx, xlator_t *xl, nfs_user_t *nfu, char *target,
+            loc_t *linkloc, fop_symlink_cbk_t cbk, void *local)
+{
+    /* The target string is required in addition to the usual pointers. */
+    if (!(nfsx && xl && linkloc && target && nfu))
+        return -EFAULT;
+
+    /* Status comes straight from the inode layer. */
+    return nfs_inode_symlink(nfsx, xl, nfu, target, linkloc, cbk, local);
+}
+
+/* Set attributes on pathloc; buf and valid go to nfs_fop_setattr() as-is. */
+int
+nfs_setattr(xlator_t *nfsx, xlator_t *xl, nfs_user_t *nfu, loc_t *pathloc,
+            struct iatt *buf, int32_t valid, fop_setattr_cbk_t cbk, void *local)
+{
+    /* buf itself is not checked here, only the four pointers below. */
+    if (!nfsx || !xl || !pathloc || !nfu)
+        return -EFAULT;
+
+    /* The fop layer decides the final return value. */
+    return nfs_fop_setattr(nfsx, xl, nfu, pathloc, buf, valid, cbk, local);
+}
+
+/* Issue a statfs for pathloc by way of nfs_fop_statfs(). */
+int
+nfs_statfs(xlator_t *nfsx, xlator_t *xl, nfs_user_t *nfu, loc_t *pathloc,
+           fop_statfs_cbk_t cbk, void *local)
+{
+    /* Bail out with -EFAULT on any missing pointer. */
+    if (!(nfsx && xl && pathloc && nfu))
+        return -EFAULT;
+
+    /* Forward and return the fop layer's result. */
+    return nfs_fop_statfs(nfsx, xl, nfu, pathloc, cbk, local);
+}
+
+/* Open directory pathloc on fopxl through nfs_inode_opendir(). */
+int
+nfs_opendir(xlator_t *nfsx, xlator_t *fopxl, nfs_user_t *nfu, loc_t *pathloc,
+            fop_opendir_cbk_t cbk, void *local)
+{
+    if (!(nfsx && fopxl && pathloc && nfu))
+        return -EFAULT;
+    return nfs_inode_opendir(nfsx, fopxl, nfu, pathloc, cbk, local);
+}
diff --git a/xlators/nfs/server/src/nfs-generics.h b/xlators/nfs/server/src/nfs-generics.h
new file mode 100644
index 00000000000..07a79994a4b
--- /dev/null
+++ b/xlators/nfs/server/src/nfs-generics.h
@@ -0,0 +1,161 @@
+/*
+  Copyright (c) 2010-2011 Gluster, Inc. <http://www.gluster.com>
+  This file is part of GlusterFS.
+
+  This file is licensed to you under your choice of the GNU Lesser
+  General Public License, version 3 or any later version (LGPLv3 or
+  later), or the GNU General Public License, version 2 (GPLv2), in all
+  cases as published by the Free Software Foundation.
+*/
+
+#ifndef _NFS_GENERICS_H_
+#define _NFS_GENERICS_H_
+
+#include "nfs.h"
+#include <glusterfs/xlator.h>
+#include "nfs-fops.h"
+#include "nfs-inodes.h"
+
+struct nfs_direntcache { /* Cached directory entries (see nfs_fdctx_t). */
+    gf_dirent_t entries; /* Head of the list of cached dirents. */
+    gf_dirent_t *next;   /* Next cached entry that should be
+                          * sent by readdir. */
+    uint64_t prev_off;   /* Offset where the next read will
+                          * happen. NOTE(review): the name
+                          * says "prev"; confirm with users. */
+};
+
+/* We're trying to abstract the fops interface from the NFS xlator so that
+ * different NFS versions can simply call a standard interface and have fop
+ * interface dependent functions be handled internally.
+ * This structure is part of such an abstraction. The fops layer stores any
+ * state it requires in the fd, e.g. the dirent cache for a directory fd_t.
+ */
+typedef struct nfs_fop_fdcontext { /* per-fd state kept by the fops layer */
+    pthread_mutex_t lock; /* presumably guards the fields below; confirm */
+    size_t dirent_bufsize;
+    off_t offset;
+    struct nfs_direntcache *dcache; /* dirent cache for a directory fd */
+    xlator_t *dirvol;
+} nfs_fdctx_t;
+
+extern int
+nfs_fstat(xlator_t *nfsx, xlator_t *xl, nfs_user_t *nfu, fd_t *fd,
+          fop_stat_cbk_t cbk, void *local);
+
+extern int
+nfs_readdirp(xlator_t *nfsx, xlator_t *xl, nfs_user_t *nfu, fd_t *dirfd,
+             size_t bufsize, off_t offset, fop_readdir_cbk_t cbk, void *local);
+
+extern int
+nfs_lookup(xlator_t *nfsx, xlator_t *xl, nfs_user_t *nfu, loc_t *pathloc,
+           fop_lookup_cbk_t cbk, void *local);
+
+extern int
+nfs_create(xlator_t *nfsx, xlator_t *xl, nfs_user_t *nfu, loc_t *pathloc,
+           int flags, mode_t mode, fop_create_cbk_t cbk, void *local);
+
+extern int
+nfs_flush(xlator_t *nfsx, xlator_t *xl, nfs_user_t *nfu, fd_t *fd,
+          fop_flush_cbk_t cbk, void *local);
+
+extern int
+nfs_mkdir(xlator_t *nfsx, xlator_t *xl, nfs_user_t *nfu, loc_t *pathloc,
+          mode_t mode, fop_mkdir_cbk_t cbk, void *local);
+
+extern int
+nfs_truncate(xlator_t *nfsx, xlator_t *xl, nfs_user_t *nfu, loc_t *pathloc,
+             off_t offset, fop_truncate_cbk_t cbk, void *local);
+
+extern int
+nfs_read(xlator_t *nfsx, xlator_t *xl, nfs_user_t *nfu, fd_t *fd, size_t size,
+         off_t offset, fop_readv_cbk_t cbk, void *local);
+
+extern int
+nfs_fsync(xlator_t *nfsx, xlator_t *xl, nfs_user_t *nfu, fd_t *fd,
+          int32_t datasync, fop_fsync_cbk_t cbk, void *local);
+
+extern int
+nfs_write(xlator_t *nfsx, xlator_t *xl, nfs_user_t *nfu, fd_t *fd,
+          struct iobref *srciobref, struct iovec *vector, int32_t count,
+          off_t offset, fop_writev_cbk_t cbk, void *local);
+
+extern int
+nfs_open(xlator_t *nfsx, xlator_t *xl, nfs_user_t *nfu, loc_t *pathloc,
+         int32_t flags, fop_open_cbk_t cbk, void *local);
+
+extern int
+nfs_rename(xlator_t *nfsx, xlator_t *xl, nfs_user_t *nfu, loc_t *oldloc,
+           loc_t *newloc, fop_rename_cbk_t cbk, void *local);
+
+extern int
+nfs_link(xlator_t *nfsx, xlator_t *xl, nfs_user_t *nfu, loc_t *oldloc,
+         loc_t *newloc, fop_link_cbk_t cbk, void *local);
+
+extern int
+nfs_unlink(xlator_t *nfsx, xlator_t *xl, nfs_user_t *nfu, loc_t *pathloc,
+           fop_unlink_cbk_t cbk, void *local);
+
+extern int
+nfs_rmdir(xlator_t *nfsx, xlator_t *xl, nfs_user_t *nfu, loc_t *pathloc,
+          fop_rmdir_cbk_t cbk, void *local);
+
+extern int
+nfs_mknod(xlator_t *nfsx, xlator_t *xl, nfs_user_t *nfu, loc_t *pathloc,
+          mode_t mode, dev_t dev, fop_mknod_cbk_t cbk, void *local);
+
+extern int
+nfs_readlink(xlator_t *nfsx, xlator_t *xl, nfs_user_t *nfu, loc_t *linkloc,
+             fop_readlink_cbk_t cbk, void *local);
+
+extern int
+nfs_setattr(xlator_t *nfsx, xlator_t *xl, nfs_user_t *nfu, loc_t *pathloc,
+            struct iatt *buf, int32_t valid, fop_setattr_cbk_t cbk,
+            void *local);
+
+extern int
+nfs_statfs(xlator_t *nfsx, xlator_t *xl, nfs_user_t *nfu, loc_t *pathloc,
+           fop_statfs_cbk_t cbk, void *local);
+
+extern int
+nfs_stat(xlator_t *nfsx, xlator_t *xl, nfs_user_t *nfu, loc_t *pathloc,
+         fop_stat_cbk_t cbk, void *local);
+
+extern int
+nfs_symlink(xlator_t *nfsx, xlator_t *xl, nfs_user_t *nfu, char *target,
+            loc_t *linkloc, fop_symlink_cbk_t cbk, void *local);
+
+/* Synchronous equivalents */
+
+extern call_stub_t *
+nfs_open_sync(xlator_t *xl, nfs_user_t *nfu, loc_t *pathloc, int32_t flags);
+
+extern call_stub_t *
+nfs_write_sync(xlator_t *xl, nfs_user_t *nfu, fd_t *fd, struct iobuf *srciob,
+               struct iovec *vec, int count, off_t offset);
+
+extern call_stub_t *
+nfs_read_sync(xlator_t *xl, nfs_user_t *nfu, fd_t *fd, size_t size,
+              off_t offset);
+
+extern int
+nfs_opendir(xlator_t *nfsx, xlator_t *fopxl, nfs_user_t *nfu, loc_t *pathloc,
+            fop_opendir_cbk_t cbk, void *local);
+
+extern int
+nfs_access(xlator_t *nfsx, xlator_t *xl, nfs_user_t *nfu, loc_t *pathloc,
+           int32_t accesstest, fop_access_cbk_t cbk, void *local);
+extern int
+nfs_lk(xlator_t *nfsx, xlator_t *xl, nfs_user_t *nfu, fd_t *fd, int cmd,
+       struct gf_flock *flock, fop_lk_cbk_t cbk, void *local);
+
+extern int
+nfs_getxattr(xlator_t *nfsx, xlator_t *xl, nfs_user_t *nfu, loc_t *loc,
+             char *name, dict_t *xdata, fop_getxattr_cbk_t cbk, void *local);
+
+extern int
+nfs_setxattr(xlator_t *nfsx, xlator_t *xl, nfs_user_t *nfu, loc_t *loc,
+             dict_t *dict, int32_t flags, dict_t *xdata, fop_setxattr_cbk_t cbk,
+             void *local);
+
+#endif
diff --git a/xlators/nfs/server/src/nfs-inodes.c b/xlators/nfs/server/src/nfs-inodes.c
new file mode 100644
index 00000000000..6f34ca705ba
--- /dev/null
+++ b/xlators/nfs/server/src/nfs-inodes.c
@@ -0,0 +1,579 @@
+/*
+  Copyright (c) 2010-2011 Gluster, Inc. <http://www.gluster.com>
+  This file is part of GlusterFS.
+
+  This file is licensed to you under your choice of the GNU Lesser
+  General Public License, version 3 or any later version (LGPLv3 or
+  later), or the GNU General Public License, version 2 (GPLv2), in all
+  cases as published by the Free Software Foundation.
+*/
+
+#include "string.h"
+
+#include "nfs.h"
+#include "nfs-inodes.h"
+#include "nfs-fops.h"
+#include <glusterfs/xlator.h>
+#include "nfs-messages.h"
+
+#include <libgen.h>
+
+#define inodes_nfl_to_prog_data(nflocal, pcbk, fram)                           \
+    do { /* unwrap: give the frame back its program local + cbk */            \
+        nflocal = fram->local; /* the nfs_fop_local of this frame */           \
+        fram->local = nflocal->proglocal; /* program's own local */            \
+        *VOID(&pcbk) = nflocal->progcbk; /* program's callback */              \
+        nfs_fop_local_wipe(nflocal->nfsx, nflocal); /* nflocal is wiped */     \
+    } while (0)
+
+/*
+ * Stash the supplied inodes (each passed through inode_ref()) and names in
+ * nfl. NULL arguments, including a NULL nfl, leave the matching field as is.
+ */
+void
+nfl_inodes_init(struct nfs_fop_local *nfl, inode_t *inode, inode_t *parent,
+                inode_t *newparent, const char *name, const char *newname)
+{
+    if (nfl == NULL)
+        return;
+
+    if (inode != NULL)
+        nfl->inode = inode_ref(inode);
+    if (parent != NULL)
+        nfl->parent = inode_ref(parent);
+    if (newparent != NULL)
+        nfl->newparent = inode_ref(newparent);
+
+    if (name != NULL)
+        snprintf(nfl->path, NFS_NAME_MAX, "%s", name);
+    if (newname != NULL)
+        snprintf(nfl->newpath, NFS_NAME_MAX, "%s", newname);
+}
+
+/*
+ * CREATE reply: on success link the new inode under the saved parent/name.
+ */
+int32_t
+nfs_inode_create_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+                     int32_t op_ret, int32_t op_errno, fd_t *fd, inode_t *inode,
+                     struct iatt *buf, struct iatt *preparent,
+                     struct iatt *postparent, dict_t *xdata)
+{
+    struct nfs_fop_local *nfl = frame->local;
+    inode_t *linked_inode = NULL;
+    fop_create_cbk_t progcbk = NULL;
+
+    if (op_ret != -1)
+        linked_inode = inode_link(inode, nfl->parent, nfl->path, buf);
+
+    /* NFS does not need the fd; upper layers must not expect the pointer
+     * handed to them below to be a valid fd. */
+    fd_unref(fd);
+
+    /* Puts the program's local back on the frame and wipes nfl. */
+    inodes_nfl_to_prog_data(nfl, progcbk, frame);
+    if (progcbk)
+        progcbk(frame, cookie, this, op_ret, op_errno, fd, inode, buf,
+                preparent, postparent, xdata);
+
+    if (linked_inode) {
+        inode_lookup(linked_inode);
+        inode_unref(linked_inode);
+    }
+
+    return 0;
+}
+
+/* Create pathloc; on any failure both the nfl and the new fd are released. */
+int
+nfs_inode_create(xlator_t *nfsx, xlator_t *xl, nfs_user_t *nfu, loc_t *pathloc,
+                 int flags, int mode, fop_create_cbk_t cbk, void *local)
+{
+    struct nfs_fop_local *nfl = NULL;
+    int ret = -EFAULT;
+    fd_t *newfd = NULL;
+
+    if ((!nfsx) || (!xl) || (!pathloc) || (!nfu))
+        return ret;
+
+    nfs_fop_handle_local_init(NULL, nfsx, nfl, cbk, local, ret, err);
+    newfd = fd_create(pathloc->inode, 0);
+    if (!newfd) {
+        gf_msg(GF_NFS, GF_LOG_ERROR, ENOMEM, NFS_MSG_NO_MEMORY,
+               "Failed to create new fd");
+        ret = -ENOMEM;
+        goto wipe_nfl;
+    }
+
+    /* Parent and base name are what the callback links the inode under. */
+    nfl_inodes_init(nfl, pathloc->inode, pathloc->parent, NULL, pathloc->name,
+                    NULL);
+    ret = nfs_fop_create(nfsx, xl, nfu, pathloc, flags, mode, newfd,
+                         nfs_inode_create_cbk, nfl);
+wipe_nfl:
+    if (ret < 0) {
+        if (newfd) /* do not leak the fd when nfs_fop_create() fails */
+            fd_unref(newfd);
+        nfs_fop_local_wipe(xl, nfl);
+    }
+err:
+    return ret;
+}
+
+/* MKDIR reply: link the new directory inode on success, then call progcbk. */
+int32_t
+nfs_inode_mkdir_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+                    int32_t op_ret, int32_t op_errno, inode_t *inode,
+                    struct iatt *buf, struct iatt *preparent,
+                    struct iatt *postparent, dict_t *xdata)
+{
+    struct nfs_fop_local *nfl = frame->local;
+    inode_t *linked_inode = NULL;
+    fop_mkdir_cbk_t progcbk = NULL;
+
+    /* A failed mkdir (op_ret == -1) skips inode_link() entirely. */
+    if (op_ret != -1)
+        linked_inode = inode_link(inode, nfl->parent, nfl->path, buf);
+
+    inodes_nfl_to_prog_data(nfl, progcbk, frame);
+    if (progcbk)
+        progcbk(frame, cookie, this, op_ret, op_errno, inode, buf, preparent,
+                postparent, xdata);
+
+    /* inode_lookup() then inode_unref() on whatever inode_link() returned. */
+    if (linked_inode) {
+        inode_lookup(linked_inode);
+        inode_unref(linked_inode);
+    }
+
+    return 0;
+}
+
+/* Make directory pathloc; nfs_inode_mkdir_cbk links the inode on success. */
+int
+nfs_inode_mkdir(xlator_t *nfsx, xlator_t *xl, nfs_user_t *nfu, loc_t *pathloc,
+                int mode, fop_mkdir_cbk_t cbk, void *local)
+{
+    int ret = -EFAULT;
+    struct nfs_fop_local *nfl = NULL;
+
+    if (!(nfsx && xl && pathloc && nfu))
+        return ret;
+
+    nfs_fop_handle_local_init(NULL, nfsx, nfl, cbk, local, ret, err);
+    nfl_inodes_init(nfl, pathloc->inode, pathloc->parent, NULL, pathloc->name,
+                    NULL);
+    ret = nfs_fop_mkdir(nfsx, xl, nfu, pathloc, mode, nfs_inode_mkdir_cbk, nfl);
+    if (ret < 0)
+        nfs_fop_local_wipe(nfsx, nfl); /* failed: discard nfl */
+err:
+    return ret;
+}
+
+/* OPEN reply: release the fd on failure, then call the program's callback. */
+int32_t
+nfs_inode_open_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+                   int32_t op_ret, int32_t op_errno, fd_t *fd, dict_t *xdata)
+{
+    fop_open_cbk_t progcbk = NULL;
+    struct nfs_fop_local *nfl = NULL;
+
+    if (fd && (op_ret == -1))
+        fd_unref(fd);
+    /* No fd_bind() on success: the fd is cached in higher layers and the
+     * bind must happen atomically when the fd gets added to the fd LRU.
+     */
+
+    inodes_nfl_to_prog_data(nfl, progcbk, frame);
+    if (progcbk)
+        progcbk(frame, cookie, this, op_ret, op_errno, fd, xdata);
+
+    return 0;
+}
+
+/*
+ * Open loc on a freshly created fd. If local setup or nfs_fop_open() fails
+ * the fd is unref'd again before the error is returned.
+ */
+int
+nfs_inode_open(xlator_t *nfsx, xlator_t *xl, nfs_user_t *nfu, loc_t *loc,
+               int32_t flags, fop_open_cbk_t cbk, void *local)
+{
+    int ret = -EFAULT;
+    fd_t *newfd = NULL;
+    struct nfs_fop_local *nfl = NULL;
+
+    if (!(nfsx && xl && loc && nfu))
+        return ret;
+
+    newfd = fd_create(loc->inode, 0);
+    if (!newfd) {
+        gf_msg(GF_NFS, GF_LOG_ERROR, ENOMEM, NFS_MSG_NO_MEMORY,
+               "Failed to create fd");
+        ret = -ENOMEM;
+        goto err;
+    }
+
+    nfs_fop_handle_local_init(NULL, nfsx, nfl, cbk, local, ret, fd_err);
+    ret = nfs_fop_open(nfsx, xl, nfu, loc, flags, newfd, nfs_inode_open_cbk,
+                       nfl);
+    if (ret < 0)
+        nfs_fop_local_wipe(xl, nfl);
+
+fd_err:
+    if ((ret < 0) && newfd)
+        fd_unref(newfd);
+err:
+    return ret;
+}
+
+/* RENAME reply: mirror a successful rename in the inode table, then reply. */
+int32_t
+nfs_inode_rename_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+                     int32_t op_ret, int32_t op_errno, struct iatt *buf,
+                     struct iatt *preoldparent, struct iatt *postoldparent,
+                     struct iatt *prenewparent, struct iatt *postnewparent,
+                     dict_t *xdata)
+{
+    struct nfs_fop_local *nfl = frame->local;
+    fop_rename_cbk_t progcbk = NULL;
+
+    /* Old/new parents and names were saved by nfs_inode_rename(). */
+    if (op_ret != -1)
+        inode_rename(this->itable, nfl->parent, nfl->path, nfl->newparent,
+                     nfl->newpath, nfl->inode, buf);
+
+    /* The macro wipes nfl, so it must not be touched afterwards. */
+    inodes_nfl_to_prog_data(nfl, progcbk, frame);
+    if (progcbk)
+        progcbk(frame, cookie, this, op_ret, op_errno, buf, preoldparent,
+                postoldparent, prenewparent, postnewparent, xdata);
+
+    return 0;
+}
+
+/* Rename oldloc to newloc; nfs_inode_rename_cbk updates the inode table. */
+int
+nfs_inode_rename(xlator_t *nfsx, xlator_t *xl, nfs_user_t *nfu, loc_t *oldloc,
+                 loc_t *newloc, fop_rename_cbk_t cbk, void *local)
+{
+    struct nfs_fop_local *nfl = NULL;
+    int ret = -EFAULT;
+
+    /* nfu is checked as well, in line with the sibling nfs_inode_* calls. */
+    if ((!nfsx) || (!xl) || (!oldloc) || (!newloc) || (!nfu))
+        return ret;
+
+    nfs_fop_handle_local_init(NULL, nfsx, nfl, cbk, local, ret, err);
+    nfl_inodes_init(nfl, oldloc->inode, oldloc->parent, newloc->parent,
+                    oldloc->name, newloc->name);
+    ret = nfs_fop_rename(nfsx, xl, nfu, oldloc, newloc, nfs_inode_rename_cbk,
+                         nfl);
+err:
+    if (ret < 0)
+        nfs_fop_local_wipe(xl, nfl);
+    return ret;
+}
+
+/*
+ * LINK reply: on success link inode under the new parent saved in the nfl.
+ */
+int32_t
+nfs_inode_link_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+                   int32_t op_ret, int32_t op_errno, inode_t *inode,
+                   struct iatt *buf, struct iatt *preparent,
+                   struct iatt *postparent, dict_t *xdata)
+{
+    struct nfs_fop_local *nfl = frame->local;
+    inode_t *linked_inode = NULL;
+    fop_link_cbk_t progcbk = NULL;
+
+    /* nfl->path holds the new link's name, stored by nfs_inode_link(). */
+    if (op_ret != -1)
+        linked_inode = inode_link(inode, nfl->newparent, nfl->path, buf);
+
+    inodes_nfl_to_prog_data(nfl, progcbk, frame);
+    if (progcbk)
+        progcbk(frame, cookie, this, op_ret, op_errno, inode, buf, preparent,
+                postparent, xdata);
+
+    if (linked_inode) {
+        inode_lookup(linked_inode);
+        inode_unref(linked_inode);
+    }
+
+    return 0;
+}
+
+/* Hard-link oldloc at newloc; nfs_inode_link_cbk links the inode there. */
+int
+nfs_inode_link(xlator_t *nfsx, xlator_t *xl, nfs_user_t *nfu, loc_t *oldloc,
+               loc_t *newloc, fop_link_cbk_t cbk, void *local)
+{
+    int ret = -EFAULT;
+    struct nfs_fop_local *nfl = NULL;
+
+    if (!(nfsx && xl && oldloc && newloc && nfu))
+        return ret;
+
+    nfs_fop_handle_local_init(NULL, nfsx, nfl, cbk, local, ret, err);
+    /* Only the new parent and the new name are kept for the callback. */
+    nfl_inodes_init(nfl, NULL, NULL, newloc->parent, newloc->name, NULL);
+    ret = nfs_fop_link(nfsx, xl, nfu, oldloc, newloc, nfs_inode_link_cbk, nfl);
+err:
+    if (ret < 0)
+        nfs_fop_local_wipe(xl, nfl);
+    return ret;
+}
+
+/* UNLINK reply: on success drop the name from the inode table, then reply. */
+int32_t
+nfs_inode_unlink_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+                     int32_t op_ret, int32_t op_errno, struct iatt *preparent,
+                     struct iatt *postparent, dict_t *xdata)
+{
+    struct nfs_fop_local *nfl = frame->local;
+    fop_unlink_cbk_t progcbk = NULL;
+
+    /* op_ret == -1 skips both inode calls. */
+    if (op_ret != -1) {
+        inode_unlink(nfl->inode, nfl->parent, nfl->path);
+        inode_forget(nfl->inode, 0);
+    }
+
+    /* The macro wipes nfl, so the inode calls above must come first. */
+    inodes_nfl_to_prog_data(nfl, progcbk, frame);
+    if (progcbk)
+        progcbk(frame, cookie, this, op_ret, op_errno, preparent, postparent,
+                xdata);
+
+    return 0;
+}
+
+/* Unlink pathloc; the callback drops the name from the inode table. */
+int
+nfs_inode_unlink(xlator_t *nfsx, xlator_t *xl, nfs_user_t *nfu, loc_t *pathloc,
+                 fop_unlink_cbk_t cbk, void *local)
+{
+    int ret = -EFAULT;
+    struct nfs_fop_local *nfl = NULL;
+
+    if (!(nfsx && xl && pathloc && nfu))
+        return ret;
+
+    nfs_fop_handle_local_init(NULL, nfsx, nfl, cbk, local, ret, err);
+    nfl_inodes_init(nfl, pathloc->inode, pathloc->parent, NULL, pathloc->name,
+                    NULL);
+    ret = nfs_fop_unlink(nfsx, xl, nfu, pathloc, nfs_inode_unlink_cbk, nfl);
+
+err:
+    if (ret < 0)
+        nfs_fop_local_wipe(xl, nfl);
+    return ret;
+}
+
+/*
+ * RMDIR reply: on success unlink and forget the directory's inode.
+ */
+int32_t
+nfs_inode_rmdir_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+                    int32_t op_ret, int32_t op_errno, struct iatt *preparent,
+                    struct iatt *postparent, dict_t *xdata)
+{
+    struct nfs_fop_local *nfl = frame->local;
+    fop_rmdir_cbk_t progcbk = NULL;
+
+    if (op_ret != -1) {
+        inode_unlink(nfl->inode, nfl->parent, nfl->path);
+        inode_forget(nfl->inode, 0);
+    }
+
+    /* nfl is wiped by the macro; only progcbk and the frame remain usable. */
+    inodes_nfl_to_prog_data(nfl, progcbk, frame);
+    if (progcbk)
+        progcbk(frame, cookie, this, op_ret, op_errno, preparent, postparent,
+                xdata);
+
+    return 0;
+}
+
+/* Remove directory pathloc; nfs_inode_rmdir_cbk tidies the inode table. */
+int
+nfs_inode_rmdir(xlator_t *nfsx, xlator_t *xl, nfs_user_t *nfu, loc_t *pathloc,
+                fop_rmdir_cbk_t cbk, void *local)
+{
+    int ret = -EFAULT;
+    struct nfs_fop_local *nfl = NULL;
+
+    if (!(nfsx && xl && pathloc && nfu))
+        return ret;
+
+    nfs_fop_handle_local_init(NULL, nfsx, nfl, cbk, local, ret, err);
+    nfl_inodes_init(nfl, pathloc->inode, pathloc->parent, NULL, pathloc->name,
+                    NULL);
+    ret = nfs_fop_rmdir(nfsx, xl, nfu, pathloc, nfs_inode_rmdir_cbk, nfl);
+
+err:
+    if (ret < 0)
+        nfs_fop_local_wipe(xl, nfl);
+    return ret;
+}
+
+/*
+ * MKNOD reply: on success link the new inode under the saved parent/name.
+ */
+int32_t
+nfs_inode_mknod_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+                    int32_t op_ret, int32_t op_errno, inode_t *inode,
+                    struct iatt *buf, struct iatt *preparent,
+                    struct iatt *postparent, dict_t *xdata)
+{
+    struct nfs_fop_local *nfl = frame->local;
+    inode_t *linked_inode = NULL;
+    fop_mknod_cbk_t progcbk = NULL;
+
+    /* Linking is skipped when the fop reported failure. */
+    if (op_ret != -1)
+        linked_inode = inode_link(inode, nfl->parent, nfl->path, buf);
+
+    inodes_nfl_to_prog_data(nfl, progcbk, frame);
+    if (progcbk)
+        progcbk(frame, cookie, this, op_ret, op_errno, inode, buf, preparent,
+                postparent, xdata);
+
+    /* Runs after progcbk, exactly as in the other *_cbk linkers. */
+    if (linked_inode) {
+        inode_lookup(linked_inode);
+        inode_unref(linked_inode);
+    }
+
+    return 0;
+}
+
+/* Create a node at pathloc; nfs_inode_mknod_cbk links the resulting inode. */
+int
+nfs_inode_mknod(xlator_t *nfsx, xlator_t *xl, nfs_user_t *nfu, loc_t *pathloc,
+                mode_t mode, dev_t dev, fop_mknod_cbk_t cbk, void *local)
+{
+    int ret = -EFAULT;
+    struct nfs_fop_local *nfl = NULL;
+
+    if (!(nfsx && xl && pathloc && nfu))
+        return ret;
+
+    nfs_fop_handle_local_init(NULL, nfsx, nfl, cbk, local, ret, err);
+    /* Keep inode, parent and name around for the callback. */
+    nfl_inodes_init(nfl, pathloc->inode, pathloc->parent, NULL, pathloc->name,
+                    NULL);
+    ret = nfs_fop_mknod(nfsx, xl, nfu, pathloc, mode, dev, nfs_inode_mknod_cbk,
+                        nfl);
+
+err:
+    if (ret < 0)
+        nfs_fop_local_wipe(xl, nfl);
+    return ret;
+}
+
+/*
+ * SYMLINK reply: on success link the new inode under the saved parent/name.
+ */
+int32_t
+nfs_inode_symlink_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+                      int32_t op_ret, int32_t op_errno, inode_t *inode,
+                      struct iatt *buf, struct iatt *preparent,
+                      struct iatt *postparent, dict_t *xdata)
+{
+    struct nfs_fop_local *nfl = frame->local;
+    inode_t *linked_inode = NULL;
+    fop_symlink_cbk_t progcbk = NULL;
+
+    if (op_ret != -1)
+        linked_inode = inode_link(inode, nfl->parent, nfl->path, buf);
+
+    /* From here on nfl has been wiped; only linked_inode is still ours. */
+    inodes_nfl_to_prog_data(nfl, progcbk, frame);
+    if (progcbk)
+        progcbk(frame, cookie, this, op_ret, op_errno, inode, buf, preparent,
+                postparent, xdata);
+
+    if (linked_inode) {
+        inode_lookup(linked_inode);
+        inode_unref(linked_inode);
+    }
+
+    return 0;
+}
+
+/* Create symlink pathloc -> target; the callback links the new inode. */
+int
+nfs_inode_symlink(xlator_t *nfsx, xlator_t *xl, nfs_user_t *nfu, char *target,
+                  loc_t *pathloc, fop_symlink_cbk_t cbk, void *local)
+{
+    int ret = -EFAULT;
+    struct nfs_fop_local *nfl = NULL;
+
+    if (!(nfsx && xl && target && pathloc && nfu))
+        return ret;
+
+    nfs_fop_handle_local_init(NULL, nfsx, nfl, cbk, local, ret, err);
+    nfl_inodes_init(nfl, pathloc->inode, pathloc->parent, NULL, pathloc->name,
+                    NULL);
+    ret = nfs_fop_symlink(nfsx, xl, nfu, target, pathloc, nfs_inode_symlink_cbk,
+                          nfl);
+
+err:
+    if (ret < 0)
+        nfs_fop_local_wipe(xl, nfl);
+    return ret;
+}
+
+/* OPENDIR reply: bind the fd on success, then call the program's callback. */
+int32_t
+nfs_inode_opendir_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+                      int32_t op_ret, int32_t op_errno, fd_t *fd, dict_t *xdata)
+{
+    struct nfs_fop_local *nfl = NULL;
+    /* Typed as the opendir callback that nfs_inode_opendir() accepts. */
+    fop_opendir_cbk_t progcbk = NULL;
+
+    if (op_ret != -1)
+        fd_bind(fd);
+
+    inodes_nfl_to_prog_data(nfl, progcbk, frame);
+    if (progcbk)
+        progcbk(frame, cookie, this, op_ret, op_errno, fd, xdata);
+    return 0;
+}
+
+/* Open directory loc on a new fd; the fd is released if anything fails. */
+int
+nfs_inode_opendir(xlator_t *nfsx, xlator_t *xl, nfs_user_t *nfu, loc_t *loc,
+                  fop_opendir_cbk_t cbk, void *local)
+{
+    int ret = -EFAULT;
+    fd_t *newfd = NULL;
+    struct nfs_fop_local *nfl = NULL;
+
+    if (!(nfsx && xl && loc && nfu))
+        return ret;
+
+    newfd = fd_create(loc->inode, 0);
+    if (!newfd) {
+        gf_msg(GF_NFS, GF_LOG_ERROR, ENOMEM, NFS_MSG_NO_MEMORY,
+               "Failed to create fd");
+        ret = -ENOMEM;
+        goto err;
+    }
+
+    nfs_fop_handle_local_init(NULL, nfsx, nfl, cbk, local, ret, err);
+    ret = nfs_fop_opendir(nfsx, xl, nfu, loc, newfd, nfs_inode_opendir_cbk,
+                          nfl);
+err:
+    /* Shared failure exit: nfl is still NULL if fd_create() failed. */
+    if (ret < 0) {
+        if (newfd)
+            fd_unref(newfd);
+        nfs_fop_local_wipe(xl, nfl);
+    }
+    return ret;
+}
diff --git a/xlators/nfs/server/src/nfs-inodes.h b/xlators/nfs/server/src/nfs-inodes.h
new file mode 100644
index 00000000000..e8efd1e127c
--- /dev/null
+++ b/xlators/nfs/server/src/nfs-inodes.h
@@ -0,0 +1,67 @@
+/*
+  Copyright (c) 2010-2011 Gluster, Inc. <http://www.gluster.com>
+  This file is part of GlusterFS.
+
+  This file is licensed to you under your choice of the GNU Lesser
+  General Public License, version 3 or any later version (LGPLv3 or
+  later), or the GNU General Public License, version 2 (GPLv2), in all
+  cases as published by the Free Software Foundation.
+*/
+
+#ifndef _NFS_INODES_H_
+#define _NFS_INODES_H_
+
+#include <glusterfs/dict.h>
+#include <glusterfs/xlator.h>
+#include <glusterfs/iobuf.h>
+#include <glusterfs/call-stub.h>
+#include "nfs-fops.h"
+
+/* Prototypes for the nfs_inode_*() entry points and nfs_link_inode().
+ *
+ * NOTE(review): going by the parameter names, every nfs_inode_*() call takes
+ * the NFS xlator (nfsx), the subvolume to operate on (xl), the requesting
+ * user (nfu), the loc(s) involved, a completion callback (cbk) and an opaque
+ * pointer (local) associated with that callback -- confirm against the
+ * definitions.
+ */
+extern int
+nfs_link_inode(inode_t *newi, inode_t *parent, char *name,
+               struct iatt *newstat);
+
+extern int
+nfs_inode_create(xlator_t *nfsx, xlator_t *xl, nfs_user_t *nfu, loc_t *pathloc,
+                 int flags, int mode, fop_create_cbk_t cbk, void *local);
+
+extern int
+nfs_inode_mkdir(xlator_t *nfsx, xlator_t *xl, nfs_user_t *nfu, loc_t *pathloc,
+                int mode, fop_mkdir_cbk_t cbk, void *local);
+
+extern int
+nfs_inode_open(xlator_t *nfsx, xlator_t *xl, nfs_user_t *nfu, loc_t *loc,
+               int32_t flags, fop_open_cbk_t cbk, void *local);
+
+extern int
+nfs_inode_rename(xlator_t *nfsx, xlator_t *xl, nfs_user_t *nfu, loc_t *oldloc,
+                 loc_t *newloc, fop_rename_cbk_t cbk, void *local);
+
+extern int
+nfs_inode_link(xlator_t *nfsx, xlator_t *xl, nfs_user_t *nfu, loc_t *oldloc,
+               loc_t *newloc, fop_link_cbk_t cbk, void *local);
+
+extern int
+nfs_inode_unlink(xlator_t *nfsx, xlator_t *xl, nfs_user_t *nfu, loc_t *pathloc,
+                 fop_unlink_cbk_t cbk, void *local);
+
+extern int
+nfs_inode_rmdir(xlator_t *nfsx, xlator_t *xl, nfs_user_t *nfu, loc_t *pathloc,
+                fop_rmdir_cbk_t cbk, void *local);
+
+extern int
+nfs_inode_symlink(xlator_t *nfsx, xlator_t *xl, nfs_user_t *nfu, char *target,
+                  loc_t *pathloc, fop_symlink_cbk_t cbk, void *local);
+
+/* Directory open; the fd handed to the callback is created internally. */
+extern int
+nfs_inode_opendir(xlator_t *nfsx, xlator_t *xl, nfs_user_t *nfu, loc_t *loc,
+                  fop_opendir_cbk_t cbk, void *local);
+
+extern int
+nfs_inode_mknod(xlator_t *nfsx, xlator_t *xl, nfs_user_t *nfu, loc_t *pathloc,
+                mode_t mode, dev_t dev, fop_mknod_cbk_t cbk, void *local);
+
+extern int
+nfs_inode_lookup(xlator_t *nfsx, xlator_t *xl, nfs_user_t *nfu, loc_t *pathloc,
+                 fop_lookup_cbk_t cbk, void *local);
+#endif
diff --git a/xlators/nfs/server/src/nfs-mem-types.h b/xlators/nfs/server/src/nfs-mem-types.h
new file mode 100644
index 00000000000..0a3c887eff1
--- /dev/null
+++ b/xlators/nfs/server/src/nfs-mem-types.h
@@ -0,0 +1,48 @@
+/*
+   Copyright (c) 2008-2011 Gluster, Inc. <http://www.gluster.com>
+   This file is part of GlusterFS.
+
+  This file is licensed to you under your choice of the GNU Lesser
+  General Public License, version 3 or any later version (LGPLv3 or
+  later), or the GNU General Public License, version 2 (GPLv2), in all
+  cases as published by the Free Software Foundation.
+*/
+
+#ifndef __NFS_MEM_TYPES_H__
+#define __NFS_MEM_TYPES_H__
+
+#include <glusterfs/mem-types.h>
+
+/* Memory-accounting type identifiers used by the NFS xlator's allocations.
+ *
+ * Numbering continues directly after gf_common_mt_end. gf_nfs_mt_end must
+ * stay the last enumerator: mem_acct_init() in nfs.c passes
+ * gf_nfs_mt_end + 1 to xlator_mem_acct_init().
+ */
+enum gf_nfs_mem_types_ {
+    gf_nfs_mt_mountentry = gf_common_mt_end + 1,
+    gf_nfs_mt_mountbody,
+    gf_nfs_mt_nfs_state,
+    gf_nfs_mt_char,
+    gf_nfs_mt_exportnode,
+    gf_nfs_mt_groupnode,
+    gf_nfs_mt_mount3_state,
+    gf_nfs_mt_nfs3_export,
+    gf_nfs_mt_nfs3_state,
+    gf_nfs_mt_entry3,
+    gf_nfs_mt_entryp3,
+    gf_nfs_mt_nfs3_fh,
+    gf_nfs_mt_nfs_initer_list,
+    gf_nfs_mt_xlator_t,
+    gf_nfs_mt_mnt3_resolve,
+    gf_nfs_mt_mnt3_export,
+    gf_nfs_mt_mnt3_auth_params,
+    gf_nfs_mt_int,
+    gf_nfs_mt_mountres3,
+    gf_nfs_mt_mountstat3,
+    gf_nfs_mt_nlm4_fde,
+    gf_nfs_mt_nlm4_nlmclnt,
+    gf_nfs_mt_nlm4_share,
+    gf_nfs_mt_inode_ctx,
+    gf_nfs_mt_auth_spec,
+    gf_nfs_mt_arr,
+    gf_nfs_mt_auth_cache,
+    gf_nfs_mt_auth_cache_entry,
+    gf_nfs_mt_nlm4_notify,
+    gf_nfs_mt_end
+};
+#endif
diff --git a/xlators/nfs/server/src/nfs-messages.h b/xlators/nfs/server/src/nfs-messages.h
new file mode 100644
index 00000000000..04e15cbe78c
--- /dev/null
+++ b/xlators/nfs/server/src/nfs-messages.h
@@ -0,0 +1,102 @@
+/*Copyright (c) 2015 Red Hat, Inc. <http://www.redhat.com>
+  This file is part of GlusterFS.
+
+  This file is licensed to you under your choice of the GNU Lesser
+  General Public License, version 3 or any later version (LGPLv3 or
+  later), or the GNU General Public License, version 2 (GPLv2), in all
+  cases as published by the Free Software Foundation.
+*/
+
+#ifndef _NFS_MESSAGES_H_
+#define _NFS_MESSAGES_H_
+
+#include <glusterfs/glfs-message-id.h>
+
+/* To add new message IDs, append new identifiers at the end of the list.
+ *
+ * Never remove a message ID. If it is no longer used, rename it or leave it
+ * as it is, but do not delete it. This prevents its ID from being reused by
+ * another message.
+ *
+ * The component name must match one of the entries defined in
+ * glfs-message-id.h.
+ */
+
+GLFS_MSGID(
+    NFS, NFS_MSG_UNUSED_1, NFS_MSG_UNUSED_2, NFS_MSG_INVALID_ENTRY,
+    NFS_MSG_INODE_LOC_FILL_ERROR, NFS_MSG_HARD_RESOLVE_FAIL,
+    NFS_MSG_ARGS_DECODE_ERROR, NFS_MSG_LOOKUP_PROC_FAIL, NFS_MSG_UNUSED_8,
+    NFS_MSG_UNUSED_9, NFS_MSG_READLINK_PROC_FAIL, NFS_MSG_UNUSED_11,
+    NFS_MSG_ANONYMOUS_FD_FAIL, NFS_MSG_READ_FAIL, NFS_MSG_UNUSED_14,
+    NFS_MSG_UNUSED_15, NFS_MSG_STATE_WRONG, NFS_MSG_WRITE_FAIL,
+    NFS_MSG_UNUSED_18, NFS_MSG_UNUSED_19, NFS_MSG_UNUSED_20,
+    NFS_MSG_CREATE_FAIL, NFS_MSG_UNUSED_22, NFS_MSG_UNUSED_23,
+    NFS_MSG_DIR_OP_FAIL, NFS_MSG_UNUSED_25, NFS_MSG_SYMLINK_FAIL,
+    NFS_MSG_UNUSED_27, NFS_MSG_MKNOD_FAIL, NFS_MSG_OPT_INIT_FAIL,
+    NFS_MSG_UNUSED_30, NFS_MSG_REMOVE_FAIL, NFS_MSG_RMDIR_CBK,
+    NFS_MSG_UNUSED_33, NFS_MSG_RENAME_FAIL, NFS_MSG_UNUSED_35,
+    NFS_MSG_LINK_FAIL, NFS_MSG_UNUSED_37, NFS_MSG_UNUSED_38,
+    NFS_MSG_READDIR_FAIL, NFS_MSG_READDIRP_FAIL, NFS_MSG_UNUSED_41,
+    NFS_MSG_UNUSED_42, NFS_MSG_FSTAT_FAIL, NFS_MSG_UNUSED_44,
+    NFS_MSG_FSINFO_FAIL, NFS_MSG_UNUSED_46, NFS_MSG_PATHCONF_FAIL,
+    NFS_MSG_UNUSED_48, NFS_MSG_COMMIT_FAIL, NFS_MSG_PROT_INIT_ADD_FAIL,
+    NFS_MSG_FORMAT_FAIL, NFS_MSG_SNPRINTF_FAIL, NFS_MSG_VOLID_MISSING,
+    NFS_MSG_PARSE_VOL_UUID_FAIL, NFS_MSG_STR2BOOL_FAIL,
+    NFS_MSG_SUBVOL_INIT_FAIL, NFS_MSG_NO_MEMORY, NFS_MSG_LISTENERS_CREATE_FAIL,
+    NFS_MSG_STATE_INIT_FAIL, NFS_MSG_RECONF_FAIL, NFS_MSG_RECONF_SUBVOL_FAIL,
+    NFS_MSG_STR_TOO_LONG, NFS_MSG_STATE_MISSING, NFS_MSG_INDEX_NOT_FOUND,
+    NFS_MSG_EXPORT_ID_FAIL, NFS_MSG_NO_RW_ACCESS, NFS_MSG_BAD_HANDLE,
+    NFS_MSG_RESOLVE_FH_FAIL, NFS_MSG_RESOLVE_STAT, NFS_MSG_VOL_DISABLE,
+    NFS_MSG_INIT_CALL_STAT_FAIL, NFS_MSG_ENCODE_FAIL,
+    NFS_MSG_SERIALIZE_REPLY_FAIL, NFS_MSG_SUBMIT_REPLY_FAIL, NFS_MSG_UNUSED_75,
+    NFS_MSG_UNUSED_76, NFS_MSG_STAT_FOP_FAIL, NFS_MSG_GETATTR_FAIL,
+    NFS_MSG_UNUSED_79, NFS_MSG_UNUSED_80, NFS_MSG_TIMESTAMP_NO_SYNC,
+    NFS_MSG_SETATTR_INVALID, NFS_MSG_SETATTR_FAIL, NFS_MSG_UNUSED_84,
+    NFS_MSG_ACCESS_PROC_FAIL, NFS_MSG_PGM_NOT_FOUND, NFS_MSG_PGM_INIT_FAIL,
+    NFS_MSG_PGM_REG_FAIL, NFS_MSG_LOOKUP_ROOT_FAIL, NFS_MSG_ROOT_LOC_INIT_FAIL,
+    NFS_MSG_STARTUP_FAIL, NFS_MSG_XLATOR_INIT_FAIL, NFS_MSG_NFS_MAN_DISABLE,
+    NFS_MSG_DICT_GET_FAILED, NFS_MSG_PARSE_FAIL, NFS_MSG_NLM_MAN_DISABLE,
+    NFS_MSG_ACL_MAN_DISABLE, NFS_MSG_DICT_SET_FAILED,
+    NFS_MSG_INIT_GRP_CACHE_FAIL, NFS_MSG_NO_PERM, NFS_MSG_REG_FILE_ERROR,
+    NFS_MSG_RPC_INIT_FAIL, NFS_MSG_RPC_CONFIG_FAIL, NFS_MSG_RECONFIG_PATH,
+    NFS_MSG_RECONFIG_VALUE, NFS_MSG_RECONFIG_VOL, NFS_MSG_NLM_INFO,
+    NFS_MSG_ACL_INFO, NFS_MSG_INIT_FAIL, NFS_MSG_STARTED, NFS_MSG_VOL_NOT_FOUND,
+    NFS_MSG_RECONFIG_ENABLE, NFS_MSG_RECONFIG_FAIL, NFS_MSG_MNT_STATE_NOT_FOUND,
+    NFS_MSG_ENCODE_MSG_FAIL, NFS_MSG_REP_SUBMIT_FAIL, NFS_MSG_READ_LOCKED,
+    NFS_MSG_MODIFY_LOCKED, NFS_MSG_RWTAB_OVERWRITE_FAIL, NFS_MSG_UPDATE_FAIL,
+    NFS_MSG_OPEN_FAIL, NFS_MSG_LOCK_FAIL, NFS_MSG_REWRITE_ERROR,
+    NFS_MSG_HASH_PATH_FAIL, NFS_MSG_LOOKUP_MNT_ERROR,
+    NFS_MSG_GET_ROOT_INODE_FAIL, NFS_MSG_RESOLVE_INODE_FAIL,
+    NFS_MSG_RESOLVE_SUBDIR_FAIL, NFS_MSG_RESOLVE_SYMLINK_ERROR,
+    NFS_MSG_RESOLVE_ERROR, NFS_MSG_UNSUPPORTED_VERSION,
+    NFS_MSG_AUTH_VERIFY_FAILED, NFS_MSG_PEER_NOT_ALLOWED,
+    NFS_MSG_GET_PEER_ADDR_FAIL, NFS_MSG_BAD_PEER, NFS_MSG_PEER_TOO_LONG,
+    NFS_MSG_CALLER_NOT_FOUND, NFS_MSG_GET_REMOTE_NAME_FAIL,
+    NFS_MSG_UNKNOWN_MNT_TYPE, NFS_MSG_PARSE_HOSTSPEC_FAIL,
+    NFS_MSG_PARSE_AUTH_PARAM_FAIL, NFS_MSG_SET_EXP_FAIL,
+    NFS_MSG_INIT_DIR_EXP_FAIL, NFS_MSG_DIR_EXP_SETUP_FAIL,
+    NFS_MSG_VOL_INIT_FAIL, NFS_MSG_AUTH_ERROR, NFS_MSG_UPDATING_EXP,
+    NFS_MSG_SET_EXP_AUTH_PARAM_FAIL, NFS_MSG_UPDATING_NET_GRP,
+    NFS_MSG_SET_NET_GRP_FAIL, NFS_MSG_PURGING_AUTH_CACHE,
+    NFS_MSG_MNT_STATE_INIT_FAIL, NFS_MSG_EXP_AUTH_DISABLED,
+    NFS_MSG_FH_TO_VOL_FAIL, NFS_MSG_INODE_SHARES_NOT_FOUND,
+    NFS_MSG_VOLUME_ERROR, NFS_MSG_GET_USER_ACL_FAIL, NFS_MSG_GET_DEF_ACL_FAIL,
+    NFS_MSG_SET_USER_ACL_FAIL, NFS_MSG_SET_DEF_ACL_FAIL, NFS_MSG_ACL_INIT_FAIL,
+    NFS_MSG_LOAD_PARSE_ERROR, NFS_MSG_CLNT_CALL_ERROR,
+    NFS_MSG_CLNT_CREATE_ERROR, NFS_MSG_NLM_GRACE_PERIOD, NFS_MSG_RPC_CLNT_ERROR,
+    NFS_MSG_GET_PORT_ERROR, NFS_MSG_NLMCLNT_NOT_FOUND, NFS_MSG_FD_LOOKUP_NULL,
+    NFS_MSG_SM_NOTIFY, NFS_MSG_NLM_INIT_FAIL, NFS_MSG_START_ERROR,
+    NFS_MSG_UNLINK_ERROR, NFS_MSG_SHARE_LIST_STORE_FAIL,
+    NFS_MSG_CLIENT_NOT_FOUND, NFS_MSG_SHARE_CALL_FAIL,
+    NFS_MSG_UNSHARE_CALL_FAIL, NFS_MSG_GET_PID_FAIL, NFS_MSG_ARG_FREE_FAIL,
+    NFS_MSG_PMAP_UNSET_FAIL, NFS_MSG_UDP_SERV_FAIL, NFS_MSG_REG_NLMCBK_FAIL,
+    NFS_MSG_TCP_SERV_FAIL, NFS_MSG_SVC_RUN_RETURNED, NFS_MSG_XLATOR_SET_FAIL,
+    NFS_MSG_SVC_ERROR, NFS_MSG_GET_FH_FAIL, NFS_MSG_FIND_FIRST_MATCH_FAIL,
+    NFS_MSG_NETGRP_NOT_FOUND, NFS_MSG_FILE_OP_FAILED, NFS_MSG_PATH_RESOLVE_FAIL,
+    NFS_MSG_LOC_FILL_RESOLVE_FAIL, NFS_MSG_INODE_NOT_FOUND,
+    NFS_MSG_INODE_CTX_STORE_FAIL, NFS_MSG_GETPWUID_FAIL,
+    NFS_MSG_MAP_GRP_LIST_FAIL, NFS_MSG_PARSE_DIR_FAIL, NFS_MSG_LOOKUP_FAIL,
+    NFS_MSG_STAT_ERROR, NFS_MSG_GFID_DICT_CREATE_FAIL, NFS_MSG_HASH_XLATOR_FAIL,
+    NFS_MSG_ENABLE_THROTTLE_FAIL);
+
+#endif /* _NFS_MESSAGES_H_ */
diff --git a/xlators/nfs/server/src/nfs.c b/xlators/nfs/server/src/nfs.c
new file mode 100644
index 00000000000..39b73f88ac3
--- /dev/null
+++ b/xlators/nfs/server/src/nfs.c
@@ -0,0 +1,2073 @@
+/*
+  Copyright (c) 2010-2011 Gluster, Inc. <http://www.gluster.com>
+  This file is part of GlusterFS.
+
+  This file is licensed to you under your choice of the GNU Lesser
+  General Public License, version 3 or any later version (LGPLv3 or
+  later), or the GNU General Public License, version 2 (GPLv2), in all
+  cases as published by the Free Software Foundation.
+*/
+
+/* This is the primary translator source for NFS.
+ * Every other protocol version gets initialized from here.
+ */
+
+#include <glusterfs/defaults.h>
+#include "rpcsvc.h"
+#include <glusterfs/dict.h>
+#include <glusterfs/xlator.h>
+#include "nfs.h"
+#include <glusterfs/mem-pool.h>
+#include <glusterfs/logging.h>
+#include "nfs-fops.h"
+#include "mount3.h"
+#include "nfs3.h"
+#include "nfs-mem-types.h"
+#include "nfs3-helpers.h"
+#include "nlm4.h"
+#include <glusterfs/options.h>
+#include "acl3.h"
+#include "rpc-drc.h"
+#include <glusterfs/syscall.h>
+#include "rpcsvc.h"
+#include "nfs-messages.h"
+#include "glusterfs/statedump.h"
+
+#define OPT_SERVER_AUX_GIDS "nfs.server-aux-gids"
+#define OPT_SERVER_GID_CACHE_TIMEOUT "nfs.server.aux-gid-timeout"
+#define OPT_SERVER_RPC_STATD "nfs.rpc-statd"
+#define OPT_SERVER_RPC_STATD_PIDFILE "nfs.rpc-statd-pidfile"
+#define OPT_SERVER_RPC_STATD_NOTIFY_PIDFILE "nfs.rpc-statd-notify-pidfile"
+
+#define NFS_DATADIR GLUSTERD_DEFAULT_WORKDIR "/nfs"
+
+/* Forward declaration */
+static int
+nfs_add_initer(struct list_head *list, nfs_version_initer_t init,
+               gf_boolean_t required);
+
+/* Queue the initializer @init on the xlator's version list, run it and
+ * register the resulting RPC program.
+ *
+ * @this     - NFS xlator; this->private must hold the struct nfs_state.
+ * @init     - initializer of the protocol version being brought up.
+ * @required - stored on the new list entry by nfs_add_initer().
+ *
+ * Returns -1 on bad arguments or on any failure. When portmap registration
+ * is disabled (nfs->register_portmap is zero) the result of
+ * rpcsvc_program_register() is returned as is; otherwise 0 on success.
+ */
+static int
+nfs_init_version(xlator_t *this, nfs_version_initer_t init,
+                 gf_boolean_t required)
+{
+    int ret = -1;
+    struct nfs_initer_list *version = NULL;
+    struct nfs_initer_list *tmp = NULL;
+    rpcsvc_program_t *prog = NULL;
+    struct list_head *versions = NULL;
+    struct nfs_state *nfs = NULL;
+    gf_boolean_t found = _gf_false;
+
+    if ((!this) || (!this->private) || (!init))
+        return (-1);
+
+    nfs = (struct nfs_state *)this->private;
+
+    ret = nfs_add_initer(&nfs->versions, init, required);
+    if (ret == -1) {
+        gf_msg(GF_NFS, GF_LOG_ERROR, 0, NFS_MSG_PROT_INIT_ADD_FAIL,
+               "Failed to add protocol initializer");
+        goto err;
+    }
+
+    /* Run the first list entry that carries this initializer. */
+    versions = &nfs->versions;
+    list_for_each_entry_safe(version, tmp, versions, list)
+    {
+        if (version->init != init)
+            continue;
+
+        prog = init(this);
+        if (!prog) {
+            ret = -1;
+            goto err;
+        }
+        version->program = prog;
+        found = _gf_true;
+        break;
+    }
+
+    /* program not added */
+    if (!found) {
+        /* No program was produced, so there is no progname to report, and
+         * ret still holds the success value from nfs_add_initer(): make the
+         * failure visible to the caller. */
+        gf_msg(GF_NFS, GF_LOG_ERROR, 0, NFS_MSG_PGM_NOT_FOUND,
+               "Program initializer NOT found");
+        ret = -1;
+        goto err;
+    }
+
+    /* Check if nfs.port is configured */
+    if (nfs->override_portnum)
+        prog->progport = nfs->override_portnum;
+
+    gf_msg_debug(GF_NFS, 0, "Starting program: %s", prog->progname);
+
+    ret = rpcsvc_program_register(nfs->rpcsvc, prog, _gf_false);
+    if (ret == -1) {
+        gf_msg(GF_NFS, GF_LOG_ERROR, 0, NFS_MSG_PGM_INIT_FAIL,
+               "Program: %s init failed", prog->progname);
+        goto err;
+    }
+
+    /* Registration with portmapper is disabled, Nothing to do */
+    if (!nfs->register_portmap)
+        goto err;
+
+    ret = rpcsvc_program_register_portmap(prog, prog->progport);
+    if (ret == -1) {
+        gf_msg(GF_NFS, GF_LOG_ERROR, 0, NFS_MSG_PGM_REG_FAIL,
+               "Program  %s registration failed", prog->progname);
+        goto err;
+    }
+    ret = 0; /* All well */
+err:
+    return ret;
+}
+
+/* Unregister the RPC program belonging to the list entry created for @init
+ * and remove that entry from nfs->versions.
+ *
+ * Only the first matching entry is handled. Returns 0 once it has been
+ * unregistered and freed; -1 on NULL arguments, when no entry matches, or
+ * when rpcsvc_program_unregister() returns non-zero (the entry is then kept).
+ */
+static int
+nfs_deinit_version(struct nfs_state *nfs, nfs_version_initer_t init)
+{
+    struct list_head *head = NULL;
+    struct nfs_initer_list *entry = NULL;
+    struct nfs_initer_list *next = NULL;
+
+    if (!nfs || !init)
+        return -1;
+
+    head = &nfs->versions;
+    list_for_each_entry_safe(entry, next, head, list)
+    {
+        if (entry->init != init)
+            continue;
+
+        if (rpcsvc_program_unregister(nfs->rpcsvc, entry->program) != 0)
+            return -1;
+
+        list_del(&entry->list);
+        GF_FREE(entry);
+        return 0;
+    }
+
+    return -1;
+}
+
+/* Bring the ACL3 program in line with nfs->enable_acl: initialise it when
+ * the flag is set, tear it down otherwise. Returns -1 on NULL input, else
+ * the result of nfs_init_version() / nfs_deinit_version().
+ */
+static int
+nfs_reconfigure_acl3(xlator_t *this)
+{
+    struct nfs_state *state = NULL;
+
+    if (!this || !this->private)
+        return -1;
+
+    state = (struct nfs_state *)this->private;
+
+    if (!state->enable_acl) {
+        /* ACL is disabled */
+        return nfs_deinit_version(state, acl3svc_init);
+    }
+
+    /* ACL is enabled */
+    return nfs_init_version(this, acl3svc_init, _gf_false);
+}
+
+/* Bring the NLM4 program in line with nfs->enable_nlm: initialise it when
+ * the flag is set, tear it down otherwise. Returns -1 on NULL input, else
+ * the result of nfs_init_version() / nfs_deinit_version().
+ */
+static int
+nfs_reconfigure_nlm4(xlator_t *this)
+{
+    struct nfs_state *state = NULL;
+
+    if (!this || !this->private)
+        return -1;
+
+    state = (struct nfs_state *)this->private;
+
+    if (!state->enable_nlm) {
+        /* NLM is disabled */
+        return nfs_deinit_version(state, nlm4svc_init);
+    }
+
+    /* NLM is enabled */
+    return nfs_init_version(this, nlm4svc_init, _gf_false);
+}
+
+/* Register every initialised program on nfs->versions with the portmapper
+ * (and through rpcsvc_program_register_rpcbind6() when IPV6_DEFAULT is
+ * defined). Entries without a program are skipped and individual
+ * registration results are ignored.
+ *
+ * Returns -1 if @nfs is NULL, 0 otherwise.
+ */
+static int
+nfs_program_register_portmap_all(struct nfs_state *nfs)
+{
+    struct list_head *head = NULL;
+    struct nfs_initer_list *entry = NULL;
+    struct nfs_initer_list *next = NULL;
+    rpcsvc_program_t *program = NULL;
+
+    if (!nfs)
+        return -1;
+
+    head = &nfs->versions;
+    list_for_each_entry_safe(entry, next, head, list)
+    {
+        program = entry->program;
+        if (program != NULL) {
+            /* A configured port override replaces the program's own port. */
+            if (nfs->override_portnum)
+                program->progport = nfs->override_portnum;
+            (void)rpcsvc_program_register_portmap(program, program->progport);
+#ifdef IPV6_DEFAULT
+            (void)rpcsvc_program_register_rpcbind6(program, program->progport);
+#endif
+        }
+    }
+
+    return 0;
+}
+
+/* Unregister every initialised program on nfs->versions from the portmapper
+ * (and through rpcsvc_program_unregister_rpcbind6() when IPV6_DEFAULT is
+ * defined). Entries without a program are skipped and individual results
+ * are ignored.
+ *
+ * Returns -1 if @nfs is NULL, 0 otherwise.
+ */
+static int
+nfs_program_unregister_portmap_all(struct nfs_state *nfs)
+{
+    struct list_head *head = NULL;
+    struct nfs_initer_list *entry = NULL;
+    struct nfs_initer_list *next = NULL;
+    rpcsvc_program_t *program = NULL;
+
+    if (!nfs)
+        return -1;
+
+    head = &nfs->versions;
+    list_for_each_entry_safe(entry, next, head, list)
+    {
+        program = entry->program;
+        if (program != NULL) {
+            (void)rpcsvc_program_unregister_portmap(program);
+#ifdef IPV6_DEFAULT
+            (void)rpcsvc_program_unregister_rpcbind6(program);
+#endif
+        }
+    }
+
+    return 0;
+}
+
+/* Append a new entry for @init to the tail of @list.
+ *
+ * Every NFS version must be queued through this function with the init
+ * function for its particular version. @required is stored on the entry.
+ *
+ * Returns 0 on success, -1 on NULL arguments or allocation failure.
+ */
+static int
+nfs_add_initer(struct list_head *list, nfs_version_initer_t init,
+               gf_boolean_t required)
+{
+    struct nfs_initer_list *entry = NULL;
+
+    if (!list || !init)
+        return -1;
+
+    entry = GF_CALLOC(1, sizeof(*entry), gf_nfs_mt_nfs_initer_list);
+    if (entry == NULL) {
+        gf_msg(GF_NFS, GF_LOG_ERROR, ENOMEM, NFS_MSG_NO_MEMORY,
+               "Memory allocation failed");
+        return -1;
+    }
+
+    entry->required = required;
+    entry->init = init;
+    list_add_tail(&entry->list, list);
+
+    return 0;
+}
+
+/* Tear down every entry on @versions: unregister its program from the
+ * xlator's rpcsvc when one was initialised, then unlink and free the entry.
+ *
+ * Returns -1 on NULL arguments, 0 otherwise.
+ */
+int
+nfs_deinit_versions(struct list_head *versions, xlator_t *this)
+{
+    struct nfs_initer_list *entry = NULL;
+    struct nfs_initer_list *next = NULL;
+    struct nfs_state *state = NULL;
+
+    if (!versions || !this)
+        return -1;
+
+    state = (struct nfs_state *)this->private;
+    list_for_each_entry_safe(entry, next, versions, list)
+    {
+        /* TODO: Add version specific destructor.
+         * if (!version->deinit)
+                goto err;
+
+           version->deinit (this);
+        */
+        if (entry->program != NULL)
+            rpcsvc_program_unregister(state->rpcsvc, entry->program);
+
+        list_del(&entry->list);
+        GF_FREE(entry);
+    }
+
+    return 0;
+}
+
+/* Run every initializer queued on nfs->versions and register the resulting
+ * RPC programs.
+ *
+ * For each list entry the init callback is invoked, the returned program is
+ * stored on the entry, nfs->override_portnum (when non-zero) replaces the
+ * program's port, and the program is registered with nfs->rpcsvc. When
+ * nfs->register_portmap is set the program is also registered with the
+ * portmapper and, if IPV6_DEFAULT is defined, through
+ * rpcsvc_program_register_rpcbind6().
+ *
+ * Returns 0 once all entries have been processed, -1 on the first fatal
+ * failure. Entries handled before the failure are not rolled back here.
+ */
+int
+nfs_init_versions(struct nfs_state *nfs, xlator_t *this)
+{
+    struct nfs_initer_list *version = NULL;
+    struct nfs_initer_list *tmp = NULL;
+    rpcsvc_program_t *prog = NULL;
+    int ret = -1;
+    struct list_head *versions = NULL;
+
+    if ((!nfs) || (!this))
+        return -1;
+
+    gf_msg_debug(GF_NFS, 0, "Initing protocol versions");
+    versions = &nfs->versions;
+    list_for_each_entry_safe(version, tmp, versions, list)
+    {
+        if (!version->init) {
+            ret = -1;
+            goto err;
+        }
+
+        prog = version->init(this);
+        if (!prog) {
+            ret = -1;
+            goto err;
+        }
+
+        version->program = prog;
+        /* A configured port override replaces the program's own port. */
+        if (nfs->override_portnum)
+            prog->progport = nfs->override_portnum;
+        gf_msg_debug(GF_NFS, 0, "Starting program: %s", prog->progname);
+
+        ret = rpcsvc_program_register(nfs->rpcsvc, prog, _gf_false);
+        if (ret == -1) {
+            gf_msg(GF_NFS, GF_LOG_ERROR, 0, NFS_MSG_PGM_INIT_FAIL,
+                   "Program: %s init failed", prog->progname);
+            goto err;
+        }
+        if (nfs->register_portmap) {
+            ret = rpcsvc_program_register_portmap(prog, prog->progport);
+            if (ret == -1) {
+                gf_msg(GF_NFS, GF_LOG_ERROR, 0, NFS_MSG_PGM_REG_FAIL,
+                       "%s program  %s registration failed",
+                       version->required ? "Required" : "Optional",
+                       prog->progname);
+
+                /* fatal error if the program is required */
+                if (version->required)
+                    goto err;
+                /* Optional program: the loop carries on with the next
+                 * entry and ret is overwritten later. */
+            }
+#ifdef IPV6_DEFAULT
+            /* Unlike the portmap case above, an rpcbind6 failure is fatal
+             * regardless of version->required. */
+            ret = rpcsvc_program_register_rpcbind6(prog, prog->progport);
+            if (ret == -1) {
+                gf_msg(GF_NFS, GF_LOG_ERROR, 0, NFS_MSG_PGM_REG_FAIL,
+                       "Program (ipv6) %s registration failed", prog->progname);
+                goto err;
+            }
+#endif
+        }
+    }
+
+    ret = 0;
+err:
+    return ret;
+}
+
+/* Queue the protocol initializers on nfs->versions.
+ *
+ * MOUNT3, MOUNT1 and NFS3 are always queued and marked required. NLM4 and
+ * ACL3 are queued as optional, and only when nfs->enable_nlm /
+ * nfs->enable_acl are set.
+ *
+ * Returns 0 on success, -1 as soon as one initializer cannot be added.
+ */
+int
+nfs_add_all_initiators(struct nfs_state *nfs)
+{
+    int ret = 0;
+
+    /* Add the initializers for all versions. */
+    ret = nfs_add_initer(&nfs->versions, mnt3svc_init, _gf_true);
+    if (ret == -1) {
+        gf_msg(GF_NFS, GF_LOG_ERROR, 0, NFS_MSG_PROT_INIT_ADD_FAIL,
+               "Failed to add MOUNT3 protocol initializer");
+        goto ret;
+    }
+
+    ret = nfs_add_initer(&nfs->versions, mnt1svc_init, _gf_true);
+    if (ret == -1) {
+        gf_msg(GF_NFS, GF_LOG_ERROR, 0, NFS_MSG_PROT_INIT_ADD_FAIL,
+               "Failed to add MOUNT1 protocol initializer");
+        goto ret;
+    }
+
+    ret = nfs_add_initer(&nfs->versions, nfs3svc_init, _gf_true);
+    if (ret == -1) {
+        gf_msg(GF_NFS, GF_LOG_ERROR, 0, NFS_MSG_PROT_INIT_ADD_FAIL,
+               "Failed to add NFS3 protocol initializer");
+        goto ret;
+    }
+
+    if (nfs->enable_nlm == _gf_true) {
+        ret = nfs_add_initer(&nfs->versions, nlm4svc_init, _gf_false);
+        if (ret == -1) {
+            /* Name the protocol, as every other branch does, so the log
+             * identifies which initializer failed. */
+            gf_msg(GF_NFS, GF_LOG_ERROR, 0, NFS_MSG_PROT_INIT_ADD_FAIL,
+                   "Failed to add NLM4 protocol initializer");
+            goto ret;
+        }
+    }
+
+    if (nfs->enable_acl == _gf_true) {
+        ret = nfs_add_initer(&nfs->versions, acl3svc_init, _gf_false);
+        if (ret == -1) {
+            gf_msg(GF_NFS, GF_LOG_ERROR, 0, NFS_MSG_PROT_INIT_ADD_FAIL,
+                   "Failed to add ACL protocol initializer");
+            goto ret;
+        }
+    }
+
+    ret = 0;
+ret:
+    return ret;
+}
+
+/* Report whether @xl is already recorded in nfs->initedxl.
+ *
+ * The array is scanned under nfs->svinitlock. Returns 1 when @xl is found,
+ * 0 when it is not. NULL arguments also yield 1.
+ */
+int
+nfs_subvolume_started(struct nfs_state *nfs, xlator_t *xl)
+{
+    int idx = 0;
+    int found = 0;
+
+    if (!nfs || !xl)
+        return 1;
+
+    LOCK(&nfs->svinitlock);
+    {
+        for (idx = 0; idx < nfs->allsubvols; idx++) {
+            if (nfs->initedxl[idx] != xl)
+                continue;
+
+            found = 1;
+            break;
+        }
+    }
+    UNLOCK(&nfs->svinitlock);
+
+    return found;
+}
+
+/* Record @xl as started in the first free slot of nfs->initedxl and bump
+ * nfs->upsubvols, unless @xl is met earlier in the array.
+ *
+ * The scan runs under nfs->svinitlock. Returns 1 on NULL arguments and 0
+ * otherwise, including when no free slot was available.
+ */
+int
+nfs_subvolume_set_started(struct nfs_state *nfs, xlator_t *xl)
+{
+    int slot = 0;
+
+    if (!nfs || !xl)
+        return 1;
+
+    LOCK(&nfs->svinitlock);
+    {
+        for (slot = 0; slot < nfs->allsubvols; slot++) {
+            if (nfs->initedxl[slot] == xl) {
+                gf_msg_debug(GF_NFS, 0, "Volume already started %s", xl->name);
+                break;
+            }
+
+            /* Occupied by another subvolume: keep looking. */
+            if (nfs->initedxl[slot] != NULL)
+                continue;
+
+            nfs->initedxl[slot] = xl;
+            ++nfs->upsubvols;
+            gf_msg_debug(GF_NFS, 0,
+                         "Starting up: %s "
+                         ", vols started till now: %d",
+                         xl->name, nfs->upsubvols);
+            break;
+        }
+    }
+    UNLOCK(&nfs->svinitlock);
+
+    return 0;
+}
+
+/* Callback for the root lookup issued by nfs_startup_subvolume().
+ *
+ * A failed lookup (op_ret == -1) is logged. Otherwise the subvolume passed
+ * in @cookie is marked as started in the state held in this->private.
+ * Always returns 0.
+ */
+int32_t
+nfs_start_subvol_lookup_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+                            int32_t op_ret, int32_t op_errno, inode_t *inode,
+                            struct iatt *buf, dict_t *xattr,
+                            struct iatt *postparent)
+{
+    if (op_ret != -1) {
+        nfs_subvolume_set_started(this->private, ((xlator_t *)cookie));
+        gf_msg_trace(GF_NFS, 0, "Started %s", ((xlator_t *)cookie)->name);
+    } else {
+        gf_msg(GF_NFS, GF_LOG_CRITICAL, op_errno, NFS_MSG_LOOKUP_ROOT_FAIL,
+               "Failed to lookup root: %s", strerror(op_errno));
+    }
+
+    return 0;
+}
+
+/* Start subvolume @xl by issuing a lookup on its root.
+ *
+ * Nothing is done when nfs_subvolume_started() already reports @xl as
+ * started. Otherwise a root loc is filled from xl->itable and a lookup is
+ * sent as the root user; nfs_start_subvol_lookup_cbk() handles the reply.
+ *
+ * Returns -1 on NULL arguments or when the root loc cannot be filled, the
+ * negative value from nfs_fop_lookup() when the lookup cannot be issued,
+ * and otherwise the (non-negative) result of nfs_fop_lookup(), or 0 when
+ * the subvolume was already started.
+ */
+int
+nfs_startup_subvolume(xlator_t *nfsx, xlator_t *xl)
+{
+    int ret = -1;
+    loc_t rootloc = {
+        0,
+    };
+    nfs_user_t nfu = {
+        0,
+    };
+
+    if ((!nfsx) || (!xl))
+        return -1;
+
+    if (nfs_subvolume_started(nfsx->private, xl)) {
+        gf_msg_trace(GF_NFS, 0, "Subvolume already started: %s", xl->name);
+        ret = 0;
+        goto err;
+    }
+
+    ret = nfs_root_loc_fill(xl->itable, &rootloc);
+    if (ret == -1) {
+        gf_msg(GF_NFS, GF_LOG_CRITICAL, 0, NFS_MSG_ROOT_LOC_INIT_FAIL,
+               "Failed to init root loc");
+        goto err;
+    }
+
+    nfs_user_root_create(&nfu);
+    ret = nfs_fop_lookup(nfsx, xl, &nfu, &rootloc, nfs_start_subvol_lookup_cbk,
+                         (void *)nfsx->private);
+
+    /* The loc is only needed to issue the lookup. Wipe it whether or not
+     * the call succeeded, so the failure path no longer leaks whatever
+     * nfs_root_loc_fill() stored in it. */
+    nfs_loc_wipe(&rootloc);
+
+    if (ret < 0) {
+        gf_msg(GF_NFS, GF_LOG_CRITICAL, -ret, NFS_MSG_LOOKUP_ROOT_FAIL,
+               "Failed to lookup root: %s", strerror(-ret));
+        goto err;
+    }
+
+err:
+    return ret;
+}
+
+/* Start every subvolume on the NFS xlator's subvolume list.
+ *
+ * Walks nfs->subvols and calls nfs_startup_subvolume() on each entry,
+ * stopping at the first failure.
+ *
+ * Returns 0 when all subvolumes were started, -1 if @nfsx is NULL, or the
+ * negative value reported by nfs_startup_subvolume() for the subvolume
+ * that failed.
+ */
+int
+nfs_startup_subvolumes(xlator_t *nfsx)
+{
+    int ret = -1;
+    xlator_list_t *cl = NULL;
+    struct nfs_state *nfs = NULL;
+
+    if (!nfsx)
+        return -1;
+
+    nfs = nfsx->private;
+    cl = nfs->subvols;
+    while (cl) {
+        gf_msg_debug(GF_NFS, 0, "Starting subvolume: %s", cl->xlator->name);
+        ret = nfs_startup_subvolume(nfsx, cl->xlator);
+        /* nfs_startup_subvolume() hands back the negative errno from
+         * nfs_fop_lookup(), which need not be exactly -1, so treat every
+         * negative value as a failure instead of silently moving on. */
+        if (ret < 0) {
+            gf_msg(GF_NFS, GF_LOG_CRITICAL, 0, NFS_MSG_STARTUP_FAIL,
+                   "Failed to start-up xlator: %s", cl->xlator->name);
+            goto err;
+        }
+        cl = cl->next;
+    }
+
+    ret = 0;
+err:
+    return ret;
+}
+
+/* Create the inode table for subvolume @xl.
+ *
+ * The LRU size passed to inode_table_new() is nfs->memfactor multiplied by
+ * GF_NFS_INODE_LRU_MULT. Returns 0 on success, -1 on NULL arguments or
+ * when the table cannot be allocated.
+ */
+int
+nfs_init_subvolume(struct nfs_state *nfs, xlator_t *xl)
+{
+    unsigned int lru_limit = 0;
+
+    if (!nfs || !xl)
+        return -1;
+
+    lru_limit = nfs->memfactor * GF_NFS_INODE_LRU_MULT;
+    xl->itable = inode_table_new(lru_limit, xl);
+    if (xl->itable)
+        return 0;
+
+    gf_msg(GF_NFS, GF_LOG_CRITICAL, ENOMEM, NFS_MSG_NO_MEMORY,
+           "Failed to allocate inode table");
+    return -1;
+}
+
+/* Initialise every subvolume on the list @cl and the bookkeeping used to
+ * track which of them have been started.
+ *
+ * @cl is stored in nfs->subvols, each xlator gets an inode table through
+ * nfs_init_subvolume(), then nfs->svinitlock is initialised and
+ * nfs->initedxl is allocated with one slot per subvolume.
+ *
+ * Returns 0 on success, -1 on NULL arguments, on the first subvolume that
+ * fails to initialise, or when the slot array cannot be allocated.
+ */
+int
+nfs_init_subvolumes(struct nfs_state *nfs, xlator_list_t *cl)
+{
+    unsigned int lrusize = 0;
+    int count = 0;
+
+    if (!nfs || !cl)
+        return -1;
+
+    lrusize = nfs->memfactor * GF_NFS_INODE_LRU_MULT;
+    nfs->subvols = cl;
+    gf_msg_trace(GF_NFS, 0, "inode table lru: %d", lrusize);
+
+    for (; cl != NULL; cl = cl->next) {
+        gf_msg_debug(GF_NFS, 0, "Initing subvolume: %s", cl->xlator->name);
+        if (nfs_init_subvolume(nfs, cl->xlator) == -1) {
+            gf_msg(GF_NFS, GF_LOG_CRITICAL, 0, NFS_MSG_XLATOR_INIT_FAIL,
+                   "Failed to init "
+                   "xlator: %s",
+                   cl->xlator->name);
+            return -1;
+        }
+        ++count;
+    }
+
+    LOCK_INIT(&nfs->svinitlock);
+    nfs->initedxl = GF_CALLOC(count, sizeof(xlator_t *), gf_nfs_mt_xlator_t);
+    if (nfs->initedxl == NULL) {
+        gf_msg(GF_NFS, GF_LOG_ERROR, ENOMEM, NFS_MSG_NO_MEMORY,
+               "Failed to allocated inited xls");
+        return -1;
+    }
+
+    gf_msg_trace(GF_NFS, 0, "Inited volumes: %d", count);
+    nfs->allsubvols = count;
+
+    return 0;
+}
+
+/* Fill @newnfu with root credentials: uid 0 and a single group, gid 0.
+ * Returns -1 if @newnfu is NULL, 0 otherwise.
+ */
+int
+nfs_user_root_create(nfs_user_t *newnfu)
+{
+    if (newnfu == NULL)
+        return -1;
+
+    newnfu->ngrps = 1;
+    newnfu->gids[0] = 0;
+    newnfu->uid = 0;
+
+    return 0;
+}
+
+/* Fill @newnfu from the given credentials.
+ *
+ * @uid/@gid         - primary ids; @gid is stored in gids[0].
+ * @trans            - optional; when set, UNIX_PATH_MAX bytes of its
+ *                     peerinfo.identifier are copied into newnfu->identifier.
+ * @auxgids/@auxcount - optional auxiliary gids, stored from gids[1] onwards.
+ *
+ * Returns -1 if @newnfu is NULL or @auxcount exceeds GF_REQUEST_MAXGROUPS,
+ * 0 otherwise.
+ */
+int
+nfs_user_create(nfs_user_t *newnfu, uid_t uid, gid_t gid,
+                rpc_transport_t *trans, gid_t *auxgids, int auxcount)
+{
+    int i = 0;
+
+    /* We test for GF_REQUEST_MAXGROUPS instead of  NFS_FOP_NGROUPS because
+     * the latter accounts for the @gid being in @auxgids, which is not the
+     * case here.
+     */
+    if (!newnfu || (auxcount > GF_REQUEST_MAXGROUPS))
+        return -1;
+
+    newnfu->uid = uid;
+    newnfu->gids[0] = gid;
+    newnfu->ngrps = 1;
+    if (trans != NULL)
+        memcpy(&newnfu->identifier, trans->peerinfo.identifier, UNIX_PATH_MAX);
+
+    gf_msg_trace(GF_NFS, 0, "uid: %d, gid %d, gids: %d", uid, gid, auxcount);
+
+    if (auxgids == NULL)
+        return 0;
+
+    /* Slot 0 holds the primary gid, so auxiliary gids start at slot 1. */
+    for (i = 0; i < auxcount; i++) {
+        newnfu->gids[i + 1] = auxgids[i];
+        ++newnfu->ngrps;
+        gf_msg_trace(GF_NFS, 0, "gid: %d", auxgids[i]);
+    }
+
+    return 0;
+}
+
+/* Initialise @nfu from the uid, gid, transport and auxiliary gids of the
+ * RPC request @req. Does nothing when either argument is NULL; the result
+ * of nfs_user_create() is not checked.
+ */
+void
+nfs_request_user_init(nfs_user_t *nfu, rpcsvc_request_t *req)
+{
+    int auxcount = 0;
+    gid_t *auxgids = NULL;
+
+    if (req && nfu) {
+        auxgids = rpcsvc_auth_unix_auxgids(req, &auxcount);
+        nfs_user_create(nfu, rpcsvc_request_uid(req), rpcsvc_request_gid(req),
+                        rpcsvc_request_transport(req), auxgids, auxcount);
+    }
+}
+
+/* Initialise @nfu with the caller-supplied @uid and @gid, taking only the
+ * transport and auxiliary gids from the RPC request @req. Does nothing when
+ * @req or @nfu is NULL; the result of nfs_user_create() is not checked.
+ */
+void
+nfs_request_primary_user_init(nfs_user_t *nfu, rpcsvc_request_t *req, uid_t uid,
+                              gid_t gid)
+{
+    int auxcount = 0;
+    gid_t *auxgids = NULL;
+
+    if (req && nfu) {
+        auxgids = rpcsvc_auth_unix_auxgids(req, &auxcount);
+        nfs_user_create(nfu, uid, gid, rpcsvc_request_transport(req), auxgids,
+                        auxcount);
+    }
+}
+
+/* Set up memory accounting for the NFS xlator, sized for gf_nfs_mt_end + 1
+ * types. Returns -1 if @this is NULL, otherwise the result of
+ * xlator_mem_acct_init(); a non-zero result is logged.
+ */
+int32_t
+mem_acct_init(xlator_t *this)
+{
+    int ret = -1;
+
+    if (this == NULL)
+        return ret;
+
+    ret = xlator_mem_acct_init(this, gf_nfs_mt_end + 1);
+    if (ret == 0)
+        return ret;
+
+    gf_msg(this->name, GF_LOG_ERROR, ENOMEM, NFS_MSG_NO_MEMORY,
+           "Memory accounting init failed");
+
+    return ret;
+}
+
+struct nfs_state *
+nfs_init_state(xlator_t *this)
+{
+    struct nfs_state *nfs = NULL;
+    int i = 0, ret = -1;
+    unsigned int fopspoolsize = 0;
+    char *optstr = NULL;
+    gf_boolean_t boolt = _gf_false;
+    struct stat stbuf = {
+        0,
+    };
+
+    if (!this)
+        return NULL;
+
+    if (!this->children) {
+        gf_msg(GF_NFS, GF_LOG_INFO, 0, NFS_MSG_NFS_MAN_DISABLE,
+               "NFS is manually disabled: Exiting");
+        /* Nothing for nfs process to do, exit cleanly */
+        kill(getpid(), SIGTERM);
+    }
+
+    nfs = GF_CALLOC(1, sizeof(*nfs), gf_nfs_mt_nfs_state);
+    if (!nfs) {
+        gf_msg(GF_NFS, GF_LOG_ERROR, ENOMEM, NFS_MSG_NO_MEMORY,
+               "memory allocation failed");
+        return NULL;
+    }
+
+    nfs->memfactor = GF_NFS_DEFAULT_MEMFACTOR;
+    if (dict_get(this->options, "nfs.mem-factor")) {
+        ret = dict_get_str(this->options, "nfs.mem-factor", &optstr);
+        if (ret < 0) {
+            gf_msg(GF_NFS, GF_LOG_ERROR, -ret, NFS_MSG_DICT_GET_FAILED,
+                   "Failed to parse dict");
+            goto free_rpcsvc;
+        }
+
+        ret = gf_string2uint(optstr, &nfs->memfactor);
+        if (ret < 0) {
+            gf_msg(GF_NFS, GF_LOG_ERROR, errno, NFS_MSG_PARSE_FAIL,
+                   "Failed to parse uint string");
+            goto free_rpcsvc;
+        }
+    }
+
+    fopspoolsize = nfs->memfactor * GF_NFS_CONCURRENT_OPS_MULT;
+    /* FIXME: Really saddens me to see this as xlator wide. */
+    nfs->foppool = mem_pool_new(struct nfs_fop_local, fopspoolsize);
+    if (!nfs->foppool) {
+        gf_msg(GF_NFS, GF_LOG_CRITICAL, ENOMEM, NFS_MSG_NO_MEMORY,
+               "Failed to allocate fops local pool");
+        goto free_rpcsvc;
+    }
+
+    nfs->dynamicvolumes = GF_NFS_DVM_OFF;
+    if (dict_get(this->options, "nfs.dynamic-volumes")) {
+        ret = dict_get_str(this->options, "nfs.dynamic-volumes", &optstr);
+        if (ret < 0) {
+            gf_msg(GF_NFS, GF_LOG_ERROR, -ret, NFS_MSG_DICT_GET_FAILED,
+                   "Failed to parse dict");
+            goto free_foppool;
+        }
+
+        ret = gf_string2boolean(optstr, &boolt);
+        if (ret < 0) {
+            gf_msg(GF_NFS, GF_LOG_ERROR, errno, NFS_MSG_PARSE_FAIL,
+                   "Failed to parse bool string");
+            goto free_foppool;
+        }
+
+        if (boolt == _gf_true)
+            nfs->dynamicvolumes = GF_NFS_DVM_ON;
+    }
+
+    nfs->enable_nlm = _gf_true;
+    ret = dict_get_str_boolean(this->options, "nfs.nlm", _gf_true);
+    if (ret == _gf_false) {
+        gf_msg(GF_NFS, GF_LOG_INFO, 0, NFS_MSG_NLM_MAN_DISABLE,
+               "NLM is manually disabled");
+        nfs->enable_nlm = _gf_false;
+    }
+
+    nfs->enable_acl = _gf_true;
+    ret = dict_get_str_boolean(this->options, "nfs.acl", _gf_true);
+    if (ret == _gf_false) {
+        gf_msg(GF_NFS, GF_LOG_INFO, 0, NFS_MSG_ACL_MAN_DISABLE,
+               "ACL is manually disabled");
+        nfs->enable_acl = _gf_false;
+    }
+
+    nfs->enable_ino32 = 0;
+    if (dict_get(this->options, "nfs.enable-ino32")) {
+        ret = dict_get_str(this->options, "nfs.enable-ino32", &optstr);
+        if (ret < 0) {
+            gf_msg(GF_NFS, GF_LOG_ERROR, -ret, NFS_MSG_PARSE_FAIL,
+                   "Failed to parse dict");
+            goto free_foppool;
+        }
+
+        ret = gf_string2boolean(optstr, &boolt);
+        if (ret < 0) {
+            gf_msg(GF_NFS, GF_LOG_ERROR, errno, NFS_MSG_PARSE_FAIL,
+                   "Failed to parse bool string");
+            goto free_foppool;
+        }
+
+        if (boolt == _gf_true)
+            nfs->enable_ino32 = 1;
+    }
+
+    if (dict_get(this->options, "nfs.port")) {
+        ret = dict_get_str(this->options, "nfs.port", &optstr);
+        if (ret < 0) {
+            gf_msg(GF_NFS, GF_LOG_ERROR, -ret, NFS_MSG_PARSE_FAIL,
+                   "Failed to parse dict");
+            goto free_foppool;
+        }
+
+        ret = gf_string2uint(optstr, &nfs->override_portnum);
+        if (ret < 0) {
+            gf_msg(GF_NFS, GF_LOG_ERROR, errno, NFS_MSG_PARSE_FAIL,
+                   "Failed to parse uint string");
+            goto free_foppool;
+        }
+    }
+
+    if (dict_get(this->options, "transport.socket.bind-address")) {
+        ret = dict_get_str(this->options, "transport.socket.bind-address",
+                           &optstr);
+        if (ret < 0) {
+            gf_log(GF_NFS, GF_LOG_ERROR,
+                   "Failed to parse "
+                   "transport.socket.bind-address string");
+        } else {
+            this->instance_name = gf_strdup(optstr);
+            for (i = 0; i < strlen(this->instance_name); i++) {
+                if (this->instance_name[i] == '.' ||
+                    this->instance_name[i] == ':')
+                    this->instance_name[i] = '_';
+            }
+        }
+    }
+
+    if (dict_get(this->options, "transport.socket.listen-port") == NULL) {
+        if (nfs->override_portnum)
+            ret = gf_asprintf(&optstr, "%d", nfs->override_portnum);
+        else
+            ret = gf_asprintf(&optstr, "%d", GF_NFS3_PORT);
+        if (ret == -1) {
+            gf_msg(GF_NFS, GF_LOG_ERROR, ENOMEM, NFS_MSG_NO_MEMORY,
+                   "failed mem-allocation");
+            goto free_foppool;
+        }
+        ret = dict_set_dynstr(this->options, "transport.socket.listen-port",
+                              optstr);
+        if (ret == -1) {
+            gf_msg(GF_NFS, GF_LOG_ERROR, 0, NFS_MSG_DICT_SET_FAILED,
+                   "dict_set_dynstr error");
+            goto free_foppool;
+        }
+    }
+
+#ifdef IPV6_DEFAULT
+    ret = dict_set_str(this->options, "transport.address-family", "inet6");
+    if (ret == -1) {
+        gf_log(GF_NFS, GF_LOG_ERROR, "dict_set_str error");
+        goto free_foppool;
+    }
+#endif
+
+    /* Right only socket support exists between nfs client and
+     * gluster nfs, so we can set default value as socket
+     */
+    ret = dict_set_str(this->options, "transport-type", "socket");
+    if (ret == -1) {
+        gf_msg(GF_NFS, GF_LOG_ERROR, 0, NFS_MSG_DICT_SET_FAILED,
+               "dict_set_str error");
+        goto free_foppool;
+    }
+
+    nfs->mount_udp = 0;
+    if (dict_get(this->options, "nfs.mount-udp")) {
+        ret = dict_get_str(this->options, "nfs.mount-udp", &optstr);
+        if (ret == -1) {
+            gf_msg(GF_NFS, GF_LOG_ERROR, -ret, NFS_MSG_PARSE_FAIL,
+                   "Failed to parse dict");
+            goto free_foppool;
+        }
+
+        ret = gf_string2boolean(optstr, &boolt);
+        if (ret < 0) {
+            gf_msg(GF_NFS, GF_LOG_ERROR, errno, NFS_MSG_PARSE_FAIL,
+                   "Failed to parse bool "
+                   "string");
+            goto free_foppool;
+        }
+
+        if (boolt == _gf_true)
+            nfs->mount_udp = 1;
+    }
+
+    nfs->exports_auth = GF_NFS_DEFAULT_EXPORT_AUTH;
+    if (dict_get(this->options, "nfs.exports-auth-enable")) {
+        ret = dict_get_str(this->options, "nfs.exports-auth-enable", &optstr);
+        if (ret == -1) {
+            gf_msg(GF_NFS, GF_LOG_ERROR, -ret, NFS_MSG_PARSE_FAIL,
+                   "Failed to parse dict");
+            goto free_foppool;
+        }
+
+        ret = gf_string2boolean(optstr, &boolt);
+        if (ret < 0) {
+            gf_msg(GF_NFS, GF_LOG_ERROR, errno, NFS_MSG_PARSE_FAIL,
+                   "Failed to parse bool string");
+            goto free_foppool;
+        }
+
+        if (boolt == _gf_true)
+            nfs->exports_auth = 1;
+    }
+
+    nfs->auth_refresh_time_secs = GF_NFS_DEFAULT_AUTH_REFRESH_INTERVAL_SEC;
+    if (dict_get(this->options, "nfs.auth-refresh-interval-sec")) {
+        ret = dict_get_str(this->options, "nfs.auth-refresh-interval-sec",
+                           &optstr);
+        if (ret < 0) {
+            gf_msg(GF_NFS, GF_LOG_ERROR, -ret, NFS_MSG_PARSE_FAIL,
+                   "Failed to parse dict");
+            goto free_foppool;
+        }
+
+        ret = gf_string2uint(optstr, &nfs->auth_refresh_time_secs);
+        if (ret < 0) {
+            gf_msg(GF_NFS, GF_LOG_ERROR, errno, NFS_MSG_PARSE_FAIL,
+                   "Failed to parse uint string");
+            goto free_foppool;
+        }
+    }
+
+    nfs->auth_cache_ttl_sec = GF_NFS_DEFAULT_AUTH_CACHE_TTL_SEC;
+    if (dict_get(this->options, "nfs.auth-cache-ttl-sec")) {
+        ret = dict_get_str(this->options, "nfs.auth-cache-ttl-sec", &optstr);
+        if (ret < 0) {
+            gf_msg(GF_NFS, GF_LOG_ERROR, -ret, NFS_MSG_PARSE_FAIL,
+                   "Failed to parse dict");
+            goto free_foppool;
+        }
+
+        ret = gf_string2uint(optstr, &nfs->auth_cache_ttl_sec);
+        if (ret < 0) {
+            gf_msg(GF_NFS, GF_LOG_ERROR, errno, NFS_MSG_PARSE_FAIL,
+                   "Failed to parse uint string");
+            goto free_foppool;
+        }
+    }
+
+    /* TODO: Make this a configurable option in case we don't want to read
+     * exports/netgroup files off disk when they change. */
+    nfs->refresh_auth = 1;
+
+    nfs->rmtab = gf_strdup(NFS_DATADIR "/rmtab");
+    if (dict_get(this->options, "nfs.mount-rmtab")) {
+        ret = dict_get_str(this->options, "nfs.mount-rmtab", &nfs->rmtab);
+        if (ret == -1) {
+            gf_msg(GF_NFS, GF_LOG_ERROR, -ret, NFS_MSG_PARSE_FAIL,
+                   "Failed to parse dict");
+            goto free_foppool;
+        }
+
+        /* check if writing the rmtab is disabled*/
+        if (nfs->rmtab && strcmp("/-", nfs->rmtab) == 0) {
+            GF_FREE(nfs->rmtab);
+            nfs->rmtab = NULL;
+        }
+    }
+
+    /* support both options rpc-auth.ports.insecure and
+     * rpc-auth-allow-insecure for backward compatibility
+     */
+    nfs->allow_insecure = 1;
+    if (dict_get(this->options, "rpc-auth.ports.insecure")) {
+        ret = dict_get_str(this->options, "rpc-auth.ports.insecure", &optstr);
+        if (ret < 0) {
+            gf_msg(GF_NFS, GF_LOG_ERROR, -ret, NFS_MSG_PARSE_FAIL,
+                   "Failed to parse dict");
+            goto free_foppool;
+        }
+
+        ret = gf_string2boolean(optstr, &boolt);
+        if (ret < 0) {
+            gf_msg(GF_NFS, GF_LOG_ERROR, errno, NFS_MSG_PARSE_FAIL,
+                   "Failed to parse bool "
+                   "string");
+            goto free_foppool;
+        }
+
+        if (boolt == _gf_false)
+            nfs->allow_insecure = 0;
+    }
+
+    if (dict_get(this->options, "rpc-auth-allow-insecure")) {
+        ret = dict_get_str(this->options, "rpc-auth-allow-insecure", &optstr);
+        if (ret < 0) {
+            gf_msg(GF_NFS, GF_LOG_ERROR, -ret, NFS_MSG_PARSE_FAIL,
+                   "Failed to parse dict");
+            goto free_foppool;
+        }
+
+        ret = gf_string2boolean(optstr, &boolt);
+        if (ret < 0) {
+            gf_msg(GF_NFS, GF_LOG_ERROR, errno, NFS_MSG_PARSE_FAIL,
+                   "Failed to parse bool string");
+            goto free_foppool;
+        }
+
+        if (boolt == _gf_false)
+            nfs->allow_insecure = 0;
+    }
+
+    if (nfs->allow_insecure) {
+        /* blindly set both the options */
+        ret = dict_set_str(this->options, "rpc-auth-allow-insecure", "on");
+        if (ret == -1) {
+            gf_msg(GF_NFS, GF_LOG_ERROR, 0, NFS_MSG_DICT_SET_FAILED,
+                   "dict_set_str error");
+            goto free_foppool;
+        }
+        ret = dict_set_str(this->options, "rpc-auth.ports.insecure", "on");
+        if (ret == -1) {
+            gf_msg(GF_NFS, GF_LOG_ERROR, 0, NFS_MSG_DICT_SET_FAILED,
+                   "dict_set_str error");
+            goto free_foppool;
+        }
+    }
+
+    GF_OPTION_INIT("nfs.rdirplus", nfs->rdirplus, bool, free_foppool);
+
+    GF_OPTION_INIT(OPT_SERVER_RPC_STATD, nfs->rpc_statd, path, free_foppool);
+
+    GF_OPTION_INIT(OPT_SERVER_RPC_STATD_PIDFILE, nfs->rpc_statd_pid_file, path,
+                   free_foppool);
+
+    GF_OPTION_INIT(OPT_SERVER_AUX_GIDS, nfs->server_aux_gids, bool,
+                   free_foppool);
+    GF_OPTION_INIT(OPT_SERVER_GID_CACHE_TIMEOUT, nfs->server_aux_gids_max_age,
+                   uint32, free_foppool);
+
+    if (gid_cache_init(&nfs->gid_cache, nfs->server_aux_gids_max_age) < 0) {
+        gf_msg(GF_NFS, GF_LOG_ERROR, 0, NFS_MSG_INIT_GRP_CACHE_FAIL,
+               "Failed to initialize group cache.");
+        goto free_foppool;
+    }
+
+    ret = sys_access(nfs->rpc_statd, X_OK);
+    if (ret) {
+        gf_msg(GF_NFS, GF_LOG_WARNING, EPERM, NFS_MSG_NO_PERM,
+               "%s not enough permissions to access. Disabling NLM",
+               nfs->rpc_statd);
+        nfs->enable_nlm = _gf_false;
+    }
+
+    ret = sys_stat(nfs->rpc_statd, &stbuf);
+    if (ret || !S_ISREG(stbuf.st_mode)) {
+        gf_msg(GF_NFS, GF_LOG_WARNING, 0, NFS_MSG_REG_FILE_ERROR,
+               "%s not a regular file. Disabling NLM", nfs->rpc_statd);
+        nfs->enable_nlm = _gf_false;
+    }
+
+    nfs->rpcsvc = rpcsvc_init(this, this->ctx, this->options, fopspoolsize);
+    if (!nfs->rpcsvc) {
+        ret = -1;
+        gf_msg(GF_NFS, GF_LOG_ERROR, 0, NFS_MSG_RPC_INIT_FAIL,
+               "RPC service init failed");
+        goto free_foppool;
+    }
+
+    ret = rpcsvc_set_throttle_on(nfs->rpcsvc);
+    if (ret) {
+        gf_msg(GF_NFS, GF_LOG_ERROR, 0, NFS_MSG_ENABLE_THROTTLE_FAIL,
+               "Enabling throttle failed");
+        goto free_foppool;
+    }
+
+    ret = rpcsvc_set_outstanding_rpc_limit(
+        nfs->rpcsvc, this->options, RPCSVC_DEF_NFS_OUTSTANDING_RPC_LIMIT);
+    if (ret < 0) {
+        gf_msg(GF_NFS, GF_LOG_ERROR, 0, NFS_MSG_RPC_CONFIG_FAIL,
+               "Failed to configure outstanding-rpc-limit");
+        goto free_foppool;
+    }
+
+    nfs->register_portmap = rpcsvc_register_portmap_enabled(nfs->rpcsvc);
+
+    GF_OPTION_INIT("nfs.event-threads", nfs->event_threads, uint32,
+                   free_foppool);
+    gf_event_reconfigure_threads(this->ctx->event_pool, nfs->event_threads);
+
+    this->private = (void *)nfs;
+    INIT_LIST_HEAD(&nfs->versions);
+    nfs->generation = 1965;
+
+    ret = 0;
+
+free_foppool:
+    if (ret < 0)
+        mem_pool_destroy(nfs->foppool);
+
+free_rpcsvc:
+    /*
+     * rpcsvc_deinit */
+    if (ret < 0) {
+        GF_FREE(nfs);
+        nfs = NULL;
+    }
+
+    return nfs;
+}
+
+/*
+ * Set up the DRC of the rpcsvc instance owned by the NFS xlator.
+ *
+ * @this: the NFS xlator; its private data must already hold the nfs_state.
+ *
+ * Returns -1 when @this, its private state or the rpcsvc handle is missing,
+ * otherwise the return value of rpcsvc_drc_init().
+ */
+int
+nfs_drc_init(xlator_t *this)
+{
+    struct nfs_state *state = NULL;
+    int status = -1;
+
+    GF_VALIDATE_OR_GOTO(GF_NFS, this, done);
+    GF_VALIDATE_OR_GOTO(GF_NFS, this->private, done);
+
+    state = (struct nfs_state *)(this->private);
+
+    /* Without an rpcsvc there is nothing to attach the DRC to. */
+    if (state->rpcsvc)
+        status = rpcsvc_drc_init(state->rpcsvc, this->options);
+
+done:
+    return status;
+}
+
+/*
+ * Apply a changed option dict to the generic part of the NFS server state.
+ *
+ * @this:    the NFS xlator; this->private holds the struct nfs_state.
+ * @options: the new option dict.
+ *
+ * Options that cannot be changed on a running server (nfs.port,
+ * nfs.transport-type, nfs.mem-factor, nfs.rpc-statd) are only reported in
+ * the log and processing stops there. All other options handled here are
+ * compared with the value cached in struct nfs_state and applied when they
+ * differ.
+ *
+ * Returns 0 when everything was processed. A failing dict_get_str() or
+ * dict_get_str_boolean() for nfs.enable-ino32 propagates its negative
+ * return code.
+ *
+ * NOTE(review): the argument validation and the "needs NFS restart" paths
+ * jump to "out" while ret is still 0, so the caller sees success for them.
+ * Confirm that this is intended before changing it.
+ */
+static int
+nfs_reconfigure_state(xlator_t *this, dict_t *options)
+{
+    int ret = 0;
+    int keyindx = 0;
+    char *rmtab = NULL;
+    char *rpc_statd = NULL;
+    gf_boolean_t optbool;
+    uint32_t optuint32;
+    struct nfs_state *nfs = NULL;
+    static char *options_require_restart[] = {"nfs.port", "nfs.transport-type",
+                                              "nfs.mem-factor", NULL};
+
+    GF_VALIDATE_OR_GOTO(GF_NFS, this, out);
+    GF_VALIDATE_OR_GOTO(GF_NFS, this->private, out);
+    GF_VALIDATE_OR_GOTO(GF_NFS, options, out);
+
+    nfs = (struct nfs_state *)this->private;
+
+    /* Some listed options can't be reconfigured, they need
+     * NFS to be restarted. There are two cases 1. SET 2. UNSET.
+     * 1. SET */
+    while (options_require_restart[keyindx]) {
+        if (dict_get(options, options_require_restart[keyindx])) {
+            gf_msg(GF_NFS, GF_LOG_ERROR, 0, NFS_MSG_RECONFIG_FAIL,
+                   "Reconfiguring %s needs NFS restart",
+                   options_require_restart[keyindx]);
+            goto out;
+        }
+        keyindx++;
+    }
+
+    /* UNSET for nfs.mem-factor: the option is gone from the dict but the
+     * running server still uses a non-default value. */
+    if ((!dict_get(options, "nfs.mem-factor")) &&
+        (nfs->memfactor != GF_NFS_DEFAULT_MEMFACTOR)) {
+        gf_msg(GF_NFS, GF_LOG_INFO, 0, NFS_MSG_RECONFIG_FAIL,
+               "Reconfiguring nfs.mem-factor needs NFS restart");
+        goto out;
+    }
+
+    /* UNSET for nfs.port */
+    if ((!dict_get(options, "nfs.port")) && (nfs->override_portnum)) {
+        gf_msg(GF_NFS, GF_LOG_ERROR, 0, NFS_MSG_RECONFIG_FAIL,
+               "Reconfiguring nfs.port needs NFS restart");
+        goto out;
+    }
+
+    /* reconfig nfs.rpc-statd...
+     * Test for the very key that is read below. Testing for the pidfile
+     * option instead made the lookup fail whenever only the pidfile was
+     * set, and ignored a changed nfs.rpc-statd when the pidfile was not
+     * set. */
+    rpc_statd = GF_RPC_STATD_PROG;
+    if (dict_get(options, "nfs.rpc-statd")) {
+        ret = dict_get_str(options, "nfs.rpc-statd", &rpc_statd);
+        if (ret < 0) {
+            gf_msg(GF_NFS, GF_LOG_ERROR, 0, NFS_MSG_READ_FAIL,
+                   "Failed to read reconfigured option: "
+                   "nfs.rpc-statd");
+            goto out;
+        }
+    }
+
+    /* The statd program path cannot be swapped on a running server. */
+    if (strcmp(nfs->rpc_statd, rpc_statd) != 0) {
+        gf_msg(GF_NFS, GF_LOG_INFO, 0, NFS_MSG_RECONFIG_FAIL,
+               "Reconfiguring nfs.rpc-statd needs NFS restart");
+        goto out;
+    }
+
+    /* reconfig nfs.mount-rmtab; falls back to the default path when the
+     * option is not present in the dict. */
+    rmtab = NFS_DATADIR "/rmtab";
+    if (dict_get(options, "nfs.mount-rmtab")) {
+        ret = dict_get_str(options, "nfs.mount-rmtab", &rmtab);
+        if (ret < 0) {
+            gf_msg(GF_NFS, GF_LOG_ERROR, 0, NFS_MSG_READ_FAIL,
+                   "Failed to read reconfigured option:"
+                   " nfs.mount-rmtab");
+            goto out;
+        }
+        gf_path_strip_trailing_slashes(rmtab);
+    }
+    /* check if writing the rmtab is disabled ("/-" is the marker value) */
+    if (strcmp("/-", rmtab) == 0) {
+        GF_FREE(nfs->rmtab);
+        nfs->rmtab = NULL;
+        gf_msg(GF_NFS, GF_LOG_INFO, 0, NFS_MSG_WRITE_FAIL,
+               "Disabled writing of nfs.mount-rmtab");
+    } else if (!nfs->rmtab || strcmp(nfs->rmtab, rmtab) != 0) {
+        /* presumably mount_rewrite_rmtab() updates nfs->rmtab; verify
+         * against its definition */
+        mount_rewrite_rmtab(nfs->mstate, rmtab);
+        gf_msg(GF_NFS, GF_LOG_INFO, 0, NFS_MSG_RECONFIG_PATH,
+               "Reconfigured nfs.mount-rmtab path: %s", nfs->rmtab);
+    }
+
+    GF_OPTION_RECONF(OPT_SERVER_AUX_GIDS, optbool, options, bool, out);
+    if (nfs->server_aux_gids != optbool) {
+        nfs->server_aux_gids = optbool;
+        gf_msg(GF_NFS, GF_LOG_INFO, 0, NFS_MSG_RECONFIG_VALUE,
+               "Reconfigured %s with value %d", OPT_SERVER_AUX_GIDS, optbool);
+    }
+
+    GF_OPTION_RECONF(OPT_SERVER_GID_CACHE_TIMEOUT, optuint32, options, uint32,
+                     out);
+    if (nfs->server_aux_gids_max_age != optuint32) {
+        nfs->server_aux_gids_max_age = optuint32;
+        gid_cache_reconf(&nfs->gid_cache, optuint32);
+        /* optuint32 is unsigned, so it is printed with %u */
+        gf_msg(GF_NFS, GF_LOG_INFO, 0, NFS_MSG_RECONFIG_VALUE,
+               "Reconfigured %s with value %u", OPT_SERVER_GID_CACHE_TIMEOUT,
+               optuint32);
+    }
+
+    GF_OPTION_RECONF("nfs.rdirplus", optbool, options, bool, out);
+    if (nfs->rdirplus != optbool) {
+        nfs->rdirplus = optbool;
+        gf_msg(GF_NFS, GF_LOG_INFO, 0, NFS_MSG_RECONFIG_VALUE,
+               "Reconfigured nfs.rdirplus with value %d", optbool);
+    }
+
+    /* reconfig nfs.dynamic-volumes; any value other than ON/OFF (e.g. a
+     * lookup error) is treated as OFF. */
+    ret = dict_get_str_boolean(options, "nfs.dynamic-volumes", GF_NFS_DVM_OFF);
+    switch (ret) {
+        case GF_NFS_DVM_ON:
+        case GF_NFS_DVM_OFF:
+            optbool = ret;
+            break;
+        default:
+            optbool = GF_NFS_DVM_OFF;
+            break;
+    }
+    if (nfs->dynamicvolumes != optbool) {
+        nfs->dynamicvolumes = optbool;
+        gf_msg(GF_NFS, GF_LOG_INFO, 0, NFS_MSG_RECONFIG_VOL,
+               "Reconfigured nfs.dynamic-volumes with value %d", optbool);
+    }
+
+    /* nfs.enable-ino32 is off unless present and true */
+    optbool = _gf_false;
+    if (dict_get(options, "nfs.enable-ino32")) {
+        ret = dict_get_str_boolean(options, "nfs.enable-ino32", _gf_false);
+        if (ret < 0) {
+            gf_msg(GF_NFS, GF_LOG_ERROR, 0, NFS_MSG_READ_FAIL,
+                   "Failed to read reconfigured option: "
+                   "nfs.enable-ino32");
+            goto out;
+        }
+        optbool = ret;
+    }
+    if (nfs->enable_ino32 != optbool) {
+        nfs->enable_ino32 = optbool;
+        gf_msg(GF_NFS, GF_LOG_INFO, 0, NFS_MSG_RECONFIG_ENABLE,
+               "Reconfigured nfs.enable-ino32 with value %d", optbool);
+    }
+
+    /* nfs.nlm is enabled by default */
+    ret = dict_get_str_boolean(options, "nfs.nlm", _gf_true);
+    if (ret < 0) {
+        optbool = _gf_true;
+    } else {
+        optbool = ret;
+    }
+    if (nfs->enable_nlm != optbool) {
+        gf_msg(GF_NFS, GF_LOG_INFO, 0, NFS_MSG_NLM_INFO,
+               "NLM is"
+               " manually %s",
+               (optbool ? "enabled" : "disabled"));
+        nfs->enable_nlm = optbool;
+        nfs_reconfigure_nlm4(this);
+    }
+
+    /* nfs.acl is enabled by default */
+    ret = dict_get_str_boolean(options, "nfs.acl", _gf_true);
+    if (ret < 0) {
+        optbool = _gf_true;
+    } else {
+        optbool = ret;
+    }
+    if (nfs->enable_acl != optbool) {
+        gf_msg(GF_NFS, GF_LOG_INFO, 0, NFS_MSG_ACL_INFO,
+               "ACL is "
+               "manually %s",
+               (optbool ? "enabled" : "disabled"));
+        nfs->enable_acl = optbool;
+        nfs_reconfigure_acl3(this);
+    }
+
+    /* The event thread count is always re-applied, changed or not. */
+    GF_OPTION_RECONF("nfs.event-threads", nfs->event_threads, options, uint32,
+                     out);
+    gf_event_reconfigure_threads(this->ctx->event_pool, nfs->event_threads);
+
+    ret = 0;
+out:
+    return ret;
+}
+
+/*
+ * reconfigure() entry point of the NFS server xlator.
+ *
+ * Pushes the new option dict through every layer in a fixed order: generic
+ * NFS state, NFSv3 state, MOUNT state, the rpcsvc options, the outstanding
+ * RPC limit, portmap registration and finally the DRC. The first layer that
+ * reports a failure is logged and aborts the sequence.
+ *
+ * Returns 0 on success and -1 on invalid arguments or any failure.
+ */
+int
+reconfigure(xlator_t *this, dict_t *options)
+{
+    struct nfs_state *state = NULL;
+    gf_boolean_t pmap_wanted = _gf_true;
+
+    if (!this || !this->private || !options)
+        return -1;
+
+    state = (struct nfs_state *)this->private;
+
+    /* Generic nfs options */
+    if (nfs_reconfigure_state(this, options)) {
+        gf_msg(GF_NFS, GF_LOG_ERROR, 0, NFS_MSG_RECONFIG_FAIL,
+               "nfs reconfigure state failed");
+        return -1;
+    }
+
+    /* nfs3 options */
+    if (nfs3_reconfigure_state(this, options)) {
+        gf_msg(GF_NFS, GF_LOG_ERROR, 0, NFS_MSG_RECONFIG_FAIL,
+               "nfs3 reconfigure state failed");
+        return -1;
+    }
+
+    /* mount options */
+    if (mount_reconfigure_state(this, options)) {
+        gf_msg(GF_NFS, GF_LOG_ERROR, 0, NFS_MSG_RECONFIG_FAIL,
+               "mount reconfigure state failed");
+        return -1;
+    }
+
+    /* rpc layer */
+    if (rpcsvc_reconfigure_options(state->rpcsvc, options)) {
+        gf_msg(GF_NFS, GF_LOG_ERROR, 0, NFS_MSG_RECONFIG_FAIL,
+               "rpcsvc reconfigure options failed");
+        return -1;
+    }
+
+    /* rpc.outstanding-rpc-limit: only a negative result is a failure */
+    if (rpcsvc_set_outstanding_rpc_limit(
+            state->rpcsvc, options, RPCSVC_DEF_NFS_OUTSTANDING_RPC_LIMIT) < 0) {
+        gf_msg(GF_NFS, GF_LOG_ERROR, 0, NFS_MSG_RECONFIG_FAIL,
+               "Failed to reconfigure outstanding-rpc-limit");
+        return -1;
+    }
+
+    /* (Un)register all programs with portmap when the setting flipped;
+     * the outcome of the registration calls is deliberately ignored. */
+    pmap_wanted = rpcsvc_register_portmap_enabled(state->rpcsvc);
+    if (state->register_portmap != pmap_wanted) {
+        state->register_portmap = pmap_wanted;
+        if (pmap_wanted)
+            (void)nfs_program_register_portmap_all(state);
+        else
+            (void)nfs_program_unregister_portmap_all(state);
+    }
+
+    /* drc */
+    if (rpcsvc_drc_reconfigure(state->rpcsvc, options)) {
+        gf_msg(GF_NFS, GF_LOG_ERROR, 0, NFS_MSG_RECONFIG_FAIL,
+               "rpcsvc DRC reconfigure failed");
+        return -1;
+    }
+
+    return 0;
+}
+
+/*
+ * init() entry point of the NFS server xlator. Brings up the NFS v3
+ * protocol together with the protocols it depends on: ACL, MOUNT v3
+ * (mount3), NLM and DRC.
+ *
+ * Call chain inside glusterfsd:
+ *     glusterfs_process_volfp() =>
+ *       glusterfs_graph_activate() =>
+ *         glusterfs_graph_init() =>
+ *           xlator_init() => this routine
+ *
+ * Nothing is torn down here on failure; glusterfsd cleans up the NFS
+ * process through cleanup_and_exit() when init() fails.
+ *
+ * RETURN:
+ *       0 (SUCCESS) when every protocol specific init passed.
+ *      -1 (FAILURE) as soon as one of them fails.
+ */
+int
+init(xlator_t *this)
+{
+    struct nfs_state *state = NULL;
+
+    if (!this)
+        return -1;
+
+    /* Parse the options and allocate the xlator-wide state. */
+    state = nfs_init_state(this);
+    if (!state) {
+        gf_msg(GF_NFS, GF_LOG_ERROR, 0, NFS_MSG_OPT_INIT_FAIL,
+               "Failed to init nfs option");
+        return -1;
+    }
+
+    if (nfs_add_all_initiators(state)) {
+        gf_msg(GF_NFS, GF_LOG_ERROR, 0, NFS_MSG_INIT_FAIL,
+               "Failed to add initiators");
+        return -1;
+    }
+
+    if (nfs_init_subvolumes(state, this->children)) {
+        gf_msg(GF_NFS, GF_LOG_CRITICAL, 0, NFS_MSG_INIT_FAIL,
+               "Failed to init NFS exports");
+        return -1;
+    }
+
+    if (mount_init_state(this)) {
+        gf_msg(GF_NFS, GF_LOG_CRITICAL, 0, NFS_MSG_INIT_FAIL,
+               "Failed to init Mount state");
+        return -1;
+    }
+
+    if (nlm4_init_state(this)) {
+        gf_msg(GF_NFS, GF_LOG_CRITICAL, 0, NFS_MSG_INIT_FAIL,
+               "Failed to init NLM state");
+        return -1;
+    }
+
+    if (nfs_init_versions(state, this)) {
+        gf_msg(GF_NFS, GF_LOG_ERROR, 0, NFS_MSG_INIT_FAIL,
+               "Failed to initialize protocols");
+        return -1;
+    }
+
+    if (nfs_drc_init(this)) {
+        gf_msg(GF_NFS, GF_LOG_ERROR, 0, NFS_MSG_INIT_FAIL,
+               "Failed to initialize DRC");
+        return -1;
+    }
+
+    gf_msg(GF_NFS, GF_LOG_INFO, 0, NFS_MSG_STARTED, "NFS service started");
+    return 0;
+}
+
+/*
+ * notify() entry point of the NFS server xlator.
+ *
+ * - GF_EVENT_CHILD_UP: starts up the subvolume passed in @data.
+ * - GF_EVENT_SOME_DESCENDENT_DOWN / _UP: bumps the generation counter kept
+ *   in the nfs_state.
+ * - GF_EVENT_PARENT_UP: handed over to default_notify().
+ * Every other event is ignored.
+ *
+ * Always returns 0.
+ */
+int
+notify(xlator_t *this, int32_t event, void *data, ...)
+{
+    struct nfs_state *state = NULL;
+
+    gf_msg_trace(GF_NFS, 0, "Notification received: %d", event);
+
+    if (event == GF_EVENT_CHILD_UP) {
+        /* For this event @data carries the subvolume that came up. */
+        nfs_startup_subvolume(this, (xlator_t *)data);
+    } else if (event == GF_EVENT_SOME_DESCENDENT_DOWN ||
+               event == GF_EVENT_SOME_DESCENDENT_UP) {
+        state = this->private;
+        state->generation++;
+    } else if (event == GF_EVENT_PARENT_UP) {
+        default_notify(this, GF_EVENT_PARENT_UP, data);
+    }
+
+    return 0;
+}
+
+/*
+ * fini() entry point of the NFS server xlator.
+ *
+ * Shuts down the MOUNT service, de-initialises every registered protocol
+ * version and releases the instance name that init may have allocated.
+ */
+void
+fini(xlator_t *this)
+{
+    struct nfs_state *nfs = NULL;
+
+    mnt3svc_deinit(this);
+    nfs = (struct nfs_state *)this->private;
+    gf_msg_debug(GF_NFS, 0, "NFS service going down");
+
+    /* this->private is assigned only at the very end of nfs_init_state(),
+     * so it is still NULL when fini() runs after a failed init. Do not
+     * derive &nfs->versions from a NULL pointer in that case. */
+    if (nfs)
+        nfs_deinit_versions(&nfs->versions, this);
+
+    /* Clear the pointer so a later access cannot see freed memory. */
+    GF_FREE(this->instance_name);
+    this->instance_name = NULL;
+}
+
+/*
+ * forget() callback: drops the NFS xlator's context from @inode and frees
+ * the struct nfs_inode_ctx that was stored in it.
+ *
+ * Returns 0 on success, -1 when inode_ctx_del() reports a failure.
+ */
+int32_t
+nfs_forget(xlator_t *this, inode_t *inode)
+{
+    uint64_t raw_ctx = 0;
+
+    if (inode_ctx_del(inode, this, &raw_ctx) != 0)
+        return -1;
+
+    /* The context slot holds the pointer value as a 64 bit integer. */
+    GF_FREE((struct nfs_inode_ctx *)(uintptr_t)raw_ctx);
+
+    return 0;
+}
+
+/*
+ * Tell whether export name @exname refers to volume @volname.
+ *
+ * A single leading '/' of @exname is skipped; the remainder has to match
+ * @volname exactly.
+ */
+gf_boolean_t
+_nfs_export_is_for_vol(char *exname, char *volname)
+{
+    char *name = (exname[0] == '/') ? exname + 1 : exname;
+
+    if (strcmp(name, volname) == 0)
+        return _gf_true;
+
+    return _gf_false;
+}
+
+/*
+ * Report the NFS clients of one volume through @dict.
+ *
+ * The volume is named by the "volname" key of @dict. For every entry of
+ * the mount list whose export belongs to that volume the keys
+ * client<N>.hostname, client<N>.bytesread and client<N>.byteswrite are
+ * written, followed by "clientcount" with the number of entries found.
+ * @brickname is not used.
+ *
+ * Returns 0 on success, otherwise the failing dict call's return value
+ * (-1 when @this or @dict is missing).
+ */
+int
+nfs_priv_to_dict(xlator_t *this, dict_t *dict, char *brickname)
+{
+    char key[1024] = {0};
+    struct mountentry *entry = NULL;
+    struct nfs_state *priv = NULL;
+    char *vol = NULL;
+    int nclients = 0;
+    int ret = -1;
+
+    GF_VALIDATE_OR_GOTO(THIS->name, this, done);
+    GF_VALIDATE_OR_GOTO(THIS->name, dict, done);
+
+    priv = this->private;
+    GF_ASSERT(priv);
+
+    ret = dict_get_str(dict, "volname", &vol);
+    if (ret) {
+        gf_msg(this->name, GF_LOG_ERROR, -ret, NFS_MSG_VOL_NOT_FOUND,
+               "Could not get volname");
+        goto done;
+    }
+
+    list_for_each_entry(entry, &priv->mstate->mountlist, mlist)
+    {
+        /* Only mounts of the requested volume are reported. */
+        if (_nfs_export_is_for_vol(entry->exname, vol) == _gf_false)
+            continue;
+
+        snprintf(key, sizeof(key), "client%d.hostname", nclients);
+        ret = dict_set_str(dict, key, entry->hostname);
+        if (ret) {
+            gf_msg(this->name, GF_LOG_ERROR, 0, NFS_MSG_WRITE_FAIL,
+                   "Error writing hostname to dict");
+            goto done;
+        }
+
+        /* The nfs server keeps no per-connection counters yet, so both
+         * byte counts are reported as 0 to keep the cli from failing.
+         */
+        snprintf(key, sizeof(key), "client%d.bytesread", nclients);
+        ret = dict_set_uint64(dict, key, 0);
+        if (ret) {
+            gf_msg(this->name, GF_LOG_ERROR, 0, NFS_MSG_WRITE_FAIL,
+                   "Error writing bytes read to dict");
+            goto done;
+        }
+
+        snprintf(key, sizeof(key), "client%d.byteswrite", nclients);
+        ret = dict_set_uint64(dict, key, 0);
+        if (ret) {
+            gf_msg(this->name, GF_LOG_ERROR, 0, NFS_MSG_WRITE_FAIL,
+                   "Error writing bytes write to dict");
+            goto done;
+        }
+
+        nclients++;
+    }
+
+    ret = dict_set_int32(dict, "clientcount", nclients);
+    if (ret)
+        gf_msg(this->name, GF_LOG_ERROR, 0, NFS_MSG_WRITE_FAIL,
+               "Error writing client count to dict");
+
+done:
+    gf_msg_debug(THIS->name, 0, "Returning %d", ret);
+    return ret;
+}
+
+/* Statedump routine of NLM; defined outside of this block. */
+extern int32_t
+nlm_priv(xlator_t *this);
+
+/*
+ * priv dump hook of the NFS xlator: dumps the DRC first and NLM second.
+ *
+ * Returns 0 when both dumps succeed, otherwise the non-zero result of the
+ * first dump that failed (NLM is not attempted after a DRC failure).
+ */
+int32_t
+nfs_priv(xlator_t *this)
+{
+    struct nfs_state *state = (struct nfs_state *)(this->private);
+    int32_t status = -1;
+
+    /* DRC works on the global drc structure, the xlator is of no use. */
+    status = rpcsvc_drc_priv(state->rpcsvc->drc);
+    if (status) {
+        gf_msg_debug(this->name, 0, "Statedump of DRC failed");
+        return status;
+    }
+
+    status = nlm_priv(this);
+    if (status)
+        gf_msg_debug(this->name, 0, "Statedump of NLM failed");
+
+    return status;
+}
+
+/*
+ * inode dump hook of the NFS xlator: dumps the inode table of this->next
+ * under the section "xlator.nfs.itable", if there is one.
+ *
+ * Returns -1 when @this is NULL, 0 otherwise.
+ */
+int32_t
+nfs_itable_dump(xlator_t *this)
+{
+    xlator_t *below = NULL;
+
+    if (!this)
+        return -1;
+
+    below = this->next;
+    if (!below || !below->itable)
+        return 0; /* nothing to dump */
+
+    gf_proc_dump_add_section("xlator.nfs.itable");
+    inode_table_dump(below->itable, "xlator.nfs.itable");
+
+    return 0;
+}
+
+/* Callback table of the NFS xlator: only the forget callback is set. */
+struct xlator_cbks cbks = {
+    .forget = nfs_forget,
+};
+
+/* Fop table of the NFS xlator; declared without assigning any handler. */
+struct xlator_fops fops;
+
+/*
+ * Dump hooks of the NFS xlator:
+ *   priv         - nfs_priv(), dumps DRC and NLM state
+ *   priv_to_dict - nfs_priv_to_dict(), reports the clients of a volume
+ *   inode        - nfs_itable_dump(), dumps the inode table
+ */
+struct xlator_dumpops dumpops = {
+    .priv = nfs_priv,
+    .priv_to_dict = nfs_priv_to_dict,
+    .inode = nfs_itable_dump,
+};
+
+/* TODO: If needed, per-volume options below can be extended to be export
+ * specific also because after export-dir is introduced, a volume is not
+ * necessarily an export whereas different subdirectories within that volume
+ * can be and may need these options to be specified separately.
+ */
+struct volume_options options[] = {
+    {.key = {"nfs3.read-size"},
+     .type = GF_OPTION_TYPE_SIZET,
+     .min = GF_NFS3_RTMIN,
+     .max = GF_NFS3_RTMAX,
+     .default_value = TOSTRING(GF_NFS3_RTPREF),
+     .description = "Size in which the client should issue read requests "
+                    "to the Gluster NFSv3 server. Must be a multiple of "
+                    "4KB (4096). Min and Max supported values are 4KB "
+                    "(4096) and 1MB (1048576) respectively. If the "
+                    "specified value is within the supported range but "
+                    "not a multiple of 4096, it is rounded up to the "
+                    "nearest multiple of 4096."},
+    {.key = {"nfs3.write-size"},
+     .type = GF_OPTION_TYPE_SIZET,
+     .min = GF_NFS3_WTMIN,
+     .max = GF_NFS3_WTMAX,
+     .default_value = TOSTRING(GF_NFS3_WTPREF),
+     .description = "Size in which the client should issue write requests "
+                    "to the Gluster NFSv3 server. Must be a multiple of "
+                    "1KB (1024). Min and Max supported values are "
+                    "4KB (4096) and 1MB(1048576) respectively. If the "
+                    "specified value is within the supported range but "
+                    "not a multiple of 4096, it is rounded up to the "
+                    "nearest multiple of 4096."},
+    {.key = {"nfs3.readdir-size"},
+     .type = GF_OPTION_TYPE_SIZET,
+     .min = GF_NFS3_DTMIN,
+     .max = GF_NFS3_DTMAX,
+     .default_value = TOSTRING(GF_NFS3_DTPREF),
+     .description = "Size in which the client should issue directory "
+                    "reading requests to the Gluster NFSv3 server. Must "
+                    "be a multiple of 1KB (1024). Min and Max supported "
+                    "values are 4KB (4096) and 1MB (1048576) respectively."
+                    "If the specified value is within the supported range "
+                    "but not a multiple of 4096, it is rounded up to the "
+                    "nearest multiple of 4096."},
+    {.key = {"nfs3.*.volume-access"},
+     .type = GF_OPTION_TYPE_STR,
+     .value = {"read-only", "read-write"},
+     .default_value = "read-write",
+     .description = "Type of access desired for this subvolume: "
+                    " read-only, read-write(default)"},
+    {.key = {"nfs3.*.trusted-write"},
+     .type = GF_OPTION_TYPE_BOOL,
+     .default_value = "off",
+     .description = "On an UNSTABLE write from client, return STABLE flag"
+                    " to force client to not send a COMMIT request. In "
+                    "some environments, combined with a replicated "
+                    "GlusterFS setup, this option can improve write "
+                    "performance. This flag allows user to trust Gluster"
+                    " replication logic to sync data to the disks and "
+                    "recover when required. COMMIT requests if received "
+                    "will be handled in a default manner by fsyncing."
+                    " STABLE writes are still handled in a sync manner. "
+                    "Off by default."
+
+    },
+    {.key = {"nfs3.*.trusted-sync"},
+     .type = GF_OPTION_TYPE_BOOL,
+     .default_value = "off",
+     .description = "All writes and COMMIT requests are treated as async."
+                    " This implies that no write requests are guaranteed"
+                    " to be on server disks when the write reply is "
+                    "received at the NFS client. Trusted sync includes "
+                    " trusted-write behaviour. Off by default."
+
+    },
+    {.key = {"nfs3.*.export-dir"},
+     .type = GF_OPTION_TYPE_PATH,
+     .default_value = "",
+     .description = "By default, all subvolumes of nfs are exported as "
+                    "individual exports. There are cases where a "
+                    "subdirectory or subdirectories in the volume need to "
+                    "be exported separately. This option can also be used "
+                    "in conjunction with nfs3.export-volumes option to "
+                    "restrict exports only to the subdirectories specified"
+                    " through this option. Must be an absolute path. Along"
+                    " with path allowed list of IPs/hostname can be "
+                    "associated with each subdirectory. If provided "
+                    "connection will allowed only from these IPs. By "
+                    "default connections from all IPs are allowed. "
+                    "Format: <dir>[(hostspec[|hostspec|...])][,...]. Where"
+                    " hostspec can be an IP address, hostname or an IP "
+                    "range in CIDR notation. "
+                    "e.g. /foo(192.168.1.0/24|host1|10.1.1.8),/host2."
+                    " NOTE: Care must be taken while configuring this "
+                    "option as invalid entries and/or unreachable DNS "
+                    "servers can introduce unwanted delay in all the mount"
+                    " calls."},
+    {.key = {"nfs3.export-dirs"},
+     .type = GF_OPTION_TYPE_BOOL,
+     .default_value = "on",
+     .description = "By default, all subvolumes of nfs are exported as "
+                    "individual exports. There are cases where a "
+                    "subdirectory or subdirectories in the volume need to "
+                    "be exported separately. Enabling this option allows "
+                    "any directory on a volume to be exported separately. "
+                    "Directory exports are enabled by default."},
+    {.key = {"nfs3.export-volumes"},
+     .type = GF_OPTION_TYPE_BOOL,
+     .default_value = "on",
+     .description = "Enable or disable exporting whole volumes, instead "
+                    "if used in conjunction with nfs3.export-dir, can "
+                    "allow setting up only subdirectories as exports. On "
+                    "by default."},
+    {.key = {"rpc-auth.auth-unix"},
+     .type = GF_OPTION_TYPE_BOOL,
+     .default_value = "on",
+     .description = "Disable or enable the AUTH_UNIX authentication type. "
+                    "Must always be enabled for better interoperability. "
+                    "However, can be disabled if needed. Enabled by "
+                    "default"},
+    {.key = {"rpc-auth.auth-null"},
+     .type = GF_OPTION_TYPE_BOOL,
+     .default_value = "on",
+     .description = "Disable or enable the AUTH_NULL authentication type. "
+                    "Must always be enabled. This option is here only to"
+                    " avoid unrecognized option warnings"},
+    {.key = {"rpc-auth.auth-unix.*"},
+     .type = GF_OPTION_TYPE_BOOL,
+     .default_value = "on",
+     .description = "Disable or enable the AUTH_UNIX authentication type "
+                    "for a particular exported volume overriding defaults"
+                    " and general setting for AUTH_UNIX scheme. Must "
+                    "always be enabled for better interoperability. "
+                    "However, can be disabled if needed. Enabled by "
+                    "default."},
+    {.key = {"rpc-auth.auth-unix.*.allow"},
+     .type = GF_OPTION_TYPE_STR,
+     .default_value = "on",
+     .description = "Disable or enable the AUTH_UNIX authentication type "
+                    "for a particular exported volume overriding defaults"
+                    " and general setting for AUTH_UNIX scheme. Must "
+                    "always be enabled for better interoperability. "
+                    "However, can be disabled if needed. Enabled by "
+                    "default."},
+    {.key = {"rpc-auth.auth-null.*"},
+     .type = GF_OPTION_TYPE_BOOL,
+     .default_value = "on",
+     .description = "Disable or enable the AUTH_NULL authentication type "
+                    "for a particular exported volume overriding defaults"
+                    " and general setting for AUTH_NULL. Must always be "
+                    "enabled. This option is here only to avoid "
+                    "unrecognized option warnings."},
+    {.key = {"rpc-auth.addr.allow"},
+     .type = GF_OPTION_TYPE_CLIENT_AUTH_ADDR,
+     .default_value = "all",
+     .description = "Allow a comma separated list of addresses and/or"
+                    " hostnames to connect to the server. By default, all"
+                    " connections are allowed. This allows users to "
+                    "define a general rule for all exported volumes."},
+    {.key = {"rpc-auth.addr.reject"},
+     .type = GF_OPTION_TYPE_CLIENT_AUTH_ADDR,
+     .default_value = "none",
+     .description = "Reject a comma separated list of addresses and/or"
+                    " hostnames from connecting to the server. By default,"
+                    " all connections are allowed. This allows users to "
+                    "define a general rule for all exported volumes."},
+    {.key = {"rpc-auth.addr.*.allow"},
+     .type = GF_OPTION_TYPE_CLIENT_AUTH_ADDR,
+     .default_value = "all",
+     .description = "Allow a comma separated list of addresses and/or"
+                    " hostnames to connect to the server. By default, all"
+                    " connections are allowed. This allows users to "
+                    "define a rule for a specific exported volume."},
+    {.key = {"rpc-auth.addr.*.reject"},
+     .type = GF_OPTION_TYPE_CLIENT_AUTH_ADDR,
+     .default_value = "none",
+     .description = "Reject a comma separated list of addresses and/or"
+                    " hostnames from connecting to the server. By default,"
+                    " all connections are allowed. This allows users to "
+                    "define a rule for a specific exported volume."},
+    {.key = {"rpc-auth.ports.insecure"},
+     .type = GF_OPTION_TYPE_BOOL,
+     .default_value = "off",
+     .description = "Allow client connections from unprivileged ports. By "
+                    "default only privileged ports are allowed. This is a "
+                    "global setting in case insecure ports are to be "
+                    "enabled for all exports using a single option."},
+    {.key = {"rpc-auth.ports.*.insecure"},
+     .type = GF_OPTION_TYPE_BOOL,
+     .default_value = "off",
+     .description = "Allow client connections from unprivileged ports. By "
+                    "default only privileged ports are allowed. Use this"
+                    " option to enable or disable insecure ports for "
+                    "a specific subvolume and to override the global "
+                    "setting set by the previous option."},
+    {.key = {"rpc-auth.addr.namelookup"},
+     .type = GF_OPTION_TYPE_BOOL,
+     .default_value = "off",
+     .description = "Users have the option of turning on name lookup for"
+                    " incoming client connections using this option. Use this "
+                    "option to turn on name lookups during address-based "
+                    "authentication. Turning this on will enable you to"
+                    " use hostnames in nfs.rpc-auth-* filters. In some "
+                    "setups, the name server can take too long to reply to DNS "
+                    "queries resulting in timeouts of mount requests. By "
+                    "default, name lookup is off"},
+    {.key = {"nfs.dynamic-volumes"},
+     .type = GF_OPTION_TYPE_BOOL,
+     .default_value = "off",
+     .description = "Internal option set to tell gnfs to use a different"
+                    " scheme for encoding file handles when DVM is being"
+                    " used."},
+    {.key = {"nfs3.*.volume-id"},
+     .type = GF_OPTION_TYPE_STR,
+     .default_value = "",
+     .description = "When nfs.dynamic-volumes is set, gnfs expects every "
+                    "subvolume to have this option set for it, so that "
+                    "gnfs can use this option to identify the volume. "
+                    "If all subvolumes do not have this option set, an "
+                    "error is reported."},
+    {.key = {"nfs.enable-ino32"},
+     .type = GF_OPTION_TYPE_BOOL,
+     .default_value = "no",
+     .description = "For nfs clients or apps that do not support 64-bit "
+                    "inode numbers, use this option to make NFS return "
+                    "32-bit inode numbers instead. Disabled by default, so"
+                    " NFS returns 64-bit inode numbers."},
+    {.key = {"rpc.register-with-portmap"},
+     .type = GF_OPTION_TYPE_BOOL,
+     .default_value = "on",
+     .description = "For systems that need to run multiple nfs servers, "
+                    "only one registration is possible with "
+                    "portmap service. Use this option to turn off portmap "
+                    "registration for Gluster NFS. On by default"},
+    {.key = {"rpc.outstanding-rpc-limit"},
+     .type = GF_OPTION_TYPE_INT,
+     .min = RPCSVC_MIN_OUTSTANDING_RPC_LIMIT,
+     .max = RPCSVC_MAX_OUTSTANDING_RPC_LIMIT,
+     .default_value = TOSTRING(RPCSVC_DEF_NFS_OUTSTANDING_RPC_LIMIT),
+     .description = "Parameter to throttle the number of incoming RPC "
+                    "requests from a client. 0 means no limit (can "
+                    "potentially run out of memory)"},
+    {.key = {"nfs.port"},
+     .type = GF_OPTION_TYPE_INT,
+     .min = 1,
+     .max = 0xffff,
+     .default_value = TOSTRING(GF_NFS3_PORT),
+     .description = "Use this option on systems that need Gluster NFS to "
+                    "be associated with a non-default port number."},
+    {.key = {"nfs.mem-factor"},
+     .type = GF_OPTION_TYPE_INT,
+     .min = 1,
+     .max = 1024,
+     .default_value = TOSTRING(GF_NFS_DEFAULT_MEMFACTOR),
+     .description = "Use this option to make NFS be faster on systems by "
+                    "using more memory. This option specifies a multiple "
+                    "that determines the total amount of memory used. "
+                    "Default value is 15. Increase to use more memory in "
+                    "order to improve performance for certain use cases. "
+                    "Please consult gluster-users list before using this "
+                    "option."},
+    {.key = {"nfs.*.disable"},
+     .type = GF_OPTION_TYPE_BOOL,
+     .default_value = "false",
+     .description = "This option is used to start or stop the NFS server "
+                    "for individual volumes."},
+    {.key = {"nfs.nlm"},
+     .type = GF_OPTION_TYPE_BOOL,
+     .default_value = "on",
+     .description = "This option, if set to 'off', disables NLM server "
+                    "by not registering the service with the portmapper."
+                    " Set it to 'on' to re-enable it. Default value: 'on'"},
+
+    {.key = {"nfs.mount-udp"},
+     .type = GF_OPTION_TYPE_BOOL,
+     .default_value = "off",
+     .description = "set the option to 'on' to enable mountd on UDP. "
+                    "Required for some Solaris and AIX NFS clients. "
+                    "The need for enabling this option often depends "
+                    "on the usage of NLM."},
+    {.key = {"nfs.mount-rmtab"},
+     .type = GF_OPTION_TYPE_PATH,
+     .default_value = NFS_DATADIR "/rmtab",
+     .description = "Set the location of the cache file that is used to "
+                    "list all the NFS-clients that have connected "
+                    "through the MOUNT protocol. If this is on shared "
+                    "storage, all GlusterFS servers will update and "
+                    "output (with 'showmount') the same list. Set to "
+                    "\"/-\" to disable."},
+    {.key = {OPT_SERVER_RPC_STATD},
+     .type = GF_OPTION_TYPE_PATH,
+     .default_value = GF_RPC_STATD_PROG,
+     .description = "The executable of RPC statd utility. "
+                    "Defaults to " GF_RPC_STATD_PROG},
+    {.key = {OPT_SERVER_RPC_STATD_PIDFILE},
+     .type = GF_OPTION_TYPE_PATH,
+     .default_value = GF_RPC_STATD_PIDFILE,
+     .description = "The pid file of RPC statd utility. "
+                    "Defaults to " GF_RPC_STATD_PIDFILE},
+    {.key = {OPT_SERVER_AUX_GIDS},
+     .type = GF_OPTION_TYPE_BOOL,
+     .default_value = "off",
+     .description = "Let the server look up which groups a user belongs "
+                    "to, overwriting the list passed from the client. "
+                    "This enables support for group lists longer than "
+                    "can be passed through the NFS protocol, but is not "
+                    "secure unless users and groups are well synchronized "
+                    "between clients and servers."},
+    {.key = {OPT_SERVER_GID_CACHE_TIMEOUT},
+     .type = GF_OPTION_TYPE_INT,
+     .min = 0,
+     .max = 3600,
+     .default_value = "300",
+     .description = "Number of seconds to cache auxiliary-GID data, "
+                    "when " OPT_SERVER_AUX_GIDS " is set."},
+    {.key = {"nfs.acl"},
+     .type = GF_OPTION_TYPE_BOOL,
+     .default_value = "on",
+     .description = "This option is used to control ACL support for NFS."},
+    {.key = {"nfs.drc"},
+     .type = GF_OPTION_TYPE_STR,
+     .default_value = "off",
+     .description = "Enable Duplicate Request Cache in gNFS server to "
+                    "improve correctness of non-idempotent operations like "
+                    "write, delete, link, et al"},
+    {.key = {"nfs.drc-size"},
+     .type = GF_OPTION_TYPE_INT,
+     .default_value = "0x20000",
+     .description = "Sets the number of non-idempotent "
+                    "requests to cache in drc"},
+    {.key = {"nfs.exports-auth-enable"},
+     .type = GF_OPTION_TYPE_BOOL,
+     .description = "Set the option to 'on' to enable exports/netgroup "
+                    "authentication in the NFS server and mount daemon."},
+
+    {.key = {"nfs.auth-refresh-interval-sec"},
+     .type = GF_OPTION_TYPE_INT,
+     .description = "Frequency in seconds that the daemon should check for"
+                    " changes in the exports/netgroups file."},
+
+    {.key = {"nfs.auth-cache-ttl-sec"},
+     .type = GF_OPTION_TYPE_INT,
+     .description = "Sets the TTL of an entry in the auth cache. Value is "
+                    "in seconds."},
+    {.key = {"nfs.rdirplus"},
+     .type = GF_OPTION_TYPE_BOOL,
+     .default_value = "on",
+     .description = "When this option is set to off NFS falls back to "
+                    "standard readdir instead of readdirp"},
+    {
+        .key = {"nfs.event-threads"},
+        .type = GF_OPTION_TYPE_SIZET,
+        .min = 1,
+        .max = 32,
+        .default_value = "2",
+        .description = "Specifies the number of event threads to execute "
+                       "in parallel. Larger values would help process"
+                       " responses faster, depending on available processing"
+                       " power. Range 1-32 threads.",
+        .op_version = {GD_OP_VERSION_4_0_0},
+        .flags = OPT_FLAG_SETTABLE,
+    },
+    {.key = {NULL}},
+};
+
+xlator_api_t xlator_api = { /* gnfs callbacks, tables and metadata */
+    .identifier = "gnfs",
+    .category = GF_MAINTAINED,
+    .op_version = {1},
+    .options = options, /* the option table defined above */
+    .init = init,
+    .fini = fini,
+    .reconfigure = reconfigure,
+    .notify = notify,
+    .mem_acct_init = mem_acct_init,
+    .fops = &fops,
+    .cbks = &cbks,
+    .dumpops = &dumpops,
+};
diff --git a/xlators/nfs/server/src/nfs.h b/xlators/nfs/server/src/nfs.h
new file mode 100644
index 00000000000..e3daaed17a6
--- /dev/null
+++ b/xlators/nfs/server/src/nfs.h
@@ -0,0 +1,154 @@
+/*
+  Copyright (c) 2010-2011 Gluster, Inc. <http://www.gluster.com>
+  This file is part of GlusterFS.
+
+  This file is licensed to you under your choice of the GNU Lesser
+  General Public License, version 3 or any later version (LGPLv3 or
+  later), or the GNU General Public License, version 2 (GPLv2), in all
+  cases as published by the Free Software Foundation.
+*/
+
+#ifndef __NFS_H__
+#define __NFS_H__
+
+#include "rpcsvc.h"
+#include <glusterfs/dict.h>
+#include <glusterfs/xlator.h>
+#include <glusterfs/lkowner.h>
+#include <glusterfs/gidcache.h>
+
+#define GF_NFS "nfs"
+/* NOTE(review): presumably multiplied by memfactor for op pools — confirm */
+#define GF_NFS_CONCURRENT_OPS_MULT 15
+/* NOTE(review): presumably multiplied by memfactor for inode LRU — confirm */
+#define GF_NFS_INODE_LRU_MULT 6000
+/* Lower and upper bound for an RPC thread count. */
+#define GF_RPC_MIN_THREADS 1
+#define GF_RPC_MAX_THREADS 16
+/* Memory factor: DEFAULT is the default of the nfs.mem-factor option. */
+#define GF_NFS_DEFAULT_MEMFACTOR 15
+#define GF_NFS_MIN_MEMFACTOR 1
+#define GF_NFS_MAX_MEMFACTOR 30 /* NOTE(review): option table allows 1024 */
+/* Values of nfs_state.dynamicvolumes, see gf_nfs_dvm_on()/gf_nfs_dvm_off(). */
+#define GF_NFS_DVM_ON 1
+#define GF_NFS_DVM_OFF 0
+
+/* Disable using the exports file by default */
+#define GF_NFS_DEFAULT_EXPORT_AUTH 0
+/* Defaults, in seconds, for the auth refresh interval and auth cache TTL. */
+#define GF_NFS_DEFAULT_AUTH_REFRESH_INTERVAL_SEC 2
+#define GF_NFS_DEFAULT_AUTH_CACHE_TTL_SEC 300 /* 5 min */
+
+/* Maximum number of group IDs (16) that are sent through an RPC request.
+ * Since NFS is the only one going to set this, we can be safe in keeping
+ * this size hardcoded.
+ */
+#define GF_REQUEST_MAXGROUPS 16
+
+/* Callback into a version-specific NFS protocol.
+ * It takes the nfs xlator and returns an rpcsvc_program_t pointer; the
+ * return value is used by the nfs.c code to register the protocol with
+ * the RPC service. */
+typedef rpcsvc_program_t *(*nfs_version_initer_t)(xlator_t *nfsx);
+
+/* List of version-specific protocol initializers */
+struct nfs_initer_list {
+    struct list_head list;     /* list linkage */
+    nfs_version_initer_t init; /* callback returning the program */
+    rpcsvc_program_t *program; /* NOTE(review): presumably init()'s result */
+    gf_boolean_t required;     /* NOTE(review): presumably "must succeed" */
+};
+
+struct nfs_state { /* gnfs state; gf_nfs_this_private reads it from THIS */
+    rpcsvc_t *rpcsvc;
+    struct list_head versions; /* NOTE(review): presumably nfs_initer_list */
+    struct mount3_state *mstate;
+    struct nfs3_state *nfs3state;
+    struct nlm4_state *nlm4state;
+    struct mem_pool *foppool;
+    unsigned int memfactor; /* NOTE(review): presumably nfs.mem-factor */
+    xlator_list_t *subvols;
+    /* NOTE(review): presumably subvolume bring-up bookkeeping — confirm */
+    gf_lock_t svinitlock;
+    int allsubvols;
+    int upsubvols;
+    xlator_t **initedxl;
+    int subvols_started;
+    int dynamicvolumes; /* GF_NFS_DVM_ON or GF_NFS_DVM_OFF */
+    int enable_ino32;   /* read through gf_nfs_enable_ino32() */
+    unsigned int override_portnum;
+    int allow_insecure;
+    int enable_nlm;
+    int enable_acl;
+    int mount_udp;
+
+    /* Enable exports auth model */
+    int exports_auth;
+    /* Refresh auth params from disk periodically */
+    int refresh_auth;
+    /* Auth timing, in seconds going by the field names. */
+    unsigned int auth_refresh_time_secs;
+    unsigned int auth_cache_ttl_sec;
+
+    char *rmtab;
+    struct rpc_clnt *rpc_clnt;
+    gf_boolean_t server_aux_gids;
+    uint32_t server_aux_gids_max_age;
+    gid_cache_t gid_cache;
+    uint32_t generation;
+    gf_boolean_t register_portmap;
+    char *rpc_statd;
+    char *rpc_statd_pid_file;
+    gf_boolean_t rdirplus;
+    uint32_t event_threads;
+};
+
+struct nfs_inode_ctx { /* NOTE(review): presumably per-inode gnfs context */
+    struct list_head shares; /* list head; element type not visible here */
+    uint32_t generation;     /* NOTE(review): cf. nfs_state.generation */
+};
+
+#define gf_nfs_dvm_on(nfsstt) /* nonzero when dynamicvolumes is DVM_ON */ \
+    (((struct nfs_state *)(nfsstt))->dynamicvolumes == GF_NFS_DVM_ON)
+#define gf_nfs_dvm_off(nfsstt) /* nonzero when dynamicvolumes is DVM_OFF */ \
+    (((struct nfs_state *)(nfsstt))->dynamicvolumes == GF_NFS_DVM_OFF)
+#define __gf_nfs_enable_ino32(nfsstt) /* (nfsstt): safe for any expr */ \
+    (((struct nfs_state *)(nfsstt))->enable_ino32)
+#define gf_nfs_this_private ((struct nfs_state *)(((xlator_t *)THIS)->private))
+#define gf_nfs_enable_ino32() (__gf_nfs_enable_ino32(gf_nfs_this_private))
+
+/* We have one gid more than the glusterfs maximum since we pass the primary
+ * gid as the first element of the array.
+ */
+#define NFS_NGROUPS (GF_REQUEST_MAXGROUPS + 1)
+
+/* Index of the primary gid */
+#define NFS_PRIMGID_IDX 0
+/* User record: uid, gid list, lock owner and an identifier string. */
+typedef struct nfs_user_info {
+    uid_t uid;
+    gid_t gids[NFS_NGROUPS]; /* gids[NFS_PRIMGID_IDX] is the primary gid */
+    int ngrps;               /* NOTE(review): presumably used gids entries */
+    gf_lkowner_t lk_owner;
+    char identifier[UNIX_PATH_MAX]; /* ip of user */
+} nfs_user_t;
+
+extern int /* NOTE(review): presumably fills *newnfu for root — confirm */
+nfs_user_root_create(nfs_user_t *newnfu);
+/* NOTE(review): presumably fills *newnfu from uid, gid, auxgids — confirm */
+extern int
+nfs_user_create(nfs_user_t *newnfu, uid_t uid, gid_t gid,
+                rpc_transport_t *trans, gid_t *auxgids, int auxcount);
+/* NOTE(review): presumably derives *nfu from req's credentials — confirm */
+extern void
+nfs_request_user_init(nfs_user_t *nfu, rpcsvc_request_t *req);
+/* NOTE(review): as above, but with caller-supplied uid and gid — confirm */
+extern void
+nfs_request_primary_user_init(nfs_user_t *nfu, rpcsvc_request_t *req, uid_t uid,
+                              gid_t gid);
+extern int /* NOTE(review): return convention not visible in this header */
+nfs_subvolume_started(struct nfs_state *nfs, xlator_t *xl);
+/* NOTE(review): presumably tied to nfs_state.server_aux_gids — confirm */
+extern void
+nfs_fix_groups(xlator_t *this, call_stack_t *root);
+#endif
diff --git a/xlators/nfs/server/src/nfs3-fh.c b/xlators/nfs/server/src/nfs3-fh.c
new file mode 100644
index 00000000000..caa3cfa6995
--- /dev/null
+++ b/xlators/nfs/server/src/nfs3-fh.c
@@ -0,0 +1,186 @@
+/*
+  Copyright (c) 2010-2011 Gluster, Inc. <http://www.gluster.com>
+  This file is part of GlusterFS.
+
+  This file is licensed to you under your choice of the GNU Lesser
+  General Public License, version 3 or any later version (LGPLv3 or
+  later), or the GNU General Public License, version 2 (GPLv2), in all
+  cases as published by the Free Software Foundation.
+*/
+
+#include "rpcsvc.h"
+#include <glusterfs/dict.h>
+#include <glusterfs/xlator.h>
+#include "xdr-nfs3.h"
+#include "msg-nfs3.h"
+#include <glusterfs/iobuf.h>
+#include "nfs3-fh.h"
+#include "nfs-common.h"
+#include <glusterfs/iatt.h>
+#include <glusterfs/common-utils.h>
+#include "nfs-messages.h"
+
+/* Tell whether fh carries the GlusterFS NFS file-handle signature.
+ *
+ * @fh: handle to inspect; NULL is accepted and reported as invalid.
+ *
+ * Returns 1 when the four ident bytes match GF_NFSFH_IDENT0..3 in order,
+ * and 0 for a NULL handle or any mismatching byte. Only the ident bytes
+ * are examined; exportid, gfid and mountid are not checked.
+ */
+int
+nfs3_fh_validate(struct nfs3_fh *fh)
+{
+    if (fh == NULL)
+        return 0;
+
+    /* && short-circuits, so the bytes are compared in index order. */
+    return (fh->ident[0] == GF_NFSFH_IDENT0) &&
+           (fh->ident[1] == GF_NFSFH_IDENT1) &&
+           (fh->ident[2] == GF_NFSFH_IDENT2) &&
+           (fh->ident[3] == GF_NFSFH_IDENT3);
+}
+
+/* Stamp fh with the GF_NFSFH_IDENT0..3 bytes and buf's gfid; no-op when
+ * either pointer is NULL. exportid, mountid and padding are not modified. */
+void
+nfs3_fh_init(struct nfs3_fh *fh, struct iatt *buf)
+{
+    if (fh && buf) {
+        fh->ident[0] = GF_NFSFH_IDENT0;
+        fh->ident[1] = GF_NFSFH_IDENT1;
+        fh->ident[2] = GF_NFSFH_IDENT2;
+        fh->ident[3] = GF_NFSFH_IDENT3;
+        gf_uuid_copy(fh->gfid, buf->ia_gfid);
+    }
+}
+
+/* Build the root file handle for a volume identified by subvolume index.
+ *
+ * The value returned by nfs_xlator_to_xlid(cl, xl) is stored in the last
+ * exportid byte; ident is set and gfid is 00..01. If cl or xl is NULL the
+ * all-zero handle is returned instead. The handle is returned by value.
+ */
+struct nfs3_fh
+nfs3_fh_build_indexed_root_fh(xlator_list_t *cl, xlator_t *xl)
+{
+    static uuid_t rootgfid = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1};
+    struct iatt rootstat = {0};
+    struct nfs3_fh rootfh = {{0}};
+
+    if (cl && xl) {
+        gf_uuid_copy(rootstat.ia_gfid, rootgfid);
+        nfs3_fh_init(&rootfh, &rootstat);
+        rootfh.exportid[15] = nfs_xlator_to_xlid(cl, xl);
+    }
+    return rootfh;
+}
+
+/* Build the root file handle for a volume identified by UUID.
+ *
+ * volumeid is copied into exportid and mountid into mountid; ident is set
+ * and gfid is 00..01. The arguments are not checked for NULL. The handle
+ * is returned by value. */
+struct nfs3_fh
+nfs3_fh_build_uuid_root_fh(uuid_t volumeid, uuid_t mountid)
+{
+    static uuid_t rootgfid = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1};
+    struct iatt rootstat = {0};
+    struct nfs3_fh rootfh = {{0}};
+
+    gf_uuid_copy(rootstat.ia_gfid, rootgfid);
+    nfs3_fh_init(&rootfh, &rootstat);
+    gf_uuid_copy(rootfh.mountid, mountid);
+    gf_uuid_copy(rootfh.exportid, volumeid);
+    return rootfh;
+}
+
+/* Tell whether fh refers to the volume root.
+ * Returns 1 when fh->gfid equals the root gfid (fifteen zero bytes, then
+ * 1) and 0 otherwise, including for a NULL fh. */
+int
+nfs3_fh_is_root_fh(struct nfs3_fh *fh)
+{
+    static uuid_t root = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1};
+
+    if (fh == NULL)
+        return 0;
+
+    return (gf_uuid_compare(fh->gfid, root) == 0) ? 1 : 0;
+}
+
+/* Format fh's exportid, gfid and mountid (as rendered by uuid_utoa_r())
+ * into str via snprintf(str, len, ...). Does nothing when fh or str is
+ * NULL. */
+void
+nfs3_fh_to_str(struct nfs3_fh *fh, char *str, size_t len)
+{
+    char ids[3][GF_UUID_BUF_SIZE]; /* exportid, gfid, mountid text */
+
+    if (fh && str)
+        snprintf(str, len, "FH: exportid %s, gfid %s, mountid %s",
+                 uuid_utoa_r(fh->exportid, ids[0]),
+                 uuid_utoa_r(fh->gfid, ids[1]),
+                 uuid_utoa_r(fh->mountid, ids[2]));
+}
+
+/* Log fh's exportid and gfid through gf_msg_trace() under the "nfs3-fh"
+ * name. Does nothing for a NULL fh. */
+void
+nfs3_log_fh(struct nfs3_fh *fh)
+{
+    char idstr[2][512]; /* [0] exportid text, [1] gfid text */
+
+    if (fh != NULL) {
+        gf_msg_trace("nfs3-fh", 0, "filehandle: exportid 0x%s, gfid 0x%s",
+                     uuid_utoa_r(fh->exportid, idstr[0]),
+                     uuid_utoa_r(fh->gfid, idstr[1]));
+    }
+}
+
+/* Parent handle: gfid from newstat, exportid from child. 0 ok, -1 if NULL. */
+int
+nfs3_fh_build_parent_fh(struct nfs3_fh *child, struct iatt *newstat,
+                        struct nfs3_fh *newfh)
+{
+    if (!(child && newstat && newfh))
+        return -1;
+    nfs3_fh_init(newfh, newstat);
+    gf_uuid_copy(newfh->exportid, child->exportid); /* mountid not copied */
+    return 0; /* NOTE(review): build_child_fh copies mountid — confirm */
+}
+
+/* Fill newfh for inode: ident bytes, gfid = inode->gfid, exportid as given.
+ * mountid is not written. Returns 0, or -1 if inode or newfh is NULL. */
+int
+nfs3_build_fh(inode_t *inode, uuid_t exportid, struct nfs3_fh *newfh)
+{
+    if ((inode == NULL) || (newfh == NULL))
+        return -1;
+    gf_uuid_copy(newfh->exportid, exportid);
+    gf_uuid_copy(newfh->gfid, inode->gfid);
+    newfh->ident[0] = GF_NFSFH_IDENT0;
+    newfh->ident[1] = GF_NFSFH_IDENT1;
+    newfh->ident[2] = GF_NFSFH_IDENT2;
+    newfh->ident[3] = GF_NFSFH_IDENT3;
+    return 0;
+}
+
+/* Child handle: gfid from newstat, exportid and mountid from parent. */
+int
+nfs3_fh_build_child_fh(struct nfs3_fh *parent, struct iatt *newstat,
+                       struct nfs3_fh *newfh)
+{
+    if (!(parent && newstat && newfh))
+        return -1; /* any NULL argument is an error */
+    nfs3_fh_init(newfh, newstat);
+    gf_uuid_copy(newfh->mountid, parent->mountid);
+    gf_uuid_copy(newfh->exportid, parent->exportid);
+    return 0; /* success */
+}
+
+uint32_t /* byte size of the non-padding part of struct nfs3_fh */
+nfs3_fh_compute_size(void) /* (void): a real prototype, no arguments */
+{
+    return GF_NFSFH_STATIC_SIZE; /* ident bytes + three uuids */
+}
diff --git a/xlators/nfs/server/src/nfs3-fh.h b/xlators/nfs/server/src/nfs3-fh.h
new file mode 100644
index 00000000000..cd7916b892d
--- /dev/null
+++ b/xlators/nfs/server/src/nfs3-fh.h
@@ -0,0 +1,101 @@
+/*
+  Copyright (c) 2010-2011 Gluster, Inc. <http://www.gluster.com>
+  This file is part of GlusterFS.
+
+  This file is licensed to you under your choice of the GNU Lesser
+  General Public License, version 3 or any later version (LGPLv3 or
+  later), or the GNU General Public License, version 2 (GPLv2), in all
+  cases as published by the Free Software Foundation.
+*/
+
+#ifndef _NFS_FH_H_
+#define _NFS_FH_H_
+
+#include <glusterfs/xlator.h>
+#include "xdr-nfs3.h"
+#include <glusterfs/iatt.h>
+#include <sys/types.h>
+#include <glusterfs/compat-uuid.h>
+
+/* BIG FAT WARNING: The file handle code is tightly coupled to NFSv3 file
+ * handles for now. This will change if and when we need v4. */
+#define GF_NFSFH_IDENT0 ':' /* IDENT0..3 together spell ":OGL" */
+#define GF_NFSFH_IDENT1 'O'
+#define GF_NFSFH_IDENT2 'G'
+#define GF_NFSFH_IDENT3 'L'
+#define GF_NFSFH_IDENT_SIZE (sizeof(char) * 4)
+#define GF_NFSFH_STATIC_SIZE (GF_NFSFH_IDENT_SIZE + (3 * sizeof(uuid_t)))
+/* Last exportid byte as an index; the argument is parenthesised for safety. */
+#define nfs3_fh_exportid_to_index(exprtid) ((uint16_t)(exprtid)[15])
+/* ATTENTION: Any change in the size of the structure below must be reflected
+ * in GF_NFSFH_STATIC_SIZE.
+ */
+struct nfs3_fh {
+    /* Used to ensure that a bunch of bytes are actually a GlusterFS NFS
+     * file handle. Should contain ":OGL".
+     */
+    char ident[4];
+
+    /* UUID that identifies an export. The value stored in exportid
+     * depends on the usage of gluster nfs. If the DVM is enabled using
+     * the nfs.dynamic-volumes option then exportid will contain the UUID
+     * of the volume so that gnfs is able to identify volumes uniquely
+     * through volume additions, deletions, migrations, etc.
+     *
+     * When not using DVM, exportid contains the index of the volume
+     * based on the position of the volume in the list of subvolumes
+     * for gnfs (stored in the last byte, exportid[15]).
+     */
+    uuid_t exportid;
+
+    /* File/dir gfid. */
+    uuid_t gfid;
+    uuid_t mountid; /* set for uuid root handles, inherited by children */
+    /* This structure must be exactly NFS3_FHSIZE (64) bytes long.
+       Having the structure shorter results in buffer overflows
+       during XDR decoding.
+    */
+    unsigned char padding[NFS3_FHSIZE - GF_NFSFH_STATIC_SIZE];
+} __attribute__((__packed__));
+
+#define GF_NFS3FH_STATIC_INITIALIZER /* brace form nfs3-fh.c uses for fh */ \
+    {                                                                          \
+        {0}, /* first member zeroed; the rest are zeroed implicitly */         \
+    }
+
+extern uint32_t /* returns GF_NFSFH_STATIC_SIZE */
+nfs3_fh_compute_size(void);
+/* NOTE(review): no definition of this one in nfs3-fh.c — confirm it is used. */
+extern uint16_t
+nfs3_fh_hash_entry(uuid_t gfid);
+/* 1 if fh is non-NULL and starts with the ":OGL" ident bytes, else 0. */
+extern int
+nfs3_fh_validate(struct nfs3_fh *fh);
+/* Root handle whose last exportid byte is nfs_xlator_to_xlid(cl, xl). */
+extern struct nfs3_fh
+nfs3_fh_build_indexed_root_fh(xlator_list_t *cl, xlator_t *xl);
+/* 1 if fh's gfid is the root gfid (00..01), else 0. */
+extern int
+nfs3_fh_is_root_fh(struct nfs3_fh *fh);
+/* Child handle: gfid from newstat, exportid and mountid from parent. */
+extern int
+nfs3_fh_build_child_fh(struct nfs3_fh *parent, struct iatt *newstat,
+                       struct nfs3_fh *newfh);
+/* Trace-log fh's exportid and gfid. */
+extern void
+nfs3_log_fh(struct nfs3_fh *fh);
+/* Format fh's exportid, gfid and mountid into str (snprintf with len). */
+extern void
+nfs3_fh_to_str(struct nfs3_fh *fh, char *str, size_t len);
+/* Parent handle: gfid from newstat, exportid from child. */
+extern int
+nfs3_fh_build_parent_fh(struct nfs3_fh *child, struct iatt *newstat,
+                        struct nfs3_fh *newfh);
+/* Root handle with exportid = volumeid and mountid = mountid. */
+extern struct nfs3_fh
+nfs3_fh_build_uuid_root_fh(uuid_t volumeid, uuid_t mountid);
+/* Handle for inode: gfid from inode->gfid plus the given exportid. */
+extern int
+nfs3_build_fh(inode_t *inode, uuid_t exportid, struct nfs3_fh *newfh);
+
+#endif
diff --git a/xlators/nfs/server/src/nfs3-helpers.c b/xlators/nfs/server/src/nfs3-helpers.c
new file mode 100644
index 00000000000..897fb42b071
--- /dev/null
+++ b/xlators/nfs/server/src/nfs3-helpers.c
@@ -0,0 +1,3917 @@
+/*
+  Copyright (c) 2010-2011 Gluster, Inc. <http://www.gluster.com>
+  This file is part of GlusterFS.
+
+  This file is licensed to you under your choice of the GNU Lesser
+  General Public License, version 3 or any later version (LGPLv3 or
+  later), or the GNU General Public License, version 2 (GPLv2), in all
+  cases as published by the Free Software Foundation.
+*/
+
+#include <inttypes.h>
+
+#include <glusterfs/xlator.h>
+#include "nfs3.h"
+#include "nfs3-fh.h"
+#include "msg-nfs3.h"
+#include <glusterfs/rbthash.h>
+#include "nfs-fops.h"
+#include "nfs-inodes.h"
+#include "nfs-generics.h"
+#include "nfs3-helpers.h"
+#include "nfs-mem-types.h"
+#include <glusterfs/iatt.h>
+#include <glusterfs/common-utils.h>
+#include "nfs-messages.h"
+#include "mount3.h"
+#include <string.h>
+
+extern int
+nfs3_set_root_looked_up(struct nfs3_state *nfs3, struct nfs3_fh *rootfh);
+
+extern int
+nfs3_is_root_looked_up(struct nfs3_state *nfs3, struct nfs3_fh *rootfh);
+
+/* Invoke the resume callback stored in a call state.
+ * Does nothing when the call state pointer is NULL or when it carries no
+ * resume_fn, so it is safe to use on partially set-up states.
+ */
+#define nfs3_call_resume(cst)                                                  \
+    do {                                                                       \
+        if (((cst)) && (cst)->resume_fn)                                       \
+            (cst)->resume_fn(cst);                                             \
+    } while (0)
+
+/* Mark the call state as failed with ESTALE (resolve_ret = -1,
+ * resolve_errno = ESTALE) and then run its resume callback.
+ * Unlike nfs3_call_resume(), the argument is dereferenced unconditionally,
+ * so it must not be NULL.
+ */
+#define nfs3_call_resume_estale(csta)                                          \
+    do {                                                                       \
+        (csta)->resolve_ret = -1;                                              \
+        (csta)->resolve_errno = ESTALE;                                        \
+        nfs3_call_resume(csta);                                                \
+    } while (0)
+
+/* One row of the status-to-message table: an NFSv3 status code paired with
+ * its human-readable description (stored inline, at most 99 characters plus
+ * the terminating NUL).
+ */
+struct nfs3stat_strerror {
+    nfsstat3 stat;
+    char strerror[100];
+};
+
+/* Lookup table used by nfsstat3_strerror().
+ * The final NFS3ERR_END_OF_LIST row is a sentinel: the lookup scans until it
+ * reaches that row, and its message ("IO Error") is what is returned for any
+ * status code that has no row of its own. Keep it as the last entry.
+ */
+struct nfs3stat_strerror nfs3stat_strerror_table[] = {
+    {NFS3_OK, "Call completed successfully."},
+    {NFS3ERR_PERM, "Not owner"},
+    {NFS3ERR_NOENT, "No such file or directory"},
+    {NFS3ERR_IO, "I/O error"},
+    {NFS3ERR_NXIO, "I/O error"},
+    {NFS3ERR_ACCES, "Permission denied"},
+    {NFS3ERR_EXIST, "File exists"},
+    {NFS3ERR_XDEV, "Attempt to do a cross-device hard link"},
+    {NFS3ERR_NODEV, "No such device"},
+    {NFS3ERR_NOTDIR, "Not a directory"},
+    {NFS3ERR_ISDIR, "Is a directory"},
+    {NFS3ERR_INVAL, "Invalid argument for operation"},
+    {NFS3ERR_FBIG, "File too large"},
+    {NFS3ERR_NOSPC, "No space left on device"},
+    {NFS3ERR_ROFS, "Read-only file system"},
+    {NFS3ERR_MLINK, "Too many hard links"},
+    {NFS3ERR_NAMETOOLONG, "Filename in operation was too long"},
+    {NFS3ERR_NOTEMPTY, "Directory not empty"},
+    {NFS3ERR_DQUOT, "Resource (quota) hard limit exceeded"},
+    {NFS3ERR_STALE, "Invalid file handle"},
+    {NFS3ERR_REMOTE, "Too many levels of remote in path"},
+    {NFS3ERR_BADHANDLE, "Illegal NFS file handle"},
+    {NFS3ERR_NOT_SYNC, "Update synchronization mismatch detected"},
+    {NFS3ERR_BAD_COOKIE, "READDIR or READDIRPLUS cookie is stale"},
+    {NFS3ERR_NOTSUPP, "Operation is not supported"},
+    {NFS3ERR_TOOSMALL, "Buffer or request is too small"},
+    {NFS3ERR_SERVERFAULT, "Error occurred on the server or IO Error"},
+    {NFS3ERR_BADTYPE, "Type not supported by the server"},
+    {NFS3ERR_JUKEBOX, "Cannot complete server initiated request"},
+    /* Sentinel / fallback row -- must stay last. */
+    {NFS3ERR_END_OF_LIST, "IO Error"},
+
+};
+
+/* Pick the inode number to report to NFS clients for the given iatt.
+ *
+ * Returns 0 for a NULL iatt. When 32-bit inode mode is enabled the result is
+ * the gfid hash truncated to 32 bits; otherwise it is the iatt's own ia_ino.
+ */
+uint64_t
+nfs3_iatt_gfid_to_ino(struct iatt *buf)
+{
+    if (buf == NULL)
+        return 0;
+
+    /* 32-bit inode mode: derive the number from the gfid instead. */
+    if (gf_nfs_enable_ino32())
+        return (uint32_t)nfs_hash_gfid(buf->ia_gfid);
+
+    /* from posix its guaranteed to send unique ino */
+    return buf->ia_ino;
+}
+
+/* Overwrite the device id (ia_dev) of an iatt with the supplied value.
+ * A NULL iatt is silently ignored.
+ */
+void
+nfs3_map_deviceid_to_statdev(struct iatt *ia, uint64_t deviceid)
+{
+    if (ia != NULL)
+        ia->ia_dev = deviceid;
+}
+
+/* Copy the opaque bytes of a wire file handle into a struct nfs3_fh.
+ *
+ * The length comes from the decoded request, so it is clamped to the size of
+ * the destination to keep an oversized handle from overflowing the stack
+ * copy. The destination is zeroed first, so a short or missing handle yields
+ * zero-filled trailing bytes rather than uninitialized stack contents.
+ *
+ * Returns the extracted handle by value.
+ */
+struct nfs3_fh
+nfs3_extract_nfs3_fh(nfs_fh3 fh)
+{
+    struct nfs3_fh gfh;
+    size_t len = fh.data.data_len;
+
+    memset(&gfh, 0, sizeof(gfh));
+
+    /* Nothing to copy from: hand back the all-zero handle. */
+    if (!fh.data.data_val)
+        return gfh;
+
+    /* Never copy more than the destination can hold. */
+    if (len > sizeof(gfh))
+        len = sizeof(gfh);
+
+    memcpy(&gfh, fh.data.data_val, len);
+    return gfh;
+}
+
+/* Return the directory file handle carried in LOOKUP arguments
+ * (args->what.dir), converted via nfs3_extract_nfs3_fh().
+ */
+struct nfs3_fh
+nfs3_extract_lookup_fh(lookup3args *args)
+{
+    return nfs3_extract_nfs3_fh(args->what.dir);
+}
+
+/* Return the entry name carried in LOOKUP arguments. The pointer refers to
+ * the string stored in args (args->what.name); no copy is made.
+ */
+char *
+nfs3_extract_lookup_name(lookup3args *args)
+{
+    return args->what.name;
+}
+
+/* Translate a POSIX errno value into the NFSv3 status code to send back.
+ *
+ * 0 maps to NFS3_OK. ENOTSUP and ENOSYS both map to NFS3ERR_NOTSUPP,
+ * ENOTCONN is reported as NFS3ERR_IO, and any errno without an explicit
+ * mapping becomes NFS3ERR_SERVERFAULT.
+ */
+nfsstat3
+nfs3_errno_to_nfsstat3(int errnum)
+{
+    switch (errnum) {
+        case 0:
+            return NFS3_OK;
+        case EPERM:
+            return NFS3ERR_PERM;
+        case ENOENT:
+            return NFS3ERR_NOENT;
+        case EACCES:
+            return NFS3ERR_ACCES;
+        case EEXIST:
+            return NFS3ERR_EXIST;
+        case EXDEV:
+            return NFS3ERR_XDEV;
+        case ENODEV:
+            return NFS3ERR_NODEV;
+        case EIO:
+            return NFS3ERR_IO;
+        case ENXIO:
+            return NFS3ERR_NXIO;
+        case ENOTDIR:
+            return NFS3ERR_NOTDIR;
+        case EISDIR:
+            return NFS3ERR_ISDIR;
+        case EINVAL:
+            return NFS3ERR_INVAL;
+        case ENOSPC:
+            return NFS3ERR_NOSPC;
+        case EROFS:
+            return NFS3ERR_ROFS;
+        case EFBIG:
+            return NFS3ERR_FBIG;
+        case EMLINK:
+            return NFS3ERR_MLINK;
+        case ENAMETOOLONG:
+            return NFS3ERR_NAMETOOLONG;
+        case ENOTEMPTY:
+            return NFS3ERR_NOTEMPTY;
+        case EFAULT:
+            return NFS3ERR_SERVERFAULT;
+        case ENOTSUP:
+        case ENOSYS:
+            return NFS3ERR_NOTSUPP;
+        case EBADF:
+            return NFS3ERR_BADTYPE;
+        case ESTALE:
+            return NFS3ERR_STALE;
+        case ENOTCONN:
+            return NFS3ERR_IO;
+        case EDQUOT:
+            return NFS3ERR_DQUOT;
+        default:
+            break;
+    }
+
+    /* No dedicated NFSv3 status for this errno. */
+    return NFS3ERR_SERVERFAULT;
+}
+
+/*
+ * Derive the NFSv3 status for a fop callback result.
+ *
+ * A failed call (op_ret == -1) that carries no errno (op_errno == 0) means
+ * something went wrong in the layers that produced the reply; that case is
+ * reported as NFS3ERR_SERVERFAULT. Every other combination is translated
+ * from op_errno by nfs3_errno_to_nfsstat3().
+ */
+nfsstat3
+nfs3_cbk_errno_status(int32_t op_ret, int32_t op_errno)
+{
+    int failed_without_errno = (op_ret == -1) && (op_errno == 0);
+
+    if (failed_without_errno)
+        return NFS3ERR_SERVERFAULT;
+
+    return nfs3_errno_to_nfsstat3(op_errno);
+}
+
+/* Fill a LOOKUP reply for the failure case: the reply is zeroed and only the
+ * status is recorded.
+ *
+ * dirstat is only tested for NULL; its contents are never copied into the
+ * reply. The memset already leaves attributes_follow at zero, so the
+ * explicit FALSE assignment just states the intent.
+ */
+void
+nfs3_fill_lookup3res_error(lookup3res *res, nfsstat3 stat, struct iatt *dirstat)
+{
+    memset(res, 0, sizeof(*res));
+    res->status = stat;
+    if (!dirstat) {
+        res->lookup3res_u.resfail.dir_attributes.attributes_follow = FALSE;
+    }
+}
+
+/* Convert a GlusterFS iatt into an NFSv3 fattr3.
+ *
+ * buf: source attributes.
+ * fa:  destination; see the note on pre-zeroing below.
+ *
+ * If either pointer is NULL, errno is set to EINVAL and nothing is written.
+ *
+ * NOTE: fa->mode is only OR-ed into and fa->type is left untouched when the
+ * iatt type matches none of the known kinds, so the caller is expected to
+ * pass a zeroed fattr3.
+ */
+void
+nfs3_stat_to_fattr3(struct iatt *buf, fattr3 *fa)
+{
+    if (buf == NULL || fa == NULL) {
+        errno = EINVAL;
+        return;
+    }
+
+    /* File type. */
+    if (IA_ISDIR(buf->ia_type))
+        fa->type = NF3DIR;
+    else if (IA_ISREG(buf->ia_type))
+        fa->type = NF3REG;
+    else if (IA_ISCHR(buf->ia_type))
+        fa->type = NF3CHR;
+    else if (IA_ISBLK(buf->ia_type))
+        fa->type = NF3BLK;
+    else if (IA_ISFIFO(buf->ia_type))
+        fa->type = NF3FIFO;
+    else if (IA_ISLNK(buf->ia_type))
+        fa->type = NF3LNK;
+    else if (IA_ISSOCK(buf->ia_type))
+        fa->type = NF3SOCK;
+
+    /* Owner permission bits. */
+    if (IA_PROT_RUSR(buf->ia_prot))
+        fa->mode |= NFS3MODE_ROWNER;
+    if (IA_PROT_WUSR(buf->ia_prot))
+        fa->mode |= NFS3MODE_WOWNER;
+    if (IA_PROT_XUSR(buf->ia_prot))
+        fa->mode |= NFS3MODE_XOWNER;
+
+    /* Group permission bits. */
+    if (IA_PROT_RGRP(buf->ia_prot))
+        fa->mode |= NFS3MODE_RGROUP;
+    if (IA_PROT_WGRP(buf->ia_prot))
+        fa->mode |= NFS3MODE_WGROUP;
+    if (IA_PROT_XGRP(buf->ia_prot))
+        fa->mode |= NFS3MODE_XGROUP;
+
+    /* Other permission bits. */
+    if (IA_PROT_ROTH(buf->ia_prot))
+        fa->mode |= NFS3MODE_ROTHER;
+    if (IA_PROT_WOTH(buf->ia_prot))
+        fa->mode |= NFS3MODE_WOTHER;
+    if (IA_PROT_XOTH(buf->ia_prot))
+        fa->mode |= NFS3MODE_XOTHER;
+
+    /* setuid / setgid / sticky. */
+    if (IA_PROT_SUID(buf->ia_prot))
+        fa->mode |= NFS3MODE_SETXUID;
+    if (IA_PROT_SGID(buf->ia_prot))
+        fa->mode |= NFS3MODE_SETXGID;
+    if (IA_PROT_STCKY(buf->ia_prot))
+        fa->mode |= NFS3MODE_SAVESWAPTXT;
+
+    fa->nlink = buf->ia_nlink;
+    fa->uid = buf->ia_uid;
+    fa->gid = buf->ia_gid;
+    fa->size = buf->ia_size;
+    /* Bytes used on disk: block count scaled by 512. */
+    fa->used = (buf->ia_blocks * 512);
+
+    /* Device numbers are only reported for character and block devices. */
+    if ((IA_ISCHR(buf->ia_type) || IA_ISBLK(buf->ia_type))) {
+        fa->rdev.specdata1 = ia_major(buf->ia_rdev);
+        fa->rdev.specdata2 = ia_minor(buf->ia_rdev);
+    } else {
+        fa->rdev.specdata1 = 0;
+        fa->rdev.specdata2 = 0;
+    }
+
+    fa->fsid = buf->ia_dev;
+    fa->fileid = nfs3_iatt_gfid_to_ino(buf);
+
+    fa->atime.seconds = buf->ia_atime;
+    fa->atime.nseconds = buf->ia_atime_nsec;
+
+    fa->ctime.seconds = buf->ia_ctime;
+    fa->ctime.nseconds = buf->ia_ctime_nsec;
+
+    fa->mtime.seconds = buf->ia_mtime;
+    fa->mtime.nseconds = buf->ia_mtime_nsec;
+}
+
+/* Build an NFSv3 post_op_attr from an iatt.
+ *
+ * Returns a zeroed post_op_attr with attributes_follow == FALSE when buf is
+ * NULL or when it is a zero-filled stat; otherwise the converted attributes
+ * with attributes_follow == TRUE.
+ */
+post_op_attr
+nfs3_stat_to_post_op_attr(struct iatt *buf)
+{
+    post_op_attr poa = {
+        0,
+    };
+
+    if (!buf)
+        return poa;
+
+    /* Some performance translators return zero-filled stats when they
+     * do not have up-to-date attributes. Such stats must not be sent to
+     * the client as if they were real attributes.
+     */
+    if (gf_is_zero_filled_stat(buf)) {
+        poa.attributes_follow = FALSE;
+        return poa;
+    }
+
+    nfs3_stat_to_fattr3(buf, &(poa.post_op_attr_u.attributes));
+    poa.attributes_follow = TRUE;
+
+    return poa;
+}
+
+/* Build an NFSv3 pre_op_attr (size, mtime, ctime) from an iatt.
+ *
+ * Returns a zeroed pre_op_attr with attributes_follow == FALSE when pre is
+ * NULL or when it is a zero-filled stat; otherwise the size and the two
+ * timestamps with attributes_follow == TRUE.
+ */
+pre_op_attr
+nfs3_stat_to_pre_op_attr(struct iatt *pre)
+{
+    pre_op_attr poa = {
+        0,
+    };
+
+    /* Same NULL guard as nfs3_stat_to_post_op_attr(), so the fields below
+     * are never read through a NULL pointer.
+     */
+    if (!pre)
+        return poa;
+
+    /* Some performance translators return zero-filled stats when they
+     * do not have up-to-date attributes. Need to handle this by not
+     * returning these zeroed out attrs.
+     */
+    poa.attributes_follow = FALSE;
+    if (gf_is_zero_filled_stat(pre))
+        goto out;
+
+    poa.attributes_follow = TRUE;
+    poa.pre_op_attr_u.attributes.size = pre->ia_size;
+    poa.pre_op_attr_u.attributes.mtime.seconds = pre->ia_mtime;
+    poa.pre_op_attr_u.attributes.mtime.nseconds = pre->ia_mtime_nsec;
+    poa.pre_op_attr_u.attributes.ctime.seconds = pre->ia_ctime;
+    poa.pre_op_attr_u.attributes.ctime.nseconds = pre->ia_ctime_nsec;
+
+out:
+    return poa;
+}
+
+/* Fill the success arm of a LOOKUP reply.
+ *
+ * res:        reply to populate (not zeroed here).
+ * stat:       status to record.
+ * fh:         handle of the looked-up object; when non-NULL the reply points
+ *             at it directly (no copy) and records the handle size.
+ * buf:        attributes of the object.
+ * postparent: attributes of the parent directory.
+ */
+void
+nfs3_fill_lookup3res_success(lookup3res *res, nfsstat3 stat, struct nfs3_fh *fh,
+                             struct iatt *buf, struct iatt *postparent)
+{
+    res->status = stat;
+
+    if (fh != NULL) {
+        res->lookup3res_u.resok.object.data.data_val = (void *)fh;
+        res->lookup3res_u.resok.object.data.data_len = nfs3_fh_compute_size();
+    }
+
+    res->lookup3res_u.resok.obj_attributes = nfs3_stat_to_post_op_attr(buf);
+    res->lookup3res_u.resok.dir_attributes = nfs3_stat_to_post_op_attr(
+        postparent);
+}
+
+/* Fill a complete LOOKUP reply.
+ *
+ * The reply is zeroed, both iatts get their device id replaced by deviceid,
+ * and then either the success or the error arm is populated depending on
+ * whether stat is NFS3_OK.
+ */
+void
+nfs3_fill_lookup3res(lookup3res *res, nfsstat3 stat, struct nfs3_fh *newfh,
+                     struct iatt *buf, struct iatt *postparent,
+                     uint64_t deviceid)
+{
+    memset(res, 0, sizeof(*res));
+
+    nfs3_map_deviceid_to_statdev(buf, deviceid);
+    nfs3_map_deviceid_to_statdev(postparent, deviceid);
+
+    if (stat == NFS3_OK) {
+        nfs3_fill_lookup3res_success(res, stat, newfh, buf, postparent);
+        return;
+    }
+
+    nfs3_fill_lookup3res_error(res, stat, postparent);
+}
+
+/* Return the object file handle carried in GETATTR arguments (args->object),
+ * converted via nfs3_extract_nfs3_fh().
+ */
+struct nfs3_fh
+nfs3_extract_getattr_fh(getattr3args *args)
+{
+    return nfs3_extract_nfs3_fh(args->object);
+}
+
+/* Fill a GETATTR reply.
+ *
+ * The reply is zeroed and the status recorded. Only for NFS3_OK are the
+ * attributes filled in, after replacing the iatt's device id with deviceid.
+ */
+void
+nfs3_fill_getattr3res(getattr3res *res, nfsstat3 stat, struct iatt *buf,
+                      uint64_t deviceid)
+{
+    memset(res, 0, sizeof(*res));
+    res->status = stat;
+
+    if (stat == NFS3_OK) {
+        nfs3_map_deviceid_to_statdev(buf, deviceid);
+        nfs3_stat_to_fattr3(buf, &(res->getattr3res_u.resok.obj_attributes));
+    }
+}
+
+/* Return the filesystem-root file handle carried in FSINFO arguments
+ * (args->fsroot), converted via nfs3_extract_nfs3_fh().
+ */
+struct nfs3_fh
+nfs3_extract_fsinfo_fh(fsinfo3args *args)
+{
+    return nfs3_extract_nfs3_fh(args->fsroot);
+}
+
+/* Fill an FSINFO reply.
+ *
+ * nfs3:     server state supplying the read, write and readdir sizes.
+ * res:      reply to populate; always zeroed first.
+ * status:   status to record; anything other than NFS3_OK ends the function
+ *           right after the status is stored.
+ * fsroot:   attributes of the filesystem root; its device id is overwritten
+ *           with deviceid before conversion.
+ * deviceid: device id to report.
+ */
+void
+nfs3_fill_fsinfo3res(struct nfs3_state *nfs3, fsinfo3res *res, nfsstat3 status,
+                     struct iatt *fsroot, uint64_t deviceid)
+{
+    fsinfo3resok resok = {
+        {0},
+    };
+    nfstime3 tdelta = GF_NFS3_TIMEDELTA_SECS;
+
+    memset(res, 0, sizeof(*res));
+    res->status = status;
+    if (status != NFS3_OK)
+        return;
+
+    nfs3_map_deviceid_to_statdev(fsroot, deviceid);
+    resok.obj_attributes = nfs3_stat_to_post_op_attr(fsroot);
+    /* Maximum and preferred transfer sizes are reported as the same value. */
+    resok.rtmax = nfs3->readsize;
+    resok.rtpref = nfs3->readsize;
+    resok.rtmult = GF_NFS3_RTMULT;
+    resok.wtmax = nfs3->writesize;
+    resok.wtpref = nfs3->writesize;
+    resok.wtmult = GF_NFS3_WTMULT;
+    resok.dtpref = nfs3->readdirsize;
+    resok.maxfilesize = GF_NFS3_MAXFILESIZE;
+    resok.time_delta = tdelta;
+    resok.properties = GF_NFS3_FS_PROP;
+
+    res->fsinfo3res_u.resok = resok;
+}
+
+/* Prepare LOOKUP arguments: zero the structure, then point the directory
+ * handle's data_val at fh and the name at the caller's string.
+ * Neither is copied, and data_len is left at zero by the memset.
+ */
+void
+nfs3_prep_lookup3args(lookup3args *args, struct nfs3_fh *fh, char *name)
+{
+    memset(args, 0, sizeof(*args));
+    args->what.dir.data.data_val = (void *)fh;
+    args->what.name = name;
+}
+
+/* Prepare GETATTR arguments: zero the structure, then point the object
+ * handle's data_val at fh (no copy; data_len stays zero).
+ */
+void
+nfs3_prep_getattr3args(getattr3args *args, struct nfs3_fh *fh)
+{
+    memset(args, 0, sizeof(*args));
+    args->object.data.data_val = (void *)fh;
+}
+
+/* Prepare FSINFO arguments: zero the structure, then point the fsroot
+ * handle's data_val at root (no copy; data_len stays zero).
+ */
+void
+nfs3_prep_fsinfo3args(fsinfo3args *args, struct nfs3_fh *root)
+{
+    memset(args, 0, sizeof(*args));
+    args->fsroot.data.data_val = (void *)root;
+}
+
+/* Return the human-readable message for an NFSv3 status code.
+ *
+ * The table is scanned up to its NFS3ERR_END_OF_LIST sentinel; a code with
+ * no row of its own gets the sentinel's message. The returned pointer refers
+ * to storage inside the static table.
+ */
+char *
+nfsstat3_strerror(int stat)
+{
+    struct nfs3stat_strerror *row = nfs3stat_strerror_table;
+
+    /* Stop on the first matching row, or on the sentinel if none matches. */
+    while ((row->stat != NFS3ERR_END_OF_LIST) && (row->stat != stat))
+        row++;
+
+    return row->strerror;
+}
+
+/* Prepare ACCESS arguments: zero the structure, then point the object
+ * handle's data_val at fh (no copy; data_len stays zero).
+ */
+void
+nfs3_prep_access3args(access3args *args, struct nfs3_fh *fh)
+{
+    memset(args, 0, sizeof(*args));
+    args->object.data.data_val = (void *)fh;
+}
+
+/* rwx-style permission bit values used on the POSIX side of the NFSv3
+ * ACCESS bit conversions in this file.
+ */
+#define POSIX_READ 4
+#define POSIX_WRITE 2
+#define POSIX_EXEC 1
+
+/* Convert POSIX r/w/x permission bits into the NFSv3 ACCESS3_* bits they
+ * grant: read -> READ; write -> MODIFY, EXTEND and DELETE; exec -> EXECUTE
+ * and LOOKUP.
+ */
+uint32_t
+nfs3_accessbits(int32_t accbits)
+{
+    uint32_t granted = 0;
+
+    /* lookup on directory allowed only in case of execute permission */
+    if (accbits & POSIX_EXEC)
+        granted |= (ACCESS3_EXECUTE | ACCESS3_LOOKUP);
+
+    if (accbits & POSIX_WRITE)
+        granted |= (ACCESS3_MODIFY | ACCESS3_EXTEND | ACCESS3_DELETE);
+
+    if (accbits & POSIX_READ)
+        granted |= ACCESS3_READ;
+
+    return granted;
+}
+
+/* Convert the ACCESS3_* bits of a client request into the POSIX r/w/x bits
+ * that have to be checked: READ -> read; any of MODIFY, EXTEND or DELETE ->
+ * write; EXECUTE or LOOKUP -> exec.
+ */
+uint32_t
+nfs3_request_to_accessbits(int32_t accbits)
+{
+    uint32_t wanted = 0;
+
+    /* For lookup on directory check for execute permission */
+    if (accbits & (ACCESS3_EXECUTE | ACCESS3_LOOKUP))
+        wanted |= POSIX_EXEC;
+
+    if (accbits & (ACCESS3_MODIFY | ACCESS3_EXTEND | ACCESS3_DELETE))
+        wanted |= POSIX_WRITE;
+
+    if (accbits & ACCESS3_READ)
+        wanted |= POSIX_READ;
+
+    return wanted;
+}
+/* Fill an ACCESS reply.
+ *
+ * The reply is zeroed and the status recorded. For NFS3_OK, the POSIX bits
+ * in accbits are converted to ACCESS3_* bits and masked with reqaccbits, so
+ * the reply never reports a right the client did not ask about.
+ */
+void
+nfs3_fill_access3res(access3res *res, nfsstat3 status, int32_t accbits,
+                     int32_t reqaccbits)
+{
+    memset(res, 0, sizeof(*res));
+    res->status = status;
+
+    /* do not answer what was not asked */
+    if (status == NFS3_OK)
+        res->access3res_u.resok.access = nfs3_accessbits(accbits) & reqaccbits;
+}
+
+/* Prepare READDIR arguments: zero the structure, then point the directory
+ * handle's data_val at fh (no copy; data_len stays zero).
+ */
+void
+nfs3_prep_readdir3args(readdir3args *ra, struct nfs3_fh *fh)
+{
+    memset(ra, 0, sizeof(*ra));
+    ra->dir.data.data_val = (void *)fh;
+}
+
+/* Return 1 when entry is exactly ".", 0 otherwise (including for NULL). */
+int
+nfs3_is_dot_entry(char *entry)
+{
+    return (entry != NULL) && (strcmp(entry, ".") == 0);
+}
+
+/* Return 1 when entry is exactly "..", 0 otherwise (including for NULL). */
+int
+nfs3_is_parentdir_entry(char *entry)
+{
+    return (entry != NULL) && (strcmp(entry, "..") == 0);
+}
+
+/* For a directory listing of the root, force the inode number of the "."
+ * and ".." entries to 1 (both ent->d_ino and ent->d_stat.ia_ino).
+ * Entries of non-root directories, and all other names, are left untouched.
+ * A NULL entry or directory handle is ignored.
+ */
+void
+nfs3_funge_root_dotdot_dirent(gf_dirent_t *ent, struct nfs3_fh *dfh)
+{
+    if ((!ent) || (!dfh))
+        return;
+
+    /* ".." of the root. */
+    if (nfs3_fh_is_root_fh(dfh) && nfs3_is_parentdir_entry(ent->d_name)) {
+        ent->d_ino = 1;
+        ent->d_stat.ia_ino = 1;
+    }
+
+    /* "." of the root. */
+    if (nfs3_fh_is_root_fh(dfh) && nfs3_is_dot_entry(ent->d_name)) {
+        ent->d_ino = 1;
+        ent->d_stat.ia_ino = 1;
+    }
+}
+
+/* Build a heap-allocated READDIR entry3 from a directory entry.
+ *
+ * entry: source dirent; its d_ino is rewritten as a side effect.
+ * dfh:   handle of the directory being listed.
+ *
+ * Returns the new entry (with its own copy of the name), or NULL when an
+ * argument is NULL or an allocation fails.
+ */
+entry3 *
+nfs3_fill_entry3(gf_dirent_t *entry, struct nfs3_fh *dfh)
+{
+    entry3 *e3 = NULL;
+    int namelen = 0;
+
+    if (entry == NULL || dfh == NULL)
+        return NULL;
+
+    e3 = GF_CALLOC(1, sizeof(*e3), gf_nfs_mt_entry3);
+    if (e3 == NULL)
+        return NULL;
+
+    gf_msg_trace(GF_NFS3, 0, "Entry: %s", entry->d_name);
+
+    /* If the entry is . or .., we need to replace the physical ino and gen
+     * with 1 and 0 respectively if the directory is root. This funging is
+     * needed because there is no parent directory of the root. In that
+     * sense the behavior we provide is similar to the output of the
+     * command: "stat /.."
+     */
+    entry->d_ino = nfs3_iatt_gfid_to_ino(&entry->d_stat);
+    nfs3_funge_root_dotdot_dirent(entry, dfh);
+
+    e3->fileid = entry->d_ino;
+    e3->cookie = entry->d_off;
+
+    namelen = strlen(entry->d_name);
+    e3->name = GF_MALLOC(namelen + 1, gf_nfs_mt_char);
+    if (e3->name == NULL) {
+        GF_FREE(e3);
+        return NULL;
+    }
+
+    /* Copy the name together with its terminating NUL. */
+    memcpy(e3->name, entry->d_name, namelen + 1);
+
+    return e3;
+}
+
+/* Make a post_op_fh3 refer to the given file handle.
+ *
+ * Sets handle_follows, points data_val at fh (no copy) and records the
+ * handle size. Does nothing when either argument is NULL.
+ */
+void
+nfs3_fill_post_op_fh3(struct nfs3_fh *fh, post_op_fh3 *pfh)
+{
+    if (fh == NULL || pfh == NULL)
+        return;
+
+    pfh->handle_follows = 1;
+    pfh->post_op_fh3_u.handle.data.data_val = (void *)fh;
+    pfh->post_op_fh3_u.handle.data.data_len = nfs3_fh_compute_size();
+}
+
+/* Return a post_op_fh3 that owns a heap copy of the given file handle.
+ *
+ * On success handle_follows is set, data_val points at a freshly allocated
+ * copy of *fh, and data_len holds the handle size. The copy must be released
+ * by whoever consumes the result.
+ *
+ * When fh is NULL or the allocation fails, an all-zero post_op_fh3 is
+ * returned. handle_follows stays 0 in that case, so the result never claims
+ * to carry a handle while holding a NULL one.
+ */
+post_op_fh3
+nfs3_fh_to_post_op_fh3(struct nfs3_fh *fh)
+{
+    post_op_fh3 pfh = {
+        0,
+    };
+    char *fhp = NULL;
+
+    if (!fh)
+        return pfh;
+
+    fhp = GF_MALLOC(sizeof(*fh), gf_nfs_mt_char);
+    if (!fhp)
+        return pfh;
+
+    memcpy(fhp, fh, sizeof(*fh));
+    /* Sets handle_follows together with data_val and data_len. */
+    nfs3_fill_post_op_fh3((struct nfs3_fh *)fhp, &pfh);
+    return pfh;
+}
+
+/* Build a heap-allocated READDIRPLUS entryp3 from a directory entry.
+ *
+ * entry: source dirent; its d_ino and d_stat.ia_dev are rewritten as a
+ *        side effect.
+ * dirfh: handle of the directory being listed; the child handle is derived
+ *        from it.
+ * devid: device id to report in the entry's attributes.
+ *
+ * Returns the new entry (with its own copy of the name and a heap copy of
+ * the child handle), or NULL when an argument is NULL or when allocating the
+ * entry or its name fails.
+ *
+ * NOTE(review): the return value of nfs3_fh_build_child_fh() is not checked.
+ */
+entryp3 *
+nfs3_fill_entryp3(gf_dirent_t *entry, struct nfs3_fh *dirfh, uint64_t devid)
+{
+    entryp3 *ent = NULL;
+    struct nfs3_fh newfh = {
+        {0},
+    };
+    int name_len = 0;
+
+    if ((!entry) || (!dirfh))
+        return NULL;
+
+    /* If the entry is . or .., we need to replace the physical ino and gen
+     * with 1 and 0 respectively if the directory is root. This funging is
+     * needed because there is no parent directory of the root. In that
+     * sense the behavior we provide is similar to the output of the
+     * command: "stat /.."
+     */
+    entry->d_ino = nfs3_iatt_gfid_to_ino(&entry->d_stat);
+    nfs3_funge_root_dotdot_dirent(entry, dirfh);
+    gf_msg_trace(GF_NFS3, 0, "Entry: %s, ino: %" PRIu64, entry->d_name,
+                 entry->d_ino);
+    ent = GF_CALLOC(1, sizeof(*ent), gf_nfs_mt_entryp3);
+    if (!ent)
+        return NULL;
+
+    ent->fileid = entry->d_ino;
+    ent->cookie = entry->d_off;
+    name_len = strlen(entry->d_name);
+    ent->name = GF_MALLOC(name_len + 1, gf_nfs_mt_char);
+    if (!ent->name) {
+        GF_FREE(ent);
+        ent = NULL;
+        goto err;
+    }
+    strcpy(ent->name, entry->d_name);
+    ent->name[name_len] = '\0';
+
+    /* The child handle is built before the device id in d_stat is replaced. */
+    nfs3_fh_build_child_fh(dirfh, &entry->d_stat, &newfh);
+    nfs3_map_deviceid_to_statdev(&entry->d_stat, devid);
+    /* *
+     * In tier volume, the readdirp send only to cold subvol
+     * which will populate in the 'T' file entries in the result.
+     * For such files an explicit stat call is required, by setting
+     * following argument client will perform the same.
+     *
+     * The inode value for 'T' files and directory is NULL, so just
+     * skip the check if it is directory.
+     */
+    if (!(IA_ISDIR(entry->d_stat.ia_type)) && (entry->inode == NULL))
+        ent->name_attributes.attributes_follow = FALSE;
+    else
+        ent->name_attributes = nfs3_stat_to_post_op_attr(&entry->d_stat);
+
+    /* Heap copy of the child handle; freed by nfs3_free_readdirp3res(). */
+    ent->name_handle = nfs3_fh_to_post_op_fh3(&newfh);
+err:
+    return ent;
+}
+
+/* Fill a READDIR reply.
+ *
+ * res:      reply to populate; always zeroed first.
+ * stat:     status to record; anything other than NFS3_OK ends the function
+ *           right after the status is stored.
+ * dirfh:    handle of the directory being listed.
+ * cverf:    cookie verifier, copied byte-for-byte into the reply.
+ * dirstat:  attributes of the directory; its device id is overwritten.
+ * entries:  list head of the dirents; the head itself is skipped and is
+ *           dereferenced unconditionally, so it must not be NULL on success.
+ * count:    size budget; entries are appended while the running estimate
+ *           stays below it.
+ * is_eof:   stored as the reply's eof flag.
+ * deviceid: device id to report.
+ *
+ * The entry list built here is heap-allocated; nfs3_free_readdir3res()
+ * releases it. The loop stops early if an entry cannot be allocated.
+ */
+void
+nfs3_fill_readdir3res(readdir3res *res, nfsstat3 stat, struct nfs3_fh *dirfh,
+                      uint64_t cverf, struct iatt *dirstat,
+                      gf_dirent_t *entries, count3 count, int is_eof,
+                      uint64_t deviceid)
+{
+    post_op_attr dirattr;
+    entry3 *ent = NULL;
+    entry3 *headentry = NULL;
+    entry3 *preventry = NULL;
+    count3 filled = 0;
+    gf_dirent_t *listhead = NULL;
+
+    memset(res, 0, sizeof(*res));
+    res->status = stat;
+    if (stat != NFS3_OK)
+        return;
+
+    nfs3_map_deviceid_to_statdev(dirstat, deviceid);
+    dirattr = nfs3_stat_to_post_op_attr(dirstat);
+    res->readdir3res_u.resok.dir_attributes = dirattr;
+    res->readdir3res_u.resok.reply.eof = (bool_t)is_eof;
+    memcpy(res->readdir3res_u.resok.cookieverf, &cverf, sizeof(cverf));
+
+    /* Start the size estimate at the fixed overhead of the reply. */
+    filled = NFS3_READDIR_RESOK_SIZE;
+    /* First entry is just the list head */
+    listhead = entries;
+    entries = entries->next;
+    while (((entries) && (entries != listhead)) && (filled < count)) {
+        /*
+        if ((strcmp (entries->d_name, ".") == 0) ||
+            (strcmp (entries->d_name, "..") == 0))
+                goto nextentry;
+                */
+        ent = nfs3_fill_entry3(entries, dirfh);
+        if (!ent)
+            break;
+
+        if (!headentry)
+            headentry = ent;
+
+        /* Append to the singly linked output list. */
+        if (preventry) {
+            preventry->nextentry = ent;
+            preventry = ent;
+        } else
+            preventry = ent;
+
+        filled += NFS3_ENTRY3_FIXED_SIZE + strlen(ent->name);
+        // nextentry:
+        entries = entries->next;
+    }
+
+    res->readdir3res_u.resok.reply.entries = headentry;
+
+    return;
+}
+
+/* Fill a READDIRPLUS reply.
+ *
+ * res:      reply to populate; always zeroed first.
+ * stat:     status to record; anything other than NFS3_OK ends the function
+ *           right after the status is stored.
+ * dirfh:    handle of the directory being listed.
+ * cverf:    cookie verifier, copied byte-for-byte into the reply.
+ * dirstat:  attributes of the directory; its device id is overwritten.
+ * entries:  list head of the dirents; the head itself is skipped and is
+ *           dereferenced unconditionally, so it must not be NULL on success.
+ * dircount: not used by this function.
+ * maxcount: size budget; entries are appended while the running estimate
+ *           stays below it.
+ * is_eof:   stored as the reply's eof flag.
+ * deviceid: device id to report.
+ *
+ * The entry list built here is heap-allocated; nfs3_free_readdirp3res()
+ * releases it. The loop stops early if an entry cannot be allocated.
+ */
+void
+nfs3_fill_readdirp3res(readdirp3res *res, nfsstat3 stat, struct nfs3_fh *dirfh,
+                       uint64_t cverf, struct iatt *dirstat,
+                       gf_dirent_t *entries, count3 dircount, count3 maxcount,
+                       int is_eof, uint64_t deviceid)
+{
+    post_op_attr dirattr;
+    entryp3 *ent = NULL;
+    entryp3 *headentry = NULL;
+    entryp3 *preventry = NULL;
+    count3 filled = 0;
+    gf_dirent_t *listhead = NULL;
+    int fhlen = 0;
+
+    memset(res, 0, sizeof(*res));
+    res->status = stat;
+    if (stat != NFS3_OK)
+        return;
+
+    nfs3_map_deviceid_to_statdev(dirstat, deviceid);
+    dirattr = nfs3_stat_to_post_op_attr(dirstat);
+    res->readdirp3res_u.resok.dir_attributes = dirattr;
+    res->readdirp3res_u.resok.reply.eof = (bool_t)is_eof;
+    memcpy(res->readdirp3res_u.resok.cookieverf, &cverf, sizeof(cverf));
+
+    /* Start the size estimate at the fixed overhead of the reply. */
+    filled = NFS3_READDIR_RESOK_SIZE;
+    /* First entry is just the list head */
+    listhead = entries;
+    entries = entries->next;
+    while (((entries) && (entries != listhead)) && (filled < maxcount)) {
+        /* Linux does not display . and .. entries unless we provide
+         * these entries here.
+         */
+        /*                if ((strcmp (entries->d_name, ".") == 0) ||
+                            (strcmp (entries->d_name, "..") == 0))
+                                goto nextentry;
+                                */
+        ent = nfs3_fill_entryp3(entries, dirfh, deviceid);
+        if (!ent)
+            break;
+
+        if (!headentry)
+            headentry = ent;
+
+        /* Append to the singly linked output list. */
+        if (preventry) {
+            preventry->nextentry = ent;
+            preventry = ent;
+        } else
+            preventry = ent;
+
+        /* Each entry costs its fixed part plus handle and name bytes. */
+        fhlen = ent->name_handle.post_op_fh3_u.handle.data.data_len;
+        filled += NFS3_ENTRYP3_FIXED_SIZE + fhlen + strlen(ent->name);
+        // nextentry:
+        entries = entries->next;
+    }
+
+    res->readdirp3res_u.resok.reply.entries = headentry;
+
+    return;
+}
+
+/* Prepare READDIRPLUS arguments: zero the structure, then point the
+ * directory handle's data_val at fh (no copy; data_len stays zero).
+ */
+void
+nfs3_prep_readdirp3args(readdirp3args *ra, struct nfs3_fh *fh)
+{
+    memset(ra, 0, sizeof(*ra));
+    ra->dir.data.data_val = (void *)fh;
+}
+
+/* Release the entry list of a READDIRPLUS reply: for every entry, free its
+ * name, its file-handle copy and the entry itself. A NULL reply is ignored.
+ * The reply structure itself is not freed, and its entries pointer is left
+ * as it was.
+ */
+void
+nfs3_free_readdirp3res(readdirp3res *res)
+{
+    entryp3 *cur = NULL;
+    entryp3 *following = NULL;
+
+    if (res == NULL)
+        return;
+
+    for (cur = res->readdirp3res_u.resok.reply.entries; cur != NULL;
+         cur = following) {
+        /* Grab the successor before the node is released. */
+        following = cur->nextentry;
+        GF_FREE(cur->name);
+        GF_FREE(cur->name_handle.post_op_fh3_u.handle.data.data_val);
+        GF_FREE(cur);
+    }
+}
+
+/* Release the entry list of a READDIR reply: for every entry, free its name
+ * and the entry itself. A NULL reply is ignored. The reply structure itself
+ * is not freed, and its entries pointer is left as it was.
+ */
+void
+nfs3_free_readdir3res(readdir3res *res)
+{
+    entry3 *cur = NULL;
+    entry3 *following = NULL;
+
+    if (res == NULL)
+        return;
+
+    for (cur = res->readdir3res_u.resok.reply.entries; cur != NULL;
+         cur = following) {
+        /* Grab the successor before the node is released. */
+        following = cur->nextentry;
+        GF_FREE(cur->name);
+        GF_FREE(cur);
+    }
+}
+
+/* Prepare FSSTAT arguments: zero the structure, then point the fsroot
+ * handle's data_val at fh (no copy; data_len stays zero).
+ */
+void
+nfs3_prep_fsstat3args(fsstat3args *args, struct nfs3_fh *fh)
+{
+    memset(args, 0, sizeof(*args));
+    args->fsroot.data.data_val = (char *)fh;
+}
+
+/* Fill an FSSTAT reply from statvfs data.
+ *
+ * res:      reply to populate; always zeroed first.
+ * stat:     status to record; anything other than NFS3_OK ends the function
+ *           right after the status is stored.
+ * fsbuf:    filesystem statistics; must not be NULL on success.
+ * postbuf:  attributes of the filesystem root; its device id is overwritten
+ *           with deviceid before conversion.
+ * deviceid: device id to report.
+ *
+ * Byte counts are fragment size times block count. The fragment size is
+ * widened to size3 before multiplying so the product is computed in the
+ * reply's width and cannot wrap where the statvfs fields are narrower.
+ */
+void
+nfs3_fill_fsstat3res(fsstat3res *res, nfsstat3 stat, struct statvfs *fsbuf,
+                     struct iatt *postbuf, uint64_t deviceid)
+{
+    fsstat3resok resok;
+    size3 frsize = 0;
+
+    memset(res, 0, sizeof(*res));
+    res->status = stat;
+    if (stat != NFS3_OK)
+        return;
+
+    memset(&resok, 0, sizeof(resok));
+
+    nfs3_map_deviceid_to_statdev(postbuf, deviceid);
+    resok.obj_attributes = nfs3_stat_to_post_op_attr(postbuf);
+
+    /* Widen first, multiply second. */
+    frsize = (size3)fsbuf->f_frsize;
+    resok.tbytes = frsize * (size3)fsbuf->f_blocks;
+    resok.fbytes = frsize * (size3)fsbuf->f_bfree;
+    resok.abytes = frsize * (size3)fsbuf->f_bavail;
+    resok.tfiles = (size3)(fsbuf->f_files);
+    resok.ffiles = (size3)(fsbuf->f_ffree);
+    resok.afiles = (size3)(fsbuf->f_favail);
+    resok.invarsec = 0;
+
+    res->fsstat3res_u.resok = resok;
+}
+
+/* Translate an NFSv3 sattr3 into GlusterFS setattr inputs.
+ *
+ * sattr: requested attribute changes; NULL yields 0.
+ * buf:   optional iatt that receives the new values (protection bits, uid,
+ *        gid, size, atime, mtime) for every field the request sets.
+ * omode: optional mode_t that receives the same permission bits in classic
+ *        S_I* form; written only when the request sets the mode.
+ *
+ * Returns the bitmask of GF_SET_ATTR_* flags for the fields the request
+ * asks to change.
+ *
+ * NOTE(review): for client-supplied times only the seconds field is copied;
+ * the nanoseconds part of the request is not propagated here.
+ */
+int32_t
+nfs3_sattr3_to_setattr_valid(sattr3 *sattr, struct iatt *buf, mode_t *omode)
+{
+    int32_t valid = 0;
+    ia_prot_t prot = {
+        0,
+    };
+    mode_t mode = 0;
+
+    if (!sattr)
+        return 0;
+
+    if (sattr->mode.set_it) {
+        valid |= GF_SET_ATTR_MODE;
+
+        /* Each NFS3 mode bit is mirrored into both representations:
+         * the mode_t value and the ia_prot_t bitfields.
+         */
+        if (sattr->mode.set_mode3_u.mode & NFS3MODE_ROWNER) {
+            mode |= S_IRUSR;
+            prot.owner.read = 1;
+        }
+        if (sattr->mode.set_mode3_u.mode & NFS3MODE_WOWNER) {
+            mode |= S_IWUSR;
+            prot.owner.write = 1;
+        }
+        if (sattr->mode.set_mode3_u.mode & NFS3MODE_XOWNER) {
+            mode |= S_IXUSR;
+            prot.owner.exec = 1;
+        }
+
+        if (sattr->mode.set_mode3_u.mode & NFS3MODE_RGROUP) {
+            mode |= S_IRGRP;
+            prot.group.read = 1;
+        }
+        if (sattr->mode.set_mode3_u.mode & NFS3MODE_WGROUP) {
+            mode |= S_IWGRP;
+            prot.group.write = 1;
+        }
+        if (sattr->mode.set_mode3_u.mode & NFS3MODE_XGROUP) {
+            mode |= S_IXGRP;
+            prot.group.exec = 1;
+        }
+
+        if (sattr->mode.set_mode3_u.mode & NFS3MODE_ROTHER) {
+            mode |= S_IROTH;
+            prot.other.read = 1;
+        }
+        if (sattr->mode.set_mode3_u.mode & NFS3MODE_WOTHER) {
+            mode |= S_IWOTH;
+            prot.other.write = 1;
+        }
+        if (sattr->mode.set_mode3_u.mode & NFS3MODE_XOTHER) {
+            mode |= S_IXOTH;
+            prot.other.exec = 1;
+        }
+
+        if (sattr->mode.set_mode3_u.mode & NFS3MODE_SETXUID) {
+            mode |= S_ISUID;
+            prot.suid = 1;
+        }
+        if (sattr->mode.set_mode3_u.mode & NFS3MODE_SETXGID) {
+            mode |= S_ISGID;
+            prot.sgid = 1;
+        }
+        if (sattr->mode.set_mode3_u.mode & NFS3MODE_SAVESWAPTXT) {
+            mode |= S_ISVTX;
+            prot.sticky = 1;
+        }
+
+        if (buf)
+            buf->ia_prot = prot;
+        /* Create fop still requires the old mode_t style argument. */
+        if (omode)
+            *omode = mode;
+    }
+
+    if (sattr->uid.set_it) {
+        valid |= GF_SET_ATTR_UID;
+        if (buf)
+            buf->ia_uid = sattr->uid.set_uid3_u.uid;
+    }
+
+    if (sattr->gid.set_it) {
+        valid |= GF_SET_ATTR_GID;
+        if (buf)
+            buf->ia_gid = sattr->gid.set_gid3_u.gid;
+    }
+
+    if (sattr->size.set_it) {
+        valid |= GF_SET_ATTR_SIZE;
+        if (buf)
+            buf->ia_size = sattr->size.set_size3_u.size;
+    }
+
+    /* atime: either the client's value or the server's current time. */
+    if (sattr->atime.set_it == SET_TO_CLIENT_TIME) {
+        valid |= GF_SET_ATTR_ATIME;
+        if (buf)
+            buf->ia_atime = sattr->atime.set_atime_u.atime.seconds;
+    }
+
+    if (sattr->atime.set_it == SET_TO_SERVER_TIME) {
+        valid |= GF_SET_ATTR_ATIME;
+        if (buf)
+            buf->ia_atime = gf_time();
+    }
+
+    /* mtime: either the client's value or the server's current time. */
+    if (sattr->mtime.set_it == SET_TO_CLIENT_TIME) {
+        valid |= GF_SET_ATTR_MTIME;
+        if (buf)
+            buf->ia_mtime = sattr->mtime.set_mtime_u.mtime.seconds;
+    }
+
+    if (sattr->mtime.set_it == SET_TO_SERVER_TIME) {
+        valid |= GF_SET_ATTR_MTIME;
+        if (buf)
+            buf->ia_mtime = gf_time();
+    }
+
+    return valid;
+}
+
+/* Build NFSv3 weak cache consistency data from pre- and post-operation
+ * attributes. Either side whose iatt is NULL is left zeroed in the result.
+ */
+wcc_data
+nfs3_stat_to_wcc_data(struct iatt *pre, struct iatt *post)
+{
+    wcc_data wcc = {
+        {0},
+    };
+
+    if (post != NULL) {
+        wcc.after = nfs3_stat_to_post_op_attr(post);
+    }
+
+    if (pre != NULL) {
+        wcc.before = nfs3_stat_to_pre_op_attr(pre);
+    }
+
+    return wcc;
+}
+
+/* Fill a CREATE reply.
+ *
+ * The reply is zeroed and the status recorded; anything other than NFS3_OK
+ * ends the function there. On success the reply refers to newfh (no copy),
+ * carries the new object's attributes and the parent directory's wcc data.
+ * All three iatts get their device id replaced by deviceid.
+ */
+void
+nfs3_fill_create3res(create3res *res, nfsstat3 stat, struct nfs3_fh *newfh,
+                     struct iatt *newbuf, struct iatt *preparent,
+                     struct iatt *postparent, uint64_t deviceid)
+{
+    memset(res, 0, sizeof(*res));
+    res->status = stat;
+    if (stat != NFS3_OK)
+        return;
+
+    /* The new object: handle and attributes. */
+    nfs3_fill_post_op_fh3(newfh, &res->create3res_u.resok.obj);
+    nfs3_map_deviceid_to_statdev(newbuf, deviceid);
+    res->create3res_u.resok.obj_attributes = nfs3_stat_to_post_op_attr(newbuf);
+
+    /* The parent directory: before/after attributes. */
+    nfs3_map_deviceid_to_statdev(preparent, deviceid);
+    nfs3_map_deviceid_to_statdev(postparent, deviceid);
+    res->create3res_u.resok.dir_wcc = nfs3_stat_to_wcc_data(preparent,
+                                                            postparent);
+}
+
+/* Prepare CREATE arguments: zero the structure, then point the parent
+ * directory handle's data_val at fh and the name at the caller's string.
+ * Neither is copied, and data_len is left at zero by the memset.
+ */
+void
+nfs3_prep_create3args(create3args *args, struct nfs3_fh *fh, char *name)
+{
+    memset(args, 0, sizeof(*args));
+    args->where.dir.data.data_val = (void *)fh;
+    args->where.name = name;
+}
+
+/* Prepare SETATTR arguments: zero the structure, then point the object
+ * handle's data_val at fh (no copy; data_len stays zero).
+ */
+void
+nfs3_prep_setattr3args(setattr3args *args, struct nfs3_fh *fh)
+{
+    memset(args, 0, sizeof(*args));
+    args->object.data.data_val = (void *)fh;
+}
+
+/* Fill a SETATTR reply.
+ *
+ * The reply is zeroed and the status recorded; anything other than NFS3_OK
+ * ends the function there. On success the object's wcc data is built from
+ * the pre- and post-operation attributes, after both iatts have had their
+ * device id replaced by deviceid.
+ */
+void
+nfs3_fill_setattr3res(setattr3res *res, nfsstat3 stat, struct iatt *preop,
+                      struct iatt *postop, uint64_t deviceid)
+{
+    memset(res, 0, sizeof(*res));
+    res->status = stat;
+
+    if (stat == NFS3_OK) {
+        nfs3_map_deviceid_to_statdev(preop, deviceid);
+        nfs3_map_deviceid_to_statdev(postop, deviceid);
+        res->setattr3res_u.resok.obj_wcc = nfs3_stat_to_wcc_data(preop, postop);
+    }
+}
+
+/* Prepare MKDIR arguments: zero the structure, then point the parent
+ * directory handle's data_val at dirfh and the name at the caller's string.
+ * Neither is copied, and data_len is left at zero by the memset.
+ */
+void
+nfs3_prep_mkdir3args(mkdir3args *args, struct nfs3_fh *dirfh, char *name)
+{
+    memset(args, 0, sizeof(*args));
+    args->where.dir.data.data_val = (void *)dirfh;
+    args->where.name = name;
+}
+
+/* Fill a MKDIR reply.
+ *
+ * The reply is zeroed and the status recorded; anything other than NFS3_OK
+ * ends the function there. On success the reply refers to fh (no copy),
+ * carries the new directory's attributes and the parent's wcc data.
+ * All three iatts get their device id replaced by deviceid.
+ */
+void
+nfs3_fill_mkdir3res(mkdir3res *res, nfsstat3 stat, struct nfs3_fh *fh,
+                    struct iatt *buf, struct iatt *preparent,
+                    struct iatt *postparent, uint64_t deviceid)
+{
+    post_op_attr newdir_attr;
+
+    memset(res, 0, sizeof(*res));
+    res->status = stat;
+    if (stat != NFS3_OK)
+        return;
+
+    /* The new directory: handle and attributes. */
+    nfs3_fill_post_op_fh3(fh, &res->mkdir3res_u.resok.obj);
+    nfs3_map_deviceid_to_statdev(buf, deviceid);
+    newdir_attr = nfs3_stat_to_post_op_attr(buf);
+
+    /* The parent directory: before/after attributes. */
+    nfs3_map_deviceid_to_statdev(preparent, deviceid);
+    nfs3_map_deviceid_to_statdev(postparent, deviceid);
+    res->mkdir3res_u.resok.dir_wcc = nfs3_stat_to_wcc_data(preparent,
+                                                           postparent);
+    res->mkdir3res_u.resok.obj_attributes = newdir_attr;
+}
+
+void
+nfs3_prep_symlink3args(symlink3args *args, struct nfs3_fh *dirfh, char *name,
+                       char *target)
+{ /* Zero *args, then store dirfh, name and target by pointer (no copies). */
+    memset(args, 0, sizeof(*args));
+    args->where.dir.data.data_val = (void *)dirfh;
+    args->where.name = name;
+    args->symlink.symlink_data = target; /* other symlink fields stay zeroed */
+}
+
+void
+nfs3_fill_symlink3res(symlink3res *res, nfsstat3 stat, struct nfs3_fh *fh,
+                      struct iatt *buf, struct iatt *preparent,
+                      struct iatt *postparent, uint64_t deviceid)
+{
+    wcc_data dirwcc;
+    post_op_attr poa;
+    /* Build a SYMLINK reply in *res; only status is set unless stat is OK. */
+    memset(res, 0, sizeof(*res));
+    res->status = stat;
+    if (stat != NFS3_OK)
+        return;
+    /* OK: resok.obj from fh, obj_attributes from buf, dir_wcc from parents. */
+    nfs3_fill_post_op_fh3(fh, &res->symlink3res_u.resok.obj);
+    nfs3_map_deviceid_to_statdev(buf, deviceid);
+    poa = nfs3_stat_to_post_op_attr(buf);
+    nfs3_map_deviceid_to_statdev(postparent, deviceid);
+    nfs3_map_deviceid_to_statdev(preparent, deviceid);
+    dirwcc = nfs3_stat_to_wcc_data(preparent, postparent);
+    res->symlink3res_u.resok.obj_attributes = poa;
+    res->symlink3res_u.resok.dir_wcc = dirwcc;
+}
+
+void
+nfs3_prep_readlink3args(readlink3args *args, struct nfs3_fh *fh)
+{ /* Zero *args, then aim the symlink handle's data_val at fh. */
+    memset(args, 0, sizeof(*args));
+    args->symlink.data.data_val = (void *)fh; /* pointer only, no copy */
+}
+
+void
+nfs3_fill_readlink3res(readlink3res *res, nfsstat3 stat, char *path,
+                       struct iatt *buf, uint64_t deviceid)
+{
+    post_op_attr poa;
+    /* Build a READLINK reply in *res; resok is filled only for NFS3_OK. */
+    memset(res, 0, sizeof(*res));
+    res->status = stat;
+    /* Any other status leaves the rest of *res zeroed. */
+    if (stat != NFS3_OK)
+        return;
+    /* path is stored by pointer in resok.data, not duplicated. */
+    nfs3_map_deviceid_to_statdev(buf, deviceid);
+    poa = nfs3_stat_to_post_op_attr(buf);
+    res->readlink3res_u.resok.data = (void *)path;
+    res->readlink3res_u.resok.symlink_attributes = poa;
+}
+
+void
+nfs3_prep_mknod3args(mknod3args *args, struct nfs3_fh *fh, char *name)
+{ /* Zero *args, then aim where.dir's data_val at fh and where.name at name. */
+    memset(args, 0, sizeof(*args));
+    args->where.dir.data.data_val = (void *)fh; /* pointer only, no copy */
+    args->where.name = name; /* pointer only, no copy */
+}
+
+void
+nfs3_fill_mknod3res(mknod3res *res, nfsstat3 stat, struct nfs3_fh *fh,
+                    struct iatt *buf, struct iatt *preparent,
+                    struct iatt *postparent, uint64_t deviceid)
+{
+    post_op_attr poa;
+    wcc_data wccdir;
+    /* Build a MKNOD reply in *res; only status is set unless stat is OK. */
+    memset(res, 0, sizeof(*res));
+    res->status = stat;
+    if (stat != NFS3_OK)
+        return;
+    /* OK: resok.obj from fh, obj_attributes from buf, dir_wcc from parents. */
+    nfs3_fill_post_op_fh3(fh, &res->mknod3res_u.resok.obj);
+    nfs3_map_deviceid_to_statdev(buf, deviceid);
+    poa = nfs3_stat_to_post_op_attr(buf);
+    nfs3_map_deviceid_to_statdev(preparent, deviceid);
+    nfs3_map_deviceid_to_statdev(postparent, deviceid);
+    wccdir = nfs3_stat_to_wcc_data(preparent, postparent);
+    res->mknod3res_u.resok.obj_attributes = poa;
+    res->mknod3res_u.resok.dir_wcc = wccdir;
+}
+
+void
+nfs3_fill_remove3res(remove3res *res, nfsstat3 stat, struct iatt *preparent,
+                     struct iatt *postparent, uint64_t deviceid)
+{
+    wcc_data dirwcc;
+    /* Build a REMOVE reply in *res; dir_wcc is filled only for NFS3_OK. */
+    memset(res, 0, sizeof(*res));
+    res->status = stat;
+    if (stat != NFS3_OK)
+        return;
+    /* NOTE(review): the map calls presumably edit the iatts in place. */
+    nfs3_map_deviceid_to_statdev(preparent, deviceid);
+    nfs3_map_deviceid_to_statdev(postparent, deviceid);
+    dirwcc = nfs3_stat_to_wcc_data(preparent, postparent);
+    res->remove3res_u.resok.dir_wcc = dirwcc;
+}
+
+void
+nfs3_prep_remove3args(remove3args *args, struct nfs3_fh *fh, char *name)
+{ /* Zero *args, then aim object.dir's data_val at fh and object.name at name. */
+    memset(args, 0, sizeof(*args));
+    args->object.dir.data.data_val = (void *)fh; /* pointer only, no copy */
+    args->object.name = name; /* pointer only, no copy */
+}
+
+void
+nfs3_prep_rmdir3args(rmdir3args *args, struct nfs3_fh *fh, char *name)
+{ /* Zero *args, then aim object.dir's data_val at fh and object.name at name. */
+    memset(args, 0, sizeof(*args));
+    args->object.dir.data.data_val = (void *)fh; /* pointer only, no copy */
+    args->object.name = name; /* pointer only, no copy */
+}
+
+void
+nfs3_fill_rmdir3res(rmdir3res *res, nfsstat3 stat, struct iatt *preparent,
+                    struct iatt *postparent, uint64_t deviceid)
+{ /* Build an RMDIR reply in *res; dir_wcc is filled only for NFS3_OK. */
+    wcc_data dirwcc;
+    memset(res, 0, sizeof(*res));
+    res->status = stat;
+    /* Any other status leaves the rest of *res zeroed. */
+    if (stat != NFS3_OK)
+        return;
+    /* NOTE(review): the map calls presumably edit the iatts in place. */
+    nfs3_map_deviceid_to_statdev(postparent, deviceid);
+    nfs3_map_deviceid_to_statdev(preparent, deviceid);
+    dirwcc = nfs3_stat_to_wcc_data(preparent, postparent);
+    res->rmdir3res_u.resok.dir_wcc = dirwcc;
+}
+
+void
+nfs3_prep_link3args(link3args *args, struct nfs3_fh *target,
+                    struct nfs3_fh *dirfh, char *name)
+{ /* Zero *args, then store target, dirfh and name by pointer (no copies). */
+    memset(args, 0, sizeof(*args));
+    args->file.data.data_val = (void *)target; /* handle passed as target */
+    args->link.dir.data.data_val = (void *)dirfh; /* handle passed as dirfh */
+    args->link.name = name;
+}
+
+void
+nfs3_fill_link3res(link3res *res, nfsstat3 stat, struct iatt *buf,
+                   struct iatt *preparent, struct iatt *postparent,
+                   uint64_t deviceid)
+{
+    post_op_attr poa;
+    wcc_data dirwcc;
+    /* Build a LINK reply in *res; only status is set unless stat is OK. */
+    memset(res, 0, sizeof(*res));
+    res->status = stat;
+    if (stat != NFS3_OK)
+        return;
+    /* OK: file_attributes from buf, linkdir_wcc from pre/postparent. */
+    nfs3_map_deviceid_to_statdev(preparent, deviceid);
+    nfs3_map_deviceid_to_statdev(postparent, deviceid);
+    nfs3_map_deviceid_to_statdev(buf, deviceid);
+    poa = nfs3_stat_to_post_op_attr(buf);
+    dirwcc = nfs3_stat_to_wcc_data(preparent, postparent);
+    res->link3res_u.resok.file_attributes = poa;
+    res->link3res_u.resok.linkdir_wcc = dirwcc;
+}
+
+void
+nfs3_prep_rename3args(rename3args *args, struct nfs3_fh *olddirfh,
+                      char *oldname, struct nfs3_fh *newdirfh, char *newname)
+{ /* Zero *args, then wire the from/to directory handles and names into it. */
+    memset(args, 0, sizeof(*args));
+    /* All four values are stored by pointer; nothing is copied. */
+    args->from.name = oldname;
+    args->from.dir.data.data_val = (void *)olddirfh;
+    args->to.name = newname;
+    args->to.dir.data.data_val = (void *)newdirfh;
+}
+
+void
+nfs3_fill_rename3res(rename3res *res, nfsstat3 stat, struct iatt *buf,
+                     struct iatt *preoldparent, struct iatt *postoldparent,
+                     struct iatt *prenewparent, struct iatt *postnewparent,
+                     uint64_t deviceid)
+/* Build a RENAME reply in *res; both dir wcc fields are OK-only. */
+{
+    wcc_data dirwcc;
+    /* Any status other than NFS3_OK leaves all but res->status zeroed. */
+    memset(res, 0, sizeof(*res));
+    res->status = stat;
+    if (stat != NFS3_OK)
+        return;
+    /* buf is passed through the mapping below but never put in the reply. */
+    nfs3_map_deviceid_to_statdev(preoldparent, deviceid);
+    nfs3_map_deviceid_to_statdev(postoldparent, deviceid);
+    nfs3_map_deviceid_to_statdev(prenewparent, deviceid);
+    nfs3_map_deviceid_to_statdev(postnewparent, deviceid);
+    nfs3_map_deviceid_to_statdev(buf, deviceid);
+    dirwcc = nfs3_stat_to_wcc_data(preoldparent, postoldparent);
+    res->rename3res_u.resok.fromdir_wcc = dirwcc;
+    dirwcc = nfs3_stat_to_wcc_data(prenewparent, postnewparent); /* reused */
+    res->rename3res_u.resok.todir_wcc = dirwcc;
+}
+
+void
+nfs3_prep_write3args(write3args *args, struct nfs3_fh *fh)
+{ /* Zero *args, then aim the file handle's data_val at fh. */
+    memset(args, 0, sizeof(*args));
+    args->file.data.data_val = (void *)fh; /* pointer only, no copy */
+}
+
+void
+nfs3_fill_write3res(write3res *res, nfsstat3 stat, count3 count,
+                    stable_how stable, uint64_t wverf, struct iatt *prestat,
+                    struct iatt *poststat, uint64_t deviceid)
+{ /* Build a WRITE reply in *res; resok is filled only for NFS3_OK. */
+    write3resok resok; /* NOTE(review): not zeroed; confirm no other fields */
+    memset(res, 0, sizeof(*res));
+    res->status = stat;
+    if (stat != NFS3_OK)
+        return;
+    /* NOTE(review): the map calls presumably edit the iatts in place. */
+    nfs3_map_deviceid_to_statdev(prestat, deviceid);
+    nfs3_map_deviceid_to_statdev(poststat, deviceid);
+    resok.file_wcc = nfs3_stat_to_wcc_data(prestat, poststat);
+    resok.count = count;
+    resok.committed = stable;
+    memcpy(resok.verf, &wverf, sizeof(wverf)); /* raw bytes of wverf */
+    /* Struct copy of the locally assembled resok into the reply union. */
+    res->write3res_u.resok = resok;
+}
+
+void
+nfs3_prep_commit3args(commit3args *args, struct nfs3_fh *fh)
+{ /* Zero *args, then aim the file handle's data_val at fh. */
+    memset(args, 0, sizeof(*args));
+    args->file.data.data_val = (void *)fh; /* pointer only, no copy */
+}
+
+void
+nfs3_fill_commit3res(commit3res *res, nfsstat3 stat, uint64_t wverf,
+                     struct iatt *prestat, struct iatt *poststat,
+                     uint64_t deviceid)
+{ /* Build a COMMIT reply in *res; file_wcc and verf are set only on OK. */
+    memset(res, 0, sizeof(*res));
+    res->status = stat;
+    if (stat != NFS3_OK)
+        return;
+    /* NOTE(review): the map calls presumably edit the iatts in place. */
+    nfs3_map_deviceid_to_statdev(poststat, deviceid);
+    nfs3_map_deviceid_to_statdev(prestat, deviceid);
+    res->commit3res_u.resok.file_wcc = nfs3_stat_to_wcc_data(prestat, poststat);
+    memcpy(res->commit3res_u.resok.verf, &wverf, sizeof(wverf)); /* raw bytes */
+}
+
+void
+nfs3_fill_read3res(read3res *res, nfsstat3 stat, count3 count,
+                   struct iatt *poststat, int is_eof, uint64_t deviceid)
+{
+    post_op_attr poa;
+    /* Build a READ reply header in *res; resok is filled only for NFS3_OK. */
+    memset(res, 0, sizeof(*res));
+    res->status = stat;
+    if (stat != NFS3_OK)
+        return;
+    /* Only data_len is set below; no data buffer is attached here. */
+    nfs3_map_deviceid_to_statdev(poststat, deviceid);
+    poa = nfs3_stat_to_post_op_attr(poststat);
+    res->read3res_u.resok.file_attributes = poa;
+    res->read3res_u.resok.count = count;
+    res->read3res_u.resok.eof = is_eof;
+    res->read3res_u.resok.data.data_len = count; /* same value as count */
+}
+
+void
+nfs3_prep_read3args(read3args *args, struct nfs3_fh *fh)
+{ /* Zero *args, then aim the file handle's data_val at fh. */
+    memset(args, 0, sizeof(*args));
+    args->file.data.data_val = (void *)fh; /* pointer only, no copy */
+}
+
+void
+nfs3_fill_pathconf3res(pathconf3res *res, nfsstat3 stat, struct iatt *buf,
+                       uint64_t deviceid)
+{
+    pathconf3resok resok; /* NOTE(review): not zeroed; confirm no other fields */
+    /* Build a PATHCONF reply in *res; resok is filled only for NFS3_OK. */
+    memset(res, 0, sizeof(*res));
+    res->status = stat;
+    if (stat != NFS3_OK)
+        return;
+    /* Only obj_attributes comes from buf; the limits below are constants. */
+    nfs3_map_deviceid_to_statdev(buf, deviceid);
+    resok.obj_attributes = nfs3_stat_to_post_op_attr(buf);
+    resok.linkmax = 256;
+    resok.name_max = NFS_NAME_MAX;
+    resok.no_trunc = TRUE;
+    resok.chown_restricted = FALSE;
+    resok.case_insensitive = FALSE;
+    resok.case_preserving = TRUE;
+    /* Struct copy of the locally assembled resok into the reply union. */
+    res->pathconf3res_u.resok = resok;
+}
+
+void
+nfs3_prep_pathconf3args(pathconf3args *args, struct nfs3_fh *fh)
+{ /* Zero *args, then aim the object handle's data_val at fh. */
+    memset(args, 0, sizeof(*args));
+    args->object.data.data_val = (void *)fh; /* pointer only, no copy */
+}
+
+int
+nfs3_verify_dircookie(struct nfs3_state *nfs3, fd_t *dirfd, cookie3 cookie,
+                      uint64_t cverf, nfsstat3 *stat)
+{ /* Check a cookie/cookieverf pair; returns 0 to accept, -1 otherwise. */
+    int ret = -1;
+    /* nfs3 and dirfd are only NULL-checked here, never dereferenced. */
+    if ((!nfs3) || (!dirfd))
+        return -1;
+
+    /* A zero cookie is assumed to be the first read on the dir, so the
+     * check succeeds by default; *stat is not written on this path.
+     */
+    if (cookie == 0)
+        return 0;
+
+    gf_msg_trace(GF_NFS3, 0,
+                 "Verifying cookie: cverf: %" PRIu64 ", cookie: %" PRIu64,
+                 cverf, cookie);
+    /* The cookie is bad: cverf cannot be zero with a non-zero cookie. */
+    if ((cverf == 0) && (cookie != 0)) { /* cookie != 0 always holds here */
+        gf_msg_trace(GF_NFS3, 0, "Bad cookie requested");
+        if (stat)
+            *stat = NFS3ERR_BAD_COOKIE;
+        goto err;
+    }
+
+    /* Yes, it's true, our cookie is simply the fd_t address.
+     * NOTE: We used to have the check for cookieverf, but the VMWare client
+     * sends readdirp requests even after we've told it that EOF has been
+     * reached on the directory. This causes a problem because we close a
+     * dir fd_t after reaching EOF. The next readdirp sent by VMWare
+     * contains the address of the closed fd_t as cookieverf. Since we
+     * closed that fd_t, this readdirp results in a new opendir which will
+     * give an fd_t that will fail this check below.
+     */
+    /*        if ((cverf != (uint64_t)dirfd)) {
+                    gf_log (GF_NFS3, GF_LOG_TRACE, "Cookieverf does not match");
+                    if (stat)
+                            *stat = NFS3ERR_BAD_COOKIE;
+                    goto err;
+            }
+    */
+    gf_msg_trace(GF_NFS3, 0, "Cookie verified");
+    if (stat) /* stat is optional; it is written only when non-NULL */
+        *stat = NFS3_OK;
+    ret = 0;
+err:
+    return ret;
+}
+
+void
+nfs3_stat_to_errstr(uint32_t xid, char *op, nfsstat3 stat, int pstat,
+                    char *errstr, size_t len)
+{ /* Format xid, op, NFS status and POSIX errno into errstr (at most len). */
+    if ((!op) || (!errstr))
+        return; /* nothing is written when op or errstr is NULL */
+    /* snprintf bounds the output by len; its return value is not checked. */
+    snprintf(errstr, len, "XID: %x, %s: NFS: %d(%s), POSIX: %d(%s)", xid, op,
+             stat, nfsstat3_strerror(stat), pstat, strerror(pstat));
+}
+
+void
+nfs3_log_common_call(uint32_t xid, char *op, struct nfs3_fh *fh)
+{
+    char fhstr[1024];
+    /* Debug-log an incoming call: xid, op name and the stringified fh. */
+    if (THIS->ctx->log.loglevel < GF_LOG_DEBUG)
+        return; /* skip the handle-to-string work below */
+    /* fhstr is a fixed 1024-byte stack buffer; its size is passed along. */
+    nfs3_fh_to_str(fh, fhstr, sizeof(fhstr));
+    gf_msg_debug(GF_NFS3, 0, "XID: %x, %s: args: %s", xid, op, fhstr);
+}
+
+void
+nfs3_log_fh_entry_call(uint32_t xid, char *op, struct nfs3_fh *fh, char *name)
+{
+    char fhstr[1024];
+    /* Debug-log a call that takes a handle plus an entry name. */
+    if (THIS->ctx->log.loglevel < GF_LOG_DEBUG)
+        return; /* skip the handle-to-string work below */
+    nfs3_fh_to_str(fh, fhstr, sizeof(fhstr));
+    gf_msg_debug(GF_NFS3, 0, "XID: %x, %s: args: %s, name: %s", xid, op, fhstr,
+                 name);
+}
+
+void
+nfs3_log_rename_call(uint32_t xid, struct nfs3_fh *src, char *sname,
+                     struct nfs3_fh *dst, char *dname)
+{
+    char sfhstr[1024];
+    char dfhstr[1024];
+    /* Debug-log a RENAME call: both handles (stringified) and both names. */
+    if (THIS->ctx->log.loglevel < GF_LOG_DEBUG)
+        return; /* skip the handle-to-string work below */
+    nfs3_fh_to_str(src, sfhstr, sizeof(sfhstr));
+    nfs3_fh_to_str(dst, dfhstr, sizeof(dfhstr));
+    gf_msg_debug(GF_NFS3, 0,
+                 "XID: %x, RENAME: args: Src: %s, "
+                 "name: %s, Dst: %s, name: %s",
+                 xid, sfhstr, sname, dfhstr, dname);
+}
+
+void
+nfs3_log_create_call(uint32_t xid, struct nfs3_fh *fh, char *name,
+                     createmode3 mode)
+{
+    char fhstr[1024];
+    char *modestr = NULL;
+    char exclmode[] = "EXCLUSIVE";
+    char unchkd[] = "UNCHECKED";
+    char guarded[] = "GUARDED";
+    /* Debug-log a CREATE call: handle, entry name and create mode. */
+    if (THIS->ctx->log.loglevel < GF_LOG_DEBUG)
+        return; /* skip the handle-to-string work below */
+    nfs3_fh_to_str(fh, fhstr, sizeof(fhstr));
+    if (mode == EXCLUSIVE)
+        modestr = exclmode;
+    else if (mode == GUARDED)
+        modestr = guarded;
+    else /* every other mode value is reported as UNCHECKED */
+        modestr = unchkd;
+
+    gf_msg_debug(GF_NFS3, 0,
+                 "XID: %x, CREATE: args: %s, name: %s,"
+                 " mode: %s",
+                 xid, fhstr, name, modestr);
+}
+
+void
+nfs3_log_mknod_call(uint32_t xid, struct nfs3_fh *fh, char *name, int type)
+{
+    char fhstr[1024];
+    char *modestr = NULL;
+    char chr[] = "CHAR";
+    char blk[] = "BLK";
+    char sock[] = "SOCK";
+    char fifo[] = "FIFO";
+    /* Debug-log a MKNOD call: handle, entry name and node type. */
+    if (THIS->ctx->log.loglevel < GF_LOG_DEBUG)
+        return; /* skip the handle-to-string work below */
+    nfs3_fh_to_str(fh, fhstr, sizeof(fhstr));
+    if (type == NF3CHR)
+        modestr = chr;
+    else if (type == NF3BLK)
+        modestr = blk;
+    else if (type == NF3SOCK)
+        modestr = sock;
+    else /* every other type value is reported as FIFO */
+        modestr = fifo;
+
+    gf_msg_debug(GF_NFS3, 0,
+                 "XID: %x, MKNOD: args: %s, name: %s,"
+                 " type: %s",
+                 xid, fhstr, name, modestr);
+}
+
+void
+nfs3_log_symlink_call(uint32_t xid, struct nfs3_fh *fh, char *name, char *tgt)
+{
+    char fhstr[1024];
+    /* Debug-log a SYMLINK call: handle, entry name and target string. */
+    if (THIS->ctx->log.loglevel < GF_LOG_DEBUG)
+        return; /* skip the handle-to-string work below */
+    nfs3_fh_to_str(fh, fhstr, sizeof(fhstr));
+    gf_msg_debug(GF_NFS3, 0,
+                 "XID: %x, SYMLINK: args: %s, name: %s,"
+                 " target: %s",
+                 xid, fhstr, name, tgt);
+}
+
+void
+nfs3_log_link_call(uint32_t xid, struct nfs3_fh *fh, char *name,
+                   struct nfs3_fh *tgt)
+{
+    char dfhstr[1024];
+    char tfhstr[1024];
+    /* Debug-log a LINK call: fh and tgt are both stringified handles. */
+    if (THIS->ctx->log.loglevel < GF_LOG_DEBUG)
+        return; /* skip the handle-to-string work below */
+    nfs3_fh_to_str(fh, dfhstr, sizeof(dfhstr));
+    nfs3_fh_to_str(tgt, tfhstr, sizeof(tfhstr));
+    gf_msg_debug(GF_NFS3, 0,
+                 "XID: %x, LINK: args: %s, name: %s,"
+                 " target: %s",
+                 xid, dfhstr, name, tfhstr);
+}
+
+void
+nfs3_log_rw_call(uint32_t xid, char *op, struct nfs3_fh *fh, offset3 offt,
+                 count3 count, int stablewrite)
+{
+    char fhstr[1024];
+    /* Debug-log a read/write style call: handle, offset, count, stability. */
+    if (THIS->ctx->log.loglevel < GF_LOG_DEBUG)
+        return; /* skip the handle-to-string work below */
+    nfs3_fh_to_str(fh, fhstr, sizeof(fhstr));
+    if (stablewrite == -1) /* -1: log without any stability suffix */
+        gf_msg_debug(GF_NFS3, 0,
+                     "XID: %x, %s: args: %s, offset:"
+                     " %" PRIu64 ",  count: %" PRIu32,
+                     xid, op, fhstr, offt, count);
+    else /* any value other than UNSTABLE is printed as "STABLE" */
+        gf_msg_debug(GF_NFS3, 0,
+                     "XID: %x, %s: args: %s, offset:"
+                     " %" PRIu64 ",  count: %" PRIu32 ", %s",
+                     xid, op, fhstr, offt, count,
+                     (stablewrite == UNSTABLE) ? "UNSTABLE" : "STABLE");
+}
+
+int
+nfs3_getattr_loglevel(nfsstat3 stat)
+{ /* Map a GETATTR reply status to the level it should be logged at. */
+    int ll = GF_LOG_DEBUG;
+    /* Every status with a case below yields WARNING; all others DEBUG. */
+    switch (stat) {
+        case NFS3ERR_PERM:
+            ll = GF_LOG_WARNING;
+            break;
+
+        case NFS3ERR_NOENT:
+            ll = GF_LOG_WARNING;
+            break;
+
+        case NFS3ERR_ACCES:
+            ll = GF_LOG_WARNING;
+            break;
+
+        case NFS3ERR_EXIST:
+            ll = GF_LOG_WARNING;
+            break;
+
+        case NFS3ERR_XDEV:
+            ll = GF_LOG_WARNING;
+            break;
+
+        case NFS3ERR_NODEV:
+            ll = GF_LOG_WARNING;
+            break;
+
+        case NFS3ERR_IO:
+            ll = GF_LOG_WARNING;
+            break;
+
+        case NFS3ERR_NXIO:
+            ll = GF_LOG_WARNING;
+            break;
+
+        case NFS3ERR_NOTDIR:
+            ll = GF_LOG_WARNING;
+            break;
+
+        case NFS3ERR_ISDIR:
+            ll = GF_LOG_WARNING;
+            break;
+
+        case NFS3ERR_INVAL:
+            ll = GF_LOG_WARNING;
+            break;
+
+        case NFS3ERR_NOSPC:
+            ll = GF_LOG_WARNING;
+            break;
+
+        case NFS3ERR_ROFS:
+            ll = GF_LOG_WARNING;
+            break;
+
+        case NFS3ERR_FBIG:
+            ll = GF_LOG_WARNING;
+            break;
+
+        case NFS3ERR_MLINK:
+            ll = GF_LOG_WARNING;
+            break;
+
+        case NFS3ERR_NAMETOOLONG:
+            ll = GF_LOG_WARNING;
+            break;
+
+        case NFS3ERR_NOTEMPTY:
+            ll = GF_LOG_WARNING;
+            break;
+
+        case NFS3ERR_SERVERFAULT:
+            ll = GF_LOG_WARNING;
+            break;
+
+        case NFS3ERR_NOTSUPP:
+            ll = GF_LOG_WARNING;
+            break;
+
+        case NFS3ERR_BADHANDLE:
+            ll = GF_LOG_WARNING;
+            break;
+
+        case NFS3ERR_STALE:
+            ll = GF_LOG_WARNING;
+            break;
+
+        case NFS3ERR_DQUOT:
+            ll = GF_LOG_WARNING;
+            break;
+
+        default: /* includes NFS3_OK and any status not listed above */
+            ll = GF_LOG_DEBUG;
+            break;
+    }
+
+    return ll;
+}
+
+int
+nfs3_setattr_loglevel(nfsstat3 stat)
+{ /* Map a SETATTR reply status to the level it should be logged at. */
+    int ll = GF_LOG_DEBUG;
+    /* Listed statuses yield WARNING; PERM and ACCES have no case here. */
+    switch (stat) {
+        case NFS3ERR_NOENT:
+            ll = GF_LOG_WARNING;
+            break;
+
+        case NFS3ERR_EXIST:
+            ll = GF_LOG_WARNING;
+            break;
+
+        case NFS3ERR_XDEV:
+            ll = GF_LOG_WARNING;
+            break;
+
+        case NFS3ERR_NODEV:
+            ll = GF_LOG_WARNING;
+            break;
+
+        case NFS3ERR_IO:
+            ll = GF_LOG_WARNING;
+            break;
+
+        case NFS3ERR_NXIO:
+            ll = GF_LOG_WARNING;
+            break;
+
+        case NFS3ERR_NOTDIR:
+            ll = GF_LOG_WARNING;
+            break;
+
+        case NFS3ERR_ISDIR:
+            ll = GF_LOG_WARNING;
+            break;
+
+        case NFS3ERR_INVAL:
+            ll = GF_LOG_WARNING;
+            break;
+
+        case NFS3ERR_NOSPC:
+            ll = GF_LOG_WARNING;
+            break;
+
+        case NFS3ERR_ROFS:
+            ll = GF_LOG_WARNING;
+            break;
+
+        case NFS3ERR_FBIG:
+            ll = GF_LOG_WARNING;
+            break;
+
+        case NFS3ERR_MLINK:
+            ll = GF_LOG_WARNING;
+            break;
+
+        case NFS3ERR_NAMETOOLONG:
+            ll = GF_LOG_WARNING;
+            break;
+
+        case NFS3ERR_NOTEMPTY:
+            ll = GF_LOG_WARNING;
+            break;
+
+        case NFS3ERR_SERVERFAULT:
+            ll = GF_LOG_WARNING;
+            break;
+
+        case NFS3ERR_NOTSUPP:
+            ll = GF_LOG_WARNING;
+            break;
+
+        case NFS3ERR_BADHANDLE:
+            ll = GF_LOG_WARNING;
+            break;
+
+        case NFS3ERR_STALE:
+            ll = GF_LOG_WARNING;
+            break;
+
+        case NFS3ERR_DQUOT:
+            ll = GF_LOG_WARNING;
+            break;
+
+        default: /* includes NFS3_OK and any status not listed above */
+            ll = GF_LOG_DEBUG;
+            break;
+    }
+
+    return ll;
+}
+
+int
+nfs3_lookup_loglevel(nfsstat3 stat)
+{ /* Map a LOOKUP reply status to the level it should be logged at. */
+    int ll = GF_LOG_DEBUG;
+    /* Listed statuses yield WARNING; NFS3ERR_NOENT has no case here. */
+    switch (stat) {
+        case NFS3ERR_PERM:
+            ll = GF_LOG_WARNING;
+            break;
+
+        case NFS3ERR_ACCES:
+            ll = GF_LOG_WARNING;
+            break;
+
+        case NFS3ERR_EXIST:
+            ll = GF_LOG_WARNING;
+            break;
+
+        case NFS3ERR_XDEV:
+            ll = GF_LOG_WARNING;
+            break;
+
+        case NFS3ERR_NODEV:
+            ll = GF_LOG_WARNING;
+            break;
+
+        case NFS3ERR_IO:
+            ll = GF_LOG_WARNING;
+            break;
+
+        case NFS3ERR_NXIO:
+            ll = GF_LOG_WARNING;
+            break;
+
+        case NFS3ERR_NOTDIR:
+            ll = GF_LOG_WARNING;
+            break;
+
+        case NFS3ERR_ISDIR:
+            ll = GF_LOG_WARNING;
+            break;
+
+        case NFS3ERR_INVAL:
+            ll = GF_LOG_WARNING;
+            break;
+
+        case NFS3ERR_NOSPC:
+            ll = GF_LOG_WARNING;
+            break;
+
+        case NFS3ERR_ROFS:
+            ll = GF_LOG_WARNING;
+            break;
+
+        case NFS3ERR_FBIG:
+            ll = GF_LOG_WARNING;
+            break;
+
+        case NFS3ERR_MLINK:
+            ll = GF_LOG_WARNING;
+            break;
+
+        case NFS3ERR_NAMETOOLONG:
+            ll = GF_LOG_WARNING;
+            break;
+
+        case NFS3ERR_NOTEMPTY:
+            ll = GF_LOG_WARNING;
+            break;
+
+        case NFS3ERR_SERVERFAULT:
+            ll = GF_LOG_WARNING;
+            break;
+
+        case NFS3ERR_NOTSUPP:
+            ll = GF_LOG_WARNING;
+            break;
+
+        case NFS3ERR_BADHANDLE:
+            ll = GF_LOG_WARNING;
+            break;
+
+        case NFS3ERR_STALE:
+            ll = GF_LOG_WARNING;
+            break;
+
+        case NFS3ERR_DQUOT:
+            ll = GF_LOG_WARNING;
+            break;
+
+        default: /* includes NFS3_OK and any status not listed above */
+            ll = GF_LOG_DEBUG;
+            break;
+    }
+
+    return ll;
+}
+
+int
+nfs3_access_loglevel(nfsstat3 stat)
+{ /* Map an ACCESS reply status to the level it should be logged at. */
+    int ll = GF_LOG_DEBUG;
+    /* Listed statuses yield WARNING; PERM and ACCES have no case here. */
+    switch (stat) {
+        case NFS3ERR_NOENT:
+            ll = GF_LOG_WARNING;
+            break;
+
+        case NFS3ERR_EXIST:
+            ll = GF_LOG_WARNING;
+            break;
+
+        case NFS3ERR_XDEV:
+            ll = GF_LOG_WARNING;
+            break;
+
+        case NFS3ERR_NODEV:
+            ll = GF_LOG_WARNING;
+            break;
+
+        case NFS3ERR_IO:
+            ll = GF_LOG_WARNING;
+            break;
+
+        case NFS3ERR_NXIO:
+            ll = GF_LOG_WARNING;
+            break;
+
+        case NFS3ERR_NOTDIR:
+            ll = GF_LOG_WARNING;
+            break;
+
+        case NFS3ERR_ISDIR:
+            ll = GF_LOG_WARNING;
+            break;
+
+        case NFS3ERR_INVAL:
+            ll = GF_LOG_WARNING;
+            break;
+
+        case NFS3ERR_NOSPC:
+            ll = GF_LOG_WARNING;
+            break;
+
+        case NFS3ERR_ROFS:
+            ll = GF_LOG_WARNING;
+            break;
+
+        case NFS3ERR_FBIG:
+            ll = GF_LOG_WARNING;
+            break;
+
+        case NFS3ERR_MLINK:
+            ll = GF_LOG_WARNING;
+            break;
+
+        case NFS3ERR_NAMETOOLONG:
+            ll = GF_LOG_WARNING;
+            break;
+
+        case NFS3ERR_NOTEMPTY:
+            ll = GF_LOG_WARNING;
+            break;
+
+        case NFS3ERR_SERVERFAULT:
+            ll = GF_LOG_WARNING;
+            break;
+
+        case NFS3ERR_NOTSUPP:
+            ll = GF_LOG_WARNING;
+            break;
+
+        case NFS3ERR_BADHANDLE:
+            ll = GF_LOG_WARNING;
+            break;
+
+        case NFS3ERR_STALE:
+            ll = GF_LOG_WARNING;
+            break;
+
+        case NFS3ERR_DQUOT:
+            ll = GF_LOG_WARNING;
+            break;
+
+        default: /* includes NFS3_OK and any status not listed above */
+            ll = GF_LOG_DEBUG;
+            break;
+    }
+
+    return ll;
+}
+
+int
+nfs3_readlink_loglevel(nfsstat3 stat)
+{ /* Map a READLINK reply status to the level it should be logged at. */
+    int ll = GF_LOG_DEBUG;
+    /* No case for PERM, NOENT, ACCES or NAMETOOLONG: those stay DEBUG. */
+    switch (stat) {
+        case NFS3ERR_EXIST:
+            ll = GF_LOG_WARNING;
+            break;
+
+        case NFS3ERR_XDEV:
+            ll = GF_LOG_WARNING;
+            break;
+
+        case NFS3ERR_NODEV:
+            ll = GF_LOG_WARNING;
+            break;
+
+        case NFS3ERR_IO:
+            ll = GF_LOG_WARNING;
+            break;
+
+        case NFS3ERR_NXIO:
+            ll = GF_LOG_WARNING;
+            break;
+
+        case NFS3ERR_NOTDIR:
+            ll = GF_LOG_WARNING;
+            break;
+
+        case NFS3ERR_ISDIR:
+            ll = GF_LOG_WARNING;
+            break;
+
+        case NFS3ERR_INVAL:
+            ll = GF_LOG_WARNING;
+            break;
+
+        case NFS3ERR_NOSPC:
+            ll = GF_LOG_WARNING;
+            break;
+
+        case NFS3ERR_ROFS:
+            ll = GF_LOG_WARNING;
+            break;
+
+        case NFS3ERR_FBIG:
+            ll = GF_LOG_WARNING;
+            break;
+
+        case NFS3ERR_MLINK:
+            ll = GF_LOG_WARNING;
+            break;
+
+        case NFS3ERR_NOTEMPTY:
+            ll = GF_LOG_WARNING;
+            break;
+
+        case NFS3ERR_SERVERFAULT:
+            ll = GF_LOG_WARNING;
+            break;
+
+        case NFS3ERR_NOTSUPP:
+            ll = GF_LOG_WARNING;
+            break;
+
+        case NFS3ERR_BADHANDLE:
+            ll = GF_LOG_WARNING;
+            break;
+
+        case NFS3ERR_STALE:
+            ll = GF_LOG_WARNING;
+            break;
+
+        case NFS3ERR_DQUOT:
+            ll = GF_LOG_WARNING;
+            break;
+
+        default: /* includes NFS3_OK and any status not listed above */
+            ll = GF_LOG_DEBUG;
+            break;
+    }
+
+    return ll;
+}
+
+int
+nfs3_read_loglevel(nfsstat3 stat)
+{ /* Map a READ reply status to the level it should be logged at. */
+    int ll = GF_LOG_DEBUG;
+    /* Listed statuses yield WARNING; PERM and ACCES have no case here. */
+    switch (stat) {
+        case NFS3ERR_NOENT:
+            ll = GF_LOG_WARNING;
+            break;
+
+        case NFS3ERR_EXIST:
+            ll = GF_LOG_WARNING;
+            break;
+
+        case NFS3ERR_XDEV:
+            ll = GF_LOG_WARNING;
+            break;
+
+        case NFS3ERR_NODEV:
+            ll = GF_LOG_WARNING;
+            break;
+
+        case NFS3ERR_IO:
+            ll = GF_LOG_WARNING;
+            break;
+
+        case NFS3ERR_NXIO:
+            ll = GF_LOG_WARNING;
+            break;
+
+        case NFS3ERR_NOTDIR:
+            ll = GF_LOG_WARNING;
+            break;
+
+        case NFS3ERR_ISDIR:
+            ll = GF_LOG_WARNING;
+            break;
+
+        case NFS3ERR_INVAL:
+            ll = GF_LOG_WARNING;
+            break;
+
+        case NFS3ERR_NOSPC:
+            ll = GF_LOG_WARNING;
+            break;
+
+        case NFS3ERR_ROFS:
+            ll = GF_LOG_WARNING;
+            break;
+
+        case NFS3ERR_FBIG:
+            ll = GF_LOG_WARNING;
+            break;
+
+        case NFS3ERR_MLINK:
+            ll = GF_LOG_WARNING;
+            break;
+
+        case NFS3ERR_NAMETOOLONG:
+            ll = GF_LOG_WARNING;
+            break;
+
+        case NFS3ERR_NOTEMPTY:
+            ll = GF_LOG_WARNING;
+            break;
+
+        case NFS3ERR_SERVERFAULT:
+            ll = GF_LOG_WARNING;
+            break;
+
+        case NFS3ERR_NOTSUPP:
+            ll = GF_LOG_WARNING;
+            break;
+
+        case NFS3ERR_BADHANDLE:
+            ll = GF_LOG_WARNING;
+            break;
+
+        case NFS3ERR_STALE:
+            ll = GF_LOG_WARNING;
+            break;
+
+        case NFS3ERR_DQUOT:
+            ll = GF_LOG_WARNING;
+            break;
+
+        default: /* includes NFS3_OK and any status not listed above */
+            ll = GF_LOG_DEBUG;
+            break;
+    }
+
+    return ll;
+}
+
+int
+nfs3_write_loglevel(nfsstat3 stat)
+{ /* Map a WRITE reply status to the level it should be logged at. */
+    int ll = GF_LOG_DEBUG;
+    /* Listed statuses yield WARNING; PERM and ACCES have no case here. */
+    switch (stat) {
+        case NFS3ERR_NOENT:
+            ll = GF_LOG_WARNING;
+            break;
+
+        case NFS3ERR_EXIST:
+            ll = GF_LOG_WARNING;
+            break;
+
+        case NFS3ERR_XDEV:
+            ll = GF_LOG_WARNING;
+            break;
+
+        case NFS3ERR_NODEV:
+            ll = GF_LOG_WARNING;
+            break;
+
+        case NFS3ERR_IO:
+            ll = GF_LOG_WARNING;
+            break;
+
+        case NFS3ERR_NXIO:
+            ll = GF_LOG_WARNING;
+            break;
+
+        case NFS3ERR_NOTDIR:
+            ll = GF_LOG_WARNING;
+            break;
+
+        case NFS3ERR_ISDIR:
+            ll = GF_LOG_WARNING;
+            break;
+
+        case NFS3ERR_INVAL:
+            ll = GF_LOG_WARNING;
+            break;
+
+        case NFS3ERR_NOSPC:
+            ll = GF_LOG_WARNING;
+            break;
+
+        case NFS3ERR_ROFS:
+            ll = GF_LOG_WARNING;
+            break;
+
+        case NFS3ERR_FBIG:
+            ll = GF_LOG_WARNING;
+            break;
+
+        case NFS3ERR_MLINK:
+            ll = GF_LOG_WARNING;
+            break;
+
+        case NFS3ERR_NAMETOOLONG:
+            ll = GF_LOG_WARNING;
+            break;
+
+        case NFS3ERR_NOTEMPTY:
+            ll = GF_LOG_WARNING;
+            break;
+
+        case NFS3ERR_SERVERFAULT:
+            ll = GF_LOG_WARNING;
+            break;
+
+        case NFS3ERR_NOTSUPP:
+            ll = GF_LOG_WARNING;
+            break;
+
+        case NFS3ERR_BADHANDLE:
+            ll = GF_LOG_WARNING;
+            break;
+
+        case NFS3ERR_STALE:
+            ll = GF_LOG_WARNING;
+            break;
+
+        case NFS3ERR_DQUOT:
+            ll = GF_LOG_WARNING;
+            break;
+
+        default: /* includes NFS3_OK and any status not listed above */
+            ll = GF_LOG_DEBUG;
+            break;
+    }
+
+    return ll;
+}
+
+int
+nfs3_create_loglevel(nfsstat3 stat)
+{ /* Map a CREATE reply status to the level it should be logged at. */
+    int ll = GF_LOG_DEBUG;
+    /* No case for PERM, ACCES, NOSPC, ROFS, NAMETOOLONG or DQUOT here. */
+    switch (stat) {
+        case NFS3ERR_NOENT:
+            ll = GF_LOG_WARNING;
+            break;
+
+        case NFS3ERR_EXIST:
+            ll = GF_LOG_WARNING;
+            break;
+
+        case NFS3ERR_XDEV:
+            ll = GF_LOG_WARNING;
+            break;
+
+        case NFS3ERR_NODEV:
+            ll = GF_LOG_WARNING;
+            break;
+
+        case NFS3ERR_IO:
+            ll = GF_LOG_WARNING;
+            break;
+
+        case NFS3ERR_NXIO:
+            ll = GF_LOG_WARNING;
+            break;
+
+        case NFS3ERR_NOTDIR:
+            ll = GF_LOG_WARNING;
+            break;
+
+        case NFS3ERR_ISDIR:
+            ll = GF_LOG_WARNING;
+            break;
+
+        case NFS3ERR_INVAL:
+            ll = GF_LOG_WARNING;
+            break;
+
+        case NFS3ERR_FBIG:
+            ll = GF_LOG_WARNING;
+            break;
+
+        case NFS3ERR_MLINK:
+            ll = GF_LOG_WARNING;
+            break;
+
+        case NFS3ERR_NOTEMPTY:
+            ll = GF_LOG_WARNING;
+            break;
+
+        case NFS3ERR_SERVERFAULT:
+            ll = GF_LOG_WARNING;
+            break;
+
+        case NFS3ERR_NOTSUPP:
+            ll = GF_LOG_WARNING;
+            break;
+
+        case NFS3ERR_BADHANDLE:
+            ll = GF_LOG_WARNING;
+            break;
+
+        case NFS3ERR_STALE:
+            ll = GF_LOG_WARNING;
+            break;
+
+        default: /* includes NFS3_OK and any status not listed above */
+            ll = GF_LOG_DEBUG;
+            break;
+    }
+
+    return ll;
+}
+
+int
+nfs3_mkdir_loglevel(nfsstat3 stat)
+{ /* Map a MKDIR reply status to the level it should be logged at. */
+    int ll = GF_LOG_DEBUG;
+    /* No case for PERM, ACCES, EXIST, NOSPC, ROFS or NAMETOOLONG here. */
+    switch (stat) {
+        case NFS3ERR_NOENT:
+            ll = GF_LOG_WARNING;
+            break;
+
+        case NFS3ERR_XDEV:
+            ll = GF_LOG_WARNING;
+            break;
+
+        case NFS3ERR_NODEV:
+            ll = GF_LOG_WARNING;
+            break;
+
+        case NFS3ERR_IO:
+            ll = GF_LOG_WARNING;
+            break;
+
+        case NFS3ERR_NXIO:
+            ll = GF_LOG_WARNING;
+            break;
+
+        case NFS3ERR_NOTDIR:
+            ll = GF_LOG_WARNING;
+            break;
+
+        case NFS3ERR_ISDIR:
+            ll = GF_LOG_WARNING;
+            break;
+
+        case NFS3ERR_INVAL:
+            ll = GF_LOG_WARNING;
+            break;
+
+        case NFS3ERR_FBIG:
+            ll = GF_LOG_WARNING;
+            break;
+
+        case NFS3ERR_MLINK:
+            ll = GF_LOG_WARNING;
+            break;
+
+        case NFS3ERR_NOTEMPTY:
+            ll = GF_LOG_WARNING;
+            break;
+
+        case NFS3ERR_SERVERFAULT:
+            ll = GF_LOG_WARNING;
+            break;
+
+        case NFS3ERR_NOTSUPP:
+            ll = GF_LOG_WARNING;
+            break;
+
+        case NFS3ERR_BADHANDLE:
+            ll = GF_LOG_WARNING;
+            break;
+
+        case NFS3ERR_STALE:
+            ll = GF_LOG_WARNING;
+            break;
+
+        case NFS3ERR_DQUOT:
+            ll = GF_LOG_WARNING;
+            break;
+
+        default: /* includes NFS3_OK and any status not listed above */
+            ll = GF_LOG_DEBUG;
+            break;
+    }
+
+    return ll;
+}
+
+/* Choose the log level for an NFSv3 SYMLINK result.
+ *
+ * The statuses listed in the switch are reported at GF_LOG_WARNING; any
+ * other nfsstat3 value is reported at GF_LOG_DEBUG.
+ */
+int
+nfs3_symlink_loglevel(nfsstat3 stat)
+{
+    switch (stat) {
+        case NFS3ERR_XDEV:
+        case NFS3ERR_NODEV:
+        case NFS3ERR_IO:
+        case NFS3ERR_NXIO:
+        case NFS3ERR_NOTDIR:
+        case NFS3ERR_ISDIR:
+        case NFS3ERR_INVAL:
+        case NFS3ERR_FBIG:
+        case NFS3ERR_MLINK:
+        case NFS3ERR_NOTEMPTY:
+        case NFS3ERR_SERVERFAULT:
+        case NFS3ERR_NOTSUPP:
+        case NFS3ERR_BADHANDLE:
+        case NFS3ERR_STALE:
+        case NFS3ERR_DQUOT:
+            return GF_LOG_WARNING;
+
+        default:
+            return GF_LOG_DEBUG;
+    }
+}
+
+/* Choose the log level for an NFSv3 MKNOD result.
+ *
+ * The statuses listed in the switch are reported at GF_LOG_WARNING; any
+ * other nfsstat3 value is reported at GF_LOG_DEBUG.
+ */
+int
+nfs3_mknod_loglevel(nfsstat3 stat)
+{
+    switch (stat) {
+        case NFS3ERR_NOENT:
+        case NFS3ERR_XDEV:
+        case NFS3ERR_NODEV:
+        case NFS3ERR_IO:
+        case NFS3ERR_NXIO:
+        case NFS3ERR_NOTDIR:
+        case NFS3ERR_ISDIR:
+        case NFS3ERR_INVAL:
+        case NFS3ERR_FBIG:
+        case NFS3ERR_MLINK:
+        case NFS3ERR_NOTEMPTY:
+        case NFS3ERR_SERVERFAULT:
+        case NFS3ERR_NOTSUPP:
+        case NFS3ERR_BADHANDLE:
+        case NFS3ERR_STALE:
+            return GF_LOG_WARNING;
+
+        default:
+            return GF_LOG_DEBUG;
+    }
+}
+
+/* Choose the log level for an NFSv3 REMOVE result.
+ *
+ * The statuses listed in the switch are reported at GF_LOG_WARNING; any
+ * other nfsstat3 value is reported at GF_LOG_DEBUG.
+ */
+int
+nfs3_remove_loglevel(nfsstat3 stat)
+{
+    switch (stat) {
+        case NFS3ERR_EXIST:
+        case NFS3ERR_XDEV:
+        case NFS3ERR_NODEV:
+        case NFS3ERR_IO:
+        case NFS3ERR_NXIO:
+        case NFS3ERR_NOTDIR:
+        case NFS3ERR_INVAL:
+        case NFS3ERR_NOSPC:
+        case NFS3ERR_FBIG:
+        case NFS3ERR_MLINK:
+        case NFS3ERR_SERVERFAULT:
+        case NFS3ERR_NOTSUPP:
+        case NFS3ERR_BADHANDLE:
+        case NFS3ERR_STALE:
+        case NFS3ERR_DQUOT:
+            return GF_LOG_WARNING;
+
+        default:
+            return GF_LOG_DEBUG;
+    }
+}
+
+/* Choose the log level for an NFSv3 RMDIR result.
+ *
+ * The statuses listed in the switch are reported at GF_LOG_WARNING; any
+ * other nfsstat3 value is reported at GF_LOG_DEBUG.
+ */
+int
+nfs3_rmdir_loglevel(nfsstat3 stat)
+{
+    switch (stat) {
+        case NFS3ERR_EXIST:
+        case NFS3ERR_XDEV:
+        case NFS3ERR_NODEV:
+        case NFS3ERR_IO:
+        case NFS3ERR_NXIO:
+        case NFS3ERR_NOTDIR:
+        case NFS3ERR_INVAL:
+        case NFS3ERR_NOSPC:
+        case NFS3ERR_FBIG:
+        case NFS3ERR_MLINK:
+        case NFS3ERR_SERVERFAULT:
+        case NFS3ERR_NOTSUPP:
+        case NFS3ERR_BADHANDLE:
+        case NFS3ERR_STALE:
+        case NFS3ERR_DQUOT:
+            return GF_LOG_WARNING;
+
+        default:
+            return GF_LOG_DEBUG;
+    }
+}
+
+/* Choose the log level for an NFSv3 RENAME result.
+ *
+ * The statuses listed in the switch are reported at GF_LOG_WARNING; any
+ * other nfsstat3 value is reported at GF_LOG_DEBUG.
+ */
+int
+nfs3_rename_loglevel(nfsstat3 stat)
+{
+    switch (stat) {
+        case NFS3ERR_XDEV:
+        case NFS3ERR_NODEV:
+        case NFS3ERR_IO:
+        case NFS3ERR_NXIO:
+        case NFS3ERR_NOTDIR:
+        case NFS3ERR_ISDIR:
+        case NFS3ERR_INVAL:
+        case NFS3ERR_NOSPC:
+        case NFS3ERR_FBIG:
+        case NFS3ERR_MLINK:
+        case NFS3ERR_NOTEMPTY:
+        case NFS3ERR_SERVERFAULT:
+        case NFS3ERR_NOTSUPP:
+        case NFS3ERR_BADHANDLE:
+        case NFS3ERR_STALE:
+        case NFS3ERR_DQUOT:
+            return GF_LOG_WARNING;
+
+        default:
+            return GF_LOG_DEBUG;
+    }
+}
+
+/* Choose the log level for an NFSv3 LINK result.
+ *
+ * The statuses listed in the switch are reported at GF_LOG_WARNING; any
+ * other nfsstat3 value is reported at GF_LOG_DEBUG.
+ */
+int
+nfs3_link_loglevel(nfsstat3 stat)
+{
+    switch (stat) {
+        case NFS3ERR_XDEV:
+        case NFS3ERR_NODEV:
+        case NFS3ERR_IO:
+        case NFS3ERR_NXIO:
+        case NFS3ERR_INVAL:
+        case NFS3ERR_FBIG:
+        case NFS3ERR_MLINK:
+        case NFS3ERR_NOTEMPTY:
+        case NFS3ERR_SERVERFAULT:
+        case NFS3ERR_NOTSUPP:
+        case NFS3ERR_BADHANDLE:
+        case NFS3ERR_STALE:
+        case NFS3ERR_DQUOT:
+            return GF_LOG_WARNING;
+
+        default:
+            return GF_LOG_DEBUG;
+    }
+}
+
+/* Choose the log level for an NFSv3 READDIR result.
+ *
+ * The statuses listed in the switch are reported at GF_LOG_WARNING; any
+ * other nfsstat3 value is reported at GF_LOG_DEBUG.
+ */
+int
+nfs3_readdir_loglevel(nfsstat3 stat)
+{
+    switch (stat) {
+        case NFS3ERR_NOENT:
+        case NFS3ERR_EXIST:
+        case NFS3ERR_XDEV:
+        case NFS3ERR_NODEV:
+        case NFS3ERR_IO:
+        case NFS3ERR_NXIO:
+        case NFS3ERR_NOTDIR:
+        case NFS3ERR_ISDIR:
+        case NFS3ERR_INVAL:
+        case NFS3ERR_NOSPC:
+        case NFS3ERR_ROFS:
+        case NFS3ERR_FBIG:
+        case NFS3ERR_MLINK:
+        case NFS3ERR_NAMETOOLONG:
+        case NFS3ERR_NOTEMPTY:
+        case NFS3ERR_SERVERFAULT:
+        case NFS3ERR_NOTSUPP:
+        case NFS3ERR_BADHANDLE:
+        case NFS3ERR_STALE:
+        case NFS3ERR_DQUOT:
+            return GF_LOG_WARNING;
+
+        default:
+            return GF_LOG_DEBUG;
+    }
+}
+
+/* Choose the log level for an NFSv3 FSSTAT result.
+ *
+ * The statuses listed in the switch are reported at GF_LOG_WARNING; any
+ * other nfsstat3 value is reported at GF_LOG_DEBUG.
+ */
+int
+nfs3_fsstat_loglevel(nfsstat3 stat)
+{
+    switch (stat) {
+        case NFS3ERR_PERM:
+        case NFS3ERR_NOENT:
+        case NFS3ERR_ACCES:
+        case NFS3ERR_EXIST:
+        case NFS3ERR_XDEV:
+        case NFS3ERR_NODEV:
+        case NFS3ERR_IO:
+        case NFS3ERR_NXIO:
+        case NFS3ERR_NOTDIR:
+        case NFS3ERR_ISDIR:
+        case NFS3ERR_INVAL:
+        case NFS3ERR_NOSPC:
+        case NFS3ERR_ROFS:
+        case NFS3ERR_FBIG:
+        case NFS3ERR_MLINK:
+        case NFS3ERR_NAMETOOLONG:
+        case NFS3ERR_NOTEMPTY:
+        case NFS3ERR_SERVERFAULT:
+        case NFS3ERR_NOTSUPP:
+        case NFS3ERR_BADHANDLE:
+        case NFS3ERR_STALE:
+        case NFS3ERR_DQUOT:
+            return GF_LOG_WARNING;
+
+        default:
+            return GF_LOG_DEBUG;
+    }
+}
+
+/* Pairs an NFSv3 procedure number with its printable name. */
+struct nfs3op_str {
+    int op;
+    char str[100];
+};
+
+/* Printable names of the NFSv3 procedures.
+ *
+ * Code in this file indexes the table directly with the procedure number
+ * (nfs3op_strings[op].str), so the entry order must not be changed.
+ * Presumably each entry's position equals its NFS3_* value -- confirm
+ * against the enum before reordering or inserting entries.
+ */
+struct nfs3op_str nfs3op_strings[] = {
+    {NFS3_NULL, "NULL"},
+    {NFS3_GETATTR, "GETATTR"},
+    {NFS3_SETATTR, "SETATTR"},
+    {NFS3_LOOKUP, "LOOKUP"},
+    {NFS3_ACCESS, "ACCESS"},
+    {NFS3_READLINK, "READLINK"},
+    {NFS3_READ, "READ"},
+    {NFS3_WRITE, "WRITE"},
+    {NFS3_CREATE, "CREATE"},
+    {NFS3_MKDIR, "MKDIR"},
+    {NFS3_SYMLINK, "SYMLINK"},
+    {NFS3_MKNOD, "MKNOD"},
+    {NFS3_REMOVE, "REMOVE"},
+    {NFS3_RMDIR, "RMDIR"},
+    {NFS3_RENAME, "RENAME"},
+    {NFS3_LINK, "LINK"},
+    {NFS3_READDIR, "READDIR"},
+    {NFS3_READDIRP, "READDIRP"},
+    {NFS3_FSSTAT, "FSSTAT"},
+    {NFS3_FSINFO, "FSINFO"},
+    {NFS3_PATHCONF, "PATHCONF"},
+    {NFS3_COMMIT, "COMMIT"},
+};
+
+/* Return the log level to use for the result of NFSv3 procedure nfs_op
+ * that finished with status stat.
+ *
+ * Each procedure delegates to its per-procedure helper. Some procedures
+ * share a helper: READDIR/READDIRP, FSSTAT/FSINFO/PATHCONF and
+ * WRITE/COMMIT. Any procedure without a case here (NFS3_NULL included)
+ * gets GF_LOG_DEBUG.
+ */
+int
+nfs3_loglevel(int nfs_op, nfsstat3 stat)
+{
+    switch (nfs_op) {
+        case NFS3_GETATTR:
+            return nfs3_getattr_loglevel(stat);
+
+        case NFS3_SETATTR:
+            return nfs3_setattr_loglevel(stat);
+
+        case NFS3_LOOKUP:
+            return nfs3_lookup_loglevel(stat);
+
+        case NFS3_ACCESS:
+            return nfs3_access_loglevel(stat);
+
+        case NFS3_READLINK:
+            return nfs3_readlink_loglevel(stat);
+
+        case NFS3_READ:
+            return nfs3_read_loglevel(stat);
+
+        case NFS3_WRITE:
+        case NFS3_COMMIT:
+            return nfs3_write_loglevel(stat);
+
+        case NFS3_CREATE:
+            return nfs3_create_loglevel(stat);
+
+        case NFS3_MKDIR:
+            return nfs3_mkdir_loglevel(stat);
+
+        case NFS3_SYMLINK:
+            return nfs3_symlink_loglevel(stat);
+
+        case NFS3_MKNOD:
+            return nfs3_mknod_loglevel(stat);
+
+        case NFS3_REMOVE:
+            return nfs3_remove_loglevel(stat);
+
+        case NFS3_RMDIR:
+            return nfs3_rmdir_loglevel(stat);
+
+        case NFS3_RENAME:
+            return nfs3_rename_loglevel(stat);
+
+        case NFS3_LINK:
+            return nfs3_link_loglevel(stat);
+
+        case NFS3_READDIR:
+        case NFS3_READDIRP:
+            return nfs3_readdir_loglevel(stat);
+
+        case NFS3_FSSTAT:
+        case NFS3_FSINFO:
+        case NFS3_PATHCONF:
+            return nfs3_fsstat_loglevel(stat);
+
+        default:
+            return GF_LOG_DEBUG;
+    }
+}
+
+/* Log the result of an NFSv3 procedure whose reply carries nothing beyond
+ * the status.
+ *
+ * xid, op, stat and pstat are rendered into one string by
+ * nfs3_stat_to_errstr(); path is printed in front of it. The message is
+ * skipped when the level chosen by nfs3_loglevel() is above the context's
+ * configured log level.
+ */
+void
+nfs3_log_common_res(uint32_t xid, int op, nfsstat3 stat, int pstat,
+                    const char *path)
+{
+    char errbuf[1024];
+    int loglvl = nfs3_loglevel(op, stat);
+
+    /* Skip the string formatting when the message would be dropped. */
+    if (loglvl > THIS->ctx->log.loglevel)
+        return;
+
+    nfs3_stat_to_errstr(xid, nfs3op_strings[op].str, stat, pstat, errbuf,
+                        sizeof(errbuf));
+
+    if (loglvl != GF_LOG_DEBUG) {
+        gf_msg(GF_NFS3, loglvl, errno, NFS_MSG_STAT_ERROR, "%s => (%s)", path,
+               errbuf);
+        return;
+    }
+
+    gf_msg_debug(GF_NFS3, 0, "%s => (%s)", path, errbuf);
+}
+
+/* Log the result of an NFSv3 READLINK, including the link target.
+ *
+ * The message is skipped when the level chosen by nfs3_loglevel() is above
+ * the context's configured log level.
+ */
+void
+nfs3_log_readlink_res(uint32_t xid, nfsstat3 stat, int pstat, char *linkpath,
+                      const char *path)
+{
+    char errbuf[1024];
+    int loglvl = nfs3_loglevel(NFS3_READLINK, stat);
+
+    if (loglvl > THIS->ctx->log.loglevel)
+        return;
+
+    nfs3_stat_to_errstr(xid, "READLINK", stat, pstat, errbuf, sizeof(errbuf));
+
+    if (loglvl != GF_LOG_DEBUG) {
+        gf_msg(GF_NFS3, loglvl, errno, NFS_MSG_STAT_ERROR,
+               "%s => (%s) target: %s", path, errbuf, linkpath);
+        return;
+    }
+
+    gf_msg_debug(GF_NFS3, 0, "%s => (%s), target: %s", path, errbuf,
+                 linkpath);
+}
+
+/* Log the result of an NFSv3 READ.
+ *
+ * Always prints the byte count and EOF flag; when vec is non-NULL the
+ * vector count and the length of the first iovec are printed as well. The
+ * message is skipped when the level chosen by nfs3_loglevel() is above the
+ * context's configured log level.
+ */
+void
+nfs3_log_read_res(uint32_t xid, nfsstat3 stat, int pstat, count3 count,
+                  int is_eof, struct iovec *vec, int32_t veccount,
+                  const char *path)
+{
+    char errstr[1024];
+    int ll = nfs3_loglevel(NFS3_READ, stat);
+
+    if (THIS->ctx->log.loglevel < ll)
+        return;
+
+    nfs3_stat_to_errstr(xid, "READ", stat, pstat, errstr, sizeof(errstr));
+
+    if (vec) {
+        /* iov_len is a size_t, so it is printed with %zu rather than %zd. */
+        if (ll == GF_LOG_DEBUG) {
+            gf_msg_debug(GF_NFS3, 0,
+                         "%s => (%s), count: %" PRIu32
+                         ", is_eof:"
+                         " %d, vector: count: %d, len: %zu",
+                         path, errstr, count, is_eof, veccount, vec->iov_len);
+        } else {
+            gf_msg(GF_NFS3, ll, errno, NFS_MSG_STAT_ERROR,
+                   "%s => (%s), count: %" PRIu32
+                   ", is_eof:"
+                   " %d, vector: count: %d, len: %zu",
+                   path, errstr, count, is_eof, veccount, vec->iov_len);
+        }
+    } else if (ll == GF_LOG_DEBUG) {
+        gf_msg_debug(GF_NFS3, 0,
+                     "%s => (%s), count: %" PRIu32
+                     ", is_eof:"
+                     " %d",
+                     path, errstr, count, is_eof);
+    } else {
+        gf_msg(GF_NFS3, ll, errno, NFS_MSG_STAT_ERROR,
+               "%s => (%s), count: %" PRIu32
+               ", is_eof:"
+               " %d",
+               path, errstr, count, is_eof);
+    }
+}
+
+/* Log the result of an NFSv3 WRITE: byte count, stability and write
+ * verifier.
+ *
+ * stable is printed as "UNSTABLE" only when it equals UNSTABLE; every other
+ * value is printed as "STABLE". The message is skipped when the level
+ * chosen by nfs3_loglevel() is above the context's configured log level.
+ */
+void
+nfs3_log_write_res(uint32_t xid, nfsstat3 stat, int pstat, count3 count,
+                   int stable, uint64_t wverf, const char *path)
+{
+    char errbuf[1024];
+    const char *how = NULL;
+    int loglvl = nfs3_loglevel(NFS3_WRITE, stat);
+
+    if (loglvl > THIS->ctx->log.loglevel)
+        return;
+
+    nfs3_stat_to_errstr(xid, "WRITE", stat, pstat, errbuf, sizeof(errbuf));
+
+    if (stable == UNSTABLE)
+        how = "UNSTABLE";
+    else
+        how = "STABLE";
+
+    if (loglvl != GF_LOG_DEBUG) {
+        gf_msg(GF_NFS3, loglvl, errno, NFS_MSG_STAT_ERROR,
+               "%s => (%s), count: %" PRIu32 ", %s,wverf: %" PRIu64, path,
+               errbuf, count, how, wverf);
+        return;
+    }
+
+    gf_msg_debug(GF_NFS3, 0,
+                 "%s => (%s), count: %" PRIu32 ", %s,wverf: %" PRIu64, path,
+                 errbuf, count, how, wverf);
+}
+
+/* Log the result of an NFSv3 procedure that returns a new file handle.
+ *
+ * The status is rendered by nfs3_stat_to_errstr() and the handle by
+ * nfs3_fh_to_str(). The message is skipped when the level chosen by
+ * nfs3_loglevel() is above the context's configured log level.
+ */
+void
+nfs3_log_newfh_res(uint32_t xid, int op, nfsstat3 stat, int pstat,
+                   struct nfs3_fh *newfh, const char *path)
+{
+    char errstr[1024];
+    char fhstr[1024];
+    int ll = nfs3_loglevel(op, stat);
+
+    if (THIS->ctx->log.loglevel < ll)
+        return;
+
+    nfs3_stat_to_errstr(xid, nfs3op_strings[op].str, stat, pstat, errstr,
+                        sizeof(errstr));
+    nfs3_fh_to_str(newfh, fhstr, sizeof(fhstr));
+
+    if (ll == GF_LOG_DEBUG)
+        gf_msg_debug(GF_NFS3, 0, "%s => (%s), %s", path, errstr, fhstr);
+    else
+        /* Reuse the level computed above instead of calling
+         * nfs3_loglevel() a second time with the same arguments.
+         */
+        gf_msg(GF_NFS3, ll, errno, NFS_MSG_STAT_ERROR, "%s => (%s), %s", path,
+               errstr, fhstr);
+}
+
+/* Log the result of an NFSv3 READDIR: count, cookie verifier and EOF flag.
+ *
+ * The message is skipped when the level chosen by nfs3_loglevel() is above
+ * the context's configured log level.
+ */
+void
+nfs3_log_readdir_res(uint32_t xid, nfsstat3 stat, int pstat, uint64_t cverf,
+                     count3 count, int is_eof, const char *path)
+{
+    char errbuf[1024];
+    int loglvl = nfs3_loglevel(NFS3_READDIR, stat);
+
+    if (loglvl > THIS->ctx->log.loglevel)
+        return;
+
+    nfs3_stat_to_errstr(xid, "READDIR", stat, pstat, errbuf, sizeof(errbuf));
+
+    if (loglvl != GF_LOG_DEBUG) {
+        gf_msg(GF_NFS3, loglvl, errno, NFS_MSG_STAT_ERROR,
+               "%s => (%s), count: %" PRIu32 ", cverf: %" PRIu64 ", is_eof: %d",
+               path, errbuf, count, cverf, is_eof);
+        return;
+    }
+
+    gf_msg_debug(GF_NFS3, 0,
+                 "%s => (%s), count: %" PRIu32 ", cverf: %" PRIu64
+                 ", is_eof: %d",
+                 path, errbuf, count, cverf, is_eof);
+}
+
+/* Log the result of an NFSv3 READDIRPLUS: dircount, maxcount, cookie
+ * verifier and EOF flag.
+ *
+ * The message is skipped when the level chosen by nfs3_loglevel() is above
+ * the context's configured log level.
+ */
+void
+nfs3_log_readdirp_res(uint32_t xid, nfsstat3 stat, int pstat, uint64_t cverf,
+                      count3 dircount, count3 maxcount, int is_eof,
+                      const char *path)
+{
+    char errbuf[1024];
+    int loglvl = nfs3_loglevel(NFS3_READDIRP, stat);
+
+    if (loglvl > THIS->ctx->log.loglevel)
+        return;
+
+    nfs3_stat_to_errstr(xid, "READDIRPLUS", stat, pstat, errbuf,
+                        sizeof(errbuf));
+
+    if (loglvl != GF_LOG_DEBUG) {
+        gf_msg(GF_NFS3, loglvl, errno, NFS_MSG_STAT_ERROR,
+               "%s => (%s), dircount: %" PRIu32 ", maxcount: %" PRIu32
+               ", cverf: %" PRIu64 ", is_eof: %d",
+               path, errbuf, dircount, maxcount, cverf, is_eof);
+        return;
+    }
+
+    gf_msg_debug(GF_NFS3, 0,
+                 "%s => (%s), dircount: %" PRIu32 ", maxcount: %" PRIu32
+                 ", cverf: %" PRIu64 ", is_eof: %d",
+                 path, errbuf, dircount, maxcount, cverf, is_eof);
+}
+
+/* Log the result of an NFSv3 COMMIT together with the write verifier.
+ *
+ * The message is skipped when the level chosen by nfs3_loglevel() is above
+ * the context's configured log level.
+ */
+void
+nfs3_log_commit_res(uint32_t xid, nfsstat3 stat, int pstat, uint64_t wverf,
+                    const char *path)
+{
+    char errbuf[1024];
+    int loglvl = nfs3_loglevel(NFS3_COMMIT, stat);
+
+    if (loglvl > THIS->ctx->log.loglevel)
+        return;
+
+    nfs3_stat_to_errstr(xid, "COMMIT", stat, pstat, errbuf, sizeof(errbuf));
+
+    if (loglvl != GF_LOG_DEBUG) {
+        gf_msg(GF_NFS3, loglvl, errno, NFS_MSG_STAT_ERROR,
+               "%s => (%s), wverf: %" PRIu64, path, errbuf, wverf);
+        return;
+    }
+
+    gf_msg_debug(GF_NFS3, 0, "%s => (%s), wverf: %" PRIu64, path, errbuf,
+                 wverf);
+}
+
+/* Debug-log the arguments of a READDIR or READDIRPLUS call.
+ *
+ * A maxcount of 0 is logged as READDIR (dircount printed as "count"); any
+ * other maxcount is logged as READDIRPLUS with both counts. Nothing is
+ * formatted unless the context's log level is at least GF_LOG_DEBUG.
+ */
+void
+nfs3_log_readdir_call(uint32_t xid, struct nfs3_fh *fh, count3 dircount,
+                      count3 maxcount)
+{
+    char fhstr[1024];
+
+    if (THIS->ctx->log.loglevel < GF_LOG_DEBUG)
+        return;
+
+    nfs3_fh_to_str(fh, fhstr, sizeof(fhstr));
+
+    /* The counts are cast to uint32_t, so print them with PRIu32; "%d"
+     * would show values above INT_MAX as negative numbers.
+     */
+    if (maxcount == 0)
+        gf_msg_debug(GF_NFS3, 0,
+                     "XID: %x, READDIR: args: %s,"
+                     " count: %" PRIu32,
+                     xid, fhstr, (uint32_t)dircount);
+    else
+        gf_msg_debug(GF_NFS3, 0,
+                     "XID: %x, READDIRPLUS: args: %s,"
+                     " dircount: %" PRIu32 ", maxcount: %" PRIu32,
+                     xid, fhstr, (uint32_t)dircount, (uint32_t)maxcount);
+}
+
+/* Complete a file-handle resolution for which the inode is already known.
+ *
+ * Fills cs->resolvedloc from inode and, if that succeeds, resumes the call.
+ *
+ * Returns -EFAULT when cs or inode is NULL; otherwise the value returned by
+ * nfs_inode_loc_fill() (negative on failure, in which case the call is not
+ * resumed here).
+ */
+int
+nfs3_fh_resolve_inode_done(nfs3_call_state_t *cs, inode_t *inode)
+{
+    int ret;
+
+    if (cs == NULL || inode == NULL)
+        return -EFAULT;
+
+    gf_msg_trace(GF_NFS3, 0, "FH inode resolved");
+
+    ret = nfs_inode_loc_fill(inode, &cs->resolvedloc, NFS_RESOLVE_EXIST);
+    if (ret >= 0) {
+        nfs3_call_resume(cs);
+        return ret;
+    }
+
+    gf_msg(GF_NFS3, GF_LOG_ERROR, -ret, NFS_MSG_INODE_LOC_FILL_ERROR,
+           "inode loc fill failed");
+    return ret;
+}
+
+/* Lookup callback for the entry lookup issued by
+ * nfs3_fh_resolve_entry_hard().
+ *
+ * Stores op_ret/op_errno in the call state found in frame->local. On
+ * success it caches the returned attributes, links the inode into the
+ * inode table and swaps the linked inode into cs->resolvedloc. The call is
+ * resumed in every case. Always returns 0.
+ */
+int32_t
+nfs3_fh_resolve_entry_lookup_cbk(call_frame_t *frame, void *cookie,
+                                 xlator_t *this, int32_t op_ret,
+                                 int32_t op_errno, inode_t *inode,
+                                 struct iatt *buf, dict_t *xattr,
+                                 struct iatt *postparent)
+{
+    nfs3_call_state_t *cs = frame->local;
+    inode_t *linked = NULL;
+
+    cs->resolve_ret = op_ret;
+    cs->resolve_errno = op_errno;
+
+    if (op_ret == -1) {
+        /* ENOENT is only traced; any other errno is logged as an error. */
+        if (op_errno != ENOENT) {
+            gf_msg(GF_NFS3, GF_LOG_ERROR, op_errno, NFS_MSG_LOOKUP_FAIL,
+                   "Lookup failed: %s: %s", cs->resolvedloc.path,
+                   strerror(op_errno));
+        } else {
+            gf_msg_trace(GF_NFS3, 0, "Lookup failed: %s: %s",
+                         cs->resolvedloc.path, strerror(op_errno));
+        }
+    } else {
+        gf_msg_trace(GF_NFS3, 0, "Entry looked up: %s", cs->resolvedloc.path);
+
+        memcpy(&cs->stbuf, buf, sizeof(*buf));
+        memcpy(&cs->postparent, postparent, sizeof(*postparent));
+
+        linked = inode_link(inode, cs->resolvedloc.parent,
+                            cs->resolvedloc.name, buf);
+        if (!linked) {
+            /* nfs3_fh_resolve_entry_hard() is used to resolve the entire
+             * path if needed, so the ctx of the inode obtained here must be
+             * set properly; otherwise it may result in a crash.
+             */
+            nfs_fix_generation(this, inode);
+        } else {
+            nfs_fix_generation(this, linked);
+            inode_lookup(linked);
+            inode_unref(cs->resolvedloc.inode);
+            cs->resolvedloc.inode = linked;
+        }
+    }
+
+    nfs3_call_resume(cs);
+    return 0;
+}
+
+/* Lookup callback for the gfid lookup issued by
+ * nfs3_fh_resolve_inode_hard().
+ *
+ * Stores op_ret/op_errno in the call state found in frame->local. On
+ * failure the call is resumed straight away. On success the returned
+ * attributes are cached, the inode is linked into the inode table, and
+ * then either entry resolution continues (cs->resolventry set) or the call
+ * is resumed. Always returns 0.
+ */
+int32_t
+nfs3_fh_resolve_inode_lookup_cbk(call_frame_t *frame, void *cookie,
+                                 xlator_t *this, int32_t op_ret,
+                                 int32_t op_errno, inode_t *inode,
+                                 struct iatt *buf, dict_t *xattr,
+                                 struct iatt *postparent)
+{
+    nfs3_call_state_t *cs = NULL;
+    inode_t *linked_inode = NULL;
+
+    cs = frame->local;
+    cs->resolve_ret = op_ret;
+    cs->resolve_errno = op_errno;
+
+    if (op_ret == -1) {
+        /* ENOENT is only traced; any other errno is logged as an error. */
+        if (op_errno == ENOENT) {
+            gf_msg_trace(GF_NFS3, 0, "Lookup failed: %s: %s",
+                         cs->resolvedloc.path, strerror(op_errno));
+        } else {
+            gf_msg(GF_NFS3, GF_LOG_ERROR, op_errno, NFS_MSG_LOOKUP_FAIL,
+                   "Lookup failed: %s: %s", cs->resolvedloc.path,
+                   strerror(op_errno));
+        }
+        nfs3_call_resume(cs);
+        goto err;
+    }
+
+    memcpy(&cs->stbuf, buf, sizeof(*buf));
+    /* NOTE(review): this copies `buf`, not `postparent`, into
+     * cs->postparent, whereas nfs3_fh_resolve_entry_lookup_cbk() copies
+     * `postparent`. Confirm whether that is intentional.
+     */
+    memcpy(&cs->postparent, buf, sizeof(*postparent));
+    linked_inode = inode_link(inode, cs->resolvedloc.parent,
+                              cs->resolvedloc.name, buf);
+    if (linked_inode) {
+        /* Replace the loc's inode with the one linked into the table. */
+        nfs_fix_generation(this, linked_inode);
+        inode_lookup(linked_inode);
+        inode_unref(cs->resolvedloc.inode);
+        cs->resolvedloc.inode = linked_inode;
+    }
+
+    /* If it is an entry lookup and we landed in the callback for hard
+     * inode resolution, it means the parent inode was not available and
+     * had to be resolved first. Now that is done, lets head back into
+     * entry resolution.
+     */
+    if (cs->resolventry)
+        nfs3_fh_resolve_entry_hard(cs);
+    else
+        nfs3_call_resume(cs);
+err:
+    return 0;
+}
+
+/* Resolve cs->resolvefh by sending a lookup on its gfid.
+ *
+ * Takes no extra argument: the file handle to resolve is cs->resolvefh and
+ * resolution needs to start looking from the root. The lookup result is
+ * delivered to nfs3_fh_resolve_inode_lookup_cbk().
+ *
+ * Returns -EFAULT when cs is NULL, the negative value from
+ * nfs_gfid_loc_fill() when the loc cannot be filled, and otherwise the
+ * value returned by nfs_lookup().
+ */
+int
+nfs3_fh_resolve_inode_hard(nfs3_call_state_t *cs)
+{
+    int ret;
+    nfs_user_t nfu = {
+        0,
+    };
+
+    if (cs == NULL)
+        return -EFAULT;
+
+    gf_msg_trace(GF_NFS3, 0, "FH hard resolution for: gfid 0x%s",
+                 uuid_utoa(cs->resolvefh.gfid));
+
+    cs->hardresolved = 1;
+    nfs_loc_wipe(&cs->resolvedloc);
+
+    ret = nfs_gfid_loc_fill(cs->vol->itable, cs->resolvefh.gfid,
+                            &cs->resolvedloc, NFS_RESOLVE_CREATE);
+    if (ret < 0) {
+        gf_msg(GF_NFS3, GF_LOG_ERROR, -ret, NFS_MSG_INODE_LOC_FILL_ERROR,
+               "Failed to fill loc using gfid: %s", strerror(-ret));
+        return ret;
+    }
+
+    nfs_user_root_create(&nfu);
+    return nfs_lookup(cs->nfsx, cs->vol, &nfu, &cs->resolvedloc,
+                      nfs3_fh_resolve_inode_lookup_cbk, cs);
+}
+
+/* Resolve the entry cs->resolventry under the directory identified by
+ * cs->resolvefh.gfid.
+ *
+ * The value returned by nfs_entry_loc_fill() selects the next step:
+ *   -2  the entry needs a lookup: either resume the call directly (fresh
+ *       lookup for a LOOKUP or non-exclusive CREATE op) or send a lookup
+ *       whose result goes to nfs3_fh_resolve_entry_lookup_cbk();
+ *   -1  the parent needs a lookup: fall back to
+ *       nfs3_fh_resolve_inode_hard();
+ *    0  the entry is resolved: resume the call.
+ * Any other value is returned to the caller without further action.
+ *
+ * Returns -EFAULT when cs is NULL, 0 for the -2 and 0 cases, the value from
+ * nfs3_fh_resolve_inode_hard() for the -1 case.
+ */
+int
+nfs3_fh_resolve_entry_hard(nfs3_call_state_t *cs)
+{
+    int ret = -EFAULT;
+    nfs_user_t nfu = {
+        0,
+    };
+    gf_boolean_t freshlookup = _gf_false;
+
+    if (!cs)
+        return ret;
+
+    nfs_loc_wipe(&cs->resolvedloc);
+    nfs_user_root_create(&nfu);
+    gf_msg_trace(GF_NFS3, 0,
+                 "FH hard resolution: gfid: %s "
+                 ", entry: %s",
+                 uuid_utoa(cs->resolvefh.gfid), cs->resolventry);
+
+    ret = nfs_entry_loc_fill(cs->nfsx, cs->vol->itable, cs->resolvefh.gfid,
+                             cs->resolventry, &cs->resolvedloc,
+                             NFS_RESOLVE_CREATE, &freshlookup);
+
+    if (ret == -2) {
+        gf_msg_trace(GF_NFS3, 0, "Entry needs lookup: %s",
+                     cs->resolvedloc.path);
+        /* If the NFS op is lookup, let the resume callback
+         * handle the sending of the lookup fop. Similarly,
+         * if the NFS op is create, let the create call
+         * go ahead in the resume callback so that an EEXIST gets
+         * handled at posix without an extra fop at this point.
+         */
+        if (freshlookup &&
+            (nfs3_lookup_op(cs) ||
+             (nfs3_create_op(cs) && !nfs3_create_exclusive_op(cs)))) {
+            cs->lookuptype = GF_NFS3_FRESH;
+            cs->resolve_ret = 0;
+            cs->hardresolved = 0;
+            nfs3_call_resume(cs);
+        } else {
+            cs->hardresolved = 1;
+            /* NOTE(review): the return value of nfs_lookup() is ignored
+             * here and ret is forced to 0 below; confirm a failed send
+             * cannot leave the call un-resumed.
+             */
+            nfs_lookup(cs->nfsx, cs->vol, &nfu, &cs->resolvedloc,
+                       nfs3_fh_resolve_entry_lookup_cbk, cs);
+        }
+        ret = 0;
+    } else if (ret == -1) {
+        gf_msg_trace(GF_NFS3, 0, "Entry needs parent lookup: %s",
+                     cs->resolvedloc.path);
+        ret = nfs3_fh_resolve_inode_hard(cs);
+    } else if (ret == 0) {
+        cs->resolve_ret = 0;
+        nfs3_call_resume(cs);
+    }
+
+    return ret;
+}
+
+/* Resolve cs->resolvefh to an inode.
+ *
+ * When the inode is found in the volume's inode table and inode_ctx_get()
+ * succeeds for this xlator, resolution finishes via
+ * nfs3_fh_resolve_inode_done(); otherwise it falls back to
+ * nfs3_fh_resolve_inode_hard(). The reference taken by inode_find() is
+ * dropped before returning.
+ *
+ * Returns -EFAULT when cs is NULL, otherwise the chosen helper's result.
+ */
+int
+nfs3_fh_resolve_inode(nfs3_call_state_t *cs)
+{
+    inode_t *found = NULL;
+    xlator_t *this = NULL;
+    int ret;
+
+    if (cs == NULL)
+        return -EFAULT;
+
+    this = cs->nfsx;
+    gf_msg_trace(GF_NFS3, 0, "FH needs inode resolution");
+    gf_uuid_copy(cs->resolvedloc.gfid, cs->resolvefh.gfid);
+
+    found = inode_find(cs->vol->itable, cs->resolvefh.gfid);
+    if (found && !inode_ctx_get(found, this, NULL)) {
+        ret = nfs3_fh_resolve_inode_done(cs, found);
+    } else {
+        ret = nfs3_fh_resolve_inode_hard(cs);
+    }
+
+    if (found != NULL)
+        inode_unref(found);
+
+    return ret;
+}
+
+/* Resolve the entry named in the call state.
+ *
+ * Returns -EFAULT when cs is NULL, otherwise whatever
+ * nfs3_fh_resolve_entry_hard() returns.
+ */
+int
+nfs3_fh_resolve_entry(nfs3_call_state_t *cs)
+{
+    return cs ? nfs3_fh_resolve_entry_hard(cs) : -EFAULT;
+}
+
+/* Continue file-handle resolution.
+ *
+ * When no earlier failure is recorded in cs->resolve_ret, resolves either
+ * the inode or the entry depending on cs->resolventry. If an earlier
+ * failure is recorded, or the resolver returns a negative value, the call
+ * is resumed with resolve_ret = -1 and resolve_errno = EFAULT.
+ *
+ * Returns -EFAULT when cs is NULL; otherwise 0 after a failure was handed
+ * to the resume path, or the resolver's non-negative result.
+ */
+int
+nfs3_fh_resolve_resume(nfs3_call_state_t *cs)
+{
+    int ret = -EFAULT;
+
+    if (cs == NULL)
+        return ret;
+
+    if (cs->resolve_ret >= 0) {
+        if (cs->resolventry)
+            ret = nfs3_fh_resolve_entry(cs);
+        else
+            ret = nfs3_fh_resolve_inode(cs);
+    }
+
+    if (ret < 0) {
+        /* NOTE(review): when an earlier stage already failed, its
+         * resolve_errno is overwritten with EFAULT here -- confirm that
+         * losing the original errno is intended.
+         */
+        cs->resolve_ret = -1;
+        cs->resolve_errno = EFAULT;
+        nfs3_call_resume(cs);
+        ret = 0;
+    }
+
+    return ret;
+}
+
+int32_t
+nfs3_fh_resolve_root_lookup_cbk(call_frame_t *frame, void *cookie,
+                                xlator_t *this, int32_t op_ret,
+                                int32_t op_errno, inode_t *inode,
+                                struct iatt *buf, dict_t *xattr,
+                                struct iatt *postparent)
+{
+    nfs3_call_state_t *cs = frame->local;
+
+    /* Publish the lookup outcome so the resume path can act on it. */
+    cs->resolve_ret = op_ret;
+    cs->resolve_errno = op_errno;
+
+    if (op_ret != -1) {
+        gf_msg_trace(GF_NFS3, 0, "Root looked up: %s", cs->resolvedloc.path);
+        /* Remember the success so later requests skip the root lookup. */
+        nfs3_set_root_looked_up(cs->nfs3state, &cs->resolvefh);
+    } else {
+        gf_msg(GF_NFS3, GF_LOG_ERROR, op_errno, NFS_MSG_LOOKUP_ROOT_FAIL,
+               "Root lookup failed: %s", strerror(op_errno));
+    }
+
+    nfs3_fh_resolve_resume(cs);
+    return 0;
+}
+
+int
+nfs3_fh_resolve_root(nfs3_call_state_t *cs)
+{
+    nfs_user_t rootuser = {
+        0,
+    };
+    int ret = -EFAULT;
+
+    if (cs == NULL)
+        return ret;
+
+    /* Once the root of this export has been looked up, resolution of
+     * the handle itself can start right away. */
+    if (nfs3_is_root_looked_up(cs->nfs3state, &cs->resolvefh))
+        return nfs3_fh_resolve_resume(cs);
+
+    nfs_user_root_create(&rootuser);
+    gf_msg_trace(GF_NFS3, 0, "Root needs lookup");
+
+    ret = nfs_root_loc_fill(cs->vol->itable, &cs->resolvedloc);
+    if (ret < 0) {
+        gf_msg(GF_NFS3, GF_LOG_ERROR, -ret, NFS_MSG_LOOKUP_ROOT_FAIL,
+               "Failed to lookup root from itable: %s", strerror(-ret));
+        return ret;
+    }
+
+    /* Otherwise look the root up first; the callback resumes the
+     * handle resolution once the lookup has completed. */
+    return nfs_lookup(cs->nfsx, cs->vol, &rootuser, &cs->resolvedloc,
+                      nfs3_fh_resolve_root_lookup_cbk, cs);
+}
+
+/**
+ * __nfs3_fh_auth_get_peer -- Get a peer name from the rpc request object
+ *
+ * @req : The request to get host/peer from
+ * @peer: Caller-provided buffer, written with RPCSVC_PEER_STRLEN as limit
+ *
+ * Returns what rpcsvc_transport_peeraddr() returned; non-zero is logged.
+ */
+int
+__nfs3_fh_auth_get_peer(const rpcsvc_request_t *req, char *peer)
+{
+    struct sockaddr_storage addr = {
+        0,
+    };
+    int ret = 0;
+
+    /* The caller hands in the buffer instead of receiving a malloc()ed
+     * string: heap allocations are avoided in the IO path for speed, so
+     * allocations are kept on the stack wherever possible.
+     */
+    ret = rpcsvc_transport_peeraddr(rpcsvc_request_transport(req), peer,
+                                    RPCSVC_PEER_STRLEN, &addr, sizeof(addr));
+    if (ret == 0)
+        return 0;
+
+    gf_msg(GF_NFS3, GF_LOG_WARNING, 0, NFS_MSG_GET_PEER_ADDR_FAIL,
+           "Failed to get peer addr: %s", gai_strerror(ret));
+    return ret;
+}
+
+/*
+ * nfs3_fh_auth_nfsop () -- Checks if an nfsop is authorized.
+ *
+ * @cs: The NFS call state containing all the relevant information
+ * @is_write_op: Forwarded to mnt3_authenticate_request() as its last argument
+ *
+ * @return: 0 if authorized
+ *          -EACCES for completely unauthorized fop
+ *          -EROFS  for unauthorized write operations (rm, mkdir, write)
+ */
+int
+nfs3_fh_auth_nfsop(nfs3_call_state_t *cs, gf_boolean_t is_write_op)
+{
+    struct nfs_state *nfs = (struct nfs_state *)cs->nfsx->private;
+    struct mount3_state *mountstate = (struct mount3_state *)nfs->mstate;
+
+    /* Authorization is decided by the mount layer for the resolved fh. */
+    return mnt3_authenticate_request(mountstate, cs->req, &cs->resolvefh, NULL,
+                                     NULL, NULL, NULL, is_write_op);
+}
+
+int
+nfs3_fh_resolve_and_resume(nfs3_call_state_t *cs, struct nfs3_fh *fh,
+                           char *entry, nfs3_resume_fn_t resum_fn)
+{
+    if (cs == NULL || fh == NULL)
+        return -EFAULT;
+
+    /* Stash everything the resolution needs in the call state: the
+     * continuation, a copy of the handle and a reset hash index.
+     */
+    cs->hashidx = 0;
+    cs->resolvefh = *fh;
+    cs->resume_fn = resum_fn;
+
+    /* Two kinds of resolution are supported:
+     *   a. fh only            (entry == NULL)
+     *   b. (fh, basename)     (entry != NULL)
+     * For (b) the name is duplicated into the call state; a failed
+     * gf_strdup() is reported as -EFAULT without starting resolution.
+     */
+    if (entry != NULL) {
+        cs->resolventry = gf_strdup(entry);
+        if (cs->resolventry == NULL)
+            return -EFAULT;
+    }
+
+    /* Both kinds start by making sure the export root is looked up;
+     * nfs3_fh_resolve_root() takes it from there. */
+    return nfs3_fh_resolve_root(cs);
+}
diff --git a/xlators/nfs/server/src/nfs3-helpers.h b/xlators/nfs/server/src/nfs3-helpers.h
new file mode 100644
index 00000000000..aead636c831
--- /dev/null
+++ b/xlators/nfs/server/src/nfs3-helpers.h
@@ -0,0 +1,339 @@
+/*
+  Copyright (c) 2010-2011 Gluster, Inc. <http://www.gluster.com>
+  This file is part of GlusterFS.
+
+  This file is licensed to you under your choice of the GNU Lesser
+  General Public License, version 3 or any later version (LGPLv3 or
+  later), or the GNU General Public License, version 2 (GPLv2), in all
+  cases as published by the Free Software Foundation.
+*/
+
+#ifndef _NFS3_HELPER_H_
+#define _NFS3_HELPER_H_
+
+#include <glusterfs/xlator.h>
+#include "nfs3.h"
+#include "nfs3-fh.h"
+#include "msg-nfs3.h"
+#include "xdr-nfs3.h"
+
+#include <sys/statvfs.h>
+
+#define GF_NFS3_FD_CACHED 0xcaced
+
+extern struct nfs3_fh
+nfs3_extract_lookup_fh(lookup3args *args);
+
+extern char *
+nfs3_extract_lookup_name(lookup3args *args);
+
+extern nfsstat3
+nfs3_errno_to_nfsstat3(int errnum);
+
+extern nfsstat3 nfs3_cbk_errno_status(int32_t, int32_t);
+
+extern void
+nfs3_fill_lookup3res(lookup3res *res, nfsstat3 stat, struct nfs3_fh *newfh,
+                     struct iatt *stbuf, struct iatt *postparent,
+                     uint64_t deviceid);
+
+extern post_op_attr
+nfs3_stat_to_post_op_attr(struct iatt *buf);
+
+extern struct nfs3_fh
+nfs3_extract_getattr_fh(getattr3args *args);
+
+extern void
+nfs3_fill_getattr3res(getattr3res *res, nfsstat3 stat, struct iatt *buf,
+                      uint64_t deviceid);
+
+extern struct nfs3_fh
+nfs3_extract_fsinfo_fh(fsinfo3args *args);
+
+extern void
+nfs3_fill_fsinfo3res(struct nfs3_state *nfs3, fsinfo3res *res, nfsstat3 status,
+                     struct iatt *fsroot, uint64_t deviceid);
+
+/* Functions containing _prep_ are used specifically to work around
+ * the memory allocations that happen inside Sun RPC library.
+ * In that library, there are numerous places where every NFS request
+ * can result in really tiny malloc calls. I fear the memory fragmentation
+ * that will follow. After studying the points at and the way in which malloc
+ * is called in Sun RPC, I've come up with this work-around. It is based on
+ * the idea that if the user/caller of the xdr_to_XXXXargs functions can provide
+ * already allocated memory or provide references to memory areas on its stack
+ * just for the short-term purpose of decoding the message from XDR format, we
+ * can avoid the memory allocations in Sun RPC. This is based on the fact
+ * that Sun RPC first checks whether structure members which require memory
+ * are NULL or not and only then calls malloc. In this case, if the caller
+ * provided references are non-NULL, then the if-branches containing malloc
+ * in Sun RPC will be avoided.
+ * PS: You're not expected to understand this unless you've spent some time
+ * looking through the glibc/sunrpc sources.
+ */
+extern void
+nfs3_prep_lookup3args(lookup3args *args, struct nfs3_fh *fh, char *name);
+
+extern void
+nfs3_prep_getattr3args(getattr3args *args, struct nfs3_fh *fh);
+
+extern void
+nfs3_prep_fsinfo3args(fsinfo3args *args, struct nfs3_fh *root);
+
+extern char *
+nfsstat3_strerror(int stat);
+
+extern void
+nfs3_prep_access3args(access3args *args, struct nfs3_fh *fh);
+
+extern void
+nfs3_fill_access3res(access3res *res, nfsstat3 status, int32_t accbits,
+                     int32_t reqaccbits);
+
+extern char *
+nfs3_fhcache_getpath(struct nfs3_state *nfs3, struct nfs3_fh *fh);
+
+extern int
+nfs3_fhcache_putpath(struct nfs3_state *nfs3, struct nfs3_fh *fh, char *path);
+
+extern void
+nfs3_prep_readdir3args(readdir3args *ra, struct nfs3_fh *fh);
+
+extern void
+nfs3_fill_readdir3res(readdir3res *res, nfsstat3 stat, struct nfs3_fh *dfh,
+                      uint64_t cverf, struct iatt *dirstat,
+                      gf_dirent_t *entries, count3 count, int is_eof,
+                      uint64_t deviceid);
+
+extern void
+nfs3_prep_readdirp3args(readdirp3args *ra, struct nfs3_fh *fh);
+
+extern void
+nfs3_fill_readdirp3res(readdirp3res *res, nfsstat3 stat, struct nfs3_fh *dirfh,
+                       uint64_t cverf, struct iatt *dirstat,
+                       gf_dirent_t *entries, count3 dircount, count3 maxcount,
+                       int is_eof, uint64_t deviceid);
+
+extern void
+nfs3_free_readdirp3res(readdirp3res *res);
+
+extern void
+nfs3_free_readdir3res(readdir3res *res);
+
+extern void
+nfs3_prep_fsstat3args(fsstat3args *args, struct nfs3_fh *fh);
+
+extern void
+nfs3_fill_fsstat3res(fsstat3res *res, nfsstat3 stat, struct statvfs *fsbuf,
+                     struct iatt *postbuf, uint64_t deviceid);
+
+extern int32_t
+nfs3_sattr3_to_setattr_valid(sattr3 *sattr, struct iatt *buf, mode_t *omode);
+extern void
+nfs3_fill_create3res(create3res *res, nfsstat3 stat, struct nfs3_fh *newfh,
+                     struct iatt *newbuf, struct iatt *preparent,
+                     struct iatt *postparent, uint64_t deviceid);
+
+extern void
+nfs3_prep_create3args(create3args *args, struct nfs3_fh *fh, char *name);
+
+extern void
+nfs3_prep_setattr3args(setattr3args *args, struct nfs3_fh *fh);
+
+extern void
+nfs3_fill_setattr3res(setattr3res *res, nfsstat3 stat, struct iatt *preop,
+                      struct iatt *postop, uint64_t deviceid);
+
+extern void
+nfs3_prep_mkdir3args(mkdir3args *args, struct nfs3_fh *dirfh, char *name);
+
+extern void
+nfs3_fill_mkdir3res(mkdir3res *res, nfsstat3 stat, struct nfs3_fh *fh,
+                    struct iatt *buf, struct iatt *preparent,
+                    struct iatt *postparent, uint64_t deviceid);
+
+extern void
+nfs3_prep_symlink3args(symlink3args *args, struct nfs3_fh *dirfh, char *name,
+                       char *target);
+
+extern void
+nfs3_fill_symlink3res(symlink3res *res, nfsstat3 stat, struct nfs3_fh *fh,
+                      struct iatt *buf, struct iatt *preparent,
+                      struct iatt *postparent, uint64_t deviceid);
+
+extern void
+nfs3_prep_readlink3args(readlink3args *args, struct nfs3_fh *fh);
+
+extern void
+nfs3_fill_readlink3res(readlink3res *res, nfsstat3 stat, char *path,
+                       struct iatt *buf, uint64_t deviceid);
+
+extern void
+nfs3_prep_mknod3args(mknod3args *args, struct nfs3_fh *fh, char *name);
+
+extern void
+nfs3_fill_mknod3res(mknod3res *res, nfsstat3 stat, struct nfs3_fh *fh,
+                    struct iatt *buf, struct iatt *preparent,
+                    struct iatt *postparent, uint64_t deviceid);
+
+extern void
+nfs3_fill_remove3res(remove3res *res, nfsstat3 stat, struct iatt *preparent,
+                     struct iatt *postparent, uint64_t deviceid);
+extern void
+nfs3_prep_remove3args(remove3args *args, struct nfs3_fh *fh, char *name);
+
+extern void
+nfs3_fill_rmdir3res(rmdir3res *res, nfsstat3 stat, struct iatt *preparent,
+                    struct iatt *postparent, uint64_t deviceid);
+
+extern void
+nfs3_prep_rmdir3args(rmdir3args *args, struct nfs3_fh *fh, char *name);
+
+extern void
+nfs3_fill_link3res(link3res *res, nfsstat3 stat, struct iatt *buf,
+                   struct iatt *preparent, struct iatt *postparent,
+                   uint64_t deviceid);
+
+extern void
+nfs3_prep_link3args(link3args *args, struct nfs3_fh *target,
+                    struct nfs3_fh *dirfh, char *name);
+
+extern void
+nfs3_prep_rename3args(rename3args *args, struct nfs3_fh *olddirfh,
+                      char *oldname, struct nfs3_fh *newdirfh, char *newname);
+
+extern void
+nfs3_fill_rename3res(rename3res *res, nfsstat3 stat, struct iatt *buf,
+                     struct iatt *preoldparent, struct iatt *postoldparent,
+                     struct iatt *prenewparent, struct iatt *postnewparent,
+                     uint64_t deviceid);
+
+extern void
+nfs3_prep_write3args(write3args *args, struct nfs3_fh *fh);
+
+extern void
+nfs3_fill_write3res(write3res *res, nfsstat3 stat, count3 count,
+                    stable_how stable, uint64_t wverf, struct iatt *prestat,
+                    struct iatt *poststat, uint64_t deviceid);
+
+extern void
+nfs3_prep_commit3args(commit3args *args, struct nfs3_fh *fh);
+
+extern void
+nfs3_fill_commit3res(commit3res *res, nfsstat3 stat, uint64_t wverf,
+                     struct iatt *prestat, struct iatt *poststat,
+                     uint64_t deviceid);
+
+extern void
+nfs3_fill_read3res(read3res *res, nfsstat3 stat, count3 count,
+                   struct iatt *poststat, int is_eof, uint64_t deviceid);
+
+extern void
+nfs3_prep_read3args(read3args *args, struct nfs3_fh *fh);
+
+extern void
+nfs3_prep_pathconf3args(pathconf3args *args, struct nfs3_fh *fh);
+
+extern void
+nfs3_fill_pathconf3res(pathconf3res *res, nfsstat3 stat, struct iatt *buf,
+                       uint64_t deviceid);
+
+extern int
+nfs3_cached_inode_opened(xlator_t *nfsxl, inode_t *inode);
+
+extern void
+nfs3_log_common_res(uint32_t xid, int op, nfsstat3 stat, int pstat,
+                    const char *path);
+
+extern void
+nfs3_log_readlink_res(uint32_t xid, nfsstat3 stat, int pstat, char *linkpath,
+                      const char *path);
+
+extern void
+nfs3_log_read_res(uint32_t xid, nfsstat3 stat, int pstat, count3 count,
+                  int is_eof, struct iovec *vec, int32_t vcount,
+                  const char *path);
+
+extern void
+nfs3_log_write_res(uint32_t xid, nfsstat3 stat, int pstat, count3 count,
+                   int stable, uint64_t wverf, const char *path);
+
+extern void
+nfs3_log_newfh_res(uint32_t xid, int op, nfsstat3 stat, int pstat,
+                   struct nfs3_fh *newfh, const char *path);
+
+extern void
+nfs3_log_readdir_res(uint32_t xid, nfsstat3 stat, int pstat, uint64_t cverf,
+                     count3 count, int is_eof, const char *path);
+
+extern void
+nfs3_log_readdirp_res(uint32_t xid, nfsstat3 stat, int pstat, uint64_t cverf,
+                      count3 dircount, count3 maxcount, int is_eof,
+                      const char *path);
+
+extern void
+nfs3_log_commit_res(uint32_t xid, nfsstat3 stat, int pstat, uint64_t wverf,
+                    const char *path);
+
+extern void
+nfs3_log_common_call(uint32_t xid, char *op, struct nfs3_fh *fh);
+
+extern void
+nfs3_log_fh_entry_call(uint32_t xid, char *op, struct nfs3_fh *fh, char *name);
+
+extern void
+nfs3_log_rw_call(uint32_t xid, char *op, struct nfs3_fh *fh, offset3 offt,
+                 count3 count, int stablewrite);
+
+extern void
+nfs3_log_create_call(uint32_t xid, struct nfs3_fh *fh, char *name,
+                     createmode3 mode);
+
+extern void
+nfs3_log_symlink_call(uint32_t xid, struct nfs3_fh *fh, char *name, char *tgt);
+
+extern void
+nfs3_log_mknod_call(uint32_t xid, struct nfs3_fh *fh, char *name, int type);
+
+extern void
+nfs3_log_rename_call(uint32_t xid, struct nfs3_fh *src, char *sname,
+                     struct nfs3_fh *dst, char *dname);
+
+extern void
+nfs3_log_link_call(uint32_t xid, struct nfs3_fh *fh, char *name,
+                   struct nfs3_fh *tgt);
+
+extern void
+nfs3_log_readdir_call(uint32_t xid, struct nfs3_fh *fh, count3 dircount,
+                      count3 maxcount);
+
+extern int
+nfs3_fh_resolve_entry_hard(nfs3_call_state_t *cs);
+
+extern int
+nfs3_fh_resolve_inode(nfs3_call_state_t *cs);
+
+extern int
+nfs3_fh_resolve_entry(nfs3_call_state_t *cs);
+
+extern int
+nfs3_fh_resolve_and_resume(nfs3_call_state_t *cs, struct nfs3_fh *fh,
+                           char *entry, nfs3_resume_fn_t resum_fn);
+
+extern int
+nfs3_verify_dircookie(struct nfs3_state *nfs3, fd_t *dirfd, cookie3 cookie,
+                      uint64_t cverf, nfsstat3 *stat);
+
+extern int
+nfs3_is_parentdir_entry(char *entry);
+
+uint32_t
+nfs3_request_to_accessbits(int32_t accbits);
+
+extern int
+nfs3_fh_auth_nfsop(nfs3_call_state_t *cs, gf_boolean_t is_write_op);
+
+void
+nfs3_map_deviceid_to_statdev(struct iatt *ia, uint64_t deviceid);
+
+#endif
diff --git a/xlators/nfs/server/src/nfs3.c b/xlators/nfs/server/src/nfs3.c
new file mode 100644
index 00000000000..f9042bc3b3f
--- /dev/null
+++ b/xlators/nfs/server/src/nfs3.c
@@ -0,0 +1,5738 @@
+/*
+  Copyright (c) 2010-2011 Gluster, Inc. <http://www.gluster.com>
+  This file is part of GlusterFS.
+
+  This file is licensed to you under your choice of the GNU Lesser
+  General Public License, version 3 or any later version (LGPLv3 or
+  later), or the GNU General Public License, version 2 (GPLv2), in all
+  cases as published by the Free Software Foundation.
+*/
+
+#include "rpcsvc.h"
+#include <glusterfs/dict.h>
+#include <glusterfs/xlator.h>
+#include "mount3.h"
+#include "xdr-nfs3.h"
+#include "msg-nfs3.h"
+#include <glusterfs/iobuf.h>
+#include "nfs3.h"
+#include <glusterfs/mem-pool.h>
+#include <glusterfs/logging.h>
+#include "nfs-common.h"
+#include "nfs-fops.h"
+#include "nfs-inodes.h"
+#include "nfs-generics.h"
+#include "nfs3-helpers.h"
+#include "nfs-mem-types.h"
+#include "nfs.h"
+#include "xdr-rpc.h"
+#include "xdr-generic.h"
+#include "nfs-messages.h"
+#include "glfs-internal.h"
+
+#include <sys/socket.h>
+#include <sys/uio.h>
+#include <sys/statvfs.h>
+#include <time.h>
+
+#define nfs3_validate_strlen_or_goto(str, len, label, status, retval)          \
+    do {                                                                       \
+        /* A NULL string is accepted; only an over-long one is rejected        \
+         * with NFS3ERR_NAMETOOLONG / -ENAMETOOLONG. */                        \
+        if ((str) && (strlen((str)) > (len))) {                                \
+            gf_msg(GF_NFS3, GF_LOG_ERROR, ENAMETOOLONG,                        \
+                   NFS_MSG_STR_TOO_LONG, "strlen too long");                   \
+            status = NFS3ERR_NAMETOOLONG;                                      \
+            retval = -ENAMETOOLONG;                                            \
+            goto label;                                                        \
+        }                                                                      \
+    } while (0);
+
+#define nfs3_validate_nfs3_state(request, state, status, label, retval)        \
+    do {                                                                       \
+        /* Fails the request if the program-private state is missing. */       \
+        state = rpcsvc_request_program_private(request);                       \
+        if (!state) {                                                          \
+            gf_msg(GF_NFS3, GF_LOG_ERROR, EFAULT, NFS_MSG_STATE_MISSING,       \
+                   "NFSv3 state missing from RPC request");                    \
+            status = NFS3ERR_SERVERFAULT;                                      \
+            retval = -EFAULT;                                                  \
+            goto label;                                                        \
+        }                                                                      \
+    } while (0);
+
+struct nfs3_export *
+__nfs3_get_export_by_index(struct nfs3_state *nfs3, uuid_t exportid)
+{
+    struct nfs3_export *cursor = NULL;
+    int wanted = nfs3_fh_exportid_to_index(exportid);
+    int position = 0;
+
+    /* The export id is converted to a position in the exports list;
+     * walk the list until that position is reached. */
+    list_for_each_entry(cursor, &nfs3->exports, explist)
+    {
+        if (position == wanted)
+            return cursor;
+
+        position++;
+    }
+
+    /* Ran off the end of the list: no export at that position. */
+    gf_msg(GF_NFS3, GF_LOG_ERROR, 0, NFS_MSG_INDEX_NOT_FOUND,
+           "searchindex=%d not found", wanted);
+    return NULL;
+}
+
+struct nfs3_export *
+__nfs3_get_export_by_volumeid(struct nfs3_state *nfs3, uuid_t exportid)
+{
+    struct nfs3_export *cursor = NULL;
+
+    /* Return the first export whose volume id equals the export id. */
+    list_for_each_entry(cursor, &nfs3->exports, explist)
+    {
+        if (gf_uuid_compare(exportid, cursor->volumeid) == 0)
+            return cursor;
+    }
+
+    /* No export carries this volume id. */
+    return NULL;
+}
+
+struct nfs3_export *
+__nfs3_get_export_by_exportid(struct nfs3_state *nfs3, uuid_t exportid)
+{
+    if (nfs3 == NULL)
+        return NULL;
+
+    /* How an export id is interpreted depends on gf_nfs_dvm_off(): when
+     * it is true the id is resolved as a list index, otherwise it is
+     * matched against the exports' volume ids.
+     */
+    if (gf_nfs_dvm_off(nfs_state(nfs3->nfsx)))
+        return __nfs3_get_export_by_index(nfs3, exportid);
+
+    return __nfs3_get_export_by_volumeid(nfs3, exportid);
+}
+
+int
+nfs3_export_access(struct nfs3_state *nfs3, uuid_t exportid)
+{
+    struct nfs3_export *entry = NULL;
+    /* Read-only is reported whenever the export cannot be determined. */
+    int mode = GF_NFS3_VOLACCESS_RO;
+
+    GF_VALIDATE_OR_GOTO(GF_NFS3, nfs3, done);
+
+    /* A known export reports its own access setting. */
+    entry = __nfs3_get_export_by_exportid(nfs3, exportid);
+    if (entry != NULL) {
+        mode = entry->access;
+    } else {
+        gf_msg(GF_NFS3, GF_LOG_ERROR, 0, NFS_MSG_EXPORT_ID_FAIL,
+               "Failed to get export by ID");
+    }
+
+done:
+    return mode;
+}
+
+#define nfs3_check_rw_volaccess(nfs3state, exid, status, label)                \
+    do {                                                                       \
+        if (nfs3_export_access(nfs3state, exid) != GF_NFS3_VOLACCESS_RW) {     \
+            gf_msg(GF_NFS3, GF_LOG_ERROR, EACCES, NFS_MSG_NO_RW_ACCESS,        \
+                   "No read-write access");                                    \
+            status = NFS3ERR_ROFS;                                             \
+            goto label;                                                        \
+        }                                                                      \
+    } while (0)
+
+xlator_t *
+nfs3_fh_to_xlator(struct nfs3_state *nfs3, struct nfs3_fh *fh)
+{
+    struct nfs3_export *entry = NULL;
+    xlator_t *subvol = NULL;
+
+    /* Either argument missing means no volume can be named. */
+    GF_VALIDATE_OR_GOTO(GF_NFS3, nfs3, done);
+    GF_VALIDATE_OR_GOTO(GF_NFS3, fh, done);
+
+    /* The export id stored in the handle selects the export. */
+    entry = __nfs3_get_export_by_exportid(nfs3, fh->exportid);
+    if (entry != NULL)
+        subvol = entry->subvol;
+done:
+    return subvol;
+}
+
+int
+nfs3_is_root_looked_up(struct nfs3_state *nfs3, struct nfs3_fh *rootfh)
+{
+    struct nfs3_export *entry = NULL;
+    int looked_up = 0;
+
+    /* Invalid arguments and unknown exports both report "not looked up". */
+    GF_VALIDATE_OR_GOTO(GF_NFS3, nfs3, done);
+    GF_VALIDATE_OR_GOTO(GF_NFS3, rootfh, done);
+
+    entry = __nfs3_get_export_by_exportid(nfs3, rootfh->exportid);
+    if (entry != NULL)
+        looked_up = entry->rootlookedup;
+
+done:
+    return looked_up;
+}
+
+int
+nfs3_set_root_looked_up(struct nfs3_state *nfs3, struct nfs3_fh *rootfh)
+{
+    struct nfs3_export *entry = NULL;
+
+    GF_VALIDATE_OR_GOTO(GF_NFS3, nfs3, done);
+    GF_VALIDATE_OR_GOTO(GF_NFS3, rootfh, done);
+
+    /* An unknown export leaves nothing to mark. */
+    entry = __nfs3_get_export_by_exportid(nfs3, rootfh->exportid);
+    if (entry != NULL)
+        entry->rootlookedup = 1;
+
+done:
+    /* Callers get 0 on every path, including the validation failures. */
+    return 0;
+}
+
+#define nfs3_map_fh_to_volume(nfs3state, handle, req, volume, status, label)   \
+    do {                                                                       \
+        char exportid[256], gfid[256];                                         \
+        rpc_transport_t *trans = NULL;                                         \
+        volume = nfs3_fh_to_xlator((nfs3state), handle);                       \
+        if (!volume) {                                                         \
+            gf_uuid_unparse(handle->exportid, exportid);                       \
+            gf_uuid_unparse(handle->gfid, gfid);                               \
+            trans = rpcsvc_request_transport(req);                             \
+            GF_LOG_OCCASIONALLY(nfs3state->occ_logger, GF_NFS3, GF_LOG_ERROR,  \
+                                "Failed to map "                               \
+                                "FH to vol: client=%s, exportid=%s, "          \
+                                "gfid=%s",                                     \
+                                trans->peerinfo.identifier, exportid, gfid);   \
+            GF_LOG_OCCASIONALLY(nfs3state->occ_logger, GF_NFS3, GF_LOG_ERROR,  \
+                                "Stale nfs "                                   \
+                                "client %s must be trying to connect to"       \
+                                " a deleted volume, please unmount it.",       \
+                                trans->peerinfo.identifier);                   \
+            status = NFS3ERR_STALE;                                            \
+            goto label;                                                        \
+        } else {                                                               \
+            gf_msg_trace(GF_NFS3, 0,                                           \
+                         "FH to Volume:"                                       \
+                         "%s",                                                 \
+                         volume->name);                                        \
+            rpcsvc_request_set_private(req, volume);                           \
+        }                                                                      \
+    } while (0);
+
+#define nfs3_validate_gluster_fh(handle, status, errlabel)                     \
+    do {                                                                       \
+        if (!nfs3_fh_validate(handle)) {                                       \
+            gf_msg(GF_NFS3, GF_LOG_ERROR, 0, NFS_MSG_BAD_HANDLE,               \
+                   "Bad Handle");                                              \
+            status = NFS3ERR_BADHANDLE;                                        \
+            goto errlabel;                                                     \
+        }                                                                      \
+    } while (0)
+
+#define nfs3_check_fh_auth_status(cst, nfstat, is_write_op, erlabl)            \
+    do {                                                                       \
+        int auth_ret = 0;                                                      \
+        int auth_errno = 0;                                                    \
+        xlator_t *xlatorp = NULL;                                              \
+        char buf[256], gfid[GF_UUID_BUF_SIZE];                                 \
+        rpc_transport_t *trans = NULL;                                         \
+                                                                               \
+        auth_ret = auth_errno = nfs3_fh_auth_nfsop(cst, is_write_op);          \
+        if (auth_ret < 0) {                                                    \
+            trans = rpcsvc_request_transport(cst->req);                        \
+            xlatorp = nfs3_fh_to_xlator(cst->nfs3state, &cst->resolvefh);      \
+            gf_uuid_unparse(cst->resolvefh.gfid, gfid);                        \
+            /* Bounded: peer and volume names may not fit in buf. */           \
+            snprintf(buf, sizeof(buf), "(%s) %s : %s",                         \
+                     trans->peerinfo.identifier,                               \
+                     xlatorp ? xlatorp->name : "ERR", gfid);                   \
+            gf_msg(GF_NFS3, GF_LOG_ERROR, 0, NFS_MSG_RESOLVE_FH_FAIL,          \
+                   "Unable to resolve FH: %s", buf);                           \
+            nfstat = nfs3_errno_to_nfsstat3(-auth_errno);                      \
+            goto erlabl;                                                       \
+        }                                                                      \
+    } while (0)
+
+#define nfs3_check_fh_resolve_status(cst, nfstat, erlabl)                      \
+    do {                                                                       \
+        xlator_t *xlatorp = NULL;                                              \
+        char buf[256], gfid[GF_UUID_BUF_SIZE];                                 \
+        rpc_transport_t *trans = NULL;                                         \
+        if ((cst)->resolve_ret < 0) {                                          \
+            trans = rpcsvc_request_transport(cst->req);                        \
+            xlatorp = nfs3_fh_to_xlator(cst->nfs3state, &cst->resolvefh);      \
+            gf_uuid_unparse(cst->resolvefh.gfid, gfid);                        \
+            snprintf(buf, sizeof(buf), "(%s) %s : %s",                         \
+                     trans->peerinfo.identifier,                               \
+                     xlatorp ? xlatorp->name : "ERR", gfid);                   \
+            gf_msg(GF_NFS3, GF_LOG_ERROR, 0, NFS_MSG_RESOLVE_STAT, "%s: %s",   \
+                   strerror(cst->resolve_errno), buf);                         \
+            nfstat = nfs3_errno_to_nfsstat3(cst->resolve_errno);               \
+            goto erlabl;                                                       \
+        }                                                                      \
+    } while (0)
+
+#define nfs3_check_new_fh_resolve_status(cst, nfstat, erlabl)                  \
+    do { /* Fail on any resolve error except ENOENT. */                        \
+        xlator_t *xlatorp = NULL;                                              \
+        char buf[256], gfid[GF_UUID_BUF_SIZE];                                 \
+        rpc_transport_t *trans = NULL;                                         \
+        if (((cst)->resolve_ret < 0) && ((cst)->resolve_errno != ENOENT)) {    \
+            trans = rpcsvc_request_transport(cst->req);                        \
+            xlatorp = nfs3_fh_to_xlator(cst->nfs3state, &cst->resolvefh);      \
+            gf_uuid_unparse(cst->resolvefh.gfid, gfid);                        \
+            snprintf(buf, sizeof(buf), "(%s) %s : %s",                         \
+                     trans->peerinfo.identifier,                               \
+                     xlatorp ? xlatorp->name : "ERR", gfid);                   \
+            gf_msg(GF_NFS3, GF_LOG_ERROR, 0, NFS_MSG_RESOLVE_STAT, "%s: %s",   \
+                   strerror(cst->resolve_errno), buf);                         \
+            nfstat = nfs3_errno_to_nfsstat3(cst->resolve_errno);               \
+            goto erlabl;                                                       \
+        }                                                                      \
+    } while (0)
+
+int
+__nfs3_get_volume_id(struct nfs3_state *nfs3, xlator_t *xl, uuid_t volumeid)
+{
+    struct nfs3_export *cursor = NULL;
+    int found = -1;
+
+    GF_VALIDATE_OR_GOTO(GF_NFS3, nfs3, done);
+    GF_VALIDATE_OR_GOTO(GF_NFS3, xl, done);
+
+    /* Copy out the volume id of the first export backed by xl. */
+    list_for_each_entry(cursor, &nfs3->exports, explist)
+    {
+        if (cursor->subvol != xl)
+            continue;
+        gf_uuid_copy(volumeid, cursor->volumeid);
+        found = 0;
+        break;
+    }
+done:
+    return found;
+}
+
+static int
+nfs3_funge_webnfs_zerolen_fh(rpcsvc_request_t *req, struct nfs3_state *nfs3st,
+                             struct nfs3_fh *fhd, char *name)
+{
+    xlator_t *fungexl = NULL;
+    struct nfs_state *nfs = NULL;
+    glfs_t *fs = NULL;
+    loc_t loc = {
+        0,
+    };
+    int ret = -1;
+    char *subdir = NULL;
+    char volname[NAME_MAX] = {
+        0,
+    };
+
+    fungexl = nfs_mntpath_to_xlator(nfs3st->exportslist, name);
+    if (!fungexl) {
+        gf_msg_trace(GF_NFS3, 0, "failed to find xlator for volume");
+        ret = -ENOENT;
+        goto out;
+    }
+    /* fungexl is valid, set for nfs3_request_xlator_deviceid() */
+    rpcsvc_request_set_private(req, fungexl);
+
+    /* Permission checks are done through mnt3_parse_dir_exports(). The
+     * "nfs.export-dir" option gets checked as well. */
+    nfs = nfs_state(nfs3st->nfsx);
+    ret = mnt3_parse_dir_exports(req, nfs->mstate, name, _gf_false);
+    if (ret) {
+        gf_msg_trace(GF_NFS3, -ret, "mounting not possible");
+        goto out;
+    }
+
+    /* glfs_resolve_at copied from UDP MNT support */
+    fs = glfs_new_from_ctx(fungexl->ctx);
+    if (!fs) {
+        gf_msg_trace(GF_NFS3, 0, "failed to create glfs instance");
+        ret = -ENOENT;
+        goto out;
+    }
+
+    /* split name "volname/sub/dir/s"; volname itself is not read afterwards */
+    subdir = mnt3_get_volume_subdir(name, (char **)&volname);
+
+    ret = glfs_resolve_at(fs, fungexl, NULL, subdir, &loc, NULL, 1, 0);
+    if (ret != 0) {
+        gf_msg_trace(GF_NFS3, 0, "failed to resolve %s", subdir);
+        ret = -ENOENT;
+        goto out;
+    }
+
+    /* resolved subdir, copy gfid for the fh */
+    gf_uuid_copy(fhd->gfid, loc.gfid);
+    loc_wipe(&loc);
+
+    if (gf_nfs_dvm_off(nfs_state(nfs3st->nfsx)))
+        fhd->exportid[15] = nfs_xlator_to_xlid(nfs3st->exportslist, fungexl);
+    else {
+        if (__nfs3_get_volume_id(nfs3st, fungexl, fhd->exportid) < 0) {
+            ret = -ESTALE;
+            goto out;
+        }
+    }
+
+    ret = 0;
+out:
+    if (fs) /* only set once glfs_new_from_ctx() succeeded */
+        glfs_free_from_ctx(fs);
+
+    return ret;
+}
+
+/*
+ * nfs3_volume_started_check: check whether the volume is started.
+ * If it is not, the macro logs the fact, closes the client connection,
+ * sets rtval to RPCSVC_ACTOR_IGNORE and jumps to erlbl.
+ *
+ * NOTE: the expansion uses a variable named 'req' (req->trans) which is not
+ * a macro parameter; it has to be in scope wherever the macro is used.
+ *
+ * Why do we do this?
+ *
+ * There is a "race condition" where gNFSd may start listening for RPC
+ * requests before the volume has been started. Presumably, that is why this
+ * macro exists in the first place. The NFS kernel client (specifically
+ * Linux's) establishes a TCP connection to our endpoint and (re-)sends
+ * requests. If we ignore a request and send nothing back, the client waits
+ * forever for our response. If the TCP connection dies and is
+ * re-established, the requests are retransmitted and everything begins
+ * working as expected.
+ *
+ * This is clearly bad behavior on the client side, but to make every
+ * user's life easier gNFSd simply disconnects the TCP connection when it
+ * sees requests before it is ready to accept them.
+ *
+ */
+
+#define nfs3_volume_started_check(nf3stt, vlm, rtval, erlbl)                   \
+    do {                                                                       \
+        if ((!nfs_subvolume_started(nfs_state(nf3stt->nfsx), vlm))) {          \
+            gf_msg(GF_NFS3, GF_LOG_ERROR, 0, NFS_MSG_VOL_DISABLE,              \
+                   "Volume is disabled: %s", vlm->name);                       \
+            nfs3_disconnect_transport(req->trans);                             \
+            rtval = RPCSVC_ACTOR_IGNORE;                                       \
+            goto erlbl;                                                        \
+        }                                                                      \
+    } while (0)
+
+/* Disconnect the given client transport and log the outcome.
+ *
+ * A NULL transport is rejected by the validation macro and nothing else
+ * happens. Both the success and the failure message are logged at WARNING
+ * level and name the peer.
+ */
+void
+nfs3_disconnect_transport(rpc_transport_t *transport)
+{
+    GF_VALIDATE_OR_GOTO(GF_NFS3, transport, out);
+
+    if (rpc_transport_disconnect(transport, _gf_false) == 0) {
+        gf_log(GF_NFS3, GF_LOG_WARNING, "Closed client connection to %s.",
+               transport->peerinfo.identifier);
+    } else {
+        gf_log(GF_NFS3, GF_LOG_WARNING,
+               "Unable to close client connection to %s.",
+               transport->peerinfo.identifier);
+    }
+
+out:
+    return;
+}
+
+/* Return the trusted_sync setting of the export identified by exportid.
+ *
+ * Yields 0 when nfs3 is NULL or when the export lookup returns NULL.
+ */
+int
+nfs3_export_sync_trusted(struct nfs3_state *nfs3, uuid_t exportid)
+{
+    int trusted = 0;
+    struct nfs3_export *found = NULL;
+
+    GF_VALIDATE_OR_GOTO(GF_NFS3, nfs3, err);
+
+    found = __nfs3_get_export_by_exportid(nfs3, exportid);
+    if (found)
+        trusted = found->trusted_sync;
+
+err:
+    return trusted;
+}
+
+/* Return the trusted_write setting of the export identified by exportid.
+ *
+ * Yields 0 when nfs3 is NULL or when the export lookup returns NULL.
+ */
+int
+nfs3_export_write_trusted(struct nfs3_state *nfs3, uuid_t exportid)
+{
+    int trusted = 0;
+    struct nfs3_export *found = NULL;
+
+    GF_VALIDATE_OR_GOTO(GF_NFS3, nfs3, err);
+
+    found = __nfs3_get_export_by_exportid(nfs3, exportid);
+    if (found)
+        trusted = found->trusted_write;
+
+err:
+    return trusted;
+}
+
+/* Tell whether a request carries a zero-length file handle, which this
+ * server treats as a WebNFS request.
+ *
+ * Returns 1 only when fh is non-NULL, nfs3_fh_validate(fh) reports 0 and
+ * fhlen is 0; returns 0 in every other case.
+ */
+int
+nfs3_solaris_zerolen_fh(struct nfs3_fh *fh, int fhlen)
+{
+    if (!fh || nfs3_fh_validate(fh) || (fhlen != 0))
+        return 0;
+
+    gf_msg_trace(GF_NFS3, 0, "received WebNFS request");
+    return 1;
+}
+
+/* Function pointer type for the routines that serialize an NFS3 message
+ * structure into XDR format.
+ *
+ * outmsg describes the destination buffer and args points to the structure
+ * to encode. Callers in this file treat a return value of -1 as an encoding
+ * failure and any other value as the encoded length.
+ * For usage, see the nfs3svc_XXX_cbk functions.
+ */
+typedef ssize_t (*nfs3_serializer)(struct iovec outmsg, void *args);
+
+/* Release everything held by a call state and return it to its memory pool.
+ *
+ * Registered as the release callback in nfs3_call_state_init() via
+ * GF_REF_INIT, so it is not meant to be called directly; use
+ * nfs3_call_state_wipe() to drop a reference instead.
+ */
+static void
+__nfs3_call_state_wipe(nfs3_call_state_t *cs)
+{
+    if (cs->fd) {
+        gf_msg_trace(GF_NFS3, 0, "fd 0x%lx ref: %" PRId64, (long)cs->fd,
+                     GF_ATOMIC_GET(cs->fd->refcount));
+        fd_unref(cs->fd);
+    }
+
+    GF_FREE(cs->resolventry);
+
+    GF_FREE(cs->pathname);
+
+    if (!list_empty(&cs->entries.list))
+        gf_dirent_free(&cs->entries);
+
+    nfs_loc_wipe(&cs->oploc);
+    nfs_loc_wipe(&cs->resolvedloc);
+    if (cs->iob)
+        iobuf_unref(cs->iob);
+    if (cs->iobref)
+        iobref_unref(cs->iobref);
+    if (cs->trans)
+        rpc_transport_unref(cs->trans);
+    /* Clear the structure before handing it back to the pool. */
+    memset(cs, 0, sizeof(*cs));
+    mem_put(cs);
+    /* Already refd by fd_lookup, so no need to ref again. */
+}
+
+/* Allocate and initialise a call state for the request req.
+ *
+ * The state comes from s->localpool, is zeroed, gets its reference counter
+ * set up with __nfs3_call_state_wipe as the release function, and starts
+ * with operrno set to EINVAL.
+ *
+ * Returns the new call state, or NULL when s or req is NULL or the pool
+ * allocation fails.
+ */
+nfs3_call_state_t *
+nfs3_call_state_init(struct nfs3_state *s, rpcsvc_request_t *req, xlator_t *v)
+{
+    nfs3_call_state_t *state = NULL;
+
+    GF_VALIDATE_OR_GOTO(GF_NFS3, s, err);
+    GF_VALIDATE_OR_GOTO(GF_NFS3, req, err);
+    /* v is deliberately not validated: NLM sets this later. */
+
+    state = (nfs3_call_state_t *)mem_get(s->localpool);
+    if (!state) {
+        gf_msg(GF_NFS3, GF_LOG_ERROR, ENOMEM, NFS_MSG_NO_MEMORY,
+               "out of memory");
+        goto err;
+    }
+
+    memset(state, 0, sizeof(*state));
+    GF_REF_INIT(state, __nfs3_call_state_wipe);
+    INIT_LIST_HEAD(&state->openwait_q);
+    INIT_LIST_HEAD(&state->entries.list);
+    state->nfs3state = s;
+    state->nfsx = s->nfsx;
+    state->vol = v;
+    state->req = req;
+    state->operrno = EINVAL;
+
+err:
+    return state;
+}
+
+/* Drop one reference on the call state cs.
+ *
+ * A NULL cs is only reported with a warning that names the calling
+ * function.
+ */
+void
+nfs3_call_state_wipe(nfs3_call_state_t *cs)
+{
+    if (cs) {
+        GF_REF_PUT(cs);
+        return;
+    }
+
+    gf_log_callingfn("nfs", GF_LOG_WARNING, "nfs calling state NULL");
+}
+
+/* Create a call state with nfs3_call_state_init() and store it in calls.
+ * On failure the macro logs an error, sets opstat to NFS3ERR_SERVERFAULT
+ * and jumps to errlabel.
+ */
+#define nfs3_handle_call_state_init(nfs3state, calls, rq, vl, opstat,          \
+                                    errlabel)                                  \
+    do {                                                                       \
+        calls = nfs3_call_state_init((nfs3state), (rq), (vl));                 \
+        if (!calls) {                                                          \
+            gf_msg(GF_NFS3, GF_LOG_ERROR, 0, NFS_MSG_INIT_CALL_STAT_FAIL,      \
+                   "Failed to"                                                 \
+                   " init call state");                                        \
+            opstat = NFS3ERR_SERVERFAULT;                                      \
+            goto errlabel;                                                     \
+        }                                                                      \
+    } while (0)
+
+/* Serialize the reply structure arg into a newly acquired iobuf.
+ *
+ * req    - request being answered; its program-private data supplies the
+ *          NFSv3 state and therefore the iobuf pool.
+ * arg    - reply structure handed to sfunc.
+ * sfunc  - serializer that encodes arg into the buffer described by outmsg.
+ * outmsg - set to describe the iobuf; on success its iov_len is the encoded
+ *          length.
+ *
+ * Returns the iobuf holding the encoded reply (the caller owns the
+ * reference), or NULL when the NFSv3 state is missing, no iobuf could be
+ * obtained or encoding failed.
+ */
+struct iobuf *
+nfs3_serialize_reply(rpcsvc_request_t *req, void *arg, nfs3_serializer sfunc,
+                     struct iovec *outmsg)
+{
+    struct nfs3_state *nfs3 = NULL;
+    struct iobuf *iob = NULL;
+    ssize_t retlen = -1;
+
+    nfs3 = (struct nfs3_state *)rpcsvc_request_program_private(req);
+    if (!nfs3) {
+        gf_msg(GF_NFS3, GF_LOG_ERROR, EINVAL, NFS_MSG_STATE_MISSING,
+               "NFSv3 state not found in RPC request");
+        goto ret;
+    }
+
+    /* First, get the io buffer into which the reply in arg will
+     * be serialized.
+     */
+    /* TODO: get rid of 'sfunc' and use 'xdrproc_t' so we
+       can have 'xdr_sizeof' */
+    iob = iobuf_get(nfs3->iobpool);
+    if (!iob) {
+        gf_msg(GF_NFS3, GF_LOG_ERROR, ENOMEM, NFS_MSG_NO_MEMORY,
+               "Failed to get iobuf");
+        goto ret;
+    }
+
+    iobuf_to_iovec(iob, outmsg);
+    /* Use the given serializer to translate the given C structure in arg
+     * to XDR format which will be written into the buffer in outmsg.
+     */
+    /* retlen is used to receive the error since size_t is unsigned and we
+     * need -1 for error notification during encoding.
+     */
+    retlen = sfunc(*outmsg, arg);
+    if (retlen == -1) {
+        gf_msg(GF_NFS3, GF_LOG_ERROR, 0, NFS_MSG_ENCODE_FAIL,
+               "Failed to encode message");
+        goto ret;
+    }
+
+    outmsg->iov_len = retlen;
+ret:
+    /* iob is still NULL when the state lookup or iobuf_get() failed, so
+     * only drop the reference when a buffer was actually obtained.
+     */
+    if ((retlen == -1) && (NULL != iob)) {
+        iobuf_unref(iob);
+        iob = NULL;
+    }
+
+    return iob;
+}
+
+/* Generic reply function for NFSv3 specific replies.
+ *
+ * Serializes arg with sfunc, wraps the resulting iobuf in a new iobref and
+ * submits it for req. The local references on the iobuf and iobref are
+ * dropped before returning, whatever the outcome.
+ *
+ * Returns 0 on success, -1 when req is NULL or serialization, iobref
+ * allocation or submission fails, and the iobref_add() result when adding
+ * the buffer fails.
+ */
+int
+nfs3svc_submit_reply(rpcsvc_request_t *req, void *arg, nfs3_serializer sfunc)
+{
+    struct iobref *refs = NULL;
+    struct iobuf *replybuf = NULL;
+    struct iovec xdrmsg = {
+        0,
+    };
+    int rc = -1;
+
+    if (!req)
+        return -1;
+
+    replybuf = nfs3_serialize_reply(req, arg, sfunc, &xdrmsg);
+    if (!replybuf) {
+        gf_msg(GF_NFS3, GF_LOG_ERROR, 0, NFS_MSG_SERIALIZE_REPLY_FAIL,
+               "Failed to serialize reply");
+        goto done;
+    }
+
+    refs = iobref_new();
+    if (!refs) {
+        gf_msg(GF_NFS3, GF_LOG_ERROR, ENOMEM, NFS_MSG_NO_MEMORY,
+               "failed on iobref_new()");
+        goto done;
+    }
+
+    rc = iobref_add(refs, replybuf);
+    if (rc) {
+        gf_msg(GF_NFS3, GF_LOG_ERROR, ENOMEM, NFS_MSG_NO_MEMORY,
+               "Failed to add iob to iobref");
+        goto done;
+    }
+
+    /* Hand the encoded message over for transmission. */
+    rc = rpcsvc_submit_message(req, &xdrmsg, 1, NULL, 0, refs);
+    if (rc == -1) {
+        gf_msg(GF_NFS3, GF_LOG_ERROR, 0, NFS_MSG_SUBMIT_REPLY_FAIL,
+               "Reply submission failed");
+        goto done;
+    }
+
+    rc = 0;
+done:
+    /* The message has been handed to the RPC layer, which is expected to
+     * hold its own reference on the buffer, so ours can go.
+     */
+    if (replybuf)
+        iobuf_unref(replybuf);
+    if (refs)
+        iobref_unref(refs);
+    return rc;
+}
+
+/* Submit an NFSv3 reply that carries a payload vector in addition to the
+ * serialized header.
+ *
+ * arg is serialized with sfunc; payload/vcount describe the extra data.
+ * When iobref is NULL a temporary one is created and released before
+ * returning; a caller-supplied iobref is used as is and left for the
+ * caller to release.
+ *
+ * Returns 0 on success, -1 when req is NULL or serialization, iobref
+ * allocation or submission fails, and the iobref_add() result when adding
+ * the header buffer fails.
+ */
+int
+nfs3svc_submit_vector_reply(rpcsvc_request_t *req, void *arg,
+                            nfs3_serializer sfunc, struct iovec *payload,
+                            int vcount, struct iobref *iobref)
+{
+    struct iobuf *hdrbuf = NULL;
+    struct iovec hdrmsg = {
+        0,
+    };
+    int own_iobref = 0;
+    int rc = -1;
+
+    if (!req)
+        return -1;
+
+    hdrbuf = nfs3_serialize_reply(req, arg, sfunc, &hdrmsg);
+    if (!hdrbuf) {
+        gf_msg(GF_NFS3, GF_LOG_ERROR, 0, NFS_MSG_SERIALIZE_REPLY_FAIL,
+               "Failed to serialize reply");
+        goto done;
+    }
+
+    if (!iobref) {
+        iobref = iobref_new();
+        if (!iobref) {
+            gf_msg(GF_NFS3, GF_LOG_ERROR, ENOMEM, NFS_MSG_NO_MEMORY,
+                   "failed on iobref_new");
+            goto done;
+        }
+        own_iobref = 1;
+    }
+
+    rc = iobref_add(iobref, hdrbuf);
+    if (rc) {
+        gf_msg(GF_NFS3, GF_LOG_ERROR, ENOMEM, NFS_MSG_NO_MEMORY,
+               "Failed to add iob to iobref");
+        goto done;
+    }
+
+    /* Hand header and payload over for transmission. */
+    rc = rpcsvc_submit_message(req, &hdrmsg, 1, payload, vcount, iobref);
+    if (rc == -1) {
+        gf_msg(GF_NFS3, GF_LOG_ERROR, 0, NFS_MSG_SUBMIT_REPLY_FAIL,
+               "Reply submission failed");
+        goto done;
+    }
+
+    rc = 0;
+done:
+    /* The message has been handed to the RPC layer, which is expected to
+     * hold its own reference on the buffer, so ours can go.
+     */
+    if (hdrbuf)
+        iobuf_unref(hdrbuf);
+    /* Only release the iobref if it was created here. */
+    if (own_iobref)
+        iobref_unref(iobref);
+    return rc;
+}
+
+/* Compute the device id reported for the volume a request operates on.
+ *
+ * The volume xlator is taken from the request's private data. When
+ * gf_nfs_dvm_off() holds for the NFS state, the id is the xlator's index in
+ * the exports list; otherwise it is built from bytes 8..15 of the volume
+ * id. Returns 0 when rq is NULL.
+ */
+uint64_t
+nfs3_request_xlator_deviceid(rpcsvc_request_t *rq)
+{
+    uuid_t volumeid = {
+        0,
+    };
+    uint64_t devid = 0;
+    xlator_t *xl = NULL;
+    struct nfs3_state *nfs3 = NULL;
+
+    if (!rq)
+        return 0;
+
+    xl = rpcsvc_request_private(rq);
+    nfs3 = rpcsvc_request_program_private(rq);
+
+    if (!nfs3 || !(gf_nfs_dvm_off(nfs_state(nfs3->nfsx)))) {
+        /* A failed lookup leaves volumeid zeroed, giving a devid of 0. */
+        __nfs3_get_volume_id(nfs3, xl, volumeid);
+        memcpy(&devid, &volumeid[8], sizeof(devid));
+        return devid;
+    }
+
+    devid = (uint64_t)nfs_xlator_to_xlid(nfs3->exportslist, xl);
+    return devid;
+}
+
+/* NFSv3 NULL procedure: answer with an empty message.
+ *
+ * Returns RPCSVC_ACTOR_ERROR when req is NULL, RPCSVC_ACTOR_SUCCESS
+ * otherwise; the result of the submission itself is not checked.
+ */
+int
+nfs3svc_null(rpcsvc_request_t *req)
+{
+    struct iovec empty = {
+        0,
+    };
+
+    if (req) {
+        rpcsvc_submit_generic(req, &empty, 1, NULL, 0, NULL);
+        return RPCSVC_ACTOR_SUCCESS;
+    }
+
+    return RPCSVC_ACTOR_ERROR;
+}
+
+/* Build and submit a GETATTR reply.
+ *
+ * req    - request being answered.
+ * status - NFSv3 status to report.
+ * buf    - attributes to return; callers in this file pass NULL on some
+ *          error paths.
+ *
+ * Always returns 0; the result of the submission is not propagated.
+ */
+int
+nfs3_getattr_reply(rpcsvc_request_t *req, nfsstat3 status, struct iatt *buf)
+{
+    /* Zero-initialised like the setattr and lookup reply structures, so
+     * that no indeterminate stack contents can reach the serializer.
+     */
+    getattr3res res = {
+        0,
+    };
+    uint64_t deviceid = 0;
+
+    deviceid = nfs3_request_xlator_deviceid(req);
+    nfs3_fill_getattr3res(&res, status, buf, deviceid);
+    nfs3svc_submit_reply(req, &res, (nfs3_serializer)xdr_serialize_getattr3res);
+
+    return 0;
+}
+
+/* Completion callback for the lookup issued by nfs3_getattr_resume().
+ *
+ * Translates the lookup result into an NFSv3 status, sends the GETATTR
+ * reply and drops the call state reference. Always returns 0.
+ */
+int32_t
+nfs3svc_getattr_lookup_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+                           int32_t op_ret, int32_t op_errno, inode_t *inode,
+                           struct iatt *buf, dict_t *xattr,
+                           struct iatt *postparent)
+{
+    nfs3_call_state_t *cs = frame->local;
+    nfsstat3 status = NFS3_OK;
+
+    /*
+     * Somewhat counter-intuitively, we don't need to look for sh-failed
+     * here. Failing this getattr will generate a new lookup from the
+     * client, and nfs_fop_lookup_cbk will detect any self-heal failures.
+     */
+
+    if (op_ret != -1)
+        nfs_fix_generation(this, inode);
+    else
+        status = nfs3_cbk_errno_status(op_ret, op_errno);
+
+    nfs3_log_common_res(rpcsvc_request_xid(cs->req), NFS3_GETATTR, status,
+                        op_errno, cs->resolvedloc.path);
+
+    nfs3_getattr_reply(cs->req, status, buf);
+    nfs3_call_state_wipe(cs);
+
+    return 0;
+}
+
+/* Completion callback for the stat issued by nfs3_getattr_resume().
+ *
+ * Translates the stat result into an NFSv3 status, sends the GETATTR reply
+ * and drops the call state reference. Always returns 0.
+ */
+int32_t
+nfs3svc_getattr_stat_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+                         int32_t op_ret, int32_t op_errno, struct iatt *buf,
+                         dict_t *xdata)
+{
+    nfs3_call_state_t *cs = frame->local;
+    nfsstat3 status = NFS3_OK;
+
+    if (op_ret == -1) {
+        /* A failed call may leave buf NULL while op_errno is still 0;
+         * report EIO in that case to prevent crashes.
+         */
+        if ((op_errno == 0) && !buf)
+            op_errno = EIO;
+
+        status = nfs3_cbk_errno_status(op_ret, op_errno);
+    }
+
+    nfs3_log_common_res(rpcsvc_request_xid(cs->req), NFS3_GETATTR, status,
+                        op_errno, cs->resolvedloc.path);
+
+    nfs3_getattr_reply(cs->req, status, buf);
+    nfs3_call_state_wipe(cs);
+
+    return 0;
+}
+
+/* Resume handler for GETATTR, passed to nfs3_fh_resolve_and_resume() by
+ * nfs3_getattr().
+ *
+ * carg is the nfs3_call_state_t of the request. Depending on the state it
+ * either replies immediately (auth/resolve failure, or a hard-resolved
+ * handle), issues a lookup (inode generation differs from the current one)
+ * or issues a plain stat.
+ *
+ * Returns -EFAULT when carg is NULL. Otherwise returns 0 once a reply has
+ * been sent from here, or the non-negative result of the issued fop, in
+ * which case the reply is sent from the fop's callback.
+ */
+int
+nfs3_getattr_resume(void *carg)
+{
+    nfsstat3 stat = NFS3ERR_SERVERFAULT;
+    int ret = -EFAULT;
+    nfs_user_t nfu = {
+        0,
+    };
+    nfs3_call_state_t *cs = NULL;
+    uint64_t raw_ctx = 0;
+    struct nfs_inode_ctx *ictx = NULL;
+    struct nfs_state *priv = NULL;
+
+    if (!carg)
+        return ret;
+
+    cs = (nfs3_call_state_t *)carg;
+    /* Both checks take the nfs3err label and may jump there with stat set. */
+    nfs3_check_fh_auth_status(cs, stat, _gf_false, nfs3err);
+    nfs3_check_fh_resolve_status(cs, stat, nfs3err);
+    nfs_request_user_init(&nfu, cs->req);
+    /* If inode which is to be getattr'd is the root, we need to do a
+     * lookup instead because after a server reboot, it is not necessary
+     * for the root to have been looked up when the getattr on the root is
+     * sent. AND, this causes a problem for stat-prefetch in that it
+     * expects even the root inode to have been looked up.
+
+    if (__is_root_gfid (cs->resolvedloc.inode->gfid))
+            ret = nfs_lookup (cs->nfsx, cs->vol, &nfu, &cs->resolvedloc,
+                              nfs3svc_getattr_lookup_cbk, cs);
+    else
+            ret = nfs_stat (cs->nfsx, cs->vol, &nfu, &cs->resolvedloc,
+    */
+
+    if (cs->hardresolved) {
+        /* Not an error: the negative ret only forces the reply branch at
+         * nfs3err, which then answers NFS3_OK with cs->stbuf.
+         */
+        ret = -EFAULT;
+        stat = NFS3_OK;
+        goto nfs3err;
+    }
+
+    /*
+     * If brick state changed, we need to force a proper lookup cycle (as
+     * would happen in native protocol) to do self-heal checks. We detect
+     * this by comparing the generation number for the last successful
+     * creation/lookup on the inode to the current number, so inodes that
+     * haven't been validated since the state change are affected.
+     */
+    if (inode_ctx_get(cs->resolvedloc.inode, cs->nfsx, &raw_ctx) == 0) {
+        ictx = (struct nfs_inode_ctx *)(uintptr_t)raw_ctx;
+        priv = cs->nfsx->private;
+        if (ictx->generation != priv->generation) {
+            ret = nfs_lookup(cs->nfsx, cs->vol, &nfu, &cs->resolvedloc,
+                             nfs3svc_getattr_lookup_cbk, cs);
+            goto check_err;
+        }
+    }
+
+    ret = nfs_stat(cs->nfsx, cs->vol, &nfu, &cs->resolvedloc,
+                   nfs3svc_getattr_stat_cbk, cs);
+
+check_err:
+    if (ret < 0) {
+        gf_msg(GF_NFS3, GF_LOG_ERROR, -ret, NFS_MSG_STAT_FOP_FAIL,
+               "Stat fop failed: %s: %s", cs->oploc.path, strerror(-ret));
+        stat = nfs3_errno_to_nfsstat3(-ret);
+    }
+
+nfs3err:
+    /* Reply from here only when no fop callback will do it. */
+    if (ret < 0) {
+        nfs3_log_common_res(rpcsvc_request_xid(cs->req), NFS3_GETATTR, stat,
+                            -ret, cs->resolvedloc.path);
+        nfs3_getattr_reply(cs->req, stat, &cs->stbuf);
+        nfs3_call_state_wipe(cs);
+        ret = 0;
+    }
+
+    return ret;
+}
+
+/* Handle a decoded GETATTR request for the file handle fh.
+ *
+ * Validates the handle and NFSv3 state, maps the handle to a volume,
+ * creates a call state and starts handle resolution with
+ * nfs3_getattr_resume() as the continuation.
+ *
+ * Returns -EFAULT when req or fh is NULL, RPCSVC_ACTOR_IGNORE when the
+ * volume is not started (the client connection is closed in that case),
+ * and 0 when an error reply was sent from here. Otherwise returns the
+ * non-negative result of nfs3_fh_resolve_and_resume().
+ */
+int
+nfs3_getattr(rpcsvc_request_t *req, struct nfs3_fh *fh)
+{
+    xlator_t *vol = NULL;
+    nfsstat3 stat = NFS3ERR_SERVERFAULT;
+    int ret = -EFAULT;
+    struct nfs3_state *nfs3 = NULL;
+    nfs3_call_state_t *cstate = NULL;
+
+    GF_VALIDATE_OR_GOTO(GF_NFS3, req, out);
+    GF_VALIDATE_OR_GOTO(GF_NFS3, fh, out);
+
+    nfs3_log_common_call(rpcsvc_request_xid(req), "GETATTR", fh);
+    /* Each helper macro below is given a label it may jump to on failure. */
+    nfs3_validate_gluster_fh(fh, stat, nfs3err);
+    nfs3_validate_nfs3_state(req, nfs3, stat, nfs3err, ret);
+    nfs3_map_fh_to_volume(nfs3, fh, req, vol, stat, nfs3err);
+    /* Skips the reply entirely: goes to out, not nfs3err. */
+    nfs3_volume_started_check(nfs3, vol, ret, out);
+    nfs3_handle_call_state_init(nfs3, cstate, req, vol, stat, nfs3err);
+
+    ret = nfs3_fh_resolve_and_resume(cstate, fh, NULL, nfs3_getattr_resume);
+    if (ret < 0)
+        stat = nfs3_errno_to_nfsstat3(-ret);
+
+nfs3err:
+    if (ret < 0) {
+        nfs3_log_common_res(rpcsvc_request_xid(req), NFS3_GETATTR, stat, -ret,
+                            NULL);
+        nfs3_getattr_reply(req, stat, NULL);
+        ret = 0;
+        /* cstate is still NULL if the failure happened before it was
+         * created; nfs3_call_state_wipe() only logs a warning then.
+         */
+        nfs3_call_state_wipe(cstate);
+    }
+out:
+    return ret;
+}
+
+/* RPC actor for the NFSv3 GETATTR procedure.
+ *
+ * Decodes the arguments from the first message vector of req and hands the
+ * file handle to nfs3_getattr().
+ *
+ * Returns RPCSVC_ACTOR_ERROR when req is NULL, decoding fails (RPC error
+ * GARBAGE_ARGS) or nfs3_getattr() fails (RPC error SYSTEM_ERR). Otherwise
+ * the result of nfs3_getattr() is returned unchanged, including
+ * RPCSVC_ACTOR_IGNORE.
+ */
+int
+nfs3svc_getattr(rpcsvc_request_t *req)
+{
+    getattr3args args;
+    struct nfs3_fh fh = {
+        {0},
+    };
+    int ret = RPCSVC_ACTOR_ERROR;
+
+    if (!req)
+        return ret;
+
+    nfs3_prep_getattr3args(&args, &fh);
+    if (xdr_to_getattr3args(req->msg[0], &args) <= 0) {
+        gf_msg(GF_NFS3, GF_LOG_ERROR, 0, NFS_MSG_ARGS_DECODE_ERROR,
+               "Error decoding args");
+        rpcsvc_request_seterr(req, GARBAGE_ARGS);
+        return ret;
+    }
+
+    ret = nfs3_getattr(req, &fh);
+    if ((ret >= 0) || (ret == RPCSVC_ACTOR_IGNORE))
+        return ret;
+
+    gf_msg(GF_NFS3, GF_LOG_ERROR, -ret, NFS_MSG_GETATTR_FAIL,
+           "GETATTR procedure failed");
+    rpcsvc_request_seterr(req, SYSTEM_ERR);
+
+    return RPCSVC_ACTOR_ERROR;
+}
+
+/* Build and submit a SETATTR reply carrying the pre- and post-operation
+ * attributes.
+ *
+ * Always returns 0; the result of the submission is not propagated.
+ */
+int
+nfs3_setattr_reply(rpcsvc_request_t *req, nfsstat3 stat, struct iatt *preop,
+                   struct iatt *postop)
+{
+    uint64_t deviceid = nfs3_request_xlator_deviceid(req);
+    setattr3res res = {
+        0,
+    };
+
+    nfs3_fill_setattr3res(&res, stat, preop, postop, deviceid);
+    nfs3svc_submit_reply(req, (void *)&res,
+                         (nfs3_serializer)xdr_serialize_setattr3res);
+
+    return 0;
+}
+
+/* Completion callback for the truncate issued from nfs3svc_setattr_cbk().
+ *
+ * Sends the SETATTR reply and drops the call state reference. On failure
+ * the reply carries no pre-operation attributes (prestat stays NULL).
+ * Always returns 0.
+ */
+int32_t
+nfs3svc_truncate_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+                     int32_t op_ret, int32_t op_errno, struct iatt *prebuf,
+                     struct iatt *postbuf, dict_t *xdata)
+{
+    nfsstat3 stat = NFS3ERR_SERVERFAULT;
+    struct iatt *prestat = NULL;
+    nfs3_call_state_t *cs = NULL;
+
+    cs = frame->local;
+    if (op_ret == -1) {
+        stat = nfs3_cbk_errno_status(op_ret, op_errno);
+        goto nfs3err;
+    }
+
+    /* If the first stat was got from the guarded setattr callback, or
+     * from an earlier setattr call then we'll need to use that stat
+     * instead of the preop returned here.
+     */
+    /* A non-zero ia_ino marks cs->preparent as having been filled in. */
+    if (cs->preparent.ia_ino != 0)
+        prestat = &cs->preparent;
+    else
+        prestat = prebuf;
+
+    stat = NFS3_OK;
+nfs3err:
+    nfs3_log_common_res(rpcsvc_request_xid(cs->req), NFS3_SETATTR, stat,
+                        op_errno, cs->resolvedloc.path);
+    nfs3_setattr_reply(cs->req, stat, prestat, postbuf);
+    nfs3_call_state_wipe(cs);
+
+    return 0;
+}
+
+/* Completion callback for the setattr fop of a SETATTR request.
+ *
+ * When a size change was requested on a non-directory and the
+ * pre-operation size differs from the requested one, a truncate is issued
+ * and the reply is left to nfs3svc_truncate_cbk(). In all other cases,
+ * including failures, the SETATTR reply is sent from here and the call
+ * state reference is dropped. Always returns 0.
+ */
+int32_t
+nfs3svc_setattr_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+                    int32_t op_ret, int32_t op_errno, struct iatt *preop,
+                    struct iatt *postop, dict_t *xdata)
+{
+    nfsstat3 stat = NFS3ERR_SERVERFAULT;
+    /* Stays negative unless a truncate is successfully issued, so the
+     * reply branch at nfs3err runs by default.
+     */
+    int ret = -1;
+    struct iatt *prebuf = NULL;
+    nfs_user_t nfu = {
+        0,
+    };
+    nfs3_call_state_t *cs = NULL;
+
+    cs = frame->local;
+    if (op_ret == -1) {
+        stat = nfs3_cbk_errno_status(op_ret, op_errno);
+        goto nfs3err;
+    }
+
+    prebuf = preop;
+    /* Store the current preop in case we need to send a truncate,
+     * in which case the preop to be returned will be this one.
+     */
+    cs->preparent = *preop;
+
+    /* Only truncate if the size is not already same as the requested
+     * truncation and also only if this is not a directory.
+     */
+    if ((gf_attr_size_set(cs->setattr_valid)) && (!IA_ISDIR(postop->ia_type)) &&
+        (preop->ia_size != cs->attr_in.ia_size)) {
+        nfs_request_user_init(&nfu, cs->req);
+        ret = nfs_truncate(cs->nfsx, cs->vol, &nfu, &cs->resolvedloc,
+                           cs->attr_in.ia_size, nfs3svc_truncate_cbk, cs);
+
+        if (ret < 0)
+            stat = nfs3_errno_to_nfsstat3(-ret);
+    } else {
+        ret = -1; /* Force a reply in the branch below. */
+        stat = NFS3_OK;
+    }
+
+nfs3err:
+    if (ret < 0) {
+        nfs3_log_common_res(rpcsvc_request_xid(cs->req), NFS3_SETATTR, stat,
+                            op_errno, cs->resolvedloc.path);
+        nfs3_setattr_reply(cs->req, stat, prebuf, postop);
+        nfs3_call_state_wipe(cs);
+    }
+
+    return 0;
+}
+
+/* Stat callback implementing the guard check of a guarded SETATTR.
+ *
+ * Compares the ctime in buf with the timestamp saved in the call state.
+ * On a match the setattr fop is issued with nfs3svc_setattr_cbk() as its
+ * callback; on a mismatch NFS3ERR_NOT_SYNC is returned to the client.
+ * Any failure is answered from here and the call state reference dropped.
+ * Always returns 0.
+ */
+int32_t
+nfs3svc_setattr_stat_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+                         int32_t op_ret, int32_t op_errno, struct iatt *buf,
+                         dict_t *xdata)
+{
+    /* Negative until nfs_setattr() is issued, so early exits reply below. */
+    int ret = -EFAULT;
+    nfsstat3 stat = NFS3ERR_SERVERFAULT;
+    nfs_user_t nfu = {
+        0,
+    };
+    nfs3_call_state_t *cs = NULL;
+
+    cs = frame->local;
+    if (op_ret == -1) {
+        stat = nfs3_cbk_errno_status(op_ret, op_errno);
+        goto nfs3err;
+    }
+
+    if (buf->ia_ctime != cs->timestamp.seconds) {
+        gf_msg(GF_NFS3, GF_LOG_ERROR, 0, NFS_MSG_TIMESTAMP_NO_SYNC,
+               "Timestamps not in sync");
+        stat = NFS3ERR_NOT_SYNC;
+        goto nfs3err;
+    }
+
+    /* Not a clean way but no motivation to add a new member to local. */
+    cs->preparent = *buf;
+    nfs_request_user_init(&nfu, cs->req);
+    /* NOTE(review): this passes cs->stbuf, whereas nfs3_setattr_resume()
+     * passes cs->attr_in as the attributes to set - confirm this is
+     * intended.
+     */
+    ret = nfs_setattr(cs->nfsx, cs->vol, &nfu, &cs->resolvedloc, &cs->stbuf,
+                      cs->setattr_valid, nfs3svc_setattr_cbk, cs);
+    if (ret < 0)
+        stat = nfs3_errno_to_nfsstat3(-ret);
+
+nfs3err:
+    if (ret < 0) {
+        nfs3_log_common_res(rpcsvc_request_xid(cs->req), NFS3_SETATTR, stat,
+                            op_errno, cs->resolvedloc.path);
+        nfs3_setattr_reply(cs->req, stat, NULL, NULL);
+        nfs3_call_state_wipe(cs);
+    }
+
+    return 0;
+}
+
+/* Resume handler for SETATTR, passed to nfs3_fh_resolve_and_resume() by
+ * nfs3_setattr().
+ *
+ * carg is the nfs3_call_state_t of the request. After the resolve status
+ * check, the setattr fop is issued with the attributes saved in
+ * cs->attr_in.
+ *
+ * Returns -EFAULT when carg is NULL, otherwise the value of ret at exit.
+ * Note that, unlike nfs3_getattr_resume(), ret is not reset to 0 after an
+ * error reply has been sent from here.
+ */
+int
+nfs3_setattr_resume(void *carg)
+{
+    nfsstat3 stat = NFS3ERR_SERVERFAULT;
+    int ret = -EFAULT;
+    nfs_user_t nfu = {
+        0,
+    };
+    nfs3_call_state_t *cs = NULL;
+
+    if (!carg)
+        return ret;
+
+    cs = (nfs3_call_state_t *)carg;
+    /* Takes the nfs3err label and may jump there with stat set. */
+    nfs3_check_fh_resolve_status(cs, stat, nfs3err);
+    nfs_request_user_init(&nfu, cs->req);
+    ret = nfs_setattr(cs->nfsx, cs->vol, &nfu, &cs->resolvedloc, &cs->attr_in,
+                      cs->setattr_valid, nfs3svc_setattr_cbk, cs);
+
+    if (ret < 0)
+        stat = nfs3_errno_to_nfsstat3(-ret);
+
+nfs3err:
+    if (ret < 0) {
+        nfs3_log_common_res(rpcsvc_request_xid(cs->req), NFS3_SETATTR, stat,
+                            -ret, cs->resolvedloc.path);
+        nfs3_setattr_reply(cs->req, stat, NULL, NULL);
+        nfs3_call_state_wipe(cs);
+    }
+
+    return ret;
+}
+
+/* Handle a decoded SETATTR request.
+ *
+ * fh    - handle of the object to modify.
+ * sattr - requested attribute changes; converted into cs->attr_in and the
+ *         cs->setattr_valid mask.
+ * guard - optional ctime guard; when guard->check is set the ctime is saved
+ *         in the call state and sattrguardcheck is turned on.
+ *
+ * Returns -EFAULT when any argument is NULL, RPCSVC_ACTOR_IGNORE when the
+ * volume is not started, and 0 when a reply was sent from here. Otherwise
+ * returns the non-negative result of nfs3_fh_resolve_and_resume(), with
+ * nfs3_setattr_resume() as the continuation.
+ */
+int
+nfs3_setattr(rpcsvc_request_t *req, struct nfs3_fh *fh, sattr3 *sattr,
+             sattrguard3 *guard)
+{
+    xlator_t *vol = NULL;
+    nfsstat3 stat = NFS3ERR_SERVERFAULT;
+    int ret = -EFAULT;
+    struct nfs3_state *nfs3 = NULL;
+    nfs3_call_state_t *cs = NULL;
+
+    GF_VALIDATE_OR_GOTO(GF_NFS3, req, out);
+    GF_VALIDATE_OR_GOTO(GF_NFS3, fh, out);
+    GF_VALIDATE_OR_GOTO(GF_NFS3, sattr, out);
+    GF_VALIDATE_OR_GOTO(GF_NFS3, guard, out);
+
+    nfs3_log_common_call(rpcsvc_request_xid(req), "SETATTR", fh);
+    /* Each helper macro below is given a label it may jump to on failure. */
+    nfs3_validate_gluster_fh(fh, stat, nfs3err);
+    nfs3_validate_nfs3_state(req, nfs3, stat, nfs3err, ret);
+    nfs3_map_fh_to_volume(nfs3, fh, req, vol, stat, nfs3err);
+    /* Skips the reply entirely: goes to out, not nfs3err. */
+    nfs3_volume_started_check(nfs3, vol, ret, out);
+    nfs3_check_rw_volaccess(nfs3, fh->exportid, stat, nfs3err);
+    nfs3_handle_call_state_init(nfs3, cs, req, vol, stat, nfs3err);
+
+    cs->setattr_valid = nfs3_sattr3_to_setattr_valid(sattr, &cs->attr_in, NULL);
+    if (guard->check) {
+        gf_msg_trace(GF_NFS3, 0, "Guard check required");
+        cs->timestamp = guard->sattrguard3_u.obj_ctime;
+        cs->sattrguardcheck = 1;
+    } else {
+        gf_msg_trace(GF_NFS3, 0, "Guard check not required");
+        cs->sattrguardcheck = 0;
+    }
+
+    /* Nothing to change: the client is answered with NFS3_OK right away,
+     * although the event is logged at ERROR level.
+     */
+    if (!cs->setattr_valid) {
+        ret = -EINVAL; /* Force a reply */
+        stat = NFS3_OK;
+        gf_msg(GF_NFS3, GF_LOG_ERROR, EINVAL, NFS_MSG_SETATTR_INVALID,
+               "cs->setattr_valid is invalid");
+        goto nfs3err;
+    }
+
+    ret = nfs3_fh_resolve_and_resume(cs, fh, NULL, nfs3_setattr_resume);
+    if (ret < 0)
+        stat = nfs3_errno_to_nfsstat3(-ret);
+
+nfs3err:
+    if (ret < 0) {
+        nfs3_log_common_res(rpcsvc_request_xid(req), NFS3_SETATTR, stat, -ret,
+                            cs ? cs->resolvedloc.path : NULL);
+        nfs3_setattr_reply(req, stat, NULL, NULL);
+        /* cs is still NULL if the failure happened before it was created;
+         * nfs3_call_state_wipe() only logs a warning then.
+         */
+        nfs3_call_state_wipe(cs);
+        /* Ret must be 0 after this so that the caller does not
+         * also send an RPC reply.
+         */
+        ret = 0;
+    }
+out:
+    return ret;
+}
+
+/* RPC actor for the NFSv3 SETATTR procedure.
+ *
+ * Decodes the arguments from the first message vector of req and passes
+ * the file handle, new attributes and guard to nfs3_setattr().
+ *
+ * Returns RPCSVC_ACTOR_ERROR when req is NULL, decoding fails (RPC error
+ * GARBAGE_ARGS) or nfs3_setattr() fails (RPC error SYSTEM_ERR). Otherwise
+ * the result of nfs3_setattr() is returned unchanged, including
+ * RPCSVC_ACTOR_IGNORE.
+ */
+int
+nfs3svc_setattr(rpcsvc_request_t *req)
+{
+    setattr3args args;
+    struct nfs3_fh fh = {
+        {0},
+    };
+    int ret = RPCSVC_ACTOR_ERROR;
+
+    GF_VALIDATE_OR_GOTO(GF_NFS3, req, rpcerr);
+
+    nfs3_prep_setattr3args(&args, &fh);
+    if (xdr_to_setattr3args(req->msg[0], &args) <= 0) {
+        gf_msg(GF_NFS3, GF_LOG_ERROR, 0, NFS_MSG_ARGS_DECODE_ERROR,
+               "Error decoding args");
+        rpcsvc_request_seterr(req, GARBAGE_ARGS);
+        goto rpcerr;
+    }
+
+    ret = nfs3_setattr(req, &fh, &args.new_attributes, &args.guard);
+    if ((ret >= 0) || (ret == RPCSVC_ACTOR_IGNORE))
+        goto rpcerr;
+
+    gf_msg(GF_NFS3, GF_LOG_ERROR, -ret, NFS_MSG_SETATTR_FAIL,
+           "SETATTR procedure failed");
+    rpcsvc_request_seterr(req, SYSTEM_ERR);
+    ret = RPCSVC_ACTOR_ERROR;
+
+rpcerr:
+    return ret;
+}
+
+/* Build and submit a LOOKUP reply carrying the new file handle, the
+ * object's attributes and the parent's post-operation attributes.
+ *
+ * Returns the result of nfs3svc_submit_reply().
+ */
+int
+nfs3_lookup_reply(rpcsvc_request_t *req, nfsstat3 stat, struct nfs3_fh *newfh,
+                  struct iatt *stbuf, struct iatt *postparent)
+{
+    uint64_t deviceid = nfs3_request_xlator_deviceid(req);
+    lookup3res res = {
+        0,
+    };
+    int submitted = 0;
+
+    nfs3_fill_lookup3res(&res, stat, newfh, stbuf, postparent, deviceid);
+    submitted = nfs3svc_submit_reply(req, &res,
+                                     (nfs3_serializer)xdr_serialize_lookup3res);
+
+    return submitted;
+}
+
+/* Forward declaration: nfs3_fresh_lookup() passes this function as the
+ * resume callback.
+ */
+int
+nfs3_lookup_resume(void *carg);
+
+/* Restart a LOOKUP from scratch on the same call state.
+ *
+ * Unlinks the previously resolved inode from its parent, wipes the
+ * resolved loc, marks the lookup as GF_NFS3_FRESH and resolves
+ * cs->resolvefh / cs->resolventry again with nfs3_lookup_resume() as the
+ * continuation.
+ *
+ * Returns -EFAULT when cs is NULL, otherwise the result of
+ * nfs3_fh_resolve_and_resume().
+ */
+int
+nfs3_fresh_lookup(nfs3_call_state_t *cs)
+{
+    int ret = -EFAULT;
+    char *oldresolventry = NULL;
+
+    GF_VALIDATE_OR_GOTO(GF_NFS3, cs, err);
+    gf_msg_debug(GF_NFS3, 0, "inode needs fresh lookup");
+    inode_unlink(cs->resolvedloc.inode, cs->resolvedloc.parent,
+                 cs->resolventry);
+    nfs_loc_wipe(&cs->resolvedloc);
+
+    /* Store pointer to currently allocated resolventry because it gets over
+     * written in fh_resolve_and_resume.
+     */
+    oldresolventry = cs->resolventry;
+    cs->lookuptype = GF_NFS3_FRESH;
+    ret = nfs3_fh_resolve_and_resume(cs, &cs->resolvefh, cs->resolventry,
+                                     nfs3_lookup_resume);
+    /* Allocated in the previous call to fh_resolve_and_resume using the
+     * same call_state.
+     */
+    /* Freed only after the call above, which still reads the old string. */
+    GF_FREE(oldresolventry);
+err:
+    return ret;
+}
+
+/* Completion callback for the lookup fop of a LOOKUP request.
+ *
+ * On success the child file handle is built from the parent handle and
+ * buf, and the inode is linked under the resolved parent. On failure of a
+ * revalidate lookup a fresh lookup is started instead of replying. In all
+ * other cases the LOOKUP reply is sent and the call state reference is
+ * dropped. Always returns 0.
+ */
+int
+nfs3svc_lookup_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+                   int32_t op_ret, int32_t op_errno, inode_t *inode,
+                   struct iatt *buf, dict_t *xattr, struct iatt *postparent)
+{
+    struct nfs3_fh newfh = {
+        {0},
+    };
+    nfsstat3 status = NFS3_OK;
+    nfs3_call_state_t *cs = NULL;
+    inode_t *oldinode = NULL;
+
+    cs = frame->local;
+    if (op_ret == -1) {
+        status = nfs3_cbk_errno_status(op_ret, op_errno);
+        goto xmit_res;
+    }
+
+    nfs3_fh_build_child_fh(&cs->parent, buf, &newfh);
+    oldinode = inode_link(inode, cs->resolvedloc.parent, cs->resolvedloc.name,
+                          buf);
+xmit_res:
+    /* Only send fresh lookup if it was a revalidate that failed. */
+    if ((op_ret == -1) && (nfs3_is_revalidate_lookup(cs))) {
+        /* NOTE(review): the result is stored in op_ret but not examined
+         * afterwards; no reply is sent from here on this path.
+         */
+        op_ret = nfs3_fresh_lookup(cs);
+        goto out;
+    }
+
+    nfs3_log_newfh_res(rpcsvc_request_xid(cs->req), NFS3_LOOKUP, status,
+                       op_errno, &newfh, cs->resolvedloc.path);
+    nfs3_lookup_reply(cs->req, status, &newfh, buf, postparent);
+    nfs3_call_state_wipe(cs);
+out:
+    /* oldinode is only set on the success path, by inode_link() above. */
+    if (oldinode) {
+        inode_lookup(oldinode);
+        inode_unref(oldinode);
+    }
+    return 0;
+}
+
+/* Completion callback for a LOOKUP of a parent directory.
+ *
+ * Builds the file handle returned to the client: a regular parent handle
+ * derived from cs->fh when buf does not describe the root directory,
+ * otherwise a root handle (indexed or uuid based, depending on
+ * gf_nfs_dvm_off()). Sends the LOOKUP reply and drops the call state
+ * reference. Always returns 0.
+ */
+int
+nfs3svc_lookup_parentdir_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+                             int32_t op_ret, int32_t op_errno, inode_t *inode,
+                             struct iatt *buf, dict_t *xattr,
+                             struct iatt *postparent)
+{
+    struct nfs3_fh newfh = {
+        {0},
+    };
+    nfsstat3 status = NFS3_OK;
+    nfs3_call_state_t *cs = NULL;
+    uuid_t volumeid = {
+        0,
+    };
+    /* Only the first byte is set to 1; the rest of the id is zero. */
+    uuid_t mountid = {
+        1,
+    };
+    struct nfs3_state *nfs3 = NULL;
+
+    cs = frame->local;
+    if (op_ret == -1) {
+        status = nfs3_cbk_errno_status(op_ret, op_errno);
+        goto xmit_res;
+    }
+
+    nfs3 = cs->nfs3state;
+    /* If the buf inode shows that this is a root dir's buf, then the file
+     * handle needs to be specially crafted, in all other cases, we'll just
+     * create the handle normally using the buffer of the parent dir.
+     */
+    if (buf->ia_ino != 1) {
+        nfs3_fh_build_parent_fh(&cs->fh, buf, &newfh);
+        goto xmit_res;
+    }
+
+    if (gf_nfs_dvm_off(nfs_state(nfs3->nfsx)))
+        newfh = nfs3_fh_build_indexed_root_fh(nfs3->exportslist, cs->vol);
+    else {
+        /* The return value is not checked; volumeid stays all-zero if the
+         * lookup fails.
+         */
+        __nfs3_get_volume_id(nfs3, cs->vol, volumeid);
+        newfh = nfs3_fh_build_uuid_root_fh(volumeid, mountid);
+    }
+
+xmit_res:
+    nfs3_log_newfh_res(rpcsvc_request_xid(cs->req), NFS3_LOOKUP, status,
+                       op_errno, &newfh, cs->resolvedloc.path);
+    nfs3_lookup_reply(cs->req, status, &newfh, buf, postparent);
+    nfs3_call_state_wipe(cs);
+
+    return 0;
+}
+
+/*
+ * Resume handler that looks up the parent directory of the directory whose
+ * file handle the client sent.
+ *
+ * carg is the nfs3_call_state_t of the request. On any failure (ret < 0)
+ * an error LOOKUP reply is sent and the call state is wiped here.
+ *
+ * Returns -EINVAL when carg is NULL, otherwise the (negative errno or
+ * nfs_lookup) value held in ret.
+ */
+int
+nfs3_lookup_parentdir_resume(void *carg)
+{
+    nfsstat3 stat = NFS3ERR_SERVERFAULT;
+    int ret = -EFAULT;
+    nfs_user_t nfu = {
+        0,
+    };
+    nfs3_call_state_t *cs = NULL;
+    inode_t *parent = NULL;
+
+    if (!carg) {
+        gf_msg(GF_NFS3, GF_LOG_ERROR, EINVAL, NFS_MSG_INVALID_ENTRY,
+               "Invalid argument, carg value NULL");
+        /* Errors are negative errno values throughout this file; a
+         * positive EINVAL would pass every "ret < 0" check as success.
+         */
+        return -EINVAL;
+    }
+
+    cs = (nfs3_call_state_t *)carg;
+    /* Both checks jump to nfs3err (with ret still -EFAULT) on failure. */
+    nfs3_check_fh_auth_status(cs, stat, _gf_false, nfs3err);
+    nfs3_check_fh_resolve_status(cs, stat, nfs3err);
+
+    /* At this point now, the loc in cs is for the directory file handle
+     * sent by the client. This loc needs to be transformed into a loc that
+     * represents the parent dir of cs->resolvedloc.inode.
+     *
+     * EXCEPT in the case where the .. is a parent of the root directory.
+     * In this case we'll be returning the file handle and attributes of the
+     * root itself.
+     */
+    nfs_request_user_init(&nfu, cs->req);
+
+    /* Save the file handle from the LOOKUP request. We'll use this to
+     * build the file handle of the parent directory in case the parent is
+     * not root dir.
+     */
+    cs->fh = cs->resolvefh;
+
+    /* If fh is that of the root, the resolvedloc will already contain
+     * the loc for root. After that, we'll send lookup for the root dir
+     * itself since we cannot send the lookup on the parent of root.
+     *
+     * For all other cases, we'll send the lookup on the parent of the
+     * given directory file handle.
+     */
+    if (!nfs3_fh_is_root_fh(&cs->fh)) {
+        /* Take our own ref on the parent before the loc is wiped; it is
+         * dropped at the end of this function.
+         */
+        parent = inode_ref(cs->resolvedloc.parent);
+        nfs_loc_wipe(&cs->resolvedloc);
+        ret = nfs_inode_loc_fill(parent, &cs->resolvedloc, NFS_RESOLVE_CREATE);
+
+        if (ret < 0) {
+            gf_msg(GF_NFS3, GF_LOG_ERROR, -ret, NFS_MSG_INODE_LOC_FILL_ERROR,
+                   "nfs_inode_loc_fill"
+                   " error");
+            goto errtostat;
+        }
+    }
+
+    ret = nfs_lookup(cs->nfsx, cs->vol, &nfu, &cs->resolvedloc,
+                     nfs3svc_lookup_parentdir_cbk, cs);
+errtostat:
+    if (ret < 0)
+        stat = nfs3_errno_to_nfsstat3(-ret);
+
+nfs3err:
+    if (ret < 0) {
+        nfs3_log_common_res(rpcsvc_request_xid(cs->req), NFS3_LOOKUP, stat,
+                            -ret, cs->resolvedloc.path);
+        nfs3_lookup_reply(cs->req, stat, NULL, NULL, NULL);
+        nfs3_call_state_wipe(cs);
+    }
+
+    if (parent)
+        inode_unref(parent);
+
+    return ret;
+}
+
+/*
+ * Resume handler for LOOKUP, run once the request's file handle has been
+ * resolved.
+ *
+ * carg is the nfs3_call_state_t of the request. If the call state is marked
+ * hardresolved, the reply is built from the attributes already stored in cs
+ * and sent from here; otherwise a lookup is issued and the reply is sent by
+ * nfs3svc_lookup_cbk.
+ *
+ * Returns -EINVAL when carg is NULL, otherwise the value held in ret.
+ */
+int
+nfs3_lookup_resume(void *carg)
+{
+    nfsstat3 stat = NFS3ERR_SERVERFAULT;
+    int ret = -EFAULT;
+    nfs_user_t nfu = {
+        0,
+    };
+    nfs3_call_state_t *cs = NULL;
+    struct nfs3_fh newfh = {
+        {0},
+    };
+
+    if (!carg) {
+        gf_msg(GF_NFS3, GF_LOG_ERROR, EINVAL, NFS_MSG_INVALID_ENTRY,
+               "Invalid argument, carg value NULL");
+        /* Errors are negative errno values throughout this file; a
+         * positive EINVAL would pass every "ret < 0" check as success.
+         */
+        return -EINVAL;
+    }
+
+    cs = (nfs3_call_state_t *)carg;
+    /* Both checks jump to nfs3err (with ret still -EFAULT) on failure. */
+    nfs3_check_fh_auth_status(cs, stat, _gf_false, nfs3err);
+    nfs3_check_fh_resolve_status(cs, stat, nfs3err);
+    cs->parent = cs->resolvefh;
+
+    if (cs->hardresolved) {
+        /* ret is deliberately left at its initial negative value so that
+         * the nfs3err block below sends the (successful) reply and wipes
+         * the call state. As a consequence the result is logged with
+         * errno EFAULT even though the status is NFS3_OK.
+         */
+        stat = NFS3_OK;
+        nfs3_fh_build_child_fh(&cs->parent, &cs->stbuf, &newfh);
+        goto nfs3err;
+    }
+
+    nfs_request_user_init(&nfu, cs->req);
+    ret = nfs_lookup(cs->nfsx, cs->vol, &nfu, &cs->resolvedloc,
+                     nfs3svc_lookup_cbk, cs);
+    if (ret < 0)
+        stat = nfs3_errno_to_nfsstat3(-ret);
+
+nfs3err:
+    if (ret < 0) {
+        nfs3_log_common_res(rpcsvc_request_xid(cs->req), NFS3_LOOKUP, stat,
+                            -ret, cs->resolvedloc.path);
+        nfs3_lookup_reply(cs->req, stat, &newfh, &cs->stbuf, &cs->postparent);
+        nfs3_call_state_wipe(cs);
+    }
+
+    return ret;
+}
+
+/*
+ * NFSv3 LOOKUP: validate the request, map the file handle to a volume,
+ * initialise a call state and start file handle resolution, which continues
+ * in nfs3_lookup_resume.
+ *
+ * req   - the RPC request
+ * fh    - directory file handle from the request
+ * fhlen - length of the file handle as sent by the client
+ * name  - entry name to look up
+ *
+ * Any failure that reaches the nfs3err label with ret < 0 is answered here
+ * with an error LOOKUP reply, after which ret is reset to 0. Paths that
+ * jump to the out label return ret unchanged without replying.
+ */
+int
+nfs3_lookup(rpcsvc_request_t *req, struct nfs3_fh *fh, int fhlen, char *name)
+{
+    xlator_t *vol = NULL;
+    nfsstat3 stat = NFS3ERR_SERVERFAULT;
+    int ret = -EFAULT;
+    struct nfs3_state *nfs3 = NULL;
+    nfs3_call_state_t *cs = NULL;
+
+    GF_VALIDATE_OR_GOTO(GF_NFS3, req, out);
+    GF_VALIDATE_OR_GOTO(GF_NFS3, fh, out);
+    GF_VALIDATE_OR_GOTO(GF_NFS3, name, out);
+
+    nfs3_log_fh_entry_call(rpcsvc_request_xid(req), "LOOKUP", fh, name);
+    nfs3_validate_nfs3_state(req, nfs3, stat, nfs3err, ret);
+    if (nfs3_solaris_zerolen_fh(fh, fhlen)) {
+        ret = nfs3_funge_webnfs_zerolen_fh(req, nfs3, fh, name);
+        if (ret < 0)
+            goto nfs3err;
+
+        /* this fh means we're doing a mount, name is no more useful */
+        name = NULL;
+    } else
+        nfs3_validate_gluster_fh(fh, stat, nfs3err);
+    /* NOTE(review): name may be NULL here (zero-length fh case above);
+     * presumably the macro tolerates that - confirm.
+     */
+    nfs3_validate_strlen_or_goto(name, NFS_NAME_MAX, nfs3err, stat, ret);
+    nfs3_map_fh_to_volume(nfs3, fh, req, vol, stat, nfs3err);
+    nfs3_volume_started_check(nfs3, vol, ret, out);
+    nfs3_handle_call_state_init(nfs3, cs, req, vol, stat, nfs3err);
+
+    cs->lookuptype = GF_NFS3_REVALIDATE;
+    ret = nfs3_fh_resolve_and_resume(cs, fh, name, nfs3_lookup_resume);
+
+    if (ret < 0) {
+        gf_msg(GF_NFS, GF_LOG_ERROR, -ret, NFS_MSG_HARD_RESOLVE_FAIL,
+               "failed to start hard resolve");
+    }
+
+nfs3err:
+    if (ret < 0) {
+        /* stat is recomputed from ret here, replacing whatever value it
+         * held when control jumped to this label.
+         */
+        stat = nfs3_errno_to_nfsstat3(-ret);
+        nfs3_log_common_res(rpcsvc_request_xid(req), NFS3_LOOKUP, stat, -ret,
+                            cs ? cs->resolvedloc.path : NULL);
+        nfs3_lookup_reply(req, stat, NULL, NULL, NULL);
+        /* cs is still NULL if the failure happened before call state init. */
+        if (cs)
+            nfs3_call_state_wipe(cs);
+        /* Ret must be 0 after this so that the caller does not
+         * also send an RPC reply.
+         */
+        ret = 0;
+    }
+out:
+    return ret;
+}
+
+/*
+ * RPC actor for the NFSv3 LOOKUP procedure.
+ *
+ * Decodes the lookup3args from req->msg[0] and hands them to nfs3_lookup.
+ * Returns RPCSVC_ACTOR_ERROR when req is NULL, when decoding fails, or when
+ * nfs3_lookup returns a negative value other than RPCSVC_ACTOR_IGNORE;
+ * otherwise returns nfs3_lookup's result.
+ */
+int
+nfs3svc_lookup(rpcsvc_request_t *req)
+{
+    char name[NFS_PATH_MAX];
+    struct nfs3_fh fh = {
+        {0},
+    };
+    lookup3args args;
+    int ret = RPCSVC_ACTOR_ERROR;
+
+    GF_VALIDATE_OR_GOTO(GF_NFS, req, rpcerr);
+
+    /* Point args at the local fh and name buffers before decoding. */
+    nfs3_prep_lookup3args(&args, &fh, name);
+    if (xdr_to_lookup3args(req->msg[0], &args) <= 0) {
+        gf_msg(GF_NFS3, GF_LOG_ERROR, 0, NFS_MSG_ARGS_DECODE_ERROR,
+               "Error decoding args");
+        rpcsvc_request_seterr(req, GARBAGE_ARGS);
+        goto rpcerr;
+    }
+
+    ret = nfs3_lookup(req, &fh, args.what.dir.data.data_len, name);
+    if ((ret < 0) && (ret != RPCSVC_ACTOR_IGNORE)) {
+        gf_msg(GF_NFS3, GF_LOG_ERROR, -ret, NFS_MSG_LOOKUP_PROC_FAIL,
+               "LOOKUP procedure failed");
+        rpcsvc_request_seterr(req, SYSTEM_ERR);
+        ret = RPCSVC_ACTOR_ERROR;
+    }
+
+rpcerr:
+    return ret;
+}
+
+/*
+ * Fill an access3res from the given status and access bits and submit it as
+ * the reply to req. Always returns 0.
+ */
+int
+nfs3_access_reply(rpcsvc_request_t *req, nfsstat3 status, int32_t accbits,
+                  int32_t reqaccbits)
+{
+    access3res reply;
+    nfs3_serializer encode = (nfs3_serializer)xdr_serialize_access3res;
+
+    nfs3_fill_access3res(&reply, status, accbits, reqaccbits);
+    nfs3svc_submit_reply(req, &reply, encode);
+
+    return 0;
+}
+
+/*
+ * Callback for the access call issued by nfs3_access_resume: logs the
+ * result, sends the ACCESS reply and wipes the call state.
+ * Always returns 0.
+ */
+int32_t
+nfs3svc_access_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+                   int32_t op_ret, int32_t op_errno, dict_t *xdata)
+{
+    nfs3_call_state_t *cs = frame->local;
+    nfsstat3 status = NFS3_OK;
+
+    if (op_ret == -1)
+        status = nfs3_cbk_errno_status(op_ret, op_errno);
+
+    nfs3_log_common_res(rpcsvc_request_xid(cs->req), NFS3_ACCESS, status,
+                        op_errno, cs->resolvedloc.path);
+    /* NOTE(review): op_errno is passed as the granted access bits -
+     * presumably the access fop returns them through op_errno; confirm.
+     */
+    nfs3_access_reply(cs->req, status, op_errno, cs->accessbits);
+    nfs3_call_state_wipe(cs);
+
+    return 0;
+}
+
+/*
+ * Resume handler for ACCESS, run once the request's file handle has been
+ * resolved.
+ *
+ * carg is the nfs3_call_state_t of the request. On failure an error ACCESS
+ * reply is sent from here, the call state is wiped and 0 is returned.
+ *
+ * Returns -EINVAL when carg is NULL, otherwise the value held in ret.
+ */
+int
+nfs3_access_resume(void *carg)
+{
+    nfsstat3 stat = NFS3ERR_SERVERFAULT;
+    int ret = -EFAULT;
+    nfs_user_t nfu = {
+        0,
+    };
+    nfs3_call_state_t *cs = NULL;
+
+    if (!carg) {
+        gf_msg(GF_NFS3, GF_LOG_ERROR, EINVAL, NFS_MSG_INVALID_ENTRY,
+               "Invalid argument, carg value NULL");
+        /* Errors are negative errno values throughout this file; a
+         * positive EINVAL would pass every "ret < 0" check as success.
+         */
+        return -EINVAL;
+    }
+
+    cs = (nfs3_call_state_t *)carg;
+
+    /* Additional checks on the NFS file handle
+     * go here. The path for an NFS ACCESS call
+     * goes like this:
+     * nfs3_access -> nfs3_fh_resolve_and_resume -> nfs3_resolve_resume ->
+     * nfs3_access_resume -> <macro/function performs check on FH> ->
+     * <continue or return from function based on check.> ('goto nfs3err'
+     * terminates this function and writes the appropriate response to the
+     * client). It is important that you do NOT stick any sort of check
+     * on the file handle outside of the nfs3_##OP_resume functions.
+     */
+    nfs3_check_fh_auth_status(cs, stat, _gf_false, nfs3err);
+    nfs3_check_fh_resolve_status(cs, stat, nfs3err);
+    cs->fh = cs->resolvefh;
+    nfs_request_user_init(&nfu, cs->req);
+    ret = nfs_access(cs->nfsx, cs->vol, &nfu, &cs->resolvedloc, cs->accessbits,
+                     nfs3svc_access_cbk, cs);
+    if (ret < 0)
+        stat = nfs3_errno_to_nfsstat3(-ret);
+
+nfs3err:
+    if (ret < 0) {
+        nfs3_log_common_res(rpcsvc_request_xid(cs->req), NFS3_ACCESS, stat,
+                            -ret, cs->resolvedloc.path);
+        nfs3_access_reply(cs->req, stat, 0, 0);
+        nfs3_call_state_wipe(cs);
+        /* The error has been reported to the client already. */
+        ret = 0;
+    }
+
+    return ret;
+}
+
+/*
+ * NFSv3 ACCESS: validate the request, map the file handle to a volume,
+ * initialise a call state carrying the requested access bits and start
+ * file handle resolution, which continues in nfs3_access_resume.
+ *
+ * req     - the RPC request
+ * fh      - file handle from the request
+ * accbits - access bits the client asked about
+ *
+ * Failures that reach nfs3err with ret < 0 are answered here with an error
+ * ACCESS reply and ret is reset to 0. Paths that jump to the out label
+ * return ret unchanged without replying.
+ */
+int
+nfs3_access(rpcsvc_request_t *req, struct nfs3_fh *fh, uint32_t accbits)
+{
+    xlator_t *vol = NULL;
+    struct nfs3_state *nfs3 = NULL;
+    nfsstat3 stat = NFS3ERR_SERVERFAULT;
+    int ret = -EFAULT;
+    nfs3_call_state_t *cs = NULL;
+
+    GF_VALIDATE_OR_GOTO(GF_NFS, req, out);
+    GF_VALIDATE_OR_GOTO(GF_NFS, fh, out);
+    nfs3_log_common_call(rpcsvc_request_xid(req), "ACCESS", fh);
+    nfs3_validate_gluster_fh(fh, stat, nfs3err);
+    nfs3_validate_nfs3_state(req, nfs3, stat, nfs3err, ret);
+    nfs3_map_fh_to_volume(nfs3, fh, req, vol, stat, nfs3err);
+    nfs3_volume_started_check(nfs3, vol, ret, out);
+    nfs3_handle_call_state_init(nfs3, cs, req, vol, stat, nfs3err);
+    cs->accessbits = accbits;
+
+    ret = nfs3_fh_resolve_and_resume(cs, fh, NULL, nfs3_access_resume);
+    if (ret < 0)
+        stat = nfs3_errno_to_nfsstat3(-ret);
+
+nfs3err:
+    if (ret < 0) {
+        nfs3_log_common_res(rpcsvc_request_xid(req), NFS3_ACCESS, stat, -ret,
+                            cs ? cs->resolvedloc.path : NULL);
+        nfs3_access_reply(req, stat, 0, 0);
+        /* NOTE(review): cs can still be NULL here; presumably
+         * nfs3_call_state_wipe accepts NULL - confirm.
+         */
+        nfs3_call_state_wipe(cs);
+        /* The error reply has been sent; report success to the caller. */
+        ret = 0;
+    }
+out:
+    return ret;
+}
+
+/*
+ * RPC actor for the NFSv3 ACCESS procedure.
+ *
+ * Decodes the access3args from req->msg[0] and hands them to nfs3_access.
+ * Returns RPCSVC_ACTOR_ERROR when req is NULL, when decoding fails, or when
+ * nfs3_access returns a negative value other than RPCSVC_ACTOR_IGNORE;
+ * otherwise returns nfs3_access's result.
+ */
+int
+nfs3svc_access(rpcsvc_request_t *req)
+{
+    access3args args;
+    struct nfs3_fh fh = {
+        {0},
+    };
+    int rc = 0;
+
+    if (req == NULL)
+        return RPCSVC_ACTOR_ERROR;
+
+    nfs3_prep_access3args(&args, &fh);
+    if (xdr_to_access3args(req->msg[0], &args) <= 0) {
+        gf_msg(GF_NFS3, GF_LOG_ERROR, 0, NFS_MSG_ARGS_DECODE_ERROR,
+               "Error decoding args");
+        rpcsvc_request_seterr(req, GARBAGE_ARGS);
+        return RPCSVC_ACTOR_ERROR;
+    }
+
+    rc = nfs3_access(req, &fh, args.access);
+    if ((rc >= 0) || (rc == RPCSVC_ACTOR_IGNORE))
+        return rc;
+
+    gf_msg(GF_NFS3, GF_LOG_ERROR, -rc, NFS_MSG_ACCESS_PROC_FAIL,
+           "ACCESS procedure failed");
+    rpcsvc_request_seterr(req, SYSTEM_ERR);
+
+    return RPCSVC_ACTOR_ERROR;
+}
+
+/*
+ * Fill a readlink3res from the given status, link target and attributes
+ * and submit it as the reply to req. Always returns 0.
+ */
+int
+nfs3_readlink_reply(rpcsvc_request_t *req, nfsstat3 stat, char *path,
+                    struct iatt *buf)
+{
+    readlink3res reply = {
+        0,
+    };
+    uint64_t devid = nfs3_request_xlator_deviceid(req);
+
+    nfs3_fill_readlink3res(&reply, stat, path, buf, devid);
+    nfs3svc_submit_reply(req, (void *)&reply,
+                         (nfs3_serializer)xdr_serialize_readlink3res);
+
+    return 0;
+}
+
+/*
+ * Callback for the readlink issued by nfs3_readlink_resume: logs the
+ * result, sends the READLINK reply and wipes the call state.
+ * Always returns 0.
+ */
+int32_t
+nfs3svc_readlink_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+                     int32_t op_ret, int32_t op_errno, const char *path,
+                     struct iatt *buf, dict_t *xdata)
+{
+    nfs3_call_state_t *cs = frame->local;
+    nfsstat3 stat = NFS3_OK;
+
+    if (op_ret == -1)
+        stat = nfs3_cbk_errno_status(op_ret, op_errno);
+
+    /* The helpers take a non-const path, hence the casts. */
+    nfs3_log_readlink_res(rpcsvc_request_xid(cs->req), stat, op_errno,
+                          (char *)path, cs->resolvedloc.path);
+    nfs3_readlink_reply(cs->req, stat, (char *)path, buf);
+    nfs3_call_state_wipe(cs);
+
+    return 0;
+}
+
+/*
+ * Resume handler for READLINK, run once the request's file handle has been
+ * resolved.
+ *
+ * carg is the nfs3_call_state_t of the request. Returns -EFAULT when carg
+ * is NULL. On any other failure (ret < 0) an error READLINK reply is sent
+ * from here and the call state is wiped; ret is returned as is.
+ */
+int
+nfs3_readlink_resume(void *carg)
+{
+    nfsstat3 stat = NFS3ERR_SERVERFAULT;
+    int ret = -EFAULT;
+    nfs3_call_state_t *cs = NULL;
+    nfs_user_t nfu = {
+        0,
+    };
+
+    if (!carg)
+        return ret;
+
+    cs = (nfs3_call_state_t *)carg;
+    /* Both checks jump to nfs3err (with ret still -EFAULT) on failure. */
+    nfs3_check_fh_auth_status(cs, stat, _gf_false, nfs3err);
+    nfs3_check_fh_resolve_status(cs, stat, nfs3err);
+    nfs_request_user_init(&nfu, cs->req);
+    ret = nfs_readlink(cs->nfsx, cs->vol, &nfu, &cs->resolvedloc,
+                       nfs3svc_readlink_cbk, cs);
+    if (ret < 0)
+        stat = nfs3_errno_to_nfsstat3(-ret);
+
+nfs3err:
+    if (ret < 0) {
+        nfs3_log_common_res(rpcsvc_request_xid(cs->req), NFS3_READLINK, stat,
+                            -ret, cs->resolvedloc.path);
+        nfs3_readlink_reply(cs->req, stat, NULL, NULL);
+        nfs3_call_state_wipe(cs);
+    }
+
+    return ret;
+}
+
+/*
+ * NFSv3 READLINK: validate the request, map the file handle to a volume,
+ * initialise a call state and start file handle resolution, which continues
+ * in nfs3_readlink_resume.
+ *
+ * Returns -1 when req or fh is NULL. Failures that reach nfs3err with
+ * ret < 0 are answered here with an error READLINK reply and ret is reset
+ * to 0. Paths that jump to the out label return ret unchanged without
+ * replying.
+ */
+int
+nfs3_readlink(rpcsvc_request_t *req, struct nfs3_fh *fh)
+{
+    xlator_t *vol = NULL;
+    nfsstat3 stat = NFS3ERR_SERVERFAULT;
+    int ret = -EFAULT;
+    struct nfs3_state *nfs3 = NULL;
+    nfs3_call_state_t *cs = NULL;
+
+    if ((!req) || (!fh)) {
+        gf_msg(GF_NFS3, GF_LOG_ERROR, EINVAL, NFS_MSG_INVALID_ENTRY,
+               "Bad arguments");
+        return -1;
+    }
+
+    nfs3_log_common_call(rpcsvc_request_xid(req), "READLINK", fh);
+    nfs3_validate_gluster_fh(fh, stat, nfs3err);
+    nfs3_validate_nfs3_state(req, nfs3, stat, nfs3err, ret);
+    nfs3_map_fh_to_volume(nfs3, fh, req, vol, stat, nfs3err);
+    nfs3_volume_started_check(nfs3, vol, ret, out);
+    nfs3_handle_call_state_init(nfs3, cs, req, vol, stat, nfs3err);
+
+    ret = nfs3_fh_resolve_and_resume(cs, fh, NULL, nfs3_readlink_resume);
+    if (ret < 0)
+        stat = nfs3_errno_to_nfsstat3(-ret);
+
+nfs3err:
+    if (ret < 0) {
+        nfs3_log_common_res(rpcsvc_request_xid(req), NFS3_READLINK, stat, -ret,
+                            cs ? cs->resolvedloc.path : NULL);
+        nfs3_readlink_reply(req, stat, NULL, NULL);
+        /* NOTE(review): cs can still be NULL here; presumably
+         * nfs3_call_state_wipe accepts NULL - confirm.
+         */
+        nfs3_call_state_wipe(cs);
+        /* Ret must be 0 after this so that the caller does not
+         * also send an RPC reply.
+         */
+        ret = 0;
+    }
+out:
+    return ret;
+}
+
+/*
+ * RPC actor for the NFSv3 READLINK procedure.
+ *
+ * Decodes the readlink3args from req->msg[0] and hands the file handle to
+ * nfs3_readlink. Returns RPCSVC_ACTOR_ERROR when req is NULL, when decoding
+ * fails, or when nfs3_readlink returns a negative value other than
+ * RPCSVC_ACTOR_IGNORE; otherwise returns nfs3_readlink's result.
+ */
+int
+nfs3svc_readlink(rpcsvc_request_t *req)
+{
+    readlink3args args;
+    struct nfs3_fh fh = {
+        {0},
+    };
+    int rc = 0;
+
+    if (req == NULL)
+        return RPCSVC_ACTOR_ERROR;
+
+    nfs3_prep_readlink3args(&args, &fh);
+    if (xdr_to_readlink3args(req->msg[0], &args) <= 0) {
+        gf_msg(GF_NFS3, GF_LOG_ERROR, 0, NFS_MSG_ARGS_DECODE_ERROR,
+               "Error decoding args");
+        rpcsvc_request_seterr(req, GARBAGE_ARGS);
+        return RPCSVC_ACTOR_ERROR;
+    }
+
+    rc = nfs3_readlink(req, &fh);
+    if ((rc >= 0) || (rc == RPCSVC_ACTOR_IGNORE))
+        return rc;
+
+    gf_msg(GF_NFS3, GF_LOG_ERROR, -rc, NFS_MSG_READLINK_PROC_FAIL,
+           "READLINK procedure failed");
+    rpcsvc_request_seterr(req, SYSTEM_ERR);
+
+    return RPCSVC_ACTOR_ERROR;
+}
+
+/*
+ * Fill a read3res and submit it as the reply to req.
+ *
+ * A vector reply carrying vec/iobref is used only when stat is NFS3_OK and
+ * count is non-zero; every other case sends a plain reply.
+ * Always returns 0.
+ */
+int
+nfs3_read_reply(rpcsvc_request_t *req, nfsstat3 stat, count3 count,
+                struct iovec *vec, int vcount, struct iobref *iobref,
+                struct iatt *poststat, int is_eof)
+{
+    read3res reply = {
+        0,
+    };
+    uint64_t devid = nfs3_request_xlator_deviceid(req);
+    nfs3_serializer encode = (nfs3_serializer)xdr_serialize_read3res_nocopy;
+    int have_payload = 0;
+
+    nfs3_fill_read3res(&reply, stat, count, poststat, is_eof, devid);
+
+    if (stat == NFS3_OK) {
+        xdr_vector_round_up(vec, vcount, count);
+        /* iob can be zero if the file size was zero. If so, op_ret
+         * would be 0 and count = 0.
+         */
+        have_payload = (count != 0);
+    }
+
+    if (have_payload)
+        nfs3svc_submit_vector_reply(req, (void *)&reply, encode, vec, vcount,
+                                    iobref);
+    else
+        nfs3svc_submit_reply(req, (void *)&reply, encode);
+
+    return 0;
+}
+
+/*
+ * Callback for the read issued by nfs3_read_fd_resume: logs the result,
+ * sends the READ reply (op_ret is passed as the byte count) and wipes the
+ * call state. Always returns 0.
+ */
+int32_t
+nfs3svc_read_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+                 int32_t op_ret, int32_t op_errno, struct iovec *vector,
+                 int32_t count, struct iatt *stbuf, struct iobref *iobref,
+                 dict_t *xdata)
+{
+    nfs3_call_state_t *cs = frame->local;
+    nfsstat3 stat = NFS3ERR_SERVERFAULT;
+    int is_eof = 0;
+
+    if (op_ret == -1) {
+        stat = nfs3_cbk_errno_status(op_ret, op_errno);
+    } else {
+        stat = NFS3_OK;
+        /* A successful read that carries ENOENT in op_errno marks EOF. */
+        if (op_errno == ENOENT)
+            is_eof = 1;
+    }
+
+    nfs3_log_read_res(rpcsvc_request_xid(cs->req), stat, op_errno, op_ret,
+                      is_eof, vector, count, cs->resolvedloc.path);
+    nfs3_read_reply(cs->req, stat, op_ret, vector, count, iobref, stbuf,
+                    is_eof);
+    nfs3_call_state_wipe(cs);
+
+    return 0;
+}
+
+/*
+ * Issue the actual read on cs->fd using the offset and count stored in the
+ * call state; the reply is sent by nfs3svc_read_cbk.
+ *
+ * carg is the nfs3_call_state_t of the request. Returns -EFAULT when carg
+ * is NULL. On any other failure (ret < 0) an error READ reply is sent from
+ * here and the call state is wiped; ret is returned as is.
+ */
+int
+nfs3_read_fd_resume(void *carg)
+{
+    nfsstat3 stat = NFS3ERR_SERVERFAULT;
+    int ret = -EFAULT;
+    nfs_user_t nfu = {
+        0,
+    };
+    nfs3_call_state_t *cs = NULL;
+
+    if (!carg)
+        return ret;
+
+    cs = (nfs3_call_state_t *)carg;
+    /* Jumps to nfs3err (with ret still -EFAULT) on failure. */
+    nfs3_check_fh_resolve_status(cs, stat, nfs3err);
+    nfs_request_user_init(&nfu, cs->req);
+    ret = nfs_read(cs->nfsx, cs->vol, &nfu, cs->fd, cs->datacount,
+                   cs->dataoffset, nfs3svc_read_cbk, cs);
+    if (ret < 0)
+        stat = nfs3_errno_to_nfsstat3(-ret);
+nfs3err:
+    if (ret < 0) {
+        nfs3_log_common_res(rpcsvc_request_xid(cs->req), NFS3_READ, stat, -ret,
+                            cs->resolvedloc.path);
+        nfs3_read_reply(cs->req, stat, 0, NULL, 0, NULL, NULL, 0);
+        nfs3_call_state_wipe(cs);
+    }
+
+    return ret;
+}
+
+/*
+ * Resume handler for READ, run once the request's file handle has been
+ * resolved: obtains an anonymous fd on the resolved inode, stores it in the
+ * call state and continues in nfs3_read_fd_resume.
+ *
+ * carg is the nfs3_call_state_t of the request. Returns -EFAULT when carg
+ * is NULL. If a check or the fd creation fails, an error READ reply is sent
+ * from here, the call state is wiped and the negative ret is returned;
+ * otherwise returns 0.
+ */
+int
+nfs3_read_resume(void *carg)
+{
+    nfsstat3 stat = NFS3ERR_SERVERFAULT;
+    int ret = -EFAULT;
+    nfs3_call_state_t *cs = NULL;
+    fd_t *fd = NULL;
+
+    if (!carg)
+        return ret;
+
+    cs = (nfs3_call_state_t *)carg;
+    /* Both checks jump to nfs3err (with ret still -EFAULT) on failure. */
+    nfs3_check_fh_auth_status(cs, stat, _gf_false, nfs3err);
+    nfs3_check_fh_resolve_status(cs, stat, nfs3err);
+    fd = fd_anonymous(cs->resolvedloc.inode);
+    if (!fd) {
+        gf_msg(GF_NFS3, GF_LOG_ERROR, 0, NFS_MSG_ANONYMOUS_FD_FAIL,
+               "Failed to create anonymous fd");
+        goto nfs3err;
+    }
+
+    cs->fd = fd;
+    /* The return value is intentionally not checked: on failure
+     * nfs3_read_fd_resume sends the error reply and wipes cs itself, so
+     * this function must not do so a second time.
+     */
+    nfs3_read_fd_resume(cs);
+    ret = 0;
+nfs3err:
+    if (ret < 0) {
+        nfs3_log_common_res(rpcsvc_request_xid(cs->req), NFS3_READ, stat, -ret,
+                            cs->resolvedloc.path);
+        nfs3_read_reply(cs->req, stat, 0, NULL, 0, NULL, NULL, 0);
+        nfs3_call_state_wipe(cs);
+    }
+
+    return ret;
+}
+
+/*
+ * NFSv3 READ: validate the request, map the file handle to a volume,
+ * initialise a call state holding the requested offset and count, and start
+ * file handle resolution, which continues in nfs3_read_resume.
+ *
+ * Returns -1 when req or fh is NULL. Failures that reach nfs3err with
+ * ret < 0 are answered here with an error READ reply and ret is reset to 0.
+ * Paths that jump to the out label return ret unchanged without replying.
+ */
+int
+nfs3_read(rpcsvc_request_t *req, struct nfs3_fh *fh, offset3 offset,
+          count3 count)
+{
+    xlator_t *vol = NULL;
+    nfsstat3 stat = NFS3ERR_SERVERFAULT;
+    int ret = -EFAULT;
+    struct nfs3_state *nfs3 = NULL;
+    nfs3_call_state_t *cs = NULL;
+
+    if ((!req) || (!fh)) {
+        gf_msg(GF_NFS3, GF_LOG_ERROR, EINVAL, NFS_MSG_INVALID_ENTRY,
+               "Bad arguments");
+        return -1;
+    }
+
+    nfs3_log_rw_call(rpcsvc_request_xid(req), "READ", fh, offset, count, -1);
+    nfs3_validate_gluster_fh(fh, stat, nfs3err);
+    nfs3_validate_nfs3_state(req, nfs3, stat, nfs3err, ret);
+    nfs3_map_fh_to_volume(nfs3, fh, req, vol, stat, nfs3err);
+    nfs3_volume_started_check(nfs3, vol, ret, out);
+    nfs3_handle_call_state_init(nfs3, cs, req, vol, stat, nfs3err);
+
+    cs->datacount = count;
+    cs->dataoffset = offset;
+    ret = nfs3_fh_resolve_and_resume(cs, fh, NULL, nfs3_read_resume);
+    if (ret < 0)
+        stat = nfs3_errno_to_nfsstat3(-ret);
+
+nfs3err:
+    if (ret < 0) {
+        nfs3_log_common_res(rpcsvc_request_xid(req), NFS3_READ, stat, -ret,
+                            cs ? cs->resolvedloc.path : NULL);
+        nfs3_read_reply(req, stat, 0, NULL, 0, NULL, NULL, 0);
+        /* NOTE(review): cs can still be NULL here; presumably
+         * nfs3_call_state_wipe accepts NULL - confirm.
+         */
+        nfs3_call_state_wipe(cs);
+        /* The error reply has been sent; report success to the caller. */
+        ret = 0;
+    }
+out:
+    return ret;
+}
+
+/*
+ * RPC actor for the NFSv3 READ procedure.
+ *
+ * Decodes the read3args from req->msg[0] and hands them to nfs3_read.
+ * Returns RPCSVC_ACTOR_ERROR when req is NULL, when decoding fails, or when
+ * nfs3_read returns a negative value other than RPCSVC_ACTOR_IGNORE;
+ * otherwise returns nfs3_read's result.
+ */
+int
+nfs3svc_read(rpcsvc_request_t *req)
+{
+    read3args args;
+    struct nfs3_fh fh = {
+        {0},
+    };
+    int rc = 0;
+
+    if (req == NULL)
+        return RPCSVC_ACTOR_ERROR;
+
+    nfs3_prep_read3args(&args, &fh);
+    if (xdr_to_read3args(req->msg[0], &args) <= 0) {
+        gf_msg(GF_NFS3, GF_LOG_ERROR, 0, NFS_MSG_ARGS_DECODE_ERROR,
+               "Error decoding args");
+        rpcsvc_request_seterr(req, GARBAGE_ARGS);
+        return RPCSVC_ACTOR_ERROR;
+    }
+
+    rc = nfs3_read(req, &fh, args.offset, args.count);
+    if ((rc >= 0) || (rc == RPCSVC_ACTOR_IGNORE))
+        return rc;
+
+    gf_msg(GF_NFS3, GF_LOG_ERROR, -rc, NFS_MSG_READ_FAIL,
+           "READ procedure failed");
+    rpcsvc_request_seterr(req, SYSTEM_ERR);
+
+    return RPCSVC_ACTOR_ERROR;
+}
+
+/*
+ * Fill a write3res from the given status, byte count, stability level,
+ * write verifier and pre/post attributes, and submit it as the reply to
+ * req. Always returns 0.
+ */
+int
+nfs3_write_reply(rpcsvc_request_t *req, nfsstat3 stat, count3 count,
+                 stable_how stable, uint64_t wverf, struct iatt *prestat,
+                 struct iatt *poststat)
+{
+    write3res reply = {
+        0,
+    };
+    uint64_t devid = nfs3_request_xlator_deviceid(req);
+
+    nfs3_fill_write3res(&reply, stat, count, stable, wverf, prestat, poststat,
+                        devid);
+    nfs3svc_submit_reply(req, (void *)&reply,
+                         (nfs3_serializer)xdr_serialize_write3res);
+
+    return 0;
+}
+
+/*
+ * Callback for an fsync issued as part of a WRITE: logs the result, sends
+ * the WRITE reply using the byte count and pre-op attributes saved in the
+ * call state (cs->maxcount, cs->stbuf) and wipes the call state.
+ * Always returns 0.
+ */
+int32_t
+nfs3svc_write_fsync_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+                        int32_t op_ret, int32_t op_errno, struct iatt *prebuf,
+                        struct iatt *postbuf, dict_t *xdata)
+{
+    nfs3_call_state_t *cs = frame->local;
+    struct nfs3_state *nfs3 = rpcsvc_request_program_private(cs->req);
+    nfsstat3 stat = NFS3_OK;
+
+    if (op_ret == -1)
+        stat = nfs3_cbk_errno_status(op_ret, op_errno);
+
+    /* nfs3->serverstart is what gets returned as the write verifier. */
+    nfs3_log_write_res(rpcsvc_request_xid(cs->req), stat, op_errno,
+                       cs->maxcount, cs->writetype, nfs3->serverstart,
+                       cs->resolvedloc.path);
+    nfs3_write_reply(cs->req, stat, cs->maxcount, cs->writetype,
+                     nfs3->serverstart, &cs->stbuf, postbuf);
+    nfs3_call_state_wipe(cs);
+
+    return 0;
+}
+
+/*
+ * Before going into the write reply logic, here is a matrix that shows the
+ * requirements for a write reply as given by RFC1813.
+ *
+ * Requested Write Type ||      Possible Returns
+ * ==============================================
+ * FILE_SYNC            ||      FILE_SYNC
+ * DATA_SYNC            ||      DATA_SYNC or FILE_SYNC
+ * UNSTABLE             ||      DATA_SYNC or FILE_SYNC or UNSTABLE
+ *
+ * Write types other than UNSTABLE are together called STABLE.
+ * RS - Return Stable
+ * RU - Return Unstable
+ * WS - Write Stable
+ * WU - Write Unstable
+ *
+ *+============================================+
+ *| Vol Opts -> || trusted-write| trusted-sync |
+ *| Write Type  ||              |              |
+ *|-------------||--------------|--------------|
+ *| STABLE      ||      WS      |   WU         |
+ *|             ||      RS      |   RS         |
+ *|-------------||--------------|--------------|
+ *| UNSTABLE    ||      WU      |   WU         |
+ *|             ||      RS      |   RS         |
+ *|-------------||--------------|--------------|
+ *| COMMIT      ||    fsync     | getattr      |
+ *+============================================+
+ *
+ *
+ */
+/*
+ * Callback for the write issued by __nfs3_write_resume: on success records
+ * the number of bytes written in cs->maxcount, then logs the result, sends
+ * the WRITE reply and wipes the call state. Always returns 0.
+ */
+int32_t
+nfs3svc_write_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+                  int32_t op_ret, int32_t op_errno, struct iatt *prebuf,
+                  struct iatt *postbuf, dict_t *xdata)
+{
+    nfs3_call_state_t *cs = frame->local;
+    struct nfs3_state *nfs3 = rpcsvc_request_program_private(cs->req);
+    nfsstat3 stat = NFS3ERR_SERVERFAULT;
+
+    if (op_ret == -1) {
+        stat = nfs3_cbk_errno_status(op_ret, op_errno);
+    } else {
+        stat = NFS3_OK;
+        cs->maxcount = op_ret;
+    }
+
+    nfs3_log_write_res(rpcsvc_request_xid(cs->req), stat, op_errno,
+                       cs->maxcount, cs->writetype, nfs3->serverstart,
+                       cs->resolvedloc.path);
+    nfs3_write_reply(cs->req, stat, cs->maxcount, cs->writetype,
+                     nfs3->serverstart, prebuf, postbuf);
+    nfs3_call_state_wipe(cs);
+
+    return 0;
+}
+
+/*
+ * Issue the write described by the call state on cs->fd; the reply is sent
+ * by nfs3svc_write_cbk.
+ *
+ * Returns -EFAULT when cs is NULL, otherwise the value returned by
+ * nfs_write.
+ */
+int
+__nfs3_write_resume(nfs3_call_state_t *cs)
+{
+    nfs_user_t nfu = {
+        0,
+    };
+
+    if (cs == NULL)
+        return -EFAULT;
+
+    nfs_request_user_init(&nfu, cs->req);
+
+    /* The RPC record may contain more bytes than the size of the write
+     * requested here, i.e. there can be bytes beyond the @count bytes in
+     * the RPC message buffer. The payload refers to the write data
+     * directly inside the RPC request buffer (the de-XDRing was done
+     * without copying), and until now its iov_len accounts for all of
+     * those bytes, not just the requested ones. The extra bytes exist
+     * because XDR encoding rounds every string and opaque data buffer up
+     * to a multiple of 4 bytes. Trim the vector so that no more than the
+     * requested amount is written.
+     */
+    cs->datavec.iov_len = cs->datacount;
+
+    return nfs_write(cs->nfsx, cs->vol, &nfu, cs->fd, cs->iobref, &cs->datavec,
+                     1, cs->dataoffset, nfs3svc_write_cbk, cs);
+}
+
+/*
+ * Resume handler for WRITE, run once the request's file handle has been
+ * resolved: obtains an anonymous fd on the resolved inode, stores it in the
+ * call state and issues the write through __nfs3_write_resume.
+ *
+ * carg is the nfs3_call_state_t of the request. Returns -EFAULT when carg
+ * is NULL. On any other failure (ret < 0) an error WRITE reply is sent from
+ * here and the call state is wiped; ret is returned as is.
+ */
+int
+nfs3_write_resume(void *carg)
+{
+    nfsstat3 stat = NFS3ERR_SERVERFAULT;
+    int ret = -EFAULT;
+    nfs3_call_state_t *cs = NULL;
+    fd_t *fd = NULL;
+
+    if (!carg)
+        return ret;
+
+    cs = (nfs3_call_state_t *)carg;
+    /* Unlike the read-side handlers, the auth check is called with
+     * _gf_true here. Both checks jump to nfs3err on failure.
+     */
+    nfs3_check_fh_auth_status(cs, stat, _gf_true, nfs3err);
+    nfs3_check_fh_resolve_status(cs, stat, nfs3err);
+    fd = fd_anonymous(cs->resolvedloc.inode);
+    if (!fd) {
+        gf_msg(GF_NFS3, GF_LOG_ERROR, 0, NFS_MSG_ANONYMOUS_FD_FAIL,
+               "Failed to create anonymous fd");
+        goto nfs3err;
+    }
+
+    cs->fd = fd; /* Gets unrefd when the call state is wiped. */
+
+    ret = __nfs3_write_resume(cs);
+    if (ret < 0)
+        stat = nfs3_errno_to_nfsstat3(-ret);
+nfs3err:
+    if (ret < 0) {
+        nfs3_log_common_res(rpcsvc_request_xid(cs->req), NFS3_WRITE, stat, -ret,
+                            cs->resolvedloc.path);
+        nfs3_write_reply(cs->req, stat, 0, cs->writetype, 0, NULL, NULL);
+        nfs3_call_state_wipe(cs);
+    }
+    return ret;
+}
+
+/*
+ * NFSv3 WRITE: validate the request, map the file handle to a volume, check
+ * read-write access to the export, initialise a call state holding the
+ * write parameters and payload, and start file handle resolution, which
+ * continues in nfs3_write_resume.
+ *
+ * req     - the RPC request
+ * fh      - file handle from the request
+ * offset  - file offset to write at
+ * count   - number of bytes the client asked to write
+ * stable  - requested stability level
+ * payload - vector describing the write data
+ * iobref  - iobref stored in the call state together with the payload
+ *
+ * Returns -1 when req, fh or payload.iov_base is NULL. Failures that reach
+ * nfs3err with ret < 0 are answered here with an error WRITE reply and ret
+ * is reset to 0. Paths that jump to the out label return ret unchanged
+ * without replying.
+ *
+ * NOTE(review): iobref is only stored in cs after call state init. On the
+ * early return and on failures before that point nothing in this function
+ * releases it, while nfs3svc_write passes in a freshly taken reference
+ * (rpcsvc_request_iobref_ref) - confirm that reference is not leaked.
+ */
+int
+nfs3_write(rpcsvc_request_t *req, struct nfs3_fh *fh, offset3 offset,
+           count3 count, stable_how stable, struct iovec payload,
+           struct iobref *iobref)
+{
+    xlator_t *vol = NULL;
+    nfsstat3 stat = NFS3ERR_SERVERFAULT;
+    int ret = -EFAULT;
+    struct nfs3_state *nfs3 = NULL;
+    nfs3_call_state_t *cs = NULL;
+
+    if ((!req) || (!fh) || (!payload.iov_base)) {
+        gf_msg(GF_NFS3, GF_LOG_ERROR, EINVAL, NFS_MSG_INVALID_ENTRY,
+               "Bad arguments");
+        return -1;
+    }
+
+    nfs3_log_rw_call(rpcsvc_request_xid(req), "WRITE", fh, offset, count,
+                     stable);
+    nfs3_validate_gluster_fh(fh, stat, nfs3err);
+    nfs3_validate_nfs3_state(req, nfs3, stat, nfs3err, ret);
+    nfs3_map_fh_to_volume(nfs3, fh, req, vol, stat, nfs3err);
+    nfs3_volume_started_check(nfs3, vol, ret, out);
+    nfs3_check_rw_volaccess(nfs3, fh->exportid, stat, nfs3err);
+    nfs3_handle_call_state_init(nfs3, cs, req, vol, stat, nfs3err);
+    cs->datacount = count;
+    cs->dataoffset = offset;
+    cs->writetype = stable;
+    cs->iobref = iobref;
+    cs->datavec = payload;
+
+    ret = nfs3_fh_resolve_and_resume(cs, fh, NULL, nfs3_write_resume);
+    if (ret < 0)
+        stat = nfs3_errno_to_nfsstat3(-ret);
+
+nfs3err:
+    if (ret < 0) {
+        nfs3_log_common_res(rpcsvc_request_xid(req), NFS3_WRITE, stat, -ret,
+                            cs ? cs->resolvedloc.path : NULL);
+        nfs3_write_reply(req, stat, 0, stable, 0, NULL, NULL);
+        /* NOTE(review): cs can still be NULL here; presumably
+         * nfs3_call_state_wipe accepts NULL - confirm.
+         */
+        nfs3_call_state_wipe(cs);
+        /* The error reply has been sent; report success to the caller. */
+        ret = 0;
+    }
+out:
+    return ret;
+}
+
+/* States of the WRITE vector sizer; 0 is the initial state. */
+#define NFS3_VECWRITE_READFHLEN 1
+#define NFS3_VECWRITE_READFH 2
+#define NFS3_VECWRITE_READREST 3
+
+/* Number of bytes requested once the file handle has been read. */
+#define NFS3_WRITE_POSTFH_SIZE 20
+
+/*
+ * State machine telling the caller how many bytes of a WRITE request to
+ * read next.
+ *
+ * state     - current state (0 or one of NFS3_VECWRITE_*)
+ * readsize  - out: number of bytes to read for the next step
+ * base_addr - unused here
+ * curr_addr - current read position; in the READFHLEN state the 4 bytes
+ *             just before it are taken as the file handle length
+ *
+ * Returns the next state, or 0 when nothing is left to size. An unknown
+ * state is logged and also yields 0, leaving *readsize untouched.
+ */
+int
+nfs3svc_write_vecsizer(int state, ssize_t *readsize, char *base_addr,
+                       char *curr_addr)
+{
+    int next = 0;
+    uint32_t wire_len = 0;
+    uint32_t host_len = 0;
+
+    switch (state) {
+        case 0:
+            /* First read the 4-byte file handle length. */
+            *readsize = 4;
+            next = NFS3_VECWRITE_READFHLEN;
+            break;
+
+        case NFS3_VECWRITE_READFHLEN:
+            /* The length was just read and is in network byte order. */
+            wire_len = *(uint32_t *)(curr_addr - 4);
+            host_len = ntohl(wire_len);
+            *readsize = xdr_length_round_up(host_len, NFS3_FHSIZE);
+            next = NFS3_VECWRITE_READFH;
+            break;
+
+        case NFS3_VECWRITE_READFH:
+            *readsize = NFS3_WRITE_POSTFH_SIZE;
+            next = NFS3_VECWRITE_READREST;
+            break;
+
+        case NFS3_VECWRITE_READREST:
+            *readsize = 0;
+            next = 0;
+            break;
+
+        default:
+            gf_msg("nfs", GF_LOG_ERROR, 0, NFS_MSG_STATE_WRONG, "state wrong");
+            break;
+    }
+
+    return next;
+}
+
+/*
+ * RPC actor for the NFSv3 WRITE procedure.
+ *
+ * Decodes the write3args from req->msg[0] and hands them to nfs3_write,
+ * passing req->msg[1] as the write payload. Returns RPCSVC_ACTOR_ERROR when
+ * req is NULL, when decoding fails, or when nfs3_write returns a negative
+ * value other than RPCSVC_ACTOR_IGNORE; otherwise returns nfs3_write's
+ * result.
+ */
+int
+nfs3svc_write(rpcsvc_request_t *req)
+{
+    struct nfs3_fh fh = {
+        {0},
+    };
+    write3args args;
+    int ret = RPCSVC_ACTOR_ERROR;
+
+    if (!req)
+        return ret;
+    nfs3_prep_write3args(&args, &fh);
+    if (xdr_to_write3args(req->msg[0], &args) <= 0) {
+        gf_msg(GF_NFS3, GF_LOG_ERROR, 0, NFS_MSG_ARGS_DECODE_ERROR,
+               "Error decoding args");
+        rpcsvc_request_seterr(req, GARBAGE_ARGS);
+        goto rpcerr;
+    }
+
+    /* To ensure that the iobuf for the current record does not
+     * get returned to the iobpool, we need to keep a reference for
+     * ourselves because the RPC call handler who called us will unref its
+     * own ref of the record's iobuf when it is done handling the request.
+     */
+
+    /* The extra reference is taken via rpcsvc_request_iobref_ref() in the
+     * argument list and handed over to nfs3_write.
+     */
+    ret = nfs3_write(req, &fh, args.offset, args.count, args.stable,
+                     req->msg[1], rpcsvc_request_iobref_ref(req));
+    if ((ret < 0) && (ret != RPCSVC_ACTOR_IGNORE)) {
+        gf_msg(GF_NFS3, GF_LOG_ERROR, -ret, NFS_MSG_WRITE_FAIL,
+               "WRITE procedure failed");
+        rpcsvc_request_seterr(req, SYSTEM_ERR);
+        ret = RPCSVC_ACTOR_ERROR;
+    }
+
+rpcerr:
+    return ret;
+}
+
+/*
+ * Fill a create3res from the given status, new file handle, new file
+ * attributes and parent pre/post attributes, and submit it as the reply to
+ * req. Always returns 0.
+ */
+int
+nfs3_create_reply(rpcsvc_request_t *req, nfsstat3 stat, struct nfs3_fh *newfh,
+                  struct iatt *newbuf, struct iatt *preparent,
+                  struct iatt *postparent)
+{
+    create3res reply = {
+        0,
+    };
+    uint64_t devid = nfs3_request_xlator_deviceid(req);
+
+    nfs3_fill_create3res(&reply, stat, newfh, newbuf, preparent, postparent,
+                         devid);
+    nfs3svc_submit_reply(req, (void *)&reply,
+                         (nfs3_serializer)xdr_serialize_create3res);
+
+    return 0;
+}
+
+/*
+ * Callback for the setattr that follows a CREATE: logs the result, sends
+ * the CREATE reply using the file handle and parent attributes saved in the
+ * call state together with the post-op attributes, and wipes the call
+ * state. Always returns 0.
+ */
+int32_t
+nfs3svc_create_setattr_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+                           int32_t op_ret, int32_t op_errno, struct iatt *preop,
+                           struct iatt *postop, dict_t *xdata)
+{
+    nfs3_call_state_t *cs = frame->local;
+    nfsstat3 stat = NFS3_OK;
+
+    if (op_ret == -1)
+        stat = nfs3_cbk_errno_status(op_ret, op_errno);
+
+    nfs3_log_newfh_res(rpcsvc_request_xid(cs->req), NFS3_CREATE, stat, op_errno,
+                       &cs->fh, cs->resolvedloc.path);
+    nfs3_create_reply(cs->req, stat, &cs->fh, postop, &cs->preparent,
+                      &cs->postparent);
+    nfs3_call_state_wipe(cs);
+
+    return 0;
+}
+
+/* Callback for the create fop: reply here on failure or when no attributes
+ * remain to be set, else wind a setattr whose callback sends the reply. */
+int32_t
+nfs3svc_create_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+                   int32_t op_ret, int32_t op_errno, fd_t *fd, inode_t *inode,
+                   struct iatt *buf, struct iatt *preparent,
+                   struct iatt *postparent, dict_t *xdata)
+{
+    nfsstat3 stat = NFS3ERR_SERVERFAULT;
+    int ret = -EFAULT;
+    nfs_user_t nfu = {0};
+    nfs3_call_state_t *cs = frame->local;
+    inode_t *oldinode = NULL;
+
+    if (op_ret == -1) {
+        stat = nfs3_cbk_errno_status(op_ret, op_errno);
+        goto nfs3err;
+    }
+
+    nfs3_fh_build_child_fh(&cs->parent, buf, &cs->fh);
+    oldinode = inode_link(inode, cs->resolvedloc.parent, cs->resolvedloc.name,
+                          buf);
+
+    /* Means no attributes were required to be set. */
+    if (!cs->setattr_valid) {
+        stat = NFS3_OK;
+        ret = -1; /* negative ret makes the tail below send the reply */
+        goto nfs3err;
+    }
+
+    cs->preparent = *preparent;
+    cs->postparent = *postparent;
+    nfs_request_user_init(&nfu, cs->req);
+    if (oldinode) /* inode_link() may have returned NULL */
+        gf_uuid_copy(cs->resolvedloc.gfid, oldinode->gfid);
+    ret = nfs_setattr(cs->nfsx, cs->vol, &nfu, &cs->resolvedloc, &cs->stbuf,
+                      cs->setattr_valid, nfs3svc_create_setattr_cbk, cs);
+    if (ret < 0)
+        stat = nfs3_errno_to_nfsstat3(-ret);
+
+nfs3err:
+    if (oldinode) {
+        inode_lookup(oldinode);
+        inode_unref(oldinode);
+    }
+
+    if (ret < 0) {
+        nfs3_log_newfh_res(rpcsvc_request_xid(cs->req), NFS3_CREATE, stat,
+                           op_errno, &cs->fh, cs->resolvedloc.path);
+        nfs3_create_reply(cs->req, stat, &cs->fh, buf, preparent, postparent);
+        nfs3_call_state_wipe(cs);
+    }
+
+    return 0;
+}
+
+/* Wind the create fop for the resolved entry in cs.
+ *
+ * GUARDED mode adds O_EXCL to the open flags. A requested uid/gid is handed
+ * to nfs_request_primary_user_init() and a requested mode to the create fop;
+ * each is then cleared from cs->setattr_valid.
+ * Returns the result of nfs_create(), or -EFAULT when cs is NULL. */
+int
+nfs3_create_common(nfs3_call_state_t *cs)
+{
+    nfs_user_t nfu = {0};
+    uid_t owner = 0;
+    gid_t group = 0;
+    int oflags = O_RDWR;
+
+    if (!cs)
+        return -EFAULT;
+
+    if (cs->createmode == GUARDED)
+        oflags |= O_EXCL;
+
+    if (gf_attr_uid_set(cs->setattr_valid)) {
+        owner = cs->stbuf.ia_uid;
+        cs->setattr_valid &= ~GF_SET_ATTR_UID;
+    } else {
+        owner = rpcsvc_request_uid(cs->req);
+    }
+
+    if (gf_attr_gid_set(cs->setattr_valid)) {
+        group = cs->stbuf.ia_gid;
+        cs->setattr_valid &= ~GF_SET_ATTR_GID;
+    } else {
+        group = rpcsvc_request_gid(cs->req);
+    }
+
+    nfs_request_primary_user_init(&nfu, cs->req, owner, group);
+
+    /* The create fop takes a mode argument, so a mode-only request needs no
+     * follow-up setattr. */
+    if (!(cs->setattr_valid & GF_SET_ATTR_MODE))
+        return nfs_create(cs->nfsx, cs->vol, &nfu, &cs->resolvedloc, oflags,
+                          NFS_DEFAULT_CREATE_MODE, nfs3svc_create_cbk, cs);
+
+    cs->setattr_valid &= ~GF_SET_ATTR_MODE;
+    return nfs_create(cs->nfsx, cs->vol, &nfu, &cs->resolvedloc, oflags,
+                      cs->mode, nfs3svc_create_cbk, cs);
+}
+
+/* Callback for the stat wound by nfs3_create_exclusive(). Matching atime and
+ * mtime mean the stored verifier equals the request's (a retransmitted
+ * CREATE), which succeeds; any other existing file yields NFS3ERR_EXIST. */
+int32_t
+nfs3svc_create_stat_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+                        int32_t op_ret, int32_t op_errno, struct iatt *buf,
+                        dict_t *xdata)
+{
+    int ret = -EFAULT;
+    nfsstat3 stat = NFS3ERR_SERVERFAULT;
+    nfs_user_t nfu = {0};
+    nfs3_call_state_t *cs = frame->local;
+
+    nfs_request_user_init(&nfu, cs->req);
+    if (op_ret == -1) {
+        ret = -op_errno;
+        stat = nfs3_cbk_errno_status(op_ret, op_errno);
+        goto nfs3err;
+    }
+
+    if ((cs->stbuf.ia_mtime == buf->ia_mtime) &&
+        (cs->stbuf.ia_atime == buf->ia_atime)) {
+        gf_msg_debug(GF_NFS3, 0,
+                     "Create req retransmitted verf %" PRId64 " %" PRId64,
+                     cs->stbuf.ia_mtime, cs->stbuf.ia_atime);
+        stat = NFS3_OK;
+        nfs3_fh_build_child_fh(&cs->parent, buf, &cs->fh);
+    } else {
+        gf_msg_debug(GF_NFS3, 0,
+                     "File already exist new_verf %" PRId64 " %" PRId64
+                     " old_verf %" PRId64 " %" PRId64,
+                     cs->stbuf.ia_mtime, cs->stbuf.ia_atime, buf->ia_mtime,
+                     buf->ia_atime);
+        stat = NFS3ERR_EXIST;
+    }
+
+nfs3err:
+    if (ret < 0) {
+        nfs3_log_common_res(rpcsvc_request_xid(cs->req), NFS3_CREATE, stat,
+                            op_errno, cs->resolvedloc.path);
+        nfs3_create_reply(cs->req, stat, &cs->fh, buf, NULL, NULL);
+        nfs3_call_state_wipe(cs);
+    }
+
+    return 0;
+}
+
+/* Handle an EXCLUSIVE-mode CREATE.
+ *
+ * The client's verifier is staged in the atime and mtime to be set. Unless
+ * resolution failed with ENOENT the entry is stat'ed so that a retransmission
+ * can be recognised; otherwise the common create path is taken. */
+int
+nfs3_create_exclusive(nfs3_call_state_t *cs)
+{
+    nfs_user_t nfu = {0};
+    char *verf = NULL;
+
+    if (!cs)
+        return -EFAULT;
+
+    /* Storing verifier as a mtime and atime attribute, to store it
+     * in stable storage.
+     * NOTE(review): the second copy starts sizeof(ia_atime) bytes into
+     * cookieverf; confirm both copies stay within cookieverf's size. */
+    verf = (char *)&cs->cookieverf;
+    memcpy(&cs->stbuf.ia_atime, verf, sizeof(cs->stbuf.ia_atime));
+    memcpy(&cs->stbuf.ia_mtime, verf + sizeof(cs->stbuf.ia_atime),
+           sizeof(cs->stbuf.ia_mtime));
+    cs->setattr_valid |= GF_SET_ATTR_ATIME | GF_SET_ATTR_MTIME;
+    nfs_request_user_init(&nfu, cs->req);
+
+    /* If the file already existed we need to get its attributes so we can
+     * compare and check whether a previous create operation was
+     * interrupted due to server failure or dropped packets.
+     */
+    if ((cs->resolve_ret == 0) ||
+        ((cs->resolve_ret == -1) && (cs->resolve_errno != ENOENT)))
+        return nfs_stat(cs->nfsx, cs->vol, &nfu, &cs->resolvedloc,
+                        nfs3svc_create_stat_cbk, cs);
+
+    return nfs3_create_common(cs);
+}
+
+/* Resume CREATE after file-handle resolution: check the auth and resolve
+ * status, then dispatch on the create mode. On failure the error reply is
+ * sent and the call state wiped here. */
+int
+nfs3_create_resume(void *carg)
+{
+    nfsstat3 stat = NFS3ERR_SERVERFAULT;
+    int ret = -EFAULT;
+    nfs3_call_state_t *cs = carg;
+
+    if (!cs)
+        return ret;
+
+    nfs3_check_fh_auth_status(cs, stat, _gf_true, nfs3err);
+    nfs3_check_new_fh_resolve_status(cs, stat, nfs3err);
+    ret = (cs->createmode == EXCLUSIVE) ? nfs3_create_exclusive(cs)
+                                        : nfs3_create_common(cs);
+    if (ret >= 0)
+        return ret;
+
+    /* Handle a failure return from either of the create functions above. */
+    stat = nfs3_errno_to_nfsstat3(-ret);
+
+nfs3err:
+    /* Reached only with ret < 0: a failed check above or a failed create. */
+    nfs3_log_common_res(rpcsvc_request_xid(cs->req), NFS3_CREATE, stat,
+                        -ret, cs->resolvedloc.path);
+    nfs3_create_reply(cs->req, stat, NULL, NULL, NULL, NULL);
+    nfs3_call_state_wipe(cs);
+
+    return ret;
+}
+
+/* NFSv3 CREATE: validate the request, then resolve (dirfh, name) and resume
+ * in nfs3_create_resume(). Failures found here get their error reply here. */
+int
+nfs3_create(rpcsvc_request_t *req, struct nfs3_fh *dirfh, char *name,
+            createmode3 mode, sattr3 *sattr, uint64_t cverf)
+{
+    xlator_t *vol = NULL;
+    nfsstat3 stat = NFS3ERR_SERVERFAULT;
+    int ret = -EFAULT;
+    struct nfs3_state *nfs3 = NULL;
+    nfs3_call_state_t *cs = NULL;
+
+    if (!req || !dirfh || !name || !sattr)
+        return -1;
+
+    nfs3_log_create_call(rpcsvc_request_xid(req), dirfh, name, mode);
+    nfs3_validate_gluster_fh(dirfh, stat, nfs3err);
+    nfs3_validate_nfs3_state(req, nfs3, stat, nfs3err, ret);
+    nfs3_validate_strlen_or_goto(name, NFS_NAME_MAX, nfs3err, stat, ret);
+    nfs3_map_fh_to_volume(nfs3, dirfh, req, vol, stat, nfs3err);
+    nfs3_volume_started_check(nfs3, vol, ret, out);
+    nfs3_check_rw_volaccess(nfs3, dirfh->exportid, stat, nfs3err);
+    nfs3_handle_call_state_init(nfs3, cs, req, vol, stat, nfs3err);
+
+    cs->parent = *dirfh;
+    cs->createmode = mode;
+    cs->cookieverf = cverf;
+    /* In an EXCLUSIVE create the client sends cverf instead of sattr. */
+    if (mode != EXCLUSIVE)
+        cs->setattr_valid = nfs3_sattr3_to_setattr_valid(sattr, &cs->stbuf,
+                                                         &cs->mode);
+    ret = nfs3_fh_resolve_and_resume(cs, dirfh, name, nfs3_create_resume);
+    if (ret < 0)
+        stat = nfs3_errno_to_nfsstat3(-ret);
+
+nfs3err:
+    if (ret < 0) {
+        nfs3_log_common_res(rpcsvc_request_xid(req), NFS3_CREATE, stat, -ret,
+                            cs ? cs->resolvedloc.path : NULL);
+        nfs3_create_reply(req, stat, NULL, NULL, NULL, NULL);
+        nfs3_call_state_wipe(cs);
+        ret = 0; /* the error reply was sent above; report success upward */
+    }
+out:
+    return ret;
+}
+
+/* RPC actor for NFSv3 CREATE: decode the arguments and call nfs3_create().
+ * Decode or procedure failures mark the request; RPCSVC_ACTOR_ERROR results. */
+int
+nfs3svc_create(rpcsvc_request_t *req)
+{
+    char name[NFS_PATH_MAX];
+    struct nfs3_fh dirfh = {{0}};
+    create3args args;
+    int ret = RPCSVC_ACTOR_ERROR;
+    uint64_t cverf = 0;
+
+    if (!req)
+        return ret;
+
+    nfs3_prep_create3args(&args, &dirfh, name);
+    if (xdr_to_create3args(req->msg[0], &args) <= 0) {
+        gf_msg(GF_NFS3, GF_LOG_ERROR, 0, NFS_MSG_ARGS_DECODE_ERROR,
+               "Error decoding args");
+        rpcsvc_request_seterr(req, GARBAGE_ARGS);
+        goto rpcerr;
+    }
+
+    /* Copy the verifier bytes out instead of reading them through a casted
+     * uint64_t pointer, which breaks strict aliasing and assumes alignment. */
+    memcpy(&cverf, args.how.createhow3_u.verf, sizeof(cverf));
+
+    ret = nfs3_create(req, &dirfh, name, args.how.mode,
+                      &args.how.createhow3_u.obj_attributes, cverf);
+    if ((ret < 0) && (ret != RPCSVC_ACTOR_IGNORE)) {
+        gf_msg(GF_NFS3, GF_LOG_ERROR, -ret, NFS_MSG_CREATE_FAIL,
+               "CREATE procedure failed");
+        rpcsvc_request_seterr(req, SYSTEM_ERR);
+        ret = RPCSVC_ACTOR_ERROR;
+    }
+
+rpcerr:
+    return ret;
+}
+
+/* Build a MKDIR3 result from the given status, file handle and attributes
+ * and submit it as the reply to req. Always returns 0. */
+int
+nfs3_mkdir_reply(rpcsvc_request_t *req, nfsstat3 stat, struct nfs3_fh *fh,
+                 struct iatt *buf, struct iatt *preparent,
+                 struct iatt *postparent)
+{
+    mkdir3res reply = {0};
+    uint64_t devid = nfs3_request_xlator_deviceid(req);
+
+    nfs3_fill_mkdir3res(&reply, stat, fh, buf, preparent, postparent, devid);
+    nfs3svc_submit_reply(req, &reply, (nfs3_serializer)xdr_serialize_mkdir3res);
+
+    return 0;
+}
+
+/* Callback for the setattr issued after a successful MKDIR.
+ *
+ * Logs the outcome, sends the MKDIR3 reply carrying the post-setattr
+ * attributes and the saved parent attributes, then wipes the call state. */
+int32_t
+nfs3svc_mkdir_setattr_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+                          int32_t op_ret, int32_t op_errno, struct iatt *preop,
+                          struct iatt *postop, dict_t *xdata)
+{
+    nfs3_call_state_t *state = frame->local;
+    nfsstat3 status = NFS3_OK;
+
+    /* A failed setattr is reported with the status mapped from op_errno. */
+    if (op_ret == -1)
+        status = nfs3_cbk_errno_status(op_ret, op_errno);
+
+    nfs3_log_newfh_res(rpcsvc_request_xid(state->req), NFS3_MKDIR, status,
+                       op_errno, &state->fh, state->resolvedloc.path);
+    nfs3_mkdir_reply(state->req, status, &state->fh, postop,
+                     &state->preparent, &state->postparent);
+    nfs3_call_state_wipe(state);
+
+    return 0;
+}
+
+/* Callback for the mkdir fop. Replies from here on failure or when no
+ * attributes remain to be set; otherwise winds a setattr and leaves the
+ * reply to nfs3svc_mkdir_setattr_cbk(). */
+int32_t
+nfs3svc_mkdir_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+                  int32_t op_ret, int32_t op_errno, inode_t *inode,
+                  struct iatt *buf, struct iatt *preparent,
+                  struct iatt *postparent, dict_t *xdata)
+{
+    nfsstat3 stat = NFS3ERR_SERVERFAULT;
+    int ret = -EFAULT;
+    nfs_user_t nfu = {0};
+    nfs3_call_state_t *cs = frame->local;
+
+    if (op_ret == -1) {
+        stat = nfs3_cbk_errno_status(op_ret, op_errno);
+        goto reply;
+    }
+
+    nfs3_fh_build_child_fh(&cs->parent, buf, &cs->fh);
+
+    /* Means no attributes were required to be set. */
+    if (!cs->setattr_valid) {
+        stat = NFS3_OK;
+        goto reply;
+    }
+
+    cs->preparent = *preparent;
+    cs->postparent = *postparent;
+    nfs_request_user_init(&nfu, cs->req);
+    ret = nfs_setattr(cs->nfsx, cs->vol, &nfu, &cs->resolvedloc, &cs->stbuf,
+                      cs->setattr_valid, nfs3svc_mkdir_setattr_cbk, cs);
+    if (ret >= 0)
+        return 0;
+
+    stat = nfs3_errno_to_nfsstat3(-ret);
+
+reply:
+    nfs3_log_newfh_res(rpcsvc_request_xid(cs->req), NFS3_MKDIR, stat,
+                       op_errno, &cs->fh, cs->resolvedloc.path);
+    nfs3_mkdir_reply(cs->req, stat, &cs->fh, buf, preparent, postparent);
+    nfs3_call_state_wipe(cs);
+
+    return 0;
+}
+
+/* Resume MKDIR once (dirfh, name) has been resolved.
+ *
+ * Winds the mkdir fop with cs->mode; a requested mode is cleared from
+ * cs->setattr_valid because it is already passed to the fop. On failure the
+ * error reply is sent and the call state wiped here.
+ * Returns -EFAULT for a NULL carg and 0 otherwise. */
+int
+nfs3_mkdir_resume(void *carg)
+{
+    nfsstat3 stat = NFS3ERR_SERVERFAULT;
+    int ret = -EFAULT;
+    nfs_user_t nfu = {0};
+    nfs3_call_state_t *cs = carg;
+
+    if (!cs)
+        return ret;
+
+    nfs3_check_new_fh_resolve_status(cs, stat, nfs3err);
+    nfs_request_user_init(&nfu, cs->req);
+
+    /* cs->mode is passed to the fop whether or not the mode bit was set. */
+    if (gf_attr_mode_set(cs->setattr_valid))
+        cs->setattr_valid &= ~GF_SET_ATTR_MODE;
+    ret = nfs_mkdir(cs->nfsx, cs->vol, &nfu, &cs->resolvedloc, cs->mode,
+                    nfs3svc_mkdir_cbk, cs);
+    if (ret < 0)
+        stat = nfs3_errno_to_nfsstat3(-ret);
+
+nfs3err:
+    if (ret < 0) {
+        nfs3_log_common_res(rpcsvc_request_xid(cs->req), NFS3_MKDIR, stat, -ret,
+                            cs->resolvedloc.path);
+        nfs3_mkdir_reply(cs->req, stat, NULL, NULL, NULL, NULL);
+        nfs3_call_state_wipe(cs);
+    }
+
+    return 0;
+}
+
+/* NFSv3 MKDIR: validate the request, then resolve (dirfh, name) and resume in
+ * nfs3_mkdir_resume(). Failures found here get their error reply here. */
+int
+nfs3_mkdir(rpcsvc_request_t *req, struct nfs3_fh *dirfh, char *name,
+           sattr3 *sattr)
+{
+    xlator_t *vol = NULL;
+    nfsstat3 stat = NFS3ERR_SERVERFAULT;
+    int ret = -EFAULT;
+    struct nfs3_state *nfs3 = NULL;
+    nfs3_call_state_t *cs = NULL;
+
+    if (!req || !dirfh || !name || !sattr) {
+        gf_msg(GF_NFS3, GF_LOG_ERROR, EINVAL, NFS_MSG_INVALID_ENTRY,
+               "Bad arguments");
+        return -1;
+    }
+    nfs3_log_fh_entry_call(rpcsvc_request_xid(req), "MKDIR", dirfh, name);
+    nfs3_validate_gluster_fh(dirfh, stat, nfs3err);
+    nfs3_validate_nfs3_state(req, nfs3, stat, nfs3err, ret);
+    nfs3_validate_strlen_or_goto(name, NFS_NAME_MAX, nfs3err, stat, ret);
+    nfs3_map_fh_to_volume(nfs3, dirfh, req, vol, stat, nfs3err);
+    nfs3_volume_started_check(nfs3, vol, ret, out);
+    nfs3_check_rw_volaccess(nfs3, dirfh->exportid, stat, nfs3err);
+    nfs3_handle_call_state_init(nfs3, cs, req, vol, stat, nfs3err);
+    cs->setattr_valid = nfs3_sattr3_to_setattr_valid(sattr, &cs->stbuf,
+                                                     &cs->mode);
+    cs->parent = *dirfh;
+    ret = nfs3_fh_resolve_and_resume(cs, dirfh, name, nfs3_mkdir_resume);
+    if (ret < 0)
+        stat = nfs3_errno_to_nfsstat3(-ret);
+
+nfs3err:
+    if (ret < 0) {
+        nfs3_log_common_res(rpcsvc_request_xid(req), NFS3_MKDIR, stat, -ret,
+                            cs ? cs->resolvedloc.path : NULL);
+        nfs3_mkdir_reply(req, stat, NULL, NULL, NULL, NULL);
+        nfs3_call_state_wipe(cs);
+        ret = 0; /* the error reply was sent above; report success upward */
+    }
+out:
+    return ret;
+}
+
+/* RPC actor for NFSv3 MKDIR: decode the arguments and call nfs3_mkdir().
+ * Decode or procedure failures mark the request; RPCSVC_ACTOR_ERROR results. */
+int
+nfs3svc_mkdir(rpcsvc_request_t *req)
+{
+    char name[NFS_PATH_MAX];
+    struct nfs3_fh dirfh = {{0}};
+    mkdir3args args;
+    int ret = RPCSVC_ACTOR_ERROR;
+
+    if (!req)
+        return ret;
+
+    nfs3_prep_mkdir3args(&args, &dirfh, name);
+    if (xdr_to_mkdir3args(req->msg[0], &args) <= 0) {
+        gf_msg(GF_NFS3, GF_LOG_ERROR, 0, NFS_MSG_ARGS_DECODE_ERROR,
+               "Error decoding args");
+        rpcsvc_request_seterr(req, GARBAGE_ARGS);
+        return ret;
+    }
+
+    ret = nfs3_mkdir(req, &dirfh, name, &args.attributes);
+    if ((ret < 0) && (ret != RPCSVC_ACTOR_IGNORE)) {
+        gf_msg(GF_NFS3, GF_LOG_ERROR, -ret, NFS_MSG_DIR_OP_FAIL,
+               "MKDIR procedure failed");
+        rpcsvc_request_seterr(req, SYSTEM_ERR);
+        ret = RPCSVC_ACTOR_ERROR;
+    }
+
+    return ret;
+}
+
+/* Build a SYMLINK3 result from the given status, file handle and
+ * attributes and submit it as the reply to req.
+ * Always returns 0. */
+int
+nfs3_symlink_reply(rpcsvc_request_t *req, nfsstat3 stat, struct nfs3_fh *fh,
+                   struct iatt *buf, struct iatt *preparent,
+                   struct iatt *postparent)
+{
+    symlink3res reply = {0};
+    uint64_t devid = nfs3_request_xlator_deviceid(req);
+
+    nfs3_fill_symlink3res(&reply, stat, fh, buf, preparent, postparent, devid);
+    nfs3svc_submit_reply(req, (void *)&reply,
+                         (nfs3_serializer)xdr_serialize_symlink3res);
+
+    return 0;
+}
+
+/* Callback for the symlink fop: on success build the new entry's file
+ * handle, then log the result, send the SYMLINK3 reply and wipe the call
+ * state. */
+int32_t
+nfs3svc_symlink_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+                    int32_t op_ret, int32_t op_errno, inode_t *inode,
+                    struct iatt *buf, struct iatt *preparent,
+                    struct iatt *postparent, dict_t *xdata)
+{
+    nfs3_call_state_t *state = frame->local;
+    nfsstat3 status = NFS3_OK;
+
+    if (op_ret == -1)
+        status = nfs3_cbk_errno_status(op_ret, op_errno);
+    else
+        nfs3_fh_build_child_fh(&state->parent, buf, &state->fh);
+
+    nfs3_log_newfh_res(rpcsvc_request_xid(state->req), NFS3_SYMLINK, status,
+                       op_errno, &state->fh, state->resolvedloc.path);
+    nfs3_symlink_reply(state->req, status, &state->fh, buf, preparent,
+                       postparent);
+    nfs3_call_state_wipe(state);
+
+    return 0;
+}
+
+/* Resume SYMLINK once (dirfh, name) has been resolved: wind the symlink fop
+ * with cs->pathname as the target. On failure the error reply is sent and
+ * the call state wiped here. */
+int
+nfs3_symlink_resume(void *carg)
+{
+    nfsstat3 stat = NFS3ERR_SERVERFAULT;
+    int ret = -EFAULT;
+    nfs_user_t nfu = {0};
+    nfs3_call_state_t *cs = carg;
+
+    if (!cs)
+        return ret;
+
+    nfs3_check_new_fh_resolve_status(cs, stat, nfs3err);
+    nfs_request_user_init(&nfu, cs->req);
+    ret = nfs_symlink(cs->nfsx, cs->vol, &nfu, cs->pathname, &cs->resolvedloc,
+                      nfs3svc_symlink_cbk, cs);
+    if (ret < 0)
+        stat = nfs3_errno_to_nfsstat3(-ret);
+
+nfs3err:
+    if (ret < 0) {
+        nfs3_log_common_res(rpcsvc_request_xid(cs->req), NFS3_SYMLINK, stat,
+                            -ret, cs->resolvedloc.path);
+        nfs3_symlink_reply(cs->req, stat, NULL, NULL, NULL, NULL);
+        nfs3_call_state_wipe(cs);
+    }
+
+    return ret;
+}
+
+/* NFSv3 SYMLINK: validate the request, keep a copy of the link target and
+ * resolve (dirfh, name) before resuming in nfs3_symlink_resume(). sattr must
+ * be non-NULL but is not otherwise used here. */
+int
+nfs3_symlink(rpcsvc_request_t *req, struct nfs3_fh *dirfh, char *name,
+             char *target, sattr3 *sattr)
+{
+    xlator_t *vol = NULL;
+    nfsstat3 stat = NFS3ERR_SERVERFAULT;
+    int ret = -EFAULT;
+    struct nfs3_state *nfs3 = NULL;
+    nfs3_call_state_t *cs = NULL;
+
+    if (!req || !dirfh || !name || !target || !sattr) {
+        gf_msg(GF_NFS3, GF_LOG_ERROR, EINVAL, NFS_MSG_INVALID_ENTRY,
+               "Bad arguments");
+        return -1;
+    }
+
+    nfs3_log_symlink_call(rpcsvc_request_xid(req), dirfh, name, target);
+    nfs3_validate_gluster_fh(dirfh, stat, nfs3err);
+    nfs3_validate_nfs3_state(req, nfs3, stat, nfs3err, ret);
+    nfs3_validate_strlen_or_goto(name, NFS_NAME_MAX, nfs3err, stat, ret);
+    nfs3_map_fh_to_volume(nfs3, dirfh, req, vol, stat, nfs3err);
+    nfs3_volume_started_check(nfs3, vol, ret, out);
+    nfs3_check_rw_volaccess(nfs3, dirfh->exportid, stat, nfs3err);
+    nfs3_handle_call_state_init(nfs3, cs, req, vol, stat, nfs3err);
+
+    cs->parent = *dirfh;
+    cs->pathname = gf_strdup(target);
+    if (!cs->pathname) {
+        ret = -1;
+        stat = NFS3ERR_SERVERFAULT;
+        goto nfs3err;
+    }
+
+    ret = nfs3_fh_resolve_and_resume(cs, dirfh, name, nfs3_symlink_resume);
+    if (ret < 0)
+        stat = nfs3_errno_to_nfsstat3(-ret);
+
+nfs3err:
+    if (ret < 0) {
+        nfs3_log_common_res(rpcsvc_request_xid(req), NFS3_SYMLINK, stat, -ret,
+                            cs ? cs->resolvedloc.path : NULL);
+        nfs3_symlink_reply(req, stat, NULL, NULL, NULL, NULL);
+        nfs3_call_state_wipe(cs);
+        ret = 0; /* so that the caller does not also send an RPC reply */
+    }
+out:
+    return ret;
+}
+
+/* RPC actor for NFSv3 SYMLINK: decode the arguments and call nfs3_symlink().
+ * Decode or procedure failures mark the request; RPCSVC_ACTOR_ERROR results. */
+int
+nfs3svc_symlink(rpcsvc_request_t *req)
+{
+    char name[NFS_PATH_MAX];
+    char target[NFS_PATH_MAX];
+    struct nfs3_fh dirfh = {{0}};
+    symlink3args args;
+    int ret = RPCSVC_ACTOR_ERROR;
+
+    if (!req)
+        return ret;
+    nfs3_prep_symlink3args(&args, &dirfh, name, target);
+    if (xdr_to_symlink3args(req->msg[0], &args) <= 0) {
+        gf_msg(GF_NFS3, GF_LOG_ERROR, 0, NFS_MSG_ARGS_DECODE_ERROR,
+               "Error decoding args");
+        rpcsvc_request_seterr(req, GARBAGE_ARGS);
+        return ret;
+    }
+
+    ret = nfs3_symlink(req, &dirfh, name, target,
+                       &args.symlink.symlink_attributes);
+    if ((ret < 0) && (ret != RPCSVC_ACTOR_IGNORE)) {
+        /* Log the value nfs3_symlink() returned, not a fixed errno. */
+        gf_msg(GF_NFS3, GF_LOG_ERROR, -ret, NFS_MSG_SYMLINK_FAIL,
+               "SYMLINK procedure failed");
+        rpcsvc_request_seterr(req, SYSTEM_ERR);
+        ret = RPCSVC_ACTOR_ERROR;
+    }
+
+    return ret;
+}
+
+/* Build a MKNOD3 result from the given status, file handle and
+ * attributes and submit it as the reply to req.
+ * Always returns 0. */
+static int
+nfs3_mknod_reply(rpcsvc_request_t *req, nfsstat3 stat, struct nfs3_fh *fh,
+                 struct iatt *buf, struct iatt *preparent,
+                 struct iatt *postparent)
+{
+    mknod3res reply = {0};
+    uint64_t devid = nfs3_request_xlator_deviceid(req);
+
+    nfs3_fill_mknod3res(&reply, stat, fh, buf, preparent, postparent, devid);
+    nfs3svc_submit_reply(req, (void *)&reply,
+                         (nfs3_serializer)xdr_serialize_mknod3res);
+
+    return 0;
+}
+
+/* Callback for the setattr issued after a successful MKNOD: log the
+ * outcome, send the MKNOD3 reply with the post-setattr attributes and the
+ * saved parent attributes, then wipe the call state. */
+int32_t
+nfs3svc_mknod_setattr_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+                          int32_t op_ret, int32_t op_errno, struct iatt *preop,
+                          struct iatt *postop, dict_t *xdata)
+{
+    nfs3_call_state_t *state = frame->local;
+    nfsstat3 status = NFS3_OK;
+
+    /* A failed setattr is reported with the status mapped from op_errno. */
+    if (op_ret == -1)
+        status = nfs3_cbk_errno_status(op_ret, op_errno);
+
+    nfs3_log_newfh_res(rpcsvc_request_xid(state->req), NFS3_MKNOD, status,
+                       op_errno, &state->fh, state->resolvedloc.path);
+    nfs3_mknod_reply(state->req, status, &state->fh, postop,
+                     &state->preparent, &state->postparent);
+    nfs3_call_state_wipe(state);
+
+    return 0;
+}
+
+/* Callback for the mknod fop. Replies from here on failure or when no
+ * attributes remain to be set; otherwise winds a setattr and leaves the
+ * reply to nfs3svc_mknod_setattr_cbk(). */
+int32_t
+nfs3svc_mknod_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+                  int32_t op_ret, int32_t op_errno, inode_t *inode,
+                  struct iatt *buf, struct iatt *preparent,
+                  struct iatt *postparent, dict_t *xdata)
+{
+    nfsstat3 stat = NFS3ERR_SERVERFAULT;
+    int ret = -1;
+    nfs_user_t nfu = {0};
+    nfs3_call_state_t *cs = frame->local;
+
+    if (op_ret == -1) {
+        stat = nfs3_cbk_errno_status(op_ret, op_errno);
+        goto reply;
+    }
+
+    nfs3_fh_build_child_fh(&cs->parent, buf, &cs->fh);
+
+    /* Means no attributes were required to be set. */
+    if (!cs->setattr_valid) {
+        stat = NFS3_OK;
+        goto reply;
+    }
+
+    cs->preparent = *preparent;
+    cs->postparent = *postparent;
+    nfs_request_user_init(&nfu, cs->req);
+    ret = nfs_setattr(cs->nfsx, cs->vol, &nfu, &cs->resolvedloc, &cs->stbuf,
+                      cs->setattr_valid, nfs3svc_mknod_setattr_cbk, cs);
+    if (ret >= 0)
+        return 0;
+
+    stat = nfs3_errno_to_nfsstat3(-ret);
+
+reply:
+    nfs3_log_newfh_res(rpcsvc_request_xid(cs->req), NFS3_MKNOD, stat,
+                       op_errno, &cs->fh, cs->resolvedloc.path);
+    nfs3_mknod_reply(cs->req, stat, &cs->fh, buf, preparent, postparent);
+    nfs3_call_state_wipe(cs);
+
+    return 0;
+}
+
+/* Wind the mknod fop for a character or block device node.
+ *
+ * The device number is composed from cs->devnums and the file type bit is
+ * chosen from cs->mknodtype. A requested mode is OR-ed into the type bits
+ * and cleared from cs->setattr_valid.
+ *
+ * Returns -EFAULT for a NULL cs, else the result of nfs_mknod(). */
+static int
+nfs3_mknod_device(nfs3_call_state_t *cs)
+{
+    nfs_user_t nfu = {0};
+    dev_t devnum = 0;
+    mode_t mode = 0;
+
+    if (!cs)
+        return -EFAULT;
+
+    devnum = makedev(cs->devnums.specdata1, cs->devnums.specdata2);
+    /* Anything other than NF3CHR is treated as a block device. */
+    mode = (cs->mknodtype == NF3CHR) ? S_IFCHR : S_IFBLK;
+
+    nfs_request_user_init(&nfu, cs->req);
+    /* The mode goes out with the fop, so drop it from the pending setattr. */
+    if (gf_attr_mode_set(cs->setattr_valid)) {
+        cs->setattr_valid &= ~GF_SET_ATTR_MODE;
+        mode |= cs->mode;
+    }
+
+    return nfs_mknod(cs->nfsx, cs->vol, &nfu, &cs->resolvedloc, mode, devnum,
+                     nfs3svc_mknod_cbk, cs);
+}
+
+/* Wind the mknod fop for a node without device numbers.
+ *
+ * mode carries the file type bit chosen by the caller. A requested mode is
+ * OR-ed into it and cleared from cs->setattr_valid.
+ * Returns -EFAULT for a NULL cs, else the result of nfs_mknod(). */
+static int
+nfs3_mknod_fifo(nfs3_call_state_t *cs, mode_t mode)
+{
+    nfs_user_t nfu = {0};
+
+    if (!cs)
+        return -EFAULT;
+
+    nfs_request_user_init(&nfu, cs->req);
+    if (gf_attr_mode_set(cs->setattr_valid)) {
+        cs->setattr_valid &= ~GF_SET_ATTR_MODE;
+        mode |= cs->mode;
+    }
+
+    /* The device number argument is always 0 on this path. */
+    return nfs_mknod(cs->nfsx, cs->vol, &nfu, &cs->resolvedloc, mode, 0,
+                     nfs3svc_mknod_cbk, cs);
+}
+
+/* Resume MKNOD once (fh, name) has been resolved: dispatch on the node type
+ * and, on failure, send the error reply and wipe the call state here. */
+static int
+nfs3_mknod_resume(void *carg)
+{
+    nfsstat3 stat = NFS3ERR_SERVERFAULT;
+    int ret = -EFAULT;
+    nfs3_call_state_t *cs = carg;
+
+    if (!cs)
+        return ret;
+
+    nfs3_check_new_fh_resolve_status(cs, stat, nfs3err);
+    switch (cs->mknodtype) {
+        case NF3CHR:
+        case NF3BLK:
+            ret = nfs3_mknod_device(cs);
+            break;
+        case NF3SOCK:
+            ret = nfs3_mknod_fifo(cs, S_IFSOCK);
+            break;
+        case NF3FIFO:
+            ret = nfs3_mknod_fifo(cs, S_IFIFO);
+            break;
+        default:
+            ret = -EBADF;
+            break;
+    }
+    if (ret < 0)
+        stat = nfs3_errno_to_nfsstat3(-ret);
+
+nfs3err:
+    if (ret < 0) {
+        nfs3_log_common_res(rpcsvc_request_xid(cs->req), NFS3_MKNOD, stat, -ret,
+                            cs->resolvedloc.path);
+        nfs3_mknod_reply(cs->req, stat, NULL, NULL, NULL, NULL);
+        nfs3_call_state_wipe(cs);
+    }
+
+    return ret;
+}
+
+/* NFSv3 MKNOD: validate the request, record the node type, device numbers
+ * and requested attributes, then resolve (fh, name) and resume in
+ * nfs3_mknod_resume(). Unhandled node types are rejected with an error reply. */
+int
+nfs3_mknod(rpcsvc_request_t *req, struct nfs3_fh *fh, char *name,
+           mknoddata3 *nodedata)
+{
+    xlator_t *vol = NULL;
+    nfsstat3 stat = NFS3ERR_SERVERFAULT;
+    int ret = -EFAULT;
+    struct nfs3_state *nfs3 = NULL;
+    nfs3_call_state_t *cs = NULL;
+    sattr3 *sattr = NULL;
+
+    if (!req || !fh || !name || !nodedata) {
+        gf_msg(GF_NFS3, GF_LOG_ERROR, EINVAL, NFS_MSG_INVALID_ENTRY,
+               "Bad arguments");
+        return -1;
+    }
+
+    nfs3_log_mknod_call(rpcsvc_request_xid(req), fh, name, nodedata->type);
+    nfs3_validate_gluster_fh(fh, stat, nfs3err);
+    nfs3_validate_nfs3_state(req, nfs3, stat, nfs3err, ret);
+    nfs3_validate_strlen_or_goto(name, NFS_NAME_MAX, nfs3err, stat, ret);
+    nfs3_map_fh_to_volume(nfs3, fh, req, vol, stat, nfs3err);
+    nfs3_volume_started_check(nfs3, vol, ret, out);
+    nfs3_check_rw_volaccess(nfs3, fh->exportid, stat, nfs3err);
+    nfs3_handle_call_state_init(nfs3, cs, req, vol, stat, nfs3err);
+
+    cs->mknodtype = nodedata->type;
+    switch (nodedata->type) {
+        case NF3CHR:
+        case NF3BLK:
+            cs->devnums = nodedata->mknoddata3_u.device.spec;
+            sattr = &nodedata->mknoddata3_u.device.dev_attributes;
+            break;
+        case NF3SOCK:
+        case NF3FIFO:
+            sattr = &nodedata->mknoddata3_u.pipe_attributes;
+            break;
+        default:
+            ret = -EBADF;
+            goto nfs3err;
+    }
+    /* Whichever attribute block was selected is converted the same way. */
+    cs->setattr_valid = nfs3_sattr3_to_setattr_valid(sattr, &cs->stbuf,
+                                                     &cs->mode);
+
+    cs->parent = *fh;
+    ret = nfs3_fh_resolve_and_resume(cs, fh, name, nfs3_mknod_resume);
+    if (ret < 0)
+        stat = nfs3_errno_to_nfsstat3(-ret);
+
+nfs3err:
+    if (ret < 0) {
+        nfs3_log_common_res(rpcsvc_request_xid(req), NFS3_MKNOD, stat, -ret,
+                            cs ? cs->resolvedloc.path : NULL);
+        nfs3_mknod_reply(req, stat, NULL, NULL, NULL, NULL);
+        nfs3_call_state_wipe(cs);
+        ret = 0; /* so that the caller does not also send an RPC reply */
+    }
+out:
+    return ret;
+}
+
+/* RPC actor for NFSv3 MKNOD: decode the arguments and call nfs3_mknod().
+ * Decode or procedure failures mark the request; RPCSVC_ACTOR_ERROR results. */
+int
+nfs3svc_mknod(rpcsvc_request_t *req)
+{
+    char name[NFS_PATH_MAX];
+    struct nfs3_fh fh = {{0}};
+    mknod3args args;
+    int ret = RPCSVC_ACTOR_ERROR;
+
+    if (!req)
+        return ret;
+
+    nfs3_prep_mknod3args(&args, &fh, name);
+    if (xdr_to_mknod3args(req->msg[0], &args) <= 0) {
+        gf_msg(GF_NFS3, GF_LOG_ERROR, 0, NFS_MSG_ARGS_DECODE_ERROR,
+               "Error decoding args");
+        rpcsvc_request_seterr(req, GARBAGE_ARGS);
+        return ret;
+    }
+
+    ret = nfs3_mknod(req, &fh, name, &args.what);
+    if ((ret < 0) && (ret != RPCSVC_ACTOR_IGNORE)) {
+        gf_msg(GF_NFS3, GF_LOG_ERROR, -ret, NFS_MSG_MKNOD_FAIL,
+               "MKNOD procedure failed");
+        rpcsvc_request_seterr(req, SYSTEM_ERR);
+        ret = RPCSVC_ACTOR_ERROR;
+    }
+
+    return ret;
+}
+
+/* Build a REMOVE3 result from the given status and parent attributes and
+ * submit it as the reply to req. Always returns 0. */
+int
+nfs3_remove_reply(rpcsvc_request_t *req, nfsstat3 stat, struct iatt *preparent,
+                  struct iatt *postparent)
+{
+    remove3res reply = {0};
+    uint64_t devid = nfs3_request_xlator_deviceid(req);
+
+    nfs3_fill_remove3res(&reply, stat, preparent, postparent, devid);
+    nfs3svc_submit_reply(req, (void *)&reply,
+                         (nfs3_serializer)xdr_serialize_remove3res);
+
+    return 0;
+}
+
+/* Callback for the unlink/rmdir wound by __nfs3_remove(): map the result to
+ * an NFSv3 status, log it, send the REMOVE3 reply and wipe the call state. */
+int32_t
+nfs3svc_remove_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+                   int32_t op_ret, int32_t op_errno, struct iatt *preparent,
+                   struct iatt *postparent, dict_t *xdata)
+{
+    nfs3_call_state_t *state = frame->local;
+    nfsstat3 status = NFS3ERR_SERVERFAULT;
+
+    /* Any op_ret other than 0 or -1 keeps the SERVERFAULT default. */
+    if (op_ret == 0)
+        status = NFS3_OK;
+    else if (op_ret == -1)
+        status = nfs3_cbk_errno_status(op_ret, op_errno);
+
+    nfs3_log_common_res(rpcsvc_request_xid(state->req), NFS3_REMOVE, status,
+                        op_errno, state->resolvedloc.path);
+    nfs3_remove_reply(state->req, status, preparent, postparent);
+    nfs3_call_state_wipe(state);
+
+    return 0;
+}
+
+/* Wind the fop that removes the resolved entry: rmdir for a directory,
+ * unlink for anything else. Returns -EFAULT for a NULL cs, else the fop's
+ * result. */
+int
+__nfs3_remove(nfs3_call_state_t *cs)
+{
+    nfs_user_t nfu = {0};
+    ia_type_t type = 0;
+
+    if (!cs)
+        return -EFAULT;
+
+    type = cs->resolvedloc.inode->ia_type;
+    nfs_request_user_init(&nfu, cs->req);
+
+    if (!IA_ISDIR(type))
+        return nfs_unlink(cs->nfsx, cs->vol, &nfu, &cs->resolvedloc,
+                          nfs3svc_remove_cbk, cs);
+
+    return nfs_rmdir(cs->nfsx, cs->vol, &nfu, &cs->resolvedloc,
+                     nfs3svc_remove_cbk, cs);
+}
+
+/* Resume REMOVE once the entry has been resolved and wind the removal. On
+ * failure the error reply is sent and the call state wiped here. */
+int
+nfs3_remove_resume(void *carg)
+{
+    nfsstat3 stat = NFS3ERR_SERVERFAULT;
+    int ret = -EFAULT;
+    nfs3_call_state_t *cs = carg;
+
+    if (!cs)
+        return ret;
+
+    nfs3_check_fh_resolve_status(cs, stat, nfs3err);
+    ret = __nfs3_remove(cs);
+    if (ret >= 0)
+        return ret;
+    stat = nfs3_errno_to_nfsstat3(-ret);
+
+nfs3err:
+    nfs3_log_common_res(rpcsvc_request_xid(cs->req), NFS3_REMOVE, stat,
+                        -ret, cs->resolvedloc.path);
+    nfs3_remove_reply(cs->req, stat, NULL, NULL);
+    nfs3_call_state_wipe(cs);
+
+    return ret;
+}
+
+/* REMOVE entry point: validates the request, then resolves (fh, name) with
+ * nfs3_remove_resume() as the resume function. Returns -1 on bad arguments.
+ */
+int
+nfs3_remove(rpcsvc_request_t *req, struct nfs3_fh *fh, char *name)
+{
+    xlator_t *vol = NULL;
+    nfsstat3 stat = NFS3ERR_SERVERFAULT;
+    int ret = -EFAULT;
+    struct nfs3_state *nfs3 = NULL;
+    nfs3_call_state_t *cs = NULL;
+
+    if ((!req) || (!fh) || (!name)) {
+        gf_msg(GF_NFS3, GF_LOG_ERROR, EINVAL, NFS_MSG_INVALID_ENTRY,
+               "Bad arguments");
+        return -1;
+    }
+
+    nfs3_log_fh_entry_call(rpcsvc_request_xid(req), "REMOVE", fh, name);
+    nfs3_validate_gluster_fh(fh, stat, nfs3err);
+    nfs3_validate_nfs3_state(req, nfs3, stat, nfs3err, ret);
+    nfs3_validate_strlen_or_goto(name, NFS_NAME_MAX, nfs3err, stat, ret);
+    nfs3_map_fh_to_volume(nfs3, fh, req, vol, stat, nfs3err);
+    nfs3_volume_started_check(nfs3, vol, ret, out);
+    nfs3_check_rw_volaccess(nfs3, fh->exportid, stat, nfs3err);
+    nfs3_handle_call_state_init(nfs3, cs, req, vol, stat, nfs3err);
+
+    ret = nfs3_fh_resolve_and_resume(cs, fh, name, nfs3_remove_resume);
+    if (ret < 0)
+        stat = nfs3_errno_to_nfsstat3(-ret);
+
+nfs3err:
+    if (ret < 0) {
+        nfs3_log_common_res(rpcsvc_request_xid(req), NFS3_REMOVE, stat, -ret,
+                            cs ? cs->resolvedloc.path : NULL);
+        nfs3_remove_reply(req, stat, NULL, NULL);
+        nfs3_call_state_wipe(cs);
+        ret = 0; /* reply sent above; caller must not send another */
+    }
+out:
+    return ret;
+}
+
+/* RPC actor for REMOVE: decode remove3args from req and call nfs3_remove().
+ * Negative results other than RPCSVC_ACTOR_IGNORE become RPCSVC_ACTOR_ERROR.
+ */
+int
+nfs3svc_remove(rpcsvc_request_t *req)
+{
+    char name[NFS_PATH_MAX];
+    struct nfs3_fh fh = {{0}};
+    remove3args args;
+    int ret = RPCSVC_ACTOR_ERROR;
+
+    if (!req)
+        return ret;
+
+    nfs3_prep_remove3args(&args, &fh, name);
+    if (xdr_to_remove3args(req->msg[0], &args) <= 0) {
+        gf_msg(GF_NFS3, GF_LOG_ERROR, 0, NFS_MSG_ARGS_DECODE_ERROR,
+               "Error decoding args");
+        rpcsvc_request_seterr(req, GARBAGE_ARGS);
+        return ret;
+    }
+
+    ret = nfs3_remove(req, &fh, name);
+    if ((ret < 0) && (ret != RPCSVC_ACTOR_IGNORE)) {
+        gf_msg(GF_NFS3, GF_LOG_ERROR, 0, NFS_MSG_REMOVE_FAIL,
+               "REMOVE procedure failed");
+        rpcsvc_request_seterr(req, SYSTEM_ERR);
+        ret = RPCSVC_ACTOR_ERROR;
+    }
+    return ret;
+}
+
+/* Fill an rmdir3res from stat and the parent attributes and submit it as
+ * the reply to req. Always returns 0.
+ */
+int
+nfs3_rmdir_reply(rpcsvc_request_t *req, nfsstat3 stat, struct iatt *preparent,
+                 struct iatt *postparent)
+{
+    rmdir3res res = {0};
+    uint64_t deviceid = nfs3_request_xlator_deviceid(req);
+
+    nfs3_fill_rmdir3res(&res, stat, preparent, postparent, deviceid);
+    nfs3svc_submit_reply(req, (void *)&res,
+                         (nfs3_serializer)xdr_serialize_rmdir3res);
+    return 0;
+}
+
+/* Callback for the rmdir wound by nfs3_rmdir_resume(): warns on failure,
+ * then logs the result, sends the RMDIR reply and wipes the call state.
+ */
+int32_t
+nfs3svc_rmdir_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+                  int32_t op_ret, int32_t op_errno, struct iatt *preparent,
+                  struct iatt *postparent, dict_t *xdata)
+{
+    nfs3_call_state_t *cs = frame->local;
+    nfsstat3 stat = NFS3_OK;
+
+    if (op_ret == -1) {
+        gf_msg(GF_NFS, GF_LOG_WARNING, op_errno, NFS_MSG_RMDIR_CBK,
+               "%x: %s => -1 (%s)", rpcsvc_request_xid(cs->req),
+               cs->resolvedloc.path, strerror(op_errno));
+        stat = nfs3_cbk_errno_status(op_ret, op_errno);
+    }
+
+    nfs3_log_common_res(rpcsvc_request_xid(cs->req), NFS3_RMDIR, stat, op_errno,
+                        cs->resolvedloc.path);
+    nfs3_rmdir_reply(cs->req, stat, preparent, postparent);
+    nfs3_call_state_wipe(cs);
+
+    return 0;
+}
+
+/* Resume function passed to nfs3_fh_resolve_and_resume() by nfs3_rmdir():
+ * winds nfs_rmdir() on the resolved loc. A negative ret is replied to here.
+ */
+int
+nfs3_rmdir_resume(void *carg)
+{
+    nfs3_call_state_t *cs = carg;
+    nfsstat3 stat = NFS3ERR_SERVERFAULT;
+    int ret = -EFAULT;
+    nfs_user_t nfu = {0};
+
+    if (!cs)
+        return ret;
+
+    nfs3_check_fh_resolve_status(cs, stat, nfs3err);
+    nfs_request_user_init(&nfu, cs->req);
+    ret = nfs_rmdir(cs->nfsx, cs->vol, &nfu, &cs->resolvedloc,
+                    nfs3svc_rmdir_cbk, cs);
+    if (ret < 0)
+        stat = nfs3_errno_to_nfsstat3(-ret);
+
+nfs3err:
+    if (ret < 0) {
+        nfs3_log_common_res(rpcsvc_request_xid(cs->req), NFS3_RMDIR, stat, -ret,
+                            cs->resolvedloc.path);
+        nfs3_rmdir_reply(cs->req, stat, NULL, NULL);
+        nfs3_call_state_wipe(cs);
+    }
+
+    return ret;
+}
+
+/* RMDIR entry point: validates the request, then resolves (fh, name) with
+ * nfs3_rmdir_resume() as the resume function. Returns -1 on bad arguments.
+ */
+int
+nfs3_rmdir(rpcsvc_request_t *req, struct nfs3_fh *fh, char *name)
+{
+    xlator_t *vol = NULL;
+    nfsstat3 stat = NFS3ERR_SERVERFAULT;
+    int ret = -EFAULT;
+    struct nfs3_state *nfs3 = NULL;
+    nfs3_call_state_t *cs = NULL;
+
+    if ((!req) || (!fh) || (!name)) {
+        gf_msg(GF_NFS3, GF_LOG_ERROR, EINVAL, NFS_MSG_INVALID_ENTRY,
+               "Bad arguments");
+        return -1;
+    }
+
+    nfs3_log_fh_entry_call(rpcsvc_request_xid(req), "RMDIR", fh, name);
+    nfs3_validate_gluster_fh(fh, stat, nfs3err);
+    nfs3_validate_nfs3_state(req, nfs3, stat, nfs3err, ret);
+    nfs3_validate_strlen_or_goto(name, NFS_NAME_MAX, nfs3err, stat, ret);
+    nfs3_map_fh_to_volume(nfs3, fh, req, vol, stat, nfs3err);
+    nfs3_volume_started_check(nfs3, vol, ret, out);
+    nfs3_check_rw_volaccess(nfs3, fh->exportid, stat, nfs3err);
+    nfs3_handle_call_state_init(nfs3, cs, req, vol, stat, nfs3err);
+
+    ret = nfs3_fh_resolve_and_resume(cs, fh, name, nfs3_rmdir_resume);
+    if (ret < 0)
+        stat = nfs3_errno_to_nfsstat3(-ret);
+
+nfs3err:
+    if (ret < 0) {
+        nfs3_log_common_res(rpcsvc_request_xid(req), NFS3_RMDIR, stat, -ret,
+                            cs ? cs->resolvedloc.path : NULL);
+        nfs3_rmdir_reply(req, stat, NULL, NULL);
+        nfs3_call_state_wipe(cs);
+        ret = 0; /* reply sent above; caller must not send another */
+    }
+out:
+    return ret;
+}
+
+/* RPC actor for RMDIR: decode rmdir3args from req and call nfs3_rmdir().
+ * Negative results other than RPCSVC_ACTOR_IGNORE become RPCSVC_ACTOR_ERROR.
+ */
+int
+nfs3svc_rmdir(rpcsvc_request_t *req)
+{
+    char name[NFS_PATH_MAX];
+    struct nfs3_fh fh = {{0}};
+    rmdir3args args;
+    int ret = RPCSVC_ACTOR_ERROR;
+
+    if (!req)
+        return ret;
+
+    nfs3_prep_rmdir3args(&args, &fh, name);
+    if (xdr_to_rmdir3args(req->msg[0], &args) <= 0) {
+        gf_msg(GF_NFS3, GF_LOG_ERROR, 0, NFS_MSG_ARGS_DECODE_ERROR,
+               "Error decoding args");
+        rpcsvc_request_seterr(req, GARBAGE_ARGS);
+        return ret;
+    }
+
+    ret = nfs3_rmdir(req, &fh, name);
+    if ((ret < 0) && (ret != RPCSVC_ACTOR_IGNORE)) {
+        gf_msg(GF_NFS3, GF_LOG_ERROR, -ret, NFS_MSG_DIR_OP_FAIL,
+               "RMDIR procedure failed");
+        rpcsvc_request_seterr(req, SYSTEM_ERR);
+        ret = RPCSVC_ACTOR_ERROR;
+    }
+    return ret;
+}
+
+/* Fill a rename3res from stat, buf and the pre-/post-operation attributes
+ * of the old and new parent directories, then submit it as the reply to
+ * req. Always returns 0.
+ */
+int
+nfs3_rename_reply(rpcsvc_request_t *req, nfsstat3 stat, struct iatt *buf,
+                  struct iatt *preoldparent, struct iatt *postoldparent,
+                  struct iatt *prenewparent, struct iatt *postnewparent)
+{
+    rename3res res = {0};
+    uint64_t deviceid = nfs3_request_xlator_deviceid(req);
+
+    nfs3_fill_rename3res(&res, stat, buf, preoldparent, postoldparent,
+                         prenewparent, postnewparent, deviceid);
+    nfs3svc_submit_reply(req, (void *)&res,
+                         (nfs3_serializer)xdr_serialize_rename3res);
+
+    return 0;
+}
+
+/* Callback for the rename wound by nfs3_rename_resume_dst(): maps the fop
+ * result to an nfsstat3, sends the RENAME reply and wipes the call state.
+ */
+int32_t
+nfs3svc_rename_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+                   int32_t op_ret, int32_t op_errno, struct iatt *buf,
+                   struct iatt *preoldparent, struct iatt *postoldparent,
+                   struct iatt *prenewparent, struct iatt *postnewparent,
+                   dict_t *xdata)
+{
+    nfsstat3 stat = NFS3_OK;
+    nfs3_call_state_t *cs = frame->local;
+
+    if (op_ret == -1)
+        stat = nfs3_cbk_errno_status(op_ret, op_errno);
+
+    /* Report the fop's own errno, as the sibling callbacks do; a fixed
+     * EFAULT here would hide the real cause of a failed rename.
+     */
+    nfs3_log_common_res(rpcsvc_request_xid(cs->req), NFS3_RENAME, stat,
+                        op_errno, cs->resolvedloc.path);
+    nfs3_rename_reply(cs->req, stat, buf, preoldparent, postoldparent,
+                      prenewparent, postnewparent);
+    nfs3_call_state_wipe(cs);
+    return 0;
+}
+
+/* Resume function run once the RENAME destination has been resolved:
+ * winds nfs_rename() from cs->oploc to cs->resolvedloc.
+ */
+int
+nfs3_rename_resume_dst(void *carg)
+{
+    nfs3_call_state_t *cs = carg;
+    nfsstat3 stat = NFS3ERR_SERVERFAULT;
+    int ret = -EFAULT;
+    nfs_user_t nfu = {0};
+
+    if (!cs)
+        return ret;
+
+    nfs3_check_new_fh_resolve_status(cs, stat, nfs3err);
+    cs->parent = cs->resolvefh;
+    nfs_request_user_init(&nfu, cs->req);
+    ret = nfs_rename(cs->nfsx, cs->vol, &nfu, &cs->oploc, &cs->resolvedloc,
+                     nfs3svc_rename_cbk, cs);
+    if (ret < 0)
+        stat = nfs3_errno_to_nfsstat3(-ret);
+
+nfs3err:
+    if (ret < 0) {
+        nfs3_log_common_res(rpcsvc_request_xid(cs->req), NFS3_RENAME, stat,
+                            -ret, cs->resolvedloc.path);
+        nfs3_rename_reply(cs->req, stat, NULL, NULL, NULL, NULL, NULL);
+        nfs3_call_state_wipe(cs);
+    }
+
+    return ret;
+}
+
+/* Resume function run once the RENAME source is resolved: parks its loc in
+ * cs->oploc, then resolves the destination via nfs3_rename_resume_dst().
+ */
+int
+nfs3_rename_resume_src(void *carg)
+{
+    nfs3_call_state_t *cs = carg;
+    nfsstat3 stat = NFS3ERR_SERVERFAULT;
+    int ret = -EFAULT;
+
+    if (!cs)
+        return ret;
+
+    nfs3_check_fh_resolve_status(cs, stat, nfs3err);
+    /* Keep the source loc safe in oploc while the dest loc is resolved. */
+    nfs_loc_copy(&cs->oploc, &cs->resolvedloc);
+    nfs_loc_wipe(&cs->resolvedloc);
+    GF_FREE(cs->resolventry);
+    cs->resolventry = NULL; /* no dangling pointer if the resolve bails */
+
+    ret = nfs3_fh_resolve_and_resume(cs, &cs->fh, cs->pathname,
+                                     nfs3_rename_resume_dst);
+    if (ret < 0)
+        stat = nfs3_errno_to_nfsstat3(-ret);
+
+nfs3err:
+    if (ret < 0) {
+        nfs3_log_common_res(rpcsvc_request_xid(cs->req), NFS3_RENAME, stat,
+                            -ret, cs->resolvedloc.path);
+        nfs3_rename_reply(cs->req, stat, NULL, NULL, NULL, NULL, NULL);
+        nfs3_call_state_wipe(cs);
+    }
+    return ret;
+}
+
+/* RENAME entry point: validates the request, saves the destination pair in
+ * cs, then resolves the source with nfs3_rename_resume_src() to resume.
+ */
+int
+nfs3_rename(rpcsvc_request_t *req, struct nfs3_fh *olddirfh, char *oldname,
+            struct nfs3_fh *newdirfh, char *newname)
+{
+    xlator_t *vol = NULL;
+    nfsstat3 stat = NFS3ERR_SERVERFAULT;
+    int ret = -EFAULT;
+    struct nfs3_state *nfs3 = NULL;
+    nfs3_call_state_t *cs = NULL;
+
+    if ((!req) || (!olddirfh) || (!oldname) || (!newdirfh) || (!newname)) {
+        gf_msg(GF_NFS3, GF_LOG_ERROR, EINVAL, NFS_MSG_INVALID_ENTRY,
+               "Bad arguments");
+        return -1;
+    }
+
+    nfs3_log_rename_call(rpcsvc_request_xid(req), olddirfh, oldname, newdirfh,
+                         newname);
+    nfs3_validate_gluster_fh(olddirfh, stat, nfs3err);
+    nfs3_validate_gluster_fh(newdirfh, stat, nfs3err);
+    nfs3_validate_nfs3_state(req, nfs3, stat, nfs3err, ret);
+    nfs3_validate_strlen_or_goto(oldname, NFS_NAME_MAX, nfs3err, stat, ret);
+    nfs3_validate_strlen_or_goto(newname, NFS_NAME_MAX, nfs3err, stat, ret);
+    nfs3_map_fh_to_volume(nfs3, olddirfh, req, vol, stat, nfs3err);
+    nfs3_volume_started_check(nfs3, vol, ret, out);
+    nfs3_check_rw_volaccess(nfs3, olddirfh->exportid, stat, nfs3err);
+    nfs3_handle_call_state_init(nfs3, cs, req, vol, stat, nfs3err);
+
+    /* While we resolve the source (fh, name) pair, we need to keep a copy
+     * of the dest (fh,name) pair.
+     */
+    cs->fh = *newdirfh;
+    cs->pathname = gf_strdup(newname);
+    if (!cs->pathname) {
+        stat = NFS3ERR_SERVERFAULT;
+        ret = -1;
+        goto nfs3err;
+    }
+
+    ret = nfs3_fh_resolve_and_resume(cs, olddirfh, oldname,
+                                     nfs3_rename_resume_src);
+    if (ret < 0)
+        stat = nfs3_errno_to_nfsstat3(-ret);
+
+nfs3err:
+    if (ret < 0) {
+        nfs3_log_common_res(rpcsvc_request_xid(req), NFS3_RENAME, stat, -ret,
+                            cs ? cs->resolvedloc.path : NULL);
+        nfs3_rename_reply(req, stat, NULL, NULL, NULL, NULL, NULL);
+        nfs3_call_state_wipe(cs);
+        ret = 0; /* reply sent above; caller must not send another */
+    }
+out:
+    return ret;
+}
+
+/* RPC actor for RENAME: decode rename3args (old dir handle and name, new
+ * dir handle and name) from req and call nfs3_rename(). Negative results
+ * other than RPCSVC_ACTOR_IGNORE become RPCSVC_ACTOR_ERROR.
+ */
+int
+nfs3svc_rename(rpcsvc_request_t *req)
+{
+    char newname[NFS_PATH_MAX];
+    char oldname[NFS_PATH_MAX];
+    struct nfs3_fh olddirfh = {{0}};
+    struct nfs3_fh newdirfh = {{0}};
+    rename3args args;
+    int ret = RPCSVC_ACTOR_ERROR;
+
+    if (!req)
+        return ret;
+
+    nfs3_prep_rename3args(&args, &olddirfh, oldname, &newdirfh, newname);
+    if (xdr_to_rename3args(req->msg[0], &args) <= 0) {
+        gf_msg(GF_NFS3, GF_LOG_ERROR, 0, NFS_MSG_ARGS_DECODE_ERROR,
+               "Error decoding args");
+        rpcsvc_request_seterr(req, GARBAGE_ARGS);
+        return ret;
+    }
+
+    ret = nfs3_rename(req, &olddirfh, oldname, &newdirfh, newname);
+    if ((ret < 0) && (ret != RPCSVC_ACTOR_IGNORE)) {
+        gf_msg(GF_NFS3, GF_LOG_ERROR, 0, NFS_MSG_RENAME_FAIL,
+               "RENAME procedure failed");
+        rpcsvc_request_seterr(req, SYSTEM_ERR);
+        ret = RPCSVC_ACTOR_ERROR;
+    }
+
+    return ret;
+}
+
+/* Fill a link3res from stat, buf and the parent attributes and submit it
+ * as the reply to req. Always returns 0.
+ */
+int
+nfs3_link_reply(rpcsvc_request_t *req, nfsstat3 stat, struct iatt *buf,
+                struct iatt *preparent, struct iatt *postparent)
+{
+    link3res res = {0};
+    uint64_t deviceid = nfs3_request_xlator_deviceid(req);
+
+    nfs3_fill_link3res(&res, stat, buf, preparent, postparent, deviceid);
+    nfs3svc_submit_reply(req, (void *)&res,
+                         (nfs3_serializer)xdr_serialize_link3res);
+
+    return 0;
+}
+
+/* Callback for the link wound by nfs3_link_resume_lnk(): maps the fop
+ * result to an nfsstat3, sends the LINK reply and wipes the call state.
+ */
+int32_t
+nfs3svc_link_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+                 int32_t op_ret, int32_t op_errno, inode_t *inode,
+                 struct iatt *buf, struct iatt *preparent,
+                 struct iatt *postparent, dict_t *xdata)
+{
+    nfs3_call_state_t *cs = frame->local;
+    nfsstat3 stat = NFS3_OK;
+
+    if (op_ret == -1)
+        stat = nfs3_cbk_errno_status(op_ret, op_errno);
+
+    nfs3_log_common_res(rpcsvc_request_xid(cs->req), NFS3_LINK, stat, op_errno,
+                        cs->resolvedloc.path);
+    nfs3_link_reply(cs->req, stat, buf, preparent, postparent);
+    nfs3_call_state_wipe(cs);
+
+    return 0;
+}
+
+/* Resume function run once the new link's (dirfh, name) has been resolved:
+ * winds nfs_link() from cs->oploc (the target) to cs->resolvedloc.
+ */
+int
+nfs3_link_resume_lnk(void *carg)
+{
+    nfs3_call_state_t *cs = carg;
+    nfsstat3 stat = NFS3ERR_SERVERFAULT;
+    int ret = -EFAULT;
+    nfs_user_t nfu = {0};
+
+    if (!cs)
+        return ret;
+
+    nfs3_check_new_fh_resolve_status(cs, stat, nfs3err);
+    nfs_request_user_init(&nfu, cs->req);
+    ret = nfs_link(cs->nfsx, cs->vol, &nfu, &cs->oploc, &cs->resolvedloc,
+                   nfs3svc_link_cbk, cs);
+    if (ret < 0)
+        stat = nfs3_errno_to_nfsstat3(-ret);
+
+nfs3err:
+    if (ret < 0) {
+        nfs3_log_common_res(rpcsvc_request_xid(cs->req), NFS3_LINK, stat, -ret,
+                            cs->resolvedloc.path);
+        nfs3_link_reply(cs->req, stat, NULL, NULL, NULL);
+        nfs3_call_state_wipe(cs);
+    }
+
+    return ret;
+}
+
+/* Resume function run once the LINK target handle is resolved: parks its
+ * loc in cs->oploc, then resolves (cs->fh, cs->pathname) for the new name.
+ */
+int
+nfs3_link_resume_tgt(void *carg)
+{
+    nfs3_call_state_t *cs = carg;
+    nfsstat3 stat = NFS3ERR_SERVERFAULT;
+    int ret = -EFAULT;
+
+    if (!cs)
+        return ret;
+
+    nfs3_check_fh_resolve_status(cs, stat, nfs3err);
+    nfs_loc_copy(&cs->oploc, &cs->resolvedloc);
+    nfs_loc_wipe(&cs->resolvedloc);
+    ret = nfs3_fh_resolve_and_resume(cs, &cs->fh, cs->pathname,
+                                     nfs3_link_resume_lnk);
+    if (ret < 0)
+        stat = nfs3_errno_to_nfsstat3(-ret);
+
+nfs3err:
+    if (ret < 0) {
+        nfs3_log_common_res(rpcsvc_request_xid(cs->req), NFS3_LINK, stat, -ret,
+                            cs->resolvedloc.path);
+        nfs3_link_reply(cs->req, stat, NULL, NULL, NULL);
+        nfs3_call_state_wipe(cs);
+    }
+    return ret;
+}
+
+/* LINK entry point: validates the request, stashes (dirfh, newname) in cs,
+ * then resolves targetfh with nfs3_link_resume_tgt() as the resume function.
+ */
+int
+nfs3_link(rpcsvc_request_t *req, struct nfs3_fh *targetfh,
+          struct nfs3_fh *dirfh, char *newname)
+{
+    xlator_t *vol = NULL;
+    nfsstat3 stat = NFS3ERR_SERVERFAULT;
+    int ret = -EFAULT;
+    struct nfs3_state *nfs3 = NULL;
+    nfs3_call_state_t *cs = NULL;
+
+    if ((!req) || (!targetfh) || (!dirfh) || (!newname)) {
+        gf_msg(GF_NFS3, GF_LOG_ERROR, EINVAL, NFS_MSG_INVALID_ENTRY,
+               "Bad arguments");
+        return -1;
+    }
+
+    nfs3_validate_gluster_fh(dirfh, stat, nfs3err);
+    nfs3_validate_gluster_fh(targetfh, stat, nfs3err);
+    nfs3_validate_nfs3_state(req, nfs3, stat, nfs3err, ret);
+    nfs3_validate_strlen_or_goto(newname, NFS_NAME_MAX, nfs3err, stat, ret);
+    nfs3_map_fh_to_volume(nfs3, dirfh, req, vol, stat, nfs3err);
+    nfs3_volume_started_check(nfs3, vol, ret, out);
+    nfs3_check_rw_volaccess(nfs3, dirfh->exportid, stat, nfs3err);
+    nfs3_handle_call_state_init(nfs3, cs, req, vol, stat, nfs3err);
+
+    cs->fh = *dirfh;
+    cs->pathname = gf_strdup(newname);
+    if (!cs->pathname) {
+        stat = NFS3ERR_SERVERFAULT;
+        ret = -1;
+        goto nfs3err;
+    }
+
+    ret = nfs3_fh_resolve_and_resume(cs, targetfh, NULL, nfs3_link_resume_tgt);
+    if (ret < 0)
+        stat = nfs3_errno_to_nfsstat3(-ret);
+
+nfs3err:
+    if (ret < 0) {
+        nfs3_log_common_res(rpcsvc_request_xid(req), NFS3_LINK, stat, -ret,
+                            cs ? cs->pathname : NULL);
+        nfs3_link_reply(req, stat, NULL, NULL, NULL);
+        nfs3_call_state_wipe(cs);
+        ret = 0; /* reply sent above; caller must not send another */
+    }
+out:
+    return ret;
+}
+
+/* RPC actor for LINK: decode link3args (target handle, dir handle, new
+ * name) from req and call nfs3_link(). Negative results other than
+ * RPCSVC_ACTOR_IGNORE become RPCSVC_ACTOR_ERROR.
+ */
+int
+nfs3svc_link(rpcsvc_request_t *req)
+{
+    char newpath[NFS_PATH_MAX];
+    struct nfs3_fh dirfh = {{0}};
+    struct nfs3_fh targetfh = {{0}};
+    link3args args;
+    int ret = RPCSVC_ACTOR_ERROR;
+
+    if (!req)
+        return ret;
+
+    nfs3_prep_link3args(&args, &targetfh, &dirfh, newpath);
+    if (xdr_to_link3args(req->msg[0], &args) <= 0) {
+        gf_msg(GF_NFS3, GF_LOG_ERROR, 0, NFS_MSG_ARGS_DECODE_ERROR,
+               "Error decoding args");
+        rpcsvc_request_seterr(req, GARBAGE_ARGS);
+        return ret;
+    }
+
+    ret = nfs3_link(req, &targetfh, &dirfh, newpath);
+    if ((ret < 0) && (ret != RPCSVC_ACTOR_IGNORE)) {
+        /* NOTE(review): errno is hard-coded to EXDEV here -- confirm. */
+        gf_msg(GF_NFS3, GF_LOG_ERROR, EXDEV, NFS_MSG_LINK_FAIL,
+               "LINK procedure failed");
+        rpcsvc_request_seterr(req, SYSTEM_ERR);
+        ret = RPCSVC_ACTOR_ERROR;
+    }
+    return ret;
+}
+
+/* Fill a readdirp3res from the arguments, submit it as the reply to req,
+ * then release it with nfs3_free_readdirp3res(). Always returns 0.
+ */
+int
+nfs3_readdirp_reply(rpcsvc_request_t *req, nfsstat3 stat, struct nfs3_fh *dirfh,
+                    uint64_t cverf, struct iatt *dirstat, gf_dirent_t *entries,
+                    count3 dircount, count3 maxcount, int is_eof)
+{
+    readdirp3res res = {0};
+    uint64_t deviceid = nfs3_request_xlator_deviceid(req);
+
+    nfs3_fill_readdirp3res(&res, stat, dirfh, cverf, dirstat, entries, dircount,
+                           maxcount, is_eof, deviceid);
+    nfs3svc_submit_reply(req, (void *)&res,
+                         (nfs3_serializer)xdr_serialize_readdirp3res);
+    nfs3_free_readdirp3res(&res);
+
+    return 0;
+}
+
+/* Fill a readdir3res from the arguments, submit it as the reply to req,
+ * then release it with nfs3_free_readdir3res(). Always returns 0.
+ */
+int
+nfs3_readdir_reply(rpcsvc_request_t *req, nfsstat3 stat, struct nfs3_fh *dirfh,
+                   uint64_t cverf, struct iatt *dirstat, gf_dirent_t *entries,
+                   count3 count, int is_eof)
+{
+    readdir3res res = {0};
+    uint64_t deviceid = nfs3_request_xlator_deviceid(req);
+
+    nfs3_fill_readdir3res(&res, stat, dirfh, cverf, dirstat, entries, count,
+                          is_eof, deviceid);
+    nfs3svc_submit_reply(req, (void *)&res,
+                         (nfs3_serializer)xdr_serialize_readdir3res);
+    nfs3_free_readdir3res(&res);
+
+    return 0;
+}
+
+/* Callback for the fstat wound by nfs3svc_readdir_cbk(); buf is passed on
+ * as the directory attributes. Sends the READDIR reply when cs->maxcount is
+ * 0 and the READDIRP reply otherwise, flagging EOF when the readdirp errno
+ * saved in cs->operrno is ENOENT, then wipes the call state.
+ */
+int32_t
+nfs3svc_readdir_fstat_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+                          int32_t op_ret, int32_t op_errno, struct iatt *buf,
+                          dict_t *xdata)
+{
+    nfs3_call_state_t *cs = frame->local;
+    nfsstat3 stat = NFS3ERR_SERVERFAULT;
+    int is_eof = 0;
+
+    if (op_ret == -1) {
+        stat = nfs3_cbk_errno_status(op_ret, op_errno);
+        goto nfs3err;
+    }
+
+    /* Check whether we encountered an end of directory stream while
+     * readdir'ing.
+     */
+    if (cs->operrno == ENOENT) {
+        gf_msg_trace(GF_NFS3, 0, "Reached end-of-directory");
+        is_eof = 1;
+    }
+
+    stat = NFS3_OK;
+
+    /* do inode linking here */
+    gf_link_inodes_from_dirent(this, cs->fd->inode, &cs->entries);
+
+nfs3err:
+    if (cs->maxcount == 0) {
+        nfs3_log_readdir_res(rpcsvc_request_xid(cs->req), stat, op_errno,
+                             (uintptr_t)cs->fd, cs->dircount, is_eof,
+                             cs->resolvedloc.path);
+        nfs3_readdir_reply(cs->req, stat, &cs->parent, (uintptr_t)cs->fd, buf,
+                           &cs->entries, cs->dircount, is_eof);
+    } else {
+        nfs3_log_readdirp_res(rpcsvc_request_xid(cs->req), stat, op_errno,
+                              (uintptr_t)cs->fd, cs->dircount, cs->maxcount,
+                              is_eof, cs->resolvedloc.path);
+        nfs3_readdirp_reply(cs->req, stat, &cs->parent, (uintptr_t)cs->fd, buf,
+                            &cs->entries, cs->dircount, cs->maxcount, is_eof);
+    }
+
+    nfs3_call_state_wipe(cs);
+    return 0;
+}
+
+/* Callback for the readdirp wound by nfs3_readdir_process(). On success it
+ * splices the entries into cs and winds an fstat whose callback replies;
+ * on failure it sends the READDIR/READDIRP error reply and wipes cs.
+ */
+int32_t
+nfs3svc_readdir_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+                    int32_t op_ret, int32_t op_errno, gf_dirent_t *entries,
+                    dict_t *xdata)
+{
+    nfs3_call_state_t *cs = frame->local;
+    nfsstat3 stat = NFS3ERR_SERVERFAULT;
+    int ret = -EFAULT;
+    nfs_user_t nfu = {0};
+
+    if (op_ret == -1) {
+        stat = nfs3_cbk_errno_status(op_ret, op_errno);
+        goto err;
+    }
+
+    cs->operrno = op_errno;
+    list_splice_init(&entries->list, &cs->entries.list);
+    nfs_request_user_init(&nfu, cs->req);
+    ret = nfs_fstat(cs->nfsx, cs->vol, &nfu, cs->fd, nfs3svc_readdir_fstat_cbk,
+                    cs);
+    if (ret < 0) {
+        op_ret = -1;
+        stat = nfs3_errno_to_nfsstat3(-ret);
+        op_errno = -ret;
+    }
+
+err:
+    if (op_ret >= 0)
+        return 0;
+
+    if (cs->maxcount == 0) {
+        nfs3_log_common_res(rpcsvc_request_xid(cs->req), NFS3_READDIR, stat,
+                            op_errno, cs->resolvedloc.path);
+        nfs3_readdir_reply(cs->req, stat, NULL, 0, NULL, NULL, 0, 0);
+    } else {
+        nfs3_log_common_res(rpcsvc_request_xid(cs->req), NFS3_READDIRP, stat,
+                            op_errno, cs->resolvedloc.path);
+        nfs3_readdirp_reply(cs->req, stat, NULL, 0, NULL, NULL, 0, 0, 0);
+    }
+
+    /* For directories, we force a purge from the fd cache on close
+     * so that next time the dir is read, we'll get any changed directory
+     * entries.
+     */
+    nfs3_call_state_wipe(cs);
+    return 0;
+}
+
+/* Wind nfs_readdirp() on cs->fd, passing cs->dircount and cs->cookie;
+ * nfs3svc_readdir_cbk() receives the result. Returns -EFAULT for a NULL
+ * cs, otherwise whatever nfs_readdirp() returned.
+ */
+int
+nfs3_readdir_process(nfs3_call_state_t *cs)
+{
+    nfs_user_t nfu = {0};
+
+    if (!cs)
+        return -EFAULT;
+
+    nfs_request_user_init(&nfu, cs->req);
+    return nfs_readdirp(cs->nfsx, cs->vol, &nfu, cs->fd, cs->dircount,
+                        cs->cookie, nfs3svc_readdir_cbk, cs);
+}
+
+/* Verify the directory cookie, then start the read. Errors are replied to
+ * here, so 0 is returned for any non-NULL carg; NULL carg yields -EFAULT.
+ */
+int
+nfs3_readdir_read_resume(void *carg)
+{
+    nfs3_call_state_t *cs = carg;
+    nfsstat3 stat = NFS3ERR_SERVERFAULT;
+    int ret = -EFAULT;
+    struct nfs3_state *nfs3 = NULL;
+
+    if (!cs)
+        return ret;
+    nfs3_check_fh_resolve_status(cs, stat, nfs3err);
+    nfs3 = rpcsvc_request_program_private(cs->req);
+    ret = nfs3_verify_dircookie(nfs3, cs->fd, cs->cookie, cs->cookieverf,
+                                &stat);
+    if (ret < 0) /* Stat already set by verifier function above. */
+        goto nfs3err;
+
+    ret = nfs3_readdir_process(cs);
+    if (ret < 0)
+        stat = nfs3_errno_to_nfsstat3(-ret);
+nfs3err:
+    if (ret < 0) {
+        if (cs->maxcount == 0) {
+            nfs3_log_common_res(rpcsvc_request_xid(cs->req), NFS3_READDIR, stat,
+                                -ret, cs->resolvedloc.path);
+            nfs3_readdir_reply(cs->req, stat, NULL, 0, NULL, NULL, 0, 0);
+        } else {
+            nfs3_log_common_res(rpcsvc_request_xid(cs->req), NFS3_READDIRP,
+                                stat, -ret, cs->resolvedloc.path);
+            nfs3_readdirp_reply(cs->req, stat, NULL, 0, NULL, NULL, 0, 0, 0);
+        }
+        nfs3_call_state_wipe(cs);
+    }
+    return 0;
+}
+
+/* Callback for the opendir that nfs3_readdir_open_resume() forces at the
+ * start of a directory read. The outcome is not needed; the only work is
+ * to drop the reference on the fd handed back, if there is one.
+ */
+int32_t
+nfs3svc_readdir_opendir_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+                            int32_t op_ret, int32_t op_errno, fd_t *fd,
+                            dict_t *xdata)
+{
+    /* op_ret and op_errno are not inspected. */
+    if (fd)
+        fd_unref(fd);
+
+    return 0;
+}
+
+/* Resume function run once the directory handle is resolved: takes an
+ * anonymous fd, forces an opendir at cookie 0, then starts the read.
+ */
+int
+nfs3_readdir_open_resume(void *carg)
+{
+    nfs3_call_state_t *cs = carg;
+    nfsstat3 stat = NFS3ERR_SERVERFAULT;
+    int ret = -EFAULT;
+    nfs_user_t nfu = {0};
+
+    if (!cs)
+        return ret;
+
+    nfs3_check_fh_resolve_status(cs, stat, nfs3err);
+    cs->fd = fd_anonymous(cs->resolvedloc.inode);
+    if (!cs->fd) {
+        gf_msg(GF_NFS3, GF_LOG_ERROR, 0, NFS_MSG_ANONYMOUS_FD_FAIL,
+               "Fail to create anonymous fd");
+        goto nfs3err;
+    }
+
+    /*
+     * NFS client will usually send us a readdirp without an opendir,
+     * which would cause us to skip our usual self-heal checks which occur
+     * in opendir for native protocol. To make sure those checks do happen,
+     * our most reliable option is to do our own opendir for any readdirp
+     * at the beginning of the directory.
+     */
+    if (cs->cookie == 0) {
+        nfs_request_user_init(&nfu, cs->req);
+        ret = nfs_opendir(cs->nfsx, cs->vol, &nfu, &cs->resolvedloc,
+                          nfs3svc_readdir_opendir_cbk, cs);
+        if (ret < 0) {
+            gf_msg(GF_NFS3, GF_LOG_ERROR, -ret, NFS_MSG_DIR_OP_FAIL,
+                   "auto-opendir failed");
+        }
+    }
+
+    ret = nfs3_readdir_read_resume(cs);
+    if (ret < 0)
+        stat = nfs3_errno_to_nfsstat3(-ret);
+
+nfs3err:
+    if (ret < 0) {
+        if (cs->maxcount == 0) {
+            nfs3_log_common_res(rpcsvc_request_xid(cs->req), NFS3_READDIR, stat,
+                                -ret, cs->resolvedloc.path);
+            nfs3_readdir_reply(cs->req, stat, NULL, 0, NULL, NULL, 0, 0);
+        } else {
+            nfs3_log_common_res(rpcsvc_request_xid(cs->req), NFS3_READDIRP,
+                                stat, -ret, cs->resolvedloc.path);
+            nfs3_readdirp_reply(cs->req, stat, NULL, 0, NULL, NULL, 0, 0, 0);
+        }
+        nfs3_call_state_wipe(cs);
+    }
+
+    return ret;
+}
+
+/* Shared READDIR/READDIRP entry point: a non-zero maxcount means READDIRP,
+ * which is refused with -ENOTSUP when nfs->rdirplus is unset.
+ */
+int
+nfs3_readdir(rpcsvc_request_t *req, struct nfs3_fh *fh, cookie3 cookie,
+             uint64_t cverf, count3 dircount, count3 maxcount)
+{
+    xlator_t *vol = NULL;
+    nfsstat3 stat = NFS3ERR_SERVERFAULT;
+    int ret = -EFAULT;
+    struct nfs3_state *nfs3 = NULL;
+    nfs3_call_state_t *cs = NULL;
+    struct nfs_state *nfs = NULL;
+    gf_boolean_t is_readdirp = !!maxcount;
+
+    if ((!req) || (!fh)) {
+        gf_msg(GF_NFS3, GF_LOG_ERROR, EINVAL, NFS_MSG_INVALID_ENTRY,
+               "Bad arguments");
+        return -1;
+    }
+
+    nfs3_log_readdir_call(rpcsvc_request_xid(req), fh, dircount, maxcount);
+    nfs3_validate_gluster_fh(fh, stat, nfs3err);
+    nfs3_validate_nfs3_state(req, nfs3, stat, nfs3err, ret);
+    nfs3_map_fh_to_volume(nfs3, fh, req, vol, stat, nfs3err);
+    nfs3_volume_started_check(nfs3, vol, ret, out);
+    nfs3_handle_call_state_init(nfs3, cs, req, vol, stat, nfs3err);
+    nfs = nfs_state(nfs3->nfsx);
+
+    if (is_readdirp && !nfs->rdirplus) {
+        ret = -ENOTSUP;
+        stat = nfs3_errno_to_nfsstat3(-ret);
+        goto nfs3err;
+    }
+
+    cs->cookieverf = cverf;
+    cs->dircount = dircount;
+    cs->maxcount = maxcount;
+    cs->cookie = cookie;
+    cs->parent = *fh;
+    ret = nfs3_fh_resolve_and_resume(cs, fh, NULL, nfs3_readdir_open_resume);
+    if (ret < 0)
+        stat = nfs3_errno_to_nfsstat3(-ret);
+
+nfs3err:
+    if (ret < 0) {
+        if (!is_readdirp) {
+            nfs3_log_common_res(rpcsvc_request_xid(req), NFS3_READDIR, stat,
+                                -ret, cs ? cs->resolvedloc.path : NULL);
+            nfs3_readdir_reply(req, stat, NULL, 0, NULL, NULL, 0, 0);
+        } else {
+            nfs3_log_common_res(rpcsvc_request_xid(req), NFS3_READDIRP, stat,
+                                -ret, cs ? cs->resolvedloc.path : NULL);
+            nfs3_readdirp_reply(req, stat, NULL, 0, NULL, NULL, 0, 0, 0);
+        }
+        ret = 0; /* reply sent above; caller must not send another */
+        nfs3_call_state_wipe(cs);
+    }
+out:
+    return ret;
+}
+
+/* RPC actor for READDIR: decodes readdir3args and calls nfs3_readdir() with
+ * maxcount 0. Returns RPCSVC_ACTOR_ERROR on decode or procedure failure.
+ */
+int
+nfs3svc_readdir(rpcsvc_request_t *req)
+{
+    readdir3args ra;
+    struct nfs3_fh fh = {{0}};
+    int ret = RPCSVC_ACTOR_ERROR;
+    uint64_t verf = 0;
+
+    if (!req)
+        return ret;
+    nfs3_prep_readdir3args(&ra, &fh);
+    if (xdr_to_readdir3args(req->msg[0], &ra) <= 0) {
+        gf_msg(GF_NFS3, GF_LOG_ERROR, 0, NFS_MSG_ARGS_DECODE_ERROR,
+               "Error decoding args");
+        rpcsvc_request_seterr(req, GARBAGE_ARGS);
+        return ret;
+    }
+
+    /* Copy the verifier bytes rather than reading them through a casted
+     * uint64_t pointer, which may be misaligned and breaks strict aliasing.
+     */
+    memcpy(&verf, ra.cookieverf, sizeof(verf));
+    ret = nfs3_readdir(req, &fh, ra.cookie, verf, ra.count, 0);
+    if ((ret < 0) && (ret != RPCSVC_ACTOR_IGNORE)) {
+        gf_msg(GF_NFS3, GF_LOG_ERROR, -ret, NFS_MSG_READDIR_FAIL,
+               "READDIR procedure failed");
+        rpcsvc_request_seterr(req, SYSTEM_ERR);
+        ret = RPCSVC_ACTOR_ERROR;
+    }
+    return ret;
+}
+
+/* RPC actor for READDIRP: decodes readdirp3args and calls nfs3_readdir()
+ * with ra.maxcount. Returns RPCSVC_ACTOR_ERROR on decode/procedure failure.
+ */
+int
+nfs3svc_readdirp(rpcsvc_request_t *req)
+{
+    readdirp3args ra;
+    struct nfs3_fh fh = {{0}};
+    int ret = RPCSVC_ACTOR_ERROR;
+    uint64_t cverf = 0;
+
+    if (!req)
+        return ret;
+    nfs3_prep_readdirp3args(&ra, &fh);
+    if (xdr_to_readdirp3args(req->msg[0], &ra) <= 0) {
+        gf_msg(GF_NFS3, GF_LOG_ERROR, 0, NFS_MSG_ARGS_DECODE_ERROR,
+               "Error decoding args");
+        rpcsvc_request_seterr(req, GARBAGE_ARGS);
+        return ret;
+    }
+
+    /* Copy the verifier bytes rather than reading them through a casted
+     * uint64_t pointer, which may be misaligned and breaks strict aliasing.
+     */
+    memcpy(&cverf, ra.cookieverf, sizeof(cverf));
+    ret = nfs3_readdir(req, &fh, ra.cookie, cverf, ra.dircount, ra.maxcount);
+    if ((ret < 0) && (ret != RPCSVC_ACTOR_IGNORE)) {
+        gf_msg(GF_NFS3, GF_LOG_ERROR, -ret, NFS_MSG_READDIRP_FAIL,
+               "READDIRP procedure failed");
+        rpcsvc_request_seterr(req, SYSTEM_ERR);
+        ret = RPCSVC_ACTOR_ERROR;
+    }
+    return ret;
+}
+
+/* Fill an fsstat3res from stat, fsbuf and postbuf and submit it as the
+ * reply to req. Returns the result of nfs3svc_submit_reply().
+ */
+int
+nfs3_fsstat_reply(rpcsvc_request_t *req, nfsstat3 stat, struct statvfs *fsbuf,
+                  struct iatt *postbuf)
+{
+    fsstat3res res = {0};
+    uint64_t deviceid = nfs3_request_xlator_deviceid(req);
+
+    nfs3_fill_fsstat3res(&res, stat, fsbuf, postbuf, deviceid);
+    return nfs3svc_submit_reply(req, &res,
+                                (nfs3_serializer)xdr_serialize_fsstat3res);
+}
+
+/*
+ * Callback for the stat issued by nfs3_fsstat_statfs_cbk().
+ *
+ * Translates the fop result into an NFSv3 status, logs it, sends the FSSTAT
+ * reply using the statvfs data saved in the call state together with the
+ * attributes in buf, and finally wipes the call state. Always returns 0.
+ */
+int32_t
+nfs3_fsstat_stat_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+                     int32_t op_ret, int32_t op_errno, struct iatt *buf,
+                     dict_t *xdata)
+{
+    nfs3_call_state_t *cs = frame->local;
+    nfsstat3 stat = NFS3_OK;
+
+    if (op_ret == -1)
+        stat = nfs3_cbk_errno_status(op_ret, op_errno);
+
+    nfs3_log_common_res(rpcsvc_request_xid(cs->req), NFS3_FSSTAT, stat,
+                        op_errno, cs->resolvedloc.path);
+    nfs3_fsstat_reply(cs->req, stat, &cs->fsstat, buf);
+    nfs3_call_state_wipe(cs);
+
+    return 0;
+}
+
+/*
+ * Callback for the statfs issued by nfs3_fsstat_resume().
+ *
+ * On success the statvfs result is saved in the call state and a stat on the
+ * resolved location is started so that the post-op attributes can be filled
+ * in; nfs3_fsstat_stat_cbk() then sends the reply. When the statfs failed or
+ * the stat could not be started (ret negative), the error reply is sent from
+ * here and the call state is wiped. Always returns 0.
+ */
+int32_t
+nfs3_fsstat_statfs_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+                       int32_t op_ret, int32_t op_errno, struct statvfs *buf,
+                       dict_t *xdata)
+{
+    nfs3_call_state_t *cs = frame->local;
+    nfsstat3 stat = NFS3ERR_SERVERFAULT;
+    nfs_user_t nfu = {
+        0,
+    };
+    int ret = -EFAULT;
+
+    if (op_ret == -1) {
+        ret = -op_errno;
+        stat = nfs3_cbk_errno_status(op_ret, op_errno);
+    } else {
+        /* Keep the statvfs data, then fetch the attributes of the resolved
+         * location for the post_op_attr part of the reply.
+         */
+        cs->fsstat = *buf;
+        nfs_request_user_init(&nfu, cs->req);
+        ret = nfs_stat(cs->nfsx, cs->vol, &nfu, &cs->resolvedloc,
+                       nfs3_fsstat_stat_cbk, cs);
+        if (ret < 0)
+            stat = nfs3_errno_to_nfsstat3(-ret);
+    }
+
+    /* Nothing more to do here unless an error has to be reported. */
+    if (ret >= 0)
+        return 0;
+
+    nfs3_log_common_res(rpcsvc_request_xid(cs->req), NFS3_FSSTAT, stat, -ret,
+                        cs->resolvedloc.path);
+    nfs3_fsstat_reply(cs->req, stat, NULL, NULL);
+    nfs3_call_state_wipe(cs);
+
+    return 0;
+}
+
+/*
+ * Resume handler for FSSTAT; nfs3_fsstat() passes it to
+ * nfs3_fh_resolve_and_resume() together with the call state.
+ *
+ * Starts a statfs on the resolved location; nfs3_fsstat_statfs_cbk()
+ * continues the request. If ret is negative at the nfs3err label, the error
+ * reply is sent from here and the call state is wiped.
+ *
+ * Returns -EFAULT when carg is NULL, otherwise the final value of ret.
+ */
+int
+nfs3_fsstat_resume(void *carg)
+{
+    nfsstat3 stat = NFS3ERR_SERVERFAULT;
+    int ret = -EFAULT;
+    nfs3_call_state_t *cs = NULL;
+    nfs_user_t nfu = {
+        0,
+    };
+
+    if (!carg)
+        return ret;
+
+    cs = (nfs3_call_state_t *)carg;
+    /* NOTE(review): macro defined outside this view; presumably it sets stat
+     * and jumps to nfs3err when the file handle resolution failed - confirm.
+     */
+    nfs3_check_fh_resolve_status(cs, stat, nfs3err);
+    nfs_request_user_init(&nfu, cs->req);
+    /* First, we need to get the statvfs for the subvol */
+    ret = nfs_statfs(cs->nfsx, cs->vol, &nfu, &cs->resolvedloc,
+                     nfs3_fsstat_statfs_cbk, cs);
+    if (ret < 0)
+        stat = nfs3_errno_to_nfsstat3(-ret);
+
+nfs3err:
+    /* Only reply from here on failure; on success the callback replies. */
+    if (ret < 0) {
+        nfs3_log_common_res(rpcsvc_request_xid(cs->req), NFS3_FSSTAT, stat,
+                            -ret, cs->resolvedloc.path);
+        nfs3_fsstat_reply(cs->req, stat, NULL, NULL);
+        nfs3_call_state_wipe(cs);
+    }
+
+    return ret;
+}
+
+/*
+ * Handle an FSSTAT request for the file handle fh.
+ *
+ * Validates the request, maps the handle to a volume, sets up a call state
+ * and starts file handle resolution with nfs3_fsstat_resume() as the resume
+ * function.
+ *
+ * Returns -1 when req or fh is NULL. When ret is negative at the nfs3err
+ * label an NFS error reply is sent from here and 0 is returned. Otherwise the
+ * value left in ret is returned.
+ */
+int
+nfs3_fsstat(rpcsvc_request_t *req, struct nfs3_fh *fh)
+{
+    xlator_t *vol = NULL;
+    nfsstat3 stat = NFS3ERR_SERVERFAULT;
+    int ret = -EFAULT;
+    struct nfs3_state *nfs3 = NULL;
+    nfs3_call_state_t *cs = NULL;
+
+    if ((!req) || (!fh)) {
+        gf_msg(GF_NFS3, GF_LOG_ERROR, EINVAL, NFS_MSG_INVALID_ENTRY,
+               "Bad arguments");
+        return -1;
+    }
+
+    nfs3_log_common_call(rpcsvc_request_xid(req), "FSSTAT", fh);
+    /* NOTE(review): the helpers below are macros defined outside this view.
+     * Judging by their arguments they may assign stat/ret/nfs3/vol/cs and
+     * jump to the label they are given (nfs3err or out) - confirm against
+     * their definitions before changing the order or the labels.
+     */
+    nfs3_validate_gluster_fh(fh, stat, nfs3err);
+    nfs3_validate_nfs3_state(req, nfs3, stat, nfs3err, ret);
+    nfs3_map_fh_to_volume(nfs3, fh, req, vol, stat, nfs3err);
+    nfs3_volume_started_check(nfs3, vol, ret, out);
+    nfs3_handle_call_state_init(nfs3, cs, req, vol, stat, nfs3err);
+
+    ret = nfs3_fh_resolve_and_resume(cs, fh, NULL, nfs3_fsstat_resume);
+    if (ret < 0)
+        stat = nfs3_errno_to_nfsstat3(-ret);
+
+nfs3err:
+    if (ret < 0) {
+        /* cs may still be NULL when an early check failed. */
+        nfs3_log_common_res(rpcsvc_request_xid(req), NFS3_FSSTAT, stat, -ret,
+                            cs ? cs->resolvedloc.path : NULL);
+        nfs3_fsstat_reply(req, stat, NULL, NULL);
+        nfs3_call_state_wipe(cs);
+        /* Ret must be 0 after this so that the caller does not
+         * also send an RPC reply.
+         */
+        ret = 0;
+    }
+out:
+    return ret;
+}
+
+/*
+ * RPC actor for the NFSv3 FSSTAT procedure.
+ *
+ * Decodes the fsstat3args carried in req->msg[0] and hands the file handle to
+ * nfs3_fsstat().
+ *
+ * Returns RPCSVC_ACTOR_ERROR if req is NULL, if the arguments cannot be
+ * decoded, or if nfs3_fsstat() fails with anything other than
+ * RPCSVC_ACTOR_IGNORE; otherwise returns what nfs3_fsstat() returned.
+ */
+int
+nfs3svc_fsstat(rpcsvc_request_t *req)
+{
+    struct nfs3_fh fh = {
+        {0},
+    };
+    fsstat3args args;
+    int ret = RPCSVC_ACTOR_ERROR;
+
+    if (!req)
+        return ret;
+    nfs3_prep_fsstat3args(&args, &fh);
+    if (xdr_to_fsstat3args(req->msg[0], &args) <= 0) {
+        gf_msg(GF_NFS3, GF_LOG_ERROR, 0, NFS_MSG_ARGS_DECODE_ERROR,
+               "Error decoding args");
+        rpcsvc_request_seterr(req, GARBAGE_ARGS);
+        goto rpcerr;
+    }
+
+    ret = nfs3_fsstat(req, &fh);
+    if ((ret < 0) && (ret != RPCSVC_ACTOR_IGNORE)) {
+        /* The procedure handled here is FSSTAT; the message used to say
+         * "FSTAT", which names a different operation.
+         */
+        gf_msg(GF_NFS3, GF_LOG_ERROR, 0, NFS_MSG_FSTAT_FAIL,
+               "FSSTAT procedure failed");
+        rpcsvc_request_seterr(req, SYSTEM_ERR);
+        ret = RPCSVC_ACTOR_ERROR;
+    }
+
+rpcerr:
+    return ret;
+}
+
+/*
+ * Build an fsinfo3res for the given status and root attributes and submit it
+ * as the reply to req.
+ *
+ * The result of nfs3svc_submit_reply() is not propagated; this function
+ * always returns 0.
+ */
+int
+nfs3_fsinfo_reply(rpcsvc_request_t *req, nfsstat3 status, struct iatt *fsroot)
+{
+    /* Zero-initialised like the other reply structures in this file, so the
+     * serializer never sees indeterminate bytes even if the fill helper
+     * leaves some members untouched.
+     */
+    fsinfo3res res = {
+        0,
+    };
+    struct nfs3_state *nfs3 = NULL;
+    uint64_t deviceid = 0;
+
+    deviceid = nfs3_request_xlator_deviceid(req);
+    nfs3 = rpcsvc_request_program_private(req);
+    nfs3_fill_fsinfo3res(nfs3, &res, status, fsroot, deviceid);
+
+    nfs3svc_submit_reply(req, &res, (nfs3_serializer)xdr_serialize_fsinfo3res);
+    return 0;
+}
+
+/*
+ * Callback for the stat issued by nfs3_fsinfo_resume().
+ *
+ * Converts the fop result into an NFSv3 status, logs it, sends the FSINFO
+ * reply with the attributes in buf and wipes the call state.
+ * Always returns 0.
+ */
+int32_t
+nfs3svc_fsinfo_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+                   int32_t op_ret, int32_t op_errno, struct iatt *buf,
+                   dict_t *xdata)
+{
+    nfs3_call_state_t *cs = frame->local;
+    nfsstat3 status = NFS3_OK;
+
+    if (op_ret == -1)
+        status = nfs3_cbk_errno_status(op_ret, op_errno);
+
+    nfs3_log_common_res(rpcsvc_request_xid(cs->req), NFS3_FSINFO, status,
+                        op_errno, cs->resolvedloc.path);
+    nfs3_fsinfo_reply(cs->req, status, buf);
+    nfs3_call_state_wipe(cs);
+
+    return 0;
+}
+
+/*
+ * Resume handler for FSINFO; nfs3_fsinfo() passes it to
+ * nfs3_fh_resolve_and_resume() together with the call state.
+ *
+ * Starts a stat on the resolved location; nfs3svc_fsinfo_cbk() sends the
+ * reply. If ret is negative at the nfs3err label, the error reply is sent
+ * from here and the call state is wiped.
+ *
+ * Returns -EFAULT when carg is NULL, otherwise the final value of ret.
+ */
+int
+nfs3_fsinfo_resume(void *carg)
+{
+    nfsstat3 stat = NFS3ERR_SERVERFAULT;
+    int ret = -EFAULT;
+    nfs_user_t nfu = {
+        0,
+    };
+    nfs3_call_state_t *cs = NULL;
+
+    if (!carg)
+        return ret;
+
+    cs = (nfs3_call_state_t *)carg;
+    /* NOTE(review): macro defined outside this view; presumably it sets stat
+     * and jumps to nfs3err when the file handle resolution failed - confirm.
+     */
+    nfs3_check_fh_resolve_status(cs, stat, nfs3err);
+    nfs_request_user_init(&nfu, cs->req);
+
+    ret = nfs_stat(cs->nfsx, cs->vol, &nfu, &cs->resolvedloc,
+                   nfs3svc_fsinfo_cbk, cs);
+    if (ret < 0)
+        stat = nfs3_errno_to_nfsstat3(-ret);
+
+nfs3err:
+    /* Only reply from here on failure; on success the callback replies. */
+    if (ret < 0) {
+        nfs3_log_common_res(rpcsvc_request_xid(cs->req), NFS3_FSINFO, stat,
+                            -ret, cs->resolvedloc.path);
+        nfs3_fsinfo_reply(cs->req, stat, NULL);
+        nfs3_call_state_wipe(cs);
+    }
+
+    return ret;
+}
+
+/*
+ * Handle an FSINFO request for the file handle fh.
+ *
+ * Validates the request, maps the handle to a volume, sets up a call state
+ * and starts file handle resolution with nfs3_fsinfo_resume() as the resume
+ * function.
+ *
+ * Returns -1 when req or fh is NULL. When ret is negative at the nfs3err
+ * label an NFS error reply is sent from here and 0 is returned, so that the
+ * caller does not send a second reply. Otherwise the value left in ret is
+ * returned.
+ */
+int
+nfs3_fsinfo(rpcsvc_request_t *req, struct nfs3_fh *fh)
+{
+    xlator_t *vol = NULL;
+    nfsstat3 stat = NFS3ERR_SERVERFAULT;
+    int ret = -EFAULT;
+    struct nfs3_state *nfs3 = NULL;
+    nfs3_call_state_t *cs = NULL;
+
+    if ((!req) || (!fh)) {
+        gf_msg(GF_NFS3, GF_LOG_ERROR, EINVAL, NFS_MSG_INVALID_ENTRY,
+               "Bad arguments");
+        return -1;
+    }
+
+    nfs3_log_common_call(rpcsvc_request_xid(req), "FSINFO", fh);
+    /* NOTE(review): the helpers below are macros defined outside this view.
+     * Judging by their arguments they may assign stat/ret/nfs3/vol/cs and
+     * jump to the label they are given (nfs3err or out) - confirm against
+     * their definitions before changing the order or the labels.
+     */
+    nfs3_validate_gluster_fh(fh, stat, nfs3err);
+    nfs3_validate_nfs3_state(req, nfs3, stat, nfs3err, ret);
+    nfs3_map_fh_to_volume(nfs3, fh, req, vol, stat, nfs3err);
+    nfs3_volume_started_check(nfs3, vol, ret, out);
+    nfs3_handle_call_state_init(nfs3, cs, req, vol, stat, nfs3err);
+
+    ret = nfs3_fh_resolve_and_resume(cs, fh, NULL, nfs3_fsinfo_resume);
+    if (ret < 0)
+        stat = nfs3_errno_to_nfsstat3(-ret);
+
+nfs3err:
+    if (ret < 0) {
+        /* cs may still be NULL when an early check failed. */
+        nfs3_log_common_res(rpcsvc_request_xid(req), NFS3_FSINFO, stat, -ret,
+                            cs ? cs->resolvedloc.path : NULL);
+        nfs3_fsinfo_reply(req, stat, NULL);
+        nfs3_call_state_wipe(cs);
+        ret = 0;
+    }
+out:
+    return ret;
+}
+
+/*
+ * RPC actor for the NFSv3 FSINFO procedure.
+ *
+ * Decodes the fsinfo3args carried in req->msg[0] and hands the root file
+ * handle to nfs3_fsinfo().
+ *
+ * Returns RPCSVC_ACTOR_ERROR if req is NULL, if decoding fails, or if
+ * nfs3_fsinfo() fails with anything other than RPCSVC_ACTOR_IGNORE;
+ * otherwise returns what nfs3_fsinfo() returned.
+ */
+int
+nfs3svc_fsinfo(rpcsvc_request_t *req)
+{
+    struct nfs3_fh rootfh = {
+        {0},
+    };
+    fsinfo3args fsinfo_args;
+    int status = RPCSVC_ACTOR_ERROR;
+
+    if (req == NULL)
+        return RPCSVC_ACTOR_ERROR;
+
+    nfs3_prep_fsinfo3args(&fsinfo_args, &rootfh);
+    if (xdr_to_fsinfo3args(req->msg[0], &fsinfo_args) <= 0) {
+        gf_msg(GF_NFS3, GF_LOG_ERROR, 0, NFS_MSG_ARGS_DECODE_ERROR,
+               "Error decoding arguments");
+        rpcsvc_request_seterr(req, GARBAGE_ARGS);
+        return RPCSVC_ACTOR_ERROR;
+    }
+
+    status = nfs3_fsinfo(req, &rootfh);
+    if ((status >= 0) || (status == RPCSVC_ACTOR_IGNORE))
+        return status;
+
+    gf_msg(GF_NFS3, GF_LOG_ERROR, 0, NFS_MSG_FSINFO_FAIL,
+           "FSINFO procedure failed");
+    rpcsvc_request_seterr(req, SYSTEM_ERR);
+    return RPCSVC_ACTOR_ERROR;
+}
+
+/*
+ * Build a pathconf3res from the given status and attributes and submit it as
+ * the reply to req. The submit result is not propagated; always returns 0.
+ */
+int
+nfs3_pathconf_reply(rpcsvc_request_t *req, nfsstat3 stat, struct iatt *buf)
+{
+    pathconf3res reply = {
+        0,
+    };
+    uint64_t devid = nfs3_request_xlator_deviceid(req);
+
+    nfs3_fill_pathconf3res(&reply, stat, buf, devid);
+    nfs3svc_submit_reply(req, (void *)&reply,
+                         (nfs3_serializer)xdr_serialize_pathconf3res);
+
+    return 0;
+}
+
+/*
+ * Callback for the stat issued by nfs3_pathconf_resume().
+ *
+ * On success the attributes in buf are forwarded with status NFS3_OK; on
+ * failure the reply carries the translated error status and no attributes.
+ * The result is logged, the PATHCONF reply is sent and the call state is
+ * wiped. Always returns 0.
+ */
+int32_t
+nfs3svc_pathconf_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+                     int32_t op_ret, int32_t op_errno, struct iatt *buf,
+                     dict_t *xdata)
+{
+    nfs3_call_state_t *cs = frame->local;
+    struct iatt *attrs = NULL;
+    nfsstat3 stat = NFS3ERR_SERVERFAULT;
+
+    if (op_ret != -1) {
+        /* Only a successful stat provides attributes for the reply. */
+        attrs = buf;
+        stat = NFS3_OK;
+    } else {
+        stat = nfs3_cbk_errno_status(op_ret, op_errno);
+    }
+
+    nfs3_log_common_res(rpcsvc_request_xid(cs->req), NFS3_PATHCONF, stat,
+                        op_errno, cs->resolvedloc.path);
+    nfs3_pathconf_reply(cs->req, stat, attrs);
+    nfs3_call_state_wipe(cs);
+
+    return 0;
+}
+
+/*
+ * Resume handler for PATHCONF; nfs3_pathconf() passes it to
+ * nfs3_fh_resolve_and_resume() together with the call state.
+ *
+ * Starts a stat on the resolved location; nfs3svc_pathconf_cbk() sends the
+ * reply. If ret is negative at the nfs3err label, the error reply is sent
+ * from here and the call state is wiped.
+ *
+ * Returns -EFAULT when carg is NULL, otherwise the final value of ret.
+ */
+int
+nfs3_pathconf_resume(void *carg)
+{
+    nfsstat3 stat = NFS3ERR_SERVERFAULT;
+    int ret = -EFAULT;
+    nfs_user_t nfu = {
+        0,
+    };
+    nfs3_call_state_t *cs = NULL;
+
+    if (!carg)
+        return ret;
+
+    cs = (nfs3_call_state_t *)carg;
+    /* NOTE(review): macro defined outside this view; presumably it sets stat
+     * and jumps to nfs3err when the file handle resolution failed - confirm.
+     */
+    nfs3_check_fh_resolve_status(cs, stat, nfs3err);
+    nfs_request_user_init(&nfu, cs->req);
+    ret = nfs_stat(cs->nfsx, cs->vol, &nfu, &cs->resolvedloc,
+                   nfs3svc_pathconf_cbk, cs);
+    if (ret < 0)
+        stat = nfs3_errno_to_nfsstat3(-ret);
+nfs3err:
+    /* Only reply from here on failure; on success the callback replies. */
+    if (ret < 0) {
+        nfs3_log_common_res(rpcsvc_request_xid(cs->req), NFS3_PATHCONF, stat,
+                            -ret, cs->resolvedloc.path);
+        nfs3_pathconf_reply(cs->req, stat, NULL);
+        nfs3_call_state_wipe(cs);
+    }
+
+    return ret;
+}
+
+/*
+ * Handle a PATHCONF request for the file handle fh.
+ *
+ * Validates the request, maps the handle to a volume, sets up a call state
+ * and starts file handle resolution with nfs3_pathconf_resume() as the
+ * resume function.
+ *
+ * Returns -1 when req or fh is NULL. When ret is negative at the nfs3err
+ * label an NFS error reply is sent from here and 0 is returned. Otherwise the
+ * value left in ret is returned.
+ */
+int
+nfs3_pathconf(rpcsvc_request_t *req, struct nfs3_fh *fh)
+{
+    xlator_t *vol = NULL;
+    nfsstat3 stat = NFS3ERR_SERVERFAULT;
+    int ret = -EFAULT;
+    struct nfs3_state *nfs3 = NULL;
+    nfs3_call_state_t *cs = NULL;
+
+    if ((!req) || (!fh)) {
+        gf_msg(GF_NFS3, GF_LOG_ERROR, EINVAL, NFS_MSG_INVALID_ENTRY,
+               "Bad arguments");
+        return -1;
+    }
+
+    nfs3_log_common_call(rpcsvc_request_xid(req), "PATHCONF", fh);
+    /* NOTE(review): the helpers below are macros defined outside this view.
+     * Judging by their arguments they may assign stat/ret/nfs3/vol/cs and
+     * jump to the label they are given (nfs3err or out) - confirm against
+     * their definitions before changing the order or the labels.
+     */
+    nfs3_validate_gluster_fh(fh, stat, nfs3err);
+    nfs3_validate_nfs3_state(req, nfs3, stat, nfs3err, ret);
+    nfs3_map_fh_to_volume(nfs3, fh, req, vol, stat, nfs3err);
+    nfs3_volume_started_check(nfs3, vol, ret, out);
+    nfs3_handle_call_state_init(nfs3, cs, req, vol, stat, nfs3err);
+
+    ret = nfs3_fh_resolve_and_resume(cs, fh, NULL, nfs3_pathconf_resume);
+    if (ret < 0)
+        stat = nfs3_errno_to_nfsstat3(-ret);
+
+nfs3err:
+    if (ret < 0) {
+        /* cs may still be NULL when an early check failed. */
+        nfs3_log_common_res(rpcsvc_request_xid(req), NFS3_PATHCONF, stat, -ret,
+                            cs ? cs->resolvedloc.path : NULL);
+        nfs3_pathconf_reply(req, stat, NULL);
+        nfs3_call_state_wipe(cs);
+        /* Ret must be 0 after this so that the caller does not
+         * also send an RPC reply.
+         */
+        ret = 0;
+    }
+out:
+    return ret;
+}
+
+/*
+ * RPC actor for the NFSv3 PATHCONF procedure.
+ *
+ * Decodes the pathconf3args carried in req->msg[0] and hands the file handle
+ * to nfs3_pathconf().
+ *
+ * Returns RPCSVC_ACTOR_ERROR if req is NULL, if decoding fails, or if
+ * nfs3_pathconf() fails with anything other than RPCSVC_ACTOR_IGNORE;
+ * otherwise returns what nfs3_pathconf() returned.
+ */
+int
+nfs3svc_pathconf(rpcsvc_request_t *req)
+{
+    pathconf3args pc_args;
+    struct nfs3_fh objfh = {
+        {0},
+    };
+    int status = RPCSVC_ACTOR_ERROR;
+
+    if (req == NULL)
+        return RPCSVC_ACTOR_ERROR;
+
+    nfs3_prep_pathconf3args(&pc_args, &objfh);
+    if (xdr_to_pathconf3args(req->msg[0], &pc_args) <= 0) {
+        gf_msg(GF_NFS3, GF_LOG_ERROR, 0, NFS_MSG_ARGS_DECODE_ERROR,
+               "Error decoding args");
+        rpcsvc_request_seterr(req, GARBAGE_ARGS);
+        return RPCSVC_ACTOR_ERROR;
+    }
+
+    status = nfs3_pathconf(req, &objfh);
+    if ((status >= 0) || (status == RPCSVC_ACTOR_IGNORE))
+        return status;
+
+    gf_msg(GF_NFS3, GF_LOG_ERROR, -status, NFS_MSG_PATHCONF_FAIL,
+           "PATHCONF procedure failed");
+    rpcsvc_request_seterr(req, SYSTEM_ERR);
+    return RPCSVC_ACTOR_ERROR;
+}
+
+/*
+ * Build a commit3res from the status, write verifier and pre/post attributes
+ * and submit it as the reply to req. The submit result is not propagated;
+ * always returns 0.
+ */
+int
+nfs3_commit_reply(rpcsvc_request_t *req, nfsstat3 stat, uint64_t wverf,
+                  struct iatt *prestat, struct iatt *poststat)
+{
+    commit3res reply = {
+        0,
+    };
+    uint64_t devid = nfs3_request_xlator_deviceid(req);
+
+    nfs3_fill_commit3res(&reply, stat, wverf, prestat, poststat, devid);
+    nfs3svc_submit_reply(req, (void *)&reply,
+                         (nfs3_serializer)xdr_serialize_commit3res);
+
+    return 0;
+}
+
+/*
+ * Callback for the flush issued by nfs3_commit_resume().
+ *
+ * Converts the fop result into an NFSv3 status, logs it and sends the COMMIT
+ * reply using the server start time stored in the NFSv3 state as the write
+ * verifier, then wipes the call state. Always returns 0.
+ */
+int32_t
+nfs3svc_commit_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+                   int32_t op_ret, int32_t op_errno, dict_t *xdata)
+{
+    nfs3_call_state_t *cs = frame->local;
+    struct nfs3_state *nfs3state = NULL;
+    nfsstat3 stat = NFS3_OK;
+
+    if (op_ret == -1)
+        stat = nfs3_cbk_errno_status(op_ret, op_errno);
+
+    nfs3state = rpcsvc_request_program_private(cs->req);
+    nfs3_log_commit_res(rpcsvc_request_xid(cs->req), stat, op_errno,
+                        nfs3state->serverstart, cs->resolvedloc.path);
+    nfs3_commit_reply(cs->req, stat, nfs3state->serverstart, NULL, NULL);
+    nfs3_call_state_wipe(cs);
+
+    return 0;
+}
+
+/*
+ * Issue the flush for a COMMIT request; called by nfs3_commit_open_resume()
+ * once cs->fd has been set.
+ *
+ * When the export is sync-trusted no flush is issued and an NFS3_OK reply is
+ * sent immediately. Otherwise nfs_flush() is started and
+ * nfs3svc_commit_cbk() sends the reply. If ret is negative at the nfs3err
+ * label the reply is sent from here and the call state is wiped.
+ *
+ * Returns -EFAULT when carg is NULL; in every other case returns 0.
+ */
+int
+nfs3_commit_resume(void *carg)
+{
+    nfsstat3 stat = NFS3ERR_SERVERFAULT;
+    int ret = -EFAULT;
+    nfs_user_t nfu = {
+        0,
+    };
+    nfs3_call_state_t *cs = NULL;
+
+    if (!carg)
+        return ret;
+
+    cs = (nfs3_call_state_t *)carg;
+    /* NOTE(review): macro defined outside this view; presumably it sets stat
+     * and jumps to nfs3err when the file handle resolution failed - confirm.
+     */
+    nfs3_check_fh_resolve_status(cs, stat, nfs3err);
+
+    if (nfs3_export_sync_trusted(cs->nfs3state, cs->resolvefh.exportid)) {
+        /* ret is made negative only to reuse the reply path below; the
+         * status sent to the client is NFS3_OK. Note that the log call
+         * below then receives -ret == 1 as its error number.
+         */
+        ret = -1;
+        stat = NFS3_OK;
+        goto nfs3err;
+    }
+
+    nfs_request_user_init(&nfu, cs->req);
+    ret = nfs_flush(cs->nfsx, cs->vol, &nfu, cs->fd, nfs3svc_commit_cbk, cs);
+    if (ret < 0)
+        stat = nfs3_errno_to_nfsstat3(-ret);
+
+nfs3err:
+    if (ret < 0) {
+        nfs3_log_common_res(rpcsvc_request_xid(cs->req), NFS3_COMMIT, stat,
+                            -ret, cs->resolvedloc.path);
+        nfs3_commit_reply(cs->req, stat, cs->nfs3state->serverstart, NULL,
+                          NULL);
+        nfs3_call_state_wipe(cs);
+        ret = 0;
+    }
+
+    /* The reply (if any) has been handled here, so callers always see 0. */
+    return 0;
+}
+
+/*
+ * Resume handler for COMMIT; nfs3_commit() passes it to
+ * nfs3_fh_resolve_and_resume() together with the call state.
+ *
+ * Obtains an anonymous fd for the resolved inode, stores it in cs->fd and
+ * continues with nfs3_commit_resume(). If ret is negative at the nfs3err
+ * label (for example when no anonymous fd could be created, in which case
+ * ret still holds -EFAULT) the error reply is sent from here with a write
+ * verifier of 0 and the call state is wiped.
+ *
+ * Returns -EFAULT when carg is NULL, otherwise the final value of ret.
+ */
+int
+nfs3_commit_open_resume(void *carg)
+{
+    nfsstat3 stat = NFS3ERR_SERVERFAULT;
+    int ret = -EFAULT;
+    nfs3_call_state_t *cs = NULL;
+
+    if (!carg)
+        return ret;
+
+    cs = (nfs3_call_state_t *)carg;
+    /* NOTE(review): macro defined outside this view; presumably it sets stat
+     * and jumps to nfs3err when the file handle resolution failed - confirm.
+     */
+    nfs3_check_fh_resolve_status(cs, stat, nfs3err);
+    cs->fd = fd_anonymous(cs->resolvedloc.inode);
+    if (!cs->fd) {
+        gf_msg(GF_NFS3, GF_LOG_ERROR, 0, NFS_MSG_ANONYMOUS_FD_FAIL,
+               "Failed to create anonymous fd.");
+        goto nfs3err;
+    }
+
+    /* nfs3_commit_resume() returns 0 for a non-NULL argument, so the check
+     * below does not trigger with the implementation in this file.
+     */
+    ret = nfs3_commit_resume(cs);
+    if (ret < 0)
+        stat = nfs3_errno_to_nfsstat3(-ret);
+nfs3err:
+    if (ret < 0) {
+        nfs3_log_common_res(rpcsvc_request_xid(cs->req), NFS3_COMMIT, stat,
+                            -ret, cs->resolvedloc.path);
+        nfs3_commit_reply(cs->req, stat, 0, NULL, NULL);
+        nfs3_call_state_wipe(cs);
+    }
+
+    return ret;
+}
+
+/*
+ * Handle a COMMIT request for count bytes at offset of the file identified
+ * by fh.
+ *
+ * Validates the request, maps the handle to a volume, checks the volume
+ * access for the export, sets up a call state carrying offset and count, and
+ * starts file handle resolution with nfs3_commit_open_resume() as the resume
+ * function.
+ *
+ * Returns -1 when req or fh is NULL. When ret is negative at the nfs3err
+ * label an NFS error reply is sent from here and 0 is returned, so that the
+ * caller does not send a second reply. Otherwise the value left in ret is
+ * returned.
+ */
+int
+nfs3_commit(rpcsvc_request_t *req, struct nfs3_fh *fh, offset3 offset,
+            count3 count)
+{
+    xlator_t *vol = NULL;
+    nfsstat3 stat = NFS3ERR_SERVERFAULT;
+    int ret = -EFAULT;
+    struct nfs3_state *nfs3 = NULL;
+    nfs3_call_state_t *cs = NULL;
+
+    if ((!req) || (!fh)) {
+        gf_msg(GF_NFS3, GF_LOG_ERROR, EINVAL, NFS_MSG_INVALID_ENTRY,
+               "Bad arguments");
+        return -1;
+    }
+
+    nfs3_log_rw_call(rpcsvc_request_xid(req), "COMMIT", fh, offset, count, -1);
+    /* NOTE(review): the helpers below are macros defined outside this view.
+     * Judging by their arguments they may assign stat/ret/nfs3/vol/cs and
+     * jump to the label they are given (nfs3err or out) - confirm against
+     * their definitions before changing the order or the labels.
+     */
+    nfs3_validate_gluster_fh(fh, stat, nfs3err);
+    nfs3_validate_nfs3_state(req, nfs3, stat, nfs3err, ret);
+    nfs3_map_fh_to_volume(nfs3, fh, req, vol, stat, nfs3err);
+    nfs3_volume_started_check(nfs3, vol, ret, out);
+    nfs3_check_rw_volaccess(nfs3, fh->exportid, stat, nfs3err);
+    nfs3_handle_call_state_init(nfs3, cs, req, vol, stat, nfs3err);
+
+    cs->datacount = count;
+    cs->dataoffset = offset;
+    ret = nfs3_fh_resolve_and_resume(cs, fh, NULL, nfs3_commit_open_resume);
+    if (ret < 0)
+        stat = nfs3_errno_to_nfsstat3(-ret);
+
+nfs3err:
+    if (ret < 0) {
+        /* cs may still be NULL when an early check failed. */
+        nfs3_log_common_res(rpcsvc_request_xid(req), NFS3_COMMIT, stat, -ret,
+                            cs ? cs->resolvedloc.path : NULL);
+        nfs3_commit_reply(req, stat, 0, NULL, NULL);
+        nfs3_call_state_wipe(cs);
+        ret = 0;
+    }
+out:
+    return ret;
+}
+
+/*
+ * RPC actor for the NFSv3 COMMIT procedure.
+ *
+ * Decodes the commit3args carried in req->msg[0] and hands the file handle,
+ * offset and count to nfs3_commit().
+ *
+ * Returns RPCSVC_ACTOR_ERROR if req is NULL, if decoding fails, or if
+ * nfs3_commit() fails with anything other than RPCSVC_ACTOR_IGNORE;
+ * otherwise returns what nfs3_commit() returned.
+ */
+int
+nfs3svc_commit(rpcsvc_request_t *req)
+{
+    commit3args commit_args;
+    struct nfs3_fh objfh = {
+        {0},
+    };
+    int status = RPCSVC_ACTOR_ERROR;
+
+    if (req == NULL)
+        return RPCSVC_ACTOR_ERROR;
+
+    nfs3_prep_commit3args(&commit_args, &objfh);
+    if (xdr_to_commit3args(req->msg[0], &commit_args) <= 0) {
+        gf_msg(GF_NFS3, GF_LOG_ERROR, 0, NFS_MSG_ARGS_DECODE_ERROR,
+               "Error decoding args");
+        rpcsvc_request_seterr(req, GARBAGE_ARGS);
+        return RPCSVC_ACTOR_ERROR;
+    }
+
+    status = nfs3_commit(req, &objfh, commit_args.offset, commit_args.count);
+    if ((status >= 0) || (status == RPCSVC_ACTOR_IGNORE))
+        return status;
+
+    gf_msg(GF_NFS3, GF_LOG_ERROR, 0, NFS_MSG_COMMIT_FAIL,
+           "COMMIT procedure failed");
+    rpcsvc_request_seterr(req, SYSTEM_ERR);
+    return RPCSVC_ACTOR_ERROR;
+}
+
+/*
+ * Actor table for the NFSv3 program, one entry per procedure.
+ *
+ * Each initializer lists, in order: the procedure name, the actor function,
+ * an optional extra handler (only WRITE supplies one,
+ * nfs3svc_write_vecsizer), the procedure number, the DRC classification
+ * (idempotent or not) and a trailing 0.
+ *
+ * NOTE(review): the field names of rpcsvc_actor_t are not visible here; the
+ * entries appear in procedure-number order, which presumably matters to the
+ * RPC layer - confirm before reordering.
+ */
+static rpcsvc_actor_t nfs3svc_actors[NFS3_PROC_COUNT] = {
+    {"NULL", nfs3svc_null, NULL, NFS3_NULL, DRC_IDEMPOTENT, 0},
+    {"GETATTR", nfs3svc_getattr, NULL, NFS3_GETATTR, DRC_IDEMPOTENT, 0},
+    {"SETATTR", nfs3svc_setattr, NULL, NFS3_SETATTR, DRC_NON_IDEMPOTENT, 0},
+    {"LOOKUP", nfs3svc_lookup, NULL, NFS3_LOOKUP, DRC_IDEMPOTENT, 0},
+    {"ACCESS", nfs3svc_access, NULL, NFS3_ACCESS, DRC_IDEMPOTENT, 0},
+    {"READLINK", nfs3svc_readlink, NULL, NFS3_READLINK, DRC_IDEMPOTENT, 0},
+    {"READ", nfs3svc_read, NULL, NFS3_READ, DRC_IDEMPOTENT, 0},
+    {"WRITE", nfs3svc_write, nfs3svc_write_vecsizer, NFS3_WRITE, DRC_IDEMPOTENT,
+     0},
+    {"CREATE", nfs3svc_create, NULL, NFS3_CREATE, DRC_NON_IDEMPOTENT, 0},
+    {"MKDIR", nfs3svc_mkdir, NULL, NFS3_MKDIR, DRC_NON_IDEMPOTENT, 0},
+    {"SYMLINK", nfs3svc_symlink, NULL, NFS3_SYMLINK, DRC_NON_IDEMPOTENT, 0},
+    {"MKNOD", nfs3svc_mknod, NULL, NFS3_MKNOD, DRC_NON_IDEMPOTENT, 0},
+    {"REMOVE", nfs3svc_remove, NULL, NFS3_REMOVE, DRC_NON_IDEMPOTENT, 0},
+    {"RMDIR", nfs3svc_rmdir, NULL, NFS3_RMDIR, DRC_NON_IDEMPOTENT, 0},
+    {"RENAME", nfs3svc_rename, NULL, NFS3_RENAME, DRC_NON_IDEMPOTENT, 0},
+    {"LINK", nfs3svc_link, NULL, NFS3_LINK, DRC_NON_IDEMPOTENT, 0},
+    {"READDIR", nfs3svc_readdir, NULL, NFS3_READDIR, DRC_IDEMPOTENT, 0},
+    {"READDIRPLUS", nfs3svc_readdirp, NULL, NFS3_READDIRP, DRC_IDEMPOTENT, 0},
+    {"FSSTAT", nfs3svc_fsstat, NULL, NFS3_FSSTAT, DRC_IDEMPOTENT, 0},
+    {"FSINFO", nfs3svc_fsinfo, NULL, NFS3_FSINFO, DRC_IDEMPOTENT, 0},
+    {"PATHCONF", nfs3svc_pathconf, NULL, NFS3_PATHCONF, DRC_IDEMPOTENT, 0},
+    {"COMMIT", nfs3svc_commit, NULL, NFS3_COMMIT, DRC_IDEMPOTENT, 0}};
+
+/*
+ * RPC program descriptor for NFSv3: program name, number, version and port,
+ * plus the actor table above. nfs3svc_init() stores the NFSv3 state in its
+ * private member and returns a pointer to this descriptor.
+ */
+static rpcsvc_program_t nfs3prog = {
+    .progname = "NFS3",
+    .prognum = NFS_PROGRAM,
+    .progver = NFS_V3,
+    .progport = GF_NFS3_PORT,
+    .actors = nfs3svc_actors,
+    .numactors = NFS3_PROC_COUNT,
+
+    /* Requests like FSINFO are sent before an auth scheme
+     * is inited by client. See RFC 2623, Section 2.3.2. */
+    .min_auth = AUTH_NULL,
+};
+
+/*
+ * Round the value pointed to by ioszptr up to the next multiple of
+ * GF_NFS3_IO_SIZE and clamp the result to the supported I/O size range:
+ * 4KB (GF_NFS3_FILE_IO_SIZE_MIN) to 1MB (GF_NFS3_FILE_IO_SIZE_MAX).
+ *
+ * The value is updated in place; a NULL pointer is ignored.
+ */
+void
+nfs3_iosize_roundup_4KB(uint64_t *ioszptr)
+{
+    uint64_t iosize;
+    uint64_t iopages;
+
+    if (!ioszptr)
+        return;
+
+    iosize = *ioszptr;
+
+    if (iosize >= GF_NFS3_FILE_IO_SIZE_MAX) {
+        /* Clamp before rounding: for inputs close to UINT64_MAX the
+         * addition below would wrap around and the value would end up
+         * clamped to the minimum instead of the maximum.
+         */
+        iosize = GF_NFS3_FILE_IO_SIZE_MAX;
+    } else {
+        iopages = (iosize + GF_NFS3_IO_SIZE - 1) >> GF_NFS3_IO_SHIFT;
+        iosize = (iopages * GF_NFS3_IO_SIZE);
+
+        /* Double check - boundary conditions */
+        if (iosize < GF_NFS3_FILE_IO_SIZE_MIN) {
+            iosize = GF_NFS3_FILE_IO_SIZE_MIN;
+        } else if (iosize > GF_NFS3_FILE_IO_SIZE_MAX) {
+            iosize = GF_NFS3_FILE_IO_SIZE_MAX;
+        }
+    }
+
+    *ioszptr = iosize;
+}
+
+/*
+ * Read one optional I/O size option from options.
+ *
+ * When key is present its string value is parsed as an unsigned 64-bit
+ * number, rounded by nfs3_iosize_roundup_4KB() and stored in *sizep.
+ *
+ * Returns 1 when *sizep was set, 0 when the option is absent (*sizep is left
+ * untouched) and -1 when the option could not be read or parsed.
+ */
+static int
+nfs3_get_iosize_option(dict_t *options, char *key, uint64_t *sizep)
+{
+    char *optstr = NULL;
+    uint64_t size64 = 0;
+
+    if (!dict_get(options, key))
+        return 0;
+
+    if (dict_get_str(options, key, &optstr) < 0) {
+        gf_msg(GF_NFS3, GF_LOG_ERROR, 0, NFS_MSG_READ_FAIL,
+               "Failed to read option: %s", key);
+        return -1;
+    }
+
+    if (gf_string2uint64(optstr, &size64) == -1) {
+        gf_msg(GF_NFS3, GF_LOG_ERROR, 0, NFS_MSG_FORMAT_FAIL,
+               "Failed to format option: %s", key);
+        return -1;
+    }
+
+    nfs3_iosize_roundup_4KB(&size64);
+    *sizep = size64;
+    return 1;
+}
+
+/*
+ * Initialise the read, write and readdir transfer sizes of nfs3 from
+ * options and derive the iobuf size from them.
+ *
+ * Each size is first reset to its compiled-in default and then overridden by
+ * the corresponding option ("nfs3.read-size", "nfs3.write-size",
+ * "nfs3.readdir-size") when that option is present.
+ *
+ * Returns 0 on success and -1 when an argument is NULL or an option cannot
+ * be read or parsed; on failure the sizes processed so far keep their new
+ * values and iobsize is not updated.
+ */
+int
+nfs3_init_options(struct nfs3_state *nfs3, dict_t *options)
+{
+    int ret = -1;
+    uint64_t size64 = 0;
+
+    if ((!nfs3) || (!options))
+        return -1;
+
+    /* nfs3.read-size */
+    nfs3->readsize = GF_NFS3_RTPREF;
+    ret = nfs3_get_iosize_option(options, "nfs3.read-size", &size64);
+    if (ret < 0)
+        return -1;
+    if (ret > 0)
+        nfs3->readsize = size64;
+
+    /* nfs3.write-size */
+    nfs3->writesize = GF_NFS3_WTPREF;
+    ret = nfs3_get_iosize_option(options, "nfs3.write-size", &size64);
+    if (ret < 0)
+        return -1;
+    if (ret > 0)
+        nfs3->writesize = size64;
+
+    /* nfs3.readdir-size */
+    nfs3->readdirsize = GF_NFS3_DTPREF;
+    ret = nfs3_get_iosize_option(options, "nfs3.readdir-size", &size64);
+    if (ret < 0)
+        return -1;
+    if (ret > 0)
+        nfs3->readdirsize = size64;
+
+    /* We want to use the size of the biggest param for the io buffer size.
+     */
+    nfs3->iobsize = nfs3->readsize;
+    if (nfs3->iobsize < nfs3->writesize)
+        nfs3->iobsize = nfs3->writesize;
+    if (nfs3->iobsize < nfs3->readdirsize)
+        nfs3->iobsize = nfs3->readdirsize;
+
+    /* But this is the true size of each iobuf. We need this size to
+     * accommodate the NFS headers also in the same buffer. */
+    nfs3->iobsize = nfs3->iobsize * 2;
+
+    return 0;
+}
+
+/*
+ * Load the per-export options for exp from options.
+ *
+ * The option keys are built from the export's subvolume name:
+ *   nfs3.<name>.volume-id      parsed as a UUID into exp->volumeid (skipped
+ *                              when gf_nfs_dvm_off() is true; otherwise the
+ *                              option is required)
+ *   nfs3.<name>.volume-access  "read-only" selects GF_NFS3_VOLACCESS_RO,
+ *                              anything else keeps the default access
+ *   rpc-auth.<name>.unix       read, but the value is not used here
+ *   nfs3.<name>.trusted-sync   boolean, sets exp->trusted_sync
+ *   nfs3.<name>.trusted-write  boolean, sets exp->trusted_write
+ *
+ * When options is NULL the options of nfsx are used instead.
+ *
+ * Returns 0 on success and -1 on a NULL nfsx/exp, when no options are
+ * available, or when an option cannot be read or parsed.
+ */
+int
+nfs3_init_subvolume_options(xlator_t *nfsx, struct nfs3_export *exp,
+                            dict_t *options)
+{
+    int ret = -1;
+    char *optstr = NULL;
+    char searchkey[1024];
+    char *name = NULL;
+    gf_boolean_t boolt = _gf_false;
+    uuid_t volumeid = {
+        0,
+    };
+
+    if ((!nfsx) || (!exp))
+        return -1;
+
+    /* For init, fetch options from xlator but for
+     * reconfigure, take the parameter */
+    if (!options)
+        options = nfsx->options;
+
+    if (!options)
+        return (-1);
+
+    gf_uuid_clear(volumeid);
+    if (gf_nfs_dvm_off(nfs_state(nfsx)))
+        goto no_dvm;
+
+    /* NOTE(review): only a negative snprintf result is treated as an error
+     * here and below; truncation of the key is not detected.
+     */
+    ret = snprintf(searchkey, sizeof(searchkey), "nfs3.%s.volume-id",
+                   exp->subvol->name);
+    if (ret < 0) {
+        gf_msg(GF_MNT, GF_LOG_ERROR, 0, NFS_MSG_SNPRINTF_FAIL,
+               "snprintf failed");
+        ret = -1;
+        goto err;
+    }
+
+    if (dict_get(options, searchkey)) {
+        ret = dict_get_str(options, searchkey, &optstr);
+        if (ret < 0) {
+            gf_msg(GF_MNT, GF_LOG_ERROR, 0, NFS_MSG_READ_FAIL,
+                   "Failed to read option: %s", searchkey);
+            ret = -1;
+            goto err;
+        }
+    } else {
+        /* With DVM on, the volume-id option is mandatory. */
+        gf_msg(GF_MNT, GF_LOG_ERROR, 0, NFS_MSG_VOLID_MISSING,
+               "DVM is"
+               " on but volume-id not given for volume: %s",
+               exp->subvol->name);
+        ret = -1;
+        goto err;
+    }
+
+    if (optstr) {
+        ret = gf_uuid_parse(optstr, volumeid);
+        if (ret < 0) {
+            gf_msg(GF_MNT, GF_LOG_ERROR, 0, NFS_MSG_PARSE_VOL_UUID_FAIL,
+                   "Failed to parse volume UUID");
+            ret = -1;
+            goto err;
+        }
+        gf_uuid_copy(exp->volumeid, volumeid);
+    }
+
+no_dvm:
+    /* Volume Access */
+    name = exp->subvol->name;
+    ret = snprintf(searchkey, sizeof(searchkey), "nfs3.%s.volume-access", name);
+    if (ret < 0) {
+        gf_msg(GF_NFS3, GF_LOG_ERROR, 0, NFS_MSG_SNPRINTF_FAIL,
+               "snprintf failed");
+        ret = -1;
+        goto err;
+    }
+
+    exp->access = GF_NFS3_DEFAULT_VOLACCESS;
+    if (dict_get(options, searchkey)) {
+        ret = dict_get_str(options, searchkey, &optstr);
+        if (ret < 0) {
+            gf_msg(GF_NFS3, GF_LOG_ERROR, 0, NFS_MSG_READ_FAIL,
+                   "Failed to read option: %s", searchkey);
+            ret = -1;
+            goto err;
+        }
+
+        if (strcmp(optstr, "read-only") == 0)
+            exp->access = GF_NFS3_VOLACCESS_RO;
+    }
+
+    ret = snprintf(searchkey, sizeof(searchkey), "rpc-auth.%s.unix", name);
+    if (ret < 0) {
+        gf_msg(GF_NFS3, GF_LOG_ERROR, 0, NFS_MSG_SNPRINTF_FAIL,
+               "snprintf failed");
+        ret = -1;
+        goto err;
+    }
+
+    /* NOTE(review): the value read into optstr here is not consumed by the
+     * code that follows; only a failed read has an effect.
+     */
+    if (dict_get(options, searchkey)) {
+        ret = dict_get_str(options, searchkey, &optstr);
+        if (ret < 0) {
+            gf_msg(GF_NFS3, GF_LOG_ERROR, 0, NFS_MSG_READ_FAIL,
+                   "Failed to read option: %s", searchkey);
+            ret = -1;
+            goto err;
+        }
+    }
+
+    /* Trusted sync: off unless the option parses to true. */
+    exp->trusted_sync = 0;
+    ret = snprintf(searchkey, sizeof(searchkey), "nfs3.%s.trusted-sync", name);
+    if (ret < 0) {
+        gf_msg(GF_NFS3, GF_LOG_ERROR, 0, NFS_MSG_SNPRINTF_FAIL,
+               "snprintf failed");
+        ret = -1;
+        goto err;
+    }
+
+    if (dict_get(options, searchkey)) {
+        ret = dict_get_str(options, searchkey, &optstr);
+        if (ret < 0) {
+            gf_msg(GF_NFS3, GF_LOG_ERROR, 0, NFS_MSG_READ_FAIL,
+                   "Failed to read option: %s", searchkey);
+            ret = -1;
+            goto err;
+        }
+
+        ret = gf_string2boolean(optstr, &boolt);
+        if (ret < 0) {
+            gf_msg(GF_NFS3, GF_LOG_ERROR, 0, NFS_MSG_STR2BOOL_FAIL,
+                   "Failed to convert str "
+                   "to gf_boolean_t");
+            ret = -1;
+            goto err;
+        }
+
+        if (boolt == _gf_true)
+            exp->trusted_sync = 1;
+    }
+
+    /* Trusted write: off unless the option parses to true. */
+    exp->trusted_write = 0;
+    ret = snprintf(searchkey, sizeof(searchkey), "nfs3.%s.trusted-write", name);
+    if (ret < 0) {
+        gf_msg(GF_NFS3, GF_LOG_ERROR, 0, NFS_MSG_SNPRINTF_FAIL,
+               "snprintf failed");
+        ret = -1;
+        goto err;
+    }
+
+    if (dict_get(options, searchkey)) {
+        ret = dict_get_str(options, searchkey, &optstr);
+        if (ret < 0) {
+            gf_msg(GF_NFS3, GF_LOG_ERROR, 0, NFS_MSG_READ_FAIL,
+                   "Failed to read option: %s", searchkey);
+            ret = -1;
+            goto err;
+        }
+
+        ret = gf_string2boolean(optstr, &boolt);
+        if (ret < 0) {
+            gf_msg(GF_NFS3, GF_LOG_ERROR, 0, NFS_MSG_STR2BOOL_FAIL,
+                   "Failed to convert str"
+                   " to gf_boolean_t");
+            ret = -1;
+            goto err;
+        }
+
+        if (boolt == _gf_true)
+            exp->trusted_write = 1;
+    }
+
+    /* If trusted-sync is on, then we also switch on trusted-write because
+     * tw is included in ts. In write logic, we're then only checking for
+     * tw.
+     */
+    if (exp->trusted_sync)
+        exp->trusted_write = 1;
+
+    gf_msg_trace(
+        GF_NFS3, 0, "%s: %s, %s, %s", exp->subvol->name,
+        (exp->access == GF_NFS3_VOLACCESS_RO) ? "read-only" : "read-write",
+        (exp->trusted_sync == 0) ? "no trusted_sync" : "trusted_sync",
+        (exp->trusted_write == 0) ? "no trusted_write" : "trusted_write");
+    ret = 0;
+err:
+    return ret;
+}
+
+/*
+ * Allocate and initialise the export state for subvol.
+ *
+ * The export is zero-allocated, linked to subvol, given an empty list node
+ * and then configured through nfs3_init_subvolume_options() using the
+ * options of nfs3->nfsx.
+ *
+ * Returns the new export, or NULL when an argument is NULL, the allocation
+ * fails or the options cannot be initialised (the export is freed then).
+ */
+struct nfs3_export *
+nfs3_init_subvolume(struct nfs3_state *nfs3, xlator_t *subvol)
+{
+    int ret = -1;
+    struct nfs3_export *exp = NULL;
+
+    if ((!nfs3) || (!subvol))
+        return NULL;
+
+    exp = GF_CALLOC(1, sizeof(*exp), gf_nfs_mt_nfs3_export);
+    if (!exp) {
+        /* Without this check the assignments below would dereference a
+         * NULL pointer when the allocation fails.
+         */
+        gf_msg(GF_NFS3, GF_LOG_ERROR, ENOMEM, NFS_MSG_NO_MEMORY,
+               "Memory allocation failed");
+        return NULL;
+    }
+
+    exp->subvol = subvol;
+    INIT_LIST_HEAD(&exp->explist);
+    gf_msg_trace(GF_NFS3, 0, "Initing state: %s", exp->subvol->name);
+
+    ret = nfs3_init_subvolume_options(nfs3->nfsx, exp, NULL);
+    if (ret == -1) {
+        gf_msg(GF_NFS3, GF_LOG_ERROR, 0, NFS_MSG_SUBVOL_INIT_FAIL,
+               "Failed to init subvol");
+        GF_FREE(exp);
+        return NULL;
+    }
+
+    return exp;
+}
+
+/*
+ * Create an export for every child of nfs3->nfsx and append it to
+ * nfs3->exports.
+ *
+ * Returns 0 when all children were processed, -1 when nfs3 is NULL or an
+ * export could not be initialised (exports added before the failure stay on
+ * the list).
+ */
+int
+nfs3_init_subvolumes(struct nfs3_state *nfs3)
+{
+    struct xlator_list *child = NULL;
+    struct nfs3_export *entry = NULL;
+
+    if (nfs3 == NULL)
+        return -1;
+
+    for (child = nfs3->nfsx->children; child != NULL; child = child->next) {
+        entry = nfs3_init_subvolume(nfs3, child->xlator);
+        if (entry == NULL) {
+            gf_msg(GF_NFS3, GF_LOG_ERROR, 0, NFS_MSG_SUBVOL_INIT_FAIL,
+                   "Failed to init subvol: %s", child->xlator->name);
+            return -1;
+        }
+        list_add_tail(&entry->explist, &nfs3->exports);
+    }
+
+    return 0;
+}
+
+/*
+ * Allocate and initialise the NFSv3 state for the NFS xlator nfsx.
+ *
+ * Loads the global options, creates the call-state memory pool, builds the
+ * export list from the xlator's children, initialises the fd LRU list and
+ * its lock, and creates the RPC listeners. On success the state is also
+ * stored in the NFS xlator's private nfs_state.
+ *
+ * Returns the new state, or NULL on failure. On failure everything allocated
+ * here is released again, including exports that were already linked into
+ * the export list.
+ */
+struct nfs3_state *
+nfs3_init_state(xlator_t *nfsx)
+{
+    struct nfs3_state *nfs3 = NULL;
+    int ret = -1;
+    unsigned int localpool = 0;
+    struct nfs_state *nfs = NULL;
+    struct nfs3_export *exp = NULL;
+    struct nfs3_export *tmp = NULL;
+
+    if ((!nfsx) || (!nfsx->private))
+        return NULL;
+
+    nfs3 = (struct nfs3_state *)GF_CALLOC(1, sizeof(*nfs3),
+                                          gf_nfs_mt_nfs3_state);
+    if (!nfs3) {
+        gf_msg(GF_NFS3, GF_LOG_ERROR, ENOMEM, NFS_MSG_NO_MEMORY,
+               "Memory allocation failed");
+        return NULL;
+    }
+
+    nfs = nfsx->private;
+    ret = nfs3_init_options(nfs3, nfsx->options);
+    if (ret == -1) {
+        gf_msg(GF_NFS3, GF_LOG_ERROR, 0, NFS_MSG_OPT_INIT_FAIL,
+               "Failed to init options");
+        goto ret;
+    }
+
+    nfs3->iobpool = nfsx->ctx->iobuf_pool;
+
+    localpool = nfs->memfactor * GF_NFS_CONCURRENT_OPS_MULT;
+    /* localpool is unsigned, so print it with %u. */
+    gf_msg_trace(GF_NFS3, 0, "local pool: %u", localpool);
+    nfs3->localpool = mem_pool_new_ctx(nfsx->ctx, nfs3_call_state_t, localpool);
+    if (!nfs3->localpool) {
+        gf_msg(GF_NFS3, GF_LOG_ERROR, ENOMEM, NFS_MSG_NO_MEMORY,
+               "local mempool creation failed");
+        ret = -1;
+        goto ret;
+    }
+
+    nfs3->nfsx = nfsx;
+    nfs3->exportslist = nfsx->children;
+    INIT_LIST_HEAD(&nfs3->exports);
+    ret = nfs3_init_subvolumes(nfs3);
+    if (ret == -1) {
+        gf_msg(GF_NFS3, GF_LOG_ERROR, 0, NFS_MSG_SUBVOL_INIT_FAIL,
+               "Failed to init per-subvolume state");
+        goto free_localpool;
+    }
+
+    nfs3->serverstart = (uint64_t)gf_time();
+    INIT_LIST_HEAD(&nfs3->fdlru);
+    LOCK_INIT(&nfs3->fdlrulock);
+    nfs3->fdcount = 0;
+
+    ret = rpcsvc_create_listeners(nfs->rpcsvc, nfsx->options, nfsx->name);
+    if (ret == -1) {
+        gf_msg(GF_NFS, GF_LOG_ERROR, 0, NFS_MSG_LISTENERS_CREATE_FAIL,
+               "Unable to create listeners");
+        goto free_localpool;
+    }
+
+    nfs->nfs3state = nfs3;
+    ret = 0;
+
+free_localpool:
+    if (ret == -1) {
+        /* The export list head is always initialised when this label is
+         * reached. Release every export linked so far; otherwise they
+         * would be lost once nfs3 itself is freed below.
+         */
+        list_for_each_entry_safe(exp, tmp, &nfs3->exports, explist)
+        {
+            list_del(&exp->explist);
+            GF_FREE(exp);
+        }
+        mem_pool_destroy(nfs3->localpool);
+    }
+
+ret:
+    if (ret == -1) {
+        GF_FREE(nfs3);
+        nfs3 = NULL;
+    }
+
+    return nfs3;
+}
+
+/*
+ * Initialise the NFSv3 service for nfsx.
+ *
+ * Creates the NFSv3 state, attaches it to the nfs3prog program descriptor
+ * and returns that descriptor. Returns NULL when nfsx is NULL or the state
+ * could not be initialised.
+ */
+rpcsvc_program_t *
+nfs3svc_init(xlator_t *nfsx)
+{
+    struct nfs3_state *state = NULL;
+
+    if (nfsx == NULL)
+        return NULL;
+
+    state = nfs3_init_state(nfsx);
+    if (state != NULL) {
+        nfs3prog.private = state;
+        return &nfs3prog;
+    }
+
+    gf_msg(GF_NFS3, GF_LOG_ERROR, 0, NFS_MSG_STATE_INIT_FAIL,
+           "NFSv3 state init failed");
+    return NULL;
+}
+
+/*
+ * Re-apply options to an already initialised NFSv3 state.
+ *
+ * Reloads the global transfer-size options and then the per-export options
+ * of every export on the state's export list, stopping at the first failure.
+ *
+ * Returns 0 on success, -1 when an argument or the NFSv3 state is missing,
+ * and otherwise the non-zero result of the step that failed.
+ */
+int
+nfs3_reconfigure_state(xlator_t *nfsx, dict_t *options)
+{
+    struct nfs3_state *state = NULL;
+    struct nfs3_export *entry = NULL;
+    int rc = -1;
+
+    if ((nfsx == NULL) || (nfsx->private == NULL) || (options == NULL))
+        return -1;
+
+    state = ((struct nfs_state *)nfsx->private)->nfs3state;
+    if (state == NULL)
+        return -1;
+
+    rc = nfs3_init_options(state, options);
+    if (rc != 0) {
+        gf_msg(GF_NFS3, GF_LOG_ERROR, 0, NFS_MSG_RECONF_FAIL,
+               "Failed to reconfigure options");
+        return rc;
+    }
+
+    list_for_each_entry(entry, &state->exports, explist)
+    {
+        rc = nfs3_init_subvolume_options(nfsx, entry, options);
+        if (rc != 0) {
+            gf_msg(GF_NFS3, GF_LOG_ERROR, 0, NFS_MSG_RECONF_SUBVOL_FAIL,
+                   "Failed to reconfigure subvol options");
+            return rc;
+        }
+    }
+
+    return 0;
+}
diff --git a/xlators/nfs/server/src/nfs3.h b/xlators/nfs/server/src/nfs3.h
new file mode 100644
index 00000000000..cdb7e03a619
--- /dev/null
+++ b/xlators/nfs/server/src/nfs3.h
@@ -0,0 +1,292 @@
+/*
+  Copyright (c) 2010-2011 Gluster, Inc. <http://www.gluster.com>
+  This file is part of GlusterFS.
+
+  This file is licensed to you under your choice of the GNU Lesser
+  General Public License, version 3 or any later version (LGPLv3 or
+  later), or the GNU General Public License, version 2 (GPLv2), in all
+  cases as published by the Free Software Foundation.
+*/
+
+#ifndef _NFS3_H_
+#define _NFS3_H_
+
+#include "rpcsvc.h"
+#include <glusterfs/dict.h>
+#include <glusterfs/xlator.h>
+#include <glusterfs/iobuf.h>
+#include "nfs.h"
+#include "nfs3-fh.h"
+#include "nfs-common.h"
+#include "xdr-nfs3.h"
+#include <glusterfs/mem-pool.h>
+#include "nlm4.h"
+#include "acl3-xdr.h"
+#include "acl3.h"
+#include <glusterfs/refcount.h>
+#include <sys/statvfs.h>
+
+#define GF_NFS3 GF_NFS "-nfsv3"
+
+#define GF_NFS3_DEFAULT_MEMFACTOR 15
+#define GF_NFS3_IOBPOOL_MULT GF_NFS_CONCURRENT_OPS_MULT
+#define GF_NFS3_CLTABLE_BUCKETS_MULT 2
+#define GF_NFS3_FDTABLE_BUCKETS_MULT 2
+
+/* Static values used for FSINFO
+ * To change the maximum rsize and wsize supported by the NFS client, adjust
+ * GF_NFS3_FILE_IO_SIZE_MAX. The Gluster NFS server defaults to 1MB(1048576)
+ * (same as kernel NFS server). For slower network, rsize/wsize can be trimmed
+ * to 16/32/64-KB. rsize and wsize can be tuned through nfs.read-size and
+ * nfs.write-size respectively.
+ *
+ * NB: For Kernel-NFS, NFS_MAX_FILE_IO_SIZE is 1048576U (1MB).
+ */
+#define GF_NFS3_FILE_IO_SIZE_MAX (1 * GF_UNIT_MB) /* 1048576 */
+#define GF_NFS3_FILE_IO_SIZE_MIN (4 * GF_UNIT_KB) /* 4096 */
+
+#define GF_NFS3_FILE_IO_SIZE_DEF GF_NFS3_FILE_IO_SIZE_MAX
+
+#define GF_NFS3_RTMAX GF_NFS3_FILE_IO_SIZE_MAX
+#define GF_NFS3_RTMIN GF_NFS3_FILE_IO_SIZE_MIN
+#define GF_NFS3_RTPREF GF_NFS3_FILE_IO_SIZE_DEF
+#define GF_NFS3_RTMULT GF_NFS3_FILE_IO_SIZE_MIN
+
+#define GF_NFS3_WTMAX GF_NFS3_FILE_IO_SIZE_MAX
+#define GF_NFS3_WTMIN GF_NFS3_FILE_IO_SIZE_MIN
+#define GF_NFS3_WTPREF GF_NFS3_FILE_IO_SIZE_DEF
+#define GF_NFS3_WTMULT GF_NFS3_FILE_IO_SIZE_MIN
+
+/* This can be tuned through nfs.readdir-size */
+#define GF_NFS3_DTMAX GF_NFS3_FILE_IO_SIZE_MAX
+#define GF_NFS3_DTMIN GF_NFS3_FILE_IO_SIZE_MIN
+#define GF_NFS3_DTPREF GF_NFS3_FILE_IO_SIZE_DEF
+
+#define GF_NFS3_MAXFILESIZE (1 * GF_UNIT_PB)
+
+#define GF_NFS3_IO_SIZE 4096 /* 4-KB */
+#define GF_NFS3_IO_SHIFT 12  /* 2^12 = 4KB */
+
+/* FIXME: Handle time resolutions */
+#define GF_NFS3_TIMEDELTA_SECS                                                 \
+    {                                                                          \
+        1, 0                                                                   \
+    }
+#define GF_NFS3_TIMEDELTA_NSECS                                                \
+    {                                                                          \
+        0, 1                                                                   \
+    }
+#define GF_NFS3_TIMEDELTA_MSECS                                                \
+    {                                                                          \
+        0, 1000000                                                             \
+    }
+
+#define GF_NFS3_FS_PROP                                                        \
+    (FSF3_LINK | FSF3_SYMLINK | FSF3_HOMOGENEOUS | FSF3_CANSETTIME)
+
+#define GF_NFS3_DIRFD_VALID 1
+#define GF_NFS3_DIRFD_INVALID 0
+
+#define GF_NFS3_VOLACCESS_RW 1
+#define GF_NFS3_VOLACCESS_RO 2
+
+#define GF_NFS3_FDCACHE_SIZE 512
+/* This should probably be moved to a more generic layer so that if needed
+ * different versions of NFS protocol can use the same thing.
+ */
+/* One cached fd plus the list hook used to chain entries together.
+ * NOTE(review): presumably linked on nfs3_state.fdlru -- confirm in nfs3.c.
+ */
+struct nfs3_fd_entry {
+    fd_t *cachedfd;
+    struct list_head list;
+};
+
+/* Per subvolume nfs3 specific state */
+struct nfs3_export {
+    /* Hook on nfs3_state.exports (walked via explist on reconfigure). */
+    struct list_head explist;
+    /* The subvolume this export describes. */
+    xlator_t *subvol;
+    uuid_t volumeid;
+    /* NOTE(review): presumably one of GF_NFS3_VOLACCESS_RW/_RO -- confirm. */
+    int access;
+    int trusted_sync;
+    int trusted_write;
+    int rootlookedup;
+};
+
+#define GF_NFS3_DEFAULT_VOLACCESS (GF_NFS3_VOLACCESS_RW)
+
+/* The NFSv3 protocol state */
+typedef struct nfs3_state {
+    /* The NFS xlator pointer. The NFS xlator can be running
+     * multiple versions of the NFS protocol.
+     */
+    xlator_t *nfsx;
+
+    /* The iob pool from which memory allocations are made for receiving
+     * and sending network messages.
+     */
+    struct iobuf_pool *iobpool;
+
+    /* List of child subvolumes for the NFSv3 protocol.
+     * Right now, is simply referring to the list of children in nfsx above.
+     */
+    xlator_list_t *exportslist;
+
+    /* List of struct nfs3_export entries, linked through their explist
+     * member.
+     */
+    struct list_head exports;
+    /* Mempool for allocations of struct nfs3_local */
+    struct mem_pool *localpool;
+
+    /* Server start-up timestamp, currently used for write verifier. */
+    uint64_t serverstart;
+
+    /* NFSv3 Protocol configurables */
+    uint64_t readsize;
+    uint64_t writesize;
+    uint64_t readdirsize;
+
+    /* Size of the iobufs used, depends on the sizes of the three params
+     * above.
+     */
+    uint64_t iobsize;
+
+    /* NOTE(review): fdlru/fdlrulock/fdcount look like an LRU list of cached
+     * fds, its lock and its length -- confirm against the users in nfs3.c.
+     */
+    struct list_head fdlru;
+    gf_lock_t fdlrulock;
+    int fdcount;
+    uint32_t occ_logger;
+} nfs3_state_t;
+
+/* Kind of lookup recorded in nfs3_local.lookuptype; values start at 1, so a
+ * zeroed call state matches neither enumerator.
+ */
+typedef enum nfs3_lookup_type {
+    GF_NFS3_REVALIDATE = 1,
+    GF_NFS3_FRESH,
+} nfs3_lookup_type_t;
+
+/* Overlay of the NLMv4 and ACL argument/result structures. It is embedded in
+ * struct nfs3_local as the member `args`; being a union, only one member is
+ * meaningful at a time.
+ */
+typedef union args_ {
+    nlm4_stat nlm4_stat;
+    nlm4_holder nlm4_holder;
+    nlm4_lock nlm4_lock;
+    nlm4_share nlm4_share;
+    nlm4_testrply nlm4_testrply;
+    nlm4_testres nlm4_testres;
+    nlm4_testargs nlm4_testargs;
+    nlm4_res nlm4_res;
+    nlm4_lockargs nlm4_lockargs;
+    nlm4_cancargs nlm4_cancargs;
+    nlm4_unlockargs nlm4_unlockargs;
+    nlm4_shareargs nlm4_shareargs;
+    nlm4_shareres nlm4_shareres;
+    nlm4_freeallargs nlm4_freeallargs;
+    getaclargs getaclargs;
+    setaclargs setaclargs;
+    getaclreply getaclreply;
+    setaclreply setaclreply;
+} args;
+
+typedef int (*nfs3_resume_fn_t)(void *cs);
+/* Structure used to communicate state between a fop and its callback.
+ * Not all members are used at all times. Usage is fop and NFS request
+ * dependent.
+ *
+ * I wish we could have a smaller structure for communicating state
+ * between callers and callbacks. It could be broken into smaller parts
+ * but I feel that will lead to a proliferation of types/structures and then
+ * we'll just be tracking down which structure is used by which fop, not
+ * to mention that having one type allows me to use a single mem-pool.
+ * Imagine the chaos if we need a mem-pool for each one of those sub-structures.
+ */
+struct nfs3_local {
+    /* Reference count; NLM code takes/drops it with GF_REF_GET/GF_REF_PUT. */
+    GF_REF_DECL;
+
+    rpcsvc_request_t *req;
+    xlator_t *vol;
+    /* Continuation invoked once an asynchronous step has completed. */
+    nfs3_resume_fn_t resume_fn;
+    xlator_t *nfsx;
+    struct nfs3_state *nfs3state;
+
+    /* The list hook to attach this call state to the inode's queue till
+     * the opening of the fd on the inode completes.
+     */
+    struct list_head openwait_q;
+
+    /* Per-NFSv3 Op state */
+    struct nfs3_fh parent;
+    struct nfs3_fh fh;
+    fd_t *fd;
+    uint32_t accessbits;
+    int operrno;
+    count3 dircount;
+    count3 maxcount;
+    struct statvfs fsstat;
+    gf_dirent_t entries;
+    struct iatt stbuf;
+    struct iatt preparent;
+    struct iatt postparent;
+    int32_t setattr_valid;
+    nfstime3 timestamp;
+    loc_t oploc;
+    int writetype;
+    count3 datacount;
+    offset3 dataoffset;
+    struct iobuf *iob;
+    struct iobref *iobref;
+    createmode3 createmode;
+    uint64_t cookieverf;
+    int sattrguardcheck;
+    char *pathname;
+    ftype3 mknodtype;
+    specdata3 devnums;
+    cookie3 cookie;
+    struct iovec datavec;
+    mode_t mode;
+    struct iatt attr_in;
+
+    /* NFSv3 FH resolver state */
+    int hardresolved;
+    struct nfs3_fh resolvefh;
+    loc_t resolvedloc;
+    /* Outcome of the resolve step; a negative resolve_ret means failure and
+     * resolve_errno is then translated into the protocol status.
+     */
+    int resolve_ret;
+    int resolve_errno;
+    int hashidx;
+    fd_t *resolve_dir_fd;
+    char *resolventry;
+    nfs3_lookup_type_t lookuptype;
+    gf_dirent_t *hashmatch;
+    gf_dirent_t *entrymatch;
+    off_t lastentryoffset;
+    /* NOTE(review): the members from here to `frame` appear to be used by
+     * the NLM (lock manager) code paths -- confirm against nlm4.c.
+     */
+    struct flock flock;
+    args args;
+    nlm4_lkowner_t lkowner;
+    char cookiebytes[1024];
+    struct nfs3_fh lockfh;
+    /* Set to 1 by nlm4_call_state_init(). */
+    int monitor;
+    rpc_transport_t *trans;
+    call_frame_t *frame;
+
+    /* ACL */
+    aclentry aclentry[NFS_ACL_MAX_ENTRIES];
+    aclentry daclentry[NFS_ACL_MAX_ENTRIES];
+    int aclcount;
+    char aclxattr[NFS_ACL_MAX_ENTRIES * 8 + 4];
+    int daclcount;
+    char daclxattr[NFS_ACL_MAX_ENTRIES * 8 + 4];
+};
+
+/* Predicates over a call state pointer (nfs3_call_state_t *). The argument
+ * is parenthesised in every expansion so that an expression argument such as
+ * `base + i` or `&local` binds as a whole before `->` is applied.
+ */
+#define nfs3_is_revalidate_lookup(cst) ((cst)->lookuptype == GF_NFS3_REVALIDATE)
+#define nfs3_lookup_op(cst) (rpcsvc_request_procnum((cst)->req) == NFS3_LOOKUP)
+#define nfs3_create_op(cst) (rpcsvc_request_procnum((cst)->req) == NFS3_CREATE)
+#define nfs3_create_exclusive_op(cst) ((cst)->createmode == EXCLUSIVE)
+
+typedef struct nfs3_local nfs3_call_state_t;
+
+/* Queue of ops waiting for open fop to return. */
+struct inode_op_queue {
+    /* Head of the queued operations.
+     * NOTE(review): presumably call states hang off this list through
+     * nfs3_local.openwait_q -- confirm.
+     */
+    struct list_head opq;
+    /* NOTE(review): presumably guards opq -- confirm. */
+    pthread_mutex_t qlock;
+};
+
+extern rpcsvc_program_t *
+nfs3svc_init(xlator_t *nfsx);
+
+extern int
+nfs3_reconfigure_state(xlator_t *nfsx, dict_t *options);
+
+extern uint64_t
+nfs3_request_xlator_deviceid(rpcsvc_request_t *req);
+
+#endif
diff --git a/xlators/nfs/server/src/nfsserver.sym b/xlators/nfs/server/src/nfsserver.sym
new file mode 100644
index 00000000000..dce7d964e9e
--- /dev/null
+++ b/xlators/nfs/server/src/nfsserver.sym
@@ -0,0 +1,12 @@
+exp_file_parse
+exp_file_print
+exp_file_get_dir
+exp_dir_get_host
+exp_dir_get_netgroup
+exp_file_dir_from_uuid
+exp_file_deinit
+ng_file_parse
+ng_file_get_netgroup
+ng_file_print
+ng_file_deinit
+xlator_api
diff --git a/xlators/nfs/server/src/nlm4.c b/xlators/nfs/server/src/nlm4.c
new file mode 100644
index 00000000000..577e8543966
--- /dev/null
+++ b/xlators/nfs/server/src/nlm4.c
@@ -0,0 +1,2786 @@
+/*
+  Copyright (c) 2012 Gluster, Inc. <http://www.gluster.com>
+  This file is part of GlusterFS.
+
+  This file is licensed to you under your choice of the GNU Lesser
+  General Public License, version 3 or any later version (LGPLv3 or
+  later), or the GNU General Public License, version 2 (GPLv2), in all
+  cases as published by the Free Software Foundation.
+*/
+
+#include <glusterfs/defaults.h>
+#include "rpcsvc.h"
+#include <glusterfs/dict.h>
+#include <glusterfs/xlator.h>
+#include "nfs.h"
+#include <glusterfs/mem-pool.h>
+#include <glusterfs/logging.h>
+#include <glusterfs/syscall.h>
+#include "nfs-fops.h"
+#include "mount3.h"
+#include "nfs3.h"
+#include "nfs-mem-types.h"
+#include "nfs3-helpers.h"
+#include "nfs3-fh.h"
+#include "nlm4.h"
+#include "nlm4-xdr.h"
+#include "msg-nfs3.h"
+#include "nfs-generics.h"
+#include "rpc-clnt.h"
+#include "nsm-xdr.h"
+#include <glusterfs/run.h>
+#include "nfs-messages.h"
+#include <unistd.h>
+#include <rpc/pmap_clnt.h>
+#include <rpc/rpc.h>
+#include <rpc/xdr.h>
+#include <glusterfs/statedump.h>
+
+#define KILLALL_CMD "pkill"
+
+/* TODO:
+ * 1) 2 opens racing .. creating an fd leak.
+ * 2) use GF_REF_* for nlm_clnt_t
+ */
+
+typedef ssize_t (*nlm4_serializer)(struct iovec outmsg, void *args);
+
+extern void
+nfs3_call_state_wipe(nfs3_call_state_t *cs);
+
+nfs3_call_state_t *
+nfs3_call_state_init(struct nfs3_state *s, rpcsvc_request_t *req, xlator_t *v);
+
+/* Global list of known NLM clients (nlm_client_t, linked through their
+ * nlm_clients member). The functions in this file walk and modify it while
+ * holding nlm_client_list_lk.
+ */
+struct list_head nlm_client_list;
+gf_lock_t nlm_client_list_lk;
+
+/* race on this is harmless */
+/* NOTE(review): unit of the grace period is not visible here -- confirm. */
+int nlm_grace_period = 50;
+
+/* NOTE(review): presumably records that one-time NLM setup has run. */
+static gf_boolean_t nlm4_inited = _gf_false;
+
+/* Fetch the NFSv3 state attached to @request's program into @state. When it
+ * is missing, flag SYSTEM_ERR on @request, set @status to nlm4_failed and
+ * jump to @label. @retval is accepted but not used.
+ *
+ * The error path operates on the @request argument itself rather than on
+ * whatever variable named `req` happens to be in scope at the call site.
+ * The trailing semicolon after `while (0)` is kept so that existing call
+ * sites expand exactly as before.
+ */
+#define nlm4_validate_nfs3_state(request, state, status, label, retval)        \
+    do {                                                                       \
+        state = rpcsvc_request_program_private(request);                       \
+        if (!state) {                                                          \
+            gf_msg(GF_NLM, GF_LOG_ERROR, errno, NFS_MSG_STATE_MISSING,         \
+                   "NFSv3 state "                                              \
+                   "missing from RPC request");                                \
+            rpcsvc_request_seterr((request), SYSTEM_ERR);                      \
+            status = nlm4_failed;                                              \
+            goto label;                                                        \
+        }                                                                      \
+    } while (0);
+
+/* Allocate a call state for request @rq into @calls via
+ * nlm4_call_state_init(). On failure set @opstat to nlm4_failed, flag
+ * SYSTEM_ERR on @rq and jump to @errlabel.
+ *
+ * The error path uses the @rq argument rather than a variable named `req`
+ * captured from the calling scope.
+ */
+#define nlm4_handle_call_state_init(nfs3state, calls, rq, opstat, errlabel)    \
+    do {                                                                       \
+        calls = nlm4_call_state_init((nfs3state), (rq));                       \
+        if (!calls) {                                                          \
+            gf_msg(GF_NLM, GF_LOG_ERROR, errno, NFS_MSG_INIT_CALL_STAT_FAIL,   \
+                   "Failed to "                                                \
+                   "init call state");                                         \
+            opstat = nlm4_failed;                                              \
+            rpcsvc_request_seterr((rq), SYSTEM_ERR);                           \
+            goto errlabel;                                                     \
+        }                                                                      \
+    } while (0)
+
+/* Set @status to nlm4_stale_fh and jump to @errlabel when
+ * nfs3_fh_validate() rejects @handle; otherwise fall through.
+ */
+#define nlm4_validate_gluster_fh(handle, status, errlabel)                     \
+    do {                                                                       \
+        if (!nfs3_fh_validate(handle)) {                                       \
+            status = nlm4_stale_fh;                                            \
+            goto errlabel;                                                     \
+        }                                                                      \
+    } while (0)
+
+xlator_t *
+nfs3_fh_to_xlator(struct nfs3_state *nfs3, struct nfs3_fh *fh);
+
+/* Resolve file handle @handle to its subvolume through nfs3_fh_to_xlator()
+ * and store the result in @volume. On success the volume is also recorded
+ * as @req's private data. On failure the client identifier, export id and
+ * gfid are logged, @status becomes nlm4_stale_fh and control jumps to
+ * @label.
+ *
+ * @handle must be an lvalue of struct type (not a pointer): the expansion
+ * uses `&handle` and `handle.exportid`. Note the trailing semicolon after
+ * `while (0)`.
+ */
+#define nlm4_map_fh_to_volume(nfs3state, handle, req, volume, status, label)   \
+    do {                                                                       \
+        char exportid[256], gfid[256];                                         \
+        rpc_transport_t *trans = NULL;                                         \
+        volume = nfs3_fh_to_xlator((nfs3state), &handle);                      \
+        if (!volume) {                                                         \
+            gf_uuid_unparse(handle.exportid, exportid);                        \
+            gf_uuid_unparse(handle.gfid, gfid);                                \
+            trans = rpcsvc_request_transport(req);                             \
+            gf_msg(GF_NLM, GF_LOG_ERROR, errno, NFS_MSG_FH_TO_VOL_FAIL,        \
+                   "Failed to map "                                            \
+                   "FH to vol: client=%s, exportid=%s, gfid=%s",               \
+                   trans->peerinfo.identifier, exportid, gfid);                \
+            gf_msg(GF_NLM, GF_LOG_ERROR, errno, NFS_MSG_VOLUME_ERROR,          \
+                   "Stale nfs client %s must be trying to "                    \
+                   "connect to a deleted volume, please "                      \
+                   "unmount it.",                                              \
+                   trans->peerinfo.identifier);                                \
+            status = nlm4_stale_fh;                                            \
+            goto label;                                                        \
+        } else {                                                               \
+            gf_msg_trace(GF_NLM, 0, "FH to Volume: %s", volume->name);         \
+            rpcsvc_request_set_private(req, volume);                           \
+        }                                                                      \
+    } while (0);
+
+/* When nfs_subvolume_started() reports that @vlm is not started, log it,
+ * set @rtval to RPCSVC_ACTOR_IGNORE and jump to @erlbl.
+ */
+#define nlm4_volume_started_check(nfs3state, vlm, rtval, erlbl)                \
+    do {                                                                       \
+        if ((!nfs_subvolume_started(nfs_state(nfs3state->nfsx), vlm))) {       \
+            gf_msg(GF_NLM, GF_LOG_ERROR, 0, NFS_MSG_VOL_DISABLE,               \
+                   "Volume is disabled: %s", vlm->name);                       \
+            rtval = RPCSVC_ACTOR_IGNORE;                                       \
+            goto erlbl;                                                        \
+        }                                                                      \
+    } while (0)
+
+/* When the file-handle resolution recorded in call state @cst failed
+ * (resolve_ret < 0), log the client, volume name and gfid, translate
+ * cst->resolve_errno into @nfstat with nlm4_errno_to_nlm4stat() and jump to
+ * @erlabl.
+ */
+#define nlm4_check_fh_resolve_status(cst, nfstat, erlabl)                      \
+    do {                                                                       \
+        xlator_t *xlatorp = NULL;                                              \
+        char buf[256], gfid[GF_UUID_BUF_SIZE];                                 \
+        rpc_transport_t *trans = NULL;                                         \
+        if ((cst)->resolve_ret < 0) {                                          \
+            trans = rpcsvc_request_transport(cst->req);                        \
+            xlatorp = nfs3_fh_to_xlator(cst->nfs3state, &cst->resolvefh);      \
+            gf_uuid_unparse(cst->resolvefh.gfid, gfid);                        \
+            snprintf(buf, sizeof(buf), "(%s) %s : %s",                         \
+                     trans->peerinfo.identifier,                               \
+                     xlatorp ? xlatorp->name : "ERR", gfid);                   \
+            gf_msg(GF_NLM, GF_LOG_ERROR, 0, NFS_MSG_RESOLVE_FH_FAIL,           \
+                   "Unable to resolve FH"                                      \
+                   ": %s",                                                     \
+                   buf);                                                       \
+            nfstat = nlm4_errno_to_nlm4stat(cst->resolve_errno);               \
+            goto erlabl;                                                       \
+        }                                                                      \
+    } while (0)
+
+/* Zero @args, then point its cookie, lock-owner and file-handle netobj
+ * buffers at the caller-supplied storage. Only the buffer pointers are set;
+ * the netobj lengths are left at zero.
+ */
+void
+nlm4_prep_nlm4_testargs(nlm4_testargs *args, struct nfs3_fh *fh,
+                        nlm4_lkowner_t *oh, char *cookiebytes)
+{
+    memset(args, 0, sizeof(nlm4_testargs));
+
+    args->cookie.nlm4_netobj_val = (void *)cookiebytes;
+    args->alock.oh.nlm4_netobj_val = (void *)oh;
+    args->alock.fh.nlm4_netobj_val = (void *)fh;
+}
+
+/* Zero @args, then point its cookie, lock-owner and file-handle netobj
+ * buffers at the caller-supplied storage. Only the buffer pointers are set;
+ * the netobj lengths are left at zero.
+ */
+void
+nlm4_prep_nlm4_lockargs(nlm4_lockargs *args, struct nfs3_fh *fh,
+                        nlm4_lkowner_t *oh, char *cookiebytes)
+{
+    memset(args, 0, sizeof(nlm4_lockargs));
+
+    args->cookie.nlm4_netobj_val = (void *)cookiebytes;
+    args->alock.oh.nlm4_netobj_val = (void *)oh;
+    args->alock.fh.nlm4_netobj_val = (void *)fh;
+}
+
+/* Zero @args, then point its cookie, lock-owner and file-handle netobj
+ * buffers at the caller-supplied storage. Only the buffer pointers are set;
+ * the netobj lengths are left at zero.
+ */
+void
+nlm4_prep_nlm4_cancargs(nlm4_cancargs *args, struct nfs3_fh *fh,
+                        nlm4_lkowner_t *oh, char *cookiebytes)
+{
+    memset(args, 0, sizeof(nlm4_cancargs));
+
+    args->cookie.nlm4_netobj_val = (void *)cookiebytes;
+    args->alock.oh.nlm4_netobj_val = (void *)oh;
+    args->alock.fh.nlm4_netobj_val = (void *)fh;
+}
+
+/* Zero @args, then point its cookie, lock-owner and file-handle netobj
+ * buffers at the caller-supplied storage. Only the buffer pointers are set;
+ * the netobj lengths are left at zero.
+ */
+void
+nlm4_prep_nlm4_unlockargs(nlm4_unlockargs *args, struct nfs3_fh *fh,
+                          nlm4_lkowner_t *oh, char *cookiebytes)
+{
+    memset(args, 0, sizeof(nlm4_unlockargs));
+
+    args->cookie.nlm4_netobj_val = (void *)cookiebytes;
+    args->alock.oh.nlm4_netobj_val = (void *)oh;
+    args->alock.fh.nlm4_netobj_val = (void *)fh;
+}
+
+/* Zero @args, then point the cookie and the share's owner and file-handle
+ * netobj buffers at the caller-supplied storage. Only the buffer pointers
+ * are set; the netobj lengths are left at zero.
+ */
+void
+nlm4_prep_shareargs(nlm4_shareargs *args, struct nfs3_fh *fh,
+                    nlm4_lkowner_t *oh, char *cookiebytes)
+{
+    memset(args, 0, sizeof(nlm4_shareargs));
+
+    args->cookie.nlm4_netobj_val = (void *)cookiebytes;
+    args->share.oh.nlm4_netobj_val = (void *)oh;
+    args->share.fh.nlm4_netobj_val = (void *)fh;
+}
+
+/* Zero @args and use the caller-supplied @oh storage as its name buffer. */
+void
+nlm4_prep_freeallargs(nlm4_freeallargs *args, nlm4_lkowner_t *oh)
+{
+    memset(args, 0, sizeof(nlm4_freeallargs));
+
+    args->name = (void *)oh;
+}
+
+/* Copy the lock-owner bytes of the NLM netobj @src into @dst and record
+ * their length.
+ *
+ * NOTE(review): the length comes from @src and is not checked against the
+ * capacity of dst->data here; presumably the XDR decoder bounds it --
+ * confirm before relying on it.
+ */
+void
+nlm_copy_lkowner(gf_lkowner_t *dst, nlm4_netobj *src)
+{
+    dst->len = src->nlm4_netobj_len;
+    memcpy(dst->data, src->nlm4_netobj_val, dst->len);
+}
+
+/* Compare lock owner @a with the NLM owner handle @b.
+ *
+ * Returns 1 when length and bytes match, 0 when they differ, and -1 (after
+ * logging) when either pointer is NULL. Note that -1 is non-zero, so a
+ * caller using the result as a plain boolean sees the error case as "same".
+ */
+int
+nlm_is_oh_same_lkowner(gf_lkowner_t *a, nlm4_netobj *b)
+{
+    if (a == NULL || b == NULL) {
+        gf_msg(GF_NLM, GF_LOG_ERROR, EINVAL, NFS_MSG_INVALID_ENTRY,
+               "invalid args");
+        return -1;
+    }
+
+    if (a->len != b->nlm4_netobj_len)
+        return 0;
+
+    return memcmp(a->data, b->nlm4_netobj_val, a->len) == 0;
+}
+
+/* Translate an errno value into an NLMv4 status code:
+ *   0      -> nlm4_granted
+ *   EROFS  -> nlm4_rofs
+ *   ESTALE -> nlm4_stale_fh
+ *   ENOLCK -> nlm4_failed
+ *   other  -> nlm4_denied
+ */
+nlm4_stats
+nlm4_errno_to_nlm4stat(int errnum)
+{
+    if (errnum == 0)
+        return nlm4_granted;
+
+    if (errnum == EROFS)
+        return nlm4_rofs;
+
+    if (errnum == ESTALE)
+        return nlm4_stale_fh;
+
+    if (errnum == ENOLCK)
+        return nlm4_failed;
+
+    return nlm4_denied;
+}
+
+/* Allocate an NFSv3 call state for an NLM request (no volume attached yet)
+ * and set its monitor flag to 1.
+ *
+ * Returns NULL when an argument is NULL or nfs3_call_state_init() fails.
+ */
+nfs3_call_state_t *
+nlm4_call_state_init(struct nfs3_state *s, rpcsvc_request_t *req)
+{
+    nfs3_call_state_t *state = NULL;
+
+    if (s == NULL || req == NULL)
+        return NULL;
+
+    state = nfs3_call_state_init(s, req, NULL);
+    if (state != NULL)
+        state->monitor = 1;
+
+    return state;
+}
+
+/* Mark the NLM client named @caller_name as monitored.
+ *
+ * Returns the client's previous nsm_monitor value, or -1 (after logging)
+ * when no client with that name is on the list.
+ */
+int
+nlm_monitor(char *caller_name)
+{
+    nlm_client_t *entry = NULL;
+    int previous = -1;
+
+    LOCK(&nlm_client_list_lk);
+    list_for_each_entry(entry, &nlm_client_list, nlm_clients)
+    {
+        if (strcmp(caller_name, entry->caller_name) != 0)
+            continue;
+
+        previous = entry->nsm_monitor;
+        entry->nsm_monitor = 1;
+        break;
+    }
+    UNLOCK(&nlm_client_list_lk);
+
+    if (previous == -1)
+        gf_msg(GF_NLM, GF_LOG_ERROR, 0, NFS_MSG_CALLER_NOT_FOUND,
+               "%s was not found in the nlmclnt list", caller_name);
+
+    return previous;
+}
+
+/* Look up the NLM client named @caller_name and return a new reference to
+ * its rpc client.
+ *
+ * Returns NULL when the client is unknown or has no rpc client attached.
+ */
+rpc_clnt_t *
+nlm_get_rpc_clnt(char *caller_name)
+{
+    nlm_client_t *entry = NULL;
+    rpc_clnt_t *clnt = NULL;
+
+    LOCK(&nlm_client_list_lk);
+    list_for_each_entry(entry, &nlm_client_list, nlm_clients)
+    {
+        if (strcmp(caller_name, entry->caller_name) != 0)
+            continue;
+
+        /* Take the reference while the list lock is still held. */
+        if (entry->rpc_clnt)
+            clnt = rpc_clnt_ref(entry->rpc_clnt);
+        break;
+    }
+    UNLOCK(&nlm_client_list_lk);
+
+    return clnt;
+}
+
+/* Tear down @nlmclnt: drop every fd it holds, unlink it from the lists it
+ * is on, release its name, clean up its rpc connection and free it.
+ *
+ * The entry is removed from nlm_client_list here; the visible caller
+ * (nlm_unset_rpc_clnt) invokes this with nlm_client_list_lk held.
+ */
+static void
+nlm_client_free(nlm_client_t *nlmclnt)
+{
+    nlm_fde_t *fde = NULL, *tmp = NULL;
+
+    gf_msg_trace(GF_NLM, 0, "removing nlm-client %s from the list",
+                 nlmclnt->caller_name);
+
+    /* _safe variant: each entry is unlinked and freed inside the loop. */
+    list_for_each_entry_safe(fde, tmp, &nlmclnt->fdes, fde_list)
+    {
+        fd_unref(fde->fd);
+        list_del(&fde->fde_list);
+        GF_FREE(fde);
+    }
+
+    list_del(&nlmclnt->fdes);
+    list_del(&nlmclnt->nlm_clients);
+    list_del(&nlmclnt->shares);
+
+    GF_FREE(nlmclnt->caller_name);
+
+    if (nlmclnt->rpc_clnt) {
+        /* cleanup the saved-frames before last unref */
+        rpc_clnt_connection_cleanup(&nlmclnt->rpc_clnt->conn);
+        /* rpc_clnt_connection_cleanup() calls rpc_clnt_unref() */
+    }
+
+    GF_FREE(nlmclnt);
+}
+
+/* Attach @rpc_clnt to the NLM client named @caller_name, creating the
+ * client entry when it does not exist yet. An rpc client that is already
+ * attached is left in place.
+ *
+ * Returns 0 on success and -1 when the entry or its name cannot be
+ * allocated.
+ */
+int
+nlm_set_rpc_clnt(rpc_clnt_t *rpc_clnt, char *caller_name)
+{
+    nlm_client_t *nlmclnt = NULL;
+    int nlmclnt_found = 0;
+    int ret = -1;
+
+    LOCK(&nlm_client_list_lk);
+    list_for_each_entry(nlmclnt, &nlm_client_list, nlm_clients)
+    {
+        if (!strcmp(caller_name, nlmclnt->caller_name)) {
+            nlmclnt_found = 1;
+            break;
+        }
+    }
+
+    if (!nlmclnt_found) {
+        nlmclnt = GF_CALLOC(1, sizeof(*nlmclnt), gf_nfs_mt_nlm4_nlmclnt);
+        if (nlmclnt == NULL)
+            goto ret;
+
+        /* Copy the name before publishing the entry: every list walk
+         * strcmp()s against caller_name, so an entry with a NULL name must
+         * never become visible on nlm_client_list.
+         */
+        nlmclnt->caller_name = gf_strdup(caller_name);
+        if (nlmclnt->caller_name == NULL) {
+            GF_FREE(nlmclnt);
+            goto ret;
+        }
+
+        INIT_LIST_HEAD(&nlmclnt->fdes);
+        INIT_LIST_HEAD(&nlmclnt->nlm_clients);
+        INIT_LIST_HEAD(&nlmclnt->shares);
+
+        list_add(&nlmclnt->nlm_clients, &nlm_client_list);
+    }
+
+    if (nlmclnt->rpc_clnt == NULL) {
+        nlmclnt->rpc_clnt = rpc_clnt_ref(rpc_clnt);
+    }
+    ret = 0;
+ret:
+    UNLOCK(&nlm_client_list_lk);
+    return ret;
+}
+
+/* Find the NLM client whose rpc client is @rpc and free it. Always
+ * returns 0, whether or not a matching client was found.
+ */
+int
+nlm_unset_rpc_clnt(rpc_clnt_t *rpc)
+{
+    nlm_client_t *entry = NULL;
+
+    LOCK(&nlm_client_list_lk);
+    list_for_each_entry(entry, &nlm_client_list, nlm_clients)
+    {
+        if (entry->rpc_clnt != rpc)
+            continue;
+
+        /* The entry is freed here, so leave the loop without touching it
+         * again.
+         */
+        nlm_client_free(entry);
+        break;
+    }
+    UNLOCK(&nlm_client_list_lk);
+
+    return 0;
+}
+
+/* Make sure an NLM client entry named @caller_name exists on
+ * nlm_client_list, creating it when necessary.
+ *
+ * Returns 0 when the entry exists or was created, -1 on allocation failure.
+ */
+int
+nlm_add_nlmclnt(char *caller_name)
+{
+    nlm_client_t *nlmclnt = NULL;
+    int nlmclnt_found = 0;
+    int ret = -1;
+
+    LOCK(&nlm_client_list_lk);
+    list_for_each_entry(nlmclnt, &nlm_client_list, nlm_clients)
+    {
+        if (!strcmp(caller_name, nlmclnt->caller_name)) {
+            nlmclnt_found = 1;
+            break;
+        }
+    }
+    if (!nlmclnt_found) {
+        nlmclnt = GF_CALLOC(1, sizeof(*nlmclnt), gf_nfs_mt_nlm4_nlmclnt);
+        if (nlmclnt == NULL) {
+            gf_msg(GF_NLM, GF_LOG_ERROR, ENOMEM, NFS_MSG_NO_MEMORY,
+                   "malloc error");
+            goto ret;
+        }
+
+        /* Copy the name before publishing the entry: every list walk
+         * strcmp()s against caller_name, so an entry with a NULL name must
+         * never become visible on nlm_client_list.
+         */
+        nlmclnt->caller_name = gf_strdup(caller_name);
+        if (nlmclnt->caller_name == NULL) {
+            gf_msg(GF_NLM, GF_LOG_ERROR, ENOMEM, NFS_MSG_NO_MEMORY,
+                   "malloc error");
+            GF_FREE(nlmclnt);
+            goto ret;
+        }
+
+        INIT_LIST_HEAD(&nlmclnt->fdes);
+        INIT_LIST_HEAD(&nlmclnt->nlm_clients);
+        INIT_LIST_HEAD(&nlmclnt->shares);
+
+        list_add(&nlmclnt->nlm_clients, &nlm_client_list);
+    }
+    ret = 0;
+ret:
+    UNLOCK(&nlm_client_list_lk);
+    return ret;
+}
+
+/* Serialise the reply structure @arg with @sfunc into a fresh iobuf and
+ * submit it as the response to @req.
+ *
+ * Returns 0 on success. On failure it returns -1, or whatever non-zero
+ * value iobref_add() reported. The local references on the iobuf and the
+ * iobref are dropped on every path.
+ */
+int
+nlm4svc_submit_reply(rpcsvc_request_t *req, void *arg, nlm4_serializer sfunc)
+{
+    struct nfs3_state *state = NULL;
+    struct iobref *refs = NULL;
+    struct iobuf *buf = NULL;
+    struct iovec vec = {
+        0,
+    };
+    ssize_t encoded = 0;
+    int ret = -1;
+
+    if (req == NULL)
+        return -1;
+
+    state = (struct nfs3_state *)rpcsvc_request_program_private(req);
+    if (state == NULL) {
+        gf_msg(GF_NLM, GF_LOG_ERROR, EINVAL, NFS_MSG_MNT_STATE_NOT_FOUND,
+               "mount state not found");
+        goto cleanup;
+    }
+
+    /* Buffer that will receive the XDR-encoded reply. */
+    buf = iobuf_get(state->iobpool);
+    if (buf == NULL) {
+        gf_msg(GF_NLM, GF_LOG_ERROR, ENOMEM, NFS_MSG_NO_MEMORY,
+               "Failed to get iobuf");
+        goto cleanup;
+    }
+    iobuf_to_iovec(buf, &vec);
+
+    /* Encode the C structure in arg into the buffer described by vec. */
+    encoded = sfunc(vec, arg);
+    if (encoded < 0) {
+        gf_msg(GF_NLM, GF_LOG_ERROR, errno, NFS_MSG_ENCODE_MSG_FAIL,
+               "Failed to encode message");
+        goto cleanup;
+    }
+    vec.iov_len = encoded;
+
+    refs = iobref_new();
+    if (refs == NULL) {
+        gf_msg(GF_NLM, GF_LOG_ERROR, ENOMEM, NFS_MSG_NO_MEMORY,
+               "Failed to get iobref");
+        goto cleanup;
+    }
+
+    ret = iobref_add(refs, buf);
+    if (ret != 0) {
+        gf_msg(GF_NLM, GF_LOG_ERROR, ENOMEM, NFS_MSG_NO_MEMORY,
+               "Failed to add iob to iobref");
+        goto cleanup;
+    }
+
+    /* Hand the encoded message over for transmission. */
+    ret = rpcsvc_submit_message(req, &vec, 1, NULL, 0, refs);
+    if (ret == -1) {
+        gf_msg(GF_NLM, GF_LOG_ERROR, errno, NFS_MSG_REP_SUBMIT_FAIL,
+               "Reply submission failed");
+        goto cleanup;
+    }
+
+    ret = 0;
+
+cleanup:
+    if (buf != NULL)
+        iobuf_unref(buf);
+    if (refs != NULL)
+        iobref_unref(refs);
+
+    return ret;
+}
+
+typedef int (*nlm4_resume_fn_t)(void *cs);
+
+/* Completion callback for the open wound by nlm4_file_open_and_resume().
+ *
+ * On success the fd stored in the call state is bound. On failure the
+ * errno of the open is recorded in the call state, so that
+ * nlm4_check_fh_resolve_status() translates the real error instead of
+ * whatever resolve_errno was left from an earlier step (a stale 0 would be
+ * mapped to nlm4_granted). The saved resume function is then invoked, the
+ * frame is destroyed and the call-state reference taken for frame->local is
+ * dropped.
+ */
+int32_t
+nlm4_file_open_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+                   int32_t op_ret, int32_t op_errno, fd_t *fd, dict_t *xdata)
+{
+    nfs3_call_state_t *cs = frame->local;
+
+    if (op_ret == 0)
+        fd_bind(cs->fd);
+    else
+        cs->resolve_errno = op_errno;
+    cs->resolve_ret = op_ret;
+    cs->resume_fn(cs);
+
+    frame->local = NULL;
+    STACK_DESTROY(frame->root);
+    GF_REF_PUT(cs);
+
+    return 0;
+}
+
+/* Ask the status monitor (NSM) on localhost to monitor the host named by
+ * @arg (a NUL-terminated string), registering the NLM callback program for
+ * notifications.
+ *
+ * Always returns NULL; failures are only logged. The `void *(void *)`
+ * signature is preserved for existing callers.
+ */
+void *
+nsm_monitor(void *arg)
+{
+    CLIENT *clnt = NULL;
+    enum clnt_stat ret;
+    struct mon nsm_mon;
+    struct sm_stat_res res;
+    struct timeval tout = {5, 0};
+    char *host = NULL;
+
+    /* Start from zeroed structures: the request's private-data bytes are
+     * sent over the wire and there is nothing to put in them, and the
+     * result must not be read uninitialised.
+     */
+    memset(&nsm_mon, 0, sizeof(nsm_mon));
+    memset(&res, 0, sizeof(res));
+
+    host = arg;
+    nsm_mon.mon_id.mon_name = gf_strdup(host);
+    nsm_mon.mon_id.my_id.my_name = gf_strdup("localhost");
+    if (!nsm_mon.mon_id.mon_name || !nsm_mon.mon_id.my_id.my_name) {
+        gf_msg(GF_NLM, GF_LOG_ERROR, ENOMEM, NFS_MSG_NO_MEMORY,
+               "Failed to copy host names for the NSM monitor request");
+        goto out;
+    }
+    nsm_mon.mon_id.my_id.my_prog = NLMCBK_PROGRAM;
+    nsm_mon.mon_id.my_id.my_vers = NLMCBK_V1;
+    nsm_mon.mon_id.my_id.my_proc = NLMCBK_SM_NOTIFY;
+    /* nothing to put in the private data */
+#define SM_PROG 100024
+#define SM_VERS 1
+#define SM_MON 2
+
+    /* create a connection to nsm on the localhost */
+    clnt = clnt_create("localhost", SM_PROG, SM_VERS, "tcp");
+    if (!clnt) {
+        gf_msg(GF_NLM, GF_LOG_ERROR, 0, NFS_MSG_CLNT_CREATE_ERROR, "%s",
+               clnt_spcreateerror("Clnt_create()"));
+        goto out;
+    }
+
+    ret = clnt_call(clnt, SM_MON, (xdrproc_t)xdr_mon, (caddr_t)&nsm_mon,
+                    (xdrproc_t)xdr_sm_stat_res, (caddr_t)&res, tout);
+    if (ret != RPC_SUCCESS) {
+        gf_msg(GF_NLM, GF_LOG_ERROR, 0, NFS_MSG_CLNT_CALL_ERROR,
+               "clnt_call(): %s", clnt_sperrno(ret));
+        goto out;
+    }
+    if (res.res_stat != STAT_SUCC) {
+        /* The RPC itself succeeded, so clnt_sperrno(ret) would only say
+         * "success"; report the status returned by the monitor instead.
+         */
+        gf_msg(GF_NLM, GF_LOG_ERROR, 0, NFS_MSG_CLNT_CALL_ERROR,
+               "clnt_call(): SM_MON rejected (res_stat=%d)",
+               (int)res.res_stat);
+        goto out;
+    }
+
+out:
+    GF_FREE(nsm_mon.mon_id.mon_name);
+    GF_FREE(nsm_mon.mon_id.my_id.my_name);
+    if (clnt != NULL)
+        clnt_destroy(clnt);
+    return NULL;
+}
+
+/* Return the NLM client entry named @caller_name, or NULL when the name is
+ * NULL or unknown. This function does not take nlm_client_list_lk itself;
+ * nlm_get_uniq() is the variant that acquires the lock around it.
+ */
+nlm_client_t *
+__nlm_get_uniq(char *caller_name)
+{
+    nlm_client_t *entry = NULL;
+    nlm_client_t *match = NULL;
+
+    if (caller_name == NULL)
+        return NULL;
+
+    list_for_each_entry(entry, &nlm_client_list, nlm_clients)
+    {
+        if (strcmp(caller_name, entry->caller_name) == 0) {
+            match = entry;
+            break;
+        }
+    }
+
+    return match;
+}
+
+/* Locked wrapper around __nlm_get_uniq(): look up the NLM client named
+ * @caller_name while holding nlm_client_list_lk.
+ *
+ * The lock is released before returning and no reference is taken on the
+ * entry handed back.
+ */
+nlm_client_t *
+nlm_get_uniq(char *caller_name)
+{
+    nlm_client_t *found = NULL;
+
+    LOCK(&nlm_client_list_lk);
+    {
+        found = __nlm_get_uniq(caller_name);
+    }
+    UNLOCK(&nlm_client_list_lk);
+
+    return found;
+}
+
+/* Make sure the call state @cs has an fd for the resolved inode that is
+ * associated with the requesting NLM client, then continue with @resume.
+ *
+ * When such an fd already exists, @resume runs immediately. Otherwise a new
+ * fd is created and an open is wound to the volume; @resume then runs from
+ * nlm4_file_open_cbk().
+ *
+ * Returns 0 when @resume was called or the open was wound, -ENOLCK when the
+ * client is unknown or the fd cannot be created, -ENOMEM when no frame can
+ * be created.
+ */
+int
+nlm4_file_open_and_resume(nfs3_call_state_t *cs, nlm4_resume_fn_t resume)
+{
+    fd_t *fd = NULL;
+    int ret = -1;
+    int flags = 0;
+    nlm_client_t *nlmclnt = NULL;
+    call_frame_t *frame = NULL;
+
+    /* Exclusive lock requests open write-only, all others read-only. */
+    if (cs->args.nlm4_lockargs.exclusive == _gf_false)
+        flags = O_RDONLY;
+    else
+        flags = O_WRONLY;
+
+    nlmclnt = nlm_get_uniq(cs->args.nlm4_lockargs.alock.caller_name);
+    if (nlmclnt == NULL) {
+        gf_msg(GF_NLM, GF_LOG_ERROR, ENOLCK, NFS_MSG_NO_MEMORY,
+               "nlm_get_uniq() "
+               "returned NULL");
+        ret = -ENOLCK;
+        goto err;
+    }
+    cs->resume_fn = resume;
+    /* fds on the inode are keyed by the address of the client entry. */
+    fd = fd_lookup_uint64(cs->resolvedloc.inode, (uint64_t)(uintptr_t)nlmclnt);
+    if (fd) {
+        cs->fd = fd;
+        cs->resolve_ret = 0;
+        cs->resume_fn(cs);
+        ret = 0;
+        /* "err" is the common exit label; this is the success path. */
+        goto err;
+    }
+
+    fd = fd_create_uint64(cs->resolvedloc.inode, (uint64_t)(uintptr_t)nlmclnt);
+    if (fd == NULL) {
+        gf_msg(GF_NLM, GF_LOG_ERROR, ENOLCK, NFS_MSG_NO_MEMORY,
+               "fd_create_uint64() returned NULL");
+        ret = -ENOLCK;
+        goto err;
+    }
+
+    cs->fd = fd;
+
+    frame = create_frame(cs->nfsx, cs->nfsx->ctx->pool);
+    if (!frame) {
+        gf_msg(GF_NLM, GF_LOG_ERROR, ENOMEM, NFS_MSG_NO_MEMORY,
+               "unable to create frame");
+        ret = -ENOMEM;
+        goto err;
+    }
+
+    frame->root->pid = NFS_PID;
+    frame->root->uid = rpcsvc_request_uid(cs->req);
+    frame->root->gid = rpcsvc_request_gid(cs->req);
+    /* The frame holds its own reference on cs; nlm4_file_open_cbk() drops
+     * it with GF_REF_PUT().
+     */
+    frame->local = GF_REF_GET(cs);
+    nfs_fix_groups(cs->nfsx, frame->root);
+
+    STACK_WIND_COOKIE(frame, nlm4_file_open_cbk, cs->vol, cs->vol,
+                      cs->vol->fops->open, &cs->resolvedloc, flags, cs->fd,
+                      NULL);
+    ret = 0;
+err:
+    return ret;
+}
+
+/* Send an nlm4_res reply carrying @cookie and status @stat for @req.
+ * Always returns 0; the result of the submission is not propagated.
+ */
+int
+nlm4_generic_reply(rpcsvc_request_t *req, nlm4_netobj cookie, nlm4_stats stat)
+{
+    nlm4_res reply;
+
+    memset(&reply, 0, sizeof(reply));
+    reply.stat.stat = stat;
+    reply.cookie = cookie;
+
+    nlm4svc_submit_reply(req, (void *)&reply,
+                         (nlm4_serializer)xdr_serialize_nlm4_res);
+
+    return 0;
+}
+
+/*
+ * NLM4 NULL procedure: answer with an empty payload.
+ * A NULL request is only logged. Always returns 0.
+ */
+int
+nlm4svc_null(rpcsvc_request_t *req)
+{
+    struct iovec dummyvec = {
+        0,
+    };
+
+    if (req) {
+        rpcsvc_submit_generic(req, &dummyvec, 1, NULL, 0, NULL);
+        return 0;
+    }
+
+    gf_msg(GF_NLM, GF_LOG_ERROR, EINVAL, NFS_MSG_INVALID_ENTRY,
+           "Got NULL request!");
+    return 0;
+}
+
+/*
+ * Fill an NLM4 lock holder description from a gf_flock.
+ *
+ * holder->exclusive is only written (set to 1) for write locks; for any
+ * other lock type it keeps whatever value the caller put there.
+ * Always returns 0.
+ */
+int
+nlm4_gf_flock_to_holder(nlm4_holder *holder, struct gf_flock *flock)
+{
+    if (flock->l_type == GF_LK_F_WRLCK)
+        holder->exclusive = 1;
+
+    holder->l_len = flock->l_len;
+    holder->l_offset = flock->l_start;
+    holder->svid = flock->l_pid;
+
+    return 0;
+}
+
+/*
+ * Translate an NLM4 lock description into a gf_flock.
+ *
+ * @excl selects a write lock when non-zero and a read lock otherwise.
+ * Offsets are taken as absolute (SEEK_SET). The lock owner is copied from
+ * the NLM owner handle. Always returns 0.
+ */
+int
+nlm4_lock_to_gf_flock(struct gf_flock *flock, nlm4_lock *lock, int excl)
+{
+    flock->l_type = excl ? F_WRLCK : F_RDLCK;
+    flock->l_whence = SEEK_SET;
+    flock->l_start = lock->l_offset;
+    flock->l_len = lock->l_len;
+    flock->l_pid = lock->svid;
+
+    nlm_copy_lkowner(&flock->l_owner, &lock->oh);
+
+    return 0;
+}
+
+/* Client-side procedure table for calls this server makes to an NLM client.
+ * Only NULL and GRANTED are populated, each with a NULL second member. */
+rpc_clnt_procedure_t nlm4_clnt_actors[NLM4_PROC_COUNT] = {
+    [NLM4_NULL] = {"NULL", NULL},
+    [NLM4_GRANTED] = {"GRANTED", NULL},
+};
+
+/* Procedure names, indexed by procedure number, for the table above. */
+char *nlm4_clnt_names[NLM4_PROC_COUNT] = {
+    [NLM4_NULL] = "NULL",
+    [NLM4_GRANTED] = "GRANTED",
+};
+
+/* RPC client program descriptor for NLMv4; passed to rpc_clnt_submit() when
+ * sending the GRANTED callback. */
+rpc_clnt_prog_t nlm4clntprog = {
+    .progname = "NLMv4",
+    .prognum = NLM_PROGRAM,
+    .progver = NLM_V4,
+    .numproc = NLM4_PROC_COUNT,
+    .proctable = nlm4_clnt_actors,
+    .procnames = nlm4_clnt_names,
+};
+
+/*
+ * Send the reply for an NLM4 TEST request.
+ *
+ * The cookie comes from the decoded test arguments in @cs. The holder is
+ * only filled in when the result is nlm4_denied and a @flock is supplied.
+ * Always returns 0.
+ */
+int
+nlm4_test_reply(nfs3_call_state_t *cs, nlm4_stats stat, struct gf_flock *flock)
+{
+    nlm4_testres reply;
+
+    memset(&reply, 0, sizeof(reply));
+    reply.stat.stat = stat;
+    reply.cookie = cs->args.nlm4_testargs.cookie;
+
+    if (flock && (stat == nlm4_denied))
+        nlm4_gf_flock_to_holder(&reply.stat.nlm4_testrply_u.holder, flock);
+
+    nlm4svc_submit_reply(cs->req, (void *)&reply,
+                         (nlm4_serializer)xdr_serialize_nlm4_testres);
+
+    return 0;
+}
+
+/*
+ * Callback of the F_GETLK issued for NLM4 TEST.
+ *
+ * A failed fop maps its errno to an NLM status; a returned lock of type
+ * F_UNLCK means no conflict (granted); anything else is reported as
+ * denied. The reply is sent and the call state is wiped. Always returns 0.
+ */
+int
+nlm4svc_test_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+                 int32_t op_ret, int32_t op_errno, struct gf_flock *flock,
+                 dict_t *xdata)
+{
+    nfs3_call_state_t *cs = frame->local;
+    nlm4_stats stat = nlm4_denied;
+
+    if (op_ret == -1)
+        stat = nlm4_errno_to_nlm4stat(op_errno);
+    else if (flock->l_type == F_UNLCK)
+        stat = nlm4_granted;
+
+    nlm4_test_reply(cs, stat, flock);
+    nfs3_call_state_wipe(cs);
+
+    return 0;
+}
+
+/*
+ * Issue the F_GETLK for an NLM4 TEST once cs->fd is available.
+ *
+ * @carg is the nfs3_call_state_t of the request. A reference is held on it
+ * for the duration of this function. Returns -EFAULT for a NULL @carg,
+ * otherwise the return value of nfs_lk().
+ */
+int
+nlm4_test_fd_resume(void *carg)
+{
+    struct gf_flock flock = {
+        0,
+    };
+    nfs_user_t nfu = {
+        0,
+    };
+    nfs3_call_state_t *cs = NULL;
+    int ret = -EFAULT;
+
+    if (carg == NULL)
+        return ret;
+
+    cs = GF_REF_GET((nfs3_call_state_t *)carg);
+
+    /* identity of the request: credentials plus NLM lock owner */
+    nfs_request_user_init(&nfu, cs->req);
+    nlm_copy_lkowner(&nfu.lk_owner, &cs->args.nlm4_testargs.alock.oh);
+
+    nlm4_lock_to_gf_flock(&flock, &cs->args.nlm4_testargs.alock,
+                          cs->args.nlm4_testargs.exclusive);
+
+    ret = nfs_lk(cs->nfsx, cs->vol, &nfu, cs->fd, F_GETLK, &flock,
+                 nlm4svc_test_cbk, cs);
+
+    GF_REF_PUT(cs);
+
+    return ret;
+}
+
+/*
+ * Resume handler for NLM4 TEST; nlm4svc_test() passes it to
+ * nfs3_fh_resolve_and_resume().
+ *
+ * Takes an anonymous fd on the resolved inode, stores it in cs->fd and
+ * continues with nlm4_test_fd_resume(). When ret is negative at the end, an
+ * error reply is sent and the call state is wiped here.
+ *
+ * Returns -1 for a NULL @carg, otherwise ret as described above.
+ */
+int
+nlm4_test_resume(void *carg)
+{
+    nlm4_stats stat = nlm4_failed;
+    int ret = -1;
+    nfs3_call_state_t *cs = NULL;
+    fd_t *fd = NULL;
+
+    if (!carg)
+        return ret;
+
+    cs = GF_REF_GET((nfs3_call_state_t *)carg);
+    /* presumably jumps to nlm4err when the fh resolution failed -- the macro
+     * is defined outside this chunk, TODO confirm */
+    nlm4_check_fh_resolve_status(cs, stat, nlm4err);
+    fd = fd_anonymous(cs->resolvedloc.inode);
+    if (!fd)
+        /* ret is still -1 here, so the error path reports errno 1 */
+        goto nlm4err;
+    cs->fd = fd;
+    ret = nlm4_test_fd_resume(cs);
+
+nlm4err:
+    if (ret < 0) {
+        gf_msg(GF_NLM, GF_LOG_ERROR, -ret, NFS_MSG_OPEN_FAIL,
+               "unable to open_and_resume");
+        stat = nlm4_errno_to_nlm4stat(-ret);
+        nlm4_test_reply(cs, stat, NULL);
+        nfs3_call_state_wipe(cs);
+    }
+
+    /* drop the reference taken at the top of this function */
+    GF_REF_PUT(cs);
+
+    return ret;
+}
+
+/*
+ * RPC actor for NLM4 TEST.
+ *
+ * Decodes the test arguments into a fresh call state, maps the file handle
+ * to a volume and hands over to nfs3_fh_resolve_and_resume() with
+ * nlm4_test_resume() as continuation. During the grace period the request
+ * is answered with nlm4_denied_grace_period.
+ *
+ * The nlm4_* helper macros take a label argument and presumably jump to it
+ * on failure (they are defined outside this chunk -- TODO confirm).
+ *
+ * Returns RPCSVC_ACTOR_ERROR for a NULL request, 0 when a reply was sent
+ * from here, otherwise ret.
+ */
+int
+nlm4svc_test(rpcsvc_request_t *req)
+{
+    xlator_t *vol = NULL;
+    nlm4_stats stat = nlm4_failed;
+    struct nfs_state *nfs = NULL;
+    nfs3_state_t *nfs3 = NULL;
+    nfs3_call_state_t *cs = NULL;
+    int ret = RPCSVC_ACTOR_ERROR;
+    struct nfs3_fh fh = {
+        {0},
+    };
+
+    if (!req)
+        return ret;
+
+    nlm4_validate_nfs3_state(req, nfs3, stat, rpcerr, ret);
+    nfs = nfs_state(nfs3->nfsx);
+    nlm4_handle_call_state_init(nfs->nfs3state, cs, req, stat, rpcerr);
+
+    /* point the argument structure at storage owned by cs / this frame
+     * before decoding into it */
+    nlm4_prep_nlm4_testargs(&cs->args.nlm4_testargs, &fh, &cs->lkowner,
+                            cs->cookiebytes);
+    if (xdr_to_nlm4_testargs(req->msg[0], &cs->args.nlm4_testargs) <= 0) {
+        gf_msg(GF_NLM, GF_LOG_ERROR, errno, NFS_MSG_ARGS_DECODE_ERROR,
+               "Error decoding args");
+        rpcsvc_request_seterr(req, GARBAGE_ARGS);
+        goto rpcerr;
+    }
+
+    nlm4_validate_gluster_fh(&fh, stat, nlm4err);
+    nlm4_map_fh_to_volume(cs->nfs3state, fh, req, vol, stat, nlm4err);
+
+    if (nlm_grace_period) {
+        gf_msg(GF_NLM, GF_LOG_WARNING, 0, NFS_MSG_NLM_GRACE_PERIOD,
+               "NLM in grace period");
+        stat = nlm4_denied_grace_period;
+        nlm4_test_reply(cs, stat, NULL);
+        nfs3_call_state_wipe(cs);
+        return 0;
+    }
+
+    cs->vol = vol;
+    nlm4_volume_started_check(nfs3, vol, ret, rpcerr);
+
+    ret = nfs3_fh_resolve_and_resume(cs, &fh, NULL, nlm4_test_resume);
+
+nlm4err:
+    /* NLM-level failure: the client still gets an NLM reply with stat */
+    if (ret < 0) {
+        gf_msg(GF_NLM, GF_LOG_ERROR, -ret, NFS_MSG_RESOLVE_ERROR,
+               "unable to resolve and resume");
+        nlm4_test_reply(cs, stat, NULL);
+        nfs3_call_state_wipe(cs);
+        return 0;
+    }
+
+rpcerr:
+    /* RPC-level failure: no NLM reply is sent from here, only cleanup */
+    if (ret < 0)
+        nfs3_call_state_wipe(cs);
+
+    return ret;
+}
+
+/* Context handed to the RPC client notify function and stored in
+ * frame->local while a GRANTED callback is being delivered. */
+struct nlm4_notify_args {
+    GF_REF_DECL; /* refcounting */
+
+    nfs3_call_state_t *cs; /* call state, w/ lock request details */
+    call_frame_t *frame;   /* frame to use for the reply */
+};
+
+/*
+ * Completion callback of the GRANTED RPC: releases the reference on the
+ * notify arguments stored in the frame's local pointer. Always returns 0.
+ */
+static int
+nlm4svc_send_granted_cbk(struct rpc_req *req, struct iovec *iov, int count,
+                         void *myframe)
+{
+    struct nlm4_notify_args *args = NULL;
+    call_frame_t *frame = NULL;
+
+    frame = myframe;
+    args = frame->local;
+
+    GF_REF_PUT(args);
+
+    return 0;
+}
+
+/*
+ * Release function installed by GF_REF_INIT() in nlm4_notify_init().
+ * Drops the call-state reference, destroys the stack of the callback frame
+ * and frees the structure. Expects ncf->frame to be set.
+ */
+static void
+nlm4_notify_free(struct nlm4_notify_args *ncf)
+{
+    GF_REF_PUT(ncf->cs);
+    STACK_DESTROY(ncf->frame->root);
+    GF_FREE(ncf);
+}
+
+/*
+ * Allocate refcounted notify arguments holding a reference on @cs.
+ *
+ * The frame member is left unset; callers assign it afterwards.
+ * Returns NULL when the allocation fails.
+ */
+static struct nlm4_notify_args *
+nlm4_notify_init(nfs3_call_state_t *cs)
+{
+    struct nlm4_notify_args *args = NULL;
+
+    args = GF_CALLOC(1, sizeof(struct nlm4_notify_args), gf_nfs_mt_nlm4_notify);
+    if (args) {
+        GF_REF_INIT(args, nlm4_notify_free);
+        args->cs = GF_REF_GET(cs);
+    }
+    /* on failure nothing is logged here; GF_CALLOC is expected to do so */
+
+    return args;
+}
+
+/* Forward declaration: nlm_rpcclnt_notify() below calls this on
+ * RPC_CLNT_CONNECT before its definition appears. */
+static int
+nlm_handle_connect(struct rpc_clnt *rpc_clnt, struct nlm4_notify_args *ncf);
+
+/*
+ * Notify function registered on the callback RPC client in
+ * nlm4_establish_callback().
+ *
+ * @mydata is the struct nlm4_notify_args given at registration; a NULL
+ * value makes the function return without acting. Always returns 0.
+ */
+int
+nlm_rpcclnt_notify(struct rpc_clnt *rpc_clnt, void *mydata, rpc_clnt_event_t fn,
+                   void *data)
+{
+    struct nlm4_notify_args *ncf = mydata;
+
+    GF_VALIDATE_OR_GOTO("NLM4-notify", ncf, out);
+
+    switch (fn) {
+        case RPC_CLNT_CONNECT:
+            /* connection is up: continue with the pending callback */
+            nlm_handle_connect(rpc_clnt, ncf);
+            break;
+
+        case RPC_CLNT_MSG:
+            break;
+
+        case RPC_CLNT_DISCONNECT:
+            nlm_unset_rpc_clnt(rpc_clnt);
+            break;
+
+        case RPC_CLNT_DESTROY:
+            /* the client is going away: release the notify arguments */
+            GF_REF_PUT(ncf);
+            break;
+
+        default:
+            break;
+    }
+out:
+    return 0;
+}
+
+/*
+ * Create an RPC client connection back to the NLM service of the peer that
+ * sent the lock request in @cs, so that a GRANTED message can be delivered.
+ *
+ * The peer's NLM port is looked up with pmap_getport(). A new
+ * nlm4_notify_args is created for @cs, takes over @cbk_frame and is
+ * registered as notify data; the GRANTED message itself is sent from the
+ * notify path once the connection is up.
+ *
+ * Returns the new rpc_clnt, or NULL when it could not be created or was
+ * released again because of an error.
+ *
+ * NOTE(review): when the connect fails after the notify registration, the
+ * rpc_clnt is unref'd and ncf is put here; if the unref also leads to an
+ * RPC_CLNT_DESTROY notification, ncf would be put twice -- confirm.
+ */
+void *
+nlm4_establish_callback(nfs3_call_state_t *cs, call_frame_t *cbk_frame)
+{
+    int ret = -1;
+    union gf_sock_union sock_union;
+    dict_t *options = NULL;
+    char peerip[INET6_ADDRSTRLEN + 1] = {0};
+    char *portstr = NULL;
+    char myip[INET6_ADDRSTRLEN + 1] = {0};
+    rpc_clnt_t *rpc_clnt = NULL;
+    int port = -1;
+    struct nlm4_notify_args *ncf = NULL;
+
+    GF_ASSERT(cs->nfsx);
+    THIS = cs->nfsx;
+
+    rpc_transport_get_peeraddr(cs->trans, NULL, 0, &sock_union.storage,
+                               sizeof(sock_union.storage));
+
+    switch (sock_union.sa.sa_family) {
+        case AF_INET6:
+            /* can not come here as NLM listens on IPv4 */
+            gf_msg(GF_NLM, GF_LOG_ERROR, EAFNOSUPPORT,
+                   NFS_MSG_UNSUPPORTED_VERSION,
+                   "NLM is not supported on IPv6 in this release");
+            goto err;
+            /*
+                            inet_ntop (AF_INET6,
+                                       &((struct sockaddr_in6
+               *)sockaddr)->sin6_addr, peerip, INET6_ADDRSTRLEN+1); break;
+            */
+        case AF_INET:
+            inet_ntop(AF_INET, &sock_union.sin.sin_addr, peerip,
+                      INET6_ADDRSTRLEN + 1);
+            inet_ntop(AF_INET,
+                      &(((struct sockaddr_in *)&cs->trans->myinfo.sockaddr)
+                            ->sin_addr),
+                      myip, INET6_ADDRSTRLEN + 1);
+
+            break;
+        default:
+            break;
+            /* FIXME: handle the error */
+    }
+
+    /* looks like libc rpc supports only ipv4 */
+    port = pmap_getport(&sock_union.sin, NLM_PROGRAM, NLM_V4, IPPROTO_TCP);
+
+    if (port == 0) {
+        gf_msg(GF_NLM, GF_LOG_ERROR, 0, NFS_MSG_GET_PORT_ERROR,
+               "Unable to get NLM port of the client."
+               " Is the firewall running on client?"
+               " OR Are RPC services running (rpcinfo -p)?");
+        goto err;
+    }
+
+    options = dict_new();
+    if (options == NULL) {
+        gf_msg(GF_NLM, GF_LOG_ERROR, ENOMEM, NFS_MSG_GFID_DICT_CREATE_FAIL,
+               "dict allocation failed");
+        goto err;
+    }
+
+    ret = dict_set_str(options, "transport-type", "socket");
+    if (ret == -1) {
+        gf_msg(GF_NLM, GF_LOG_ERROR, errno, NFS_MSG_DICT_SET_FAILED,
+               "dict_set_str error");
+        goto err;
+    }
+
+    ret = dict_set_dynstr(options, "remote-host", gf_strdup(peerip));
+    if (ret == -1) {
+        gf_msg(GF_NLM, GF_LOG_ERROR, errno, NFS_MSG_DICT_SET_FAILED,
+               "dict_set_str error");
+        goto err;
+    }
+
+    ret = gf_asprintf(&portstr, "%d", port);
+    if (ret == -1)
+        goto err;
+
+    ret = dict_set_dynstr(options, "remote-port", portstr);
+    if (ret == -1) {
+        gf_msg(GF_NLM, GF_LOG_ERROR, errno, NFS_MSG_DICT_SET_FAILED,
+               "dict_set_dynstr error");
+        goto err;
+    }
+
+    /* needed in case virtual IP is used */
+    ret = dict_set_dynstr(options, "transport.socket.source-addr",
+                          gf_strdup(myip));
+    if (ret == -1)
+        goto err;
+
+    ret = dict_set_str(options, "auth-null", "on");
+    if (ret == -1) {
+        gf_msg(GF_NLM, GF_LOG_ERROR, errno, NFS_MSG_DICT_SET_FAILED,
+               "dict_set_dynstr error");
+        goto err;
+    }
+
+    ncf = nlm4_notify_init(cs);
+    if (!ncf) {
+        ret = -1;
+        goto err;
+    }
+
+    /* from here on ncf owns cbk_frame; releasing ncf destroys the frame */
+    ncf->frame = cbk_frame;
+    ncf->frame->local = ncf;
+
+    /* TODO: is 32 frames in transit enough ? */
+    rpc_clnt = rpc_clnt_new(options, cs->nfsx, "NLM-client", 32);
+    if (rpc_clnt == NULL) {
+        gf_msg(GF_NLM, GF_LOG_ERROR, EINVAL, NFS_MSG_INVALID_ENTRY,
+               "rpc_clnt NULL");
+        /* ret is still 0 from the last dict_set_str(); mark the failure so
+         * the cleanup below releases ncf instead of leaking it */
+        ret = -1;
+        goto err;
+    }
+
+    ret = rpc_clnt_register_notify(rpc_clnt, nlm_rpcclnt_notify, ncf);
+    if (ret == -1) {
+        gf_msg(GF_NLM, GF_LOG_ERROR, errno, NFS_MSG_RPC_CLNT_ERROR,
+               "rpc_clnt_register_connect error");
+        goto err;
+    }
+
+    /* After this connect succeeds, granted msg is sent in notify */
+    ret = rpc_transport_connect(rpc_clnt->conn.trans, port);
+
+    /* a connect that is still in progress is not a failure */
+    if (ret == -1 && EINPROGRESS == errno)
+        ret = 0;
+
+err:
+    if (options)
+        dict_unref(options);
+    if (ret == -1) {
+        if (rpc_clnt) {
+            rpc_clnt_unref(rpc_clnt);
+            /* never hand a released client back to the caller */
+            rpc_clnt = NULL;
+        }
+        if (ncf)
+            GF_REF_PUT(ncf);
+    }
+
+    return rpc_clnt;
+}
+
+/*
+ * Send an NLM4 GRANTED callback for the lock request described by ncf->cs.
+ *
+ * If no RPC client is registered for the caller name yet, the connection is
+ * set up with nlm4_establish_callback() and this function returns; the
+ * comment in that function says the message is then sent from the notify
+ * path. Otherwise the lock arguments are serialized as nlm4_testargs and
+ * submitted on the existing client.
+ */
+static void
+nlm4svc_send_granted(struct nlm4_notify_args *ncf)
+{
+    int ret = -1;
+    nfs3_call_state_t *cs = ncf->cs;
+    rpc_clnt_t *rpc_clnt = NULL;
+    struct iovec outmsg = {
+        0,
+    };
+    nlm4_testargs testargs;
+    struct iobuf *iobuf = NULL;
+    struct iobref *iobref = NULL;
+    char peerip[INET6_ADDRSTRLEN + 1];
+    union gf_sock_union sock_union;
+
+    rpc_clnt = nlm_get_rpc_clnt(cs->args.nlm4_lockargs.alock.caller_name);
+    if (rpc_clnt == NULL) {
+        /* NOTE(review): the return value is ignored and a second
+         * nlm4_notify_args is created inside for the same frame; confirm
+         * who releases the ncf passed to this function */
+        nlm4_establish_callback(cs, ncf->frame);
+        return;
+    }
+
+    rpc_transport_get_peeraddr(cs->trans, NULL, 0, &sock_union.storage,
+                               sizeof(sock_union.storage));
+
+    /* NOTE(review): peerip is filled in here but not read again in this
+     * function */
+    switch (sock_union.sa.sa_family) {
+        case AF_INET6:
+            inet_ntop(AF_INET6, &sock_union.sin6.sin6_addr, peerip,
+                      INET6_ADDRSTRLEN + 1);
+            break;
+        case AF_INET:
+            inet_ntop(AF_INET, &sock_union.sin.sin_addr, peerip,
+                      INET6_ADDRSTRLEN + 1);
+            break;
+        default:
+            break;
+    }
+
+    /* GRANTED carries test-style arguments built from the lock request */
+    testargs.cookie = cs->args.nlm4_lockargs.cookie;
+    testargs.exclusive = cs->args.nlm4_lockargs.exclusive;
+    testargs.alock = cs->args.nlm4_lockargs.alock;
+
+    iobuf = iobuf_get(cs->nfs3state->iobpool);
+    if (!iobuf) {
+        gf_msg(GF_NLM, GF_LOG_ERROR, ENOMEM, NFS_MSG_NO_MEMORY,
+               "Failed to get iobuf");
+        goto ret;
+    }
+
+    iobuf_to_iovec(iobuf, &outmsg);
+    /* Use the given serializer to translate the given C structure in arg
+     * to XDR format which will be written into the buffer in outmsg.
+     * NOTE(review): the serializer's result is stored as the length without
+     * an error check -- confirm it cannot be negative here.
+     */
+    outmsg.iov_len = xdr_serialize_nlm4_testargs(outmsg, &testargs);
+
+    iobref = iobref_new();
+    if (iobref == NULL) {
+        gf_msg(GF_NLM, GF_LOG_ERROR, ENOMEM, NFS_MSG_NO_MEMORY,
+               "Failed to get iobref");
+        goto ret;
+    }
+
+    ret = iobref_add(iobref, iobuf);
+    if (ret) {
+        gf_msg(GF_NLM, GF_LOG_ERROR, ENOMEM, NFS_MSG_NO_MEMORY,
+               "Failed to add iob to iobref");
+        goto ret;
+    }
+
+    /* reference for the in-flight RPC; nlm4svc_send_granted_cbk puts it */
+    GF_REF_GET(ncf);
+    ret = rpc_clnt_submit(rpc_clnt, &nlm4clntprog, NLM4_GRANTED,
+                          nlm4svc_send_granted_cbk, &outmsg, 1, NULL, 0, iobref,
+                          ncf->frame, NULL, 0, NULL, 0, NULL);
+
+    if (ret < 0) {
+        gf_msg(GF_NLM, GF_LOG_ERROR, -ret, NFS_MSG_RPC_CLNT_ERROR,
+               "rpc_clnt_submit error");
+        goto ret;
+    }
+ret:
+    if (iobref)
+        iobref_unref(iobref);
+    if (iobuf)
+        iobuf_unref(iobuf);
+
+    /* release the client obtained from nlm_get_rpc_clnt() */
+    rpc_clnt_unref(rpc_clnt);
+    return;
+}
+
+/*
+ * Free the NLM client entry whose caller name equals @caller_name, if one
+ * is on the client list. The lookup and the free both happen under
+ * nlm_client_list_lk. Always returns 0.
+ */
+int
+nlm_cleanup_fds(char *caller_name)
+{
+    nlm_client_t *entry = NULL;
+    nlm_client_t *match = NULL;
+
+    LOCK(&nlm_client_list_lk);
+    list_for_each_entry(entry, &nlm_client_list, nlm_clients)
+    {
+        if (strcmp(caller_name, entry->caller_name) == 0) {
+            match = entry;
+            break;
+        }
+    }
+
+    if (match)
+        nlm_client_free(match);
+    UNLOCK(&nlm_client_list_lk);
+
+    return 0;
+}
+
+/*
+ * Drop the fd entry that the client named in @lk keeps for @fd.
+ *
+ * The entry is only removed when its transit count is zero. The unlink
+ * from the list happens under nlm_client_list_lk; the fd unref and the
+ * free are done after the lock has been released.
+ */
+void
+nlm_search_and_delete(fd_t *fd, nlm4_lock *lk)
+{
+    nlm_client_t *entry = NULL;
+    nlm_client_t *client = NULL;
+    nlm_fde_t *cur = NULL;
+    nlm_fde_t *victim = NULL;
+
+    LOCK(&nlm_client_list_lk);
+    list_for_each_entry(entry, &nlm_client_list, nlm_clients)
+    {
+        if (strcmp(lk->caller_name, entry->caller_name) == 0) {
+            client = entry;
+            break;
+        }
+    }
+
+    if (client) {
+        list_for_each_entry(cur, &client->fdes, fde_list)
+        {
+            if (cur->fd == fd) {
+                victim = cur;
+                break;
+            }
+        }
+    }
+
+    if (victim) {
+        if (victim->transit_cnt)
+            /* requests still in transit: leave the entry alone */
+            victim = NULL;
+        else
+            list_del(&victim->fde_list);
+    }
+    UNLOCK(&nlm_client_list_lk);
+
+    if (victim) {
+        fd_unref(victim->fd);
+        GF_FREE(victim);
+    }
+    return;
+}
+
+/*
+ * Decrement the transit count of the fd entry that client @caller_name
+ * keeps for @fd.
+ *
+ * Returns the count after the decrement, or -1 when either the client or
+ * the fd entry is not found.
+ */
+int
+nlm_dec_transit_count(fd_t *fd, char *caller_name)
+{
+    nlm_client_t *entry = NULL;
+    nlm_client_t *client = NULL;
+    nlm_fde_t *cur = NULL;
+    nlm_fde_t *match = NULL;
+    int remaining = -1;
+
+    LOCK(&nlm_client_list_lk);
+    list_for_each_entry(entry, &nlm_client_list, nlm_clients)
+    {
+        if (strcmp(caller_name, entry->caller_name) == 0) {
+            client = entry;
+            break;
+        }
+    }
+
+    if (client == NULL) {
+        gf_msg(GF_NLM, GF_LOG_ERROR, 0, NFS_MSG_NLMCLNT_NOT_FOUND,
+               "nlmclnt not found");
+        goto unlock;
+    }
+
+    list_for_each_entry(cur, &client->fdes, fde_list)
+    {
+        if (cur->fd == fd) {
+            match = cur;
+            break;
+        }
+    }
+
+    if (match)
+        remaining = --match->transit_cnt;
+
+unlock:
+    UNLOCK(&nlm_client_list_lk);
+    return remaining;
+}
+
+/*
+ * Make sure client @caller_name has an fd entry for @fd and bump that
+ * entry's transit count.
+ *
+ * A new entry (holding its own reference on @fd) is created when none
+ * exists yet. If that allocation fails, no entry is added and no count is
+ * changed.
+ *
+ * Returns the client entry, or NULL when no client with that name is on
+ * the list.
+ */
+nlm_client_t *
+nlm_search_and_add(fd_t *fd, char *caller_name)
+{
+    nlm_fde_t *fde = NULL;
+    nlm_client_t *nlmclnt = NULL;
+    int nlmclnt_found = 0;
+    int fde_found = 0;
+
+    LOCK(&nlm_client_list_lk);
+    list_for_each_entry(nlmclnt, &nlm_client_list, nlm_clients)
+    {
+        if (!strcmp(caller_name, nlmclnt->caller_name)) {
+            nlmclnt_found = 1;
+            break;
+        }
+    }
+
+    if (!nlmclnt_found) {
+        gf_msg(GF_NLM, GF_LOG_ERROR, 0, NFS_MSG_NLMCLNT_NOT_FOUND,
+               "nlmclnt not found");
+        /* after a full traversal the cursor does not point at a real
+         * entry, so reset it before returning */
+        nlmclnt = NULL;
+        goto ret;
+    }
+
+    list_for_each_entry(fde, &nlmclnt->fdes, fde_list)
+    {
+        if (fde->fd == fd) {
+            fde_found = 1;
+            break;
+        }
+    }
+
+    if (fde_found)
+        goto ret;
+
+    fde = GF_CALLOC(1, sizeof(*fde), gf_nfs_mt_nlm4_fde);
+    if (!fde)
+        /* out of memory: fde is NULL, so the increment below is skipped
+         * instead of dereferencing a NULL pointer */
+        goto ret;
+
+    fde->fd = fd_ref(fd);
+    list_add(&fde->fde_list, &nlmclnt->fdes);
+ret:
+    if (nlmclnt_found && fde)
+        fde->transit_cnt++;
+    UNLOCK(&nlm_client_list_lk);
+    return nlmclnt;
+}
+
+/*
+ * Callback of the lk() fop issued for NLM4 LOCK / NM_LOCK.
+ *
+ * Decrements the transit count of the fd entry. On failure, the entry is
+ * removed when the count has reached zero. On success, monitoring of the
+ * caller is started in a new thread when requested and not yet active.
+ * Non-blocking requests are answered here and the call state is wiped;
+ * blocking requests trigger a GRANTED callback instead.
+ *
+ * Always returns 0.
+ */
+int
+nlm4svc_lock_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+                 int32_t op_ret, int32_t op_errno, struct gf_flock *flock,
+                 dict_t *xdata)
+{
+    nlm4_stats stat = nlm4_denied;
+    int transit_cnt = -1;
+    char *caller_name = NULL;
+    nfs3_call_state_t *cs = NULL;
+    pthread_t thr;
+    struct nlm4_notify_args *ncf = NULL;
+
+    cs = frame->local;
+    caller_name = cs->args.nlm4_lockargs.alock.caller_name;
+    transit_cnt = nlm_dec_transit_count(cs->fd, caller_name);
+
+    if (op_ret == -1) {
+        if (transit_cnt == 0)
+            nlm_search_and_delete(cs->fd, &cs->args.nlm4_lockargs.alock);
+        stat = nlm4_errno_to_nlm4stat(op_errno);
+        goto err;
+    } else {
+        stat = nlm4_granted;
+        if (cs->monitor && !nlm_monitor(caller_name)) {
+            /* FIXME: handle nsm_monitor failure */
+            (void)gf_thread_create(&thr, NULL, nsm_monitor, (void *)caller_name,
+                                   "nlmmon");
+        }
+    }
+
+err:
+    if (cs->args.nlm4_lockargs.block) {
+        /* NOTE(review): stat is not consulted in this branch, so a GRANTED
+         * callback is attempted even when the lock fop failed, and the call
+         * state is not wiped here -- confirm this is intended. */
+        ncf = nlm4_notify_init(cs);
+        if (ncf) {
+            /* NOTE(review): the copy_frame() result is dereferenced
+             * without a NULL check. */
+            ncf->frame = copy_frame(frame);
+            ncf->frame->local = ncf;
+            nlm4svc_send_granted(ncf);
+        }
+    } else {
+        nlm4_generic_reply(cs->req, cs->args.nlm4_lockargs.cookie, stat);
+        nfs3_call_state_wipe(cs);
+    }
+
+    return 0;
+}
+
+/*
+ * Issue the lock fop for NLM4 LOCK once cs->fd is available.
+ *
+ * Registers the fd with the client entry (bumping its transit count), then
+ * calls nfs_lk(): F_SETLKW for blocking requests, after first replying
+ * nlm4_blocked, and F_SETLK otherwise. When ret is negative at the end, an
+ * error reply is sent and the call state is wiped here.
+ *
+ * Returns -EFAULT for a NULL @carg, 0 for blocking requests, otherwise the
+ * result of nfs_lk() or the value left in ret by the resolve check.
+ */
+int
+nlm4_lock_fd_resume(void *carg)
+{
+    nlm4_stats stat = nlm4_denied;
+    int ret = -EFAULT;
+    nfs_user_t nfu = {
+        0,
+    };
+    nfs3_call_state_t *cs = NULL;
+    struct gf_flock flock = {
+        0,
+    };
+
+    if (!carg)
+        return ret;
+
+    cs = GF_REF_GET((nfs3_call_state_t *)carg);
+    /* presumably jumps to nlm4err when the fh/fd resolution failed -- the
+     * macro is defined outside this chunk, TODO confirm */
+    nlm4_check_fh_resolve_status(cs, stat, nlm4err);
+    (void)nlm_search_and_add(cs->fd, cs->args.nlm4_lockargs.alock.caller_name);
+    nfs_request_user_init(&nfu, cs->req);
+    nlm4_lock_to_gf_flock(&flock, &cs->args.nlm4_lockargs.alock,
+                          cs->args.nlm4_lockargs.exclusive);
+    nlm_copy_lkowner(&nfu.lk_owner, &cs->args.nlm4_lockargs.alock.oh);
+    if (cs->args.nlm4_lockargs.block) {
+        /* tell the client right away that the request is queued */
+        nlm4_generic_reply(cs->req, cs->args.nlm4_lockargs.cookie,
+                           nlm4_blocked);
+        ret = nfs_lk(cs->nfsx, cs->vol, &nfu, cs->fd, F_SETLKW, &flock,
+                     nlm4svc_lock_cbk, cs);
+        /* FIXME: handle error from nfs_lk() specially  by just
+         * cleaning up cs and unblock the client lock request.
+         */
+        /* the nfs_lk() result is discarded: a reply has already been sent */
+        ret = 0;
+    } else
+        ret = nfs_lk(cs->nfsx, cs->vol, &nfu, cs->fd, F_SETLK, &flock,
+                     nlm4svc_lock_cbk, cs);
+
+nlm4err:
+    if (ret < 0) {
+        stat = nlm4_errno_to_nlm4stat(-ret);
+        gf_msg(GF_NLM, GF_LOG_ERROR, stat, NFS_MSG_LOCK_FAIL,
+               "unable to call lk()");
+        nlm4_generic_reply(cs->req, cs->args.nlm4_lockargs.cookie, stat);
+        nfs3_call_state_wipe(cs);
+    }
+
+    /* drop the reference taken at the top of this function */
+    GF_REF_PUT(cs);
+
+    return ret;
+}
+
+/*
+ * Resume handler for NLM4 LOCK; nlm4svc_lock_common() passes it to
+ * nfs3_fh_resolve_and_resume().
+ *
+ * Continues with nlm4_file_open_and_resume(), which calls
+ * nlm4_lock_fd_resume() once an fd is attached. When ret is negative at the
+ * end, an error reply is sent and the call state is wiped here.
+ *
+ * Returns -1 for a NULL @carg, otherwise ret as described above.
+ */
+int
+nlm4_lock_resume(void *carg)
+{
+    nlm4_stats stat = nlm4_failed;
+    int ret = -1;
+    nfs3_call_state_t *cs = NULL;
+
+    if (!carg)
+        return ret;
+
+    cs = GF_REF_GET((nfs3_call_state_t *)carg);
+    /* presumably jumps to nlm4err when the fh resolution failed -- the macro
+     * is defined outside this chunk, TODO confirm */
+    nlm4_check_fh_resolve_status(cs, stat, nlm4err);
+    ret = nlm4_file_open_and_resume(cs, nlm4_lock_fd_resume);
+
+nlm4err:
+    if (ret < 0) {
+        gf_msg(GF_NLM, GF_LOG_ERROR, -ret, NFS_MSG_OPEN_FAIL,
+               "unable to open and resume");
+        stat = nlm4_errno_to_nlm4stat(-ret);
+        nlm4_generic_reply(cs->req, cs->args.nlm4_lockargs.cookie, stat);
+        nfs3_call_state_wipe(cs);
+    }
+
+    /* drop the reference taken at the top of this function */
+    GF_REF_PUT(cs);
+
+    return ret;
+}
+
+/*
+ * Common RPC actor body for NLM4 LOCK (@mon != 0) and NM_LOCK (@mon == 0).
+ *
+ * Decodes the lock arguments into a fresh call state, maps the file handle
+ * to a volume, registers the caller as an NLM client and hands over to
+ * nfs3_fh_resolve_and_resume() with nlm4_lock_resume() as continuation.
+ * During the grace period only reclaim requests are let through; others
+ * are answered with nlm4_denied_grace_period.
+ *
+ * The nlm4_* helper macros take a label argument and presumably jump to it
+ * on failure (they are defined outside this chunk -- TODO confirm).
+ *
+ * Returns RPCSVC_ACTOR_ERROR for a NULL request, 0 when a reply was sent
+ * from here, otherwise ret.
+ */
+int
+nlm4svc_lock_common(rpcsvc_request_t *req, int mon)
+{
+    int ret = RPCSVC_ACTOR_ERROR;
+    nlm4_stats stat = nlm4_failed;
+    struct nfs3_fh fh = {
+        {0},
+    };
+    xlator_t *vol = NULL;
+    nfs3_state_t *nfs3 = NULL;
+    nfs3_call_state_t *cs = NULL;
+    struct nfs_state *nfs = NULL;
+
+    if (!req)
+        return ret;
+
+    nlm4_validate_nfs3_state(req, nfs3, stat, rpcerr, ret);
+    nfs = nfs_state(nfs3->nfsx);
+    nlm4_handle_call_state_init(nfs->nfs3state, cs, req, stat, rpcerr);
+
+    /* the file handle is decoded into cs->lockfh so it stays available for
+     * the lifetime of the call state */
+    nlm4_prep_nlm4_lockargs(&cs->args.nlm4_lockargs, &cs->lockfh, &cs->lkowner,
+                            cs->cookiebytes);
+    if (xdr_to_nlm4_lockargs(req->msg[0], &cs->args.nlm4_lockargs) <= 0) {
+        gf_msg(GF_NLM, GF_LOG_ERROR, errno, NFS_MSG_ARGS_DECODE_ERROR,
+               "Error decoding args");
+        rpcsvc_request_seterr(req, GARBAGE_ARGS);
+        goto rpcerr;
+    }
+
+    fh = cs->lockfh;
+    cs->monitor = mon;
+    nlm4_validate_gluster_fh(&fh, stat, nlm4err);
+    nlm4_map_fh_to_volume(cs->nfs3state, fh, req, vol, stat, nlm4err);
+
+    if (nlm_grace_period && !cs->args.nlm4_lockargs.reclaim) {
+        gf_msg(GF_NLM, GF_LOG_WARNING, 0, NFS_MSG_NLM_GRACE_PERIOD,
+               "NLM in grace period");
+        stat = nlm4_denied_grace_period;
+        /* the arguments were decoded as lockargs, so read the cookie
+         * through that member */
+        nlm4_generic_reply(req, cs->args.nlm4_lockargs.cookie, stat);
+        nfs3_call_state_wipe(cs);
+        return 0;
+    }
+
+    cs->vol = vol;
+    cs->trans = rpcsvc_request_transport_ref(req);
+    nlm4_volume_started_check(nfs3, vol, ret, rpcerr);
+
+    /* NOTE(review): this result is overwritten by the next statement, so a
+     * failure to register the client is not acted upon here. */
+    ret = nlm_add_nlmclnt(cs->args.nlm4_lockargs.alock.caller_name);
+
+    ret = nfs3_fh_resolve_and_resume(cs, &fh, NULL, nlm4_lock_resume);
+
+nlm4err:
+    /* NLM-level failure: the client still gets an NLM reply with stat */
+    if (ret < 0) {
+        gf_msg(GF_NLM, GF_LOG_ERROR, -ret, NFS_MSG_RESOLVE_ERROR,
+               "unable to resolve and resume");
+        nlm4_generic_reply(cs->req, cs->args.nlm4_lockargs.cookie, stat);
+        nfs3_call_state_wipe(cs);
+        return 0;
+    }
+
+rpcerr:
+    /* RPC-level failure: no NLM reply is sent from here, only cleanup */
+    if (ret < 0) {
+        nfs3_call_state_wipe(cs);
+    }
+
+    return ret;
+}
+
+/* RPC actor for NLM4 LOCK: the common lock path with monitoring enabled. */
+int
+nlm4svc_lock(rpcsvc_request_t *req)
+{
+    const int monitor = 1;
+
+    return nlm4svc_lock_common(req, monitor);
+}
+
+/* RPC actor for NLM4 NM_LOCK: the common lock path without monitoring. */
+int
+nlm4svc_nm_lock(rpcsvc_request_t *req)
+{
+    const int monitor = 0;
+
+    return nlm4svc_lock_common(req, monitor);
+}
+
+/*
+ * Callback of the unlock fop issued for NLM4 CANCEL.
+ *
+ * On success the status is nlm4_granted and the fd entry of the caller is
+ * dropped (subject to its transit count); on failure the errno is mapped
+ * to an NLM status. The reply is sent and the call state is wiped.
+ * Always returns 0.
+ */
+int
+nlm4svc_cancel_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+                   int32_t op_ret, int32_t op_errno, struct gf_flock *flock,
+                   dict_t *xdata)
+{
+    nlm4_stats stat = nlm4_denied;
+    nfs3_call_state_t *cs = NULL;
+
+    cs = frame->local;
+    if (op_ret == -1) {
+        stat = nlm4_errno_to_nlm4stat(op_errno);
+        goto err;
+    } else {
+        stat = nlm4_granted;
+        /* this request was decoded as cancargs, so take the lock
+         * description from that member rather than from lockargs */
+        nlm_search_and_delete(cs->fd, &cs->args.nlm4_cancargs.alock);
+    }
+
+err:
+    nlm4_generic_reply(cs->req, cs->args.nlm4_cancargs.cookie, stat);
+    nfs3_call_state_wipe(cs);
+    return 0;
+}
+
+/*
+ * Issue the unlock fop for NLM4 CANCEL once cs->fd is available.
+ *
+ * The lock range comes from the cancel arguments; the type is forced to
+ * F_UNLCK. A reference is held on the call state for the duration of this
+ * function. Returns -EFAULT for a NULL @carg, otherwise the return value
+ * of nfs_lk().
+ */
+int
+nlm4_cancel_fd_resume(void *carg)
+{
+    struct gf_flock flock = {
+        0,
+    };
+    nfs_user_t nfu = {
+        0,
+    };
+    nfs3_call_state_t *cs = NULL;
+    int ret = -EFAULT;
+
+    if (carg == NULL)
+        return ret;
+
+    cs = GF_REF_GET((nfs3_call_state_t *)carg);
+
+    /* identity of the request: credentials plus NLM lock owner */
+    nfs_request_user_init(&nfu, cs->req);
+    nlm_copy_lkowner(&nfu.lk_owner, &cs->args.nlm4_cancargs.alock.oh);
+
+    nlm4_lock_to_gf_flock(&flock, &cs->args.nlm4_cancargs.alock,
+                          cs->args.nlm4_cancargs.exclusive);
+    /* cancelling is done by unlocking the requested range */
+    flock.l_type = F_UNLCK;
+
+    ret = nfs_lk(cs->nfsx, cs->vol, &nfu, cs->fd, F_SETLK, &flock,
+                 nlm4svc_cancel_cbk, cs);
+
+    GF_REF_PUT(cs);
+
+    return ret;
+}
+
+/*
+ * Resume handler for NLM4 CANCEL; nlm4svc_cancel() passes it to
+ * nfs3_fh_resolve_and_resume().
+ *
+ * Looks up the fd keyed by the caller's client entry and continues with
+ * nlm4_cancel_fd_resume(). When ret is negative at the end, an error reply
+ * is sent and the call state is wiped here.
+ *
+ * Returns -EFAULT for a NULL @carg and 0 in every other case, because
+ * cleanup is handled inside this function.
+ */
+int
+nlm4_cancel_resume(void *carg)
+{
+    nlm4_stats stat = nlm4_failed;
+    int ret = -EFAULT;
+    nfs3_call_state_t *cs = NULL;
+    nlm_client_t *nlmclnt = NULL;
+
+    if (!carg)
+        return ret;
+
+    cs = GF_REF_GET((nfs3_call_state_t *)carg);
+    /* presumably jumps to nlm4err when the fh resolution failed -- the macro
+     * is defined outside this chunk, TODO confirm */
+    nlm4_check_fh_resolve_status(cs, stat, nlm4err);
+
+    nlmclnt = nlm_get_uniq(cs->args.nlm4_cancargs.alock.caller_name);
+    if (nlmclnt == NULL) {
+        /* ret is still -EFAULT here; the error path below reports that */
+        gf_msg(GF_NLM, GF_LOG_ERROR, ENOLCK, NFS_MSG_NO_MEMORY,
+               "nlm_get_uniq() returned NULL");
+        goto nlm4err;
+    }
+    cs->fd = fd_lookup_uint64(cs->resolvedloc.inode,
+                              (uint64_t)(uintptr_t)nlmclnt);
+    if (cs->fd == NULL) {
+        gf_msg(GF_NLM, GF_LOG_ERROR, errno, NFS_MSG_FD_LOOKUP_NULL,
+               "fd_lookup_uint64 retrned NULL");
+        goto nlm4err;
+    }
+    ret = nlm4_cancel_fd_resume(cs);
+
+nlm4err:
+    if (ret < 0) {
+        gf_msg(GF_NLM, GF_LOG_WARNING, -ret, NFS_MSG_LOCK_FAIL,
+               "unable to unlock_fd_resume()");
+        stat = nlm4_errno_to_nlm4stat(-ret);
+        nlm4_generic_reply(cs->req, cs->args.nlm4_cancargs.cookie, stat);
+
+        nfs3_call_state_wipe(cs);
+    }
+
+    /* drop the reference taken at the top of this function */
+    GF_REF_PUT(cs);
+
+    /* clean up is taken care of */
+    return 0;
+}
+
+/*
+ * RPC actor for NLM4 CANCEL.
+ *
+ * Decodes the cancel arguments into a fresh call state, maps the file
+ * handle to a volume and hands over to nfs3_fh_resolve_and_resume() with
+ * nlm4_cancel_resume() as continuation. During the grace period the
+ * request is answered with nlm4_denied_grace_period.
+ *
+ * The nlm4_* helper macros take a label argument and presumably jump to it
+ * on failure (they are defined outside this chunk -- TODO confirm).
+ *
+ * Returns RPCSVC_ACTOR_ERROR for a NULL request, 0 when a reply was sent
+ * from here, otherwise ret.
+ */
+int
+nlm4svc_cancel(rpcsvc_request_t *req)
+{
+    xlator_t *vol = NULL;
+    nlm4_stats stat = nlm4_failed;
+    struct nfs_state *nfs = NULL;
+    nfs3_state_t *nfs3 = NULL;
+    nfs3_call_state_t *cs = NULL;
+    int ret = RPCSVC_ACTOR_ERROR;
+    struct nfs3_fh fh = {
+        {0},
+    };
+
+    if (!req)
+        return ret;
+
+    nlm4_validate_nfs3_state(req, nfs3, stat, rpcerr, ret);
+    nfs = nfs_state(nfs3->nfsx);
+    nlm4_handle_call_state_init(nfs->nfs3state, cs, req, stat, rpcerr);
+
+    nlm4_prep_nlm4_cancargs(&cs->args.nlm4_cancargs, &fh, &cs->lkowner,
+                            cs->cookiebytes);
+    if (xdr_to_nlm4_cancelargs(req->msg[0], &cs->args.nlm4_cancargs) <= 0) {
+        gf_msg(GF_NLM, GF_LOG_ERROR, errno, NFS_MSG_ARGS_DECODE_ERROR,
+               "Error decoding args");
+        rpcsvc_request_seterr(req, GARBAGE_ARGS);
+        goto rpcerr;
+    }
+
+    nlm4_validate_gluster_fh(&fh, stat, nlm4err);
+    nlm4_map_fh_to_volume(cs->nfs3state, fh, req, vol, stat, nlm4err);
+
+    if (nlm_grace_period) {
+        gf_msg(GF_NLM, GF_LOG_WARNING, 0, NFS_MSG_NLM_GRACE_PERIOD,
+               "NLM in grace period");
+        stat = nlm4_denied_grace_period;
+        /* the arguments were decoded as cancargs, so read the cookie
+         * through that member */
+        nlm4_generic_reply(req, cs->args.nlm4_cancargs.cookie, stat);
+        nfs3_call_state_wipe(cs);
+        return 0;
+    }
+
+    cs->vol = vol;
+    nlm4_volume_started_check(nfs3, vol, ret, rpcerr);
+
+    ret = nfs3_fh_resolve_and_resume(cs, &fh, NULL, nlm4_cancel_resume);
+
+nlm4err:
+    /* NLM-level failure: the client still gets an NLM reply with stat */
+    if (ret < 0) {
+        gf_msg(GF_NLM, GF_LOG_ERROR, -ret, NFS_MSG_RESOLVE_ERROR,
+               "unable to resolve and resume");
+        nlm4_generic_reply(cs->req, cs->args.nlm4_cancargs.cookie, stat);
+        nfs3_call_state_wipe(cs);
+        return 0;
+    }
+
+rpcerr:
+    /* RPC-level failure: no NLM reply is sent from here, only cleanup */
+    if (ret < 0) {
+        nfs3_call_state_wipe(cs);
+    }
+    return ret;
+}
+
+/*
+ * Callback of the unlock fop issued for NLM4 UNLOCK.
+ *
+ * On success the status is nlm4_granted and, when the returned lock type
+ * is F_UNLCK, the caller's fd entry is dropped (subject to its transit
+ * count); on failure the errno is mapped to an NLM status. The reply is
+ * then sent. Always returns 0.
+ *
+ * NOTE(review): unlike the test/lock/cancel callbacks, this one takes and
+ * puts its own reference and never calls nfs3_call_state_wipe(); confirm
+ * that the original call-state reference is released elsewhere.
+ */
+int
+nlm4svc_unlock_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+                   int32_t op_ret, int32_t op_errno, struct gf_flock *flock,
+                   dict_t *xdata)
+{
+    nlm4_stats stat = nlm4_denied;
+    nfs3_call_state_t *cs = NULL;
+
+    cs = GF_REF_GET((nfs3_call_state_t *)frame->local);
+    if (op_ret == -1) {
+        stat = nlm4_errno_to_nlm4stat(op_errno);
+        goto err;
+    } else {
+        stat = nlm4_granted;
+        if (flock->l_type == F_UNLCK)
+            nlm_search_and_delete(cs->fd, &cs->args.nlm4_unlockargs.alock);
+    }
+
+err:
+    nlm4_generic_reply(cs->req, cs->args.nlm4_unlockargs.cookie, stat);
+    GF_REF_PUT(cs);
+    return 0;
+}
+
+/*
+ * Issue the unlock fop for NLM4 UNLOCK once cs->fd is available.
+ *
+ * The lock range comes from the unlock arguments; the type is forced to
+ * F_UNLCK. A reference is held on the call state for the duration of this
+ * function. Returns -EFAULT for a NULL @carg, otherwise the return value
+ * of nfs_lk().
+ */
+int
+nlm4_unlock_fd_resume(void *carg)
+{
+    struct gf_flock flock = {
+        0,
+    };
+    nfs_user_t nfu = {
+        0,
+    };
+    nfs3_call_state_t *cs = NULL;
+    int ret = -EFAULT;
+
+    if (carg == NULL)
+        return ret;
+
+    cs = GF_REF_GET((nfs3_call_state_t *)carg);
+
+    /* identity of the request: credentials plus NLM lock owner */
+    nfs_request_user_init(&nfu, cs->req);
+    nlm_copy_lkowner(&nfu.lk_owner, &cs->args.nlm4_unlockargs.alock.oh);
+
+    nlm4_lock_to_gf_flock(&flock, &cs->args.nlm4_unlockargs.alock, 0);
+    /* the conversion picks a read lock for excl == 0; override to unlock */
+    flock.l_type = F_UNLCK;
+
+    ret = nfs_lk(cs->nfsx, cs->vol, &nfu, cs->fd, F_SETLK, &flock,
+                 nlm4svc_unlock_cbk, cs);
+
+    GF_REF_PUT(cs);
+
+    return ret;
+}
+
+/*
+ * Resume handler for NLM4 UNLOCK; nlm4svc_unlock() passes it to
+ * nfs3_fh_resolve_and_resume().
+ *
+ * Looks up the fd keyed by the caller's client entry and continues with
+ * nlm4_unlock_fd_resume(). When ret is negative at the end, an error reply
+ * is sent and the call state is wiped here.
+ *
+ * Returns -1 for a NULL @carg and 0 in every other case, because cleanup
+ * is handled inside this function.
+ */
+int
+nlm4_unlock_resume(void *carg)
+{
+    nlm4_stats stat = nlm4_failed;
+    int ret = -1;
+    nfs3_call_state_t *cs = NULL;
+    nlm_client_t *nlmclnt = NULL;
+    char *caller_name = NULL;
+
+    if (!carg)
+        return ret;
+
+    cs = GF_REF_GET((nfs3_call_state_t *)carg);
+    /* presumably jumps to nlm4err when the fh resolution failed -- the macro
+     * is defined outside this chunk, TODO confirm */
+    nlm4_check_fh_resolve_status(cs, stat, nlm4err);
+    caller_name = cs->args.nlm4_unlockargs.alock.caller_name;
+
+    nlmclnt = nlm_get_uniq(caller_name);
+    if (nlmclnt == NULL) {
+        /* NOTE(review): ret is still -1 here, so the error path below
+         * overwrites this nlm4_granted with the mapping of errno 1 --
+         * confirm which status is intended. */
+        stat = nlm4_granted;
+        gf_msg(GF_NLM, GF_LOG_WARNING, ENOLCK, NFS_MSG_NO_MEMORY,
+               "nlm_get_uniq() returned NULL for %s", caller_name);
+        goto nlm4err;
+    }
+    cs->fd = fd_lookup_uint64(cs->resolvedloc.inode,
+                              (uint64_t)(uintptr_t)nlmclnt);
+    if (cs->fd == NULL) {
+        /* NOTE(review): same as above, this stat is overwritten below */
+        stat = nlm4_granted;
+        gf_msg(GF_NLM, GF_LOG_WARNING, 0, NFS_MSG_FD_LOOKUP_NULL,
+               "fd_lookup_uint64() returned NULL");
+        goto nlm4err;
+    }
+    ret = nlm4_unlock_fd_resume(cs);
+
+nlm4err:
+    if (ret < 0) {
+        gf_msg(GF_NLM, GF_LOG_WARNING, -ret, NFS_MSG_LOCK_FAIL,
+               "unable to unlock_fd_resume");
+        stat = nlm4_errno_to_nlm4stat(-ret);
+        nlm4_generic_reply(cs->req, cs->args.nlm4_unlockargs.cookie, stat);
+
+        nfs3_call_state_wipe(cs);
+    }
+
+    /* drop the reference taken at the top of this function */
+    GF_REF_PUT(cs);
+
+    /* we have already taken care of cleanup */
+    return 0;
+}
+
+/*
+ * RPC actor for NLM4 UNLOCK.
+ *
+ * Decodes the unlock arguments into a fresh call state, maps the file
+ * handle to a volume and hands over to nfs3_fh_resolve_and_resume() with
+ * nlm4_unlock_resume() as continuation. During the grace period the
+ * request is answered with nlm4_denied_grace_period.
+ *
+ * The nlm4_* helper macros take a label argument and presumably jump to it
+ * on failure (they are defined outside this chunk -- TODO confirm).
+ *
+ * Returns RPCSVC_ACTOR_ERROR for a NULL request, 0 when a reply was sent
+ * from here, otherwise ret.
+ */
+int
+nlm4svc_unlock(rpcsvc_request_t *req)
+{
+    xlator_t *vol = NULL;
+    nlm4_stats stat = nlm4_failed;
+    struct nfs_state *nfs = NULL;
+    nfs3_state_t *nfs3 = NULL;
+    nfs3_call_state_t *cs = NULL;
+    int ret = RPCSVC_ACTOR_ERROR;
+    struct nfs3_fh fh = {
+        {0},
+    };
+
+    if (!req)
+        return ret;
+
+    nlm4_validate_nfs3_state(req, nfs3, stat, rpcerr, ret);
+    nfs = nfs_state(nfs3->nfsx);
+    nlm4_handle_call_state_init(nfs->nfs3state, cs, req, stat, rpcerr);
+
+    nlm4_prep_nlm4_unlockargs(&cs->args.nlm4_unlockargs, &fh, &cs->lkowner,
+                              cs->cookiebytes);
+    if (xdr_to_nlm4_unlockargs(req->msg[0], &cs->args.nlm4_unlockargs) <= 0) {
+        gf_msg(GF_NLM, GF_LOG_ERROR, errno, NFS_MSG_ARGS_DECODE_ERROR,
+               "Error decoding args");
+        rpcsvc_request_seterr(req, GARBAGE_ARGS);
+        goto rpcerr;
+    }
+
+    nlm4_validate_gluster_fh(&fh, stat, nlm4err);
+    nlm4_map_fh_to_volume(cs->nfs3state, fh, req, vol, stat, nlm4err);
+
+    if (nlm_grace_period) {
+        gf_msg(GF_NLM, GF_LOG_WARNING, 0, NFS_MSG_NLM_GRACE_PERIOD,
+               "NLM in grace period");
+        stat = nlm4_denied_grace_period;
+        nlm4_generic_reply(req, cs->args.nlm4_unlockargs.cookie, stat);
+        nfs3_call_state_wipe(cs);
+        return 0;
+    }
+
+    cs->vol = vol;
+    /* FIXME: check if trans is being used at all for unlock */
+    cs->trans = rpcsvc_request_transport_ref(req);
+    nlm4_volume_started_check(nfs3, vol, ret, rpcerr);
+
+    ret = nfs3_fh_resolve_and_resume(cs, &fh, NULL, nlm4_unlock_resume);
+
+nlm4err:
+    /* NLM-level failure: the client still gets an NLM reply with stat */
+    if (ret < 0) {
+        gf_msg(GF_NLM, GF_LOG_ERROR, -ret, NFS_MSG_RESOLVE_ERROR,
+               "unable to resolve and resume");
+        nlm4_generic_reply(req, cs->args.nlm4_unlockargs.cookie, stat);
+        nfs3_call_state_wipe(cs);
+        return 0;
+    }
+
+rpcerr:
+    /* RPC-level failure: no NLM reply is sent from here, only cleanup */
+    if (ret < 0) {
+        nfs3_call_state_wipe(cs);
+    }
+    return ret;
+}
+
+/*
+ * Send an NLM4 SHARE/UNSHARE result for the request held in cs.
+ *
+ * The reply echoes the cookie from the decoded share arguments, carries the
+ * given stat, and always uses sequence number 0.
+ *
+ * Returns -1 if cs is NULL, 0 otherwise.
+ */
+int
+nlm4_share_reply(nfs3_call_state_t *cs, nlm4_stats stat)
+{
+    nlm4_shareres reply = {{0}, 0, 0};
+
+    if (cs == NULL)
+        return -1;
+
+    reply.sequence = 0;
+    reply.stat = stat;
+    reply.cookie = cs->args.nlm4_shareargs.cookie;
+
+    nlm4svc_submit_reply(cs->req, (void *)&reply,
+                         (nlm4_serializer)xdr_serialize_nlm4_shareres);
+    return 0;
+}
+
+/*
+ * Allocate a zeroed share record with both of its list links initialised.
+ *
+ * Returns the new record, or NULL if the allocation failed.
+ */
+nlm_share_t *
+nlm4_share_new()
+{
+    nlm_share_t *new_share = NULL;
+
+    new_share = GF_CALLOC(1, sizeof(nlm_share_t), gf_nfs_mt_nlm4_share);
+    if (new_share != NULL) {
+        INIT_LIST_HEAD(&new_share->client_list);
+        INIT_LIST_HEAD(&new_share->inode_list);
+    }
+
+    return new_share;
+}
+
+/*
+ * Link a share record into the per-inode share list kept in the nfs
+ * xlator's inode context.
+ *
+ * If share->inode has no context for this xlator yet, a struct
+ * nfs_inode_ctx is allocated, stamped with the current generation from the
+ * xlator's private state, and stored on the inode. The share is then added
+ * to that context's shares list through share->inode_list.
+ *
+ * Returns 0 on success. Returns -1 if the context could not be allocated,
+ * or the non-zero result of inode_ctx_put() if it could not be stored; in
+ * both cases the share is not linked and any context allocated here is
+ * freed.
+ */
+int
+nlm4_add_share_to_inode(nlm_share_t *share)
+{
+    int ret = -1;
+    uint64_t ctx = 0;
+    struct list_head *head = NULL;
+    xlator_t *this = NULL;
+    inode_t *inode = NULL;
+    struct nfs_inode_ctx *ictx = NULL;
+    /* Set only while this function owns a freshly allocated context. */
+    struct nfs_inode_ctx *new_ictx = NULL;
+    struct nfs_state *priv = NULL;
+
+    this = THIS;
+    priv = this->private;
+    inode = share->inode;
+    ret = inode_ctx_get(inode, this, &ctx);
+
+    if (ret == -1) {
+        new_ictx = GF_CALLOC(1, sizeof(struct nfs_inode_ctx),
+                             gf_nfs_mt_inode_ctx);
+        if (!new_ictx) {
+            gf_msg(this->name, GF_LOG_ERROR, ENOMEM, NFS_MSG_NO_MEMORY,
+                   "could not allocate nfs inode ctx");
+            ret = -1;
+            goto out;
+        }
+        ictx = new_ictx;
+        ictx->generation = priv->generation;
+
+        head = &ictx->shares;
+        INIT_LIST_HEAD(head);
+
+        ret = inode_ctx_put(inode, this, (uint64_t)(uintptr_t)ictx);
+        if (ret) {
+            gf_msg(this->name, GF_LOG_ERROR, 0, NFS_MSG_SHARE_LIST_STORE_FAIL,
+                   "could not store share list");
+            goto out;
+        }
+        /* The inode context now owns the allocation. */
+        new_ictx = NULL;
+    } else {
+        ictx = (struct nfs_inode_ctx *)(uintptr_t)ctx;
+        head = &ictx->shares;
+    }
+
+    list_add(&share->inode_list, head);
+
+out:
+    /* Free the allocation itself, not the address of its list member: the
+     * two are the same pointer only if shares is the first field. */
+    if (ret && new_ictx)
+        GF_FREE(new_ictx);
+
+    return ret;
+}
+
+/*
+ * Check whether the share requested in cs conflicts with any share already
+ * recorded on the resolved inode.
+ *
+ * A conflict exists when the requested mode has a bit in common with an
+ * existing share's access, or the requested access has a bit in common with
+ * an existing share's mode.
+ *
+ * Returns 0 when the request may proceed (including when the inode has no
+ * context or no shares), -1 when cs is NULL or a conflict was found.
+ */
+int
+nlm4_approve_share_reservation(nfs3_call_state_t *cs)
+{
+    uint64_t ctx_val = 0;
+    fsh_mode want_mode = 0;
+    fsh_access want_access = 0;
+    nlm_share_t *cur = NULL;
+    struct list_head *shares = NULL;
+    struct nfs_inode_ctx *ictx = NULL;
+
+    if (cs == NULL)
+        return -1;
+
+    /* No inode context means nothing has been shared on this inode yet. */
+    if (inode_ctx_get(cs->resolvedloc.inode, THIS, &ctx_val) != 0)
+        return 0;
+
+    ictx = (struct nfs_inode_ctx *)(uintptr_t)ctx_val;
+    shares = &ictx->shares;
+    if (shares == NULL || list_empty(shares))
+        return 0;
+
+    want_mode = cs->args.nlm4_shareargs.share.mode;
+    want_access = cs->args.nlm4_shareargs.share.access;
+
+    list_for_each_entry(cur, shares, inode_list)
+    {
+        if ((want_mode & cur->access) != 0 || (want_access & cur->mode) != 0)
+            return -1;
+    }
+
+    return 0;
+}
+
+/*
+ * Record a new share reservation for the request in cs.
+ *
+ * Runs entirely under nlm_client_list_lk. Takes a reference on the resolved
+ * inode, looks up the (already registered) NLM client by caller name, checks
+ * the request against existing shares, and on approval allocates a share
+ * record that is linked both to the inode's share list and to the client's
+ * share list.
+ *
+ * Returns 0 on success and non-zero on any failure. On failure, if the inode
+ * reference was obtained, it is dropped again and the share record is freed.
+ */
+int
+nlm4_create_share_reservation(nfs3_call_state_t *cs)
+{
+    int ret = -1;
+    nlm_share_t *share = NULL;
+    nlm_client_t *client = NULL;
+    inode_t *inode = NULL;
+
+    LOCK(&nlm_client_list_lk);
+
+    /* This reference is kept by the share record on success. */
+    inode = inode_ref(cs->resolvedloc.inode);
+    if (!inode) {
+        gf_msg(GF_NLM, GF_LOG_ERROR, 0, NFS_MSG_INODE_NOT_FOUND,
+               "inode not found");
+        goto out;
+    }
+
+    client = __nlm_get_uniq(cs->args.nlm4_shareargs.share.caller_name);
+    if (!client) {
+        /* DO NOT add client. the client is supposed
+           to be here, since nlm4svc_share adds it */
+        gf_msg(GF_NLM, GF_LOG_ERROR, 0, NFS_MSG_CLIENT_NOT_FOUND,
+               "client not found");
+        goto out;
+    }
+
+    /* Non-zero means cs conflicts with an existing share on this inode. */
+    ret = nlm4_approve_share_reservation(cs);
+    if (ret)
+        goto out;
+
+    share = nlm4_share_new();
+    if (!share) {
+        ret = -1;
+        goto out;
+    }
+
+    share->inode = inode;
+    share->mode = cs->args.nlm4_shareargs.share.mode;
+    share->access = cs->args.nlm4_shareargs.share.access;
+    nlm_copy_lkowner(&share->lkowner, &cs->args.nlm4_shareargs.share.oh);
+
+    ret = nlm4_add_share_to_inode(share);
+    if (ret)
+        goto out;
+
+    list_add(&share->client_list, &client->shares);
+
+out:
+    /* share is still NULL on the early failure paths; it is passed to
+     * GF_FREE() regardless. */
+    if (ret && inode) {
+        inode_unref(inode);
+        GF_FREE(share);
+    }
+
+    UNLOCK(&nlm_client_list_lk);
+    return ret;
+}
+
+/*
+ * SHARE and UNSHARE never STACK_WIND: the (non-monitored) share
+ * reservations live in memory, at the nfs xlator level only.
+ *
+ * Resume callback for NLM4 SHARE. Creates the reservation for the resolved
+ * inode, sends the share reply (nlm4_granted when the reservation was
+ * created) and wipes the call state.
+ *
+ * Returns -1 when call_state is NULL, otherwise 0.
+ */
+int
+nlm4_share_resume(void *call_state)
+{
+    nfs3_call_state_t *cs = (nfs3_call_state_t *)call_state;
+    nlm4_stats stat = nlm4_failed;
+
+    if (cs == NULL)
+        return -1;
+
+    nlm4_check_fh_resolve_status(cs, stat, out);
+
+    if (nlm4_create_share_reservation(cs) == 0)
+        stat = nlm4_granted;
+
+out:
+    nlm4_share_reply(cs, stat);
+    nfs3_call_state_wipe(cs);
+    return 0;
+}
+
+/*
+ * RPC actor for NLM4 SHARE (listed in nlm4svc_actors).
+ *
+ * Decodes the share arguments into a fresh call state, validates the file
+ * handle, maps it to a volume, registers the caller as an NLM client and
+ * starts file-handle resolution with nlm4_share_resume() as continuation.
+ *
+ * Returns RPCSVC_ACTOR_ERROR when req is NULL. Returns 0 when a share reply
+ * was sent from here (grace period without reclaim, or a negative ret at
+ * nlm4err). Otherwise returns ret.
+ */
+int
+nlm4svc_share(rpcsvc_request_t *req)
+{
+    nlm4_stats stat = nlm4_failed;
+    xlator_t *vol = NULL;
+    nfs3_state_t *nfs3 = NULL;
+    nfs3_call_state_t *cs = NULL;
+    struct nfs_state *nfs = NULL;
+    struct nfs3_fh fh = {
+        {0},
+    };
+    int ret = RPCSVC_ACTOR_ERROR;
+
+    if (!req)
+        return ret;
+
+    /* The nlm4_* helper macros are defined elsewhere; each receives a label
+     * and presumably jumps to it on failure. */
+    nlm4_validate_nfs3_state(req, nfs3, stat, rpcerr, ret);
+    nfs = nfs_state(nfs3->nfsx);
+    nlm4_handle_call_state_init(nfs->nfs3state, cs, req, stat, rpcerr);
+
+    /* The decoded file handle lands in cs->lockfh. */
+    nlm4_prep_shareargs(&cs->args.nlm4_shareargs, &cs->lockfh, &cs->lkowner,
+                        cs->cookiebytes);
+
+    if (xdr_to_nlm4_shareargs(req->msg[0], &cs->args.nlm4_shareargs) <= 0) {
+        gf_msg(GF_NLM, GF_LOG_ERROR, errno, NFS_MSG_ARGS_DECODE_ERROR,
+               "Error decoding SHARE args");
+        rpcsvc_request_seterr(req, GARBAGE_ARGS);
+        goto rpcerr;
+    }
+
+    fh = cs->lockfh;
+    nlm4_validate_gluster_fh(&fh, stat, nlm4err);
+    nlm4_map_fh_to_volume(cs->nfs3state, fh, req, vol, stat, nlm4err);
+
+    /* During the grace period only requests flagged as reclaim proceed. */
+    if (nlm_grace_period && !cs->args.nlm4_shareargs.reclaim) {
+        gf_msg_debug(GF_NLM, 0, "NLM in grace period");
+        stat = nlm4_denied_grace_period;
+        nlm4_share_reply(cs, stat);
+        nfs3_call_state_wipe(cs);
+        return 0;
+    }
+
+    cs->vol = vol;
+    cs->trans = rpcsvc_request_transport_ref(req);
+    nlm4_volume_started_check(nfs3, vol, ret, rpcerr);
+
+    /* NOTE(review): this result is overwritten by the next assignment
+     * without being checked. A failed registration only shows up later as
+     * "client not found" in nlm4_create_share_reservation(). */
+    ret = nlm_add_nlmclnt(cs->args.nlm4_shareargs.share.caller_name);
+
+    ret = nfs3_fh_resolve_and_resume(cs, &fh, NULL, nlm4_share_resume);
+
+nlm4err:
+    /* NLM-level failure: reply with stat and report success to RPC. */
+    if (ret < 0) {
+        gf_msg(GF_NLM, GF_LOG_ERROR, -ret, NFS_MSG_SHARE_CALL_FAIL,
+               "SHARE call failed");
+        nlm4_share_reply(cs, stat);
+        nfs3_call_state_wipe(cs);
+        return 0;
+    }
+
+rpcerr:
+    /* RPC-level failure: no NLM reply, just release the call state. */
+    if (ret < 0)
+        nfs3_call_state_wipe(cs);
+
+    return ret;
+}
+
+/*
+ * Remove the share reservation described by the request in cs.
+ *
+ * Runs under nlm_client_list_lk. Looks up the NLM client by caller name and
+ * the share list in the resolved inode's context, then removes the first
+ * share whose mode, access and lock owner all match the request: the share
+ * is unlinked from both lists, its inode reference dropped, and it is freed.
+ *
+ * Returns -1 if the client is unknown, the inode is NULL, or the inode's
+ * share list is empty; the non-zero result of inode_ctx_get() if the inode
+ * has no context; and 0 otherwise, whether or not a matching share was
+ * actually found.
+ */
+int
+nlm4_remove_share_reservation(nfs3_call_state_t *cs)
+{
+    int ret = -1;
+    uint64_t ctx = 0;
+    fsh_mode req_mode = 0;
+    fsh_access req_access = 0;
+    nlm_share_t *share = NULL;
+    nlm_share_t *tmp = NULL;
+    nlm_client_t *client = NULL;
+    char *caller = NULL;
+    inode_t *inode = NULL;
+    xlator_t *this = NULL;
+    struct list_head *head = NULL;
+    nlm4_shareargs *args = NULL;
+    struct nfs_inode_ctx *ictx = NULL;
+    char gfid_str[64] = {0};
+
+    LOCK(&nlm_client_list_lk);
+
+    args = &cs->args.nlm4_shareargs;
+    caller = args->share.caller_name;
+
+    client = __nlm_get_uniq(caller);
+    if (!client) {
+        gf_msg(GF_NLM, GF_LOG_ERROR, 0, NFS_MSG_CLIENT_NOT_FOUND,
+               "client not found: %s", caller);
+        goto out;
+    }
+
+    inode = cs->resolvedloc.inode;
+    if (!inode) {
+        gf_msg(GF_NLM, GF_LOG_ERROR, 0, NFS_MSG_INODE_NOT_FOUND,
+               "inode not found: client: %s", caller);
+        goto out;
+    }
+
+    this = THIS;
+    ret = inode_ctx_get(inode, this, &ctx);
+    if (ret) {
+        /* inode->gfid is a binary uuid, not a NUL-terminated string, so it
+         * must be converted to text before it is handed to "%s". */
+        uuid_utoa_r(inode->gfid, gfid_str);
+        gf_msg(GF_NLM, GF_LOG_ERROR, 0, NFS_MSG_INODE_SHARES_NOT_FOUND,
+               "no shares found for inode:"
+               "gfid: %s; client: %s",
+               gfid_str, caller);
+        goto out;
+    }
+    ictx = (struct nfs_inode_ctx *)(uintptr_t)ctx;
+
+    head = &ictx->shares;
+    if (list_empty(head)) {
+        ret = -1;
+        goto out;
+    }
+
+    req_mode = args->share.mode;
+    req_access = args->share.access;
+
+    /* The _safe iterator is used because the matching entry is unlinked and
+     * freed inside the loop. */
+    list_for_each_entry_safe(share, tmp, head, inode_list)
+    {
+        if ((req_mode == share->mode) && (req_access == share->access) &&
+            nlm_is_oh_same_lkowner(&share->lkowner, &args->share.oh)) {
+            list_del(&share->client_list);
+            list_del(&share->inode_list);
+            inode_unref(share->inode);
+            GF_FREE(share);
+            break;
+        }
+    }
+
+    /* Removing a share that does not exist is not treated as an error. */
+    ret = 0;
+out:
+    UNLOCK(&nlm_client_list_lk);
+    return ret;
+}
+
+/*
+ * Resume callback for NLM4 UNSHARE. Removes the matching reservation from
+ * the resolved inode, sends the share reply (nlm4_granted when removal
+ * returned 0) and wipes the call state.
+ *
+ * Returns -1 when call_state is NULL, otherwise 0.
+ */
+int
+nlm4_unshare_resume(void *call_state)
+{
+    nfs3_call_state_t *cs = (nfs3_call_state_t *)call_state;
+    nlm4_stats stat = nlm4_failed;
+
+    if (cs == NULL)
+        return -1;
+
+    nlm4_check_fh_resolve_status(cs, stat, out);
+
+    if (nlm4_remove_share_reservation(cs) == 0)
+        stat = nlm4_granted;
+
+out:
+    nlm4_share_reply(cs, stat);
+    nfs3_call_state_wipe(cs);
+    return 0;
+}
+
+/*
+ * RPC actor for NLM4 UNSHARE (listed in nlm4svc_actors).
+ *
+ * Decodes the share arguments into a fresh call state, validates the file
+ * handle, maps it to a volume and starts file-handle resolution with
+ * nlm4_unshare_resume() as continuation.
+ *
+ * Returns RPCSVC_ACTOR_ERROR when req is NULL. Returns 0 when a share reply
+ * was sent from here (grace period without reclaim, or a negative ret at
+ * nlm4err). Otherwise returns ret.
+ */
+int
+nlm4svc_unshare(rpcsvc_request_t *req)
+{
+    nlm4_stats stat = nlm4_failed;
+    xlator_t *vol = NULL;
+    nfs3_state_t *nfs3 = NULL;
+    nfs3_call_state_t *cs = NULL;
+    struct nfs_state *nfs = NULL;
+    struct nfs3_fh fh = {
+        {0},
+    };
+    int ret = RPCSVC_ACTOR_ERROR;
+
+    if (!req)
+        return ret;
+
+    /* The nlm4_* helper macros are defined elsewhere; each receives a label
+     * and presumably jumps to it on failure. */
+    nlm4_validate_nfs3_state(req, nfs3, stat, rpcerr, ret);
+    nfs = nfs_state(nfs3->nfsx);
+    nlm4_handle_call_state_init(nfs->nfs3state, cs, req, stat, rpcerr);
+
+    /* The decoded file handle lands in cs->lockfh. */
+    nlm4_prep_shareargs(&cs->args.nlm4_shareargs, &cs->lockfh, &cs->lkowner,
+                        cs->cookiebytes);
+
+    if (xdr_to_nlm4_shareargs(req->msg[0], &cs->args.nlm4_shareargs) <= 0) {
+        gf_msg(GF_NLM, GF_LOG_ERROR, errno, NFS_MSG_ARGS_DECODE_ERROR,
+               "Error decoding UNSHARE args");
+        rpcsvc_request_seterr(req, GARBAGE_ARGS);
+        goto rpcerr;
+    }
+
+    fh = cs->lockfh;
+    nlm4_validate_gluster_fh(&fh, stat, nlm4err);
+    nlm4_map_fh_to_volume(cs->nfs3state, fh, req, vol, stat, nlm4err);
+
+    /* During the grace period only requests flagged as reclaim proceed. */
+    if (nlm_grace_period && !cs->args.nlm4_shareargs.reclaim) {
+        gf_msg_debug(GF_NLM, 0, "NLM in grace period");
+        stat = nlm4_denied_grace_period;
+        nlm4_share_reply(cs, stat);
+        nfs3_call_state_wipe(cs);
+        return 0;
+    }
+
+    cs->vol = vol;
+    cs->trans = rpcsvc_request_transport_ref(req);
+    nlm4_volume_started_check(nfs3, vol, ret, rpcerr);
+
+    ret = nfs3_fh_resolve_and_resume(cs, &fh, NULL, nlm4_unshare_resume);
+
+nlm4err:
+    /* NLM-level failure: reply with stat and report success to RPC. The
+     * call state must be released here as well, as the SHARE and UNLOCK
+     * actors do; otherwise it is leaked on this path. */
+    if (ret < 0) {
+        gf_msg(GF_NLM, GF_LOG_ERROR, -ret, NFS_MSG_UNSHARE_CALL_FAIL,
+               "UNSHARE call failed");
+        nlm4_share_reply(cs, stat);
+        nfs3_call_state_wipe(cs);
+        return 0;
+    }
+
+rpcerr:
+    /* RPC-level failure: no NLM reply, just release the call state. */
+    if (ret < 0)
+        nfs3_call_state_wipe(cs);
+
+    return ret;
+}
+
+/*
+ * Drop every share reservation held by the named client.
+ *
+ * Under nlm_client_list_lk, each share on the client's list is unlinked from
+ * both the inode list and the client list, its inode reference is released,
+ * and the record is freed. An unknown client is only logged at debug level.
+ *
+ * Always returns 0.
+ */
+int
+nlm4_free_all_shares(char *caller_name)
+{
+    nlm_share_t *cur = NULL;
+    nlm_share_t *next = NULL;
+    nlm_client_t *owner = NULL;
+
+    LOCK(&nlm_client_list_lk);
+
+    owner = __nlm_get_uniq(caller_name);
+    if (owner != NULL) {
+        list_for_each_entry_safe(cur, next, &owner->shares, client_list)
+        {
+            list_del(&cur->inode_list);
+            list_del(&cur->client_list);
+            inode_unref(cur->inode);
+            GF_FREE(cur);
+        }
+    } else {
+        gf_msg_debug(GF_NLM, 0, "client not found: %s", caller_name);
+    }
+
+    UNLOCK(&nlm_client_list_lk);
+    return 0;
+}
+
+/*
+ * RPC actor for NLM4 FREE_ALL (listed in nlm4svc_actors).
+ *
+ * Decodes the free-all arguments, then releases all share reservations of
+ * the named client and runs nlm_cleanup_fds() for it. No NLM reply is sent
+ * from this function. The call state is wiped on every path that reaches
+ * the err label, success included.
+ *
+ * Returns 0 on success, otherwise the non-zero ret left by the failing step.
+ */
+int
+nlm4svc_free_all(rpcsvc_request_t *req)
+{
+    int ret = RPCSVC_ACTOR_ERROR;
+    nlm4_stats stat = nlm4_failed;
+    nfs3_state_t *nfs3 = NULL;
+    nfs3_call_state_t *cs = NULL;
+    struct nfs_state *nfs = NULL;
+
+    /* NOTE(review): unlike the other actors, req is not checked for NULL
+     * before it is handed to the macros below. */
+    nlm4_validate_nfs3_state(req, nfs3, stat, err, ret);
+    nfs = nfs_state(nfs3->nfsx);
+    nlm4_handle_call_state_init(nfs->nfs3state, cs, req, stat, err);
+
+    nlm4_prep_freeallargs(&cs->args.nlm4_freeallargs, &cs->lkowner);
+
+    if (xdr_to_nlm4_freeallargs(req->msg[0], &cs->args.nlm4_freeallargs) <= 0) {
+        gf_msg(GF_NLM, GF_LOG_ERROR, errno, NFS_MSG_ARGS_DECODE_ERROR,
+               "Error decoding FREE_ALL args");
+        rpcsvc_request_seterr(req, GARBAGE_ARGS);
+        goto err;
+    }
+
+    ret = nlm4_free_all_shares(cs->args.nlm4_freeallargs.name);
+    if (ret)
+        goto err;
+
+    ret = nlm_cleanup_fds(cs->args.nlm4_freeallargs.name);
+    if (ret)
+        goto err;
+
+err:
+    /* Reached on success as well; cs may still be NULL if one of the first
+     * two macros bailed out. */
+    nfs3_call_state_wipe(cs);
+    if (ret)
+        gf_msg_debug(GF_NLM, 0, "error in free all; stat: %d", stat);
+    return ret;
+}
+
+/*
+ * Handle a status-monitor notification: log the monitored host's name and
+ * reported state, then run nlm_cleanup_fds() for that host.
+ */
+void
+nlm4svc_sm_notify(struct nlm_sm_status *status)
+{
+    char *host = status->mon_name;
+
+    gf_msg(GF_NLM, GF_LOG_INFO, 0, NFS_MSG_SM_NOTIFY,
+           "sm_notify: %s, state: %d", host, status->state);
+    nlm_cleanup_fds(host);
+}
+
+/* RPC_CLNT_CONNECT gets called on (re)connects and should be able to handle
+ * different NLM requests.
+ *
+ * rpc_clnt: the client connection that was just (re)established.
+ * ncf:      notify arguments; ncf->cs is the call state of the NLM request
+ *           that is waiting on this connection.
+ *
+ * Determines which NLM4 procedure the pending call state belongs to and
+ * continues it: LOCK stores the rpc client for the caller and sends the
+ * GRANTED message, CANCEL and UNLOCK re-run their actors on the original
+ * request.
+ *
+ * Returns -1 for a spurious notify, an unrecognised procedure, or a failed
+ * nlm_set_rpc_clnt(); otherwise the result of the step that was run. */
+static int
+nlm_handle_connect(struct rpc_clnt *rpc_clnt, struct nlm4_notify_args *ncf)
+{
+    int ret = -1;
+    int nlm_proc = NLM4_NULL;
+    nfs3_call_state_t *cs = NULL;
+    struct nlm4_lock *alock = NULL;
+    char *caller_name = NULL;
+
+    /* Hold a reference on the call state while it is used here; released at
+     * the out label. */
+    cs = GF_REF_GET(ncf->cs);
+    if (!cs || !cs->req) {
+        gf_msg(GF_NLM, GF_LOG_ERROR, EINVAL, NFS_MSG_RPC_CLNT_ERROR,
+               "Spurious notify?!");
+        goto out;
+    }
+
+    /* NLM4_* actions from nlm4.h */
+    if (cs->req->prognum == NLM_PROGRAM) {
+        nlm_proc = cs->req->procnum;
+    } else {
+        /* hmm, cs->req has not been filled completely */
+        /* Fall back to inferring the procedure from the resume function. */
+        if (cs->resume_fn == nlm4_lock_fd_resume)
+            nlm_proc = NLM4_LOCK;
+        else if (cs->resume_fn == nlm4_cancel_fd_resume)
+            nlm_proc = NLM4_CANCEL;
+        else if (cs->resume_fn == nlm4_unlock_fd_resume)
+            nlm_proc = NLM4_UNLOCK;
+        else {
+            /* nlm_proc still holds its initial NLM4_NULL value here, so
+             * that is what gets logged. */
+            gf_msg(GF_NLM, GF_LOG_ERROR, 0, NFS_MSG_RPC_CLNT_ERROR,
+                   "(re)connect with an "
+                   "unexpected NLM4 procedure (%d)",
+                   nlm_proc);
+            goto out;
+        }
+    }
+
+    switch (nlm_proc) {
+        case NLM4_LOCK:
+            alock = &cs->args.nlm4_lockargs.alock;
+            caller_name = alock->caller_name;
+
+            ret = nlm_set_rpc_clnt(rpc_clnt, caller_name);
+            if (ret == -1) {
+                gf_msg(GF_NLM, GF_LOG_ERROR, 0, NFS_MSG_RPC_CLNT_ERROR,
+                       "Failed to set "
+                       "rpc clnt");
+                goto out;
+            }
+
+            /* extra ref taken with nlm_set_rpc_clnt() */
+            rpc_clnt_unref(rpc_clnt);
+
+            nlm4svc_send_granted(ncf);
+            break;
+
+        case NLM4_CANCEL:
+            /* alock = &cs->args.nlm4_cancargs.alock; */
+            ret = nlm4svc_cancel(cs->req);
+            break;
+
+        case NLM4_UNLOCK:
+            /* alock = &cs->args.nlm4_unlockargs.alock; */
+            ret = nlm4svc_unlock(cs->req);
+            break;
+
+        default:
+            /* ret keeps its initial -1 on this path. */
+            gf_msg(GF_NLM, GF_LOG_ERROR, 0, NFS_MSG_RPC_CLNT_ERROR,
+                   "(re)connect with an unexpected NLM4 procedure "
+                   "(%d)",
+                   nlm_proc);
+    }
+
+out:
+    if (cs)
+        GF_REF_PUT(cs);
+
+    return ret;
+}
+
+/*
+ * Actor table for the NLM4 program, one entry per procedure number from
+ * NLM4_NULL (0) up to NLM4_FREE_ALL (23). Each entry starts with the
+ * procedure name, followed by the actor function (NULL for procedures this
+ * server has no actor for), the procedure number and a DRC_* classification.
+ * The meaning of the remaining fields is defined by rpcsvc_actor_t, which is
+ * declared elsewhere.
+ *
+ * Note that the *_MSG and *_RES entries reuse the short names of the basic
+ * procedures ("TEST", "LOCK", ...).
+ */
+static rpcsvc_actor_t nlm4svc_actors[NLM4_PROC_COUNT] = {
+    /* 0 */
+    {"NULL", nlm4svc_null, NULL, NLM4_NULL, DRC_IDEMPOTENT, 0},
+    {"TEST", nlm4svc_test, NULL, NLM4_TEST, DRC_IDEMPOTENT, 0},
+    {"LOCK", nlm4svc_lock, NULL, NLM4_LOCK, DRC_IDEMPOTENT, 0},
+    {"CANCEL", nlm4svc_cancel, NULL, NLM4_CANCEL, DRC_NON_IDEMPOTENT, 0},
+    {"UNLOCK", nlm4svc_unlock, NULL, NLM4_UNLOCK, DRC_NON_IDEMPOTENT, 0},
+    /* 5 */
+    {"GRANTED", NULL, NULL, NLM4_GRANTED, DRC_NA, 0},
+    {"TEST", NULL, NULL, NLM4_TEST_MSG, DRC_NA, 0},
+    {"LOCK", NULL, NULL, NLM4_LOCK_MSG, DRC_NA, 0},
+    {"CANCEL", NULL, NULL, NLM4_CANCEL_MSG, DRC_NA, 0},
+    {"UNLOCK", NULL, NULL, NLM4_UNLOCK_MSG, DRC_NA, 0},
+    /* 10 */
+    {"GRANTED", NULL, NULL, NLM4_GRANTED_MSG, DRC_NA, 0},
+    {"TEST", NULL, NULL, NLM4_TEST_RES, DRC_NA, 0},
+    {"LOCK", NULL, NULL, NLM4_LOCK_RES, DRC_NA, 0},
+    {"CANCEL", NULL, NULL, NLM4_CANCEL_RES, DRC_NA, 0},
+    {"UNLOCK", NULL, NULL, NLM4_UNLOCK_RES, DRC_NA, 0},
+    /* 15 ; procedures 17,18,19 are not defined by nlm */
+    {"GRANTED", NULL, NULL, NLM4_GRANTED_RES, DRC_NA, 0},
+    {"SM_NOTIFY", NULL, NULL, NLM4_SM_NOTIFY, DRC_NA, 0},
+    {"SEVENTEEN", NULL, NULL, NLM4_SEVENTEEN, DRC_NA, 0},
+    {"EIGHTEEN", NULL, NULL, NLM4_EIGHTEEN, DRC_NA, 0},
+    {"NINETEEN", NULL, NULL, NLM4_NINETEEN, DRC_NA, 0},
+    /* 20 */
+    {"SHARE", nlm4svc_share, NULL, NLM4_SHARE, DRC_NON_IDEMPOTENT, 0},
+    {"UNSHARE", nlm4svc_unshare, NULL, NLM4_UNSHARE, DRC_NON_IDEMPOTENT, 0},
+    {"NM_LOCK", nlm4svc_nm_lock, NULL, NLM4_NM_LOCK, DRC_NON_IDEMPOTENT, 0},
+    {"FREE_ALL", nlm4svc_free_all, NULL, NLM4_FREE_ALL, DRC_IDEMPOTENT, 0},
+};
+
+/*
+ * RPC program descriptor for NLM version 4. It ties the program number,
+ * version and port to the actor table above. nlm4svc_init() fills in the
+ * .private field and returns a pointer to this object.
+ */
+static rpcsvc_program_t nlm4prog = {
+    .progname = "NLM4",
+    .prognum = NLM_PROGRAM,
+    .progver = NLM_V4,
+    .progport = GF_NLM4_PORT,
+    .actors = nlm4svc_actors,
+    .numactors = NLM4_PROC_COUNT,
+    .min_auth = AUTH_NULL,
+};
+
+/*
+ * Placeholder for NLM4 state initialisation; currently does nothing with
+ * nfsx and always returns 0.
+ */
+int
+nlm4_init_state(xlator_t *nfsx)
+{
+    return 0;
+}
+
+extern void *
+nsm_thread(void *argv);
+
+/*
+ * Timer callback scheduled by nlm4svc_init(): clears the global
+ * nlm_grace_period flag so that the actors stop answering with
+ * nlm4_denied_grace_period. The arg parameter is unused.
+ */
+void
+nlm_grace_period_over(void *arg)
+{
+    nlm_grace_period = 0;
+}
+
+/*
+ * Initialise the NLM4 service for the given nfs xlator.
+ *
+ * Creates the NLM listener on GF_NLM4_PORT, initialises the global client
+ * list and its lock, restarts rpc.statd (removing the sm-notify and statd
+ * pid files so that a fresh statd notifies clients), starts the NSM
+ * callback thread and arms the grace-period timer.
+ *
+ * Returns a pointer to the NLM4 program descriptor on success, or when NLM4
+ * has already been initialised; returns NULL on failure.
+ */
+rpcsvc_program_t *
+nlm4svc_init(xlator_t *nfsx)
+{
+    struct nfs3_state *ns = NULL;
+    struct nfs_state *nfs = NULL;
+    dict_t *options = NULL;
+    int ret = -1;
+    char *portstr = NULL;
+    char *pid_file = NULL;
+    pthread_t thr;
+    struct timespec timeout = {
+        0,
+    };
+    FILE *pidfile = NULL;
+    pid_t pid = -1;
+
+    /* Already inited */
+    if (nlm4_inited)
+        return &nlm4prog;
+
+    nfs = (struct nfs_state *)nfsx->private;
+
+    ns = nfs->nfs3state;
+    if (!ns) {
+        gf_msg(GF_NLM, GF_LOG_ERROR, EINVAL, NFS_MSG_NLM_INIT_FAIL,
+               "NLM4 init failed");
+        goto err;
+    }
+    nlm4prog.private = ns;
+
+    options = dict_new();
+    if (options == NULL) {
+        gf_msg(GF_NLM, GF_LOG_ERROR, ENOMEM, NFS_MSG_GFID_DICT_CREATE_FAIL,
+               "dict allocation failed");
+        goto err;
+    }
+
+    ret = gf_asprintf(&portstr, "%d", GF_NLM4_PORT);
+    if (ret == -1)
+        goto err;
+
+    ret = dict_set_dynstr(options, "transport.socket.listen-port", portstr);
+    if (ret == -1)
+        goto err;
+    ret = dict_set_str(options, "transport-type", "socket");
+    if (ret == -1) {
+        gf_msg(GF_NLM, GF_LOG_ERROR, errno, NFS_MSG_DICT_SET_FAILED,
+               "dict_set_str error");
+        goto err;
+    }
+
+    if (nfs->allow_insecure) {
+        ret = dict_set_str(options, "rpc-auth-allow-insecure", "on");
+        if (ret == -1) {
+            gf_msg(GF_NLM, GF_LOG_ERROR, errno, NFS_MSG_DICT_SET_FAILED,
+                   "dict_set_str error");
+            goto err;
+        }
+        ret = dict_set_str(options, "rpc-auth.ports.insecure", "on");
+        if (ret == -1) {
+            gf_msg(GF_NLM, GF_LOG_ERROR, errno, NFS_MSG_DICT_SET_FAILED,
+                   "dict_set_str error");
+            goto err;
+        }
+    }
+
+    ret = dict_set_str(options, "transport.address-family", "inet");
+    if (ret == -1) {
+        gf_msg(GF_NLM, GF_LOG_ERROR, errno, NFS_MSG_DICT_SET_FAILED,
+               "dict_set_str error");
+        goto err;
+    }
+
+    ret = rpcsvc_create_listeners(nfs->rpcsvc, options, "NLM");
+    if (ret == -1) {
+        gf_msg(GF_NLM, GF_LOG_ERROR, errno, NFS_MSG_LISTENERS_CREATE_FAIL,
+               "Unable to create listeners");
+        goto err;
+    }
+    INIT_LIST_HEAD(&nlm_client_list);
+    LOCK_INIT(&nlm_client_list_lk);
+
+    /* unlink sm-notify.pid so that when we restart rpc.statd/sm-notify
+     * it thinks that the machine has restarted and sends NOTIFY to clients.
+     */
+
+    /* TODO:
+       notify/rpc.statd is done differently on OSX
+
+       On OSX rpc.statd is controlled by rpc.lockd and are part for launchd
+       (unified service management framework)
+
+       A runcmd() should be invoking "launchctl start com.apple.lockd"
+       instead. This is still a theory but we need to thoroughly test it
+       out. Until then NLM support is non-existent on OSX.
+    */
+    ret = sys_unlink(GF_SM_NOTIFY_PIDFILE);
+    if (ret == -1 && errno != ENOENT) {
+        gf_msg(GF_NLM, GF_LOG_ERROR, errno, NFS_MSG_UNLINK_ERROR,
+               "unable to unlink %s: %d", GF_SM_NOTIFY_PIDFILE, errno);
+        goto err;
+    }
+    /* temporary work around to restart statd, not distro/OS independent.
+     * Need to figure out a more graceful way
+     * killall will cause problems on solaris.
+     */
+
+    /* A pid file configured on the xlator takes precedence over the
+     * compiled-in default. The same path is used for reading, logging and
+     * unlinking below. */
+    pid_file = GF_RPC_STATD_PIDFILE;
+    if (nfs->rpc_statd_pid_file)
+        pid_file = nfs->rpc_statd_pid_file;
+    pidfile = fopen(pid_file, "r");
+    if (pidfile) {
+        ret = fscanf(pidfile, "%d", &pid);
+        /* Only signal a strictly positive pid: kill() treats 0 and negative
+         * values as process-group or broadcast targets, so a corrupt pid
+         * file must never reach it. Fall back to killall instead. */
+        if (ret <= 0 || pid <= 0) {
+            gf_msg(GF_NLM, GF_LOG_WARNING, (ret <= 0) ? errno : EINVAL,
+                   NFS_MSG_GET_PID_FAIL,
+                   "unable to get pid of "
+                   "rpc.statd from %s ",
+                   pid_file);
+            ret = runcmd(KILLALL_CMD, "-9", "rpc.statd", NULL);
+        } else
+            kill(pid, SIGKILL);
+
+        fclose(pidfile);
+    } else {
+        gf_msg(GF_NLM, GF_LOG_WARNING, errno, NFS_MSG_OPEN_FAIL,
+               "opening %s of rpc.statd failed (%s)", pid_file,
+               strerror(errno));
+        /* if ret == -1, do nothing - case either statd was not
+         * running or was running in valgrind mode
+         */
+        ret = runcmd(KILLALL_CMD, "-9", "rpc.statd", NULL);
+    }
+
+    /* Remove the pid file that was actually consulted above, which is also
+     * the one named in the error message. */
+    ret = sys_unlink(pid_file);
+    if (ret == -1 && errno != ENOENT) {
+        gf_msg(GF_NLM, GF_LOG_ERROR, errno, NFS_MSG_UNLINK_ERROR,
+               "unable to unlink %s", pid_file);
+        goto err;
+    }
+
+    ret = runcmd(nfs->rpc_statd, NULL);
+    if (ret == -1) {
+        gf_msg(GF_NLM, GF_LOG_ERROR, errno, NFS_MSG_START_ERROR,
+               "unable to start %s", nfs->rpc_statd);
+        goto err;
+    }
+
+    /* The NSM callback thread is started best-effort; its creation result
+     * is deliberately ignored. */
+    (void)gf_thread_create(&thr, NULL, nsm_thread, nfsx, "nfsnsm");
+
+    /* nlm_grace_period doubles as the grace-period length in seconds; the
+     * timer callback resets it to 0 when the period is over. */
+    timeout.tv_sec = nlm_grace_period;
+    timeout.tv_nsec = 0;
+
+    gf_timer_call_after(nfsx->ctx, timeout, nlm_grace_period_over, NULL);
+    nlm4_inited = _gf_true;
+
+    if (options)
+        dict_unref(options);
+    return &nlm4prog;
+err:
+    if (options)
+        dict_unref(options);
+    return NULL;
+}
+
+/*
+ * Statedump hook: write the NLM client list into the process dump under the
+ * "nfs.nlm" section.
+ *
+ * For every known client the caller name is dumped, followed by the gfid of
+ * each inode on the client's fd list and the number of such files. Finally
+ * the total client count is written.
+ *
+ * Returns 0 on success. Returns -1, after writing an explanatory entry, if
+ * NLM has not been initialised or the client-list lock could not be taken
+ * without blocking.
+ */
+int32_t
+nlm_priv(xlator_t *this)
+{
+    int32_t ret = -1;
+    uint32_t client_count = 0;
+    uint64_t file_count = 0;
+    nlm_client_t *client = NULL;
+    nlm_fde_t *fde = NULL;
+    char key[GF_DUMP_MAX_BUF_LEN] = {0};
+    char gfid_str[64] = {0};
+
+    gf_proc_dump_add_section("nfs.nlm");
+
+    /* TRY_LOCK rather than LOCK: the dump is skipped instead of waiting when
+     * the lock is currently held. */
+    if ((nlm4_inited == _gf_false) || TRY_LOCK(&nlm_client_list_lk))
+        goto out;
+
+    list_for_each_entry(client, &nlm_client_list, nlm_clients)
+    {
+        gf_proc_dump_build_key(key, "client", "%d.hostname", client_count);
+        gf_proc_dump_write(key, "%s\n", client->caller_name);
+
+        file_count = 0;
+        list_for_each_entry(fde, &client->fdes, fde_list)
+        {
+            gf_proc_dump_build_key(key, "file", "%" PRIu64 ".gfid", file_count);
+            /* gfid is a binary uuid; convert it to text before dumping. */
+            memset(gfid_str, 0, 64);
+            uuid_utoa_r(fde->fd->inode->gfid, gfid_str);
+            gf_proc_dump_write(key, "%s", gfid_str);
+            file_count++;
+        }
+
+        /* NOTE(review): this key carries no client index, so every client
+         * writes its count under the same key name. */
+        gf_proc_dump_build_key(key, "client", "files-locked");
+        gf_proc_dump_write(key, "%" PRIu64 "\n", file_count);
+        client_count++;
+    }
+
+    gf_proc_dump_build_key(key, "nlm", "client-count");
+    gf_proc_dump_write(key, "%d", client_count);
+    ret = 0;
+    UNLOCK(&nlm_client_list_lk);
+
+out:
+    if (ret) {
+        gf_proc_dump_build_key(key, "nlm", "statedump_error");
+        gf_proc_dump_write(key,
+                           "Unable to dump nlm state because "
+                           "nlm is not initialised or nlm_client_list_lk "
+                           "lock couldn't be acquired");
+    }
+
+    return ret;
+}
diff --git a/xlators/nfs/server/src/nlm4.h b/xlators/nfs/server/src/nlm4.h
new file mode 100644
index 00000000000..a22032cac69
--- /dev/null
+++ b/xlators/nfs/server/src/nlm4.h
@@ -0,0 +1,111 @@
+/*
+  Copyright (c) 2012 Gluster, Inc. <http://www.gluster.com>
+  This file is part of GlusterFS.
+
+  This file is licensed to you under your choice of the GNU Lesser
+  General Public License, version 3 or any later version (LGPLv3 or
+  later), or the GNU General Public License, version 2 (GPLv2), in all
+  cases as published by the Free Software Foundation.
+*/
+
+#ifndef _NLM4_H_
+#define _NLM4_H_
+
+#include <sys/types.h>
+#include <signal.h>
+#include "rpcsvc.h"
+#include <glusterfs/dict.h>
+#include <glusterfs/xlator.h>
+#include <glusterfs/iobuf.h>
+#include "nfs.h"
+#include <glusterfs/list.h>
+#include "xdr-nfs3.h"
+#include <glusterfs/locking.h>
+#include "nfs3-fh.h"
+#include <glusterfs/compat-uuid.h>
+#include "nlm4-xdr.h"
+#include <glusterfs/lkowner.h>
+
+#define NLM4_NULL 0
+#define NLM4_TEST 1
+#define NLM4_LOCK 2
+#define NLM4_CANCEL 3
+#define NLM4_UNLOCK 4
+#define NLM4_GRANTED 5
+#define NLM4_TEST_MSG 6
+#define NLM4_LOCK_MSG 7
+#define NLM4_CANCEL_MSG 8
+#define NLM4_UNLOCK_MSG 9
+#define NLM4_GRANTED_MSG 10
+#define NLM4_TEST_RES 11
+#define NLM4_LOCK_RES 12
+#define NLM4_CANCEL_RES 13
+#define NLM4_UNLOCK_RES 14
+#define NLM4_GRANTED_RES 15
+#define NLM4_SM_NOTIFY 16
+#define NLM4_SEVENTEEN 17
+#define NLM4_EIGHTEEN 18
+#define NLM4_NINETEEN 19
+#define NLM4_SHARE 20
+#define NLM4_UNSHARE 21
+#define NLM4_NM_LOCK 22
+#define NLM4_FREE_ALL 23
+#define NLM4_PROC_COUNT 24
+
+/* Registered with portmap */
+#define GF_NLM4_PORT 38468
+#define GF_NLM GF_NFS "-NLM"
+#if defined(GF_DARWIN_HOST_OS)
+#define GF_RPC_STATD_PROG "/usr/sbin/rpc.statd"
+#define GF_RPC_STATD_PIDFILE "/var/run/statd.pid"
+#define GF_SM_NOTIFY_PIDFILE "/var/run/statd.notify.pid"
+#elif defined(__NetBSD__)
+#define GF_RPC_STATD_PROG "/usr/sbin/rpc.statd"
+#define GF_RPC_STATD_PIDFILE "/var/run/rpc.statd.pid"
+#define GF_SM_NOTIFY_PIDFILE "/var/run/inexistent.pid"
+#else
+#define GF_RPC_STATD_PROG "/sbin/rpc.statd"
+#define GF_RPC_STATD_PIDFILE "/var/run/rpc.statd.pid"
+#define GF_SM_NOTIFY_PIDFILE "/var/run/sm-notify.pid"
+#endif
+
+extern rpcsvc_program_t *
+nlm4svc_init(xlator_t *nfsx);
+
+extern int
+nlm4_init_state(xlator_t *nfsx);
+
+#define NLM_PROGRAM 100021
+#define NLM_V4 4
+
+/* Fixed-size 1024-byte buffer type for an NLM4 lock owner.
+ * NOTE(review): the struct tag is spelled "lwowner" while the typedef is
+ * "lkowner"; check for users of the tag before renaming it. */
+typedef struct nlm4_lwowner {
+    char temp[1024];
+} nlm4_lkowner_t;
+
+/* One NLM client known to the server, looked up by caller name. */
+typedef struct nlm_client {
+    struct sockaddr_storage sa;
+    pid_t uniq;
+    struct list_head nlm_clients; /* link in the global nlm_client_list */
+    struct list_head fdes;        /* nlm_fde_t entries, via fde_list */
+    struct list_head shares;      /* nlm_share_t entries, via client_list */
+    struct rpc_clnt *rpc_clnt;
+    char *caller_name; /* dumped as the client's hostname by nlm_priv() */
+    int nsm_monitor;
+} nlm_client_t;
+
+/* One share reservation; it sits on two lists at the same time. */
+typedef struct nlm_share {
+    struct list_head client_list; /* link in the owning client's shares */
+    struct list_head inode_list;  /* link in the inode context's shares */
+    gf_lkowner_t lkowner;         /* copied from the request's share.oh */
+    inode_t *inode;               /* referenced while the share exists */
+    fsh_mode mode;                /* share.mode from the request */
+    fsh_access access;            /* share.access from the request */
+} nlm_share_t;
+
+/* Entry on a client's fdes list, tying an fd to that client. */
+typedef struct nlm_fde {
+    struct list_head fde_list; /* link in nlm_client_t.fdes */
+    fd_t *fd;
+    int transit_cnt; /* presumably counts in-flight requests - TODO confirm */
+} nlm_fde_t;
+
+#endif
diff --git a/xlators/nfs/server/src/nlmcbk_svc.c b/xlators/nfs/server/src/nlmcbk_svc.c
new file mode 100644
index 00000000000..eaa7b916190
--- /dev/null
+++ b/xlators/nfs/server/src/nlmcbk_svc.c
@@ -0,0 +1,134 @@
+/*
+  Copyright (c) 2012 Gluster, Inc. <http://www.gluster.com>
+  This file is part of GlusterFS.
+
+  This file is licensed to you under your choice of the GNU Lesser
+  General Public License, version 3 or any later version (LGPLv3 or
+  later), or the GNU General Public License, version 2 (GPLv2), in all
+  cases as published by the Free Software Foundation.
+*/
+
+/*
+ * Please do not edit this file.
+ * It was generated using rpcgen.
+ */
+
+#include "nlm4.h"
+#include <glusterfs/logging.h>
+#include "nfs-messages.h"
+#include <stdio.h>
+#include <stdlib.h>
+#include <rpc/pmap_clnt.h>
+#include <string.h>
+#include <memory.h>
+#include <sys/socket.h>
+#include <netinet/in.h>
+
+#ifndef SIG_PF
+#define SIG_PF void (*)(int)
+#endif
+
+void
+nlm4svc_sm_notify(struct nlm_sm_status *status);
+
+/*
+ * Service routine for the NLMCBK SM_NOTIFY procedure: forwards the status
+ * to nlm4svc_sm_notify(). The req argument is unused and the result is
+ * always NULL.
+ */
+void *
+nlmcbk_sm_notify_0_svc(struct nlm_sm_status *status, struct svc_req *req)
+{
+    nlm4svc_sm_notify(status);
+    return NULL;
+}
+
+/*
+ * Dispatch routine registered for the NLMCBK program by nsm_thread().
+ *
+ * Answers NULLPROC directly, routes NLMCBK_SM_NOTIFY to
+ * nlmcbk_sm_notify_0_svc(), and rejects every other procedure number with
+ * svcerr_noproc(). For SM_NOTIFY the arguments are decoded, the service
+ * routine is called, its result is sent back, and the decoded arguments are
+ * freed.
+ */
+static void
+nlmcbk_program_0(struct svc_req *rqstp, register SVCXPRT *transp)
+{
+    union {
+        struct nlm_sm_status nlmcbk_sm_notify_0_arg;
+    } argument;
+    char *result;
+    xdrproc_t _xdr_argument, _xdr_result;
+    char *(*local)(char *, struct svc_req *);
+
+    switch (rqstp->rq_proc) {
+        case NULLPROC:
+            (void)svc_sendreply(transp, (xdrproc_t)xdr_void, (char *)NULL);
+            return;
+
+        case NLMCBK_SM_NOTIFY:
+            _xdr_argument = (xdrproc_t)xdr_nlm_sm_status;
+            _xdr_result = (xdrproc_t)xdr_void;
+            local = (char *(*)(char *, struct svc_req *))nlmcbk_sm_notify_0_svc;
+            break;
+
+        default:
+            svcerr_noproc(transp);
+            return;
+    }
+    /* Zero the argument storage before the decoder fills it in. */
+    memset((char *)&argument, 0, sizeof(argument));
+    if (!svc_getargs(transp, (xdrproc_t)_xdr_argument, (caddr_t)&argument)) {
+        svcerr_decode(transp);
+        return;
+    }
+    result = (*local)((char *)&argument, rqstp);
+    /* The service routine returns NULL and the result type is xdr_void. */
+    if (!svc_sendreply(transp, (xdrproc_t)_xdr_result, result)) {
+        svcerr_systemerr(transp);
+    }
+
+    if (!svc_freeargs(transp, (xdrproc_t)_xdr_argument, (caddr_t)&argument)) {
+        gf_msg(GF_NLM, GF_LOG_ERROR, 0, NFS_MSG_ARG_FREE_FAIL,
+               "unable to free arguments");
+        return;
+    }
+    return;
+}
+
+/*
+ * Thread entry point for the NSM callback service.
+ *
+ * argv is the nfs xlator; it is installed as THIS for this thread. The
+ * thread clears any previous portmap registration of NLMCBK_PROGRAM /
+ * NLMCBK_V1, registers nlmcbk_program_0() for that program over UDP and
+ * TCP, and then enters svc_run().
+ *
+ * Always returns NULL: after logging the failing setup step, or after
+ * svc_run() has returned.
+ */
+void *
+nsm_thread(void *argv)
+{
+    xlator_t *nfsx = argv;
+    register SVCXPRT *transp;
+    int ret = 0;
+
+    GF_ASSERT(nfsx);
+
+    THIS = nfsx;
+
+    /* A zero result from pmap_unset() is treated as failure here. */
+    ret = pmap_unset(NLMCBK_PROGRAM, NLMCBK_V1);
+    if (ret == 0) {
+        gf_msg(GF_NLM, GF_LOG_ERROR, 0, NFS_MSG_PMAP_UNSET_FAIL,
+               "pmap_unset failed");
+        return NULL;
+    }
+    transp = svcudp_create(RPC_ANYSOCK);
+    if (transp == NULL) {
+        gf_msg(GF_NLM, GF_LOG_ERROR, errno, NFS_MSG_UDP_SERV_FAIL,
+               "cannot create udp service.");
+        return NULL;
+    }
+    /* The messages below name NLMCBK_V1, the version actually registered. */
+    if (!svc_register(transp, NLMCBK_PROGRAM, NLMCBK_V1, nlmcbk_program_0,
+                      IPPROTO_UDP)) {
+        gf_msg(GF_NLM, GF_LOG_ERROR, 0, NFS_MSG_REG_NLMCBK_FAIL,
+               "unable to register (NLMCBK_PROGRAM, "
+               "NLMCBK_V1, udp).");
+        return NULL;
+    }
+
+    transp = svctcp_create(RPC_ANYSOCK, 0, 0);
+    if (transp == NULL) {
+        gf_msg(GF_NLM, GF_LOG_ERROR, errno, NFS_MSG_TCP_SERV_FAIL,
+               "cannot create tcp service.");
+        return NULL;
+    }
+    if (!svc_register(transp, NLMCBK_PROGRAM, NLMCBK_V1, nlmcbk_program_0,
+                      IPPROTO_TCP)) {
+        gf_msg(GF_NLM, GF_LOG_ERROR, 0, NFS_MSG_REG_NLMCBK_FAIL,
+               "unable to register (NLMCBK_PROGRAM, "
+               "NLMCBK_V1, tcp).");
+        return NULL;
+    }
+
+    /* svc_run() is not expected to return; if it does, log it. */
+    svc_run();
+    gf_msg(GF_NLM, GF_LOG_ERROR, 0, NFS_MSG_SVC_RUN_RETURNED,
+           "svc_run returned");
+    return NULL;
+    /* NOTREACHED */
+}
diff --git a/xlators/performance/Makefile.am b/xlators/performance/Makefile.am
index f7504bbe8f3..e95725acb8c 100644
--- a/xlators/performance/Makefile.am
+++ b/xlators/performance/Makefile.am
@@ -1,3 +1,4 @@
-SUBDIRS = write-behind read-ahead io-threads io-cache symlink-cache
+SUBDIRS = write-behind read-ahead readdir-ahead io-threads io-cache \
+	quick-read md-cache open-behind nl-cache
 
 CLEANFILES = 
diff --git a/xlators/performance/io-cache/src/Makefile.am b/xlators/performance/io-cache/src/Makefile.am
index b1bf5bfbf71..bfa34ce5502 100644
--- a/xlators/performance/io-cache/src/Makefile.am
+++ b/xlators/performance/io-cache/src/Makefile.am
@@ -1,14 +1,17 @@
 xlator_LTLIBRARIES = io-cache.la
 xlatordir = $(libdir)/glusterfs/$(PACKAGE_VERSION)/xlator/performance
 
-io_cache_la_LDFLAGS = -module -avoidversion 
+io_cache_la_LDFLAGS = -module $(GF_XLATOR_DEFAULT_LDFLAGS)
 
 io_cache_la_SOURCES = io-cache.c page.c ioc-inode.c
 io_cache_la_LIBADD = $(top_builddir)/libglusterfs/src/libglusterfs.la
 
-noinst_HEADERS = io-cache.h
+noinst_HEADERS = io-cache.h ioc-mem-types.h io-cache-messages.h
 
-AM_CFLAGS = -fPIC -D_FILE_OFFSET_BITS=64 -D_GNU_SOURCE -Wall -D$(GF_HOST_OS) \
-	-I$(top_srcdir)/libglusterfs/src -shared -nostartfiles $(GF_CFLAGS)
+AM_CPPFLAGS = $(GF_CPPFLAGS) -I$(top_srcdir)/libglusterfs/src \
+	-I$(top_srcdir)/rpc/xdr/src -I$(top_builddir)/rpc/xdr/src \
+	-I$(CONTRIBDIR)/rbtree
+
+AM_CFLAGS = -Wall $(GF_CFLAGS)
 
 CLEANFILES = 
diff --git a/xlators/performance/io-cache/src/io-cache-messages.h b/xlators/performance/io-cache/src/io-cache-messages.h
new file mode 100644
index 00000000000..38ad0b14d0e
--- /dev/null
+++ b/xlators/performance/io-cache/src/io-cache-messages.h
@@ -0,0 +1,69 @@
+/*Copyright (c) 2015 Red Hat, Inc. <http://www.redhat.com>
+  This file is part of GlusterFS.
+
+  This file is licensed to you under your choice of the GNU Lesser
+  General Public License, version 3 or any later version (LGPLv3 or
+  later), or the GNU General Public License, version 2 (GPLv2), in all
+  cases as published by the Free Software Foundation.
+*/
+
+#ifndef _IO_CACHE_MESSAGES_H_
+#define _IO_CACHE_MESSAGES_H_
+
+#include <glusterfs/glfs-message-id.h>
+
+/* To add new message IDs, append new identifiers at the end of the list.
+ *
+ * Never remove a message ID. If it's not used anymore, you can rename it or
+ * leave it as it is, but not delete it. This is to prevent reutilization of
+ * IDs by other messages.
+ *
+ * The component name must match one of the entries defined in
+ * glfs-message-id.h.
+ */
+
+GLFS_MSGID(IO_CACHE, IO_CACHE_MSG_ENFORCEMENT_FAILED,
+           IO_CACHE_MSG_INVALID_ARGUMENT,
+           IO_CACHE_MSG_XLATOR_CHILD_MISCONFIGURED, IO_CACHE_MSG_NO_MEMORY,
+           IO_CACHE_MSG_VOL_MISCONFIGURED, IO_CACHE_MSG_INODE_NULL,
+           IO_CACHE_MSG_PAGE_WAIT_VALIDATE, IO_CACHE_MSG_STR_COVERSION_FAILED,
+           IO_CACHE_MSG_WASTED_COPY, IO_CACHE_MSG_SET_FD_FAILED,
+           IO_CACHE_MSG_TABLE_NULL, IO_CACHE_MSG_MEMORY_INIT_FAILED,
+           IO_CACHE_MSG_NO_CACHE_SIZE_OPT, IO_CACHE_MSG_NOT_RECONFIG_CACHE_SIZE,
+           IO_CACHE_MSG_CREATE_MEM_POOL_FAILED,
+           IO_CACHE_MSG_ALLOC_MEM_POOL_FAILED, IO_CACHE_MSG_NULL_PAGE_WAIT,
+           IO_CACHE_MSG_FRAME_NULL, IO_CACHE_MSG_PAGE_FAULT,
+           IO_CACHE_MSG_SERVE_READ_REQUEST, IO_CACHE_MSG_LOCAL_NULL,
+           IO_CACHE_MSG_DEFAULTING_TO_OLD);
+
+#define IO_CACHE_MSG_NO_MEMORY_STR "out of memory"
+#define IO_CACHE_MSG_ENFORCEMENT_FAILED_STR "inode context is NULL"
+#define IO_CACHE_MSG_SET_FD_FAILED_STR "failed to set fd ctx"
+#define IO_CACHE_MSG_TABLE_NULL_STR "table is NULL"
+#define IO_CACHE_MSG_MEMORY_INIT_FAILED_STR "Memory accounting init failed"
+#define IO_CACHE_MSG_NO_CACHE_SIZE_OPT_STR "could not get cache-size option"
+#define IO_CACHE_MSG_INVALID_ARGUMENT_STR                                      \
+    "file size is greater than the max size"
+#define IO_CACHE_MSG_NOT_RECONFIG_CACHE_SIZE_STR "Not reconfiguring cache-size"
+#define IO_CACHE_MSG_XLATOR_CHILD_MISCONFIGURED_STR                            \
+    "FATAL: io-cache not configured with exactly one child"
+#define IO_CACHE_MSG_VOL_MISCONFIGURED_STR "dangling volume. check volfile"
+#define IO_CACHE_MSG_CREATE_MEM_POOL_FAILED_STR                                \
+    "failed to create local_t's memory pool"
+#define IO_CACHE_MSG_ALLOC_MEM_POOL_FAILED_STR "Unable to allocate mem_pool"
+#define IO_CACHE_MSG_STR_COVERSION_FAILED_STR                                  \
+    "asprintf failed while converting prt to str"
+#define IO_CACHE_MSG_INODE_NULL_STR "ioc_inode is NULL"
+#define IO_CACHE_MSG_PAGE_WAIT_VALIDATE_STR                                    \
+    "cache validate called without any page waiting to be validated"
+#define IO_CACHE_MSG_NULL_PAGE_WAIT_STR "asked to wait on a NULL page"
+#define IO_CACHE_MSG_WASTED_COPY_STR "wasted copy"
+#define IO_CACHE_MSG_FRAME_NULL_STR "frame>root>rsp_refs is null"
+#define IO_CACHE_MSG_PAGE_FAULT_STR "page fault on a NULL frame"
+#define IO_CACHE_MSG_SERVE_READ_REQUEST_STR                                    \
+    "NULL page has been provided to serve read request"
+#define IO_CACHE_MSG_LOCAL_NULL_STR "local is NULL"
+#define IO_CACHE_MSG_DEFAULTING_TO_OLD_STR                                     \
+    "minimum size of file that can be cached is greater than maximum size. "   \
+    "Hence Defaulting to old value"
+#endif /* _IO_CACHE_MESSAGES_H_ */
diff --git a/xlators/performance/io-cache/src/io-cache.c b/xlators/performance/io-cache/src/io-cache.c
index 55dfa5ac6ba..9375d29c17f 100644
--- a/xlators/performance/io-cache/src/io-cache.c
+++ b/xlators/performance/io-cache/src/io-cache.c
@@ -1,406 +1,358 @@
 /*
-  Copyright (c) 2007-2009 Z RESEARCH, Inc. <http://www.zresearch.com>
+  Copyright (c) 2008-2012 Red Hat, Inc. <http://www.redhat.com>
   This file is part of GlusterFS.
 
-  GlusterFS is free software; you can redistribute it and/or modify
-  it under the terms of the GNU General Public License as published
-  by the Free Software Foundation; either version 3 of the License,
-  or (at your option) any later version.
-
-  GlusterFS is distributed in the hope that it will be useful, but
-  WITHOUT ANY WARRANTY; without even the implied warranty of
-  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
-  General Public License for more details.
-
-  You should have received a copy of the GNU General Public License
-  along with this program.  If not, see
-  <http://www.gnu.org/licenses/>.
+  This file is licensed to you under your choice of the GNU Lesser
+  General Public License, version 3 or any later version (LGPLv3 or
+  later), or the GNU General Public License, version 2 (GPLv2), in all
+  cases as published by the Free Software Foundation.
 */
 
-#ifndef _CONFIG_H
-#define _CONFIG_H
-#include "config.h"
-#endif
-
-#include "glusterfs.h"
-#include "logging.h"
-#include "dict.h"
-#include "xlator.h"
+#include <math.h>
+#include <glusterfs/glusterfs.h>
+#include <glusterfs/logging.h>
+#include <glusterfs/dict.h>
+#include <glusterfs/xlator.h>
 #include "io-cache.h"
+#include "ioc-mem-types.h"
+#include <glusterfs/statedump.h>
 #include <assert.h>
 #include <sys/time.h>
+#include "io-cache-messages.h"
+int ioc_log2_page_size;
 
-static uint32_t
-ioc_get_priority (ioc_table_t *table, 
-		  const char *path);
+uint32_t
+ioc_get_priority(ioc_table_t *table, const char *path);
+
+struct volume_options options[];
 
 static uint32_t
-ioc_get_priority (ioc_table_t *table, 
-		  const char *path);
+ioc_hashfn(void *data, int len)
+{
+    off_t offset;
+
+    offset = *(off_t *)data;
 
-static inline ioc_inode_t *
+    return (offset >> ioc_log2_page_size);
+}
+
+/* TODO: This function is not used, uncomment when we find a
+         usage for this function.
+
+static ioc_inode_t *
 ioc_inode_reupdate (ioc_inode_t *ioc_inode)
 {
-	ioc_table_t *table = ioc_inode->table;
+        ioc_table_t *table = NULL;
+
+        table = ioc_inode->table;
 
-	list_add_tail (&ioc_inode->inode_lru, 
-		       &table->inode_lru[ioc_inode->weight]);
-  
-	return ioc_inode;
+        list_add_tail (&ioc_inode->inode_lru,
+                       &table->inode_lru[ioc_inode->weight]);
+
+        return ioc_inode;
 }
 
-static inline ioc_inode_t *
-ioc_get_inode (dict_t *dict,
-	       char *name)
+
+static ioc_inode_t *
+ioc_get_inode (dict_t *dict, char *name)
 {
-	ioc_inode_t *ioc_inode = NULL;
-	data_t *ioc_inode_data = dict_get (dict, name);
-	ioc_table_t *table = NULL;
-
-	if (ioc_inode_data) {
-		ioc_inode = data_to_ptr (ioc_inode_data);
-		table = ioc_inode->table;
-
-		ioc_table_lock (table);
-		{
-			if (list_empty (&ioc_inode->inode_lru)) {
-				ioc_inode = ioc_inode_reupdate (ioc_inode);
-			}
-		}
-		ioc_table_unlock (table);
-	}
-  
-	return ioc_inode;
+        ioc_inode_t *ioc_inode      = NULL;
+        data_t      *ioc_inode_data = NULL;
+        ioc_table_t *table          = NULL;
+
+        ioc_inode_data = dict_get (dict, name);
+        if (ioc_inode_data) {
+                ioc_inode = data_to_ptr (ioc_inode_data);
+                table = ioc_inode->table;
+
+                ioc_table_lock (table);
+                {
+                        if (list_empty (&ioc_inode->inode_lru)) {
+                                ioc_inode = ioc_inode_reupdate (ioc_inode);
+                        }
+                }
+                ioc_table_unlock (table);
+        }
+
+        return ioc_inode;
 }
+*/
 
-int32_t
-ioc_inode_need_revalidate (ioc_inode_t *ioc_inode)
+int
+ioc_update_pages(call_frame_t *frame, ioc_inode_t *ioc_inode,
+                 struct iovec *vector, int32_t count, int op_ret, off_t offset)
 {
-	int8_t need_revalidate = 0;
-	struct timeval tv = {0,};
-	int32_t ret = -1;
-	ioc_table_t *table = ioc_inode->table;
+    size_t size = 0;
+    off_t rounded_offset = 0, rounded_end = 0, trav_offset = 0,
+          write_offset = 0;
+    off_t page_offset = 0, page_end = 0;
+    ioc_page_t *trav = NULL;
+
+    size = iov_length(vector, count);
+    size = min(size, op_ret);
+
+    rounded_offset = gf_floor(offset, ioc_inode->table->page_size);
+    rounded_end = gf_roof(offset + size, ioc_inode->table->page_size);
+
+    trav_offset = rounded_offset;
+    ioc_inode_lock(ioc_inode);
+    {
+        while (trav_offset < rounded_end) {
+            trav = __ioc_page_get(ioc_inode, trav_offset);
+            if (trav && trav->ready) {
+                if (trav_offset == rounded_offset)
+                    page_offset = offset - rounded_offset;
+                else
+                    page_offset = 0;
+
+                if ((trav_offset + ioc_inode->table->page_size) >=
+                    rounded_end) {
+                    page_end = trav->size - (rounded_end - (offset + size));
+                } else {
+                    page_end = trav->size;
+                }
+
+                iov_range_copy(trav->vector, trav->count, page_offset, vector,
+                               count, write_offset, page_end - page_offset);
+            } else if (trav) {
+                if (!trav->waitq)
+                    ioc_inode->table->cache_used -= __ioc_page_destroy(trav);
+            }
+
+            if (trav_offset == rounded_offset)
+                write_offset += (ioc_inode->table->page_size -
+                                 (offset - rounded_offset));
+            else
+                write_offset += ioc_inode->table->page_size;
+
+            trav_offset += ioc_inode->table->page_size;
+        }
+    }
+    ioc_inode_unlock(ioc_inode);
+
+    return 0;
+}
 
-	ret = gettimeofday (&tv, NULL);
+static gf_boolean_t
+ioc_inode_need_revalidate(ioc_inode_t *ioc_inode)
+{
+    ioc_table_t *table = NULL;
 
-	if (time_elapsed (&tv, &ioc_inode->tv) >= table->cache_timeout)
-		need_revalidate = 1;
+    GF_ASSERT(ioc_inode);
+    table = ioc_inode->table;
+    GF_ASSERT(table);
 
-	return need_revalidate;
+    return (gf_time() - ioc_inode->cache.last_revalidate >=
+            table->cache_timeout);
 }
 
 /*
  * __ioc_inode_flush - flush all the cached pages of the given inode
  *
- * @ioc_inode: 
+ * @ioc_inode:
  *
  * assumes lock is held
  */
-int32_t
-__ioc_inode_flush (ioc_inode_t *ioc_inode)
+int64_t
+__ioc_inode_flush(ioc_inode_t *ioc_inode)
 {
-	ioc_page_t *curr = NULL, *next = NULL;
-	int32_t destroy_size = 0;
-	int32_t ret = 0;
-
-	list_for_each_entry_safe (curr, next, &ioc_inode->pages, pages) {
-		ret = ioc_page_destroy (curr);
-    
-		if (ret != -1) 
-			destroy_size += ret;
-	}
-  
-	return destroy_size;
+    ioc_page_t *curr = NULL, *next = NULL;
+    int64_t destroy_size = 0;
+    int64_t ret = 0;
+
+    list_for_each_entry_safe(curr, next, &ioc_inode->cache.page_lru, page_lru)
+    {
+        ret = __ioc_page_destroy(curr);
+
+        if (ret != -1)
+            destroy_size += ret;
+    }
+
+    return destroy_size;
 }
 
 void
-ioc_inode_flush (ioc_inode_t *ioc_inode)
+ioc_inode_flush(ioc_inode_t *ioc_inode)
 {
-	int32_t destroy_size = 0;    
-
-	ioc_inode_lock (ioc_inode);
-	{
-		destroy_size = __ioc_inode_flush (ioc_inode);
-	}
-	ioc_inode_unlock (ioc_inode);
-  
-	if (destroy_size) {
-		ioc_table_lock (ioc_inode->table);
-		{
-			ioc_inode->table->cache_used -= destroy_size;
-		}
-		ioc_table_unlock (ioc_inode->table);
-	}
-
-	return;
+    int64_t destroy_size = 0;
+
+    ioc_inode_lock(ioc_inode);
+    {
+        destroy_size = __ioc_inode_flush(ioc_inode);
+    }
+    ioc_inode_unlock(ioc_inode);
+
+    if (destroy_size) {
+        ioc_table_lock(ioc_inode->table);
+        {
+            ioc_inode->table->cache_used -= destroy_size;
+        }
+        ioc_table_unlock(ioc_inode->table);
+    }
+
+    return;
 }
 
-/* 
- * ioc_utimens_cbk -
- * 
- * @frame:
- * @cookie:
- * @this:
- * @op_ret:
- * @op_errno:
- * @stbuf:
- *
- */
 int32_t
-ioc_utimens_cbk (call_frame_t *frame,
-		 void *cookie,
-		 xlator_t *this,
-		 int32_t op_ret,
-		 int32_t op_errno,
-		 struct stat *stbuf)
+ioc_setattr_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+                int32_t op_ret, int32_t op_errno, struct iatt *preop,
+                struct iatt *postop, dict_t *xdata)
 {
-	STACK_UNWIND (frame, op_ret, op_errno, stbuf);
-	return 0;
+    STACK_UNWIND_STRICT(setattr, frame, op_ret, op_errno, preop, postop, xdata);
+    return 0;
 }
 
-/* 
- * ioc_utimens -
- * 
- * @frame:
- * @this:
- * @loc:
- * @tv:
- *
- */
 int32_t
-ioc_utimens (call_frame_t *frame,
-	     xlator_t *this,
-	     loc_t *loc,
-	     struct timespec *tv)
+ioc_setattr(call_frame_t *frame, xlator_t *this, loc_t *loc, struct iatt *stbuf,
+            int32_t valid, dict_t *xdata)
 {
-	uint64_t ioc_inode = 0;
-	inode_ctx_get (loc->inode, this, &ioc_inode);
+    uint64_t ioc_inode = 0;
 
-	if (ioc_inode)
-		ioc_inode_flush ((ioc_inode_t *)(long)ioc_inode);
+    inode_ctx_get(loc->inode, this, &ioc_inode);
 
-	STACK_WIND (frame, ioc_utimens_cbk,
-		    FIRST_CHILD (this),
-		    FIRST_CHILD (this)->fops->utimens,
-		    loc, tv);
-	return 0;
+    if (ioc_inode &&
+        ((valid & GF_SET_ATTR_ATIME) || (valid & GF_SET_ATTR_MTIME)))
+        ioc_inode_flush((ioc_inode_t *)(long)ioc_inode);
+
+    STACK_WIND(frame, ioc_setattr_cbk, FIRST_CHILD(this),
+               FIRST_CHILD(this)->fops->setattr, loc, stbuf, valid, xdata);
+
+    return 0;
 }
 
 int32_t
-ioc_lookup_cbk (call_frame_t *frame,
-		void *cookie,
-		xlator_t *this,
-		int32_t op_ret,
-		int32_t op_errno,
-		inode_t *inode,
-		struct stat *stbuf,
-		dict_t *dict)
+ioc_inode_update(xlator_t *this, inode_t *inode, char *path, struct iatt *iabuf)
 {
-	ioc_inode_t *ioc_inode = NULL;
-	ioc_local_t *local = frame->local;
-	ioc_table_t *table = this->private;
-	ioc_page_t  *page = NULL;
-	data_t      *page_data = NULL;
-	data_t      *content_data = NULL;
-	char        *src = NULL;
-	char        *dst = NULL;
-	char         need_unref = 0;
-	uint8_t      cache_still_valid = 0;
-	uint32_t     weight = 0;
-	uint64_t     tmp_ioc_inode = 0;
-	char        *buf = NULL;
-	char        *tmp = NULL;
-	int          i;
-	
-	if (op_ret != 0) 
-		goto out;
-
-	inode_ctx_get (inode, this, &tmp_ioc_inode);
-	ioc_inode = (ioc_inode_t *)(long)tmp_ioc_inode;
-	if (ioc_inode) {
-		cache_still_valid = ioc_cache_still_valid (ioc_inode, 
-							   stbuf);
-		
-		if (!cache_still_valid) {
-			ioc_inode_flush (ioc_inode);
-		} 
-		/* update the time-stamp of revalidation */
-		ioc_inode_lock (ioc_inode);
-		{
-			gettimeofday (&ioc_inode->tv, NULL);
-		}
-		ioc_inode_unlock (ioc_inode);
-		
-		ioc_table_lock (ioc_inode->table);
-		{
-			list_move_tail (&ioc_inode->inode_lru,
-					&table->inode_lru[ioc_inode->weight]);
-		}
-		ioc_table_unlock (ioc_inode->table);
-	}
-	
-	if (local && stbuf->st_size && 
-	    local->need_xattr >= stbuf->st_size) {
-		if (!ioc_inode) {
-			weight = ioc_get_priority (table, 
-						   local->file_loc.path);
-			ioc_inode = ioc_inode_update (table, 
-						      inode, weight);
-			inode_ctx_put (inode, this, 
-				       (uint64_t)(long)ioc_inode);
-		}
-		
-		ioc_inode_lock (ioc_inode);
-		{
-			content_data = dict_get (dict, "glusterfs.content");
-			page = ioc_page_get (ioc_inode, 0);
-			
-			if (content_data) {
-				if (page) {
-					dict_unref (page->ref);
-					free (page->vector);
-					page->vector = NULL;
-					
-					ioc_table_lock (table);
-					{
-						table->cache_used -= 
-							page->size;
-					}
-					ioc_table_unlock (table);
-				} else {
-					page = ioc_page_create (ioc_inode, 0);
-				}
-				
-				dst = CALLOC (1, stbuf->st_size);
-				page->ref = dict_ref (get_new_dict ());
-				page_data = data_from_dynptr (dst, 
-							      stbuf->st_size);
-				dict_set (page->ref, NULL, page_data);
-				
-				src = data_to_ptr (content_data);
-				memcpy (dst, src, stbuf->st_size);
-
-				page->vector = CALLOC (1, 
-						       sizeof (*page->vector));
-				page->vector->iov_base = dst;
-				page->vector->iov_len = stbuf->st_size;
-				page->count = 1;
-      
-				page->waitq = NULL;
-				page->size = stbuf->st_size;
-				page->ready = 1;
-
-				ioc_table_lock (table);
-				{
-					table->cache_used += page->size;
-				}
-				ioc_table_unlock (table);
-				
-			} else {
-				if (!(page && page->ready)) {
-					gf_log (this->name, GF_LOG_DEBUG,
-						"page not present");
-					
-					ioc_inode_unlock (ioc_inode);
-					STACK_WIND (frame,
-						    ioc_lookup_cbk,
-						    FIRST_CHILD (this),
-						    FIRST_CHILD (this)->fops->lookup,
-						    &local->file_loc,
-						    local->xattr_req);
-					return 0;
-				} 
-				buf = CALLOC (1, stbuf->st_size);
-				tmp = buf;
-
-				for (i = 0; i < page->count; i++) {
-					memcpy (tmp, page->vector[i].iov_base, 
-						page->vector[i].iov_len);
-					tmp += page->vector[i].iov_len;
-				}
-				
-				gf_log (this->name, GF_LOG_DEBUG,
-					"serving file %s from cache", 
-					local->file_loc.path);
-				
-				if (!dict) {
-					need_unref = 1;
-					dict = dict_ref (
-						get_new_dict ());
-				}
-				dict_set (dict, "glusterfs.content",
-					  data_from_dynptr (buf, 
-							    stbuf->st_size));
-			}
-
-			ioc_inode->mtime = stbuf->st_mtime;
-			gettimeofday (&ioc_inode->tv, NULL);
-		}
-		ioc_inode_unlock (ioc_inode);
-		
-		if (content_data && 
-		    ioc_need_prune (ioc_inode->table)) {
-			ioc_prune (ioc_inode->table);
-		}
-	}
-
- out:
-	STACK_UNWIND (frame, op_ret, op_errno, inode, stbuf, dict);
-
-	if (need_unref) {
-		dict_unref (dict);
-	}
-
-	return 0;
+    ioc_table_t *table = NULL;
+    uint64_t tmp_ioc_inode = 0;
+    ioc_inode_t *ioc_inode = NULL;
+    uint32_t weight = 0xffffffff;
+
+    if (!this || !inode)
+        goto out;
+
+    table = this->private;
+
+    /* Fetch this xlator's per-inode context, creating it on first use. */
+    LOCK(&inode->lock);
+    {
+        (void)__inode_ctx_get(inode, this, &tmp_ioc_inode);
+        ioc_inode = (ioc_inode_t *)(long)tmp_ioc_inode;
+
+        if (!ioc_inode) {
+            weight = ioc_get_priority(table, path);
+            ioc_inode = ioc_inode_create(table, inode, weight);
+            (void)__inode_ctx_put(inode, this, (uint64_t)(long)ioc_inode);
+        }
+    }
+    UNLOCK(&inode->lock);
+
+    /* NOTE(review): assumes ioc_inode_create() can return NULL on failure. */
+    if (!ioc_inode)
+        goto out;
+
+    ioc_inode_lock(ioc_inode);
+    {
+        /* Seed the cached mtime only if none has been recorded yet. */
+        if (ioc_inode->cache.mtime == 0) {
+            ioc_inode->cache.mtime = iabuf->ia_mtime;
+            ioc_inode->cache.mtime_nsec = iabuf->ia_mtime_nsec;
+        }
+        ioc_inode->ia_size = iabuf->ia_size;
+    }
+    ioc_inode_unlock(ioc_inode);
+
+    if (!ioc_cache_still_valid(ioc_inode, iabuf))
+        ioc_inode_flush(ioc_inode);
+
+    /* Move the inode to the tail of the LRU list for its weight. */
+    ioc_table_lock(ioc_inode->table);
+    {
+        list_move_tail(&ioc_inode->inode_lru,
+                       &table->inode_lru[ioc_inode->weight]);
+    }
+    ioc_table_unlock(ioc_inode->table);
+
+out:
+    return 0;
 }
 
-int32_t 
-ioc_lookup (call_frame_t *frame,
-	    xlator_t *this,
-	    loc_t *loc,
-	    dict_t *xattr_req)
+int32_t
+ioc_lookup_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+               int32_t op_ret, int32_t op_errno, inode_t *inode,
+               struct iatt *stbuf, dict_t *xdata, struct iatt *postparent)
 {
-	uint64_t content_limit = 0;
-
-	if (GF_FILE_CONTENT_REQUESTED(xattr_req, &content_limit)) {
-		uint64_t     tmp_ioc_inode = 0;
-		ioc_inode_t *ioc_inode = NULL;
-		ioc_page_t  *page = NULL;
-		ioc_local_t *local = CALLOC (1, sizeof (*local));
-
-		local->need_xattr = content_limit;
-		local->file_loc.path = loc->path;
-		local->file_loc.inode = loc->inode;
-		frame->local = local;
-
-		inode_ctx_get (loc->inode, this, &tmp_ioc_inode);
-		ioc_inode = (ioc_inode_t *)(long)tmp_ioc_inode;
-
-		if (ioc_inode) {
-			ioc_inode_lock (ioc_inode);
-			{
-				page = ioc_page_get (ioc_inode, 0);
-				if ((content_limit <= 
-				     ioc_inode->table->page_size) && 
-				    page && page->ready) {
-					local->need_xattr = -1;
-				}
-			}
-			ioc_inode_unlock (ioc_inode);
-		}
-	}
-
-	STACK_WIND (frame,
-		    ioc_lookup_cbk,
-		    FIRST_CHILD (this),
-		    FIRST_CHILD (this)->fops->lookup,
-		    loc,
-		    xattr_req);
-	return 0;
+    ioc_local_t *local = NULL;
+
+    if (op_ret != 0)
+        goto out;
+
+    local = frame->local;
+    if (local == NULL) {
+        op_ret = -1;
+        op_errno = EINVAL;
+        goto out;
+    }
+
+    if (!this || !this->private) {
+        op_ret = -1;
+        op_errno = EINVAL;
+        goto out;
+    }
+
+    ioc_inode_update(this, inode, (char *)local->file_loc.path, stbuf);
+
+out:
+    if (frame->local != NULL) {
+        local = frame->local;
+        loc_wipe(&local->file_loc);
+    }
+
+    STACK_UNWIND_STRICT(lookup, frame, op_ret, op_errno, inode, stbuf, xdata,
+                        postparent);
+    return 0;
+}
+
+int32_t
+ioc_lookup(call_frame_t *frame, xlator_t *this, loc_t *loc, dict_t *xdata)
+{
+    ioc_local_t *local = NULL;
+    int32_t op_errno = -1, ret = -1;
+    /* Wind lookup to the child, keeping a copy of loc in frame->local. */
+    local = mem_get0(this->local_pool);
+    if (local == NULL) {
+        op_errno = ENOMEM;
+        gf_smsg(this->name, GF_LOG_ERROR, 0, IO_CACHE_MSG_NO_MEMORY, NULL);
+        goto unwind;
+    }
+
+    ret = loc_copy(&local->file_loc, loc);
+    if (ret != 0) {
+        op_errno = ENOMEM; /* any loc_copy() failure is reported as ENOMEM */
+        gf_smsg(this->name, GF_LOG_ERROR, 0, IO_CACHE_MSG_NO_MEMORY, NULL);
+        goto unwind;
+    }
+    /* ioc_lookup_cbk reads local->file_loc.path, then wipes file_loc. */
+    frame->local = local;
+
+    STACK_WIND(frame, ioc_lookup_cbk, FIRST_CHILD(this),
+               FIRST_CHILD(this)->fops->lookup, loc, xdata);
+
+    return 0;
+
+unwind:
+    if (local != NULL) {
+        loc_wipe(&local->file_loc);
+        mem_put(local);
+    }
+    /* Fail the lookup towards the caller with op_ret -1 and op_errno. */
+    STACK_UNWIND_STRICT(lookup, frame, -1, op_errno, NULL, NULL, NULL, NULL);
+
+    return 0;
 }
 
 /*
- * ioc_forget - 
+ * ioc_forget -
  *
  * @frame:
  * @this:
@@ -408,22 +360,33 @@ ioc_lookup (call_frame_t *frame,
  *
  */
 int32_t
-ioc_forget (xlator_t *this,
-	    inode_t *inode)
+ioc_forget(xlator_t *this, inode_t *inode)
 {
-	uint64_t ioc_inode = 0;
+    uint64_t ioc_inode = 0;
+
+    inode_ctx_get(inode, this, &ioc_inode);
 
-	inode_ctx_get (inode, this, &ioc_inode);
+    if (ioc_inode)
+        ioc_inode_destroy((ioc_inode_t *)(long)ioc_inode);
 
-	if (ioc_inode)
-		ioc_inode_destroy ((ioc_inode_t *)(long)ioc_inode);
-	    
-	return 0;
+    return 0;
 }
 
+static int32_t
+ioc_invalidate(xlator_t *this, inode_t *inode)
+{
+    uint64_t ioc_inode = 0;
+
+    inode_ctx_get(inode, this, &ioc_inode);
+
+    if (ioc_inode)
+        ioc_inode_flush((ioc_inode_t *)(uintptr_t)ioc_inode);
 
-/* 
- * ioc_cache_validate_cbk - 
+    return 0;
+}
+
+/*
+ * ioc_cache_validate_cbk -
  *
  * @frame:
  * @cookie:
@@ -434,94 +397,103 @@ ioc_forget (xlator_t *this,
  *
  */
 int32_t
-ioc_cache_validate_cbk (call_frame_t *frame,
-			void *cookie,
-			xlator_t *this,
-			int32_t op_ret,
-			int32_t op_errno,
-			struct stat *stbuf)
+ioc_cache_validate_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+                       int32_t op_ret, int32_t op_errno, struct iatt *stbuf,
+                       dict_t *xdata)
 {
-	ioc_local_t *local = frame->local;
-	ioc_inode_t *ioc_inode = NULL;
-	size_t destroy_size = 0;
-	struct stat *local_stbuf = stbuf;
-
-	ioc_inode = local->inode;
-
-	if ((op_ret == -1) || 
-	    ((op_ret >= 0) && !ioc_cache_still_valid(ioc_inode, stbuf))) {
-		gf_log (ioc_inode->table->xl->name, GF_LOG_DEBUG,
-			"cache for inode(%p) is invalid. flushing all pages",
-			ioc_inode);
-		/* NOTE: only pages with no waiting frames are flushed by 
-		 * ioc_inode_flush. page_fault will be generated for all 
-		 * the pages which have waiting frames by ioc_inode_wakeup()
-		 */
-		ioc_inode_lock (ioc_inode);
-		{
-			destroy_size = __ioc_inode_flush (ioc_inode);
-			if (op_ret >= 0)
-				ioc_inode->mtime = stbuf->st_mtime;
-		}
-		ioc_inode_unlock (ioc_inode);
-		local_stbuf = NULL;
-	}
-
-	if (destroy_size) {
-		ioc_table_lock (ioc_inode->table);
-		{
-			ioc_inode->table->cache_used -= destroy_size;
-		}
-		ioc_table_unlock (ioc_inode->table);
-	}
-
-	if (op_ret < 0)
-		local_stbuf = NULL;
-  
-	ioc_inode_lock (ioc_inode);
-	{
-		gettimeofday (&ioc_inode->tv, NULL);
-	}
-	ioc_inode_unlock (ioc_inode);
-
-	ioc_inode_wakeup (frame, ioc_inode, local_stbuf);
-  
-	/* any page-fault initiated by ioc_inode_wakeup() will have its own 
-	 * fd_ref on fd, safe to unref validate frame's private copy 
-	 */
-	fd_unref (local->fd);
-
-	STACK_DESTROY (frame->root);
-
-	return 0;
+    ioc_local_t *local = NULL;
+    ioc_inode_t *ioc_inode = NULL;
+    size_t destroy_size = 0;
+    struct iatt *local_stbuf = NULL;
+
+    local = frame->local;
+    ioc_inode = local->inode;
+    local_stbuf = stbuf;
+
+    if ((op_ret == -1) ||
+        ((op_ret >= 0) && !ioc_cache_still_valid(ioc_inode, stbuf))) {
+        gf_msg_debug(ioc_inode->table->xl->name, 0,
+                     "cache for inode(%p) is invalid. flushing all pages",
+                     ioc_inode);
+        /* NOTE: only pages with no waiting frames are flushed by
+         * ioc_inode_flush. page_fault will be generated for all
+         * the pages which have waiting frames by ioc_inode_wakeup()
+         */
+        ioc_inode_lock(ioc_inode);
+        {
+            destroy_size = __ioc_inode_flush(ioc_inode);
+            if (op_ret >= 0) {
+                ioc_inode->cache.mtime = stbuf->ia_mtime;
+                ioc_inode->cache.mtime_nsec = stbuf->ia_mtime_nsec;
+            }
+        }
+        ioc_inode_unlock(ioc_inode);
+        local_stbuf = NULL;
+    }
+
+    if (destroy_size) {
+        ioc_table_lock(ioc_inode->table);
+        {
+            ioc_inode->table->cache_used -= destroy_size;
+        }
+        ioc_table_unlock(ioc_inode->table);
+    }
+
+    if (op_ret < 0)
+        local_stbuf = NULL;
+
+    ioc_inode_lock(ioc_inode);
+    {
+        ioc_inode->cache.last_revalidate = gf_time();
+    }
+    ioc_inode_unlock(ioc_inode);
+
+    ioc_inode_wakeup(frame, ioc_inode, local_stbuf);
+
+    /* any page-fault initiated by ioc_inode_wakeup() will have its own
+     * fd_ref on fd, safe to unref validate frame's private copy
+     */
+    fd_unref(local->fd);
+    dict_unref(local->xattr_req);
+
+    STACK_DESTROY(frame->root);
+
+    return 0;
 }
 
-static int32_t
-ioc_wait_on_inode (ioc_inode_t *ioc_inode, 
-		   ioc_page_t *page)
+int32_t
+ioc_wait_on_inode(ioc_inode_t *ioc_inode, ioc_page_t *page)
 {
-	ioc_waitq_t *waiter = NULL, *trav = NULL;
-	uint32_t page_found = 0;
-
-	trav = ioc_inode->waitq;
-
-	while (trav) {
-		if (trav->data == page) {
-			page_found = 1;
-			break;
-		}
-		trav = trav->next;
-	}
-  
-	if (!page_found) {
-		waiter = CALLOC (1, sizeof (ioc_waitq_t));
-		ERR_ABORT (waiter);
-		waiter->data = page;
-		waiter->next = ioc_inode->waitq;
-		ioc_inode->waitq = waiter;
-	}
-  
-	return 0;
+    ioc_waitq_t *waiter = NULL, *trav = NULL;
+    uint32_t page_found = 0;
+    int32_t ret = 0;
+
+    trav = ioc_inode->waitq;
+
+    while (trav) {
+        if (trav->data == page) {
+            page_found = 1;
+            break;
+        }
+        trav = trav->next;
+    }
+
+    if (!page_found) {
+        waiter = GF_CALLOC(1, sizeof(ioc_waitq_t), gf_ioc_mt_ioc_waitq_t);
+        if (waiter == NULL) {
+            gf_smsg(ioc_inode->table->xl->name, GF_LOG_ERROR, ENOMEM,
+                    IO_CACHE_MSG_NO_MEMORY, NULL);
+            ret = -ENOMEM;
+            goto out;
+        }
+
+        waiter->data = page;
+        waiter->next = ioc_inode->waitq;
+        ioc_inode->waitq = waiter;
+    }
+
+out:
+    return ret;
 }
 
 /*
@@ -532,61 +504,81 @@ ioc_wait_on_inode (ioc_inode_t *ioc_inode,
  * @fd:
  *
  */
-static int32_t
-ioc_cache_validate (call_frame_t *frame,
-		    ioc_inode_t *ioc_inode,
-		    fd_t *fd,
-		    ioc_page_t *page)
+int32_t
+ioc_cache_validate(call_frame_t *frame, ioc_inode_t *ioc_inode, fd_t *fd,
+                   ioc_page_t *page)
 {
-	call_frame_t *validate_frame = NULL;
-	ioc_local_t *validate_local = NULL;
-
-	validate_local = CALLOC (1, sizeof (ioc_local_t));
-	ERR_ABORT (validate_local);
-	validate_frame = copy_frame (frame);
-	validate_local->fd = fd_ref (fd);
-	validate_local->inode = ioc_inode;
-	validate_frame->local = validate_local;
-    
-	STACK_WIND (validate_frame,
-		    ioc_cache_validate_cbk,
-		    FIRST_CHILD (frame->this),
-		    FIRST_CHILD (frame->this)->fops->fstat,
-		    fd);
-
-	return 0;
+    call_frame_t *validate_frame = NULL;
+    ioc_local_t *validate_local = NULL;
+    ioc_local_t *local = NULL;
+
+    local = frame->local;
+    validate_local = mem_get0(THIS->local_pool);
+    if (validate_local == NULL)
+        goto enomem;
+
+    validate_frame = copy_frame(frame);
+    if (validate_frame == NULL) {
+        mem_put(validate_local);
+        goto enomem;
+    }
+
+    /* The validate frame takes its own refs on fd and on the request's
+     * xattr_req (if any). */
+    validate_local->fd = fd_ref(fd);
+    validate_local->inode = ioc_inode;
+    if (local && local->xattr_req)
+        validate_local->xattr_req = dict_ref(local->xattr_req);
+    validate_frame->local = validate_local;
+
+    STACK_WIND(validate_frame, ioc_cache_validate_cbk, FIRST_CHILD(frame->this),
+               FIRST_CHILD(frame->this)->fops->fstat, fd,
+               validate_local->xattr_req);
+
+    return 0;
+
+enomem:
+    /* The success path tolerates a NULL frame->local, so this one must too. */
+    if (local) {
+        local->op_ret = -1;
+        local->op_errno = ENOMEM;
+    }
+    gf_smsg(ioc_inode->table->xl->name, GF_LOG_ERROR, 0,
+            IO_CACHE_MSG_NO_MEMORY, NULL);
+
+    return -1;
 }
 
-static inline uint32_t
-is_match (const char *path,
-	  const char *pattern)
+static uint32_t
+is_match(const char *path, const char *pattern)
 {
-	char *pathname = strdup (path);
-	int32_t ret = 0;
-
-	ret = fnmatch (pattern, path, FNM_NOESCAPE);
-  
-	free (pathname);
-  
-	return (ret == 0);
+    int32_t ret = 0;
+
+    ret = fnmatch(pattern, path, FNM_NOESCAPE);
+
+    return (ret == 0);
 }
 
-static uint32_t
-ioc_get_priority (ioc_table_t *table, 
-		  const char *path)
+uint32_t
+ioc_get_priority(ioc_table_t *table, const char *path)
 {
-	uint32_t priority = 0;
-	struct ioc_priority *curr = NULL;
-  
-	list_for_each_entry (curr, &table->priority_list, list) {
-		if (is_match (path, curr->pattern)) 
-			priority = curr->priority;
-	}
-
-	return priority;
+    uint32_t priority = 1;
+    struct ioc_priority *curr = NULL;
+
+    if (list_empty(&table->priority_list) || !path)
+        return priority;
+
+    priority = 0;
+    list_for_each_entry(curr, &table->priority_list, list)
+    {
+        if (is_match(path, curr->pattern))
+            priority = curr->priority;
+    }
+
+    return priority;
 }
 
-/* 
+/*
  * ioc_open_cbk - open callback for io cache
  *
  * @frame: call frame
@@ -598,72 +590,68 @@ ioc_get_priority (ioc_table_t *table,
  *
  */
 int32_t
-ioc_open_cbk (call_frame_t *frame,
-	      void *cookie,
-	      xlator_t *this,
-	      int32_t op_ret,
-	      int32_t op_errno,
-	      fd_t *fd)
+ioc_open_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret,
+             int32_t op_errno, fd_t *fd, dict_t *xdata)
 {
-	uint64_t     tmp_ioc_inode = 0;
-	ioc_local_t *local = frame->local;
-	ioc_table_t *table = this->private;
-	ioc_inode_t *ioc_inode = NULL;
-	inode_t *inode = local->file_loc.inode;
-	uint32_t weight = 0;
-	const char *path = local->file_loc.path;
-
-	if (op_ret != -1) {
-		/* look for ioc_inode corresponding to this fd */
-		LOCK (&fd->inode->lock);
-		//{
-
-		inode_ctx_get (fd->inode, this, &tmp_ioc_inode);
-		ioc_inode = (ioc_inode_t *)(long)tmp_ioc_inode;
-      
-		if (!ioc_inode) {
-			/* this is the first time someone is opening this 
-			   file, assign weight 
-			*/
-			weight = ioc_get_priority (table, path);
- 
-			ioc_inode = ioc_inode_update (table, inode, weight);
-			inode_ctx_put (fd->inode, this, 
-				       (uint64_t)(long)ioc_inode);
-		} else {
-			ioc_table_lock (ioc_inode->table);
-			//{
-			list_move_tail (&ioc_inode->inode_lru,
-					&table->inode_lru[ioc_inode->weight]);
-			//}
-			ioc_table_unlock (ioc_inode->table);
-		}
-
-		//}
-		UNLOCK (&fd->inode->lock);
-
-		/* If mandatory locking has been enabled on this file,
-		   we disable caching on it */
-		if (((inode->st_mode & S_ISGID) && 
-		     !(inode->st_mode & S_IXGRP))) {
-			fd_ctx_set (fd, this, 1);
-		}
-  
-		/* If O_DIRECT open, we disable caching on it */
-		if ((local->flags & O_DIRECT)){
-			/* O_DIRECT is only for one fd, not the inode 
-			 * as a whole 
-			 */
-			fd_ctx_set (fd, this, 1);
-		}
-	}
-
-	FREE (local);
-	frame->local = NULL;
-
-	STACK_UNWIND (frame, op_ret, op_errno, fd);
-
-	return 0;
+    uint64_t tmp_ioc_inode = 0;
+    ioc_local_t *local = NULL;
+    ioc_table_t *table = NULL;
+    ioc_inode_t *ioc_inode = NULL;
+
+    local = frame->local;
+    if (!this || !this->private) {
+        op_ret = -1;
+        op_errno = EINVAL;
+        goto out;
+    }
+
+    table = this->private;
+
+    if (op_ret != -1) {
+        inode_ctx_get(fd->inode, this, &tmp_ioc_inode);
+        ioc_inode = (ioc_inode_t *)(long)tmp_ioc_inode;
+
+        // TODO: see why inode context is NULL and handle it.
+        if (!ioc_inode) {
+            gf_smsg(this->name, GF_LOG_ERROR, EINVAL,
+                    IO_CACHE_MSG_ENFORCEMENT_FAILED, "inode-gfid=%s",
+                    uuid_utoa(fd->inode->gfid), NULL);
+            goto out;
+        }
+
+        ioc_table_lock(ioc_inode->table);
+        {
+            list_move_tail(&ioc_inode->inode_lru,
+                           &table->inode_lru[ioc_inode->weight]);
+        }
+        ioc_table_unlock(ioc_inode->table);
+
+        ioc_inode_lock(ioc_inode);
+        {
+            if ((table->min_file_size > ioc_inode->ia_size) ||
+                ((table->max_file_size > 0) &&
+                 (table->max_file_size < ioc_inode->ia_size))) {
+                fd_ctx_set(fd, this, 1);
+            }
+        }
+        ioc_inode_unlock(ioc_inode);
+
+        /* If O_DIRECT open, we disable caching on it */
+        if ((local->flags & O_DIRECT)) {
+            /* O_DIRECT is only for one fd, not the inode
+             * as a whole
+             */
+            fd_ctx_set(fd, this, 1);
+        }
+    }
+
+out:
+    mem_put(local);
+    frame->local = NULL;
+
+    STACK_UNWIND_STRICT(open, frame, op_ret, op_errno, fd, xdata);
+
+    return 0;
 }
 
 /*
@@ -680,57 +668,173 @@ ioc_open_cbk (call_frame_t *frame,
  *
  */
 int32_t
-ioc_create_cbk (call_frame_t *frame,
-		void *cookie,
-		xlator_t *this,
-		int32_t op_ret,
-		int32_t op_errno,
-		fd_t *fd,
-		inode_t *inode,
-		struct stat *buf)
+ioc_create_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+               int32_t op_ret, int32_t op_errno, fd_t *fd, inode_t *inode,
+               struct iatt *buf, struct iatt *preparent,
+               struct iatt *postparent, dict_t *xdata)
 {
-	ioc_local_t *local = frame->local;
-	ioc_table_t *table = this->private;
-	ioc_inode_t *ioc_inode = NULL;
-	uint32_t weight = 0;
-	const char *path = local->file_loc.path;
-
-	if (op_ret != -1) {
-		{
-			/* assign weight */
-			weight = ioc_get_priority (table, path);
-
-			ioc_inode = ioc_inode_update (table, inode, weight);
-			LOCK (&fd->inode->lock);
-			{
-				inode_ctx_put (fd->inode, this, 
-					       (uint64_t)(long)ioc_inode);
-			}
-			UNLOCK (&fd->inode->lock);
-		}
-		/* If mandatory locking has been enabled on this file,
-		   we disable caching on it */
-		if ((inode->st_mode & S_ISGID) && 
-		    !(inode->st_mode & S_IXGRP)) {
-			fd_ctx_set (fd, this, 1);
-		}
-
-		/* If O_DIRECT open, we disable caching on it */
-		if (local->flags & O_DIRECT){
-			/* O_DIRECT is only for one fd, not the inode 
-			 * as a whole 
-			 */
-			fd_ctx_set (fd, this, 1);
-		}
-    
-	}
-  
-	frame->local = NULL;
-	FREE (local);
-
-	STACK_UNWIND (frame, op_ret, op_errno, fd, inode, buf);
-
-	return 0;
+    ioc_local_t *local = NULL;
+    ioc_table_t *table = NULL;
+    ioc_inode_t *ioc_inode = NULL;
+    uint32_t weight = 0xffffffff;
+    const char *path = NULL;
+    int ret = -1;
+
+    local = frame->local;
+    if (!this || !this->private) {
+        op_ret = -1;
+        op_errno = EINVAL;
+        goto out;
+    }
+
+    table = this->private;
+    path = local->file_loc.path;
+
+    if (op_ret != -1) {
+        /* assign weight */
+        weight = ioc_get_priority(table, path);
+
+        ioc_inode = ioc_inode_create(table, inode, weight);
+
+        ioc_inode_lock(ioc_inode);
+        {
+            ioc_inode->cache.mtime = buf->ia_mtime;
+            ioc_inode->cache.mtime_nsec = buf->ia_mtime_nsec;
+            ioc_inode->ia_size = buf->ia_size;
+
+            if ((table->min_file_size > ioc_inode->ia_size) ||
+                ((table->max_file_size > 0) &&
+                 (table->max_file_size < ioc_inode->ia_size))) {
+                ret = fd_ctx_set(fd, this, 1);
+                if (ret)
+                    gf_smsg(this->name, GF_LOG_WARNING, ENOMEM,
+                            IO_CACHE_MSG_SET_FD_FAILED, "path=%s",
+                            local->file_loc.path, NULL);
+            }
+        }
+        ioc_inode_unlock(ioc_inode);
+
+        inode_ctx_put(fd->inode, this, (uint64_t)(long)ioc_inode);
+
+        /* If O_DIRECT open, we disable caching on it */
+        if (local->flags & O_DIRECT) {
+            /*
+             * O_DIRECT is only for one fd, not the inode
+             * as a whole */
+            ret = fd_ctx_set(fd, this, 1);
+            if (ret)
+                gf_smsg(this->name, GF_LOG_WARNING, ENOMEM,
+                        IO_CACHE_MSG_SET_FD_FAILED, "path=%s",
+                        local->file_loc.path, NULL);
+        }
+
+        /* if weight == 0, we disable caching on it */
+        if (!weight) {
+            /* we allow a pattern-matched cache disable this way */
+            ret = fd_ctx_set(fd, this, 1);
+            if (ret)
+                gf_smsg(this->name, GF_LOG_WARNING, ENOMEM,
+                        IO_CACHE_MSG_SET_FD_FAILED, "path=%s",
+                        local->file_loc.path, NULL);
+        }
+    }
+
+out:
+    frame->local = NULL;
+    mem_put(local);
+
+    STACK_UNWIND_STRICT(create, frame, op_ret, op_errno, fd, inode, buf,
+                        preparent, postparent, xdata);
+
+    return 0;
+}
+
+int32_t
+ioc_mknod_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret,
+              int32_t op_errno, inode_t *inode, struct iatt *buf,
+              struct iatt *preparent, struct iatt *postparent, dict_t *xdata)
+{
+    ioc_local_t *local = NULL;
+    ioc_table_t *table = NULL;
+    ioc_inode_t *ioc_inode = NULL;
+    uint32_t weight = 0xffffffff;
+    const char *path = NULL;
+
+    local = frame->local;
+    if (!this || !this->private) {
+        op_ret = -1;
+        op_errno = EINVAL;
+        goto out;
+    }
+
+    table = this->private;
+    path = local->file_loc.path;
+
+    if (op_ret != -1) {
+        /* assign weight */
+        weight = ioc_get_priority(table, path);
+
+        ioc_inode = ioc_inode_create(table, inode, weight);
+
+        ioc_inode_lock(ioc_inode);
+        {
+            ioc_inode->cache.mtime = buf->ia_mtime;
+            ioc_inode->cache.mtime_nsec = buf->ia_mtime_nsec;
+            ioc_inode->ia_size = buf->ia_size;
+        }
+        ioc_inode_unlock(ioc_inode);
+
+        inode_ctx_put(inode, this, (uint64_t)(long)ioc_inode);
+    }
+
+out:
+    frame->local = NULL;
+
+    loc_wipe(&local->file_loc);
+    mem_put(local);
+
+    STACK_UNWIND_STRICT(mknod, frame, op_ret, op_errno, inode, buf, preparent,
+                        postparent, xdata);
+    return 0;
+}
+
+int
+ioc_mknod(call_frame_t *frame, xlator_t *this, loc_t *loc, mode_t mode,
+          dev_t rdev, mode_t umask, dict_t *xdata)
+{
+    ioc_local_t *local = NULL;
+    int32_t op_errno = -1, ret = -1;
+
+    local = mem_get0(this->local_pool);
+    if (local == NULL) {
+        op_errno = ENOMEM;
+        gf_smsg(this->name, GF_LOG_ERROR, 0, IO_CACHE_MSG_NO_MEMORY, NULL);
+        goto unwind;
+    }
+
+    ret = loc_copy(&local->file_loc, loc);
+    if (ret != 0) {
+        op_errno = ENOMEM;
+        gf_smsg(this->name, GF_LOG_ERROR, 0, IO_CACHE_MSG_NO_MEMORY, NULL);
+        goto unwind;
+    }
+
+    frame->local = local;
+
+    STACK_WIND(frame, ioc_mknod_cbk, FIRST_CHILD(this),
+               FIRST_CHILD(this)->fops->mknod, loc, mode, rdev, umask, xdata);
+    return 0;
+
+unwind:
+    if (local != NULL) {
+        loc_wipe(&local->file_loc);
+        mem_put(local);
+    }
+
+    STACK_UNWIND_STRICT(mknod, frame, -1, op_errno, NULL, NULL, NULL, NULL,
+                        NULL);
+
+    return 0;
 }
 
 /*
@@ -742,36 +846,33 @@ ioc_create_cbk (call_frame_t *frame,
  *
  */
 int32_t
-ioc_open (call_frame_t *frame,
-	  xlator_t *this,
-	  loc_t *loc,
-	  int32_t flags,
-	  fd_t *fd)
+ioc_open(call_frame_t *frame, xlator_t *this, loc_t *loc, int32_t flags,
+         fd_t *fd, dict_t *xdata)
 {
-  
-	ioc_local_t *local = CALLOC (1, sizeof (ioc_local_t));
-	ERR_ABORT (local);
-
-	local->flags = flags;
-	local->file_loc.path = loc->path;
-	local->file_loc.inode = loc->inode;
-  
-	frame->local = local;
-  
-	STACK_WIND (frame,
-		    ioc_open_cbk,
-		    FIRST_CHILD(this),
-		    FIRST_CHILD(this)->fops->open,
-		    loc,
-		    flags,
-		    fd);
-
-	return 0;
+    ioc_local_t *local = NULL;
+
+    local = mem_get0(this->local_pool);
+    if (local == NULL) {
+        gf_smsg(this->name, GF_LOG_ERROR, ENOMEM, IO_CACHE_MSG_NO_MEMORY, NULL);
+        STACK_UNWIND_STRICT(open, frame, -1, ENOMEM, NULL, NULL);
+        return 0;
+    }
+
+    local->flags = flags;
+    local->file_loc.path = loc->path;
+    local->file_loc.inode = loc->inode;
+
+    frame->local = local;
+
+    STACK_WIND(frame, ioc_open_cbk, FIRST_CHILD(this),
+               FIRST_CHILD(this)->fops->open, loc, flags, fd, xdata);
+
+    return 0;
 }
 
 /*
  * ioc_create - create fop for io cache
- * 
+ *
  * @frame:
  * @this:
  * @pathname:
@@ -780,213 +881,218 @@ ioc_open (call_frame_t *frame,
  *
  */
 int32_t
-ioc_create (call_frame_t *frame,
-	    xlator_t *this,
-	    loc_t *loc,
-	    int32_t flags,
-	    mode_t mode,
-	    fd_t *fd)
+ioc_create(call_frame_t *frame, xlator_t *this, loc_t *loc, int32_t flags,
+           mode_t mode, mode_t umask, fd_t *fd, dict_t *xdata)
 {
-	ioc_local_t *local = CALLOC (1, sizeof (ioc_local_t));
-	ERR_ABORT (local);
-
-	local->flags = flags;
-	local->file_loc.path = loc->path;
-	frame->local = local;
-
-	STACK_WIND (frame, ioc_create_cbk,
-		    FIRST_CHILD(this),
-		    FIRST_CHILD(this)->fops->create,
-		    loc, flags, mode, fd);
-	return 0;
-}
+    ioc_local_t *local = NULL;
 
+    local = mem_get0(this->local_pool);
+    if (local == NULL) {
+        gf_smsg(this->name, GF_LOG_ERROR, ENOMEM, IO_CACHE_MSG_NO_MEMORY, NULL);
+        STACK_UNWIND_STRICT(create, frame, -1, ENOMEM, NULL, NULL, NULL, NULL,
+                            NULL, NULL);
+        return 0;
+    }
 
+    local->flags = flags;
+    local->file_loc.path = loc->path;
+    frame->local = local;
 
+    STACK_WIND(frame, ioc_create_cbk, FIRST_CHILD(this),
+               FIRST_CHILD(this)->fops->create, loc, flags, mode, umask, fd,
+               xdata);
+
+    return 0;
+}
 
 /*
  * ioc_release - release fop for io cache
- * 
+ *
  * @frame:
  * @this:
  * @fd:
  *
  */
 int32_t
-ioc_release (xlator_t *this,
-	     fd_t *fd)
-{
-	return 0;
-}
-
-/* 
- * ioc_readv_disabled_cbk 
- * @frame:
- * @cookie:
- * @this:
- * @op_ret:
- * @op_errno:
- * @vector:
- * @count:
- *
- */ 
-int32_t
-ioc_readv_disabled_cbk (call_frame_t *frame, 
-			void *cookie,
-			xlator_t *this,
-			int32_t op_ret,
-			int32_t op_errno,
-			struct iovec *vector,
-			int32_t count,
-			struct stat *stbuf)
+ioc_release(xlator_t *this, fd_t *fd)
 {
-	STACK_UNWIND (frame, op_ret, op_errno, vector, count, stbuf);
-	return 0;
+    return 0;
 }
 
-
 int32_t
-ioc_need_prune (ioc_table_t *table)
+ioc_need_prune(ioc_table_t *table)
 {
-	int64_t cache_difference = 0;
-  
-	ioc_table_lock (table);
-	{
-		cache_difference = table->cache_used - table->cache_size;
-	}
-	ioc_table_unlock (table);
-
-	if (cache_difference > 0)
-		return 1;
-	else 
-		return 0;
+    int64_t cache_difference = 0;
+
+    ioc_table_lock(table);
+    {
+        cache_difference = table->cache_used - table->cache_size;
+    }
+    ioc_table_unlock(table);
+
+    if (cache_difference > 0)
+        return 1;
+    else
+        return 0;
 }
 
 /*
- * dispatch_requests -
- * 
+ * ioc_dispatch_requests -
+ *
  * @frame:
  * @inode:
  *
- * 
+ *
  */
-static void
-dispatch_requests (call_frame_t *frame,
-		   ioc_inode_t *ioc_inode,
-		   fd_t *fd,
-		   off_t offset,
-		   size_t size)
+void
+ioc_dispatch_requests(call_frame_t *frame, ioc_inode_t *ioc_inode, fd_t *fd,
+                      off_t offset, size_t size)
 {
-	ioc_local_t *local = frame->local;
-	ioc_table_t *table = ioc_inode->table;
-	ioc_page_t  *trav = NULL;
-	ioc_waitq_t *waitq = NULL;
-	off_t   rounded_offset = 0;
-	off_t   rounded_end = 0;
-	off_t   trav_offset = 0;
-	int32_t fault = 0;
-	int8_t  need_validate = 0;
-	int8_t  might_need_validate = 0;  /* if a page exists, do we need 
-					    to validate it? */
-
-	rounded_offset = floor (offset, table->page_size);
-	rounded_end = roof (offset + size, table->page_size);
-	trav_offset = rounded_offset;
-
-	/* once a frame does read, it should be waiting on something */
-	local->wait_count++;
-
-	/* Requested region can fall in three different pages,
-	 * 1. Ready - region is already in cache, we just have to serve it.
-	 * 2. In-transit - page fault has been generated on this page, we need
-	 *    to wait till the page is ready
-	 * 3. Fault - page is not in cache, we have to generate a page fault
-	 */
-
-	might_need_validate = ioc_inode_need_revalidate (ioc_inode);
-
-	while (trav_offset < rounded_end) {
-		size_t trav_size = 0;
-		off_t local_offset = 0;
-
-		ioc_inode_lock (ioc_inode);
-		//{
-
-		/* look for requested region in the cache */
-		trav = ioc_page_get (ioc_inode, trav_offset);
-
-		local_offset = max (trav_offset, offset);
-		trav_size = min (((offset+size) - local_offset), 
-				 table->page_size);
-
-		if (!trav) {
-			/* page not in cache, we need to generate page fault */
-			trav = ioc_page_create (ioc_inode, trav_offset);
-			fault = 1;
-			if (!trav) {
-				gf_log (frame->this->name, GF_LOG_CRITICAL,
-					"ioc_page_create returned NULL");
-			}
-		} 
-
-		ioc_wait_on_page (trav, frame, local_offset, trav_size);
-
-		if (trav->ready) {
-			/* page found in cache */
-			if (!might_need_validate) {
-				/* fresh enough */
-				gf_log (frame->this->name, GF_LOG_DEBUG,
-					"cache hit for trav_offset=%"PRId64""
-					"/local_offset=%"PRId64"",
-					trav_offset, local_offset);
-				waitq = ioc_page_wakeup (trav);
-			} else {
-				/* if waitq already exists, fstat revalidate is
-				   already on the way */
-				if (!ioc_inode->waitq) {
-					need_validate = 1;
-				}
-				ioc_wait_on_inode (ioc_inode, trav);
-			}
-		}
-
-		//}
-		ioc_inode_unlock (ioc_inode);
-    
-		ioc_waitq_return (waitq);
-		waitq = NULL;
-
-		if (fault) {
-			fault = 0;
-			/* new page created, increase the table->cache_used */
-			ioc_page_fault (ioc_inode, frame, fd, trav_offset);
-		}
-
-		if (need_validate) {
-			need_validate = 0;
-			gf_log (frame->this->name, GF_LOG_DEBUG,
-				"sending validate request for "
-				"inode(%"PRId64") at offset=%"PRId64"",
-				fd->inode->ino, trav_offset);
-			ioc_cache_validate (frame, ioc_inode, fd, trav);
-		}
-    
-		trav_offset += table->page_size;
-	}
-
-	ioc_frame_return (frame);
-
-	if (ioc_need_prune (ioc_inode->table)) {
-		ioc_prune (ioc_inode->table);
-	}
-
-	return;
+    ioc_local_t *local = NULL;
+    ioc_table_t *table = NULL;
+    ioc_page_t *trav = NULL;
+    ioc_waitq_t *waitq = NULL;
+    off_t rounded_offset = 0;
+    off_t rounded_end = 0;
+    off_t trav_offset = 0;
+    int32_t fault = 0;
+    size_t trav_size = 0;
+    off_t local_offset = 0;
+    int32_t ret = -1;
+    int8_t need_validate = 0;
+    int8_t might_need_validate = 0; /*
+                                     * if a page exists, do we need
+                                     * to validate it?
+                                     */
+    local = frame->local;
+    table = ioc_inode->table;
+
+    rounded_offset = gf_floor(offset, table->page_size);
+    rounded_end = gf_roof(offset + size, table->page_size);
+    trav_offset = rounded_offset;
+
+    /* once a frame does read, it should be waiting on something */
+    local->wait_count++;
+
+    /* Requested region can fall in three different pages,
+     * 1. Ready - region is already in cache, we just have to serve it.
+     * 2. In-transit - page fault has been generated on this page, we need
+     *    to wait till the page is ready
+     * 3. Fault - page is not in cache, we have to generate a page fault
+     */
+
+    might_need_validate = ioc_inode_need_revalidate(ioc_inode);
+
+    while (trav_offset < rounded_end) {
+        ioc_inode_lock(ioc_inode);
+        {
+            /* look for requested region in the cache */
+            trav = __ioc_page_get(ioc_inode, trav_offset);
+
+            local_offset = max(trav_offset, offset);
+            trav_size = min(((offset + size) - local_offset), table->page_size);
+
+            if (!trav) {
+                /* page not in cache, we need to generate page
+                 * fault
+                 */
+                trav = __ioc_page_create(ioc_inode, trav_offset);
+                fault = 1;
+                if (!trav) {
+                    gf_smsg(frame->this->name, GF_LOG_CRITICAL, ENOMEM,
+                            IO_CACHE_MSG_NO_MEMORY, NULL);
+                    local->op_ret = -1;
+                    local->op_errno = ENOMEM;
+                    ioc_inode_unlock(ioc_inode);
+                    goto out;
+                }
+            }
+
+            __ioc_wait_on_page(trav, frame, local_offset, trav_size);
+
+            if (trav->ready) {
+                /* page found in cache */
+                if (!might_need_validate && !ioc_inode->waitq) {
+                    /* fresh enough */
+                    gf_msg_trace(frame->this->name, 0,
+                                 "cache hit for "
+                                 "trav_offset=%" PRId64
+                                 "/local_"
+                                 "offset=%" PRId64 "",
+                                 trav_offset, local_offset);
+                    waitq = __ioc_page_wakeup(trav, trav->op_errno);
+                } else {
+                    /* if waitq already exists, fstat
+                     * revalidate is
+                     * already on the way
+                     */
+                    if (!ioc_inode->waitq) {
+                        need_validate = 1;
+                    }
+
+                    ret = ioc_wait_on_inode(ioc_inode, trav);
+                    if (ret < 0) {
+                        local->op_ret = -1;
+                        local->op_errno = -ret;
+                        need_validate = 0;
+
+                        waitq = __ioc_page_wakeup(trav, trav->op_errno);
+                        ioc_inode_unlock(ioc_inode);
+
+                        ioc_waitq_return(waitq);
+                        waitq = NULL;
+                        goto out;
+                    }
+                }
+            }
+        }
+        ioc_inode_unlock(ioc_inode);
+
+        ioc_waitq_return(waitq);
+        waitq = NULL;
+
+        if (fault) {
+            fault = 0;
+            /* new page created, increase the table->cache_used */
+            ioc_page_fault(ioc_inode, frame, fd, trav_offset);
+        }
+
+        if (need_validate) {
+            need_validate = 0;
+            gf_msg_trace(frame->this->name, 0,
+                         "sending validate request for "
+                         "inode(%s) at offset=%" PRId64 "",
+                         uuid_utoa(fd->inode->gfid), trav_offset);
+            ret = ioc_cache_validate(frame, ioc_inode, fd, trav);
+            if (ret == -1) {
+                ioc_inode_lock(ioc_inode);
+                {
+                    waitq = __ioc_page_wakeup(trav, trav->op_errno);
+                }
+                ioc_inode_unlock(ioc_inode);
+
+                ioc_waitq_return(waitq);
+                waitq = NULL;
+                goto out;
+            }
+        }
+
+        trav_offset += table->page_size;
+    }
+
+out:
+    ioc_frame_return(frame);
+
+    if (ioc_need_prune(ioc_inode->table)) {
+        ioc_prune(ioc_inode->table);
+    }
+
+    return;
 }
 
-
 /*
  * ioc_readv -
- * 
+ *
  * @frame:
  * @this:
  * @fd:
@@ -995,75 +1101,113 @@ dispatch_requests (call_frame_t *frame,
  *
  */
 int32_t
-ioc_readv (call_frame_t *frame,
-	   xlator_t *this,
-	   fd_t *fd,
-	   size_t size,
-	   off_t offset)
+ioc_readv(call_frame_t *frame, xlator_t *this, fd_t *fd, size_t size,
+          off_t offset, uint32_t flags, dict_t *xdata)
 {
-	uint64_t     tmp_ioc_inode = 0;
-	ioc_inode_t *ioc_inode = NULL;
-	ioc_local_t *local = NULL;
-	uint32_t     weight = 0;
-
-	inode_ctx_get (fd->inode, this, &tmp_ioc_inode);
-	ioc_inode = (ioc_inode_t *)(long)tmp_ioc_inode;
-	if (!ioc_inode) {
-		/* caching disabled, go ahead with normal readv */
-		STACK_WIND (frame, 
-			    ioc_readv_disabled_cbk,
-			    FIRST_CHILD (frame->this), 
-			    FIRST_CHILD (frame->this)->fops->readv,
-			    fd, 
-			    size, 
-			    offset);
-		return 0;
-	}
-
-	if (!fd_ctx_get (fd, this, NULL)) {
-		/* disable caching for this fd, go ahead with normal readv */
-		STACK_WIND (frame, 
-			    ioc_readv_disabled_cbk,
-			    FIRST_CHILD (frame->this), 
-			    FIRST_CHILD (frame->this)->fops->readv,
-			    fd, 
-			    size, 
-			    offset);
-		return 0;
-	}
-
-	local = (ioc_local_t *) CALLOC (1, sizeof (ioc_local_t));
-	ERR_ABORT (local);
-	INIT_LIST_HEAD (&local->fill_list);
-
-	frame->local = local;  
-	local->pending_offset = offset;
-	local->pending_size = size;
-	local->offset = offset;
-	local->size = size;
-	local->inode = ioc_inode;
-
-	gf_log (this->name, GF_LOG_DEBUG,
-		"NEW REQ (%p) offset = %"PRId64" && size = %"GF_PRI_SIZET"", 
-		frame, offset, size);
-
-	weight = ioc_inode->weight;
-
-	ioc_table_lock (ioc_inode->table);
-	{
-		list_move_tail (&ioc_inode->inode_lru, 
-				&ioc_inode->table->inode_lru[weight]);
-	}
-	ioc_table_unlock (ioc_inode->table);
-
-	dispatch_requests (frame, ioc_inode, fd, offset, size);
-  
-	return 0;
+    uint64_t tmp_ioc_inode = 0;
+    ioc_inode_t *ioc_inode = NULL;
+    ioc_local_t *local = NULL;
+    uint32_t weight = 0;
+    ioc_table_t *table = NULL;
+    int32_t op_errno = EINVAL;
+
+    if (!this) {
+        goto out;
+    }
+
+    inode_ctx_get(fd->inode, this, &tmp_ioc_inode);
+    ioc_inode = (ioc_inode_t *)(long)tmp_ioc_inode;
+    if (!ioc_inode) {
+        /* caching disabled, go ahead with normal readv */
+        STACK_WIND_TAIL(frame, FIRST_CHILD(this),
+                        FIRST_CHILD(this)->fops->readv, fd, size, offset, flags,
+                        xdata);
+        return 0;
+    }
+
+    if (flags & O_DIRECT) {
+        /* disable caching for this fd, if O_DIRECT is used */
+        STACK_WIND_TAIL(frame, FIRST_CHILD(this),
+                        FIRST_CHILD(this)->fops->readv, fd, size, offset, flags,
+                        xdata);
+        return 0;
+    }
+
+    table = this->private;
+
+    if (!table) {
+        gf_smsg(this->name, GF_LOG_ERROR, EINVAL, IO_CACHE_MSG_TABLE_NULL,
+                NULL);
+        op_errno = EINVAL;
+        goto out;
+    }
+
+    ioc_inode_lock(ioc_inode);
+    {
+        if (!ioc_inode->cache.page_table) {
+            ioc_inode->cache.page_table = rbthash_table_init(
+                this->ctx, IOC_PAGE_TABLE_BUCKET_COUNT, ioc_hashfn, NULL, 0,
+                table->mem_pool);
+
+            if (ioc_inode->cache.page_table == NULL) {
+                op_errno = ENOMEM;
+                ioc_inode_unlock(ioc_inode);
+                goto out;
+            }
+        }
+    }
+    ioc_inode_unlock(ioc_inode);
+
+    if (!fd_ctx_get(fd, this, NULL)) {
+        /* disable caching for this fd, go ahead with normal readv */
+        STACK_WIND_TAIL(frame, FIRST_CHILD(this),
+                        FIRST_CHILD(this)->fops->readv, fd, size, offset, flags,
+                        xdata);
+        return 0;
+    }
+
+    local = mem_get0(this->local_pool);
+    if (local == NULL) {
+        gf_smsg(this->name, GF_LOG_ERROR, ENOMEM, IO_CACHE_MSG_NO_MEMORY, NULL);
+        op_errno = ENOMEM;
+        goto out;
+    }
+
+    INIT_LIST_HEAD(&local->fill_list);
+
+    frame->local = local;
+    local->pending_offset = offset;
+    local->pending_size = size;
+    local->offset = offset;
+    local->size = size;
+    local->inode = ioc_inode;
+    local->xattr_req = dict_ref(xdata);
+
+    gf_msg_trace(this->name, 0,
+                 "NEW REQ (%p) offset "
+                 "= %" PRId64 " && size = %" GF_PRI_SIZET "",
+                 frame, offset, size);
+
+    weight = ioc_inode->weight;
+
+    ioc_table_lock(ioc_inode->table);
+    {
+        list_move_tail(&ioc_inode->inode_lru,
+                       &ioc_inode->table->inode_lru[weight]);
+    }
+    ioc_table_unlock(ioc_inode->table);
+
+    ioc_dispatch_requests(frame, ioc_inode, fd, offset, size);
+    return 0;
+
+out:
+    STACK_UNWIND_STRICT(readv, frame, -1, op_errno, NULL, 0, NULL, NULL, NULL);
+    return 0;
 }
 
 /*
  * ioc_writev_cbk -
- * 
+ *
  * @frame:
  * @cookie:
  * @this:
@@ -1072,28 +1216,36 @@ ioc_readv (call_frame_t *frame,
  *
  */
 int32_t
-ioc_writev_cbk (call_frame_t *frame,
-		void *cookie,
-		xlator_t *this,
-		int32_t op_ret,
-		int32_t op_errno,
-		struct stat *stbuf)
+ioc_writev_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+               int32_t op_ret, int32_t op_errno, struct iatt *prebuf,
+               struct iatt *postbuf, dict_t *xdata)
 {
-	ioc_local_t *local     = frame->local;
-	uint64_t     ioc_inode = 0;
-
-	inode_ctx_get (local->fd->inode, this, &ioc_inode);
-  
-	if (ioc_inode)
-		ioc_inode_flush ((ioc_inode_t *)(long)ioc_inode);
-
-	STACK_UNWIND (frame, op_ret, op_errno, stbuf);
-	return 0;
+    ioc_local_t *local = NULL;
+    uint64_t ioc_inode = 0;
+
+    local = frame->local;
+    frame->local = NULL;
+    inode_ctx_get(local->fd->inode, this, &ioc_inode);
+
+    if (op_ret >= 0) {
+        ioc_update_pages(frame, (ioc_inode_t *)(long)ioc_inode, local->vector,
+                         local->op_ret, op_ret, local->offset);
+    }
+
+    STACK_UNWIND_STRICT(writev, frame, op_ret, op_errno, prebuf, postbuf,
+                        xdata);
+    if (local->iobref) {
+        iobref_unref(local->iobref);
+        GF_FREE(local->vector);
+    }
+
+    mem_put(local);
+    return 0;
 }
 
 /*
  * ioc_writev
- * 
+ *
  * @frame:
  * @this:
  * @fd:
@@ -1103,42 +1255,43 @@ ioc_writev_cbk (call_frame_t *frame,
  *
  */
 int32_t
-ioc_writev (call_frame_t *frame,
-	    xlator_t *this,
-	    fd_t *fd,
-	    struct iovec *vector,
-	    int32_t count,
-	    off_t offset)
+ioc_writev(call_frame_t *frame, xlator_t *this, fd_t *fd, struct iovec *vector,
+           int32_t count, off_t offset, uint32_t flags, struct iobref *iobref,
+           dict_t *xdata)
 {
-	ioc_local_t *local     = NULL;
-	uint64_t     ioc_inode = 0;
-	
-	local = CALLOC (1, sizeof (ioc_local_t));
-	ERR_ABORT (local);
-
-	/* TODO: why is it not fd_ref'ed */
-	local->fd = fd;
-	frame->local = local;
-
-	inode_ctx_get (fd->inode, this, &ioc_inode);
-	if (ioc_inode)
-		ioc_inode_flush ((ioc_inode_t *)(long)ioc_inode);
-
-	STACK_WIND (frame,
-		    ioc_writev_cbk,
-		    FIRST_CHILD(this),
-		    FIRST_CHILD(this)->fops->writev,
-		    fd,
-		    vector,
-		    count,
-		    offset);
-
-	return 0;
+    ioc_local_t *local = NULL;
+    uint64_t ioc_inode = 0;
+
+    local = mem_get0(this->local_pool);
+    if (local == NULL) {
+        gf_smsg(this->name, GF_LOG_ERROR, ENOMEM, IO_CACHE_MSG_NO_MEMORY, NULL);
+
+        STACK_UNWIND_STRICT(writev, frame, -1, ENOMEM, NULL, NULL, NULL);
+        return 0;
+    }
+
+    /* TODO: why is it not fd_ref'ed */
+    local->fd = fd;
+    frame->local = local;
+
+    inode_ctx_get(fd->inode, this, &ioc_inode);
+    if (ioc_inode) {
+        local->iobref = iobref_ref(iobref);
+        local->vector = iov_dup(vector, count);
+        local->op_ret = count;
+        local->offset = offset;
+    }
+
+    STACK_WIND(frame, ioc_writev_cbk, FIRST_CHILD(this),
+               FIRST_CHILD(this)->fops->writev, fd, vector, count, offset,
+               flags, iobref, xdata);
+
+    return 0;
 }
 
 /*
  * ioc_truncate_cbk -
- * 
+ *
  * @frame:
  * @cookie:
  * @this:
@@ -1147,52 +1300,65 @@ ioc_writev (call_frame_t *frame,
  * @buf:
  *
  */
-int32_t 
-ioc_truncate_cbk (call_frame_t *frame,
-		  void *cookie,
-		  xlator_t *this,
-		  int32_t op_ret,
-		  int32_t op_errno,
-		  struct stat *buf)
+int32_t
+ioc_truncate_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+                 int32_t op_ret, int32_t op_errno, struct iatt *prebuf,
+                 struct iatt *postbuf, dict_t *xdata)
 {
+    STACK_UNWIND_STRICT(truncate, frame, op_ret, op_errno, prebuf, postbuf,
+                        xdata);
+    return 0;
+}
 
-	STACK_UNWIND (frame, op_ret, op_errno, buf);
-	return 0;
+/*
+ * ioc_ftruncate_cbk -
+ *
+ * @frame:
+ * @cookie:
+ * @this:
+ * @op_ret:
+ * @op_errno:
+ * @buf:
+ *
+ */
+int32_t
+ioc_ftruncate_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+                  int32_t op_ret, int32_t op_errno, struct iatt *prebuf,
+                  struct iatt *postbuf, dict_t *xdata)
+{
+    STACK_UNWIND_STRICT(ftruncate, frame, op_ret, op_errno, prebuf, postbuf,
+                        xdata);
+    return 0;
 }
 
 /*
  * ioc_truncate -
- * 
+ *
  * @frame:
  * @this:
  * @loc:
  * @offset:
  *
  */
-int32_t 
-ioc_truncate (call_frame_t *frame,
-	      xlator_t *this,
-	      loc_t *loc,
-	      off_t offset)
+int32_t
+ioc_truncate(call_frame_t *frame, xlator_t *this, loc_t *loc, off_t offset,
+             dict_t *xdata)
 {
-	uint64_t ioc_inode = 0;
-	inode_ctx_get (loc->inode, this, &ioc_inode);
-
-	if (ioc_inode)
-		ioc_inode_flush ((ioc_inode_t *)(long)ioc_inode);
-
-	STACK_WIND (frame,
-		    ioc_truncate_cbk,
-		    FIRST_CHILD(this),
-		    FIRST_CHILD(this)->fops->truncate,
-		    loc,
-		    offset);
-	return 0;
+    uint64_t ioc_inode = 0;
+
+    inode_ctx_get(loc->inode, this, &ioc_inode);
+
+    if (ioc_inode)
+        ioc_inode_flush((ioc_inode_t *)(long)ioc_inode);
+
+    STACK_WIND(frame, ioc_truncate_cbk, FIRST_CHILD(this),
+               FIRST_CHILD(this)->fops->truncate, loc, offset, xdata);
+    return 0;
 }
 
 /*
  * ioc_ftruncate -
- * 
+ *
  * @frame:
  * @this:
  * @fd:
@@ -1200,279 +1366,866 @@ ioc_truncate (call_frame_t *frame,
  *
  */
 int32_t
-ioc_ftruncate (call_frame_t *frame,
-	       xlator_t *this,
-	       fd_t *fd,
-	       off_t offset)
+ioc_ftruncate(call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset,
+              dict_t *xdata)
+{
+    uint64_t ioc_inode = 0;
+
+    inode_ctx_get(fd->inode, this, &ioc_inode);
+
+    if (ioc_inode)
+        ioc_inode_flush((ioc_inode_t *)(long)ioc_inode);
+
+    STACK_WIND(frame, ioc_ftruncate_cbk, FIRST_CHILD(this),
+               FIRST_CHILD(this)->fops->ftruncate, fd, offset, xdata);
+    return 0;
+}
+
+int32_t
+ioc_lk_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret,
+           int32_t op_errno, struct gf_flock *lock, dict_t *xdata)
 {
-	uint64_t ioc_inode = 0;
-	inode_ctx_get (fd->inode, this, &ioc_inode);
-
-	if (ioc_inode)
-		ioc_inode_flush ((ioc_inode_t *)(long)ioc_inode);
-
-	STACK_WIND (frame,
-		    ioc_truncate_cbk,
-		    FIRST_CHILD(this),
-		    FIRST_CHILD(this)->fops->ftruncate,
-		    fd,
-		    offset);
-	return 0;
+    STACK_UNWIND_STRICT(lk, frame, op_ret, op_errno, lock, xdata);
+    return 0;
 }
 
 int32_t
-ioc_lk_cbk (call_frame_t *frame,
-	    void *cookie,
-	    xlator_t *this,
-	    int32_t op_ret,
-	    int32_t op_errno,
-	    struct flock *lock)
+ioc_lk(call_frame_t *frame, xlator_t *this, fd_t *fd, int32_t cmd,
+       struct gf_flock *lock, dict_t *xdata)
 {
-	STACK_UNWIND (frame, op_ret, op_errno, lock);
-	return 0;
+    ioc_inode_t *ioc_inode = NULL;
+    uint64_t tmp_inode = 0;
+
+    inode_ctx_get(fd->inode, this, &tmp_inode);
+    ioc_inode = (ioc_inode_t *)(long)tmp_inode;
+    if (!ioc_inode) {
+        gf_msg_debug(this->name, EBADFD,
+                     "inode context is NULL: returning EBADFD");
+        STACK_UNWIND_STRICT(lk, frame, -1, EBADFD, NULL, NULL);
+        return 0;
+    }
+
+    ioc_inode_lock(ioc_inode);
+    {
+        ioc_inode->cache.last_revalidate = gf_time();
+    }
+    ioc_inode_unlock(ioc_inode);
+
+    STACK_WIND(frame, ioc_lk_cbk, FIRST_CHILD(this),
+               FIRST_CHILD(this)->fops->lk, fd, cmd, lock, xdata);
+
+    return 0;
+}
+
+int
+ioc_readdirp_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int op_ret,
+                 int op_errno, gf_dirent_t *entries, dict_t *xdata)
+{
+    gf_dirent_t *entry = NULL;
+    char *path = NULL;
+    fd_t *fd = NULL;
+    /* ioc_readdirp() parked the directory fd in frame->local; take it back. */
+    fd = frame->local;
+    frame->local = NULL;
+    /* On error or an empty listing, pass the reply through untouched. */
+    if (op_ret <= 0)
+        goto unwind;
+    /* NOTE(review): inode_path() result is unchecked - confirm NULL path ok. */
+    list_for_each_entry(entry, &entries->list, list)
+    {
+        inode_path(fd->inode, entry->d_name, &path);
+        ioc_inode_update(this, entry->inode, path, &entry->d_stat);
+        GF_FREE(path);
+        path = NULL;
+    }
+
+unwind:
+    STACK_UNWIND_STRICT(readdirp, frame, op_ret, op_errno, entries, xdata);
+
+    return 0;
 }
 
-int32_t 
-ioc_lk (call_frame_t *frame,
-	xlator_t *this,
-	fd_t *fd,
-	int32_t cmd,
-	struct flock *lock)
+int
+ioc_readdirp(call_frame_t *frame, xlator_t *this, fd_t *fd, size_t size,
+             off_t offset, dict_t *dict)
+{
+    /* ioc_readdirp_cbk() picks fd up from frame->local to build entry paths. */
+    frame->local = fd;
+    STACK_WIND(frame, ioc_readdirp_cbk, FIRST_CHILD(this),
+               FIRST_CHILD(this)->fops->readdirp, fd, size, offset, dict);
+
+    return 0;
+}
+
+static int32_t
+ioc_discard_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+                int32_t op_ret, int32_t op_errno, struct iatt *pre,
+                struct iatt *post, dict_t *xdata)
+{
+    STACK_UNWIND_STRICT(discard, frame, op_ret, op_errno, pre, post, xdata);
+    return 0; /* reply forwarded as received; no cache state is touched here */
+}
+
+static int32_t
+ioc_discard(call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset,
+            size_t len, dict_t *xdata)
+{
+    uint64_t ctx_value = 0;
+
+    /* Flush the io-cache inode, if this inode carries one, before winding. */
+    inode_ctx_get(fd->inode, this, &ctx_value);
+    if (ctx_value != 0)
+        ioc_inode_flush((ioc_inode_t *)(long)ctx_value);
+
+    STACK_WIND(frame, ioc_discard_cbk, FIRST_CHILD(this),
+               FIRST_CHILD(this)->fops->discard, fd, offset, len, xdata);
+    return 0;
+}
+
+static int32_t
+ioc_zerofill_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+                 int32_t op_ret, int32_t op_errno, struct iatt *pre,
+                 struct iatt *post, dict_t *xdata)
 {
-	ioc_inode_t *ioc_inode = NULL;
-	uint64_t     tmp_inode = 0;
-
-	inode_ctx_get (fd->inode, this, &tmp_inode);
-	ioc_inode = (ioc_inode_t *)(long)tmp_inode;
-	if (!ioc_inode) {
-		gf_log (this->name, GF_LOG_ERROR,
-			"inode context is NULL: returning EBADFD");
-		STACK_UNWIND (frame, -1, EBADFD, NULL);
-		return 0;
-	}
-
-	ioc_inode_lock (ioc_inode);
-	{
-		gettimeofday (&ioc_inode->tv, NULL);
-	}
-	ioc_inode_unlock (ioc_inode);
-
-	STACK_WIND (frame, ioc_lk_cbk, 
-		    FIRST_CHILD (this),
-		    FIRST_CHILD (this)->fops->lk, fd, cmd, lock);
-	return 0;
+    STACK_UNWIND_STRICT(zerofill, frame, op_ret, op_errno, pre, post, xdata);
+    return 0;
+}
+
+static int32_t
+ioc_zerofill(call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset,
+             off_t len, dict_t *xdata)
+{
+    uint64_t ctx_value = 0;
+
+    /* Flush the io-cache inode, if this inode carries one, before winding. */
+    inode_ctx_get(fd->inode, this, &ctx_value);
+    if (ctx_value != 0)
+        ioc_inode_flush((ioc_inode_t *)(long)ctx_value);
+
+    STACK_WIND(frame, ioc_zerofill_cbk, FIRST_CHILD(this),
+               FIRST_CHILD(this)->fops->zerofill, fd, offset, len, xdata);
+    return 0;
 }
 
 int32_t
-ioc_get_priority_list (const char *opt_str, struct list_head *first)
+ioc_get_priority_list(const char *opt_str, struct list_head *first)
 {
-	int32_t max_pri = 0;
-	char *tmp_str = NULL;
-	char *tmp_str1 = NULL;
-	char *tmp_str2 = NULL;
-	char *dup_str = NULL;
-	char *stripe_str = NULL;
-	char *pattern = NULL;
-	char *priority = NULL;
-	char *string = strdup (opt_str);
-	struct ioc_priority *curr = NULL;
-
-	/* Get the pattern for cache priority. 
-	 * "option priority *.jpg:1,abc*:2" etc 
-	 */
-	/* TODO: inode_lru in table is statically hard-coded to 5, 
-	 * should be changed to run-time configuration 
-	 */
-	stripe_str = strtok_r (string, ",", &tmp_str);
-	while (stripe_str) {
-		curr = CALLOC (1, sizeof (struct ioc_priority));
-		ERR_ABORT (curr);
-		list_add_tail (&curr->list, first);
-
-		dup_str = strdup (stripe_str);
-		pattern = strtok_r (dup_str, ":", &tmp_str1);
-		if (!pattern)
-			return -1;
-		priority = strtok_r (NULL, ":", &tmp_str1);
-		if (!priority)
-			return -1;
-		gf_log ("io-cache", 
-			GF_LOG_DEBUG, 
-			"ioc priority : pattern %s : priority %s", 
-			pattern,
-			priority);
-		curr->pattern = strdup (pattern);
-		curr->priority = strtol (priority, &tmp_str2, 0);
-		if (tmp_str2 && (*tmp_str2))
-			return -1;
-		else
-			max_pri = max (max_pri, curr->priority);
-		stripe_str = strtok_r (NULL, ",", &tmp_str);
-	}
-
-	return max_pri;
+    int32_t max_pri = 1;
+    char *tmp_str = NULL;
+    char *tmp_str1 = NULL;
+    char *tmp_str2 = NULL;
+    char *dup_str = NULL;
+    char *stripe_str = NULL;
+    char *pattern = NULL;
+    char *priority = NULL;
+    char *string = NULL;
+    struct ioc_priority *curr = NULL, *tmp = NULL;
+
+    string = gf_strdup(opt_str);
+    if (string == NULL) {
+        max_pri = -1;
+        goto out;
+    }
+
+    /* Get the pattern for cache priority.
+     * "option priority *.jpg:1,abc*:2" etc
+     */
+    /* TODO: inode_lru in table is statically hard-coded to 5,
+     * should be changed to run-time configuration
+     */
+    stripe_str = strtok_r(string, ",", &tmp_str);
+    while (stripe_str) {
+        curr = GF_CALLOC(1, sizeof(struct ioc_priority),
+                         gf_ioc_mt_ioc_priority);
+        if (curr == NULL) {
+            max_pri = -1;
+            goto out;
+        }
+
+        list_add_tail(&curr->list, first);
+
+        dup_str = gf_strdup(stripe_str);
+        if (dup_str == NULL) {
+            max_pri = -1;
+            goto out;
+        }
+
+        pattern = strtok_r(dup_str, ":", &tmp_str1);
+        if (!pattern) {
+            max_pri = -1;
+            goto out;
+        }
+
+        priority = strtok_r(NULL, ":", &tmp_str1);
+        if (!priority) {
+            max_pri = -1;
+            goto out;
+        }
+
+        gf_msg_trace("io-cache", 0, "ioc priority : pattern %s : priority %s",
+                     pattern, priority);
+
+        curr->pattern = gf_strdup(pattern);
+        if (curr->pattern == NULL) {
+            max_pri = -1;
+            goto out;
+        }
+
+        curr->priority = strtol(priority, &tmp_str2, 0);
+        if (tmp_str2 && (*tmp_str2)) {
+            max_pri = -1;
+            goto out;
+        } else {
+            max_pri = max(max_pri, curr->priority);
+        }
+
+        GF_FREE(dup_str);
+        dup_str = NULL;
+
+        stripe_str = strtok_r(NULL, ",", &tmp_str);
+    }
+out:
+    GF_FREE(string);
+
+    GF_FREE(dup_str);
+
+    if (max_pri == -1) {
+        list_for_each_entry_safe(curr, tmp, first, list)
+        {
+            list_del_init(&curr->list);
+            GF_FREE(curr->pattern);
+            GF_FREE(curr);
+        }
+    }
+
+    return max_pri;
+}
+
+/* Set up memory accounting for this xlator via xlator_mem_acct_init().
+ * Returns -1 when this is NULL, otherwise the result of that call. */
+int32_t
+mem_acct_init(xlator_t *this)
+{
+    int status = -1;
+
+    if (this == NULL)
+        return -1;
+
+    status = xlator_mem_acct_init(this, gf_ioc_mt_end + 1);
+    if (status != 0) {
+        gf_smsg(this->name, GF_LOG_ERROR, ENOMEM,
+                IO_CACHE_MSG_MEMORY_INIT_FAILED, NULL);
+    }
+
+    return status;
+}
+
+/* Return _gf_true when cache_size does not exceed the allowed maximum: the
+ * value of get_mem_size(), or the "cache-size" option's max if that is -1. */
+static gf_boolean_t
+check_cache_size_ok(xlator_t *this, uint64_t cache_size)
+{
+    uint64_t mem_total = 0;
+    uint64_t limit = 0;
+    volume_option_t *cache_opt = NULL;
+
+    GF_ASSERT(this);
+    cache_opt = xlator_volume_option_get(this, "cache-size");
+    if (cache_opt == NULL) {
+        gf_smsg(this->name, GF_LOG_ERROR, EINVAL,
+                IO_CACHE_MSG_NO_CACHE_SIZE_OPT, NULL);
+        return _gf_false;
+    }
+
+    mem_total = get_mem_size();
+    /* mem_total is unsigned, so -1 here stands for the all-ones value. */
+    if (-1 == mem_total)
+        limit = cache_opt->max;
+    else
+        limit = mem_total;
+
+    gf_msg_debug(this->name, 0, "Max cache size is %" PRIu64, limit);
+
+    if (cache_size > limit) {
+        gf_smsg(this->name, GF_LOG_ERROR, 0, IO_CACHE_MSG_INVALID_ARGUMENT,
+                "Cache-size=%" PRIu64, cache_size, "max-size=%" PRIu64,
+                limit, NULL);
+        return _gf_false;
+    }
+
+    return _gf_true;
+}
+
+/* Apply changed volume options to the live table while holding the table
+ * lock. Returns 0 on success and -1 otherwise. */
+int
+reconfigure(xlator_t *this, dict_t *options)
+{
+    data_t *data = NULL;
+    ioc_table_t *table = NULL;
+    int ret = -1;
+    int32_t max_pri = 0;
+    uint64_t cache_size_new = 0;
+    if (!this || !this->private)
+        goto out;
+
+    table = this->private;
+
+    ioc_table_lock(table);
+    {
+        GF_OPTION_RECONF("pass-through", this->pass_through, options, bool,
+                         unlock);
+
+        GF_OPTION_RECONF("cache-timeout", table->cache_timeout, options, int32,
+                         unlock);
+
+        data = dict_get(options, "priority");
+        if (data) {
+            char *option_list = data_to_str(data);
+
+            gf_msg_trace(this->name, 0, "option path %s", option_list);
+            /* parse into a local so a bad list cannot leave max_pri at -1 */
+            max_pri = ioc_get_priority_list(option_list,
+                                            &table->priority_list);
+            if (max_pri == -1)
+                goto unlock;
+            table->max_pri = max_pri + 1;
+        }
+
+        GF_OPTION_RECONF("max-file-size", table->max_file_size, options,
+                         size_uint64, unlock);
+
+        GF_OPTION_RECONF("min-file-size", table->min_file_size, options,
+                         size_uint64, unlock);
+
+        if ((table->max_file_size <= UINT64_MAX) &&
+            (table->min_file_size > table->max_file_size)) {
+            gf_smsg(this->name, GF_LOG_ERROR, 0, IO_CACHE_MSG_DEFAULTING_TO_OLD,
+                    "minimum-size=%" PRIu64, table->min_file_size,
+                    "maximum-size=%" PRIu64, table->max_file_size, NULL);
+            goto unlock;
+        }
+
+        GF_OPTION_RECONF("cache-size", cache_size_new, options, size_uint64,
+                         unlock);
+        if (!check_cache_size_ok(this, cache_size_new)) {
+            gf_smsg(this->name, GF_LOG_ERROR, 0,
+                    IO_CACHE_MSG_NOT_RECONFIG_CACHE_SIZE, NULL);
+            goto unlock;
+        }
+        table->cache_size = cache_size_new;
+
+        ret = 0;
+    }
+unlock:
+    ioc_table_unlock(table);
+out:
+    return ret;
 }
 
 /*
- * init - 
+ * init -
  * @this:
  *
  */
-int32_t 
-init (xlator_t *this)
+/* Build the io-cache table from the volume options and publish it as
+ * this->private. Returns 0 on success and -1 on any failure. */
+int32_t
+init(xlator_t *this)
+{
+    ioc_table_t *table = NULL;
+    dict_t *xl_options = NULL;
+    uint32_t index = 0;
+    int32_t ret = -1;
+    data_t *data = 0;
+    uint32_t num_pages = 0;
+
+    xl_options = this->options;
+
+    if (!this->children || this->children->next) {
+        gf_smsg(this->name, GF_LOG_ERROR, 0,
+                IO_CACHE_MSG_XLATOR_CHILD_MISCONFIGURED, NULL);
+        goto out;
+    }
+
+    if (!this->parents) {
+        gf_smsg(this->name, GF_LOG_WARNING, 0, IO_CACHE_MSG_VOL_MISCONFIGURED,
+                NULL);
+    }
+
+    table = (void *)GF_CALLOC(1, sizeof(*table), gf_ioc_mt_ioc_table_t);
+    if (table == NULL) {
+        gf_smsg(this->name, GF_LOG_ERROR, ENOMEM, IO_CACHE_MSG_NO_MEMORY, NULL);
+        goto out;
+    }
+
+    table->xl = this;
+    table->page_size = this->ctx->page_size;
+
+    GF_OPTION_INIT("pass-through", this->pass_through, bool, out);
+
+    GF_OPTION_INIT("cache-size", table->cache_size, size_uint64, out);
+
+    GF_OPTION_INIT("cache-timeout", table->cache_timeout, int32, out);
+
+    GF_OPTION_INIT("min-file-size", table->min_file_size, size_uint64, out);
+
+    GF_OPTION_INIT("max-file-size", table->max_file_size, size_uint64, out);
+
+    /* ret is still -1 here, so every "goto out" below reports failure. */
+    if (!check_cache_size_ok(this, table->cache_size))
+        goto out;
+
+    INIT_LIST_HEAD(&table->priority_list);
+    table->max_pri = 1;
+    data = dict_get(xl_options, "priority");
+    if (data) {
+        char *option_list = data_to_str(data);
+        gf_msg_trace(this->name, 0, "option path %s", option_list);
+        /* parse the list of pattern:priority */
+        table->max_pri = ioc_get_priority_list(option_list,
+                                               &table->priority_list);
+
+        if (table->max_pri == -1) {
+            goto out;
+        }
+    }
+    table->max_pri++;
+
+    INIT_LIST_HEAD(&table->inodes);
+
+    if ((table->max_file_size <= UINT64_MAX) &&
+        (table->min_file_size > table->max_file_size)) {
+        gf_smsg("io-cache", GF_LOG_ERROR, 0, IO_CACHE_MSG_INVALID_ARGUMENT,
+                "minimum-size=%" PRIu64, table->min_file_size,
+                "maximum-size=%" PRIu64, table->max_file_size, NULL);
+        goto out;
+    }
+
+    table->inode_lru = GF_CALLOC(table->max_pri, sizeof(struct list_head),
+                                 gf_ioc_mt_list_head);
+    if (table->inode_lru == NULL) {
+        goto out;
+    }
+
+    for (index = 0; index < (table->max_pri); index++)
+        INIT_LIST_HEAD(&table->inode_lru[index]);
+
+    this->local_pool = mem_pool_new(ioc_local_t, 64);
+    if (!this->local_pool) {
+        gf_smsg(this->name, GF_LOG_ERROR, ENOMEM,
+                IO_CACHE_MSG_CREATE_MEM_POOL_FAILED, NULL);
+        goto out;
+    }
+
+    pthread_mutex_init(&table->table_lock, NULL);
+
+    num_pages = (table->cache_size / table->page_size) +
+                ((table->cache_size % table->page_size) ? 1 : 0);
+
+    table->mem_pool = mem_pool_new(rbthash_entry_t, num_pages);
+    if (!table->mem_pool) {
+        gf_smsg(this->name, GF_LOG_ERROR, ENOMEM,
+                IO_CACHE_MSG_ALLOC_MEM_POOL_FAILED, NULL);
+        pthread_mutex_destroy(&table->table_lock);
+        goto out;
+    }
+
+    ioc_log2_page_size = log_base2(this->ctx->page_size);
+
+    /* Publish the table only once nothing can fail any more: the error path
+     * below frees it, and fini() dereferences whatever this->private holds.
+     */
+    this->private = table;
+    ret = 0;
+
+out:
+    if ((ret == -1) && (table != NULL)) {
+        GF_FREE(table->inode_lru);
+        GF_FREE(table);
+    }
+
+    return ret;
+}
+
+void
+ioc_page_waitq_dump(ioc_page_t *page, char *prefix)
 {
-	ioc_table_t *table;
-	dict_t *options = this->options;
-	uint32_t index = 0;
-	char *page_size_string = NULL;
-	char *cache_size_string = NULL;
-
-	if (!this->children || this->children->next) {
-		gf_log (this->name, GF_LOG_ERROR,
-			"FATAL: io-cache not configured with exactly "
-			"one child");
-		return -1;
-	}
-
-	if (!this->parents) {
-		gf_log (this->name, GF_LOG_WARNING,
-			"dangling volume. check volfile ");
-	}
-
-	table = (void *) CALLOC (1, sizeof (*table));
-	ERR_ABORT (table);
-  
-	table->xl = this;
-	table->page_size = IOC_PAGE_SIZE;
-	table->cache_size = IOC_CACHE_SIZE;
-
-	if (dict_get (options, "page-size"))
-		page_size_string = data_to_str (dict_get (options, 
-							  "page-size"));
-
-	if (page_size_string) {
-		if (gf_string2bytesize (page_size_string, 
-					&table->page_size) != 0) {
-			gf_log ("io-cache", GF_LOG_ERROR, 
-				"invalid number format \"%s\" of "
-				"\"option page-size\"", 
-				page_size_string);
-			return -1;
-		}
-		gf_log (this->name, GF_LOG_DEBUG, 
-			"using page-size %"PRIu64"",  table->page_size);
-	}
-  
-	if (dict_get (options, "cache-size"))
-		cache_size_string = data_to_str (dict_get (options, 
-							   "cache-size"));
-	if (cache_size_string) {
-		if (gf_string2bytesize (cache_size_string, 
-					&table->cache_size) != 0) {
-			gf_log ("io-cache", GF_LOG_ERROR, 
-				"invalid number format \"%s\" of "
-				"\"option cache-size\"", 
-				cache_size_string);
-			return -1;
-		}
-      
-		gf_log (this->name, GF_LOG_DEBUG, 
-			"using cache-size %"PRIu64"", table->cache_size);
-	}
-  
-	table->cache_timeout = 1;
-
-	if (dict_get (options, "cache-timeout")) {
-		table->cache_timeout = 
-			data_to_uint32 (dict_get (options,
-						  "cache-timeout"));
-		gf_log (this->name, GF_LOG_DEBUG,
-			"Using %d seconds to revalidate cache",
-			table->cache_timeout);
-	}
-
-	INIT_LIST_HEAD (&table->priority_list);
-	if (dict_get (options, "priority")) {
-		char *option_list = data_to_str (dict_get (options, 
-							   "priority"));
-		gf_log (this->name, GF_LOG_DEBUG,
-			"option path %s", option_list);
-		/* parse the list of pattern:priority */
-		table->max_pri = ioc_get_priority_list (option_list, 
-							&table->priority_list);
-    
-		if (table->max_pri == -1)
-			return -1;
-	}
-	table->max_pri ++;
-	INIT_LIST_HEAD (&table->inodes);
-  
-	table->inode_lru = CALLOC (table->max_pri, sizeof (struct list_head));
-	ERR_ABORT (table->inode_lru);
-	for (index = 0; index < (table->max_pri); index++)
-		INIT_LIST_HEAD (&table->inode_lru[index]);
-
-	pthread_mutex_init (&table->table_lock, NULL);
-	this->private = table;
-	return 0;
+    ioc_waitq_t *trav = NULL;
+    call_frame_t *frame = NULL;
+    int32_t i = 0;
+    char key[GF_DUMP_MAX_BUF_LEN] = {
+        0,
+    };
+
+    trav = page->waitq;
+
+    while (trav) {
+        frame = trav->data;
+        sprintf(key, "waitq.frame[%d]", i++);
+        gf_proc_dump_write(key, "%" PRId64, frame->root->unique);
+
+        trav = trav->next;
+    }
+}
+
+/* Write the offset of every page queued on ioc_inode->waitq to the dump.
+ * The prefix argument is not used. */
+void
+__ioc_inode_waitq_dump(ioc_inode_t *ioc_inode, char *prefix)
+{
+    ioc_waitq_t *node = NULL;
+    ioc_page_t *waiting_page = NULL;
+    int32_t idx = 0;
+    char key[GF_DUMP_MAX_BUF_LEN] = {
+        0,
+    };
+
+    for (node = ioc_inode->waitq; node != NULL; node = node->next) {
+        waiting_page = node->data;
+
+        /* one key per queued page, numbered in queue order */
+        sprintf(key, "cache-validation-waitq.page[%d].offset", idx);
+        gf_proc_dump_write(key, "%" PRId64, waiting_page->offset);
+        idx++;
+    }
+}
+
+/*
+ * Dump one page's offset, size, dirty/ready flags and wait queue. If the page
+ * lock cannot be taken immediately, a failure note is written instead.
+ */
+void
+__ioc_page_dump(ioc_page_t *page, char *prefix)
+{
+    int lock_err = 0;
+
+    if (page == NULL)
+        return;
+
+    /* ioc_page_lock could be used to hold the mutex, but in statedump
+     * it is better to use trylock to avoid deadlocks.
+     */
+    lock_err = pthread_mutex_trylock(&page->page_lock);
+    if (lock_err != 0) {
+        gf_proc_dump_write("Unable to dump the page information",
+                           "(Lock acquisition failed) %p", page);
+        return;
+    }
+
+    gf_proc_dump_write("offset", "%" PRId64, page->offset);
+    gf_proc_dump_write("size", "%" GF_PRI_SIZET, page->size);
+    gf_proc_dump_write("dirty", "%s", page->dirty ? "yes" : "no");
+    gf_proc_dump_write("ready", "%s", page->ready ? "yes" : "no");
+    ioc_page_waitq_dump(page, prefix);
+
+    pthread_mutex_unlock(&page->page_lock);
+}
+
+void
+__ioc_cache_dump(ioc_inode_t *ioc_inode, char *prefix)
+{
+    off_t offset = 0;
+    ioc_table_t *table = NULL;
+    ioc_page_t *page = NULL;
+    int i = 0;
+    char key[GF_DUMP_MAX_BUF_LEN] = {
+        0,
+    };
+    char timestr[GF_TIMESTR_SIZE] = {
+        0,
+    };
+    /* Dumps the last revalidation time and each page; NULL args: no-op. */
+    if ((ioc_inode == NULL) || (prefix == NULL)) {
+        goto out;
+    }
+
+    table = ioc_inode->table;
+    /* The revalidation time is printed only when one has been recorded. */
+    if (ioc_inode->cache.last_revalidate) {
+        gf_time_fmt(timestr, sizeof timestr, ioc_inode->cache.last_revalidate,
+                    gf_timefmt_FT);
+
+        gf_proc_dump_write("last-cache-validation-time", "%s", timestr);
+    }
+    /* Step through ia_size in page_size strides; missing pages are skipped. */
+    for (offset = 0; offset < ioc_inode->ia_size; offset += table->page_size) {
+        page = __ioc_page_get(ioc_inode, offset);
+        if (page == NULL) {
+            continue;
+        }
+
+        sprintf(key, "inode.cache.page[%d]", i++);
+        __ioc_page_dump(page, key);
+    }
+out:
+    return;
+}
+
+int
+ioc_inode_dump(xlator_t *this, inode_t *inode)
+{
+    char *path = NULL;
+    int ret = -1;
+    char key_prefix[GF_DUMP_MAX_BUF_LEN] = {
+        0,
+    };
+    uint64_t tmp_ioc_inode = 0;
+    ioc_inode_t *ioc_inode = NULL;
+    gf_boolean_t section_added = _gf_false;
+    char uuid_str[64] = {
+        0,
+    };
+    /* Dumps weight, path, gfid, cached pages and waitq of inode's ioc_inode. */
+    if (this == NULL || inode == NULL)
+        goto out;
+
+    gf_proc_dump_build_key(key_prefix, "io-cache", "inode");
+    /* No io-cache context on the inode: leave quietly with ret still -1. */
+    inode_ctx_get(inode, this, &tmp_ioc_inode);
+    ioc_inode = (ioc_inode_t *)(long)tmp_ioc_inode;
+    if (ioc_inode == NULL)
+        goto out;
+
+    /* Similar to the ioc_page_dump function, it's better to use
+     * pthread_mutex_trylock and not to use gf_log in statedump
+     * to avoid deadlocks.
+     */
+    ret = pthread_mutex_trylock(&ioc_inode->inode_lock);
+    if (ret)
+        goto out;
+
+    {
+        if (gf_uuid_is_null(ioc_inode->inode->gfid))
+            goto unlock;
+        /* Null gfid: nothing is dumped and ret stays 0 (no failure note). */
+        gf_proc_dump_add_section("%s", key_prefix);
+        section_added = _gf_true;
+
+        __inode_path(ioc_inode->inode, NULL, &path);
+
+        gf_proc_dump_write("inode.weight", "%d", ioc_inode->weight);
+
+        if (path) {
+            gf_proc_dump_write("path", "%s", path);
+            GF_FREE(path);
+        }
+
+        gf_proc_dump_write("uuid", "%s",
+                           uuid_utoa_r(ioc_inode->inode->gfid, uuid_str));
+        __ioc_cache_dump(ioc_inode, key_prefix);
+        __ioc_inode_waitq_dump(ioc_inode, key_prefix);
+    }
+unlock:
+    pthread_mutex_unlock(&ioc_inode->inode_lock);
+
+out:
+    if (ret && ioc_inode) { /* reached only when the trylock above failed */
+        if (section_added == _gf_false)
+            gf_proc_dump_add_section("%s", key_prefix);
+        gf_proc_dump_write("Unable to print the status of ioc_inode",
+                           "(Lock acquisition failed) %s",
+                           uuid_utoa(inode->gfid));
+    }
+    return ret;
+}
+
+int
+ioc_priv_dump(xlator_t *this)
+{
+    ioc_table_t *priv = NULL;
+    char key_prefix[GF_DUMP_MAX_BUF_LEN] = {
+        0,
+    };
+    int ret = -1;
+    gf_boolean_t add_section = _gf_false;
+    /* Dumps the table's size, usage and timeout settings under "priv". */
+    if (!this || !this->private)
+        goto out;
+
+    priv = this->private;
+
+    gf_proc_dump_build_key(key_prefix, "io-cache", "priv");
+    gf_proc_dump_add_section("%s", key_prefix);
+    add_section = _gf_true;
+    /* trylock: write a failure note rather than wait for the table lock */
+    ret = pthread_mutex_trylock(&priv->table_lock);
+    if (ret)
+        goto out;
+    {
+        gf_proc_dump_write("page_size", "%" PRIu64, priv->page_size);
+        gf_proc_dump_write("cache_size", "%" PRIu64, priv->cache_size);
+        gf_proc_dump_write("cache_used", "%" PRIu64, priv->cache_used);
+        gf_proc_dump_write("inode_count", "%u", priv->inode_count);
+        gf_proc_dump_write("cache_timeout", "%u", priv->cache_timeout);
+        gf_proc_dump_write("min-file-size", "%" PRIu64, priv->min_file_size);
+        gf_proc_dump_write("max-file-size", "%" PRIu64, priv->max_file_size);
+    }
+    pthread_mutex_unlock(&priv->table_lock);
+out:
+    if (ret && priv) {
+        if (!add_section) {
+            gf_proc_dump_build_key(key_prefix,
+                                   "xlator."
+                                   "performance.io-cache",
+                                   "priv");
+            gf_proc_dump_add_section("%s", key_prefix);
+        } /* NOTE(review): looks unreachable - priv set implies section added */
+        gf_proc_dump_write(
+            "Unable to dump the state of private "
+            "structure of io-cache xlator",
+            "(Lock "
+            "acquisition failed) %s",
+            this->name);
+    }
+    /* Always reports success, including when the lock was not taken. */
+    return 0;
 }
 
 /*
  * fini -
- * 
+ *
  * @this:
  *
  */
 void
-fini (xlator_t *this)
+fini(xlator_t *this)
 {
-	ioc_table_t *table = this->private;
-
-	pthread_mutex_destroy (&table->table_lock);
-	FREE (table);
-
-	this->private = NULL;
-	return;
+    ioc_table_t *table = NULL;
+    struct ioc_priority *curr = NULL, *tmp = NULL;
+
+    table = this->private;
+
+    if (table == NULL)
+        return;
+
+    this->private = NULL;
+
+    if (table->mem_pool != NULL) {
+        mem_pool_destroy(table->mem_pool);
+        table->mem_pool = NULL;
+    }
+
+    list_for_each_entry_safe(curr, tmp, &table->priority_list, list)
+    {
+        list_del_init(&curr->list);
+        GF_FREE(curr->pattern);
+        GF_FREE(curr);
+    }
+
+    /* inode_lru and inodes list can be empty in case fini() is
+     * called soon after init()? Hence commenting the below asserts.
+     */
+    /*for (i = 0; i < table->max_pri; i++) {
+            GF_ASSERT (list_empty (&table->inode_lru[i]));
+    }
+
+    GF_ASSERT (list_empty (&table->inodes));
+    */
+    pthread_mutex_destroy(&table->table_lock);
+    /* init() allocates inode_lru separately, so release it before the table. */
+    GF_FREE(table->inode_lru);
+    GF_FREE(table);
+    return;
 }
 
 struct xlator_fops fops = {
-	.open        = ioc_open,
-	.create      = ioc_create,
-	.readv       = ioc_readv,
-	.writev      = ioc_writev,
-	.truncate    = ioc_truncate,
-	.ftruncate   = ioc_ftruncate,
-	.utimens     = ioc_utimens,
-	.lookup      = ioc_lookup,
-	.lk          = ioc_lk
+    .open = ioc_open,
+    .create = ioc_create,
+    .readv = ioc_readv,
+    .writev = ioc_writev,
+    .truncate = ioc_truncate,
+    .ftruncate = ioc_ftruncate,
+    .lookup = ioc_lookup,
+    .lk = ioc_lk,
+    .setattr = ioc_setattr,
+    .mknod = ioc_mknod,
+
+    .readdirp = ioc_readdirp,
+    .discard = ioc_discard,
+    .zerofill = ioc_zerofill,
 };
 
-struct xlator_mops mops = {
+struct xlator_dumpops dumpops = {
+    .priv = ioc_priv_dump,
+    .inodectx = ioc_inode_dump,
 };
 
 struct xlator_cbks cbks = {
-	.forget      = ioc_forget,
-  	.release     = ioc_release
+    .forget = ioc_forget,
+    .release = ioc_release,
+    .invalidate = ioc_invalidate,
 };
 
 struct volume_options options[] = {
-	{ .key  = {"priority"}, 
-	  .type = GF_OPTION_TYPE_ANY 
-	},
-	{ .key  = {"cache-timeout", "force-revalidate-timeout"},
-	  .type = GF_OPTION_TYPE_INT,
-	  .min  = 0, 
-	  .max  = 60 
-	}, 
-	{ .key  = {"page-size"}, 
-	  .type = GF_OPTION_TYPE_SIZET, 
-	  .min  = 16 * GF_UNIT_KB, 
-	  .max  =  4 * GF_UNIT_MB 
-	},
-	{ .key  = {"cache-size"}, 
-	  .type = GF_OPTION_TYPE_SIZET,
-	  .min  = 4 * GF_UNIT_MB, 
-	  .max  = 6 * GF_UNIT_GB 
-	},
-	{ .key = {NULL} },
+    {
+        .key = {"io-cache"},
+        .type = GF_OPTION_TYPE_BOOL,
+        .default_value = "off",
+        .description = "enable/disable io-cache",
+        .op_version = {GD_OP_VERSION_6_0},
+        .flags = OPT_FLAG_SETTABLE,
+    },
+    {.key = {"priority"},
+     .type = GF_OPTION_TYPE_PRIORITY_LIST,
+     .default_value = "",
+     .description = "Assigns priority to filenames with specific "
+                    "patterns so that when a page needs to be ejected "
+                    "out of the cache, the page of a file whose "
+                    "priority is the lowest will be ejected earlier",
+     .op_version = {1},
+     .flags = OPT_FLAG_CLIENT_OPT | OPT_FLAG_SETTABLE | OPT_FLAG_DOC},
+    {.key = {"cache-timeout", "force-revalidate-timeout"},
+     .type = GF_OPTION_TYPE_INT,
+     .min = 0,
+     .max = 60,
+     .default_value = "1",
+     .description = "The cached data for a file will be retained for "
+                    "'cache-refresh-timeout' seconds, after which data "
+                    "re-validation is performed.",
+     .op_version = {1},
+     .flags = OPT_FLAG_CLIENT_OPT | OPT_FLAG_SETTABLE | OPT_FLAG_DOC},
+    {.key = {"cache-size"},
+     .type = GF_OPTION_TYPE_SIZET,
+     .min = 4 * GF_UNIT_MB,
+     .max = INFINITY,
+     .default_value = "32MB",
+     .description = "Size of the read cache.",
+     .op_version = {1},
+     .flags = OPT_FLAG_CLIENT_OPT | OPT_FLAG_SETTABLE | OPT_FLAG_DOC},
+    {.key = {"min-file-size"},
+     .type = GF_OPTION_TYPE_SIZET,
+     .default_value = "0",
+     .description = "Minimum file size which would be cached by the "
+                    "io-cache translator.",
+     .op_version = {1},
+     .flags = OPT_FLAG_CLIENT_OPT | OPT_FLAG_SETTABLE | OPT_FLAG_DOC},
+    {.key = {"max-file-size"},
+     .type = GF_OPTION_TYPE_SIZET,
+     .default_value = "0",
+     .description = "Maximum file size which would be cached by the "
+                    "io-cache translator.",
+     .op_version = {1},
+     .flags = OPT_FLAG_CLIENT_OPT | OPT_FLAG_SETTABLE | OPT_FLAG_DOC},
+    {.key = {"pass-through"},
+     .type = GF_OPTION_TYPE_BOOL,
+     .default_value = "false",
+     .op_version = {GD_OP_VERSION_4_1_0},
+     .flags = OPT_FLAG_SETTABLE | OPT_FLAG_DOC | OPT_FLAG_CLIENT_OPT,
+     .tags = {"io-cache"},
+     .description = "Enable/Disable io cache translator"},
+    {.key = {NULL}},
+};
+
+xlator_api_t xlator_api = {
+    .init = init,
+    .fini = fini,
+    .reconfigure = reconfigure,
+    .mem_acct_init = mem_acct_init,
+    .op_version = {1}, /* Present from the initial version */
+    .dumpops = &dumpops,
+    .fops = &fops,
+    .cbks = &cbks,
+    .options = options,
+    .identifier = "io-cache",
+    .category = GF_MAINTAINED,
 };
diff --git a/xlators/performance/io-cache/src/io-cache.h b/xlators/performance/io-cache/src/io-cache.h
index 3cefca16414..14923c75edc 100644
--- a/xlators/performance/io-cache/src/io-cache.h
+++ b/xlators/performance/io-cache/src/io-cache.h
@@ -1,44 +1,30 @@
 /*
-   Copyright (c) 2007-2009 Z RESEARCH, Inc. <http://www.zresearch.com>
-   This file is part of GlusterFS.
-
-   GlusterFS is free software; you can redistribute it and/or modify
-   it under the terms of the GNU General Public License as published
-   by the Free Software Foundation; either version 3 of the License,
-   or (at your option) any later version.
-
-   GlusterFS is distributed in the hope that it will be useful, but
-   WITHOUT ANY WARRANTY; without even the implied warranty of
-   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
-   General Public License for more details.
-
-   You should have received a copy of the GNU General Public License
-   along with this program.  If not, see
-   <http://www.gnu.org/licenses/>.
+  Copyright (c) 2008-2012 Red Hat, Inc. <http://www.redhat.com>
+  This file is part of GlusterFS.
+
+  This file is licensed to you under your choice of the GNU Lesser
+  General Public License, version 3 or any later version (LGPLv3 or
+  later), or the GNU General Public License, version 2 (GPLv2), in all
+  cases as published by the Free Software Foundation.
 */
 
 #ifndef __IO_CACHE_H
 #define __IO_CACHE_H
 
-#ifndef _CONFIG_H
-#define _CONFIG_H
-#include "config.h"
-#endif
-
 #include <sys/types.h>
-#include "compat-errno.h"
-
-#include "glusterfs.h"
-#include "logging.h"
-#include "dict.h"
-#include "xlator.h"
-#include "common-utils.h"
-#include "call-stub.h"
+#include <glusterfs/compat-errno.h>
+
+#include <glusterfs/glusterfs.h>
+#include <glusterfs/dict.h>
+#include <glusterfs/call-stub.h>
+#include <glusterfs/rbthash.h>
 #include <sys/time.h>
 #include <fnmatch.h>
+#include "io-cache-messages.h"
 
-#define IOC_PAGE_SIZE    (1024 * 128)   /* 128KB */
-#define IOC_CACHE_SIZE   (32 * 1024 * 1024)
+#define IOC_PAGE_SIZE (1024 * 128) /* 128KB */
+#define IOC_CACHE_SIZE (32 * 1024 * 1024)
+#define IOC_PAGE_TABLE_BUCKET_COUNT 1
 
 struct ioc_table;
 struct ioc_local;
@@ -46,106 +32,135 @@ struct ioc_page;
 struct ioc_inode;
 
 struct ioc_priority {
-	struct list_head list;
-	char *pattern;
-	uint32_t priority;
+    struct list_head list;
+    char *pattern;
+    uint32_t priority;
 };
 
 /*
- * ioc_waitq - this structure is used to represents the waiting 
+ * ioc_waitq - this structure is used to represents the waiting
  *             frames on a page
  *
  * @next: pointer to next object in waitq
  * @data: pointer to the frame which is waiting
  */
 struct ioc_waitq {
-	struct ioc_waitq *next;
-	void *data;
-	off_t pending_offset;
-	size_t pending_size;
+    struct ioc_waitq *next;
+    void *data;
+    off_t pending_offset;
+    size_t pending_size;
 };
 
 /*
- * ioc_fill - 
+ * ioc_fill -
  *
  */
 struct ioc_fill {
-	struct list_head list;  /* list of ioc_fill structures of a frame */
-	off_t offset;          
-	size_t size;           
-	struct iovec *vector;  
-	int32_t count;
-	dict_t *refs;
+    struct list_head list; /* list of ioc_fill structures of a frame */
+    off_t offset;
+    size_t size;
+    struct iovec *vector;
+    int32_t count;
+    struct iobref *iobref;
 };
 
 struct ioc_local {
-	mode_t mode;
-	int32_t flags;
-	loc_t file_loc;
-	off_t offset;
-	size_t size;
-	int32_t op_ret;
-	int32_t op_errno;
-	struct list_head fill_list;      /* list of ioc_fill structures */
-	off_t pending_offset;            /* offset from this frame should continue */
-	size_t pending_size;             /* size of data this frame is waiting on */
-	struct ioc_inode *inode;
-	int32_t wait_count;
-	pthread_mutex_t local_lock;
-	struct ioc_waitq *waitq;
-	void *stub;
-	fd_t *fd;
-	int32_t need_xattr;
-	dict_t *xattr_req;
+    mode_t mode;
+    int32_t flags;
+    loc_t file_loc;
+    off_t offset;
+    size_t size;
+    int32_t op_ret;
+    int32_t op_errno;
+    struct list_head fill_list; /* list of ioc_fill structures */
+    off_t pending_offset;       /*
+                                 * offset from this frame should
+                                 * continue
+                                 */
+    size_t pending_size;        /*
+                                 * size of data this frame is waiting
+                                 * on
+                                 */
+    struct ioc_inode *inode;
+    int32_t wait_count;
+    pthread_mutex_t local_lock;
+    struct ioc_waitq *waitq;
+    void *stub;
+    fd_t *fd;
+    struct iovec *vector;
+    struct iobref *iobref;
+    int32_t need_xattr;
+    dict_t *xattr_req;
 };
 
 /*
- * ioc_page - structure to store page of data from file 
+ * ioc_page - structure to store page of data from file
  *
  */
 struct ioc_page {
-	struct list_head pages;
-	struct list_head page_lru;
-	struct ioc_inode *inode;   /* inode this page belongs to */
-	struct ioc_priority *priority;
-	char dirty;
-	char ready;
-	struct iovec *vector;
-	int32_t count;
-	off_t offset;
-	size_t size;
-	struct ioc_waitq *waitq;
-	dict_t *ref;
-	pthread_mutex_t page_lock;
+    struct list_head page_lru;
+    struct ioc_inode *inode; /* inode this page belongs to */
+    struct ioc_priority *priority;
+    char dirty;
+    char ready;
+    struct iovec *vector;
+    int32_t count;
+    off_t offset;
+    size_t size;
+    struct ioc_waitq *waitq;
+    struct iobref *iobref;
+    pthread_mutex_t page_lock;
+    int32_t op_errno;
+    char stale;
+};
+
+struct ioc_cache {
+    rbthash_table_t *page_table;
+    struct list_head page_lru;
+    time_t mtime;           /*
+                             * seconds component of file mtime
+                             */
+    time_t mtime_nsec;      /*
+                             * nanosecond component of file mtime
+                             */
+    time_t last_revalidate; /* timestamp at last re-validate */
 };
 
 struct ioc_inode {
-	struct ioc_table *table;
-	struct list_head pages;      /* list of pages of this inode */
-	struct list_head inode_list; /* list of inodes, maintained by io-cache translator */
-	struct list_head inode_lru;
-	struct list_head page_lru;
-	struct ioc_waitq *waitq;
-	pthread_mutex_t inode_lock;
-	uint32_t weight;             /* weight of the inode, increases on each read */
-	time_t mtime;             /* mtime of the server file when last cached */
-	struct timeval tv;           /* time-stamp at last re-validate */
+    struct ioc_table *table;
+    off_t ia_size;
+    struct ioc_cache cache;
+    struct list_head inode_list; /*
+                                  * list of inodes, maintained by
+                                  * io-cache translator
+                                  */
+    struct list_head inode_lru;
+    struct ioc_waitq *waitq;
+    pthread_mutex_t inode_lock;
+    uint32_t weight; /*
+                      * weight of the inode, increases
+                      * on each read
+                      */
+    inode_t *inode;
 };
 
 struct ioc_table {
-	uint64_t page_size;
-	uint64_t cache_size;
-	uint64_t cache_used;
-	struct list_head inodes; /* list of inodes cached */
-	struct list_head active; 
-	struct list_head *inode_lru;
-	struct list_head priority_list;
-	int32_t readv_count;
-	pthread_mutex_t table_lock;
-	xlator_t *xl;
-	uint32_t inode_count;
-	int32_t cache_timeout;
-	int32_t max_pri;
+    uint64_t page_size;
+    uint64_t cache_size;
+    uint64_t cache_used;
+    uint64_t min_file_size;
+    uint64_t max_file_size;
+    struct list_head inodes; /* list of inodes cached */
+    struct list_head active;
+    struct list_head *inode_lru;
+    struct list_head priority_list;
+    int32_t readv_count;
+    pthread_mutex_t table_lock;
+    xlator_t *xl;
+    uint32_t inode_count;
+    int32_t cache_timeout;
+    int32_t max_pri;
+    struct mem_pool *mem_pool;
 };
 
 typedef struct ioc_table ioc_table_t;
@@ -156,175 +171,136 @@ typedef struct ioc_waitq ioc_waitq_t;
 typedef struct ioc_fill ioc_fill_t;
 
 void *
-str_to_ptr (char *string);
+str_to_ptr(char *string);
 
 char *
-ptr_to_str (void *ptr);
-
-int32_t 
-ioc_readv_disabled_cbk (call_frame_t *frame,
-			void *cookie,
-			xlator_t *this,
-			int32_t op_ret,
-			int32_t op_errno,
-			struct iovec *vector,
-			int32_t count,
-			struct stat *stbuf);
+ptr_to_str(void *ptr);
+
+int32_t
+ioc_readv_disabled_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+                       int32_t op_ret, int32_t op_errno, struct iovec *vector,
+                       int32_t count, struct iatt *stbuf, struct iobref *iobref,
+                       dict_t *xdata);
 
 ioc_page_t *
-ioc_page_get (ioc_inode_t *ioc_inode,
-	      off_t offset);
+__ioc_page_get(ioc_inode_t *ioc_inode, off_t offset);
 
 ioc_page_t *
-ioc_page_create (ioc_inode_t *ioc_inode,
-		 off_t offset);
+__ioc_page_create(ioc_inode_t *ioc_inode, off_t offset);
 
 void
-ioc_page_fault (ioc_inode_t *ioc_inode,
-		call_frame_t *frame,
-		fd_t *fd,
-		off_t offset);
+ioc_page_fault(ioc_inode_t *ioc_inode, call_frame_t *frame, fd_t *fd,
+               off_t offset);
 void
-ioc_wait_on_page (ioc_page_t *page,
-		  call_frame_t *frame,
-		  off_t offset,
-		  size_t size);
+__ioc_wait_on_page(ioc_page_t *page, call_frame_t *frame, off_t offset,
+                   size_t size);
 
 ioc_waitq_t *
-ioc_page_wakeup (ioc_page_t *page);
+__ioc_page_wakeup(ioc_page_t *page, int32_t op_errno);
 
 void
-ioc_page_flush (ioc_page_t *page);
+ioc_page_flush(ioc_page_t *page);
 
 ioc_waitq_t *
-ioc_page_error (ioc_page_t *page,
-		int32_t op_ret,
-		int32_t op_errno);
-void
-ioc_page_purge (ioc_page_t *page);
+__ioc_page_error(ioc_page_t *page, int32_t op_ret, int32_t op_errno);
 
 void
-ioc_frame_return (call_frame_t *frame);
+ioc_frame_return(call_frame_t *frame);
 
 void
-ioc_waitq_return (ioc_waitq_t *waitq);
+ioc_waitq_return(ioc_waitq_t *waitq);
 
-void
-ioc_frame_fill (ioc_page_t *page,
-		call_frame_t *frame,
-		off_t offset,
-		size_t size);
-
-#define ioc_inode_lock(ioc_inode)					\
-	do {								\
-		gf_log (ioc_inode->table->xl->name, GF_LOG_DEBUG,	\
-			"locked inode(%p)", ioc_inode);			\
-		pthread_mutex_lock (&ioc_inode->inode_lock);		\
-	} while (0)
-
-
-#define ioc_inode_unlock(ioc_inode)					\
-	do {								\
-		gf_log (ioc_inode->table->xl->name, GF_LOG_DEBUG,	\
-			"unlocked inode(%p)", ioc_inode);		\
-		pthread_mutex_unlock (&ioc_inode->inode_lock);		\
-	} while (0)
-
-
-#define ioc_table_lock(table)					\
-	do {							\
-		gf_log (table->xl->name, GF_LOG_DEBUG,		\
-			"locked table(%p)", table);		\
-		pthread_mutex_lock (&table->table_lock);	\
-	} while (0)
-
-
-#define ioc_table_unlock(table)					\
-	do {							\
-		gf_log (table->xl->name, GF_LOG_DEBUG,		\
-			"unlocked table(%p)", table);		\
-		pthread_mutex_unlock (&table->table_lock);	\
-	} while (0)
-
-
-#define ioc_local_lock(local)						\
-	do {								\
-		gf_log (local->inode->table->xl->name, GF_LOG_DEBUG,	\
-			"locked local(%p)", local);			\
-		pthread_mutex_lock (&local->local_lock);		\
-	} while (0)
-
-
-#define ioc_local_unlock(local)						\
-	do {								\
-		gf_log (local->inode->table->xl->name, GF_LOG_DEBUG,	\
-			"unlocked local(%p)", local);			\
-		pthread_mutex_unlock (&local->local_lock);		\
-	} while (0)
-
-
-#define ioc_page_lock(page)						\
-	do {								\
-		gf_log (page->inode->table->xl->name, GF_LOG_DEBUG,	\
-			"locked page(%p)", page);			\
-		pthread_mutex_lock (&page->page_lock);			\
-	} while (0)
-
-
-#define ioc_page_unlock(page)						\
-	do {								\
-		gf_log (page->inode->table->xl->name, GF_LOG_DEBUG,	\
-			"unlocked page(%p)", page);			\
-		pthread_mutex_unlock (&page->page_lock);		\
-	} while (0)
-
-
-static inline uint64_t
-time_elapsed (struct timeval *now,
-	      struct timeval *then)
-{
-	uint64_t sec = now->tv_sec - then->tv_sec;
-
-	if (sec)
-		return sec;
-  
-	return 0;
-}
+int32_t
+ioc_frame_fill(ioc_page_t *page, call_frame_t *frame, off_t offset, size_t size,
+               int32_t op_errno);
+
+#define ioc_inode_lock(ioc_inode)                                              \
+    do {                                                                       \
+        gf_msg_trace(ioc_inode->table->xl->name, 0, "locked inode(%p)",        \
+                     ioc_inode);                                               \
+        pthread_mutex_lock(&ioc_inode->inode_lock);                            \
+    } while (0)
+
+#define ioc_inode_unlock(ioc_inode)                                            \
+    do {                                                                       \
+        gf_msg_trace(ioc_inode->table->xl->name, 0, "unlocked inode(%p)",      \
+                     ioc_inode);                                               \
+        pthread_mutex_unlock(&ioc_inode->inode_lock);                          \
+    } while (0)
+
+#define ioc_table_lock(table)                                                  \
+    do {                                                                       \
+        gf_msg_trace(table->xl->name, 0, "locked table(%p)", table);           \
+        pthread_mutex_lock(&table->table_lock);                                \
+    } while (0)
+
+#define ioc_table_unlock(table)                                                \
+    do {                                                                       \
+        gf_msg_trace(table->xl->name, 0, "unlocked table(%p)", table);         \
+        pthread_mutex_unlock(&table->table_lock);                              \
+    } while (0)
+
+#define ioc_local_lock(local)                                                  \
+    do {                                                                       \
+        gf_msg_trace(local->inode->table->xl->name, 0, "locked local(%p)",     \
+                     local);                                                   \
+        pthread_mutex_lock(&local->local_lock);                                \
+    } while (0)
+
+#define ioc_local_unlock(local)                                                \
+    do {                                                                       \
+        gf_msg_trace(local->inode->table->xl->name, 0, "unlocked local(%p)",   \
+                     local);                                                   \
+        pthread_mutex_unlock(&local->local_lock);                              \
+    } while (0)
+
+#define ioc_page_lock(page)                                                    \
+    do {                                                                       \
+        gf_msg_trace(page->inode->table->xl->name, 0, "locked page(%p)",       \
+                     page);                                                    \
+        pthread_mutex_lock(&page->page_lock);                                  \
+    } while (0)
+
+#define ioc_page_unlock(page)                                                  \
+    do {                                                                       \
+        gf_msg_trace(page->inode->table->xl->name, 0, "unlocked page(%p)",     \
+                     page);                                                    \
+        pthread_mutex_unlock(&page->page_lock);                                \
+    } while (0)
 
 ioc_inode_t *
-ioc_inode_search (ioc_table_t *table,
-		  inode_t *inode);
+ioc_inode_search(ioc_table_t *table, inode_t *inode);
 
-void 
-ioc_inode_destroy (ioc_inode_t *ioc_inode);
+void
+ioc_inode_destroy(ioc_inode_t *ioc_inode);
+
+int32_t
+ioc_inode_update(xlator_t *this, inode_t *inode, char *path,
+                 struct iatt *iabuf);
 
 ioc_inode_t *
-ioc_inode_update (ioc_table_t *table,
-		  inode_t *inode,
-		  uint32_t weight);
+ioc_inode_create(ioc_table_t *table, inode_t *inode, uint32_t weight);
 
-int64_t 
-ioc_page_destroy (ioc_page_t *page);
+int64_t
+__ioc_page_destroy(ioc_page_t *page);
 
-int32_t
-__ioc_inode_flush (ioc_inode_t *ioc_inode);
+int64_t
+__ioc_inode_flush(ioc_inode_t *ioc_inode);
 
 void
-ioc_inode_flush (ioc_inode_t *ioc_inode);
+ioc_inode_flush(ioc_inode_t *ioc_inode);
 
 void
-ioc_inode_wakeup (call_frame_t *frame, 
-		  ioc_inode_t *ioc_inode, 
-		  struct stat *stbuf);
+ioc_inode_wakeup(call_frame_t *frame, ioc_inode_t *ioc_inode,
+                 struct iatt *stbuf);
 
 int8_t
-ioc_cache_still_valid (ioc_inode_t *ioc_inode,
-		       struct stat *stbuf);
+ioc_cache_still_valid(ioc_inode_t *ioc_inode, struct iatt *stbuf);
 
 int32_t
-ioc_prune (ioc_table_t *table);
+ioc_prune(ioc_table_t *table);
 
 int32_t
-ioc_need_prune (ioc_table_t *table);
+ioc_need_prune(ioc_table_t *table);
 
-#endif /* __READ_AHEAD_H */
+#endif /* __IO_CACHE_H */
diff --git a/xlators/performance/io-cache/src/ioc-inode.c b/xlators/performance/io-cache/src/ioc-inode.c
index 917391de5ab..97767d85285 100644
--- a/xlators/performance/io-cache/src/ioc-inode.c
+++ b/xlators/performance/io-cache/src/ioc-inode.c
@@ -1,29 +1,17 @@
 /*
-  Copyright (c) 2007-2009 Z RESEARCH, Inc. <http://www.zresearch.com>
+  Copyright (c) 2008-2012 Red Hat, Inc. <http://www.redhat.com>
   This file is part of GlusterFS.
 
-  GlusterFS is free software; you can redistribute it and/or modify
-  it under the terms of the GNU General Public License as published
-  by the Free Software Foundation; either version 3 of the License,
-  or (at your option) any later version.
-
-  GlusterFS is distributed in the hope that it will be useful, but
-  WITHOUT ANY WARRANTY; without even the implied warranty of
-  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
-  General Public License for more details.
-
-  You should have received a copy of the GNU General Public License
-  along with this program.  If not, see
-  <http://www.gnu.org/licenses/>.
+  This file is licensed to you under your choice of the GNU Lesser
+  General Public License, version 3 or any later version (LGPLv3 or
+  later), or the GNU General Public License, version 2 (GPLv2), in all
+  cases as published by the Free Software Foundation.
 */
 
-#ifndef _CONFIG_H
-#define _CONFIG_H
-#include "config.h"
-#endif
-
 #include "io-cache.h"
+#include "ioc-mem-types.h"
 
+extern int ioc_log2_page_size;
 
 /*
  * str_to_ptr - convert a string to pointer
@@ -31,12 +19,17 @@
  *
  */
 void *
-str_to_ptr (char *string)
+str_to_ptr(char *string)
 {
-	void *ptr = (void *)strtoul (string, NULL, 16);
-	return ptr;
-}
+    void *ptr = NULL;
 
+    GF_VALIDATE_OR_GOTO("io-cache", string, out);
+
+    ptr = (void *)strtoul(string, NULL, 16);
+
+out:
+    return ptr;
+}
 
 /*
  * ptr_to_str - convert a pointer to string
@@ -44,158 +37,191 @@ str_to_ptr (char *string)
  *
  */
 char *
-ptr_to_str (void *ptr)
+ptr_to_str(void *ptr)
 {
-	char *str;
-	asprintf (&str, "%p", ptr);
-	return str;
+    int ret = 0;
+    char *str = NULL;
+
+    GF_VALIDATE_OR_GOTO("io-cache", ptr, out);
+
+    ret = gf_asprintf(&str, "%p", ptr);
+    if (-1 == ret) {
+        gf_smsg("io-cache", GF_LOG_WARNING, 0,
+                IO_CACHE_MSG_STR_COVERSION_FAILED, NULL);
+        str = NULL;
+        goto out;
+    }
+
+out:
+    return str;
 }
 
 void
-ioc_inode_wakeup (call_frame_t *frame,
-		  ioc_inode_t *ioc_inode, 
-		  struct stat *stbuf)
+ioc_inode_wakeup(call_frame_t *frame, ioc_inode_t *ioc_inode,
+                 struct iatt *stbuf)
 {
-	ioc_waitq_t *waiter = NULL, *waited = NULL;
-	ioc_waitq_t *page_waitq = NULL;
-	int8_t cache_still_valid = 1;
-	ioc_local_t *local = frame->local;
-	int8_t need_fault = 0;
-	ioc_page_t *waiter_page = NULL;
-
-	ioc_inode_lock (ioc_inode);
-	{
-		waiter = ioc_inode->waitq;
-		ioc_inode->waitq = NULL;
-	}
-	ioc_inode_unlock (ioc_inode);
-
-	if (stbuf)
-		cache_still_valid = ioc_cache_still_valid (ioc_inode, stbuf);
-	else
-		cache_still_valid = 0;
-
-	if (!waiter) {
-		gf_log (frame->this->name, GF_LOG_DEBUG,
-			"cache validate called without any "
-			"page waiting to be validated");
-	}
-
-	while (waiter) {
-		waiter_page = waiter->data;
-		page_waitq = NULL;
-    
-		if (waiter_page) {
-			if (cache_still_valid) {
-				/* cache valid, wake up page */
-				ioc_inode_lock (ioc_inode);
-				{
-					page_waitq = 
-						ioc_page_wakeup (waiter_page);
-				}
-				ioc_inode_unlock (ioc_inode);
-				if (page_waitq)
-					ioc_waitq_return (page_waitq);
-			} else {
-				/* cache invalid, generate page fault and set 
-				 * page->ready = 0, to avoid double faults  
-				 */
-				ioc_inode_lock (ioc_inode);
-	
-				if (waiter_page->ready) {
-					waiter_page->ready = 0;
-					need_fault = 1;
-				} else {
-					gf_log (frame->this->name, 
-						GF_LOG_DEBUG,
-						"validate frame(%p) is waiting"
-						"for in-transit page = %p",
-						frame, waiter_page);
-				}
-	
-				ioc_inode_unlock (ioc_inode);
-      
-				if (need_fault) {
-					need_fault = 0;
-					ioc_page_fault (ioc_inode, frame, 
-							local->fd, 
-							waiter_page->offset);
-				}
-			}
-		}
-
-		waited = waiter;
-		waiter = waiter->next;
-    
-		waited->data = NULL;
-		free (waited);
-	}
+    ioc_waitq_t *waiter = NULL, *waited = NULL;
+    ioc_waitq_t *page_waitq = NULL;
+    int8_t cache_still_valid = 1;
+    ioc_local_t *local = NULL;
+    int8_t need_fault = 0;
+    ioc_page_t *waiter_page = NULL;
+
+    GF_VALIDATE_OR_GOTO("io-cache", frame, out);
+
+    local = frame->local;
+    GF_VALIDATE_OR_GOTO(frame->this->name, local, out);
+
+    if (ioc_inode == NULL) {
+        local->op_ret = -1;
+        local->op_errno = EINVAL;
+        gf_smsg(frame->this->name, GF_LOG_WARNING, 0, IO_CACHE_MSG_INODE_NULL,
+                NULL);
+        goto out;
+    }
+
+    if (stbuf)
+        cache_still_valid = ioc_cache_still_valid(ioc_inode, stbuf);
+    else
+        cache_still_valid = 0;
+
+    ioc_inode_lock(ioc_inode);
+    {
+        waiter = ioc_inode->waitq;
+        if (!waiter) {
+            gf_smsg(frame->this->name, GF_LOG_WARNING, 0,
+                    IO_CACHE_MSG_PAGE_WAIT_VALIDATE, NULL);
+
+            ioc_inode_unlock(ioc_inode);
+            goto out;
+        }
+
+        while (waiter) {
+            waiter_page = waiter->data;
+            ioc_inode->waitq = waiter->next;
+            page_waitq = NULL;
+
+            if (waiter_page) {
+                if (cache_still_valid) {
+                    /* cache valid, wake up page */
+                    page_waitq = __ioc_page_wakeup(waiter_page,
+                                                   waiter_page->op_errno);
+                    if (page_waitq) {
+                        ioc_inode_unlock(ioc_inode);
+                        ioc_waitq_return(page_waitq);
+                        ioc_inode_lock(ioc_inode);
+                    }
+                } else {
+                    /* cache invalid, generate page fault and set
+                     * page->ready = 0, to avoid double faults
+                     */
+                    if (waiter_page->ready) {
+                        waiter_page->ready = 0;
+                        need_fault = 1;
+                    } else {
+                        gf_msg_trace(frame->this->name, 0,
+                                     "validate "
+                                     "frame(%p) is "
+                                     "waiting for "
+                                     "in-transit"
+                                     " page = %p",
+                                     frame, waiter_page);
+                    }
+
+                    if (need_fault) {
+                        need_fault = 0;
+                        ioc_inode_unlock(ioc_inode);
+                        ioc_page_fault(ioc_inode, frame, local->fd,
+                                       waiter_page->offset);
+                        ioc_inode_lock(ioc_inode);
+                    }
+                }
+            }
+
+            waited = waiter;
+            waiter = ioc_inode->waitq;
+
+            waited->data = NULL;
+            GF_FREE(waited);
+        }
+    }
+    ioc_inode_unlock(ioc_inode);
+
+out:
+    return;
 }
 
-/* 
- * ioc_inode_update - create a new ioc_inode_t structure and add it to 
- *                    the table table. fill in the fields which are derived 
+/*
+ * ioc_inode_create - create a new ioc_inode_t structure and add it to
+ *                    the table. fill in the fields which are derived
  *                    from inode_t corresponding to the file
- * 
+ *
  * @table: io-table structure
  * @inode: inode structure
  *
  * not for external reference
  */
 ioc_inode_t *
-ioc_inode_update (ioc_table_t *table, 
-		  inode_t *inode,
-		  uint32_t weight)
+ioc_inode_create(ioc_table_t *table, inode_t *inode, uint32_t weight)
 {
-	ioc_inode_t *ioc_inode = CALLOC (1, sizeof (ioc_inode_t));
-	ERR_ABORT (ioc_inode);
-  
-	ioc_inode->table = table;
- 
-	/* initialize the list for pages */
-	INIT_LIST_HEAD (&ioc_inode->pages);
-	INIT_LIST_HEAD (&ioc_inode->page_lru);
-
-	ioc_table_lock (table);
-
-	table->inode_count++;
-	list_add (&ioc_inode->inode_list, &table->inodes);
-	list_add_tail (&ioc_inode->inode_lru, &table->inode_lru[weight]);
-
-	gf_log (table->xl->name,
-		GF_LOG_DEBUG,
-		"adding to inode_lru[%d]", weight);
-
-	ioc_table_unlock (table);
-
-	pthread_mutex_init (&ioc_inode->inode_lock, NULL);
-	ioc_inode->weight = weight;
-  
-	return ioc_inode;
-}
+    ioc_inode_t *ioc_inode = NULL;
 
+    GF_VALIDATE_OR_GOTO("io-cache", table, out);
 
-/* 
+    ioc_inode = GF_CALLOC(1, sizeof(ioc_inode_t), gf_ioc_mt_ioc_inode_t);
+    if (ioc_inode == NULL) {
+        goto out;
+    }
+
+    ioc_inode->inode = inode;
+    ioc_inode->table = table;
+    INIT_LIST_HEAD(&ioc_inode->cache.page_lru);
+    pthread_mutex_init(&ioc_inode->inode_lock, NULL);
+    ioc_inode->weight = weight;
+
+    ioc_table_lock(table);
+    {
+        table->inode_count++;
+        list_add(&ioc_inode->inode_list, &table->inodes);
+        list_add_tail(&ioc_inode->inode_lru, &table->inode_lru[weight]);
+    }
+    ioc_table_unlock(table);
+
+    gf_msg_trace(table->xl->name, 0, "adding to inode_lru[%d]", weight);
+
+out:
+    return ioc_inode;
+}
+
+/*
  * ioc_inode_destroy - destroy an ioc_inode_t object.
  *
  * @inode: inode to destroy
  *
- * to be called only from ioc_forget. 
+ * to be called only from ioc_forget.
  */
 void
-ioc_inode_destroy (ioc_inode_t *ioc_inode)
+ioc_inode_destroy(ioc_inode_t *ioc_inode)
 {
-	ioc_table_t *table = ioc_inode->table;
-
-	ioc_table_lock (table);
-	table->inode_count--;
-	list_del (&ioc_inode->inode_list);
-	list_del (&ioc_inode->inode_lru);
-	ioc_table_unlock (table);
-  
-	ioc_inode_flush (ioc_inode);
-
-	pthread_mutex_destroy (&ioc_inode->inode_lock);
-	free (ioc_inode);
-}
+    ioc_table_t *table = NULL;
+
+    GF_VALIDATE_OR_GOTO("io-cache", ioc_inode, out);
+
+    table = ioc_inode->table;
 
+    ioc_table_lock(table);
+    {
+        table->inode_count--;
+        list_del(&ioc_inode->inode_list);
+        list_del(&ioc_inode->inode_lru);
+    }
+    ioc_table_unlock(table);
+
+    ioc_inode_flush(ioc_inode);
+    rbthash_table_destroy(ioc_inode->cache.page_table);
+
+    pthread_mutex_destroy(&ioc_inode->inode_lock);
+    GF_FREE(ioc_inode);
+out:
+    return;
+}
diff --git a/xlators/performance/io-cache/src/ioc-mem-types.h b/xlators/performance/io-cache/src/ioc-mem-types.h
new file mode 100644
index 00000000000..20c9a12021e
--- /dev/null
+++ b/xlators/performance/io-cache/src/ioc-mem-types.h
@@ -0,0 +1,29 @@
+/*
+  Copyright (c) 2008-2012 Red Hat, Inc. <http://www.redhat.com>
+  This file is part of GlusterFS.
+
+  This file is licensed to you under your choice of the GNU Lesser
+  General Public License, version 3 or any later version (LGPLv3 or
+  later), or the GNU General Public License, version 2 (GPLv2), in all
+  cases as published by the Free Software Foundation.
+*/
+
+#ifndef __IOC_MT_H__
+#define __IOC_MT_H__
+
+#include <glusterfs/mem-types.h>
+
+enum gf_ioc_mem_types_ {
+    gf_ioc_mt_iovec = gf_common_mt_end + 1,
+    gf_ioc_mt_ioc_table_t,
+    gf_ioc_mt_char,
+    gf_ioc_mt_ioc_waitq_t,
+    gf_ioc_mt_ioc_priority,
+    gf_ioc_mt_list_head,
+    gf_ioc_mt_call_pool_t,
+    gf_ioc_mt_ioc_inode_t,
+    gf_ioc_mt_ioc_fill_t,
+    gf_ioc_mt_ioc_newpage_t,
+    gf_ioc_mt_end
+};
+#endif
diff --git a/xlators/performance/io-cache/src/page.c b/xlators/performance/io-cache/src/page.c
index 1acda2ce697..84b1ae6cb20 100644
--- a/xlators/performance/io-cache/src/page.c
+++ b/xlators/performance/io-cache/src/page.c
@@ -1,112 +1,190 @@
 /*
-   Copyright (c) 2007-2009 Z RESEARCH, Inc. <http://www.zresearch.com>
-   This file is part of GlusterFS.
-
-   GlusterFS is free software; you can redistribute it and/or modify
-   it under the terms of the GNU General Public License as published
-   by the Free Software Foundation; either version 3 of the License,
-   or (at your option) any later version.
-
-   GlusterFS is distributed in the hope that it will be useful, but
-   WITHOUT ANY WARRANTY; without even the implied warranty of
-   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
-   General Public License for more details.
-
-   You should have received a copy of the GNU General Public License
-   along with this program.  If not, see
-   <http://www.gnu.org/licenses/>.
-*/
+  Copyright (c) 2008-2012 Red Hat, Inc. <http://www.redhat.com>
+  This file is part of GlusterFS.
 
-#ifndef _CONFIG_H
-#define _CONFIG_H
-#include "config.h"
-#endif
+  This file is licensed to you under your choice of the GNU Lesser
+  General Public License, version 3 or any later version (LGPLv3 or
+  later), or the GNU General Public License, version 2 (GPLv2), in all
+  cases as published by the Free Software Foundation.
+*/
 
-#include "glusterfs.h"
-#include "logging.h"
-#include "dict.h"
-#include "xlator.h"
+#include <glusterfs/glusterfs.h>
+#include <glusterfs/logging.h>
+#include <glusterfs/dict.h>
+#include <glusterfs/xlator.h>
 #include "io-cache.h"
+#include "ioc-mem-types.h"
 #include <assert.h>
 #include <sys/time.h>
+#include "io-cache-messages.h"
+char
+ioc_empty(struct ioc_cache *cache)
+{
+    char is_empty = -1;
+
+    GF_VALIDATE_OR_GOTO("io-cache", cache, out);
+
+    is_empty = list_empty(&cache->page_lru);
+
+out:
+    return is_empty;
+}
 
 ioc_page_t *
-ioc_page_get (ioc_inode_t *ioc_inode,
-	      off_t offset)
+__ioc_page_get(ioc_inode_t *ioc_inode, off_t offset)
 {
-	int8_t       found = 0;
-	ioc_page_t  *page = NULL;
-	ioc_table_t *table = ioc_inode->table;
-	off_t        rounded_offset = floor (offset, table->page_size);
-
-	if (list_empty (&ioc_inode->pages)) {
-		return NULL;
-	}
-
-	list_for_each_entry (page, &ioc_inode->pages, pages) {
-		if (page->offset == rounded_offset) {
-			found = 1;
-			break;
-		}
-	}
-
-	/* was previously returning ioc_inode itself.., 
-	 * 1st of its type and found one more downstairs :O */
-	if (!found){
-		page = NULL;
-	} else {
-		/* push the page to the end of the lru list */
-		list_move_tail (&page->page_lru, &ioc_inode->page_lru);
-	}
-
-	return page;
+    ioc_page_t *page = NULL;
+    ioc_table_t *table = NULL;
+    off_t rounded_offset = 0;
+
+    GF_VALIDATE_OR_GOTO("io-cache", ioc_inode, out);
+
+    table = ioc_inode->table;
+    GF_VALIDATE_OR_GOTO("io-cache", table, out);
+
+    rounded_offset = gf_floor(offset, table->page_size);
+    /* pages are looked up by the offset floored to table->page_size */
+    page = rbthash_get(ioc_inode->cache.page_table, &rounded_offset,
+                       sizeof(rounded_offset));
+
+    if (page != NULL) {
+        /* push the page to the end of the lru list */
+        list_move_tail(&page->page_lru, &ioc_inode->cache.page_lru);
+    }
+
+out:
+    return page;
 }
 
+ioc_page_t *
+ioc_page_get(ioc_inode_t *ioc_inode, off_t offset)
+{
+    ioc_page_t *page = NULL;
+
+    if (ioc_inode == NULL) {
+        goto out;
+    }
+
+    ioc_inode_lock(ioc_inode);
+    {
+        page = __ioc_page_get(ioc_inode, offset);
+    }
+    ioc_inode_unlock(ioc_inode);
+
+out:
+    return page;
+}
 
 /*
- * ioc_page_destroy -
+ * __ioc_page_destroy -
  *
  * @page:
  *
  */
 int64_t
-ioc_page_destroy (ioc_page_t *page)
+__ioc_page_destroy(ioc_page_t *page)
+{
+    int64_t page_size = 0;
+
+    GF_VALIDATE_OR_GOTO("io-cache", page, out);
+
+    if (page->iobref)
+        page_size = iobref_size(page->iobref);
+
+    if (page->waitq) {
+        /* frames waiting on this page, do not destroy this page */
+        page_size = -1;
+        page->stale = 1;
+    } else {
+        rbthash_remove(page->inode->cache.page_table, &page->offset,
+                       sizeof(page->offset));
+        list_del(&page->page_lru);
+
+        gf_msg_trace(page->inode->table->xl->name, 0,
+                     "destroying page = %p, offset = %" PRId64
+                     " "
+                     "&& inode = %p",
+                     page, page->offset, page->inode);
+
+        if (page->vector) {
+            iobref_unref(page->iobref);
+            GF_FREE(page->vector);
+            page->vector = NULL;
+        }
+
+        page->inode = NULL;
+    }
+
+    if (page_size != -1) {
+        pthread_mutex_destroy(&page->page_lock);
+        GF_FREE(page);
+    }
+
+out:
+    return page_size;
+}
+
+int64_t
+ioc_page_destroy(ioc_page_t *page)
 {
-	int64_t page_size = 0;
-
-	page_size = page->size;
-
-	if (page->waitq) {
-		/* frames waiting on this page, do not destroy this page */
-		page_size = -1;
-	} else {
-
-		list_del (&page->pages);
-		list_del (&page->page_lru);
-    
-		gf_log (page->inode->table->xl->name, GF_LOG_DEBUG,
-			"destroying page = %p, offset = %"PRId64" "
-			"&& inode = %p",
-			page, page->offset, page->inode);
-    
-		if (page->vector){
-			dict_unref (page->ref);
-			free (page->vector);
-			page->vector = NULL;
-		}
-    
-		page->inode = NULL;
-    
-	}
-
-	if (page_size != -1) {
-		pthread_mutex_destroy (&page->page_lock);
-		free (page);
-	}
-
-	return page_size;
+    int64_t ret = 0;
+    struct ioc_inode *inode = NULL;
+
+    if (page == NULL) {
+        goto out;
+    }
+
+    ioc_inode_lock(page->inode);
+    {
+        inode = page->inode;
+        ret = __ioc_page_destroy(page);
+    }
+    ioc_inode_unlock(inode);
+
+out:
+    return ret;
 }
 
+int32_t
+__ioc_inode_prune(ioc_inode_t *curr, uint64_t *size_pruned,
+                  uint64_t size_to_prune, uint32_t index)
+{
+    ioc_page_t *page = NULL, *next = NULL;
+    int64_t ret = 0;
+    ioc_table_t *table = NULL;
+
+    if (curr == NULL) {
+        goto out;
+    }
+
+    table = curr->table;
+    /* walk the inode's page_lru list until enough has been pruned */
+    list_for_each_entry_safe(page, next, &curr->cache.page_lru, page_lru)
+    {
+        *size_pruned += page->size;
+        ret = __ioc_page_destroy(page);
+        /* -1: frames still wait on the page; it was only marked stale */
+        if (ret != -1)
+            table->cache_used -= ret;
+
+        gf_msg_trace(table->xl->name, 0,
+                     "index = %d && "
+                     "table->cache_used = %" PRIu64
+                     " && table->"
+                     "cache_size = %" PRIu64,
+                     index, table->cache_used, table->cache_size);
+
+        if ((*size_pruned) >= size_to_prune)
+            break;
+    }
+
+    if (ioc_empty(&curr->cache)) {
+        list_del_init(&curr->inode_lru);
+    }
+
+out:
+    return 0;
+}
 /*
  * ioc_prune - prune the cache. we have a limit to the number of pages we
  *             can have in-memory.
@@ -115,153 +193,157 @@ ioc_page_destroy (ioc_page_t *page)
  *
  */
 int32_t
-ioc_prune (ioc_table_t *table)
+ioc_prune(ioc_table_t *table)
 {
-	ioc_inode_t *curr = NULL, *next_ioc_inode = NULL;
-	ioc_page_t *page = NULL, *next = NULL;
-	int32_t ret = -1;
-	int32_t index = 0;
-	uint64_t size_to_prune = 0;
-	uint64_t size_pruned = 0;
-
-	ioc_table_lock (table);
-	{
-		size_to_prune = table->cache_used - table->cache_size;
-		/* take out the least recently used inode */
-		for (index=0; index < table->max_pri; index++) {
-			list_for_each_entry_safe (curr, next_ioc_inode, 
-						  &table->inode_lru[index], 
-						  inode_lru) {
-				/* prune page-by-page for this inode, till 
-				 * we reach the equilibrium */
-				ioc_inode_lock (curr);
-				/* { */
-
-				list_for_each_entry_safe (page, next, 
-							  &curr->page_lru, 
-							  page_lru) {
-					/* done with all pages, and not 
-					 * reached equilibrium yet??
-					 * continue with next inode in 
-					 * lru_list */
-					size_pruned += page->size;
-					ret = ioc_page_destroy (page);
-
-					if (ret != -1)
-						table->cache_used -= ret;
-	    
-					gf_log (table->xl->name,
-						GF_LOG_DEBUG,
-						"index = %d && table->cache_"
-						"used = %"PRIu64" && table->"
-						"cache_size = %"PRIu64, 
-						index, table->cache_used, 
-						table->cache_size);
-	    
-					if (size_pruned >= size_to_prune)
-						break;
-				} /* list_for_each_entry_safe(page...) */
-				if (list_empty (&curr->pages)) {
-					list_del_init (&curr->inode_lru);
-				}
-
-				/* } */ 
-				ioc_inode_unlock (curr);
-	
-				if (size_pruned >= size_to_prune)
-					break;
-			} /* list_for_each_entry_safe (curr...) */
-      
-			if (size_pruned >= size_to_prune)
-				break;
-		} /* for(index=0;...) */
-
-	} /* ioc_inode_table locked region end */
-	ioc_table_unlock (table);
-
-	return 0;
+    ioc_inode_t *curr = NULL, *next_ioc_inode = NULL;
+    int32_t index = 0;
+    uint64_t size_to_prune = 0;
+    uint64_t size_pruned = 0;
+
+    GF_VALIDATE_OR_GOTO("io-cache", table, out);
+
+    ioc_table_lock(table);
+    {
+        size_to_prune = table->cache_used - table->cache_size;
+        /* take out the least recently used inode */
+        for (index = 0; index < table->max_pri; index++) {
+            list_for_each_entry_safe(curr, next_ioc_inode,
+                                     &table->inode_lru[index], inode_lru)
+            {
+                /* prune page-by-page for this inode, till
+                 * we reach the equilibrium */
+                ioc_inode_lock(curr);
+                {
+                    __ioc_inode_prune(curr, &size_pruned, size_to_prune, index);
+                }
+                ioc_inode_unlock(curr);
+
+                if (size_pruned >= size_to_prune)
+                    break;
+            } /* list_for_each_entry_safe (curr...) */
+
+            if (size_pruned >= size_to_prune)
+                break;
+        } /* for(index=0;...) */
+
+    } /* ioc_inode_table locked region end */
+    ioc_table_unlock(table);
+
+out:
+    return 0;
 }
 
 /*
- * ioc_page_create - create a new page. 
+ * __ioc_page_create - create a new page.
  *
- * @ioc_inode: 
+ * @ioc_inode: inode whose page table and page_lru list receive the new page
  * @offset:
  *
  */
 ioc_page_t *
-ioc_page_create (ioc_inode_t *ioc_inode,
-		 off_t offset)
+__ioc_page_create(ioc_inode_t *ioc_inode, off_t offset)
 {
-	ioc_table_t *table = ioc_inode->table;
-	ioc_page_t *page = NULL;
-	off_t rounded_offset = floor (offset, table->page_size);
-	ioc_page_t *newpage = CALLOC (1, sizeof (*newpage));
-	ERR_ABORT (newpage);
-  
-	if (ioc_inode)
-		table = ioc_inode->table;
-	else {
-		return NULL;
-	}
-   
-	newpage->offset = rounded_offset;
-	newpage->inode = ioc_inode;
-	pthread_mutex_init (&newpage->page_lock, NULL);
-
-	list_add_tail (&newpage->page_lru, &ioc_inode->page_lru);
-	list_add_tail (&newpage->pages, &ioc_inode->pages);
-
-	page = newpage;
-
-	gf_log ("io-cache", GF_LOG_DEBUG,
-		"returning new page %p", page);
-	return page;
+    ioc_table_t *table = NULL;
+    ioc_page_t *page = NULL;
+    off_t rounded_offset = 0;
+    ioc_page_t *newpage = NULL;
+
+    GF_VALIDATE_OR_GOTO("io-cache", ioc_inode, out);
+
+    table = ioc_inode->table;
+    GF_VALIDATE_OR_GOTO("io-cache", table, out);
+
+    rounded_offset = gf_floor(offset, table->page_size);
+
+    newpage = GF_CALLOC(1, sizeof(*newpage), gf_ioc_mt_ioc_newpage_t);
+    if (newpage == NULL) {
+        goto out;
+    }
+
+    if (!ioc_inode) {
+        GF_FREE(newpage);
+        newpage = NULL;
+        goto out;
+    }
+
+    newpage->offset = rounded_offset;
+    newpage->inode = ioc_inode;
+    pthread_mutex_init(&newpage->page_lock, NULL);
+
+    rbthash_insert(ioc_inode->cache.page_table, newpage, &rounded_offset,
+                   sizeof(rounded_offset));
+
+    list_add_tail(&newpage->page_lru, &ioc_inode->cache.page_lru);
+
+    page = newpage;
+
+    gf_msg_trace("io-cache", 0, "returning new page %p", page);
+
+out:
+    return page;
 }
 
-/* 
- * ioc_wait_on_page - pause a frame to wait till the arrival of a page. 
- * here we need to handle the case when the frame who calls wait_on_page 
- * himself has caused page_fault 
+/*
+ * __ioc_wait_on_page - pause a frame to wait till the arrival of a page.
+ * here we need to handle the case where the frame that calls wait_on_page
+ * has itself caused the page fault
  *
  * @page: page to wait on
  * @frame: call frame who is waiting on page
  *
  */
 void
-ioc_wait_on_page (ioc_page_t *page,
-		  call_frame_t *frame,
-		  off_t offset,
-		  size_t size)
+__ioc_wait_on_page(ioc_page_t *page, call_frame_t *frame, off_t offset,
+                   size_t size)
 {
-	ioc_waitq_t *waitq = NULL;
-	ioc_local_t *local = frame->local;
-
-	waitq = CALLOC (1, sizeof (*waitq));
-	ERR_ABORT (waitq);
-  
-	gf_log (frame->this->name, GF_LOG_DEBUG,
-		"frame(%p) waiting on page = %p, offset=%"PRId64", "
-		"size=%"GF_PRI_SIZET"",
-		frame, page, offset, size);
-
-	waitq->data = frame;
-	waitq->next = page->waitq;
-	waitq->pending_offset = offset;
-	waitq->pending_size = size;
-	page->waitq = waitq;
-	/* one frame can wait only once on a given page, 
-	 * local->wait_count is number of pages a frame is waiting on */
-	ioc_local_lock (local);
-	{
-		local->wait_count++;
-	}
-	ioc_local_unlock (local);
+    ioc_waitq_t *waitq = NULL;
+    ioc_local_t *local = NULL;
+
+    GF_VALIDATE_OR_GOTO("io-cache", frame, out);
+    local = frame->local;
+
+    GF_VALIDATE_OR_GOTO(frame->this->name, local, out);
+
+    if (page == NULL) {
+        local->op_ret = -1;
+        local->op_errno = ENOMEM;
+        gf_smsg(frame->this->name, GF_LOG_WARNING, 0,
+                IO_CACHE_MSG_NULL_PAGE_WAIT, NULL);
+        goto out;
+    }
+
+    waitq = GF_CALLOC(1, sizeof(*waitq), gf_ioc_mt_ioc_waitq_t);
+    if (waitq == NULL) {
+        local->op_ret = -1;
+        local->op_errno = ENOMEM;
+        goto out;
+    }
+
+    gf_msg_trace(frame->this->name, 0,
+                 "frame(%p) waiting on page = %p, offset=%" PRId64
+                 ", "
+                 "size=%" GF_PRI_SIZET "",
+                 frame, page, offset, size);
+
+    waitq->data = frame;
+    waitq->next = page->waitq;
+    waitq->pending_offset = offset;
+    waitq->pending_size = size;
+    page->waitq = waitq;
+    /* one frame can wait only once on a given page,
+     * local->wait_count is number of pages a frame is waiting on */
+    ioc_local_lock(local);
+    {
+        local->wait_count++;
+    }
+    ioc_local_unlock(local);
+
+out:
+    return;
 }
 
-
 /*
- * ioc_cache_still_valid - see if cached pages ioc_inode are still valid 
+ * ioc_cache_still_valid - see if cached pages of ioc_inode are still valid
  * against given stbuf
  *
  * @ioc_inode:
@@ -270,184 +352,203 @@ ioc_wait_on_page (ioc_page_t *page,
  * assumes ioc_inode is locked
  */
 int8_t
-ioc_cache_still_valid (ioc_inode_t *ioc_inode,
-		       struct stat *stbuf)
+ioc_cache_still_valid(ioc_inode_t *ioc_inode, struct iatt *stbuf)
 {
-	int8_t cache_still_valid = 1;
-  
+    int8_t cache_still_valid = 1;
+
+    GF_VALIDATE_OR_GOTO("io-cache", ioc_inode, out);
+
 #if 0
-	if (!stbuf || (stbuf->st_mtime != ioc_inode->mtime) || 
-	    (stbuf->st_mtim.tv_nsec != ioc_inode->stbuf.st_mtim.tv_nsec))
-		cache_still_valid = 0;
+        if (!stbuf || (stbuf->ia_mtime != ioc_inode->cache.mtime) ||
+            (stbuf->st_mtim.tv_nsec != ioc_inode->stbuf.st_mtim.tv_nsec))
+                cache_still_valid = 0;
 
 #else
-	if (!stbuf || (stbuf->st_mtime != ioc_inode->mtime))
-		cache_still_valid = 0;
+    if (!stbuf || (stbuf->ia_mtime != ioc_inode->cache.mtime) ||
+        (stbuf->ia_mtime_nsec != ioc_inode->cache.mtime_nsec))
+        cache_still_valid = 0;
 
 #endif
 
 #if 0
-	/* talk with avati@zresearch.com to enable this section */
-	if (!ioc_inode->mtime && stbuf) {
-		cache_still_valid = 1;
-		ioc_inode->mtime = stbuf->st_mtime;
-	}
+        /* talk with avati@gluster.com to enable this section */
+        if (!ioc_inode->mtime && stbuf) {
+                cache_still_valid = 1;
+                ioc_inode->mtime = stbuf->ia_mtime;
+        }
 #endif
 
-	return cache_still_valid;
+out:
+    return cache_still_valid;
 }
 
-
 void
-ioc_waitq_return (ioc_waitq_t *waitq)
+ioc_waitq_return(ioc_waitq_t *waitq)
 {
-	ioc_waitq_t  *trav   = NULL;
-	ioc_waitq_t  *next   = NULL;
-	call_frame_t *frame = NULL;
+    ioc_waitq_t *trav = NULL;
+    ioc_waitq_t *next = NULL;
+    call_frame_t *frame = NULL;
 
-	for (trav = waitq; trav; trav = next) {
-		next = trav->next;
+    for (trav = waitq; trav; trav = next) {
+        next = trav->next;
 
-		frame = trav->data;
-		ioc_frame_return (frame);
-		free (trav);
-	}
+        frame = trav->data;
+        ioc_frame_return(frame);
+        GF_FREE(trav);
+    }
 }
 
-
 int
-ioc_fault_cbk (call_frame_t *frame,
-	       void *cookie,
-	       xlator_t *this,
-	       int32_t op_ret,
-	       int32_t op_errno,
-	       struct iovec *vector,
-	       int32_t count,
-	       struct stat *stbuf)
+ioc_fault_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret,
+              int32_t op_errno, struct iovec *vector, int32_t count,
+              struct iatt *stbuf, struct iobref *iobref, dict_t *xdata)
 {
-	ioc_local_t *local = frame->local;
-	off_t offset = local->pending_offset;
-	ioc_inode_t *ioc_inode = local->inode;
-	ioc_table_t *table = ioc_inode->table;
-	ioc_page_t *page = NULL;
-	off_t trav_offset = 0;
-	size_t payload_size = 0;
-	int32_t destroy_size = 0;
-	size_t page_size = 0;
-	ioc_waitq_t *waitq = NULL;
-
-	trav_offset = offset;  
-	payload_size = op_ret;
-
-	ioc_inode_lock (ioc_inode);
-	{
-		if (op_ret == -1 || 
-		    (op_ret >= 0 && 
-		     !ioc_cache_still_valid(ioc_inode, stbuf))) {
-			gf_log (ioc_inode->table->xl->name, GF_LOG_DEBUG,
-				"cache for inode(%p) is invalid. flushing "
-				"all pages", ioc_inode);
-			destroy_size = __ioc_inode_flush (ioc_inode);
-		} 
-    
-		if (op_ret >= 0)
-			ioc_inode->mtime = stbuf->st_mtime;
-    
-		gettimeofday (&ioc_inode->tv, NULL);
-    
-		if (op_ret < 0) {
-			/* error, readv returned -1 */
-			page = ioc_page_get (ioc_inode, offset);
-			if (page)
-				waitq = ioc_page_error (page, op_ret, 
-							op_errno);
-		} else {
-			gf_log (ioc_inode->table->xl->name, GF_LOG_DEBUG,
-				"op_ret = %d", op_ret);
-			page = ioc_page_get (ioc_inode, offset);
-			if (!page) {
-				/* page was flushed */
-				/* some serious bug ? */
-				gf_log (this->name, GF_LOG_DEBUG,
-					"wasted copy: %"PRId64"[+%"PRId64"] "
-					"ioc_inode=%p", offset, 
-					table->page_size, ioc_inode);
-			} else {
-				if (page->vector) {
-					dict_unref (page->ref);
-					free (page->vector);
-					page->vector = NULL;
-				}
-	
-				/* keep a copy of the page for our cache */
-				page->vector = iov_dup (vector, count);
-				page->count = count;
-				if (frame->root->rsp_refs) {
-					dict_ref (frame->root->rsp_refs);
-					page->ref = frame->root->rsp_refs;
-				} else {
-					/* TODO: we have got a response to 
-					 * our request and no data */
-					gf_log (this->name, GF_LOG_CRITICAL,
-						"frame>root>rsp_refs is null");
-				} /* if(frame->root->rsp_refs) */
-	
-				/* page->size should indicate exactly how 
-				 * much the readv call to the child
-				 * translator returned. earlier op_ret 
-				 * from child translator was used, which 
-				 * gave rise to a bug where reads from 
-				 * io-cached volume were resulting in 0 
-				 * byte replies */
-				page_size = iov_length(vector, count);
-	
-				page->size = page_size;
-
-				if (page->waitq) {
-					/* wake up all the frames waiting on 
-					 * this page, including 
-					 * the frame which triggered fault */
-					waitq = ioc_page_wakeup (page);
-				} /* if(page->waitq) */
-			} /* if(!page)...else */
-		} /* if(op_ret < 0)...else */
-	} /* ioc_inode locked region end */
-	ioc_inode_unlock (ioc_inode);
-
-	ioc_waitq_return (waitq);
-
-	if (page_size) {
-		ioc_table_lock (table);
-		{
-			table->cache_used += page_size;
-		}
-		ioc_table_unlock (table);
-	}
-
-	if (destroy_size) {
-		ioc_table_lock (table);
-		{
-			table->cache_used -= destroy_size;
-		}
-		ioc_table_unlock (table);
-	}
-
-	if (ioc_need_prune (ioc_inode->table)) {
-		ioc_prune (ioc_inode->table);
-	}
-
-	gf_log (this->name, GF_LOG_DEBUG, "fault frame %p returned", frame);
-	pthread_mutex_destroy (&local->local_lock);
-
-	fd_unref (local->fd);
-
-	STACK_DESTROY (frame->root);
-	return 0;
+    ioc_local_t *local = NULL;
+    off_t offset = 0;
+    ioc_inode_t *ioc_inode = NULL;
+    ioc_table_t *table = NULL;
+    ioc_page_t *page = NULL;
+    int32_t destroy_size = 0;
+    size_t page_size = 0;
+    ioc_waitq_t *waitq = NULL;
+    size_t iobref_page_size = 0;
+    char zero_filled = 0;
+
+    GF_ASSERT(frame);
+
+    local = frame->local;
+    GF_ASSERT(local);
+
+    offset = local->pending_offset;
+    ioc_inode = local->inode;
+    GF_ASSERT(ioc_inode);
+
+    table = ioc_inode->table;
+    GF_ASSERT(table);
+
+    zero_filled = ((op_ret >= 0) && (stbuf->ia_mtime == 0));
+
+    ioc_inode_lock(ioc_inode);
+    {
+        if (op_ret == -1 ||
+            !(zero_filled || ioc_cache_still_valid(ioc_inode, stbuf))) {
+            gf_msg_trace(ioc_inode->table->xl->name, 0,
+                         "cache for inode(%p) is invalid. flushing "
+                         "all pages",
+                         ioc_inode);
+            destroy_size = __ioc_inode_flush(ioc_inode);
+        }
+
+        if ((op_ret >= 0) && !zero_filled) {
+            ioc_inode->cache.mtime = stbuf->ia_mtime;
+            ioc_inode->cache.mtime_nsec = stbuf->ia_mtime_nsec;
+        }
+
+        ioc_inode->cache.last_revalidate = gf_time();
+
+        if (op_ret < 0) {
+            /* error, readv returned -1 */
+            page = __ioc_page_get(ioc_inode, offset);
+            if (page)
+                waitq = __ioc_page_error(page, op_ret, op_errno);
+        } else {
+            gf_msg_trace(ioc_inode->table->xl->name, 0, "op_ret = %d", op_ret);
+            page = __ioc_page_get(ioc_inode, offset);
+            if (!page) {
+                /* page was flushed */
+                /* some serious bug ? */
+                gf_smsg(frame->this->name, GF_LOG_WARNING, 0,
+                        IO_CACHE_MSG_WASTED_COPY, "offset=%" PRId64, offset,
+                        "page-size=%" PRId64, table->page_size, "ioc_inode=%p",
+                        ioc_inode, NULL);
+            } else {
+                if (page->vector) {
+                    iobref_unref(page->iobref);
+                    GF_FREE(page->vector);
+                    page->vector = NULL;
+                    page->iobref = NULL;
+                }
+
+                /* keep a copy of the page for our cache */
+                page->vector = iov_dup(vector, count);
+                if (page->vector == NULL) {
+                    page = __ioc_page_get(ioc_inode, offset);
+                    if (page != NULL)
+                        waitq = __ioc_page_error(page, -1, ENOMEM);
+                    goto unlock;
+                }
+
+                page->count = count;
+                if (iobref) {
+                    page->iobref = iobref_ref(iobref);
+                } else {
+                    /* TODO: we have got a response to
+                     * our request and no data */
+                    gf_smsg(frame->this->name, GF_LOG_CRITICAL, ENOMEM,
+                            IO_CACHE_MSG_FRAME_NULL, NULL);
+                } /* if(iobref) */
+
+                /* page->size should indicate exactly how
+                 * much the readv call to the child
+                 * translator returned. earlier op_ret
+                 * from child translator was used, which
+                 * gave rise to a bug where reads from
+                 * io-cached volume were resulting in 0
+                 * byte replies */
+                page_size = iov_length(vector, count);
+                page->size = page_size;
+                page->op_errno = op_errno;
+
+                iobref_page_size = iobref_size(page->iobref);
+
+                if (page->waitq) {
+                    /* wake up all the frames waiting on
+                     * this page, including
+                     * the frame which triggered fault */
+                    waitq = __ioc_page_wakeup(page, op_errno);
+                } /* if(page->waitq) */
+            }     /* if(!page)...else */
+        }         /* if(op_ret < 0)...else */
+    }             /* ioc_inode locked region end */
+unlock:
+    ioc_inode_unlock(ioc_inode);
+
+    ioc_waitq_return(waitq);
+
+    if (iobref_page_size) {
+        ioc_table_lock(table);
+        {
+            table->cache_used += iobref_page_size;
+        }
+        ioc_table_unlock(table);
+    }
+
+    if (destroy_size) {
+        ioc_table_lock(table);
+        {
+            table->cache_used -= destroy_size;
+        }
+        ioc_table_unlock(table);
+    }
+
+    if (ioc_need_prune(ioc_inode->table)) {
+        ioc_prune(ioc_inode->table);
+    }
+
+    gf_msg_trace(frame->this->name, 0, "fault frame %p returned", frame);
+    pthread_mutex_destroy(&local->local_lock);
+
+    fd_unref(local->fd);
+    if (local->xattr_req)
+        dict_unref(local->xattr_req);
+
+    STACK_DESTROY(frame->root);
+    return 0;
 }
 
 /*
  * ioc_page_fault -
- * 
+ *
  * @ioc_inode:
  * @frame:
  * @fd:
@@ -455,145 +556,216 @@ ioc_fault_cbk (call_frame_t *frame,
  *
  */
 void
-ioc_page_fault (ioc_inode_t *ioc_inode,
-		call_frame_t *frame,
-		fd_t *fd,
-		off_t offset)
+ioc_page_fault(ioc_inode_t *ioc_inode, call_frame_t *frame, fd_t *fd,
+               off_t offset)
 {
-	ioc_table_t *table = ioc_inode->table;
-	call_frame_t *fault_frame = copy_frame (frame);
-	ioc_local_t *fault_local = CALLOC (1, sizeof (ioc_local_t));
-	ERR_ABORT (fault_local);
-
-	/* NOTE: copy_frame() means, the frame the fop whose fd_ref we 
-	 * are using till now won't be valid till we get reply from server. 
-	 * we unref this fd, in fault_cbk */
-	fault_local->fd = fd_ref (fd);
-
-	fault_frame->local = fault_local;
-	pthread_mutex_init (&fault_local->local_lock, NULL);
-
-	INIT_LIST_HEAD (&fault_local->fill_list);
-	fault_local->pending_offset = offset;
-	fault_local->pending_size = table->page_size;
-	fault_local->inode = ioc_inode;
-
-	gf_log (frame->this->name, GF_LOG_DEBUG,
-		"stack winding page fault for offset = %"PRId64" with "
-		"frame %p", offset, fault_frame);
-  
-	STACK_WIND (fault_frame, ioc_fault_cbk,
-		    FIRST_CHILD(fault_frame->this),
-		    FIRST_CHILD(fault_frame->this)->fops->readv,
-		    fd, table->page_size, offset);
-	return;
+    ioc_table_t *table = NULL;
+    call_frame_t *fault_frame = NULL;
+    ioc_local_t *fault_local = NULL;
+    ioc_local_t *local = NULL;
+    int32_t op_ret = -1, op_errno = -1;
+    ioc_waitq_t *waitq = NULL;
+    ioc_page_t *page = NULL;
+
+    GF_ASSERT(ioc_inode);
+    if (frame == NULL) {
+        op_ret = -1;
+        op_errno = EINVAL;
+        gf_smsg("io-cache", GF_LOG_WARNING, EINVAL, IO_CACHE_MSG_PAGE_FAULT,
+                NULL);
+        goto err;
+    }
+
+    table = ioc_inode->table;
+    fault_frame = copy_frame(frame);
+    if (fault_frame == NULL) {
+        op_ret = -1;
+        op_errno = ENOMEM;
+        goto err;
+    }
+
+    local = frame->local;
+    fault_local = mem_get0(THIS->local_pool);
+    if (fault_local == NULL) {
+        op_ret = -1;
+        op_errno = ENOMEM;
+        STACK_DESTROY(fault_frame->root);
+        goto err;
+    }
+
+    /* NOTE: fault_frame is a copy_frame() of the caller's frame, so the
+     * fd reference the original fop holds may be gone before the server
+     * replies. take our own reference here; it is dropped in ioc_fault_cbk */
+    fault_local->fd = fd_ref(fd);
+
+    fault_frame->local = fault_local;
+    pthread_mutex_init(&fault_local->local_lock, NULL);
+
+    INIT_LIST_HEAD(&fault_local->fill_list);
+    fault_local->pending_offset = offset;
+    fault_local->pending_size = table->page_size;
+    fault_local->inode = ioc_inode;
+
+    if (local && local->xattr_req)
+        fault_local->xattr_req = dict_ref(local->xattr_req);
+
+    gf_msg_trace(frame->this->name, 0,
+                 "stack winding page fault for offset = %" PRId64
+                 " with "
+                 "frame %p",
+                 offset, fault_frame);
+
+    STACK_WIND(fault_frame, ioc_fault_cbk, FIRST_CHILD(fault_frame->this),
+               FIRST_CHILD(fault_frame->this)->fops->readv, fd,
+               table->page_size, offset, 0, fault_local->xattr_req);
+    return;
+
+err:
+    ioc_inode_lock(ioc_inode);
+    {
+        page = __ioc_page_get(ioc_inode, offset);
+        if (page != NULL) {
+            waitq = __ioc_page_error(page, op_ret, op_errno);
+        }
+    }
+    ioc_inode_unlock(ioc_inode);
+
+    if (waitq != NULL) {
+        ioc_waitq_return(waitq);
+    }
 }
 
-void
-ioc_frame_fill (ioc_page_t *page,
-		call_frame_t *frame,
-		off_t offset,
-		size_t size)
+int32_t
+__ioc_frame_fill(ioc_page_t *page, call_frame_t *frame, off_t offset,
+                 size_t size, int32_t op_errno)
 {
-	ioc_local_t *local = frame->local;
-	ioc_fill_t *fill = NULL;
-	off_t src_offset = 0;
-	off_t dst_offset = 0;
-	ssize_t copy_size = 0;
-	ioc_inode_t *ioc_inode = page->inode;
-  
-	gf_log (frame->this->name, GF_LOG_DEBUG,
-		"frame (%p) offset = %"PRId64" && size = %"GF_PRI_SIZET" "
-		"&& page->size = %"GF_PRI_SIZET" && wait_count = %d", 
-		frame, offset, size, page->size, local->wait_count);
-
-	/* immediately move this page to the end of the page_lru list */
-	list_move_tail (&page->page_lru, &ioc_inode->page_lru);
-	/* fill local->pending_size bytes from local->pending_offset */
-	if (local->op_ret != -1 && page->size) {
-		if (offset > page->offset)
-			/* offset is offset in file, convert it to offset in 
-			 * page */
-			src_offset = offset - page->offset;
-		/*FIXME: since offset is the offset within page is the 
-		 * else case valid? */
-		else
-			/* local->pending_offset is in previous page. do not
-			 * fill until we have filled all previous pages */
-			dst_offset = page->offset - offset;
-
-		/* we have to copy from offset to either end of this page 
-		 * or till the requested size */
-		copy_size = min (page->size - src_offset,
-				 size - dst_offset);
-
-		if (copy_size < 0) {
-			/* if page contains fewer bytes and the required offset
-			   is beyond the page size in the page */
-			copy_size = src_offset = 0;
-		}
-    
-		gf_log (page->inode->table->xl->name, GF_LOG_DEBUG,
-			"copy_size = %"GF_PRI_SIZET" && src_offset = "
-			"%"PRId64" && dst_offset = %"PRId64"",
-			copy_size, src_offset, dst_offset);
-
-		{
-			ioc_fill_t *new = CALLOC (1, sizeof (*new));
-			ERR_ABORT (new);
-			new->offset = page->offset;
-			new->size = copy_size;
-			new->refs = dict_ref (page->ref);
-			new->count = iov_subset (page->vector,
-						 page->count,
-						 src_offset,
-						 src_offset + copy_size,
-						 NULL);
-			new->vector = CALLOC (new->count, 
-					      sizeof (struct iovec));
-			ERR_ABORT (new->vector);
-			new->count = iov_subset (page->vector,
-						 page->count,
-						 src_offset,
-						 src_offset + copy_size,
-						 new->vector);
-
-
-
-			/* add the ioc_fill to fill_list for this frame */
-			if (list_empty (&local->fill_list)) {
-				/* if list is empty, then this is the first 
-				 * time we are filling frame, add the 
-				 * ioc_fill_t to the end of list */
-				list_add_tail (&new->list, &local->fill_list);
-			} else {
-				int8_t found = 0;
-				/* list is not empty, we need to look for 
-				 * where this offset fits in list */
-				list_for_each_entry (fill, &local->fill_list, 
-						     list) {
-					if (fill->offset > new->offset) {
-						found = 1;
-						break;
-					}
-				}
-
-				if (found) {
-					found = 0;
-					list_add_tail (&new->list, 
-						       &fill->list);
-				} else {
-					list_add_tail (&new->list, 
-						       &local->fill_list);
-				}
-			}
-		}
-		local->op_ret += copy_size;
-	}
+    ioc_local_t *local = NULL;
+    ioc_fill_t *fill = NULL;
+    off_t src_offset = 0;
+    off_t dst_offset = 0;
+    ssize_t copy_size = 0;
+    ioc_inode_t *ioc_inode = NULL;
+    ioc_fill_t *new = NULL;
+    int8_t found = 0;
+    int32_t ret = -1;
+
+    GF_VALIDATE_OR_GOTO("io-cache", frame, out);
+
+    local = frame->local;
+    GF_VALIDATE_OR_GOTO(frame->this->name, local, out);
+
+    if (page == NULL) {
+        gf_smsg(frame->this->name, GF_LOG_WARNING, 0,
+                IO_CACHE_MSG_SERVE_READ_REQUEST, NULL);
+        local->op_ret = -1;
+        local->op_errno = EINVAL;
+        goto out;
+    }
+
+    ioc_inode = page->inode;
+
+    gf_msg_trace(frame->this->name, 0,
+                 "frame (%p) offset = %" PRId64 " && size = %" GF_PRI_SIZET
+                 " "
+                 "&& page->size = %" GF_PRI_SIZET " && wait_count = %d",
+                 frame, offset, size, page->size, local->wait_count);
+
+    /* immediately move this page to the end of the page_lru list */
+    list_move_tail(&page->page_lru, &ioc_inode->cache.page_lru);
+    /* fill local->pending_size bytes from local->pending_offset */
+    if (local->op_ret != -1) {
+        local->op_errno = op_errno;
+
+        if (page->size == 0) {
+            goto done;
+        }
+
+        if (offset > page->offset)
+            /* offset is offset in file, convert it to offset in
+             * page */
+            src_offset = offset - page->offset;
+        /* FIXME: since offset is the offset within the page, is the
+         * else case valid? */
+        else
+            /* local->pending_offset is in previous page. do not
+             * fill until we have filled all previous pages */
+            dst_offset = page->offset - offset;
+
+        /* we have to copy from offset to either end of this page
+         * or till the requested size */
+        copy_size = min(page->size - src_offset, size - dst_offset);
+
+        if (copy_size < 0) {
+            /* if page contains fewer bytes and the required offset
+               is beyond the page size in the page */
+            copy_size = src_offset = 0;
+        }
+
+        gf_msg_trace(page->inode->table->xl->name, 0,
+                     "copy_size = %" GF_PRI_SIZET
+                     " && src_offset = "
+                     "%" PRId64 " && dst_offset = %" PRId64 "",
+                     copy_size, src_offset, dst_offset);
+
+        {
+            new = GF_CALLOC(1, sizeof(*new), gf_ioc_mt_ioc_fill_t);
+            if (new == NULL) {
+                local->op_ret = -1;
+                local->op_errno = ENOMEM;
+                goto out;
+            }
+
+            new->offset = page->offset;
+            new->size = copy_size;
+            new->iobref = iobref_ref(page->iobref);
+            new->count = iov_subset(page->vector, page->count, src_offset,
+                                    copy_size, &new->vector, 0);
+            if (new->count < 0) {
+                local->op_ret = -1;
+                local->op_errno = ENOMEM;
+
+                iobref_unref(new->iobref);
+                GF_FREE(new);
+                goto out;
+            }
+
+            /* add the ioc_fill to fill_list for this frame */
+            if (list_empty(&local->fill_list)) {
+                /* if list is empty, then this is the first
+                 * time we are filling frame, add the
+                 * ioc_fill_t to the end of list */
+                list_add_tail(&new->list, &local->fill_list);
+            } else {
+                found = 0;
+                /* list is not empty, we need to look for
+                 * where this offset fits in list */
+                list_for_each_entry(fill, &local->fill_list, list)
+                {
+                    if (fill->offset > new->offset) {
+                        found = 1;
+                        break;
+                    }
+                }
+
+                if (found) {
+                    list_add_tail(&new->list, &fill->list);
+                } else {
+                    list_add_tail(&new->list, &local->fill_list);
+                }
+            }
+        }
+
+        local->op_ret += copy_size;
+    }
+
+done:
+    ret = 0;
+out:
+    return ret;
 }
 
 /*
- * ioc_frame_unwind - frame unwinds only from here 
+ * ioc_frame_unwind - frame unwinds only from here
  *
  * @frame: call frame to unwind
  *
@@ -602,73 +774,109 @@ ioc_frame_fill (ioc_page_t *page,
  *
  */
 static void
-ioc_frame_unwind (call_frame_t *frame)
+ioc_frame_unwind(call_frame_t *frame)
 {
-	ioc_local_t *local = frame->local;
-	ioc_fill_t *fill = NULL, *next = NULL;
-	int32_t count = 0;
-	struct iovec *vector = NULL;
-	int32_t copied = 0;
-	dict_t *refs = NULL;
-	struct stat stbuf = {0,};
-	int32_t op_ret = 0;
-
-	//  ioc_local_lock (local);
-	refs = get_new_dict ();
-
-	frame->local = NULL;
-
-	if (list_empty (&local->fill_list)) {
-		gf_log (frame->this->name, GF_LOG_DEBUG,
-			"frame(%p) has 0 entries in local->fill_list "
-			"(offset = %"PRId64" && size = %"GF_PRI_SIZET")",
-			frame, local->offset, local->size);
-	}
-
-	list_for_each_entry (fill, &local->fill_list, list) {
-		count += fill->count;
-	}
-
-	vector = CALLOC (count, sizeof (*vector));
-	ERR_ABORT (vector);
-  
-	list_for_each_entry_safe (fill, next, &local->fill_list, list) {
-		memcpy (((char *)vector) + copied,
-			fill->vector,
-			fill->count * sizeof (*vector));
-    
-		copied += (fill->count * sizeof (*vector));
-
-		dict_copy (fill->refs, refs);
-
-		list_del (&fill->list);
-		dict_unref (fill->refs);
-		free (fill->vector);
-		free (fill);
-	}
-  
-	frame->root->rsp_refs = dict_ref (refs);
-  
-	op_ret = iov_length (vector, count);
-	gf_log (frame->this->name, GF_LOG_DEBUG,
-		"frame(%p) unwinding with op_ret=%d", frame, op_ret);
-
-	//  ioc_local_unlock (local);
-
-	STACK_UNWIND (frame,
-		      op_ret,
-		      local->op_errno,
-		      vector,
-		      count,
-		      &stbuf);
-
-	dict_unref (refs);
-    
-	pthread_mutex_destroy (&local->local_lock);
-	free (local);
-	free (vector);
-
-	return;
+    ioc_local_t *local = NULL;
+    ioc_fill_t *fill = NULL, *next = NULL;
+    int32_t count = 0;
+    struct iovec *vector = NULL;
+    int32_t copied = 0;
+    struct iobref *iobref = NULL;
+    struct iatt stbuf = {
+        0,
+    };
+    int32_t op_ret = 0, op_errno = 0;
+
+    GF_ASSERT(frame);
+
+    local = frame->local;
+    if (local == NULL) {
+        gf_smsg(frame->this->name, GF_LOG_WARNING, ENOMEM,
+                IO_CACHE_MSG_LOCAL_NULL, NULL);
+        op_ret = -1;
+        op_errno = ENOMEM;
+        goto unwind;
+    }
+
+    if (local->op_ret < 0) {
+        op_ret = local->op_ret;
+        op_errno = local->op_errno;
+        goto unwind;
+    }
+
+    //  ioc_local_lock (local);
+    iobref = iobref_new();
+    if (iobref == NULL) {
+        op_ret = -1;
+        op_errno = ENOMEM;
+    }
+
+    if (list_empty(&local->fill_list)) {
+        gf_msg_trace(frame->this->name, 0,
+                     "frame(%p) has 0 entries in local->fill_list "
+                     "(offset = %" PRId64 " && size = %" GF_PRI_SIZET ")",
+                     frame, local->offset, local->size);
+    }
+
+    list_for_each_entry(fill, &local->fill_list, list) { count += fill->count; }
+
+    vector = GF_CALLOC(count, sizeof(*vector), gf_ioc_mt_iovec);
+    if (vector == NULL) {
+        op_ret = -1;
+        op_errno = ENOMEM;
+    }
+
+    list_for_each_entry_safe(fill, next, &local->fill_list, list)
+    {
+        /* copy only if both allocations succeeded; always free the fill */
+        if ((vector != NULL) && (iobref != NULL)) {
+            memcpy(((char *)vector) + copied, fill->vector,
+                   fill->count * sizeof(*vector));
+
+            copied += (fill->count * sizeof(*vector));
+
+            if (iobref_merge(iobref, fill->iobref)) {
+                op_ret = -1;
+                op_errno = ENOMEM;
+            }
+        }
+
+        list_del(&fill->list);
+        iobref_unref(fill->iobref);
+        GF_FREE(fill->vector);
+        GF_FREE(fill);
+    }
+
+    if (op_ret != -1) {
+        op_ret = iov_length(vector, count);
+    }
+
+unwind:
+    gf_msg_trace(frame->this->name, 0, "frame(%p) unwinding with op_ret=%d",
+                 frame, op_ret);
+
+    //  ioc_local_unlock (local);
+
+    frame->local = NULL;
+    STACK_UNWIND_STRICT(readv, frame, op_ret, op_errno, vector, count, &stbuf,
+                        iobref, NULL);
+
+    if (iobref != NULL) {
+        iobref_unref(iobref);
+    }
+
+    if (vector != NULL) {
+        GF_FREE(vector);
+        vector = NULL;
+    }
+
+    if (local) {
+        if (local->xattr_req)
+            dict_unref(local->xattr_req);
+        pthread_mutex_destroy(&local->local_lock);
+        mem_put(local);
+    }
+    return;
 }
 
 /*
@@ -678,55 +886,117 @@ ioc_frame_unwind (call_frame_t *frame)
  * to be called only when a frame is waiting on an in-transit page
  */
 void
-ioc_frame_return (call_frame_t *frame)
+ioc_frame_return(call_frame_t *frame)
 {
-	ioc_local_t *local = frame->local;
-	int32_t wait_count;
-	assert (local->wait_count > 0);
+    ioc_local_t *local = NULL;
+    int32_t wait_count = 0;
+
+    GF_ASSERT(frame);
 
-	ioc_local_lock (local);
-	{
-		wait_count = --local->wait_count;
-	}
-	ioc_local_unlock (local);
+    local = frame->local;
+    GF_ASSERT(local->wait_count > 0);
 
-	if (!wait_count) {
-		ioc_frame_unwind (frame);
-	} 
+    ioc_local_lock(local);
+    {
+        wait_count = --local->wait_count;
+    }
+    ioc_local_unlock(local);
 
-	return;
+    if (!wait_count) {
+        ioc_frame_unwind(frame);
+    }
+
+    return;
 }
 
-/* 
+/*
  * ioc_page_wakeup -
  * @page:
  *
  * to be called only when a frame is waiting on an in-transit page
  */
 ioc_waitq_t *
-ioc_page_wakeup (ioc_page_t *page)
+__ioc_page_wakeup(ioc_page_t *page, int32_t op_errno)
 {
-	ioc_waitq_t *waitq = NULL, *trav = NULL;
-	call_frame_t *frame = NULL;
-
-	waitq = page->waitq;
-	page->waitq = NULL;
-
-	trav = waitq;
-	page->ready = 1;
-
-	gf_log (page->inode->table->xl->name, GF_LOG_DEBUG,
-		"page is %p && waitq = %p", page, waitq);
-  
-	for (trav = waitq; trav; trav = trav->next) {
-		frame = trav->data; 
-		ioc_frame_fill (page, frame, trav->pending_offset, 
-				trav->pending_size);
-	}
-	
-	return waitq;
+    ioc_waitq_t *waitq = NULL, *trav = NULL;
+    call_frame_t *frame = NULL;
+    int32_t ret = -1;
+
+    GF_VALIDATE_OR_GOTO("io-cache", page, out);
+
+    waitq = page->waitq;
+    page->waitq = NULL;
+
+    page->ready = 1;
+
+    gf_msg_trace(page->inode->table->xl->name, 0, "page is %p && waitq = %p",
+                 page, waitq);
+
+    for (trav = waitq; trav; trav = trav->next) {
+        frame = trav->data;
+        ret = __ioc_frame_fill(page, frame, trav->pending_offset,
+                               trav->pending_size, op_errno);
+        if (ret == -1) {
+            break;
+        }
+    }
+
+    if (page->stale) {
+        __ioc_page_destroy(page);
+    }
+
+out:
+    return waitq;
 }
 
+/*
+ * __ioc_page_error - detach @page's waitq, record the error, destroy @page
+ * @page: failed page; ioc_page_error() calls this with the inode lock held
+ * @op_ret: stored in each waiting frame's local unless it already holds -1
+ * @op_errno: stored together with @op_ret
+ * Returns the detached waitq; NULL when @page does not pass validation.
+ */
+ioc_waitq_t *
+__ioc_page_error(ioc_page_t *page, int32_t op_ret, int32_t op_errno)
+{
+    ioc_waitq_t *waitq = NULL, *trav = NULL;
+    call_frame_t *frame = NULL;
+    int64_t ret = 0;
+    ioc_table_t *table = NULL;
+    ioc_local_t *local = NULL;
+
+    GF_VALIDATE_OR_GOTO("io-cache", page, out);
+
+    waitq = page->waitq;
+    page->waitq = NULL; /* the detached list is handed back to the caller */
+
+    gf_msg_debug(page->inode->table->xl->name, 0,
+                 "page error for page = %p & waitq = %p", page, waitq);
+
+    for (trav = waitq; trav; trav = trav->next) {
+        frame = trav->data;
+
+        local = frame->local;
+        ioc_local_lock(local);
+        {
+            if (local->op_ret != -1) {
+                local->op_ret = op_ret;
+                local->op_errno = op_errno;
+            }
+        }
+        ioc_local_unlock(local);
+    }
+
+    table = page->inode->table; /* read before the page is destroyed */
+    ret = __ioc_page_destroy(page);
+
+    if (ret != -1) {
+        table->cache_used -= ret; /* presumably bytes released - confirm */
+    }
+
+out:
+    return waitq;
+}
 
 /*
  * ioc_page_error -
@@ -736,43 +1006,22 @@ ioc_page_wakeup (ioc_page_t *page)
  *
  */
 ioc_waitq_t *
-ioc_page_error (ioc_page_t *page,
-		int32_t op_ret,
-		int32_t op_errno)
+ioc_page_error(ioc_page_t *page, int32_t op_ret, int32_t op_errno)
 {
-	ioc_waitq_t *waitq = NULL, *trav = NULL;
-	call_frame_t *frame = NULL;
-	int64_t ret = 0;
-	ioc_table_t *table = NULL;
-	ioc_local_t *local = NULL;
-
-	waitq = page->waitq;
-	page->waitq = NULL;
-  
-	gf_log (page->inode->table->xl->name, GF_LOG_DEBUG,
-		"page error for page = %p & waitq = %p", page, waitq);
-
-	for (trav = waitq; trav; trav = trav->next) {
-
-		frame = trav->data;
-
-		local = frame->local;
-		ioc_local_lock (local);
-		{
-			if (local->op_ret != -1) {
-				local->op_ret = op_ret;
-				local->op_errno = op_errno;
-			}
-		}
-		ioc_local_unlock (local);
-	}
-
-	table = page->inode->table;
-	ret = ioc_page_destroy (page);
-
-	if (ret != -1) {
-		table->cache_used -= ret;
-	}
-
-	return waitq;
+    ioc_waitq_t *waitq = NULL;
+    struct ioc_inode *inode = NULL;
+
+    if (page == NULL) {
+        goto out;
+    }
+
+    ioc_inode_lock(page->inode);
+    {
+        inode = page->inode;
+        waitq = __ioc_page_error(page, op_ret, op_errno);
+    }
+    ioc_inode_unlock(inode);
+
+out:
+    return waitq;
 }
diff --git a/xlators/performance/io-threads/src/Makefile.am b/xlators/performance/io-threads/src/Makefile.am
index 38dea3eb7fc..7570cf41ed2 100644
--- a/xlators/performance/io-threads/src/Makefile.am
+++ b/xlators/performance/io-threads/src/Makefile.am
@@ -1,14 +1,16 @@
 xlator_LTLIBRARIES = io-threads.la
 xlatordir = $(libdir)/glusterfs/$(PACKAGE_VERSION)/xlator/performance
 
-io_threads_la_LDFLAGS = -module -avoidversion 
+io_threads_la_LDFLAGS = -module $(GF_XLATOR_DEFAULT_LDFLAGS)
 
 io_threads_la_SOURCES = io-threads.c
 io_threads_la_LIBADD = $(top_builddir)/libglusterfs/src/libglusterfs.la
 
-noinst_HEADERS = io-threads.h
+noinst_HEADERS = io-threads.h iot-mem-types.h io-threads-messages.h
 
-AM_CFLAGS = -fPIC -D_FILE_OFFSET_BITS=64 -D_GNU_SOURCE -Wall -D$(GF_HOST_OS)\
-	-I$(top_srcdir)/libglusterfs/src -shared -nostartfiles $(GF_CFLAGS)
+AM_CPPFLAGS = $(GF_CPPFLAGS) -I$(top_srcdir)/libglusterfs/src \
+	-I$(top_srcdir)/rpc/xdr/src -I$(top_builddir)/rpc/xdr/src
+
+AM_CFLAGS = -Wall $(GF_CFLAGS)
 
 CLEANFILES = 
diff --git a/xlators/performance/io-threads/src/io-threads-messages.h b/xlators/performance/io-threads/src/io-threads-messages.h
new file mode 100644
index 00000000000..6229c353f96
--- /dev/null
+++ b/xlators/performance/io-threads/src/io-threads-messages.h
@@ -0,0 +1,41 @@
+/*Copyright (c) 2015 Red Hat, Inc. <http://www.redhat.com>
+  This file is part of GlusterFS.
+
+  This file is licensed to you under your choice of the GNU Lesser
+  General Public License, version 3 or any later version (LGPLv3 or
+  later), or the GNU General Public License, version 2 (GPLv2), in all
+  cases as published by the Free Software Foundation.
+*/
+
+#ifndef _IO_THREADS_MESSAGES_H_
+#define _IO_THREADS_MESSAGES_H_
+
+#include <glusterfs/glfs-message-id.h>
+
+/* To add new message IDs, append new identifiers at the end of the list.
+ *
+ * Never remove a message ID. If it's not used anymore, you can rename it or
+ * leave it as it is, but not delete it. This is to prevent reutilization of
+ * IDs by other messages.
+ *
+ * The component name must match one of the entries defined in
+ * glfs-message-id.h.
+ */
+
+GLFS_MSGID(IO_THREADS, IO_THREADS_MSG_INIT_FAILED,
+           IO_THREADS_MSG_XLATOR_CHILD_MISCONFIGURED, IO_THREADS_MSG_NO_MEMORY,
+           IO_THREADS_MSG_VOL_MISCONFIGURED, IO_THREADS_MSG_SIZE_NOT_SET,
+           IO_THREADS_MSG_OUT_OF_MEMORY, IO_THREADS_MSG_PTHREAD_INIT_FAILED,
+           IO_THREADS_MSG_WORKER_THREAD_INIT_FAILED);
+
+#define IO_THREADS_MSG_INIT_FAILED_STR "Thread attribute initialization failed"
+#define IO_THREADS_MSG_SIZE_NOT_SET_STR "Using default thread stack size"
+#define IO_THREADS_MSG_NO_MEMORY_STR "Memory accounting init failed"
+#define IO_THREADS_MSG_XLATOR_CHILD_MISCONFIGURED_STR                          \
+    "FATAL: iot not configured with exactly one child"
+#define IO_THREADS_MSG_VOL_MISCONFIGURED_STR "dangling volume. check volfile"
+#define IO_THREADS_MSG_OUT_OF_MEMORY_STR "out of memory"
+#define IO_THREADS_MSG_PTHREAD_INIT_FAILED_STR "init failed"
+#define IO_THREADS_MSG_WORKER_THREAD_INIT_FAILED_STR                           \
+    "cannot initialize worker threads, exiting init"
+#endif /* _IO_THREADS_MESSAGES_H_ */
diff --git a/xlators/performance/io-threads/src/io-threads.c b/xlators/performance/io-threads/src/io-threads.c
index 3d172f9b4fa..3d24cc97f4b 100644
--- a/xlators/performance/io-threads/src/io-threads.c
+++ b/xlators/performance/io-threads/src/io-threads.c
@@ -1,1199 +1,1590 @@
 /*
-  Copyright (c) 2006-2009 Z RESEARCH, Inc. <http://www.zresearch.com>
+  Copyright (c) 2008-2012 Red Hat, Inc. <http://www.redhat.com>
   This file is part of GlusterFS.
 
-  GlusterFS is free software; you can redistribute it and/or modify
-  it under the terms of the GNU General Public License as published
-  by the Free Software Foundation; either version 3 of the License,
-  or (at your option) any later version.
+  This file is licensed to you under your choice of the GNU Lesser
+  General Public License, version 3 or any later version (LGPLv3 or
+  later), or the GNU General Public License, version 2 (GPLv2), in all
+  cases as published by the Free Software Foundation.
+*/
 
-  GlusterFS is distributed in the hope that it will be useful, but
-  WITHOUT ANY WARRANTY; without even the implied warranty of
-  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
-  General Public License for more details.
+#include <glusterfs/call-stub.h>
+#include <glusterfs/defaults.h>
+#include <glusterfs/glusterfs.h>
+#include <glusterfs/logging.h>
+#include <glusterfs/dict.h>
+#include <glusterfs/xlator.h>
+#include "io-threads.h"
+#include <signal.h>
+#include <stdlib.h>
+#include <sys/time.h>
+#include <time.h>
+#include <glusterfs/locking.h>
+#include "io-threads-messages.h"
+#include <glusterfs/timespec.h>
+
+void *
+iot_worker(void *arg);
+int
+iot_workers_scale(iot_conf_t *conf);
+int
+__iot_workers_scale(iot_conf_t *conf);
+struct volume_options options[];
+
+#define IOT_FOP(name, frame, this, args...)                                    \
+    do {                                                                       \
+        call_stub_t *__stub = NULL;                                            \
+        int __ret = -1;                                                        \
+                                                                               \
+        __stub = fop_##name##_stub(frame, default_##name##_resume, args);      \
+        if (!__stub) {                                                         \
+            __ret = -ENOMEM;                                                   \
+            goto out;                                                          \
+        }                                                                      \
+                                                                               \
+        __ret = iot_schedule(frame, this, __stub);                             \
+                                                                               \
+    out:                                                                       \
+        if (__ret < 0) {                                                       \
+            default_##name##_failure_cbk(frame, -__ret);                       \
+            if (__stub != NULL) {                                              \
+                call_stub_destroy(__stub);                                     \
+            }                                                                  \
+        }                                                                      \
+    } while (0)
+
+iot_client_ctx_t *
+iot_get_ctx(xlator_t *this, client_t *client)
+{
+    iot_client_ctx_t *ctx = NULL;
+    iot_client_ctx_t *setted_ctx = NULL;
+    int i;
+
+    if (client_ctx_get(client, this, (void **)&ctx) != 0) {
+        ctx = GF_MALLOC(GF_FOP_PRI_MAX * sizeof(*ctx), gf_iot_mt_client_ctx_t);
+        if (ctx) {
+            for (i = 0; i < GF_FOP_PRI_MAX; ++i) {
+                INIT_LIST_HEAD(&ctx[i].clients);
+                INIT_LIST_HEAD(&ctx[i].reqs);
+            }
+            setted_ctx = client_ctx_set(client, this, ctx);
+            if (ctx != setted_ctx) {
+                GF_FREE(ctx);
+                ctx = setted_ctx;
+            }
+        }
+    }
 
-  You should have received a copy of the GNU General Public License
-  along with this program.  If not, see
-  <http://www.gnu.org/licenses/>.
-*/
+    return ctx;
+}
 
-#ifndef _CONFIG_H
-#define _CONFIG_H
-#include "config.h"
-#endif
+call_stub_t *
+__iot_dequeue(iot_conf_t *conf, int *pri)
+{
+    call_stub_t *stub = NULL;
+    int i = 0;
+    iot_client_ctx_t *ctx;
+
+    *pri = -1;
+    for (i = 0; i < GF_FOP_PRI_MAX; i++) {
+        if (conf->ac_iot_count[i] >= conf->ac_iot_limit[i]) {
+            continue;
+        }
 
-#include "call-stub.h"
-#include "glusterfs.h"
-#include "logging.h"
-#include "dict.h"
-#include "xlator.h"
-#include "io-threads.h"
+        if (list_empty(&conf->clients[i])) {
+            continue;
+        }
 
-static void
-iot_queue (iot_worker_t *worker,
-           call_stub_t *stub);
+        /* Get the first per-client queue for this priority. */
+        ctx = list_first_entry(&conf->clients[i], iot_client_ctx_t, clients);
+        if (!ctx) {
+            continue;
+        }
+
+        if (list_empty(&ctx->reqs)) {
+            continue;
+        }
+
+        /* Get the first request on that queue. */
+        stub = list_first_entry(&ctx->reqs, call_stub_t, list);
+        list_del_init(&stub->list);
+        if (list_empty(&ctx->reqs)) {
+            list_del_init(&ctx->clients);
+        } else {
+            list_rotate_left(&conf->clients[i]);
+        }
+
+        conf->ac_iot_count[i]++;
+        conf->queue_marked[i] = _gf_false;
+        *pri = i;
+        break;
+    }
 
-static call_stub_t *
-iot_dequeue (iot_worker_t *worker);
+    if (!stub)
+        return NULL;
+
+    conf->queue_size--;
+    conf->queue_sizes[*pri]--;
+
+    return stub;
+}
 
-static iot_worker_t * 
-iot_schedule (iot_conf_t *conf,
-              iot_file_t *file,
-              ino_t ino)
+void
+__iot_enqueue(iot_conf_t *conf, call_stub_t *stub, int pri)
 {
-	int32_t cnt = (ino % conf->thread_count);
-	iot_worker_t *trav = conf->workers.next;
+    client_t *client = stub->frame->root->client;
+    iot_client_ctx_t *ctx;
+
+    if (pri < 0 || pri >= GF_FOP_PRI_MAX)
+        pri = GF_FOP_PRI_MAX - 1;
 
-	for (; cnt; cnt--)
-		trav = trav->next;
-  
-	if (file)
-		file->worker = trav;
-	trav->fd_count++;
-	return trav;
+    if (client) {
+        ctx = iot_get_ctx(THIS, client);
+        if (ctx) {
+            ctx = &ctx[pri];
+        }
+    } else {
+        ctx = NULL;
+    }
+    if (!ctx) {
+        ctx = &conf->no_client[pri];
+    }
+
+    if (list_empty(&ctx->reqs)) {
+        list_add_tail(&ctx->clients, &conf->clients[pri]);
+    }
+    list_add_tail(&stub->list, &ctx->reqs);
+
+    conf->queue_size++;
+    GF_ATOMIC_INC(conf->stub_cnt);
+    conf->queue_sizes[pri]++;
 }
 
-int32_t
-iot_open_cbk (call_frame_t *frame,
-              void *cookie,
-              xlator_t *this,
-              int32_t op_ret,
-              int32_t op_errno,
-              fd_t *fd)
+void *
+iot_worker(void *data)
 {
-	iot_conf_t *conf = this->private;
+    iot_conf_t *conf = NULL;
+    xlator_t *this = NULL;
+    call_stub_t *stub = NULL;
+    struct timespec sleep_till = {
+        0,
+    };
+    int ret = 0;
+    int pri = -1;
+    gf_boolean_t bye = _gf_false;
+
+    conf = data;
+    this = conf->this;
+    THIS = this;
+
+    for (;;) {
+        pthread_mutex_lock(&conf->mutex);
+        {
+            if (pri != -1) {
+                conf->ac_iot_count[pri]--;
+                pri = -1;
+            }
+            while (conf->queue_size == 0) {
+                if (conf->down) {
+                    bye = _gf_true; /*Avoid sleep*/
+                    break;
+                }
+
+                clock_gettime(CLOCK_REALTIME_COARSE, &sleep_till);
+                sleep_till.tv_sec += conf->idle_time;
+
+                conf->sleep_count++;
+                ret = pthread_cond_timedwait(&conf->cond, &conf->mutex,
+                                             &sleep_till);
+                conf->sleep_count--;
+
+                if (conf->down || ret == ETIMEDOUT) {
+                    bye = _gf_true;
+                    break;
+                }
+            }
+
+            if (bye) {
+                if (conf->down || conf->curr_count > IOT_MIN_THREADS) {
+                    conf->curr_count--;
+                    if (conf->curr_count == 0)
+                        pthread_cond_broadcast(&conf->cond);
+                    gf_msg_debug(conf->this->name, 0,
+                                 "terminated. "
+                                 "conf->curr_count=%d",
+                                 conf->curr_count);
+                } else {
+                    bye = _gf_false;
+                }
+            }
+
+            if (!bye)
+                stub = __iot_dequeue(conf, &pri);
+        }
+        pthread_mutex_unlock(&conf->mutex);
+
+        if (stub) { /* guard against spurious wakeups */
+            if (stub->poison) {
+                gf_log(this->name, GF_LOG_INFO, "Dropping poisoned request %p.",
+                       stub);
+                call_stub_destroy(stub);
+            } else {
+                call_resume(stub);
+            }
+            GF_ATOMIC_DEC(conf->stub_cnt);
+        }
+        stub = NULL;
+
+        if (bye)
+            break;
+    }
 
-	if (op_ret >= 0) {
-		iot_file_t *file = CALLOC (1, sizeof (*file));
-		ERR_ABORT (file);
+    return NULL;
+}
 
-		iot_schedule (conf, file, fd->inode->ino);
-		file->fd = fd;
+int
+do_iot_schedule(iot_conf_t *conf, call_stub_t *stub, int pri)
+{
+    int ret = 0;
 
-		fd_ctx_set (fd, this, (uint64_t)(long)file);
+    pthread_mutex_lock(&conf->mutex);
+    {
+        __iot_enqueue(conf, stub, pri);
 
-		pthread_mutex_lock (&conf->files_lock);
-		file->next = &conf->files;
-		file->prev = file->next->prev;
-		file->next->prev = file;
-		file->prev->next = file;
-		pthread_mutex_unlock (&conf->files_lock);
-	}
-	STACK_UNWIND (frame, op_ret, op_errno, fd);
-	return 0;
+        pthread_cond_signal(&conf->cond);
+
+        ret = __iot_workers_scale(conf);
+    }
+    pthread_mutex_unlock(&conf->mutex);
+
+    return ret;
 }
 
-int32_t
-iot_open (call_frame_t *frame,
-          xlator_t *this,
-          loc_t *loc,
-          int32_t flags,
-	  fd_t *fd)
+char *
+iot_get_pri_meaning(gf_fop_pri_t pri)
 {
-	STACK_WIND (frame,
-		    iot_open_cbk,
-		    FIRST_CHILD(this),
-		    FIRST_CHILD(this)->fops->open,
-		    loc,
-		    flags,
-		    fd);
-	return 0;
+    char *name = NULL;
+    switch (pri) {
+        case GF_FOP_PRI_HI:
+            name = "fast";
+            break;
+        case GF_FOP_PRI_NORMAL:
+            name = "normal";
+            break;
+        case GF_FOP_PRI_LO:
+            name = "slow";
+            break;
+        case GF_FOP_PRI_LEAST:
+            name = "least";
+            break;
+        case GF_FOP_PRI_MAX:
+            name = "invalid";
+            break;
+        case GF_FOP_PRI_UNSPEC:
+            name = "unspecified";
+            break;
+    }
+    return name;
 }
 
+int
+iot_schedule(call_frame_t *frame, xlator_t *this, call_stub_t *stub)
+{
+    int ret = -1;
+    gf_fop_pri_t pri = GF_FOP_PRI_MAX - 1;
+    iot_conf_t *conf = this->private;
+    /* conf (this->private) is NULL-guarded at every use below. */
+    if ((frame->root->pid < GF_CLIENT_PID_MAX) &&
+        (frame->root->pid != GF_CLIENT_PID_NO_ROOT_SQUASH) &&
+        conf && conf->least_priority) {
+        pri = GF_FOP_PRI_LEAST;
+        goto out;
+    }
+    /* No least-priority override: derive the priority from the fop. */
+    switch (stub->fop) {
+        case GF_FOP_OPEN:
+        case GF_FOP_STAT:
+        case GF_FOP_FSTAT:
+        case GF_FOP_LOOKUP:
+        case GF_FOP_ACCESS:
+        case GF_FOP_READLINK:
+        case GF_FOP_OPENDIR:
+        case GF_FOP_STATFS:
+        case GF_FOP_READDIR:
+        case GF_FOP_READDIRP:
+        case GF_FOP_GETACTIVELK:
+        case GF_FOP_SETACTIVELK:
+        case GF_FOP_ICREATE:
+        case GF_FOP_NAMELINK:
+            pri = GF_FOP_PRI_HI;
+            break;
+
+        case GF_FOP_CREATE:
+        case GF_FOP_FLUSH:
+        case GF_FOP_LK:
+        case GF_FOP_INODELK:
+        case GF_FOP_FINODELK:
+        case GF_FOP_ENTRYLK:
+        case GF_FOP_FENTRYLK:
+        case GF_FOP_LEASE:
+        case GF_FOP_UNLINK:
+        case GF_FOP_SETATTR:
+        case GF_FOP_FSETATTR:
+        case GF_FOP_MKNOD:
+        case GF_FOP_MKDIR:
+        case GF_FOP_RMDIR:
+        case GF_FOP_SYMLINK:
+        case GF_FOP_RENAME:
+        case GF_FOP_LINK:
+        case GF_FOP_SETXATTR:
+        case GF_FOP_GETXATTR:
+        case GF_FOP_FGETXATTR:
+        case GF_FOP_FSETXATTR:
+        case GF_FOP_REMOVEXATTR:
+        case GF_FOP_FREMOVEXATTR:
+        case GF_FOP_PUT:
+            pri = GF_FOP_PRI_NORMAL;
+            break;
+
+        case GF_FOP_READ:
+        case GF_FOP_WRITE:
+        case GF_FOP_FSYNC:
+        case GF_FOP_TRUNCATE:
+        case GF_FOP_FTRUNCATE:
+        case GF_FOP_FSYNCDIR:
+        case GF_FOP_XATTROP:
+        case GF_FOP_FXATTROP:
+        case GF_FOP_RCHECKSUM:
+        case GF_FOP_FALLOCATE:
+        case GF_FOP_DISCARD:
+        case GF_FOP_ZEROFILL:
+        case GF_FOP_SEEK:
+            pri = GF_FOP_PRI_LO;
+            break;
+
+        case GF_FOP_FORGET:
+        case GF_FOP_RELEASE:
+        case GF_FOP_RELEASEDIR:
+        case GF_FOP_GETSPEC:
+            break; /* keeps the initial pri, GF_FOP_PRI_MAX - 1 */
+        case GF_FOP_IPC:
+        default:
+            return -EINVAL; /* unsupported fop: nothing is queued */
+    }
+out:
+    gf_msg_debug(this->name, 0, "%s scheduled as %s priority fop",
+                 gf_fop_list[stub->fop], iot_get_pri_meaning(pri));
+    if (conf) /* ret stays -1 when the xlator has no private conf */
+        ret = do_iot_schedule(conf, stub, pri);
+    return ret;
+}
 
-int32_t
-iot_create_cbk (call_frame_t *frame,
-		void *cookie,
-		xlator_t *this,
-		int32_t op_ret,
-		int32_t op_errno,
-		fd_t *fd,
-		inode_t *inode,
-		struct stat *stbuf)
+int
+iot_lookup(call_frame_t *frame, xlator_t *this, loc_t *loc, dict_t *xdata)
 {
-	iot_conf_t *conf = this->private;
+    IOT_FOP(lookup, frame, this, loc, xdata);
+    return 0;
+}
 
-	if (op_ret >= 0) {
-		iot_file_t *file = CALLOC (1, sizeof (*file));
-		ERR_ABORT (file);
+int
+iot_setattr(call_frame_t *frame, xlator_t *this, loc_t *loc, struct iatt *stbuf,
+            int32_t valid, dict_t *xdata)
+{
+    IOT_FOP(setattr, frame, this, loc, stbuf, valid, xdata);
+    return 0;
+}
 
-		iot_schedule (conf, file, fd->inode->ino);
-		file->fd = fd;
+int
+iot_fsetattr(call_frame_t *frame, xlator_t *this, fd_t *fd, struct iatt *stbuf,
+             int32_t valid, dict_t *xdata)
+{
+    IOT_FOP(fsetattr, frame, this, fd, stbuf, valid, xdata);
+    return 0;
+}
 
-		fd_ctx_set (fd, this, (uint64_t)(long)file);
+int
+iot_access(call_frame_t *frame, xlator_t *this, loc_t *loc, int32_t mask,
+           dict_t *xdata)
+{
+    IOT_FOP(access, frame, this, loc, mask, xdata);
+    return 0;
+}
 
-		pthread_mutex_lock (&conf->files_lock);
-		file->next = &conf->files;
-		file->prev = file->next->prev;
-		file->next->prev = file;
-		file->prev->next = file;
-		pthread_mutex_unlock (&conf->files_lock);
-	}
-	STACK_UNWIND (frame, op_ret, op_errno, fd, inode, stbuf);
-	return 0;
+int
+iot_readlink(call_frame_t *frame, xlator_t *this, loc_t *loc, size_t size,
+             dict_t *xdata)
+{
+    IOT_FOP(readlink, frame, this, loc, size, xdata);
+    return 0;
 }
 
-int32_t
-iot_create (call_frame_t *frame,
-            xlator_t *this,
-	    loc_t *loc,
-            int32_t flags,
-            mode_t mode,
-	    fd_t *fd)
+int
+iot_mknod(call_frame_t *frame, xlator_t *this, loc_t *loc, mode_t mode,
+          dev_t rdev, mode_t umask, dict_t *xdata)
 {
-	STACK_WIND (frame,
-		    iot_create_cbk,
-		    FIRST_CHILD(this),
-		    FIRST_CHILD(this)->fops->create,
-		    loc,
-		    flags,
-		    mode,
-		    fd);
-	return 0;
+    IOT_FOP(mknod, frame, this, loc, mode, rdev, umask, xdata);
+    return 0;
 }
 
+int
+iot_mkdir(call_frame_t *frame, xlator_t *this, loc_t *loc, mode_t mode,
+          mode_t umask, dict_t *xdata)
+{
+    IOT_FOP(mkdir, frame, this, loc, mode, umask, xdata);
+    return 0;
+}
 
+int
+iot_rmdir(call_frame_t *frame, xlator_t *this, loc_t *loc, int flags,
+          dict_t *xdata)
+{
+    IOT_FOP(rmdir, frame, this, loc, flags, xdata);
+    return 0;
+}
 
-int32_t
-iot_readv_cbk (call_frame_t *frame,
-               void *cookie,
-               xlator_t *this,
-               int32_t op_ret,
-               int32_t op_errno,
-               struct iovec *vector,
-               int32_t count,
-	       struct stat *stbuf)
+int
+iot_symlink(call_frame_t *frame, xlator_t *this, const char *linkname,
+            loc_t *loc, mode_t umask, dict_t *xdata)
 {
-	iot_local_t *local = frame->local;
+    IOT_FOP(symlink, frame, this, linkname, loc, umask, xdata);
+    return 0;
+}
 
-	local->frame_size = 0; //iov_length (vector, count);
+int
+iot_rename(call_frame_t *frame, xlator_t *this, loc_t *oldloc, loc_t *newloc,
+           dict_t *xdata)
+{
+    IOT_FOP(rename, frame, this, oldloc, newloc, xdata);
+    return 0;
+}
 
-	STACK_UNWIND (frame, op_ret, op_errno, vector, count, stbuf);
+int
+iot_open(call_frame_t *frame, xlator_t *this, loc_t *loc, int32_t flags,
+         fd_t *fd, dict_t *xdata)
+{
+    IOT_FOP(open, frame, this, loc, flags, fd, xdata);
+    return 0;
+}
 
-	return 0;
+int
+iot_create(call_frame_t *frame, xlator_t *this, loc_t *loc, int32_t flags,
+           mode_t mode, mode_t umask, fd_t *fd, dict_t *xdata)
+{
+    IOT_FOP(create, frame, this, loc, flags, mode, umask, fd, xdata);
+    return 0;
 }
 
-static int32_t
-iot_readv_wrapper (call_frame_t *frame,
-                   xlator_t *this,
-                   fd_t *fd,
-                   size_t size,
-                   off_t offset)
+int
+iot_put(call_frame_t *frame, xlator_t *this, loc_t *loc, mode_t mode,
+        mode_t umask, uint32_t flags, struct iovec *vector, int32_t count,
+        off_t offset, struct iobref *iobref, dict_t *xattr, dict_t *xdata)
 {
-	STACK_WIND (frame,
-		    iot_readv_cbk,
-		    FIRST_CHILD(this),
-		    FIRST_CHILD(this)->fops->readv,
-		    fd,
-		    size,
-		    offset);
-	return 0;
+    IOT_FOP(put, frame, this, loc, mode, umask, flags, vector, count, offset,
+            iobref, xattr, xdata);
+    return 0;
 }
 
-int32_t
-iot_readv (call_frame_t *frame,
-           xlator_t *this,
-           fd_t *fd,
-           size_t size,
-           off_t offset)
-{
-	call_stub_t *stub;
-	iot_local_t *local = NULL;
-	iot_file_t *file = NULL;
-	iot_worker_t *worker = NULL;
-	uint64_t tmp_file = 0;
-
-	if (fd_ctx_get (fd, this, &tmp_file)) {
-		gf_log (this->name, GF_LOG_ERROR, 
-			"fd context is NULL, returning EBADFD");
-		STACK_UNWIND (frame, -1, EBADFD);
-		return 0;
-	}
-
-	file = (iot_file_t *)(long)tmp_file;
-	worker = file->worker;
-
-	local = CALLOC (1, sizeof (*local));
-	ERR_ABORT (local);
-	frame->local = local;
-  
-	stub = fop_readv_stub (frame, 
-			       iot_readv_wrapper,
-			       fd,
-			       size,
-			       offset);
-	if (!stub) {
-		gf_log (this->name, GF_LOG_ERROR, 
-			"cannot get readv call stub");
-		STACK_UNWIND (frame, -1, ENOMEM, NULL, 0);
-		return 0;
-	}
-
-	iot_queue (worker, stub);
-
-	return 0;
+int
+iot_readv(call_frame_t *frame, xlator_t *this, fd_t *fd, size_t size,
+          off_t offset, uint32_t flags, dict_t *xdata)
+{
+    IOT_FOP(readv, frame, this, fd, size, offset, flags, xdata);
+    return 0;
 }
 
-int32_t
-iot_flush_cbk (call_frame_t *frame,
-               void *cookie,
-               xlator_t *this,
-               int32_t op_ret,
-               int32_t op_errno)
+int
+iot_flush(call_frame_t *frame, xlator_t *this, fd_t *fd, dict_t *xdata)
 {
-	STACK_UNWIND (frame, op_ret, op_errno);
-	return 0;
+    IOT_FOP(flush, frame, this, fd, xdata);
+    return 0;
 }
 
-static int32_t
-iot_flush_wrapper (call_frame_t *frame,
-                   xlator_t *this,
-                   fd_t *fd)
+int
+iot_fsync(call_frame_t *frame, xlator_t *this, fd_t *fd, int32_t datasync,
+          dict_t *xdata)
 {
-	STACK_WIND (frame,
-		    iot_flush_cbk,
-		    FIRST_CHILD(this),
-		    FIRST_CHILD(this)->fops->flush,
-		    fd);
-	return 0;
+    IOT_FOP(fsync, frame, this, fd, datasync, xdata);
+    return 0;
 }
 
-int32_t
-iot_flush (call_frame_t *frame,
-           xlator_t *this,
-           fd_t *fd)
-{
-	call_stub_t *stub;
-	iot_local_t *local = NULL;
-	iot_file_t *file = NULL;
-	iot_worker_t *worker = NULL;
-	uint64_t tmp_file = 0;
-
-	if (fd_ctx_get (fd, this, &tmp_file)) {
-		gf_log (this->name, GF_LOG_ERROR, 
-			"fd context is NULL, returning EBADFD");
-		STACK_UNWIND (frame, -1, EBADFD);
-		return 0;
-	}
-
-	file = (iot_file_t *)(long)tmp_file;
-	worker = file->worker;
-
-	local = CALLOC (1, sizeof (*local));
-	ERR_ABORT (local);
-
-	frame->local = local;
-  
-	stub = fop_flush_stub (frame,
-			       iot_flush_wrapper,
-			       fd);
-	if (!stub) {
-		gf_log (this->name, GF_LOG_ERROR, "cannot get flush_cbk call stub");
-		STACK_UNWIND (frame, -1, ENOMEM);
-		return 0;
-	}
-	iot_queue (worker, stub);
-
-	return 0;
+int
+iot_writev(call_frame_t *frame, xlator_t *this, fd_t *fd, struct iovec *vector,
+           int32_t count, off_t offset, uint32_t flags, struct iobref *iobref,
+           dict_t *xdata)
+{
+    IOT_FOP(writev, frame, this, fd, vector, count, offset, flags, iobref,
+            xdata);
+    return 0;
 }
 
-int32_t
-iot_fsync_cbk (call_frame_t *frame,
-               void *cookie,
-               xlator_t *this,
-               int32_t op_ret,
-               int32_t op_errno)
+int
+iot_lk(call_frame_t *frame, xlator_t *this, fd_t *fd, int32_t cmd,
+       struct gf_flock *flock, dict_t *xdata)
 {
-	STACK_UNWIND (frame, op_ret, op_errno);
-	return 0;
+    IOT_FOP(lk, frame, this, fd, cmd, flock, xdata);
+    return 0;
 }
 
-static int32_t
-iot_fsync_wrapper (call_frame_t *frame,
-                   xlator_t *this,
-                   fd_t *fd,
-                   int32_t datasync)
+int
+iot_stat(call_frame_t *frame, xlator_t *this, loc_t *loc, dict_t *xdata)
 {
-	STACK_WIND (frame,
-		    iot_fsync_cbk,
-		    FIRST_CHILD (this),
-		    FIRST_CHILD (this)->fops->fsync,
-		    fd,
-		    datasync);
-	return 0;
+    IOT_FOP(stat, frame, this, loc, xdata);
+    return 0;
 }
 
-int32_t
-iot_fsync (call_frame_t *frame,
-           xlator_t *this,
-           fd_t *fd,
-           int32_t datasync)
-{
-	call_stub_t *stub;
-	iot_local_t *local = NULL;
-	iot_file_t *file = NULL;
-	iot_worker_t *worker = NULL;
-	uint64_t tmp_file = 0;
-
-	if (fd_ctx_get (fd, this, &tmp_file)) {
-		gf_log (this->name, GF_LOG_ERROR, 
-			"fd context is NULL, returning EBADFD");
-		STACK_UNWIND (frame, -1, EBADFD);
-		return 0;
-	}
-
-	file = (iot_file_t *)(long)tmp_file;
-	worker = file->worker;
-
-	local = CALLOC (1, sizeof (*local));
-	ERR_ABORT (local);
-
-	frame->local = local;
-  
-	stub = fop_fsync_stub (frame,
-			       iot_fsync_wrapper,
-			       fd,
-			       datasync);
-	if (!stub) {
-		gf_log (this->name, GF_LOG_ERROR, "cannot get fsync_cbk call stub");
-		STACK_UNWIND (frame, -1, ENOMEM);
-		return 0;
-	}
-	iot_queue (worker, stub);
-
-	return 0;
+int
+iot_fstat(call_frame_t *frame, xlator_t *this, fd_t *fd, dict_t *xdata)
+{
+    IOT_FOP(fstat, frame, this, fd, xdata);
+    return 0;
 }
 
-int32_t
-iot_writev_cbk (call_frame_t *frame,
-                void *cookie,
-                xlator_t *this,
-                int32_t op_ret,
-                int32_t op_errno,
-		struct stat *stbuf)
+int
+iot_truncate(call_frame_t *frame, xlator_t *this, loc_t *loc, off_t offset,
+             dict_t *xdata)
 {
-	iot_local_t *local = frame->local;
+    IOT_FOP(truncate, frame, this, loc, offset, xdata);
+    return 0;
+}
 
-	local->frame_size = 0; /* hehe, caught me! */
+int
+iot_ftruncate(call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset,
+              dict_t *xdata)
+{
+    IOT_FOP(ftruncate, frame, this, fd, offset, xdata);
+    return 0;
+}
 
-	STACK_UNWIND (frame, op_ret, op_errno, stbuf);
-	return 0;
+int
+iot_unlink(call_frame_t *frame, xlator_t *this, loc_t *loc, int32_t xflag,
+           dict_t *xdata)
+{
+    IOT_FOP(unlink, frame, this, loc, xflag, xdata);
+    return 0;
 }
 
-static int32_t
-iot_writev_wrapper (call_frame_t *frame,
-                    xlator_t *this,
-                    fd_t *fd,
-                    struct iovec *vector,
-                    int32_t count,
-                    off_t offset)
+int
+iot_link(call_frame_t *frame, xlator_t *this, loc_t *oldloc, loc_t *newloc,
+         dict_t *xdata)
 {
-	STACK_WIND (frame,
-		    iot_writev_cbk,
-		    FIRST_CHILD(this),
-		    FIRST_CHILD(this)->fops->writev,
-		    fd,
-		    vector,
-		    count,
-		    offset);
-	return 0;
+    IOT_FOP(link, frame, this, oldloc, newloc, xdata);
+    return 0;
 }
 
-int32_t
-iot_writev (call_frame_t *frame,
-            xlator_t *this,
-            fd_t *fd,
-            struct iovec *vector,
-            int32_t count,
-            off_t offset)
+int
+iot_opendir(call_frame_t *frame, xlator_t *this, loc_t *loc, fd_t *fd,
+            dict_t *xdata)
 {
-	call_stub_t *stub;
-	iot_local_t *local = NULL;
-	iot_file_t *file = NULL;
-	iot_worker_t *worker = NULL;
-	uint64_t tmp_file = 0;
+    IOT_FOP(opendir, frame, this, loc, fd, xdata);
+    return 0;
+}
 
-	if (fd_ctx_get (fd, this, &tmp_file)) {
-		gf_log (this->name, GF_LOG_ERROR, 
-			"fd context is NULL, returning EBADFD");
-		STACK_UNWIND (frame, -1, EBADFD);
-		return 0;
-	}
+int
+iot_fsyncdir(call_frame_t *frame, xlator_t *this, fd_t *fd, int datasync,
+             dict_t *xdata)
+{
+    IOT_FOP(fsyncdir, frame, this, fd, datasync, xdata);
+    return 0;
+}
 
-	file = (iot_file_t *)(long)tmp_file;
-	worker = file->worker;
+int
+iot_statfs(call_frame_t *frame, xlator_t *this, loc_t *loc, dict_t *xdata)
+{
+    IOT_FOP(statfs, frame, this, loc, xdata);
+    return 0;
+}
 
-	local = CALLOC (1, sizeof (*local));
-	ERR_ABORT (local);
+int
+iot_setxattr(call_frame_t *frame, xlator_t *this, loc_t *loc, dict_t *dict,
+             int32_t flags, dict_t *xdata)
+{
+    IOT_FOP(setxattr, frame, this, loc, dict, flags, xdata);
+    return 0;
+}
 
-	if (frame->root->req_refs)
-		local->frame_size = dict_serialized_length (frame->root->req_refs);
-	else
-		local->frame_size = iov_length (vector, count);
-	frame->local = local;
-  
-	stub = fop_writev_stub (frame, iot_writev_wrapper,
-				fd, vector, count, offset);
+int
+iot_getxattr(call_frame_t *frame, xlator_t *this, loc_t *loc, const char *name,
+             dict_t *xdata)
+{
+    iot_conf_t *conf = NULL;
+    dict_t *depths = NULL;
+    int i = 0;
+    int32_t op_ret = 0;
+    int32_t op_errno = 0;
+
+    conf = this->private;
+
+    if (name && strcmp(name, IO_THREADS_QUEUE_SIZE_KEY) == 0) {
+        /*
+         * We explicitly do not want a reference count
+         * for this dict in this translator
+         */
+        depths = dict_new();
+        if (!depths) {
+            op_ret = -1;
+            op_errno = ENOMEM;
+            goto unwind_special_getxattr;
+        }
 
-	if (!stub) {
-		gf_log (this->name, GF_LOG_ERROR, "cannot get writev call stub");
-		STACK_UNWIND (frame, -1, ENOMEM);
-		return 0;
-	}
+        for (i = 0; i < GF_FOP_PRI_MAX; i++) {
+            if (dict_set_int32(depths, (char *)fop_pri_to_string(i),
+                               conf->queue_sizes[i]) != 0) {
+                dict_unref(depths);
+                depths = NULL;
+                goto unwind_special_getxattr;
+            }
+        }
 
-	iot_queue (worker, stub);
+    unwind_special_getxattr:
+        STACK_UNWIND_STRICT(getxattr, frame, op_ret, op_errno, depths, xdata);
+        if (depths)
+            dict_unref(depths);
+        return 0;
+    }
 
-	return 0;
+    IOT_FOP(getxattr, frame, this, loc, name, xdata);
+    return 0;
 }
 
+int
+iot_fgetxattr(call_frame_t *frame, xlator_t *this, fd_t *fd, const char *name,
+              dict_t *xdata)
+{
+    IOT_FOP(fgetxattr, frame, this, fd, name, xdata);
+    return 0;
+}
 
-int32_t
-iot_lk_cbk (call_frame_t *frame,
-            void *cookie,
-            xlator_t *this,
-            int32_t op_ret,
-            int32_t op_errno,
-            struct flock *flock)
+int
+iot_fsetxattr(call_frame_t *frame, xlator_t *this, fd_t *fd, dict_t *dict,
+              int32_t flags, dict_t *xdata)
 {
-	STACK_UNWIND (frame, op_ret, op_errno, flock);
-	return 0;
+    IOT_FOP(fsetxattr, frame, this, fd, dict, flags, xdata);
+    return 0;
 }
 
+int
+iot_removexattr(call_frame_t *frame, xlator_t *this, loc_t *loc,
+                const char *name, dict_t *xdata)
+{
+    IOT_FOP(removexattr, frame, this, loc, name, xdata);
+    return 0;
+}
 
-static int32_t
-iot_lk_wrapper (call_frame_t *frame,
-                xlator_t *this,
-                fd_t *fd,
-                int32_t cmd,
-                struct flock *flock)
+int
+iot_fremovexattr(call_frame_t *frame, xlator_t *this, fd_t *fd,
+                 const char *name, dict_t *xdata)
 {
-	STACK_WIND (frame,
-		    iot_lk_cbk,
-		    FIRST_CHILD(this),
-		    FIRST_CHILD(this)->fops->lk,
-		    fd,
-		    cmd,
-		    flock);
-	return 0;
+    IOT_FOP(fremovexattr, frame, this, fd, name, xdata);
+    return 0;
 }
 
+int
+iot_readdirp(call_frame_t *frame, xlator_t *this, fd_t *fd, size_t size,
+             off_t offset, dict_t *xdata)
+{
+    IOT_FOP(readdirp, frame, this, fd, size, offset, xdata);
+    return 0;
+}
 
-int32_t
-iot_lk (call_frame_t *frame,
-	xlator_t *this,
-	fd_t *fd,
-	int32_t cmd,
-	struct flock *flock)
-{
-	call_stub_t *stub;
-	iot_local_t *local = NULL;
-	iot_file_t *file = NULL;
-	iot_worker_t *worker = NULL;
-	uint64_t tmp_file = 0;
-
-	if (fd_ctx_get (fd, this, &tmp_file)) {
-		gf_log (this->name, GF_LOG_ERROR, 
-			"fd context is NULL, returning EBADFD");
-		STACK_UNWIND (frame, -1, EBADFD);
-		return 0;
-	}
-
-	file = (iot_file_t *)(long)tmp_file;
-	worker = file->worker;
-
-	local = CALLOC (1, sizeof (*local));
-	ERR_ABORT (local);
-	frame->local = local;
+int
+iot_readdir(call_frame_t *frame, xlator_t *this, fd_t *fd, size_t size,
+            off_t offset, dict_t *xdata)
+{
+    IOT_FOP(readdir, frame, this, fd, size, offset, xdata);
+    return 0;
+}
 
-	stub = fop_lk_stub (frame, iot_lk_wrapper,
-			    fd, cmd, flock);
-
-	if (!stub) {
-		gf_log (this->name, GF_LOG_ERROR, "cannot get fop_lk call stub");
-		STACK_UNWIND (frame, -1, ENOMEM, NULL);
-		return 0;
-	}
-    
-	iot_queue (worker, stub);
-
-	return 0;
+int
+iot_inodelk(call_frame_t *frame, xlator_t *this, const char *volume, loc_t *loc,
+            int32_t cmd, struct gf_flock *lock, dict_t *xdata)
+{
+    IOT_FOP(inodelk, frame, this, volume, loc, cmd, lock, xdata);
+    return 0;
 }
-
-
-int32_t 
-iot_stat_cbk (call_frame_t *frame,
-              void *cookie,
-              xlator_t *this,
-              int32_t op_ret,
-              int32_t op_errno,
-              struct stat *buf)
-{
-	STACK_UNWIND (frame, op_ret, op_errno, buf);
-	return 0;
-}
-
-
-static int32_t 
-iot_stat_wrapper (call_frame_t *frame,
-                  xlator_t *this,
-                  loc_t *loc)
-{
-	STACK_WIND (frame,
-		    iot_stat_cbk,
-		    FIRST_CHILD(this),
-		    FIRST_CHILD(this)->fops->stat,
-		    loc);
-	return 0;
-}
-
-int32_t 
-iot_stat (call_frame_t *frame,
-          xlator_t *this,
-          loc_t *loc)
-{
-	call_stub_t *stub;
-	iot_local_t *local = NULL;
-	iot_worker_t *worker = NULL;
-	iot_conf_t *conf;
-	fd_t *fd = NULL;
-
-	conf = this->private;
-
-	local = CALLOC (1, sizeof (*local));
-	ERR_ABORT (local);
-	frame->local = local;
-
-	fd = fd_lookup (loc->inode, frame->root->pid);
-
-	if (fd == NULL) {
-		STACK_WIND(frame,
-			   iot_stat_cbk,
-			   FIRST_CHILD(this),
-			   FIRST_CHILD(this)->fops->stat,
-			   loc);
-		return 0;
-	} 
-  
-	fd_unref (fd);
-
-	worker = iot_schedule (conf, NULL, loc->inode->ino);
-
-	stub = fop_stat_stub (frame,
-			      iot_stat_wrapper,
-			      loc);
-	if (!stub) {
-		gf_log (this->name, GF_LOG_ERROR, "cannot get fop_stat call stub");
-		STACK_UNWIND (frame, -1, ENOMEM, NULL);
-		return 0;
-	}
-	iot_queue (worker, stub);
-
-	return 0;
-}
-
-
-int32_t 
-iot_fstat_cbk (call_frame_t *frame,
-               void *cookie,
-               xlator_t *this,
-               int32_t op_ret,
-               int32_t op_errno,
-               struct stat *buf)
-{
-	STACK_UNWIND (frame, op_ret, op_errno, buf);
-	return 0;
-}
-
-static int32_t 
-iot_fstat_wrapper (call_frame_t *frame,
-                   xlator_t *this,
-                   fd_t *fd)
-{
-	STACK_WIND (frame,
-		    iot_fstat_cbk,
-		    FIRST_CHILD(this),
-		    FIRST_CHILD(this)->fops->fstat,
-		    fd);
-	return 0;
-}
-
-int32_t 
-iot_fstat (call_frame_t *frame,
-           xlator_t *this,
-           fd_t *fd)
-{
-	call_stub_t *stub;
-	iot_local_t *local = NULL;
-	iot_file_t *file = NULL;
-	iot_worker_t *worker = NULL;
-	uint64_t tmp_file = 0;
-
-	if (fd_ctx_get (fd, this, &tmp_file)) {
-		gf_log (this->name, GF_LOG_ERROR, 
-			"fd context is NULL, returning EBADFD");
-		STACK_UNWIND (frame, -1, EBADFD);
-		return 0;
-	}
-
-	file = (iot_file_t *)(long)tmp_file;
-	worker = file->worker;
-
-	local = CALLOC (1, sizeof (*local));
-	ERR_ABORT (local);
-	frame->local = local;
-	stub = fop_fstat_stub (frame,
-			       iot_fstat_wrapper,
-			       fd);
-	if (!stub) {
-		gf_log (this->name, GF_LOG_ERROR, "cannot get fop_fstat call stub");
-		STACK_UNWIND (frame, -1, ENOMEM, NULL);
-		return 0;
-	}
-
-	iot_queue (worker, stub);
-
-	return 0;
-}
-
-int32_t 
-iot_truncate_cbk (call_frame_t *frame,
-                  void *cookie,
-                  xlator_t *this,
-                  int32_t op_ret,
-                  int32_t op_errno,
-                  struct stat *buf)
-{
-	STACK_UNWIND (frame, op_ret, op_errno, buf);
-	return 0;
-}
-
-static int32_t 
-iot_truncate_wrapper (call_frame_t *frame,
-                      xlator_t *this,
-                      loc_t *loc,
-                      off_t offset)
-{
-	STACK_WIND (frame,
-		    iot_truncate_cbk,
-		    FIRST_CHILD(this),
-		    FIRST_CHILD(this)->fops->truncate,
-		    loc,
-		    offset);
-	return 0;
-}
-
-int32_t 
-iot_truncate (call_frame_t *frame,
-              xlator_t *this,
-              loc_t *loc,
-              off_t offset)
-{
-	call_stub_t *stub;
-	iot_local_t *local = NULL;
-	iot_worker_t *worker = NULL;
-	iot_conf_t *conf;
-	fd_t *fd = NULL;
-  
-	conf = this->private;
-	local = CALLOC (1, sizeof (*local));
-	ERR_ABORT (local);
-	frame->local = local;
-
-	fd = fd_lookup (loc->inode, frame->root->pid);
-
-	if (fd == NULL) {
-		STACK_WIND(frame,
-			   iot_truncate_cbk,
-			   FIRST_CHILD(this),
-			   FIRST_CHILD(this)->fops->truncate,
-			   loc,
-			   offset);
-		return 0;
-	} 
-  
-	fd_unref (fd);
-
-	worker = iot_schedule (conf, NULL, loc->inode->ino);
-
-	stub = fop_truncate_stub (frame,
-				  iot_truncate_wrapper,
-				  loc,
-				  offset);
-	if (!stub) {
-		gf_log (this->name, GF_LOG_ERROR, "cannot get fop_stat call stub");
-		STACK_UNWIND (frame, -1, ENOMEM, NULL);
-		return 0;
-	}
-	iot_queue (worker, stub);
-
-	return 0;
-}
-
-int32_t 
-iot_ftruncate_cbk (call_frame_t *frame,
-                   void *cookie,
-                   xlator_t *this,
-                   int32_t op_ret,
-                   int32_t op_errno,
-                   struct stat *buf)
-{
-	STACK_UNWIND (frame, op_ret, op_errno, buf);
-	return 0;
-}
-
-static int32_t 
-iot_ftruncate_wrapper (call_frame_t *frame,
-                       xlator_t *this,
-                       fd_t *fd,
-                       off_t offset)
-{
-	STACK_WIND (frame,
-		    iot_ftruncate_cbk,
-		    FIRST_CHILD(this),
-		    FIRST_CHILD(this)->fops->ftruncate,
-		    fd,
-		    offset);
-	return 0;
-}
-
-int32_t 
-iot_ftruncate (call_frame_t *frame,
-               xlator_t *this,
-               fd_t *fd,
-               off_t offset)
-{
-	call_stub_t *stub;
-	iot_local_t *local = NULL;
-	iot_file_t *file = NULL;
-	iot_worker_t *worker = NULL;
-	uint64_t tmp_file = 0;
-
-	if (fd_ctx_get (fd, this, &tmp_file)) {
-		gf_log (this->name, GF_LOG_ERROR, 
-			"fd context is NULL, returning EBADFD");
-		STACK_UNWIND (frame, -1, EBADFD);
-		return 0;
-	}
-
-	file = (iot_file_t *)(long)tmp_file;
-	worker = file->worker;
-
-	local = CALLOC (1, sizeof (*local));
-	ERR_ABORT (local);
-	frame->local = local;
-
-	stub = fop_ftruncate_stub (frame,
-				   iot_ftruncate_wrapper,
-				   fd,
-				   offset);
-	if (!stub) {
-		gf_log (this->name, GF_LOG_ERROR, "cannot get fop_ftruncate call stub");
-		STACK_UNWIND (frame, -1, ENOMEM, NULL);
-		return 0;
-	}
-	iot_queue (worker, stub);
-
-	return 0;
-}
-
-int32_t 
-iot_utimens_cbk (call_frame_t *frame,
-                 void *cookie,
-                 xlator_t *this,
-                 int32_t op_ret,
-                 int32_t op_errno,
-                 struct stat *buf)
-{
-	STACK_UNWIND (frame, op_ret, op_errno, buf);
-	return 0;
-}
-
-static int32_t 
-iot_utimens_wrapper (call_frame_t *frame,
-                     xlator_t *this,
-                     loc_t *loc,
-                     struct timespec tv[2])
-{
-	STACK_WIND (frame,
-		    iot_utimens_cbk,
-		    FIRST_CHILD(this),
-		    FIRST_CHILD(this)->fops->utimens,
-		    loc,
-		    tv);
-  
-	return 0;
-}
-
-int32_t 
-iot_utimens (call_frame_t *frame,
-             xlator_t *this,
-             loc_t *loc,
-             struct timespec tv[2])
-{
-	call_stub_t *stub;
-	iot_local_t *local = NULL;
-	iot_worker_t *worker = NULL;
-	iot_conf_t *conf;
-	fd_t *fd = NULL;
-  
-	conf = this->private;
-
-	local = CALLOC (1, sizeof (*local));
-	ERR_ABORT (local);
-	frame->local = local;
-  
-	fd = fd_lookup (loc->inode, frame->root->pid);
-
-	if (fd == NULL) {
-		STACK_WIND(frame,
-			   iot_utimens_cbk,
-			   FIRST_CHILD(this),
-			   FIRST_CHILD(this)->fops->utimens,
-			   loc,
-			   tv);
-		return 0;
-	} 
-  
-	fd_unref (fd);
-
-	worker = iot_schedule (conf, NULL, loc->inode->ino);
-
-	stub = fop_utimens_stub (frame,
-				 iot_utimens_wrapper,
-				 loc,
-				 tv);
-	if (!stub) {
-		gf_log (this->name, GF_LOG_ERROR, "cannot get fop_utimens call stub");
-		STACK_UNWIND (frame, -1, ENOMEM, NULL);
-		return 0;
-	}
-	iot_queue (worker, stub);
-
-	return 0;
-}
-
-
-int32_t 
-iot_checksum_cbk (call_frame_t *frame,
-		  void *cookie,
-		  xlator_t *this,
-		  int32_t op_ret,
-		  int32_t op_errno,
-		  uint8_t *file_checksum,
-		  uint8_t *dir_checksum)
-{
-	STACK_UNWIND (frame, op_ret, op_errno, file_checksum, dir_checksum);
-	return 0;
-}
-
-static int32_t 
-iot_checksum_wrapper (call_frame_t *frame,
-		      xlator_t *this,
-		      loc_t *loc,
-		      int32_t flags)
-{
-	STACK_WIND (frame,
-		    iot_checksum_cbk,
-		    FIRST_CHILD(this),
-		    FIRST_CHILD(this)->fops->checksum,
-		    loc,
-		    flags);
-  
-	return 0;
-}
-
-int32_t 
-iot_checksum (call_frame_t *frame,
-	      xlator_t *this,
-	      loc_t *loc,
-	      int32_t flags)
-{
-	call_stub_t *stub = NULL;
-	iot_local_t *local = NULL;
-	iot_worker_t *worker = NULL;
-	iot_conf_t *conf = NULL;
-  
-	conf = this->private;
-
-	local = CALLOC (1, sizeof (*local));
-	frame->local = local;
-
-	worker = iot_schedule (conf, NULL, conf->misc_thread_index++);
-
-	stub = fop_checksum_stub (frame,
-				  iot_checksum_wrapper,
-				  loc,
-				  flags);
-	if (!stub) {
-		gf_log (this->name, GF_LOG_ERROR, "cannot get fop_checksum call stub");
-		STACK_UNWIND (frame, -1, ENOMEM, NULL, NULL);
-		return 0;
-	}
-	iot_queue (worker, stub);
-
-	return 0;
-}
-
-
-int32_t 
-iot_unlink_cbk (call_frame_t *frame,
-		void *cookie,
-		xlator_t *this,
-		int32_t op_ret,
-		int32_t op_errno)
-{
-	STACK_UNWIND (frame, op_ret, op_errno);
-	return 0;
-}
-
-static int32_t 
-iot_unlink_wrapper (call_frame_t *frame,
-		    xlator_t *this,
-		    loc_t *loc)
+
+int
+iot_finodelk(call_frame_t *frame, xlator_t *this, const char *volume, fd_t *fd,
+             int32_t cmd, struct gf_flock *lock, dict_t *xdata)
 {
-	STACK_WIND (frame,
-		    iot_unlink_cbk,
-		    FIRST_CHILD(this),
-		    FIRST_CHILD(this)->fops->unlink,
-		    loc);
-  
-	return 0;
+    IOT_FOP(finodelk, frame, this, volume, fd, cmd, lock, xdata);
+    return 0;
 }
 
-int32_t 
-iot_unlink (call_frame_t *frame,
-	    xlator_t *this,
-	    loc_t *loc)
+int
+iot_entrylk(call_frame_t *frame, xlator_t *this, const char *volume, loc_t *loc,
+            const char *basename, entrylk_cmd cmd, entrylk_type type,
+            dict_t *xdata)
 {
-	call_stub_t *stub = NULL;
-	iot_local_t *local = NULL;
-	iot_worker_t *worker = NULL;
-	iot_conf_t *conf = NULL;
+    IOT_FOP(entrylk, frame, this, volume, loc, basename, cmd, type, xdata);
+    return 0;
+}
 
-	conf = this->private;
+int
+iot_fentrylk(call_frame_t *frame, xlator_t *this, const char *volume, fd_t *fd,
+             const char *basename, entrylk_cmd cmd, entrylk_type type,
+             dict_t *xdata)
+{
+    IOT_FOP(fentrylk, frame, this, volume, fd, basename, cmd, type, xdata);
+    return 0;
+}
 
-	local = CALLOC (1, sizeof (*local));
-	frame->local = local;
+int
+iot_xattrop(call_frame_t *frame, xlator_t *this, loc_t *loc,
+            gf_xattrop_flags_t optype, dict_t *xattr, dict_t *xdata)
+{
+    IOT_FOP(xattrop, frame, this, loc, optype, xattr, xdata);
+    return 0;
+}
 
-	worker = iot_schedule (conf, NULL, conf->misc_thread_index++);
+int
+iot_fxattrop(call_frame_t *frame, xlator_t *this, fd_t *fd,
+             gf_xattrop_flags_t optype, dict_t *xattr, dict_t *xdata)
+{
+    IOT_FOP(fxattrop, frame, this, fd, optype, xattr, xdata);
+    return 0;
+}
 
-	stub = fop_unlink_stub (frame, iot_unlink_wrapper, loc);
-	if (!stub) {
-		gf_log (this->name, GF_LOG_ERROR, "cannot get fop_unlink call stub");
-		STACK_UNWIND (frame, -1, ENOMEM);
-		return 0;
-	}
-	iot_queue (worker, stub);
+int32_t
+iot_rchecksum(call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset,
+              int32_t len, dict_t *xdata)
+{
+    IOT_FOP(rchecksum, frame, this, fd, offset, len, xdata);
+    return 0;
+}
 
-	return 0;
+int
+iot_fallocate(call_frame_t *frame, xlator_t *this, fd_t *fd, int32_t mode,
+              off_t offset, size_t len, dict_t *xdata)
+{
+    IOT_FOP(fallocate, frame, this, fd, mode, offset, len, xdata);
+    return 0;
 }
 
-int32_t
-iot_release (xlator_t *this,
-	     fd_t *fd)
+int
+iot_discard(call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset,
+            size_t len, dict_t *xdata)
 {
-	iot_file_t *file = NULL;
-	iot_conf_t *conf = NULL;
-	uint64_t tmp_file = 0;
-	int ret = 0;
+    IOT_FOP(discard, frame, this, fd, offset, len, xdata);
+    return 0;
+}
 
-	conf = this->private;
-	ret = fd_ctx_del (fd, this, &tmp_file);
-	if (ret)
-		return 0;
+int
+iot_zerofill(call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset,
+             off_t len, dict_t *xdata)
+{
+    IOT_FOP(zerofill, frame, this, fd, offset, len, xdata);
+    return 0;
+}
 
-	file = (iot_file_t *)(long)tmp_file;
+int
+iot_seek(call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset,
+         gf_seek_what_t what, dict_t *xdata)
+{
+    IOT_FOP(seek, frame, this, fd, offset, what, xdata);
+    return 0;
+}
 
-	pthread_mutex_lock (&conf->files_lock);
-	{
-		(file->prev)->next = file->next;
-		(file->next)->prev = file->prev;
-	}
-	pthread_mutex_unlock (&conf->files_lock);
+int
+iot_lease(call_frame_t *frame, xlator_t *this, loc_t *loc,
+          struct gf_lease *lease, dict_t *xdata)
+{
+    IOT_FOP(lease, frame, this, loc, lease, xdata);
+    return 0;
+}
 
-	FREE (file);
-	return 0;
+int
+iot_getactivelk(call_frame_t *frame, xlator_t *this, loc_t *loc, dict_t *xdata)
+{
+    IOT_FOP(getactivelk, frame, this, loc, xdata);
+    return 0;
 }
 
+int
+iot_setactivelk(call_frame_t *frame, xlator_t *this, loc_t *loc,
+                lock_migration_info_t *locklist, dict_t *xdata)
+{
+    IOT_FOP(setactivelk, frame, this, loc, locklist, xdata);
+    return 0;
+}
 
-static void
-iot_queue (iot_worker_t *worker,
-           call_stub_t *stub)
+int
+__iot_workers_scale(iot_conf_t *conf)
 {
-	iot_queue_t *queue;
+    int scale = 0;
+    int diff = 0;
+    pthread_t thread;
+    int ret = 0;
+    int i = 0;
+
+    for (i = 0; i < GF_FOP_PRI_MAX; i++)
+        scale += min(conf->queue_sizes[i], conf->ac_iot_limit[i]);
+
+    if (scale < IOT_MIN_THREADS)
+        scale = IOT_MIN_THREADS;
+
+    if (scale > conf->max_count)
+        scale = conf->max_count;
+
+    if (conf->curr_count < scale) {
+        diff = scale - conf->curr_count;
+    }
+
+    while (diff) {
+        diff--;
+
+        ret = gf_thread_create(&thread, &conf->w_attr, iot_worker, conf,
+                               "iotwr%03hx", conf->curr_count & 0x3ff);
+        if (ret == 0) {
+            pthread_detach(thread);
+            conf->curr_count++;
+            gf_msg_debug(conf->this->name, 0,
+                         "scaled threads to %d (queue_size=%d/%d)",
+                         conf->curr_count, conf->queue_size, scale);
+        } else {
+            break;
+        }
+    }
+
+    return diff;
+}
 
-	queue = CALLOC (1, sizeof (*queue));
-	ERR_ABORT (queue);
-	queue->stub = stub;
+int
+iot_workers_scale(iot_conf_t *conf)
+{
+    int ret = -1;
 
-        pthread_mutex_lock (&worker->qlock);
-        {
-                queue->next = &worker->queue;
-                queue->prev = worker->queue.prev;
+    if (conf == NULL) {
+        ret = -EINVAL;
+        goto out;
+    }
 
-                queue->next->prev = queue;
-                queue->prev->next = queue;
+    pthread_mutex_lock(&conf->mutex);
+    {
+        ret = __iot_workers_scale(conf);
+    }
+    pthread_mutex_unlock(&conf->mutex);
 
-                /* dq_cond */
-                worker->queue_size++;
-                worker->q++;
+out:
+    return ret;
+}
 
-                pthread_cond_broadcast (&worker->dq_cond);
+int
+set_stack_size(iot_conf_t *conf)
+{
+    int err = 0;
+    size_t stacksize = IOT_THREAD_STACK_SIZE;
+    xlator_t *this = NULL;
+
+    this = THIS;
+
+    err = pthread_attr_init(&conf->w_attr);
+    if (err != 0) {
+        gf_smsg(this->name, GF_LOG_ERROR, err, IO_THREADS_MSG_INIT_FAILED,
+                NULL);
+        return err;
+    }
+
+    err = pthread_attr_setstacksize(&conf->w_attr, stacksize);
+    if (err == EINVAL) {
+        err = pthread_attr_getstacksize(&conf->w_attr, &stacksize);
+        if (!err) {
+            gf_smsg(this->name, GF_LOG_WARNING, 0, IO_THREADS_MSG_SIZE_NOT_SET,
+                    "size=%zd", stacksize, NULL);
+        } else {
+            gf_smsg(this->name, GF_LOG_WARNING, 0, IO_THREADS_MSG_SIZE_NOT_SET,
+                    NULL);
+            err = 0;
         }
-	pthread_mutex_unlock (&worker->qlock);
+    }
+
+    conf->stack_size = stacksize;
+    return err;
 }
 
-static call_stub_t *
-iot_dequeue (iot_worker_t *worker)
+int32_t
+mem_acct_init(xlator_t *this)
 {
-	call_stub_t *stub = NULL;
-	iot_queue_t *queue = NULL;
+    int ret = -1;
 
-	pthread_mutex_lock (&worker->qlock);
-        {
-                while (!worker->queue_size)
-                        pthread_cond_wait (&worker->dq_cond, &worker->qlock);
+    if (!this)
+        return ret;
 
-                queue = worker->queue.next;
-                queue->next->prev = queue->prev;
-                queue->prev->next = queue->next;
+    ret = xlator_mem_acct_init(this, gf_iot_mt_end + 1);
 
-                stub = queue->stub;
+    if (ret != 0) {
+        gf_smsg(this->name, GF_LOG_ERROR, ENOMEM, IO_THREADS_MSG_NO_MEMORY,
+                NULL);
+        return ret;
+    }
 
-                worker->queue_size--;
-                worker->dq++;
-        }
-	pthread_mutex_unlock (&worker->qlock);
+    return ret;
+}
 
-	FREE (queue);
+int
+iot_priv_dump(xlator_t *this)
+{
+    iot_conf_t *conf = NULL;
+    char key_prefix[GF_DUMP_MAX_BUF_LEN];
+    char key[GF_DUMP_MAX_BUF_LEN];
+    int i = 0;
+
+    if (!this)
+        return 0;
+
+    conf = this->private;
+    if (!conf)
+        return 0;
+
+    snprintf(key_prefix, GF_DUMP_MAX_BUF_LEN, "%s.%s", this->type, this->name);
+
+    gf_proc_dump_add_section("%s", key_prefix);
+
+    gf_proc_dump_write("maximum_threads_count", "%d", conf->max_count);
+    gf_proc_dump_write("current_threads_count", "%d", conf->curr_count);
+    gf_proc_dump_write("sleep_count", "%d", conf->sleep_count);
+    gf_proc_dump_write("idle_time", "%d", conf->idle_time);
+    gf_proc_dump_write("stack_size", "%zd", conf->stack_size);
+    gf_proc_dump_write("max_high_priority_threads", "%d",
+                       conf->ac_iot_limit[GF_FOP_PRI_HI]);
+    gf_proc_dump_write("max_normal_priority_threads", "%d",
+                       conf->ac_iot_limit[GF_FOP_PRI_NORMAL]);
+    gf_proc_dump_write("max_low_priority_threads", "%d",
+                       conf->ac_iot_limit[GF_FOP_PRI_LO]);
+    gf_proc_dump_write("max_least_priority_threads", "%d",
+                       conf->ac_iot_limit[GF_FOP_PRI_LEAST]);
+    gf_proc_dump_write("current_high_priority_threads", "%d",
+                       conf->ac_iot_count[GF_FOP_PRI_HI]);
+    gf_proc_dump_write("current_normal_priority_threads", "%d",
+                       conf->ac_iot_count[GF_FOP_PRI_NORMAL]);
+    gf_proc_dump_write("current_low_priority_threads", "%d",
+                       conf->ac_iot_count[GF_FOP_PRI_LO]);
+    gf_proc_dump_write("current_least_priority_threads", "%d",
+                       conf->ac_iot_count[GF_FOP_PRI_LEAST]);
+    for (i = 0; i < GF_FOP_PRI_MAX; i++) {
+        if (!conf->queue_sizes[i])
+            continue;
+        snprintf(key, sizeof(key), "%s_priority_queue_length",
+                 iot_get_pri_meaning(i));
+        gf_proc_dump_write(key, "%d", conf->queue_sizes[i]);
+    }
+
+    return 0;
+}
 
-	return stub;
+/*
+ * We use a decay model to keep track and make sure we're not spawning new
+ * threads too often.  Each increment adds a large value to a counter, and that
+ * counter keeps ticking back down to zero over a fairly long period.  For
+ * example, let's use ONE_WEEK=604800 seconds, and we want to detect when we
+ * have N=3 increments during that time.  Thus, our threshold is
+ * (N-1)*ONE_WEEK.  To see how it works, look at three examples.
+ *
+ *   (a) Two events close together, then one more almost a week later.  The
+ *   first two events push our counter to 2*ONE_WEEK plus a bit.  At the third
+ *   event, we decay down to ONE_WEEK plus a bit and then add ONE_WEEK for the
+ *   new event, exceeding our threshold.
+ *
+ *   (b) One event, then two more almost a week later.  At the time of the
+ *   second and third events, the counter is already non-zero, so when we add
+ *   2*ONE_WEEK we exceed again.
+ *
+ *   (c) Three events, spaced three days apart.  At the time of the second
+ *   event, we decay down to approximately ONE_WEEK*4/7 and then add another
+ *   ONE_WEEK.  At the third event, we decay again down to ONE_WEEK*8/7 and add
+ *   another ONE_WEEK, so boom.
+ *
+ * Note that in all three cases if that last event came a day later our counter
+ * would have decayed a bit more and we would *not* exceed our threshold.  It's
+ * not exactly the same as a precise "three in one week" limit, but it's very
+ * close and it allows the same kind of tweaking while requiring only constant
+ * space - no arrays of variable length N to allocate or maintain.  All we need
+ * (for each queue) is the value plus the time of the last update.
+ */
+
+typedef struct {
+    time_t update_time;
+    uint32_t value;
+} threshold_t;
+/*
+ * Variables so that I can hack these for testing.
+ * TBD: make these tunable?
+ */
+static uint32_t THRESH_SECONDS = 604800;
+static uint32_t THRESH_EVENTS = 3;
+static uint32_t THRESH_LIMIT = 1209600; /* SECONDS * (EVENTS-1) */
+
+static void
+iot_apply_event(xlator_t *this, threshold_t *thresh)
+{
+    time_t delta, now = gf_time();
+
+    /* Refresh for manual testing/debugging.  It's cheap. */
+    THRESH_LIMIT = THRESH_SECONDS * (THRESH_EVENTS - 1);
+
+    if (thresh->value && thresh->update_time) {
+        delta = now - thresh->update_time;
+        /* Be careful about underflow. */
+        if (thresh->value <= delta) {
+            thresh->value = 0;
+        } else {
+            thresh->value -= delta;
+        }
+    }
+
+    thresh->value += THRESH_SECONDS;
+    if (thresh->value >= THRESH_LIMIT) {
+        gf_log(this->name, GF_LOG_EMERG, "watchdog firing too often");
+        /*
+         * The default action for SIGTRAP is to dump core, but the fact
+         * that it's distinct from other signals we use means that
+         * there are other possibilities as well (e.g. drop into gdb or
+         * invoke a special handler).
+         */
+        kill(getpid(), SIGTRAP);
+    }
+
+    thresh->update_time = now;
 }
 
 static void *
-iot_worker (void *arg)
+iot_watchdog(void *arg)
 {
-	iot_worker_t *worker = arg;
+    xlator_t *this = arg;
+    iot_conf_t *priv = this->private;
+    int i;
+    int bad_times[GF_FOP_PRI_MAX] = {
+        0,
+    };
+    threshold_t thresholds[GF_FOP_PRI_MAX] = {{
+        0,
+    }};
+
+    for (;;) {
+        sleep(max(priv->watchdog_secs / 5, 1));
+        pthread_setcancelstate(PTHREAD_CANCEL_DISABLE, NULL);
+        pthread_mutex_lock(&priv->mutex);
+        for (i = 0; i < GF_FOP_PRI_MAX; ++i) {
+            if (priv->queue_marked[i]) {
+                if (++bad_times[i] >= 5) {
+                    gf_log(this->name, GF_LOG_WARNING, "queue %d stalled", i);
+                    iot_apply_event(this, &thresholds[i]);
+                    /*
+                     * We might not get here if the event
+                     * put us over our threshold.
+                     */
+                    ++(priv->ac_iot_limit[i]);
+                    bad_times[i] = 0;
+                }
+            } else {
+                bad_times[i] = 0;
+            }
+            priv->queue_marked[i] = (priv->queue_sizes[i] > 0);
+        }
+        pthread_mutex_unlock(&priv->mutex);
+        pthread_setcancelstate(PTHREAD_CANCEL_ENABLE, NULL);
+    }
 
-	while (1) {
-		call_stub_t *stub;
+    /* NOTREACHED */
+    return NULL;
+}
 
-		stub = iot_dequeue (worker);
-		call_resume (stub);
-	}
+static void
+start_iot_watchdog(xlator_t *this)
+{
+    iot_conf_t *priv = this->private;
+    int ret;
+
+    if (priv->watchdog_running) {
+        return;
+    }
+
+    ret = pthread_create(&priv->watchdog_thread, NULL, iot_watchdog, this);
+    if (ret == 0) {
+        priv->watchdog_running = _gf_true;
+    } else {
+        gf_log(this->name, GF_LOG_WARNING,
+               "pthread_create(iot_watchdog) failed");
+    }
 }
 
 static void
-workers_init (iot_conf_t *conf)
+stop_iot_watchdog(xlator_t *this)
 {
-	int i;
+    iot_conf_t *priv = this->private;
+
+    if (!priv->watchdog_running) {
+        return;
+    }
+
+    if (pthread_cancel(priv->watchdog_thread) != 0) {
+        gf_log(this->name, GF_LOG_WARNING,
+               "pthread_cancel(iot_watchdog) failed");
+    }
+
+    if (pthread_join(priv->watchdog_thread, NULL) != 0) {
+        gf_log(this->name, GF_LOG_WARNING, "pthread_join(iot_watchdog) failed");
+    }
+
+    /* Failure probably means it's already dead. */
+    priv->watchdog_running = _gf_false;
+}
+
+int
+reconfigure(xlator_t *this, dict_t *options) /* re-read options: 0 or -1 */
+{
+    iot_conf_t *conf = NULL;
+    int ret = -1;
+
+    conf = this->private;
+    if (!conf) /* no private state to update: return -1 */
+        goto out;
+
+    GF_OPTION_RECONF("thread-count", conf->max_count, options, int32, out);
+
+    GF_OPTION_RECONF("high-prio-threads", conf->ac_iot_limit[GF_FOP_PRI_HI],
+                     options, int32, out);
+
+    GF_OPTION_RECONF("normal-prio-threads",
+                     conf->ac_iot_limit[GF_FOP_PRI_NORMAL], options, int32,
+                     out);
+
+    GF_OPTION_RECONF("low-prio-threads", conf->ac_iot_limit[GF_FOP_PRI_LO],
+                     options, int32, out);
 
-	conf->workers.next = &conf->workers;
-	conf->workers.prev = &conf->workers;
+    GF_OPTION_RECONF("least-prio-threads", conf->ac_iot_limit[GF_FOP_PRI_LEAST],
+                     options, int32, out);
 
-	for (i=0; i<conf->thread_count; i++) {
+    GF_OPTION_RECONF("enable-least-priority", conf->least_priority, options,
+                     bool, out);
 
-		iot_worker_t *worker = CALLOC (1, sizeof (*worker));
-		ERR_ABORT (worker);
+    GF_OPTION_RECONF("cleanup-disconnected-reqs",
+                     conf->cleanup_disconnected_reqs, options, bool, out);
 
-		worker->next = &conf->workers;
-		worker->prev = conf->workers.prev;
-		worker->next->prev = worker;
-		worker->prev->next = worker;
+    GF_OPTION_RECONF("watchdog-secs", conf->watchdog_secs, options, int32, out);
 
-		worker->queue.next = &worker->queue;
-		worker->queue.prev = &worker->queue;
+    GF_OPTION_RECONF("pass-through", this->pass_through, options, bool, out);
 
-		pthread_mutex_init (&worker->qlock, NULL);
-		pthread_cond_init (&worker->dq_cond, NULL);
-		worker->conf = conf;
+    if (conf->watchdog_secs > 0) { /* positive timeout: make sure it runs */
+        start_iot_watchdog(this);
+    } else { /* zero or negative: make sure it is stopped */
+        stop_iot_watchdog(this);
+    }
 
-		pthread_create (&worker->thread, NULL, iot_worker, worker);
-	}
+    ret = 0;
+out:
+    return ret;
 }
 
-int32_t 
-init (xlator_t *this)
+int
+init(xlator_t *this) /* build conf, parse options, start workers; 0 on success */
 {
-	iot_conf_t *conf;
-	dict_t *options = this->options;
+    iot_conf_t *conf = NULL;
+    int ret = -1;
+    int i = 0;
 
-	if (!this->children || this->children->next) {
-		gf_log ("io-threads",
-			GF_LOG_ERROR,
-			"FATAL: iot not configured with exactly one child");
-		return -1;
-	}
+    if (!this->children || this->children->next) {
+        gf_smsg("io-threads", GF_LOG_ERROR, 0,
+                IO_THREADS_MSG_XLATOR_CHILD_MISCONFIGURED, NULL);
+        goto out;
+    }
 
-	if (!this->parents) {
-		gf_log (this->name, GF_LOG_WARNING,
-			"dangling volume. check volfile ");
-	}
+    if (!this->parents) {
+        gf_smsg(this->name, GF_LOG_WARNING, 0, IO_THREADS_MSG_VOL_MISCONFIGURED,
+                NULL);
+    }
 
-	conf = (void *) CALLOC (1, sizeof (*conf));
-	ERR_ABORT (conf);
+    conf = (void *)GF_CALLOC(1, sizeof(*conf), gf_iot_mt_iot_conf_t);
+    if (conf == NULL) {
+        gf_smsg(this->name, GF_LOG_ERROR, ENOMEM, IO_THREADS_MSG_OUT_OF_MEMORY,
+                NULL);
+        goto out;
+    }
 
-	conf->thread_count = 1;
+    if ((ret = pthread_cond_init(&conf->cond, NULL)) != 0) {
+        gf_smsg(this->name, GF_LOG_ERROR, 0, IO_THREADS_MSG_PTHREAD_INIT_FAILED,
+                "pthread_cond_init ret=%d", ret, NULL);
+        goto out;
+    }
+    conf->cond_inited = _gf_true;
 
-	if (dict_get (options, "thread-count")) {
-		conf->thread_count = data_to_int32 (dict_get (options,
-							      "thread-count"));
-		gf_log ("io-threads",
-			GF_LOG_DEBUG,
-			"Using conf->thread_count = %d",
-			conf->thread_count);
-	}
+    if ((ret = pthread_mutex_init(&conf->mutex, NULL)) != 0) {
+        gf_smsg(this->name, GF_LOG_ERROR, 0, IO_THREADS_MSG_PTHREAD_INIT_FAILED,
+                "pthread_mutex_init ret=%d", ret, NULL);
+        goto out;
+    }
+    conf->mutex_inited = _gf_true;
 
-	conf->files.next = &conf->files;
-	conf->files.prev = &conf->files;
-	pthread_mutex_init (&conf->files_lock, NULL);
+    ret = set_stack_size(conf);
 
-	workers_init (conf);
+    if (ret != 0)
+        goto out;
 
-	this->private = conf;
-	return 0;
+    ret = -1; /* ret is 0 here; any option failure below must report -1 */
+
+    GF_OPTION_INIT("thread-count", conf->max_count, int32, out);
+
+    GF_OPTION_INIT("high-prio-threads", conf->ac_iot_limit[GF_FOP_PRI_HI],
+                   int32, out);
+
+    GF_OPTION_INIT("normal-prio-threads", conf->ac_iot_limit[GF_FOP_PRI_NORMAL],
+                   int32, out);
+
+    GF_OPTION_INIT("low-prio-threads", conf->ac_iot_limit[GF_FOP_PRI_LO], int32,
+                   out);
+
+    GF_OPTION_INIT("least-prio-threads", conf->ac_iot_limit[GF_FOP_PRI_LEAST],
+                   int32, out);
+
+    GF_OPTION_INIT("idle-time", conf->idle_time, int32, out);
+
+    GF_OPTION_INIT("enable-least-priority", conf->least_priority, bool, out);
+
+    GF_OPTION_INIT("cleanup-disconnected-reqs", conf->cleanup_disconnected_reqs,
+                   bool, out);
+
+    GF_OPTION_INIT("pass-through", this->pass_through, bool, out);
+
+    conf->this = this;
+    GF_ATOMIC_INIT(conf->stub_cnt, 0);
+
+    for (i = 0; i < GF_FOP_PRI_MAX; i++) {
+        INIT_LIST_HEAD(&conf->clients[i]);
+        INIT_LIST_HEAD(&conf->no_client[i].clients);
+        INIT_LIST_HEAD(&conf->no_client[i].reqs);
+    }
+
+    /* Parse the last option while ret is still -1, before any worker runs
+     * and before conf is published: a failure then frees an unshared conf. */
+    conf->watchdog_secs = 0;
+    GF_OPTION_INIT("watchdog-secs", conf->watchdog_secs, int32, out);
+
+    if (!this->pass_through) {
+        ret = iot_workers_scale(conf);
+        if (ret == -1) {
+            gf_smsg(this->name, GF_LOG_ERROR, 0,
+                    IO_THREADS_MSG_WORKER_THREAD_INIT_FAILED, NULL);
+            goto out;
+        }
+    }
+
+    this->private = conf; /* nothing after this point jumps to out */
+    if (conf->watchdog_secs > 0) {
+        start_iot_watchdog(this);
+    }
+    ret = 0;
+out:
+    if (ret)
+        GF_FREE(conf);
+
+    return ret;
+}
+
+/* Mark the xlator as going down and wait until no worker thread is left. */
+static void
+iot_exit_threads(iot_conf_t *conf)
+{
+    pthread_mutex_lock(&conf->mutex);
+    conf->down = _gf_true;
+    /* Broadcast so the threads learn that the xlator is going down. */
+    pthread_cond_broadcast(&conf->cond);
+    /* Block on the same condvar until curr_count has dropped to zero. */
+    while (conf->curr_count != 0)
+        pthread_cond_wait(&conf->cond, &conf->mutex);
+    pthread_mutex_unlock(&conf->mutex);
+}
+
+/* Graph event handler: on PARENT_DOWN/CHILD_DOWN drain or stop the worker
+ * threads as needed, then forward every event through default_notify(). */
+int
+notify(xlator_t *this, int32_t event, void *data, ...)
+{
+    iot_conf_t *conf = this->private;
+    xlator_t *victim = data;
+    struct timespec sleep_till = {0};
+
+    if (GF_EVENT_PARENT_DOWN == event) {
+        if (victim->cleanup_starting) {
+            /* Wait for draining stub from queue before notify PARENT_DOWN */
+            if (GF_ATOMIC_GET(conf->stub_cnt)) {
+                pthread_mutex_lock(&conf->mutex);
+                {
+                    while (GF_ATOMIC_GET(conf->stub_cnt)) {
+                        /* The deadline is absolute, so re-arm it on every
+                         * pass: once a fixed deadline has expired, timedwait
+                         * returns immediately and this loop would busy-spin. */
+                        timespec_now_realtime(&sleep_till);
+                        sleep_till.tv_sec += 1;
+                        (void)pthread_cond_timedwait(&conf->cond, &conf->mutex,
+                                                     &sleep_till);
+                    }
+                }
+                pthread_mutex_unlock(&conf->mutex);
+            }
+
+            gf_log(this->name, GF_LOG_INFO,
+                   "Notify GF_EVENT_PARENT_DOWN for brick %s", victim->name);
+        } else {
+            iot_exit_threads(conf);
+        }
+    }
+
+    if (GF_EVENT_CHILD_DOWN == event) {
+        if (victim->cleanup_starting) {
+            iot_exit_threads(conf);
+            gf_log(this->name, GF_LOG_INFO,
+                   "Notify GF_EVENT_CHILD_DOWN for brick %s", victim->name);
+        }
+    }
+
+    default_notify(this, event, data);
+
+    return 0;
 }
 
 void
-fini (xlator_t *this)
+fini(xlator_t *this)
 {
-	iot_conf_t *conf = this->private;
+    iot_conf_t *conf = this->private;
+
+    if (!conf)
+        return;
+
+    /* Stop the watchdog before tearing anything down: it was started with
+     * `this` and must not outlive conf or its mutex/cond. */
+    stop_iot_watchdog(this);
+
+    if (conf->mutex_inited && conf->cond_inited)
+        iot_exit_threads(conf);
 
-	FREE (conf);
+    if (conf->cond_inited)
+        pthread_cond_destroy(&conf->cond);
 
-	this->private = NULL;
-	return;
+    if (conf->mutex_inited)
+        pthread_mutex_destroy(&conf->mutex);
+
+    GF_FREE(conf);
+    this->private = NULL;
 }
 
-struct xlator_fops fops = {
-	.open        = iot_open,
-	.create      = iot_create,
-	.readv       = iot_readv,
-	.writev      = iot_writev,
-	.flush       = iot_flush,
-	.fsync       = iot_fsync,
-	.lk          = iot_lk,
-	.stat        = iot_stat,
-	.fstat       = iot_fstat,
-	.truncate    = iot_truncate,
-	.ftruncate   = iot_ftruncate,
-	.utimens     = iot_utimens,
-	.checksum    = iot_checksum,
-	.unlink      = iot_unlink,
+/* cbks.client_destroy: drop this xlator's per-client context, if any. */
+int
+iot_client_destroy(xlator_t *this, client_t *client)
+{
+    void *ctx = NULL;
+    int rc = client_ctx_del(client, this, &ctx);
+
+    if (rc == 0)
+        GF_FREE(ctx);
+    return 0;
+}
+
+/* cbks.client_disconnect hook. When cleanup-disconnected-reqs is on, walk the
+ * per-priority no_client request lists under conf->mutex and set the poison
+ * flag on every queued stub that belongs to @client. Always returns 0. */
+static int
+iot_disconnect_cbk(xlator_t *this, client_t *client)
+{
+    iot_conf_t *conf = this->private;
+    iot_client_ctx_t *ctx = NULL;
+    call_stub_t *stub = NULL;
+    call_stub_t *tmp = NULL;
+    int pri = 0;
+
+    if (!conf || !conf->cleanup_disconnected_reqs)
+        return 0;
+
+    pthread_mutex_lock(&conf->mutex);
+    for (pri = 0; pri < GF_FOP_PRI_MAX; pri++) {
+        ctx = &conf->no_client[pri];
+        list_for_each_entry_safe(stub, tmp, &ctx->reqs, list)
+        {
+            if (stub->frame->root->client == client) {
+                gf_log(this->name, GF_LOG_INFO,
+                       "poisoning %s fop at %p for client %s",
+                       gf_fop_list[stub->fop], stub, client->client_uid);
+                stub->poison = _gf_true;
+            }
+        }
+    }
+    pthread_mutex_unlock(&conf->mutex);
+
+    return 0;
+}
+
+struct xlator_dumpops dumpops = {
+    .priv = iot_priv_dump,
 };
 
-struct xlator_mops mops = {
+struct xlator_fops fops = {
+    .open = iot_open,
+    .create = iot_create,
+    .readv = iot_readv,
+    .writev = iot_writev,
+    .flush = iot_flush,
+    .fsync = iot_fsync,
+    .lk = iot_lk,
+    .stat = iot_stat,
+    .fstat = iot_fstat,
+    .truncate = iot_truncate,
+    .ftruncate = iot_ftruncate,
+    .unlink = iot_unlink,
+    .lookup = iot_lookup,
+    .setattr = iot_setattr,
+    .fsetattr = iot_fsetattr,
+    .access = iot_access,
+    .readlink = iot_readlink,
+    .mknod = iot_mknod,
+    .mkdir = iot_mkdir,
+    .rmdir = iot_rmdir,
+    .symlink = iot_symlink,
+    .rename = iot_rename,
+    .link = iot_link,
+    .opendir = iot_opendir,
+    .fsyncdir = iot_fsyncdir,
+    .statfs = iot_statfs,
+    .setxattr = iot_setxattr,
+    .getxattr = iot_getxattr,
+    .fgetxattr = iot_fgetxattr,
+    .fsetxattr = iot_fsetxattr,
+    .removexattr = iot_removexattr,
+    .fremovexattr = iot_fremovexattr,
+    .readdir = iot_readdir,
+    .readdirp = iot_readdirp,
+    .inodelk = iot_inodelk,
+    .finodelk = iot_finodelk,
+    .entrylk = iot_entrylk,
+    .fentrylk = iot_fentrylk,
+    .xattrop = iot_xattrop,
+    .fxattrop = iot_fxattrop,
+    .rchecksum = iot_rchecksum,
+    .fallocate = iot_fallocate,
+    .discard = iot_discard,
+    .zerofill = iot_zerofill,
+    .seek = iot_seek,
+    .lease = iot_lease,
+    .getactivelk = iot_getactivelk,
+    .setactivelk = iot_setactivelk,
+    .put = iot_put,
 };
 
 struct xlator_cbks cbks = {
-	.release = iot_release,
+    .client_destroy = iot_client_destroy,
+    .client_disconnect = iot_disconnect_cbk,
 };
 
 struct volume_options options[] = {
-	{ .key  = {"thread-count"}, 
-	  .type = GF_OPTION_TYPE_INT, 
-	  .min  = 1, 
-	  .max  = 32
-	},
-	{ .key  = {NULL} },
+    {.key = {"thread-count"},
+     .type = GF_OPTION_TYPE_INT,
+     .min = IOT_MIN_THREADS,
+     .max = IOT_MAX_THREADS,
+     .default_value = "16",
+     .op_version = {1},
+     .flags = OPT_FLAG_SETTABLE | OPT_FLAG_DOC,
+     .tags = {"io-threads"},
+     /*.option = "thread-count"*/
+     .description = "Number of threads in IO threads translator which "
+                    "perform concurrent IO operations"
+
+    },
+    {.key = {"high-prio-threads"},
+     .type = GF_OPTION_TYPE_INT,
+     .min = IOT_MIN_THREADS,
+     .max = IOT_MAX_THREADS,
+     .default_value = "16",
+     .op_version = {1},
+     .flags = OPT_FLAG_SETTABLE | OPT_FLAG_DOC,
+     .tags = {"io-threads"},
+     .description = "Max number of threads in IO threads translator which "
+                    "perform high priority IO operations at a given time"
+
+    },
+    {.key = {"normal-prio-threads"},
+     .type = GF_OPTION_TYPE_INT,
+     .min = IOT_MIN_THREADS,
+     .max = IOT_MAX_THREADS,
+     .default_value = "16",
+     .op_version = {1},
+     .flags = OPT_FLAG_SETTABLE | OPT_FLAG_DOC,
+     .tags = {"io-threads"},
+     .description = "Max number of threads in IO threads translator which "
+                    "perform normal priority IO operations at a given time"
+
+    },
+    {.key = {"low-prio-threads"},
+     .type = GF_OPTION_TYPE_INT,
+     .min = IOT_MIN_THREADS,
+     .max = IOT_MAX_THREADS,
+     .default_value = "16",
+     .op_version = {1},
+     .flags = OPT_FLAG_SETTABLE | OPT_FLAG_DOC,
+     .tags = {"io-threads"},
+     .description = "Max number of threads in IO threads translator which "
+                    "perform low priority IO operations at a given time"
+
+    },
+    {.key = {"least-prio-threads"},
+     .type = GF_OPTION_TYPE_INT,
+     .min = IOT_MIN_THREADS,
+     .max = IOT_MAX_THREADS,
+     .default_value = "1",
+     .op_version = {1},
+     .flags = OPT_FLAG_SETTABLE | OPT_FLAG_DOC,
+     .tags = {"io-threads"},
+     .description = "Max number of threads in IO threads translator which "
+                    "perform least priority IO operations at a given time"},
+    {.key = {"enable-least-priority"},
+     .type = GF_OPTION_TYPE_BOOL,
+     .default_value = SITE_H_ENABLE_LEAST_PRIORITY,
+     .op_version = {1},
+     .flags = OPT_FLAG_SETTABLE | OPT_FLAG_DOC,
+     .tags = {"io-threads"},
+     .description = "Enable/Disable least priority"},
+    {
+        .key = {"idle-time"},
+        .type = GF_OPTION_TYPE_INT,
+        .min = 1,
+        .max = 0x7fffffff,
+        .default_value = "120",
+    },
+    {.key = {"watchdog-secs"},
+     .type = GF_OPTION_TYPE_INT,
+     .min = 0,
+     .default_value = "0", /* a string like the other defaults; 0 = off */
+     .op_version = {GD_OP_VERSION_4_1_0},
+     .flags = OPT_FLAG_SETTABLE | OPT_FLAG_DOC,
+     .tags = {"io-threads"},
+     .description = "Number of seconds a queue must be stalled before "
+                    "starting an 'emergency' thread."},
+    {.key = {"cleanup-disconnected-reqs"},
+     .type = GF_OPTION_TYPE_BOOL,
+     .default_value = "off",
+     .op_version = {GD_OP_VERSION_4_1_0},
+     .flags = OPT_FLAG_SETTABLE | OPT_FLAG_DOC | OPT_FLAG_CLIENT_OPT,
+     .tags = {"io-threads"},
+     .description = "'Poison' queued requests when a client disconnects"},
+    {.key = {"pass-through"},
+     .type = GF_OPTION_TYPE_BOOL,
+     .default_value = "false",
+     .op_version = {GD_OP_VERSION_4_1_0},
+     .flags = OPT_FLAG_SETTABLE | OPT_FLAG_DOC | OPT_FLAG_CLIENT_OPT,
+     .tags = {"io-threads"},
+     .description = "Enable/Disable io threads translator"},
+    {
+        .key = {NULL},
+    },
+};
+
+xlator_api_t xlator_api = {
+    .init = init,
+    .fini = fini,
+    .notify = notify,
+    .reconfigure = reconfigure,
+    .mem_acct_init = mem_acct_init,
+    .op_version = {1}, /* Present from the initial version */
+    .dumpops = &dumpops,
+    .fops = &fops,
+    .cbks = &cbks,
+    .options = options,
+    .identifier = "io-threads",
+    .category = GF_MAINTAINED,
 };
diff --git a/xlators/performance/io-threads/src/io-threads.h b/xlators/performance/io-threads/src/io-threads.h
index 7606d0625d5..f54d2f4912d 100644
--- a/xlators/performance/io-threads/src/io-threads.h
+++ b/xlators/performance/io-threads/src/io-threads.h
@@ -1,90 +1,86 @@
 /*
-   Copyright (c) 2006-2009 Z RESEARCH, Inc. <http://www.zresearch.com>
-   This file is part of GlusterFS.
-
-   GlusterFS is free software; you can redistribute it and/or modify
-   it under the terms of the GNU General Public License as published
-   by the Free Software Foundation; either version 3 of the License,
-   or (at your option) any later version.
-
-   GlusterFS is distributed in the hope that it will be useful, but
-   WITHOUT ANY WARRANTY; without even the implied warranty of
-   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
-   General Public License for more details.
-
-   You should have received a copy of the GNU General Public License
-   along with this program.  If not, see
-   <http://www.gnu.org/licenses/>.
+  Copyright (c) 2008-2012 Red Hat, Inc. <http://www.redhat.com>
+  This file is part of GlusterFS.
+
+  This file is licensed to you under your choice of the GNU Lesser
+  General Public License, version 3 or any later version (LGPLv3 or
+  later), or the GNU General Public License, version 2 (GPLv2), in all
+  cases as published by the Free Software Foundation.
 */
 
 #ifndef __IOT_H
 #define __IOT_H
 
-#ifndef _CONFIG_H
-#define _CONFIG_H
-#include "config.h"
-#endif
-
-
-#include "compat-errno.h"
-#include "glusterfs.h"
-#include "logging.h"
-#include "dict.h"
-#include "xlator.h"
-#include "common-utils.h"
-
-#define min(a,b) ((a)<(b)?(a):(b))
-#define max(a,b) ((a)>(b)?(a):(b))
+#include <glusterfs/compat-errno.h>
+#include <glusterfs/glusterfs.h>
+#include <glusterfs/logging.h>
+#include <glusterfs/dict.h>
+#include <glusterfs/xlator.h>
+#include <glusterfs/common-utils.h>
+#include <glusterfs/list.h>
+#include <stdlib.h>
+#include <glusterfs/locking.h>
+#include "iot-mem-types.h"
+#include <semaphore.h>
+#include <glusterfs/statedump.h>
 
 struct iot_conf;
-struct iot_worker;
-struct iot_queue;
-struct iot_local;
-struct iot_file;
-
-struct iot_local {
-  struct iot_file *file;
-  size_t frame_size;
-};
 
-struct iot_queue {
-  struct iot_queue *next, *prev;
-  call_stub_t *stub;
-};
+#define MAX_IDLE_SKEW 4 /* In secs */
+#define skew_sec_idle_time(sec) ((sec) + (random() % MAX_IDLE_SKEW))
+#define IOT_DEFAULT_IDLE 120 /* In secs. */
 
-struct iot_worker {
-  struct iot_worker *next, *prev;
-  struct iot_queue queue;
-  struct iot_conf *conf;
-  int64_t q,dq;
-  pthread_cond_t dq_cond;
-  pthread_mutex_t qlock;
-  int32_t fd_count;
-  int32_t queue_size;
-  pthread_t thread;
-};
+#define IOT_MIN_THREADS 1
+#define IOT_DEFAULT_THREADS 16
+#define IOT_MAX_THREADS 64
 
-struct iot_file {
-  struct iot_file *next, *prev; /* all open files via this xlator */
-  struct iot_worker *worker;
-  fd_t *fd;
-  int32_t pending_ops;
-};
+#define IOT_THREAD_STACK_SIZE ((size_t)(256 * 1024))
 
-struct iot_conf {
-  int32_t thread_count;
-  int32_t misc_thread_index;  /* Used to schedule the miscellaneous calls like checksum */
-  struct iot_worker workers;
-  struct iot_file files;
-  pthread_mutex_t files_lock;
+typedef struct {
+    struct list_head clients;
+    struct list_head reqs;
+} iot_client_ctx_t;
 
-  pthread_cond_t q_cond;
+struct iot_conf {
+    pthread_mutex_t mutex; /* held around down/curr_count and queue walks */
+    pthread_cond_t cond;   /* broadcast on shutdown; waited on while draining */
+
+    int32_t max_count;  /* configured maximum ("thread-count") */
+    int32_t curr_count; /* actual number of threads running */
+    int32_t sleep_count;
+
+    int32_t idle_time; /* in seconds ("idle-time") */
+
+    struct list_head clients[GF_FOP_PRI_MAX];
+    /*
+     * It turns out that there are several ways a frame can get to us
+     * without having an associated client (server_first_lookup was the
+     * first one I hit).  Instead of trying to update all such callers,
+     * we use this to queue them.
+     */
+    iot_client_ctx_t no_client[GF_FOP_PRI_MAX];
+
+    int32_t ac_iot_limit[GF_FOP_PRI_MAX]; /* "<prio>-prio-threads" options */
+    int32_t ac_iot_count[GF_FOP_PRI_MAX];
+    int queue_sizes[GF_FOP_PRI_MAX];
+    int32_t queue_size;
+    gf_atomic_t stub_cnt; /* notify() waits for this to reach 0 */
+    pthread_attr_t w_attr;
+    gf_boolean_t least_priority; /*Enable/Disable least-priority */
+
+    xlator_t *this; /* back-pointer, assigned in init() */
+    size_t stack_size;
+    gf_boolean_t down; /* set by iot_exit_threads(): xlator is going down */
+    gf_boolean_t mutex_inited; /* pthread_mutex_init() succeeded in init() */
+    gf_boolean_t cond_inited;  /* pthread_cond_init() succeeded in init() */
+
+    int32_t watchdog_secs; /* "watchdog-secs"; watchdog runs only if > 0 */
+    gf_boolean_t watchdog_running; /* set after pthread_create(), cleared on stop */
+    pthread_t watchdog_thread;
+    gf_boolean_t queue_marked[GF_FOP_PRI_MAX];
+    gf_boolean_t cleanup_disconnected_reqs; /* poison reqs on disconnect */
 };
 
-typedef struct iot_file iot_file_t;
 typedef struct iot_conf iot_conf_t;
-typedef struct iot_local iot_local_t;
-typedef struct iot_worker iot_worker_t;
-typedef struct iot_queue iot_queue_t;
 
 #endif /* __IOT_H */
diff --git a/xlators/performance/io-threads/src/iot-mem-types.h b/xlators/performance/io-threads/src/iot-mem-types.h
new file mode 100644
index 00000000000..29565f34dd4
--- /dev/null
+++ b/xlators/performance/io-threads/src/iot-mem-types.h
@@ -0,0 +1,21 @@
+/*
+  Copyright (c) 2008-2012 Red Hat, Inc. <http://www.redhat.com>
+  This file is part of GlusterFS.
+
+  This file is licensed to you under your choice of the GNU Lesser
+  General Public License, version 3 or any later version (LGPLv3 or
+  later), or the GNU General Public License, version 2 (GPLv2), in all
+  cases as published by the Free Software Foundation.
+*/
+
+#ifndef __IOT_MEM_TYPES_H__
+#define __IOT_MEM_TYPES_H__
+
+#include <glusterfs/mem-types.h>
+
+enum gf_iot_mem_types_ {
+    gf_iot_mt_iot_conf_t = gf_common_mt_end + 1,
+    gf_iot_mt_client_ctx_t,
+    gf_iot_mt_end
+};
+#endif
diff --git a/xlators/performance/md-cache/Makefile.am b/xlators/performance/md-cache/Makefile.am
new file mode 100644
index 00000000000..af437a64d6d
--- /dev/null
+++ b/xlators/performance/md-cache/Makefile.am
@@ -0,0 +1 @@
+SUBDIRS = src
diff --git a/xlators/performance/md-cache/src/Makefile.am b/xlators/performance/md-cache/src/Makefile.am
new file mode 100644
index 00000000000..447ff0f30f0
--- /dev/null
+++ b/xlators/performance/md-cache/src/Makefile.am
@@ -0,0 +1,29 @@
+xlator_LTLIBRARIES = md-cache.la
+xlatordir = $(libdir)/glusterfs/$(PACKAGE_VERSION)/xlator/performance
+
+md_cache_la_LDFLAGS = -module $(GF_XLATOR_DEFAULT_LDFLAGS)
+
+md_cache_la_SOURCES = md-cache.c
+md_cache_la_LIBADD = $(top_builddir)/libglusterfs/src/libglusterfs.la
+
+noinst_HEADERS = md-cache-mem-types.h md-cache-messages.h
+
+AM_CPPFLAGS = $(GF_CPPFLAGS) -I$(top_srcdir)/libglusterfs/src \
+	-I$(top_srcdir)/rpc/xdr/src -I$(top_builddir)/rpc/xdr/src \
+	-I$(CONTRIBDIR)/rbtree
+
+AM_CFLAGS = -Wall $(GF_CFLAGS)
+
+CLEANFILES =
+
+
+stat-prefetch-compat:
+	mkdir -p $(DESTDIR)$(libdir)/glusterfs/$(PACKAGE_VERSION)/xlator/performance
+	rm -rf $(DESTDIR)$(libdir)/glusterfs/$(PACKAGE_VERSION)/xlator/performance/stat-prefetch.so
+	ln -s ./md-cache.so $(DESTDIR)$(libdir)/glusterfs/$(PACKAGE_VERSION)/xlator/performance/stat-prefetch.so
+
+
+install-exec-local: stat-prefetch-compat
+
+uninstall-local:
+	rm -f $(DESTDIR)$(libdir)/glusterfs/$(PACKAGE_VERSION)/xlator/performance/stat-prefetch.so
diff --git a/xlators/performance/md-cache/src/md-cache-mem-types.h b/xlators/performance/md-cache/src/md-cache-mem-types.h
new file mode 100644
index 00000000000..47a07005717
--- /dev/null
+++ b/xlators/performance/md-cache/src/md-cache-mem-types.h
@@ -0,0 +1,23 @@
+/*
+  Copyright (c) 2008-2012 Red Hat, Inc. <http://www.redhat.com>
+  This file is part of GlusterFS.
+
+  This file is licensed to you under your choice of the GNU Lesser
+  General Public License, version 3 or any later version (LGPLv3 or
+  later), or the GNU General Public License, version 2 (GPLv2), in all
+  cases as published by the Free Software Foundation.
+*/
+
+#ifndef __MDC_MEM_TYPES_H__
+#define __MDC_MEM_TYPES_H__
+
+#include <glusterfs/mem-types.h>
+
+enum gf_mdc_mem_types_ {
+    gf_mdc_mt_mdc_local_t = gf_common_mt_end + 1,
+    gf_mdc_mt_md_cache_t,
+    gf_mdc_mt_mdc_conf_t,
+    gf_mdc_mt_mdc_ipc,
+    gf_mdc_mt_end
+};
+#endif
diff --git a/xlators/performance/md-cache/src/md-cache-messages.h b/xlators/performance/md-cache/src/md-cache-messages.h
new file mode 100644
index 00000000000..f367bad1991
--- /dev/null
+++ b/xlators/performance/md-cache/src/md-cache-messages.h
@@ -0,0 +1,29 @@
+/*Copyright (c) 2015 Red Hat, Inc. <http://www.redhat.com>
+  This file is part of GlusterFS.
+
+  This file is licensed to you under your choice of the GNU Lesser
+  General Public License, version 3 or any later version (LGPLv3 or
+  later), or the GNU General Public License, version 2 (GPLv2), in all
+  cases as published by the Free Software Foundation.
+*/
+
+#ifndef _MD_CACHE_MESSAGES_H_
+#define _MD_CACHE_MESSAGES_H_
+
+#include <glusterfs/glfs-message-id.h>
+
+/* To add new message IDs, append new identifiers at the end of the list.
+ *
+ * Never remove a message ID. If it's not used anymore, you can rename it or
+ * leave it as it is, but do not delete it. This prevents the ID from being
+ * reused by other messages.
+ *
+ * The component name must match one of the entries defined in
+ * glfs-message-id.h.
+ */
+
+GLFS_MSGID(MD_CACHE, MD_CACHE_MSG_NO_MEMORY, MD_CACHE_MSG_DISCARD_UPDATE,
+           MD_CACHE_MSG_CACHE_UPDATE, MD_CACHE_MSG_IPC_UPCALL_FAILED,
+           MD_CACHE_MSG_NO_XATTR_CACHE);
+
+#endif /* _MD_CACHE_MESSAGES_H_ */
diff --git a/xlators/performance/md-cache/src/md-cache.c b/xlators/performance/md-cache/src/md-cache.c
new file mode 100644
index 00000000000..a405be51f02
--- /dev/null
+++ b/xlators/performance/md-cache/src/md-cache.c
@@ -0,0 +1,4020 @@
+/*
+  Copyright (c) 2008-2012 Red Hat, Inc. <http://www.redhat.com>
+  This file is part of GlusterFS.
+
+  This file is licensed to you under your choice of the GNU Lesser
+  General Public License, version 3 or any later version (LGPLv3 or
+  later), or the GNU General Public License, version 2 (GPLv2), in all
+  cases as published by the Free Software Foundation.
+*/
+
+#include <glusterfs/glusterfs.h>
+#include <glusterfs/defaults.h>
+#include <glusterfs/logging.h>
+#include <glusterfs/dict.h>
+#include <glusterfs/xlator.h>
+#include <glusterfs/syncop.h>
+#include "md-cache-mem-types.h"
+#include <glusterfs/compat-errno.h>
+#include <glusterfs/glusterfs-acl.h>
+#include <glusterfs/defaults.h>
+#include <glusterfs/upcall-utils.h>
+#include <assert.h>
+#include <sys/time.h>
+#include "md-cache-messages.h"
+#include <glusterfs/statedump.h>
+#include <glusterfs/atomic.h>
+
+/* TODO:
+   - cache symlink() link names and nuke symlink-cache
+   - send proper postbuf in setattr_cbk even when op_ret = -1
+*/
+
+struct mdc_statfs_cache {
+    pthread_mutex_t lock;
+    time_t last_refreshed; /* (time_t)-1 if not yet initialized. */
+    struct statvfs buf;
+};
+
+struct mdc_statistics {
+    gf_atomic_t stat_hit; /* No. of times lookup/stat was served from
+                             mdc */
+
+    gf_atomic_t stat_miss; /* No. of times valid stat wasn't present in
+                              mdc */
+
+    gf_atomic_t xattr_hit; /* No. of times getxattr was served from mdc,
+                              Note: this doesn't count the xattr served
+                              from lookup */
+
+    gf_atomic_t xattr_miss;      /* No. of times xattr req was WIND from mdc */
+    gf_atomic_t negative_lookup; /* No. of negative lookups */
+    gf_atomic_t nameless_lookup; /* No. of negative lookups that were sent
+                                    to bricks */
+
+    gf_atomic_t stat_invals;  /* No. of invalidates received from upcall */
+    gf_atomic_t xattr_invals; /* No. of invalidates received from upcall */
+    gf_atomic_t need_lookup;  /* No. of lookups issued, because other
+                                 xlators requested for explicit lookup */
+};
+
+struct mdc_conf {
+    uint32_t timeout;
+    gf_boolean_t cache_posix_acl;
+    gf_boolean_t cache_glusterfs_acl;
+    gf_boolean_t cache_selinux;
+    gf_boolean_t cache_capability;
+    gf_boolean_t cache_ima;
+    gf_boolean_t force_readdirp;
+    gf_boolean_t cache_swift_metadata;
+    gf_boolean_t cache_samba_metadata;
+    gf_boolean_t mdc_invalidation;
+    gf_boolean_t global_invalidation;
+
+    time_t last_child_down;
+    gf_lock_t lock;
+    struct mdc_statistics mdc_counter;
+    gf_boolean_t cache_statfs;
+    struct mdc_statfs_cache statfs_cache;
+    char *mdc_xattr_str;
+    gf_atomic_int32_t generation;
+};
+
+struct mdc_local;
+typedef struct mdc_local mdc_local_t;
+
+#define MDC_STACK_UNWIND(fop, frame, params...)                                \
+    do {                                                                       \
+        mdc_local_t *__local = NULL;                                           \
+        xlator_t *__xl = NULL;                                                 \
+        if (frame) {                                                           \
+            __xl = frame->this;                                                \
+            __local = frame->local;                                            \
+            frame->local = NULL;                                               \
+        }                                                                      \
+        STACK_UNWIND_STRICT(fop, frame, params);                               \
+        mdc_local_wipe(__xl, __local);                                         \
+    } while (0)
+
+struct md_cache {
+    ia_prot_t md_prot;
+    uint32_t md_nlink;
+    uint32_t md_uid;
+    uint32_t md_gid;
+    uint32_t md_atime_nsec;
+    uint32_t md_mtime_nsec;
+    uint32_t md_ctime_nsec;
+    int64_t md_atime;
+    int64_t md_mtime;
+    int64_t md_ctime;
+    uint64_t md_rdev;
+    uint64_t md_size;
+    uint64_t md_blocks;
+    uint64_t generation; /* read by mdc_get_generation(); zeroed on rollover */
+    dict_t *xattr;       /* dict reference, dropped in mdc_inode_wipe() */
+    char *linkname;      /* GF_FREE'd in mdc_inode_wipe() */
+    time_t ia_time;      /* zeroed when the generation counter rolls over */
+    time_t xa_time;
+    gf_boolean_t need_lookup;
+    gf_boolean_t valid;
+    gf_boolean_t gen_rollover; /* flipped on counter rollover; ORed into bit 32 */
+    gf_boolean_t invalidation_rollover;
+    gf_lock_t lock; /* LOCK_INIT'ed in mdc_inode_prep() */
+};
+
+struct mdc_local {
+    loc_t loc;      /* loc_wipe'd in mdc_local_wipe() */
+    loc_t loc2;     /* loc_wipe'd in mdc_local_wipe() */
+    fd_t *fd;       /* fd_unref'd in mdc_local_wipe() when set */
+    char *linkname; /* GF_FREE'd in mdc_local_wipe() */
+    char *key;      /* GF_FREE'd in mdc_local_wipe() */
+    dict_t *xattr;  /* dict_unref'd in mdc_local_wipe() when set */
+    uint64_t incident_time; /* mdc_get_generation() value at allocation */
+    bool update_cache;
+};
+
+/* Fetch this xlator's md_cache from @inode's context (the callers in this
+ * file hold inode->lock). On success (0) stores it in *@mdc_p if non-NULL. */
+int
+__mdc_inode_ctx_get(xlator_t *this, inode_t *inode, struct md_cache **mdc_p)
+{
+    uint64_t raw = 0;
+    int rc = __inode_ctx_get(inode, this, &raw);
+
+    if (rc != 0 || mdc_p == NULL)
+        return rc;
+
+    *mdc_p = (void *)(long)raw;
+    return rc;
+}
+
+/* Locked variant of __mdc_inode_ctx_get(): performs the lookup with
+ * inode->lock held. A NULL @inode yields -1 and leaves *@mdc_p untouched;
+ * otherwise the helper's return value is passed through. */
+int
+mdc_inode_ctx_get(xlator_t *this, inode_t *inode, struct md_cache **mdc_p)
+{
+    int rc = -1;
+
+    if (inode == NULL)
+        return rc;
+
+    LOCK(&inode->lock);
+    rc = __mdc_inode_ctx_get(this, inode, mdc_p);
+    UNLOCK(&inode->lock);
+
+    return rc;
+}
+
+/* Bump conf->generation and return it with mdc->gen_rollover folded into
+ * bit 32. A counter value of 0 is skipped: the rollover flag is flipped and
+ * mdc's ia_time/generation are reset. The caller here holds mdc->lock. */
+uint64_t
+__mdc_inc_generation(xlator_t *this, struct md_cache *mdc)
+{
+    struct mdc_conf *conf = this->private;
+    uint64_t gen = GF_ATOMIC_INC(conf->generation);
+    uint64_t rollover = 0;
+
+    if (gen == 0) {
+        mdc->gen_rollover = !mdc->gen_rollover;
+        gen = GF_ATOMIC_INC(conf->generation);
+        mdc->ia_time = 0;
+        mdc->generation = 0;
+    }
+
+    rollover = mdc->gen_rollover;
+    return gen | (rollover << 32);
+}
+
+/* Advance the generation counter on behalf of @inode. With a cached md_cache
+ * the bump goes through __mdc_inc_generation() under mdc->lock; without one
+ * only the xlator-wide counter is incremented (skipping the value 0). */
+uint64_t
+mdc_inc_generation(xlator_t *this, inode_t *inode)
+{
+    struct mdc_conf *conf = this->private;
+    struct md_cache *mdc = NULL;
+    uint64_t gen = 0;
+
+    mdc_inode_ctx_get(this, inode, &mdc);
+
+    if (!mdc) {
+        gen = GF_ATOMIC_INC(conf->generation);
+        if (gen == 0) {
+            gen = GF_ATOMIC_INC(conf->generation);
+        }
+        return gen;
+    }
+
+    LOCK(&mdc->lock);
+    gen = __mdc_inc_generation(this, mdc);
+    UNLOCK(&mdc->lock);
+
+    return gen;
+}
+
+/* Return the generation stored in @inode's md_cache (read under mdc->lock),
+ * or the current xlator-wide counter when the inode has no md_cache. */
+uint64_t
+mdc_get_generation(xlator_t *this, inode_t *inode)
+{
+    struct mdc_conf *conf = this->private;
+    struct md_cache *mdc = NULL;
+    uint64_t gen = 0;
+
+    mdc_inode_ctx_get(this, inode, &mdc);
+
+    if (!mdc)
+        return GF_ATOMIC_GET(conf->generation);
+
+    LOCK(&mdc->lock);
+    {
+        gen = mdc->generation;
+    }
+    UNLOCK(&mdc->lock);
+
+    return gen;
+}
+
+/* Store @mdc as this xlator's context value on @inode. The callers in this
+ * file invoke it with inode->lock held. Returns whatever
+ * __inode_ctx_set() returns. */
+int
+__mdc_inode_ctx_set(xlator_t *this, inode_t *inode, struct md_cache *mdc)
+{
+    uint64_t raw = 0;
+
+    raw = (long)mdc;
+    return __inode_ctx_set(inode, this, &raw);
+}
+
+/* Locked wrapper around __mdc_inode_ctx_set(). Mirrors mdc_inode_ctx_get():
+ * a NULL @inode yields -1 instead of being dereferenced for its lock. */
+int
+mdc_inode_ctx_set(xlator_t *this, inode_t *inode, struct md_cache *mdc)
+{
+    int ret = -1;
+    if (!inode)
+        return ret;
+    LOCK(&inode->lock);
+    ret = __mdc_inode_ctx_set(this, inode, mdc);
+    UNLOCK(&inode->lock);
+    return ret;
+}
+
+/* Return frame->local, allocating and attaching a zeroed mdc_local_t on first
+ * use (incident_time = mdc_get_generation()). NULL if the allocation fails. */
+mdc_local_t *
+mdc_local_get(call_frame_t *frame, inode_t *inode)
+{
+    mdc_local_t *local = frame->local;
+
+    if (local != NULL)
+        return local;
+
+    local = GF_CALLOC(sizeof(*local), 1, gf_mdc_mt_mdc_local_t);
+    if (local == NULL)
+        return NULL;
+
+    local->incident_time = mdc_get_generation(frame->this, inode);
+    frame->local = local;
+    return local;
+}
+
+/* Release everything owned by @local (both locs, the fd reference, linkname,
+ * key, the xattr dict reference) and then @local itself. NULL is a no-op.
+ * @this is not used. */
+void
+mdc_local_wipe(xlator_t *this, mdc_local_t *local)
+{
+    if (local == NULL)
+        return;
+
+    /* Embedded locs first. */
+    loc_wipe(&local->loc);
+    loc_wipe(&local->loc2);
+
+    /* Counted references. */
+    if (local->fd != NULL)
+        fd_unref(local->fd);
+    if (local->xattr != NULL)
+        dict_unref(local->xattr);
+
+    GF_FREE(local->linkname);
+    GF_FREE(local->key);
+    GF_FREE(local);
+}
+
+/* Detach and destroy this xlator's md-cache context of an inode.
+ *
+ * Returns the non-zero result of inode_ctx_del() when no context could be
+ * removed, 0 after the context (xattr dict, linkname, structure) is freed.
+ */
+int
+mdc_inode_wipe(xlator_t *this, inode_t *inode)
+{
+    uint64_t ctx_value = 0;
+    struct md_cache *mdc = NULL;
+    int ret = 0;
+
+    ret = inode_ctx_del(inode, this, &ctx_value);
+    if (ret != 0)
+        return ret;
+
+    /* The context value is the md_cache pointer stored by ctx_set. */
+    mdc = (void *)(long)ctx_value;
+
+    if (mdc->xattr != NULL)
+        dict_unref(mdc->xattr);
+
+    GF_FREE(mdc->linkname);
+    GF_FREE(mdc);
+
+    return 0;
+}
+
+/* Return the md-cache context of an inode, creating it if necessary.
+ *
+ * The lookup, allocation and attachment all happen under inode->lock.
+ * Returns the existing or newly attached context, or NULL when either the
+ * allocation or __mdc_inode_ctx_set() fails.
+ */
+struct md_cache *
+mdc_inode_prep(xlator_t *this, inode_t *inode)
+{
+    int ret = 0;
+    struct md_cache *mdc = NULL;
+
+    LOCK(&inode->lock);
+    {
+        /* Fast path: a context is already attached. */
+        ret = __mdc_inode_ctx_get(this, inode, &mdc);
+        if (ret == 0)
+            goto unlock;
+
+        mdc = GF_CALLOC(sizeof(*mdc), 1, gf_mdc_mt_md_cache_t);
+        if (!mdc) {
+            gf_msg(this->name, GF_LOG_ERROR, ENOMEM, MD_CACHE_MSG_NO_MEMORY,
+                   "out of memory");
+            goto unlock;
+        }
+
+        LOCK_INIT(&mdc->lock);
+
+        ret = __mdc_inode_ctx_set(this, inode, mdc);
+        if (ret) {
+            /* NOTE(review): this failure is reported as ENOMEM /
+             * "out of memory" although it comes from the ctx set call;
+             * the lock initialised above is not destroyed before the
+             * free - confirm whether LOCK_DESTROY is needed here. */
+            gf_msg(this->name, GF_LOG_ERROR, ENOMEM, MD_CACHE_MSG_NO_MEMORY,
+                   "out of memory");
+            GF_FREE(mdc);
+            mdc = NULL;
+        }
+    }
+unlock:
+    UNLOCK(&inode->lock);
+
+    return mdc;
+}
+
+/* A cached entry stamped with mdc_time is valid when all of these hold:
+ * - it has a timestamp at all (mdc_time != 0);
+ * - it was not cached before the most recent brick-down event (when a
+ *   brick goes down, everything cached earlier is treated as invalid);
+ * - the configured cache timeout has not yet expired.
+ */
+static gf_boolean_t
+__is_cache_valid(xlator_t *this, time_t mdc_time)
+{
+    struct mdc_conf *conf = this->private;
+    time_t last_child_down = 0;
+    uint32_t timeout = 0;
+
+    /* conf->lock is deliberately not taken here so that multi-threaded
+     * IO does not contend on a global lock. Writers do take the lock, so
+     * the stored values stay intact; a read of last_child_down may
+     * return junk, but only for a very short period of time.
+     */
+    last_child_down = conf->last_child_down;
+    timeout = conf->timeout;
+
+    /* Never cached, or explicitly invalidated. */
+    if (mdc_time == 0)
+        return _gf_false;
+
+    /* Cached before the last brick-down event. */
+    if ((last_child_down != 0) && (mdc_time < last_child_down))
+        return _gf_false;
+
+    /* Timed out. */
+    if (gf_time() >= (mdc_time + timeout))
+        return _gf_false;
+
+    return _gf_true;
+}
+
+/* Tell whether the cached iatt of this context may be served.
+ *
+ * Evaluated under mdc->lock. An entry flagged as not valid is rejected
+ * outright; otherwise the ia_time stamp is checked with
+ * __is_cache_valid(), and a stale entry gets ia_time and generation
+ * reset to 0.
+ */
+static gf_boolean_t
+is_md_cache_iatt_valid(xlator_t *this, struct md_cache *mdc)
+{
+    gf_boolean_t usable = _gf_false;
+
+    LOCK(&mdc->lock);
+    {
+        if (mdc->valid != _gf_false) {
+            usable = __is_cache_valid(this, mdc->ia_time);
+            if (usable == _gf_false) {
+                mdc->ia_time = 0;
+                mdc->generation = 0;
+            }
+        }
+    }
+    UNLOCK(&mdc->lock);
+
+    return usable;
+}
+
+/* Tell whether the cached xattrs of this context may be served.
+ *
+ * Checks xa_time with __is_cache_valid() under mdc->lock and resets
+ * xa_time to 0 when the entry turns out to be stale.
+ */
+static gf_boolean_t
+is_md_cache_xatt_valid(xlator_t *this, struct md_cache *mdc)
+{
+    gf_boolean_t usable = _gf_true;
+
+    LOCK(&mdc->lock);
+    {
+        usable = __is_cache_valid(this, mdc->xa_time);
+        if (!usable) {
+            mdc->xa_time = 0;
+        }
+    }
+    UNLOCK(&mdc->lock);
+
+    return usable;
+}
+
+/* Copy the cacheable fields of an iatt into the md_cache entry.
+ *
+ * Only the fields listed here are cached; no locking is done in this
+ * function.
+ */
+void
+mdc_from_iatt(struct md_cache *mdc, struct iatt *iatt)
+{
+    /* Ownership, permissions and link count. */
+    mdc->md_prot = iatt->ia_prot;
+    mdc->md_uid = iatt->ia_uid;
+    mdc->md_gid = iatt->ia_gid;
+    mdc->md_nlink = iatt->ia_nlink;
+
+    /* Timestamps (seconds and nanoseconds). */
+    mdc->md_atime = iatt->ia_atime;
+    mdc->md_atime_nsec = iatt->ia_atime_nsec;
+    mdc->md_mtime = iatt->ia_mtime;
+    mdc->md_mtime_nsec = iatt->ia_mtime_nsec;
+    mdc->md_ctime = iatt->ia_ctime;
+    mdc->md_ctime_nsec = iatt->ia_ctime_nsec;
+
+    /* Device number and size information. */
+    mdc->md_rdev = iatt->ia_rdev;
+    mdc->md_size = iatt->ia_size;
+    mdc->md_blocks = iatt->ia_blocks;
+}
+
+/* Fill an iatt from the fields held in the md_cache entry.
+ *
+ * Inverse of mdc_from_iatt(): only the cached fields are written, every
+ * other member of *iatt is left as the caller provided it.
+ */
+void
+mdc_to_iatt(struct md_cache *mdc, struct iatt *iatt)
+{
+    /* Ownership, permissions and link count. */
+    iatt->ia_prot = mdc->md_prot;
+    iatt->ia_uid = mdc->md_uid;
+    iatt->ia_gid = mdc->md_gid;
+    iatt->ia_nlink = mdc->md_nlink;
+
+    /* Timestamps (seconds and nanoseconds). */
+    iatt->ia_atime = mdc->md_atime;
+    iatt->ia_atime_nsec = mdc->md_atime_nsec;
+    iatt->ia_mtime = mdc->md_mtime;
+    iatt->ia_mtime_nsec = mdc->md_mtime_nsec;
+    iatt->ia_ctime = mdc->md_ctime;
+    iatt->ia_ctime_nsec = mdc->md_ctime_nsec;
+
+    /* Device number and size information. */
+    iatt->ia_rdev = mdc->md_rdev;
+    iatt->ia_size = mdc->md_size;
+    iatt->ia_blocks = mdc->md_blocks;
+}
+
+/* Update (or invalidate) the cached iatt of an inode.
+ *
+ * this          - md-cache xlator
+ * inode         - inode whose cache entry is updated; a context is created
+ *                 through mdc_inode_prep() if missing
+ * prebuf        - attributes before the operation, may be NULL
+ * iatt          - new attributes; NULL or a zero ia_ctime invalidates the
+ *                 cached iatt instead of updating it
+ * update_time   - when true, ia_time is refreshed on a successful update
+ * incident_time - generation sampled when the operation started; the upper
+ *                 32 bits are compared with mdc->gen_rollover, the lower
+ *                 32 bits with mdc->generation
+ *
+ * Returns 0, or -1 when no context could be prepared or when the update is
+ * discarded because the cached ctime is newer than the incoming one. Note
+ * that a generation mismatch only skips the update and still returns 0.
+ */
+int
+mdc_inode_iatt_set_validate(xlator_t *this, inode_t *inode, struct iatt *prebuf,
+                            struct iatt *iatt, gf_boolean_t update_time,
+                            uint64_t incident_time)
+{
+    int ret = 0;
+    struct md_cache *mdc = NULL;
+    uint32_t rollover = 0;
+    uint64_t gen = 0;
+    gf_boolean_t update_xa_time = _gf_false;
+    struct mdc_conf *conf = this->private;
+
+    mdc = mdc_inode_prep(this, inode);
+    if (!mdc) {
+        ret = -1;
+        goto out;
+    }
+
+    /* Split the 64-bit stamp into rollover count and generation. */
+    rollover = incident_time >> 32;
+    incident_time = (incident_time & 0xffffffff);
+
+    LOCK(&mdc->lock);
+    {
+        /* No usable attributes: invalidate and bump the generation so
+         * that updates sampled earlier are rejected below. */
+        if (!iatt || !iatt->ia_ctime) {
+            gf_msg_callingfn("md-cache", GF_LOG_TRACE, 0, 0,
+                             "invalidating iatt(NULL)"
+                             "(%s)",
+                             uuid_utoa(inode->gfid));
+            mdc->ia_time = 0;
+            mdc->valid = 0;
+
+            gen = __mdc_inc_generation(this, mdc);
+            mdc->generation = (gen & 0xffffffff);
+            goto unlock;
+        }
+
+        /* There could be a race in invalidation, where the
+         * invalidations in order A, B reaches md-cache in the order
+         * B, A. Hence, make sure the invalidation A is discarded if
+         * it comes after B. ctime of a file is always in ascending
+         * order unlike atime and mtime(which can be changed by user
+         * to any date), also ctime gets updates when atime/mtime
+         * changes, hence check for ctime only.
+         */
+        if (mdc->md_ctime > iatt->ia_ctime) {
+            gf_msg_callingfn(this->name, GF_LOG_DEBUG, EINVAL,
+                             MD_CACHE_MSG_DISCARD_UPDATE,
+                             "discarding the iatt validate "
+                             "request (%s)",
+                             uuid_utoa(inode->gfid));
+            ret = -1;
+            goto unlock;
+        }
+        /* Same second: fall back to comparing the nanosecond part. */
+        if ((mdc->md_ctime == iatt->ia_ctime) &&
+            (mdc->md_ctime_nsec > iatt->ia_ctime_nsec)) {
+            gf_msg_callingfn(this->name, GF_LOG_DEBUG, EINVAL,
+                             MD_CACHE_MSG_DISCARD_UPDATE,
+                             "discarding the iatt validate "
+                             "request(ctime_nsec) (%s)",
+                             uuid_utoa(inode->gfid));
+            ret = -1;
+            goto unlock;
+        }
+
+        /*
+         * Invalidate the inode if the mtime or ctime has changed
+         * and the prebuf doesn't match the value we have cached.
+         * TODO: writev returns with a NULL iatt due to
+         * performance/write-behind, causing invalidation on writes.
+         */
+        if ((iatt->ia_mtime != mdc->md_mtime) ||
+            (iatt->ia_mtime_nsec != mdc->md_mtime_nsec) ||
+            (iatt->ia_ctime != mdc->md_ctime) ||
+            (iatt->ia_ctime_nsec != mdc->md_ctime_nsec)) {
+            if (conf->global_invalidation &&
+                (!prebuf || (prebuf->ia_mtime != mdc->md_mtime) ||
+                 (prebuf->ia_mtime_nsec != mdc->md_mtime_nsec) ||
+                 (prebuf->ia_ctime != mdc->md_ctime) ||
+                 (prebuf->ia_ctime_nsec != mdc->md_ctime_nsec))) {
+                /* Only regular files are invalidated upwards. */
+                if (IA_ISREG(inode->ia_type)) {
+                    gf_msg("md-cache", GF_LOG_TRACE, 0,
+                           MD_CACHE_MSG_DISCARD_UPDATE,
+                           "prebuf doesn't match the value we have cached,"
+                           " invalidate the inode(%s)",
+                           uuid_utoa(inode->gfid));
+
+                    inode_invalidate(inode);
+                }
+            } else {
+                /* The change is accounted for by prebuf (or global
+                 * invalidation is off): the xattr timestamp may be
+                 * refreshed together with ia_time below. */
+                update_xa_time = _gf_true;
+            }
+        }
+
+        /* Accept the update only if it was sampled in the current
+         * rollover epoch and not before the last invalidation. */
+        if ((mdc->gen_rollover == rollover) &&
+            (incident_time >= mdc->generation)) {
+            mdc_from_iatt(mdc, iatt);
+            mdc->valid = _gf_true;
+            if (update_time) {
+                mdc->ia_time = gf_time();
+                if (mdc->xa_time && update_xa_time)
+                    mdc->xa_time = mdc->ia_time;
+            }
+
+            gf_msg_callingfn(
+                "md-cache", GF_LOG_TRACE, 0, MD_CACHE_MSG_CACHE_UPDATE,
+                "Updated iatt(%s)"
+                " time:%lld generation=%lld",
+                uuid_utoa(iatt->ia_gfid), (unsigned long long)mdc->ia_time,
+                (unsigned long long)mdc->generation);
+        } else {
+            gf_msg_callingfn("md-cache", GF_LOG_TRACE, 0, 0,
+                             "not updating cache (%s)"
+                             "mdc-rollover=%u rollover=%u "
+                             "mdc-generation=%llu "
+                             "mdc-ia_time=%llu incident_time=%llu ",
+                             uuid_utoa(iatt->ia_gfid), mdc->gen_rollover,
+                             rollover, (unsigned long long)mdc->generation,
+                             (unsigned long long)mdc->ia_time,
+                             (unsigned long long)incident_time);
+        }
+    }
+unlock:
+    UNLOCK(&mdc->lock);
+
+out:
+    return ret;
+}
+
+/* Convenience wrapper around mdc_inode_iatt_set_validate(): no prebuf is
+ * supplied and the cache timestamp is always refreshed.
+ */
+int
+mdc_inode_iatt_set(xlator_t *this, inode_t *inode, struct iatt *iatt,
+                   uint64_t incident_time)
+{
+    struct iatt *no_prebuf = NULL;
+    gf_boolean_t refresh_time = _gf_true;
+
+    return mdc_inode_iatt_set_validate(this, inode, no_prebuf, iatt,
+                                       refresh_time, incident_time);
+}
+
+/* Serve an iatt from the cache.
+ *
+ * Returns 0 and fills *iatt when the inode has a context whose iatt is
+ * still valid; returns -1 otherwise. Besides the cached fields, the gfid,
+ * inode number, device (fixed to 42) and type are taken from the inode.
+ */
+int
+mdc_inode_iatt_get(xlator_t *this, inode_t *inode, struct iatt *iatt)
+{
+    struct md_cache *mdc = NULL;
+
+    if (mdc_inode_ctx_get(this, inode, &mdc) != 0) {
+        gf_msg_trace("md-cache", 0, "mdc_inode_ctx_get failed (%s)",
+                     uuid_utoa(inode->gfid));
+        return -1;
+    }
+
+    if (!is_md_cache_iatt_valid(this, mdc)) {
+        gf_msg_trace("md-cache", 0, "iatt cache not valid for (%s)",
+                     uuid_utoa(inode->gfid));
+        return -1;
+    }
+
+    /* Copy the cached fields out under the context lock. */
+    LOCK(&mdc->lock);
+    mdc_to_iatt(mdc, iatt);
+    UNLOCK(&mdc->lock);
+
+    /* Identity fields are not cached; derive them from the inode. */
+    gf_uuid_copy(iatt->ia_gfid, inode->gfid);
+    iatt->ia_ino = gfid_to_ino(inode->gfid);
+    iatt->ia_dev = 42;
+    iatt->ia_type = inode->ia_type;
+
+    return 0;
+}
+
+/* State handed to updatefn() while iterating a source dict. */
+struct updatedict {
+    dict_t *dict; /* target dict; created lazily by updatefn() if NULL */
+    int ret;      /* set to -1 by updatefn() on failure, 0 otherwise */
+};
+
+/* Check whether an xattr key matches one of the configured patterns.
+ *
+ * conf->mdc_xattr_str is a comma separated list of fnmatch() patterns.
+ * Returns 1 on the first match; 0 when nothing matches, the key or the
+ * pattern list is NULL, or the working copy cannot be allocated.
+ */
+static int
+is_mdc_key_satisfied(xlator_t *this, const char *key)
+{
+    struct mdc_conf *conf = this->private;
+    char *configured = NULL;
+    char *copy = NULL;
+    char *saveptr = NULL;
+    char *pattern = NULL;
+    int matched = 0;
+
+    if (key == NULL)
+        return 0;
+
+    /* conf->mdc_xattr_str, is never freed and is hence safely used outside
+     * of lock*/
+    configured = conf->mdc_xattr_str;
+    if (configured == NULL)
+        return 0;
+
+    /* strtok_r() modifies its input, so tokenise a private copy. */
+    copy = gf_strdup(configured);
+    if (copy == NULL)
+        return 0;
+
+    for (pattern = strtok_r(copy, ",", &saveptr); pattern != NULL;
+         pattern = strtok_r(NULL, ",", &saveptr)) {
+        gf_strTrim(&pattern);
+        if (fnmatch(pattern, key, 0) == 0) {
+            matched = 1;
+            break;
+        }
+
+        gf_msg_trace("md-cache", 0,
+                     "xattr key %s doesn't satisfy "
+                     "caching requirements",
+                     key);
+    }
+
+    GF_FREE(copy);
+
+    return matched;
+}
+
+/* dict_foreach() callback: copy a cacheable key/value into u->dict.
+ *
+ * Keys rejected by is_mdc_key_satisfied() are skipped. The target dict is
+ * created on first use. On any failure u->ret is set to -1 and -1 is
+ * returned; otherwise 0 is returned.
+ */
+static int
+updatefn(dict_t *dict, char *key, data_t *value, void *data)
+{
+    struct updatedict *u = data;
+
+    if (!is_mdc_key_satisfied(THIS, key))
+        return 0;
+
+    if (u->dict == NULL) {
+        u->dict = dict_new();
+        if (u->dict == NULL)
+            goto failed;
+    }
+
+    if (dict_set(u->dict, key, value) < 0)
+        goto failed;
+
+    return 0;
+
+failed:
+    u->ret = -1;
+    return -1;
+}
+
+/* Merge the cacheable keys of src into *tgt.
+ *
+ * When *tgt is NULL a dict may be created during the iteration: it is
+ * handed back through *tgt on success and released again on failure.
+ * Returns the status recorded by updatefn() (0 or -1).
+ */
+static int
+mdc_dict_update(dict_t **tgt, dict_t *src)
+{
+    struct updatedict u;
+
+    u.dict = *tgt;
+    u.ret = 0;
+
+    dict_foreach(src, updatefn, &u);
+
+    /* The caller owned a dict already: it was updated in place. */
+    if (*tgt != NULL)
+        return u.ret;
+
+    if ((u.ret < 0) && (u.dict != NULL)) {
+        /* A dict was created but the update failed: drop it. */
+        dict_unref(u.dict);
+    } else {
+        *tgt = u.dict;
+    }
+
+    return u.ret;
+}
+
+/* Replace the cached xattrs of an inode with the cacheable keys of dict.
+ *
+ * The previous xattr dict (if any) is dropped first, then a new dict is
+ * built from the keys accepted by is_mdc_key_satisfied() and xa_time is
+ * refreshed. If no key qualifies the cache holds no dict, which readers
+ * treat as a negative cache.
+ *
+ * Returns 0 on success, -1 when no context could be prepared, dict is
+ * NULL, or building the new dict failed. On that last failure the old
+ * cache is already gone, so xa_time is reset to 0: a still-fresh xa_time
+ * with no dict would otherwise be served as "inode has no xattrs".
+ */
+int
+mdc_inode_xatt_set(xlator_t *this, inode_t *inode, dict_t *dict)
+{
+    int ret = -1;
+    struct md_cache *mdc = NULL;
+    dict_t *newdict = NULL;
+
+    mdc = mdc_inode_prep(this, inode);
+    if (!mdc)
+        goto out;
+
+    if (!dict) {
+        gf_msg_trace("md-cache", 0,
+                     "mdc_inode_xatt_set failed (%s) "
+                     "dict NULL",
+                     uuid_utoa(inode->gfid));
+        goto out;
+    }
+
+    LOCK(&mdc->lock);
+    {
+        if (mdc->xattr) {
+            gf_msg_trace("md-cache", 0,
+                         "deleting the old xattr "
+                         "cache (%s)",
+                         uuid_utoa(inode->gfid));
+            dict_unref(mdc->xattr);
+            mdc->xattr = NULL;
+        }
+
+        ret = mdc_dict_update(&newdict, dict);
+        if (ret < 0) {
+            /* Nothing replaced the dropped dict: make sure the entry
+             * is not mistaken for a valid negative cache. */
+            mdc->xa_time = 0;
+        } else {
+            if (newdict)
+                mdc->xattr = newdict;
+
+            mdc->xa_time = gf_time();
+            gf_msg_trace("md-cache", 0, "xatt cache set for (%s) time:%lld",
+                         uuid_utoa(inode->gfid), (long long)mdc->xa_time);
+            ret = 0;
+        }
+    }
+    UNLOCK(&mdc->lock);
+out:
+    return ret;
+}
+
+/* Merge the cacheable keys of dict into the inode's cached xattrs.
+ *
+ * Unlike mdc_inode_xatt_set() the existing dict is kept and xa_time is not
+ * touched. Returns 0 on success; -1 when no context could be prepared or
+ * dict is NULL; the negative status of mdc_dict_update() on failure.
+ */
+int
+mdc_inode_xatt_update(xlator_t *this, inode_t *inode, dict_t *dict)
+{
+    struct md_cache *mdc = mdc_inode_prep(this, inode);
+    int status = -1;
+
+    if ((mdc == NULL) || (dict == NULL))
+        return -1;
+
+    LOCK(&mdc->lock);
+    status = mdc_dict_update(&mdc->xattr, dict);
+    UNLOCK(&mdc->lock);
+
+    return (status < 0) ? status : 0;
+}
+
+/* Remove a single key from the inode's cached xattrs.
+ *
+ * Returns 0 when a cached dict existed and the key was deleted from it,
+ * -1 when no context could be prepared, name is NULL, or nothing is cached.
+ *
+ * mdc->xattr is inspected only while holding mdc->lock: other paths unref
+ * and clear it under that lock, so a check made outside the lock could
+ * pass and then hand a NULL or released dict to dict_del().
+ */
+int
+mdc_inode_xatt_unset(xlator_t *this, inode_t *inode, char *name)
+{
+    int ret = -1;
+    struct md_cache *mdc = NULL;
+
+    mdc = mdc_inode_prep(this, inode);
+    if (!mdc)
+        goto out;
+
+    if (!name)
+        goto out;
+
+    LOCK(&mdc->lock);
+    {
+        if (mdc->xattr) {
+            dict_del(mdc->xattr, name);
+            ret = 0;
+        }
+    }
+    UNLOCK(&mdc->lock);
+
+out:
+    return ret;
+}
+
+/* Serve the cached xattrs of an inode.
+ *
+ * Returns -1 when the inode has no context or its xattr cache is not
+ * valid. Returns 0 otherwise; if a dict is cached and the caller passed a
+ * non-NULL dict pointer, *dict receives a new reference to it.
+ */
+int
+mdc_inode_xatt_get(xlator_t *this, inode_t *inode, dict_t **dict)
+{
+    struct md_cache *mdc = NULL;
+
+    if (mdc_inode_ctx_get(this, inode, &mdc) != 0) {
+        gf_msg_trace("md-cache", 0, "mdc_inode_ctx_get failed (%s)",
+                     uuid_utoa(inode->gfid));
+        return -1;
+    }
+
+    if (!is_md_cache_xatt_valid(this, mdc)) {
+        gf_msg_trace("md-cache", 0, "xattr cache not valid for (%s)",
+                     uuid_utoa(inode->gfid));
+        return -1;
+    }
+
+    LOCK(&mdc->lock);
+    {
+        /* Missing xattr only means no keys were there, i.e
+           a negative cache for the "loaded" keys
+        */
+        if (mdc->xattr == NULL) {
+            gf_msg_trace("md-cache", 0, "xattr not present (%s)",
+                         uuid_utoa(inode->gfid));
+        } else if (dict != NULL) {
+            *dict = dict_ref(mdc->xattr);
+        }
+    }
+    UNLOCK(&mdc->lock);
+
+    return 0;
+}
+
+/* Read and clear the need_lookup flag of an inode in one step.
+ *
+ * Returns the value the flag had before it was cleared, or false when the
+ * inode has no md-cache context.
+ */
+gf_boolean_t
+mdc_inode_reset_need_lookup(xlator_t *this, inode_t *inode)
+{
+    struct md_cache *mdc = NULL;
+    gf_boolean_t was_needed = _gf_false;
+
+    if (mdc_inode_ctx_get(this, inode, &mdc) == 0) {
+        LOCK(&mdc->lock);
+        was_needed = mdc->need_lookup;
+        mdc->need_lookup = _gf_false;
+        UNLOCK(&mdc->lock);
+    }
+
+    return was_needed;
+}
+
+/* Set the need_lookup flag of an inode under mdc->lock.
+ * Does nothing when the inode has no md-cache context.
+ */
+void
+mdc_inode_set_need_lookup(xlator_t *this, inode_t *inode, gf_boolean_t need)
+{
+    struct md_cache *mdc = NULL;
+
+    if (mdc_inode_ctx_get(this, inode, &mdc) != 0)
+        return;
+
+    LOCK(&mdc->lock);
+    mdc->need_lookup = need;
+    UNLOCK(&mdc->lock);
+}
+
+/* Invalidate the cached iatt of an inode.
+ *
+ * Bumps the per-inode generation, clears ia_time and marks the entry as
+ * not valid. Does nothing when the inode has no md-cache context.
+ *
+ * The generation bump and the invalidation are done in one critical
+ * section (the same sequence mdc_inode_iatt_set_validate() uses for a NULL
+ * iatt), so no other thread can observe or modify the entry between the
+ * two steps.
+ */
+void
+mdc_inode_iatt_invalidate(xlator_t *this, inode_t *inode)
+{
+    struct md_cache *mdc = NULL;
+    uint32_t gen = 0;
+
+    if (mdc_inode_ctx_get(this, inode, &mdc) != 0)
+        goto out;
+
+    LOCK(&mdc->lock);
+    {
+        /* Only the low 32 bits are the generation proper. */
+        gen = __mdc_inc_generation(this, mdc) & 0xffffffff;
+
+        mdc->ia_time = 0;
+        mdc->valid = _gf_false;
+        mdc->generation = gen;
+    }
+    UNLOCK(&mdc->lock);
+
+out:
+    return;
+}
+
+/* Invalidate the cached xattrs of an inode by resetting xa_time.
+ *
+ * Returns 0 when the cache entry was invalidated, -1 when the inode has no
+ * md-cache context. (Previously -1 was returned on success as well,
+ * because the status was never updated.)
+ */
+int
+mdc_inode_xatt_invalidate(xlator_t *this, inode_t *inode)
+{
+    int ret = -1;
+    struct md_cache *mdc = NULL;
+
+    if (mdc_inode_ctx_get(this, inode, &mdc) != 0)
+        goto out;
+
+    LOCK(&mdc->lock);
+    {
+        mdc->xa_time = 0;
+    }
+    UNLOCK(&mdc->lock);
+
+    ret = 0;
+out:
+    return ret;
+}
+
+/* Refresh the cached iatt of the inode identified by iatt->ia_gfid.
+ *
+ * The inode is looked up in the inode table of the top xlator of the
+ * graph. Returns -1 when the gfid is not present in the table, otherwise
+ * the result of mdc_inode_iatt_set_validate().
+ *
+ * inode_find() hands back a referenced inode; the reference is released
+ * once the cache has been updated so the inode is not leaked.
+ */
+static int
+mdc_update_gfid_stat(xlator_t *this, struct iatt *iatt)
+{
+    int ret = 0;
+    inode_table_t *itable = NULL;
+    inode_t *inode = NULL;
+
+    itable = ((xlator_t *)this->graph->top)->itable;
+    inode = inode_find(itable, iatt->ia_gfid);
+    if (!inode) {
+        ret = -1;
+        goto out;
+    }
+
+    /* A fresh generation is used so the update is ordered after any
+     * invalidation already recorded for this inode. */
+    ret = mdc_inode_iatt_set_validate(this, inode, NULL, iatt, _gf_true,
+                                      mdc_inc_generation(this, inode));
+
+    inode_unref(inode);
+out:
+    return ret;
+}
+
+/* Add every configured xattr pattern as a key to a request dict.
+ *
+ * conf->mdc_xattr_str is a comma separated list; each trimmed entry is set
+ * in dict with an int8 value of 0.
+ *
+ * Returns true when all entries were added. Returns false when no list is
+ * configured, the working copy cannot be allocated, or a dict_set_int8()
+ * call fails - in that last case conf->mdc_xattr_str is also reset to
+ * NULL, which disables xattr caching from then on.
+ */
+static bool
+mdc_load_reqs(xlator_t *this, dict_t *dict)
+{
+    struct mdc_conf *conf = this->private;
+    char *pattern = NULL;
+    char *mdc_xattr_str = NULL;
+    char *tmp = NULL;
+    char *tmp1 = NULL;
+    int ret = 0;
+    bool loaded = false;
+
+    tmp1 = conf->mdc_xattr_str;
+    if (!tmp1)
+        goto out;
+
+    /* strtok_r() modifies its input, so tokenise a private copy. */
+    mdc_xattr_str = gf_strdup(tmp1);
+    if (!mdc_xattr_str)
+        goto out;
+
+    pattern = strtok_r(mdc_xattr_str, ",", &tmp);
+    while (pattern) {
+        gf_strTrim(&pattern);
+        ret = dict_set_int8(dict, pattern, 0);
+        if (ret) {
+            /* The configured string itself is not freed here, only the
+             * pointer is dropped. */
+            conf->mdc_xattr_str = NULL;
+            gf_msg("md-cache", GF_LOG_ERROR, 0, MD_CACHE_MSG_NO_XATTR_CACHE,
+                   "Disabled cache for xattrs, dict_set failed");
+            goto out;
+        }
+        pattern = strtok_r(NULL, ",", &tmp);
+    }
+
+    loaded = true;
+
+out:
+    /* mdc_xattr_str is still NULL here on the early exits. */
+    GF_FREE(mdc_xattr_str);
+
+    return loaded;
+}
+
+/* State handed to checkfn() while iterating a request dict. */
+struct checkpair {
+    int ret;     /* starts at 1; cleared to 0 by checkfn() on a miss */
+    dict_t *rsp; /* response dict; not read by checkfn() */
+};
+
+/* dict_foreach() callback: clear pair->ret when a requested key is not
+ * one md-cache is configured to cache. Always returns 0, so the iteration
+ * is never cut short from here.
+ */
+static int
+checkfn(dict_t *this, char *key, data_t *value, void *data)
+{
+    struct checkpair *pair = data;
+    int cacheable = is_mdc_key_satisfied(THIS, key);
+
+    if (!cacheable) {
+        pair->ret = 0;
+    }
+
+    return 0;
+}
+
+/* Tell whether every key of a request dict is covered by the configured
+ * xattr patterns. Returns 1 if all keys qualify, 0 as soon as one key is
+ * found that does not.
+ */
+int
+mdc_xattr_satisfied(xlator_t *this, dict_t *req, dict_t *rsp)
+{
+    struct checkpair pair;
+
+    pair.ret = 1;
+    pair.rsp = rsp;
+
+    dict_foreach(req, checkfn, &pair);
+
+    return pair.ret;
+}
+
+/* Store a statvfs result in the xlator-wide statfs cache and stamp it
+ * with the current time, under the statfs cache mutex.
+ */
+static void
+mdc_cache_statfs(xlator_t *this, struct statvfs *buf)
+{
+    struct mdc_conf *conf = this->private;
+    pthread_mutex_t *guard = &conf->statfs_cache.lock;
+
+    pthread_mutex_lock(guard);
+    memcpy(&conf->statfs_cache.buf, buf, sizeof(*buf));
+    conf->statfs_cache.last_refreshed = gf_time();
+    pthread_mutex_unlock(guard);
+}
+
+/* Look up the cached statvfs result.
+ *
+ * On success returns 0 and points *buf at the cached structure. Returns -1
+ * (with *buf set to NULL when buf itself is valid) if an argument is
+ * missing, the cache was never filled, or the entry is older than
+ * conf->timeout seconds.
+ *
+ * NOTE(review): *buf points into conf->statfs_cache and is handed out
+ * after the mutex is released, so the caller reads it without the lock
+ * while mdc_cache_statfs() may be overwriting it.
+ */
+int
+mdc_load_statfs_info_from_cache(xlator_t *this, struct statvfs **buf)
+{
+    struct mdc_conf *conf = this->private;
+    uint32_t cache_age = 0;
+    int ret = 0;
+
+    if (!buf || !conf) {
+        ret = -1;
+        goto err;
+    }
+
+    *buf = NULL;
+
+    pthread_mutex_lock(&conf->statfs_cache.lock);
+    {
+        /* Skip if the cache is not initialized. */
+        if (conf->statfs_cache.last_refreshed == (time_t)-1) {
+            ret = -1;
+            goto unlock;
+        }
+
+        /* Age in seconds since the last refresh. */
+        cache_age = (gf_time() - conf->statfs_cache.last_refreshed);
+
+        gf_log(this->name, GF_LOG_DEBUG, "STATFS cache age = %u secs",
+               cache_age);
+        if (cache_age > conf->timeout) {
+            /* Expire the cache. */
+            gf_log(this->name, GF_LOG_DEBUG,
+                   "Cache age %u secs exceeded timeout %u secs", cache_age,
+                   conf->timeout);
+            ret = -1;
+            goto unlock;
+        }
+
+        *buf = &conf->statfs_cache.buf;
+    }
+unlock:
+    pthread_mutex_unlock(&conf->statfs_cache.lock);
+err:
+    return ret;
+}
+
+/* Build the xdata to wind downwards for a request.
+ *
+ * The returned dict (if any) carries a reference owned by the caller: an
+ * incoming xdata is ref'ed, a missing one is created when a local exists.
+ * With a local, the configured xattr keys are loaded into the dict and
+ * local->update_cache records whether that succeeded. Returns NULL when
+ * no xdata was given and either there is no local or dict_new() failed.
+ */
+static dict_t *
+mdc_prepare_request(xlator_t *this, mdc_local_t *local, dict_t *xdata)
+{
+    dict_t *req = xdata;
+
+    if (req != NULL)
+        dict_ref(req);
+
+    /* Without a local there is nowhere to record the cache decision. */
+    if (local == NULL)
+        return req;
+
+    if (req == NULL)
+        req = dict_new();
+
+    if (req == NULL) {
+        local->update_cache = false;
+    } else {
+        local->update_cache = mdc_load_reqs(this, req);
+    }
+
+    return req;
+}
+
+/* statfs callback: on success store the result in the statfs cache (when
+ * enabled); on ENOENT/ESTALE invalidate the cached iatt of the inode the
+ * request was issued on. The reply is always unwound unchanged.
+ */
+int
+mdc_statfs_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+               int32_t op_ret, int32_t op_errno, struct statvfs *buf,
+               dict_t *xdata)
+{
+    struct mdc_conf *conf = this->private;
+    mdc_local_t *local = frame->local;
+
+    if (local != NULL) {
+        if (op_ret == 0) {
+            if (conf && conf->cache_statfs)
+                mdc_cache_statfs(this, buf);
+        } else if ((op_errno == ENOENT) || (op_errno == ESTALE)) {
+            mdc_inode_iatt_invalidate(this, local->loc.inode);
+        }
+    }
+
+    MDC_STACK_UNWIND(statfs, frame, op_ret, op_errno, buf, xdata);
+
+    return 0;
+}
+
+/* statfs fop: answer from the statfs cache when it is enabled and fresh,
+ * otherwise wind to the first child.
+ *
+ * If the frame local cannot be allocated the call is unwound with
+ * op_ret -1 / ENOMEM and a NULL buf.
+ */
+int
+mdc_statfs(call_frame_t *frame, xlator_t *this, loc_t *loc, dict_t *xdata)
+{
+    int ret = 0, op_ret = 0, op_errno = 0;
+    struct statvfs *buf = NULL;
+    mdc_local_t *local = NULL;
+    struct mdc_conf *conf = this->private;
+
+    local = mdc_local_get(frame, loc->inode);
+    if (!local) {
+        op_ret = -1;
+        op_errno = ENOMEM;
+        goto out;
+    }
+
+    /* Kept so the callback can invalidate the inode on ENOENT/ESTALE. */
+    loc_copy(&local->loc, loc);
+
+    if (!conf) {
+        goto uncached;
+    }
+
+    if (!conf->cache_statfs) {
+        goto uncached;
+    }
+
+    ret = mdc_load_statfs_info_from_cache(this, &buf);
+    if (ret == 0 && buf) {
+        /* Cache hit: buf points at the cached statvfs structure. */
+        op_ret = 0;
+        op_errno = 0;
+        goto out;
+    }
+
+uncached:
+    STACK_WIND(frame, mdc_statfs_cbk, FIRST_CHILD(this),
+               FIRST_CHILD(this)->fops->statfs, loc, xdata);
+    return 0;
+
+out:
+    MDC_STACK_UNWIND(statfs, frame, op_ret, op_errno, buf, xdata);
+    return 0;
+}
+
+/* lookup callback.
+ *
+ * Failure: ENOENT bumps the negative_lookup counter; ESTALE invalidates
+ * the cached iatt of the parent (or of the inode itself when the loc has
+ * no parent).
+ * Success: caches postparent for the parent and stbuf for the inode, and
+ * stores the returned xattrs when the request had the cacheable keys
+ * loaded (local->update_cache).
+ * The reply is always unwound unchanged.
+ */
+int
+mdc_lookup_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+               int32_t op_ret, int32_t op_errno, inode_t *inode,
+               struct iatt *stbuf, dict_t *dict, struct iatt *postparent)
+{
+    struct mdc_conf *conf = this->private;
+    mdc_local_t *local = frame->local;
+    inode_t *stale = NULL;
+
+    if (local == NULL)
+        goto unwind;
+
+    if (op_ret != 0) {
+        if (op_errno == ENOENT)
+            GF_ATOMIC_INC(conf->mdc_counter.negative_lookup);
+
+        if (op_errno == ESTALE) {
+            /* if op_errno is ENOENT, fuse-bridge will unlink the
+             * dentry
+             */
+            stale = local->loc.parent ? local->loc.parent : local->loc.inode;
+            mdc_inode_iatt_invalidate(this, stale);
+        }
+
+        goto unwind;
+    }
+
+    if (local->loc.parent != NULL)
+        mdc_inode_iatt_set(this, local->loc.parent, postparent,
+                           local->incident_time);
+
+    if (local->loc.inode != NULL) {
+        mdc_inode_iatt_set(this, local->loc.inode, stbuf, local->incident_time);
+        if (local->update_cache)
+            mdc_inode_xatt_set(this, local->loc.inode, dict);
+    }
+
+unwind:
+    MDC_STACK_UNWIND(lookup, frame, op_ret, op_errno, inode, stbuf, dict,
+                     postparent);
+    return 0;
+}
+
+/* lookup fop: serve the reply from the cache when possible.
+ *
+ * The cache is used only if the inode is linked, no forced lookup is
+ * pending, a valid iatt is cached and - when the caller passed xdata -
+ * the xattr cache is valid and every requested key is one md-cache is
+ * configured to cache. Otherwise the request is wound to the first child
+ * with the configured xattr keys loaded into xdata.
+ */
+int
+mdc_lookup(call_frame_t *frame, xlator_t *this, loc_t *loc, dict_t *xdata)
+{
+    int ret = 0;
+    struct iatt stbuf = {
+        0,
+    };
+    struct iatt postparent = {
+        0,
+    };
+    dict_t *xattr_rsp = NULL;
+    mdc_local_t *local = NULL;
+    struct mdc_conf *conf = this->private;
+
+    local = mdc_local_get(frame, loc->inode);
+    if (!local) {
+        GF_ATOMIC_INC(conf->mdc_counter.stat_miss);
+        goto uncached;
+    }
+
+    loc_copy(&local->loc, loc);
+
+    if (!inode_is_linked(loc->inode)) {
+        GF_ATOMIC_INC(conf->mdc_counter.stat_miss);
+        goto uncached;
+    }
+
+    /* Reading the flag also clears it. */
+    if (mdc_inode_reset_need_lookup(this, loc->inode)) {
+        GF_ATOMIC_INC(conf->mdc_counter.need_lookup);
+        goto uncached;
+    }
+
+    ret = mdc_inode_iatt_get(this, loc->inode, &stbuf);
+    if (ret != 0) {
+        GF_ATOMIC_INC(conf->mdc_counter.stat_miss);
+        goto uncached;
+    }
+
+    if (xdata) {
+        /* On success xattr_rsp may hold a reference that must be
+         * dropped on every exit path below. */
+        ret = mdc_inode_xatt_get(this, loc->inode, &xattr_rsp);
+        if (ret != 0) {
+            GF_ATOMIC_INC(conf->mdc_counter.xattr_miss);
+            goto uncached;
+        }
+
+        if (!mdc_xattr_satisfied(this, xdata, xattr_rsp)) {
+            GF_ATOMIC_INC(conf->mdc_counter.xattr_miss);
+            goto uncached;
+        }
+    }
+
+    /* Cache hit: postparent is returned zero-filled. */
+    GF_ATOMIC_INC(conf->mdc_counter.stat_hit);
+    MDC_STACK_UNWIND(lookup, frame, 0, 0, loc->inode, &stbuf, xattr_rsp,
+                     &postparent);
+
+    if (xattr_rsp)
+        dict_unref(xattr_rsp);
+
+    return 0;
+
+uncached:
+    /* Returns xdata with an extra reference (or a new dict). */
+    xdata = mdc_prepare_request(this, local, xdata);
+
+    STACK_WIND(frame, mdc_lookup_cbk, FIRST_CHILD(this),
+               FIRST_CHILD(this)->fops->lookup, loc, xdata);
+
+    if (xattr_rsp)
+        dict_unref(xattr_rsp);
+
+    if (xdata != NULL) {
+        dict_unref(xdata);
+    }
+
+    return 0;
+}
+
+/* stat callback: cache the returned iatt (and xattrs, when the request
+ * had the cacheable keys loaded) on success; invalidate the cached iatt
+ * on ESTALE/ENOENT. The reply is always unwound unchanged.
+ */
+int
+mdc_stat_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret,
+             int32_t op_errno, struct iatt *buf, dict_t *xdata)
+{
+    mdc_local_t *local = frame->local;
+
+    if (local != NULL) {
+        if (op_ret == 0) {
+            mdc_inode_iatt_set(this, local->loc.inode, buf,
+                               local->incident_time);
+            if (local->update_cache)
+                mdc_inode_xatt_set(this, local->loc.inode, xdata);
+        } else if ((op_errno == ESTALE) || (op_errno == ENOENT)) {
+            mdc_inode_iatt_invalidate(this, local->loc.inode);
+        }
+    }
+
+    MDC_STACK_UNWIND(stat, frame, op_ret, op_errno, buf, xdata);
+
+    return 0;
+}
+
+/* stat fop: answer from the cached iatt when the inode is linked and the
+ * cache entry is valid, otherwise wind to the first child with the
+ * configured xattr keys loaded into xdata.
+ *
+ * stbuf is zero-initialised (as in mdc_lookup()) because
+ * mdc_inode_iatt_get() fills only the cached and identity fields; the
+ * remaining members must not carry stack garbage to the caller.
+ * A miss is counted exactly once, at the point where the request is wound.
+ */
+int
+mdc_stat(call_frame_t *frame, xlator_t *this, loc_t *loc, dict_t *xdata)
+{
+    int ret;
+    struct iatt stbuf = {
+        0,
+    };
+    mdc_local_t *local = NULL;
+    struct mdc_conf *conf = this->private;
+
+    local = mdc_local_get(frame, loc->inode);
+    if (!local)
+        goto uncached;
+
+    loc_copy(&local->loc, loc);
+
+    if (!inode_is_linked(loc->inode))
+        goto uncached;
+
+    ret = mdc_inode_iatt_get(this, loc->inode, &stbuf);
+    if (ret != 0)
+        goto uncached;
+
+    GF_ATOMIC_INC(conf->mdc_counter.stat_hit);
+    MDC_STACK_UNWIND(stat, frame, 0, 0, &stbuf, xdata);
+
+    return 0;
+
+uncached:
+    /* Returns xdata with an extra reference (or a new dict). */
+    xdata = mdc_prepare_request(this, local, xdata);
+
+    GF_ATOMIC_INC(conf->mdc_counter.stat_miss);
+    STACK_WIND(frame, mdc_stat_cbk, FIRST_CHILD(this),
+               FIRST_CHILD(this)->fops->stat, loc, xdata);
+
+    if (xdata != NULL) {
+        dict_unref(xdata);
+    }
+
+    return 0;
+}
+
+/* fstat callback: cache the returned iatt (and xattrs, when the request
+ * had the cacheable keys loaded) for the fd's inode on success; invalidate
+ * the cached iatt on ENOENT/ESTALE. The reply is always unwound unchanged.
+ */
+int
+mdc_fstat_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret,
+              int32_t op_errno, struct iatt *buf, dict_t *xdata)
+{
+    mdc_local_t *local = frame->local;
+
+    if (local != NULL) {
+        if (op_ret == 0) {
+            mdc_inode_iatt_set(this, local->fd->inode, buf,
+                               local->incident_time);
+            if (local->update_cache)
+                mdc_inode_xatt_set(this, local->fd->inode, xdata);
+        } else if ((op_errno == ENOENT) || (op_errno == ESTALE)) {
+            mdc_inode_iatt_invalidate(this, local->fd->inode);
+        }
+    }
+
+    MDC_STACK_UNWIND(fstat, frame, op_ret, op_errno, buf, xdata);
+
+    return 0;
+}
+
+/* fstat fop: answer from the cached iatt of the fd's inode when the cache
+ * entry is valid, otherwise wind to the first child with the configured
+ * xattr keys loaded into xdata.
+ *
+ * stbuf is zero-initialised (as in mdc_lookup()) because
+ * mdc_inode_iatt_get() fills only the cached and identity fields; the
+ * remaining members must not carry stack garbage to the caller.
+ */
+int
+mdc_fstat(call_frame_t *frame, xlator_t *this, fd_t *fd, dict_t *xdata)
+{
+    int ret;
+    struct iatt stbuf = {
+        0,
+    };
+    mdc_local_t *local = NULL;
+    struct mdc_conf *conf = this->private;
+
+    local = mdc_local_get(frame, fd->inode);
+    if (!local)
+        goto uncached;
+
+    /* The callback reads the inode through local->fd. */
+    local->fd = __fd_ref(fd);
+
+    ret = mdc_inode_iatt_get(this, fd->inode, &stbuf);
+    if (ret != 0)
+        goto uncached;
+
+    GF_ATOMIC_INC(conf->mdc_counter.stat_hit);
+    MDC_STACK_UNWIND(fstat, frame, 0, 0, &stbuf, xdata);
+
+    return 0;
+
+uncached:
+    /* Returns xdata with an extra reference (or a new dict). */
+    xdata = mdc_prepare_request(this, local, xdata);
+
+    GF_ATOMIC_INC(conf->mdc_counter.stat_miss);
+    STACK_WIND(frame, mdc_fstat_cbk, FIRST_CHILD(this),
+               FIRST_CHILD(this)->fops->fstat, fd, xdata);
+
+    if (xdata != NULL) {
+        dict_unref(xdata);
+    }
+
+    return 0;
+}
+
+/* truncate callback: validate and cache postbuf against prebuf on
+ * success; invalidate the cached iatt on ESTALE/ENOENT. The reply is
+ * always unwound unchanged.
+ */
+int
+mdc_truncate_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+                 int32_t op_ret, int32_t op_errno, struct iatt *prebuf,
+                 struct iatt *postbuf, dict_t *xdata)
+{
+    mdc_local_t *local = frame->local;
+
+    if (local != NULL) {
+        if (op_ret == 0) {
+            mdc_inode_iatt_set_validate(this, local->loc.inode, prebuf,
+                                        postbuf, _gf_true,
+                                        local->incident_time);
+        } else if ((op_errno == ESTALE) || (op_errno == ENOENT)) {
+            mdc_inode_iatt_invalidate(this, local->loc.inode);
+        }
+    }
+
+    MDC_STACK_UNWIND(truncate, frame, op_ret, op_errno, prebuf, postbuf, xdata);
+
+    return 0;
+}
+
+/* truncate fop: remember a reference to the inode in the frame local (when
+ * one could be allocated) for the callback, then wind to the first child.
+ */
+int
+mdc_truncate(call_frame_t *frame, xlator_t *this, loc_t *loc, off_t offset,
+             dict_t *xdata)
+{
+    mdc_local_t *local = mdc_local_get(frame, loc->inode);
+
+    if (local)
+        local->loc.inode = inode_ref(loc->inode);
+
+    STACK_WIND(frame, mdc_truncate_cbk, FIRST_CHILD(this),
+               FIRST_CHILD(this)->fops->truncate, loc, offset, xdata);
+
+    return 0;
+}
+
+/* ftruncate callback: validate and cache postbuf against prebuf for the
+ * fd's inode on success; invalidate the cached iatt on ENOENT/ESTALE. The
+ * reply is always unwound unchanged.
+ */
+int
+mdc_ftruncate_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+                  int32_t op_ret, int32_t op_errno, struct iatt *prebuf,
+                  struct iatt *postbuf, dict_t *xdata)
+{
+    mdc_local_t *local = frame->local;
+
+    if (local != NULL) {
+        if (op_ret == 0) {
+            mdc_inode_iatt_set_validate(this, local->fd->inode, prebuf,
+                                        postbuf, _gf_true,
+                                        local->incident_time);
+        } else if ((op_errno == ENOENT) || (op_errno == ESTALE)) {
+            mdc_inode_iatt_invalidate(this, local->fd->inode);
+        }
+    }
+
+    MDC_STACK_UNWIND(ftruncate, frame, op_ret, op_errno, prebuf, postbuf,
+                     xdata);
+
+    return 0;
+}
+
+/* ftruncate fop: remember a reference to the fd in the frame local (when
+ * one could be allocated) for the callback, then wind to the first child.
+ */
+int
+mdc_ftruncate(call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset,
+              dict_t *xdata)
+{
+    mdc_local_t *local = mdc_local_get(frame, fd->inode);
+
+    if (local)
+        local->fd = __fd_ref(fd);
+
+    STACK_WIND(frame, mdc_ftruncate_cbk, FIRST_CHILD(this),
+               FIRST_CHILD(this)->fops->ftruncate, fd, offset, xdata);
+
+    return 0;
+}
+
+/* mknod callback: on success cache postparent for the parent and buf for
+ * the new inode (each only when present in the saved loc); on ESTALE/ENOENT
+ * invalidate the cached iatt of the parent. The reply is always unwound
+ * unchanged.
+ */
+int
+mdc_mknod_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret,
+              int32_t op_errno, inode_t *inode, struct iatt *buf,
+              struct iatt *preparent, struct iatt *postparent, dict_t *xdata)
+{
+    mdc_local_t *local = frame->local;
+
+    if (local == NULL)
+        goto unwind;
+
+    if (op_ret != 0) {
+        if ((op_errno == ESTALE) || (op_errno == ENOENT))
+            mdc_inode_iatt_invalidate(this, local->loc.parent);
+
+        goto unwind;
+    }
+
+    if (local->loc.parent != NULL)
+        mdc_inode_iatt_set(this, local->loc.parent, postparent,
+                           local->incident_time);
+
+    if (local->loc.inode != NULL)
+        mdc_inode_iatt_set(this, local->loc.inode, buf, local->incident_time);
+
+unwind:
+    MDC_STACK_UNWIND(mknod, frame, op_ret, op_errno, inode, buf, preparent,
+                     postparent, xdata);
+    return 0;
+}
+
+/*
+ * mknod fop: copy the loc and take a reference on the request xdata into the
+ * frame-local state (when a local can be obtained), then wind to the first
+ * child.
+ */
+int
+mdc_mknod(call_frame_t *frame, xlator_t *this, loc_t *loc, mode_t mode,
+          dev_t rdev, mode_t umask, dict_t *xdata)
+{
+    mdc_local_t *local = mdc_local_get(frame, loc->inode);
+
+    if (local) {
+        local->xattr = dict_ref(xdata);
+        loc_copy(&local->loc, loc);
+    }
+
+    STACK_WIND(frame, mdc_mknod_cbk, FIRST_CHILD(this),
+               FIRST_CHILD(this)->fops->mknod, loc, mode, rdev, umask, xdata);
+    return 0;
+}
+
+/*
+ * mkdir callback: on success store postparent for the parent inode and buf
+ * for the new directory inode (each only if present in the saved loc); on
+ * ESTALE/ENOENT invalidate the parent's cached iatt. Always unwinds.
+ */
+int
+mdc_mkdir_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret,
+              int32_t op_errno, inode_t *inode, struct iatt *buf,
+              struct iatt *preparent, struct iatt *postparent, dict_t *xdata)
+{
+    mdc_local_t *local = frame->local;
+
+    if (local != NULL) {
+        if (op_ret == 0) {
+            if (local->loc.parent != NULL) {
+                mdc_inode_iatt_set(this, local->loc.parent, postparent,
+                                   local->incident_time);
+            }
+
+            if (local->loc.inode != NULL) {
+                mdc_inode_iatt_set(this, local->loc.inode, buf,
+                                   local->incident_time);
+            }
+        } else if ((op_errno == ENOENT) || (op_errno == ESTALE)) {
+            mdc_inode_iatt_invalidate(this, local->loc.parent);
+        }
+    }
+
+    MDC_STACK_UNWIND(mkdir, frame, op_ret, op_errno, inode, buf, preparent,
+                     postparent, xdata);
+    return 0;
+}
+
+/*
+ * mkdir fop: copy the loc and take a reference on the request xdata into the
+ * frame-local state (when a local can be obtained), then wind to the first
+ * child.
+ */
+int
+mdc_mkdir(call_frame_t *frame, xlator_t *this, loc_t *loc, mode_t mode,
+          mode_t umask, dict_t *xdata)
+{
+    mdc_local_t *local = mdc_local_get(frame, loc->inode);
+
+    if (local) {
+        local->xattr = dict_ref(xdata);
+        loc_copy(&local->loc, loc);
+    }
+
+    STACK_WIND(frame, mdc_mkdir_cbk, FIRST_CHILD(this),
+               FIRST_CHILD(this)->fops->mkdir, loc, mode, umask, xdata);
+    return 0;
+}
+
+/*
+ * unlink callback: on success store postparent for the parent and pass a
+ * NULL iatt for the unlinked inode; on ENOENT/ESTALE invalidate both.
+ * NOTE(review): a NULL iatt presumably drops the cached attributes -- confirm
+ * in mdc_inode_iatt_set.
+ */
+int
+mdc_unlink_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+               int32_t op_ret, int32_t op_errno, struct iatt *preparent,
+               struct iatt *postparent, dict_t *xdata)
+{
+    mdc_local_t *local = frame->local;
+
+    if (local != NULL) {
+        if (op_ret == 0) {
+            if (local->loc.parent != NULL) {
+                mdc_inode_iatt_set(this, local->loc.parent, postparent,
+                                   local->incident_time);
+            }
+
+            if (local->loc.inode != NULL) {
+                mdc_inode_iatt_set(this, local->loc.inode, NULL,
+                                   local->incident_time);
+            }
+        } else if ((op_errno == ESTALE) || (op_errno == ENOENT)) {
+            /* ESTALE means the parent is gone, hence so is the child.
+             * Per man 2 unlink, ENOENT can also mean that a component of
+             * the pathname does not exist or is a dangling symbolic link.
+             * Invalidate both parent and child for either errno.
+             */
+            mdc_inode_iatt_invalidate(this, local->loc.inode);
+            mdc_inode_iatt_invalidate(this, local->loc.parent);
+        }
+    }
+
+    MDC_STACK_UNWIND(unlink, frame, op_ret, op_errno, preparent, postparent,
+                     xdata);
+    return 0;
+}
+
+/*
+ * unlink fop: copy the loc into the frame-local state (when a local can be
+ * obtained) for mdc_unlink_cbk, then wind to the first child.
+ */
+int
+mdc_unlink(call_frame_t *frame, xlator_t *this, loc_t *loc, int32_t xflag,
+           dict_t *xdata)
+{
+    mdc_local_t *local = mdc_local_get(frame, loc->inode);
+
+    if (local)
+        loc_copy(&local->loc, loc);
+
+    STACK_WIND(frame, mdc_unlink_cbk, FIRST_CHILD(this),
+               FIRST_CHILD(this)->fops->unlink, loc, xflag, xdata);
+    return 0;
+}
+
+/*
+ * rmdir callback: on success store postparent for the parent inode; on
+ * ESTALE/ENOENT invalidate both the removed directory and its parent.
+ * Always unwinds.
+ */
+int
+mdc_rmdir_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret,
+              int32_t op_errno, struct iatt *preparent, struct iatt *postparent,
+              dict_t *xdata)
+{
+    mdc_local_t *local = frame->local;
+
+    if (local != NULL) {
+        if (op_ret == 0) {
+            if (local->loc.parent != NULL) {
+                mdc_inode_iatt_set(this, local->loc.parent, postparent,
+                                   local->incident_time);
+            }
+        } else if ((op_errno == ENOENT) || (op_errno == ESTALE)) {
+            /* ESTALE means the parent is gone, hence so is the child.
+             * Per man 2 rmdir, ENOENT can also mean that a directory
+             * component of the pathname does not exist or is a dangling
+             * symbolic link. Invalidate both parent and child for either
+             * errno.
+             */
+            mdc_inode_iatt_invalidate(this, local->loc.inode);
+            mdc_inode_iatt_invalidate(this, local->loc.parent);
+        }
+    }
+
+    MDC_STACK_UNWIND(rmdir, frame, op_ret, op_errno, preparent, postparent,
+                     xdata);
+    return 0;
+}
+
+/*
+ * rmdir fop: copy the loc into the frame-local state (when a local can be
+ * obtained) for mdc_rmdir_cbk, then wind to the first child.
+ */
+int
+mdc_rmdir(call_frame_t *frame, xlator_t *this, loc_t *loc, int flag,
+          dict_t *xdata)
+{
+    mdc_local_t *local = mdc_local_get(frame, loc->inode);
+
+    if (local)
+        loc_copy(&local->loc, loc);
+
+    STACK_WIND(frame, mdc_rmdir_cbk, FIRST_CHILD(this),
+               FIRST_CHILD(this)->fops->rmdir, loc, flag, xdata);
+    return 0;
+}
+
+/*
+ * symlink callback: on success store postparent for the parent inode and buf
+ * for the new link inode (each only if present in the saved loc); on
+ * ESTALE/ENOENT invalidate the parent's cached iatt. Always unwinds.
+ */
+int
+mdc_symlink_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+                int32_t op_ret, int32_t op_errno, inode_t *inode,
+                struct iatt *buf, struct iatt *preparent,
+                struct iatt *postparent, dict_t *xdata)
+{
+    mdc_local_t *local = frame->local;
+
+    if (local != NULL) {
+        if (op_ret == 0) {
+            if (local->loc.parent != NULL) {
+                mdc_inode_iatt_set(this, local->loc.parent, postparent,
+                                   local->incident_time);
+            }
+
+            if (local->loc.inode != NULL) {
+                mdc_inode_iatt_set(this, local->loc.inode, buf,
+                                   local->incident_time);
+            }
+        } else if ((op_errno == ENOENT) || (op_errno == ESTALE)) {
+            mdc_inode_iatt_invalidate(this, local->loc.parent);
+        }
+    }
+
+    MDC_STACK_UNWIND(symlink, frame, op_ret, op_errno, inode, buf, preparent,
+                     postparent, xdata);
+    return 0;
+}
+
+/*
+ * symlink fop: duplicate the link target and, if a frame-local can be
+ * obtained, store the copy together with a copy of the loc. If either step
+ * fails nothing is stored (a duplicated name is freed again) and the fop is
+ * wound without a local.
+ */
+int
+mdc_symlink(call_frame_t *frame, xlator_t *this, const char *linkname,
+            loc_t *loc, mode_t umask, dict_t *xdata)
+{
+    mdc_local_t *local = NULL;
+    char *name = gf_strdup(linkname);
+
+    if (name != NULL) {
+        local = mdc_local_get(frame, loc->inode);
+        if (local != NULL) {
+            loc_copy(&local->loc, loc);
+            /* The local now owns the duplicated string. */
+            local->linkname = name;
+        } else {
+            GF_FREE(name);
+        }
+    }
+
+    STACK_WIND(frame, mdc_symlink_cbk, FIRST_CHILD(this),
+               FIRST_CHILD(this)->fops->symlink, linkname, loc, umask, xdata);
+    return 0;
+}
+
+/*
+ * rename callback: on success store postoldparent/postnewparent for the two
+ * parent inodes and pass a NULL iatt for the renamed inode; on ESTALE/ENOENT
+ * invalidate the renamed inode and the new parent. Always unwinds.
+ */
+int
+mdc_rename_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+               int32_t op_ret, int32_t op_errno, struct iatt *buf,
+               struct iatt *preoldparent, struct iatt *postoldparent,
+               struct iatt *prenewparent, struct iatt *postnewparent,
+               dict_t *xdata)
+{
+    mdc_local_t *local = frame->local;
+
+    if (local != NULL) {
+        if (op_ret == 0) {
+            if (local->loc.parent != NULL) {
+                mdc_inode_iatt_set(this, local->loc.parent, postoldparent,
+                                   local->incident_time);
+            }
+
+            if (local->loc.inode != NULL) {
+                /* TODO: fix dht_rename() not to return linkfile
+                   attributes before setting attributes here
+                */
+                mdc_inode_iatt_set(this, local->loc.inode, NULL,
+                                   local->incident_time);
+            }
+
+            if (local->loc2.parent != NULL) {
+                mdc_inode_iatt_set(this, local->loc2.parent, postnewparent,
+                                   local->incident_time);
+            }
+        } else if ((op_errno == ENOENT) || (op_errno == ESTALE)) {
+            mdc_inode_iatt_invalidate(this, local->loc.inode);
+            mdc_inode_iatt_invalidate(this, local->loc2.parent);
+        }
+    }
+
+    MDC_STACK_UNWIND(rename, frame, op_ret, op_errno, buf, preoldparent,
+                     postoldparent, prenewparent, postnewparent, xdata);
+    return 0;
+}
+
+/*
+ * rename fop: copy the old loc into local->loc and the new loc into
+ * local->loc2 (when a local can be obtained), then wind to the first child.
+ */
+int
+mdc_rename(call_frame_t *frame, xlator_t *this, loc_t *oldloc, loc_t *newloc,
+           dict_t *xdata)
+{
+    mdc_local_t *local = mdc_local_get(frame, oldloc->inode);
+
+    if (local) {
+        loc_copy(&local->loc, oldloc);
+        loc_copy(&local->loc2, newloc);
+    }
+
+    STACK_WIND(frame, mdc_rename_cbk, FIRST_CHILD(this),
+               FIRST_CHILD(this)->fops->rename, oldloc, newloc, xdata);
+    return 0;
+}
+
+/*
+ * link callback: on success store buf for the linked inode and postparent
+ * for the new parent (each only if present in the saved locs); on
+ * ENOENT/ESTALE invalidate both. Always unwinds.
+ */
+int
+mdc_link_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret,
+             int32_t op_errno, inode_t *inode, struct iatt *buf,
+             struct iatt *preparent, struct iatt *postparent, dict_t *xdata)
+{
+    mdc_local_t *local = frame->local;
+
+    if (local != NULL) {
+        if (op_ret == 0) {
+            if (local->loc.inode != NULL) {
+                mdc_inode_iatt_set(this, local->loc.inode, buf,
+                                   local->incident_time);
+            }
+
+            if (local->loc2.parent != NULL) {
+                mdc_inode_iatt_set(this, local->loc2.parent, postparent,
+                                   local->incident_time);
+            }
+        } else if ((op_errno == ESTALE) || (op_errno == ENOENT)) {
+            mdc_inode_iatt_invalidate(this, local->loc.inode);
+            mdc_inode_iatt_invalidate(this, local->loc2.parent);
+        }
+    }
+
+    MDC_STACK_UNWIND(link, frame, op_ret, op_errno, inode, buf, preparent,
+                     postparent, xdata);
+    return 0;
+}
+
+/*
+ * link fop: copy the old loc into local->loc and the new loc into
+ * local->loc2 (when a local can be obtained), then wind to the first child.
+ */
+int
+mdc_link(call_frame_t *frame, xlator_t *this, loc_t *oldloc, loc_t *newloc,
+         dict_t *xdata)
+{
+    mdc_local_t *local = mdc_local_get(frame, oldloc->inode);
+
+    if (local) {
+        loc_copy(&local->loc, oldloc);
+        loc_copy(&local->loc2, newloc);
+    }
+
+    STACK_WIND(frame, mdc_link_cbk, FIRST_CHILD(this),
+               FIRST_CHILD(this)->fops->link, oldloc, newloc, xdata);
+    return 0;
+}
+
+/*
+ * create callback: on success store postparent for the parent inode and buf
+ * for the inode returned by the child; on ESTALE/ENOENT invalidate the
+ * parent's cached iatt. Always unwinds.
+ */
+int
+mdc_create_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+               int32_t op_ret, int32_t op_errno, fd_t *fd, inode_t *inode,
+               struct iatt *buf, struct iatt *preparent,
+               struct iatt *postparent, dict_t *xdata)
+{
+    mdc_local_t *local = frame->local;
+
+    if (local != NULL) {
+        if (op_ret == 0) {
+            if (local->loc.parent != NULL) {
+                mdc_inode_iatt_set(this, local->loc.parent, postparent,
+                                   local->incident_time);
+            }
+
+            /* Guarded by the saved loc's inode, but the update is applied
+             * to the inode handed back in the reply. */
+            if (local->loc.inode != NULL) {
+                mdc_inode_iatt_set(this, inode, buf, local->incident_time);
+            }
+        } else if ((op_errno == ENOENT) || (op_errno == ESTALE)) {
+            mdc_inode_iatt_invalidate(this, local->loc.parent);
+        }
+    }
+
+    MDC_STACK_UNWIND(create, frame, op_ret, op_errno, fd, inode, buf, preparent,
+                     postparent, xdata);
+    return 0;
+}
+
+/*
+ * create fop: copy the loc and take a reference on the request xdata into
+ * the frame-local state (when a local can be obtained), then wind to the
+ * first child.
+ */
+int
+mdc_create(call_frame_t *frame, xlator_t *this, loc_t *loc, int flags,
+           mode_t mode, mode_t umask, fd_t *fd, dict_t *xdata)
+{
+    mdc_local_t *local = mdc_local_get(frame, loc->inode);
+
+    if (local) {
+        local->xattr = dict_ref(xdata);
+        loc_copy(&local->loc, loc);
+    }
+
+    STACK_WIND(frame, mdc_create_cbk, FIRST_CHILD(this),
+               FIRST_CHILD(this)->fops->create, loc, flags, mode, umask, fd,
+               xdata);
+    return 0;
+}
+
+/*
+ * open callback: invalidate the cached iatt of the opened inode when the
+ * open failed with ESTALE/ENOENT, or when it succeeded with O_TRUNC (which
+ * changes the file size). Always unwinds.
+ *
+ * mdc_open only creates a local for O_TRUNC opens of regular files and only
+ * stores the fd in it; local->loc is never filled in. The inode is therefore
+ * taken from local->fd on both paths (the error path previously used the
+ * unset local->loc.inode).
+ */
+static int
+mdc_open_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret,
+             int32_t op_errno, fd_t *fd, dict_t *xdata)
+{
+    mdc_local_t *local = NULL;
+
+    local = frame->local;
+
+    /* No local: mdc_open decided there was nothing to track. */
+    if (!local)
+        goto out;
+
+    if (op_ret != 0) {
+        if ((op_errno == ESTALE) || (op_errno == ENOENT))
+            mdc_inode_iatt_invalidate(this, local->fd->inode);
+        goto out;
+    }
+
+    if (local->fd->flags & O_TRUNC) {
+        /* O_TRUNC modifies file size. Hence invalidate the
+         * cache entry to fetch latest attributes. */
+        mdc_inode_iatt_invalidate(this, local->fd->inode);
+    }
+
+out:
+    MDC_STACK_UNWIND(open, frame, op_ret, op_errno, fd, xdata);
+    return 0;
+}
+
+/*
+ * open fop: only an O_TRUNC open of a regular file is tracked. In that case
+ * a reference to the fd is kept in the frame-local state for mdc_open_cbk;
+ * otherwise the fop is wound without a local.
+ */
+static int
+mdc_open(call_frame_t *frame, xlator_t *this, loc_t *loc, int flags, fd_t *fd,
+         dict_t *xdata)
+{
+    mdc_local_t *local = NULL;
+
+    if (fd && IA_ISREG(fd->inode->ia_type) && (fd->flags & O_TRUNC)) {
+        local = mdc_local_get(frame, loc->inode);
+        if (local)
+            local->fd = __fd_ref(fd);
+    }
+
+    STACK_WIND(frame, mdc_open_cbk, FIRST_CHILD(this),
+               FIRST_CHILD(this)->fops->open, loc, flags, fd, xdata);
+    return 0;
+}
+
+/*
+ * readv callback: on a non-negative op_ret store stbuf for the fd's inode;
+ * on ENOENT/ESTALE invalidate its cached iatt. Always unwinds.
+ */
+int
+mdc_readv_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret,
+              int32_t op_errno, struct iovec *vector, int32_t count,
+              struct iatt *stbuf, struct iobref *iobref, dict_t *xdata)
+{
+    mdc_local_t *local = frame->local;
+
+    if (local != NULL) {
+        if (op_ret >= 0) {
+            mdc_inode_iatt_set(this, local->fd->inode, stbuf,
+                               local->incident_time);
+        } else if ((op_errno == ESTALE) || (op_errno == ENOENT)) {
+            mdc_inode_iatt_invalidate(this, local->fd->inode);
+        }
+    }
+
+    MDC_STACK_UNWIND(readv, frame, op_ret, op_errno, vector, count, stbuf,
+                     iobref, xdata);
+
+    return 0;
+}
+
+/*
+ * readv fop: keep a reference to the fd in the frame-local state (when a
+ * local can be obtained) for mdc_readv_cbk, then wind to the first child.
+ */
+int
+mdc_readv(call_frame_t *frame, xlator_t *this, fd_t *fd, size_t size,
+          off_t offset, uint32_t flags, dict_t *xdata)
+{
+    mdc_local_t *local = mdc_local_get(frame, fd->inode);
+
+    if (local)
+        local->fd = __fd_ref(fd);
+
+    STACK_WIND(frame, mdc_readv_cbk, FIRST_CHILD(this),
+               FIRST_CHILD(this)->fops->readv, fd, size, offset, flags, xdata);
+    return 0;
+}
+
+/*
+ * writev callback: unless op_ret is -1, feed prebuf/postbuf to
+ * mdc_inode_iatt_set_validate for the fd's inode; on failure with
+ * ENOENT/ESTALE invalidate its cached iatt. Always unwinds.
+ */
+int
+mdc_writev_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+               int32_t op_ret, int32_t op_errno, struct iatt *prebuf,
+               struct iatt *postbuf, dict_t *xdata)
+{
+    mdc_local_t *local = frame->local;
+
+    if (local != NULL) {
+        if (op_ret == -1) {
+            if ((op_errno == ESTALE) || (op_errno == ENOENT)) {
+                mdc_inode_iatt_invalidate(this, local->fd->inode);
+            }
+        } else {
+            mdc_inode_iatt_set_validate(this, local->fd->inode, prebuf,
+                                        postbuf, _gf_true,
+                                        local->incident_time);
+        }
+    }
+
+    MDC_STACK_UNWIND(writev, frame, op_ret, op_errno, prebuf, postbuf, xdata);
+
+    return 0;
+}
+
+/*
+ * writev fop: keep a reference to the fd in the frame-local state (when a
+ * local can be obtained) for mdc_writev_cbk, then wind to the first child.
+ */
+int
+mdc_writev(call_frame_t *frame, xlator_t *this, fd_t *fd, struct iovec *vector,
+           int count, off_t offset, uint32_t flags, struct iobref *iobref,
+           dict_t *xdata)
+{
+    mdc_local_t *local = mdc_local_get(frame, fd->inode);
+
+    if (local)
+        local->fd = __fd_ref(fd);
+
+    STACK_WIND(frame, mdc_writev_cbk, FIRST_CHILD(this),
+               FIRST_CHILD(this)->fops->writev, fd, vector, count, offset,
+               flags, iobref, xdata);
+    return 0;
+}
+
+/*
+ * setattr callback: on success feed prebuf/postbuf to
+ * mdc_inode_iatt_set_validate and update the cached xattrs from the reply
+ * xdata; on any failure (regardless of errno) pass a NULL iatt for the
+ * inode. Always unwinds.
+ */
+int
+mdc_setattr_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+                int32_t op_ret, int32_t op_errno, struct iatt *prebuf,
+                struct iatt *postbuf, dict_t *xdata)
+{
+    mdc_local_t *local = frame->local;
+
+    if (local != NULL) {
+        if (op_ret == 0) {
+            mdc_inode_iatt_set_validate(this, local->loc.inode, prebuf,
+                                        postbuf, _gf_true,
+                                        local->incident_time);
+            mdc_inode_xatt_update(this, local->loc.inode, xdata);
+        } else {
+            mdc_inode_iatt_set(this, local->loc.inode, NULL,
+                               local->incident_time);
+        }
+    }
+
+    MDC_STACK_UNWIND(setattr, frame, op_ret, op_errno, prebuf, postbuf, xdata);
+
+    return 0;
+}
+
+/*
+ * setattr fop: copy the loc into the frame-local state and wind to the first
+ * child. When the mode is being changed (GF_SET_ATTR_MODE) and an ACL cache
+ * option is enabled in the configuration, the matching ACL xattr keys are
+ * added to the request dict with a placeholder value of 0.
+ * NOTE(review): presumably this asks the child to return the resulting ACLs
+ * so that mdc_setattr_cbk can refresh them through mdc_inode_xatt_update --
+ * confirm against the brick-side handling of these keys.
+ */
+int
+mdc_setattr(call_frame_t *frame, xlator_t *this, loc_t *loc, struct iatt *stbuf,
+            int valid, dict_t *xdata)
+{
+    mdc_local_t *local = NULL;
+    /* Non-NULL only when this function created the request dict itself. */
+    dict_t *xattr_alloc = NULL;
+    int ret = 0;
+    struct mdc_conf *conf = this->private;
+
+    local = mdc_local_get(frame, loc->inode);
+    if (local == NULL) {
+        /* No local: wind the request untouched. */
+        goto wind;
+    }
+
+    loc_copy(&local->loc, loc);
+
+    if ((valid & GF_SET_ATTR_MODE) && conf->cache_glusterfs_acl) {
+        if (!xdata)
+            xdata = xattr_alloc = dict_new();
+        if (xdata) {
+            ret = dict_set_int8(xdata, GF_POSIX_ACL_ACCESS, 0);
+            if (!ret)
+                ret = dict_set_int8(xdata, GF_POSIX_ACL_DEFAULT, 0);
+            /* Keys could not be added: drop the cached xattrs instead. */
+            if (ret)
+                mdc_inode_xatt_invalidate(this, local->loc.inode);
+        }
+    }
+
+    if ((valid & GF_SET_ATTR_MODE) && conf->cache_posix_acl) {
+        /* Reuses the dict from the block above if one exists by now. */
+        if (!xdata)
+            xdata = xattr_alloc = dict_new();
+        if (xdata) {
+            ret = dict_set_int8(xdata, POSIX_ACL_ACCESS_XATTR, 0);
+            if (!ret)
+                ret = dict_set_int8(xdata, POSIX_ACL_DEFAULT_XATTR, 0);
+            if (ret)
+                mdc_inode_xatt_invalidate(this, local->loc.inode);
+        }
+    }
+
+wind:
+    STACK_WIND(frame, mdc_setattr_cbk, FIRST_CHILD(this),
+               FIRST_CHILD(this)->fops->setattr, loc, stbuf, valid, xdata);
+
+    /* Release our reference on a dict created above, if any. */
+    if (xattr_alloc)
+        dict_unref(xattr_alloc);
+    return 0;
+}
+
+/*
+ * fsetattr callback: on success feed prebuf/postbuf to
+ * mdc_inode_iatt_set_validate and update the cached xattrs from the reply
+ * xdata; on ESTALE/ENOENT invalidate the cached iatt. Always unwinds.
+ */
+int
+mdc_fsetattr_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+                 int32_t op_ret, int32_t op_errno, struct iatt *prebuf,
+                 struct iatt *postbuf, dict_t *xdata)
+{
+    mdc_local_t *local = frame->local;
+
+    if (local != NULL) {
+        if (op_ret == 0) {
+            mdc_inode_iatt_set_validate(this, local->fd->inode, prebuf,
+                                        postbuf, _gf_true,
+                                        local->incident_time);
+            mdc_inode_xatt_update(this, local->fd->inode, xdata);
+        } else if ((op_errno == ENOENT) || (op_errno == ESTALE)) {
+            mdc_inode_iatt_invalidate(this, local->fd->inode);
+        }
+    }
+
+    MDC_STACK_UNWIND(fsetattr, frame, op_ret, op_errno, prebuf, postbuf, xdata);
+
+    return 0;
+}
+
+/*
+ * fsetattr fop: keep a reference to the fd in the frame-local state and wind
+ * to the first child. When the mode is being changed (GF_SET_ATTR_MODE) and
+ * an ACL cache option is enabled in the configuration, the matching ACL
+ * xattr keys are added to the request dict with a placeholder value of 0.
+ * NOTE(review): presumably this asks the child to return the resulting ACLs
+ * so that mdc_fsetattr_cbk can refresh them through mdc_inode_xatt_update --
+ * confirm against the brick-side handling of these keys.
+ */
+int
+mdc_fsetattr(call_frame_t *frame, xlator_t *this, fd_t *fd, struct iatt *stbuf,
+             int valid, dict_t *xdata)
+{
+    mdc_local_t *local = NULL;
+    /* Non-NULL only when this function created the request dict itself. */
+    dict_t *xattr_alloc = NULL;
+    int ret = 0;
+    struct mdc_conf *conf = this->private;
+
+    local = mdc_local_get(frame, fd->inode);
+    if (local == NULL) {
+        /* No local: wind the request untouched. */
+        goto wind;
+    }
+
+    local->fd = __fd_ref(fd);
+
+    if ((valid & GF_SET_ATTR_MODE) && conf->cache_glusterfs_acl) {
+        if (!xdata)
+            xdata = xattr_alloc = dict_new();
+        if (xdata) {
+            ret = dict_set_int8(xdata, GF_POSIX_ACL_ACCESS, 0);
+            if (!ret)
+                ret = dict_set_int8(xdata, GF_POSIX_ACL_DEFAULT, 0);
+            /* Keys could not be added: drop the cached xattrs instead. */
+            if (ret)
+                mdc_inode_xatt_invalidate(this, local->fd->inode);
+        }
+    }
+
+    if ((valid & GF_SET_ATTR_MODE) && conf->cache_posix_acl) {
+        /* Reuses the dict from the block above if one exists by now. */
+        if (!xdata)
+            xdata = xattr_alloc = dict_new();
+        if (xdata) {
+            ret = dict_set_int8(xdata, POSIX_ACL_ACCESS_XATTR, 0);
+            if (!ret)
+                ret = dict_set_int8(xdata, POSIX_ACL_DEFAULT_XATTR, 0);
+            if (ret)
+                mdc_inode_xatt_invalidate(this, local->fd->inode);
+        }
+    }
+
+wind:
+    STACK_WIND(frame, mdc_fsetattr_cbk, FIRST_CHILD(this),
+               FIRST_CHILD(this)->fops->fsetattr, fd, stbuf, valid, xdata);
+
+    /* Release our reference on a dict created above, if any. */
+    if (xattr_alloc)
+        dict_unref(xattr_alloc);
+    return 0;
+}
+
+/*
+ * fsync callback: on success feed prebuf/postbuf to
+ * mdc_inode_iatt_set_validate for the fd's inode; on ENOENT/ESTALE
+ * invalidate its cached iatt. Always unwinds.
+ */
+int
+mdc_fsync_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret,
+              int32_t op_errno, struct iatt *prebuf, struct iatt *postbuf,
+              dict_t *xdata)
+{
+    mdc_local_t *local = frame->local;
+
+    if (local != NULL) {
+        if (op_ret == 0) {
+            mdc_inode_iatt_set_validate(this, local->fd->inode, prebuf,
+                                        postbuf, _gf_true,
+                                        local->incident_time);
+        } else if ((op_errno == ESTALE) || (op_errno == ENOENT)) {
+            mdc_inode_iatt_invalidate(this, local->fd->inode);
+        }
+    }
+
+    MDC_STACK_UNWIND(fsync, frame, op_ret, op_errno, prebuf, postbuf, xdata);
+
+    return 0;
+}
+
+/*
+ * fsync fop: keep a reference to the fd in the frame-local state (when a
+ * local can be obtained) for mdc_fsync_cbk, then wind to the first child.
+ */
+int
+mdc_fsync(call_frame_t *frame, xlator_t *this, fd_t *fd, int datasync,
+          dict_t *xdata)
+{
+    mdc_local_t *local = mdc_local_get(frame, fd->inode);
+
+    if (local)
+        local->fd = __fd_ref(fd);
+
+    STACK_WIND(frame, mdc_fsync_cbk, FIRST_CHILD(this),
+               FIRST_CHILD(this)->fops->fsync, fd, datasync, xdata);
+    return 0;
+}
+
+/*
+ * setxattr callback: on success merge the xattrs saved by mdc_setxattr into
+ * the cache and, if the reply xdata carries GF_PRESTAT, feed the pre/post
+ * iatts to mdc_inode_iatt_set_validate. If either iatt lookup fails the
+ * cached iatt is invalidated. On ENOENT/ESTALE the cached iatt is
+ * invalidated as well. Always unwinds.
+ */
+int
+mdc_setxattr_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+                 int32_t op_ret, int32_t op_errno, dict_t *xdata)
+{
+    mdc_local_t *local = frame->local;
+    struct iatt pre = {
+        0,
+    };
+    struct iatt post = {
+        0,
+    };
+    int rc = 0;
+
+    if (local == NULL)
+        goto unwind;
+
+    if (op_ret != 0) {
+        if ((op_errno == ESTALE) || (op_errno == ENOENT))
+            mdc_inode_iatt_invalidate(this, local->loc.inode);
+        goto unwind;
+    }
+
+    mdc_inode_xatt_update(this, local->loc.inode, local->xattr);
+
+    rc = dict_get_iatt(xdata, GF_PRESTAT, &pre);
+    if (rc >= 0) {
+        /* rc now reflects the POSTSTAT lookup; it is checked below. */
+        rc = dict_get_iatt(xdata, GF_POSTSTAT, &post);
+        mdc_inode_iatt_set_validate(this, local->loc.inode, &pre, &post,
+                                    _gf_true, local->incident_time);
+    }
+
+    if (rc < 0)
+        mdc_inode_iatt_invalidate(this, local->loc.inode);
+
+unwind:
+    MDC_STACK_UNWIND(setxattr, frame, op_ret, op_errno, xdata);
+
+    return 0;
+}
+
+/*
+ * setxattr fop: copy the loc and take a reference on the xattr dict being
+ * set into the frame-local state (when a local can be obtained), then wind
+ * to the first child.
+ */
+int
+mdc_setxattr(call_frame_t *frame, xlator_t *this, loc_t *loc, dict_t *xattr,
+             int flags, dict_t *xdata)
+{
+    mdc_local_t *local = mdc_local_get(frame, loc->inode);
+
+    if (local) {
+        local->xattr = dict_ref(xattr);
+        loc_copy(&local->loc, loc);
+    }
+
+    STACK_WIND(frame, mdc_setxattr_cbk, FIRST_CHILD(this),
+               FIRST_CHILD(this)->fops->setxattr, loc, xattr, flags, xdata);
+
+    return 0;
+}
+
+/*
+ * fsetxattr callback: on success merge the xattrs saved by mdc_fsetxattr
+ * into the cache and, if the reply xdata carries GF_PRESTAT, feed the
+ * pre/post iatts to mdc_inode_iatt_set_validate. If either iatt lookup fails
+ * the cached iatt is invalidated. On ESTALE/ENOENT the cached iatt is
+ * invalidated as well. Always unwinds.
+ */
+int
+mdc_fsetxattr_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+                  int32_t op_ret, int32_t op_errno, dict_t *xdata)
+{
+    mdc_local_t *local = frame->local;
+    struct iatt pre = {
+        0,
+    };
+    struct iatt post = {
+        0,
+    };
+    int rc = 0;
+
+    if (local == NULL)
+        goto unwind;
+
+    if (op_ret != 0) {
+        if ((op_errno == ENOENT) || (op_errno == ESTALE))
+            mdc_inode_iatt_invalidate(this, local->fd->inode);
+        goto unwind;
+    }
+
+    mdc_inode_xatt_update(this, local->fd->inode, local->xattr);
+
+    rc = dict_get_iatt(xdata, GF_PRESTAT, &pre);
+    if (rc >= 0) {
+        /* rc now reflects the POSTSTAT lookup; it is checked below. */
+        rc = dict_get_iatt(xdata, GF_POSTSTAT, &post);
+        mdc_inode_iatt_set_validate(this, local->fd->inode, &pre, &post,
+                                    _gf_true, local->incident_time);
+    }
+
+    if (rc < 0)
+        mdc_inode_iatt_invalidate(this, local->fd->inode);
+
+unwind:
+    MDC_STACK_UNWIND(fsetxattr, frame, op_ret, op_errno, xdata);
+
+    return 0;
+}
+
+/*
+ * fsetxattr fop: keep references to the fd and to the xattr dict being set
+ * in the frame-local state (when a local can be obtained), then wind to the
+ * first child.
+ */
+int
+mdc_fsetxattr(call_frame_t *frame, xlator_t *this, fd_t *fd, dict_t *xattr,
+              int flags, dict_t *xdata)
+{
+    mdc_local_t *local = mdc_local_get(frame, fd->inode);
+
+    if (local) {
+        local->xattr = dict_ref(xattr);
+        local->fd = __fd_ref(fd);
+    }
+
+    STACK_WIND(frame, mdc_fsetxattr_cbk, FIRST_CHILD(this),
+               FIRST_CHILD(this)->fops->fsetxattr, fd, xattr, flags, xdata);
+
+    return 0;
+}
+
+/*
+ * getxattr callback: on failure with ENOENT/ESTALE invalidate the cached
+ * iatt. On success, skip the cache update when the reply carries
+ * "glusterfs.skip-cache"; otherwise, if local->update_cache is set, store
+ * the reply xdata as the inode's cached xattrs. Always unwinds.
+ */
+int
+mdc_getxattr_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+                 int32_t op_ret, int32_t op_errno, dict_t *xattr, dict_t *xdata)
+{
+    mdc_local_t *local = frame->local;
+
+    if (local != NULL) {
+        if (op_ret < 0) {
+            if ((op_errno == ESTALE) || (op_errno == ENOENT)) {
+                mdc_inode_iatt_invalidate(this, local->loc.inode);
+            }
+        } else if (dict_get(xattr, "glusterfs.skip-cache")) {
+            gf_msg(this->name, GF_LOG_DEBUG, 0, 0,
+                   "Skipping xattr update due to empty value");
+        } else if (local->update_cache) {
+            mdc_inode_xatt_set(this, local->loc.inode, xdata);
+        }
+    }
+
+    MDC_STACK_UNWIND(getxattr, frame, op_ret, op_errno, xattr, xdata);
+
+    return 0;
+}
+
+/*
+ * getxattr fop: answer from the cached xattrs when possible, otherwise wind
+ * to the first child.
+ *
+ * Cache hit (key accepted by is_mdc_key_satisfied and mdc_inode_xatt_get
+ * returned 0): the frame is unwound right here with the cached dict. If the
+ * key is absent from that dict the reply is -1/ENODATA.
+ *
+ * Cache miss ("uncached"): the request is wound. When the key was accepted,
+ * xdata is first replaced by the result of mdc_prepare_request and that
+ * result is unref'd after the wind.
+ */
+int
+mdc_getxattr(call_frame_t *frame, xlator_t *this, loc_t *loc, const char *key,
+             dict_t *xdata)
+{
+    int ret;
+    int op_errno = ENODATA;
+    mdc_local_t *local = NULL;
+    dict_t *xattr = NULL;
+    struct mdc_conf *conf = this->private;
+    /* True only after the key passed is_mdc_key_satisfied; a NULL local
+     * therefore never reaches mdc_prepare_request from here. */
+    gf_boolean_t key_satisfied = _gf_false;
+
+    local = mdc_local_get(frame, loc->inode);
+    if (!local) {
+        goto uncached;
+    }
+
+    loc_copy(&local->loc, loc);
+
+    if (!is_mdc_key_satisfied(this, key)) {
+        goto uncached;
+    }
+    key_satisfied = _gf_true;
+
+    ret = mdc_inode_xatt_get(this, loc->inode, &xattr);
+    if (ret != 0)
+        goto uncached;
+
+    /* The cache is usable but does not hold this key: report ENODATA. */
+    if (!xattr || !dict_get(xattr, (char *)key)) {
+        ret = -1;
+        op_errno = ENODATA;
+    }
+
+    GF_ATOMIC_INC(conf->mdc_counter.xattr_hit);
+    MDC_STACK_UNWIND(getxattr, frame, ret, op_errno, xattr, xdata);
+
+    /* Drop the reference on the dict handed out by mdc_inode_xatt_get. */
+    if (xattr)
+        dict_unref(xattr);
+
+    return 0;
+
+uncached:
+    if (key_satisfied) {
+        xdata = mdc_prepare_request(this, local, xdata);
+    }
+
+    GF_ATOMIC_INC(conf->mdc_counter.xattr_miss);
+    STACK_WIND(frame, mdc_getxattr_cbk, FIRST_CHILD(this),
+               FIRST_CHILD(this)->fops->getxattr, loc, key, xdata);
+
+    /* Only unref the dict returned by mdc_prepare_request, never the
+     * caller's original xdata. */
+    if (key_satisfied && (xdata != NULL)) {
+        dict_unref(xdata);
+    }
+
+    return 0;
+}
+
+/*
+ * fgetxattr callback: on failure with ENOENT/ESTALE invalidate the cached
+ * iatt. On success, skip the cache update when the reply carries
+ * "glusterfs.skip-cache"; otherwise, if local->update_cache is set, store
+ * the reply xdata as the inode's cached xattrs. Always unwinds.
+ */
+int
+mdc_fgetxattr_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+                  int32_t op_ret, int32_t op_errno, dict_t *xattr,
+                  dict_t *xdata)
+{
+    mdc_local_t *local = frame->local;
+
+    if (local != NULL) {
+        if (op_ret < 0) {
+            if ((op_errno == ESTALE) || (op_errno == ENOENT)) {
+                mdc_inode_iatt_invalidate(this, local->fd->inode);
+            }
+        } else if (dict_get(xattr, "glusterfs.skip-cache")) {
+            gf_msg(this->name, GF_LOG_DEBUG, 0, 0,
+                   "Skipping xattr update due to empty value");
+        } else if (local->update_cache) {
+            mdc_inode_xatt_set(this, local->fd->inode, xdata);
+        }
+    }
+
+    MDC_STACK_UNWIND(fgetxattr, frame, op_ret, op_errno, xattr, xdata);
+
+    return 0;
+}
+
+/*
+ * fgetxattr fop: answer from the cached xattrs when possible, otherwise wind
+ * to the first child.
+ *
+ * Cache hit (key accepted by is_mdc_key_satisfied and mdc_inode_xatt_get
+ * returned 0): the frame is unwound right here with the cached dict. If the
+ * key is absent from that dict the reply is -1/ENODATA.
+ *
+ * Cache miss ("uncached"): the request is wound. When the key was accepted,
+ * xdata is first replaced by the result of mdc_prepare_request and that
+ * result is unref'd after the wind.
+ */
+int
+mdc_fgetxattr(call_frame_t *frame, xlator_t *this, fd_t *fd, const char *key,
+              dict_t *xdata)
+{
+    int ret;
+    mdc_local_t *local = NULL;
+    dict_t *xattr = NULL;
+    int op_errno = ENODATA;
+    struct mdc_conf *conf = this->private;
+    /* Starts false and is raised only after the key check, exactly as in
+     * mdc_getxattr. This keeps a NULL local (mdc_local_get failure) and an
+     * unchecked key away from mdc_prepare_request. */
+    gf_boolean_t key_satisfied = _gf_false;
+
+    local = mdc_local_get(frame, fd->inode);
+    if (!local)
+        goto uncached;
+
+    local->fd = __fd_ref(fd);
+
+    if (!is_mdc_key_satisfied(this, key)) {
+        goto uncached;
+    }
+    key_satisfied = _gf_true;
+
+    ret = mdc_inode_xatt_get(this, fd->inode, &xattr);
+    if (ret != 0)
+        goto uncached;
+
+    /* The cache is usable but does not hold this key: report ENODATA. */
+    if (!xattr || !dict_get(xattr, (char *)key)) {
+        ret = -1;
+        op_errno = ENODATA;
+    }
+
+    GF_ATOMIC_INC(conf->mdc_counter.xattr_hit);
+    MDC_STACK_UNWIND(fgetxattr, frame, ret, op_errno, xattr, xdata);
+
+    /* Drop the reference on the dict handed out by mdc_inode_xatt_get. */
+    if (xattr)
+        dict_unref(xattr);
+
+    return 0;
+
+uncached:
+    if (key_satisfied) {
+        xdata = mdc_prepare_request(this, local, xdata);
+    }
+
+    GF_ATOMIC_INC(conf->mdc_counter.xattr_miss);
+    STACK_WIND(frame, mdc_fgetxattr_cbk, FIRST_CHILD(this),
+               FIRST_CHILD(this)->fops->fgetxattr, fd, key, xdata);
+
+    /* Only unref the dict returned by mdc_prepare_request, never the
+     * caller's original xdata. */
+    if (key_satisfied && (xdata != NULL)) {
+        dict_unref(xdata);
+    }
+
+    return 0;
+}
+
+/*
+ * removexattr callback: on success drop the removed key from the cached
+ * xattrs (or invalidate them all when no key was saved) and, if the reply
+ * xdata carries GF_PRESTAT, feed the pre/post iatts to
+ * mdc_inode_iatt_set_validate. If either iatt lookup fails the cached iatt
+ * is invalidated. On ENOENT/ESTALE the cached iatt is invalidated as well.
+ * Always unwinds.
+ */
+int
+mdc_removexattr_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+                    int32_t op_ret, int32_t op_errno, dict_t *xdata)
+{
+    mdc_local_t *local = frame->local;
+    struct iatt pre = {
+        0,
+    };
+    struct iatt post = {
+        0,
+    };
+    int rc = 0;
+
+    if (local == NULL)
+        goto unwind;
+
+    if (op_ret != 0) {
+        if ((op_errno == ESTALE) || (op_errno == ENOENT))
+            mdc_inode_iatt_invalidate(this, local->loc.inode);
+        goto unwind;
+    }
+
+    if (local->key == NULL)
+        mdc_inode_xatt_invalidate(this, local->loc.inode);
+    else
+        mdc_inode_xatt_unset(this, local->loc.inode, local->key);
+
+    rc = dict_get_iatt(xdata, GF_PRESTAT, &pre);
+    if (rc >= 0) {
+        /* rc now reflects the POSTSTAT lookup; it is checked below. */
+        rc = dict_get_iatt(xdata, GF_POSTSTAT, &post);
+        mdc_inode_iatt_set_validate(this, local->loc.inode, &pre, &post,
+                                    _gf_true, local->incident_time);
+    }
+
+    if (rc < 0)
+        mdc_inode_iatt_invalidate(this, local->loc.inode);
+
+unwind:
+    MDC_STACK_UNWIND(removexattr, frame, op_ret, op_errno, xdata);
+
+    return 0;
+}
+
+/*
+ * removexattr fop: save the loc and a private copy of the key name for
+ * mdc_removexattr_cbk, then either short-circuit or wind.
+ *
+ * When the key is accepted by is_mdc_key_satisfied and the cached xattrs can
+ * be fetched, a key missing from that cache is answered locally with
+ * -1/ENODATA; a key present in the cache is wound to the first child. In
+ * every other case ("uncached") the request is wound and counted as an
+ * xattr miss.
+ */
+int
+mdc_removexattr(call_frame_t *frame, xlator_t *this, loc_t *loc,
+                const char *name, dict_t *xdata)
+{
+    mdc_local_t *local = NULL;
+    int op_errno = ENODATA;
+    int ret = 0;
+    dict_t *xattr = NULL;
+    struct mdc_conf *conf = this->private;
+    char *name2;
+
+    /* Private copy of the key; stored in local->key further down. */
+    name2 = gf_strdup(name);
+    if (name2 == NULL) {
+        goto uncached;
+    }
+
+    local = mdc_local_get(frame, loc->inode);
+    if (local == NULL) {
+        /* No local to hand the copy to, so free it here. */
+        GF_FREE(name2);
+        goto uncached;
+    }
+
+    loc_copy(&local->loc, loc);
+    local->key = name2;
+
+    if (!is_mdc_key_satisfied(this, name))
+        goto uncached;
+
+    ret = mdc_inode_xatt_get(this, loc->inode, &xattr);
+    if (ret != 0)
+        goto uncached;
+
+    GF_ATOMIC_INC(conf->mdc_counter.xattr_hit);
+
+    if (!xattr || !dict_get(xattr, (char *)name)) {
+        /* The cache says the key is not set: reply without winding. */
+        ret = -1;
+        op_errno = ENODATA;
+
+        MDC_STACK_UNWIND(removexattr, frame, ret, op_errno, xdata);
+    } else {
+        STACK_WIND(frame, mdc_removexattr_cbk, FIRST_CHILD(this),
+                   FIRST_CHILD(this)->fops->removexattr, loc, name, xdata);
+    }
+
+    /* Drop the reference on the dict handed out by mdc_inode_xatt_get. */
+    if (xattr)
+        dict_unref(xattr);
+
+    return 0;
+
+uncached:
+    GF_ATOMIC_INC(conf->mdc_counter.xattr_miss);
+    STACK_WIND(frame, mdc_removexattr_cbk, FIRST_CHILD(this),
+               FIRST_CHILD(this)->fops->removexattr, loc, name, xdata);
+    return 0;
+}
+
+/*
+ * fremovexattr callback: on success drop the removed key from the cached
+ * xattrs (or invalidate them all when no key was saved) and, if the reply
+ * xdata carries GF_PRESTAT, feed the pre/post iatts to
+ * mdc_inode_iatt_set_validate. If either iatt lookup fails the cached iatt
+ * is invalidated. On ENOENT/ESTALE the cached iatt is invalidated as well.
+ * Always unwinds.
+ */
+int
+mdc_fremovexattr_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+                     int32_t op_ret, int32_t op_errno, dict_t *xdata)
+{
+    mdc_local_t *local = frame->local;
+    struct iatt pre = {
+        0,
+    };
+    struct iatt post = {
+        0,
+    };
+    int rc = 0;
+
+    if (local == NULL)
+        goto unwind;
+
+    if (op_ret != 0) {
+        if ((op_errno == ESTALE) || (op_errno == ENOENT))
+            mdc_inode_iatt_invalidate(this, local->fd->inode);
+        goto unwind;
+    }
+
+    if (local->key == NULL)
+        mdc_inode_xatt_invalidate(this, local->fd->inode);
+    else
+        mdc_inode_xatt_unset(this, local->fd->inode, local->key);
+
+    rc = dict_get_iatt(xdata, GF_PRESTAT, &pre);
+    if (rc >= 0) {
+        /* rc now reflects the POSTSTAT lookup; it is checked below. */
+        rc = dict_get_iatt(xdata, GF_POSTSTAT, &post);
+        mdc_inode_iatt_set_validate(this, local->fd->inode, &pre, &post,
+                                    _gf_true, local->incident_time);
+    }
+
+    if (rc < 0)
+        mdc_inode_iatt_invalidate(this, local->fd->inode);
+
+unwind:
+    MDC_STACK_UNWIND(fremovexattr, frame, op_ret, op_errno, xdata);
+
+    return 0;
+}
+
+/*
+ * fremovexattr fop (fd based).
+ *
+ * A private copy of @name is stored in local->key and a reference on @fd is
+ * stored in local->fd for use by mdc_fremovexattr_cbk(). When
+ * is_mdc_key_satisfied() accepts the key and mdc_inode_xatt_get() returns 0,
+ * the request is counted as an xattr cache hit: if the returned dict is
+ * missing or holds no entry for @name the fop is answered here with
+ * -1/ENODATA, otherwise it is wound to the first child. Every other path is
+ * counted as an xattr cache miss and wound to the first child.
+ *
+ * Always returns 0; the fop result travels through the unwind.
+ */
+int
+mdc_fremovexattr(call_frame_t *frame, xlator_t *this, fd_t *fd,
+                 const char *name, dict_t *xdata)
+{
+    mdc_local_t *local = NULL;
+    int op_errno = ENODATA;
+    int ret = 0;
+    dict_t *xattr = NULL;
+    struct mdc_conf *conf = this->private;
+    char *name2;
+
+    /* Duplicate the key; the copy is handed to local->key below. */
+    name2 = gf_strdup(name);
+    if (name2 == NULL) {
+        goto uncached;
+    }
+
+    local = mdc_local_get(frame, fd->inode);
+    if (local == NULL) {
+        /* No local to hand the copy to, so release it here. */
+        GF_FREE(name2);
+        goto uncached;
+    }
+
+    local->fd = __fd_ref(fd);
+    local->key = name2;
+
+    /* Key rejected by is_mdc_key_satisfied(): skip the cache lookup. */
+    if (!is_mdc_key_satisfied(this, name))
+        goto uncached;
+
+    /* Non-zero return: no usable cached xattrs, let the child decide. */
+    ret = mdc_inode_xatt_get(this, fd->inode, &xattr);
+    if (ret != 0)
+        goto uncached;
+
+    GF_ATOMIC_INC(conf->mdc_counter.xattr_hit);
+
+    if (!xattr || !dict_get(xattr, (char *)name)) {
+        /* The cached dict has no such key: answer ENODATA without winding. */
+        ret = -1;
+        op_errno = ENODATA;
+
+        MDC_STACK_UNWIND(fremovexattr, frame, ret, op_errno, xdata);
+    } else {
+        STACK_WIND(frame, mdc_fremovexattr_cbk, FIRST_CHILD(this),
+                   FIRST_CHILD(this)->fops->fremovexattr, fd, name, xdata);
+    }
+
+    /* Drop the reference obtained through mdc_inode_xatt_get(). */
+    if (xattr)
+        dict_unref(xattr);
+
+    return 0;
+
+uncached:
+    GF_ATOMIC_INC(conf->mdc_counter.xattr_miss);
+    STACK_WIND(frame, mdc_fremovexattr_cbk, FIRST_CHILD(this),
+               FIRST_CHILD(this)->fops->fremovexattr, fd, name, xdata);
+    return 0;
+}
+
+/*
+ * opendir callback: when the fop failed with ESTALE or ENOENT and a local is
+ * attached, mdc_inode_iatt_invalidate() is called on local->loc.inode. The
+ * reply is forwarded unchanged in every case.
+ */
+int32_t
+mdc_opendir_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+                int32_t op_ret, int32_t op_errno, fd_t *fd, dict_t *xdata)
+{
+    mdc_local_t *local = frame->local;
+
+    if (local && (op_ret != 0) &&
+        ((op_errno == ENOENT) || (op_errno == ESTALE)))
+        mdc_inode_iatt_invalidate(this, local->loc.inode);
+
+    MDC_STACK_UNWIND(opendir, frame, op_ret, op_errno, fd, xdata);
+    return 0;
+}
+
+/*
+ * opendir fop: records @loc in the frame local (when one could be obtained),
+ * lets mdc_prepare_request() build the request xdata and winds to the first
+ * child. The xdata returned by mdc_prepare_request() is unref'd after the
+ * wind. Always returns 0.
+ */
+int
+mdc_opendir(call_frame_t *frame, xlator_t *this, loc_t *loc, fd_t *fd,
+            dict_t *xdata)
+{
+    mdc_local_t *local = mdc_local_get(frame, loc->inode);
+
+    if (local)
+        loc_copy(&local->loc, loc);
+
+    /* Tell readdir-ahead to include these keys in xdata when it
+     * internally issues readdirp() in its opendir_cbk */
+    xdata = mdc_prepare_request(this, local, xdata);
+
+    STACK_WIND(frame, mdc_opendir_cbk, FIRST_CHILD(this),
+               FIRST_CHILD(this)->fops->opendir, loc, fd, xdata);
+
+    if (xdata)
+        dict_unref(xdata);
+
+    return 0;
+}
+
+/*
+ * readdirp callback.
+ *
+ * With a positive op_ret every returned entry that carries an inode has its
+ * stat stored through mdc_inode_iatt_set(), and its xattr dict stored through
+ * mdc_inode_xatt_set() when local->update_cache is set. With op_ret == -1
+ * and ENOENT/ESTALE the iatt of the directory inode (local->fd->inode) is
+ * invalidated. The reply is forwarded unchanged in every case.
+ */
+int
+mdc_readdirp_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int op_ret,
+                 int op_errno, gf_dirent_t *entries, dict_t *xdata)
+{
+    mdc_local_t *local = frame->local;
+    gf_dirent_t *dent = NULL;
+
+    if (local == NULL)
+        goto done;
+
+    if (op_ret > 0) {
+        list_for_each_entry(dent, &entries->list, list)
+        {
+            if (dent->inode) {
+                mdc_inode_iatt_set(this, dent->inode, &dent->d_stat,
+                                   local->incident_time);
+                if (local->update_cache)
+                    mdc_inode_xatt_set(this, dent->inode, dent->dict);
+            }
+        }
+    } else if ((op_ret == -1) &&
+               ((op_errno == ESTALE) || (op_errno == ENOENT))) {
+        mdc_inode_iatt_invalidate(this, local->fd->inode);
+    }
+
+done:
+    MDC_STACK_UNWIND(readdirp, frame, op_ret, op_errno, entries, xdata);
+    return 0;
+}
+
+/*
+ * readdirp fop: fails with ENOMEM when no frame local can be obtained;
+ * otherwise keeps a reference on @fd in the local, lets
+ * mdc_prepare_request() build the request xdata, winds to the first child
+ * and unrefs that xdata afterwards. Always returns 0.
+ */
+int
+mdc_readdirp(call_frame_t *frame, xlator_t *this, fd_t *fd, size_t size,
+             off_t offset, dict_t *xdata)
+{
+    mdc_local_t *local = mdc_local_get(frame, fd->inode);
+
+    if (local == NULL) {
+        MDC_STACK_UNWIND(readdirp, frame, -1, ENOMEM, NULL, NULL);
+        return 0;
+    }
+
+    local->fd = __fd_ref(fd);
+    xdata = mdc_prepare_request(this, local, xdata);
+
+    STACK_WIND(frame, mdc_readdirp_cbk, FIRST_CHILD(this),
+               FIRST_CHILD(this)->fops->readdirp, fd, size, offset, xdata);
+
+    if (xdata)
+        dict_unref(xdata);
+
+    return 0;
+}
+
+/*
+ * readdir callback: when op_ret is non-zero, the errno is ESTALE or ENOENT
+ * and a local is attached, mdc_inode_iatt_invalidate() is called on
+ * local->fd->inode. The reply is forwarded unchanged in every case.
+ */
+int
+mdc_readdir_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int op_ret,
+                int op_errno, gf_dirent_t *entries, dict_t *xdata)
+{
+    mdc_local_t *local = frame->local;
+
+    if (local && (op_ret != 0) &&
+        ((op_errno == ENOENT) || (op_errno == ESTALE)))
+        mdc_inode_iatt_invalidate(this, local->fd->inode);
+
+    MDC_STACK_UNWIND(readdir, frame, op_ret, op_errno, entries, xdata);
+    return 0;
+}
+
+/*
+ * readdir fop: fails with ENOMEM when no frame local can be obtained.
+ * Otherwise a reference on @fd is kept in the local and the request is
+ * either wound as a plain readdir, or - when conf->force_readdirp is set -
+ * converted into a readdirp (with xdata from mdc_prepare_request()) whose
+ * reply is handled by mdc_readdirp_cbk(). Always returns 0.
+ */
+int
+mdc_readdir(call_frame_t *frame, xlator_t *this, fd_t *fd, size_t size,
+            off_t offset, dict_t *xdata)
+{
+    struct mdc_conf *conf = this->private;
+    mdc_local_t *local = mdc_local_get(frame, fd->inode);
+
+    if (local == NULL) {
+        MDC_STACK_UNWIND(readdir, frame, -1, ENOMEM, NULL, NULL);
+        return 0;
+    }
+
+    local->fd = __fd_ref(fd);
+
+    if (conf->force_readdirp) {
+        xdata = mdc_prepare_request(this, local, xdata);
+
+        STACK_WIND(frame, mdc_readdirp_cbk, FIRST_CHILD(this),
+                   FIRST_CHILD(this)->fops->readdirp, fd, size, offset, xdata);
+
+        if (xdata)
+            dict_unref(xdata);
+    } else {
+        STACK_WIND(frame, mdc_readdir_cbk, FIRST_CHILD(this),
+                   FIRST_CHILD(this)->fops->readdir, fd, size, offset, xdata);
+    }
+
+    return 0;
+}
+
+/*
+ * fallocate callback: on success the pre/post stat is passed to
+ * mdc_inode_iatt_set_validate() for local->fd->inode; on ENOENT/ESTALE the
+ * iatt of that inode is invalidated. Without a local nothing is touched.
+ * The reply is forwarded unchanged in every case.
+ */
+int
+mdc_fallocate_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+                  int32_t op_ret, int32_t op_errno, struct iatt *prebuf,
+                  struct iatt *postbuf, dict_t *xdata)
+{
+    mdc_local_t *local = frame->local;
+
+    if (local != NULL) {
+        if (op_ret == 0)
+            mdc_inode_iatt_set_validate(this, local->fd->inode, prebuf,
+                                        postbuf, _gf_true,
+                                        local->incident_time);
+        else if ((op_errno == ESTALE) || (op_errno == ENOENT))
+            mdc_inode_iatt_invalidate(this, local->fd->inode);
+    }
+
+    MDC_STACK_UNWIND(fallocate, frame, op_ret, op_errno, prebuf, postbuf,
+                     xdata);
+
+    return 0;
+}
+
+/*
+ * fallocate fop: keeps a reference on @fd in the frame local when one could
+ * be obtained, then winds to the first child either way. Always returns 0.
+ */
+int
+mdc_fallocate(call_frame_t *frame, xlator_t *this, fd_t *fd, int32_t mode,
+              off_t offset, size_t len, dict_t *xdata)
+{
+    mdc_local_t *local = mdc_local_get(frame, fd->inode);
+
+    if (local)
+        local->fd = __fd_ref(fd);
+
+    STACK_WIND(frame, mdc_fallocate_cbk, FIRST_CHILD(this),
+               FIRST_CHILD(this)->fops->fallocate, fd, mode, offset, len,
+               xdata);
+
+    return 0;
+}
+
+/*
+ * discard callback: on success the pre/post stat is passed to
+ * mdc_inode_iatt_set_validate() for local->fd->inode; on ENOENT/ESTALE the
+ * iatt of that inode is invalidated. Without a local nothing is touched.
+ * The reply is forwarded unchanged in every case.
+ */
+int
+mdc_discard_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+                int32_t op_ret, int32_t op_errno, struct iatt *prebuf,
+                struct iatt *postbuf, dict_t *xdata)
+{
+    mdc_local_t *local = frame->local;
+
+    if (local != NULL) {
+        if (op_ret == 0)
+            mdc_inode_iatt_set_validate(this, local->fd->inode, prebuf,
+                                        postbuf, _gf_true,
+                                        local->incident_time);
+        else if ((op_errno == ESTALE) || (op_errno == ENOENT))
+            mdc_inode_iatt_invalidate(this, local->fd->inode);
+    }
+
+    MDC_STACK_UNWIND(discard, frame, op_ret, op_errno, prebuf, postbuf, xdata);
+
+    return 0;
+}
+
+/*
+ * discard fop: keeps a reference on @fd in the frame local when one could
+ * be obtained, then winds to the first child either way. Always returns 0.
+ */
+int
+mdc_discard(call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset,
+            size_t len, dict_t *xdata)
+{
+    mdc_local_t *local = mdc_local_get(frame, fd->inode);
+
+    if (local)
+        local->fd = __fd_ref(fd);
+
+    STACK_WIND(frame, mdc_discard_cbk, FIRST_CHILD(this),
+               FIRST_CHILD(this)->fops->discard, fd, offset, len, xdata);
+
+    return 0;
+}
+
+/*
+ * zerofill callback: on success the pre/post stat is passed to
+ * mdc_inode_iatt_set_validate() for local->fd->inode; on ENOENT/ESTALE the
+ * iatt of that inode is invalidated. Without a local nothing is touched.
+ * The reply is forwarded unchanged in every case.
+ */
+int
+mdc_zerofill_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+                 int32_t op_ret, int32_t op_errno, struct iatt *prebuf,
+                 struct iatt *postbuf, dict_t *xdata)
+{
+    mdc_local_t *local = frame->local;
+
+    if (local != NULL) {
+        if (op_ret == 0)
+            mdc_inode_iatt_set_validate(this, local->fd->inode, prebuf,
+                                        postbuf, _gf_true,
+                                        local->incident_time);
+        else if ((op_errno == ESTALE) || (op_errno == ENOENT))
+            mdc_inode_iatt_invalidate(this, local->fd->inode);
+    }
+
+    MDC_STACK_UNWIND(zerofill, frame, op_ret, op_errno, prebuf, postbuf, xdata);
+
+    return 0;
+}
+
+/*
+ * zerofill fop: keeps a reference on @fd in the frame local when one could
+ * be obtained, then winds to the first child either way. Always returns 0.
+ */
+int
+mdc_zerofill(call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset,
+             off_t len, dict_t *xdata)
+{
+    mdc_local_t *local = mdc_local_get(frame, fd->inode);
+
+    if (local)
+        local->fd = __fd_ref(fd);
+
+    STACK_WIND(frame, mdc_zerofill_cbk, FIRST_CHILD(this),
+               FIRST_CHILD(this)->fops->zerofill, fd, offset, len, xdata);
+
+    return 0;
+}
+
+/*
+ * readlink callback: when op_ret is non-zero, the errno is ENOENT or ESTALE
+ * and a local is attached, mdc_inode_iatt_invalidate() is called on
+ * local->loc.inode. The reply is forwarded unchanged in every case.
+ */
+int32_t
+mdc_readlink_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+                 int32_t op_ret, int32_t op_errno, const char *path,
+                 struct iatt *buf, dict_t *xdata)
+{
+    mdc_local_t *local = frame->local;
+
+    if (local && (op_ret != 0) &&
+        ((op_errno == ESTALE) || (op_errno == ENOENT)))
+        mdc_inode_iatt_invalidate(this, local->loc.inode);
+
+    MDC_STACK_UNWIND(readlink, frame, op_ret, op_errno, path, buf, xdata);
+    return 0;
+}
+
+/*
+ * readlink fop: fails with ENOMEM when no frame local can be obtained;
+ * otherwise copies @loc into the local and winds to the first child.
+ * Always returns 0.
+ */
+int32_t
+mdc_readlink(call_frame_t *frame, xlator_t *this, loc_t *loc, size_t size,
+             dict_t *xdata)
+{
+    mdc_local_t *local = mdc_local_get(frame, loc->inode);
+
+    if (local == NULL) {
+        MDC_STACK_UNWIND(readlink, frame, -1, ENOMEM, NULL, NULL, NULL);
+        return 0;
+    }
+
+    loc_copy(&local->loc, loc);
+
+    STACK_WIND(frame, mdc_readlink_cbk, FIRST_CHILD(this),
+               FIRST_CHILD(this)->fops->readlink, loc, size, xdata);
+    return 0;
+}
+
+/*
+ * fsyncdir callback: when op_ret is non-zero, the errno is ESTALE or ENOENT
+ * and a local is attached, mdc_inode_iatt_invalidate() is called on
+ * local->fd->inode. The reply is forwarded unchanged in every case.
+ */
+int32_t
+mdc_fsyncdir_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+                 int32_t op_ret, int32_t op_errno, dict_t *xdata)
+{
+    mdc_local_t *local = frame->local;
+
+    if (local && (op_ret != 0) &&
+        ((op_errno == ENOENT) || (op_errno == ESTALE)))
+        mdc_inode_iatt_invalidate(this, local->fd->inode);
+
+    MDC_STACK_UNWIND(fsyncdir, frame, op_ret, op_errno, xdata);
+    return 0;
+}
+
+/*
+ * fsyncdir fop: fails with ENOMEM when no frame local can be obtained;
+ * otherwise keeps a reference on @fd in the local and winds to the first
+ * child. Always returns 0.
+ */
+int32_t
+mdc_fsyncdir(call_frame_t *frame, xlator_t *this, fd_t *fd, int32_t flags,
+             dict_t *xdata)
+{
+    mdc_local_t *local = mdc_local_get(frame, fd->inode);
+
+    if (local == NULL) {
+        MDC_STACK_UNWIND(fsyncdir, frame, -1, ENOMEM, NULL);
+        return 0;
+    }
+
+    local->fd = __fd_ref(fd);
+
+    STACK_WIND(frame, mdc_fsyncdir_cbk, FIRST_CHILD(this),
+               FIRST_CHILD(this)->fops->fsyncdir, fd, flags, xdata);
+    return 0;
+}
+
+/*
+ * access callback: when op_ret is non-zero, the errno is ESTALE or ENOENT
+ * and a local is attached, mdc_inode_iatt_invalidate() is called on
+ * local->loc.inode. The reply is forwarded unchanged in every case.
+ */
+int32_t
+mdc_access_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+               int32_t op_ret, int32_t op_errno, dict_t *xdata)
+{
+    mdc_local_t *local = frame->local;
+
+    if (local && (op_ret != 0) &&
+        ((op_errno == ENOENT) || (op_errno == ESTALE)))
+        mdc_inode_iatt_invalidate(this, local->loc.inode);
+
+    MDC_STACK_UNWIND(access, frame, op_ret, op_errno, xdata);
+    return 0;
+}
+
+/*
+ * access fop: fails with ENOMEM when no frame local can be obtained;
+ * otherwise copies @loc into the local and winds to the first child.
+ * Always returns 0.
+ */
+int32_t
+mdc_access(call_frame_t *frame, xlator_t *this, loc_t *loc, int32_t mask,
+           dict_t *xdata)
+{
+    mdc_local_t *local = mdc_local_get(frame, loc->inode);
+
+    if (local == NULL) {
+        MDC_STACK_UNWIND(access, frame, -1, ENOMEM, NULL);
+        return 0;
+    }
+
+    loc_copy(&local->loc, loc);
+
+    STACK_WIND(frame, mdc_access_cbk, FIRST_CHILD(this),
+               FIRST_CHILD(this)->fops->access, loc, mask, xdata);
+    return 0;
+}
+
+/*
+ * Statedump hook (registered as mdc_dumpops.priv): writes the md-cache hit,
+ * miss, lookup and invalidation counters under a "<type>.<name>" section.
+ *
+ * Returns 0. When the translator has no private configuration attached
+ * nothing is written, matching the guard in mdc_dump_metrics(); previously
+ * this path dereferenced a NULL conf.
+ */
+int
+mdc_priv_dump(xlator_t *this)
+{
+    struct mdc_conf *conf = NULL;
+    char key_prefix[GF_DUMP_MAX_BUF_LEN];
+
+    conf = this->private;
+    if (!conf)
+        return 0;
+
+    snprintf(key_prefix, GF_DUMP_MAX_BUF_LEN, "%s.%s", this->type, this->name);
+    gf_proc_dump_add_section("%s", key_prefix);
+
+    gf_proc_dump_write("stat_hit_count", "%" PRId64,
+                       GF_ATOMIC_GET(conf->mdc_counter.stat_hit));
+    gf_proc_dump_write("stat_miss_count", "%" PRId64,
+                       GF_ATOMIC_GET(conf->mdc_counter.stat_miss));
+    gf_proc_dump_write("xattr_hit_count", "%" PRId64,
+                       GF_ATOMIC_GET(conf->mdc_counter.xattr_hit));
+    gf_proc_dump_write("xattr_miss_count", "%" PRId64,
+                       GF_ATOMIC_GET(conf->mdc_counter.xattr_miss));
+    gf_proc_dump_write("nameless_lookup_count", "%" PRId64,
+                       GF_ATOMIC_GET(conf->mdc_counter.nameless_lookup));
+    gf_proc_dump_write("negative_lookup_count", "%" PRId64,
+                       GF_ATOMIC_GET(conf->mdc_counter.negative_lookup));
+    gf_proc_dump_write("stat_invalidations_received", "%" PRId64,
+                       GF_ATOMIC_GET(conf->mdc_counter.stat_invals));
+    gf_proc_dump_write("xattr_invalidations_received", "%" PRId64,
+                       GF_ATOMIC_GET(conf->mdc_counter.xattr_invals));
+
+    return 0;
+}
+
+/*
+ * Metrics hook: prints one "<xlator name>.<metric> <value>" line per
+ * md-cache counter to @fd. Nothing is printed when the translator has no
+ * private configuration. Always returns 0.
+ */
+static int32_t
+mdc_dump_metrics(xlator_t *this, int fd)
+{
+    struct mdc_conf *conf = this->private;
+
+    if (conf == NULL)
+        return 0;
+
+    /* stat / xattr cache effectiveness */
+    dprintf(fd, "%s.stat_cache_hit_count %" PRId64 "\n", this->name,
+            GF_ATOMIC_GET(conf->mdc_counter.stat_hit));
+    dprintf(fd, "%s.stat_cache_miss_count %" PRId64 "\n", this->name,
+            GF_ATOMIC_GET(conf->mdc_counter.stat_miss));
+    dprintf(fd, "%s.xattr_cache_hit_count %" PRId64 "\n", this->name,
+            GF_ATOMIC_GET(conf->mdc_counter.xattr_hit));
+    dprintf(fd, "%s.xattr_cache_miss_count %" PRId64 "\n", this->name,
+            GF_ATOMIC_GET(conf->mdc_counter.xattr_miss));
+
+    /* lookup statistics */
+    dprintf(fd, "%s.nameless_lookup_count %" PRId64 "\n", this->name,
+            GF_ATOMIC_GET(conf->mdc_counter.nameless_lookup));
+    dprintf(fd, "%s.negative_lookup_count %" PRId64 "\n", this->name,
+            GF_ATOMIC_GET(conf->mdc_counter.negative_lookup));
+
+    /* invalidations received */
+    dprintf(fd, "%s.stat_cache_invalidations_received %" PRId64 "\n",
+            this->name, GF_ATOMIC_GET(conf->mdc_counter.stat_invals));
+    dprintf(fd, "%s.xattr_cache_invalidations_received %" PRId64 "\n",
+            this->name, GF_ATOMIC_GET(conf->mdc_counter.xattr_invals));
+
+    return 0;
+}
+
+/*
+ * forget callback (registered in mdc_cbks): hands the inode to
+ * mdc_inode_wipe() so md-cache's per-inode state can be released.
+ * Always returns 0.
+ */
+int
+mdc_forget(xlator_t *this, inode_t *inode)
+{
+    mdc_inode_wipe(this, inode);
+
+    return 0;
+}
+
+/*
+ * Returns 1 when one of the two strings is a prefix of the other (equal
+ * strings and empty strings included), 0 otherwise.
+ */
+int
+is_strpfx(const char *str1, const char *str2)
+{
+    const char *a = str1;
+    const char *b = str2;
+
+    /* Walk the common prefix; stop at the first mismatch or at the end of
+     * str1 (a mismatch also covers str2 ending first). */
+    while ((*a != '\0') && (*a == *b)) {
+        a++;
+        b++;
+    }
+
+    /* A prefix relation holds iff at least one string was exhausted. */
+    return (*a == '\0') || (*b == '\0');
+}
+
+/*
+ * Clears conf->mdc_xattr_str. The previous string is not freed here and the
+ * assignment is made without taking conf->lock. Always returns 0.
+ */
+static int
+mdc_key_unload_all(struct mdc_conf *conf)
+{
+    conf->mdc_xattr_str = NULL;
+
+    return 0;
+}
+
+/*
+ * Builds the comma separated list of xattr names selected by the cache_*
+ * flags in @conf, appends @tmp_str (the user supplied list) and publishes
+ * the result in conf->mdc_xattr_str.
+ *
+ * @tmp_str is passed to strlen()/strcat() and must therefore not be NULL.
+ *
+ * Returns 0 on success, or the value GF_CHECK_ALLOC() stores in ret when the
+ * allocation fails.
+ */
+int
+mdc_xattr_list_populate(struct mdc_conf *conf, char *tmp_str)
+{
+    char *mdc_xattr_str = NULL;
+    size_t max_size = 0;
+    int ret = 0;
+
+    /* Upper bound: every built-in name, the user list, and a few spare
+     * bytes for the trailing separator and the terminating NUL. */
+    max_size = SLEN(
+                   "security.capability,security.selinux,security."
+                   "ima," POSIX_ACL_ACCESS_XATTR "," POSIX_ACL_DEFAULT_XATTR
+                   "," GF_POSIX_ACL_ACCESS "," GF_POSIX_ACL_DEFAULT
+                   ","
+                   "user.swift.metadata,user.DOSATTRIB,user.DosStream.*"
+                   ",user.org.netatalk.Metadata,security.NTACL,"
+                   "user.org.netatalk.ResourceFork") +
+               strlen(tmp_str) + 5; /*Some buffer bytes*/
+
+    mdc_xattr_str = GF_MALLOC(max_size, gf_common_mt_char);
+    GF_CHECK_ALLOC(mdc_xattr_str, ret, out);
+    /* Start from an empty string so the strcat() calls below can append. */
+    mdc_xattr_str[0] = '\0';
+
+    if (conf->cache_capability)
+        strcat(mdc_xattr_str, "security.capability,");
+
+    if (conf->cache_selinux)
+        strcat(mdc_xattr_str, "security.selinux,");
+
+    if (conf->cache_ima)
+        strcat(mdc_xattr_str, "security.ima,");
+
+    if (conf->cache_posix_acl)
+        strcat(mdc_xattr_str,
+               POSIX_ACL_ACCESS_XATTR "," POSIX_ACL_DEFAULT_XATTR ",");
+
+    if (conf->cache_glusterfs_acl)
+        strcat(mdc_xattr_str, GF_POSIX_ACL_ACCESS "," GF_POSIX_ACL_DEFAULT ",");
+
+    if (conf->cache_swift_metadata)
+        strcat(mdc_xattr_str, "user.swift.metadata,");
+
+    if (conf->cache_samba_metadata)
+        strcat(mdc_xattr_str,
+               "user.DOSATTRIB,user.DosStream.*,"
+               "user.org.netatalk.Metadata,user.org.netatalk."
+               "ResourceFork,security.NTACL,");
+
+    /* The user supplied list always goes last. */
+    strcat(mdc_xattr_str, tmp_str);
+
+    LOCK(&conf->lock);
+    {
+        /* This is not freed, else is_mdc_key_satisfied, which is
+         * called by every fop has to take lock, and will lead to
+         * lock contention
+         */
+        conf->mdc_xattr_str = mdc_xattr_str;
+    }
+    UNLOCK(&conf->lock);
+
+out:
+    return ret;
+}
+
+/* Context handed to mdc_inval_xatt() through dict_foreach(). */
+struct set {
+    inode_t *inode; /* inode whose cached xattr keys are being unset */
+    xlator_t *this; /* translator instance passed to mdc_inode_xatt_unset() */
+};
+
+/*
+ * dict_foreach() visitor: unsets key @k from the xattr cache of the inode
+ * described by @tmp (a struct set) and returns what mdc_inode_xatt_unset()
+ * returned. @d and @v are not used.
+ */
+static int
+mdc_inval_xatt(dict_t *d, char *k, data_t *v, void *tmp)
+{
+    struct set *ctx = (struct set *)tmp;
+
+    return mdc_inode_xatt_unset(ctx->this, ctx->inode, k);
+}
+
+/*
+ * Applies an upcall cache-invalidation notification to the md-cache state
+ * of the inode identified by the upcall's gfid.
+ *
+ * Events other than GF_UPCALL_CACHE_INVALIDATION are ignored (returns 0).
+ * Returns -1 when the inode is not found in the top xlator's inode table;
+ * otherwise returns the result of the last cache update attempted.
+ */
+static int
+mdc_invalidate(xlator_t *this, void *data)
+{
+    struct gf_upcall *up_data = NULL;
+    struct gf_upcall_cache_invalidation *up_ci = NULL;
+    inode_t *inode = NULL;
+    int ret = 0;
+    struct set tmp = {
+        0,
+    };
+    inode_table_t *itable = NULL;
+    struct mdc_conf *conf = this->private;
+    uint64_t gen = 0;
+
+    up_data = (struct gf_upcall *)data;
+
+    if (up_data->event_type != GF_UPCALL_CACHE_INVALIDATION)
+        goto out;
+
+    up_ci = (struct gf_upcall_cache_invalidation *)up_data->data;
+
+    /* Look the gfid up in the inode table of the graph's top xlator; the
+     * inode found here is unref'd at out. */
+    itable = ((xlator_t *)this->graph->top)->itable;
+    inode = inode_find(itable, up_data->gfid);
+    if (!inode) {
+        ret = -1;
+        goto out;
+    }
+
+    /* Parent directory stats carried by the upcall (and the old parent's
+     * for renames) are passed to mdc_update_gfid_stat(). */
+    if (up_ci->flags & UP_PARENT_DENTRY_FLAGS) {
+        mdc_update_gfid_stat(this, &up_ci->p_stat);
+        if (up_ci->flags & UP_RENAME_FLAGS)
+            mdc_update_gfid_stat(this, &up_ci->oldp_stat);
+    }
+
+    /* Explicit lookup requested: only mark the inode and stop. */
+    if (up_ci->flags & UP_EXPLICIT_LOOKUP) {
+        mdc_inode_set_need_lookup(this, inode, _gf_true);
+        goto out;
+    }
+
+    /* These flags invalidate both the stat and the xattr cache. */
+    if (up_ci->flags &
+        (UP_NLINK | UP_RENAME_FLAGS | UP_FORGET | UP_INVAL_ATTR)) {
+        mdc_inode_iatt_invalidate(this, inode);
+        mdc_inode_xatt_invalidate(this, inode);
+        GF_ATOMIC_INC(conf->mdc_counter.stat_invals);
+        goto out;
+    }
+
+    if (up_ci->flags & IATT_UPDATE_FLAGS) {
+        gen = mdc_inc_generation(this, inode);
+        ret = mdc_inode_iatt_set_validate(this, inode, NULL, &up_ci->stat,
+                                          _gf_false, gen);
+        /* one of the scenarios where ret < 0 is when this invalidate
+         * is older than the current stat, in that case do not
+         * update the xattrs as well
+         */
+        if (ret < 0)
+            goto out;
+        GF_ATOMIC_INC(conf->mdc_counter.stat_invals);
+    }
+
+    if (up_ci->flags & UP_XATTR) {
+        /* With a dict the cached xattrs are updated from it; without one
+         * the whole xattr cache of the inode is invalidated. */
+        if (up_ci->dict)
+            ret = mdc_inode_xatt_update(this, inode, up_ci->dict);
+        else
+            ret = mdc_inode_xatt_invalidate(this, inode);
+
+        GF_ATOMIC_INC(conf->mdc_counter.xattr_invals);
+    } else if (up_ci->flags & UP_XATTR_RM) {
+        /* Unset every key listed in the upcall dict, one by one. */
+        tmp.inode = inode;
+        tmp.this = this;
+        ret = dict_foreach(up_ci->dict, mdc_inval_xatt, &tmp);
+
+        GF_ATOMIC_INC(conf->mdc_counter.xattr_invals);
+    }
+
+out:
+    if (inode)
+        inode_unref(inode);
+
+    return ret;
+}
+
+/* Argument bundle shared by mdc_send_xattrs() and mdc_send_xattrs_cbk(). */
+struct mdc_ipc {
+    xlator_t *this; /* md-cache translator instance issuing the IPC */
+    dict_t *xattr;  /* dict sent with the IPC; unref'd by the callback */
+};
+
+/*
+ * Completion callback of the synctask started by mdc_register_xattr_inval().
+ * A negative @ret clears the xattr key list of THIS->private and logs that
+ * xattr caching is disabled. In every case the frame, the xattr dict and the
+ * mdc_ipc bundle are released. Always returns 0.
+ */
+static int
+mdc_send_xattrs_cbk(int ret, call_frame_t *frame, void *data)
+{
+    struct mdc_ipc *ipc = data;
+
+    if (ret < 0) {
+        mdc_key_unload_all(THIS->private);
+        gf_msg("md-cache", GF_LOG_INFO, 0, MD_CACHE_MSG_NO_XATTR_CACHE,
+               "Disabled cache for all xattrs, as registering for "
+               "xattr cache invalidation failed");
+    }
+
+    /* Release everything that was handed over to the synctask. */
+    STACK_DESTROY(frame->root);
+    dict_unref(ipc->xattr);
+    GF_FREE(ipc);
+
+    return 0;
+}
+
+/*
+ * Synctask body: sends the xattr dict held in @data (a struct mdc_ipc) to
+ * the first child through syncop_ipc() with target GF_IPC_TARGET_UPCALL.
+ * Returns the result after DECODE_SYNCOP_ERR(); a warning is logged when
+ * that result is negative.
+ */
+static int
+mdc_send_xattrs(void *data)
+{
+    struct mdc_ipc *ipc = data;
+    int rc = 0;
+
+    rc = syncop_ipc(FIRST_CHILD(ipc->this), GF_IPC_TARGET_UPCALL, ipc->xattr,
+                    NULL);
+    DECODE_SYNCOP_ERR(rc);
+    if (rc >= 0)
+        return rc;
+
+    gf_msg(ipc->this->name, GF_LOG_WARNING, errno,
+           MD_CACHE_MSG_IPC_UPCALL_FAILED,
+           "Registering the list "
+           "of xattrs that needs invalidaton, with upcall, failed");
+
+    return rc;
+}
+
+/*
+ * Registers the xattrs md-cache wants invalidation for by starting a
+ * synctask that runs mdc_send_xattrs(), with mdc_send_xattrs_cbk() as its
+ * completion callback.
+ *
+ * Does nothing and returns 0 when conf->mdc_invalidation is off. On any
+ * failure in this function the xattr key list is cleared through
+ * mdc_key_unload_all(), the resources created so far are released and a
+ * negative value is returned.
+ */
+static int
+mdc_register_xattr_inval(xlator_t *this)
+{
+    dict_t *xattr = NULL;
+    int ret = 0;
+    struct mdc_conf *conf = NULL;
+    call_frame_t *frame = NULL;
+    struct mdc_ipc *data = NULL;
+
+    conf = this->private;
+
+    /* Read the invalidation switch under conf->lock; the early exit
+     * releases the lock itself before jumping out. */
+    LOCK(&conf->lock);
+    {
+        if (!conf->mdc_invalidation) {
+            UNLOCK(&conf->lock);
+            goto out;
+        }
+    }
+    UNLOCK(&conf->lock);
+
+    xattr = dict_new();
+    if (!xattr) {
+        gf_msg(this->name, GF_LOG_WARNING, ENOMEM, MD_CACHE_MSG_NO_MEMORY,
+               "dict_new failed");
+        ret = -1;
+        goto out;
+    }
+
+    /* Fill the dict through mdc_load_reqs(); a zero return is a failure. */
+    if (!mdc_load_reqs(this, xattr)) {
+        gf_msg(this->name, GF_LOG_WARNING, ENOMEM, MD_CACHE_MSG_NO_MEMORY,
+               "failed to populate cache entries");
+        ret = -1;
+        goto out;
+    }
+
+    frame = create_frame(this, this->ctx->pool);
+    if (!frame) {
+        gf_msg(this->name, GF_LOG_ERROR, ENOMEM, MD_CACHE_MSG_NO_MEMORY,
+               "failed to create the frame");
+        ret = -1;
+        goto out;
+    }
+
+    data = GF_CALLOC(1, sizeof(struct mdc_ipc), gf_mdc_mt_mdc_ipc);
+    if (!data) {
+        gf_msg(this->name, GF_LOG_ERROR, ENOMEM, MD_CACHE_MSG_NO_MEMORY,
+               "failed to allocate memory");
+        ret = -1;
+        goto out;
+    }
+
+    /* On success frame, xattr and data are released by
+     * mdc_send_xattrs_cbk(), not here. */
+    data->this = this;
+    data->xattr = xattr;
+    ret = synctask_new(this->ctx->env, mdc_send_xattrs, mdc_send_xattrs_cbk,
+                       frame, data);
+    if (ret < 0) {
+        gf_msg(this->name, GF_LOG_WARNING, errno,
+               MD_CACHE_MSG_IPC_UPCALL_FAILED,
+               "Registering the list "
+               "of xattrs that needs invalidaton, with upcall, failed");
+    }
+
+out:
+    if (ret < 0) {
+        /* Failure path: disable xattr caching and undo what was set up. */
+        mdc_key_unload_all(conf);
+        if (xattr)
+            dict_unref(xattr);
+        if (frame)
+            STACK_DESTROY(frame->root);
+        GF_FREE(data);
+        gf_msg(this->name, GF_LOG_INFO, 0, MD_CACHE_MSG_NO_XATTR_CACHE,
+               "Disabled cache for all xattrs, as registering for "
+               "xattr cache invalidation failed");
+    }
+
+    return ret;
+}
+
+/*
+ * Reconfigure hook: re-reads every md-cache option from @options, rebuilds
+ * the cached-xattr list, applies the timeout and re-registers the xattr
+ * list for invalidation.
+ *
+ * Each GF_OPTION_RECONF() is given the label "out"; presumably it jumps
+ * there when the option cannot be parsed - TODO confirm against the macro.
+ * Returns ret as left by the last step executed.
+ */
+int
+mdc_reconfigure(xlator_t *this, dict_t *options)
+{
+    struct mdc_conf *conf = NULL;
+    int timeout = 0, ret = 0;
+    char *tmp_str = NULL;
+
+    conf = this->private;
+
+    GF_OPTION_RECONF("md-cache-timeout", timeout, options, int32, out);
+
+    GF_OPTION_RECONF("cache-selinux", conf->cache_selinux, options, bool, out);
+
+    GF_OPTION_RECONF("cache-capability-xattrs", conf->cache_capability, options,
+                     bool, out);
+
+    GF_OPTION_RECONF("cache-ima-xattrs", conf->cache_ima, options, bool, out);
+
+    GF_OPTION_RECONF("cache-posix-acl", conf->cache_posix_acl, options, bool,
+                     out);
+
+    GF_OPTION_RECONF("cache-glusterfs-acl", conf->cache_glusterfs_acl, options,
+                     bool, out);
+
+    GF_OPTION_RECONF("cache-swift-metadata", conf->cache_swift_metadata,
+                     options, bool, out);
+
+    GF_OPTION_RECONF("cache-samba-metadata", conf->cache_samba_metadata,
+                     options, bool, out);
+
+    GF_OPTION_RECONF("force-readdirp", conf->force_readdirp, options, bool,
+                     out);
+
+    GF_OPTION_RECONF("cache-invalidation", conf->mdc_invalidation, options,
+                     bool, out);
+
+    GF_OPTION_RECONF("global-cache-invalidation", conf->global_invalidation,
+                     options, bool, out);
+
+    GF_OPTION_RECONF("pass-through", this->pass_through, options, bool, out);
+
+    GF_OPTION_RECONF("md-cache-statfs", conf->cache_statfs, options, bool, out);
+
+    GF_OPTION_RECONF("xattr-cache-list", tmp_str, options, str, out);
+
+    /* Rebuild the xattr name list from the freshly read flags. */
+    ret = mdc_xattr_list_populate(conf, tmp_str);
+    if (ret < 0)
+        goto out;
+
+    /* If timeout is greater than 60s (default before the patch that added
+     * cache invalidation support was added) then, cache invalidation
+     * feature for md-cache needs to be enabled, if not set timeout to the
+     * previous max which is 60s
+     */
+    if ((timeout > 60) && (!conf->mdc_invalidation)) {
+        /* Clamped; registration is skipped on this path and ret stays 0. */
+        conf->timeout = 60;
+        goto out;
+    }
+    conf->timeout = timeout;
+
+    ret = mdc_register_xattr_inval(this);
+out:
+    return ret;
+}
+
+/*
+ * Memory accounting hook: initialises accounting for this translator with
+ * gf_mdc_mt_end + 1 memory types and returns the result of
+ * xlator_mem_acct_init().
+ */
+int32_t
+mdc_mem_acct_init(xlator_t *this)
+{
+    return xlator_mem_acct_init(this, gf_mdc_mt_end + 1);
+}
+
+/*
+ * Translator init: allocates the private configuration, reads every option,
+ * builds the cached-xattr list, resets the counters and applies the timeout.
+ *
+ * Returns -1 only when the configuration cannot be allocated; every other
+ * path ends at "out", attaches conf to this->private and returns 0.
+ *
+ * NOTE(review): each GF_OPTION_INIT() is given the label "out"; if it jumps
+ * there on a parse failure, init still reports success with the remaining
+ * fields left at their zeroed defaults - confirm this is intended.
+ */
+int
+mdc_init(xlator_t *this)
+{
+    struct mdc_conf *conf = NULL;
+    uint32_t timeout = 0;
+    char *tmp_str = NULL;
+
+    conf = GF_CALLOC(sizeof(*conf), 1, gf_mdc_mt_mdc_conf_t);
+    if (!conf) {
+        gf_msg(this->name, GF_LOG_ERROR, ENOMEM, MD_CACHE_MSG_NO_MEMORY,
+               "out of memory");
+        return -1;
+    }
+
+    LOCK_INIT(&conf->lock);
+
+    GF_OPTION_INIT("md-cache-timeout", timeout, uint32, out);
+
+    GF_OPTION_INIT("cache-selinux", conf->cache_selinux, bool, out);
+
+    GF_OPTION_INIT("cache-capability-xattrs", conf->cache_capability, bool,
+                   out);
+
+    GF_OPTION_INIT("cache-ima-xattrs", conf->cache_ima, bool, out);
+
+    GF_OPTION_INIT("cache-posix-acl", conf->cache_posix_acl, bool, out);
+
+    GF_OPTION_INIT("cache-glusterfs-acl", conf->cache_glusterfs_acl, bool, out);
+
+    GF_OPTION_INIT("cache-swift-metadata", conf->cache_swift_metadata, bool,
+                   out);
+
+    GF_OPTION_INIT("cache-samba-metadata", conf->cache_samba_metadata, bool,
+                   out);
+
+    GF_OPTION_INIT("force-readdirp", conf->force_readdirp, bool, out);
+
+    GF_OPTION_INIT("cache-invalidation", conf->mdc_invalidation, bool, out);
+
+    GF_OPTION_INIT("global-cache-invalidation", conf->global_invalidation, bool,
+                   out);
+
+    GF_OPTION_INIT("pass-through", this->pass_through, bool, out);
+
+    pthread_mutex_init(&conf->statfs_cache.lock, NULL);
+    GF_OPTION_INIT("md-cache-statfs", conf->cache_statfs, bool, out);
+
+    GF_OPTION_INIT("xattr-cache-list", tmp_str, str, out);
+    /* NOTE(review): the return value is ignored here, whereas
+     * mdc_reconfigure() treats a negative return as a failure. */
+    mdc_xattr_list_populate(conf, tmp_str);
+
+    conf->last_child_down = gf_time();
+    /* (time_t)-1 marks the statfs cache as never refreshed - presumably
+     * checked by the statfs path; verify against its reader. */
+    conf->statfs_cache.last_refreshed = (time_t)-1;
+
+    /* initialize gf_atomic_t counters */
+    GF_ATOMIC_INIT(conf->mdc_counter.stat_hit, 0);
+    GF_ATOMIC_INIT(conf->mdc_counter.stat_miss, 0);
+    GF_ATOMIC_INIT(conf->mdc_counter.xattr_hit, 0);
+    GF_ATOMIC_INIT(conf->mdc_counter.xattr_miss, 0);
+    GF_ATOMIC_INIT(conf->mdc_counter.negative_lookup, 0);
+    GF_ATOMIC_INIT(conf->mdc_counter.nameless_lookup, 0);
+    GF_ATOMIC_INIT(conf->mdc_counter.stat_invals, 0);
+    GF_ATOMIC_INIT(conf->mdc_counter.xattr_invals, 0);
+    GF_ATOMIC_INIT(conf->mdc_counter.need_lookup, 0);
+    GF_ATOMIC_INIT(conf->generation, 0);
+
+    /* If timeout is greater than 60s (default before the patch that added
+     * cache invalidation support was added) then, cache invalidation
+     * feature for md-cache needs to be enabled, if not set timeout to the
+     * previous max which is 60s
+     */
+    if ((timeout > 60) && (!conf->mdc_invalidation)) {
+        conf->timeout = 60;
+        goto out;
+    }
+    conf->timeout = timeout;
+
+out:
+    /* conf is attached on every path that reaches this label. */
+    this->private = conf;
+
+    return 0;
+}
+
+/*
+ * Stores @now in conf->last_child_down while holding conf->lock.
+ */
+void
+mdc_update_child_down_time(xlator_t *this, time_t now)
+{
+    struct mdc_conf *conf = this->private;
+
+    LOCK(&conf->lock);
+    conf->last_child_down = now;
+    UNLOCK(&conf->lock);
+}
+
+/*
+ * Notification hook.
+ *
+ *  - child / descendant down: the current time is stored as the last
+ *    child-down time;
+ *  - upcall: forwarded to mdc_invalidate() when conf->mdc_invalidation is on;
+ *  - child / descendant up: the xattr list is (re)registered for
+ *    invalidation.
+ *
+ * Every event is then passed to default_notify(). Returns -1 when
+ * default_notify() fails, otherwise the result of the event specific step
+ * (0 when there was none).
+ */
+int
+mdc_notify(xlator_t *this, int event, void *data, ...)
+{
+    struct mdc_conf *conf = this->private;
+    int ret = 0;
+
+    if ((event == GF_EVENT_CHILD_DOWN) ||
+        (event == GF_EVENT_SOME_DESCENDENT_DOWN)) {
+        mdc_update_child_down_time(this, gf_time());
+    } else if (event == GF_EVENT_UPCALL) {
+        if (conf->mdc_invalidation)
+            ret = mdc_invalidate(this, data);
+    } else if ((event == GF_EVENT_CHILD_UP) ||
+               (event == GF_EVENT_SOME_DESCENDENT_UP)) {
+        ret = mdc_register_xattr_inval(this);
+    }
+
+    if (default_notify(this, event, data) != 0)
+        ret = -1;
+
+    return ret;
+}
+
+/*
+ * Translator teardown: releases the private configuration created by
+ * mdc_init().
+ *
+ * Besides the conf structure itself this frees the xattr name list currently
+ * installed in conf->mdc_xattr_str (allocated by mdc_xattr_list_populate())
+ * and destroys conf->lock, both of which were previously leaked.
+ * this->private is cleared so that later callers, e.g. the NULL check in
+ * mdc_dump_metrics(), never see a dangling pointer.
+ */
+void
+mdc_fini(xlator_t *this)
+{
+    struct mdc_conf *conf = this->private;
+
+    if (!conf)
+        return;
+
+    /* Detach first so nothing picks up the pointer while it is torn down. */
+    this->private = NULL;
+
+    /* Only the list installed last is reachable; GF_FREE(NULL) is a no-op,
+     * which covers the case where mdc_key_unload_all() cleared it. */
+    GF_FREE(conf->mdc_xattr_str);
+
+    /* conf->lock is initialised unconditionally right after allocation in
+     * mdc_init(), so it is always safe to destroy here.
+     * NOTE(review): conf->statfs_cache.lock is initialised only part-way
+     * through mdc_init() and is therefore left alone. */
+    LOCK_DESTROY(&conf->lock);
+
+    GF_FREE(conf);
+}
+
+/* File operation table: the fops md-cache intercepts, grouped by kind. */
+struct xlator_fops mdc_fops = {
+    /* namespace operations */
+    .lookup = mdc_lookup,
+    .mknod = mdc_mknod,
+    .mkdir = mdc_mkdir,
+    .unlink = mdc_unlink,
+    .rmdir = mdc_rmdir,
+    .symlink = mdc_symlink,
+    .rename = mdc_rename,
+    .link = mdc_link,
+    .create = mdc_create,
+    .readlink = mdc_readlink,
+
+    /* attributes */
+    .stat = mdc_stat,
+    .fstat = mdc_fstat,
+    .setattr = mdc_setattr,
+    .fsetattr = mdc_fsetattr,
+    .access = mdc_access,
+    .statfs = mdc_statfs,
+
+    /* file data */
+    .open = mdc_open,
+    .readv = mdc_readv,
+    .writev = mdc_writev,
+    .truncate = mdc_truncate,
+    .ftruncate = mdc_ftruncate,
+    .fsync = mdc_fsync,
+    .fallocate = mdc_fallocate,
+    .discard = mdc_discard,
+    .zerofill = mdc_zerofill,
+
+    /* extended attributes */
+    .setxattr = mdc_setxattr,
+    .fsetxattr = mdc_fsetxattr,
+    .getxattr = mdc_getxattr,
+    .fgetxattr = mdc_fgetxattr,
+    .removexattr = mdc_removexattr,
+    .fremovexattr = mdc_fremovexattr,
+
+    /* directories */
+    .opendir = mdc_opendir,
+    .readdir = mdc_readdir,
+    .readdirp = mdc_readdirp,
+    .fsyncdir = mdc_fsyncdir,
+};
+
+/* Callback table: only the forget callback is provided by md-cache. */
+struct xlator_cbks mdc_cbks = {
+    .forget = mdc_forget,
+};
+
+/* Statedump table: the private-state dump is served by mdc_priv_dump(). */
+struct xlator_dumpops mdc_dumpops = {
+    .priv = mdc_priv_dump,
+};
+
+/* Option table of md-cache. Each entry declares the option key, its type,
+ * default value, op_version, flags and a description (plus min/max for the
+ * integer option). The table is terminated by an entry whose key is NULL. */
+struct volume_options mdc_options[] = {
+    {
+        .key = {"md-cache"},
+        .type = GF_OPTION_TYPE_BOOL,
+        .default_value = "off",
+        .description = "enable/disable md-cache",
+        .op_version = {GD_OP_VERSION_6_0},
+        .flags = OPT_FLAG_SETTABLE,
+    },
+    {
+        .key = {"cache-selinux"},
+        .type = GF_OPTION_TYPE_BOOL,
+        .default_value = "false",
+        .op_version = {2},
+        .flags = OPT_FLAG_SETTABLE | OPT_FLAG_CLIENT_OPT | OPT_FLAG_DOC,
+        .description = "Cache selinux xattr(security.selinux) on client side",
+    },
+    {
+        .key = {"cache-capability-xattrs"},
+        .type = GF_OPTION_TYPE_BOOL,
+        .default_value = "true",
+        .op_version = {GD_OP_VERSION_3_10_0},
+        .flags = OPT_FLAG_SETTABLE | OPT_FLAG_CLIENT_OPT | OPT_FLAG_DOC,
+        .description = "Cache capability xattr(security.capability) on "
+                       "client side",
+    },
+    {
+        .key = {"cache-ima-xattrs"},
+        .type = GF_OPTION_TYPE_BOOL,
+        .default_value = "true",
+        .op_version = {GD_OP_VERSION_3_10_0},
+        .flags = OPT_FLAG_SETTABLE | OPT_FLAG_CLIENT_OPT | OPT_FLAG_DOC,
+        .description = "Cache Linux integrity subsystem xattr(security.ima) "
+                       "on client side",
+    },
+    {
+        .key = {"cache-swift-metadata"},
+        .type = GF_OPTION_TYPE_BOOL,
+        .default_value = "false",
+        .op_version = {GD_OP_VERSION_3_7_10},
+        .flags = OPT_FLAG_SETTABLE | OPT_FLAG_CLIENT_OPT | OPT_FLAG_DOC,
+        .description = "Cache swift metadata (user.swift.metadata xattr)",
+    },
+    {
+        .key = {"cache-samba-metadata"},
+        .type = GF_OPTION_TYPE_BOOL,
+        .default_value = "false",
+        .op_version = {GD_OP_VERSION_3_9_0},
+        .flags = OPT_FLAG_SETTABLE | OPT_FLAG_CLIENT_OPT | OPT_FLAG_DOC,
+        .description = "Cache samba metadata (user.DOSATTRIB, security.NTACL,"
+                       " org.netatalk.Metadata, org.netatalk.ResourceFork, "
+                       "and user.DosStream. xattrs)",
+    },
+    {
+        .key = {"cache-posix-acl"},
+        .type = GF_OPTION_TYPE_BOOL,
+        .default_value = "false",
+        .op_version = {2},
+        .flags = OPT_FLAG_SETTABLE | OPT_FLAG_CLIENT_OPT | OPT_FLAG_DOC,
+        .description = "Cache posix ACL xattrs (system.posix_acl_access, "
+                       "system.posix_acl_default) on client side",
+    },
+    {
+        .key = {"cache-glusterfs-acl"},
+        .type = GF_OPTION_TYPE_BOOL,
+        .default_value = "false",
+        .op_version = {GD_OP_VERSION_6_0},
+        .flags = OPT_FLAG_SETTABLE | OPT_FLAG_CLIENT_OPT | OPT_FLAG_DOC,
+        .description = "Cache virtual glusterfs ACL xattrs "
+                       "(glusterfs.posix.acl, glusterfs.posix.default_acl) "
+                       "on client side",
+    },
+    {
+        .key = {"md-cache-timeout"},
+        .type = GF_OPTION_TYPE_INT,
+        .min = 0,
+        .max = 600,
+        .default_value = SITE_H_MD_CACHE_TIMEOUT,
+        .op_version = {2},
+        .flags = OPT_FLAG_SETTABLE | OPT_FLAG_CLIENT_OPT | OPT_FLAG_DOC,
+        .description = "Time period after which cache has to be refreshed",
+    },
+    {
+        .key = {"force-readdirp"},
+        .type = GF_OPTION_TYPE_BOOL,
+        .default_value = "true",
+        .op_version = {2},
+        .flags = OPT_FLAG_SETTABLE | OPT_FLAG_CLIENT_OPT | OPT_FLAG_DOC,
+        .description = "Convert all readdir requests to readdirplus to "
+                       "collect stat info on each entry.",
+    },
+    {
+        .key = {"cache-invalidation"},
+        .type = GF_OPTION_TYPE_BOOL,
+        .default_value = "false",
+        .op_version = {GD_OP_VERSION_3_9_0},
+        .flags = OPT_FLAG_SETTABLE | OPT_FLAG_CLIENT_OPT | OPT_FLAG_DOC,
+        .description = "When \"on\", invalidates/updates the metadata cache,"
+                       " on receiving the cache-invalidation notifications",
+    },
+    {
+        .key = {"global-cache-invalidation"},
+        .type = GF_OPTION_TYPE_BOOL,
+        .default_value = "true",
+        .op_version = {GD_OP_VERSION_6_0},
+        .flags = OPT_FLAG_SETTABLE | OPT_FLAG_CLIENT_OPT | OPT_FLAG_DOC,
+        .description =
+            "When \"on\", purges all read caches in kernel and glusterfs stack "
+            "whenever a stat change is detected. Stat changes can be detected "
+            "while processing responses to file operations (fop) or through "
+            "upcall notifications. Since purging caches can be an expensive "
+            "operation, it's advised to have this option \"on\" only when a "
+            "file "
+            "can be accessed from multiple different Glusterfs mounts and "
+            "caches across these different mounts are required to be coherent. "
+            "If a file is not accessed across different mounts "
+            "(simple example is having only one mount for a volume), its "
+            "advised to keep "
+            "this option \"off\" as all file modifications go through caches "
+            "keeping them "
+            "coherent. This option overrides value of "
+            "performance.cache-invalidation.",
+    },
+    {
+        .key = {"md-cache-statfs"},
+        .type = GF_OPTION_TYPE_BOOL,
+        .default_value = "off",
+        .op_version = {GD_OP_VERSION_4_0_0},
+        .flags = OPT_FLAG_SETTABLE | OPT_FLAG_CLIENT_OPT | OPT_FLAG_DOC,
+        .description = "Cache statfs information of filesystem on the client",
+    },
+    {
+        .key = {"xattr-cache-list"},
+        .type = GF_OPTION_TYPE_STR,
+        .default_value = "",
+        .op_version = {GD_OP_VERSION_4_0_0},
+        .flags = OPT_FLAG_SETTABLE | OPT_FLAG_CLIENT_OPT | OPT_FLAG_DOC,
+        .description = "A comma separated list of xattrs that shall be "
+                       "cached by md-cache. The only wildcard allowed is '*'",
+    },
+    {.key = {"pass-through"},
+     .type = GF_OPTION_TYPE_BOOL,
+     .default_value = "false",
+     .op_version = {GD_OP_VERSION_4_1_0},
+     .flags = OPT_FLAG_SETTABLE | OPT_FLAG_DOC | OPT_FLAG_CLIENT_OPT,
+     .tags = {"md-cache"},
+     .description = "Enable/Disable md cache translator"},
+    /* Sentinel: a NULL key marks the end of the option table. */
+    {.key = {NULL}},
+};
+
+/* Translator descriptor of md-cache: wires the lifecycle entry points
+ * (init/fini/notify/reconfigure), memory-accounting and metrics hooks, and
+ * the fop, callback, dump and option tables defined in this file. */
+xlator_api_t xlator_api = {
+    .init = mdc_init,
+    .fini = mdc_fini,
+    .notify = mdc_notify,
+    .reconfigure = mdc_reconfigure,
+    .mem_acct_init = mdc_mem_acct_init,
+    .dump_metrics = mdc_dump_metrics,
+    .op_version = {1}, /* Present from the initial version */
+    .dumpops = &mdc_dumpops,
+    .fops = &mdc_fops,
+    .cbks = &mdc_cbks,
+    .options = mdc_options,
+    .identifier = "md-cache",
+    .category = GF_MAINTAINED,
+};
diff --git a/xlators/performance/nl-cache/Makefile.am b/xlators/performance/nl-cache/Makefile.am
new file mode 100644
index 00000000000..a985f42a877
--- /dev/null
+++ b/xlators/performance/nl-cache/Makefile.am
@@ -0,0 +1,3 @@
+SUBDIRS = src
+
+CLEANFILES =
diff --git a/xlators/performance/nl-cache/src/Makefile.am b/xlators/performance/nl-cache/src/Makefile.am
new file mode 100644
index 00000000000..c44ce871627
--- /dev/null
+++ b/xlators/performance/nl-cache/src/Makefile.am
@@ -0,0 +1,12 @@
+xlator_LTLIBRARIES = nl-cache.la
+xlatordir = $(libdir)/glusterfs/$(PACKAGE_VERSION)/xlator/performance
+nl_cache_la_LDFLAGS = -module $(GF_XLATOR_DEFAULT_LDFLAGS)
+nl_cache_la_SOURCES = nl-cache.c nl-cache-helper.c
+nl_cache_la_LIBADD = $(top_builddir)/libglusterfs/src/libglusterfs.la
+noinst_HEADERS = nl-cache.h nl-cache-mem-types.h nl-cache-messages.h
+AM_CPPFLAGS = $(GF_CPPFLAGS) -I$(top_srcdir)/libglusterfs/src \
+        -I$(top_srcdir)/rpc/xdr/src -I$(top_builddir)/rpc/xdr/src \
+        -I$(CONTRIBDIR)/timer-wheel
+
+AM_CFLAGS = -Wall -fno-strict-aliasing $(GF_CFLAGS)
+CLEANFILES =
diff --git a/xlators/performance/nl-cache/src/nl-cache-helper.c b/xlators/performance/nl-cache/src/nl-cache-helper.c
new file mode 100644
index 00000000000..29b99b5b8ea
--- /dev/null
+++ b/xlators/performance/nl-cache/src/nl-cache-helper.c
@@ -0,0 +1,1201 @@
+/*
+ *   Copyright (c) 2017 Red Hat, Inc. <http://www.redhat.com>
+ *   This file is part of GlusterFS.
+ *
+ *   This file is licensed to you under your choice of the GNU Lesser
+ *   General Public License, version 3 or any later version (LGPLv3 or
+ *   later), or the GNU General Public License, version 2 (GPLv2), in all
+ *   cases as published by the Free Software Foundation.
+ */
+
+#include "nl-cache.h"
+#include "timer-wheel.h"
+#include <glusterfs/statedump.h>
+
+/* Caching guidelines:
+ * This xlator serves negative lookups (ENOENT lookups) from the cache,
+ * thereby making create faster.
+ *   What is cached?
+ *      Negative lookup cache is stored for each directory, and has 2 entries:
+ *      - Negative entries: Populated only when lookup/stat returns ENOENT.
+ *        Fuse mostly sends only one lookup before create, hence negative entry
+ *        cache is almost useless. But for SMB access, multiple lookups/stats
+ *        are sent before creating the file. Hence the negative entry cache.
+ *        It can exist even when the positive entry cache is invalid. It also
+ *        has the entries that were deleted from this directory.
+ *        Freed on receiving upcall(with dentry change flag) or on expiring
+ *        timeout of the cache.
+ *
+ *      - Positive entries: Populated as a part of readdirp, and as a part of
+ *        mkdir followed by creates inside that directory. Lookups and other
+ *        fops do not populate the positive entry (as it can grow long and is
+ *        of no value add)
+ *        Freed on receiving upcall(with dentry change flag) or on expiring
+ *        timeout of the cache.
+ *
+ *   Data structures to store cache?
+ *      The cache of any directory is stored in the inode_ctx of the directory.
+ *      Negative entries are stored as list of strings.
+ *             Search - O(n)
+ *             Add    - O(1)
+ *             Delete - O(n) - as it has to be searched before deleting
+ *      Positive entries are stored as a list, each list node has a pointer
+ *          to the inode of the positive entry or the name of the entry.
+ *          Since the client side inode table already will have inodes for
+ *          positive entries, we just take a ref of that inode and store as
+ *          positive entry cache. In cases like hardlinks and readdirp where
+ *          inode is NULL, we store the names.
+ *          Name Search - O(n)
+ *          Inode Search - O(1) - Actually complexity of inode_find()
+ *          Name/inode Add - O(1)
+ *          Name Delete - O(n)
+ *          Inode Delete - O(1)
+ *
+ * Locking order:
+ *
+ * TODO:
+ * - Fill Positive entries on readdir/p, after which in lookup_cbk check if the
+ *   name is in PE and replace it with inode.
+ * - fini, PARENT_DOWN, disable caching
+ * - Virtual setxattr to dump the inode_ctx, to ease debugging
+ * - Handle dht_nuke xattr: clear all cache
+ * - Special handling for .meta and .trashcan?
+ */
+
+/* Forward declarations of helpers that are used in this file before their
+ * definitions appear. The "__" prefixed ones are invoked by their callers
+ * with a lock already held (see the individual call sites). */
+int
+__nlc_inode_ctx_timer_start(xlator_t *this, inode_t *inode, nlc_ctx_t *nlc_ctx);
+int
+__nlc_add_to_lru(xlator_t *this, inode_t *inode, nlc_ctx_t *nlc_ctx);
+void
+nlc_remove_from_lru(xlator_t *this, inode_t *inode);
+void
+__nlc_inode_ctx_timer_delete(xlator_t *this, nlc_ctx_t *nlc_ctx);
+gf_boolean_t
+__nlc_search_ne(nlc_ctx_t *nlc_ctx, const char *name);
+void
+__nlc_free_pe(xlator_t *this, nlc_ctx_t *nlc_ctx, nlc_pe_t *pe);
+void
+__nlc_free_ne(xlator_t *this, nlc_ctx_t *nlc_ctx, nlc_ne_t *ne);
+
+/* Return the cache timeout stored in this xlator's private configuration.
+ *
+ * The value is read without taking conf->lock: the timeout is generally
+ * not meant to be changed often once it has been set. */
+static int32_t
+nlc_get_cache_timeout(xlator_t *this)
+{
+    nlc_conf_t *priv = this->private;
+
+    return priv->cache_timeout;
+}
+
+/* Tell whether the cache held in @nlc_ctx may still be served.
+ *
+ * The cache counts as valid when its cache_time is non-zero and is not
+ * older than the last recorded child-down time. A NULL @nlc_ctx is
+ * reported through GF_VALIDATE_OR_GOTO and treated as invalid.
+ *
+ * Returns _gf_true when valid, _gf_false otherwise. */
+static gf_boolean_t
+__nlc_is_cache_valid(xlator_t *this, nlc_ctx_t *nlc_ctx)
+{
+    nlc_conf_t *priv = NULL;
+    time_t child_down_at;
+
+    GF_VALIDATE_OR_GOTO(this->name, nlc_ctx, invalid);
+
+    priv = this->private;
+
+    /* Snapshot the child-down timestamp under the conf lock. */
+    LOCK(&priv->lock);
+    {
+        child_down_at = priv->last_child_down;
+    }
+    UNLOCK(&priv->lock);
+
+    if ((child_down_at <= nlc_ctx->cache_time) && (nlc_ctx->cache_time != 0))
+        return _gf_true;
+
+invalid:
+    return _gf_false;
+}
+
+/* Record @now as the time of the most recent child-down event.
+ * The store is done under conf->lock. */
+void
+nlc_update_child_down_time(xlator_t *this, time_t now)
+{
+    nlc_conf_t *priv = this->private;
+
+    LOCK(&priv->lock);
+    {
+        priv->last_child_down = now;
+    }
+    UNLOCK(&priv->lock);
+}
+
+/* Set the disable_cache flag in the private configuration, under
+ * conf->lock. */
+void
+nlc_disable_cache(xlator_t *this)
+{
+    nlc_conf_t *priv = this->private;
+
+    LOCK(&priv->lock);
+    {
+        priv->disable_cache = _gf_true;
+    }
+    UNLOCK(&priv->lock);
+}
+
+/* Read the nlc_ctx pointer stored in the first inode-ctx slot of @inode.
+ *
+ * Uses the lock-less __inode_ctx_get2(), so the caller is expected to hold
+ * inode->lock (nlc_inode_ctx_get() and nlc_inode_ctx_get_set() do).
+ *
+ * @nlc_ctx_p: optional out parameter; written only when the lookup returns
+ *             0. The stored value may be NULL if only the second slot is
+ *             populated for this inode.
+ *
+ * Returns the return value of __inode_ctx_get2(). */
+static int
+__nlc_inode_ctx_get(xlator_t *this, inode_t *inode, nlc_ctx_t **nlc_ctx_p)
+{
+    int ret = 0;
+    uint64_t nlc_ctx_int = 0;
+    uint64_t nlc_pe_int = 0;
+
+    /* Both slots are requested so that the return value keeps reflecting
+     * the presence of either of them. */
+    ret = __inode_ctx_get2(inode, this, &nlc_ctx_int, &nlc_pe_int);
+    if (ret == 0 && nlc_ctx_p) {
+        /* Convert through uintptr_t, mirroring the cast used when the
+         * pointer is stored; a cast through long could narrow the value
+         * where long is smaller than a pointer. */
+        *nlc_ctx_p = (nlc_ctx_t *)(uintptr_t)nlc_ctx_int;
+    }
+    return ret;
+}
+
+/* Store @nlc_ctx and/or @nlc_pe_p in the two inode-ctx slots of @inode.
+ *
+ * A NULL argument means "leave that slot untouched": only the address of
+ * a non-zero value is handed to __inode_ctx_set2(), because blindly
+ * passing both would reset the slot the caller did not intend to change.
+ *
+ * Returns the return value of __inode_ctx_set2(). */
+static int
+nlc_inode_ctx_set(xlator_t *this, inode_t *inode, nlc_ctx_t *nlc_ctx,
+                  nlc_pe_t *nlc_pe_p)
+{
+    uint64_t ctx_val = (uint64_t)(uintptr_t)nlc_ctx;
+    uint64_t pe_val = (uint64_t)(uintptr_t)nlc_pe_p;
+    uint64_t *ctx_arg = NULL;
+    uint64_t *pe_arg = NULL;
+    int rc = -1;
+
+    if (ctx_val)
+        ctx_arg = &ctx_val;
+    if (pe_val)
+        pe_arg = &pe_val;
+
+    LOCK(&inode->lock);
+    {
+        rc = __inode_ctx_set2(inode, this, ctx_arg, pe_arg);
+    }
+    UNLOCK(&inode->lock);
+
+    return rc;
+}
+
+/* Locked wrapper around __nlc_inode_ctx_get(): fetches the nlc_ctx of
+ * @inode into *@nlc_ctx_p while holding inode->lock. A failed lookup is
+ * only logged at debug level; nothing is returned to the caller. */
+static void
+nlc_inode_ctx_get(xlator_t *this, inode_t *inode, nlc_ctx_t **nlc_ctx_p)
+{
+    int rc;
+
+    LOCK(&inode->lock);
+    {
+        rc = __nlc_inode_ctx_get(this, inode, nlc_ctx_p);
+        if (rc < 0) {
+            gf_msg_debug(this->name, 0,
+                         "inode ctx get failed for "
+                         "inode:%p",
+                         inode);
+        }
+    }
+    UNLOCK(&inode->lock);
+}
+
+/* Drop every positive and negative entry cached in @nlc_ctx and reset its
+ * cache_time and state to 0. A NULL @nlc_ctx is a no-op.
+ *
+ * Each list is only walked when the matching validity bit is set in
+ * nlc_ctx->state. Afterwards the ctx is asserted to account for nothing
+ * but itself (cache_size == sizeof(*nlc_ctx), refd_inodes == 0). */
+static void
+__nlc_inode_clear_entries(xlator_t *this, nlc_ctx_t *nlc_ctx)
+{
+    nlc_pe_t *pos_entry = NULL;
+    nlc_pe_t *pos_next = NULL;
+    nlc_ne_t *neg_entry = NULL;
+    nlc_ne_t *neg_next = NULL;
+
+    if (nlc_ctx == NULL)
+        return;
+
+    if (IS_PE_VALID(nlc_ctx->state)) {
+        list_for_each_entry_safe(pos_entry, pos_next, &nlc_ctx->pe, list)
+        {
+            __nlc_free_pe(this, nlc_ctx, pos_entry);
+        }
+    }
+
+    if (IS_NE_VALID(nlc_ctx->state)) {
+        list_for_each_entry_safe(neg_entry, neg_next, &nlc_ctx->ne, list)
+        {
+            __nlc_free_ne(this, nlc_ctx, neg_entry);
+        }
+    }
+
+    nlc_ctx->cache_time = 0;
+    nlc_ctx->state = 0;
+    GF_ASSERT(nlc_ctx->cache_size == sizeof(*nlc_ctx));
+    GF_ASSERT(nlc_ctx->refd_inodes == 0);
+}
+
+/* Bring an invalid nlc_ctx back into a usable state.
+ *
+ * Does nothing when @nlc_ctx is NULL or when its cache is still valid.
+ * Otherwise, under nlc_ctx->lock, the cached entries are cleared and
+ * either the existing timer is re-armed, or (when no timer exists) a new
+ * timer is started and the inode is added to the LRU. Failures of the
+ * last two steps are not reported to the caller. */
+static void
+nlc_init_invalid_ctx(xlator_t *this, inode_t *inode, nlc_ctx_t *nlc_ctx)
+{
+    nlc_conf_t *conf = NULL;
+    int ret = -1;
+
+    conf = this->private;
+    if (!nlc_ctx)
+        goto out;
+
+    LOCK(&nlc_ctx->lock);
+    {
+        if (__nlc_is_cache_valid(this, nlc_ctx))
+            goto unlock;
+
+        /* The cache/nlc_ctx can be invalid for 2 reasons:
+         * - Because of a child-down/timer expiry, cache is
+         *   invalid but the nlc_ctx is not yet cleaned up.
+         * - nlc_ctx is cleaned up, because of invalidations
+         *   or lru prune etc.*/
+
+        /* If the cache is present but invalid, clear the cache and
+         * reset the timer. */
+        __nlc_inode_clear_entries(this, nlc_ctx);
+
+        /* If timer is present, then it is already part of lru as well
+         * Hence reset the timer and return.*/
+        if (nlc_ctx->timer) {
+            gf_tw_mod_timer_pending(conf->timer_wheel, nlc_ctx->timer,
+                                    conf->cache_timeout);
+            nlc_ctx->cache_time = gf_time();
+            goto unlock;
+        }
+
+        /* If timer was NULL, the nlc_ctx is already cleaned up,
+         * and we need to start timer and add to lru, so that it is
+         * ready to cache entries afresh */
+        ret = __nlc_inode_ctx_timer_start(this, inode, nlc_ctx);
+        if (ret < 0)
+            goto unlock;
+
+        ret = __nlc_add_to_lru(this, inode, nlc_ctx);
+        if (ret < 0) {
+            /* Undo the timer started just above. */
+            __nlc_inode_ctx_timer_delete(this, nlc_ctx);
+            goto unlock;
+        }
+    }
+unlock:
+    UNLOCK(&nlc_ctx->lock);
+out:
+    return;
+}
+
+/* Return the nlc_ctx of @inode, creating and registering it if absent.
+ *
+ * Under inode->lock: if a ctx is already stored it is reused; otherwise a
+ * new one is allocated, its lock and PE/NE lists are initialised, its
+ * timer is started, the inode is added to the LRU, the pointer is stored
+ * in the inode ctx and its size is added to conf->current_cache_size.
+ *
+ * @nlc_ctx_p: optional out parameter. It is written only on success
+ *             (ret == 0), in which case nlc_init_invalid_ctx() is also run
+ *             on the ctx.
+ *
+ * Returns the ctx, or NULL on failure; a ctx allocated here is destroyed
+ * again when any of the setup steps fails. */
+static nlc_ctx_t *
+nlc_inode_ctx_get_set(xlator_t *this, inode_t *inode, nlc_ctx_t **nlc_ctx_p)
+{
+    uint64_t ctx;
+    int ret = 0;
+    nlc_ctx_t *nlc_ctx = NULL;
+    nlc_conf_t *conf = NULL;
+
+    conf = this->private;
+
+    LOCK(&inode->lock);
+    {
+        /* Fast path: a ctx is already attached to the inode. */
+        ret = __nlc_inode_ctx_get(this, inode, &nlc_ctx);
+        if (nlc_ctx)
+            goto unlock;
+
+        nlc_ctx = GF_CALLOC(sizeof(*nlc_ctx), 1, gf_nlc_mt_nlc_ctx_t);
+        if (!nlc_ctx)
+            goto unlock;
+
+        LOCK_INIT(&nlc_ctx->lock);
+        INIT_LIST_HEAD(&nlc_ctx->pe);
+        INIT_LIST_HEAD(&nlc_ctx->ne);
+
+        ret = __nlc_inode_ctx_timer_start(this, inode, nlc_ctx);
+        if (ret < 0)
+            goto unlock;
+
+        ret = __nlc_add_to_lru(this, inode, nlc_ctx);
+        if (ret < 0) {
+            __nlc_inode_ctx_timer_delete(this, nlc_ctx);
+            goto unlock;
+        }
+
+        /* Publish the ctx in the first slot only; the second slot is
+         * left untouched by passing NULL. */
+        ctx = (uint64_t)(uintptr_t)nlc_ctx;
+        ret = __inode_ctx_set2(inode, this, &ctx, NULL);
+        if (ret) {
+            gf_msg(this->name, GF_LOG_ERROR, ENOMEM, NLC_MSG_NO_MEMORY,
+                   "inode ctx set failed");
+            /* Roll back the timer and the LRU registration. */
+            __nlc_inode_ctx_timer_delete(this, nlc_ctx);
+            nlc_remove_from_lru(this, inode);
+            goto unlock;
+        }
+
+        /*TODO: also sizeof (gf_tw_timer_list) + nlc_timer_data_t ?*/
+        nlc_ctx->cache_size = sizeof(*nlc_ctx);
+        GF_ATOMIC_ADD(conf->current_cache_size, nlc_ctx->cache_size);
+    }
+unlock:
+    UNLOCK(&inode->lock);
+
+    if (ret == 0 && nlc_ctx_p) {
+        *nlc_ctx_p = nlc_ctx;
+        nlc_init_invalid_ctx(this, inode, nlc_ctx);
+    }
+
+    /* ret < 0 with a non-NULL ctx can only be a ctx allocated above whose
+     * setup failed: an existing ctx always leaves ret == 0. */
+    if (ret < 0 && nlc_ctx) {
+        LOCK_DESTROY(&nlc_ctx->lock);
+        GF_FREE(nlc_ctx);
+        nlc_ctx = NULL;
+        goto out;
+    }
+
+out:
+    return nlc_ctx;
+}
+
+/* Allocate the per-call local state and attach it to @frame.
+ *
+ * @loc / @loc2 are copied into the local when non-NULL, and @fop is
+ * recorded.
+ *
+ * Returns the new local, or NULL when the allocation fails (in which case
+ * frame->local is left untouched). */
+nlc_local_t *
+nlc_local_init(call_frame_t *frame, xlator_t *this, glusterfs_fop_t fop,
+               loc_t *loc, loc_t *loc2)
+{
+    nlc_local_t *state = GF_CALLOC(sizeof(*state), 1, gf_nlc_mt_nlc_local_t);
+
+    if (state == NULL)
+        return NULL;
+
+    if (loc != NULL)
+        loc_copy(&state->loc, loc);
+    if (loc2 != NULL)
+        loc_copy(&state->loc2, loc2);
+
+    state->fop = fop;
+    frame->local = state;
+
+    return state;
+}
+
+/* Release a local created by nlc_local_init(): wipes both embedded locs
+ * and frees the structure. A NULL @local is a no-op. */
+void
+nlc_local_wipe(xlator_t *this, nlc_local_t *local)
+{
+    if (local == NULL)
+        return;
+
+    loc_wipe(&local->loc);
+    loc_wipe(&local->loc2);
+    GF_FREE(local);
+}
+
+/* OR the bits of @new_state into nlc_ctx->state; existing bits are kept. */
+static void
+__nlc_set_dir_state(nlc_ctx_t *nlc_ctx, uint64_t new_state)
+{
+    nlc_ctx->state = nlc_ctx->state | new_state;
+}
+
+/* Add @state to the cache state of directory @inode.
+ *
+ * Non-directory inodes are rejected with an error log. The nlc_ctx is
+ * created on demand; if it cannot be obtained nothing is changed. The
+ * state update itself happens under nlc_ctx->lock. */
+void
+nlc_set_dir_state(xlator_t *this, inode_t *inode, uint64_t state)
+{
+    nlc_ctx_t *dir_ctx = NULL;
+
+    if (inode->ia_type != IA_IFDIR) {
+        gf_msg_callingfn(this->name, GF_LOG_ERROR, EINVAL, NLC_MSG_EINVAL,
+                         "inode is not of type dir");
+        return;
+    }
+
+    nlc_inode_ctx_get_set(this, inode, &dir_ctx);
+    if (dir_ctx == NULL)
+        return;
+
+    LOCK(&dir_ctx->lock);
+    {
+        __nlc_set_dir_state(dir_ctx, state);
+    }
+    UNLOCK(&dir_ctx->lock);
+}
+
+/* Timer-wheel callback: marks the cache of the inode referenced by @data
+ * (an nlc_timer_data_t) as invalid by zeroing its cache_time.
+ *
+ * The entries are deliberately not cleared here and nlc_ctx->lock is not
+ * taken, since taking it would lead to a deadlock. Because cache_time is
+ * written outside the lock its value can be wrong for a short while; the
+ * resulting false negative is preferred over a deadlock. */
+static void
+nlc_cache_timeout_handler(struct gf_tw_timer_list *timer, void *data,
+                          unsigned long calltime)
+{
+    nlc_timer_data_t *timer_data = data;
+    nlc_ctx_t *expired_ctx = NULL;
+
+    nlc_inode_ctx_get(timer_data->this, timer_data->inode, &expired_ctx);
+    if (expired_ctx != NULL)
+        expired_ctx->cache_time = 0;
+}
+
+/* Tear down the timer attached to @nlc_ctx.
+ *
+ * Removes the timer from the timer wheel (if one is set), drops the inode
+ * reference held by the timer data and frees it, then frees the timer
+ * itself. Both nlc_ctx->timer and nlc_ctx->timer_data end up NULL. */
+void
+__nlc_inode_ctx_timer_delete(xlator_t *this, nlc_ctx_t *nlc_ctx)
+{
+    nlc_conf_t *priv = this->private;
+
+    if (nlc_ctx->timer != NULL)
+        gf_tw_del_timer(priv->timer_wheel, nlc_ctx->timer);
+
+    if (nlc_ctx->timer_data != NULL) {
+        inode_unref(nlc_ctx->timer_data->inode);
+        GF_FREE(nlc_ctx->timer_data);
+        nlc_ctx->timer_data = NULL;
+    }
+
+    /* Freed unconditionally, exactly as for a NULL timer. */
+    GF_FREE(nlc_ctx->timer);
+    nlc_ctx->timer = NULL;
+}
+
+/* Create and arm the cache-expiry timer of @nlc_ctx.
+ *
+ * Allocates the timer data (holding a reference on @inode) and the timer,
+ * stores both in @nlc_ctx, adds the timer to the timer wheel and stamps
+ * nlc_ctx->cache_time with the current time.
+ *
+ * Returns 0 on success, -1 when an allocation fails (nothing is left
+ * allocated or referenced in that case). */
+int
+__nlc_inode_ctx_timer_start(xlator_t *this, inode_t *inode, nlc_ctx_t *nlc_ctx)
+{
+    nlc_conf_t *priv = this->private;
+    nlc_timer_data_t *timer_data = NULL;
+    struct gf_tw_timer_list *tw_timer = NULL;
+
+    /* We are taking inode_table->lock within inode->lock
+     * as the only other caller which takes inode->lock within
+     * inode_table->lock and cause deadlock is inode_table_destroy.
+     * Hopefully, there can be no fop when inode_table_destroy is
+     * being called. */
+    timer_data = GF_CALLOC(1, sizeof(*timer_data), gf_nlc_mt_nlc_timer_data_t);
+    if (timer_data == NULL)
+        return -1;
+
+    timer_data->inode = inode_ref(inode);
+    timer_data->this = this;
+
+    tw_timer = GF_CALLOC(1, sizeof(*tw_timer), gf_common_mt_tw_timer_list);
+    if (tw_timer == NULL) {
+        /* Give back the inode reference taken for the timer data. */
+        if (timer_data->inode)
+            inode_unref(timer_data->inode);
+        GF_FREE(timer_data);
+        return -1;
+    }
+
+    INIT_LIST_HEAD(&tw_timer->entry);
+    tw_timer->expires = nlc_get_cache_timeout(this);
+    tw_timer->function = nlc_cache_timeout_handler;
+    tw_timer->data = timer_data;
+    nlc_ctx->timer = tw_timer;
+    nlc_ctx->timer_data = timer_data;
+    gf_tw_add_timer(priv->timer_wheel, tw_timer);
+
+    nlc_ctx->cache_time = gf_time();
+    gf_msg_trace(this->name, 0,
+                 "Registering timer:%p, inode:%p, "
+                 "gfid:%s",
+                 tw_timer, inode, uuid_utoa(inode->gfid));
+
+    return 0;
+}
+
+/* Append @inode to the tail of the LRU list kept in the private conf.
+ *
+ * The LRU node holds its own reference on @inode; the list insertion is
+ * done under conf->lock. nlc_ctx->refd_inodes is reset to 0, and the
+ * global conf->refd_inodes counter is incremented when the second
+ * inode-ctx slot of @inode is empty.
+ *
+ * Returns 0 on success, -1 when the LRU node cannot be allocated. */
+int
+__nlc_add_to_lru(xlator_t *this, inode_t *inode, nlc_ctx_t *nlc_ctx)
+{
+    nlc_conf_t *priv = this->private;
+    nlc_lru_node_t *node = NULL;
+    uint64_t pe_slot = 0;
+
+    node = GF_CALLOC(1, sizeof(*node), gf_nlc_mt_nlc_lru_node);
+    if (node == NULL)
+        return -1;
+
+    INIT_LIST_HEAD(&node->list);
+    node->inode = inode_ref(inode);
+
+    LOCK(&priv->lock);
+    {
+        list_add_tail(&node->list, &priv->lru);
+    }
+    UNLOCK(&priv->lock);
+
+    nlc_ctx->refd_inodes = 0;
+
+    /* Only the fetched slot value matters; the call's status is unused. */
+    __inode_ctx_get2(inode, this, NULL, &pe_slot);
+    if (pe_slot == 0)
+        GF_ATOMIC_ADD(priv->refd_inodes, 1);
+
+    return 0;
+}
+
+/* Remove the LRU node that refers to @inode, if there is one.
+ *
+ * The node is unlinked under conf->lock; its inode reference is dropped
+ * and the node freed after the lock has been released. */
+void
+nlc_remove_from_lru(xlator_t *this, inode_t *inode)
+{
+    nlc_conf_t *priv = this->private;
+    nlc_lru_node_t *cursor = NULL;
+    nlc_lru_node_t *match = NULL;
+
+    LOCK(&priv->lock);
+    {
+        list_for_each_entry(cursor, &priv->lru, list)
+        {
+            if (cursor->inode != inode)
+                continue;
+
+            /* Leave the loop right after unlinking, so the removed node
+             * is never used to advance the iteration. */
+            list_del(&cursor->list);
+            match = cursor;
+            break;
+        }
+    }
+    UNLOCK(&priv->lock);
+
+    if (match == NULL)
+        return;
+
+    inode_unref(match->inode);
+    GF_FREE(match);
+}
+
+/* Evict one directory from the cache when a limit has been reached.
+ *
+ * If the number of referenced inodes has reached conf->inode_limit or the
+ * current cache size has reached conf->cache_size, the node at the head
+ * of the LRU list is unlinked (under conf->lock), its cache is cleared
+ * with reason NLC_LRU_PRUNE, and the node is released. The @inode
+ * argument is not used. */
+void
+nlc_lru_prune(xlator_t *this, inode_t *inode)
+{
+    nlc_conf_t *priv = this->private;
+    nlc_lru_node_t *victim = NULL;
+
+    LOCK(&priv->lock);
+    {
+        if (((GF_ATOMIC_GET(priv->refd_inodes) >= priv->inode_limit) ||
+             (GF_ATOMIC_GET(priv->current_cache_size) >= priv->cache_size)) &&
+            !list_empty(&priv->lru)) {
+            victim = list_entry(priv->lru.next, nlc_lru_node_t, list);
+            list_del(&victim->list);
+        }
+    }
+    UNLOCK(&priv->lock);
+
+    if (victim == NULL)
+        return;
+
+    nlc_inode_clear_cache(this, victim->inode, NLC_LRU_PRUNE);
+    inode_unref(victim->inode);
+    GF_FREE(victim);
+}
+
+/* Clear the cache of every directory currently on the LRU list.
+ *
+ * The whole list is moved to a private list head under conf->lock, then
+ * processed without the lock: each node's cache is cleared with reason
+ * NLC_LRU_PRUNE, its inode reference dropped and the node freed. */
+void
+nlc_clear_all_cache(xlator_t *this)
+{
+    nlc_conf_t *priv = this->private;
+    struct list_head detached;
+    nlc_lru_node_t *node = NULL;
+
+    INIT_LIST_HEAD(&detached);
+
+    LOCK(&priv->lock);
+    {
+        list_replace_init(&priv->lru, &detached);
+    }
+    UNLOCK(&priv->lock);
+
+    while (!list_empty(&detached)) {
+        node = list_entry(detached.next, nlc_lru_node_t, list);
+        list_del(&node->list);
+        nlc_inode_clear_cache(this, node->inode, NLC_LRU_PRUNE);
+        inode_unref(node->inode);
+        GF_FREE(node);
+    }
+}
+
+/* Unlink and free one positive entry of @nlc_ctx.
+ *
+ * For inode-backed entries the entry pointer stored in the inode ctx is
+ * reset and the inode reference dropped. The per-ctx and global size
+ * accounting is reduced by sizeof(*pe) + sizeof(pe->name) (the size of
+ * the name pointer, matching what __nlc_add_pe() accounted), and the
+ * referenced-inode counters are decremented; the global one only when the
+ * entry's inode has no nlc_ctx of its own (or the entry has no inode). */
+void
+__nlc_free_pe(xlator_t *this, nlc_ctx_t *nlc_ctx, nlc_pe_t *pe)
+{
+    nlc_conf_t *priv = this->private;
+    const size_t entry_cost = sizeof(*pe) + sizeof(pe->name);
+    uint64_t stale_pe_val = 0;
+    uint64_t entry_ctx_val = 0;
+
+    if (pe->inode != NULL) {
+        inode_ctx_reset1(pe->inode, this, &stale_pe_val);
+        inode_ctx_get2(pe->inode, this, &entry_ctx_val, NULL);
+        inode_unref(pe->inode);
+    }
+    list_del(&pe->list);
+
+    nlc_ctx->cache_size -= entry_cost;
+    GF_ATOMIC_SUB(priv->current_cache_size, entry_cost);
+
+    nlc_ctx->refd_inodes -= 1;
+    if (entry_ctx_val == 0)
+        GF_ATOMIC_SUB(priv->refd_inodes, 1);
+
+    GF_FREE(pe->name);
+    GF_FREE(pe);
+}
+
+/* Unlink and free one negative entry of @nlc_ctx, reducing the per-ctx
+ * and global cache-size accounting by sizeof(*ne) + sizeof(ne->name)
+ * (the size of the name pointer, matching __nlc_add_ne()). */
+void
+__nlc_free_ne(xlator_t *this, nlc_ctx_t *nlc_ctx, nlc_ne_t *ne)
+{
+    nlc_conf_t *priv = this->private;
+    const size_t entry_cost = sizeof(*ne) + sizeof(ne->name);
+
+    list_del(&ne->list);
+
+    nlc_ctx->cache_size -= entry_cost;
+    GF_ATOMIC_SUB(priv->current_cache_size, entry_cost);
+
+    GF_FREE(ne->name);
+    GF_FREE(ne);
+}
+
+/* Drop the whole cache of directory @inode.
+ *
+ * Under nlc_ctx->lock the timer is deleted and all entries are cleared.
+ * Unless @reason is NLC_LRU_PRUNE, the inode is also removed from the LRU
+ * list afterwards. Nothing happens when the inode has no nlc_ctx. */
+void
+nlc_inode_clear_cache(xlator_t *this, inode_t *inode, int reason)
+{
+    nlc_ctx_t *dir_ctx = NULL;
+
+    nlc_inode_ctx_get(this, inode, &dir_ctx);
+    if (dir_ctx == NULL)
+        return;
+
+    LOCK(&dir_ctx->lock);
+    {
+        __nlc_inode_ctx_timer_delete(this, dir_ctx);
+        __nlc_inode_clear_entries(this, dir_ctx);
+    }
+    UNLOCK(&dir_ctx->lock);
+
+    if (reason != NLC_LRU_PRUNE)
+        nlc_remove_from_lru(this, inode);
+}
+
+/* Remove the positive entry for @name / @entry_ino from @nlc_ctx.
+ *
+ * Nothing is done when the positive-entry state is not valid. Lookup
+ * order:
+ * - no @entry_ino: search the list by name only;
+ * - @multilink: search by name first, then fall back to the entry pointer
+ *   stored in the inode ctx of @entry_ino (no further name search);
+ * - otherwise: use the entry pointer stored in the inode ctx, and fall
+ *   back to a name search when there is none.
+ * The entry that is found, if any, is released with __nlc_free_pe(). */
+static void
+__nlc_del_pe(xlator_t *this, nlc_ctx_t *nlc_ctx, inode_t *entry_ino,
+             const char *name, gf_boolean_t multilink)
+{
+    nlc_pe_t *pe = NULL;
+    nlc_pe_t *tmp = NULL;
+    gf_boolean_t found = _gf_false;
+    uint64_t pe_int = 0;
+
+    if (!IS_PE_VALID(nlc_ctx->state))
+        goto out;
+
+    if (!entry_ino)
+        goto name_search;
+
+    /* If there are hardlinks first search names, followed by inodes */
+    if (multilink) {
+        list_for_each_entry_safe(pe, tmp, &nlc_ctx->pe, list)
+        {
+            if (pe->name && (strcmp(pe->name, name) == 0)) {
+                found = _gf_true;
+                goto out;
+            }
+        }
+        /* No name matched: pe is not a valid entry at this point and is
+         * only used again if it is reassigned below. */
+        inode_ctx_reset1(entry_ino, this, &pe_int);
+        if (pe_int) {
+            pe = (void *)(long)(pe_int);
+            found = _gf_true;
+            goto out;
+        }
+        goto out;
+    }
+
+    /* Single link: the entry pointer is taken from the inode ctx, which
+     * is reset by the same call. */
+    inode_ctx_reset1(entry_ino, this, &pe_int);
+    if (pe_int) {
+        pe = (void *)(long)(pe_int);
+        found = _gf_true;
+        goto out;
+    }
+
+name_search:
+    list_for_each_entry_safe(pe, tmp, &nlc_ctx->pe, list)
+    {
+        if (pe->name && (strcmp(pe->name, name) == 0)) {
+            found = _gf_true;
+            break;
+            /* TODO: can there be duplicates? */
+        }
+    }
+
+out:
+    /* pe is only dereferenced when a match was recorded. */
+    if (found)
+        __nlc_free_pe(this, nlc_ctx, pe);
+
+    return;
+}
+
+/* Remove the first negative entry of @nlc_ctx whose name equals @name.
+ * Nothing is done when the negative-entry state is not valid or when no
+ * entry matches. */
+static void
+__nlc_del_ne(xlator_t *this, nlc_ctx_t *nlc_ctx, const char *name)
+{
+    nlc_ne_t *cursor = NULL;
+    nlc_ne_t *match = NULL;
+
+    if (!IS_NE_VALID(nlc_ctx->state))
+        return;
+
+    list_for_each_entry(cursor, &nlc_ctx->ne, list)
+    {
+        if (strcmp(cursor->name, name) == 0) {
+            match = cursor;
+            break;
+        }
+    }
+
+    if (match != NULL)
+        __nlc_free_ne(this, nlc_ctx, match);
+}
+
+/* Add a positive entry to @nlc_ctx.
+ *
+ * When @entry_ino is given the entry holds a reference on that inode and
+ * a pointer to the entry is stored in the inode's ctx; otherwise, when
+ * @name is given, the entry holds a copy of the name. The per-ctx and
+ * global size accounting grows by sizeof(*pe) + sizeof(pe->name), and the
+ * referenced-inode counters are incremented (the global one only when
+ * @entry_ino has no nlc_ctx value of its own). Allocation failures leave
+ * the cache unchanged. */
+static void
+__nlc_add_pe(xlator_t *this, nlc_ctx_t *nlc_ctx, inode_t *entry_ino,
+             const char *name)
+{
+    nlc_conf_t *priv = this->private;
+    nlc_pe_t *entry = NULL;
+    uint64_t entry_ctx_val = 0;
+
+    /* TODO: There can be no duplicate entries, as it is added only
+    during create. In case there arises duplicate entries, search PE
+    found = __nlc_search (entries, name, _gf_false);
+    can use bit vector to have simple search than sequential search */
+
+    entry = GF_CALLOC(sizeof(*entry), 1, gf_nlc_mt_nlc_pe_t);
+    if (entry == NULL)
+        return;
+
+    if (entry_ino != NULL) {
+        entry->inode = inode_ref(entry_ino);
+        nlc_inode_ctx_set(this, entry_ino, NULL, entry);
+    } else if (name != NULL) {
+        entry->name = gf_strdup(name);
+        if (entry->name == NULL) {
+            GF_FREE(entry);
+            return;
+        }
+    }
+
+    list_add(&entry->list, &nlc_ctx->pe);
+
+    nlc_ctx->cache_size += sizeof(*entry) + sizeof(entry->name);
+    GF_ATOMIC_ADD(priv->current_cache_size,
+                  (sizeof(*entry) + sizeof(entry->name)));
+
+    nlc_ctx->refd_inodes += 1;
+    inode_ctx_get2(entry_ino, this, &entry_ctx_val, NULL);
+    if (entry_ctx_val == 0)
+        GF_ATOMIC_ADD(priv->refd_inodes, 1);
+}
+
+/* Add a negative entry holding a copy of @name to @nlc_ctx and grow the
+ * per-ctx and global size accounting by sizeof(*ne) + sizeof(ne->name).
+ * No duplicate check is done here. Allocation failures leave the cache
+ * unchanged. */
+static void
+__nlc_add_ne(xlator_t *this, nlc_ctx_t *nlc_ctx, const char *name)
+{
+    nlc_conf_t *priv = this->private;
+    nlc_ne_t *entry = NULL;
+
+    /* TODO: search ne before adding to get rid of duplicate entries
+    found = __nlc_search (entries, name, _gf_false);
+    can use bit vector to have faster search than sequential search */
+
+    entry = GF_CALLOC(sizeof(*entry), 1, gf_nlc_mt_nlc_ne_t);
+    if (entry == NULL)
+        return;
+
+    entry->name = gf_strdup(name);
+    if (entry->name == NULL) {
+        GF_FREE(entry);
+        return;
+    }
+
+    list_add(&entry->list, &nlc_ctx->ne);
+
+    nlc_ctx->cache_size += sizeof(*entry) + sizeof(entry->name);
+    GF_ATOMIC_ADD(priv->current_cache_size,
+                  (sizeof(*entry) + sizeof(entry->name)));
+}
+
+/* Record @name as a negative entry of directory @inode.
+ *
+ * Non-directory inodes are rejected with an error log. The nlc_ctx is
+ * created on demand. Under nlc_ctx->lock the name is added, and the
+ * NLC_NE_VALID state set, only when it is not already present. */
+void
+nlc_dir_add_ne(xlator_t *this, inode_t *inode, const char *name)
+{
+    nlc_ctx_t *dir_ctx = NULL;
+
+    if (inode->ia_type != IA_IFDIR) {
+        gf_msg_callingfn(this->name, GF_LOG_ERROR, EINVAL, NLC_MSG_EINVAL,
+                         "inode is not of type dir");
+        return;
+    }
+
+    nlc_inode_ctx_get_set(this, inode, &dir_ctx);
+    if (dir_ctx == NULL)
+        return;
+
+    LOCK(&dir_ctx->lock);
+    {
+        /* There is one possibility where we need to search before
+         * adding NE: when there are two parallel lookups on a non
+         * existent file */
+        if (__nlc_search_ne(dir_ctx, name) == _gf_false) {
+            __nlc_add_ne(this, dir_ctx, name);
+            __nlc_set_dir_state(dir_ctx, NLC_NE_VALID);
+        }
+    }
+    UNLOCK(&dir_ctx->lock);
+}
+
+/* Move an entry of directory @parent from the positive to the negative
+ * cache (used once the entry no longer exists in the directory).
+ *
+ * Non-directory parents are rejected with an error log. Nothing is done
+ * when the parent has no nlc_ctx or its cache is not valid. Otherwise,
+ * under nlc_ctx->lock, the positive entry identified by @entry_ino /
+ * @name is deleted and @name is recorded as a negative entry.
+ *
+ * @multilink is passed through to __nlc_del_pe() and selects the
+ * hardlink-aware search order. */
+void
+nlc_dir_remove_pe(xlator_t *this, inode_t *parent, inode_t *entry_ino,
+                  const char *name, gf_boolean_t multilink)
+{
+    nlc_ctx_t *nlc_ctx = NULL;
+
+    if (parent->ia_type != IA_IFDIR) {
+        gf_msg_callingfn(this->name, GF_LOG_ERROR, EINVAL, NLC_MSG_EINVAL,
+                         "inode is not of type dir");
+        goto out;
+    }
+
+    nlc_inode_ctx_get(this, parent, &nlc_ctx);
+    if (!nlc_ctx)
+        goto out;
+
+    LOCK(&nlc_ctx->lock);
+    {
+        if (!__nlc_is_cache_valid(this, nlc_ctx))
+            goto unlock;
+
+        __nlc_del_pe(this, nlc_ctx, entry_ino, name, multilink);
+
+        /* Do not insert a second negative entry for the same name:
+         * __nlc_del_ne() removes only the first match, so a duplicate
+         * would outlive a later create and yield a false ENOENT. This
+         * mirrors the duplicate check in nlc_dir_add_ne(). */
+        if (!__nlc_search_ne(nlc_ctx, name))
+            __nlc_add_ne(this, nlc_ctx, name);
+        __nlc_set_dir_state(nlc_ctx, NLC_NE_VALID);
+    }
+unlock:
+    UNLOCK(&nlc_ctx->lock);
+out:
+    return;
+}
+
+/* Record an entry as existing in directory @inode.
+ *
+ * Non-directory inodes are rejected with an error log. The nlc_ctx is
+ * created on demand. Under nlc_ctx->lock any negative entry for @name is
+ * removed, a positive entry for @entry_ino / @name is added, and the
+ * state is marked NLC_PE_PARTIAL when no positive-entry state was valid
+ * yet. */
+void
+nlc_dir_add_pe(xlator_t *this, inode_t *inode, inode_t *entry_ino,
+               const char *name)
+{
+    nlc_ctx_t *dir_ctx = NULL;
+
+    if (inode->ia_type != IA_IFDIR) {
+        gf_msg_callingfn(this->name, GF_LOG_ERROR, EINVAL, NLC_MSG_EINVAL,
+                         "inode is not of type dir");
+        return;
+    }
+
+    nlc_inode_ctx_get_set(this, inode, &dir_ctx);
+    if (dir_ctx == NULL)
+        return;
+
+    LOCK(&dir_ctx->lock);
+    {
+        __nlc_del_ne(this, dir_ctx, name);
+        __nlc_add_pe(this, dir_ctx, entry_ino, name);
+        if (!IS_PE_VALID(dir_ctx->state)) {
+            __nlc_set_dir_state(dir_ctx, NLC_PE_PARTIAL);
+        }
+    }
+    UNLOCK(&dir_ctx->lock);
+}
+
+/* Report whether @name is on the negative-entry list of @nlc_ctx.
+ * Always _gf_false while the NE state of the context is not valid. */
+gf_boolean_t
+__nlc_search_ne(nlc_ctx_t *nlc_ctx, const char *name)
+{
+    nlc_ne_t *entry = NULL;
+    nlc_ne_t *next = NULL;
+
+    if (!IS_NE_VALID(nlc_ctx->state))
+        return _gf_false;
+
+    list_for_each_entry_safe(entry, next, &nlc_ctx->ne, list)
+    {
+        if (!strcmp(entry->name, name))
+            return _gf_true;
+    }
+
+    /* Walked the whole list without a match. */
+    return _gf_false;
+}
+
+/* Report whether @name is on the positive-entry list of @nlc_ctx.
+ * Entries without a name are skipped; invalid PE state yields _gf_false. */
+static gf_boolean_t
+__nlc_search_pe(nlc_ctx_t *nlc_ctx, const char *name)
+{
+    nlc_pe_t *entry = NULL;
+    nlc_pe_t *next = NULL;
+
+    if (!IS_PE_VALID(nlc_ctx->state))
+        return _gf_false;
+
+    list_for_each_entry_safe(entry, next, &nlc_ctx->pe, list)
+    {
+        if (entry->name && !strcmp(entry->name, name))
+            return _gf_true;
+    }
+
+    /* Walked the whole list without a match. */
+    return _gf_false;
+}
+
+/* Look @name up in the positive-entry list of @nlc_ctx, optionally
+ * ignoring case. Yields the cached entry's own name string (not a copy),
+ * or NULL when the PE state is not valid or nothing matches.
+ * The visible caller invokes this with nlc_ctx->lock held, so the returned
+ * pointer should be consumed before that lock is released. */
+static char *
+__nlc_get_pe(nlc_ctx_t *nlc_ctx, const char *name,
+             gf_boolean_t case_insensitive)
+{
+    int (*compare)(const char *, const char *) = NULL;
+    nlc_pe_t *entry = NULL;
+    nlc_pe_t *next = NULL;
+
+    if (!IS_PE_VALID(nlc_ctx->state))
+        return NULL;
+
+    /* Pick the comparator once so a single walk serves both modes. */
+    if (case_insensitive)
+        compare = strcasecmp;
+    else
+        compare = strcmp;
+
+    list_for_each_entry_safe(entry, next, &nlc_ctx->pe, list)
+    {
+        if (entry->name && (compare(entry->name, name) == 0))
+            return entry->name;
+    }
+
+    /* No entry carries this name. */
+    return NULL;
+}
+
+gf_boolean_t
+nlc_is_negative_lookup(xlator_t *this, loc_t *loc)
+{
+    nlc_ctx_t *nlc_ctx = NULL;
+    inode_t *inode = NULL;
+    gf_boolean_t neg_entry = _gf_false;
+
+    inode = loc->parent;
+    GF_VALIDATE_OR_GOTO(this->name, inode, out);
+
+    if (inode->ia_type != IA_IFDIR) {
+        gf_msg_callingfn(this->name, GF_LOG_ERROR, EINVAL, NLC_MSG_EINVAL,
+                         "inode is not of type dir");
+        goto out;
+    }
+
+    nlc_inode_ctx_get(this, inode, &nlc_ctx);
+    if (!nlc_ctx)
+        goto out;
+
+    LOCK(&nlc_ctx->lock);
+    {
+        if (!__nlc_is_cache_valid(this, nlc_ctx))
+            goto unlock;
+
+        if (__nlc_search_ne(nlc_ctx, loc->name)) {
+            neg_entry = _gf_true;
+            goto unlock;
+        }
+        if ((nlc_ctx->state & NLC_PE_FULL) &&
+            !__nlc_search_pe(nlc_ctx, loc->name)) {
+            neg_entry = _gf_true;
+            goto unlock;
+        }
+    }
+unlock:
+    UNLOCK(&nlc_ctx->lock);
+
+out:
+    return neg_entry;
+}
+
+/* Answer a get-real-filename query on directory loc->inode from the
+ * positive-entry cache (case-insensitive match on @fname). Returns _gf_true
+ * when *op_ret (and *op_errno for ENOENT) were set from the cache. */
+gf_boolean_t
+nlc_get_real_file_name(xlator_t *this, loc_t *loc, const char *fname,
+                       int32_t *op_ret, int32_t *op_errno, dict_t *dict)
+{
+    nlc_ctx_t *nlc_ctx = NULL;
+    inode_t *inode = NULL;
+    gf_boolean_t hit = _gf_false;
+    char *found_file = NULL;
+    int ret = 0;
+
+    GF_VALIDATE_OR_GOTO(this->name, loc, out);
+    GF_VALIDATE_OR_GOTO(this->name, fname, out);
+    GF_VALIDATE_OR_GOTO(this->name, op_ret, out);
+    GF_VALIDATE_OR_GOTO(this->name, op_errno, out);
+    GF_VALIDATE_OR_GOTO(this->name, dict, out);
+    inode = loc->inode;
+    GF_VALIDATE_OR_GOTO(this->name, inode, out);
+    if (inode->ia_type != IA_IFDIR) {
+        gf_msg_callingfn(this->name, GF_LOG_ERROR, EINVAL, NLC_MSG_EINVAL,
+                         "inode is not of type dir");
+        goto out;
+    }
+
+    nlc_inode_ctx_get(this, inode, &nlc_ctx);
+    if (!nlc_ctx)
+        goto out;
+
+    LOCK(&nlc_ctx->lock);
+    {
+        if (!__nlc_is_cache_valid(this, nlc_ctx))
+            goto unlock;
+
+        found_file = __nlc_get_pe(nlc_ctx, fname, _gf_true);
+        if (found_file) {
+            /* The dict gets its own copy of the name; the helper owns the
+             * duplicate and releases it if the dict does not accept it. */
+            ret = dict_set_dynstr_with_alloc(
+                dict, GF_XATTR_GET_REAL_FILENAME_KEY, found_file);
+            if (ret < 0)
+                goto unlock;
+            *op_ret = strlen(found_file) + 1;
+            hit = _gf_true;
+        } else if (nlc_ctx->state & NLC_PE_FULL) {
+            /* NLC_PE_FULL set and no match: answer ENOENT from the cache. */
+            *op_ret = -1;
+            *op_errno = ENOENT;
+            hit = _gf_true;
+        }
+    }
+unlock:
+    UNLOCK(&nlc_ctx->lock);
+out:
+    return hit;
+}
+
+/* Statedump hook: write the nl-cache context stored on @inode, if any.
+ * The context lock is only tried (TRY_LOCK); when that fails, a single
+ * notice naming the context and gfid is written instead of the details. */
+void
+nlc_dump_inodectx(xlator_t *this, inode_t *inode)
+{
+    int32_t ret = -1;
+    char *path = NULL;
+    char key_prefix[GF_DUMP_MAX_BUF_LEN] = {0};
+    char uuid_str[64] = {0};
+    nlc_ctx_t *nlc_ctx = NULL;
+    nlc_pe_t *pe = NULL;
+    nlc_pe_t *tmp = NULL;
+    nlc_ne_t *ne = NULL;
+    nlc_ne_t *tmp1 = NULL;
+
+    nlc_inode_ctx_get(this, inode, &nlc_ctx);
+    if (!nlc_ctx)
+        goto out;
+
+    /* Format the gfid before trying the lock: the lock-failure notice at
+     * the bottom prints uuid_str as well, so both paths need it filled. */
+    uuid_utoa_r(inode->gfid, uuid_str);
+
+    ret = TRY_LOCK(&nlc_ctx->lock);
+    if (!ret) {
+        gf_proc_dump_build_key(key_prefix, "xlator.performance.nl-cache",
+                               "nlc_inode");
+        gf_proc_dump_add_section("%s", key_prefix);
+
+        __inode_path(inode, NULL, &path);
+        if (path != NULL) {
+            gf_proc_dump_write("path", "%s", path);
+            GF_FREE(path);
+        }
+
+        gf_proc_dump_write("inode", "%p", inode);
+        gf_proc_dump_write("gfid", "%s", uuid_str);
+
+        gf_proc_dump_write("state", "%" PRIu64, nlc_ctx->state);
+        gf_proc_dump_write("timer", "%p", nlc_ctx->timer);
+        gf_proc_dump_write("cache-time", "%ld", nlc_ctx->cache_time);
+        gf_proc_dump_write("cache-size", "%zu", nlc_ctx->cache_size);
+        gf_proc_dump_write("refd-inodes", "%" PRIu64, nlc_ctx->refd_inodes);
+
+        if (IS_PE_VALID(nlc_ctx->state))
+            list_for_each_entry_safe(pe, tmp, &nlc_ctx->pe, list)
+            {
+                gf_proc_dump_write("pe", "%p, %p, %s", pe, pe->inode, pe->name);
+            }
+
+        if (IS_NE_VALID(nlc_ctx->state))
+            list_for_each_entry_safe(ne, tmp1, &nlc_ctx->ne, list)
+            {
+                gf_proc_dump_write("ne", "%s", ne->name);
+            }
+
+        UNLOCK(&nlc_ctx->lock);
+    }
+
+    if (ret && nlc_ctx)
+        gf_proc_dump_write("Unable to dump the inode information",
+                           "(Lock acquisition failed) %p (gfid: %s)", nlc_ctx,
+                           uuid_str);
+out:
+    return;
+}
diff --git a/xlators/performance/nl-cache/src/nl-cache-mem-types.h b/xlators/performance/nl-cache/src/nl-cache-mem-types.h
new file mode 100644
index 00000000000..93a17b3fd5a
--- /dev/null
+++ b/xlators/performance/nl-cache/src/nl-cache-mem-types.h
@@ -0,0 +1,27 @@
+/*
+ *   Copyright (c) 2016 Red Hat, Inc. <http://www.redhat.com>
+ *   This file is part of GlusterFS.
+ *
+ *   This file is licensed to you under your choice of the GNU Lesser
+ *   General Public License, version 3 or any later version (LGPLv3 or
+ *   later), or the GNU General Public License, version 2 (GPLv2), in all
+ *   cases as published by the Free Software Foundation.
+ */
+
+#ifndef __NL_CACHE_MEM_TYPES_H__
+#define __NL_CACHE_MEM_TYPES_H__
+
+#include <glusterfs/mem-types.h>
+
+enum gf_nlc_mem_types_ {
+    gf_nlc_mt_nlc_conf_t = gf_common_mt_end + 1,
+    gf_nlc_mt_nlc_ctx_t,
+    gf_nlc_mt_nlc_local_t,
+    gf_nlc_mt_nlc_pe_t,
+    gf_nlc_mt_nlc_ne_t,
+    gf_nlc_mt_nlc_timer_data_t,
+    gf_nlc_mt_nlc_lru_node,
+    gf_nlc_mt_end
+};
+
+#endif /* __NL_CACHE_MEM_TYPES_H__ */
diff --git a/xlators/performance/nl-cache/src/nl-cache-messages.h b/xlators/performance/nl-cache/src/nl-cache-messages.h
new file mode 100644
index 00000000000..222d709e133
--- /dev/null
+++ b/xlators/performance/nl-cache/src/nl-cache-messages.h
@@ -0,0 +1,29 @@
+/*
+ *   Copyright (c) 2016 Red Hat, Inc. <http://www.redhat.com>
+ *   This file is part of GlusterFS.
+ *
+ *   This file is licensed to you under your choice of the GNU Lesser
+ *   General Public License, version 3 or any later version (LGPLv3 or
+ *   later), or the GNU General Public License, version 2 (GPLv2), in all
+ *   cases as published by the Free Software Foundation.
+ */
+
+#ifndef __NL_CACHE_MESSAGES_H__
+#define __NL_CACHE_MESSAGES_H__
+
+#include <glusterfs/glfs-message-id.h>
+
+/* To add new message IDs, append new identifiers at the end of the list.
+ *
+ * Never remove a message ID. If it's not used anymore, you can rename it or
+ * leave it as it is, but not delete it. This is to prevent reutilization of
+ * IDs by other messages.
+ *
+ * The component name must match one of the entries defined in
+ * glfs-message-id.h.
+ */
+
+GLFS_MSGID(NLC, NLC_MSG_NO_MEMORY, NLC_MSG_EINVAL, NLC_MSG_NO_TIMER_WHEEL,
+           NLC_MSG_DICT_FAILURE);
+
+#endif /* __NL_CACHE_MESSAGES_H__ */
diff --git a/xlators/performance/nl-cache/src/nl-cache.c b/xlators/performance/nl-cache/src/nl-cache.c
new file mode 100644
index 00000000000..33a7c471663
--- /dev/null
+++ b/xlators/performance/nl-cache/src/nl-cache.c
@@ -0,0 +1,840 @@
+/*
+ *   Copyright (c) 2016 Red Hat, Inc. <http://www.redhat.com>
+ *   This file is part of GlusterFS.
+ *
+ *   This file is licensed to you under your choice of the GNU Lesser
+ *   General Public License, version 3 or any later version (LGPLv3 or
+ *   later), or the GNU General Public License, version 2 (GPLv2), in all
+ *   cases as published by the Free Software Foundation.
+ */
+
+#include "nl-cache.h"
+#include <glusterfs/statedump.h>
+#include <glusterfs/upcall-utils.h>
+
+/* Apply the dentry change of a completed fop (local->fop) to the cache,
+ * then call nlc_lru_prune(). Fops not listed return without pruning. */
+static void
+nlc_dentry_op(call_frame_t *frame, xlator_t *this, gf_boolean_t multilink)
+{
+    nlc_local_t *local = frame->local;
+
+    GF_VALIDATE_OR_GOTO(this->name, local, out);
+    switch (local->fop) {
+        case GF_FOP_MKDIR:
+            nlc_set_dir_state(this, local->loc.inode, NLC_PE_FULL);
+            /*fall-through*/
+        case GF_FOP_MKNOD:
+        case GF_FOP_CREATE:
+        case GF_FOP_SYMLINK:
+            nlc_dir_add_pe(this, local->loc.parent, local->loc.inode,
+                           local->loc.name);
+            break;
+        case GF_FOP_LINK:
+            nlc_dir_add_pe(this, local->loc2.parent, NULL, local->loc2.name);
+            break;
+        case GF_FOP_RMDIR:
+            nlc_inode_clear_cache(this, local->loc.inode, _gf_false);
+            /*fall-through*/
+        case GF_FOP_UNLINK:
+            nlc_dir_remove_pe(this, local->loc.parent, local->loc.inode,
+                              local->loc.name, multilink);
+            break;
+        case GF_FOP_RENAME:
+            /* TBD: Should these be atomic ?  In case of rename, the
+             * newloc->inode can be NULL, and hence use oldloc->inode */
+            nlc_dir_remove_pe(this, local->loc2.parent, local->loc2.inode,
+                              local->loc2.name, _gf_false);
+            /*TODO: Remove old dentry from destination before adding this pe*/
+            nlc_dir_add_pe(this, local->loc.parent, local->loc2.inode,
+                           local->loc.name);
+            break; /* a pe was added: prune like the other cases */
+        default:
+            return;
+    }
+
+    nlc_lru_prune(this, NULL);
+out:
+    return;
+}
+
+#define NLC_FOP(_name, _op, loc1, loc2, frame, this, args...)                  \
+    do {                                                                       \
+        nlc_local_t *__local = NULL;                                           \
+        nlc_conf_t *conf = NULL;                                               \
+                                                                               \
+        conf = this->private;                                                  \
+                                                                               \
+        if (!IS_PEC_ENABLED(conf))                                             \
+            goto disabled;                                                     \
+                                                                               \
+        __local = nlc_local_init(frame, this, _op, loc1, loc2);                \
+        GF_VALIDATE_OR_GOTO(this->name, __local, err);                         \
+                                                                               \
+        STACK_WIND(frame, nlc_##_name##_cbk, FIRST_CHILD(this),                \
+                   FIRST_CHILD(this)->fops->_name, args);                      \
+        break;                                                                 \
+    disabled:                                                                  \
+        default_##_name##_resume(frame, this, args);                           \
+        break;                                                                 \
+    err:                                                                       \
+        default_##_name##_failure_cbk(frame, ENOMEM);                          \
+        break;                                                                 \
+    } while (0)
+
+#define NLC_FOP_CBK(_name, multilink, frame, cookie, this, op_ret, op_errno,   \
+                    args...)                                                   \
+    do {                                                                       \
+        nlc_conf_t *conf = NULL;                                               \
+                                                                               \
+        if (op_ret != 0)                                                       \
+            goto out;                                                          \
+                                                                               \
+        conf = this->private;                                                  \
+                                                                               \
+        if (op_ret < 0 || !IS_PEC_ENABLED(conf))                               \
+            goto out;                                                          \
+        nlc_dentry_op(frame, this, multilink);                                 \
+    out:                                                                       \
+        NLC_STACK_UNWIND(_name, frame, op_ret, op_errno, args);                \
+    } while (0)
+
+static int32_t
+nlc_rename_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+               int32_t op_ret, int32_t op_errno, struct iatt *buf,
+               struct iatt *preoldparent, struct iatt *postoldparent,
+               struct iatt *prenewparent, struct iatt *postnewparent,
+               dict_t *xdata)
+{
+    NLC_FOP_CBK(rename, _gf_false, frame, cookie, this, op_ret, op_errno, buf,
+                preoldparent, postoldparent, prenewparent, postnewparent,
+                xdata);
+    return 0;
+}
+
+static int32_t
+nlc_rename(call_frame_t *frame, xlator_t *this, loc_t *oldloc, loc_t *newloc,
+           dict_t *xdata)
+{
+    NLC_FOP(rename, GF_FOP_RENAME, newloc, oldloc, frame, this, oldloc, newloc,
+            xdata);
+    return 0;
+}
+
+static int32_t
+nlc_mknod_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret,
+              int32_t op_errno, inode_t *inode, struct iatt *buf,
+              struct iatt *preparent, struct iatt *postparent, dict_t *xdata)
+{
+    NLC_FOP_CBK(mknod, _gf_false, frame, cookie, this, op_ret, op_errno, inode,
+                buf, preparent, postparent, xdata);
+    return 0;
+}
+
+static int32_t
+nlc_mknod(call_frame_t *frame, xlator_t *this, loc_t *loc, mode_t mode,
+          dev_t rdev, mode_t umask, dict_t *xdata)
+{
+    NLC_FOP(mknod, GF_FOP_MKNOD, loc, NULL, frame, this, loc, mode, rdev, umask,
+            xdata);
+    return 0;
+}
+
+static int32_t
+nlc_create_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+               int32_t op_ret, int32_t op_errno, fd_t *fd, inode_t *inode,
+               struct iatt *buf, struct iatt *preparent,
+               struct iatt *postparent, dict_t *xdata)
+{
+    NLC_FOP_CBK(create, _gf_false, frame, cookie, this, op_ret, op_errno, fd,
+                inode, buf, preparent, postparent, xdata);
+    return 0;
+}
+
+static int32_t
+nlc_create(call_frame_t *frame, xlator_t *this, loc_t *loc, int32_t flags,
+           mode_t mode, mode_t umask, fd_t *fd, dict_t *xdata)
+{
+    NLC_FOP(create, GF_FOP_CREATE, loc, NULL, frame, this, loc, flags, mode,
+            umask, fd, xdata);
+    return 0;
+}
+
+static int32_t
+nlc_mkdir_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret,
+              int32_t op_errno, inode_t *inode, struct iatt *buf,
+              struct iatt *preparent, struct iatt *postparent, dict_t *xdata)
+{
+    NLC_FOP_CBK(mkdir, _gf_false, frame, cookie, this, op_ret, op_errno, inode,
+                buf, preparent, postparent, xdata);
+    return 0;
+}
+
+static int32_t
+nlc_mkdir(call_frame_t *frame, xlator_t *this, loc_t *loc, mode_t mode,
+          mode_t umask, dict_t *xdata)
+{
+    NLC_FOP(mkdir, GF_FOP_MKDIR, loc, NULL, frame, this, loc, mode, umask,
+            xdata);
+    return 0;
+}
+
+static int32_t
+nlc_lookup_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+               int32_t op_ret, int32_t op_errno, inode_t *inode,
+               struct iatt *buf, dict_t *xdata, struct iatt *postparent)
+{
+    nlc_local_t *local = NULL;
+    nlc_conf_t *conf = NULL;
+
+    local = frame->local;
+    conf = this->private;
+    /* No local: nlc_lookup wound a nameless lookup without creating one. */
+    if (!local)
+        goto out;
+
+    /* Do not add to pe here: that may lead to duplicate entries and
+     * would require a search before adding if pe is a list of strings */
+    if (op_ret < 0 && op_errno == ENOENT) {
+        nlc_dir_add_ne(this, local->loc.parent, local->loc.name);
+        GF_ATOMIC_INC(conf->nlc_counter.nlc_miss);
+    }
+
+out:
+    NLC_STACK_UNWIND(lookup, frame, op_ret, op_errno, inode, buf, xdata,
+                     postparent);
+    return 0;
+}
+
+static int32_t
+nlc_lookup(call_frame_t *frame, xlator_t *this, loc_t *loc, dict_t *xdata)
+{
+    nlc_local_t *local = NULL;
+    nlc_conf_t *conf = NULL;
+    inode_t *inode = NULL;
+
+    if (loc_is_nameless(loc))
+        goto wind;
+
+    local = nlc_local_init(frame, this, GF_FOP_LOOKUP, loc, NULL);
+    if (!local)
+        goto err;
+
+    conf = this->private;
+
+    inode = inode_grep(loc->inode->table, loc->parent, loc->name);
+    if (inode) {
+        inode_unref(inode);
+        goto wind;
+    }
+
+    if (nlc_is_negative_lookup(this, loc)) {
+        GF_ATOMIC_INC(conf->nlc_counter.nlc_hit);
+        gf_msg_trace(this->name, 0,
+                     "Serving negative lookup from "
+                     "cache:%s",
+                     loc->name);
+        goto unwind;
+    }
+
+wind:
+    STACK_WIND(frame, nlc_lookup_cbk, FIRST_CHILD(this),
+               FIRST_CHILD(this)->fops->lookup, loc, xdata);
+    return 0;
+unwind:
+    NLC_STACK_UNWIND(lookup, frame, -1, ENOENT, NULL, NULL, NULL, NULL);
+    return 0;
+err:
+    NLC_STACK_UNWIND(lookup, frame, -1, ENOMEM, NULL, NULL, NULL, NULL);
+    return 0;
+}
+
+static int32_t
+nlc_rmdir_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret,
+              int32_t op_errno, struct iatt *preparent, struct iatt *postparent,
+              dict_t *xdata)
+{
+    NLC_FOP_CBK(rmdir, _gf_false, frame, cookie, this, op_ret, op_errno,
+                preparent, postparent, xdata);
+    return 0;
+}
+
+static int32_t
+nlc_rmdir(call_frame_t *frame, xlator_t *this, loc_t *loc, int32_t flags,
+          dict_t *xdata)
+{
+    NLC_FOP(rmdir, GF_FOP_RMDIR, loc, NULL, frame, this, loc, flags, xdata);
+    return 0;
+}
+
+static int32_t
+nlc_getxattr_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+                 int32_t op_ret, int32_t op_errno, dict_t *dict, dict_t *xdata)
+{
+    nlc_conf_t *conf = NULL;
+
+    conf = this->private;
+    GF_VALIDATE_OR_GOTO(this->name, conf, out);
+
+    if (!IS_PEC_ENABLED(conf))
+        goto out;
+
+    if (op_ret < 0 && op_errno == ENOENT) {
+        GF_ATOMIC_INC(conf->nlc_counter.getrealfilename_miss);
+    }
+
+out:
+    NLC_STACK_UNWIND(getxattr, frame, op_ret, op_errno, dict, xdata);
+    return 0;
+}
+
+static int32_t
+nlc_getxattr(call_frame_t *frame, xlator_t *this, loc_t *loc, const char *key,
+             dict_t *xdata)
+{
+    int32_t op_ret = -1;
+    int32_t op_errno = 0;
+    dict_t *dict = NULL;
+    nlc_local_t *local = NULL;
+    gf_boolean_t hit = _gf_false;
+    const char *fname = NULL;
+    nlc_conf_t *conf = NULL;
+
+    conf = this->private;
+
+    if (!IS_PEC_ENABLED(conf))
+        goto wind;
+
+    if (!key || (strncmp(key, GF_XATTR_GET_REAL_FILENAME_KEY,
+                         SLEN(GF_XATTR_GET_REAL_FILENAME_KEY)) != 0))
+        goto wind;
+
+    local = nlc_local_init(frame, this, GF_FOP_GETXATTR, loc, NULL);
+    if (!local)
+        goto err;
+
+    if (loc->inode && key) {
+        dict = dict_new();
+        if (!dict)
+            goto err;
+
+        fname = key + SLEN(GF_XATTR_GET_REAL_FILENAME_KEY);
+        hit = nlc_get_real_file_name(this, loc, fname, &op_ret, &op_errno,
+                                     dict);
+        if (hit)
+            goto unwind;
+        else
+            dict_unref(dict);
+    }
+
+    STACK_WIND(frame, nlc_getxattr_cbk, FIRST_CHILD(this),
+               FIRST_CHILD(this)->fops->getxattr, loc, key, xdata);
+    return 0;
+wind:
+    STACK_WIND(frame, default_getxattr_cbk, FIRST_CHILD(this),
+               FIRST_CHILD(this)->fops->getxattr, loc, key, xdata);
+    return 0;
+unwind:
+    GF_ATOMIC_INC(conf->nlc_counter.getrealfilename_hit);
+    NLC_STACK_UNWIND(getxattr, frame, op_ret, op_errno, dict, NULL);
+    dict_unref(dict);
+    return 0;
+err:
+    NLC_STACK_UNWIND(getxattr, frame, -1, ENOMEM, NULL, NULL);
+    return 0;
+}
+
+static int32_t
+nlc_symlink_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+                int32_t op_ret, int32_t op_errno, inode_t *inode,
+                struct iatt *buf, struct iatt *preparent,
+                struct iatt *postparent, dict_t *xdata)
+{
+    NLC_FOP_CBK(symlink, _gf_false, frame, cookie, this, op_ret, op_errno,
+                inode, buf, preparent, postparent, xdata);
+    return 0;
+}
+
+static int32_t
+nlc_symlink(call_frame_t *frame, xlator_t *this, const char *linkpath,
+            loc_t *loc, mode_t umask, dict_t *xdata)
+{
+    NLC_FOP(symlink, GF_FOP_SYMLINK, loc, NULL, frame, this, linkpath, loc,
+            umask, xdata);
+    return 0;
+}
+
+static int32_t
+nlc_link_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret,
+             int32_t op_errno, inode_t *inode, struct iatt *buf,
+             struct iatt *preparent, struct iatt *postparent, dict_t *xdata)
+{
+    NLC_FOP_CBK(link, _gf_false, frame, cookie, this, op_ret, op_errno, inode,
+                buf, preparent, postparent, xdata);
+    return 0;
+}
+
+static int32_t
+nlc_link(call_frame_t *frame, xlator_t *this, loc_t *oldloc, loc_t *newloc,
+         dict_t *xdata)
+{
+    NLC_FOP(link, GF_FOP_LINK, oldloc, newloc, frame, this, oldloc, newloc,
+            xdata);
+    return 0;
+}
+
+static int32_t
+nlc_unlink_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+               int32_t op_ret, int32_t op_errno, struct iatt *preparent,
+               struct iatt *postparent, dict_t *xdata)
+{
+    uint32_t link_count = 0;
+    gf_boolean_t multilink = _gf_false;
+
+    if (xdata && !dict_get_uint32(xdata, GET_LINK_COUNT, &link_count)) {
+        if (link_count > 1)
+            multilink = _gf_true;
+    } else {
+        /* Don't touch cache if we don't know enough */
+        gf_msg(this->name, GF_LOG_WARNING, 0, NLC_MSG_DICT_FAILURE,
+               "Failed to get GET_LINK_COUNT from dict");
+        NLC_STACK_UNWIND(unlink, frame, op_ret, op_errno, preparent, postparent,
+                         xdata);
+        return 0;
+    }
+
+    NLC_FOP_CBK(unlink, multilink, frame, cookie, this, op_ret, op_errno,
+                preparent, postparent, xdata);
+    return 0;
+}
+
+static int32_t
+nlc_unlink(call_frame_t *frame, xlator_t *this, loc_t *loc, int32_t flags,
+           dict_t *xdata)
+{
+    nlc_conf_t *conf = NULL;
+    gf_boolean_t new_dict = _gf_false;
+
+    conf = this->private;
+
+    if (!IS_PEC_ENABLED(conf))
+        goto do_fop;
+    /* Request the link count via xdata; nlc_unlink_cbk reads it back. */
+    if (!xdata) {
+        xdata = dict_new();
+        if (xdata)
+            new_dict = _gf_true;
+    }
+
+    if (xdata && dict_set_uint32(xdata, GET_LINK_COUNT, 0)) {
+        gf_msg(this->name, GF_LOG_WARNING, 0, NLC_MSG_DICT_FAILURE,
+               "Failed to set GET_LINK_COUNT in dict");
+        goto err; /* 'err' is the label inside the NLC_FOP expansion below */
+    }
+
+do_fop:
+    NLC_FOP(unlink, GF_FOP_UNLINK, loc, NULL, frame, this, loc, flags, xdata);
+    /* Every NLC_FOP path ends up here; unref only a dict created above. */
+    if (new_dict)
+        dict_unref(xdata);
+    return 0;
+}
+
+static int32_t
+nlc_invalidate(xlator_t *this, void *data)
+{
+    struct gf_upcall *up_data = NULL;
+    struct gf_upcall_cache_invalidation *up_ci = NULL;
+    inode_t *inode = NULL;
+    inode_t *parent1 = NULL;
+    inode_t *parent2 = NULL;
+    int ret = 0;
+    inode_table_t *itable = NULL;
+    nlc_conf_t *conf = NULL;
+
+    up_data = (struct gf_upcall *)data;
+    /* Only cache-invalidation upcalls are handled; others return 0. */
+    if (up_data->event_type != GF_UPCALL_CACHE_INVALIDATION)
+        goto out;
+
+    conf = this->private;
+    if (!conf)
+        goto out;
+
+    up_ci = (struct gf_upcall_cache_invalidation *)up_data->data;
+
+    /*TODO: Add the inodes found as a member in gf_upcall_cache_invalidation
+     * so that it prevents subsequent xlators from doing inode_find again
+     */
+    itable = ((xlator_t *)this->graph->top)->itable;
+    inode = inode_find(itable, up_data->gfid);
+    if (!inode) {
+        ret = -1;
+        goto out;
+    }
+    /* Proceed only for UP_TIMES on a directory or UP_PARENT_DENTRY_FLAGS. */
+    if ((!((up_ci->flags & UP_TIMES) && inode->ia_type == IA_IFDIR)) &&
+        (!(up_ci->flags & UP_PARENT_DENTRY_FLAGS))) {
+        goto out;
+    }
+
+    if (!gf_uuid_is_null(up_ci->p_stat.ia_gfid)) {
+        parent1 = inode_find(itable, up_ci->p_stat.ia_gfid);
+        if (!parent1) {
+            ret = -1;
+            goto out;
+        }
+    }
+
+    if (!gf_uuid_is_null(up_ci->oldp_stat.ia_gfid)) {
+        parent2 = inode_find(itable, up_ci->oldp_stat.ia_gfid);
+        if (!parent2) {
+            ret = -1;
+            goto out;
+        }
+    }
+
+    /* TODO: get enough data in upcall so that we do not invalidate but
+     * update */
+    if (inode && inode->ia_type == IA_IFDIR)
+        nlc_inode_clear_cache(this, inode, NLC_NONE);
+    if (parent1)
+        nlc_inode_clear_cache(this, parent1, NLC_NONE);
+    if (parent2)
+        nlc_inode_clear_cache(this, parent2, NLC_NONE);
+
+    GF_ATOMIC_INC(conf->nlc_counter.nlc_invals);
+    /* Shared exit: unref whichever inodes were looked up above. */
+out:
+    if (inode)
+        inode_unref(inode);
+    if (parent1)
+        inode_unref(parent1);
+    if (parent2)
+        inode_unref(parent2);
+
+    return ret;
+}
+
+int
+nlc_notify(xlator_t *this, int event, void *data, ...)
+{
+    int ret = 0;
+
+    switch (event) {
+        case GF_EVENT_CHILD_DOWN:
+        case GF_EVENT_SOME_DESCENDENT_DOWN:
+        case GF_EVENT_CHILD_UP:
+        case GF_EVENT_SOME_DESCENDENT_UP:
+            nlc_update_child_down_time(this, gf_time());
+            /* TODO: nlc_clear_all_cache (this); else
+             lru prune will lazily clear it*/
+            break;
+        case GF_EVENT_UPCALL:
+            ret = nlc_invalidate(this, data);
+            break;
+        case GF_EVENT_PARENT_DOWN:
+            nlc_disable_cache(this);
+            nlc_clear_all_cache(this); /* falls into default (break only) */
+        default:
+            break;
+    }
+    /* Every event also goes to default_notify(); its failure wins as -1. */
+    if (default_notify(this, event, data) != 0)
+        ret = -1;
+
+    return ret;
+}
+
+static int32_t
+nlc_forget(xlator_t *this, inode_t *inode)
+{
+    uint64_t pe_int = 0;
+    uint64_t nlc_ctx_int = 0;
+    nlc_ctx_t *nlc_ctx = NULL;
+    nlc_conf_t *conf = NULL;
+
+    conf = this->private;
+
+    inode_ctx_reset1(inode, this, &pe_int);
+    GF_ASSERT(pe_int == 0);
+
+    nlc_inode_clear_cache(this, inode, NLC_NONE);
+    inode_ctx_reset0(inode, this, &nlc_ctx_int);
+    nlc_ctx = (void *)(long)nlc_ctx_int;
+    if (nlc_ctx) {
+        GF_FREE(nlc_ctx);
+        GF_ATOMIC_SUB(conf->current_cache_size, sizeof(*nlc_ctx));
+    }
+
+    return 0;
+}
+
+static int32_t
+nlc_inodectx(xlator_t *this, inode_t *inode)
+{
+    nlc_dump_inodectx(this, inode);
+    return 0;
+}
+
+static int32_t
+nlc_priv_dump(xlator_t *this)
+{
+    nlc_conf_t *conf = NULL;
+    char key_prefix[GF_DUMP_MAX_BUF_LEN];
+
+    conf = this->private;
+
+    snprintf(key_prefix, GF_DUMP_MAX_BUF_LEN, "%s.%s", this->type, this->name);
+    gf_proc_dump_add_section("%s", key_prefix);
+
+    gf_proc_dump_write("negative_lookup_hit_count", "%" PRId64,
+                       GF_ATOMIC_GET(conf->nlc_counter.nlc_hit));
+    gf_proc_dump_write("negative_lookup_miss_count", "%" PRId64,
+                       GF_ATOMIC_GET(conf->nlc_counter.nlc_miss));
+    gf_proc_dump_write("get_real_filename_hit_count", "%" PRId64,
+                       GF_ATOMIC_GET(conf->nlc_counter.getrealfilename_hit));
+    gf_proc_dump_write("get_real_filename_miss_count", "%" PRId64,
+                       GF_ATOMIC_GET(conf->nlc_counter.getrealfilename_miss));
+    gf_proc_dump_write("nameless_lookup_count", "%" PRId64,
+                       GF_ATOMIC_GET(conf->nlc_counter.nameless_lookup));
+    gf_proc_dump_write("inodes_with_positive_dentry_cache", "%" PRId64,
+                       GF_ATOMIC_GET(conf->nlc_counter.pe_inode_cnt));
+    gf_proc_dump_write("inodes_with_negative_dentry_cache", "%" PRId64,
+                       GF_ATOMIC_GET(conf->nlc_counter.ne_inode_cnt));
+    gf_proc_dump_write("dentry_invalidations_received", "%" PRId64,
+                       GF_ATOMIC_GET(conf->nlc_counter.nlc_invals));
+    gf_proc_dump_write("cache_limit", "%" PRIu64, conf->cache_size);
+    gf_proc_dump_write("consumed_cache_size", "%" PRId64,
+                       GF_ATOMIC_GET(conf->current_cache_size));
+    gf_proc_dump_write("inode_limit", "%" PRIu64, conf->inode_limit);
+    gf_proc_dump_write("consumed_inodes", "%" PRId64,
+                       GF_ATOMIC_GET(conf->refd_inodes));
+
+    return 0;
+}
+
+/* Write the nl-cache counters and limits to @fd, one line per metric in the
+ * form "<xlator name>.<metric> <value>". Always returns 0. */
+static int32_t
+nlc_dump_metrics(xlator_t *this, int fd)
+{
+    nlc_conf_t *priv = this->private;
+    const char *xl_name = this->name;
+
+    /* Lookup hit/miss counters. */
+    dprintf(fd, "%s.negative_lookup_hit_count %" PRId64 "\n", xl_name,
+            GF_ATOMIC_GET(priv->nlc_counter.nlc_hit));
+    dprintf(fd, "%s.negative_lookup_miss_count %" PRId64 "\n", xl_name,
+            GF_ATOMIC_GET(priv->nlc_counter.nlc_miss));
+    dprintf(fd, "%s.get_real_filename_hit_count %" PRId64 "\n", xl_name,
+            GF_ATOMIC_GET(priv->nlc_counter.getrealfilename_hit));
+    dprintf(fd, "%s.get_real_filename_miss_count %" PRId64 "\n", xl_name,
+            GF_ATOMIC_GET(priv->nlc_counter.getrealfilename_miss));
+    dprintf(fd, "%s.nameless_lookup_count %" PRId64 "\n", xl_name,
+            GF_ATOMIC_GET(priv->nlc_counter.nameless_lookup));
+
+    /* Per-inode cache population and invalidation counters. */
+    dprintf(fd, "%s.inodes_with_positive_dentry_cache %" PRId64 "\n", xl_name,
+            GF_ATOMIC_GET(priv->nlc_counter.pe_inode_cnt));
+    dprintf(fd, "%s.inodes_with_negative_dentry_cache %" PRId64 "\n", xl_name,
+            GF_ATOMIC_GET(priv->nlc_counter.ne_inode_cnt));
+    dprintf(fd, "%s.dentry_invalidations_received %" PRId64 "\n", xl_name,
+            GF_ATOMIC_GET(priv->nlc_counter.nlc_invals));
+
+    /* Configured limits next to their current consumption. */
+    dprintf(fd, "%s.cache_limit %" PRIu64 "\n", xl_name, priv->cache_size);
+    dprintf(fd, "%s.consumed_cache_size %" PRId64 "\n", xl_name,
+            GF_ATOMIC_GET(priv->current_cache_size));
+    dprintf(fd, "%s.inode_limit %" PRIu64 "\n", xl_name, priv->inode_limit);
+    dprintf(fd, "%s.consumed_inodes %" PRId64 "\n", xl_name,
+            GF_ATOMIC_GET(priv->refd_inodes));
+
+    return 0;
+}
+
+/* Release everything nlc_init() set up for this xlator instance: the lock,
+ * the private configuration and the reference on the global timer wheel. */
+void
+nlc_fini(xlator_t *this)
+{
+    nlc_conf_t *conf = this->private;
+
+    /* nlc_init() only publishes this->private after it has taken the timer
+     * wheel reference, so without private data there is nothing to undo
+     * (and no timer wheel reference to drop). */
+    if (!conf)
+        return;
+
+    /* Clear the pointer first so nothing can reach the freed memory through
+     * the xlator afterwards. */
+    this->private = NULL;
+
+    /* Counterpart of the LOCK_INIT() done in nlc_init(). */
+    LOCK_DESTROY(&conf->lock);
+    GF_FREE(conf);
+
+    glusterfs_ctx_tw_put(this->ctx);
+}
+
+/* Register this xlator's memory accounting types (gf_nlc_mt_end + 1 slots).
+ * The result of xlator_mem_acct_init() is returned unchanged. */
+int32_t
+nlc_mem_acct_init(xlator_t *this)
+{
+    return xlator_mem_acct_init(this, gf_nlc_mt_end + 1);
+}
+
+/* Apply changed option values from @options to the running configuration.
+ *
+ * Each GF_OPTION_RECONF() jumps to "out" when its option cannot be applied.
+ * Returns 0 when every option was applied, -1 as soon as one fails, so the
+ * failure is reported to the caller instead of being silently dropped
+ * together with all the options that follow it. */
+int32_t
+nlc_reconfigure(xlator_t *this, dict_t *options)
+{
+    nlc_conf_t *conf = NULL;
+    int32_t ret = -1;
+
+    conf = this->private;
+
+    GF_OPTION_RECONF("nl-cache-timeout", conf->cache_timeout, options, int32,
+                     out);
+    GF_OPTION_RECONF("nl-cache-positive-entry", conf->positive_entry_cache,
+                     options, bool, out);
+    GF_OPTION_RECONF("nl-cache-limit", conf->cache_size, options, size_uint64,
+                     out);
+    GF_OPTION_RECONF("pass-through", this->pass_through, options, bool, out);
+
+    ret = 0;
+out:
+    return ret;
+}
+
+/* Allocate and populate the xlator's private configuration (nlc_conf_t).
+ *
+ * Returns 0 on success, with this->private pointing at the configuration.
+ * Returns -1 on any failure; the partially built configuration is freed and
+ * this->private is left untouched. */
+int32_t
+nlc_init(xlator_t *this)
+{
+    nlc_conf_t *conf = NULL;
+    int ret = -1;
+    inode_table_t *itable = NULL;
+
+    /* GF_CALLOC takes (nmemb, size, type), in calloc() order. */
+    conf = GF_CALLOC(1, sizeof(*conf), gf_nlc_mt_nlc_conf_t);
+    if (!conf)
+        goto out;
+
+    GF_OPTION_INIT("nl-cache-timeout", conf->cache_timeout, int32, out);
+    GF_OPTION_INIT("nl-cache-positive-entry", conf->positive_entry_cache, bool,
+                   out);
+    GF_OPTION_INIT("nl-cache-limit", conf->cache_size, size_uint64, out);
+    GF_OPTION_INIT("pass-through", this->pass_through, bool, out);
+
+    /* Take the timer wheel reference before the lock is initialised. This is
+     * the last step that can fail, so no error path frees the configuration
+     * while it holds an initialised (and never destroyed) lock. */
+    conf->timer_wheel = glusterfs_ctx_tw_get(this->ctx);
+    if (!conf->timer_wheel) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, NLC_MSG_NO_TIMER_WHEEL,
+               "Initing the global timer wheel failed");
+        goto out;
+    }
+
+    /* Since the positive entries are stored as list of refs on
+     * existing inodes, we should not overflow the inode lru_limit.
+     * Hence keep the limit of inodes that are refed by this xlator,
+     * to 80% of inode_table->lru_limit. In fuse where the limit is
+     * infinite, take 131072 as lru limit (as in gfapi). */
+    itable = ((xlator_t *)this->graph->top)->itable;
+    if (itable && itable->lru_limit)
+        /* Widen before multiplying so large limits cannot wrap around. */
+        conf->inode_limit = (uint64_t)itable->lru_limit * 80 / 100;
+    else
+        conf->inode_limit = 131072 * 80 / 100;
+
+    LOCK_INIT(&conf->lock);
+    GF_ATOMIC_INIT(conf->current_cache_size, 0);
+    GF_ATOMIC_INIT(conf->refd_inodes, 0);
+    GF_ATOMIC_INIT(conf->nlc_counter.nlc_hit, 0);
+    GF_ATOMIC_INIT(conf->nlc_counter.nlc_miss, 0);
+    GF_ATOMIC_INIT(conf->nlc_counter.nameless_lookup, 0);
+    GF_ATOMIC_INIT(conf->nlc_counter.getrealfilename_hit, 0);
+    GF_ATOMIC_INIT(conf->nlc_counter.getrealfilename_miss, 0);
+    GF_ATOMIC_INIT(conf->nlc_counter.pe_inode_cnt, 0);
+    GF_ATOMIC_INIT(conf->nlc_counter.ne_inode_cnt, 0);
+    GF_ATOMIC_INIT(conf->nlc_counter.nlc_invals, 0);
+
+    INIT_LIST_HEAD(&conf->lru);
+    conf->last_child_down = gf_time();
+
+    this->private = conf;
+
+    ret = 0;
+out:
+    if (ret < 0)
+        GF_FREE(conf);
+
+    return ret;
+}
+
+/* File operations for which nl-cache installs its own handlers. The entries
+ * in the TODO block are not hooked yet. */
+struct xlator_fops nlc_fops = {
+    .rename = nlc_rename,
+    .mknod = nlc_mknod,
+    .create = nlc_create,
+    .mkdir = nlc_mkdir,
+    .lookup = nlc_lookup,
+    .rmdir = nlc_rmdir,
+    .getxattr = nlc_getxattr,
+    .symlink = nlc_symlink,
+    .link = nlc_link,
+    .unlink = nlc_unlink,
+    /* TODO:
+    .readdir              = nlc_readdir,
+    .readdirp             = nlc_readdirp,
+    .seek                 = nlc_seek,
+    .opendir              = nlc_opendir, */
+};
+
+/* Callback table: only the forget callback is hooked (nlc_forget). */
+struct xlator_cbks nlc_cbks = {
+    .forget = nlc_forget,
+};
+
+/* Dump hooks: nlc_inodectx for per-inode context, nlc_priv_dump for the
+ * xlator's private data. */
+struct xlator_dumpops nlc_dumpops = {
+    .inodectx = nlc_inodectx,
+    .priv = nlc_priv_dump,
+};
+
+/* Options understood by nl-cache. The table is terminated by an entry whose
+ * key is NULL. */
+struct volume_options nlc_options[] = {
+    {
+        .key = {"nl-cache"},
+        .type = GF_OPTION_TYPE_BOOL,
+        .default_value = "off",
+        .description = "enable/disable nl-cache",
+        .op_version = {GD_OP_VERSION_6_0},
+        .flags = OPT_FLAG_SETTABLE,
+    },
+    {
+        .key = {"nl-cache-positive-entry"},
+        .type = GF_OPTION_TYPE_BOOL,
+        .default_value = "false",
+        .op_version = {GD_OP_VERSION_3_11_0},
+        .flags = OPT_FLAG_SETTABLE | OPT_FLAG_CLIENT_OPT | OPT_FLAG_DOC,
+        .description = "Cache the name of the files/directories that was"
+                       " looked up and are present in a directory",
+    },
+    {
+        .key = {"nl-cache-limit"},
+        .type = GF_OPTION_TYPE_SIZET,
+        .min = 0,
+        .default_value = "131072",
+        .op_version = {GD_OP_VERSION_3_11_0},
+        .flags = OPT_FLAG_SETTABLE | OPT_FLAG_CLIENT_OPT | OPT_FLAG_DOC,
+        /* The two literals are concatenated, so the second one must start
+         * with a space to keep the words apart. */
+        .description = "the value over which caching will be disabled for"
+                       " a while and the cache is cleared based on LRU",
+    },
+    {
+        .key = {"nl-cache-timeout"},
+        .type = GF_OPTION_TYPE_TIME,
+        .min = 0,
+        .default_value = "60",
+        .op_version = {GD_OP_VERSION_3_11_0},
+        .flags = OPT_FLAG_SETTABLE | OPT_FLAG_CLIENT_OPT | OPT_FLAG_DOC,
+        .description = "Time period after which cache has to be refreshed",
+    },
+    {.key = {"pass-through"},
+     .type = GF_OPTION_TYPE_BOOL,
+     .default_value = "false",
+     .op_version = {GD_OP_VERSION_4_1_0},
+     .flags = OPT_FLAG_SETTABLE | OPT_FLAG_DOC | OPT_FLAG_CLIENT_OPT,
+     .tags = {"nl-cache"},
+     .description = "Enable/Disable nl cache translator"},
+
+    {.key = {NULL}},
+};
+
+/* Descriptor that ties together this xlator's entry points, operation
+ * tables, option table and identification. */
+xlator_api_t xlator_api = {
+    .init = nlc_init,
+    .fini = nlc_fini,
+    .notify = nlc_notify,
+    .reconfigure = nlc_reconfigure,
+    .mem_acct_init = nlc_mem_acct_init,
+    .dump_metrics = nlc_dump_metrics,
+    .op_version = {1}, /* Present from the initial version */
+    .dumpops = &nlc_dumpops,
+    .fops = &nlc_fops,
+    .cbks = &nlc_cbks,
+    .options = nlc_options,
+    .identifier = "nl-cache",
+    .category = GF_TECH_PREVIEW,
+};
diff --git a/xlators/performance/nl-cache/src/nl-cache.h b/xlators/performance/nl-cache/src/nl-cache.h
new file mode 100644
index 00000000000..85fcc176342
--- /dev/null
+++ b/xlators/performance/nl-cache/src/nl-cache.h
@@ -0,0 +1,175 @@
+/*
+ *   Copyright (c) 2016 Red Hat, Inc. <http://www.redhat.com>
+ *   This file is part of GlusterFS.
+ *
+ *   This file is licensed to you under your choice of the GNU Lesser
+ *   General Public License, version 3 or any later version (LGPLv3 or
+ *   later), or the GNU General Public License, version 2 (GPLv2), in all
+ *   cases as published by the Free Software Foundation.
+ */
+
+#ifndef __NL_CACHE_H__
+#define __NL_CACHE_H__
+
+#include "nl-cache-mem-types.h"
+#include "nl-cache-messages.h"
+#include <glusterfs/glusterfs.h>
+#include <glusterfs/xlator.h>
+#include <glusterfs/defaults.h>
+#include <glusterfs/atomic.h>
+
+/* Cache state bits; NLC_INVALID is the absence of all of them.
+ * NOTE(review): presumably these are the values kept in nlc_ctx.state -
+ * confirm against the users of nlc_set_dir_state(). */
+#define NLC_INVALID 0x0000
+#define NLC_PE_FULL 0x0001
+#define NLC_PE_PARTIAL 0x0002
+#define NLC_NE_VALID 0x0004
+
+/* State predicates. The argument is evaluated twice, so it must not have
+ * side effects. */
+#define IS_PE_VALID(state)                                                     \
+    (((state) != NLC_INVALID) &&                                               \
+     ((state) & (NLC_PE_FULL | NLC_PE_PARTIAL)))
+#define IS_NE_VALID(state)                                                     \
+    (((state) != NLC_INVALID) && ((state) & NLC_NE_VALID))
+
+/* Configuration predicates; conf is a pointer to nlc_conf_t. The member of
+ * struct nlc_conf is named disable_cache (there is no cache_disabled). */
+#define IS_PEC_ENABLED(conf) ((conf)->positive_entry_cache)
+#define IS_CACHE_ENABLED(conf) (!(conf)->disable_cache)
+
+/* Unwind @frame for @fop and release the frame's nlc_local_t.
+ *
+ * The local and the xlator are read from the frame, and frame->local is
+ * cleared, before STACK_UNWIND_STRICT() runs; nlc_local_wipe() is called
+ * afterwards using only those saved pointers. With a NULL frame both saved
+ * pointers stay NULL. */
+#define NLC_STACK_UNWIND(fop, frame, params...)                                \
+    do {                                                                       \
+        nlc_local_t *__local = NULL;                                           \
+        xlator_t *__xl = NULL;                                                 \
+        if (frame) {                                                           \
+            __xl = frame->this;                                                \
+            __local = frame->local;                                            \
+            frame->local = NULL;                                               \
+        }                                                                      \
+        STACK_UNWIND_STRICT(fop, frame, params);                               \
+        nlc_local_wipe(__xl, __local);                                         \
+    } while (0)
+
+/* Reason codes for clearing a cache.
+ * NOTE(review): presumably the "reason" argument of nlc_inode_clear_cache()
+ * - confirm against its callers. */
+enum nlc_cache_clear_reason {
+    NLC_NONE = 0,
+    NLC_LRU_PRUNE,
+};
+
+/* One negative entry: a name plus its list linkage.
+ * NOTE(review): presumably linked on nlc_ctx.ne - confirm. */
+struct nlc_ne {
+    struct list_head list;
+    char *name;
+};
+typedef struct nlc_ne nlc_ne_t;
+
+/* One positive entry: a name, the inode associated with it and its list
+ * linkage. NOTE(review): presumably linked on nlc_ctx.pe - confirm. */
+struct nlc_pe {
+    struct list_head list;
+    inode_t *inode;
+    char *name;
+};
+typedef struct nlc_pe nlc_pe_t;
+
+/* Inode/xlator pair referenced from nlc_ctx.timer_data.
+ * NOTE(review): presumably the payload passed to the timer callback -
+ * confirm. */
+struct nlc_timer_data {
+    inode_t *inode;
+    xlator_t *this;
+};
+typedef struct nlc_timer_data nlc_timer_data_t;
+
+/* List node that refers to an inode.
+ * NOTE(review): presumably an element of nlc_conf.lru - confirm. */
+struct nlc_lru_node {
+    inode_t *inode;
+    struct list_head list;
+};
+typedef struct nlc_lru_node nlc_lru_node_t;
+
+/* Cache context holding the positive and negative entry lists together
+ * with their bookkeeping.
+ * NOTE(review): presumably one instance per cached directory inode -
+ * confirm. */
+struct nlc_ctx {
+    struct list_head pe; /* list of positive entries */
+    struct list_head ne; /* list of negative entries */
+    uint64_t state;      /* presumably NLC_* state bits - confirm */
+    time_t cache_time;
+    struct gf_tw_timer_list *timer;
+    nlc_timer_data_t *timer_data;
+    size_t cache_size;
+    uint64_t refd_inodes;
+    gf_lock_t lock;
+};
+typedef struct nlc_ctx nlc_ctx_t;
+
+/* Per-frame state kept in frame->local; NLC_STACK_UNWIND() detaches it from
+ * the frame and releases it with nlc_local_wipe(). */
+struct nlc_local {
+    loc_t loc;
+    loc_t loc2;
+    inode_t *inode;
+    inode_t *parent;
+    fd_t *fd;
+    char *linkname;
+    glusterfs_fop_t fop;
+};
+typedef struct nlc_local nlc_local_t;
+
+/* Atomic counters kept per xlator instance (embedded in nlc_conf as
+ * nlc_counter). */
+struct nlc_statistics {
+    gf_atomic_t nlc_hit;  /* No. of times lookup/stat was served from this xl */
+    gf_atomic_t nlc_miss; /* No. of times negative lookups were sent to disk */
+    /* More granular counters */
+    gf_atomic_t nameless_lookup;
+    gf_atomic_t getrealfilename_hit;
+    gf_atomic_t getrealfilename_miss;
+    gf_atomic_t pe_inode_cnt;
+    gf_atomic_t ne_inode_cnt;
+    gf_atomic_t nlc_invals; /* No. of invalidates received from upcall*/
+};
+
+/* Private configuration and runtime state of one nl-cache xlator instance
+ * (stored in this->private by nlc_init()). */
+struct nlc_conf {
+    int32_t cache_timeout;             /* "nl-cache-timeout" option */
+    gf_boolean_t positive_entry_cache; /* "nl-cache-positive-entry" option */
+    gf_boolean_t negative_entry_cache;
+    gf_boolean_t disable_cache;
+    uint64_t cache_size;               /* "nl-cache-limit" option */
+    gf_atomic_t current_cache_size;
+    uint64_t inode_limit; /* 80% of the inode table's lru_limit, or of
+                             131072 when that limit is 0 or unavailable */
+    gf_atomic_t refd_inodes;
+    struct tvec_base *timer_wheel; /* from glusterfs_ctx_tw_get() */
+    time_t last_child_down;        /* set to gf_time() in nlc_init() */
+    struct list_head lru;
+    gf_lock_t lock;
+    struct nlc_statistics nlc_counter;
+};
+typedef struct nlc_conf nlc_conf_t;
+
+/* Helper functions shared between the nl-cache sources. Only the
+ * declarations live in this header; the definitions are elsewhere. */
+gf_boolean_t
+nlc_get_real_file_name(xlator_t *this, loc_t *loc, const char *fname,
+                       int32_t *op_ret, int32_t *op_errno, dict_t *dict);
+
+gf_boolean_t
+nlc_is_negative_lookup(xlator_t *this, loc_t *loc);
+
+void
+nlc_set_dir_state(xlator_t *this, inode_t *inode, uint64_t state);
+
+void
+nlc_dir_add_pe(xlator_t *this, inode_t *inode, inode_t *entry_ino,
+               const char *name);
+
+void
+nlc_dir_remove_pe(xlator_t *this, inode_t *inode, inode_t *entry_ino,
+                  const char *name, gf_boolean_t multilink);
+
+void
+nlc_dir_add_ne(xlator_t *this, inode_t *inode, const char *name);
+
+/* Called by NLC_STACK_UNWIND() with the local detached from the frame. */
+void
+nlc_local_wipe(xlator_t *this, nlc_local_t *local);
+
+nlc_local_t *
+nlc_local_init(call_frame_t *frame, xlator_t *this, glusterfs_fop_t fop,
+               loc_t *loc, loc_t *loc2);
+
+void
+nlc_update_child_down_time(xlator_t *this, time_t now);
+
+void
+nlc_inode_clear_cache(xlator_t *this, inode_t *inode, int reason);
+
+void
+nlc_dump_inodectx(xlator_t *this, inode_t *inode);
+
+void
+nlc_clear_all_cache(xlator_t *this);
+
+void
+nlc_disable_cache(xlator_t *this);
+
+void
+nlc_lru_prune(xlator_t *this, inode_t *inode);
+
+#endif /* __NL_CACHE_H__ */
diff --git a/xlators/performance/open-behind/Makefile.am b/xlators/performance/open-behind/Makefile.am
new file mode 100644
index 00000000000..af437a64d6d
--- /dev/null
+++ b/xlators/performance/open-behind/Makefile.am
@@ -0,0 +1 @@
+SUBDIRS = src
diff --git a/xlators/performance/open-behind/src/Makefile.am b/xlators/performance/open-behind/src/Makefile.am
new file mode 100644
index 00000000000..41930dcd67d
--- /dev/null
+++ b/xlators/performance/open-behind/src/Makefile.am
@@ -0,0 +1,16 @@
+xlator_LTLIBRARIES = open-behind.la
+xlatordir = $(libdir)/glusterfs/$(PACKAGE_VERSION)/xlator/performance
+
+open_behind_la_LDFLAGS = -module $(GF_XLATOR_DEFAULT_LDFLAGS)
+
+open_behind_la_SOURCES = open-behind.c
+open_behind_la_LIBADD = $(top_builddir)/libglusterfs/src/libglusterfs.la
+
+noinst_HEADERS = open-behind-mem-types.h open-behind-messages.h
+
+AM_CPPFLAGS = $(GF_CPPFLAGS) -I$(top_srcdir)/libglusterfs/src \
+	-I$(top_srcdir)/rpc/xdr/src -I$(top_builddir)/rpc/xdr/src
+
+AM_CFLAGS = -Wall $(GF_CFLAGS)
+
+CLEANFILES =
diff --git a/xlators/performance/open-behind/src/open-behind-mem-types.h b/xlators/performance/open-behind/src/open-behind-mem-types.h
new file mode 100644
index 00000000000..6c1ab2e19d2
--- /dev/null
+++ b/xlators/performance/open-behind/src/open-behind-mem-types.h
@@ -0,0 +1,22 @@
+/*
+  Copyright (c) 2008-2012 Red Hat, Inc. <http://www.redhat.com>
+  This file is part of GlusterFS.
+
+  This file is licensed to you under your choice of the GNU Lesser
+  General Public License, version 3 or any later version (LGPLv3 or
+  later), or the GNU General Public License, version 2 (GPLv2), in all
+  cases as published by the Free Software Foundation.
+*/
+
+#ifndef __OB_MEM_TYPES_H__
+#define __OB_MEM_TYPES_H__
+
+#include <glusterfs/mem-types.h>
+
+/* Memory accounting types of open-behind. Numbering continues right after
+ * the common types; gf_ob_mt_end closes the list. */
+enum gf_ob_mem_types_ {
+    gf_ob_mt_fd_t = gf_common_mt_end + 1,
+    gf_ob_mt_conf_t,
+    gf_ob_mt_inode_t,
+    gf_ob_mt_end
+};
+#endif
diff --git a/xlators/performance/open-behind/src/open-behind-messages.h b/xlators/performance/open-behind/src/open-behind-messages.h
new file mode 100644
index 00000000000..0e789177684
--- /dev/null
+++ b/xlators/performance/open-behind/src/open-behind-messages.h
@@ -0,0 +1,32 @@
+/*Copyright (c) 2015 Red Hat, Inc. <http://www.redhat.com>
+  This file is part of GlusterFS.
+
+  This file is licensed to you under your choice of the GNU Lesser
+  General Public License, version 3 or any later version (LGPLv3 or
+  later), or the GNU General Public License, version 2 (GPLv2), in all
+  cases as published by the Free Software Foundation.
+*/
+
+#ifndef _OPEN_BEHIND_MESSAGES_H_
+#define _OPEN_BEHIND_MESSAGES_H_
+
+#include <glusterfs/glfs-message-id.h>
+
+/* To add new message IDs, append new identifiers at the end of the list.
+ *
+ * Never remove a message ID. If it's not used anymore, you can rename it or
+ * leave it as it is, but not delete it. This is to prevent reutilization of
+ * IDs by other messages.
+ *
+ * The component name must match one of the entries defined in
+ * glfs-message-id.h.
+ */
+
+/* Message IDs of the OPEN_BEHIND component. */
+GLFS_MSGID(OPEN_BEHIND, OPEN_BEHIND_MSG_XLATOR_CHILD_MISCONFIGURED,
+           OPEN_BEHIND_MSG_VOL_MISCONFIGURED, OPEN_BEHIND_MSG_NO_MEMORY,
+           OPEN_BEHIND_MSG_FAILED, OPEN_BEHIND_MSG_BAD_STATE);
+
+/* Message text for the IDs of the same name without the _STR suffix.
+ * NOTE(review): presumably picked up by gf_smsg() through this naming
+ * convention - confirm. */
+#define OPEN_BEHIND_MSG_FAILED_STR "Failed to submit fop"
+#define OPEN_BEHIND_MSG_BAD_STATE_STR "Unexpected state"
+
+#endif /* _OPEN_BEHIND_MESSAGES_H_ */
diff --git a/xlators/performance/open-behind/src/open-behind.c b/xlators/performance/open-behind/src/open-behind.c
new file mode 100644
index 00000000000..600c3b62ffe
--- /dev/null
+++ b/xlators/performance/open-behind/src/open-behind.c
@@ -0,0 +1,1101 @@
+/*
+  Copyright (c) 2013 Red Hat, Inc. <http://www.redhat.com>
+  This file is part of GlusterFS.
+
+  This file is licensed to you under your choice of the GNU Lesser
+  General Public License, version 3 or any later version (LGPLv3 or
+  later), or the GNU General Public License, version 2 (GPLv2), in all
+  cases as published by the Free Software Foundation.
+*/
+
+#include "open-behind-mem-types.h"
+#include <glusterfs/xlator.h>
+#include <glusterfs/statedump.h>
+#include <glusterfs/call-stub.h>
+#include <glusterfs/defaults.h>
+#include "open-behind-messages.h"
+#include <glusterfs/glusterfs-acl.h>
+
+/* Note: The initial design of open-behind was made to cover the simple case
+ *       of open, read, close for small files. This pattern combined with
+ *       quick-read can do the whole operation without a single request to the
+ *       bricks (except the initial lookup).
+ *
+ *       The way to do this has been improved, but the logic remains the same.
+ *       Basically, this means that any operation sent to the fd or the inode
+ *       that is not a read causes the open request to be sent to the
+ *       bricks, and all future operations will be executed synchronously,
+ *       including opens (it's reset once all fd's are closed).
+ */
+
+/* Configuration flags of open-behind (kept in the xlator's private data,
+ * see the xl->private use in ob_open_and_resume_inode()). */
+typedef struct ob_conf {
+    gf_boolean_t use_anonymous_fd; /* use anonymous FDs wherever safe
+                                      e.g - fstat() readv()
+
+                                      whereas for fops like writev(), lk(),
+                                      the fd is important for side effects
+                                      like mandatory locks
+                                   */
+    gf_boolean_t lazy_open;        /* delay backend open as much as possible */
+    gf_boolean_t read_after_open;  /* instead of sending readvs on
+                                           anonymous fds, open the file
+                                           first and then send readv i.e
+                                           similar to what writev does
+                                        */
+} ob_conf_t;
+
+/* Result of ob_open_and_resume_inode()/ob_open_and_resume_fd(): tells the
+ * caller how the current operation has to be processed.
+ *
+ * A negative state represents an errno value negated. In this case the
+ * current operation cannot be processed. */
+typedef enum _ob_state {
+    /* There are no opens on the inode or the first open is already
+     * completed. The current operation can be sent directly. */
+    OB_STATE_READY = 0,
+
+    /* There's an open pending and it has been triggered. The current
+     * operation should be "stubbified" and processed with
+     * ob_stub_dispatch(). */
+    OB_STATE_OPEN_TRIGGERED,
+
+    /* There's an open pending but it has not been triggered. The current
+     * operation can be processed directly but using an anonymous fd. */
+    OB_STATE_OPEN_PENDING,
+
+    /* The current operation is the first open on the inode. */
+    OB_STATE_FIRST_OPEN
+} ob_state_t;
+
+/* Per-inode open-behind state, stored in the inode context (see
+ * ob_inode_get_locked()). The fields are read and written with the inode's
+ * lock held in the functions of this file. */
+typedef struct ob_inode {
+    /* List of stubs pending on the first open. Once the first open is
+     * complete, all these stubs will be resubmitted, and dependencies
+     * will be checked again. */
+    struct list_head resume_fops;
+
+    /* The inode this object references. */
+    inode_t *inode;
+
+    /* The fd from the first open sent to this inode. It will be set
+     * from the moment the open is processed until the open is fully
+     * executed or closed before actually opened. It's NULL in all
+     * other cases. */
+    fd_t *first_fd;
+
+    /* The stub from the first open operation. When open fop starts
+     * being processed, it's assigned the OB_OPEN_PREPARING value
+     * until the actual stub is created. This is necessary to avoid
+     * creating the stub inside a locked region. Once the stub is
+     * successfully created, it's assigned here. This value is set
+     * to NULL once the stub is resumed. */
+    call_stub_t *first_open;
+
+    /* The total number of currently open fd's on this inode. */
+    int32_t open_count;
+
+    /* This flag is set as soon as we know that the open will be
+     * sent to the bricks, even before the stub is ready. */
+    bool triggered;
+} ob_inode_t;
+
+/* Dummy pointer used temporarily while the actual open stub is being created.
+ * It is a sentinel only: code must compare against it and never dereference
+ * or resume it (see ob_open_and_resume_inode()). */
+#define OB_OPEN_PREPARING ((call_stub_t *)-1)
+
+/* Shared tail of the switch statements built by OB_POST_FD, OB_POST_FLUSH and
+ * OB_POST_INODE. It expands to case labels, so it must be placed inside a
+ * switch on a variable named __ob_state, with __ob_inode also in scope:
+ *
+ *  - OB_STATE_FIRST_OPEN: logged as a bad state; the fop fails with EINVAL.
+ *  - OB_STATE_READY: the fop is passed on through default_<fop>().
+ *  - OB_STATE_OPEN_TRIGGERED: the fop is wrapped in a stub (resuming into
+ *    ob_<fop>) and handed to ob_stub_dispatch(). If the stub cannot be
+ *    created, the state becomes -ENOMEM and control deliberately falls
+ *    through to the default label.
+ *  - default (negative states): logged; the fop fails with -__ob_state.
+ *
+ * The last statement has no trailing ';' - the macro invocation supplies
+ * it. */
+#define OB_POST_COMMON(_fop, _xl, _frame, _fd, _args...)                       \
+    case OB_STATE_FIRST_OPEN:                                                  \
+        gf_smsg((_xl)->name, GF_LOG_ERROR, EINVAL, OPEN_BEHIND_MSG_BAD_STATE,  \
+                "fop=%s", #_fop, "state=%d", __ob_state, NULL);                \
+        default_##_fop##_failure_cbk(_frame, EINVAL);                          \
+        break;                                                                 \
+    case OB_STATE_READY:                                                       \
+        default_##_fop(_frame, _xl, ##_args);                                  \
+        break;                                                                 \
+    case OB_STATE_OPEN_TRIGGERED: {                                            \
+        call_stub_t *__ob_stub = fop_##_fop##_stub(_frame, ob_##_fop,          \
+                                                   ##_args);                   \
+        if (__ob_stub != NULL) {                                               \
+            ob_stub_dispatch(_xl, __ob_inode, _fd, __ob_stub);                 \
+            break;                                                             \
+        }                                                                      \
+        __ob_state = -ENOMEM;                                                  \
+    }                                                                          \
+    default:                                                                   \
+        gf_smsg((_xl)->name, GF_LOG_ERROR, -__ob_state,                        \
+                OPEN_BEHIND_MSG_FAILED, "fop=%s", #_fop, NULL);                \
+        default_##_fop##_failure_cbk(_frame, -__ob_state)
+
+/* Process an fd-based fop according to the open-behind state of @_fd.
+ *
+ * The state comes from ob_open_and_resume_fd() (open_count 0, synchronous).
+ * For OB_STATE_OPEN_PENDING with a false @_trigger, an anonymous fd is
+ * created from the fd's inode and flags and the fop is passed on through
+ * default_<fop>(); if that fd cannot be created the state becomes -ENOMEM
+ * and is handled by OB_POST_COMMON's default label. All other states are
+ * handled by OB_POST_COMMON.
+ *
+ * NOTE(review): the fop call only sees __ob_fd if the caller names it inside
+ * _args (macro expansion is textual) - confirm against the callers. */
+#define OB_POST_FD(_fop, _xl, _frame, _fd, _trigger, _args...)                 \
+    do {                                                                       \
+        ob_inode_t *__ob_inode;                                                \
+        fd_t *__first_fd;                                                      \
+        ob_state_t __ob_state = ob_open_and_resume_fd(                         \
+            _xl, _fd, 0, true, _trigger, &__ob_inode, &__first_fd);            \
+        switch (__ob_state) {                                                  \
+            case OB_STATE_OPEN_PENDING:                                        \
+                if (!(_trigger)) {                                             \
+                    fd_t *__ob_fd = fd_anonymous_with_flags((_fd)->inode,      \
+                                                            (_fd)->flags);     \
+                    if (__ob_fd != NULL) {                                     \
+                        default_##_fop(_frame, _xl, ##_args);                  \
+                        fd_unref(__ob_fd);                                     \
+                        break;                                                 \
+                    }                                                          \
+                    __ob_state = -ENOMEM;                                      \
+                }                                                              \
+                OB_POST_COMMON(_fop, _xl, _frame, __first_fd, ##_args);        \
+        }                                                                      \
+    } while (0)
+
+/* Process a flush on @_fd. The request never triggers the pending open: in
+ * OB_STATE_OPEN_PENDING the flush is answered immediately with success
+ * through default_flush_cbk(). All other states are handled by
+ * OB_POST_COMMON, whose case labels follow the break below. */
+#define OB_POST_FLUSH(_xl, _frame, _fd, _args...)                              \
+    do {                                                                       \
+        ob_inode_t *__ob_inode;                                                \
+        fd_t *__first_fd;                                                      \
+        ob_state_t __ob_state = ob_open_and_resume_fd(                         \
+            _xl, _fd, 0, true, false, &__ob_inode, &__first_fd);               \
+        switch (__ob_state) {                                                  \
+            case OB_STATE_OPEN_PENDING:                                        \
+                default_flush_cbk(_frame, NULL, _xl, 0, 0, NULL);              \
+                break;                                                         \
+                OB_POST_COMMON(flush, _xl, _frame, __first_fd, ##_args);       \
+        }                                                                      \
+    } while (0)
+
+/* Process an inode-based fop. The state comes from
+ * ob_open_and_resume_inode() with a NULL fd. The OB_STATE_OPEN_PENDING label
+ * has no body of its own, so that state falls into the first case of
+ * OB_POST_COMMON (the EINVAL "bad state" path). */
+#define OB_POST_INODE(_fop, _xl, _frame, _inode, _trigger, _args...)           \
+    do {                                                                       \
+        ob_inode_t *__ob_inode;                                                \
+        fd_t *__first_fd;                                                      \
+        ob_state_t __ob_state = ob_open_and_resume_inode(                      \
+            _xl, _inode, NULL, 0, true, _trigger, &__ob_inode, &__first_fd);   \
+        switch (__ob_state) {                                                  \
+            case OB_STATE_OPEN_PENDING:                                        \
+                OB_POST_COMMON(_fop, _xl, _frame, __first_fd, ##_args);        \
+        }                                                                      \
+    } while (0)
+
+/* Return the ob_inode_t stored in @inode's context for @this, allocating and
+ * storing a fresh one when none is present yet. Meant to be called with the
+ * inode's lock held (it uses the __inode_ctx_* variants). Returns NULL when
+ * the allocation fails or the context cannot be stored. */
+static ob_inode_t *
+ob_inode_get_locked(xlator_t *this, inode_t *inode)
+{
+    ob_inode_t *entry;
+    uint64_t ctx = 0;
+
+    /* Fast path: the context already holds an object. */
+    if ((__inode_ctx_get(inode, this, &ctx) == 0) && (ctx != 0)) {
+        return (ob_inode_t *)(uintptr_t)ctx;
+    }
+
+    entry = GF_CALLOC(1, sizeof(*entry), gf_ob_mt_inode_t);
+    if (entry == NULL) {
+        return NULL;
+    }
+
+    entry->inode = inode;
+    INIT_LIST_HEAD(&entry->resume_fops);
+
+    ctx = (uint64_t)(uintptr_t)entry;
+    if (__inode_ctx_set(inode, this, &ctx) < 0) {
+        /* The object could not be published, so nobody else can see it. */
+        GF_FREE(entry);
+        return NULL;
+    }
+
+    return entry;
+}
+
+/* Decide how the current operation on @inode has to be processed and, when
+ * required, trigger the pending first open.
+ *
+ * @xl:          this xlator; xl->private is read as ob_conf_t.
+ * @inode:       target inode. With NULL, OB_STATE_READY is returned and none
+ *               of the output arguments is written.
+ * @fd:          fd of the current request (may be NULL).
+ * @open_count:  value added to the inode's open_count.
+ * @synchronous: when no open is pending, true makes the function return
+ *               OB_STATE_READY.
+ * @trigger:     whether this request requires the pending open to be sent.
+ * @pob_inode:   receives the ob_inode_t once it has been obtained (not
+ *               written on -ENOMEM).
+ * @pfd:         set to NULL first, then to the pending first fd if there is
+ *               one, or to @fd when this request becomes the first open.
+ *
+ * Returns one of the OB_STATE_* values, or -ENOMEM when the per-inode object
+ * cannot be obtained. The inode's lock is taken internally and is always
+ * released before call_resume() is invoked and before returning. */
+static ob_state_t
+ob_open_and_resume_inode(xlator_t *xl, inode_t *inode, fd_t *fd,
+                         int32_t open_count, bool synchronous, bool trigger,
+                         ob_inode_t **pob_inode, fd_t **pfd)
+{
+    ob_conf_t *conf;
+    ob_inode_t *ob_inode;
+    call_stub_t *open_stub;
+
+    if (inode == NULL) {
+        return OB_STATE_READY;
+    }
+
+    conf = xl->private;
+
+    *pfd = NULL;
+
+    LOCK(&inode->lock);
+    {
+        ob_inode = ob_inode_get_locked(xl, inode);
+        if (ob_inode == NULL) {
+            UNLOCK(&inode->lock);
+
+            return -ENOMEM;
+        }
+        *pob_inode = ob_inode;
+
+        ob_inode->open_count += open_count;
+
+        /* If first_fd is not NULL, it means that there's a previous open not
+         * yet completed. */
+        if (ob_inode->first_fd != NULL) {
+            *pfd = ob_inode->first_fd;
+            /* If the current request doesn't trigger the open and it hasn't
+             * been triggered yet, we can continue without issuing the open
+             * only if the current request belongs to the same fd as the
+             * first one. */
+            if (!trigger && !ob_inode->triggered &&
+                (ob_inode->first_fd == fd)) {
+                UNLOCK(&inode->lock);
+
+                return OB_STATE_OPEN_PENDING;
+            }
+
+            /* We need to issue the open. It could have already been triggered
+             * before. In this case open_stub will be NULL. Or the initial open
+             * may not be completely ready yet. In this case open_stub will be
+             * OB_OPEN_PREPARING. */
+            open_stub = ob_inode->first_open;
+            ob_inode->first_open = NULL;
+            ob_inode->triggered = true;
+
+            UNLOCK(&inode->lock);
+
+            if ((open_stub != NULL) && (open_stub != OB_OPEN_PREPARING)) {
+                call_resume(open_stub);
+            }
+
+            return OB_STATE_OPEN_TRIGGERED;
+        }
+
+        /* There's no pending open. Only opens can be non synchronous, so all
+         * regular fops will be processed directly. For non synchronous opens,
+         * we'll still process them normally (i.e. synchronous) if there are
+         * more file descriptors open. */
+        if (synchronous || (ob_inode->open_count > open_count)) {
+            UNLOCK(&inode->lock);
+
+            return OB_STATE_READY;
+        }
+
+        *pfd = fd;
+
+        /* This is the first open. We keep a reference on the fd and set
+         * first_open stub to OB_OPEN_PREPARING until the actual stub can
+         * be assigned (we don't create the stub here to avoid doing memory
+         * allocations inside the mutex). */
+        ob_inode->first_fd = __fd_ref(fd);
+        ob_inode->first_open = OB_OPEN_PREPARING;
+
+        /* If lazy_open is not set, we'll need to immediately send the open,
+         * so we set triggered right now. */
+        ob_inode->triggered = !conf->lazy_open;
+    }
+    UNLOCK(&inode->lock);
+
+    return OB_STATE_FIRST_OPEN;
+}
+
+static ob_state_t
+ob_open_and_resume_fd(xlator_t *xl, fd_t *fd, int32_t open_count,
+                      bool synchronous, bool trigger, ob_inode_t **pob_inode,
+                      fd_t **pfd)
+{
+    uint64_t err;
+
+    if ((fd_ctx_get(fd, xl, &err) == 0) && (err != 0)) {
+        return (ob_state_t)-err;
+    }
+
+    return ob_open_and_resume_inode(xl, fd->inode, fd, open_count, synchronous,
+                                    trigger, pob_inode, pfd);
+}
+
+/* Entry point used for open requests: counts one more open on the inode and
+ * asks whether the open can be delayed. Opens carrying O_TRUNC are always
+ * treated as synchronous. */
+static ob_state_t
+ob_open_behind(xlator_t *xl, fd_t *fd, int32_t flags, ob_inode_t **pob_inode,
+               fd_t **pfd)
+{
+    /* TODO: If O_CREAT, O_APPEND, O_WRONLY or O_DIRECT are specified, shouldn't
+     *       we also execute this open synchronously ? */
+    bool must_sync = ((flags & O_TRUNC) != 0);
+
+    return ob_open_and_resume_fd(xl, fd, 1, must_sync, true, pob_inode, pfd);
+}
+
+/* Queue @stub behind the pending first open, or resume it right away when
+ * @fd is no longer the inode's pending first fd (the open has completed or
+ * was cancelled). Always returns 0. */
+static int32_t
+ob_stub_dispatch(xlator_t *xl, ob_inode_t *ob_inode, fd_t *fd,
+                 call_stub_t *stub)
+{
+    bool queued = false;
+
+    LOCK(&ob_inode->inode->lock);
+    {
+        /* Only queue while the first open is still outstanding. */
+        if (ob_inode->first_fd == fd) {
+            list_add_tail(&stub->list, &ob_inode->resume_fops);
+            queued = true;
+        }
+    }
+    UNLOCK(&ob_inode->inode->lock);
+
+    /* Resume outside of the lock. */
+    if (!queued && (stub != NULL)) {
+        call_resume(stub);
+    }
+
+    return 0;
+}
+
+/* Discard an open stub that will never be resumed: detach the frame's local,
+ * destroy the stub's call stack and the stub itself, and drop one reference
+ * on @fd. The frame is used before call_stub_destroy() because it is reached
+ * through the stub. */
+static void
+ob_open_destroy(call_stub_t *stub, fd_t *fd)
+{
+    stub->frame->local = NULL;
+    STACK_DESTROY(stub->frame->root);
+    call_stub_destroy(stub);
+    fd_unref(fd);
+}
+
+/* Hand over the freshly created open stub for the first open on an inode.
+ *
+ *  - @fd is no longer the pending first fd: the stub is destroyed.
+ *  - The open has already been triggered: the stub is resumed right away.
+ *  - Otherwise the stub is stored in ob_inode->first_open for later.
+ *
+ * Destroying or resuming happens outside of the inode lock. Always returns
+ * 0. */
+static int32_t
+ob_open_dispatch(xlator_t *xl, ob_inode_t *ob_inode, fd_t *fd,
+                 call_stub_t *stub)
+{
+    bool closed;
+    bool stored = false;
+
+    LOCK(&ob_inode->inode->lock);
+    {
+        closed = (ob_inode->first_fd != fd);
+        if (!closed) {
+            /* Keep the stub only while nobody has asked for the open yet. */
+            stored = !ob_inode->triggered;
+            ob_inode->first_open = stored ? stub : NULL;
+        }
+    }
+    UNLOCK(&ob_inode->inode->lock);
+
+    if (stored || (stub == NULL)) {
+        return 0;
+    }
+
+    if (closed) {
+        ob_open_destroy(stub, fd);
+    } else {
+        call_resume(stub);
+    }
+
+    return 0;
+}
+
+/* Resume every stub queued on @list, in list order. Each stub is unlinked
+ * from the list before it is resumed. */
+static void
+ob_resume_pending(struct list_head *list)
+{
+    call_stub_t *pending;
+
+    for (;;) {
+        if (list_empty(list)) {
+            break;
+        }
+
+        pending = list_first_entry(list, call_stub_t, list);
+        list_del_init(&pending->list);
+
+        call_resume(pending);
+    }
+}
+
+/* Finish the first open on an inode.
+ *
+ * On failure (@op_ret < 0) the error is recorded in the fd context, with EIO
+ * standing in for a non-positive @op_errno, so that ob_open_and_resume_fd()
+ * rejects later operations on this fd. If @fd is still the inode's pending
+ * first fd, the per-inode state is reset and the queued stubs are taken off
+ * the inode; they are resumed after the lock has been released. Finally one
+ * reference on @fd is dropped. */
+static void
+ob_open_completed(xlator_t *xl, ob_inode_t *ob_inode, fd_t *fd, int32_t op_ret,
+                  int32_t op_errno)
+{
+    struct list_head list;
+
+    INIT_LIST_HEAD(&list);
+
+    if (op_ret < 0) {
+        fd_ctx_set(fd, xl, op_errno <= 0 ? EIO : op_errno);
+    }
+
+    LOCK(&ob_inode->inode->lock);
+    {
+        /* Only update the fields if the file has not been closed before
+         * getting here. */
+        if (ob_inode->first_fd == fd) {
+            list_splice_init(&ob_inode->resume_fops, &list);
+            ob_inode->first_fd = NULL;
+            ob_inode->first_open = NULL;
+            ob_inode->triggered = false;
+        }
+    }
+    UNLOCK(&ob_inode->inode->lock);
+
+    ob_resume_pending(&list);
+
+    fd_unref(fd);
+}
+
+/* Callback of the open wound by ob_open_resume(). The cookie carries the fd
+ * of that open and frame->local the ob_inode_t; the completion is processed
+ * by ob_open_completed() and the frame's stack is destroyed afterwards.
+ * Always returns 0. */
+static int32_t
+ob_open_cbk(call_frame_t *frame, void *cookie, xlator_t *xl, int32_t op_ret,
+            int32_t op_errno, fd_t *fd, dict_t *xdata)
+{
+    ob_inode_t *owner = frame->local;
+
+    /* Detach the local from the frame before the stack is destroyed. */
+    frame->local = NULL;
+
+    ob_open_completed(xl, owner, cookie, op_ret, op_errno);
+
+    STACK_DESTROY(frame->root);
+
+    return 0;
+}
+
+static int32_t
+ob_open_resume(call_frame_t *frame, xlator_t *this, loc_t *loc, int flags,
+               fd_t *fd, dict_t *xdata)
+{
+    /* Send the real open to the child; 'fd' rides along as the cookie. */
+    STACK_WIND_COOKIE(frame, ob_open_cbk, fd, FIRST_CHILD(this),
+                      FIRST_CHILD(this)->fops->open, loc, flags, fd, xdata);
+    return 0;
+}
+
+static int32_t
+ob_open(call_frame_t *frame, xlator_t *this, loc_t *loc, int flags, fd_t *fd,
+        dict_t *xdata)
+{
+    ob_inode_t *ob_inode;
+    call_frame_t *open_frame;
+    call_stub_t *stub;
+    fd_t *first_fd;
+    ob_state_t state; /* an OB_STATE_* value or a negated errno */
+    /* Open fop: when possible, reply right away and delay the real open. */
+    state = ob_open_behind(this, fd, flags, &ob_inode, &first_fd);
+    if (state == OB_STATE_READY) {
+        /* There's no pending open, but there are other file descriptors opened
+         * or the current flags require a synchronous open. */
+        return default_open(frame, this, loc, flags, fd, xdata);
+    }
+
+    if (state == OB_STATE_OPEN_TRIGGERED) {
+        /* The first open is in progress (either because it was already issued
+         * or because this request triggered it). We try to create a new stub
+         * to retry the operation once the initial open completes. */
+        stub = fop_open_stub(frame, ob_open, loc, flags, fd, xdata);
+        if (stub != NULL) {
+            return ob_stub_dispatch(this, ob_inode, first_fd, stub);
+        }
+
+        state = -ENOMEM;
+    }
+
+    if (state == OB_STATE_FIRST_OPEN) {
+        /* We try to create a stub for the new open. A new frame needs to be
+         * used because the current one may be destroyed soon after sending
+         * the open's reply. */
+        open_frame = copy_frame(frame);
+        if (open_frame != NULL) {
+            stub = fop_open_stub(open_frame, ob_open_resume, loc, flags, fd,
+                                 xdata);
+            if (stub != NULL) {
+                open_frame->local = ob_inode;
+
+                /* TODO: Previous version passed xdata back to the caller, but
+                 *       probably this doesn't make sense since it won't contain
+                 *       any requested data. I think it would be better to pass
+                 *       NULL for xdata. */
+                default_open_cbk(frame, NULL, this, 0, 0, fd, xdata);
+
+                return ob_open_dispatch(this, ob_inode, first_fd, stub);
+            }
+
+            STACK_DESTROY(open_frame->root);
+        }
+
+        /* In case of error, simulate a regular completion but with an error
+         * code. */
+        ob_open_completed(this, ob_inode, first_fd, -1, ENOMEM);
+
+        state = -ENOMEM;
+    }
+
+    /* In case of failure we need to decrement the number of open files because
+     * ob_fdclose() won't be called. NOTE(review): this assumes ob_inode was
+     * set even when ob_open_behind() itself returned an error -- confirm. */
+    LOCK(&fd->inode->lock);
+    {
+        ob_inode->open_count--;
+    }
+    UNLOCK(&fd->inode->lock);
+
+    gf_smsg(this->name, GF_LOG_ERROR, -state, OPEN_BEHIND_MSG_FAILED, "fop=%s",
+            "open", "path=%s", loc->path, NULL);
+
+    return default_open_failure_cbk(frame, -state);
+}
+
+static int32_t
+ob_create(call_frame_t *frame, xlator_t *this, loc_t *loc, int flags,
+          mode_t mode, mode_t umask, fd_t *fd, dict_t *xdata)
+{
+    ob_inode_t *ob_inode;
+    call_stub_t *stub;
+    fd_t *first_fd;
+    ob_state_t state; /* an OB_STATE_* value or a negated errno */
+
+    /* Create requests are never delayed. We always send them synchronously. */
+    state = ob_open_and_resume_fd(this, fd, 1, true, true, &ob_inode,
+                                  &first_fd);
+    if (state == OB_STATE_READY) {
+        /* There's no pending open, but there are other file descriptors opened
+         * so we simply forward the request synchronously. */
+        return default_create(frame, this, loc, flags, mode, umask, fd, xdata);
+    }
+
+    if (state == OB_STATE_OPEN_TRIGGERED) {
+        /* The first open is in progress (either because it was already issued
+         * or because this request triggered it). We try to create a new stub
+         * to retry the operation once the initial open completes. */
+        stub = fop_create_stub(frame, ob_create, loc, flags, mode, umask, fd,
+                               xdata);
+        if (stub != NULL) {
+            return ob_stub_dispatch(this, ob_inode, first_fd, stub);
+        }
+
+        state = -ENOMEM;
+    }
+
+    /* Since we forced a synchronous request, OB_STATE_FIRST_OPEN will never
+     * be returned by ob_open_and_resume_fd(). If we are here it can only be
+     * because there has been a problem. */
+
+    /* In case of failure we need to decrement the number of open files because
+     * ob_fdclose() won't be called. NOTE(review): this assumes ob_inode was
+     * set even when ob_open_and_resume_fd() returned an error -- confirm. */
+    LOCK(&fd->inode->lock);
+    {
+        ob_inode->open_count--;
+    }
+    UNLOCK(&fd->inode->lock);
+
+    gf_smsg(this->name, GF_LOG_ERROR, -state, OPEN_BEHIND_MSG_FAILED, "fop=%s",
+            "create", "path=%s", loc->path, NULL);
+
+    return default_create_failure_cbk(frame, -state);
+}
+
+static int32_t
+ob_readv(call_frame_t *frame, xlator_t *this, fd_t *fd, size_t size,
+         off_t offset, uint32_t flags, dict_t *xdata)
+{
+    ob_conf_t *conf = this->private;
+    bool trigger = conf->read_after_open || !conf->use_anonymous_fd;
+    /* Not triggered only if use-anonymous-fd is on and read-after-open off. */
+    OB_POST_FD(readv, this, frame, fd, trigger, fd, size, offset, flags, xdata);
+
+    return 0;
+}
+
+static int32_t
+ob_writev(call_frame_t *frame, xlator_t *this, fd_t *fd, struct iovec *iov,
+          int count, off_t offset, uint32_t flags, struct iobref *iobref,
+          dict_t *xdata)
+{
+    /* trigger=true; presumably forces the delayed open (see OB_POST_FD). */
+    OB_POST_FD(writev, this, frame, fd, true, fd, iov, count, offset, flags,
+               iobref, xdata);
+    return 0;
+}
+
+static int32_t
+ob_fstat(call_frame_t *frame, xlator_t *this, fd_t *fd, dict_t *xdata)
+{
+    ob_conf_t *conf = this->private;
+    bool trigger = !conf->use_anonymous_fd;
+    /* The trigger flag is set only when anonymous fds are not in use. */
+    OB_POST_FD(fstat, this, frame, fd, trigger, fd, xdata);
+
+    return 0;
+}
+
+static int32_t
+ob_seek(call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset,
+        gf_seek_what_t what, dict_t *xdata)
+{
+    ob_conf_t *conf = this->private;
+    bool trigger = !conf->use_anonymous_fd;
+    /* The trigger flag is set only when anonymous fds are not in use. */
+    OB_POST_FD(seek, this, frame, fd, trigger, fd, offset, what, xdata);
+
+    return 0;
+}
+
+static int32_t
+ob_flush(call_frame_t *frame, xlator_t *this, fd_t *fd, dict_t *xdata)
+{
+    /* Flush has its own macro, which takes no trigger argument. */
+    OB_POST_FLUSH(this, frame, fd, fd, xdata);
+    return 0;
+}
+
+static int32_t
+ob_fsync(call_frame_t *frame, xlator_t *this, fd_t *fd, int flag, dict_t *xdata)
+{
+    /* trigger=true; presumably forces the delayed open (see OB_POST_FD). */
+    OB_POST_FD(fsync, this, frame, fd, true, fd, flag, xdata);
+    return 0;
+}
+
+static int32_t
+ob_lk(call_frame_t *frame, xlator_t *this, fd_t *fd, int cmd,
+      struct gf_flock *flock, dict_t *xdata)
+{
+    /* trigger=true; presumably forces the delayed open (see OB_POST_FD). */
+    OB_POST_FD(lk, this, frame, fd, true, fd, cmd, flock, xdata);
+    return 0;
+}
+
+static int32_t
+ob_ftruncate(call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset,
+             dict_t *xdata)
+{
+    /* trigger=true; presumably forces the delayed open (see OB_POST_FD). */
+    OB_POST_FD(ftruncate, this, frame, fd, true, fd, offset, xdata);
+    return 0;
+}
+
+static int32_t
+ob_fsetxattr(call_frame_t *frame, xlator_t *this, fd_t *fd, dict_t *xattr,
+             int flags, dict_t *xdata)
+{
+    /* trigger=true; presumably forces the delayed open (see OB_POST_FD). */
+    OB_POST_FD(fsetxattr, this, frame, fd, true, fd, xattr, flags, xdata);
+    return 0;
+}
+
+static int32_t
+ob_fgetxattr(call_frame_t *frame, xlator_t *this, fd_t *fd, const char *name,
+             dict_t *xdata)
+{
+    /* trigger=true; presumably forces the delayed open (see OB_POST_FD). */
+    OB_POST_FD(fgetxattr, this, frame, fd, true, fd, name, xdata);
+    return 0;
+}
+
+static int32_t
+ob_fremovexattr(call_frame_t *frame, xlator_t *this, fd_t *fd, const char *name,
+                dict_t *xdata)
+{
+    /* trigger=true; presumably forces the delayed open (see OB_POST_FD). */
+    OB_POST_FD(fremovexattr, this, frame, fd, true, fd, name, xdata);
+    return 0;
+}
+
+static int32_t
+ob_finodelk(call_frame_t *frame, xlator_t *this, const char *volume, fd_t *fd,
+            int cmd, struct gf_flock *flock, dict_t *xdata)
+{
+    /* trigger=true; presumably forces the delayed open (see OB_POST_FD). */
+    OB_POST_FD(finodelk, this, frame, fd, true, volume, fd, cmd, flock, xdata);
+    return 0;
+}
+
+static int32_t
+ob_fentrylk(call_frame_t *frame, xlator_t *this, const char *volume, fd_t *fd,
+            const char *basename, entrylk_cmd cmd, entrylk_type type,
+            dict_t *xdata)
+{
+    /* trigger=true; presumably forces the delayed open (see OB_POST_FD). */
+    OB_POST_FD(fentrylk, this, frame, fd, true, volume, fd, basename, cmd, type,
+               xdata);
+    return 0;
+}
+
+static int32_t
+ob_fxattrop(call_frame_t *frame, xlator_t *this, fd_t *fd,
+            gf_xattrop_flags_t optype, dict_t *xattr, dict_t *xdata)
+{
+    /* trigger=true; presumably forces the delayed open (see OB_POST_FD). */
+    OB_POST_FD(fxattrop, this, frame, fd, true, fd, optype, xattr, xdata);
+    return 0;
+}
+
+static int32_t
+ob_fsetattr(call_frame_t *frame, xlator_t *this, fd_t *fd, struct iatt *iatt,
+            int valid, dict_t *xdata)
+{
+    /* trigger=true; presumably forces the delayed open (see OB_POST_FD). */
+    OB_POST_FD(fsetattr, this, frame, fd, true, fd, iatt, valid, xdata);
+    return 0;
+}
+
+static int32_t
+ob_fallocate(call_frame_t *frame, xlator_t *this, fd_t *fd, int32_t mode,
+             off_t offset, size_t len, dict_t *xdata)
+{
+    /* trigger=true; presumably forces the delayed open (see OB_POST_FD). */
+    OB_POST_FD(fallocate, this, frame, fd, true, fd, mode, offset, len, xdata);
+    return 0;
+}
+
+static int32_t
+ob_discard(call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset,
+           size_t len, dict_t *xdata)
+{
+    /* trigger=true; presumably forces the delayed open (see OB_POST_FD). */
+    OB_POST_FD(discard, this, frame, fd, true, fd, offset, len, xdata);
+    return 0;
+}
+
+static int32_t
+ob_zerofill(call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset,
+            off_t len, dict_t *xdata)
+{
+    /* trigger=true; presumably forces the delayed open (see OB_POST_FD). */
+    OB_POST_FD(zerofill, this, frame, fd, true, fd, offset, len, xdata);
+    return 0;
+}
+
+static int32_t
+ob_unlink(call_frame_t *frame, xlator_t *this, loc_t *loc, int xflags,
+          dict_t *xdata)
+{
+    /* Inode based: loc->inode is the inode handed to OB_POST_INODE. */
+    OB_POST_INODE(unlink, this, frame, loc->inode, true, loc, xflags, xdata);
+    return 0;
+}
+
+static int32_t
+ob_rename(call_frame_t *frame, xlator_t *this, loc_t *src, loc_t *dst,
+          dict_t *xdata)
+{
+    /* The inode handed to OB_POST_INODE is the destination's, not src's. */
+    OB_POST_INODE(rename, this, frame, dst->inode, true, src, dst, xdata);
+    return 0;
+}
+
+static int32_t
+ob_setattr(call_frame_t *frame, xlator_t *this, loc_t *loc, struct iatt *stbuf,
+           int32_t valid, dict_t *xdata)
+{
+    /* Inode based: loc->inode is the inode handed to OB_POST_INODE. */
+    OB_POST_INODE(setattr, this, frame, loc->inode, true, loc, stbuf, valid,
+                  xdata);
+    return 0;
+}
+
+static int32_t
+ob_setxattr(call_frame_t *frame, xlator_t *this, loc_t *loc, dict_t *dict,
+            int32_t flags, dict_t *xdata)
+{
+    if (dict_get(dict, POSIX_ACL_DEFAULT_XATTR) ||
+        dict_get(dict, POSIX_ACL_ACCESS_XATTR) ||
+        dict_get(dict, GF_SELINUX_XATTR_KEY)) {
+        return default_setxattr(frame, this, loc, dict, flags, xdata);
+    }
+    /* ACL and SELinux xattrs (above) bypass OB_POST_INODE entirely. */
+    OB_POST_INODE(setxattr, this, frame, loc->inode, true, loc, dict, flags,
+                  xdata);
+
+    return 0;
+}
+
+static void
+ob_fdclose(xlator_t *this, fd_t *fd)
+{
+    struct list_head list; /* pending fops, resumed after unlocking */
+    ob_inode_t *ob_inode;
+    call_stub_t *stub; /* the cancelled first open, if any */
+
+    INIT_LIST_HEAD(&list);
+    stub = NULL;
+    /* fdclose callback: account for the close and cancel a pending open. */
+    LOCK(&fd->inode->lock);
+    {
+        ob_inode = ob_inode_get_locked(this, fd->inode);
+        if (ob_inode != NULL) {
+            ob_inode->open_count--;
+
+            /* If this fd is the same as ob_inode->first_fd, it means that
+             * the initial open has not fully completed. We'll try to cancel
+             * it. */
+            if (ob_inode->first_fd == fd) {
+                if (ob_inode->first_open == OB_OPEN_PREPARING) {
+                    /* In this case ob_open_dispatch() has not been called yet.
+                     * We clear first_fd and first_open to allow that function
+                     * to know that the open is not really needed. This also
+                     * allows other requests to work as expected if they
+                     * arrive before the dispatch function is called. If there
+                     * are pending fops, we can directly process them here.
+                     * (note that there shouldn't be any fd related fops, but
+                     * if there are, it's fine if they fail). */
+                    ob_inode->first_fd = NULL;
+                    ob_inode->first_open = NULL;
+                    ob_inode->triggered = false;
+                    list_splice_init(&ob_inode->resume_fops, &list);
+                } else if (!ob_inode->triggered) {
+                    /* If the open has already been dispatched, we can only
+                     * cancel it if it has not been triggered. Otherwise we
+                     * simply wait until it completes. While it's not triggered,
+                     * first_open must be a valid stub and there can't be any
+                     * pending fops. */
+                    GF_ASSERT((ob_inode->first_open != NULL) &&
+                              list_empty(&ob_inode->resume_fops));
+
+                    ob_inode->first_fd = NULL;
+                    stub = ob_inode->first_open;
+                    ob_inode->first_open = NULL;
+                }
+            }
+        }
+    }
+    UNLOCK(&fd->inode->lock);
+
+    if (stub != NULL) {
+        ob_open_destroy(stub, fd);
+    }
+
+    ob_resume_pending(&list);
+}
+
+int
+ob_forget(xlator_t *this, inode_t *inode)
+{
+    uint64_t ctx = 0;
+
+    /* Drop our inode context and free the ob_inode_t it pointed to, if any. */
+    if (inode_ctx_del(inode, this, &ctx) != 0 || ctx == 0) {
+        return 0;
+    }
+    GF_FREE((ob_inode_t *)(uintptr_t)ctx);
+
+    return 0;
+}
+
+int
+ob_priv_dump(xlator_t *this)
+{
+    ob_conf_t *conf = this->private;
+    char key_prefix[GF_DUMP_MAX_BUF_LEN];
+
+    /* Statedump handler for the xlator's private configuration. Returns -1
+     * when the xlator has no private data, 0 otherwise. */
+    if (!conf)
+        return -1;
+
+    gf_proc_dump_build_key(key_prefix, "xlator.performance.open-behind",
+                           "priv");
+    gf_proc_dump_add_section("%s", key_prefix);
+
+    /* Dump the three options that init() and reconfigure() keep in conf. */
+    gf_proc_dump_write("use_anonymous_fd", "%d", conf->use_anonymous_fd);
+    gf_proc_dump_write("lazy_open", "%d", conf->lazy_open);
+    gf_proc_dump_write("read_after_open", "%d", conf->read_after_open);
+
+    return 0;
+}
+
+int
+ob_fdctx_dump(xlator_t *this, fd_t *fd)
+{
+    char key_prefix[GF_DUMP_MAX_BUF_LEN] = {
+        0,
+    };
+    uint64_t value = 0;
+    int ret = 0, error = 0;
+    /* Skip the dump rather than block if the fd lock can't be taken now. */
+    ret = TRY_LOCK(&fd->lock);
+    if (ret)
+        return 0;
+    /* The fd context holds the errno stored when a delayed open failed. */
+    if ((__fd_ctx_get(fd, this, &value) == 0) && (value != 0)) {
+        error = (int32_t)value;
+    }
+
+    gf_proc_dump_build_key(key_prefix, "xlator.performance.open-behind",
+                           "file");
+    gf_proc_dump_add_section("%s", key_prefix);
+
+    gf_proc_dump_write("fd", "%p", fd);
+
+    gf_proc_dump_write("error", "%d", error);
+
+    UNLOCK(&fd->lock);
+
+    return 0;
+}
+
+int
+mem_acct_init(xlator_t *this)
+{
+    /* Failures are logged here; the result is returned to the caller as is. */
+    int ret = xlator_mem_acct_init(this, gf_ob_mt_end + 1);
+
+    if (ret != 0) {
+        gf_msg(this->name, GF_LOG_ERROR, ENOMEM, OPEN_BEHIND_MSG_NO_MEMORY,
+               "Memory accounting failed");
+    }
+
+    return ret;
+}
+
+int
+reconfigure(xlator_t *this, dict_t *options)
+{
+    ob_conf_t *conf = NULL;
+    int ret = -1;
+
+    conf = this->private;
+    /* GF_OPTION_RECONF presumably jumps to 'out' on error, leaving ret = -1. */
+    GF_OPTION_RECONF("use-anonymous-fd", conf->use_anonymous_fd, options, bool,
+                     out);
+
+    GF_OPTION_RECONF("lazy-open", conf->lazy_open, options, bool, out);
+
+    GF_OPTION_RECONF("read-after-open", conf->read_after_open, options, bool,
+                     out);
+
+    GF_OPTION_RECONF("pass-through", this->pass_through, options, bool, out);
+    ret = 0;
+out:
+    return ret;
+}
+
+int
+init(xlator_t *this)
+{
+    ob_conf_t *conf = NULL;
+    /* Exactly one child is required; a missing parent only logs a warning. */
+    if (!this->children || this->children->next) {
+        gf_msg(this->name, GF_LOG_ERROR, 0,
+               OPEN_BEHIND_MSG_XLATOR_CHILD_MISCONFIGURED,
+               "FATAL: volume (%s) not configured with exactly one "
+               "child",
+               this->name);
+        return -1;
+    }
+
+    if (!this->parents)
+        gf_msg(this->name, GF_LOG_WARNING, 0, OPEN_BEHIND_MSG_VOL_MISCONFIGURED,
+               "dangling volume. check volfile ");
+
+    conf = GF_CALLOC(1, sizeof(*conf), gf_ob_mt_conf_t);
+    if (!conf)
+        goto err;
+    /* Options are read into conf; the 'err' label frees it on failure. */
+    GF_OPTION_INIT("use-anonymous-fd", conf->use_anonymous_fd, bool, err);
+
+    GF_OPTION_INIT("lazy-open", conf->lazy_open, bool, err);
+
+    GF_OPTION_INIT("read-after-open", conf->read_after_open, bool, err);
+
+    GF_OPTION_INIT("pass-through", this->pass_through, bool, err);
+
+    this->private = conf;
+
+    return 0;
+err:
+    if (conf)
+        GF_FREE(conf);
+
+    return -1;
+}
+
+void
+fini(xlator_t *this)
+{
+    ob_conf_t *conf = this->private;
+
+    /* Release the configuration allocated by init(). The pointer is cleared
+     * first so that later readers, such as ob_priv_dump(), see NULL instead
+     * of a dangling ob_conf_t. */
+    this->private = NULL;
+    GF_FREE(conf);
+}
+
+struct xlator_fops fops = {
+    .open = ob_open,     /* may reply before the real open is sent */
+    .create = ob_create, /* never delayed */
+    .readv = ob_readv,
+    .writev = ob_writev,
+    .flush = ob_flush,
+    .fsync = ob_fsync,
+    .fstat = ob_fstat,
+    .seek = ob_seek,
+    .ftruncate = ob_ftruncate,
+    .fsetxattr = ob_fsetxattr,
+    .setxattr = ob_setxattr, /* inode based */
+    .fgetxattr = ob_fgetxattr,
+    .fremovexattr = ob_fremovexattr,
+    .finodelk = ob_finodelk,
+    .fentrylk = ob_fentrylk,
+    .fxattrop = ob_fxattrop,
+    .fsetattr = ob_fsetattr,
+    .setattr = ob_setattr, /* inode based */
+    .fallocate = ob_fallocate,
+    .discard = ob_discard,
+    .zerofill = ob_zerofill,
+    .unlink = ob_unlink, /* inode based */
+    .rename = ob_rename, /* inode based (destination inode) */
+    .lk = ob_lk,
+};
+
+struct xlator_cbks cbks = {
+    .fdclose = ob_fdclose, /* updates open_count, may cancel the first open */
+    .forget = ob_forget,   /* frees the ob_inode_t kept in the inode ctx */
+};
+
+struct xlator_dumpops dumpops = {
+    .priv = ob_priv_dump,   /* options kept in ob_conf_t */
+    .fdctx = ob_fdctx_dump, /* per-fd error recorded in the fd ctx */
+};
+
+struct volume_options options[] = {
+    {
+        .key = {"open-behind"},
+        .type = GF_OPTION_TYPE_BOOL,
+        .default_value = "off",
+        .description = "enable/disable open-behind",
+        .op_version = {GD_OP_VERSION_6_0},
+        .flags = OPT_FLAG_SETTABLE,
+    },
+    {
+        /* NOTE(review): unlike its siblings, no .op_version/.flags here. */
+        .key = {"use-anonymous-fd"},
+        .type = GF_OPTION_TYPE_BOOL,
+        .default_value = "no",
+        .description =
+            "For read operations, use anonymous FD when "
+            "original FD is open-behind and not yet opened in the backend.",
+    },
+    {
+        .key = {"lazy-open"},
+        .type = GF_OPTION_TYPE_BOOL,
+        .default_value = "yes",
+        .description =
+            "Perform open in the backend only when a necessary "
+            "FOP arrives (e.g writev on the FD, unlink of the file). When "
+            "option "
+            "is disabled, perform backend open right after unwinding open().",
+        .op_version = {3},
+        .flags = OPT_FLAG_SETTABLE | OPT_FLAG_CLIENT_OPT,
+        .tags = {},
+    },
+    {
+        .key = {"read-after-open"},
+        .type = GF_OPTION_TYPE_BOOL,
+        .default_value = "yes",
+        .description = "read is sent only after actual open happens and real "
+                       "fd is obtained, instead of doing on anonymous fd "
+                       "(similar to write)",
+        .op_version = {3},
+        .flags = OPT_FLAG_SETTABLE | OPT_FLAG_CLIENT_OPT,
+        .tags = {},
+        /* option_validation_fn validate_fn; */
+    },
+    {.key = {"pass-through"},
+     .type = GF_OPTION_TYPE_BOOL,
+     .default_value = "false",
+     .op_version = {GD_OP_VERSION_4_1_0},
+     .flags = OPT_FLAG_SETTABLE | OPT_FLAG_DOC | OPT_FLAG_CLIENT_OPT,
+     .tags = {"open-behind"},
+     .description = "Enable/Disable open behind translator"},
+    {.key = {NULL}} /* end-of-table marker */
+
+};
+
+xlator_api_t xlator_api = {
+    .init = init,
+    .fini = fini,
+    .reconfigure = reconfigure,
+    .mem_acct_init = mem_acct_init,
+    .op_version = {1}, /* Present from the initial version */
+    .dumpops = &dumpops,
+    .fops = &fops,
+    .cbks = &cbks,
+    .options = options, /* the volume_options table defined in this file */
+    .identifier = "open-behind",
+    .category = GF_MAINTAINED,
+};
diff --git a/xlators/cluster/unify/Makefile.am b/xlators/performance/quick-read/Makefile.am
index d471a3f9243..d471a3f9243 100644
--- a/xlators/cluster/unify/Makefile.am
+++ b/xlators/performance/quick-read/Makefile.am
diff --git a/xlators/performance/quick-read/src/Makefile.am b/xlators/performance/quick-read/src/Makefile.am
new file mode 100644
index 00000000000..8eb6cece738
--- /dev/null
+++ b/xlators/performance/quick-read/src/Makefile.am
@@ -0,0 +1,16 @@
+xlator_LTLIBRARIES = quick-read.la
+xlatordir = $(libdir)/glusterfs/$(PACKAGE_VERSION)/xlator/performance
+# quick-read.la is a libtool module built from quick-read.c.
+quick_read_la_LDFLAGS = -module $(GF_XLATOR_DEFAULT_LDFLAGS)
+
+quick_read_la_SOURCES = quick-read.c
+quick_read_la_LIBADD = $(top_builddir)/libglusterfs/src/libglusterfs.la
+
+noinst_HEADERS = quick-read.h quick-read-mem-types.h quick-read-messages.h
+
+AM_CPPFLAGS = $(GF_CPPFLAGS) -I$(top_srcdir)/libglusterfs/src \
+	-I$(top_srcdir)/rpc/xdr/src -I$(top_builddir)/rpc/xdr/src
+
+AM_CFLAGS = -Wall $(GF_CFLAGS)
+
+CLEANFILES = 
diff --git a/xlators/performance/quick-read/src/quick-read-mem-types.h b/xlators/performance/quick-read/src/quick-read-mem-types.h
new file mode 100644
index 00000000000..e4aef8549ff
--- /dev/null
+++ b/xlators/performance/quick-read/src/quick-read-mem-types.h
@@ -0,0 +1,23 @@
+/*
+  Copyright (c) 2008-2012 Red Hat, Inc. <http://www.redhat.com>
+  This file is part of GlusterFS.
+
+  This file is licensed to you under your choice of the GNU Lesser
+  General Public License, version 3 or any later version (LGPLv3 or
+  later), or the GNU General Public License, version 2 (GPLv2), in all
+  cases as published by the Free Software Foundation.
+*/
+
+#ifndef __QR_MEM_TYPES_H__
+#define __QR_MEM_TYPES_H__
+
+#include <glusterfs/mem-types.h>
+
+enum gf_qr_mem_types_ {
+    gf_qr_mt_qr_inode_t = gf_common_mt_end + 1, /* used by qr_inode_new() */
+    gf_qr_mt_content_t, /* used by qr_content_extract() for file content */
+    gf_qr_mt_qr_priority_t,
+    gf_qr_mt_qr_private_t,
+    gf_qr_mt_end
+};
+#endif
diff --git a/xlators/performance/quick-read/src/quick-read-messages.h b/xlators/performance/quick-read/src/quick-read-messages.h
new file mode 100644
index 00000000000..da9724a3c9c
--- /dev/null
+++ b/xlators/performance/quick-read/src/quick-read-messages.h
@@ -0,0 +1,31 @@
+/*Copyright (c) 2015 Red Hat, Inc. <http://www.redhat.com>
+  This file is part of GlusterFS.
+
+  This file is licensed to you under your choice of the GNU Lesser
+  General Public License, version 3 or any later version (LGPLv3 or
+  later), or the GNU General Public License, version 2 (GPLv2), in all
+  cases as published by the Free Software Foundation.
+*/
+
+#ifndef _QUICK_READ_MESSAGES_H_
+#define _QUICK_READ_MESSAGES_H_
+
+#include <glusterfs/glfs-message-id.h>
+
+/* To add new message IDs, append new identifiers at the end of the list.
+ *
+ * Never remove a message ID. If it's not used anymore, you can rename it or
+ * leave it as it is, but do not delete it. This prevents its ID from being
+ * reused by other messages.
+ *
+ * The component name must match one of the entries defined in
+ * glfs-message-id.h.
+ */
+
+GLFS_MSGID(QUICK_READ, QUICK_READ_MSG_ENFORCEMENT_FAILED,
+           QUICK_READ_MSG_INVALID_ARGUMENT,
+           QUICK_READ_MSG_XLATOR_CHILD_MISCONFIGURED, QUICK_READ_MSG_NO_MEMORY,
+           QUICK_READ_MSG_VOL_MISCONFIGURED, QUICK_READ_MSG_DICT_SET_FAILED,
+           QUICK_READ_MSG_INVALID_CONFIG, QUICK_READ_MSG_LRU_NOT_EMPTY);
+
+#endif /* _QUICK_READ_MESSAGES_H_ */
diff --git a/xlators/performance/quick-read/src/quick-read.c b/xlators/performance/quick-read/src/quick-read.c
new file mode 100644
index 00000000000..7fe4b3c3a4b
--- /dev/null
+++ b/xlators/performance/quick-read/src/quick-read.c
@@ -0,0 +1,1644 @@
+/*
+  Copyright (c) 2008-2012 Red Hat, Inc. <http://www.redhat.com>
+  This file is part of GlusterFS.
+
+  This file is licensed to you under your choice of the GNU Lesser
+  General Public License, version 3 or any later version (LGPLv3 or
+  later), or the GNU General Public License, version 2 (GPLv2), in all
+  cases as published by the Free Software Foundation.
+*/
+
+#include <math.h>
+#include "quick-read.h"
+#include <glusterfs/statedump.h>
+#include "quick-read-messages.h"
+#include <glusterfs/upcall-utils.h>
+#include <glusterfs/atomic.h>
+
+typedef struct qr_local {
+    inode_t *inode;        /* unref'd by qr_local_wipe() when set */
+    uint64_t incident_gen; /* generation taken in qr_local_get() */
+    fd_t *fd;              /* unref'd by qr_local_wipe() when set */
+} qr_local_t;
+
+qr_inode_t *
+qr_inode_ctx_get(xlator_t *this, inode_t *inode);
+
+void
+__qr_inode_prune_data(xlator_t *this, qr_inode_table_t *table,
+                      qr_inode_t *qr_inode);
+
+void
+qr_local_wipe(qr_local_t *local)
+{
+    /* Nothing to release for a NULL local. */
+    if (local == NULL)
+        return;
+
+    /* Drop the references held by the local before freeing it. */
+    if (local->inode != NULL)
+        inode_unref(local->inode);
+
+    if (local->fd != NULL)
+        fd_unref(local->fd);
+
+    GF_FREE(local);
+}
+
+uint64_t
+__qr_get_generation(xlator_t *this, qr_inode_t *qr_inode)
+{
+    uint64_t gen = 0, rollover;
+    qr_private_t *priv = NULL;
+    qr_inode_table_t *table = NULL;
+
+    priv = this->private;
+    table = &priv->table;
+    /* If the counter wraps to 0, flip the rollover flag and drop the data. */
+    gen = GF_ATOMIC_INC(priv->generation);
+    if (gen == 0) {
+        qr_inode->gen_rollover = !qr_inode->gen_rollover;
+        gen = GF_ATOMIC_INC(priv->generation);
+        __qr_inode_prune_data(this, table, qr_inode); /* may modify table */
+        qr_inode->gen = qr_inode->invalidation_time = gen - 1;
+    }
+    /* The flag goes in bit 32; qr_content_update() splits it out again. */
+    rollover = qr_inode->gen_rollover;
+    gen |= (rollover << 32);
+    return gen;
+}
+
+uint64_t
+qr_get_generation(xlator_t *this, inode_t *inode)
+{
+    qr_private_t *priv = this->private;
+    qr_inode_table_t *table = &priv->table;
+    qr_inode_t *qr_inode = qr_inode_ctx_get(this, inode);
+    uint64_t gen = 0;
+
+    if (qr_inode == NULL) {
+        /* No per-inode state: just take the next value of the global counter,
+         * skipping 0. */
+        gen = GF_ATOMIC_INC(priv->generation);
+        if (gen == 0)
+            gen = GF_ATOMIC_INC(priv->generation);
+
+        return gen;
+    }
+
+    /* The per-inode variant updates rollover state and may prune cached data,
+     * so it runs under the table lock. */
+    LOCK(&table->lock);
+    {
+        gen = __qr_get_generation(this, qr_inode);
+    }
+    UNLOCK(&table->lock);
+
+    return gen;
+}
+
+qr_local_t *
+qr_local_get(xlator_t *this, inode_t *inode)
+{
+    qr_local_t *local = NULL;
+
+    /* NOTE(review): allocated as gf_common_mt_char although it is a
+     * qr_local_t; confirm whether a dedicated memory type is wanted. */
+    local = GF_CALLOC(1, sizeof(*local), gf_common_mt_char);
+    if (local != NULL)
+        local->incident_gen = qr_get_generation(this, inode);
+
+    return local;
+}
+
+#define QR_STACK_UNWIND(fop, frame, params...)                                 \
+    do {                                                                       \
+        qr_local_t *__local = NULL; /* detached from frame before unwinding */ \
+        if (frame) {                                                           \
+            __local = frame->local;                                            \
+            frame->local = NULL;                                               \
+        }                                                                      \
+        STACK_UNWIND_STRICT(fop, frame, params);                               \
+        qr_local_wipe(__local); /* releases inode/fd refs and the local */     \
+    } while (0)
+
+void
+__qr_inode_prune(xlator_t *this, qr_inode_table_t *table, qr_inode_t *qr_inode,
+                 uint64_t gen);
+
+int
+__qr_inode_ctx_set(xlator_t *this, inode_t *inode, qr_inode_t *qr_inode)
+{
+    uint64_t value = 0;
+
+    /* Store the pointer through uintptr_t rather than a signed 'long', which
+     * could sign-extend or truncate it on platforms where 'long' is not
+     * pointer sized. qr_inode_ctx_get_or_new() calls this with inode->lock. */
+    value = (uintptr_t)qr_inode;
+
+    return __inode_ctx_set(inode, this, &value);
+}
+
+qr_inode_t *
+__qr_inode_ctx_get(xlator_t *this, inode_t *inode)
+{
+    uint64_t value = 0;
+
+    /* Returns the qr_inode_t stored in the inode context for this xlator, or
+     * NULL when __inode_ctx_get() reports that none is set. The callers in
+     * this file invoke it with inode->lock held. */
+    if (__inode_ctx_get(inode, this, &value) != 0)
+        return NULL;
+
+    /* Convert back through uintptr_t rather than a signed 'long', which is
+     * not guaranteed to be pointer sized. */
+    return (qr_inode_t *)(uintptr_t)value;
+}
+
+qr_inode_t *
+qr_inode_ctx_get(xlator_t *this, inode_t *inode)
+{
+    qr_inode_t *ctx = NULL;
+
+    /* Locked wrapper around __qr_inode_ctx_get(); a NULL inode yields NULL. */
+    if (inode == NULL)
+        return NULL;
+
+    LOCK(&inode->lock);
+    {
+        ctx = __qr_inode_ctx_get(this, inode);
+    }
+    UNLOCK(&inode->lock);
+
+    return ctx;
+}
+
+qr_inode_t *
+qr_inode_new(xlator_t *this, inode_t *inode)
+{
+    qr_inode_t *fresh = GF_CALLOC(1, sizeof(*fresh), gf_qr_mt_qr_inode_t);
+
+    /* Returns a new qr_inode_t with an empty LRU link and priority 0, or NULL
+     * if the allocation fails. Neither 'this' nor 'inode' is used, and the
+     * result is not attached to the inode here. */
+    if (fresh != NULL) {
+        INIT_LIST_HEAD(&fresh->lru);
+        fresh->priority = 0; /* initial priority */
+    }
+
+    return fresh;
+}
+
+qr_inode_t *
+qr_inode_ctx_get_or_new(xlator_t *this, inode_t *inode)
+{
+    qr_inode_t *qr_inode = NULL;
+    int ret = -1;
+    qr_private_t *priv = NULL;
+
+    priv = this->private;
+    /* Look up the inode's qr_inode_t, creating and attaching one if missing. */
+    LOCK(&inode->lock);
+    {
+        qr_inode = __qr_inode_ctx_get(this, inode);
+        if (qr_inode)
+            goto unlock;
+
+        qr_inode = qr_inode_new(this, inode);
+        if (!qr_inode)
+            goto unlock;
+        /* NOTE(review): the prune below runs without table->lock held. */
+        ret = __qr_inode_ctx_set(this, inode, qr_inode);
+        if (ret) {
+            __qr_inode_prune(this, &priv->table, qr_inode, 0);
+            GF_FREE(qr_inode);
+            qr_inode = NULL;
+        }
+    }
+unlock:
+    UNLOCK(&inode->lock);
+
+    return qr_inode;
+}
+
+uint32_t
+qr_get_priority(qr_conf_t *conf, const char *path)
+{
+    struct qr_priority *entry = NULL;
+    uint32_t matched = 0; /* last matching pattern wins; 0 if none matches */
+
+    list_for_each_entry(entry, &conf->priority_list, list)
+    {
+        if (fnmatch(entry->pattern, path, FNM_NOESCAPE) == 0)
+            matched = entry->priority;
+    }
+
+    return matched;
+}
+
+void
+__qr_inode_register(xlator_t *this, qr_inode_table_t *table,
+                    qr_inode_t *qr_inode)
+{
+    qr_private_t *priv = this->private;
+
+    /* Adds qr_inode to the tail of the LRU list of its priority, or moves it
+     * there if it's already linked. Inodes without cached data are ignored.
+     * qr_inode_set_priority() calls this with table->lock held. */
+    if (!qr_inode->data || !priv)
+        return;
+
+    if (list_empty(&qr_inode->lru)) {
+        /* First time addition of this qr_inode into the table: account for
+         * it exactly once, mirroring the single decrement done by
+         * __qr_inode_prune_data() when it's unlinked. */
+        table->cache_used += qr_inode->size;
+        GF_ATOMIC_INC(priv->qr_counter.files_cached);
+    } else {
+        /* Already accounted for, this is just an LRU bump. */
+        list_del_init(&qr_inode->lru);
+    }
+
+    list_add_tail(&qr_inode->lru, &table->lru[qr_inode->priority]);
+}
+
+void
+qr_inode_set_priority(xlator_t *this, inode_t *inode, const char *path)
+{
+    qr_private_t *priv = NULL;
+    qr_inode_table_t *table = NULL;
+    qr_inode_t *entry = NULL;
+    uint32_t new_priority = 0;
+
+    /* (Re)registers the inode in the LRU list of its priority. Inodes that
+     * have no qr_inode_t attached are left alone. */
+    entry = qr_inode_ctx_get(this, inode);
+    if (entry == NULL)
+        return;
+
+    priv = this->private;
+    table = &priv->table;
+
+    /* A path selects the priority from the configured patterns; without one
+     * the current priority is kept and only the LRU position changes. */
+    if (path != NULL)
+        new_priority = qr_get_priority(&priv->conf, path);
+    else
+        new_priority = entry->priority;
+
+    LOCK(&table->lock);
+    {
+        entry->priority = new_priority;
+        __qr_inode_register(this, table, entry);
+    }
+    UNLOCK(&table->lock);
+}
+
+void
+__qr_inode_prune_data(xlator_t *this, qr_inode_table_t *table,
+                      qr_inode_t *qr_inode)
+{
+    qr_private_t *priv = this->private;
+
+    /* Discard the cached content, whether or not the inode is currently
+     * linked in an LRU list. */
+    GF_FREE(qr_inode->data);
+    qr_inode->data = NULL;
+
+    /* Undo the table accounting only for inodes that are linked in a list. */
+    if (!list_empty(&qr_inode->lru)) {
+        table->cache_used -= qr_inode->size;
+        qr_inode->size = 0;
+        list_del_init(&qr_inode->lru);
+        GF_ATOMIC_DEC(priv->qr_counter.files_cached);
+    }
+
+    /* The saved 'buf' is cleared together with the content. */
+    memset(&qr_inode->buf, 0, sizeof(qr_inode->buf));
+}
+
+/* To be called with priv->table.lock held */
+void
+__qr_inode_prune(xlator_t *this, qr_inode_table_t *table, qr_inode_t *qr_inode,
+                 uint64_t gen)
+{
+    __qr_inode_prune_data(this, table, qr_inode); /* drop cached content */
+    if (gen) /* a zero 'gen' leaves qr_inode->gen as it was */
+        qr_inode->gen = gen;
+    qr_inode->invalidation_time = __qr_get_generation(this, qr_inode);
+}
+
+void
+qr_inode_prune(xlator_t *this, inode_t *inode, uint64_t gen)
+{
+    qr_inode_table_t *table = NULL;
+    qr_inode_t *victim = NULL;
+
+    /* Locked wrapper around __qr_inode_prune(). Inodes that have no
+     * qr_inode_t attached are left untouched. */
+    victim = qr_inode_ctx_get(this, inode);
+    if (victim == NULL)
+        return;
+
+    table = &((qr_private_t *)this->private)->table;
+
+    LOCK(&table->lock);
+    {
+        __qr_inode_prune(this, table, victim, gen);
+    }
+    UNLOCK(&table->lock);
+}
+
+/* To be called with priv->table.lock held.
+ *
+ * Evicts cached inodes, lowest priority list first and oldest entry first
+ * within a list, until the cache usage drops below conf->cache_size or every
+ * list has been emptied. */
+void
+__qr_cache_prune(xlator_t *this, qr_inode_table_t *table, qr_conf_t *conf)
+{
+    qr_inode_t *curr = NULL;
+    qr_inode_t *next = NULL;
+    int index = 0;
+
+    for (index = 0; index < conf->max_pri; index++) {
+        /* The _safe variant is needed: pruning unlinks 'curr' from the list. */
+        list_for_each_entry_safe(curr, next, &table->lru[index], lru)
+        {
+            __qr_inode_prune(this, table, curr, 0);
+
+            if (table->cache_used < conf->cache_size)
+                return;
+        }
+    }
+}
+
+/*
+ * Take the table lock and, if the cache has grown past conf->cache_size,
+ * evict entries through __qr_cache_prune().
+ */
+void
+qr_cache_prune(xlator_t *this)
+{
+    qr_private_t *priv = this->private;
+    qr_inode_table_t *table = &priv->table;
+    qr_conf_t *conf = &priv->conf;
+
+    LOCK(&table->lock);
+    {
+        if (conf->cache_size < table->cache_used)
+            __qr_cache_prune(this, table, conf);
+    }
+    UNLOCK(&table->lock);
+}
+
+/*
+ * Copy the value stored under GF_CONTENT_KEY in @xdata into a freshly
+ * allocated buffer.
+ *
+ * Returns the copy (owned by the caller, to be released with GF_FREE), or
+ * NULL when the key is absent or the allocation fails.
+ */
+void *
+qr_content_extract(dict_t *xdata)
+{
+    data_t *value = NULL;
+    void *copy = NULL;
+
+    if ((dict_get_with_ref(xdata, GF_CONTENT_KEY, &value) < 0) || !value)
+        return NULL;
+
+    copy = GF_MALLOC(value->len, gf_qr_mt_content_t);
+    if (copy)
+        memcpy(copy, value->data, value->len);
+
+    /* drop the reference taken by dict_get_with_ref() */
+    data_unref(value);
+    return copy;
+}
+
+/*
+ * Install @data as the cached content of @qr_inode, described by @buf.
+ *
+ * @gen packs a rollover count in its upper 32 bits and a generation in its
+ * lower 32 bits. The update is rejected when the rollover differs from the
+ * inode's, when the inode already carries an equal or newer generation, or
+ * when the inode holds no data and was invalidated at or after @gen.
+ *
+ * Ownership of @data passes to this function: it is either stored on the
+ * inode or freed. The cache is pruned afterwards in either case.
+ */
+void
+qr_content_update(xlator_t *this, qr_inode_t *qr_inode, void *data,
+                  struct iatt *buf, uint64_t gen)
+{
+    qr_private_t *priv = this->private;
+    qr_inode_table_t *table = &priv->table;
+    uint32_t rollover = gen >> 32;
+    gf_boolean_t accept = _gf_true;
+
+    gen &= 0xffffffff;
+
+    LOCK(&table->lock);
+    {
+        if (rollover != qr_inode->gen_rollover)
+            accept = _gf_false;
+        else if (gen && qr_inode->gen && (qr_inode->gen >= gen))
+            accept = _gf_false;
+        else if ((qr_inode->data == NULL) &&
+                 (qr_inode->invalidation_time >= gen))
+            accept = _gf_false;
+
+        if (accept) {
+            /* discard the old content before installing the new one */
+            __qr_inode_prune(this, table, qr_inode, gen);
+
+            qr_inode->data = data;
+            data = NULL;
+            qr_inode->size = buf->ia_size;
+
+            qr_inode->ia_mtime = buf->ia_mtime;
+            qr_inode->ia_mtime_nsec = buf->ia_mtime_nsec;
+            qr_inode->ia_ctime = buf->ia_ctime;
+            qr_inode->ia_ctime_nsec = buf->ia_ctime_nsec;
+
+            qr_inode->buf = *buf;
+            qr_inode->last_refresh = gf_time();
+
+            __qr_inode_register(this, table, qr_inode);
+        }
+    }
+    UNLOCK(&table->lock);
+
+    /* still ours only if the update was rejected */
+    if (data)
+        GF_FREE(data);
+
+    qr_cache_prune(this);
+}
+
+/* True when the file size in @buf does not exceed conf->max_file_size. */
+gf_boolean_t
+qr_size_fits(qr_conf_t *conf, struct iatt *buf)
+{
+    if (buf->ia_size > conf->max_file_size)
+        return _gf_false;
+
+    return _gf_true;
+}
+
+/* True when the mtime recorded on @qr_inode (sec and nsec) matches @buf. */
+gf_boolean_t
+qr_mtime_equal(qr_inode_t *qr_inode, struct iatt *buf)
+{
+    if (qr_inode->ia_mtime != buf->ia_mtime)
+        return _gf_false;
+
+    if (qr_inode->ia_mtime_nsec != buf->ia_mtime_nsec)
+        return _gf_false;
+
+    return _gf_true;
+}
+
+/* True when the ctime recorded on @qr_inode (sec and nsec) matches @buf. */
+gf_boolean_t
+qr_ctime_equal(qr_inode_t *qr_inode, struct iatt *buf)
+{
+    if (qr_inode->ia_ctime != buf->ia_ctime)
+        return _gf_false;
+
+    if (qr_inode->ia_ctime_nsec != buf->ia_ctime_nsec)
+        return _gf_false;
+
+    return _gf_true;
+}
+
+/*
+ * Compare the timestamp quick-read uses for validation: ctime when
+ * conf->ctime_invalidation is set, mtime otherwise.
+ */
+gf_boolean_t
+qr_time_equal(qr_conf_t *conf, qr_inode_t *qr_inode, struct iatt *buf)
+{
+    return conf->ctime_invalidation ? qr_ctime_equal(qr_inode, buf)
+                                    : qr_mtime_equal(qr_inode, buf);
+}
+
+/*
+ * Revalidate the content cached on @qr_inode against a fresh iatt @buf.
+ *
+ * @gen packs a rollover count (upper 32 bits) and a generation (lower 32
+ * bits). Stale notifications are ignored using the same tests as
+ * qr_content_update(). Otherwise the generation is recorded and the entry is
+ * either refreshed (size still fits and the validation timestamp is
+ * unchanged) or pruned.
+ * Invoked by qr_content_refresh() with priv->table.lock held.
+ */
+void
+__qr_content_refresh(xlator_t *this, qr_inode_t *qr_inode, struct iatt *buf,
+                     uint64_t gen)
+{
+    qr_private_t *priv = this->private;
+    qr_inode_table_t *table = &priv->table;
+    qr_conf_t *conf = &priv->conf;
+    uint32_t rollover = gen >> 32;
+
+    gen &= 0xffffffff;
+
+    /* allow for rollover of frame->root->unique */
+    if (rollover != qr_inode->gen_rollover)
+        return;
+
+    if (gen && qr_inode->gen && (qr_inode->gen >= gen))
+        return;
+
+    if ((qr_inode->data == NULL) && (qr_inode->invalidation_time >= gen))
+        return;
+
+    qr_inode->gen = gen;
+
+    if (!qr_size_fits(conf, buf) || !qr_time_equal(conf, qr_inode, buf)) {
+        __qr_inode_prune(this, table, qr_inode, gen);
+        return;
+    }
+
+    qr_inode->buf = *buf;
+    qr_inode->last_refresh = gf_time();
+    __qr_inode_register(this, table, qr_inode);
+}
+
+/* Locked wrapper around __qr_content_refresh(). */
+void
+qr_content_refresh(xlator_t *this, qr_inode_t *qr_inode, struct iatt *buf,
+                   uint64_t gen)
+{
+    qr_private_t *priv = this->private;
+
+    LOCK(&priv->table.lock);
+    {
+        __qr_content_refresh(this, qr_inode, buf, gen);
+    }
+    UNLOCK(&priv->table.lock);
+}
+
+/*
+ * Decide whether the content cached on @qr_inode may still be served.
+ * It is stale when it was last refreshed before the most recent child-down
+ * event, or when conf->cache_timeout or more has elapsed since that refresh.
+ */
+gf_boolean_t
+__qr_cache_is_fresh(xlator_t *this, qr_inode_t *qr_inode)
+{
+    qr_private_t *priv = this->private;
+    qr_conf_t *conf = &priv->conf;
+
+    if ((qr_inode->last_refresh >= priv->last_child_down) &&
+        (gf_time() - qr_inode->last_refresh < conf->cache_timeout))
+        return _gf_true;
+
+    return _gf_false;
+}
+
+/*
+ * lookup callback.
+ *
+ * A failed lookup, or a reply flagged with GLUSTERFS_BAD_INODE or
+ * "sh-failed", prunes the inode's cache. Otherwise, content carried in
+ * @xdata replaces the cached content, and a reply without content is used to
+ * revalidate an existing cache entry. The reply is always unwound unchanged.
+ */
+int
+qr_lookup_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret,
+              int32_t op_errno, inode_t *inode_ret, struct iatt *buf,
+              dict_t *xdata, struct iatt *postparent)
+{
+    qr_local_t *local = frame->local;
+    inode_t *inode = local->inode;
+    qr_inode_t *qr_inode = NULL;
+    void *content = NULL;
+
+    if ((op_ret == -1) || dict_get(xdata, GLUSTERFS_BAD_INODE) ||
+        dict_get(xdata, "sh-failed")) {
+        qr_inode_prune(this, inode, local->incident_gen);
+        goto out;
+    }
+
+    content = qr_content_extract(xdata);
+
+    if (!content) {
+        /* purge old content if necessary */
+        qr_inode = qr_inode_ctx_get(this, inode);
+        if (qr_inode)
+            qr_content_refresh(this, qr_inode, buf, local->incident_gen);
+        /* else: usual path for large files */
+        goto out;
+    }
+
+    /* new content came along, always replace old content */
+    qr_inode = qr_inode_ctx_get_or_new(this, inode);
+    if (qr_inode)
+        qr_content_update(this, qr_inode, content, buf, local->incident_gen);
+    else
+        /* no harm done */
+        GF_FREE(content);
+out:
+    QR_STACK_UNWIND(lookup, frame, op_ret, op_errno, inode_ret, buf, xdata,
+                    postparent);
+    return 0;
+}
+
+/*
+ * lookup fop.
+ *
+ * When nothing is cached for the inode yet, GF_CONTENT_KEY is added to the
+ * request dict (a new dict is created if the caller passed none) with
+ * conf->max_file_size as its value. When data is already cached the request
+ * is wound untouched and qr_lookup_cbk() only validates.
+ *
+ * Fails the fop with ENOMEM if no local can be obtained.
+ */
+int
+qr_lookup(call_frame_t *frame, xlator_t *this, loc_t *loc, dict_t *xdata)
+{
+    qr_private_t *priv = NULL;
+    qr_conf_t *conf = NULL;
+    qr_inode_t *qr_inode = NULL;
+    int ret = -1;
+    dict_t *new_xdata = NULL;
+    qr_local_t *local = NULL;
+
+    priv = this->private;
+    conf = &priv->conf;
+
+    local = qr_local_get(this, loc->inode);
+    if (!local) {
+        /* NOTE(review): qr_local_get() is defined elsewhere; a NULL return
+         * is presumed to mean allocation failure. Both this function and
+         * qr_lookup_cbk() dereference the local unconditionally. */
+        STACK_UNWIND_STRICT(lookup, frame, -1, ENOMEM, NULL, NULL, NULL, NULL);
+        return 0;
+    }
+
+    local->inode = inode_ref(loc->inode);
+    frame->local = local;
+
+    qr_inode = qr_inode_ctx_get(this, loc->inode);
+    if (qr_inode && qr_inode->data)
+        /* cached. only validate in qr_lookup_cbk */
+        goto wind;
+
+    if (!xdata)
+        xdata = new_xdata = dict_new();
+
+    if (!xdata)
+        goto wind;
+
+    ret = 0;
+    if (conf->max_file_size)
+        ret = dict_set(xdata, GF_CONTENT_KEY,
+                       data_from_uint64(conf->max_file_size));
+    if (ret)
+        gf_msg(this->name, GF_LOG_WARNING, 0, QUICK_READ_MSG_DICT_SET_FAILED,
+               "cannot set key in request dict (%s)", loc->path);
+wind:
+    STACK_WIND(frame, qr_lookup_cbk, FIRST_CHILD(this),
+               FIRST_CHILD(this)->fops->lookup, loc, xdata);
+
+    /* release the dict created here; a caller-supplied one is untouched */
+    if (new_xdata)
+        dict_unref(new_xdata);
+
+    return 0;
+}
+
+/*
+ * readdirp callback: use the iatt of every returned entry that already has a
+ * quick-read context to revalidate its cached content, then unwind the reply
+ * unchanged.
+ */
+int
+qr_readdirp_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int op_ret,
+                int op_errno, gf_dirent_t *entries, dict_t *xdata)
+{
+    qr_local_t *local = frame->local;
+    gf_dirent_t *entry = NULL;
+    qr_inode_t *qr_inode = NULL;
+
+    if (op_ret > 0) {
+        list_for_each_entry(entry, &entries->list, list)
+        {
+            if (!entry->inode)
+                continue;
+
+            qr_inode = qr_inode_ctx_get(this, entry->inode);
+            if (qr_inode)
+                qr_content_refresh(this, qr_inode, &entry->d_stat,
+                                   local->incident_gen);
+            /* else: no harm */
+        }
+    }
+
+    QR_STACK_UNWIND(readdirp, frame, op_ret, op_errno, entries, xdata);
+    return 0;
+}
+
+/*
+ * readdirp fop: attach a local (whose incident_gen the callback reads) and
+ * wind to the child. Fails the fop with ENOMEM if no local can be obtained.
+ */
+int
+qr_readdirp(call_frame_t *frame, xlator_t *this, fd_t *fd, size_t size,
+            off_t offset, dict_t *xdata)
+{
+    qr_local_t *local = NULL;
+
+    local = qr_local_get(this, NULL);
+    if (!local) {
+        /* NOTE(review): NULL presumed to mean allocation failure;
+         * qr_readdirp_cbk() dereferences frame->local unconditionally */
+        STACK_UNWIND_STRICT(readdirp, frame, -1, ENOMEM, NULL, NULL);
+        return 0;
+    }
+
+    frame->local = local;
+
+    STACK_WIND(frame, qr_readdirp_cbk, FIRST_CHILD(this),
+               FIRST_CHILD(this)->fops->readdirp, fd, size, offset, xdata);
+    return 0;
+}
+
+/*
+ * Try to answer a readv from the content cached on @qr_inode.
+ *
+ * On a hit the data is copied into a fresh iobuf under the table lock, the
+ * inode's LRU position is bumped and the frame is unwound here.
+ *
+ * Returns the number of bytes served (>= 0) when the frame was unwound, or
+ * -1 when the request cannot be served from the cache (no data, offset at or
+ * past the cached size, stale entry, or a resource failure). In that case
+ * the frame is left untouched for the caller to wind.
+ */
+int
+qr_readv_cached(call_frame_t *frame, qr_inode_t *qr_inode, size_t size,
+                off_t offset, uint32_t flags, dict_t *xdata)
+{
+    xlator_t *this = NULL;
+    qr_private_t *priv = NULL;
+    qr_inode_table_t *table = NULL;
+    int op_ret = -1;
+    struct iobuf *iobuf = NULL;
+    struct iobref *iobref = NULL;
+    struct iovec iov = {
+        0,
+    };
+    struct iatt buf = {
+        0,
+    };
+
+    this = frame->this;
+    priv = this->private;
+    table = &priv->table;
+
+    LOCK(&table->lock);
+    {
+        if (!qr_inode->data)
+            goto unlock;
+
+        if (offset >= qr_inode->size)
+            goto unlock;
+
+        if (!__qr_cache_is_fresh(this, qr_inode))
+            goto unlock;
+
+        op_ret = min(size, (qr_inode->size - offset));
+
+        iobuf = iobuf_get2(this->ctx->iobuf_pool, op_ret);
+        if (!iobuf) {
+            op_ret = -1;
+            goto unlock;
+        }
+
+        iobref = iobref_new();
+        if (!iobref) {
+            op_ret = -1;
+            goto unlock;
+        }
+
+        /* If the iobuf cannot be attached, the iobref handed to the caller
+         * would not keep the buffer alive once our own reference is dropped
+         * below; treat that as a miss instead. */
+        if (iobref_add(iobref, iobuf) != 0) {
+            op_ret = -1;
+            goto unlock;
+        }
+
+        memcpy(iobuf->ptr, qr_inode->data + offset, op_ret);
+
+        buf = qr_inode->buf;
+
+        /* bump LRU */
+        __qr_inode_register(frame->this, table, qr_inode);
+    }
+unlock:
+    UNLOCK(&table->lock);
+
+    if (op_ret >= 0) {
+        iov.iov_base = iobuf->ptr;
+        iov.iov_len = op_ret;
+
+        GF_ATOMIC_INC(priv->qr_counter.cache_hit);
+        STACK_UNWIND_STRICT(readv, frame, op_ret, 0, &iov, 1, &buf, iobref,
+                            xdata);
+    } else {
+        GF_ATOMIC_INC(priv->qr_counter.cache_miss);
+    }
+
+    /* drop our own references; on a hit the iobref holds one on the iobuf */
+    if (iobuf)
+        iobuf_unref(iobuf);
+
+    if (iobref)
+        iobref_unref(iobref);
+
+    return op_ret;
+}
+
+/*
+ * readv fop: serve the read from the cache when qr_readv_cached() can,
+ * otherwise pass it on to the child.
+ */
+int
+qr_readv(call_frame_t *frame, xlator_t *this, fd_t *fd, size_t size,
+         off_t offset, uint32_t flags, dict_t *xdata)
+{
+    qr_inode_t *qr_inode = qr_inode_ctx_get(this, fd->inode);
+
+    /* a non-negative return means the frame has already been unwound */
+    if (qr_inode &&
+        (qr_readv_cached(frame, qr_inode, size, offset, flags, xdata) >= 0))
+        return 0;
+
+    STACK_WIND(frame, default_readv_cbk, FIRST_CHILD(this),
+               FIRST_CHILD(this)->fops->readv, fd, size, offset, flags, xdata);
+    return 0;
+}
+
+/*
+ * writev callback: prune the cache of the written inode, whatever the
+ * outcome of the write, then unwind the reply unchanged.
+ */
+int32_t
+qr_writev_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret,
+              int32_t op_errno, struct iatt *prebuf, struct iatt *postbuf,
+              dict_t *xdata)
+{
+    qr_local_t *local = frame->local;
+
+    qr_inode_prune(this, local->fd->inode, local->incident_gen);
+
+    QR_STACK_UNWIND(writev, frame, op_ret, op_errno, prebuf, postbuf, xdata);
+    return 0;
+}
+
+/*
+ * writev fop: remember the fd in a local so qr_writev_cbk() can prune the
+ * inode's cache, then wind to the child. Fails the fop with ENOMEM if no
+ * local can be obtained.
+ */
+int
+qr_writev(call_frame_t *frame, xlator_t *this, fd_t *fd, struct iovec *iov,
+          int count, off_t offset, uint32_t flags, struct iobref *iobref,
+          dict_t *xdata)
+{
+    qr_local_t *local = NULL;
+
+    local = qr_local_get(this, fd->inode);
+    if (!local) {
+        /* NOTE(review): NULL presumed to mean allocation failure; without a
+         * local the callback could not invalidate the cache */
+        STACK_UNWIND_STRICT(writev, frame, -1, ENOMEM, NULL, NULL, NULL);
+        return 0;
+    }
+
+    local->fd = fd_ref(fd);
+    frame->local = local;
+
+    STACK_WIND(frame, qr_writev_cbk, FIRST_CHILD(this),
+               FIRST_CHILD(this)->fops->writev, fd, iov, count, offset, flags,
+               iobref, xdata);
+    return 0;
+}
+
+/*
+ * truncate callback: prune the cache of the truncated inode, then unwind the
+ * reply unchanged.
+ */
+int32_t
+qr_truncate_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+                int32_t op_ret, int32_t op_errno, struct iatt *prebuf,
+                struct iatt *postbuf, dict_t *xdata)
+{
+    qr_local_t *local = frame->local;
+
+    qr_inode_prune(this, local->inode, local->incident_gen);
+
+    QR_STACK_UNWIND(truncate, frame, op_ret, op_errno, prebuf, postbuf, xdata);
+    return 0;
+}
+
+/*
+ * truncate fop: remember the inode in a local so qr_truncate_cbk() can prune
+ * its cache, then wind to the child. Fails the fop with ENOMEM if no local
+ * can be obtained.
+ */
+int
+qr_truncate(call_frame_t *frame, xlator_t *this, loc_t *loc, off_t offset,
+            dict_t *xdata)
+{
+    qr_local_t *local = NULL;
+
+    local = qr_local_get(this, loc->inode);
+    if (!local) {
+        /* NOTE(review): NULL presumed to mean allocation failure */
+        STACK_UNWIND_STRICT(truncate, frame, -1, ENOMEM, NULL, NULL, NULL);
+        return 0;
+    }
+
+    local->inode = inode_ref(loc->inode);
+    frame->local = local;
+
+    STACK_WIND(frame, qr_truncate_cbk, FIRST_CHILD(this),
+               FIRST_CHILD(this)->fops->truncate, loc, offset, xdata);
+    return 0;
+}
+
+/*
+ * ftruncate callback: prune the cache of the fd's inode, then unwind the
+ * reply unchanged.
+ */
+int32_t
+qr_ftruncate_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+                 int32_t op_ret, int32_t op_errno, struct iatt *prebuf,
+                 struct iatt *postbuf, dict_t *xdata)
+{
+    qr_local_t *local = frame->local;
+
+    qr_inode_prune(this, local->fd->inode, local->incident_gen);
+
+    QR_STACK_UNWIND(ftruncate, frame, op_ret, op_errno, prebuf, postbuf, xdata);
+    return 0;
+}
+
+/*
+ * ftruncate fop: remember the fd in a local so qr_ftruncate_cbk() can prune
+ * the inode's cache, then wind to the child. Fails the fop with ENOMEM if no
+ * local can be obtained.
+ */
+int
+qr_ftruncate(call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset,
+             dict_t *xdata)
+{
+    qr_local_t *local = NULL;
+
+    local = qr_local_get(this, fd->inode);
+    if (!local) {
+        /* NOTE(review): NULL presumed to mean allocation failure */
+        STACK_UNWIND_STRICT(ftruncate, frame, -1, ENOMEM, NULL, NULL, NULL);
+        return 0;
+    }
+
+    local->fd = fd_ref(fd);
+    frame->local = local;
+
+    STACK_WIND(frame, qr_ftruncate_cbk, FIRST_CHILD(this),
+               FIRST_CHILD(this)->fops->ftruncate, fd, offset, xdata);
+    return 0;
+}
+
+/*
+ * fallocate callback: prune the cache of the fd's inode, then unwind the
+ * reply unchanged.
+ */
+int32_t
+qr_fallocate_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+                 int32_t op_ret, int32_t op_errno, struct iatt *pre,
+                 struct iatt *post, dict_t *xdata)
+{
+    qr_local_t *local = frame->local;
+
+    qr_inode_prune(this, local->fd->inode, local->incident_gen);
+
+    QR_STACK_UNWIND(fallocate, frame, op_ret, op_errno, pre, post, xdata);
+    return 0;
+}
+
+/*
+ * fallocate fop: remember the fd in a local so qr_fallocate_cbk() can prune
+ * the inode's cache, then wind to the child. Fails the fop with ENOMEM if no
+ * local can be obtained.
+ */
+static int
+qr_fallocate(call_frame_t *frame, xlator_t *this, fd_t *fd, int keep_size,
+             off_t offset, size_t len, dict_t *xdata)
+{
+    qr_local_t *local = NULL;
+
+    local = qr_local_get(this, fd->inode);
+    if (!local) {
+        /* NOTE(review): NULL presumed to mean allocation failure */
+        STACK_UNWIND_STRICT(fallocate, frame, -1, ENOMEM, NULL, NULL, NULL);
+        return 0;
+    }
+
+    local->fd = fd_ref(fd);
+    frame->local = local;
+
+    STACK_WIND(frame, qr_fallocate_cbk, FIRST_CHILD(this),
+               FIRST_CHILD(this)->fops->fallocate, fd, keep_size, offset, len,
+               xdata);
+    return 0;
+}
+
+/*
+ * discard callback: prune the cache of the fd's inode, then unwind the reply
+ * unchanged.
+ */
+int32_t
+qr_discard_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+               int32_t op_ret, int32_t op_errno, struct iatt *pre,
+               struct iatt *post, dict_t *xdata)
+{
+    qr_local_t *local = frame->local;
+
+    qr_inode_prune(this, local->fd->inode, local->incident_gen);
+
+    QR_STACK_UNWIND(discard, frame, op_ret, op_errno, pre, post, xdata);
+    return 0;
+}
+
+/*
+ * discard fop: remember the fd in a local so qr_discard_cbk() can prune the
+ * inode's cache, then wind to the child. Fails the fop with ENOMEM if no
+ * local can be obtained.
+ */
+static int
+qr_discard(call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset,
+           size_t len, dict_t *xdata)
+{
+    qr_local_t *local = NULL;
+
+    local = qr_local_get(this, fd->inode);
+    if (!local) {
+        /* NOTE(review): NULL presumed to mean allocation failure */
+        STACK_UNWIND_STRICT(discard, frame, -1, ENOMEM, NULL, NULL, NULL);
+        return 0;
+    }
+
+    local->fd = fd_ref(fd);
+    frame->local = local;
+
+    STACK_WIND(frame, qr_discard_cbk, FIRST_CHILD(this),
+               FIRST_CHILD(this)->fops->discard, fd, offset, len, xdata);
+    return 0;
+}
+
+/*
+ * zerofill callback: prune the cache of the fd's inode, then unwind the
+ * reply unchanged.
+ */
+int32_t
+qr_zerofill_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+                int32_t op_ret, int32_t op_errno, struct iatt *pre,
+                struct iatt *post, dict_t *xdata)
+{
+    qr_local_t *local = frame->local;
+
+    qr_inode_prune(this, local->fd->inode, local->incident_gen);
+
+    QR_STACK_UNWIND(zerofill, frame, op_ret, op_errno, pre, post, xdata);
+    return 0;
+}
+
+/*
+ * zerofill fop: remember the fd in a local so qr_zerofill_cbk() can prune
+ * the inode's cache, then wind to the child. Fails the fop with ENOMEM if no
+ * local can be obtained.
+ */
+static int
+qr_zerofill(call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset,
+            off_t len, dict_t *xdata)
+{
+    qr_local_t *local = NULL;
+
+    local = qr_local_get(this, fd->inode);
+    if (!local) {
+        /* NOTE(review): NULL presumed to mean allocation failure */
+        STACK_UNWIND_STRICT(zerofill, frame, -1, ENOMEM, NULL, NULL, NULL);
+        return 0;
+    }
+
+    local->fd = fd_ref(fd);
+    frame->local = local;
+
+    STACK_WIND(frame, qr_zerofill_cbk, FIRST_CHILD(this),
+               FIRST_CHILD(this)->fops->zerofill, fd, offset, len, xdata);
+    return 0;
+}
+
+/*
+ * open fop: hand the inode and the path being opened to
+ * qr_inode_set_priority(), then pass the open through to the child with the
+ * default callback.
+ */
+int
+qr_open(call_frame_t *frame, xlator_t *this, loc_t *loc, int flags, fd_t *fd,
+        dict_t *xdata)
+{
+    qr_inode_set_priority(this, fd->inode, loc->path);
+
+    STACK_WIND(frame, default_open_cbk, FIRST_CHILD(this),
+               FIRST_CHILD(this)->fops->open, loc, flags, fd, xdata);
+    return 0;
+}
+
+/*
+ * forget callback: prune the cached data of @inode and free its quick-read
+ * context. Always returns 0.
+ */
+int
+qr_forget(xlator_t *this, inode_t *inode)
+{
+    qr_inode_t *qr_inode = qr_inode_ctx_get(this, inode);
+
+    if (qr_inode != NULL) {
+        /* prune first so the entry is off the LRU before it is freed */
+        qr_inode_prune(this, inode, qr_get_generation(this, inode));
+
+        GF_FREE(qr_inode);
+    }
+
+    return 0;
+}
+
+/*
+ * Statedump hook for a single inode: reports whether the file is cached and,
+ * if known, when its cache was last validated.
+ *
+ * Returns 0 on success, -1 when the inode has no quick-read context.
+ */
+int32_t
+qr_inodectx_dump(xlator_t *this, inode_t *inode)
+{
+    qr_inode_t *qr_inode = NULL;
+    char key_prefix[GF_DUMP_MAX_BUF_LEN] = {
+        0,
+    };
+    char buf[GF_TIMESTR_SIZE] = {
+        0,
+    };
+
+    qr_inode = qr_inode_ctx_get(this, inode);
+    if (qr_inode == NULL)
+        return -1;
+
+    gf_proc_dump_build_key(key_prefix, "xlator.performance.quick-read",
+                           "inodectx");
+    gf_proc_dump_add_section("%s", key_prefix);
+
+    gf_proc_dump_write("entire-file-cached", "%s",
+                       qr_inode->data ? "yes" : "no");
+
+    if (qr_inode->last_refresh != 0) {
+        gf_time_fmt(buf, sizeof buf, qr_inode->last_refresh, gf_timefmt_FT);
+        gf_proc_dump_write("last-cache-validation-time", "%s", buf);
+    }
+
+    return 0;
+}
+
+/*
+ * Statedump hook for the translator's private state: configuration values,
+ * number and total size of cached files (counted by walking every LRU
+ * list), and the hit/miss/invalidation counters.
+ *
+ * Returns 0 on success, -1 when @this or its private data is missing.
+ */
+int
+qr_priv_dump(xlator_t *this)
+{
+    qr_conf_t *conf = NULL;
+    qr_private_t *priv = NULL;
+    qr_inode_table_t *table = NULL;
+    uint32_t file_count = 0;
+    uint32_t i = 0;
+    qr_inode_t *curr = NULL;
+    uint64_t total_size = 0;
+    char key_prefix[GF_DUMP_MAX_BUF_LEN];
+
+    /* priv is the pointer that can actually be NULL; the addresses of its
+     * embedded conf and table members never are */
+    if (!this || !this->private) {
+        return -1;
+    }
+
+    priv = this->private;
+    conf = &priv->conf;
+    table = &priv->table;
+
+    gf_proc_dump_build_key(key_prefix, "xlator.performance.quick-read", "priv");
+
+    gf_proc_dump_add_section("%s", key_prefix);
+
+    gf_proc_dump_write("max_file_size", "%" PRIu64, conf->max_file_size);
+    gf_proc_dump_write("cache_timeout", "%d", conf->cache_timeout);
+
+    for (i = 0; i < conf->max_pri; i++) {
+        list_for_each_entry(curr, &table->lru[i], lru)
+        {
+            file_count++;
+            total_size += curr->size;
+        }
+    }
+
+    /* file_count is unsigned, so print it as such */
+    gf_proc_dump_write("total_files_cached", "%" PRIu32, file_count);
+    gf_proc_dump_write("total_cache_used", "%" PRIu64, total_size);
+    gf_proc_dump_write("cache-hit", "%" GF_PRI_ATOMIC,
+                       GF_ATOMIC_GET(priv->qr_counter.cache_hit));
+    gf_proc_dump_write("cache-miss", "%" GF_PRI_ATOMIC,
+                       GF_ATOMIC_GET(priv->qr_counter.cache_miss));
+    gf_proc_dump_write("cache-invalidations", "%" GF_PRI_ATOMIC,
+                       GF_ATOMIC_GET(priv->qr_counter.file_data_invals));
+
+    return 0;
+}
+
+/*
+ * Metrics hook: write the cache counters and current cache usage to @fd, one
+ * "<xlator-name>.<metric> <value>" line each. Always returns 0.
+ */
+static int32_t
+qr_dump_metrics(xlator_t *this, int fd)
+{
+    qr_private_t *priv = NULL;
+    qr_inode_table_t *table = NULL;
+
+    priv = this->private;
+    table = &priv->table;
+
+    /* atomics use GF_PRI_ATOMIC, as in qr_priv_dump() */
+    dprintf(fd, "%s.total_files_cached %" GF_PRI_ATOMIC "\n", this->name,
+            GF_ATOMIC_GET(priv->qr_counter.files_cached));
+    /* cache_used is a uint64_t and needs the unsigned conversion */
+    dprintf(fd, "%s.total_cache_used %" PRIu64 "\n", this->name,
+            table->cache_used);
+    dprintf(fd, "%s.cache-hit %" GF_PRI_ATOMIC "\n", this->name,
+            GF_ATOMIC_GET(priv->qr_counter.cache_hit));
+    dprintf(fd, "%s.cache-miss %" GF_PRI_ATOMIC "\n", this->name,
+            GF_ATOMIC_GET(priv->qr_counter.cache_miss));
+    dprintf(fd, "%s.cache-invalidations %" GF_PRI_ATOMIC "\n", this->name,
+            GF_ATOMIC_GET(priv->qr_counter.file_data_invals));
+
+    return 0;
+}
+
+/*
+ * Set up memory accounting for this translator's allocation types.
+ * Returns the result of xlator_mem_acct_init(), or -1 when @this is NULL.
+ */
+int32_t
+qr_mem_acct_init(xlator_t *this)
+{
+    int ret = -1;
+
+    if (this == NULL)
+        return -1;
+
+    ret = xlator_mem_acct_init(this, gf_qr_mt_end + 1);
+    if (ret != 0)
+        gf_msg(this->name, GF_LOG_ERROR, ENOMEM, QUICK_READ_MSG_NO_MEMORY,
+               "Memory accounting init failed");
+
+    return ret;
+}
+
+/*
+ * Validate a requested cache size against the largest size allowed: the
+ * value reported by get_mem_size(), or the "cache-size" option's declared
+ * maximum when get_mem_size() returns its -1 failure value.
+ *
+ * Returns _gf_true when @cache_size is acceptable; _gf_false (after logging)
+ * when it is too large or the option definition cannot be found.
+ */
+static gf_boolean_t
+check_cache_size_ok(xlator_t *this, int64_t cache_size)
+{
+    gf_boolean_t ret = _gf_true;
+    uint64_t total_mem = 0;
+    uint64_t max_cache_size = 0;
+    volume_option_t *opt = NULL;
+
+    GF_ASSERT(this);
+    opt = xlator_volume_option_get(this, "cache-size");
+    if (!opt) {
+        ret = _gf_false;
+        gf_msg(this->name, GF_LOG_ERROR, EINVAL,
+               QUICK_READ_MSG_INVALID_ARGUMENT,
+               "could not get cache-size option");
+        goto out;
+    }
+
+    total_mem = get_mem_size();
+    if ((uint64_t)-1 == total_mem) {
+        /* The option table declares .max = INFINITY for cache-size;
+         * converting a value that does not fit into uint64_t is undefined,
+         * so clamp it explicitly. */
+        if (opt->max >= (double)UINT64_MAX)
+            max_cache_size = UINT64_MAX;
+        else
+            max_cache_size = opt->max;
+    } else {
+        max_cache_size = total_mem;
+    }
+
+    gf_msg_debug(this->name, 0, "Max cache size is %" PRIu64, max_cache_size);
+    /* callers pass uint64_t sizes; compare and print as unsigned */
+    if ((uint64_t)cache_size > max_cache_size) {
+        ret = _gf_false;
+        gf_msg(this->name, GF_LOG_ERROR, 0, QUICK_READ_MSG_INVALID_ARGUMENT,
+               "Cache size %" PRIu64
+               " is greater than the max size of %" PRIu64,
+               (uint64_t)cache_size, max_cache_size);
+        goto out;
+    }
+out:
+    return ret;
+}
+
+/*
+ * Apply changed volume options at run time: cache-timeout,
+ * quick-read-cache-invalidation, ctime-invalidation and cache-size. The new
+ * cache size is only adopted if check_cache_size_ok() accepts it.
+ *
+ * Returns 0 on success, -1 on invalid arguments or a rejected option.
+ */
+int
+qr_reconfigure(xlator_t *this, dict_t *options)
+{
+    int32_t ret = -1;
+    qr_private_t *priv = NULL;
+    qr_conf_t *conf = NULL;
+    uint64_t cache_size_new = 0;
+
+    GF_VALIDATE_OR_GOTO("quick-read", this, out);
+    GF_VALIDATE_OR_GOTO(this->name, this->private, out);
+    GF_VALIDATE_OR_GOTO(this->name, options, out);
+
+    priv = this->private;
+    conf = &priv->conf;
+
+    GF_OPTION_RECONF("cache-timeout", conf->cache_timeout, options, int32, out);
+
+    GF_OPTION_RECONF("quick-read-cache-invalidation", conf->qr_invalidation,
+                     options, bool, out);
+
+    GF_OPTION_RECONF("ctime-invalidation", conf->ctime_invalidation, options,
+                     bool, out);
+
+    GF_OPTION_RECONF("cache-size", cache_size_new, options, size_uint64, out);
+    if (check_cache_size_ok(this, cache_size_new)) {
+        conf->cache_size = cache_size_new;
+        ret = 0;
+    } else {
+        ret = -1;
+        gf_msg(this->name, GF_LOG_ERROR, EINVAL, QUICK_READ_MSG_INVALID_CONFIG,
+               "Not reconfiguring cache-size");
+    }
+out:
+    return ret;
+}
+
+/*
+ * Parse a priority option string of the form "pattern:prio,pattern:prio,..."
+ * and append one struct qr_priority per entry to the list @first.
+ *
+ * Returns the highest priority seen (at least 1), or -1 on allocation
+ * failure or a malformed entry; in that case every entry on @first is freed.
+ *
+ * A priority must be a non-negative integer below INT32_MAX. qr_init()
+ * allocates (returned value + 1) LRU list heads, so a negative value would
+ * presumably be used as an out-of-range list index and INT32_MAX would
+ * overflow that increment.
+ */
+int32_t
+qr_get_priority_list(const char *opt_str, struct list_head *first)
+{
+    int32_t max_pri = 1;
+    long value = 0;
+    char *tmp_str = NULL;
+    char *tmp_str1 = NULL;
+    char *tmp_str2 = NULL;
+    char *dup_str = NULL;
+    char *priority_str = NULL;
+    char *pattern = NULL;
+    char *priority = NULL;
+    char *string = NULL;
+    struct qr_priority *curr = NULL, *tmp = NULL;
+
+    GF_VALIDATE_OR_GOTO("quick-read", opt_str, out);
+    GF_VALIDATE_OR_GOTO("quick-read", first, out);
+
+    /* strtok_r() modifies its input, so work on a private copy */
+    string = gf_strdup(opt_str);
+    if (string == NULL) {
+        max_pri = -1;
+        goto out;
+    }
+
+    /* Get the pattern for cache priority.
+     * "option priority *.jpg:1,abc*:2" etc
+     */
+    /* TODO: inode_lru in table is statically hard-coded to 5,
+     * should be changed to run-time configuration
+     */
+    priority_str = strtok_r(string, ",", &tmp_str);
+    while (priority_str) {
+        curr = GF_CALLOC(1, sizeof(*curr), gf_qr_mt_qr_priority_t);
+        if (curr == NULL) {
+            max_pri = -1;
+            goto out;
+        }
+
+        /* linked immediately so the error path below frees it */
+        list_add_tail(&curr->list, first);
+
+        dup_str = gf_strdup(priority_str);
+        if (dup_str == NULL) {
+            max_pri = -1;
+            goto out;
+        }
+
+        pattern = strtok_r(dup_str, ":", &tmp_str1);
+        if (!pattern) {
+            max_pri = -1;
+            goto out;
+        }
+
+        priority = strtok_r(NULL, ":", &tmp_str1);
+        if (!priority) {
+            max_pri = -1;
+            goto out;
+        }
+
+        gf_msg_trace("quick-read", 0,
+                     "quick-read priority : pattern %s : priority %s", pattern,
+                     priority);
+
+        curr->pattern = gf_strdup(pattern);
+        if (curr->pattern == NULL) {
+            max_pri = -1;
+            goto out;
+        }
+
+        /* Reject trailing garbage, negative values and anything that does
+         * not fit; strtol() saturates at LONG_MIN/LONG_MAX on overflow, so
+         * the range test also covers that case. */
+        value = strtol(priority, &tmp_str2, 0);
+        if ((tmp_str2 && (*tmp_str2)) || (value < 0) || (value >= INT32_MAX)) {
+            max_pri = -1;
+            goto out;
+        }
+
+        curr->priority = (int32_t)value;
+        max_pri = max(max_pri, curr->priority);
+
+        GF_FREE(dup_str);
+        dup_str = NULL;
+
+        priority_str = strtok_r(NULL, ",", &tmp_str);
+    }
+out:
+    GF_FREE(string);
+
+    GF_FREE(dup_str);
+
+    if (max_pri == -1) {
+        list_for_each_entry_safe(curr, tmp, first, list)
+        {
+            list_del_init(&curr->list);
+            GF_FREE(curr->pattern);
+            GF_FREE(curr);
+        }
+    }
+
+    return max_pri;
+}
+
+/*
+ * Translator init: validate the graph position, allocate the private state,
+ * read the volume options, parse the optional "priority" option and allocate
+ * one LRU list head per priority level.
+ *
+ * Returns 0 on success with this->private set, -1 on failure with everything
+ * allocated here released again.
+ */
+int32_t
+qr_init(xlator_t *this)
+{
+    int32_t ret = -1, i = 0;
+    qr_private_t *priv = NULL;
+    qr_conf_t *conf = NULL;
+    struct qr_priority *curr = NULL, *tmp = NULL;
+
+    if (!this->children || this->children->next) {
+        gf_msg(this->name, GF_LOG_ERROR, 0,
+               QUICK_READ_MSG_XLATOR_CHILD_MISCONFIGURED,
+               "FATAL: volume (%s) not configured with exactly one "
+               "child",
+               this->name);
+        return -1;
+    }
+
+    if (!this->parents) {
+        gf_msg(this->name, GF_LOG_WARNING, 0, QUICK_READ_MSG_VOL_MISCONFIGURED,
+               "dangling volume. check volfile ");
+    }
+
+    priv = GF_CALLOC(1, sizeof(*priv), gf_qr_mt_qr_private_t);
+    if (priv == NULL) {
+        ret = -1;
+        goto out;
+    }
+
+    LOCK_INIT(&priv->table.lock);
+    /* priv->lock is taken by qr_update_child_down_time() */
+    LOCK_INIT(&priv->lock);
+    conf = &priv->conf;
+    /* initialised up front so the error path can always walk the list */
+    INIT_LIST_HEAD(&conf->priority_list);
+
+    GF_OPTION_INIT("max-file-size", conf->max_file_size, size_uint64, out);
+
+    GF_OPTION_INIT("cache-timeout", conf->cache_timeout, int32, out);
+
+    GF_OPTION_INIT("quick-read-cache-invalidation", conf->qr_invalidation, bool,
+                   out);
+
+    GF_OPTION_INIT("cache-size", conf->cache_size, size_uint64, out);
+    if (!check_cache_size_ok(this, conf->cache_size)) {
+        ret = -1;
+        goto out;
+    }
+
+    GF_OPTION_INIT("ctime-invalidation", conf->ctime_invalidation, bool, out);
+
+    conf->max_pri = 1;
+    if (dict_get(this->options, "priority")) {
+        char *option_list = data_to_str(dict_get(this->options, "priority"));
+        gf_msg_trace(this->name, 0, "option path %s", option_list);
+        /* parse the list of pattern:priority */
+        conf->max_pri = qr_get_priority_list(option_list, &conf->priority_list);
+
+        if (conf->max_pri == -1) {
+            goto out;
+        }
+        /* one list per priority level 0..max */
+        conf->max_pri++;
+    }
+
+    priv->table.lru = GF_CALLOC(conf->max_pri, sizeof(*priv->table.lru),
+                                gf_common_mt_list_head);
+    if (priv->table.lru == NULL) {
+        ret = -1;
+        goto out;
+    }
+
+    for (i = 0; i < conf->max_pri; i++) {
+        INIT_LIST_HEAD(&priv->table.lru[i]);
+    }
+
+    ret = 0;
+
+    priv->last_child_down = gf_time();
+    GF_ATOMIC_INIT(priv->generation, 0);
+    this->private = priv;
+out:
+    if ((ret == -1) && priv) {
+        /* entries parsed from "priority" would otherwise leak when a later
+         * step fails */
+        list_for_each_entry_safe(curr, tmp, &conf->priority_list, list)
+        {
+            list_del(&curr->list);
+            GF_FREE(curr->pattern);
+            GF_FREE(curr);
+        }
+
+        LOCK_DESTROY(&priv->lock);
+        LOCK_DESTROY(&priv->table.lock);
+        GF_FREE(priv);
+    }
+
+    return ret;
+}
+
+/*
+ * Tear down the inode table: log any LRU list that still has entries, free
+ * the array of list heads allocated in qr_init() and destroy the table lock.
+ *
+ * The array is only freed when every list is empty. Entries still linked
+ * would otherwise keep pointers into freed memory, so in that case the array
+ * is deliberately left allocated.
+ */
+void
+qr_inode_table_destroy(qr_private_t *priv)
+{
+    int i = 0;
+    qr_conf_t *conf = NULL;
+    gf_boolean_t all_empty = _gf_true;
+
+    conf = &priv->conf;
+
+    for (i = 0; i < conf->max_pri; i++) {
+        /* There is a known leak of inodes, hence until
+         * that is fixed, log the assert as warning.
+        GF_ASSERT (list_empty (&priv->table.lru[i]));*/
+        if (!list_empty(&priv->table.lru[i])) {
+            all_empty = _gf_false;
+            gf_msg("quick-read", GF_LOG_INFO, 0, QUICK_READ_MSG_LRU_NOT_EMPTY,
+                   "quick read inode table lru not empty");
+        }
+    }
+
+    if (all_empty) {
+        GF_FREE(priv->table.lru);
+        priv->table.lru = NULL;
+    }
+
+    LOCK_DESTROY(&priv->table.lock);
+
+    return;
+}
+
+/* Free every struct qr_priority entry linked on conf->priority_list. */
+void
+qr_conf_destroy(qr_conf_t *conf)
+{
+    struct qr_priority *entry = NULL;
+    struct qr_priority *next = NULL;
+
+    list_for_each_entry_safe(entry, next, &conf->priority_list, list)
+    {
+        list_del(&entry->list);
+        GF_FREE(entry->pattern);
+        GF_FREE(entry);
+    }
+}
+
+/*
+ * Record @now as the time of the most recent child-down event.
+ *
+ * priv->last_child_down is read by __qr_cache_is_fresh(), which
+ * qr_readv_cached() calls with priv->table.lock held. The update therefore
+ * takes that same lock, which is also the one lock qr_init() is seen to
+ * initialise.
+ */
+void
+qr_update_child_down_time(xlator_t *this, time_t now)
+{
+    qr_private_t *priv = NULL;
+
+    priv = this->private;
+
+    LOCK(&priv->table.lock);
+    {
+        priv->last_child_down = now;
+    }
+    UNLOCK(&priv->table.lock);
+}
+
+/*
+ * Handle an upcall notification. For a cache-invalidation event whose flags
+ * intersect UP_WRITE_FLAGS, count the invalidation and prune the cached data
+ * of the inode identified by the upcall's gfid.
+ *
+ * Returns 0, or -1 when such an event names an inode that inode_find()
+ * cannot locate.
+ */
+static int
+qr_invalidate(xlator_t *this, void *data)
+{
+    struct gf_upcall *up_data = (struct gf_upcall *)data;
+    struct gf_upcall_cache_invalidation *up_ci = NULL;
+    inode_table_t *itable = NULL;
+    inode_t *inode = NULL;
+    qr_private_t *priv = NULL;
+
+    if (up_data->event_type != GF_UPCALL_CACHE_INVALIDATION)
+        return 0;
+
+    priv = this->private;
+    up_ci = (struct gf_upcall_cache_invalidation *)up_data->data;
+
+    if (!up_ci || !(up_ci->flags & UP_WRITE_FLAGS))
+        return 0;
+
+    GF_ATOMIC_INC(priv->qr_counter.file_data_invals);
+
+    itable = ((xlator_t *)this->graph->top)->itable;
+    inode = inode_find(itable, up_data->gfid);
+    if (!inode)
+        return -1;
+
+    qr_inode_prune(this, inode, qr_get_generation(this, inode));
+
+    /* release the reference obtained from inode_find() */
+    inode_unref(inode);
+
+    return 0;
+}
+
+/*
+ * Event hook. Child-down events update the child-down timestamp; upcall
+ * events are passed to qr_invalidate() when conf->qr_invalidation is set.
+ * Every event is then forwarded through default_notify().
+ *
+ * Returns -1 if default_notify() fails, otherwise the result of the handling
+ * above (0 unless qr_invalidate() reported an error).
+ */
+int
+qr_notify(xlator_t *this, int event, void *data, ...)
+{
+    qr_private_t *priv = this->private;
+    qr_conf_t *conf = &priv->conf;
+    int ret = 0;
+
+    if ((event == GF_EVENT_CHILD_DOWN) ||
+        (event == GF_EVENT_SOME_DESCENDENT_DOWN)) {
+        qr_update_child_down_time(this, gf_time());
+    } else if ((event == GF_EVENT_UPCALL) && conf->qr_invalidation) {
+        ret = qr_invalidate(this, data);
+    }
+
+    if (default_notify(this, event, data) != 0)
+        ret = -1;
+
+    return ret;
+}
+
+/*
+ * Translator fini: tear down the inode table and the configuration, detach
+ * the private structure from @this and free it. Safe to call with a NULL
+ * @this or when no private data is attached.
+ */
+void
+qr_fini(xlator_t *this)
+{
+    qr_private_t *priv = NULL;
+
+    if (this == NULL)
+        return;
+
+    priv = this->private;
+    if (priv == NULL)
+        return;
+
+    qr_inode_table_destroy(priv);
+    qr_conf_destroy(&priv->conf);
+
+    this->private = NULL;
+
+    GF_FREE(priv);
+}
+
+/* File operations this translator implements itself. */
+struct xlator_fops qr_fops = {
+    .lookup = qr_lookup,
+    .readdirp = qr_readdirp,
+    .open = qr_open,
+    .readv = qr_readv,
+    .writev = qr_writev,
+    .truncate = qr_truncate,
+    .ftruncate = qr_ftruncate,
+    .fallocate = qr_fallocate,
+    .discard = qr_discard,
+    .zerofill = qr_zerofill,
+};
+
+struct xlator_cbks qr_cbks = {
+    .forget = qr_forget, /* prunes and frees the inode's quick-read ctx */
+};
+
+struct xlator_dumpops qr_dumpops = {
+    .priv = qr_priv_dump,         /* configuration and cache totals */
+    .inodectx = qr_inodectx_dump, /* per-inode cache state */
+};
+
+/*
+ * Volume options understood by quick-read. The keys are the ones read in
+ * qr_init() and qr_reconfigure(); the table ends with a NULL-key entry.
+ */
+struct volume_options qr_options[] = {
+    {
+        .key = {"quick-read"},
+        .type = GF_OPTION_TYPE_BOOL,
+        .default_value = "off",
+        .description = "enable/disable quick-read",
+        .op_version = {GD_OP_VERSION_6_0},
+        .flags = OPT_FLAG_SETTABLE,
+    },
+    {.key = {"priority"}, .type = GF_OPTION_TYPE_ANY},
+    {.key = {"cache-size"},
+     .type = GF_OPTION_TYPE_SIZET,
+     .min = 0,
+     .max = INFINITY,
+     .default_value = "128MB",
+     .op_version = {1},
+     .flags = OPT_FLAG_CLIENT_OPT | OPT_FLAG_SETTABLE | OPT_FLAG_DOC,
+     .description = "Size of small file read cache."},
+    {
+        .key = {"cache-timeout"},
+        .type = GF_OPTION_TYPE_INT,
+        .default_value = "1",
+        .op_version = {1},
+        .flags = OPT_FLAG_CLIENT_OPT | OPT_FLAG_SETTABLE | OPT_FLAG_DOC,
+        .description = "Time (in seconds) for which cached file content is "
+                       "considered fresh.",
+    },
+    {
+        .key = {"max-file-size"},
+        .type = GF_OPTION_TYPE_SIZET,
+        .min = 0,
+        .max = 1 * GF_UNIT_KB * 1000,
+        .default_value = "64KB",
+        .op_version = {1},
+        .flags = OPT_FLAG_CLIENT_OPT | OPT_FLAG_SETTABLE | OPT_FLAG_DOC,
+        .description = "Maximum size of a file that quick-read will cache.",
+    },
+    {
+        .key = {"quick-read-cache-invalidation"},
+        .type = GF_OPTION_TYPE_BOOL,
+        .default_value = "false",
+        .op_version = {GD_OP_VERSION_4_0_0},
+        .flags = OPT_FLAG_CLIENT_OPT | OPT_FLAG_SETTABLE | OPT_FLAG_DOC,
+        /* this option gates qr_invalidate(), which drops quick-read's own
+         * cached file data, not a metadata cache */
+        .description = "When \"on\", invalidates the quick-read cache of a "
+                       "file on receiving the cache-invalidation "
+                       "notifications",
+    },
+    {
+        .key = {"ctime-invalidation"},
+        .type = GF_OPTION_TYPE_BOOL,
+        .default_value = "false",
+        .op_version = {GD_OP_VERSION_5_0},
+        .flags = OPT_FLAG_CLIENT_OPT | OPT_FLAG_SETTABLE | OPT_FLAG_DOC,
+        .description = "Quick-read by default uses mtime to identify changes "
+                       "to file data. However there are applications like "
+                       "rsync which explicitly set mtime making it unreliable "
+                       "for the purpose of identifying change in file content "
+                       ". Since ctime also changes when content of a file "
+                       " changes and it cannot be set explicitly, it becomes "
+                       " suitable for identifying staleness of cached data. "
+                       "This option makes quick-read to prefer ctime over "
+                       "mtime to validate its cache. However, using ctime "
+                       "can result in false positives as ctime changes with "
+                       "just attribute changes like permission without "
+                       "changes to file data. So, use this only when mtime "
+                       "is not reliable",
+    },
+    {.key = {NULL}}};
+
+xlator_api_t xlator_api = {
+    .init = qr_init, /* allocates and publishes this->private */
+    .fini = qr_fini, /* releases what qr_init() set up */
+    .notify = qr_notify,
+    .reconfigure = qr_reconfigure,
+    .mem_acct_init = qr_mem_acct_init,
+    .dump_metrics = qr_dump_metrics,
+    .op_version = {1}, /* Present from the initial version */
+    .dumpops = &qr_dumpops,
+    .fops = &qr_fops,
+    .cbks = &qr_cbks,
+    .options = qr_options,
+    .identifier = "quick-read",
+    .category = GF_MAINTAINED,
+};
diff --git a/xlators/performance/quick-read/src/quick-read.h b/xlators/performance/quick-read/src/quick-read.h
new file mode 100644
index 00000000000..20fcc70b3a7
--- /dev/null
+++ b/xlators/performance/quick-read/src/quick-read.h
@@ -0,0 +1,91 @@
+/*
+  Copyright (c) 2008-2012 Red Hat, Inc. <http://www.redhat.com>
+  This file is part of GlusterFS.
+
+  This file is licensed to you under your choice of the GNU Lesser
+  General Public License, version 3 or any later version (LGPLv3 or
+  later), or the GNU General Public License, version 2 (GPLv2), in all
+  cases as published by the Free Software Foundation.
+*/
+
+#ifndef __QUICK_READ_H
+#define __QUICK_READ_H
+
+#include <glusterfs/glusterfs.h>
+#include <glusterfs/logging.h>
+#include <glusterfs/dict.h>
+#include <glusterfs/xlator.h>
+#include <glusterfs/list.h>
+#include <glusterfs/compat.h>
+#include <glusterfs/compat-errno.h>
+#include <glusterfs/common-utils.h>
+#include <glusterfs/call-stub.h>
+#include <glusterfs/defaults.h>
+#include <libgen.h>
+#include <sys/time.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <unistd.h>
+#include <fnmatch.h>
+#include "quick-read-mem-types.h"
+
+struct qr_inode {
+    void *data;
+    size_t size;
+    int priority;
+    uint32_t ia_mtime;
+    uint32_t ia_mtime_nsec;
+    uint32_t ia_ctime;
+    uint32_t ia_ctime_nsec;
+    uint32_t gen_rollover;
+    struct iatt buf;
+    time_t last_refresh;
+    struct list_head lru;
+    uint64_t gen;
+    uint64_t invalidation_time;
+};
+typedef struct qr_inode qr_inode_t;
+
+struct qr_priority {
+    char *pattern;
+    int32_t priority;
+    struct list_head list;
+};
+typedef struct qr_priority qr_priority_t;
+
+struct qr_conf {
+    uint64_t max_file_size;
+    int32_t cache_timeout;
+    uint64_t cache_size;
+    int max_pri;
+    gf_boolean_t qr_invalidation;
+    gf_boolean_t ctime_invalidation;
+    struct list_head priority_list;
+};
+typedef struct qr_conf qr_conf_t;
+
+struct qr_inode_table {
+    uint64_t cache_used;
+    struct list_head *lru;
+    gf_lock_t lock;
+};
+typedef struct qr_inode_table qr_inode_table_t;
+
+struct qr_statistics {
+    gf_atomic_t cache_hit;
+    gf_atomic_t cache_miss;
+    gf_atomic_t file_data_invals; /* No. of invalidates received from upcall */
+    gf_atomic_t files_cached;
+};
+
+struct qr_private {
+    qr_conf_t conf;
+    qr_inode_table_t table;
+    time_t last_child_down;
+    gf_lock_t lock;
+    struct qr_statistics qr_counter;
+    gf_atomic_int32_t generation;
+};
+typedef struct qr_private qr_private_t;
+
+#endif /* #ifndef __QUICK_READ_H */
diff --git a/xlators/performance/read-ahead/src/Makefile.am b/xlators/performance/read-ahead/src/Makefile.am
index 7bb90228227..99efca3660c 100644
--- a/xlators/performance/read-ahead/src/Makefile.am
+++ b/xlators/performance/read-ahead/src/Makefile.am
@@ -1,14 +1,16 @@
 xlator_LTLIBRARIES = read-ahead.la
 xlatordir = $(libdir)/glusterfs/$(PACKAGE_VERSION)/xlator/performance
 
-read_ahead_la_LDFLAGS = -module -avoidversion
+read_ahead_la_LDFLAGS = -module $(GF_XLATOR_DEFAULT_LDFLAGS)
 
 read_ahead_la_SOURCES = read-ahead.c page.c
 read_ahead_la_LIBADD = $(top_builddir)/libglusterfs/src/libglusterfs.la
 
-noinst_HEADERS = read-ahead.h
+noinst_HEADERS = read-ahead.h read-ahead-mem-types.h read-ahead-messages.h
 
-AM_CFLAGS = -fPIC -D_FILE_OFFSET_BITS=64 -D_GNU_SOURCE -Wall -D$(GF_HOST_OS)\
-	-I$(top_srcdir)/libglusterfs/src -shared -nostartfiles $(GF_CFLAGS)
+AM_CPPFLAGS = $(GF_CPPFLAGS) -I$(top_srcdir)/libglusterfs/src \
+	-I$(top_srcdir)/rpc/xdr/src -I$(top_builddir)/rpc/xdr/src
+
+AM_CFLAGS = -Wall $(GF_CFLAGS)
 
 CLEANFILES = 
diff --git a/xlators/performance/read-ahead/src/page.c b/xlators/performance/read-ahead/src/page.c
index a3506c8f0d1..8a58ad8bb7a 100644
--- a/xlators/performance/read-ahead/src/page.c
+++ b/xlators/performance/read-ahead/src/page.c
@@ -1,356 +1,455 @@
 /*
-  Copyright (c) 2006-2009 Z RESEARCH, Inc. <http://www.zresearch.com>
+  Copyright (c) 2008-2012 Red Hat, Inc. <http://www.redhat.com>
   This file is part of GlusterFS.
 
-  GlusterFS is free software; you can redistribute it and/or modify
-  it under the terms of the GNU General Public License as published
-  by the Free Software Foundation; either version 3 of the License,
-  or (at your option) any later version.
-
-  GlusterFS is distributed in the hope that it will be useful, but
-  WITHOUT ANY WARRANTY; without even the implied warranty of
-  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
-  General Public License for more details.
-
-  You should have received a copy of the GNU General Public License
-  along with this program.  If not, see
-  <http://www.gnu.org/licenses/>.
+  This file is licensed to you under your choice of the GNU Lesser
+  General Public License, version 3 or any later version (LGPLv3 or
+  later), or the GNU General Public License, version 2 (GPLv2), in all
+  cases as published by the Free Software Foundation.
 */
 
-#ifndef _CONFIG_H
-#define _CONFIG_H
-#include "config.h"
-#endif
-
-#include "glusterfs.h"
-#include "logging.h"
-#include "dict.h"
-#include "xlator.h"
+#include <glusterfs/glusterfs.h>
+#include <glusterfs/logging.h>
+#include <glusterfs/dict.h>
+#include <glusterfs/xlator.h>
 #include "read-ahead.h"
 #include <assert.h>
-
+#include "read-ahead-messages.h"
 
 ra_page_t *
-ra_page_get (ra_file_t *file,
-	     off_t offset)
+ra_page_get(ra_file_t *file, off_t offset)
 {
-	ra_page_t *page = NULL;
-	off_t      rounded_offset = 0;
+    ra_page_t *page = NULL;
+    off_t rounded_offset = 0;
 
-	page = file->pages.next;
-	rounded_offset = floor (offset, file->page_size);
+    GF_VALIDATE_OR_GOTO("read-ahead", file, out);
 
-	while (page != &file->pages && page->offset < rounded_offset)
-		page = page->next;
+    page = file->pages.next;
+    rounded_offset = gf_floor(offset, file->page_size);
 
-	if (page == &file->pages || page->offset != rounded_offset)
-		page = NULL;
+    while (page != &file->pages && page->offset < rounded_offset)
+        page = page->next;
 
-	return page;
-}
+    if (page == &file->pages || page->offset != rounded_offset)
+        page = NULL;
 
+out:
+    return page;
+}
 
 ra_page_t *
-ra_page_create (ra_file_t *file, off_t offset)
+ra_page_create(ra_file_t *file, off_t offset)
 {
-	ra_page_t *page      = NULL;
-	off_t      rounded_offset = 0;
-	ra_page_t *newpage   = NULL;
+    ra_page_t *page = NULL; /* stays NULL if 'file' fails validation */
+    off_t rounded_offset = 0;
+    ra_page_t *newpage = NULL;
 
-	page           = file->pages.next;
-	rounded_offset = floor (offset, file->page_size);
+    GF_VALIDATE_OR_GOTO("read-ahead", file, out);
 
-	while (page != &file->pages && page->offset < rounded_offset)
-		page = page->next;
+    page = file->pages.next;
+    rounded_offset = gf_floor(offset, file->page_size);
 
-	if (page == &file->pages || page->offset != rounded_offset) {
-		newpage = CALLOC (1, sizeof (*newpage));
-		if (!newpage)
-			return NULL;
+    while (page != &file->pages && page->offset < rounded_offset)
+        page = page->next; /* stop at first page not below the target */
 
-		newpage->offset = rounded_offset;
-		newpage->prev = page->prev;
-		newpage->next = page;
-		newpage->file = file;
-		page->prev->next = newpage;
-		page->prev = newpage;
+    if (page == &file->pages || page->offset != rounded_offset) {
+        newpage = GF_CALLOC(1, sizeof(*newpage), gf_ra_mt_ra_page_t);
+        if (!newpage) {
+            return NULL; /* 'page' is a neighbour/list head here, not ours */
+        }
 
-		page = newpage;
-	}
+        newpage->offset = rounded_offset;
+        newpage->prev = page->prev; /* link in immediately before 'page' */
+        newpage->next = page;
+        newpage->file = file;
+        page->prev->next = newpage;
+        page->prev = newpage;
 
-	return page;
-}
+        page = newpage;
+    }
 
+out:
+    return page; /* existing or freshly inserted page for rounded_offset */
+}
 
 void
-ra_wait_on_page (ra_page_t *page, call_frame_t *frame)
+ra_wait_on_page(ra_page_t *page, call_frame_t *frame)
 {
-	ra_waitq_t *waitq = NULL;
-	ra_local_t *local = NULL;
-
-
-	local = frame->local;
-	waitq = CALLOC (1, sizeof (*waitq));
-	if (!waitq) {
-		gf_log (frame->this->name, GF_LOG_ERROR,
-			"out of memory :(");
-		return;
-	}
-
-	waitq->data = frame;
-	waitq->next = page->waitq;
-	page->waitq = waitq;
-
-	ra_local_lock (local);
-	{
-		local->wait_count++;
-	}
-	ra_local_unlock (local);
-}
+    ra_waitq_t *waitq = NULL;
+    ra_local_t *local = NULL;
+
+    GF_VALIDATE_OR_GOTO("read-ahead", frame, out);
+    GF_VALIDATE_OR_GOTO(frame->this->name, page, out);
+
+    local = frame->local;
+
+    waitq = GF_CALLOC(1, sizeof(*waitq), gf_ra_mt_ra_waitq_t);
+    if (!waitq) {
+        local->op_ret = -1;
+        local->op_errno = ENOMEM;
+        goto out;
+    }
+
+    waitq->data = frame;
+    waitq->next = page->waitq;
+    page->waitq = waitq;
 
+    ra_local_lock(local);
+    {
+        local->wait_count++;
+    }
+    ra_local_unlock(local);
+
+out:
+    return;
+}
 
 void
-ra_waitq_return (ra_waitq_t *waitq)
+ra_waitq_return(ra_waitq_t *waitq)
 {
-	ra_waitq_t   *trav = NULL;
-	ra_waitq_t   *next = NULL;
-	call_frame_t *frame = NULL;
+    ra_waitq_t *trav = NULL;
+    ra_waitq_t *next = NULL;
+    call_frame_t *frame = NULL;
 
-	for (trav = waitq; trav; trav = next) {
-		next = trav->next;
+    for (trav = waitq; trav; trav = next) {
+        next = trav->next;
 
-		frame = trav->data;
-		ra_frame_return (frame);
-		free (trav);
-	}
-}
+        frame = trav->data;
+        ra_frame_return(frame);
+        GF_FREE(trav);
+    }
 
+    return;
+}
 
 int
-ra_fault_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
-	      int32_t op_ret, int32_t op_errno, struct iovec *vector,
-	      int32_t count, struct stat *stbuf)
+ra_fault_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret,
+             int32_t op_errno, struct iovec *vector, int32_t count,
+             struct iatt *stbuf, struct iobref *iobref, dict_t *xdata)
 {
-	ra_local_t   *local = NULL;
-	off_t         pending_offset = 0;
-	ra_file_t    *file = NULL;
-	ra_page_t    *page = NULL;
-	off_t         trav_offset = 0;
-	size_t        payload_size = 0;
-	ra_waitq_t   *waitq = NULL;
-	fd_t         *fd = NULL;
-	int           ret = 0;
-	uint64_t      tmp_file = 0;
-
-	local = frame->local;
-	fd  = local->fd;
-
-	ret = fd_ctx_get (fd, this, &tmp_file);
-
-	file = (ra_file_t *)(long)tmp_file;
-	pending_offset = local->pending_offset;
-	trav_offset    = pending_offset;  
-	payload_size   = op_ret;
-
-	ra_file_lock (file);
-	{
-		if (op_ret >= 0)
-			file->stbuf = *stbuf;
-
-		if (op_ret < 0) {
-			page = ra_page_get (file, pending_offset);
-			if (page)
-				waitq = ra_page_error (page, op_ret, op_errno);
-			goto unlock;
-		}
-
-		page = ra_page_get (file, pending_offset);
-		if (!page) {
-			gf_log (this->name, GF_LOG_DEBUG,
-				"wasted copy: %"PRId64"[+%"PRId64"] file=%p", 
-				pending_offset, file->page_size, file);
-			goto unlock;
-		}
-
-		if (page->vector) {
-			dict_unref (page->ref);
-			free (page->vector);
-		}
-
-		page->vector = iov_dup (vector, count);
-		page->count = count;
-		page->ref = dict_ref (frame->root->rsp_refs);
-		page->ready = 1;
-
-		page->size = iov_length (vector, count);
-
-		waitq = ra_page_wakeup (page);
-	}
+    ra_local_t *local = NULL;
+    off_t pending_offset = 0;
+    ra_file_t *file = NULL;
+    ra_page_t *page = NULL;
+    ra_waitq_t *waitq = NULL;
+    fd_t *fd = NULL;
+    uint64_t tmp_file = 0;
+    gf_boolean_t stale = _gf_false; /* set when the read must be re-issued */
+
+    GF_ASSERT(frame);
+
+    local = frame->local;
+    fd = local->fd;
+
+    fd_ctx_get(fd, this, &tmp_file);
+
+    file = (ra_file_t *)(long)tmp_file;
+    pending_offset = local->pending_offset;
+
+    if (file == NULL) {
+        gf_msg(this->name, GF_LOG_WARNING, EBADF,
+               READ_AHEAD_MSG_FD_CONTEXT_NOT_SET,
+               "read-ahead context not set in fd (%p)", fd);
+        /* normal exit unrefs the fd below; 'out' skips that, so do it here */
+        fd_unref(fd);
+        goto out;
+    }
+
+    ra_file_lock(file);
+    {
+        if (op_ret >= 0)
+            file->stbuf = *stbuf;
+
+        page = ra_page_get(file, pending_offset);
+
+        if (!page) {
+            gf_msg_trace(this->name, 0,
+                         "wasted copy: "
+                         "%" PRId64 "[+%" PRId64 "] file=%p",
+                         pending_offset, file->page_size, file);
+            goto unlock;
+        }
+
+        if (page->stale) {
+            page->stale = 0;
+            page->ready = 0;
+            stale = 1;
+            goto unlock;
+        }
+
+        /*
+         * "Dirty" means that the request was a pure read-ahead; it's
+         * set for requests we issue ourselves, and cleared when user
+         * requests are issued or put on the waitq.  "Poisoned" means
+         * that we got a write while a read was still in flight, and we
+         * couldn't stop it so we marked it instead.  If it's both
+         * dirty and poisoned by the time we get here, we cancel its
+         * effect so that a subsequent user read doesn't get data that
+         * we know is stale (because we made it stale ourselves).  We
+         * can't use ESTALE because that has special significance.
+         * ECANCELED has no such special meaning, and is close to what
+         * we're trying to indicate.
+         */
+        if (page->dirty && page->poisoned) {
+            op_ret = -1;
+            op_errno = ECANCELED;
+        }
+
+        if (op_ret < 0) {
+            waitq = ra_page_error(page, op_ret, op_errno);
+            goto unlock;
+        }
+
+        if (page->vector) {
+            iobref_unref(page->iobref);
+            GF_FREE(page->vector);
+        }
+
+        page->vector = iov_dup(vector, count);
+        if (page->vector == NULL) {
+            waitq = ra_page_error(page, -1, ENOMEM);
+            goto unlock;
+        }
+
+        page->count = count;
+        page->iobref = iobref_ref(iobref);
+        page->ready = 1;
+
+        page->size = iov_length(vector, count);
+
+        waitq = ra_page_wakeup(page);
+    }
 unlock:
-	ra_file_unlock (file);
+    ra_file_unlock(file);
 
-	ra_waitq_return (waitq);
+    if (stale) {
+        STACK_WIND(frame, ra_fault_cbk, FIRST_CHILD(frame->this),
+                   FIRST_CHILD(frame->this)->fops->readv, local->fd,
+                   local->pending_size, local->pending_offset, 0, NULL);
 
-	fd_unref (local->fd);
+        return 0;
+    }
 
-	free (frame->local);
-	frame->local = NULL;
+    ra_waitq_return(waitq);
 
-	STACK_DESTROY (frame->root);
-	return 0;
-}
+    fd_unref(local->fd);
+
+    mem_put(frame->local);
+    frame->local = NULL;
 
+out:
+    STACK_DESTROY(frame->root);
+    return 0;
+}
 
 void
-ra_page_fault (ra_file_t *file,
-	       call_frame_t *frame,
-	       off_t offset)
+ra_page_fault(ra_file_t *file, call_frame_t *frame, off_t offset)
 {
-	call_frame_t *fault_frame = NULL;
-	ra_local_t   *fault_local = NULL;
-    
-	fault_frame = copy_frame (frame);
-	fault_local = CALLOC (1, sizeof (ra_local_t));
-
-	fault_frame->local = fault_local;
-	fault_local->pending_offset = offset;
-	fault_local->pending_size = file->page_size;
-
-	fault_local->fd = fd_ref (file->fd);
-
-	STACK_WIND (fault_frame, ra_fault_cbk,
-		    FIRST_CHILD (fault_frame->this),
-		    FIRST_CHILD (fault_frame->this)->fops->readv,
-		    file->fd, file->page_size, offset);
-	return;
+    call_frame_t *fault_frame = NULL;
+    ra_local_t *fault_local = NULL;
+    ra_page_t *page = NULL;
+    ra_waitq_t *waitq = NULL;
+    int32_t op_ret = -1, op_errno = -1;
+
+    GF_VALIDATE_OR_GOTO("read-ahead", frame, out);
+    GF_VALIDATE_OR_GOTO(frame->this->name, file, out);
+
+    fault_frame = copy_frame(frame); /* the read is wound on a copy */
+    if (fault_frame == NULL) {
+        op_ret = -1;
+        op_errno = ENOMEM;
+        goto err;
+    }
+
+    fault_local = mem_get0(THIS->local_pool);
+    if (fault_local == NULL) {
+        STACK_DESTROY(fault_frame->root); /* undo copy_frame() */
+        op_ret = -1;
+        op_errno = ENOMEM;
+        goto err;
+    }
+
+    fault_frame->local = fault_local;
+    fault_local->pending_offset = offset; /* read back by ra_fault_cbk() */
+    fault_local->pending_size = file->page_size;
+
+    fault_local->fd = fd_ref(file->fd); /* unref'd in ra_fault_cbk() */
+
+    STACK_WIND(fault_frame, ra_fault_cbk, FIRST_CHILD(fault_frame->this),
+               FIRST_CHILD(fault_frame->this)->fops->readv, file->fd,
+               file->page_size, offset, 0, NULL);
+
+    return;
+
+err:
+    ra_file_lock(file);
+    {
+        page = ra_page_get(file, offset);
+        if (page) /* fail the page: waiters get op_ret/op_errno */
+            waitq = ra_page_error(page, op_ret, op_errno);
+    }
+    ra_file_unlock(file);
+
+    if (waitq != NULL) {
+        ra_waitq_return(waitq); /* done after dropping the file lock */
+    }
+
+out:
+    return;
 }
 
 void
-ra_frame_fill (ra_page_t *page, call_frame_t *frame)
+ra_frame_fill(ra_page_t *page, call_frame_t *frame)
 {
-	ra_local_t *local = NULL;
-	ra_fill_t  *fill = NULL;
-	off_t       src_offset = 0;
-	off_t       dst_offset = 0;
-	ssize_t     copy_size = 0;
-	ra_fill_t  *new = NULL;
-
-
-	local = frame->local;
-	fill  = &local->fill;
-
-	if (local->op_ret != -1 && page->size) {
-		if (local->offset > page->offset)
-			src_offset = local->offset - page->offset;
-		else
-			dst_offset = page->offset - local->offset;
-
-		copy_size = min (page->size - src_offset,
-				 local->size - dst_offset);
-
-		if (copy_size < 0) {
-			/* if page contains fewer bytes and the required offset
-			   is beyond the page size in the page */
-			copy_size = src_offset = 0;
-		}
-
-		fill = fill->next;
-		while (fill != &local->fill) {
-			if (fill->offset > page->offset) {
-				break;
-			}
-			fill = fill->next;
-		}
-
-		new = CALLOC (1, sizeof (*new));
-
-		new->offset = page->offset;
-		new->size = copy_size;
-		new->refs = dict_ref (page->ref);
-		new->count = iov_subset (page->vector, page->count,
-					 src_offset, src_offset+copy_size,
-					 NULL);
-		new->vector = CALLOC (new->count, sizeof (struct iovec));
-
-		new->count = iov_subset (page->vector, page->count,
-					 src_offset, src_offset+copy_size,
-					 new->vector);
-
-		new->next = fill;
-		new->prev = new->next->prev;
-		new->next->prev = new;
-		new->prev->next = new;
-
-		local->op_ret += copy_size;
-	}
+    ra_local_t *local = NULL;
+    ra_fill_t *fill = NULL;
+    off_t src_offset = 0;
+    off_t dst_offset = 0;
+    ssize_t copy_size = 0;
+    ra_fill_t *new = NULL;
+
+    GF_VALIDATE_OR_GOTO("read-ahead", frame, out);
+    GF_VALIDATE_OR_GOTO(frame->this->name, page, out);
+
+    local = frame->local;
+    fill = &local->fill;
+
+    if (local->op_ret != -1 && page->size) {
+        if (local->offset > page->offset)
+            src_offset = local->offset - page->offset;
+        else
+            dst_offset = page->offset - local->offset;
+
+        copy_size = min(page->size - src_offset, local->size - dst_offset);
+
+        if (copy_size < 0) {
+            /* if page contains fewer bytes and the required offset
+               is beyond the page size in the page */
+            copy_size = src_offset = 0;
+        }
+
+        fill = fill->next;
+        while (fill != &local->fill) {
+            if (fill->offset > page->offset) {
+                break;
+            }
+            fill = fill->next;
+        }
+
+        new = GF_CALLOC(1, sizeof(*new), gf_ra_mt_ra_fill_t);
+        if (new == NULL) {
+            local->op_ret = -1;
+            local->op_errno = ENOMEM;
+            goto out;
+        }
+
+        new->offset = page->offset;
+        new->size = copy_size;
+        new->iobref = iobref_ref(page->iobref);
+        new->count = iov_subset(page->vector, page->count, src_offset,
+                                copy_size, &new->vector, 0);
+        if (new->count < 0) {
+            local->op_ret = -1;
+            local->op_errno = ENOMEM;
+            iobref_unref(new->iobref);
+            GF_FREE(new);
+            goto out;
+        }
+
+        new->next = fill;
+        new->prev = new->next->prev;
+        new->next->prev = new;
+        new->prev->next = new;
+
+        local->op_ret += copy_size;
+    }
+
+out:
+    return;
 }
 
-
 void
-ra_frame_unwind (call_frame_t *frame)
+ra_frame_unwind(call_frame_t *frame)
 {
-	ra_local_t   *local = NULL;
-	ra_fill_t    *fill = NULL;
-	int32_t       count = 0;
-	struct iovec *vector;
-	int32_t       copied = 0;
-	dict_t       *refs = NULL;
-	ra_fill_t    *next = NULL;
-	fd_t         *fd = NULL;
-	ra_file_t    *file = NULL;
-	int           ret = 0;
-	uint64_t      tmp_file = 0;
-
-	local = frame->local;
-	fill  = local->fill.next;
-
-	refs  = get_new_dict ();
-
-	frame->local = NULL;
-
-	while (fill != &local->fill) {
-		count += fill->count;
-		fill = fill->next;
-	}
-
-	vector = CALLOC (count, sizeof (*vector));
-
-	fill = local->fill.next;
-
-	while (fill != &local->fill) {
-		next = fill->next;
-
-		memcpy (((char *)vector) + copied, fill->vector,
-			fill->count * sizeof (*vector));
-
-		copied += (fill->count * sizeof (*vector));
-		dict_copy (fill->refs, refs);
-
-		fill->next->prev = fill->prev;
-		fill->prev->next = fill->prev;
-
-		dict_unref (fill->refs);
-		free (fill->vector);
-		free (fill);
-
-		fill = next;
-	}
-
-	frame->root->rsp_refs = dict_ref (refs);
-
-	fd = local->fd;
-	ret = fd_ctx_get (fd, frame->this, &tmp_file);
-	file = (ra_file_t *)(long)tmp_file;
-	
-	STACK_UNWIND (frame, local->op_ret, local->op_errno,
-		      vector, count, &file->stbuf);
-  
-	dict_unref (refs);
-	pthread_mutex_destroy (&local->local_lock);
-	free (local);
-	free (vector);
-
-	return;
+    ra_local_t *local = NULL;
+    ra_fill_t *fill = NULL;
+    int32_t count = 0; /* total iovec entries across all fills */
+    struct iovec *vector = NULL;
+    int32_t copied = 0; /* byte offset into 'vector' */
+    struct iobref *iobref = NULL;
+    ra_fill_t *next = NULL;
+    fd_t *fd = NULL;
+    ra_file_t *file = NULL;
+    uint64_t tmp_file = 0;
+
+    GF_VALIDATE_OR_GOTO("read-ahead", frame, out);
+
+    local = frame->local;
+    fill = local->fill.next;
+
+    iobref = iobref_new();
+    if (iobref == NULL) {
+        local->op_ret = -1;
+        local->op_errno = ENOMEM;
+    }
+
+    frame->local = NULL; /* 'local' is released with mem_put() below */
+
+    while (fill != &local->fill) {
+        count += fill->count;
+        fill = fill->next;
+    }
+
+    vector = GF_CALLOC(count, sizeof(*vector), gf_ra_mt_iovec);
+    if (vector == NULL) {
+        local->op_ret = -1;
+        local->op_errno = ENOMEM;
+        iobref_unref(iobref);
+        iobref = NULL;
+    }
+
+    fill = local->fill.next;
+
+    while (fill != &local->fill) {
+        next = fill->next;
+
+        if ((vector != NULL) && (iobref != NULL)) {
+            memcpy(((char *)vector) + copied, fill->vector,
+                   fill->count * sizeof(*vector));
+
+            copied += (fill->count * sizeof(*vector));
+            if (iobref_merge(iobref, fill->iobref)) {
+                local->op_ret = -1;
+                local->op_errno = ENOMEM;
+                iobref_unref(iobref);
+                iobref = NULL;
+            }
+        }
+
+        fill->next->prev = fill->prev; /* unlink 'fill' before freeing it */
+        fill->prev->next = fill->next;
+
+        iobref_unref(fill->iobref);
+        GF_FREE(fill->vector);
+        GF_FREE(fill);
+
+        fill = next;
+    }
+
+    fd = local->fd;
+    fd_ctx_get(fd, frame->this, &tmp_file);
+    file = (ra_file_t *)(long)tmp_file;
+
+    STACK_UNWIND_STRICT(readv, frame, local->op_ret, local->op_errno, vector,
+                        count, &file->stbuf, iobref, NULL);
+
+    iobref_unref(iobref);
+    pthread_mutex_destroy(&local->local_lock);
+    mem_put(local);
+    GF_FREE(vector);
+
+out:
+    return;
 }
 
 /*
@@ -359,47 +458,55 @@ ra_frame_unwind (call_frame_t *frame)
  *
  */
 void
-ra_frame_return (call_frame_t *frame)
+ra_frame_return(call_frame_t *frame)
 {
-	ra_local_t *local = NULL;
-	int32_t     wait_count = 0;
+    ra_local_t *local = NULL;
+    int32_t wait_count = 0;
 
-	local = frame->local;
-	assert (local->wait_count > 0);
+    GF_VALIDATE_OR_GOTO("read-ahead", frame, out);
 
-	ra_local_lock (local);
-	{
-		wait_count = --local->wait_count;
-	}
-	ra_local_unlock (local);
+    local = frame->local;
+    GF_ASSERT(local->wait_count > 0);
 
-	if (!wait_count)
-		ra_frame_unwind (frame);
+    ra_local_lock(local);
+    {
+        wait_count = --local->wait_count;
+    }
+    ra_local_unlock(local);
 
-	return;
+    if (!wait_count)
+        ra_frame_unwind(frame);
+
+out:
+    return;
 }
 
-/* 
+/*
  * ra_page_wakeup -
  * @page:
  *
  */
 ra_waitq_t *
-ra_page_wakeup (ra_page_t *page)
+ra_page_wakeup(ra_page_t *page)
 {
-	ra_waitq_t *waitq = NULL, *trav = NULL;
-	call_frame_t *frame;
+    ra_waitq_t *waitq = NULL, *trav = NULL;
+    call_frame_t *frame = NULL;
+
+    GF_VALIDATE_OR_GOTO("read-ahead", page, out);
 
-	waitq = page->waitq;
-	page->waitq = NULL;
+    waitq = page->waitq;
+    page->waitq = NULL;
 
-	trav = waitq;
-	for (trav = waitq; trav; trav = trav->next) {
-		frame = trav->data;
-		ra_frame_fill (page, frame);
-	}
+    for (trav = waitq; trav; trav = trav->next) {
+        frame = trav->data;
+        ra_frame_fill(page, frame);
+    }
 
-	return waitq;
+    if (page->stale) {
+        ra_page_purge(page);
+    }
+out:
+    return waitq;
 }
 
 /*
@@ -408,16 +515,22 @@ ra_page_wakeup (ra_page_t *page)
  *
  */
 void
-ra_page_purge (ra_page_t *page)
+ra_page_purge(ra_page_t *page)
 {
-	page->prev->next = page->next;
-	page->next->prev = page->prev;
-
-	if (page->ref) {
-		dict_unref (page->ref);
-	}
-	free (page->vector);
-	free (page);
+    GF_VALIDATE_OR_GOTO("read-ahead", page, out);
+
+    page->prev->next = page->next; /* unlink from the page list */
+    page->next->prev = page->prev;
+
+    if (page->iobref) {
+        iobref_unref(page->iobref); /* drop the ref held by this page */
+    }
+
+    GF_FREE(page->vector);
+    GF_FREE(page); /* 'page' must not be touched by the caller afterwards */
+
+out:
+    return;
 }
 
 /*
@@ -428,60 +541,65 @@ ra_page_purge (ra_page_t *page)
  *
  */
 ra_waitq_t *
-ra_page_error (ra_page_t *page, int32_t op_ret, int32_t op_errno)
+ra_page_error(ra_page_t *page, int32_t op_ret, int32_t op_errno)
 {
+    ra_waitq_t *waitq = NULL;
+    ra_waitq_t *trav = NULL;
+    call_frame_t *frame = NULL;
+    ra_local_t *local = NULL;
 
-	ra_waitq_t   *waitq = NULL;
-	ra_waitq_t   *trav = NULL;
-	call_frame_t *frame = NULL;
-	ra_local_t   *local = NULL;
+    GF_VALIDATE_OR_GOTO("read-ahead", page, out);
 
-	waitq = page->waitq;
-	page->waitq = NULL;
+    waitq = page->waitq;
+    page->waitq = NULL;
 
-	trav = waitq;
-	for (trav = waitq; trav; trav = trav->next) {
-		frame = trav->data;
+    for (trav = waitq; trav; trav = trav->next) {
+        frame = trav->data;
 
-		local = frame->local;
-		if (local->op_ret != -1) {
-			local->op_ret   = op_ret;
-			local->op_errno = op_errno;
-		}
-	}
+        local = frame->local;
+        if (local->op_ret != -1) {
+            local->op_ret = op_ret;
+            local->op_errno = op_errno;
+        }
+    }
 
-	ra_page_purge (page);
+    ra_page_purge(page);
 
-	return waitq;
+out:
+    return waitq;
 }
 
-/* 
+/*
  * ra_file_destroy -
  * @file:
  *
  */
 void
-ra_file_destroy (ra_file_t *file)
+ra_file_destroy(ra_file_t *file)
 {
-	ra_conf_t *conf = NULL;
-	ra_page_t *trav = NULL;
-
-	conf = file->conf;
-
-	ra_conf_lock (conf);
-	{
-		file->prev->next = file->next;
-		file->next->prev = file->prev;
-	}
-	ra_conf_unlock (conf);
-
-	trav = file->pages.next;
-	while (trav != &file->pages) {
-		ra_page_error (trav, -1, EINVAL);
-		trav = file->pages.next;
-	}
-
-	pthread_mutex_destroy (&file->file_lock);
-	free (file);
-}
+    ra_conf_t *conf = NULL;
+    ra_page_t *trav = NULL;
 
+    GF_VALIDATE_OR_GOTO("read-ahead", file, out);
+
+    conf = file->conf;
+
+    ra_conf_lock(conf);
+    {
+        file->prev->next = file->next;
+        file->next->prev = file->prev;
+    }
+    ra_conf_unlock(conf);
+
+    trav = file->pages.next;
+    while (trav != &file->pages) {
+        ra_page_error(trav, -1, EINVAL);
+        trav = file->pages.next;
+    }
+
+    pthread_mutex_destroy(&file->file_lock);
+    GF_FREE(file);
+
+out:
+    return;
+}
diff --git a/xlators/performance/read-ahead/src/read-ahead-mem-types.h b/xlators/performance/read-ahead/src/read-ahead-mem-types.h
new file mode 100644
index 00000000000..f07cfc5bba5
--- /dev/null
+++ b/xlators/performance/read-ahead/src/read-ahead-mem-types.h
@@ -0,0 +1,25 @@
+/*
+  Copyright (c) 2008-2012 Red Hat, Inc. <http://www.redhat.com>
+  This file is part of GlusterFS.
+
+  This file is licensed to you under your choice of the GNU Lesser
+  General Public License, version 3 or any later version (LGPLv3 or
+  later), or the GNU General Public License, version 2 (GPLv2), in all
+  cases as published by the Free Software Foundation.
+*/
+
+#ifndef __RA_MEM_TYPES_H__
+#define __RA_MEM_TYPES_H__
+
+#include <glusterfs/mem-types.h>
+
+enum gf_ra_mem_types_ { /* allocation tags passed to GF_CALLOC() */
+    gf_ra_mt_ra_file_t = gf_common_mt_end + 1,
+    gf_ra_mt_ra_conf_t,
+    gf_ra_mt_ra_page_t,  /* pages, see ra_page_create() */
+    gf_ra_mt_ra_waitq_t, /* wait-queue entries, see ra_wait_on_page() */
+    gf_ra_mt_ra_fill_t,  /* fill records, see ra_frame_fill() */
+    gf_ra_mt_iovec,      /* reply vector built in ra_frame_unwind() */
+    gf_ra_mt_end
+};
+#endif
diff --git a/xlators/performance/read-ahead/src/read-ahead-messages.h b/xlators/performance/read-ahead/src/read-ahead-messages.h
new file mode 100644
index 00000000000..0302b7a7122
--- /dev/null
+++ b/xlators/performance/read-ahead/src/read-ahead-messages.h
@@ -0,0 +1,31 @@
+/*Copyright (c) 2015 Red Hat, Inc. <http://www.redhat.com>
+  This file is part of GlusterFS.
+
+  This file is licensed to you under your choice of the GNU Lesser
+  General Public License, version 3 or any later version (LGPLv3 or
+  later), or the GNU General Public License, version 2 (GPLv2), in all
+  cases as published by the Free Software Foundation.
+*/
+
+#ifndef _READ_AHEAD_MESSAGES_H_
+#define _READ_AHEAD_MESSAGES_H_
+
+#include <glusterfs/glfs-message-id.h>
+
+/* To add new message IDs, append new identifiers at the end of the list.
+ *
+ * Never remove a message ID. If it's not used anymore, you can rename it or
+ * leave it as it is, but not delete it. This is to prevent reutilization of
+ * IDs by other messages.
+ *
+ * The component name must match one of the entries defined in
+ * glfs-message-id.h.
+ */
+
+GLFS_MSGID(READ_AHEAD, READ_AHEAD_MSG_XLATOR_CHILD_MISCONFIGURED,
+           READ_AHEAD_MSG_VOL_MISCONFIGURED, READ_AHEAD_MSG_NO_MEMORY,
+           READ_AHEAD_MSG_FD_CONTEXT_NOT_SET,
+           READ_AHEAD_MSG_UNDESTROYED_FILE_FOUND,
+           READ_AHEAD_MSG_XLATOR_CONF_NULL);
+
+#endif /* _READ_AHEAD_MESSAGES_H_ */
diff --git a/xlators/performance/read-ahead/src/read-ahead.c b/xlators/performance/read-ahead/src/read-ahead.c
index aa825d486ec..5246e1317d2 100644
--- a/xlators/performance/read-ahead/src/read-ahead.c
+++ b/xlators/performance/read-ahead/src/read-ahead.c
@@ -1,890 +1,1272 @@
 /*
-  Copyright (c) 2006-2009 Z RESEARCH, Inc. <http://www.zresearch.com>
+  Copyright (c) 2008-2012 Red Hat, Inc. <http://www.redhat.com>
   This file is part of GlusterFS.
 
-  GlusterFS is free software; you can redistribute it and/or modify
-  it under the terms of the GNU General Public License as published
-  by the Free Software Foundation; either version 3 of the License,
-  or (at your option) any later version.
-
-  GlusterFS is distributed in the hope that it will be useful, but
-  WITHOUT ANY WARRANTY; without even the implied warranty of
-  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
-  General Public License for more details.
-
-  You should have received a copy of the GNU General Public License
-  along with this program.  If not, see
-  <http://www.gnu.org/licenses/>.
+  This file is licensed to you under your choice of the GNU Lesser
+  General Public License, version 3 or any later version (LGPLv3 or
+  later), or the GNU General Public License, version 2 (GPLv2), in all
+  cases as published by the Free Software Foundation.
 */
 
-/* 
-   TODO:
-   - handle O_DIRECT
-   - maintain offset, flush on lseek
-   - ensure efficient memory managment in case of random seek
+/*
+  TODO:
+  - handle O_DIRECT
+  - maintain offset, flush on lseek
+  - ensure efficient memory management in case of random seek
 */
 
-#ifndef _CONFIG_H
-#define _CONFIG_H
-#include "config.h"
-#endif
-
-#include "glusterfs.h"
-#include "logging.h"
-#include "dict.h"
-#include "xlator.h"
+#include <glusterfs/glusterfs.h>
+#include <glusterfs/logging.h>
+#include <glusterfs/dict.h>
+#include <glusterfs/xlator.h>
 #include "read-ahead.h"
+#include <glusterfs/statedump.h>
 #include <assert.h>
 #include <sys/time.h>
-
+#include "read-ahead-messages.h"
 
 static void
-read_ahead (call_frame_t *frame,
-            ra_file_t *file);
-
+read_ahead(call_frame_t *frame, ra_file_t *file);
 
 int
-ra_open_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
-             int32_t op_ret, int32_t op_errno, fd_t *fd)
+ra_open_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret,
+            int32_t op_errno, fd_t *fd, dict_t *xdata)
 {
-	ra_conf_t  *conf = NULL;
-	ra_file_t  *file = NULL;
-	int         ret = 0;
+    ra_conf_t *conf = NULL;
+    ra_file_t *file = NULL;
+    int ret = 0;
+
+    GF_ASSERT(frame);
+    GF_VALIDATE_OR_GOTO(frame->this->name, this, unwind);
+
+    conf = this->private;
+
+    if (op_ret == -1) {
+        goto unwind;
+    }
+
+    file = GF_CALLOC(1, sizeof(*file), gf_ra_mt_ra_file_t);
+    if (!file) {
+        op_ret = -1;
+        op_errno = ENOMEM;
+        goto unwind;
+    }
+
+    /* Disable caching for O_DIRECT and write-only opens */
+
+    if ((fd->flags & O_DIRECT) || ((fd->flags & O_ACCMODE) == O_WRONLY))
+        file->disabled = 1;
+
+    file->offset = (unsigned long long)0;
+    file->conf = conf;
+    file->pages.next = &file->pages;
+    file->pages.prev = &file->pages;
+    file->pages.offset = (unsigned long long)0;
+    file->pages.file = file;
+
+    ra_conf_lock(conf);
+    {
+        file->next = conf->files.next;
+        conf->files.next = file;
+        file->next->prev = file;
+        file->prev = &conf->files;
+    }
+    ra_conf_unlock(conf);
+
+    file->fd = fd;
+    file->page_count = conf->page_count;
+    file->page_size = conf->page_size;
+    pthread_mutex_init(&file->file_lock, NULL);
+
+    if (!file->disabled) {
+        file->page_count = 1;
+    }
+
+    ret = fd_ctx_set(fd, this, (uint64_t)(long)file);
+    if (ret == -1) {
+        gf_msg(frame->this->name, GF_LOG_WARNING, 0, READ_AHEAD_MSG_NO_MEMORY,
+               "cannot set read-ahead context "
+               "information in fd (%p)",
+               fd);
+        ra_file_destroy(file);
+        op_ret = -1;
+        op_errno = ENOMEM;
+    }
 
-	conf  = this->private;
+unwind:
+    frame->local = NULL;
 
-	if (op_ret == -1) {
-		goto unwind;
-	}
+    STACK_UNWIND_STRICT(open, frame, op_ret, op_errno, fd, xdata);
 
-	file = CALLOC (1, sizeof (*file));
-	if (!file) {
-		gf_log (this->name, GF_LOG_ERROR,
-			"out of memory :(");
-		goto unwind;
-	}
+    return 0;
+}
 
-	ret = fd_ctx_set (fd, this, (uint64_t)(long)file);
+int
+ra_create_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret,
+              int32_t op_errno, fd_t *fd, inode_t *inode, struct iatt *buf,
+              struct iatt *preparent, struct iatt *postparent, dict_t *xdata)
+{
+    ra_conf_t *conf = NULL;
+    ra_file_t *file = NULL;
+    int ret = 0;
+
+    GF_ASSERT(frame);
+    GF_VALIDATE_OR_GOTO(frame->this->name, this, unwind);
+
+    conf = this->private;
+
+    if (op_ret == -1) {
+        goto unwind;
+    }
+
+    file = GF_CALLOC(1, sizeof(*file), gf_ra_mt_ra_file_t);
+    if (!file) {
+        op_ret = -1;
+        op_errno = ENOMEM;
+        goto unwind;
+    }
+
+    /* Disable caching for O_DIRECT and write-only opens */
+
+    if ((fd->flags & O_DIRECT) || ((fd->flags & O_ACCMODE) == O_WRONLY))
+        file->disabled = 1;
+
+    file->offset = (unsigned long long)0;
+    // file->size = fd->inode->buf.ia_size;
+    file->conf = conf;
+    file->pages.next = &file->pages;
+    file->pages.prev = &file->pages;
+    file->pages.offset = (unsigned long long)0;
+    file->pages.file = file;
+
+    ra_conf_lock(conf);
+    {
+        file->next = conf->files.next;
+        conf->files.next = file;
+        file->next->prev = file;
+        file->prev = &conf->files;
+    }
+    ra_conf_unlock(conf);
+
+    file->fd = fd;
+    file->page_count = conf->page_count;
+    file->page_size = conf->page_size;
+    pthread_mutex_init(&file->file_lock, NULL);
+
+    ret = fd_ctx_set(fd, this, (uint64_t)(long)file);
+    if (ret == -1) {
+        gf_msg(this->name, GF_LOG_WARNING, 0, READ_AHEAD_MSG_NO_MEMORY,
+               "cannot set read ahead context "
+               "information in fd (%p)",
+               fd);
+        ra_file_destroy(file);
+        op_ret = -1;
+        op_errno = ENOMEM;
+    }
 
-	/* If mandatory locking has been enabled on this file,
-	   we disable caching on it */
+unwind:
+    STACK_UNWIND_STRICT(create, frame, op_ret, op_errno, fd, inode, buf,
+                        preparent, postparent, xdata);
 
-	if ((fd->inode->st_mode & S_ISGID) && !(fd->inode->st_mode & S_IXGRP))
-		file->disabled = 1;
+    return 0;
+}
 
-	/* If O_DIRECT open, we disable caching on it */
+int
+ra_open(call_frame_t *frame, xlator_t *this, loc_t *loc, int32_t flags,
+        fd_t *fd, dict_t *xdata)
+{
+    GF_ASSERT(frame);
+    GF_ASSERT(this);
 
-	if ((fd->flags & O_DIRECT) || (fd->flags & O_WRONLY))
-		file->disabled = 1;
+    STACK_WIND(frame, ra_open_cbk, FIRST_CHILD(this),
+               FIRST_CHILD(this)->fops->open, loc, flags, fd, xdata);
 
-	file->offset = (unsigned long long) 0;
-	file->conf = conf;
-	file->pages.next = &file->pages;
-	file->pages.prev = &file->pages;
-	file->pages.offset = (unsigned long long) 0;
-	file->pages.file = file;
+    return 0;
+}
 
-	ra_conf_lock (conf);
-	{
-		file->next = conf->files.next;
-		conf->files.next = file;
-		file->next->prev = file;
-		file->prev = &conf->files;
-	}
-	ra_conf_unlock (conf);
+int
+ra_create(call_frame_t *frame, xlator_t *this, loc_t *loc, int32_t flags,
+          mode_t mode, mode_t umask, fd_t *fd, dict_t *xdata)
+{
+    GF_ASSERT(frame);
+    GF_ASSERT(this);
 
-	file->fd = fd;
-	file->page_count = conf->page_count;
-	file->page_size = conf->page_size;
-	pthread_mutex_init (&file->file_lock, NULL);
+    STACK_WIND(frame, ra_create_cbk, FIRST_CHILD(this),
+               FIRST_CHILD(this)->fops->create, loc, flags, mode, umask, fd,
+               xdata);
 
-	if (!file->disabled) {
-		file->page_count = 1;
-	}
+    return 0;
+}
 
-unwind:
-	STACK_UNWIND (frame, op_ret, op_errno, fd);
+/* free cache pages between offset and offset+size; pages with frames
+   waiting on them are only marked stale (and poisoned when for_write)
+*/
 
-	return 0;
+static void
+flush_region(call_frame_t *frame, ra_file_t *file, off_t offset, off_t size,
+             int for_write)
+{
+    ra_page_t *trav = NULL;
+    ra_page_t *next = NULL;
+
+    ra_file_lock(file);
+    {
+        trav = file->pages.next;
+        while (trav != &file->pages && trav->offset < (offset + size)) {
+            next = trav->next;
+            if (trav->offset >= offset) {
+                if (!trav->waitq) {
+                    ra_page_purge(trav);
+                } else {
+                    trav->stale = 1;
+
+                    if (for_write) {
+                        trav->poisoned = 1;
+                    }
+                }
+            }
+            trav = next;
+        }
+    }
+    ra_file_unlock(file);
 }
 
-
 int
-ra_create_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
-               int32_t op_ret, int32_t op_errno,
-	       fd_t *fd, inode_t *inode, struct stat *buf)
-{
-	ra_conf_t  *conf = NULL;
-	ra_file_t  *file = NULL;
-	int         ret = 0;
-
-	conf  = this->private;
-
-	if (op_ret == -1) {
-		goto unwind;
-	}
-
-	file = CALLOC (1, sizeof (*file));
-	if (!file) {
-		gf_log (this->name, GF_LOG_ERROR,
-			"out of memory :(");
-		goto unwind;
-	}
-
-	ret = fd_ctx_set (fd, this, (uint64_t)(long)file);
-
-	/* If mandatory locking has been enabled on this file,
-	   we disable caching on it */
-
-	if ((fd->inode->st_mode & S_ISGID) && !(fd->inode->st_mode & S_IXGRP))
-		file->disabled = 1;
-
-	/* If O_DIRECT open, we disable caching on it */
-
-	if ((fd->flags & O_DIRECT) || (fd->flags & O_WRONLY))
-			file->disabled = 1;
-
-	file->offset = (unsigned long long) 0;
-	//file->size = fd->inode->buf.st_size;
-	file->conf = conf;
-	file->pages.next = &file->pages;
-	file->pages.prev = &file->pages;
-	file->pages.offset = (unsigned long long) 0;
-	file->pages.file = file;
-
-	ra_conf_lock (conf);
-	{
-		file->next = conf->files.next;
-		conf->files.next = file;
-		file->next->prev = file;
-		file->prev = &conf->files;
-	}
-	ra_conf_unlock (conf);
-
-	file->fd = fd;
-	file->page_count = conf->page_count;
-	file->page_size = conf->page_size;
-	pthread_mutex_init (&file->file_lock, NULL);
+ra_release(xlator_t *this, fd_t *fd)
+{
+    uint64_t tmp_file = 0;
+    int ret = 0;
 
-unwind:
-	STACK_UNWIND (frame, op_ret, op_errno, fd, inode, buf);
+    GF_VALIDATE_OR_GOTO("read-ahead", this, out);
+    GF_VALIDATE_OR_GOTO(this->name, fd, out);
 
-	return 0;
+    ret = fd_ctx_del(fd, this, &tmp_file);
+
+    if (!ret) {
+        ra_file_destroy((ra_file_t *)(long)tmp_file);
+    }
+
+out:
+    return 0;
 }
 
+void
+read_ahead(call_frame_t *frame, ra_file_t *file)
+{
+    off_t ra_offset = 0;
+    size_t ra_size = 0;
+    off_t trav_offset = 0;
+    ra_page_t *trav = NULL;
+    off_t cap = 0;
+    char fault = 0;
+
+    GF_VALIDATE_OR_GOTO("read-ahead", frame, out);
+    GF_VALIDATE_OR_GOTO(frame->this->name, file, out);
+
+    if (!file->page_count) {
+        goto out;
+    }
+
+    ra_size = file->page_size * file->page_count;
+    ra_offset = gf_floor(file->offset, file->page_size);
+    cap = file->size ? file->size : file->offset + ra_size;
+
+    while (ra_offset < min(file->offset + ra_size, cap)) {
+        ra_file_lock(file);
+        {
+            trav = ra_page_get(file, ra_offset);
+        }
+        ra_file_unlock(file);
+
+        if (!trav)
+            break;
+
+        ra_offset += file->page_size;
+    }
+
+    if (trav) {
+        /* comfortable enough */
+        goto out;
+    }
+
+    trav_offset = ra_offset;
+
+    cap = file->size ? file->size : ra_offset + ra_size;
+
+    while (trav_offset < min(ra_offset + ra_size, cap)) {
+        fault = 0;
+        ra_file_lock(file);
+        {
+            trav = ra_page_get(file, trav_offset);
+            if (!trav) {
+                fault = 1;
+                trav = ra_page_create(file, trav_offset);
+                if (trav)
+                    trav->dirty = 1;
+            }
+        }
+        ra_file_unlock(file);
+
+        if (!trav) {
+            /* OUT OF MEMORY */
+            break;
+        }
+
+        if (fault) {
+            gf_msg_trace(frame->this->name, 0, "RA at offset=%" PRId64,
+                         trav_offset);
+            ra_page_fault(file, frame, trav_offset);
+        }
+        trav_offset += file->page_size;
+    }
+
+out:
+    return;
+}
 
 int
-ra_open (call_frame_t *frame, xlator_t *this,
-         loc_t *loc, int32_t flags, fd_t *fd)
+ra_need_atime_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+                  int32_t op_ret, int32_t op_errno, struct iovec *vector,
+                  int32_t count, struct iatt *stbuf, struct iobref *iobref,
+                  dict_t *xdata)
 {
-	STACK_WIND (frame, ra_open_cbk,
-		    FIRST_CHILD (this),
-		    FIRST_CHILD (this)->fops->open,
-		    loc, flags, fd);
+    GF_ASSERT(frame);
+    STACK_DESTROY(frame->root);
+    return 0;
+}
 
-	return 0;
+static void
+dispatch_requests(call_frame_t *frame, ra_file_t *file)
+{
+    ra_local_t *local = NULL;
+    ra_conf_t *conf = NULL;
+    off_t rounded_offset = 0;
+    off_t rounded_end = 0;
+    off_t trav_offset = 0;
+    ra_page_t *trav = NULL;
+    call_frame_t *ra_frame = NULL;
+    char need_atime_update = 1;
+    char fault = 0;
+
+    GF_VALIDATE_OR_GOTO("read-ahead", frame, out);
+    GF_VALIDATE_OR_GOTO(frame->this->name, file, out);
+
+    local = frame->local;
+    conf = file->conf;
+
+    rounded_offset = gf_floor(local->offset, file->page_size);
+    rounded_end = gf_roof(local->offset + local->size, file->page_size);
+
+    trav_offset = rounded_offset;
+
+    while (trav_offset < rounded_end) {
+        fault = 0;
+
+        ra_file_lock(file);
+        {
+            trav = ra_page_get(file, trav_offset);
+            if (!trav) {
+                trav = ra_page_create(file, trav_offset);
+                if (!trav) {
+                    local->op_ret = -1;
+                    local->op_errno = ENOMEM;
+                    goto unlock;
+                }
+                fault = 1;
+                need_atime_update = 0;
+            }
+            trav->dirty = 0;
+
+            if (trav->ready) {
+                gf_msg_trace(frame->this->name, 0, "HIT at offset=%" PRId64 ".",
+                             trav_offset);
+                ra_frame_fill(trav, frame);
+            } else {
+                gf_msg_trace(frame->this->name, 0,
+                             "IN-TRANSIT at "
+                             "offset=%" PRId64 ".",
+                             trav_offset);
+                ra_wait_on_page(trav, frame);
+                need_atime_update = 0;
+            }
+        }
+    unlock:
+        ra_file_unlock(file);
+
+        if (local->op_ret == -1) {
+            goto out;
+        }
+
+        if (fault) {
+            gf_msg_trace(frame->this->name, 0, "MISS at offset=%" PRId64 ".",
+                         trav_offset);
+            ra_page_fault(file, frame, trav_offset);
+        }
+
+        trav_offset += file->page_size;
+    }
+
+    if (need_atime_update && conf->force_atime_update) {
+        /* TODO: use utimens() since readv() can confuse underlying
+           io-cache and others */
+        ra_frame = copy_frame(frame);
+        if (ra_frame == NULL) {
+            goto out;
+        }
+
+        STACK_WIND(ra_frame, ra_need_atime_cbk, FIRST_CHILD(frame->this),
+                   FIRST_CHILD(frame->this)->fops->readv, file->fd, 1, 1, 0,
+                   NULL);
+    }
+
+out:
+    return;
 }
 
 int
-ra_create (call_frame_t *frame, xlator_t *this,
-	   loc_t *loc, int32_t flags, mode_t mode, fd_t *fd)
+ra_readv_disabled_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+                      int32_t op_ret, int32_t op_errno, struct iovec *vector,
+                      int32_t count, struct iatt *stbuf, struct iobref *iobref,
+                      dict_t *xdata)
 {
-	STACK_WIND (frame, ra_create_cbk,
-		    FIRST_CHILD(this),
-		    FIRST_CHILD(this)->fops->create,
-		    loc, flags, mode, fd);
+    GF_ASSERT(frame);
 
-	return 0;
-}
+    STACK_UNWIND_STRICT(readv, frame, op_ret, op_errno, vector, count, stbuf,
+                        iobref, xdata);
 
-/* free cache pages between offset and offset+size,
-   does not touch pages with frames waiting on it
-*/
+    return 0;
+}
 
-static void
-flush_region (call_frame_t *frame,
-              ra_file_t *file,
-              off_t offset,
-              off_t size)
+int
+ra_readv(call_frame_t *frame, xlator_t *this, fd_t *fd, size_t size,
+         off_t offset, uint32_t flags, dict_t *xdata)
 {
-	ra_page_t *trav = NULL;
-	ra_page_t *next = NULL;
+    ra_file_t *file = NULL;
+    ra_local_t *local = NULL;
+    ra_conf_t *conf = NULL;
+    int op_errno = EINVAL;
+    char expected_offset = 1;
+    uint64_t tmp_file = 0;
 
+    GF_ASSERT(frame);
+    GF_VALIDATE_OR_GOTO(frame->this->name, this, unwind);
+    GF_VALIDATE_OR_GOTO(frame->this->name, fd, unwind);
 
-	ra_file_lock (file);
-	{
-		trav = file->pages.next;
-		while (trav != &file->pages
-		       && trav->offset < (offset + size)) {
+    conf = this->private;
 
-			next = trav->next;
-			if (trav->offset >= offset && !trav->waitq) {
-				ra_page_purge (trav);
-			}
-			trav = next;
-		}
-	}
-	ra_file_unlock (file);
-}
+    gf_msg_trace(this->name, 0,
+                 "NEW REQ at offset=%" PRId64 " for size=%" GF_PRI_SIZET "",
+                 offset, size);
 
+    fd_ctx_get(fd, this, &tmp_file);
+    file = (ra_file_t *)(long)tmp_file;
 
+    if (!file || file->disabled) {
+        goto disabled;
+    }
 
-int
-ra_release (xlator_t *this,
-	    fd_t *fd)
-{
-	uint64_t tmp_file = 0;
-	int      ret = 0;
+    if (file->offset != offset) {
+        gf_msg_trace(this->name, 0,
+                     "unexpected offset (%" PRId64 " != %" PRId64
+                     ") "
+                     "resetting",
+                     file->offset, offset);
 
-	ret = fd_ctx_del (fd, this, &tmp_file);
-	
-	if (!ret) {
-		ra_file_destroy ((ra_file_t *)(long)tmp_file);
-	}
+        expected_offset = file->expected = file->page_count = 0;
+    } else {
+        gf_msg_trace(this->name, 0,
+                     "expected offset (%" PRId64 ") when page_count=%d", offset,
+                     file->page_count);
 
-	return 0;
-}
+        if (file->expected < (file->page_size * conf->page_count)) {
+            file->expected += size;
+            file->page_count = min((file->expected / file->page_size),
+                                   conf->page_count);
+        }
+    }
 
+    if (!expected_offset) {
+        flush_region(frame, file, 0, file->pages.prev->offset + 1, 0);
+    }
 
-void
-read_ahead (call_frame_t *frame, ra_file_t *file)
-{
-	off_t      ra_offset = 0;
-	size_t     ra_size = 0;
-	off_t      trav_offset = 0;
-	ra_page_t *trav = NULL;
-	off_t      cap = 0;
-	char       fault = 0;
+    local = mem_get0(this->local_pool);
+    if (!local) {
+        op_errno = ENOMEM;
+        goto unwind;
+    }
 
-	if (!file->page_count)
-		return;
+    local->fd = fd;
+    local->offset = offset;
+    local->size = size;
+    local->wait_count = 1;
 
-	ra_size   = file->page_size * file->page_count;
-	ra_offset = floor (file->offset, file->page_size);
-	cap       = file->size ? file->size : file->offset + ra_size;
+    local->fill.next = &local->fill;
+    local->fill.prev = &local->fill;
 
-	while (ra_offset < min (file->offset + ra_size, cap)) {
+    pthread_mutex_init(&local->local_lock, NULL);
 
-		ra_file_lock (file);
-		{
-			trav = ra_page_get (file, ra_offset);
-		}
-		ra_file_unlock (file);
+    frame->local = local;
 
-		if (!trav)
-			break;
+    dispatch_requests(frame, file);
 
-		ra_offset += file->page_size;
-	}
+    flush_region(frame, file, 0, gf_floor(offset, file->page_size), 0);
 
-	if (trav)
-		/* comfortable enough */
-		return;
+    read_ahead(frame, file);
 
-	trav_offset = ra_offset;
+    file->offset = offset + size;
 
-	trav = file->pages.next;
-	cap  = file->size ? file->size : ra_offset + ra_size;
+    ra_frame_return(frame);
 
-	while (trav_offset < min(ra_offset + ra_size, cap)) {
-		fault = 0;
-		ra_file_lock (file);
-		{
-			trav = ra_page_get (file, trav_offset);
-			if (!trav) {
-				fault = 1;
-				trav = ra_page_create (file, trav_offset);
-				if (trav) 
-					trav->dirty = 1;
-			}
-		}
-		ra_file_unlock (file);
+    return 0;
 
-		if (!trav) {
-			/* OUT OF MEMORY */
-			break;
-		}
+unwind:
+    STACK_UNWIND_STRICT(readv, frame, -1, op_errno, NULL, 0, NULL, NULL, NULL);
 
-		if (fault) {
-			gf_log (frame->this->name, GF_LOG_DEBUG,
-				"RA at offset=%"PRId64, trav_offset);
-			ra_page_fault (file, frame, trav_offset);
-		}
-		trav_offset += file->page_size;
-	}
+    return 0;
 
-	return;
+disabled:
+    STACK_WIND(frame, ra_readv_disabled_cbk, FIRST_CHILD(frame->this),
+               FIRST_CHILD(frame->this)->fops->readv, fd, size, offset, flags,
+               xdata);
+    return 0;
 }
 
-
 int
-ra_need_atime_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
-                   int32_t op_ret, int32_t op_errno, struct iovec *vector,
-                   int32_t count, struct stat *stbuf)
+ra_flush_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret,
+             int32_t op_errno, dict_t *xdata)
 {
-	STACK_DESTROY (frame->root);
-	return 0;
+    GF_ASSERT(frame);
+    STACK_UNWIND_STRICT(flush, frame, op_ret, op_errno, xdata);
+    return 0;
 }
 
-
-static void
-dispatch_requests (call_frame_t *frame,
-                   ra_file_t *file)
-{
-	ra_local_t   *local = NULL;
-	ra_conf_t    *conf = NULL;
-	off_t         rounded_offset = 0;
-	off_t         rounded_end = 0;
-	off_t         trav_offset = 0;
-	ra_page_t    *trav = NULL;
-	call_frame_t *ra_frame = NULL;
-	char          need_atime_update = 1;
-	char          fault = 0;
-
-
-	local = frame->local;
-	conf  = file->conf;
-
-	rounded_offset = floor (local->offset, file->page_size);
-	rounded_end    = roof (local->offset + local->size, file->page_size);
-
-	trav_offset = rounded_offset;
-	trav        = file->pages.next;
-
-	while (trav_offset < rounded_end) {
-		fault = 0;
-
-		ra_file_lock (file);
-		{
-			trav = ra_page_get (file, trav_offset);
-			if (!trav) {
-				trav = ra_page_create (file, trav_offset);
-				fault = 1;
-				need_atime_update = 0;
-			}
-
-			if (!trav)
-				goto unlock;
-
-			if (trav->ready) {
-				gf_log (frame->this->name, GF_LOG_DEBUG,
-					"HIT at offset=%"PRId64".",
-					trav_offset);
-				ra_frame_fill (trav, frame);
-			} else {
-				gf_log (frame->this->name, GF_LOG_DEBUG,
-					"IN-TRANSIT at offset=%"PRId64".",
-					trav_offset);
-				ra_wait_on_page (trav, frame);
-				need_atime_update = 0;
-			}
-		}
-	unlock:
-		ra_file_unlock (file);
-
-		if (fault) {
-			gf_log (frame->this->name, GF_LOG_DEBUG,
-				"MISS at offset=%"PRId64".",
-				trav_offset);
-			ra_page_fault (file, frame, trav_offset);
-		}
-
-		trav_offset += file->page_size;
-	}
-
-	if (need_atime_update && conf->force_atime_update) {
-		/* TODO: use untimens() since readv() can confuse underlying
-		   io-cache and others */
-		ra_frame = copy_frame (frame);
-		STACK_WIND (ra_frame, ra_need_atime_cbk,
-			    FIRST_CHILD (frame->this), 
-			    FIRST_CHILD (frame->this)->fops->readv,
-			    file->fd, 1, 1);
-	}
-
-	return ;
+int
+ra_fsync_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret,
+             int32_t op_errno, struct iatt *prebuf, struct iatt *postbuf,
+             dict_t *xdata)
+{
+    GF_ASSERT(frame);
+    STACK_UNWIND_STRICT(fsync, frame, op_ret, op_errno, prebuf, postbuf, xdata);
+    return 0;
 }
 
-
 int
-ra_readv_disabled_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
-                       int32_t op_ret, int32_t op_errno,
-		       struct iovec *vector, int32_t count, struct stat *stbuf)
+ra_flush(call_frame_t *frame, xlator_t *this, fd_t *fd, dict_t *xdata)
 {
-	STACK_UNWIND (frame, op_ret, op_errno, vector, count, stbuf);
+    int32_t op_errno = EINVAL;
 
-	return 0;
-}
+    GF_ASSERT(frame);
+    GF_VALIDATE_OR_GOTO(frame->this->name, this, unwind);
+    GF_VALIDATE_OR_GOTO(frame->this->name, fd, unwind);
 
+    STACK_WIND(frame, ra_flush_cbk, FIRST_CHILD(this),
+               FIRST_CHILD(this)->fops->flush, fd, xdata);
+    return 0;
+
+unwind:
+    STACK_UNWIND_STRICT(flush, frame, -1, op_errno, NULL);
+    return 0;
+}
 
 int
-ra_readv (call_frame_t *frame, xlator_t *this,
-	  fd_t *fd, size_t size, off_t offset)
+ra_fsync(call_frame_t *frame, xlator_t *this, fd_t *fd, int32_t datasync,
+         dict_t *xdata)
 {
-	ra_file_t    *file = NULL;
-	ra_local_t   *local = NULL;
-	ra_conf_t    *conf = NULL;
-	int           op_errno = 0;
-	int           ret = 0;
-	char expected_offset = 1;
-	uint64_t tmp_file = 0;
+    int32_t op_errno = EINVAL;
 
-	conf = this->private;
+    GF_ASSERT(frame);
+    GF_VALIDATE_OR_GOTO(frame->this->name, this, unwind);
+    GF_VALIDATE_OR_GOTO(frame->this->name, fd, unwind);
 
-	gf_log (this->name, GF_LOG_DEBUG,
-		"NEW REQ at offset=%"PRId64" for size=%"GF_PRI_SIZET"",
-		offset, size);
+    STACK_WIND(frame, ra_fsync_cbk, FIRST_CHILD(this),
+               FIRST_CHILD(this)->fops->fsync, fd, datasync, xdata);
+    return 0;
 
-	ret = fd_ctx_get (fd, this, &tmp_file);
-	file = (ra_file_t *)(long)tmp_file;
-
-	if (file->offset != offset) {
-		gf_log (this->name, GF_LOG_DEBUG,
-			"unexpected offset (%"PRId64" != %"PRId64") resetting",
-			file->offset, offset);
+unwind:
+    STACK_UNWIND_STRICT(fsync, frame, -1, op_errno, NULL, NULL, NULL);
+    return 0;
+}
 
-		expected_offset = file->expected = file->page_count = 0;
-	} else {
-		gf_log (this->name, GF_LOG_DEBUG,
-			"expected offset (%"PRId64") when page_count=%d",
-			offset, file->page_count);
+int
+ra_writev_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret,
+              int32_t op_errno, struct iatt *prebuf, struct iatt *postbuf,
+              dict_t *xdata)
+{
+    ra_file_t *file = NULL;
 
-		if (file->expected < (conf->page_size * conf->page_count)) {
-			file->expected += size;
-			file->page_count = min ((file->expected / file->page_size),
-						conf->page_count);
-		}
-	}
+    GF_ASSERT(frame);
 
-	if (!expected_offset) {
-		flush_region (frame, file, 0, file->pages.prev->offset + 1);
-	}
+    file = frame->local;
 
-	if (file->disabled) {
-		STACK_WIND (frame, ra_readv_disabled_cbk,
-			    FIRST_CHILD (frame->this), 
-			    FIRST_CHILD (frame->this)->fops->readv,
-			    file->fd, size, offset);
-		return 0;
-	}
+    if (file) {
+        flush_region(frame, file, 0, file->pages.prev->offset + 1, 1);
+    }
 
-	local = (void *) CALLOC (1, sizeof (*local));
-	if (!local) {
-		gf_log (this->name, GF_LOG_ERROR,
-			"out of memory :(");
-		op_errno = ENOMEM;
-		goto unwind;
-	}
+    frame->local = NULL;
+    STACK_UNWIND_STRICT(writev, frame, op_ret, op_errno, prebuf, postbuf,
+                        xdata);
+    return 0;
+}
 
-	local->fd         = fd;
-	local->offset     = offset;
-	local->size       = size;
-	local->wait_count = 1;
+int
+ra_writev(call_frame_t *frame, xlator_t *this, fd_t *fd, struct iovec *vector,
+          int32_t count, off_t offset, uint32_t flags, struct iobref *iobref,
+          dict_t *xdata)
+{
+    ra_file_t *file = NULL;
+    uint64_t tmp_file = 0;
+    int32_t op_errno = EINVAL;
+    inode_t *inode = NULL;
+    fd_t *iter_fd = NULL;
 
-	local->fill.next  = &local->fill;
-	local->fill.prev  = &local->fill;
+    GF_ASSERT(frame);
+    GF_VALIDATE_OR_GOTO(frame->this->name, this, unwind);
+    GF_VALIDATE_OR_GOTO(frame->this->name, fd, unwind);
 
-	pthread_mutex_init (&local->local_lock, NULL);
+    inode = fd->inode;
 
-	frame->local = local;
+    LOCK(&inode->lock);
+    {
+        list_for_each_entry(iter_fd, &inode->fd_list, inode_list)
+        {
+            tmp_file = 0;
+            fd_ctx_get(iter_fd, this, &tmp_file);
+            file = (ra_file_t *)(long)tmp_file;
 
-	dispatch_requests (frame, file);
+            if (!file)
+                continue;
 
-	flush_region (frame, file, 0, floor (offset, file->page_size));
+            if (iter_fd == fd)
+                frame->local = file;
 
-	read_ahead (frame, file);
+            flush_region(frame, file, 0, file->pages.prev->offset + 1, 1);
 
-	ra_frame_return (frame);
+            /* reset the read-ahead counters too */
+            file->expected = file->page_count = 0;
+        }
+    }
+    UNLOCK(&inode->lock);
 
-	file->offset = offset + size;
+    STACK_WIND(frame, ra_writev_cbk, FIRST_CHILD(this),
+               FIRST_CHILD(this)->fops->writev, fd, vector, count, offset,
+               flags, iobref, xdata);
 
-	return 0;
+    return 0;
 
 unwind:
-	STACK_UNWIND (frame, -1, op_errno, NULL, 0, NULL);
-
-	return 0;
+    STACK_UNWIND_STRICT(writev, frame, -1, op_errno, NULL, NULL, NULL);
+    return 0;
 }
 
-
 int
-ra_flush_cbk (call_frame_t *frame,
-              void *cookie,
-              xlator_t *this,
-              int32_t op_ret,
-              int32_t op_errno)
+ra_truncate_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+                int32_t op_ret, int32_t op_errno, struct iatt *prebuf,
+                struct iatt *postbuf, dict_t *xdata)
 {
-	STACK_UNWIND (frame, op_ret, op_errno);
-	return 0;
-}
+    GF_ASSERT(frame);
 
+    STACK_UNWIND_STRICT(truncate, frame, op_ret, op_errno, prebuf, postbuf,
+                        xdata);
+    return 0;
+}
 
 int
-ra_flush (call_frame_t *frame, xlator_t *this, fd_t *fd)
+ra_attr_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret,
+            int32_t op_errno, struct iatt *buf, dict_t *xdata)
 {
-	ra_file_t *file = NULL;
-	int        ret = 0;
-	uint64_t tmp_file = 0;
+    GF_ASSERT(frame);
 
-	ret = fd_ctx_get (fd, this, &tmp_file);
-	file = (ra_file_t *)(long)tmp_file;
+    STACK_UNWIND_STRICT(stat, frame, op_ret, op_errno, buf, xdata);
+    return 0;
+}
 
-	if (file) {
-		flush_region (frame, file, 0, file->pages.prev->offset+1);
-	}
+int
+ra_truncate(call_frame_t *frame, xlator_t *this, loc_t *loc, off_t offset,
+            dict_t *xdata)
+{
+    ra_file_t *file = NULL;
+    fd_t *iter_fd = NULL;
+    inode_t *inode = NULL;
+    uint64_t tmp_file = 0;
+    int32_t op_errno = EINVAL;
+
+    GF_ASSERT(frame);
+    GF_VALIDATE_OR_GOTO(frame->this->name, this, unwind);
+    GF_VALIDATE_OR_GOTO(frame->this->name, loc, unwind);
+
+    inode = loc->inode;
+
+    LOCK(&inode->lock);
+    {
+        list_for_each_entry(iter_fd, &inode->fd_list, inode_list)
+        {
+            tmp_file = 0;
+            fd_ctx_get(iter_fd, this, &tmp_file);
+            file = (ra_file_t *)(long)tmp_file;
+
+            if (!file)
+                continue;
+            /*
+             * Truncation invalidates reads just like writing does.
+             * TBD: this seems to flush more than it should.  The
+             * only time we should flush at all is when we're
+             * shortening (not lengthening) the file, and then only
+             * from new EOF to old EOF.  The same problem exists in
+             * ra_ftruncate.
+             */
+            flush_region(frame, file, 0, file->pages.prev->offset + 1, 1);
+        }
+    }
+    UNLOCK(&inode->lock);
+
+    STACK_WIND(frame, ra_truncate_cbk, FIRST_CHILD(this),
+               FIRST_CHILD(this)->fops->truncate, loc, offset, xdata);
+    return 0;
 
-	STACK_WIND (frame, ra_flush_cbk,
-		    FIRST_CHILD (this),
-		    FIRST_CHILD (this)->fops->flush,
-		    fd);
-	return 0;
+unwind:
+    STACK_UNWIND_STRICT(truncate, frame, -1, op_errno, NULL, NULL, NULL);
+    return 0;
 }
 
-
-int
-ra_fsync (call_frame_t *frame, xlator_t *this, fd_t *fd,
-          int32_t datasync)
+void
+ra_page_dump(struct ra_page *page)
 {
-	ra_file_t *file = NULL;
-	int        ret = 0;
-	uint64_t tmp_file = 0;
+    int i = 0;
+    call_frame_t *frame = NULL;
+    char key[GF_DUMP_MAX_BUF_LEN] = {
+        0,
+    };
+    ra_waitq_t *trav = NULL;
 
-	ret = fd_ctx_get (fd, this, &tmp_file);
-	file = (ra_file_t *)(long)tmp_file;
+    if (page == NULL) {
+        goto out;
+    }
 
-	if (file) {
-		flush_region (frame, file, 0, file->pages.prev->offset+1);
-	}
+    gf_proc_dump_write("offset", "%" PRId64, page->offset);
 
-	STACK_WIND (frame, ra_flush_cbk,
-		    FIRST_CHILD (this),
-		    FIRST_CHILD (this)->fops->fsync,
-		    fd, datasync);
-	return 0;
-}
+    gf_proc_dump_write("size", "%" GF_PRI_SIZET, page->size);
 
+    gf_proc_dump_write("dirty", "%s", page->dirty ? "yes" : "no");
 
-int
-ra_writev_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
-               int32_t op_ret, int32_t op_errno, struct stat *stbuf)
+    gf_proc_dump_write("poisoned", "%s", page->poisoned ? "yes" : "no");
+
+    gf_proc_dump_write("ready", "%s", page->ready ? "yes" : "no");
+
+    for (trav = page->waitq; trav; trav = trav->next) {
+        frame = trav->data;
+        snprintf(key, sizeof(key), "waiting-frame[%d]", i++);
+        gf_proc_dump_write(key, "%" PRIu64, frame->root->unique);
+    }
+
+out:
+    return;
+}
+
+int32_t
+ra_fdctx_dump(xlator_t *this, fd_t *fd)
 {
-	fd_t      *fd = NULL;
-	ra_file_t *file = NULL;
-	int        ret = 0;
-	uint64_t tmp_file = 0;
+    ra_file_t *file = NULL;
+    ra_page_t *page = NULL;
+    int32_t ret = 0, i = 0;
+    uint64_t tmp_file = 0;
+    char *path = NULL;
+    char key_prefix[GF_DUMP_MAX_BUF_LEN] = {
+        0,
+    };
 
-	fd = frame->local;
+    fd_ctx_get(fd, this, &tmp_file);
+    file = (ra_file_t *)(long)tmp_file;
 
-	ret = fd_ctx_get (fd, this, &tmp_file);
-	file = (ra_file_t *)(long)tmp_file;
+    if (file == NULL) {
+        ret = 0;
+        goto out;
+    }
 
-	if (file) {
-		flush_region (frame, file, 0, file->pages.prev->offset+1);
-	}
+    gf_proc_dump_build_key(key_prefix, "xlator.performance.read-ahead", "file");
 
-	frame->local = NULL;
-	STACK_UNWIND (frame, op_ret, op_errno, stbuf);
-	return 0;
-}
+    gf_proc_dump_add_section("%s", key_prefix);
 
+    ret = __inode_path(fd->inode, NULL, &path);
+    if (path != NULL) {
+        gf_proc_dump_write("path", "%s", path);
+        GF_FREE(path);
+    }
 
-int
-ra_writev (call_frame_t *frame, xlator_t *this, fd_t *fd,
-           struct iovec *vector, int32_t count, off_t offset)
-{
-	ra_file_t *file = NULL;
-	int        ret = 0;
-	uint64_t tmp_file = 0;
+    gf_proc_dump_write("fd", "%p", (void *)fd);
 
-	ret = fd_ctx_get (fd, this, &tmp_file);
-	file = (ra_file_t *)(long)tmp_file;
+    gf_proc_dump_write("disabled", "%s", file->disabled ? "yes" : "no");
 
-	if (file) {
-		flush_region (frame, file, 0, file->pages.prev->offset+1);
+    if (file->disabled) {
+        ret = 0;
+        goto out;
+    }
 
-		/* reset the read-ahead counters too */
-		file->expected = file->page_count = 0;
-	}
+    gf_proc_dump_write("page-size", "%" PRIu64, file->page_size);
 
-	frame->local = fd;
+    gf_proc_dump_write("page-count", "%u", file->page_count);
 
-	STACK_WIND (frame, ra_writev_cbk,
-		    FIRST_CHILD(this),
-		    FIRST_CHILD(this)->fops->writev,
-		    fd, vector, count, offset);
+    gf_proc_dump_write("next-expected-offset-for-sequential-reads", "%" PRId64,
+                       file->offset);
 
-	return 0;
-}
+    for (page = file->pages.next; page != &file->pages; page = page->next) {
+        gf_proc_dump_write("page", "%d: %p", i++, (void *)page);
+        ra_page_dump(page);
+    }
 
+    ret = 0;
+out:
+    return ret;
+}
 
 int
-ra_attr_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
-	     int32_t op_ret, int32_t op_errno, struct stat *buf)
+ra_fstat(call_frame_t *frame, xlator_t *this, fd_t *fd, dict_t *xdata)
 {
-	STACK_UNWIND (frame, op_ret, op_errno, buf);
-	return 0;
+    ra_file_t *file = NULL;
+    fd_t *iter_fd = NULL;
+    inode_t *inode = NULL;
+    uint64_t tmp_file = 0;
+    int32_t op_errno = EINVAL;
+    ra_conf_t *conf = NULL;
+
+    GF_ASSERT(frame);
+    GF_VALIDATE_OR_GOTO(frame->this->name, this, unwind);
+    GF_VALIDATE_OR_GOTO(frame->this->name, fd, unwind);
+
+    /* Read this->private only after 'this' has been validated above. */
+    conf = this->private;
+    inode = fd->inode;
+
+    if (conf->force_atime_update) {
+        LOCK(&inode->lock);
+        {
+            list_for_each_entry(iter_fd, &inode->fd_list, inode_list)
+            {
+                tmp_file = 0;
+                fd_ctx_get(iter_fd, this, &tmp_file);
+                file = (ra_file_t *)(long)tmp_file;
+
+                if (!file)
+                    continue;
+                flush_region(frame, file, 0, file->pages.prev->offset + 1, 0);
+            }
+        }
+        UNLOCK(&inode->lock);
+    }
+
+    STACK_WIND(frame, ra_attr_cbk, FIRST_CHILD(this),
+               FIRST_CHILD(this)->fops->fstat, fd, xdata);
+    return 0;
+
+unwind:
+    STACK_UNWIND_STRICT(stat, frame, -1, op_errno, NULL, NULL);
+    return 0;
 }
 
+int
+ra_ftruncate(call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset,
+             dict_t *xdata)
+{
+    ra_file_t *file = NULL;
+    fd_t *iter_fd = NULL;
+    inode_t *inode = NULL;
+    uint64_t tmp_file = 0;
+    int32_t op_errno = EINVAL;
+
+    GF_ASSERT(frame);
+    GF_VALIDATE_OR_GOTO(frame->this->name, this, unwind);
+    GF_VALIDATE_OR_GOTO(frame->this->name, fd, unwind);
+
+    inode = fd->inode;
+
+    LOCK(&inode->lock);
+    {
+        list_for_each_entry(iter_fd, &inode->fd_list, inode_list)
+        {
+            tmp_file = 0;
+            fd_ctx_get(iter_fd, this, &tmp_file);
+            file = (ra_file_t *)(long)tmp_file;
+            if (!file)
+                continue;
+            /*
+             * Truncation invalidates reads just like writing does.
+             * TBD: this seems to flush more than it should.  The
+             * only time we should flush at all is when we're
+             * shortening (not lengthening) the file, and then only
+             * from new EOF to old EOF.  The same problem exists in
+             * ra_truncate.
+             */
+            flush_region(frame, file, 0, file->pages.prev->offset + 1, 1);
+        }
+    }
+    UNLOCK(&inode->lock);
+
+    STACK_WIND(frame, ra_truncate_cbk, FIRST_CHILD(this),
+               FIRST_CHILD(this)->fops->ftruncate, fd, offset, xdata);
+    return 0;
+
+unwind:
+    STACK_UNWIND_STRICT(truncate, frame, -1, op_errno, NULL, NULL, NULL);
+    return 0;
+}
 
 int
-ra_truncate (call_frame_t *frame, xlator_t *this,
-             loc_t *loc, off_t offset)
+ra_discard_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+               int32_t op_ret, int32_t op_errno, struct iatt *prebuf,
+               struct iatt *postbuf, dict_t *xdata)
 {
-	ra_file_t *file = NULL;
-	fd_t      *iter_fd = NULL;
-	inode_t   *inode = NULL;
-	int        ret = 0;
-	uint64_t tmp_file = 0;
+    GF_ASSERT(frame);
+
+    STACK_UNWIND_STRICT(discard, frame, op_ret, op_errno, prebuf, postbuf,
+                        xdata);
+    return 0;
+}
 
-	inode = loc->inode;
+static int
+ra_discard(call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset,
+           size_t len, dict_t *xdata)
+{
+    ra_file_t *file = NULL;
+    fd_t *iter_fd = NULL;
+    inode_t *inode = NULL;
+    uint64_t tmp_file = 0;
+    int32_t op_errno = EINVAL;
+
+    GF_ASSERT(frame);
+    GF_VALIDATE_OR_GOTO(frame->this->name, this, unwind);
+    GF_VALIDATE_OR_GOTO(frame->this->name, fd, unwind);
+
+    inode = fd->inode;
+
+    LOCK(&inode->lock);
+    {
+        list_for_each_entry(iter_fd, &inode->fd_list, inode_list)
+        {
+            tmp_file = 0;
+            fd_ctx_get(iter_fd, this, &tmp_file);
+            file = (ra_file_t *)(long)tmp_file;
+            if (!file)
+                continue;
+
+            flush_region(frame, file, offset, len, 1);
+        }
+    }
+    UNLOCK(&inode->lock);
+
+    STACK_WIND(frame, ra_discard_cbk, FIRST_CHILD(this),
+               FIRST_CHILD(this)->fops->discard, fd, offset, len, xdata);
+    return 0;
 
-	LOCK (&inode->lock);
-	{
-		list_for_each_entry (iter_fd, &inode->fd_list, inode_list) {
-			ret = fd_ctx_get (iter_fd, this, &tmp_file);
-			file = (ra_file_t *)(long)tmp_file;
+unwind:
+    STACK_UNWIND_STRICT(discard, frame, -1, op_errno, NULL, NULL, NULL);
+    return 0;
+}
 
-			if (!file)
-				continue;
-			flush_region (frame, file, 0,
-				      file->pages.prev->offset + 1);
-		}
-	}
-	UNLOCK (&inode->lock);
+int
+ra_zerofill_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+                int32_t op_ret, int32_t op_errno, struct iatt *prebuf,
+                struct iatt *postbuf, dict_t *xdata)
+{
+    GF_ASSERT(frame);
 
-	STACK_WIND (frame, ra_attr_cbk,
-		    FIRST_CHILD (this),
-		    FIRST_CHILD (this)->fops->truncate,
-		    loc, offset);
-	return 0;
+    STACK_UNWIND_STRICT(zerofill, frame, op_ret, op_errno, prebuf, postbuf,
+                        xdata);
+    return 0;
 }
 
+static int
+ra_zerofill(call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset,
+            off_t len, dict_t *xdata)
+{
+    ra_file_t *file = NULL;
+    fd_t *iter_fd = NULL;
+    inode_t *inode = NULL;
+    uint64_t tmp_file = 0;
+    int32_t op_errno = EINVAL;
+
+    GF_ASSERT(frame);
+    GF_VALIDATE_OR_GOTO(frame->this->name, this, unwind);
+    GF_VALIDATE_OR_GOTO(frame->this->name, fd, unwind);
+
+    inode = fd->inode;
+
+    LOCK(&inode->lock);
+    {
+        list_for_each_entry(iter_fd, &inode->fd_list, inode_list)
+        {
+            tmp_file = 0;
+            fd_ctx_get(iter_fd, this, &tmp_file);
+            file = (ra_file_t *)(long)tmp_file;
+            if (!file)
+                continue;
+
+            flush_region(frame, file, offset, len, 1);
+        }
+    }
+    UNLOCK(&inode->lock);
+
+    STACK_WIND(frame, ra_zerofill_cbk, FIRST_CHILD(this),
+               FIRST_CHILD(this)->fops->zerofill, fd, offset, len, xdata);
+    return 0;
+
+unwind:
+    STACK_UNWIND_STRICT(zerofill, frame, -1, op_errno, NULL, NULL, NULL);
+    return 0;
+}
 
 int
-ra_fstat (call_frame_t *frame, xlator_t *this,
-	  fd_t *fd)
+ra_priv_dump(xlator_t *this)
 {
-	ra_file_t *file = NULL;
-	fd_t      *iter_fd = NULL;
-	inode_t   *inode = NULL;
-	int        ret = 0;
-	uint64_t tmp_file = 0;
+    ra_conf_t *conf = NULL;
+    int ret = -1;
+    char key_prefix[GF_DUMP_MAX_BUF_LEN] = {
+        0,
+    };
+
+    if (!this) {
+        goto out;
+    }
+
+    conf = this->private;
+    if (!conf) {
+        gf_msg(this->name, GF_LOG_WARNING, 0, READ_AHEAD_MSG_XLATOR_CONF_NULL,
+               "conf null in xlator");
+        goto out;
+    }
+
+    gf_proc_dump_build_key(key_prefix, "xlator.performance.read-ahead", "priv");
+
+    gf_proc_dump_add_section("%s", key_prefix);
+
+    ret = pthread_mutex_trylock(&conf->conf_lock);
+    if (ret)
+        goto out;
+    {
+        gf_proc_dump_write("page_size", "%" PRIu64, conf->page_size);
+        gf_proc_dump_write("page_count", "%d", conf->page_count);
+        gf_proc_dump_write("force_atime_update", "%d",
+                           conf->force_atime_update);
+    }
+    pthread_mutex_unlock(&conf->conf_lock);
+
+    ret = 0;
+out:
+    if (ret && conf) {
+        gf_proc_dump_write("Unable to dump priv",
+                           "(Lock acquisition failed) %s", this->name);
+    }
+    return ret;
+}
+
+int32_t
+mem_acct_init(xlator_t *this)
+{
+    int ret = -1;
 
-	inode = fd->inode;
+    if (!this) {
+        goto out;
+    }
 
-	LOCK (&inode->lock);
-	{
-		list_for_each_entry (iter_fd, &inode->fd_list, inode_list) {
-			ret = fd_ctx_get (iter_fd, this, &tmp_file);
-			file = (ra_file_t *)(long)tmp_file;
+    ret = xlator_mem_acct_init(this, gf_ra_mt_end + 1);
 
-			if (!file)
-				continue;
-			flush_region (frame, file, 0,
-				      file->pages.prev->offset + 1);
-		}
-	}
-	UNLOCK (&inode->lock);
+    if (ret != 0) {
+        gf_msg(this->name, GF_LOG_ERROR, ENOMEM, READ_AHEAD_MSG_NO_MEMORY,
+               "Memory accounting init "
+               "failed");
+    }
 
-	STACK_WIND (frame, ra_attr_cbk,
-		    FIRST_CHILD (this),
-		    FIRST_CHILD (this)->fops->fstat,
-		    fd);
-	return 0;
+out:
+    return ret;
 }
 
-
 int
-ra_fchown (call_frame_t *frame, xlator_t *this,
-	   fd_t *fd, uid_t uid, gid_t gid)
+reconfigure(xlator_t *this, dict_t *options)
 {
-	ra_file_t *file = NULL;
-	fd_t      *iter_fd = NULL;
-	inode_t   *inode = NULL;
-	int        ret = 0;
-	uint64_t tmp_file = 0;
+    ra_conf_t *conf = NULL;
+    int ret = -1;
 
-	inode = fd->inode;
+    GF_VALIDATE_OR_GOTO("read-ahead", this, out);
+    GF_VALIDATE_OR_GOTO("read-ahead", this->private, out);
 
-	LOCK (&inode->lock);
-	{
-		list_for_each_entry (iter_fd, &inode->fd_list, inode_list) {
-			ret = fd_ctx_get (iter_fd, this, &tmp_file);
-			file = (ra_file_t *)(long)tmp_file;
+    conf = this->private;
 
-			if (!file)
-				continue;
-			flush_region (frame, file, 0,
-				      file->pages.prev->offset + 1);
-		}
-	}
-	UNLOCK (&inode->lock);
+    GF_OPTION_RECONF("page-count", conf->page_count, options, uint32, out);
 
-	STACK_WIND (frame, ra_attr_cbk,
-		    FIRST_CHILD (this),
-		    FIRST_CHILD (this)->fops->fchown,
-		    fd, uid, gid);
-	return 0;
-}
+    GF_OPTION_RECONF("page-size", conf->page_size, options, size_uint64, out);
 
+    GF_OPTION_RECONF("pass-through", this->pass_through, options, bool, out);
 
-int
-ra_ftruncate (call_frame_t *frame, xlator_t *this,
-              fd_t *fd, off_t offset)
-{
-	ra_file_t *file = NULL;
-	fd_t      *iter_fd = NULL;
-	inode_t   *inode = NULL;
-	int        ret = 0;
-	uint64_t tmp_file = 0;
-
-	inode = fd->inode;
-
-	LOCK (&inode->lock);
-	{
-		list_for_each_entry (iter_fd, &inode->fd_list, inode_list) {
-			ret = fd_ctx_get (iter_fd, this, &tmp_file);
-			file = (ra_file_t *)(long)tmp_file;
-			if (!file)
-				continue;
-			flush_region (frame, file, 0,
-				      file->pages.prev->offset + 1);
-		}
-	}
-	UNLOCK (&inode->lock);
-
-	STACK_WIND (frame, ra_attr_cbk,
-		    FIRST_CHILD (this),
-		    FIRST_CHILD (this)->fops->ftruncate,
-		    fd, offset);
-	return 0;
+    ret = 0;
+out:
+    return ret;
 }
 
-
 int
-init (xlator_t *this)
-{
-	ra_conf_t *conf;
-	dict_t *options = this->options;
-	char *page_size_string = NULL;
-	char *page_count_string = NULL;
-
-	if (!this->children || this->children->next) {
-		gf_log (this->name,  GF_LOG_ERROR,
-			"FATAL: read-ahead not configured with exactly one child");
-		return -1;
-	}
-
-	if (!this->parents) {
-		gf_log (this->name, GF_LOG_WARNING,
-			"dangling volume. check volfile ");
-	}
- 
-	conf = (void *) CALLOC (1, sizeof (*conf));
-	ERR_ABORT (conf);
-	conf->page_size = 256 * 1024;
-	conf->page_count = 2;
-
-	if (dict_get (options, "page-size"))
-		page_size_string = data_to_str (dict_get (options,
-							  "page-size"));
-	if (page_size_string)
-	{
-		if (gf_string2bytesize (page_size_string, &conf->page_size) != 0)
-		{
-			gf_log ("read-ahead", 
-				GF_LOG_ERROR, 
-				"invalid number format \"%s\" of \"option page-size\"", 
-				page_size_string);
-			return -1;
-		}
-      
-		gf_log (this->name, GF_LOG_DEBUG, "Using conf->page_size = %"PRIu64"",
-			conf->page_size);
-	}
-  
-	if (dict_get (options, "page-count"))
-		page_count_string = data_to_str (dict_get (options, 
-							   "page-count"));
-	if (page_count_string)
-	{
-		if (gf_string2uint_base10 (page_count_string, &conf->page_count) != 0)
-		{
-			gf_log ("read-ahead", 
-				GF_LOG_ERROR, 
-				"invalid number format \"%s\" of \"option page-count\"", 
-				page_count_string);
-			return -1;
-		}
-		gf_log (this->name, GF_LOG_DEBUG, "Using conf->page_count = %u",
-			conf->page_count);
-	}
-  
-	if (dict_get (options, "force-atime-update")) {
-		char *force_atime_update_str = data_to_str (dict_get (options,
-								      "force-atime-update"));
-		if (gf_string2boolean (force_atime_update_str, &conf->force_atime_update) == -1) {
-			gf_log (this->name, GF_LOG_ERROR,
-				"'force-atime-update' takes only boolean options");
-			return -1;
-		}
-		if (conf->force_atime_update)
-			gf_log (this->name, GF_LOG_DEBUG, "Forcing atime updates on cache hit");
-	}
-
-	conf->files.next = &conf->files;
-	conf->files.prev = &conf->files;
-
-	pthread_mutex_init (&conf->conf_lock, NULL);
-	this->private = conf;
-	return 0;
+init(xlator_t *this)
+{
+    ra_conf_t *conf = NULL;
+    int32_t ret = -1;
+
+    GF_VALIDATE_OR_GOTO("read-ahead", this, out);
+
+    if (!this->children || this->children->next) {
+        gf_msg(this->name, GF_LOG_ERROR, 0,
+               READ_AHEAD_MSG_XLATOR_CHILD_MISCONFIGURED,
+               "FATAL: read-ahead not configured with exactly one"
+               " child");
+        goto out;
+    }
+
+    if (!this->parents) {
+        gf_msg(this->name, GF_LOG_WARNING, 0, READ_AHEAD_MSG_VOL_MISCONFIGURED,
+               "dangling volume. check volfile ");
+    }
+
+    conf = (void *)GF_CALLOC(1, sizeof(*conf), gf_ra_mt_ra_conf_t);
+    if (conf == NULL) {
+        goto out;
+    }
+
+    conf->page_size = this->ctx->page_size;
+
+    GF_OPTION_INIT("page-size", conf->page_size, size_uint64, out);
+
+    GF_OPTION_INIT("page-count", conf->page_count, uint32, out);
+
+    GF_OPTION_INIT("force-atime-update", conf->force_atime_update, bool, out);
+
+    GF_OPTION_INIT("pass-through", this->pass_through, bool, out);
+
+    conf->files.next = &conf->files;
+    conf->files.prev = &conf->files;
+
+    pthread_mutex_init(&conf->conf_lock, NULL);
+
+    this->local_pool = mem_pool_new(ra_local_t, 64);
+    if (!this->local_pool) {
+        ret = -1;
+        gf_msg(this->name, GF_LOG_ERROR, ENOMEM, READ_AHEAD_MSG_NO_MEMORY,
+               "failed to create local_t's memory pool");
+        goto out;
+    }
+
+    this->private = conf;
+    ret = 0;
+
+out:
+    if (ret == -1) {
+        GF_FREE(conf);
+    }
+
+    return ret;
 }
 
 void
-fini (xlator_t *this)
+fini(xlator_t *this)
 {
-	ra_conf_t *conf = this->private;
-
-	pthread_mutex_destroy (&conf->conf_lock);
-	FREE (conf);
-
-	this->private = NULL;
-	return;
+    ra_conf_t *conf = NULL;
+
+    GF_VALIDATE_OR_GOTO("read-ahead", this, out);
+
+    conf = this->private;
+    if (conf == NULL) {
+        goto out;
+    }
+
+    this->private = NULL;
+
+    /* The file structures allocated in open and create are not freed here.
+     * Until they are, the assert below is replaced by the log message after it:
+    GF_ASSERT ((conf->files.next == &conf->files)
+               && (conf->files.prev == &conf->files));
+    */
+    if (!((conf->files.next == &conf->files) &&
+          (conf->files.prev == &conf->files))) {
+        gf_msg(this->name, GF_LOG_INFO, 0,
+               READ_AHEAD_MSG_UNDESTROYED_FILE_FOUND,
+               "undestroyed read ahead file structures found");
+    }
+
+    pthread_mutex_destroy(&conf->conf_lock);
+    GF_FREE(conf);
+
+out:
+    return;
 }
 
 struct xlator_fops fops = {
-	.open        = ra_open,
-	.create      = ra_create,
-	.readv       = ra_readv,
-	.writev      = ra_writev,
-	.flush       = ra_flush,
-	.fsync       = ra_fsync,
-	.truncate    = ra_truncate,
-	.ftruncate   = ra_ftruncate,
-	.fstat       = ra_fstat,
-	.fchown      = ra_fchown,
+    .open = ra_open,
+    .create = ra_create,
+    .readv = ra_readv,
+    .writev = ra_writev,
+    .flush = ra_flush,
+    .fsync = ra_fsync,
+    .truncate = ra_truncate,
+    .ftruncate = ra_ftruncate,
+    .fstat = ra_fstat,
+    .discard = ra_discard,
+    .zerofill = ra_zerofill,
 };
 
-struct xlator_mops mops = {
+struct xlator_cbks cbks = {
+    .release = ra_release,
 };
 
-struct xlator_cbks cbks = {
-	.release       = ra_release,
+struct xlator_dumpops dumpops = {
+    .priv = ra_priv_dump,
+    .fdctx = ra_fdctx_dump,
 };
 
 struct volume_options options[] = {
-	{ .key  = {"force-atime-update"}, 
-	  .type = GF_OPTION_TYPE_BOOL 
-	},
-	{ .key  = {"page-size"}, 
-	  .type = GF_OPTION_TYPE_SIZET, 
-	  .min  = 64 * GF_UNIT_KB, 
-	  .max  = 2 * GF_UNIT_MB 
-	},
-	{ .key  = {"page-count"}, 
-	  .type = GF_OPTION_TYPE_INT, 
-	  .min  = 1, 
-	  .max  = 16 
-	},
-	{ .key = {NULL} },
+    {
+        .key = {"read-ahead"},
+        .type = GF_OPTION_TYPE_BOOL,
+        .default_value = "off",
+        .description = "enable/disable read-ahead",
+        .op_version = {GD_OP_VERSION_6_0},
+        .flags = OPT_FLAG_SETTABLE,
+    },
+    {.key = {"force-atime-update"},
+     .type = GF_OPTION_TYPE_BOOL,
+     .op_version = {1},
+     .tags = {"read-ahead"},
+     .default_value = "false"},
+    {.key = {"page-count"},
+     .type = GF_OPTION_TYPE_INT,
+     .min = 1,
+     .max = 16,
+     .default_value = "4",
+     .op_version = {1},
+     .tags = {"read-ahead"},
+     .description = "Number of pages that will be pre-fetched"},
+    {.key = {"page-size"},
+     .type = GF_OPTION_TYPE_SIZET,
+     .min = 4096,
+     .max = 1048576 * 64,
+     .default_value = "131072",
+     .op_version = {1},
+     .flags = OPT_FLAG_SETTABLE | OPT_FLAG_DOC | OPT_FLAG_CLIENT_OPT,
+     .tags = {"read-ahead"},
+     .description = "Page size with which read-ahead performs server I/O"},
+    {.key = {"pass-through"},
+     .type = GF_OPTION_TYPE_BOOL,
+     .default_value = "false",
+     .op_version = {GD_OP_VERSION_4_1_0},
+     .flags = OPT_FLAG_SETTABLE | OPT_FLAG_DOC | OPT_FLAG_CLIENT_OPT,
+     .tags = {"read-ahead"},
+     .description = "Enable/Disable read ahead translator"},
+    {.key = {NULL}},
+};
+
+xlator_api_t xlator_api = {
+    .init = init,
+    .fini = fini,
+    .reconfigure = reconfigure,
+    .mem_acct_init = mem_acct_init,
+    .op_version = {1}, /* Present from the initial version */
+    .dumpops = &dumpops,
+    .fops = &fops,
+    .cbks = &cbks,
+    .options = options,
+    .identifier = "read-ahead",
+    .category = GF_MAINTAINED,
 };
diff --git a/xlators/performance/read-ahead/src/read-ahead.h b/xlators/performance/read-ahead/src/read-ahead.h
index d61b23c06d2..e9432fb47cc 100644
--- a/xlators/performance/read-ahead/src/read-ahead.h
+++ b/xlators/performance/read-ahead/src/read-ahead.h
@@ -1,36 +1,22 @@
 /*
-  Copyright (c) 2006-2009 Z RESEARCH, Inc. <http://www.zresearch.com>
+  Copyright (c) 2008-2012 Red Hat, Inc. <http://www.redhat.com>
   This file is part of GlusterFS.
 
-  GlusterFS is free software; you can redistribute it and/or modify
-  it under the terms of the GNU General Public License as published
-  by the Free Software Foundation; either version 3 of the License,
-  or (at your option) any later version.
-
-  GlusterFS is distributed in the hope that it will be useful, but
-  WITHOUT ANY WARRANTY; without even the implied warranty of
-  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
-  General Public License for more details.
-
-  You should have received a copy of the GNU General Public License
-  along with this program.  If not, see
-  <http://www.gnu.org/licenses/>.
+  This file is licensed to you under your choice of the GNU Lesser
+  General Public License, version 3 or any later version (LGPLv3 or
+  later), or the GNU General Public License, version 2 (GPLv2), in all
+  cases as published by the Free Software Foundation.
 */
 
 #ifndef __READ_AHEAD_H
 #define __READ_AHEAD_H
 
-#ifndef _CONFIG_H
-#define _CONFIG_H
-#include "config.h"
-#endif
-
-
-#include "glusterfs.h"
-#include "logging.h"
-#include "dict.h"
-#include "xlator.h"
-#include "common-utils.h"
+#include <glusterfs/glusterfs.h>
+#include <glusterfs/logging.h>
+#include <glusterfs/dict.h>
+#include <glusterfs/xlator.h>
+#include <glusterfs/common-utils.h>
+#include "read-ahead-mem-types.h"
 
 struct ra_conf;
 struct ra_local;
@@ -38,82 +24,77 @@ struct ra_page;
 struct ra_file;
 struct ra_waitq;
 
-
 struct ra_waitq {
-	struct ra_waitq *next;
-	void            *data;
+    struct ra_waitq *next;
+    void *data;
 };
 
-
 struct ra_fill {
-	struct ra_fill *next;
-	struct ra_fill *prev;
-	off_t           offset;
-	size_t          size;
-	struct iovec   *vector;
-	int32_t         count;
-	dict_t         *refs;
+    struct ra_fill *next;
+    struct ra_fill *prev;
+    off_t offset;
+    size_t size;
+    struct iovec *vector;
+    int32_t count;
+    struct iobref *iobref;
 };
 
-
 struct ra_local {
-	mode_t            mode;
-	struct ra_fill    fill;
-	off_t             offset;
-	size_t            size;
-	int32_t           op_ret;
-	int32_t           op_errno;
-	off_t             pending_offset;
-	size_t            pending_size;
-	fd_t             *fd;
-	int32_t           wait_count;
-	pthread_mutex_t   local_lock;
+    mode_t mode;
+    struct ra_fill fill;
+    off_t offset;
+    size_t size;
+    int32_t op_ret;
+    int32_t op_errno;
+    off_t pending_offset;
+    size_t pending_size;
+    fd_t *fd;
+    int32_t wait_count;
+    pthread_mutex_t local_lock;
 };
 
-
 struct ra_page {
-	struct ra_page   *next;
-	struct ra_page   *prev;
-	struct ra_file   *file;
-	char              dirty;
-	char              ready;
-	struct iovec     *vector;
-	int32_t           count;
-	off_t             offset;
-	size_t            size;
-	struct ra_waitq  *waitq;
-	dict_t           *ref;
+    struct ra_page *next;
+    struct ra_page *prev;
+    struct ra_file *file;
+    char dirty;    /* Internal request, not from user. */
+    char poisoned; /* Pending read invalidated by write. */
+    char ready;
+    struct iovec *vector;
+    int32_t count;
+    off_t offset;
+    size_t size;
+    struct ra_waitq *waitq;
+    struct iobref *iobref;
+    char stale;
 };
 
-
 struct ra_file {
-	struct ra_file    *next;
-	struct ra_file    *prev;
-	struct ra_conf    *conf;
-	fd_t              *fd;
-	int                disabled;
-	size_t             expected;
-	struct ra_page     pages;
-	off_t              offset;
-	size_t             size;
-	int32_t            refcount;
-	pthread_mutex_t    file_lock;
-	struct stat        stbuf;
-	uint64_t           page_size;
-	uint32_t           page_count;
+    struct ra_file *next;
+    struct ra_file *prev;
+    struct ra_conf *conf;
+    fd_t *fd;
+    int disabled;
+    size_t expected;
+    struct ra_page pages;
+    off_t offset;
+    size_t size;
+    int32_t refcount;
+    pthread_mutex_t file_lock;
+    struct iatt stbuf;
+    uint64_t page_size;
+    uint32_t page_count;
 };
 
-
 struct ra_conf {
-	uint64_t          page_size;
-	uint32_t          page_count;
-	void             *cache_block;
-	struct ra_file    files;
-	gf_boolean_t      force_atime_update;
-	pthread_mutex_t   conf_lock;
+    uint64_t page_size;
+    uint32_t page_count;
+    void *cache_block;
+    struct ra_file files;
+    gf_boolean_t force_atime_update;
+    pthread_mutex_t conf_lock;
 };
 
-
 typedef struct ra_conf ra_conf_t;
 typedef struct ra_local ra_local_t;
 typedef struct ra_page ra_page_t;
@@ -122,73 +103,69 @@ typedef struct ra_waitq ra_waitq_t;
 typedef struct ra_fill ra_fill_t;
 
 ra_page_t *
-ra_page_get (ra_file_t *file,
-	     off_t offset);
+ra_page_get(ra_file_t *file, off_t offset);
+
 ra_page_t *
-ra_page_create (ra_file_t *file,
-		off_t offset);
+ra_page_create(ra_file_t *file, off_t offset);
+
 void
-ra_page_fault (ra_file_t *file,
-	       call_frame_t *frame,
-	       off_t offset);
+ra_page_fault(ra_file_t *file, call_frame_t *frame, off_t offset);
 void
-ra_wait_on_page (ra_page_t *page,
-		 call_frame_t *frame);
+ra_wait_on_page(ra_page_t *page, call_frame_t *frame);
+
 ra_waitq_t *
-ra_page_wakeup (ra_page_t *page);
+ra_page_wakeup(ra_page_t *page);
 
 void
-ra_page_flush (ra_page_t *page);
+ra_page_flush(ra_page_t *page);
 
 ra_waitq_t *
-ra_page_error (ra_page_t *page,
-	       int32_t op_ret,
-	       int32_t op_errno);
+ra_page_error(ra_page_t *page, int32_t op_ret, int32_t op_errno);
 void
-ra_page_purge (ra_page_t *page);
+ra_page_purge(ra_page_t *page);
 
 void
-ra_frame_return (call_frame_t *frame);
+ra_frame_return(call_frame_t *frame);
+
 void
-ra_frame_fill (ra_page_t *page,
-	       call_frame_t *frame);
+ra_frame_fill(ra_page_t *page, call_frame_t *frame);
 
 void
-ra_file_destroy (ra_file_t *file);
+ra_file_destroy(ra_file_t *file);
 
 static inline void
-ra_file_lock (ra_file_t *file)
+ra_file_lock(ra_file_t *file)
 {
-	pthread_mutex_lock (&file->file_lock);
+    pthread_mutex_lock(&file->file_lock);
 }
 
 static inline void
-ra_file_unlock (ra_file_t *file)
+ra_file_unlock(ra_file_t *file)
 {
-	pthread_mutex_unlock (&file->file_lock);
+    pthread_mutex_unlock(&file->file_lock);
 }
 
 static inline void
-ra_conf_lock (ra_conf_t *conf)
+ra_conf_lock(ra_conf_t *conf)
 {
-	pthread_mutex_lock (&conf->conf_lock);
+    pthread_mutex_lock(&conf->conf_lock);
 }
 
 static inline void
-ra_conf_unlock (ra_conf_t *conf)
+ra_conf_unlock(ra_conf_t *conf)
 {
-	pthread_mutex_unlock (&conf->conf_lock);
+    pthread_mutex_unlock(&conf->conf_lock);
 }
 static inline void
-ra_local_lock (ra_local_t *local)
+ra_local_lock(ra_local_t *local)
 {
-	pthread_mutex_lock (&local->local_lock);
+    pthread_mutex_lock(&local->local_lock);
 }
 
 static inline void
-ra_local_unlock (ra_local_t *local)
+ra_local_unlock(ra_local_t *local)
 {
-	pthread_mutex_unlock (&local->local_lock);
+    pthread_mutex_unlock(&local->local_lock);
 }
 
 #endif /* __READ_AHEAD_H */
diff --git a/xlators/performance/readdir-ahead/Makefile.am b/xlators/performance/readdir-ahead/Makefile.am
new file mode 100644
index 00000000000..a985f42a877
--- /dev/null
+++ b/xlators/performance/readdir-ahead/Makefile.am
@@ -0,0 +1,3 @@
+SUBDIRS = src
+
+CLEANFILES =
diff --git a/xlators/performance/readdir-ahead/src/Makefile.am b/xlators/performance/readdir-ahead/src/Makefile.am
new file mode 100644
index 00000000000..3d6b6ae951f
--- /dev/null
+++ b/xlators/performance/readdir-ahead/src/Makefile.am
@@ -0,0 +1,18 @@
+xlator_LTLIBRARIES = readdir-ahead.la
+xlatordir = $(libdir)/glusterfs/$(PACKAGE_VERSION)/xlator/performance
+
+readdir_ahead_la_LDFLAGS = -module $(GF_XLATOR_DEFAULT_LDFLAGS)
+
+readdir_ahead_la_SOURCES = readdir-ahead.c
+readdir_ahead_la_LIBADD = $(top_builddir)/libglusterfs/src/libglusterfs.la
+
+noinst_HEADERS = readdir-ahead.h readdir-ahead-mem-types.h \
+	readdir-ahead-messages.h
+
+AM_CPPFLAGS = $(GF_CPPFLAGS) -I$(top_srcdir)/libglusterfs/src \
+	-I$(top_srcdir)/rpc/xdr/src -I$(top_builddir)/rpc/xdr/src
+
+
+AM_CFLAGS = -Wall $(GF_CFLAGS)
+
+CLEANFILES =
diff --git a/xlators/performance/readdir-ahead/src/readdir-ahead-mem-types.h b/xlators/performance/readdir-ahead/src/readdir-ahead-mem-types.h
new file mode 100644
index 00000000000..498ffae7f64
--- /dev/null
+++ b/xlators/performance/readdir-ahead/src/readdir-ahead-mem-types.h
@@ -0,0 +1,24 @@
+/*
+  Copyright (c) 2008-2013 Red Hat, Inc. <http://www.redhat.com>
+  This file is part of GlusterFS.
+
+  This file is licensed to you under your choice of the GNU Lesser
+  General Public License, version 3 or any later version (LGPLv3 or
+  later), or the GNU General Public License, version 2 (GPLv2), in all
+  cases as published by the Free Software Foundation.
+*/
+
+#ifndef __RDA_MEM_TYPES_H__
+#define __RDA_MEM_TYPES_H__
+
+#include <glusterfs/mem-types.h>
+
+enum gf_rda_mem_types_ {
+    gf_rda_mt_rda_local = gf_common_mt_end + 1,
+    gf_rda_mt_rda_fd_ctx,
+    gf_rda_mt_rda_priv,
+    gf_rda_mt_inode_ctx_t,
+    gf_rda_mt_end
+};
+
+#endif
diff --git a/xlators/performance/readdir-ahead/src/readdir-ahead-messages.h b/xlators/performance/readdir-ahead/src/readdir-ahead-messages.h
new file mode 100644
index 00000000000..28ec14dd845
--- /dev/null
+++ b/xlators/performance/readdir-ahead/src/readdir-ahead-messages.h
@@ -0,0 +1,30 @@
+/*Copyright (c) 2015 Red Hat, Inc. <http://www.redhat.com>
+  This file is part of GlusterFS.
+
+  This file is licensed to you under your choice of the GNU Lesser
+  General Public License, version 3 or any later version (LGPLv3 or
+  later), or the GNU General Public License, version 2 (GPLv2), in all
+  cases as published by the Free Software Foundation.
+*/
+
+#ifndef _READDIR_AHEAD_MESSAGES_H_
+#define _READDIR_AHEAD_MESSAGES_H_
+
+#include <glusterfs/glfs-message-id.h>
+
+/* To add new message IDs, append new identifiers at the end of the list.
+ *
+ * Never remove a message ID. If it's not used anymore, you can rename it or
+ * leave it as it is, but not delete it. This is to prevent reutilization of
+ * IDs by other messages.
+ *
+ * The component name must match one of the entries defined in
+ * glfs-message-id.h.
+ */
+
+GLFS_MSGID(READDIR_AHEAD, READDIR_AHEAD_MSG_XLATOR_CHILD_MISCONFIGURED,
+           READDIR_AHEAD_MSG_VOL_MISCONFIGURED, READDIR_AHEAD_MSG_NO_MEMORY,
+           READDIR_AHEAD_MSG_DIR_RELEASE_PENDING_STUB,
+           READDIR_AHEAD_MSG_OUT_OF_SEQUENCE, READDIR_AHEAD_MSG_DICT_OP_FAILED);
+
+#endif /* _READDIR_AHEAD_MESSAGES_H_ */
diff --git a/xlators/performance/readdir-ahead/src/readdir-ahead.c b/xlators/performance/readdir-ahead/src/readdir-ahead.c
new file mode 100644
index 00000000000..4ba7ee7077a
--- /dev/null
+++ b/xlators/performance/readdir-ahead/src/readdir-ahead.c
@@ -0,0 +1,1382 @@
+/*
+  Copyright (c) 2008-2013 Red Hat, Inc. <http://www.redhat.com>
+  This file is part of GlusterFS.
+
+  This file is licensed to you under your choice of the GNU Lesser
+  General Public License, version 3 or any later version (LGPLv3 or
+  later), or the GNU General Public License, version 2 (GPLv2), in all
+  cases as published by the Free Software Foundation.
+*/
+
+/*
+ * performance/readdir-ahead preloads a local buffer with directory entries
+ * on opendir. The optimization involves using maximum sized gluster rpc
+ * requests (128k) to minimize overhead of smaller client requests.
+ *
+ * For example, fuse currently supports a maximum readdir buffer of 4k
+ * (regardless of the filesystem client's buffer size). readdir-ahead should
+ * effectively convert these smaller requests into fewer, larger sized requests
+ * for simple, sequential workloads (i.e., ls).
+ *
+ * The translator is currently designed to handle the simple, sequential case
+ * only. If a non-sequential directory read occurs, readdir-ahead disables
+ * preloads on the directory.
+ */
+
+#include <math.h>
+#include <glusterfs/glusterfs.h>
+#include <glusterfs/xlator.h>
+#include <glusterfs/call-stub.h>
+#include "readdir-ahead.h"
+#include "readdir-ahead-mem-types.h"
+#include <glusterfs/defaults.h>
+#include "readdir-ahead-messages.h"
+static int
+rda_fill_fd(call_frame_t *, xlator_t *, fd_t *); /* defined after its users */
+
+static void /* drop the fd, xattrs and inode references held by @local */
+rda_local_wipe(struct rda_local *local)
+{
+    if (local->fd) /* note: @local itself is not NULL-checked */
+        fd_unref(local->fd);
+    if (local->xattrs)
+        dict_unref(local->xattrs);
+    if (local->inode)
+        inode_unref(local->inode);
+} /* the members are not cleared and @local is not freed here */
+
+/*
+ * Look up the readdir-ahead context attached to @fd, creating and attaching
+ * a new one in state RDA_FD_NEW when the fd has none yet. Lookup and attach
+ * both happen under fd->lock. Returns NULL if allocating or attaching fails.
+ */
+static struct rda_fd_ctx *
+get_rda_fd_ctx(fd_t *fd, xlator_t *this)
+{
+    uint64_t raw = 0;
+    struct rda_fd_ctx *fdctx = NULL;
+
+    LOCK(&fd->lock);
+    if (__fd_ctx_get(fd, this, &raw) >= 0) {
+        fdctx = (struct rda_fd_ctx *)(uintptr_t)raw;
+        goto unlock;
+    }
+
+    fdctx = GF_CALLOC(1, sizeof(struct rda_fd_ctx), gf_rda_mt_rda_fd_ctx);
+    if (!fdctx)
+        goto unlock;
+
+    /* the offset fields rely on the zero-filled allocation */
+    fdctx->xattrs = NULL;
+    fdctx->state = RDA_FD_NEW;
+    INIT_LIST_HEAD(&fdctx->entries.list);
+    LOCK_INIT(&fdctx->lock);
+
+    if (__fd_ctx_set(fd, this, (uint64_t)(uintptr_t)fdctx) < 0) {
+        GF_FREE(fdctx);
+        fdctx = NULL;
+    }
+unlock:
+    UNLOCK(&fd->lock);
+    return fdctx;
+}
+
+/*
+ * Fetch the per-inode context, allocating and attaching a zeroed one on
+ * first use. Works through the __inode_ctx_*1() accessors; the callers in
+ * this file hold inode->lock. Returns NULL if allocation or attach fails.
+ */
+static rda_inode_ctx_t *
+__rda_inode_ctx_get(inode_t *inode, xlator_t *this)
+{
+    uint64_t slot = 0;
+    rda_inode_ctx_t *fresh = NULL;
+
+    if (__inode_ctx_get1(inode, this, &slot) == 0)
+        return (rda_inode_ctx_t *)(uintptr_t)slot;
+
+    fresh = GF_CALLOC(1, sizeof(*fresh), gf_rda_mt_inode_ctx_t);
+    if (fresh) {
+        GF_ATOMIC_INIT(fresh->generation, 0);
+        slot = (uint64_t)(uintptr_t)fresh;
+        if (__inode_ctx_set1(inode, this, &slot) < 0) {
+            GF_FREE(fresh);
+            fresh = NULL;
+        }
+    }
+
+    return fresh;
+}
+
+/*
+ * Merge a fop's resulting iatt into the stat cached for @inode.
+ *
+ *  - @stbuf_in NULL or without a ctime: the cached stat is invalidated
+ *    (zeroed except gfid and type) and the inode generation is bumped.
+ *  - cached stat valid: @stbuf_in replaces it unless its ctime (seconds,
+ *    then nanoseconds) is older than the cached one.
+ *  - cached stat invalidated: @stbuf_in is accepted only if @generation is
+ *    (uint64_t)-1 or equals the current inode generation.
+ *
+ * The resulting cached stat is copied to @stbuf_out when that is non-NULL.
+ * Returns 0, or -1 if the inode context could not be obtained. The callers
+ * in this file hold inode->lock.
+ */
+static int
+__rda_inode_ctx_update_iatts(inode_t *inode, xlator_t *this,
+                             struct iatt *stbuf_in, struct iatt *stbuf_out,
+                             uint64_t generation)
+{
+    rda_inode_ctx_t *ictx = __rda_inode_ctx_get(inode, this);
+    struct iatt keep = {0};
+    gf_boolean_t accept = _gf_true;
+
+    if (!ictx)
+        return -1;
+
+    if (!stbuf_in || !stbuf_in->ia_ctime) {
+        /*
+         * The fop changed the file but gave no usable stat (for example a
+         * write answered from write-behind's cache): keep only identity.
+         */
+        keep = stbuf_in ? *stbuf_in : ictx->statbuf;
+        memset(&ictx->statbuf, 0, sizeof(ictx->statbuf));
+        gf_uuid_copy(ictx->statbuf.ia_gfid, keep.ia_gfid);
+        ictx->statbuf.ia_type = keep.ia_type;
+        GF_ATOMIC_INC(ictx->generation);
+    } else {
+        if (ictx->statbuf.ia_ctime) {
+            if (stbuf_in->ia_ctime < ictx->statbuf.ia_ctime)
+                accept = _gf_false;
+            else if ((stbuf_in->ia_ctime == ictx->statbuf.ia_ctime) &&
+                     (stbuf_in->ia_ctime_nsec < ictx->statbuf.ia_ctime_nsec))
+                accept = _gf_false;
+        } else if ((generation != -1) &&
+                   (generation != GF_ATOMIC_GET(ictx->generation))) {
+            accept = _gf_false;
+        }
+        if (accept)
+            ictx->statbuf = *stbuf_in;
+    }
+
+    if (stbuf_out)
+        *stbuf_out = ictx->statbuf;
+    return 0;
+}
+
+/* Locked wrapper: run __rda_inode_ctx_update_iatts() under inode->lock. */
+static int
+rda_inode_ctx_update_iatts(inode_t *inode, xlator_t *this,
+                           struct iatt *stbuf_in, struct iatt *stbuf_out,
+                           uint64_t generation)
+{
+    int status;
+
+    LOCK(&inode->lock);
+    status = __rda_inode_ctx_update_iatts(inode, this, stbuf_in, stbuf_out,
+                                          generation);
+    UNLOCK(&inode->lock);
+
+    /* 0 on success, -1 if the inode context could not be obtained */
+    return status;
+}
+
+/*
+ * Return @ctx to the RDA_FD_NEW state: free the cached dirents, subtract
+ * their size from the xlator-wide cache counter, zero the offsets and the
+ * pending errno, and drop the saved xattr request dict.
+ */
+static void
+rda_reset_ctx(xlator_t *this, struct rda_fd_ctx *ctx)
+{
+    struct rda_priv *conf = this->private;
+
+    gf_dirent_free(&ctx->entries);
+    GF_ATOMIC_SUB(conf->rda_cache_size, ctx->cur_size);
+    ctx->cur_size = 0;
+
+    if (ctx->xattrs) {
+        dict_unref(ctx->xattrs);
+        ctx->xattrs = NULL;
+    }
+
+    ctx->op_errno = 0;
+    ctx->cur_offset = 0;
+    ctx->next_offset = 0;
+    ctx->state = RDA_FD_NEW;
+}
+
+/*
+ * Note that @inode was modified while directory preloads may be in flight:
+ * for each fd open on the inode's parent whose context has a prefetch
+ * outstanding, add the gfid to ctx->writes_during_prefetch, which
+ * rda_fill_fd_cbk() consults before caching the stats of that reply.
+ */
+static void
+rda_mark_inode_dirty(xlator_t *this, inode_t *inode)
+{
+    inode_t *parent = NULL;
+    fd_t *fd = NULL;
+    uint64_t val = 0;
+    int32_t ret = 0;
+    struct rda_fd_ctx *fd_ctx = NULL;
+    char gfid[GF_UUID_BUF_SIZE] = {0};
+
+    parent = inode_parent(inode, NULL, NULL);
+    if (!parent)
+        return;
+    /* the dict key is the same for every fd: format it once, up front */
+    uuid_utoa_r(inode->gfid, gfid);
+
+    LOCK(&parent->lock);
+    {
+        list_for_each_entry(fd, &parent->fd_list, inode_list)
+        {
+            val = 0;
+            fd_ctx_get(fd, this, &val);
+            if (val == 0)
+                continue;
+            fd_ctx = (void *)(uintptr_t)val;
+            /* unlocked peek; re-checked under the ctx lock below */
+            if (!GF_ATOMIC_GET(fd_ctx->prefetching))
+                continue;
+
+            LOCK(&fd_ctx->lock);
+            if (GF_ATOMIC_GET(fd_ctx->prefetching)) {
+                if (fd_ctx->writes_during_prefetch == NULL)
+                    fd_ctx->writes_during_prefetch = dict_new();
+                ret = dict_set_int8(fd_ctx->writes_during_prefetch, gfid, 1);
+                if (ret < 0) {
+                    gf_log(this->name, GF_LOG_WARNING,
+                           "marking to invalidate stats of %s from an in "
+                           "progress prefetching has failed, might result in "
+                           "stale stat to application",
+                           gfid);
+                }
+            }
+            UNLOCK(&fd_ctx->lock);
+        }
+    }
+    UNLOCK(&parent->lock);
+    inode_unref(parent);
+}
+
+/*
+ * Can a request of @request_size bytes be answered from the preload buffer
+ * right now? Offsets are validated by the caller; this only looks at the
+ * buffer: a finished preload (EOD or error) and buffered data both qualify.
+ */
+static gf_boolean_t
+rda_can_serve_readdirp(struct rda_fd_ctx *ctx, size_t request_size)
+{
+    if (ctx->state & (RDA_FD_EOD | RDA_FD_ERROR))
+        return _gf_true; /* preload finished, with or without an error */
+    if (request_size && ctx->cur_size >= request_size)
+        return _gf_true; /* enough buffered to fill the whole request */
+    /* otherwise partial data is served only while the fd is unplugged */
+    return (ctx->cur_size > 0) && !(ctx->state & RDA_FD_PLUGGED);
+}
+
+/*
+ * Copy the stat cached in @inode's readdir-ahead context into @attr, under
+ * inode->lock. The context is created on demand by __rda_inode_ctx_get().
+ * @attr is left untouched when any argument is NULL or no context could be
+ * obtained.
+ */
+void
+rda_inode_ctx_get_iatt(inode_t *inode, xlator_t *this, struct iatt *attr)
+{
+    rda_inode_ctx_t *ictx = NULL;
+
+    if (!(inode && this && attr))
+        return;
+
+    LOCK(&inode->lock);
+    ictx = __rda_inode_ctx_get(inode, this);
+    if (ictx)
+        *attr = ictx->statbuf;
+    UNLOCK(&inode->lock);
+}
+
+/*
+ * Move entries from the front of ctx->entries onto @entries until the next
+ * one would push the total past @request_size bytes. An entry that has an
+ * inode and is not "." or ".." first gets its d_stat replaced by the stat
+ * cached in the inode context. Plugs the fd once the buffer is at or below
+ * the low watermark. Returns the entry count; ctx must be locked.
+ */
+static int32_t
+__rda_fill_readdirp(xlator_t *this, gf_dirent_t *entries, size_t request_size,
+                    struct rda_fd_ctx *ctx)
+{
+    struct rda_priv *conf = this->private;
+    gf_dirent_t *cur = NULL;
+    gf_dirent_t *next = NULL;
+    size_t entry_len = 0;
+    size_t served = 0;
+    int32_t moved = 0;
+    struct iatt cached = {0};
+
+    list_for_each_entry_safe(cur, next, &ctx->entries.list, list)
+    {
+        entry_len = gf_dirent_size(cur->d_name);
+        if (served + entry_len > request_size)
+            break;
+
+        if (cur->inode && strcmp(cur->d_name, ".") != 0 &&
+            strcmp(cur->d_name, "..") != 0) {
+            memset(&cached, 0, sizeof(cached));
+            rda_inode_ctx_get_iatt(cur->inode, this, &cached);
+            cur->d_stat = cached;
+        }
+
+        /* hand the entry over and fix up both size counters */
+        list_del_init(&cur->list);
+        list_add_tail(&cur->list, &entries->list);
+        ctx->cur_size -= entry_len;
+        GF_ATOMIC_SUB(conf->rda_cache_size, entry_len);
+
+        served += entry_len;
+        ctx->cur_offset = cur->d_off;
+        moved++;
+    }
+
+    if (ctx->cur_size <= conf->rda_low_wmark)
+        ctx->state |= RDA_FD_PLUGGED;
+
+    return moved;
+}
+
+/*
+ * Serve up to @size bytes of buffered entries into @entries. If nothing could
+ * be served and the preload has failed, report the failure (-1) once and
+ * switch the fd to bypass mode. @op_errno always receives ctx->op_errno.
+ * Returns the number of entries served, or -1. ctx must be locked.
+ */
+static int32_t
+__rda_serve_readdirp(xlator_t *this, struct rda_fd_ctx *ctx, size_t size,
+                     gf_dirent_t *entries, int *op_errno)
+{
+    int32_t served = __rda_fill_readdirp(this, entries, size, ctx);
+
+    if ((served == 0) && (ctx->state & RDA_FD_ERROR)) {
+        /* the preload stopped on the error: pass later requests along */
+        ctx->state &= ~RDA_FD_ERROR;
+        ctx->state |= RDA_FD_BYPASS;
+        served = -1;
+    }
+
+    /*
+     * Xlators above look at op_errno to tell whether the readdir has
+     * completed, so forward the value recorded from the lower layers.
+     */
+    *op_errno = ctx->op_errno;
+    return served;
+}
+
+/*
+ * readdirp fop: answer sequential requests from the preload buffer, or park
+ * them as a stub until the filler has buffered enough. An unexpected offset
+ * or a second pending request puts the fd in bypass mode (wind to child).
+ */
+static int32_t
+rda_readdirp(call_frame_t *frame, xlator_t *this, fd_t *fd, size_t size,
+             off_t off, dict_t *xdata)
+{
+    struct rda_fd_ctx *ctx = NULL;
+    int fill = 0;
+    gf_dirent_t entries;
+    int ret = 0;
+    int op_errno = 0;
+    gf_boolean_t serve = _gf_false;
+
+    ctx = get_rda_fd_ctx(fd, this);
+    if (!ctx)
+        goto err;
+
+    if (ctx->state & RDA_FD_BYPASS)
+        goto bypass;
+
+    INIT_LIST_HEAD(&entries.list);
+    LOCK(&ctx->lock);
+
+    /* recheck now that we have the lock */
+    if (ctx->state & RDA_FD_BYPASS) {
+        UNLOCK(&ctx->lock);
+        goto bypass;
+    }
+
+    /*
+     * If a new read comes in at offset 0 and the buffer has been
+     * completed, reset the context and kickstart the filler again.
+     */
+    if (!off && (ctx->state & RDA_FD_EOD) && (ctx->cur_size == 0)) {
+        rda_reset_ctx(this, ctx);
+        /*
+         * rda_reset_ctx() dropped the xattr list saved at opendir time;
+         * use this call's xdata for the preload requests issued from now on.
+         */
+        ctx->xattrs = dict_ref(xdata);
+        fill = 1;
+    }
+
+    /*
+     * If a readdir occurs at an unexpected offset or we already have a
+     * request pending, admit defeat and just get out of the way.
+     */
+    if (off != ctx->cur_offset || ctx->stub) {
+        ctx->state |= RDA_FD_BYPASS;
+        UNLOCK(&ctx->lock);
+        goto bypass;
+    }
+
+    /*
+     * Not bypassed: either serve from the buffer now, or park the request
+     * as a stub until the fill that makes it servable completes.
+     */
+    if (rda_can_serve_readdirp(ctx, size)) {
+        ret = __rda_serve_readdirp(this, ctx, size, &entries, &op_errno);
+        serve = _gf_true;
+        /* report ENOENT only once the buffer is fully drained at EOD */
+        if (op_errno == ENOENT &&
+            !((ctx->state & RDA_FD_EOD) && (ctx->cur_size == 0)))
+            op_errno = 0;
+    } else {
+        ctx->stub = fop_readdirp_stub(frame, NULL, fd, size, off, xdata);
+        if (!ctx->stub) {
+            UNLOCK(&ctx->lock);
+            goto err;
+        }
+
+        if (!(ctx->state & RDA_FD_RUNNING)) {
+            fill = 1;
+            if (!ctx->xattrs)
+                ctx->xattrs = dict_ref(xdata);
+            ctx->state |= RDA_FD_RUNNING;
+        }
+    }
+
+    UNLOCK(&ctx->lock);
+
+    if (serve) {
+        STACK_UNWIND_STRICT(readdirp, frame, ret, op_errno, &entries, xdata);
+        gf_dirent_free(&entries);
+    }
+
+    if (fill)
+        rda_fill_fd(frame, this, fd); /* NOTE(review): failure is ignored */
+
+    return 0;
+
+bypass:
+    STACK_WIND(frame, default_readdirp_cbk, FIRST_CHILD(this),
+               FIRST_CHILD(this)->fops->readdirp, fd, size, off, xdata);
+    return 0;
+
+err: /* the fd context or the stub could not be allocated */
+    STACK_UNWIND_STRICT(readdirp, frame, -1, ENOMEM, NULL, NULL);
+    return 0;
+}
+
+/*
+ * Callback of the preload readdirp wound by rda_fill_fd(): appends the
+ * returned entries to the fd's buffer, updates the EOD/error/plug state,
+ * answers a parked request if it became servable, then either issues the
+ * next fill or tears the fill frame down.
+ */
+static int32_t
+rda_fill_fd_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+                int32_t op_ret, int32_t op_errno, gf_dirent_t *entries,
+                dict_t *xdata)
+{
+    gf_dirent_t *dirent = NULL;
+    gf_dirent_t *tmp = NULL;
+    gf_dirent_t serve_entries;
+    struct rda_local *local = frame->local;
+    struct rda_fd_ctx *ctx = local->ctx;
+    struct rda_priv *priv = this->private;
+    int fill = 1;
+    size_t dirent_size = 0;
+    int ret = 0;
+    gf_boolean_t serve = _gf_false;
+    call_stub_t *stub = NULL;
+    char gfid[GF_UUID_BUF_SIZE] = {
+        0,
+    };
+    uint64_t generation = 0;
+    call_frame_t *fill_frame = NULL;
+
+    INIT_LIST_HEAD(&serve_entries.list);
+    LOCK(&ctx->lock);
+
+    /* Verify that the preload buffer is still pending on this data. */
+    if (ctx->next_offset != local->offset) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, READDIR_AHEAD_MSG_OUT_OF_SEQUENCE,
+               "Out of sequence directory preload.");
+        ctx->state |= (RDA_FD_BYPASS | RDA_FD_ERROR);
+        ctx->op_errno = EUCLEAN;
+        /* NOTE(review): skips the prefetching decrement below - confirm */
+        goto out;
+    }
+
+    if (entries) {
+        list_for_each_entry_safe(dirent, tmp, &entries->list, list)
+        {
+            list_del_init(&dirent->list);
+            /* must preserve entry order */
+            list_add_tail(&dirent->list, &ctx->entries.list);
+            if (dirent->inode) {
+                /* generation (uint64_t)-1 lets the update through even
+                 * when the cached stat was invalidated. If a write to
+                 * this inode was recorded while the prefetch was in
+                 * flight, 0 is passed instead: an invalidated cached stat
+                 * is then refreshed only while the generation is still 0.
+                 */
+                generation = -1;
+                if (ctx->writes_during_prefetch) {
+                    memset(gfid, 0, sizeof(gfid));
+                    uuid_utoa_r(dirent->inode->gfid, gfid);
+                    if (dict_get(ctx->writes_during_prefetch, gfid))
+                        generation = 0;
+                }
+
+                if (!((strcmp(dirent->d_name, ".") == 0) ||
+                      (strcmp(dirent->d_name, "..") == 0))) {
+                    rda_inode_ctx_update_iatts(dirent->inode, this,
+                                               &dirent->d_stat, &dirent->d_stat,
+                                               generation);
+                }
+            }
+
+            dirent_size = gf_dirent_size(dirent->d_name);
+            ctx->cur_size += dirent_size;
+            GF_ATOMIC_ADD(priv->rda_cache_size, dirent_size);
+            ctx->next_offset = dirent->d_off;
+        }
+    }
+
+    if (ctx->writes_during_prefetch) {
+        dict_unref(ctx->writes_during_prefetch);
+        ctx->writes_during_prefetch = NULL;
+    }
+    GF_ATOMIC_DEC(ctx->prefetching);
+
+    if (ctx->cur_size >= priv->rda_high_wmark)
+        ctx->state &= ~RDA_FD_PLUGGED;
+
+    if (!op_ret || op_errno == ENOENT) {
+        /* we've hit eod */
+        ctx->state &= ~RDA_FD_RUNNING;
+        ctx->state |= RDA_FD_EOD;
+        ctx->op_errno = op_errno;
+    } else if (op_ret == -1) {
+        /* kill the preload and pend the error */
+        ctx->state &= ~RDA_FD_RUNNING;
+        ctx->state |= RDA_FD_ERROR;
+        ctx->op_errno = op_errno;
+    }
+
+    /*
+     * NOTE: The strict bypass logic in readdirp() means a pending request
+     * is always based on ctx->cur_offset.
+     */
+    if (ctx->stub && rda_can_serve_readdirp(ctx, ctx->stub->args.size)) {
+        ret = __rda_serve_readdirp(this, ctx, ctx->stub->args.size,
+                                   &serve_entries, &op_errno);
+        serve = _gf_true;
+        stub = ctx->stub;
+        ctx->stub = NULL;
+    }
+
+out:
+    /*
+     * If we have been marked for bypass and have no pending stub, clear the
+     * run state so we stop preloading the context with entries.
+     */
+    if (!ctx->stub &&
+        ((ctx->state & RDA_FD_BYPASS) ||
+         GF_ATOMIC_GET(priv->rda_cache_size) > priv->rda_cache_limit))
+        ctx->state &= ~RDA_FD_RUNNING;
+
+    if (!(ctx->state & RDA_FD_RUNNING)) {
+        fill = 0;
+        if (ctx->xattrs) {
+            /*
+             * fill = 0 and hence rda_fill_fd() won't be invoked.
+             * unref for ref taken in rda_fill_fd()
+             */
+            dict_unref(ctx->xattrs);
+            ctx->xattrs = NULL;
+        }
+
+        fill_frame = ctx->fill_frame;
+        ctx->fill_frame = NULL;
+    }
+
+    if (op_errno == ENOENT &&
+        !((ctx->state & RDA_FD_EOD) && (ctx->cur_size == 0)))
+        op_errno = 0;
+
+    UNLOCK(&ctx->lock);
+    if (fill_frame) {
+        rda_local_wipe(fill_frame->local);
+        STACK_DESTROY(fill_frame->root);
+    }
+
+    if (serve) {
+        STACK_UNWIND_STRICT(readdirp, stub->frame, ret, op_errno,
+                            &serve_entries, xdata);
+        gf_dirent_free(&serve_entries);
+        call_stub_destroy(stub);
+    }
+
+    if (fill)
+        rda_fill_fd(frame, this, local->fd);
+
+    return 0;
+}
+
+/*
+ * Issue one preload readdirp for @fd at ctx->next_offset; the reply goes to
+ * rda_fill_fd_cbk(). The first call copies @frame into a fill frame (holding
+ * an fd ref) kept in ctx->fill_frame; later calls reuse it. Returns 0 once
+ * wound, -1 if the context, the frame copy or its local cannot be allocated.
+ */
+static int
+rda_fill_fd(call_frame_t *frame, xlator_t *this, fd_t *fd)
+{
+    call_frame_t *nframe = NULL;
+    struct rda_local *local = NULL;
+    struct rda_local *orig_local = frame->local;
+    struct rda_fd_ctx *ctx;
+    off_t offset;
+    struct rda_priv *priv = this->private;
+
+    ctx = get_rda_fd_ctx(fd, this);
+    if (!ctx)
+        goto err;
+
+    LOCK(&ctx->lock);
+    if (ctx->state & RDA_FD_NEW) {
+        ctx->state &= ~RDA_FD_NEW;
+        ctx->state |= RDA_FD_RUNNING;
+        if (priv->rda_low_wmark)
+            ctx->state |= RDA_FD_PLUGGED;
+    }
+
+    offset = ctx->next_offset;
+
+    if (!ctx->fill_frame) {
+        nframe = copy_frame(frame);
+        if (!nframe) {
+            UNLOCK(&ctx->lock);
+            goto err;
+        }
+
+        local = mem_get0(this->local_pool);
+        if (!local) {
+            UNLOCK(&ctx->lock);
+            goto err;
+        }
+
+        local->ctx = ctx;
+        local->fd = fd_ref(fd);
+        nframe->local = local;
+        ctx->fill_frame = nframe;
+
+        if (!ctx->xattrs && orig_local && orig_local->xattrs) {
+            /* when this function is invoked by rda_opendir_cbk */
+            ctx->xattrs = dict_ref(orig_local->xattrs);
+        }
+    } else {
+        nframe = ctx->fill_frame;
+        local = nframe->local;
+    }
+
+    local->offset = offset;
+    GF_ATOMIC_INC(ctx->prefetching);
+    UNLOCK(&ctx->lock);
+
+    STACK_WIND(nframe, rda_fill_fd_cbk, FIRST_CHILD(this),
+               FIRST_CHILD(this)->fops->readdirp, fd, priv->rda_req_size,
+               offset, ctx->xattrs);
+    return 0;
+
+err:
+    /*
+     * nframe is set here only when allocating its local failed: nothing to
+     * wipe. Destroy it via its stack root like the other teardown paths do.
+     */
+    if (nframe)
+        STACK_DESTROY(nframe->root);
+    return -1;
+}
+
+/* opendir callback: start preloading on success, then unwind unchanged. */
+static int32_t
+rda_opendir_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+                int32_t op_ret, int32_t op_errno, fd_t *fd, dict_t *xdata)
+{
+    if (op_ret == 0)
+        rda_fill_fd(frame, this, fd);
+    RDA_STACK_UNWIND(opendir, frame, op_ret, op_errno, fd, xdata);
+    return 0;
+}
+
+/*
+ * opendir fop. A copy of the caller's xdata (if any) is parked in
+ * frame->local for the preload that rda_opendir_cbk() starts. Unwinds with
+ * ENOMEM if that local cannot be allocated.
+ */
+static int32_t
+rda_opendir(call_frame_t *frame, xlator_t *this, loc_t *loc, fd_t *fd,
+            dict_t *xdata)
+{
+    struct rda_local *stash = NULL;
+
+    if (xdata) {
+        stash = mem_get0(this->local_pool);
+        if (!stash) {
+            STACK_UNWIND_STRICT(opendir, frame, -1, ENOMEM, fd, xdata);
+            return 0;
+        }
+
+        /*
+         * List of keys set by the md-cache xlator, picked up later by
+         * rda_fill_fd() when it runs from rda_opendir_cbk().
+         */
+        stash->xattrs = dict_copy_with_ref(xdata, NULL);
+        frame->local = stash;
+    }
+
+    STACK_WIND(frame, rda_opendir_cbk, FIRST_CHILD(this),
+               FIRST_CHILD(this)->fops->opendir, loc, fd, xdata);
+    return 0;
+}
+
+/*
+ * writev callback. On success the inode is flagged for any in-flight
+ * directory preload and @postbuf is merged into its cached stat. A @postbuf
+ * without ctime (for instance a write acknowledged from write-behind's
+ * cache) invalidates the cached stat instead. The resulting stat, or a
+ * zeroed iatt when nothing was merged, is unwound as the post-op iatt.
+ */
+static int32_t
+rda_writev_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+               int32_t op_ret, int32_t op_errno, struct iatt *prebuf,
+               struct iatt *postbuf, dict_t *xdata)
+{
+    struct iatt merged = {0};
+    struct rda_local *saved = frame->local;
+
+    if (op_ret >= 0) {
+        rda_mark_inode_dirty(this, saved->inode);
+        rda_inode_ctx_update_iatts(saved->inode, this, postbuf, &merged,
+                                   saved->generation);
+    }
+
+    RDA_STACK_UNWIND(writev, frame, op_ret, op_errno, prebuf, &merged,
+                     xdata);
+    return 0;
+}
+
+static int32_t
+rda_writev(call_frame_t *frame, xlator_t *this, fd_t *fd, struct iovec *vector,
+           int32_t count, off_t off, uint32_t flags, struct iobref *iobref,
+           dict_t *xdata)
+{
+    RDA_COMMON_MODIFICATION_FOP(writev, frame, this, fd->inode, xdata, fd,
+                                vector, count, off, flags, iobref);
+    return 0; /* result is presumably delivered via rda_writev_cbk */
+}
+
+/*
+ * fallocate callback. On success the inode is flagged for any in-flight
+ * directory preload and @postbuf is merged into its cached stat; that stat,
+ * or a zeroed iatt when nothing was merged, is unwound as the post-op iatt.
+ */
+static int32_t
+rda_fallocate_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+                  int32_t op_ret, int32_t op_errno, struct iatt *prebuf,
+                  struct iatt *postbuf, dict_t *xdata)
+{
+    struct iatt merged = {0};
+    struct rda_local *saved = frame->local;
+
+    if (op_ret >= 0) {
+        rda_mark_inode_dirty(this, saved->inode);
+        rda_inode_ctx_update_iatts(saved->inode, this, postbuf, &merged,
+                                   saved->generation);
+    }
+
+    RDA_STACK_UNWIND(fallocate, frame, op_ret, op_errno, prebuf, &merged,
+                     xdata);
+    return 0;
+}
+
+static int32_t
+rda_fallocate(call_frame_t *frame, xlator_t *this, fd_t *fd, int32_t keep_size,
+              off_t offset, size_t len, dict_t *xdata)
+{
+    RDA_COMMON_MODIFICATION_FOP(fallocate, frame, this, fd->inode, xdata, fd,
+                                keep_size, offset, len);
+    return 0; /* result is presumably delivered via rda_fallocate_cbk */
+}
+
+/*
+ * zerofill callback. On success the inode is flagged for any in-flight
+ * directory preload and @postbuf is merged into its cached stat; that stat,
+ * or a zeroed iatt when nothing was merged, is unwound as the post-op iatt.
+ */
+static int32_t
+rda_zerofill_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+                 int32_t op_ret, int32_t op_errno, struct iatt *prebuf,
+                 struct iatt *postbuf, dict_t *xdata)
+{
+    struct iatt merged = {0};
+    struct rda_local *saved = frame->local;
+
+    if (op_ret >= 0) {
+        rda_mark_inode_dirty(this, saved->inode);
+        rda_inode_ctx_update_iatts(saved->inode, this, postbuf, &merged,
+                                   saved->generation);
+    }
+
+    RDA_STACK_UNWIND(zerofill, frame, op_ret, op_errno, prebuf, &merged,
+                     xdata);
+    return 0;
+}
+
+static int32_t
+rda_zerofill(call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset,
+             off_t len, dict_t *xdata)
+{
+    RDA_COMMON_MODIFICATION_FOP(zerofill, frame, this, fd->inode, xdata, fd,
+                                offset, len);
+    return 0; /* result is presumably delivered via rda_zerofill_cbk */
+}
+
+/*
+ * discard callback. On success the inode is flagged for any in-flight
+ * directory preload and @postbuf is merged into its cached stat; that stat,
+ * or a zeroed iatt when nothing was merged, is unwound as the post-op iatt.
+ */
+static int32_t
+rda_discard_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+                int32_t op_ret, int32_t op_errno, struct iatt *prebuf,
+                struct iatt *postbuf, dict_t *xdata)
+{
+    struct iatt merged = {0};
+    struct rda_local *saved = frame->local;
+
+    if (op_ret >= 0) {
+        rda_mark_inode_dirty(this, saved->inode);
+        rda_inode_ctx_update_iatts(saved->inode, this, postbuf, &merged,
+                                   saved->generation);
+    }
+
+    RDA_STACK_UNWIND(discard, frame, op_ret, op_errno, prebuf, &merged,
+                     xdata);
+    return 0;
+}
+
+static int32_t
+rda_discard(call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset,
+            size_t len, dict_t *xdata)
+{
+    RDA_COMMON_MODIFICATION_FOP(discard, frame, this, fd->inode, xdata, fd,
+                                offset, len);
+    return 0; /* result is presumably delivered via rda_discard_cbk */
+}
+
+/*
+ * ftruncate callback. On success the inode is flagged for any in-flight
+ * directory preload and @postbuf is merged into its cached stat; that stat,
+ * or a zeroed iatt when nothing was merged, is unwound as the post-op iatt.
+ */
+static int32_t
+rda_ftruncate_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+                  int32_t op_ret, int32_t op_errno, struct iatt *prebuf,
+                  struct iatt *postbuf, dict_t *xdata)
+{
+    struct iatt merged = {0};
+    struct rda_local *saved = frame->local;
+
+    if (op_ret >= 0) {
+        rda_mark_inode_dirty(this, saved->inode);
+        rda_inode_ctx_update_iatts(saved->inode, this, postbuf, &merged,
+                                   saved->generation);
+    }
+
+    RDA_STACK_UNWIND(ftruncate, frame, op_ret, op_errno, prebuf, &merged,
+                     xdata);
+    return 0;
+}
+
+static int32_t
+rda_ftruncate(call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset,
+              dict_t *xdata)
+{
+    RDA_COMMON_MODIFICATION_FOP(ftruncate, frame, this, fd->inode, xdata, fd,
+                                offset);
+    return 0; /* result is presumably delivered via rda_ftruncate_cbk */
+}
+
+/*
+ * truncate callback. On success the inode is flagged for any in-flight
+ * directory preload and @postbuf is merged into its cached stat; that stat,
+ * or a zeroed iatt when nothing was merged, is unwound as the post-op iatt.
+ */
+static int32_t
+rda_truncate_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+                 int32_t op_ret, int32_t op_errno, struct iatt *prebuf,
+                 struct iatt *postbuf, dict_t *xdata)
+{
+    struct iatt merged = {0};
+    struct rda_local *saved = frame->local;
+
+    if (op_ret >= 0) {
+        rda_mark_inode_dirty(this, saved->inode);
+        rda_inode_ctx_update_iatts(saved->inode, this, postbuf, &merged,
+                                   saved->generation);
+    }
+
+    /* unwind as truncate, the fop this callback belongs to */
+    RDA_STACK_UNWIND(truncate, frame, op_ret, op_errno, prebuf, &merged, xdata);
+    return 0;
+}
+
+static int32_t
+rda_truncate(call_frame_t *frame, xlator_t *this, loc_t *loc, off_t offset,
+             dict_t *xdata)
+{
+    RDA_COMMON_MODIFICATION_FOP(truncate, frame, this, loc->inode, xdata, loc,
+                                offset);
+    return 0; /* result is presumably delivered via rda_truncate_cbk */
+}
+
+/* setxattr callback: on success, flag the inode for in-flight preloads and
+ * invalidate its cached stat, since the reply carries no post-op iatt.
+ */
+static int32_t
+rda_setxattr_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+                 int32_t op_ret, int32_t op_errno, dict_t *xdata)
+{
+    struct rda_local *saved = frame->local;
+
+    if (op_ret >= 0) {
+        rda_mark_inode_dirty(this, saved->inode);
+        rda_inode_ctx_update_iatts(saved->inode, this, NULL, NULL,
+                                   saved->generation);
+    }
+    RDA_STACK_UNWIND(setxattr, frame, op_ret, op_errno, xdata);
+    return 0;
+}
+
+static int32_t
+rda_setxattr(call_frame_t *frame, xlator_t *this, loc_t *loc, dict_t *dict,
+             int32_t flags, dict_t *xdata)
+{
+    RDA_COMMON_MODIFICATION_FOP(setxattr, frame, this, loc->inode, xdata, loc,
+                                dict, flags);
+    return 0; /* result is presumably delivered via rda_setxattr_cbk */
+}
+
+/* fsetxattr callback: on success, flag the inode for in-flight preloads and
+ * invalidate its cached stat, since the reply carries no post-op iatt.
+ */
+static int32_t
+rda_fsetxattr_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+                  int32_t op_ret, int32_t op_errno, dict_t *xdata)
+{
+    struct rda_local *saved = frame->local;
+
+    if (op_ret >= 0) {
+        rda_mark_inode_dirty(this, saved->inode);
+        rda_inode_ctx_update_iatts(saved->inode, this, NULL, NULL,
+                                   saved->generation);
+    }
+    RDA_STACK_UNWIND(fsetxattr, frame, op_ret, op_errno, xdata);
+    return 0;
+}
+
+static int32_t
+rda_fsetxattr(call_frame_t *frame, xlator_t *this, fd_t *fd, dict_t *dict,
+              int32_t flags, dict_t *xdata)
+{
+    RDA_COMMON_MODIFICATION_FOP(fsetxattr, frame, this, fd->inode, xdata, fd,
+                                dict, flags);
+    return 0; /* result is presumably delivered via rda_fsetxattr_cbk */
+}
+
+/*
+ * setattr callback. On success the inode is flagged for any in-flight
+ * directory preload and @statpost is merged into its cached stat; that stat,
+ * or a zeroed iatt when nothing was merged, is unwound as the post-op iatt.
+ */
+static int32_t
+rda_setattr_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+                int32_t op_ret, int32_t op_errno, struct iatt *statpre,
+                struct iatt *statpost, dict_t *xdata)
+{
+    struct iatt merged = {0};
+    struct rda_local *saved = frame->local;
+
+    if (op_ret >= 0) {
+        rda_mark_inode_dirty(this, saved->inode);
+        rda_inode_ctx_update_iatts(saved->inode, this, statpost, &merged,
+                                   saved->generation);
+    }
+
+    RDA_STACK_UNWIND(setattr, frame, op_ret, op_errno, statpre, &merged,
+                     xdata);
+    return 0;
+}
+
+static int32_t
+rda_setattr(call_frame_t *frame, xlator_t *this, loc_t *loc, struct iatt *stbuf,
+            int32_t valid, dict_t *xdata)
+{
+    RDA_COMMON_MODIFICATION_FOP(setattr, frame, this, loc->inode, xdata, loc,
+                                stbuf, valid);
+    return 0; /* result is presumably delivered via rda_setattr_cbk */
+}
+
+/*
+ * fsetattr callback. On success the inode is flagged for any in-flight
+ * directory preload and @statpost is merged into its cached stat; that stat,
+ * or a zeroed iatt when nothing was merged, is unwound as the post-op iatt.
+ */
+static int32_t
+rda_fsetattr_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+                 int32_t op_ret, int32_t op_errno, struct iatt *statpre,
+                 struct iatt *statpost, dict_t *xdata)
+{
+    struct iatt merged = {0};
+    struct rda_local *saved = frame->local;
+
+    if (op_ret >= 0) {
+        rda_mark_inode_dirty(this, saved->inode);
+        rda_inode_ctx_update_iatts(saved->inode, this, statpost, &merged,
+                                   saved->generation);
+    }
+
+    RDA_STACK_UNWIND(fsetattr, frame, op_ret, op_errno, statpre, &merged,
+                     xdata);
+    return 0;
+}
+
+static int32_t
+rda_fsetattr(call_frame_t *frame, xlator_t *this, fd_t *fd, struct iatt *stbuf,
+             int32_t valid, dict_t *xdata)
+{
+    RDA_COMMON_MODIFICATION_FOP(fsetattr, frame, this, fd->inode, xdata, fd,
+                                stbuf, valid);
+    return 0; /* result is presumably delivered via rda_fsetattr_cbk */
+}
+
+/* removexattr callback: on success, flag the inode for in-flight preloads
+ * and invalidate its cached stat, since the reply carries no post-op iatt.
+ */
+static int32_t
+rda_removexattr_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+                    int32_t op_ret, int32_t op_errno, dict_t *xdata)
+{
+    struct rda_local *saved = frame->local;
+
+    if (op_ret >= 0) {
+        rda_mark_inode_dirty(this, saved->inode);
+        rda_inode_ctx_update_iatts(saved->inode, this, NULL, NULL,
+                                   saved->generation);
+    }
+    RDA_STACK_UNWIND(removexattr, frame, op_ret, op_errno, xdata);
+    return 0;
+}
+
+static int32_t
+rda_removexattr(call_frame_t *frame, xlator_t *this, loc_t *loc,
+                const char *name, dict_t *xdata)
+{
+    RDA_COMMON_MODIFICATION_FOP(removexattr, frame, this, loc->inode, xdata,
+                                loc, name);
+    return 0; /* result is presumably delivered via rda_removexattr_cbk */
+}
+
+/* fremovexattr reply: a successful removal marks the inode dirty and calls
+ * rda_inode_ctx_update_iatts() with no iatts before unwinding the result. */
+static int32_t
+rda_fremovexattr_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+                     int32_t op_ret, int32_t op_errno, dict_t *xdata)
+{
+    if (op_ret >= 0) {
+        struct rda_local *fop_local = frame->local;
+
+        rda_mark_inode_dirty(this, fop_local->inode);
+        rda_inode_ctx_update_iatts(fop_local->inode, this, NULL, NULL,
+                                   fop_local->generation);
+    }
+
+    RDA_STACK_UNWIND(fremovexattr, frame, op_ret, op_errno, xdata);
+    return 0;
+}
+
+static int32_t
+rda_fremovexattr(call_frame_t *frame, xlator_t *this, fd_t *fd,
+                 const char *name, dict_t *xdata)
+{
+    RDA_COMMON_MODIFICATION_FOP(fremovexattr, frame, this, fd->inode, xdata, fd,
+                                name); /* winds to FIRST_CHILD */
+    return 0; /* the fop's result is unwound from rda_fremovexattr_cbk() */
+}
+
+/* releasedir callback: detach the rda_fd_ctx stored on @fd and free it.
+ * Returns -1 when fd_ctx_del() fails, 0 otherwise. */
+static int32_t
+rda_releasedir(xlator_t *this, fd_t *fd)
+{
+    /* start from 0 so the NULL test below never reads an unset value */
+    uint64_t val = 0;
+    struct rda_fd_ctx *ctx = NULL;
+
+    if (fd_ctx_del(fd, this, &val) < 0)
+        return -1;
+    ctx = (struct rda_fd_ctx *)(uintptr_t)val;
+    if (!ctx)
+        return 0;
+
+    rda_reset_ctx(this, ctx);
+    if (ctx->fill_frame)
+        STACK_DESTROY(ctx->fill_frame->root);
+    if (ctx->stub) /* only reported; the stub itself is left untouched */
+        gf_msg(this->name, GF_LOG_ERROR, 0,
+               READDIR_AHEAD_MSG_DIR_RELEASE_PENDING_STUB,
+               "released a directory with a pending stub");
+
+    GF_FREE(ctx);
+    return 0;
+}
+
+/* forget callback: remove and free this xlator's context for @inode. */
+static int
+rda_forget(xlator_t *this, inode_t *inode)
+{
+    uint64_t stored = 0;
+
+    inode_ctx_del1(inode, this, &stored);
+
+    /* nothing to free when no value came back */
+    if (stored != 0) {
+        rda_inode_ctx_t *inode_ctx = (rda_inode_ctx_t *)(uintptr_t)stored;
+
+        GF_FREE(inode_ctx);
+    }
+    return 0;
+}
+
+/* Set up memory accounting for this xlator's gf_rda_mt_* types.
+ * Returns xlator_mem_acct_init()'s result, or -1 when @this is NULL. */
+int32_t
+mem_acct_init(xlator_t *this)
+{
+    int ret = -1;
+
+    if (!this)
+        goto out;
+
+    ret = xlator_mem_acct_init(this, gf_rda_mt_end + 1);
+    if (ret != 0)
+        gf_msg(this->name, GF_LOG_ERROR, ENOMEM, READDIR_AHEAD_MSG_NO_MEMORY,
+               "Memory accounting init failed");
+
+out:
+    return ret;
+}
+
+int
+reconfigure(xlator_t *this, dict_t *options)
+{
+    struct rda_priv *priv = this->private;
+    /* re-read each tunable; err is presumably the macros' failure exit */
+    GF_OPTION_RECONF("rda-request-size", priv->rda_req_size, options,
+                     size_uint64, err);
+    GF_OPTION_RECONF("rda-low-wmark", priv->rda_low_wmark, options, size_uint64,
+                     err);
+    GF_OPTION_RECONF("rda-high-wmark", priv->rda_high_wmark, options,
+                     size_uint64, err);
+    GF_OPTION_RECONF("rda-cache-limit", priv->rda_cache_limit, options,
+                     size_uint64, err);
+    GF_OPTION_RECONF("parallel-readdir", priv->parallel_readdir, options, bool,
+                     err);
+    GF_OPTION_RECONF("pass-through", this->pass_through, options, bool, err);
+    /* all six options were processed */
+    return 0;
+err:
+    return -1;
+}
+
+/* Allocate the private state and local pool and load the options.
+ * Returns 0 on success, -1 (after undoing partial setup) on failure. */
+int
+init(xlator_t *this)
+{
+    struct rda_priv *priv = NULL;
+
+    GF_VALIDATE_OR_GOTO("readdir-ahead", this, err);
+    if (!this->children || this->children->next) {
+        gf_msg(this->name, GF_LOG_ERROR, 0,
+               READDIR_AHEAD_MSG_XLATOR_CHILD_MISCONFIGURED,
+               "FATAL: readdir-ahead not configured with exactly one child");
+        goto err;
+    }
+    if (!this->parents) {
+        gf_msg(this->name, GF_LOG_WARNING, 0,
+               READDIR_AHEAD_MSG_VOL_MISCONFIGURED,
+               "dangling volume. check volfile ");
+    }
+
+    priv = GF_CALLOC(1, sizeof(struct rda_priv), gf_rda_mt_rda_priv);
+    if (!priv)
+        goto err;
+    this->private = priv;
+    GF_ATOMIC_INIT(priv->rda_cache_size, 0);
+
+    this->local_pool = mem_pool_new(struct rda_local, 32);
+    if (!this->local_pool)
+        goto err;
+
+    GF_OPTION_INIT("rda-request-size", priv->rda_req_size, size_uint64, err);
+    GF_OPTION_INIT("rda-low-wmark", priv->rda_low_wmark, size_uint64, err);
+    GF_OPTION_INIT("rda-high-wmark", priv->rda_high_wmark, size_uint64, err);
+    GF_OPTION_INIT("rda-cache-limit", priv->rda_cache_limit, size_uint64, err);
+    GF_OPTION_INIT("parallel-readdir", priv->parallel_readdir, bool, err);
+    GF_OPTION_INIT("pass-through", this->pass_through, bool, err);
+    return 0;
+
+err:
+    /* "this" is NULL when the validation at the top jumped here */
+    if (this) {
+        if (this->local_pool)
+            mem_pool_destroy(this->local_pool);
+        this->local_pool = NULL; /* leave no pointers to freed memory */
+        this->private = NULL;
+    }
+    GF_FREE(priv);
+    return -1;
+}
+
+/* Free the private state allocated by init() and clear the stale pointer. */
+void
+fini(xlator_t *this)
+{
+    GF_VALIDATE_OR_GOTO("readdir-ahead", this, out);
+    GF_FREE(this->private);
+    this->private = NULL;
+out:
+    return;
+}
+
+struct xlator_fops fops = {
+    .opendir = rda_opendir,
+    .readdirp = rda_readdirp,
+    /* inode write */
+    /* TODO: invalidate a dentry's stats if it's pointing to a directory
+     * when entry operations happen in that directory
+     */
+    .writev = rda_writev,
+    .truncate = rda_truncate,
+    .ftruncate = rda_ftruncate,
+    .fallocate = rda_fallocate,
+    .discard = rda_discard,
+    .zerofill = rda_zerofill,
+    /* metadata write */
+    .setxattr = rda_setxattr,
+    .fsetxattr = rda_fsetxattr,
+    .setattr = rda_setattr,
+    .fsetattr = rda_fsetattr,
+    .removexattr = rda_removexattr,
+    .fremovexattr = rda_fremovexattr,
+};
+
+struct xlator_cbks cbks = {
+    .releasedir = rda_releasedir, /* frees the per-fd struct rda_fd_ctx */
+    .forget = rda_forget,         /* frees the per-inode rda_inode_ctx_t */
+};
+
+struct volume_options options[] = {
+    {
+        .key = {"readdir-ahead"},
+        .type = GF_OPTION_TYPE_BOOL,
+        .default_value = "off",
+        .description = "enable/disable readdir-ahead",
+        .op_version = {GD_OP_VERSION_6_0},
+        .flags = OPT_FLAG_SETTABLE,
+    },
+    {
+        .key = {"rda-request-size"}, /* loaded into priv->rda_req_size */
+        .type = GF_OPTION_TYPE_SIZET,
+        .min = 4096,
+        .max = 131072,
+        .default_value = "131072",
+        .description = "size of buffer in readdirp calls initiated by "
+                       "readdir-ahead ",
+    },
+    {
+        .key = {"rda-low-wmark"}, /* loaded into priv->rda_low_wmark */
+        .type = GF_OPTION_TYPE_SIZET,
+        .min = 0,
+        .max = 10 * GF_UNIT_MB,
+        .default_value = "4096",
+        .description = "the value under which readdir-ahead plugs",
+    },
+    {
+        .key = {"rda-high-wmark"}, /* loaded into priv->rda_high_wmark */
+        .type = GF_OPTION_TYPE_SIZET,
+        .min = 0,
+        .max = 100 * GF_UNIT_MB,
+        .default_value = "128KB",
+        .description = "the value over which readdir-ahead unplugs",
+    },
+    {
+        .key = {"rda-cache-limit"}, /* loaded into priv->rda_cache_limit */
+        .type = GF_OPTION_TYPE_SIZET,
+        .min = 0,
+        .max = INFINITY, /* no finite upper bound */
+        .default_value = "10MB",
+        .description = "maximum size of cache consumed by readdir-ahead "
+                       "xlator. This value is global and total memory "
+                       "consumption by readdir-ahead is capped by this "
+                       "value, irrespective of the number/size of "
+                       "directories cached",
+    },
+    {.key = {"parallel-readdir"}, /* loaded into priv->parallel_readdir */
+     .type = GF_OPTION_TYPE_BOOL,
+     .op_version = {GD_OP_VERSION_3_10_0},
+     .flags = OPT_FLAG_SETTABLE | OPT_FLAG_CLIENT_OPT | OPT_FLAG_DOC,
+     .default_value = "off",
+     .description = "If this option is enabled, the readdir operation "
+                    "is performed in parallel on all the bricks, thus "
+                    "improving the performance of readdir. Note that "
+                    "the performance improvement is higher in large "
+                    "clusters"},
+    {.key = {"pass-through"}, /* loaded into this->pass_through */
+     .type = GF_OPTION_TYPE_BOOL,
+     .default_value = "false",
+     .op_version = {GD_OP_VERSION_4_1_0},
+     .flags = OPT_FLAG_SETTABLE | OPT_FLAG_DOC | OPT_FLAG_CLIENT_OPT,
+     .tags = {"readdir-ahead"},
+     .description = "Enable/Disable readdir ahead translator"},
+    {.key = {NULL}}, /* terminating entry */
+};
+
+xlator_api_t xlator_api = {
+    .init = init,
+    .fini = fini,
+    .reconfigure = reconfigure,
+    .mem_acct_init = mem_acct_init,
+    .op_version = {1}, /* Present from the initial version */
+    .fops = &fops,     /* fop entry points (opendir, readdirp, write fops) */
+    .cbks = &cbks,     /* releasedir and forget callbacks */
+    .options = options, /* tunables loaded by init() and reconfigure() */
+    .identifier = "readdir-ahead",
+    .category = GF_MAINTAINED,
+};
diff --git a/xlators/performance/readdir-ahead/src/readdir-ahead.h b/xlators/performance/readdir-ahead/src/readdir-ahead.h
new file mode 100644
index 00000000000..619c41059ff
--- /dev/null
+++ b/xlators/performance/readdir-ahead/src/readdir-ahead.h
@@ -0,0 +1,98 @@
+/*
+  Copyright (c) 2008-2013 Red Hat, Inc. <http://www.redhat.com>
+  This file is part of GlusterFS.
+
+  This file is licensed to you under your choice of the GNU Lesser
+  General Public License, version 3 or any later version (LGPLv3 or
+  later), or the GNU General Public License, version 2 (GPLv2), in all
+  cases as published by the Free Software Foundation.
+*/
+
+#ifndef __READDIR_AHEAD_H
+#define __READDIR_AHEAD_H
+
+/* state flags */
+#define RDA_FD_NEW (1 << 0)
+#define RDA_FD_RUNNING (1 << 1)
+#define RDA_FD_EOD (1 << 2)
+#define RDA_FD_ERROR (1 << 3)
+#define RDA_FD_BYPASS (1 << 4)
+#define RDA_FD_PLUGGED (1 << 5)
+
+#define RDA_COMMON_MODIFICATION_FOP(name, frame, this, __inode, __xdata,       \
+                                    args...)                                   \
+    do {                                                                       \
+        struct rda_local *__local = NULL;                                      \
+        rda_inode_ctx_t *ctx_p = NULL;                                         \
+        /* NOTE(review): neither __local nor ctx_p is NULL-checked below */    \
+        __local = mem_get0(this->local_pool);                                  \
+        __local->inode = inode_ref(__inode);                                   \
+        LOCK(&__inode->lock);                                                  \
+        {                                                                      \
+            ctx_p = __rda_inode_ctx_get(__inode, this);                        \
+        }                                                                      \
+        UNLOCK(&__inode->lock);                                                \
+        __local->generation = GF_ATOMIC_GET(ctx_p->generation);                \
+        /* hand the local to the frame so the _cbk can read it back */         \
+        frame->local = __local;                                                \
+        if (__xdata)                                                           \
+            __local->xattrs = dict_ref(__xdata);                               \
+        /* forward the fop to the first child; rda_<name>_cbk gets the reply */\
+        STACK_WIND(frame, rda_##name##_cbk, FIRST_CHILD(this),                 \
+                   FIRST_CHILD(this)->fops->name, args, __xdata);              \
+    } while (0)
+
+#define RDA_STACK_UNWIND(fop, frame, params...)                                \
+    do {                                                                       \
+        struct rda_local *__local = NULL;                                      \
+        if (frame) {                                                           \
+            __local = frame->local;                                            \
+            frame->local = NULL; /* detach the local before unwinding */       \
+        }                                                                      \
+        STACK_UNWIND_STRICT(fop, frame, params);                               \
+        if (__local) {                                                         \
+            rda_local_wipe(__local); /* released after the unwind */           \
+            mem_put(__local);                                                  \
+        }                                                                      \
+    } while (0)
+
+struct rda_fd_ctx {
+    off_t cur_offset;  /* current head of the ctx */
+    size_t cur_size;   /* current size of the preload */
+    off_t next_offset; /* tail of the ctx */
+    uint32_t state;    /* presumably a mask of the RDA_FD_* state flags */
+    gf_lock_t lock;
+    gf_dirent_t entries;
+    call_frame_t *fill_frame; /* its stack is destroyed in rda_releasedir() */
+    call_stub_t *stub; /* rda_releasedir() logs an error if still set */
+    int op_errno;
+    dict_t *xattrs; /* md-cache keys to be sent in readdirp() */
+    dict_t *writes_during_prefetch;
+    gf_atomic_t prefetching;
+};
+
+struct rda_local {
+    struct rda_fd_ctx *ctx;
+    fd_t *fd;
+    dict_t *xattrs; /* md-cache keys to be sent in readdirp() */
+    inode_t *inode; /* ref taken by RDA_COMMON_MODIFICATION_FOP */
+    off_t offset;
+    uint64_t generation; /* inode ctx generation read before winding */
+    int32_t skip_dir;
+};
+
+struct rda_priv {
+    uint64_t rda_req_size;         /* option "rda-request-size" */
+    uint64_t rda_low_wmark;        /* option "rda-low-wmark" */
+    uint64_t rda_high_wmark;       /* option "rda-high-wmark" */
+    uint64_t rda_cache_limit;      /* option "rda-cache-limit" */
+    gf_atomic_t rda_cache_size;    /* initialised to 0 in init() */
+    gf_boolean_t parallel_readdir; /* option "parallel-readdir" */
+};
+
+typedef struct rda_inode_ctx {
+    struct iatt statbuf;
+    gf_atomic_t generation; /* read by RDA_COMMON_MODIFICATION_FOP per fop */
+} rda_inode_ctx_t;
+
+#endif /* __READDIR_AHEAD_H */
diff --git a/xlators/performance/stat-prefetch/src/Makefile.am b/xlators/performance/stat-prefetch/src/Makefile.am
deleted file mode 100644
index e52f2df48fd..00000000000
--- a/xlators/performance/stat-prefetch/src/Makefile.am
+++ /dev/null
@@ -1,11 +0,0 @@
-xlator_PROGRAMS = stat-prefetch.so
-xlatordir = $(libdir)/glusterfs/$(PACKAGE_VERSION)/xlator/performance
-
-stat_prefetch_so_SOURCES = stat-prefetch.c
-noinst_HEADERS = stat-prefetch.h
-
-AM_CFLAGS = -fPIC -D_FILE_OFFSET_BITS=64 -D_GNU_SOURCE -Wall \
-	-I$(top_srcdir)/libglusterfs/src -shared -nostartfiles
-
-CLEANFILES = 
-
diff --git a/xlators/performance/stat-prefetch/src/stat-prefetch.c b/xlators/performance/stat-prefetch/src/stat-prefetch.c
deleted file mode 100644
index c6bf1e684cf..00000000000
--- a/xlators/performance/stat-prefetch/src/stat-prefetch.c
+++ /dev/null
@@ -1,508 +0,0 @@
-/*
-   Copyright (c) 2006-2009 Z RESEARCH, Inc. <http://www.zresearch.com>
-   This file is part of GlusterFS.
-
-   GlusterFS is free software; you can redistribute it and/or modify
-   it under the terms of the GNU General Public License as published
-   by the Free Software Foundation; either version 3 of the License,
-   or (at your option) any later version.
-
-   GlusterFS is distributed in the hope that it will be useful, but
-   WITHOUT ANY WARRANTY; without even the implied warranty of
-   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
-   General Public License for more details.
-
-   You should have received a copy of the GNU General Public License
-   along with this program.  If not, see
-   <http://www.gnu.org/licenses/>.
-*/
-
-#ifndef _CONFIG_H
-#define _CONFIG_H
-#include "config.h"
-#endif
-
-#include "glusterfs.h"
-#include "stat-prefetch.h"
-#include "dict.h"
-#include "xlator.h"
-#include <sys/time.h>
-
-struct sp_cache {
-  struct sp_cache *next;
-  struct sp_cache *prev;
-  pid_t pid;
-  long long tv_time;
-  char *dirname;
-  dir_entry_t entries;
-  int32_t count;
-  pthread_mutex_t lock;
-};
-
-static void
-stat_prefetch_cache_flush (struct sp_cache *cache, int32_t force)
-{
-  struct sp_cache *trav;
-  struct timeval tv;
-  long long tv_time;
-
-  gettimeofday (&tv, NULL);
-  tv_time = (tv.tv_usec + (tv.tv_sec * 1000000));
-
-  pthread_mutex_lock (&cache->lock);
-
-  trav = cache->next;
-  while (trav != cache) {
-    struct sp_cache *next = trav->next;
-    {
-      if (tv_time > trav->tv_time || force) {
-	gf_log ("stat-prefetch",
-		GF_LOG_DEBUG,
-		"flush on: %s",
-		trav->dirname);
-	dir_entry_t *entries;
-
-	trav->prev->next = trav->next;
-	trav->next->prev = trav->prev;
-
-	entries = trav->entries.next;
-
-	while (entries) {
-	  dir_entry_t *nextentry = entries->next;
-	  {
-	    free (entries->name);
-	    free (entries);
-	  }
-	  entries = nextentry;
-	}
-	free (trav->dirname);
-	free (trav);
-      }
-    }
-    trav = next;
-  }
-
-  pthread_mutex_unlock (&cache->lock);
-}
-
-static int32_t
-stat_prefetch_cache_fill (struct sp_cache *cache,
-			  pid_t pid,
-			  char *dirname,
-			  dir_entry_t *entries)
-{
-  struct sp_cache *trav;
-  struct timeval tv;
-
-  pthread_mutex_unlock (&cache->lock);
-  trav = cache->next;
-  while (trav != cache) {
-    //    if (trav->pid == pid && !strcmp (trav->dirname, dirname)) {
-    if (!strcmp (trav->dirname, dirname)) {
-      break;
-    }
-    trav = trav->next;
-  }
-
-  if (trav == cache) {
-    trav = CALLOC (1, sizeof (*trav));
-    ERR_ABORT (trav);
-    trav->pid = pid;
-    trav->dirname = dirname;
-
-    trav->prev = cache->prev;
-    trav->next = cache;
-    trav->next->prev = trav;
-    trav->prev->next = trav;
-  } else {
-    free (dirname);
-  }
-
-  while (trav->entries.next) {
-    dir_entry_t *tmp = trav->entries.next;
-
-    trav->entries.next = trav->entries.next->next;
-    free (tmp->name);
-    free (tmp);
-  }
-  trav->entries.next = entries->next;
-  entries->next = NULL;
-
-  gettimeofday (&tv, NULL);
-  trav->tv_time = (tv.tv_usec + (tv.tv_sec * 1000000)) + cache->tv_time;
-
-  pthread_mutex_unlock (&cache->lock);
-  return 0;
-}
-
-static int32_t
-stat_prefetch_cache_lookup (struct sp_cache *cache,
-			    pid_t pid,
-			    const char *path,
-			    struct stat *buf)
-{
-  struct sp_cache *trav;
-  char *dirname = strdup (path);
-  char *filename = strrchr (dirname, '/');
-  dir_entry_t *entries;
-  dir_entry_t *prev = NULL;
-
-  *filename = '\0';
-  filename ++;
-
-  pthread_mutex_lock (&cache->lock);
-  trav = cache->next;
-  while (trav != cache) {
-    //    if ((trav->pid == pid) && !strcmp (dirname, trav->dirname))
-    if (!strcmp (dirname, trav->dirname))
-      break;
-    trav = trav->next;
-  }
-  if (trav == cache) {
-    free (dirname);
-    pthread_mutex_unlock (&cache->lock);
-    return -1;
-  }
-
-  entries = trav->entries.next;
-  prev = &trav->entries;
-  while (entries) {
-    if (!strcmp (entries->name, filename))
-      break;
-    prev = entries;
-    entries = entries->next;
-  }
-  if (!entries) {
-    free (dirname);
-    pthread_mutex_unlock (&cache->lock);
-    return -1;
-  }
-
-  *buf = entries->buf;
-  prev->next = entries->next;
-  free (entries->name);
-  free (entries);
-  free (dirname);
-
-  pthread_mutex_unlock (&cache->lock);
-
-  return 0;
-}
-
-			    
-int32_t
-stat_prefetch_readdir_cbk (call_frame_t *frame,
-			   void *cookie,
-			   xlator_t *this,
-			   int32_t op_ret,
-			   int32_t op_errno,
-			   dir_entry_t *entries,
-			   int32_t count)
-{
-  char *path = frame->local;
-  pid_t pid = frame->root->pid;
-  frame->local = NULL;
-
-  STACK_UNWIND (frame, op_ret, op_errno, entries, count);
-
-  if (op_ret == 0)
-    stat_prefetch_cache_fill (this->private,
-			      pid,
-			      path,
-			      entries);
-  else
-    free (path);
-
-  return 0;
-}
-
-int32_t
-stat_prefetch_readdir (call_frame_t *frame,
-		       xlator_t *this,
-		       const char *path)
-{
-  stat_prefetch_cache_flush (this->private, 0);
-
-  frame->local = strdup (path);
-  STACK_WIND (frame,
-	      stat_prefetch_readdir_cbk,
-	      FIRST_CHILD(this),
-	      FIRST_CHILD(this)->fops->readdir,
-	      path);
-  return 0;
-}
-
-
-int32_t
-stat_prefetch_getattr_cbk (call_frame_t *frame,
-			   void *cookie,
-			   xlator_t *this,
-			   int32_t op_ret,
-			   int32_t op_errno,
-			   struct stat *buf)
-{
-  STACK_UNWIND (frame, op_ret, op_errno, buf);
-  return 0;
-}
-
-int32_t
-stat_prefetch_getattr (call_frame_t *frame,
-		       struct xlator *this,
-		       const char *path)
-{
-  struct stat buf;
-  pid_t pid = frame->root->pid;
-  stat_prefetch_cache_flush (this->private, 0);
-
-  if (stat_prefetch_cache_lookup (this->private,
-				  pid,
-				  path,
-				  &buf) == 0) {
-    STACK_UNWIND (frame, 0, 0, &buf);
-    return 0;
-  }
-
-  STACK_WIND (frame,
-	      stat_prefetch_getattr_cbk,
-	      FIRST_CHILD(this),
-	      FIRST_CHILD(this)->fops->getattr,
-	      path);
-
-  return 0;
-}
-
-
-int32_t
-stat_prefetch_unlink_cbk (call_frame_t *frame,
-                          void *cookie,
-                          xlator_t *this,
-                          int32_t op_ret,
-                          int32_t op_errno)
-{
-  STACK_UNWIND (frame, op_ret, op_errno);
-  return 0;
-}
-
-int32_t
-stat_prefetch_unlink (call_frame_t *frame,
-                      struct xlator *this,
-                      const char *path)
-{
-  stat_prefetch_cache_flush (this->private, 1);
-
-  STACK_WIND (frame,
-              stat_prefetch_unlink_cbk,
-              FIRST_CHILD(this),
-              FIRST_CHILD(this)->fops->unlink,
-              path);
-
-  return 0;
-}
-
-
-int32_t
-stat_prefetch_chmod_cbk (call_frame_t *frame,
-			 void *cookie,
-			 xlator_t *this,
-			 int32_t op_ret,
-			 int32_t op_errno,
-			 struct stat *buf)
-{
-  STACK_UNWIND (frame, op_ret, op_errno, buf);
-  return 0;
-}
-
-int32_t
-stat_prefetch_chmod (call_frame_t *frame,
-		     struct xlator *this,
-		     const char *path,
-		     mode_t mode)
-{
-  stat_prefetch_cache_flush (this->private, 1);
-
-  STACK_WIND (frame,
-              stat_prefetch_chmod_cbk,
-              FIRST_CHILD(this),
-              FIRST_CHILD(this)->fops->chmod,
-              path,
-	      mode);
-
-  return 0;
-}
-
-
-int32_t
-stat_prefetch_chown_cbk (call_frame_t *frame,
-			 void *cookie,
-			 xlator_t *this,
-			 int32_t op_ret,
-			 int32_t op_errno,
-			 struct stat *buf)
-{
-  STACK_UNWIND (frame, op_ret, op_errno, buf);
-  return 0;
-}
-
-int32_t
-stat_prefetch_chown (call_frame_t *frame,
-		     struct xlator *this,
-		     const char *path,
-		     uid_t uid,
-		     gid_t gid)
-{
-  stat_prefetch_cache_flush (this->private, 1);
-
-  STACK_WIND (frame,
-              stat_prefetch_chown_cbk,
-              FIRST_CHILD(this),
-              FIRST_CHILD(this)->fops->chown,
-              path,
-	      uid,
-	      gid);
-
-  return 0;
-}
-
-
-int32_t
-stat_prefetch_utimes_cbk (call_frame_t *frame,
-                          void *cookie,
-                          xlator_t *this,
-                          int32_t op_ret,
-                          int32_t op_errno,
-			  struct stat *buf)
-{
-  STACK_UNWIND (frame, op_ret, op_errno, buf);
-  return 0;
-}
-
-int32_t
-stat_prefetch_utimes (call_frame_t *frame,
-		      struct xlator *this,
-		      const char *path,
-		      struct timespec *tvp)
-{
-  stat_prefetch_cache_flush (this->private, 1);
-
-  STACK_WIND (frame,
-              stat_prefetch_utimes_cbk,
-              FIRST_CHILD(this),
-              FIRST_CHILD(this)->fops->utimes,
-              path,
-	      tvp);
-
-  return 0;
-}
-
-
-int32_t
-stat_prefetch_truncate_cbk (call_frame_t *frame,
-			    void *cookie,
-			    xlator_t *this,
-			    int32_t op_ret,
-			    int32_t op_errno,
-			    struct stat *buf)
-{
-  STACK_UNWIND (frame, op_ret, op_errno, buf);
-  return 0;
-}
-
-int32_t
-stat_prefetch_truncate (call_frame_t *frame,
-			struct xlator *this,
-			const char *path,
-			off_t offset)
-{
-  stat_prefetch_cache_flush (this->private, 1);
-
-  STACK_WIND (frame,
-              stat_prefetch_truncate_cbk,
-              FIRST_CHILD(this),
-              FIRST_CHILD(this)->fops->truncate,
-              path,
-	      offset);
-
-  return 0;
-}
-
-
-int32_t
-stat_prefetch_rename_cbk (call_frame_t *frame,
-                          void *cookie,
-                          xlator_t *this,
-                          int32_t op_ret,
-                          int32_t op_errno)
-{
-  STACK_UNWIND (frame, op_ret, op_errno);
-  return 0;
-}
-
-int32_t
-stat_prefetch_rename (call_frame_t *frame,
-                      struct xlator *this,
-                      const char *oldpath,
-		      const char *newpath)
-{
-  stat_prefetch_cache_flush (this->private, 1);
-
-  STACK_WIND (frame,
-              stat_prefetch_rename_cbk,
-              FIRST_CHILD(this),
-              FIRST_CHILD(this)->fops->rename,
-              oldpath,
-	      newpath);
-
-  return 0;
-}
-
-int32_t 
-init (struct xlator *this)
-{
-  struct sp_cache *cache;
-  dict_t *options = this->options;
-
-  if (!this->children || this->children->next) {
-    gf_log ("stat-prefetch",
-	    GF_LOG_ERROR,
-	    "FATAL: translator %s does not have exactly one child node",
-	    this->name);
-    return -1;
-  }
-
-  cache = (void *) CALLOC (1, sizeof (*cache));
-  ERR_ABORT (cache);
-  cache->next = cache->prev = cache;
-
-  cache->tv_time = 1 * 1000000;
-
-  if (dict_get (options, "cache-seconds")) {
-    cache->tv_time = (data_to_int64 (dict_get (options, "cache-seconds")) *
-		      1000000);
-  }
-
-  pthread_mutex_init (&cache->lock, NULL);
-
-  this->private = cache;
-  return 0;
-}
-
-void
-fini (struct xlator *this)
-{
-  return;
-}
-
-
-struct xlator_fops fops = {
-  .getattr     = stat_prefetch_getattr,
-  .readdir     = stat_prefetch_readdir,
-  .unlink      = stat_prefetch_unlink,
-  .chmod       = stat_prefetch_chmod,
-  .chown       = stat_prefetch_chown,
-  .rename      = stat_prefetch_rename,
-  .utimes      = stat_prefetch_utimes,
-  .truncate    = stat_prefetch_truncate,
-};
-
-struct xlator_mops mops = {
-};
diff --git a/xlators/performance/stat-prefetch/src/stat-prefetch.h b/xlators/performance/stat-prefetch/src/stat-prefetch.h
deleted file mode 100644
index ef82952b0c7..00000000000
--- a/xlators/performance/stat-prefetch/src/stat-prefetch.h
+++ /dev/null
@@ -1,32 +0,0 @@
-/*
-   Copyright (c) 2006-2009 Z RESEARCH, Inc. <http://www.zresearch.com>
-   This file is part of GlusterFS.
-
-   GlusterFS is free software; you can redistribute it and/or modify
-   it under the terms of the GNU General Public License as published
-   by the Free Software Foundation; either version 3 of the License,
-   or (at your option) any later version.
-
-   GlusterFS is distributed in the hope that it will be useful, but
-   WITHOUT ANY WARRANTY; without even the implied warranty of
-   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
-   General Public License for more details.
-
-   You should have received a copy of the GNU General Public License
-   along with this program.  If not, see
-   <http://www.gnu.org/licenses/>.
-*/
-
-#ifndef _STAT_PREFETCH_H_
-#define _STAT_PREFETCH_H_
-
-#ifndef _CONFIG_H
-#define _CONFIG_H
-#include "config.h"
-#endif
-
-#include <stdio.h>
-#include <sys/time.h>
-#include "xlator.h"
-
-#endif /* _STAT_PREFETCH_H_ */
diff --git a/xlators/performance/symlink-cache/src/Makefile.am b/xlators/performance/symlink-cache/src/Makefile.am
deleted file mode 100644
index b8b257c186c..00000000000
--- a/xlators/performance/symlink-cache/src/Makefile.am
+++ /dev/null
@@ -1,12 +0,0 @@
-xlator_LTLIBRARIES = symlink-cache.la
-xlatordir = $(libdir)/glusterfs/$(PACKAGE_VERSION)/xlator/performance
-
-symlink_cache_la_LDFLAGS = -module -avoidversion 
-
-symlink_cache_la_SOURCES = symlink-cache.c
-symlink_cache_la_LIBADD = $(top_builddir)/libglusterfs/src/libglusterfs.la
-
-AM_CFLAGS = -fPIC -D_FILE_OFFSET_BITS=64 -D_GNU_SOURCE -Wall -D$(GF_HOST_OS)\
-	-I$(top_srcdir)/libglusterfs/src -shared -nostartfiles $(GF_CFLAGS)
-
-CLEANFILES = 
diff --git a/xlators/performance/symlink-cache/src/symlink-cache.c b/xlators/performance/symlink-cache/src/symlink-cache.c
deleted file mode 100644
index ef05defa0af..00000000000
--- a/xlators/performance/symlink-cache/src/symlink-cache.c
+++ /dev/null
@@ -1,399 +0,0 @@
-/*
-  Copyright (c) 2008-2009 Z RESEARCH, Inc. <http://www.zresearch.com>
-  This file is part of GlusterFS.
-
-  GlusterFS is free software; you can redistribute it and/or modify
-  it under the terms of the GNU General Public License as published
-  by the Free Software Foundation; either version 3 of the License,
-  or (at your option) any later version.
-
-  GlusterFS is distributed in the hope that it will be useful, but
-  WITHOUT ANY WARRANTY; without even the implied warranty of
-  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
-  General Public License for more details.
-
-  You should have received a copy of the GNU General Public License
-  along with this program.  If not, see
-  <http://www.gnu.org/licenses/>.
-*/
-
-
-#ifndef _CONFIG_H
-#define _CONFIG_H
-#include "config.h"
-#endif
-
-#include "glusterfs.h"
-#include "logging.h"
-#include "dict.h"
-#include "xlator.h"
-#include "list.h"
-#include "compat.h"
-#include "compat-errno.h"
-#include "common-utils.h"
-
-struct symlink_cache {
-	time_t ctime;
-	char   *readlink;
-};
-
-
-static int
-symlink_inode_ctx_get (inode_t *inode, xlator_t *this, void **ctx)
-{
-	int ret = 0;
-	uint64_t tmp_ctx = 0;
-	ret = inode_ctx_get (inode, this, &tmp_ctx);
-	if (-1 == ret)
-		gf_log (this->name, GF_LOG_ERROR, "dict get failed");
-	else
-		*ctx = (void *)(long)tmp_ctx;
-
-	return 0;
-}
-
-
-static int
-symlink_inode_ctx_set (inode_t *inode, xlator_t *this, void *ctx)
-{
-	int ret = 0;
-	ret = inode_ctx_put (inode, this, (uint64_t)(long) ctx);
-	if (-1 == ret)
-		gf_log (this->name, GF_LOG_ERROR, "dict set failed");
-
-	return 0;
-}
-
-
-int
-sc_cache_update (xlator_t *this, inode_t *inode, const char *link)
-{
-	struct symlink_cache *sc = NULL;
-
-	symlink_inode_ctx_get (inode, this, VOID(&sc));
-	if (!sc)
-		return 0;
-
-	if (!sc->readlink) {
-		gf_log (this->name, GF_LOG_DEBUG,
-			"updating cache: %s", link);
-
-		sc->readlink = strdup (link);
-	} else {
-		gf_log (this->name, GF_LOG_DEBUG,
-			"not updating existing cache: %s with %s",
-			sc->readlink, link);
-	}
-
-	return 0;
-}
-
-
-int
-sc_cache_set (xlator_t *this, inode_t *inode, struct stat *buf,
-	      const char *link)
-{
-	struct symlink_cache *sc = NULL;
-	int                   ret = -1;
-	int                   need_set = 0;
-
-
-	symlink_inode_ctx_get (inode, this, VOID(&sc));
-	if (!sc) {
-		need_set = 1;
-		sc = CALLOC (1, sizeof (*sc));
-		if (!sc) {
-			gf_log (this->name, GF_LOG_ERROR,
-				"out of memory :(");
-			goto err;
-		}
-	}
-
-	if (sc->readlink) {
-		gf_log (this->name, GF_LOG_DEBUG,
-			"replacing old cache: %s with new cache: %s",
-			sc->readlink, link);
-		FREE (sc->readlink);
-		sc->readlink = NULL;
-	}
-
-	if (link) {
-		sc->readlink = strdup (link);
-		if (!sc->readlink) {
-			gf_log (this->name, GF_LOG_ERROR,
-				"out of memory :(");
-			goto err;
-		}
-	}
-
-	sc->ctime = buf->st_ctime;
-
-	gf_log (this->name, GF_LOG_DEBUG,
-		"setting symlink cache: %s", link);
-
-	if (need_set) {
-		ret = symlink_inode_ctx_set (inode, this, sc);
-
-		if (ret < 0) {
-			gf_log (this->name, GF_LOG_ERROR,
-				"could not set inode context (%s)",
-				strerror (-ret));
-			goto err;
-		}
-	}
-
-	return 0;
-err:
-
-	if (sc) {
-		if (sc->readlink)
-			FREE (sc->readlink);
-		sc->readlink = NULL;
-		FREE (sc);
-	}
-
-	return -1;
-}
-
-
-int
-sc_cache_flush (xlator_t *this, inode_t *inode)
-{
-	struct symlink_cache *sc = NULL;
-
-	symlink_inode_ctx_get (inode, this, VOID(&sc));
-	if (!sc)
-		return 0;
-
-	if (sc->readlink) {
-		gf_log (this->name, GF_LOG_DEBUG,
-			"flushing cache: %s", sc->readlink);
-
-		FREE (sc->readlink);
-		sc->readlink = NULL;
-	}
-
-	FREE (sc);
-
-	return 0;
-}
-
-
-int
-sc_cache_validate (xlator_t *this, inode_t *inode, struct stat *buf)
-{
-	struct symlink_cache *sc = NULL;
-	uint64_t tmp_sc = 0;
-
-	if (!S_ISLNK (buf->st_mode)) {
-		sc_cache_flush (this, inode);
-		return 0;
-	}
-
-	symlink_inode_ctx_get (inode, this, VOID(&sc));
-
-	if (!sc) {
-		sc_cache_set (this, inode, buf, NULL);
-		inode_ctx_get (inode, this, &tmp_sc);
-
-		if (!sc) {
-			gf_log (this->name, GF_LOG_ERROR,
-				"out of memory :(");
-			return 0;
-		}
-		sc = (struct symlink_cache *)(long)tmp_sc;
-	}
-
-	if (sc->ctime == buf->st_ctime)
-		return 0;
-
-	/* STALE */
-	if (sc->readlink) {
-		gf_log (this->name, GF_LOG_DEBUG,
-			"flushing cache: %s", sc->readlink);
-
-		FREE (sc->readlink);
-		sc->readlink = NULL;
-	}
-
-	sc->ctime = buf->st_ctime;
-
-	return 0;
-}
-
-
-
-int
-sc_cache_get (xlator_t *this, inode_t *inode, char **link)
-{
-	struct symlink_cache *sc = NULL;
-
-	symlink_inode_ctx_get (inode, this, VOID(&sc));
-
-	if (!sc)
-		return 0;
-
-	if (link && sc->readlink)
-		*link = strdup (sc->readlink);
-	return 0;
-}
-
-
-int
-sc_readlink_cbk (call_frame_t *frame, void *cookie,
-		 xlator_t *this, int op_ret, int op_errno,
-		 const char *link)
-{
-	if (op_ret > 0)
-		sc_cache_update (this, frame->local, link);
-
-	inode_unref (frame->local);
-	frame->local = NULL;
-
-        STACK_UNWIND (frame, op_ret, op_errno, link);
-        return 0;
-}
-
-
-int
-sc_readlink (call_frame_t *frame, xlator_t *this,
-	     loc_t *loc, size_t size)
-{
-	char *link = NULL;
-
-	sc_cache_get (this, loc->inode, &link);
-
-	if (link) {
-		/* cache hit */
-		gf_log (this->name, GF_LOG_DEBUG,
-			"cache hit %s -> %s",
-			loc->path, link);
-		STACK_UNWIND (frame, strlen (link) + 1, 0, link);
-		FREE (link);
-		return 0;
-	}
-
-	frame->local = inode_ref (loc->inode);
-
-        STACK_WIND (frame, sc_readlink_cbk,
-                    FIRST_CHILD(this),
-                    FIRST_CHILD(this)->fops->readlink,
-                    loc, size);
-
-	return 0;
-}
-
-
-int
-sc_symlink_cbk (call_frame_t *frame, void *cookie,
-		xlator_t *this, int op_ret, int op_errno,
-		inode_t *inode, struct stat *buf)
-{
-	if (op_ret == 0) {
-		if (frame->local) {
-			sc_cache_set (this, inode, buf, frame->local);
-		}
-	}
-
-        STACK_UNWIND (frame, op_ret, op_errno, inode, buf);
-        return 0;
-}
-
-
-int
-sc_symlink (call_frame_t *frame, xlator_t *this,
-	    const char *dst, loc_t *src)
-{
-	frame->local = strdup (dst);
-
-        STACK_WIND (frame, sc_symlink_cbk,
-                    FIRST_CHILD(this),
-                    FIRST_CHILD(this)->fops->symlink,
-                    dst, src);
-
-	return 0;
-}
-
-
-int
-sc_lookup_cbk (call_frame_t *frame, void *cookie,
-	       xlator_t *this, int op_ret, int op_errno,
-	       inode_t *inode, struct stat *buf, dict_t *xattr)
-{
-	if (op_ret == 0)
-		sc_cache_validate (this, inode, buf);
-	else
-		sc_cache_flush (this, inode);
-
-        STACK_UNWIND (frame, op_ret, op_errno, inode, buf, xattr);
-        return 0;
-}
-
-
-int
-sc_lookup (call_frame_t *frame, xlator_t *this,
-	   loc_t *loc, dict_t *xattr_req)
-{
-        STACK_WIND (frame, sc_lookup_cbk,
-                    FIRST_CHILD(this),
-                    FIRST_CHILD(this)->fops->lookup,
-                    loc, xattr_req);
-
-        return 0;
-}
-
-
-int
-sc_forget (xlator_t *this,
-	   inode_t *inode)
-{
-	sc_cache_flush (this, inode);
-
-        return 0;
-}
-
-
-int32_t 
-init (xlator_t *this)
-{
-	
-        if (!this->children || this->children->next)
-        {
-                gf_log (this->name, GF_LOG_ERROR,
-                        "FATAL: volume (%s) not configured with exactly one "
-			"child", this->name);
-                return -1;
-        }
-
-	if (!this->parents) {
-		gf_log (this->name, GF_LOG_WARNING,
-			"dangling volume. check volfile ");
-	}
-
-        return 0;
-}
-
-
-void
-fini (xlator_t *this)
-{
-        return;
-}
-
-
-struct xlator_fops fops = {
-	.lookup      = sc_lookup,
-	.symlink     = sc_symlink,
-	.readlink    = sc_readlink,
-};
-
-struct xlator_mops mops = {
-};
-
-struct xlator_cbks cbks = {
-        .forget  = sc_forget,
-};
-
-struct volume_options options[] = {
-	{ .key = {NULL} },
-};
diff --git a/xlators/performance/write-behind/src/Makefile.am b/xlators/performance/write-behind/src/Makefile.am
index f800abad50d..a6a16fcc080 100644
--- a/xlators/performance/write-behind/src/Makefile.am
+++ b/xlators/performance/write-behind/src/Makefile.am
@@ -1,12 +1,16 @@
 xlator_LTLIBRARIES = write-behind.la
 xlatordir = $(libdir)/glusterfs/$(PACKAGE_VERSION)/xlator/performance
 
-write_behind_la_LDFLAGS = -module -avoidversion 
+write_behind_la_LDFLAGS = -module $(GF_XLATOR_DEFAULT_LDFLAGS)
 
 write_behind_la_SOURCES = write-behind.c
 write_behind_la_LIBADD = $(top_builddir)/libglusterfs/src/libglusterfs.la
 
-AM_CFLAGS = -fPIC -D_FILE_OFFSET_BITS=64 -D_GNU_SOURCE -Wall -D$(GF_HOST_OS)\
-	-I$(top_srcdir)/libglusterfs/src -shared -nostartfiles $(GF_CFLAGS)
+noinst_HEADERS = write-behind-mem-types.h write-behind-messages.h
+
+AM_CPPFLAGS = $(GF_CPPFLAGS) -I$(top_srcdir)/libglusterfs/src \
+	-I$(top_srcdir)/rpc/xdr/src -I$(top_builddir)/rpc/xdr/src
+
+AM_CFLAGS = -Wall $(GF_CFLAGS)
 
 CLEANFILES = 
diff --git a/xlators/performance/write-behind/src/write-behind-mem-types.h b/xlators/performance/write-behind/src/write-behind-mem-types.h
new file mode 100644
index 00000000000..a0647299150
--- /dev/null
+++ b/xlators/performance/write-behind/src/write-behind-mem-types.h
@@ -0,0 +1,24 @@
+/*
+  Copyright (c) 2008-2012 Red Hat, Inc. <http://www.redhat.com>
+  This file is part of GlusterFS.
+
+  This file is licensed to you under your choice of the GNU Lesser
+  General Public License, version 3 or any later version (LGPLv3 or
+  later), or the GNU General Public License, version 2 (GPLv2), in all
+  cases as published by the Free Software Foundation.
+*/
+
+#ifndef __WB_MEM_TYPES_H__
+#define __WB_MEM_TYPES_H__
+
+#include <glusterfs/mem-types.h>
+
+enum gf_wb_mem_types_ {
+    gf_wb_mt_wb_file_t = gf_common_mt_end + 1,
+    gf_wb_mt_wb_request_t,
+    gf_wb_mt_iovec,
+    gf_wb_mt_wb_conf_t,
+    gf_wb_mt_wb_inode_t,
+    gf_wb_mt_end
+};
+#endif
diff --git a/xlators/performance/write-behind/src/write-behind-messages.h b/xlators/performance/write-behind/src/write-behind-messages.h
new file mode 100644
index 00000000000..e9ea474879b
--- /dev/null
+++ b/xlators/performance/write-behind/src/write-behind-messages.h
@@ -0,0 +1,31 @@
+/*Copyright (c) 2015 Red Hat, Inc. <http://www.redhat.com>
+  This file is part of GlusterFS.
+
+  This file is licensed to you under your choice of the GNU Lesser
+  General Public License, version 3 or any later version (LGPLv3 or
+  later), or the GNU General Public License, version 2 (GPLv2), in all
+  cases as published by the Free Software Foundation.
+*/
+
+#ifndef _WRITE_BEHIND_MESSAGES_H_
+#define _WRITE_BEHIND_MESSAGES_H_
+
+#include <glusterfs/glfs-message-id.h>
+
+/* To add new message IDs, append new identifiers at the end of the list.
+ *
+ * Never remove a message ID. If it's not used anymore, you can rename it or
+ * leave it as it is, but not delete it. This is to prevent reutilization of
+ * IDs by other messages.
+ *
+ * The component name must match one of the entries defined in
+ * glfs-message-id.h.
+ */
+
+GLFS_MSGID(WRITE_BEHIND, WRITE_BEHIND_MSG_EXCEEDED_MAX_SIZE,
+           WRITE_BEHIND_MSG_INIT_FAILED, WRITE_BEHIND_MSG_INVALID_ARGUMENT,
+           WRITE_BEHIND_MSG_NO_MEMORY, WRITE_BEHIND_MSG_SIZE_NOT_SET,
+           WRITE_BEHIND_MSG_VOL_MISCONFIGURED,
+           WRITE_BEHIND_MSG_RES_UNAVAILABLE);
+
+#endif /* _WRITE_BEHIND_MESSAGES_H_ */
diff --git a/xlators/performance/write-behind/src/write-behind.c b/xlators/performance/write-behind/src/write-behind.c
index 86752cc946e..00cfca016e6 100644
--- a/xlators/performance/write-behind/src/write-behind.c
+++ b/xlators/performance/write-behind/src/write-behind.c
@@ -1,1444 +1,3278 @@
 /*
-  Copyright (c) 2006-2009 Z RESEARCH, Inc. <http://www.zresearch.com>
+  Copyright (c) 2008-2012 Red Hat, Inc. <http://www.redhat.com>
   This file is part of GlusterFS.
 
-  GlusterFS is free software; you can redistribute it and/or modify
-  it under the terms of the GNU General Public License as published
-  by the Free Software Foundation; either version 3 of the License,
-  or (at your option) any later version.
+  This file is licensed to you under your choice of the GNU Lesser
+  General Public License, version 3 or any later version (LGPLv3 or
+  later), or the GNU General Public License, version 2 (GPLv2), in all
+  cases as published by the Free Software Foundation.
+*/
 
-  GlusterFS is distributed in the hope that it will be useful, but
-  WITHOUT ANY WARRANTY; without even the implied warranty of
-  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
-  General Public License for more details.
+#include <glusterfs/glusterfs.h>
+#include <glusterfs/logging.h>
+#include <glusterfs/dict.h>
+#include <glusterfs/xlator.h>
+#include <glusterfs/list.h>
+#include <glusterfs/compat.h>
+#include <glusterfs/compat-errno.h>
+#include <glusterfs/common-utils.h>
+#include <glusterfs/call-stub.h>
+#include <glusterfs/statedump.h>
+#include <glusterfs/defaults.h>
+#include "write-behind-mem-types.h"
+#include "write-behind-messages.h"
 
-  You should have received a copy of the GNU General Public License
-  along with this program.  If not, see
-  <http://www.gnu.org/licenses/>.
-*/
+#define MAX_VECTOR_COUNT 8
+#define WB_AGGREGATE_SIZE 131072 /* 128 KB */
+#define WB_WINDOW_SIZE 1048576   /* 1MB */
 
-/*TODO: check for non null wb_file_data before getting wb_file */
+typedef struct list_head list_head_t;
+struct wb_conf;
+struct wb_inode;
+
+typedef struct wb_inode {
+    ssize_t window_conf;
+    ssize_t window_current;
+    ssize_t transit; /* size of data stack_wound, and yet
+                        to be fulfilled (wb_fulfill_cbk).
+                        used for trickling_writes
+                     */
+
+    list_head_t all;        /* All requests, from enqueue() till destroy().
+                               Used only for resetting generation
+                               number when empty.
+                            */
+    list_head_t todo;       /* Work to do (i.e, STACK_WIND to server).
+                               Once we STACK_WIND, the entry is taken
+                               off the list. If it is non-sync write,
+                               then we continue to track it via @liability
+                               or @temptation depending on the status
+                               of its writeback.
+                            */
+    list_head_t liability;  /* Non-sync writes which are lied
+                               (STACK_UNWIND'ed to caller) but ack
+                               from server not yet complete. This
+                               is the "liability" which we hold, and
+                               must guarantee that dependent operations
+                               which arrive later (which overlap, etc.)
+                               are issued only after their dependencies
+                               in this list are "fulfilled".
+ 
+                               Server acks for entries in this list
+                               shrink the window.
+ 
+                               The sum total of all req->write_size
+                               of entries in this list must be kept less
+                               than the permitted window size.
+                            */
+    list_head_t temptation; /* Operations for which we are tempted
+                               to 'lie' (write-behind), but temporarily
+                               holding off (because of insufficient
+                               window capacity, etc.)
+
+                               This is the list to look at to grow
+                               the window (in __wb_pick_unwinds()).
+
+                               Entries typically get chosen for
+                               write-behind from this list, and therefore
+                               get "upgraded" to the "liability" list.
+                       */
+    list_head_t wip;        /* List of write calls in progress, SYNC or non-SYNC
+                               which are currently STACK_WIND'ed towards the server.
+                               This is for guaranteeing that no two overlapping
+                               writes are in progress at the same time. Modules
+                               like eager-lock in AFR depend on this behavior.
+                            */
+    list_head_t invalidate_list; /* list of wb_inodes that were marked for
+                                  * iatt invalidation due to requests in
+                                  * liability queue fulfilled while there
+                                  * was a readdirp session on parent
+                                  * directory. For a directory inode, this
+                                  * list points to list of children.
+                                  */
+    uint64_t gen;                /* Liability generation number. Represents
+                                    the current 'state' of liability. Every
+                                    new addition to the liability list bumps
+                                    the generation number.
+               
+                                    a newly arrived request is only required
+                                    to perform causal checks against the entries
+                                    in the liability list which were present
+                                    at the time of its addition. the generation
+                                    number at the time of its addition is stored
+                                    in the request and used during checks.
+               
+                                    the liability list can grow while the request
+                                    waits in the todo list waiting for its
+                                    dependent operations to complete. however
+                                    it is not of the request's concern to depend
+                                    itself on those new entries which arrived
+                                    after it arrived (i.e, those that have a
+                                    liability generation higher than itself)
+                                 */
+    size_t size; /* Size of the file to catch write after EOF. */
+    gf_lock_t lock;
+    xlator_t *this;
+    inode_t *inode;
+    int dontsync; /* If positive, don't pick lies for
+                   * winding. This is needed to break infinite
+                   * recursion during invocation of
+                   * wb_process_queue from
+                   * wb_fulfill_cbk in case of an
+                   * error during fulfill.
+                   */
+    gf_atomic_int32_t readdirps;
+    gf_atomic_int8_t invalidate;
+
+} wb_inode_t;
+
+typedef struct wb_request {
+    list_head_t all;
+    list_head_t todo;
+    list_head_t lie; /* either in @liability or @temptation */
+    list_head_t winds;
+    list_head_t unwinds;
+    list_head_t wip;
+
+    call_stub_t *stub;
+
+    ssize_t write_size; /* currently held size
+                           (after collapsing) */
+    size_t orig_size;   /* size which arrived with the request.
+                           This is the size by which we grow
+                           the window when unwinding the frame.
+                        */
+    size_t total_size;  /* valid only in @head in wb_fulfill().
+                           This is the size with which we perform
+                           STACK_WIND to server and therefore the
+                           amount by which we shrink the window.
+                        */
 
+    int op_ret;
+    int op_errno;
+
+    int32_t refcount;
+    wb_inode_t *wb_inode;
+    glusterfs_fop_t fop;
+    gf_lkowner_t lk_owner;
+    pid_t client_pid;
+    struct iobref *iobref;
+    uint64_t gen; /* inode liability state at the time of
+                     request arrival */
+
+    fd_t *fd;
+    int wind_count; /* number of sync-attempts. Only
+                       for debug purposes */
+    struct {
+        size_t size; /* 0 size == till infinity */
+        off_t off;
+        int append : 1;    /* offset is invalid. only one
+                              outstanding append at a time */
+        int tempted : 1;   /* true only for non-sync writes */
+        int lied : 1;      /* sin committed */
+        int fulfilled : 1; /* got server acknowledgement */
+        int go : 1;        /* enough aggregating, good to go */
+    } ordering;
+
+    /* for debug purposes. A request might outlive the fop it is
+     * representing. So, preserve essential info for logging.
+     */
+    uint64_t unique;
+    uuid_t gfid;
+} wb_request_t;
+
+typedef struct wb_conf {
+    uint64_t aggregate_size;
+    uint64_t page_size;
+    uint64_t window_size;
+    gf_boolean_t flush_behind;
+    gf_boolean_t trickling_writes;
+    gf_boolean_t strict_write_ordering;
+    gf_boolean_t strict_O_DIRECT;
+    gf_boolean_t resync_after_fsync;
+} wb_conf_t;
+
+wb_inode_t *
+__wb_inode_ctx_get(xlator_t *this, inode_t *inode)
+{
+    uint64_t value = 0;
+    wb_inode_t *wb_inode = NULL;
+    int ret = 0;
 
-#ifndef _CONFIG_H
-#define _CONFIG_H
-#include "config.h"
-#endif
+    ret = __inode_ctx_get(inode, this, &value);
+    if (ret)
+        return NULL;
 
-#include "glusterfs.h"
-#include "logging.h"
-#include "dict.h"
-#include "xlator.h"
-#include "list.h"
-#include "compat.h"
-#include "compat-errno.h"
-#include "common-utils.h"
+    wb_inode = (wb_inode_t *)(unsigned long)value;
 
-#define MAX_VECTOR_COUNT 8
- 
-typedef struct list_head list_head_t;
-struct wb_conf;
-struct wb_page;
-struct wb_file;
+    return wb_inode;
+}
 
+wb_inode_t *
+wb_inode_ctx_get(xlator_t *this, inode_t *inode)
+{
+    wb_inode_t *wb_inode = NULL;
+
+    GF_VALIDATE_OR_GOTO("write-behind", this, out);
+    GF_VALIDATE_OR_GOTO(this->name, inode, out);
+
+    LOCK(&inode->lock);
+    {
+        wb_inode = __wb_inode_ctx_get(this, inode);
+    }
+    UNLOCK(&inode->lock);
+out:
+    return wb_inode;
+}
 
-struct wb_conf {
-        uint64_t aggregate_size;
-        uint64_t window_size;
-        uint64_t disable_till;
-        gf_boolean_t enable_O_SYNC;
-        gf_boolean_t flush_behind;
-};
+static void
+wb_set_invalidate(wb_inode_t *wb_inode)
+{
+    int readdirps = 0;
+    inode_t *parent_inode = NULL;
+    wb_inode_t *wb_parent_inode = NULL;
 
+    parent_inode = inode_parent(wb_inode->inode, NULL, NULL);
+    if (parent_inode)
+        wb_parent_inode = wb_inode_ctx_get(wb_inode->this, parent_inode);
 
-typedef struct wb_local {
-        list_head_t winds;
-        struct wb_file *file;
-        list_head_t unwind_frames;
-        int op_ret;
-        int op_errno;
-        call_frame_t *frame;
-} wb_local_t;
-
-
-typedef struct write_request {
-        call_frame_t *frame;
-        off_t offset;
-        /*  int32_t op_ret;
-            int32_t op_errno; */
-        struct iovec *vector;
-        int32_t count;
-        dict_t *refs;
-        char write_behind;
-        char stack_wound;
-        char got_reply;
-        list_head_t list;
-        list_head_t winds;
-        /*  list_head_t unwinds;*/
-} wb_write_request_t;
-
-
-struct wb_file {
-        int disabled;
-        uint64_t disable_till;
-        off_t offset;
-        size_t window_size;
-        int32_t refcount;
-        int32_t op_ret;
-        int32_t op_errno;
-        list_head_t request;
-        fd_t *fd;
-        gf_lock_t lock;
-        xlator_t *this;
-};
+    if (wb_parent_inode) {
+        LOCK(&wb_parent_inode->lock);
+        {
+            readdirps = GF_ATOMIC_GET(wb_parent_inode->readdirps);
+            if (readdirps && list_empty(&wb_inode->invalidate_list)) {
+                inode_ref(wb_inode->inode);
+                GF_ATOMIC_INIT(wb_inode->invalidate, 1);
+                list_add(&wb_inode->invalidate_list,
+                         &wb_parent_inode->invalidate_list);
+            }
+        }
+        UNLOCK(&wb_parent_inode->lock);
+    } else {
+        GF_ATOMIC_INIT(wb_inode->invalidate, 0);
+    }
+
+    if (parent_inode)
+        inode_unref(parent_inode);
+
+    return;
+}
 
+void
+wb_process_queue(wb_inode_t *wb_inode);
 
-typedef struct wb_conf wb_conf_t;
-typedef struct wb_page wb_page_t;
-typedef struct wb_file wb_file_t;
+/*
+  Below is a succinct explanation of the code deciding whether two regions
+  overlap, from Pavan <tcp@gluster.com>.
 
+  For any two ranges to be non-overlapping, either the end of the first
+  range is less than the start of the second, or vice versa. Example -
 
-int32_t 
-wb_process_queue (call_frame_t *frame, wb_file_t *file, char flush_all);
+  <--------->       <-------------->
+  p         q       x              y
 
-int32_t
-wb_sync (call_frame_t *frame, wb_file_t *file, list_head_t *winds);
+  (q < x) or (y < p) => No overlap.
 
-int32_t
-wb_sync_all (call_frame_t *frame, wb_file_t *file);
+  To check for *overlap*, we can negate this (using De Morgan's laws), and
+  it becomes -
 
-int32_t 
-__wb_mark_winds (list_head_t *list, list_head_t *winds, size_t aggregate_size);
+  (q >= x ) and (y >= p)
 
+  Either that, or you write the negation using -
 
-wb_file_t *
-wb_file_create (xlator_t *this,
-                fd_t *fd)
+  if (! ((q < x) or (y < p)) ) {
+  "Overlap"
+  }
+*/
+
+gf_boolean_t
+wb_requests_overlap(wb_request_t *req1, wb_request_t *req2)
 {
-        wb_file_t *file = NULL;
-        wb_conf_t *conf = this->private; 
+    uint64_t r1_start = 0;
+    uint64_t r1_end = 0;
+    uint64_t r2_start = 0;
+    uint64_t r2_end = 0;
+    gf_boolean_t do_overlap = _gf_false;
+
+    r1_start = req1->ordering.off;
+    if (req1->ordering.size)
+        r1_end = r1_start + req1->ordering.size - 1;
+    else
+        r1_end = ULLONG_MAX;
+
+    r2_start = req2->ordering.off;
+    if (req2->ordering.size)
+        r2_end = r2_start + req2->ordering.size - 1;
+    else
+        r2_end = ULLONG_MAX;
+
+    do_overlap = ((r1_end >= r2_start) && (r2_end >= r1_start));
+
+    return do_overlap;
+}
+
+gf_boolean_t
+wb_requests_conflict(wb_request_t *lie, wb_request_t *req)
+{
+    wb_conf_t *conf = NULL;
+
+    conf = req->wb_inode->this->private;
 
-        file = CALLOC (1, sizeof (*file));
-        INIT_LIST_HEAD (&file->request);
+    if (lie == req)
+        /* request cannot conflict with itself */
+        return _gf_false;
 
-        /* fd_ref() not required, file should never decide the existance of
-         * an fd */
-        file->fd= fd;
-        file->disable_till = conf->disable_till;
-        file->this = this;
-        file->refcount = 1;
+    if (lie->gen >= req->gen)
+        /* this liability entry was behind
+           us in the todo list */
+        return _gf_false;
 
-        fd_ctx_set (fd, this, (uint64_t)(long)file);
-        
-        return file;
+    if (lie->ordering.append)
+        /* all modifications wait for the completion
+           of outstanding append */
+        return _gf_true;
+
+    if (conf->strict_write_ordering)
+        /* We are sure (lie->gen < req->gen) by now. So
+           skip overlap check if strict write ordering is
+           requested and always return "conflict" against a
+           lower generation lie. */
+        return _gf_true;
+
+    return wb_requests_overlap(lie, req);
 }
 
-void
-wb_file_destroy (wb_file_t *file)
+wb_request_t *
+wb_liability_has_conflict(wb_inode_t *wb_inode, wb_request_t *req)
 {
-        int32_t refcount = 0;
+    wb_request_t *each = NULL;
+
+    list_for_each_entry(each, &wb_inode->liability, lie)
+    {
+        if (wb_requests_conflict(each, req) && (!each->ordering.fulfilled))
+            /* A fulfilled request shouldn't block another
+             * request (even a dependent one) from winding.
+             */
+            return each;
+    }
+
+    return NULL;
+}
 
-        LOCK (&file->lock);
-        {
-                refcount = --file->refcount;
+wb_request_t *
+wb_wip_has_conflict(wb_inode_t *wb_inode, wb_request_t *req)
+{
+    wb_request_t *each = NULL;
+
+    if (req->stub->fop != GF_FOP_WRITE)
+        /* non-writes fundamentally never conflict with WIP requests */
+        return NULL;
+
+    list_for_each_entry(each, &wb_inode->wip, wip)
+    {
+        if (each == req)
+            /* request never conflicts with itself,
+               though this condition should never occur.
+            */
+            continue;
+
+        if (wb_requests_overlap(each, req))
+            return each;
+    }
+
+    return NULL;
+}
+
+static int
+__wb_request_unref(wb_request_t *req)
+{
+    int ret = -1;
+    wb_inode_t *wb_inode = NULL;
+    char gfid[64] = {
+        0,
+    };
+
+    wb_inode = req->wb_inode;
+
+    if (req->refcount <= 0) {
+        uuid_utoa_r(req->gfid, gfid);
+
+        gf_msg(
+            "wb-request", GF_LOG_WARNING, 0, WRITE_BEHIND_MSG_RES_UNAVAILABLE,
+            "(unique=%" PRIu64 ", fop=%s, gfid=%s, gen=%" PRIu64
+            "): "
+            "refcount(%d) is <= 0 ",
+            req->unique, gf_fop_list[req->fop], gfid, req->gen, req->refcount);
+        goto out;
+    }
+
+    ret = --req->refcount;
+    if (req->refcount == 0) {
+        uuid_utoa_r(req->gfid, gfid);
+
+        gf_log_callingfn(wb_inode->this->name, GF_LOG_DEBUG,
+                         "(unique = %" PRIu64
+                         ", fop=%s, gfid=%s, "
+                         "gen=%" PRIu64
+                         "): destroying request, "
+                         "removing from all queues",
+                         req->unique, gf_fop_list[req->fop], gfid, req->gen);
+
+        list_del_init(&req->todo);
+        list_del_init(&req->lie);
+        list_del_init(&req->wip);
+
+        list_del_init(&req->all);
+        if (list_empty(&wb_inode->all)) {
+            wb_inode->gen = 0;
+            /* in case of accounting errors? */
+            wb_inode->window_current = 0;
         }
-        UNLOCK (&file->lock);
 
-        if (!refcount){
-                LOCK_DESTROY (&file->lock);
-                FREE (file);
+        list_del_init(&req->winds);
+        list_del_init(&req->unwinds);
+
+        if (req->stub) {
+            call_stub_destroy(req->stub);
+            req->stub = NULL;
         }
 
-        return;
+        if (req->iobref)
+            iobref_unref(req->iobref);
+
+        if (req->fd)
+            fd_unref(req->fd);
+
+        GF_FREE(req);
+    }
+out:
+    return ret;
 }
 
+static int
+wb_request_unref(wb_request_t *req)
+{
+    wb_inode_t *wb_inode = NULL;
+    int ret = -1;
 
-int32_t
-wb_sync_cbk (call_frame_t *frame,
-             void *cookie,
-             xlator_t *this,
-             int32_t op_ret,
-             int32_t op_errno,
-             struct stat *stbuf)
-{
-        wb_local_t *local = NULL;
-        list_head_t *winds = NULL;
-        wb_file_t *file = NULL;
-        wb_write_request_t *request = NULL, *dummy = NULL;
-
-        local = frame->local;
-        winds = &local->winds;
-        file = local->file;
-
-        LOCK (&file->lock);
-        {
-                list_for_each_entry_safe (request, dummy, winds, winds) {
-                        request->got_reply = 1;
-                        if (!request->write_behind && (op_ret == -1)) {
-                                wb_local_t *per_request_local = request->frame->local;
-                                per_request_local->op_ret = op_ret;
-                                per_request_local->op_errno = op_errno;
-                        }
-
-                        /*
-                          request->op_ret = op_ret;
-                          request->op_errno = op_errno; 
-                        */
+    GF_VALIDATE_OR_GOTO("write-behind", req, out);
+
+    wb_inode = req->wb_inode;
+
+    LOCK(&wb_inode->lock);
+    {
+        ret = __wb_request_unref(req);
+    }
+    UNLOCK(&wb_inode->lock);
+
+out:
+    return ret;
+}
+
+static wb_request_t *
+__wb_request_ref(wb_request_t *req)
+{
+    GF_VALIDATE_OR_GOTO("write-behind", req, out);
+
+    if (req->refcount < 0) {
+        gf_msg("wb-request", GF_LOG_WARNING, 0,
+               WRITE_BEHIND_MSG_RES_UNAVAILABLE, "refcount(%d) is < 0",
+               req->refcount);
+        req = NULL;
+        goto out;
+    }
+
+    req->refcount++;
+
+out:
+    return req;
+}
+
+wb_request_t *
+wb_request_ref(wb_request_t *req)
+{
+    wb_inode_t *wb_inode = NULL;
+
+    GF_VALIDATE_OR_GOTO("write-behind", req, out);
+
+    wb_inode = req->wb_inode;
+    LOCK(&wb_inode->lock);
+    {
+        req = __wb_request_ref(req);
+    }
+    UNLOCK(&wb_inode->lock);
+
+out:
+    return req;
+}
+
+gf_boolean_t
+wb_enqueue_common(wb_inode_t *wb_inode, call_stub_t *stub, int tempted)
+{
+    wb_request_t *req = NULL;
+    inode_t *inode = NULL;
+
+    GF_VALIDATE_OR_GOTO("write-behind", wb_inode, out);
+    GF_VALIDATE_OR_GOTO(wb_inode->this->name, stub, out);
+
+    req = GF_CALLOC(1, sizeof(*req), gf_wb_mt_wb_request_t);
+    if (!req)
+        goto out;
+
+    INIT_LIST_HEAD(&req->all);
+    INIT_LIST_HEAD(&req->todo);
+    INIT_LIST_HEAD(&req->lie);
+    INIT_LIST_HEAD(&req->winds);
+    INIT_LIST_HEAD(&req->unwinds);
+    INIT_LIST_HEAD(&req->wip);
+
+    req->stub = stub;
+    req->wb_inode = wb_inode;
+    req->fop = stub->fop;
+    req->ordering.tempted = tempted;
+    req->unique = stub->frame->root->unique;
+
+    inode = ((stub->args.fd != NULL) ? stub->args.fd->inode
+                                     : stub->args.loc.inode);
+
+    if (inode)
+        gf_uuid_copy(req->gfid, inode->gfid);
+
+    if (stub->fop == GF_FOP_WRITE) {
+        req->write_size = iov_length(stub->args.vector, stub->args.count);
+
+        /* req->write_size can change as we collapse
+           small writes. But the window needs to grow
+           only by how much we acknowledge the app. so
+           copy the original size in orig_size for the
+           purpose of accounting.
+        */
+        req->orig_size = req->write_size;
+
+        /* Let's be optimistic that we can
+           lie about it
+        */
+        req->op_ret = req->write_size;
+        req->op_errno = 0;
+
+        if (stub->args.fd && (stub->args.fd->flags & O_APPEND))
+            req->ordering.append = 1;
+    }
+
+    req->lk_owner = stub->frame->root->lk_owner;
+    req->client_pid = stub->frame->root->pid;
+
+    switch (stub->fop) {
+        case GF_FOP_WRITE:
+            LOCK(&wb_inode->lock);
+            {
+                if (wb_inode->size < stub->args.offset) {
+                    req->ordering.off = wb_inode->size;
+                    req->ordering.size = stub->args.offset + req->write_size -
+                                         wb_inode->size;
+                } else {
+                    req->ordering.off = stub->args.offset;
+                    req->ordering.size = req->write_size;
                 }
-        }
-        UNLOCK (&file->lock);
 
-        if (op_ret == -1)
-        {
-                file->op_ret = op_ret;
-                file->op_errno = op_errno;
+                if (wb_inode->size < stub->args.offset + req->write_size)
+                    wb_inode->size = stub->args.offset + req->write_size;
+            }
+            UNLOCK(&wb_inode->lock);
+
+            req->fd = fd_ref(stub->args.fd);
+
+            break;
+        case GF_FOP_READ:
+            req->ordering.off = stub->args.offset;
+            req->ordering.size = stub->args.size;
+
+            req->fd = fd_ref(stub->args.fd);
+
+            break;
+        case GF_FOP_TRUNCATE:
+            req->ordering.off = stub->args.offset;
+            req->ordering.size = 0; /* till infinity */
+            LOCK(&wb_inode->lock);
+            {
+                wb_inode->size = req->ordering.off;
+            }
+            UNLOCK(&wb_inode->lock);
+            break;
+        case GF_FOP_FTRUNCATE:
+            req->ordering.off = stub->args.offset;
+            req->ordering.size = 0; /* till infinity */
+            LOCK(&wb_inode->lock);
+            {
+                wb_inode->size = req->ordering.off;
+            }
+            UNLOCK(&wb_inode->lock);
+
+            req->fd = fd_ref(stub->args.fd);
+
+            break;
+        default:
+            if (stub && stub->args.fd)
+                req->fd = fd_ref(stub->args.fd);
+
+            break;
+    }
+
+    LOCK(&wb_inode->lock);
+    {
+        list_add_tail(&req->all, &wb_inode->all);
+
+        req->gen = wb_inode->gen;
+
+        list_add_tail(&req->todo, &wb_inode->todo);
+        __wb_request_ref(req); /* for wind */
+
+        if (req->ordering.tempted) {
+            list_add_tail(&req->lie, &wb_inode->temptation);
+            __wb_request_ref(req); /* for unwind */
         }
+    }
+    UNLOCK(&wb_inode->lock);
 
-        wb_process_queue (frame, file, 0);  
-  
-        /* safe place to do fd_unref */
-        fd_unref (file->fd);
+out:
+    if (!req)
+        return _gf_false;
 
-        STACK_DESTROY (frame->root);
+    return _gf_true;
+}
 
-        return 0;
+gf_boolean_t
+wb_enqueue(wb_inode_t *wb_inode, call_stub_t *stub)
+{
+    return wb_enqueue_common(wb_inode, stub, 0);
 }
 
-int32_t
-wb_sync_all (call_frame_t *frame, wb_file_t *file) 
+gf_boolean_t
+wb_enqueue_tempted(wb_inode_t *wb_inode, call_stub_t *stub)
+{
+    return wb_enqueue_common(wb_inode, stub, 1);
+}
+
+wb_inode_t *
+__wb_inode_create(xlator_t *this, inode_t *inode)
 {
-        list_head_t winds;
-        int32_t bytes = 0;
+    wb_inode_t *wb_inode = NULL;
+    wb_conf_t *conf = NULL;
+    int ret = 0;
 
-        INIT_LIST_HEAD (&winds);
+    GF_VALIDATE_OR_GOTO(this->name, inode, out);
 
-        LOCK (&file->lock);
-        {
-                bytes = __wb_mark_winds (&file->request, &winds, 0);
-        }
-        UNLOCK (&file->lock);
+    conf = this->private;
+
+    wb_inode = GF_CALLOC(1, sizeof(*wb_inode), gf_wb_mt_wb_inode_t);
+    if (!wb_inode)
+        goto out;
+
+    INIT_LIST_HEAD(&wb_inode->all);
+    INIT_LIST_HEAD(&wb_inode->todo);
+    INIT_LIST_HEAD(&wb_inode->liability);
+    INIT_LIST_HEAD(&wb_inode->temptation);
+    INIT_LIST_HEAD(&wb_inode->wip);
+    INIT_LIST_HEAD(&wb_inode->invalidate_list);
 
-        wb_sync (frame, file, &winds);
+    wb_inode->this = this;
 
-        return bytes;
+    wb_inode->window_conf = conf->window_size;
+    wb_inode->inode = inode;
+
+    LOCK_INIT(&wb_inode->lock);
+    GF_ATOMIC_INIT(wb_inode->invalidate, 0);
+    GF_ATOMIC_INIT(wb_inode->readdirps, 0);
+
+    ret = __inode_ctx_put(inode, this, (uint64_t)(unsigned long)wb_inode);
+    if (ret) {
+        GF_FREE(wb_inode);
+        wb_inode = NULL;
+    }
+
+out:
+    return wb_inode;
 }
 
+/*
+ * Return the write-behind context attached to @inode, creating one when
+ * __wb_inode_ctx_get() finds none.
+ *
+ * The lookup and the fallback __wb_inode_create() call both run with
+ * inode->lock held. The result is whatever those helpers return, so it can
+ * be NULL; it is also NULL when the GF_VALIDATE_OR_GOTO check on @inode
+ * jumps to "out" (presumably for a NULL inode - confirm against the macro).
+ */
+wb_inode_t *
+wb_inode_create(xlator_t *this, inode_t *inode)
+{
+    wb_inode_t *wb_inode = NULL;
 
-int32_t
-wb_sync (call_frame_t *frame, wb_file_t *file, list_head_t *winds)
-{
-        wb_write_request_t *dummy = NULL, *request = NULL, *first_request = NULL, *next = NULL;
-        size_t total_count = 0, count = 0;
-        size_t copied = 0;
-        call_frame_t *sync_frame = NULL;
-        dict_t *refs = NULL;
-        wb_local_t *local = NULL;
-        struct iovec *vector = NULL;
-        int32_t bytes = 0;
-        size_t bytecount = 0;
-
-        list_for_each_entry (request, winds, winds)
-        {
-                total_count += request->count;
-                bytes += iov_length (request->vector, request->count);
-        }
+    GF_VALIDATE_OR_GOTO(this->name, inode, out);
 
-        if (!total_count) {
-                return 0;
-        }
-  
-        list_for_each_entry_safe (request, dummy, winds, winds) {
-                if (!vector) {
-                        vector = MALLOC (VECTORSIZE (MAX_VECTOR_COUNT));
-                        refs = get_new_dict ();
-        
-                        local = CALLOC (1, sizeof (*local));
-                        INIT_LIST_HEAD (&local->winds);
-            
-                        first_request = request;
-                }
+    LOCK(&inode->lock);
+    {
+        /* look up and, if needed, create under the same lock hold */
+        wb_inode = __wb_inode_ctx_get(this, inode);
+        if (!wb_inode)
+            wb_inode = __wb_inode_create(this, inode);
+    }
+    UNLOCK(&inode->lock);
 
-                count += request->count;
-                bytecount = VECTORSIZE (request->count);
-                memcpy (((char *)vector)+copied,
-                        request->vector,
-                        bytecount);
-                copied += bytecount;
-      
-                if (request->refs) {
-                        dict_copy (request->refs, refs);
-                }
+out:
+    return wb_inode;
+}
 
-                next = NULL;
-                if (request->winds.next != winds) {    
-                        next = list_entry (request->winds.next, struct write_request, winds);
-                }
+/*
+ * Release a write-behind inode context: destroy its lock and free it.
+ *
+ * The todo, liability and temptation lists are asserted to be empty first.
+ * Nothing is done when the GF_VALIDATE_OR_GOTO check on @wb_inode jumps to
+ * "out" (presumably for a NULL pointer - confirm against the macro).
+ */
+void
+wb_inode_destroy(wb_inode_t *wb_inode)
+{
+    GF_VALIDATE_OR_GOTO("write-behind", wb_inode, out);
 
-                list_del_init (&request->winds);
-                list_add_tail (&request->winds, &local->winds);
-
-                if (!next || ((count + next->count) > MAX_VECTOR_COUNT)) {
-                        sync_frame = copy_frame (frame);  
-                        sync_frame->local = local;
-                        local->file = file;
-                        sync_frame->root->req_refs = dict_ref (refs);
-                        fd_ref (file->fd);
-                        STACK_WIND (sync_frame,
-                                    wb_sync_cbk,
-                                    FIRST_CHILD(sync_frame->this),
-                                    FIRST_CHILD(sync_frame->this)->fops->writev,
-                                    file->fd, vector,
-                                    count, first_request->offset);
-        
-                        dict_unref (refs);
-                        FREE (vector);
-                        first_request = NULL;
-                        refs = NULL;
-                        vector = NULL;
-                        copied = count = 0;
-                }
-        }
+    /* no request may still be queued on this context when it goes away */
+    GF_ASSERT(list_empty(&wb_inode->todo));
+    GF_ASSERT(list_empty(&wb_inode->liability));
+    GF_ASSERT(list_empty(&wb_inode->temptation));
 
-        return bytes;
+    LOCK_DESTROY(&wb_inode->lock);
+    GF_FREE(wb_inode);
+out:
+    return;
 }
 
+/*
+ * Mark @req as fulfilled and release the bookkeeping held for it.
+ *
+ * req->total_size is subtracted from the inode's window_current and transit
+ * counters, the outcome is logged at debug level, the request is unlinked
+ * from the wip list (and from its "lie" list when it had been lied about)
+ * and one reference on it is dropped.
+ *
+ * NOTE(review): the double-underscore name suggests the caller already
+ * holds wb_inode->lock - confirm against the callers.
+ */
+void
+__wb_fulfill_request(wb_request_t *req)
+{
+    wb_inode_t *ctx = req->wb_inode;
+    const char *in_liability = NULL;
+    char gfid_str[64] = {
+        0,
+    };
+
+    req->ordering.fulfilled = 1;
+
+    /* these bytes no longer count against the window or as in transit */
+    ctx->window_current -= req->total_size;
+    ctx->transit -= req->total_size;
+
+    uuid_utoa_r(req->gfid, gfid_str);
+    in_liability = req->ordering.lied ? "yes" : "no";
+
+    gf_log_callingfn(ctx->this->name, GF_LOG_DEBUG,
+                     "(unique=%" PRIu64
+                     ", fop=%s, gfid=%s, "
+                     "gen=%" PRIu64
+                     "): request fulfilled. "
+                     "removing the request from liability queue? = %s",
+                     req->unique, gf_fop_list[req->fop], gfid_str, req->gen,
+                     in_liability);
+
+    /* A lied request sits in the liability queue and can be removed from
+     * it safely. A request that was not lied about is in the temptation
+     * queue and must stay there so that wb_pick_unwinds picks it up.
+     *
+     * TODO: fail the req->frame with error if necessary (not-lied case).
+     */
+    if (req->ordering.lied)
+        list_del_init(&req->lie);
+
+    list_del_init(&req->wip);
+    __wb_request_unref(req);
+}
 
-int32_t 
-wb_stat_cbk (call_frame_t *frame,
-             void *cookie,
-             xlator_t *this,
-             int32_t op_ret,
-             int32_t op_errno,
-             struct stat *buf)
+/*
+ * Get a flush/fsync waiting on req.
+ *
+ * Walks the todo list of req->wb_inode and returns the first FLUSH or FSYNC
+ * request whose generation number is >= req->gen, or NULL when the list
+ * holds no such request.
+ */
+wb_request_t *
+__wb_request_waiting_on(wb_request_t *req)
 {
-        wb_local_t *local = NULL;
-  
-        local = frame->local;
-  
-        if (local->file)
-                fd_unref (local->file->fd);
+    wb_inode_t *wb_inode = NULL;
+    wb_request_t *trav = NULL;
 
-        STACK_UNWIND (frame, op_ret, op_errno, buf);
+    wb_inode = req->wb_inode;
 
-        return 0;
+    list_for_each_entry(trav, &wb_inode->todo, todo)
+    {
+        if (((trav->stub->fop == GF_FOP_FLUSH) ||
+             (trav->stub->fop == GF_FOP_FSYNC)) &&
+            (trav->gen >= req->gen))
+            return trav;
+    }
+
+    return NULL;
 }
 
+/*
+ * Put @req back on the inode's todo list so that it can be wound again.
+ *
+ * The request's total_size is subtracted from wb_inode->transit and then
+ * reset to 0, the request is unlinked from the winds, todo and wip lists,
+ * its ordering.go flag is cleared and it is re-inserted into
+ * wb_inode->todo with list_add(). A NULL @req is ignored.
+ */
+void
+__wb_add_request_for_retry(wb_request_t *req)
+{
+    wb_inode_t *wb_inode = NULL;
 
-int32_t
-wb_stat (call_frame_t *frame,
-         xlator_t *this,
-         loc_t *loc)
+    if (!req)
+        goto out;
+
+    wb_inode = req->wb_inode;
+
+    /* response was unwound and no waiter waiting on this request, retry
+       till a flush or fsync (subject to conf->resync_after_fsync).
+    */
+    wb_inode->transit -= req->total_size;
+
+    req->total_size = 0;
+
+    list_del_init(&req->winds);
+    list_del_init(&req->todo);
+    list_del_init(&req->wip);
+
+    /* sanitize ordering flags to retry */
+    req->ordering.go = 0;
+
+    /* Add back to todo list to retry */
+    list_add(&req->todo, &wb_inode->todo);
+
+out:
+    return;
+}
+
+/*
+ * Queue a whole wound batch for retry: every request chained on
+ * head->winds (walked in reverse) and then @head itself is handed to
+ * __wb_add_request_for_retry(). A NULL @head is ignored.
+ */
+void
+__wb_add_head_for_retry(wb_request_t *head)
 {
-        wb_file_t *file = NULL;
-        fd_t *iter_fd = NULL;
-        wb_local_t *local = NULL;
-	uint64_t tmp_file = 0;
+    wb_request_t *req = NULL, *tmp = NULL;
 
-        if (loc->inode)
-        {
-                iter_fd = fd_lookup (loc->inode, frame->root->pid);
-                if (iter_fd) {
-                        if (!fd_ctx_get (iter_fd, this, &tmp_file)) {
-				file = (wb_file_t *)(long)tmp_file;
-                        } else {
-                                fd_unref (iter_fd);
-                        }
-                }
-                if (file) {
-                        wb_sync_all (frame, file);
-                }
-        }
+    if (!head)
+        goto out;
 
-        local = CALLOC (1, sizeof (*local));
-        local->file = file;
+    /* the _safe iterator is required: each call unlinks req from
+     * head->winds
+     */
+    list_for_each_entry_safe_reverse(req, tmp, &head->winds, winds)
+    {
+        __wb_add_request_for_retry(req);
+    }
 
-        frame->local = local;
+    __wb_add_request_for_retry(head);
 
-        STACK_WIND (frame, wb_stat_cbk,
-                    FIRST_CHILD(this),
-                    FIRST_CHILD(this)->fops->stat,
-                    loc);
-        return 0;
+out:
+    return;
 }
 
+/*
+ * Locked wrapper around __wb_add_head_for_retry(): runs it on @head while
+ * holding head->wb_inode->lock. A NULL @head is ignored.
+ */
+void
+wb_add_head_for_retry(wb_request_t *head)
+{
+    if (!head)
+        goto out;
 
-int32_t 
-wb_fstat (call_frame_t *frame,
-          xlator_t *this,
-          fd_t *fd)
+    LOCK(&head->wb_inode->lock);
+    {
+        __wb_add_head_for_retry(head);
+    }
+    UNLOCK(&head->wb_inode->lock);
+
+out:
+    return;
+}
+
+/*
+ * Record a sync failure on @req and decide whether it is dropped or retried.
+ *
+ * req->op_ret/op_errno are set to -1/@op_errno. Then:
+ *  - if the request was not lied about (ordering.lied unset) it is
+ *    fulfilled right away through __wb_fulfill_request();
+ *  - if it was lied about and __wb_request_waiting_on() finds a flush/fsync
+ *    waiter, the error is copied onto the waiter; the request is fulfilled
+ *    when the waiter is a FLUSH, or an FSYNC while
+ *    conf->resync_after_fsync is unset;
+ *  - in every other case the request goes back on the todo list through
+ *    __wb_add_request_for_retry().
+ */
+void
+__wb_fulfill_request_err(wb_request_t *req, int32_t op_errno)
 {
-        wb_file_t *file = NULL;
-        wb_local_t *local = NULL;
-  	uint64_t tmp_file = 0;
+    wb_inode_t *wb_inode = NULL;
+    wb_request_t *waiter = NULL;
+    wb_conf_t *conf = NULL;
+
+    wb_inode = req->wb_inode;
+
+    conf = wb_inode->this->private;
 
-        if (fd_ctx_get (fd, this, &tmp_file)) {
-                gf_log (this->name, GF_LOG_ERROR, "returning EBADFD");
-                STACK_UNWIND (frame, -1, EBADFD, NULL);
-                return 0;
+    req->op_ret = -1;
+    req->op_errno = op_errno;
+
+    /* a waiter is only looked up for requests already answered to the app */
+    if (req->ordering.lied)
+        waiter = __wb_request_waiting_on(req);
+
+    if (!req->ordering.lied || waiter) {
+        if (!req->ordering.lied) {
+            /* response to app is still pending, send failure in
+             * response.
+             */
+        } else {
+            /* response was sent, store the error in a
+             * waiter (either an fsync or flush).
+             */
+            waiter->op_ret = -1;
+            waiter->op_errno = op_errno;
         }
 
-	file = (wb_file_t *)(long)tmp_file;
-        if (file) {
-                fd_ref (file->fd);
-                wb_sync_all (frame, file);
+        /* waiter is non-NULL whenever the first operand is false, so the
+         * dereferences below are safe
+         */
+        if (!req->ordering.lied || (waiter->stub->fop == GF_FOP_FLUSH) ||
+            ((waiter->stub->fop == GF_FOP_FSYNC) &&
+             !conf->resync_after_fsync)) {
+            /* No retry needed, forget the request */
+            __wb_fulfill_request(req);
+            return;
         }
+    }
 
-        local = CALLOC (1, sizeof (*local));
-        local->file = file;
+    __wb_add_request_for_retry(req);
 
-        frame->local = local;
-  
-        STACK_WIND (frame,
-                    wb_stat_cbk,
-                    FIRST_CHILD(this),
-                    FIRST_CHILD(this)->fops->fstat,
-                    fd);
-        return 0;
+    return;
 }
 
+/*
+ * Complete a fully written batch: with head->wb_inode->lock held, call
+ * __wb_fulfill_request() on every request chained on head->winds and then
+ * on @head itself.
+ */
+void
+wb_head_done(wb_request_t *head)
+{
+    wb_request_t *req = NULL;
+    wb_request_t *tmp = NULL;
+    wb_inode_t *wb_inode = NULL;
+
+    wb_inode = head->wb_inode;
 
-int32_t
-wb_truncate_cbk (call_frame_t *frame,
-                 void *cookie,
-                 xlator_t *this,
-                 int32_t op_ret,
-                 int32_t op_errno,
-                 struct stat *buf)
-{
-        wb_local_t *local = NULL; 
-  
-        local = frame->local;
-        if (local->file)
-                fd_unref (local->file->fd);
-
-        STACK_UNWIND (frame, op_ret, op_errno, buf);
-        return 0;
+    LOCK(&wb_inode->lock);
+    {
+        list_for_each_entry_safe(req, tmp, &head->winds, winds)
+        {
+            __wb_fulfill_request(req);
+        }
+
+        __wb_fulfill_request(head);
+    }
+    UNLOCK(&wb_inode->lock);
 }
 
+/*
+ * Fail every request of a wound batch with @op_errno.
+ *
+ * Increments head->wb_inode->dontsync, then runs
+ * __wb_fulfill_request_err() on the requests chained on head->winds
+ * (walked in reverse) and finally on @head. A NULL @head is ignored.
+ */
+void
+__wb_fulfill_err(wb_request_t *head, int op_errno)
+{
+    wb_request_t *req = NULL, *tmp = NULL;
+
+    if (!head)
+        goto out;
 
-int32_t 
-wb_truncate (call_frame_t *frame,
-             xlator_t *this,
-             loc_t *loc,
-             off_t offset)
+    head->wb_inode->dontsync++;
+
+    list_for_each_entry_safe_reverse(req, tmp, &head->winds, winds)
+    {
+        __wb_fulfill_request_err(req, op_errno);
+    }
+
+    __wb_fulfill_request_err(head, op_errno);
+
+out:
+    return;
+}
+
+/*
+ * Locked wrapper around __wb_fulfill_err(): fail the batch led by @head
+ * with @op_errno while holding the inode's write-behind lock.
+ *
+ * A NULL @head is ignored, matching wb_add_head_for_retry(); without the
+ * guard head->wb_inode would be dereferenced before __wb_fulfill_err()
+ * gets a chance to run its own NULL check.
+ */
+void
+wb_fulfill_err(wb_request_t *head, int op_errno)
+{
+    wb_inode_t *wb_inode = NULL;
+
+    if (!head)
+        goto out;
+
+    wb_inode = head->wb_inode;
+
+    LOCK(&wb_inode->lock);
+    {
+        __wb_fulfill_err(head, op_errno);
+    }
+    UNLOCK(&wb_inode->lock);
+
+out:
+    return;
+}
+
+/*
+ * Account for the first @synced_size bytes of a write having been synced.
+ *
+ * write_size shrinks and the stub's offset advances by @synced_size, and
+ * the stub's iovec count is replaced by the value iov_skip() returns for
+ * the current vector. Nothing happens for a NULL @req or a zero
+ * @synced_size.
+ */
+void
+__wb_modify_write_request(wb_request_t *req, int synced_size)
+{
+    if ((req == NULL) || !synced_size)
+        return;
+
+    req->write_size -= synced_size;
+    req->stub->args.offset += synced_size;
+
+    /* NOTE(review): iov_skip() presumably adjusts the vector in place and
+     * returns the number of entries left - confirm against its definition.
+     */
+    req->stub->args.count = iov_skip(req->stub->args.vector,
+                                     req->stub->args.count, synced_size);
+}
+
+/*
+ * Apply @size synced bytes to a single request of a short-written batch.
+ *
+ * When the request's write_size fits within @size the request is fulfilled
+ * and *fulfilled is set to 1; otherwise the request is trimmed by @size
+ * and *fulfilled is set to 0. Returns the number of bytes consumed from
+ * @size. For a NULL @req it returns 0 and leaves *fulfilled untouched.
+ */
+int
+__wb_fulfill_short_write(wb_request_t *req, int size, gf_boolean_t *fulfilled)
+{
+    int consumed = 0;
+
+    if (!req)
+        return 0;
+
+    if (req->write_size > size) {
+        /* only part of this request was written: keep the remainder */
+        __wb_modify_write_request(req, size);
+        *fulfilled = 0;
+        return size;
+    }
+
+    /* read the size first: fulfilling drops a reference on req */
+    consumed = req->write_size;
+    __wb_fulfill_request(req);
+    *fulfilled = 1;
+
+    return consumed;
+}
+
+/*
+ * Handle a short write: only @size bytes of the batch led by @head were
+ * written.
+ *
+ * With the inode lock held, @size is spent on @head first and then on the
+ * requests chained on head->winds, in list order, through
+ * __wb_fulfill_short_write(). Once @size reaches 0, "req" is left pointing
+ * at the request the traversal stopped on, or at the following one when
+ * that request was completely fulfilled and a following one exists. After
+ * the lock is dropped, "req" is handed to wb_add_head_for_retry().
+ * A NULL @head is ignored.
+ */
+void
+wb_fulfill_short_write(wb_request_t *head, int size)
 {
-        wb_file_t *file = NULL;
-        fd_t *iter_fd = NULL;
-        wb_local_t *local = NULL;
-	uint64_t tmp_file = 0;
+    wb_inode_t *wb_inode = NULL;
+    wb_request_t *req = NULL, *next = NULL;
+    int accounted_size = 0;
+    gf_boolean_t fulfilled = _gf_false;
+
+    if (!head)
+        goto out;
+
+    wb_inode = head->wb_inode;
+
+    req = head;
+
+    LOCK(&wb_inode->lock);
+    {
+        /* hold a reference to head so that __wb_fulfill_short_write
+         * won't free it. We need head for a cleaner list traversal as
+         * list_for_each_entry_safe doesn't iterate over "head" member.
+         * So, if we pass "next->winds" as head to list_for_each_entry,
+         * "next" is skipped. For a simpler logic we need to traverse
+         * the list in the order. So, we start traversal from
+         * "head->winds" and hence we want head to be alive.
+         */
+        __wb_request_ref(head);
 
-        if (loc->inode)
+        /* remember head's successor before head may be fulfilled */
+        next = list_entry(head->winds.next, wb_request_t, winds);
+
+        accounted_size = __wb_fulfill_short_write(head, size, &fulfilled);
+
+        size -= accounted_size;
+
+        if (size == 0) {
+            if (fulfilled && (next != head))
+                req = next;
+
+            goto done;
+        }
+
+        list_for_each_entry_safe(req, next, &head->winds, winds)
         {
-                iter_fd = fd_lookup (loc->inode, frame->root->pid);
-                if (iter_fd) {
-                        if (!fd_ctx_get (iter_fd, this, &tmp_file)){
-				file = (wb_file_t *)(long)tmp_file;
-                        } else {
-                                fd_unref (iter_fd);
-                        }
-                }
-    
-                if (file)
-                {
-                        wb_sync_all (frame, file);
-                }
+            accounted_size = __wb_fulfill_short_write(req, size, &fulfilled);
+            size -= accounted_size;
+
+            if (size == 0) {
+                if (fulfilled && (next != head))
+                    req = next;
+                break;
+            }
         }
-  
-        local = CALLOC (1, sizeof (*local));
-        local->file = file;
-
-        frame->local = local;
-
-        STACK_WIND (frame,
-                    wb_truncate_cbk,
-                    FIRST_CHILD(this),
-                    FIRST_CHILD(this)->fops->truncate,
-                    loc,
-                    offset);
-        return 0;
+    done:
+        __wb_request_unref(head);
+    }
+    UNLOCK(&wb_inode->lock);
+
+    wb_add_head_for_retry(req);
+out:
+    return;
 }
 
+/*
+ * Completion callback of the writev wound by wb_fulfill_head().
+ *
+ * The batch head is taken from frame->local. Depending on @op_ret the
+ * batch is failed (-1), treated as a short write (fewer bytes than
+ * head->total_size) or completed. The inode's queue is then processed
+ * again and the frame's stack is destroyed. Always returns 0.
+ */
+int
+wb_fulfill_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+               int32_t op_ret, int32_t op_errno, struct iatt *prebuf,
+               struct iatt *postbuf, dict_t *xdata)
+{
+    wb_request_t *head = frame->local;
+    wb_inode_t *wb_inode = NULL;
+
+    /* the frame must not keep pointing at the batch */
+    frame->local = NULL;
+    wb_inode = head->wb_inode;
+
+    /* A readdirp session may be in progress. This callback can remove a
+     * request from the liability queue, and wb_readdirp_cbk invalidates
+     * stats only while that queue is non-empty, so it could miss writes on
+     * this inode. Mark the inode so that its stats are invalidated in the
+     * readdirp response. This closes the race that used to be described in
+     * wb_readdirp_cbk:
+     *
+     *   entry->d_stat cannot be trusted while there are cached writes, as
+     *   it most likely does not account for them. Checking for a non-empty
+     *   liability list is not fool-proof because of sequences like:
+     *   1. readdirp is successful on posix
+     *   2. sync of cached write is successful on posix
+     *   3. write-behind receives the sync response and removes the request
+     *      from the liability queue
+     *   4. the readdirp response is processed at write-behind
+     *   in which a stale stat is sent back in the readdirp response.
+     */
+    wb_set_invalidate(wb_inode);
+
+    if (op_ret == -1)
+        wb_fulfill_err(head, op_errno);
+    else if (op_ret >= head->total_size)
+        wb_head_done(head);
+    else
+        wb_fulfill_short_write(head, op_ret);
+
+    wb_process_queue(wb_inode);
+
+    STACK_DESTROY(frame->root);
+
+    return 0;
+}
 
-int32_t
-wb_ftruncate (call_frame_t *frame,
-              xlator_t *this,
-              fd_t *fd,
-              off_t offset)
-{
-        wb_file_t *file = NULL;
-        wb_local_t *local = NULL;
-	uint64_t tmp_file = 0;
-
-        if (fd_ctx_get (fd, this, &tmp_file)) {
-                gf_log (this->name, GF_LOG_ERROR, "returning EBADFD");
-                STACK_UNWIND (frame, -1, EBADFD, NULL);
-                return 0;
+/*
+ * Append the iovecs of @req's stub to @vec starting at index @cnt, advance
+ * @cnt by the number of entries copied and add req->write_size to
+ * head->total_size. No bounds check is made on @vec; the caller must make
+ * sure the entries fit.
+ */
+#define WB_IOV_LOAD(vec, cnt, req, head)                                       \
+    do {                                                                       \
+        memcpy(&vec[cnt], req->stub->args.vector,                              \
+               (req->stub->args.count * sizeof(vec[0])));                      \
+        cnt += req->stub->args.count;                                          \
+        head->total_size += req->write_size;                                   \
+    } while (0)
+
+/*
+ * Wind one batch of writes to the first child as a single writev.
+ *
+ * The iovecs of @head and of every request chained on head->winds are
+ * gathered into a local array of MAX_VECTOR_COUNT entries, and each
+ * member's iobref is merged into head's. A new frame is created carrying
+ * head's lk_owner and client pid, head->total_size is added to
+ * wb_inode->transit under the inode lock, and the writev is wound with
+ * wb_fulfill_cbk as callback.
+ *
+ * Returns 0 after winding. When merging an iobref or creating the frame
+ * fails, the batch is failed through wb_fulfill_err(head, ENOMEM) and
+ * ENOMEM is returned.
+ */
+int
+wb_fulfill_head(wb_inode_t *wb_inode, wb_request_t *head)
+{
+    struct iovec vector[MAX_VECTOR_COUNT];
+    int count = 0;
+    wb_request_t *req = NULL;
+    call_frame_t *frame = NULL;
+
+    /* make sure head->total_size is updated before we run into any
+     * errors
+     */
+
+    WB_IOV_LOAD(vector, count, head, head);
+
+    list_for_each_entry(req, &head->winds, winds)
+    {
+        WB_IOV_LOAD(vector, count, req, head);
+
+        if (iobref_merge(head->stub->args.iobref, req->stub->args.iobref))
+            goto err;
+    }
+
+    frame = create_frame(wb_inode->this, wb_inode->this->ctx->pool);
+    if (!frame)
+        goto err;
+
+    frame->root->lk_owner = head->lk_owner;
+    frame->root->pid = head->client_pid;
+    frame->local = head;
+
+    LOCK(&wb_inode->lock);
+    {
+        wb_inode->transit += head->total_size;
+    }
+    UNLOCK(&wb_inode->lock);
+
+    STACK_WIND(frame, wb_fulfill_cbk, FIRST_CHILD(frame->this),
+               FIRST_CHILD(frame->this)->fops->writev, head->fd, vector, count,
+               head->stub->args.offset, head->stub->args.flags,
+               head->stub->args.iobref, NULL);
+
+    return 0;
+err:
+    /* iobref merge or frame creation failure */
+    wb_fulfill_err(head, ENOMEM);
+
+    return ENOMEM;
+}
+
+/*
+ * Close the current batch and start a new one led by @req.
+ *
+ * Helper for wb_fulfill(): it reads and writes that function's locals
+ * (ret, wb_inode, expected_offset, curr_aggregate, vector_count). The
+ * current @head, if any, is wound through wb_fulfill_head() and its result
+ * is OR-ed into ret; @req then becomes the new head.
+ *
+ * vector_count starts at the new head's own iovec count, because
+ * wb_fulfill_head() loads head's iovecs into the same MAX_VECTOR_COUNT
+ * sized array as those of the batch members; starting at 0 would let a
+ * batch overflow that array.
+ */
+#define NEXT_HEAD(head, req)                                                   \
+    do {                                                                       \
+        if (head)                                                              \
+            ret |= wb_fulfill_head(wb_inode, head);                            \
+        head = req;                                                            \
+        expected_offset = req->stub->args.offset + req->write_size;            \
+        curr_aggregate = 0;                                                    \
+        vector_count = req->stub->args.count;                                  \
+    } while (0)
+
+/*
+ * Wind the requests on @liabilities, grouped into batches.
+ *
+ * Requests are taken off the list in order. A request joins the current
+ * batch (is chained on head->winds) only when it has the same fd and
+ * lk_owner as the head, starts at expected_offset, and keeps the batch
+ * within conf->aggregate_size and MAX_VECTOR_COUNT; otherwise NEXT_HEAD
+ * winds the current batch and makes the request the new head. The last
+ * batch is wound after the loop.
+ *
+ * Returns the OR of all wb_fulfill_head() results (0 when every batch was
+ * wound).
+ *
+ * NOTE(review): expected_offset is only assigned inside NEXT_HEAD and is
+ * not advanced when a request is appended to the batch, so a further
+ * request contiguous with the appended one does not match it - confirm
+ * whether this is intended.
+ */
+int
+wb_fulfill(wb_inode_t *wb_inode, list_head_t *liabilities)
+{
+    wb_request_t *req = NULL;
+    wb_request_t *head = NULL;
+    wb_request_t *tmp = NULL;
+    wb_conf_t *conf = NULL;
+    off_t expected_offset = 0;
+    size_t curr_aggregate = 0;
+    size_t vector_count = 0;
+    int ret = 0;
+
+    conf = wb_inode->this->private;
+
+    list_for_each_entry_safe(req, tmp, liabilities, winds)
+    {
+        list_del_init(&req->winds);
+
+        if (!head) {
+            NEXT_HEAD(head, req);
+            continue;
+        }
+
+        if (req->fd != head->fd) {
+            NEXT_HEAD(head, req);
+            continue;
         }
 
-	file = (wb_file_t *)(long)tmp_file;
-        if (file)
-                wb_sync_all (frame, file);
+        if (!is_same_lkowner(&req->lk_owner, &head->lk_owner)) {
+            NEXT_HEAD(head, req);
+            continue;
+        }
 
-        local = CALLOC (1, sizeof (*local));
-        local->file = file;
+        if (expected_offset != req->stub->args.offset) {
+            NEXT_HEAD(head, req);
+            continue;
+        }
 
-        if (file)
-                fd_ref (file->fd);
+        if ((curr_aggregate + req->write_size) > conf->aggregate_size) {
+            NEXT_HEAD(head, req);
+            continue;
+        }
 
-        frame->local = local;
+        if (vector_count + req->stub->args.count > MAX_VECTOR_COUNT) {
+            NEXT_HEAD(head, req);
+            continue;
+        }
 
-        STACK_WIND (frame,
-                    wb_truncate_cbk,
-                    FIRST_CHILD(this),
-                    FIRST_CHILD(this)->fops->ftruncate,
-                    fd,
-                    offset);
-        return 0;
+        list_add_tail(&req->winds, &head->winds);
+        curr_aggregate += req->write_size;
+        vector_count += req->stub->args.count;
+    }
+
+    if (head)
+        ret |= wb_fulfill_head(wb_inode, head);
+
+    return ret;
 }
 
+/*
+ * Answer every request queued on @lies with a writev unwind.
+ *
+ * Each request's stub frame is unwound with the request's op_ret and
+ * op_errno; both iatt arguments point at the same zero-initialised buf.
+ * The stub's frame pointer is then cleared, the request is unlinked from
+ * the unwinds list and one reference on it is dropped. @wb_inode is not
+ * used.
+ */
+void
+wb_do_unwinds(wb_inode_t *wb_inode, list_head_t *lies)
+{
+    struct iatt buf = {
+        0,
+    };
+    call_frame_t *frame = NULL;
+    wb_request_t *lie = NULL;
+    wb_request_t *following = NULL;
+
+    list_for_each_entry_safe(lie, following, lies, unwinds)
+    {
+        frame = lie->stub->frame;
+
+        STACK_UNWIND_STRICT(writev, frame, lie->op_ret, lie->op_errno, &buf,
+                            &buf, NULL);
+
+        /* the stub must not keep the frame that was just unwound */
+        lie->stub->frame = NULL;
+
+        list_del_init(&lie->unwinds);
+        wb_request_unref(lie);
+    }
+}
 
-int32_t 
-wb_utimens_cbk (call_frame_t *frame,
-                void *cookie,
-                xlator_t *this,
-                int32_t op_ret,
-                int32_t op_errno,
-                struct stat *buf)
+/*
+ * Move requests from the inode's temptation list onto @lies so that they
+ * can be unwound.
+ *
+ * A request that is not yet fulfilled is left alone while window_current
+ * exceeds window_conf. Every picked request is unlinked from its "lie"
+ * list, has its unwinds link moved to the tail of @lies, adds its
+ * orig_size to window_current and bumps the inode generation number.
+ * Picked requests that are not yet fulfilled are appended to the liability
+ * list and flagged ordering.lied.
+ */
+void
+__wb_pick_unwinds(wb_inode_t *wb_inode, list_head_t *lies)
 {
-        wb_local_t *local = NULL;       
-  
-        local = frame->local;
-        if (local->file)
-                fd_unref (local->file->fd);
+    wb_request_t *req = NULL;
+    wb_request_t *tmp = NULL;
+    char gfid[64] = {
+        0,
+    };
+
+    list_for_each_entry_safe(req, tmp, &wb_inode->temptation, lie)
+    {
+        if (!req->ordering.fulfilled &&
+            wb_inode->window_current > wb_inode->window_conf)
+            continue;
+
+        list_del_init(&req->lie);
+        list_move_tail(&req->unwinds, lies);
+
+        wb_inode->window_current += req->orig_size;
+
+        wb_inode->gen++;
+
+        if (!req->ordering.fulfilled) {
+            /* burden increased */
+            list_add_tail(&req->lie, &wb_inode->liability);
+
+            req->ordering.lied = 1;
+
+            uuid_utoa_r(req->gfid, gfid);
+            gf_msg_debug(wb_inode->this->name, 0,
+                         "(unique=%" PRIu64
+                         ", fop=%s, gfid=%s, "
+                         "gen=%" PRIu64
+                         "): added req to liability "
+                         "queue. inode-generation-number=%" PRIu64,
+                         req->stub->frame->root->unique, gf_fop_list[req->fop],
+                         gfid, req->gen, wb_inode->gen);
+        }
+    }
 
-        STACK_UNWIND (frame, op_ret, op_errno, buf);
-        return 0;
+    return;
 }
 
-
-int32_t 
-wb_utimens (call_frame_t *frame,
-            xlator_t *this,
-            loc_t *loc,
-            struct timespec tv[2])
+/*
+ * Append the payload of @req to the buffer of @holder.
+ *
+ * On first use (holder->iobref unset) a new iobuf of
+ * max(conf->page_size, holder_len + req_len) bytes is obtained, holder's
+ * current vectors are copied into it, it becomes holder's single iovec,
+ * and a fresh iobref owning it replaces the stub's iobref. The payload of
+ * @req is then copied right behind holder's existing data and holder's
+ * iov_len, write_size and ordering.size grow by req->write_size.
+ *
+ * Returns 0 on success and a non-zero value when the iobuf or iobref
+ * cannot be set up; @holder is left unchanged in that case.
+ *
+ * NOTE(review): when holder already owns a buffer nothing here checks that
+ * the new payload still fits into it - presumably the caller guarantees
+ * that; confirm.
+ */
+int
+__wb_collapse_small_writes(wb_conf_t *conf, wb_request_t *holder,
+                           wb_request_t *req)
 {
-        wb_file_t *file = NULL;
-        fd_t *iter_fd = NULL;
-        wb_local_t *local = NULL;
-	uint64_t tmp_file = 0;
+    char *ptr = NULL;
+    struct iobuf *iobuf = NULL;
+    struct iobref *iobref = NULL;
+    int ret = -1;
+    ssize_t required_size = 0;
+    size_t holder_len = 0;
+    size_t req_len = 0;
+
+    if (!holder->iobref) {
+        holder_len = iov_length(holder->stub->args.vector,
+                                holder->stub->args.count);
+        req_len = iov_length(req->stub->args.vector, req->stub->args.count);
+
+        required_size = max((conf->page_size), (holder_len + req_len));
+        iobuf = iobuf_get2(req->wb_inode->this->ctx->iobuf_pool, required_size);
+        if (iobuf == NULL) {
+            goto out;
+        }
 
-        if (loc->inode) {
-                iter_fd = fd_lookup (loc->inode, frame->root->pid);
-                if (iter_fd) {
-                        if (!fd_ctx_get (iter_fd, this, &tmp_file)) {
-				file = (wb_file_t *)(long)tmp_file;
-                        } else {
-                                fd_unref (iter_fd);
-                        }
-                }
+        iobref = iobref_new();
+        if (iobref == NULL) {
+            iobuf_unref(iobuf);
+            goto out;
+        }
 
-                if (file)
-                        wb_sync_all (frame, file);
+        ret = iobref_add(iobref, iobuf);
+        if (ret != 0) {
+            gf_msg(req->wb_inode->this->name, GF_LOG_WARNING, -ret,
+                   WRITE_BEHIND_MSG_INVALID_ARGUMENT,
+                   "cannot add iobuf (%p) into iobref (%p)", iobuf, iobref);
+            iobuf_unref(iobuf);
+            iobref_unref(iobref);
+            goto out;
         }
 
-        local = CALLOC (1, sizeof (*local));
-        local->file = file;
+        /* move holder's existing payload into the new buffer */
+        iov_unload(iobuf->ptr, holder->stub->args.vector,
+                   holder->stub->args.count);
+        holder->stub->args.vector[0].iov_base = iobuf->ptr;
+        holder->stub->args.count = 1;
 
-        frame->local = local;
+        iobref_unref(holder->stub->args.iobref);
+        holder->stub->args.iobref = iobref;
 
-        STACK_WIND (frame,
-                    wb_utimens_cbk,
-                    FIRST_CHILD(this),
-                    FIRST_CHILD(this)->fops->utimens,
-                    loc,
-                    tv);
-        return 0;
+        iobuf_unref(iobuf);
+
+        holder->iobref = iobref_ref(iobref);
+    }
+
+    /* append right behind the data holder already carries */
+    ptr = holder->stub->args.vector[0].iov_base + holder->write_size;
+
+    iov_unload(ptr, req->stub->args.vector, req->stub->args.count);
+
+    holder->stub->args.vector[0].iov_len += req->write_size;
+    holder->write_size += req->write_size;
+    holder->ordering.size += req->write_size;
+
+    ret = 0;
+out:
+    return ret;
 }
 
-int32_t
-wb_open_cbk (call_frame_t *frame,
-             void *cookie,
-             xlator_t *this,
-             int32_t op_ret,
-             int32_t op_errno,
-             fd_t *fd)
-{
-        int32_t flags = 0;
-        wb_file_t *file = NULL;
-        wb_conf_t *conf = this->private;
-
-        if (op_ret != -1)
-        {
-                file = wb_file_create (this, fd);
-
-                /* If mandatory locking has been enabled on this file,
-                   we disable caching on it */
-
-                if ((fd->inode->st_mode & S_ISGID) && !(fd->inode->st_mode & S_IXGRP))
-                        file->disabled = 1;
-
-                /* If O_DIRECT then, we disable chaching */
-                if (frame->local)
-                {
-                        flags = *((int32_t *)frame->local);
-                        if (((flags & O_DIRECT) == O_DIRECT) || 
-                            ((flags & O_RDONLY) == O_RDONLY) ||
-                            (((flags & O_SYNC) == O_SYNC) &&
-                             conf->enable_O_SYNC == _gf_true)) { 
-                                file->disabled = 1;
-                        }
-                }
+/*
+ * Walk the inode's todo list, collapse small adjacent writes into a
+ * "holder" request and decide which holders may be wound (ordering.go).
+ *
+ * - While wb_inode->dontsync is set, requests that were already lied about
+ *   are skipped.
+ * - A request that is not tempted never collapses; when it conflicts with
+ *   the current holder (wb_requests_conflict) the holder gets its go flag.
+ * - A tempted request becomes the holder when there is none. Otherwise it
+ *   is collapsed into the holder only if it starts exactly where the
+ *   holder ends, has the same lk_owner and fd, and fits into what is left
+ *   of conf->page_size; if any of these fails the holder gets its go flag
+ *   and the request becomes the new holder.
+ * - A collapsed request is removed from the todo list and fulfilled.
+ *
+ * After the walk the last holder gets its go flag when
+ * conf->trickling_writes is set and nothing is in transit, and
+ * wb_inode->dontsync is decremented if positive.
+ */
+void
+__wb_preprocess_winds(wb_inode_t *wb_inode)
+{
+    off_t offset_expected = 0;
+    ssize_t space_left = 0;
+    wb_request_t *req = NULL;
+    wb_request_t *tmp = NULL;
+    wb_request_t *holder = NULL;
+    wb_conf_t *conf = NULL;
+    int ret = 0;
+    ssize_t page_size = 0;
+    char gfid[64] = {
+        0,
+    };
+
+    /* With asynchronous IO from a VM guest (as a file), there
+       can be two sequential writes happening in two regions
+       of the file. But individual (broken down) IO requests
+       can arrive interleaved.
+
+       TODO: cycle for each such sequence sifting
+       through the interleaved ops
+    */
+
+    conf = wb_inode->this->private;
+    page_size = conf->page_size;
+
+    list_for_each_entry_safe(req, tmp, &wb_inode->todo, todo)
+    {
+        if (wb_inode->dontsync && req->ordering.lied) {
+            /* sync has failed. Don't pick lies _again_ for winding
+             * as winding these lies again will trigger an infinite
+             * recursion of wb_process_queue being called from a
+             * failed fulfill. However, pick non-lied requests for
+             * winding so that application won't block indefinitely
+             * waiting for write result.
+             */
+
+            uuid_utoa_r(req->gfid, gfid);
+            /* the trailing space keeps the two adjacent literals from
+             * running together as "ordering.goas"
+             */
+            gf_msg_debug(wb_inode->this->name, 0,
+                         "(unique=%" PRIu64
+                         ", fop=%s, gfid=%s, "
+                         "gen=%" PRIu64
+                         "): not setting ordering.go "
+                         "as dontsync is set",
+                         req->unique, gf_fop_list[req->fop], gfid, req->gen);
+
+            continue;
+        }
 
-                LOCK_INIT (&file->lock);
+        if (!req->ordering.tempted) {
+            if (holder) {
+                if (wb_requests_conflict(holder, req))
+                    /* do not hold on write if a
+                       dependent write is in queue */
+                    holder->ordering.go = 1;
+            }
+            /* collapse only non-sync writes */
+            continue;
+        } else if (!holder) {
+            /* holder is always a non-sync write */
+            holder = req;
+            continue;
         }
 
-        STACK_UNWIND (frame, op_ret, op_errno, fd);
-        return 0;
-}
+        offset_expected = holder->stub->args.offset + holder->write_size;
 
+        if (req->stub->args.offset != offset_expected) {
+            holder->ordering.go = 1;
+            holder = req;
+            continue;
+        }
 
-int32_t
-wb_open (call_frame_t *frame,
-         xlator_t *this,
-         loc_t *loc,
-         int32_t flags,
-         fd_t *fd)
-{
-        frame->local = CALLOC (1, sizeof(int32_t));
-        *((int32_t *)frame->local) = flags;
-
-        STACK_WIND (frame,
-                    wb_open_cbk,
-                    FIRST_CHILD(this),
-                    FIRST_CHILD(this)->fops->open,
-                    loc, flags, fd);
-        return 0;
+        if (!is_same_lkowner(&req->lk_owner, &holder->lk_owner)) {
+            holder->ordering.go = 1;
+            holder = req;
+            continue;
+        }
+
+        if (req->fd != holder->fd) {
+            holder->ordering.go = 1;
+            holder = req;
+            continue;
+        }
+
+        space_left = page_size - holder->write_size;
+
+        if (space_left < req->write_size) {
+            holder->ordering.go = 1;
+            holder = req;
+            continue;
+        }
+
+        /* on failure the request simply stays on the todo list */
+        ret = __wb_collapse_small_writes(conf, holder, req);
+        if (ret)
+            continue;
+
+        /* collapsed request is as good as wound
+           (from its p.o.v)
+        */
+        list_del_init(&req->todo);
+        __wb_fulfill_request(req);
+
+        /* Only the last @holder in queue which
+
+           - does not have any non-buffered-writes following it
+           - has not yet filled its capacity
+
+           does not get its 'go' set, in anticipation of the arrival
+           of consecutive smaller writes.
+        */
+    }
+
+    /* but if trickling writes are enabled, then do not hold back
+       writes if there are no outstanding requests
+    */
+
+    if (conf->trickling_writes && !wb_inode->transit && holder)
+        holder->ordering.go = 1;
+
+    if (wb_inode->dontsync > 0)
+        wb_inode->dontsync--;
+
+    return;
 }
 
+int
+__wb_handle_failed_conflict(wb_request_t *req, wb_request_t *conflict,
+                            list_head_t *tasks)
+{
+    wb_conf_t *conf = NULL;
+    char gfid[64] = {
+        0,
+    };
+    /* @conflict failed to sync: fail @req now, or leave it queued. */
+    conf = req->wb_inode->this->private;
+
+    uuid_utoa_r(req->gfid, gfid);
+
+    if ((req->stub->fop != GF_FOP_FLUSH) &&
+        ((req->stub->fop != GF_FOP_FSYNC) || conf->resync_after_fsync)) {
+        if (!req->ordering.lied && list_empty(&conflict->wip)) {
+            /* If the request itself is in the liability queue:
+             * 1. We cannot unwind, as the response has already been
+             *    sent.
+             * 2. We cannot wind till the conflict clears up.
+             * 3. So such a request is skipped for now (check above).
+             * Otherwise, as here, resume (unwind) it with the error.
+             */
+            req->op_ret = -1;
+            req->op_errno = conflict->op_errno;
+            if ((req->stub->fop == GF_FOP_TRUNCATE) ||
+                (req->stub->fop == GF_FOP_FTRUNCATE)) {
+                req->stub->frame->local = NULL;
+            }
+
+            list_del_init(&req->todo);
+            list_add_tail(&req->winds, tasks);
+
+            gf_msg_debug(req->wb_inode->this->name, 0,
+                         "(unique=%" PRIu64
+                         ", fop=%s, gfid=%s, "
+                         "gen=%" PRIu64
+                         "): A conflicting write "
+                         "request in liability queue has failed "
+                         "to sync (error = \"%s\"), "
+                         "unwinding this request as a failure",
+                         req->unique, gf_fop_list[req->fop], gfid, req->gen,
+                         strerror(req->op_errno));
+
+            if (req->ordering.tempted) {
+                /* make sure that it won't be unwound in
+                 * wb_do_unwinds too. Otherwise there'll be
+                 * a double unwind.
+                 */
+                list_del_init(&req->lie);
+
+                gf_msg_debug(req->wb_inode->this->name, 0,
+                             "(unique=%" PRIu64
+                             ", fop=%s, "
+                             "gfid=%s, gen=%" PRIu64
+                             "): "
+                             "removed from liability queue",
+                             req->unique, gf_fop_list[req->fop], gfid,
+                             req->gen);
+
+                __wb_fulfill_request(req);
+            }
+        }
+    } else {
+        gf_msg_debug(req->wb_inode->this->name, 0,
+                     "(unique=%" PRIu64
+                     ", fop=%s, gfid=%s, "
+                     "gen=%" PRIu64
+                     "): A conflicting write request "
+                     "in liability queue has failed to sync "
+                     "(error = \"%s\"). This is an "
+                     "FSYNC/FLUSH and we need to maintain ordering "
+                     "guarantees with other writes in TODO queue. "
+                     "Hence doing nothing now",
+                     req->unique, gf_fop_list[req->fop], gfid, req->gen,
+                     strerror(conflict->op_errno));
+
+        /* flush and fsync (without conf->resync_after_fsync) act as
+           barriers. We cannot unwind them out of
+           order, when there are earlier generation writes just because
+           there is a conflicting liability with an error. So, wait for
+           our turn till there are no conflicting liabilities.
+
+           This situation can arise when there are liabilities spread
+           across multiple generations. For eg., consider two writes with
+           the following characteristics:
+
+           1. they belong to different generations gen1, gen2 and
+              (gen1 > gen2).
+           2. they overlap.
+           3. both are liabilities.
+           4. gen1 write was attempted to sync, but the attempt failed.
+           5. there was no attempt to sync gen2 write yet.
+           6. A flush (as part of close) is issued and gets a gen no
+              gen3.
+
+           In the above scenario, if flush is unwound without waiting
+           for gen1 and gen2 writes either to be successfully synced or
+           purged, we end up with these two writes in wb_inode->todo
+           list forever as there will be no attempt to process the queue
+           as flush is the last operation.
+        */
+    }
+
+    return 0;
+}
 
-int32_t
-wb_create_cbk (call_frame_t *frame,
-               void *cookie,
-               xlator_t *this,
-               int32_t op_ret,
-               int32_t op_errno,
-               fd_t *fd,
-               inode_t *inode,
-               struct stat *buf)
-{
-        wb_file_t *file = NULL;
-
-        if (op_ret != -1)
+int
+__wb_pick_winds(wb_inode_t *wb_inode, list_head_t *tasks,
+                list_head_t *liabilities)
+{
+    wb_request_t *req = NULL;
+    wb_request_t *tmp = NULL;
+    wb_request_t *conflict = NULL;
+    char req_gfid[64] =
         {
-                file = wb_file_create (this, fd);
-                /* 
-                 * If mandatory locking has been enabled on this file,
-                 * we disable caching on it
+            0,
+        },
+         conflict_gfid[64] = {
+             0,
+         };
+    /* Sort todo requests: wind (tasks/liabilities), fail, or keep waiting. */
+    list_for_each_entry_safe(req, tmp, &wb_inode->todo, todo)
+    {
+        uuid_utoa_r(req->gfid, req_gfid);
+
+        conflict = wb_liability_has_conflict(wb_inode, req);
+        if (conflict) {
+            uuid_utoa_r(conflict->gfid, conflict_gfid);
+
+            gf_msg_debug(wb_inode->this->name, 0,
+                         "Not winding request due to a "
+                         "conflicting write in liability queue. "
+                         "REQ: unique=%" PRIu64
+                         ", fop=%s, "
+                         "gen=%" PRIu64
+                         ", gfid=%s. "
+                         "CONFLICT: unique=%" PRIu64
+                         ", fop=%s, "
+                         "gen=%" PRIu64
+                         ", gfid=%s, "
+                         "conflicts-sync-failed?=%s, "
+                         "conflicts-error=%s",
+                         req->unique, gf_fop_list[req->fop], req->gen, req_gfid,
+                         conflict->unique, gf_fop_list[conflict->fop],
+                         conflict->gen, conflict_gfid,
+                         (conflict->op_ret == -1) ? "yes" : "no",
+                         strerror(conflict->op_errno));
+
+            if (conflict->op_ret == -1) {
+                /* There is a conflicting liability which failed
+                 * to sync in previous attempts, resume the req
+                 * and fail, unless it's an fsync/flush.
                  */
-                if ((fd->inode->st_mode & S_ISGID) && 
-                    !(fd->inode->st_mode & S_IXGRP))
-                {
-                        file->disabled = 1;
-                }
 
-                LOCK_INIT (&file->lock);
+                __wb_handle_failed_conflict(req, conflict, tasks);
+            } else {
+                /* There is a conflicting liability which was
+                 * not attempted to sync even once. Wait till
+                 * at least one attempt to sync is made.
+                 */
+            }
+
+            continue;
         }
 
-        STACK_UNWIND (frame, op_ret, op_errno, fd, inode, buf);
-        return 0;
-}
+        if (req->ordering.tempted && !req->ordering.go) {
+            /* wait some more */
+            gf_msg_debug(wb_inode->this->name, 0,
+                         "(unique=%" PRIu64 ", fop=%s, gen=%" PRIu64
+                         ", gfid=%s): ordering.go is not set, "
+                         "hence not winding",
+                         req->unique, gf_fop_list[req->fop], req->gen,
+                         req_gfid);
+            continue;
+        }
 
+        if (req->stub->fop == GF_FOP_WRITE) {
+            conflict = wb_wip_has_conflict(wb_inode, req);
+
+            if (conflict) {
+                uuid_utoa_r(conflict->gfid, conflict_gfid);
+
+                gf_msg_debug(wb_inode->this->name, 0,
+                             "Not winding write request as "
+                             "a conflicting write is being "
+                             "synced to backend. "
+                             "REQ: unique=%" PRIu64
+                             " fop=%s,"
+                             " gen=%" PRIu64
+                             ", gfid=%s. "
+                             "CONFLICT: unique=%" PRIu64
+                             " "
+                             "fop=%s, gen=%" PRIu64
+                             ", "
+                             "gfid=%s",
+                             req->unique, gf_fop_list[req->fop], req->gen,
+                             req_gfid, conflict->unique,
+                             gf_fop_list[conflict->fop], conflict->gen,
+                             conflict_gfid);
+                continue;
+            }
+
+            list_add_tail(&req->wip, &wb_inode->wip);
+            req->wind_count++;
+
+            if (!req->ordering.tempted)
+                /* unrefed in wb_writev_cbk */
+                req->stub->frame->local = __wb_request_ref(req);
+        }
 
-int32_t
-wb_create (call_frame_t *frame,
-           xlator_t *this,
-           loc_t *loc,
-           int32_t flags,
-           mode_t mode,
-           fd_t *fd)
-{
-        STACK_WIND (frame,
-                    wb_create_cbk,
-                    FIRST_CHILD(this),
-                    FIRST_CHILD(this)->fops->create,
-                    loc, flags, mode, fd);
-        return 0;
-}
+        gf_msg_debug(wb_inode->this->name, 0,
+                     "(unique=%" PRIu64
+                     ", fop=%s, gfid=%s, "
+                     "gen=%" PRIu64
+                     "): picking the request for "
+                     "winding",
+                     req->unique, gf_fop_list[req->fop], req_gfid, req->gen);
+
+        list_del_init(&req->todo);
 
+        if (req->ordering.tempted) {
+            list_add_tail(&req->winds, liabilities);
+        } else {
+            list_add_tail(&req->winds, tasks);
+        }
+    }
+
+    return 0;
+}
 
-int32_t 
-__wb_cleanup_queue (wb_file_t *file)
+void
+wb_do_winds(wb_inode_t *wb_inode, list_head_t *tasks)
 {
-        wb_write_request_t *request = NULL, *dummy = NULL;
-        int32_t bytes = 0;
+    wb_request_t *req = NULL;
+    wb_request_t *tmp = NULL;
 
-        list_for_each_entry_safe (request, dummy, &file->request, list)
-        {
-                if (request->got_reply && request->write_behind)
-                {
-                        bytes += iov_length (request->vector, request->count);
-                        list_del_init (&request->list);
-
-                        FREE (request->vector);
-                        dict_unref (request->refs);
-      
-                        FREE (request);
-                }
+    list_for_each_entry_safe(req, tmp, tasks, winds)
+    {
+        list_del_init(&req->winds);
+
+        if (req->op_ret == -1) {
+            call_unwind_error_keep_stub(req->stub, req->op_ret, req->op_errno);
+        } else {
+            call_resume_keep_stub(req->stub);
         }
 
-        return bytes;
+        wb_request_unref(req);
+    }
 }
 
-
-int32_t 
-__wb_mark_wind_all (list_head_t *list, list_head_t *winds)
+void
+wb_process_queue(wb_inode_t *wb_inode)
 {
-        wb_write_request_t *request = NULL;
-        size_t size = 0;
+    list_head_t tasks;
+    list_head_t lies;
+    list_head_t liabilities;
+    int wind_failure = 0;
+
+    INIT_LIST_HEAD(&tasks);
+    INIT_LIST_HEAD(&lies);
+    INIT_LIST_HEAD(&liabilities);
 
-        list_for_each_entry (request, list, list)
+    do {
+        gf_log_callingfn(wb_inode->this->name, GF_LOG_DEBUG,
+                         "processing queues");
+
+        LOCK(&wb_inode->lock);
         {
-                if (!request->stack_wound)
-                {
-                        size += iov_length (request->vector, request->count);
-                        request->stack_wound = 1;
-                        list_add_tail (&request->winds, winds);
-                }
+            __wb_preprocess_winds(wb_inode);
+
+            __wb_pick_winds(wb_inode, &tasks, &liabilities);
+
+            __wb_pick_unwinds(wb_inode, &lies);
         }
-  
-        return size;
+        UNLOCK(&wb_inode->lock);
+
+        if (!list_empty(&lies))
+            wb_do_unwinds(wb_inode, &lies);
+
+        if (!list_empty(&tasks))
+            wb_do_winds(wb_inode, &tasks);
+
+        /* If there is an error in wb_fulfill before winding write
+         * requests, we would miss invocation of wb_process_queue
+         * from wb_fulfill_cbk. So, retry processing again.
+         */
+        if (!list_empty(&liabilities))
+            wind_failure = wb_fulfill(wb_inode, &liabilities);
+    } while (wind_failure);
+
+    return;
 }
 
+void
+wb_set_inode_size(wb_inode_t *wb_inode, struct iatt *postbuf)
+{
+    GF_ASSERT(wb_inode);
+    GF_ASSERT(postbuf);
+
+    LOCK(&wb_inode->lock);
+    {
+        wb_inode->size = postbuf->ia_size;
+    }
+    UNLOCK(&wb_inode->lock);
+}
 
-size_t 
-__wb_get_aggregate_size (list_head_t *list)
+int
+wb_writev_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret,
+              int32_t op_errno, struct iatt *prebuf, struct iatt *postbuf,
+              dict_t *xdata)
 {
-        wb_write_request_t *request = NULL;
-        size_t size = 0;
+    wb_request_t *req = NULL;
+    wb_inode_t *wb_inode;
 
-        list_for_each_entry (request, list, list)
-        {
-                if (!request->stack_wound)
-                {
-                        size += iov_length (request->vector, request->count);
-                }
-        }
+    req = frame->local;
+    frame->local = NULL;
+    wb_inode = req->wb_inode;
+
+    LOCK(&req->wb_inode->lock);
+    {
+        list_del_init(&req->wip);
+    }
+    UNLOCK(&req->wb_inode->lock);
+
+    wb_request_unref(req);
 
-        return size;
+    /* requests could be pending while this was in progress */
+    wb_process_queue(wb_inode);
+
+    STACK_UNWIND_STRICT(writev, frame, op_ret, op_errno, prebuf, postbuf,
+                        xdata);
+    return 0;
 }
 
-uint32_t
-__wb_get_incomplete_writes (list_head_t *list)
+int
+wb_writev_helper(call_frame_t *frame, xlator_t *this, fd_t *fd,
+                 struct iovec *vector, int32_t count, off_t offset,
+                 uint32_t flags, struct iobref *iobref, dict_t *xdata)
 {
-        wb_write_request_t *request = NULL;
-        uint32_t count = 0;
+    STACK_WIND(frame, wb_writev_cbk, FIRST_CHILD(this),
+               FIRST_CHILD(this)->fops->writev, fd, vector, count, offset,
+               flags, iobref, xdata);
+    return 0;
+}
 
-        list_for_each_entry (request, list, list)
-        {
-                if (request->stack_wound && !request->got_reply)
-                {
-                        count++;
-                }
-        }
+int
+wb_writev(call_frame_t *frame, xlator_t *this, fd_t *fd, struct iovec *vector,
+          int32_t count, off_t offset, uint32_t flags, struct iobref *iobref,
+          dict_t *xdata)
+{
+    wb_inode_t *wb_inode = NULL;
+    wb_conf_t *conf = NULL;
+    gf_boolean_t wb_disabled = 0;
+    call_stub_t *stub = NULL;
+    int ret = -1;
+    int32_t op_errno = EINVAL;
+    int o_direct = O_DIRECT;
+    /* Entry point for writev: queue the write on the inode's wb_inode. */
+    conf = this->private;
+
+    wb_inode = wb_inode_create(this, fd->inode);
+    if (!wb_inode) {
+        op_errno = ENOMEM;
+        goto unwind;
+    }
+    /* O_DIRECT disables write-behind only under strict_O_DIRECT */
+    if (!conf->strict_O_DIRECT)
+        o_direct = 0;
+
+    if (fd->flags & (O_SYNC | O_DSYNC | o_direct))
+        wb_disabled = 1;
+
+    if (flags & (O_SYNC | O_DSYNC | o_direct))
+        wb_disabled = 1;
+    /* only the non-write-behind stub gets a resume function */
+    if (wb_disabled)
+        stub = fop_writev_stub(frame, wb_writev_helper, fd, vector, count,
+                               offset, flags, iobref, xdata);
+    else
+        stub = fop_writev_stub(frame, NULL, fd, vector, count, offset, flags,
+                               iobref, xdata);
+    if (!stub) {
+        op_errno = ENOMEM;
+        goto unwind;
+    }
+    /* write-behind candidates are enqueued as "tempted" requests */
+    if (wb_disabled)
+        ret = wb_enqueue(wb_inode, stub);
+    else
+        ret = wb_enqueue_tempted(wb_inode, stub);
+
+    if (!ret) {
+        op_errno = ENOMEM;
+        goto unwind;
+    }
+
+    wb_process_queue(wb_inode);
+
+    return 0;
+
+unwind:
+    STACK_UNWIND_STRICT(writev, frame, -1, op_errno, NULL, NULL, NULL);
+
+    if (stub)
+        call_stub_destroy(stub);
+
+    return 0;
+}
 
-        return count;
+int
+wb_readv_helper(call_frame_t *frame, xlator_t *this, fd_t *fd, size_t size,
+                off_t offset, uint32_t flags, dict_t *xdata)
+{
+    STACK_WIND(frame, default_readv_cbk, FIRST_CHILD(this),
+               FIRST_CHILD(this)->fops->readv, fd, size, offset, flags, xdata);
+    return 0;
 }
 
-int32_t
-__wb_mark_winds (list_head_t *list, list_head_t *winds, size_t aggregate_conf)
+int
+wb_readv(call_frame_t *frame, xlator_t *this, fd_t *fd, size_t size,
+         off_t offset, uint32_t flags, dict_t *xdata)
 {
-        size_t aggregate_current = 0;
-        uint32_t incomplete_writes = 0;
+    wb_inode_t *wb_inode = NULL;
+    call_stub_t *stub = NULL;
 
-        incomplete_writes = __wb_get_incomplete_writes (list); 
+    wb_inode = wb_inode_ctx_get(this, fd->inode);
+    if (!wb_inode)
+        goto noqueue;
 
-        aggregate_current = __wb_get_aggregate_size (list);
+    stub = fop_readv_stub(frame, wb_readv_helper, fd, size, offset, flags,
+                          xdata);
+    if (!stub)
+        goto unwind;
 
-        if ((incomplete_writes == 0) || (aggregate_current >= aggregate_conf))
-        {
-                __wb_mark_wind_all (list, winds);
-        }
+    if (!wb_enqueue(wb_inode, stub))
+        goto unwind;
+
+    wb_process_queue(wb_inode);
+
+    return 0;
+
+unwind:
+    STACK_UNWIND_STRICT(readv, frame, -1, ENOMEM, NULL, 0, NULL, NULL, NULL);
 
-        return aggregate_current;
+    if (stub)
+        call_stub_destroy(stub);
+    return 0;
+
+noqueue:
+    STACK_WIND(frame, default_readv_cbk, FIRST_CHILD(this),
+               FIRST_CHILD(this)->fops->readv, fd, size, offset, flags, xdata);
+    return 0;
 }
 
+int
+wb_flush_bg_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+                int32_t op_ret, int32_t op_errno, dict_t *xdata)
+{
+    STACK_DESTROY(frame->root);
+    return 0;
+}
 
-size_t
-__wb_get_window_size (list_head_t *list)
+int
+wb_flush_helper(call_frame_t *frame, xlator_t *this, fd_t *fd, dict_t *xdata)
 {
-        wb_write_request_t *request = NULL;
-        size_t size = 0;
+    wb_conf_t *conf = NULL;
+    wb_inode_t *wb_inode = NULL;
+    call_frame_t *bg_frame = NULL;
+    int32_t op_errno = 0;
+    int op_ret = 0;
+    /* Resume function of the flush stub created in wb_flush(). */
+    conf = this->private;
+
+    wb_inode = wb_inode_ctx_get(this, fd->inode);
+    if (!wb_inode) {
+        op_ret = -1;
+        op_errno = EINVAL;
+        goto unwind;
+    }
+    /* flush-behind off: wind the flush on the caller's own frame */
+    if (conf->flush_behind)
+        goto flushbehind;
+
+    STACK_WIND(frame, default_flush_cbk, FIRST_CHILD(this),
+               FIRST_CHILD(this)->fops->flush, fd, xdata);
+    return 0;
+
+flushbehind:
+    bg_frame = copy_frame(frame);
+    if (!bg_frame) {
+        op_ret = -1;
+        op_errno = ENOMEM;
+        goto unwind;
+    }
+    /* flush goes out on the copy; wb_flush_bg_cbk just destroys it */
+    STACK_WIND(bg_frame, wb_flush_bg_cbk, FIRST_CHILD(this),
+               FIRST_CHILD(this)->fops->flush, fd, xdata);
+    /* fall through: answer the caller at once with op_ret 0 */
+unwind:
+    STACK_UNWIND_STRICT(flush, frame, op_ret, op_errno, NULL);
+
+    return 0;
+}
 
-        list_for_each_entry (request, list, list)
-        {
-                if (request->write_behind && !request->got_reply)
-                {
-                        size += iov_length (request->vector, request->count);
-                }
-        }
+int
+wb_flush(call_frame_t *frame, xlator_t *this, fd_t *fd, dict_t *xdata)
+{
+    wb_inode_t *wb_inode = NULL;
+    call_stub_t *stub = NULL;
+
+    wb_inode = wb_inode_ctx_get(this, fd->inode);
+    if (!wb_inode)
+        goto noqueue;
+
+    stub = fop_flush_stub(frame, wb_flush_helper, fd, xdata);
+    if (!stub)
+        goto unwind;
 
-        return size;
+    if (!wb_enqueue(wb_inode, stub))
+        goto unwind;
+
+    wb_process_queue(wb_inode);
+
+    return 0;
+
+unwind:
+    STACK_UNWIND_STRICT(flush, frame, -1, ENOMEM, NULL);
+
+    if (stub)
+        call_stub_destroy(stub);
+
+    return 0;
+
+noqueue:
+    STACK_WIND(frame, default_flush_cbk, FIRST_CHILD(this),
+               FIRST_CHILD(this)->fops->flush, fd, xdata);
+    return 0;
 }
 
+int
+wb_fsync_helper(call_frame_t *frame, xlator_t *this, fd_t *fd, int32_t datasync,
+                dict_t *xdata)
+{
+    STACK_WIND(frame, default_fsync_cbk, FIRST_CHILD(this),
+               FIRST_CHILD(this)->fops->fsync, fd, datasync, xdata);
+    return 0;
+}
 
-size_t 
-__wb_mark_unwind_till (list_head_t *list, list_head_t *unwinds, size_t size)
+int
+wb_fsync(call_frame_t *frame, xlator_t *this, fd_t *fd, int32_t datasync,
+         dict_t *xdata)
 {
-        size_t written_behind = 0;
-        wb_write_request_t *request = NULL;
+    wb_inode_t *wb_inode = NULL;
+    call_stub_t *stub = NULL;
+    int32_t op_errno = ENOMEM; /* stub/enqueue failure, as in sibling fops */
 
-        list_for_each_entry (request, list, list)
-        {
-                if (written_behind <= size)
-                {
-                        if (!request->write_behind)
-                        {
-                                wb_local_t *local = request->frame->local;
-                                written_behind += iov_length (request->vector, request->count);
-                                request->write_behind = 1;
-                                list_add_tail (&local->unwind_frames, unwinds);
-                        }
-                }
-                else
-                {
-                        break;
-                }
-        }
+    wb_inode = wb_inode_ctx_get(this, fd->inode);
+    if (!wb_inode)
+        goto noqueue;
+    /* inode has write-behind state: enqueue the fsync as a stub */
+    stub = fop_fsync_stub(frame, wb_fsync_helper, fd, datasync, xdata);
+    if (!stub)
+        goto unwind;
+
+    if (!wb_enqueue(wb_inode, stub))
+        goto unwind;
 
-        return written_behind;
+    wb_process_queue(wb_inode);
+
+    return 0;
+
+unwind:
+    STACK_UNWIND_STRICT(fsync, frame, -1, op_errno, NULL, NULL, NULL);
+
+    if (stub)
+        call_stub_destroy(stub);
+    return 0;
+
+noqueue:
+    STACK_WIND(frame, default_fsync_cbk, FIRST_CHILD(this),
+               FIRST_CHILD(this)->fops->fsync, fd, datasync, xdata);
+    return 0;
 }
 
+int
+wb_stat_helper(call_frame_t *frame, xlator_t *this, loc_t *loc, dict_t *xdata)
+{
+    STACK_WIND(frame, default_stat_cbk, FIRST_CHILD(this),
+               FIRST_CHILD(this)->fops->stat, loc, xdata);
+    return 0;
+}
 
-int32_t 
-__wb_mark_unwinds (list_head_t *list, list_head_t *unwinds, size_t window_conf)
+int
+wb_stat(call_frame_t *frame, xlator_t *this, loc_t *loc, dict_t *xdata)
 {
-        size_t window_current = 0;
+    wb_inode_t *wb_inode = NULL;
+    call_stub_t *stub = NULL;
 
-        window_current = __wb_get_window_size (list);
-        if (window_current <= window_conf)
-        {
-                window_current += __wb_mark_unwind_till (list, unwinds,
-                                                         window_conf - window_current);
-        }
+    wb_inode = wb_inode_ctx_get(this, loc->inode);
+    if (!wb_inode)
+        goto noqueue;
+
+    stub = fop_stat_stub(frame, wb_stat_helper, loc, xdata);
+    if (!stub)
+        goto unwind;
+
+    if (!wb_enqueue(wb_inode, stub))
+        goto unwind;
+
+    wb_process_queue(wb_inode);
+
+    return 0;
+
+unwind:
+    STACK_UNWIND_STRICT(stat, frame, -1, ENOMEM, NULL, NULL);
+
+    if (stub)
+        call_stub_destroy(stub);
+    return 0;
 
-        return window_current;
+noqueue:
+    STACK_WIND(frame, default_stat_cbk, FIRST_CHILD(this),
+               FIRST_CHILD(this)->fops->stat, loc, xdata);
+    return 0;
 }
 
+int
+wb_fstat_helper(call_frame_t *frame, xlator_t *this, fd_t *fd, dict_t *xdata)
+{
+    STACK_WIND(frame, default_fstat_cbk, FIRST_CHILD(this),
+               FIRST_CHILD(this)->fops->fstat, fd, xdata);
+    return 0;
+}
+
+int
+wb_fstat(call_frame_t *frame, xlator_t *this, fd_t *fd, dict_t *xdata)
+{
+    wb_inode_t *wb_inode = NULL;
+    call_stub_t *stub = NULL;
+
+    wb_inode = wb_inode_ctx_get(this, fd->inode);
+    if (!wb_inode)
+        goto noqueue;
+
+    stub = fop_fstat_stub(frame, wb_fstat_helper, fd, xdata);
+    if (!stub)
+        goto unwind;
+
+    if (!wb_enqueue(wb_inode, stub))
+        goto unwind;
+
+    wb_process_queue(wb_inode);
+
+    return 0;
+
+unwind:
+    STACK_UNWIND_STRICT(fstat, frame, -1, ENOMEM, NULL, NULL);
+
+    if (stub)
+        call_stub_destroy(stub);
+    return 0;
+
+noqueue:
+    STACK_WIND(frame, default_fstat_cbk, FIRST_CHILD(this),
+               FIRST_CHILD(this)->fops->fstat, fd, xdata);
+    return 0;
+}
 
 int32_t
-wb_stack_unwind (list_head_t *unwinds)
+wb_truncate_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+                int32_t op_ret, int32_t op_errno, struct iatt *prebuf,
+                struct iatt *postbuf, dict_t *xdata)
 {
-        struct stat buf = {0,};
-        wb_local_t *local = NULL, *dummy = NULL;
+    GF_ASSERT(frame->local);
 
-        list_for_each_entry_safe (local, dummy, unwinds, unwind_frames)
-        {
-                list_del_init (&local->unwind_frames);
-                STACK_UNWIND (local->frame, local->op_ret, local->op_errno, &buf);
-        }
+    if (op_ret == 0)
+        wb_set_inode_size(frame->local, postbuf);
 
-        return 0;
+    frame->local = NULL;
+
+    STACK_UNWIND_STRICT(truncate, frame, op_ret, op_errno, prebuf, postbuf,
+                        xdata);
+    return 0;
+}
+
+int
+wb_truncate_helper(call_frame_t *frame, xlator_t *this, loc_t *loc,
+                   off_t offset, dict_t *xdata)
+{
+    STACK_WIND(frame, wb_truncate_cbk, FIRST_CHILD(this),
+               FIRST_CHILD(this)->fops->truncate, loc, offset, xdata);
+    return 0;
 }
 
+int
+wb_truncate(call_frame_t *frame, xlator_t *this, loc_t *loc, off_t offset,
+            dict_t *xdata)
+{
+    wb_inode_t *wb_inode = NULL;
+    call_stub_t *stub = NULL;
+
+    wb_inode = wb_inode_create(this, loc->inode);
+    if (!wb_inode)
+        goto unwind;
+
+    /* wb_truncate_cbk reads wb_inode back from local to update the size */
+    frame->local = wb_inode;
+
+    stub = fop_truncate_stub(frame, wb_truncate_helper, loc, offset, xdata);
+    if (!stub)
+        goto unwind;
+
+    if (!wb_enqueue(wb_inode, stub))
+        goto unwind;
+
+    wb_process_queue(wb_inode);
+    return 0;
+
+unwind:
+    /* local is only a pointer to wb_inode; clear it before the error
+     * unwind, as wb_ftruncate and wb_truncate_cbk do */
+    frame->local = NULL;
+    STACK_UNWIND_STRICT(truncate, frame, -1, ENOMEM, NULL, NULL, NULL);
+    if (stub)
+        call_stub_destroy(stub);
+    return 0;
+}
 
 int32_t
-wb_do_ops (call_frame_t *frame, wb_file_t *file, list_head_t *winds, list_head_t *unwinds)
+wb_ftruncate_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+                 int32_t op_ret, int32_t op_errno, struct iatt *prebuf,
+                 struct iatt *postbuf, dict_t *xdata)
 {
-        /* copy the frame before calling wb_stack_unwind, since this request containing current frame might get unwound */
-        /*  call_frame_t *sync_frame = copy_frame (frame); */
- 
-        wb_stack_unwind (unwinds);
-        wb_sync (frame, file, winds);
+    GF_ASSERT(frame->local);
 
-        return 0;
+    if (op_ret == 0)
+        wb_set_inode_size(frame->local, postbuf);
+
+    frame->local = NULL;
+
+    STACK_UNWIND_STRICT(ftruncate, frame, op_ret, op_errno, prebuf, postbuf,
+                        xdata);
+    return 0;
 }
 
+int
+wb_ftruncate_helper(call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset,
+                    dict_t *xdata)
+{
+    STACK_WIND(frame, wb_ftruncate_cbk, FIRST_CHILD(this),
+               FIRST_CHILD(this)->fops->ftruncate, fd, offset, xdata);
+    return 0;
+}
 
-int32_t 
-wb_process_queue (call_frame_t *frame, wb_file_t *file, char flush_all) 
+int
+wb_ftruncate(call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset,
+             dict_t *xdata)
 {
-        list_head_t winds, unwinds;
-        size_t size = 0;
-        wb_conf_t *conf = file->this->private;
+    wb_inode_t *wb_inode = NULL;
+    call_stub_t *stub = NULL;
+    int32_t op_errno = 0;
 
-        INIT_LIST_HEAD (&winds);
-        INIT_LIST_HEAD (&unwinds);
+    wb_inode = wb_inode_create(this, fd->inode);
+    if (!wb_inode) {
+        op_errno = ENOMEM;
+        goto unwind;
+    }
 
-        if (!file)
-        {
-                return -1;
-        }
+    frame->local = wb_inode;
 
-        size = flush_all ? 0 : conf->aggregate_size;
-        LOCK (&file->lock);
-        {
-                __wb_cleanup_queue (file);
-                __wb_mark_winds (&file->request, &winds, size);
-                __wb_mark_unwinds (&file->request, &unwinds, conf->window_size);
-        }
-        UNLOCK (&file->lock);
+    stub = fop_ftruncate_stub(frame, wb_ftruncate_helper, fd, offset, xdata);
+    if (!stub) {
+        op_errno = ENOMEM;
+        goto unwind;
+    }
 
-        wb_do_ops (frame, file, &winds, &unwinds);
-        return 0;
+    if (!wb_enqueue(wb_inode, stub)) {
+        op_errno = ENOMEM;
+        goto unwind;
+    }
+
+    wb_process_queue(wb_inode);
+
+    return 0;
+
+unwind:
+    frame->local = NULL;
+
+    STACK_UNWIND_STRICT(ftruncate, frame, -1, op_errno, NULL, NULL, NULL);
+
+    if (stub)
+        call_stub_destroy(stub);
+    return 0;
 }
 
+int
+wb_setattr_helper(call_frame_t *frame, xlator_t *this, loc_t *loc,
+                  struct iatt *stbuf, int32_t valid, dict_t *xdata)
+{
+    STACK_WIND(frame, default_setattr_cbk, FIRST_CHILD(this),
+               FIRST_CHILD(this)->fops->setattr, loc, stbuf, valid, xdata);
+    return 0;
+}
 
-wb_write_request_t *
-wb_enqueue (wb_file_t *file, 
-            call_frame_t *frame,
-            struct iovec *vector,
-            int32_t count,
-            off_t offset)
+int
+wb_setattr(call_frame_t *frame, xlator_t *this, loc_t *loc, struct iatt *stbuf,
+           int32_t valid, dict_t *xdata)
 {
-        wb_write_request_t *request = NULL;
-        wb_local_t *local = CALLOC (1, sizeof (*local));
+    wb_inode_t *wb_inode = NULL;
+    call_stub_t *stub = NULL;
 
-        request = CALLOC (1, sizeof (*request));
+    wb_inode = wb_inode_ctx_get(this, loc->inode);
+    if (!wb_inode)
+        goto noqueue;
 
-        INIT_LIST_HEAD (&request->list);
-        INIT_LIST_HEAD (&request->winds);
+    stub = fop_setattr_stub(frame, wb_setattr_helper, loc, stbuf, valid, xdata);
+    if (!stub)
+        goto unwind;
 
-        request->frame = frame;
-        request->vector = iov_dup (vector, count);
-        request->count = count;
-        request->offset = offset;
-        request->refs = dict_ref (frame->root->req_refs);
+    if (!wb_enqueue(wb_inode, stub))
+        goto unwind;
 
-        frame->local = local;
-        local->frame = frame;
-        local->op_ret = iov_length (vector, count);
-        local->op_errno = 0;
-        INIT_LIST_HEAD (&local->unwind_frames);
+    wb_process_queue(wb_inode);
 
-        LOCK (&file->lock);
-        {
-                list_add_tail (&request->list, &file->request);
-                file->offset = offset + iov_length (vector, count);
-        }
-        UNLOCK (&file->lock);
+    return 0;
+unwind:
+    STACK_UNWIND_STRICT(setattr, frame, -1, ENOMEM, NULL, NULL, NULL);
+
+    if (stub)
+        call_stub_destroy(stub);
+    return 0;
+
+noqueue:
+    STACK_WIND(frame, default_setattr_cbk, FIRST_CHILD(this),
+               FIRST_CHILD(this)->fops->setattr, loc, stbuf, valid, xdata);
+    return 0;
+}
+
+int
+wb_fsetattr_helper(call_frame_t *frame, xlator_t *this, fd_t *fd,
+                   struct iatt *stbuf, int32_t valid, dict_t *xdata)
+{
+    STACK_WIND(frame, default_fsetattr_cbk, FIRST_CHILD(this),
+               FIRST_CHILD(this)->fops->fsetattr, fd, stbuf, valid, xdata);
+    return 0;
+}
 
-        return request;
+int
+wb_fsetattr(call_frame_t *frame, xlator_t *this, fd_t *fd, struct iatt *stbuf,
+            int32_t valid, dict_t *xdata)
+{
+    wb_inode_t *wb_inode = NULL;
+    call_stub_t *stub = NULL;
+
+    wb_inode = wb_inode_ctx_get(this, fd->inode);
+    if (!wb_inode)
+        goto noqueue;
+
+    stub = fop_fsetattr_stub(frame, wb_fsetattr_helper, fd, stbuf, valid,
+                             xdata);
+    if (!stub)
+        goto unwind;
+
+    if (!wb_enqueue(wb_inode, stub))
+        goto unwind;
+
+    wb_process_queue(wb_inode);
+
+    return 0;
+unwind:
+    STACK_UNWIND_STRICT(fsetattr, frame, -1, ENOMEM, NULL, NULL, NULL);
+
+    if (stub)
+        call_stub_destroy(stub);
+    return 0;
+
+noqueue:
+    STACK_WIND(frame, default_fsetattr_cbk, FIRST_CHILD(this),
+               FIRST_CHILD(this)->fops->fsetattr, fd, stbuf, valid, xdata);
+    return 0;
 }
 
+int32_t
+wb_create(call_frame_t *frame, xlator_t *this, loc_t *loc, int32_t flags,
+          mode_t mode, mode_t umask, fd_t *fd, dict_t *xdata)
+{
+    wb_inode_t *wb_inode = NULL;
+
+    wb_inode = wb_inode_create(this, fd->inode);
+    if (!wb_inode)
+        goto unwind;
+
+    if (((flags & O_RDWR) || (flags & O_WRONLY)) && (flags & O_TRUNC))
+        wb_inode->size = 0;
+
+    STACK_WIND_TAIL(frame, FIRST_CHILD(this), FIRST_CHILD(this)->fops->create,
+                    loc, flags, mode, umask, fd, xdata);
+    return 0;
+
+unwind:
+    STACK_UNWIND_STRICT(create, frame, -1, ENOMEM, NULL, NULL, NULL, NULL, NULL,
+                        NULL);
+    return 0;
+}
 
 int32_t
-wb_writev_cbk (call_frame_t *frame,
-               void *cookie,
-               xlator_t *this,
-               int32_t op_ret,
-               int32_t op_errno,
-               struct stat *stbuf)
-{
-        STACK_UNWIND (frame, op_ret, op_errno, stbuf);
-        return 0;
+wb_open(call_frame_t *frame, xlator_t *this, loc_t *loc, int32_t flags,
+        fd_t *fd, dict_t *xdata)
+{
+    wb_inode_t *wb_inode = NULL;
+
+    wb_inode = wb_inode_create(this, fd->inode);
+    if (!wb_inode)
+        goto unwind;
+
+    if (((flags & O_RDWR) || (flags & O_WRONLY)) && (flags & O_TRUNC))
+        wb_inode->size = 0;
+
+    STACK_WIND_TAIL(frame, FIRST_CHILD(this), FIRST_CHILD(this)->fops->open,
+                    loc, flags, fd, xdata);
+    return 0;
+
+unwind:
+    STACK_UNWIND_STRICT(open, frame, -1, ENOMEM, NULL, NULL);
+    return 0;
+}
+
+int32_t
+wb_lookup_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret,
+              int32_t op_errno, inode_t *inode, struct iatt *buf, dict_t *xdata,
+              struct iatt *postparent)
+{
+    if (op_ret == 0) {
+        wb_inode_t *wb_inode = wb_inode_ctx_get(this, inode);
+        if (wb_inode)
+            wb_set_inode_size(wb_inode, buf);
+    }
+
+    STACK_UNWIND_STRICT(lookup, frame, op_ret, op_errno, inode, buf, xdata,
+                        postparent);
+    return 0;
 }
 
+int
+wb_lookup_helper(call_frame_t *frame, xlator_t *this, loc_t *loc, dict_t *xdata)
+{
+    STACK_WIND(frame, wb_lookup_cbk, FIRST_CHILD(this),
+               FIRST_CHILD(this)->fops->lookup, loc, xdata);
+    return 0;
+}
 
 int32_t
-wb_writev (call_frame_t *frame,
-           xlator_t *this,
-           fd_t *fd,
-           struct iovec *vector,
-           int32_t count,
-           off_t offset)
-{
-        wb_file_t *file = NULL;
-        char offset_expected = 1, wb_disabled = 0; 
-        call_frame_t *process_frame = NULL;
-        size_t size = 0;
-	uint64_t tmp_file = 0;
-
-        if (vector != NULL) 
-                size = iov_length (vector, count);
-
-        if (fd_ctx_get (fd, this, &tmp_file)) {
-                gf_log (this->name, GF_LOG_ERROR, "returning EBADFD");
-                STACK_UNWIND (frame, -1, EBADFD, NULL);
-                return 0;
-        }
+wb_lookup(call_frame_t *frame, xlator_t *this, loc_t *loc, dict_t *xdata)
+{
+    wb_inode_t *wb_inode = NULL;
+    call_stub_t *stub = NULL;
+
+    wb_inode = wb_inode_ctx_get(this, loc->inode);
+    if (!wb_inode)
+        goto noqueue;
+
+    stub = fop_lookup_stub(frame, wb_lookup_helper, loc, xdata);
+    if (!stub)
+        goto unwind;
+
+    if (!wb_enqueue(wb_inode, stub))
+        goto unwind;
+
+    wb_process_queue(wb_inode);
+
+    return 0;
+
+unwind:
+    if (stub)
+        call_stub_destroy(stub);
+
+    STACK_UNWIND_STRICT(lookup, frame, -1, ENOMEM, NULL, NULL, NULL, NULL);
+    return 0;
+
+noqueue:
+    STACK_WIND(frame, wb_lookup_cbk, FIRST_CHILD(this),
+               FIRST_CHILD(this)->fops->lookup, loc, xdata);
+    return 0;
+}
 
-	file = (wb_file_t *)(long)tmp_file;
-        if (!file) {
-                gf_log (this->name, GF_LOG_ERROR,
-                        "wb_file not found for fd %p", fd);
-                STACK_UNWIND (frame, -1, EBADFD, NULL);
-                return 0;
+static void
+wb_mark_readdirp_start(xlator_t *this, inode_t *directory)
+{
+    wb_inode_t *wb_directory_inode = NULL;
+
+    wb_directory_inode = wb_inode_create(this, directory);
+
+    if (!wb_directory_inode)
+        return;
+
+    LOCK(&wb_directory_inode->lock);
+    {
+        GF_ATOMIC_INC(wb_directory_inode->readdirps);
+    }
+    UNLOCK(&wb_directory_inode->lock);
+
+    return;
+}
+
+static void
+wb_mark_readdirp_end(xlator_t *this, inode_t *directory)
+{
+    wb_inode_t *wb_directory_inode = NULL, *wb_inode = NULL, *tmp = NULL;
+    int readdirps = 0;
+
+    wb_directory_inode = wb_inode_ctx_get(this, directory);
+
+    if (!wb_directory_inode)
+        return;
+
+    LOCK(&wb_directory_inode->lock);
+    {
+        readdirps = GF_ATOMIC_DEC(wb_directory_inode->readdirps);
+        if (readdirps)
+            goto unlock;
+
+        list_for_each_entry_safe(wb_inode, tmp,
+                                 &wb_directory_inode->invalidate_list,
+                                 invalidate_list)
+        {
+            list_del_init(&wb_inode->invalidate_list);
+            GF_ATOMIC_INIT(wb_inode->invalidate, 0);
+            inode_unref(wb_inode->inode);
         }
+    }
+unlock:
+    UNLOCK(&wb_directory_inode->lock);
+
+    return;
+}
+
+int32_t
+wb_readdirp_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+                int32_t op_ret, int32_t op_errno, gf_dirent_t *entries,
+                dict_t *xdata)
+{
+    wb_inode_t *wb_inode = NULL;
+    gf_dirent_t *entry = NULL;
+    inode_t *inode = NULL;
+    fd_t *fd = NULL;
+
+    fd = frame->local;
+    frame->local = NULL;
+
+    if (op_ret <= 0)
+        goto unwind;
+
+    list_for_each_entry(entry, &entries->list, list)
+    {
+        if (!entry->inode || !IA_ISREG(entry->d_stat.ia_type))
+            continue;
+
+        wb_inode = wb_inode_ctx_get(this, entry->inode);
+        if (!wb_inode)
+            continue;
 
-        LOCK (&file->lock);
+        LOCK(&wb_inode->lock);
         {
-                if (file->disabled || file->disable_till) {
-                        if (size > file->disable_till) {
-                                file->disable_till = 0;
-                        } else {
-                                file->disable_till -= size;
-                        }
-                        wb_disabled = 1;
-                }
+            if (!list_empty(&wb_inode->liability) ||
+                GF_ATOMIC_GET(wb_inode->invalidate)) {
+                inode = entry->inode;
 
-                if (file->offset != offset)
-                        offset_expected = 0;
+                entry->inode = NULL;
+                memset(&entry->d_stat, 0, sizeof(entry->d_stat));
+            }
         }
-        UNLOCK (&file->lock);
-
-        if (wb_disabled) {
-                STACK_WIND (frame,
-                            wb_writev_cbk,
-                            FIRST_CHILD (frame->this),
-                            FIRST_CHILD (frame->this)->fops->writev,
-                            file->fd,
-                            vector,
-                            count,
-                            offset);
-                return 0;
+        UNLOCK(&wb_inode->lock);
+
+        if (inode) {
+            inode_unref(inode);
+            inode = NULL;
         }
+    }
 
-        process_frame = copy_frame (frame);
+unwind:
+    wb_mark_readdirp_end(this, fd->inode);
 
-        if (!offset_expected)
-                wb_process_queue (process_frame, file, 1);
+    frame->local = NULL;
+    STACK_UNWIND_STRICT(readdirp, frame, op_ret, op_errno, entries, xdata);
+    return 0;
+}
 
-        wb_enqueue (file, frame, vector, count, offset);
-        wb_process_queue (process_frame, file, 0);
+int32_t
+wb_readdirp(call_frame_t *frame, xlator_t *this, fd_t *fd, size_t size,
+            off_t off, dict_t *xdata)
+{
+    wb_mark_readdirp_start(this, fd->inode);
 
-        STACK_DESTROY (process_frame->root);
+    frame->local = fd;
 
-        return 0;
+    STACK_WIND(frame, wb_readdirp_cbk, FIRST_CHILD(this),
+               FIRST_CHILD(this)->fops->readdirp, fd, size, off, xdata);
+
+    return 0;
 }
 
+int32_t
+wb_link_helper(call_frame_t *frame, xlator_t *this, loc_t *oldloc,
+               loc_t *newloc, dict_t *xdata)
+{
+    STACK_WIND_TAIL(frame, FIRST_CHILD(this), FIRST_CHILD(this)->fops->link,
+                    oldloc, newloc, xdata);
+    return 0;
+}
 
 int32_t
-wb_readv_cbk (call_frame_t *frame,
-              void *cookie,
-              xlator_t *this,
-              int32_t op_ret,
-              int32_t op_errno,
-              struct iovec *vector,
-              int32_t count,
-              struct stat *stbuf)
+wb_link(call_frame_t *frame, xlator_t *this, loc_t *oldloc, loc_t *newloc,
+        dict_t *xdata)
 {
-        wb_local_t *local = NULL;
+    wb_inode_t *wb_inode = NULL;
+    call_stub_t *stub = NULL;
 
-        local = frame->local;
+    wb_inode = wb_inode_ctx_get(this, oldloc->inode);
+    if (!wb_inode)
+        goto noqueue;
 
-        STACK_UNWIND (frame, op_ret, op_errno, vector, count, stbuf);
-        return 0;
-}
+    stub = fop_link_stub(frame, wb_link_helper, oldloc, newloc, xdata);
+    if (!stub)
+        goto unwind;
 
+    if (!wb_enqueue(wb_inode, stub))
+        goto unwind;
 
-int32_t
-wb_readv (call_frame_t *frame,
-          xlator_t *this,
-          fd_t *fd,
-          size_t size,
-          off_t offset)
-{
-        wb_file_t *file = NULL;
-        wb_local_t *local = NULL;
-	uint64_t tmp_file = 0;
-
-        if (fd_ctx_get (fd, this, &tmp_file)) {
-                gf_log (this->name, GF_LOG_ERROR, "returning EBADFD");
-                STACK_UNWIND (frame, -1, EBADFD, NULL);
-                return 0;
-        }
+    wb_process_queue(wb_inode);
 
-	file = (wb_file_t *)(long)tmp_file;
-        if (file)
-                wb_sync_all (frame, file);
+    return 0;
 
-        local = CALLOC (1, sizeof (*local));
-        local->file = file;
+unwind:
+    STACK_UNWIND_STRICT(link, frame, -1, ENOMEM, NULL, NULL, NULL, NULL, NULL);
 
-        frame->local = local;
+    if (stub)
+        call_stub_destroy(stub);
 
-        STACK_WIND (frame,
-                    wb_readv_cbk,
-                    FIRST_CHILD(this),
-                    FIRST_CHILD(this)->fops->readv,
-                    fd, size, offset);
+    return 0;
 
-        return 0;
+noqueue:
+    STACK_WIND_TAIL(frame, FIRST_CHILD(this), FIRST_CHILD(this)->fops->link,
+                    oldloc, newloc, xdata);
+    return 0;
 }
 
+int32_t
+wb_fallocate_helper(call_frame_t *frame, xlator_t *this, fd_t *fd,
+                    int32_t keep_size, off_t offset, size_t len, dict_t *xdata)
+{
+    STACK_WIND_TAIL(frame, FIRST_CHILD(this),
+                    FIRST_CHILD(this)->fops->fallocate, fd, keep_size, offset,
+                    len, xdata);
+    return 0;
+}
 
 int32_t
-wb_ffr_bg_cbk (call_frame_t *frame,
-               void *cookie,
-               xlator_t *this,
-               int32_t op_ret,
-               int32_t op_errno)
+wb_fallocate(call_frame_t *frame, xlator_t *this, fd_t *fd, int32_t keep_size,
+             off_t offset, size_t len, dict_t *xdata)
 {
-        wb_local_t *local = NULL;
-        wb_file_t *file = NULL;
+    wb_inode_t *wb_inode = NULL;
+    call_stub_t *stub = NULL;
 
-        local = frame->local;
-        file = local->file;
+    wb_inode = wb_inode_ctx_get(this, fd->inode);
+    if (!wb_inode)
+        goto noqueue;
 
-        if (file) {
-                fd_unref (file->fd);
-        }
+    stub = fop_fallocate_stub(frame, wb_fallocate_helper, fd, keep_size, offset,
+                              len, xdata);
+    if (!stub)
+        goto unwind;
 
-        if (file->op_ret == -1)
-        {
-                op_ret = file->op_ret;
-                op_errno = file->op_errno;
+    if (!wb_enqueue(wb_inode, stub))
+        goto unwind;
 
-                file->op_ret = 0;
-        }
-  
-        STACK_DESTROY (frame->root);
-        return 0;
-}
+    wb_process_queue(wb_inode);
 
+    return 0;
 
-int32_t
-wb_ffr_cbk (call_frame_t *frame,
-            void *cookie,
-            xlator_t *this,
-            int32_t op_ret,
-            int32_t op_errno)
-{
-        wb_local_t *local = NULL;
-        wb_file_t *file = NULL;
-
-        local = frame->local;
-        file = local->file;
-        if (file) {
-                /* corresponds to the fd_ref() done during wb_file_create() */
-                fd_unref (file->fd);
-        }
+unwind:
+    STACK_UNWIND_STRICT(fallocate, frame, -1, ENOMEM, NULL, NULL, NULL);
 
-        if (file->op_ret == -1)
-        {
-                op_ret = file->op_ret;
-                op_errno = file->op_errno;
+    if (stub)
+        call_stub_destroy(stub);
 
-                file->op_ret = 0;
-        }
+    return 0;
 
-        STACK_UNWIND (frame, op_ret, op_errno);
-        return 0;
+noqueue:
+    STACK_WIND_TAIL(frame, FIRST_CHILD(this),
+                    FIRST_CHILD(this)->fops->fallocate, fd, keep_size, offset,
+                    len, xdata);
+    return 0;
 }
 
+int32_t
+wb_discard_helper(call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset,
+                  size_t len, dict_t *xdata)
+{
+    STACK_WIND_TAIL(frame, FIRST_CHILD(this), FIRST_CHILD(this)->fops->discard,
+                    fd, offset, len, xdata);
+    return 0;
+}
 
 int32_t
-wb_flush (call_frame_t *frame,
-          xlator_t *this,
-          fd_t *fd)
-{
-        wb_conf_t *conf = NULL;
-        wb_file_t *file = NULL;
-        call_frame_t *flush_frame = NULL;
-        wb_local_t *local = NULL;
-	uint64_t tmp_file = 0;
-
-        conf = this->private;
-
-        if (fd_ctx_get (fd, this, &tmp_file)) {
-                gf_log (this->name, GF_LOG_ERROR, "returning EBADFD");
-                STACK_UNWIND (frame, -1, EBADFD);
-                return 0;
-        }
+wb_discard(call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset,
+           size_t len, dict_t *xdata)
+{
+    wb_inode_t *wb_inode = NULL;
+    call_stub_t *stub = NULL;
 
-	file = (wb_file_t *)(long)tmp_file;
+    wb_inode = wb_inode_ctx_get(this, fd->inode);
+    if (!wb_inode)
+        goto noqueue;
 
-        local = CALLOC (1, sizeof (*local));
-        local->file = file;
-        if (file)
-                fd_ref (file->fd);
+    stub = fop_discard_stub(frame, wb_discard_helper, fd, offset, len, xdata);
+    if (!stub)
+        goto unwind;
 
-        if (&file->request != file->request.next) {
-                gf_log (this->name, GF_LOG_DEBUG,
-                        "request queue is not empty, it has to be synced");
-        }
+    if (!wb_enqueue(wb_inode, stub))
+        goto unwind;
 
-        if (conf->flush_behind && 
-	    (!file->disabled) && (file->disable_till == 0)) {
-                flush_frame = copy_frame (frame);     
-                STACK_UNWIND (frame, file->op_ret, 
-			      file->op_errno); // liar! liar! :O
+    wb_process_queue(wb_inode);
 
-                flush_frame->local = local;
-                wb_sync_all (flush_frame, file);
+    return 0;
 
-                STACK_WIND (flush_frame,
-                            wb_ffr_bg_cbk,
-                            FIRST_CHILD(this),
-                            FIRST_CHILD(this)->fops->flush,
-                            fd);
-        } else {
-                wb_sync_all (frame, file);
-
-                frame->local = local;
-                STACK_WIND (frame,
-                            wb_ffr_cbk,
-                            FIRST_CHILD(this),
-                            FIRST_CHILD(this)->fops->flush,
-                            fd);
-        }
+unwind:
+    STACK_UNWIND_STRICT(discard, frame, -1, ENOMEM, NULL, NULL, NULL);
 
-        return 0;
+    if (stub)
+        call_stub_destroy(stub);
+    return 0;
+
+noqueue:
+    STACK_WIND_TAIL(frame, FIRST_CHILD(this), FIRST_CHILD(this)->fops->discard,
+                    fd, offset, len, xdata);
+
+    return 0;
 }
 
+int32_t
+wb_zerofill_helper(call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset,
+                   off_t len, dict_t *xdata)
+{
+    STACK_WIND_TAIL(frame, FIRST_CHILD(this), FIRST_CHILD(this)->fops->zerofill,
+                    fd, offset, len, xdata);
+    return 0;
+}
 
 int32_t
-wb_fsync_cbk (call_frame_t *frame,
-              void *cookie,
-              xlator_t *this,
-              int32_t op_ret,
-              int32_t op_errno)
+wb_zerofill(call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset,
+            off_t len, dict_t *xdata)
 {
-        wb_local_t *local = NULL;
-        wb_file_t *file = NULL;
+    wb_inode_t *wb_inode = NULL;
+    call_stub_t *stub = NULL;
 
-        local = frame->local;
-        file = local->file;
+    wb_inode = wb_inode_ctx_get(this, fd->inode);
+    if (!wb_inode)
+        goto noqueue;
 
-        if (file->op_ret == -1)
-        {
-                op_ret = file->op_ret;
-                op_errno = file->op_errno;
+    stub = fop_zerofill_stub(frame, wb_zerofill_helper, fd, offset, len, xdata);
+    if (!stub)
+        goto unwind;
 
-                file->op_ret = 0;
-        }
+    if (!wb_enqueue(wb_inode, stub))
+        goto unwind;
 
-        STACK_UNWIND (frame, op_ret, op_errno);
-        return 0;
+    wb_process_queue(wb_inode);
+
+    return 0;
+
+unwind:
+    STACK_UNWIND_STRICT(zerofill, frame, -1, ENOMEM, NULL, NULL, NULL);
+    if (stub)
+        call_stub_destroy(stub);
+    return 0; /* frame already unwound; do not fall through to noqueue */
+
+noqueue:
+    STACK_WIND_TAIL(frame, FIRST_CHILD(this), FIRST_CHILD(this)->fops->zerofill,
+                    fd, offset, len, xdata);
+    return 0;
 }
 
 int32_t
-wb_fsync (call_frame_t *frame,
-          xlator_t *this,
-          fd_t *fd,
-          int32_t datasync)
-{
-        wb_file_t *file = NULL;
-        wb_local_t *local = NULL;
-	uint64_t tmp_file = 0;
-
-        if (fd_ctx_get (fd, this, &tmp_file)) {
-                gf_log (this->name, GF_LOG_ERROR, "returning EBADFD");
-                STACK_UNWIND (frame, -1, EBADFD);
-                return 0;
-        }
+wb_rename(call_frame_t *frame, xlator_t *this, loc_t *oldloc, loc_t *newloc,
+          dict_t *xdata)
+{
+    wb_inode_t *wb_inode = NULL;
+    call_stub_t *stub = NULL;
 
-	file = (wb_file_t *)(long)tmp_file;
-        if (file)
-                wb_sync_all (frame, file);
+    wb_inode = wb_inode_ctx_get(this, oldloc->inode);
+    if (!wb_inode)
+        goto noqueue;
 
-        local = CALLOC (1, sizeof (*local));
-        local->file = file;
+    stub = fop_rename_stub(frame, default_rename_resume, oldloc, newloc, xdata);
+    if (!stub)
+        goto unwind;
 
-        frame->local = local;
+    if (!wb_enqueue(wb_inode, stub))
+        goto unwind;
 
-        STACK_WIND (frame,
-                    wb_fsync_cbk,
-                    FIRST_CHILD(this),
-                    FIRST_CHILD(this)->fops->fsync,
-                    fd, datasync);
-        return 0;
-}
+    wb_process_queue(wb_inode);
 
+    return 0;
 
-int32_t
-wb_release (xlator_t *this,
-            fd_t *fd)
+unwind:
+    if (stub)
+        call_stub_destroy(stub);
+
+    STACK_UNWIND_STRICT(rename, frame, -1, ENOMEM, NULL, NULL, NULL, NULL, NULL,
+                        NULL);
+
+    return 0;
+
+noqueue:
+    STACK_WIND_TAIL(frame, FIRST_CHILD(this), FIRST_CHILD(this)->fops->rename,
+                    oldloc, newloc, xdata);
+    return 0;
+}
+
+int
+wb_forget(xlator_t *this, inode_t *inode)
 {
-        uint64_t file = 0;
+    uint64_t tmp = 0;
+    wb_inode_t *wb_inode = NULL;
+
+    inode_ctx_del(inode, this, &tmp);
 
-	fd_ctx_get (fd, this, &file);
-  	wb_file_destroy ((wb_file_t *)(long)file);
+    wb_inode = (wb_inode_t *)(long)tmp;
 
+    if (!wb_inode)
         return 0;
+
+    wb_inode_destroy(wb_inode);
+
+    return 0;
 }
 
+int
+wb_release(xlator_t *this, fd_t *fd)
+{
+    uint64_t tmp = 0;
+
+    (void)fd_ctx_del(fd, this, &tmp);
 
-int32_t 
-init (xlator_t *this)
+    return 0;
+}
+
+int
+wb_priv_dump(xlator_t *this)
 {
-        dict_t *options = NULL;
-        wb_conf_t *conf = NULL;
-        char *aggregate_size_string = NULL;
-        char *window_size_string    = NULL;
-        char *flush_behind_string   = NULL;
-        char *disable_till_string = NULL;
-        char *enable_O_SYNC_string = NULL;
-        int32_t ret = -1;
+    wb_conf_t *conf = NULL;
+    char key_prefix[GF_DUMP_MAX_BUF_LEN] = {
+        0,
+    };
+    int ret = -1;
 
-        if ((this->children == NULL)
-            || this->children->next) {
-                gf_log (this->name, GF_LOG_ERROR,
-                        "FATAL: write-behind (%s) not configured with exactly one child",
-                        this->name);
-                return -1;
-        }
+    GF_VALIDATE_OR_GOTO("write-behind", this, out);
 
-        if (this->parents == NULL) {
-                gf_log (this->name, GF_LOG_WARNING,
-                        "dangling volume. check volfile");
-        }
-        
-        options = this->options;
-
-        conf = CALLOC (1, sizeof (*conf));
-        
-        conf->enable_O_SYNC = _gf_false;
-        ret = dict_get_str (options, "enable-O_SYNC",
-                            &enable_O_SYNC_string);
-        if (ret == 0) {
-                ret = gf_string2boolean (enable_O_SYNC_string,
-                                         &conf->enable_O_SYNC);
-                if (ret == -1) {
-                        gf_log (this->name, GF_LOG_ERROR,
-                                "'enable-O_SYNC' takes only boolean arguments");
-                        return -1;
-                }
-        }
+    conf = this->private;
+    GF_VALIDATE_OR_GOTO(this->name, conf, out);
 
-        /* configure 'options aggregate-size <size>' */
-        conf->aggregate_size = 0;
-        ret = dict_get_str (options, "block-size", 
-                            &aggregate_size_string);
-        if (ret == 0) {
-                ret = gf_string2bytesize (aggregate_size_string, 
-                                          &conf->aggregate_size);
-                if (ret != 0) {
-                        gf_log (this->name, GF_LOG_ERROR, 
-                                "invalid number format \"%s\" of \"option aggregate-size\"", 
-                                aggregate_size_string);
-                        return -1;
-                }
-        }
+    gf_proc_dump_build_key(key_prefix, "xlator.performance.write-behind",
+                           "priv");
 
-        gf_log (this->name, GF_LOG_DEBUG,
-                "using aggregate-size = %"PRIu64"", 
-                conf->aggregate_size);
-  
-        conf->disable_till = 1;
-        ret = dict_get_str (options, "disable-for-first-nbytes", 
-                            &disable_till_string);
-        if (ret == 0) {
-                ret = gf_string2bytesize (disable_till_string, 
-                                          &conf->disable_till);
-                if (ret != 0) {
-                        gf_log (this->name, GF_LOG_ERROR, 
-                                "invalid number format \"%s\" of \"option disable-for-first-nbytes\"", 
-                                disable_till_string);
-                        return -1;
-                }
-        }
+    gf_proc_dump_add_section("%s", key_prefix);
 
-        gf_log (this->name, GF_LOG_DEBUG,
-                "disabling write-behind for first %"PRIu64" bytes", 
-                conf->disable_till);
-  
-        /* configure 'option window-size <size>' */
-        conf->window_size = 0;
-        ret = dict_get_str (options, "cache-size", 
-                            &window_size_string);
-        if (ret == 0) {
-                ret = gf_string2bytesize (window_size_string, 
-                                          &conf->window_size);
-                if (ret != 0) {
-                        gf_log (this->name, GF_LOG_ERROR, 
-                                "invalid number format \"%s\" of \"option window-size\"", 
-                                window_size_string);
-                        FREE (conf);
-                        return -1;
-                }
-        }
+    gf_proc_dump_write("aggregate_size", "%" PRIu64, conf->aggregate_size);
+    gf_proc_dump_write("window_size", "%" PRIu64, conf->window_size);
+    gf_proc_dump_write("flush_behind", "%d", conf->flush_behind);
+    gf_proc_dump_write("trickling_writes", "%d", conf->trickling_writes);
 
-        if (!conf->window_size && conf->aggregate_size) {
-                gf_log (this->name, GF_LOG_WARNING,
-                        "setting window-size to be equal to aggregate-size(%"PRIu64")",
-                        conf->aggregate_size);
-                conf->window_size = conf->aggregate_size;
-        }
+    ret = 0;
+out:
+    return ret;
+}
+
+void
+__wb_dump_requests(struct list_head *head, char *prefix)
+{
+    char key[GF_DUMP_MAX_BUF_LEN] = {
+        0,
+    };
+    char key_prefix[GF_DUMP_MAX_BUF_LEN] =
+        {
+            0,
+        },
+         flag = 0;
+    wb_request_t *req = NULL;
+
+    list_for_each_entry(req, head, all)
+    {
+        gf_proc_dump_build_key(key_prefix, key, "%s",
+                               (char *)gf_fop_list[req->fop]);
+
+        gf_proc_dump_add_section("%s", key_prefix);
+
+        gf_proc_dump_write("unique", "%" PRIu64, req->unique);
+
+        gf_proc_dump_write("refcount", "%d", req->refcount);
+
+        if (list_empty(&req->todo))
+            gf_proc_dump_write("wound", "yes");
+        else
+            gf_proc_dump_write("wound", "no");
+
+        gf_proc_dump_write("generation-number", "%" PRIu64, req->gen);
+
+        gf_proc_dump_write("req->op_ret", "%d", req->op_ret);
+        gf_proc_dump_write("req->op_errno", "%d", req->op_errno);
+        gf_proc_dump_write("sync-attempts", "%d", req->wind_count);
 
-        if (conf->window_size < conf->aggregate_size) {
-                gf_log (this->name, GF_LOG_ERROR,
-                        "aggregate-size(%"PRIu64") cannot be more than window-size"
-                        "(%"PRIu64")", conf->window_size, conf->aggregate_size);
-                FREE (conf);
-                return -1;
+        if (req->fop == GF_FOP_WRITE) {
+            if (list_empty(&req->wip))
+                gf_proc_dump_write("sync-in-progress", "no");
+            else
+                gf_proc_dump_write("sync-in-progress", "yes");
+
+            gf_proc_dump_write("size", "%" GF_PRI_SIZET, req->write_size);
+
+            if (req->stub)
+                gf_proc_dump_write("offset", "%" PRId64,
+                                   req->stub->args.offset);
+
+            flag = req->ordering.lied;
+            gf_proc_dump_write("lied", "%d", flag);
+
+            flag = req->ordering.append;
+            gf_proc_dump_write("append", "%d", flag);
+
+            flag = req->ordering.fulfilled;
+            gf_proc_dump_write("fulfilled", "%d", flag);
+
+            flag = req->ordering.go;
+            gf_proc_dump_write("go", "%d", flag);
         }
+    }
+}
 
-        /* configure 'option flush-behind <on/off>' */
-        conf->flush_behind = 0;
-        ret = dict_get_str (options, "flush-behind", 
-                            &flush_behind_string);
-        if (ret == 0) {
-                ret = gf_string2boolean (flush_behind_string, 
-                                         &conf->flush_behind);
-                if (ret == -1) {
-                        gf_log (this->name, GF_LOG_ERROR,
-                                "'flush-behind' takes only boolean arguments");
-                        return -1;
-                }
+int
+wb_inode_dump(xlator_t *this, inode_t *inode)
+{
+    wb_inode_t *wb_inode = NULL;
+    int32_t ret = -1;
+    char *path = NULL;
+    char key_prefix[GF_DUMP_MAX_BUF_LEN] = {
+        0,
+    };
+    char uuid_str[64] = {
+        0,
+    };
 
-                if (conf->flush_behind) {
-			gf_log (this->name, GF_LOG_DEBUG,
-				"enabling flush-behind");
-                }
+    if ((inode == NULL) || (this == NULL)) {
+        ret = 0;
+        goto out;
+    }
+
+    wb_inode = wb_inode_ctx_get(this, inode);
+    if (wb_inode == NULL) {
+        ret = 0;
+        goto out;
+    }
+
+    uuid_utoa_r(inode->gfid, uuid_str);
+
+    gf_proc_dump_build_key(key_prefix, "xlator.performance.write-behind",
+                           "wb_inode");
+
+    gf_proc_dump_add_section("%s", key_prefix);
+
+    __inode_path(inode, NULL, &path);
+    if (path != NULL) {
+        gf_proc_dump_write("path", "%s", path);
+        GF_FREE(path);
+    }
+
+    gf_proc_dump_write("inode", "%p", inode);
+
+    gf_proc_dump_write("gfid", "%s", uuid_str);
+
+    gf_proc_dump_write("window_conf", "%" GF_PRI_SIZET, wb_inode->window_conf);
+
+    gf_proc_dump_write("window_current", "%" GF_PRI_SIZET,
+                       wb_inode->window_current);
+
+    gf_proc_dump_write("transit-size", "%" GF_PRI_SIZET, wb_inode->transit);
+
+    gf_proc_dump_write("dontsync", "%d", wb_inode->dontsync);
+
+    ret = TRY_LOCK(&wb_inode->lock);
+    if (!ret) {
+        if (!list_empty(&wb_inode->all)) {
+            __wb_dump_requests(&wb_inode->all, key_prefix);
         }
-        this->private = conf;
-        return 0;
+        UNLOCK(&wb_inode->lock);
+    }
+
+    if (ret && wb_inode)
+        gf_proc_dump_write("Unable to dump the inode information",
+                           "(Lock acquisition failed) %p (gfid: %s)", wb_inode,
+                           uuid_str);
+
+    ret = 0;
+out:
+    return ret;
 }
 
+int
+mem_acct_init(xlator_t *this)
+{
+    int ret = -1;
 
-void
-fini (xlator_t *this)
+    if (!this) {
+        goto out;
+    }
+
+    ret = xlator_mem_acct_init(this, gf_wb_mt_end + 1);
+
+    if (ret != 0) {
+        /* xlator_mem_acct_init() returned non-zero; log it, ret is passed on */
+        gf_msg(this->name, GF_LOG_ERROR, ENOMEM, WRITE_BEHIND_MSG_NO_MEMORY,
+               "Memory accounting init failed");
+    }
+
+out:
+    return ret;
+}
+
+int
+reconfigure(xlator_t *this, dict_t *options)
 {
-        wb_conf_t *conf = this->private;
+    wb_conf_t *conf = NULL;
+    int ret = -1;
 
-        FREE (conf);
-        return;
+    conf = this->private;
+
+    GF_OPTION_RECONF("cache-size", conf->window_size, options, size_uint64,
+                     out);
+
+    GF_OPTION_RECONF("flush-behind", conf->flush_behind, options, bool, out);
+
+    GF_OPTION_RECONF("trickling-writes", conf->trickling_writes, options, bool,
+                     out);
+
+    GF_OPTION_RECONF("strict-O_DIRECT", conf->strict_O_DIRECT, options, bool,
+                     out);
+
+    GF_OPTION_RECONF("strict-write-ordering", conf->strict_write_ordering,
+                     options, bool, out);
+    GF_OPTION_RECONF("resync-failed-syncs-after-fsync",
+                     conf->resync_after_fsync, options, bool, out);
+
+    ret = 0;
+out:
+    return ret;
+}
+
+int32_t
+init(xlator_t *this)
+{
+    wb_conf_t *conf = NULL;
+    int32_t ret = -1;
+
+    if ((this->children == NULL) || this->children->next) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, WRITE_BEHIND_MSG_INIT_FAILED,
+               "FATAL: write-behind (%s) not configured with exactly "
+               "one child",
+               this->name);
+        goto out;
+    }
+
+    if (this->parents == NULL) {
+        gf_msg(this->name, GF_LOG_WARNING, 0,
+               WRITE_BEHIND_MSG_VOL_MISCONFIGURED,
+               "dangling volume. check volfile");
+    }
+
+    conf = GF_CALLOC(1, sizeof(*conf), gf_wb_mt_wb_conf_t);
+    if (conf == NULL) {
+        goto out;
+    }
+
+    /* configure 'option aggregate-size <size>' */
+    GF_OPTION_INIT("aggregate-size", conf->aggregate_size, size_uint64, out);
+    conf->page_size = conf->aggregate_size;
+
+    /* configure 'option cache-size <size>' (kept in conf->window_size) */
+    GF_OPTION_INIT("cache-size", conf->window_size, size_uint64, out);
+
+    if (!conf->window_size && conf->aggregate_size) {
+        gf_msg(this->name, GF_LOG_WARNING, 0, WRITE_BEHIND_MSG_SIZE_NOT_SET,
+               "setting window-size to be equal to "
+               "aggregate-size(%" PRIu64 ")",
+               conf->aggregate_size);
+        conf->window_size = conf->aggregate_size;
+    }
+
+    if (conf->window_size < conf->aggregate_size) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, WRITE_BEHIND_MSG_EXCEEDED_MAX_SIZE,
+               "aggregate-size(%" PRIu64
+               ") cannot be more than "
+               "window-size(%" PRIu64 ")",
+               conf->aggregate_size, conf->window_size);
+        goto out;
+    }
+
+    /* configure 'option flush-behind <on/off>' */
+    GF_OPTION_INIT("flush-behind", conf->flush_behind, bool, out);
+
+    GF_OPTION_INIT("trickling-writes", conf->trickling_writes, bool, out);
+
+    GF_OPTION_INIT("strict-O_DIRECT", conf->strict_O_DIRECT, bool, out);
+
+    GF_OPTION_INIT("strict-write-ordering", conf->strict_write_ordering, bool,
+                   out);
+
+    GF_OPTION_INIT("resync-failed-syncs-after-fsync", conf->resync_after_fsync,
+                   bool, out);
+
+    this->private = conf;
+    ret = 0;
+
+out:
+    if (ret) {
+        GF_FREE(conf);
+    }
+    return ret;
 }
 
+/* Release the private wb_conf_t (if any) and detach it from the xlator. */
+void
+fini(xlator_t *this)
+{
+    wb_conf_t *wb_conf = NULL;
+
+    GF_VALIDATE_OR_GOTO("write-behind", this, out);
+
+    wb_conf = this->private;
+    if (wb_conf != NULL) {
+        /* detach before freeing */
+        this->private = NULL;
+        GF_FREE(wb_conf);
+    }
+
+out:
+    return;
+}
 
 struct xlator_fops fops = {
-        .writev      = wb_writev,
-        .open        = wb_open,
-        .create      = wb_create,
-        .readv       = wb_readv,
-        .flush       = wb_flush,
-        .fsync       = wb_fsync,
-        .stat        = wb_stat,
-        .fstat       = wb_fstat,
-        .truncate    = wb_truncate,
-        .ftruncate   = wb_ftruncate,
-        .utimens     = wb_utimens,
+    .writev = wb_writev,
+    .readv = wb_readv,
+    .flush = wb_flush,
+    .fsync = wb_fsync,
+    .stat = wb_stat,
+    .fstat = wb_fstat,
+    .truncate = wb_truncate,
+    .ftruncate = wb_ftruncate,
+    .setattr = wb_setattr,
+    .fsetattr = wb_fsetattr,
+    .lookup = wb_lookup,
+    .readdirp = wb_readdirp,
+    .link = wb_link,
+    .fallocate = wb_fallocate,
+    .discard = wb_discard,
+    .zerofill = wb_zerofill,
+    .rename = wb_rename,
 };
 
-struct xlator_mops mops = {
-};
+struct xlator_cbks cbks = {.forget = wb_forget, .release = wb_release};
 
-struct xlator_cbks cbks = {
-        .release  = wb_release
+struct xlator_dumpops dumpops = {
+    .priv = wb_priv_dump,
+    .inodectx = wb_inode_dump,
 };
 
 struct volume_options options[] = {
-        { .key  = {"flush-behind"}, 
-          .type = GF_OPTION_TYPE_BOOL
-        },
-        { .key  = {"block-size", "aggregate-size"}, 
-          .type = GF_OPTION_TYPE_SIZET, 
-          .min  = 128 * GF_UNIT_KB, 
-          .max  = 4 * GF_UNIT_MB 
-        },
-        { .key  = {"cache-size", "window-size"}, 
-          .type = GF_OPTION_TYPE_SIZET, 
-          .min  = 512 * GF_UNIT_KB, 
-          .max  = 1 * GF_UNIT_GB 
-        },
-        { .key = {"disable-for-first-nbytes"},
-          .type = GF_OPTION_TYPE_SIZET,
-          .min = 1,
-          .max = 1 * GF_UNIT_MB,
-        },
-        { .key = {"enable-O_SYNC"},
-          .type = GF_OPTION_TYPE_BOOL,
-        }, 
-        { .key = {NULL} },
+    {
+        .key = {"write-behind"},
+        .type = GF_OPTION_TYPE_BOOL,
+        .default_value = "off",
+        .description = "enable/disable write-behind",
+        .op_version = {GD_OP_VERSION_6_0},
+        .flags = OPT_FLAG_SETTABLE,
+    },
+    {.key = {"flush-behind"},
+     .type = GF_OPTION_TYPE_BOOL,
+     .default_value = "on",
+     .op_version = {1},
+     .flags = OPT_FLAG_SETTABLE | OPT_FLAG_DOC | OPT_FLAG_CLIENT_OPT,
+     .tags = {"write-behind"},
+     .description = "If this option is set ON, instructs write-behind "
+                    "translator to perform flush in background, by "
+                    "returning success (or any errors, if any of "
+                    "previous  writes were failed) to application even "
+                    "before flush FOP is sent to backend filesystem. "},
+    {.key = {"cache-size", "window-size"},
+     .type = GF_OPTION_TYPE_SIZET,
+     .min = 512 * GF_UNIT_KB,
+     .max = 1 * GF_UNIT_GB,
+     .default_value = "1MB",
+     .op_version = {1},
+     .flags = OPT_FLAG_SETTABLE | OPT_FLAG_DOC | OPT_FLAG_CLIENT_OPT,
+     .tags = {"write-behind"},
+     .description = "Size of the write-behind buffer for a single file "
+                    "(inode)."},
+    {
+        .key = {"trickling-writes"},
+        .type = GF_OPTION_TYPE_BOOL,
+        .op_version = {GD_OP_VERSION_3_13_1},
+        .flags = OPT_FLAG_SETTABLE | OPT_FLAG_DOC | OPT_FLAG_CLIENT_OPT,
+        .tags = {"write-behind"},
+        .default_value = "on",
+    },
+    {.key = {"strict-O_DIRECT"},
+     .type = GF_OPTION_TYPE_BOOL,
+     .default_value = "off",
+     .op_version = {2},
+     .flags = OPT_FLAG_SETTABLE | OPT_FLAG_DOC | OPT_FLAG_CLIENT_OPT,
+     .tags = {"write-behind"},
+     .description = "This option when set to off, ignores the "
+                    "O_DIRECT flag."},
+    {
+        .key = {"strict-write-ordering"},
+        .type = GF_OPTION_TYPE_BOOL,
+        .default_value = "off",
+        .op_version = {2},
+        .flags = OPT_FLAG_SETTABLE | OPT_FLAG_DOC | OPT_FLAG_CLIENT_OPT,
+        .tags = {"write-behind"},
+        .description = "Do not let later writes overtake earlier writes even "
+                       "if they do not overlap",
+    },
+    {
+        .key = {"resync-failed-syncs-after-fsync"},
+        .type = GF_OPTION_TYPE_BOOL,
+        .default_value = "off",
+        .op_version = {GD_OP_VERSION_3_7_7},
+        .flags = OPT_FLAG_SETTABLE | OPT_FLAG_DOC | OPT_FLAG_CLIENT_OPT,
+        .tags = {"write-behind"},
+        .description = "If sync of \"cached-writes issued before fsync\" "
+                       "(to backend) fails, this option configures whether "
+                       "to retry syncing them after fsync or forget them. "
+                       "If set to on, cached-writes are retried "
+                       "till a \"flush\" fop (or a successful sync) on sync "
+                       "failures. "
+                       "fsync itself is failed irrespective of the value of "
+                       "this option. ",
+    },
+    /* Both mentions of the trickling-writes volume option must agree. */
+    {
+        .key = {"aggregate-size"},
+        .type = GF_OPTION_TYPE_SIZET,
+        .default_value = "128KB",
+        .op_version = {GD_OP_VERSION_4_1_0},
+        .flags = OPT_FLAG_SETTABLE | OPT_FLAG_DOC | OPT_FLAG_CLIENT_OPT,
+        .description = "Will aggregate writes until data of specified size is "
+                       "fully filled for a single file provided there are no "
+                       "dependent fops on cached writes. This option just "
+                       "sets the aggregate size. Note that aggregation won't "
+                       "happen if performance.write-behind-trickling-writes "
+                       "is turned on. Hence turn off "
+                       "performance.write-behind-trickling-writes so that "
+                       "writes are aggregated till a max of "
+                       "\"aggregate-size\" bytes",
+    },
+    {.key = {NULL}},
+};
+
+xlator_api_t xlator_api = {
+    .init = init,
+    .fini = fini,
+    .reconfigure = reconfigure,
+    .mem_acct_init = mem_acct_init,
+    .op_version = {1}, /* Present from the initial version */
+    .dumpops = &dumpops,
+    .fops = &fops,
+    .cbks = &cbks,
+    .options = options,
+    .identifier = "write-behind",
+    .category = GF_MAINTAINED,
 };
diff --git a/xlators/playground/Makefile.am b/xlators/playground/Makefile.am
new file mode 100644
index 00000000000..e7de6b31aff
--- /dev/null
+++ b/xlators/playground/Makefile.am
@@ -0,0 +1,2 @@
+SUBDIRS = template
+CLEANFILES =
diff --git a/xlators/encryption/rot-13/Makefile.am b/xlators/playground/rot-13/Makefile.am
index d471a3f9243..d471a3f9243 100644
--- a/xlators/encryption/rot-13/Makefile.am
+++ b/xlators/playground/rot-13/Makefile.am
diff --git a/xlators/encryption/rot-13/src/Makefile.am b/xlators/playground/rot-13/src/Makefile.am
index ba5e623d8e2..9978661509d 100644
--- a/xlators/encryption/rot-13/src/Makefile.am
+++ b/xlators/playground/rot-13/src/Makefile.am
@@ -1,14 +1,16 @@
 xlator_LTLIBRARIES = rot-13.la
 xlatordir = $(libdir)/glusterfs/$(PACKAGE_VERSION)/xlator/encryption
 
-rot_13_la_LDFLAGS = -module -avoidversion
+rot_13_la_LDFLAGS = -module $(GF_XLATOR_DEFAULT_LDFLAGS)
 
 rot_13_la_SOURCES = rot-13.c
 rot_13_la_LIBADD = $(top_builddir)/libglusterfs/src/libglusterfs.la 
 
 noinst_HEADERS = rot-13.h
 
-AM_CFLAGS = -fPIC -D_FILE_OFFSET_BITS=64 -D_GNU_SOURCE -Wall -D$(GF_HOST_OS) \
-	-I$(top_srcdir)/libglusterfs/src -shared -nostartfiles $(GF_CFLAGS)
+AM_CPPFLAGS = $(GF_CPPFLAGS) -I$(top_srcdir)/libglusterfs/src \
+	-I$(top_srcdir)/rpc/xdr/src -I$(top_builddir)/rpc/xdr/src
+
+AM_CFLAGS = -Wall $(GF_CFLAGS)
 
 CLEANFILES = 
diff --git a/xlators/playground/rot-13/src/rot-13.c b/xlators/playground/rot-13/src/rot-13.c
new file mode 100644
index 00000000000..0f45ee31964
--- /dev/null
+++ b/xlators/playground/rot-13/src/rot-13.c
@@ -0,0 +1,166 @@
+/*
+   Copyright (c) 2006-2012 Red Hat, Inc. <http://www.redhat.com>
+   This file is part of GlusterFS.
+
+   This file is licensed to you under your choice of the GNU Lesser
+   General Public License, version 3 or any later version (LGPLv3 or
+   later), or the GNU General Public License, version 2 (GPLv2), in all
+   cases as published by the Free Software Foundation.
+*/
+#include <ctype.h>
+#include <sys/uio.h>
+
+#include <glusterfs/glusterfs.h>
+#include <glusterfs/xlator.h>
+#include <glusterfs/logging.h>
+
+#include "rot-13.h"
+
+/*
+ * This is a rot13 ``encryption'' xlator. It rot13's data when
+ * writing to disk and rot13's it back when reading it.
+ * This xlator is meant as an example, NOT FOR PRODUCTION
+ * USE ;) (hence no error-checking)
+ */
+
+/* ROT13 the ASCII letters of buf[0..len) in place; other bytes are kept. */
+void
+rot13(char *buf, int len)
+{
+    for (; len > 0; len--, buf++) {
+        char base = (*buf >= 'a' && *buf <= 'z') ? 'a'
+                    : (*buf >= 'A' && *buf <= 'Z') ? 'A' : 0;
+        if (base)
+            *buf = base + ((*buf - base + 13) % 26);
+    }
+}
+
+/* Apply rot13() to each of the first `count` segments of `vector`. */
+void
+rot13_iovec(struct iovec *vector, int count)
+{
+    struct iovec *seg = vector;
+    for (; count > 0; count--, seg++)
+        rot13(seg->iov_base, seg->iov_len);
+}
+
+/* readv reply: rot13 the returned vector when decrypt_read is set. */
+int32_t
+rot13_readv_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+                int32_t op_ret, int32_t op_errno, struct iovec *vector,
+                int32_t count, struct iatt *stbuf, struct iobref *iobref,
+                dict_t *xdata)
+{
+    rot_13_private_t *conf = this->private;
+
+    if (conf->decrypt_read)
+        rot13_iovec(vector, count);
+    STACK_UNWIND_STRICT(readv, frame, op_ret, op_errno, vector, count, stbuf,
+                        iobref, xdata);
+    return 0;
+}
+
+int32_t
+rot13_readv(call_frame_t *frame, xlator_t *this, fd_t *fd, size_t size,
+            off_t offset, uint32_t flags, dict_t *xdata)
+{
+    STACK_WIND(frame, rot13_readv_cbk, FIRST_CHILD(this),
+               FIRST_CHILD(this)->fops->readv, fd, size, offset, flags, xdata);
+    return 0;
+}
+
+int32_t
+rot13_writev_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+                 int32_t op_ret, int32_t op_errno, struct iatt *prebuf,
+                 struct iatt *postbuf, dict_t *xdata)
+{
+    STACK_UNWIND_STRICT(writev, frame, op_ret, op_errno, prebuf, postbuf,
+                        xdata);
+    return 0;
+}
+
+/* writev: rot13 the buffers in place if encrypt_write is set, then wind. */
+int32_t
+rot13_writev(call_frame_t *frame, xlator_t *this, fd_t *fd,
+             struct iovec *vector, int32_t count, off_t offset, uint32_t flags,
+             struct iobref *iobref, dict_t *xdata)
+{
+    rot_13_private_t *conf = this->private;
+    if (conf->encrypt_write)
+        rot13_iovec(vector, count);
+    STACK_WIND(frame, rot13_writev_cbk, FIRST_CHILD(this),
+               FIRST_CHILD(this)->fops->writev, fd, vector, count, offset,
+               flags, iobref, xdata);
+    return 0;
+}
+
+/* Allocate private state; encrypt-write/decrypt-read (default on) tune it. */
+int32_t
+init(xlator_t *this)
+{
+    rot_13_private_t *conf = NULL;
+    data_t *opt = NULL;
+    int32_t ret = -1;
+
+    if (!this->children || this->children->next) {
+        gf_log("rot13", GF_LOG_ERROR,
+               "FATAL: rot13 should have exactly one child");
+        goto out;
+    }
+
+    if (!this->parents) {
+        gf_log(this->name, GF_LOG_WARNING, "dangling volume. check volfile ");
+    }
+
+    conf = GF_CALLOC(1, sizeof(*conf), 0);
+    if (!conf)
+        goto out;
+
+    conf->decrypt_read = 1;
+    conf->encrypt_write = 1;
+
+    opt = dict_get(this->options, "encrypt-write");
+    if (opt && gf_string2boolean(opt->data, &conf->encrypt_write) == -1) {
+        gf_log(this->name, GF_LOG_ERROR,
+               "encrypt-write takes only boolean options");
+        goto out;
+    }
+
+    opt = dict_get(this->options, "decrypt-read");
+    if (opt && gf_string2boolean(opt->data, &conf->decrypt_read) == -1) {
+        gf_log(this->name, GF_LOG_ERROR,
+               "decrypt-read takes only boolean options");
+        goto out;
+    }
+
+    this->private = conf;
+    gf_log("rot13", GF_LOG_DEBUG, "rot13 xlator loaded");
+    ret = 0;
+out:
+    if (ret && conf)
+        GF_FREE(conf);
+    return ret;
+}
+
+/* Detach and free the private state held in this->private, if present. */
+void
+fini(xlator_t *this)
+{
+    rot_13_private_t *conf = this->private;
+
+    if (conf != NULL) {
+        this->private = NULL;
+        GF_FREE(conf);
+    }
+    return;
+}
+
+struct xlator_fops fops = {.readv = rot13_readv, .writev = rot13_writev};
+
+struct xlator_cbks cbks;
+
+struct volume_options options[] = {
+    {.key = {"encrypt-write"}, .type = GF_OPTION_TYPE_BOOL},
+    {.key = {"decrypt-read"}, .type = GF_OPTION_TYPE_BOOL},
+    {.key = {NULL}},
+};
diff --git a/xlators/playground/rot-13/src/rot-13.h b/xlators/playground/rot-13/src/rot-13.h
new file mode 100644
index 00000000000..edbc99798b4
--- /dev/null
+++ b/xlators/playground/rot-13/src/rot-13.h
@@ -0,0 +1,18 @@
+/*
+   Copyright (c) 2006-2012 Red Hat, Inc. <http://www.redhat.com>
+   This file is part of GlusterFS.
+
+   This file is licensed to you under your choice of the GNU Lesser
+   General Public License, version 3 or any later version (LGPLv3 or
+   later), or the GNU General Public License, version 2 (GPLv2), in all
+   cases as published by the Free Software Foundation.
+*/
+#ifndef __ROT_13_H__
+#define __ROT_13_H__
+
+typedef struct {
+    gf_boolean_t encrypt_write;
+    gf_boolean_t decrypt_read;
+} rot_13_private_t;
+
+#endif /* __ROT_13_H__ */
diff --git a/xlators/playground/template/Makefile.am b/xlators/playground/template/Makefile.am
new file mode 100644
index 00000000000..f2689244371
--- /dev/null
+++ b/xlators/playground/template/Makefile.am
@@ -0,0 +1,2 @@
+SUBDIRS = src
+
diff --git a/xlators/playground/template/src/Makefile.am b/xlators/playground/template/src/Makefile.am
new file mode 100644
index 00000000000..e76a717a550
--- /dev/null
+++ b/xlators/playground/template/src/Makefile.am
@@ -0,0 +1,17 @@
+xlator_LTLIBRARIES = template.la
+xlatordir = $(libdir)/glusterfs/$(PACKAGE_VERSION)/xlator/playground
+
+template_la_LDFLAGS = -module $(GF_XLATOR_DEFAULT_LDFLAGS)
+
+template_la_SOURCES = template.c
+template_la_LIBADD = $(top_builddir)/libglusterfs/src/libglusterfs.la
+
+noinst_HEADERS = template.h
+
+AM_CPPFLAGS = $(GF_CPPFLAGS) -I$(top_srcdir)/libglusterfs/src \
+	-I$(top_srcdir)/rpc/xdr/src -I$(top_builddir)/rpc/xdr/src
+
+AM_CFLAGS = -Wall $(GF_CFLAGS)
+
+CLEANFILES =
+
diff --git a/xlators/playground/template/src/template.c b/xlators/playground/template/src/template.c
new file mode 100644
index 00000000000..2f25d2363a6
--- /dev/null
+++ b/xlators/playground/template/src/template.c
@@ -0,0 +1,186 @@
+/*
+   Copyright (c) 2006-2018 Red Hat, Inc. <http://www.redhat.com>
+   This file is part of GlusterFS.
+
+   This file is licensed to you under your choice of the GNU Lesser
+   General Public License, version 3 or any later version (LGPLv3 or
+   later), or the GNU General Public License, version 2 (GPLv2), in all
+   cases as published by the Free Software Foundation.
+*/
+#include "template.h"
+#include <glusterfs/statedump.h>
+
+/*
+ * Set up memory accounting for this xlator's mem-types and return the
+ * status of xlator_mem_acct_init() (0 on success); failures are logged.
+ */
+static int32_t
+template_mem_acct_init(xlator_t *this)
+{
+    int rc = -1;
+
+    GF_VALIDATE_OR_GOTO("template", this, out);
+
+    rc = xlator_mem_acct_init(this, gf_template_mt_end + 1);
+    if (rc != 0) {
+        gf_msg(this->name, GF_LOG_ERROR, ENOMEM, TEMPLATE_MSG_NO_MEMORY,
+               "Memory accounting init failed");
+    }
+
+out:
+    return rc;
+}
+
+/* Export priv->dummy into dict as "template.dummy"; always returns 0. */
+static int32_t
+template_priv_to_dict(xlator_t *this, dict_t *dict, char *brickname)
+{
+    template_private_t *conf = this->private;
+
+    /* a failed dict_set is only logged at debug level, never propagated */
+    if (dict_set_uint64(dict, "template.dummy", conf->dummy) != 0) {
+        gf_msg_debug(this->name, ENOMEM, "dict_set of dummy key failed");
+    }
+
+    return 0;
+}
+
+/* Dump priv->dummy under the key "template.dummy"; always returns 0. */
+static int32_t
+template_priv(xlator_t *this)
+{
+    template_private_t *conf = this->private;
+
+    gf_proc_dump_write("template.dummy", "%" PRId32, conf->dummy);
+
+    return 0;
+}
+
+/* Write one "<xlator name>.private.dummy <value>" line to fd; returns 0. */
+static int32_t
+template_dump_metrics(xlator_t *this, int fd)
+{
+    template_private_t *conf = this->private;
+
+    /* NOTE: currently this is adding private variable, which can
+       be constant here. But in reality, things which are changing
+       can be added here, so we get to plot them on graph. */
+    dprintf(fd, "%s.private.dummy %d\n", this->name, conf->dummy);
+
+    return 0;
+}
+
+/* Check graph wiring, allocate private state, read the "dummy" option. */
+static int32_t
+template_init(xlator_t *this)
+{
+    template_private_t *conf = NULL;
+    int rc = -1;
+
+    if (!this->children || this->children->next) {
+        gf_msg(this->name, GF_LOG_ERROR, EINVAL, TEMPLATE_MSG_NO_GRAPH,
+               "not configured with exactly one child. exiting");
+        goto out;
+    }
+    if (!this->parents) {
+        gf_msg(this->name, GF_LOG_ERROR, EINVAL, TEMPLATE_MSG_NO_GRAPH,
+               "dangling volume. check volfile ");
+        goto out;
+    }
+
+    conf = GF_CALLOC(1, sizeof(*conf), gf_template_mt_private_t);
+    if (!conf) {
+        gf_msg(this->name, GF_LOG_ERROR, ENOMEM, TEMPLATE_MSG_NO_MEMORY,
+               "priv allocation failed");
+        goto out;
+    }
+
+    GF_OPTION_INIT("dummy", conf->dummy, int32, out);
+
+    /* ownership moves to the xlator; nothing is left for 'out' to free */
+    this->private = conf;
+    conf = NULL;
+    rc = 0;
+
+out:
+    if (conf)
+        GF_FREE(conf);
+    return rc;
+}
+
+/* Re-read the "dummy" option from `options` into the private state. */
+static int
+template_reconfigure(xlator_t *this, dict_t *options)
+{
+    template_private_t *conf = this->private;
+    int rc = -1;
+
+    GF_OPTION_RECONF("dummy", conf->dummy, options, int32, out);
+
+    rc = 0;
+
+out:
+    return rc;
+}
+
+/* Detach the private struct from the xlator and free it. */
+static void
+template_fini(xlator_t *this)
+{
+    template_private_t *conf = this->private;
+
+    this->private = NULL;
+
+    GF_FREE(conf);
+}
+
+/* Forward every event to default_notify() and trace it at debug level. */
+static int
+template_notify(xlator_t *this, int32_t event, void *data, ...)
+{
+    switch (event) {
+        default:
+            default_notify(this, event, data);
+            gf_msg_debug(this->name, 0, "event %d received", event);
+    }
+    return 0;
+}
+
+struct xlator_fops template_fops = {};
+
+struct xlator_cbks template_cbks = {};
+
+struct xlator_dumpops template_dumpops = {
+    .priv = template_priv,
+    .priv_to_dict = template_priv_to_dict,
+};
+
+struct volume_options template_options[] = {
+    {
+        .key = {"dummy"},
+        .type = GF_OPTION_TYPE_INT,
+        .min = 1,
+        .max = 1024,
+        .default_value = "1",
+        .description = "a dummy option to show how to set the option",
+        .op_version = {GD_OP_VERSION_5_0},
+        .flags = OPT_FLAG_SETTABLE | OPT_FLAG_DOC,
+        .level = OPT_STATUS_EXPERIMENTAL,
+        .tags = {"development", "experimental", "template"},
+    },
+    {.key = {NULL}},
+};
+
+xlator_api_t xlator_api = {
+    .init = template_init,
+    .fini = template_fini,
+    .notify = template_notify,
+    .reconfigure = template_reconfigure,
+    .mem_acct_init = template_mem_acct_init,
+    .dump_metrics = template_dump_metrics,
+    .op_version = {GD_OP_VERSION_5_0},
+    .dumpops = &template_dumpops,
+    .fops = &template_fops,
+    .cbks = &template_cbks,
+    .options = template_options,
+    .identifier = "template",
+};
diff --git a/xlators/playground/template/src/template.h b/xlators/playground/template/src/template.h
new file mode 100644
index 00000000000..c53dc1c7010
--- /dev/null
+++ b/xlators/playground/template/src/template.h
@@ -0,0 +1,43 @@
+/*
+   Copyright (c) 2013-2018 Red Hat, Inc. <http://www.redhat.com>
+   This file is part of GlusterFS.
+
+   This file is licensed to you under your choice of the GNU Lesser
+   General Public License, version 3 or any later version (LGPLv3 or
+   later), or the GNU General Public License, version 2 (GPLv2), in all
+   cases as published by the Free Software Foundation.
+*/
+#ifndef __TEMPLATE_H__
+#define __TEMPLATE_H__
+
+#include <glusterfs/glusterfs.h>
+#include <glusterfs/logging.h>
+#include <glusterfs/dict.h>
+#include <glusterfs/xlator.h>
+#include <glusterfs/defaults.h>
+
+struct template_private {
+    /* Add all the relevant fields you need here */
+    int32_t dummy;
+};
+
+typedef struct template_private template_private_t;
+
+/* Below section goes to template-mem-types.h */
+#include <glusterfs/mem-types.h>
+
+enum gf_template_mem_types_ {
+    gf_template_mt_private_t = gf_common_mt_end + 1,
+    gf_template_mt_end,
+};
+
+/* This normally goes to another file 'template-messages.h',
+   required for 'gf_msg()'.
+   NOTE: make sure you have added your component (in this case,
+   TEMPLATE) in `libglusterfs/src/glfs-message-id.h`.
+ */
+#include <glusterfs/glfs-message-id.h>
+
+GLFS_MSGID(TEMPLATE, TEMPLATE_MSG_NO_MEMORY, TEMPLATE_MSG_NO_GRAPH);
+
+#endif /* __TEMPLATE_H__ */
diff --git a/xlators/protocol/Makefile.am b/xlators/protocol/Makefile.am
index 745e277c2a6..91b03b1416a 100644
--- a/xlators/protocol/Makefile.am
+++ b/xlators/protocol/Makefile.am
@@ -1,3 +1 @@
-SUBDIRS = client server
-
-CLEANFILES = 
+SUBDIRS = auth client server
diff --git a/xlators/protocol/auth/Makefile.am b/xlators/protocol/auth/Makefile.am
new file mode 100644
index 00000000000..e9e0ba97e14
--- /dev/null
+++ b/xlators/protocol/auth/Makefile.am
@@ -0,0 +1 @@
+SUBDIRS = addr login
diff --git a/xlators/protocol/auth/addr/Makefile.am b/xlators/protocol/auth/addr/Makefile.am
new file mode 100644
index 00000000000..af437a64d6d
--- /dev/null
+++ b/xlators/protocol/auth/addr/Makefile.am
@@ -0,0 +1 @@
+SUBDIRS = src
diff --git a/xlators/protocol/auth/addr/src/Makefile.am b/xlators/protocol/auth/addr/src/Makefile.am
new file mode 100644
index 00000000000..4694d254f12
--- /dev/null
+++ b/xlators/protocol/auth/addr/src/Makefile.am
@@ -0,0 +1,14 @@
+auth_LTLIBRARIES = addr.la
+authdir = $(libdir)/glusterfs/$(PACKAGE_VERSION)/auth
+
+addr_la_LDFLAGS = -module $(GF_XLATOR_LDFLAGS)
+
+addr_la_SOURCES = addr.c
+addr_la_LIBADD = $(top_builddir)/libglusterfs/src/libglusterfs.la
+
+AM_CPPFLAGS = $(GF_CPPFLAGS) -I$(top_srcdir)/libglusterfs/src \
+	-I$(top_srcdir)/xlators/protocol/server/src \
+	-I$(top_srcdir)/rpc/xdr/src/ -I$(top_builddir)/rpc/xdr/src/ \
+	-I$(top_srcdir)/rpc/rpc-lib/src/
+
+AM_CFLAGS = -Wall $(GF_CFLAGS)
diff --git a/xlators/protocol/auth/addr/src/addr.c b/xlators/protocol/auth/addr/src/addr.c
new file mode 100644
index 00000000000..bf12c455d7c
--- /dev/null
+++ b/xlators/protocol/auth/addr/src/addr.c
@@ -0,0 +1,341 @@
+/*
+  Copyright (c) 2008-2012 Red Hat, Inc. <http://www.redhat.com>
+  This file is part of GlusterFS.
+
+  This file is licensed to you under your choice of the GNU Lesser
+  General Public License, version 3 or any later version (LGPLv3 or
+  later), or the GNU General Public License, version 2 (GPLv2), in all
+  cases as published by the Free Software Foundation.
+*/
+
+#include <fnmatch.h>
+#include <sys/socket.h>
+#include <netdb.h>
+#include "authenticate.h"
+#include <glusterfs/dict.h>
+#include "rpc-transport.h"
+
+#define ENTRY_DELIMITER ","
+#define ADDR_DELIMITER "|"
+#define PRIVILEGED_PORT_CEILING 1024
+
+#ifndef AF_INET_SDP
+#define AF_INET_SDP 27
+#endif
+
+/* An option for subdir validation be like below */
+
+/* 1. '*'
+   2. '192.168.*'
+   3. '
+   4. '!10.10.1*' (Today as per the code, if negate is set on one entry, it
+   is never reset)
+   5. '192.168.1.*, 10.1.10.*';168.168.2.* =/dir;* =/another-dir'
+
+*/
+
+int
+compare_addr_and_update(char *option_str, char *peer_addr, char *subvol,
+                        char *delimiter, auth_result_t *result,
+                        auth_result_t status)
+{
+    char *addr_str = NULL;
+    char *tmp = NULL;
+    char negate = 0;
+    char match = 0;
+    int length = 0;
+    int ret = 0;
+    /* 0 = an entry matched and *result was set to status; -1 = none did */
+    addr_str = strtok_r(option_str, delimiter, &tmp);
+    /* option_str is cut up in place by strtok_r() */
+    while (addr_str) {
+        gf_log(subvol, GF_LOG_INFO, "%s = \"%s\", received addr = \"%s\"",
+               (status == AUTH_ACCEPT) ? "allowed" : "rejected", addr_str,
+               peer_addr);
+        if (addr_str[0] == '!') {
+            negate = 1;
+            addr_str++;
+        }
+        /* NOTE(review): negate is never reset; host-name branch ignores it */
+        length = strlen(addr_str);
+        if ((addr_str[0] != '*') && valid_host_name(addr_str, length)) {
+            match = gf_is_same_address(addr_str, peer_addr);
+            if (match) {
+                *result = status;
+                goto out;
+            }
+        } else {
+            if (strstr(addr_str, "/")) {
+                match = gf_is_ip_in_net(addr_str, peer_addr);
+                if (negate ? !match : match) {
+                    *result = status;
+                    goto out;
+                }
+            } else {
+                match = fnmatch(addr_str, peer_addr, 0);
+                if (negate ? match : !match) {
+                    *result = status;
+                    goto out;
+                }
+            }
+        }
+
+        addr_str = strtok_r(NULL, delimiter, &tmp);
+    }
+    /* no entry matched; *result is left as the caller passed it */
+    ret = -1;
+out:
+    return ret;
+}
+
+/*
+ * Match peer_addr against option_str for the mount directory subdir and
+ * store status in *result on a hit. Entries look like "/dir(addr|addr)",
+ * comma separated; a plain address list (no '/' or '(') is also accepted.
+ */
+void
+parse_entries_and_compare(char *option_str, char *peer_addr, char *subvol,
+                          char *subdir, auth_result_t *result,
+                          auth_result_t status)
+{
+    char *entry = NULL;
+    char *entry_cpy = NULL;
+    char *directory = NULL;
+    char *entries = NULL;
+    char *addr_str = NULL;
+    char *addr = NULL;
+    char *tmp = NULL;
+    char *tmpdir = NULL;
+    int ret = 0;
+
+    if (!subdir) {
+        gf_log(subvol, GF_LOG_WARNING,
+               "subdir entry not present, not performing any operation.");
+        goto out;
+    }
+
+    entries = gf_strdup(option_str);
+    if (!entries)
+        goto out;
+
+    if (entries[0] != '/' && !strchr(entries, '(')) {
+        /* Backward compatible option */
+        ret = compare_addr_and_update(entries, peer_addr, subvol, ",", result,
+                                      status);
+        goto out;
+    }
+
+    entry = strtok_r(entries, ENTRY_DELIMITER, &tmp);
+    while (entry) {
+        entry_cpy = gf_strdup(entry);
+        if (!entry_cpy)
+            goto out;
+
+        /* strtok_r() returns NULL when the entry holds nothing but '(' */
+        directory = strtok_r(entry_cpy, "(", &tmpdir);
+        if (!directory || directory[0] != '/')
+            goto out;
+
+        /* only entries whose directory equals subdir are evaluated */
+        if (strcmp(subdir, directory))
+            goto next_entry;
+
+        addr_str = strtok_r(NULL, ")", &tmpdir);
+        if (!addr_str)
+            goto out;
+        addr = gf_strdup(addr_str);
+        if (!addr)
+            goto out;
+        gf_log(subvol, GF_LOG_INFO,
+               "Found an entry for dir %s (%s),"
+               " performing validation",
+               subdir, addr);
+        ret = compare_addr_and_update(addr, peer_addr, subvol, ADDR_DELIMITER,
+                                      result, status);
+        if (ret == 0)
+            break;
+        GF_FREE(addr);
+        addr = NULL;
+
+    next_entry:
+        entry = strtok_r(NULL, ENTRY_DELIMITER, &tmp);
+        GF_FREE(entry_cpy);
+        entry_cpy = NULL;
+    }
+
+out:
+    GF_FREE(entries);
+    GF_FREE(entry_cpy);
+    GF_FREE(addr);
+}
+
+/*
+ * Address-based auth: match the peer in input_params["peer-info"] against the
+ * auth.addr.<subvol>.allow/.reject (legacy: auth.ip.<subvol>.allow) options.
+ */
+auth_result_t
+gf_auth(dict_t *input_params, dict_t *config_params)
+{
+    auth_result_t result = AUTH_DONT_CARE;
+    int ret = 0;
+    char *name = NULL;
+    char *searchstr = NULL;
+    peer_info_t *peer_info = NULL;
+    data_t *peer_info_data = NULL;
+    data_t *allow_addr = NULL;
+    data_t *reject_addr = NULL;
+    char *service = NULL;
+    uint16_t peer_port = 0;
+    char peer_addr[UNIX_PATH_MAX] = {0};
+    char *type = NULL;
+    gf_boolean_t allow_insecure = _gf_false;
+    char *subdir = NULL;
+
+    name = data_to_str(dict_get(input_params, "remote-subvolume"));
+    if (!name) {
+        gf_log("authenticate/addr", GF_LOG_DEBUG,
+               "remote-subvolume not specified");
+        goto out;
+    }
+
+    ret = gf_asprintf(&searchstr, "auth.addr.%s.allow", name);
+    if (-1 == ret) {
+        gf_log("auth/addr", GF_LOG_DEBUG,
+               "asprintf failed while setting search string");
+        goto out;
+    }
+    allow_addr = dict_get(config_params, searchstr);
+    GF_FREE(searchstr);
+
+    ret = gf_asprintf(&searchstr, "auth.addr.%s.reject", name);
+    if (-1 == ret) {
+        gf_log("auth/addr", GF_LOG_ERROR,
+               "asprintf failed while setting search string");
+        goto out;
+    }
+    reject_addr = dict_get(config_params, searchstr);
+    GF_FREE(searchstr);
+
+    if (!allow_addr) {
+        /* TODO: backward compatibility */
+        ret = gf_asprintf(&searchstr, "auth.ip.%s.allow", name);
+        if (-1 == ret) {
+            gf_log("auth/addr", GF_LOG_ERROR,
+                   "asprintf failed while setting search string");
+            goto out;
+        }
+        allow_addr = dict_get(config_params, searchstr);
+        GF_FREE(searchstr);
+    }
+
+    if (!(allow_addr || reject_addr)) {
+        gf_log("auth/addr", GF_LOG_DEBUG,
+               "none of the options auth.addr.%s.allow or "
+               "auth.addr.%s.reject specified, returning auth_dont_care",
+               name, name);
+        goto out;
+    }
+
+    peer_info_data = dict_get(input_params, "peer-info");
+    if (!peer_info_data) {
+        gf_log("auth/addr", GF_LOG_ERROR, "peer-info not present");
+        goto out;
+    }
+
+    ret = dict_get_str(input_params, "subdir-mount", &subdir);
+    if (ret)
+        subdir = "/";
+    peer_info = data_to_ptr(peer_info_data);
+
+    switch (((struct sockaddr *)&peer_info->sockaddr)->sa_family) {
+        case AF_INET_SDP:
+        case AF_INET:
+        case AF_INET6:
+            strcpy(peer_addr, peer_info->identifier);
+            /* split "<addr>:<port>"; without a ':' there is no port */
+            service = strrchr(peer_addr, ':');
+            if (!service) {
+                gf_log("auth/addr", GF_LOG_ERROR,
+                       "peer identifier %s carries no port", peer_addr);
+                goto out;
+            }
+            *service = '\0';
+            service++;
+            ret = dict_get_str(config_params, "rpc-auth-allow-insecure", &type);
+            if (ret == 0) {
+                ret = gf_string2boolean(type, &allow_insecure);
+                if (ret < 0) {
+                    gf_log("auth/addr", GF_LOG_WARNING,
+                           "rpc-auth-allow-insecure option %s "
+                           "is not a valid bool option",
+                           type);
+                    goto out;
+                }
+            }
+            peer_port = atoi(service);
+            if (peer_port >= PRIVILEGED_PORT_CEILING && !allow_insecure) {
+                gf_log("auth/addr", GF_LOG_ERROR,
+                       "client is bound to port %d which is not privileged",
+                       peer_port);
+                result = AUTH_REJECT;
+                goto out;
+            }
+            break;
+        case AF_UNIX:
+            strcpy(peer_addr, peer_info->identifier);
+            break;
+        default:
+            gf_log("authenticate/addr", GF_LOG_ERROR,
+                   "unknown address family %d",
+                   ((struct sockaddr *)&peer_info->sockaddr)->sa_family);
+            goto out;
+    }
+
+    if (reject_addr) {
+        parse_entries_and_compare(reject_addr->data, peer_addr, name, subdir,
+                                  &result, AUTH_REJECT);
+        if (result == AUTH_REJECT)
+            goto out;
+    }
+
+    if (allow_addr)
+        parse_entries_and_compare(allow_addr->data, peer_addr, name, subdir,
+                                  &result, AUTH_ACCEPT);
+
+out:
+    return result;
+}
+
+struct volume_options options[] = {
+    {
+        .key = {"auth.addr.*.allow"},
+        .type = GF_OPTION_TYPE_INTERNET_ADDRESS_LIST,
+        .default_value = "*",
+        .description = "List of addresses to be allowed to access volume",
+        .op_version = {1},
+        .flags = OPT_FLAG_SETTABLE | OPT_FLAG_DOC,
+        .tags = {},
+        /* option_validation_fn validate_fn; */
+    },
+    {
+        .key = {"auth.addr.*.reject"},
+        .type = GF_OPTION_TYPE_INTERNET_ADDRESS_LIST,
+        .default_value = "*",
+        .description = "List of addresses to be rejected to access volume",
+        .op_version = {1},
+        .flags = OPT_FLAG_SETTABLE | OPT_FLAG_DOC,
+        .tags = {},
+        /* option_validation_fn validate_fn; */
+    },
+    /* Backward compatibility */
+    {
+        .key = {"auth.ip.*.allow"},
+        .type = GF_OPTION_TYPE_INTERNET_ADDRESS_LIST,
+        .default_value = "*",
+        .description = "List of addresses to be allowed to access volume",
+        .op_version = {1},
+        .flags = OPT_FLAG_SETTABLE | OPT_FLAG_DOC,
+        .tags = {},
+        /* option_validation_fn validate_fn; */
+    },
+    {.key = {NULL}}};
diff --git a/xlators/protocol/auth/login/Makefile.am b/xlators/protocol/auth/login/Makefile.am
new file mode 100644
index 00000000000..af437a64d6d
--- /dev/null
+++ b/xlators/protocol/auth/login/Makefile.am
@@ -0,0 +1 @@
+SUBDIRS = src
diff --git a/xlators/protocol/auth/login/src/Makefile.am b/xlators/protocol/auth/login/src/Makefile.am
new file mode 100644
index 00000000000..9837437b11e
--- /dev/null
+++ b/xlators/protocol/auth/login/src/Makefile.am
@@ -0,0 +1,13 @@
+auth_LTLIBRARIES = login.la
+authdir = $(libdir)/glusterfs/$(PACKAGE_VERSION)/auth
+
+login_la_LDFLAGS = -module $(GF_XLATOR_LDFLAGS)
+
+login_la_SOURCES = login.c
+login_la_LIBADD = $(top_builddir)/libglusterfs/src/libglusterfs.la
+
+AM_CPPFLAGS = $(GF_CPPFLAGS) -I$(top_srcdir)/libglusterfs/src \
+	-I$(top_srcdir)/xlators/protocol/server/src \
+	-I$(top_srcdir)/rpc/xdr/src -I$(top_builddir)/rpc/xdr/src
+
+AM_CFLAGS = -Wall $(GF_CFLAGS)
diff --git a/xlators/protocol/auth/login/src/login.c b/xlators/protocol/auth/login/src/login.c
new file mode 100644
index 00000000000..64521267bfe
--- /dev/null
+++ b/xlators/protocol/auth/login/src/login.c
@@ -0,0 +1,210 @@
+/*
+  Copyright (c) 2008-2012 Red Hat, Inc. <http://www.redhat.com>
+  This file is part of GlusterFS.
+
+  This file is licensed to you under your choice of the GNU Lesser
+  General Public License, version 3 or any later version (LGPLv3 or
+  later), or the GNU General Public License, version 2 (GPLv2), in all
+  cases as published by the Free Software Foundation.
+*/
+
+#include <fnmatch.h>
+#include "authenticate.h"
+
+/* gf_auth() - login authentication check.
+ * - input_params supplies "ssl-name", or "username" and "password", plus
+ *   "remote-subvolume"; config_params supplies the allow list and passwords
+ * - Returns AUTH_ACCEPT, AUTH_REJECT or AUTH_DONT_CARE
+ * - strict_auth ("strict-auth-accept") only matters for username/password
+ *   logins: missing credentials, or a name not on an existing allow list,
+ *   are then rejected instead of left as DONT-CARE
+ * - SSL names are always strict: only names on the allow list are accepted
+ */
+
+auth_result_t
+gf_auth(dict_t *input_params, dict_t *config_params)
+{
+    auth_result_t result = AUTH_DONT_CARE;
+    int ret = 0;
+    data_t *allow_user = NULL;
+    data_t *username_data = NULL;
+    data_t *passwd_data = NULL;
+    data_t *password_data = NULL;
+    char *username = NULL;
+    char *password = NULL;
+    char *brick_name = NULL;
+    char *searchstr = NULL;
+    char *username_str = NULL;
+    char *tmp = NULL;
+    char *username_cpy = NULL; /* private copy tokenised by strtok_r */
+    gf_boolean_t using_ssl = _gf_false;
+    gf_boolean_t strict_auth = _gf_false;
+
+    username_data = dict_get(input_params, "ssl-name");
+    if (username_data) {
+        gf_log("auth/login", GF_LOG_INFO, "connecting user name: %s",
+               username_data->data);
+        using_ssl = _gf_true;
+    } else {
+        ret = dict_get_str_boolean(config_params, "strict-auth-accept",
+                                   _gf_false);
+        if (ret == -1)
+            strict_auth = _gf_false;
+        else
+            strict_auth = ret;
+
+        username_data = dict_get(input_params, "username");
+        if (!username_data) {
+            if (strict_auth) {
+                gf_log("auth/login", GF_LOG_DEBUG,
+                       "username not found, strict auth"
+                       " configured returning REJECT");
+                result = AUTH_REJECT;
+            } else {
+                gf_log("auth/login", GF_LOG_DEBUG,
+                       "username not found, returning"
+                       " DONT-CARE");
+            }
+            goto out;
+        }
+        password_data = dict_get(input_params, "password");
+        if (!password_data) {
+            if (strict_auth) {
+                gf_log("auth/login", GF_LOG_DEBUG,
+                       "password not found, strict auth"
+                       " configured returning REJECT");
+                result = AUTH_REJECT;
+            } else {
+                gf_log("auth/login", GF_LOG_WARNING,
+                       "password not found, returning"
+                       " DONT-CARE");
+            }
+            goto out;
+        }
+        password = data_to_str(password_data);
+    }
+    username = data_to_str(username_data);
+
+    brick_name = data_to_str(dict_get(input_params, "remote-subvolume"));
+    if (!brick_name) {
+        gf_log("auth/login", GF_LOG_ERROR, "remote-subvolume not specified");
+        result = AUTH_REJECT;
+        goto out;
+    }
+
+    ret = gf_asprintf(&searchstr, "auth.login.%s.%s", brick_name,
+                      using_ssl ? "ssl-allow" : "allow");
+    if (-1 == ret) {
+        gf_log("auth/login", GF_LOG_ERROR,
+               "asprintf failed while setting search string, "
+               "returning REJECT");
+        result = AUTH_REJECT;
+        goto out;
+    }
+
+    allow_user = dict_get(config_params, searchstr);
+    GF_FREE(searchstr);
+
+    if (allow_user) {
+        gf_log("auth/login", GF_LOG_INFO, "allowed user names: %s",
+               allow_user->data);
+        /*
+         * There's a subtle difference between SSL and non-SSL behavior
+         * if we can't match anything in the "while" loop below.
+         * Intuitively, we should AUTH_REJECT if there's no match.
+         * However, existing code depends on allowing untrusted users
+         * to connect with *no credentials at all* by falling through
+         * the loop.  They're still distinguished from trusted users
+         * who do provide a valid username and password (in fact that's
+         * pretty much the only thing we use non-SSL login auth for),
+         * but they are allowed to connect.  It's wrong, but it's not
+         * worth changing elsewhere.  Therefore, we do the sane thing
+         * only for SSL here.
+         *
+         * For SSL, if there's a list *you must be on it*.  Note that
+         * if there's no list we don't care.  In that case (and the
+         * ssl-allow=* case as well) authorization is effectively
+         * disabled, though authentication and encryption are still
+         * active.
+         *
+         * Read NOTE on strict_auth above.
+         */
+        if (using_ssl || strict_auth) {
+            result = AUTH_REJECT;
+        }
+        username_cpy = gf_strdup(allow_user->data);
+        if (!username_cpy)
+            goto out;
+
+        username_str = strtok_r(username_cpy, " ,", &tmp);
+
+        /*
+         * We have to match a user's *authenticated* name to one in the
+         * list.  If we're using SSL, they're already authenticated.
+         * Otherwise, they need a matching password to complete the
+         * process.
+         */
+        while (username_str) {
+            if (!fnmatch(username_str, username, 0)) {
+                if (using_ssl) {
+                    result = AUTH_ACCEPT;
+                    break;
+                }
+                ret = gf_asprintf(&searchstr, "auth.login.%s.password",
+                                  username);
+                if (-1 == ret) {
+                    gf_log("auth/login", GF_LOG_WARNING,
+                           "asprintf failed while setting search string");
+                    goto out;
+                }
+                passwd_data = dict_get(config_params, searchstr);
+                GF_FREE(searchstr);
+
+                if (!passwd_data) {
+                    gf_log("auth/login", GF_LOG_ERROR,
+                           "wrong username/password combination");
+                    result = AUTH_REJECT;
+                    goto out;
+                }
+
+                /* Equal passwords accept; any difference rejects. */
+                result = strcmp(data_to_str(passwd_data), password) ? AUTH_REJECT
+                                                                    : AUTH_ACCEPT;
+                if (result == AUTH_REJECT)
+                    gf_log("auth/login", GF_LOG_ERROR,
+                           "wrong password for user %s", username);
+
+                break;
+            }
+            username_str = strtok_r(NULL, " ,", &tmp);
+        }
+    }
+
+out:
+    GF_FREE(username_cpy);
+
+    return result;
+}
+
+struct volume_options options[] = { /* auth.login.* keys */
+    {
+        .key = {"auth.login.*.allow"},
+        .type = GF_OPTION_TYPE_ANY,
+        .default_value = "*",
+        .description = "Username to be allowed access to the volume",
+        .op_version = {1},
+        .flags = OPT_FLAG_SETTABLE | OPT_FLAG_DOC,
+        .tags = {},
+        /* option_validation_fn validate_fn; (left unset) */
+    },
+    {
+        .key = {"auth.login.*.password"},
+        .type = GF_OPTION_TYPE_ANY,
+        .default_value = "*",
+        .description = "Password for the allowed username",
+        .op_version = {1},
+        .flags = OPT_FLAG_SETTABLE | OPT_FLAG_DOC,
+        .tags = {},
+        /* option_validation_fn validate_fn; (left unset) */
+    },
+    {.key = {NULL}}}; /* last entry carries a NULL key */
diff --git a/xlators/protocol/client/Makefile.am b/xlators/protocol/client/Makefile.am
index d471a3f9243..af437a64d6d 100644
--- a/xlators/protocol/client/Makefile.am
+++ b/xlators/protocol/client/Makefile.am
@@ -1,3 +1 @@
 SUBDIRS = src
-
-CLEANFILES = 
diff --git a/xlators/protocol/client/src/Makefile.am b/xlators/protocol/client/src/Makefile.am
index fb720942cc6..785a51fc3b4 100644
--- a/xlators/protocol/client/src/Makefile.am
+++ b/xlators/protocol/client/src/Makefile.am
@@ -2,15 +2,20 @@
 xlator_LTLIBRARIES = client.la
 xlatordir = $(libdir)/glusterfs/$(PACKAGE_VERSION)/xlator/protocol
 
-client_la_LDFLAGS = -module -avoidversion
+client_la_LDFLAGS = -module $(GF_XLATOR_DEFAULT_LDFLAGS)
 
-client_la_SOURCES = client-protocol.c saved-frames.c
-client_la_LIBADD = $(top_builddir)/libglusterfs/src/libglusterfs.la
+client_la_LIBADD = $(top_builddir)/libglusterfs/src/libglusterfs.la \
+	$(top_builddir)/rpc/rpc-lib/src/libgfrpc.la \
+	$(top_builddir)/rpc/xdr/src/libgfxdr.la
 
-noinst_HEADERS = client-protocol.h saved-frames.h
+client_la_SOURCES = client.c client-helpers.c client-rpc-fops.c  \
+	client-handshake.c client-callback.c client-lk.c client-common.c \
+	client-rpc-fops_v2.c
 
-AM_CFLAGS = -fPIC -D_FILE_OFFSET_BITS=64 -D_GNU_SOURCE -Wall -D$(GF_HOST_OS) \
-	-I$(top_srcdir)/libglusterfs/src -shared -nostartfiles $(GF_CFLAGS)
+noinst_HEADERS = client.h client-mem-types.h client-messages.h client-common.h
 
-CLEANFILES = 
+AM_CPPFLAGS = $(GF_CPPFLAGS) -I$(top_srcdir)/libglusterfs/src \
+	-I$(top_srcdir)/rpc/xdr/src -I$(top_builddir)/rpc/xdr/src \
+	-I$(top_srcdir)/rpc/rpc-lib/src/
 
+AM_CFLAGS = -Wall $(GF_CFLAGS)
diff --git a/xlators/protocol/client/src/client-callback.c b/xlators/protocol/client/src/client-callback.c
new file mode 100644
index 00000000000..d83d9c14899
--- /dev/null
+++ b/xlators/protocol/client/src/client-callback.c
@@ -0,0 +1,312 @@
+/*
+  Copyright (c) 2008-2012 Red Hat, Inc. <http://www.redhat.com>
+  This file is part of GlusterFS.
+
+  This file is licensed to you under your choice of the GNU Lesser
+  General Public License, version 3 or any later version (LGPLv3 or
+  later), or the GNU General Public License, version 2 (GPLv2), in all
+  cases as published by the Free Software Foundation.
+*/
+
+#include "client.h"
+#include "rpc-clnt.h"
+#include <glusterfs/defaults.h>
+#include "client-messages.h"
+
+static int
+client_cbk_null(struct rpc_clnt *rpc, void *mydata, void *data)
+{
+    gf_smsg(THIS->name, GF_LOG_WARNING, 0, PC_MSG_FUNCTION_CALL_ERROR, NULL);
+    return 0; /* rpc, mydata and data are unused; only the warning is logged */
+}
+
+static int
+client_cbk_fetchspec(struct rpc_clnt *rpc, void *mydata, void *data)
+{
+    gf_smsg(THIS->name, GF_LOG_WARNING, 0, PC_MSG_FUNCTION_CALL_ERROR, NULL);
+    return 0; /* rpc, mydata and data are unused; only the warning is logged */
+}
+
+static int
+client_cbk_ino_flush(struct rpc_clnt *rpc, void *mydata, void *data)
+{
+    gf_smsg(THIS->name, GF_LOG_WARNING, 0, PC_MSG_FUNCTION_CALL_ERROR, NULL);
+    return 0; /* rpc, mydata and data are unused; only the warning is logged */
+}
+
+static int
+client_cbk_recall_lease(struct rpc_clnt *rpc, void *mydata, void *data)
+{
+    int ret = -1; /* returned as-is; stays -1 if data is NULL */
+    struct iovec *iov = NULL;
+    struct gf_upcall upcall_data = {
+        0,
+    };
+    struct gf_upcall_recall_lease rl_data = {
+        0,
+    };
+    gfs3_recall_lease_req recall_lease = {
+        {
+            0,
+        },
+    };
+
+    GF_VALIDATE_OR_GOTO("client-callback", data, out);
+
+    iov = (struct iovec *)data; /* data is decoded below as an XDR request */
+    ret = xdr_to_generic(*iov, &recall_lease,
+                         (xdrproc_t)xdr_gfs3_recall_lease_req);
+
+    if (ret < 0) {
+        gf_smsg(THIS->name, GF_LOG_WARNING, -ret, PC_MSG_RECALL_LEASE_FAIL,
+                NULL);
+        goto out;
+    }
+
+    upcall_data.data = &rl_data; /* conversion target for the call below */
+    ret = gf_proto_recall_lease_to_upcall(&recall_lease, &upcall_data);
+    if (ret < 0)
+        goto out;
+
+    upcall_data.event_type = GF_UPCALL_RECALL_LEASE;
+
+    /* NOTE(review): gfid is printed with %s - confirm it is NUL-terminated */
+    gf_msg_trace(THIS->name, 0, "Upcall gfid = %s, ret = %d", recall_lease.gfid,
+                 ret);
+
+    default_notify(THIS, GF_EVENT_UPCALL, &upcall_data);
+
+out: /* release what decoding/conversion may have allocated */
+    if (recall_lease.xdata.xdata_val)
+        free(recall_lease.xdata.xdata_val);
+
+    if (rl_data.dict)
+        dict_unref(rl_data.dict);
+
+    return ret;
+}
+
+static int
+client_cbk_cache_invalidation(struct rpc_clnt *rpc, void *mydata, void *data)
+{
+    int ret = -1; /* used for logging/flow only; see the return below */
+    struct iovec *iov = NULL;
+    struct gf_upcall upcall_data = {
+        0,
+    };
+    struct gf_upcall_cache_invalidation ca_data = {
+        0,
+    };
+    gfs3_cbk_cache_invalidation_req ca_req = {
+        0,
+    };
+
+    gf_msg_trace(THIS->name, 0, "Upcall callback is called");
+
+    if (!data)
+        goto out;
+
+    iov = (struct iovec *)data; /* data is decoded below as an XDR request */
+    ret = xdr_to_generic(*iov, &ca_req,
+                         (xdrproc_t)xdr_gfs3_cbk_cache_invalidation_req);
+
+    if (ret < 0) {
+        gf_smsg(THIS->name, GF_LOG_WARNING, -ret,
+                PC_MSG_CACHE_INVALIDATION_FAIL, NULL);
+        goto out;
+    }
+
+    upcall_data.data = &ca_data; /* conversion target for the call below */
+    ret = gf_proto_cache_invalidation_to_upcall(THIS, &ca_req, &upcall_data);
+    if (ret < 0)
+        goto out;
+
+    gf_msg_trace(THIS->name, 0,
+                 "Cache invalidation cbk received for gfid:"
+                 " %s, ret = %d",
+                 ca_req.gfid, ret);
+
+    default_notify(THIS, GF_EVENT_UPCALL, &upcall_data);
+
+out: /* release what decoding/conversion may have allocated */
+    if (ca_req.gfid)
+        free(ca_req.gfid);
+
+    if (ca_req.xdata.xdata_val)
+        free(ca_req.xdata.xdata_val);
+
+    if (ca_data.dict)
+        dict_unref(ca_data.dict);
+
+    return 0; /* NOTE(review): ret is dropped here, siblings return it */
+}
+
+static int
+client_cbk_child_up(struct rpc_clnt *rpc, void *mydata, void *data)
+{
+    clnt_conf_t *conf = NULL;
+    xlator_t *this = THIS;
+
+    GF_VALIDATE_OR_GOTO("client", this, out);
+    conf = this->private;
+    GF_VALIDATE_OR_GOTO(this->name, conf, out);
+
+    gf_msg_debug(this->name, 0, "Received CHILD_UP");
+    conf->child_up = _gf_true; /* record the state before notifying */
+
+    this->notify(this, GF_EVENT_CHILD_UP, NULL);
+out:
+    return 0; /* always 0, even when validation jumped here */
+}
+
+static int
+client_cbk_child_down(struct rpc_clnt *rpc, void *mydata, void *data)
+{
+    clnt_conf_t *conf = NULL;
+    xlator_t *this = THIS;
+
+    GF_VALIDATE_OR_GOTO("client", this, out);
+    conf = this->private;
+    GF_VALIDATE_OR_GOTO(this->name, conf, out);
+
+    gf_msg_debug(this->name, 0, "Received CHILD_DOWN");
+    conf->child_up = _gf_false; /* record the state before notifying */
+
+    this->notify(this, GF_EVENT_CHILD_DOWN, NULL);
+out:
+    return 0; /* always 0, even when validation jumped here */
+}
+
+static int
+client_cbk_inodelk_contention(struct rpc_clnt *rpc, void *mydata, void *data)
+{
+    int ret = -1; /* returned as-is; stays -1 if data is NULL */
+    struct iovec *iov = NULL;
+    struct gf_upcall upcall_data = {
+        0,
+    };
+    struct gf_upcall_inodelk_contention lc = {
+        {
+            0,
+        },
+    };
+    gfs4_inodelk_contention_req proto_lc = {
+        {
+            0,
+        },
+    };
+
+    GF_VALIDATE_OR_GOTO("client-callback", data, out);
+
+    iov = (struct iovec *)data; /* data is decoded below as an XDR request */
+    ret = xdr_to_generic(*iov, &proto_lc,
+                         (xdrproc_t)xdr_gfs4_inodelk_contention_req);
+
+    if (ret < 0) {
+        gf_smsg(THIS->name, GF_LOG_WARNING, -ret,
+                PC_MSG_INODELK_CONTENTION_FAIL, NULL);
+        goto out;
+    }
+
+    upcall_data.data = &lc; /* conversion target for the call below */
+    ret = gf_proto_inodelk_contention_to_upcall(&proto_lc, &upcall_data);
+    if (ret < 0)
+        goto out;
+
+    upcall_data.event_type = GF_UPCALL_INODELK_CONTENTION;
+
+    default_notify(THIS, GF_EVENT_UPCALL, &upcall_data);
+
+out: /* release what decoding/conversion may have allocated */
+    if (proto_lc.domain)
+        free(proto_lc.domain);
+
+    if (proto_lc.xdata.xdata_val)
+        free(proto_lc.xdata.xdata_val);
+
+    if (lc.xdata)
+        dict_unref(lc.xdata);
+
+    return ret;
+}
+
+static int
+client_cbk_entrylk_contention(struct rpc_clnt *rpc, void *mydata, void *data)
+{
+    int ret = -1; /* returned as-is; stays -1 if data is NULL */
+    struct iovec *iov = NULL;
+    struct gf_upcall upcall_data = {
+        0,
+    };
+    struct gf_upcall_entrylk_contention lc = {
+        0,
+    };
+    gfs4_entrylk_contention_req proto_lc = {
+        {
+            0,
+        },
+    };
+
+    GF_VALIDATE_OR_GOTO("client-callback", data, out);
+
+    iov = (struct iovec *)data; /* data is decoded below as an XDR request */
+    ret = xdr_to_generic(*iov, &proto_lc,
+                         (xdrproc_t)xdr_gfs4_entrylk_contention_req);
+
+    if (ret < 0) {
+        gf_smsg(THIS->name, GF_LOG_WARNING, -ret,
+                PC_MSG_ENTRYLK_CONTENTION_FAIL, NULL);
+        goto out;
+    }
+
+    upcall_data.data = &lc; /* conversion target for the call below */
+    ret = gf_proto_entrylk_contention_to_upcall(&proto_lc, &upcall_data);
+    if (ret < 0)
+        goto out;
+
+    upcall_data.event_type = GF_UPCALL_ENTRYLK_CONTENTION;
+
+    default_notify(THIS, GF_EVENT_UPCALL, &upcall_data);
+
+out: /* release what decoding/conversion may have allocated */
+    if (proto_lc.name)
+        free(proto_lc.name);
+
+    if (proto_lc.domain)
+        free(proto_lc.domain);
+
+    if (proto_lc.xdata.xdata_val)
+        free(proto_lc.xdata.xdata_val);
+
+    if (lc.xdata)
+        dict_unref(lc.xdata);
+
+    return ret;
+}
+
+static rpcclnt_cb_actor_t gluster_cbk_actors[GF_CBK_MAXVALUE] = {
+    [GF_CBK_NULL] = {"NULL", client_cbk_null, GF_CBK_NULL}, /* name, fn, id */
+    [GF_CBK_FETCHSPEC] = {"FETCHSPEC", client_cbk_fetchspec, GF_CBK_FETCHSPEC},
+    [GF_CBK_INO_FLUSH] = {"INO_FLUSH", client_cbk_ino_flush, GF_CBK_INO_FLUSH},
+    [GF_CBK_CACHE_INVALIDATION] = {"CACHE_INVALIDATION",
+                                   client_cbk_cache_invalidation,
+                                   GF_CBK_CACHE_INVALIDATION},
+    [GF_CBK_CHILD_UP] = {"CHILD_UP", client_cbk_child_up, GF_CBK_CHILD_UP},
+    [GF_CBK_CHILD_DOWN] = {"CHILD_DOWN", client_cbk_child_down,
+                           GF_CBK_CHILD_DOWN},
+    [GF_CBK_RECALL_LEASE] = {"RECALL_LEASE", client_cbk_recall_lease,
+                             GF_CBK_RECALL_LEASE},
+    [GF_CBK_INODELK_CONTENTION] = {"INODELK_CONTENTION",
+                                   client_cbk_inodelk_contention,
+                                   GF_CBK_INODELK_CONTENTION},
+    [GF_CBK_ENTRYLK_CONTENTION] = {"ENTRYLK_CONTENTION",
+                                   client_cbk_entrylk_contention,
+                                   GF_CBK_ENTRYLK_CONTENTION},
+}; /* each slot is indexed by, and repeats, its GF_CBK_* value */
+
+struct rpcclnt_cb_program gluster_cbk_prog = { /* callback program descriptor */
+    .progname = "GlusterFS Callback",
+    .prognum = GLUSTER_CBK_PROGRAM,
+    .progver = GLUSTER_CBK_VERSION,
+    .actors = gluster_cbk_actors, /* handler table defined in this file */
+    .numactors = GF_CBK_MAXVALUE, /* same bound as the actors array size */
+};
diff --git a/xlators/protocol/client/src/client-common.c b/xlators/protocol/client/src/client-common.c
new file mode 100644
index 00000000000..c112820e407
--- /dev/null
+++ b/xlators/protocol/client/src/client-common.c
@@ -0,0 +1,3589 @@
+/*
+  Copyright (c) 2016 Red Hat, Inc. <http://www.redhat.com>
+  This file is part of GlusterFS.
+
+  This file is licensed to you under your choice of the GNU Lesser
+  General Public License, version 3 or any later version (LGPLv3 or
+  later), or the GNU General Public License, version 2 (GPLv2), in all
+  cases as published by the Free Software Foundation.
+*/
+
+#include <glusterfs/dict.h>
+#include <glusterfs/xlator.h>
+#include "rpc-common-xdr.h"
+#include "glusterfs3-xdr.h"
+#include "glusterfs4-xdr.h"
+#include "glusterfs3.h"
+#include "client.h"
+
+/* processing to be done before fops are wound down */
+int
+client_pre_stat(xlator_t *this, gfs3_stat_req *req, loc_t *loc, dict_t *xdata)
+{
+    int op_errno = ESTALE; /* used when loc or loc->inode is missing */
+
+    if (!loc || !loc->inode)
+        goto out;
+
+    /* Take the gfid from the inode when it is set, otherwise from loc. */
+    memcpy(req->gfid,
+           gf_uuid_is_null(loc->inode->gfid) ? loc->gfid : loc->inode->gfid,
+           16);
+
+    GF_ASSERT_AND_GOTO_WITH_ERROR(this->name,
+                                  !gf_uuid_is_null(*((uuid_t *)req->gfid)), out,
+                                  op_errno, EINVAL);
+
+    GF_PROTOCOL_DICT_SERIALIZE(this, xdata, (&req->xdata.xdata_val),
+                               req->xdata.xdata_len, op_errno, out);
+
+    return 0;
+out:
+    return -op_errno; /* negated errno on failure */
+}
+
+int
+client_pre_readlink(xlator_t *this, gfs3_readlink_req *req, loc_t *loc,
+                    size_t size, dict_t *xdata)
+{
+    int op_errno = ESTALE; /* used when loc or loc->inode is missing */
+
+    if (!loc || !loc->inode)
+        goto out;
+
+    /* Take the gfid from the inode when it is set, otherwise from loc. */
+    memcpy(req->gfid,
+           gf_uuid_is_null(loc->inode->gfid) ? loc->gfid : loc->inode->gfid,
+           16);
+
+    GF_ASSERT_AND_GOTO_WITH_ERROR(this->name,
+                                  !gf_uuid_is_null(*((uuid_t *)req->gfid)), out,
+                                  op_errno, EINVAL);
+    req->size = size;
+    GF_PROTOCOL_DICT_SERIALIZE(this, xdata, (&req->xdata.xdata_val),
+                               req->xdata.xdata_len, op_errno, out);
+    return 0;
+out:
+    return -op_errno; /* negated errno on failure */
+}
+
+int
+client_pre_mknod(xlator_t *this, gfs3_mknod_req *req, loc_t *loc, mode_t mode,
+                 dev_t rdev, mode_t umask, dict_t *xdata)
+{
+    int op_errno = ESTALE; /* used when loc or loc->parent is missing */
+
+    if (!loc || !loc->parent)
+        goto out;
+
+    memcpy(req->pargfid,
+           gf_uuid_is_null(loc->parent->gfid) ? loc->pargfid
+                                              : loc->parent->gfid,
+           16);
+
+    GF_ASSERT_AND_GOTO_WITH_ERROR(this->name,
+                                  !gf_uuid_is_null(*((uuid_t *)req->pargfid)),
+                                  out, op_errno, EINVAL);
+    req->umask = umask;
+    req->dev = rdev;
+    req->mode = mode;
+    req->bname = (char *)loc->name;
+
+    GF_PROTOCOL_DICT_SERIALIZE(this, xdata, (&req->xdata.xdata_val),
+                               req->xdata.xdata_len, op_errno, out);
+
+    return 0;
+out:
+    return -op_errno; /* negated errno on failure */
+}
+
+int
+client_pre_mkdir(xlator_t *this, gfs3_mkdir_req *req, loc_t *loc, mode_t mode,
+                 mode_t umask, dict_t *xdata)
+{
+    int op_errno = ESTALE; /* used when loc or loc->parent is missing */
+
+    if (!loc || !loc->parent)
+        goto out;
+
+    memcpy(req->pargfid,
+           gf_uuid_is_null(loc->parent->gfid) ? loc->pargfid
+                                              : loc->parent->gfid,
+           16);
+
+    GF_ASSERT_AND_GOTO_WITH_ERROR(this->name,
+                                  !gf_uuid_is_null(*((uuid_t *)req->pargfid)),
+                                  out, op_errno, EINVAL);
+
+    req->umask = umask;
+    req->mode = mode;
+    req->bname = (char *)loc->name;
+
+    GF_PROTOCOL_DICT_SERIALIZE(this, xdata, (&req->xdata.xdata_val),
+                               req->xdata.xdata_len, op_errno, out);
+
+    return 0;
+out:
+    return -op_errno; /* negated errno on failure */
+}
+
+int /* 0 on success, negated errno on failure */
+client_pre_unlink(xlator_t *this, gfs3_unlink_req *req, loc_t *loc,
+                  int32_t flags, dict_t *xdata)
+{
+    int op_errno = ESTALE; /* a 0 here made a missing loc/parent return 0 */
+
+    if (!(loc && loc->parent))
+        goto out;
+
+    if (!gf_uuid_is_null(loc->parent->gfid)) /* prefer the parent inode gfid */
+        memcpy(req->pargfid, loc->parent->gfid, 16);
+    else
+        memcpy(req->pargfid, loc->pargfid, 16);
+
+    GF_ASSERT_AND_GOTO_WITH_ERROR(this->name,
+                                  !gf_uuid_is_null(*((uuid_t *)req->pargfid)),
+                                  out, op_errno, EINVAL);
+    req->bname = (char *)loc->name;
+    req->xflags = flags;
+
+    GF_PROTOCOL_DICT_SERIALIZE(this, xdata, (&req->xdata.xdata_val),
+                               req->xdata.xdata_len, op_errno, out);
+
+    return 0;
+out:
+    return -op_errno;
+}
+
+int
+client_pre_rmdir(xlator_t *this, gfs3_rmdir_req *req, loc_t *loc, int32_t flags,
+                 dict_t *xdata)
+{
+    int op_errno = ESTALE; /* used when loc or loc->parent is missing */
+
+    if (!loc || !loc->parent)
+        goto out;
+
+    memcpy(req->pargfid,
+           gf_uuid_is_null(loc->parent->gfid) ? loc->pargfid
+                                              : loc->parent->gfid,
+           16);
+
+    GF_ASSERT_AND_GOTO_WITH_ERROR(this->name,
+                                  !gf_uuid_is_null(*((uuid_t *)req->pargfid)),
+                                  out, op_errno, EINVAL);
+    req->xflags = flags;
+    req->bname = (char *)loc->name;
+
+    GF_PROTOCOL_DICT_SERIALIZE(this, xdata, (&req->xdata.xdata_val),
+                               req->xdata.xdata_len, op_errno, out);
+
+    return 0;
+out:
+    return -op_errno; /* negated errno on failure */
+}
+
+int
+client_pre_symlink(xlator_t *this, gfs3_symlink_req *req, loc_t *loc,
+                   const char *linkname, mode_t umask, dict_t *xdata)
+{
+    int op_errno = ESTALE; /* used when loc or loc->parent is missing */
+
+    if (!loc || !loc->parent)
+        goto out;
+
+    memcpy(req->pargfid,
+           gf_uuid_is_null(loc->parent->gfid) ? loc->pargfid
+                                              : loc->parent->gfid,
+           16);
+
+    GF_ASSERT_AND_GOTO_WITH_ERROR(this->name,
+                                  !gf_uuid_is_null(*((uuid_t *)req->pargfid)),
+                                  out, op_errno, EINVAL);
+    req->umask = umask;
+    req->bname = (char *)loc->name;
+    req->linkname = (char *)linkname;
+
+    GF_PROTOCOL_DICT_SERIALIZE(this, xdata, (&req->xdata.xdata_val),
+                               req->xdata.xdata_len, op_errno, out);
+    return 0;
+out:
+    return -op_errno; /* negated errno on failure */
+}
+
+int
+client_pre_rename(xlator_t *this, gfs3_rename_req *req, loc_t *oldloc,
+                  loc_t *newloc, dict_t *xdata)
+{
+    int op_errno = ESTALE; /* used when either loc or its parent is missing */
+
+    if (!oldloc || !newloc || !oldloc->parent || !newloc->parent)
+        goto out;
+
+    memcpy(req->oldgfid,
+           gf_uuid_is_null(oldloc->parent->gfid) ? oldloc->pargfid
+                                                 : oldloc->parent->gfid,
+           16);
+
+    memcpy(req->newgfid,
+           gf_uuid_is_null(newloc->parent->gfid) ? newloc->pargfid
+                                                 : newloc->parent->gfid,
+           16);
+
+    GF_ASSERT_AND_GOTO_WITH_ERROR(this->name,
+                                  !gf_uuid_is_null(*((uuid_t *)req->oldgfid)),
+                                  out, op_errno, EINVAL);
+    GF_ASSERT_AND_GOTO_WITH_ERROR(this->name,
+                                  !gf_uuid_is_null(*((uuid_t *)req->newgfid)),
+                                  out, op_errno, EINVAL);
+    req->newbname = (char *)newloc->name;
+    req->oldbname = (char *)oldloc->name;
+
+    GF_PROTOCOL_DICT_SERIALIZE(this, xdata, (&req->xdata.xdata_val),
+                               req->xdata.xdata_len, op_errno, out);
+
+    return 0;
+out:
+    return -op_errno; /* negated errno on failure */
+}
+
+int
+client_pre_link(xlator_t *this, gfs3_link_req *req, loc_t *oldloc,
+                loc_t *newloc, dict_t *xdata)
+{
+    int op_errno = ESTALE; /* used when an inode/parent below is missing */
+
+    if (!oldloc || !oldloc->inode || !newloc || !newloc->parent)
+        goto out;
+
+    memcpy(req->oldgfid,
+           gf_uuid_is_null(oldloc->inode->gfid) ? oldloc->gfid
+                                                : oldloc->inode->gfid,
+           16);
+
+    memcpy(req->newgfid,
+           gf_uuid_is_null(newloc->parent->gfid) ? newloc->pargfid
+                                                 : newloc->parent->gfid,
+           16);
+
+    GF_ASSERT_AND_GOTO_WITH_ERROR(this->name,
+                                  !gf_uuid_is_null(*((uuid_t *)req->oldgfid)),
+                                  out, op_errno, EINVAL);
+    GF_ASSERT_AND_GOTO_WITH_ERROR(this->name,
+                                  !gf_uuid_is_null(*((uuid_t *)req->newgfid)),
+                                  out, op_errno, EINVAL);
+    req->newbname = (char *)newloc->name;
+
+    GF_PROTOCOL_DICT_SERIALIZE(this, xdata, (&req->xdata.xdata_val),
+                               req->xdata.xdata_len, op_errno, out);
+
+    return 0;
+out:
+    return -op_errno; /* negated errno on failure */
+}
+
+int
+client_pre_truncate(xlator_t *this, gfs3_truncate_req *req, loc_t *loc,
+                    off_t offset, dict_t *xdata)
+{
+    int op_errno = ESTALE; /* used when loc or loc->inode is missing */
+
+    if (!loc || !loc->inode)
+        goto out;
+
+    /* Take the gfid from the inode when it is set, otherwise from loc. */
+    memcpy(req->gfid,
+           gf_uuid_is_null(loc->inode->gfid) ? loc->gfid : loc->inode->gfid,
+           16);
+
+    GF_ASSERT_AND_GOTO_WITH_ERROR(this->name,
+                                  !gf_uuid_is_null(*((uuid_t *)req->gfid)), out,
+                                  op_errno, EINVAL);
+    req->offset = offset;
+
+    GF_PROTOCOL_DICT_SERIALIZE(this, xdata, (&req->xdata.xdata_val),
+                               req->xdata.xdata_len, op_errno, out);
+    return 0;
+out:
+    return -op_errno; /* negated errno on failure */
+}
+
+int
+client_pre_open(xlator_t *this, gfs3_open_req *req, loc_t *loc, fd_t *fd,
+                int32_t flags, dict_t *xdata)
+{
+    int op_errno = ESTALE; /* used when loc or loc->inode is missing */
+
+    if (!loc || !loc->inode)
+        goto out;
+
+    /* Take the gfid from the inode when it is set, otherwise from loc. */
+    memcpy(req->gfid,
+           gf_uuid_is_null(loc->inode->gfid) ? loc->gfid : loc->inode->gfid,
+           16);
+
+    GF_ASSERT_AND_GOTO_WITH_ERROR(this->name,
+                                  !gf_uuid_is_null(*((uuid_t *)req->gfid)), out,
+                                  op_errno, EINVAL);
+    req->flags = gf_flags_from_flags(flags); /* fd is not used here */
+
+    GF_PROTOCOL_DICT_SERIALIZE(this, xdata, (&req->xdata.xdata_val),
+                               req->xdata.xdata_len, op_errno, out);
+
+    return 0;
+out:
+    return -op_errno; /* negated errno on failure */
+}
+
+int
+client_pre_readv(xlator_t *this, gfs3_read_req *req, fd_t *fd, size_t size,
+                 off_t offset, int32_t flags, dict_t *xdata)
+{
+    int op_errno = ESTALE;
+    int64_t remote_fd = -1;
+
+    CLIENT_GET_REMOTE_FD(this, fd, FALLBACK_TO_ANON_FD, remote_fd, op_errno,
+                         out);
+
+    memcpy(req->gfid, fd->inode->gfid, 16);
+
+    req->fd = remote_fd;
+    req->flag = flags;
+    req->offset = offset;
+    req->size = size;
+
+    GF_PROTOCOL_DICT_SERIALIZE(this, xdata, (&req->xdata.xdata_val),
+                               req->xdata.xdata_len, op_errno, out);
+
+    return 0;
+out:
+    return -op_errno; /* negated errno on failure */
+}
+
+int /* 0 on success, negated errno on failure */
+client_pre_writev(xlator_t *this, gfs3_write_req *req, fd_t *fd, size_t size,
+                  off_t offset, int32_t flags, dict_t **xdata)
+{
+    int64_t remote_fd = -1;
+    int op_errno = ESTALE;
+
+    CLIENT_GET_REMOTE_FD(this, fd, FALLBACK_TO_ANON_FD, remote_fd, op_errno,
+                         out);
+
+    req->size = size;
+    req->offset = offset;
+    req->fd = remote_fd;
+    req->flag = flags;
+
+    memcpy(req->gfid, fd->inode->gfid, 16);
+
+#ifdef GF_TESTING_IO_XDATA
+    if (!*xdata) /* test builds create the dict when the caller passed none */
+        *xdata = dict_new();
+
+    (void)dict_set_str(*xdata, "testing-the-xdata-key", /* no 'ret' local */
+                       "testing-the-xdata-value");
+#endif
+
+    GF_PROTOCOL_DICT_SERIALIZE(this, *xdata, (&req->xdata.xdata_val),
+                               req->xdata.xdata_len, op_errno, out);
+
+    return 0;
+out:
+    return -op_errno;
+}
+
+int
+client_pre_statfs(xlator_t *this, gfs3_statfs_req *req, loc_t *loc,
+                  dict_t *xdata)
+{
+    int op_errno = ESTALE; /* used when loc is missing */
+
+    if (!loc)
+        goto out;
+
+    if (!loc->inode) {
+        req->gfid[15] = 1; /* no inode: only the last gfid byte is set */
+    } else {
+        memcpy(req->gfid,
+               gf_uuid_is_null(loc->inode->gfid) ? loc->gfid
+                                                 : loc->inode->gfid,
+               16);
+    }
+
+    GF_ASSERT_AND_GOTO_WITH_ERROR(this->name,
+                                  !gf_uuid_is_null(*((uuid_t *)req->gfid)), out,
+                                  op_errno, EINVAL);
+
+    GF_PROTOCOL_DICT_SERIALIZE(this, xdata, (&req->xdata.xdata_val),
+                               req->xdata.xdata_len, op_errno, out);
+
+    return 0;
+out:
+    return -op_errno; /* negated errno on failure */
+}
+
+int
+client_pre_flush(xlator_t *this, gfs3_flush_req *req, fd_t *fd, dict_t *xdata)
+{
+    int op_errno = ESTALE;
+    int64_t remote_fd = -1;
+
+    CLIENT_GET_REMOTE_FD(this, fd, DEFAULT_REMOTE_FD, remote_fd, op_errno, out);
+
+    memcpy(req->gfid, fd->inode->gfid, 16);
+    req->fd = remote_fd;
+
+    GF_PROTOCOL_DICT_SERIALIZE(this, xdata, (&req->xdata.xdata_val),
+                               req->xdata.xdata_len, op_errno, out);
+
+    return 0;
+out:
+    return -op_errno; /* negated errno on failure */
+}
+
+/*
+ * Fill a gfs3 fsync request: remote fd, fsync flags (stored in req->data),
+ * inode gfid and serialized @xdata.
+ *
+ * fd->inode is dereferenced without a NULL check.
+ *
+ * Returns 0 on success, -op_errno when a macro jumps to 'out'.
+ */
+int
+client_pre_fsync(xlator_t *this, gfs3_fsync_req *req, fd_t *fd, int32_t flags,
+                 dict_t *xdata)
+{
+    int64_t remote_fd = -1;
+    /* Start from a real error code, like the sibling fd-based helpers: with
+     * an initial 0, any jump to 'out' that did not set op_errno would be
+     * reported to the caller as success. */
+    int op_errno = ESTALE;
+
+    CLIENT_GET_REMOTE_FD(this, fd, FALLBACK_TO_ANON_FD, remote_fd, op_errno,
+                         out);
+
+    req->fd = remote_fd;
+    req->data = flags;
+    memcpy(req->gfid, fd->inode->gfid, 16);
+
+    GF_PROTOCOL_DICT_SERIALIZE(this, xdata, (&req->xdata.xdata_val),
+                               req->xdata.xdata_len, op_errno, out);
+
+    return 0;
+out:
+    return -op_errno;
+}
+
+/*
+ * Fill a gfs3 setxattr request from @loc.
+ *
+ * The gfid comes from loc->inode->gfid when non-null, else loc->gfid. @xattr
+ * (when given) is serialized into req->dict and @xdata into req->xdata.
+ * A missing loc or inode fails with ESTALE, a null gfid with EINVAL.
+ *
+ * Returns 0 on success, -op_errno otherwise.
+ */
+int
+client_pre_setxattr(xlator_t *this, gfs3_setxattr_req *req, loc_t *loc,
+                    dict_t *xattr, int32_t flags, dict_t *xdata)
+{
+    int op_errno = ESTALE;
+    const void *gfid_src;
+
+    if (!loc || !loc->inode)
+        goto out;
+
+    gfid_src = loc->gfid;
+    if (!gf_uuid_is_null(loc->inode->gfid))
+        gfid_src = loc->inode->gfid;
+    memcpy(req->gfid, gfid_src, 16);
+
+    GF_ASSERT_AND_GOTO_WITH_ERROR(this->name,
+                                  !gf_uuid_is_null(*((uuid_t *)req->gfid)), out,
+                                  op_errno, EINVAL);
+
+    if (xattr) {
+        GF_PROTOCOL_DICT_SERIALIZE(this, xattr, (&req->dict.dict_val),
+                                   req->dict.dict_len, op_errno, out);
+    }
+
+    req->flags = flags;
+
+    GF_PROTOCOL_DICT_SERIALIZE(this, xdata, (&req->xdata.xdata_val),
+                               req->xdata.xdata_len, op_errno, out);
+
+    return 0;
+
+out:
+    return -op_errno;
+}
+
+/*
+ * Fill a gfs3 getxattr request from @loc and the attribute @name.
+ *
+ * A NULL @loc fails with EINVAL. The gfid comes from loc->inode->gfid when
+ * an inode with a non-null gfid is present, else from loc->gfid; a null
+ * result fails with EINVAL. req->namelen is used as a flag: 1 when @name is
+ * given, 0 (with an empty name) when it is NULL.
+ *
+ * Returns 0 on success, -op_errno otherwise.
+ */
+int
+client_pre_getxattr(xlator_t *this, gfs3_getxattr_req *req, loc_t *loc,
+                    const char *name, dict_t *xdata)
+{
+    int op_errno = ESTALE;
+    const void *gfid_src;
+
+    if (loc == NULL) {
+        op_errno = EINVAL;
+        goto out;
+    }
+
+    gfid_src = loc->gfid;
+    if (loc->inode && !gf_uuid_is_null(loc->inode->gfid))
+        gfid_src = loc->inode->gfid;
+    memcpy(req->gfid, gfid_src, 16);
+
+    GF_ASSERT_AND_GOTO_WITH_ERROR(this->name,
+                                  !gf_uuid_is_null(*((uuid_t *)req->gfid)), out,
+                                  op_errno, EINVAL);
+
+    if (name) {
+        req->name = (char *)name;
+        req->namelen = 1; /* Use it as a flag */
+    } else {
+        req->name = "";
+        req->namelen = 0;
+    }
+
+    GF_PROTOCOL_DICT_SERIALIZE(this, xdata, (&req->xdata.xdata_val),
+                               req->xdata.xdata_len, op_errno, out);
+
+    return 0;
+
+out:
+    return -op_errno;
+}
+
+/*
+ * Fill a gfs3 removexattr request: gfid (inode's when non-null, else the one
+ * in @loc), attribute @name and serialized @xdata.
+ *
+ * A missing loc or inode fails with ESTALE, a null gfid with EINVAL.
+ *
+ * Returns 0 on success, -op_errno otherwise.
+ */
+int
+client_pre_removexattr(xlator_t *this, gfs3_removexattr_req *req, loc_t *loc,
+                       const char *name, dict_t *xdata)
+{
+    int op_errno = ESTALE;
+
+    if (!loc || !loc->inode)
+        goto out;
+
+    if (gf_uuid_is_null(loc->inode->gfid))
+        memcpy(req->gfid, loc->gfid, 16);
+    else
+        memcpy(req->gfid, loc->inode->gfid, 16);
+
+    GF_ASSERT_AND_GOTO_WITH_ERROR(this->name,
+                                  !gf_uuid_is_null(*((uuid_t *)req->gfid)), out,
+                                  op_errno, EINVAL);
+
+    req->name = (char *)name;
+
+    GF_PROTOCOL_DICT_SERIALIZE(this, xdata, (&req->xdata.xdata_val),
+                               req->xdata.xdata_len, op_errno, out);
+
+    return 0;
+
+out:
+    return -op_errno;
+}
+
+/*
+ * Fill a gfs3 opendir request: gfid (inode's when non-null, else the one in
+ * @loc) and serialized @xdata. @fd is accepted but not used here.
+ *
+ * A missing loc or inode fails with ESTALE, a null gfid with EINVAL.
+ *
+ * Returns 0 on success, -op_errno otherwise.
+ */
+int
+client_pre_opendir(xlator_t *this, gfs3_opendir_req *req, loc_t *loc, fd_t *fd,
+                   dict_t *xdata)
+{
+    int op_errno = ESTALE;
+    const void *gfid_src;
+
+    if (!loc || !loc->inode)
+        goto out;
+
+    gfid_src = loc->gfid;
+    if (!gf_uuid_is_null(loc->inode->gfid))
+        gfid_src = loc->inode->gfid;
+    memcpy(req->gfid, gfid_src, 16);
+
+    GF_ASSERT_AND_GOTO_WITH_ERROR(this->name,
+                                  !gf_uuid_is_null(*((uuid_t *)req->gfid)), out,
+                                  op_errno, EINVAL);
+
+    GF_PROTOCOL_DICT_SERIALIZE(this, xdata, (&req->xdata.xdata_val),
+                               req->xdata.xdata_len, op_errno, out);
+
+    return 0;
+
+out:
+    return -op_errno;
+}
+
+/*
+ * Fill a gfs3 fsyncdir request: remote fd, flags (stored in req->data),
+ * inode gfid and serialized @xdata.
+ *
+ * fd->inode is dereferenced without a NULL check.
+ *
+ * Returns 0 on success, -op_errno when a macro jumps to 'out'.
+ */
+int
+client_pre_fsyncdir(xlator_t *this, gfs3_fsyncdir_req *req, fd_t *fd,
+                    int32_t flags, dict_t *xdata)
+{
+    int32_t op_errno = ESTALE;
+    int64_t remote_fd = -1;
+
+    CLIENT_GET_REMOTE_FD(this, fd, DEFAULT_REMOTE_FD, remote_fd, op_errno, out);
+
+    memcpy(req->gfid, fd->inode->gfid, 16);
+    req->data = flags;
+    req->fd = remote_fd;
+
+    GF_PROTOCOL_DICT_SERIALIZE(this, xdata, (&req->xdata.xdata_val),
+                               req->xdata.xdata_len, op_errno, out);
+
+    return 0;
+
+out:
+    return -op_errno;
+}
+
+/*
+ * Fill a gfs3 access request: gfid (inode's when non-null, else the one in
+ * @loc), the access @mask and serialized @xdata.
+ *
+ * A missing loc or inode fails with ESTALE, a null gfid with EINVAL.
+ *
+ * Returns 0 on success, -op_errno otherwise.
+ */
+int
+client_pre_access(xlator_t *this, gfs3_access_req *req, loc_t *loc,
+                  int32_t mask, dict_t *xdata)
+{
+    int op_errno = ESTALE;
+
+    if (!loc || !loc->inode)
+        goto out;
+
+    if (gf_uuid_is_null(loc->inode->gfid))
+        memcpy(req->gfid, loc->gfid, 16);
+    else
+        memcpy(req->gfid, loc->inode->gfid, 16);
+
+    GF_ASSERT_AND_GOTO_WITH_ERROR(this->name,
+                                  !gf_uuid_is_null(*((uuid_t *)req->gfid)), out,
+                                  op_errno, EINVAL);
+
+    req->mask = mask;
+
+    GF_PROTOCOL_DICT_SERIALIZE(this, xdata, (&req->xdata.xdata_val),
+                               req->xdata.xdata_len, op_errno, out);
+
+    return 0;
+
+out:
+    return -op_errno;
+}
+
+/*
+ * Fill a gfs3 create request.
+ *
+ * The parent gfid comes from loc->parent->gfid when non-null, else from
+ * loc->pargfid. The basename, mode, converted open flags and umask are
+ * copied into @req, and @xdata is serialized. @fd is accepted but not used.
+ *
+ * A missing loc or parent fails with ESTALE, a null parent gfid with EINVAL.
+ *
+ * Returns 0 on success, -op_errno otherwise.
+ */
+int
+client_pre_create(xlator_t *this, gfs3_create_req *req, loc_t *loc, fd_t *fd,
+                  mode_t mode, int32_t flags, mode_t umask, dict_t *xdata)
+{
+    int op_errno = ESTALE;
+    const void *pargfid_src;
+
+    if (!loc || !loc->parent)
+        goto out;
+
+    pargfid_src = loc->pargfid;
+    if (!gf_uuid_is_null(loc->parent->gfid))
+        pargfid_src = loc->parent->gfid;
+    memcpy(req->pargfid, pargfid_src, 16);
+
+    GF_ASSERT_AND_GOTO_WITH_ERROR(this->name,
+                                  !gf_uuid_is_null(*((uuid_t *)req->pargfid)),
+                                  out, op_errno, EINVAL);
+
+    req->bname = (char *)loc->name;
+    req->mode = mode;
+    req->flags = gf_flags_from_flags(flags);
+    req->umask = umask;
+
+    GF_PROTOCOL_DICT_SERIALIZE(this, xdata, (&req->xdata.xdata_val),
+                               req->xdata.xdata_len, op_errno, out);
+
+    return 0;
+
+out:
+    return -op_errno;
+}
+
+/*
+ * Fill a gfs3 ftruncate request: remote fd, target @offset, inode gfid and
+ * serialized @xdata. The default error here is EINVAL.
+ *
+ * fd->inode is dereferenced without a NULL check.
+ *
+ * Returns 0 on success, -op_errno when a macro jumps to 'out'.
+ */
+int
+client_pre_ftruncate(xlator_t *this, gfs3_ftruncate_req *req, fd_t *fd,
+                     off_t offset, dict_t *xdata)
+{
+    int64_t remote_fd = -1;
+    int op_errno = EINVAL;
+
+    CLIENT_GET_REMOTE_FD(this, fd, DEFAULT_REMOTE_FD, remote_fd, op_errno, out);
+
+    memcpy(req->gfid, fd->inode->gfid, 16);
+    req->fd = remote_fd;
+    req->offset = offset;
+
+    GF_PROTOCOL_DICT_SERIALIZE(this, xdata, (&req->xdata.xdata_val),
+                               req->xdata.xdata_len, op_errno, out);
+
+    return 0;
+
+out:
+    return -op_errno;
+}
+
+/*
+ * Fill a gfs3 fstat request: remote fd, inode gfid and serialized @xdata.
+ *
+ * fd->inode is dereferenced without a NULL check.
+ *
+ * Returns 0 on success, -op_errno when a macro jumps to 'out'.
+ */
+int
+client_pre_fstat(xlator_t *this, gfs3_fstat_req *req, fd_t *fd, dict_t *xdata)
+{
+    int64_t remote_fd = -1;
+    int op_errno = ESTALE;
+
+    CLIENT_GET_REMOTE_FD(this, fd, DEFAULT_REMOTE_FD, remote_fd, op_errno, out);
+
+    memcpy(req->gfid, fd->inode->gfid, 16);
+    req->fd = remote_fd;
+
+    GF_PROTOCOL_DICT_SERIALIZE(this, xdata, (&req->xdata.xdata_val),
+                               req->xdata.xdata_len, op_errno, out);
+
+    return 0;
+
+out:
+    return -op_errno;
+}
+
+/*
+ * Fill a gfs3 lk (POSIX lock) request.
+ *
+ * Resolves the remote fd, converts @cmd with client_cmd_to_gf_cmd() (a
+ * non-zero result is logged and fails with EINVAL), maps flock->l_type to
+ * the GF_LK_F_* wire value (left at 0 for any other type), converts @flock
+ * to its wire form, copies the inode gfid and serializes @xdata.
+ *
+ * Returns 0 on success, -op_errno otherwise.
+ */
+int
+client_pre_lk(xlator_t *this, gfs3_lk_req *req, int32_t cmd,
+              struct gf_flock *flock, fd_t *fd, dict_t *xdata)
+{
+    int64_t remote_fd = -1;
+    int op_errno = ESTALE;
+    int32_t gf_cmd = 0;
+    int32_t gf_type = 0;
+    int ret = 0;
+
+    CLIENT_GET_REMOTE_FD(this, fd, DEFAULT_REMOTE_FD, remote_fd, op_errno, out);
+
+    ret = client_cmd_to_gf_cmd(cmd, &gf_cmd);
+    if (ret != 0) {
+        op_errno = EINVAL;
+        gf_smsg(this->name, GF_LOG_WARNING, EINVAL, PC_MSG_UNKNOWN_CMD,
+                "gf_cmd=%d", gf_cmd, NULL);
+        goto out;
+    }
+
+    /* Translate the lock type; unrecognised values keep gf_type at 0. */
+    if (flock->l_type == F_RDLCK)
+        gf_type = GF_LK_F_RDLCK;
+    else if (flock->l_type == F_WRLCK)
+        gf_type = GF_LK_F_WRLCK;
+    else if (flock->l_type == F_UNLCK)
+        gf_type = GF_LK_F_UNLCK;
+
+    req->fd = remote_fd;
+    req->cmd = gf_cmd;
+    req->type = gf_type;
+    gf_proto_flock_from_flock(&req->flock, flock);
+
+    memcpy(req->gfid, fd->inode->gfid, 16);
+
+    GF_PROTOCOL_DICT_SERIALIZE(this, xdata, (&req->xdata.xdata_val),
+                               req->xdata.xdata_len, op_errno, out);
+
+    return 0;
+
+out:
+    return -op_errno;
+}
+
+/*
+ * Fill a gfs3 lookup request from @loc.
+ *
+ * Parent gfid: loc->parent->gfid when a parent with a non-null gfid exists,
+ * else loc->pargfid. Entry gfid: loc->inode->gfid when non-null, else
+ * loc->gfid. The basename defaults to "" when loc->name is NULL. Neither
+ * gfid is validated here. @xdata is serialized only when given.
+ *
+ * A missing loc or inode fails with ESTALE.
+ *
+ * Returns 0 on success, -op_errno otherwise.
+ */
+int
+client_pre_lookup(xlator_t *this, gfs3_lookup_req *req, loc_t *loc,
+                  dict_t *xdata)
+{
+    int op_errno = ESTALE;
+
+    if (!loc || !loc->inode)
+        goto out;
+
+    if (loc->parent && !gf_uuid_is_null(loc->parent->gfid))
+        memcpy(req->pargfid, loc->parent->gfid, 16);
+    else
+        memcpy(req->pargfid, loc->pargfid, 16);
+
+    /* loc->inode is known to be non-NULL past the guard above. */
+    if (gf_uuid_is_null(loc->inode->gfid))
+        memcpy(req->gfid, loc->gfid, 16);
+    else
+        memcpy(req->gfid, loc->inode->gfid, 16);
+
+    req->bname = loc->name ? (char *)loc->name : "";
+
+    if (xdata) {
+        GF_PROTOCOL_DICT_SERIALIZE(this, xdata, (&req->xdata.xdata_val),
+                                   req->xdata.xdata_len, op_errno, out);
+    }
+
+    return 0;
+
+out:
+    return -op_errno;
+}
+
+/*
+ * Fill a gfs3 readdir request: remote fd, requested @size and @offset, inode
+ * gfid and serialized @xdata.
+ *
+ * fd->inode is dereferenced without a NULL check.
+ *
+ * Returns 0 on success, -op_errno when a macro jumps to 'out'.
+ */
+int
+client_pre_readdir(xlator_t *this, gfs3_readdir_req *req, fd_t *fd, size_t size,
+                   off_t offset, dict_t *xdata)
+{
+    int64_t remote_fd = -1;
+    int op_errno = ESTALE;
+
+    CLIENT_GET_REMOTE_FD(this, fd, DEFAULT_REMOTE_FD, remote_fd, op_errno, out);
+
+    memcpy(req->gfid, fd->inode->gfid, 16);
+    req->fd = remote_fd;
+    req->offset = offset;
+    req->size = size;
+
+    GF_PROTOCOL_DICT_SERIALIZE(this, xdata, (&req->xdata.xdata_val),
+                               req->xdata.xdata_len, op_errno, out);
+
+    return 0;
+
+out:
+    return -op_errno;
+}
+
+/*
+ * Fill a gfs3 inodelk request from @loc.
+ *
+ * Unlike most sibling helpers, the gfid stored in @loc is preferred here and
+ * loc->inode->gfid is the fallback. @cmd (F_GETLK/F_SETLK/F_SETLKW and their
+ * 64-bit variants) is mapped to GF_LK_*; any other value is logged and fails
+ * with EINVAL. flock->l_type is mapped to GF_LK_F_* (left at 0 for any other
+ * type).
+ *
+ * A missing loc or inode fails with ESTALE, a null gfid with EINVAL.
+ *
+ * Returns 0 on success, -op_errno otherwise.
+ */
+int
+client_pre_inodelk(xlator_t *this, gfs3_inodelk_req *req, loc_t *loc, int cmd,
+                   struct gf_flock *flock, const char *volume, dict_t *xdata)
+{
+    int op_errno = ESTALE;
+    int32_t gf_cmd = 0;
+    int32_t gf_type = 0;
+
+    if (!(loc && loc->inode))
+        goto out;
+
+    if (!gf_uuid_is_null(loc->gfid))
+        memcpy(req->gfid, loc->gfid, 16);
+    else
+        memcpy(req->gfid, loc->inode->gfid, 16);
+
+    GF_ASSERT_AND_GOTO_WITH_ERROR(this->name,
+                                  !gf_uuid_is_null(*((uuid_t *)req->gfid)), out,
+                                  op_errno, EINVAL);
+    if (cmd == F_GETLK || cmd == F_GETLK64)
+        gf_cmd = GF_LK_GETLK;
+    else if (cmd == F_SETLK || cmd == F_SETLK64)
+        gf_cmd = GF_LK_SETLK;
+    else if (cmd == F_SETLKW || cmd == F_SETLKW64)
+        gf_cmd = GF_LK_SETLKW;
+    else {
+        /* Log the command we were actually given: gf_cmd is still 0 on this
+         * branch, so logging it carried no information. */
+        gf_smsg(this->name, GF_LOG_WARNING, EINVAL, PC_MSG_UNKNOWN_CMD,
+                "cmd=%d", cmd, NULL);
+        op_errno = EINVAL;
+        goto out;
+    }
+
+    switch (flock->l_type) {
+        case F_RDLCK:
+            gf_type = GF_LK_F_RDLCK;
+            break;
+        case F_WRLCK:
+            gf_type = GF_LK_F_WRLCK;
+            break;
+        case F_UNLCK:
+            gf_type = GF_LK_F_UNLCK;
+            break;
+    }
+
+    req->volume = (char *)volume;
+    req->cmd = gf_cmd;
+    req->type = gf_type;
+    gf_proto_flock_from_flock(&req->flock, flock);
+
+    GF_PROTOCOL_DICT_SERIALIZE(this, xdata, (&req->xdata.xdata_val),
+                               req->xdata.xdata_len, op_errno, out);
+
+    return 0;
+out:
+    return -op_errno;
+}
+
+/*
+ * Fill a gfs3 finodelk request for an open @fd.
+ *
+ * Resolves the remote fd, maps @cmd (F_GETLK/F_SETLK/F_SETLKW and their
+ * 64-bit variants) to GF_LK_*, maps flock->l_type to GF_LK_F_* (left at 0
+ * for any other type), converts @flock to its wire form, copies the inode
+ * gfid and serializes @xdata.
+ *
+ * An unrecognised @cmd is logged and fails with EINVAL.
+ *
+ * Returns 0 on success, -op_errno otherwise.
+ */
+int
+client_pre_finodelk(xlator_t *this, gfs3_finodelk_req *req, fd_t *fd, int cmd,
+                    struct gf_flock *flock, const char *volume, dict_t *xdata)
+{
+    int op_errno = ESTALE;
+    int64_t remote_fd = -1;
+    int32_t gf_type = 0;
+    int32_t gf_cmd = 0;
+
+    CLIENT_GET_REMOTE_FD(this, fd, FALLBACK_TO_ANON_FD, remote_fd, op_errno,
+                         out);
+
+    if (cmd == F_GETLK || cmd == F_GETLK64)
+        gf_cmd = GF_LK_GETLK;
+    else if (cmd == F_SETLK || cmd == F_SETLK64)
+        gf_cmd = GF_LK_SETLK;
+    else if (cmd == F_SETLKW || cmd == F_SETLKW64)
+        gf_cmd = GF_LK_SETLKW;
+    else {
+        /* Log the command we were actually given (gf_cmd is still 0 here)
+         * and report EINVAL, matching client_pre_inodelk; previously this
+         * path fell through with the ESTALE default. */
+        gf_smsg(this->name, GF_LOG_WARNING, EINVAL, PC_MSG_UNKNOWN_CMD,
+                "cmd=%d", cmd, NULL);
+        op_errno = EINVAL;
+        goto out;
+    }
+
+    switch (flock->l_type) {
+        case F_RDLCK:
+            gf_type = GF_LK_F_RDLCK;
+            break;
+        case F_WRLCK:
+            gf_type = GF_LK_F_WRLCK;
+            break;
+        case F_UNLCK:
+            gf_type = GF_LK_F_UNLCK;
+            break;
+    }
+
+    req->volume = (char *)volume;
+    req->fd = remote_fd;
+    req->cmd = gf_cmd;
+    req->type = gf_type;
+    gf_proto_flock_from_flock(&req->flock, flock);
+    memcpy(req->gfid, fd->inode->gfid, 16);
+
+    GF_PROTOCOL_DICT_SERIALIZE(this, xdata, (&req->xdata.xdata_val),
+                               req->xdata.xdata_len, op_errno, out);
+    return 0;
+out:
+    return -op_errno;
+}
+
+/*
+ * Fill a gfs3 entrylk request from @loc.
+ *
+ * The gfid stored in @loc is preferred; loc->inode->gfid is the fallback.
+ * The entry name is @basename when given (req->namelen is then set to 1),
+ * else the empty string (req->namelen is left untouched).
+ *
+ * A missing loc or inode fails with ESTALE, a null gfid with EINVAL.
+ *
+ * Returns 0 on success, -op_errno otherwise.
+ */
+int
+client_pre_entrylk(xlator_t *this, gfs3_entrylk_req *req, loc_t *loc,
+                   entrylk_cmd cmd_entrylk, entrylk_type type,
+                   const char *volume, const char *basename, dict_t *xdata)
+{
+    int op_errno = ESTALE;
+    const void *gfid_src;
+
+    if (!loc || !loc->inode)
+        goto out;
+
+    gfid_src = loc->inode->gfid;
+    if (!gf_uuid_is_null(loc->gfid))
+        gfid_src = loc->gfid;
+    memcpy(req->gfid, gfid_src, 16);
+
+    GF_ASSERT_AND_GOTO_WITH_ERROR(this->name,
+                                  !gf_uuid_is_null(*((uuid_t *)req->gfid)), out,
+                                  op_errno, EINVAL);
+
+    req->cmd = cmd_entrylk;
+    req->type = type;
+    req->volume = (char *)volume;
+
+    if (basename) {
+        req->name = (char *)basename;
+        req->namelen = 1;
+    } else {
+        req->name = "";
+    }
+
+    GF_PROTOCOL_DICT_SERIALIZE(this, xdata, (&req->xdata.xdata_val),
+                               req->xdata.xdata_len, op_errno, out);
+
+    return 0;
+
+out:
+    return -op_errno;
+}
+
+/*
+ * Fill a gfs3 fentrylk request for an open @fd.
+ *
+ * Stores the remote fd, lock command and type, volume and inode gfid. The
+ * entry name is @basename when given (req->namelen is then set to 1), else
+ * the empty string (req->namelen is left untouched).
+ *
+ * Returns 0 on success, -op_errno when a macro jumps to 'out'.
+ */
+int
+client_pre_fentrylk(xlator_t *this, gfs3_fentrylk_req *req, fd_t *fd,
+                    entrylk_cmd cmd_entrylk, entrylk_type type,
+                    const char *volume, const char *basename, dict_t *xdata)
+{
+    int64_t remote_fd = -1;
+    int op_errno = ESTALE;
+
+    CLIENT_GET_REMOTE_FD(this, fd, DEFAULT_REMOTE_FD, remote_fd, op_errno, out);
+
+    req->fd = remote_fd;
+    req->cmd = cmd_entrylk;
+    req->type = type;
+    req->volume = (char *)volume;
+
+    if (basename) {
+        req->name = (char *)basename;
+        req->namelen = 1;
+    } else {
+        req->name = "";
+    }
+
+    memcpy(req->gfid, fd->inode->gfid, 16);
+
+    GF_PROTOCOL_DICT_SERIALIZE(this, xdata, (&req->xdata.xdata_val),
+                               req->xdata.xdata_len, op_errno, out);
+
+    return 0;
+
+out:
+    return -op_errno;
+}
+
+/*
+ * Fill a gfs3 xattrop request from @loc.
+ *
+ * The gfid comes from loc->inode->gfid when non-null, else loc->gfid. @xattr
+ * (when given) is serialized into req->dict and @xdata into req->xdata.
+ *
+ * A missing loc or inode fails with ESTALE, a null gfid with EINVAL.
+ *
+ * Returns 0 on success, -op_errno otherwise.
+ */
+int
+client_pre_xattrop(xlator_t *this, gfs3_xattrop_req *req, loc_t *loc,
+                   dict_t *xattr, int32_t flags, dict_t *xdata)
+{
+    int op_errno = ESTALE;
+
+    if (!loc || !loc->inode)
+        goto out;
+
+    if (gf_uuid_is_null(loc->inode->gfid))
+        memcpy(req->gfid, loc->gfid, 16);
+    else
+        memcpy(req->gfid, loc->inode->gfid, 16);
+
+    GF_ASSERT_AND_GOTO_WITH_ERROR(this->name,
+                                  !gf_uuid_is_null(*((uuid_t *)req->gfid)), out,
+                                  op_errno, EINVAL);
+
+    if (xattr) {
+        GF_PROTOCOL_DICT_SERIALIZE(this, xattr, (&req->dict.dict_val),
+                                   req->dict.dict_len, op_errno, out);
+    }
+
+    req->flags = flags;
+
+    GF_PROTOCOL_DICT_SERIALIZE(this, xdata, (&req->xdata.xdata_val),
+                               req->xdata.xdata_len, op_errno, out);
+
+    return 0;
+
+out:
+    return -op_errno;
+}
+
+/*
+ * Fill a gfs3 fxattrop request for an open @fd: remote fd, flags, inode
+ * gfid, @xattr serialized into req->dict (when given) and @xdata serialized
+ * into req->xdata.
+ *
+ * fd->inode is dereferenced without a NULL check.
+ *
+ * Returns 0 on success, -op_errno when a macro jumps to 'out'.
+ */
+int
+client_pre_fxattrop(xlator_t *this, gfs3_fxattrop_req *req, fd_t *fd,
+                    dict_t *xattr, int32_t flags, dict_t *xdata)
+{
+    int op_errno = ESTALE;
+    int64_t remote_fd = -1;
+
+    CLIENT_GET_REMOTE_FD(this, fd, FALLBACK_TO_ANON_FD, remote_fd, op_errno,
+                         out);
+
+    memcpy(req->gfid, fd->inode->gfid, 16);
+    req->flags = flags;
+    req->fd = remote_fd;
+
+    if (xattr) {
+        GF_PROTOCOL_DICT_SERIALIZE(this, xattr, (&req->dict.dict_val),
+                                   req->dict.dict_len, op_errno, out);
+    }
+
+    GF_PROTOCOL_DICT_SERIALIZE(this, xdata, (&req->xdata.xdata_val),
+                               req->xdata.xdata_len, op_errno, out);
+
+    return 0;
+
+out:
+    return -op_errno;
+}
+
+/*
+ * Fill a gfs3 fgetxattr request for an open @fd.
+ *
+ * req->namelen is used as a flag: 1 when @name is given, 0 (with an empty
+ * name) when it is NULL. The remote fd, inode gfid and serialized @xdata are
+ * stored as well.
+ *
+ * Returns 0 on success, -op_errno when a macro jumps to 'out'.
+ */
+int
+client_pre_fgetxattr(xlator_t *this, gfs3_fgetxattr_req *req, fd_t *fd,
+                     const char *name, dict_t *xdata)
+{
+    int64_t remote_fd = -1;
+    int op_errno = ESTALE;
+
+    CLIENT_GET_REMOTE_FD(this, fd, DEFAULT_REMOTE_FD, remote_fd, op_errno, out);
+
+    req->fd = remote_fd;
+
+    if (name) {
+        req->name = (char *)name;
+        req->namelen = 1; /* Use it as a flag */
+    } else {
+        req->name = "";
+        req->namelen = 0;
+    }
+
+    memcpy(req->gfid, fd->inode->gfid, 16);
+
+    GF_PROTOCOL_DICT_SERIALIZE(this, xdata, (&req->xdata.xdata_val),
+                               req->xdata.xdata_len, op_errno, out);
+
+    return 0;
+
+out:
+    return -op_errno;
+}
+
+/*
+ * Fill a gfs3 fsetxattr request for an open @fd: remote fd, flags, inode
+ * gfid, @xattr serialized into req->dict (when given) and @xdata serialized
+ * into req->xdata.
+ *
+ * fd->inode is dereferenced without a NULL check.
+ *
+ * Returns 0 on success, -op_errno when a macro jumps to 'out'.
+ */
+int
+client_pre_fsetxattr(xlator_t *this, gfs3_fsetxattr_req *req, fd_t *fd,
+                     int32_t flags, dict_t *xattr, dict_t *xdata)
+{
+    int64_t remote_fd = -1;
+    int op_errno = ESTALE;
+
+    CLIENT_GET_REMOTE_FD(this, fd, DEFAULT_REMOTE_FD, remote_fd, op_errno, out);
+
+    memcpy(req->gfid, fd->inode->gfid, 16);
+    req->flags = flags;
+    req->fd = remote_fd;
+
+    if (xattr) {
+        GF_PROTOCOL_DICT_SERIALIZE(this, xattr, (&req->dict.dict_val),
+                                   req->dict.dict_len, op_errno, out);
+    }
+
+    GF_PROTOCOL_DICT_SERIALIZE(this, xdata, (&req->xdata.xdata_val),
+                               req->xdata.xdata_len, op_errno, out);
+
+    return 0;
+
+out:
+    return -op_errno;
+}
+
+/*
+ * Fill a gfs3 rchecksum request: remote fd, @offset, @len and serialized
+ * @xdata. No gfid is written by this helper.
+ *
+ * Returns 0 on success, -op_errno when a macro jumps to 'out'.
+ */
+int
+client_pre_rchecksum(xlator_t *this, gfs3_rchecksum_req *req, fd_t *fd,
+                     int32_t len, off_t offset, dict_t *xdata)
+{
+    int64_t remote_fd = -1;
+    int op_errno = ESTALE;
+
+    CLIENT_GET_REMOTE_FD(this, fd, DEFAULT_REMOTE_FD, remote_fd, op_errno, out);
+
+    req->fd = remote_fd;
+    req->offset = offset;
+    req->len = len;
+
+    GF_PROTOCOL_DICT_SERIALIZE(this, xdata, (&req->xdata.xdata_val),
+                               req->xdata.xdata_len, op_errno, out);
+
+    return 0;
+
+out:
+    return -op_errno;
+}
+
+/*
+ * Fill a gfs3 setattr request from @loc.
+ *
+ * The gfid comes from loc->inode->gfid when non-null, else loc->gfid. The
+ * @valid mask is copied and @stbuf is converted to its wire form.
+ *
+ * A missing loc or inode fails with ESTALE, a null gfid with EINVAL.
+ *
+ * Returns 0 on success, -op_errno otherwise.
+ */
+int
+client_pre_setattr(xlator_t *this, gfs3_setattr_req *req, loc_t *loc,
+                   int32_t valid, struct iatt *stbuf, dict_t *xdata)
+{
+    int op_errno = ESTALE;
+    const void *gfid_src;
+
+    if (!loc || !loc->inode)
+        goto out;
+
+    gfid_src = loc->gfid;
+    if (!gf_uuid_is_null(loc->inode->gfid))
+        gfid_src = loc->inode->gfid;
+    memcpy(req->gfid, gfid_src, 16);
+
+    GF_ASSERT_AND_GOTO_WITH_ERROR(this->name,
+                                  !gf_uuid_is_null(*((uuid_t *)req->gfid)), out,
+                                  op_errno, EINVAL);
+
+    req->valid = valid;
+    gf_stat_from_iatt(&req->stbuf, stbuf);
+
+    GF_PROTOCOL_DICT_SERIALIZE(this, xdata, (&req->xdata.xdata_val),
+                               req->xdata.xdata_len, op_errno, out);
+
+    return 0;
+
+out:
+    return -op_errno;
+}
+
+/*
+ * Fill a gfs3 fsetattr request for an open @fd: remote fd, @valid mask, the
+ * wire form of @stbuf and serialized @xdata. No gfid is written by this
+ * helper.
+ *
+ * Returns 0 on success, -op_errno when a macro jumps to 'out'.
+ */
+int
+client_pre_fsetattr(xlator_t *this, gfs3_fsetattr_req *req, fd_t *fd,
+                    int32_t valid, struct iatt *stbuf, dict_t *xdata)
+{
+    int op_errno = ESTALE;
+    int64_t remote_fd = -1;
+
+    CLIENT_GET_REMOTE_FD(this, fd, DEFAULT_REMOTE_FD, remote_fd, op_errno, out);
+
+    req->valid = valid;
+    req->fd = remote_fd;
+    gf_stat_from_iatt(&req->stbuf, stbuf);
+
+    GF_PROTOCOL_DICT_SERIALIZE(this, xdata, (&req->xdata.xdata_val),
+                               req->xdata.xdata_len, op_errno, out);
+
+    return 0;
+
+out:
+    return -op_errno;
+}
+
+/*
+ * Fill a gfs3 readdirp request: remote fd, requested @size and @offset and
+ * inode gfid. Here @xdata is serialized into req->dict rather than a
+ * req->xdata field.
+ *
+ * fd->inode is dereferenced without a NULL check.
+ *
+ * Returns 0 on success, -op_errno when a macro jumps to 'out'.
+ */
+int
+client_pre_readdirp(xlator_t *this, gfs3_readdirp_req *req, fd_t *fd,
+                    size_t size, off_t offset, dict_t *xdata)
+{
+    int op_errno = ESTALE;
+    int64_t remote_fd = -1;
+
+    CLIENT_GET_REMOTE_FD(this, fd, DEFAULT_REMOTE_FD, remote_fd, op_errno, out);
+
+    memcpy(req->gfid, fd->inode->gfid, 16);
+    req->fd = remote_fd;
+    req->offset = offset;
+    req->size = size;
+
+    /* dict itself is 'xdata' here */
+    GF_PROTOCOL_DICT_SERIALIZE(this, xdata, (&req->dict.dict_val),
+                               req->dict.dict_len, op_errno, out);
+
+    return 0;
+
+out:
+    return -op_errno;
+}
+
+/*
+ * Fill a gfs3 fremovexattr request for an open @fd: inode gfid, attribute
+ * @name, remote fd and serialized @xdata.
+ *
+ * A NULL @fd or fd->inode fails with ESTALE before the remote fd is looked
+ * up.
+ *
+ * Returns 0 on success, -op_errno otherwise.
+ */
+int
+client_pre_fremovexattr(xlator_t *this, gfs3_fremovexattr_req *req, fd_t *fd,
+                        const char *name, dict_t *xdata)
+{
+    int64_t remote_fd = -1;
+    int op_errno = ESTALE;
+
+    if (!fd || !fd->inode)
+        goto out;
+
+    CLIENT_GET_REMOTE_FD(this, fd, DEFAULT_REMOTE_FD, remote_fd, op_errno, out);
+
+    req->fd = remote_fd;
+    req->name = (char *)name;
+    memcpy(req->gfid, fd->inode->gfid, 16);
+
+    GF_PROTOCOL_DICT_SERIALIZE(this, xdata, (&req->xdata.xdata_val),
+                               req->xdata.xdata_len, op_errno, out);
+
+    return 0;
+
+out:
+    return -op_errno;
+}
+
+/*
+ * Fill a gfs3 fallocate request: remote fd, @flags, @offset, @size, inode
+ * gfid and serialized @xdata.
+ *
+ * fd->inode is dereferenced without a NULL check.
+ *
+ * Returns 0 on success, -op_errno when a macro jumps to 'out'.
+ */
+int
+client_pre_fallocate(xlator_t *this, gfs3_fallocate_req *req, fd_t *fd,
+                     int32_t flags, off_t offset, size_t size, dict_t *xdata)
+{
+    int op_errno = ESTALE;
+    int64_t remote_fd = -1;
+
+    CLIENT_GET_REMOTE_FD(this, fd, DEFAULT_REMOTE_FD, remote_fd, op_errno, out);
+
+    memcpy(req->gfid, fd->inode->gfid, 16);
+    req->size = size;
+    req->offset = offset;
+    req->flags = flags;
+    req->fd = remote_fd;
+
+    GF_PROTOCOL_DICT_SERIALIZE(this, xdata, (&req->xdata.xdata_val),
+                               req->xdata.xdata_len, op_errno, out);
+
+    return 0;
+
+out:
+    return -op_errno;
+}
+
+/*
+ * Fill a gfs3 discard request: remote fd, @offset, @size, inode gfid and
+ * serialized @xdata.
+ *
+ * fd->inode is dereferenced without a NULL check.
+ *
+ * Returns 0 on success, -op_errno when a macro jumps to 'out'.
+ */
+int
+client_pre_discard(xlator_t *this, gfs3_discard_req *req, fd_t *fd,
+                   off_t offset, size_t size, dict_t *xdata)
+{
+    int op_errno = ESTALE;
+    int64_t remote_fd = -1;
+
+    CLIENT_GET_REMOTE_FD(this, fd, DEFAULT_REMOTE_FD, remote_fd, op_errno, out);
+
+    memcpy(req->gfid, fd->inode->gfid, 16);
+    req->size = size;
+    req->offset = offset;
+    req->fd = remote_fd;
+
+    GF_PROTOCOL_DICT_SERIALIZE(this, xdata, (&req->xdata.xdata_val),
+                               req->xdata.xdata_len, op_errno, out);
+
+    return 0;
+
+out:
+    return -op_errno;
+}
+
+/*
+ * Fill a gfs3 zerofill request: remote fd, @offset, @size, inode gfid and
+ * serialized @xdata.
+ *
+ * fd->inode is dereferenced without a NULL check.
+ *
+ * Returns 0 on success, -op_errno when a macro jumps to 'out'.
+ */
+int
+client_pre_zerofill(xlator_t *this, gfs3_zerofill_req *req, fd_t *fd,
+                    off_t offset, size_t size, dict_t *xdata)
+{
+    int op_errno = ESTALE;
+    int64_t remote_fd = -1;
+
+    CLIENT_GET_REMOTE_FD(this, fd, DEFAULT_REMOTE_FD, remote_fd, op_errno, out);
+
+    memcpy(req->gfid, fd->inode->gfid, 16);
+    req->size = size;
+    req->offset = offset;
+    req->fd = remote_fd;
+
+    GF_PROTOCOL_DICT_SERIALIZE(this, xdata, (&req->xdata.xdata_val),
+                               req->xdata.xdata_len, op_errno, out);
+
+    return 0;
+
+out:
+    return -op_errno;
+}
+
+/*
+ * Fill a gfs3 ipc request: the operation code @cmd and serialized @xdata.
+ *
+ * Returns 0 on success, -op_errno when the serialize macro jumps to 'out'.
+ */
+int
+client_pre_ipc(xlator_t *this, gfs3_ipc_req *req, int32_t cmd, dict_t *xdata)
+{
+    int op_errno = ESTALE;
+
+    req->op = cmd;
+
+    GF_PROTOCOL_DICT_SERIALIZE(this, xdata, (&req->xdata.xdata_val),
+                               req->xdata.xdata_len, op_errno, out);
+
+    return 0;
+
+out:
+    return -op_errno;
+}
+
+/*
+ * Fill a gfs3 seek request: inode gfid, remote fd, @offset, the seek kind
+ * @what and serialized @xdata.
+ *
+ * fd->inode is dereferenced without a NULL check.
+ *
+ * Returns 0 on success, -op_errno when a macro jumps to 'out'.
+ */
+int
+client_pre_seek(xlator_t *this, gfs3_seek_req *req, fd_t *fd, off_t offset,
+                gf_seek_what_t what, dict_t *xdata)
+{
+    int64_t remote_fd = -1;
+    int op_errno = ESTALE;
+
+    CLIENT_GET_REMOTE_FD(this, fd, DEFAULT_REMOTE_FD, remote_fd, op_errno, out);
+
+    req->fd = remote_fd;
+    req->offset = offset;
+    req->what = what;
+    memcpy(req->gfid, fd->inode->gfid, 16);
+
+    GF_PROTOCOL_DICT_SERIALIZE(this, xdata, (&req->xdata.xdata_val),
+                               req->xdata.xdata_len, op_errno, out);
+
+    return 0;
+
+out:
+    return -op_errno;
+}
+
+/*
+ * Fill a gfs3 lease request from @loc.
+ *
+ * The gfid comes from loc->inode->gfid when non-null, else loc->gfid. @lease
+ * is converted to its wire form and @xdata is serialized.
+ *
+ * A missing loc or inode fails with ESTALE, a null gfid with EINVAL.
+ *
+ * Returns 0 on success, -op_errno otherwise.
+ */
+int
+client_pre_lease(xlator_t *this, gfs3_lease_req *req, loc_t *loc,
+                 struct gf_lease *lease, dict_t *xdata)
+{
+    /* Must not start at 0: the guard below jumps straight to 'out', and a
+     * zero op_errno would report success for a request that was never
+     * filled in. ESTALE matches the sibling loc-based helpers. */
+    int op_errno = ESTALE;
+
+    if (!(loc && loc->inode))
+        goto out;
+
+    if (!gf_uuid_is_null(loc->inode->gfid))
+        memcpy(req->gfid, loc->inode->gfid, 16);
+    else
+        memcpy(req->gfid, loc->gfid, 16);
+
+    GF_ASSERT_AND_GOTO_WITH_ERROR(this->name,
+                                  !gf_uuid_is_null(*((uuid_t *)req->gfid)), out,
+                                  op_errno, EINVAL);
+
+    gf_proto_lease_from_lease(&req->lease, lease);
+
+    GF_PROTOCOL_DICT_SERIALIZE(this, xdata, (&req->xdata.xdata_val),
+                               req->xdata.xdata_len, op_errno, out);
+
+    /* Explicit success return, now that op_errno no longer defaults to 0. */
+    return 0;
+out:
+    return -op_errno;
+}
+
+/* processing done after fop responses are obtained */
+/*
+ * Unpack a stat reply: convert rsp->stat into @iatt unless the call failed
+ * (op_ret == -1), then unserialize the reply xdata into *@xdata.
+ *
+ * Returns 'ret', which starts at 0 and is handed to the unserialize macro.
+ */
+int
+client_post_stat(xlator_t *this, gfs3_stat_rsp *rsp, struct iatt *iatt,
+                 dict_t **xdata)
+{
+    int ret = 0;
+
+    if (rsp->op_ret != -1) {
+        gf_stat_to_iatt(&rsp->stat, iatt);
+    }
+
+    GF_PROTOCOL_DICT_UNSERIALIZE(this, *xdata, (rsp->xdata.xdata_val),
+                                 (rsp->xdata.xdata_len), ret, rsp->op_errno,
+                                 out);
+
+out:
+    return ret;
+}
+
+/*
+ * Unpack a readlink reply: convert rsp->buf into @iatt unless the call
+ * failed (op_ret == -1), then unserialize the reply xdata into *@xdata.
+ *
+ * Returns 'ret', which starts at 0 and is handed to the unserialize macro.
+ */
+int
+client_post_readlink(xlator_t *this, gfs3_readlink_rsp *rsp, struct iatt *iatt,
+                     dict_t **xdata)
+{
+    int ret = 0;
+
+    if (rsp->op_ret != -1) {
+        gf_stat_to_iatt(&rsp->buf, iatt);
+    }
+
+    GF_PROTOCOL_DICT_UNSERIALIZE(this, *xdata, (rsp->xdata.xdata_val),
+                                 (rsp->xdata.xdata_len), ret, rsp->op_errno,
+                                 out);
+
+out:
+    return ret;
+}
+
+/*
+ * Unpack a mknod reply: unless the call failed (op_ret == -1), convert the
+ * new entry's stat and the parent's pre/post stats; then unserialize the
+ * reply xdata into *@xdata.
+ *
+ * Returns 'ret', which starts at 0 and is handed to the unserialize macro.
+ */
+int
+client_post_mknod(xlator_t *this, gfs3_mknod_rsp *rsp, struct iatt *stbuf,
+                  struct iatt *preparent, struct iatt *postparent,
+                  dict_t **xdata)
+{
+    int ret = 0;
+
+    if (rsp->op_ret != -1) {
+        gf_stat_to_iatt(&rsp->stat, stbuf);
+        gf_stat_to_iatt(&rsp->preparent, preparent);
+        gf_stat_to_iatt(&rsp->postparent, postparent);
+    }
+
+    GF_PROTOCOL_DICT_UNSERIALIZE(this, *xdata, (rsp->xdata.xdata_val),
+                                 (rsp->xdata.xdata_len), ret, rsp->op_errno,
+                                 out);
+
+out:
+    return ret;
+}
+
+/*
+ * Unpack a mkdir reply: unless the call failed (op_ret == -1), convert the
+ * new directory's stat and the parent's pre/post stats; then unserialize the
+ * reply xdata into *@xdata.
+ *
+ * Returns 'ret', which starts at 0 and is handed to the unserialize macro.
+ */
+int
+client_post_mkdir(xlator_t *this, gfs3_mkdir_rsp *rsp, struct iatt *stbuf,
+                  struct iatt *preparent, struct iatt *postparent,
+                  dict_t **xdata)
+{
+    int ret = 0;
+
+    if (rsp->op_ret != -1) {
+        gf_stat_to_iatt(&rsp->stat, stbuf);
+        gf_stat_to_iatt(&rsp->preparent, preparent);
+        gf_stat_to_iatt(&rsp->postparent, postparent);
+    }
+
+    GF_PROTOCOL_DICT_UNSERIALIZE(this, *xdata, (rsp->xdata.xdata_val),
+                                 (rsp->xdata.xdata_len), ret, rsp->op_errno,
+                                 out);
+
+out:
+    return ret;
+}
+
+/*
+ * Unpack an unlink reply: unless the call failed (op_ret == -1), convert the
+ * parent's pre/post stats; unserialize the reply xdata into *@xdata; then
+ * run gf_replace_new_iatt_in_dict() on the resulting dict.
+ *
+ * Returns the result of gf_replace_new_iatt_in_dict(), or whatever 'ret'
+ * holds if the unserialize macro jumps to 'out' first.
+ */
+int
+client_post_unlink(xlator_t *this, gfs3_unlink_rsp *rsp, struct iatt *preparent,
+                   struct iatt *postparent, dict_t **xdata)
+{
+    int ret = 0;
+
+    if (rsp->op_ret != -1) {
+        gf_stat_to_iatt(&rsp->preparent, preparent);
+        gf_stat_to_iatt(&rsp->postparent, postparent);
+    }
+
+    GF_PROTOCOL_DICT_UNSERIALIZE(this, *xdata, (rsp->xdata.xdata_val),
+                                 (rsp->xdata.xdata_len), ret, rsp->op_errno,
+                                 out);
+
+    ret = gf_replace_new_iatt_in_dict(*xdata);
+
+out:
+    return ret;
+}
+
+/*
+ * Unpack a rmdir reply: unless the call failed (op_ret == -1), convert the
+ * parent's pre/post stats; then unserialize the reply xdata into *@xdata.
+ *
+ * Returns 'ret', which starts at 0 and is handed to the unserialize macro.
+ */
+int
+client_post_rmdir(xlator_t *this, gfs3_rmdir_rsp *rsp, struct iatt *preparent,
+                  struct iatt *postparent, dict_t **xdata)
+{
+    int ret = 0;
+
+    if (rsp->op_ret != -1) {
+        gf_stat_to_iatt(&rsp->preparent, preparent);
+        gf_stat_to_iatt(&rsp->postparent, postparent);
+    }
+
+    GF_PROTOCOL_DICT_UNSERIALIZE(this, *xdata, (rsp->xdata.xdata_val),
+                                 (rsp->xdata.xdata_len), ret, rsp->op_errno,
+                                 out);
+
+out:
+    return ret;
+}
+
+/*
+ * Unpack a symlink reply: unless the call failed (op_ret == -1), convert the
+ * new link's stat and the parent's pre/post stats; then unserialize the
+ * reply xdata into *@xdata.
+ *
+ * Returns 'ret', which starts at 0 and is handed to the unserialize macro.
+ */
+int
+client_post_symlink(xlator_t *this, gfs3_symlink_rsp *rsp, struct iatt *stbuf,
+                    struct iatt *preparent, struct iatt *postparent,
+                    dict_t **xdata)
+{
+    int ret = 0;
+
+    if (rsp->op_ret != -1) {
+        gf_stat_to_iatt(&rsp->stat, stbuf);
+        gf_stat_to_iatt(&rsp->preparent, preparent);
+        gf_stat_to_iatt(&rsp->postparent, postparent);
+    }
+
+    GF_PROTOCOL_DICT_UNSERIALIZE(this, *xdata, (rsp->xdata.xdata_val),
+                                 (rsp->xdata.xdata_len), ret, rsp->op_errno,
+                                 out);
+
+out:
+    return ret;
+}
+
+/*
+ * Unpack a rename reply: unless the call failed (op_ret == -1), convert the
+ * renamed entry's stat plus the pre/post stats of both the old and the new
+ * parent; then unserialize the reply xdata into *@xdata.
+ *
+ * Returns 'ret', which starts at 0 and is handed to the unserialize macro.
+ */
+int
+client_post_rename(xlator_t *this, gfs3_rename_rsp *rsp, struct iatt *stbuf,
+                   struct iatt *preoldparent, struct iatt *postoldparent,
+                   struct iatt *prenewparent, struct iatt *postnewparent,
+                   dict_t **xdata)
+{
+    int ret = 0;
+
+    if (rsp->op_ret != -1) {
+        gf_stat_to_iatt(&rsp->stat, stbuf);
+
+        /* old parent directory */
+        gf_stat_to_iatt(&rsp->preoldparent, preoldparent);
+        gf_stat_to_iatt(&rsp->postoldparent, postoldparent);
+
+        /* new parent directory */
+        gf_stat_to_iatt(&rsp->prenewparent, prenewparent);
+        gf_stat_to_iatt(&rsp->postnewparent, postnewparent);
+    }
+
+    GF_PROTOCOL_DICT_UNSERIALIZE(this, *xdata, (rsp->xdata.xdata_val),
+                                 (rsp->xdata.xdata_len), ret, rsp->op_errno,
+                                 out);
+
+out:
+    return ret;
+}
+
+/*
+ * Unpack a link reply: unless the call failed (op_ret == -1), convert the
+ * entry's stat and the parent's pre/post stats; then unserialize the reply
+ * xdata into *@xdata.
+ *
+ * Returns 'ret', which starts at 0 and is handed to the unserialize macro.
+ */
+int
+client_post_link(xlator_t *this, gfs3_link_rsp *rsp, struct iatt *stbuf,
+                 struct iatt *preparent, struct iatt *postparent,
+                 dict_t **xdata)
+{
+    int ret = 0;
+
+    if (rsp->op_ret != -1) {
+        gf_stat_to_iatt(&rsp->stat, stbuf);
+        gf_stat_to_iatt(&rsp->preparent, preparent);
+        gf_stat_to_iatt(&rsp->postparent, postparent);
+    }
+
+    GF_PROTOCOL_DICT_UNSERIALIZE(this, *xdata, (rsp->xdata.xdata_val),
+                                 (rsp->xdata.xdata_len), ret, rsp->op_errno,
+                                 out);
+
+out:
+    return ret;
+}
+
+/*
+ * Unpack a truncate reply: unless the call failed (op_ret == -1), convert
+ * the pre- and post-operation stats; then unserialize the reply xdata into
+ * *@xdata.
+ *
+ * Returns 'ret', which starts at 0 and is handed to the unserialize macro.
+ */
+int
+client_post_truncate(xlator_t *this, gfs3_truncate_rsp *rsp,
+                     struct iatt *prestat, struct iatt *poststat,
+                     dict_t **xdata)
+{
+    int ret = 0;
+
+    if (rsp->op_ret != -1) {
+        gf_stat_to_iatt(&rsp->prestat, prestat);
+        gf_stat_to_iatt(&rsp->poststat, poststat);
+    }
+
+    GF_PROTOCOL_DICT_UNSERIALIZE(this, *xdata, (rsp->xdata.xdata_val),
+                                 (rsp->xdata.xdata_len), ret, rsp->op_errno,
+                                 out);
+
+out:
+    return ret;
+}
+
+/*
+ * Unpack an open reply: only the reply xdata is unserialized into *@xdata.
+ *
+ * Returns 'ret', which starts at 0 and is handed to the unserialize macro.
+ */
+int
+client_post_open(xlator_t *this, gfs3_open_rsp *rsp, dict_t **xdata)
+{
+    int ret = 0;
+
+    GF_PROTOCOL_DICT_UNSERIALIZE(this, *xdata, (rsp->xdata.xdata_val),
+                                 (rsp->xdata.xdata_len), ret, rsp->op_errno,
+                                 out);
+
+out:
+    return ret;
+}
+
+/*
+ * Unpack a readv reply.
+ *
+ * Unless the call failed (op_ret == -1): hand @rsp_iobref back through
+ * *@iobref, convert the stat, describe the payload in vector[0] (length
+ * op_ret; the base pointer is taken from @rsp_vector only when op_ret > 0)
+ * and set *@rspcount to 1. The reply xdata is then unserialized into
+ * *@xdata.
+ *
+ * Returns 'ret', which starts at 0 and is handed to the unserialize macro.
+ */
+int
+client_post_readv(xlator_t *this, gfs3_read_rsp *rsp, struct iobref **iobref,
+                  struct iobref *rsp_iobref, struct iatt *stat,
+                  struct iovec *vector, struct iovec *rsp_vector, int *rspcount,
+                  dict_t **xdata)
+{
+    int ret = 0;
+
+    if (rsp->op_ret != -1) {
+        *iobref = rsp_iobref;
+        gf_stat_to_iatt(&rsp->stat, stat);
+
+        vector[0].iov_len = rsp->op_ret;
+        if (rsp->op_ret > 0)
+            vector[0].iov_base = rsp_vector->iov_base;
+        *rspcount = 1;
+    }
+    GF_PROTOCOL_DICT_UNSERIALIZE(this, *xdata, (rsp->xdata.xdata_val),
+                                 (rsp->xdata.xdata_len), ret, rsp->op_errno,
+                                 out);
+
+#ifdef GF_TESTING_IO_XDATA
+    /* Test-only dump. xdata is a dict_t **, so pass the dict itself, the
+     * same way every other use in this function dereferences it. */
+    dict_dump_to_log(*xdata);
+#endif
+out:
+    return ret;
+}
+
+/*
+ * Post-process a writev reply. Unless rsp->op_ret is -1, rsp->prestat and
+ * rsp->poststat are converted into prestat/poststat with gf_stat_to_iatt();
+ * rsp->xdata is then handed to GF_PROTOCOL_DICT_UNSERIALIZE to fill *xdata.
+ *
+ * Returns ret, which starts at 0 and is only written by the macro.
+ * NOTE(review): the macro is defined outside this view; it receives ret,
+ * rsp->op_errno and the 'out' label, presumably to report decode failures.
+ */
+int
+client_post_writev(xlator_t *this, gfs3_write_rsp *rsp, struct iatt *prestat,
+                   struct iatt *poststat, dict_t **xdata)
+{
+    int ret = 0;
+
+    if (-1 != rsp->op_ret) {
+        gf_stat_to_iatt(&rsp->prestat, prestat);
+        gf_stat_to_iatt(&rsp->poststat, poststat);
+    }
+
+    GF_PROTOCOL_DICT_UNSERIALIZE(this, *xdata, (rsp->xdata.xdata_val),
+                                 (rsp->xdata.xdata_len), ret, rsp->op_errno,
+                                 out);
+
+out:
+    return ret;
+}
+
+/*
+ * Post-process a statfs reply. Unless rsp->op_ret is -1, rsp->statfs is
+ * converted into statfs with gf_statfs_to_statfs(); rsp->xdata is then
+ * handed to GF_PROTOCOL_DICT_UNSERIALIZE to fill *xdata.
+ *
+ * Returns ret, which starts at 0 and is only written by the macro.
+ * NOTE(review): the macro is defined outside this view; it receives ret,
+ * rsp->op_errno and the 'out' label, presumably to report decode failures.
+ */
+int
+client_post_statfs(xlator_t *this, gfs3_statfs_rsp *rsp, struct statvfs *statfs,
+                   dict_t **xdata)
+{
+    int ret = 0;
+
+    if (-1 != rsp->op_ret) {
+        gf_statfs_to_statfs(&rsp->statfs, statfs);
+    }
+    GF_PROTOCOL_DICT_UNSERIALIZE(this, *xdata, (rsp->xdata.xdata_val),
+                                 (rsp->xdata.xdata_len), ret, rsp->op_errno,
+                                 out);
+out:
+    return ret;
+}
+
+/*
+ * Post-process a flush reply: the only work is handing rsp->xdata to
+ * GF_PROTOCOL_DICT_UNSERIALIZE to fill *xdata.
+ *
+ * Returns ret, which starts at 0 and is only written by the macro.
+ * NOTE(review): the macro is defined outside this view; it receives ret,
+ * rsp->op_errno and the 'out' label, presumably to report decode failures.
+ */
+int
+client_post_flush(xlator_t *this, gf_common_rsp *rsp, dict_t **xdata)
+{
+    int ret = 0;
+
+    GF_PROTOCOL_DICT_UNSERIALIZE(this, *xdata, (rsp->xdata.xdata_val),
+                                 (rsp->xdata.xdata_len), ret, rsp->op_errno,
+                                 out);
+
+out:
+    return ret;
+}
+
+/*
+ * Post-process an fsync reply. Unless rsp->op_ret is -1, rsp->prestat and
+ * rsp->poststat are converted into prestat/poststat with gf_stat_to_iatt();
+ * rsp->xdata is then handed to GF_PROTOCOL_DICT_UNSERIALIZE to fill *xdata.
+ *
+ * Returns ret, which starts at 0 and is only written by the macro.
+ * NOTE(review): the macro is defined outside this view; it receives ret,
+ * rsp->op_errno and the 'out' label, presumably to report decode failures.
+ */
+int
+client_post_fsync(xlator_t *this, gfs3_fsync_rsp *rsp, struct iatt *prestat,
+                  struct iatt *poststat, dict_t **xdata)
+{
+    int ret = 0;
+
+    if (-1 != rsp->op_ret) {
+        gf_stat_to_iatt(&rsp->prestat, prestat);
+        gf_stat_to_iatt(&rsp->poststat, poststat);
+    }
+    GF_PROTOCOL_DICT_UNSERIALIZE(this, *xdata, (rsp->xdata.xdata_val),
+                                 (rsp->xdata.xdata_len), ret, rsp->op_errno,
+                                 out);
+out:
+    return ret;
+}
+
+/*
+ * Post-process a setxattr reply: rsp->xdata is handed to
+ * GF_PROTOCOL_DICT_UNSERIALIZE to fill *xdata, and on the straight-line path
+ * *xdata is then passed to gf_replace_new_iatt_in_dict().
+ *
+ * Returns the result of gf_replace_new_iatt_in_dict(), or whatever ret holds
+ * if the macro jumps to 'out' first.
+ * NOTE(review): the macro is defined outside this view; it receives ret,
+ * rsp->op_errno and the 'out' label, presumably to report decode failures.
+ * Whether *xdata can still be NULL at the helper call is not visible here.
+ */
+int
+client_post_setxattr(xlator_t *this, gf_common_rsp *rsp, dict_t **xdata)
+{
+    int ret = 0;
+
+    GF_PROTOCOL_DICT_UNSERIALIZE(this, *xdata, (rsp->xdata.xdata_val),
+                                 (rsp->xdata.xdata_len), ret, rsp->op_errno,
+                                 out);
+
+    ret = gf_replace_new_iatt_in_dict(*xdata);
+out:
+    return ret;
+}
+
+/*
+ * Post-process a getxattr reply. Unless rsp->op_ret is -1, rsp->dict is
+ * unserialized into *dict; note that rsp->op_ret itself is passed as the
+ * macro's result variable and the local op_errno as its error variable.
+ * rsp->xdata is then unserialized into *xdata.
+ *
+ * Returns -op_errno. op_errno starts at 0 and is only written by
+ * GF_PROTOCOL_DICT_UNSERIALIZE; the local ret is passed to the macro but is
+ * not part of the return value.
+ * NOTE(review): the macro is defined outside this view; presumably it sets
+ * the error variable and jumps to 'out' on a decode failure -- confirm.
+ */
+int
+client_post_getxattr(xlator_t *this, gfs3_getxattr_rsp *rsp, dict_t **dict,
+                     dict_t **xdata)
+{
+    int op_errno = 0;
+    int ret = 0;
+
+    if (-1 != rsp->op_ret) {
+        GF_PROTOCOL_DICT_UNSERIALIZE(this, *dict, (rsp->dict.dict_val),
+                                     (rsp->dict.dict_len), rsp->op_ret,
+                                     op_errno, out);
+    }
+    GF_PROTOCOL_DICT_UNSERIALIZE(this, *xdata, (rsp->xdata.xdata_val),
+                                 (rsp->xdata.xdata_len), ret, op_errno, out);
+
+out:
+    return -op_errno;
+}
+
+/*
+ * Post-process a removexattr reply: rsp->xdata is handed to
+ * GF_PROTOCOL_DICT_UNSERIALIZE to fill *xdata, and on the straight-line path
+ * *xdata is then passed to gf_replace_new_iatt_in_dict().
+ *
+ * Returns the result of gf_replace_new_iatt_in_dict(), or whatever ret holds
+ * if the macro jumps to 'out' first.
+ * NOTE(review): the macro is defined outside this view; it receives ret,
+ * rsp->op_errno and the 'out' label, presumably to report decode failures.
+ */
+int
+client_post_removexattr(xlator_t *this, gf_common_rsp *rsp, dict_t **xdata)
+{
+    int ret = 0;
+
+    GF_PROTOCOL_DICT_UNSERIALIZE(this, *xdata, (rsp->xdata.xdata_val),
+                                 (rsp->xdata.xdata_len), ret, rsp->op_errno,
+                                 out);
+
+    ret = gf_replace_new_iatt_in_dict(*xdata);
+out:
+    return ret;
+}
+
+/*
+ * Post-process an opendir reply: the only work is handing rsp->xdata to
+ * GF_PROTOCOL_DICT_UNSERIALIZE to fill *xdata.
+ *
+ * Returns ret, which starts at 0 and is only written by the macro.
+ * NOTE(review): the macro is defined outside this view; it receives ret,
+ * rsp->op_errno and the 'out' label, presumably to report decode failures.
+ */
+int
+client_post_opendir(xlator_t *this, gfs3_opendir_rsp *rsp, dict_t **xdata)
+{
+    int ret = 0;
+
+    GF_PROTOCOL_DICT_UNSERIALIZE(this, *xdata, (rsp->xdata.xdata_val),
+                                 (rsp->xdata.xdata_len), ret, rsp->op_errno,
+                                 out);
+out:
+    return ret;
+}
+
+/*
+ * Post-process an fsyncdir reply: the only work is handing rsp->xdata to
+ * GF_PROTOCOL_DICT_UNSERIALIZE to fill *xdata.
+ *
+ * Returns ret, which starts at 0 and is only written by the macro.
+ * NOTE(review): the macro is defined outside this view; it receives ret,
+ * rsp->op_errno and the 'out' label, presumably to report decode failures.
+ */
+int
+client_post_fsyncdir(xlator_t *this, gf_common_rsp *rsp, dict_t **xdata)
+{
+    int ret = 0;
+
+    GF_PROTOCOL_DICT_UNSERIALIZE(this, *xdata, (rsp->xdata.xdata_val),
+                                 (rsp->xdata.xdata_len), ret, rsp->op_errno,
+                                 out);
+out:
+    return ret;
+}
+
+/*
+ * Post-process an access reply: the only work is handing rsp->xdata to
+ * GF_PROTOCOL_DICT_UNSERIALIZE to fill *xdata.
+ *
+ * Returns ret, which starts at 0 and is only written by the macro.
+ * NOTE(review): the macro is defined outside this view; it receives ret,
+ * rsp->op_errno and the 'out' label, presumably to report decode failures.
+ */
+int
+client_post_access(xlator_t *this, gf_common_rsp *rsp, dict_t **xdata)
+{
+    int ret = 0;
+
+    GF_PROTOCOL_DICT_UNSERIALIZE(this, *xdata, (rsp->xdata.xdata_val),
+                                 (rsp->xdata.xdata_len), ret, rsp->op_errno,
+                                 out);
+out:
+    return ret;
+}
+
+/*
+ * Post-process a create reply. Unless rsp->op_ret is -1, rsp->stat,
+ * rsp->preparent and rsp->postparent are converted with gf_stat_to_iatt()
+ * into stbuf, preparent and postparent, and stbuf->ia_gfid is copied into
+ * local->loc.gfid. rsp->xdata is then handed to
+ * GF_PROTOCOL_DICT_UNSERIALIZE to fill *xdata.
+ *
+ * local is dereferenced without a NULL check on the success path.
+ *
+ * Returns ret, which starts at 0 and is only written by the macro.
+ * NOTE(review): the macro is defined outside this view; it receives ret,
+ * rsp->op_errno and the 'out' label, presumably to report decode failures.
+ */
+int
+client_post_create(xlator_t *this, gfs3_create_rsp *rsp, struct iatt *stbuf,
+                   struct iatt *preparent, struct iatt *postparent,
+                   clnt_local_t *local, dict_t **xdata)
+{
+    int ret = 0;
+
+    if (-1 != rsp->op_ret) {
+        gf_stat_to_iatt(&rsp->stat, stbuf);
+
+        gf_stat_to_iatt(&rsp->preparent, preparent);
+        gf_stat_to_iatt(&rsp->postparent, postparent);
+        /* Must follow the stbuf conversion: the gfid is read from stbuf. */
+        gf_uuid_copy(local->loc.gfid, stbuf->ia_gfid);
+    }
+    GF_PROTOCOL_DICT_UNSERIALIZE(this, *xdata, (rsp->xdata.xdata_val),
+                                 (rsp->xdata.xdata_len), ret, rsp->op_errno,
+                                 out);
+out:
+    return ret;
+}
+
+/*
+ * Post-process an ftruncate reply. Unless rsp->op_ret is -1, rsp->prestat
+ * and rsp->poststat are converted into prestat/poststat with
+ * gf_stat_to_iatt(); rsp->xdata is then handed to
+ * GF_PROTOCOL_DICT_UNSERIALIZE to fill *xdata.
+ *
+ * Returns ret, which starts at 0 and is only written by the macro.
+ * NOTE(review): the macro is defined outside this view; it receives ret,
+ * rsp->op_errno and the 'out' label, presumably to report decode failures.
+ */
+int
+client_post_ftruncate(xlator_t *this, gfs3_ftruncate_rsp *rsp,
+                      struct iatt *prestat, struct iatt *poststat,
+                      dict_t **xdata)
+{
+    int ret = 0;
+
+    if (-1 != rsp->op_ret) {
+        gf_stat_to_iatt(&rsp->prestat, prestat);
+        gf_stat_to_iatt(&rsp->poststat, poststat);
+    }
+    GF_PROTOCOL_DICT_UNSERIALIZE(this, *xdata, (rsp->xdata.xdata_val),
+                                 (rsp->xdata.xdata_len), ret, rsp->op_errno,
+                                 out);
+out:
+    return ret;
+}
+
+/*
+ * Post-process an fstat reply. Unless rsp->op_ret is -1, rsp->stat is
+ * converted into stat with gf_stat_to_iatt(); rsp->xdata is then handed to
+ * GF_PROTOCOL_DICT_UNSERIALIZE to fill *xdata.
+ *
+ * Returns ret unchanged, matching the other post helpers that use the same
+ * macro (the value used to be negated here, which flipped the sign of any
+ * failure code the macro stored in ret).
+ * NOTE(review): the macro is defined outside this view; it receives ret,
+ * rsp->op_errno and the 'out' label, presumably to report decode failures.
+ */
+int
+client_post_fstat(xlator_t *this, gfs3_fstat_rsp *rsp, struct iatt *stat,
+                  dict_t **xdata)
+{
+    int ret = 0;
+
+    if (-1 != rsp->op_ret) {
+        gf_stat_to_iatt(&rsp->stat, stat);
+    }
+    GF_PROTOCOL_DICT_UNSERIALIZE(this, *xdata, (rsp->xdata.xdata_val),
+                                 (rsp->xdata.xdata_len), ret, rsp->op_errno,
+                                 out);
+out:
+    return ret;
+}
+
+/*
+ * Post-process an lk reply. When rsp->op_ret is >= 0, rsp->flock is
+ * converted into lock with gf_proto_flock_to_flock(); rsp->xdata is then
+ * handed to GF_PROTOCOL_DICT_UNSERIALIZE to fill *xdata.
+ *
+ * Returns ret, which starts at 0 and is only written by the macro.
+ * NOTE(review): the macro is defined outside this view; it receives ret,
+ * rsp->op_errno and the 'out' label, presumably to report decode failures.
+ */
+int
+client_post_lk(xlator_t *this, gfs3_lk_rsp *rsp, struct gf_flock *lock,
+               dict_t **xdata)
+{
+    int ret = 0;
+
+    if (rsp->op_ret >= 0) {
+        gf_proto_flock_to_flock(&rsp->flock, lock);
+    }
+    GF_PROTOCOL_DICT_UNSERIALIZE(this, *xdata, (rsp->xdata.xdata_val),
+                                 (rsp->xdata.xdata_len), ret, rsp->op_errno,
+                                 out);
+out:
+    return ret;
+}
+
+/*
+ * Post-process a lookup reply. Unless rsp->op_ret is -1, rsp->postparent
+ * and rsp->stat are converted with gf_stat_to_iatt() into postparent and
+ * stbuf; rsp->xdata is then handed to GF_PROTOCOL_DICT_UNSERIALIZE to fill
+ * *xdata.
+ *
+ * Returns ret, which starts at 0 and is only written by the macro.
+ * NOTE(review): the macro is defined outside this view; it receives ret,
+ * rsp->op_errno and the 'out' label, presumably to report decode failures.
+ */
+int
+client_post_lookup(xlator_t *this, gfs3_lookup_rsp *rsp, struct iatt *stbuf,
+                   struct iatt *postparent, dict_t **xdata)
+{
+    int ret = 0;
+
+    if (-1 != rsp->op_ret) {
+        gf_stat_to_iatt(&rsp->postparent, postparent);
+        gf_stat_to_iatt(&rsp->stat, stbuf);
+    }
+
+    GF_PROTOCOL_DICT_UNSERIALIZE(this, *xdata, (rsp->xdata.xdata_val),
+                                 (rsp->xdata.xdata_len), ret, rsp->op_errno,
+                                 out);
+out:
+    return ret;
+}
+
+/*
+ * Post-process a readdir reply. When rsp->op_ret is positive the entries
+ * are decoded with unserialize_rsp_dirent() (its return value is not
+ * checked); rsp->xdata is then handed to GF_PROTOCOL_DICT_UNSERIALIZE to
+ * fill *xdata.
+ *
+ * Returns ret, which starts at 0 and is only written by the macro.
+ * NOTE(review): the macro is defined outside this view; it receives ret,
+ * rsp->op_errno and the 'out' label, presumably to report decode failures.
+ */
+int
+client_post_readdir(xlator_t *this, gfs3_readdir_rsp *rsp, gf_dirent_t *entries,
+                    dict_t **xdata)
+{
+    int ret = 0;
+
+    if (rsp->op_ret > 0) {
+        unserialize_rsp_dirent(this, rsp, entries);
+    }
+    GF_PROTOCOL_DICT_UNSERIALIZE(this, *xdata, (rsp->xdata.xdata_val),
+                                 (rsp->xdata.xdata_len), ret, rsp->op_errno,
+                                 out);
+
+out:
+    return ret;
+}
+
+/*
+ * Post-process an inodelk reply: the only work is handing rsp->xdata to
+ * GF_PROTOCOL_DICT_UNSERIALIZE to fill *xdata.
+ *
+ * Returns ret, which starts at 0 and is only written by the macro.
+ * NOTE(review): the macro is defined outside this view; it receives ret,
+ * rsp->op_errno and the 'out' label, presumably to report decode failures.
+ */
+int
+client_post_inodelk(xlator_t *this, gf_common_rsp *rsp, dict_t **xdata)
+{
+    int ret = 0;
+
+    GF_PROTOCOL_DICT_UNSERIALIZE(this, *xdata, (rsp->xdata.xdata_val),
+                                 (rsp->xdata.xdata_len), ret, rsp->op_errno,
+                                 out);
+
+out:
+    return ret;
+}
+
+/*
+ * Post-process a finodelk reply: the only work is handing rsp->xdata to
+ * GF_PROTOCOL_DICT_UNSERIALIZE to fill *xdata.
+ *
+ * Returns ret, which starts at 0 and is only written by the macro.
+ * NOTE(review): the macro is defined outside this view; it receives ret,
+ * rsp->op_errno and the 'out' label, presumably to report decode failures.
+ */
+int
+client_post_finodelk(xlator_t *this, gf_common_rsp *rsp, dict_t **xdata)
+{
+    int ret = 0;
+
+    GF_PROTOCOL_DICT_UNSERIALIZE(this, *xdata, (rsp->xdata.xdata_val),
+                                 (rsp->xdata.xdata_len), ret, rsp->op_errno,
+                                 out);
+
+out:
+    return ret;
+}
+
+/*
+ * Post-process an entrylk reply: the only work is handing rsp->xdata to
+ * GF_PROTOCOL_DICT_UNSERIALIZE to fill *xdata.
+ *
+ * Returns ret, which starts at 0 and is only written by the macro.
+ * NOTE(review): the macro is defined outside this view; it receives ret,
+ * rsp->op_errno and the 'out' label, presumably to report decode failures.
+ */
+int
+client_post_entrylk(xlator_t *this, gf_common_rsp *rsp, dict_t **xdata)
+{
+    int ret = 0;
+
+    GF_PROTOCOL_DICT_UNSERIALIZE(this, *xdata, (rsp->xdata.xdata_val),
+                                 (rsp->xdata.xdata_len), ret, rsp->op_errno,
+                                 out);
+
+out:
+    return ret;
+}
+
+/*
+ * Post-process a fentrylk reply: the only work is handing rsp->xdata to
+ * GF_PROTOCOL_DICT_UNSERIALIZE to fill *xdata.
+ *
+ * Returns ret, which starts at 0 and is only written by the macro.
+ * NOTE(review): the macro is defined outside this view; it receives ret,
+ * rsp->op_errno and the 'out' label, presumably to report decode failures.
+ */
+int
+client_post_fentrylk(xlator_t *this, gf_common_rsp *rsp, dict_t **xdata)
+{
+    int ret = 0;
+
+    GF_PROTOCOL_DICT_UNSERIALIZE(this, *xdata, (rsp->xdata.xdata_val),
+                                 (rsp->xdata.xdata_len), ret, rsp->op_errno,
+                                 out);
+
+out:
+    return ret;
+}
+
+/*
+ * Post-process an xattrop reply. Unless rsp->op_ret is -1, rsp->dict is
+ * unserialized into *dict; note that rsp->op_ret itself is passed as the
+ * macro's result variable and the local op_errno as its error variable.
+ * rsp->xdata is then unserialized into *xdata.
+ *
+ * Returns -op_errno. op_errno starts at 0 and is only written by
+ * GF_PROTOCOL_DICT_UNSERIALIZE; the local ret is passed to the macro but is
+ * not part of the return value.
+ * NOTE(review): the macro is defined outside this view; presumably it sets
+ * the error variable and jumps to 'out' on a decode failure -- confirm.
+ */
+int
+client_post_xattrop(xlator_t *this, gfs3_xattrop_rsp *rsp, dict_t **dict,
+                    dict_t **xdata)
+{
+    int op_errno = 0;
+    int ret = 0;
+
+    if (-1 != rsp->op_ret) {
+        GF_PROTOCOL_DICT_UNSERIALIZE(this, *dict, (rsp->dict.dict_val),
+                                     (rsp->dict.dict_len), rsp->op_ret,
+                                     op_errno, out);
+    }
+    GF_PROTOCOL_DICT_UNSERIALIZE(this, *xdata, (rsp->xdata.xdata_val),
+                                 (rsp->xdata.xdata_len), ret, op_errno, out);
+
+out:
+    return -op_errno;
+}
+
+/*
+ * Post-process an fxattrop reply. Unless rsp->op_ret is -1, rsp->dict is
+ * unserialized into *dict; note that rsp->op_ret itself is passed as the
+ * macro's result variable and the local op_errno as its error variable.
+ * rsp->xdata is then unserialized into *xdata.
+ *
+ * Returns -op_errno. op_errno starts at 0 and is only written by
+ * GF_PROTOCOL_DICT_UNSERIALIZE; the local ret is passed to the macro but is
+ * not part of the return value.
+ * NOTE(review): the macro is defined outside this view; presumably it sets
+ * the error variable and jumps to 'out' on a decode failure -- confirm.
+ */
+int
+client_post_fxattrop(xlator_t *this, gfs3_fxattrop_rsp *rsp, dict_t **dict,
+                     dict_t **xdata)
+{
+    int op_errno = 0;
+    int ret = 0;
+
+    if (-1 != rsp->op_ret) {
+        GF_PROTOCOL_DICT_UNSERIALIZE(this, *dict, (rsp->dict.dict_val),
+                                     (rsp->dict.dict_len), rsp->op_ret,
+                                     op_errno, out);
+    }
+    GF_PROTOCOL_DICT_UNSERIALIZE(this, *xdata, (rsp->xdata.xdata_val),
+                                 (rsp->xdata.xdata_len), ret, op_errno, out);
+
+out:
+    return -op_errno;
+}
+
+/*
+ * Post-process an fgetxattr reply. Unless rsp->op_ret is -1, rsp->dict is
+ * unserialized into *dict; note that rsp->op_ret itself is passed as the
+ * macro's result variable and the local op_errno as its error variable.
+ * rsp->xdata is then unserialized into *xdata.
+ *
+ * Returns -op_errno. op_errno starts at 0 and is only written by
+ * GF_PROTOCOL_DICT_UNSERIALIZE; the local ret is passed to the macro but is
+ * not part of the return value.
+ * NOTE(review): the macro is defined outside this view; presumably it sets
+ * the error variable and jumps to 'out' on a decode failure -- confirm.
+ */
+int
+client_post_fgetxattr(xlator_t *this, gfs3_fgetxattr_rsp *rsp, dict_t **dict,
+                      dict_t **xdata)
+{
+    int op_errno = 0;
+    int ret = 0;
+
+    if (-1 != rsp->op_ret) {
+        GF_PROTOCOL_DICT_UNSERIALIZE(this, *dict, (rsp->dict.dict_val),
+                                     (rsp->dict.dict_len), rsp->op_ret,
+                                     op_errno, out);
+    }
+    GF_PROTOCOL_DICT_UNSERIALIZE(this, *xdata, (rsp->xdata.xdata_val),
+                                 (rsp->xdata.xdata_len), ret, op_errno, out);
+
+out:
+    return -op_errno;
+}
+
+/*
+ * Post-process an fsetxattr reply: rsp->xdata is handed to
+ * GF_PROTOCOL_DICT_UNSERIALIZE to fill *xdata, and on the straight-line path
+ * *xdata is then passed to gf_replace_new_iatt_in_dict().
+ *
+ * Returns the result of gf_replace_new_iatt_in_dict(), or whatever ret holds
+ * if the macro jumps to 'out' first.
+ * NOTE(review): the macro is defined outside this view; it receives ret,
+ * rsp->op_errno and the 'out' label, presumably to report decode failures.
+ */
+int
+client_post_fsetxattr(xlator_t *this, gf_common_rsp *rsp, dict_t **xdata)
+{
+    int ret = 0;
+
+    GF_PROTOCOL_DICT_UNSERIALIZE(this, *xdata, (rsp->xdata.xdata_val),
+                                 (rsp->xdata.xdata_len), ret, rsp->op_errno,
+                                 out);
+
+    ret = gf_replace_new_iatt_in_dict(*xdata);
+out:
+    return ret;
+}
+
+/*
+ * Post-process an rchecksum reply: the only work is handing rsp->xdata to
+ * GF_PROTOCOL_DICT_UNSERIALIZE to fill *xdata.
+ *
+ * Returns ret, which starts at 0 and is only written by the macro.
+ * NOTE(review): the macro is defined outside this view; it receives ret,
+ * rsp->op_errno and the 'out' label, presumably to report decode failures.
+ */
+int
+client_post_rchecksum(xlator_t *this, gfs3_rchecksum_rsp *rsp, dict_t **xdata)
+{
+    int ret = 0;
+
+    GF_PROTOCOL_DICT_UNSERIALIZE(this, *xdata, (rsp->xdata.xdata_val),
+                                 (rsp->xdata.xdata_len), ret, rsp->op_errno,
+                                 out);
+out:
+    return ret;
+}
+
+/*
+ * Post-process a setattr reply. Unless rsp->op_ret is -1, rsp->statpre and
+ * rsp->statpost are converted into prestat/poststat with gf_stat_to_iatt();
+ * rsp->xdata is then handed to GF_PROTOCOL_DICT_UNSERIALIZE to fill *xdata.
+ *
+ * Returns ret, which starts at 0 and is only written by the macro.
+ * NOTE(review): the macro is defined outside this view; it receives ret,
+ * rsp->op_errno and the 'out' label, presumably to report decode failures.
+ */
+int
+client_post_setattr(xlator_t *this, gfs3_setattr_rsp *rsp, struct iatt *prestat,
+                    struct iatt *poststat, dict_t **xdata)
+{
+    int ret = 0;
+
+    if (-1 != rsp->op_ret) {
+        gf_stat_to_iatt(&rsp->statpre, prestat);
+        gf_stat_to_iatt(&rsp->statpost, poststat);
+    }
+    GF_PROTOCOL_DICT_UNSERIALIZE(this, *xdata, (rsp->xdata.xdata_val),
+                                 (rsp->xdata.xdata_len), ret, rsp->op_errno,
+                                 out);
+out:
+    return ret;
+}
+
+/*
+ * Post-process an fsetattr reply. Unless rsp->op_ret is -1, rsp->statpre
+ * and rsp->statpost are converted into prestat/poststat with
+ * gf_stat_to_iatt(); rsp->xdata is then handed to
+ * GF_PROTOCOL_DICT_UNSERIALIZE to fill *xdata.
+ *
+ * Returns ret, which starts at 0 and is only written by the macro.
+ * NOTE(review): the macro is defined outside this view; it receives ret,
+ * rsp->op_errno and the 'out' label, presumably to report decode failures.
+ */
+int
+client_post_fsetattr(xlator_t *this, gfs3_fsetattr_rsp *rsp,
+                     struct iatt *prestat, struct iatt *poststat,
+                     dict_t **xdata)
+{
+    int ret = 0;
+
+    if (-1 != rsp->op_ret) {
+        gf_stat_to_iatt(&rsp->statpre, prestat);
+        gf_stat_to_iatt(&rsp->statpost, poststat);
+    }
+    GF_PROTOCOL_DICT_UNSERIALIZE(this, *xdata, (rsp->xdata.xdata_val),
+                                 (rsp->xdata.xdata_len), ret, rsp->op_errno,
+                                 out);
+out:
+    return ret;
+}
+
+/*
+ * Post-process a readdirp reply. When rsp->op_ret is positive the entries
+ * are decoded with unserialize_rsp_direntp(), which is also given fd (its
+ * return value is not checked); rsp->xdata is then handed to
+ * GF_PROTOCOL_DICT_UNSERIALIZE to fill *xdata.
+ *
+ * Returns ret, which starts at 0 and is only written by the macro.
+ * NOTE(review): the macro is defined outside this view; it receives ret,
+ * rsp->op_errno and the 'out' label, presumably to report decode failures.
+ */
+int
+client_post_readdirp(xlator_t *this, gfs3_readdirp_rsp *rsp, fd_t *fd,
+                     gf_dirent_t *entries, dict_t **xdata)
+{
+    int ret = 0;
+
+    if (rsp->op_ret > 0) {
+        unserialize_rsp_direntp(this, fd, rsp, entries);
+    }
+
+    GF_PROTOCOL_DICT_UNSERIALIZE(this, *xdata, (rsp->xdata.xdata_val),
+                                 (rsp->xdata.xdata_len), ret, rsp->op_errno,
+                                 out);
+out:
+    return ret;
+}
+
+/*
+ * Post-process an fremovexattr reply: rsp->xdata is handed to
+ * GF_PROTOCOL_DICT_UNSERIALIZE to fill *xdata, and on the straight-line path
+ * *xdata is then passed to gf_replace_new_iatt_in_dict().
+ *
+ * Returns the result of gf_replace_new_iatt_in_dict(), or whatever ret holds
+ * if the macro jumps to 'out' first.
+ * NOTE(review): the macro is defined outside this view; it receives ret,
+ * rsp->op_errno and the 'out' label, presumably to report decode failures.
+ */
+int
+client_post_fremovexattr(xlator_t *this, gf_common_rsp *rsp, dict_t **xdata)
+{
+    int ret = 0;
+
+    GF_PROTOCOL_DICT_UNSERIALIZE(this, *xdata, (rsp->xdata.xdata_val),
+                                 (rsp->xdata.xdata_len), ret, rsp->op_errno,
+                                 out);
+
+    ret = gf_replace_new_iatt_in_dict(*xdata);
+out:
+    return ret;
+}
+
+/*
+ * Post-process a fallocate reply. Unless rsp->op_ret is -1, rsp->statpre
+ * and rsp->statpost are converted into prestat/poststat with
+ * gf_stat_to_iatt(); rsp->xdata is then handed to
+ * GF_PROTOCOL_DICT_UNSERIALIZE to fill *xdata.
+ *
+ * Returns ret, which starts at 0 and is only written by the macro.
+ * NOTE(review): the macro is defined outside this view; it receives ret,
+ * rsp->op_errno and the 'out' label, presumably to report decode failures.
+ */
+int
+client_post_fallocate(xlator_t *this, gfs3_fallocate_rsp *rsp,
+                      struct iatt *prestat, struct iatt *poststat,
+                      dict_t **xdata)
+{
+    int ret = 0;
+
+    if (-1 != rsp->op_ret) {
+        gf_stat_to_iatt(&rsp->statpre, prestat);
+        gf_stat_to_iatt(&rsp->statpost, poststat);
+    }
+    GF_PROTOCOL_DICT_UNSERIALIZE(this, *xdata, (rsp->xdata.xdata_val),
+                                 (rsp->xdata.xdata_len), ret, rsp->op_errno,
+                                 out);
+out:
+    return ret;
+}
+
+/*
+ * Post-process a discard reply. Unless rsp->op_ret is -1, rsp->statpre and
+ * rsp->statpost are converted into prestat/poststat with gf_stat_to_iatt();
+ * rsp->xdata is then handed to GF_PROTOCOL_DICT_UNSERIALIZE to fill *xdata.
+ *
+ * Returns ret, which starts at 0 and is only written by the macro.
+ * NOTE(review): the macro is defined outside this view; it receives ret,
+ * rsp->op_errno and the 'out' label, presumably to report decode failures.
+ */
+int
+client_post_discard(xlator_t *this, gfs3_discard_rsp *rsp, struct iatt *prestat,
+                    struct iatt *poststat, dict_t **xdata)
+{
+    int ret = 0;
+
+    if (-1 != rsp->op_ret) {
+        gf_stat_to_iatt(&rsp->statpre, prestat);
+        gf_stat_to_iatt(&rsp->statpost, poststat);
+    }
+    GF_PROTOCOL_DICT_UNSERIALIZE(this, *xdata, (rsp->xdata.xdata_val),
+                                 (rsp->xdata.xdata_len), ret, rsp->op_errno,
+                                 out);
+out:
+    return ret;
+}
+
+/*
+ * Post-process a zerofill reply. Unless rsp->op_ret is -1, rsp->statpre and
+ * rsp->statpost are converted into prestat/poststat with gf_stat_to_iatt();
+ * rsp->xdata is then handed to GF_PROTOCOL_DICT_UNSERIALIZE to fill *xdata.
+ *
+ * Returns ret, which starts at 0 and is only written by the macro.
+ * NOTE(review): the macro is defined outside this view; it receives ret,
+ * rsp->op_errno and the 'out' label, presumably to report decode failures.
+ */
+int
+client_post_zerofill(xlator_t *this, gfs3_zerofill_rsp *rsp,
+                     struct iatt *prestat, struct iatt *poststat,
+                     dict_t **xdata)
+{
+    int ret = 0;
+
+    if (-1 != rsp->op_ret) {
+        gf_stat_to_iatt(&rsp->statpre, prestat);
+        gf_stat_to_iatt(&rsp->statpost, poststat);
+    }
+    GF_PROTOCOL_DICT_UNSERIALIZE(this, *xdata, (rsp->xdata.xdata_val),
+                                 (rsp->xdata.xdata_len), ret, rsp->op_errno,
+                                 out);
+out:
+    return ret;
+}
+
+/*
+ * Post-process an ipc reply: the only work is handing rsp->xdata to
+ * GF_PROTOCOL_DICT_UNSERIALIZE to fill *xdata.
+ *
+ * Returns ret, which starts at 0 and is only written by the macro.
+ * NOTE(review): the macro is defined outside this view; it receives ret,
+ * rsp->op_errno and the 'out' label, presumably to report decode failures.
+ */
+int
+client_post_ipc(xlator_t *this, gfs3_ipc_rsp *rsp, dict_t **xdata)
+{
+    int ret = 0;
+
+    GF_PROTOCOL_DICT_UNSERIALIZE(this, *xdata, (rsp->xdata.xdata_val),
+                                 (rsp->xdata.xdata_len), ret, rsp->op_errno,
+                                 out);
+out:
+    return ret;
+}
+
+/*
+ * Post-process a seek reply: the only work is handing rsp->xdata to
+ * GF_PROTOCOL_DICT_UNSERIALIZE to fill *xdata.
+ *
+ * Returns ret, which starts at 0 and is only written by the macro.
+ * NOTE(review): the macro is defined outside this view; it receives ret,
+ * rsp->op_errno and the 'out' label, presumably to report decode failures.
+ */
+int
+client_post_seek(xlator_t *this, gfs3_seek_rsp *rsp, dict_t **xdata)
+{
+    int ret = 0;
+
+    GF_PROTOCOL_DICT_UNSERIALIZE(this, *xdata, (rsp->xdata.xdata_val),
+                                 (rsp->xdata.xdata_len), ret, rsp->op_errno,
+                                 out);
+out:
+    return ret;
+}
+
+/*
+ * Post-process a lease reply. When rsp->op_ret is >= 0, rsp->lease is
+ * converted into lease with gf_proto_lease_to_lease(); rsp->xdata is then
+ * handed to GF_PROTOCOL_DICT_UNSERIALIZE to fill *xdata.
+ *
+ * Returns ret, which starts at 0 and is only written by the macro.
+ * NOTE(review): the macro is defined outside this view; it receives ret,
+ * rsp->op_errno and the 'out' label, presumably to report decode failures.
+ */
+int
+client_post_lease(xlator_t *this, gfs3_lease_rsp *rsp, struct gf_lease *lease,
+                  dict_t **xdata)
+{
+    int ret = 0;
+
+    if (rsp->op_ret >= 0) {
+        gf_proto_lease_to_lease(&rsp->lease, lease);
+    }
+
+    GF_PROTOCOL_DICT_UNSERIALIZE(this, *xdata, (rsp->xdata.xdata_val),
+                                 (rsp->xdata.xdata_len), ret, rsp->op_errno,
+                                 out);
+out:
+    return ret;
+}
+
+/* New PRE and POST functions */
+
+/*
+ * Shared post-processing for replies that carry one stat. Unless
+ * rsp->op_ret is -1, rsp->stat is converted into iatt with
+ * gfx_stat_to_iattx(). rsp->xdata is always decoded into xdata with
+ * xdr_to_dict(), whose result is the return value.
+ */
+int
+client_post_common_iatt(xlator_t *this, gfx_common_iatt_rsp *rsp,
+                        struct iatt *iatt, dict_t **xdata)
+{
+    int decode_ret = 0;
+
+    if (rsp->op_ret != -1)
+        gfx_stat_to_iattx(&rsp->stat, iatt);
+
+    decode_ret = xdr_to_dict(&rsp->xdata, xdata);
+    return decode_ret;
+}
+
+/*
+ * Shared post-processing for replies that carry a pre/post stat pair.
+ * Unless rsp->op_ret is -1, rsp->prestat goes to iatt and rsp->poststat to
+ * iatt2 via gfx_stat_to_iattx(). rsp->xdata is always decoded into xdata
+ * with xdr_to_dict(), whose result is the return value.
+ */
+int
+client_post_common_2iatt(xlator_t *this, gfx_common_2iatt_rsp *rsp,
+                         struct iatt *iatt, struct iatt *iatt2, dict_t **xdata)
+{
+    int decode_ret = 0;
+
+    if (rsp->op_ret != -1) {
+        gfx_stat_to_iattx(&rsp->prestat, iatt);
+        gfx_stat_to_iattx(&rsp->poststat, iatt2);
+    }
+
+    decode_ret = xdr_to_dict(&rsp->xdata, xdata);
+    return decode_ret;
+}
+
+/*
+ * Shared post-processing for replies that carry an entry stat plus the
+ * parent's pre/post stats. Unless rsp->op_ret is -1, rsp->stat goes to iatt,
+ * rsp->preparent to iatt2 and rsp->postparent to iatt3 via
+ * gfx_stat_to_iattx(). rsp->xdata is always decoded into xdata with
+ * xdr_to_dict(), whose result is the return value.
+ */
+int
+client_post_common_3iatt(xlator_t *this, gfx_common_3iatt_rsp *rsp,
+                         struct iatt *iatt, struct iatt *iatt2,
+                         struct iatt *iatt3, dict_t **xdata)
+{
+    int decode_ret = 0;
+
+    if (rsp->op_ret != -1) {
+        gfx_stat_to_iattx(&rsp->stat, iatt);
+        gfx_stat_to_iattx(&rsp->preparent, iatt2);
+        gfx_stat_to_iattx(&rsp->postparent, iatt3);
+    }
+
+    decode_ret = xdr_to_dict(&rsp->xdata, xdata);
+    return decode_ret;
+}
+
+/*
+ * Shared post-processing for replies that carry a payload dictionary.
+ * rsp->dict is decoded into dict and rsp->xdata into xdata, both with
+ * xdr_to_dict(). Only the payload decode result is returned; a non-zero
+ * result is logged at debug level, and the xdata decode result is dropped.
+ */
+int
+client_post_common_dict(xlator_t *this, gfx_common_dict_rsp *rsp, dict_t **dict,
+                        dict_t **xdata)
+{
+    int dict_ret = xdr_to_dict(&rsp->dict, dict);
+
+    if (dict_ret != 0) {
+        gf_msg_debug(this->name, EINVAL,
+                     "while decoding found empty dictionary");
+    }
+
+    /* xdata is decoded regardless of how the payload decode went. */
+    xdr_to_dict(&rsp->xdata, xdata);
+
+    return dict_ret;
+}
+
+/*
+ * Post-process a v2 readv reply. Unless rsp->op_ret is -1:
+ *   - *iobref is pointed at rsp_iobref,
+ *   - rsp->stat is converted into stat with gfx_stat_to_iattx(),
+ *   - vector[0].iov_len is set to rsp->op_ret and, when that is positive,
+ *     vector[0].iov_base is taken from rsp_vector,
+ *   - *rspcount is set to 1.
+ * rsp->xdata is always decoded into xdata with xdr_to_dict(), whose result
+ * is the return value.
+ */
+int
+client_post_readv_v2(xlator_t *this, gfx_read_rsp *rsp, struct iobref **iobref,
+                     struct iobref *rsp_iobref, struct iatt *stat,
+                     struct iovec *vector, struct iovec *rsp_vector,
+                     int *rspcount, dict_t **xdata)
+{
+    int ret = -1;
+
+    if (rsp->op_ret != -1) {
+        *iobref = rsp_iobref;
+        gfx_stat_to_iattx(&rsp->stat, stat);
+
+        vector[0].iov_len = rsp->op_ret;
+        /* iov_base is left untouched for a zero-length read. */
+        if (rsp->op_ret > 0)
+            vector[0].iov_base = rsp_vector->iov_base;
+        *rspcount = 1;
+    }
+
+    ret = xdr_to_dict(&rsp->xdata, xdata);
+
+#ifdef GF_TESTING_IO_XDATA
+    /* xdata is a dict_t **; dump the dictionary it points to. */
+    dict_dump_to_log(*xdata);
+#endif
+    return ret;
+}
+
+/*
+ * Fill a v2 stat request from loc. The gfid is copied from
+ * loc->inode->gfid when that is non-null, otherwise from loc->gfid, and
+ * xdata is encoded into req->xdata with dict_to_xdr().
+ *
+ * Returns 0 on the straight-line path and -op_errno through 'out'. op_errno
+ * starts as ESTALE, so a NULL loc or loc->inode yields -ESTALE.
+ * NOTE(review): GF_ASSERT_AND_GOTO_WITH_ERROR is defined outside this view;
+ * it is given 'out', op_errno and EINVAL for the null-gfid check, so that
+ * case presumably returns -EINVAL -- confirm.
+ */
+int
+client_pre_stat_v2(xlator_t *this, gfx_stat_req *req, loc_t *loc, dict_t *xdata)
+{
+    int op_errno = ESTALE;
+
+    if (!(loc && loc->inode))
+        goto out;
+
+    if (!gf_uuid_is_null(loc->inode->gfid))
+        memcpy(req->gfid, loc->inode->gfid, 16);
+    else
+        memcpy(req->gfid, loc->gfid, 16);
+
+    GF_ASSERT_AND_GOTO_WITH_ERROR(this->name,
+                                  !gf_uuid_is_null(*((uuid_t *)req->gfid)), out,
+                                  op_errno, EINVAL);
+
+    dict_to_xdr(xdata, &req->xdata);
+
+    return 0;
+out:
+    return -op_errno;
+}
+
+/*
+ * Fill a v2 readlink request from loc. The gfid is copied from
+ * loc->inode->gfid when that is non-null, otherwise from loc->gfid;
+ * req->size is set from size and xdata is encoded into req->xdata with
+ * dict_to_xdr().
+ *
+ * Returns 0 on the straight-line path and -op_errno through 'out'. op_errno
+ * starts as ESTALE, so a NULL loc or loc->inode yields -ESTALE.
+ * NOTE(review): GF_ASSERT_AND_GOTO_WITH_ERROR is defined outside this view;
+ * it is given 'out', op_errno and EINVAL for the null-gfid check, so that
+ * case presumably returns -EINVAL -- confirm.
+ */
+int
+client_pre_readlink_v2(xlator_t *this, gfx_readlink_req *req, loc_t *loc,
+                       size_t size, dict_t *xdata)
+{
+    int op_errno = ESTALE;
+
+    if (!(loc && loc->inode))
+        goto out;
+
+    if (!gf_uuid_is_null(loc->inode->gfid))
+        memcpy(req->gfid, loc->inode->gfid, 16);
+    else
+        memcpy(req->gfid, loc->gfid, 16);
+
+    GF_ASSERT_AND_GOTO_WITH_ERROR(this->name,
+                                  !gf_uuid_is_null(*((uuid_t *)req->gfid)), out,
+                                  op_errno, EINVAL);
+    req->size = size;
+    dict_to_xdr(xdata, &req->xdata);
+    return 0;
+out:
+    return -op_errno;
+}
+
+/*
+ * Fill a v2 mknod request from loc. The parent gfid is copied from
+ * loc->parent->gfid when that is non-null, otherwise from loc->pargfid.
+ * req->bname points at loc->name (the string is not copied); mode, dev and
+ * umask are stored and xdata is encoded into req->xdata with dict_to_xdr().
+ *
+ * Returns 0 on the straight-line path and -op_errno through 'out'. op_errno
+ * starts as ESTALE, so a NULL loc or loc->parent yields -ESTALE.
+ * NOTE(review): GF_ASSERT_AND_GOTO_WITH_ERROR is defined outside this view;
+ * it is given 'out', op_errno and EINVAL for the null-pargfid check, so that
+ * case presumably returns -EINVAL -- confirm.
+ */
+int
+client_pre_mknod_v2(xlator_t *this, gfx_mknod_req *req, loc_t *loc, mode_t mode,
+                    dev_t rdev, mode_t umask, dict_t *xdata)
+{
+    int op_errno = ESTALE;
+
+    if (!(loc && loc->parent))
+        goto out;
+
+    if (!gf_uuid_is_null(loc->parent->gfid))
+        memcpy(req->pargfid, loc->parent->gfid, 16);
+    else
+        memcpy(req->pargfid, loc->pargfid, 16);
+
+    GF_ASSERT_AND_GOTO_WITH_ERROR(this->name,
+                                  !gf_uuid_is_null(*((uuid_t *)req->pargfid)),
+                                  out, op_errno, EINVAL);
+    req->bname = (char *)loc->name;
+    req->mode = mode;
+    req->dev = rdev;
+    req->umask = umask;
+
+    dict_to_xdr(xdata, &req->xdata);
+
+    return 0;
+out:
+    return -op_errno;
+}
+
+/*
+ * Fill a v2 mkdir request from loc. The parent gfid is copied from
+ * loc->parent->gfid when that is non-null, otherwise from loc->pargfid.
+ * req->bname points at loc->name (the string is not copied); mode and umask
+ * are stored and xdata is encoded into req->xdata with dict_to_xdr().
+ *
+ * Returns 0 on the straight-line path and -op_errno through 'out'. op_errno
+ * starts as ESTALE, so a NULL loc or loc->parent yields -ESTALE.
+ * NOTE(review): GF_ASSERT_AND_GOTO_WITH_ERROR is defined outside this view;
+ * it is given 'out', op_errno and EINVAL for the null-pargfid check, so that
+ * case presumably returns -EINVAL -- confirm.
+ */
+int
+client_pre_mkdir_v2(xlator_t *this, gfx_mkdir_req *req, loc_t *loc, mode_t mode,
+                    mode_t umask, dict_t *xdata)
+{
+    int op_errno = ESTALE;
+
+    if (!(loc && loc->parent))
+        goto out;
+
+    if (!gf_uuid_is_null(loc->parent->gfid))
+        memcpy(req->pargfid, loc->parent->gfid, 16);
+    else
+        memcpy(req->pargfid, loc->pargfid, 16);
+
+    GF_ASSERT_AND_GOTO_WITH_ERROR(this->name,
+                                  !gf_uuid_is_null(*((uuid_t *)req->pargfid)),
+                                  out, op_errno, EINVAL);
+
+    req->bname = (char *)loc->name;
+    req->mode = mode;
+    req->umask = umask;
+
+    dict_to_xdr(xdata, &req->xdata);
+
+    return 0;
+out:
+    return -op_errno;
+}
+
+/*
+ * Fill a v2 unlink request from loc. The parent gfid is copied from
+ * loc->parent->gfid when that is non-null, otherwise from loc->pargfid.
+ * req->bname points at loc->name (the string is not copied), req->xflags is
+ * set from flags and xdata is encoded into req->xdata with dict_to_xdr().
+ *
+ * Returns 0 on the straight-line path and -op_errno through 'out'. op_errno
+ * starts as ESTALE so that a NULL loc or loc->parent yields -ESTALE; with
+ * the previous initial value of 0 that guard returned 0 and reported
+ * success for a request that had never been filled in.
+ * NOTE(review): GF_ASSERT_AND_GOTO_WITH_ERROR is defined outside this view;
+ * it is given 'out', op_errno and EINVAL for the null-pargfid check, so that
+ * case presumably returns -EINVAL -- confirm.
+ */
+int
+client_pre_unlink_v2(xlator_t *this, gfx_unlink_req *req, loc_t *loc,
+                     int32_t flags, dict_t *xdata)
+{
+    int op_errno = ESTALE;
+
+    if (!(loc && loc->parent))
+        goto out;
+
+    if (!gf_uuid_is_null(loc->parent->gfid))
+        memcpy(req->pargfid, loc->parent->gfid, 16);
+    else
+        memcpy(req->pargfid, loc->pargfid, 16);
+
+    GF_ASSERT_AND_GOTO_WITH_ERROR(this->name,
+                                  !gf_uuid_is_null(*((uuid_t *)req->pargfid)),
+                                  out, op_errno, EINVAL);
+    req->bname = (char *)loc->name;
+    req->xflags = flags;
+
+    dict_to_xdr(xdata, &req->xdata);
+
+    return 0;
+out:
+    return -op_errno;
+}
+
+/*
+ * Fill a v2 rmdir request from loc. The parent gfid is copied from
+ * loc->parent->gfid when that is non-null, otherwise from loc->pargfid.
+ * req->bname points at loc->name (the string is not copied), req->xflags is
+ * set from flags and xdata is encoded into req->xdata with dict_to_xdr().
+ *
+ * Returns 0 on the straight-line path and -op_errno through 'out'. op_errno
+ * starts as ESTALE, so a NULL loc or loc->parent yields -ESTALE.
+ * NOTE(review): GF_ASSERT_AND_GOTO_WITH_ERROR is defined outside this view;
+ * it is given 'out', op_errno and EINVAL for the null-pargfid check, so that
+ * case presumably returns -EINVAL -- confirm.
+ */
+int
+client_pre_rmdir_v2(xlator_t *this, gfx_rmdir_req *req, loc_t *loc,
+                    int32_t flags, dict_t *xdata)
+{
+    int op_errno = ESTALE;
+
+    if (!(loc && loc->parent))
+        goto out;
+
+    if (!gf_uuid_is_null(loc->parent->gfid))
+        memcpy(req->pargfid, loc->parent->gfid, 16);
+    else
+        memcpy(req->pargfid, loc->pargfid, 16);
+
+    GF_ASSERT_AND_GOTO_WITH_ERROR(this->name,
+                                  !gf_uuid_is_null(*((uuid_t *)req->pargfid)),
+                                  out, op_errno, EINVAL);
+    req->bname = (char *)loc->name;
+    req->xflags = flags;
+
+    dict_to_xdr(xdata, &req->xdata);
+
+    return 0;
+out:
+    return -op_errno;
+}
+
+/*
+ * Fill a v2 symlink request from loc. The parent gfid is copied from
+ * loc->parent->gfid when that is non-null, otherwise from loc->pargfid.
+ * req->linkname and req->bname point at linkname and loc->name (neither
+ * string is copied), umask is stored and xdata is encoded into req->xdata
+ * with dict_to_xdr().
+ *
+ * Returns 0 on the straight-line path and -op_errno through 'out'. op_errno
+ * starts as ESTALE, so a NULL loc or loc->parent yields -ESTALE.
+ * NOTE(review): GF_ASSERT_AND_GOTO_WITH_ERROR is defined outside this view;
+ * it is given 'out', op_errno and EINVAL for the null-pargfid check, so that
+ * case presumably returns -EINVAL -- confirm.
+ */
+int
+client_pre_symlink_v2(xlator_t *this, gfx_symlink_req *req, loc_t *loc,
+                      const char *linkname, mode_t umask, dict_t *xdata)
+{
+    int op_errno = ESTALE;
+
+    if (!(loc && loc->parent))
+        goto out;
+
+    if (!gf_uuid_is_null(loc->parent->gfid))
+        memcpy(req->pargfid, loc->parent->gfid, 16);
+    else
+        memcpy(req->pargfid, loc->pargfid, 16);
+
+    GF_ASSERT_AND_GOTO_WITH_ERROR(this->name,
+                                  !gf_uuid_is_null(*((uuid_t *)req->pargfid)),
+                                  out, op_errno, EINVAL);
+    req->linkname = (char *)linkname;
+    req->bname = (char *)loc->name;
+    req->umask = umask;
+
+    dict_to_xdr(xdata, &req->xdata);
+    return 0;
+out:
+    return -op_errno;
+}
+
+/*
+ * Fill a v2 rename request. req->oldgfid and req->newgfid receive the
+ * PARENT gfids of oldloc and newloc: each is copied from loc->parent->gfid
+ * when that is non-null, otherwise from loc->pargfid. req->oldbname and
+ * req->newbname point at the two loc names (not copied) and xdata is
+ * encoded into req->xdata with dict_to_xdr().
+ *
+ * Returns 0 on the straight-line path and -op_errno through 'out'. op_errno
+ * starts as ESTALE, so a NULL loc or a missing parent on either side yields
+ * -ESTALE.
+ * NOTE(review): GF_ASSERT_AND_GOTO_WITH_ERROR is defined outside this view;
+ * it is given 'out', op_errno and EINVAL for each null-gfid check, so those
+ * cases presumably return -EINVAL -- confirm.
+ */
+int
+client_pre_rename_v2(xlator_t *this, gfx_rename_req *req, loc_t *oldloc,
+                     loc_t *newloc, dict_t *xdata)
+{
+    int op_errno = ESTALE;
+
+    if (!(oldloc && newloc && oldloc->parent && newloc->parent))
+        goto out;
+
+    if (!gf_uuid_is_null(oldloc->parent->gfid))
+        memcpy(req->oldgfid, oldloc->parent->gfid, 16);
+    else
+        memcpy(req->oldgfid, oldloc->pargfid, 16);
+
+    if (!gf_uuid_is_null(newloc->parent->gfid))
+        memcpy(req->newgfid, newloc->parent->gfid, 16);
+    else
+        memcpy(req->newgfid, newloc->pargfid, 16);
+
+    GF_ASSERT_AND_GOTO_WITH_ERROR(this->name,
+                                  !gf_uuid_is_null(*((uuid_t *)req->oldgfid)),
+                                  out, op_errno, EINVAL);
+    GF_ASSERT_AND_GOTO_WITH_ERROR(this->name,
+                                  !gf_uuid_is_null(*((uuid_t *)req->newgfid)),
+                                  out, op_errno, EINVAL);
+    req->oldbname = (char *)oldloc->name;
+    req->newbname = (char *)newloc->name;
+
+    dict_to_xdr(xdata, &req->xdata);
+
+    return 0;
+out:
+    return -op_errno;
+}
+
+/*
+ * Fill a v2 link request. req->oldgfid is the gfid of the existing inode
+ * (oldloc->inode->gfid when non-null, otherwise oldloc->gfid), while
+ * req->newgfid is the PARENT gfid of the new name (newloc->parent->gfid when
+ * non-null, otherwise newloc->pargfid). req->newbname points at
+ * newloc->name (not copied) and xdata is encoded into req->xdata with
+ * dict_to_xdr().
+ *
+ * Returns 0 on the straight-line path and -op_errno through 'out'. op_errno
+ * starts as ESTALE, so a NULL loc, a missing oldloc->inode or a missing
+ * newloc->parent yields -ESTALE.
+ * NOTE(review): GF_ASSERT_AND_GOTO_WITH_ERROR is defined outside this view;
+ * it is given 'out', op_errno and EINVAL for each null-gfid check, so those
+ * cases presumably return -EINVAL -- confirm.
+ */
+int
+client_pre_link_v2(xlator_t *this, gfx_link_req *req, loc_t *oldloc,
+                   loc_t *newloc, dict_t *xdata)
+{
+    int op_errno = ESTALE;
+
+    if (!(oldloc && oldloc->inode && newloc && newloc->parent))
+        goto out;
+
+    if (!gf_uuid_is_null(oldloc->inode->gfid))
+        memcpy(req->oldgfid, oldloc->inode->gfid, 16);
+    else
+        memcpy(req->oldgfid, oldloc->gfid, 16);
+
+    if (!gf_uuid_is_null(newloc->parent->gfid))
+        memcpy(req->newgfid, newloc->parent->gfid, 16);
+    else
+        memcpy(req->newgfid, newloc->pargfid, 16);
+
+    GF_ASSERT_AND_GOTO_WITH_ERROR(this->name,
+                                  !gf_uuid_is_null(*((uuid_t *)req->oldgfid)),
+                                  out, op_errno, EINVAL);
+    GF_ASSERT_AND_GOTO_WITH_ERROR(this->name,
+                                  !gf_uuid_is_null(*((uuid_t *)req->newgfid)),
+                                  out, op_errno, EINVAL);
+    req->newbname = (char *)newloc->name;
+
+    dict_to_xdr(xdata, &req->xdata);
+
+    return 0;
+out:
+    return -op_errno;
+}
+
+/*
+ * Fill a v2 truncate request from loc. The gfid is copied from
+ * loc->inode->gfid when that is non-null, otherwise from loc->gfid;
+ * req->offset is set from offset and xdata is encoded into req->xdata with
+ * dict_to_xdr().
+ *
+ * Returns 0 on the straight-line path and -op_errno through 'out'. op_errno
+ * starts as ESTALE, so a NULL loc or loc->inode yields -ESTALE.
+ * NOTE(review): GF_ASSERT_AND_GOTO_WITH_ERROR is defined outside this view;
+ * it is given 'out', op_errno and EINVAL for the null-gfid check, so that
+ * case presumably returns -EINVAL -- confirm.
+ */
+int
+client_pre_truncate_v2(xlator_t *this, gfx_truncate_req *req, loc_t *loc,
+                       off_t offset, dict_t *xdata)
+{
+    int op_errno = ESTALE;
+
+    if (!(loc && loc->inode))
+        goto out;
+
+    if (!gf_uuid_is_null(loc->inode->gfid))
+        memcpy(req->gfid, loc->inode->gfid, 16);
+    else
+        memcpy(req->gfid, loc->gfid, 16);
+
+    GF_ASSERT_AND_GOTO_WITH_ERROR(this->name,
+                                  !gf_uuid_is_null(*((uuid_t *)req->gfid)), out,
+                                  op_errno, EINVAL);
+    req->offset = offset;
+
+    dict_to_xdr(xdata, &req->xdata);
+    return 0;
+out:
+    return -op_errno;
+}
+
+/*
+ * Fill a v2 open request from loc. The gfid is copied from
+ * loc->inode->gfid when that is non-null, otherwise from loc->gfid;
+ * req->flags is the result of gf_flags_from_flags(flags) and xdata is
+ * encoded into req->xdata with dict_to_xdr(). The fd parameter is not used
+ * in this function.
+ *
+ * Returns 0 on the straight-line path and -op_errno through 'out'. op_errno
+ * starts as ESTALE, so a NULL loc or loc->inode yields -ESTALE.
+ * NOTE(review): GF_ASSERT_AND_GOTO_WITH_ERROR is defined outside this view;
+ * it is given 'out', op_errno and EINVAL for the null-gfid check, so that
+ * case presumably returns -EINVAL -- confirm.
+ */
+int
+client_pre_open_v2(xlator_t *this, gfx_open_req *req, loc_t *loc, fd_t *fd,
+                   int32_t flags, dict_t *xdata)
+{
+    int op_errno = ESTALE;
+
+    if (!(loc && loc->inode))
+        goto out;
+
+    if (!gf_uuid_is_null(loc->inode->gfid))
+        memcpy(req->gfid, loc->inode->gfid, 16);
+    else
+        memcpy(req->gfid, loc->gfid, 16);
+
+    GF_ASSERT_AND_GOTO_WITH_ERROR(this->name,
+                                  !gf_uuid_is_null(*((uuid_t *)req->gfid)), out,
+                                  op_errno, EINVAL);
+    req->flags = gf_flags_from_flags(flags);
+
+    dict_to_xdr(xdata, &req->xdata);
+
+    return 0;
+out:
+    return -op_errno;
+}
+
+/*
+ * Fill a v2 readv request: size, offset and flags are stored, req->fd is
+ * the remote fd obtained through CLIENT_GET_REMOTE_FD (invoked with
+ * FALLBACK_TO_ANON_FD), the gfid is copied from fd->inode->gfid and xdata
+ * is encoded into req->xdata with dict_to_xdr().
+ *
+ * Returns 0 on the straight-line path and -op_errno through 'out'.
+ * NOTE(review): CLIENT_GET_REMOTE_FD is defined outside this view; it is
+ * given remote_fd, op_errno and the 'out' label, so presumably it sets
+ * op_errno before jumping on failure (otherwise ESTALE is returned).
+ */
+int
+client_pre_readv_v2(xlator_t *this, gfx_read_req *req, fd_t *fd, size_t size,
+                    off_t offset, int32_t flags, dict_t *xdata)
+{
+    int64_t remote_fd = -1;
+    int op_errno = ESTALE;
+
+    CLIENT_GET_REMOTE_FD(this, fd, FALLBACK_TO_ANON_FD, remote_fd, op_errno,
+                         out);
+
+    req->size = size;
+    req->offset = offset;
+    req->fd = remote_fd;
+    req->flag = flags;
+
+    memcpy(req->gfid, fd->inode->gfid, 16);
+
+    dict_to_xdr(xdata, &req->xdata);
+
+    return 0;
+out:
+    return -op_errno;
+}
+
+/*
+ * Fill a v2 writev request: size, offset and flags are stored, req->fd is
+ * the remote fd obtained through CLIENT_GET_REMOTE_FD (invoked with
+ * FALLBACK_TO_ANON_FD), the gfid is copied from fd->inode->gfid and *xdata
+ * is encoded into req->xdata with dict_to_xdr().
+ *
+ * xdata is a dict_t ** so that the GF_TESTING_IO_XDATA build can create the
+ * dictionary on the caller's behalf; it is dereferenced unconditionally.
+ *
+ * Returns 0 on the straight-line path and -op_errno through 'out'.
+ * NOTE(review): CLIENT_GET_REMOTE_FD is defined outside this view; it is
+ * given remote_fd, op_errno and the 'out' label, so presumably it sets
+ * op_errno before jumping on failure (otherwise ESTALE is returned).
+ */
+int
+client_pre_writev_v2(xlator_t *this, gfx_write_req *req, fd_t *fd, size_t size,
+                     off_t offset, int32_t flags, dict_t **xdata)
+{
+    int64_t remote_fd = -1;
+    int op_errno = ESTALE;
+
+    CLIENT_GET_REMOTE_FD(this, fd, FALLBACK_TO_ANON_FD, remote_fd, op_errno,
+                         out);
+
+    req->size = size;
+    req->offset = offset;
+    req->fd = remote_fd;
+    req->flag = flags;
+
+    memcpy(req->gfid, fd->inode->gfid, 16);
+
+#ifdef GF_TESTING_IO_XDATA
+    if (!*xdata)
+        *xdata = dict_new();
+
+    /*
+     * Best-effort test hook. The result used to be assigned to a 'ret' that
+     * this function never declares, which broke the build with the macro
+     * defined; it is now checked in place and skipped if dict_new() failed.
+     */
+    if (*xdata && dict_set_str(*xdata, "testing-the-xdata-key",
+                               "testing-the-xdata-value"))
+        gf_msg_debug(this->name, 0, "failed to set the testing xdata key");
+#endif
+
+    dict_to_xdr(*xdata, &req->xdata);
+
+    return 0;
+out:
+    return -op_errno;
+}
+
+/*
+ * Fill a v2 copy_file_range request. Remote fds for fd_in and fd_out are
+ * obtained through CLIENT_GET_REMOTE_FD (both with FALLBACK_TO_ANON_FD) and
+ * stored in req->fd_in / req->fd_out together with size, the two offsets
+ * and flags. req->gfid1 and req->gfid2 are copied from the inodes of fd_in
+ * and fd_out, and *xdata is encoded into req->xdata with dict_to_xdr().
+ *
+ * xdata is a dict_t ** and is dereferenced unconditionally.
+ *
+ * Returns 0 on the straight-line path and -op_errno through 'out'.
+ * NOTE(review): CLIENT_GET_REMOTE_FD is defined outside this view; it is
+ * given the remote fd variable, op_errno and the 'out' label, so presumably
+ * it sets op_errno before jumping on failure (otherwise ESTALE is returned).
+ */
+int
+client_pre_copy_file_range_v2(xlator_t *this, gfx_copy_file_range_req *req,
+                              fd_t *fd_in, off64_t off_in, fd_t *fd_out,
+                              off64_t off_out, size_t size, int32_t flags,
+                              dict_t **xdata)
+{
+    int64_t remote_fd_in = -1;
+    int64_t remote_fd_out = -1;
+    int op_errno = ESTALE;
+
+    CLIENT_GET_REMOTE_FD(this, fd_in, FALLBACK_TO_ANON_FD, remote_fd_in,
+                         op_errno, out);
+
+    CLIENT_GET_REMOTE_FD(this, fd_out, FALLBACK_TO_ANON_FD, remote_fd_out,
+                         op_errno, out);
+    req->size = size;
+    req->off_in = off_in;
+    req->off_out = off_out;
+    req->fd_in = remote_fd_in;
+    req->fd_out = remote_fd_out;
+    req->flag = flags;
+
+    memcpy(req->gfid1, fd_in->inode->gfid, 16);
+    memcpy(req->gfid2, fd_out->inode->gfid, 16);
+
+    dict_to_xdr(*xdata, &req->xdata);
+
+    return 0;
+out:
+    return -op_errno;
+}
+
+/*
+ * Fill a v2 statfs request from loc. With an inode present the gfid is
+ * copied from loc->inode->gfid when that is non-null, otherwise from
+ * loc->gfid. Without an inode only the last gfid byte is set to 1. xdata is
+ * encoded into req->xdata with dict_to_xdr().
+ *
+ * Returns 0 on the straight-line path and -op_errno through 'out'. op_errno
+ * starts as ESTALE, so a NULL loc yields -ESTALE.
+ * NOTE(review): GF_ASSERT_AND_GOTO_WITH_ERROR is defined outside this view;
+ * it is given 'out', op_errno and EINVAL for the null-gfid check, so that
+ * case presumably returns -EINVAL -- confirm.
+ */
+int
+client_pre_statfs_v2(xlator_t *this, gfx_statfs_req *req, loc_t *loc,
+                     dict_t *xdata)
+{
+    int op_errno = ESTALE;
+
+    if (!loc)
+        goto out;
+
+    if (loc->inode) {
+        if (!gf_uuid_is_null(loc->inode->gfid))
+            memcpy(req->gfid, loc->inode->gfid, 16);
+        else
+            memcpy(req->gfid, loc->gfid, 16);
+    } else {
+        /*
+         * Only byte 15 is written; assumes the caller zeroed req so the
+         * result is the ...01 gfid (presumably the root) -- TODO confirm.
+         */
+        req->gfid[15] = 1;
+    }
+
+    GF_ASSERT_AND_GOTO_WITH_ERROR(this->name,
+                                  !gf_uuid_is_null(*((uuid_t *)req->gfid)), out,
+                                  op_errno, EINVAL);
+
+    dict_to_xdr(xdata, &req->xdata);
+
+    return 0;
+out:
+    return -op_errno;
+}
+
+/*
+ * Fill a v2 flush request: req->fd is the remote fd obtained through
+ * CLIENT_GET_REMOTE_FD (invoked with DEFAULT_REMOTE_FD), the gfid is copied
+ * from fd->inode->gfid and xdata is encoded into req->xdata with
+ * dict_to_xdr().
+ *
+ * Returns 0 on the straight-line path and -op_errno through 'out'.
+ * NOTE(review): CLIENT_GET_REMOTE_FD is defined outside this view; it is
+ * given remote_fd, op_errno and the 'out' label, so presumably it sets
+ * op_errno before jumping on failure (otherwise ESTALE is returned).
+ */
+int
+client_pre_flush_v2(xlator_t *this, gfx_flush_req *req, fd_t *fd, dict_t *xdata)
+{
+    int64_t remote_fd = -1;
+    int op_errno = ESTALE;
+
+    CLIENT_GET_REMOTE_FD(this, fd, DEFAULT_REMOTE_FD, remote_fd, op_errno, out);
+
+    req->fd = remote_fd;
+    memcpy(req->gfid, fd->inode->gfid, 16);
+
+    dict_to_xdr(xdata, &req->xdata);
+
+    return 0;
+out:
+    return -op_errno;
+}
+
+/*
+ * Fill a v2 fsync request: req->fd is the remote fd obtained through
+ * CLIENT_GET_REMOTE_FD (invoked with FALLBACK_TO_ANON_FD), req->data is set
+ * from flags, the gfid is copied from fd->inode->gfid and xdata is encoded
+ * into req->xdata with dict_to_xdr().
+ *
+ * Returns 0 on the straight-line path and -op_errno through 'out'. op_errno
+ * starts as ESTALE like the other fd-based helpers, so the error path can
+ * never report 0 (success) even if the macro jumps without setting it.
+ * NOTE(review): CLIENT_GET_REMOTE_FD is defined outside this view; it is
+ * given remote_fd, op_errno and the 'out' label, so presumably it sets
+ * op_errno before jumping on failure -- confirm.
+ */
+int
+client_pre_fsync_v2(xlator_t *this, gfx_fsync_req *req, fd_t *fd, int32_t flags,
+                    dict_t *xdata)
+{
+    int64_t remote_fd = -1;
+    int op_errno = ESTALE;
+
+    CLIENT_GET_REMOTE_FD(this, fd, FALLBACK_TO_ANON_FD, remote_fd, op_errno,
+                         out);
+
+    req->fd = remote_fd;
+    req->data = flags;
+    memcpy(req->gfid, fd->inode->gfid, 16);
+
+    dict_to_xdr(xdata, &req->xdata);
+
+    return 0;
+out:
+    return -op_errno;
+}
+
+/*
+ * Fill a v2 setxattr request from loc. The gfid is copied from
+ * loc->inode->gfid when that is non-null, otherwise from loc->gfid. A
+ * non-NULL xattr is encoded into req->dict, req->flags is set from flags
+ * and xdata is encoded into req->xdata, both with dict_to_xdr().
+ *
+ * Returns 0 on the straight-line path and -op_errno through 'out'. op_errno
+ * starts as ESTALE, so a NULL loc or loc->inode yields -ESTALE.
+ * NOTE(review): GF_ASSERT_AND_GOTO_WITH_ERROR is defined outside this view;
+ * it is given 'out', op_errno and EINVAL for the null-gfid check, so that
+ * case presumably returns -EINVAL -- confirm.
+ */
+int
+client_pre_setxattr_v2(xlator_t *this, gfx_setxattr_req *req, loc_t *loc,
+                       dict_t *xattr, int32_t flags, dict_t *xdata)
+{
+    int op_errno = ESTALE;
+
+    if (!(loc && loc->inode))
+        goto out;
+
+    if (!gf_uuid_is_null(loc->inode->gfid))
+        memcpy(req->gfid, loc->inode->gfid, 16);
+    else
+        memcpy(req->gfid, loc->gfid, 16);
+
+    GF_ASSERT_AND_GOTO_WITH_ERROR(this->name,
+                                  !gf_uuid_is_null(*((uuid_t *)req->gfid)), out,
+                                  op_errno, EINVAL);
+    if (xattr) {
+        dict_to_xdr(xattr, &req->dict);
+    }
+
+    req->flags = flags;
+
+    dict_to_xdr(xdata, &req->xdata);
+
+    return 0;
+out:
+    return -op_errno;
+}
+
+/*
+ * Fill a v2 getxattr request from loc. Unlike most sibling helpers a
+ * missing loc->inode is tolerated: the gfid comes from loc->inode->gfid
+ * only when the inode exists and its gfid is non-null, otherwise from
+ * loc->gfid. req->name points at name (not copied); a NULL name is sent as
+ * "" with namelen 0, any other name with namelen 1. xdata is encoded into
+ * req->xdata with dict_to_xdr().
+ *
+ * Returns 0 on the straight-line path and -op_errno through 'out'; a NULL
+ * loc yields -EINVAL.
+ * NOTE(review): GF_ASSERT_AND_GOTO_WITH_ERROR is defined outside this view;
+ * it is given 'out', op_errno and EINVAL for the null-gfid check, so that
+ * case presumably returns -EINVAL -- confirm.
+ */
+int
+client_pre_getxattr_v2(xlator_t *this, gfx_getxattr_req *req, loc_t *loc,
+                       const char *name, dict_t *xdata)
+{
+    int op_errno = ESTALE;
+
+    if (!loc) {
+        op_errno = EINVAL;
+        goto out;
+    }
+
+    if (loc->inode && !gf_uuid_is_null(loc->inode->gfid))
+        memcpy(req->gfid, loc->inode->gfid, 16);
+    else
+        memcpy(req->gfid, loc->gfid, 16);
+
+    GF_ASSERT_AND_GOTO_WITH_ERROR(this->name,
+                                  !gf_uuid_is_null(*((uuid_t *)req->gfid)), out,
+                                  op_errno, EINVAL);
+    req->namelen = 1; /* Use it as a flag */
+
+    req->name = (char *)name;
+    if (!req->name) {
+        req->name = "";
+        req->namelen = 0;
+    }
+
+    dict_to_xdr(xdata, &req->xdata);
+
+    return 0;
+out:
+    return -op_errno;
+}
+
+/*
+ * Fill a v2 removexattr request from loc. The gfid is copied from
+ * loc->inode->gfid when that is non-null, otherwise from loc->gfid;
+ * req->name points at name (not copied, and not checked for NULL here) and
+ * xdata is encoded into req->xdata with dict_to_xdr().
+ *
+ * Returns 0 on the straight-line path and -op_errno through 'out'. op_errno
+ * starts as ESTALE, so a NULL loc or loc->inode yields -ESTALE.
+ * NOTE(review): GF_ASSERT_AND_GOTO_WITH_ERROR is defined outside this view;
+ * it is given 'out', op_errno and EINVAL for the null-gfid check, so that
+ * case presumably returns -EINVAL -- confirm.
+ */
+int
+client_pre_removexattr_v2(xlator_t *this, gfx_removexattr_req *req, loc_t *loc,
+                          const char *name, dict_t *xdata)
+{
+    int op_errno = ESTALE;
+
+    if (!(loc && loc->inode))
+        goto out;
+
+    if (!gf_uuid_is_null(loc->inode->gfid))
+        memcpy(req->gfid, loc->inode->gfid, 16);
+    else
+        memcpy(req->gfid, loc->gfid, 16);
+
+    GF_ASSERT_AND_GOTO_WITH_ERROR(this->name,
+                                  !gf_uuid_is_null(*((uuid_t *)req->gfid)), out,
+                                  op_errno, EINVAL);
+    req->name = (char *)name;
+
+    dict_to_xdr(xdata, &req->xdata);
+
+    return 0;
+out:
+    return -op_errno;
+}
+
+int /* Fill a v2 OPENDIR request; returns 0 or a negated errno. */
+client_pre_opendir_v2(xlator_t *this, gfx_opendir_req *req, loc_t *loc,
+                      fd_t *fd, dict_t *xdata) /* fd is not read in this body */
+{
+    int op_errno = ESTALE; /* returned (negated) if loc/loc->inode is NULL */
+
+    if (!(loc && loc->inode))
+        goto out;
+
+    if (!gf_uuid_is_null(loc->inode->gfid)) /* inode gfid takes precedence */
+        memcpy(req->gfid, loc->inode->gfid, 16);
+    else
+        memcpy(req->gfid, loc->gfid, 16); /* fall back to the gfid in loc */
+
+    GF_ASSERT_AND_GOTO_WITH_ERROR(this->name,
+                                  !gf_uuid_is_null(*((uuid_t *)req->gfid)), out,
+                                  op_errno, EINVAL); /* presumably EINVAL if gfid is still null - confirm macro */
+
+    dict_to_xdr(xdata, &req->xdata);
+
+    return 0;
+out:
+    return -op_errno;
+}
+
+int /* Fill a v2 FSYNCDIR request; returns 0 or a negated errno. */
+client_pre_fsyncdir_v2(xlator_t *this, gfx_fsyncdir_req *req, fd_t *fd,
+                       int32_t flags, dict_t *xdata)
+{
+    int32_t op_errno = ESTALE;
+    int64_t remote_fd = -1; /* presumably set by CLIENT_GET_REMOTE_FD below */
+
+    CLIENT_GET_REMOTE_FD(this, fd, DEFAULT_REMOTE_FD, remote_fd, op_errno, out);
+
+    req->fd = remote_fd;
+    req->data = flags; /* the flags travel in the request's data field */
+    memcpy(req->gfid, fd->inode->gfid, 16); /* gfid is sent with the fd */
+
+    dict_to_xdr(xdata, &req->xdata);
+
+    return 0;
+out:
+    return -op_errno;
+}
+
+int /* Fill a v2 ACCESS request; returns 0 or a negated errno. */
+client_pre_access_v2(xlator_t *this, gfx_access_req *req, loc_t *loc,
+                     int32_t mask, dict_t *xdata)
+{
+    int op_errno = ESTALE; /* returned (negated) if loc/loc->inode is NULL */
+
+    if (!(loc && loc->inode))
+        goto out;
+
+    if (!gf_uuid_is_null(loc->inode->gfid)) /* inode gfid takes precedence */
+        memcpy(req->gfid, loc->inode->gfid, 16);
+    else
+        memcpy(req->gfid, loc->gfid, 16); /* fall back to the gfid in loc */
+
+    GF_ASSERT_AND_GOTO_WITH_ERROR(this->name,
+                                  !gf_uuid_is_null(*((uuid_t *)req->gfid)), out,
+                                  op_errno, EINVAL); /* presumably EINVAL if gfid is still null - confirm macro */
+    req->mask = mask;
+
+    dict_to_xdr(xdata, &req->xdata);
+    return 0;
+out:
+    return -op_errno;
+}
+
+int /* Fill a v2 CREATE request; returns 0 or a negated errno. */
+client_pre_create_v2(xlator_t *this, gfx_create_req *req, loc_t *loc, fd_t *fd,
+                     mode_t mode, int32_t flags, mode_t umask, dict_t *xdata)
+{
+    int op_errno = ESTALE; /* returned (negated) if loc/loc->parent is NULL */
+
+    if (!(loc && loc->parent)) /* addressed by parent gfid + basename */
+        goto out;
+
+    if (!gf_uuid_is_null(loc->parent->gfid)) /* parent inode gfid preferred */
+        memcpy(req->pargfid, loc->parent->gfid, 16);
+    else
+        memcpy(req->pargfid, loc->pargfid, 16); /* fall back to loc->pargfid */
+
+    GF_ASSERT_AND_GOTO_WITH_ERROR(this->name,
+                                  !gf_uuid_is_null(*((uuid_t *)req->pargfid)),
+                                  out, op_errno, EINVAL); /* presumably EINVAL if pargfid is still null - confirm macro */
+    req->bname = (char *)loc->name; /* loc->name is not NULL-checked here */
+    req->mode = mode;
+    req->flags = gf_flags_from_flags(flags); /* flags go through a converter */
+    req->umask = umask;
+
+    dict_to_xdr(xdata, &req->xdata);
+
+    return 0;
+out:
+    return -op_errno;
+}
+
+int /* Fill a v2 FTRUNCATE request; returns 0 or a negated errno. */
+client_pre_ftruncate_v2(xlator_t *this, gfx_ftruncate_req *req, fd_t *fd,
+                        off_t offset, dict_t *xdata)
+{
+    int64_t remote_fd = -1; /* presumably set by CLIENT_GET_REMOTE_FD below */
+    int op_errno = EINVAL; /* note: fd-based siblings start from ESTALE */
+
+    CLIENT_GET_REMOTE_FD(this, fd, DEFAULT_REMOTE_FD, remote_fd, op_errno, out);
+
+    req->offset = offset;
+    req->fd = remote_fd;
+    memcpy(req->gfid, fd->inode->gfid, 16); /* gfid is sent with the fd */
+
+    dict_to_xdr(xdata, &req->xdata);
+    return 0;
+out:
+    return -op_errno;
+}
+
+int /* Fill a v2 FSTAT request; returns 0 or a negated errno. */
+client_pre_fstat_v2(xlator_t *this, gfx_fstat_req *req, fd_t *fd, dict_t *xdata)
+{
+    int64_t remote_fd = -1; /* presumably set by CLIENT_GET_REMOTE_FD below */
+    int op_errno = ESTALE;
+
+    CLIENT_GET_REMOTE_FD(this, fd, DEFAULT_REMOTE_FD, remote_fd, op_errno, out);
+
+    req->fd = remote_fd;
+    memcpy(req->gfid, fd->inode->gfid, 16); /* gfid is sent with the fd */
+
+    dict_to_xdr(xdata, &req->xdata);
+
+    return 0;
+out:
+    return -op_errno;
+}
+
+int /* Fill a v2 LK request; returns 0 or a negated errno. */
+client_pre_lk_v2(xlator_t *this, gfx_lk_req *req, int32_t cmd,
+                 struct gf_flock *flock, fd_t *fd, dict_t *xdata)
+{
+    int64_t remote_fd = -1; /* presumably set by CLIENT_GET_REMOTE_FD below */
+    int op_errno = ESTALE;
+    int32_t gf_cmd = 0;
+    int32_t gf_type = 0; /* stays 0 when l_type matches no case below */
+    int ret = 0;
+
+    CLIENT_GET_REMOTE_FD(this, fd, DEFAULT_REMOTE_FD, remote_fd, op_errno, out);
+
+    ret = client_cmd_to_gf_cmd(cmd, &gf_cmd); /* non-zero ret means failure */
+    if (ret) {
+        op_errno = EINVAL;
+        gf_smsg(this->name, GF_LOG_WARNING, EINVAL, PC_MSG_UNKNOWN_CMD,
+                "gf_cmd=%d", gf_cmd, NULL); /* logs gf_cmd, not the input cmd */
+        goto out;
+    }
+
+    switch (flock->l_type) { /* no default: other values leave gf_type 0 */
+        case F_RDLCK:
+            gf_type = GF_LK_F_RDLCK;
+            break;
+        case F_WRLCK:
+            gf_type = GF_LK_F_WRLCK;
+            break;
+        case F_UNLCK:
+            gf_type = GF_LK_F_UNLCK;
+            break;
+    }
+
+    req->fd = remote_fd;
+    req->cmd = gf_cmd;
+    req->type = gf_type;
+    gf_proto_flock_from_flock(&req->flock, flock);
+
+    memcpy(req->gfid, fd->inode->gfid, 16); /* gfid is sent with the fd */
+
+    dict_to_xdr(xdata, &req->xdata);
+
+    return 0;
+out:
+    return -op_errno;
+}
+
+int /* Fill a v2 LOOKUP request; returns 0 or -ESTALE. */
+client_pre_lookup_v2(xlator_t *this, gfx_lookup_req *req, loc_t *loc,
+                     dict_t *xdata)
+{
+    int op_errno = ESTALE; /* the only error this function reports */
+
+    if (!(loc && loc->inode))
+        goto out;
+
+    if ((loc->parent) && (!gf_uuid_is_null(loc->parent->gfid)))
+        memcpy(req->pargfid, loc->parent->gfid, 16);
+    else
+        memcpy(req->pargfid, loc->pargfid, 16); /* no parent / null gfid */
+
+    if ((loc->inode) && (!gf_uuid_is_null(loc->inode->gfid)))
+        memcpy(req->gfid, loc->inode->gfid, 16);
+    else
+        memcpy(req->gfid, loc->gfid, 16); /* may be null: not asserted here */
+
+    if (loc->name)
+        req->bname = (char *)loc->name;
+    else
+        req->bname = ""; /* a missing name is sent as an empty string */
+
+    if (xdata) { /* unlike the sibling functions, xdata is NULL-checked here */
+        dict_to_xdr(xdata, &req->xdata);
+    }
+    return 0;
+out:
+    return -op_errno;
+}
+
+int /* Fill a v2 READDIR request; returns 0 or a negated errno. */
+client_pre_readdir_v2(xlator_t *this, gfx_readdir_req *req, fd_t *fd,
+                      size_t size, off_t offset, dict_t *xdata)
+{
+    int64_t remote_fd = -1; /* presumably set by CLIENT_GET_REMOTE_FD below */
+    int op_errno = ESTALE;
+
+    CLIENT_GET_REMOTE_FD(this, fd, DEFAULT_REMOTE_FD, remote_fd, op_errno, out);
+
+    req->size = size;
+    req->offset = offset;
+    req->fd = remote_fd;
+
+    memcpy(req->gfid, fd->inode->gfid, 16); /* gfid is sent with the fd */
+    dict_to_xdr(xdata, &req->xdata);
+
+    return 0;
+out:
+    return -op_errno;
+}
+
+int /* Fill a v2 INODELK request; returns 0 or a negated errno. */
+client_pre_inodelk_v2(xlator_t *this, gfx_inodelk_req *req, loc_t *loc, int cmd,
+                      struct gf_flock *flock, const char *volume, dict_t *xdata)
+{
+    int op_errno = ESTALE; /* returned (negated) if loc/loc->inode is NULL */
+    int32_t gf_cmd = 0;
+    int32_t gf_type = 0; /* stays 0 when l_type matches no case below */
+
+    if (!(loc && loc->inode))
+        goto out;
+
+    if (!gf_uuid_is_null(loc->gfid)) /* loc->gfid is preferred here */
+        memcpy(req->gfid, loc->gfid, 16);
+    else
+        memcpy(req->gfid, loc->inode->gfid, 16); /* inode gfid is fallback */
+
+    GF_ASSERT_AND_GOTO_WITH_ERROR(this->name,
+                                  !gf_uuid_is_null(*((uuid_t *)req->gfid)), out,
+                                  op_errno, EINVAL); /* presumably EINVAL if gfid is still null - confirm macro */
+    if (cmd == F_GETLK || cmd == F_GETLK64)
+        gf_cmd = GF_LK_GETLK;
+    else if (cmd == F_SETLK || cmd == F_SETLK64)
+        gf_cmd = GF_LK_SETLK;
+    else if (cmd == F_SETLKW || cmd == F_SETLKW64)
+        gf_cmd = GF_LK_SETLKW;
+    else {
+        gf_smsg(this->name, GF_LOG_WARNING, EINVAL, PC_MSG_UNKNOWN_CMD,
+                "cmd=%d", cmd, NULL); /* gf_cmd is still 0 here; log input */
+        op_errno = EINVAL;
+        goto out;
+    }
+
+    switch (flock->l_type) { /* no default: other values leave gf_type 0 */
+        case F_RDLCK:
+            gf_type = GF_LK_F_RDLCK;
+            break;
+        case F_WRLCK:
+            gf_type = GF_LK_F_WRLCK;
+            break;
+        case F_UNLCK:
+            gf_type = GF_LK_F_UNLCK;
+            break;
+    }
+
+    req->volume = (char *)volume;
+    req->cmd = gf_cmd;
+    req->type = gf_type;
+    gf_proto_flock_from_flock(&req->flock, flock);
+
+    dict_to_xdr(xdata, &req->xdata);
+
+    return 0;
+out:
+    return -op_errno;
+}
+
+int /* Fill a v2 FINODELK request; returns 0 or a negated errno. */
+client_pre_finodelk_v2(xlator_t *this, gfx_finodelk_req *req, fd_t *fd, int cmd,
+                       struct gf_flock *flock, const char *volume,
+                       dict_t *xdata)
+{
+    int op_errno = ESTALE;
+    int64_t remote_fd = -1; /* presumably set by CLIENT_GET_REMOTE_FD below */
+    int32_t gf_type = 0; /* stays 0 when l_type matches no case below */
+    int32_t gf_cmd = 0;
+
+    CLIENT_GET_REMOTE_FD(this, fd, FALLBACK_TO_ANON_FD, remote_fd, op_errno,
+                         out);
+
+    if (cmd == F_GETLK || cmd == F_GETLK64)
+        gf_cmd = GF_LK_GETLK;
+    else if (cmd == F_SETLK || cmd == F_SETLK64)
+        gf_cmd = GF_LK_SETLK;
+    else if (cmd == F_SETLKW || cmd == F_SETLKW64)
+        gf_cmd = GF_LK_SETLKW;
+    else {
+        gf_smsg(this->name, GF_LOG_WARNING, EINVAL, PC_MSG_UNKNOWN_CMD,
+                "cmd=%d", cmd, NULL); /* gf_cmd is still 0 here; log input */
+        return -EINVAL; /* unknown cmd is a caller error, not ESTALE */
+    }
+
+    switch (flock->l_type) { /* no default: other values leave gf_type 0 */
+        case F_RDLCK:
+            gf_type = GF_LK_F_RDLCK;
+            break;
+        case F_WRLCK:
+            gf_type = GF_LK_F_WRLCK;
+            break;
+        case F_UNLCK:
+            gf_type = GF_LK_F_UNLCK;
+            break;
+    }
+
+    req->volume = (char *)volume;
+    req->fd = remote_fd;
+    req->cmd = gf_cmd;
+    req->type = gf_type;
+    gf_proto_flock_from_flock(&req->flock, flock);
+    memcpy(req->gfid, fd->inode->gfid, 16); /* gfid is sent with the fd */
+
+    dict_to_xdr(xdata, &req->xdata);
+    return 0;
+out:
+    return -op_errno;
+}
+
+int /* Fill a v2 ENTRYLK request; returns 0 or a negated errno. */
+client_pre_entrylk_v2(xlator_t *this, gfx_entrylk_req *req, loc_t *loc,
+                      entrylk_cmd cmd_entrylk, entrylk_type type,
+                      const char *volume, const char *basename, dict_t *xdata)
+{
+    int op_errno = ESTALE; /* returned (negated) if loc/loc->inode is NULL */
+
+    if (!(loc && loc->inode))
+        goto out;
+
+    if (!gf_uuid_is_null(loc->gfid)) /* loc->gfid is preferred here */
+        memcpy(req->gfid, loc->gfid, 16);
+    else
+        memcpy(req->gfid, loc->inode->gfid, 16); /* inode gfid is fallback */
+
+    GF_ASSERT_AND_GOTO_WITH_ERROR(this->name,
+                                  !gf_uuid_is_null(*((uuid_t *)req->gfid)), out,
+                                  op_errno, EINVAL); /* presumably EINVAL if gfid is still null - confirm macro */
+    req->cmd = cmd_entrylk;
+    req->type = type;
+    req->volume = (char *)volume;
+    req->name = ""; /* default when no basename is supplied */
+    if (basename) {
+        req->name = (char *)basename;
+        req->namelen = 1; /* only written when a basename is supplied */
+    }
+
+    dict_to_xdr(xdata, &req->xdata);
+
+    return 0;
+out:
+    return -op_errno;
+}
+
+int /* Fill a v2 FENTRYLK request; returns 0 or a negated errno. */
+client_pre_fentrylk_v2(xlator_t *this, gfx_fentrylk_req *req, fd_t *fd,
+                       entrylk_cmd cmd_entrylk, entrylk_type type,
+                       const char *volume, const char *basename, dict_t *xdata)
+{
+    int64_t remote_fd = -1; /* presumably set by CLIENT_GET_REMOTE_FD below */
+    int op_errno = ESTALE;
+
+    CLIENT_GET_REMOTE_FD(this, fd, DEFAULT_REMOTE_FD, remote_fd, op_errno, out);
+
+    req->fd = remote_fd;
+    req->cmd = cmd_entrylk;
+    req->type = type;
+    req->volume = (char *)volume;
+    req->name = ""; /* default when no basename is supplied */
+    if (basename) {
+        req->name = (char *)basename;
+        req->namelen = 1; /* only written when a basename is supplied */
+    }
+    memcpy(req->gfid, fd->inode->gfid, 16); /* gfid is sent with the fd */
+
+    dict_to_xdr(xdata, &req->xdata);
+
+    return 0;
+out:
+    return -op_errno;
+}
+
+int /* Fill a v2 XATTROP request; returns 0 or a negated errno. */
+client_pre_xattrop_v2(xlator_t *this, gfx_xattrop_req *req, loc_t *loc,
+                      dict_t *xattr, int32_t flags, dict_t *xdata)
+{
+    int op_errno = ESTALE; /* returned (negated) if loc/loc->inode is NULL */
+
+    if (!(loc && loc->inode))
+        goto out;
+
+    if (!gf_uuid_is_null(loc->inode->gfid)) /* inode gfid takes precedence */
+        memcpy(req->gfid, loc->inode->gfid, 16);
+    else
+        memcpy(req->gfid, loc->gfid, 16); /* fall back to the gfid in loc */
+
+    GF_ASSERT_AND_GOTO_WITH_ERROR(this->name,
+                                  !gf_uuid_is_null(*((uuid_t *)req->gfid)), out,
+                                  op_errno, EINVAL); /* presumably EINVAL if gfid is still null - confirm macro */
+    dict_to_xdr(xattr, &req->dict); /* xattr is passed without a NULL check */
+
+    req->flags = flags;
+
+    dict_to_xdr(xdata, &req->xdata);
+
+    return 0;
+out:
+    return -op_errno;
+}
+
+int /* Fill a v2 FXATTROP request; returns 0 or a negated errno. */
+client_pre_fxattrop_v2(xlator_t *this, gfx_fxattrop_req *req, fd_t *fd,
+                       dict_t *xattr, int32_t flags, dict_t *xdata)
+{
+    int op_errno = ESTALE;
+    int64_t remote_fd = -1; /* presumably set by CLIENT_GET_REMOTE_FD below */
+
+    CLIENT_GET_REMOTE_FD(this, fd, FALLBACK_TO_ANON_FD, remote_fd, op_errno,
+                         out); /* uses FALLBACK_TO_ANON_FD, not the default */
+
+    req->fd = remote_fd;
+    req->flags = flags;
+    memcpy(req->gfid, fd->inode->gfid, 16); /* gfid is sent with the fd */
+
+    dict_to_xdr(xattr, &req->dict); /* xattr is passed without a NULL check */
+
+    dict_to_xdr(xdata, &req->xdata);
+
+    return 0;
+out:
+    return -op_errno;
+}
+
+int /* Fill a v2 FGETXATTR request; returns 0 or a negated errno. */
+client_pre_fgetxattr_v2(xlator_t *this, gfx_fgetxattr_req *req, fd_t *fd,
+                        const char *name, dict_t *xdata)
+{
+    int64_t remote_fd = -1; /* presumably set by CLIENT_GET_REMOTE_FD below */
+    int op_errno = ESTALE;
+
+    CLIENT_GET_REMOTE_FD(this, fd, DEFAULT_REMOTE_FD, remote_fd, op_errno, out);
+
+    req->namelen = 1; /* Use it as a flag */
+    req->fd = remote_fd;
+    req->name = (char *)name;
+    if (!req->name) { /* NULL name is sent as "" with the flag cleared */
+        req->name = "";
+        req->namelen = 0;
+    }
+    memcpy(req->gfid, fd->inode->gfid, 16); /* gfid is sent with the fd */
+
+    dict_to_xdr(xdata, &req->xdata);
+
+    return 0;
+out:
+    return -op_errno;
+}
+
+int /* Fill a v2 FSETXATTR request; returns 0 or a negated errno. */
+client_pre_fsetxattr_v2(xlator_t *this, gfx_fsetxattr_req *req, fd_t *fd,
+                        int32_t flags, dict_t *xattr, dict_t *xdata)
+{
+    int64_t remote_fd = -1; /* presumably set by CLIENT_GET_REMOTE_FD below */
+    int op_errno = ESTALE;
+
+    CLIENT_GET_REMOTE_FD(this, fd, DEFAULT_REMOTE_FD, remote_fd, op_errno, out);
+
+    req->fd = remote_fd;
+    req->flags = flags;
+    memcpy(req->gfid, fd->inode->gfid, 16); /* gfid is sent with the fd */
+
+    if (xattr) { /* req->dict is only serialized when xattr is given */
+        dict_to_xdr(xattr, &req->dict);
+    }
+
+    dict_to_xdr(xdata, &req->xdata);
+
+    return 0;
+out:
+    return -op_errno;
+}
+
+int /* Fill a v2 RCHECKSUM request; returns 0 or a negated errno. */
+client_pre_rchecksum_v2(xlator_t *this, gfx_rchecksum_req *req, fd_t *fd,
+                        int32_t len, off_t offset, dict_t *xdata)
+{
+    int64_t remote_fd = -1; /* presumably set by CLIENT_GET_REMOTE_FD below */
+    int op_errno = ESTALE;
+
+    CLIENT_GET_REMOTE_FD(this, fd, DEFAULT_REMOTE_FD, remote_fd, op_errno, out);
+
+    req->len = len;
+    req->offset = offset;
+    req->fd = remote_fd; /* NOTE(review): no gfid is copied here - confirm */
+
+    dict_to_xdr(xdata, &req->xdata);
+
+    return 0;
+out:
+    return -op_errno;
+}
+
+int /* Fill a v2 SETATTR request; returns 0 or a negated errno. */
+client_pre_setattr_v2(xlator_t *this, gfx_setattr_req *req, loc_t *loc,
+                      int32_t valid, struct iatt *stbuf, dict_t *xdata)
+{
+    int op_errno = ESTALE;
+
+    if (!(loc && loc->inode))
+        return -op_errno; /* -ESTALE, returned directly instead of via out */
+
+    if (!gf_uuid_is_null(loc->inode->gfid)) /* inode gfid takes precedence */
+        memcpy(req->gfid, loc->inode->gfid, 16);
+    else
+        memcpy(req->gfid, loc->gfid, 16); /* fall back to the gfid in loc */
+
+    GF_ASSERT_AND_GOTO_WITH_ERROR(this->name,
+                                  !gf_uuid_is_null(*((uuid_t *)req->gfid)), out,
+                                  op_errno, EINVAL); /* presumably EINVAL if gfid is still null - confirm macro */
+
+    req->valid = valid;
+    gfx_stat_from_iattx(&req->stbuf, stbuf); /* iatt -> wire stat form */
+
+    dict_to_xdr(xdata, &req->xdata);
+
+    return 0;
+out:
+    return -op_errno;
+}
+
+int /* Fill a v2 FSETATTR request; returns 0 or a negated errno. */
+client_pre_fsetattr_v2(xlator_t *this, gfx_fsetattr_req *req, fd_t *fd,
+                       int32_t valid, struct iatt *stbuf, dict_t *xdata)
+{
+    int op_errno = ESTALE;
+    int64_t remote_fd = -1; /* presumably set by CLIENT_GET_REMOTE_FD below */
+
+    CLIENT_GET_REMOTE_FD(this, fd, DEFAULT_REMOTE_FD, remote_fd, op_errno, out);
+
+    memcpy(req->gfid, fd->inode->gfid, 16); /* gfid is sent with the fd */
+    req->fd = remote_fd;
+    req->valid = valid;
+    gfx_stat_from_iattx(&req->stbuf, stbuf); /* iatt -> wire stat form */
+
+    dict_to_xdr(xdata, &req->xdata);
+    return 0;
+out:
+    return -op_errno;
+}
+
+int /* Fill a v2 READDIRP request; returns 0 or a negated errno. */
+client_pre_readdirp_v2(xlator_t *this, gfx_readdirp_req *req, fd_t *fd,
+                       size_t size, off_t offset, dict_t *xdata)
+{
+    int op_errno = ESTALE;
+    int64_t remote_fd = -1; /* presumably set by CLIENT_GET_REMOTE_FD below */
+
+    CLIENT_GET_REMOTE_FD(this, fd, DEFAULT_REMOTE_FD, remote_fd, op_errno, out);
+
+    req->size = size;
+    req->offset = offset;
+    req->fd = remote_fd;
+    memcpy(req->gfid, fd->inode->gfid, 16); /* gfid is sent with the fd */
+
+    /* dict itself is 'xdata' here */
+    dict_to_xdr(xdata, &req->xdata);
+
+    return 0;
+out:
+    return -op_errno;
+}
+
+int /* Fill a v2 FREMOVEXATTR request; returns 0 or a negated errno. */
+client_pre_fremovexattr_v2(xlator_t *this, gfx_fremovexattr_req *req, fd_t *fd,
+                           const char *name, dict_t *xdata)
+{
+    int64_t remote_fd = -1; /* presumably set by CLIENT_GET_REMOTE_FD below */
+    int op_errno = ESTALE; /* returned (negated) if fd/fd->inode is NULL */
+
+    if (!(fd && fd->inode)) /* explicit fd guard before the macro */
+        goto out;
+
+    CLIENT_GET_REMOTE_FD(this, fd, DEFAULT_REMOTE_FD, remote_fd, op_errno, out);
+
+    memcpy(req->gfid, fd->inode->gfid, 16); /* gfid is sent with the fd */
+    req->name = (char *)name; /* stored as-is; name is not NULL-checked */
+    req->fd = remote_fd;
+
+    dict_to_xdr(xdata, &req->xdata);
+
+    return 0;
+out:
+    return -op_errno;
+}
+
+int /* Fill a v2 FALLOCATE request; returns 0 or a negated errno. */
+client_pre_fallocate_v2(xlator_t *this, gfx_fallocate_req *req, fd_t *fd,
+                        int32_t flags, off_t offset, size_t size, dict_t *xdata)
+{
+    int op_errno = ESTALE;
+    int64_t remote_fd = -1; /* presumably set by CLIENT_GET_REMOTE_FD below */
+
+    CLIENT_GET_REMOTE_FD(this, fd, DEFAULT_REMOTE_FD, remote_fd, op_errno, out);
+
+    req->fd = remote_fd;
+    req->flags = flags;
+    req->offset = offset;
+    req->size = size;
+    memcpy(req->gfid, fd->inode->gfid, 16); /* gfid is sent with the fd */
+
+    dict_to_xdr(xdata, &req->xdata);
+    return 0;
+out:
+    return -op_errno;
+}
+
+int /* Fill a v2 DISCARD request; returns 0 or a negated errno. */
+client_pre_discard_v2(xlator_t *this, gfx_discard_req *req, fd_t *fd,
+                      off_t offset, size_t size, dict_t *xdata)
+{
+    int op_errno = ESTALE;
+    int64_t remote_fd = -1; /* presumably set by CLIENT_GET_REMOTE_FD below */
+
+    CLIENT_GET_REMOTE_FD(this, fd, DEFAULT_REMOTE_FD, remote_fd, op_errno, out);
+
+    req->fd = remote_fd;
+    req->offset = offset;
+    req->size = size;
+    memcpy(req->gfid, fd->inode->gfid, 16); /* gfid is sent with the fd */
+
+    dict_to_xdr(xdata, &req->xdata);
+    return 0;
+out:
+    return -op_errno;
+}
+
+int /* Fill a v2 ZEROFILL request; returns 0 or a negated errno. */
+client_pre_zerofill_v2(xlator_t *this, gfx_zerofill_req *req, fd_t *fd,
+                       off_t offset, size_t size, dict_t *xdata)
+{
+    int op_errno = ESTALE;
+    int64_t remote_fd = -1; /* presumably set by CLIENT_GET_REMOTE_FD below */
+
+    CLIENT_GET_REMOTE_FD(this, fd, DEFAULT_REMOTE_FD, remote_fd, op_errno, out);
+
+    req->fd = remote_fd;
+    req->offset = offset;
+    req->size = size;
+    memcpy(req->gfid, fd->inode->gfid, 16); /* gfid is sent with the fd */
+
+    dict_to_xdr(xdata, &req->xdata);
+    return 0;
+out:
+    return -op_errno;
+}
+
+int /* Fill a v2 IPC request; always returns 0. */
+client_pre_ipc_v2(xlator_t *this, gfx_ipc_req *req, int32_t cmd, dict_t *xdata)
+{
+    req->op = cmd; /* the command is carried in the request's op field */
+
+    dict_to_xdr(xdata, &req->xdata); /* its result is not checked */
+    return 0;
+}
+
+int /* Fill a v2 SEEK request; returns 0 or a negated errno. */
+client_pre_seek_v2(xlator_t *this, gfx_seek_req *req, fd_t *fd, off_t offset,
+                   gf_seek_what_t what, dict_t *xdata)
+{
+    int64_t remote_fd = -1; /* presumably set by CLIENT_GET_REMOTE_FD below */
+    int op_errno = ESTALE;
+
+    CLIENT_GET_REMOTE_FD(this, fd, DEFAULT_REMOTE_FD, remote_fd, op_errno, out);
+
+    memcpy(req->gfid, fd->inode->gfid, 16); /* gfid is sent with the fd */
+    req->fd = remote_fd;
+    req->offset = offset;
+    req->what = what;
+
+    dict_to_xdr(xdata, &req->xdata);
+
+    return 0;
+out:
+    return -op_errno;
+}
+
+int /* Fill a v2 LEASE request; returns 0 or a negated errno. */
+client_pre_lease_v2(xlator_t *this, gfx_lease_req *req, loc_t *loc,
+                    struct gf_lease *lease, dict_t *xdata)
+{
+    int op_errno = 0; /* stays 0 on success: the success path falls into out */
+
+    if (!(loc && loc->inode))
+        return -ESTALE; /* fail here; falling into out would return 0 */
+
+    if (!gf_uuid_is_null(loc->inode->gfid)) /* inode gfid takes precedence */
+        memcpy(req->gfid, loc->inode->gfid, 16);
+    else
+        memcpy(req->gfid, loc->gfid, 16); /* fall back to the gfid in loc */
+
+    GF_ASSERT_AND_GOTO_WITH_ERROR(this->name,
+                                  !gf_uuid_is_null(*((uuid_t *)req->gfid)), out,
+                                  op_errno, EINVAL); /* presumably EINVAL if gfid is still null - confirm macro */
+
+    gf_proto_lease_from_lease(&req->lease, lease);
+
+    dict_to_xdr(xdata, &req->xdata);
+out:
+    return -op_errno;
+}
+
+int /* Fill a v2 PUT request; returns 0 or a negated errno. */
+client_pre_put_v2(xlator_t *this, gfx_put_req *req, loc_t *loc, mode_t mode,
+                  mode_t umask, int32_t flags, size_t size, off_t offset,
+                  dict_t *xattr, dict_t *xdata)
+{
+    int op_errno = ESTALE; /* returned (negated) if loc/loc->parent is NULL */
+
+    if (!(loc && loc->parent)) /* addressed by parent gfid + basename */
+        goto out;
+
+    if (!gf_uuid_is_null(loc->parent->gfid)) /* parent inode gfid preferred */
+        memcpy(req->pargfid, loc->parent->gfid, 16);
+    else
+        memcpy(req->pargfid, loc->pargfid, 16); /* fall back to loc->pargfid */
+
+    GF_ASSERT_AND_GOTO_WITH_ERROR(this->name,
+                                  !gf_uuid_is_null(*((uuid_t *)req->pargfid)),
+                                  out, op_errno, EINVAL); /* presumably EINVAL if pargfid is still null - confirm macro */
+    req->bname = (char *)loc->name; /* loc->name is not NULL-checked here */
+    req->mode = mode;
+    req->umask = umask;
+    req->flag = gf_flags_from_flags(flags); /* flags go through a converter */
+    req->size = size;
+    req->offset = offset;
+
+    if (xattr) /* req->xattr is only serialized when xattr is given */
+        dict_to_xdr(xattr, &req->xattr);
+
+    dict_to_xdr(xdata, &req->xdata);
+
+    return 0;
+out:
+    return -op_errno;
+}
+
+int /* Unpack a v2 CREATE reply; returns the result of xdr_to_dict(). */
+client_post_create_v2(xlator_t *this, gfx_create_rsp *rsp, struct iatt *stbuf,
+                      struct iatt *preparent, struct iatt *postparent,
+                      clnt_local_t *local, dict_t **xdata)
+{
+    if (-1 != rsp->op_ret) { /* stats are converted unless op_ret is -1 */
+        gfx_stat_to_iattx(&rsp->stat, stbuf);
+
+        gfx_stat_to_iattx(&rsp->preparent, preparent);
+        gfx_stat_to_iattx(&rsp->postparent, postparent);
+        gf_uuid_copy(local->loc.gfid, stbuf->ia_gfid); /* record new gfid */
+    }
+    return xdr_to_dict(&rsp->xdata, xdata); /* decoded regardless of op_ret */
+}
+
+int /* Unpack a v2 LEASE reply; returns the result of xdr_to_dict(). */
+client_post_lease_v2(xlator_t *this, gfx_lease_rsp *rsp, struct gf_lease *lease,
+                     dict_t **xdata)
+{
+    if (rsp->op_ret >= 0) { /* lease is converted only when op_ret >= 0 */
+        gf_proto_lease_to_lease(&rsp->lease, lease);
+    }
+
+    return xdr_to_dict(&rsp->xdata, xdata); /* decoded regardless of op_ret */
+}
+
+int /* Unpack a v2 LK reply; returns the result of xdr_to_dict(). */
+client_post_lk_v2(xlator_t *this, gfx_lk_rsp *rsp, struct gf_flock *lock,
+                  dict_t **xdata)
+{
+    if (rsp->op_ret >= 0) { /* flock is converted only when op_ret >= 0 */
+        gf_proto_flock_to_flock(&rsp->flock, lock);
+    }
+    return xdr_to_dict(&rsp->xdata, xdata); /* decoded regardless of op_ret */
+}
+
+int /* Unpack a v2 READDIR reply; returns the result of xdr_to_dict(). */
+client_post_readdir_v2(xlator_t *this, gfx_readdir_rsp *rsp,
+                       gf_dirent_t *entries, dict_t **xdata)
+{
+    if (rsp->op_ret > 0) { /* strictly positive: 0 skips the unserialize */
+        unserialize_rsp_dirent_v2(this, rsp, entries);
+    }
+    return xdr_to_dict(&rsp->xdata, xdata); /* decoded regardless of op_ret */
+}
+
+int /* Unpack a v2 READDIRP reply; returns the result of xdr_to_dict(). */
+client_post_readdirp_v2(xlator_t *this, gfx_readdirp_rsp *rsp, fd_t *fd,
+                        gf_dirent_t *entries, dict_t **xdata)
+{
+    if (rsp->op_ret > 0) { /* strictly positive: 0 skips the unserialize */
+        unserialize_rsp_direntp_v2(this, fd, rsp, entries);
+    }
+    return xdr_to_dict(&rsp->xdata, xdata); /* decoded regardless of op_ret */
+}
+
+int /* Unpack a v2 RENAME reply; returns the result of xdr_to_dict(). */
+client_post_rename_v2(xlator_t *this, gfx_rename_rsp *rsp, struct iatt *stbuf,
+                      struct iatt *preoldparent, struct iatt *postoldparent,
+                      struct iatt *prenewparent, struct iatt *postnewparent,
+                      dict_t **xdata)
+{
+    if (-1 != rsp->op_ret) { /* stats are converted unless op_ret is -1 */
+        gfx_stat_to_iattx(&rsp->stat, stbuf);
+
+        gfx_stat_to_iattx(&rsp->preoldparent, preoldparent);
+        gfx_stat_to_iattx(&rsp->postoldparent, postoldparent);
+
+        gfx_stat_to_iattx(&rsp->prenewparent, prenewparent);
+        gfx_stat_to_iattx(&rsp->postnewparent, postnewparent);
+    }
+
+    return xdr_to_dict(&rsp->xdata, xdata); /* decoded regardless of op_ret */
+}
diff --git a/xlators/protocol/client/src/client-common.h b/xlators/protocol/client/src/client-common.h
new file mode 100644
index 00000000000..a2043d8742a
--- /dev/null
+++ b/xlators/protocol/client/src/client-common.h
@@ -0,0 +1,630 @@
+/*
+  Copyright (c) 2016 Red Hat, Inc. <http://www.redhat.com>
+  This file is part of GlusterFS.
+
+  This file is licensed to you under your choice of the GNU Lesser
+  General Public License, version 3 or any later version (LGPLv3 or
+  later), or the GNU General Public License, version 2 (GPLv2), in all
+  cases as published by the Free Software Foundation.
+*/
+
+#ifndef __CLIENT_COMMON_H__
+#define __CLIENT_COMMON_H__
+
+#include <glusterfs/dict.h>
+#include <glusterfs/xlator.h>
+#include "rpc-common-xdr.h"
+#include "glusterfs3-xdr.h"
+#include "glusterfs4-xdr.h"
+#include "glusterfs3.h"
+#include "client.h"
+
+int
+client_pre_stat(xlator_t *this, gfs3_stat_req *req, loc_t *loc, dict_t *xdata);
+
+int
+client_pre_readlink(xlator_t *this, gfs3_readlink_req *req, loc_t *loc,
+                    size_t size, dict_t *xdata);
+
+int
+client_pre_mknod(xlator_t *this, gfs3_mknod_req *req, loc_t *loc, mode_t mode,
+                 dev_t rdev, mode_t umask, dict_t *xdata);
+
+int
+client_pre_mkdir(xlator_t *this, gfs3_mkdir_req *req, loc_t *loc, mode_t mode,
+                 mode_t umask, dict_t *xdata);
+
+int
+client_pre_unlink(xlator_t *this, gfs3_unlink_req *req, loc_t *loc,
+                  int32_t flags, dict_t *xdata);
+
+int
+client_pre_rmdir(xlator_t *this, gfs3_rmdir_req *req, loc_t *loc, int32_t flags,
+                 dict_t *xdata);
+
+int
+client_pre_symlink(xlator_t *this, gfs3_symlink_req *req, loc_t *loc,
+                   const char *linkname, mode_t umask, dict_t *xdata);
+
+int
+client_pre_rename(xlator_t *this, gfs3_rename_req *req, loc_t *oldloc,
+                  loc_t *newloc, dict_t *xdata);
+
+int
+client_pre_link(xlator_t *this, gfs3_link_req *req, loc_t *oldloc,
+                loc_t *newloc, dict_t *xdata);
+
+int
+client_pre_truncate(xlator_t *this, gfs3_truncate_req *req, loc_t *loc,
+                    off_t offset, dict_t *xdata);
+
+int
+client_pre_open(xlator_t *this, gfs3_open_req *req, loc_t *loc, fd_t *fd,
+                int32_t flags, dict_t *xdata);
+
+int
+client_pre_readv(xlator_t *this, gfs3_read_req *req, fd_t *fd, size_t size,
+                 off_t offset, int32_t flags, dict_t *xdata);
+
+int
+client_pre_writev(xlator_t *this, gfs3_write_req *req, fd_t *fd, size_t size,
+                  off_t offset, int32_t flags, dict_t **xdata);
+
+int
+client_pre_statfs(xlator_t *this, gfs3_statfs_req *req, loc_t *loc,
+                  dict_t *xdata);
+
+int
+client_pre_flush(xlator_t *this, gfs3_flush_req *req, fd_t *fd, dict_t *xdata);
+
+int
+client_pre_fsync(xlator_t *this, gfs3_fsync_req *req, fd_t *fd, int32_t flags,
+                 dict_t *xdata);
+
+int
+client_pre_setxattr(xlator_t *this, gfs3_setxattr_req *req, loc_t *loc,
+                    dict_t *xattr, int32_t flags, dict_t *xdata);
+
+int
+client_pre_getxattr(xlator_t *this, gfs3_getxattr_req *req, loc_t *loc,
+                    const char *name, dict_t *xdata);
+
+int
+client_pre_removexattr(xlator_t *this, gfs3_removexattr_req *req, loc_t *loc,
+                       const char *name, dict_t *xdata);
+
+int
+client_pre_opendir(xlator_t *this, gfs3_opendir_req *req, loc_t *loc, fd_t *fd,
+                   dict_t *xdata);
+
+int
+client_pre_fsyncdir(xlator_t *this, gfs3_fsyncdir_req *req, fd_t *fd,
+                    int32_t flags, dict_t *xdata);
+
+int
+client_pre_access(xlator_t *this, gfs3_access_req *req, loc_t *loc,
+                  int32_t mask, dict_t *xdata);
+
+int
+client_pre_create(xlator_t *this, gfs3_create_req *req, loc_t *loc, fd_t *fd,
+                  mode_t mode, int32_t flags, mode_t umask, dict_t *xdata);
+
+int
+client_pre_ftruncate(xlator_t *this, gfs3_ftruncate_req *req, fd_t *fd,
+                     off_t offset, dict_t *xdata);
+
+int
+client_pre_fstat(xlator_t *this, gfs3_fstat_req *req, fd_t *fd, dict_t *xdata);
+
+int
+client_pre_lk(xlator_t *this, gfs3_lk_req *req, int32_t cmd,
+              struct gf_flock *flock, fd_t *fd, dict_t *xdata);
+
+int
+client_pre_lookup(xlator_t *this, gfs3_lookup_req *req, loc_t *loc,
+                  dict_t *xdata);
+
+int
+client_pre_readdir(xlator_t *this, gfs3_readdir_req *req, fd_t *fd, size_t size,
+                   off_t offset, dict_t *xdata);
+
+int
+client_pre_inodelk(xlator_t *this, gfs3_inodelk_req *req, loc_t *loc, int cmd,
+                   struct gf_flock *flock, const char *volume, dict_t *xdata);
+
+int
+client_pre_finodelk(xlator_t *this, gfs3_finodelk_req *req, fd_t *fd, int cmd,
+                    struct gf_flock *flock, const char *volume, dict_t *xdata);
+
+int
+client_pre_entrylk(xlator_t *this, gfs3_entrylk_req *req, loc_t *loc,
+                   entrylk_cmd cmd_entrylk, entrylk_type type,
+                   const char *volume, const char *basename, dict_t *xdata);
+
+int
+client_pre_fentrylk(xlator_t *this, gfs3_fentrylk_req *req, fd_t *fd,
+                    entrylk_cmd cmd_entrylk, entrylk_type type,
+                    const char *volume, const char *basename, dict_t *xdata);
+
+int
+client_pre_xattrop(xlator_t *this, gfs3_xattrop_req *req, loc_t *loc,
+                   dict_t *xattr, int32_t flags, dict_t *xdata);
+
+int
+client_pre_fxattrop(xlator_t *this, gfs3_fxattrop_req *req, fd_t *fd,
+                    dict_t *xattr, int32_t flags, dict_t *xdata);
+
+int
+client_pre_fgetxattr(xlator_t *this, gfs3_fgetxattr_req *req, fd_t *fd,
+                     const char *name, dict_t *xdata);
+
+int
+client_pre_fsetxattr(xlator_t *this, gfs3_fsetxattr_req *req, fd_t *fd,
+                     int32_t flags, dict_t *xattr, dict_t *xdata);
+int
+client_pre_seek(xlator_t *this, gfs3_seek_req *req, fd_t *fd, off_t offset,
+                gf_seek_what_t what, dict_t *xdata);
+
+int
+client_pre_rchecksum(xlator_t *this, gfs3_rchecksum_req *req, fd_t *fd,
+                     int32_t len, off_t offset, dict_t *xdata);
+
+int
+client_pre_setattr(xlator_t *this, gfs3_setattr_req *req, loc_t *loc,
+                   int32_t valid, struct iatt *stbuf, dict_t *xdata);
+int
+client_pre_fsetattr(xlator_t *this, gfs3_fsetattr_req *req, fd_t *fd,
+                    int32_t valid, struct iatt *stbuf, dict_t *xdata);
+
+int
+client_pre_readdirp(xlator_t *this, gfs3_readdirp_req *req, fd_t *fd,
+                    size_t size, off_t offset, dict_t *xdata);
+
+int
+client_pre_fremovexattr(xlator_t *this, gfs3_fremovexattr_req *req, fd_t *fd,
+                        const char *name, dict_t *xdata);
+
+int
+client_pre_fallocate(xlator_t *this, gfs3_fallocate_req *req, fd_t *fd,
+                     int32_t flags, off_t offset, size_t size, dict_t *xdata);
+int
+client_pre_discard(xlator_t *this, gfs3_discard_req *req, fd_t *fd,
+                   off_t offset, size_t size, dict_t *xdata);
+int
+client_pre_zerofill(xlator_t *this, gfs3_zerofill_req *req, fd_t *fd,
+                    off_t offset, size_t size, dict_t *xdata);
+int
+client_pre_ipc(xlator_t *this, gfs3_ipc_req *req, int32_t cmd, dict_t *xdata);
+
+int
+client_pre_lease(xlator_t *this, gfs3_lease_req *req, loc_t *loc,
+                 struct gf_lease *lease, dict_t *xdata);
+
+int
+client_post_stat(xlator_t *this, gfs3_stat_rsp *rsp, struct iatt *iatt,
+                 dict_t **xdata);
+
+int
+client_post_readlink(xlator_t *this, gfs3_readlink_rsp *rsp, struct iatt *iatt,
+                     dict_t **xdata);
+/* Post-reply helpers for v3: each takes the fop's gfs3_*_rsp/gf_common_rsp. */
+int
+client_post_mknod(xlator_t *this, gfs3_mknod_rsp *rsp, struct iatt *stbuf,
+                  struct iatt *preparent, struct iatt *postparent,
+                  dict_t **xdata);
+
+int
+client_post_mkdir(xlator_t *this, gfs3_mkdir_rsp *rsp, struct iatt *stbuf,
+                  struct iatt *preparent, struct iatt *postparent,
+                  dict_t **xdata);
+
+int
+client_post_unlink(xlator_t *this, gfs3_unlink_rsp *rsp, struct iatt *preparent,
+                   struct iatt *postparent, dict_t **xdata);
+
+int
+client_post_rmdir(xlator_t *this, gfs3_rmdir_rsp *rsp, struct iatt *preparent,
+                  struct iatt *postparent, dict_t **xdata);
+
+int
+client_post_symlink(xlator_t *this, gfs3_symlink_rsp *rsp, struct iatt *stbuf,
+                    struct iatt *preparent, struct iatt *postparent,
+                    dict_t **xdata);
+
+int
+client_post_rename(xlator_t *this, gfs3_rename_rsp *rsp, struct iatt *stbuf,
+                   struct iatt *preoldparent, struct iatt *postoldparent,
+                   struct iatt *prenewparent, struct iatt *postnewparent,
+                   dict_t **xdata);
+int
+client_post_link(xlator_t *this, gfs3_link_rsp *rsp, struct iatt *stbuf,
+                 struct iatt *preparent, struct iatt *postparent,
+                 dict_t **xdata);
+
+int
+client_post_truncate(xlator_t *this, gfs3_truncate_rsp *rsp,
+                     struct iatt *prestat, struct iatt *poststat,
+                     dict_t **xdata);
+
+int
+client_post_open(xlator_t *this, gfs3_open_rsp *rsp, dict_t **xdata);
+
+int
+client_post_readv(xlator_t *this, gfs3_read_rsp *rsp, struct iobref **iobref,
+                  struct iobref *rsp_iobref, struct iatt *stat,
+                  struct iovec *vector, struct iovec *rsp_vector, int *rspcount,
+                  dict_t **xdata);
+
+int
+client_post_writev(xlator_t *this, gfs3_write_rsp *rsp, struct iatt *prestat,
+                   struct iatt *poststat, dict_t **xdata);
+
+int
+client_post_statfs(xlator_t *this, gfs3_statfs_rsp *rsp, struct statvfs *statfs,
+                   dict_t **xdata);
+/* Fops replying with the shared gf_common_rsp take only rsp and xdata. */
+int
+client_post_flush(xlator_t *this, gf_common_rsp *rsp, dict_t **xdata);
+
+int
+client_post_fsync(xlator_t *this, gfs3_fsync_rsp *rsp, struct iatt *prestat,
+                  struct iatt *poststat, dict_t **xdata);
+int
+client_post_setxattr(xlator_t *this, gf_common_rsp *rsp, dict_t **xdata);
+
+int
+client_post_getxattr(xlator_t *this, gfs3_getxattr_rsp *rsp, dict_t **dict,
+                     dict_t **xdata);
+
+int
+client_post_removexattr(xlator_t *this, gf_common_rsp *rsp, dict_t **xdata);
+
+int
+client_post_opendir(xlator_t *this, gfs3_opendir_rsp *rsp, dict_t **xdata);
+
+int
+client_post_fsyncdir(xlator_t *this, gf_common_rsp *rsp, dict_t **xdata);
+
+int
+client_post_access(xlator_t *this, gf_common_rsp *rsp, dict_t **xdata);
+
+int
+client_post_create(xlator_t *this, gfs3_create_rsp *rsp, struct iatt *stbuf,
+                   struct iatt *preparent, struct iatt *postparent,
+                   clnt_local_t *local, dict_t **xdata);
+
+int
+client_post_ftruncate(xlator_t *this, gfs3_ftruncate_rsp *rsp,
+                      struct iatt *prestat, struct iatt *poststat,
+                      dict_t **xdata);
+
+int
+client_post_fstat(xlator_t *this, gfs3_fstat_rsp *rsp, struct iatt *stat,
+                  dict_t **xdata);
+
+int
+client_post_lk(xlator_t *this, gfs3_lk_rsp *rsp, struct gf_flock *lock,
+               dict_t **xdata);
+
+int
+client_post_lookup(xlator_t *this, gfs3_lookup_rsp *rsp, struct iatt *stbuf,
+                   struct iatt *postparent, dict_t **xdata);
+
+int
+client_post_readdir(xlator_t *this, gfs3_readdir_rsp *rsp, gf_dirent_t *entries,
+                    dict_t **xdata);
+
+int
+client_post_inodelk(xlator_t *this, gf_common_rsp *rsp, dict_t **xdata);
+
+int
+client_post_finodelk(xlator_t *this, gf_common_rsp *rsp, dict_t **xdata);
+
+int
+client_post_entrylk(xlator_t *this, gf_common_rsp *rsp, dict_t **xdata);
+
+int
+client_post_fentrylk(xlator_t *this, gf_common_rsp *rsp, dict_t **xdata);
+
+int
+client_post_xattrop(xlator_t *this, gfs3_xattrop_rsp *rsp, dict_t **dict,
+                    dict_t **xdata);
+
+int
+client_post_fxattrop(xlator_t *this, gfs3_fxattrop_rsp *rsp, dict_t **dict,
+                     dict_t **xdata);
+
+int
+client_post_fgetxattr(xlator_t *this, gfs3_fgetxattr_rsp *rsp, dict_t **dict,
+                      dict_t **xdata);
+
+int
+client_post_fsetxattr(xlator_t *this, gf_common_rsp *rsp, dict_t **xdata);
+
+int
+client_post_rchecksum(xlator_t *this, gfs3_rchecksum_rsp *rsp, dict_t **xdata);
+
+int
+client_post_setattr(xlator_t *this, gfs3_setattr_rsp *rsp, struct iatt *prestat,
+                    struct iatt *poststat, dict_t **xdata);
+
+int
+client_post_fsetattr(xlator_t *this, gfs3_fsetattr_rsp *rsp,
+                     struct iatt *prestat, struct iatt *poststat,
+                     dict_t **xdata);
+
+int
+client_post_readdirp(xlator_t *this, gfs3_readdirp_rsp *rsp, fd_t *fd,
+                     gf_dirent_t *entries, dict_t **xdata);
+
+int
+client_post_fremovexattr(xlator_t *this, gf_common_rsp *rsp, dict_t **xdata);
+
+int
+client_post_fallocate(xlator_t *this, gfs3_fallocate_rsp *rsp,
+                      struct iatt *prestat, struct iatt *poststat,
+                      dict_t **xdata);
+
+int
+client_post_discard(xlator_t *this, gfs3_discard_rsp *rsp, struct iatt *prestat,
+                    struct iatt *poststat, dict_t **xdata);
+
+int
+client_post_zerofill(xlator_t *this, gfs3_zerofill_rsp *rsp,
+                     struct iatt *prestat, struct iatt *poststat,
+                     dict_t **xdata);
+
+int
+client_post_ipc(xlator_t *this, gfs3_ipc_rsp *rsp, dict_t **xdata);
+
+int
+client_post_seek(xlator_t *this, gfs3_seek_rsp *rsp, dict_t **xdata);
+
+int
+client_post_lease(xlator_t *this, gfs3_lease_rsp *rsp, struct gf_lease *lease,
+                  dict_t **xdata);
+
+/* New in version 4 (gfx_* types); first the gfx_common_* reply helpers. */
+int
+client_post_common_dict(xlator_t *this, gfx_common_dict_rsp *rsp, dict_t **dict,
+                        dict_t **xdata);
+int
+client_post_common_3iatt(xlator_t *this, gfx_common_3iatt_rsp *rsp,
+                         struct iatt *iatt, struct iatt *iatt2,
+                         struct iatt *iatt3, dict_t **xdata);
+int
+client_post_common_2iatt(xlator_t *this, gfx_common_2iatt_rsp *rsp,
+                         struct iatt *iatt, struct iatt *iatt2, dict_t **xdata);
+int
+client_post_common_iatt(xlator_t *this, gfx_common_iatt_rsp *rsp,
+                        struct iatt *iatt, dict_t **xdata);
+int
+client_post_common_rsp(xlator_t *this, gfx_common_rsp *rsp, dict_t **xdata);
+/* Version 4 pre helpers: client_pre_<fop>_v2() takes the fop's gfx_*_req. */
+int
+client_pre_stat_v2(xlator_t *this, gfx_stat_req *req, loc_t *loc,
+                   dict_t *xdata);
+
+int
+client_pre_readlink_v2(xlator_t *this, gfx_readlink_req *req, loc_t *loc,
+                       size_t size, dict_t *xdata);
+
+int
+client_pre_mknod_v2(xlator_t *this, gfx_mknod_req *req, loc_t *loc, mode_t mode,
+                    dev_t rdev, mode_t umask, dict_t *xdata);
+
+int
+client_pre_mkdir_v2(xlator_t *this, gfx_mkdir_req *req, loc_t *loc, mode_t mode,
+                    mode_t umask, dict_t *xdata);
+
+int
+client_pre_unlink_v2(xlator_t *this, gfx_unlink_req *req, loc_t *loc,
+                     int32_t flags, dict_t *xdata);
+
+int
+client_pre_rmdir_v2(xlator_t *this, gfx_rmdir_req *req, loc_t *loc,
+                    int32_t flags, dict_t *xdata);
+
+int
+client_pre_symlink_v2(xlator_t *this, gfx_symlink_req *req, loc_t *loc,
+                      const char *linkname, mode_t umask, dict_t *xdata);
+
+int
+client_pre_rename_v2(xlator_t *this, gfx_rename_req *req, loc_t *oldloc,
+                     loc_t *newloc, dict_t *xdata);
+
+int
+client_pre_link_v2(xlator_t *this, gfx_link_req *req, loc_t *oldloc,
+                   loc_t *newloc, dict_t *xdata);
+
+int
+client_pre_truncate_v2(xlator_t *this, gfx_truncate_req *req, loc_t *loc,
+                       off_t offset, dict_t *xdata);
+
+int
+client_pre_open_v2(xlator_t *this, gfx_open_req *req, loc_t *loc, fd_t *fd,
+                   int32_t flags, dict_t *xdata);
+
+int
+client_pre_readv_v2(xlator_t *this, gfx_read_req *req, fd_t *fd, size_t size,
+                    off_t offset, int32_t flags, dict_t *xdata);
+/* NOTE(review): writev takes dict_t **xdata, unlike most pre helpers. */
+int
+client_pre_writev_v2(xlator_t *this, gfx_write_req *req, fd_t *fd, size_t size,
+                     off_t offset, int32_t flags, dict_t **xdata);
+
+int
+client_pre_statfs_v2(xlator_t *this, gfx_statfs_req *req, loc_t *loc,
+                     dict_t *xdata);
+
+int
+client_pre_flush_v2(xlator_t *this, gfx_flush_req *req, fd_t *fd,
+                    dict_t *xdata);
+
+int
+client_pre_fsync_v2(xlator_t *this, gfx_fsync_req *req, fd_t *fd, int32_t flags,
+                    dict_t *xdata);
+
+int
+client_pre_setxattr_v2(xlator_t *this, gfx_setxattr_req *req, loc_t *loc,
+                       dict_t *xattr, int32_t flags, dict_t *xdata);
+
+int
+client_pre_getxattr_v2(xlator_t *this, gfx_getxattr_req *req, loc_t *loc,
+                       const char *name, dict_t *xdata);
+
+int
+client_pre_removexattr_v2(xlator_t *this, gfx_removexattr_req *req, loc_t *loc,
+                          const char *name, dict_t *xdata);
+
+int
+client_pre_opendir_v2(xlator_t *this, gfx_opendir_req *req, loc_t *loc,
+                      fd_t *fd, dict_t *xdata);
+
+int
+client_pre_fsyncdir_v2(xlator_t *this, gfx_fsyncdir_req *req, fd_t *fd,
+                       int32_t flags, dict_t *xdata);
+
+int
+client_pre_access_v2(xlator_t *this, gfx_access_req *req, loc_t *loc,
+                     int32_t mask, dict_t *xdata);
+
+int
+client_pre_create_v2(xlator_t *this, gfx_create_req *req, loc_t *loc, fd_t *fd,
+                     mode_t mode, int32_t flags, mode_t umask, dict_t *xdata);
+
+int
+client_pre_ftruncate_v2(xlator_t *this, gfx_ftruncate_req *req, fd_t *fd,
+                        off_t offset, dict_t *xdata);
+
+int
+client_pre_fstat_v2(xlator_t *this, gfx_fstat_req *req, fd_t *fd,
+                    dict_t *xdata);
+
+int
+client_pre_lk_v2(xlator_t *this, gfx_lk_req *req, int32_t cmd,
+                 struct gf_flock *flock, fd_t *fd, dict_t *xdata);
+
+int
+client_pre_lookup_v2(xlator_t *this, gfx_lookup_req *req, loc_t *loc,
+                     dict_t *xdata);
+
+int
+client_pre_readdir_v2(xlator_t *this, gfx_readdir_req *req, fd_t *fd,
+                      size_t size, off_t offset, dict_t *xdata);
+
+int
+client_pre_inodelk_v2(xlator_t *this, gfx_inodelk_req *req, loc_t *loc, int cmd,
+                      struct gf_flock *flock, const char *volume,
+                      dict_t *xdata);
+
+int
+client_pre_finodelk_v2(xlator_t *this, gfx_finodelk_req *req, fd_t *fd, int cmd,
+                       struct gf_flock *flock, const char *volume,
+                       dict_t *xdata);
+
+int
+client_pre_entrylk_v2(xlator_t *this, gfx_entrylk_req *req, loc_t *loc,
+                      entrylk_cmd cmd_entrylk, entrylk_type type,
+                      const char *volume, const char *basename, dict_t *xdata);
+
+int
+client_pre_fentrylk_v2(xlator_t *this, gfx_fentrylk_req *req, fd_t *fd,
+                       entrylk_cmd cmd_entrylk, entrylk_type type,
+                       const char *volume, const char *basename, dict_t *xdata);
+
+int
+client_pre_xattrop_v2(xlator_t *this, gfx_xattrop_req *req, loc_t *loc,
+                      dict_t *xattr, int32_t flags, dict_t *xdata);
+
+int
+client_pre_fxattrop_v2(xlator_t *this, gfx_fxattrop_req *req, fd_t *fd,
+                       dict_t *xattr, int32_t flags, dict_t *xdata);
+
+int
+client_pre_fgetxattr_v2(xlator_t *this, gfx_fgetxattr_req *req, fd_t *fd,
+                        const char *name, dict_t *xdata);
+
+int
+client_pre_fsetxattr_v2(xlator_t *this, gfx_fsetxattr_req *req, fd_t *fd,
+                        int32_t flags, dict_t *xattr, dict_t *xdata);
+int
+client_pre_seek_v2(xlator_t *this, gfx_seek_req *req, fd_t *fd, off_t offset,
+                   gf_seek_what_t what, dict_t *xdata);
+
+int
+client_pre_rchecksum_v2(xlator_t *this, gfx_rchecksum_req *req, fd_t *fd,
+                        int32_t len, off_t offset, dict_t *xdata);
+
+int
+client_pre_setattr_v2(xlator_t *this, gfx_setattr_req *req, loc_t *loc,
+                      int32_t valid, struct iatt *stbuf, dict_t *xdata);
+int
+client_pre_fsetattr_v2(xlator_t *this, gfx_fsetattr_req *req, fd_t *fd,
+                       int32_t valid, struct iatt *stbuf, dict_t *xdata);
+
+int
+client_pre_readdirp_v2(xlator_t *this, gfx_readdirp_req *req, fd_t *fd,
+                       size_t size, off_t offset, dict_t *xdata);
+
+int
+client_pre_fremovexattr_v2(xlator_t *this, gfx_fremovexattr_req *req, fd_t *fd,
+                           const char *name, dict_t *xdata);
+
+int
+client_pre_fallocate_v2(xlator_t *this, gfx_fallocate_req *req, fd_t *fd,
+                        int32_t flags, off_t offset, size_t size,
+                        dict_t *xdata);
+int
+client_pre_discard_v2(xlator_t *this, gfx_discard_req *req, fd_t *fd,
+                      off_t offset, size_t size, dict_t *xdata);
+int
+client_pre_zerofill_v2(xlator_t *this, gfx_zerofill_req *req, fd_t *fd,
+                       off_t offset, size_t size, dict_t *xdata);
+int
+client_pre_ipc_v2(xlator_t *this, gfx_ipc_req *req, int32_t cmd, dict_t *xdata);
+
+int
+client_pre_lease_v2(xlator_t *this, gfx_lease_req *req, loc_t *loc,
+                    struct gf_lease *lease, dict_t *xdata);
+
+int
+client_pre_put_v2(xlator_t *this, gfx_put_req *req, loc_t *loc, mode_t mode,
+                  mode_t umask, int32_t flags, size_t size, off_t offset,
+                  dict_t *xattr, dict_t *xdata);
+/* Version 4 post helpers for fops with their own gfx_*_rsp reply type. */
+int
+client_post_readv_v2(xlator_t *this, gfx_read_rsp *rsp, struct iobref **iobref,
+                     struct iobref *rsp_iobref, struct iatt *stat,
+                     struct iovec *vector, struct iovec *rsp_vector,
+                     int *rspcount, dict_t **xdata);
+
+int
+client_post_create_v2(xlator_t *this, gfx_create_rsp *rsp, struct iatt *stbuf,
+                      struct iatt *preparent, struct iatt *postparent,
+                      clnt_local_t *local, dict_t **xdata);
+int
+client_post_lease_v2(xlator_t *this, gfx_lease_rsp *rsp, struct gf_lease *lease,
+                     dict_t **xdata);
+int
+client_post_lk_v2(xlator_t *this, gfx_lk_rsp *rsp, struct gf_flock *lock,
+                  dict_t **xdata);
+int
+client_post_readdir_v2(xlator_t *this, gfx_readdir_rsp *rsp,
+                       gf_dirent_t *entries, dict_t **xdata);
+int
+client_post_readdirp_v2(xlator_t *this, gfx_readdirp_rsp *rsp, fd_t *fd,
+                        gf_dirent_t *entries, dict_t **xdata);
+int
+client_post_rename_v2(xlator_t *this, gfx_rename_rsp *rsp, struct iatt *stbuf,
+                      struct iatt *preoldparent, struct iatt *postoldparent,
+                      struct iatt *prenewparent, struct iatt *postnewparent,
+                      dict_t **xdata);
+/* A pre helper; like writev_v2 it takes dict_t **xdata (double pointer). */
+int
+client_pre_copy_file_range_v2(xlator_t *this, gfx_copy_file_range_req *req,
+                              fd_t *fd_in, off64_t off_in, fd_t *fd_out,
+                              off64_t off_out, size_t size, int32_t flags,
+                              dict_t **xdata);
+
+#endif /* __CLIENT_COMMON_H__ */
diff --git a/xlators/protocol/client/src/client-handshake.c b/xlators/protocol/client/src/client-handshake.c
new file mode 100644
index 00000000000..ea5ef5c1800
--- /dev/null
+++ b/xlators/protocol/client/src/client-handshake.c
@@ -0,0 +1,1415 @@
+/*
+  Copyright (c) 2008-2012 Red Hat, Inc. <http://www.redhat.com>
+  This file is part of GlusterFS.
+
+  This file is licensed to you under your choice of the GNU Lesser
+  General Public License, version 3 or any later version (LGPLv3 or
+  later), or the GNU General Public License, version 2 (GPLv2), in all
+  cases as published by the Free Software Foundation.
+*/
+
+#include <glusterfs/fd-lk.h>
+#include "client.h"
+#include <glusterfs/xlator.h>
+#include <glusterfs/defaults.h>
+#include <glusterfs/glusterfs.h>
+#include <glusterfs/statedump.h>
+#include <glusterfs/compat-errno.h>
+
+#include "glusterfs3.h"
+#include "portmap-xdr.h"
+#include "rpc-common-xdr.h"
+#include "client-messages.h"
+#include "xdr-rpc.h"
+/* client_attempt_reopen() calls counted per fd before a reopen is issued. */
+#define CLIENT_REOPEN_MAX_ATTEMPTS 1024
+extern rpc_clnt_prog_t clnt3_3_fop_prog;
+extern rpc_clnt_prog_t clnt4_0_fop_prog;
+extern rpc_clnt_prog_t clnt_pmap_prog;
+/* Stub: unwinds the getspec call with (-1, ENOSYS, NULL) and returns 0. */
+int32_t
+client3_getspec(call_frame_t *frame, xlator_t *this, void *data)
+{
+    CLIENT_STACK_UNWIND(getspec, frame, -1, ENOSYS, NULL);
+    return 0;
+}
+
+/* Dispatch GF_EVENT_CHILD_UP through client_notify_dispatch_uniq() while
+ * conf->child_up is set; otherwise only log the child status. Returns 0. */
+static int
+client_notify_parents_child_up(xlator_t *this)
+{
+    clnt_conf_t *conf = NULL;
+
+    GF_VALIDATE_OR_GOTO("client", this, out);
+    conf = this->private;
+    GF_VALIDATE_OR_GOTO(this->name, conf, out);
+
+    if (!conf->child_up) {
+        gf_smsg(this->name, GF_LOG_INFO, 0, PC_MSG_CHILD_STATUS, NULL);
+        goto out;
+    }
+
+    /* A failed dispatch is only logged; the caller still gets 0. */
+    if (client_notify_dispatch_uniq(this, GF_EVENT_CHILD_UP, NULL))
+        gf_smsg(this->name, GF_LOG_INFO, 0, PC_MSG_CHILD_UP_NOTIFY_FAILED,
+                NULL);
+
+out:
+    return 0;
+}
+/* Placeholder reopen_done hook for fds with no reopen pending; only warns. */
+void
+client_default_reopen_done(clnt_fd_ctx_t *fdctx, int64_t rfd, xlator_t *this)
+{
+    gf_log_callingfn(this->name, GF_LOG_WARNING,
+                     "This function should never be called");
+}
+
+/* Record the outcome of a reopen on fdctx: store rfd as the remote fd, reset
+ * the attempt counter and restore the default reopen_done hook. An fd that
+ * was released meanwhile is destroyed instead of rejoining conf->saved_fds. */
+static void
+client_reopen_done(clnt_fd_ctx_t *fdctx, int64_t rfd, xlator_t *this)
+{
+    clnt_conf_t *conf = this->private;
+    gf_boolean_t keep = _gf_false;
+
+    pthread_spin_lock(&conf->fd_lock);
+    fdctx->remote_fd = rfd;
+    fdctx->reopen_attempts = 0;
+    fdctx->reopen_done = client_default_reopen_done;
+    keep = !fdctx->released;
+    if (keep)
+        list_add_tail(&fdctx->sfd_pos, &conf->saved_fds);
+    pthread_spin_unlock(&conf->fd_lock);
+
+    if (!keep) /* destroyed only after fd_lock has been dropped */
+        client_fdctx_destroy(this, fdctx);
+}
+
+/* reopen_done hook installed by client_post_handshake(): account for one
+ * finished reopen and notify the parents once the counter reaches zero. */
+static void
+client_child_up_reopen_done(clnt_fd_ctx_t *fdctx, int64_t rfd, xlator_t *this)
+{
+    clnt_conf_t *conf = this->private;
+    uint64_t pending = 0;
+
+    LOCK(&conf->rec_lock);
+    pending = --(conf->reopen_fd_count);
+    UNLOCK(&conf->rec_lock);
+
+    client_reopen_done(fdctx, rfd, this);
+    if (pending != 0)
+        return;
+    gf_smsg(this->name, GF_LOG_INFO, 0, PC_MSG_CHILD_UP_NOTIFY, NULL);
+    client_notify_parents_child_up(this);
+}
+
+/* RPC callback for the v3 OPEN request sent by protocol_client_reopenfile().
+ * Passes rsp.fd (when op_ret is 0) or -1 (any failure) to
+ * fdctx->reopen_done(), then releases the frame and its local. Returns 0. */
+int
+client3_3_reopen_cbk(struct rpc_req *req, struct iovec *iov, int count,
+                     void *myframe)
+{
+    int32_t ret = -1;
+    gfs3_open_rsp rsp = {
+        0,
+    };
+    call_frame_t *frame = myframe;
+    xlator_t *this = frame->this;
+    clnt_local_t *local = frame->local;
+    clnt_fd_ctx_t *fdctx = local->fdctx;
+
+    if (-1 == req->rpc_status) {
+        gf_smsg(frame->this->name, GF_LOG_WARNING, ENOTCONN,
+                PC_MSG_RPC_STATUS_ERROR, NULL);
+        rsp.op_ret = -1;
+        rsp.op_errno = ENOTCONN;
+        goto out;
+    }
+
+    ret = xdr_to_generic(*iov, &rsp, (xdrproc_t)xdr_gfs3_open_rsp);
+    if (ret < 0) {
+        gf_smsg(frame->this->name, GF_LOG_ERROR, EINVAL,
+                PC_MSG_XDR_DECODING_FAILED, NULL);
+        rsp.op_ret = -1;
+        rsp.op_errno = EINVAL;
+        goto out;
+    }
+
+    if (rsp.op_ret < 0) {
+        /* gf_smsg() takes a NULL-terminated key/value list. */
+        gf_smsg(frame->this->name, GF_LOG_WARNING, rsp.op_errno,
+                PC_MSG_REOPEN_FAILED, "path=%s", local->loc.path, NULL);
+    } else {
+        gf_msg_debug(frame->this->name, 0,
+                     "reopen on %s succeeded (remote-fd = %" PRId64 ")",
+                     local->loc.path, rsp.fd);
+    }
+
+out:
+    fdctx->reopen_done(fdctx, (rsp.op_ret) ? -1 : rsp.fd, this);
+
+    frame->local = NULL;
+    STACK_DESTROY(frame->root);
+
+    client_local_wipe(local);
+
+    return 0;
+}
+
+/* RPC callback for the v3 OPENDIR request sent by
+ * protocol_client_reopendir(). Passes rsp.fd (when op_ret is 0) or -1 (any
+ * failure) to fdctx->reopen_done(), then releases the frame and its local. */
+int
+client3_3_reopendir_cbk(struct rpc_req *req, struct iovec *iov, int count,
+                        void *myframe)
+{
+    int32_t ret = -1;
+    /* Must be the type that xdr_gfs3_opendir_rsp decodes into. */
+    gfs3_opendir_rsp rsp = {
+        0,
+    };
+    call_frame_t *frame = myframe;
+    clnt_local_t *local = frame->local;
+    clnt_fd_ctx_t *fdctx = local->fdctx;
+
+    if (-1 == req->rpc_status) {
+        gf_smsg(frame->this->name, GF_LOG_WARNING, ENOTCONN,
+                PC_MSG_RPC_STATUS_ERROR, NULL);
+        rsp.op_ret = -1;
+        rsp.op_errno = ENOTCONN;
+        goto out;
+    }
+
+    ret = xdr_to_generic(*iov, &rsp, (xdrproc_t)xdr_gfs3_opendir_rsp);
+    if (ret < 0) {
+        gf_smsg(frame->this->name, GF_LOG_ERROR, EINVAL,
+                PC_MSG_XDR_DECODING_FAILED, NULL);
+        rsp.op_ret = -1;
+        rsp.op_errno = EINVAL;
+        goto out;
+    }
+
+    if (rsp.op_ret < 0) {
+        gf_smsg(frame->this->name, GF_LOG_WARNING, rsp.op_errno,
+                PC_MSG_REOPEN_FAILED, "path=%s", local->loc.path, NULL);
+    } else {
+        gf_smsg(frame->this->name, GF_LOG_INFO, 0, PC_MSG_DIR_OP_SUCCESS,
+                "path=%s", local->loc.path, "fd=%" PRId64, rsp.fd, NULL);
+    }
+
+out:
+    fdctx->reopen_done(fdctx, (rsp.op_ret) ? -1 : rsp.fd, frame->this);
+
+    frame->local = NULL;
+    STACK_DESTROY(frame->root);
+    client_local_wipe(local);
+
+    return 0;
+}
+
+/* Send a v3 OPENDIR for the directory behind fdctx; the reply is handled by
+ * client3_3_reopendir_cbk(). If the request cannot be set up (no local,
+ * loc_path() failure, no frame), fdctx->reopen_done() is called here with
+ * the unchanged remote fd.
+ * Always returns 0. */
+static int
+protocol_client_reopendir(clnt_fd_ctx_t *fdctx, xlator_t *this)
+{
+    gfs3_opendir_req req = {
+        {
+            0,
+        },
+    };
+    clnt_conf_t *conf = this->private;
+    call_frame_t *frame = NULL;
+    clnt_local_t *local = mem_get0(this->local_pool);
+
+    if (!local)
+        goto out;
+    local->fdctx = fdctx;
+
+    gf_uuid_copy(local->loc.gfid, fdctx->gfid);
+    if (loc_path(&local->loc, NULL) < 0)
+        goto out;
+
+    frame = create_frame(this, this->ctx->pool);
+    if (!frame)
+        goto out;
+
+    memcpy(req.gfid, fdctx->gfid, 16);
+
+    gf_msg_debug(frame->this->name, 0, "attempting reopen on %s",
+                 local->loc.path);
+
+    frame->local = local;
+
+    /* NOTE(review): a submit failure is only logged; frame and local are not
+     * released here -- presumably client_submit_request() runs the callback
+     * on failure. Confirm. */
+    if (client_submit_request(this, &req, frame, conf->fops, GFS3_OP_OPENDIR,
+                              client3_3_reopendir_cbk, NULL,
+                              (xdrproc_t)xdr_gfs3_opendir_req))
+        gf_smsg(this->name, GF_LOG_ERROR, 0, PC_MSG_DIR_OP_FAILED, NULL);
+
+    return 0;
+
+out:
+    /* Nothing was sent: drop the local and complete the reopen right away. */
+    if (local)
+        client_local_wipe(local);
+
+    fdctx->reopen_done(fdctx, fdctx->remote_fd, this);
+
+    return 0;
+}
+
+/* Send a v3 OPEN for the file behind fdctx; the reply is handled by
+ * client3_3_reopen_cbk(). fdctx->flags are reused with O_TRUNC, O_CREAT and
+ * O_EXCL masked out. If the request cannot be set up, the frame and local
+ * are released and fdctx->reopen_done() is called here with the unchanged
+ * remote fd. Always returns 0. */
+static int
+protocol_client_reopenfile(clnt_fd_ctx_t *fdctx, xlator_t *this)
+{
+    gfs3_open_req req = {
+        {
+            0,
+        },
+    };
+    clnt_local_t *local = NULL;
+    clnt_conf_t *conf = this->private;
+    call_frame_t *frame = create_frame(this, this->ctx->pool);
+
+    if (!frame)
+        goto out;
+
+    local = mem_get0(this->local_pool);
+    if (!local)
+        goto out;
+
+    local->fdctx = fdctx;
+    gf_uuid_copy(local->loc.gfid, fdctx->gfid);
+    if (loc_path(&local->loc, NULL) < 0)
+        goto out;
+
+    frame->local = local;
+
+    memcpy(req.gfid, fdctx->gfid, 16);
+    req.flags = gf_flags_from_flags(fdctx->flags);
+    req.flags &= ~(O_TRUNC | O_CREAT | O_EXCL);
+
+    gf_msg_debug(frame->this->name, 0, "attempting reopen on %s",
+                 local->loc.path);
+
+    /* NOTE(review): a submit failure is only logged; frame and local are not
+     * released here -- presumably client_submit_request() runs the callback
+     * on failure. Confirm. */
+    if (client_submit_request(this, &req, frame, conf->fops, GFS3_OP_OPEN,
+                              client3_3_reopen_cbk, NULL,
+                              (xdrproc_t)xdr_gfs3_open_req))
+        gf_smsg(this->name, GF_LOG_ERROR, 0, PC_MSG_DIR_OP_FAILED, NULL);
+
+    return 0;
+
+out:
+    /* Nothing was sent: undo the setup and complete the reopen right away. */
+    if (frame) {
+        frame->local = NULL;
+        STACK_DESTROY(frame->root);
+    }
+
+    if (local)
+        client_local_wipe(local);
+
+    fdctx->reopen_done(fdctx, fdctx->remote_fd, this);
+
+    return 0;
+}
+
+/* Reopen the remote fd of fdctx using the v3 (gfs3_*) requests. */
+static void
+protocol_client_reopen(clnt_fd_ctx_t *fdctx, xlator_t *this)
+{
+    /* Directories go through OPENDIR, everything else through OPEN. */
+    (void)(fdctx->is_dir ? protocol_client_reopendir(fdctx, this)
+                         : protocol_client_reopenfile(fdctx, this));
+}
+
+/* v4.x +  */
+/* RPC callback for the v4 OPEN request sent by
+ * protocol_client_reopenfile_v2(). Passes rsp.fd (when op_ret is 0) or -1
+ * (any failure) to fdctx->reopen_done(), then releases the frame and its
+ * local. Always returns 0. */
+int
+client4_0_reopen_cbk(struct rpc_req *req, struct iovec *iov, int count,
+                     void *myframe)
+{
+    gfx_open_rsp rsp = {
+        0,
+    };
+    call_frame_t *frame = myframe;
+    xlator_t *this = frame->this;
+    clnt_local_t *local = frame->local;
+    clnt_fd_ctx_t *fdctx = local->fdctx;
+    int64_t remote_fd = -1;
+
+    if (req->rpc_status == -1) {
+        gf_smsg(this->name, GF_LOG_WARNING, ENOTCONN, PC_MSG_RPC_STATUS_ERROR,
+                NULL);
+        goto out;
+    }
+
+    /* A reply that cannot be decoded counts as a failed reopen. */
+    if (xdr_to_generic(*iov, &rsp, (xdrproc_t)xdr_gfx_open_rsp) < 0) {
+        gf_smsg(this->name, GF_LOG_ERROR, EINVAL, PC_MSG_XDR_DECODING_FAILED,
+                NULL);
+        goto out;
+    }
+
+    if (rsp.op_ret >= 0) {
+        gf_msg_debug(this->name, 0,
+                     "reopen on %s succeeded (remote-fd = %" PRId64 ")",
+                     local->loc.path, rsp.fd);
+    } else {
+        gf_smsg(this->name, GF_LOG_WARNING, rsp.op_errno, PC_MSG_REOPEN_FAILED,
+                "path=%s", local->loc.path, NULL);
+    }
+
+    /* Only op_ret == 0 yields a usable remote fd. */
+    if (rsp.op_ret == 0)
+        remote_fd = rsp.fd;
+
+out:
+    fdctx->reopen_done(fdctx, remote_fd, this);
+
+    frame->local = NULL;
+    STACK_DESTROY(frame->root);
+
+    client_local_wipe(local);
+
+    return 0;
+}
+
+/* RPC callback for the v4 OPENDIR request sent by
+ * protocol_client_reopendir_v2(). Passes rsp.fd (when op_ret is 0) or -1
+ * (any failure) to fdctx->reopen_done(), then releases the frame and its
+ * local. Always returns 0. */
+int
+client4_0_reopendir_cbk(struct rpc_req *req, struct iovec *iov, int count,
+                        void *myframe)
+{
+    gfx_open_rsp rsp = {
+        0,
+    };
+    call_frame_t *frame = myframe;
+    clnt_local_t *local = frame->local;
+    clnt_fd_ctx_t *fdctx = local->fdctx;
+    int64_t remote_fd = -1;
+
+    if (req->rpc_status == -1) {
+        gf_smsg(frame->this->name, GF_LOG_WARNING, ENOTCONN,
+                PC_MSG_RPC_STATUS_ERROR, NULL);
+        goto out;
+    }
+
+    /* A reply that cannot be decoded counts as a failed reopen. */
+    if (xdr_to_generic(*iov, &rsp, (xdrproc_t)xdr_gfx_open_rsp) < 0) {
+        gf_smsg(frame->this->name, GF_LOG_ERROR, EINVAL,
+                PC_MSG_XDR_DECODING_FAILED, NULL);
+        goto out;
+    }
+
+    if (rsp.op_ret >= 0) {
+        gf_smsg(frame->this->name, GF_LOG_INFO, 0, PC_MSG_DIR_OP_SUCCESS,
+                "path=%s", local->loc.path, "fd=%" PRId64, rsp.fd, NULL);
+    } else {
+        gf_smsg(frame->this->name, GF_LOG_WARNING, rsp.op_errno,
+                PC_MSG_DIR_OP_FAILED, "dir-path=%s", local->loc.path, NULL);
+    }
+
+    /* Only op_ret == 0 yields a usable remote fd. */
+    if (rsp.op_ret == 0)
+        remote_fd = rsp.fd;
+
+out:
+    fdctx->reopen_done(fdctx, remote_fd, frame->this);
+
+    frame->local = NULL;
+    STACK_DESTROY(frame->root);
+    client_local_wipe(local);
+
+    return 0;
+}
+
+/* Send a v4 (gfx_opendir_req) OPENDIR for the directory behind fdctx; the
+ * reply is handled by client4_0_reopendir_cbk(). If the request cannot be
+ * set up, fdctx->reopen_done() is called here with the unchanged remote fd.
+ * Always returns 0. */
+static int
+protocol_client_reopendir_v2(clnt_fd_ctx_t *fdctx, xlator_t *this)
+{
+    gfx_opendir_req req = {
+        {
+            0,
+        },
+    };
+    call_frame_t *frame = NULL;
+    clnt_conf_t *conf = this->private;
+    clnt_local_t *local = mem_get0(this->local_pool);
+
+    if (!local)
+        goto out;
+    local->fdctx = fdctx;
+
+    gf_uuid_copy(local->loc.gfid, fdctx->gfid);
+    if (loc_path(&local->loc, NULL) < 0)
+        goto out;
+
+    frame = create_frame(this, this->ctx->pool);
+    if (!frame)
+        goto out;
+
+    memcpy(req.gfid, fdctx->gfid, 16);
+
+    gf_msg_debug(frame->this->name, 0, "attempting reopen on %s",
+                 local->loc.path);
+
+    frame->local = local;
+
+    /* NOTE(review): a submit failure is only logged; frame and local are not
+     * released here -- presumably client_submit_request() runs the callback
+     * on failure. Confirm. */
+    if (client_submit_request(this, &req, frame, conf->fops, GFS3_OP_OPENDIR,
+                              client4_0_reopendir_cbk, NULL,
+                              (xdrproc_t)xdr_gfx_opendir_req))
+        gf_smsg(this->name, GF_LOG_ERROR, 0, PC_MSG_DIR_OP_FAILED, NULL);
+
+    return 0;
+
+out:
+    /* Nothing was sent: drop the local and complete the reopen right away. */
+    if (local)
+        client_local_wipe(local);
+
+    fdctx->reopen_done(fdctx, fdctx->remote_fd, this);
+
+    return 0;
+}
+
+/* Send a v4 (gfx_open_req) OPEN for the file behind fdctx; the reply is
+ * handled by client4_0_reopen_cbk(). fdctx->flags are reused with O_TRUNC,
+ * O_CREAT and O_EXCL masked out. If the request cannot be set up, the frame
+ * and local are released and fdctx->reopen_done() is called here with the
+ * unchanged remote fd. Always returns 0. */
+static int
+protocol_client_reopenfile_v2(clnt_fd_ctx_t *fdctx, xlator_t *this)
+{
+    gfx_open_req req = {
+        {
+            0,
+        },
+    };
+    clnt_local_t *local = NULL;
+    clnt_conf_t *conf = this->private;
+    call_frame_t *frame = create_frame(this, this->ctx->pool);
+
+    if (!frame)
+        goto out;
+
+    local = mem_get0(this->local_pool);
+    if (!local)
+        goto out;
+
+    local->fdctx = fdctx;
+    gf_uuid_copy(local->loc.gfid, fdctx->gfid);
+    if (loc_path(&local->loc, NULL) < 0)
+        goto out;
+
+    frame->local = local;
+
+    memcpy(req.gfid, fdctx->gfid, 16);
+    req.flags = gf_flags_from_flags(fdctx->flags);
+    req.flags &= ~(O_TRUNC | O_CREAT | O_EXCL);
+
+    gf_msg_debug(frame->this->name, 0, "attempting reopen on %s",
+                 local->loc.path);
+
+    /* NOTE(review): a submit failure is only logged; frame and local are not
+     * released here -- presumably client_submit_request() runs the callback
+     * on failure. Confirm. */
+    if (client_submit_request(this, &req, frame, conf->fops, GFS3_OP_OPEN,
+                              client4_0_reopen_cbk, NULL,
+                              (xdrproc_t)xdr_gfx_open_req))
+        gf_smsg(this->name, GF_LOG_ERROR, 0, PC_MSG_DIR_OP_FAILED, NULL);
+
+    return 0;
+
+out:
+    if (frame) {
+        frame->local = NULL;
+        STACK_DESTROY(frame->root);
+    }
+
+    if (local)
+        client_local_wipe(local);
+
+    fdctx->reopen_done(fdctx, fdctx->remote_fd, this);
+
+    return 0;
+}
+
+/* Reopen the remote fd of fdctx using the v4 (gfx_*) requests. */
+static void
+protocol_client_reopen_v2(clnt_fd_ctx_t *fdctx, xlator_t *this)
+{
+    /* Directories go through OPENDIR, everything else through OPEN. */
+    (void)(fdctx->is_dir ? protocol_client_reopendir_v2(fdctx, this)
+                         : protocol_client_reopenfile_v2(fdctx, this));
+}
+
+/* A reopen is in flight while reopen_done is not the default placeholder. */
+gf_boolean_t
+__is_fd_reopen_in_progress(clnt_fd_ctx_t *fdctx)
+{
+    return (fdctx->reopen_done != client_default_reopen_done) ? _gf_true
+                                                              : _gf_false;
+}
+
+/* Possibly start a reopen of fd's remote fd. Nothing happens while a reopen
+ * is already in progress or a remote fd exists. Otherwise the call only
+ * bumps fdctx->reopen_attempts, except for the call that finds the counter
+ * at CLIENT_REOPEN_MAX_ATTEMPTS: that one unlinks the fd from its list and
+ * issues the reopen (client_reopen_done() later resets the counter). */
+void
+client_attempt_reopen(fd_t *fd, xlator_t *this)
+{
+    clnt_conf_t *conf = NULL;
+    clnt_fd_ctx_t *fdctx = NULL;
+    gf_boolean_t reopen = _gf_false;
+
+    if (!fd || !this)
+        return;
+
+    conf = this->private;
+
+    pthread_spin_lock(&conf->fd_lock);
+    fdctx = this_fd_get_ctx(fd, this);
+    if (fdctx && !__is_fd_reopen_in_progress(fdctx) &&
+        fdctx->remote_fd == -1) {
+        /* Count this call, or claim the fd for a reopen at the threshold. */
+        if (fdctx->reopen_attempts != CLIENT_REOPEN_MAX_ATTEMPTS) {
+            fdctx->reopen_attempts++;
+        } else {
+            reopen = _gf_true;
+            fdctx->reopen_done = client_reopen_done;
+            list_del_init(&fdctx->sfd_pos);
+        }
+    }
+    pthread_spin_unlock(&conf->fd_lock);
+
+    if (!reopen)
+        return;
+
+    /* The request itself is sent after fd_lock has been dropped; the
+     * protocol version of conf->fops selects the v3 or v4 path. */
+    if (conf->fops->progver == GLUSTER_FOP_VERSION_v2)
+        protocol_client_reopen_v2(fdctx, this);
+    else
+        protocol_client_reopen(fdctx, this);
+}
+
+/* After a handshake, reopen every saved fd that lacks a remote fd (skipping
+ * fds with locks when conf->strict_locks is set). CHILD_UP is sent here only
+ * if nothing needs reopening; otherwise client_child_up_reopen_done() sends
+ * it after the last reopen completes. frame is unused; always returns 0. */
+static int
+client_post_handshake(call_frame_t *frame, xlator_t *this)
+{
+    clnt_conf_t *conf = NULL;
+    clnt_fd_ctx_t *fdctx = NULL;
+    clnt_fd_ctx_t *next = NULL;
+    struct list_head reopen_head;
+    int count = 0;
+
+    if (!this || !this->private)
+        return 0;
+
+    conf = this->private;
+    INIT_LIST_HEAD(&reopen_head);
+
+    /* Under fd_lock, move the candidates onto a private list. */
+    pthread_spin_lock(&conf->fd_lock);
+    list_for_each_entry_safe(fdctx, next, &conf->saved_fds, sfd_pos)
+    {
+        if (fdctx->remote_fd != -1 ||
+            (conf->strict_locks && !list_empty(&fdctx->lock_list)))
+            continue;
+        fdctx->reopen_done = client_child_up_reopen_done;
+        list_del_init(&fdctx->sfd_pos);
+        list_add_tail(&fdctx->sfd_pos, &reopen_head);
+        count++;
+    }
+    pthread_spin_unlock(&conf->fd_lock);
+
+    if (count == 0) {
+        gf_msg_debug(this->name, 0,
+                     "No fds to open - notifying all parents child "
+                     "up");
+        client_notify_parents_child_up(this);
+        return 0;
+    }
+
+    /* CHILD_UP is deferred until these fds are reopened (see reopen_done). */
+    gf_smsg(this->name, GF_LOG_INFO, 0, PC_MSG_CHILD_UP_NOTIFY_DELAY,
+            "count=%d", count, NULL);
+    client_save_number_fds(conf, count);
+
+    list_for_each_entry_safe(fdctx, next, &reopen_head, sfd_pos)
+    {
+        list_del_init(&fdctx->sfd_pos);
+        if (conf->fops->progver == GLUSTER_FOP_VERSION_v2)
+            protocol_client_reopen_v2(fdctx, this);
+        else
+            protocol_client_reopen(fdctx, this);
+    }
+
+    return 0;
+}
+
+/*
+ * RPC callback for the SETVOLUME handshake request submitted by
+ * client_setvolume().
+ *
+ * Decodes the gf_setvolume_rsp, unserializes the reply dictionary and then:
+ *   - on a failed reply (op_ret < 0) classifies the error (EACCES with a
+ *     remote error string, ENOENT for a subdir mount on the first graph,
+ *     ESTALE) and jumps to the common exit path;
+ *   - on success validates/records the volume-id, stores the server's
+ *     "child_up" state, marks the connection as connected and calls
+ *     client_post_handshake().
+ *
+ * The exit path dispatches GF_EVENT_AUTH_FAILED when auth_fail is set, and
+ * dispatches GF_EVENT_CHILD_CONNECTING plus a transport disconnect when
+ * op_ret is -1. The frame is always destroyed here.
+ *
+ * Returns the last value stored in 'ret': -1 after an auth failure, 0 after
+ * the op_ret == -1 handling, otherwise whatever the last helper returned.
+ */
+int
+client_setvolume_cbk(struct rpc_req *req, struct iovec *iov, int count,
+                     void *myframe)
+{
+    call_frame_t *frame = myframe;
+    xlator_t *this = frame->this;
+    clnt_conf_t *conf = this->private;
+    dict_t *reply = NULL;
+    char *process_uuid = NULL;
+    char *volume_id = NULL;
+    char *remote_error = NULL;
+    char *remote_subvol = NULL;
+    gf_setvolume_rsp rsp = {
+        0,
+    };
+    int ret = 0;
+    int32_t op_ret = 0;
+    int32_t op_errno = 0;
+    gf_boolean_t auth_fail = _gf_false;
+    glusterfs_ctx_t *ctx = NULL;
+
+    GF_VALIDATE_OR_GOTO(this->name, conf, out);
+    ctx = this->ctx;
+    GF_VALIDATE_OR_GOTO(this->name, ctx, out);
+
+    /* The request itself failed at the RPC layer; nothing to decode. */
+    if (-1 == req->rpc_status) {
+        gf_smsg(frame->this->name, GF_LOG_WARNING, ENOTCONN,
+                PC_MSG_RPC_STATUS_ERROR, NULL);
+        op_ret = -1;
+        goto out;
+    }
+
+    ret = xdr_to_generic(*iov, &rsp, (xdrproc_t)xdr_gf_setvolume_rsp);
+    if (ret < 0) {
+        gf_smsg(this->name, GF_LOG_ERROR, EINVAL, PC_MSG_XDR_DECODING_FAILED,
+                NULL);
+        op_ret = -1;
+        goto out;
+    }
+    op_ret = rsp.op_ret;
+    op_errno = gf_error_to_errno(rsp.op_errno);
+    if (-1 == rsp.op_ret) {
+        gf_smsg(frame->this->name, GF_LOG_WARNING, op_errno,
+                PC_MSG_VOL_SET_FAIL, NULL);
+    }
+
+    /* The reply dict is parsed even for failed replies, because the
+     * "ERROR" string is needed by the failure handling below. */
+    reply = dict_new();
+    if (!reply)
+        goto out;
+
+    if (rsp.dict.dict_len) {
+        ret = dict_unserialize(rsp.dict.dict_val, rsp.dict.dict_len, &reply);
+        if (ret < 0) {
+            gf_smsg(frame->this->name, GF_LOG_WARNING, 0,
+                    PC_MSG_DICT_UNSERIALIZE_FAIL, NULL);
+            goto out;
+        }
+    }
+
+    /* Both lookups are best-effort: a missing key is only logged. */
+    ret = dict_get_str_sizen(reply, "ERROR", &remote_error);
+    if (ret < 0) {
+        gf_smsg(this->name, GF_LOG_WARNING, EINVAL, PC_MSG_DICT_GET_FAILED,
+                "ERROR string", NULL);
+    }
+
+    /* NOTE(review): process_uuid is only referenced by the commented-out
+     * setpeer code further down. */
+    ret = dict_get_str_sizen(reply, "process-uuid", &process_uuid);
+    if (ret < 0) {
+        gf_smsg(this->name, GF_LOG_WARNING, EINVAL, PC_MSG_DICT_GET_FAILED,
+                "process-uuid", NULL);
+    }
+
+    if (op_ret < 0) {
+        gf_smsg(this->name, GF_LOG_ERROR, op_errno, PC_MSG_SETVOLUME_FAIL,
+                "remote-error=%s", remote_error, NULL);
+
+        errno = op_errno;
+        /* Clearing op_ret here makes the exit path skip the
+         * CHILD_CONNECTING/disconnect handling and report only the
+         * authentication failure. */
+        if (remote_error && (op_errno == EACCES)) {
+            auth_fail = _gf_true;
+            op_ret = 0;
+        }
+        if ((op_errno == ENOENT) && this->ctx->cmd_args.subdir_mount &&
+            (ctx->graph_id <= 1)) {
+            /* A case of subdir not being present at the moment,
+               ride on auth_fail framework to notify the error */
+            /* Make sure this case is handled only in the new
+               graph, so mount may fail in this case. In case
+               of 'add-brick' etc, we need to continue retry */
+            auth_fail = _gf_true;
+            op_ret = 0;
+        }
+        if (op_errno == ESTALE) {
+            ret = client_notify_dispatch(this, GF_EVENT_VOLFILE_MODIFIED, NULL);
+            if (ret)
+                gf_smsg(this->name, GF_LOG_INFO, 0,
+                        PC_MSG_VOLFILE_NOTIFY_FAILED, NULL);
+        }
+        goto out;
+    }
+
+    ret = dict_get_str_sizen(this->options, "remote-subvolume", &remote_subvol);
+    if (ret || !remote_subvol) {
+        gf_smsg(this->name, GF_LOG_WARNING, 0, PC_MSG_FIND_KEY_FAILED,
+                "remote-subvolume", NULL);
+        goto out;
+    }
+
+    ret = dict_get_str_sizen(reply, "volume-id", &volume_id);
+    if (ret < 0) {
+        /* this can happen if the server is of old version, so treat it as
+           just debug message */
+        gf_msg_debug(this->name, EINVAL,
+                     "failed to get 'volume-id' from reply dict");
+    } else if (ctx->master && strncmp("snapd", remote_subvol, 5)) {
+        /* TODO: if it is a fuse mount or a snapshot enabled client, don't
+           bother */
+        /* If any value is set, the first element will be non-0.
+           It would be '0', but not '\0' :-) */
+        if (ctx->volume_id[0]) {
+            if (strcmp(ctx->volume_id, volume_id)) {
+                /* Ideally it shouldn't even come here, as server itself
+                   should fail the handshake in that case */
+                gf_smsg(this->name, GF_LOG_ERROR, EINVAL, PC_MSG_VOL_ID_CHANGED,
+                        "vol-id=%s", volume_id, "ctx->vol-id=%s",
+                        ctx->volume_id, NULL);
+                op_ret = -1;
+                goto out;
+            }
+        } else {
+            /* First volume-id seen by this process: remember it.
+             * NOTE(review): strncpy does not NUL-terminate when the source
+             * is GF_UUID_BUF_SIZE bytes or longer - confirm the server
+             * string always fits. */
+            strncpy(ctx->volume_id, volume_id, GF_UUID_BUF_SIZE);
+        }
+    }
+
+    uint32_t child_up_int;
+    ret = dict_get_uint32(reply, "child_up", &child_up_int);
+    if (ret) {
+        /*
+         * This would happen in cases where the server trying to     *
+         * connect to this client is running an older version. Hence *
+         * setting the child_up to _gf_true in this case.            *
+         */
+        gf_smsg(this->name, GF_LOG_WARNING, 0, PC_MSG_FIND_KEY_FAILED,
+                "child_up", NULL);
+        conf->child_up = _gf_true;
+    } else {
+        conf->child_up = (child_up_int != 0);
+    }
+
+    /* TODO: currently setpeer path is broken */
+    /*
+    if (process_uuid && req->conn &&
+        !strcmp (this->ctx->process_uuid, process_uuid)) {
+            rpc_transport_t      *peer_trans    = NULL;
+            uint64_t              peertrans_int = 0;
+
+            ret = dict_get_uint64 (reply, "transport-ptr",
+                                   &peertrans_int);
+            if (ret)
+                    goto out;
+
+            gf_log (this->name, GF_LOG_WARNING,
+                    "attaching to the local volume '%s'",
+                    remote_subvol);
+
+            peer_trans = (void *) (long) (peertrans_int);
+
+            rpc_transport_setpeer (req->conn->trans, peer_trans);
+    }
+    */
+
+    conf->client_id = glusterfs_leaf_position(this);
+
+    gf_smsg(this->name, GF_LOG_INFO, 0, PC_MSG_REMOTE_VOL_CONNECTED,
+            "conn-name=%s", conf->rpc->conn.name, "remote_subvol=%s",
+            remote_subvol, NULL);
+
+    /* Handshake succeeded: mark the connection usable and continue with
+     * the post-handshake work. */
+    op_ret = 0;
+    conf->connected = 1;
+
+    client_post_handshake(frame, frame->this);
+out:
+    if (auth_fail) {
+        gf_smsg(this->name, GF_LOG_INFO, 0, PC_MSG_AUTH_FAILED, NULL);
+        ret = client_notify_dispatch(this, GF_EVENT_AUTH_FAILED, NULL);
+        if (ret)
+            gf_smsg(this->name, GF_LOG_INFO, 0,
+                    PC_MSG_AUTH_FAILED_NOTIFY_FAILED, NULL);
+        conf->connected = 0;
+        ret = -1;
+    }
+    if (-1 == op_ret) {
+        /* Let the connection/re-connection happen in
+         * background, for now, don't hang here,
+         * tell the parents that i am all ok..
+         */
+        gf_smsg(this->name, GF_LOG_INFO, 0, PC_MSG_CHILD_CONNECTING_EVENT,
+                NULL);
+        ret = client_notify_dispatch(this, GF_EVENT_CHILD_CONNECTING, NULL);
+        if (ret)
+            gf_smsg(this->name, GF_LOG_INFO, 0,
+                    PC_MSG_CHILD_CONNECTING_NOTIFY_FAILED, NULL);
+        /*
+         * The reconnection *won't* happen in the background (see
+         * previous comment) unless we kill the current connection.
+         */
+        rpc_transport_disconnect(conf->rpc->conn.trans, _gf_false);
+        ret = 0;
+    }
+
+    /* Released with plain free(), not GF_FREE(): presumably the buffer was
+     * allocated by the XDR decoder - confirm. */
+    free(rsp.dict.dict_val);
+
+    STACK_DESTROY(frame->root);
+
+    if (reply)
+        dict_unref(reply);
+
+    return ret;
+}
+
+/*
+ * Build and submit the SETVOLUME handshake request for this client xlator.
+ *
+ * The request dictionary is this->options, extended in place with the
+ * negotiated program numbers, a unique "process-uuid" (a per-call counter is
+ * appended so every connection id differs), the client version, volume-id,
+ * volfile key/checksum, optional subdir-mount and the op-version. The
+ * serialized dictionary is sent through client_submit_request() with
+ * client_setvolume_cbk() as the reply callback.
+ *
+ * @this: client xlator; this->private must be a valid clnt_conf_t.
+ * @rpc:  unused here.
+ *
+ * Returns the result of client_submit_request() when the request is
+ * submitted, or a negative value when the request could not be built. Every
+ * early-exit path sets a negative 'ret' explicitly, so a stale 0 from an
+ * earlier successful call is never reported as success.
+ */
+int
+client_setvolume(xlator_t *this, struct rpc_clnt *rpc)
+{
+    int ret = 0;
+    gf_setvolume_req req = {
+        {
+            0,
+        },
+    };
+    call_frame_t *fr = NULL;
+    char *process_uuid_xl = NULL;
+    char *remote_subvol = NULL;
+    clnt_conf_t *conf = this->private;
+    dict_t *options = this->options;
+    char counter_str[32] = {0};
+    char hostname[256] = {
+        0,
+    };
+
+    if (conf->fops) {
+        ret = dict_set_int32_sizen(options, "fops-version",
+                                   conf->fops->prognum);
+        if (ret < 0) {
+            gf_smsg(this->name, GF_LOG_ERROR, 0, PC_MSG_DICT_SET_FAILED,
+                    "version-fops=%d", conf->fops->prognum, NULL);
+            goto fail;
+        }
+    }
+
+    if (conf->mgmt) {
+        ret = dict_set_int32_sizen(options, "mgmt-version",
+                                   conf->mgmt->prognum);
+        if (ret < 0) {
+            gf_smsg(this->name, GF_LOG_ERROR, 0, PC_MSG_DICT_SET_FAILED,
+                    "version-mgmt=%d", conf->mgmt->prognum, NULL);
+            goto fail;
+        }
+    }
+
+    /*
+     * Connection-id should always be unique so that server never gets to
+     * reuse the previous connection resources so it cleans up the resources
+     * on every disconnect. Otherwise it may lead to stale resources, i.e.
+     * leaked file descriptors, inode/entry locks
+     */
+
+    snprintf(counter_str, sizeof(counter_str), "-%" PRIu64, conf->setvol_count);
+    conf->setvol_count++;
+
+    /* Pass one byte less than the buffer size: the buffer is zero-filled,
+     * so the name stays NUL-terminated even if gethostname() truncates. */
+    if (gethostname(hostname, sizeof(hostname) - 1) == -1) {
+        gf_smsg(this->name, GF_LOG_ERROR, errno, PC_MSG_GETHOSTNAME_FAILED,
+                NULL);
+
+        ret = -1;
+        goto fail;
+    }
+
+    ret = gf_asprintf(&process_uuid_xl, GLUSTER_PROCESS_UUID_FMT,
+                      this->ctx->process_uuid, this->graph->id, getpid(),
+                      hostname, this->name, counter_str);
+    if (-1 == ret) {
+        gf_smsg(this->name, GF_LOG_ERROR, 0, PC_MSG_PROCESS_UUID_SET_FAIL,
+                NULL);
+        goto fail;
+    }
+
+    ret = dict_set_dynstr_sizen(options, "process-uuid", process_uuid_xl);
+    if (ret < 0) {
+        gf_smsg(this->name, GF_LOG_ERROR, 0, PC_MSG_DICT_SET_FAILED,
+                "process-uuid=%s", process_uuid_xl, NULL);
+        goto fail;
+    }
+
+    /* The following keys are informational: failures are only logged. */
+    if (this->ctx->cmd_args.process_name) {
+        ret = dict_set_str_sizen(options, "process-name",
+                                 this->ctx->cmd_args.process_name);
+        if (ret < 0) {
+            gf_smsg(this->name, GF_LOG_INFO, 0, PC_MSG_DICT_SET_FAILED,
+                    "process-name", NULL);
+        }
+    }
+
+    ret = dict_set_str_sizen(options, "client-version", PACKAGE_VERSION);
+    if (ret < 0) {
+        gf_smsg(this->name, GF_LOG_WARNING, 0, PC_MSG_DICT_SET_FAILED,
+                "client-version=%s", PACKAGE_VERSION, NULL);
+    }
+
+    ret = dict_get_str_sizen(this->options, "remote-subvolume", &remote_subvol);
+    if (ret || !remote_subvol) {
+        gf_smsg(this->name, GF_LOG_WARNING, 0, PC_MSG_FIND_KEY_FAILED,
+                "remote-subvolume", NULL);
+        /* 'ret' may be 0 here when the key exists but holds no string. */
+        ret = -1;
+        goto fail;
+    }
+
+    /* volume-id to be sent only for regular volume, not snap volume */
+    if (strncmp("snapd", remote_subvol, 5)) {
+        /* If any value is set, the first element will be non-0.
+           It would be '0', but not '\0' :-) */
+        if (!this->ctx->volume_id[0]) {
+            strncpy(this->ctx->volume_id, this->graph->volume_id,
+                    GF_UUID_BUF_SIZE);
+        }
+        if (this->ctx->volume_id[0]) {
+            ret = dict_set_str(options, "volume-id", this->ctx->volume_id);
+            if (ret < 0) {
+                gf_smsg(this->name, GF_LOG_INFO, 0, PC_MSG_DICT_SET_FAILED,
+                        "volume-id", NULL);
+            }
+        }
+    }
+
+    if (this->ctx->cmd_args.volfile_server) {
+        if (this->ctx->cmd_args.volfile_id) {
+            ret = dict_set_str_sizen(options, "volfile-key",
+                                     this->ctx->cmd_args.volfile_id);
+            if (ret)
+                gf_smsg(this->name, GF_LOG_ERROR, 0,
+                        PC_MSG_VOLFILE_KEY_SET_FAILED, NULL);
+        }
+        ret = dict_set_uint32(options, "volfile-checksum",
+                              this->graph->volfile_checksum);
+        if (ret)
+            gf_smsg(this->name, GF_LOG_ERROR, 0, PC_MSG_VOLFILE_CHECKSUM_FAILED,
+                    NULL);
+    }
+
+    if (this->ctx->cmd_args.subdir_mount) {
+        ret = dict_set_str_sizen(options, "subdir-mount",
+                                 this->ctx->cmd_args.subdir_mount);
+        if (ret) {
+            gf_log(THIS->name, GF_LOG_ERROR, "Failed to set subdir_mount");
+            /* It makes sense to fail, as per the CLI, we
+               should be doing a subdir_mount */
+            goto fail;
+        }
+    }
+
+    /* Insert a dummy key value pair to avoid failure at server side for
+     * clnt-lk-version with new clients.
+     */
+    ret = dict_set_uint32(options, "clnt-lk-version", 1);
+    if (ret < 0) {
+        gf_smsg(this->name, GF_LOG_WARNING, 0, PC_MSG_DICT_SET_FAILED,
+                "clnt-lk-version(1)", NULL);
+    }
+
+    ret = dict_set_int32_sizen(options, "opversion", GD_OP_VERSION_MAX);
+    if (ret < 0) {
+        gf_smsg(this->name, GF_LOG_ERROR, 0, PC_MSG_DICT_SET_FAILED,
+                "client opversion", NULL);
+    }
+
+    ret = dict_allocate_and_serialize(options, (char **)&req.dict.dict_val,
+                                      &req.dict.dict_len);
+    if (ret != 0) {
+        ret = -1;
+        gf_smsg(this->name, GF_LOG_ERROR, 0, PC_MSG_DICT_SERIALIZE_FAIL, NULL);
+        goto fail;
+    }
+
+    fr = create_frame(this, this->ctx->pool);
+    if (!fr) {
+        /* 'ret' is 0 from the successful serialization above. */
+        ret = -1;
+        goto fail;
+    }
+
+    ret = client_submit_request(this, &req, fr, conf->handshake,
+                                GF_HNDSK_SETVOLUME, client_setvolume_cbk, NULL,
+                                (xdrproc_t)xdr_gf_setvolume_req);
+
+fail:
+    /* The serialized buffer is only needed until the request is submitted. */
+    GF_FREE(req.dict.dict_val);
+
+    return ret;
+}
+
+/*
+ * Walk the server's advertised program list and pick the FOP program this
+ * client will speak.
+ *
+ * Both the 3.3 and the 4.0 FOP programs are recognised. A match records the
+ * program in conf->fops and, when an rpc object exists, the matching auth
+ * flavour. The walk stops at a 3.3 match only when conf->old_protocol is
+ * set, and at a 4.0 match only when it is not; otherwise it keeps going.
+ *
+ * Returns 0 if at least one supported program was seen, -1 otherwise
+ * (including when 'this' or 'prog' is NULL).
+ */
+static int
+select_server_supported_programs(xlator_t *this, gf_prog_detail *prog)
+{
+    clnt_conf_t *priv = NULL;
+    gf_prog_detail *cur = NULL;
+    int status = -1;
+
+    if (!this || !prog) {
+        gf_smsg(THIS->name, GF_LOG_WARNING, 0, PC_MSG_PGM_NOT_FOUND, NULL);
+        return status;
+    }
+
+    priv = this->private;
+
+    for (cur = prog; cur; cur = cur->next) {
+        /* Older 3.3 protocol: final only when old-protocol is requested. */
+        if ((cur->prognum == clnt3_3_fop_prog.prognum) &&
+            (cur->progver == clnt3_3_fop_prog.progver)) {
+            priv->fops = &clnt3_3_fop_prog;
+            if (priv->rpc)
+                priv->rpc->auth_value = AUTH_GLUSTERFS_v2;
+            status = 0;
+            if (priv->old_protocol)
+                break;
+        }
+
+        /* Latest 4.0 protocol: final unless old-protocol is requested. */
+        if ((cur->prognum == clnt4_0_fop_prog.prognum) &&
+            (cur->progver == clnt4_0_fop_prog.progver)) {
+            priv->fops = &clnt4_0_fop_prog;
+            if (priv->rpc)
+                priv->rpc->auth_value = AUTH_GLUSTERFS_v3;
+            status = 0;
+            if (!priv->old_protocol)
+                break;
+        }
+
+        if (status != 0) {
+            gf_msg_debug(this->name, 0, "%s (%" PRId64 ") not supported",
+                         cur->progname, cur->progver);
+        }
+    }
+
+    if (status == 0)
+        gf_smsg(this->name, GF_LOG_INFO, 0, PC_MSG_VERSION_INFO,
+                "Program-name=%s", priv->fops->progname, "Num=%d",
+                priv->fops->prognum, "Version=%d", priv->fops->progver, NULL);
+
+    return status;
+}
+
+/*
+ * Tell whether the server's program list contains the portmap program
+ * (GLUSTER_PMAP_PROGRAM at GLUSTER_PMAP_VERSION).
+ *
+ * Returns 0 when it is present, -1 when it is absent or when either
+ * argument is NULL.
+ */
+int
+server_has_portmap(xlator_t *this, gf_prog_detail *prog)
+{
+    gf_prog_detail *cur = NULL;
+
+    if (!this || !prog) {
+        gf_smsg(THIS->name, GF_LOG_WARNING, 0, PC_MSG_PGM_NOT_FOUND, NULL);
+        return -1;
+    }
+
+    for (cur = prog; cur; cur = cur->next) {
+        if ((cur->prognum != GLUSTER_PMAP_PROGRAM) ||
+            (cur->progver != GLUSTER_PMAP_VERSION))
+            continue;
+
+        gf_msg_debug(this->name, 0, "detected portmapper on server");
+        return 0;
+    }
+
+    return -1;
+}
+
+/*
+ * Callback for the PORTBYBRICK query sent by client_query_portmap().
+ *
+ * On a successful reply the rpc client is reconfigured with the returned
+ * port and the skip_notify/quick_reconnect flags are raised. Whenever the
+ * xlator configuration could be located, the current transport is
+ * disconnected on the way out, success or not.
+ *
+ * Returns the decode result on success, -1 on any failure.
+ */
+int
+client_query_portmap_cbk(struct rpc_req *req, struct iovec *iov, int count,
+                         void *myframe)
+{
+    call_frame_t *frame = myframe;
+    xlator_t *this = NULL;
+    clnt_conf_t *priv = NULL;
+    int status = -1;
+    struct pmap_port_by_brick_rsp reply = {
+        0,
+    };
+    struct rpc_clnt_config new_cfg = {
+        0,
+    };
+
+    if (!frame || !frame->this || !frame->this->private) {
+        gf_smsg(THIS->name, GF_LOG_WARNING, EINVAL, PC_MSG_FRAME_NOT_FOUND,
+                NULL);
+        goto out;
+    }
+
+    this = frame->this;
+    priv = this->private;
+
+    if (req->rpc_status == -1) {
+        gf_smsg(this->name, GF_LOG_WARNING, ENOTCONN, PC_MSG_RPC_STATUS_ERROR,
+                NULL);
+        goto out;
+    }
+
+    status = xdr_to_generic(*iov, &reply,
+                            (xdrproc_t)xdr_pmap_port_by_brick_rsp);
+    if (status < 0) {
+        gf_smsg(this->name, GF_LOG_ERROR, EINVAL, PC_MSG_XDR_DECODING_FAILED,
+                NULL);
+        goto out;
+    }
+
+    if (reply.op_ret == -1) {
+        status = -1;
+        /* Log at error level only once; repeats go to debug. */
+        if (priv->portmap_err_logged) {
+            gf_msg_debug(this->name, 0,
+                         "failed to get the port number for "
+                         "remote subvolume. Please run 'gluster "
+                         "volume status' on server to see "
+                         "if brick process is running.");
+        } else {
+            gf_smsg(this->name, GF_LOG_ERROR, 0, PC_MSG_PORT_NUM_ERROR, NULL);
+        }
+        priv->portmap_err_logged = 1;
+        goto out;
+    }
+
+    /* Port obtained: reset the error latches and retarget the rpc client. */
+    priv->portmap_err_logged = 0;
+    priv->disconnect_err_logged = 0;
+    new_cfg.remote_port = reply.port;
+    rpc_clnt_reconfig(priv->rpc, &new_cfg);
+
+    priv->skip_notify = 1;
+    priv->quick_reconnect = 1;
+
+out:
+    if (frame)
+        STACK_DESTROY(frame->root);
+
+    if (priv) {
+        /* Drop the current connection so the same transport can be
+         * connected on a different port (glusterd -> glusterfsd). */
+        rpc_transport_disconnect(priv->rpc->conn.trans, _gf_false);
+    }
+
+    return status;
+}
+
+/*
+ * Ask the portmapper which port serves this client's remote brick.
+ *
+ * The brick name is the "remote-subvolume" option; for the "rdma"
+ * transport type ".rdma" is appended. The reply is handled by
+ * client_query_portmap_cbk().
+ *
+ * Returns the result of client_submit_request(), the negative lookup
+ * result when "remote-subvolume" is missing, or -1 when no frame could be
+ * created.
+ */
+int
+client_query_portmap(xlator_t *this, struct rpc_clnt *rpc)
+{
+    dict_t *opts = this->options;
+    call_frame_t *frame = NULL;
+    char *subvol = NULL;
+    char *transport = NULL;
+    int status = -1;
+    char rdma_brick[PATH_MAX] = {
+        0,
+    };
+    pmap_port_by_brick_req query = {
+        0,
+    };
+
+    status = dict_get_str_sizen(opts, "remote-subvolume", &subvol);
+    if (status < 0) {
+        gf_smsg(this->name, GF_LOG_ERROR, 0, PC_MSG_REMOTE_SUBVOL_SET_FAIL,
+                NULL);
+        return status;
+    }
+
+    query.brick = subvol;
+
+    status = dict_get_str_sizen(opts, "transport-type", &transport);
+    if ((status == 0) && (strcmp(transport, "rdma") == 0)) {
+        snprintf(rdma_brick, sizeof(rdma_brick), "%s.rdma", subvol);
+        query.brick = rdma_brick;
+    }
+
+    frame = create_frame(this, this->ctx->pool);
+    if (!frame)
+        return -1;
+
+    return client_submit_request(this, &query, frame, &clnt_pmap_prog,
+                                 GF_PMAP_PORTBYBRICK, client_query_portmap_cbk,
+                                 NULL, (xdrproc_t)xdr_pmap_port_by_brick_req);
+}
+
+/*
+ * Callback for the DUMP request sent by client_handshake().
+ *
+ * Decodes the server's program list and then either:
+ *   - queries the portmapper when the server advertises the portmap
+ *     program, or
+ *   - selects a supported FOP program and sends SETVOLUME.
+ *
+ * The decoded program list and the frame are always released. The
+ * transport is disconnected when 'ret' is non-zero at exit.
+ *
+ * Returns the final value of 'ret'.
+ */
+static int
+client_dump_version_cbk(struct rpc_req *req, struct iovec *iov, int count,
+                        void *myframe)
+{
+    gf_dump_rsp rsp = {
+        0,
+    };
+    gf_prog_detail *trav = NULL;
+    gf_prog_detail *next = NULL;
+    call_frame_t *frame = NULL;
+    clnt_conf_t *conf = NULL;
+    int ret = 0;
+
+    frame = myframe;
+    conf = frame->this->private;
+
+    /* 'ret' is still 0 on this path, so no disconnect is issued at exit. */
+    if (-1 == req->rpc_status) {
+        gf_smsg(frame->this->name, GF_LOG_WARNING, ENOTCONN,
+                PC_MSG_RPC_STATUS_ERROR, NULL);
+        goto out;
+    }
+
+    ret = xdr_to_generic(*iov, &rsp, (xdrproc_t)xdr_gf_dump_rsp);
+    if (ret < 0) {
+        gf_smsg(frame->this->name, GF_LOG_ERROR, EINVAL,
+                PC_MSG_XDR_DECODING_FAILED, NULL);
+        goto out;
+    }
+    /* NOTE(review): 'ret' keeps the xdr_to_generic() result on this path;
+     * whether the exit path disconnects depends on that value being
+     * non-zero - confirm what xdr_to_generic() returns on success. */
+    if (-1 == rsp.op_ret) {
+        gf_smsg(frame->this->name, GF_LOG_WARNING, 0, PC_MSG_VERSION_ERROR,
+                NULL);
+        goto out;
+    }
+
+    /* A server that advertises portmap is asked for the brick port first;
+     * program selection and SETVOLUME are skipped on this path. */
+    if (server_has_portmap(frame->this, rsp.prog) == 0) {
+        ret = client_query_portmap(frame->this, conf->rpc);
+        goto out;
+    }
+
+    /* Check for the proper version string */
+    /* Reply in "Name:Program-Number:Program-Version,..." format */
+    ret = select_server_supported_programs(frame->this, rsp.prog);
+    if (ret) {
+        gf_smsg(frame->this->name, GF_LOG_ERROR, 0, PC_MSG_VERSION_ERROR, NULL);
+        goto out;
+    }
+
+    /* The return value is ignored; 'ret' stays 0 from the selection above. */
+    client_setvolume(frame->this, conf->rpc);
+
+out:
+    /* don't use GF_FREE, buffer was allocated by libc */
+    if (rsp.prog) {
+        trav = rsp.prog;
+        while (trav) {
+            /* Save the link before the node is released. */
+            next = trav->next;
+            free(trav->progname);
+            free(trav);
+            trav = next;
+        }
+    }
+
+    STACK_DESTROY(frame->root);
+
+    if (ret != 0)
+        rpc_transport_disconnect(conf->rpc->conn.trans, _gf_false);
+
+    return ret;
+}
+
+/*
+ * Start the client handshake by sending a DUMP request; the reply is
+ * handled by client_dump_version_cbk().
+ *
+ * @this: client xlator; this->private must be a valid clnt_conf_t.
+ * @rpc:  unused here.
+ *
+ * Returns the result of client_submit_request() when the request is
+ * submitted, or -1 when nothing was sent (handshake program missing or
+ * frame allocation failed). 'ret' starts at -1 so these early exits are not
+ * reported as success.
+ */
+int
+client_handshake(xlator_t *this, struct rpc_clnt *rpc)
+{
+    call_frame_t *frame = NULL;
+    clnt_conf_t *conf = NULL;
+    gf_dump_req req = {
+        0,
+    };
+    int ret = -1;
+
+    conf = this->private;
+    if (!conf->handshake) {
+        gf_smsg(this->name, GF_LOG_WARNING, 0, PC_MSG_HANDSHAKE_PGM_NOT_FOUND,
+                NULL);
+        goto out;
+    }
+
+    frame = create_frame(this, this->ctx->pool);
+    if (!frame)
+        goto out;
+
+    req.gfs_id = 0xbabe;
+    ret = client_submit_request(this, &req, frame, conf->dump, GF_DUMP_DUMP,
+                                client_dump_version_cbk, NULL,
+                                (xdrproc_t)xdr_gf_dump_req);
+
+out:
+    return ret;
+}
+
+/* Procedure names of the handshake program, indexed by procedure number. */
+char *clnt_handshake_procs[GF_HNDSK_MAXVALUE] = {
+    [GF_HNDSK_NULL] = "NULL",
+    [GF_HNDSK_SETVOLUME] = "SETVOLUME",
+    [GF_HNDSK_GETSPEC] = "GETSPEC",
+    [GF_HNDSK_PING] = "PING",
+};
+
+/* Descriptor of the handshake RPC program.
+ * NOTE(review): presumably what conf->handshake points at - confirm in the
+ * xlator init code. */
+rpc_clnt_prog_t clnt_handshake_prog = {
+    .progname = "GlusterFS Handshake",
+    .prognum = GLUSTER_HNDSK_PROGRAM,
+    .progver = GLUSTER_HNDSK_VERSION,
+    .procnames = clnt_handshake_procs,
+};
+
+/* Procedure names of the DUMP program, indexed by procedure number. */
+char *clnt_dump_proc[GF_DUMP_MAXVALUE] = {
+    [GF_DUMP_NULL] = "NULL",
+    [GF_DUMP_DUMP] = "DUMP",
+};
+
+/* Descriptor of the DUMP RPC program.
+ * NOTE(review): presumably what conf->dump points at - confirm in the
+ * xlator init code. */
+rpc_clnt_prog_t clnt_dump_prog = {
+    .progname = "GF-DUMP",
+    .prognum = GLUSTER_DUMP_PROGRAM,
+    .progver = GLUSTER_DUMP_VERSION,
+    .procnames = clnt_dump_proc,
+};
+
+/* Procedure names of the portmap program; only PORTBYBRICK is named. */
+char *clnt_pmap_procs[GF_PMAP_MAXVALUE] = {
+    [GF_PMAP_PORTBYBRICK] = "PORTBYBRICK",
+};
+
+/* Descriptor of the portmap RPC program, used by client_query_portmap(). */
+rpc_clnt_prog_t clnt_pmap_prog = {
+    .progname = "PORTMAP",
+    .prognum = GLUSTER_PMAP_PROGRAM,
+    .progver = GLUSTER_PMAP_VERSION,
+    .procnames = clnt_pmap_procs,
+};
diff --git a/xlators/protocol/client/src/client-helpers.c b/xlators/protocol/client/src/client-helpers.c
new file mode 100644
index 00000000000..189dfddd021
--- /dev/null
+++ b/xlators/protocol/client/src/client-helpers.c
@@ -0,0 +1,899 @@
+/*
+  Copyright (c) 2008-2012 Red Hat, Inc. <http://www.redhat.com>
+  This file is part of GlusterFS.
+
+  This file is licensed to you under your choice of the GNU Lesser
+  General Public License, version 3 or any later version (LGPLv3 or
+  later), or the GNU General Public License, version 2 (GPLv2), in all
+  cases as published by the Free Software Foundation.
+*/
+
+#include "client.h"
+#include <glusterfs/fd.h>
+#include "client-messages.h"
+#include "client-common.h"
+#include <glusterfs/compat-errno.h>
+#include <glusterfs/common-utils.h>
+
+/*
+ * Report whether the lock list of an fd lock context is empty.
+ *
+ * @lk_ctx:   lock context to inspect.
+ * @try_lock: when true the context lock is only tried; otherwise the call
+ *            takes the lock unconditionally.
+ *
+ * Returns the list_empty() result for the list, or -1 when lk_ctx is NULL
+ * or the try-lock attempt fails.
+ */
+int
+client_fd_lk_list_empty(fd_lk_ctx_t *lk_ctx, gf_boolean_t try_lock)
+{
+    int rc = 0;
+    int is_empty = 0;
+
+    if (lk_ctx == NULL)
+        return -1;
+
+    if (!try_lock) {
+        LOCK(&lk_ctx->lock);
+    } else {
+        rc = TRY_LOCK(&lk_ctx->lock);
+        if (rc != 0)
+            return -1;
+    }
+
+    is_empty = list_empty(&lk_ctx->lk_list);
+    UNLOCK(&lk_ctx->lock);
+
+    return is_empty;
+}
+
+/*
+ * Remove this xlator's context from an fd and hand it back to the caller.
+ *
+ * Returns the stored clnt_fd_ctx_t pointer, or NULL when an argument is
+ * NULL or fd_ctx_del() reports a failure.
+ */
+clnt_fd_ctx_t *
+this_fd_del_ctx(fd_t *file, xlator_t *this)
+{
+    clnt_fd_ctx_t *fdctx = NULL;
+    uint64_t value = 0;
+
+    GF_VALIDATE_OR_GOTO("client", this, out);
+    GF_VALIDATE_OR_GOTO(this->name, file, out);
+
+    /* The context is stored as an integer; convert it back on success. */
+    if (fd_ctx_del(file, this, &value) >= 0)
+        fdctx = (clnt_fd_ctx_t *)(unsigned long)value;
+
+out:
+    return fdctx;
+}
+
+/*
+ * Look up this xlator's context on an fd without removing it.
+ *
+ * Returns the stored clnt_fd_ctx_t pointer, or NULL when an argument is
+ * NULL or fd_ctx_get() reports a failure.
+ */
+clnt_fd_ctx_t *
+this_fd_get_ctx(fd_t *file, xlator_t *this)
+{
+    clnt_fd_ctx_t *fdctx = NULL;
+    uint64_t value = 0;
+
+    GF_VALIDATE_OR_GOTO("client", this, out);
+    GF_VALIDATE_OR_GOTO(this->name, file, out);
+
+    /* The context is stored as an integer; convert it back on success. */
+    if (fd_ctx_get(file, this, &value) >= 0)
+        fdctx = (clnt_fd_ctx_t *)(unsigned long)value;
+
+out:
+    return fdctx;
+}
+
+/*
+ * Store 'ctx' as this xlator's context on an fd.
+ *
+ * An already-present context is logged (INFO) and then overwritten; a
+ * failure to store is logged as a warning. When 'loc' is given the log
+ * lines carry its path and gfid, otherwise the fd pointer. Nothing is
+ * done when 'this' or 'file' is NULL.
+ */
+void
+this_fd_set_ctx(fd_t *file, xlator_t *this, loc_t *loc, clnt_fd_ctx_t *ctx)
+{
+    uint64_t existing = 0;
+
+    GF_VALIDATE_OR_GOTO("client", this, out);
+    GF_VALIDATE_OR_GOTO(this->name, file, out);
+
+    if (fd_ctx_get(file, this, &existing) >= 0) {
+        if (!loc) {
+            gf_smsg(this->name, GF_LOG_INFO, 0, PC_MSG_FD_DUPLICATE_TRY,
+                    "file=%p", file, NULL);
+        } else {
+            gf_smsg(this->name, GF_LOG_INFO, 0, PC_MSG_FD_DUPLICATE_TRY,
+                    "path=%s", loc->path, "gfid=%s",
+                    uuid_utoa(loc->inode->gfid), NULL);
+        }
+    }
+
+    if (fd_ctx_set(file, this, (uint64_t)(unsigned long)ctx) < 0) {
+        if (!loc) {
+            gf_smsg(this->name, GF_LOG_WARNING, 0, PC_MSG_FD_SET_FAIL,
+                    "file=%p", file, NULL);
+        } else {
+            gf_smsg(this->name, GF_LOG_WARNING, 0, PC_MSG_FD_SET_FAIL,
+                    "path=%s", loc->path, "gfid=%s",
+                    uuid_utoa(loc->inode->gfid), NULL);
+        }
+    }
+
+out:
+    return;
+}
+
+/*
+ * Release everything owned by a clnt_local_t and return it to its pool.
+ * A NULL 'local' is accepted and ignored. Always returns 0.
+ */
+int
+client_local_wipe(clnt_local_t *local)
+{
+    if (!local)
+        return 0;
+
+    loc_wipe(&local->loc);
+    loc_wipe(&local->loc2);
+
+    if (local->fd)
+        fd_unref(local->fd);
+
+    if (local->iobref)
+        iobref_unref(local->iobref);
+
+    GF_FREE(local->name);
+    mem_put(local);
+
+    return 0;
+}
+/*
+ * Convert the entry list of a gfs3 READDIR reply into gf_dirent_t nodes
+ * appended to 'entries'.
+ *
+ * Each offset goes through gf_itransform() with this client's id.
+ *
+ * Returns 0 on success, -1 when an entry cannot be allocated (entries
+ * converted so far stay on the list).
+ */
+int
+unserialize_rsp_dirent(xlator_t *this, struct gfs3_readdir_rsp *rsp,
+                       gf_dirent_t *entries)
+{
+    clnt_conf_t *priv = this->private;
+    struct gfs3_dirlist *node = NULL;
+    gf_dirent_t *dirent = NULL;
+    int alloc_len = 0;
+
+    for (node = rsp->reply; node; node = node->nextentry) {
+        alloc_len = gf_dirent_size(node->name);
+        dirent = GF_CALLOC(1, alloc_len, gf_common_mt_gf_dirent_t);
+        if (!dirent)
+            return -1;
+
+        dirent->d_ino = node->d_ino;
+        gf_itransform(this, node->d_off, &dirent->d_off, priv->client_id);
+        dirent->d_len = node->d_len;
+        dirent->d_type = node->d_type;
+
+        /* The allocation was sized from this same name. */
+        strcpy(dirent->d_name, node->name);
+
+        list_add_tail(&dirent->list, &entries->list);
+    }
+
+    return 0;
+}
+
+/*
+ * Convert the entry list of a gfs3 READDIRP reply into gf_dirent_t nodes
+ * appended to 'entries'.
+ *
+ * For every entry the stat is converted to an iatt, the optional xattr
+ * dictionary is unserialized, and an inode is attached: an existing one
+ * found by gfid in the fd's inode table, otherwise a new one.
+ *
+ * @this:    client xlator (this->private must be set, else -1).
+ * @fd:      directory fd whose inode table is used; may be NULL, in which
+ *           case a NULL table is handed to inode_find()/inode_new().
+ * @rsp:     decoded reply.
+ * @entries: list head receiving the converted entries.
+ *
+ * Returns 0 on success and a negative value on failure. Allocation
+ * failures set -1 explicitly: 'ret' is also used for dict_unserialize(),
+ * so without that a failure on a later entry would return the earlier
+ * non-negative result. An entry that was allocated but not yet linked is
+ * freed on the error path.
+ */
+int
+unserialize_rsp_direntp(xlator_t *this, fd_t *fd, struct gfs3_readdirp_rsp *rsp,
+                        gf_dirent_t *entries)
+{
+    struct gfs3_dirplist *trav = NULL;
+    gf_dirent_t *entry = NULL;
+    inode_table_t *itable = NULL;
+    int entry_len = 0;
+    int ret = -1;
+    clnt_conf_t *conf = NULL;
+
+    trav = rsp->reply;
+
+    if (fd)
+        itable = fd->inode->table;
+
+    conf = this->private;
+    if (!conf)
+        goto out;
+
+    while (trav) {
+        entry_len = gf_dirent_size(trav->name);
+        entry = GF_CALLOC(1, entry_len, gf_common_mt_gf_dirent_t);
+        if (!entry) {
+            ret = -1;
+            goto out;
+        }
+
+        entry->d_ino = trav->d_ino;
+        gf_itransform(this, trav->d_off, &entry->d_off, conf->client_id);
+        entry->d_len = trav->d_len;
+        entry->d_type = trav->d_type;
+
+        gf_stat_to_iatt(&trav->stat, &entry->d_stat);
+
+        /* The allocation was sized from this same name. */
+        strcpy(entry->d_name, trav->name);
+
+        if (trav->dict.dict_val) {
+            entry->dict = dict_new();
+            if (!entry->dict) {
+                ret = -1;
+                goto out;
+            }
+
+            ret = dict_unserialize(trav->dict.dict_val, trav->dict.dict_len,
+                                   &entry->dict);
+            if (ret < 0) {
+                gf_smsg(THIS->name, GF_LOG_WARNING, EINVAL,
+                        PC_MSG_DICT_UNSERIALIZE_FAIL, "xattr", NULL);
+                goto out;
+            }
+        }
+
+        entry->inode = inode_find(itable, entry->d_stat.ia_gfid);
+        if (!entry->inode)
+            entry->inode = inode_new(itable);
+
+        list_add_tail(&entry->list, &entries->list);
+
+        trav = trav->nextentry;
+        /* The list owns the entry now; keep the error path off it. */
+        entry = NULL;
+    }
+
+    ret = 0;
+out:
+    if (entry)
+        gf_dirent_entry_free(entry);
+    return ret;
+}
+
+/*
+ * Free the entry list of a decoded gfs3 READDIRP reply: each node's
+ * dictionary buffer, its name, and the node itself. rsp->reply is left
+ * untouched. Always returns 0.
+ */
+int
+clnt_readdirp_rsp_cleanup(gfs3_readdirp_rsp *rsp)
+{
+    gfs3_dirplist *node = rsp->reply;
+    gfs3_dirplist *following = NULL;
+
+    while (node != NULL) {
+        /* Save the link before the node is released. */
+        following = node->nextentry;
+
+        /* on client, the rpc lib allocates this */
+        free(node->dict.dict_val);
+        free(node->name);
+        free(node);
+
+        node = following;
+    }
+
+    return 0;
+}
+
+/*
+ * Convert the entry list of a gfx (v2) READDIR reply into gf_dirent_t
+ * nodes appended to 'entries'.
+ *
+ * Each offset goes through gf_itransform() with this client's id.
+ *
+ * Returns 0 on success, -1 when an entry cannot be allocated (entries
+ * converted so far stay on the list).
+ */
+int
+unserialize_rsp_dirent_v2(xlator_t *this, struct gfx_readdir_rsp *rsp,
+                          gf_dirent_t *entries)
+{
+    clnt_conf_t *priv = this->private;
+    struct gfx_dirlist *node = NULL;
+    gf_dirent_t *dirent = NULL;
+    int alloc_len = 0;
+
+    for (node = rsp->reply; node; node = node->nextentry) {
+        alloc_len = gf_dirent_size(node->name);
+        dirent = GF_CALLOC(1, alloc_len, gf_common_mt_gf_dirent_t);
+        if (!dirent)
+            return -1;
+
+        dirent->d_ino = node->d_ino;
+        gf_itransform(this, node->d_off, &dirent->d_off, priv->client_id);
+        dirent->d_len = node->d_len;
+        dirent->d_type = node->d_type;
+
+        /* The allocation was sized from this same name. */
+        strcpy(dirent->d_name, node->name);
+
+        list_add_tail(&dirent->list, &entries->list);
+    }
+
+    return 0;
+}
+
+/*
+ * Convert the entry list of a gfx (v2) READDIRP reply into gf_dirent_t
+ * nodes appended to 'entries'.
+ *
+ * For every entry the stat is converted, the dictionary is rebuilt with
+ * xdr_to_dict() (its result is not checked), and an inode is attached: an
+ * existing one found by gfid in the fd's inode table, otherwise a new one.
+ *
+ * Returns 0 on success, -1 when this->private is NULL or an entry cannot
+ * be allocated.
+ */
+int
+unserialize_rsp_direntp_v2(xlator_t *this, fd_t *fd,
+                           struct gfx_readdirp_rsp *rsp, gf_dirent_t *entries)
+{
+    clnt_conf_t *priv = NULL;
+    inode_table_t *table = NULL;
+    struct gfx_dirplist *node = NULL;
+    gf_dirent_t *dirent = NULL;
+    int alloc_len = 0;
+
+    if (fd)
+        table = fd->inode->table;
+
+    priv = this->private;
+    if (!priv)
+        return -1;
+
+    for (node = rsp->reply; node; node = node->nextentry) {
+        alloc_len = gf_dirent_size(node->name);
+        dirent = GF_CALLOC(1, alloc_len, gf_common_mt_gf_dirent_t);
+        if (!dirent)
+            return -1;
+
+        dirent->d_ino = node->d_ino;
+        gf_itransform(this, node->d_off, &dirent->d_off, priv->client_id);
+        dirent->d_len = node->d_len;
+        dirent->d_type = node->d_type;
+
+        gfx_stat_to_iattx(&node->stat, &dirent->d_stat);
+
+        /* The allocation was sized from this same name. */
+        strcpy(dirent->d_name, node->name);
+
+        xdr_to_dict(&node->dict, &dirent->dict);
+
+        /* Reuse a known inode for this gfid, else start a fresh one. */
+        dirent->inode = inode_find(table, dirent->d_stat.ia_gfid);
+        if (!dirent->inode)
+            dirent->inode = inode_new(table);
+
+        list_add_tail(&dirent->list, &entries->list);
+    }
+
+    return 0;
+}
+
+/*
+ * Free the entry list of a decoded gfx (v2) READDIRP reply: each node's
+ * name and the node itself. rsp->reply is left untouched. Always
+ * returns 0.
+ */
+int
+clnt_readdirp_rsp_cleanup_v2(gfx_readdirp_rsp *rsp)
+{
+    gfx_dirplist *node = rsp->reply;
+    gfx_dirplist *following = NULL;
+
+    while (node != NULL) {
+        /* Save the link before the node is released. */
+        following = node->nextentry;
+
+        free(node->name);
+        free(node);
+
+        node = following;
+    }
+
+    return 0;
+}
+
+/*
+ * Free the entry list of a decoded gfs3 READDIR reply: each node's name
+ * and the node itself. rsp->reply is left untouched. Always returns 0.
+ */
+int
+clnt_readdir_rsp_cleanup(gfs3_readdir_rsp *rsp)
+{
+    gfs3_dirlist *node = rsp->reply;
+    gfs3_dirlist *following = NULL;
+
+    while (node != NULL) {
+        /* Save the link before the node is released. */
+        following = node->nextentry;
+
+        /* on client, the rpc lib allocates this */
+        free(node->name);
+        free(node);
+
+        node = following;
+    }
+
+    return 0;
+}
+
+/*
+ * Free the entry list of a decoded gfx (v2) READDIR reply: each node's
+ * name and the node itself. rsp->reply is left untouched. Always
+ * returns 0.
+ */
+int
+clnt_readdir_rsp_cleanup_v2(gfx_readdir_rsp *rsp)
+{
+    gfx_dirlist *node = rsp->reply;
+    gfx_dirlist *following = NULL;
+
+    while (node != NULL) {
+        /* Save the link before the node is released. */
+        following = node->nextentry;
+
+        /* on client, the rpc lib allocates this */
+        free(node->name);
+        free(node);
+
+        node = following;
+    }
+
+    return 0;
+}
+
+/* Work out which remote fd number to use for {fd} and store it in
+ * {remote_fd}. Returns 0 on success, -1 when {fd} or {remote_fd} is NULL. */
+int
+client_get_remote_fd(xlator_t *this, fd_t *fd, int flags, int64_t *remote_fd)
+{
+    clnt_fd_ctx_t *ctx = NULL;
+    clnt_conf_t *priv = NULL;
+    gf_boolean_t has_locks = _gf_false;
+    int64_t resolved = -1;
+
+    GF_VALIDATE_OR_GOTO(this->name, fd, out);
+    GF_VALIDATE_OR_GOTO(this->name, remote_fd, out);
+
+    priv = this->private;
+    pthread_spin_lock(&priv->fd_lock);
+    {
+        ctx = this_fd_get_ctx(fd, this);
+        if (ctx) {
+            /* while a reopen is in progress the result stays at -1 */
+            if (!__is_fd_reopen_in_progress(ctx))
+                resolved = ctx->remote_fd;
+            has_locks = !list_empty(&ctx->lock_list);
+        } else if (fd->anonymous) {
+            resolved = GF_ANON_FD_NO;
+        } else {
+            gf_msg_debug(this->name, EBADF, "not a valid fd for gfid: %s",
+                         uuid_utoa(fd->inode->gfid));
+        }
+    }
+    pthread_spin_unlock(&priv->fd_lock);
+
+    /* optional fallback, never taken while locks are recorded on the fd */
+    if ((resolved == -1) && !has_locks && (flags & FALLBACK_TO_ANON_FD))
+        resolved = GF_ANON_FD_NO;
+
+    *remote_fd = resolved;
+    return 0;
+out:
+    return -1;
+}
+
+/* Return _gf_true only when {fd} has a client fd context whose remote_fd is
+ * -1 and the caller's {remote_fd} is GF_ANON_FD_NO; checked under fd_lock. */
+gf_boolean_t
+client_is_reopen_needed(fd_t *fd, xlator_t *this, int64_t remote_fd)
+{
+    clnt_conf_t *priv = this->private;
+    clnt_fd_ctx_t *ctx = NULL;
+    gf_boolean_t needed = _gf_false;
+
+    pthread_spin_lock(&priv->fd_lock);
+    {
+        ctx = this_fd_get_ctx(fd, this);
+        if ((ctx != NULL) && (remote_fd == GF_ANON_FD_NO))
+            needed = (ctx->remote_fd == -1) ? _gf_true : _gf_false;
+    }
+    pthread_spin_unlock(&priv->fd_lock);
+    return needed;
+}
+
+/* Set up frame->local for an fd based fop.
+ * The local is taken from this->local_pool, holds its own reference on {fd}
+ * and records whether a reopen should be attempted.
+ * Returns 0 on success, -EINVAL if {frame} or {fd} is NULL and -ENOMEM if
+ * no local could be obtained. */
+int
+client_fd_fop_prepare_local(call_frame_t *frame, fd_t *fd, int64_t remote_fd)
+{
+    xlator_t *xl = NULL;
+    clnt_local_t *fop_local = NULL;
+
+    if (!frame || !fd)
+        return -EINVAL;
+
+    xl = frame->this;
+
+    /* frame->local is assigned even on failure (it is then NULL) */
+    fop_local = mem_get0(xl->local_pool);
+    frame->local = fop_local;
+    if (fop_local == NULL)
+        return -ENOMEM;
+
+    /* the local keeps a reference of its own on the fd */
+    fop_local->fd = fd_ref(fd);
+    fop_local->attempt_reopen = client_is_reopen_needed(fd, xl, remote_fd);
+
+    return 0;
+}
+
+/* Release every entry of a getactivelk reply, and its client_uid, with free() */
+void
+clnt_getactivelk_rsp_cleanup(gfs3_getactivelk_rsp *rsp)
+{
+    gfs3_locklist *entry = rsp->reply;
+    gfs3_locklist *doomed = NULL;
+
+    while (entry != NULL) {
+        doomed = entry;
+        /* step forward before the current entry disappears */
+        entry = entry->nextentry;
+        free(doomed->client_uid);
+        free(doomed);
+    }
+}
+
+/* Release every entry of a getactivelk (v2) reply and its client_uid (free) */
+void
+clnt_getactivelk_rsp_cleanup_v2(gfx_getactivelk_rsp *rsp)
+{
+    gfs3_locklist *entry = rsp->reply;
+    gfs3_locklist *doomed = NULL;
+
+    while (entry != NULL) {
+        doomed = entry;
+        /* step forward before the current entry disappears */
+        entry = entry->nextentry;
+        free(doomed->client_uid);
+        free(doomed);
+    }
+}
+/* Convert the lock list of a getactivelk reply into lock_migration_info_t
+ * entries appended to lmi->list. Returns 0 on success and -1 on failure;
+ * entries appended before a failure stay on the list for the caller. */
+int
+clnt_unserialize_rsp_locklist(xlator_t *this, struct gfs3_getactivelk_rsp *rsp,
+                              lock_migration_info_t *lmi)
+{
+    struct gfs3_locklist *trav = rsp->reply;
+    lock_migration_info_t *temp = NULL;
+    int ret = -1;
+
+    if (!this->private)
+        goto out;
+
+    for (; trav; trav = trav->nextentry) {
+        temp = GF_CALLOC(1, sizeof(*temp), gf_common_mt_lock_mig);
+        if (temp == NULL) {
+            gf_smsg(this->name, GF_LOG_ERROR, 0, PC_MSG_NO_MEM, NULL);
+            goto out;
+        }
+
+        INIT_LIST_HEAD(&temp->list);
+        gf_proto_flock_to_flock(&trav->flock, &temp->flock);
+        temp->lk_flags = trav->lk_flags;
+
+        /* a lock whose client_uid could not be copied must not be queued */
+        temp->client_uid = gf_strdup(trav->client_uid);
+        if (trav->client_uid && !temp->client_uid) {
+            gf_smsg(this->name, GF_LOG_ERROR, 0, PC_MSG_NO_MEM, NULL);
+            GF_FREE(temp);
+            goto out;
+        }
+        list_add_tail(&temp->list, &lmi->list);
+    }
+
+    ret = 0;
+out:
+    return ret;
+}
+/* Convert the lock list of a getactivelk (v2) reply into
+ * lock_migration_info_t entries appended to lmi->list. Returns 0 on success,
+ * -1 on failure; entries appended before a failure stay on the list. */
+int
+clnt_unserialize_rsp_locklist_v2(xlator_t *this,
+                                 struct gfx_getactivelk_rsp *rsp,
+                                 lock_migration_info_t *lmi)
+{
+    struct gfs3_locklist *trav = rsp->reply;
+    lock_migration_info_t *temp = NULL;
+    int ret = -1;
+
+    if (!this->private)
+        goto out;
+
+    for (; trav; trav = trav->nextentry) {
+        temp = GF_CALLOC(1, sizeof(*temp), gf_common_mt_lock_mig);
+        if (temp == NULL) {
+            gf_smsg(this->name, GF_LOG_ERROR, 0, PC_MSG_NO_MEM, NULL);
+            goto out;
+        }
+
+        INIT_LIST_HEAD(&temp->list);
+        gf_proto_flock_to_flock(&trav->flock, &temp->flock);
+        temp->lk_flags = trav->lk_flags;
+
+        /* a lock whose client_uid could not be copied must not be queued */
+        temp->client_uid = gf_strdup(trav->client_uid);
+        if (trav->client_uid && !temp->client_uid) {
+            gf_smsg(this->name, GF_LOG_ERROR, 0, PC_MSG_NO_MEM, NULL);
+            GF_FREE(temp);
+            goto out;
+        }
+        list_add_tail(&temp->list, &lmi->list);
+    }
+
+    ret = 0;
+out:
+    return ret;
+}
+
+/* Release every entry of a setactivelk request, and its client_uid (GF_FREE) */
+void
+clnt_setactivelk_req_cleanup(gfs3_setactivelk_req *req)
+{
+    gfs3_locklist *entry = req->request;
+    gfs3_locklist *doomed = NULL;
+
+    while (entry != NULL) {
+        doomed = entry;
+        /* step forward before the current entry disappears */
+        entry = entry->nextentry;
+        GF_FREE(doomed->client_uid);
+        GF_FREE(doomed);
+    }
+}
+
+/* Release every entry of a setactivelk (v2) request and its client_uid */
+void
+clnt_setactivelk_req_cleanup_v2(gfx_setactivelk_req *req)
+{
+    gfs3_locklist *entry = req->request;
+    gfs3_locklist *doomed = NULL;
+
+    while (entry != NULL) {
+        doomed = entry;
+        /* step forward before the current entry disappears */
+        entry = entry->nextentry;
+        GF_FREE(doomed->client_uid);
+        GF_FREE(doomed);
+    }
+}
+
+/* Build the wire lock list of a setactivelk request from {locklist} and store
+ * it in req->request. The l_type of every source entry is rewritten in place
+ * to its GF_LK_* value. Returns 0 on success and -1 on any failure. */
+int
+serialize_req_locklist(lock_migration_info_t *locklist,
+                       gfs3_setactivelk_req *req)
+{
+    lock_migration_info_t *tmp = NULL;
+    gfs3_locklist *trav = NULL;
+    gfs3_locklist *prev = NULL;
+    int ret = -1;
+
+    GF_VALIDATE_OR_GOTO("client", locklist, out);
+    GF_VALIDATE_OR_GOTO("client", req, out);
+
+    list_for_each_entry(tmp, &locklist->list, list)
+    {
+        trav = GF_CALLOC(1, sizeof(*trav), gf_client_mt_clnt_lock_request_t);
+        if (!trav)
+            goto out;
+
+        switch (tmp->flock.l_type) {
+            case F_RDLCK:
+                tmp->flock.l_type = GF_LK_F_RDLCK;
+                break;
+            case F_WRLCK:
+                tmp->flock.l_type = GF_LK_F_WRLCK;
+                break;
+            case F_UNLCK:
+                tmp->flock.l_type = GF_LK_F_UNLCK;
+                break;
+            default:
+                /* unknown types are logged and passed on unchanged */
+                gf_smsg(THIS->name, GF_LOG_ERROR, 0, PC_MSG_UNKNOWN_LOCK_TYPE,
+                        "type=%" PRId32, tmp->flock.l_type, NULL);
+                break;
+        }
+
+        gf_proto_flock_from_flock(&trav->flock, &tmp->flock);
+        trav->lk_flags = tmp->lk_flags;
+
+        trav->client_uid = gf_strdup(tmp->client_uid);
+        if (!trav->client_uid) {
+            gf_smsg(THIS->name, GF_LOG_ERROR, 0, PC_MSG_CLIENT_UID_ALLOC_FAILED,
+                    NULL);
+            goto out; /* ret is still -1; trav is released at out */
+        }
+
+        /* link the entry in; it now belongs to the request chain */
+        if (prev)
+            prev->nextentry = trav;
+        else
+            req->request = trav;
+        prev = trav;
+        trav = NULL;
+    }
+
+    ret = 0;
+out:
+    GF_FREE(trav);
+    return ret;
+}
+
+/* Build the wire lock list of a setactivelk (v2) request from {locklist} and
+ * store it in req->request. The l_type of every source entry is rewritten in
+ * place to its GF_LK_* value. Returns 0 on success, -1 on any failure. */
+int
+serialize_req_locklist_v2(lock_migration_info_t *locklist,
+                          gfx_setactivelk_req *req)
+{
+    lock_migration_info_t *tmp = NULL;
+    gfs3_locklist *trav = NULL;
+    gfs3_locklist *prev = NULL;
+    int ret = -1;
+
+    GF_VALIDATE_OR_GOTO("client", locklist, out);
+    GF_VALIDATE_OR_GOTO("client", req, out);
+
+    list_for_each_entry(tmp, &locklist->list, list)
+    {
+        trav = GF_CALLOC(1, sizeof(*trav), gf_client_mt_clnt_lock_request_t);
+        if (!trav)
+            goto out;
+
+        switch (tmp->flock.l_type) {
+            case F_RDLCK:
+                tmp->flock.l_type = GF_LK_F_RDLCK;
+                break;
+            case F_WRLCK:
+                tmp->flock.l_type = GF_LK_F_WRLCK;
+                break;
+            case F_UNLCK:
+                tmp->flock.l_type = GF_LK_F_UNLCK;
+                break;
+            default:
+                /* unknown types are logged and passed on unchanged */
+                gf_smsg(THIS->name, GF_LOG_ERROR, 0, PC_MSG_UNKNOWN_LOCK_TYPE,
+                        "type=%" PRId32, tmp->flock.l_type, NULL);
+                break;
+        }
+
+        gf_proto_flock_from_flock(&trav->flock, &tmp->flock);
+        trav->lk_flags = tmp->lk_flags;
+
+        trav->client_uid = gf_strdup(tmp->client_uid);
+        if (!trav->client_uid) {
+            gf_smsg(THIS->name, GF_LOG_ERROR, 0, PC_MSG_CLIENT_UID_ALLOC_FAILED,
+                    NULL);
+            goto out; /* ret is still -1; trav is released at out */
+        }
+
+        /* link the entry in; it now belongs to the request chain */
+        if (prev)
+            prev->nextentry = trav;
+        else
+            req->request = trav;
+        prev = trav;
+        trav = NULL;
+    }
+
+    ret = 0;
+out:
+    GF_FREE(trav);
+    return ret;
+}
+
+extern int
+client3_3_releasedir_cbk(struct rpc_req *req, struct iovec *iov, int count,
+                         void *myframe);
+extern int
+client3_3_release_cbk(struct rpc_req *req, struct iovec *iov, int count,
+                      void *myframe);
+extern int
+client4_0_releasedir_cbk(struct rpc_req *req, struct iovec *iov, int count,
+                         void *myframe);
+extern int
+client4_0_release_cbk(struct rpc_req *req, struct iovec *iov, int count,
+                      void *myframe);
+
+/* Send RELEASEDIR (directory fd) or RELEASE (any other fd) for {fdctx} with
+ * the 4.0 request types. The submit result is ignored; always returns 0. */
+static int
+send_release4_0_over_wire(xlator_t *this, clnt_fd_ctx_t *fdctx,
+                          call_frame_t *fr)
+{
+    clnt_conf_t *priv = (clnt_conf_t *)this->private;
+
+    if (!fdctx->is_dir) {
+        gfx_release_req file_req = {
+            {
+                0,
+            },
+        };
+        file_req.fd = fdctx->remote_fd;
+        memcpy(file_req.gfid, fdctx->gfid, 16);
+        gf_msg_trace(this->name, 0, "sending release on fd");
+        (void)client_submit_request(this, &file_req, fr, priv->fops,
+                                    GFS3_OP_RELEASE, client4_0_release_cbk,
+                                    NULL, (xdrproc_t)xdr_gfx_release_req);
+    } else {
+        gfx_releasedir_req dir_req = {
+            {
+                0,
+            },
+        };
+        dir_req.fd = fdctx->remote_fd;
+        memcpy(dir_req.gfid, fdctx->gfid, 16);
+        gf_msg_trace(this->name, 0, "sending releasedir on fd");
+        (void)client_submit_request(
+            this, &dir_req, fr, priv->fops, GFS3_OP_RELEASEDIR,
+            client4_0_releasedir_cbk, NULL, (xdrproc_t)xdr_gfx_releasedir_req);
+    }
+    return 0;
+}
+
+/* 3.3 flavour: send RELEASEDIR or RELEASE for {fdctx}; always returns 0. */
+static int
+send_release3_3_over_wire(xlator_t *this, clnt_fd_ctx_t *fdctx,
+                          call_frame_t *fr)
+{
+    clnt_conf_t *priv = (clnt_conf_t *)this->private;
+
+    if (!fdctx->is_dir) {
+        gfs3_release_req file_req = {
+            {
+                0,
+            },
+        };
+        file_req.fd = fdctx->remote_fd;
+        memcpy(file_req.gfid, fdctx->gfid, 16);
+        gf_msg_trace(this->name, 0, "sending release on fd");
+        (void)client_submit_request(this, &file_req, fr, priv->fops,
+                                    GFS3_OP_RELEASE, client3_3_release_cbk,
+                                    NULL, (xdrproc_t)xdr_gfs3_release_req);
+    } else {
+        gfs3_releasedir_req dir_req = {
+            {
+                0,
+            },
+        };
+        dir_req.fd = fdctx->remote_fd;
+        memcpy(dir_req.gfid, fdctx->gfid, 16);
+        gf_msg_trace(this->name, 0, "sending releasedir on fd");
+        (void)client_submit_request(
+            this, &dir_req, fr, priv->fops, GFS3_OP_RELEASEDIR,
+            client3_3_releasedir_cbk, NULL, (xdrproc_t)xdr_gfs3_releasedir_req);
+    }
+    return 0;
+}
+
+/* Tear down a client fd context: drop its lk_ctx reference, send a release
+ * for the remote fd unless the parent is already down, and free {fdctx}
+ * (on every path). Returns 0 once a release frame was created, else -1. */
+int
+client_fdctx_destroy(xlator_t *this, clnt_fd_ctx_t *fdctx)
+{
+    clnt_conf_t *conf = NULL;
+    call_frame_t *fr = NULL;
+    int32_t ret = -1;
+    char parent_down = 0;
+    fd_lk_ctx_t *lk_ctx = NULL;
+
+    GF_VALIDATE_OR_GOTO("client", this, out);
+    GF_VALIDATE_OR_GOTO(this->name, fdctx, out);
+
+    conf = (clnt_conf_t *)this->private;
+
+    /* NOTE(review): this early exit also skips the lk_ctx unref below -
+     * confirm a context with remote_fd == -1 never holds an lk_ctx ref */
+    if (fdctx->remote_fd == -1) {
+        gf_msg_debug(this->name, 0, "not a valid fd");
+        goto out;
+    }
+
+    pthread_mutex_lock(&conf->lock);
+    {
+        parent_down = conf->parent_down;
+    }
+    pthread_mutex_unlock(&conf->lock);
+    lk_ctx = fdctx->lk_ctx;
+    fdctx->lk_ctx = NULL;
+    if (lk_ctx)
+        fd_lk_ctx_unref(lk_ctx);
+
+    if (parent_down)
+        goto out;
+
+    /* keep the rpc object alive while the release is being submitted */
+    rpc_clnt_ref(conf->rpc);
+    fr = create_frame(this, this->ctx->pool);
+    if (fr) {
+        ret = 0;
+        if (conf->fops->progver == GLUSTER_FOP_VERSION)
+            send_release3_3_over_wire(this, fdctx, fr);
+        else
+            send_release4_0_over_wire(this, fdctx, fr);
+    }
+    /* dropped on both paths: a failed create_frame() used to leak this ref */
+    rpc_clnt_unref(conf->rpc);
+out:
+    if (fdctx) {
+        fdctx->remote_fd = -1;
+        GF_FREE(fdctx);
+    }
+    return ret;
+}
diff --git a/xlators/protocol/client/src/client-lk.c b/xlators/protocol/client/src/client-lk.c
new file mode 100644
index 00000000000..795839734c5
--- /dev/null
+++ b/xlators/protocol/client/src/client-lk.c
@@ -0,0 +1,515 @@
+/*
+  Copyright (c) 2008-2012 Red Hat, Inc. <http://www.redhat.com>
+  This file is part of GlusterFS.
+
+  This file is licensed to you under your choice of the GNU Lesser
+  General Public License, version 3 or any later version (LGPLv3 or
+  later), or the GNU General Public License, version 2 (GPLv2), in all
+  cases as published by the Free Software Foundation.
+*/
+
+#include <glusterfs/common-utils.h>
+#include <glusterfs/xlator.h>
+#include "client.h"
+#include <glusterfs/lkowner.h>
+#include "client-messages.h"
+
+static void
+__insert_and_merge(clnt_fd_ctx_t *fdctx, client_posix_lock_t *lock);
+
+/* Log one recorded lock (fd, type, owner, both range forms) at INFO level. */
+static void
+__dump_client_lock(client_posix_lock_t *lock)
+{
+    xlator_t *xl = THIS;
+    const char *kind = (lock->fl_type == F_WRLCK) ? "Write-Lock" : "Read-Lock";
+
+    gf_smsg(xl->name, GF_LOG_INFO, 0, PC_MSG_CLIENT_LOCK_INFO, "fd=%p",
+            lock->fd, "fl_type=%s", kind, "lk-owner=%s",
+            lkowner_utoa(&lock->owner), "l_start=%" PRId64,
+            lock->user_flock.l_start, "l_len=%" PRId64,
+            lock->user_flock.l_len, "start=%" PRId64, lock->fl_start,
+            "end=%" PRId64, lock->fl_end, NULL);
+}
+
+/* Dump every lock recorded on {fdctx}; returns how many were dumped. */
+static int
+dump_client_locks_fd(clnt_fd_ctx_t *fdctx)
+{
+    client_posix_lock_t *cur = NULL;
+    int dumped = 0;
+
+    list_for_each_entry(cur, &fdctx->lock_list, list)
+    {
+        dumped++;
+        __dump_client_lock(cur);
+    }
+    return dumped;
+}
+
+/* Dump the recorded locks of every fd open on {inode} to the log.
+ * Walks inode->fd_list under inode->lock and looks up each fd context under
+ * conf->fd_lock. Returns the total number of locks dumped. */
+int
+dump_client_locks(inode_t *inode)
+{
+    xlator_t *xl = THIS;
+    clnt_conf_t *priv = xl->private;
+    clnt_fd_ctx_t *ctx = NULL;
+    fd_t *open_fd = NULL;
+
+    int dumped = 0;
+
+    /* lock order: inode->lock first, then the fd_lock spinlock */
+    LOCK(&inode->lock);
+    {
+        list_for_each_entry(open_fd, &inode->fd_list, inode_list)
+        {
+            /* the per-lock log messages are emitted with both locks held */
+            pthread_spin_lock(&priv->fd_lock);
+            {
+                ctx = this_fd_get_ctx(open_fd, xl);
+                /* fds without a client context contribute nothing */
+                if (ctx != NULL)
+                    dumped += dump_client_locks_fd(ctx);
+            }
+            pthread_spin_unlock(&priv->fd_lock);
+        }
+    }
+    UNLOCK(&inode->lock);
+
+    return dumped;
+}
+
+/* Length of the inclusive range [start, end]; an end of LLONG_MAX is
+ * reported as length 0. */
+static off_t
+__get_lock_length(off_t start, off_t end)
+{
+    /* inclusive range, hence the +1 */
+    return (end == LLONG_MAX) ? 0 : (end - start + 1);
+}
+
+/* Add two locks: allocate a zeroed lock spanning both ranges. Only the range
+ * (fl_start/fl_end, user_flock.l_start/l_len) is filled in; all other fields
+ * are left to the caller. Returns NULL on allocation failure. */
+static client_posix_lock_t *
+add_locks(client_posix_lock_t *l1, client_posix_lock_t *l2)
+{
+    client_posix_lock_t *merged = NULL;
+
+    merged = GF_CALLOC(1, sizeof(*merged), gf_client_mt_clnt_lock_t);
+    if (merged != NULL) {
+        merged->fl_end = max(l1->fl_end, l2->fl_end);
+        merged->fl_start = min(l1->fl_start, l2->fl_start);
+        merged->user_flock.l_start = merged->fl_start;
+        merged->user_flock.l_len = __get_lock_length(merged->fl_start,
+                                                     merged->fl_end);
+    }
+    return merged;
+}
+
+/* Return non-zero when the inclusive ranges of the two locks intersect.
+ * Offsets are taken as absolute (FUSE always supplies absolute offsets),
+ * so SEEK_CUR / SEEK_END never need to be resolved here. */
+static int
+locks_overlap(client_posix_lock_t *l1, client_posix_lock_t *l2)
+{
+    /* disjoint exactly when one range ends before the other begins */
+    if ((l1->fl_end < l2->fl_start) || (l2->fl_end < l1->fl_start))
+        return 0;
+
+    return 1;
+}
+
+static void
+__delete_client_lock(client_posix_lock_t *lock)
+{
+    list_del_init(&lock->list); /* unlink only; the lock is not freed here */
+}
+
+/* Destroy a posix_lock: frees the memory only, no list unlinking is done */
+static void
+__destroy_client_lock(client_posix_lock_t *lock)
+{
+    GF_FREE(lock);
+}
+
+/* Subtract two locks: holds the (at most three) locks that result */
+struct _values {
+    client_posix_lock_t *locks[3]; /* filled from index 0; unused slots NULL */
+};
+
+/* Copy {src} into a newly allocated lock; NULL when the allocation fails. */
+static client_posix_lock_t *
+__copy_client_lock(client_posix_lock_t *src)
+{
+    client_posix_lock_t *dup = NULL;
+
+    dup = GF_MALLOC(sizeof(*dup), gf_client_mt_clnt_lock_t);
+    GF_ASSERT(dup);
+    if (dup)
+        memcpy(dup, src, sizeof(*dup));
+    return dup;
+}
+
+/* Copy {src} restricted to [start, end]. user_flock start and length are
+ * recomputed together so that they always describe the trimmed range. */
+static client_posix_lock_t *
+__copy_client_lock_range(client_posix_lock_t *src, off_t start, off_t end)
+{
+    client_posix_lock_t *dup = __copy_client_lock(src);
+
+    if (dup) {
+        dup->fl_start = start;
+        dup->fl_end = end;
+        dup->user_flock.l_start = start;
+        dup->user_flock.l_len = __get_lock_length(start, end);
+    }
+    return dup;
+}
+
+/* {small} must always be contained inside {big}.
+ * Returns the locks that replace {big} once {small} is carved out of it;
+ * slots that are unused (or could not be allocated) are NULL. */
+static struct _values
+subtract_locks(client_posix_lock_t *big, client_posix_lock_t *small)
+{
+    struct _values v = {.locks = {0, 0, 0}};
+
+    if ((big->fl_start == small->fl_start) && (big->fl_end == small->fl_end)) {
+        /* both edges coincide with big */
+        v.locks[0] = __copy_client_lock(big);
+        if (v.locks[0])
+            v.locks[0]->fl_type = small->fl_type;
+    } else if ((small->fl_start > big->fl_start) &&
+               (small->fl_end < big->fl_end)) {
+        /* both edges lie inside big */
+        v.locks[0] = __copy_client_lock_range(big, big->fl_start,
+                                              small->fl_start - 1);
+        v.locks[1] = __copy_client_lock(small);
+        v.locks[2] = __copy_client_lock_range(big, small->fl_end + 1,
+                                              big->fl_end);
+    } else if (small->fl_start == big->fl_start) {
+        /* one edge coincides with big: the tail of big survives */
+        v.locks[0] = __copy_client_lock_range(big, small->fl_end + 1,
+                                              big->fl_end);
+        v.locks[1] = __copy_client_lock(small);
+    } else if (small->fl_end == big->fl_end) {
+        /* one edge coincides with big: the head of big survives */
+        v.locks[0] = __copy_client_lock_range(big, big->fl_start,
+                                              small->fl_start - 1);
+        v.locks[1] = __copy_client_lock(small);
+    } else {
+        /* LOG-TODO : decide what more info is required here*/
+        gf_smsg("client-protocol", GF_LOG_CRITICAL, 0, PC_MSG_LOCK_ERROR, NULL);
+    }
+    return v;
+}
+
+/* Drop every F_UNLCK entry from the lock list of {fdctx} and free it. */
+static void
+__delete_unlck_locks(clnt_fd_ctx_t *fdctx)
+{
+    client_posix_lock_t *cur = NULL;
+    client_posix_lock_t *next = NULL;
+    list_for_each_entry_safe(cur, next, &fdctx->lock_list, list)
+    {
+        if (cur->fl_type != F_UNLCK)
+            continue;
+        __delete_client_lock(cur);
+        __destroy_client_lock(cur);
+    }
+}
+
+/* Append {lock} to the tail of the lock list of {fdctx}; no merging and no
+ * conflict check is done at this level. */
+static void
+__insert_lock(clnt_fd_ctx_t *fdctx, client_posix_lock_t *lock)
+{
+    list_add_tail(&lock->list, &fdctx->lock_list);
+}
+
+/* Record {lock} in the lock list of {fdctx}, coalescing it with overlapping
+ * locks of the same owner. Ownership of {lock} passes to this function: it
+ * ends up either linked into the list or destroyed. */
+static void
+__insert_and_merge(clnt_fd_ctx_t *fdctx, client_posix_lock_t *lock)
+{
+    client_posix_lock_t *conf = NULL;
+    client_posix_lock_t *t = NULL;
+    client_posix_lock_t *sum = NULL;
+    int i = 0;
+    struct _values v = {.locks = {0, 0, 0}};
+
+    list_for_each_entry_safe(conf, t, &fdctx->lock_list, list)
+    {
+        if (!locks_overlap(conf, lock))
+            continue;
+
+        if (is_same_lkowner(&conf->owner, &lock->owner)) {
+            sum = add_locks(lock, conf);
+            if (!sum)
+                break; /* no memory: handle {lock} unmerged below */
+
+            /* add_locks() fills in the range only: complete the lock so
+             * that it keeps owner and type and can be unlinked safely */
+            INIT_LIST_HEAD(&sum->list);
+            sum->owner = conf->owner;
+            sum->fl_type = conf->fl_type;
+
+            if (conf->fl_type == lock->fl_type) {
+                /* same type: both locks are replaced by their union */
+                sum->fd = lock->fd;
+                __delete_client_lock(conf);
+                __destroy_client_lock(conf);
+                __destroy_client_lock(lock);
+                __insert_and_merge(fdctx, sum);
+                return;
+            }
+
+            /* different type: carve {lock} out of the union and re-insert
+             * the resulting pieces one by one */
+            sum->fd = conf->fd;
+            v = subtract_locks(sum, lock);
+            __delete_client_lock(conf);
+            __destroy_client_lock(conf);
+            __delete_client_lock(lock);
+            __destroy_client_lock(lock);
+            __destroy_client_lock(sum);
+
+            for (i = 0; i < 3; i++) {
+                if (!v.locks[i])
+                    continue;
+
+                INIT_LIST_HEAD(&v.locks[i]->list);
+                __insert_and_merge(fdctx, v.locks[i]);
+            }
+            __delete_unlck_locks(fdctx);
+            return;
+        }
+
+        if (lock->fl_type == F_UNLCK)
+            continue;
+
+        if ((conf->fl_type == F_RDLCK) && (lock->fl_type == F_RDLCK)) {
+            __insert_lock(fdctx, lock);
+            return;
+        }
+    }
+
+    /* no conflicts (or no memory to merge), so just insert */
+    if (lock->fl_type != F_UNLCK)
+        __insert_lock(fdctx, lock);
+    else
+        __destroy_client_lock(lock);
+}
+
+static void
+client_setlk(clnt_fd_ctx_t *fdctx, client_posix_lock_t *lock)
+{
+    __insert_and_merge(fdctx, lock); /* {lock} gets linked in or destroyed */
+}
+
+static void
+destroy_client_lock(client_posix_lock_t *lock)
+{
+    GF_FREE(lock); /* frees the record only; no list unlinking is done */
+}
+
+/* Forget every recorded lock on {fd} that belongs to {owner}.
+ * Matching locks are unlinked while conf->fd_lock is held and freed only
+ * after it has been dropped.
+ * Returns 0, or -1 when {fd} has no client fd context. */
+int32_t
+delete_granted_locks_owner(fd_t *fd, gf_lkowner_t *owner)
+{
+    xlator_t *xl = THIS;
+    clnt_conf_t *priv = xl->private;
+    clnt_fd_ctx_t *ctx = NULL;
+    client_posix_lock_t *cur = NULL;
+    client_posix_lock_t *next = NULL;
+
+    struct list_head doomed;
+    int cleared = 0;
+
+    INIT_LIST_HEAD(&doomed);
+
+    pthread_spin_lock(&priv->fd_lock);
+
+    ctx = this_fd_get_ctx(fd, xl);
+    if (ctx == NULL) {
+        pthread_spin_unlock(&priv->fd_lock);
+
+        gf_smsg(xl->name, GF_LOG_WARNING, EINVAL, PC_MSG_FD_CTX_INVALID,
+                NULL);
+        return -1;
+    }
+
+    /* phase 1: move the owner's locks to a private list while locked */
+    list_for_each_entry_safe(cur, next, &ctx->lock_list, list)
+    {
+        if (!is_same_lkowner(&cur->owner, owner))
+            continue;
+
+        list_del_init(&cur->list);
+        list_add_tail(&cur->list, &doomed);
+        cleared++;
+    }
+
+    pthread_spin_unlock(&priv->fd_lock);
+
+    /* phase 2: free them without holding the spinlock */
+    list_for_each_entry_safe(cur, next, &doomed, list)
+    {
+        list_del_init(&cur->list);
+        destroy_client_lock(cur);
+    }
+
+    /* FIXME: Need to actually print the locks instead of count */
+    gf_msg_trace(xl->name, 0, "Number of locks cleared=%d", cleared);
+
+    return 0;
+}
+
+/* Map a fcntl-style lock command to its GF_LK_* wire value in *gf_cmd.
+ * Returns 0, or -1 (with *gf_cmd untouched) for an unknown command. */
+int32_t
+client_cmd_to_gf_cmd(int32_t cmd, int32_t *gf_cmd)
+{
+    const int32_t map[][2] = {
+        {F_GETLK, GF_LK_GETLK},             {F_GETLK64, GF_LK_GETLK},
+        {F_SETLK, GF_LK_SETLK},             {F_SETLK64, GF_LK_SETLK},
+        {F_SETLKW, GF_LK_SETLKW},           {F_SETLKW64, GF_LK_SETLKW},
+        {F_RESLK_LCK, GF_LK_RESLK_LCK},     {F_RESLK_LCKW, GF_LK_RESLK_LCKW},
+        {F_RESLK_UNLCK, GF_LK_RESLK_UNLCK}, {F_GETLK_FD, GF_LK_GETLK_FD},
+    };
+    size_t i = 0;
+
+    /* probed in order, so the first matching entry wins */
+    for (i = 0; i < sizeof(map) / sizeof(map[0]); i++) {
+        if (cmd == map[i][0]) {
+            *gf_cmd = map[i][1];
+            return 0;
+        }
+    }
+
+    return -1;
+}
+
+/* Build a lock record from a user flock: the flock is copied verbatim and
+ * the inclusive range [fl_start, fl_end] is derived from it, a length of 0
+ * mapping to an end of LLONG_MAX. Returns NULL on allocation failure. */
+static client_posix_lock_t *
+new_client_lock(struct gf_flock *flock, gf_lkowner_t *owner, int32_t cmd,
+                fd_t *fd)
+{
+    client_posix_lock_t *rec = NULL;
+
+    rec = GF_CALLOC(1, sizeof(*rec), gf_client_mt_clnt_lock_t);
+    if (rec == NULL)
+        return NULL;
+
+    INIT_LIST_HEAD(&rec->list);
+    memcpy(&rec->user_flock, flock, sizeof(struct gf_flock));
+
+    /* the fd pointer is stored as-is; no reference is taken here */
+    rec->fd = fd;
+    rec->cmd = cmd; /* Not really useful */
+    rec->owner = *owner;
+
+    rec->fl_type = flock->l_type;
+    rec->fl_start = flock->l_start;
+    if (flock->l_len != 0)
+        rec->fl_end = flock->l_start + flock->l_len - 1; /* inclusive end */
+    else
+        rec->fl_end = LLONG_MAX;
+
+    return rec;
+}
+
+/* Store {count} in conf->reopen_fd_count, serialised by conf->rec_lock. */
+void
+client_save_number_fds(clnt_conf_t *conf, int count)
+{
+    /* single guarded store; nothing else happens under the lock */
+    LOCK(&conf->rec_lock);
+    conf->reopen_fd_count = count;
+    UNLOCK(&conf->rec_lock);
+}
+
+/* Remember a lock on {fd} by adding it to the lock list kept in the client
+ * fd context (merging with existing entries is left to client_setlk()).
+ * Returns 0 on success, -EBADFD when {fd} has no client fd context and
+ * -ENOMEM when the lock record cannot be allocated. */
+int
+client_add_lock_for_recovery(fd_t *fd, struct gf_flock *flock,
+                             gf_lkowner_t *owner, int32_t cmd)
+{
+    xlator_t *xl = THIS;
+    clnt_conf_t *priv = xl->private;
+    clnt_fd_ctx_t *ctx = NULL;
+    client_posix_lock_t *rec = NULL;
+
+    int ret = 0;
+
+    /* lookup, allocation and insertion all happen under fd_lock */
+    pthread_spin_lock(&priv->fd_lock);
+    {
+        ctx = this_fd_get_ctx(fd, xl);
+        if (ctx == NULL) {
+            ret = -EBADFD;
+        } else {
+            rec = new_client_lock(flock, owner, cmd, fd);
+            if (rec == NULL) {
+                ret = -ENOMEM;
+            } else {
+                /* client_setlk() links the record in or destroys it */
+                client_setlk(ctx, rec);
+            }
+        }
+    }
+    pthread_spin_unlock(&priv->fd_lock);
+
+    /* the warning is emitted only once the spinlock has been released */
+    if (ret == -EBADFD) {
+        gf_smsg(xl->name, GF_LOG_WARNING, 0, PC_MSG_FD_GET_FAIL, NULL);
+    }
+
+    return ret;
+}
+
+/* Dump the locks recorded on {inode} to the log and report their number to
+ * the caller through {dict} (key CLIENT_DUMP_LOCKS). Returns the dict result. */
+int32_t
+client_dump_locks(char *name, inode_t *inode, dict_t *dict)
+{
+    int ret = 0;
+    char dict_string[256];
+
+    GF_ASSERT(dict);
+
+    ret = dump_client_locks(inode);
+    snprintf(dict_string, sizeof(dict_string), "%d locks dumped in log file",
+             ret);
+
+    /* dict_string lives on the stack, so the dict must be given a heap copy
+     * of its own: a dynstr value is owned (and later freed) by the dict */
+    ret = dict_set_dynstr_with_alloc(dict, CLIENT_DUMP_LOCKS, dict_string);
+    if (ret) {
+        gf_smsg(THIS->name, GF_LOG_WARNING, 0, PC_MSG_DICT_SET_FAIL, "lock=%s",
+                CLIENT_DUMP_LOCKS, NULL);
+    }
+
+    return ret;
+}
+
+/* Return 1 when {name} is exactly the CLIENT_DUMP_LOCKS key, 0 otherwise.
+ * {name} must not be NULL: it is handed straight to strcmp(). */
+int32_t
+is_client_dump_locks_cmd(char *name)
+{
+    if (strcmp(name, CLIENT_DUMP_LOCKS) != 0)
+        return 0;
+
+    return 1;
+}
diff --git a/xlators/protocol/client/src/client-mem-types.h b/xlators/protocol/client/src/client-mem-types.h
new file mode 100644
index 00000000000..f61fa0c1828
--- /dev/null
+++ b/xlators/protocol/client/src/client-mem-types.h
@@ -0,0 +1,26 @@
+/*
+  Copyright (c) 2008-2012 Red Hat, Inc. <http://www.redhat.com>
+  This file is part of GlusterFS.
+
+  This file is licensed to you under your choice of the GNU Lesser
+  General Public License, version 3 or any later version (LGPLv3 or
+  later), or the GNU General Public License, version 2 (GPLv2), in all
+  cases as published by the Free Software Foundation.
+*/
+
+#ifndef __CLIENT_MEM_TYPES_H__
+#define __CLIENT_MEM_TYPES_H__
+
+#include <glusterfs/mem-types.h>
+
+enum gf_client_mem_types_ {
+    gf_client_mt_clnt_conf_t = gf_common_mt_end + 1, /* first client type */
+    gf_client_mt_clnt_req_buf_t,
+    gf_client_mt_clnt_fdctx_t,
+    gf_client_mt_clnt_lock_t, /* client_posix_lock_t records (client-lk.c) */
+    gf_client_mt_clnt_fd_lk_local_t,
+    gf_client_mt_compound_req_t,
+    gf_client_mt_clnt_lock_request_t, /* gfs3_locklist request entries */
+    gf_client_mt_end, /* end marker - presumably has to stay last */
+};
+#endif /* __CLIENT_MEM_TYPES_H__ */
diff --git a/xlators/protocol/client/src/client-messages.h b/xlators/protocol/client/src/client-messages.h
new file mode 100644
index 00000000000..25a851d80b9
--- /dev/null
+++ b/xlators/protocol/client/src/client-messages.h
@@ -0,0 +1,174 @@
+/*
+  Copyright (c) 2015 Red Hat, Inc. <http://www.redhat.com>
+  This file is part of GlusterFS.
+
+  This file is licensed to you under your choice of the GNU Lesser
+  General Public License, version 3 or any later version (LGPLv3 or
+  later), or the GNU General Public License, version 2 (GPLv2), in all
+  cases as published by the Free Software Foundation.
+*/
+
+#ifndef _PC_MESSAGES_H__
+#define _PC_MESSAGES_H__
+
+#include <glusterfs/glfs-message-id.h>
+
+/* To add new message IDs, append new identifiers at the end of the list.
+ *
+ * Never remove a message ID. If it's not used anymore, you can rename it or
+ * leave it as it is, but not delete it. This is to prevent reutilization of
+ * IDs by other messages.
+ *
+ * The component name must match one of the entries defined in
+ * glfs-message-id.h.
+ */
+
+GLFS_MSGID(
+    PC, PC_MSG_TIMER_EXPIRED, PC_MSG_DIR_OP_FAILED, PC_MSG_FILE_OP_FAILED,
+    PC_MSG_TIMER_REG, PC_MSG_GRACE_TIMER_CANCELLED, PC_MSG_DICT_SET_FAILED,
+    PC_MSG_DICT_GET_FAILED, PC_MSG_NO_MEMORY, PC_MSG_RPC_CBK_FAILED,
+    PC_MSG_FUNCTION_CALL_ERROR, PC_MSG_RPC_INITED_ALREADY, PC_MSG_RPC_INIT,
+    PC_MSG_RPC_DESTROY, PC_MSG_RPC_INVALID_CALL, PC_MSG_INVALID_ENTRY,
+    PC_MSG_HANDSHAKE_RETURN, PC_MSG_CHILD_UP_NOTIFY_FAILED,
+    PC_MSG_CLIENT_DISCONNECTED, PC_MSG_CHILD_DOWN_NOTIFY_FAILED,
+    PC_MSG_PARENT_UP, PC_MSG_PARENT_DOWN, PC_MSG_RPC_INIT_FAILED,
+    PC_MSG_RPC_NOTIFY_FAILED, PC_MSG_FD_DUPLICATE_TRY, PC_MSG_FD_SET_FAIL,
+    PC_MSG_DICT_UNSERIALIZE_FAIL, PC_MSG_FD_GET_FAIL, PC_MSG_FD_CTX_INVALID,
+    PC_MSG_FOP_SEND_FAILED, PC_MSG_XDR_DECODING_FAILED, PC_MSG_REMOTE_OP_FAILED,
+    PC_MSG_RPC_STATUS_ERROR, PC_MSG_VOL_FILE_NOT_FOUND, PC_MSG_SEND_REQ_FAIL,
+    PC_MSG_LOCK_VERSION_SERVER, PC_MSG_SET_LK_VERSION_ERROR,
+    PC_MSG_LOCK_REQ_FAIL, PC_MSG_CLIENT_REQ_FAIL, PC_MSG_LOCK_ERROR,
+    PC_MSG_LOCK_REACQUIRE, PC_MSG_CHILD_UP_NOTIFY, PC_MSG_CHILD_UP_NOTIFY_DELAY,
+    PC_MSG_VOL_SET_FAIL, PC_MSG_SETVOLUME_FAIL, PC_MSG_VOLFILE_NOTIFY_FAILED,
+    PC_MSG_REMOTE_VOL_CONNECTED, PC_MSG_LOCK_MISMATCH, PC_MSG_LOCK_MATCH,
+    PC_MSG_AUTH_FAILED, PC_MSG_AUTH_FAILED_NOTIFY_FAILED,
+    PC_MSG_CHILD_CONNECTING_EVENT, PC_MSG_CHILD_CONNECTING_NOTIFY_FAILED,
+    PC_MSG_PROCESS_UUID_SET_FAIL, PC_MSG_DICT_ERROR, PC_MSG_DICT_SERIALIZE_FAIL,
+    PC_MSG_PGM_NOT_FOUND, PC_MSG_VERSION_INFO, PC_MSG_PORT_NUM_ERROR,
+    PC_MSG_VERSION_ERROR, PC_MSG_DIR_OP_SUCCESS, PC_MSG_BAD_FD,
+    PC_MSG_CLIENT_LOCK_INFO, PC_MSG_CACHE_INVALIDATION_FAIL,
+    PC_MSG_CHILD_STATUS, PC_MSG_GFID_NULL, PC_MSG_RECALL_LEASE_FAIL,
+    PC_MSG_INODELK_CONTENTION_FAIL, PC_MSG_ENTRYLK_CONTENTION_FAIL,
+    PC_MSG_BIGGER_SIZE, PC_MSG_CLIENT_DUMP_LOCKS_FAILED, PC_MSG_UNKNOWN_CMD,
+    PC_MSG_REOPEN_FAILED, PC_MSG_FIND_KEY_FAILED, PC_MSG_VOL_ID_CHANGED,
+    PC_MSG_GETHOSTNAME_FAILED, PC_MSG_VOLFILE_KEY_SET_FAILED,
+    PC_MSG_VOLFILE_CHECKSUM_FAILED, PC_MSG_FRAME_NOT_FOUND,
+    PC_MSG_REMOTE_SUBVOL_SET_FAIL, PC_MSG_HANDSHAKE_PGM_NOT_FOUND,
+    PC_MSG_MERGE_IOBREF_FAILED, PC_MSG_ADD_IOBUF_FAILED,
+    PC_MSG_RELEASE_DIR_OP_FAILED, PC_MSG_REMOTE_HOST_SET_FAILED,
+    PC_MSG_REMOTE_PORT_SET_FAILED, PC_MSG_REMOTE_HOST_NOT_SET,
+    PC_MSG_NOREMOTE_HOST, PC_MSG_REMOTE_SUBVOL_NOT_GIVEN,
+    PC_MSG_FATAL_CLIENT_PROTOCOL, PC_MSG_VOL_DANGLING,
+    PC_MSG_CREATE_MEM_POOL_FAILED, PC_MSG_PVT_XLATOR_NULL, PC_MSG_XLATOR_NULL,
+    PC_MSG_LEASE_FOP_FAILED, PC_MSG_DICT_SET_FAIL, PC_MSG_NO_MEM,
+    PC_MSG_UNKNOWN_LOCK_TYPE, PC_MSG_CLIENT_UID_ALLOC_FAILED);
+
+#define PC_MSG_REMOTE_OP_FAILED_STR "remote operation failed."
+#define PC_MSG_XDR_DECODING_FAILED_STR "XDR decoding failed"
+#define PC_MSG_FOP_SEND_FAILED_STR "failed to send the fop"
+#define PC_MSG_BIGGER_SIZE_STR "read-size is bigger than iobuf size"
+#define PC_MSG_CLIENT_DUMP_LOCKS_FAILED_STR "client dump locks failed"
+#define PC_MSG_UNKNOWN_CMD_STR "Unknown cmd"
+#define PC_MSG_CHILD_UP_NOTIFY_FAILED_STR "notify of CHILD_UP failed"
+#define PC_MSG_CHILD_STATUS_STR                                                \
+    "Deferring sending CHILD_UP message as the client translators are not "    \
+    "yet ready to serve"
+#define PC_MSG_CHILD_UP_NOTIFY_STR "last fd open'd - notifying CHILD_UP"
+#define PC_MSG_RPC_STATUS_ERROR_STR                                            \
+    "received RPC status error, returning ENOTCONN"
+#define PC_MSG_REOPEN_FAILED_STR "reopen failed"
+#define PC_MSG_DIR_OP_SUCCESS_STR "reopen dir succeeded"
+#define PC_MSG_DIR_OP_FAILED_STR "failed to send the re-opendir request"
+#define PC_MSG_CHILD_UP_NOTIFY_DELAY_STR                                       \
+    "fds open - Delaying child_up until they are re-opened"
+#define PC_MSG_VOL_SET_FAIL_STR "failed to set the volume"
+#define PC_MSG_DICT_UNSERIALIZE_FAIL_STR "failed to unserialize buffer to dict"
+#define PC_MSG_DICT_GET_FAILED_STR "failed to get from reply dict"
+#define PC_MSG_SETVOLUME_FAIL_STR "SETVOLUME on remote-host failed"
+#define PC_MSG_VOLFILE_NOTIFY_FAILED_STR "notify of VOLFILE_MODIFIED failed"
+#define PC_MSG_FIND_KEY_FAILED_STR "failed to find key in the options"
+#define PC_MSG_VOL_ID_CHANGED_STR                                              \
+    "volume-id changed, can't connect to server. Needs remount"
+#define PC_MSG_REMOTE_VOL_CONNECTED_STR "Connected, attached to remote volume"
+#define PC_MSG_AUTH_FAILED_STR "sending AUTH_FAILED event"
+#define PC_MSG_AUTH_FAILED_NOTIFY_FAILED_STR "notify of AUTH_FAILED failed"
+#define PC_MSG_CHILD_CONNECTING_EVENT_STR "sending CHILD_CONNECTING event"
+#define PC_MSG_CHILD_CONNECTING_NOTIFY_FAILED_STR                              \
+    "notify of CHILD_CONNECTING failed"
+#define PC_MSG_DICT_SET_FAILED_STR "failed to set in handshake msg"
+#define PC_MSG_GETHOSTNAME_FAILED_STR "gethostname: failed"
+#define PC_MSG_PROCESS_UUID_SET_FAIL_STR                                       \
+    "asprintf failed while setting process_uuid"
+#define PC_MSG_VOLFILE_KEY_SET_FAILED_STR "failed to set volfile-key"
+#define PC_MSG_VOLFILE_CHECKSUM_FAILED_STR "failed to set volfile-checksum"
+#define PC_MSG_DICT_SERIALIZE_FAIL_STR "failed to serialize dictionary"
+#define PC_MSG_PGM_NOT_FOUND_STR "xlator not found OR RPC program not found"
+#define PC_MSG_VERSION_INFO_STR "Using Program"
+#define PC_MSG_FRAME_NOT_FOUND_STR "frame not found with rpc request"
+#define PC_MSG_PORT_NUM_ERROR_STR                                              \
+    "failed to get the port number for remote subvolume. Please run gluster "  \
+    "volume status on server to see if brick process is running"
+#define PC_MSG_REMOTE_SUBVOL_SET_FAIL_STR "remote-subvolume not set in volfile"
+#define PC_MSG_VERSION_ERROR_STR "failed to get the version from server"
+#define PC_MSG_NO_VERSION_SUPPORT_STR "server doesn't support the version"
+#define PC_MSG_HANDSHAKE_PGM_NOT_FOUND_STR "handshake program not found"
+#define PC_MSG_MERGE_IOBREF_FAILED_STR                                         \
+    "cannot merge iobref passed from caller into new_iobref"
+#define PC_MSG_ADD_IOBUF_FAILED_STR "cannot add iobuf into iobref"
+#define PC_MSG_RELEASE_DIR_OP_FAILED_STR "release dir op failed"
+#define PC_MSG_FILE_OP_FAILED_STR "release fop failed"
+#define PC_MSG_REMOTE_HOST_SET_FAILED_STR "failed to set remote-host"
+#define PC_MSG_REMOTE_PORT_SET_FAILED_STR "failed to set remote-port"
+#define PC_MSG_RPC_INIT_STR "client rpc init command"
+#define PC_MSG_RPC_DESTROY_STR "client rpc destroy command"
+#define PC_MSG_HANDSHAKE_RETURN_STR "handshake msg returned"
+#define PC_MSG_CLIENT_DISCONNECTED_STR                                         \
+    "disconnected from client, process will keep trying to connect glusterd "  \
+    "until brick's port is available"
+#define PC_MSG_CHILD_DOWN_NOTIFY_FAILED_STR "CHILD_DOWN notify failed"
+#define PC_MSG_PARENT_UP_STR                                                   \
+    "parent translators are ready, attempting connect on transport"
+#define PC_MSG_PARENT_DOWN_STR                                                 \
+    "current graph is no longer active, destroying rpc_client"
+#define PC_MSG_REMOTE_HOST_NOT_SET_STR                                         \
+    "Remote host is not set. Assuming the volfile server as remote host"
+#define PC_MSG_NOREMOTE_HOST_STR "No remote host to connect"
+#define PC_MSG_REMOTE_SUBVOL_NOT_GIVEN_STR "option 'remote-subvolume' not given"
+#define PC_MSG_NO_MEMORY_STR "Memory accounting init failed"
+#define PC_MSG_RPC_INVALID_CALL_STR                                            \
+    "RPC destroy called on already destroyed connection"
+#define PC_MSG_RPC_INITED_ALREADY_STR "client rpc already init'ed"
+#define PC_MSG_RPC_INIT_FAILED_STR "failed to initialize RPC"
+#define PC_MSG_RPC_NOTIFY_FAILED_STR "failed to register notify"
+#define PC_MSG_RPC_CBK_FAILED_STR "failed to register callback program"
+#define PC_MSG_FATAL_CLIENT_PROTOCOL_STR                                       \
+    "FATAL: client protocol, translator cannot have any subvolumes"
+#define PC_MSG_VOL_DANGLING_STR "Volume is dangling"
+#define PC_MSG_CREATE_MEM_POOL_FAILED_STR                                      \
+    "failed to create local_t's memory pool"
+#define PC_MSG_XLATOR_NULL_STR "xlator is NULL"
+#define PC_MSG_PVT_XLATOR_NULL_STR "private structure of the xlator is NULL"
+#define PC_MSG_LEASE_FOP_FAILED_STR "Lease fop failed"
+#define PC_MSG_LOCK_ERROR_STR                                                  \
+    "Unexpected case in subtract_locks. Please send a bug report to "          \
+    "gluster-devel@gluster.org"
+#define PC_MSG_FD_CTX_INVALID_STR "fdctx not valid"
+#define PC_MSG_FD_GET_FAIL_STR "failed to get fd context. sending EBADFD"
+#define PC_MSG_DICT_SET_FAIL_STR "could not set dict"
+#define PC_MSG_CLIENT_LOCK_INFO_STR "client lock info"
+#define PC_MSG_BAD_FD_STR "remote_fd is -1. EBADFD"
+#define PC_MSG_FUNCTION_CALL_ERROR_STR "this function should not be called"
+#define PC_MSG_RECALL_LEASE_FAIL_STR "XDR decode of recall lease failed"
+#define PC_MSG_CACHE_INVALIDATION_FAIL_STR                                     \
+    "XDR decode of cache_invalidation failed"
+#define PC_MSG_INODELK_CONTENTION_FAIL_STR                                     \
+    "XDR decode of inodelk contention failed"
+#define PC_MSG_ENTRYLK_CONTENTION_FAIL_STR                                     \
+    "XDR decode of entrylk contention failed"
+#define PC_MSG_FD_DUPLICATE_TRY_STR "trying duplicate remote fd set"
+#define PC_MSG_FD_SET_FAIL_STR "failed to set remote-fd"
+#define PC_MSG_NO_MEM_STR "No memory"
+#define PC_MSG_UNKNOWN_LOCK_TYPE_STR "Unknown lock type"
+#define PC_MSG_CLIENT_UID_ALLOC_FAILED_STR "client-uid could not be allocated"
+
+#endif /* !_PC_MESSAGES_H__ */
diff --git a/xlators/protocol/client/src/client-protocol.c b/xlators/protocol/client/src/client-protocol.c
deleted file mode 100644
index 11e66983324..00000000000
--- a/xlators/protocol/client/src/client-protocol.c
+++ /dev/null
@@ -1,6769 +0,0 @@
-/*
-  Copyright (c) 2006-2009 Z RESEARCH, Inc. <http://www.zresearch.com>
-  This file is part of GlusterFS.
-
-  GlusterFS is free software; you can redistribute it and/or modify
-  it under the terms of the GNU General Public License as published
-  by the Free Software Foundation; either version 3 of the License,
-  or (at your option) any later version.
-
-  GlusterFS is distributed in the hope that it will be useful, but
-  WITHOUT ANY WARRANTY; without even the implied warranty of
-  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
-  General Public License for more details.
-
-  You should have received a copy of the GNU General Public License
-  along with this program.  If not, see
-  <http://www.gnu.org/licenses/>.
-*/
-
-#ifndef _CONFIG_H
-#define _CONFIG_H
-#include "config.h"
-#endif
-#include <inttypes.h>
-
-
-#include "glusterfs.h"
-#include "client-protocol.h"
-#include "compat.h"
-#include "dict.h"
-#include "protocol.h"
-#include "transport.h"
-#include "xlator.h"
-#include "logging.h"
-#include "timer.h"
-#include "defaults.h"
-#include "compat.h"
-#include "compat-errno.h"
-
-#include <sys/resource.h>
-#include <inttypes.h>
-
-/* for default_*_cbk functions */
-#include "defaults.c"
-#include "saved-frames.h"
-#include "common-utils.h"
-
-int protocol_client_cleanup (transport_t *trans);
-int protocol_client_interpret (xlator_t *this, transport_t *trans,
-                               char *hdr_p, size_t hdrlen,
-                               char *buf_p, size_t buflen);
-int
-protocol_client_xfer (call_frame_t *frame, xlator_t *this, transport_t *trans,
-                      int type, int op,
-                      gf_hdr_common_t *hdr, size_t hdrlen,
-                      struct iovec *vector, int count,
-                      dict_t *refs);
-
-static gf_op_t gf_fops[];
-static gf_op_t gf_mops[];
-static gf_op_t gf_cbks[];
-
-
-static ino_t
-this_ino_get_from_inode (inode_t *inode, xlator_t *this)
-{
-	ino_t   ino = 0;
-	int32_t ret = 0;
-
-	GF_VALIDATE_OR_GOTO ("client", this, out);
-	GF_VALIDATE_OR_GOTO (this->name, inode, out);
-
-	if (inode->ino == 1) {
-		ino = 1;
-		goto out;
-	}
-
-	ret = inode_ctx_get (inode, this, &ino);
-
-	if (inode->ino && ret < 0) {
-		gf_log (this->name, GF_LOG_ERROR,
-			"(%"PRId64"): failed to get remote inode number",
-			inode->ino);
-	}
-
-out:
-	return ino;
-}
-
-
-static ino_t
-this_ino_get (loc_t *loc, xlator_t *this, int32_t which)
-{
-	ino_t    ino = 0;
-	int32_t  ret = 0;
-	inode_t *inode = NULL;
-
-	GF_VALIDATE_OR_GOTO ("client", this, out);
-	
-	if (which == GF_CLIENT_INODE_SELF) {
-		inode = loc->inode;
-	} else if (which == GF_CLIENT_INODE_PARENT) {
-		inode = loc->parent;
-	}
-	GF_VALIDATE_OR_GOTO (this->name, inode, out);
-
-	if (inode->ino == 1) {
-		ino = 1;
-		goto out;
-	}
-
-	ret = inode_ctx_get (inode, this, &ino);
-
-	if (inode->ino && ret < 0) {
-		gf_log (this->name, GF_LOG_ERROR,
-			"%s(%s - %"PRId64") failed to get remote inode number",
-			loc->path, 
-			(which == GF_CLIENT_INODE_SELF? "self" : "parent"), 
-			inode->ino);
-	}
-
-out:
-	return ino;
-}
-
-
-static void
-this_ino_set (loc_t *loc, xlator_t *this, ino_t ino)
-{
-	ino_t    old_ino = 0;
-	int32_t  ret = -1;
-	inode_t *inode = NULL;
-
-	GF_VALIDATE_OR_GOTO ("client", this, out);
-
-	inode = loc->inode;
-	GF_VALIDATE_OR_GOTO (this->name, inode, out);
-
-	ret = inode_ctx_get (inode, this, &old_ino);
-
-	if (old_ino != ino) {
-		if (old_ino)
-			gf_log (this->name, GF_LOG_DEBUG,
-				"%s: inode number changed from %"PRId64" "
-				"to %"PRId64,
-				loc->path, old_ino, ino);
-
-		ret = inode_ctx_put (inode, this, ino);
-		if (ret < 0) {
-			gf_log (this->name, GF_LOG_ERROR,
-				"%s (%"PRId64"): failed to set remote "
-				"inode number to inode ctx",
-				loc->path, ino);
-		}
-	}
-out:
-	return;
-}
-
-
-static int
-this_fd_get (fd_t *file, xlator_t *this, int64_t *remote_fd)
-{
-	int ret = 0;
-	int dict_ret = -1;
-	uint64_t tmp_fd = 0;
-
-	GF_VALIDATE_OR_GOTO ("client", this, out);
-	GF_VALIDATE_OR_GOTO (this->name, file, out);
-	GF_VALIDATE_OR_GOTO (this->name, remote_fd, out);
-
-	dict_ret = fd_ctx_get (file, this, &tmp_fd);
-
-	if (dict_ret < 0) {
-		ret = -1;
-	}
-	*remote_fd = (int64_t)tmp_fd;
-out:
-	return ret;
-}
-
-
-static void
-this_fd_set (fd_t *file, xlator_t *this, loc_t *loc, int64_t fd)
-{
-	uint64_t old_fd = 0;
-	int32_t ret = -1;
-
-	GF_VALIDATE_OR_GOTO ("client", this, out);
-	GF_VALIDATE_OR_GOTO (this->name, file, out);
-
-	ret = fd_ctx_get (file, this, &old_fd);
-	if (ret >= 0) {
-		gf_log (this->name, GF_LOG_WARNING,
-			"%s (%"PRId64"): trying duplicate remote fd set. "
-			"%"PRId64" over-rides %"PRId64,
-			loc->path, loc->inode->ino, fd, old_fd);
-	}
-
-	ret = fd_ctx_set (file, this, (uint64_t)fd);
-	if (ret < 0) {
-		gf_log (this->name, GF_LOG_ERROR,
-			"%s (%"PRId64"): failed to set remote fd",
-			loc->path, loc->inode->ino);
-	}
-out:
-	return;
-}
-
-
-static int 
-client_local_wipe (client_local_t *local)
-{
-	if (local) {
-		loc_wipe (&local->loc);
-
-		if (local->fd)
-			fd_unref (local->fd);
-
-		free (local);
-	} 
-	
-	return 0;
-}
-
-/*
- * lookup_frame - lookup call frame corresponding to a given callid
- * @trans: transport object
- * @callid: call id of the frame
- *
- * not for external reference
- */
-
-static call_frame_t *
-lookup_frame (transport_t *trans, int32_t op, int8_t type, int64_t callid)
-{
-	client_connection_t *conn = NULL;
-	call_frame_t        *frame = NULL;
-
-	conn = trans->xl_private;
-
-	pthread_mutex_lock (&conn->lock);
-	{
-		frame = saved_frames_get (conn->saved_frames,
-					  op, type, callid);
-	}
-	pthread_mutex_unlock (&conn->lock);
-
-	return frame;
-}
-
-
-static void
-call_bail (void *data)
-{
-	client_connection_t *conn = NULL;
-	struct timeval       current;
-	transport_t         *trans = NULL;
-	struct list_head     list;
-	struct saved_frame  *saved_frame = NULL;
-	struct saved_frame   *trav = NULL;
-	struct saved_frame   *tmp = NULL;
-	call_frame_t         *frame = NULL;
-	gf_hdr_common_t       hdr = {0, };
-	dict_t               *reply = NULL;
-	char                **gf_op_list = NULL;
-	gf_op_t              *gf_ops = NULL;
-	struct tm             frame_sent_tm;
-	char                  frame_sent[32] = {0,};
-
-	GF_VALIDATE_OR_GOTO("client", data, out);
-	trans = data;
-
-	conn = trans->xl_private;
-
-	gettimeofday (&current, NULL);
-	INIT_LIST_HEAD (&list);
-
-	pthread_mutex_lock (&conn->lock);
-	{
-		/* Chaining to get call-always functionality from 
-		   call-once timer */
-		if (conn->timer) {
-			struct timeval timeout = {0,};
-			gf_timer_cbk_t timer_cbk = conn->timer->cbk;
-
-			timeout.tv_sec = 10;
-			timeout.tv_usec = 0;
-
-			gf_timer_call_cancel (trans->xl->ctx, conn->timer);
-			conn->timer = gf_timer_call_after (trans->xl->ctx,
-							   timeout,
-							   timer_cbk,
-							   trans);
-			if (conn->timer == NULL) {
-				gf_log (trans->xl->name, GF_LOG_DEBUG,
-					"Cannot create bailout timer");
-			}
-		}
-
-		/* TODO while(1) is not nice - use splice */
-
-		do {
-			saved_frame = 
-			saved_frames_get_timedout (conn->saved_frames,
-						   GF_OP_TYPE_MOP_REQUEST,
-						   conn->transport_timeout,
-						   &current);
-			if (saved_frame)
-				list_add (&saved_frame->list, &list);
-			
-		} while (saved_frame);
-
-		do {
-			saved_frame = 
-			saved_frames_get_timedout (conn->saved_frames,
-						   GF_OP_TYPE_FOP_REQUEST,
-						   conn->transport_timeout,
-						   &current);
-			if (saved_frame)
-				list_add (&saved_frame->list, &list);
-		} while (saved_frame);
-
-		do {
-			saved_frame = 
-			saved_frames_get_timedout (conn->saved_frames,
-						   GF_OP_TYPE_CBK_REQUEST,
-						   conn->transport_timeout,
-						   &current);
-			if (saved_frame)
-				list_add (&saved_frame->list, &list);
-		} while (saved_frame);
-	}
-	pthread_mutex_unlock (&conn->lock);
-
-	reply = get_new_dict();
-	dict_ref (reply);
-
-	hdr.rsp.op_ret   = hton32 (-1);
-	hdr.rsp.op_errno = hton32 (ENOTCONN);
-
-	list_for_each_entry_safe (trav, tmp, &list, list) {
-		switch (trav->type)
-		{
-		case GF_OP_TYPE_FOP_REQUEST:
-			gf_ops = gf_fops;
-			gf_op_list = gf_fop_list;
-			break;
-		case GF_OP_TYPE_MOP_REQUEST:
-			gf_ops = gf_mops;
-			gf_op_list = gf_mop_list;
-			break;
-		case GF_OP_TYPE_CBK_REQUEST:
-			gf_ops = gf_cbks;
-			gf_op_list = gf_cbk_list;
-			break;
-		}
-
-		localtime_r (&trav->saved_at.tv_sec, &frame_sent_tm);
-		strftime (frame_sent, 32, "%Y-%m-%d %H:%M:%S", &frame_sent_tm);
-
-		gf_log (trans->xl->name, GF_LOG_ERROR,
-			"activating bail-out :"
-			"frame sent = %s. transport-timeout = %d",
-			frame_sent, conn->transport_timeout);
-
-		hdr.type = hton32 (trav->type);
-		hdr.op   = hton32 (trav->op);
-
-		frame = trav->frame;
-		frame->root->rsp_refs = reply;
-
-		gf_ops[trav->op] (frame, &hdr, sizeof (hdr), NULL, 0);
-
-		list_del_init (&trav->list);
-		FREE (trav);
-	}
-	dict_unref (reply);
-out:
-	return;
-}
-
-
-void
-save_frame (transport_t *trans, call_frame_t *frame,
-	    int32_t op, int8_t type, uint64_t callid)
-{
-	client_connection_t *conn = NULL;
-	struct timeval       timeout = {0, };
-
-
-	conn = trans->xl_private;
-
-	saved_frames_put (conn->saved_frames, frame, op, type, callid);
-
-	if (conn->timer == NULL) {
-		timeout.tv_sec  = 10;
-		timeout.tv_usec = 0;
-		conn->timer = gf_timer_call_after (trans->xl->ctx, timeout,
-						   call_bail, (void *) trans);
-       }
-}
-
-
-int
-client_get_forgets (xlator_t *this, client_forget_t *forget) 
-{
-	call_frame_t        *fr = NULL;
-	gf_hdr_common_t     *hdr = NULL;
-	size_t               hdrlen = 0;
-	gf_cbk_forget_req_t *req = NULL;
-	int                  ret = -1;
-	client_conf_t       *conf = NULL;
-	int                  count = 0;
-	int                  index = 0;
-
-	conf = this->private;
-
-	if (conf->forget.count > 0) {
-		count = conf->forget.count;
-		
-		hdrlen = gf_hdr_len (req, (count * sizeof (int64_t)));
-		hdr    = gf_hdr_new (req, (count * sizeof (int64_t)));
-		GF_VALIDATE_OR_GOTO (this->name, hdr, out);
-			
-		req    = gf_param (hdr);
-		
-		req->count = hton32 (count);
-		for (index = 0; index < count; index++) {
-			req->ino_array[index] = 
-				hton64 (conf->forget.ino_array[index]);
-		}
-		
-		fr = create_frame (this, this->ctx->pool);
-		GF_VALIDATE_OR_GOTO (this->name, fr, out);
-
-		conf->forget.frames_in_transit++;
-
-		forget->frame = fr;
-		forget->hdr   = hdr;
-		forget->hdrlen = hdrlen;
-		
-		ret = count;
-
-		conf->forget.count = 0;
-	}
- out:
-	return ret;
-}
-
-void 
-client_ping_timer_expired (void *data)
-{
-	xlator_t            *this = NULL;
-	transport_t         *trans = NULL;
-	client_conf_t       *conf = NULL;
-	client_connection_t *conn = NULL;
-	int                  disconnect = 0;
-	int                  transport_activity = 0;
-	struct timeval       timeout = {0, };
-	struct timeval       current = {0, };
-	
-	trans = data;
-	this  = trans->xl;
-	conf  = this->private;
-	conn  = trans->xl_private;
-
-	pthread_mutex_lock (&conn->lock);
-	{
-		if (conn->ping_timer)
-			gf_timer_call_cancel (trans->xl->ctx, 
-					      conn->ping_timer);
-		gettimeofday (&current, NULL);
-
-		if ((current.tv_sec - conn->last_received.tv_sec) < 
-		    conn->ping_timeout) {
-			transport_activity = 1;
-		}
-
-		if (transport_activity) {
-			gf_log (this->name, GF_LOG_DEBUG,
-				"ping timer expired but transport activity "
-				"detected - not bailing transport");
-			conn->transport_activity = 0;
-			timeout.tv_sec = conn->ping_timeout;
-			timeout.tv_usec = 0;
-
-			conn->ping_timer = 
-				gf_timer_call_after (trans->xl->ctx, timeout,
-						     client_ping_timer_expired,
-						     (void *) trans);
-			if (conn->ping_timer == NULL) 
-				gf_log (this->name, GF_LOG_ERROR,
-					"unable to setup timer");
-
-		} else {
-			conn->ping_started = 0;
-			conn->ping_timer = NULL;
-			disconnect = 1;
-		}
-	}
-	pthread_mutex_unlock (&conn->lock);
-	if (disconnect) {
-		gf_log (this->name, GF_LOG_ERROR, 
-			"ping timer expired! bailing transport");
-		transport_disconnect (trans);
-	}
-}
-
-
-void
-client_start_ping (void *data)
-{
-	xlator_t            *this = NULL;
-	transport_t         *trans = NULL;
-	client_conf_t       *conf = NULL;
-	client_connection_t *conn = NULL;
-	int32_t              ret = -1;
-	gf_hdr_common_t     *hdr = NULL;
-	struct timeval       timeout = {0, };
-	call_frame_t        *dummy_frame = NULL;
-	size_t               hdrlen = -1;
-	gf_mop_ping_req_t   *req = NULL;
-
-
-	trans = data;
-	this  = trans->xl;
-	conf  = this->private;
-	conn  = trans->xl_private;
-
-	pthread_mutex_lock (&conn->lock);
-	{
-		if ((conn->saved_frames->count == 0) || 
-		    !conn->connected) {
-			/* using goto looked ugly here, 
-			 * hence getting out this way */
-			if (conn->ping_timer)
-				gf_timer_call_cancel (trans->xl->ctx, 
-						      conn->ping_timer);
-			conn->ping_timer = NULL;
-			conn->ping_started = 0;
-			/* unlock */
-			pthread_mutex_unlock (&conn->lock);
-			return;
-		}
-
-		if (conn->saved_frames->count < 0) {
-			gf_log (this->name, GF_LOG_ERROR,
-				"saved_frames->count is %"PRId64, 
-				conn->saved_frames->count);
-			conn->saved_frames->count = 0;
-		}
-		timeout.tv_sec = conn->ping_timeout;
-		timeout.tv_usec = 0;
-		
-		conn->ping_timer = 
-			gf_timer_call_after (trans->xl->ctx, timeout,
-					     client_ping_timer_expired,
-					     (void *) trans);
-
-		if (conn->ping_timer == NULL) {
-			gf_log (this->name, GF_LOG_ERROR,
-				"unable to setup timer");
-		} else {
-			conn->ping_started = 1;
-		}
-	}
-	pthread_mutex_unlock (&conn->lock);
-
-	hdrlen = gf_hdr_len (req, 0);
-	hdr    = gf_hdr_new (req, 0);
-
-	dummy_frame = create_frame (this, this->ctx->pool);
-	dummy_frame->local = trans;
-
-	ret = protocol_client_xfer (dummy_frame, this, trans,
-				    GF_OP_TYPE_MOP_REQUEST, GF_MOP_PING,
-				    hdr, hdrlen, NULL, 0, NULL);
-}
-
-
-int
-client_ping_cbk (call_frame_t *frame, gf_hdr_common_t *hdr, size_t hdrlen,
-		 char *buf, size_t buflen)
-{
-	xlator_t            *this = NULL;
-	transport_t         *trans = NULL;
-	client_conf_t       *conf = NULL;
-	client_connection_t *conn = NULL;
-	struct timeval       timeout = {0, };
-	int                  op_ret = 0;
-
-	trans  = frame->local; frame->local = NULL;
-	this   = trans->xl;
-	conf   = this->private;
-	conn   = trans->xl_private;
-
-	op_ret = ntoh32 (hdr->rsp.op_ret);
-
-	if (op_ret == -1) {
-		/* timer expired and transport bailed out */
-		gf_log (this->name, GF_LOG_ERROR, "timer must have expired");
-		goto out;
-	}
-
-	pthread_mutex_lock (&conn->lock);
-	{
-		timeout.tv_sec  = conn->ping_timeout;
-		timeout.tv_usec = 0;
-
-		gf_timer_call_cancel (trans->xl->ctx, 
-				      conn->ping_timer);
-	
-		conn->ping_timer = 
-			gf_timer_call_after (trans->xl->ctx, timeout,
-					     client_start_ping, (void *)trans);
-		if (conn->ping_timer == NULL)
-			gf_log (this->name, GF_LOG_ERROR,
-				"gf_timer_call_after() returned NULL");
-	}
-	pthread_mutex_unlock (&conn->lock);
-out:
-	STACK_DESTROY (frame->root);
-	return 0;
-}
-
-
-int
-protocol_client_xfer (call_frame_t *frame, xlator_t *this, transport_t *trans,
-                      int type, int op,
-                      gf_hdr_common_t *hdr, size_t hdrlen,
-                      struct iovec *vector, int count,
-                      dict_t *refs)
-{
-	client_conf_t        *conf = NULL;
-	client_connection_t  *conn = NULL;
-	uint64_t              callid = 0;
-	int32_t               ret = -1;
-	int                   start_ping = 0;
-	gf_hdr_common_t       rsphdr = {0, };
-	client_forget_t       forget = {0, };
-	uint8_t               send_forget = 0;
-
-
-	conf  = this->private;
-
-	if (!trans) {
-		/* default to bulk op since it is 'safer' */
-		trans = conf->transport[CHANNEL_BULK];
-	}
-	conn  = trans->xl_private;
-
-	if (!((type == GF_OP_TYPE_CBK_REQUEST) && 
-	      (op == GF_CBK_FORGET))) 
-	{
-		LOCK (&conf->forget.lock);
-		{
-			ret = client_get_forgets (this, &forget);
-			if (ret <= 0)
-				send_forget = 0;
-			else
-				send_forget = 1;
-		}
-		UNLOCK (&conf->forget.lock);
-
-		if (send_forget) {
-			ret = protocol_client_xfer (forget.frame, this, NULL,
-						    GF_OP_TYPE_CBK_REQUEST, 
-						    GF_CBK_FORGET,
-						    forget.hdr, forget.hdrlen, 
-						    NULL, 0, NULL);
-		}
-	}
-
-	pthread_mutex_lock (&conn->lock);
-	{
-		callid = ++conn->callid;
-
-		hdr->callid = hton64 (callid);
-		hdr->op     = hton32 (op);
-		hdr->type   = hton32 (type);
-
-		if (frame) {
-			hdr->req.uid = hton32 (frame->root->uid);
-			hdr->req.gid = hton32 (frame->root->gid);
-			hdr->req.pid = hton32 (frame->root->pid);
-		}
-
-		if (conn->connected == 0)
-			transport_connect (trans);
-
-		ret = -1;
-
-		if (conn->connected ||
-		    ((type == GF_OP_TYPE_MOP_REQUEST) &&
-		     (op == GF_MOP_SETVOLUME))) {
-			ret = transport_submit (trans, (char *)hdr, hdrlen,
-						vector, count, refs);
-		}
-		
-		if ((ret >= 0) && frame) {
-			/* TODO: check this logic */
-			gettimeofday (&conn->last_sent, NULL);
-			save_frame (trans, frame, op, type, callid);
-		}
-
-		if (!conn->ping_started && (ret >= 0)) {
-			start_ping = 1;
-		}
-	}
-	pthread_mutex_unlock (&conn->lock);
-
-	if (start_ping)
-		client_start_ping ((void *) trans);
-
-	if (frame && (ret < 0)) {
-		rsphdr.op = op;
-		rsphdr.rsp.op_ret   = hton32 (-1);
-		rsphdr.rsp.op_errno = hton32 (ENOTCONN);
-
-		if (type == GF_OP_TYPE_FOP_REQUEST) {
-			rsphdr.type = GF_OP_TYPE_FOP_REPLY;
-			gf_fops[op] (frame, &rsphdr, sizeof (rsphdr), NULL, 0);
-		} else if (type == GF_OP_TYPE_MOP_REQUEST) {
-			rsphdr.type = GF_OP_TYPE_MOP_REPLY;
-			gf_mops[op] (frame, &rsphdr, sizeof (rsphdr), NULL, 0);
-		} else {
-			rsphdr.type = GF_OP_TYPE_CBK_REPLY;
-			gf_cbks[op] (frame, &rsphdr, sizeof (rsphdr), NULL, 0);
-		}
-	}
-
-	return ret;
-}
-
-
-
-/**
- * client_create - create function for client protocol
- * @frame: call frame
- * @this: this translator structure
- * @path: complete path to file
- * @flags: create flags
- * @mode: create mode
- *
- * external reference through client_protocol_xlator->fops->create
- */
-
-int
-client_create (call_frame_t *frame, xlator_t *this,
-               loc_t *loc, int32_t flags,
-               mode_t mode, fd_t *fd)
-{
-	gf_hdr_common_t     *hdr = NULL;
-	gf_fop_create_req_t *req = NULL;
-	size_t               hdrlen = 0;
-	size_t               pathlen = 0;
-	size_t               baselen = 0;
-	int32_t              ret = -1;
-	ino_t                par = 0;
-	client_conf_t       *conf = NULL;
-	client_local_t      *local = NULL;
-
-
-	conf = this->private;
-
-	if (conf->child) {
-		STACK_WIND (frame, default_create_cbk,
-			    conf->child,
-			    conf->child->fops->create,
-			    loc, flags, mode, fd);
-		return 0;
-	}
-
-	local = calloc (1, sizeof (*local));
-	GF_VALIDATE_OR_GOTO(this->name, local, unwind);
-
-	local->fd = fd_ref (fd);
-	loc_copy (&local->loc, loc);
-	
-	frame->local = local;
-
-	pathlen = STRLEN_0(loc->path);
-	baselen = STRLEN_0(loc->name);
-	par = this_ino_get (loc, this, GF_CLIENT_INODE_PARENT);
-
-	hdrlen = gf_hdr_len (req, pathlen + baselen);
-	hdr    = gf_hdr_new (req, pathlen + baselen);
-	GF_VALIDATE_OR_GOTO(this->name, hdr, unwind);
-
-	req    = gf_param (hdr);
-
-	req->flags   = hton32 (flags);
-	req->mode    = hton32 (mode);
-	req->par     = hton64 (par);
-	strcpy (req->path, loc->path);
-	strcpy (req->bname + pathlen, loc->name);
-
-	ret = protocol_client_xfer (frame, this,
-				    CLIENT_CHANNEL (this, CHANNEL_LOWLAT),
-				    GF_OP_TYPE_FOP_REQUEST, GF_FOP_CREATE,
-				    hdr, hdrlen, NULL, 0, NULL);
-	return ret;
-unwind:
-	if (hdr)
-		free (hdr);
-	STACK_UNWIND(frame, -1, EINVAL, fd, NULL, NULL);
-	return 0;
-
-}
-
-/**
- * client_open - open function for client protocol
- * @frame: call frame
- * @this: this translator structure
- * @loc: location of file
- * @flags: open flags
- * @mode: open modes
- *
- * external reference through client_protocol_xlator->fops->open
- */
-int
-client_open (call_frame_t *frame, xlator_t *this,
-             loc_t *loc, int32_t flags, fd_t *fd)
-{
-	int                 ret = -1;
-	gf_hdr_common_t    *hdr = NULL;
-	size_t              hdrlen = 0;
-	gf_fop_open_req_t  *req = NULL;
-	size_t              pathlen = 0;
-	ino_t               ino = 0;
-	client_conf_t      *conf = NULL;
-	client_local_t     *local = NULL;
-
-	conf = this->private;
-	if (conf->child) {
-		/* */
-		STACK_WIND (frame, default_open_cbk,
-			    conf->child,
-			    conf->child->fops->open,
-			    loc, flags, fd);
-		
-		return 0;
-	}
-
-	local = calloc (1, sizeof (*local));
-	GF_VALIDATE_OR_GOTO(this->name, local, unwind);
-
-	local->fd = fd_ref (fd);
-	loc_copy (&local->loc, loc);	
-
-	frame->local = local;
-
-	pathlen = STRLEN_0(loc->path);
-	ino = this_ino_get (loc, this, GF_CLIENT_INODE_SELF);
-
-	hdrlen = gf_hdr_len (req, pathlen);
-	hdr    = gf_hdr_new (req, pathlen);
-	GF_VALIDATE_OR_GOTO(this->name, hdr, unwind);
-
-	req    = gf_param (hdr);
-
-	req->ino   = hton64 (ino);
-	req->flags = hton32 (flags);
-	strcpy (req->path, loc->path);
-	
-	ret = protocol_client_xfer (frame, this,
-				    CLIENT_CHANNEL (this, CHANNEL_LOWLAT),
-				    GF_OP_TYPE_FOP_REQUEST, GF_FOP_OPEN,
-				    hdr, hdrlen, NULL, 0, NULL);
-
-	return ret;
-unwind:
-	if (hdr)
-		free (hdr);
-	STACK_UNWIND(frame, -1, EINVAL, fd);
-	return 0;
-
-}
-
-
-/**
- * client_stat - stat function for client protocol
- * @frame: call frame
- * @this: this translator structure
- * @loc: location
- *
- * external reference through client_protocol_xlator->fops->stat
- */
-int32_t
-client_stat (call_frame_t *frame,
-             xlator_t *this,
-             loc_t *loc)
-{
-	gf_hdr_common_t   *hdr = NULL;
-	gf_fop_stat_req_t *req = NULL;
-	size_t hdrlen = -1;
-	int32_t ret = -1;
-	size_t  pathlen = 0;
-	ino_t   ino = 0;
-	client_conf_t *conf = this->private;
-
-	if (conf->child) {
-		/* */
-		STACK_WIND (frame,
-			    default_stat_cbk,
-			    conf->child,
-			    conf->child->fops->stat,
-			    loc);
-		
-		return 0;
-	}
-
-	pathlen = STRLEN_0(loc->path);
-	ino = this_ino_get (loc, this, GF_CLIENT_INODE_SELF);
-
-	hdrlen = gf_hdr_len (req, pathlen);
-	hdr    = gf_hdr_new (req, pathlen);
-	GF_VALIDATE_OR_GOTO(this->name, hdr, unwind);
-
-	req    = gf_param (hdr);
-
-	req->ino  = hton64 (ino);
-	strcpy (req->path, loc->path);
-
-	ret = protocol_client_xfer (frame, this,
-				    CLIENT_CHANNEL (this, CHANNEL_LOWLAT),
-				    GF_OP_TYPE_FOP_REQUEST, GF_FOP_STAT,
-				    hdr, hdrlen, NULL, 0, NULL);
-
-	return ret;
-unwind:
-	if (hdr)
-		free (hdr);
-	STACK_UNWIND(frame, -1, EINVAL, NULL);
-	return 0;
-
-}
-
-
-/**
- * client_readlink - readlink function for client protocol
- * @frame: call frame
- * @this: this translator structure
- * @loc: location
- * @size:
- *
- * external reference through client_protocol_xlator->fops->readlink
- */
-int32_t
-client_readlink (call_frame_t *frame,
-                 xlator_t *this,
-                 loc_t *loc,
-                 size_t size)
-{
-	gf_hdr_common_t       *hdr = NULL;
-	gf_fop_readlink_req_t *req = NULL;
-	size_t hdrlen = -1;
-	int    ret = -1;
-	size_t pathlen = 0;
-	ino_t  ino = 0;
-	client_conf_t *conf = this->private;
-
-	if (conf->child) {
-		/* */
-		STACK_WIND (frame,
-			    default_readlink_cbk,
-			    conf->child,
-			    conf->child->fops->readlink,
-			    loc,
-			    size);
-		
-		return 0;
-	}
-
-	pathlen = STRLEN_0(loc->path);
-	ino = this_ino_get (loc, this, GF_CLIENT_INODE_SELF);
-
-	hdrlen = gf_hdr_len (req, pathlen);
-	hdr    = gf_hdr_new (req, pathlen);
-	GF_VALIDATE_OR_GOTO(this->name, hdr, unwind);
-
-	req    = gf_param (hdr);
-
-	req->ino  = hton64 (ino);
-	req->size = hton32 (size);
-	strcpy (req->path, loc->path);
-
-	ret = protocol_client_xfer (frame, this,
-				    CLIENT_CHANNEL (this, CHANNEL_LOWLAT),
-				    GF_OP_TYPE_FOP_REQUEST, GF_FOP_READLINK,
-				    hdr, hdrlen, NULL, 0, NULL);
-
-	return ret;
-unwind:
-	if (hdr)
-		free (hdr);
-	STACK_UNWIND(frame, -1, EINVAL, NULL);
-	return 0;
-
-}
-
-
-/**
- * client_mknod - mknod function for client protocol
- * @frame: call frame
- * @this: this translator structure
- * @path: pathname of node
- * @mode:
- * @dev:
- *
- * external reference through client_protocol_xlator->fops->mknod
- */
-int32_t
-client_mknod (call_frame_t *frame,
-              xlator_t *this,
-              loc_t *loc,
-              mode_t mode,
-              dev_t dev)
-{
-	gf_hdr_common_t    *hdr = NULL;
-	gf_fop_mknod_req_t *req = NULL;
-	size_t hdrlen = -1;
-	int    ret = -1;
-	size_t pathlen = 0;
-	size_t baselen = 0;
-	ino_t  par = 0;
-	client_conf_t *conf = this->private;
-	client_local_t *local = NULL;
-
-	if (conf->child) {
-		/* */
-		STACK_WIND (frame,
-			    default_mknod_cbk,
-			    conf->child,
-			    conf->child->fops->mknod,
-			    loc, mode, dev);
-
-		return 0;
-	}
-
-	local = calloc (1, sizeof (*local));
-	GF_VALIDATE_OR_GOTO(this->name, local, unwind);
-
-	loc_copy (&local->loc, loc);	
-
-	frame->local = local;
-
-	pathlen = STRLEN_0(loc->path);
-	baselen = STRLEN_0(loc->name);
-	par = this_ino_get (loc, this, GF_CLIENT_INODE_PARENT);
-
-	hdrlen = gf_hdr_len (req, pathlen + baselen);
-	hdr    = gf_hdr_new (req, pathlen + baselen);
-	GF_VALIDATE_OR_GOTO(this->name, hdr, unwind);
-
-	req    = gf_param (hdr);
-
-	req->par  = hton64 (par);
-	req->mode = hton32 (mode);
-	req->dev  = hton64 (dev);
-	strcpy (req->path, loc->path);
-	strcpy (req->bname + pathlen, loc->name);
-
-	ret = protocol_client_xfer (frame, this,
-				    CLIENT_CHANNEL (this, CHANNEL_LOWLAT),
-				    GF_OP_TYPE_FOP_REQUEST, GF_FOP_MKNOD,
-				    hdr, hdrlen, NULL, 0, NULL);
-
-	return ret;
-unwind:
-	if (hdr)
-		free (hdr);
-	STACK_UNWIND(frame, -1, EINVAL, loc->inode, NULL);
-	return 0;
-
-}
-
-
-/**
- * client_mkdir - mkdir function for client protocol
- * @frame: call frame
- * @this: this translator structure
- * @path: pathname of directory
- * @mode:
- *
- * external reference through client_protocol_xlator->fops->mkdir
- */
-int32_t
-client_mkdir (call_frame_t *frame,
-              xlator_t *this,
-              loc_t *loc,
-              mode_t mode)
-{
-	gf_hdr_common_t    *hdr = NULL;
-	gf_fop_mkdir_req_t *req = NULL;
-	size_t hdrlen = -1;
-	int    ret = -1;
-	size_t pathlen = 0;
-	size_t baselen = 0;
-	ino_t  par = 0;
-	client_conf_t *conf = this->private;
-	client_local_t *local = NULL;
-
-	if (conf->child) {
-		/* */
-		STACK_WIND (frame,
-			    default_mkdir_cbk,
-			    conf->child,
-			    conf->child->fops->mkdir,
-			    loc, mode);
-		
-		return 0;
-	}
-
-	local = calloc (1, sizeof (*local));
-	GF_VALIDATE_OR_GOTO(this->name, local, unwind);
-	
-	loc_copy (&local->loc, loc);
-
-	frame->local = local;
-
-	pathlen = STRLEN_0(loc->path);
-	baselen = STRLEN_0(loc->name);
-	par = this_ino_get (loc, this, GF_CLIENT_INODE_PARENT);
-
-	hdrlen = gf_hdr_len (req, pathlen + baselen);
-	hdr    = gf_hdr_new (req, pathlen + baselen);
-	GF_VALIDATE_OR_GOTO(this->name, hdr, unwind);
-
-	req    = gf_param (hdr);
-
-	req->par  = hton64 (par);
-	req->mode = hton32 (mode);
-	strcpy (req->path, loc->path);
-	strcpy (req->bname + pathlen, loc->name);
-	
-	ret = protocol_client_xfer (frame, this,
-				    CLIENT_CHANNEL (this, CHANNEL_LOWLAT),
-				    GF_OP_TYPE_FOP_REQUEST, GF_FOP_MKDIR,
-				    hdr, hdrlen, NULL, 0, NULL);
-
-	return ret;
-unwind:
-	if (hdr)
-		free (hdr);
-	STACK_UNWIND(frame, -1, EINVAL, loc-&